From 6ea1e3dc2b3f1a295275dbec0b4f6fe8129bf504 Mon Sep 17 00:00:00 2001 From: Darwin Date: Wed, 13 Jul 2011 23:06:20 +0000 Subject: [PATCH] xnu-1699.22.73 Imported from https://opensource.apple.com/tarballs/xnu/xnu-1699.22.73.tar.gz --- EXTERNAL_HEADERS/Availability.h | 156 + EXTERNAL_HEADERS/AvailabilityInternal.h | 393 + EXTERNAL_HEADERS/AvailabilityMacros.h | 820 ++ EXTERNAL_HEADERS/Makefile | 7 +- EXTERNAL_HEADERS/architecture/Makefile | 4 +- EXTERNAL_HEADERS/architecture/ppc/Makefile | 33 - EXTERNAL_HEADERS/architecture/ppc/asm_help.h | 456 - .../architecture/ppc/basic_regs.h | 306 - EXTERNAL_HEADERS/architecture/ppc/fp_regs.h | 153 - .../architecture/ppc/macro_help.h | 64 - .../architecture/ppc/pseudo_inst.h | 420 - EXTERNAL_HEADERS/architecture/ppc/reg_help.h | 230 - EXTERNAL_HEADERS/mach-o/arm/reloc.h | 42 - EXTERNAL_HEADERS/mach-o/loader.h | 6 + EXTERNAL_HEADERS/mach-o/ppc/reloc.h | 65 - EXTERNAL_HEADERS/stdarg.h | 164 +- Makefile | 36 +- README | 71 +- {osfmk/profiling/ppc => SETUP}/Makefile | 17 +- SETUP/config/Makefile | 42 + SETUP/config/config.h | 293 + .../doconf/doconf.csh => SETUP/config/doconf | 19 +- SETUP/config/externs.c | 82 + SETUP/config/lexer.l | 214 + SETUP/config/main.c | 296 + SETUP/config/mkglue.c | 331 + SETUP/config/mkheaders.c | 276 + SETUP/config/mkioconf.c | 2086 ++++ SETUP/config/mkmakefile.c | 1182 +++ SETUP/config/mkswapconf.c | 247 + SETUP/config/openp.c | 93 + SETUP/config/parser.y | 1278 +++ SETUP/config/searchp.c | 90 + SETUP/kextsymboltool/Makefile | 31 + SETUP/kextsymboltool/kextsymboltool.c | 912 ++ .../newvers/newvers.csh => SETUP/newvers | 0 SETUP/seed_objroot | 133 - SETUP/setsegname/Makefile | 31 + SETUP/setsegname/setsegname.c | 237 + bsd/Makefile | 17 +- bsd/bsm/Makefile | 4 - bsd/bsm/audit.h | 3 + bsd/bsm/audit_kevents.h | 12 +- bsd/conf/MASTER | 32 +- bsd/conf/MASTER.i386 | 13 +- bsd/conf/MASTER.ppc | 99 - bsd/conf/MASTER.x86_64 | 13 +- bsd/conf/Makefile | 100 +- bsd/conf/Makefile.i386 | 40 +- bsd/conf/Makefile.ppc 
| 53 - bsd/conf/Makefile.template | 79 +- bsd/conf/Makefile.x86_64 | 40 +- bsd/conf/files | 36 +- bsd/conf/files.i386 | 18 +- bsd/conf/files.ppc | 34 - bsd/conf/files.x86_64 | 16 +- bsd/conf/param.c | 3 +- bsd/conf/tools/Makefile | 32 - bsd/conf/tools/doconf/Makefile | 49 - bsd/crypto/Makefile | 6 +- bsd/crypto/aes/Assert.c | 34 + bsd/crypto/aes/Makefile | 6 +- bsd/crypto/aes/aes.h | 16 +- bsd/crypto/aes/gen/Makefile | 4 - bsd/crypto/aes/gen/aesopt.h | 4 +- bsd/crypto/aes/i386/AES.s | 143 + bsd/crypto/aes/i386/Context.h | 9 + bsd/crypto/aes/i386/Data.mk | 30 + bsd/crypto/aes/i386/Data.s | 5196 ++++++++++ bsd/crypto/aes/i386/EncryptDecrypt.s | 607 ++ bsd/crypto/aes/i386/ExpandKeyForDecryption.s | 1214 +++ bsd/crypto/aes/i386/ExpandKeyForEncryption.s | 801 ++ bsd/crypto/aes/i386/MakeData.c | 516 + bsd/crypto/aes/i386/Makefile | 16 +- bsd/crypto/aes/i386/ReadMe.txt | 22 + bsd/crypto/aes/i386/aes_crypt_hw.s | 472 + bsd/crypto/aes/i386/aes_key_hw.s | 405 + bsd/crypto/aes/i386/aes_modes.c | 471 - bsd/crypto/aes/i386/aes_modes_asm.s | 420 + bsd/crypto/aes/i386/aes_modes_hw.s | 1669 ++++ bsd/crypto/aes/i386/aes_x86_v2.s | 1298 --- bsd/crypto/aes/i386/aesopt.h | 719 -- bsd/crypto/aes/i386/aesxts.c | 392 + bsd/crypto/aes/i386/aesxts.h | 103 + bsd/crypto/aes/i386/aesxts_asm.s | 1305 +++ bsd/crypto/aes/i386/edefs.h | 130 - bsd/crypto/aes/ppc/Makefile | 36 - bsd/crypto/aes/ppc/aescrypt.c | 411 - bsd/crypto/aes/ppc/aeskey.c | 455 - bsd/crypto/aes/ppc/aesopt.h | 753 -- bsd/crypto/aes/ppc/aestab.c | 384 - bsd/crypto/aes/ppc/aestab.h | 175 - bsd/crypto/aes/test/ReadMe.txt | 97 + bsd/crypto/aes/test/makegenx86.sh | 8 + bsd/crypto/aes/test/makeoptx86.sh | 10 + bsd/crypto/aes/test/tstaes.c | 131 + bsd/crypto/blowfish/Makefile | 4 - bsd/crypto/cast128/Makefile | 4 - bsd/crypto/des/Makefile | 4 - bsd/crypto/doc/KernelCrypto.plist | 76 + bsd/crypto/doc/KernelCrypto.txt | 149 + bsd/crypto/rc4/Makefile | 4 - bsd/crypto/sha2/Makefile | 4 - bsd/crypto/sha2/intel/sha256.s | 617 ++ 
bsd/crypto/sha2/intel/sha256nossse3.s | 649 ++ bsd/crypto/sha2/sha2.c | 38 +- bsd/dev/Makefile | 4 - bsd/dev/chud/chud_bsd_callback.c | 48 +- bsd/dev/dtrace/dtrace.c | 1098 +- bsd/dev/dtrace/dtrace_glue.c | 55 +- bsd/dev/dtrace/dtrace_subr.c | 7 +- bsd/dev/dtrace/fasttrap.c | 24 +- bsd/dev/dtrace/fbt.c | 186 +- bsd/dev/dtrace/lockstat.c | 14 +- bsd/dev/dtrace/profile_prvd.c | 55 +- bsd/dev/dtrace/sdt.c | 240 +- bsd/dev/dtrace/sdt_subr.c | 72 +- bsd/dev/dtrace/systrace.c | 62 +- bsd/dev/i386/conf.c | 1 + bsd/dev/i386/dtrace_isa.c | 37 +- bsd/dev/i386/fasttrap_isa.c | 5 +- bsd/dev/i386/fbt_x86.c | 1729 ++-- bsd/dev/i386/mem.c | 17 +- bsd/dev/i386/munge.s | 139 +- bsd/dev/i386/sdt_x86.c | 112 + bsd/dev/i386/sysctl.c | 167 +- bsd/dev/i386/systemcalls.c | 98 +- bsd/dev/i386/unix_signal.c | 6 +- bsd/dev/memdev.c | 12 +- bsd/dev/ppc/conf.c | 354 - bsd/dev/ppc/cons.c | 139 - bsd/dev/ppc/dtrace_isa.c | 589 -- bsd/dev/ppc/dtrace_subr_ppc.c | 193 - bsd/dev/ppc/fasttrap_isa.c | 734 -- bsd/dev/ppc/fbt_ppc.c | 694 -- bsd/dev/ppc/ffs.c | 59 - bsd/dev/ppc/ffs.s | 70 - bsd/dev/ppc/kern_machdep.c | 263 - bsd/dev/ppc/km.c | 392 - bsd/dev/ppc/mem.c | 241 - bsd/dev/ppc/munge.s | 477 - bsd/dev/ppc/ppc_init.c | 276 - bsd/dev/ppc/sdt_ppc.c | 71 - bsd/dev/ppc/stubs.c | 102 - bsd/dev/ppc/systemcalls.c | 435 - bsd/dev/ppc/unix_signal.c | 953 -- bsd/dev/ppc/xsumas.s | 401 - bsd/dev/random/Makefile | 4 - bsd/dev/unix_startup.c | 72 +- bsd/dev/vn/Makefile | 5 - bsd/dev/vn/vn.c | 14 +- bsd/dev/x86_64/munge.s | 130 + bsd/hfs/Makefile | 4 - bsd/hfs/hfs.h | 131 +- bsd/hfs/hfs_attrlist.c | 46 +- bsd/hfs/hfs_btreeio.c | 17 +- bsd/hfs/hfs_catalog.c | 338 +- bsd/hfs/hfs_catalog.h | 23 +- bsd/hfs/hfs_chash.c | 57 +- bsd/hfs/hfs_cnode.c | 1154 ++- bsd/hfs/hfs_cnode.h | 59 +- bsd/hfs/hfs_cprotect.c | 908 ++ bsd/hfs/hfs_dbg.h | 2 +- bsd/hfs/hfs_encodings.c | 6 +- bsd/hfs/hfs_endian.c | 15 +- bsd/hfs/hfs_format.h | 25 +- bsd/hfs/hfs_fsctl.h | 9 +- bsd/hfs/hfs_hotfiles.c | 31 +- bsd/hfs/hfs_kdebug.h | 54 + 
bsd/hfs/hfs_link.c | 88 +- bsd/hfs/hfs_lookup.c | 155 +- bsd/hfs/hfs_mount.h | 1 + bsd/hfs/hfs_notification.c | 16 + bsd/hfs/hfs_readwrite.c | 811 +- bsd/hfs/hfs_search.c | 48 +- bsd/hfs/hfs_vfsops.c | 3015 ++++-- bsd/hfs/hfs_vfsutils.c | 583 +- bsd/hfs/hfs_vnops.c | 1418 ++- bsd/hfs/hfs_xattr.c | 1038 +- bsd/hfs/hfscommon/BTree/BTree.c | 8 +- bsd/hfs/hfscommon/BTree/BTreeAllocate.c | 29 +- bsd/hfs/hfscommon/BTree/BTreeScanner.c | 2 +- bsd/hfs/hfscommon/Catalog/FileIDsServices.c | 286 +- bsd/hfs/hfscommon/Misc/FileExtentMapping.c | 118 +- bsd/hfs/hfscommon/Misc/HybridAllocator.c | 533 + bsd/hfs/hfscommon/Misc/VolumeAllocation.c | 3141 +++++- bsd/hfs/hfscommon/headers/FileMgrInternal.h | 36 +- bsd/hfs/hfscommon/headers/HybridAllocator.h | 101 + bsd/hfs/hfscommon/headers/RedBlackTree.h | 969 ++ bsd/i386/param.h | 14 +- bsd/kern/Makefile | 26 + bsd/kern/bsd_init.c | 246 +- bsd/kern/bsd_stubs.c | 96 + bsd/kern/decmpfs.c | 16 +- bsd/kern/imageboot.c | 268 +- bsd/kern/kdebug.c | 1234 ++- bsd/kern/kern_acct.c | 8 +- bsd/kern/kern_aio.c | 2 +- bsd/kern/kern_authorization.c | 63 +- bsd/kern/kern_clock.c | 2 +- bsd/kern/kern_control.c | 157 +- bsd/kern/kern_core.c | 42 +- bsd/kern/kern_credential.c | 1171 ++- bsd/kern/kern_descrip.c | 572 +- bsd/kern/kern_event.c | 237 +- bsd/kern/kern_exec.c | 1183 ++- bsd/kern/kern_exit.c | 108 +- bsd/kern/kern_fork.c | 45 +- bsd/kern/kern_lockf.c | 153 +- bsd/kern/kern_malloc.c | 226 +- bsd/kern/kern_memorystatus.c | 839 +- bsd/kern/kern_mib.c | 273 +- bsd/kern/kern_mman.c | 108 +- bsd/kern/kern_newsysctl.c | 580 +- bsd/kern/kern_panicinfo.c | 49 +- bsd/kern/kern_priv.c | 119 + bsd/kern/kern_proc.c | 137 +- bsd/kern/kern_prot.c | 153 +- bsd/kern/kern_resource.c | 281 +- bsd/kern/kern_shutdown.c | 49 +- bsd/kern/kern_sig.c | 156 +- bsd/kern/kern_symfile.c | 227 +- bsd/kern/kern_synch.c | 35 +- bsd/kern/kern_sysctl.c | 1457 ++- bsd/kern/kern_time.c | 3 +- bsd/kern/kern_xxx.c | 2 +- bsd/kern/kpi_mbuf.c | 100 +- bsd/kern/kpi_socket.c | 250 +- 
bsd/kern/kpi_socketfilter.c | 1210 ++- bsd/kern/mach_loader.c | 552 +- bsd/kern/mach_loader.h | 3 + bsd/kern/mach_process.c | 14 +- bsd/kern/makesyscalls.sh | 42 +- bsd/kern/mcache.c | 48 +- bsd/kern/netboot.c | 171 +- bsd/kern/policy_check.c | 511 + bsd/kern/posix_sem.c | 44 +- bsd/kern/posix_shm.c | 194 +- bsd/kern/proc_info.c | 481 +- bsd/kern/process_policy.c | 460 + bsd/kern/pthread_support.c | 3311 ++++--- bsd/kern/pthread_synch.c | 300 +- bsd/kern/subr_log.c | 234 +- bsd/kern/subr_prof.c | 58 +- bsd/kern/sys_generic.c | 349 +- bsd/kern/sys_pipe.c | 53 +- bsd/kern/sys_socket.c | 28 +- bsd/kern/syscalls.master | 128 +- bsd/kern/sysv_ipc.c | 91 +- bsd/kern/sysv_msg.c | 6 +- bsd/kern/sysv_sem.c | 34 +- bsd/kern/sysv_shm.c | 18 +- bsd/kern/trace.codes | 2149 ++++ bsd/kern/tty.c | 32 +- bsd/kern/tty_ptmx.c | 340 +- bsd/kern/tty_subr.c | 4 +- bsd/kern/tty_tty.c | 14 +- bsd/kern/ubc_subr.c | 276 +- bsd/kern/uipc_domain.c | 99 +- bsd/kern/uipc_mbuf.c | 1958 ++-- bsd/kern/uipc_mbuf2.c | 244 +- bsd/kern/uipc_socket.c | 701 +- bsd/kern/uipc_socket2.c | 237 +- bsd/kern/uipc_syscalls.c | 261 +- bsd/kern/uipc_usrreq.c | 189 +- bsd/kern/vm_pressure.c | 236 + bsd/{ppc/reg.h => kern/vm_pressure.h} | 22 +- bsd/libkern/libkern.h | 11 +- bsd/machine/_limits.h | 4 +- bsd/machine/_param.h | 4 +- bsd/machine/_structs.h | 4 +- bsd/machine/_types.h | 4 +- bsd/machine/dis_tables.h | 4 +- bsd/machine/disklabel.h | 4 +- bsd/machine/endian.h | 4 +- bsd/machine/exec.h | 6 +- bsd/machine/fasttrap_isa.h | 4 +- bsd/machine/limits.h | 4 +- bsd/machine/param.h | 4 +- bsd/machine/profile.h | 4 +- bsd/machine/psl.h | 4 +- bsd/machine/ptrace.h | 4 +- bsd/machine/reboot.h | 4 +- bsd/machine/reg.h | 4 +- bsd/machine/setjmp.h | 4 +- bsd/machine/signal.h | 4 +- bsd/machine/types.h | 4 +- bsd/machine/ucontext.h | 4 +- bsd/machine/vmparam.h | 4 +- bsd/man/man2/Makefile | 14 +- bsd/man/man2/auditon.2 | 16 + bsd/man/man2/dup.2 | 16 +- bsd/man/man2/exchangedata.2 | 3 +- bsd/man/man2/fcntl.2 | 111 +- 
bsd/man/man2/getattrlist.2 | 85 +- bsd/man/man2/getaudit.2 | 15 +- bsd/man/man2/getauid.2 | 4 +- bsd/man/man2/getdirentries.2 | 20 +- bsd/man/man2/getdirentriesattr.2 | 23 +- bsd/man/man2/getdtablesize.2 | 63 + bsd/man/man2/getfsstat.2 | 113 +- bsd/man/man2/getgroups.2 | 32 +- bsd/man/man2/gettimeofday.2 | 1 - bsd/man/man2/kqueue.2 | 42 +- bsd/man/man2/madvise.2 | 2 +- bsd/man/man2/mmap.2 | 9 + bsd/man/man2/open.2 | 18 +- bsd/man/man2/pathconf.2 | 10 + bsd/man/man2/pipe.2 | 9 +- bsd/man/man2/posix_spawn.2 | 13 +- bsd/man/man2/quotactl.2 | 1 - bsd/man/man2/sem_close.2 | 60 + bsd/man/man2/sem_open.2 | 169 + bsd/man/man2/sem_post.2 | 65 + bsd/man/man2/sem_unlink.2 | 74 + bsd/man/man2/sem_wait.2 | 88 + bsd/man/man2/sendfile.2 | 14 +- bsd/man/man2/setaudit.2 | 38 +- bsd/man/man2/setgroups.2 | 6 +- bsd/man/man2/setregid.2 | 92 + bsd/man/man2/setreuid.2 | 90 + bsd/man/man2/setxattr.2 | 7 + bsd/man/man2/shm_open.2 | 179 + bsd/man/man2/shm_unlink.2 | 87 + bsd/man/man2/stat.2 | 152 +- bsd/man/man2/statfs.2 | 27 +- bsd/man/man2/undelete.2 | 108 + .../man3/posix_spawn_file_actions_addclose.3 | 50 +- bsd/man/man3/posix_spawnattr_setflags.3 | 12 +- bsd/man/man4/auditpipe.4 | 16 +- bsd/man/man4/gif.4 | 4 +- bsd/man/man4/icmp6.4 | 2 +- bsd/man/man4/netintro.4 | 3 +- bsd/man/man4/random.4 | 2 +- bsd/man/man5/Makefile | 2 - bsd/man/man5/dir.5 | 16 +- bsd/man/man5/fs.5 | 343 - bsd/man/man5/inode.5 | 1 - bsd/miscfs/Makefile | 4 - bsd/miscfs/devfs/Makefile | 4 - bsd/miscfs/devfs/devfs_tree.c | 2 + bsd/miscfs/devfs/devfs_vfsops.c | 11 +- bsd/miscfs/devfs/devfs_vnops.c | 178 +- bsd/miscfs/devfs/devfsdefs.h | 44 +- bsd/miscfs/fifofs/Makefile | 4 - bsd/miscfs/nullfs/null.h | 118 - bsd/miscfs/nullfs/null_subr.c | 304 - bsd/miscfs/nullfs/null_vfsops.c | 382 - bsd/miscfs/nullfs/null_vnops.c | 570 -- bsd/miscfs/specfs/Makefile | 4 - bsd/miscfs/specfs/spec_vnops.c | 628 +- bsd/miscfs/specfs/specdev.h | 1 + bsd/miscfs/union/Makefile | 4 - bsd/miscfs/union/union.h | 151 - 
bsd/miscfs/union/union_subr.c | 1604 --- bsd/miscfs/union/union_vfsops.c | 563 -- bsd/miscfs/union/union_vnops.c | 1726 ---- bsd/net/Makefile | 13 +- bsd/net/bpf.c | 447 +- bsd/net/bpf.h | 12 +- bsd/net/bpf_filter.c | 137 +- bsd/net/bpfdesc.h | 16 +- bsd/net/bridgestp.c | 2425 +++++ bsd/net/bridgestp.h | 441 + bsd/net/dlil.c | 4276 ++++---- bsd/net/dlil.h | 121 +- bsd/net/ether_if_module.c | 45 +- bsd/net/ether_inet6_pr_module.c | 174 +- bsd/net/ether_inet_pr_module.c | 512 +- bsd/net/ethernet.h | 28 +- bsd/net/if.c | 1851 ++-- bsd/net/if.h | 51 +- bsd/net/if_atm.h | 136 - bsd/net/if_bond.c | 101 +- bsd/net/if_bridge.c | 5138 ++++++++++ bsd/net/if_bridgevar.h | 499 + bsd/net/if_disc.c | 240 - bsd/net/if_dummy.c | 290 - bsd/net/if_ethersubr.c | 229 - bsd/net/if_fddisubr.c | 637 -- bsd/net/if_gif.c | 134 +- bsd/net/if_gif.h | 1 + bsd/net/if_llreach.c | 565 ++ bsd/net/if_llreach.h | 150 + bsd/net/if_loop.c | 22 +- bsd/net/if_media.h | 2 +- bsd/net/if_mib.c | 64 +- bsd/net/if_mib.h | 2 +- bsd/net/if_pflog.c | 5 +- bsd/net/if_stf.c | 119 +- bsd/net/if_types.h | 5 +- bsd/net/if_utun.c | 20 +- bsd/net/if_var.h | 747 +- bsd/net/if_vlan.c | 641 +- bsd/net/kext_net.h | 63 +- bsd/net/kpi_interface.c | 1243 +-- bsd/net/kpi_interface.h | 112 +- bsd/net/kpi_protocol.c | 20 +- bsd/net/multicast_list.c | 3 +- bsd/net/ndrv.c | 4 + bsd/net/ndrv.h | 2 +- bsd/net/net_osdep.h | 14 +- bsd/net/net_str_id.c | 12 +- bsd/net/netsrc.c | 253 + .../ppc/cframe.h => bsd/net/netsrc.h | 62 +- bsd/net/ntstat.c | 1954 ++++ bsd/net/ntstat.h | 348 + bsd/net/pf.c | 126 +- bsd/net/pf_if.c | 36 +- bsd/net/pf_ioctl.c | 445 +- bsd/net/pf_osfp.c | 11 +- bsd/net/pf_table.c | 13 +- bsd/net/pfkeyv2.h | 1 + bsd/net/pfvar.h | 75 +- bsd/net/ppp_deflate.c | 4 +- bsd/net/route.c | 974 +- bsd/net/route.h | 76 +- bsd/net/rtsock.c | 965 +- bsd/net/rtsock_mip.c | 76 - bsd/netat/Makefile | 4 - bsd/netat/asp_proto.c | 2 +- bsd/netat/at.c | 78 +- bsd/netat/at_var.h | 4 +- bsd/netat/ddp.c | 13 +- bsd/netat/ddp_lap.c | 10 
+- bsd/netat/sys_glue.c | 4 +- bsd/netinet/Makefile | 9 +- bsd/netinet/icmp6.h | 195 +- bsd/netinet/if_atm.c | 303 - bsd/netinet/if_atm.h | 77 - bsd/netinet/if_fddi.h | 114 - bsd/netinet/igmp.c | 3949 +++++++- bsd/netinet/igmp.h | 104 +- bsd/netinet/igmp_var.h | 239 +- bsd/netinet/in.c | 767 +- bsd/netinet/in.h | 204 +- bsd/netinet/in_arp.c | 621 +- bsd/netinet/in_arp.h | 5 +- bsd/netinet/in_cksum.c | 78 +- bsd/netinet/in_dhcp.c | 22 +- bsd/netinet/in_gif.c | 14 +- bsd/netinet/in_mcast.c | 3641 +++++++ bsd/netinet/in_pcb.c | 436 +- bsd/netinet/in_pcb.h | 147 +- bsd/netinet/in_pcblist.c | 383 + bsd/netinet/in_proto.c | 2 +- bsd/netinet/in_rmx.c | 86 +- bsd/netinet/in_tclass.c | 850 ++ bsd/netinet/in_var.h | 285 +- bsd/netinet/ip6.h | 103 +- bsd/netinet/ip_divert.c | 48 +- bsd/netinet/ip_dummynet.c | 129 +- bsd/netinet/ip_dummynet.h | 2 +- bsd/netinet/ip_encap.c | 14 +- bsd/netinet/ip_encap.h | 2 +- bsd/netinet/ip_flow.c | 380 - bsd/netinet/ip_flow.h | 87 - bsd/netinet/ip_fw.h | 2 + bsd/netinet/ip_fw2.c | 49 +- bsd/netinet/ip_fw2.h | 2 + bsd/netinet/ip_fw2_compat.c | 4 +- bsd/netinet/ip_icmp.c | 56 +- bsd/netinet/ip_id.c | 1 + bsd/netinet/ip_input.c | 376 +- bsd/netinet/ip_mroute.c | 40 +- bsd/netinet/ip_mroute.h | 4 +- bsd/netinet/ip_output.c | 1092 +- bsd/netinet/ip_var.h | 60 +- bsd/netinet/kpi_ipfilter.c | 169 +- bsd/netinet/kpi_ipfilter.h | 9 +- bsd/netinet/raw_ip.c | 179 +- bsd/netinet/tcp.h | 88 +- bsd/netinet/tcp_cc.h | 124 + bsd/netinet/tcp_debug.c | 2 +- bsd/netinet/tcp_input.c | 1305 ++- bsd/netinet/tcp_ledbat.c | 434 + bsd/netinet/tcp_newreno.c | 344 + bsd/netinet/tcp_output.c | 318 +- bsd/netinet/tcp_sack.c | 20 +- bsd/netinet/tcp_seq.h | 2 + bsd/netinet/tcp_subr.c | 440 +- bsd/netinet/tcp_timer.c | 953 +- bsd/netinet/tcp_timer.h | 117 +- bsd/netinet/tcp_usrreq.c | 278 +- bsd/netinet/tcp_var.h | 269 +- bsd/netinet/udp_usrreq.c | 395 +- bsd/netinet6/Makefile | 8 +- bsd/netinet6/ah.h | 2 +- bsd/netinet6/ah6.h | 2 +- bsd/netinet6/ah_core.c | 82 +- 
bsd/netinet6/ah_input.c | 57 +- bsd/netinet6/dest6.c | 10 +- bsd/netinet6/esp6.h | 2 +- bsd/netinet6/esp_core.c | 2 +- bsd/netinet6/esp_input.c | 146 +- bsd/netinet6/frag6.c | 75 +- bsd/netinet6/icmp6.c | 758 +- bsd/netinet6/in6.c | 1609 +-- bsd/netinet6/in6.h | 279 +- bsd/netinet6/in6_cksum.c | 2 +- bsd/netinet6/in6_gif.c | 23 +- bsd/netinet6/in6_gif.h | 2 +- bsd/netinet6/in6_ifattach.c | 448 +- bsd/netinet6/in6_ifattach.h | 40 +- bsd/netinet6/in6_mcast.c | 3490 +++++++ bsd/netinet6/in6_pcb.c | 362 +- bsd/netinet6/in6_pcb.h | 12 +- bsd/netinet6/in6_prefix.c | 241 +- bsd/netinet6/in6_prefix.h | 1 - bsd/netinet6/in6_proto.c | 117 +- bsd/netinet6/in6_rmx.c | 124 +- bsd/netinet6/in6_src.c | 1471 ++- bsd/netinet6/in6_var.h | 391 +- bsd/netinet6/ip6_forward.c | 132 +- bsd/netinet6/ip6_fw.c | 46 +- bsd/netinet6/ip6_fw.h | 2 + bsd/netinet6/ip6_id.c | 304 + bsd/netinet6/ip6_input.c | 863 +- bsd/netinet6/ip6_mroute.c | 141 +- bsd/netinet6/ip6_mroute.h | 16 +- bsd/netinet6/ip6_output.c | 2978 +++--- bsd/netinet6/ip6_var.h | 192 +- bsd/netinet6/ip6protosw.h | 10 +- bsd/netinet6/ipcomp6.h | 2 +- bsd/netinet6/ipcomp_input.c | 11 +- bsd/netinet6/ipsec.c | 156 +- bsd/netinet6/mld6.c | 3577 ++++++- bsd/netinet6/mld6.h | 139 + bsd/netinet6/mld6_var.h | 215 +- bsd/netinet6/nd6.c | 878 +- bsd/netinet6/nd6.h | 243 +- bsd/netinet6/nd6_nbr.c | 1082 +- bsd/netinet6/nd6_rtr.c | 3033 ++++-- bsd/netinet6/raw_ip6.c | 230 +- bsd/netinet6/route6.c | 62 +- bsd/netinet6/scope6.c | 257 +- bsd/netinet6/scope6_var.h | 46 + bsd/netinet6/tcp6_var.h | 32 +- bsd/netinet6/udp6_output.c | 110 +- bsd/netinet6/udp6_usrreq.c | 177 +- bsd/netinet6/udp6_var.h | 2 +- bsd/netkey/Makefile | 4 - bsd/netkey/key.c | 726 +- bsd/nfs/Makefile | 4 - bsd/nfs/krpc.h | 22 +- bsd/nfs/nfs.h | 485 +- bsd/nfs/nfs4_subs.c | 1990 +++- bsd/nfs/nfs4_vnops.c | 6126 ++++++++---- bsd/nfs/nfs_bio.c | 260 +- bsd/nfs/nfs_boot.c | 8 +- bsd/nfs/nfs_gss.c | 338 +- bsd/nfs/nfs_gss.h | 17 +- bsd/nfs/nfs_lock.c | 791 +- bsd/nfs/nfs_lock.h | 
23 +- bsd/nfs/nfs_node.c | 275 +- bsd/nfs/nfs_serv.c | 211 +- bsd/nfs/nfs_socket.c | 2481 ++++- bsd/nfs/nfs_srvcache.c | 31 +- bsd/nfs/nfs_subs.c | 687 +- bsd/nfs/nfs_syscalls.c | 163 +- bsd/nfs/nfs_vfsops.c | 3856 ++++++-- bsd/nfs/nfs_vnops.c | 1388 ++- bsd/nfs/nfsm_subs.h | 68 +- bsd/nfs/nfsmount.h | 229 +- bsd/nfs/nfsnode.h | 136 +- bsd/nfs/nfsproto.h | 28 +- bsd/nfs/nfsrvcache.h | 11 +- bsd/nfs/rpcv2.h | 5 +- bsd/nfs/xdr_subs.h | 417 +- bsd/ppc/Makefile | 33 - bsd/ppc/_limits.h | 27 - bsd/ppc/_param.h | 46 - bsd/ppc/_structs.h | 217 - bsd/ppc/_types.h | 120 - bsd/ppc/decodePPC.h | 919 -- bsd/ppc/endian.h | 124 - bsd/ppc/exec.h | 108 - bsd/ppc/fasttrap_isa.h | 106 - bsd/ppc/limits.h | 107 - bsd/ppc/param.h | 141 - bsd/ppc/profile.h | 58 - bsd/ppc/reboot.h | 55 - bsd/ppc/setjmp.h | 121 - bsd/ppc/signal.h | 83 - bsd/ppc/types.h | 172 - bsd/ppc/ucontext.h | 73 - bsd/ppc/vmparam.h | 66 - bsd/security/Makefile | 4 - bsd/security/audit/Makefile | 4 - bsd/security/audit/audit.c | 83 +- bsd/security/audit/audit.h | 19 +- bsd/security/audit/audit_arg.c | 8 +- bsd/security/audit/audit_bsd.c | 169 +- bsd/security/audit/audit_bsd.h | 74 +- bsd/security/audit/audit_bsm.c | 22 +- bsd/security/audit/audit_ioctl.h | 25 + bsd/security/audit/audit_private.h | 5 +- bsd/security/audit/audit_session.c | 1949 ++-- bsd/security/audit/audit_syscalls.c | 214 +- bsd/security/audit/audit_worker.c | 33 +- bsd/sys/Makefile | 49 +- bsd/sys/attr.h | 31 +- bsd/sys/buf.h | 62 +- bsd/sys/buf_internal.h | 36 +- bsd/sys/cdefs.h | 122 +- bsd/sys/codesign.h | 3 + bsd/sys/conf.h | 32 +- .../sys/content_protection.h | 32 +- bsd/sys/cprotect.h | 117 +- bsd/sys/decmpfs.h | 2 +- bsd/sys/disk.h | 17 +- bsd/sys/dtrace.h | 70 +- bsd/sys/dtrace_glue.h | 96 +- bsd/sys/dtrace_impl.h | 39 + bsd/sys/errno.h | 56 +- bsd/sys/event.h | 47 +- bsd/sys/fasttrap_impl.h | 5 +- bsd/sys/fbt.h | 15 +- bsd/sys/fcntl.h | 63 +- bsd/sys/file.h | 2 + bsd/sys/file_internal.h | 10 +- bsd/sys/filedesc.h | 6 +- 
osfmk/ppc/machine_cpu.h => bsd/sys/fileport.h | 39 +- bsd/sys/fsctl.h | 176 +- bsd/sys/fsevents.h | 1 + bsd/sys/fslog.h | 8 + bsd/sys/imageboot.h | 11 +- bsd/sys/imgact.h | 30 +- bsd/{ppc/ptrace.h => sys/imgsrc.h} | 46 +- bsd/sys/kauth.h | 31 +- bsd/sys/kdebug.h | 168 +- bsd/sys/kern_control.h | 6 + bsd/sys/kern_memorystatus.h | 42 +- bsd/sys/kpi_mbuf.h | 61 +- bsd/sys/kpi_socket.h | 32 +- bsd/sys/make_posix_availability.sh | 71 + bsd/sys/make_symbol_aliasing.sh | 86 + bsd/sys/malloc.h | 20 +- bsd/sys/mbuf.h | 623 +- bsd/sys/mcache.h | 51 +- bsd/sys/mman.h | 1 + bsd/sys/mount.h | 89 +- bsd/sys/mount_internal.h | 53 +- bsd/sys/msgbuf.h | 22 +- bsd/sys/namei.h | 56 +- bsd/{dev/ppc/memmove.c => sys/netboot.h} | 38 +- bsd/sys/priv.h | 95 + bsd/sys/proc.h | 36 +- bsd/sys/proc_info.h | 90 +- bsd/sys/proc_internal.h | 42 +- bsd/sys/process_policy.h | 177 + bsd/sys/protosw.h | 16 +- bsd/sys/pthread_internal.h | 41 +- bsd/sys/queue.h | 68 +- bsd/sys/reboot.h | 2 +- bsd/sys/resource.h | 13 +- bsd/sys/sdt_impl.h | 5 +- bsd/sys/signal.h | 6 - bsd/sys/socket.h | 91 +- bsd/sys/socketvar.h | 144 +- bsd/sys/sockio.h | 17 +- bsd/sys/spawn.h | 9 + bsd/sys/spawn_internal.h | 5 +- bsd/sys/stat.h | 13 +- bsd/sys/sys_domain.h | 3 +- bsd/sys/sysctl.h | 123 +- bsd/sys/sysent.h | 5 +- bsd/sys/syslog.h | 164 +- bsd/sys/systm.h | 11 +- bsd/sys/time.h | 4 +- bsd/sys/tree.h | 693 +- bsd/sys/tty.h | 2 + bsd/sys/ubc.h | 1 + bsd/sys/ubc_internal.h | 12 + bsd/sys/ucontext.h | 6 - bsd/sys/ucred.h | 12 +- bsd/sys/un.h | 5 + bsd/sys/unistd.h | 3 +- bsd/sys/unpcb.h | 5 +- bsd/sys/user.h | 30 +- bsd/sys/vfs_context.h | 2 + bsd/sys/vnode.h | 312 +- bsd/sys/vnode_if.h | 211 +- bsd/sys/vnode_internal.h | 99 +- bsd/sys/xattr.h | 16 +- bsd/uuid/Makefile | 4 - bsd/vfs/Makefile | 4 - bsd/vfs/kpi_vfs.c | 942 +- bsd/vfs/vfs_attrlist.c | 211 +- bsd/vfs/vfs_bio.c | 787 +- bsd/vfs/vfs_cache.c | 103 +- bsd/vfs/vfs_cluster.c | 538 +- bsd/vfs/vfs_conf.c | 26 +- bsd/vfs/vfs_fsevents.c | 28 +- bsd/vfs/vfs_fslog.c | 82 
+- bsd/vfs/vfs_init.c | 18 +- bsd/vfs/vfs_journal.c | 4873 +++++---- bsd/vfs/vfs_journal.h | 51 +- bsd/vfs/vfs_lookup.c | 1533 +-- bsd/vfs/vfs_subr.c | 2005 +++- bsd/vfs/vfs_syscalls.c | 2813 ++++-- bsd/vfs/vfs_utfconv.c | 17 +- bsd/vfs/vfs_vnops.c | 518 +- bsd/vfs/vfs_xattr.c | 87 +- bsd/vfs/vnode_if.c | 95 + bsd/vm/Makefile | 4 - bsd/vm/dp_backing_file.c | 62 +- bsd/vm/vm_unix.c | 481 +- bsd/vm/vnode_pager.c | 120 +- config/BSDKernel.exports | 16 + config/BSDKernel.ppc.exports | 37 - config/Dummy.exports | 1 + config/IOKit.exports | 10 +- config/IOKit.i386.exports | 3 - config/IOKit.ppc.exports | 383 - config/IOKit.x86_64.exports | 3 - config/Libkern.exports | 3 +- config/Libkern.i386.exports | 3 + config/Libkern.ppc.exports | 29 - config/Libkern.x86_64.exports | 4 +- config/MACFramework.exports | 2 + config/MACFramework.ppc.exports | 9 - config/Mach.ppc.exports | 1 - config/Makefile | 153 +- config/MasterVersion | 2 +- config/Private.exports | 42 +- config/Private.i386.exports | 23 +- config/Private.ppc.exports | 2 - config/Private.x86_64.exports | 22 + config/System6.0.exports | 8 +- config/System6.0.i386.exports | 4 +- config/System6.0.ppc.exports | 256 - config/Unsupported.exports | 10 +- config/Unsupported.i386.exports | 4 +- config/Unsupported.ppc.exports | 118 - config/Unsupported.x86_64.exports | 1 + config/version.c | 2 + .../IOKit/AppleKeyStoreInterface.h | 50 +- iokit/IOKit/IOBufferMemoryDescriptor.h | 13 +- iokit/IOKit/IOCatalogue.h | 22 +- iokit/IOKit/IOCommandGate.h | 6 +- iokit/IOKit/IODMACommand.h | 2 +- iokit/IOKit/IODataQueueShared.h | 2 +- iokit/IOKit/IOEventSource.h | 33 +- iokit/IOKit/IOHibernatePrivate.h | 58 +- iokit/IOKit/IOInterruptEventSource.h | 11 + iokit/IOKit/IOKitDebug.h | 29 +- iokit/IOKit/IOKitKeys.h | 1 + iokit/IOKit/IOKitKeysPrivate.h | 22 +- iokit/IOKit/IOKitServer.h | 9 +- iokit/IOKit/IOLib.h | 32 +- iokit/IOKit/IOMemoryCursor.h | 80 - iokit/IOKit/IOMemoryDescriptor.h | 26 +- iokit/IOKit/IOMessage.h | 176 +- 
iokit/IOKit/IONVRAM.h | 12 +- iokit/IOKit/IOPlatformExpert.h | 9 +- iokit/IOKit/IOService.h | 96 +- iokit/IOKit/IOServicePM.h | 9 + iokit/IOKit/IOSharedLock.h | 79 +- iokit/IOKit/IOStatistics.h | 220 + iokit/IOKit/IOStatisticsPrivate.h | 359 + iokit/IOKit/IOTimeStamp.h | 6 +- iokit/IOKit/IOTimerEventSource.h | 6 +- iokit/IOKit/IOTypes.h | 5 + iokit/IOKit/IOUserClient.h | 16 +- iokit/IOKit/IOWorkLoop.h | 39 +- iokit/IOKit/Makefile | 29 +- iokit/IOKit/i386/IOSharedLockImp.h | 113 - iokit/IOKit/machine/Makefile | 2 - iokit/IOKit/nvram/Makefile | 2 - iokit/IOKit/platform/Makefile | 2 - iokit/IOKit/power/Makefile | 2 - iokit/IOKit/ppc/IODBDMA.h | 367 - iokit/IOKit/ppc/IOSharedLockImp.h | 199 - iokit/IOKit/ppc/Makefile | 32 - iokit/IOKit/pwr_mgt/IOPM.h | 105 +- iokit/IOKit/pwr_mgt/IOPMDeprecated.h | 177 - iokit/IOKit/pwr_mgt/IOPMPrivate.h | 436 +- iokit/IOKit/pwr_mgt/IOPowerConnection.h | 22 +- iokit/IOKit/pwr_mgt/Makefile | 4 +- iokit/IOKit/pwr_mgt/RootDomain.h | 348 +- iokit/IOKit/rtc/Makefile | 2 - iokit/IOKit/system_management/Makefile | 2 - iokit/Kernel/IOBufferMemoryDescriptor.cpp | 10 - iokit/Kernel/IOCPU.cpp | 13 +- iokit/Kernel/IOCatalogue.cpp | 248 +- iokit/Kernel/IOCommandGate.cpp | 40 +- iokit/Kernel/IOCommandQueue.cpp | 27 +- iokit/Kernel/IODMACommand.cpp | 26 +- iokit/Kernel/IODMAController.cpp | 4 +- iokit/Kernel/IODeviceTreeSupport.cpp | 54 +- iokit/Kernel/IOEventSource.cpp | 109 +- iokit/Kernel/IOFilterInterruptEventSource.cpp | 45 +- iokit/Kernel/IOHibernateIO.cpp | 663 +- iokit/Kernel/IOHibernateInternal.h | 13 +- iokit/Kernel/IOHibernateRestoreKernel.c | 751 +- iokit/Kernel/IOInterruptController.cpp | 66 +- iokit/Kernel/IOInterruptEventSource.cpp | 90 +- iokit/Kernel/IOKitDebug.cpp | 14 +- iokit/Kernel/IOKitKernelInternal.h | 27 + iokit/Kernel/IOLib.cpp | 80 +- iokit/Kernel/IOMemoryCursor.cpp | 63 - iokit/Kernel/IOMemoryDescriptor.cpp | 117 +- iokit/Kernel/IONVRAM.cpp | 93 +- iokit/Kernel/IOPMPowerSource.cpp | 20 +- iokit/Kernel/IOPMrootDomain.cpp | 
6646 ++++++++----- iokit/Kernel/IOPlatformExpert.cpp | 127 +- iokit/Kernel/IORegistryEntry.cpp | 7 +- iokit/Kernel/IOService.cpp | 296 +- iokit/Kernel/IOServicePM.cpp | 4078 +++++--- iokit/Kernel/IOServicePMPrivate.h | 493 +- iokit/Kernel/IOServicePrivate.h | 4 + iokit/Kernel/IOStartIOKit.cpp | 23 +- iokit/Kernel/IOStatistics.cpp | 1279 +++ iokit/Kernel/IOTimerEventSource.cpp | 49 +- iokit/Kernel/IOUserClient.cpp | 287 +- iokit/Kernel/IOWorkLoop.cpp | 356 +- iokit/Kernel/RootDomainUserClient.cpp | 303 +- iokit/Kernel/RootDomainUserClient.h | 27 +- iokit/Kernel/i386/IOKeyStoreHelper.cpp | 104 + iokit/Kernel/i386/IOSharedLock.s | 59 +- iokit/Kernel/ppc/IOAsmSupport.s | 120 - iokit/Kernel/ppc/IODBDMA.cpp | 161 - iokit/Kernel/x86_64/IOSharedLock.s | 55 +- iokit/KernelConfigTables.cpp | 26 +- iokit/Makefile | 5 +- iokit/bsddev/DINetBootHook.cpp | 132 +- iokit/bsddev/IOKitBSDInit.cpp | 8 +- iokit/conf/MASTER | 13 +- iokit/conf/MASTER.i386 | 5 +- iokit/conf/MASTER.ppc | 18 - iokit/conf/MASTER.x86_64 | 5 +- iokit/conf/Makefile | 19 +- iokit/conf/Makefile.i386 | 14 +- iokit/conf/Makefile.ppc | 27 - iokit/conf/Makefile.template | 22 +- iokit/conf/Makefile.x86_64 | 14 +- iokit/conf/files | 4 +- iokit/conf/files.i386 | 5 +- iokit/conf/files.ppc | 20 - iokit/conf/files.x86_64 | 5 +- iokit/conf/tools/Makefile | 32 - iokit/conf/tools/doconf/Makefile | 47 - iokit/conf/tools/doconf/doconf.csh | 321 - kgmacros | 4263 ++++++-- libkern/Makefile | 17 +- libkern/OSKextLib.cpp | 67 +- libkern/OSKextVersion.c | 1 + libkern/c++/OSKext.cpp | 2820 ++++-- libkern/c++/OSMetaClass.cpp | 1 + libkern/c++/OSObject.cpp | 26 - libkern/c++/OSObjectAsm.s | 75 - libkern/c++/OSOrderedSet.cpp | 4 +- libkern/c++/OSRuntime.cpp | 43 +- libkern/c++/OSSet.cpp | 31 +- libkern/c++/OSSymbol.cpp | 11 +- .../TestSerialization/test1/test1_main.cpp | 0 libkern/conf/MASTER | 7 + libkern/conf/MASTER.i386 | 5 +- libkern/conf/MASTER.ppc | 19 - libkern/conf/MASTER.x86_64 | 5 +- libkern/conf/Makefile | 19 +- 
libkern/conf/Makefile.i386 | 6 + libkern/conf/Makefile.ppc | 7 - libkern/conf/Makefile.template | 17 +- libkern/conf/Makefile.x86_64 | 6 + libkern/conf/files | 3 +- libkern/conf/files.i386 | 7 + libkern/conf/files.ppc | 6 - libkern/conf/files.x86_64 | 7 + libkern/conf/tools/Makefile | 32 - libkern/conf/tools/doconf/Makefile | 47 - libkern/conf/tools/doconf/doconf.csh | 321 - libkern/crypto/intel/sha1edp.h | 51 + libkern/crypto/intel/sha1edp.s | 1481 +++ libkern/crypto/sha1.c | 55 +- libkern/gen/OSAtomicOperations.c | 11 +- libkern/gen/OSDebug.cpp | 43 +- libkern/kernel_mach_header.c | 27 + libkern/kmod/Makefile.kmod | 18 +- libkern/kmod/cplus_start.c | 7 +- libkern/kmod/cplus_stop.c | 7 +- libkern/kxld/Makefile | 94 +- {iokit/Kernel => libkern/kxld}/WKdmCompress.c | 6 +- .../Kernel => libkern/kxld}/WKdmDecompress.c | 0 libkern/kxld/i386/WKdmCompress.s | 597 ++ libkern/kxld/i386/WKdmDecompress.s | 675 ++ libkern/kxld/kxld.c | 456 +- libkern/kxld/kxld_array.c | 3 + libkern/kxld/kxld_copyright.c | 34 +- libkern/kxld/kxld_demangle.c | 28 + libkern/kxld/kxld_demangle.h | 28 + libkern/kxld/kxld_kext.c | 3260 ++---- libkern/kxld/kxld_kext.h | 85 +- libkern/kxld/kxld_object.c | 2185 ++++ libkern/kxld/kxld_object.h | 159 + libkern/kxld/kxld_reloc.c | 298 +- libkern/kxld/kxld_reloc.h | 54 +- libkern/kxld/kxld_sect.c | 25 +- libkern/kxld/kxld_sect.h | 5 +- libkern/kxld/kxld_seg.c | 44 +- libkern/kxld/kxld_seg.h | 10 +- libkern/kxld/kxld_state.c | 1072 -- libkern/kxld/kxld_state.h | 155 - libkern/kxld/kxld_stubs.c | 25 +- libkern/kxld/kxld_sym.c | 222 +- libkern/kxld/kxld_sym.h | 19 +- libkern/kxld/kxld_symtab.c | 212 +- libkern/kxld/kxld_symtab.h | 33 +- libkern/kxld/kxld_util.c | 28 +- libkern/kxld/kxld_util.h | 12 +- libkern/kxld/kxld_vtable.c | 531 +- libkern/kxld/kxld_vtable.h | 41 +- libkern/kxld/tests/kextcopyright.c | 29 + libkern/kxld/tests/kxld_array_test.c | 160 + libkern/kxld/tests/kxld_dict_test.c | 44 +- .../psl.h => libkern/kxld/tests/kxld_test.c | 26 +- 
.../kxld/tests/kxld_test.h | 13 +- libkern/kxld/tests/loadtest.py | 28 + libkern/libkern/Makefile | 13 +- libkern/libkern/OSAtomic.h | 113 +- libkern/libkern/OSAtomic.h.save | 305 - libkern/libkern/OSByteOrder.h | 4 +- libkern/libkern/OSCrossEndian.h | 21 - libkern/libkern/OSDebug.h | 3 + libkern/libkern/OSKextLib.h | 40 +- libkern/libkern/OSKextLibPrivate.h | 122 +- {iokit/Kernel => libkern/libkern}/WKdm.h | 6 +- libkern/libkern/_OSByteOrder.h | 2 +- libkern/libkern/c++/Makefile | 4 - libkern/libkern/c++/OSKext.h | 95 +- libkern/libkern/c++/OSMetaClass.h | 9 +- libkern/libkern/c++/OSObject.h | 21 +- libkern/libkern/c++/OSOrderedSet.h | 4 +- libkern/libkern/c++/OSSet.h | 36 +- libkern/libkern/crypto/Makefile | 4 - libkern/libkern/crypto/sha1.h | 2 + libkern/libkern/kernel_mach_header.h | 6 +- libkern/libkern/kext_request_keys.h | 42 +- libkern/libkern/kxld.h | 45 +- libkern/libkern/kxld_types.h | 26 +- libkern/libkern/machine/Makefile | 4 - libkern/libkern/mkext.h | 1 + libkern/libkern/ppc/Makefile | 31 - libkern/libkern/ppc/OSByteOrder.h | 206 - libkern/libkern/prelink.h | 2 +- libkern/libkern/tree.h | 802 ++ libkern/libkern/version.h.template | 42 +- libkern/ppc/OSAtomic.s | 104 - libkern/ppc/bcmp.s | 92 - libkern/ppc/memcmp.s | 106 - libkern/ppc/strlen.s | 118 - libkern/uuid/Makefile | 4 - libkern/uuid/uuid.c | 21 +- libkern/x86_64/OSAtomic.s | 16 +- libkern/zlib/adler32.c | 22 +- libkern/zlib/arm/adler32vec.s | 428 - libkern/zlib/arm/inffastS.s | 565 -- libkern/zlib/inffast.c | 9 +- libkern/zlib/intel/adler32vec.s | 1050 ++ libkern/zlib/intel/inffastS.s | 1179 +++ libsa/Makefile | 4 +- libsa/bootstrap.cpp | 113 +- libsa/conf/MASTER | 1 - libsa/conf/MASTER.i386 | 1 - libsa/conf/MASTER.ppc | 18 - libsa/conf/MASTER.x86_64 | 1 - libsa/conf/Makefile | 19 +- libsa/conf/Makefile.i386 | 1 + libsa/conf/Makefile.ppc | 7 - libsa/conf/Makefile.template | 17 +- libsa/conf/Makefile.x86_64 | 1 + libsa/conf/files.ppc | 1 - libsa/conf/tools/Makefile | 32 - 
libsa/conf/tools/doconf/Makefile | 47 - libsa/conf/tools/doconf/doconf.csh | 321 - libsa/lastkernelconstructor.c | 4 +- libsa/libsa/Makefile | 2 - libsyscall/BSDmakefile | 141 - libsyscall/GNUmakefile | 8 - libsyscall/Libsyscall.xcconfig | 31 + .../Libsyscall.xcodeproj/project.pbxproj | 1029 ++ libsyscall/Makefile | 65 - libsyscall/Makefile.inc | 52 - libsyscall/Makefile.xbs | 130 - libsyscall/Platforms/MacOSX/i386/syscall.map | 93 + .../Platforms/MacOSX/x86_64/syscall.map | 54 + libsyscall/Platforms/syscall.map | 16 + libsyscall/create-syscalls.pl | 266 - libsyscall/custom/SYS.h | 47 +- libsyscall/custom/__fork.s | 176 +- libsyscall/custom/__getpid.s | 40 +- libsyscall/custom/__gettimeofday.s | 16 +- libsyscall/custom/__lseek.s | 6 +- libsyscall/custom/__pipe.s | 16 +- libsyscall/custom/__psynch_cvbroad.s | 4 +- libsyscall/custom/__psynch_cvwait.s | 4 +- libsyscall/custom/__ptrace.s | 14 +- libsyscall/custom/__sigaltstack.s | 6 +- libsyscall/custom/__sigreturn.s | 6 +- libsyscall/custom/__syscall.s | 8 +- libsyscall/custom/__thread_selfid.s | 4 +- libsyscall/custom/__vfork.s | 51 +- libsyscall/custom/custom.s | 39 +- .../custom/errno.c | 3 +- libsyscall/include/Makefile.inc | 1 - libsyscall/include/processor_facilities.h | 36 - libsyscall/mach/Makefile.inc | 74 - .../mach/abort.h | 15 +- libsyscall/mach/bootstrap_ports.c | 72 - libsyscall/mach/brk.2 | 150 - libsyscall/mach/clock_sleep.c | 13 +- .../mach/dylib_link.c | 4 +- libsyscall/mach/err_iokit.sub | 16 +- libsyscall/mach/err_ipc.sub | 6 +- libsyscall/mach/err_kern.sub | 4 +- libsyscall/mach/err_libkern.sub | 6 +- libsyscall/mach/err_mach_ipc.sub | 6 +- libsyscall/mach/err_server.sub | 32 +- libsyscall/mach/error_codes.c | 4 +- libsyscall/mach/errorlib.h | 15 +- libsyscall/mach/exc_catcher.c | 36 +- libsyscall/mach/exc_catcher.h | 64 + libsyscall/mach/exc_catcher_state.c | 35 +- libsyscall/mach/exc_catcher_state_identity.c | 35 +- libsyscall/mach/fprintf_stderr.c | 22 +- libsyscall/mach/headers/Makefile.inc | 
10 - libsyscall/mach/i386/Makefile.inc | 3 - libsyscall/mach/{headers => mach}/errorlib.h | 12 +- libsyscall/mach/{headers => mach}/mach.h | 0 .../mach/{headers => mach}/mach_error.h | 0 libsyscall/mach/{headers => mach}/mach_init.h | 7 +- .../mach/{headers => mach}/mach_interface.h | 0 libsyscall/mach/{headers => mach}/port_obj.h | 0 libsyscall/mach/{headers => mach}/sync.h | 0 libsyscall/mach/{headers => mach}/task.h | 4 - .../mach/{headers => mach}/thread_act.h | 4 - libsyscall/mach/{headers => mach}/vm_task.h | 0 libsyscall/mach/mach_error.c | 16 +- libsyscall/mach/mach_error_string.c | 1 - libsyscall/mach/mach_init.c | 197 +- libsyscall/mach/mach_init_libSystem.c | 58 - libsyscall/mach/mach_init_ports.c | 140 - .../mach/mach_legacy.c | 29 +- libsyscall/mach/mach_msg.c | 68 +- libsyscall/mach/mig_allocate.c | 2 +- libsyscall/mach/mig_deallocate.c | 2 +- libsyscall/mach/mig_reply_port.c | 95 + .../rpc.h => libsyscall/mach/mig_reply_port.h | 13 +- libsyscall/mach/mig_strncpy.c | 23 +- libsyscall/mach/ms_thread_switch.c | 10 +- libsyscall/mach/panic.c | 19 +- libsyscall/mach/port_obj.c | 2 +- libsyscall/mach/ppc/Makefile.inc | 3 - libsyscall/mach/ppc64/Makefile.inc | 4 - libsyscall/mach/sbrk.c | 78 - libsyscall/mach/servers/Makefile.inc | 16 - libsyscall/mach/slot_name.c | 20 - libsyscall/mach/string.c | 120 + .../mach/string.h | 41 +- libsyscall/mach/x86_64/Makefile.inc | 3 - libsyscall/wrappers/__get_cpu_capabilities.s | 49 + .../wrappers/_errno.h | 8 +- .../wrappers/_libc_funcptr.c | 78 +- .../wrappers/_libkernel_init.c | 36 +- .../wrappers/_libkernel_init.h | 38 +- libsyscall/wrappers/cancelable/fcntl-base.c | 65 + .../cancelable/fcntl-cancel.c} | 15 +- libsyscall/wrappers/cancelable/fcntl.c | 34 + .../wrappers/cancelable/select-cancel.c | 26 + libsyscall/wrappers/cancelable/select.c | 27 + .../wrappers/cancelable/sigsuspend-cancel.c | 26 + .../cancelable/sigsuspend.c} | 13 +- .../wrappers/init_cpu_capabilities.c | 33 +- libsyscall/wrappers/ioctl.c | 47 + 
libsyscall/wrappers/kill.c | 43 + libsyscall/wrappers/legacy/accept.c | 56 + libsyscall/wrappers/legacy/bind.c | 56 + libsyscall/wrappers/legacy/connect.c | 56 + libsyscall/wrappers/legacy/getattrlist.c | 65 + libsyscall/wrappers/legacy/getpeername.c | 56 + libsyscall/wrappers/legacy/getsockname.c | 56 + libsyscall/wrappers/legacy/kill.c | 30 + libsyscall/wrappers/legacy/lchown.c | 55 + .../wrappers/legacy/listen.c | 47 +- libsyscall/wrappers/legacy/mprotect.c | 69 + libsyscall/wrappers/legacy/msync.c | 53 + libsyscall/wrappers/legacy/munmap.c | 65 + libsyscall/wrappers/legacy/open.c | 54 + libsyscall/wrappers/legacy/recvfrom.c | 55 + libsyscall/wrappers/legacy/recvmsg.c | 55 + libsyscall/wrappers/legacy/select-pre1050.c | 32 + libsyscall/wrappers/legacy/select.c | 31 + libsyscall/wrappers/legacy/sendmsg.c | 56 + libsyscall/wrappers/legacy/sendto.c | 56 + libsyscall/wrappers/legacy/setattrlist.c | 65 + libsyscall/wrappers/legacy/sigsuspend.c | 31 + libsyscall/wrappers/legacy/socketpair.c | 57 + libsyscall/wrappers/memcpy.c | 143 + libsyscall/wrappers/remove-counter.c | 49 + libsyscall/wrappers/rename.c | 33 + libsyscall/wrappers/rmdir.c | 33 + libsyscall/wrappers/select-base.c | 82 + libsyscall/wrappers/sigsuspend-base.c | 41 + libsyscall/wrappers/unix03/chmod.c | 62 + libsyscall/wrappers/unix03/fchmod.c | 62 + libsyscall/wrappers/unix03/getrlimit.c | 46 + libsyscall/wrappers/unix03/mmap.c | 62 + libsyscall/wrappers/unix03/setrlimit.c | 46 + libsyscall/wrappers/unlink.c | 33 + libsyscall/xcodescripts/compat-symlinks.sh | 32 + libsyscall/xcodescripts/compile-syscalls.pl | 130 + libsyscall/xcodescripts/create-syscalls.pl | 403 + libsyscall/xcodescripts/mach_install_mig.sh | 97 + makedefs/MakeInc.cmd | 127 +- makedefs/MakeInc.def | 202 +- makedefs/MakeInc.dir | 147 +- makedefs/MakeInc.rule | 131 +- osfmk/Makefile | 16 +- osfmk/UserNotification/Makefile | 4 - osfmk/UserNotification/UNDRequest.defs | 5 +- osfmk/chud/chud_cpu.c | 12 - osfmk/chud/chud_thread.c | 47 +- 
osfmk/chud/chud_xnu.h | 23 +- osfmk/chud/chud_xnu_glue.h | 4 +- osfmk/chud/chud_xnu_private.h | 4 +- osfmk/chud/i386/chud_osfmk_callback_i386.c | 90 +- osfmk/chud/i386/chud_thread_i386.c | 7 - osfmk/chud/ppc/chud_cpu_asm.h | 38 - osfmk/chud/ppc/chud_cpu_asm.s | 593 -- osfmk/chud/ppc/chud_cpu_ppc.c | 1182 --- osfmk/chud/ppc/chud_osfmk_callback_ppc.c | 549 - osfmk/chud/ppc/chud_spr.h | 273 - osfmk/chud/ppc/chud_thread_ppc.c | 586 -- osfmk/chud/ppc/chud_xnu_private.h | 59 - osfmk/conf/MASTER | 29 +- osfmk/conf/MASTER.i386 | 6 +- osfmk/conf/MASTER.ppc | 67 - osfmk/conf/MASTER.x86_64 | 12 +- osfmk/conf/Makefile | 17 +- osfmk/conf/Makefile.i386 | 15 +- osfmk/conf/Makefile.ppc | 76 - osfmk/conf/Makefile.template | 24 +- osfmk/conf/Makefile.x86_64 | 27 +- osfmk/conf/files | 15 +- osfmk/conf/files.i386 | 44 +- osfmk/conf/files.ppc | 120 - osfmk/conf/files.x86_64 | 41 +- osfmk/conf/tools/Makefile | 32 - osfmk/conf/tools/doconf/Makefile | 47 - osfmk/conf/tools/doconf/doconf.csh | 321 - osfmk/console/i386/serial_console.c | 10 +- osfmk/console/ppc/serial_console.c | 329 - osfmk/console/ppc/video_scroll.s | 141 - osfmk/console/serial_general.c | 1 - osfmk/console/serial_protos.h | 11 +- osfmk/console/video_console.c | 10 +- osfmk/ddb/db_command.c | 70 - osfmk/ddb/db_print.c | 2 +- osfmk/ddb/db_sym.c | 4 +- osfmk/ddb/db_trap.c | 4 - osfmk/ddb/db_variables.c | 4 +- osfmk/ddb/db_variables.h | 2 +- osfmk/ddb/makedis.c | 5 +- osfmk/default_pager/default_pager.c | 1 + osfmk/default_pager/default_pager_internal.h | 13 +- osfmk/default_pager/dp_backing_store.c | 448 +- osfmk/default_pager/dp_memory_object.c | 28 + osfmk/device/device.defs | 8 +- osfmk/device/iokit_rpc.c | 49 +- osfmk/device/subrs.c | 8 +- osfmk/gssd/Makefile | 4 - osfmk/gssd/gssd_mach.defs | 106 +- osfmk/gssd/gssd_mach_types.h | 45 +- osfmk/i386/AT386/model_dep.c | 242 +- osfmk/i386/Diagnostics.h | 4 +- osfmk/i386/Makefile | 8 +- osfmk/i386/acpi.c | 23 +- osfmk/i386/asm.h | 100 + osfmk/i386/bsd_i386.c | 283 +- 
osfmk/i386/bsd_i386_native.c | 283 + osfmk/i386/bzero.s | 2 +- osfmk/i386/commpage/atomic.s | 396 - osfmk/i386/commpage/bcopy_scalar.s | 136 - osfmk/i386/commpage/bcopy_sse2.s | 473 - osfmk/i386/commpage/bcopy_sse3x.s | 823 -- osfmk/i386/commpage/bcopy_sse3x_64.s | 820 -- osfmk/i386/commpage/bcopy_sse42.s | 311 - osfmk/i386/commpage/bcopy_sse42_64.s | 301 - osfmk/i386/commpage/bzero_scalar.s | 115 - osfmk/i386/commpage/bzero_sse2.s | 162 - osfmk/i386/commpage/bzero_sse2_64.s | 161 - osfmk/i386/commpage/bzero_sse42.s | 151 - osfmk/i386/commpage/bzero_sse42_64.s | 148 - osfmk/i386/commpage/cacheflush.s | 79 - osfmk/i386/commpage/commpage.c | 149 +- osfmk/i386/commpage/commpage.h | 1 + osfmk/i386/commpage/commpage_asm.s | 78 - osfmk/i386/commpage/commpage_gettimeofday.s | 122 - .../commpage/commpage_mach_absolute_time.s | 173 - osfmk/i386/commpage/commpage_sigs.c | 189 - osfmk/i386/commpage/cpu_number.s | 77 - osfmk/i386/commpage/fifo_queues.s | 74 - osfmk/i386/commpage/longcopy_sse3x.s | 221 - osfmk/i386/commpage/longcopy_sse3x_64.s | 210 - osfmk/i386/commpage/memset_pattern_sse2.s | 183 - osfmk/i386/commpage/memset_pattern_sse2_64.s | 184 - osfmk/i386/commpage/pthreads.s | 111 - osfmk/i386/commpage/spinlocks.s | 189 - osfmk/i386/copyio.c | 621 ++ osfmk/i386/cpu.c | 16 +- osfmk/i386/cpu_capabilities.h | 131 +- osfmk/i386/cpu_data.h | 90 +- osfmk/i386/cpuid.c | 57 +- osfmk/i386/cpuid.h | 2 +- osfmk/i386/cswitch.s | 12 +- osfmk/i386/db_interface.c | 2 + osfmk/i386/db_machdep.h | 4 +- osfmk/i386/db_trace.c | 18 +- osfmk/i386/endian.h | 8 +- osfmk/i386/etimer.c | 126 +- osfmk/i386/fpu.c | 37 +- osfmk/i386/fpu.h | 10 +- osfmk/i386/gdt.c | 14 +- osfmk/i386/genassym.c | 107 +- osfmk/i386/hibernate_i386.c | 18 +- osfmk/i386/hibernate_restore.c | 96 +- osfmk/i386/hw_lock_types.h | 2 +- osfmk/i386/i386_init.c | 114 +- osfmk/i386/i386_lock.s | 705 +- osfmk/i386/i386_vm_init.c | 271 +- osfmk/i386/idle_pt.c | 16 +- osfmk/i386/idt.s | 545 +- osfmk/i386/idt64.s | 717 +- 
osfmk/i386/ipl.h | 112 - osfmk/i386/lapic.c | 866 +- osfmk/i386/lapic.h | 2 + osfmk/i386/lapic_native.c | 919 ++ osfmk/i386/ldt.c | 12 +- osfmk/i386/locks.h | 82 +- osfmk/i386/locks_i386.c | 219 +- osfmk/i386/locore.s | 1241 +-- osfmk/i386/loose_ends.c | 644 +- osfmk/i386/machine_check.c | 53 +- osfmk/i386/machine_check.h | 9 +- osfmk/i386/machine_cpu.h | 1 + osfmk/i386/machine_routines.c | 79 +- osfmk/i386/machine_routines.h | 31 +- osfmk/i386/machine_routines_asm.s | 93 +- osfmk/i386/misc_protos.h | 5 + osfmk/i386/mp.c | 628 +- osfmk/i386/mp.h | 51 +- osfmk/i386/mp_desc.c | 103 +- osfmk/i386/mp_desc.h | 65 +- osfmk/i386/mp_events.h | 2 +- osfmk/i386/mp_native.c | 126 + osfmk/i386/mtrr.c | 5 +- osfmk/i386/pal_hibernate.h | 45 + .../{ppc/cpu_number.h => i386/pal_lock_asm.h} | 19 +- osfmk/i386/pal_native.h | 102 + osfmk/i386/pal_routines.c | 349 + osfmk/i386/pal_routines.h | 184 + osfmk/i386/pal_routines_asm.s | 192 + osfmk/{ppc/mp.h => i386/pal_rtclock_asm.h} | 15 +- osfmk/i386/pcb.c | 665 +- osfmk/i386/pcb_native.c | 652 ++ osfmk/i386/pmCPU.c | 131 +- osfmk/i386/pmCPU.h | 19 +- osfmk/i386/pmap.c | 811 +- osfmk/i386/pmap.h | 119 +- osfmk/i386/pmap_common.c | 505 + osfmk/i386/pmap_internal.h | 515 +- osfmk/i386/pmap_pcid.h | 99 + osfmk/i386/pmap_x86_common.c | 438 +- osfmk/i386/proc_reg.h | 122 +- osfmk/i386/rtclock.c | 231 +- osfmk/i386/rtclock_asm.h | 290 + .../i386/{rtclock.h => rtclock_asm_native.h} | 64 +- osfmk/i386/rtclock_native.c | 202 + .../{ppc/rtclock.h => i386/rtclock_protos.h} | 47 +- osfmk/i386/seg.h | 4 +- osfmk/i386/serial_io.h | 4 +- osfmk/i386/simple_lock.h | 9 +- osfmk/i386/start.s | 27 +- osfmk/i386/startup64.c | 4 - osfmk/i386/thread.h | 166 +- osfmk/i386/trap.c | 448 +- osfmk/i386/trap.h | 11 +- osfmk/i386/trap_native.c | 295 + osfmk/i386/tsc.c | 1 - osfmk/i386/tsc.h | 4 +- osfmk/i386/ucode.c | 201 + osfmk/i386/ucode.h | 30 + osfmk/i386/user_ldt.c | 4 +- osfmk/i386/vmx/vmx_asm.h | 4 +- osfmk/i386/vmx/vmx_cpu.c | 13 +- osfmk/ipc/ipc_entry.c | 
11 +- osfmk/ipc/ipc_entry.h | 2 + osfmk/ipc/ipc_init.c | 21 +- osfmk/ipc/ipc_kmsg.c | 378 +- osfmk/ipc/ipc_kmsg.h | 21 +- osfmk/ipc/ipc_labelh.c | 4 + osfmk/ipc/ipc_mqueue.c | 37 +- osfmk/ipc/ipc_mqueue.h | 2 +- osfmk/ipc/ipc_notify.c | 18 + osfmk/ipc/ipc_notify.h | 5 + osfmk/ipc/ipc_object.c | 46 +- osfmk/ipc/ipc_object.h | 5 + osfmk/ipc/ipc_port.c | 316 +- osfmk/ipc/ipc_port.h | 63 +- osfmk/ipc/ipc_pset.c | 4 +- osfmk/ipc/ipc_right.c | 357 +- osfmk/ipc/ipc_right.h | 15 +- osfmk/ipc/ipc_space.c | 5 + osfmk/ipc/ipc_table.c | 14 +- osfmk/ipc/ipc_table.h | 6 +- osfmk/ipc/ipc_types.h | 1 + osfmk/ipc/mach_debug.c | 32 +- osfmk/ipc/mach_msg.c | 32 +- osfmk/ipc/mach_port.c | 87 +- osfmk/kdp/kdp.c | 156 +- osfmk/kdp/kdp_core.h | 15 +- osfmk/kdp/kdp_dyld.h | 2 +- osfmk/kdp/kdp_en_debugger.h | 1 + osfmk/kdp/kdp_private.h | 1 + osfmk/kdp/kdp_udp.c | 498 +- osfmk/kdp/ml/i386/kdp_vm.c | 102 +- osfmk/kdp/ml/i386/kdp_x86_common.c | 10 +- osfmk/kdp/ml/ppc/kdp_asm.s | 95 - osfmk/kdp/ml/ppc/kdp_machdep.c | 827 -- osfmk/kdp/ml/ppc/kdp_misc.s | 71 - osfmk/kdp/ml/ppc/kdp_vm.c | 570 -- osfmk/kdp/ml/x86_64/kdp_machdep.c | 5 + osfmk/kdp/ml/x86_64/kdp_vm.c | 37 +- osfmk/kern/Makefile | 1 + osfmk/kern/ast.c | 11 +- osfmk/kern/audit_sessionport.c | 139 +- osfmk/kern/audit_sessionport.h | 8 +- osfmk/kern/bsd_kern.c | 54 +- osfmk/kern/call_entry.h | 121 +- osfmk/kern/clock.c | 104 +- osfmk/kern/clock_oldops.c | 2 +- osfmk/kern/debug.c | 103 +- osfmk/kern/debug.h | 43 +- osfmk/kern/etimer.h | 13 +- osfmk/kern/exception.c | 1 - osfmk/kern/extmod_statistics.c | 136 + .../PPCcalls.c => kern/extmod_statistics.h} | 33 +- osfmk/kern/hibernate.c | 11 + osfmk/kern/host.c | 121 +- osfmk/kern/host.h | 4 +- osfmk/kern/host_notify.c | 12 +- osfmk/kern/host_statistics.h | 7 - osfmk/kern/ipc_kobject.c | 2 - osfmk/kern/ipc_mig.c | 46 +- osfmk/kern/ipc_misc.c | 99 +- osfmk/kern/ipc_misc.h | 6 +- osfmk/kern/kalloc.c | 142 +- osfmk/kern/kalloc.h | 6 +- osfmk/kern/kern_types.h | 9 + osfmk/kern/kext_alloc.c | 7 
+- osfmk/kern/kmod.c | 8 +- osfmk/kern/locks.c | 34 +- osfmk/kern/locks.h | 11 +- osfmk/kern/mach_param.h | 2 +- osfmk/kern/machine.c | 23 +- osfmk/kern/misc_protos.h | 5 +- osfmk/kern/mk_sp.c | 26 +- osfmk/kern/pms.h | 11 - osfmk/kern/printf.c | 16 +- osfmk/kern/priority.c | 181 +- osfmk/kern/processor.c | 41 +- osfmk/kern/processor.h | 32 +- osfmk/kern/processor_data.h | 42 + osfmk/kern/queue.c | 65 +- osfmk/kern/queue.h | 52 +- osfmk/kern/sched.h | 126 +- osfmk/kern/sched_average.c | 38 +- osfmk/kern/sched_fixedpriority.c | 727 ++ osfmk/kern/sched_grrr.c | 956 ++ osfmk/kern/sched_prim.c | 1623 ++- osfmk/kern/sched_prim.h | 368 +- osfmk/kern/sched_proto.c | 597 ++ osfmk/kern/stack.c | 111 +- osfmk/kern/startup.c | 113 +- osfmk/kern/startup.h | 1 + osfmk/kern/sync_lock.c | 10 +- osfmk/kern/sync_sema.c | 4 +- osfmk/kern/syscall_subr.c | 44 +- osfmk/kern/syscall_sw.c | 4 + osfmk/kern/syscall_sw.h | 29 +- osfmk/kern/task.c | 281 +- osfmk/kern/task.h | 211 +- osfmk/kern/task_policy.c | 1154 ++- osfmk/kern/thread.c | 195 +- osfmk/kern/thread.h | 93 +- osfmk/kern/thread_act.c | 78 +- osfmk/kern/thread_call.c | 187 +- osfmk/kern/thread_policy.c | 178 +- osfmk/kern/timer_call.c | 552 +- osfmk/kern/timer_call.h | 31 +- osfmk/kern/timer_queue.h | 26 +- osfmk/kern/wait_queue.c | 105 +- osfmk/kern/wait_queue.h | 19 +- osfmk/kern/zalloc.c | 1332 ++- osfmk/kern/zalloc.h | 76 +- osfmk/kextd/Makefile | 4 - osfmk/libsa/machine/types.h | 4 +- osfmk/libsa/ppc/types.h | 71 - osfmk/libsa/types.h | 1 - osfmk/lockd/Makefile | 4 - osfmk/mach/Makefile | 38 +- osfmk/mach/branch_predicates.h | 35 + osfmk/mach/clock_types.h | 1 + osfmk/mach/host_info.h | 45 + osfmk/mach/i386/_structs.h | 28 +- osfmk/mach/i386/_types.h | 221 - osfmk/mach/i386/sdt_isa.h | 8 +- osfmk/mach/i386/thread_status.h | 123 +- osfmk/mach/i386/vm_param.h | 25 +- osfmk/mach/mach_host.defs | 14 + osfmk/mach/mach_port.defs | 13 +- osfmk/mach/mach_traps.h | 51 +- osfmk/mach/mach_types.defs | 5 +- osfmk/mach/mach_types.h | 1 
+ osfmk/mach/mach_vm.defs | 4 + osfmk/mach/machine.h | 2 + osfmk/mach/machine/asm.h | 4 +- osfmk/mach/machine/boolean.h | 4 +- osfmk/mach/machine/exception.h | 4 +- osfmk/mach/machine/kern_return.h | 4 +- osfmk/mach/machine/machine_types.defs | 4 +- osfmk/mach/machine/ndr_def.h | 4 +- osfmk/mach/machine/processor_info.h | 4 +- osfmk/mach/machine/rpc.h | 4 +- osfmk/mach/machine/sdt.h | 64 + osfmk/mach/machine/sdt_isa.h | 4 +- osfmk/mach/machine/syscall_sw.h | 4 +- osfmk/mach/machine/thread_state.h | 4 +- osfmk/mach/machine/thread_status.h | 4 +- osfmk/mach/machine/vm_param.h | 4 +- osfmk/mach/machine/vm_types.h | 4 +- osfmk/mach/memory_object.defs | 4 + osfmk/mach/memory_object_types.h | 17 +- osfmk/mach/message.h | 4 +- osfmk/mach/notify.defs | 12 +- osfmk/mach/notify.h | 11 +- osfmk/mach/port.h | 4 +- osfmk/mach/ppc/Makefile | 35 - osfmk/mach/ppc/_structs.h | 392 - osfmk/mach/ppc/_types.h | 234 - osfmk/mach/ppc/boolean.h | 74 - osfmk/mach/ppc/exception.h | 119 - osfmk/mach/ppc/kern_return.h | 74 - osfmk/mach/ppc/machine_types.defs | 126 - osfmk/mach/ppc/ndr_def.h | 43 - osfmk/mach/ppc/processor_info.h | 176 - osfmk/mach/ppc/sdt_isa.h | 427 - osfmk/mach/ppc/syscall_sw.h | 79 - osfmk/mach/ppc/thread_status.h | 150 - osfmk/mach/ppc/vm_param.h | 110 - osfmk/mach/ppc/vm_types.h | 157 - osfmk/mach/processor.defs | 2 +- osfmk/mach/security.defs | 2 +- osfmk/mach/shared_region.h | 18 +- osfmk/mach/syscall_sw.h | 4 +- osfmk/mach/task.defs | 28 +- osfmk/mach/task_info.h | 30 +- osfmk/mach/task_policy.h | 1 + osfmk/mach/thread_act.defs | 14 +- osfmk/mach/thread_policy.h | 16 + osfmk/mach/vm_prot.h | 9 +- osfmk/mach/vm_region.h | 2 + osfmk/mach/vm_statistics.h | 64 +- osfmk/mach_debug/mach_debug_types.defs | 17 +- osfmk/mach_debug/zone_info.h | 47 +- osfmk/machine/Makefile | 2 + osfmk/machine/asm.h | 4 +- osfmk/machine/ast.h | 4 +- osfmk/machine/ast_types.h | 4 +- osfmk/machine/commpage.h | 4 +- osfmk/machine/cpu_affinity.h | 4 +- osfmk/machine/cpu_capabilities.h | 8 +- 
osfmk/machine/cpu_data.h | 4 +- osfmk/machine/cpu_number.h | 4 +- osfmk/machine/db_machdep.h | 4 +- osfmk/machine/endian.h | 4 +- osfmk/machine/io_map_entries.h | 4 +- osfmk/machine/lock.h | 4 +- osfmk/machine/locks.h | 4 +- osfmk/machine/machine_cpu.h | 4 +- osfmk/machine/machine_routines.h | 4 +- osfmk/machine/machine_rpc.h | 4 +- osfmk/machine/machlimits.h | 4 +- osfmk/machine/machparam.h | 4 +- .../machine/pal_hibernate.h | 18 +- .../machine/pal_routines.h | 11 +- osfmk/machine/pmap.h | 4 +- osfmk/machine/sched_param.h | 4 +- osfmk/machine/setjmp.h | 4 +- osfmk/machine/simple_lock.h | 4 +- osfmk/machine/task.h | 4 +- osfmk/machine/thread.h | 4 +- osfmk/machine/timer.h | 4 +- osfmk/machine/trap.h | 4 +- osfmk/machine/vm_tuning.h | 4 +- osfmk/machine/xpr.h | 4 +- osfmk/pmc/pmc.c | 33 +- osfmk/pmc/pmc.h | 10 + osfmk/ppc/AltiAssist.s | 91 - osfmk/ppc/Diagnostics.c | 571 -- osfmk/ppc/Diagnostics.h | 124 - osfmk/ppc/Emulate.s | 1445 --- osfmk/ppc/Emulate64.s | 957 -- osfmk/ppc/Firmware.h | 166 - osfmk/ppc/Firmware.s | 2517 ----- osfmk/ppc/FirmwareC.c | 338 - osfmk/ppc/FirmwareCalls.h | 81 - osfmk/ppc/Makefile | 36 - osfmk/ppc/PPCcalls.h | 84 - osfmk/ppc/Performance.s | 124 - osfmk/ppc/PseudoKernel.c | 450 - osfmk/ppc/PseudoKernel.h | 99 - osfmk/ppc/_setjmp.s | 194 - osfmk/ppc/aligned_data.s | 209 - osfmk/ppc/asm.h | 781 -- osfmk/ppc/ast.h | 43 - osfmk/ppc/ast_types.h | 41 - osfmk/ppc/atomic_switch.h | 130 - osfmk/ppc/atomic_switch.s | 238 - osfmk/ppc/bat_init.c | 301 - osfmk/ppc/bcopy.s | 981 -- osfmk/ppc/bcopytest.c | 621 -- osfmk/ppc/bits.s | 111 - osfmk/ppc/boot.h | 28 - osfmk/ppc/bzero.s | 331 - osfmk/ppc/cache.s | 389 - osfmk/ppc/commpage/atomic.s | 280 - osfmk/ppc/commpage/bcopy_64.s | 306 - osfmk/ppc/commpage/bcopy_970.s | 626 -- osfmk/ppc/commpage/bcopy_g3.s | 275 - osfmk/ppc/commpage/bcopy_g4.s | 622 -- osfmk/ppc/commpage/bigcopy_970.s | 331 - osfmk/ppc/commpage/bzero_128.s | 173 - osfmk/ppc/commpage/bzero_32.s | 129 - osfmk/ppc/commpage/cacheflush.s | 110 - 
osfmk/ppc/commpage/commpage.c | 679 -- osfmk/ppc/commpage/commpage.h | 92 - osfmk/ppc/commpage/commpage_asm.s | 272 - osfmk/ppc/commpage/gettimeofday.s | 255 - osfmk/ppc/commpage/mach_absolute_time.s | 80 - osfmk/ppc/commpage/memset_64.s | 96 - osfmk/ppc/commpage/memset_g3.s | 132 - osfmk/ppc/commpage/memset_g4.s | 131 - osfmk/ppc/commpage/memset_g5.s | 168 - osfmk/ppc/commpage/pthread.s | 121 - osfmk/ppc/commpage/spinlocks.s | 247 - osfmk/ppc/conf.c | 87 - osfmk/ppc/console_feed.c | 266 - osfmk/ppc/console_feed_entries.h | 48 - osfmk/ppc/cpu.c | 1184 --- osfmk/ppc/cpu_capabilities.h | 254 - osfmk/ppc/cpu_data.h | 63 - osfmk/ppc/cpu_internal.h | 89 - osfmk/ppc/cswtch.s | 2486 ----- osfmk/ppc/db_asm.s | 107 - osfmk/ppc/db_disasm.c | 232 - osfmk/ppc/db_interface.c | 592 -- osfmk/ppc/db_low_trace.c | 1106 --- osfmk/ppc/db_low_trace.h | 62 - osfmk/ppc/db_machdep.h | 186 - osfmk/ppc/db_trace.c | 1122 --- osfmk/ppc/endian.h | 93 - osfmk/ppc/etimer.c | 195 - osfmk/ppc/exception.h | 693 -- osfmk/ppc/fpu_protos.h | 41 - osfmk/ppc/genassym.c | 1438 --- osfmk/ppc/hexfont.h | 301 - osfmk/ppc/hibernate_ppc.c | 213 - osfmk/ppc/hibernate_restore.s | 192 - osfmk/ppc/hw_exception.s | 1832 ---- osfmk/ppc/hw_lock.s | 2187 ---- osfmk/ppc/hw_lock_types.h | 74 - osfmk/ppc/hw_perfmon.c | 959 -- osfmk/ppc/hw_perfmon.h | 122 - osfmk/ppc/hw_perfmon_mmcr.h | 186 - osfmk/ppc/hw_vm.s | 8794 ----------------- osfmk/ppc/instrumentation.h | 61 - osfmk/ppc/interrupt.c | 187 - osfmk/ppc/io_map.c | 131 - osfmk/ppc/io_map_entries.h | 45 - osfmk/ppc/lock.h | 86 - osfmk/ppc/locks.h | 220 - osfmk/ppc/locks_ppc.c | 2360 ----- osfmk/ppc/low_trace.h | 92 - osfmk/ppc/lowglobals.h | 102 - osfmk/ppc/lowmem_vectors.s | 4010 -------- osfmk/ppc/machine_routines.c | 847 -- osfmk/ppc/machine_routines.h | 338 - osfmk/ppc/machine_routines_asm.s | 2345 ----- osfmk/ppc/machine_task.c | 85 - osfmk/ppc/machlimits.h | 92 - osfmk/ppc/machparam.h | 86 - osfmk/ppc/mappings.c | 1805 ---- osfmk/ppc/mappings.h | 499 - 
osfmk/ppc/mcount.s | 81 - osfmk/ppc/mem.h | 68 - osfmk/ppc/misc.c | 120 - osfmk/ppc/misc_asm.s | 287 - osfmk/ppc/misc_protos.h | 138 - osfmk/ppc/model_dep.c | 1045 -- osfmk/ppc/movc.s | 1303 --- osfmk/ppc/new_screen.h | 48 - osfmk/ppc/pcb.c | 672 -- osfmk/ppc/pmap.c | 2121 ---- osfmk/ppc/pmap.h | 338 - osfmk/ppc/pms.c | 743 -- osfmk/ppc/pmsCPU.c | 313 - osfmk/ppc/ppc_disasm.i | 234 - osfmk/ppc/ppc_init.c | 302 - osfmk/ppc/ppc_vm_init.c | 427 - osfmk/ppc/proc_reg.h | 403 - osfmk/ppc/rtclock.c | 306 - osfmk/ppc/savearea.c | 327 - osfmk/ppc/savearea.h | 393 - osfmk/ppc/savearea_asm.s | 1621 --- osfmk/ppc/scc_8530.h | 428 - osfmk/ppc/sched_param.h | 70 - osfmk/ppc/screen_switch.h | 141 - osfmk/ppc/serial_defs.h | 83 - osfmk/ppc/serial_io.c | 659 -- osfmk/ppc/serial_io.h | 150 - osfmk/ppc/setjmp.h | 57 - osfmk/ppc/simple_lock.h | 178 - osfmk/ppc/skiplists.s | 1297 --- osfmk/ppc/spec_reg.h | 47 - osfmk/ppc/start.s | 1283 --- osfmk/ppc/status.c | 1820 ---- osfmk/ppc/task.h | 63 - osfmk/ppc/thread.h | 212 - osfmk/ppc/trap.c | 1012 -- osfmk/ppc/trap.h | 105 - osfmk/ppc/vm_tuning.h | 35 - osfmk/ppc/vmachmon.c | 2024 ---- osfmk/ppc/vmachmon.h | 498 - osfmk/ppc/vmachmon_asm.s | 2368 ----- osfmk/profiling/Makefile | 6 - osfmk/profiling/machine/profile-md.h | 4 +- osfmk/profiling/ppc/profile-md.h | 144 - osfmk/vm/bsd_vm.c | 58 +- osfmk/vm/default_freezer.c | 616 ++ osfmk/vm/default_freezer.h | 160 + osfmk/vm/device_vm.c | 3 +- osfmk/vm/memory_object.c | 613 +- osfmk/vm/memory_object.h | 11 + osfmk/vm/pmap.h | 24 +- osfmk/vm/vm_apple_protect.c | 12 +- osfmk/vm/vm_debug.c | 8 +- osfmk/vm/vm_fault.c | 404 +- osfmk/vm/vm_fault.h | 3 +- osfmk/vm/vm_init.c | 3 + osfmk/vm/vm_kern.c | 52 +- osfmk/vm/vm_map.c | 1433 ++- osfmk/vm/vm_map.h | 68 +- osfmk/vm/vm_map_store.c | 176 + osfmk/vm/vm_map_store.h | 135 + osfmk/vm/vm_map_store_ll.c | 246 + .../machdep.c => osfmk/vm/vm_map_store_ll.h | 52 +- osfmk/vm/vm_map_store_rb.c | 166 + osfmk/vm/vm_map_store_rb.h | 46 + osfmk/vm/vm_object.c | 
1134 ++- osfmk/vm/vm_object.h | 90 +- osfmk/vm/vm_page.h | 142 +- osfmk/vm/vm_pageout.c | 1700 ++-- osfmk/vm/vm_pageout.h | 61 +- osfmk/vm/vm_protos.h | 48 +- osfmk/vm/vm_purgeable_internal.h | 6 - osfmk/vm/vm_resident.c | 635 +- osfmk/vm/vm_shared_region.c | 452 +- osfmk/vm/vm_shared_region.h | 54 +- osfmk/vm/vm_swapfile_pager.c | 3 +- osfmk/vm/vm_user.c | 148 +- osfmk/x86_64/bzero.s | 2 +- osfmk/x86_64/copyio.c | 351 + osfmk/x86_64/cswitch.s | 13 +- osfmk/x86_64/idt64.s | 487 +- osfmk/x86_64/idt_table.h | 93 +- osfmk/x86_64/locore.s | 26 +- osfmk/x86_64/loose_ends.c | 369 +- osfmk/x86_64/machine_routines_asm.s | 41 +- osfmk/x86_64/pal_routines_asm.s | 194 + osfmk/x86_64/pmap.c | 948 +- osfmk/x86_64/pmap_pcid.c | 310 + osfmk/x86_64/start.s | 74 +- pexpert/Makefile | 20 +- pexpert/conf/MASTER | 1 - pexpert/conf/MASTER.i386 | 1 - pexpert/conf/MASTER.ppc | 18 - pexpert/conf/MASTER.x86_64 | 1 - pexpert/conf/Makefile | 19 +- pexpert/conf/Makefile.ppc | 8 - pexpert/conf/Makefile.template | 10 +- pexpert/conf/files.ppc | 7 - pexpert/conf/tools/Makefile | 32 - pexpert/conf/tools/doconf/Makefile | 47 - pexpert/conf/tools/doconf/doconf.csh | 321 - pexpert/gen/bootargs.c | 107 +- pexpert/i386/pe_init.c | 4 +- pexpert/i386/pe_kprintf.c | 15 +- pexpert/i386/pe_serial.c | 1 - pexpert/pexpert/Makefile | 11 - pexpert/pexpert/i386/boot.h | 31 +- pexpert/pexpert/i386/efi.h | 24 +- pexpert/pexpert/machine/boot.h | 4 +- pexpert/pexpert/machine/protos.h | 4 +- pexpert/pexpert/pexpert.h | 11 + pexpert/pexpert/ppc/Makefile | 27 - pexpert/pexpert/ppc/boot.h | 92 - pexpert/pexpert/ppc/interrupts.h | 36 - pexpert/pexpert/ppc/powermac.h | 60 - pexpert/pexpert/ppc/protos.h | 160 - pexpert/pexpert/protos.h | 4 - pexpert/ppc/pe_clock_speed.c | 183 - pexpert/ppc/pe_clock_speed_asm.s | 116 - pexpert/ppc/pe_identify_machine.c | 194 - pexpert/ppc/pe_init.c | 269 - pexpert/ppc/pe_kprintf.c | 154 - security/Makefile | 7 +- security/conf/MASTER | 2 +- security/conf/MASTER.i386 | 2 +- 
security/conf/MASTER.ppc | 31 - security/conf/MASTER.x86_64 | 2 +- security/conf/Makefile | 19 +- security/conf/Makefile.i386 | 11 - security/conf/Makefile.ppc | 18 - security/conf/Makefile.template | 11 +- security/conf/Makefile.x86_64 | 11 - security/conf/files | 1 + security/conf/files.i386 | 1 - security/conf/files.ppc | 1 - security/conf/tools/Makefile | 32 - security/conf/tools/doconf/Makefile | 49 - security/conf/tools/doconf/doconf.csh | 321 - security/conf/tools/newvers/Makefile | 47 - security/mac.h | 18 + security/mac_alloc.h | 1 + security/mac_audit.c | 11 - security/mac_base.c | 147 +- security/mac_framework.h | 27 +- security/mac_internal.h | 38 + security/mac_iokit.c | 27 + security/mac_label.c | 1 + security/mac_net.c | 4 +- security/mac_policy.h | 253 +- security/mac_posix_shm.c | 2 +- security/mac_priv.c | 106 + security/mac_process.c | 48 +- security/mac_stub.c | 20 + security/mac_system.c | 13 + security/mac_vfs.c | 48 +- tools/lockstat/Makefile | 2 +- tools/lockstat/lockstat.c | 6 +- tools/symbolify.py | 82 + tools/tests/MPMMTest/KQMPMMtest.c | 23 + tools/tests/MPMMTest/Makefile | 2 +- tools/tests/affinity/Makefile | 4 +- tools/tests/execperf/Makefile | 79 + tools/tests/execperf/exit-asm.S | 42 + tools/tests/execperf/exit.c | 12 + tools/tests/execperf/printexecinfo.c | 68 + tools/tests/execperf/run.c | 89 + tools/tests/execperf/test.sh | 30 + tools/tests/jitter/Makefile | 16 + tools/tests/jitter/cpu_number.s | 33 + tools/tests/jitter/timer_jitter.c | 480 + tools/tests/kqueue_tests/Makefile | 8 +- ..._readwrite_tests.c => kqueue_file_tests.c} | 380 +- tools/tests/libMicro/AppleReadMe | 107 +- tools/tests/libMicro/Makefile | 31 +- tools/tests/libMicro/Makefile.Darwin | 20 +- tools/tests/libMicro/Makefile.com.Darwin | 0 tools/tests/libMicro/README | 11 + tools/tests/libMicro/apple/Makefile.Darwin | 27 +- .../tests/libMicro/apple/Makefile.benchmarks | 17 +- .../tests/libMicro/apple/Makefile.com.Darwin | 3 + 
tools/tests/libMicro/apple/getaddrinfo_host.c | 244 + tools/tests/libMicro/apple/getaddrinfo_port.c | 157 + tools/tests/libMicro/apple/getgrent.c | 163 + tools/tests/libMicro/apple/getgrgid.c | 228 + tools/tests/libMicro/apple/getgrnam.c | 231 + tools/tests/libMicro/apple/getpwent.c | 163 + tools/tests/libMicro/apple/getpwnam.c | 262 + tools/tests/libMicro/apple/getpwuid.c | 256 + tools/tests/libMicro/apple/lmbench_bw_mem.c | 29 +- .../libMicro/apple/mbr_check_membership.c | 254 + .../apple/mbr_check_service_membership.c | 281 + .../apple/od_query_create_with_node.c | 381 + tools/tests/libMicro/bench.sh | 56 +- tools/tests/libMicro/benchDS.sh | 324 + tools/tests/libMicro/coreos_bench.sh | 837 ++ tools/tests/libMicro/exp.c | 19 + tools/tests/libMicro/libmicro.h | 2 + tools/tests/libMicro/log.c | 19 + tools/tests/libMicro/longjmp.c | 7 +- tools/tests/libMicro/od_account_create.sh | 129 + tools/tests/libMicro/od_account_delete.sh | 98 + tools/tests/libMicro/siglongjmp.c | 4 + tools/tests/superpages/testsp.c | 210 +- .../testkext.xcodeproj/project.pbxproj | 105 + .../tests/testkext/testthreadcall-Info.plist | 47 + tools/tests/testkext/testthreadcall.cpp | 65 + tools/tests/testkext/testthreadcall.h | 18 + tools/tests/testkext/testvmx.cpp | 3 - tools/tests/testkext/testvmx.h | 3 - .../tests/xnu_quick_test/32bit_inode_tests.c | 1 - tools/tests/xnu_quick_test/README | 13 +- .../xnu_quick_test/atomic_fifo_queue_test.c | 33 + tools/tests/xnu_quick_test/commpage_tests.c | 361 + tools/tests/xnu_quick_test/helpers/arch.c | 6 - .../tests/xnu_quick_test/helpers/data_exec.c | 19 +- tools/tests/xnu_quick_test/helpers/launch.c | 37 - tools/tests/xnu_quick_test/kqueue_tests.c | 76 +- tools/tests/xnu_quick_test/machvm_tests.c | 146 +- tools/tests/xnu_quick_test/main.c | 64 +- tools/tests/xnu_quick_test/makefile | 32 +- tools/tests/xnu_quick_test/memory_tests.c | 157 +- tools/tests/xnu_quick_test/misc.c | 11 +- tools/tests/xnu_quick_test/sched_tests.c | 231 + 
tools/tests/xnu_quick_test/socket_tests.c | 123 +- tools/tests/xnu_quick_test/tests.c | 254 +- tools/tests/xnu_quick_test/tests.h | 18 +- tools/tests/xnu_quick_test/xattr_tests.c | 70 +- tools/tests/zero-to-n/Makefile | 5 + tools/tests/zero-to-n/zero-to-n.c | 579 ++ 1834 files changed, 222690 insertions(+), 195265 deletions(-) create mode 100644 EXTERNAL_HEADERS/Availability.h create mode 100644 EXTERNAL_HEADERS/AvailabilityInternal.h create mode 100644 EXTERNAL_HEADERS/AvailabilityMacros.h delete mode 100644 EXTERNAL_HEADERS/architecture/ppc/Makefile delete mode 100644 EXTERNAL_HEADERS/architecture/ppc/asm_help.h delete mode 100644 EXTERNAL_HEADERS/architecture/ppc/basic_regs.h delete mode 100644 EXTERNAL_HEADERS/architecture/ppc/fp_regs.h delete mode 100644 EXTERNAL_HEADERS/architecture/ppc/macro_help.h delete mode 100644 EXTERNAL_HEADERS/architecture/ppc/pseudo_inst.h delete mode 100644 EXTERNAL_HEADERS/architecture/ppc/reg_help.h delete mode 100644 EXTERNAL_HEADERS/mach-o/arm/reloc.h delete mode 100644 EXTERNAL_HEADERS/mach-o/ppc/reloc.h rename {osfmk/profiling/ppc => SETUP}/Makefile (66%) create mode 100644 SETUP/config/Makefile create mode 100644 SETUP/config/config.h rename bsd/conf/tools/doconf/doconf.csh => SETUP/config/doconf (94%) create mode 100644 SETUP/config/externs.c create mode 100644 SETUP/config/lexer.l create mode 100644 SETUP/config/main.c create mode 100644 SETUP/config/mkglue.c create mode 100644 SETUP/config/mkheaders.c create mode 100644 SETUP/config/mkioconf.c create mode 100644 SETUP/config/mkmakefile.c create mode 100644 SETUP/config/mkswapconf.c create mode 100644 SETUP/config/openp.c create mode 100644 SETUP/config/parser.y create mode 100644 SETUP/config/searchp.c create mode 100644 SETUP/kextsymboltool/Makefile create mode 100644 SETUP/kextsymboltool/kextsymboltool.c rename security/conf/tools/newvers/newvers.csh => SETUP/newvers (100%) mode change 100644 => 100755 delete mode 100755 SETUP/seed_objroot create mode 100644 
SETUP/setsegname/Makefile create mode 100644 SETUP/setsegname/setsegname.c delete mode 100644 bsd/conf/MASTER.ppc delete mode 100644 bsd/conf/Makefile.ppc delete mode 100644 bsd/conf/files.ppc delete mode 100644 bsd/conf/tools/Makefile delete mode 100644 bsd/conf/tools/doconf/Makefile create mode 100644 bsd/crypto/aes/Assert.c mode change 100644 => 100755 bsd/crypto/aes/aes.h create mode 100644 bsd/crypto/aes/i386/AES.s create mode 100644 bsd/crypto/aes/i386/Context.h create mode 100644 bsd/crypto/aes/i386/Data.mk create mode 100644 bsd/crypto/aes/i386/Data.s create mode 100644 bsd/crypto/aes/i386/EncryptDecrypt.s create mode 100644 bsd/crypto/aes/i386/ExpandKeyForDecryption.s create mode 100644 bsd/crypto/aes/i386/ExpandKeyForEncryption.s create mode 100644 bsd/crypto/aes/i386/MakeData.c create mode 100644 bsd/crypto/aes/i386/ReadMe.txt create mode 100644 bsd/crypto/aes/i386/aes_crypt_hw.s create mode 100644 bsd/crypto/aes/i386/aes_key_hw.s delete mode 100644 bsd/crypto/aes/i386/aes_modes.c create mode 100644 bsd/crypto/aes/i386/aes_modes_asm.s create mode 100644 bsd/crypto/aes/i386/aes_modes_hw.s delete mode 100644 bsd/crypto/aes/i386/aes_x86_v2.s delete mode 100644 bsd/crypto/aes/i386/aesopt.h create mode 100644 bsd/crypto/aes/i386/aesxts.c create mode 100644 bsd/crypto/aes/i386/aesxts.h create mode 100644 bsd/crypto/aes/i386/aesxts_asm.s delete mode 100644 bsd/crypto/aes/i386/edefs.h delete mode 100644 bsd/crypto/aes/ppc/Makefile delete mode 100644 bsd/crypto/aes/ppc/aescrypt.c delete mode 100644 bsd/crypto/aes/ppc/aeskey.c delete mode 100644 bsd/crypto/aes/ppc/aesopt.h delete mode 100644 bsd/crypto/aes/ppc/aestab.c delete mode 100644 bsd/crypto/aes/ppc/aestab.h create mode 100644 bsd/crypto/aes/test/ReadMe.txt create mode 100755 bsd/crypto/aes/test/makegenx86.sh create mode 100755 bsd/crypto/aes/test/makeoptx86.sh create mode 100644 bsd/crypto/aes/test/tstaes.c create mode 100644 bsd/crypto/doc/KernelCrypto.plist create mode 100644 
bsd/crypto/doc/KernelCrypto.txt create mode 100644 bsd/crypto/sha2/intel/sha256.s create mode 100644 bsd/crypto/sha2/intel/sha256nossse3.s delete mode 100644 bsd/dev/ppc/conf.c delete mode 100644 bsd/dev/ppc/cons.c delete mode 100644 bsd/dev/ppc/dtrace_isa.c delete mode 100644 bsd/dev/ppc/dtrace_subr_ppc.c delete mode 100644 bsd/dev/ppc/fasttrap_isa.c delete mode 100644 bsd/dev/ppc/fbt_ppc.c delete mode 100644 bsd/dev/ppc/ffs.c delete mode 100644 bsd/dev/ppc/ffs.s delete mode 100644 bsd/dev/ppc/kern_machdep.c delete mode 100644 bsd/dev/ppc/km.c delete mode 100644 bsd/dev/ppc/mem.c delete mode 100644 bsd/dev/ppc/munge.s delete mode 100644 bsd/dev/ppc/ppc_init.c delete mode 100644 bsd/dev/ppc/sdt_ppc.c delete mode 100644 bsd/dev/ppc/stubs.c delete mode 100644 bsd/dev/ppc/systemcalls.c delete mode 100644 bsd/dev/ppc/unix_signal.c delete mode 100644 bsd/dev/ppc/xsumas.s create mode 100644 bsd/hfs/hfs_cprotect.c create mode 100644 bsd/hfs/hfs_kdebug.h create mode 100644 bsd/hfs/hfscommon/Misc/HybridAllocator.c create mode 100644 bsd/hfs/hfscommon/headers/HybridAllocator.h create mode 100644 bsd/hfs/hfscommon/headers/RedBlackTree.h create mode 100644 bsd/kern/Makefile create mode 100644 bsd/kern/kern_priv.c create mode 100644 bsd/kern/policy_check.c create mode 100644 bsd/kern/process_policy.c create mode 100644 bsd/kern/trace.codes create mode 100644 bsd/kern/vm_pressure.c rename bsd/{ppc/reg.h => kern/vm_pressure.h} (80%) create mode 100644 bsd/man/man2/getdtablesize.2 create mode 100644 bsd/man/man2/sem_close.2 create mode 100644 bsd/man/man2/sem_open.2 create mode 100644 bsd/man/man2/sem_post.2 create mode 100644 bsd/man/man2/sem_unlink.2 create mode 100644 bsd/man/man2/sem_wait.2 create mode 100644 bsd/man/man2/setregid.2 create mode 100644 bsd/man/man2/setreuid.2 create mode 100644 bsd/man/man2/shm_open.2 create mode 100644 bsd/man/man2/shm_unlink.2 create mode 100644 bsd/man/man2/undelete.2 delete mode 100644 bsd/man/man5/fs.5 delete mode 100644 
bsd/man/man5/inode.5 delete mode 100644 bsd/miscfs/nullfs/null.h delete mode 100644 bsd/miscfs/nullfs/null_subr.c delete mode 100644 bsd/miscfs/nullfs/null_vfsops.c delete mode 100644 bsd/miscfs/nullfs/null_vnops.c delete mode 100644 bsd/miscfs/union/union_subr.c delete mode 100644 bsd/miscfs/union/union_vfsops.c delete mode 100644 bsd/miscfs/union/union_vnops.c create mode 100644 bsd/net/bridgestp.c create mode 100644 bsd/net/bridgestp.h delete mode 100644 bsd/net/if_atm.h create mode 100644 bsd/net/if_bridge.c create mode 100644 bsd/net/if_bridgevar.h delete mode 100644 bsd/net/if_disc.c delete mode 100644 bsd/net/if_dummy.c delete mode 100644 bsd/net/if_ethersubr.c delete mode 100644 bsd/net/if_fddisubr.c create mode 100644 bsd/net/if_llreach.c create mode 100644 bsd/net/if_llreach.h create mode 100644 bsd/net/netsrc.c rename EXTERNAL_HEADERS/architecture/ppc/cframe.h => bsd/net/netsrc.h (58%) create mode 100644 bsd/net/ntstat.c create mode 100644 bsd/net/ntstat.h delete mode 100644 bsd/net/rtsock_mip.c delete mode 100644 bsd/netinet/if_atm.c delete mode 100644 bsd/netinet/if_atm.h delete mode 100644 bsd/netinet/if_fddi.h create mode 100644 bsd/netinet/in_mcast.c create mode 100644 bsd/netinet/in_pcblist.c create mode 100644 bsd/netinet/in_tclass.c delete mode 100644 bsd/netinet/ip_flow.c delete mode 100644 bsd/netinet/ip_flow.h create mode 100644 bsd/netinet/tcp_cc.h create mode 100644 bsd/netinet/tcp_ledbat.c create mode 100644 bsd/netinet/tcp_newreno.c create mode 100644 bsd/netinet6/in6_mcast.c create mode 100644 bsd/netinet6/ip6_id.c create mode 100644 bsd/netinet6/mld6.h delete mode 100644 bsd/ppc/Makefile delete mode 100644 bsd/ppc/_limits.h delete mode 100644 bsd/ppc/_param.h delete mode 100644 bsd/ppc/_structs.h delete mode 100644 bsd/ppc/_types.h delete mode 100644 bsd/ppc/decodePPC.h delete mode 100644 bsd/ppc/endian.h delete mode 100644 bsd/ppc/exec.h delete mode 100644 bsd/ppc/fasttrap_isa.h delete mode 100644 bsd/ppc/limits.h delete mode 100644 
bsd/ppc/param.h delete mode 100644 bsd/ppc/profile.h delete mode 100644 bsd/ppc/reboot.h delete mode 100644 bsd/ppc/setjmp.h delete mode 100644 bsd/ppc/signal.h delete mode 100644 bsd/ppc/types.h delete mode 100644 bsd/ppc/ucontext.h delete mode 100644 bsd/ppc/vmparam.h rename osfmk/ppc/Performance.h => bsd/sys/content_protection.h (71%) rename osfmk/ppc/machine_cpu.h => bsd/sys/fileport.h (68%) rename bsd/{ppc/ptrace.h => sys/imgsrc.h} (74%) create mode 100755 bsd/sys/make_posix_availability.sh create mode 100755 bsd/sys/make_symbol_aliasing.sh rename bsd/{dev/ppc/memmove.c => sys/netboot.h} (72%) create mode 100644 bsd/sys/priv.h create mode 100644 bsd/sys/process_policy.h delete mode 100644 config/BSDKernel.ppc.exports create mode 100644 config/Dummy.exports delete mode 100644 config/IOKit.ppc.exports delete mode 100644 config/Libkern.ppc.exports delete mode 100644 config/MACFramework.ppc.exports delete mode 100644 config/Mach.ppc.exports delete mode 100644 config/Private.ppc.exports delete mode 100644 config/System6.0.ppc.exports delete mode 100644 config/Unsupported.ppc.exports rename osfmk/ppc/cpu_affinity.h => iokit/IOKit/AppleKeyStoreInterface.h (62%) create mode 100644 iokit/IOKit/IOStatistics.h create mode 100644 iokit/IOKit/IOStatisticsPrivate.h delete mode 100644 iokit/IOKit/i386/IOSharedLockImp.h delete mode 100644 iokit/IOKit/ppc/IODBDMA.h delete mode 100644 iokit/IOKit/ppc/IOSharedLockImp.h delete mode 100644 iokit/IOKit/ppc/Makefile delete mode 100644 iokit/IOKit/pwr_mgt/IOPMDeprecated.h create mode 100644 iokit/Kernel/IOStatistics.cpp create mode 100644 iokit/Kernel/i386/IOKeyStoreHelper.cpp delete mode 100644 iokit/Kernel/ppc/IOAsmSupport.s delete mode 100644 iokit/Kernel/ppc/IODBDMA.cpp delete mode 100644 iokit/conf/MASTER.ppc delete mode 100644 iokit/conf/Makefile.ppc delete mode 100644 iokit/conf/files.ppc delete mode 100644 iokit/conf/tools/Makefile delete mode 100644 iokit/conf/tools/doconf/Makefile delete mode 100755 
iokit/conf/tools/doconf/doconf.csh delete mode 100644 libkern/c++/OSObjectAsm.s mode change 100644 => 100755 libkern/c++/Tests/TestSerialization/test1/test1_main.cpp delete mode 100644 libkern/conf/MASTER.ppc delete mode 100644 libkern/conf/Makefile.ppc delete mode 100644 libkern/conf/files.ppc delete mode 100644 libkern/conf/tools/Makefile delete mode 100644 libkern/conf/tools/doconf/Makefile delete mode 100755 libkern/conf/tools/doconf/doconf.csh create mode 100644 libkern/crypto/intel/sha1edp.h create mode 100644 libkern/crypto/intel/sha1edp.s rename {iokit/Kernel => libkern/kxld}/WKdmCompress.c (97%) rename {iokit/Kernel => libkern/kxld}/WKdmDecompress.c (100%) create mode 100644 libkern/kxld/i386/WKdmCompress.s create mode 100644 libkern/kxld/i386/WKdmDecompress.s create mode 100644 libkern/kxld/kxld_object.c create mode 100644 libkern/kxld/kxld_object.h delete mode 100644 libkern/kxld/kxld_state.c delete mode 100644 libkern/kxld/kxld_state.h create mode 100644 libkern/kxld/tests/kxld_array_test.c rename bsd/ppc/psl.h => libkern/kxld/tests/kxld_test.c (76%) rename osfmk/ppc/machine_rpc.h => libkern/kxld/tests/kxld_test.h (86%) delete mode 100644 libkern/libkern/OSAtomic.h.save rename {iokit/Kernel => libkern/libkern}/WKdm.h (97%) delete mode 100644 libkern/libkern/ppc/Makefile delete mode 100644 libkern/libkern/ppc/OSByteOrder.h create mode 100644 libkern/libkern/tree.h delete mode 100644 libkern/ppc/OSAtomic.s delete mode 100644 libkern/ppc/bcmp.s delete mode 100644 libkern/ppc/memcmp.s delete mode 100644 libkern/ppc/strlen.s delete mode 100644 libkern/zlib/arm/adler32vec.s delete mode 100644 libkern/zlib/arm/inffastS.s create mode 100644 libkern/zlib/intel/adler32vec.s create mode 100644 libkern/zlib/intel/inffastS.s delete mode 100644 libsa/conf/MASTER.ppc delete mode 100644 libsa/conf/Makefile.ppc delete mode 100644 libsa/conf/files.ppc delete mode 100644 libsa/conf/tools/Makefile delete mode 100644 libsa/conf/tools/doconf/Makefile delete mode 100755 
libsa/conf/tools/doconf/doconf.csh delete mode 100644 libsyscall/BSDmakefile delete mode 100644 libsyscall/GNUmakefile create mode 100644 libsyscall/Libsyscall.xcconfig create mode 100644 libsyscall/Libsyscall.xcodeproj/project.pbxproj delete mode 100644 libsyscall/Makefile delete mode 100644 libsyscall/Makefile.inc delete mode 100644 libsyscall/Makefile.xbs create mode 100644 libsyscall/Platforms/MacOSX/i386/syscall.map create mode 100644 libsyscall/Platforms/MacOSX/x86_64/syscall.map create mode 100644 libsyscall/Platforms/syscall.map delete mode 100755 libsyscall/create-syscalls.pl rename osfmk/chud/ppc/chud_xnu_glue.h => libsyscall/custom/errno.c (95%) delete mode 100644 libsyscall/include/Makefile.inc delete mode 100644 libsyscall/include/processor_facilities.h delete mode 100644 libsyscall/mach/Makefile.inc rename osfmk/ppc/hardclock_entries.h => libsyscall/mach/abort.h (83%) delete mode 100644 libsyscall/mach/bootstrap_ports.c delete mode 100644 libsyscall/mach/brk.2 rename iokit/Kernel/ppc/IOSharedLock.s => libsyscall/mach/dylib_link.c (90%) create mode 100644 libsyscall/mach/exc_catcher.h delete mode 100644 libsyscall/mach/headers/Makefile.inc delete mode 100644 libsyscall/mach/i386/Makefile.inc rename libsyscall/mach/{headers => mach}/errorlib.h (94%) rename libsyscall/mach/{headers => mach}/mach.h (100%) rename libsyscall/mach/{headers => mach}/mach_error.h (100%) rename libsyscall/mach/{headers => mach}/mach_init.h (95%) rename libsyscall/mach/{headers => mach}/mach_interface.h (100%) rename libsyscall/mach/{headers => mach}/port_obj.h (100%) rename libsyscall/mach/{headers => mach}/sync.h (100%) rename libsyscall/mach/{headers => mach}/task.h (93%) rename libsyscall/mach/{headers => mach}/thread_act.h (92%) rename libsyscall/mach/{headers => mach}/vm_task.h (100%) delete mode 100644 libsyscall/mach/mach_init_libSystem.c delete mode 100644 libsyscall/mach/mach_init_ports.c rename pexpert/ppc/pe_bootargs.c => libsyscall/mach/mach_legacy.c (80%) create 
mode 100644 libsyscall/mach/mig_reply_port.c rename osfmk/mach/ppc/rpc.h => libsyscall/mach/mig_reply_port.h (85%) delete mode 100644 libsyscall/mach/ppc/Makefile.inc delete mode 100644 libsyscall/mach/ppc64/Makefile.inc delete mode 100644 libsyscall/mach/sbrk.c delete mode 100644 libsyscall/mach/servers/Makefile.inc create mode 100644 libsyscall/mach/string.c rename osfmk/mach/ppc/thread_state.h => libsyscall/mach/string.h (59%) delete mode 100644 libsyscall/mach/x86_64/Makefile.inc create mode 100644 libsyscall/wrappers/__get_cpu_capabilities.s rename osfmk/x86_64/genassym.c => libsyscall/wrappers/_errno.h (88%) rename osfmk/ppc/testjump.c => libsyscall/wrappers/_libc_funcptr.c (55%) rename bsd/hfs/cprotect.c => libsyscall/wrappers/_libkernel_init.c (67%) rename bsd/ppc/disklabel.h => libsyscall/wrappers/_libkernel_init.h (65%) create mode 100644 libsyscall/wrappers/cancelable/fcntl-base.c rename libsyscall/{mach/x86_64/mach_absolute_time.S => wrappers/cancelable/fcntl-cancel.c} (81%) create mode 100644 libsyscall/wrappers/cancelable/fcntl.c create mode 100644 libsyscall/wrappers/cancelable/select-cancel.c create mode 100644 libsyscall/wrappers/cancelable/select.c create mode 100644 libsyscall/wrappers/cancelable/sigsuspend-cancel.c rename libsyscall/{mach/i386/mach_absolute_time.S => wrappers/cancelable/sigsuspend.c} (81%) rename bsd/dev/ppc/sysctl.c => libsyscall/wrappers/init_cpu_capabilities.c (61%) create mode 100644 libsyscall/wrappers/ioctl.c create mode 100644 libsyscall/wrappers/kill.c create mode 100644 libsyscall/wrappers/legacy/accept.c create mode 100644 libsyscall/wrappers/legacy/bind.c create mode 100644 libsyscall/wrappers/legacy/connect.c create mode 100644 libsyscall/wrappers/legacy/getattrlist.c create mode 100644 libsyscall/wrappers/legacy/getpeername.c create mode 100644 libsyscall/wrappers/legacy/getsockname.c create mode 100644 libsyscall/wrappers/legacy/kill.c create mode 100644 libsyscall/wrappers/legacy/lchown.c rename osfmk/ppc/xpr.h => 
libsyscall/wrappers/legacy/listen.c (50%) create mode 100644 libsyscall/wrappers/legacy/mprotect.c create mode 100644 libsyscall/wrappers/legacy/msync.c create mode 100644 libsyscall/wrappers/legacy/munmap.c create mode 100644 libsyscall/wrappers/legacy/open.c create mode 100644 libsyscall/wrappers/legacy/recvfrom.c create mode 100644 libsyscall/wrappers/legacy/recvmsg.c create mode 100644 libsyscall/wrappers/legacy/select-pre1050.c create mode 100644 libsyscall/wrappers/legacy/select.c create mode 100644 libsyscall/wrappers/legacy/sendmsg.c create mode 100644 libsyscall/wrappers/legacy/sendto.c create mode 100644 libsyscall/wrappers/legacy/setattrlist.c create mode 100644 libsyscall/wrappers/legacy/sigsuspend.c create mode 100644 libsyscall/wrappers/legacy/socketpair.c create mode 100644 libsyscall/wrappers/memcpy.c create mode 100644 libsyscall/wrappers/remove-counter.c create mode 100644 libsyscall/wrappers/rename.c create mode 100644 libsyscall/wrappers/rmdir.c create mode 100644 libsyscall/wrappers/select-base.c create mode 100644 libsyscall/wrappers/sigsuspend-base.c create mode 100644 libsyscall/wrappers/unix03/chmod.c create mode 100644 libsyscall/wrappers/unix03/fchmod.c create mode 100644 libsyscall/wrappers/unix03/getrlimit.c create mode 100644 libsyscall/wrappers/unix03/mmap.c create mode 100644 libsyscall/wrappers/unix03/setrlimit.c create mode 100644 libsyscall/wrappers/unlink.c create mode 100755 libsyscall/xcodescripts/compat-symlinks.sh create mode 100755 libsyscall/xcodescripts/compile-syscalls.pl create mode 100755 libsyscall/xcodescripts/create-syscalls.pl create mode 100755 libsyscall/xcodescripts/mach_install_mig.sh delete mode 100644 osfmk/chud/ppc/chud_cpu_asm.h delete mode 100644 osfmk/chud/ppc/chud_cpu_asm.s delete mode 100644 osfmk/chud/ppc/chud_cpu_ppc.c delete mode 100644 osfmk/chud/ppc/chud_osfmk_callback_ppc.c delete mode 100644 osfmk/chud/ppc/chud_spr.h delete mode 100644 osfmk/chud/ppc/chud_thread_ppc.c delete mode 100644 
osfmk/chud/ppc/chud_xnu_private.h delete mode 100644 osfmk/conf/MASTER.ppc delete mode 100644 osfmk/conf/Makefile.ppc delete mode 100644 osfmk/conf/files.ppc delete mode 100644 osfmk/conf/tools/Makefile delete mode 100644 osfmk/conf/tools/doconf/Makefile delete mode 100755 osfmk/conf/tools/doconf/doconf.csh delete mode 100644 osfmk/console/ppc/serial_console.c delete mode 100644 osfmk/console/ppc/video_scroll.s create mode 100644 osfmk/i386/bsd_i386_native.c delete mode 100644 osfmk/i386/commpage/atomic.s delete mode 100644 osfmk/i386/commpage/bcopy_scalar.s delete mode 100644 osfmk/i386/commpage/bcopy_sse2.s delete mode 100644 osfmk/i386/commpage/bcopy_sse3x.s delete mode 100644 osfmk/i386/commpage/bcopy_sse3x_64.s delete mode 100644 osfmk/i386/commpage/bcopy_sse42.s delete mode 100644 osfmk/i386/commpage/bcopy_sse42_64.s delete mode 100644 osfmk/i386/commpage/bzero_scalar.s delete mode 100644 osfmk/i386/commpage/bzero_sse2.s delete mode 100644 osfmk/i386/commpage/bzero_sse2_64.s delete mode 100644 osfmk/i386/commpage/bzero_sse42.s delete mode 100644 osfmk/i386/commpage/bzero_sse42_64.s delete mode 100644 osfmk/i386/commpage/cacheflush.s delete mode 100644 osfmk/i386/commpage/commpage_gettimeofday.s delete mode 100644 osfmk/i386/commpage/commpage_mach_absolute_time.s delete mode 100644 osfmk/i386/commpage/commpage_sigs.c delete mode 100644 osfmk/i386/commpage/cpu_number.s delete mode 100644 osfmk/i386/commpage/longcopy_sse3x.s delete mode 100644 osfmk/i386/commpage/longcopy_sse3x_64.s delete mode 100644 osfmk/i386/commpage/memset_pattern_sse2.s delete mode 100644 osfmk/i386/commpage/memset_pattern_sse2_64.s delete mode 100644 osfmk/i386/commpage/spinlocks.s create mode 100644 osfmk/i386/copyio.c delete mode 100644 osfmk/i386/ipl.h create mode 100644 osfmk/i386/lapic_native.c create mode 100644 osfmk/i386/mp_native.c create mode 100644 osfmk/i386/pal_hibernate.h rename osfmk/{ppc/cpu_number.h => i386/pal_lock_asm.h} (82%) create mode 100644 osfmk/i386/pal_native.h 
create mode 100644 osfmk/i386/pal_routines.c create mode 100644 osfmk/i386/pal_routines.h create mode 100644 osfmk/i386/pal_routines_asm.s rename osfmk/{ppc/mp.h => i386/pal_rtclock_asm.h} (86%) create mode 100644 osfmk/i386/pcb_native.c create mode 100644 osfmk/i386/pmap_common.c create mode 100644 osfmk/i386/pmap_pcid.h create mode 100644 osfmk/i386/rtclock_asm.h rename osfmk/i386/{rtclock.h => rtclock_asm_native.h} (67%) create mode 100644 osfmk/i386/rtclock_native.c rename osfmk/{ppc/rtclock.h => i386/rtclock_protos.h} (64%) create mode 100644 osfmk/i386/trap_native.c create mode 100644 osfmk/i386/ucode.c create mode 100644 osfmk/i386/ucode.h delete mode 100644 osfmk/kdp/ml/ppc/kdp_asm.s delete mode 100644 osfmk/kdp/ml/ppc/kdp_machdep.c delete mode 100644 osfmk/kdp/ml/ppc/kdp_misc.s delete mode 100644 osfmk/kdp/ml/ppc/kdp_vm.c create mode 100644 osfmk/kern/extmod_statistics.c rename osfmk/{ppc/PPCcalls.c => kern/extmod_statistics.h} (71%) create mode 100644 osfmk/kern/sched_fixedpriority.c create mode 100644 osfmk/kern/sched_grrr.c create mode 100644 osfmk/kern/sched_proto.c delete mode 100644 osfmk/libsa/ppc/types.h create mode 100644 osfmk/mach/branch_predicates.h delete mode 100644 osfmk/mach/i386/_types.h delete mode 100644 osfmk/mach/ppc/Makefile delete mode 100644 osfmk/mach/ppc/_structs.h delete mode 100644 osfmk/mach/ppc/_types.h delete mode 100644 osfmk/mach/ppc/boolean.h delete mode 100644 osfmk/mach/ppc/exception.h delete mode 100644 osfmk/mach/ppc/kern_return.h delete mode 100644 osfmk/mach/ppc/machine_types.defs delete mode 100644 osfmk/mach/ppc/ndr_def.h delete mode 100644 osfmk/mach/ppc/processor_info.h delete mode 100644 osfmk/mach/ppc/sdt_isa.h delete mode 100644 osfmk/mach/ppc/syscall_sw.h delete mode 100644 osfmk/mach/ppc/thread_status.h delete mode 100644 osfmk/mach/ppc/vm_param.h delete mode 100644 osfmk/mach/ppc/vm_types.h rename libsyscall/mach/ppc/mach_absolute_time.s => osfmk/machine/pal_hibernate.h (81%) rename 
iokit/IOKit/machine/IOSharedLockImp.h => osfmk/machine/pal_routines.h (85%) delete mode 100644 osfmk/ppc/AltiAssist.s delete mode 100644 osfmk/ppc/Diagnostics.c delete mode 100644 osfmk/ppc/Diagnostics.h delete mode 100644 osfmk/ppc/Emulate.s delete mode 100644 osfmk/ppc/Emulate64.s delete mode 100644 osfmk/ppc/Firmware.h delete mode 100644 osfmk/ppc/Firmware.s delete mode 100644 osfmk/ppc/FirmwareC.c delete mode 100644 osfmk/ppc/FirmwareCalls.h delete mode 100644 osfmk/ppc/Makefile delete mode 100644 osfmk/ppc/PPCcalls.h delete mode 100644 osfmk/ppc/Performance.s delete mode 100644 osfmk/ppc/PseudoKernel.c delete mode 100644 osfmk/ppc/PseudoKernel.h delete mode 100644 osfmk/ppc/_setjmp.s delete mode 100644 osfmk/ppc/aligned_data.s delete mode 100644 osfmk/ppc/asm.h delete mode 100644 osfmk/ppc/ast.h delete mode 100644 osfmk/ppc/ast_types.h delete mode 100644 osfmk/ppc/atomic_switch.h delete mode 100644 osfmk/ppc/atomic_switch.s delete mode 100644 osfmk/ppc/bat_init.c delete mode 100644 osfmk/ppc/bcopy.s delete mode 100644 osfmk/ppc/bcopytest.c delete mode 100644 osfmk/ppc/bits.s delete mode 100644 osfmk/ppc/boot.h delete mode 100644 osfmk/ppc/bzero.s delete mode 100644 osfmk/ppc/cache.s delete mode 100644 osfmk/ppc/commpage/atomic.s delete mode 100644 osfmk/ppc/commpage/bcopy_64.s delete mode 100644 osfmk/ppc/commpage/bcopy_970.s delete mode 100644 osfmk/ppc/commpage/bcopy_g3.s delete mode 100644 osfmk/ppc/commpage/bcopy_g4.s delete mode 100644 osfmk/ppc/commpage/bigcopy_970.s delete mode 100644 osfmk/ppc/commpage/bzero_128.s delete mode 100644 osfmk/ppc/commpage/bzero_32.s delete mode 100644 osfmk/ppc/commpage/cacheflush.s delete mode 100644 osfmk/ppc/commpage/commpage.c delete mode 100644 osfmk/ppc/commpage/commpage.h delete mode 100644 osfmk/ppc/commpage/commpage_asm.s delete mode 100644 osfmk/ppc/commpage/gettimeofday.s delete mode 100644 osfmk/ppc/commpage/mach_absolute_time.s delete mode 100644 osfmk/ppc/commpage/memset_64.s delete mode 100644 
osfmk/ppc/commpage/memset_g3.s delete mode 100644 osfmk/ppc/commpage/memset_g4.s delete mode 100644 osfmk/ppc/commpage/memset_g5.s delete mode 100644 osfmk/ppc/commpage/pthread.s delete mode 100644 osfmk/ppc/commpage/spinlocks.s delete mode 100644 osfmk/ppc/conf.c delete mode 100644 osfmk/ppc/console_feed.c delete mode 100644 osfmk/ppc/console_feed_entries.h delete mode 100644 osfmk/ppc/cpu.c delete mode 100644 osfmk/ppc/cpu_capabilities.h delete mode 100644 osfmk/ppc/cpu_data.h delete mode 100644 osfmk/ppc/cpu_internal.h delete mode 100644 osfmk/ppc/cswtch.s delete mode 100644 osfmk/ppc/db_asm.s delete mode 100644 osfmk/ppc/db_disasm.c delete mode 100644 osfmk/ppc/db_interface.c delete mode 100644 osfmk/ppc/db_low_trace.c delete mode 100644 osfmk/ppc/db_low_trace.h delete mode 100644 osfmk/ppc/db_machdep.h delete mode 100644 osfmk/ppc/db_trace.c delete mode 100644 osfmk/ppc/endian.h delete mode 100644 osfmk/ppc/etimer.c delete mode 100644 osfmk/ppc/exception.h delete mode 100644 osfmk/ppc/fpu_protos.h delete mode 100644 osfmk/ppc/genassym.c delete mode 100644 osfmk/ppc/hexfont.h delete mode 100644 osfmk/ppc/hibernate_ppc.c delete mode 100644 osfmk/ppc/hibernate_restore.s delete mode 100644 osfmk/ppc/hw_exception.s delete mode 100644 osfmk/ppc/hw_lock.s delete mode 100644 osfmk/ppc/hw_lock_types.h delete mode 100644 osfmk/ppc/hw_perfmon.c delete mode 100644 osfmk/ppc/hw_perfmon.h delete mode 100644 osfmk/ppc/hw_perfmon_mmcr.h delete mode 100644 osfmk/ppc/hw_vm.s delete mode 100644 osfmk/ppc/instrumentation.h delete mode 100644 osfmk/ppc/interrupt.c delete mode 100644 osfmk/ppc/io_map.c delete mode 100644 osfmk/ppc/io_map_entries.h delete mode 100644 osfmk/ppc/lock.h delete mode 100644 osfmk/ppc/locks.h delete mode 100644 osfmk/ppc/locks_ppc.c delete mode 100644 osfmk/ppc/low_trace.h delete mode 100644 osfmk/ppc/lowglobals.h delete mode 100644 osfmk/ppc/lowmem_vectors.s delete mode 100644 osfmk/ppc/machine_routines.c delete mode 100644 osfmk/ppc/machine_routines.h 
delete mode 100644 osfmk/ppc/machine_routines_asm.s delete mode 100644 osfmk/ppc/machine_task.c delete mode 100644 osfmk/ppc/machlimits.h delete mode 100644 osfmk/ppc/machparam.h delete mode 100644 osfmk/ppc/mappings.c delete mode 100644 osfmk/ppc/mappings.h delete mode 100644 osfmk/ppc/mcount.s delete mode 100644 osfmk/ppc/mem.h delete mode 100644 osfmk/ppc/misc.c delete mode 100644 osfmk/ppc/misc_asm.s delete mode 100644 osfmk/ppc/misc_protos.h delete mode 100644 osfmk/ppc/model_dep.c delete mode 100644 osfmk/ppc/movc.s delete mode 100644 osfmk/ppc/new_screen.h delete mode 100644 osfmk/ppc/pcb.c delete mode 100644 osfmk/ppc/pmap.c delete mode 100644 osfmk/ppc/pmap.h delete mode 100644 osfmk/ppc/pms.c delete mode 100644 osfmk/ppc/pmsCPU.c delete mode 100644 osfmk/ppc/ppc_disasm.i delete mode 100644 osfmk/ppc/ppc_init.c delete mode 100644 osfmk/ppc/ppc_vm_init.c delete mode 100644 osfmk/ppc/proc_reg.h delete mode 100644 osfmk/ppc/rtclock.c delete mode 100644 osfmk/ppc/savearea.c delete mode 100644 osfmk/ppc/savearea.h delete mode 100644 osfmk/ppc/savearea_asm.s delete mode 100644 osfmk/ppc/scc_8530.h delete mode 100644 osfmk/ppc/sched_param.h delete mode 100644 osfmk/ppc/screen_switch.h delete mode 100644 osfmk/ppc/serial_defs.h delete mode 100644 osfmk/ppc/serial_io.c delete mode 100644 osfmk/ppc/serial_io.h delete mode 100644 osfmk/ppc/setjmp.h delete mode 100644 osfmk/ppc/simple_lock.h delete mode 100644 osfmk/ppc/skiplists.s delete mode 100644 osfmk/ppc/spec_reg.h delete mode 100644 osfmk/ppc/start.s delete mode 100644 osfmk/ppc/status.c delete mode 100644 osfmk/ppc/task.h delete mode 100644 osfmk/ppc/thread.h delete mode 100644 osfmk/ppc/trap.c delete mode 100644 osfmk/ppc/trap.h delete mode 100644 osfmk/ppc/vm_tuning.h delete mode 100644 osfmk/ppc/vmachmon.c delete mode 100644 osfmk/ppc/vmachmon.h delete mode 100644 osfmk/ppc/vmachmon_asm.s delete mode 100644 osfmk/profiling/ppc/profile-md.h create mode 100644 osfmk/vm/default_freezer.c create mode 100644 
osfmk/vm/default_freezer.h create mode 100644 osfmk/vm/vm_map_store.c create mode 100644 osfmk/vm/vm_map_store.h create mode 100644 osfmk/vm/vm_map_store_ll.c rename bsd/dev/ppc/machdep.c => osfmk/vm/vm_map_store_ll.h (61%) create mode 100644 osfmk/vm/vm_map_store_rb.c create mode 100644 osfmk/vm/vm_map_store_rb.h create mode 100644 osfmk/x86_64/copyio.c create mode 100644 osfmk/x86_64/pal_routines_asm.s create mode 100644 osfmk/x86_64/pmap_pcid.c delete mode 100644 pexpert/conf/MASTER.ppc delete mode 100644 pexpert/conf/Makefile.ppc delete mode 100644 pexpert/conf/files.ppc delete mode 100644 pexpert/conf/tools/Makefile delete mode 100644 pexpert/conf/tools/doconf/Makefile delete mode 100755 pexpert/conf/tools/doconf/doconf.csh delete mode 100644 pexpert/pexpert/ppc/Makefile delete mode 100644 pexpert/pexpert/ppc/boot.h delete mode 100644 pexpert/pexpert/ppc/interrupts.h delete mode 100644 pexpert/pexpert/ppc/powermac.h delete mode 100644 pexpert/pexpert/ppc/protos.h delete mode 100644 pexpert/ppc/pe_clock_speed.c delete mode 100644 pexpert/ppc/pe_clock_speed_asm.s delete mode 100644 pexpert/ppc/pe_identify_machine.c delete mode 100644 pexpert/ppc/pe_init.c delete mode 100644 pexpert/ppc/pe_kprintf.c delete mode 100644 security/conf/MASTER.ppc delete mode 100644 security/conf/Makefile.ppc delete mode 100644 security/conf/files.ppc delete mode 100644 security/conf/tools/Makefile delete mode 100644 security/conf/tools/doconf/Makefile delete mode 100644 security/conf/tools/doconf/doconf.csh delete mode 100644 security/conf/tools/newvers/Makefile create mode 100644 security/mac_priv.c create mode 100755 tools/symbolify.py create mode 100644 tools/tests/execperf/Makefile create mode 100644 tools/tests/execperf/exit-asm.S create mode 100644 tools/tests/execperf/exit.c create mode 100644 tools/tests/execperf/printexecinfo.c create mode 100644 tools/tests/execperf/run.c create mode 100755 tools/tests/execperf/test.sh create mode 100644 tools/tests/jitter/Makefile create 
mode 100644 tools/tests/jitter/cpu_number.s create mode 100644 tools/tests/jitter/timer_jitter.c mode change 100644 => 100755 tools/tests/kqueue_tests/Makefile rename tools/tests/kqueue_tests/{kqueue_readwrite_tests.c => kqueue_file_tests.c} (98%) mode change 100644 => 100755 tools/tests/libMicro/Makefile.com.Darwin create mode 100644 tools/tests/libMicro/apple/getaddrinfo_host.c create mode 100644 tools/tests/libMicro/apple/getaddrinfo_port.c create mode 100644 tools/tests/libMicro/apple/getgrent.c create mode 100644 tools/tests/libMicro/apple/getgrgid.c create mode 100644 tools/tests/libMicro/apple/getgrnam.c create mode 100644 tools/tests/libMicro/apple/getpwent.c create mode 100644 tools/tests/libMicro/apple/getpwnam.c create mode 100644 tools/tests/libMicro/apple/getpwuid.c create mode 100644 tools/tests/libMicro/apple/mbr_check_membership.c create mode 100644 tools/tests/libMicro/apple/mbr_check_service_membership.c create mode 100644 tools/tests/libMicro/apple/od_query_create_with_node.c create mode 100644 tools/tests/libMicro/benchDS.sh create mode 100644 tools/tests/libMicro/coreos_bench.sh create mode 100644 tools/tests/libMicro/od_account_create.sh create mode 100644 tools/tests/libMicro/od_account_delete.sh create mode 100644 tools/tests/testkext/testthreadcall-Info.plist create mode 100644 tools/tests/testkext/testthreadcall.cpp create mode 100644 tools/tests/testkext/testthreadcall.h create mode 100644 tools/tests/xnu_quick_test/atomic_fifo_queue_test.c create mode 100644 tools/tests/xnu_quick_test/commpage_tests.c create mode 100644 tools/tests/xnu_quick_test/sched_tests.c create mode 100644 tools/tests/zero-to-n/Makefile create mode 100644 tools/tests/zero-to-n/zero-to-n.c diff --git a/EXTERNAL_HEADERS/Availability.h b/EXTERNAL_HEADERS/Availability.h new file mode 100644 index 000000000..e811335c1 --- /dev/null +++ b/EXTERNAL_HEADERS/Availability.h @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2007-2010 by Apple Inc.. All rights reserved. 
+ * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef __AVAILABILITY__ +#define __AVAILABILITY__ + /* + These macros are for use in OS header files. They enable function prototypes + and Objective-C methods to be tagged with the OS version in which they + were first available; and, if applicable, the OS version in which they + became deprecated. + + The desktop Mac OS X and the iPhone OS X each have different version numbers. + The __OSX_AVAILABLE_STARTING() macro allows you to specify both the desktop + and phone OS version numbers. For instance: + __OSX_AVAILABLE_STARTING(__MAC_10_2,__IPHONE_2_0) + means the function/method was first available on Mac OS X 10.2 on the desktop + and first available in OS X 2.0 on the iPhone. + + If a function is available on one platform, but not the other a _NA (not + applicable) parameter is used. For instance: + __OSX_AVAILABLE_STARTING(__MAC_10_3,__IPHONE_NA) + means that the function/method was first available on Mac OS X 10.3, and it + is currently not implemented on the iPhone. + + At some point, a function/method may be deprecated. 
That means Apple + recommends applications stop using the function, either because there is a + better replacement or the functionality is being phased out. Deprecated + functions/methods can be tagged with a __OSX_AVAILABLE_BUT_DEPRECATED() + macro which specifies the OS version where the function became available + as well as the OS version in which it became deprecated. For instance: + __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0,__MAC_10_5,__IPHONE_NA,__IPHONE_NA) + means that the function/method was introduced in Mac OS X 10.0, then + became deprecated beginning in Mac OS X 10.5. On the iPhone the function + has never been available. + + For these macros to function properly, a program must specify the OS version range + it is targeting. The min OS version is specified as an option to the compiler: + -mmacosx-version-min=10.x when building for Mac OS X, and -miphoneos-version-min=1.x.x + when building for the iPhone. The upper bound for the OS version is rarely needed, + but it can be set on the command line via: -D__MAC_OS_X_VERSION_MAX_ALLOWED=10xx for + Mac OS X and __IPHONE_OS_VERSION_MAX_ALLOWED = 1xxx for iPhone. + + Examples: + + A function available in Mac OS X 10.5 and later, but not on the phone: + + extern void mymacfunc() __OSX_AVAILABLE_STARTING(__MAC_10_5,__IPHONE_NA); + + + An Objective-C method in Mac OS X 10.5 and later, but not on the phone: + + @interface MyClass : NSObject + -(void) mymacmethod __OSX_AVAILABLE_STARTING(__MAC_10_5,__IPHONE_NA); + @end + + + An enum available on the phone, but not available on Mac OS X: + + #if __IPHONE_OS_VERSION_MIN_REQUIRED + enum { myEnum = 1 }; + #endif + Note: this works when targeting the Mac OS X platform because + __IPHONE_OS_VERSION_MIN_REQUIRED is undefined which evaluates to zero. + + + An enum with values added in different iPhoneOS versions: + + enum { + myX = 1, // Usable on iPhoneOS 2.1 and later + myY = 2, // Usable on iPhoneOS 3.0 and later + myZ = 3, // Usable on iPhoneOS 3.0 and later + ... 
+ Note: you do not want to use #if with enumeration values + when a client needs to see all values at compile time + and use runtime logic to only use the viable values. + + + It is also possible to use the *_VERSION_MIN_REQUIRED in source code to make one + source base that can be compiled to target a range of OS versions. It is best + to not use the __MAC_* and __IPHONE_* macros for comparisons, but rather their values. + That is because you might get compiled on an old OS that does not define a later + OS version macro, and in the C preprocessor undefined values evaluate to zero + in expressions, which could cause the #if expression to evaluate in an unexpected + way. + + #ifdef __MAC_OS_X_VERSION_MIN_REQUIRED + // code only compiled when targeting Mac OS X and not iPhone + // note use of 1050 instead of __MAC_10_5 + #if __MAC_OS_X_VERSION_MIN_REQUIRED < 1050 + // code in here might run on pre-Leopard OS + #else + // code here can assume Leopard or later + #endif + #endif + + +*/ + +#define __MAC_10_0 1000 +#define __MAC_10_1 1010 +#define __MAC_10_2 1020 +#define __MAC_10_3 1030 +#define __MAC_10_4 1040 +#define __MAC_10_5 1050 +#define __MAC_10_6 1060 +#define __MAC_10_7 1070 +#define __MAC_NA 9999 /* not available */ + +#define __IPHONE_2_0 20000 +#define __IPHONE_2_1 20100 +#define __IPHONE_2_2 20200 +#define __IPHONE_3_0 30000 +#define __IPHONE_3_1 30100 +#define __IPHONE_3_2 30200 +#define __IPHONE_NA 99999 /* not available */ + +#include <AvailabilityInternal.h> + + +#ifdef __IPHONE_OS_VERSION_MIN_REQUIRED + #define __OSX_AVAILABLE_STARTING(_mac, _iphone) __AVAILABILITY_INTERNAL##_iphone + #define __OSX_AVAILABLE_BUT_DEPRECATED(_macIntro, _macDep, _iphoneIntro, _iphoneDep) \ + __AVAILABILITY_INTERNAL##_iphoneIntro##_DEP##_iphoneDep + +#elif defined(__MAC_OS_X_VERSION_MIN_REQUIRED) + #define __OSX_AVAILABLE_STARTING(_mac, _iphone) __AVAILABILITY_INTERNAL##_mac + #define __OSX_AVAILABLE_BUT_DEPRECATED(_macIntro, _macDep, _iphoneIntro, _iphoneDep) \ + 
__AVAILABILITY_INTERNAL##_macIntro##_DEP##_macDep + +#else + #define __OSX_AVAILABLE_STARTING(_mac, _iphone) + #define __OSX_AVAILABLE_BUT_DEPRECATED(_macIntro, _macDep, _iphoneIntro, _iphoneDep) +#endif + + +#endif /* __AVAILABILITY__ */ diff --git a/EXTERNAL_HEADERS/AvailabilityInternal.h b/EXTERNAL_HEADERS/AvailabilityInternal.h new file mode 100644 index 000000000..a4524708e --- /dev/null +++ b/EXTERNAL_HEADERS/AvailabilityInternal.h @@ -0,0 +1,393 @@ +/* + * Copyright (c) 2007-2010 by Apple Inc.. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_LICENSE_HEADER_END@ + */ + +/* + File: AvailabilityInternal.h + + Contains: implementation details of __OSX_AVAILABLE_* macros from <Availability.h> +*/ +#ifndef __AVAILABILITY_INTERNAL__ +#define __AVAILABILITY_INTERNAL__ + + + +#ifndef __IPHONE_OS_VERSION_MIN_REQUIRED + #ifdef __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ + /* compiler sets __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ when -miphoneos-version-min is used */ + #define __IPHONE_OS_VERSION_MIN_REQUIRED __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ + #endif +#endif + +#ifdef __IPHONE_OS_VERSION_MIN_REQUIRED + /* don't use visibility attribute for iPhoneOS */ + #define __AVAILABILITY_INTERNAL_DEPRECATED __attribute__((deprecated)) + #define __AVAILABILITY_INTERNAL_UNAVAILABLE __attribute__((unavailable)) + #define __AVAILABILITY_INTERNAL_WEAK_IMPORT __attribute__((weak_import)) + #define __AVAILABILITY_INTERNAL_REGULAR +#else + #define __AVAILABILITY_INTERNAL_DEPRECATED __attribute__((deprecated,visibility("default"))) + #define __AVAILABILITY_INTERNAL_UNAVAILABLE __attribute__((unavailable,visibility("default"))) + #define __AVAILABILITY_INTERNAL_WEAK_IMPORT __attribute__((weak_import,visibility("default"))) + #define __AVAILABILITY_INTERNAL_REGULAR __attribute__((visibility("default"))) +#endif + +#ifdef __IPHONE_OS_VERSION_MIN_REQUIRED + /* make sure a default max version is set */ + #ifndef __IPHONE_OS_VERSION_MAX_ALLOWED + #define __IPHONE_OS_VERSION_MAX_ALLOWED __IPHONE_3_2 + #endif + /* make sure a valid min is set */ + #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_0 + #undef __IPHONE_OS_VERSION_MIN_REQUIRED + #define __IPHONE_OS_VERSION_MIN_REQUIRED __IPHONE_2_0 + #endif + + /* set up internal macros (up to 2.0) */ + #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_2_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0 __AVAILABILITY_INTERNAL_WEAK_IMPORT + 
#else + #define __AVAILABILITY_INTERNAL__IPHONE_2_0 __AVAILABILITY_INTERNAL_REGULAR + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_2_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_2_0 __AVAILABILITY_INTERNAL_DEPRECATED + /* set up internal macros (up to 2.1) */ + #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_1 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_1 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_1 __AVAILABILITY_INTERNAL_REGULAR + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_2_1 + #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_2_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_2_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_2_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_2_1 __AVAILABILITY_INTERNAL_DEPRECATED + #endif + /* set up internal macros (up to 2.2) */ + #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_2 __AVAILABILITY_INTERNAL_REGULAR + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_2_2 + #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_2_2 
__AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL_DEPRECATED + #endif + /* set up internal macros (up to 3.0) */ + #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__IPHONE_3_0 __AVAILABILITY_INTERNAL_REGULAR + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_3_0 + #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_DEPRECATED + #endif + /* set up internal macros (up to 3.1) */ + #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_3_1 + #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define 
__AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define 
__AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_DEPRECATED + #endif + /* set up internal macros (up to 3.2) */ + #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_3_2 + #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #elif 
__IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define 
__AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_DEPRECATED + #endif + /* set up internal macros (n/a) */ + #define __AVAILABILITY_INTERNAL__IPHONE_NA __AVAILABILITY_INTERNAL_UNAVAILABLE + #define __AVAILABILITY_INTERNAL__IPHONE_NA_DEP__IPHONE_NA __AVAILABILITY_INTERNAL_UNAVAILABLE + +#elif defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) + /* compiler for Mac OS X sets __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ */ + #define __MAC_OS_X_VERSION_MIN_REQUIRED __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ + /* make sure a default max version is set */ + #ifndef __MAC_OS_X_VERSION_MAX_ALLOWED + #define __MAC_OS_X_VERSION_MAX_ALLOWED __MAC_10_7 + #endif + /* set up internal macros */ + #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_7 + #define __AVAILABILITY_INTERNAL__MAC_10_7 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_7 + #define __AVAILABILITY_INTERNAL__MAC_10_7 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__MAC_10_7 __AVAILABILITY_INTERNAL_REGULAR + #endif + #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_6 + #define __AVAILABILITY_INTERNAL__MAC_10_6 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_6 + #define __AVAILABILITY_INTERNAL__MAC_10_6 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__MAC_10_6 __AVAILABILITY_INTERNAL_REGULAR + #endif + #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_5 + #define __AVAILABILITY_INTERNAL__MAC_10_5 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __MAC_OS_X_VERSION_MIN_REQUIRED < 
__MAC_10_5 + #define __AVAILABILITY_INTERNAL__MAC_10_5 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__MAC_10_5 __AVAILABILITY_INTERNAL_REGULAR + #endif + #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_4 + #define __AVAILABILITY_INTERNAL__MAC_10_4 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_4 + #define __AVAILABILITY_INTERNAL__MAC_10_4 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__MAC_10_4 __AVAILABILITY_INTERNAL_REGULAR + #endif + #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_3 + #define __AVAILABILITY_INTERNAL__MAC_10_3 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_3 + #define __AVAILABILITY_INTERNAL__MAC_10_3 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__MAC_10_3 __AVAILABILITY_INTERNAL_REGULAR + #endif + #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_2 + #define __AVAILABILITY_INTERNAL__MAC_10_2 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_2 + #define __AVAILABILITY_INTERNAL__MAC_10_2 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__MAC_10_2 __AVAILABILITY_INTERNAL_REGULAR + #endif + #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_1 + #define __AVAILABILITY_INTERNAL__MAC_10_1 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_1 + #define __AVAILABILITY_INTERNAL__MAC_10_1 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__MAC_10_1 __AVAILABILITY_INTERNAL_REGULAR + #endif + #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_0 + #define __AVAILABILITY_INTERNAL__MAC_10_0 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_0 + #define __AVAILABILITY_INTERNAL__MAC_10_0 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__MAC_10_0 __AVAILABILITY_INTERNAL_REGULAR + #endif + #define 
__AVAILABILITY_INTERNAL__MAC_NA __AVAILABILITY_INTERNAL_UNAVAILABLE + #if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_1 + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_1 __AVAILABILITY_INTERNAL_DEPRECATED + #else + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_1 __AVAILABILITY_INTERNAL__MAC_10_0 + #endif + #if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_2 + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_2 __AVAILABILITY_INTERNAL_DEPRECATED + #else + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_2 __AVAILABILITY_INTERNAL__MAC_10_0 + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_2 __AVAILABILITY_INTERNAL__MAC_10_1 + #endif + #if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_3 + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_3 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_3 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_3 __AVAILABILITY_INTERNAL_DEPRECATED + #else + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_3 __AVAILABILITY_INTERNAL__MAC_10_0 + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_3 __AVAILABILITY_INTERNAL__MAC_10_1 + #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_3 __AVAILABILITY_INTERNAL__MAC_10_2 + #endif + #if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_4 + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_4 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_4 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_4 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_4 __AVAILABILITY_INTERNAL_DEPRECATED + #else + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_4 __AVAILABILITY_INTERNAL__MAC_10_0 + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_4 
__AVAILABILITY_INTERNAL__MAC_10_1 + #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_4 __AVAILABILITY_INTERNAL__MAC_10_2 + #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_4 __AVAILABILITY_INTERNAL__MAC_10_3 + #endif + #if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_5 + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_5 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_5 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_5 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_5 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_5 __AVAILABILITY_INTERNAL_DEPRECATED + #else + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_5 __AVAILABILITY_INTERNAL__MAC_10_0 + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_5 __AVAILABILITY_INTERNAL__MAC_10_1 + #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_5 __AVAILABILITY_INTERNAL__MAC_10_2 + #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_5 __AVAILABILITY_INTERNAL__MAC_10_3 + #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_5 __AVAILABILITY_INTERNAL__MAC_10_4 + #endif + #if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_6 + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_6 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_6 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_6 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_6 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_6 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_6 __AVAILABILITY_INTERNAL_DEPRECATED + #else + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_6 __AVAILABILITY_INTERNAL__MAC_10_0 + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_6 
__AVAILABILITY_INTERNAL__MAC_10_1 + #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_6 __AVAILABILITY_INTERNAL__MAC_10_2 + #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_6 __AVAILABILITY_INTERNAL__MAC_10_3 + #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_6 __AVAILABILITY_INTERNAL__MAC_10_4 + #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_6 __AVAILABILITY_INTERNAL__MAC_10_5 + #endif + #if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_7 + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_7 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_7 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_7 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_7 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_7 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_7 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_6_DEP__MAC_10_7 __AVAILABILITY_INTERNAL_DEPRECATED + #else + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_7 __AVAILABILITY_INTERNAL__MAC_10_0 + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_7 __AVAILABILITY_INTERNAL__MAC_10_1 + #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_7 __AVAILABILITY_INTERNAL__MAC_10_2 + #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_7 __AVAILABILITY_INTERNAL__MAC_10_3 + #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_7 __AVAILABILITY_INTERNAL__MAC_10_4 + #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_7 __AVAILABILITY_INTERNAL__MAC_10_5 + #define __AVAILABILITY_INTERNAL__MAC_10_6_DEP__MAC_10_7 __AVAILABILITY_INTERNAL__MAC_10_6 + #endif + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_0 + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_1 + #define 
__AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_2 + #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_3 + #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_4 + #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_5 + #define __AVAILABILITY_INTERNAL__MAC_10_6_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_6 + #define __AVAILABILITY_INTERNAL__MAC_10_7_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_7 + #define __AVAILABILITY_INTERNAL__MAC_NA_DEP__MAC_NA __AVAILABILITY_INTERNAL_UNAVAILABLE +#endif + +#endif /* __AVAILABILITY_INTERNAL__ */ diff --git a/EXTERNAL_HEADERS/AvailabilityMacros.h b/EXTERNAL_HEADERS/AvailabilityMacros.h new file mode 100644 index 000000000..02981bd13 --- /dev/null +++ b/EXTERNAL_HEADERS/AvailabilityMacros.h @@ -0,0 +1,820 @@ +/* + * Copyright (c) 2001-2010 by Apple Inc.. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_LICENSE_HEADER_END@ + */ + +/* + File: AvailabilityMacros.h + + More Info: See TechNote 2064 + + Contains: Autoconfiguration of AVAILABLE_ macros for Mac OS X + + This header enables a developer to specify build time + constraints on which Mac OS X versions the resulting + application will run on. There are two bounds a developer + can specify: + + MAC_OS_X_VERSION_MIN_REQUIRED + MAC_OS_X_VERSION_MAX_ALLOWED + + The lower bound controls which calls to OS functions will + be weak-imported (allowed to be unresolved at launch time). + The upper bound controls which OS functionality, if used, + will result in a compiler error because that functionality is + not available on any OS in the specified range. + + For example, suppose an application is compiled with: + + MAC_OS_X_VERSION_MIN_REQUIRED = MAC_OS_X_VERSION_10_2 + MAC_OS_X_VERSION_MAX_ALLOWED = MAC_OS_X_VERSION_10_3 + + and an OS header contains: + + extern void funcA(void) AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER; + extern void funcB(void) AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_2; + extern void funcC(void) AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3; + extern void funcD(void) AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER; + extern void funcE(void) AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER; + extern void funcF(void) AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER; + extern void funcG(void) AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER; + + typedef long TypeA DEPRECATED_IN_MAC_OS_X_VERSION_10_0_AND_LATER; + typedef long TypeB DEPRECATED_IN_MAC_OS_X_VERSION_10_1_AND_LATER; + typedef long TypeC DEPRECATED_IN_MAC_OS_X_VERSION_10_2_AND_LATER; + typedef long TypeD DEPRECATED_IN_MAC_OS_X_VERSION_10_3_AND_LATER; + typedef long TypeE DEPRECATED_IN_MAC_OS_X_VERSION_10_4_AND_LATER; + + Any application code which uses these declarations will get the following: + + compile link run + ------- ------ ------- + funcA: normal normal normal + funcB: 
warning normal normal + funcC: normal normal normal + funcD: normal normal normal + funcE: normal normal normal + funcF: normal weak on 10.3 normal, on 10.2 (&funcF == NULL) + funcG: error error n/a + TypeA: warning + TypeB: warning + TypeC: warning + TypeD: normal + TypeE: normal + + +*/ +#ifndef __AVAILABILITYMACROS__ +#define __AVAILABILITYMACROS__ + + +/* + * Set up standard Mac OS X versions + */ +#define MAC_OS_X_VERSION_10_0 1000 +#define MAC_OS_X_VERSION_10_1 1010 +#define MAC_OS_X_VERSION_10_2 1020 +#define MAC_OS_X_VERSION_10_3 1030 +#define MAC_OS_X_VERSION_10_4 1040 +#define MAC_OS_X_VERSION_10_5 1050 +#define MAC_OS_X_VERSION_10_6 1060 +#define MAC_OS_X_VERSION_10_7 1070 + + +/* + * If min OS not specified, assume 10.4 for ppc64, i386 and x86_64, and 10.1 for everything else + * Note: gcc driver may set __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ based on MACOSX_DEPLOYMENT_TARGET environment variable + */ +#ifndef MAC_OS_X_VERSION_MIN_REQUIRED + #ifdef __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ + #if (__i386__ || __x86_64__) && (__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < MAC_OS_X_VERSION_10_4) + #warning Building for Intel with Mac OS X Deployment Target < 10.4 is invalid. + #elif __ppc64__ && (__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < MAC_OS_X_VERSION_10_4) + #warning Building for ppc64 with Mac OS X Deployment Target < 10.4 is invalid. 
+ #endif + #define MAC_OS_X_VERSION_MIN_REQUIRED __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ + #else + #if __ppc64__ || __i386__ || __x86_64__ + #define MAC_OS_X_VERSION_MIN_REQUIRED MAC_OS_X_VERSION_10_4 + #else + #define MAC_OS_X_VERSION_MIN_REQUIRED MAC_OS_X_VERSION_10_1 + #endif + #endif +#endif + +/* + * if max OS not specified, assume largerof(10.7, min) + */ +#ifndef MAC_OS_X_VERSION_MAX_ALLOWED + #if MAC_OS_X_VERSION_MIN_REQUIRED > MAC_OS_X_VERSION_10_7 + #define MAC_OS_X_VERSION_MAX_ALLOWED MAC_OS_X_VERSION_MIN_REQUIRED + #else + #define MAC_OS_X_VERSION_MAX_ALLOWED MAC_OS_X_VERSION_10_7 + #endif +#endif + +/* + * Error on bad values + */ +#if MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_MIN_REQUIRED + #error MAC_OS_X_VERSION_MAX_ALLOWED must be >= MAC_OS_X_VERSION_MIN_REQUIRED +#endif +#if MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_0 + #error MAC_OS_X_VERSION_MIN_REQUIRED must be >= MAC_OS_X_VERSION_10_0 +#endif + +/* + * only certain compilers support __attribute__((weak_import)) + */ +#if defined(__GNUC__) && ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))) && (MAC_OS_X_VERSION_MIN_REQUIRED >= 1020) + #define WEAK_IMPORT_ATTRIBUTE __attribute__((weak_import)) +#elif defined(__MWERKS__) && (__MWERKS__ >= 0x3205) && (MAC_OS_X_VERSION_MIN_REQUIRED >= 1020) && !defined(__INTEL__) + #define WEAK_IMPORT_ATTRIBUTE __attribute__((weak_import)) +#else + #define WEAK_IMPORT_ATTRIBUTE +#endif + +/* + * only certain compilers support __attribute__((deprecated)) + */ +#if defined(__GNUC__) && ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))) + #define DEPRECATED_ATTRIBUTE __attribute__((deprecated)) +#else + #define DEPRECATED_ATTRIBUTE +#endif + +/* + * only certain compilers support __attribute__((unavailable)) + */ +#if defined(__GNUC__) && ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))) + #define UNAVAILABLE_ATTRIBUTE __attribute__((unavailable)) +#else + #define UNAVAILABLE_ATTRIBUTE +#endif + + + +/* + 
* AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER + * + * Used on functions introduced in Mac OS X 10.0 + */ +#define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED + * + * Used on functions introduced in Mac OS X 10.0, + * and deprecated in Mac OS X 10.0 + */ +#define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE + +/* + * DEPRECATED_IN_MAC_OS_X_VERSION_10_0_AND_LATER + * + * Used on types deprecated in Mac OS X 10.0 + */ +#define DEPRECATED_IN_MAC_OS_X_VERSION_10_0_AND_LATER DEPRECATED_ATTRIBUTE + + + + + + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER + * + * Used on declarations introduced in Mac OS X 10.1 + */ +#if MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_1 + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER UNAVAILABLE_ATTRIBUTE +#elif MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_1 + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER WEAK_IMPORT_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED + * + * Used on declarations introduced in Mac OS X 10.1, + * and deprecated in Mac OS X 10.1 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_1 + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_1 + * + * Used on declarations introduced in Mac OS X 10.0, + * but later deprecated in Mac OS X 10.1 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_1 + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_1 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_1 AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER 
+#endif + +/* + * DEPRECATED_IN_MAC_OS_X_VERSION_10_1_AND_LATER + * + * Used on types deprecated in Mac OS X 10.1 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_1 + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_1_AND_LATER DEPRECATED_ATTRIBUTE +#else + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_1_AND_LATER +#endif + + + + + + + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER + * + * Used on declarations introduced in Mac OS X 10.2 + */ +#if MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_2 + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER UNAVAILABLE_ATTRIBUTE +#elif MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_2 + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER WEAK_IMPORT_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED + * + * Used on declarations introduced in Mac OS X 10.2, + * and deprecated in Mac OS X 10.2 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_2 + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_2 + * + * Used on declarations introduced in Mac OS X 10.0, + * but later deprecated in Mac OS X 10.2 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_2 + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_2 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_2 AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_2 + * + * Used on declarations introduced in Mac OS X 10.1, + * but later deprecated in Mac OS X 10.2 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= 
MAC_OS_X_VERSION_10_2 + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_2 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_2 AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER +#endif + +/* + * DEPRECATED_IN_MAC_OS_X_VERSION_10_2_AND_LATER + * + * Used on types deprecated in Mac OS X 10.2 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_2 + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_2_AND_LATER DEPRECATED_ATTRIBUTE +#else + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_2_AND_LATER +#endif + + + + + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER + * + * Used on declarations introduced in Mac OS X 10.3 + */ +#if MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_3 + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER UNAVAILABLE_ATTRIBUTE +#elif MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_3 + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER WEAK_IMPORT_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED + * + * Used on declarations introduced in Mac OS X 10.3, + * and deprecated in Mac OS X 10.3 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_3 + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3 + * + * Used on declarations introduced in Mac OS X 10.0, + * but later deprecated in Mac OS X 10.3 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_3 + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3 
AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3 + * + * Used on declarations introduced in Mac OS X 10.1, + * but later deprecated in Mac OS X 10.3 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_3 + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3 AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3 + * + * Used on declarations introduced in Mac OS X 10.2, + * but later deprecated in Mac OS X 10.3 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_3 + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3 AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER +#endif + +/* + * DEPRECATED_IN_MAC_OS_X_VERSION_10_3_AND_LATER + * + * Used on types deprecated in Mac OS X 10.3 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_3 + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_3_AND_LATER DEPRECATED_ATTRIBUTE +#else + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_3_AND_LATER +#endif + + + + + + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER + * + * Used on declarations introduced in Mac OS X 10.4 + */ +#if MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_4 + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER UNAVAILABLE_ATTRIBUTE +#elif MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_4 + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER WEAK_IMPORT_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED + * + * Used on declarations introduced in Mac OS X 10.4, + * and 
deprecated in Mac OS X 10.4 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_4 + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 + * + * Used on declarations introduced in Mac OS X 10.0, + * but later deprecated in Mac OS X 10.4 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_4 + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 + * + * Used on declarations introduced in Mac OS X 10.1, + * but later deprecated in Mac OS X 10.4 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_4 + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 + * + * Used on declarations introduced in Mac OS X 10.2, + * but later deprecated in Mac OS X 10.4 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_4 + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 + * + * Used on declarations introduced in 
Mac OS X 10.3, + * but later deprecated in Mac OS X 10.4 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_4 + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER +#endif + +/* + * DEPRECATED_IN_MAC_OS_X_VERSION_10_4_AND_LATER + * + * Used on types deprecated in Mac OS X 10.4 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_4 + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_4_AND_LATER DEPRECATED_ATTRIBUTE +#else + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_4_AND_LATER +#endif + + + + + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER + * + * Used on declarations introduced in Mac OS X 10.5 + */ +#if MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_5 + #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER UNAVAILABLE_ATTRIBUTE +#elif MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_5 + #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER WEAK_IMPORT_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED + * + * Used on declarations introduced in Mac OS X 10.5, + * and deprecated in Mac OS X 10.5 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_5 + #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 + * + * Used on declarations introduced in Mac OS X 10.0, + * but later deprecated in Mac OS X 10.5 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_5 + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 DEPRECATED_ATTRIBUTE +#else + #define 
AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 + * + * Used on declarations introduced in Mac OS X 10.1, + * but later deprecated in Mac OS X 10.5 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_5 + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 + * + * Used on declarations introduced in Mac OS X 10.2, + * but later deprecated in Mac OS X 10.5 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_5 + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 + * + * Used on declarations introduced in Mac OS X 10.3, + * but later deprecated in Mac OS X 10.5 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_5 + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 + * + * Used on declarations introduced in Mac OS X 10.4, + * but later deprecated in Mac OS X 10.5 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_5 + #define 
AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER +#endif + +/* + * DEPRECATED_IN_MAC_OS_X_VERSION_10_5_AND_LATER + * + * Used on types deprecated in Mac OS X 10.5 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_5 + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_5_AND_LATER DEPRECATED_ATTRIBUTE +#else + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_5_AND_LATER +#endif + + + + + + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + * + * Used on declarations introduced in Mac OS X 10.6 + */ +#if MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_6 + #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER UNAVAILABLE_ATTRIBUTE +#elif MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_6 + #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER WEAK_IMPORT_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED + * + * Used on declarations introduced in Mac OS X 10.6, + * and deprecated in Mac OS X 10.6 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 + #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 + * + * Used on declarations introduced in Mac OS X 10.0, + * but later deprecated in Mac OS X 10.6 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER +#endif + +/* + * 
AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 + * + * Used on declarations introduced in Mac OS X 10.1, + * but later deprecated in Mac OS X 10.6 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 + * + * Used on declarations introduced in Mac OS X 10.2, + * but later deprecated in Mac OS X 10.6 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 + * + * Used on declarations introduced in Mac OS X 10.3, + * but later deprecated in Mac OS X 10.6 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 + * + * Used on declarations introduced in Mac OS X 10.4, + * but later deprecated in Mac OS X 10.6 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 DEPRECATED_ATTRIBUTE +#else + #define 
AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 + * + * Used on declarations introduced in Mac OS X 10.5, + * but later deprecated in Mac OS X 10.6 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 + #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER +#endif + +/* + * DEPRECATED_IN_MAC_OS_X_VERSION_10_6_AND_LATER + * + * Used on types deprecated in Mac OS X 10.6 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_6_AND_LATER DEPRECATED_ATTRIBUTE +#else + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_6_AND_LATER +#endif + + + + + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + * + * Used on declarations introduced in Mac OS X 10.7 + */ +#if MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_7 + #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER UNAVAILABLE_ATTRIBUTE +#elif MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_7 + #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER WEAK_IMPORT_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED + * + * Used on declarations introduced in Mac OS X 10.7, + * and deprecated in Mac OS X 10.7 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 + #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 + * + * Used on declarations introduced in 
Mac OS X 10.0, + * but later deprecated in Mac OS X 10.7 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 + * + * Used on declarations introduced in Mac OS X 10.1, + * but later deprecated in Mac OS X 10.7 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 + * + * Used on declarations introduced in Mac OS X 10.2, + * but later deprecated in Mac OS X 10.7 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 + * + * Used on declarations introduced in Mac OS X 10.3, + * but later deprecated in Mac OS X 10.7 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER +#endif + +/* + * 
AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 + * + * Used on declarations introduced in Mac OS X 10.4, + * but later deprecated in Mac OS X 10.7 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 + * + * Used on declarations introduced in Mac OS X 10.5, + * but later deprecated in Mac OS X 10.7 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 + #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 + * + * Used on declarations introduced in Mac OS X 10.6, + * but later deprecated in Mac OS X 10.7 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 + #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER +#endif + +/* + * DEPRECATED_IN_MAC_OS_X_VERSION_10_7_AND_LATER + * + * Used on types deprecated in Mac OS X 10.7 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_7_AND_LATER DEPRECATED_ATTRIBUTE +#else + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_7_AND_LATER +#endif + +#endif /* __AVAILABILITYMACROS__ */ + + diff --git a/EXTERNAL_HEADERS/Makefile b/EXTERNAL_HEADERS/Makefile index 
9f8e3535b..46ee40f90 100644 --- a/EXTERNAL_HEADERS/Makefile +++ b/EXTERNAL_HEADERS/Makefile @@ -11,9 +11,6 @@ INSTINC_SUBDIRS = \ architecture \ mach-o -INSTINC_SUBDIRS_PPC = \ - architecture - INSTINC_SUBDIRS_I386 = \ architecture @@ -23,8 +20,12 @@ INSTINC_SUBDIRS_X86_64 = \ INSTINC_SUBDIRS_ARM = \ architecture + EXPORT_FILES = \ AppleSecureBootEpoch.h \ + Availability.h \ + AvailabilityInternal.h \ + AvailabilityMacros.h \ ar.h \ stdarg.h \ stdbool.h \ diff --git a/EXTERNAL_HEADERS/architecture/Makefile b/EXTERNAL_HEADERS/architecture/Makefile index 8c929ba14..a322a080f 100644 --- a/EXTERNAL_HEADERS/architecture/Makefile +++ b/EXTERNAL_HEADERS/architecture/Makefile @@ -9,9 +9,6 @@ include $(MakeInc_def) INSTINC_SUBDIRS = -INSTINC_SUBDIRS_PPC = \ - ppc - INSTINC_SUBDIRS_I386 = \ i386 @@ -21,6 +18,7 @@ INSTINC_SUBDIRS_X86_64 = \ INSTINC_SUBDIRS_ARM = \ arm + EXPORT_FILES = INSTALL_MI_LIST = diff --git a/EXTERNAL_HEADERS/architecture/ppc/Makefile b/EXTERNAL_HEADERS/architecture/ppc/Makefile deleted file mode 100644 index 374f3bd9a..000000000 --- a/EXTERNAL_HEADERS/architecture/ppc/Makefile +++ /dev/null @@ -1,33 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -INSTINC_SUBDIRS_PPC = - -EXPORT_FILES = \ - asm_help.h \ - basic_regs.h \ - cframe.h \ - fp_regs.h \ - macro_help.h \ - pseudo_inst.h \ - reg_help.h - - -INSTALL_MD_LIST = - -INSTALL_MD_DIR = - -EXPORT_MD_LIST = ${EXPORT_FILES} - -EXPORT_MD_DIR = architecture/ppc - -include $(MakeInc_rule) -include $(MakeInc_dir) - - diff --git a/EXTERNAL_HEADERS/architecture/ppc/asm_help.h b/EXTERNAL_HEADERS/architecture/ppc/asm_help.h deleted file mode 100644 index 0ff2171c4..000000000 --- a/EXTERNAL_HEADERS/architecture/ppc/asm_help.h +++ /dev/null @@ -1,456 +0,0 @@ -/* - * Copyright (c) 2000 Apple 
Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1996 NeXT Software, Inc. All rights reserved. - * - * File: architecture/ppc/asm_help.h - * Author: Mike DeMoney, NeXT Software, Inc. - * - * This header file defines macros useful when writing assembly code - * for the PowerPC processors. - * r12 is used as the tmp register / PICIFY base. - * - * HISTORY - * 20-May-97 Umesh Vaishampayan (umeshv@apple.com) - * Implemented Dynamic / PIC macros. - * - * 28-Dec-96 Umesh Vaishampayan (umeshv@NeXT.com) - * added ".align" directive to various macros to avoid alignment - * faults. Moved Register Usage #defines to reg_help.h as that's - * where they should have been in the first place. 
- * Added Dynamic / PIC macroes for routines which refernce external - * symbols. Not implemented fully as yet. - * - * 05-Nov-92 Mike DeMoney (mike@next.com) - * Created. - */ - -#ifndef _ARCH_PPC_ASM_HELP_H_ -#define _ARCH_PPC_ASM_HELP_H_ - -#include - -#ifdef __ASSEMBLER__ -/* - * ppc stack frames look like this after procedure prolog has - * been executed: - * - * Higher address: - * ......... - * +-------------------------------+ - * | caller's LR | - * +-------------------------------+ - * | caller's CR | - * +-------------------------------+ - * Caller's SP->| caller's caller's sp | ^^ Caller's Frame ^^ - * +===============================+ vv Called Rtn Frame vv - * | Save Area for | FPF 31 - * .......... - * | Caller's FPF's | FPF n - * +-------------------------------+ - * | Save Area for | GRF 31 - * .......... - * | Caller's GRF's | GRF n - * +-------------------------------+ - * | alignment pad | - * ............ - * | (if necessary) | - * +-------------------------------+ - * | Local | - * ........ - * | Variables | - * +-------------------------------+ - * SP + X -> | aN for FUTURE call | - * +-------------------------------+ - * .......... - * +-------------------------------+ - * SP + 28 -> | a1 for FUTURE call | - * +-------------------------------+ - * SP + 24 -> | a0 for FUTURE call | - * +-------------------------------+ - * SP + 20 -> | caller's TOC | - * +-------------------------------+ - * SP + 16 -> | reserved | - * +-------------------------------+ - * SP + 12 -> | reserved | - * +-------------------------------+ - * SP + 8 -> | LR callee-save for FUTURE call| - * +-------------------------------+ - * SP + 4 -> | CR callee-save for FUTURE call| - * +-------------------------------+ - * SP -> | caller's sp | - * +===============================+ - * Lower address: - * - * NOTE: All state with the exception of LR and CR are saved in the - * called routines frame. LR and CR are saved in the CALLER'S FRAME. 
- * - * ALSO NOTE: Args to the called routine are found in the caller's frame. - */ - -/* - * ARG(n) -- stack offset to n'th argument - * - * NOTE CAREFULLY! These macros start numbering arguments at 1 (NOT 0) - * The first argument is ARG(1). - * - * ALSO NOTE: This stack offset is only valid if using routine - * DOES NOT alter SP. - * - */ -#define ARG(n) ((((n) - 1) * 4) + 24) - -/* - * Macros for building stack frame according to C calling conventions. - * lr, cr, and sp are saved. - * - * NOTE WELL: localvarsize is in bytes, maxargsout is a count of words, - * grfsaved and fpfsaved is a count of registers. BE SURE TO COUNT - * BOTH FP (r31) AND sN REGISTERS IN THE COUNT OF GRF REGISTERS SAVED! - * This will be TWO more than the N of the highest sN register you - * save: s2 implies you are saving s2, s1, s0, and fp => grfsaved - * should be 4! - * - * FURTHER NOTE: These macros do NOT SAVE GRF or FPF registers. User - * must do that. GRF sN regs should be saved via - * stmw sN,SAVED_GRF_S(N)(sp) - * where N is the highest numbered s* register to be saved. E.g. if - * s0, s1, and s2 are to be saved use: - * stmw s2,SAVED_GRF_S(2)(sp) - * Note that this also saves fp. - * An individual saved grf can be loaded via: - * lwz s2,SAVED_GRF_S(2)(sp) - * Analogous stuff works for fpf's. - * - * NOTE: these simple routines will be replaced with more complicated - * ones once we know what the linker and gdb will require as for as - * register use masks and frame declarations. - * - * Warning: ROUND_TO_STACK is only to be used in assembly language; - * for C usage, use ROUND_FRAME() in reg_help.h. 
- */ -#define ROUND_TO_STACK(len) \ - (((len) + STACK_INCR - 1) / STACK_INCR * STACK_INCR) - -#define BUILD_FRAME(localvarsize, maxargsout, grfsaved, fpfsaved) \ - .set __argoutsize, ROUND_TO_STACK((maxargsout) * 4) @\ - .if __argoutsize < 32 @\ - .set __argoutsize,32 @\ - .endif @\ - .set __framesize, ROUND_TO_STACK( \ - 24 + __argoutsize + (localvarsize) \ - + 4*(grfsaved) + 8*(fpfsaved)) @\ - .set __grfbase,(__framesize - 4*(grfsaved) - 8*(fpfsaved)) @\ - .set __fpfbase,(__framesize - 8*(fpfsaved)) @\ - mflr r0 @\ - mfcr r12 @\ - stw r0,8(sp) @\ - stw r12,4(sp) @\ - stwu r1,-__framesize(r1) - -/* - * Macros for referencing data in stack frame. - * - * NOTE WELL: ARG's and VAR's start at 1, NOT 0. Why ??? (FIXME) - */ -#define LOCAL_VAR(n) (((n)-1)*4 + __argoutsize + 24) -#define SAVED_GRF_S(n) (__grfbase + ((grfsaved) - (n) - 2) * 4) -#define SAVED_FRF_FS(n) (__fpfbase + ((fpfsaved) - (n) - 1) * 4) -#define ARG_IN(n) (ARG(n) + __framesize) -#define ARG_OUT(n) (ARG(n) + 0) -#define SAVED_FP (__grfbase + ((grfsaved) - 1) * 4) -#define SAVED_LR (__framesize + 8) -#define SAVED_CR (__framesize + 4) - -/* - * Macros for unwinding stack frame. - * NOTE: GRF's and FPF's are NOT RESTORED. User must do this before - * using this macro. - */ -#define RETURN \ - .if __framesize @\ - lwz32 r0,r1,SAVED_LR @\ - lwz32 r12,r1,SAVED_CR @\ - addic sp,r1,__framesize @\ - mtlr r0 @\ - mtcrf 0xff,r12 @\ - blr @\ - .else @\ - blr @\ - .endif - - -/* - * Macros for declaring procedures - * - * Use of these macros allows ctags to have a predictable way - * to find various types of declarations. They also simplify - * inserting appropriate symbol table information. - * - * NOTE: these simple stubs will be replaced with more - * complicated versions once we know what the linker and gdb - * will require as far as register use masks and frame declarations. - * These macros may also be ifdef'ed in the future to contain profiling - * code. 
- * - * FIXME: Document what makes a leaf a LEAF and a handler a HANDLER. - * (E.g. leaf's have return pc in lr, NESTED's have rpc in offset off - * sp, handlers have rpc in exception frame which is found via exception - * link, etc etc.) - */ - -/* - * TEXT -- declare start of text segment - */ -#define TEXT \ - .text @\ - .align 2 - -/* - * LEAF -- declare global leaf procedure - * NOTE: Control SHOULD NOT FLOW into a LEAF! A LEAF should only - * be jumped to. (A leaf may do an align.) Use a LABEL() if you - * need control to flow into the label. - */ -#define LEAF(name) \ - .align 2 @\ - .globl name @\ -name: @\ - .set __framesize,0 - -/* - * X_LEAF -- declare alternate global label for leaf - */ -#define X_LEAF(name, value) \ - .globl name @\ - .set name,value - -/* - * P_LEAF -- declare private leaf procedure - */ -#define P_LEAF(name) \ - .align 2 @\ -name: @\ - .set __framesize,0 - -/* - * LABEL -- declare a global code label - * MUST be used (rather than LEAF, NESTED, etc) if control - * "flows into" the label. 
- */ -#define LABEL(name) \ - .align 2 @\ - .globl name @\ -name: - -/* - * NESTED -- declare procedure that invokes other procedures - */ -#define NESTED(name, localvarsize, maxargsout, grfsaved, fpfsaved)\ - .align 2 @\ - .globl name @\ -name: @\ - BUILD_FRAME(localvarsize, maxargsout, grfsaved, fpfsaved) - -/* - * X_NESTED -- declare alternate global label for nested proc - */ -#define X_NESTED(name, value) \ - .globl name @\ - .set name,value - -/* - * P_NESTED -- declare private nested procedure - */ -#define P_NESTED(name, localvarsize, maxargsout, grfsaved, fpfsaved)\ - .align 2 @\ -name: @\ - BUILD_FRAME(locavarsize, maxargsout, grfsaved, fpfsaved) - -/* - * HANDLER -- declare procedure with exception frame rather than - * standard C frame - */ -#define HANDLER(name) \ - .align 2 @\ - .globl name @\ -name: - -/* - * X_HANDLER -- declare alternate name for exception handler - * (Should appear immediately before a HANDLER declaration or - * another X_HANDLER declaration) - */ -#define X_HANDLER(name) \ - .align 2 @\ - .globl name @\ -name: - -/* - * P_HANDLER -- declare private handler - */ -#define P_HANDLER(name) \ - .align 2 @\ -name: - -/* - * END -- mark end of procedure - * FIXME: Unimplemented for now. 
- */ -#define END(name) - -/* - * BL -- call procedure (relative) - */ -#define BL(name) \ - bl name - -/* - * Storage definition macros - * The main purpose of these is to allow an easy handle for ctags - */ - -/* - * IMPORT -- import symbol - */ -#define IMPORT(name) \ - .reference name - -/* - * ABS -- declare global absolute symbol - */ -#define ABS(name, value) \ - .globl name @\ - .set name,value - -/* - * P_ABS -- declare private absolute symbol - */ -#define P_ABS(name, value) \ - .set name,value - -/* - * EXPORT -- declare global label for data - */ -#define EXPORT(name) \ - .align 2 @\ - .globl name @\ -name: - -/* - * BSS -- declare global zero'ed storage - */ -#define BSS(name,size) \ - .comm name,size - - -/* - * P_BSS -- declare private zero'ed storage - */ -#define P_BSS(name,size) \ - .lcomm name,size - -/* - * dynamic/PIC macros for routines which reference external symbols - */ -#if defined(__DYNAMIC__) -#define PICIFY_REG r12 - -/* Assume that the lr is saved before calling any of these macros */ -/* using PICIFY() */ - -#define PICIFY(var) \ - mflr r0 @\ - bl 1f @\ -1: mflr PICIFY_REG @\ - mtlr r0 @\ - addis PICIFY_REG, PICIFY_REG, ha16(L ## var ## $non_lazy_ptr - 1b) @\ - lwz PICIFY_REG, lo16(L ## var ## $non_lazy_ptr - 1b)(PICIFY_REG) - -#define CALL_EXTERN_AGAIN(var) \ - PICIFY(var) @\ - mtctr PICIFY_REG @\ - mflr r0 @\ - stw r0,8(r1) @\ - stwu r1,-56(r1) @\ - bctrl @\ - addic r1,r1,56 @\ - lwz r0,8(r1) @\ - mtlr r0 - -#define NON_LAZY_STUB(var) \ - .non_lazy_symbol_pointer @\ - .align 2 @\ -L ## var ## $non_lazy_ptr: @\ - .indirect_symbol var @\ - .long 0 @\ - .text @\ - .align 2 - -#define BRANCH_EXTERN(var) \ - PICIFY(var) @\ - mtctr PICIFY_REG @\ - bctr @\ - NON_LAZY_STUB(var) - -#define CALL_EXTERN(var) \ - CALL_EXTERN_AGAIN(var) @\ - NON_LAZY_STUB(var) - -#define REG_TO_EXTERN(reg, var) \ - PICIFY(var) @\ - stw reg, 0(PICIFY_REG) @\ - NON_LAZY_STUB(var) - -#define EXTERN_TO_REG(reg, var) \ - PICIFY(var) @\ - lwz reg, 0(PICIFY_REG) @\ - 
NON_LAZY_STUB(var) - -#else /* ! __DYNAMIC__ */ -#define TMP_REG r12 -#define BRANCH_EXTERN(var) \ - b var - -#define CALL_EXTERN(var) \ - bl var - -#define CALL_EXTERN_AGAIN(var) \ - CALL_EXTERN(var) - -#define REG_TO_EXTERN(reg, var) \ - lis TMP_REG, ha16(var) @\ - stw reg, lo16(var)(TMP_REG) - -#define EXTERN_TO_REG(reg, var) \ - lis reg, ha16(var) @\ - lwz reg, lo16(var)(reg) - -#endif /* __DYNAMIC__ */ - -#endif /* __ASSEMBLER__ */ -#endif /* _ARCH_PPC_ASM_HELP_H_ */ diff --git a/EXTERNAL_HEADERS/architecture/ppc/basic_regs.h b/EXTERNAL_HEADERS/architecture/ppc/basic_regs.h deleted file mode 100644 index b9dbdf699..000000000 --- a/EXTERNAL_HEADERS/architecture/ppc/basic_regs.h +++ /dev/null @@ -1,306 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1996 NeXT Software, Inc. All rights reserved. - * - * File: architecture/ppc/basic_regs.h - * Author: Doug Mitchell, NeXT Software, Inc. - * - * Basic ppc registers. - * - * HISTORY - * 22-May-97 Umesh Vaishampayan (umeshv@apple.com) - Updated to match MPCFPE32B/AD 1/97 REV. 1 - * 29-Dec-96 Umesh Vaishampayan (umeshv@NeXT.com) - * Ported from m98k. - * 05-Nov-92 Doug Mitchell at NeXT - * Created. - */ - -#ifndef _ARCH_PPC_BASIC_REGS_H_ -#define _ARCH_PPC_BASIC_REGS_H_ - -#include -#include - -#if !defined(__ASSEMBLER__) - -/* - * Number of General Purpose registers. - */ -#define PPC_NGP_REGS 32 - -/* - * Common half-word used in Machine State Register and in - * various exception frames. Defined as a macro because the compiler - * will align a struct to a word boundary when used inside another struct. - */ -#define MSR_BITS \ - unsigned ee:BIT_WIDTH(15), /* external intr enable */ \ - pr:BIT_WIDTH(14), /* problem state */ \ - fp:BIT_WIDTH(13), /* floating point avail */ \ - me:BIT_WIDTH(12), /* machine check enable */ \ - fe0:BIT_WIDTH(11), /* fp exception mode 0 */ \ - se:BIT_WIDTH(10), /* single step enable */ \ - be:BIT_WIDTH(9), /* branch trace enable */ \ - fe1:BIT_WIDTH(8), /* fp exception mode 0 */ \ - rsvd1:BIT_WIDTH(7), /* reserved */ \ - ip:BIT_WIDTH(6), /* interrupt prefix */ \ - ir:BIT_WIDTH(5), /* instruction relocate */ \ - dr:BIT_WIDTH(4), /* data relocate */ \ - rsvd2:BITS_WIDTH(3,2), /* reserved */ \ - ri:BIT_WIDTH(1), /* recoverable exception */ \ - le:BIT_WIDTH(0) /* Little-endian mode */ - -/* - * Machine state register. - * Read and written via get_msr() and set_msr() inlines, below. 
- */ -typedef struct { - unsigned rsvd3:BITS_WIDTH(31,19), // reserved - pow:BIT_WIDTH(18), // Power management enable - rsvd0: BIT_WIDTH(17), // reserved - ile: BIT_WIDTH(16); // exception little endian - - MSR_BITS; // see above -} msr_t; - -/* - * Data Storage Interrupt Status Register (DSISR) - */ -typedef struct { - unsigned dse:BIT_WIDTH(31); // direct-store error - unsigned tnf:BIT_WIDTH(30); // translation not found - unsigned :BITS_WIDTH(29,28); - unsigned pe:BIT_WIDTH(27); // protection error - unsigned dsr:BIT_WIDTH(26); // lwarx/stwcx to direct-store - unsigned rw:BIT_WIDTH(25); // 1 => store, 0 => load - unsigned :BITS_WIDTH(24,23); - unsigned dab:BIT_WIDTH(22); // data address bkpt (601) - unsigned ssf:BIT_WIDTH(21); // seg table search failed - unsigned :BITS_WIDTH(20,0); -} dsisr_t; - -/* - * Instruction Storage Interrupt Status Register (really SRR1) - */ -typedef struct { - unsigned :BIT_WIDTH(31); - unsigned tnf:BIT_WIDTH(30); // translation not found - unsigned :BIT_WIDTH(29); - unsigned dse:BIT_WIDTH(28); // direct-store fetch error - unsigned pe:BIT_WIDTH(27); // protection error - unsigned :BITS_WIDTH(26,22); - unsigned ssf:BIT_WIDTH(21); // seg table search failed - unsigned :BITS_WIDTH(20,16); - MSR_BITS; -} isisr_t; - -/* - * Alignment Interrupt Status Register (really DSISR) - * NOTE: bit numbers in field *names* are in IBM'ese (0 is MSB). - * FIXME: Yuck!!! Double Yuck!!! 
- */ -typedef struct { - unsigned :BITS_WIDTH(31,20); - unsigned ds3031:BITS_WIDTH(19,18);// bits 30:31 if DS form - unsigned :BIT_WIDTH(17); - unsigned x2930:BITS_WIDTH(16,15); // bits 29:30 if X form - unsigned x25:BIT_WIDTH(14); // bit 25 if X form or - // bit 5 if D or DS form - unsigned x2124:BITS_WIDTH(13,10); // bits 21:24 if X form or - // bits 1:4 if D or DS form - unsigned all615:BITS_WIDTH(9,0); // bits 6:15 of instr - MSR_BITS; -} aisr_t; - -/* - * Program Interrupt Status Register (really SRR1) - */ -typedef struct { - unsigned :BITS_WIDTH(31,21); - unsigned fpee:BIT_WIDTH(20); // floating pt enable exception - unsigned ill:BIT_WIDTH(19); // illegal instruction - unsigned priv:BIT_WIDTH(18); // privileged instruction - unsigned trap:BIT_WIDTH(17); // trap program interrupt - unsigned subseq:BIT_WIDTH(16); // 1 => SRR0 points to - // subsequent instruction - MSR_BITS; -} pisr_t; - -/* - * Condition register. May not be useful in C, let's see... - */ -typedef struct { - unsigned lt:BIT_WIDTH(31), // negative - gt:BIT_WIDTH(30), // positive - eq:BIT_WIDTH(29), // equal to zero - so:BIT_WIDTH(28), // summary overflow - fx:BIT_WIDTH(27), // floating point exception - fex:BIT_WIDTH(26), // fp enabled exception - vx:BIT_WIDTH(25), // fp invalid operation - // exception - ox:BIT_WIDTH(24), // fp overflow exception - rsvd:BITS_WIDTH(23,0); // reserved -} cr_t; - -/* - * Abstract values representing fe0:fe1. - * See get_fp_exc_mode(), below. - */ -typedef enum { - FEM_IGNORE_EXCEP, // ignore exceptions - FEM_IMPR_NONREC, // imprecise nonrecoverable - FEM_IMPR_RECOV, // imprecise recoverable - FEM_PRECISE -} fp_exc_mode_t; - - -/* - * Special purpose registers. - */ - -/* - * Processor version register (special purpose register pvr). 
- */ -typedef struct { - unsigned version:BITS_WIDTH(31,16), - revision:BITS_WIDTH(15,0); -} pvr_t; - -/* - * Fixed point exception register (special purpose register xer) - */ -typedef struct { - unsigned so:BIT_WIDTH(31), // summary overflow - ov:BIT_WIDTH(30), // overflow - ca:BIT_WIDTH(29), // carry - rsvd1:BITS_WIDTH(28,7), // reserved - byte_count:BITS_WIDTH(6,0); -} xer_t; - -/* - * Inlines and macros to manipulate the above registers. - */ - -/* - * Get/set machine state register. - */ -static __inline__ msr_t -get_msr() -{ - msr_t __msr_tmp; - __asm__ volatile ("mfmsr %0 /* mfmsr */" : "=r" (__msr_tmp)); - return __msr_tmp; -} - -static __inline__ void -set_msr(msr_t msr) -{ - __asm__ volatile ("mtmsr %0 /* mtmsr */ " : : "r" (msr)); -} - -/* - * Determine current fp_exc_mode_t given prog_mode. - */ -static __inline__ fp_exc_mode_t -get_fp_exc_mode(pmr_t pmr) -{ - if(pmr.fe0) - return pmr.fe1 ? FEM_PRECISE : FEM_IMPR_RECOV; - else - return pmr.fe1 ? FEM_IMPR_NONREC : FEM_IGNORE_EXCEP; -} - -/* - * Software definitions for special purpose registers. - * The same register is used as per_cpu data pointer and - * vector base register. This requires that the vector - * table be the first item in the per_cpu table. - */ -#define SR_EXCEPTION_TMP_LR sprg0 -#define SR_EXCEPTION_TMP_CR sprg1 -#define SR_EXCEPTION_TMP_AT sprg2 -#define SR_PER_CPU_DATA sprg3 -#define SR_VBR sprg3 - -/* - * Get/set special purpose registers. - * - * GET_SPR - get SPR by name. - * - * Example usage: - * - * { - * xer_t some_xer; - * - * some_xer = GET_SPR(xer_t, xer); - * ... - * } - * - * This is a strange one. We're creating a list of C expressions within - * a set of curlies; the last expression ("__spr_tmp;") is the return value - * of the statement created by the curlies. 
- * - */ - -#define GET_SPR(type, spr) \ -({ \ - unsigned __spr_tmp; \ - __asm__ volatile ("mfspr %0, " STRINGIFY(spr) : "=r" (__spr_tmp)); \ - *(type *)&__spr_tmp; \ -}) - -/* - * Example usage of SET_SPR: - * - * { - * xer_t some_xer; - * - * ...set up some_xer... - * SET_SPR(xer, some_xer); - * } - */ -#define SET_SPR(spr, val) \ -MACRO_BEGIN \ - __typeof__ (val) __spr_tmp = (val); \ - __asm__ volatile ("mtspr "STRINGIFY(spr) ", %0" : : "r" (__spr_tmp)); \ -MACRO_END - -/* - * Fully synchronize instruction stream. - */ -static __inline__ void -ppc_sync() -{ - __asm__ volatile ("sync /* sync */" : : ); -} - -#endif /* ! __ASSEMBLER__ */ - -#endif /* _ARCH_PPC_BASIC_REGS_H_ */ - diff --git a/EXTERNAL_HEADERS/architecture/ppc/fp_regs.h b/EXTERNAL_HEADERS/architecture/ppc/fp_regs.h deleted file mode 100644 index ab48b8821..000000000 --- a/EXTERNAL_HEADERS/architecture/ppc/fp_regs.h +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1996 NeXT Software, Inc. All rights reserved. - * - * File: architecture/ppc/fp_regs.h - * Author: Doug Mitchell, NeXT Software, Inc. - * - * ppc floating point registers. - * - * HISTORY - * 29-Dec-96 Umesh Vaishampayan (umeshv@NeXT.com) - * Ported from m98k. - * 05-Nov-92 Doug Mitchell at NeXT - * Created. - */ - -#ifndef _ARCH_PPC_FP_REGS_H_ -#define _ARCH_PPC_FP_REGS_H_ - -#include - -#if !defined(__ASSEMBLER__) -/* - * Floating point status and control register. - * - * This struct is aligned to an 8-byte boundary because 64-bit - * load/store instructions (lfd/stfd) are used to access it. The - * FPSCR can only be read/written through other FP registers. 
- */ -typedef struct { - unsigned unused[1] __attribute__(( aligned(8) )); - unsigned fx:BIT_WIDTH(31), // exception summary - fex:BIT_WIDTH(30), // enabled exception summary - vx:BIT_WIDTH(29), // invalid op exception - // summary - ox:BIT_WIDTH(28), // overflow exception - ux:BIT_WIDTH(27), // underflow exception - zx:BIT_WIDTH(26), // divide by zero exception - xx:BIT_WIDTH(25), // inexact exception - vx_snan:BIT_WIDTH(24), // not a number exception - vx_isi:BIT_WIDTH(23), // exception - vx_idi:BIT_WIDTH(22), // exception - vx_zdz:BIT_WIDTH(21), // exception - vx_imz:BIT_WIDTH(20), // exception - vx_xvc:BIT_WIDTH(19), // exception - fr:BIT_WIDTH(18), // fraction rounded - fi:BIT_WIDTH(17), // fraction inexact - class:BIT_WIDTH(16), // class descriptor - fl:BIT_WIDTH(15), // negative - fg:BIT_WIDTH(14), // positive - fe:BIT_WIDTH(13), // equal or zero - fu:BIT_WIDTH(12), // not a number - rsvd1:BIT_WIDTH(11), // reserved - vx_soft:BIT_WIDTH(10), // software request exception - rsvd2:BIT_WIDTH(9), // reserved - vx_cvi:BIT_WIDTH(8), // invalid integer convert - // exception - ve:BIT_WIDTH(7), // invalid op exception enable - oe:BIT_WIDTH(6), // overflow exception enable - ue:BIT_WIDTH(5), // underflow exception enable - ze:BIT_WIDTH(4), // divide by zero exception - // enable - xe:BIT_WIDTH(3), // inexact exception enable - ni:BIT_WIDTH(2), // non-IEEE exception enable - rn:BITS_WIDTH(1,0); // rounding control -} ppc_fp_scr_t; - -/* - * Values for fp_scr_t.rn (rounding control). 
- */ -typedef enum { - RN_NEAREST = 0, - RN_TOWARD_ZERO = 1, - RN_TOWARD_PLUS = 2, - RN_TOWARD_MINUS = 3 -} ppc_fp_rn_t; - -/* - * ppc_fpf_t -- data types that MAY be in floating point register file - * Actual data types supported is implementation dependent - */ -typedef union { - float f; // 32 bit IEEE single - double d; // 64 bit IEEE double - - /* - * Insure compiler aligns struct appropriately - */ - unsigned x[2] __attribute__(( aligned(8) )); -} ppc_fpf_t; - -/* - * Number of FP registers. - */ -#define PPC_NFP_REGS 32 - -/* - * Read/write FPSCR. - * FIXME - these don't work, you need to go thru a fp register. - */ -typedef union { - double __dbl; - ppc_fp_scr_t __scr; -} __fp_un_t; - -static __inline__ ppc_fp_scr_t -get_fp_scr() -{ - __fp_un_t __fp_un; - - __asm__ volatile ("mffs. %0 /* mffs */" \ - : "=f" (__fp_un.__dbl)); - return (__fp_un.__scr); -} - -static __inline__ void -set_fp_scr(ppc_fp_scr_t fp_scr) -{ - __fp_un_t __fp_un; - - __fp_un.__scr = fp_scr; - __asm__ volatile ("mtfsf 0xff, %0; /* mtfsf */ " \ - : : "f" (__fp_un.__dbl)); -} - -#endif /* ! __ASSEMBLER__ */ - -#endif /* _ARCH_PPC_FP_REGS_H_ */ diff --git a/EXTERNAL_HEADERS/architecture/ppc/macro_help.h b/EXTERNAL_HEADERS/architecture/ppc/macro_help.h deleted file mode 100644 index a149f8eb0..000000000 --- a/EXTERNAL_HEADERS/architecture/ppc/macro_help.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1996 NeXT Software, Inc. - */ -/* - * Mach Operating System - * Copyright (c) 1989 Carnegie-Mellon University - * Copyright (c) 1988 Carnegie-Mellon University - * All rights reserved. The CMU software License Agreement specifies - * the terms and conditions for use and redistribution. - * - * File: architecture/ppc/macro_help.h - * - * Provide help in making lint-free macro routines - * - * HISTORY - * - * 29-Dec-96 Umesh Vaishampayan (umeshv@NeXT.com) - * Created from m98k version. 
- */ - -#ifndef _ARCH_PPC_MACRO_HELP_H_ -#define _ARCH_PPC_MACRO_HELP_H_ - -#ifndef MACRO_BEGIN -# define MACRO_BEGIN do { -#endif /* MACRO_BEGIN */ - -#ifndef MACRO_END -# define MACRO_END } while (0) -#endif /* MACRO_END */ - -#ifndef MACRO_RETURN -# define MACRO_RETURN if (1) return -#endif /* MACRO_RETURN */ - -#endif /* _ARCH_PPC_MACRO_HELP_H_ */ - diff --git a/EXTERNAL_HEADERS/architecture/ppc/pseudo_inst.h b/EXTERNAL_HEADERS/architecture/ppc/pseudo_inst.h deleted file mode 100644 index da4071e6b..000000000 --- a/EXTERNAL_HEADERS/architecture/ppc/pseudo_inst.h +++ /dev/null @@ -1,420 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1996 NeXT Software, Inc. All rights reserved. - * - * File: architecture/ppc/pseudo_inst.h - * Author: Mike DeMoney - * - * This header file defines assembler pseudo-instruction macros for - * for the ppc. - * - * NOTE: This is obviously only useful to include in assembly - * code source. - * - * ALSO NOTE: These macros don't attempt to be 64-bit compatable - * - * HISTORY - * 29-Dec-96 Umesh Vaishampayan (umeshv@NeXT.com) - * Ported from m98k. - * 05-Nov-92 Mike DeMoney (mike@next.com) - * Created. - */ - -#ifndef _ARCH_PPC_PSEUDO_INST_H_ -#define _ARCH_PPC_PSEUDO_INST_H_ - -#include -#include - -#ifdef __ASSEMBLER__ - -/* - * Pseudo instruction definitions - */ - -/* - * Macro package initialization - */ - .set __no_at,0 /* allow at by default */ - -/* - * .at_off -- disable use of at by macros - * .at_on -- enable use of at by macros - */ -.macro .at_off - .set __no_at,1 -.endmacro - -.macro .at_on - .set __no_at,0 -.endmacro - -/* - * li32 rD,IMMED - * - * Load 32-bit immediate into rD - * FIXME: Need a way to undefine built-in macro for this. - */ -.macro li32 // li32 rD,immed -.if $n != 2 - .abort "invalid operands of li32" -.endif -.abs __is_abs,$1 -.if !__is_abs - addis $0,0,hi16($1) - ori $0,$0,lo16($1) -.elseif $1 == 0 - addi $0,0,0 -.elseif ($1 & 0xffff) == 0 - addis $0,0,hi16($1) -.elseif ($1 & 0xffff8000) == 0 - addi $0,0,$1 -.elseif ($1 & 0xffff8000) == 0xffff8000 - addi $0,0,$1 -.else - addis $0,0,hi16($1) - ori $0,$0,lo16($1) -.endif -.endmacro - - -/* - * andi32. rD,rS1,IMMED - * - * Perform "andi." with (possibly) 32-bit immediate - */ -.macro andi32. // andi32. rD,rS1,IMMED -.if $n != 3 - .abort "invalid operands of andi." -.endif - .set __used_at,0 -.abs __is_abs,$2 -.if !__is_abs - .set __used_at,1 - li32 at,$2 - and. $0,$1,at -.elseif ($2 & 0xffff0000) == 0 - andi. $0,$1,$2 -.elseif ($2 & 0xffff) == 0 - andis. $0,$1,hi16($2) -.else - .set __used_at,1 - li32 at,$2 - and. 
$0,$1,at -.endif -.if __no_at & __used_at - .abort "Macro uses at while .no_at in effect" -.endif -.endmacro - -/* - * ori32 rD,rS1,IMMED - * - * Perform "ori" with (possibly) 32-bit immediate - */ -.macro ori32 // ori32 rD,rS1,IMMED -.if $n != 3 - .abort "invalid operands of ori" -.endif -.abs __is_abs,$2 -.if !__is_abs - oris $0,$1,hi16($2) - ori $0,$1,lo16($2) -.elseif ($2 & 0xffff0000) == 0 - ori $0,$1,$2 -.elseif ($2 & 0xffff) == 0 - oris $0,$1,hi16($2) -.else - oris $0,$1,hi16($2) - ori $0,$1,lo16($2) -.endif -.endmacro - -/* - * xori32 rD,rS1,IMMED - * - * Perform "xor" with (possibly) 32-bit immediate - */ -.macro xori32 // xori32 rD,rS1,IMMED -.if $n != 3 - .abort "invalid operands of xori" -.endif -.abs __is_abs,$2 -.if !__is_abs - xoris $0,$1,hi16($2) - xori $0,$1,lo16($2) -.elseif ($2 & 0xffff0000) == 0 - xori $0,$1,$2 -.elseif ($2 & 0xffff) == 0 - xoris $0,$1,hi16($2) -.else - xoris $0,$1,hi16($2) - xori $0,$1,lo16($2) -.endif -.endmacro - - -/* - * MEMREF_INST -- macros to memory referencing instructions - * "capable" of dealing with 32 bit offsets. - * - * NOTE: Because the assembler doesn't have any mechanism for easily - * parsing the d(rS) syntax of register-displacement form instructions, - * these instructions do NOT mirror the normal memory reference - * instructions. The following "transformation" is used: - * lbz rD,d(rS) - * becomes: - * lbz32 rD,rS,d - * I.e.: "32" is appended to the instruction name and the base register - * and displacement become the 2'nd and 3'rd comma-separated operands. - * - * The forms: - * lbz32 rD,d - * and: - * lbz32 rD,rS - * are also recognized and the missing operand is assumed 0. - * - * ALSO NOTE: r0 or zt should never be used as rS in these instructions. - * Use "0" as rS in this case. 
- */ -#define MEMREF_INST(op) \ -.macro op ## 32 @\ -.set __used_at,0 @\ -.if $n == 3 @\ - .greg __is_greg,$1 @\ - .abs __is_abs,$2 @\ - .if __is_abs @\ - .if ($2 & 0xffff8000) == 0 @\ - op $0,$2($1) @\ - .elseif ($2 & 0xffff8000) == 0xffff8000 @\ - op $0,$2($1) @\ - .else @\ - .if !__is_greg @\ - .set __used_at,1 @\ - lis at,ha16($2) @\ - op $0,lo16($2)(at) @\ - .else @\ - .set __used_at,1 @\ - lis at,ha16($2) @\ - add at,at,$1 @\ - op $0,lo16($2)(at) @\ - .endif @\ - .endif @\ - .else @\ - .if !__is_greg @\ - .set __used_at,1 @\ - lis at,ha16($2) @\ - op $0,lo16($2)(at) @\ - .else @\ - .set __used_at,1 @\ - lis at,ha16($2) @\ - add at,at,$1 @\ - op $0,lo16($2)(at) @\ - .endif @\ - .endif @\ -.elseif $n == 2 @\ - .greg __is_greg,$1 @\ - .if !__is_greg @\ - .abs __is_abs,$1 @\ - .if __is_abs @\ - .if ($1 & 0xffff8000) == 0 @\ - op $0,$1(0) @\ - .elseif ($1 & 0xffff8000) == 0xffff8000 @\ - op $0,$1(0) @\ - .else @\ - .set __used_at,1 @\ - lis at,ha16($1) @\ - op $0,lo16($1)(at) @\ - .endif @\ - .else @\ - .set __used_at,1 @\ - lis at,ha16($1) @\ - op $0,lo16($1)(at) @\ - .endif @\ - .else @\ - op $0,0($1) @\ - .endif @\ -.else @\ - .abort "Invalid operands of " #op "32" @\ -.endif @\ -.if __no_at & __used_at @\ - .abort "Macro uses at while .no_at in effect" @\ -.endif @\ -.endmacro - -MEMREF_INST(lbz) -MEMREF_INST(lhz) -MEMREF_INST(lha) -MEMREF_INST(lwz) -MEMREF_INST(lwa) -MEMREF_INST(ld) - -MEMREF_INST(stb) -MEMREF_INST(sth) -MEMREF_INST(stw) -MEMREF_INST(std) - -MEMREF_INST(lmw) -MEMREF_INST(lmd) -MEMREF_INST(stmw) -MEMREF_INST(stmd) - -/* - * ARITH_INST -- define 32-bit immediate forms of arithmetic - * instructions - * - * E.g. 
addi32 rD,rS,IMMED - */ -#define ARITH_INST(op, op3, sf) \ -.macro op ## 32 ## sf @\ -.if $n != 3 @\ - .abort "invalid operands to " #op "32" @\ -.endif @\ -.abs __is_abs,$2 @\ -.if __is_abs @\ - .if ($2 & 0xffff8000) == 0 @\ - op##sf $0,$1,$2 @\ - .elseif ($2 & 0xffff8000) == 0xffff8000 @\ - op##sf $0,$1,$2 @\ - .elseif __no_at @\ - .abort "Macro uses at while .no_at in effect" @\ - .else @\ - li32 at,$2 @\ - op3##sf $0,$1,at @\ - .endif @\ -.elseif __no_at @\ - .abort "Macro uses at while .no_at in effect" @\ -.else @\ - li32 at,$2 @\ - op3##sf $0,$1,at @\ -.endif @\ -.endmacro - -ARITH_INST(addi, add, ) -ARITH_INST(subi, sub, ) -ARITH_INST(addic, addc, ) -ARITH_INST(subic, subc, ) -ARITH_INST(addic, addc, .) -ARITH_INST(subic, subc, .) -ARITH_INST(mulli, mull, ) - -/* - * CMPEX_INST -- define 32-bit immediate forms of extended compare - * instructions - * - * E.g. cmpwi32 cr3,rS,IMMED - * cmpwi32 rS,IMMED - */ -#define CMPEX_INST(op, op3) \ -.macro op ## 32 @\ -.if $n == 3 @\ - .abs __is_abs,$2 @\ - .if __is_abs @\ - .if ($2 & 0xffff8000) == 0 @\ - op $0,$1,$2 @\ - .elseif ($2 & 0xffff8000) == 0xffff8000 @\ - op $0,$1,$2 @\ - .elseif __no_at @\ - .abort "Macro uses at while .no_at in effect" @\ - .else @\ - li32 at,$2 @\ - op3 $0,$1,at @\ - .endif @\ - .elseif __no_at @\ - .abort "Macro uses at while .no_at in effect" @\ - .else @\ - li32 at,$2 @\ - op3 $0,$1,at @\ - .endif @\ -.elseif $n == 2 @\ - .abs __is_abs,$1 @\ - .if __is_abs @\ - .if ($1 & 0xffff8000) == 0 @\ - op $0,$1 @\ - .elseif ($1 & 0xffff8000) == 0xffff8000 @\ - op $0,$1 @\ - .elseif __no_at @\ - .abort "Macro uses at while .no_at in effect" @\ - .else @\ - li32 at,$1 @\ - op3 $0,at @\ - .endif @\ - .elseif __no_at @\ - .abort "Macro uses at while .no_at in effect" @\ - .else @\ - li32 at,$1 @\ - op3 $0,at @\ - .endif @\ -.else @\ - .abort "invalid operands to " #op "32" @\ -.endif @\ -.endmacro - -CMPEX_INST(cmpdi, cmpd) -CMPEX_INST(cmpwi, cmpw) -CMPEX_INST(cmpldi, cmpld) -CMPEX_INST(cmplwi, 
cmplw) - -/* - * CMP_INST -- define 32-bit immediate forms of standard compare - * instructions - * - * E.g. cmpi32 cr3,0,rS,IMMED - */ -#define CMP_INST(op, op3) \ -.macro op ## 32 @\ -.if $n == 4 @\ - .abs __is_abs,$3 @\ - .if __is_abs @\ - .if ($3 & 0xffff8000) == 0 @\ - op $0,$1,$2,$3 @\ - .elseif ($3 & 0xffff8000) == 0xffff8000 @\ - op $0,$1,$2,$3 @\ - .elseif __no_at @\ - .abort "Macro uses at while .no_at in effect" @\ - .else @\ - li32 at,$3 @\ - op3 $0,$1,$2,at @\ - .endif @\ - .elseif __no_at @\ - .abort "Macro uses at while .no_at in effect" @\ - .else @\ - li32 at,$3 @\ - op3 $0,$1,$2,at @\ - .endif @\ -.else @\ - .abort "invalid operands to " #op "32" @\ -.endif @\ -.endmacro - -CMP_INST(cmpi, cmp) -CMP_INST(cmpli, cmpl) - -#endif /* __ASSEMBLER__ */ - -#endif /* _ARCH_PPC_PSEUDO_INST_H_ */ diff --git a/EXTERNAL_HEADERS/architecture/ppc/reg_help.h b/EXTERNAL_HEADERS/architecture/ppc/reg_help.h deleted file mode 100644 index 6a0e2842e..000000000 --- a/EXTERNAL_HEADERS/architecture/ppc/reg_help.h +++ /dev/null @@ -1,230 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1996 NeXT Software, Inc. All rights reserved. - * - * File: architecture/ppc/reg_help.h - * Author: Doug Mitchell, NeXT Computer, Inc. - * - * m98k-specific macros and inlines for defining machine registers. - * - * HISTORY - * 05-Nov-92 Doug Mitchell at NeXT - * Created. - * - * 29-Dec-96 Umesh Vaishampayan (umeshv@NeXT.com) - * Ported from m98k. Removed dependency on nrw directory. - * Merged code from architecture/nrw/reg_help.h. - * Moved Register Usage #defines from asm_help.h in here. - */ - -#ifndef _ARCH_PPC_REG_HELP_H_ -#define _ARCH_PPC_REG_HELP_H_ - -#if defined(__ASSEMBLER__) -/* - * GRF Register Usage Aliases - */ -#define zt r0 // architecturally 0 for mem refs only! - // real reg other inst, caller-saved -#define sp r1 // stack pointer, callee-saved -#define toc r2 // tbl of contents, callee-saved -#define a0 r3 // arg 0, return value 0, caller saved -#define a1 r4 // arg 1, return value 1, caller saved -#define a2 r5 // .... 
-#define a3 r6 -#define a4 r7 -#define a5 r8 -#define a6 r9 -#define a7 r10 // arg 7, return value 7, caller saved -#define ep r11 // environment ptr, caller saved -#define at r12 // assembler temp, caller saved -#define s17 r13 // callee-saved 17 -#define s16 r14 -#define s15 r15 -#define s14 r16 -#define s13 r17 -#define s12 r18 -#define s11 r19 -#define s10 r20 -#define s9 r21 -#define s8 r22 -#define s7 r23 -#define s6 r24 -#define s5 r25 -#define s4 r26 -#define s3 r27 -#define s2 r28 -#define s1 r29 // .... -#define s0 r30 // callee-saved 0 -#define fp r31 // frame-pointer, callee-saved - -/* - * Conversion of GRF aliases to register numbers - */ -#define GRF_ZT 0 // architecturally 0 for mem refs only! - // real reg other inst, caller-saved -#define GRF_SP 1 // stack pointer, callee-saved -#define GRF_TOC 2 // tbl of contents, callee-saved -#define GRF_A0 3 // arg 0, return value 0, caller saved -#define GRF_A1 4 // arg 1, return value 1, caller saved -#define GRF_A2 5 // .... -#define GRF_A3 6 -#define GRF_A4 7 -#define GRF_A5 8 -#define GRF_A6 9 -#define GRF_A7 10 // arg 7, return value 7, caller saved -#define GRF_EP 11 // environment ptr, caller saved -#define GRF_AT 12 // assembler temp, caller saved -#define GRF_S17 13 // callee-saved 17 -#define GRF_S16 14 -#define GRF_S15 15 -#define GRF_S14 16 -#define GRF_S13 17 -#define GRF_S12 18 -#define GRF_S11 19 -#define GRF_S10 20 -#define GRF_S9 21 -#define GRF_S8 22 -#define GRF_S7 23 -#define GRF_S6 24 -#define GRF_S5 25 -#define GRF_S4 26 -#define GRF_S3 27 -#define GRF_S2 28 -#define GRF_S1 29 // .... 
-#define GRF_S0 30 // callee-saved 0 -#define GRF_FP 31 // frame pointer, callee-saved - -/* - * FPF Register names - */ -#define ft0 f0 // scratch reg, caller-saved -#define fa0 f1 // fp arg 0, return 0, caller-saved -#define fa1 f2 // fp arg 1, caller-saved -#define fa2 f3 // fp arg 2, caller-saved -#define fa3 f4 -#define fa4 f5 -#define fa5 f6 -#define fa6 f7 -#define fa7 f8 -#define fa8 f9 -#define fa9 f10 -#define fa10 f11 -#define fa11 f12 -#define fa12 f13 // fp arg 12, caller-saved -#define fs17 f14 // callee-saved 17 -#define fs16 f15 -#define fs15 f16 -#define fs14 f17 -#define fs13 f18 -#define fs12 f19 -#define fs11 f20 -#define fs10 f21 -#define fs9 f22 -#define fs8 f23 -#define fs7 f24 -#define fs6 f25 -#define fs5 f26 -#define fs4 f27 -#define fs3 f28 -#define fs2 f29 -#define fs1 f30 -#define fs0 f31 // callee-saved 0 - -/* - * Conversion of FPF aliases to register numbers - */ -#define FPF_FT0 0 // scratch reg, caller-saved -#define FPF_FA0 1 // fp arg 0, return 0, caller-saved -#define FPF_FA1 2 // fp arg 1, caller-saved -#define FPF_FA2 3 // fp arg 2, caller-saved -#define FPF_FA3 4 -#define FPF_FA4 5 -#define FPF_FA5 6 -#define FPF_FA6 7 -#define FPF_FA7 8 -#define FPF_FA8 9 -#define FPF_FA9 10 -#define FPF_FA10 11 -#define FPF_FA11 12 -#define FPF_FA12 13 // fp arg 12, caller-saved -#define FPF_FS17 14 // callee-saved 17 -#define FPF_FS16 15 -#define FPF_FS15 16 -#define FPF_FS14 17 -#define FPF_FS13 18 -#define FPF_FS12 19 -#define FPF_FS11 20 -#define FPF_FS10 21 -#define FPF_FS9 22 -#define FPF_FS8 23 -#define FPF_FS7 24 -#define FPF_FS6 25 -#define FPF_FS5 26 -#define FPF_FS4 27 -#define FPF_FS3 28 -#define FPF_FS2 29 -#define FPF_FS1 30 -#define FPF_FS0 31 // callee-saved 0 - -#endif /* __ASSEMBLER__ */ - - -/* Bitfield definition aid */ -#define BITS_WIDTH(msb, lsb) ((msb)-(lsb)+1) -#define BIT_WIDTH(pos) (1) /* mostly to record the position */ - -/* Mask creation */ -#define MKMASK(width, offset) (((unsigned)-1)>>(32-(width))<<(offset)) 
-#define BITSMASK(msb, lsb) MKMASK(BITS_WIDTH(msb, lsb), lsb & 0x1f) -#define BITMASK(pos) MKMASK(BIT_WIDTH(pos), pos & 0x1f) - -/* Register addresses */ -#if __ASSEMBLER__ -# define REG_ADDR(type, addr) (addr) -#else /* ! __ASSEMBLER__ */ -# define REG_ADDR(type, addr) (*(volatile type *)(addr)) -#endif /* __ASSEMBLER__ */ - -/* Cast a register to be an unsigned */ -/* CAUTION : non naturally aligned foo can result into alignment traps - * use at own risk. - */ -#define CONTENTS(foo) (*(unsigned *) &(foo)) - -/* STRINGIFY -- perform all possible substitutions, then stringify */ -#define __STR(x) #x /* just a helper macro */ -#define STRINGIFY(x) __STR(x) - -/* - * Stack pointer must always be a multiple of 16 - */ -#define STACK_INCR 16 -#define ROUND_FRAME(x) ((((unsigned)(x)) + STACK_INCR - 1) & ~(STACK_INCR-1)) - -#endif /* _ARCH_PPC_REG_HELP_H_ */ diff --git a/EXTERNAL_HEADERS/mach-o/arm/reloc.h b/EXTERNAL_HEADERS/mach-o/arm/reloc.h deleted file mode 100644 index e2da8b80c..000000000 --- a/EXTERNAL_HEADERS/mach-o/arm/reloc.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this - * file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/* - * Relocation types used in the arm implementation. Relocation entries for - * things other than instructions use the same generic relocation as discribed - * in and their r_type is ARM_RELOC_VANILLA, one of the - * *_SECTDIFF or the *_PB_LA_PTR types. The rest of the relocation types are - * for instructions. Since they are for instructions the r_address field - * indicates the 32 bit instruction that the relocation is to be preformed on. - */ -enum reloc_type_arm -{ - ARM_RELOC_VANILLA, /* generic relocation as discribed above */ - ARM_RELOC_PAIR, /* the second relocation entry of a pair */ - ARM_RELOC_SECTDIFF, /* a PAIR follows with subtract symbol value */ - ARM_RELOC_LOCAL_SECTDIFF, /* like ARM_RELOC_SECTDIFF, but the symbol - referenced was local. */ - ARM_RELOC_PB_LA_PTR,/* prebound lazy pointer */ - ARM_RELOC_BR24, /* 24 bit branch displacement (to a word address) */ - ARM_THUMB_RELOC_BR22, /* 22 bit branch displacement (to a half-word - address) */ -}; diff --git a/EXTERNAL_HEADERS/mach-o/loader.h b/EXTERNAL_HEADERS/mach-o/loader.h index b00ac7a67..9fecf2b4a 100644 --- a/EXTERNAL_HEADERS/mach-o/loader.h +++ b/EXTERNAL_HEADERS/mach-o/loader.h @@ -197,6 +197,12 @@ struct mach_header_64 { load the main executable at a random address. Only used in MH_EXECUTE filetypes. */ +#define MH_NO_HEAP_EXECUTION 0x1000000 /* When this bit is set, the OS will + run the main executable with + a non-executable heap even on + platforms (e.g. i386) that don't + require it. Only used in MH_EXECUTE + filetypes. */ /* * The load commands directly follow the mach_header. 
The total size of all diff --git a/EXTERNAL_HEADERS/mach-o/ppc/reloc.h b/EXTERNAL_HEADERS/mach-o/ppc/reloc.h deleted file mode 100644 index 7b564cc0a..000000000 --- a/EXTERNAL_HEADERS/mach-o/ppc/reloc.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this - * file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/* - * Relocation types used in the ppc implementation. Relocation entries for - * things other than instructions use the same generic relocation as discribed - * above and their r_type is RELOC_VANILLA. The rest of the relocation types - * are for instructions. Since they are for instructions the r_address field - * indicates the 32 bit instruction that the relocation is to be preformed on. - * The fields r_pcrel and r_length are ignored for non-RELOC_VANILLA r_types - * except for PPC_RELOC_BR14. - * - * For PPC_RELOC_BR14 if the r_length is the unused value 3, then the branch was - * statically predicted setting or clearing the Y-bit based on the sign of the - * displacement or the opcode. 
If this is the case the static linker must flip - * the value of the Y-bit if the sign of the displacement changes for non-branch - * always conditions. - */ -enum reloc_type_ppc -{ - PPC_RELOC_VANILLA, /* generic relocation as discribed above */ - PPC_RELOC_PAIR, /* the second relocation entry of a pair */ - PPC_RELOC_BR14, /* 14 bit branch displacement (to a word address) */ - PPC_RELOC_BR24, /* 24 bit branch displacement (to a word address) */ - PPC_RELOC_HI16, /* a PAIR follows with the low half */ - PPC_RELOC_LO16, /* a PAIR follows with the high half */ - PPC_RELOC_HA16, /* Same as the RELOC_HI16 except the low 16 bits and the - * high 16 bits are added together with the low 16 bits - * sign extened first. This means if bit 15 of the low - * 16 bits is set the high 16 bits stored in the - * instruction will be adjusted. - */ - PPC_RELOC_LO14, /* Same as the LO16 except that the low 2 bits are not - * stored in the instruction and are always zero. This - * is used in double word load/store instructions. - */ - PPC_RELOC_SECTDIFF, /* a PAIR follows with subtract symbol value */ - PPC_RELOC_PB_LA_PTR,/* prebound lazy pointer */ - PPC_RELOC_HI16_SECTDIFF, /* section difference forms of above. a PAIR */ - PPC_RELOC_LO16_SECTDIFF, /* follows these with subtract symbol value */ - PPC_RELOC_HA16_SECTDIFF, - PPC_RELOC_JBSR, - PPC_RELOC_LO14_SECTDIFF, - PPC_RELOC_LOCAL_SECTDIFF /* like PPC_RELOC_SECTDIFF, but the symbol - referenced was local. */ -}; diff --git a/EXTERNAL_HEADERS/stdarg.h b/EXTERNAL_HEADERS/stdarg.h index f178505e8..bbbaff93e 100644 --- a/EXTERNAL_HEADERS/stdarg.h +++ b/EXTERNAL_HEADERS/stdarg.h @@ -1,133 +1,47 @@ -/* Copyright (C) 1989, 1997, 1998, 1999, 2000 Free Software Foundation, Inc. - -This file is part of GCC. - -GCC is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2, or (at your option) -any later version. 
- -GCC is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with GCC; see the file COPYING. If not, write to -the Free Software Foundation, 59 Temple Place - Suite 330, -Boston, MA 02111-1307, USA. */ - -/* As a special exception, if you include this header file into source - files compiled by GCC, this header file does not by itself cause - the resulting executable to be covered by the GNU General Public - License. This exception does not however invalidate any other - reasons why the executable file might be covered by the GNU General - Public License. */ - -/* - * ISO C Standard: 7.15 Variable arguments +/*===---- stdarg.h - Variable argument handling ----------------------------=== + * + * Copyright (c) 2008 Eli Friedman + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== */ -#ifndef _STDARG_H -#ifndef _ANSI_STDARG_H_ -#ifndef __need___va_list -#define _STDARG_H -#define _ANSI_STDARG_H_ -#endif /* not __need___va_list */ -#undef __need___va_list - -/* Define __gnuc_va_list. */ +#ifndef __STDARG_H +#define __STDARG_H -#ifndef __GNUC_VA_LIST -#define __GNUC_VA_LIST -typedef __builtin_va_list __gnuc_va_list; -#endif +typedef __builtin_va_list va_list; +#define va_start(ap, param) __builtin_va_start(ap, param) +#define va_end(ap) __builtin_va_end(ap) +#define va_arg(ap, type) __builtin_va_arg(ap, type) -/* Define the standard macros for the user, - if this invocation was from the user program. */ -#ifdef _STDARG_H - -#define va_start(v,l) __builtin_va_start(v,l) -#define va_end(v) __builtin_va_end(v) -#define va_arg(v,l) __builtin_va_arg(v,l) -#if !defined(__STRICT_ANSI__) || __STDC_VERSION__ + 0 >= 199900L -#define va_copy(d,s) __builtin_va_copy(d,s) -#endif -#define __va_copy(d,s) __builtin_va_copy(d,s) - -/* Define va_list, if desired, from __gnuc_va_list. */ -/* We deliberately do not define va_list when called from - stdio.h, because ANSI C says that stdio.h is not supposed to define - va_list. stdio.h needs to have access to that data type, - but must not use that name. It should use the name __gnuc_va_list, - which is safe because it is reserved for the implementation. */ - -#ifdef _HIDDEN_VA_LIST /* On OSF1, this means varargs.h is "half-loaded". */ -#undef _VA_LIST -#endif - -#ifdef _BSD_VA_LIST -#undef _BSD_VA_LIST -#endif +/* GCC always defines __va_copy, but does not define va_copy unless in c99 mode + * or -ansi is not specified, since it was not part of C90. 
+ */ +#define __va_copy(d,s) __builtin_va_copy(d,s) -#if defined(__svr4__) || (defined(_SCO_DS) && !defined(__VA_LIST)) -/* SVR4.2 uses _VA_LIST for an internal alias for va_list, - so we must avoid testing it and setting it here. - SVR4 uses _VA_LIST as a flag in stdarg.h, but we should - have no conflict with that. */ -#ifndef _VA_LIST_ -#define _VA_LIST_ -#ifdef __i860__ -#ifndef _VA_LIST -#define _VA_LIST va_list -#endif -#endif /* __i860__ */ -typedef __gnuc_va_list va_list; -#ifdef _SCO_DS -#define __VA_LIST +#if __STDC_VERSION__ >= 199900L || !defined(__STRICT_ANSI__) +#define va_copy(dest, src) __builtin_va_copy(dest, src) #endif -#endif /* _VA_LIST_ */ -#else /* not __svr4__ || _SCO_DS */ -/* The macro _VA_LIST_ is the same thing used by this file in Ultrix. - But on BSD NET2 we must not test or define or undef it. - (Note that the comments in NET 2's ansi.h - are incorrect for _VA_LIST_--see stdio.h!) */ -#if !defined (_VA_LIST_) || defined (__BSD_NET2__) || defined (____386BSD____) || defined (__bsdi__) || defined (__sequent__) || defined (__FreeBSD__) || defined(WINNT) -/* The macro _VA_LIST_DEFINED is used in Windows NT 3.5 */ -#ifndef _VA_LIST_DEFINED -/* The macro _VA_LIST is used in SCO Unix 3.2. */ -#ifndef _VA_LIST -/* The macro _VA_LIST_T_H is used in the Bull dpx2 */ -#ifndef _VA_LIST_T_H -/* The macro __va_list__ is used by BeOS. 
*/ -#ifndef __va_list__ -typedef __gnuc_va_list va_list; -#endif /* not __va_list__ */ -#endif /* not _VA_LIST_T_H */ -#endif /* not _VA_LIST */ -#endif /* not _VA_LIST_DEFINED */ -#if !(defined (__BSD_NET2__) || defined (____386BSD____) || defined (__bsdi__) || defined (__sequent__) || defined (__FreeBSD__)) -#define _VA_LIST_ -#endif -#ifndef _VA_LIST -#define _VA_LIST -#endif -#ifndef _VA_LIST_DEFINED -#define _VA_LIST_DEFINED -#endif -#ifndef _VA_LIST_T_H -#define _VA_LIST_T_H -#endif -#ifndef __va_list__ -#define __va_list__ -#endif - -#endif /* not _VA_LIST_, except on certain systems */ - -#endif /* not __svr4__ */ - -#endif /* _STDARG_H */ +/* Hack required to make standard headers work, at least on Ubuntu */ +#define __GNUC_VA_LIST 1 +typedef __builtin_va_list __gnuc_va_list; -#endif /* not _ANSI_STDARG_H_ */ -#endif /* not _STDARG_H */ +#endif /* __STDARG_H */ diff --git a/Makefile b/Makefile index 57c8a4c88..acd493419 100644 --- a/Makefile +++ b/Makefile @@ -32,40 +32,24 @@ ALL_SUBDIRS = \ libsa \ security -CONFIG_SUBDIRS_PPC = config - CONFIG_SUBDIRS_I386 = config - CONFIG_SUBDIRS_X86_64 = config - CONFIG_SUBDIRS_ARM = config INSTINC_SUBDIRS = $(ALL_SUBDIRS) EXTERNAL_HEADERS - -INSTINC_SUBDIRS_PPC = $(INSTINC_SUBDIRS) EXTERNAL_HEADERS - -INSTINC_SUBDIRS_I386 = $(INSTINC_SUBDIRS) EXTERNAL_HEADERS - -INSTINC_SUBDIRS_X86_64 = $(INSTINC_SUBDIRS) EXTERNAL_HEADERS - -INSTINC_SUBDIRS_ARM = $(INSTINC_SUBDIRS) EXTERNAL_HEADERS +INSTINC_SUBDIRS_I386 = $(INSTINC_SUBDIRS) +INSTINC_SUBDIRS_X86_64 = $(INSTINC_SUBDIRS) +INSTINC_SUBDIRS_ARM = $(INSTINC_SUBDIRS) EXPINC_SUBDIRS = $(ALL_SUBDIRS) - -EXPINC_SUBDIRS_PPC = $(EXPINC_SUBDIRS) - EXPINC_SUBDIRS_I386 = $(EXPINC_SUBDIRS) - EXPINC_SUBDIRS_X86_64 = $(EXPINC_SUBDIRS) - EXPINC_SUBDIRS_ARM = $(EXPINC_SUBDIRS) -COMP_SUBDIRS_PPC = $(ALL_SUBDIRS) +SETUP_SUBDIRS = SETUP COMP_SUBDIRS_I386 = $(ALL_SUBDIRS) - COMP_SUBDIRS_X86_64 = $(ALL_SUBDIRS) - COMP_SUBDIRS_ARM = $(ALL_SUBDIRS) INST_SUBDIRS = \ @@ -77,14 +61,18 @@ 
INST_SUBDIRS = \ config \ security -INSTALL_FILE_LIST= \ - mach_kernel +INSTALL_KERNEL_FILE = mach_kernel + +INSTALL_KERNEL_DIR = / -INSTALL_FILE_DIR= \ - / INSTMAN_SUBDIRS = \ bsd include $(MakeInc_rule) include $(MakeInc_dir) + +# This target is defined to compile and run xnu_quick_test under testbots +testbots: + /usr/bin/make MORECFLAGS="-D RUN_UNDER_TESTBOTS=1" testbots -C ./tools/tests/xnu_quick_test/ + diff --git a/README b/README index 2040c2cee..b9e102527 100644 --- a/README +++ b/README @@ -15,32 +15,17 @@ A. How to build XNU: By default, architecture defaults to the build machine architecture, and the kernel configuration is set to build for DEVELOPMENT. - The machine configuration defaults to S5L8900X for arm and default for i386 and ppc. This will also create a bootable image, mach_kernel, and a kernel binary with symbols, mach_kernel.sys. - - Examples: - /* make a debug kernel for H1 arm board */ - make TARGET_CONFIGS="debug arm s5l8900x" SDKROOT=/path/to/SDK - - $(OBJROOT)/DEBUG_ARM_S5L8900X/osfmk/DEBUG/osfmk.o: pre-linked object for osfmk component - $(OBJROOT)/DEBUG_ARM_S5L8900X/mach_kernel: bootable image - /* make debug and development kernels for H1 arm board */ - make TARGET_CONFIGS="debug arm s5l8900x development arm s5l8900x" SDKROOT=/path/to/SDK - - $(OBJROOT)/DEBUG_ARM_S5L8900X/osfmk/DEBUG/osfmk.o: pre-linked object for osfmk component - $(OBJROOT)/DEBUG_ARM_S5L8900X/mach_kernel: bootable image - $(OBJROOT)/DEVELOPMENT_ARM_S5L8900X/osfmk/DEVELOPMENT/osfmk.o: pre-linked object for osfmk component - $(OBJROOT)/DEVELOPMENT_ARM_S5L8900X/mach_kernel: bootable image - /* this is all you need to do to build H1 arm with DEVELOPMENT kernel configuration */ - make TARGET_CONFIGS="default arm default" SDKROOT=/path/to/SDK + /* this is all you need to do to build with RELEASE kernel configuration */ + make TARGET_CONFIGS="release x86_64 default" SDKROOT=/path/to/SDK or the following is equivalent (ommitted SDKROOT will use /) - make ARCH_CONFIGS=ARM + 
make ARCH_CONFIGS=X86_64 2) Building a Component @@ -64,7 +49,7 @@ A. How to build XNU: and KERNEL_CONFIGS). Example: - $(OBJROOT)/RELEASE_PPC/osfmk/RELEASE/osfmk.o: pre-linked object for osfmk component + $(OBJROOT)/RELEASE_X86_64/osfmk/RELEASE/osfmk.filelist: list of objects in osfmk component From the component top directory: @@ -81,36 +66,36 @@ A. How to build XNU: Define kernel configuration to DEBUG in your environment or when running a make command. Then, apply procedures 4, 5 - $ make TARGET_CONFIGS="DEBUG PPC DEFAULT" all + $ make TARGET_CONFIGS="DEBUG X86_64 DEFAULT" all or - $ make KERNEL_CONFIGS=DEBUG all + $ make KERNEL_CONFIGS=DEBUG ARCH_CONFIGS=X86_64 all or - $ export TARGET_CONFIGS="DEBUG ARM MX31ADS" + $ export TARGET_CONFIGS="DEBUG X86_64 DEFAULT" $ export SDKROOT=/path/to/SDK $ make all Example: - $(OBJROOT)/DEBUG_PPC/osfmk/DEBUG/osfmk.o: pre-linked object for osfmk component - $(OBJROOT)/DEBUG_PPC/mach_kernel: bootable image + $(OBJROOT)/DEBUG_X86_64/osfmk/DEBUG/osfmk.filelist: list of objects in osfmk component + $(OBJROOT)/DEBUG_X86_64/mach_kernel: bootable image 4) Building fat Define architectures in your environment or when running a make command. Apply procedures 3, 4, 5 - $ make TARGET_CONFIGS="RELEASE PPC default RELEASE I386 default" exporthdrs all + $ make TARGET_CONFIGS="RELEASE I386 DEFAULT RELEASE X86_64 DEFAULT" exporthdrs all or - $ make ARCH_CONFIGS="PPC I386" exporthdrs all + $ make ARCH_CONFIGS="I386 X86_64" exporthdrs all or - $ export ARCH_CONFIGS="PPC I386" + $ export ARCH_CONFIGS="I386 X86_64" $ make exporthdrs all 5) Verbose make @@ -127,16 +112,28 @@ A. How to build XNU: From the top directory, run: - $ ~rc/bin/buildit . -arch ppc -arch i386 -noinstallsrc -nosum - - or for multiple arm builds - - $ ~rc/bin/buildit . -noinstallsrc -nosum -- TARGET_CONFIGS="release arm MX31ADS release arm LN2410SBC" - - or for default arm build (kernel config DEVELOPMENT and machine config MX31ADS) - - $ ~rc/bin/buildit . 
-arch arm -noinstallsrc -nosum -- TARGET_CONFIGS="release arm MX31ADS release arm LN2410SBC" + $ ~rc/bin/buildit . -arch i386 -arch x86_64 -arch armv7 -arch ppc -noinstallsrc -nosum + + xnu supports a number of XBS build aliases, which allow B&I to build + the same source submission multiple times in different ways, to + produce different results. Each build alias supports the standard + "clean", "install", "installsrc", "installhdrs" targets, but + conditionalizes its behavior on the RC_ProjectName make variable + which is passed as the -project argument to ~rc/bin/buildit, which + can be one of: + + -project xnu # the default, builds /mach_kernel, kernel-space + # headers, user-space headers, man pages, + # symbol-set kexts + + -project xnu_debug # a DEBUG kernel in /AppleInternal with dSYM + + -project libkxld # user-space version of kernel linker + + -project Libsyscall # automatically generate BSD syscall stubs + + 8) Creating tags and cscope @@ -157,6 +154,8 @@ A. How to build XNU: $ make -w # trace recursive make invocations. Useful in combination with VERBOSE=YES + $ make BUILD_LTO=1 # built with LLVM Link Time Optimization (experimental) + ============================================= B. 
How to install a new header file from XNU diff --git a/osfmk/profiling/ppc/Makefile b/SETUP/Makefile similarity index 66% rename from osfmk/profiling/ppc/Makefile rename to SETUP/Makefile index ebea6420f..7a0e5c5b4 100644 --- a/osfmk/profiling/ppc/Makefile +++ b/SETUP/Makefile @@ -7,19 +7,10 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir include $(MakeInc_cmd) include $(MakeInc_def) - -DATAFILES = \ - profile-md.h - -INSTALL_MD_LIST = ${DATAFILES} - -INSTALL_MD_DIR = profile/ppc - -EXPORT_MD_LIST = ${DATAFILES} - -EXPORT_MD_DIR = profile/ppc +SETUP_SUBDIRS = \ + config \ + kextsymboltool \ + setsegname include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/SETUP/config/Makefile b/SETUP/config/Makefile new file mode 100644 index 000000000..8889afef3 --- /dev/null +++ b/SETUP/config/Makefile @@ -0,0 +1,42 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + + +include $(MakeInc_cmd) +include $(MakeInc_def) + +OBJS = externs.o main.o mkglue.o mkheaders.o mkioconf.o mkmakefile.o \ + mkswapconf.o openp.o searchp.o lexer.yy.o parser.o + +CFLAGS = -isysroot $(HOST_SDKROOT) -g -O0 -I$(SOURCE) -I. 
+ +WARNFLAGS = -Wall + +LDFLAGS = -isysroot $(HOST_SDKROOT) + +config: $(OBJS) + $(_v)$(HOST_CC) $(LDFLAGS) -o $@ $^ + @echo HOST_LD $@ + $(_v)$(HOST_CODESIGN) -s - $@ + @echo HOST_CODESIGN $@ + +.c.o: + $(_v)$(HOST_CC) $(WARNFLAGS) $(CFLAGS) -c -o $@ $< + @echo HOST_CC $@ + +parser.c: parser.y + $(_v)$(HOST_BISON) -y -d -d -o $@ $< + @echo HOST_BISON $@ + +lexer.yy.c: lexer.l + $(_v)$(HOST_FLEX) --header-file=lexer.yy.h -o $@ $< + @echo HOST_FLEX $@ + +main.o mkglue.o mkheaders.o mkioconf.o mkmakefile.o lexer.yy.c: parser.c + +do_build_setup: config + +include $(MakeInc_rule) +include $(MakeInc_dir) diff --git a/SETUP/config/config.h b/SETUP/config/config.h new file mode 100644 index 000000000..54219e1db --- /dev/null +++ b/SETUP/config/config.h @@ -0,0 +1,293 @@ +/* + * Copyright (c) 1999-2009 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * "Portions Copyright (c) 1999 Apple Computer, Inc. All Rights + * Reserved. This file contains Original Code and/or Modifications of + * Original Code as defined in and that are subject to the Apple Public + * Source License Version 1.0 (the 'License'). You may not use this file + * except in compliance with the License. Please obtain a copy of the + * License at http://www.apple.com/publicsource and read it before using + * this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License." 
+ * + * @APPLE_LICENSE_HEADER_END@ + */ +/* + * Mach Operating System + * Copyright (c) 1990 Carnegie-Mellon University + * Copyright (c) 1989 Carnegie-Mellon University + * Copyright (c) 1988 Carnegie-Mellon University + * Copyright (c) 1987 Carnegie-Mellon University + * All rights reserved. The CMU software License Agreement specifies + * the terms and conditions for use and redistribution. + */ + +/* + * Copyright (c) 1980 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms are permitted + * provided that the above copyright notice and this paragraph are + * duplicated in all such forms and that any documentation, + * advertising materials, and other materials related to such + * distribution and use acknowledge that the software was developed + * by the University of California, Berkeley. The name of the + * University may not be used to endorse or promote products derived + * from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + * + * @(#)config.h 5.8 (Berkeley) 6/18/88 + */ + +/* + * Config. + */ + +#include +#include +#include +#include +#include + +struct file_list { + struct file_list *f_next; + char *f_fn; /* the name */ + u_char f_type; /* see below */ + u_char f_flags; /* see below */ + short f_special; /* requires special make rule */ + char *f_needs; + char *f_extra; /* stuff to add to make line */ + /* + * Random values: + * swap space parameters for swap areas + * root device, etc. 
for system specifications + */ + union { + struct { /* when swap specification */ + dev_t fuw_swapdev; + int fuw_swapsize; + } fuw; + struct { /* when system specification */ + dev_t fus_rootdev; + dev_t fus_argdev; + dev_t fus_dumpdev; + } fus; + } fun; +#define f_swapdev fun.fuw.fuw_swapdev +#define f_swapsize fun.fuw.fuw_swapsize +#define f_rootdev fun.fus.fus_rootdev +#define f_argdev fun.fus.fus_argdev +#define f_dumpdev fun.fus.fus_dumpdev +}; + +/* + * Types. + */ +#define DRIVER 1 +#define NORMAL 2 +#define INVISIBLE 3 +#define PROFILING 4 +#define SYSTEMSPEC 5 +#define SWAPSPEC 6 + +/* + * Attributes (flags). + */ +#define CONFIGDEP 0x01 /* obsolete? */ +#define OPTIONSDEF 0x02 /* options definition entry */ +#define ORDERED 0x04 /* don't list in OBJ's, keep "files" order */ +#define SEDIT 0x08 /* run sed filter (SQT) */ + +/* + * Maximum number of fields for variable device fields (SQT). + */ +#define NFIELDS 10 + +struct idlst { + char *id; + struct idlst *id_next; + int id_vec; /* Sun interrupt vector number */ +}; + +struct device { + int d_type; /* CONTROLLER, DEVICE, bus adaptor */ + struct device *d_conn; /* what it is connected to */ + const char *d_name; /* name of device (e.g. rk11) */ + struct idlst *d_vec; /* interrupt vectors */ + int d_pri; /* interrupt priority */ + int d_addr; /* address of csr */ + int d_unit; /* unit number */ + int d_drive; /* drive number */ + int d_slave; /* slave number */ +#define QUES -1 /* -1 means '?' 
*/ +#define UNKNOWN -2 /* -2 means not set yet */ + int d_dk; /* if init 1 set to number for iostat */ + int d_flags; /* flags for device init */ + struct device *d_next; /* Next one in list */ + u_short d_mach; /* Sun - machine type (0 = all)*/ + u_short d_bus; /* Sun - bus type (0 = unknown) */ + u_long d_fields[NFIELDS]; /* field values (SQT) */ + int d_bin; /* interrupt bin (SQT) */ + int d_addrmod; /* address modifier (MIPS) */ + char *d_init; /* pseudo device init routine name */ +}; +#define TO_NEXUS (struct device *)-1 +#define TO_SLOT (struct device *)-1 + +struct config { + char *c_dev; + char *s_sysname; +}; + +/* + * Config has a global notion of which machine type is + * being used. It uses the name of the machine in choosing + * files and directories. Thus if the name of the machine is ``vax'', + * it will build from ``Makefile.vax'' and use ``../vax/inline'' + * in the makerules, etc. + */ +extern int machine; +extern const char *machinename; +#define MACHINE_VAX 1 +#define MACHINE_SUN 2 +#define MACHINE_ROMP 3 +#define MACHINE_SUN2 4 +#define MACHINE_SUN3 5 +#define MACHINE_MMAX 6 +#define MACHINE_SQT 7 +#define MACHINE_SUN4 8 +#define MACHINE_I386 9 +#define MACHINE_IX 10 +#define MACHINE_MIPSY 11 +#define MACHINE_MIPS 12 +#define MACHINE_I860 13 +#define MACHINE_M68K 14 +#define MACHINE_M88K 15 +#define MACHINE_M98K 16 +#define MACHINE_HPPA 17 +#define MACHINE_SPARC 18 +#define MACHINE_PPC 19 +#define MACHINE_ARM 20 +#define MACHINE_X86_64 21 + +/* + * For each machine, a set of CPU's may be specified as supported. + * These and the options (below) are put in the C flags in the makefile. + */ +struct cputype { + char *cpu_name; + struct cputype *cpu_next; +}; + +extern struct cputype *cputype; + +/* + * In order to configure and build outside the kernel source tree, + * we may wish to specify where the source tree lives. 
+ */ +extern const char *source_directory; +extern const char *object_directory; +extern char *config_directory; + +FILE *fopenp(const char *fpath, char *file, char *complete, const char *ftype); +const char *get_VPATH(void); +#define VPATH get_VPATH() + +/* + * A set of options may also be specified which are like CPU types, + * but which may also specify values for the options. + * A separate set of options may be defined for make-style options. + */ +struct opt { + char *op_name; + char *op_value; + struct opt *op_next; +}; + +extern struct opt *opt, *mkopt, *opt_tail, *mkopt_tail; + +extern char *ident; +const char *get_word(FILE *fp); +char *ns(const char *str); +char *qu(int num); +char *path(const char *file); + +extern int do_trace; + +#if MACHINE_VAX +extern int seen_mba, seen_uba; +#endif + +extern int seen_vme, seen_mbii; + +extern struct device *dtab; +dev_t nametodev(char *name, int defunit, char defpartition); +char *devtoname(dev_t dev); + +extern char errbuf[80]; +extern int yyline; + +extern struct file_list *ftab, *conf_list, **confp; +extern char *build_directory; + +extern int profiling; + +extern int maxusers; + +#define eq(a,b) (!strcmp(a,b)) + +#ifdef mips +#define DEV_MASK 0xf +#define DEV_SHIFT 4 +#else mips +#define DEV_MASK 0x7 +#define DEV_SHIFT 3 +#endif mips + +/* External function references */ +char *get_rest(FILE *fp); + +int yyparse(void); +void yyerror(const char *s); + +void vax_ioconf(void); +void sun_ioconf(void); +void romp_ioconf(void); +void mmax_ioconf(void); +void sqt_ioconf(void); +void i386_ioconf(void); +void mips_ioconf(void); +void m68k_ioconf(void); +void m88k_ioconf(void); +void m98k_ioconf(void); +void hppa_ioconf(void); +void sparc_ioconf(void); +void ppc_ioconf(void); +void arm_ioconf(void); +void x86_64_ioconf(void); + +void swapconf(void); + +void ubglue(void); +void mbglue(void); + +void makefile(void); +void headers(void); +int opteq(const char *cp, const char *dp); + +void init_dev(struct device *dp); +void 
newdev(struct device *dp); +void dev_param(struct device *dp, const char *str, long num); + +int searchp(const char *spath, char *file, char *fullname, int (*func)(char *)); diff --git a/bsd/conf/tools/doconf/doconf.csh b/SETUP/config/doconf similarity index 94% rename from bsd/conf/tools/doconf/doconf.csh rename to SETUP/config/doconf index 6fedb4786..2d4e952e9 100755 --- a/bsd/conf/tools/doconf/doconf.csh +++ b/SETUP/config/doconf @@ -69,17 +69,14 @@ set prog=$0 set prog=$prog:t set nonomatch set OBJDIR=../BUILD -if ("`/usr/bin/uname`" == "Rhapsody" ) then -set CONFIG_DIR=/usr/local/bin -else -set CONFIG_DIR=/usr/bin -endif +set CONFIG_DIR=$OBJROOT/SETUP/config unset domake unset doconfig unset beverbose unset MACHINE unset profile +unset SOC_CONFIG while ($#argv >= 1) if ("$argv[1]" =~ -*) then @@ -100,6 +97,14 @@ while ($#argv >= 1) set MACHINE="$argv[2]" shift breaksw + case "-soc": + if ($#argv < 2) then + echo "${prog}: missing argument to ${argv[1]}" + exit 1 + endif + set SOC_CONFIG="$argv[2]" + shift + breaksw case "-d": if ($#argv < 2) then echo "${prog}: missing argument to ${argv[1]}" @@ -168,11 +173,15 @@ set FEATURES_H=(cs_*.h mach_*.h net_*.h\ set MASTER_DIR=../conf set MASTER = ${MASTER_DIR}/MASTER set MASTER_CPU=${MASTER}.${cpu} +set MASTER_CPU_PER_SOC=${MASTER}.${cpu}.${SOC_CONFIG} +if (-f $MASTER_CPU_PER_SOC) set MASTER_CPU = ${MASTER_CPU_PER_SOC} set MASTER_LOCAL = ${MASTER}.local set MASTER_CPU_LOCAL = ${MASTER_CPU}.local +set MASTER_CPU_PER_SOC_LOCAL = ${MASTER_CPU_PER_SOC}.local if (! -f $MASTER_LOCAL) set MASTER_LOCAL = "" if (! -f $MASTER_CPU_LOCAL) set MASTER_CPU_LOCAL = "" +if (-f $MASTER_CPU_PER_SOC_LOCAL) set MASTER_CPU_LOCAL = ${MASTER_CPU_PER_SOC_LOCAL} if (! -d $OBJDIR) then if ($?beverbose) then diff --git a/SETUP/config/externs.c b/SETUP/config/externs.c new file mode 100644 index 000000000..d1bdd8942 --- /dev/null +++ b/SETUP/config/externs.c @@ -0,0 +1,82 @@ +/* + * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. 
+ * + * @APPLE_LICENSE_HEADER_START@ + * + * "Portions Copyright (c) 1999 Apple Computer, Inc. All Rights + * Reserved. This file contains Original Code and/or Modifications of + * Original Code as defined in and that are subject to the Apple Public + * Source License Version 1.0 (the 'License'). You may not use this file + * except in compliance with the License. Please obtain a copy of the + * License at http://www.apple.com/publicsource and read it before using + * this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License." + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* Copyright (c) Apple Computer, Inc. All rights reserved. */ + +#include + + +/* + * Config has a global notion of which machine type is + * being used. It uses the name of the machine in choosing + * files and directories. Thus if the name of the machine is ``vax'', + * it will build from ``Makefile.vax'' and use ``../vax/inline'' + * in the makerules, etc. + */ +int machine; +const char *machinename; + +/* + * For each machine, a set of CPU's may be specified as supported. + * These and the options (below) are put in the C flags in the makefile. + */ + +struct cputype *cputype; + +/* + * In order to configure and build outside the kernel source tree, + * we may wish to specify where the source tree lives. + */ +const char *source_directory; +const char *object_directory; +char *config_directory; + +/* + * A set of options may also be specified which are like CPU types, + * but which may also specify values for the options. + * A separate set of options may be defined for make-style options. 
+ */ +struct opt *opt, *mkopt, *opt_tail, *mkopt_tail; + +char *ident; + +int do_trace; + +#if MACHINE_VAX +int seen_mba, seen_uba; +#endif + +int seen_vme, seen_mbii; + +struct device *dtab; + +char errbuf[80]; +int yyline; + +struct file_list *ftab, *conf_list, **confp; +char *build_directory; + +int profiling = 0; + +int maxusers; + diff --git a/SETUP/config/lexer.l b/SETUP/config/lexer.l new file mode 100644 index 000000000..c5502b4ba --- /dev/null +++ b/SETUP/config/lexer.l @@ -0,0 +1,214 @@ +%{ +/* + * Mach Operating System + * Copyright (c) 1990 Carnegie-Mellon University + * Copyright (c) 1989 Carnegie-Mellon University + * Copyright (c) 1988 Carnegie-Mellon University + * Copyright (c) 1987 Carnegie-Mellon University + * All rights reserved. The CMU software License Agreement specifies + * the terms and conditions for use and redistribution. + */ + +/* + * Copyright (c) 1980 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms are permitted + * provided that the above copyright notice and this paragraph are + * duplicated in all such forms and that any documentation, + * advertising materials, and other materials related to such + * distribution and use acknowledge that the software was developed + * by the University of California, Berkeley. The name of the + * University may not be used to endorse or promote products derived + * from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * + * @(#)config.l 5.5 (Berkeley) 6/18/88 + */ + +#include +#include "parser.h" +#include "config.h" + +int kw_lookup(char *word); +int octal(char *str); +int hex(char *str); +int yylex(void); + +#define tprintf if (do_trace) printf + +/* + * Key word table + */ + +struct kt { + const char *kt_name; + int kt_val; +} key_words[] = { + { "and", AND }, + { "args", ARGS }, + { "at", AT }, + { "builddir", BUILDDIR }, + { "config", CONFIG }, + { "configdir", CONFIGDIR }, + { "controller", CONTROLLER }, + { "cpu", CPU }, + { "csr", CSR }, + { "device", DEVICE }, + { "disk", DISK }, + { "drive", DRIVE }, + { "dumps", DUMPS }, + { "flags", FLAGS }, + { "hz", HZ }, + { "ident", IDENT }, + { "init", INIT }, + { "machine", MACHINE }, + { "major", MAJOR }, + { "makeoptions", MAKEOPTIONS }, + { "makevariables", MAKEOPTIONS }, + { "master", MASTER }, + { "maxusers", MAXUSERS }, + { "mba", MBA }, + { "minor", MINOR }, + { "nexus", NEXUS }, + { "objectdir", OBJECTDIR }, + { "on", ON }, + { "options", OPTIONS }, + { "priority", PRIORITY }, + { "profile", PROFILE }, + { "pseudo-device",PSEUDO_DEVICE }, + { "root", ROOT }, + { "size", SIZE }, + { "slave", SLAVE }, + { "sourcedir", SOURCEDIR }, + { "swap", SWAP }, + { "tape", DEVICE }, + { "trace", TRACE }, + { "uba", UBA }, + { "vector", VECTOR }, + { "lun", LUN }, /* MMAX only */ + { "slot", SLOT }, /* MMAX only */ + { "tape", TAPE }, /* MMAX only */ + { "bin", BIN }, /* SQT ONLY */ + { "am", ADDRMOD }, /* MIPS */ + { "mbii", MBII }, /* MIPS */ + { "vme", VME }, /* MIPS */ + { 0, 0 }, +}; +%} + +%option nounput + +WORD ([A-Za-z_][-A-Za-z_]*|[A-Z][-A-Za-z_0-9]*) +WORD1 ([A-Za-z_][-A-Za-z_0-9]*) +%% +{WORD} | +{WORD1} { + int i; + + if ((i = kw_lookup(yytext)) == -1) + { + yylval.str = yytext; + tprintf("id(%s) ", yytext); + return ID; + } + tprintf("(%s) ", yytext); + return i; + } +\"[^"]+\" { + yytext[strlen(yytext)-1] = '\0'; + yylval.str = yytext + 1; + return ID; + } +0[0-7]* { + yylval.val = octal(yytext); + tprintf("#O:%o ", 
yylval.val); + return NUMBER; + } +0x[0-9a-fA-F]+ { + yylval.val = hex(yytext); + tprintf("#X:%x ", yylval.val); + return NUMBER; + } +[1-9][0-9]* { + yylval.val = atoi(yytext); + tprintf("#D:%d ", yylval.val); + return NUMBER; + } +[0-9]"."[0-9]* { + yylval.val = (int) (60 * atof(yytext) + 0.5); + return FPNUMBER; + } +"-" { + return MINUS; + } +"?" { + yylval.val = -1; + tprintf("? "); + return NUMBER; + } +\n/[ \t] { + yyline++; + tprintf("\n... "); + } +\n { + yyline++; + tprintf("\n"); + return SEMICOLON; + } +#.* { /* Ignored (comment) */; } +[ \t]* { /* Ignored (white space) */; } +";" { return SEMICOLON; } +"," { return COMMA; } +"=" { return EQUALS; } +"@" { return AT; } +. { return yytext[0]; } + + +%% +/* + * kw_lookup + * Look up a string in the keyword table. Returns a -1 if the + * string is not a keyword otherwise it returns the keyword number + */ + +int +kw_lookup(char *word) +{ + register struct kt *kp; + + for (kp = key_words; kp->kt_name != 0; kp++) + if (eq(word, kp->kt_name)) + return kp->kt_val; + return -1; +} + +/* + * Number conversion routines + */ + +int +octal(char *str) +{ + int num; + + (void) sscanf(str, "%o", &num); + return num; +} + +int +hex(char *str) +{ + int num; + + (void) sscanf(str+2, "%x", &num); + return num; +} + +int +yywrap() +{ + return 1; +} diff --git a/SETUP/config/main.c b/SETUP/config/main.c new file mode 100644 index 000000000..024b17be8 --- /dev/null +++ b/SETUP/config/main.c @@ -0,0 +1,296 @@ +/* + * Copyright (c) 1999-2009 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * "Portions Copyright (c) 1999 Apple Computer, Inc. All Rights + * Reserved. This file contains Original Code and/or Modifications of + * Original Code as defined in and that are subject to the Apple Public + * Source License Version 1.0 (the 'License'). You may not use this file + * except in compliance with the License. 
Please obtain a copy of the + * License at http://www.apple.com/publicsource and read it before using + * this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License." + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* + * Mach Operating System + * Copyright (c) 1990 Carnegie-Mellon University + * Copyright (c) 1989 Carnegie-Mellon University + * Copyright (c) 1988 Carnegie-Mellon University + * Copyright (c) 1987 Carnegie-Mellon University + * All rights reserved. The CMU software License Agreement specifies + * the terms and conditions for use and redistribution. + */ + +/* + * Copyright (c) 1980 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms are permitted + * provided that the above copyright notice and this paragraph are + * duplicated in all such forms and that any documentation, + * advertising materials, and other materials related to such + * distribution and use acknowledge that the software was developed + * by the University of California, Berkeley. The name of the + * University may not be used to endorse or promote products derived + * from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
+ */
+
+#ifndef lint
+char copyright[] =
+"@(#) Copyright (c) 1980 Regents of the University of California.\n\
+ All rights reserved.\n";
+#endif /* not lint */
+
+#ifndef lint
+static char sccsid[] __attribute__((used)) = "@(#)main.c 5.9 (Berkeley) 6/18/88";
+#endif /* not lint */
+
+/* NOTE(review): header names restored after extraction loss - confirm */
+#include <stdio.h>
+#include <ctype.h>
+#include "parser.h"
+#include "config.h"
+
+/*
+ * Report an input item that does not fit the fixed-size static buffer
+ * of `who' and terminate.  Used instead of writing past the buffer.
+ */
+static void
+line_too_long(const char *who, unsigned long limit)
+{
+	fprintf(stderr, "config: %s: input longer than %lu characters\n",
+	    who, limit);
+	exit(1);
+}
+
+/*
+ * Config builds a set of files for building a UNIX
+ * system given a description of the desired system.
+ *
+ * Options: -b dir (build directory), -d dir (source directory),
+ * -o dir (object directory), -c dir (config directory), -p (profiling).
+ * The single remaining argument names the system description, which is
+ * reopened as stdin and handed to yyparse().
+ * Exit status: 1 usage error, no memory or unknown machine type,
+ * 2 description unreadable, 3 parse failure.
+ */
+int
+main(int argc, char *argv[])
+{
+
+	source_directory = "..";	/* default */
+	object_directory = "..";
+	config_directory = (char *) 0;
+	while ((argc > 1) && (argv[1][0] == '-')) {
+		char *c;
+
+		argv++; argc--;
+		/*
+		 * c keeps walking the flag word even after check_arg has
+		 * advanced argv past the flag's argument.
+		 */
+		for (c = &argv[0][1]; *c ; c++) {
+			switch (*c) {
+			case 'b':
+				build_directory = argv[1];
+				goto check_arg;
+
+			case 'd':
+				source_directory = argv[1];
+				goto check_arg;
+
+			case 'o':
+				object_directory = argv[1];
+				goto check_arg;
+
+			case 'c':
+				config_directory = argv[1];
+
+			check_arg:
+				if (argv[1] == (char *) 0)
+					goto usage_error;
+				argv++; argc--;
+				break;
+
+			case 'p':
+				profiling++;
+				break;
+			default:
+				goto usage_error;
+			}
+		}
+	}
+	if (config_directory == (char *) 0) {
+		/* sizeof "/conf" counts the terminating NUL */
+		size_t len = strlen(source_directory) + sizeof "/conf";
+
+		config_directory = malloc(len);
+		if (config_directory == NULL) {
+			perror("config");
+			exit(1);
+		}
+		(void) snprintf(config_directory, len, "%s/conf",
+		    source_directory);
+	}
+	if (argc != 2) {
+		usage_error: ;
+		fprintf(stderr, "usage: config [ -bcdo dir ] [ -p ] sysname\n");
+		exit(1);
+	}
+	if (!build_directory)
+		build_directory = argv[1];
+	if (freopen(argv[1], "r", stdin) == NULL) {
+		perror(argv[1]);
+		exit(2);
+	}
+	dtab = NULL;
+	confp = &conf_list;
+	opt = 0;
+	if (yyparse())
+		exit(3);
+	switch (machine) {
+
+	case MACHINE_VAX:
+		vax_ioconf();		/* Print ioconf.c */
+		ubglue();		/* Create ubglue.s */
+		break;
+
+	case MACHINE_SUN:
+		sun_ioconf();
+		break;
+
+	case MACHINE_SUN2:
+	case MACHINE_SUN3:
+	case MACHINE_SUN4:
+		sun_ioconf();		/* Print ioconf.c */
+		mbglue();		/* Create mbglue.s */
+		break;
+
+	case MACHINE_ROMP:
+		romp_ioconf();
+		break;
+
+	case MACHINE_MMAX:
+		mmax_ioconf();
+		break;
+
+	case MACHINE_SQT:
+		sqt_ioconf();
+		break;
+
+	case MACHINE_I386:
+	case MACHINE_IX:
+		i386_ioconf();
+		break;
+
+	case MACHINE_MIPSY:
+	case MACHINE_MIPS:
+		mips_ioconf();
+		break;
+
+	case MACHINE_I860:
+		/* i860_ioconf(); */
+		break;
+
+	case MACHINE_M68K:
+		m68k_ioconf();
+		break;
+
+	case MACHINE_M88K:
+		m88k_ioconf();
+		break;
+
+	case MACHINE_M98K:
+		m98k_ioconf();
+		break;
+
+	case MACHINE_HPPA:
+		hppa_ioconf();
+		break;
+
+	case MACHINE_SPARC:
+		sparc_ioconf();
+		break;
+
+	case MACHINE_PPC:
+		ppc_ioconf();
+		break;
+
+	case MACHINE_ARM:
+		arm_ioconf();
+		break;
+
+	case MACHINE_X86_64:
+		x86_64_ioconf();
+		break;
+
+	default:
+		printf("Specify machine type, e.g. ``machine vax''\n");
+		exit(1);
+	}
+
+	makefile();			/* build Makefile */
+	headers();			/* make a lot of .h files */
+	swapconf();			/* swap config files */
+
+	return 0;
+}
+
+/*
+ * get_word
+ *	returns EOF on end of file
+ *	NULL on end of line
+ *	pointer to the word otherwise
+ *
+ * Leading blanks and tabs are skipped and "|" is returned as a word of
+ * its own.  The word lives in a static buffer that the next call
+ * overwrites.  A word cut short by end of file is reported as EOF.
+ * A word that does not fit the buffer is a fatal error.
+ */
+const char *
+get_word(FILE *fp)
+{
+	static char line[80];
+	register int ch;
+	register char *cp;
+
+	while ((ch = getc(fp)) != EOF)
+		if (ch != ' ' && ch != '\t')
+			break;
+	if (ch == EOF)
+		return ((char *)EOF);
+	if (ch == '\n')
+		return (NULL);
+	if (ch == '|')
+		return( "|");
+	cp = line;
+	*cp++ = ch;
+	while ((ch = getc(fp)) != EOF) {
+		if (isspace(ch))
+			break;
+		/* keep the last slot free for the terminating NUL */
+		if (cp >= &line[sizeof line - 1])
+			line_too_long("get_word", sizeof line - 1);
+		*cp++ = ch;
+	}
+	*cp = 0;
+	if (ch == EOF)
+		return ((char *)EOF);
+	(void) ungetc(ch, fp);
+	return (line);
+}
+
+/*
+ * get_rest
+ *	returns EOF on end of file
+ *	pointer to the rest of the current line otherwise (the
+ *	newline is consumed and not stored; the string may be empty)
+ *
+ * The text lives in a static buffer that the next call overwrites.
+ * A line that does not fit the buffer is a fatal error.
+ */
+char *
+get_rest(FILE *fp)
+{
+	static char line[80];
+	register int ch;
+	register char *cp;
+
+	cp = line;
+	while ((ch = getc(fp)) != EOF) {
+		if (ch == '\n')
+			break;
+		/* keep the last slot free for the terminating NUL */
+		if (cp >= &line[sizeof line - 1])
+			line_too_long("get_rest", sizeof line - 1);
+		*cp++ = ch;
+	}
+	*cp = 0;
+	if (ch == EOF)
+		return ((char *)EOF);
+	return (line);
+}
+
+/*
+ * prepend the path to a filename
+ *
+ * Returns a malloc'ed "object_directory/build_directory/file"; the
+ * caller owns the string.  Running out of memory is a fatal error.
+ */
+char *
+path(const char *file)
+{
+	register char *cp;
+	/* two '/' separators plus the terminating NUL */
+	size_t len = strlen(build_directory) + strlen(file) +
+	    strlen(object_directory) + 3;
+
+	cp = malloc(len);
+	if (cp == NULL) {
+		perror("config");
+		exit(1);
+	}
+	(void) snprintf(cp, len, "%s/%s/%s", object_directory,
+	    build_directory, file);
+	return (cp);
+}
diff --git a/SETUP/config/mkglue.c b/SETUP/config/mkglue.c
new file mode 100644
index 000000000..9d4b5ac6f
--- /dev/null
+++ b/SETUP/config/mkglue.c
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) 1999 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * "Portions Copyright (c) 1999 Apple Computer, Inc. All Rights
+ * Reserved. This file contains Original Code and/or Modifications of
+ * Original Code as defined in and that are subject to the Apple Public
+ * Source License Version 1.0 (the 'License'). You may not use this file
+ * except in compliance with the License. Please obtain a copy of the
+ * License at http://www.apple.com/publicsource and read it before using
+ * this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
+ * License for the specific language governing rights and limitations
+ * under the License."
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+/*
+ * Mach Operating System
+ * Copyright (c) 1990 Carnegie-Mellon University
+ * Copyright (c) 1989 Carnegie-Mellon University
+ * Copyright (c) 1988 Carnegie-Mellon University
+ * Copyright (c) 1987 Carnegie-Mellon University
+ * All rights reserved. The CMU software License Agreement specifies
+ * the terms and conditions for use and redistribution.
+ */
+
+/*
+ * Copyright (c) 1980 Regents of the University of California.
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms are permitted + * provided that the above copyright notice and this paragraph are + * duplicated in all such forms and that any documentation, + * advertising materials, and other materials related to such + * distribution and use acknowledge that the software was developed + * by the University of California, Berkeley. The name of the + * University may not be used to endorse or promote products derived + * from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef lint +static char sccsid[] __attribute__((used)) = "@(#)mkglue.c 5.6 (Berkeley) 6/18/88"; +#endif /* not lint */ + +/* + * Make the bus adaptor interrupt glue files. + */ +#include +#include +#include "config.h" +#include "parser.h" +#include + +void dump_mb_handler(FILE *fp, struct idlst *vec, int number); +void dump_ubavec(FILE *fp, char *vector, int number); +void dump_std(FILE *fp, FILE *gp); +void dump_intname(FILE *fp, char *vector, int number); +void dump_ctrs(FILE *fp); +void glue(FILE *fp, void (*dump_handler)(FILE *, struct idlst *, int)); + +/* + * Create the UNIBUS interrupt vector glue file. 
+ */
+
+/*
+ * Open the generated file `name' for writing.  The full pathname comes
+ * from path() and is released as soon as the stream is open.  If the
+ * file cannot be created the error is reported and config exits with
+ * status 1.
+ */
+static FILE *
+ub_open_output(const char *name)
+{
+	char *full = path(name);
+	FILE *fp = fopen(full, "w");
+
+	if (fp == NULL) {
+		perror(full);
+		exit(1);
+	}
+	free(full);
+	return (fp);
+}
+
+/*
+ * Call emit(fp, vector name, unit) for the interrupt vectors of every
+ * device in dtab whose parent is present, is not the
+ * (struct device *)-1 marker and is not named "mba".  Within one
+ * device a vector name is passed on only at its first occurrence in
+ * the d_vec list.
+ */
+static void
+ub_each_vector(FILE *fp, void (*emit)(FILE *, char *, int))
+{
+	struct device *dp, *mp;
+	struct idlst *id, *id2;
+
+	for (dp = dtab; dp != 0; dp = dp->d_next) {
+		mp = dp->d_conn;
+		if (mp == 0 || mp == (struct device *)-1 ||
+		    eq(mp->d_name, "mba"))
+			continue;
+		for (id = dp->d_vec; id; id = id->id_next) {
+			/* stop early if an earlier entry has the same name */
+			for (id2 = dp->d_vec; id2 != id; id2 = id2->id_next)
+				if (!strcmp(id->id, id2->id))
+					break;
+			if (id2 == id)
+				(*emit)(fp, id->id, dp->d_unit);
+		}
+	}
+}
+
+/*
+ * Create the UNIBUS interrupt vector glue files.
+ *
+ * ubglue.s receives, in this order: one handler per vector
+ * (dump_ubavec), the standard interrupt names (dump_std), one name per
+ * vector (dump_intname) and the counter space (dump_ctrs).  ubvec.s
+ * receives the defines that dump_std writes to its second stream.
+ */
+void
+ubglue(void)
+{
+	FILE *fp = ub_open_output("ubglue.s");
+	FILE *gp = ub_open_output("ubvec.s");
+
+	ub_each_vector(fp, dump_ubavec);
+	dump_std(fp, gp);
+	ub_each_vector(fp, dump_intname);
+	dump_ctrs(fp);
+	(void) fclose(fp);
+	(void) fclose(gp);
+}
+
+static int cntcnt = 0;	/* number of interrupt counters allocated */
+
+/*
+ * Print a UNIBUS interrupt vector.
+ *
+ * fp:     stream the handler text is written to
+ * vector: handler name
+ * number: unit number, appended to the name to form the entry label
+ *
+ * On VAX each call also claims the next slot of _fltintrcnt by
+ * advancing cntcnt.  Machine types other than VAX and MIPS produce
+ * no output.
+ */
+void
+dump_ubavec(FILE *fp, char *vector, int number)
+{
+	switch (machine) {
+
+	case MACHINE_VAX:
+		/*
+		 * The entry label is "_X<vector><number>".  It is printed
+		 * directly from its two parts, so no fixed-size scratch
+		 * buffer can be overrun by a long vector name.
+		 */
+		fprintf(fp, "\t.globl\t_X%s%d\n\t.align\t2\n_X%s%d:\n",
+		    vector, number, vector, number);
+		fprintf(fp,"\tTIM_PUSHR(0)\n");
+		fprintf(fp, "\tincl\t_fltintrcnt+(4*%d)\n", cntcnt++);
+		if (strncmp(vector, "dzx", 3) == 0)
+			fprintf(fp, "\tmovl\t$%d,r0\n\tjmp\tdzdma\n\n", number);
+		else {
+			if (strncmp(vector, "uur", 3) == 0) {
+				fprintf(fp, "#ifdef UUDMA\n");
+				fprintf(fp, "\tmovl\t$%d,r0\n\tjsb\tuudma\n",
+				    number);
+				fprintf(fp, "#endif\n");
+			}
+			fprintf(fp, "\tpushl\t$%d\n", number);
+			fprintf(fp, "\tcalls\t$1,_%s\n",vector);
+			fprintf(fp, "\tCOUNT(V_INTR)\n");
+			fprintf(fp, "\tTSREI_POPR\n");
+		}
+		break;
+
+	case MACHINE_MIPSY:
+	case MACHINE_MIPS:
+		/*
+		 * Actually, we should never get here!
+		 * Main does not even call ubglue.
+		 */
+		if (strncmp(vector, "dzx", 3) == 0)
+			fprintf(fp, "\tDZINTR(%s,%d)\n", vector, number);
+		else
+			fprintf(fp, "\tDEVINTR(%s,%d)\n", vector, number);
+		break;
+
+	default:
+		/* nothing is emitted for other machine types */
+		break;
+	}
+
+}
+
+/* names of the standard VAX interrupts that are not tied to a bus */
+static const char *vaxinames[] = {
+	"clock", "cnr", "cnx", "tur", "tux",
+	"mba0", "mba1", "mba2", "mba3",
+	"uba0", "uba1", "uba2", "uba3"
+};
+/* standard interrupt name lists; dump_std/dump_ctrs index by machine-1 */
+static struct stdintrs {
+	const char **si_names;	/* list of standard interrupt names */
+	int si_n;		/* number of such names */
+} stdintrs[] = {
+	{ vaxinames, sizeof (vaxinames) / sizeof (vaxinames[0]) },
+};
+/*
+ * Start the interrupt name table with the names
+ * of the standard vectors not directly associated
+ * with a bus. Also, dump the defines needed to
+ * reference the associated counters into a separate
+ * file which is prepended to locore.s.
+ *
+ * fp: stream that receives the _intrnames table
+ * gp: stream that receives one "#define I_<NAME> <offset>" per name,
+ *     where <offset> is the entry index times sizeof (long)
+ */
+void
+dump_std(FILE *fp, FILE *gp)
+{
+	/*
+	 * NOTE(review): stdintrs[] holds a single (VAX) entry, so this
+	 * index is only in range when machine - 1 == 0; presumably that
+	 * is MACHINE_VAX, the only caller path - confirm.
+	 */
+	struct stdintrs *si = &stdintrs[machine-1];
+	const char **cpp;
+	int i;
+
+	fprintf(fp, "\n\t.globl\t_intrnames\n");
+	fprintf(fp, "\n\t.globl\t_eintrnames\n");
+	fprintf(fp, "\t.data\n");
+	fprintf(fp, "_intrnames:\n");
+	cpp = si->si_names;
+	for (i = 0; i < si->si_n; i++) {
+		const char *cp;
+		char *tp;
+		char buf[80];
+
+		cp = *cpp;
+		/* drop a leading "int" or "intr" from the define name */
+		if (cp[0] == 'i' && cp[1] == 'n' && cp[2] == 't') {
+			cp += 3;
+			if (*cp == 'r')
+				cp++;
+		}
+		/*
+		 * Upper-case into buf, never writing past its end.
+		 * toupper() leaves non-lowercase characters unchanged;
+		 * its argument must be representable as unsigned char.
+		 */
+		for (tp = buf; *cp && tp < &buf[sizeof buf - 1]; cp++)
+			*tp++ = (char)toupper((unsigned char)*cp);
+		*tp = '\0';
+		/* cast so the argument matches %lu on every platform */
+		fprintf(gp, "#define\tI_%s\t%lu\n", buf,
+		    (unsigned long)(i * sizeof (long)));
+		fprintf(fp, "\t.asciz\t\"%s\"\n", *cpp);
+		cpp++;
+	}
+}
+
+/*
+ * Write one .asciz entry of the interrupt name table: the vector name
+ * with every "int" or "intr" removed, followed by the unit number.
+ */
+void
+dump_intname(FILE *fp, char *vector, int number)
+{
+	const char *cp = vector;
+
+	fprintf(fp, "\t.asciz\t\"");
+	/*
+	 * Skip any "int" or "intr" in the name.
+	 */
+	while (*cp) {
+		if (cp[0] == 'i' && cp[1] == 'n' && cp[2] == 't') {
+			cp += 3;
+			if (*cp == 'r')
+				cp++;
+		} else {
+			putc(*cp, fp);
+			cp++;
+		}
+	}
+	fprintf(fp, "%d\"\n", number);
+}
+
+/*
+ * Reserve space for the interrupt counters.
+ *
+ * Closes the name table with _eintrnames, then lays out _intrcnt
+ * (one 4-byte slot per standard interrupt name) followed by
+ * _fltintrcnt (one 4-byte slot per counter claimed through cntcnt)
+ * and ends with _eintrcnt.
+ */
+void
+dump_ctrs(FILE *fp)
+{
+	/* same machine-1 indexing of stdintrs[] as dump_std */
+	struct stdintrs *si = &stdintrs[machine-1];
+
+	fprintf(fp, "_eintrnames:\n");
+	fprintf(fp, "\n\t.globl\t_intrcnt\n");
+	fprintf(fp, "\n\t.globl\t_eintrcnt\n");
+	fprintf(fp, "\t.align 2\n");
+	fprintf(fp, "_intrcnt:\n");
+	fprintf(fp, "\t.space\t4 * %d\n", si->si_n);
+	fprintf(fp, "_fltintrcnt:\n");
+	fprintf(fp, "\t.space\t4 * %d\n", cntcnt);
+	fprintf(fp, "_eintrcnt:\n\n");
+	fprintf(fp, "\t.text\n");
+}
+
+/*
+ * Routines for making Sun mb interrupt file mbglue.s
+ */
+
+/*
+ * print an interrupt handler for mainbus
+ *
+ * Writes one VECINTR(...) line built from the vector name vec->id and
+ * the unit number.
+ */
+void
+dump_mb_handler(FILE *fp, struct idlst *vec, int number)
+{
+	fprintf(fp, "\tVECINTR(_X%s%d, _%s, _V%s%d)\n",
+		vec->id, number, vec->id, vec->id, number);
+}
+
+/*
+ * Create mbglue.s: an include line followed by one handler per
+ * distinct vector, written by glue() through dump_mb_handler.
+ * Exits with status 1 if the file cannot be created.
+ */
+void
+mbglue(void)
+{
+	register FILE *fp;
+	const char *name = "mbglue.s";
+
+	fp = fopen(path(name), "w");
+	if (fp == 0) {
+		perror(path(name));
+		exit(1);
+	}
+	/*
+	 * NOTE(review): no header name follows "#include" in this string
+	 * as it appears here; presumably an <...> name was lost when the
+	 * patch text was extracted - confirm against the upstream source.
+	 */
+	fprintf(fp, "#include \n\n");
+	glue(fp, dump_mb_handler);
+	(void) fclose(fp);
+}
+
+/*
+ * Call dump_handler(fp, vector, unit) for the vectors of every device
+ * in dtab whose parent is present, is not the (struct device *)-1
+ * marker and is not named "mba".  The inner scan reaches the entry
+ * itself only when no earlier entry of the same d_vec list carries the
+ * same name, so each name is handled once per device.
+ */
+void
+glue(FILE *fp, void (*dump_handler)(FILE *, struct idlst *, int))
+{
+	register struct device *dp, *mp;
+
+	for (dp = dtab; dp != 0; dp = dp->d_next) {
+		mp = dp->d_conn;
+		if (mp != 0 && mp != (struct device *)-1 &&
+		    !eq(mp->d_name, "mba")) {
+			struct idlst *vd, *vd2;
+
+			for (vd = dp->d_vec; vd; vd = vd->id_next) {
+				for (vd2 = dp->d_vec; vd2; vd2 = vd2->id_next) {
+					if (vd2 == vd) {
+						(void)(*dump_handler)
+							(fp, vd, dp->d_unit);
+						break;
+					}
+					if (!strcmp(vd->id, vd2->id))
+						break;
+				}
+			}
+		}
+	}
+}
diff --git a/SETUP/config/mkheaders.c b/SETUP/config/mkheaders.c
new file mode 100644
index 000000000..a0e3fdc38
--- /dev/null
+++ b/SETUP/config/mkheaders.c
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 1999-2006 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * "Portions Copyright (c) 1999 Apple Computer, Inc. All Rights
+ * Reserved.
This file contains Original Code and/or Modifications of + * Original Code as defined in and that are subject to the Apple Public + * Source License Version 1.0 (the 'License'). You may not use this file + * except in compliance with the License. Please obtain a copy of the + * License at http://www.apple.com/publicsource and read it before using + * this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License." + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* + * Mach Operating System + * Copyright (c) 1990 Carnegie-Mellon University + * Copyright (c) 1989 Carnegie-Mellon University + * Copyright (c) 1988 Carnegie-Mellon University + * Copyright (c) 1987 Carnegie-Mellon University + * All rights reserved. The CMU software License Agreement specifies + * the terms and conditions for use and redistribution. + */ + +/* + * Copyright (c) 1980 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms are permitted + * provided that the above copyright notice and this paragraph are + * duplicated in all such forms and that any documentation, + * advertising materials, and other materials related to such + * distribution and use acknowledge that the software was developed + * by the University of California, Berkeley. The name of the + * University may not be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
+ * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#ifndef lint
+static char sccsid[] __attribute__((used)) = "@(#)mkheaders.c 5.5 (Berkeley) 6/18/88";
+#endif /* not lint */
+
+/*
+ * Make all the .h files for the optional entries
+ */
+
+/* NOTE(review): header names restored after extraction loss - confirm */
+#include <stdio.h>
+#include <unistd.h>	/* unlink */
+#include <ctype.h>
+#include "config.h"
+#include "parser.h"
+
+static void do_count(const char *dev, const char *hname, int search);
+static void do_header(const char *dev, const char *hname, int count);
+static int file_needed(const char *name);
+static char *toheader(const char *dev);
+static char *tomacro(const char *dev);
+
+/*
+ * Generate or refresh the count header of every file-list entry that
+ * names a device it needs (f_needs).
+ */
+void
+headers(void)
+{
+	struct file_list *fl;
+
+	for (fl = ftab; fl != 0; fl = fl->f_next)
+		if (fl->f_needs != 0)
+			do_count(fl->f_needs, fl->f_needs, 1);
+}
+
+/*
+ * count all the devices of a certain type and recurse to count
+ * whatever the device is connected to
+ *
+ * dev:    device name to count
+ * hname:  name of the header the count is recorded in
+ * search: non-zero to also count the controller the device hangs off
+ */
+static void
+do_count(const char *dev, const char *hname, int search)
+{
+	struct device *dp, *mp;
+	int count;
+
+	for (count = 0,dp = dtab; dp != 0; dp = dp->d_next)
+		if (dp->d_unit != -1 && eq(dp->d_name, dev)) {
+			/*
+			 * Avoid making .h files for bus types on sun machines
+			 */
+			if ((machine == MACHINE_SUN2 ||
+			     machine == MACHINE_SUN3 ||
+			     machine == MACHINE_SUN4)
+			    && dp->d_conn == TO_NEXUS){
+				return;
+			}
+			if (dp->d_type == PSEUDO_DEVICE) {
+				count =
+				    dp->d_slave != UNKNOWN ? dp->d_slave : 1;
+				/*
+				 * A NULL dev makes do_header build the
+				 * macro name from hname without the
+				 * leading 'N'.
+				 */
+				if (dp->d_flags)
+					dev = NULL;
+				break;
+			}
+			if (machine != MACHINE_SUN2 && machine != MACHINE_SUN3
+			    && machine != MACHINE_SUN4)
+				/* avoid ie0,ie0,ie1 setting NIE to 3 */
+				count++;
+			/*
+			 * Allow holes in unit numbering,
+			 * assumption is unit numbering starts
+			 * at zero.
+			 */
+			if (dp->d_unit + 1 > count)
+				count = dp->d_unit + 1;
+			if (search) {
+				mp = dp->d_conn;
+				if (mp != 0 && mp != TO_NEXUS &&
+				    mp->d_conn != TO_NEXUS) {
+					/*
+					 * Check for the case of the
+					 * controller that the device
+					 * is attached to is in a separate
+					 * file (e.g. "sd" and "sc").
+					 * In this case, do NOT define
+					 * the number of controllers
+					 * in the hname .h file.
+					 */
+					if (!file_needed(mp->d_name))
+						do_count(mp->d_name, hname, 0);
+					search = 0;
+				}
+			}
+		}
+	do_header(dev, hname, count);
+}
+
+/*
+ * Scan the file list to see if name is needed to bring in a file.
+ */
+static int
+file_needed(const char *name)
+{
+	struct file_list *fl;
+
+	for (fl = ftab; fl != 0; fl = fl->f_next) {
+		if (fl->f_needs && strcmp(fl->f_needs, name) == 0)
+			return (1);
+	}
+	return (0);
+}
+
+/*
+ * Allocate one list node; running out of memory is a fatal error.
+ */
+static struct file_list *
+new_define(void)
+{
+	struct file_list *fl = malloc(sizeof *fl);
+
+	if (fl == NULL) {
+		perror("config");
+		exit(1);
+	}
+	return (fl);
+}
+
+/*
+ * Record "#define <macro> <count>" in the header for hname.
+ *
+ * If the header does not exist it is created with that one define and
+ * an include line for it is appended to meta_features.h.  Otherwise
+ * its existing defines are read back; the file is rewritten only when
+ * the recorded count differs from `count' or the macro is missing.
+ * When count is zero every define in the rewritten file is set to 0.
+ *
+ * dev == NULL selects the macro name built from hname without the
+ * leading 'N' that tomacro() supplies.
+ */
+static void
+do_header(const char *dev, const char *hname, int count)
+{
+	char *file, *name;
+	const char *inw;
+	char *inwcopy;
+	struct file_list *fl;
+	struct file_list *fl_head, *fl_next;
+	FILE *inf, *outf;
+	int inc, oldcount;
+
+	file = toheader(hname);
+	name = tomacro(dev?dev:hname) + (dev == NULL);
+	inf = fopen(file, "r");
+	oldcount = -1;
+	if (inf == 0) {
+		(void) unlink(file);
+		outf = fopen(file, "w");
+		if (outf == 0) {
+			perror(file);
+			exit(1);
+		}
+		fprintf(outf, "#define %s %d\n", name, count);
+		(void) fclose(outf);
+		file = path("meta_features.h");
+		outf = fopen(file, "a");
+		if (outf == 0) {
+			perror(file);
+			exit(1);
+		}
+		fprintf(outf, "#include <%s.h>\n", hname);
+		(void) fclose(outf);
+		return;
+	}
+	fl_head = 0;
+	for (;;) {
+		const char *cp;
+		/* each line reads: "#define" NAME COUNT end-of-line */
+		if ((inw = get_word(inf)) == 0 || inw == (char *)EOF)
+			break;
+		if ((inw = get_word(inf)) == 0 || inw == (char *)EOF)
+			break;
+		/* copied: get_word() reuses its buffer on the next call */
+		inwcopy = ns(inw);
+		cp = get_word(inf);
+		if (cp == 0 || cp == (char *)EOF)
+			break;
+		inc = atoi(cp);
+		if (eq(inwcopy, name)) {
+			oldcount = inc;
+			inc = count;
+		}
+		cp = get_word(inf);
+		if (cp == (char *)EOF)
+			break;
+		fl = new_define();
+		fl->f_fn = inwcopy;
+		fl->f_type = inc;
+		fl->f_next = fl_head;
+		fl_head = fl;
+	}
+	(void) fclose(inf);
+	if (count == oldcount) {
+		/* header is already up to date: drop the list */
+		for (fl = fl_head; fl != 0; fl = fl_next) {
+			fl_next = fl->f_next;
+			free(fl);
+		}
+		return;
+	}
+	if (oldcount == -1) {
+		fl = new_define();
+		fl->f_fn = name;
+		fl->f_type = count;
+		fl->f_next = fl_head;
+		fl_head = fl;
+	}
+	unlink(file);
+	outf = fopen(file, "w");
+	if (outf == 0) {
+		perror(file);
+		exit(1);
+	}
+	for (fl = fl_head; fl != 0; fl = fl_next) {
+		/* fetch the link first: the node is freed below */
+		fl_next = fl->f_next;
+		fprintf(outf, "#define %s %d\n",
+		    fl->f_fn, count ? fl->f_type : 0);
+		free(fl);
+	}
+	(void) fclose(outf);
+}
+
+/*
+ * convert a dev name to a .h file name
+ *
+ * The result is path(dev) followed by ".h", held in a static buffer
+ * that the next call overwrites.
+ */
+static char *
+toheader(const char *dev)
+{
+	static char hbuf[MAXPATHLEN];
+	char *full = path(dev);
+
+	/* snprintf always terminates hbuf */
+	(void) snprintf(hbuf, sizeof hbuf, "%s.h", full);
+	free(full);
+	return (hbuf);
+}
+
+/*
+ * convert a dev name to a macro name
+ *
+ * The result is 'N' followed by dev in upper case, held in a static
+ * buffer that the next call overwrites.  A name too long for the
+ * buffer is cut off rather than written past its end.
+ */
+static char *
+tomacro(const char *dev)
+{
+	static char mbuf[FILENAME_MAX];
+	char *cp = mbuf;
+	char *const end = &mbuf[sizeof mbuf - 1];
+
+	*cp++ = 'N';
+	/* toupper() needs a value representable as unsigned char */
+	while (*dev && cp < end)
+		*cp++ = (char)toupper((unsigned char)*dev++);
+	*cp = 0;
+	return (mbuf);
+}
diff --git a/SETUP/config/mkioconf.c b/SETUP/config/mkioconf.c
new file mode 100644
index 000000000..90b6c2f97
--- /dev/null
+++ b/SETUP/config/mkioconf.c
@@ -0,0 +1,2086 @@
+/*
+ * Copyright (c) 1999 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * "Portions Copyright (c) 1999 Apple Computer, Inc. All Rights
+ * Reserved. This file contains Original Code and/or Modifications of
+ * Original Code as defined in and that are subject to the Apple Public
+ * Source License Version 1.0 (the 'License'). You may not use this file
+ * except in compliance with the License. Please obtain a copy of the
+ * License at http://www.apple.com/publicsource and read it before using
+ * this file.
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License." + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* + * Mach Operating System + * Copyright (c) 1990 Carnegie-Mellon University + * Copyright (c) 1989 Carnegie-Mellon University + * Copyright (c) 1988 Carnegie-Mellon University + * Copyright (c) 1987 Carnegie-Mellon University + * All rights reserved. The CMU software License Agreement specifies + * the terms and conditions for use and redistribution. + */ + +/* + * Copyright (c) 1980 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms are permitted + * provided that the above copyright notice and this paragraph are + * duplicated in all such forms and that any documentation, + * advertising materials, and other materials related to such + * distribution and use acknowledge that the software was developed + * by the University of California, Berkeley. The name of the + * University may not be used to endorse or promote products derived + * from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
+ */ + +#include +#include /* for unlink */ +#include "parser.h" +#include "config.h" + +/* + * build the ioconf.c file + */ +char *intv(struct device *dev); +char *intv2(struct device *dev); +void i386_pseudo_inits(FILE *fp); /* XXX function in wrong block */ +void check_vector(struct idlst *vec); +void nrw_ioconf(void); +void m88k_pseudo_inits(FILE *fp); +void m98k_pseudo_inits(FILE *fp); +char *m88k_dn(char *name); +char *m98k_dn(char *name); +char *concat3(char *buf, const char *p1, const char *p2, const char *p3); + +#if MACHINE_VAX + +void +vax_ioconf(void) +{ + register struct device *dp, *mp, *np; + register int uba_n, slave; + FILE *fp; + + fp = fopen(path("ioconf.c"), "w"); + if (fp == 0) { + perror(path("ioconf.c")); + exit(1); + } +/*MACH_KERNEL*/ + fprintf(fp, "#ifndef MACH_KERNEL\n"); +/*MACH_KERNEL*/ + fprintf(fp, "#include \n"); + fprintf(fp, "#include \n"); + fprintf(fp, "#include \n"); + fprintf(fp, "#include \n"); + fprintf(fp, "#include \n"); +/*MACH_KERNEL*/ + fprintf(fp, "#endif MACH_KERNEL\n"); +/*MACH_KERNEL*/ + fprintf(fp, "\n"); + fprintf(fp, "#include \n"); + fprintf(fp, "#include \n\n"); + fprintf(fp, "\n"); + fprintf(fp, "#define C (caddr_t)\n\n"); + /* + * First print the mba initialization structures + */ + if (seen_mba) { + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (mp == 0 || mp == TO_NEXUS || + !eq(mp->d_name, "mba")) + continue; + fprintf(fp, "extern struct mba_driver %sdriver;\n", + dp->d_name); + } + fprintf(fp, "\nstruct mba_device mbdinit[] = {\n"); + fprintf(fp, "\t/* Device, Unit, Mba, Drive, Dk */\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (dp->d_unit == QUES || mp == 0 || + mp == TO_NEXUS || !eq(mp->d_name, "mba")) + continue; + if (dp->d_addr) { + printf("can't specify csr address on mba for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_vec != 0) { + printf("can't specify vector for %s%d on mba\n", + dp->d_name, dp->d_unit); + continue; + } + if 
(dp->d_drive == UNKNOWN) { + printf("drive not specified for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_slave != UNKNOWN) { + printf("can't specify slave number for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + fprintf(fp, "\t{ &%sdriver, %d, %s,", + dp->d_name, dp->d_unit, qu(mp->d_unit)); + fprintf(fp, " %s, %d },\n", + qu(dp->d_drive), dp->d_dk); + } + fprintf(fp, "\t0\n};\n\n"); + /* + * Print the mbsinit structure + * Driver Controller Unit Slave + */ + fprintf(fp, "struct mba_slave mbsinit [] = {\n"); + fprintf(fp, "\t/* Driver, Ctlr, Unit, Slave */\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + /* + * All slaves are connected to something which + * is connected to the massbus. + */ + if ((mp = dp->d_conn) == 0 || mp == TO_NEXUS) + continue; + np = mp->d_conn; + if (np == 0 || np == TO_NEXUS || + !eq(np->d_name, "mba")) + continue; + fprintf(fp, "\t{ &%sdriver, %s", + mp->d_name, qu(mp->d_unit)); + fprintf(fp, ", %2d, %s },\n", + dp->d_unit, qu(dp->d_slave)); + } + fprintf(fp, "\t0\n};\n\n"); + } + /* + * Now generate interrupt vectors for the unibus + */ + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_vec != 0) { + struct idlst *ip; + mp = dp->d_conn; + if (mp == 0 || mp == TO_NEXUS || + !eq(mp->d_name, "uba")) + continue; + fprintf(fp, + "extern struct uba_driver %sdriver;\n", + dp->d_name); + fprintf(fp, "extern "); + ip = dp->d_vec; + for (;;) { + fprintf(fp, "X%s%d()", ip->id, dp->d_unit); + ip = ip->id_next; + if (ip == 0) + break; + fprintf(fp, ", "); + } + fprintf(fp, ";\n"); + fprintf(fp, "int\t (*%sint%d[])() = { ", dp->d_name, + dp->d_unit); + ip = dp->d_vec; + for (;;) { + fprintf(fp, "X%s%d", ip->id, dp->d_unit); + ip = ip->id_next; + if (ip == 0) + break; + fprintf(fp, ", "); + } + fprintf(fp, ", 0 } ;\n"); + } + } + fprintf(fp, "\nstruct uba_ctlr ubminit[] = {\n"); + fprintf(fp, "/*\t driver,\tctlr,\tubanum,\talive,\tintr,\taddr */\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; 
+ if (dp->d_type != CONTROLLER || mp == TO_NEXUS || mp == 0 || + !eq(mp->d_name, "uba")) + continue; + if (dp->d_vec == 0) { + printf("must specify vector for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_addr == 0) { + printf("must specify csr address for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_drive != UNKNOWN || dp->d_slave != UNKNOWN) { + printf("drives need their own entries; dont "); + printf("specify drive or slave for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_flags) { + printf("controllers (e.g. %s%d) ", + dp->d_name, dp->d_unit); + printf("don't have flags, only devices do\n"); + continue; + } + fprintf(fp, + "\t{ &%sdriver,\t%d,\t%s,\t0,\t%sint%d, C 0%o },\n", + dp->d_name, dp->d_unit, qu(mp->d_unit), + dp->d_name, dp->d_unit, dp->d_addr); + } + fprintf(fp, "\t0\n};\n"); +/* unibus devices */ + fprintf(fp, "\nstruct uba_device ubdinit[] = {\n"); + fprintf(fp, +"\t/* driver, unit, ctlr, ubanum, slave, intr, addr, dk, flags*/\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (dp->d_unit == QUES || dp->d_type != DEVICE || mp == 0 || + mp == TO_NEXUS || mp->d_type == MASTER || + eq(mp->d_name, "mba")) + continue; + np = mp->d_conn; + if (np != 0 && np != TO_NEXUS && eq(np->d_name, "mba")) + continue; + np = 0; + if (eq(mp->d_name, "uba")) { + if (dp->d_vec == 0) { + printf("must specify vector for device %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_addr == 0) { + printf("must specify csr for device %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_drive != UNKNOWN || dp->d_slave != UNKNOWN) { + printf("drives/slaves can be specified "); + printf("only for controllers, "); + printf("not for device %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + uba_n = mp->d_unit; + slave = QUES; + } else { + if ((np = mp->d_conn) == 0) { + printf("%s%d isn't connected to anything ", + mp->d_name, mp->d_unit); + printf(", so %s%d is unattached\n", + 
dp->d_name, dp->d_unit); + continue; + } + uba_n = np->d_unit; + if (dp->d_drive == UNKNOWN) { + printf("must specify ``drive number'' "); + printf("for %s%d\n", dp->d_name, dp->d_unit); + continue; + } + /* NOTE THAT ON THE UNIBUS ``drive'' IS STORED IN */ + /* ``SLAVE'' AND WE DON'T WANT A SLAVE SPECIFIED */ + if (dp->d_slave != UNKNOWN) { + printf("slave numbers should be given only "); + printf("for massbus tapes, not for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_vec != 0) { + printf("interrupt vectors should not be "); + printf("given for drive %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_addr != 0) { + printf("csr addresses should be given only "); + printf("on controllers, not on %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + slave = dp->d_drive; + } + fprintf(fp, "\t{ &%sdriver, %2d, %s,", + eq(mp->d_name, "uba") ? dp->d_name : mp->d_name, dp->d_unit, + eq(mp->d_name, "uba") ? " -1" : qu(mp->d_unit)); + fprintf(fp, " %s, %2d, %s, C 0%-6o, %d, 0x%x },\n", + qu(uba_n), slave, intv(dp), dp->d_addr, dp->d_dk, + dp->d_flags); + } + fprintf(fp, "\t0\n};\n"); + (void) fclose(fp); +} +#endif + +#if MACHINE_SUN +#define SP_OBIO 0x0004 /* on board i/o (for sun/autoconf.h) */ + +#define VEC_LO 64 +#define VEC_HI 255 + +void pseudo_inits(FILE *fp); + +void +check_vector(struct idlst *vec) +{ + + if (vec->id_vec == 0) + fprintf(stderr, "vector number for %s not given\n", vec->id); + else if (vec->id_vec < VEC_LO || vec->id_vec > VEC_HI) + fprintf(stderr, + "vector number %d for %s is not between %d and %d\n", + vec->id_vec, vec->id, VEC_LO, VEC_HI); +} + +void +sun_ioconf(void) +{ + register struct device *dp, *mp; + register int slave; + register struct idlst *vp; + FILE *fp; + + fp = fopen(path("ioconf.c"), "w"); + if (fp == 0) { + perror(path("ioconf.c")); + exit(1); + } +/*MACH_KERNEL*/ + fprintf(fp, "#ifndef MACH_KERNEL\n"); +/*MACH_KERNEL*/ + fprintf(fp, "#include \n"); + fprintf(fp, "#include \n"); + fprintf(fp, 
"#include \n"); + fprintf(fp, "#include \n"); +/*MACH_KERNEL*/ + fprintf(fp, "#endif MACH_KERNEL\n"); +/*MACH_KERNEL*/ + fprintf(fp, "\n"); + fprintf(fp, "#include \n"); + fprintf(fp, "\n"); + fprintf(fp, "#define C (caddr_t)\n\n"); + fprintf(fp, "\n"); + + /* + * Now generate interrupt vectors for the Mainbus + */ + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (mp == TO_NEXUS || mp == 0 || mp->d_conn != TO_NEXUS) + continue; + fprintf(fp, "extern struct mb_driver %sdriver;\n", + dp->d_name); + if (dp->d_vec != 0) { + if (dp->d_pri == 0) + fprintf(stderr, + "no priority specified for %s%d\n", + dp->d_name, dp->d_unit); + fprintf(fp, "extern "); + for (vp = dp->d_vec;;) { + if (machine == MACHINE_SUN4) + fprintf(fp, "%s()", vp->id); + else + fprintf(fp, "X%s%d()", + vp->id, dp->d_unit); + vp = vp->id_next; + if (vp == 0) + break; + fprintf(fp, ", "); + } + fprintf(fp, ";\n"); + + for (vp = dp->d_vec; vp; vp = vp->id_next) { + fprintf(fp, "int V%s%d = %d;\n", + vp->id, dp->d_unit, dp->d_unit); + } + + fprintf(fp, "struct vec %s[] = { ", intv(dp)); + for (vp = dp->d_vec; vp != 0; vp = vp->id_next) { + if (machine == MACHINE_SUN4) + fprintf(fp, "{ %s, %d, &V%s%d }, ", + vp->id, vp->id_vec, + vp->id, dp->d_unit); + else + fprintf(fp, "{ X%s%d, %d, &V%s%d }, ", + vp->id, dp->d_unit, vp->id_vec, + vp->id, dp->d_unit); + check_vector(vp); + } + fprintf(fp, "0 };\n"); + } + } + + /* + * Now spew forth the mb_ctlr structures + */ + fprintf(fp, "\nstruct mb_ctlr mbcinit[] = {\n"); + fprintf(fp, +"/* driver,\tctlr,\talive,\taddress,\tintpri,\t intr,\tspace */\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (dp->d_type != CONTROLLER || mp == TO_NEXUS || mp == 0 || + mp->d_conn != TO_NEXUS) + continue; + if (dp->d_addr == UNKNOWN) { + printf("must specify csr address for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_drive != UNKNOWN || dp->d_slave != UNKNOWN) { + printf("drives need their own entries; "); + 
printf("don't specify drive or slave for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_flags) { + printf("controllers (e.g. %s%d) don't have flags, ", + dp->d_name, dp->d_unit); + printf("only devices do\n"); + continue; + } + if (machine == MACHINE_SUN4) + fprintf(fp, + "{ &%sdriver,\t%d,\t0,\tC 0x%08x,\t%d,\t%s, 0x%x },\n", + dp->d_name, dp->d_unit, dp->d_addr, + (dp->d_bus==SP_OBIO) ? (dp->d_pri << 1) : (dp->d_pri<<1)-1, + intv(dp), ((dp->d_mach << 16) | dp->d_bus)); + else + fprintf(fp, + "{ &%sdriver,\t%d,\t0,\tC 0x%08x,\t%d,\t%s, 0x%x },\n", + dp->d_name, dp->d_unit, dp->d_addr, + dp->d_pri, intv(dp), ((dp->d_mach << 16) | dp->d_bus)); + } + fprintf(fp, "\t0\n};\n"); + + /* + * Now we go for the mb_device stuff + */ + fprintf(fp, "\nstruct mb_device mbdinit[] = {\n"); + fprintf(fp, +"/* driver,\tunit, ctlr, slave, address, pri, dk, flags, intr, space */\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (dp->d_unit == QUES || dp->d_type != DEVICE || mp == 0 || + mp == TO_NEXUS || mp->d_type == MASTER) + continue; + if (mp->d_conn == TO_NEXUS) { + if (dp->d_addr == UNKNOWN) { + printf("must specify csr for device %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_drive != UNKNOWN || dp->d_slave != UNKNOWN) { + printf("drives/slaves can be specified only "); + printf("for controllers, not for device %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + slave = QUES; + } else { + if (mp->d_conn == 0) { + printf("%s%d isn't connected to anything, ", + mp->d_name, mp->d_unit); + printf("so %s%d is unattached\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_drive == UNKNOWN) { + printf("must specify ``drive number'' for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + /* NOTE THAT ON THE UNIBUS ``drive'' IS STORED IN */ + /* ``SLAVE'' AND WE DON'T WANT A SLAVE SPECIFIED */ + if (dp->d_slave != UNKNOWN) { + printf("slave numbers should be given only "); + printf("for massbus tapes, not for 
%s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_pri != 0) { + printf("interrupt priority should not be "); + printf("given for drive %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_addr != UNKNOWN) { + printf("csr addresses should be given only"); + printf(" on controllers, not on %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + slave = dp->d_drive; + } + if (machine == MACHINE_SUN4) + fprintf(fp, +"{ &%sdriver,\t%d, %s, %2d, C 0x%08x, %d, %d, 0x%x, %s, 0x%x },\n", + mp->d_conn == TO_NEXUS? dp->d_name : mp->d_name, dp->d_unit, + mp->d_conn == TO_NEXUS? " -1" : qu(mp->d_unit), + slave, + dp->d_addr == UNKNOWN? 0 : dp->d_addr, + dp->d_pri * 2, dp->d_dk, dp->d_flags, intv(dp), + ((dp->d_mach << 16) | dp->d_bus)); + else + fprintf(fp, +"{ &%sdriver,\t%d, %s, %2d, C 0x%08x, %d, %d, 0x%x, %s, 0x%x },\n", + mp->d_conn == TO_NEXUS? dp->d_name : mp->d_name, dp->d_unit, + mp->d_conn == TO_NEXUS? " -1" : qu(mp->d_unit), + slave, + dp->d_addr == UNKNOWN? 0 : dp->d_addr, + dp->d_pri, dp->d_dk, dp->d_flags, intv(dp), + ((dp->d_mach << 16) | dp->d_bus)); + } + fprintf(fp, "\t0\n};\n"); + pseudo_inits(fp); + (void) fclose(fp); +} + +void +pseudo_inits(FILE *fp) +{ +#ifdef notdef + register struct device *dp; + int count; + + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + fprintf(fp, "extern int %s(int);\n", dp->d_init); + } +#endif notdef + fprintf(fp, "struct pseudo_init {\n"); + fprintf(fp, "\tint\tps_count;\n\tint\t(*ps_func)();\n"); + fprintf(fp, "} pseudo_inits[] = {\n"); +#ifdef notdef + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + count = dp->d_slave; + if (count <= 0) + count = 1; + fprintf(fp, "\t{%d,\t%s},\n", count, dp->d_init); + } +#endif notdef + fprintf(fp, "\t{0,\t0},\n};\n"); +} +#endif + +#if MACHINE_ROMP +void +romp_ioconf(void) +{ + register struct device *dp, *mp; + register int slave; + FILE 
*fp; + + fp = fopen(path("ioconf.c"), "w"); + if (fp == 0) { + perror(path("ioconf.c")); + exit(1); + } +/*MACH_KERNEL*/ + fprintf(fp, "#ifndef MACH_KERNEL\n"); +/*MACH_KERNEL*/ + fprintf(fp, "#include \n"); + fprintf(fp, "#include \n"); + fprintf(fp, "#include \n"); + fprintf(fp, "#include \n"); +/*MACH_KERNEL*/ + fprintf(fp, "#endif MACH_KERNEL\n"); +/*MACH_KERNEL*/ + fprintf(fp, "\n"); + fprintf(fp, "#include \n"); + fprintf(fp, "\n"); + fprintf(fp, "#define C (caddr_t)\n\n"); + fprintf(fp, "\n"); + + fprintf (fp, "struct iocc_hd iocc_hd[] = {{C 0xF0000000,}};\n"); + /* + * Now generate interrupt vectors for the Winnerbus + */ + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_pri != 0) { + mp = dp->d_conn; + if (mp == 0 || mp == TO_NEXUS || + !eq(mp->d_name, "iocc")) + continue; + fprintf(fp, "extern struct iocc_driver %sdriver;\n", + dp->d_name); + } + } + /* + * Now spew forth the iocc_cinfo structure + */ + fprintf(fp, "\nstruct iocc_ctlr iocccinit[] = {\n"); + fprintf(fp, "/*\t driver,\tctlr,\talive,\taddr,\tintpri */\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (dp->d_type != CONTROLLER) + continue; + if (mp == TO_NEXUS || mp == 0 || !eq(mp->d_name, "iocc")) + continue; + if (dp->d_unit == QUES && eq(dp->d_name,"hdc")) + continue; + if (dp->d_unit == QUES && eq(dp->d_name,"fdc")) + continue; + if (dp->d_pri == 0) { + printf("must specify priority for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_addr == 0) { + printf("must specify csr address for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_drive != UNKNOWN || dp->d_slave != UNKNOWN) { + printf("drives need their own entries; "); + printf("dont specify drive or slave for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_flags) { + printf("controllers (e.g. 
%s%d) don't have flags, ", + dp->d_name, dp->d_unit); + printf("only devices do\n"); + continue; + } + fprintf(fp, "\t{ &%sdriver,\t%d,\t0,\tC 0x%x,\t%d },\n", + dp->d_name, dp->d_unit, dp->d_addr, dp->d_pri); + } + fprintf(fp, "\t0\n};\n"); + /* + * Now we go for the iocc_device stuff + */ + fprintf(fp, "\nstruct iocc_device ioccdinit[] = {\n"); + fprintf(fp, +"\t/* driver, unit, ctlr, slave, addr, pri, dk, flags*/\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (dp->d_unit == QUES || dp->d_type != DEVICE || mp == 0 || + mp == TO_NEXUS || mp->d_type == MASTER || + eq(mp->d_name, "iocca")) + continue; + if (eq(mp->d_name, "iocc")) { + if (dp->d_pri == 0) { + printf("must specify vector for device %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_addr == 0) { + printf("must specify csr for device %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_drive != UNKNOWN || dp->d_slave != UNKNOWN) { + printf("drives/slaves can be specified only "); + printf("for controllers, not for device %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + slave = QUES; + } else { + if (mp->d_conn == 0) { + printf("%s%d isn't connected to anything, ", + mp->d_name, mp->d_unit); + printf("so %s%d is unattached\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_drive == UNKNOWN) { + printf("must specify ``drive number'' for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + /* NOTE THAT ON THE UNIBUS ``drive'' IS STORED IN */ + /* ``SLAVE'' AND WE DON'T WANT A SLAVE SPECIFIED */ + if (dp->d_slave != UNKNOWN) { + printf("slave numbers should be given only "); + printf("for massbus tapes, not for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_pri != 0) { + printf("interrupt priority should not be "); + printf("given for drive %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_addr != 0) { + printf("csr addresses should be given only"); + printf("on controllers, not on %s%d\n", + dp->d_name, 
dp->d_unit); + continue; + } + slave = dp->d_drive; + } + fprintf(fp, +"\t{ &%sdriver, %2d, %s, %2d, C 0x%x, %d, %d, 0x%x },\n", + eq(mp->d_name, "iocc") ? dp->d_name : mp->d_name, dp->d_unit, + eq(mp->d_name, "iocc") ? " -1" : qu(mp->d_unit), + slave, dp->d_addr, dp->d_pri, dp->d_dk, dp->d_flags); + } + fprintf(fp, "\t0\n};\n"); + (void) fclose(fp); +} + +#endif MACHINE_ROMP + +#if MACHINE_MMAX +void +mmax_ioconf(void) +{ + register struct device *dp, *dp1, *mp; + FILE *fp; + int unit; + + fp = fopen(path("ioconf.c"), "w"); + if (fp == 0) { + perror(path("ioconf.c")); + exit(1); + } + fprintf(fp, "#include \n\n"); + + /* + * Multimax code is a little messy because we have to + * scan the entire list for each device to generate the + * structures correctly. We cheat and use the d->d_pri + * field to avoid doing anything twice. -1000 is an obvious + * bogus value for this field. + */ + + for (dp1 = dtab; dp1 != 0; dp1 = dp1->d_next) { + /* + * If pri is not -1000, then haven't seen device yet. + */ + if (dp1->d_pri != -1000) switch (dp1->d_type) { + + case CONTROLLER: + fprintf(fp,"struct devaddr %s_devaddr[] = {\n", + dp1->d_name); + /* + * Now scan entire list and get all of them. Use + * unit to make sure unit numbers are right. 
+ */ + unit = 0; + for (dp = dp1; dp != 0; dp = dp->d_next) { + if (!strcmp(dp->d_name, dp1->d_name)) { + mp = dp->d_conn; + if (mp != TO_SLOT) { + printf("%s%d: controller must be connected to slot.\n", + dp->d_name, dp->d_unit); + exit(1); + } + if (dp->d_vec != 0) { + printf("%s%d: cannot configure multimax interrupt vectors.\n", + dp->d_name, dp->d_unit); + } + if (dp->d_pri != 0) { + printf("%s%d: interrupt priority is nonsense on multimax.\n", + dp->d_name, dp->d_unit); + } + if ((dp->d_drive != UNKNOWN) || + (dp->d_slave !=UNKNOWN)) { + printf("%s%d: don't specify drive or slave for controller.\n", + dp->d_name, dp->d_unit); + } + /* + * Fix unit number if bogus + */ + if(dp->d_unit != unit) { + printf("Warning: %s%d configured as %s%d -- fix config file.\n", + dp->d_name,dp->d_unit,dp->d_name,unit); + dp->d_unit = unit; + } + unit++; + fprintf(fp,"\t{ %d, 0, 0},\n",dp->d_addr); + dp->d_pri = -1000; /* done this one */ + } + } + fprintf(fp,"} ;\n\n"); + break; + + case DEVICE: + fprintf(fp,"struct subdevaddr %s_subdevaddr[] = {\n", + dp1->d_name); + /* + * Now scan entire list and get all of them. Use + * unit to make sure unit numbers are right. 
+ */ + unit = 0; + for (dp = dp1; dp != 0; dp = dp->d_next) { + if (!strcmp(dp->d_name, dp1->d_name)) { + mp = dp->d_conn; + if ( (mp == 0) || (mp == TO_SLOT) || + (mp->d_type != CONTROLLER)) { + printf("%s%d: device has no controller.\n", + dp->d_name, dp->d_unit); + exit(1); + } + if (dp->d_vec != 0) { + printf("%s%d: cannot configure multimax interrupt vectors.\n", + dp->d_name, dp->d_unit); + } + if (dp->d_pri != 0) { + printf("%s%d: interrupt priority is nonsense on multimax.\n", + dp->d_name, dp->d_unit); + } + if ((dp->d_drive != UNKNOWN) || + (dp->d_slave !=UNKNOWN)) { + printf("%s%d: use 'unit' instead of 'drive' or 'slave'.\n", + dp->d_name, dp->d_unit); + } + /* + * Fix unit number if bogus + */ + if(dp->d_unit != unit) { + printf("Warning: %s%d configured as %s%d -- fix config file.\n", + dp->d_name,dp->d_unit,dp->d_name,unit); + dp->d_unit = unit; + } + unit++; + if((dp->d_addr == 0) || (dp->d_addr == QUES)){ + printf("%s%d: must specify logical unit number.\n", + dp->d_name,dp->d_unit); + exit(1); + } + fprintf(fp,"\t{ %d, %d, 0},\n",mp->d_unit, + dp->d_addr); + dp->d_pri = -1000; /* don't do this again */ + } + } + fprintf(fp,"} ;\n\n"); + break; + + case PSEUDO_DEVICE: + /* + * Doesn't exist as far as ioconf.c is concerned. + */ + break; + + default: + printf("Bogus device type for %s\n", dp1->d_name); + exit(1); + break; + } + } + + (void) fclose(fp); +} + +#endif MACHINE_MMAX + +#if MACHINE_SQT + +/* + * Define prototype device spec lines. + * + * For now, have static set of controller prototypes. This should be + * upgraded to using (eg) controllers.balance (ala Sequent /etc/config) + * to support custom boards without need to edit this file. 
+ */ + +/* + * flags for indicating presence of upper and lower bound values + */ + +#define P_LB 1 +#define P_UB 2 + +struct p_entry { + const char *p_name; /* name of field */ + long p_def; /* default value */ + long p_lb; /* lower bound for field */ + long p_ub; /* upper bound of field */ + char p_flags; /* bound valid flags */ +}; + +struct proto { + const char *p_name; /* name of controller type */ + struct p_entry p_fields[NFIELDS]; /* ordered list of fields */ + int p_seen; /* any seen? */ +}; + +/* + * MULTIBUS Adapter: + * type mbad index csr flags maps[0,256] bin[0,7] intr[0,7] + */ + +static struct proto mbad_proto = { + "mbad", + {{ "index", 0, 0, 0, 0 }, + { "csr", 0, 0, 0, 0 }, + { "flags", 0, 0, 0, 0 }, + { "maps", 0, 0, 256, P_LB|P_UB }, + { "bin", 0, 0, 7, P_LB|P_UB }, + { "intr", 0, 0, 7, P_LB|P_UB },}, + 0 +}; + +/* + * SCSI/Ether Controller: + * type sec flags bin[0,7] req doneq index target[0,7]=-1 unit + */ + +static struct proto sec_proto = { + "sec", + {{ "flags", 0, 0, 0, 0 }, + { "bin", 0, 0, 7, P_LB|P_UB } , + { "req", 0, 0, 0, 0 }, + { "doneq", 0, 0, 0, 0 }, + { "index", 0, 0, 0, 0 }, + { "target", -1, 0, 7, P_LB|P_UB }, + { "unit", 0, 0, 0, 0 },}, + 0 +}; + +/* + * "Zeke" (FAST) Disk Controller (Dual-Channel Disk Controller): + * type zdc index[0,31] drive[-1,7] drive_type[-1,1] + * + * Levgal values for drive_type: + * M2333K = 0 (swallow) + * M2351A = 1 (eagle) + * wildcard = -1 (run-time determined) + */ + +static struct proto zdc_proto = { + "zdc", + {{ "index", 0, 0, 31, P_LB|P_UB }, + { "drive", 0, -1, 7, P_LB|P_UB }, + { "drive_type", 0, -1, 1, P_LB|P_UB },}, + 0 +}; + +static struct proto *ptab[] = { + &mbad_proto, + &sec_proto, + &zdc_proto, + (struct proto *) 0 +}; + +/* + * locate a prototype structure in the queue of such structures. + * return NULL if not found. 
+ */ + +static struct proto * +find_proto(const char *str) +{ + register struct proto *ptp; + register int ptbx; + + for (ptbx = 0; (ptp = ptab[ptbx]) != NULL; ptbx++) { + if (eq(str, ptp->p_name)) + return(ptp); + } + return(NULL); +} + +void +dev_param(struct device *dp, const char *str, long num) +{ + register struct p_entry *entry; + register struct proto *ptp; + + ptp = find_proto(dp->d_conn->d_name); + if (ptp == NULL) { + fprintf(stderr,"dev %s cont %s", dp->d_name, dp->d_conn->d_name); + yyerror("invalid controller"); + return; + } + + for (entry = ptp->p_fields; entry->p_name != NULL; entry++) { + if (eq(entry->p_name, str)) { + if ((entry->p_flags & P_LB) && (num < entry->p_lb)) { + yyerror("parameter below range"); + return; + } + if ((entry->p_flags & P_UB) && (num > entry->p_ub)) { + yyerror("parameter above range"); + return; + } + dp->d_fields[entry-ptp->p_fields] = num; + return; + } + } + + yyerror("invalid parameter"); +} + +void +sqt_ioconf(void) +{ + register struct device *dp, *mp; + register int count; + const char *namep; + register struct proto *ptp; + register struct p_entry *entry; + FILE *fp; + int bin_table[8]; + int ptbx; + int found; + + for (count = 0; count < 8; count++) + bin_table[count] = 0; + fp = fopen(path("ioconf.c"), "w"); + if (fp == NULL) { + perror(path("ioconf.c")); + exit(1); + } +/*MACH_KERNEL*/ + fprintf(fp, "#ifndef MACH_KERNEL\n"); +/*MACH_KERNEL*/ + fprintf(fp, "#include \n"); + fprintf(fp, "#include \n"); +/*MACH_KERNEL*/ + fprintf(fp, "#endif MACH_KERNEL\n"); +/*MACH_KERNEL*/ + fprintf(fp, "\n"); + fprintf(fp, "#include \n"); + + fprintf(fp, "\nu_long\tMBAd_IOwindow =\t\t3*256*1024;\t/* top 1/4 Meg */\n\n"); + + for (ptbx = 0; (ptp = ptab[ptbx]) != NULL; ptbx++) { + + fprintf(fp, "/*\n"); + fprintf(fp, " * %s device configuration.\n", ptp->p_name); + fprintf(fp, " */\n\n"); + fprintf(fp, "\n"); + fprintf(fp, "#include \n", ptp->p_name); + fprintf(fp, "\n"); + + /* + * Generate dev structures for this controller + 
*/ + for (dp = dtab, namep = NULL; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (mp == 0 || mp == TO_NEXUS || + !eq(mp->d_name, ptp->p_name) || + (namep != NULL && eq(dp->d_name, namep)) ) + continue; + fprintf(fp, "extern\tstruct\t%s_driver\t%s_driver;\n", + ptp->p_name, namep = dp->d_name); + ptp->p_seen = 1; + } + + found = 0; + for (dp = dtab, namep = NULL; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (mp == 0 || mp == TO_NEXUS || + !eq(mp->d_name, ptp->p_name)) + continue; + if (namep == NULL || !eq(namep, dp->d_name)) { + count = 0; + if (namep != NULL) + fprintf(fp, "};\n"); + found = 1; + fprintf(fp, "\nstruct\t%s_dev %s_%s[] = {\n", + ptp->p_name, + ptp->p_name, + namep = dp->d_name); + fprintf(fp, "/*"); + entry = ptp->p_fields; + for (; entry->p_name != NULL; entry++) + fprintf(fp, "\t%s",entry->p_name); + fprintf(fp, " */\n"); + } + if (dp->d_bin != UNKNOWN) + bin_table[dp->d_bin]++; + fprintf(fp, "{"); + for (entry = ptp->p_fields; entry->p_name != NULL; entry++) { + if (eq(entry->p_name,"index")) + fprintf(fp, "\t%d,", mp->d_unit); + else + fprintf(fp, "\t%lu,", + dp->d_fields[entry-ptp->p_fields]); + } + fprintf(fp, "\t},\t/* %s%d */\n", dp->d_name, count++); + } + if (found) + fprintf(fp, "};\n\n"); + + /* + * Generate conf array + */ + fprintf(fp, "/*\n"); + fprintf(fp, " * %s_conf array collects all %s devices\n", + ptp->p_name, ptp->p_name); + fprintf(fp, " */\n\n"); + fprintf(fp, "struct\t%s_conf %s_conf[] = {\n", + ptp->p_name, ptp->p_name); + fprintf(fp, "/*\tDriver\t\t#Entries\tDevices\t\t*/\n"); + for (dp = dtab, namep = NULL; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (mp == 0 || mp == TO_NEXUS || + !eq(mp->d_name, ptp->p_name)) + continue; + if (namep == NULL || !eq(namep, dp->d_name)) { + if (namep != NULL) + fprintf(fp, + "{\t&%s_driver,\t%d,\t\t%s_%s,\t},\t/* %s */\n", + namep, count, ptp->p_name, namep, namep); + count = 0; + namep = dp->d_name; + } + ++count; + } + if (namep != NULL) { + fprintf(fp, + 
"{\t&%s_driver,\t%d,\t\t%s_%s,\t},\t/* %s */\n", + namep, count, ptp->p_name, namep, namep); + } + fprintf(fp, "\t{ 0 },\n"); + fprintf(fp, "};\n\n"); + + } + + /* + * Pseudo's + */ + + fprintf(fp, "/*\n"); + fprintf(fp, " * Pseudo-device configuration\n"); + fprintf(fp, " */\n\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type == PSEUDO_DEVICE) { + fprintf(fp, "extern\tint\t%sboot();\n", dp->d_name); + } + } + fprintf(fp, "\nstruct\tpseudo_dev pseudo_dev[] = {\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type == PSEUDO_DEVICE) { + fprintf(fp, "\t{ \"%s\",\t%d,\t%sboot,\t},\n", + dp->d_name, + dp->d_slave == UNKNOWN ? 32 : dp->d_slave, + dp->d_name); + } + } + fprintf(fp, "\t{ 0 },\n"); + fprintf(fp, "};\n\n"); + + /* + * Bin interrupt table and misc + */ + + fprintf(fp, "/*\n"); + fprintf(fp, " * Interrupt table\n"); + fprintf(fp, " */\n\n"); + fprintf(fp, "int\tbin_intr[8] = {\n"); + fprintf(fp, "\t\t0,\t\t\t\t/* bin 0, always zero */\n"); + for (count=1; count < 8; count++) { + fprintf(fp, "\t\t%d,\t\t\t\t/* bin %d */\n", + bin_table[count], count); + } + fprintf(fp, "};\n"); + + /* + * b8k_cntlrs[] + */ + + fprintf(fp, "/*\n"); + fprintf(fp, " * b8k_cntlrs array collects all controller entries\n"); + fprintf(fp, " */\n\n"); + for (ptbx = 0; (ptp = ptab[ptbx]) != NULL; ptbx++) { + if (ptp->p_seen) + fprintf(fp, "extern int conf_%s(),\tprobe_%s_devices(),\t%s_map();\n", + ptp->p_name, ptp->p_name, ptp->p_name); + } + fprintf(fp, "\n\nstruct\tcntlrs b8k_cntlrs[] = {\n"); + fprintf(fp, "/*\tconf\t\tprobe_devs\t\tmap\t*/\n"); + + for (ptbx = 0; (ptp = ptab[ptbx]) != NULL; ptbx++) { + if (ptp->p_seen) + fprintf(fp, "{\tconf_%s,\tprobe_%s_devices,\t%s_map\t}, \n", + ptp->p_name, ptp->p_name, ptp->p_name); + } + fprintf(fp, "{\t0,\t},\n"); + fprintf(fp, "};\n"); + + (void) fclose(fp); +} + +#endif MACHINE_SQT +#if MACHINE_I386 +void +i386_ioconf(void) +{ + FILE *fp; + + unlink(path("ioconf.c")); + fp = fopen(path("ioconf.c"), "w"); + 
if (fp == 0) { + perror(path("ioconf.c")); + exit(1); + } + fprintf(fp, "#include \n"); + fprintf(fp, "\n"); + fprintf(fp, "#define C (void *)\n"); + fprintf(fp, "\n"); + + i386_pseudo_inits (fp); + (void) fclose(fp); +} +#endif MACHINE_I386 + +#if MACHINE_MIPSY || MACHINE_MIPS + +void declare(const char *cp); +int is_declared(const char *cp); + +void +mips_ioconf(void) +{ + register struct device *dp, *mp, *np; + register int slave; + FILE *fp; + char buf1[64], buf2[64]; + + unlink(path("ioconf.c")); + fp = fopen(path("ioconf.c"), "w"); + if (fp == 0) { + perror(path("ioconf.c")); + exit(1); + } +/*MACH_KERNEL*/ + fprintf(fp, "#ifndef MACH_KERNEL\n"); +/*MACH_KERNEL*/ + fprintf(fp, "#include \n"); + fprintf(fp, "#include \n"); + fprintf(fp, "#include \n"); + fprintf(fp, "#include \n"); +/*MACH_KERNEL*/ + fprintf(fp, "#endif MACH_KERNEL\n"); +/*MACH_KERNEL*/ + fprintf(fp, "\n"); + if (seen_mbii && seen_vme) { + printf("can't have both vme and mbii devices\n"); + exit(1); + } + if (seen_mbii) + fprintf(fp, "#include \n"); + if (seen_vme) + fprintf(fp, "#include \n"); + fprintf(fp, "\n"); + fprintf(fp, "#define C (caddr_t)\n"); + fprintf(fp, "#define NULL 0\n\n"); + if (!seen_mbii) + goto checkvme; + /* + * MBII stuff should go here + */ + +checkvme: + if (!seen_vme) + goto closefile; + /* + * Now generate interrupt vectors for the vme bus + */ + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_vec != 0) { + struct idlst *ip; + mp = dp->d_conn; + if (mp == 0 || mp == TO_NEXUS || !eq(mp->d_name, "vme")) + continue; + if (is_declared(dp->d_name)) + continue; + declare(dp->d_name); + fprintf(fp, "extern struct vme_driver %sdriver;\n", + dp->d_name); + fprintf(fp, "extern "); + ip = dp->d_vec; + for (;;) { + fprintf(fp, "%s()", ip->id); + ip = ip->id_next; + if (ip == 0) + break; + fprintf(fp, ", "); + } + fprintf(fp, ";\n"); + fprintf(fp, "int (*_%sint%d[])() = { ", dp->d_name, + dp->d_unit); + ip = dp->d_vec; + for (;;) { + fprintf(fp, "%s", ip->id); + ip = 
ip->id_next; + if (ip == 0) + break; + fprintf(fp, ", "); + } + fprintf(fp, ", 0 } ;\n\n"); + } + } + fprintf(fp, "\nstruct vme_ctlr vmminit[] = {\n"); + fprintf(fp, +" /* driver ctlr alive intr addr am */\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (dp->d_type != CONTROLLER || mp == TO_NEXUS || mp == 0 || + !eq(mp->d_name, "vme")) + continue; + if (dp->d_vec == 0) { + printf("must specify vector for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_addr == 0) { + printf("must specify csr address for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_addrmod == 0) { + printf("must specify address modifier for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_drive != UNKNOWN || dp->d_slave != UNKNOWN) { + printf("drives need their own entries; dont "); + printf("specify drive or slave for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_flags) { + printf("controllers (e.g. %s%d) ", + dp->d_name, dp->d_unit); + printf("don't have flags, only devices do\n"); + continue; + } + fprintf(fp, +" { %14s, %3d, 0, %11s, C 0x%08x, 0x%02x },\n", + concat3(buf1, "&", dp->d_name, "driver"), + dp->d_unit, + concat3(buf2, "_", dp->d_name, "int"), + dp->d_addr, + dp->d_addrmod); + } + fprintf(fp, " { NULL }\n};\n"); + /* + * vme devices + */ + fprintf(fp, "\nstruct vme_device vmdinit[] = {\n"); + fprintf(fp, +"/* driver unit ctlr slave intr addr am dk flags */\n" + ); + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (dp->d_unit == QUES || dp->d_type != DEVICE || mp == 0 || + mp == TO_NEXUS || mp->d_type == MASTER) + continue; + for (np = mp; np && np != TO_NEXUS; np = np->d_conn) + if (eq(np->d_name, "vme")) + break; + if (np != 0 && np != TO_NEXUS && !eq(np->d_name, "vme")) + continue; + np = 0; + if (eq(mp->d_name, "vme")) { + if (dp->d_vec == 0) { + printf("must specify vector for device %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_addr == 0) { + 
printf("must specify csr for device %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_addrmod == 0) { + printf( + "must specify address modifier for device %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_drive != UNKNOWN || dp->d_slave != UNKNOWN) { + printf("drives/slaves can be specified "); + printf("only for controllers, "); + printf("not for device %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + slave = QUES; + } else { + if ((np = mp->d_conn) == 0) { + printf("%s%d isn't connected to anything ", + mp->d_name, mp->d_unit); + printf(", so %s%d is unattached\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_drive == UNKNOWN) { + printf("must specify ``drive number'' "); + printf("for %s%d\n", dp->d_name, dp->d_unit); + continue; + } + if (dp->d_slave != UNKNOWN) { + printf("slave numbers should be given only "); + printf("for massbus tapes, not for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_vec != 0) { + printf("interrupt vectors should not be "); + printf("given for drive %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_addr != 0) { + printf("csr addresses should be given only "); + printf("on controllers, not on %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_addrmod != 0) { + printf("address modifiers should be given only "); + printf("on controllers, not on %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + slave = dp->d_drive; + } + fprintf(fp, +"{%14s, %3d, %3s, %4d,%10s, C 0x%08x, 0x%02x, %1d, 0x%08x },\n", + concat3(buf1, "&", + eq(mp->d_name, "vme") ? dp->d_name : mp->d_name, + "driver"), + dp->d_unit, + eq(mp->d_name, "vme") ? 
"-1" : qu(mp->d_unit), + slave, + intv2(dp), + dp->d_addr, + dp->d_addrmod, + dp->d_dk, + dp->d_flags); + } + fprintf(fp, "{ NULL }\n};\n"); +closefile: + (void) fclose(fp); +} + +char * +intv2(struct device *dev) +{ + static char buf[20]; + + if (dev->d_vec == 0) { + strcpy(buf, "NULL"); + } else { + (void) sprintf(buf, "_%sint", dev->d_name); + } + return (buf); +} + +char * +concat3(char *buf, const char *p1, const char *p2, const char *p3) +{ + (void) sprintf(buf, "%s%s%s", p1, p2, p3); + return (buf); +} + +#define MAXDEVS 100 +#define DEVLEN 10 +char decl_devices[MAXDEVS][DEVLEN]; + +void +declare(const char *cp) +{ + register int i; + + for (i = 0; i < MAXDEVS; i++) + if (decl_devices[i][0] == 0) { + strncpy(decl_devices[i], cp, DEVLEN); + return; + } + printf("device table full, fix mkioconf.c\n"); + exit(1); +} + +int +is_declared(const char *cp) +{ + register int i; + + for (i = 0; i < MAXDEVS; i++) { + if (decl_devices[i][0] == 0) + return(0); + if (strncmp(decl_devices[i], cp, DEVLEN) == 0) + return(1); + } + return(0); +} +#endif MACHINE_MIPSY || MACHINE_MIPS + +#if MACHINE_M68K +char *m68k_dn(const char *name); +void m68k_pseudo_inits(FILE *fp); + +void +m68k_ioconf(void) +{ + register struct device *dp, *mp; + register int slave; + FILE *fp; + + unlink(path("ioconf.c")); + fp = fopen(path("ioconf.c"), "w"); + if (fp == 0) { + perror(path("ioconf.c")); + exit(1); + } + fprintf(fp, "#include \n"); + fprintf(fp, "\n"); + fprintf(fp, "#define C (void *)\n"); + fprintf(fp, "\n"); + + /* + * Now generate interrupt vectors for the bus + */ + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (mp == TO_NEXUS || mp == 0 || mp->d_conn != TO_NEXUS) + continue; + fprintf(fp, "extern struct bus_driver %sdriver;\n", + dp->d_name); + } + + /* + * Now spew forth the bus_ctrl structures + */ + fprintf(fp, "\nstruct bus_ctrl bus_cinit[] = {\n"); + fprintf(fp, +" /* driver ctrl ipl address */\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = 
dp->d_conn; + if (dp->d_type != CONTROLLER || mp == TO_NEXUS || mp == 0 || + mp->d_conn != TO_NEXUS || dp->d_unit == QUES) + continue; + if (dp->d_addr == UNKNOWN) { + printf("must specify csr address for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_drive != UNKNOWN || dp->d_slave != UNKNOWN) { + printf("drives need their own entries; "); + printf("don't specify drive or slave for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_flags) { + printf("controllers (e.g. %s%d) don't have flags, ", + dp->d_name, dp->d_unit); + printf("only devices do\n"); + continue; + } + fprintf(fp, +" { %-12s, %5d, %4d, C 0x%08x },\n", + m68k_dn(dp->d_name), dp->d_unit, dp->d_pri, dp->d_addr); + } + fprintf(fp, " 0\n};\n"); + + /* + * Now we go for the bus_device stuff + */ + fprintf(fp, "\nstruct bus_device bus_dinit[] = {\n"); + fprintf(fp, +" /* driver unit ctrl slave ipl dk flags address name */\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (dp->d_unit == QUES || dp->d_type != DEVICE || mp == 0 || + mp == TO_NEXUS || mp->d_type == MASTER) + continue; + if (mp->d_conn == TO_NEXUS) { + if (dp->d_addr == UNKNOWN) { + printf("must specify csr for device %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_drive != UNKNOWN || dp->d_slave != UNKNOWN) { + printf("drives/slaves can be specified only "); + printf("for controllers, not for device %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + slave = UNKNOWN; + } else { + if (mp->d_conn == 0) { + printf("%s%d isn't connected to anything, ", + mp->d_name, mp->d_unit); + printf("so %s%d is unattached\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_drive == UNKNOWN) { + printf("must specify ``drive number'' for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + /* NOTE THAT ON THE UNIBUS ``drive'' IS STORED IN */ + /* ``SLAVE'' AND WE DON'T WANT A SLAVE SPECIFIED */ + if (dp->d_slave != UNKNOWN) { + printf("slave numbers should be given only "); + 
printf("for massbus tapes, not for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_pri != 0) { + printf("interrupt priority should not be "); + printf("given for drive %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_addr != 0) { + printf("csr addresses should be given only"); + printf(" on controllers, not on %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + slave = dp->d_drive; + } + fprintf(fp, +" { %-12s, %3d, %s, %s,%3d,%3d, %#10x, C 0x%08x, \"%s\" },\n", + m68k_dn(mp->d_conn == TO_NEXUS? dp->d_name : mp->d_name), + dp->d_unit, + mp->d_conn == TO_NEXUS? " -1" : qu(mp->d_unit), + qu(slave), + dp->d_pri, -dp->d_dk, dp->d_flags, + dp->d_addr == UNKNOWN? 0 : dp->d_addr, + dp->d_name); + } + fprintf(fp, " 0\n};\n"); + m68k_pseudo_inits (fp); + (void) fclose(fp); +} + +void +m68k_pseudo_inits(FILE *fp) +{ + register struct device *dp; + int count; + + fprintf(fp, "\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + fprintf(fp, "extern int %s(int);\n", dp->d_init); + } + fprintf(fp, "\nstruct pseudo_init pseudo_inits[] = {\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + count = dp->d_slave; + if (count <= 0) + count = 1; + fprintf(fp, "\t{%d,\t%s},\n", count, dp->d_init); + } + fprintf(fp, "\t{0,\t0},\n};\n"); +} + +void +i386_pseudo_inits(FILE *fp) +{ + register struct device *dp; + int count; + + fprintf(fp, "\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + fprintf(fp, "extern int %s(int);\n", dp->d_init); + } + fprintf(fp, "\nstruct pseudo_init pseudo_inits[] = {\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + count = dp->d_slave; + if (count <= 0) + count = 1; + fprintf(fp, "\t{%d,\t%s},\n", count, dp->d_init); + } + fprintf(fp, "\t{0,\t0},\n};\n"); +} 
+ +char * +m68k_dn(const char *name) +{ + sprintf(errbuf, "&%sdriver", name); return ns(errbuf); +} +#endif MACHINE_M68K + +#if MACHINE_M88K || MACHINE_M98K +char *nrw_dn(char *name); +void nrw_pseudo_inits(FILE *fp); + +void +nrw_ioconf(void) +{ + FILE *fp; + + unlink(path("ioconf.c")); + fp = fopen(path("ioconf.c"), "w"); + if (fp == 0) { + perror(path("ioconf.c")); + exit(1); + } + fprintf(fp, "#include \n"); + fprintf(fp, "\n"); + nrw_pseudo_inits (fp); + (void) fclose(fp); +} + +void +nrw_pseudo_inits(FILE *fp) +{ + register struct device *dp; + int count; + + fprintf(fp, "\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + fprintf(fp, "extern int %s(int);\n", dp->d_init); + } + fprintf(fp, "\nstruct pseudo_init pseudo_inits[] = {\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + count = dp->d_slave; + if (count <= 0) + count = 1; + fprintf(fp, "\t{%d,\t%s},\n", count, dp->d_init); + } + fprintf(fp, "\t{0,\t0},\n};\n"); +} + +char * +nrw_dn(char *name) +{ + sprintf(errbuf, "&%sdriver,", name); + return(errbuf); +} + +void +m88k_ioconf(void) +{ + nrw_ioconf(); +} + +void +m98k_ioconf(void) +{ + nrw_ioconf(); +} + +void +m88k_pseudo_inits(FILE *fp) +{ + nrw_pseudo_inits(fp); +} + +void +m98k_pseudo_inits(FILE *fp) +{ + nrw_pseudo_inits(fp); +} + +char * +m88k_dn(char *name) +{ + return(nrw_dn(name)); +} + +char * +m98k_dn(char *name) +{ + return(nrw_dn(name)); +} + + +#endif MACHINE_M88K || MACHINE_M98K + +#ifdef MACHINE_HPPA +char *hppa_dn(char *name); +void hppa_pseudo_inits(FILE *fp); + +void +hppa_ioconf(void) +{ + FILE *fp; + + unlink(path("ioconf.c")); + fp = fopen(path("ioconf.c"), "w"); + if (fp == 0) { + perror(path("ioconf.c")); + exit(1); + } + fprintf(fp, "#include \n"); + fprintf(fp, "\n"); + hppa_pseudo_inits (fp); + (void) fclose(fp); +} + +void +hppa_pseudo_inits(FILE *fp) +{ + register struct device *dp; + int 
count; + + fprintf(fp, "\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + fprintf(fp, "extern int %s(int);\n", dp->d_init); + } + fprintf(fp, "\nstruct pseudo_init pseudo_inits[] = {\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + count = dp->d_slave; + if (count <= 0) + count = 1; + fprintf(fp, "\t{%d,\t%s},\n", count, dp->d_init); + } + fprintf(fp, "\t{0,\t0},\n};\n"); +} + +char * +hppa_dn(char *name) +{ + sprintf(errbuf, "&%sdriver,", name); + + return (errbuf); +} + +#endif MACHINE_HPPA + +#ifdef MACHINE_SPARC +char *sparc_dn(char *name); +void sparc_pseudo_inits(FILE *fp); + +void +sparc_ioconf(void) +{ + FILE *fp; + + unlink(path("ioconf.c")); + fp = fopen(path("ioconf.c"), "w"); + if (fp == 0) { + perror(path("ioconf.c")); + exit(1); + } + fprintf(fp, "#include \n"); + fprintf(fp, "\n"); + sparc_pseudo_inits (fp); + (void) fclose(fp); +} + +void +sparc_pseudo_inits(FILE *fp) +{ + register struct device *dp; + int count; + + fprintf(fp, "\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + fprintf(fp, "extern int %s(int);\n", dp->d_init); + } + fprintf(fp, "\nstruct pseudo_init pseudo_inits[] = {\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + count = dp->d_slave; + if (count <= 0) + count = 1; + fprintf(fp, "\t{%d,\t%s},\n", count, dp->d_init); + } + fprintf(fp, "\t{0,\t0},\n};\n"); +} + +char * +sparc_dn(char *name) +{ + sprintf(errbuf, "&%sdriver,", name); + return (errbuf); +} + +#endif MACHINE_SPARC + +#ifdef MACHINE_PPC +char *ppc_dn(char *name); +void ppc_pseudo_inits(FILE *fp); + +void +ppc_ioconf(void) +{ + FILE *fp; + + unlink(path("ioconf.c")); + fp = fopen(path("ioconf.c"), "w"); + if (fp == 0) { + perror(path("ioconf.c")); + exit(1); + } + fprintf(fp, "#include \n"); + 
fprintf(fp, "\n"); + ppc_pseudo_inits (fp); + (void) fclose(fp); +} + +void +ppc_pseudo_inits(FILE *fp) +{ + register struct device *dp; + int count; + + fprintf(fp, "\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + fprintf(fp, "extern int %s(int);\n", dp->d_init); + } + fprintf(fp, "\nstruct pseudo_init pseudo_inits[] = {\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + count = dp->d_slave; + if (count <= 0) + count = 1; + fprintf(fp, "\t{%d,\t%s},\n", count, dp->d_init); + } + fprintf(fp, "\t{0,\t0},\n};\n"); +} + +char * +ppc_dn(name) + char *name; +{ + sprintf(errbuf, "&%sdriver,", name); + return (errbuf); +} + +#endif MACHINE_PPC + +#ifdef MACHINE_ARM +void arm_pseudo_inits(FILE *fp); + +void +arm_ioconf(void) +{ + FILE *fp; + + unlink(path("ioconf.c")); + fp = fopen(path("ioconf.c"), "w"); + if (fp == 0) { + perror(path("ioconf.c")); + exit(1); + } + fprintf(fp, "#include \n"); + fprintf(fp, "\n"); + arm_pseudo_inits (fp); + (void) fclose(fp); +} + +void +arm_pseudo_inits(FILE *fp) +{ + register struct device *dp; + int count; + + fprintf(fp, "\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + fprintf(fp, "extern int %s(int);\n", dp->d_init); + } + fprintf(fp, "\nstruct pseudo_init pseudo_inits[] = {\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + count = dp->d_slave; + if (count <= 0) + count = 1; + fprintf(fp, "\t{%d,\t%s},\n", count, dp->d_init); + } + fprintf(fp, "\t{0,\t0},\n};\n"); +} + +#endif /* MACHINE_ARM */ + +#ifdef MACHINE_X86_64 +void x86_64_pseudo_inits(FILE *fp); + +void +x86_64_ioconf(void) +{ + FILE *fp; + + unlink(path("ioconf.c")); + fp = fopen(path("ioconf.c"), "w"); + if (fp == 0) { + perror(path("ioconf.c")); + exit(1); + } + fprintf(fp, "#include \n"); + 
fprintf(fp, "\n"); + x86_64_pseudo_inits (fp); + (void) fclose(fp); +} + +void +x86_64_pseudo_inits(FILE *fp) +{ + register struct device *dp; + int count; + + fprintf(fp, "\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + fprintf(fp, "extern int %s(int);\n", dp->d_init); + } + fprintf(fp, "\nstruct pseudo_init pseudo_inits[] = {\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + count = dp->d_slave; + if (count <= 0) + count = 1; + fprintf(fp, "\t{%d,\t%s},\n", count, dp->d_init); + } + fprintf(fp, "\t{0,\t0},\n};\n"); +} + +#endif /* MACHINE_X86_64 */ + +char * +intv(struct device *dev) +{ + static char buf[20]; + + if (dev->d_vec == 0) { + strcpy(buf, " 0"); + } else { + (void) sprintf(buf, "%sint%d", dev->d_name, dev->d_unit); + } + return ns(buf); +} + +char * +qu(int num) +{ + + if (num == QUES) { + strcpy(errbuf, "'?'"); + } else if (num == UNKNOWN) { + strcpy(errbuf, " -1"); + } else { + (void) sprintf(errbuf, "%3d", num); + } + return ns(errbuf); +} diff --git a/SETUP/config/mkmakefile.c b/SETUP/config/mkmakefile.c new file mode 100644 index 000000000..6ac9aa099 --- /dev/null +++ b/SETUP/config/mkmakefile.c @@ -0,0 +1,1182 @@ +/* + * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * "Portions Copyright (c) 1999 Apple Computer, Inc. All Rights + * Reserved. This file contains Original Code and/or Modifications of + * Original Code as defined in and that are subject to the Apple Public + * Source License Version 1.0 (the 'License'). You may not use this file + * except in compliance with the License. Please obtain a copy of the + * License at http://www.apple.com/publicsource and read it before using + * this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License." + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* + * Mach Operating System + * Copyright (c) 1990 Carnegie-Mellon University + * Copyright (c) 1989 Carnegie-Mellon University + * Copyright (c) 1988 Carnegie-Mellon University + * Copyright (c) 1987 Carnegie-Mellon University + * All rights reserved. The CMU software License Agreement specifies + * the terms and conditions for use and redistribution. + */ + +/* + * Copyright (c) 1980 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms are permitted + * provided that the above copyright notice and this paragraph are + * duplicated in all such forms and that any documentation, + * advertising materials, and other materials related to such + * distribution and use acknowledge that the software was developed + * by the University of California, Berkeley. The name of the + * University may not be used to endorse or promote products derived + * from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + */ + +#ifndef lint +static char sccsid[] __attribute__((used)) = "@(#)mkmakefile.c 5.21 (Berkeley) 6/18/88"; +#endif /* not lint */ + +/* + * Build the makefile for the system, from + * the information in the files files and the + * additional files for the machine being compiled to. 
+ */ + +#include +#include /* for unlink */ +#include +#include "parser.h" +#include "config.h" + +void read_files(void); +void do_objs(FILE *fp, const char *msg, int ext); +void do_ordered(FILE *fp); +void do_files(FILE *fp, const char *msg, char ext); +void do_machdep(FILE *ofp); +void do_build(const char *name, void (*format)(FILE *)); +void do_rules(FILE *f); +void do_load(FILE *f); +struct file_list *do_systemspec(FILE *f, struct file_list *fl, int first); +void do_swapspec(FILE *f, const char *name, char *sysname); +void copy_dependencies(FILE *makin, FILE *makout); + +void build_cputypes(FILE *fp); +void build_confdep(FILE *fp); + +struct file_list *fl_lookup(char *file); +struct file_list *fltail_lookup(char *file); +struct file_list *new_fent(void); + +void put_source_file_name(FILE *fp, struct file_list *tp); + + +#define DO_SWAPFILE 0 + +#define next_word(fp, wd) \ + { register const char *word = get_word(fp); \ + if (word == (char *)EOF) \ + return; \ + else \ + wd = word; \ + } + +static struct file_list *fcur; +const char *tail(const char *fn); +char *allCaps(char *str); + +/* + * Lookup a file, by name. + */ +struct file_list * +fl_lookup(char *file) +{ + register struct file_list *fp; + + for (fp = ftab ; fp != 0; fp = fp->f_next) { + if (eq(fp->f_fn, file)) + return (fp); + } + return (0); +} + +/* + * Lookup a file, by final component name. 
+ */ +struct file_list * +fltail_lookup(char *file) +{ + register struct file_list *fp; + + for (fp = ftab ; fp != 0; fp = fp->f_next) { + if (eq(tail(fp->f_fn), tail(file))) + return (fp); + } + return (0); +} + +/* + * Make a new file list entry + */ +struct file_list * +new_fent(void) +{ + register struct file_list *fp; + + fp = (struct file_list *) malloc(sizeof *fp); + fp->f_needs = 0; + fp->f_next = 0; + fp->f_flags = 0; + fp->f_type = 0; + fp->f_extra = (char *) 0; + if (fcur == 0) + fcur = ftab = fp; + else + fcur->f_next = fp; + fcur = fp; + return (fp); +} + +char *COPTS; +static struct users { + int u_default; + int u_min; + int u_max; +} users[] = { + { 24, 2, 1024 }, /* MACHINE_VAX */ + { 8, 2, 32 }, /* MACHINE_SUN */ + { 16, 4, 32 }, /* MACHINE_ROMP */ + { 8, 2, 32 }, /* MACHINE_SUN2 */ + { 8, 2, 32 }, /* MACHINE_SUN3 */ + { 24, 8, 1024}, /* MACHINE_MMAX */ + { 32, 8, 1024}, /* MACHINE_SQT */ + { 8, 2, 32 }, /* MACHINE_SUN4 */ + { 2, 2, 1024 }, /* MACHINE_I386 */ + { 32, 8, 1024 }, /* MACHINE_IX */ + { 32, 8, 1024 }, /* MACHINE_MIPSY */ + { 32, 8, 1024 }, /* MACHINE_MIPS*/ + { 32, 8, 1024 }, /* MACHINE_I860*/ + { 8, 2, 32 }, /* MACHINE_M68K */ + { 8, 2, 32 }, /* MACHINE_M88K */ + { 8, 2, 32 }, /* MACHINE_M98K */ + { 8, 2, 32 }, /* MACHINE_HPPA */ + { 8, 2, 32 }, /* MACHINE_SPARC */ + { 8, 2, 32 }, /* MACHINE_PPC */ + { 8, 2, 32 }, /* MACHINE_ARM */ + { 8, 2, 32 }, /* MACHINE_X86_64 */ +}; +#define NUSERS (sizeof (users) / sizeof (users[0])) + +const char * +get_VPATH(void) +{ + static char *vpath = NULL; + + if ((vpath == NULL) && + ((vpath = getenv("VPATH")) != NULL) && + (*vpath != ':')) { + register char *buf = malloc((unsigned)(strlen(vpath) + 2)); + + vpath = strcat(strcpy(buf, ":"), vpath); + } + + return vpath ? 
vpath : ""; +} + + +/* + * Build the makefile from the skeleton + */ +void +makefile(void) +{ + FILE *ifp, *ofp; + FILE *dfp; + char pname[BUFSIZ]; + char line[BUFSIZ]; + struct opt *op; + struct users *up; + + read_files(); + (void) sprintf(line, "%s/Makefile.template", config_directory); + ifp = fopenp(VPATH, line, pname, "r"); + if (ifp == 0) { + perror(line); + exit(1); + } + dfp = fopen(path("Makefile"), "r"); + rename(path("Makefile"), path("Makefile.old")); + unlink(path("Makefile.old")); + unlink(path("M.d")); + if ((ofp = fopen(path("M.d"), "w")) == NULL) { + perror(path("M.d")); + /* We'll let this error go */ + } + else + fclose(ofp); + ofp = fopen(path("Makefile"), "w"); + if (ofp == 0) { + perror(path("Makefile")); + exit(1); + } + fprintf(ofp, "SOURCE_DIR=%s\n", source_directory); + + if (machine == MACHINE_SUN || machine == MACHINE_SUN2 + || machine == MACHINE_SUN3 || machine == MACHINE_SUN4) + fprintf(ofp, "IDENT=-D%s -D%s", machinename, allCaps(ident)); + else + fprintf(ofp, "IDENT=-D%s", allCaps(ident)); + if (profiling) + fprintf(ofp, " -DGPROF"); + if (cputype == 0) { + printf("cpu type must be specified\n"); + exit(1); + } + do_build("cputypes.h", build_cputypes); + + for (op = opt; op; op = op->op_next) + if (op->op_value) + fprintf(ofp, " -D%s=\"%s\"", op->op_name, op->op_value); + else + fprintf(ofp, " -D%s", op->op_name); + fprintf(ofp, "\n"); + if ((unsigned)machine > NUSERS) { + printf("maxusers config info isn't present, using vax\n"); + up = &users[MACHINE_VAX-1]; + } else + up = &users[machine-1]; + if (maxusers < up->u_min) { + maxusers = up->u_min; + } else if (maxusers > up->u_max) + printf("warning: maxusers > %d (%d)\n", up->u_max, maxusers); + if (maxusers) { + do_build("confdep.h", build_confdep); + } + for (op = mkopt; op; op = op->op_next) + if (op->op_value) + fprintf(ofp, "%s=%s\n", op->op_name, op->op_value); + else + fprintf(ofp, "%s\n", op->op_name); + + while (fgets(line, BUFSIZ, ifp) != 0) { + if (*line == '%') + goto 
percent; + if (profiling && strncmp(line, "COPTS=", 6) == 0) { + register char *cp; + if (machine != MACHINE_MMAX) + fprintf(ofp, + "GPROF.EX=$(SOURCE_DIR)/machdep/%s/gmon.ex\n", machinename); + cp = index(line, '\n'); + if (cp) + *cp = 0; + cp = line + 6; + while (*cp && (*cp == ' ' || *cp == '\t')) + cp++; + COPTS = malloc((unsigned)(strlen(cp) + 1)); + if (COPTS == 0) { + printf("config: out of memory\n"); + exit(1); + } + strcpy(COPTS, cp); + if (machine == MACHINE_MIPSY || machine == MACHINE_MIPS) { + fprintf(ofp, "%s ${CCPROFOPT}\n", line); + fprintf(ofp, "PCOPTS=%s\n", cp); + } else if (machine == MACHINE_MMAX) + fprintf(ofp, "%s -p\n",line); + else + fprintf(ofp, "%s -pg\n", line); + continue; + } + fprintf(ofp, "%s", line); + continue; + percent: + if (eq(line, "%OBJS\n")) { + do_objs(ofp, "OBJS=", -1); + } else if (eq(line, "%CFILES\n")) { + do_files(ofp, "CFILES=", 'c'); + do_objs(ofp, "COBJS=", 'c'); + } else if (eq(line, "%MFILES\n")) { + do_files(ofp, "MFILES=", 'm'); + do_objs(ofp, "MOBJS=", 'm'); + } else if (eq(line, "%SFILES\n")) { + do_files(ofp, "SFILES=", 's'); + do_objs(ofp, "SOBJS=", 's'); + } else if (eq(line, "%BFILES\n")) + do_files(ofp, "BFILES=", 'b'); + else if (eq(line, "%MACHDEP\n")) { + /* + * Move do_machdep() after the mkopt stuff. + */ + for (op = mkopt; op; op = op->op_next) + fprintf(ofp, "%s=%s\n", op->op_name, op->op_value); + do_machdep(ofp); + } else if (eq(line, "%ORDERED\n")) + do_ordered(ofp); + else if (eq(line, "%RULES\n")) + do_rules(ofp); + else if (eq(line, "%LOAD\n")) + do_load(ofp); + else + fprintf(stderr, + "Unknown %% construct in generic makefile: %s", + line); + } + if (dfp != NULL) + { + copy_dependencies(dfp, ofp); + (void) fclose(dfp); + } + (void) fclose(ifp); + (void) fclose(ofp); +} + +/* + * Read in the information about files used in making the system. + * Store it in the ftab linked list. 
+ */ +void +read_files(void) +{ + FILE *fp; + register struct file_list *tp, *pf; + register struct device *dp; + register struct opt *op; + const char *wd; + char *this, *needs; + const char *devorprof; + int options; + int not_option; + int ordered; + int sedit; /* SQT */ + char pname[BUFSIZ]; + char fname[1024]; + char *rest = (char *) 0; + struct cputype *cp; + int nreqs, first = 1, isdup; + + ftab = 0; + (void) sprintf(fname, "%s/files", config_directory); +openit: + fp = fopenp(VPATH, fname, pname, "r"); + if (fp == 0) { + perror(fname); + exit(1); + } +next: + options = 0; + rest = (char *) 0; + /* + * filename [ standard | optional ] + * [ dev* | profiling-routine ] [ device-driver] + */ + /* + * MACHINE_SQT ONLY: + * + * filename [ standard | optional ] + * [ ordered | sedit ] + * [ dev* | profiling-routine ] [ device-driver] + */ + wd = get_word(fp); + if (wd == (char *)EOF) { + (void) fclose(fp); + if (first == 1) { + (void) sprintf(fname, "%s/files.%s", config_directory, machinename); + first++; + goto openit; + } + if (first == 2) { + (void) sprintf(fname, "files.%s", allCaps(ident)); + first++; + fp = fopenp(VPATH, fname, pname, "r"); + if (fp != 0) + goto next; + } + return; + } + if (wd == 0) + goto next; + /* + * Allow comment lines beginning witha '#' character. 
+ */ + if (*wd == '#') + { + while ((wd=get_word(fp)) && wd != (char *)EOF) + ; + goto next; + } + + this = ns(wd); + next_word(fp, wd); + if (wd == 0) { + printf("%s: No type for %s.\n", + fname, this); + exit(1); + } + if ((pf = fl_lookup(this)) && (pf->f_type != INVISIBLE || pf->f_flags)) + isdup = 1; + else + isdup = 0; + tp = 0; + if (first == 3 && (tp = fltail_lookup(this)) != 0) + printf("%s: Local file %s overrides %s.\n", + fname, this, tp->f_fn); + nreqs = 0; + devorprof = ""; + ordered = 0; + sedit = 1; /* SQT: assume sedit for now */ + needs = 0; + if (eq(wd, "standard")) + goto checkdev; + if (!eq(wd, "optional")) { + printf("%s: %s must be optional or standard\n", fname, this); + exit(1); + } + if (strncmp(this, "OPTIONS/", 8) == 0) + options++; + not_option = 0; +nextopt: + next_word(fp, wd); + if (wd == 0) + goto doneopt; + if (eq(wd, "ordered")) { + ordered++; + goto nextopt; + } + if (machine == MACHINE_SQT && eq(wd, "sedit")) { + sedit++; + goto nextopt; + } + if (eq(wd, "not")) { + not_option = !not_option; + goto nextopt; + } + devorprof = wd; + if (eq(wd, "device-driver") || eq(wd, "profiling-routine")) { + next_word(fp, wd); + goto save; + } + nreqs++; + if (needs == 0 && nreqs == 1) + needs = ns(wd); + if (isdup) + goto invis; + if (options) + { + struct opt *lop = 0; + struct device tdev; + + /* + * Allocate a pseudo-device entry which we will insert into + * the device list below. The flags field is set non-zero to + * indicate an internal entry rather than one generated from + * the configuration file. The slave field is set to define + * the corresponding symbol as 0 should we fail to find the + * option in the option list. + */ + init_dev(&tdev); + tdev.d_name = ns(wd); + tdev.d_type = PSEUDO_DEVICE; + tdev.d_flags++; + tdev.d_slave = 0; + + for (op=opt; op; lop=op, op=op->op_next) + { + char *od = allCaps(ns(wd)); + + /* + * Found an option which matches the current device + * dependency identifier. 
Set the slave field to + * define the option in the header file. + */ + if (strcmp(op->op_name, od) == 0) + { + tdev.d_slave = 1; + if (lop == 0) + opt = op->op_next; + else + lop->op_next = op->op_next; + free(op); + op = 0; + } + free(od); + if (op == 0) + break; + } + newdev(&tdev); + } + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (eq(dp->d_name, wd) && (dp->d_type != PSEUDO_DEVICE || dp->d_slave)) { + if (not_option) + goto invis; /* dont want file if option present */ + else + goto nextopt; + } + } + if (not_option) + goto nextopt; /* want file if option missing */ + + for (op = opt; op != 0; op = op->op_next) + if (op->op_value == 0 && opteq(op->op_name, wd)) { + if (nreqs == 1) { + free(needs); + needs = 0; + } + goto nextopt; + } + + for (cp = cputype; cp; cp = cp->cpu_next) + if (opteq(cp->cpu_name, wd)) { + if (nreqs == 1) { + free(needs); + needs = 0; + } + goto nextopt; + } + +invis: + while ((wd = get_word(fp)) != 0) + ; + if (tp == 0) + tp = new_fent(); + tp->f_fn = this; + tp->f_type = INVISIBLE; + tp->f_needs = needs; + tp->f_flags = isdup; + goto next; + +doneopt: + if (nreqs == 0) { + printf("%s: what is %s optional on?\n", + fname, this); + exit(1); + } + +checkdev: + if (wd) { + if (*wd == '|') + goto getrest; + next_word(fp, wd); + if (wd) { + if (eq(wd, "ordered")) { + ordered++; + goto checkdev; + } + if (machine == MACHINE_SQT && eq(wd, "sedit")) { + sedit++; + goto checkdev; + } + devorprof = wd; + next_word(fp, wd); + } + } + +save: +getrest: + if (wd) { + if (*wd == '|') { + rest = ns(get_rest(fp)); + } else { + printf("%s: syntax error describing %s\n", + fname, this); + exit(1); + } + } + if (eq(devorprof, "profiling-routine") && profiling == 0) + goto next; + if (tp == 0) + tp = new_fent(); + tp->f_fn = this; + tp->f_extra = rest; + if (options) + tp->f_type = INVISIBLE; + else + if (eq(devorprof, "device-driver")) + tp->f_type = DRIVER; + else if (eq(devorprof, "profiling-routine")) + tp->f_type = PROFILING; + else + tp->f_type 
= NORMAL; + tp->f_flags = 0; + if (ordered) + tp->f_flags |= ORDERED; + if (sedit) /* SQT */ + tp->f_flags |= SEDIT; + tp->f_needs = needs; + if (pf && pf->f_type == INVISIBLE) + pf->f_flags = 1; /* mark as duplicate */ + goto next; +} + +int +opteq(const char *cp, const char *dp) +{ + char c, d; + + for (; ; cp++, dp++) { + if (*cp != *dp) { + c = isupper(*cp) ? tolower(*cp) : *cp; + d = isupper(*dp) ? tolower(*dp) : *dp; + if (c != d) + return (0); + } + if (*cp == 0) + return (1); + } +} + +void +put_source_file_name(FILE *fp, struct file_list *tp) +{ + if ((tp->f_fn[0] == '.') && (tp->f_fn[1] == '/')) + fprintf(fp, "%s ", tp->f_fn); + else + fprintf(fp, "$(SOURCE_DIR)/%s ", tp->f_fn); +} + +void +do_objs(FILE *fp, const char *msg, int ext) +{ + register struct file_list *tp; + register int lpos, len; + char *cp; + char och; + const char *sp; +#if DO_SWAPFILE + register struct file_list *fl; + char swapname[32]; +#endif DO_SWAPFILE + + fprintf(fp, "%s", msg); + lpos = strlen(msg); + for (tp = ftab; tp != 0; tp = tp->f_next) { + if (tp->f_type == INVISIBLE) + continue; + + /* + * Check for '.o' file in list + */ + cp = tp->f_fn + (len = strlen(tp->f_fn)) - 1; + if ((ext == -1 && tp->f_flags & ORDERED) || /* not in objs */ + (ext != -1 && *cp != ext)) + continue; + else if (*cp == 'o') { + if (len + lpos > 72) { + lpos = 8; + fprintf(fp, "\\\n\t"); + } + put_source_file_name(fp, tp); + fprintf(fp, " "); + lpos += len + 1; + continue; + } + sp = tail(tp->f_fn); +#if DO_SWAPFILE + for (fl = conf_list; fl; fl = fl->f_next) { + if (fl->f_type != SWAPSPEC) + continue; + (void) sprintf(swapname, "swap%s.c", fl->f_fn); + if (eq(sp, swapname)) + goto cont; + } +#endif DO_SWAPFILE + cp = (char *)sp + (len = strlen(sp)) - 1; + och = *cp; + *cp = 'o'; + if (len + lpos > 72) { + lpos = 8; + fprintf(fp, "\\\n\t"); + } + fprintf(fp, "%s ", sp); + lpos += len + 1; + *cp = och; +#if DO_SWAPFILE +cont: + ; +#endif DO_SWAPFILE + } + if (lpos != 8) + putc('\n', fp); +} + +/* not 
presently used and probably broken, use ORDERED instead */ +void +do_ordered(FILE *fp) +{ + register struct file_list *tp; + register int lpos, len; + char *cp; + char och; + const char *sp; + + fprintf(fp, "ORDERED="); + lpos = 10; + for (tp = ftab; tp != 0; tp = tp->f_next) { + if ((tp->f_flags & ORDERED) != ORDERED) + continue; + sp = tail(tp->f_fn); + cp = (char *)sp + (len = strlen(sp)) - 1; + och = *cp; + *cp = 'o'; + if (len + lpos > 72) { + lpos = 8; + fprintf(fp, "\\\n\t"); + } + fprintf(fp, "%s ", sp); + lpos += len + 1; + *cp = och; + } + if (lpos != 8) + putc('\n', fp); +} + +void +do_files(FILE *fp, const char *msg, char ext) +{ + register struct file_list *tp; + register int lpos, len=0; /* dvw: init to 0 */ + + fprintf(fp, "%s", msg); + lpos = 8; + for (tp = ftab; tp != 0; tp = tp->f_next) { + if (tp->f_type == INVISIBLE) + continue; + if (tp->f_fn[strlen(tp->f_fn)-1] != ext) + continue; + /* + * Always generate a newline. + * Our Makefile's aren't readable anyway. + */ + + lpos = 8; + fprintf(fp, "\\\n\t"); + put_source_file_name(fp, tp); + lpos += len + 1; + } + if (lpos != 8) + putc('\n', fp); +} + +/* + * Include machine dependent makefile in output + */ + +void +do_machdep(FILE *ofp) +{ + FILE *ifp; + char pname[BUFSIZ]; + char line[BUFSIZ]; + + (void) sprintf(line, "%s/Makefile.%s", config_directory, machinename); + ifp = fopenp(VPATH, line, pname, "r"); + if (ifp == 0) { + perror(line); + exit(1); + } + while (fgets(line, BUFSIZ, ifp) != 0) { + if (profiling && (strncmp(line, "LIBS=", 5) == 0)) + fprintf(ofp,"LIBS=${LIBS_P}\n"); + else + fputs(line, ofp); + } + fclose(ifp); +} + + +/* + * Format configuration dependent parameter file. + */ + +void +build_confdep(FILE *fp) +{ + fprintf(fp, "#define MAXUSERS %d\n", maxusers); +} + +/* + * Format cpu types file. 
+ */ + +void +build_cputypes(FILE *fp) +{ + struct cputype *cp; + + for (cp = cputype; cp; cp = cp->cpu_next) + fprintf(fp, "#define\t%s\t1\n", cp->cpu_name); +} + + + +/* + * Build a define parameter file. Create it first in a temporary location and + * determine if this new contents differs from the old before actually + * replacing the original (so as not to introduce avoidable extraneous + * compilations). + */ + +void +do_build(const char *name, void (*format)(FILE *)) +{ + static char temp[]="#config.tmp"; + FILE *tfp, *ofp; + int c; + + unlink(path(temp)); + tfp = fopen(path(temp), "w+"); + if (tfp == 0) { + perror(path(temp)); + exit(1); + } + unlink(path(temp)); + (*format)(tfp); + ofp = fopen(path(name), "r"); + if (ofp != 0) + { + fseek(tfp, 0, 0); + while ((c = fgetc(tfp)) != EOF) + if (fgetc(ofp) != c) + goto copy; + if (fgetc(ofp) == EOF) + goto same; + + } +copy: + if (ofp) + fclose(ofp); + unlink(path(name)); + ofp = fopen(path(name), "w"); + if (ofp == 0) { + perror(path(name)); + exit(1); + } + fseek(tfp, 0, 0); + while ((c = fgetc(tfp)) != EOF) + fputc(c, ofp); +same: + fclose(ofp); + fclose(tfp); +} + +const char * +tail(const char *fn) +{ + register const char *cp; + + cp = rindex(fn, '/'); + if (cp == 0) + return (fn); + return (cp+1); +} + +/* + * Create the makerules for each file + * which is part of the system. + * Devices are processed with the special c2 option -i + * which avoids any problem areas with i/o addressing + * (e.g. for the VAX); assembler files are processed by as. 
+ */ +void +do_rules(FILE *f) +{ + char *cp; + char *np, och; + const char *tp; + register struct file_list *ftp; + const char *extras = ""; /* dvw: init to "" */ + char *source_dir; + char och_upper; + const char *nl = ""; + + for (ftp = ftab; ftp != 0; ftp = ftp->f_next) { + if (ftp->f_type == INVISIBLE) + continue; + cp = (np = ftp->f_fn) + strlen(ftp->f_fn) - 1; + och = *cp; + /* + * Don't compile '.o' files + */ + if (och == 'o') + continue; + /* + * Determine where sources should come from + */ + if ((np[0] == '.') && (np[1] == '/')) { + source_dir = ""; + np += 2; + } else + source_dir = "$(SOURCE_DIR)/"; + *cp = '\0'; + tp = tail(np); /* dvw: init tp before 'if' */ + if (och == 'o') { + fprintf(f, "%so: %so\n\t${O_RULE_1A}%s%.*s${O_RULE_1B}\n\n", + tp, np, source_dir, (int)(tp-np), np); + continue; + } + fprintf(f, "%so: %s%s%c\n", tp, source_dir, np, och); + if (och == 's') { + switch (machine) { + case MACHINE_MIPSY: + case MACHINE_MIPS: + switch (ftp->f_type) { + case NORMAL: + case DRIVER: + fprintf(f, "\t@${RM} %so\n", tp); + fprintf(f, "\t${CC} ${CCASFLAGS}%s %s%s%ss\n\n", + (ftp->f_extra?ftp->f_extra:""), extras, source_dir, np); + break; + + case PROFILING: + if (!profiling) + continue; + fprintf(f, "\t@${RM} %so\n", tp); + fprintf(f, "\t${CC} ${CCPASFLAGS}%s %s%s%ss\n\n", + (ftp->f_extra?ftp->f_extra:""), extras, source_dir, np); + break; + + default: + printf("Don't know rules for %s.s\n", np); + break; + } + break; + default: + fprintf(f, "\t${S_RULE_1A}%s%.*s${S_RULE_1B}%s\n", + source_dir, (int)(tp-np), np, nl); + fprintf(f, "\t${S_RULE_2}%s\n", nl); + fprintf(f, "\t${S_RULE_3}\n\n"); + } + continue; + } + if (och == 'b') { + fprintf(f, "\t${B_RULE_1A}%s%.*s${B_RULE_1B}\n\n", + source_dir, (int)(tp-np), np); + continue; + } + extras = ""; + switch (ftp->f_type) { + + case NORMAL: + switch (machine) { + + case MACHINE_MIPSY: + case MACHINE_MIPS: + fprintf(f, "\t@${RM} %so\n", tp); + fprintf(f, "\t${CC} ${CCNFLAGS}%s %s%s%sc\n\n", + 
(ftp->f_extra?ftp->f_extra:""), extras, source_dir, np); + continue; + #if 0 + case MACHINE_SQT: + if (ftp->f_flags & SEDIT) { + fprintf(f, "\t${CC} -SO ${COPTS} %s%s%sc | \\\n", extras, source_dir, np); + fprintf(f, "\t${SEDCMD} | ${C2} | ${AS} ${CAFLAGS} -o %so\n\n", tp); + } else { + fprintf(f, "\t${CC} -c -O ${COPTS} %s%s%sc\n\n", + source_dir, extras, np); + } + break; + #endif 0 + default: + goto common; + } + break; + + case DRIVER: + switch (machine) { + + case MACHINE_MIPSY: + case MACHINE_MIPS: + fprintf(f, "\t@${RM} %so\n", tp); + fprintf(f, "\t${CC} ${CCDFLAGS}%s %s%s%sc\n\n", + (ftp->f_extra?ftp->f_extra:""), extras, source_dir, np); + continue; + default: + extras = "_D"; + goto common; + } + break; + + case PROFILING: + if (!profiling) + continue; + if (COPTS == 0) { + fprintf(stderr, + "config: COPTS undefined in generic makefile"); + COPTS = ""; + } + switch (machine) { + case MACHINE_MIPSY: + case MACHINE_MIPS: + fprintf(f, "\t@${RM} %so\n", tp); + fprintf(f, "\t${CC} ${CCPFLAGS}%s %s../%sc\n\n", + (ftp->f_extra?ftp->f_extra:""), extras, np); + continue; + case MACHINE_VAX: + case MACHINE_ROMP: + case MACHINE_SQT: + case MACHINE_MMAX: + case MACHINE_SUN3: + case MACHINE_SUN4: + case MACHINE_I386: + case MACHINE_I860: + case MACHINE_HPPA: + case MACHINE_SPARC: + case MACHINE_PPC: + case MACHINE_ARM: + case MACHINE_X86_64: + extras = "_P"; + goto common; + default: + fprintf(stderr, + "config: don't know how to profile kernel on this cpu\n"); + break; + } + + common: + och_upper = och + 'A' - 'a'; + fprintf(f, "\t${%c_RULE_1A%s}", och_upper, extras); + if (ftp->f_extra) + fprintf(f, "%s", ftp->f_extra); + fprintf(f, "%s%.*s${%c_RULE_1B%s}%s\n", + source_dir, (int)(tp-np), np, och_upper, extras, nl); + fprintf(f, "\t${%c_RULE_2%s}%s\n", och_upper, extras, nl); + fprintf(f, "\t${%c_RULE_3%s}%s\n", och_upper, extras, nl); + fprintf(f, "\t${%c_RULE_4%s}\n\n", och_upper, extras); + break; + + default: + printf("Don't know rules for %s\n", np); + break; + 
} + *cp = och; + } +} + +/* + * Create the load strings + */ +void +do_load(FILE *f) +{ + register struct file_list *fl; + int first = 1; + + fl = conf_list; + while (fl) { + if (fl->f_type != SYSTEMSPEC) { + fl = fl->f_next; + continue; + } + fl = do_systemspec(f, fl, first); + if (first) + first = 0; + } + fprintf(f, "LOAD ="); + for (fl = conf_list; fl != 0; fl = fl->f_next) + if (fl->f_type == SYSTEMSPEC) + fprintf(f, " %s", fl->f_needs); +#ifdef multimax + fprintf(f, "\n\nall .ORDER: includelinks ${LOAD}\n"); +#else multimax + fprintf(f, "\n\nall: includelinks ${LOAD}\n"); +#endif multimax + fprintf(f, "\n"); +} + +struct file_list * +do_systemspec(FILE *f, struct file_list *fl, __unused int first) +{ + /* + * Variable for kernel name. + */ + fprintf(f, "KERNEL_NAME=%s\n", fl->f_needs); + + fprintf(f, "%s .ORDER: %s.sys ${SYSDEPS}\n", + fl->f_needs, fl->f_needs); + fprintf(f, "\t${SYS_RULE_1}\n"); + fprintf(f, "\t${SYS_RULE_2}\n"); + fprintf(f, "\t${SYS_RULE_3}\n"); + fprintf(f, "\t${SYS_RULE_4}\n\n"); + do_swapspec(f, fl->f_fn, fl->f_needs); + for (fl = fl->f_next; fl != NULL && fl->f_type == SWAPSPEC; fl = fl->f_next) + continue; + return (fl); +} + +void +do_swapspec(__unused FILE *f, __unused const char *name, __unused char *sysname) +{ + +#if DO_SWAPFILE + char *gdir = eq(name, "generic")?"$(MACHINEDIR)/":""; + + fprintf(f, "%s.sys:${P} ${PRELDDEPS} ${LDOBJS} ${LDDEPS}\n\n", sysname); + fprintf(f, "%s.swap: swap%s.o\n", sysname, name); + fprintf(f, "\t@rm -f $@\n"); + fprintf(f, "\t@cp swap%s.o $@\n\n", name); + fprintf(f, "swap%s.o: %sswap%s.c ${SWAPDEPS}\n", name, gdir, name); + if (machine == MACHINE_MIPSY || machine == MACHINE_MIPS) { + fprintf(f, "\t@${RM} swap%s.o\n", name); + fprintf(f, "\t${CC} ${CCNFLAGS} %sswap%s.c\n\n", gdir, name); + } else { + fprintf(f, "\t${C_RULE_1A}%s${C_RULE_1B}\n", gdir); + fprintf(f, "\t${C_RULE_2}\n"); + fprintf(f, "\t${C_RULE_3}\n"); + fprintf(f, "\t${C_RULE_4}\n\n"); + } +#endif DO_SWAPFILE +} + +char * 
+allCaps(str) + register char *str; +{ + register char *cp = str; + + while (*str) { + if (islower(*str)) + *str = toupper(*str); + str++; + } + return (cp); +} + +#define OLDSALUTATION "# DO NOT DELETE THIS LINE" + +#define LINESIZE 1024 +static char makbuf[LINESIZE]; /* one line buffer for makefile */ + +void +copy_dependencies(FILE *makin, FILE *makout) +{ + register int oldlen = (sizeof OLDSALUTATION - 1); + + while (fgets(makbuf, LINESIZE, makin) != NULL) { + if (! strncmp(makbuf, OLDSALUTATION, oldlen)) + break; + } + while (fgets(makbuf, LINESIZE, makin) != NULL) { + if (oldlen != 0) + { + if (makbuf[0] == '\n') + continue; + else + oldlen = 0; + } + fputs(makbuf, makout); + } +} diff --git a/SETUP/config/mkswapconf.c b/SETUP/config/mkswapconf.c new file mode 100644 index 000000000..fdd14d722 --- /dev/null +++ b/SETUP/config/mkswapconf.c @@ -0,0 +1,247 @@ +/* + * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * "Portions Copyright (c) 1999 Apple Computer, Inc. All Rights + * Reserved. This file contains Original Code and/or Modifications of + * Original Code as defined in and that are subject to the Apple Public + * Source License Version 1.0 (the 'License'). You may not use this file + * except in compliance with the License. Please obtain a copy of the + * License at http://www.apple.com/publicsource and read it before using + * this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License." 
+ * + * @APPLE_LICENSE_HEADER_END@ + */ +/* + * Mach Operating System + * Copyright (c) 1990 Carnegie-Mellon University + * Copyright (c) 1989 Carnegie-Mellon University + * All rights reserved. The CMU software License Agreement specifies + * the terms and conditions for use and redistribution. + */ + +/* + * Copyright (c) 1980 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms are permitted + * provided that the above copyright notice and this paragraph are + * duplicated in all such forms and that any documentation, + * advertising materials, and other materials related to such + * distribution and use acknowledge that the software was developed + * by the University of California, Berkeley. The name of the + * University may not be used to endorse or promote products derived + * from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + */ + +#ifndef lint +static char sccsid[] __attribute__((used)) = "@(#)mkswapconf.c 5.6 (Berkeley) 6/18/88"; +#endif /* not lint */ + +/* + * Build a swap configuration file. 
+ */ +#include "config.h" + +#include +#include /* for unlink */ +#include + +struct file_list *do_swap(struct file_list *fl); +void initdevtable(void); + +void +swapconf(void) +{ + register struct file_list *fl; + + fl = conf_list; + while (fl) { + if (fl->f_type != SYSTEMSPEC) { + fl = fl->f_next; + continue; + } + fl = do_swap(fl); + } +} + +struct file_list * +do_swap(struct file_list *fl) +{ + FILE *fp; + char swapname[80]; + register struct file_list *swap; + dev_t dev; + + if (eq(fl->f_fn, "generic")) { + fl = fl->f_next; + return (fl->f_next); + } + if (machine == MACHINE_MMAX) { + printf("Error: Multimax must specify swap generic only.\n"); + exit(1); + } + (void) sprintf(swapname, "swap%s.c", fl->f_fn); + fp = fopen(path(swapname), "w"); + if (fp == 0) { + perror(path(swapname)); + exit(1); + } + fprintf(fp, "#include \n"); + fprintf(fp, "#include \n"); + fprintf(fp, "\n"); + /* + * If there aren't any swap devices + * specified, just return, the error + * has already been noted. 
+ */ + swap = fl->f_next; + if (swap == 0 || swap->f_type != SWAPSPEC) { + (void) unlink(path(swapname)); + fclose(fp); + return (swap); + } + fprintf(fp, "dev_t\trootdev = makedev(%d, %d);\n", + major(fl->f_rootdev), minor(fl->f_rootdev)); + fprintf(fp, "dev_t\targdev = makedev(%d, %d);\n", + major(fl->f_argdev), minor(fl->f_argdev)); + fprintf(fp, "dev_t\tdumpdev = makedev(%d, %d);\n", + major(fl->f_dumpdev), minor(fl->f_dumpdev)); + fprintf(fp, "\n"); + fprintf(fp, "struct\tswdevt swdevt[] = {\n"); + do { + dev = swap->f_swapdev; + fprintf(fp, "\t{ makedev(%d, %d),\t0,\t%d },\t/* %s */\n", + major(dev), minor(dev), swap->f_swapsize, swap->f_fn); + swap = swap->f_next; + } while (swap && swap->f_type == SWAPSPEC); + fprintf(fp, "\t{ 0, 0, 0 }\n"); + fprintf(fp, "};\n"); + if (machine == MACHINE_MIPSY || machine == MACHINE_MIPS) { + fprintf(fp, "\nsetconf()\n"); + fprintf(fp, "{\n"); + fprintf(fp, "\t/* resolve reference for non-generic kernels */\n"); + fprintf(fp, "}\n"); + } + fclose(fp); + return (swap); +} + +static int devtablenotread = 1; +static struct devdescription { + char *dev_name; + int dev_major; + struct devdescription *dev_next; +} *devtable; + +/* + * Given a device name specification figure out: + * major device number + * partition + * device name + * unit number + * This is a hack, but the system still thinks in + * terms of major/minor instead of string names. + */ +dev_t +nametodev(char *name, int defunit, char defpartition) +{ + char *cp, partition; + int unit; + register struct devdescription *dp; + + cp = name; + if (cp == 0) { + fprintf(stderr, "config: internal error, nametodev\n"); + exit(1); + } + while (*cp && !isdigit(*cp)) + cp++; + unit = *cp ? atoi(cp) : defunit; + if (unit < 0 || unit > 31) { + fprintf(stderr, +"config: %s: invalid device specification, unit out of range\n", name); + unit = defunit; /* carry on more checking */ + } + if (*cp) { + *cp++ = '\0'; + while (*cp && isdigit(*cp)) + cp++; + } + partition = *cp ? 
*cp : defpartition; + if (partition < 'a' || partition > 'h') { + fprintf(stderr, +"config: %c: invalid device specification, bad partition\n", *cp); + partition = defpartition; /* carry on */ + } + if (devtablenotread) + initdevtable(); + for (dp = devtable; dp->dev_next; dp = dp->dev_next) + if (eq(name, dp->dev_name)) + break; + if (dp == 0) { + fprintf(stderr, "config: %s: unknown device\n", name); + return (NODEV); + } + return (makedev(dp->dev_major, (unit << DEV_SHIFT) + (partition - 'a'))); +} + +char * +devtoname(dev_t dev) +{ + char buf[80]; + register struct devdescription *dp; + + if (devtablenotread) + initdevtable(); + for (dp = devtable; dp->dev_next; dp = dp->dev_next) + if (major(dev) == dp->dev_major) + break; + if (dp == 0) + dp = devtable; + (void) sprintf(buf, "%s%d%c", dp->dev_name, + minor(dev) >> DEV_SHIFT, (minor(dev) & DEV_MASK) + 'a'); + return (ns(buf)); +} + +void +initdevtable(void) +{ + char buf[BUFSIZ]; + char line[BUFSIZ]; + int maj; + register struct devdescription **dp = &devtable; + FILE *fp; + + (void) sprintf(buf, "%s/devices.%s", config_directory, machinename); + fp = fopenp(VPATH, buf, line, "r"); + if (fp == NULL) { + fprintf(stderr, "config: can't open %s\n", buf); + exit(1); + } + while (fgets(line, BUFSIZ, fp) != 0) { + if (*line == '#' || *line == '\n') + continue; + if (sscanf(line, "%s\t%d\n", buf, &maj) != 2) + break; + *dp = (struct devdescription *)malloc(sizeof (**dp)); + (*dp)->dev_name = ns(buf); + (*dp)->dev_major = maj; + dp = &(*dp)->dev_next; + } + *dp = 0; + fclose(fp); + devtablenotread = 0; +} diff --git a/SETUP/config/openp.c b/SETUP/config/openp.c new file mode 100644 index 000000000..c05cd9daf --- /dev/null +++ b/SETUP/config/openp.c @@ -0,0 +1,93 @@ +/* + * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * "Portions Copyright (c) 1999 Apple Computer, Inc. All Rights + * Reserved. 
This file contains Original Code and/or Modifications of + * Original Code as defined in and that are subject to the Apple Public + * Source License Version 1.0 (the 'License'). You may not use this file + * except in compliance with the License. Please obtain a copy of the + * License at http://www.apple.com/publicsource and read it before using + * this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License." + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* openp, fopenp -- search pathlist and open file + * + * Usage: + * i = openp (path,file,complete,flags,mode) + * f = fopenp (path,file,complete,type) + * int i,flags,mode; + * FILE *f; + * char *path,*file,*complete,*type; + * + * Openp searches for "file" in the pathlist "path"; + * when the file is found and can be opened by open() + * with the specified "flags" and "mode", then the full filename + * is copied into "complete" and openp returns the file + * descriptor. If no such file is found, openp returns -1. + * Fopenp performs the same function, using fopen() instead + * of open() and type instead of flags/mode; it returns 0 if no + * file is found. + * + * HISTORY + * 30-Apr-85 Steven Shafer (sas) at Carnegie-Mellon University + * Adapted for 4.2 BSD UNIX. Added new parameter to openp.c; + * changed names of flags, mode, and type parameters to reflect + * current manual entries for open and fopen. + * + * 20-Nov-79 Steven Shafer (sas) at Carnegie-Mellon University + * Created for VAX. 
+ * + */ + +#include +#include /* open */ +#include "config.h" + + +int openp(const char *fpath, char *file, char *complete, int flags, int mode); + +static int flgs,mod,value; +static const char *ftyp; +static FILE *fvalue; + +static int +func(char *fnam) +{ + value = open (fnam,flgs,mod); + return (value < 0); +} + +static int +ffunc(char *fnam) +{ + fvalue = fopen (fnam,ftyp); + return (fvalue == 0); +} + +int +openp(const char *fpath, char *file, char *complete, int flags, int mode) +{ + flgs = flags; + mod = mode; + if (searchp(fpath,file,complete,func) < 0) return (-1); + return (value); +} + +FILE * +fopenp(const char *fpath, char *file, char *complete, const char *ftype) +{ + ftyp = ftype; + if (searchp(fpath,file,complete,ffunc) < 0) return (0); + return (fvalue); +} diff --git a/SETUP/config/parser.y b/SETUP/config/parser.y new file mode 100644 index 000000000..4f77b93e4 --- /dev/null +++ b/SETUP/config/parser.y @@ -0,0 +1,1278 @@ +/* + * Mach Operating System + * Copyright (c) 1990 Carnegie-Mellon University + * Copyright (c) 1989 Carnegie-Mellon University + * Copyright (c) 1988 Carnegie-Mellon University + * Copyright (c) 1987 Carnegie-Mellon University + * All rights reserved. The CMU software License Agreement specifies + * the terms and conditions for use and redistribution. + */ + +/* + * Copyright (c) 1988 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms are permitted + * provided that the above copyright notice and this paragraph are + * duplicated in all such forms and that any documentation, + * advertising materials, and other materials related to such + * distribution and use acknowledge that the software was developed + * by the University of California, Berkeley. The name of the + * University may not be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + * + * @(#)config.y 5.8 (Berkeley) 6/18/88 + */ + +%union { + char *str; + int val; + struct file_list *file; + struct idlst *lst; +} + +%token ADDRMOD +%token AND +%token ANY +%token ARGS +%token AT +%token BIN +%token BUILDDIR +%token COMMA +%token CONFIG +%token CONFIGDIR +%token CONTROLLER +%token CPU +%token CSR +%token DEVICE +%token DISK +%token DRIVE +%token DST +%token DUMPS +%token EQUALS +%token FLAGS +%token HZ +%token IDENT +%token INIT +%token MACHINE +%token MAJOR +%token MASTER +%token MAXUSERS +%token MAXDSIZ +%token MBA +%token MBII +%token MINOR +%token MINUS +%token NEXUS +%token OBJECTDIR +%token ON +%token OPTIONS +%token MAKEOPTIONS +%token PRIORITY +%token PROFILE +%token PSEUDO_DEVICE +%token ROOT +%token SEMICOLON +%token SIZE +%token SLAVE +%token SOURCEDIR +%token SWAP +%token TIMEZONE +%token TRACE +%token UBA +%token VECTOR +%token VME +%token VME16D16 +%token VME24D16 +%token VME32D16 +%token VME16D32 +%token VME24D32 +%token VME32D32 + +/* following 3 are unique to CMU */ +%token LUN +%token SLOT +%token TAPE + +%token ID +%token NUMBER +%token FPNUMBER + +%type Save_id +%type Opt_value +%type Dev +%type Id_list +%type optional_size +%type device_name +%type major_minor +%type arg_device_spec +%type root_device_spec +%type dump_device_spec +%type swap_device_spec +%type Value + +%{ + +#include "config.h" +#include +#include + +struct device cur; +struct device *curp = 0; +char *temp_id; +char *val_id; +/* char *malloc(); */ + +int yylex(void); + +int finddev(dev_t dev); +int alreadychecked(dev_t dev, dev_t list[], dev_t *last); +void deverror(const char *systemname, const char *devtype); +void mkconf(char *sysname); +struct file_list *newswap(void); +void mkswap(struct file_list *syslist, struct file_list *fl, int size); +struct 
device *huhcon(const char *dev); +void check_nexus(struct device *dev, int num); +void check_slot(struct device *dev, int num); +void checksystemspec(struct file_list *fl); +void verifysystemspecs(void); +dev_t *verifyswap(struct file_list *fl, dev_t checked[], dev_t *pchecked); +struct device *dconnect(const char *dev, int num); + +%} +%% +Configuration: + Many_specs + { verifysystemspecs(); } + ; + +Many_specs: + Many_specs Spec + | + /* lambda */ + ; + +Spec: + Device_spec SEMICOLON + { newdev(&cur); } | + Config_spec SEMICOLON + | + TRACE SEMICOLON + { do_trace = !do_trace; } | + SEMICOLON + | + error SEMICOLON + ; + +Config_spec: + MACHINE Save_id + { + if (!strcmp($2, "vax")) { + machine = MACHINE_VAX; + machinename = "vax"; + } else if (!strcmp($2, "sun")) { + /* default to Sun 3 */ + machine = MACHINE_SUN3; + machinename = "sun3"; + } else if (!strcmp($2, "sun2")) { + machine = MACHINE_SUN2; + machinename = "sun2"; + } else if (!strcmp($2, "sun3")) { + machine = MACHINE_SUN3; + machinename = "sun3"; + } else if (!strcmp($2, "sun4")) { + machine = MACHINE_SUN4; + machinename = "sun4"; + } else if (!strcmp($2, "romp")) { + machine = MACHINE_ROMP; + machinename = "romp"; + } else if (!strcmp($2, "ca")) { + machine = MACHINE_ROMP; + machinename = "ca"; + } else if (!strcmp($2, "mmax")) { + machine = MACHINE_MMAX; + machinename = "mmax"; + } else if (!strcmp($2, "sqt")) { + machine = MACHINE_SQT; + machinename = "sqt"; + } else if (!strcmp($2, "i")) { + machine = MACHINE_I386; + machinename = "i386"; + } else if (!strcmp($2, "i386")) { + machine = MACHINE_I386; + machinename = "i386"; + } else if (!strcmp($2, "ix")) { + machine = MACHINE_IX; + machinename = "ix"; + } else if (!strcmp($2, "mipsy")) { + machine = MACHINE_MIPSY; + machinename = "mipsy"; + } else if (!strcmp($2, "mips")) { + machine = MACHINE_MIPS; + machinename = "mips"; + } else if (!strcmp($2, "i860")) { + machine = MACHINE_I860; + machinename = "i860"; + } else if (!strcmp($2, "m68k")) { + 
machine = MACHINE_M68K; + machinename = "m68k"; + } else if (!strcmp($2, "m88k")) { + machine = MACHINE_M88K; + machinename = "m88k"; + } else if (!strcmp($2, "m98k")) { + machine = MACHINE_M98K; + machinename = "m98k"; + } else if (!strcmp($2, "hppa")) { + machine = MACHINE_HPPA; + machinename = "hppa"; + } else if (!strcmp($2, "sparc")) { + machine = MACHINE_SPARC; + machinename = "sparc"; + } else if (!strcmp($2, "ppc")) { + machine = MACHINE_PPC; + machinename = "ppc"; + } else if (!strcmp($2, "arm")) { + machine = MACHINE_ARM; + machinename = "arm"; + } else if (!strcmp($2, "x86_64")) { + machine = MACHINE_X86_64; + machinename = "x86_64"; + } else + yyerror("Unknown machine type"); + } | + CPU Save_id + { + struct cputype *cp = + (struct cputype *)malloc(sizeof (struct cputype)); + cp->cpu_name = ns($2); + cp->cpu_next = cputype; + cputype = cp; + free(temp_id); + } | + OPTIONS Opt_list + | + MAKEOPTIONS Mkopt_list + | + IDENT ID + { ident = ns($2); } + | + System_spec + | + MAXUSERS NUMBER + { maxusers = $2; } + | + BUILDDIR Save_id + { build_directory = ns($2); } + | + CONFIGDIR Save_id + { config_directory = ns($2); } + | + OBJECTDIR Save_id + { object_directory = ns($2); } + | + SOURCEDIR Save_id + { source_directory = ns($2); } + | + PROFILE + { profiling++; } + ; + +System_spec: + System_id + { checksystemspec(*confp); } + | System_id System_parameter_list + { checksystemspec(*confp); } + ; + +System_id: + CONFIG Save_id + { mkconf($2); } + ; + +System_parameter_list: + System_parameter_list System_parameter + | System_parameter + ; + +System_parameter: + swap_spec + | root_spec + | dump_spec + | arg_spec + ; + +swap_spec: + SWAP optional_on swap_device_list + ; + +swap_device_list: + swap_device_list AND swap_device + | swap_device + ; + +swap_device: + swap_device_spec optional_size + { mkswap(*confp, $1, $2); } + ; + +swap_device_spec: + device_name + { + struct file_list *fl = newswap(); + + if (eq($1, "generic")) + fl->f_fn = $1; + else { + 
fl->f_swapdev = nametodev($1, 0, 'b'); + fl->f_fn = devtoname(fl->f_swapdev); + } + $$ = fl; + } + | major_minor + { + struct file_list *fl = newswap(); + + fl->f_swapdev = $1; + fl->f_fn = devtoname($1); + $$ = fl; + } + ; + +root_spec: + ROOT optional_on root_device_spec + { + struct file_list *fl = *confp; + + if (fl && fl->f_rootdev != NODEV) + yyerror("extraneous root device specification"); + else + fl->f_rootdev = $3; + } + ; + +root_device_spec: + device_name + { $$ = nametodev($1, 0, 'a'); } + | major_minor + ; + +dump_spec: + DUMPS optional_on dump_device_spec + { + struct file_list *fl = *confp; + + if (fl && fl->f_dumpdev != NODEV) + yyerror("extraneous dump device specification"); + else + fl->f_dumpdev = $3; + } + + ; + +dump_device_spec: + device_name + { $$ = nametodev($1, 0, 'b'); } + | major_minor + ; + +arg_spec: + ARGS optional_on arg_device_spec + { + struct file_list *fl = *confp; + + if (fl && fl->f_argdev != NODEV) + yyerror("extraneous arg device specification"); + else + fl->f_argdev = $3; + } + ; + +arg_device_spec: + device_name + { $$ = nametodev($1, 0, 'b'); } + | major_minor + ; + +major_minor: + MAJOR NUMBER MINOR NUMBER + { $$ = makedev($2, $4); } + ; + +optional_on: + ON + | /* empty */ + ; + +optional_size: + SIZE NUMBER + { $$ = $2; } + | /* empty */ + { $$ = 0; } + ; + +device_name: + Save_id + { $$ = $1; } + | Save_id NUMBER + { + char buf[80]; + + (void) sprintf(buf, "%s%d", $1, $2); + $$ = ns(buf); free($1); + } + | Save_id NUMBER ID + { + char buf[80]; + + (void) sprintf(buf, "%s%d%s", $1, $2, $3); + $$ = ns(buf); free($1); + } + ; + +Opt_list: + Opt_list COMMA Option + | + Option + ; + +Option: + Save_id + { + struct opt *op = (struct opt *)malloc(sizeof (struct opt)); + op->op_name = ns($1); + op->op_next = (struct opt *) 0; + op->op_value = 0; + if (opt == (struct opt *) 0) + opt = op; + else + opt_tail->op_next = op; + opt_tail = op; + free(temp_id); + } | + Save_id EQUALS Opt_value + { + struct opt *op = (struct opt 
*)malloc(sizeof (struct opt));
		op->op_name = ns($1);
		op->op_next = (struct opt *) 0;
		op->op_value = ns($3);
		if (opt == (struct opt *) 0)
			opt = op;
		else
			opt_tail->op_next = op;
		opt_tail = op;
		free(temp_id);
		if (val_id)
			free(val_id);
	      } ;

/* Value of an option: an identifier, a number (stored as text), or nothing. */
Opt_value:
	ID
	      { $$ = val_id = ns($1); } |
	NUMBER
	      { char nb[16];
		(void) sprintf(nb, "%u", $1);
		$$ = val_id = ns(nb);
	      } |
	/* lambda from MIPS -- WHY */
	      { $$ = val_id = ns(""); }
	      ;

/* An identifier copied with ns(); the copy is also remembered in temp_id. */
Save_id:
	ID
	      { $$ = temp_id = ns($1); }
	;

Mkopt_list:
	Mkopt_list COMMA Mkoption
		|
	Mkoption
		;

/*
 * One makeoptions entry.  Both alternatives append to the mkopt list
 * and advance mkopt_tail, in the same way Option maintains opt/opt_tail.
 */
Mkoption:
	Save_id
	      {
		struct opt *op = (struct opt *)malloc(sizeof (struct opt));
		op->op_name = ns($1);
		op->op_next = (struct opt *) 0;
		op->op_value = 0;
		/*
		 * Append instead of assigning mkopt directly: a plain
		 * assignment drops every earlier entry and leaves
		 * mkopt_tail unset for the next append.
		 */
		if (mkopt == (struct opt *) 0)
			mkopt = op;
		else
			mkopt_tail->op_next = op;
		mkopt_tail = op;
		free(temp_id);
	      } |
	Save_id EQUALS Opt_value
	      {
		struct opt *op = (struct opt *)malloc(sizeof (struct opt));
		op->op_name = ns($1);
		op->op_next = (struct opt *) 0;
		op->op_value = ns($3);
		if (mkopt == (struct opt *) 0)
			mkopt = op;
		else
			mkopt_tail->op_next = op;
		mkopt_tail = op;
		free(temp_id);
		if (val_id)
			free(val_id);
	      } ;

Dev:
	UBA
	      { $$ = ns("uba"); } |
	MBA
	      { $$ = ns("mba"); } |
	VME16D16
		{
		if (machine != MACHINE_SUN2 && machine != MACHINE_SUN3
		    && machine != MACHINE_SUN4)
			yyerror("wrong machine type for vme16d16");
		$$ = ns("vme16d16");
		} |
	VME24D16
		{
		if (machine != MACHINE_SUN2 && machine != MACHINE_SUN3
		    && machine != MACHINE_SUN4)
			yyerror("wrong machine type for vme24d16");
		$$ = ns("vme24d16");
		} |
	VME32D16
		{
		if (machine != MACHINE_SUN3 && machine != MACHINE_SUN4)

			yyerror("wrong machine type for vme32d16");
		$$ = ns("vme32d16");
		} |
	VME16D32
		{
		if (machine != MACHINE_SUN3 && machine != MACHINE_SUN4)
			yyerror("wrong machine type for vme16d32");
		$$ = ns("vme16d32");
		} |
	VME24D32
		{
		if (machine != MACHINE_SUN3 && machine != MACHINE_SUN4)
			yyerror("wrong machine type for vme24d32");
		$$ = ns("vme24d32");
		} |
	VME32D32
		{
		if
(machine != MACHINE_SUN3 && machine != MACHINE_SUN4) + yyerror("wrong machine type for vme32d32"); + $$ = ns("vme32d32"); + } | + VME + { + if (machine != MACHINE_MIPSY && machine != MACHINE_MIPS) + yyerror("wrong machine type for vme"); + $$ = ns("vme"); + } | + MBII + { + if (machine != MACHINE_MIPSY && machine != MACHINE_MIPS) + yyerror("wrong machine type for mbii"); + $$ = ns("mbii"); + } | + ID + { $$ = ns($1); } + ; + +Device_spec: + DEVICE Dev_name Dev_info Int_spec + { cur.d_type = DEVICE; } | + MASTER Dev_name Dev_info Int_spec + { cur.d_type = MASTER; } | + DISK Dev_name Dev_info Int_spec + { cur.d_dk = 1; cur.d_type = DEVICE; } | +/* TAPE rule is unique to CMU */ + TAPE Dev_name Dev_info Int_spec + { cur.d_type = DEVICE; } | + CONTROLLER Dev_name Dev_info Int_spec + { cur.d_type = CONTROLLER; } | + PSEUDO_DEVICE Init_dev Dev + { + cur.d_name = $3; + cur.d_type = PSEUDO_DEVICE; + } | + PSEUDO_DEVICE Init_dev Dev NUMBER + { + cur.d_name = $3; + cur.d_type = PSEUDO_DEVICE; + cur.d_slave = $4; + } | + PSEUDO_DEVICE Init_dev Dev INIT ID + { + cur.d_name = $3; + cur.d_type = PSEUDO_DEVICE; + cur.d_init = ns($5); + } | + PSEUDO_DEVICE Init_dev Dev NUMBER INIT ID + { + cur.d_name = $3; + cur.d_type = PSEUDO_DEVICE; + cur.d_slave = $4; + cur.d_init = ns($6); + }; + +Dev_name: + Init_dev Dev NUMBER + { + cur.d_name = $2; + if (eq($2, "mba")) + seen_mba = 1; + else if (eq($2, "uba")) + seen_uba = 1; + else if (eq($2, "mbii")) + seen_mbii = 1; + else if (eq($2, "vme")) + seen_vme = 1; + cur.d_unit = $3; + }; + +Init_dev: + /* lambda */ + { init_dev(&cur); }; + +Dev_info: + Con_info Info_list + | + /* lambda */ + ; + +Con_info: + AT Dev NUMBER + { + if (eq(cur.d_name, "mba") || eq(cur.d_name, "uba") + || eq(cur.d_name, "mbii") || eq(cur.d_name, "vme")) { + (void) sprintf(errbuf, + "%s must be connected to a nexus", cur.d_name); + yyerror(errbuf); + } + cur.d_conn = dconnect($2, $3); + if (machine == MACHINE_SQT) + dev_param(&cur, "index", cur.d_unit); + } | +/* AT 
SLOT NUMBER rule is unique to CMU */ + AT SLOT NUMBER + { + check_slot(&cur, $3); + cur.d_addr = $3; + cur.d_conn = TO_SLOT; + } | + AT NEXUS NUMBER + { check_nexus(&cur, $3); cur.d_conn = TO_NEXUS; }; + +Info_list: + Info_list Info + | + /* lambda */ + ; + +Info: + CSR NUMBER + { + cur.d_addr = $2; + if (machine == MACHINE_SQT) { + dev_param(&cur, "csr", $2); + } + } | + DRIVE NUMBER + { + cur.d_drive = $2; + if (machine == MACHINE_SQT) { + dev_param(&cur, "drive", $2); + } + } | + SLAVE NUMBER + { + if (cur.d_conn != 0 && cur.d_conn != TO_NEXUS && + cur.d_conn->d_type == MASTER) + cur.d_slave = $2; + else + yyerror("can't specify slave--not to master"); + } | +/* MIPS */ + ADDRMOD NUMBER + { cur.d_addrmod = $2; } | +/* LUN NUMBER rule is unique to CMU */ + LUN NUMBER + { + if ((cur.d_conn != 0) && (cur.d_conn != TO_SLOT) && + (cur.d_conn->d_type == CONTROLLER)) { + cur.d_addr = $2; + } + else { + yyerror("device requires controller card"); + } + } | + FLAGS NUMBER + { + cur.d_flags = $2; + if (machine == MACHINE_SQT) { + dev_param(&cur, "flags", $2); + } + } | + BIN NUMBER + { + if (machine != MACHINE_SQT) + yyerror("bin specification only valid on Sequent Balance"); + if ($2 < 1 || $2 > 7) + yyerror("bogus bin number"); + else { + cur.d_bin = $2; + dev_param(&cur, "bin", $2); + } + } | + Dev Value + { + if (machine != MACHINE_SQT) + yyerror("bad device spec"); + dev_param(&cur, $1, $2); + }; + +Value: + NUMBER + | + MINUS NUMBER + { $$ = -($2); } + ; + +Int_spec: + Vec_spec + { cur.d_pri = 0; } | + PRIORITY NUMBER + { cur.d_pri = $2; } | + PRIORITY NUMBER Vec_spec + { cur.d_pri = $2; } | + Vec_spec PRIORITY NUMBER + { cur.d_pri = $3; } | + /* lambda */ + ; + +Vec_spec: + VECTOR Id_list + { cur.d_vec = $2; }; + + +Id_list: + Save_id + { + struct idlst *a = (struct idlst *)malloc(sizeof(struct idlst)); + a->id = $1; a->id_next = 0; $$ = a; + a->id_vec = 0; + } | + Save_id Id_list + { + struct idlst *a = (struct idlst *)malloc(sizeof(struct idlst)); + a->id = $1; 
a->id_next = $2; $$ = a;
		a->id_vec = 0;
		} |
	Save_id NUMBER
		{
		struct idlst *a = (struct idlst *)malloc(sizeof(struct idlst));
		a->id_next = 0; a->id = $1; $$ = a;
		a->id_vec = $2;
		} |
	Save_id NUMBER Id_list
		{
		struct idlst *a = (struct idlst *)malloc(sizeof(struct idlst));
		a->id_next = $3; a->id = $1; $$ = a;
		a->id_vec = $2;
		};

%%

/*
 * Report a parse error on stderr, prefixed with the current input line.
 */
void
yyerror(const char *s)
{
	fprintf(stderr, "config: line %d: %s\n", yyline, s);
}

/*
 * malloc() that never returns NULL: prints a diagnostic and exits when
 * memory cannot be obtained.
 */
static void *
parser_alloc(size_t size)
{
	void *p = malloc(size);

	if (p == 0) {
		fprintf(stderr, "config: out of memory\n");
		exit(1);
	}
	return (p);
}

/*
 * return the passed string in a new space
 */
char *
ns(const char *str)
{
	char *cp;

	cp = parser_alloc(strlen(str) + 1);
	(void) strcpy(cp, str);
	return (cp);
}

/*
 * add a device to the list of devices
 *
 * A copy of *dp is appended to dtab and curp is left pointing at it.
 */
void
newdev(struct device *dp)
{
	register struct device *np;

	np = parser_alloc(sizeof *np);
	*np = *dp;
	if (curp == 0)
		dtab = np;
	else
		curp->d_next = np;
	curp = np;
	curp->d_next = 0;
}

/*
 * note that a configuration should be made
 *
 * Appends a SYSTEMSPEC entry for sysname (stored, not copied) to the
 * list reached through confp and leaves confp referring to the new entry.
 */
void
mkconf(char *sysname)
{
	register struct file_list *fl, **flp;

	fl = parser_alloc(sizeof *fl);
	fl->f_type = SYSTEMSPEC;
	fl->f_needs = sysname;
	fl->f_rootdev = NODEV;
	fl->f_argdev = NODEV;
	fl->f_dumpdev = NODEV;
	fl->f_fn = 0;
	fl->f_next = 0;
	for (flp = confp; *flp; flp = &(*flp)->f_next)
		;
	*flp = fl;
	confp = flp;
}

/*
 * Allocate an unlinked SWAPSPEC entry with no device, size or name.
 */
struct file_list *
newswap(void)
{
	struct file_list *fl = parser_alloc(sizeof (*fl));

	fl->f_type = SWAPSPEC;
	fl->f_next = 0;
	fl->f_swapdev = NODEV;
	fl->f_swapsize = 0;
	fl->f_needs = 0;
	fl->f_fn = 0;
	return (fl);
}

/*
 * Add a swap device to the system's configuration
 */
void
mkswap(struct file_list *syslist, struct file_list *fl, int size)
{
	register struct file_list **flp;

	if (syslist == 0 || syslist->f_type != SYSTEMSPEC) {
		yyerror("\"swap\" spec precedes \"config\" specification");
		return;
	}
	if (size < 0) {
		yyerror("illegal swap partition size");
		return;
	}
	/*
	 * Append swap description to the end of the list.
	 */
	flp = &syslist->f_next;
	for (; *flp && (*flp)->f_type == SWAPSPEC; flp = &(*flp)->f_next)
		;
	fl->f_next = *flp;
	*flp = fl;
	fl->f_swapsize = size;
	/*
	 * If first swap device for this system,
	 * set up f_fn field to insure swap
	 * files are created with unique names.
	 */
	if (syslist->f_fn)
		return;
	if (eq(fl->f_fn, "generic"))
		syslist->f_fn = ns(fl->f_fn);
	else
		syslist->f_fn = ns(syslist->f_needs);
}

/*
 * find the pointer to connect to the given device and number.
 * returns 0 if no such device and prints an error message
 */
struct device *
dconnect(const char *dev, int num)
{
	register struct device *dp;

	if (num == QUES)
		return (huhcon(dev));
	for (dp = dtab; dp != 0; dp = dp->d_next) {
		if ((num != dp->d_unit) || !eq(dev, dp->d_name))
			continue;
		if (dp->d_type != CONTROLLER && dp->d_type != MASTER) {
			(void) sprintf(errbuf,
			    "%s connected to non-controller", dev);
			yyerror(errbuf);
			return (0);
		}
		return (dp);
	}
	(void) sprintf(errbuf, "%s %d not defined", dev, num);
	yyerror(errbuf);
	return (0);
}

/*
 * connect to an unspecific thing
 */
struct device *
huhcon(const char *dev)
{
	register struct device *dp, *dcp;
	struct device rdev; /* only used if dp is NULL */
	int oldtype;

	memset(&rdev, 0, sizeof rdev);

	/*
	 * First make certain that there are some of these to wildcard on
	 */
	for (dp = dtab; dp != 0; dp = dp->d_next)
		if (eq(dp->d_name, dev))
			break;
	if (dp == 0) {
		(void) sprintf(errbuf, "no %s's to wildcard", dev);
		yyerror(errbuf);
		return (0);
	}
	oldtype = dp->d_type;
	dcp = dp->d_conn;
	/*
	 * Now see if there is already a wildcard entry for this device
	 * (e.g. Search for a "uba ?")
	 */
	for (; dp != 0; dp = dp->d_next)
		if (eq(dev, dp->d_name) && dp->d_unit == -1)
			break;
	/*
	 * If there isn't, make one because everything needs to be connected
	 * to something.
	 */
	if (dp == 0) {
		dp = &rdev;
		init_dev(dp);
		dp->d_unit = QUES;
		dp->d_name = ns(dev);
		dp->d_type = oldtype;
		newdev(dp);
		dp = curp;
		/*
		 * Connect it to the same thing that other similar things are
		 * connected to, but make sure it is a wildcard unit
		 * (e.g. up connected to sc ?, here we make connect sc? to a
		 * uba?). If other things like this are on the NEXUS or
		 * if they aren't connected to anything, then make the same
		 * connection, else call ourself to connect to another
		 * unspecific device.
		 */
		if (dcp == TO_NEXUS || dcp == 0)
			dp->d_conn = dcp;
		else
			dp->d_conn = dconnect(dcp->d_name, QUES);
	}
	return (dp);
}

/*
 * Reset *dp to the defaults used before a device specification is parsed.
 */
void
init_dev(struct device *dp)
{

	dp->d_name = "OHNO!!!";
	dp->d_type = DEVICE;
	dp->d_conn = 0;
	dp->d_vec = 0;
	dp->d_addr = dp->d_pri = dp->d_flags = dp->d_dk = 0;
	dp->d_slave = dp->d_drive = dp->d_unit = UNKNOWN;
	if (machine == MACHINE_SUN2 || machine == MACHINE_SUN3
	    || machine == MACHINE_SUN4){
		dp->d_addr = UNKNOWN;
		dp->d_mach = dp->d_bus = 0;
	}
	if (machine == MACHINE_MIPSY || machine == MACHINE_MIPS){
		dp->d_addrmod = 0;
	}
	dp->d_init = 0;
}

/*
 * make certain that this is a reasonable type of thing to connect to a nexus
 */
void
check_nexus(struct device *dev, int num)
{

	switch (machine) {

	case MACHINE_VAX:
		if (!eq(dev->d_name, "uba") && !eq(dev->d_name, "mba"))
			yyerror("only uba's and mba's should be connected to the nexus");
		if (num != QUES)
			yyerror("can't give specific nexus numbers");
		break;

	case MACHINE_SUN:
		if (!eq(dev->d_name, "mb"))
			yyerror("only mb's should be connected to the nexus");
		break;

	case MACHINE_ROMP:
		if (!eq(dev->d_name, "iocc"))
			yyerror("only iocc's should be connected to the nexus");
		break;
	case MACHINE_SUN2:
		if (!eq(dev->d_name, "virtual") &&
		    !eq(dev->d_name, "obmem") &&
		    !eq(dev->d_name, "obio") &&
		    !eq(dev->d_name, "mbmem") &&
		    !eq(dev->d_name, "mbio") &&
		    !eq(dev->d_name, "vme16d16") &&
!eq(dev->d_name, "vme24d16")) { + (void)sprintf(errbuf, + "unknown bus type `%s' for nexus connection on %s", + dev->d_name, machinename); + yyerror(errbuf); + } + + case MACHINE_MMAX: + yyerror("don't grok 'nexus' on mmax -- try 'slot'."); + break; + case MACHINE_SUN3: + case MACHINE_SUN4: + if (!eq(dev->d_name, "virtual") && + !eq(dev->d_name, "obmem") && + !eq(dev->d_name, "obio") && + !eq(dev->d_name, "mbmem") && + !eq(dev->d_name, "mbio") && + !eq(dev->d_name, "vme16d16") && + !eq(dev->d_name, "vme24d16") && + !eq(dev->d_name, "vme32d16") && + !eq(dev->d_name, "vme16d32") && + !eq(dev->d_name, "vme24d32") && + !eq(dev->d_name, "vme32d32")) { + (void)sprintf(errbuf, + "unknown bus type `%s' for nexus connection on %s", + dev->d_name, machinename); + yyerror(errbuf); + } + break; + case MACHINE_MIPSY: + case MACHINE_MIPS: + if (!eq(dev->d_name, "vme") && !eq(dev->d_name, "mbii")) + yyerror("only vme's and mbii's should be connected to the nexus"); + if (num != QUES) + yyerror("can't give specific nexus numbers"); + break; + } +} + +/* + * make certain that this is a reasonable type of thing to connect to a slot + */ + +void +check_slot(struct device *dev, int num) +{ + + switch (machine) { + + case MACHINE_MMAX: + if (!eq(dev->d_name, "emc")) + yyerror("only emc's plug into backplane slots."); + if (num == QUES) + yyerror("specific slot numbers must be given"); + break; + + case MACHINE_SQT: + if (!eq(dev->d_name, "mbad") && + !eq(dev->d_name, "zdc") && + !eq(dev->d_name, "sec")) { + (void)sprintf(errbuf, + "unknown bus type `%s' for slot on %s", + dev->d_name, machinename); + yyerror(errbuf); + } + break; + + default: + yyerror("don't grok 'slot' for this machine -- try 'nexus'."); + break; + } +} + +/* + * Check system specification and apply defaulting + * rules on root, argument, dump, and swap devices. 
+ */ +void +checksystemspec(struct file_list *fl) +{ + char buf[BUFSIZ]; + register struct file_list *swap; + int generic; + + if (fl == 0 || fl->f_type != SYSTEMSPEC) { + yyerror("internal error, bad system specification"); + exit(1); + } + swap = fl->f_next; + generic = swap && swap->f_type == SWAPSPEC && eq(swap->f_fn, "generic"); + if (fl->f_rootdev == NODEV && !generic) { + yyerror("no root device specified"); + exit(1); + } + /* + * Default swap area to be in 'b' partition of root's + * device. If root specified to be other than on 'a' + * partition, give warning, something probably amiss. + */ + if (swap == 0 || swap->f_type != SWAPSPEC) { + dev_t dev; + + swap = newswap(); + dev = fl->f_rootdev; + if (minor(dev) & DEV_MASK) { + (void) sprintf(buf, +"Warning, swap defaulted to 'b' partition with root on '%c' partition", + (minor(dev) & DEV_MASK) + 'a'); + yyerror(buf); + } + swap->f_swapdev = + makedev(major(dev), (minor(dev) &~ DEV_MASK) | ('b' - 'a')); + swap->f_fn = devtoname(swap->f_swapdev); + mkswap(fl, swap, 0); + } + /* + * Make sure a generic swap isn't specified, along with + * other stuff (user must really be confused). + */ + if (generic) { + if (fl->f_rootdev != NODEV) + yyerror("root device specified with generic swap"); + if (fl->f_argdev != NODEV) + yyerror("arg device specified with generic swap"); + if (fl->f_dumpdev != NODEV) + yyerror("dump device specified with generic swap"); + return; + } + /* + * Default argument device and check for oddball arrangements. + */ + if (fl->f_argdev == NODEV) + fl->f_argdev = swap->f_swapdev; + if (fl->f_argdev != swap->f_swapdev) + yyerror("Warning, arg device different than primary swap"); + /* + * Default dump device and warn if place is not a + * swap area or the argument device partition. 
+ */ + if (fl->f_dumpdev == NODEV) + fl->f_dumpdev = swap->f_swapdev; + if (fl->f_dumpdev != swap->f_swapdev && fl->f_dumpdev != fl->f_argdev) { + struct file_list *p = swap->f_next; + + for (; p && p->f_type == SWAPSPEC; p = p->f_next) + if (fl->f_dumpdev == p->f_swapdev) + return; + (void) sprintf(buf, "Warning, orphaned dump device, %s", + "do you know what you're doing"); + yyerror(buf); + } +} + +/* + * Verify all devices specified in the system specification + * are present in the device specifications. + */ +void +verifysystemspecs(void) +{ + register struct file_list *fl; + dev_t checked[50]; + register dev_t *pchecked = checked; + + for (fl = conf_list; fl; fl = fl->f_next) { + if (fl->f_type != SYSTEMSPEC) + continue; + if (!finddev(fl->f_rootdev)) + deverror(fl->f_needs, "root"); + *pchecked++ = fl->f_rootdev; + pchecked = verifyswap(fl->f_next, checked, pchecked); +#define samedev(dev1, dev2) \ + ((minor(dev1) &~ DEV_MASK) != (minor(dev2) &~ DEV_MASK)) + if (!alreadychecked(fl->f_dumpdev, checked, pchecked)) { + if (!finddev(fl->f_dumpdev)) + deverror(fl->f_needs, "dump"); + *pchecked++ = fl->f_dumpdev; + } + if (!alreadychecked(fl->f_argdev, checked, pchecked)) { + if (!finddev(fl->f_argdev)) + deverror(fl->f_needs, "arg"); + *pchecked++ = fl->f_argdev; + } + } +} + +/* + * Do as above, but for swap devices. + */ +dev_t * +verifyswap(struct file_list *fl, dev_t checked[], dev_t *pchecked) +{ + + for (;fl && fl->f_type == SWAPSPEC; fl = fl->f_next) { + if (eq(fl->f_fn, "generic")) + continue; + if (alreadychecked(fl->f_swapdev, checked, pchecked)) + continue; + if (!finddev(fl->f_swapdev)) + fprintf(stderr, + "config: swap device %s not configured", fl->f_fn); + *pchecked++ = fl->f_swapdev; + } + return (pchecked); +} + +/* + * Has a device already been checked + * for it's existence in the configuration? 
/*
 * Has a device already been checked
 * for its existence in the configuration?
 *
 * list is the first recorded device and last points one past the final
 * recorded device.  Returns 1 if some recorded entry satisfies samedev()
 * against dev, 0 otherwise.
 */
int
alreadychecked(dev_t dev, dev_t list[], dev_t *last)
{
	dev_t *entry = list;

	while (entry < last) {
		if (samedev(*entry, dev))
			return (1);
		entry++;
	}
	return (0);
}
+ * + * @APPLE_LICENSE_HEADER_END@ + */ +/* searchp -- search through pathlist for file + * + * Usage: p = searchp (path,file,fullname,func); + * int p; char *path, *file, *fullname; + * int (*func)(char *); + * + * Searchp will parse "path", a list of pathnames separated + * by colons, prepending each pathname to "file". The resulting + * filename will be passed to "func", a function provided by the + * user. This function must return zero if the search is + * successful (i.e. ended), and non-zero if the search must + * continue. If the function returns zero (success), then + * searching stops, the full filename is placed into "fullname", + * and searchp returns 0. If the pathnames are all unsuccessfully + * examined, then searchp returns -1. + * If "file" begins with a slash, it is assumed to be an + * absolute pathname and the "path" list is not used. Note + * that this rule is used by Bell's cc also; whereas Bell's + * sh uses the rule that any filename which CONTAINS a slash + * is assumed to be absolute. The execlp and execvp procedures + * also use this latter rule. In my opinion, this is bogosity. + * + * HISTORY + * 01-Apr-86 Rudy Nedved (ern) at Carnegie-Mellon University + * 4.1BSD system ignores trailing slashes. 4.2BSD does not. + * Therefore don't add a separating slash if there is a null + * filename. + * + * 23-Oct-82 Steven Shafer (sas) at Carnegie-Mellon University + * Fixed two bugs: (1) calling function as "func" instead of + * "(*func)", (2) omitting trailing null name implied by trailing + * colon in path. Latter bug fixed by introducing "lastchar" and + * changing final loop test to look for "*lastchar" instead of + * "*nextpath". + * + * 20-Nov-79 Steven Shafer (sas) at Carnegie-Mellon University + * Created for VAX. If you're thinking of using this, you probably + * should look at openp() and fopenp() (or the "want..." routines) + * instead.
/*
 * Try "file" under each directory of the colon-separated list "spath".
 *
 * Each candidate name is built in "fullname" and handed to "func".  The
 * search stops at the first candidate for which func returns zero, and 0
 * is returned with that name left in fullname.  If every candidate is
 * rejected, -1 is returned and fullname holds the last name tried.
 *
 * When file begins with '/', spath is ignored and file is tried as is.
 * An empty directory component (for example after a trailing ':') yields
 * the bare file name.  No bound is placed on what is written to fullname;
 * the caller must supply a large enough buffer.
 */
int
searchp(const char *spath, char *file, char *fullname, int (*func)(char *))
{
	const char *dir = (file[0] == '/') ? "" : spath;

	for (;;) {
		const char *src = dir;
		const char *stop;
		char *dst = fullname;
		int rc;

		/* Copy the current directory component, up to ':' or NUL. */
		for (; *src != '\0' && *src != ':'; src++)
			*dst++ = *src;

		/* A '/' goes only between a non-empty directory and a non-empty file. */
		if (src != dir && file[0] != '\0')
			*dst++ = '/';

		/* Remember the delimiter; a ':' means another component follows. */
		stop = src;
		dir = (*src != '\0') ? src + 1 : src;

		/* Append the file name and terminate the candidate. */
		for (src = file; *src != '\0'; src++)
			*dst++ = *src;
		*dst = '\0';

		rc = (*func)(fullname);
		if (rc == 0)
			return (0);
		if (*stop == '\0')
			return (-1);
	}
}
+ * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +#pragma mark Typedefs, Enums, Constants +/********************************************************************* +* Typedefs, Enums, Constants +*********************************************************************/ +typedef enum { + kErrorNone = 0, + kError, + kErrorFileAccess, + kErrorDiskFull, + kErrorDuplicate +} ToolError; + +#pragma mark Function Protos +/********************************************************************* +* Function Protos +*********************************************************************/ +__private_extern__ ToolError +readFile(const char *path, vm_offset_t * objAddr, vm_size_t * objSize); + +__private_extern__ ToolError +writeFile(int fd, const void * data, size_t length); + +extern char* __cxa_demangle (const char* mangled_name, + char* buf, + size_t* n, + int* status); + +#pragma mark Functions +/********************************************************************* 
+*********************************************************************/ +__private_extern__ ToolError +writeFile(int fd, const void * data, size_t length) +{ + ToolError err; + + if (length != (size_t)write(fd, data, length)) + err = kErrorDiskFull; + else + err = kErrorNone; + + if (kErrorNone != err) + perror("couldn't write output"); + + return( err ); +} + +/********************************************************************* +*********************************************************************/ +__private_extern__ ToolError +readFile(const char *path, vm_offset_t * objAddr, vm_size_t * objSize) +{ + ToolError err = kErrorFileAccess; + int fd; + struct stat stat_buf; + + *objAddr = 0; + *objSize = 0; + + do + { + if((fd = open(path, O_RDONLY)) == -1) + continue; + + if(fstat(fd, &stat_buf) == -1) + continue; + + if (0 == (stat_buf.st_mode & S_IFREG)) + continue; + + /* Don't try to map an empty file, it fails now due to conformance + * stuff (PR 4611502). + */ + if (0 == stat_buf.st_size) { + err = kErrorNone; + continue; + } + + *objSize = stat_buf.st_size; + + *objAddr = (vm_offset_t)mmap(NULL /* address */, *objSize, + PROT_READ|PROT_WRITE, MAP_FILE|MAP_PRIVATE /* flags */, + fd, 0 /* offset */); + + if ((void *)*objAddr == MAP_FAILED) { + *objAddr = 0; + *objSize = 0; + continue; + } + + err = kErrorNone; + + } while( false ); + + if (-1 != fd) + { + close(fd); + } + if (kErrorNone != err) + { + fprintf(stderr, "couldn't read %s: %s\n", path, strerror(errno)); + } + + return( err ); +} + + +enum { kExported = 0x00000001, kObsolete = 0x00000002 }; + +struct symbol { + char * name; + unsigned int name_len; + char * indirect; + unsigned int indirect_len; + unsigned int flags; + struct symbol * list; + unsigned int list_count; +}; + +static bool issymchar( char c ) +{ + return ((c > ' ') && (c <= '~') && (c != ':') && (c != '#')); +} + +static bool iswhitespace( char c ) +{ + return ((c == ' ') || (c == '\t')); +} + +/* + * Function for qsort for comparing 
symbol list names. + */ +static int +qsort_cmp(const void * _left, const void * _right) +{ + struct symbol * left = (struct symbol *) _left; + struct symbol * right = (struct symbol *) _right; + + return (strcmp(left->name, right->name)); +} + +/* + * Function for bsearch for finding a symbol name. + */ + +static int +bsearch_cmp( const void * _key, const void * _cmp) +{ + char * key = (char *)_key; + struct symbol * cmp = (struct symbol *) _cmp; + + return(strcmp(key, cmp->name)); +} + +struct bsearch_key +{ + char * name; + unsigned int name_len; +}; + +static int +bsearch_cmp_prefix( const void * _key, const void * _cmp) +{ + struct bsearch_key * key = (struct bsearch_key *)_key; + struct symbol * cmp = (struct symbol *) _cmp; + + return(strncmp(key->name, cmp->name, key->name_len)); +} + +static uint32_t +count_symbols(char * file, vm_size_t file_size) +{ + uint32_t nsyms = 0; + char * scan; + char * eol; + char * next; + + for (scan = file; true; scan = next) { + + eol = memchr(scan, '\n', file_size - (scan - file)); + if (eol == NULL) { + break; + } + next = eol + 1; + + /* Skip empty lines. + */ + if (eol == scan) { + continue; + } + + /* Skip comment lines. + */ + if (scan[0] == '#') { + continue; + } + + /* Scan past any non-symbol characters at the beginning of the line. */ + while ((scan < eol) && !issymchar(*scan)) { + scan++; + } + + /* No symbol on line? Move along. + */ + if (scan == eol) { + continue; + } + + /* Skip symbols starting with '.'. 
+ */ + if (scan[0] == '.') { + continue; + } + nsyms++; + } + + return nsyms; +} + +static uint32_t +store_symbols(char * file, vm_size_t file_size, struct symbol * symbols, uint32_t idx, uint32_t max_symbols) +{ + char * scan; + char * line; + char * eol; + char * next; + + uint32_t strtabsize; + + strtabsize = 0; + + for (scan = file, line = file; true; scan = next, line = next) { + + char * name = NULL; + char * name_term = NULL; + unsigned int name_len = 0; + char * indirect = NULL; + char * indirect_term = NULL; + unsigned int indirect_len = 0; + char * option = NULL; + char * option_term = NULL; + unsigned int option_len = 0; + char optionstr[256]; + boolean_t obsolete = 0; + + eol = memchr(scan, '\n', file_size - (scan - file)); + if (eol == NULL) { + break; + } + next = eol + 1; + + /* Skip empty lines. + */ + if (eol == scan) { + continue; + } + + *eol = '\0'; + + /* Skip comment lines. + */ + if (scan[0] == '#') { + continue; + } + + /* Scan past any non-symbol characters at the beginning of the line. */ + while ((scan < eol) && !issymchar(*scan)) { + scan++; + } + + /* No symbol on line? Move along. + */ + if (scan == eol) { + continue; + } + + /* Skip symbols starting with '.'. + */ + if (scan[0] == '.') { + continue; + } + + name = scan; + + /* Find the end of the symbol. + */ + while ((*scan != '\0') && issymchar(*scan)) { + scan++; + } + + /* Note char past end of symbol. + */ + name_term = scan; + + /* Stored length must include the terminating nul char. + */ + name_len = name_term - name + 1; + + /* Now look for an indirect. + */ + if (*scan != '\0') { + while ((*scan != '\0') && iswhitespace(*scan)) { + scan++; + } + if (*scan == ':') { + scan++; + while ((*scan != '\0') && iswhitespace(*scan)) { + scan++; + } + if (issymchar(*scan)) { + indirect = scan; + + /* Find the end of the symbol. + */ + while ((*scan != '\0') && issymchar(*scan)) { + scan++; + } + + /* Note char past end of symbol. 
+ */ + indirect_term = scan; + + /* Stored length must include the terminating nul char. + */ + indirect_len = indirect_term - indirect + 1; + + } else if (*scan == '\0') { + fprintf(stderr, "bad format in symbol line: %s\n", line); + exit(1); + } + } else if (*scan != '\0' && *scan != '-') { + fprintf(stderr, "bad format in symbol line: %s\n", line); + exit(1); + } + } + + /* Look for options. + */ + if (*scan != '\0') { + while ((*scan != '\0') && iswhitespace(*scan)) { + scan++; + } + + if (*scan == '-') { + scan++; + + if (isalpha(*scan)) { + option = scan; + + /* Find the end of the option. + */ + while ((*scan != '\0') && isalpha(*scan)) { + scan++; + } + + /* Note char past end of option. + */ + option_term = scan; + option_len = option_term - option; + + if (option_len >= sizeof(optionstr)) { + fprintf(stderr, "option too long in symbol line: %s\n", line); + exit(1); + } + memcpy(optionstr, option, option_len); + optionstr[option_len] = '\0'; + + /* Find the option. + */ + if (!strncmp(optionstr, "obsolete", option_len)) { + obsolete = TRUE; + } + + } else if (*scan == '\0') { + fprintf(stderr, "bad format in symbol line: %s\n", line); + exit(1); + } + + } + + } + + if(idx >= max_symbols) { + fprintf(stderr, "symbol[%d/%d] overflow: %s\n", idx, max_symbols, line); + exit(1); + } + + *name_term = '\0'; + if (indirect_term) { + *indirect_term = '\0'; + } + + symbols[idx].name = name; + symbols[idx].name_len = name_len; + symbols[idx].indirect = indirect; + symbols[idx].indirect_len = indirect_len; + symbols[idx].flags = (obsolete) ? 
kObsolete : 0; + + strtabsize += symbols[idx].name_len + symbols[idx].indirect_len; + idx++; + } + + return strtabsize; +} + +/********************************************************************* +*********************************************************************/ +int main(int argc, char * argv[]) +{ + ToolError err; + int i, fd; + const char * output_name = NULL; + uint32_t zero = 0, num_files = 0; + uint32_t filenum; + uint32_t strx, strtabsize, strtabpad; + struct symbol * import_symbols; + struct symbol * export_symbols; + uint32_t num_import_syms, num_export_syms; + uint32_t result_count, num_removed_syms; + uint32_t import_idx, export_idx; + const NXArchInfo * host_arch; + const NXArchInfo * target_arch; + boolean_t require_imports = true; + boolean_t diff = false; + + + struct file { + vm_offset_t mapped; + vm_size_t mapped_size; + uint32_t nsyms; + boolean_t import; + const char * path; + }; + struct file files[64]; + + host_arch = NXGetLocalArchInfo(); + target_arch = host_arch; + + for( i = 1; i < argc; i += 2) + { + boolean_t import; + + if (!strcmp("-sect", argv[i])) + { + require_imports = false; + i--; + continue; + } + if (!strcmp("-diff", argv[i])) + { + require_imports = false; + diff = true; + i--; + continue; + } + + if (i == (argc - 1)) + { + fprintf(stderr, "bad arguments: %s\n", argv[i]); + exit(1); + } + + if (!strcmp("-arch", argv[i])) + { + target_arch = NXGetArchInfoFromName(argv[i + 1]); + if (!target_arch) + { + fprintf(stderr, "unknown architecture name: %s\n", argv[i+1]); + exit(1); + } + continue; + } + if (!strcmp("-output", argv[i])) + { + output_name = argv[i+1]; + continue; + } + + if (!strcmp("-import", argv[i])) + import = true; + else if (!strcmp("-export", argv[i])) + import = false; + else + { + fprintf(stderr, "unknown option: %s\n", argv[i]); + exit(1); + } + + err = readFile(argv[i+1], &files[num_files].mapped, &files[num_files].mapped_size); + if (kErrorNone != err) + exit(1); + + if (files[num_files].mapped && 
files[num_files].mapped_size) + { + files[num_files].import = import; + files[num_files].path = argv[i+1]; + num_files++; + } + } + + if (!output_name) + { + fprintf(stderr, "no output file\n"); + exit(1); + } + + num_import_syms = 0; + num_export_syms = 0; + for (filenum = 0; filenum < num_files; filenum++) + { + files[filenum].nsyms = count_symbols((char *) files[filenum].mapped, files[filenum].mapped_size); + if (files[filenum].import) + num_import_syms += files[filenum].nsyms; + else + num_export_syms += files[filenum].nsyms; + } + if (!num_export_syms) + { + fprintf(stderr, "no export names\n"); + exit(1); + } + + import_symbols = calloc(num_import_syms, sizeof(struct symbol)); + export_symbols = calloc(num_export_syms, sizeof(struct symbol)); + + import_idx = 0; + export_idx = 0; + + for (filenum = 0; filenum < num_files; filenum++) + { + if (files[filenum].import) + { + store_symbols((char *) files[filenum].mapped, files[filenum].mapped_size, + import_symbols, import_idx, num_import_syms); + import_idx += files[filenum].nsyms; + } + else + { + store_symbols((char *) files[filenum].mapped, files[filenum].mapped_size, + export_symbols, export_idx, num_export_syms); + export_idx += files[filenum].nsyms; + } + if (false && !files[filenum].nsyms) + { + fprintf(stderr, "warning: file %s contains no names\n", files[filenum].path); + } + } + + + qsort(import_symbols, num_import_syms, sizeof(struct symbol), &qsort_cmp); + qsort(export_symbols, num_export_syms, sizeof(struct symbol), &qsort_cmp); + + result_count = 0; + num_removed_syms = 0; + strtabsize = 4; + if (num_import_syms) + { + for (export_idx = 0; export_idx < num_export_syms; export_idx++) + { + struct symbol * result; + char * name; + size_t len; + boolean_t wild; + + name = export_symbols[export_idx].indirect; + len = export_symbols[export_idx].indirect_len; + if (!name) + { + name = export_symbols[export_idx].name; + len = export_symbols[export_idx].name_len; + } + wild = ((len > 2) && ('*' == 
name[len-=2])); + if (wild) + { + struct bsearch_key key; + key.name = name; + key.name_len = len; + result = bsearch(&key, import_symbols, + num_import_syms, sizeof(struct symbol), &bsearch_cmp_prefix); + + if (result) + { + struct symbol * first; + struct symbol * last; + + strtabsize += (result->name_len + result->indirect_len); + + first = result; + while (--first >= &import_symbols[0]) + { + if (bsearch_cmp_prefix(&key, first)) + break; + strtabsize += (first->name_len + first->indirect_len); + } + first++; + + last = result; + while (++last < (&import_symbols[0] + num_import_syms)) + { + if (bsearch_cmp_prefix(&key, last)) + break; + strtabsize += (last->name_len + last->indirect_len); + } + result_count += last - first; + result = first; + export_symbols[export_idx].list = first; + export_symbols[export_idx].list_count = last - first; + export_symbols[export_idx].flags |= kExported; + } + } + else + result = bsearch(name, import_symbols, + num_import_syms, sizeof(struct symbol), &bsearch_cmp); + + if (!result && require_imports) + { + int status; + char * demangled_result = + __cxa_demangle(export_symbols[export_idx].name + 1, NULL, NULL, &status); + fprintf(stderr, "exported name not in import list: %s\n", + demangled_result ? 
demangled_result : export_symbols[export_idx].name); +// fprintf(stderr, " : %s\n", export_symbols[export_idx].name); + if (demangled_result) { + free(demangled_result); + } + num_removed_syms++; + } + if (diff) + { + if (!result) + result = &export_symbols[export_idx]; + else + result = NULL; + } + if (result && !wild) + { + export_symbols[export_idx].flags |= kExported; + strtabsize += (export_symbols[export_idx].name_len + export_symbols[export_idx].indirect_len); + result_count++; + export_symbols[export_idx].list = &export_symbols[export_idx]; + export_symbols[export_idx].list_count = 1; + } + } + } + strtabpad = (strtabsize + 3) & ~3; + + if (require_imports && num_removed_syms) + { + err = kError; + goto finish; + } + + fd = open(output_name, O_WRONLY|O_CREAT|O_TRUNC, 0755); + if (-1 == fd) + { + perror("couldn't write output"); + err = kErrorFileAccess; + goto finish; + } + + struct symtab_command symcmd; + struct uuid_command uuidcmd; + + symcmd.cmd = LC_SYMTAB; + symcmd.cmdsize = sizeof(symcmd); + symcmd.symoff = sizeof(symcmd) + sizeof(uuidcmd); + symcmd.nsyms = result_count; + symcmd.strsize = strtabpad; + + uuidcmd.cmd = LC_UUID; + uuidcmd.cmdsize = sizeof(uuidcmd); + uuid_generate(uuidcmd.uuid); + + if (CPU_ARCH_ABI64 & target_arch->cputype) + { + struct mach_header_64 hdr; + hdr.magic = MH_MAGIC_64; + hdr.cputype = target_arch->cputype; + hdr.cpusubtype = target_arch->cpusubtype; + hdr.filetype = MH_KEXT_BUNDLE; + hdr.ncmds = 2; + hdr.sizeofcmds = sizeof(symcmd) + sizeof(uuidcmd); + hdr.flags = MH_INCRLINK; + + symcmd.symoff += sizeof(hdr); + symcmd.stroff = result_count * sizeof(struct nlist_64) + + symcmd.symoff; + + if (target_arch->byteorder != host_arch->byteorder) + swap_mach_header_64(&hdr, target_arch->byteorder); + err = writeFile(fd, &hdr, sizeof(hdr)); + } + else + { + struct mach_header hdr; + hdr.magic = MH_MAGIC; + hdr.cputype = target_arch->cputype; + hdr.cpusubtype = target_arch->cpusubtype; + hdr.filetype = (target_arch->cputype == 
CPU_TYPE_I386) ? MH_OBJECT : MH_KEXT_BUNDLE; + hdr.ncmds = 2; + hdr.sizeofcmds = sizeof(symcmd) + sizeof(uuidcmd); + hdr.flags = MH_INCRLINK; + + symcmd.symoff += sizeof(hdr); + symcmd.stroff = result_count * sizeof(struct nlist) + + symcmd.symoff; + + if (target_arch->byteorder != host_arch->byteorder) + swap_mach_header(&hdr, target_arch->byteorder); + err = writeFile(fd, &hdr, sizeof(hdr)); + } + + if (kErrorNone != err) + goto finish; + + if (target_arch->byteorder != host_arch->byteorder) { + swap_symtab_command(&symcmd, target_arch->byteorder); + swap_uuid_command(&uuidcmd, target_arch->byteorder); + } + err = writeFile(fd, &symcmd, sizeof(symcmd)); + if (kErrorNone != err) + goto finish; + err = writeFile(fd, &uuidcmd, sizeof(uuidcmd)); + if (kErrorNone != err) + goto finish; + + strx = 4; + for (export_idx = 0; export_idx < num_export_syms; export_idx++) + { + if (!export_symbols[export_idx].name) + continue; + if (!(kExported & export_symbols[export_idx].flags)) + continue; + + if (export_idx + && export_symbols[export_idx - 1].name + && !strcmp(export_symbols[export_idx - 1].name, export_symbols[export_idx].name)) + { + fprintf(stderr, "duplicate export: %s\n", export_symbols[export_idx - 1].name); + err = kErrorDuplicate; + goto finish; + } + + for (import_idx = 0; import_idx < export_symbols[export_idx].list_count; import_idx++) + { + + if (export_symbols[export_idx].list != &export_symbols[export_idx]) + { + printf("wild: %s, %s\n", export_symbols[export_idx].name, + export_symbols[export_idx].list[import_idx].name); + } + if (CPU_ARCH_ABI64 & target_arch->cputype) + { + struct nlist_64 nl; + + nl.n_sect = 0; + nl.n_desc = 0; + nl.n_un.n_strx = strx; + strx += export_symbols[export_idx].list[import_idx].name_len; + + if (export_symbols[export_idx].flags & kObsolete) { + nl.n_desc |= N_DESC_DISCARDED; + } + + if (export_symbols[export_idx].list[import_idx].indirect) + { + nl.n_type = N_INDR | N_EXT; + nl.n_value = strx; + strx += 
export_symbols[export_idx].list[import_idx].indirect_len; + } + else + { + nl.n_type = N_UNDF | N_EXT; + nl.n_value = 0; + } + + if (target_arch->byteorder != host_arch->byteorder) + swap_nlist_64(&nl, 1, target_arch->byteorder); + + err = writeFile(fd, &nl, sizeof(nl)); + } + else + { + struct nlist nl; + + nl.n_sect = 0; + nl.n_desc = 0; + nl.n_un.n_strx = strx; + strx += export_symbols[export_idx].list[import_idx].name_len; + + if (export_symbols[export_idx].flags & kObsolete) { + nl.n_desc |= N_DESC_DISCARDED; + } + + if (export_symbols[export_idx].list[import_idx].indirect) + { + nl.n_type = N_INDR | N_EXT; + nl.n_value = strx; + strx += export_symbols[export_idx].list[import_idx].indirect_len; + } + else + { + nl.n_type = N_UNDF | N_EXT; + nl.n_value = 0; + } + + if (target_arch->byteorder != host_arch->byteorder) + swap_nlist(&nl, 1, target_arch->byteorder); + + err = writeFile(fd, &nl, sizeof(nl)); + } + } + + if (kErrorNone != err) + goto finish; + } + + strx = sizeof(uint32_t); + err = writeFile(fd, &zero, strx); + if (kErrorNone != err) + goto finish; + + for (export_idx = 0; export_idx < num_export_syms; export_idx++) + { + if (!export_symbols[export_idx].name) + continue; + + for (import_idx = 0; import_idx < export_symbols[export_idx].list_count; import_idx++) + { + err = writeFile(fd, export_symbols[export_idx].list[import_idx].name, + export_symbols[export_idx].list[import_idx].name_len); + if (kErrorNone != err) + goto finish; + if (export_symbols[export_idx].list[import_idx].indirect) + { + err = writeFile(fd, export_symbols[export_idx].list[import_idx].indirect, + export_symbols[export_idx].list[import_idx].indirect_len); + if (kErrorNone != err) + goto finish; + } + } + } + + err = writeFile(fd, &zero, strtabpad - strtabsize); + if (kErrorNone != err) + goto finish; + + close(fd); + + +finish: + for (filenum = 0; filenum < num_files; filenum++) { + // unmap file + if (files[filenum].mapped_size) + { + munmap((caddr_t)files[filenum].mapped, 
files[filenum].mapped_size); + files[filenum].mapped = 0; + files[filenum].mapped_size = 0; + } + + } + + if (kErrorNone != err) + { + if (output_name) + unlink(output_name); + exit(1); + } + else + exit(0); + return(0); +} + diff --git a/security/conf/tools/newvers/newvers.csh b/SETUP/newvers old mode 100644 new mode 100755 similarity index 100% rename from security/conf/tools/newvers/newvers.csh rename to SETUP/newvers diff --git a/SETUP/seed_objroot b/SETUP/seed_objroot deleted file mode 100755 index 6773e70e4..000000000 --- a/SETUP/seed_objroot +++ /dev/null @@ -1,133 +0,0 @@ -#!/bin/sh - -if [ ! $OBJROOT ] -then - echo "OBJROOT not defined" - exit 1 -fi - -if [ ! $PREBUILT_OBJROOT ] -then - PREBUILT_OBJROOT=/Prebuilt/$1/xnu/BUILD/obj -fi - -if [ ! -e $PREBUILT_OBJROOT ] -then - echo "$PREBUILT_OBJROOT doesn't exist" - exit 1 -else -if [ $# = 2 -a ! -e $PREBUILT_OBJROOT/$2 ] -then - echo "$PREBUILT_OBJROOT/$2 doesn't exist" - exit 1 -fi -if [ -e $PREBUILT_OBJROOT/BUILDING_SEED_OBJROOT ] -then - echo "Building $PREBUILT_OBJROOT, try later" - exit 1 -fi -fi - -cd $PREBUILT_OBJROOT - -if [ $# = 1 ] -then - -if [ ! -e $OBJROOT ] -then -mkdir -p $OBJROOT -echo "Copying $PREBUILT_OBJROOT in $OBJROOT" -pax -rw . $OBJROOT -else -echo "Remove $OBJROOT before calling seed_objroot" -exit 1 -fi - -else - -if [ ! -e $OBJROOT/$2 ] -then -mkdir -p $OBJROOT/$2 -echo "Copying $PREBUILT_OBJROOT/$2 in $OBJROOT/$2" -pax -rw $2 $OBJROOT -RELEASE_OBJ=`echo $2 | sed 's/DEBUG/RELEASE/'` -if [ $1 != $RELEASE_OBJ -a ! -e $OBJROOT/$RELEASE_OBJ ] -then -mkdir -p $OBJROOT/$RELEASE_OBJ -echo "Copying $PREBUILT_OBJROOT/$RELEASE_OBJ in $OBJROOT/$RELEASE_OBJ" -pax -rw $RELEASE_OBJ $OBJROOT -fi - -else -echo "remove $OBJROOT/$2 before calling seed_objroot" -exit 1 -fi - -fi - -if [ ! 
-e $OBJROOT/EXPORT_HDRS ] -then -echo "Copying $PREBUILT_OBJROOT/EXPORT_HDRS in $OBJROOT/EXPORT_HDRS" -mkdir -p $OBJROOT/EXPORT_HDRS -pax -rw EXPORT_HDRS $OBJROOT -fi - -cd $OBJROOT -if [ -e RELEASE_PPC/osfmk/RELEASE/config.RELEASE_PPC ] -then -PREV_OBJROOT=`grep objectdir RELEASE_PPC/osfmk/RELEASE/config.RELEASE_PPC | cut -f 2 -d\" | - sed 's|/RELEASE_PPC/osfmk/RELEASE||'` -fi -if [ -z $PREV_OBJROOT -a -e DEBUG_PPC/osfmk/DEBUG/config.DEBUG_PPC ] -then - PREV_OBJROOT=`grep objectdir DEBUG_PPC/osfmk/DEBUG/config.DEBUG_PPC | cut -f 2 -d\" | - sed 's|/DEBUG_PPC/osfmk/DEBUG||'` -fi -if [ -z $PREV_OBJROOT -a -e RELEASE_I386/osfmk/RELEASE/config.RELEASE_I386 ] -then - PREV_OBJROOT=`grep objectdir RELEASE_I386/osfmk/RELEASE/config.RELEASE_I386 | cut -f 2 -d\" | - sed 's|/RELEASE_I386/osfmk/RELEASE||'` -fi -if [ -z $PREV_OBJROOT -a -e DEBUG_I386/osfmk/DEBUG/config.DEBUG_I386 ] -then - PREV_OBJROOT=`grep objectdir DEBUG_I386/osfmk/DEBUG/config.DEBUG_I386 | cut -f 2 -d\" | - sed 's|/DEBUG_I386/osfmk/DEBUG||'` -fi -if [ -z $PREV_OBJROOT ] -then - echo "PREV_OBJROOT not found" - exit 1 -fi - -if [ -e RELEASE_PPC/osfmk/RELEASE/config.RELEASE_PPC ] -then -PREV_SRCROOT=`grep sourcedir RELEASE_PPC/osfmk/RELEASE/config.RELEASE_PPC | cut -f 2 -d\"` -fi -if [ -z $PREV_SRCROOT -a -e DEBUG_PPC/osfmk/DEBUG/config.DEBUG_PPC ] -then - PREV_SRCROOT=`grep sourcedir DEBUG_PPC/osfmk/DEBUG/config.DEBUG_PPC | cut -f 2 -d\"` -fi -if [ -z $PREV_SRCROOT -a -e RELEASE_I386/osfmk/RELEASE/config.RELEASE_I386 ] -then -PREV_SRCROOT=`grep sourcedir RELEASE_I386/osfmk/RELEASE/config.RELEASE_I386 | cut -f 2 -d\"` -fi -if [ -z $PREV_SRCROOT -a -e DEBUG_I386/osfmk/DEBUG/config.DEBUG_I386 ] -then - PREV_SRCROOT=`grep sourcedir DEBUG_I386/osfmk/DEBUG/config.DEBUG_I386 | cut -f 2 -d\"` -fi -if [ -z $PREV_SRCROOT ] -then - echo "PREV_SRCROOT not found" - exit 1 -fi - -echo "s|$PREV_OBJROOT|$OBJROOT|" > prebuild.sed -echo "s|$PREV_SRCROOT|$SRCROOT|" >>prebuild.sed - -for i in `find . 
-name Makedep -print` -do -sed -f prebuild.sed $i > $i.tmp -rm $i -mv $i.tmp $i -done -rm -f `find $OBJROOT -name Makefile -print` prebuild.sed diff --git a/SETUP/setsegname/Makefile b/SETUP/setsegname/Makefile new file mode 100644 index 000000000..70e5e2641 --- /dev/null +++ b/SETUP/setsegname/Makefile @@ -0,0 +1,31 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + + +include $(MakeInc_cmd) +include $(MakeInc_def) + +OBJS = setsegname.o + +CFLAGS = -isysroot $(HOST_SDKROOT) -g -O0 -I$(SOURCE) -I. + +WARNFLAGS = -Wall + +LDFLAGS = -isysroot $(HOST_SDKROOT) + +setsegname: $(OBJS) + $(_v)$(HOST_CC) $(LDFLAGS) -o $@ $^ + @echo HOST_LD $@ + $(_v)$(HOST_CODESIGN) -s - $@ + @echo HOST_CODESIGN $@ + +.c.o: + $(_v)$(HOST_CC) $(WARNFLAGS) $(CFLAGS) -c -o $@ $< + @echo HOST_CC $@ + +do_build_setup: setsegname + +include $(MakeInc_rule) +include $(MakeInc_dir) diff --git a/SETUP/setsegname/setsegname.c b/SETUP/setsegname/setsegname.c new file mode 100644 index 000000000..9afd6bc5d --- /dev/null +++ b/SETUP/setsegname/setsegname.c @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2007 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +#include +#include + +#include +#include +#include + +#include + +#include + +/********************************************************************* +*********************************************************************/ +static int +writeFile(int fd, const void * data, size_t length) +{ + int error = 0; + + if (length != (size_t)write(fd, data, length)) { + error = -1; + } + + if (error != 0) { + perror("couldn't write output"); + } + + return error; +} + +/********************************************************************* +*********************************************************************/ +static int +readFile(const char *path, vm_offset_t * objAddr, vm_size_t * objSize) +{ + int error = -1; + int fd; + struct stat stat_buf; + + *objAddr = 0; + *objSize = 0; + + do { + if ((fd = open(path, O_RDONLY)) == -1) { + continue; + } + + if (fstat(fd, &stat_buf) == -1) { + continue; + } + + if (0 == (stat_buf.st_mode & S_IFREG)) { + continue; + } + + if (0 == stat_buf.st_size) { + error = 0; + continue; + } + + *objSize = stat_buf.st_size; + + *objAddr = (vm_offset_t)mmap(NULL /* address */, *objSize, + PROT_READ|PROT_WRITE, MAP_FILE|MAP_PRIVATE /* flags */, + fd, 0 /* offset */); + + if ((void *)*objAddr == MAP_FAILED) { + *objAddr = 0; + *objSize = 0; + continue; + } + + error = 0; + + } while (false); + + if (-1 != fd) { + close(fd); + } + if (error) { + fprintf(stderr, "couldn't read %s: %s\n", path, strerror(errno)); + } + + return error; +} + 
+/********************************************************************* +*********************************************************************/ +int main(int argc, char * argv[]) +{ + int error; + const char * output_name = NULL; + const char * newseg_name = NULL; + struct mach_header * hdr; + struct mach_header_64 * hdr64; + struct load_command * cmds; + boolean_t swap = false; + uint32_t ncmds, cmdtype; + uint32_t len; + vm_offset_t input; + vm_size_t input_size; + uint32_t nsects = 0; + uint32_t * flags = NULL; + uint32_t attr; + typedef char segname_t[16]; + segname_t * names = NULL; + + if ((argc != 5) || strcmp("-o", argv[3])) { + fprintf(stderr, "Usage: %s NEWSEGNAME input -o output\n", argv[0]); + exit(1); + } + + output_name = argv[4]; + newseg_name = argv[1]; + + error = readFile(argv[2], &input, &input_size); + if (error) { + exit(1); + } + + hdr = (typeof(hdr)) input; + switch (hdr->magic) { + case MH_CIGAM: + swap = true; + // fall thru + case MH_MAGIC: + ncmds = hdr->ncmds; + cmds = (typeof(cmds)) (hdr+1); + break; + + case MH_CIGAM_64: + swap = true; + // fall thru + case MH_MAGIC_64: + hdr64 = (typeof(hdr64)) hdr; + ncmds = hdr64->ncmds; + cmds = (typeof(cmds)) (hdr64+1); + break; + + default: + fprintf(stderr, "not macho input file\n"); + exit(1); + break; + } + + if (swap) { + ncmds = OSSwapInt32(ncmds); + } + while (ncmds--) { + cmdtype = cmds->cmd; + if (swap) { + cmdtype = OSSwapInt32(cmdtype); + } + nsects = 0; + len = 0; + if (LC_SEGMENT == cmdtype) { + struct segment_command * segcmd; + struct section * sects; + + segcmd = (typeof(segcmd)) cmds; + nsects = segcmd->nsects; + sects = (typeof(sects))(segcmd + 1); + names = §s->segname; + flags = §s->flags; + len = sizeof(*sects); + } else if (LC_SEGMENT_64 == cmdtype) { + struct segment_command_64 * segcmd; + struct section_64 * sects; + + segcmd = (typeof(segcmd)) cmds; + nsects = segcmd->nsects; + sects = (typeof(sects))(segcmd + 1); + names = §s->segname; + flags = §s->flags; + len = 
sizeof(*sects); + } + + if (swap) + nsects = OSSwapInt32(nsects); + while (nsects--) { + attr = *flags; + if (swap) { + attr = OSSwapInt32(attr); + } + + if (!(S_ATTR_DEBUG & attr)) { + strncpy((char *)names, newseg_name, sizeof(*names)); + } + + names = (typeof(names))(((uintptr_t) names) + len); + flags = (typeof(flags))(((uintptr_t) flags) + len); + } + + len = cmds->cmdsize; + if (swap) { + len = OSSwapInt32(len); + } + cmds = (typeof(cmds))(((uintptr_t) cmds) + len); + } + + int fd = open(output_name, O_WRONLY|O_CREAT|O_TRUNC, 0755); + if (-1 == fd) { + error = -1; + } else { + error = writeFile(fd, (const void *) input, input_size); + close(fd); + } + + if (error) { + fprintf(stderr, "couldn't write output: %s\n", strerror(errno)); + exit(1); + } + + exit(0); + return 0; +} diff --git a/bsd/Makefile b/bsd/Makefile index d4df2fc62..8beb22975 100644 --- a/bsd/Makefile +++ b/bsd/Makefile @@ -26,14 +26,13 @@ INSTINC_SUBDIRS = \ uuid \ vfs -INSTINC_SUBDIRS_PPC = \ - ppc - INSTINC_SUBDIRS_I386 = \ - i386 + i386 \ + crypto INSTINC_SUBDIRS_X86_64 = \ - i386 + i386 \ + crypto INSTINC_SUBDIRS_ARM = \ arm @@ -58,9 +57,6 @@ EXPINC_SUBDIRS = \ vfs \ vm -EXPINC_SUBDIRS_PPC = \ - ppc - EXPINC_SUBDIRS_I386 = \ i386 @@ -70,16 +66,17 @@ EXPINC_SUBDIRS_X86_64 = \ EXPINC_SUBDIRS_ARM = \ arm -SETUP_SUBDIRS = \ - conf +SETUP_SUBDIRS = COMP_SUBDIRS = \ conf INST_SUBDIRS = \ + kern INSTMAN_SUBDIRS = \ man + include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/bsd/bsm/Makefile b/bsd/bsm/Makefile index 0bb6f4dcf..f660aafb5 100644 --- a/bsd/bsm/Makefile +++ b/bsd/bsm/Makefile @@ -9,16 +9,12 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ INSTINC_SUBDIRS_X86_64 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS_X86_64 = \ diff --git a/bsd/bsm/audit.h b/bsd/bsm/audit.h index bb4a9497b..a24cc88d7 100644 --- a/bsd/bsm/audit.h +++ b/bsd/bsm/audit.h @@ -125,6 +125,8 @@ #define A_SETQCTRL 36 #define 
A_GETCOND 37 #define A_SETCOND 38 +#define A_GETSFLAGS 39 +#define A_SETSFLAGS 40 /* * Audit policy controls. @@ -318,6 +320,7 @@ int setaudit_addr(const struct auditinfo_addr *, int); #include mach_port_name_t audit_session_self(void); au_asid_t audit_session_join(mach_port_name_t port); +int audit_session_port(au_asid_t asid, mach_port_name_t *portname); #endif /* __APPLE_API_PRIVATE */ #endif /* defined(_KERNEL) || defined(KERNEL) */ diff --git a/bsd/bsm/audit_kevents.h b/bsd/bsm/audit_kevents.h index 268c456c3..25e3eb829 100644 --- a/bsd/bsm/audit_kevents.h +++ b/bsd/bsm/audit_kevents.h @@ -36,7 +36,7 @@ * The reserved event numbers for kernel events are 1...2047 and 43001..44900. */ #define AUE_IS_A_KEVENT(e) (((e) > 0 && (e) < 2048) || \ - ((e) > 43000 && (e) < 45000)) + ((e) > 43000 && (e) < 44901)) /* * Values marked as AUE_NULL are not required to be audited as per CAPP. @@ -596,6 +596,16 @@ #define AUE_PWRITE 43193 /* Darwin/FreeBSD. */ #define AUE_FSCTL 43194 /* Darwin. */ #define AUE_FFSCTL 43195 /* Darwin. */ +#define AUE_LPATHCONF 43196 /* FreeBSD. */ +#define AUE_PDFORK 43197 /* FreeBSD. */ +#define AUE_PDKILL 43198 /* FreeBSD. */ +#define AUE_PDGETPID 43199 /* FreeBSD. */ +#define AUE_PDWAIT 43200 /* FreeBSD. */ + +#define AUE_SESSION_START 44901 /* Darwin. */ +#define AUE_SESSION_UPDATE 44902 /* Darwin. */ +#define AUE_SESSION_END 44903 /* Darwin. */ +#define AUE_SESSION_CLOSE 44904 /* Darwin. 
*/ /* * Darwin BSM uses a number of AUE_O_* definitions, which are aliased to the diff --git a/bsd/conf/MASTER b/bsd/conf/MASTER index 93872ad07..bb57c6dae 100644 --- a/bsd/conf/MASTER +++ b/bsd/conf/MASTER @@ -101,6 +101,7 @@ options MACH_IPC_COMPAT # Enable old IPC interface # options MACH_IPC_DEBUG # Enable IPC debugging calls # options MACH_IPC_TEST # Testing code/printfs # options MACH_LDEBUG # Sanity-check simple locking # +options CONFIG_ZLEAKS # Live zone leak debug sysctls # options MACH_NP # Mach IPC support # options MACH_NBC # No buffer cache # options MACH_NET # Fast network access # @@ -123,15 +124,12 @@ options LLC # 802.2 support # options LOOP # loopback support # options MROUTING # multicast routing # options ROUTING # routing # -options NETMIBS # # options VLAN # # options BOND # # options PF # Packet Filter # options PF_PKTHDR # PF tag inside mbuf pkthdr # -options PKT_PRIORITY # Packet priority support # options PFLOG # PF log interface # options IPDIVERT # Divert sockets (for NAT) # -options IPFLOW # IP fast forwarding # options IPFIREWALL # IP Firewalling (used by NAT) # options IPFIREWALL_FORWARD #Transparent proxy # options IPFIREWALL_DEFAULT_TO_ACCEPT # allow everything by default # @@ -144,7 +142,6 @@ options RANDOM_IP_ID # random (not sequential) ip ids # options TCP_DROP_SYNFIN # Drop TCP packets with SYN+FIN set # options ICMP_BANDLIM # ICMP bandwidth limiting sysctl options IFNET_INPUT_SANITY_CHK # allow dlil/ifnet input sanity check # -options IFNET_ROUTE_REFCNT # count route references to ifnet # options SYSV_SEM # SVID semaphores # options SYSV_MSG # SVID messages # options SYSV_SHM # SVID shared mem # @@ -169,22 +166,18 @@ options NETWORKING # networking layer # options CONFIG_FSE # file system events # options CONFIG_IMAGEBOOT # local image boot # options CONFIG_SOWUPCALL # SB_UPCALL on sowwakeup # -options CONFIG_MBUF_NOEXPAND # limit mbuf expansion # options CONFIG_MBUF_JUMBO # jumbo cluster pool # -options 
CONFIG_MBUF_TAGS_MALLOC # use malloc for tags # options CONFIG_FORCE_OUT_IFP # Enable IP_FORCE_OUT_IFP # options CONFIG_IFEF_NOWINDOWSCALE # Scale TCP window per driver # options CONFIG_WORKQUEUE # - # # 4.4 filesystems # options FFS # Fast Filesystem Support # options HFS # HFS/HFS+ support # options FIFO # fifo support # -options UNION # union_fs support # options FDESC # fdesc_fs support # options DEVFS # devfs support # options JOURNALING # journaling support # @@ -199,6 +192,7 @@ options REV_ENDIAN_FS # Reverse Endian FS # options NAMEDSTREAMS # named stream vnop support # options CONFIG_VOLFS # volfs path support (legacy) # options CONFIG_IMGSRC_ACCESS # source of imageboot dmg # +options CONFIG_TRIGGERS # trigger vnodes # # # NFS support @@ -249,6 +243,7 @@ options randomipid # options ZLIB # inflate/deflate support # +options IF_BRIDGE # makeoptions LIBDRIVER = "libDriver_kern.o" # makeoptions LIBOBJC = "libkobjc.o" # @@ -292,7 +287,6 @@ options CONFIG_KN_HASHSIZE=20 # options CONFIG_VNODES=263168 # options CONFIG_VNODES=263168 # options CONFIG_VNODES=10240 # -options CONFIG_VNODES=1024 # options CONFIG_VNODES=750 # options CONFIG_VNODE_FREE_MIN=500 # @@ -396,6 +390,12 @@ options CONFIG_MFCTBLSIZ=256 # options CONFIG_MFCTBLSIZ=128 # options CONFIG_MFCTBLSIZ=16 # +# +# configurable kernel message buffer size +# +options CONFIG_MSG_BSIZE=4096 # +options CONFIG_MSG_BSIZE=16384 # + # # configurable kernel - use these options to strip strings from panic # and printf calls. 
@@ -406,6 +406,11 @@ options CONFIG_NO_PANIC_STRINGS # options CONFIG_NO_PRINTF_STRINGS # options CONFIG_NO_KPRINTF_STRINGS # +# +# use finer-grained lock groups for the proc subsystem +# +options CONFIG_FINE_LOCK_GROUPS # + # # configurable kernel - general switch to say we are building for an # embedded device @@ -433,6 +438,14 @@ options CONFIG_CODE_DECRYPTION # options CONFIG_PROTECT # +# +# freeze - support app hibernation, used on embedded +# CONFIG_FREEZE_SUSPENDED_MIN is the minimum number of suspended +# processes to be left unhibernated +# +options CONFIG_FREEZE # + +options CHECK_CS_VALIDATION_BITMAP # # # Ethernet (ARP) @@ -463,6 +476,7 @@ pseudo-device vndevice 16 init vndevice_init # pseudo-device vndevice 8 init vndevice_init # pseudo-device vndevice 4 init vndevice_init # pseudo-device vndevice 3 init vndevice_init # +pseudo-device vndevice 2 init vndevice_init # pseudo-device vndevice 2 init vndevice_init # # diff --git a/bsd/conf/MASTER.i386 b/bsd/conf/MASTER.i386 index 1e6641911..594f0fb51 100644 --- a/bsd/conf/MASTER.i386 +++ b/bsd/conf/MASTER.i386 @@ -44,21 +44,20 @@ # # Standard Apple Research Configurations: # -------- ----- -------- --------------- -# BASE = [ intel mach medium config_dtrace vol pst gdb kernobjc fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot config_workqueue psynch ] -# FILESYS = [ devfs revfs hfs journaling fdesc config_fse quota namedstreams fifo union config_volfs config_hfs_trim hfs_compression config_imgsrc_access ] -# NETWORKING = [ inet inet6 compat_oldsock tcpdrop_synfin bpfilter ipdivert ipfirewall ipv6firewall ipfw2 dummynet traffic_mgt sendfile netmibs bond vlan gif stf zlib randomipid ifnet_input_chk config_mbuf_jumbo ipflow pkt_priority if_bridge ] +# BASE = [ intel mach medium config_dtrace vol pst gdb kernobjc fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug sysv_sem sysv_msg 
sysv_shm audit panic_info config_imageboot config_workqueue psynch zleaks ] +# FILESYS = [ devfs revfs hfs journaling fdesc config_fse quota namedstreams fifo config_volfs config_hfs_trim hfs_compression config_hfs_alloc_rbtree config_imgsrc_access config_triggers ] +# NETWORKING = [ inet inet6 compat_oldsock tcpdrop_synfin bpfilter ipdivert ipfirewall ipv6firewall ipfw2 dummynet traffic_mgt sendfile bond vlan gif stf zlib randomipid ifnet_input_chk config_mbuf_jumbo if_bridge pf pflog pf_pkthdr ] # NFS = [ nfsclient nfsserver ] # VPN = [ ipsec ] # RELEASE = [ BASE NETWORKING NFS VPN FILESYS libdriver ] # PROFILE = [ RELEASE profile ] -# DEBUG = [ BASE NETWORKING NFS VPN FILESYS libdriver_g debug xpr_debug mach_assert pf pflog ] -# +# DEBUG = [ BASE NETWORKING NFS VPN FILESYS libdriver_g debug xpr_debug mach_assert ] # # EMBEDDED_BASE = [ intel mach bsmall vol pst gdb kernobjc fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot config_workqueue psynch ] # EMBEDDED_FILESYS = [ devfs hfs journaling fdesc fifo ] -# EMBEDDED_NET = [ inet compat_oldsock tcpdrop_synfin bpfilter config_mbuf_noexpand ] +# EMBEDDED_NET = [ inet compat_oldsock tcpdrop_synfin bpfilter ] # EMBEDDED = [ EMBEDDED_BASE EMBEDDED_NET VPN EMBEDDED_FILESYS libdriver no_printf_str no_kprintf_str no_kdebug ] -# DEVELOPMENT = [ EMBEDDED_BASE EMBEDDED_NET NFS VPN EMBEDDED_FILESYS libdriver netmibs development mach_assert config_dtrace ] +# DEVELOPMENT = [ EMBEDDED_BASE EMBEDDED_NET NFS VPN EMBEDDED_FILESYS libdriver development mach_assert config_dtrace ] # ###################################################################### # diff --git a/bsd/conf/MASTER.ppc b/bsd/conf/MASTER.ppc deleted file mode 100644 index d99b6e4f5..000000000 --- a/bsd/conf/MASTER.ppc +++ /dev/null @@ -1,99 +0,0 @@ -# -# Mach Operating System -# Copyright (c) 1986 Carnegie-Mellon University -# All rights reserved. 
The CMU software License Agreement -# specifies the terms and conditions for use and redistribution. -# -###################################################################### -# -# Master Apple configuration file (see the master machine independent -# configuration file for a description of the file format). -# -###################################################################### -# -# Apple (PSEUDO-)DEVICES (select any combination) -# ex = Excelan EXOS 202 Ethernet interface -# ip = Interphase V/SMD 3200 disk controller -# od = Canon OMD-1 Optical Disk -# rd = RAM disk -# sd = SCSI disk -# sg = Generic SCSI Device -# st = SCSI tape -# fd = Floppy Disk -# en = Integrated Ethernet controller -# dsp = DSP560001 digital signal processor -# iplmeas = ipl time measurement -# nextp = NeXT Laser Printer -# sound = sound I/O -# vol = removable volume support device -# venip = virtual Ethernet/IP network interface -# zs = Serial device -# -# MULTIPROCESSOR SUPPORT (select exactly one) -# multi = support 4 processors -# uni = supports single processor -# -# SPECIAL CHARACTERISTICS (select any combination) -# gdb = GNU kernel debugger -# posix_kern = POSIX support -# -# CPU TYPE (select exactly one) -# NeXT = FIXME -# -###################################################################### -# -# Standard Apple Research Configurations: -# -------- ----- -------- --------------- -# -# BASE = [ ppc mach medium config_dtrace vol pst gdb noprofiling simple_clock kernstack sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot config_workqueue ] -# FILESYS = [ devfs revfs hfs journaling fdesc config_fse quota namedstreams fifo union config_volfs config_hfs_trim hfs_compression ] -# NETWORKING = [ inet inet6 compat_oldsock tcpdrop_synfin bpfilter ipdivert ipfirewall ipv6firewall ipfw2 dummynet traffic_mgt sendfile netmibs bond vlan gif stf zlib randomipid ifnet_input_chk ipflow pkt_priority ] -# NFS = [ nfsclient nfsserver ] -# VPN = [ ipsec ] -# RELEASE = [ BASE NETWORKING 
NFS VPN FILESYS libdriver ] -# DEVELOPMENT = [ RELEASE ] -# PROFILE = [ RELEASE profile ] -# DEBUG = [ BASE NETWORKING NFS VPN FILESYS libdriver_g debug xpr_debug mach_assert pf pflog ] -# -###################################################################### -# -machine "ppc" # -cpu "ppc" # - -options GDB # GNU kernel debugger # -options DEBUG # general debugging code # -options SHOW_SPACE # print size of structures # -options EVENTMETER # event meter support # -options FP_EMUL # floating point emulation # -options UXPR # user-level XPR package # -config mach_kernel swap generic # - -# -# Note: MAC/AUDIT options must be set in all the bsd/conf, osfmk/conf, and -# security/conf MASTER files. -# -options CONFIG_MACF # Mandatory Access Control Framework -options CONFIG_MACF_SOCKET_SUBSET # MAC socket subest (no labels) -#options CONFIG_MACF_SOCKET # MAC socket labels -#options CONFIG_MACF_NET # mbuf -#options CONFIG_MACF_DEBUG -#options CONFIG_MACF_MACH -options CONFIG_AUDIT # Kernel auditing - -options EVENT # - -# -# Ipl measurement system -# -pseudo-device iplmeas # - -# -# NFS measurement system -# -pseudo-device nfsmeas # - -# -# Removable Volume support -# -pseudo-device vol # - diff --git a/bsd/conf/MASTER.x86_64 b/bsd/conf/MASTER.x86_64 index 1050897d2..4bf42910b 100644 --- a/bsd/conf/MASTER.x86_64 +++ b/bsd/conf/MASTER.x86_64 @@ -44,21 +44,20 @@ # # Standard Apple Research Configurations: # -------- ----- -------- --------------- -# BASE = [ intel mach medium config_dtrace vol pst gdb kernobjc fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot config_workqueue psynch ] -# FILESYS = [ devfs revfs hfs journaling fdesc config_fse quota namedstreams fifo union config_volfs config_hfs_trim hfs_compression config_imgsrc_access ] -# NETWORKING = [ inet inet6 compat_oldsock tcpdrop_synfin bpfilter ipdivert ipfirewall ipv6firewall ipfw2 dummynet traffic_mgt sendfile netmibs 
bond vlan gif stf zlib randomipid ifnet_input_chk config_mbuf_jumbo ipflow pkt_priority if_bridge ] +# BASE = [ intel mach medium config_dtrace vol pst gdb kernobjc fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot config_workqueue psynch zleaks ] +# FILESYS = [ devfs revfs hfs journaling fdesc config_fse quota namedstreams fifo config_volfs config_hfs_trim hfs_compression config_hfs_alloc_rbtree config_imgsrc_access config_triggers ] +# NETWORKING = [ inet inet6 compat_oldsock tcpdrop_synfin bpfilter ipdivert ipfirewall ipv6firewall ipfw2 dummynet traffic_mgt sendfile bond vlan gif stf zlib randomipid ifnet_input_chk config_mbuf_jumbo if_bridge pf pflog pf_pkthdr ] # NFS = [ nfsclient nfsserver ] # VPN = [ ipsec ] # RELEASE = [ BASE NETWORKING NFS VPN FILESYS libdriver ] # PROFILE = [ RELEASE profile ] -# DEBUG = [ BASE NETWORKING NFS VPN FILESYS libdriver_g debug xpr_debug mach_assert pf pflog ] -# +# DEBUG = [ BASE NETWORKING NFS VPN FILESYS libdriver_g debug xpr_debug mach_assert ] # # EMBEDDED_BASE = [ intel mach bsmall vol pst gdb kernobjc fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot config_workqueue psynch ] # EMBEDDED_FILESYS = [ devfs hfs journaling fdesc fifo ] -# EMBEDDED_NET = [ inet compat_oldsock tcpdrop_synfin bpfilter config_mbuf_noexpand ] +# EMBEDDED_NET = [ inet compat_oldsock tcpdrop_synfin bpfilter ] # EMBEDDED = [ EMBEDDED_BASE EMBEDDED_NET VPN EMBEDDED_FILESYS libdriver no_printf_str no_kprintf_str no_kdebug ] -# DEVELOPMENT = [ EMBEDDED_BASE EMBEDDED_NET NFS VPN EMBEDDED_FILESYS libdriver netmibs development mach_assert ] +# DEVELOPMENT = [ EMBEDDED_BASE EMBEDDED_NET NFS VPN EMBEDDED_FILESYS libdriver development mach_assert ] # ###################################################################### # diff --git a/bsd/conf/Makefile b/bsd/conf/Makefile 
index a79644e77..afaf3eb89 100644 --- a/bsd/conf/Makefile +++ b/bsd/conf/Makefile @@ -3,92 +3,10 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir -export dp_backing_file.o_CFLAGS_ADD=-Werror -Wshorten-64-to-32 -export ubc_subr.o_CFLAGS_ADD=-Wno-discard-qual -Wshorten-64-to-32 -export vnode_pager.o_CFLAGS_ADD=-Werror -Wshorten-64-to-32 -export vm_unix.o_CFLAGS_ADD=-Werror -Wshorten-64-to-32 - -export if_mib.o_CFLAGS_ADD=-Wno-unused-parameter -export adsp_Write.o_CFLAGS_ADD=-Wno-sign-compare -export adsp_Packet.o_CFLAGS_ADD=-Wno-sign-compare -export adsp_Control.o_CFLAGS_ADD=-Wno-sign-compare -export adsp_RxAttn.o_CFLAGS_ADD=-Wno-sign-compare -export adsp_attention.o_CFLAGS_ADD=-Wno-sign-compare -export asp_proto.o_CFLAGS_ADD=-Wno-sign-compare -export drv_dep.o_CFLAGS_ADD=-Wno-sign-compare -export ddp_rtmp.o_CFLAGS_ADD=-Wno-sign-compare -export ddp_lap.o_CFLAGS_ADD=-Wno-sign-compare -export radix.o_CFLAGS_ADD=-Wno-sign-compare -export route.o_CFLAGS_ADD=-Wno-sign-compare -export rtsock.o_CFLAGS_ADD=-Wno-sign-compare -export dhcp_options.o_CFLAGS_ADD=-Wno-sign-compare -export igmp.o_CFLAGS_ADD=-Wno-sign-compare -export in_cksum.o_CFLAGS_ADD=-Wno-sign-compare -export ip_divert.o_CFLAGS_ADD=-Wno-sign-compare -export ip_dummynet.o_CFLAGS_ADD=-Wno-sign-compare -export ip_flow.o_CFLAGS_ADD=-Wno-sign-compare -export ip_fw2.o_CFLAGS_ADD=-Wno-sign-compare -export ip_fw2_compat.o_CFLAGS_ADD=-Wno-sign-compare -export ip_icmp.o_CFLAGS_ADD=-Wno-sign-compare -export ip_input.o_CFLAGS_ADD=-Wno-sign-compare -export ip_mroute.o_CFLAGS_ADD=-Wno-sign-compare -export ip_output.o_CFLAGS_ADD=-Wno-sign-compare -export raw_ip.o_CFLAGS_ADD=-Wno-sign-compare -export tcp_input.o_CFLAGS_ADD=-Wno-sign-compare -export tcp_output.o_CFLAGS_ADD=-Wno-sign-compare -export tcp_subr.o_CFLAGS_ADD=-Wno-sign-compare -export tcp_usrreq.o_CFLAGS_ADD=-Wno-sign-compare -export 
tcp_timer.o_CFLAGS_ADD=-Wno-sign-compare -export udp_usrreq.o_CFLAGS_ADD=-Wno-sign-compare -export ah_input.o_CFLAGS_ADD=-Wno-sign-compare -export ah_core.o_CFLAGS_ADD=-Wno-sign-compare -export ah_output.o_CFLAGS_ADD=-Wno-sign-compare -export esp_core.o_CFLAGS_ADD=-Wno-sign-compare -export esp_input.o_CFLAGS_ADD=-Wno-sign-compare -export esp_output.o_CFLAGS_ADD=-Wno-sign-compare -export esp_rijndael.o_CFLAGS_ADD=-Wno-sign-compare -export ipsec.o_CFLAGS_ADD=-Wno-sign-compare -export dest6.o_CFLAGS_ADD=-Wno-sign-compare -export frag6.o_CFLAGS_ADD=-Wno-sign-compare -export icmp6.o_CFLAGS_ADD=-Wno-sign-compare -export in6.o_CFLAGS_ADD=-Wno-sign-compare -export in6_src.o_CFLAGS_ADD=-Wno-sign-compare -export in6_cksum.o_CFLAGS_ADD=-Wno-sign-compare -export ip6_fw.o_CFLAGS_ADD=-Wno-sign-compare -export ip6_forward.o_CFLAGS_ADD=-Wno-sign-compare -export in6_ifattach.o_CFLAGS_ADD=-Wno-sign-compare -export ip6_input.o_CFLAGS_ADD=-Wno-sign-compare -export ip6_mroute.o_CFLAGS_ADD=-Wno-sign-compare -export ip6_output.o_CFLAGS_ADD=-Wno-sign-compare -export ipcomp_input.o_CFLAGS_ADD=-Wno-sign-compare -export ipcomp_output.o_CFLAGS_ADD=-Wno-sign-compare -export in6_proto.o_CFLAGS_ADD=-Wno-sign-compare -export mld6.o_CFLAGS_ADD=-Wno-sign-compare -export nd6.o_CFLAGS_ADD=-Wno-sign-compare -export nd6_nbr.o_CFLAGS_ADD=-Wno-sign-compare -export nd6_rtr.o_CFLAGS_ADD=-Wno-sign-compare -export raw_ip6.o_CFLAGS_ADD=-Wno-sign-compare -export route6.o_CFLAGS_ADD=-Wno-sign-compare -export scope6.o_CFLAGS_ADD=-Wno-sign-compare -export udp6_usrreq.o_CFLAGS_ADD=-Wno-sign-compare -export key.o_CFLAGS_ADD=-Wno-sign-compare -export keysock.o_CFLAGS_ADD=-Wno-sign-compare -export atp_write.o_CFLAGS_ADD=-Wno-sign-compare -export keydb.o_CFLAGS_ADD=-Wno-sign-compare -export des_setkey.o_CFLAGS_ADD=-Wno-sign-compare -export sys_socket.o_CFLAGS_ADD=-Wno-sign-compare -export sys_glue.o_CFLAGS_ADD=-Wno-sign-compare -export uipc_domain.o_CFLAGS_ADD=-Wno-sign-compare -export 
uipc_mbuf.o_CFLAGS_ADD=-Wno-sign-compare -export uipc_mbuf2.o_CFLAGS_ADD=-Wno-sign-compare -export uipc_socket.o_CFLAGS_ADD=-Wno-sign-compare -export uipc_socket2.o_CFLAGS_ADD=-Wno-sign-compare - include $(MakeInc_cmd) include $(MakeInc_def) -SETUP_SUBDIRS = \ - tools +SETUP_SUBDIRS = COMP_SUBDIRS = @@ -104,30 +22,24 @@ else export COMPOBJROOT=$(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)/$(COMPONENT) endif -$(COMPOBJROOT)/doconf: - @make build_setup +MASTER_CPU_PER_SOC = $(SOURCE)/MASTER.$(ARCH_CONFIG_LC).$(MACHINE_CONFIG_LC) $(COMPOBJROOT)/$(BSD_KERNEL_CONFIG)/Makefile : $(SOURCE)/MASTER \ $(SOURCE)/MASTER.$(ARCH_CONFIG_LC) \ $(SOURCE)/Makefile.template \ $(SOURCE)/Makefile.$(ARCH_CONFIG_LC) \ $(SOURCE)/files \ - $(SOURCE)/files.$(ARCH_CONFIG_LC) \ - $(COMPOBJROOT)/doconf + $(SOURCE)/files.$(ARCH_CONFIG_LC) $(_v)(doconf_target=$(addsuffix /conf, $(TARGET)); \ $(MKDIR) $${doconf_target}; \ cd $${doconf_target}; \ rm -f $(notdir $?); \ cp $? $${doconf_target}; \ - $(COMPOBJROOT)/doconf -c -cpu $(ARCH_CONFIG_LC) -d $(TARGET)/$(BSD_KERNEL_CONFIG) $(BSD_KERNEL_CONFIG); \ + if [ -f $(MASTER_CPU_PER_SOC) ]; then cp $(MASTER_CPU_PER_SOC) $${doconf_target}; fi; \ + $(SRCROOT)/SETUP/config/doconf -c -cpu $(ARCH_CONFIG_LC) -soc $(MACHINE_CONFIG_LC) -d $(TARGET)/$(BSD_KERNEL_CONFIG) $(BSD_KERNEL_CONFIG); \ ); -.ORDER: $(COMPOBJROOT)/$(BSD_KERNEL_CONFIG)/Makefile - -do_setup_conf: $(COMPOBJROOT)/doconf \ - $(COMPOBJROOT)/$(BSD_KERNEL_CONFIG)/Makefile - -do_all: do_setup_conf +do_all: $(COMPOBJROOT)/$(BSD_KERNEL_CONFIG)/Makefile $(_v)next_source=$(subst conf/,,$(SOURCE)); \ ${MAKE} -C $(COMPOBJROOT)/$(BSD_KERNEL_CONFIG) \ MAKEFILES=$(TARGET)/$(BSD_KERNEL_CONFIG)/Makefile \ diff --git a/bsd/conf/Makefile.i386 b/bsd/conf/Makefile.i386 index 0b5f62979..a46354589 100644 --- a/bsd/conf/Makefile.i386 +++ b/bsd/conf/Makefile.i386 @@ -2,46 +2,12 @@ #BEGIN Machine dependent Makefile fragment for i386 ###################################################################### -# files to build 
with certain warnings turned off +# Files to build with certain warnings turned off dis_tables.o_CFLAGS_ADD += -Wno-cast-qual fbt_x86.o_CFLAGS_ADD += -Wno-cast-qual - -# Enable -Werror for i386 builds -CFLAGS+=$(WERROR) -CWARNFLAGS= $(filter-out -Wbad-function-cast, $(CWARNFLAGS_STD)) - -# Objects that don't compile cleanly: -OBJS_NO_WERROR = \ - fifo_vnops.o \ - aescrypt.o \ - aeskey.o \ - des_setkey.o \ - sha2.o \ - if_ethersubr.o \ - if_media.o \ - kext_net.o \ - dhcp_options.o \ - in_bootp.o \ - krpc_subr.o \ - ux_exception.o \ - unix_startup.o \ - randomdev.o \ - vnode_pager.o \ - dp_backing_file.o \ - vm_unix.o \ - mem.o \ - km.o \ - init_sysent.o \ - drv_dep.o \ - sdt_x86.o \ - dtrace_isa.o \ - aes_modes.o - - -OBJS_WERROR=$(filter-out $(OBJS_NO_WERROR),$(OBJS)) - -$(OBJS_WERROR): WERROR=-Werror +# sha256 Files to build with -DSHA256_USE_ASSEMBLY=1 +sha2.o_CFLAGS_ADD += -DSHA256_USE_ASSEMBLY=1 ###################################################################### #END Machine dependent Makefile fragment for i386 diff --git a/bsd/conf/Makefile.ppc b/bsd/conf/Makefile.ppc deleted file mode 100644 index 2dd4e88b3..000000000 --- a/bsd/conf/Makefile.ppc +++ /dev/null @@ -1,53 +0,0 @@ -###################################################################### -#BEGIN Machine dependent Makefile fragment for ppc -###################################################################### - -# files to build with certain warnings turned off -dis_tables.o_CFLAGS_ADD += -Wno-cast-qual -fbt_ppc.o_CFLAGS_ADD += -Wno-cast-qual -Wno-pointer-to-int-cast - - -# Enable -Werror for ppc builds -CFLAGS+=$(WERROR) -CWARNFLAGS= $(filter-out -Wbad-function-cast, $(CWARNFLAGS_STD)) - -# Objects that don't compile cleanly: -OBJS_NO_WERROR = \ - fifo_vnops.o \ - aescrypt.o \ - aeskey.o \ - des_setkey.o \ - sha2.o \ - shadow.o \ - if_ethersubr.o \ - if_media.o \ - kext_net.o \ - dhcp_options.o \ - in_bootp.o \ - krpc_subr.o \ - ux_exception.o \ - sysctl.o \ - unix_startup.o \ - randomdev.o \ - 
devtimer.o \ - vnode_pager.o \ - dp_backing_file.o \ - vm_unix.o \ - mem.o \ - km.o \ - at.o \ - drv_dep.o \ - fbt_ppc.o \ - sdt_ppc.o \ - dtrace_isa.o \ - dtrace_subr_ppc.o - - -OBJS_WERROR=$(filter-out $(OBJS_NO_WERROR),$(OBJS)) - -$(OBJS_WERROR): WERROR=-Werror - -###################################################################### -#END Machine dependent Makefile fragment for ppc -###################################################################### - diff --git a/bsd/conf/Makefile.template b/bsd/conf/Makefile.template index fdee45a3e..8691ce705 100644 --- a/bsd/conf/Makefile.template +++ b/bsd/conf/Makefile.template @@ -43,10 +43,81 @@ include $(MakeInc_def) # # XXX: CFLAGS # -CFLAGS+= -imacros meta_features.h -DARCH_PRIVATE -DKERNEL -DDRIVER_PRIVATE \ +CFLAGS+= -include meta_features.h -DARCH_PRIVATE -DDRIVER_PRIVATE \ -D_KERNEL_BUILD -DKERNEL_BUILD -DMACH_KERNEL -DBSD_BUILD \ -DBSD_KERNEL_PRIVATE -DLP64KERN=1 -DLP64_DEBUG=0 -I. $(CFLAGS_INLINE_CONFIG) +dp_backing_file.o_CFLAGS_ADD += -Wshorten-64-to-32 +ubc_subr.o_CFLAGS_ADD += -Wshorten-64-to-32 +vnode_pager.o_CFLAGS_ADD += -Wshorten-64-to-32 +vm_unix.o_CFLAGS_ADD += -Wshorten-64-to-32 + +# Objects that don't want -Wsign-compare +OBJS_NO_SIGN_COMPARE = \ + radix.o \ + route.o \ + rtsock.o \ + dhcp_options.o \ + igmp.o \ + in_cksum.o \ + ip_divert.o \ + ip_dummynet.o \ + ip_flow.o \ + ip_fw2.o \ + ip_fw2_compat.o \ + ip_icmp.o \ + ip_input.o \ + ip_mroute.o \ + ip_output.o \ + raw_ip.o \ + tcp_input.o \ + tcp_output.o \ + tcp_subr.o \ + tcp_usrreq.o \ + tcp_timer.o \ + udp_usrreq.o \ + ah_input.o \ + ah_core.o \ + ah_output.o \ + esp_core.o \ + esp_input.o \ + esp_output.o \ + esp_rijndael.o \ + ipsec.o \ + dest6.o \ + frag6.o \ + icmp6.o \ + in6.o \ + in6_src.o \ + in6_cksum.o \ + ip6_fw.o \ + ip6_forward.o \ + in6_ifattach.o \ + ip6_input.o \ + ip6_mroute.o \ + ip6_output.o \ + ipcomp_input.o \ + ipcomp_output.o \ + in6_proto.o \ + mld6.o \ + nd6.o \ + nd6_nbr.o \ + nd6_rtr.o \ + raw_ip6.o \ + route6.o 
\ + scope6.o \ + udp6_usrreq.o \ + key.o \ + keysock.o \ + keydb.o \ + des_setkey.o \ + uipc_mbuf.o \ + uipc_mbuf2.o \ + uipc_socket.o \ + uipc_socket2.o + +$(foreach file,$(OBJS_NO_SIGN_COMPARE),$(eval $(call add_perfile_cflags,$(file),-Wno-sign-compare))) + # # Directories for mig generated files # @@ -98,11 +169,11 @@ ${OBJS}: ${OBJSDEPS} LDOBJS = $(OBJS) -$(COMPONENT).o: $(LDOBJS) +$(COMPONENT).filelist: $(LDOBJS) @echo LDFILELIST $(COMPONENT) $(_v)( for obj in ${LDOBJS}; do \ echo $(TARGET)$(COMP_OBJ_DIR)/$(KERNEL_CONFIG)/$${obj}; \ - done; ) > $(COMPONENT).o + done; ) > $(COMPONENT).filelist MAKESYSCALLS = $(SRCROOT)/bsd/kern/makesyscalls.sh @@ -121,7 +192,7 @@ audit_kevents.c: $(SRCROOT)/bsd/kern/syscalls.master $(MAKESYSCALLS) do_depend: do_all $(_v)${MD} -u Makedep -f -d `ls *.d`; -do_all: $(COMPONENT).o +do_all: $(COMPONENT).filelist do_build_all: do_depend diff --git a/bsd/conf/Makefile.x86_64 b/bsd/conf/Makefile.x86_64 index 83b41e2dd..29811299a 100644 --- a/bsd/conf/Makefile.x86_64 +++ b/bsd/conf/Makefile.x86_64 @@ -2,46 +2,12 @@ #BEGIN Machine dependent Makefile fragment for x86_64 ###################################################################### -# files to build with certain warnings turned off +# Files to build with certain warnings turned off dis_tables.o_CFLAGS_ADD += -Wno-cast-qual fbt_x86.o_CFLAGS_ADD += -Wno-cast-qual - -# Enable -Werror for x86_64 builds -CFLAGS+=$(WERROR) -CWARNFLAGS= $(filter-out -Wbad-function-cast, $(CWARNFLAGS_STD)) - -# Objects that don't compile cleanly: -OBJS_NO_WERROR = \ - fifo_vnops.o \ - aescrypt.o \ - aeskey.o \ - des_setkey.o \ - sha2.o \ - if_ethersubr.o \ - if_media.o \ - kext_net.o \ - dhcp_options.o \ - in_bootp.o \ - krpc_subr.o \ - ux_exception.o \ - unix_startup.o \ - randomdev.o \ - vnode_pager.o \ - dp_backing_file.o \ - vm_unix.o \ - mem.o \ - km.o \ - init_sysent.o \ - drv_dep.o \ - sdt_x86.o \ - dtrace_isa.o \ - aes_modes.o - - -OBJS_WERROR=$(filter-out $(OBJS_NO_WERROR),$(OBJS)) - 
-$(OBJS_WERROR): WERROR=-Werror +# sha256 Files to build with -DSHA256_USE_ASSEMBLY=1 +sha2.o_CFLAGS_ADD += -DSHA256_USE_ASSEMBLY=1 ###################################################################### #END Machine dependent Makefile fragment for x86_64 diff --git a/bsd/conf/files b/bsd/conf/files index 92ea7269a..b3a7b10c4 100644 --- a/bsd/conf/files +++ b/bsd/conf/files @@ -107,6 +107,7 @@ OPTIONS/ipfw2 optional ipfw2 OPTIONS/ipfirewall optional ipfirewall OPTIONS/ipv6firewall optional ipv6firewall OPTIONS/tcpdebug optional tcpdebug +OPTIONS/if_bridge optional if_bridge OPTIONS/faith optional faith OPTIONS/gif optional gif OPTIONS/netat optional netat @@ -124,8 +125,6 @@ OPTIONS/hfs optional hfs OPTIONS/mfs optional mfs OPTIONS/fdesc optional fdesc OPTIONS/fifo optional fifo -OPTIONS/nullfs optional nullfs -OPTIONS/union optional union OPTIONS/devfs optional devfs OPTIONS/crypto optional crypto OPTIONS/allcrypto optional allcrypto @@ -183,13 +182,7 @@ bsd/vfs/vfs_fsevents.c standard bsd/miscfs/deadfs/dead_vnops.c standard bsd/miscfs/devfs/devfs_fdesc_support.c optional fdesc bsd/miscfs/fifofs/fifo_vnops.c optional fifo sockets -bsd/miscfs/nullfs/null_subr.c optional nullfs -bsd/miscfs/nullfs/null_vfsops.c optional nullfs -bsd/miscfs/nullfs/null_vnops.c optional nullfs bsd/miscfs/specfs/spec_vnops.c standard -bsd/miscfs/union/union_subr.c optional union -bsd/miscfs/union/union_vfsops.c optional union -bsd/miscfs/union/union_vnops.c optional union bsd/miscfs/devfs/devfs_tree.c optional devfs bsd/miscfs/devfs/devfs_vnops.c optional devfs @@ -199,10 +192,10 @@ bsd/kern/decmpfs.c standard bsd/net/bpf.c optional bpfilter bsd/net/bpf_filter.c optional bpfilter +bsd/net/if_bridge.c optional if_bridge +bsd/net/bridgestp.c optional if_bridge bsd/net/bsd_comp.c optional ppp_bsdcomp bsd/net/if.c optional networking -bsd/net/if_atmsubr.c optional atm -bsd/net/if_disc.c optional disc bsd/net/init.c optional sockets bsd/net/dlil.c optional networking bsd/net/ether_if_module.c 
optional ether @@ -210,7 +203,7 @@ bsd/net/ether_at_pr_module.c optional ether netat bsd/net/ether_inet_pr_module.c optional ether inet bsd/net/ether_inet6_pr_module.c optional ether inet6 bsd/net/if_loop.c optional loop -bsd/net/if_mib.c optional netmibs +bsd/net/if_mib.c optional networking bsd/net/if_sl.c optional sl bsd/net/if_tun.c optional tun bsd/net/if_vlan.c optional vlan @@ -224,9 +217,10 @@ bsd/net/raw_cb.c optional networking bsd/net/raw_usrreq.c optional networking bsd/net/route.c optional networking bsd/net/rtsock.c optional networking +bsd/net/netsrc.c optional networking +bsd/net/ntstat.c optional networking bsd/net/slcompress.c optional ppp bsd/net/slcompress.c optional sl -bsd/net/if_dummy.c optional dummy bsd/net/if_gif.c optional gif bsd/net/if_stf.c optional stf bsd/net/net_osdep.c optional sockets @@ -243,19 +237,21 @@ bsd/net/pf_norm.c optional pf bsd/net/pf_osfp.c optional pf bsd/net/pf_ruleset.c optional pf bsd/net/pf_table.c optional pf +bsd/net/if_llreach.c optional networking -bsd/netinet/if_atm.c optional atm bsd/netinet/igmp.c optional inet bsd/netinet/in.c optional inet bsd/netinet/in_dhcp.c optional inet bsd/netinet/dhcp_options.c optional inet bsd/netinet/in_arp.c optional inet +bsd/netinet/in_mcast.c optional inet bsd/netinet/in_pcb.c optional inet +bsd/netinet/in_pcblist.c optional inet bsd/netinet/in_proto.c optional inet bsd/netinet/in_rmx.c optional inet +bsd/netinet/in_tclass.c optional inet bsd/netinet/ip_divert.c optional ipdivert bsd/netinet/ip_dummynet.c optional dummynet -bsd/netinet/ip_flow.c optional inet bsd/netinet/ip_fw2.c optional ipfw2 bsd/netinet/ip_fw2_compat.c optional ipfw2 bsd/netinet/ip_icmp.c optional inet @@ -271,6 +267,8 @@ bsd/netinet/tcp_sack.c optional inet bsd/netinet/tcp_subr.c optional inet bsd/netinet/tcp_timer.c optional inet bsd/netinet/tcp_usrreq.c optional inet +bsd/netinet/tcp_newreno.c optional inet +bsd/netinet/tcp_ledbat.c optional inet bsd/netinet/udp_usrreq.c optional inet 
bsd/netinet/in_gif.c optional gif inet bsd/netinet/ip_ecn.c optional inet @@ -300,8 +298,8 @@ bsd/netinet6/in6_src.c optional inet6 bsd/netinet6/ipcomp_core.c optional ipsec bsd/netinet6/ipcomp_input.c optional ipsec bsd/netinet6/ipcomp_output.c optional ipsec +bsd/netinet6/in6_mcast.c optional inet6 bsd/netinet6/in6_pcb.c optional inet6 -bsd/netinet6/in6_prefix.c optional inet6 bsd/netinet6/in6_proto.c optional inet6 bsd/netinet6/in6_rmx.c optional inet6 bsd/netinet6/mld6.c optional inet6 @@ -313,6 +311,7 @@ bsd/netinet6/route6.c optional inet6 bsd/netinet6/scope6.c optional inet6 bsd/netinet6/udp6_output.c optional inet6 bsd/netinet6/udp6_usrreq.c optional inet6 +bsd/netinet6/ip6_id.c optional inet6 bsd/netkey/key.c optional ipsec bsd/netkey/key_debug.c optional ipsec @@ -443,7 +442,7 @@ bsd/hfs/hfs_vfsutils.c optional hfs bsd/hfs/hfs_vnops.c optional hfs bsd/hfs/hfs_xattr.c optional hfs bsd/hfs/MacOSStubs.c optional hfs -bsd/hfs/cprotect.c optional hfs +bsd/hfs/hfs_cprotect.c optional hfs bsd/hfs/rangelist.c optional hfs bsd/hfs/hfscommon/BTree/BTree.c optional hfs bsd/hfs/hfscommon/BTree/BTreeAllocate.c optional hfs @@ -457,6 +456,7 @@ bsd/hfs/hfscommon/Catalog/FileIDsServices.c optional hfs bsd/hfs/hfscommon/Misc/BTreeWrapper.c optional hfs bsd/hfs/hfscommon/Misc/FileExtentMapping.c optional hfs bsd/hfs/hfscommon/Misc/VolumeAllocation.c optional hfs +bsd/hfs/hfscommon/Misc/HybridAllocator.c optional hfs bsd/hfs/hfscommon/Unicode/UnicodeWrappers.c optional hfs bsd/security/audit/audit.c optional config_audit @@ -498,6 +498,7 @@ bsd/kern/kern_malloc.c standard bsd/kern/kern_mman.c standard bsd/kern/kern_panicinfo.c optional panic_info bsd/kern/kern_physio.c standard +bsd/kern/kern_priv.c standard bsd/kern/kern_proc.c standard bsd/kern/kern_prot.c standard bsd/kern/kern_resource.c standard @@ -556,6 +557,8 @@ bsd/kern/kpi_socketfilter.c optional sockets bsd/kern/pthread_support.c optional psynch bsd/kern/pthread_synch.c standard bsd/kern/proc_info.c standard 
+bsd/kern/process_policy.c standard +bsd/kern/vm_pressure.c standard bsd/kern/socket_info.c optional sockets bsd/vm/vnode_pager.c standard @@ -585,3 +588,4 @@ bsd/dev/dtrace/profile_prvd.c optional config_dtrace bsd/dev/dtrace/fasttrap.c optional config_dtrace bsd/kern/imageboot.c optional config_imageboot + diff --git a/bsd/conf/files.i386 b/bsd/conf/files.i386 index 424cc3e3e..331f7202d 100644 --- a/bsd/conf/files.i386 +++ b/bsd/conf/files.i386 @@ -14,10 +14,19 @@ bsd/dev/i386/systemcalls.c standard bsd/dev/i386/sysctl.c standard bsd/dev/i386/unix_signal.c standard bsd/dev/i386/munge.s standard -bsd/crypto/aes/i386/aes_x86_v2.s optional crypto -bsd/crypto/aes/i386/aes_modes.c optional crypto +bsd/crypto/aes/i386/AES.s optional crypto +bsd/crypto/aes/i386/aes_modes_asm.s optional crypto +bsd/crypto/aes/i386/aes_modes_hw.s optional crypto +bsd/crypto/aes/i386/aes_key_hw.s optional crypto +bsd/crypto/aes/i386/aes_crypt_hw.s optional crypto +bsd/crypto/aes/i386/aesxts_asm.s optional crypto +bsd/crypto/aes/i386/aesxts.c optional crypto +bsd/crypto/sha2/intel/sha256.s optional crypto +bsd/crypto/sha2/intel/sha256nossse3.s optional crypto + +# Lightly ifdef'd to support K64 DTrace bsd/dev/i386/dtrace_isa.c optional config_dtrace bsd/dev/i386/dtrace_subr_x86.c optional config_dtrace bsd/dev/i386/fbt_x86.c optional config_dtrace @@ -26,6 +35,11 @@ bsd/dev/i386/fasttrap_isa.c optional config_dtrace bsd/dev/i386/instr_size.c optional config_dtrace bsd/dev/i386/dis_tables.c optional config_dtrace +# Support for identifying MACF calouts with locks held +bsd/kern/policy_check.c optional config_macf + bsd/kern/bsd_stubs.c standard bsd/netinet/in_cksum.c optional inet + + diff --git a/bsd/conf/files.ppc b/bsd/conf/files.ppc deleted file mode 100644 index 57e8870a7..000000000 --- a/bsd/conf/files.ppc +++ /dev/null @@ -1,34 +0,0 @@ -OPTIONS/show_space optional show_space -OPTIONS/gdb optional gdb -OPTIONS/iplmeas optional iplmeas - -bsd/netinet/in_cksum.c optional inet - 
-bsd/dev/ppc/conf.c standard -bsd/dev/ppc/cons.c standard -bsd/dev/ppc/mem.c standard -bsd/dev/ppc/unix_signal.c standard -bsd/dev/ppc/ffs.s standard -bsd/dev/ppc/memmove.c standard -bsd/dev/ppc/machdep.c standard -bsd/dev/ppc/kern_machdep.c standard -bsd/dev/ppc/stubs.c standard -bsd/dev/ppc/systemcalls.c standard -bsd/dev/ppc/km.c standard -bsd/dev/ppc/xsumas.s standard -bsd/dev/ppc/sysctl.c standard -bsd/dev/ppc/munge.s standard -bsd/crypto/aes/ppc/aescrypt.c optional crypto -bsd/crypto/aes/ppc/aeskey.c optional crypto -bsd/crypto/aes/ppc/aestab.c optional crypto - - -bsd/dev/ppc/dtrace_isa.c optional config_dtrace -bsd/dev/ppc/dtrace_subr_ppc.c optional config_dtrace -bsd/dev/ppc/fbt_ppc.c optional config_dtrace -bsd/dev/ppc/sdt_ppc.c optional config_dtrace -bsd/dev/ppc/fasttrap_isa.c optional config_dtrace - -bsd/kern/bsd_stubs.c standard - - diff --git a/bsd/conf/files.x86_64 b/bsd/conf/files.x86_64 index 322174554..fcb3be604 100644 --- a/bsd/conf/files.x86_64 +++ b/bsd/conf/files.x86_64 @@ -15,9 +15,16 @@ bsd/dev/i386/sysctl.c standard bsd/dev/i386/unix_signal.c standard bsd/dev/x86_64/munge.s standard -bsd/crypto/aes/gen/aescrypt.c optional crypto -bsd/crypto/aes/gen/aeskey.c optional crypto -bsd/crypto/aes/gen/aestab.c optional crypto +bsd/crypto/aes/i386/AES.s optional crypto +bsd/crypto/aes/i386/aes_modes_asm.s optional crypto +bsd/crypto/aes/i386/aes_modes_hw.s optional crypto +bsd/crypto/aes/i386/aes_key_hw.s optional crypto +bsd/crypto/aes/i386/aes_crypt_hw.s optional crypto +bsd/crypto/aes/i386/aesxts_asm.s optional crypto +bsd/crypto/aes/i386/aesxts.c optional crypto + +bsd/crypto/sha2/intel/sha256.s optional crypto +bsd/crypto/sha2/intel/sha256nossse3.s optional crypto # Lightly ifdef'd to support K64 DTrace bsd/dev/i386/dtrace_isa.c optional config_dtrace @@ -28,6 +35,9 @@ bsd/dev/i386/fasttrap_isa.c optional config_dtrace bsd/dev/i386/instr_size.c optional config_dtrace bsd/dev/i386/dis_tables.c optional config_dtrace +# Support for identifying 
MACF calouts with locks held +bsd/kern/policy_check.c optional config_macf + bsd/kern/bsd_stubs.c standard bsd/netinet/in_cksum.c optional inet diff --git a/bsd/conf/param.c b/bsd/conf/param.c index 9aafb343c..95c01ffb5 100644 --- a/bsd/conf/param.c +++ b/bsd/conf/param.c @@ -91,7 +91,8 @@ int maxprocperuid = NPROC/2; int nprocs = 0; /* XXX */ //#define NTEXT (80 + NPROC / 8) /* actually the object cache */ -int desiredvnodes = CONFIG_VNODES; +int desiredvnodes = 0; /* desiredvnodes is set explicitly in unix_startup.c */ +uint32_t kern_maxvnodes = 0; /* global, to be read from the device tree */ #define MAXFILES (OPEN_MAX + 2048) int maxfiles = MAXFILES; diff --git a/bsd/conf/tools/Makefile b/bsd/conf/tools/Makefile deleted file mode 100644 index 4f9ccd553..000000000 --- a/bsd/conf/tools/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -SETUP_SUBDIRS = doconf - -COMP_SUBDIRS = doconf - -INST_SUBDIRS = \ - - -setup_build_all: - @echo "[ $(SOURCE) ] make setup_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - -do_build_all: - @echo "[ $(SOURCE) ] make do_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - -setup_build_install: - @echo "[ $(SOURCE) ] make setup_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - -do_build_install: - @echo "[ $(SOURCE) ] make do_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - -include $(MakeInc_rule) -include $(MakeInc_dir) - - diff --git a/bsd/conf/tools/doconf/Makefile b/bsd/conf/tools/doconf/Makefile deleted file mode 100644 index 7794a4ceb..000000000 --- a/bsd/conf/tools/doconf/Makefile +++ /dev/null @@ -1,49 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export 
MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -COMP_SUBDIRS = \ - -INST_SUBDIRS = \ - - -# -# Who and where -# -BINDIR= -ifneq ($(MACHINE_CONFIG), DEFAULT) -DSTDIR= $(strip $(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)_$(MACHINE_CONFIG)/$(COMPONENT)/) -else -DSTDIR= $(strip $(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)/$(COMPONENT)/) -endif -PROGRAM= $(DSTDIR)doconf - -# -# How to install it -# -IFLAGS= -c -m 555 - -$(PROGRAM): $(DSTDIR)% : $(SOURCE)%.csh - @-$(RM) $(RMFLAGS) $(notdir $(PROGRAM)).VERS - @sed -e "s/#PROGRAM.*/#`vers_string $(notdir $(PROGRAM))`/" \ - < $< >$(notdir $(PROGRAM)).VERS; - @install $(IFLAGS) $(notdir $(PROGRAM)).VERS $(PROGRAM); - @-$(RM) $(RMFLAGS) $(notdir $(PROGRAM)).VERS; - -do_build_setup: $(PROGRAM) - -do_build_all: - -setup_build_install: - -do_build_install: - -include $(MakeInc_rule) -include $(MakeInc_dir) - - diff --git a/bsd/crypto/Makefile b/bsd/crypto/Makefile index 0af469f52..ab0c4b986 100644 --- a/bsd/crypto/Makefile +++ b/bsd/crypto/Makefile @@ -16,18 +16,16 @@ INSTINC_SUBDIRS = \ sha2 -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ + aes INSTINC_SUBDIRS_X86_64 = \ + aes INSTINC_SUBDIRS_ARM = \ EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS_X86_64 = \ diff --git a/bsd/crypto/aes/Assert.c b/bsd/crypto/aes/Assert.c new file mode 100644 index 000000000..5ba9c4472 --- /dev/null +++ b/bsd/crypto/aes/Assert.c @@ -0,0 +1,34 @@ +/* This module exists solely to check compile-time assertions. It should be + compiled when building the project, and building should be terminated if + errors are encountered. However, any object it produces need not be + included in the build. +*/ + + +#include <stddef.h> + +#include "crypto/aes.h" +#include "Context.h" + +/* Declare CheckAssertion so that if any of the declarations below differ + from it, the compiler will report an error.
+*/ +extern char CheckAssertion[1]; + +/* Ensure that ContextKey is the offset of the ks member of the AES context + structures. +*/ +extern char CheckAssertion[ContextKey == offsetof(aes_encrypt_ctx, ks)]; +extern char CheckAssertion[ContextKey == offsetof(aes_decrypt_ctx, ks)]; + /* If these assertions fail, change the definition of ContextKey in + Context.h to match the offset of the ks field. + */ + +/* Ensure that ContextKeyLength is the offset of the inf member of the AES + context structures. +*/ +extern char CheckAssertion[ContextKeyLength == offsetof(aes_encrypt_ctx, inf)]; +extern char CheckAssertion[ContextKeyLength == offsetof(aes_decrypt_ctx, inf)]; + /* If these assertions fail, change the definition of ContextKeyLength in + Context.h to match the offset of the inf field. + */ diff --git a/bsd/crypto/aes/Makefile b/bsd/crypto/aes/Makefile index 026261c65..6b96dbd34 100644 --- a/bsd/crypto/aes/Makefile +++ b/bsd/crypto/aes/Makefile @@ -9,18 +9,16 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ + i386 INSTINC_SUBDIRS_X86_64 = \ + i386 INSTINC_SUBDIRS_ARM = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS_X86_64 = \ diff --git a/bsd/crypto/aes/aes.h b/bsd/crypto/aes/aes.h old mode 100644 new mode 100755 index eaba0a692..49c845da6 --- a/bsd/crypto/aes/aes.h +++ b/bsd/crypto/aes/aes.h @@ -80,14 +80,22 @@ typedef unsigned int aes_32t; #endif +#if 0 // defined (__i386__) || defined (__x86_64__) + +/* + It looks like no other code for i386/x86_64 uses the following definitions any more. + They are disabled here so that the C code in the gen/ directory can still be compiled for test/development purposes. + Note: this does not change anything in the i386/x86_64 kernel. + (The source code in i386/, mostly assembly, does not reference this header file.)
+ + cclee 10-20-2010 +*/ /* the character array 'inf' in the following structures is used */ /* to hold AES context information. This AES code uses cx->inf.b[0] */ /* to hold the number of rounds multiplied by 16. The other three */ /* elements can be used by code that implements additional modes */ -#if defined (__i386__) - #if defined( AES_ERR_CHK ) #define aes_rval int_ret #else @@ -166,7 +174,7 @@ aes_rval aes_encrypt_key256(const unsigned char *key, aes_encrypt_ctx cx[1]); aes_rval aes_encrypt_key(const unsigned char *key, int key_len, aes_encrypt_ctx cx[1]); #endif -#if defined (__i386__) +#if defined (__i386__) || defined (__x86_64__) aes_rval aes_encrypt(const unsigned char *in, unsigned char *out, const aes_encrypt_ctx cx[1]); #endif @@ -193,7 +201,7 @@ aes_rval aes_decrypt_key256(const unsigned char *key, aes_decrypt_ctx cx[1]); aes_rval aes_decrypt_key(const unsigned char *key, int key_len, aes_decrypt_ctx cx[1]); #endif -#if defined (__i386__) +#if defined (__i386__) || defined (__x86_64__) aes_rval aes_decrypt(const unsigned char *in, unsigned char *out, const aes_decrypt_ctx cx[1]); #endif diff --git a/bsd/crypto/aes/gen/Makefile b/bsd/crypto/aes/gen/Makefile index 7ea225c10..d32c71c39 100644 --- a/bsd/crypto/aes/gen/Makefile +++ b/bsd/crypto/aes/gen/Makefile @@ -9,14 +9,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ PRIVATE_DATAFILES = \ diff --git a/bsd/crypto/aes/gen/aesopt.h b/bsd/crypto/aes/gen/aesopt.h index 2b78eb920..fc28e4a48 100644 --- a/bsd/crypto/aes/gen/aesopt.h +++ b/bsd/crypto/aes/gen/aesopt.h @@ -585,12 +585,12 @@ #elif (ALGORITHM_BYTE_ORDER == PLATFORM_BYTE_ORDER) -#define word_in(x,c) (*((aes_32t*)(x)+(c))) +#define word_in(x,c) (*((const aes_32t*)(x)+(c))) #define word_out(x,c,v) (*((aes_32t*)(x)+(c)) = (v)) #else -#define word_in(x,c) aes_sw32(*((aes_32t*)(x)+(c))) +#define word_in(x,c) aes_sw32(*((const 
aes_32t*)(x)+(c))) #define word_out(x,c,v) (*((aes_32t*)(x)+(c)) = aes_sw32(v)) #endif diff --git a/bsd/crypto/aes/i386/AES.s b/bsd/crypto/aes/i386/AES.s new file mode 100644 index 000000000..9bf440a68 --- /dev/null +++ b/bsd/crypto/aes/i386/AES.s @@ -0,0 +1,143 @@ +/* AES.s -- Core AES routines for Intel processors. + + Written by Eric Postpischil, January 30, 2008. +*/ + + +/* We build these AES routines as a single module because the routines refer + to labels in Data.s and it is easier and faster to refer to them as local + labels. In my implementations of AES for CommonCrypto, both i386 and + x86_64 use position-independent code. For this in-kernel implementation, + i386 has been converted to absolute addressing, but x86_64 still uses PIC. + + A local label can be referred to with position-independent assembler + expressions such as "label-base(register)", where "base" is a local label + whose address has been loaded into "register". (On i386, this is typically + done with the idiom of a call to the next instruction and a pop of that + return address into a register.) Without local labels, the references must + be done using spaces for addresses of "lazy symbols" that are filled in by + the dynamic loader and loaded by the code that wants the address. + + So the various routines in other files are assembled here via #include + directives. +*/ +#include "Data.s" + + +#define TableSize (256*4) + /* Each of the arrays defined in Data.s except for the round constants + in _AESRcon is composed of four tables of 256 entries of four bytes + each. TableSize is the number of bytes in one of those four tables. + */ + + +// Include constants describing the AES context structures. +#include "Context.h" + + +/* Define a macro to select a value based on architecture. This reduces + some of the architecture conditionalization later in the source.
+*/ +#if defined __i386__ + #define Arch(i386, x86_64) i386 +#elif defined __x86_64__ + #define Arch(i386, x86_64) x86_64 +#endif + + +// Define an instruction for moving pointers. +#define movp Arch(movd, movd) + // Latter argument should be "movq", but the assembler uses "movd". + + +/* Rename the general registers. This makes it easier to keep track of them + and provides names for the "whole register" that are uniform between i386 + and x86_64. +*/ +#if defined __i386__ + #define r0 %eax // Available for any use. + #define r1 %ecx // Available for any use, some special purposes (loop). + #define r2 %edx // Available for any use. + #define r3 %ebx // Must be preserved by called routine. + #define r4 %esp // Stack pointer. + #define r5 %ebp // Frame pointer, must preserve, no bare indirect. + #define r6 %esi // Must be preserved by called routine. + #define r7 %edi // Must be preserved by called routine. +#elif defined __x86_64__ + #define r0 %rax // Available for any use. + #define r1 %rcx // Available for any use. + #define r2 %rdx // Available for any use. + #define r3 %rbx // Must be preserved by called routine. + #define r4 %rsp // Stack pointer. + #define r5 %rbp // Frame pointer. Must be preserved by called routine. + #define r6 %rsi // Available for any use. + #define r7 %rdi // Available for any use. + #define r8 %r8 // Available for any use. + #define r9 %r9 // Available for any use. + #define r10 %r10 // Available for any use. + #define r11 %r11 // Available for any use. + #define r12 %r12 // Must be preserved by called routine. + #define r13 %r13 // Must be preserved by called routine. + #define r14 %r14 // Must be preserved by called routine. + #define r15 %r15 // Must be preserved by called routine. +#else + #error "Unknown architecture." +#endif + +// Define names for parts of registers. + +#define r0d %eax // Low 32 bits of r0. +#define r1d %ecx // Low 32 bits of r1. +#define r2d %edx // Low 32 bits of r2. +#define r3d %ebx // Low 32 bits of r3. 
+#define r5d %ebp // Low 32 bits of r5. +#define r6d %esi // Low 32 bits of r6. +#define r7d %edi // Low 32 bits of r7. +#define r8d %r8d // Low 32 bits of r8. +#define r9d %r9d // Low 32 bits of r9. +#define r11d %r11d // Low 32 bits of r11. + +#define r0l %al // Low byte of r0. +#define r1l %cl // Low byte of r1. +#define r2l %dl // Low byte of r2. +#define r3l %bl // Low byte of r3. +#define r5l %bpl // Low byte of r5. + +#define r0h %ah // Second lowest byte of r0. +#define r1h %ch // Second lowest byte of r1. +#define r2h %dh // Second lowest byte of r2. +#define r3h %bh // Second lowest byte of r3. + + + .text + + +// Define encryption routine, _AESEncryptWithExpandedKey +#define Select 0 +#include "EncryptDecrypt.s" +#undef Select + + +// Define decryption routine, _AESDecryptWithExpandedKey +#define Select 1 +#include "EncryptDecrypt.s" +#undef Select + +// Define encryption routine, _AESEncryptWithExpandedKey +#define Select 2 +#include "EncryptDecrypt.s" +#undef Select + + +// Define decryption routine, _AESDecryptWithExpandedKey +#define Select 3 +#include "EncryptDecrypt.s" +#undef Select + + +// Define key expansion routine for encryption, _AESExpandKeyForEncryption. +#include "ExpandKeyForEncryption.s" + + +// Define key expansion for decryption routine, _AESExpandKeyForDecryption. +#include "ExpandKeyForDecryption.s" diff --git a/bsd/crypto/aes/i386/Context.h b/bsd/crypto/aes/i386/Context.h new file mode 100644 index 000000000..f53cb9514 --- /dev/null +++ b/bsd/crypto/aes/i386/Context.h @@ -0,0 +1,9 @@ +// Define byte offset of key within context structure. +#define ContextKey 0 + +/* Define byte offset of key length within context structure. The number + stored there is the number of bytes from the start of the first round key + to the start of the last round key. That is 16 less than the number of + bytes in the entire key. 
+*/ +#define ContextKeyLength 240 diff --git a/bsd/crypto/aes/i386/Data.mk b/bsd/crypto/aes/i386/Data.mk new file mode 100644 index 000000000..4b55d630f --- /dev/null +++ b/bsd/crypto/aes/i386/Data.mk @@ -0,0 +1,30 @@ +default: + @echo "This makefile builds Data.s, which contains constant data for the" + @echo "AES implementation. This file does not normally need to be rebuilt," + @echo "so it is checked into the source code repository. It should be" + @echo "changed only when the implementation changes and needs data in a" + @echo "different format. (This file can also build a C version, Data.c," + @echo "but that is not currently in use.)" + @echo "" + @echo "To rebuild the file(s), execute \"make -f Data.mk all\"." + +.PHONY: all clean +Targets = Data.s +all: $(Targets) + +CFLAGS += -O3 -std=c99 -Wmost -Werror + +.INTERMEDIATE: MakeData +MakeData: MakeData.c + +# Do not leave bad output files if the build fails. +.DELETE_ON_ERROR: $(Targets) + +Data.c: MakeData + ./$< >$@ C + +Data.s: MakeData + ./$< >$@ Intel + +clean: + -rm $(Targets) diff --git a/bsd/crypto/aes/i386/Data.s b/bsd/crypto/aes/i386/Data.s new file mode 100644 index 000000000..d330462d0 --- /dev/null +++ b/bsd/crypto/aes/i386/Data.s @@ -0,0 +1,5196 @@ +// This file was generated by MakeData.c. + + + .const + + +// Round constants. + .globl _AESRcon + .private_extern _AESRcon +_AESRcon: + .byte 0 // Not used, included for indexing simplicity. + .byte 0x01 + .byte 0x02 + .byte 0x04 + .byte 0x08 + .byte 0x10 + .byte 0x20 + .byte 0x40 + .byte 0x80 + .byte 0x1b + .byte 0x36 + + +// Tables for InvMixColumn. + .globl _AESInvMixColumnTable + .private_extern _AESInvMixColumnTable + .align 2 +_AESInvMixColumnTable: + // Table 0. 
+ .long 0x00000000 + .long 0x0b0d090e + .long 0x161a121c + .long 0x1d171b12 + .long 0x2c342438 + .long 0x27392d36 + .long 0x3a2e3624 + .long 0x31233f2a + .long 0x58684870 + .long 0x5365417e + .long 0x4e725a6c + .long 0x457f5362 + .long 0x745c6c48 + .long 0x7f516546 + .long 0x62467e54 + .long 0x694b775a + .long 0xb0d090e0 + .long 0xbbdd99ee + .long 0xa6ca82fc + .long 0xadc78bf2 + .long 0x9ce4b4d8 + .long 0x97e9bdd6 + .long 0x8afea6c4 + .long 0x81f3afca + .long 0xe8b8d890 + .long 0xe3b5d19e + .long 0xfea2ca8c + .long 0xf5afc382 + .long 0xc48cfca8 + .long 0xcf81f5a6 + .long 0xd296eeb4 + .long 0xd99be7ba + .long 0x7bbb3bdb + .long 0x70b632d5 + .long 0x6da129c7 + .long 0x66ac20c9 + .long 0x578f1fe3 + .long 0x5c8216ed + .long 0x41950dff + .long 0x4a9804f1 + .long 0x23d373ab + .long 0x28de7aa5 + .long 0x35c961b7 + .long 0x3ec468b9 + .long 0x0fe75793 + .long 0x04ea5e9d + .long 0x19fd458f + .long 0x12f04c81 + .long 0xcb6bab3b + .long 0xc066a235 + .long 0xdd71b927 + .long 0xd67cb029 + .long 0xe75f8f03 + .long 0xec52860d + .long 0xf1459d1f + .long 0xfa489411 + .long 0x9303e34b + .long 0x980eea45 + .long 0x8519f157 + .long 0x8e14f859 + .long 0xbf37c773 + .long 0xb43ace7d + .long 0xa92dd56f + .long 0xa220dc61 + .long 0xf66d76ad + .long 0xfd607fa3 + .long 0xe07764b1 + .long 0xeb7a6dbf + .long 0xda595295 + .long 0xd1545b9b + .long 0xcc434089 + .long 0xc74e4987 + .long 0xae053edd + .long 0xa50837d3 + .long 0xb81f2cc1 + .long 0xb31225cf + .long 0x82311ae5 + .long 0x893c13eb + .long 0x942b08f9 + .long 0x9f2601f7 + .long 0x46bde64d + .long 0x4db0ef43 + .long 0x50a7f451 + .long 0x5baafd5f + .long 0x6a89c275 + .long 0x6184cb7b + .long 0x7c93d069 + .long 0x779ed967 + .long 0x1ed5ae3d + .long 0x15d8a733 + .long 0x08cfbc21 + .long 0x03c2b52f + .long 0x32e18a05 + .long 0x39ec830b + .long 0x24fb9819 + .long 0x2ff69117 + .long 0x8dd64d76 + .long 0x86db4478 + .long 0x9bcc5f6a + .long 0x90c15664 + .long 0xa1e2694e + .long 0xaaef6040 + .long 0xb7f87b52 + .long 0xbcf5725c + .long 0xd5be0506 + 
.long 0xdeb30c08 + .long 0xc3a4171a + .long 0xc8a91e14 + .long 0xf98a213e + .long 0xf2872830 + .long 0xef903322 + .long 0xe49d3a2c + .long 0x3d06dd96 + .long 0x360bd498 + .long 0x2b1ccf8a + .long 0x2011c684 + .long 0x1132f9ae + .long 0x1a3ff0a0 + .long 0x0728ebb2 + .long 0x0c25e2bc + .long 0x656e95e6 + .long 0x6e639ce8 + .long 0x737487fa + .long 0x78798ef4 + .long 0x495ab1de + .long 0x4257b8d0 + .long 0x5f40a3c2 + .long 0x544daacc + .long 0xf7daec41 + .long 0xfcd7e54f + .long 0xe1c0fe5d + .long 0xeacdf753 + .long 0xdbeec879 + .long 0xd0e3c177 + .long 0xcdf4da65 + .long 0xc6f9d36b + .long 0xafb2a431 + .long 0xa4bfad3f + .long 0xb9a8b62d + .long 0xb2a5bf23 + .long 0x83868009 + .long 0x888b8907 + .long 0x959c9215 + .long 0x9e919b1b + .long 0x470a7ca1 + .long 0x4c0775af + .long 0x51106ebd + .long 0x5a1d67b3 + .long 0x6b3e5899 + .long 0x60335197 + .long 0x7d244a85 + .long 0x7629438b + .long 0x1f6234d1 + .long 0x146f3ddf + .long 0x097826cd + .long 0x02752fc3 + .long 0x335610e9 + .long 0x385b19e7 + .long 0x254c02f5 + .long 0x2e410bfb + .long 0x8c61d79a + .long 0x876cde94 + .long 0x9a7bc586 + .long 0x9176cc88 + .long 0xa055f3a2 + .long 0xab58faac + .long 0xb64fe1be + .long 0xbd42e8b0 + .long 0xd4099fea + .long 0xdf0496e4 + .long 0xc2138df6 + .long 0xc91e84f8 + .long 0xf83dbbd2 + .long 0xf330b2dc + .long 0xee27a9ce + .long 0xe52aa0c0 + .long 0x3cb1477a + .long 0x37bc4e74 + .long 0x2aab5566 + .long 0x21a65c68 + .long 0x10856342 + .long 0x1b886a4c + .long 0x069f715e + .long 0x0d927850 + .long 0x64d90f0a + .long 0x6fd40604 + .long 0x72c31d16 + .long 0x79ce1418 + .long 0x48ed2b32 + .long 0x43e0223c + .long 0x5ef7392e + .long 0x55fa3020 + .long 0x01b79aec + .long 0x0aba93e2 + .long 0x17ad88f0 + .long 0x1ca081fe + .long 0x2d83bed4 + .long 0x268eb7da + .long 0x3b99acc8 + .long 0x3094a5c6 + .long 0x59dfd29c + .long 0x52d2db92 + .long 0x4fc5c080 + .long 0x44c8c98e + .long 0x75ebf6a4 + .long 0x7ee6ffaa + .long 0x63f1e4b8 + .long 0x68fcedb6 + .long 0xb1670a0c + .long 0xba6a0302 + 
.long 0xa77d1810 + .long 0xac70111e + .long 0x9d532e34 + .long 0x965e273a + .long 0x8b493c28 + .long 0x80443526 + .long 0xe90f427c + .long 0xe2024b72 + .long 0xff155060 + .long 0xf418596e + .long 0xc53b6644 + .long 0xce366f4a + .long 0xd3217458 + .long 0xd82c7d56 + .long 0x7a0ca137 + .long 0x7101a839 + .long 0x6c16b32b + .long 0x671bba25 + .long 0x5638850f + .long 0x5d358c01 + .long 0x40229713 + .long 0x4b2f9e1d + .long 0x2264e947 + .long 0x2969e049 + .long 0x347efb5b + .long 0x3f73f255 + .long 0x0e50cd7f + .long 0x055dc471 + .long 0x184adf63 + .long 0x1347d66d + .long 0xcadc31d7 + .long 0xc1d138d9 + .long 0xdcc623cb + .long 0xd7cb2ac5 + .long 0xe6e815ef + .long 0xede51ce1 + .long 0xf0f207f3 + .long 0xfbff0efd + .long 0x92b479a7 + .long 0x99b970a9 + .long 0x84ae6bbb + .long 0x8fa362b5 + .long 0xbe805d9f + .long 0xb58d5491 + .long 0xa89a4f83 + .long 0xa397468d + // Table 1. + .long 0x00000000 + .long 0x0d090e0b + .long 0x1a121c16 + .long 0x171b121d + .long 0x3424382c + .long 0x392d3627 + .long 0x2e36243a + .long 0x233f2a31 + .long 0x68487058 + .long 0x65417e53 + .long 0x725a6c4e + .long 0x7f536245 + .long 0x5c6c4874 + .long 0x5165467f + .long 0x467e5462 + .long 0x4b775a69 + .long 0xd090e0b0 + .long 0xdd99eebb + .long 0xca82fca6 + .long 0xc78bf2ad + .long 0xe4b4d89c + .long 0xe9bdd697 + .long 0xfea6c48a + .long 0xf3afca81 + .long 0xb8d890e8 + .long 0xb5d19ee3 + .long 0xa2ca8cfe + .long 0xafc382f5 + .long 0x8cfca8c4 + .long 0x81f5a6cf + .long 0x96eeb4d2 + .long 0x9be7bad9 + .long 0xbb3bdb7b + .long 0xb632d570 + .long 0xa129c76d + .long 0xac20c966 + .long 0x8f1fe357 + .long 0x8216ed5c + .long 0x950dff41 + .long 0x9804f14a + .long 0xd373ab23 + .long 0xde7aa528 + .long 0xc961b735 + .long 0xc468b93e + .long 0xe757930f + .long 0xea5e9d04 + .long 0xfd458f19 + .long 0xf04c8112 + .long 0x6bab3bcb + .long 0x66a235c0 + .long 0x71b927dd + .long 0x7cb029d6 + .long 0x5f8f03e7 + .long 0x52860dec + .long 0x459d1ff1 + .long 0x489411fa + .long 0x03e34b93 + .long 0x0eea4598 + .long 
0x19f15785 + .long 0x14f8598e + .long 0x37c773bf + .long 0x3ace7db4 + .long 0x2dd56fa9 + .long 0x20dc61a2 + .long 0x6d76adf6 + .long 0x607fa3fd + .long 0x7764b1e0 + .long 0x7a6dbfeb + .long 0x595295da + .long 0x545b9bd1 + .long 0x434089cc + .long 0x4e4987c7 + .long 0x053eddae + .long 0x0837d3a5 + .long 0x1f2cc1b8 + .long 0x1225cfb3 + .long 0x311ae582 + .long 0x3c13eb89 + .long 0x2b08f994 + .long 0x2601f79f + .long 0xbde64d46 + .long 0xb0ef434d + .long 0xa7f45150 + .long 0xaafd5f5b + .long 0x89c2756a + .long 0x84cb7b61 + .long 0x93d0697c + .long 0x9ed96777 + .long 0xd5ae3d1e + .long 0xd8a73315 + .long 0xcfbc2108 + .long 0xc2b52f03 + .long 0xe18a0532 + .long 0xec830b39 + .long 0xfb981924 + .long 0xf691172f + .long 0xd64d768d + .long 0xdb447886 + .long 0xcc5f6a9b + .long 0xc1566490 + .long 0xe2694ea1 + .long 0xef6040aa + .long 0xf87b52b7 + .long 0xf5725cbc + .long 0xbe0506d5 + .long 0xb30c08de + .long 0xa4171ac3 + .long 0xa91e14c8 + .long 0x8a213ef9 + .long 0x872830f2 + .long 0x903322ef + .long 0x9d3a2ce4 + .long 0x06dd963d + .long 0x0bd49836 + .long 0x1ccf8a2b + .long 0x11c68420 + .long 0x32f9ae11 + .long 0x3ff0a01a + .long 0x28ebb207 + .long 0x25e2bc0c + .long 0x6e95e665 + .long 0x639ce86e + .long 0x7487fa73 + .long 0x798ef478 + .long 0x5ab1de49 + .long 0x57b8d042 + .long 0x40a3c25f + .long 0x4daacc54 + .long 0xdaec41f7 + .long 0xd7e54ffc + .long 0xc0fe5de1 + .long 0xcdf753ea + .long 0xeec879db + .long 0xe3c177d0 + .long 0xf4da65cd + .long 0xf9d36bc6 + .long 0xb2a431af + .long 0xbfad3fa4 + .long 0xa8b62db9 + .long 0xa5bf23b2 + .long 0x86800983 + .long 0x8b890788 + .long 0x9c921595 + .long 0x919b1b9e + .long 0x0a7ca147 + .long 0x0775af4c + .long 0x106ebd51 + .long 0x1d67b35a + .long 0x3e58996b + .long 0x33519760 + .long 0x244a857d + .long 0x29438b76 + .long 0x6234d11f + .long 0x6f3ddf14 + .long 0x7826cd09 + .long 0x752fc302 + .long 0x5610e933 + .long 0x5b19e738 + .long 0x4c02f525 + .long 0x410bfb2e + .long 0x61d79a8c + .long 0x6cde9487 + .long 0x7bc5869a + .long 
0x76cc8891 + .long 0x55f3a2a0 + .long 0x58faacab + .long 0x4fe1beb6 + .long 0x42e8b0bd + .long 0x099fead4 + .long 0x0496e4df + .long 0x138df6c2 + .long 0x1e84f8c9 + .long 0x3dbbd2f8 + .long 0x30b2dcf3 + .long 0x27a9ceee + .long 0x2aa0c0e5 + .long 0xb1477a3c + .long 0xbc4e7437 + .long 0xab55662a + .long 0xa65c6821 + .long 0x85634210 + .long 0x886a4c1b + .long 0x9f715e06 + .long 0x9278500d + .long 0xd90f0a64 + .long 0xd406046f + .long 0xc31d1672 + .long 0xce141879 + .long 0xed2b3248 + .long 0xe0223c43 + .long 0xf7392e5e + .long 0xfa302055 + .long 0xb79aec01 + .long 0xba93e20a + .long 0xad88f017 + .long 0xa081fe1c + .long 0x83bed42d + .long 0x8eb7da26 + .long 0x99acc83b + .long 0x94a5c630 + .long 0xdfd29c59 + .long 0xd2db9252 + .long 0xc5c0804f + .long 0xc8c98e44 + .long 0xebf6a475 + .long 0xe6ffaa7e + .long 0xf1e4b863 + .long 0xfcedb668 + .long 0x670a0cb1 + .long 0x6a0302ba + .long 0x7d1810a7 + .long 0x70111eac + .long 0x532e349d + .long 0x5e273a96 + .long 0x493c288b + .long 0x44352680 + .long 0x0f427ce9 + .long 0x024b72e2 + .long 0x155060ff + .long 0x18596ef4 + .long 0x3b6644c5 + .long 0x366f4ace + .long 0x217458d3 + .long 0x2c7d56d8 + .long 0x0ca1377a + .long 0x01a83971 + .long 0x16b32b6c + .long 0x1bba2567 + .long 0x38850f56 + .long 0x358c015d + .long 0x22971340 + .long 0x2f9e1d4b + .long 0x64e94722 + .long 0x69e04929 + .long 0x7efb5b34 + .long 0x73f2553f + .long 0x50cd7f0e + .long 0x5dc47105 + .long 0x4adf6318 + .long 0x47d66d13 + .long 0xdc31d7ca + .long 0xd138d9c1 + .long 0xc623cbdc + .long 0xcb2ac5d7 + .long 0xe815efe6 + .long 0xe51ce1ed + .long 0xf207f3f0 + .long 0xff0efdfb + .long 0xb479a792 + .long 0xb970a999 + .long 0xae6bbb84 + .long 0xa362b58f + .long 0x805d9fbe + .long 0x8d5491b5 + .long 0x9a4f83a8 + .long 0x97468da3 + // Table 2. 
+ .long 0x00000000 + .long 0x090e0b0d + .long 0x121c161a + .long 0x1b121d17 + .long 0x24382c34 + .long 0x2d362739 + .long 0x36243a2e + .long 0x3f2a3123 + .long 0x48705868 + .long 0x417e5365 + .long 0x5a6c4e72 + .long 0x5362457f + .long 0x6c48745c + .long 0x65467f51 + .long 0x7e546246 + .long 0x775a694b + .long 0x90e0b0d0 + .long 0x99eebbdd + .long 0x82fca6ca + .long 0x8bf2adc7 + .long 0xb4d89ce4 + .long 0xbdd697e9 + .long 0xa6c48afe + .long 0xafca81f3 + .long 0xd890e8b8 + .long 0xd19ee3b5 + .long 0xca8cfea2 + .long 0xc382f5af + .long 0xfca8c48c + .long 0xf5a6cf81 + .long 0xeeb4d296 + .long 0xe7bad99b + .long 0x3bdb7bbb + .long 0x32d570b6 + .long 0x29c76da1 + .long 0x20c966ac + .long 0x1fe3578f + .long 0x16ed5c82 + .long 0x0dff4195 + .long 0x04f14a98 + .long 0x73ab23d3 + .long 0x7aa528de + .long 0x61b735c9 + .long 0x68b93ec4 + .long 0x57930fe7 + .long 0x5e9d04ea + .long 0x458f19fd + .long 0x4c8112f0 + .long 0xab3bcb6b + .long 0xa235c066 + .long 0xb927dd71 + .long 0xb029d67c + .long 0x8f03e75f + .long 0x860dec52 + .long 0x9d1ff145 + .long 0x9411fa48 + .long 0xe34b9303 + .long 0xea45980e + .long 0xf1578519 + .long 0xf8598e14 + .long 0xc773bf37 + .long 0xce7db43a + .long 0xd56fa92d + .long 0xdc61a220 + .long 0x76adf66d + .long 0x7fa3fd60 + .long 0x64b1e077 + .long 0x6dbfeb7a + .long 0x5295da59 + .long 0x5b9bd154 + .long 0x4089cc43 + .long 0x4987c74e + .long 0x3eddae05 + .long 0x37d3a508 + .long 0x2cc1b81f + .long 0x25cfb312 + .long 0x1ae58231 + .long 0x13eb893c + .long 0x08f9942b + .long 0x01f79f26 + .long 0xe64d46bd + .long 0xef434db0 + .long 0xf45150a7 + .long 0xfd5f5baa + .long 0xc2756a89 + .long 0xcb7b6184 + .long 0xd0697c93 + .long 0xd967779e + .long 0xae3d1ed5 + .long 0xa73315d8 + .long 0xbc2108cf + .long 0xb52f03c2 + .long 0x8a0532e1 + .long 0x830b39ec + .long 0x981924fb + .long 0x91172ff6 + .long 0x4d768dd6 + .long 0x447886db + .long 0x5f6a9bcc + .long 0x566490c1 + .long 0x694ea1e2 + .long 0x6040aaef + .long 0x7b52b7f8 + .long 0x725cbcf5 + .long 0x0506d5be + 
.long 0x0c08deb3 + .long 0x171ac3a4 + .long 0x1e14c8a9 + .long 0x213ef98a + .long 0x2830f287 + .long 0x3322ef90 + .long 0x3a2ce49d + .long 0xdd963d06 + .long 0xd498360b + .long 0xcf8a2b1c + .long 0xc6842011 + .long 0xf9ae1132 + .long 0xf0a01a3f + .long 0xebb20728 + .long 0xe2bc0c25 + .long 0x95e6656e + .long 0x9ce86e63 + .long 0x87fa7374 + .long 0x8ef47879 + .long 0xb1de495a + .long 0xb8d04257 + .long 0xa3c25f40 + .long 0xaacc544d + .long 0xec41f7da + .long 0xe54ffcd7 + .long 0xfe5de1c0 + .long 0xf753eacd + .long 0xc879dbee + .long 0xc177d0e3 + .long 0xda65cdf4 + .long 0xd36bc6f9 + .long 0xa431afb2 + .long 0xad3fa4bf + .long 0xb62db9a8 + .long 0xbf23b2a5 + .long 0x80098386 + .long 0x8907888b + .long 0x9215959c + .long 0x9b1b9e91 + .long 0x7ca1470a + .long 0x75af4c07 + .long 0x6ebd5110 + .long 0x67b35a1d + .long 0x58996b3e + .long 0x51976033 + .long 0x4a857d24 + .long 0x438b7629 + .long 0x34d11f62 + .long 0x3ddf146f + .long 0x26cd0978 + .long 0x2fc30275 + .long 0x10e93356 + .long 0x19e7385b + .long 0x02f5254c + .long 0x0bfb2e41 + .long 0xd79a8c61 + .long 0xde94876c + .long 0xc5869a7b + .long 0xcc889176 + .long 0xf3a2a055 + .long 0xfaacab58 + .long 0xe1beb64f + .long 0xe8b0bd42 + .long 0x9fead409 + .long 0x96e4df04 + .long 0x8df6c213 + .long 0x84f8c91e + .long 0xbbd2f83d + .long 0xb2dcf330 + .long 0xa9ceee27 + .long 0xa0c0e52a + .long 0x477a3cb1 + .long 0x4e7437bc + .long 0x55662aab + .long 0x5c6821a6 + .long 0x63421085 + .long 0x6a4c1b88 + .long 0x715e069f + .long 0x78500d92 + .long 0x0f0a64d9 + .long 0x06046fd4 + .long 0x1d1672c3 + .long 0x141879ce + .long 0x2b3248ed + .long 0x223c43e0 + .long 0x392e5ef7 + .long 0x302055fa + .long 0x9aec01b7 + .long 0x93e20aba + .long 0x88f017ad + .long 0x81fe1ca0 + .long 0xbed42d83 + .long 0xb7da268e + .long 0xacc83b99 + .long 0xa5c63094 + .long 0xd29c59df + .long 0xdb9252d2 + .long 0xc0804fc5 + .long 0xc98e44c8 + .long 0xf6a475eb + .long 0xffaa7ee6 + .long 0xe4b863f1 + .long 0xedb668fc + .long 0x0a0cb167 + .long 0x0302ba6a + 
.long 0x1810a77d + .long 0x111eac70 + .long 0x2e349d53 + .long 0x273a965e + .long 0x3c288b49 + .long 0x35268044 + .long 0x427ce90f + .long 0x4b72e202 + .long 0x5060ff15 + .long 0x596ef418 + .long 0x6644c53b + .long 0x6f4ace36 + .long 0x7458d321 + .long 0x7d56d82c + .long 0xa1377a0c + .long 0xa8397101 + .long 0xb32b6c16 + .long 0xba25671b + .long 0x850f5638 + .long 0x8c015d35 + .long 0x97134022 + .long 0x9e1d4b2f + .long 0xe9472264 + .long 0xe0492969 + .long 0xfb5b347e + .long 0xf2553f73 + .long 0xcd7f0e50 + .long 0xc471055d + .long 0xdf63184a + .long 0xd66d1347 + .long 0x31d7cadc + .long 0x38d9c1d1 + .long 0x23cbdcc6 + .long 0x2ac5d7cb + .long 0x15efe6e8 + .long 0x1ce1ede5 + .long 0x07f3f0f2 + .long 0x0efdfbff + .long 0x79a792b4 + .long 0x70a999b9 + .long 0x6bbb84ae + .long 0x62b58fa3 + .long 0x5d9fbe80 + .long 0x5491b58d + .long 0x4f83a89a + .long 0x468da397 + // Table 3. + .long 0x00000000 + .long 0x0e0b0d09 + .long 0x1c161a12 + .long 0x121d171b + .long 0x382c3424 + .long 0x3627392d + .long 0x243a2e36 + .long 0x2a31233f + .long 0x70586848 + .long 0x7e536541 + .long 0x6c4e725a + .long 0x62457f53 + .long 0x48745c6c + .long 0x467f5165 + .long 0x5462467e + .long 0x5a694b77 + .long 0xe0b0d090 + .long 0xeebbdd99 + .long 0xfca6ca82 + .long 0xf2adc78b + .long 0xd89ce4b4 + .long 0xd697e9bd + .long 0xc48afea6 + .long 0xca81f3af + .long 0x90e8b8d8 + .long 0x9ee3b5d1 + .long 0x8cfea2ca + .long 0x82f5afc3 + .long 0xa8c48cfc + .long 0xa6cf81f5 + .long 0xb4d296ee + .long 0xbad99be7 + .long 0xdb7bbb3b + .long 0xd570b632 + .long 0xc76da129 + .long 0xc966ac20 + .long 0xe3578f1f + .long 0xed5c8216 + .long 0xff41950d + .long 0xf14a9804 + .long 0xab23d373 + .long 0xa528de7a + .long 0xb735c961 + .long 0xb93ec468 + .long 0x930fe757 + .long 0x9d04ea5e + .long 0x8f19fd45 + .long 0x8112f04c + .long 0x3bcb6bab + .long 0x35c066a2 + .long 0x27dd71b9 + .long 0x29d67cb0 + .long 0x03e75f8f + .long 0x0dec5286 + .long 0x1ff1459d + .long 0x11fa4894 + .long 0x4b9303e3 + .long 0x45980eea + .long 
0x578519f1 + .long 0x598e14f8 + .long 0x73bf37c7 + .long 0x7db43ace + .long 0x6fa92dd5 + .long 0x61a220dc + .long 0xadf66d76 + .long 0xa3fd607f + .long 0xb1e07764 + .long 0xbfeb7a6d + .long 0x95da5952 + .long 0x9bd1545b + .long 0x89cc4340 + .long 0x87c74e49 + .long 0xddae053e + .long 0xd3a50837 + .long 0xc1b81f2c + .long 0xcfb31225 + .long 0xe582311a + .long 0xeb893c13 + .long 0xf9942b08 + .long 0xf79f2601 + .long 0x4d46bde6 + .long 0x434db0ef + .long 0x5150a7f4 + .long 0x5f5baafd + .long 0x756a89c2 + .long 0x7b6184cb + .long 0x697c93d0 + .long 0x67779ed9 + .long 0x3d1ed5ae + .long 0x3315d8a7 + .long 0x2108cfbc + .long 0x2f03c2b5 + .long 0x0532e18a + .long 0x0b39ec83 + .long 0x1924fb98 + .long 0x172ff691 + .long 0x768dd64d + .long 0x7886db44 + .long 0x6a9bcc5f + .long 0x6490c156 + .long 0x4ea1e269 + .long 0x40aaef60 + .long 0x52b7f87b + .long 0x5cbcf572 + .long 0x06d5be05 + .long 0x08deb30c + .long 0x1ac3a417 + .long 0x14c8a91e + .long 0x3ef98a21 + .long 0x30f28728 + .long 0x22ef9033 + .long 0x2ce49d3a + .long 0x963d06dd + .long 0x98360bd4 + .long 0x8a2b1ccf + .long 0x842011c6 + .long 0xae1132f9 + .long 0xa01a3ff0 + .long 0xb20728eb + .long 0xbc0c25e2 + .long 0xe6656e95 + .long 0xe86e639c + .long 0xfa737487 + .long 0xf478798e + .long 0xde495ab1 + .long 0xd04257b8 + .long 0xc25f40a3 + .long 0xcc544daa + .long 0x41f7daec + .long 0x4ffcd7e5 + .long 0x5de1c0fe + .long 0x53eacdf7 + .long 0x79dbeec8 + .long 0x77d0e3c1 + .long 0x65cdf4da + .long 0x6bc6f9d3 + .long 0x31afb2a4 + .long 0x3fa4bfad + .long 0x2db9a8b6 + .long 0x23b2a5bf + .long 0x09838680 + .long 0x07888b89 + .long 0x15959c92 + .long 0x1b9e919b + .long 0xa1470a7c + .long 0xaf4c0775 + .long 0xbd51106e + .long 0xb35a1d67 + .long 0x996b3e58 + .long 0x97603351 + .long 0x857d244a + .long 0x8b762943 + .long 0xd11f6234 + .long 0xdf146f3d + .long 0xcd097826 + .long 0xc302752f + .long 0xe9335610 + .long 0xe7385b19 + .long 0xf5254c02 + .long 0xfb2e410b + .long 0x9a8c61d7 + .long 0x94876cde + .long 0x869a7bc5 + .long 
0x889176cc + .long 0xa2a055f3 + .long 0xacab58fa + .long 0xbeb64fe1 + .long 0xb0bd42e8 + .long 0xead4099f + .long 0xe4df0496 + .long 0xf6c2138d + .long 0xf8c91e84 + .long 0xd2f83dbb + .long 0xdcf330b2 + .long 0xceee27a9 + .long 0xc0e52aa0 + .long 0x7a3cb147 + .long 0x7437bc4e + .long 0x662aab55 + .long 0x6821a65c + .long 0x42108563 + .long 0x4c1b886a + .long 0x5e069f71 + .long 0x500d9278 + .long 0x0a64d90f + .long 0x046fd406 + .long 0x1672c31d + .long 0x1879ce14 + .long 0x3248ed2b + .long 0x3c43e022 + .long 0x2e5ef739 + .long 0x2055fa30 + .long 0xec01b79a + .long 0xe20aba93 + .long 0xf017ad88 + .long 0xfe1ca081 + .long 0xd42d83be + .long 0xda268eb7 + .long 0xc83b99ac + .long 0xc63094a5 + .long 0x9c59dfd2 + .long 0x9252d2db + .long 0x804fc5c0 + .long 0x8e44c8c9 + .long 0xa475ebf6 + .long 0xaa7ee6ff + .long 0xb863f1e4 + .long 0xb668fced + .long 0x0cb1670a + .long 0x02ba6a03 + .long 0x10a77d18 + .long 0x1eac7011 + .long 0x349d532e + .long 0x3a965e27 + .long 0x288b493c + .long 0x26804435 + .long 0x7ce90f42 + .long 0x72e2024b + .long 0x60ff1550 + .long 0x6ef41859 + .long 0x44c53b66 + .long 0x4ace366f + .long 0x58d32174 + .long 0x56d82c7d + .long 0x377a0ca1 + .long 0x397101a8 + .long 0x2b6c16b3 + .long 0x25671bba + .long 0x0f563885 + .long 0x015d358c + .long 0x13402297 + .long 0x1d4b2f9e + .long 0x472264e9 + .long 0x492969e0 + .long 0x5b347efb + .long 0x553f73f2 + .long 0x7f0e50cd + .long 0x71055dc4 + .long 0x63184adf + .long 0x6d1347d6 + .long 0xd7cadc31 + .long 0xd9c1d138 + .long 0xcbdcc623 + .long 0xc5d7cb2a + .long 0xefe6e815 + .long 0xe1ede51c + .long 0xf3f0f207 + .long 0xfdfbff0e + .long 0xa792b479 + .long 0xa999b970 + .long 0xbb84ae6b + .long 0xb58fa362 + .long 0x9fbe805d + .long 0x91b58d54 + .long 0x83a89a4f + .long 0x8da39746 + + +// Tables for main encryption iterations. + .globl _AESEncryptTable + .private_extern _AESEncryptTable + .align 2 +_AESEncryptTable: + // Table 0. 
+ .long 0xa56363c6 + .long 0x847c7cf8 + .long 0x997777ee + .long 0x8d7b7bf6 + .long 0x0df2f2ff + .long 0xbd6b6bd6 + .long 0xb16f6fde + .long 0x54c5c591 + .long 0x50303060 + .long 0x03010102 + .long 0xa96767ce + .long 0x7d2b2b56 + .long 0x19fefee7 + .long 0x62d7d7b5 + .long 0xe6abab4d + .long 0x9a7676ec + .long 0x45caca8f + .long 0x9d82821f + .long 0x40c9c989 + .long 0x877d7dfa + .long 0x15fafaef + .long 0xeb5959b2 + .long 0xc947478e + .long 0x0bf0f0fb + .long 0xecadad41 + .long 0x67d4d4b3 + .long 0xfda2a25f + .long 0xeaafaf45 + .long 0xbf9c9c23 + .long 0xf7a4a453 + .long 0x967272e4 + .long 0x5bc0c09b + .long 0xc2b7b775 + .long 0x1cfdfde1 + .long 0xae93933d + .long 0x6a26264c + .long 0x5a36366c + .long 0x413f3f7e + .long 0x02f7f7f5 + .long 0x4fcccc83 + .long 0x5c343468 + .long 0xf4a5a551 + .long 0x34e5e5d1 + .long 0x08f1f1f9 + .long 0x937171e2 + .long 0x73d8d8ab + .long 0x53313162 + .long 0x3f15152a + .long 0x0c040408 + .long 0x52c7c795 + .long 0x65232346 + .long 0x5ec3c39d + .long 0x28181830 + .long 0xa1969637 + .long 0x0f05050a + .long 0xb59a9a2f + .long 0x0907070e + .long 0x36121224 + .long 0x9b80801b + .long 0x3de2e2df + .long 0x26ebebcd + .long 0x6927274e + .long 0xcdb2b27f + .long 0x9f7575ea + .long 0x1b090912 + .long 0x9e83831d + .long 0x742c2c58 + .long 0x2e1a1a34 + .long 0x2d1b1b36 + .long 0xb26e6edc + .long 0xee5a5ab4 + .long 0xfba0a05b + .long 0xf65252a4 + .long 0x4d3b3b76 + .long 0x61d6d6b7 + .long 0xceb3b37d + .long 0x7b292952 + .long 0x3ee3e3dd + .long 0x712f2f5e + .long 0x97848413 + .long 0xf55353a6 + .long 0x68d1d1b9 + .long 0x00000000 + .long 0x2cededc1 + .long 0x60202040 + .long 0x1ffcfce3 + .long 0xc8b1b179 + .long 0xed5b5bb6 + .long 0xbe6a6ad4 + .long 0x46cbcb8d + .long 0xd9bebe67 + .long 0x4b393972 + .long 0xde4a4a94 + .long 0xd44c4c98 + .long 0xe85858b0 + .long 0x4acfcf85 + .long 0x6bd0d0bb + .long 0x2aefefc5 + .long 0xe5aaaa4f + .long 0x16fbfbed + .long 0xc5434386 + .long 0xd74d4d9a + .long 0x55333366 + .long 0x94858511 + .long 0xcf45458a + 
.long 0x10f9f9e9 + .long 0x06020204 + .long 0x817f7ffe + .long 0xf05050a0 + .long 0x443c3c78 + .long 0xba9f9f25 + .long 0xe3a8a84b + .long 0xf35151a2 + .long 0xfea3a35d + .long 0xc0404080 + .long 0x8a8f8f05 + .long 0xad92923f + .long 0xbc9d9d21 + .long 0x48383870 + .long 0x04f5f5f1 + .long 0xdfbcbc63 + .long 0xc1b6b677 + .long 0x75dadaaf + .long 0x63212142 + .long 0x30101020 + .long 0x1affffe5 + .long 0x0ef3f3fd + .long 0x6dd2d2bf + .long 0x4ccdcd81 + .long 0x140c0c18 + .long 0x35131326 + .long 0x2fececc3 + .long 0xe15f5fbe + .long 0xa2979735 + .long 0xcc444488 + .long 0x3917172e + .long 0x57c4c493 + .long 0xf2a7a755 + .long 0x827e7efc + .long 0x473d3d7a + .long 0xac6464c8 + .long 0xe75d5dba + .long 0x2b191932 + .long 0x957373e6 + .long 0xa06060c0 + .long 0x98818119 + .long 0xd14f4f9e + .long 0x7fdcdca3 + .long 0x66222244 + .long 0x7e2a2a54 + .long 0xab90903b + .long 0x8388880b + .long 0xca46468c + .long 0x29eeeec7 + .long 0xd3b8b86b + .long 0x3c141428 + .long 0x79dedea7 + .long 0xe25e5ebc + .long 0x1d0b0b16 + .long 0x76dbdbad + .long 0x3be0e0db + .long 0x56323264 + .long 0x4e3a3a74 + .long 0x1e0a0a14 + .long 0xdb494992 + .long 0x0a06060c + .long 0x6c242448 + .long 0xe45c5cb8 + .long 0x5dc2c29f + .long 0x6ed3d3bd + .long 0xefacac43 + .long 0xa66262c4 + .long 0xa8919139 + .long 0xa4959531 + .long 0x37e4e4d3 + .long 0x8b7979f2 + .long 0x32e7e7d5 + .long 0x43c8c88b + .long 0x5937376e + .long 0xb76d6dda + .long 0x8c8d8d01 + .long 0x64d5d5b1 + .long 0xd24e4e9c + .long 0xe0a9a949 + .long 0xb46c6cd8 + .long 0xfa5656ac + .long 0x07f4f4f3 + .long 0x25eaeacf + .long 0xaf6565ca + .long 0x8e7a7af4 + .long 0xe9aeae47 + .long 0x18080810 + .long 0xd5baba6f + .long 0x887878f0 + .long 0x6f25254a + .long 0x722e2e5c + .long 0x241c1c38 + .long 0xf1a6a657 + .long 0xc7b4b473 + .long 0x51c6c697 + .long 0x23e8e8cb + .long 0x7cdddda1 + .long 0x9c7474e8 + .long 0x211f1f3e + .long 0xdd4b4b96 + .long 0xdcbdbd61 + .long 0x868b8b0d + .long 0x858a8a0f + .long 0x907070e0 + .long 0x423e3e7c + 
.long 0xc4b5b571 + .long 0xaa6666cc + .long 0xd8484890 + .long 0x05030306 + .long 0x01f6f6f7 + .long 0x120e0e1c + .long 0xa36161c2 + .long 0x5f35356a + .long 0xf95757ae + .long 0xd0b9b969 + .long 0x91868617 + .long 0x58c1c199 + .long 0x271d1d3a + .long 0xb99e9e27 + .long 0x38e1e1d9 + .long 0x13f8f8eb + .long 0xb398982b + .long 0x33111122 + .long 0xbb6969d2 + .long 0x70d9d9a9 + .long 0x898e8e07 + .long 0xa7949433 + .long 0xb69b9b2d + .long 0x221e1e3c + .long 0x92878715 + .long 0x20e9e9c9 + .long 0x49cece87 + .long 0xff5555aa + .long 0x78282850 + .long 0x7adfdfa5 + .long 0x8f8c8c03 + .long 0xf8a1a159 + .long 0x80898909 + .long 0x170d0d1a + .long 0xdabfbf65 + .long 0x31e6e6d7 + .long 0xc6424284 + .long 0xb86868d0 + .long 0xc3414182 + .long 0xb0999929 + .long 0x772d2d5a + .long 0x110f0f1e + .long 0xcbb0b07b + .long 0xfc5454a8 + .long 0xd6bbbb6d + .long 0x3a16162c + // Table 1. + .long 0x6363c6a5 + .long 0x7c7cf884 + .long 0x7777ee99 + .long 0x7b7bf68d + .long 0xf2f2ff0d + .long 0x6b6bd6bd + .long 0x6f6fdeb1 + .long 0xc5c59154 + .long 0x30306050 + .long 0x01010203 + .long 0x6767cea9 + .long 0x2b2b567d + .long 0xfefee719 + .long 0xd7d7b562 + .long 0xabab4de6 + .long 0x7676ec9a + .long 0xcaca8f45 + .long 0x82821f9d + .long 0xc9c98940 + .long 0x7d7dfa87 + .long 0xfafaef15 + .long 0x5959b2eb + .long 0x47478ec9 + .long 0xf0f0fb0b + .long 0xadad41ec + .long 0xd4d4b367 + .long 0xa2a25ffd + .long 0xafaf45ea + .long 0x9c9c23bf + .long 0xa4a453f7 + .long 0x7272e496 + .long 0xc0c09b5b + .long 0xb7b775c2 + .long 0xfdfde11c + .long 0x93933dae + .long 0x26264c6a + .long 0x36366c5a + .long 0x3f3f7e41 + .long 0xf7f7f502 + .long 0xcccc834f + .long 0x3434685c + .long 0xa5a551f4 + .long 0xe5e5d134 + .long 0xf1f1f908 + .long 0x7171e293 + .long 0xd8d8ab73 + .long 0x31316253 + .long 0x15152a3f + .long 0x0404080c + .long 0xc7c79552 + .long 0x23234665 + .long 0xc3c39d5e + .long 0x18183028 + .long 0x969637a1 + .long 0x05050a0f + .long 0x9a9a2fb5 + .long 0x07070e09 + .long 0x12122436 + .long 
0x80801b9b + .long 0xe2e2df3d + .long 0xebebcd26 + .long 0x27274e69 + .long 0xb2b27fcd + .long 0x7575ea9f + .long 0x0909121b + .long 0x83831d9e + .long 0x2c2c5874 + .long 0x1a1a342e + .long 0x1b1b362d + .long 0x6e6edcb2 + .long 0x5a5ab4ee + .long 0xa0a05bfb + .long 0x5252a4f6 + .long 0x3b3b764d + .long 0xd6d6b761 + .long 0xb3b37dce + .long 0x2929527b + .long 0xe3e3dd3e + .long 0x2f2f5e71 + .long 0x84841397 + .long 0x5353a6f5 + .long 0xd1d1b968 + .long 0x00000000 + .long 0xededc12c + .long 0x20204060 + .long 0xfcfce31f + .long 0xb1b179c8 + .long 0x5b5bb6ed + .long 0x6a6ad4be + .long 0xcbcb8d46 + .long 0xbebe67d9 + .long 0x3939724b + .long 0x4a4a94de + .long 0x4c4c98d4 + .long 0x5858b0e8 + .long 0xcfcf854a + .long 0xd0d0bb6b + .long 0xefefc52a + .long 0xaaaa4fe5 + .long 0xfbfbed16 + .long 0x434386c5 + .long 0x4d4d9ad7 + .long 0x33336655 + .long 0x85851194 + .long 0x45458acf + .long 0xf9f9e910 + .long 0x02020406 + .long 0x7f7ffe81 + .long 0x5050a0f0 + .long 0x3c3c7844 + .long 0x9f9f25ba + .long 0xa8a84be3 + .long 0x5151a2f3 + .long 0xa3a35dfe + .long 0x404080c0 + .long 0x8f8f058a + .long 0x92923fad + .long 0x9d9d21bc + .long 0x38387048 + .long 0xf5f5f104 + .long 0xbcbc63df + .long 0xb6b677c1 + .long 0xdadaaf75 + .long 0x21214263 + .long 0x10102030 + .long 0xffffe51a + .long 0xf3f3fd0e + .long 0xd2d2bf6d + .long 0xcdcd814c + .long 0x0c0c1814 + .long 0x13132635 + .long 0xececc32f + .long 0x5f5fbee1 + .long 0x979735a2 + .long 0x444488cc + .long 0x17172e39 + .long 0xc4c49357 + .long 0xa7a755f2 + .long 0x7e7efc82 + .long 0x3d3d7a47 + .long 0x6464c8ac + .long 0x5d5dbae7 + .long 0x1919322b + .long 0x7373e695 + .long 0x6060c0a0 + .long 0x81811998 + .long 0x4f4f9ed1 + .long 0xdcdca37f + .long 0x22224466 + .long 0x2a2a547e + .long 0x90903bab + .long 0x88880b83 + .long 0x46468cca + .long 0xeeeec729 + .long 0xb8b86bd3 + .long 0x1414283c + .long 0xdedea779 + .long 0x5e5ebce2 + .long 0x0b0b161d + .long 0xdbdbad76 + .long 0xe0e0db3b + .long 0x32326456 + .long 0x3a3a744e + .long 
0x0a0a141e + .long 0x494992db + .long 0x06060c0a + .long 0x2424486c + .long 0x5c5cb8e4 + .long 0xc2c29f5d + .long 0xd3d3bd6e + .long 0xacac43ef + .long 0x6262c4a6 + .long 0x919139a8 + .long 0x959531a4 + .long 0xe4e4d337 + .long 0x7979f28b + .long 0xe7e7d532 + .long 0xc8c88b43 + .long 0x37376e59 + .long 0x6d6ddab7 + .long 0x8d8d018c + .long 0xd5d5b164 + .long 0x4e4e9cd2 + .long 0xa9a949e0 + .long 0x6c6cd8b4 + .long 0x5656acfa + .long 0xf4f4f307 + .long 0xeaeacf25 + .long 0x6565caaf + .long 0x7a7af48e + .long 0xaeae47e9 + .long 0x08081018 + .long 0xbaba6fd5 + .long 0x7878f088 + .long 0x25254a6f + .long 0x2e2e5c72 + .long 0x1c1c3824 + .long 0xa6a657f1 + .long 0xb4b473c7 + .long 0xc6c69751 + .long 0xe8e8cb23 + .long 0xdddda17c + .long 0x7474e89c + .long 0x1f1f3e21 + .long 0x4b4b96dd + .long 0xbdbd61dc + .long 0x8b8b0d86 + .long 0x8a8a0f85 + .long 0x7070e090 + .long 0x3e3e7c42 + .long 0xb5b571c4 + .long 0x6666ccaa + .long 0x484890d8 + .long 0x03030605 + .long 0xf6f6f701 + .long 0x0e0e1c12 + .long 0x6161c2a3 + .long 0x35356a5f + .long 0x5757aef9 + .long 0xb9b969d0 + .long 0x86861791 + .long 0xc1c19958 + .long 0x1d1d3a27 + .long 0x9e9e27b9 + .long 0xe1e1d938 + .long 0xf8f8eb13 + .long 0x98982bb3 + .long 0x11112233 + .long 0x6969d2bb + .long 0xd9d9a970 + .long 0x8e8e0789 + .long 0x949433a7 + .long 0x9b9b2db6 + .long 0x1e1e3c22 + .long 0x87871592 + .long 0xe9e9c920 + .long 0xcece8749 + .long 0x5555aaff + .long 0x28285078 + .long 0xdfdfa57a + .long 0x8c8c038f + .long 0xa1a159f8 + .long 0x89890980 + .long 0x0d0d1a17 + .long 0xbfbf65da + .long 0xe6e6d731 + .long 0x424284c6 + .long 0x6868d0b8 + .long 0x414182c3 + .long 0x999929b0 + .long 0x2d2d5a77 + .long 0x0f0f1e11 + .long 0xb0b07bcb + .long 0x5454a8fc + .long 0xbbbb6dd6 + .long 0x16162c3a + // Table 2. 
+ .long 0x63c6a563 + .long 0x7cf8847c + .long 0x77ee9977 + .long 0x7bf68d7b + .long 0xf2ff0df2 + .long 0x6bd6bd6b + .long 0x6fdeb16f + .long 0xc59154c5 + .long 0x30605030 + .long 0x01020301 + .long 0x67cea967 + .long 0x2b567d2b + .long 0xfee719fe + .long 0xd7b562d7 + .long 0xab4de6ab + .long 0x76ec9a76 + .long 0xca8f45ca + .long 0x821f9d82 + .long 0xc98940c9 + .long 0x7dfa877d + .long 0xfaef15fa + .long 0x59b2eb59 + .long 0x478ec947 + .long 0xf0fb0bf0 + .long 0xad41ecad + .long 0xd4b367d4 + .long 0xa25ffda2 + .long 0xaf45eaaf + .long 0x9c23bf9c + .long 0xa453f7a4 + .long 0x72e49672 + .long 0xc09b5bc0 + .long 0xb775c2b7 + .long 0xfde11cfd + .long 0x933dae93 + .long 0x264c6a26 + .long 0x366c5a36 + .long 0x3f7e413f + .long 0xf7f502f7 + .long 0xcc834fcc + .long 0x34685c34 + .long 0xa551f4a5 + .long 0xe5d134e5 + .long 0xf1f908f1 + .long 0x71e29371 + .long 0xd8ab73d8 + .long 0x31625331 + .long 0x152a3f15 + .long 0x04080c04 + .long 0xc79552c7 + .long 0x23466523 + .long 0xc39d5ec3 + .long 0x18302818 + .long 0x9637a196 + .long 0x050a0f05 + .long 0x9a2fb59a + .long 0x070e0907 + .long 0x12243612 + .long 0x801b9b80 + .long 0xe2df3de2 + .long 0xebcd26eb + .long 0x274e6927 + .long 0xb27fcdb2 + .long 0x75ea9f75 + .long 0x09121b09 + .long 0x831d9e83 + .long 0x2c58742c + .long 0x1a342e1a + .long 0x1b362d1b + .long 0x6edcb26e + .long 0x5ab4ee5a + .long 0xa05bfba0 + .long 0x52a4f652 + .long 0x3b764d3b + .long 0xd6b761d6 + .long 0xb37dceb3 + .long 0x29527b29 + .long 0xe3dd3ee3 + .long 0x2f5e712f + .long 0x84139784 + .long 0x53a6f553 + .long 0xd1b968d1 + .long 0x00000000 + .long 0xedc12ced + .long 0x20406020 + .long 0xfce31ffc + .long 0xb179c8b1 + .long 0x5bb6ed5b + .long 0x6ad4be6a + .long 0xcb8d46cb + .long 0xbe67d9be + .long 0x39724b39 + .long 0x4a94de4a + .long 0x4c98d44c + .long 0x58b0e858 + .long 0xcf854acf + .long 0xd0bb6bd0 + .long 0xefc52aef + .long 0xaa4fe5aa + .long 0xfbed16fb + .long 0x4386c543 + .long 0x4d9ad74d + .long 0x33665533 + .long 0x85119485 + .long 0x458acf45 + 
.long 0xf9e910f9 + .long 0x02040602 + .long 0x7ffe817f + .long 0x50a0f050 + .long 0x3c78443c + .long 0x9f25ba9f + .long 0xa84be3a8 + .long 0x51a2f351 + .long 0xa35dfea3 + .long 0x4080c040 + .long 0x8f058a8f + .long 0x923fad92 + .long 0x9d21bc9d + .long 0x38704838 + .long 0xf5f104f5 + .long 0xbc63dfbc + .long 0xb677c1b6 + .long 0xdaaf75da + .long 0x21426321 + .long 0x10203010 + .long 0xffe51aff + .long 0xf3fd0ef3 + .long 0xd2bf6dd2 + .long 0xcd814ccd + .long 0x0c18140c + .long 0x13263513 + .long 0xecc32fec + .long 0x5fbee15f + .long 0x9735a297 + .long 0x4488cc44 + .long 0x172e3917 + .long 0xc49357c4 + .long 0xa755f2a7 + .long 0x7efc827e + .long 0x3d7a473d + .long 0x64c8ac64 + .long 0x5dbae75d + .long 0x19322b19 + .long 0x73e69573 + .long 0x60c0a060 + .long 0x81199881 + .long 0x4f9ed14f + .long 0xdca37fdc + .long 0x22446622 + .long 0x2a547e2a + .long 0x903bab90 + .long 0x880b8388 + .long 0x468cca46 + .long 0xeec729ee + .long 0xb86bd3b8 + .long 0x14283c14 + .long 0xdea779de + .long 0x5ebce25e + .long 0x0b161d0b + .long 0xdbad76db + .long 0xe0db3be0 + .long 0x32645632 + .long 0x3a744e3a + .long 0x0a141e0a + .long 0x4992db49 + .long 0x060c0a06 + .long 0x24486c24 + .long 0x5cb8e45c + .long 0xc29f5dc2 + .long 0xd3bd6ed3 + .long 0xac43efac + .long 0x62c4a662 + .long 0x9139a891 + .long 0x9531a495 + .long 0xe4d337e4 + .long 0x79f28b79 + .long 0xe7d532e7 + .long 0xc88b43c8 + .long 0x376e5937 + .long 0x6ddab76d + .long 0x8d018c8d + .long 0xd5b164d5 + .long 0x4e9cd24e + .long 0xa949e0a9 + .long 0x6cd8b46c + .long 0x56acfa56 + .long 0xf4f307f4 + .long 0xeacf25ea + .long 0x65caaf65 + .long 0x7af48e7a + .long 0xae47e9ae + .long 0x08101808 + .long 0xba6fd5ba + .long 0x78f08878 + .long 0x254a6f25 + .long 0x2e5c722e + .long 0x1c38241c + .long 0xa657f1a6 + .long 0xb473c7b4 + .long 0xc69751c6 + .long 0xe8cb23e8 + .long 0xdda17cdd + .long 0x74e89c74 + .long 0x1f3e211f + .long 0x4b96dd4b + .long 0xbd61dcbd + .long 0x8b0d868b + .long 0x8a0f858a + .long 0x70e09070 + .long 0x3e7c423e + 
.long 0xb571c4b5 + .long 0x66ccaa66 + .long 0x4890d848 + .long 0x03060503 + .long 0xf6f701f6 + .long 0x0e1c120e + .long 0x61c2a361 + .long 0x356a5f35 + .long 0x57aef957 + .long 0xb969d0b9 + .long 0x86179186 + .long 0xc19958c1 + .long 0x1d3a271d + .long 0x9e27b99e + .long 0xe1d938e1 + .long 0xf8eb13f8 + .long 0x982bb398 + .long 0x11223311 + .long 0x69d2bb69 + .long 0xd9a970d9 + .long 0x8e07898e + .long 0x9433a794 + .long 0x9b2db69b + .long 0x1e3c221e + .long 0x87159287 + .long 0xe9c920e9 + .long 0xce8749ce + .long 0x55aaff55 + .long 0x28507828 + .long 0xdfa57adf + .long 0x8c038f8c + .long 0xa159f8a1 + .long 0x89098089 + .long 0x0d1a170d + .long 0xbf65dabf + .long 0xe6d731e6 + .long 0x4284c642 + .long 0x68d0b868 + .long 0x4182c341 + .long 0x9929b099 + .long 0x2d5a772d + .long 0x0f1e110f + .long 0xb07bcbb0 + .long 0x54a8fc54 + .long 0xbb6dd6bb + .long 0x162c3a16 + // Table 3. + .long 0xc6a56363 + .long 0xf8847c7c + .long 0xee997777 + .long 0xf68d7b7b + .long 0xff0df2f2 + .long 0xd6bd6b6b + .long 0xdeb16f6f + .long 0x9154c5c5 + .long 0x60503030 + .long 0x02030101 + .long 0xcea96767 + .long 0x567d2b2b + .long 0xe719fefe + .long 0xb562d7d7 + .long 0x4de6abab + .long 0xec9a7676 + .long 0x8f45caca + .long 0x1f9d8282 + .long 0x8940c9c9 + .long 0xfa877d7d + .long 0xef15fafa + .long 0xb2eb5959 + .long 0x8ec94747 + .long 0xfb0bf0f0 + .long 0x41ecadad + .long 0xb367d4d4 + .long 0x5ffda2a2 + .long 0x45eaafaf + .long 0x23bf9c9c + .long 0x53f7a4a4 + .long 0xe4967272 + .long 0x9b5bc0c0 + .long 0x75c2b7b7 + .long 0xe11cfdfd + .long 0x3dae9393 + .long 0x4c6a2626 + .long 0x6c5a3636 + .long 0x7e413f3f + .long 0xf502f7f7 + .long 0x834fcccc + .long 0x685c3434 + .long 0x51f4a5a5 + .long 0xd134e5e5 + .long 0xf908f1f1 + .long 0xe2937171 + .long 0xab73d8d8 + .long 0x62533131 + .long 0x2a3f1515 + .long 0x080c0404 + .long 0x9552c7c7 + .long 0x46652323 + .long 0x9d5ec3c3 + .long 0x30281818 + .long 0x37a19696 + .long 0x0a0f0505 + .long 0x2fb59a9a + .long 0x0e090707 + .long 0x24361212 + .long 
0x1b9b8080 + .long 0xdf3de2e2 + .long 0xcd26ebeb + .long 0x4e692727 + .long 0x7fcdb2b2 + .long 0xea9f7575 + .long 0x121b0909 + .long 0x1d9e8383 + .long 0x58742c2c + .long 0x342e1a1a + .long 0x362d1b1b + .long 0xdcb26e6e + .long 0xb4ee5a5a + .long 0x5bfba0a0 + .long 0xa4f65252 + .long 0x764d3b3b + .long 0xb761d6d6 + .long 0x7dceb3b3 + .long 0x527b2929 + .long 0xdd3ee3e3 + .long 0x5e712f2f + .long 0x13978484 + .long 0xa6f55353 + .long 0xb968d1d1 + .long 0x00000000 + .long 0xc12ceded + .long 0x40602020 + .long 0xe31ffcfc + .long 0x79c8b1b1 + .long 0xb6ed5b5b + .long 0xd4be6a6a + .long 0x8d46cbcb + .long 0x67d9bebe + .long 0x724b3939 + .long 0x94de4a4a + .long 0x98d44c4c + .long 0xb0e85858 + .long 0x854acfcf + .long 0xbb6bd0d0 + .long 0xc52aefef + .long 0x4fe5aaaa + .long 0xed16fbfb + .long 0x86c54343 + .long 0x9ad74d4d + .long 0x66553333 + .long 0x11948585 + .long 0x8acf4545 + .long 0xe910f9f9 + .long 0x04060202 + .long 0xfe817f7f + .long 0xa0f05050 + .long 0x78443c3c + .long 0x25ba9f9f + .long 0x4be3a8a8 + .long 0xa2f35151 + .long 0x5dfea3a3 + .long 0x80c04040 + .long 0x058a8f8f + .long 0x3fad9292 + .long 0x21bc9d9d + .long 0x70483838 + .long 0xf104f5f5 + .long 0x63dfbcbc + .long 0x77c1b6b6 + .long 0xaf75dada + .long 0x42632121 + .long 0x20301010 + .long 0xe51affff + .long 0xfd0ef3f3 + .long 0xbf6dd2d2 + .long 0x814ccdcd + .long 0x18140c0c + .long 0x26351313 + .long 0xc32fecec + .long 0xbee15f5f + .long 0x35a29797 + .long 0x88cc4444 + .long 0x2e391717 + .long 0x9357c4c4 + .long 0x55f2a7a7 + .long 0xfc827e7e + .long 0x7a473d3d + .long 0xc8ac6464 + .long 0xbae75d5d + .long 0x322b1919 + .long 0xe6957373 + .long 0xc0a06060 + .long 0x19988181 + .long 0x9ed14f4f + .long 0xa37fdcdc + .long 0x44662222 + .long 0x547e2a2a + .long 0x3bab9090 + .long 0x0b838888 + .long 0x8cca4646 + .long 0xc729eeee + .long 0x6bd3b8b8 + .long 0x283c1414 + .long 0xa779dede + .long 0xbce25e5e + .long 0x161d0b0b + .long 0xad76dbdb + .long 0xdb3be0e0 + .long 0x64563232 + .long 0x744e3a3a + .long 
0x141e0a0a + .long 0x92db4949 + .long 0x0c0a0606 + .long 0x486c2424 + .long 0xb8e45c5c + .long 0x9f5dc2c2 + .long 0xbd6ed3d3 + .long 0x43efacac + .long 0xc4a66262 + .long 0x39a89191 + .long 0x31a49595 + .long 0xd337e4e4 + .long 0xf28b7979 + .long 0xd532e7e7 + .long 0x8b43c8c8 + .long 0x6e593737 + .long 0xdab76d6d + .long 0x018c8d8d + .long 0xb164d5d5 + .long 0x9cd24e4e + .long 0x49e0a9a9 + .long 0xd8b46c6c + .long 0xacfa5656 + .long 0xf307f4f4 + .long 0xcf25eaea + .long 0xcaaf6565 + .long 0xf48e7a7a + .long 0x47e9aeae + .long 0x10180808 + .long 0x6fd5baba + .long 0xf0887878 + .long 0x4a6f2525 + .long 0x5c722e2e + .long 0x38241c1c + .long 0x57f1a6a6 + .long 0x73c7b4b4 + .long 0x9751c6c6 + .long 0xcb23e8e8 + .long 0xa17cdddd + .long 0xe89c7474 + .long 0x3e211f1f + .long 0x96dd4b4b + .long 0x61dcbdbd + .long 0x0d868b8b + .long 0x0f858a8a + .long 0xe0907070 + .long 0x7c423e3e + .long 0x71c4b5b5 + .long 0xccaa6666 + .long 0x90d84848 + .long 0x06050303 + .long 0xf701f6f6 + .long 0x1c120e0e + .long 0xc2a36161 + .long 0x6a5f3535 + .long 0xaef95757 + .long 0x69d0b9b9 + .long 0x17918686 + .long 0x9958c1c1 + .long 0x3a271d1d + .long 0x27b99e9e + .long 0xd938e1e1 + .long 0xeb13f8f8 + .long 0x2bb39898 + .long 0x22331111 + .long 0xd2bb6969 + .long 0xa970d9d9 + .long 0x07898e8e + .long 0x33a79494 + .long 0x2db69b9b + .long 0x3c221e1e + .long 0x15928787 + .long 0xc920e9e9 + .long 0x8749cece + .long 0xaaff5555 + .long 0x50782828 + .long 0xa57adfdf + .long 0x038f8c8c + .long 0x59f8a1a1 + .long 0x09808989 + .long 0x1a170d0d + .long 0x65dabfbf + .long 0xd731e6e6 + .long 0x84c64242 + .long 0xd0b86868 + .long 0x82c34141 + .long 0x29b09999 + .long 0x5a772d2d + .long 0x1e110f0f + .long 0x7bcbb0b0 + .long 0xa8fc5454 + .long 0x6dd6bbbb + .long 0x2c3a1616 + + +// Tables for main decryption iterations. + .globl _AESDecryptTable + .private_extern _AESDecryptTable + .align 2 +_AESDecryptTable: + // Table 0. 
+ .long 0x50a7f451 + .long 0x5365417e + .long 0xc3a4171a + .long 0x965e273a + .long 0xcb6bab3b + .long 0xf1459d1f + .long 0xab58faac + .long 0x9303e34b + .long 0x55fa3020 + .long 0xf66d76ad + .long 0x9176cc88 + .long 0x254c02f5 + .long 0xfcd7e54f + .long 0xd7cb2ac5 + .long 0x80443526 + .long 0x8fa362b5 + .long 0x495ab1de + .long 0x671bba25 + .long 0x980eea45 + .long 0xe1c0fe5d + .long 0x02752fc3 + .long 0x12f04c81 + .long 0xa397468d + .long 0xc6f9d36b + .long 0xe75f8f03 + .long 0x959c9215 + .long 0xeb7a6dbf + .long 0xda595295 + .long 0x2d83bed4 + .long 0xd3217458 + .long 0x2969e049 + .long 0x44c8c98e + .long 0x6a89c275 + .long 0x78798ef4 + .long 0x6b3e5899 + .long 0xdd71b927 + .long 0xb64fe1be + .long 0x17ad88f0 + .long 0x66ac20c9 + .long 0xb43ace7d + .long 0x184adf63 + .long 0x82311ae5 + .long 0x60335197 + .long 0x457f5362 + .long 0xe07764b1 + .long 0x84ae6bbb + .long 0x1ca081fe + .long 0x942b08f9 + .long 0x58684870 + .long 0x19fd458f + .long 0x876cde94 + .long 0xb7f87b52 + .long 0x23d373ab + .long 0xe2024b72 + .long 0x578f1fe3 + .long 0x2aab5566 + .long 0x0728ebb2 + .long 0x03c2b52f + .long 0x9a7bc586 + .long 0xa50837d3 + .long 0xf2872830 + .long 0xb2a5bf23 + .long 0xba6a0302 + .long 0x5c8216ed + .long 0x2b1ccf8a + .long 0x92b479a7 + .long 0xf0f207f3 + .long 0xa1e2694e + .long 0xcdf4da65 + .long 0xd5be0506 + .long 0x1f6234d1 + .long 0x8afea6c4 + .long 0x9d532e34 + .long 0xa055f3a2 + .long 0x32e18a05 + .long 0x75ebf6a4 + .long 0x39ec830b + .long 0xaaef6040 + .long 0x069f715e + .long 0x51106ebd + .long 0xf98a213e + .long 0x3d06dd96 + .long 0xae053edd + .long 0x46bde64d + .long 0xb58d5491 + .long 0x055dc471 + .long 0x6fd40604 + .long 0xff155060 + .long 0x24fb9819 + .long 0x97e9bdd6 + .long 0xcc434089 + .long 0x779ed967 + .long 0xbd42e8b0 + .long 0x888b8907 + .long 0x385b19e7 + .long 0xdbeec879 + .long 0x470a7ca1 + .long 0xe90f427c + .long 0xc91e84f8 + .long 0x00000000 + .long 0x83868009 + .long 0x48ed2b32 + .long 0xac70111e + .long 0x4e725a6c + .long 0xfbff0efd + 
.long 0x5638850f + .long 0x1ed5ae3d + .long 0x27392d36 + .long 0x64d90f0a + .long 0x21a65c68 + .long 0xd1545b9b + .long 0x3a2e3624 + .long 0xb1670a0c + .long 0x0fe75793 + .long 0xd296eeb4 + .long 0x9e919b1b + .long 0x4fc5c080 + .long 0xa220dc61 + .long 0x694b775a + .long 0x161a121c + .long 0x0aba93e2 + .long 0xe52aa0c0 + .long 0x43e0223c + .long 0x1d171b12 + .long 0x0b0d090e + .long 0xadc78bf2 + .long 0xb9a8b62d + .long 0xc8a91e14 + .long 0x8519f157 + .long 0x4c0775af + .long 0xbbdd99ee + .long 0xfd607fa3 + .long 0x9f2601f7 + .long 0xbcf5725c + .long 0xc53b6644 + .long 0x347efb5b + .long 0x7629438b + .long 0xdcc623cb + .long 0x68fcedb6 + .long 0x63f1e4b8 + .long 0xcadc31d7 + .long 0x10856342 + .long 0x40229713 + .long 0x2011c684 + .long 0x7d244a85 + .long 0xf83dbbd2 + .long 0x1132f9ae + .long 0x6da129c7 + .long 0x4b2f9e1d + .long 0xf330b2dc + .long 0xec52860d + .long 0xd0e3c177 + .long 0x6c16b32b + .long 0x99b970a9 + .long 0xfa489411 + .long 0x2264e947 + .long 0xc48cfca8 + .long 0x1a3ff0a0 + .long 0xd82c7d56 + .long 0xef903322 + .long 0xc74e4987 + .long 0xc1d138d9 + .long 0xfea2ca8c + .long 0x360bd498 + .long 0xcf81f5a6 + .long 0x28de7aa5 + .long 0x268eb7da + .long 0xa4bfad3f + .long 0xe49d3a2c + .long 0x0d927850 + .long 0x9bcc5f6a + .long 0x62467e54 + .long 0xc2138df6 + .long 0xe8b8d890 + .long 0x5ef7392e + .long 0xf5afc382 + .long 0xbe805d9f + .long 0x7c93d069 + .long 0xa92dd56f + .long 0xb31225cf + .long 0x3b99acc8 + .long 0xa77d1810 + .long 0x6e639ce8 + .long 0x7bbb3bdb + .long 0x097826cd + .long 0xf418596e + .long 0x01b79aec + .long 0xa89a4f83 + .long 0x656e95e6 + .long 0x7ee6ffaa + .long 0x08cfbc21 + .long 0xe6e815ef + .long 0xd99be7ba + .long 0xce366f4a + .long 0xd4099fea + .long 0xd67cb029 + .long 0xafb2a431 + .long 0x31233f2a + .long 0x3094a5c6 + .long 0xc066a235 + .long 0x37bc4e74 + .long 0xa6ca82fc + .long 0xb0d090e0 + .long 0x15d8a733 + .long 0x4a9804f1 + .long 0xf7daec41 + .long 0x0e50cd7f + .long 0x2ff69117 + .long 0x8dd64d76 + .long 0x4db0ef43 + 
.long 0x544daacc + .long 0xdf0496e4 + .long 0xe3b5d19e + .long 0x1b886a4c + .long 0xb81f2cc1 + .long 0x7f516546 + .long 0x04ea5e9d + .long 0x5d358c01 + .long 0x737487fa + .long 0x2e410bfb + .long 0x5a1d67b3 + .long 0x52d2db92 + .long 0x335610e9 + .long 0x1347d66d + .long 0x8c61d79a + .long 0x7a0ca137 + .long 0x8e14f859 + .long 0x893c13eb + .long 0xee27a9ce + .long 0x35c961b7 + .long 0xede51ce1 + .long 0x3cb1477a + .long 0x59dfd29c + .long 0x3f73f255 + .long 0x79ce1418 + .long 0xbf37c773 + .long 0xeacdf753 + .long 0x5baafd5f + .long 0x146f3ddf + .long 0x86db4478 + .long 0x81f3afca + .long 0x3ec468b9 + .long 0x2c342438 + .long 0x5f40a3c2 + .long 0x72c31d16 + .long 0x0c25e2bc + .long 0x8b493c28 + .long 0x41950dff + .long 0x7101a839 + .long 0xdeb30c08 + .long 0x9ce4b4d8 + .long 0x90c15664 + .long 0x6184cb7b + .long 0x70b632d5 + .long 0x745c6c48 + .long 0x4257b8d0 + // Table 1. + .long 0xa7f45150 + .long 0x65417e53 + .long 0xa4171ac3 + .long 0x5e273a96 + .long 0x6bab3bcb + .long 0x459d1ff1 + .long 0x58faacab + .long 0x03e34b93 + .long 0xfa302055 + .long 0x6d76adf6 + .long 0x76cc8891 + .long 0x4c02f525 + .long 0xd7e54ffc + .long 0xcb2ac5d7 + .long 0x44352680 + .long 0xa362b58f + .long 0x5ab1de49 + .long 0x1bba2567 + .long 0x0eea4598 + .long 0xc0fe5de1 + .long 0x752fc302 + .long 0xf04c8112 + .long 0x97468da3 + .long 0xf9d36bc6 + .long 0x5f8f03e7 + .long 0x9c921595 + .long 0x7a6dbfeb + .long 0x595295da + .long 0x83bed42d + .long 0x217458d3 + .long 0x69e04929 + .long 0xc8c98e44 + .long 0x89c2756a + .long 0x798ef478 + .long 0x3e58996b + .long 0x71b927dd + .long 0x4fe1beb6 + .long 0xad88f017 + .long 0xac20c966 + .long 0x3ace7db4 + .long 0x4adf6318 + .long 0x311ae582 + .long 0x33519760 + .long 0x7f536245 + .long 0x7764b1e0 + .long 0xae6bbb84 + .long 0xa081fe1c + .long 0x2b08f994 + .long 0x68487058 + .long 0xfd458f19 + .long 0x6cde9487 + .long 0xf87b52b7 + .long 0xd373ab23 + .long 0x024b72e2 + .long 0x8f1fe357 + .long 0xab55662a + .long 0x28ebb207 + .long 0xc2b52f03 + .long 
0x7bc5869a + .long 0x0837d3a5 + .long 0x872830f2 + .long 0xa5bf23b2 + .long 0x6a0302ba + .long 0x8216ed5c + .long 0x1ccf8a2b + .long 0xb479a792 + .long 0xf207f3f0 + .long 0xe2694ea1 + .long 0xf4da65cd + .long 0xbe0506d5 + .long 0x6234d11f + .long 0xfea6c48a + .long 0x532e349d + .long 0x55f3a2a0 + .long 0xe18a0532 + .long 0xebf6a475 + .long 0xec830b39 + .long 0xef6040aa + .long 0x9f715e06 + .long 0x106ebd51 + .long 0x8a213ef9 + .long 0x06dd963d + .long 0x053eddae + .long 0xbde64d46 + .long 0x8d5491b5 + .long 0x5dc47105 + .long 0xd406046f + .long 0x155060ff + .long 0xfb981924 + .long 0xe9bdd697 + .long 0x434089cc + .long 0x9ed96777 + .long 0x42e8b0bd + .long 0x8b890788 + .long 0x5b19e738 + .long 0xeec879db + .long 0x0a7ca147 + .long 0x0f427ce9 + .long 0x1e84f8c9 + .long 0x00000000 + .long 0x86800983 + .long 0xed2b3248 + .long 0x70111eac + .long 0x725a6c4e + .long 0xff0efdfb + .long 0x38850f56 + .long 0xd5ae3d1e + .long 0x392d3627 + .long 0xd90f0a64 + .long 0xa65c6821 + .long 0x545b9bd1 + .long 0x2e36243a + .long 0x670a0cb1 + .long 0xe757930f + .long 0x96eeb4d2 + .long 0x919b1b9e + .long 0xc5c0804f + .long 0x20dc61a2 + .long 0x4b775a69 + .long 0x1a121c16 + .long 0xba93e20a + .long 0x2aa0c0e5 + .long 0xe0223c43 + .long 0x171b121d + .long 0x0d090e0b + .long 0xc78bf2ad + .long 0xa8b62db9 + .long 0xa91e14c8 + .long 0x19f15785 + .long 0x0775af4c + .long 0xdd99eebb + .long 0x607fa3fd + .long 0x2601f79f + .long 0xf5725cbc + .long 0x3b6644c5 + .long 0x7efb5b34 + .long 0x29438b76 + .long 0xc623cbdc + .long 0xfcedb668 + .long 0xf1e4b863 + .long 0xdc31d7ca + .long 0x85634210 + .long 0x22971340 + .long 0x11c68420 + .long 0x244a857d + .long 0x3dbbd2f8 + .long 0x32f9ae11 + .long 0xa129c76d + .long 0x2f9e1d4b + .long 0x30b2dcf3 + .long 0x52860dec + .long 0xe3c177d0 + .long 0x16b32b6c + .long 0xb970a999 + .long 0x489411fa + .long 0x64e94722 + .long 0x8cfca8c4 + .long 0x3ff0a01a + .long 0x2c7d56d8 + .long 0x903322ef + .long 0x4e4987c7 + .long 0xd138d9c1 + .long 0xa2ca8cfe + .long 
0x0bd49836 + .long 0x81f5a6cf + .long 0xde7aa528 + .long 0x8eb7da26 + .long 0xbfad3fa4 + .long 0x9d3a2ce4 + .long 0x9278500d + .long 0xcc5f6a9b + .long 0x467e5462 + .long 0x138df6c2 + .long 0xb8d890e8 + .long 0xf7392e5e + .long 0xafc382f5 + .long 0x805d9fbe + .long 0x93d0697c + .long 0x2dd56fa9 + .long 0x1225cfb3 + .long 0x99acc83b + .long 0x7d1810a7 + .long 0x639ce86e + .long 0xbb3bdb7b + .long 0x7826cd09 + .long 0x18596ef4 + .long 0xb79aec01 + .long 0x9a4f83a8 + .long 0x6e95e665 + .long 0xe6ffaa7e + .long 0xcfbc2108 + .long 0xe815efe6 + .long 0x9be7bad9 + .long 0x366f4ace + .long 0x099fead4 + .long 0x7cb029d6 + .long 0xb2a431af + .long 0x233f2a31 + .long 0x94a5c630 + .long 0x66a235c0 + .long 0xbc4e7437 + .long 0xca82fca6 + .long 0xd090e0b0 + .long 0xd8a73315 + .long 0x9804f14a + .long 0xdaec41f7 + .long 0x50cd7f0e + .long 0xf691172f + .long 0xd64d768d + .long 0xb0ef434d + .long 0x4daacc54 + .long 0x0496e4df + .long 0xb5d19ee3 + .long 0x886a4c1b + .long 0x1f2cc1b8 + .long 0x5165467f + .long 0xea5e9d04 + .long 0x358c015d + .long 0x7487fa73 + .long 0x410bfb2e + .long 0x1d67b35a + .long 0xd2db9252 + .long 0x5610e933 + .long 0x47d66d13 + .long 0x61d79a8c + .long 0x0ca1377a + .long 0x14f8598e + .long 0x3c13eb89 + .long 0x27a9ceee + .long 0xc961b735 + .long 0xe51ce1ed + .long 0xb1477a3c + .long 0xdfd29c59 + .long 0x73f2553f + .long 0xce141879 + .long 0x37c773bf + .long 0xcdf753ea + .long 0xaafd5f5b + .long 0x6f3ddf14 + .long 0xdb447886 + .long 0xf3afca81 + .long 0xc468b93e + .long 0x3424382c + .long 0x40a3c25f + .long 0xc31d1672 + .long 0x25e2bc0c + .long 0x493c288b + .long 0x950dff41 + .long 0x01a83971 + .long 0xb30c08de + .long 0xe4b4d89c + .long 0xc1566490 + .long 0x84cb7b61 + .long 0xb632d570 + .long 0x5c6c4874 + .long 0x57b8d042 + // Table 2. 
+ .long 0xf45150a7 + .long 0x417e5365 + .long 0x171ac3a4 + .long 0x273a965e + .long 0xab3bcb6b + .long 0x9d1ff145 + .long 0xfaacab58 + .long 0xe34b9303 + .long 0x302055fa + .long 0x76adf66d + .long 0xcc889176 + .long 0x02f5254c + .long 0xe54ffcd7 + .long 0x2ac5d7cb + .long 0x35268044 + .long 0x62b58fa3 + .long 0xb1de495a + .long 0xba25671b + .long 0xea45980e + .long 0xfe5de1c0 + .long 0x2fc30275 + .long 0x4c8112f0 + .long 0x468da397 + .long 0xd36bc6f9 + .long 0x8f03e75f + .long 0x9215959c + .long 0x6dbfeb7a + .long 0x5295da59 + .long 0xbed42d83 + .long 0x7458d321 + .long 0xe0492969 + .long 0xc98e44c8 + .long 0xc2756a89 + .long 0x8ef47879 + .long 0x58996b3e + .long 0xb927dd71 + .long 0xe1beb64f + .long 0x88f017ad + .long 0x20c966ac + .long 0xce7db43a + .long 0xdf63184a + .long 0x1ae58231 + .long 0x51976033 + .long 0x5362457f + .long 0x64b1e077 + .long 0x6bbb84ae + .long 0x81fe1ca0 + .long 0x08f9942b + .long 0x48705868 + .long 0x458f19fd + .long 0xde94876c + .long 0x7b52b7f8 + .long 0x73ab23d3 + .long 0x4b72e202 + .long 0x1fe3578f + .long 0x55662aab + .long 0xebb20728 + .long 0xb52f03c2 + .long 0xc5869a7b + .long 0x37d3a508 + .long 0x2830f287 + .long 0xbf23b2a5 + .long 0x0302ba6a + .long 0x16ed5c82 + .long 0xcf8a2b1c + .long 0x79a792b4 + .long 0x07f3f0f2 + .long 0x694ea1e2 + .long 0xda65cdf4 + .long 0x0506d5be + .long 0x34d11f62 + .long 0xa6c48afe + .long 0x2e349d53 + .long 0xf3a2a055 + .long 0x8a0532e1 + .long 0xf6a475eb + .long 0x830b39ec + .long 0x6040aaef + .long 0x715e069f + .long 0x6ebd5110 + .long 0x213ef98a + .long 0xdd963d06 + .long 0x3eddae05 + .long 0xe64d46bd + .long 0x5491b58d + .long 0xc471055d + .long 0x06046fd4 + .long 0x5060ff15 + .long 0x981924fb + .long 0xbdd697e9 + .long 0x4089cc43 + .long 0xd967779e + .long 0xe8b0bd42 + .long 0x8907888b + .long 0x19e7385b + .long 0xc879dbee + .long 0x7ca1470a + .long 0x427ce90f + .long 0x84f8c91e + .long 0x00000000 + .long 0x80098386 + .long 0x2b3248ed + .long 0x111eac70 + .long 0x5a6c4e72 + .long 0x0efdfbff + 
.long 0x850f5638 + .long 0xae3d1ed5 + .long 0x2d362739 + .long 0x0f0a64d9 + .long 0x5c6821a6 + .long 0x5b9bd154 + .long 0x36243a2e + .long 0x0a0cb167 + .long 0x57930fe7 + .long 0xeeb4d296 + .long 0x9b1b9e91 + .long 0xc0804fc5 + .long 0xdc61a220 + .long 0x775a694b + .long 0x121c161a + .long 0x93e20aba + .long 0xa0c0e52a + .long 0x223c43e0 + .long 0x1b121d17 + .long 0x090e0b0d + .long 0x8bf2adc7 + .long 0xb62db9a8 + .long 0x1e14c8a9 + .long 0xf1578519 + .long 0x75af4c07 + .long 0x99eebbdd + .long 0x7fa3fd60 + .long 0x01f79f26 + .long 0x725cbcf5 + .long 0x6644c53b + .long 0xfb5b347e + .long 0x438b7629 + .long 0x23cbdcc6 + .long 0xedb668fc + .long 0xe4b863f1 + .long 0x31d7cadc + .long 0x63421085 + .long 0x97134022 + .long 0xc6842011 + .long 0x4a857d24 + .long 0xbbd2f83d + .long 0xf9ae1132 + .long 0x29c76da1 + .long 0x9e1d4b2f + .long 0xb2dcf330 + .long 0x860dec52 + .long 0xc177d0e3 + .long 0xb32b6c16 + .long 0x70a999b9 + .long 0x9411fa48 + .long 0xe9472264 + .long 0xfca8c48c + .long 0xf0a01a3f + .long 0x7d56d82c + .long 0x3322ef90 + .long 0x4987c74e + .long 0x38d9c1d1 + .long 0xca8cfea2 + .long 0xd498360b + .long 0xf5a6cf81 + .long 0x7aa528de + .long 0xb7da268e + .long 0xad3fa4bf + .long 0x3a2ce49d + .long 0x78500d92 + .long 0x5f6a9bcc + .long 0x7e546246 + .long 0x8df6c213 + .long 0xd890e8b8 + .long 0x392e5ef7 + .long 0xc382f5af + .long 0x5d9fbe80 + .long 0xd0697c93 + .long 0xd56fa92d + .long 0x25cfb312 + .long 0xacc83b99 + .long 0x1810a77d + .long 0x9ce86e63 + .long 0x3bdb7bbb + .long 0x26cd0978 + .long 0x596ef418 + .long 0x9aec01b7 + .long 0x4f83a89a + .long 0x95e6656e + .long 0xffaa7ee6 + .long 0xbc2108cf + .long 0x15efe6e8 + .long 0xe7bad99b + .long 0x6f4ace36 + .long 0x9fead409 + .long 0xb029d67c + .long 0xa431afb2 + .long 0x3f2a3123 + .long 0xa5c63094 + .long 0xa235c066 + .long 0x4e7437bc + .long 0x82fca6ca + .long 0x90e0b0d0 + .long 0xa73315d8 + .long 0x04f14a98 + .long 0xec41f7da + .long 0xcd7f0e50 + .long 0x91172ff6 + .long 0x4d768dd6 + .long 0xef434db0 + 
.long 0xaacc544d + .long 0x96e4df04 + .long 0xd19ee3b5 + .long 0x6a4c1b88 + .long 0x2cc1b81f + .long 0x65467f51 + .long 0x5e9d04ea + .long 0x8c015d35 + .long 0x87fa7374 + .long 0x0bfb2e41 + .long 0x67b35a1d + .long 0xdb9252d2 + .long 0x10e93356 + .long 0xd66d1347 + .long 0xd79a8c61 + .long 0xa1377a0c + .long 0xf8598e14 + .long 0x13eb893c + .long 0xa9ceee27 + .long 0x61b735c9 + .long 0x1ce1ede5 + .long 0x477a3cb1 + .long 0xd29c59df + .long 0xf2553f73 + .long 0x141879ce + .long 0xc773bf37 + .long 0xf753eacd + .long 0xfd5f5baa + .long 0x3ddf146f + .long 0x447886db + .long 0xafca81f3 + .long 0x68b93ec4 + .long 0x24382c34 + .long 0xa3c25f40 + .long 0x1d1672c3 + .long 0xe2bc0c25 + .long 0x3c288b49 + .long 0x0dff4195 + .long 0xa8397101 + .long 0x0c08deb3 + .long 0xb4d89ce4 + .long 0x566490c1 + .long 0xcb7b6184 + .long 0x32d570b6 + .long 0x6c48745c + .long 0xb8d04257 + // Table 3. + .long 0x5150a7f4 + .long 0x7e536541 + .long 0x1ac3a417 + .long 0x3a965e27 + .long 0x3bcb6bab + .long 0x1ff1459d + .long 0xacab58fa + .long 0x4b9303e3 + .long 0x2055fa30 + .long 0xadf66d76 + .long 0x889176cc + .long 0xf5254c02 + .long 0x4ffcd7e5 + .long 0xc5d7cb2a + .long 0x26804435 + .long 0xb58fa362 + .long 0xde495ab1 + .long 0x25671bba + .long 0x45980eea + .long 0x5de1c0fe + .long 0xc302752f + .long 0x8112f04c + .long 0x8da39746 + .long 0x6bc6f9d3 + .long 0x03e75f8f + .long 0x15959c92 + .long 0xbfeb7a6d + .long 0x95da5952 + .long 0xd42d83be + .long 0x58d32174 + .long 0x492969e0 + .long 0x8e44c8c9 + .long 0x756a89c2 + .long 0xf478798e + .long 0x996b3e58 + .long 0x27dd71b9 + .long 0xbeb64fe1 + .long 0xf017ad88 + .long 0xc966ac20 + .long 0x7db43ace + .long 0x63184adf + .long 0xe582311a + .long 0x97603351 + .long 0x62457f53 + .long 0xb1e07764 + .long 0xbb84ae6b + .long 0xfe1ca081 + .long 0xf9942b08 + .long 0x70586848 + .long 0x8f19fd45 + .long 0x94876cde + .long 0x52b7f87b + .long 0xab23d373 + .long 0x72e2024b + .long 0xe3578f1f + .long 0x662aab55 + .long 0xb20728eb + .long 0x2f03c2b5 + .long 
0x869a7bc5 + .long 0xd3a50837 + .long 0x30f28728 + .long 0x23b2a5bf + .long 0x02ba6a03 + .long 0xed5c8216 + .long 0x8a2b1ccf + .long 0xa792b479 + .long 0xf3f0f207 + .long 0x4ea1e269 + .long 0x65cdf4da + .long 0x06d5be05 + .long 0xd11f6234 + .long 0xc48afea6 + .long 0x349d532e + .long 0xa2a055f3 + .long 0x0532e18a + .long 0xa475ebf6 + .long 0x0b39ec83 + .long 0x40aaef60 + .long 0x5e069f71 + .long 0xbd51106e + .long 0x3ef98a21 + .long 0x963d06dd + .long 0xddae053e + .long 0x4d46bde6 + .long 0x91b58d54 + .long 0x71055dc4 + .long 0x046fd406 + .long 0x60ff1550 + .long 0x1924fb98 + .long 0xd697e9bd + .long 0x89cc4340 + .long 0x67779ed9 + .long 0xb0bd42e8 + .long 0x07888b89 + .long 0xe7385b19 + .long 0x79dbeec8 + .long 0xa1470a7c + .long 0x7ce90f42 + .long 0xf8c91e84 + .long 0x00000000 + .long 0x09838680 + .long 0x3248ed2b + .long 0x1eac7011 + .long 0x6c4e725a + .long 0xfdfbff0e + .long 0x0f563885 + .long 0x3d1ed5ae + .long 0x3627392d + .long 0x0a64d90f + .long 0x6821a65c + .long 0x9bd1545b + .long 0x243a2e36 + .long 0x0cb1670a + .long 0x930fe757 + .long 0xb4d296ee + .long 0x1b9e919b + .long 0x804fc5c0 + .long 0x61a220dc + .long 0x5a694b77 + .long 0x1c161a12 + .long 0xe20aba93 + .long 0xc0e52aa0 + .long 0x3c43e022 + .long 0x121d171b + .long 0x0e0b0d09 + .long 0xf2adc78b + .long 0x2db9a8b6 + .long 0x14c8a91e + .long 0x578519f1 + .long 0xaf4c0775 + .long 0xeebbdd99 + .long 0xa3fd607f + .long 0xf79f2601 + .long 0x5cbcf572 + .long 0x44c53b66 + .long 0x5b347efb + .long 0x8b762943 + .long 0xcbdcc623 + .long 0xb668fced + .long 0xb863f1e4 + .long 0xd7cadc31 + .long 0x42108563 + .long 0x13402297 + .long 0x842011c6 + .long 0x857d244a + .long 0xd2f83dbb + .long 0xae1132f9 + .long 0xc76da129 + .long 0x1d4b2f9e + .long 0xdcf330b2 + .long 0x0dec5286 + .long 0x77d0e3c1 + .long 0x2b6c16b3 + .long 0xa999b970 + .long 0x11fa4894 + .long 0x472264e9 + .long 0xa8c48cfc + .long 0xa01a3ff0 + .long 0x56d82c7d + .long 0x22ef9033 + .long 0x87c74e49 + .long 0xd9c1d138 + .long 0x8cfea2ca + .long 
0x98360bd4 + .long 0xa6cf81f5 + .long 0xa528de7a + .long 0xda268eb7 + .long 0x3fa4bfad + .long 0x2ce49d3a + .long 0x500d9278 + .long 0x6a9bcc5f + .long 0x5462467e + .long 0xf6c2138d + .long 0x90e8b8d8 + .long 0x2e5ef739 + .long 0x82f5afc3 + .long 0x9fbe805d + .long 0x697c93d0 + .long 0x6fa92dd5 + .long 0xcfb31225 + .long 0xc83b99ac + .long 0x10a77d18 + .long 0xe86e639c + .long 0xdb7bbb3b + .long 0xcd097826 + .long 0x6ef41859 + .long 0xec01b79a + .long 0x83a89a4f + .long 0xe6656e95 + .long 0xaa7ee6ff + .long 0x2108cfbc + .long 0xefe6e815 + .long 0xbad99be7 + .long 0x4ace366f + .long 0xead4099f + .long 0x29d67cb0 + .long 0x31afb2a4 + .long 0x2a31233f + .long 0xc63094a5 + .long 0x35c066a2 + .long 0x7437bc4e + .long 0xfca6ca82 + .long 0xe0b0d090 + .long 0x3315d8a7 + .long 0xf14a9804 + .long 0x41f7daec + .long 0x7f0e50cd + .long 0x172ff691 + .long 0x768dd64d + .long 0x434db0ef + .long 0xcc544daa + .long 0xe4df0496 + .long 0x9ee3b5d1 + .long 0x4c1b886a + .long 0xc1b81f2c + .long 0x467f5165 + .long 0x9d04ea5e + .long 0x015d358c + .long 0xfa737487 + .long 0xfb2e410b + .long 0xb35a1d67 + .long 0x9252d2db + .long 0xe9335610 + .long 0x6d1347d6 + .long 0x9a8c61d7 + .long 0x377a0ca1 + .long 0x598e14f8 + .long 0xeb893c13 + .long 0xceee27a9 + .long 0xb735c961 + .long 0xe1ede51c + .long 0x7a3cb147 + .long 0x9c59dfd2 + .long 0x553f73f2 + .long 0x1879ce14 + .long 0x73bf37c7 + .long 0x53eacdf7 + .long 0x5f5baafd + .long 0xdf146f3d + .long 0x7886db44 + .long 0xca81f3af + .long 0xb93ec468 + .long 0x382c3424 + .long 0xc25f40a3 + .long 0x1672c31d + .long 0xbc0c25e2 + .long 0x288b493c + .long 0xff41950d + .long 0x397101a8 + .long 0x08deb30c + .long 0xd89ce4b4 + .long 0x6490c156 + .long 0x7b6184cb + .long 0xd570b632 + .long 0x48745c6c + .long 0xd04257b8 + + +// SubBytes embedded in words tables. + .globl _AESSubBytesWordTable + .private_extern _AESSubBytesWordTable + .align 2 +_AESSubBytesWordTable: + // Table 0. 
+ .long 0x00000063 + .long 0x0000007c + .long 0x00000077 + .long 0x0000007b + .long 0x000000f2 + .long 0x0000006b + .long 0x0000006f + .long 0x000000c5 + .long 0x00000030 + .long 0x00000001 + .long 0x00000067 + .long 0x0000002b + .long 0x000000fe + .long 0x000000d7 + .long 0x000000ab + .long 0x00000076 + .long 0x000000ca + .long 0x00000082 + .long 0x000000c9 + .long 0x0000007d + .long 0x000000fa + .long 0x00000059 + .long 0x00000047 + .long 0x000000f0 + .long 0x000000ad + .long 0x000000d4 + .long 0x000000a2 + .long 0x000000af + .long 0x0000009c + .long 0x000000a4 + .long 0x00000072 + .long 0x000000c0 + .long 0x000000b7 + .long 0x000000fd + .long 0x00000093 + .long 0x00000026 + .long 0x00000036 + .long 0x0000003f + .long 0x000000f7 + .long 0x000000cc + .long 0x00000034 + .long 0x000000a5 + .long 0x000000e5 + .long 0x000000f1 + .long 0x00000071 + .long 0x000000d8 + .long 0x00000031 + .long 0x00000015 + .long 0x00000004 + .long 0x000000c7 + .long 0x00000023 + .long 0x000000c3 + .long 0x00000018 + .long 0x00000096 + .long 0x00000005 + .long 0x0000009a + .long 0x00000007 + .long 0x00000012 + .long 0x00000080 + .long 0x000000e2 + .long 0x000000eb + .long 0x00000027 + .long 0x000000b2 + .long 0x00000075 + .long 0x00000009 + .long 0x00000083 + .long 0x0000002c + .long 0x0000001a + .long 0x0000001b + .long 0x0000006e + .long 0x0000005a + .long 0x000000a0 + .long 0x00000052 + .long 0x0000003b + .long 0x000000d6 + .long 0x000000b3 + .long 0x00000029 + .long 0x000000e3 + .long 0x0000002f + .long 0x00000084 + .long 0x00000053 + .long 0x000000d1 + .long 0x00000000 + .long 0x000000ed + .long 0x00000020 + .long 0x000000fc + .long 0x000000b1 + .long 0x0000005b + .long 0x0000006a + .long 0x000000cb + .long 0x000000be + .long 0x00000039 + .long 0x0000004a + .long 0x0000004c + .long 0x00000058 + .long 0x000000cf + .long 0x000000d0 + .long 0x000000ef + .long 0x000000aa + .long 0x000000fb + .long 0x00000043 + .long 0x0000004d + .long 0x00000033 + .long 0x00000085 + .long 0x00000045 + 
.long 0x000000f9 + .long 0x00000002 + .long 0x0000007f + .long 0x00000050 + .long 0x0000003c + .long 0x0000009f + .long 0x000000a8 + .long 0x00000051 + .long 0x000000a3 + .long 0x00000040 + .long 0x0000008f + .long 0x00000092 + .long 0x0000009d + .long 0x00000038 + .long 0x000000f5 + .long 0x000000bc + .long 0x000000b6 + .long 0x000000da + .long 0x00000021 + .long 0x00000010 + .long 0x000000ff + .long 0x000000f3 + .long 0x000000d2 + .long 0x000000cd + .long 0x0000000c + .long 0x00000013 + .long 0x000000ec + .long 0x0000005f + .long 0x00000097 + .long 0x00000044 + .long 0x00000017 + .long 0x000000c4 + .long 0x000000a7 + .long 0x0000007e + .long 0x0000003d + .long 0x00000064 + .long 0x0000005d + .long 0x00000019 + .long 0x00000073 + .long 0x00000060 + .long 0x00000081 + .long 0x0000004f + .long 0x000000dc + .long 0x00000022 + .long 0x0000002a + .long 0x00000090 + .long 0x00000088 + .long 0x00000046 + .long 0x000000ee + .long 0x000000b8 + .long 0x00000014 + .long 0x000000de + .long 0x0000005e + .long 0x0000000b + .long 0x000000db + .long 0x000000e0 + .long 0x00000032 + .long 0x0000003a + .long 0x0000000a + .long 0x00000049 + .long 0x00000006 + .long 0x00000024 + .long 0x0000005c + .long 0x000000c2 + .long 0x000000d3 + .long 0x000000ac + .long 0x00000062 + .long 0x00000091 + .long 0x00000095 + .long 0x000000e4 + .long 0x00000079 + .long 0x000000e7 + .long 0x000000c8 + .long 0x00000037 + .long 0x0000006d + .long 0x0000008d + .long 0x000000d5 + .long 0x0000004e + .long 0x000000a9 + .long 0x0000006c + .long 0x00000056 + .long 0x000000f4 + .long 0x000000ea + .long 0x00000065 + .long 0x0000007a + .long 0x000000ae + .long 0x00000008 + .long 0x000000ba + .long 0x00000078 + .long 0x00000025 + .long 0x0000002e + .long 0x0000001c + .long 0x000000a6 + .long 0x000000b4 + .long 0x000000c6 + .long 0x000000e8 + .long 0x000000dd + .long 0x00000074 + .long 0x0000001f + .long 0x0000004b + .long 0x000000bd + .long 0x0000008b + .long 0x0000008a + .long 0x00000070 + .long 0x0000003e + 
.long 0x000000b5 + .long 0x00000066 + .long 0x00000048 + .long 0x00000003 + .long 0x000000f6 + .long 0x0000000e + .long 0x00000061 + .long 0x00000035 + .long 0x00000057 + .long 0x000000b9 + .long 0x00000086 + .long 0x000000c1 + .long 0x0000001d + .long 0x0000009e + .long 0x000000e1 + .long 0x000000f8 + .long 0x00000098 + .long 0x00000011 + .long 0x00000069 + .long 0x000000d9 + .long 0x0000008e + .long 0x00000094 + .long 0x0000009b + .long 0x0000001e + .long 0x00000087 + .long 0x000000e9 + .long 0x000000ce + .long 0x00000055 + .long 0x00000028 + .long 0x000000df + .long 0x0000008c + .long 0x000000a1 + .long 0x00000089 + .long 0x0000000d + .long 0x000000bf + .long 0x000000e6 + .long 0x00000042 + .long 0x00000068 + .long 0x00000041 + .long 0x00000099 + .long 0x0000002d + .long 0x0000000f + .long 0x000000b0 + .long 0x00000054 + .long 0x000000bb + .long 0x00000016 + // Table 1. + .long 0x00006300 + .long 0x00007c00 + .long 0x00007700 + .long 0x00007b00 + .long 0x0000f200 + .long 0x00006b00 + .long 0x00006f00 + .long 0x0000c500 + .long 0x00003000 + .long 0x00000100 + .long 0x00006700 + .long 0x00002b00 + .long 0x0000fe00 + .long 0x0000d700 + .long 0x0000ab00 + .long 0x00007600 + .long 0x0000ca00 + .long 0x00008200 + .long 0x0000c900 + .long 0x00007d00 + .long 0x0000fa00 + .long 0x00005900 + .long 0x00004700 + .long 0x0000f000 + .long 0x0000ad00 + .long 0x0000d400 + .long 0x0000a200 + .long 0x0000af00 + .long 0x00009c00 + .long 0x0000a400 + .long 0x00007200 + .long 0x0000c000 + .long 0x0000b700 + .long 0x0000fd00 + .long 0x00009300 + .long 0x00002600 + .long 0x00003600 + .long 0x00003f00 + .long 0x0000f700 + .long 0x0000cc00 + .long 0x00003400 + .long 0x0000a500 + .long 0x0000e500 + .long 0x0000f100 + .long 0x00007100 + .long 0x0000d800 + .long 0x00003100 + .long 0x00001500 + .long 0x00000400 + .long 0x0000c700 + .long 0x00002300 + .long 0x0000c300 + .long 0x00001800 + .long 0x00009600 + .long 0x00000500 + .long 0x00009a00 + .long 0x00000700 + .long 0x00001200 + .long 
0x00008000 + .long 0x0000e200 + .long 0x0000eb00 + .long 0x00002700 + .long 0x0000b200 + .long 0x00007500 + .long 0x00000900 + .long 0x00008300 + .long 0x00002c00 + .long 0x00001a00 + .long 0x00001b00 + .long 0x00006e00 + .long 0x00005a00 + .long 0x0000a000 + .long 0x00005200 + .long 0x00003b00 + .long 0x0000d600 + .long 0x0000b300 + .long 0x00002900 + .long 0x0000e300 + .long 0x00002f00 + .long 0x00008400 + .long 0x00005300 + .long 0x0000d100 + .long 0x00000000 + .long 0x0000ed00 + .long 0x00002000 + .long 0x0000fc00 + .long 0x0000b100 + .long 0x00005b00 + .long 0x00006a00 + .long 0x0000cb00 + .long 0x0000be00 + .long 0x00003900 + .long 0x00004a00 + .long 0x00004c00 + .long 0x00005800 + .long 0x0000cf00 + .long 0x0000d000 + .long 0x0000ef00 + .long 0x0000aa00 + .long 0x0000fb00 + .long 0x00004300 + .long 0x00004d00 + .long 0x00003300 + .long 0x00008500 + .long 0x00004500 + .long 0x0000f900 + .long 0x00000200 + .long 0x00007f00 + .long 0x00005000 + .long 0x00003c00 + .long 0x00009f00 + .long 0x0000a800 + .long 0x00005100 + .long 0x0000a300 + .long 0x00004000 + .long 0x00008f00 + .long 0x00009200 + .long 0x00009d00 + .long 0x00003800 + .long 0x0000f500 + .long 0x0000bc00 + .long 0x0000b600 + .long 0x0000da00 + .long 0x00002100 + .long 0x00001000 + .long 0x0000ff00 + .long 0x0000f300 + .long 0x0000d200 + .long 0x0000cd00 + .long 0x00000c00 + .long 0x00001300 + .long 0x0000ec00 + .long 0x00005f00 + .long 0x00009700 + .long 0x00004400 + .long 0x00001700 + .long 0x0000c400 + .long 0x0000a700 + .long 0x00007e00 + .long 0x00003d00 + .long 0x00006400 + .long 0x00005d00 + .long 0x00001900 + .long 0x00007300 + .long 0x00006000 + .long 0x00008100 + .long 0x00004f00 + .long 0x0000dc00 + .long 0x00002200 + .long 0x00002a00 + .long 0x00009000 + .long 0x00008800 + .long 0x00004600 + .long 0x0000ee00 + .long 0x0000b800 + .long 0x00001400 + .long 0x0000de00 + .long 0x00005e00 + .long 0x00000b00 + .long 0x0000db00 + .long 0x0000e000 + .long 0x00003200 + .long 0x00003a00 + .long 
0x00000a00 + .long 0x00004900 + .long 0x00000600 + .long 0x00002400 + .long 0x00005c00 + .long 0x0000c200 + .long 0x0000d300 + .long 0x0000ac00 + .long 0x00006200 + .long 0x00009100 + .long 0x00009500 + .long 0x0000e400 + .long 0x00007900 + .long 0x0000e700 + .long 0x0000c800 + .long 0x00003700 + .long 0x00006d00 + .long 0x00008d00 + .long 0x0000d500 + .long 0x00004e00 + .long 0x0000a900 + .long 0x00006c00 + .long 0x00005600 + .long 0x0000f400 + .long 0x0000ea00 + .long 0x00006500 + .long 0x00007a00 + .long 0x0000ae00 + .long 0x00000800 + .long 0x0000ba00 + .long 0x00007800 + .long 0x00002500 + .long 0x00002e00 + .long 0x00001c00 + .long 0x0000a600 + .long 0x0000b400 + .long 0x0000c600 + .long 0x0000e800 + .long 0x0000dd00 + .long 0x00007400 + .long 0x00001f00 + .long 0x00004b00 + .long 0x0000bd00 + .long 0x00008b00 + .long 0x00008a00 + .long 0x00007000 + .long 0x00003e00 + .long 0x0000b500 + .long 0x00006600 + .long 0x00004800 + .long 0x00000300 + .long 0x0000f600 + .long 0x00000e00 + .long 0x00006100 + .long 0x00003500 + .long 0x00005700 + .long 0x0000b900 + .long 0x00008600 + .long 0x0000c100 + .long 0x00001d00 + .long 0x00009e00 + .long 0x0000e100 + .long 0x0000f800 + .long 0x00009800 + .long 0x00001100 + .long 0x00006900 + .long 0x0000d900 + .long 0x00008e00 + .long 0x00009400 + .long 0x00009b00 + .long 0x00001e00 + .long 0x00008700 + .long 0x0000e900 + .long 0x0000ce00 + .long 0x00005500 + .long 0x00002800 + .long 0x0000df00 + .long 0x00008c00 + .long 0x0000a100 + .long 0x00008900 + .long 0x00000d00 + .long 0x0000bf00 + .long 0x0000e600 + .long 0x00004200 + .long 0x00006800 + .long 0x00004100 + .long 0x00009900 + .long 0x00002d00 + .long 0x00000f00 + .long 0x0000b000 + .long 0x00005400 + .long 0x0000bb00 + .long 0x00001600 + // Table 2. 
+ .long 0x00630000 + .long 0x007c0000 + .long 0x00770000 + .long 0x007b0000 + .long 0x00f20000 + .long 0x006b0000 + .long 0x006f0000 + .long 0x00c50000 + .long 0x00300000 + .long 0x00010000 + .long 0x00670000 + .long 0x002b0000 + .long 0x00fe0000 + .long 0x00d70000 + .long 0x00ab0000 + .long 0x00760000 + .long 0x00ca0000 + .long 0x00820000 + .long 0x00c90000 + .long 0x007d0000 + .long 0x00fa0000 + .long 0x00590000 + .long 0x00470000 + .long 0x00f00000 + .long 0x00ad0000 + .long 0x00d40000 + .long 0x00a20000 + .long 0x00af0000 + .long 0x009c0000 + .long 0x00a40000 + .long 0x00720000 + .long 0x00c00000 + .long 0x00b70000 + .long 0x00fd0000 + .long 0x00930000 + .long 0x00260000 + .long 0x00360000 + .long 0x003f0000 + .long 0x00f70000 + .long 0x00cc0000 + .long 0x00340000 + .long 0x00a50000 + .long 0x00e50000 + .long 0x00f10000 + .long 0x00710000 + .long 0x00d80000 + .long 0x00310000 + .long 0x00150000 + .long 0x00040000 + .long 0x00c70000 + .long 0x00230000 + .long 0x00c30000 + .long 0x00180000 + .long 0x00960000 + .long 0x00050000 + .long 0x009a0000 + .long 0x00070000 + .long 0x00120000 + .long 0x00800000 + .long 0x00e20000 + .long 0x00eb0000 + .long 0x00270000 + .long 0x00b20000 + .long 0x00750000 + .long 0x00090000 + .long 0x00830000 + .long 0x002c0000 + .long 0x001a0000 + .long 0x001b0000 + .long 0x006e0000 + .long 0x005a0000 + .long 0x00a00000 + .long 0x00520000 + .long 0x003b0000 + .long 0x00d60000 + .long 0x00b30000 + .long 0x00290000 + .long 0x00e30000 + .long 0x002f0000 + .long 0x00840000 + .long 0x00530000 + .long 0x00d10000 + .long 0x00000000 + .long 0x00ed0000 + .long 0x00200000 + .long 0x00fc0000 + .long 0x00b10000 + .long 0x005b0000 + .long 0x006a0000 + .long 0x00cb0000 + .long 0x00be0000 + .long 0x00390000 + .long 0x004a0000 + .long 0x004c0000 + .long 0x00580000 + .long 0x00cf0000 + .long 0x00d00000 + .long 0x00ef0000 + .long 0x00aa0000 + .long 0x00fb0000 + .long 0x00430000 + .long 0x004d0000 + .long 0x00330000 + .long 0x00850000 + .long 0x00450000 + 
.long 0x00f90000 + .long 0x00020000 + .long 0x007f0000 + .long 0x00500000 + .long 0x003c0000 + .long 0x009f0000 + .long 0x00a80000 + .long 0x00510000 + .long 0x00a30000 + .long 0x00400000 + .long 0x008f0000 + .long 0x00920000 + .long 0x009d0000 + .long 0x00380000 + .long 0x00f50000 + .long 0x00bc0000 + .long 0x00b60000 + .long 0x00da0000 + .long 0x00210000 + .long 0x00100000 + .long 0x00ff0000 + .long 0x00f30000 + .long 0x00d20000 + .long 0x00cd0000 + .long 0x000c0000 + .long 0x00130000 + .long 0x00ec0000 + .long 0x005f0000 + .long 0x00970000 + .long 0x00440000 + .long 0x00170000 + .long 0x00c40000 + .long 0x00a70000 + .long 0x007e0000 + .long 0x003d0000 + .long 0x00640000 + .long 0x005d0000 + .long 0x00190000 + .long 0x00730000 + .long 0x00600000 + .long 0x00810000 + .long 0x004f0000 + .long 0x00dc0000 + .long 0x00220000 + .long 0x002a0000 + .long 0x00900000 + .long 0x00880000 + .long 0x00460000 + .long 0x00ee0000 + .long 0x00b80000 + .long 0x00140000 + .long 0x00de0000 + .long 0x005e0000 + .long 0x000b0000 + .long 0x00db0000 + .long 0x00e00000 + .long 0x00320000 + .long 0x003a0000 + .long 0x000a0000 + .long 0x00490000 + .long 0x00060000 + .long 0x00240000 + .long 0x005c0000 + .long 0x00c20000 + .long 0x00d30000 + .long 0x00ac0000 + .long 0x00620000 + .long 0x00910000 + .long 0x00950000 + .long 0x00e40000 + .long 0x00790000 + .long 0x00e70000 + .long 0x00c80000 + .long 0x00370000 + .long 0x006d0000 + .long 0x008d0000 + .long 0x00d50000 + .long 0x004e0000 + .long 0x00a90000 + .long 0x006c0000 + .long 0x00560000 + .long 0x00f40000 + .long 0x00ea0000 + .long 0x00650000 + .long 0x007a0000 + .long 0x00ae0000 + .long 0x00080000 + .long 0x00ba0000 + .long 0x00780000 + .long 0x00250000 + .long 0x002e0000 + .long 0x001c0000 + .long 0x00a60000 + .long 0x00b40000 + .long 0x00c60000 + .long 0x00e80000 + .long 0x00dd0000 + .long 0x00740000 + .long 0x001f0000 + .long 0x004b0000 + .long 0x00bd0000 + .long 0x008b0000 + .long 0x008a0000 + .long 0x00700000 + .long 0x003e0000 + 
.long 0x00b50000 + .long 0x00660000 + .long 0x00480000 + .long 0x00030000 + .long 0x00f60000 + .long 0x000e0000 + .long 0x00610000 + .long 0x00350000 + .long 0x00570000 + .long 0x00b90000 + .long 0x00860000 + .long 0x00c10000 + .long 0x001d0000 + .long 0x009e0000 + .long 0x00e10000 + .long 0x00f80000 + .long 0x00980000 + .long 0x00110000 + .long 0x00690000 + .long 0x00d90000 + .long 0x008e0000 + .long 0x00940000 + .long 0x009b0000 + .long 0x001e0000 + .long 0x00870000 + .long 0x00e90000 + .long 0x00ce0000 + .long 0x00550000 + .long 0x00280000 + .long 0x00df0000 + .long 0x008c0000 + .long 0x00a10000 + .long 0x00890000 + .long 0x000d0000 + .long 0x00bf0000 + .long 0x00e60000 + .long 0x00420000 + .long 0x00680000 + .long 0x00410000 + .long 0x00990000 + .long 0x002d0000 + .long 0x000f0000 + .long 0x00b00000 + .long 0x00540000 + .long 0x00bb0000 + .long 0x00160000 + // Table 3. + .long 0x63000000 + .long 0x7c000000 + .long 0x77000000 + .long 0x7b000000 + .long 0xf2000000 + .long 0x6b000000 + .long 0x6f000000 + .long 0xc5000000 + .long 0x30000000 + .long 0x01000000 + .long 0x67000000 + .long 0x2b000000 + .long 0xfe000000 + .long 0xd7000000 + .long 0xab000000 + .long 0x76000000 + .long 0xca000000 + .long 0x82000000 + .long 0xc9000000 + .long 0x7d000000 + .long 0xfa000000 + .long 0x59000000 + .long 0x47000000 + .long 0xf0000000 + .long 0xad000000 + .long 0xd4000000 + .long 0xa2000000 + .long 0xaf000000 + .long 0x9c000000 + .long 0xa4000000 + .long 0x72000000 + .long 0xc0000000 + .long 0xb7000000 + .long 0xfd000000 + .long 0x93000000 + .long 0x26000000 + .long 0x36000000 + .long 0x3f000000 + .long 0xf7000000 + .long 0xcc000000 + .long 0x34000000 + .long 0xa5000000 + .long 0xe5000000 + .long 0xf1000000 + .long 0x71000000 + .long 0xd8000000 + .long 0x31000000 + .long 0x15000000 + .long 0x04000000 + .long 0xc7000000 + .long 0x23000000 + .long 0xc3000000 + .long 0x18000000 + .long 0x96000000 + .long 0x05000000 + .long 0x9a000000 + .long 0x07000000 + .long 0x12000000 + .long 
0x80000000 + .long 0xe2000000 + .long 0xeb000000 + .long 0x27000000 + .long 0xb2000000 + .long 0x75000000 + .long 0x09000000 + .long 0x83000000 + .long 0x2c000000 + .long 0x1a000000 + .long 0x1b000000 + .long 0x6e000000 + .long 0x5a000000 + .long 0xa0000000 + .long 0x52000000 + .long 0x3b000000 + .long 0xd6000000 + .long 0xb3000000 + .long 0x29000000 + .long 0xe3000000 + .long 0x2f000000 + .long 0x84000000 + .long 0x53000000 + .long 0xd1000000 + .long 0x00000000 + .long 0xed000000 + .long 0x20000000 + .long 0xfc000000 + .long 0xb1000000 + .long 0x5b000000 + .long 0x6a000000 + .long 0xcb000000 + .long 0xbe000000 + .long 0x39000000 + .long 0x4a000000 + .long 0x4c000000 + .long 0x58000000 + .long 0xcf000000 + .long 0xd0000000 + .long 0xef000000 + .long 0xaa000000 + .long 0xfb000000 + .long 0x43000000 + .long 0x4d000000 + .long 0x33000000 + .long 0x85000000 + .long 0x45000000 + .long 0xf9000000 + .long 0x02000000 + .long 0x7f000000 + .long 0x50000000 + .long 0x3c000000 + .long 0x9f000000 + .long 0xa8000000 + .long 0x51000000 + .long 0xa3000000 + .long 0x40000000 + .long 0x8f000000 + .long 0x92000000 + .long 0x9d000000 + .long 0x38000000 + .long 0xf5000000 + .long 0xbc000000 + .long 0xb6000000 + .long 0xda000000 + .long 0x21000000 + .long 0x10000000 + .long 0xff000000 + .long 0xf3000000 + .long 0xd2000000 + .long 0xcd000000 + .long 0x0c000000 + .long 0x13000000 + .long 0xec000000 + .long 0x5f000000 + .long 0x97000000 + .long 0x44000000 + .long 0x17000000 + .long 0xc4000000 + .long 0xa7000000 + .long 0x7e000000 + .long 0x3d000000 + .long 0x64000000 + .long 0x5d000000 + .long 0x19000000 + .long 0x73000000 + .long 0x60000000 + .long 0x81000000 + .long 0x4f000000 + .long 0xdc000000 + .long 0x22000000 + .long 0x2a000000 + .long 0x90000000 + .long 0x88000000 + .long 0x46000000 + .long 0xee000000 + .long 0xb8000000 + .long 0x14000000 + .long 0xde000000 + .long 0x5e000000 + .long 0x0b000000 + .long 0xdb000000 + .long 0xe0000000 + .long 0x32000000 + .long 0x3a000000 + .long 
0x0a000000 + .long 0x49000000 + .long 0x06000000 + .long 0x24000000 + .long 0x5c000000 + .long 0xc2000000 + .long 0xd3000000 + .long 0xac000000 + .long 0x62000000 + .long 0x91000000 + .long 0x95000000 + .long 0xe4000000 + .long 0x79000000 + .long 0xe7000000 + .long 0xc8000000 + .long 0x37000000 + .long 0x6d000000 + .long 0x8d000000 + .long 0xd5000000 + .long 0x4e000000 + .long 0xa9000000 + .long 0x6c000000 + .long 0x56000000 + .long 0xf4000000 + .long 0xea000000 + .long 0x65000000 + .long 0x7a000000 + .long 0xae000000 + .long 0x08000000 + .long 0xba000000 + .long 0x78000000 + .long 0x25000000 + .long 0x2e000000 + .long 0x1c000000 + .long 0xa6000000 + .long 0xb4000000 + .long 0xc6000000 + .long 0xe8000000 + .long 0xdd000000 + .long 0x74000000 + .long 0x1f000000 + .long 0x4b000000 + .long 0xbd000000 + .long 0x8b000000 + .long 0x8a000000 + .long 0x70000000 + .long 0x3e000000 + .long 0xb5000000 + .long 0x66000000 + .long 0x48000000 + .long 0x03000000 + .long 0xf6000000 + .long 0x0e000000 + .long 0x61000000 + .long 0x35000000 + .long 0x57000000 + .long 0xb9000000 + .long 0x86000000 + .long 0xc1000000 + .long 0x1d000000 + .long 0x9e000000 + .long 0xe1000000 + .long 0xf8000000 + .long 0x98000000 + .long 0x11000000 + .long 0x69000000 + .long 0xd9000000 + .long 0x8e000000 + .long 0x94000000 + .long 0x9b000000 + .long 0x1e000000 + .long 0x87000000 + .long 0xe9000000 + .long 0xce000000 + .long 0x55000000 + .long 0x28000000 + .long 0xdf000000 + .long 0x8c000000 + .long 0xa1000000 + .long 0x89000000 + .long 0x0d000000 + .long 0xbf000000 + .long 0xe6000000 + .long 0x42000000 + .long 0x68000000 + .long 0x41000000 + .long 0x99000000 + .long 0x2d000000 + .long 0x0f000000 + .long 0xb0000000 + .long 0x54000000 + .long 0xbb000000 + .long 0x16000000 + + +// InvSubBytes embedded in words tables. + .globl _AESInvSubBytesWordTable + .private_extern _AESInvSubBytesWordTable + .align 2 +_AESInvSubBytesWordTable: + // Table 0. 
+ .long 0x00000052 + .long 0x00000009 + .long 0x0000006a + .long 0x000000d5 + .long 0x00000030 + .long 0x00000036 + .long 0x000000a5 + .long 0x00000038 + .long 0x000000bf + .long 0x00000040 + .long 0x000000a3 + .long 0x0000009e + .long 0x00000081 + .long 0x000000f3 + .long 0x000000d7 + .long 0x000000fb + .long 0x0000007c + .long 0x000000e3 + .long 0x00000039 + .long 0x00000082 + .long 0x0000009b + .long 0x0000002f + .long 0x000000ff + .long 0x00000087 + .long 0x00000034 + .long 0x0000008e + .long 0x00000043 + .long 0x00000044 + .long 0x000000c4 + .long 0x000000de + .long 0x000000e9 + .long 0x000000cb + .long 0x00000054 + .long 0x0000007b + .long 0x00000094 + .long 0x00000032 + .long 0x000000a6 + .long 0x000000c2 + .long 0x00000023 + .long 0x0000003d + .long 0x000000ee + .long 0x0000004c + .long 0x00000095 + .long 0x0000000b + .long 0x00000042 + .long 0x000000fa + .long 0x000000c3 + .long 0x0000004e + .long 0x00000008 + .long 0x0000002e + .long 0x000000a1 + .long 0x00000066 + .long 0x00000028 + .long 0x000000d9 + .long 0x00000024 + .long 0x000000b2 + .long 0x00000076 + .long 0x0000005b + .long 0x000000a2 + .long 0x00000049 + .long 0x0000006d + .long 0x0000008b + .long 0x000000d1 + .long 0x00000025 + .long 0x00000072 + .long 0x000000f8 + .long 0x000000f6 + .long 0x00000064 + .long 0x00000086 + .long 0x00000068 + .long 0x00000098 + .long 0x00000016 + .long 0x000000d4 + .long 0x000000a4 + .long 0x0000005c + .long 0x000000cc + .long 0x0000005d + .long 0x00000065 + .long 0x000000b6 + .long 0x00000092 + .long 0x0000006c + .long 0x00000070 + .long 0x00000048 + .long 0x00000050 + .long 0x000000fd + .long 0x000000ed + .long 0x000000b9 + .long 0x000000da + .long 0x0000005e + .long 0x00000015 + .long 0x00000046 + .long 0x00000057 + .long 0x000000a7 + .long 0x0000008d + .long 0x0000009d + .long 0x00000084 + .long 0x00000090 + .long 0x000000d8 + .long 0x000000ab + .long 0x00000000 + .long 0x0000008c + .long 0x000000bc + .long 0x000000d3 + .long 0x0000000a + .long 0x000000f7 + 
.long 0x000000e4 + .long 0x00000058 + .long 0x00000005 + .long 0x000000b8 + .long 0x000000b3 + .long 0x00000045 + .long 0x00000006 + .long 0x000000d0 + .long 0x0000002c + .long 0x0000001e + .long 0x0000008f + .long 0x000000ca + .long 0x0000003f + .long 0x0000000f + .long 0x00000002 + .long 0x000000c1 + .long 0x000000af + .long 0x000000bd + .long 0x00000003 + .long 0x00000001 + .long 0x00000013 + .long 0x0000008a + .long 0x0000006b + .long 0x0000003a + .long 0x00000091 + .long 0x00000011 + .long 0x00000041 + .long 0x0000004f + .long 0x00000067 + .long 0x000000dc + .long 0x000000ea + .long 0x00000097 + .long 0x000000f2 + .long 0x000000cf + .long 0x000000ce + .long 0x000000f0 + .long 0x000000b4 + .long 0x000000e6 + .long 0x00000073 + .long 0x00000096 + .long 0x000000ac + .long 0x00000074 + .long 0x00000022 + .long 0x000000e7 + .long 0x000000ad + .long 0x00000035 + .long 0x00000085 + .long 0x000000e2 + .long 0x000000f9 + .long 0x00000037 + .long 0x000000e8 + .long 0x0000001c + .long 0x00000075 + .long 0x000000df + .long 0x0000006e + .long 0x00000047 + .long 0x000000f1 + .long 0x0000001a + .long 0x00000071 + .long 0x0000001d + .long 0x00000029 + .long 0x000000c5 + .long 0x00000089 + .long 0x0000006f + .long 0x000000b7 + .long 0x00000062 + .long 0x0000000e + .long 0x000000aa + .long 0x00000018 + .long 0x000000be + .long 0x0000001b + .long 0x000000fc + .long 0x00000056 + .long 0x0000003e + .long 0x0000004b + .long 0x000000c6 + .long 0x000000d2 + .long 0x00000079 + .long 0x00000020 + .long 0x0000009a + .long 0x000000db + .long 0x000000c0 + .long 0x000000fe + .long 0x00000078 + .long 0x000000cd + .long 0x0000005a + .long 0x000000f4 + .long 0x0000001f + .long 0x000000dd + .long 0x000000a8 + .long 0x00000033 + .long 0x00000088 + .long 0x00000007 + .long 0x000000c7 + .long 0x00000031 + .long 0x000000b1 + .long 0x00000012 + .long 0x00000010 + .long 0x00000059 + .long 0x00000027 + .long 0x00000080 + .long 0x000000ec + .long 0x0000005f + .long 0x00000060 + .long 0x00000051 + 
.long 0x0000007f + .long 0x000000a9 + .long 0x00000019 + .long 0x000000b5 + .long 0x0000004a + .long 0x0000000d + .long 0x0000002d + .long 0x000000e5 + .long 0x0000007a + .long 0x0000009f + .long 0x00000093 + .long 0x000000c9 + .long 0x0000009c + .long 0x000000ef + .long 0x000000a0 + .long 0x000000e0 + .long 0x0000003b + .long 0x0000004d + .long 0x000000ae + .long 0x0000002a + .long 0x000000f5 + .long 0x000000b0 + .long 0x000000c8 + .long 0x000000eb + .long 0x000000bb + .long 0x0000003c + .long 0x00000083 + .long 0x00000053 + .long 0x00000099 + .long 0x00000061 + .long 0x00000017 + .long 0x0000002b + .long 0x00000004 + .long 0x0000007e + .long 0x000000ba + .long 0x00000077 + .long 0x000000d6 + .long 0x00000026 + .long 0x000000e1 + .long 0x00000069 + .long 0x00000014 + .long 0x00000063 + .long 0x00000055 + .long 0x00000021 + .long 0x0000000c + .long 0x0000007d + // Table 1. + .long 0x00005200 + .long 0x00000900 + .long 0x00006a00 + .long 0x0000d500 + .long 0x00003000 + .long 0x00003600 + .long 0x0000a500 + .long 0x00003800 + .long 0x0000bf00 + .long 0x00004000 + .long 0x0000a300 + .long 0x00009e00 + .long 0x00008100 + .long 0x0000f300 + .long 0x0000d700 + .long 0x0000fb00 + .long 0x00007c00 + .long 0x0000e300 + .long 0x00003900 + .long 0x00008200 + .long 0x00009b00 + .long 0x00002f00 + .long 0x0000ff00 + .long 0x00008700 + .long 0x00003400 + .long 0x00008e00 + .long 0x00004300 + .long 0x00004400 + .long 0x0000c400 + .long 0x0000de00 + .long 0x0000e900 + .long 0x0000cb00 + .long 0x00005400 + .long 0x00007b00 + .long 0x00009400 + .long 0x00003200 + .long 0x0000a600 + .long 0x0000c200 + .long 0x00002300 + .long 0x00003d00 + .long 0x0000ee00 + .long 0x00004c00 + .long 0x00009500 + .long 0x00000b00 + .long 0x00004200 + .long 0x0000fa00 + .long 0x0000c300 + .long 0x00004e00 + .long 0x00000800 + .long 0x00002e00 + .long 0x0000a100 + .long 0x00006600 + .long 0x00002800 + .long 0x0000d900 + .long 0x00002400 + .long 0x0000b200 + .long 0x00007600 + .long 0x00005b00 + .long 
0x0000a200 + .long 0x00004900 + .long 0x00006d00 + .long 0x00008b00 + .long 0x0000d100 + .long 0x00002500 + .long 0x00007200 + .long 0x0000f800 + .long 0x0000f600 + .long 0x00006400 + .long 0x00008600 + .long 0x00006800 + .long 0x00009800 + .long 0x00001600 + .long 0x0000d400 + .long 0x0000a400 + .long 0x00005c00 + .long 0x0000cc00 + .long 0x00005d00 + .long 0x00006500 + .long 0x0000b600 + .long 0x00009200 + .long 0x00006c00 + .long 0x00007000 + .long 0x00004800 + .long 0x00005000 + .long 0x0000fd00 + .long 0x0000ed00 + .long 0x0000b900 + .long 0x0000da00 + .long 0x00005e00 + .long 0x00001500 + .long 0x00004600 + .long 0x00005700 + .long 0x0000a700 + .long 0x00008d00 + .long 0x00009d00 + .long 0x00008400 + .long 0x00009000 + .long 0x0000d800 + .long 0x0000ab00 + .long 0x00000000 + .long 0x00008c00 + .long 0x0000bc00 + .long 0x0000d300 + .long 0x00000a00 + .long 0x0000f700 + .long 0x0000e400 + .long 0x00005800 + .long 0x00000500 + .long 0x0000b800 + .long 0x0000b300 + .long 0x00004500 + .long 0x00000600 + .long 0x0000d000 + .long 0x00002c00 + .long 0x00001e00 + .long 0x00008f00 + .long 0x0000ca00 + .long 0x00003f00 + .long 0x00000f00 + .long 0x00000200 + .long 0x0000c100 + .long 0x0000af00 + .long 0x0000bd00 + .long 0x00000300 + .long 0x00000100 + .long 0x00001300 + .long 0x00008a00 + .long 0x00006b00 + .long 0x00003a00 + .long 0x00009100 + .long 0x00001100 + .long 0x00004100 + .long 0x00004f00 + .long 0x00006700 + .long 0x0000dc00 + .long 0x0000ea00 + .long 0x00009700 + .long 0x0000f200 + .long 0x0000cf00 + .long 0x0000ce00 + .long 0x0000f000 + .long 0x0000b400 + .long 0x0000e600 + .long 0x00007300 + .long 0x00009600 + .long 0x0000ac00 + .long 0x00007400 + .long 0x00002200 + .long 0x0000e700 + .long 0x0000ad00 + .long 0x00003500 + .long 0x00008500 + .long 0x0000e200 + .long 0x0000f900 + .long 0x00003700 + .long 0x0000e800 + .long 0x00001c00 + .long 0x00007500 + .long 0x0000df00 + .long 0x00006e00 + .long 0x00004700 + .long 0x0000f100 + .long 0x00001a00 + .long 
0x00007100 + .long 0x00001d00 + .long 0x00002900 + .long 0x0000c500 + .long 0x00008900 + .long 0x00006f00 + .long 0x0000b700 + .long 0x00006200 + .long 0x00000e00 + .long 0x0000aa00 + .long 0x00001800 + .long 0x0000be00 + .long 0x00001b00 + .long 0x0000fc00 + .long 0x00005600 + .long 0x00003e00 + .long 0x00004b00 + .long 0x0000c600 + .long 0x0000d200 + .long 0x00007900 + .long 0x00002000 + .long 0x00009a00 + .long 0x0000db00 + .long 0x0000c000 + .long 0x0000fe00 + .long 0x00007800 + .long 0x0000cd00 + .long 0x00005a00 + .long 0x0000f400 + .long 0x00001f00 + .long 0x0000dd00 + .long 0x0000a800 + .long 0x00003300 + .long 0x00008800 + .long 0x00000700 + .long 0x0000c700 + .long 0x00003100 + .long 0x0000b100 + .long 0x00001200 + .long 0x00001000 + .long 0x00005900 + .long 0x00002700 + .long 0x00008000 + .long 0x0000ec00 + .long 0x00005f00 + .long 0x00006000 + .long 0x00005100 + .long 0x00007f00 + .long 0x0000a900 + .long 0x00001900 + .long 0x0000b500 + .long 0x00004a00 + .long 0x00000d00 + .long 0x00002d00 + .long 0x0000e500 + .long 0x00007a00 + .long 0x00009f00 + .long 0x00009300 + .long 0x0000c900 + .long 0x00009c00 + .long 0x0000ef00 + .long 0x0000a000 + .long 0x0000e000 + .long 0x00003b00 + .long 0x00004d00 + .long 0x0000ae00 + .long 0x00002a00 + .long 0x0000f500 + .long 0x0000b000 + .long 0x0000c800 + .long 0x0000eb00 + .long 0x0000bb00 + .long 0x00003c00 + .long 0x00008300 + .long 0x00005300 + .long 0x00009900 + .long 0x00006100 + .long 0x00001700 + .long 0x00002b00 + .long 0x00000400 + .long 0x00007e00 + .long 0x0000ba00 + .long 0x00007700 + .long 0x0000d600 + .long 0x00002600 + .long 0x0000e100 + .long 0x00006900 + .long 0x00001400 + .long 0x00006300 + .long 0x00005500 + .long 0x00002100 + .long 0x00000c00 + .long 0x00007d00 + // Table 2. 
+ .long 0x00520000 + .long 0x00090000 + .long 0x006a0000 + .long 0x00d50000 + .long 0x00300000 + .long 0x00360000 + .long 0x00a50000 + .long 0x00380000 + .long 0x00bf0000 + .long 0x00400000 + .long 0x00a30000 + .long 0x009e0000 + .long 0x00810000 + .long 0x00f30000 + .long 0x00d70000 + .long 0x00fb0000 + .long 0x007c0000 + .long 0x00e30000 + .long 0x00390000 + .long 0x00820000 + .long 0x009b0000 + .long 0x002f0000 + .long 0x00ff0000 + .long 0x00870000 + .long 0x00340000 + .long 0x008e0000 + .long 0x00430000 + .long 0x00440000 + .long 0x00c40000 + .long 0x00de0000 + .long 0x00e90000 + .long 0x00cb0000 + .long 0x00540000 + .long 0x007b0000 + .long 0x00940000 + .long 0x00320000 + .long 0x00a60000 + .long 0x00c20000 + .long 0x00230000 + .long 0x003d0000 + .long 0x00ee0000 + .long 0x004c0000 + .long 0x00950000 + .long 0x000b0000 + .long 0x00420000 + .long 0x00fa0000 + .long 0x00c30000 + .long 0x004e0000 + .long 0x00080000 + .long 0x002e0000 + .long 0x00a10000 + .long 0x00660000 + .long 0x00280000 + .long 0x00d90000 + .long 0x00240000 + .long 0x00b20000 + .long 0x00760000 + .long 0x005b0000 + .long 0x00a20000 + .long 0x00490000 + .long 0x006d0000 + .long 0x008b0000 + .long 0x00d10000 + .long 0x00250000 + .long 0x00720000 + .long 0x00f80000 + .long 0x00f60000 + .long 0x00640000 + .long 0x00860000 + .long 0x00680000 + .long 0x00980000 + .long 0x00160000 + .long 0x00d40000 + .long 0x00a40000 + .long 0x005c0000 + .long 0x00cc0000 + .long 0x005d0000 + .long 0x00650000 + .long 0x00b60000 + .long 0x00920000 + .long 0x006c0000 + .long 0x00700000 + .long 0x00480000 + .long 0x00500000 + .long 0x00fd0000 + .long 0x00ed0000 + .long 0x00b90000 + .long 0x00da0000 + .long 0x005e0000 + .long 0x00150000 + .long 0x00460000 + .long 0x00570000 + .long 0x00a70000 + .long 0x008d0000 + .long 0x009d0000 + .long 0x00840000 + .long 0x00900000 + .long 0x00d80000 + .long 0x00ab0000 + .long 0x00000000 + .long 0x008c0000 + .long 0x00bc0000 + .long 0x00d30000 + .long 0x000a0000 + .long 0x00f70000 + 
.long 0x00e40000 + .long 0x00580000 + .long 0x00050000 + .long 0x00b80000 + .long 0x00b30000 + .long 0x00450000 + .long 0x00060000 + .long 0x00d00000 + .long 0x002c0000 + .long 0x001e0000 + .long 0x008f0000 + .long 0x00ca0000 + .long 0x003f0000 + .long 0x000f0000 + .long 0x00020000 + .long 0x00c10000 + .long 0x00af0000 + .long 0x00bd0000 + .long 0x00030000 + .long 0x00010000 + .long 0x00130000 + .long 0x008a0000 + .long 0x006b0000 + .long 0x003a0000 + .long 0x00910000 + .long 0x00110000 + .long 0x00410000 + .long 0x004f0000 + .long 0x00670000 + .long 0x00dc0000 + .long 0x00ea0000 + .long 0x00970000 + .long 0x00f20000 + .long 0x00cf0000 + .long 0x00ce0000 + .long 0x00f00000 + .long 0x00b40000 + .long 0x00e60000 + .long 0x00730000 + .long 0x00960000 + .long 0x00ac0000 + .long 0x00740000 + .long 0x00220000 + .long 0x00e70000 + .long 0x00ad0000 + .long 0x00350000 + .long 0x00850000 + .long 0x00e20000 + .long 0x00f90000 + .long 0x00370000 + .long 0x00e80000 + .long 0x001c0000 + .long 0x00750000 + .long 0x00df0000 + .long 0x006e0000 + .long 0x00470000 + .long 0x00f10000 + .long 0x001a0000 + .long 0x00710000 + .long 0x001d0000 + .long 0x00290000 + .long 0x00c50000 + .long 0x00890000 + .long 0x006f0000 + .long 0x00b70000 + .long 0x00620000 + .long 0x000e0000 + .long 0x00aa0000 + .long 0x00180000 + .long 0x00be0000 + .long 0x001b0000 + .long 0x00fc0000 + .long 0x00560000 + .long 0x003e0000 + .long 0x004b0000 + .long 0x00c60000 + .long 0x00d20000 + .long 0x00790000 + .long 0x00200000 + .long 0x009a0000 + .long 0x00db0000 + .long 0x00c00000 + .long 0x00fe0000 + .long 0x00780000 + .long 0x00cd0000 + .long 0x005a0000 + .long 0x00f40000 + .long 0x001f0000 + .long 0x00dd0000 + .long 0x00a80000 + .long 0x00330000 + .long 0x00880000 + .long 0x00070000 + .long 0x00c70000 + .long 0x00310000 + .long 0x00b10000 + .long 0x00120000 + .long 0x00100000 + .long 0x00590000 + .long 0x00270000 + .long 0x00800000 + .long 0x00ec0000 + .long 0x005f0000 + .long 0x00600000 + .long 0x00510000 + 
.long 0x007f0000 + .long 0x00a90000 + .long 0x00190000 + .long 0x00b50000 + .long 0x004a0000 + .long 0x000d0000 + .long 0x002d0000 + .long 0x00e50000 + .long 0x007a0000 + .long 0x009f0000 + .long 0x00930000 + .long 0x00c90000 + .long 0x009c0000 + .long 0x00ef0000 + .long 0x00a00000 + .long 0x00e00000 + .long 0x003b0000 + .long 0x004d0000 + .long 0x00ae0000 + .long 0x002a0000 + .long 0x00f50000 + .long 0x00b00000 + .long 0x00c80000 + .long 0x00eb0000 + .long 0x00bb0000 + .long 0x003c0000 + .long 0x00830000 + .long 0x00530000 + .long 0x00990000 + .long 0x00610000 + .long 0x00170000 + .long 0x002b0000 + .long 0x00040000 + .long 0x007e0000 + .long 0x00ba0000 + .long 0x00770000 + .long 0x00d60000 + .long 0x00260000 + .long 0x00e10000 + .long 0x00690000 + .long 0x00140000 + .long 0x00630000 + .long 0x00550000 + .long 0x00210000 + .long 0x000c0000 + .long 0x007d0000 + // Table 3. + .long 0x52000000 + .long 0x09000000 + .long 0x6a000000 + .long 0xd5000000 + .long 0x30000000 + .long 0x36000000 + .long 0xa5000000 + .long 0x38000000 + .long 0xbf000000 + .long 0x40000000 + .long 0xa3000000 + .long 0x9e000000 + .long 0x81000000 + .long 0xf3000000 + .long 0xd7000000 + .long 0xfb000000 + .long 0x7c000000 + .long 0xe3000000 + .long 0x39000000 + .long 0x82000000 + .long 0x9b000000 + .long 0x2f000000 + .long 0xff000000 + .long 0x87000000 + .long 0x34000000 + .long 0x8e000000 + .long 0x43000000 + .long 0x44000000 + .long 0xc4000000 + .long 0xde000000 + .long 0xe9000000 + .long 0xcb000000 + .long 0x54000000 + .long 0x7b000000 + .long 0x94000000 + .long 0x32000000 + .long 0xa6000000 + .long 0xc2000000 + .long 0x23000000 + .long 0x3d000000 + .long 0xee000000 + .long 0x4c000000 + .long 0x95000000 + .long 0x0b000000 + .long 0x42000000 + .long 0xfa000000 + .long 0xc3000000 + .long 0x4e000000 + .long 0x08000000 + .long 0x2e000000 + .long 0xa1000000 + .long 0x66000000 + .long 0x28000000 + .long 0xd9000000 + .long 0x24000000 + .long 0xb2000000 + .long 0x76000000 + .long 0x5b000000 + .long 
0xa2000000 + .long 0x49000000 + .long 0x6d000000 + .long 0x8b000000 + .long 0xd1000000 + .long 0x25000000 + .long 0x72000000 + .long 0xf8000000 + .long 0xf6000000 + .long 0x64000000 + .long 0x86000000 + .long 0x68000000 + .long 0x98000000 + .long 0x16000000 + .long 0xd4000000 + .long 0xa4000000 + .long 0x5c000000 + .long 0xcc000000 + .long 0x5d000000 + .long 0x65000000 + .long 0xb6000000 + .long 0x92000000 + .long 0x6c000000 + .long 0x70000000 + .long 0x48000000 + .long 0x50000000 + .long 0xfd000000 + .long 0xed000000 + .long 0xb9000000 + .long 0xda000000 + .long 0x5e000000 + .long 0x15000000 + .long 0x46000000 + .long 0x57000000 + .long 0xa7000000 + .long 0x8d000000 + .long 0x9d000000 + .long 0x84000000 + .long 0x90000000 + .long 0xd8000000 + .long 0xab000000 + .long 0x00000000 + .long 0x8c000000 + .long 0xbc000000 + .long 0xd3000000 + .long 0x0a000000 + .long 0xf7000000 + .long 0xe4000000 + .long 0x58000000 + .long 0x05000000 + .long 0xb8000000 + .long 0xb3000000 + .long 0x45000000 + .long 0x06000000 + .long 0xd0000000 + .long 0x2c000000 + .long 0x1e000000 + .long 0x8f000000 + .long 0xca000000 + .long 0x3f000000 + .long 0x0f000000 + .long 0x02000000 + .long 0xc1000000 + .long 0xaf000000 + .long 0xbd000000 + .long 0x03000000 + .long 0x01000000 + .long 0x13000000 + .long 0x8a000000 + .long 0x6b000000 + .long 0x3a000000 + .long 0x91000000 + .long 0x11000000 + .long 0x41000000 + .long 0x4f000000 + .long 0x67000000 + .long 0xdc000000 + .long 0xea000000 + .long 0x97000000 + .long 0xf2000000 + .long 0xcf000000 + .long 0xce000000 + .long 0xf0000000 + .long 0xb4000000 + .long 0xe6000000 + .long 0x73000000 + .long 0x96000000 + .long 0xac000000 + .long 0x74000000 + .long 0x22000000 + .long 0xe7000000 + .long 0xad000000 + .long 0x35000000 + .long 0x85000000 + .long 0xe2000000 + .long 0xf9000000 + .long 0x37000000 + .long 0xe8000000 + .long 0x1c000000 + .long 0x75000000 + .long 0xdf000000 + .long 0x6e000000 + .long 0x47000000 + .long 0xf1000000 + .long 0x1a000000 + .long 
0x71000000 + .long 0x1d000000 + .long 0x29000000 + .long 0xc5000000 + .long 0x89000000 + .long 0x6f000000 + .long 0xb7000000 + .long 0x62000000 + .long 0x0e000000 + .long 0xaa000000 + .long 0x18000000 + .long 0xbe000000 + .long 0x1b000000 + .long 0xfc000000 + .long 0x56000000 + .long 0x3e000000 + .long 0x4b000000 + .long 0xc6000000 + .long 0xd2000000 + .long 0x79000000 + .long 0x20000000 + .long 0x9a000000 + .long 0xdb000000 + .long 0xc0000000 + .long 0xfe000000 + .long 0x78000000 + .long 0xcd000000 + .long 0x5a000000 + .long 0xf4000000 + .long 0x1f000000 + .long 0xdd000000 + .long 0xa8000000 + .long 0x33000000 + .long 0x88000000 + .long 0x07000000 + .long 0xc7000000 + .long 0x31000000 + .long 0xb1000000 + .long 0x12000000 + .long 0x10000000 + .long 0x59000000 + .long 0x27000000 + .long 0x80000000 + .long 0xec000000 + .long 0x5f000000 + .long 0x60000000 + .long 0x51000000 + .long 0x7f000000 + .long 0xa9000000 + .long 0x19000000 + .long 0xb5000000 + .long 0x4a000000 + .long 0x0d000000 + .long 0x2d000000 + .long 0xe5000000 + .long 0x7a000000 + .long 0x9f000000 + .long 0x93000000 + .long 0xc9000000 + .long 0x9c000000 + .long 0xef000000 + .long 0xa0000000 + .long 0xe0000000 + .long 0x3b000000 + .long 0x4d000000 + .long 0xae000000 + .long 0x2a000000 + .long 0xf5000000 + .long 0xb0000000 + .long 0xc8000000 + .long 0xeb000000 + .long 0xbb000000 + .long 0x3c000000 + .long 0x83000000 + .long 0x53000000 + .long 0x99000000 + .long 0x61000000 + .long 0x17000000 + .long 0x2b000000 + .long 0x04000000 + .long 0x7e000000 + .long 0xba000000 + .long 0x77000000 + .long 0xd6000000 + .long 0x26000000 + .long 0xe1000000 + .long 0x69000000 + .long 0x14000000 + .long 0x63000000 + .long 0x55000000 + .long 0x21000000 + .long 0x0c000000 + .long 0x7d000000 diff --git a/bsd/crypto/aes/i386/EncryptDecrypt.s b/bsd/crypto/aes/i386/EncryptDecrypt.s new file mode 100644 index 000000000..6a6147a11 --- /dev/null +++ b/bsd/crypto/aes/i386/EncryptDecrypt.s @@ -0,0 +1,607 @@ +/* This file defines 
_aes_encrypt or _aes_decrypt, according to the value of + the Select preprocessor symbol. This file is designed to be included in + another assembly file using the preprocessor #include directive, to benefit + from some assembly-time calculations. + + These two routines are nearly identical. They differ only in the tables + they use, the direction they iterate through the key, and the permutation + performed on part of the state. + + Written by Eric Postpischil, January 2008. +*/ + +/* add AES HW detection and HW-specific program branch cclee 3-12-10 */ +#ifdef KERNEL +#include <i386/cpu_capabilities.h> // NOTE(review): operand restored; presumably supplies kHasAES -- confirm path. +#else +#include <System/i386/cpu_capabilities.h> // NOTE(review): operand restored; presumably supplies kHasAES and _COMM_PAGE_CPU_CAPABILITIES -- confirm path. +#endif + +#if Select == 0 + #define Name _aes_encrypt // Routine name. + #define MTable _AESEncryptTable // Main table. + #define FTable _AESSubBytesWordTable // Final table. + #define P0 S0 // State permutation. + #define P1 S1 + #define P2 S2 + #define P3 S3 + #define Increment +16 // ExpandedKey increment. +#elif Select == 1 + #define Name _aes_decrypt // Routine name. + #define MTable _AESDecryptTable // Main table. + #define FTable _AESInvSubBytesWordTable // Final table. + #define P0 S2 // State permutation. + #define P1 S3 + #define P2 S0 + #define P3 S1 + #define Increment -16 // ExpandedKey increment. +#elif Select == 2 // Same tables, permutation and increment as Select 0; only Name differs. + #define Name _aes_encrypt_xmm_no_save // Routine name. + #define MTable _AESEncryptTable // Main table. + #define FTable _AESSubBytesWordTable // Final table. + #define P0 S0 // State permutation. + #define P1 S1 + #define P2 S2 + #define P3 S3 + #define Increment +16 // ExpandedKey increment. +#elif Select == 3 // Same tables, permutation and increment as Select 1; only Name differs. + #define Name _aes_decrypt_xmm_no_save // Routine name. + #define MTable _AESDecryptTable // Main table. + #define FTable _AESInvSubBytesWordTable // Final table. + #define P0 S2 // State permutation. + #define P1 S3 + #define P2 S0 + #define P3 S1 + #define Increment -16 // ExpandedKey increment. +#endif // Select + + +/* Routine: + + _aes_encrypt or _aes_encrypt_xmm_no_save (if Select is 0 or 2), or + _aes_decrypt or _aes_decrypt_xmm_no_save (if Select is 1 or 3). 
+ + Function: + + Perform the AES cipher or its inverse as defined in Federal Information + Processing Standards Publication 197 (FIPS-197), November 26, 2001. + + The inverse cipher here is the "Equivalent Inverse Cipher" in FIPS-197. + + Input: + + Constant data: + + The following names must be locally defined so the assembler + can calculate certain offsets. + + For encryption: + + static const Word _AESEncryptTable[4][256]. + + _AESEncryptTable[i] contains the tables T[i] defined in AES + Proposal: Rijndael, version 2, 03/09/99, by Joan Daemen and + Vincent Rijmen, section 5.2.1, page 18. These tables + combine the SubBytes and MixColumns operations. + + static const Word _AESSubBytesWordTable[4][256]. + + _AESSubBytesWordTable[i][j] = SubBytes(j) << 8*i, where + SubBytes is defined in FIPS-197. _AESSubBytesWordTable + differs from _AESEncryptTable in that it does not include + the MixColumn operation. It is used in performing the last + round, which differs from the previous rounds in that it + does not include the MixColumn operation. + + For decryption: + + static const Word _AESDecryptTable[4][256]. + + The analog of _AESEncryptTable for decryption. + + static const Word _AESInvSubBytesWordTable[4][256]. + + _AESInvSubBytesWordTable[i][j] = InvSubBytes(j) << 8*i, + where InvSubBytes is defined in FIPS-197. + _AESInvSubBytesWordTable differs from _AESDecryptTable in + that it does not include the InvMixColumn operation. It is + used in performing the last round, which differs from the + previous rounds in that it does not include the + InvMixColumn operation. + + Arguments: + + const Byte *InputText. + + Address of input, 16 bytes. Best if four-byte aligned. + + Byte *OutputText. + + Address of output, 16 bytes. Best if four-byte aligned. + + aes_encrypt_ctx *Context or aes_decrypt_ctx *Context + + aes_encrypt_ctx and aes_decrypt_ctx are identical except the + former is used for encryption and the latter for decryption.
+ + Each is a structure containing the expanded key beginning at + offset ContextKey and a four-byte "key length" beginning at + offset ContextKeyLength. The "key length" is the number of + bytes from the start of the first round key to the start of the + last round key. That is 16 less than the number of bytes in + the entire key. + + Output: + + Encrypted or decrypted data is written to *OutputText. + + Return: + + aes_rval // -1 if "key length" is invalid. 0 otherwise. +*/ + + .text + .globl Name +Name: + + // detect AES HW, cclee 3-13-10 +#if Select < 2 // only for aes_encrypt/aes_decrypt +#if defined __x86_64__ + movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities + mov (%rax), %eax // %eax = __cpu_capabilities +#else +#if defined KERNEL + leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities + mov (%eax), %eax // %eax = __cpu_capabilities +#else + mov _COMM_PAGE_CPU_CAPABILITIES, %eax // %eax = word at _COMM_PAGE_CPU_CAPABILITIES +#endif +#endif + test $(kHasAES), %eax // __cpu_capabilities & kHasAES +#if Select == 0 + jne _aes_encrypt_hw // if AES HW detected, branch to HW specific code +#else + jne _aes_decrypt_hw // if AES HW detected, branch to HW specific code +#endif +#endif // Select + + // Push new stack frame. + push r5 + + /* Save registers and set SaveSize to the number of bytes pushed onto the + stack so far, including the caller's return address. + */ + push r3 + #if defined __i386__ + push r6 + push r7 + #define SaveSize (5*4) // Return address, r5, r3, r6, r7. + #else + #define SaveSize (3*8) // Return address, r5, r3. + #endif + + /* Number of bytes used for local variables: + + 4 (i386) or 0 (x86_64) bytes for ExpandedKeyEnd. + + 5 (i386) or 3 (x86_64) 16-byte spaces to save XMM registers. + */ + #define LocalsSize (Arch(4, 0) + Arch(5, 3)*16) + + #if 0 < LocalsSize + // Padding to position stack pointer at a multiple of 16 bytes. + #define Padding (15 & -(SaveSize + LocalsSize)) + sub $Padding + LocalsSize, r4 // Allocate space on stack.
+ #else + #define Padding 0 + #endif + +#ifdef KERNEL +#if Select < 2 + // Save the XMM registers that are used as scratch space below. + movaps %xmm0, 0*16(r4) + movaps %xmm1, 1*16(r4) + movaps %xmm2, 2*16(r4) +#if defined __i386__ + movaps %xmm3, 3*16(r4) + movaps %xmm4, 4*16(r4) +#endif +#endif // Select +#endif // KERNEL + +#if defined __i386__ + + // Number of bytes from caller's stack pointer to ours. + #define StackFrame (SaveSize + Padding + LocalsSize) + + // Define location of argument i (presuming 4-byte arguments). + #define Argument(i) StackFrame+4*(i)(%esp) + + #define ArgInputText Argument(0) + #define ArgOutputText Argument(1) + #define ArgContext Argument(2) + +#elif defined __x86_64__ + + // Arguments. + #define InputText r7 // Used early then overwritten for other use. + #define OutputText r6 // Needed near end of routine. + #define ArgContext r2 + /* The argument passed in r2 overlaps registers we need for other + work, so it must be moved early in the routine. + */ + +#endif + +#define BaseP Arch(r6, r9) // Base pointer for addressing global data. +#define ExpandedKey Arch(t0, r10) // Address of expanded key. + +/* The Work registers defined below are used to hold parts of the AES state + while we dissect or assemble it. They must be assigned to the A, B, C, and + D registers so that we can access the bytes in %al, %ah, and so on. +*/ +#define Work0d r0d +#define Work0l r0l +#define Work0h r0h +#define Work1d r3d +#define Work1l r3l +#define Work1h r3h +#define Work2d r1d +#define Work2l r1l +#define Work2h r1h +#define Work3d r2d +#define Work3l r2l +#define Work3h r2h + +#define t0 r5 +#define t0d r5d // Low 32 bits of t0. +#define t0l r5l // Low byte of t0. + +#define t1 r7 // Second temporary; it is the first Arch() choice for S0 below. + +/* S0, S1, S2, and S3 are where we assemble the new AES state when computing + a regular round. S1, S2, and S3 are assigned to the Work registers, but + S0 needs to go somewhere else because Work0 holds part of the old state.
+*/ +#define S0 Arch(t1, r8d) +#define S1 Work1d +#define S2 Work2d +#define S3 Work3d + +/* These XMM registers are used as holding space, because it is faster to + spill to these registers than to the stack. (On x86_64, we do not need + to spill, because there are additional general registers available. + However, using more general registers requires saving them to the stack + and restoring them. I timed it, and no time was saved.) +*/ +#define vS1 %xmm0 +#define vS2 %xmm1 +#define vS3 %xmm2 +#if defined __i386__ + #define vExpandedKey %xmm3 + #define vIncrement %xmm4 +#endif + + // Get address of expanded key. + mov ArgContext, ExpandedKey + #if 0 != ContextKey + add $ContextKey, ExpandedKey + #endif + +/* Store sentinel value of ExpandedKey on the stack on i386, a register on + x86_64. +*/ +#define ExpandedKeyEnd Arch(5*16(r4), r11) + + // Get and check "key length". NOTE(review): ContextKeyLength is applied here after ContextKey has been added to ExpandedKey, whereas the decryption key expansion code stores it relative to the context pointer itself; the two agree only if ContextKey is 0 -- confirm. + movzx ContextKeyLength(ExpandedKey), r0 + cmp $160, r0 // 10*16. + je 2f + cmp $192, r0 // 12*16. + je 2f + cmp $224, r0 // 14*16. + je 2f + mov $-1, r0 // Return error. + jmp 9f +2: + + #if (Select == 0 || Select == 2) + // For encryption, prepare to iterate forward through expanded key. + add ExpandedKey, r0 + mov r0, ExpandedKeyEnd + #else + // For decryption, prepare to iterate backward through expanded key. + mov ExpandedKey, ExpandedKeyEnd + add r0, ExpandedKey + #endif + + // Initialize State from input text. + #if defined __i386__ + mov ArgInputText, BaseP + #define InputText BaseP + #endif + mov 0*4(InputText), Work0d + mov 1*4(InputText), S1 + mov 2*4(InputText), S2 + mov 3*4(InputText), S3 +#undef InputText // Register is reused after this for other purposes. + + // Add round key and save results. + xor 0*4(ExpandedKey), Work0d // S0 is in dissection register. + xor 1*4(ExpandedKey), S1 + movd S1, vS1 // Save S1 to S3 in vector registers. + xor 2*4(ExpandedKey), S2 + movd S2, vS2 + xor 3*4(ExpandedKey), S3 + movd S3, vS3 + + add $Increment, ExpandedKey // Advance to next round key.
+ + #if defined __i386__ + // Save expanded key address and increment in vector registers, because ExpandedKey shares its register with t0 on i386. + mov $Increment, t1 + movp ExpandedKey, vExpandedKey + movp t1, vIncrement + #endif + + // Set up relative addressing. + #if defined __i386__ + + // Get address of local label 0 in BaseP. + call 0f // Push program counter onto stack. + 0: + pop BaseP // Get program counter. + + // Define macros to help address data. +#define LookupM(table, index) MTable-0b+(table)*TableSize(BaseP, index, 4) +#define LookupF(table, index) FTable-0b+(table)*TableSize(BaseP, index, 4) + + #elif defined __x86_64__ + + lea MTable(%rip), BaseP + + // Define macros to help address data. + #define LookupM(table, index) (table)*TableSize(BaseP, index, 4) + #define LookupF(table, index) (table)*TableSize(BaseP, index, 4) + +/* With these definitions of LookupM and LookupF, BaseP must be loaded with + the address of the table at the point where it is used. So we need an + instruction to change BaseP after we are done with MTable and before we + start using FTable. I would prefer to use something like: + + .set FMinusM, FTable - MTable + #define LookupF(table, index) \ + FMinusM+(table)*TableSize(BaseP, index, 4) + + Then BaseP would not need to change. However, this fails due to an + assembler/linker bug, Radar 5683882. +*/ + + #endif + + // Get round key. + mov 0*4(ExpandedKey), S0 + mov 1*4(ExpandedKey), S1 + mov 2*4(ExpandedKey), S2 + mov 3*4(ExpandedKey), S3 + +1: + /* Word 0 of the current state must be in Work0 now, and the next round + key must be in S0 to S3. + */ + + // Process previous S0. + movzx Work0l, t0 + xor LookupM(0, t0), S0 + movzx Work0h, t0d + xor LookupM(1, t0), P3 + shr $16, Work0d + movzx Work0l, t0d + xor LookupM(2, t0), S2 + movzx Work0h, t0d + xor LookupM(3, t0), P1 + + // Process previous S1.
+ movd vS1, Work0d + movzx Work0l, t0d + xor LookupM(0, t0), S1 + movzx Work0h, t0d + xor LookupM(1, t0), P0 + shr $16, Work0d + movzx Work0l, t0d + xor LookupM(2, t0), S3 + movzx Work0h, t0d + xor LookupM(3, t0), P2 + + // Process previous S2. + movd vS2, Work0d + movzx Work0l, t0d + xor LookupM(0, t0), S2 + movzx Work0h, t0d + xor LookupM(1, t0), P1 + shr $16, Work0d + movzx Work0l, t0d + xor LookupM(2, t0), S0 + movzx Work0h, t0d + xor LookupM(3, t0), P3 + + // Process previous S3. + movd vS3, Work0d + movzx Work0l, t0d + xor LookupM(0, t0), S3 + movzx Work0h, t0d + xor LookupM(1, t0), P2 + shr $16, Work0d + movzx Work0l, t0d + xor LookupM(2, t0), S1 + movzx Work0h, t0d + xor LookupM(3, t0), P0 + + #if defined __i386__ + paddd vIncrement, vExpandedKey + movp vExpandedKey, ExpandedKey + #else + add $Increment, ExpandedKey + #endif + + // Save state for next iteration and load next round key. + mov S0, Work0d + mov 0*4(ExpandedKey), S0 + movd S1, vS1 + mov 1*4(ExpandedKey), S1 + movd S2, vS2 + mov 2*4(ExpandedKey), S2 + movd S3, vS3 + mov 3*4(ExpandedKey), S3 + + cmp ExpandedKeyEnd, ExpandedKey + jne 1b // Repeat until ExpandedKey reaches the sentinel stored in ExpandedKeyEnd. + + /* Word 0 of the current state must be in Work0 now, and the next round + key must be in S0 to S3. + */ + + // Work around assembler bug. See comments above about Radar 5683882. + #if defined __x86_64__ + lea FTable(%rip), BaseP + #endif + + // Process previous S0 (final round, which uses FTable instead of MTable). + movzx Work0l, t0 + xor LookupF(0, t0), S0 + movzx Work0h, t0d + xor LookupF(1, t0), P3 + shr $16, Work0d + movzx Work0l, t0d + xor LookupF(2, t0), S2 + movzx Work0h, t0d + xor LookupF(3, t0), P1 + + // Process previous S1.
+ movd vS2, Work0d + movzx Work0l, t0d + xor LookupF(0, t0), S2 + movzx Work0h, t0d + xor LookupF(1, t0), P1 + shr $16, Work0d + movzx Work0l, t0d + xor LookupF(2, t0), S0 + movzx Work0h, t0d + xor LookupF(3, t0), P3 + + // Process previous S3. + movd vS3, Work0d + movzx Work0l, t0d + xor LookupF(0, t0), S3 + movzx Work0h, t0d + xor LookupF(1, t0), P2 + shr $16, Work0d + movzx Work0l, t0d + xor LookupF(2, t0), S1 + movzx Work0h, t0d + xor LookupF(3, t0), P0 + + #if defined __i386__ // Architecture. + // Get OutputText address. + #define OutputText BaseP + mov ArgOutputText, OutputText + #endif // Architecture. + + // Write output. + mov S0, 0*4(OutputText) + mov S1, 1*4(OutputText) + mov S2, 2*4(OutputText) + mov S3, 3*4(OutputText) + + xor r0, r0 // Return success. + +9: + // Pop stack and restore registers. The invalid-key-length path jumps here with r0 = -1. +#ifdef KERNEL +#if Select < 2 +#if defined __i386__ + movaps 4*16(r4), %xmm4 + movaps 3*16(r4), %xmm3 +#endif + movaps 2*16(r4), %xmm2 + movaps 1*16(r4), %xmm1 + movaps 0*16(r4), %xmm0 +#endif // Select +#endif // KERNEL + #if 0 < LocalsSize + add $Padding + LocalsSize, r4 + #endif + #if defined __i386__ + pop r7 + pop r6 + #elif defined __x86_64__ + #endif + pop r3 + pop r5 + + ret + + +#undef ArgExpandedKey +#undef ArgInputText +#undef ArgNr +#undef ArgOutputText +#undef Argument +#undef BaseP +#undef ExpandedKey +#undef ExpandedKeyEnd +#undef FTable +#undef InputText +#undef LocalsSize +#undef LookupM +#undef LookupF +#undef MTable +#undef OutputText +#undef Padding +#undef SaveSize +#undef S0 +#undef S1 +#undef S2 +#undef S3 +#undef StackFrame +#undef Work0d +#undef Work0h +#undef Work0l +#undef Work1d +#undef Work1h +#undef Work1l +#undef Work2d +#undef Work2h +#undef Work2l +#undef Work3d +#undef Work3h +#undef Work3l +#undef t0 +#undef t0d +#undef t0l +#undef t1 +#undef vExpandedKey +#undef vS1 +#undef vS2 +#undef vS3 + +#undef Name +#undef MTable +#undef FTable +#undef P0 +#undef P1 +#undef P2 +#undef P3 +#undef Increment diff --git
a/bsd/crypto/aes/i386/ExpandKeyForDecryption.s b/bsd/crypto/aes/i386/ExpandKeyForDecryption.s new file mode 100644 index 000000000..457508a9a --- /dev/null +++ b/bsd/crypto/aes/i386/ExpandKeyForDecryption.s @@ -0,0 +1,1214 @@ +/* This file defines _aes_decrypt_key, _aes_decrypt_key128, + _aes_decrypt_key192, and _aes_decrypt_key256. It is designed to be + included in another assembly file with the preprocessor #include directive, + to benefit from some assembly-time calculations. + + Written by Eric Postpischil, January 2008. + + The comments here do not say much about the algorithm; the code just + follows the FIPS-197 specification. I recommend reading the specification + before working with this code or examining the C code in the parent + directory that illustrates key expansion. + + One complication is that this routine both expands the key and applies + InvMixColumn to most of the words in the expanded key. This modifies the + key for use with the Equivalent Inverse Cipher. + + During key expansion, there are sequences of four or six words that are + produced like this: + + E[i+0] = E[i+0-Nk] ^ f(E[i-1]), where f is some function. + E[i+1] = E[i+1-Nk] ^ E[i+0]. + E[i+2] = E[i+2-Nk] ^ E[i+1]. + E[i+3] = E[i+3-Nk] ^ E[i+2]. + + When Nk is four or eight, the sequence stops there. When it is six, it + goes on for two more words. Let I be the InvMixColumn function. For the + Equivalent Inverse Cipher, we want to store I(E[i+0]), I(E[i+1]), + I(E[i+2]), I(E[i+3]) (and two more when Nk is six). However, we do not + need to calculate I four times. In AES' finite field, I is a linear + combination of the four bytes of its input. The ^ operation on the bits + that represent field elements is an addition in the Galois field. So + I(a ^ b) = I(a) ^ I(b). Then we have: + + I(E[i+0]) = I(E[i+0-Nk] ^ f(E[i-1])) = I(E[i+0-Nk]) ^ I(f(E[i-1])). + I(E[i+1]) = I(E[i+1-Nk]) ^ I(E[i+0]). + I(E[i+2]) = I(E[i+2-Nk]) ^ I(E[i+1]). + I(E[i+3]) = I(E[i+3-Nk]) ^ I(E[i+2]).
+ + To compute this, we compute I(f(E[i-1])) and XOR it with the previously + stored I(E[i+0-Nk]) to get I(E[i+0]). Then we XOR that with the previously + stored I(E[i+1-Nk]) to get I(E[i+1]), and so on. + + Note that to compute I(f(E[i-1])), we need to have E[i-1]. So we have to + compute the pre-InvMixColumn words of the expanded key; it is not + sufficient to have the post-InvMixColumn words. +*/ + + +/* Routine: + + _aes_decrypt_key. + + _aes_decrypt_key128, _aes_decrypt_key192, and _aes_decrypt_key256. + + Function: + + Expand the user's cipher key into the key schedule, as defined in + Federal Information Processing Standards Publication 197 (FIPS-197), + November 26, 2001. + + For decryption, the key is modified as shown in Figure 15 in FIPS-197, + to support the Equivalent Inverse Cipher. + + Input: + + Constant data: + + The following names must be locally defined so the assembler + can calculate certain offsets. + + static const Word _AESSubBytesWordTable[4][256]. + + _AESSubBytesWordTable[i][j] = SubBytes(j) << 8*i, where + SubBytes is defined in FIPS-197. _AESSubBytesWordTable + differs from _AESEncryptTable in that it does not include + the MixColumn operation. It is used in performing the last + round, which differs from the previous rounds in that it + does not include the MixColumn operation. + + static const Word _AESInvMixColumnTable[4][256]. + + _AESInvMixColumnTable[i][j] contains the contribution of byte + j to element i of the InvMixColumn operation. + + The four bytes of the word _AESInvMixColumnTable[0][j] are: + + {0xe}*{j}, {0x9}*{j}, {0xd}*{j}, {0xb}*{j}, + + listed in increasing address order, where multiplication is + performed in the Galois field. {j} designates the element of + the Galois field represented by j. _AESInvMixColumnTable[i][j] has + the same bytes, rotated right in the order shown above. + + static const Byte _AESRcon[]. + + Round constants, beginning with _AESRcon[1] for the first round + (_AESRcon[0] is padding.)
+ + Arguments: + + const uint8_t *Key + + Address of user's cipher key. + + int Length + + Number of bytes (16, 24, or 32) or bits (128, 192, or 256) in + user's cipher key. + + This argument is used with _aes_decrypt_key. It is not + present for the other routines. In those routines, Context + is the second argument. + + aes_decrypt_ctx *Context + + Structure to contain the expanded key beginning at offset + ContextKey and a four-byte "key length" beginning at offset + ContextKeyLength. The "key length" is the number of bytes from + the start of the first round key to the start of the last round + key. That is 16 less than the number of bytes in the entire + key. + + Output: + + The expanded key and the "key length" are written to *Context. + + Return: + + aes_rval // -1 if "key length" is invalid. 0 otherwise. +*/ +/* add AES HW detection and program branch if AES HW is detected cclee 3-12-10 */ + +#ifdef KERNEL +#include <i386/cpu_capabilities.h> +#else +#include <System/i386/cpu_capabilities.h> +#endif + +#define dr r0d // Dissection register. +#define drl r0l // Low 8 bits of dissection register. +#define drh r0h // Second-lowest 8 bits of dissection register. + +#define t0 r1 +#define t0d r1d // Low 32 bits of t0. + +#define STable r2 // Address of SubBytes table. Overlaps Nk. +#define ITable r3 // Address of InvMixColumn table. +#define offset Arch(r5, r11) // Address offset and loop sentinel. + +#define R r7 // Address of round constant. +#define K r7 // User key pointer. + // R and K overlap. + +#define E r6 // Expanded key pointer. + +#define ve0 %xmm0 +#define ve1 %xmm1 +#define ve2 %xmm2 +#define ve3 %xmm3 +#define ve4 %xmm4 +#define ve5 %xmm5 +#define vt1 %xmm6 +#define vt0 %xmm7 + +#define LookupS(table, index) (table)*TableSize(STable, index, 4) +#define LookupI(table, index) (table)*TableSize(ITable, index, 4) + + +/* InvMixColumn puts InvMixColumn(dr) into vt0. This is a non-standard + subroutine. It does not conform to the ABI.
It is an integral part of + the _aes_decrypt_key* routines and shares register use with them. +*/ +InvMixColumn: + movzx drl, t0 + movd LookupI(0, t0), vt0 // Look up byte 0 in table 0. + movzx drh, t0d + movd LookupI(1, t0), vt1 // Look up byte 1 in table 1. + pxor vt1, vt0 + shr $16, dr // Bring bytes 2 and 3 down; dr, t0, and vt1 are overwritten by this routine. + movzx drl, t0d + movd LookupI(2, t0), vt1 // Look up byte 2 in table 2. + pxor vt1, vt0 + movzx drh, t0d + movd LookupI(3, t0), vt1 // Look up byte 3 in table 3. + pxor vt1, vt0 + ret + + + // SubWordRotWord adds (XORs) SubWord(RotWord(dr)) to vt0. It overwrites dr, t0, and vt1. + .macro SubWordRotWord + movzx drl, t0 + movd LookupS(3, t0), vt1 // Look up byte 0 in table 3. + pxor vt1, vt0 + movzx drh, t0d + movd LookupS(0, t0), vt1 // Look up byte 1 in table 0. + pxor vt1, vt0 + shr $$16, dr + movzx drl, t0d + movd LookupS(1, t0), vt1 // Look up byte 2 in table 1. + pxor vt1, vt0 + movzx drh, t0d + movd LookupS(2, t0), vt1 // Look up byte 3 in table 2. + pxor vt1, vt0 + .endmacro + + + // SubWord puts SubWord(dr) into vt0. It overwrites dr, t0, and vt1. + .macro SubWord + movzx drl, t0 + movd LookupS(0, t0), vt0 // Look up byte 0 in table 0. + movzx drh, t0d + movd LookupS(1, t0), vt1 // Look up byte 1 in table 1. + pxor vt1,vt0 + shr $$16, dr + movzx drl, t0d + movd LookupS(2, t0), vt1 // Look up byte 2 in table 2. + pxor vt1,vt0 + movzx drh, t0d + movd LookupS(3, t0), vt1 // Look up byte 3 in table 3.
+ pxor vt1,vt0 + .endmacro + + .text + .globl _aes_decrypt_key +// .private_extern _aes_decrypt_key +_aes_decrypt_key: + + // detect AES HW, cclee 3-13-10 +#if defined __x86_64__ + movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities + mov (%rax), %eax // %eax = __cpu_capabilities +#else +#if defined KERNEL + leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities + mov (%eax), %eax // %eax = __cpu_capabilities +#else + mov _COMM_PAGE_CPU_CAPABILITIES, %eax // %eax = word at _COMM_PAGE_CPU_CAPABILITIES +#endif + +#endif + test $(kHasAES), %eax // __cpu_capabilities & kHasAES + jne _aes_decrypt_key_hw // if AES HW detected, branch to _aes_decrypt_key_hw + /* Save registers and set SaveSize to the number of bytes pushed onto the + stack so far, including the caller's return address. + */ + push r3 + #if defined __i386__ + push r5 + push r6 + push r7 + #define SaveSize (5*4) // Return address, r3, r5, r6, r7. + #else + #define SaveSize (2*8) // Return address, r3. + #endif + + /* Number of bytes used for local variables: + + 8 16-byte spaces to save XMM registers. + + 8 four-byte spaces for work. + */ + #define LocalsSize (8*16 + 8*4) + + // Define stack offset to storage space for local data. + #define Local (8*16) + + #if 0 < LocalsSize + // Padding to position stack pointer at a multiple of 16 bytes. + #define Padding (15 & -(SaveSize + LocalsSize)) + sub $Padding + LocalsSize, r4 // Allocate space on stack. + #else + #define Padding 0 + #endif + + /* StackFrame is the number of bytes in our stack frame, from caller's + stack pointer to ours (so it includes the return address). + */ + #define StackFrame (SaveSize + Padding + LocalsSize) + + // Save xmm registers. + movaps %xmm0, 0*16(r4) + movaps %xmm1, 1*16(r4) + movaps %xmm2, 2*16(r4) + movaps %xmm3, 3*16(r4) + movaps %xmm4, 4*16(r4) + movaps %xmm5, 5*16(r4) + movaps %xmm6, 6*16(r4) + movaps %xmm7, 7*16(r4) + +#if defined __i386__ + + // Define location of argument i. + #define Argument(i) StackFrame+4*(i)(r4) + + #define Nk t0d + + // Load arguments.
+ mov Argument(2), E + mov Argument(1), Nk + mov Argument(0), K + +#elif defined __x86_64__ + + #define Nk r9d // Key length argument (bytes or bits; normalized to bits below). + mov r6d, Nk // Move Nk argument out of way. + mov r2, E // Move E argument to common register. + +#endif + + // Dispatch on key length. + cmp $128, Nk + jge 2f // Values of 128 or more are taken to be in bits already. + shl $3, Nk // Convert from bytes to bits. + cmp $128, Nk +2: + je DKeyHas4Words + cmp $192, Nk + je DKeyHas6Words + cmp $256, Nk + je DKeyHas8Words + mov $-1, r0 // Return error. + jmp 9f // Leave through the exit sequence at the end of the 128-bit expansion. + + + .globl _aes_decrypt_key128 +// .private_extern _aes_decrypt_key128 +_aes_decrypt_key128: + + /* Save registers and set SaveSize to the number of bytes pushed onto the + stack so far, including the caller's return address. + */ + push r3 + #if defined __i386__ + push r5 + push r6 + push r7 + #define SaveSize (5*4) // Return address, r3, r5, r6, r7. + #else + #define SaveSize (2*8) // Return address, r3. + #endif + + /* Number of bytes used for local variables: + + 8 16-byte spaces to save XMM registers. + + 8 four-byte spaces for work. + */ + #define LocalsSize (8*16 + 8*4) + + // Define stack offset to storage space for local data. + #define Local (8*16) + + #if 0 < LocalsSize + // Padding to position stack pointer at a multiple of 16 bytes. + #define Padding (15 & -(SaveSize + LocalsSize)) + sub $Padding + LocalsSize, r4 // Allocate space on stack. + #else + #define Padding 0 + #endif + + /* StackFrame is the number of bytes in our stack frame, from caller's + stack pointer to ours (so it includes the return address). + */ + #define StackFrame (SaveSize + Padding + LocalsSize) + + // Save xmm registers. + movaps %xmm0, 0*16(r4) + movaps %xmm1, 1*16(r4) + movaps %xmm2, 2*16(r4) + movaps %xmm3, 3*16(r4) + movaps %xmm4, 4*16(r4) + movaps %xmm5, 5*16(r4) + movaps %xmm6, 6*16(r4) + movaps %xmm7, 7*16(r4) + +#if defined __i386__ + + // Load arguments. + #define Argument(i) StackFrame+4*(i)(r4) + mov Argument(1), E + mov Argument(0), K + +#endif + +// Merge point for _aes_decrypt_key and _aes_decrypt_key128.
+DKeyHas4Words: + + // First words of expanded key are copied from user key. + movd 0*4(K), ve0 + movd 1*4(K), ve1 + movd 2*4(K), ve2 + movd 3*4(K), ve3 + + movl $10*16, ContextKeyLength(E) // Set "key length." + + #if 0 != ContextKey + add $ContextKey, E + #endif + + // K cannot be used after we write to R, since they use the same register. + + #if defined __i386__ + + lea _AESRcon, R + lea _AESInvMixColumnTable, ITable + lea _AESSubBytesWordTable, STable + + #elif defined __x86_64__ + + lea _AESRcon(%rip), R + lea _AESInvMixColumnTable(%rip), ITable + lea _AESSubBytesWordTable(%rip), STable + + #endif + + /* With a four-word key, there are ten rounds (eleven 16-byte key blocks), + nine of which have InvMixColumn applied. + */ + mov $-9*4*4, offset + sub offset, E // Bias E upward so that (E, offset) addresses the key block being written. + + // Store initial words of expanded key, which are copies of user's key. + movd ve0, 0*4(E, offset) + movd ve1, 1*4(E, offset) + movd ve2, 2*4(E, offset) + movd ve3, 3*4(E, offset) + +/* Here is the first iteration of the key expansion. It is separate from the + main loop below because we need to apply InvMixColumn to each of the + outputs, in ve0 through ve3. In the main loop, the technique described at + the top of this file is used to compute the proper outputs while using + InvMixColumn only once. +*/ + add $1, R // Advance pointer. + movd ve3, dr // Put previous word into work register. + movzx (R), t0d // Get round constant. + movd t0d, vt0 // vt0 starts as the round constant; SubWordRotWord XORs into it. + + SubWordRotWord + pxor vt0, ve0 + + // Chain to successive words. + pxor ve0, ve1 + pxor ve1, ve2 + pxor ve2, ve3 + + add $4*4, offset + + /* Apply InvMixColumn to each word. The transformed values are stored in + the expanded key. The original values are retained in registers for + further computation.
+ */ + movd ve0, dr + call InvMixColumn + movd vt0, 0*4(E, offset) + + movd ve1, dr + call InvMixColumn + movd vt0, 1*4(E, offset) + + movd ve2, dr + call InvMixColumn + movd vt0, 2*4(E, offset) + + movd ve3, dr + call InvMixColumn + movd vt0, 3*4(E, offset) + +// Here is the main loop. +1: + add $1, R // Advance pointer. + movd ve3, dr // Put previous word into work register. + movzx (R), t0d // Get round constant. + movd t0d, vt0 + + SubWordRotWord + pxor vt0, ve0 + + // Chain to successive words. + pxor ve0, ve1 + pxor ve1, ve2 + pxor ve2, ve3 + /* Dr. Brian Gladman uses a technique with a single XOR here instead + of the previous four. There is some periodic behavior in the key + expansion, and Gladman maintains E[4*i+3] for the latest four + values of i. XORing the value in vt0 with one of these yields its + replacement. However, using this technique requires additional + instructions before the loop (to initialize the values) and after + it (to extract the final values to be stored) and either some way + to rotate or index four values in the loop or a four-fold unrolling + of the loop to provide the indexing. Experiment suggests the + former is not worthwhile. Unrolling the loop might give a small + gain, at the cost of increased use of instruction cache, increased + instruction loads the first time the routine is executed, and + increased code complexity, so I decided against it. + */ + + // Apply InvMixColumn to the difference. + movd vt0, dr + call InvMixColumn + + add $4*4, offset + + // Chain the transformed difference to previously transformed outputs. + movd (0-4)*4(E, offset), vt1 + pxor vt1, vt0 + movd vt0, 0*4(E, offset) + + movd (1-4)*4(E, offset), vt1 + pxor vt1, vt0 + movd vt0, 1*4(E, offset) + + movd (2-4)*4(E, offset), vt1 + pxor vt1, vt0 + movd vt0, 2*4(E, offset) + + movd (3-4)*4(E, offset), vt1 + pxor vt1, vt0 + movd vt0, 3*4(E, offset) + + jl 1b // Loop while offset is negative; the flags come from the add to offset above. + +// Here is the final iteration, which does not perform InvMixColumn.
+ + movd ve3, dr // Put previous word into work register. + movzx 1(R), t0d // Get next round constant (R itself is not advanced here). + movd t0d, vt0 + + SubWordRotWord + pxor vt0, ve0 + + // Chain to successive words. + movd ve0, 4*4(E, offset) + pxor ve0, ve1 + movd ve1, 5*4(E, offset) + pxor ve1, ve2 + movd ve2, 6*4(E, offset) + pxor ve2, ve3 + movd ve3, 7*4(E, offset) + + xor r0, r0 // Return success. + +9: + // Pop stack and restore registers. The error path of _aes_decrypt_key also exits here, with r0 = -1. + movaps 7*16(r4), %xmm7 + movaps 6*16(r4), %xmm6 + movaps 5*16(r4), %xmm5 + movaps 4*16(r4), %xmm4 + movaps 3*16(r4), %xmm3 + movaps 2*16(r4), %xmm2 + movaps 1*16(r4), %xmm1 + movaps 0*16(r4), %xmm0 + #if 0 < LocalsSize + add $Padding + LocalsSize, r4 + #endif + #if defined __i386__ + pop r7 + pop r6 + pop r5 + #endif + pop r3 + + ret + + + .globl _aes_decrypt_key192 +// .private_extern _aes_decrypt_key192 +_aes_decrypt_key192: + + /* Save registers and set SaveSize to the number of bytes pushed onto the + stack so far, including the caller's return address. + */ + push r3 + #if defined __i386__ + push r5 + push r6 + push r7 + #define SaveSize (5*4) // Return address, r3, r5, r6, r7. + #else + #define SaveSize (2*8) // Return address, r3. + #endif + + /* Number of bytes used for local variables: + + 8 16-byte spaces to save XMM registers. + + 8 four-byte spaces for work. + */ + #define LocalsSize (8*16 + 8*4) + + // Define stack offset to storage space for local data. + #define Local (8*16) + + #if 0 < LocalsSize + // Padding to position stack pointer at a multiple of 16 bytes. + #define Padding (15 & -(SaveSize + LocalsSize)) + sub $Padding + LocalsSize, r4 // Allocate space on stack. + #else + #define Padding 0 + #endif + + /* StackFrame is the number of bytes in our stack frame, from caller's + stack pointer to ours (so it includes the return address). + */ + #define StackFrame (SaveSize + Padding + LocalsSize) + + // Save xmm registers.
+ movaps %xmm0, 0*16(r4) + movaps %xmm1, 1*16(r4) + movaps %xmm2, 2*16(r4) + movaps %xmm3, 3*16(r4) + movaps %xmm4, 4*16(r4) + movaps %xmm5, 5*16(r4) + movaps %xmm6, 6*16(r4) + movaps %xmm7, 7*16(r4) + +#if defined __i386__ + + // Load arguments. + #define Argument(i) StackFrame+4*(i)(r4) + mov Argument(1), E + mov Argument(0), K + +#endif + +// Merge point for _aes_decrypt_key and _aes_decrypt_key192. +DKeyHas6Words: + + // First words of expanded key are copied from user key. + movd 0*4(K), ve0 + movd 1*4(K), ve1 + movd 2*4(K), ve2 + movd 3*4(K), ve3 + + movl $12*16, ContextKeyLength(E) // Set "key length." + + #if 0 != ContextKey + add $ContextKey, E + #endif + + movd 4*4(K), ve4 + movd 5*4(K), ve5 + + // K cannot be used after we write to R, since they use the same register. + + #if defined __i386__ + + lea _AESRcon, R + lea _AESInvMixColumnTable, ITable + lea _AESSubBytesWordTable, STable + + #elif defined __x86_64__ + + lea _AESRcon(%rip), R + lea _AESInvMixColumnTable(%rip), ITable + lea _AESSubBytesWordTable(%rip), STable + + #endif + + /* With a six-word key, there are twelve rounds (thirteen 16-byte key + blocks), eleven of which have InvMixColumn applied. The key expansion + proceeds in iterations of six four-byte words, so the termination + condition is a bit complicated. We set offset to the negative of 10 + four four-byte words, and the loop branch does another iteration if + offset is less than or equal to zero, meaning the number of iterations + performed so far is less than or equal to 10. Thus, after ten + iterations, it branches again. After the eleventh iteration, it + stops. Code after the end of the loop computes the twelfth key block, + which does not have InvMixColumn applied. + */ + mov $-10*4*4, offset + sub offset, E + + // Store initial words of expanded key, which are copies of user's key. 
+ movd ve0, 0*4(E, offset) + movd ve1, 1*4(E, offset) + movd ve2, 2*4(E, offset) + movd ve3, 3*4(E, offset) + + /* The first four words are stored untransformed. After that, words in + the expanded key are transformed by InvMixColumn. + */ + movd ve4, dr + call InvMixColumn + movd vt0, 4*4(E, offset) + + movd ve5, dr + call InvMixColumn + movd vt0, 5*4(E, offset) + +/* Here is the first iteration of the key expansion. It is separate from the + main loop below because we need to apply InvMixColumn to each of the + outputs, in ve0 through ve5. In the main loop, the technique described at + the top of this file is used to compute the proper outputs while using + InvMixColumn only once. +*/ + add $1, R // Advance pointer. + movd ve5, dr // Put previous word into work register. + movzx (R), t0d // Get round constant. + movd t0d, vt0 + + SubWordRotWord + pxor vt0, ve0 + + // Chain to successive words. + pxor ve0, ve1 + pxor ve1, ve2 + pxor ve2, ve3 + pxor ve3, ve4 + pxor ve4, ve5 + + add $6*4, offset + + /* Apply InvMixColumn to each word. The transformed values are stored in + the expanded key. The original values are retained in registers for + further computation. + */ + movd ve0, dr + call InvMixColumn + movd vt0, 0*4(E, offset) + + movd ve1, dr + call InvMixColumn + movd vt0, 1*4(E, offset) + + movd ve2, dr + call InvMixColumn + movd vt0, 2*4(E, offset) + + movd ve3, dr + call InvMixColumn + movd vt0, 3*4(E, offset) + + movd (4-6)*4(E, offset), vt1 + pxor vt1, vt0 + movd vt0, 4*4(E, offset) + + movd (5-6)*4(E, offset), vt1 + pxor vt1, vt0 + movd vt0, 5*4(E, offset) + +// Here is the main loop. +1: + add $1, R // Advance pointer. + movd ve5, dr // Put previous word into work register. + movzx (R), t0d // Get round constant. + movd t0d, vt0 + + SubWordRotWord + pxor vt0, ve0 + + // Chain to successive words. + pxor ve0, ve1 + pxor ve1, ve2 + pxor ve2, ve3 + pxor ve3, ve4 + pxor ve4, ve5 + + // Apply InvMixColumn to the difference. 
+ movd vt0, dr + call InvMixColumn + + add $6*4, offset + + // Chain the transformed difference to previously transformed outputs. + movd (0-6)*4(E, offset), vt1 + pxor vt1, vt0 + movd vt0, 0*4(E, offset) + + movd (1-6)*4(E, offset), vt1 + pxor vt1, vt0 + movd vt0, 1*4(E, offset) + + movd (2-6)*4(E, offset), vt1 + pxor vt1, vt0 + movd vt0, 2*4(E, offset) + + movd (3-6)*4(E, offset), vt1 + pxor vt1, vt0 + movd vt0, 3*4(E, offset) + + movd (4-6)*4(E, offset), vt1 + pxor vt1, vt0 + movd vt0, 4*4(E, offset) + + movd (5-6)*4(E, offset), vt1 + pxor vt1, vt0 + movd vt0, 5*4(E, offset) + + jle 1b + +// Here is the final iteration, which does not perform InvMixColumn. + + movd ve5, dr // Put previous word into work register. + movzx 1(R), t0d // Get round constant. + movd t0d, vt0 + + SubWordRotWord + pxor vt0, ve0 + + // Chain to successive words. + movd ve0, 6*4(E, offset) + pxor ve0, ve1 + movd ve1, 7*4(E, offset) + pxor ve1, ve2 + movd ve2, 8*4(E, offset) + pxor ve2, ve3 + movd ve3, 9*4(E, offset) + + xor r0, r0 // Return success. + + // Pop stack and restore registers. + movaps 7*16(r4), %xmm7 + movaps 6*16(r4), %xmm6 + movaps 5*16(r4), %xmm5 + movaps 4*16(r4), %xmm4 + movaps 3*16(r4), %xmm3 + movaps 2*16(r4), %xmm2 + movaps 1*16(r4), %xmm1 + movaps 0*16(r4), %xmm0 + #if 0 < LocalsSize + add $Padding + LocalsSize, r4 + #endif + #if defined __i386__ + pop r7 + pop r6 + pop r5 + #endif + pop r3 + + ret + + +/* _aes_decrypt_key256 expands an eight-word (256-bit) user key. On i386, the + key pointer is loaded from Argument(0) and the context pointer from + Argument(1). Execution then reaches DKeyHas8Words, the merge point shared + with _aes_decrypt_key. +*/ + .globl _aes_decrypt_key256 +// .private_extern _aes_decrypt_key256 +_aes_decrypt_key256: + + /* Save registers and set SaveSize to the number of bytes pushed onto the + stack so far, including the caller's return address. + */ + push r3 + #if defined __i386__ + push r5 + push r6 + push r7 + #define SaveSize (5*4) + #else + #define SaveSize (2*8) + #endif + + /* Number of bytes used for local variables: + + 8 16-byte spaces to save XMM registers. + + 8 four-byte spaces for work. 
+ */ + #define LocalsSize (8*16 + 8*4) + + // Define stack offset to storage space for local data. + #define Local (8*16) + + #if 0 < LocalsSize + // Padding to position stack pointer at a multiple of 16 bytes. + #define Padding (15 & -(SaveSize + LocalsSize)) + sub $Padding + LocalsSize, r4 // Allocate space on stack. + #else + #define Padding 0 + #endif + + /* StackFrame is the number of bytes in our stack frame, from caller's + stack pointer to ours (so it includes the return address). + */ + #define StackFrame (SaveSize + Padding + LocalsSize) + + // Save xmm registers. + movaps %xmm0, 0*16(r4) + movaps %xmm1, 1*16(r4) + movaps %xmm2, 2*16(r4) + movaps %xmm3, 3*16(r4) + movaps %xmm4, 4*16(r4) + movaps %xmm5, 5*16(r4) + movaps %xmm6, 6*16(r4) + movaps %xmm7, 7*16(r4) + +#if defined __i386__ + + // Load arguments. + #define Argument(i) StackFrame+4*(i)(r4) + mov Argument(1), E + mov Argument(0), K + +#endif + +// Merge point for _aes_decrypt_key and _aes_decrypt_key256. +DKeyHas8Words: + + // First words of expanded key are copied from user key. + movd 0*4(K), ve0 + movd 1*4(K), ve1 + movd 2*4(K), ve2 + movd 3*4(K), ve3 + + movl $14*16, ContextKeyLength(E) // Set "key length." + + #if 0 != ContextKey + add $ContextKey, E + #endif + + // Store initial words of expanded key, which are copies of user's key. + movd ve0, 0*4(E) + movd ve1, 1*4(E) + movd ve2, 2*4(E) + movd ve3, 3*4(E) + movd 4*4(K), ve0 + movd 5*4(K), ve1 + movd 6*4(K), ve2 + movd 7*4(K), ve3 + + // K cannot be used after we write to R, since they use the same register. + + #if defined __i386__ + + lea _AESRcon, R + lea _AESInvMixColumnTable, ITable + lea _AESSubBytesWordTable, STable + + #elif defined __x86_64__ + + lea _AESRcon(%rip), R + lea _AESInvMixColumnTable(%rip), ITable + lea _AESSubBytesWordTable(%rip), STable + + #endif + + /* With an eight-word key, there are fourteen rounds (fifteen 16-byte key + blocks), thirteen of which have InvMixColumn applied. 
+ */ + mov $-12*4*4, offset + sub offset, E + + // Save untransformed values in stack area. + movd ve0, 4*4+Local(r4) + movd ve1, 5*4+Local(r4) + movd ve2, 6*4+Local(r4) + movd ve3, 7*4+Local(r4) + + /* Apply InvMixColumn to words 4 through 7. The transformed values are + stored in the expanded key. The original values are saved in the stack + area for further computation. + */ + movd ve0, dr + call InvMixColumn + movd vt0, 4*4(E, offset) + + movd ve1, dr + call InvMixColumn + movd vt0, 5*4(E, offset) + + movd ve2, dr + call InvMixColumn + movd vt0, 6*4(E, offset) + + movd ve3, dr + call InvMixColumn + movd vt0, 7*4(E, offset) + +/* Here is the first iteration of the key expansion. It is separate from the + main loop below because we need to apply InvMixColumn to each of the + outputs, in ve0 through ve3. In the main loop, the technique described at + the top of this file is used to compute the proper outputs while using + InvMixColumn only once. +*/ + add $1, R // Advance pointer. + movd ve3, dr // Put previous word into work register. + movzx (R), t0d // Get round constant. + movd t0d, vt0 + + SubWordRotWord + + add $8*4, offset + + movd (0-8)*4(E, offset), ve0 // Get old word. + pxor vt0, ve0 + movd ve0, 0*4+Local(r4) // Save on stack. + movd ve0, dr + call InvMixColumn + movd vt0, 0*4(E, offset) // Write to expanded key. + + /* Chain to successive words and apply InvMixColumn to each word. The + transformed values are stored in the expanded key. The original + values are retained in local data for further computation. + */ + movd (1-8)*4(E, offset), ve1 // Get old word. + pxor ve0, ve1 // Chain. + movd ve1, 1*4+Local(r4) // Save on stack. + movd ve1, dr + call InvMixColumn + movd vt0, 1*4(E, offset) // Write to expanded key. + + movd (2-8)*4(E, offset), ve2 // Get old word. + pxor ve1, ve2 // Chain. + movd ve2, 2*4+Local(r4) // Save on stack. + movd ve2, dr + call InvMixColumn + movd vt0, 2*4(E, offset) // Write to expanded key. 
+ + movd (3-8)*4(E, offset), ve3 // Get old word. + pxor ve2, ve3 // Chain. + movd ve3, 3*4+Local(r4) // Save on stack. + movd ve3, dr + call InvMixColumn + movd vt0, 3*4(E, offset) // Write to expanded key. + + movd ve3, dr // Put previous word into work register. + SubWord + + movd 4*4+Local(r4), ve0 // Get old word. + pxor vt0, ve0 // Chain. + movd ve0, 4*4+Local(r4) // Save on stack. + + movd 5*4+Local(r4), ve1 // Get old word. + pxor ve0, ve1 // Chain. + movd ve1, 5*4+Local(r4) // Save on stack. + + movd 6*4+Local(r4), ve2 // Get old word. + pxor ve1, ve2 // Chain. + movd ve2, 6*4+Local(r4) // Save on stack. + + movd 7*4+Local(r4), ve3 // Get old word. + pxor ve2, ve3 // Chain. + movd ve3, 7*4+Local(r4) // Save on stack. + + movd vt0, dr // Move change to work register. + call InvMixColumn + + movd (4-8)*4(E, offset), vt1 // Get old word. + pxor vt1, vt0 // Chain. + movd vt0, 4*4(E, offset) // Write new word to expanded key. + + movd (5-8)*4(E, offset), vt1 // Get old word. + pxor vt1, vt0 // Chain. + movd vt0, 5*4(E, offset) // Write new word to expanded key. + + movd (6-8)*4(E, offset), vt1 // Get old word. + pxor vt1, vt0 // Chain. + movd vt0, 6*4(E, offset) // Write new word to expanded key. + + movd (7-8)*4(E, offset), vt1 // Get old word. + pxor vt1, vt0 // Chain. + movd vt0, 7*4(E, offset) // Write new word to expanded key. + +// Here is the main loop. +1: + add $1, R // Advance pointer. + movd ve3, dr // Put previous word into work register. + movzx (R), t0d // Get round constant. + movd t0d, vt0 + + SubWordRotWord + + movd 0*4+Local(r4), ve0 // Get old word. + pxor vt0, ve0 + movd ve0, 0*4+Local(r4) // Save on stack. + + // Chain to successive words. + movd 1*4+Local(r4), ve1 // Get old word. + pxor ve0, ve1 // Chain. + movd ve1, 1*4+Local(r4) // Save on stack. + + movd 2*4+Local(r4), ve2 // Get old word. + pxor ve1, ve2 // Chain. + movd ve2, 2*4+Local(r4) // Save on stack. + + movd 3*4+Local(r4), ve3 // Get old word. + pxor ve2, ve3 // Chain. 
+ movd ve3, 3*4+Local(r4) // Save on stack. + + movd vt0, dr // Move change to work register. + call InvMixColumn + + movd 0*4(E, offset), vt1 // Get old word. + pxor vt1, vt0 // Chain. + movd vt0, (0+8)*4(E, offset) // Write new word to expanded key. + + movd 1*4(E, offset), vt1 // Get old word. + pxor vt1, vt0 // Chain. + movd vt0, (1+8)*4(E, offset) // Write new word to expanded key. + + movd 2*4(E, offset), vt1 // Get old word. + pxor vt1, vt0 // Chain. + movd vt0, (2+8)*4(E, offset) // Write new word to expanded key. + + movd 3*4(E, offset), vt1 // Get old word. + pxor vt1, vt0 // Chain. + movd vt0, (3+8)*4(E, offset) // Write new word to expanded key. + + movd ve3, dr // Put previous word into work register. + SubWord + + movd 4*4+Local(r4), ve0 // Get old word. + pxor vt0, ve0 // Chain. + movd ve0, 4*4+Local(r4) // Save on stack. + + movd 5*4+Local(r4), ve1 // Get old word. + pxor ve0, ve1 // Chain. + movd ve1, 5*4+Local(r4) // Save on stack. + + movd 6*4+Local(r4), ve2 // Get old word. + pxor ve1, ve2 // Chain. + movd ve2, 6*4+Local(r4) // Save on stack. + + movd 7*4+Local(r4), ve3 // Get old word. + pxor ve2, ve3 // Chain. + movd ve3, 7*4+Local(r4) // Save on stack. + + movd vt0, dr // Move change to work register. + call InvMixColumn + + movd 4*4(E, offset), vt1 // Get old word. + pxor vt1, vt0 // Chain. + movd vt0, (4+8)*4(E, offset) // Write new word to expanded key. + + movd 5*4(E, offset), vt1 // Get old word. + pxor vt1, vt0 // Chain. + movd vt0, (5+8)*4(E, offset) // Write new word to expanded key. + + movd 6*4(E, offset), vt1 // Get old word. + pxor vt1, vt0 // Chain. + movd vt0, (6+8)*4(E, offset) // Write new word to expanded key. + + movd 7*4(E, offset), vt1 // Get old word. + pxor vt1, vt0 // Chain. + movd vt0, (7+8)*4(E, offset) // Write new word to expanded key. + + add $8*4, offset + + jl 1b // Loop again while the updated offset is negative. + +// Here is the final iteration, which stores its words without InvMixColumn. 
+ + movd ve3, dr // Put previous word into work register. + movzx 1(R), t0d // Get round constant. 
+ movd t0d, vt0 + + SubWordRotWord + + movd 0*4+Local(r4), ve0 // Get old word. + pxor vt0, ve0 // Chain. + movd ve0, (0+8)*4(E, offset) + + // Chain to successive words. + movd 1*4+Local(r4), ve1 // Get old word. + pxor ve0, ve1 // Chain. + movd ve1, (1+8)*4(E, offset) + + movd 2*4+Local(r4), ve2 // Get old word. + pxor ve1, ve2 // Chain. + movd ve2, (2+8)*4(E, offset) + + movd 3*4+Local(r4), ve3 // Get old word. + pxor ve2, ve3 // Chain. + movd ve3, (3+8)*4(E, offset) + + xor r0, r0 // Return success. + + // Pop stack and restore registers. + movaps 7*16(r4), %xmm7 + movaps 6*16(r4), %xmm6 + movaps 5*16(r4), %xmm5 + movaps 4*16(r4), %xmm4 + movaps 3*16(r4), %xmm3 + movaps 2*16(r4), %xmm2 + movaps 1*16(r4), %xmm1 + movaps 0*16(r4), %xmm0 + #if 0 < LocalsSize + add $Padding + LocalsSize, r4 + #endif + #if defined __i386__ + pop r7 + pop r6 + pop r5 + #endif + pop r3 + + ret + + +#undef Address +#undef Argument +#undef E +#undef ITable +#undef K +#undef Local +#undef LocalsSize +#undef LookupI +#undef LookupS +#undef Nk +#undef Padding +#undef R +#undef SaveSize +#undef STable +#undef StackFrame +#undef dr +#undef drh +#undef drl +#undef offset +#undef t0 +#undef t0d +#undef ve0 +#undef ve1 +#undef ve2 +#undef ve3 +#undef ve4 +#undef ve5 +#undef vt0 +#undef vt1 diff --git a/bsd/crypto/aes/i386/ExpandKeyForEncryption.s b/bsd/crypto/aes/i386/ExpandKeyForEncryption.s new file mode 100644 index 000000000..1ce3c9553 --- /dev/null +++ b/bsd/crypto/aes/i386/ExpandKeyForEncryption.s @@ -0,0 +1,801 @@ +/* This file defines _aes_encrypt_key, _aes_encrypt_key128, + _aes_encrypt_key192, and _aes_encrypt_key256. It is designed to be + included in another assembly file with the preprocessor #include directive, + to benefit from some assembly-time calculations. + + Written by Eric Postpischil, January 2008. + + The comments here do not say much about the algorithm; the code just + follows the FIPS-197 specification. 
I recommend reading the specification + before working with this code or examining the C code in the parent + directory that illustrates key expansion. +*/ + + +/* Routines: + + _aes_encrypt_key. + + _aes_encrypt_key128, _aes_encrypt_key192, and _aes_encrypt_key256. + + Function: + + Expand the user's cipher key into the key schedule, as defined in + Federal Information Processing Standards Publication 197 (FIPS-197), + November 26, 2001. + + Input: + + Constant data: + + The following names must be locally defined so the assembler + can calculate certain offsets. + + static const Word _AESSubBytesWordTable[4][256]. + + _AESSubBytesWordTable[i][j] = SubBytes(j) << 8*i, where + SubBytes is defined in FIPS-197. _AESSubBytesWordTable + differs from _AESEncryptTable in that it does not include + the MixColumn operation. It is used in performing the last + round, which differs from the previous rounds in that it + does not include the MixColumn operation. + + static const Byte _AESRcon[]. + + Round constants, beginning with AESRcon[1] for the first round + (AESRcon[0] is padding.) + + Arguments: + + const uint8_t *Key + + Address of user's cipher key. + + int Length + + Number of bytes (16, 24, or 32) or bits (128, 192, or 256) in + user's cipher key. + + This argument is used with _aes_encrypt_key. It is not + present for the other routines. In those routines, Context + is the second argument. + + aes_encrypt_ctx *Context + + Structure to contain the expanded key beginning at offset + ContextKey and a four-byte "key length" beginning at offset + ContextKeyLength. The "key length" is the number of bytes from + the start of the first round key to the start of the last round + key. That is 16 less than the number of bytes in the entire + key. + + Output: + + The expanded key and the "key length" are written to *Context. + + Return: + + aes_rval // -1 if "key length" is invalid. 0 otherwise. 
+*/ + +/* add AES HW detection and program branch if AES HW is detected cclee 3-12-10 */ +#ifdef KERNEL +#include <i386/cpu_capabilities.h> // NOTE(review): header name was missing from this copy; restored as the header expected to define kHasAES. Confirm against upstream. +#else +#include <System/i386/cpu_capabilities.h> // NOTE(review): restored likewise (kHasAES, _COMM_PAGE_CPU_CAPABILITIES). Confirm against upstream. +#endif + + .text + .globl _aes_encrypt_key +// .private_extern _aes_encrypt_key +_aes_encrypt_key: + + // detect AES HW, cclee-3-13-10 +#if defined __x86_64__ + movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities + mov (%rax), %eax // %eax = __cpu_capabilities +#else +#if defined KERNEL + leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities + mov (%eax), %eax // %eax = __cpu_capabilities +#else + mov _COMM_PAGE_CPU_CAPABILITIES, %eax +#endif +#endif + test $(kHasAES), %eax // __cpu_capabilities & kHasAES + jne _aes_encrypt_key_hw // if AES HW detected, branch to _aes_encrypt_key_hw + +#define dr r0d // Dissection register. +#define drl r0l // Low 8 bits of dissection register. +#define drh r0h // Second-lowest 8 bits of dissection register. + +#define t0 r1 +#define t0d r1d // Low 32 bits of t0. + +#define offset Arch(r5, r11) // Address offset and loop sentinel. + +#define R r7 // Address of round constant. +#define K r7 // User key pointer. + // R and K overlap. + +#define E r6 // Expanded key pointer. + +#define ve0 %xmm0 +#define ve1 %xmm1 +#define ve2 %xmm2 +#define ve3 %xmm3 +#define vt3 %xmm4 +#define vt2 %xmm5 +#define vt1 %xmm6 +#define vt0 %xmm7 + +#if defined __i386__ + #define LookupS(table, index) \ + _AESSubBytesWordTable+(table)*TableSize(, index, 4) +#elif defined __x86_64__ + #define LookupS(table, index) (table)*TableSize(STable, index, 4) +#endif + + /* Save registers and set SaveSize to the number of bytes pushed onto the + stack so far, including the caller's return address. + */ + push r3 + #if defined __i386__ + push r5 + push r6 + push r7 + #define SaveSize (5*4) + #else + #define SaveSize (2*8) + #endif + + /* Number of bytes used for local variables: + + 8 16-byte spaces to save XMM registers. 
+ */ + #define LocalsSize (8*16) + + #if 0 < LocalsSize + // Padding to position stack pointer at a multiple of 16 bytes. + #define Padding (15 & -(SaveSize + LocalsSize)) + sub $Padding + LocalsSize, r4 // Allocate space on stack. + #else + #define Padding 0 + #endif + + /* StackFrame is the number of bytes in our stack frame, from caller's + stack pointer to ours (so it includes the return address). + */ + #define StackFrame (SaveSize + Padding + LocalsSize) + + // Save xmm registers. + movaps %xmm0, 0*16(r4) + movaps %xmm1, 1*16(r4) + movaps %xmm2, 2*16(r4) + movaps %xmm3, 3*16(r4) + movaps %xmm4, 4*16(r4) + movaps %xmm5, 5*16(r4) + movaps %xmm6, 6*16(r4) + movaps %xmm7, 7*16(r4) + +#if defined __i386__ + + // Define location of argument i. + #define Argument(i) StackFrame+4*(i)(r4) + + #define Nk t0d + + // Load arguments. + mov Argument(2), E + mov Argument(1), Nk + mov Argument(0), K + +#elif defined __x86_64__ + + #define Nk r9d // Key length argument, in bytes or bits. + mov r6d, Nk // Move Nk argument out of way. + mov r2, E // Move E argument to common register. + +#endif + + // Dispatch on key length. Values below 128 are treated as byte counts. + cmp $128, Nk + jge 2f + shl $3, Nk // Convert from bytes to bits. + cmp $128, Nk +2: + je EKeyHas4Words + cmp $192, Nk + je EKeyHas6Words + cmp $256, Nk + je EKeyHas8Words + mov $-1, r0 // Return error. + jmp 9f + +// Stop using Nk. +#undef Nk + + .globl _aes_encrypt_key128 +// .private_extern _aes_encrypt_key128 +_aes_encrypt_key128: + + /* Save registers and set SaveSize to the number of bytes pushed onto the + stack so far, including the caller's return address. + */ + push r3 + #if defined __i386__ + push r5 + push r6 + push r7 + #define SaveSize (5*4) + #else + #define SaveSize (2*8) + #endif + + /* Number of bytes used for local variables: + + 8 16-byte spaces to save XMM registers. + */ + #define LocalsSize (8*16) + + #if 0 < LocalsSize + // Padding to position stack pointer at a multiple of 16 bytes. 
+ #define Padding (15 & -(SaveSize + LocalsSize)) + sub $Padding + LocalsSize, r4 // Allocate space on stack. + #else + #define Padding 0 + #endif + + /* StackFrame is the number of bytes in our stack frame, from caller's + stack pointer to ours (so it includes the return address). + */ + #define StackFrame (SaveSize + Padding + LocalsSize) + + // Save xmm registers. + movaps %xmm0, 0*16(r4) + movaps %xmm1, 1*16(r4) + movaps %xmm2, 2*16(r4) + movaps %xmm3, 3*16(r4) + movaps %xmm4, 4*16(r4) + movaps %xmm5, 5*16(r4) + movaps %xmm6, 6*16(r4) + movaps %xmm7, 7*16(r4) + + #if defined __i386__ + + // Load arguments. + #define Argument(i) StackFrame+4*(i)(r4) + mov Argument(1), E + mov Argument(0), K + + #endif + +// Merge point for _aes_encrypt_key and _aes_encrypt_key128. +EKeyHas4Words: + +#define e0 r2d +#define e1 r3d +#define e2 Arch(r5d, r11d) +#define e3 r7d + + // First words of expanded key are copied from user key. + mov 0*4(K), e0 + mov 1*4(K), e1 + mov 2*4(K), e2 + mov 3*4(K), e3 + + movl $10*16, ContextKeyLength(E) // Set "key length." + + #if 0 != ContextKey + add $ContextKey, E + #endif + + // K cannot be used after we write to R, since they use the same register. + + // Cache round constants in output buffer. The last is a sentinel. + movb $0x01, 1*16(E) + movb $0x02, 2*16(E) + movb $0x04, 3*16(E) + movb $0x08, 4*16(E) + movb $0x10, 5*16(E) + movb $0x20, 6*16(E) + movb $0x40, 7*16(E) + movb $0x80, 8*16(E) + movb $0x1b, 9*16(E) + movb $0x36, 10*16(E) + + #if defined __x86_64__ + + #define STable r8 + lea _AESSubBytesWordTable(%rip), STable + + #endif + + // Store initial words of expanded key, which are copies of user's key. + mov e0, 0*4(E) + mov e1, 1*4(E) + mov e2, 2*4(E) + mov e3, 3*4(E) + +1: + mov e3, dr // Put previous word into dissection register. + + // Perform SubWord(RotWord(dr)). + movzx drl, t0 + xor LookupS(3, t0), e0 // Look up byte 0 in table 3. + movzx drh, t0d + xor LookupS(0, t0), e0 // Look up byte 1 in table 0. 
+ shr $16, dr + movzx drl, t0d + xor LookupS(1, t0), e0 // Look up byte 2 in table 1. + movzx drh, t0d + xor LookupS(2, t0), e0 // Look up byte 3 in table 2. + + add $4*4, E + + movzx (E), t0d // Get cached round constant. + xor t0d, e0 // XOR with word from four words back. + + // Chain to successive words. + mov e0, 0*4(E) + xor e0, e1 + mov e1, 1*4(E) + xor e1, e2 + mov e2, 2*4(E) + xor e2, e3 + mov e3, 3*4(E) + + cmp $0x36, t0d // Was this the last round constant? + + jne 1b + + xor r0, r0 // Return success. + +9: + // Pop stack and restore registers. + movaps 7*16(r4), %xmm7 + movaps 6*16(r4), %xmm6 + movaps 5*16(r4), %xmm5 + movaps 4*16(r4), %xmm4 + movaps 3*16(r4), %xmm3 + movaps 2*16(r4), %xmm2 + movaps 1*16(r4), %xmm1 + movaps 0*16(r4), %xmm0 + #if 0 < LocalsSize + add $Padding + LocalsSize, r4 + #endif + #if defined __i386__ + pop r7 + pop r6 + pop r5 + #endif + pop r3 + + ret + + +// Reset definitions for next case. +#undef e0 +#undef e1 +#undef e2 +#undef e3 + +#undef vt3 +#undef vt2 +#define ve4 %xmm4 +#define ve5 %xmm5 + + + .globl _aes_encrypt_key192 +// .private_extern _aes_encrypt_key192 +_aes_encrypt_key192: + + /* Save registers and set SaveSize to the number of bytes pushed onto the + stack so far, including the caller's return address. + */ + push r3 + #if defined __i386__ + push r5 + push r6 + push r7 + #define SaveSize (5*4) + #else + #define SaveSize (2*8) + #endif + + /* Number of bytes used for local variables: + + 8 16-byte spaces to save XMM registers. + */ + #define LocalsSize (8*16) + + #if 0 < LocalsSize + // Padding to position stack pointer at a multiple of 16 bytes. + #define Padding (15 & -(SaveSize + LocalsSize)) + sub $Padding + LocalsSize, r4 // Allocate space on stack. + #else + #define Padding 0 + #endif + + /* StackFrame is the number of bytes in our stack frame, from caller's + stack pointer to ours (so it includes the return address). + */ + #define StackFrame (SaveSize + Padding + LocalsSize) + + // Save xmm registers. 
+ movaps %xmm0, 0*16(r4) + movaps %xmm1, 1*16(r4) + movaps %xmm2, 2*16(r4) + movaps %xmm3, 3*16(r4) + movaps %xmm4, 4*16(r4) + movaps %xmm5, 5*16(r4) + movaps %xmm6, 6*16(r4) + movaps %xmm7, 7*16(r4) + + #if defined __i386__ + + // Load arguments. + #define Argument(i) StackFrame+4*(i)(r4) + mov Argument(1), E + mov Argument(0), K + + #endif + +// Merge point for _aes_encrypt_key and _aes_encrypt_key192. +EKeyHas6Words: + + // First words of expanded key are copied from user key. + movd 0*4(K), ve0 + movd 1*4(K), ve1 + movd 2*4(K), ve2 + movd 3*4(K), ve3 + + movl $12*16, ContextKeyLength(E) // Set "key length." + + #if 0 != ContextKey + add $ContextKey, E + #endif + + movd 4*4(K), ve4 + movd 5*4(K), ve5 + + // K cannot be used after we write to R, since they use the same register. + + #if defined __i386__ + + lea _AESRcon, R + + #elif defined __x86_64__ + + lea _AESRcon(%rip), R + lea _AESSubBytesWordTable(%rip), STable + + #endif + + /* With a six-word key, there are twelve rounds (thirteen 16-byte key + blocks). + */ + mov $-12*4*4, offset + sub offset, E + + // Store initial words of expanded key, which are copies of user's key. + movd ve0, 0*4(E, offset) + movd ve1, 1*4(E, offset) + movd ve2, 2*4(E, offset) + movd ve3, 3*4(E, offset) + movd ve4, 4*4(E, offset) + movd ve5, 5*4(E, offset) + +/* Jump into loop body. The key expansion processes six four-byte words per + iteration. 52 are needed in the key. So only four are needed in the last + iteration. +*/ + jmp 2f +1: + // Continue chaining to successive words. + pxor ve3, ve4 + movd ve4, 4*4(E, offset) + pxor ve4, ve5 + movd ve5, 5*4(E, offset) +2: + add $1, R // Advance pointer. + movd ve5, dr // Put previous word into dissection register. + movzx (R), t0 // Get round constant. + movd t0d, vt1 + pxor vt1, ve0 // XOR with word from six words back. + + // Perform SubWord(RotWord(dr)). + movzx drl, t0d + movd LookupS(3, t0), vt0 // Look up byte 0 in table 3. 
+ movzx drh, t0d + movd LookupS(0, t0), vt1 // Look up byte 1 in table 0. + shr $16, dr + movzx drl, t0d + pxor vt1, vt0 + pxor vt0, ve0 + movd LookupS(1, t0), vt0 // Look up byte 2 in table 1. + movzx drh, t0d + movd LookupS(2, t0), vt1 // Look up byte 3 in table 2. + pxor vt1, vt0 + pxor vt0, ve0 + + add $6*4, offset + + // Chain to successive words. + movd ve0, 0*4(E, offset) + pxor ve0, ve1 + movd ve1, 1*4(E, offset) + pxor ve1, ve2 + movd ve2, 2*4(E, offset) + pxor ve2, ve3 + movd ve3, 3*4(E, offset) + + jne 1b // Loop until the add to offset above produced zero; movd and pxor leave the flags alone. + + xor r0, r0 // Return success. + + // Pop stack and restore registers. + movaps 7*16(r4), %xmm7 + movaps 6*16(r4), %xmm6 + movaps 5*16(r4), %xmm5 + movaps 4*16(r4), %xmm4 + movaps 3*16(r4), %xmm3 + movaps 2*16(r4), %xmm2 + movaps 1*16(r4), %xmm1 + movaps 0*16(r4), %xmm0 + #if 0 < LocalsSize + add $Padding + LocalsSize, r4 + #endif + #if defined __i386__ + pop r7 + pop r6 + pop r5 + #endif + pop r3 + + ret + + +// Reset definitions for next case. +#undef ve4 +#undef ve5 +#define vt3 %xmm4 +#define vt2 %xmm5 + + + .globl _aes_encrypt_key256 +// .private_extern _aes_encrypt_key256 +_aes_encrypt_key256: + + /* Save registers and set SaveSize to the number of bytes pushed onto the + stack so far, including the caller's return address. + */ + push r3 + #if defined __i386__ + push r5 + push r6 + push r7 + #define SaveSize (5*4) + #else + #define SaveSize (2*8) + #endif + + /* Number of bytes used for local variables: + + 8 16-byte spaces to save XMM registers. + */ + #define LocalsSize (8*16) + + #if 0 < LocalsSize + // Padding to position stack pointer at a multiple of 16 bytes. + #define Padding (15 & -(SaveSize + LocalsSize)) + sub $Padding + LocalsSize, r4 // Allocate space on stack. + #else + #define Padding 0 + #endif + + /* StackFrame is the number of bytes in our stack frame, from caller's + stack pointer to ours (so it includes the return address). + */ + #define StackFrame (SaveSize + Padding + LocalsSize) + + // Save xmm registers. 
+ movaps %xmm0, 0*16(r4) + movaps %xmm1, 1*16(r4) + movaps %xmm2, 2*16(r4) + movaps %xmm3, 3*16(r4) + movaps %xmm4, 4*16(r4) + movaps %xmm5, 5*16(r4) + movaps %xmm6, 6*16(r4) + movaps %xmm7, 7*16(r4) + + #if defined __i386__ + + // Load arguments. + #define Argument(i) StackFrame+4*(i)(r4) + mov Argument(1), E + mov Argument(0), K + + #endif + +// Merge point for _aes_encrypt_key and _aes_encrypt_key256. +EKeyHas8Words: + + // First words of expanded key are copied from user key. + movd 0*4(K), ve0 + movd 1*4(K), ve1 + movd 2*4(K), ve2 + movd 3*4(K), ve3 + + movl $14*16, ContextKeyLength(E) // Set "key length." + + #if 0 != ContextKey + add $ContextKey, E + #endif + + // Store initial words of expanded key, which are copies of user's key. + movd ve0, 0*4(E) + movd ve1, 1*4(E) + movd ve2, 2*4(E) + movd ve3, 3*4(E) + movd 4*4(K), ve0 + movd 5*4(K), ve1 + movd 6*4(K), ve2 + movd 7*4(K), ve3 + + // K cannot be used after we write to R, since they use the same register. + + #if defined __i386__ + + lea _AESRcon, R + + #elif defined __x86_64__ + + lea _AESRcon(%rip), R + lea _AESSubBytesWordTable(%rip), STable + + #endif + + /* With an eight-word key, there are fourteen rounds (fifteen 16-byte key + blocks). + */ + mov $-14*4*4, offset + sub offset, E + + // Store initial words of expanded key, which are copies of user's key. + movd ve0, 4*4(E, offset) + movd ve1, 5*4(E, offset) + movd ve2, 6*4(E, offset) + movd ve3, 7*4(E, offset) + +/* Jump into loop body. The key expansion processes eight four-byte words per + iteration. 60 are needed in the key. So only four are needed in the last + iteration. +*/ + jmp 2f +1: + movd ve3, dr // Put previous word into dissection register. + + /* Get word from eight words back (it is four words back from where E + currently points, and we use it to prepare the value to be stored + four words beyond where E currently points). + */ + movd -4*4(E, offset), ve0 + + // Perform SubWord(dr). 
+ movzx drl, t0 + movd LookupS(0, t0), vt0 // Look up byte 0 in table 0. + movzx drh, t0d + movd LookupS(1, t0), vt1 // Look up byte 1 in table 1. + shr $16, dr + movzx drl, t0d + movd LookupS(2, t0), vt2 // Look up byte 2 in table 2. + movzx drh, t0d + movd LookupS(3, t0), vt3 // Look up byte 3 in table 3. + pxor vt1, vt0 + pxor vt3, vt2 + pxor vt0, ve0 + pxor vt2, ve0 + + movd -3*4(E, offset), ve1 // Get words from eight words back. + movd -2*4(E, offset), ve2 + movd -1*4(E, offset), ve3 + + // Chain to successive words. + movd ve0, 4*4(E, offset) + pxor ve0, ve1 + movd ve1, 5*4(E, offset) + pxor ve1, ve2 + movd ve2, 6*4(E, offset) + pxor ve2, ve3 + movd ve3, 7*4(E, offset) + +2: + add $1, R // Advance pointer. + movd ve3, dr // Put previous word into dissection register. + movzx (R), t0d // Get round constant. + movd t0d, vt1 + movd 0*4(E, offset), ve0 // Get word from eight words back. + pxor vt1, ve0 + + // Perform SubWord(RotWord(dr)). + movzx drl, t0 + movd LookupS(3, t0), vt0 // Look up byte 0 in table 3. + movzx drh, t0d + movd LookupS(0, t0), vt1 // Look up byte 1 in table 0. + shr $16, dr + movzx drl, t0d + movd LookupS(1, t0), vt2 // Look up byte 2 in table 1. + movzx drh, t0d + movd LookupS(2, t0), vt3 // Look up byte 3 in table 2. + pxor vt1, vt0 + pxor vt3, vt2 + pxor vt0, ve0 + pxor vt2, ve0 + + movd 1*4(E, offset), ve1 + movd 2*4(E, offset), ve2 + movd 3*4(E, offset), ve3 + + add $8*4, offset + + // Chain to successive words. + movd ve0, 0*4(E, offset) + pxor ve0, ve1 + movd ve1, 1*4(E, offset) + pxor ve1, ve2 + movd ve2, 2*4(E, offset) + pxor ve2, ve3 + movd ve3, 3*4(E, offset) + + jne 1b // Loop until the add to offset above produced zero; movd and pxor leave the flags alone. + + xor r0, r0 // Return success. + + // Pop stack and restore registers. 
+ movaps 7*16(r4), %xmm7 + movaps 6*16(r4), %xmm6 + movaps 5*16(r4), %xmm5 + movaps 4*16(r4), %xmm4 + movaps 3*16(r4), %xmm3 + movaps 2*16(r4), %xmm2 + movaps 1*16(r4), %xmm1 + movaps 0*16(r4), %xmm0 + #if 0 < LocalsSize + add $Padding + LocalsSize, r4 + #endif + #if defined __i386__ + pop r7 + pop r6 + pop r5 + #endif + pop r3 + + ret + + +#undef Address +#undef Argument +#undef E +#undef K +#undef LocalsSize +#undef LookupS +#undef Padding +#undef R +#undef SaveSize +#undef STable +#undef StackFrame +#undef dr +#undef drh +#undef drl +#undef offset +#undef t0 +#undef t0d +#undef ve0 +#undef ve1 +#undef ve2 +#undef ve3 +#undef vt0 +#undef vt1 +#undef vt2 +#undef vt3 diff --git a/bsd/crypto/aes/i386/MakeData.c b/bsd/crypto/aes/i386/MakeData.c new file mode 100644 index 000000000..262dc5996 --- /dev/null +++ b/bsd/crypto/aes/i386/MakeData.c @@ -0,0 +1,516 @@ +#include <inttypes.h> // NOTE(review): the four header names were missing from this copy and are restored from the identifiers used; confirm against upstream. +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#define MaxRcon 11 + +typedef uint8_t Byte; +typedef uint32_t Word; + + +/* In comments below, {n} designates the Galois field element represented by + the byte n. See notes about Galois field multiplication in ReadMe.txt. + + So 3+5 is addition of ordinary integers, and 3+5 == 8, while {3}+{5} is + addition in the field, and {3} + {5} = {3 XOR 5} = {6}. +*/ + + +// Define constants for languages. +typedef enum { C, IntelAssembly } Language; + + +/* LogBase3[i] will contain the base-three logarithm of i in the 256-element + Galois field defined by AES. That is, {3}**LogBase3[i] == {i}. +*/ +static Byte LogBase3[256]; + +/* AntilogBase3[i] will contain {3}**i in the 256-element Galois field defined + by AES. It contains extra elements so that the antilog of a+b can be found + by looking up a+b directly, without having to reduce modulo the period, for + 0 <= a, b < 255. + + (254 is the greatest value we encounter. Each a or b we use is the + base-three logarithm of some element. 
As a primitive root, the powers of + three cycle through all non-zero elements of the field, of which there are + 255, so the exponents cover 0 to 254 before the powers repeat.) +*/ +static Byte AntilogBase3[254+254+1]; + + +static void InitializeLogTables(void) +{ + // log({1}) is zero, so start {p} (power) at {1} and l (logarithm) at 0. + Byte p = 1; + int l = 0; + do + { + // Record table entries. + LogBase3[p] = l; + AntilogBase3[l] = p; + + /* Observe that {2}*{p} is {p << 1 ^ (a & 0x80 ? 0x1b : 0)}, per notes + in ReadMe.txt. We produce {3}*{p}: + + {3}*{p} + = {1}*{p} + {2}*{p} + = {1}*{p} + {p << 1 ^ (a & 0x80 ? 0x1b : 0)} + = {p ^ p << 1 ^ (p & 0x80 ? 0x1b : 0)}. + */ + p ^= p << 1 ^ (p & 0x80 ? 0x1b : 0); + ++l; + + } while (p != 1); // Stop when we have gone around completely. + + /* The antilogarithms are periodic with a period of 255, and we want to + look up elements as high as 254+254 (the largest that a sum of two + logarithms could be), so we replicate the table beyond the first + period. + */ + for (l = 255; l < 254+254; ++l) + AntilogBase3[l] = AntilogBase3[l-255]; +} + + +/* MultiplyByte(Byte b, Byte c) returns {b}*{c}. It requires tables that must + be initialized before this routine is used. +*/ +static Byte MultiplyByte(Byte b, Byte c) +{ + // Calculate product by adding logarithms, but avoid logarithms of zero. + return b == 0 || c == 0 ? 0 : AntilogBase3[LogBase3[b] + LogBase3[c]]; +} + + +// Return {0} if {b} is {0} and the multiplicative inverse of {b} otherwise. +static Byte InverseByte(Byte b) +{ + return b == 0 ? 0 : AntilogBase3[255 - LogBase3[b]]; +} + + +// Perform AES' SubBytes operation on a single byte. +static Byte SubByte(Byte b) +{ + unsigned int r = InverseByte(b); + + // Duplicate r as a proxy for a rotate operation. + r = r | r<<8; + + // Apply the standard's affine transformation. + return r ^ r>>4 ^ r>>5 ^ r>>6 ^ r>>7 ^ 0x63; +} + + +// Define and populate tables for the SubBytes and InvSubBytes operations. 
+static Byte SubBytesTable[256]; +static Byte InvSubBytesTable[256]; + + +static void InitializeSubBytesTable(void) +{ + for (int i = 0; i < 256; ++i) + SubBytesTable[i] = SubByte((Byte) i); +} + + +static void InitializeInvSubBytesTable(void) +{ + for (int i = 0; i < 256; ++i) + InvSubBytesTable[SubByte((Byte) i)] = i; +} + + +/* Print tables for SubBytes function providing the output byte embedded in + various places in a word, so that the table entries can be used with + fewer byte manipulations. +*/ +static void PrintSubBytesWordTable(Language language) +{ + switch (language) + { + case C: + printf("\n\n" + "// SubBytes embedded in words tables.\n" + "const Word AESSubBytesWordTable[4][256] =\n" + "{\n"); + for (int j = 0; j < 4; ++j) + { + printf("\t{\n"); + for (int i = 0; i < 256; ++i) + printf("\t\t0x%08x,\n", SubBytesTable[i] << j*8); + printf("\t},\n"); + } + printf("};\n"); + break; + + case IntelAssembly: + printf("\n\n" + "// SubBytes embedded in words tables.\n" + "\t.globl\t_AESSubBytesWordTable\n" + "\t.private_extern\t_AESSubBytesWordTable\n" + "\t.align\t2\n" + "_AESSubBytesWordTable:\n"); + for (int j = 0; j < 4; ++j) + { + printf("\t// Table %d.\n", j); + for (int i = 0; i < 256; ++i) + printf("\t.long\t0x%08x\n", SubBytesTable[i] << j*8); + } + break; + } +} + + +/* Print tables for InvSubBytes function providing the output byte embedded in + various places in a word, so that the table entries can be used with + fewer byte manipulations. 
+*/ +static void PrintInvSubBytesWordTable(Language language) +{ + switch (language) + { + case C: + printf("\n\n" + "// InvSubBytes embedded in words tables.\n" + "const Word AESInvSubBytesWordTable[4][256] =\n" + "{\n"); + for (int j = 0; j < 4; ++j) + { + printf("\t{\n"); + for (int i = 0; i < 256; ++i) + printf("\t\t0x%08x,\n", InvSubBytesTable[i] << j*8); + printf("\t},\n"); + } + printf("};\n"); + break; + + case IntelAssembly: + printf("\n\n" + "// InvSubBytes embedded in words tables.\n" + "\t.globl\t_AESInvSubBytesWordTable\n" + "\t.private_extern\t_AESInvSubBytesWordTable\n" + "\t.align\t2\n" + "_AESInvSubBytesWordTable:\n"); + for (int j = 0; j < 4; ++j) + { + printf("\t// Table %d.\n", j); + for (int i = 0; i < 256; ++i) + printf("\t.long\t0x%08x\n", InvSubBytesTable[i] << j*8); + } + break; + } +} + + +// Print the round constants. +static void PrintRcon(Language language) +{ + union { Byte c[4]; Word w; } t = { { 1, 0, 0, 0 } }; + + switch (language) + { + case C: + printf("\n\n" + "// Round constants.\n" + "const Byte AESRcon[] =\n" + "{\n" + "\t0,\t// Not used, included for indexing simplicity.\n"); + for (int i = 1; i < MaxRcon; ++i) + { + printf("\t0x%02x,\n", t.w); + t.c[0] = MultiplyByte(0x2, t.c[0]); + } + printf("};\n"); + break; + + case IntelAssembly: + printf("\n\n" + "// Round constants.\n" + "\t.globl\t_AESRcon\n" + "\t.private_extern\t_AESRcon\n" + "_AESRcon:\n" + "\t.byte\t0\t// Not used, included for indexing simplicity.\n"); + for (int i = 1; i < MaxRcon; ++i) + { + printf("\t.byte\t0x%02x\n", t.w); + t.c[0] = MultiplyByte(0x2, t.c[0]); + } + break; + } +} + + +// Print tables for the InvMixColumn operation. 
+static void PrintInvMixColumnTable(Language language) +{ + Word T[4][256]; + + for (int i = 0; i < 256; ++i) + { + union { Byte b[4]; Word w; } c; + + Byte s9 = MultiplyByte(0x9, i); + Byte sb = MultiplyByte(0xb, i); + Byte sd = MultiplyByte(0xd, i); + Byte se = MultiplyByte(0xe, i); + + c.b[0] = se; + c.b[1] = s9; + c.b[2] = sd; + c.b[3] = sb; + T[0][i] = c.w; + + c.b[0] = sb; + c.b[1] = se; + c.b[2] = s9; + c.b[3] = sd; + T[1][i] = c.w; + + c.b[0] = sd; + c.b[1] = sb; + c.b[2] = se; + c.b[3] = s9; + T[2][i] = c.w; + + c.b[0] = s9; + c.b[1] = sd; + c.b[2] = sb; + c.b[3] = se; + T[3][i] = c.w; + } + + switch (language) + { + case C: + printf("\n\n" + "// Tables for InvMixColumn.\n" + "const Word AESInvMixColumnTable[4][256] =\n" + "{\n"); + for (int i = 0; i < 4; ++i) + { + printf("\t{\n"); + for (int j = 0; j < 256; ++j) + printf("\t\t0x%08x,\n", T[i][j]); + printf("\t},\n"); + } + printf("};\n"); + break; + + case IntelAssembly: + printf("\n\n" + "// Tables for InvMixColumn.\n" + "\t.globl\t_AESInvMixColumnTable\n" + "\t.private_extern\t_AESInvMixColumnTable\n" + "\t.align\t2\n" + "_AESInvMixColumnTable:\n"); + for (int i = 0; i < 4; ++i) + { + printf("\t// Table %d.\n", i); + for (int j = 0; j < 256; ++j) + printf("\t.long\t0x%08x\n", T[i][j]); + } + break; + } +} + + +/* Print the tables defined AES Proposal: Rijndael, amended, 9/04/2003, + section 5.2.1. These combine the MixColumn and SubBytes operations. 
+*/ +static void PrintEncryptTable(Language language) +{ + Word T[4][256]; + + for (int i = 0; i < 256; ++i) + { + union { Byte b[4]; Word w; } c; + + Byte s1 = SubBytesTable[i]; + Byte s2 = MultiplyByte(0x2, s1); + Byte s3 = s1 ^ s2; + + c.b[0] = s2; + c.b[1] = s1; + c.b[2] = s1; + c.b[3] = s3; + T[0][i] = c.w; + + c.b[0] = s3; + c.b[1] = s2; + //c.b[2] = s1; + c.b[3] = s1; + T[1][i] = c.w; + + c.b[0] = s1; + c.b[1] = s3; + c.b[2] = s2; + //c.b[3] = s1; + T[2][i] = c.w; + + //c.b[0] = s1; + c.b[1] = s1; + c.b[2] = s3; + c.b[3] = s2; + T[3][i] = c.w; + } + + switch (language) + { + case C: + printf("\n\n" + "// Tables for main encryption iterations.\n" + "const Word AESEncryptTable[4][256] =\n" + "{\n"); + for (int i = 0; i < 4; ++i) + { + printf("\t{\n"); + for (int j = 0; j < 256; ++j) + printf("\t\t0x%08x,\n", T[i][j]); + printf("\t},\n"); + } + printf("};\n"); + break; + + case IntelAssembly: + printf("\n\n" + "// Tables for main encryption iterations.\n" + "\t.globl\t_AESEncryptTable\n" + "\t.private_extern\t_AESEncryptTable\n" + "\t.align\t2\n" + "_AESEncryptTable:\n"); + for (int i = 0; i < 4; ++i) + { + printf("\t// Table %d.\n", i); + for (int j = 0; j < 256; ++j) + printf("\t.long\t0x%08x\n", T[i][j]); + } + break; + } +} + + +/* Print the inverse tables. These correspond to the tables above, but for + decyrption. These combine the InvSubBytes and InvMixColumn operations. 
+*/ +static void PrintDecryptTable(Language language) +{ + Word T[4][256]; + + for (int i = 0; i < 256; ++i) + { + union { Byte b[4]; Word w; } c; + + Byte si = InvSubBytesTable[i]; + + Byte s9 = MultiplyByte(0x9, si); + Byte sb = MultiplyByte(0xb, si); + Byte sd = MultiplyByte(0xd, si); + Byte se = MultiplyByte(0xe, si); + + c.b[0] = se; + c.b[1] = s9; + c.b[2] = sd; + c.b[3] = sb; + T[0][i] = c.w; + + c.b[0] = sb; + c.b[1] = se; + c.b[2] = s9; + c.b[3] = sd; + T[1][i] = c.w; + + c.b[0] = sd; + c.b[1] = sb; + c.b[2] = se; + c.b[3] = s9; + T[2][i] = c.w; + + c.b[0] = s9; + c.b[1] = sd; + c.b[2] = sb; + c.b[3] = se; + T[3][i] = c.w; + } + + switch (language) + { + case C: + printf("\n\n" + "// Tables for main decryption iterations.\n" + "const Word AESDecryptTable[4][256] =\n" + "{\n"); + for (int i = 0; i < 4; ++i) + { + printf("\t{\n"); + for (int j = 0; j < 256; ++j) + printf("\t\t0x%08x,\n", T[i][j]); + printf("\t},\n"); + } + printf("};\n"); + break; + + case IntelAssembly: + printf("\n\n" + "// Tables for main decryption iterations.\n" + "\t.globl\t_AESDecryptTable\n" + "\t.private_extern\t_AESDecryptTable\n" + "\t.align\t2\n" + "_AESDecryptTable:\n"); + for (int i = 0; i < 4; ++i) + { + printf("\t// Table %d.\n", i); + for (int j = 0; j < 256; ++j) + printf("\t.long\t0x%08x\n", T[i][j]); + } + break; + } +} + + +static void Usage(const char *ProgramName) +{ + fprintf(stderr, + "%s: This program must have exactly one argument, \"C\" to generate\n" + "C or \"Intel\" to generate GCC i386/x86_64 assembly.\n", ProgramName); + exit(EXIT_FAILURE); +} + + +int main(int argc, char *argv[]) +{ + if (argc != 2) + Usage(argv[0]); + + Language language; + + // Figure out which language to generate, C or Intel assembly. 
+ if (0 == strcmp(argv[1], "C")) + language = C; + else if (0 == strcmp(argv[1], "Intel")) + language = IntelAssembly; + else + Usage(argv[0]); + + printf("// This file was generated by " __FILE__ ".\n"); + + if (language == C) + printf("\n\n#include \"AES.h\"\n"); + + if (language == IntelAssembly) + printf("\n\n\t.const\n"); + + InitializeLogTables(); + InitializeSubBytesTable(); + InitializeInvSubBytesTable(); + + PrintRcon(language); + PrintInvMixColumnTable(language); + PrintEncryptTable(language); + PrintDecryptTable(language); + PrintSubBytesWordTable(language); + PrintInvSubBytesWordTable(language); + + return 0; +} diff --git a/bsd/crypto/aes/i386/Makefile b/bsd/crypto/aes/i386/Makefile index f116db347..851f7b2ac 100644 --- a/bsd/crypto/aes/i386/Makefile +++ b/bsd/crypto/aes/i386/Makefile @@ -7,28 +7,26 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir include $(MakeInc_cmd) include $(MakeInc_def) -INSTINC_SUBDIRS = \ +include $(MakeInc_cmd) +include $(MakeInc_def) -INSTINC_SUBDIRS_PPC = \ +INSTINC_SUBDIRS = \ INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ -PRIVATE_DATAFILES = \ - aesopt.h edefs.h - INSTALL_MI_DIR = crypto EXPORT_MI_DIR = ${INSTALL_MI_DIR} -INSTALL_KF_MI_LIST = +PRIVATE_DATAFILES = \ + aesxts.h -INSTALL_KF_MI_LCL_LIST = ${PRIVATE_DATAFILES} +# /System/Library/Frameworks/Kernel.framework/PrivateHeaders +INSTALL_KF_MD_LCL_LIST = ${PRIVATE_DATAFILES} include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/bsd/crypto/aes/i386/ReadMe.txt b/bsd/crypto/aes/i386/ReadMe.txt new file mode 100644 index 000000000..7ac833117 --- /dev/null +++ b/bsd/crypto/aes/i386/ReadMe.txt @@ -0,0 +1,22 @@ +This directory contains a hybrid AES implementation. 
The core AES routines +(the actual encryption, decryption, and key expansion) are in: + + AES.s + Data.mk + Data.s + EncryptDecrypt.s + ExpandKeyForDecryption.s + ExpandKeyForEncryption.s + MakeData.c + +Although the above files do not explicitly include aes.h, they conform to +certain things defined in it, notably the aes_rval type and the layout of the +aes_encrypt_ctx and aes_decrypt_ctx structures. These must be kept +compatible; the definitions of ContextKey and ContextKeyLength in AES.s must +match the offsets of the key ("ks") and key_length ("inf") members of +aes_encrypt_ctx and aes_decrypt_ctx. (For some reason, aes_inf is a union that +is written as a 32-bit integer and read as an 8-bit integer. I do not know +why but have reproduced that behavior in the new implementation.) + +aes_modes.c extends the API, most notably by implementing CBC mode using the +basic AES block encryption. It uses aesopt.h and edefs.h. diff --git a/bsd/crypto/aes/i386/aes_crypt_hw.s b/bsd/crypto/aes/i386/aes_crypt_hw.s new file mode 100644 index 000000000..2edc3e2fd --- /dev/null +++ b/bsd/crypto/aes/i386/aes_crypt_hw.s @@ -0,0 +1,472 @@ +/* This file defines _aes_encrypt_hw and _aes_decrypt_hw --- Intel Westmere HW AES-based implementation + of _aes_encrypt and _aes_decrypt. + + These 2 functions SHOULD BE entered ONLY after the AES HW is verified to be available. + They SHOULD NOT be called without AES HW detection. It might cause xnu to crash. + + The AES HW is detected 1st thing in + _aes_encrypt (EncryptDecrypt.s) + _aes_decrypt (EncryptDecrypt.s) + and, if AES HW is detected, branch without link (ie, jump) to the functions here. + + The implementation here follows the examples in an Intel White Paper + "Intel Advanced Encryption Standard (AES) Instruction Set" Rev.2 01 + + Note: Rev. 03 Final 2010 01 26 is available.
Looks like some code change from Rev.2 01 + + cclee 3-13-10 +*/ + + .text + .align 4,0x90 +.globl _aes_encrypt_hw +_aes_encrypt_hw: + +#if defined __i386__ + movl 4(%esp), %eax // in + movl 12(%esp), %edx // ctx + movl 8(%esp), %ecx // out + + #define LOCAL_SIZE (12+16+16) // 16-byte align (-4 for return address) + 16 (xmm0) + 16 (xmm1) + #define in %eax + #define ctx %edx + #define out %ecx + #define r13 %esp + +#else // x86_64 + + #define LOCAL_SIZE (8+16+16) // 16-byte align (-8 for return address) + 16 (xmm0) + 16 (xmm1) + #define in %rdi + #define ctx %rdx + #define out %rsi + #define r13 %rsp + +#endif // i386 or x86_64 + +#ifdef KERNEL + sub $LOCAL_SIZE, r13 + movaps %xmm0, (r13) +#endif + movups (in), %xmm0 + + // key length identification + movl 240(ctx), %eax // key length + cmp $160, %eax + je L_AES_128 + cmp $192, %eax + je L_AES_192 + cmp $224, %eax + je L_AES_256 + mov $-1, %eax // return ERROR +#ifdef KERNEL + movaps (r13), %xmm0 + add $LOCAL_SIZE, r13 +#endif + ret + +L_AES_128: + testb $15, %dl // check whether expanded key is 16-byte aligned + jne 0f // if not 16-byte aligned, aesenc xmm, m128 won't work + pxor (ctx), %xmm0 + aesenc 16(ctx), %xmm0 + aesenc 32(ctx), %xmm0 + aesenc 48(ctx), %xmm0 + aesenc 64(ctx), %xmm0 + aesenc 80(ctx), %xmm0 + aesenc 96(ctx), %xmm0 + aesenc 112(ctx), %xmm0 + aesenc 128(ctx), %xmm0 + aesenc 144(ctx), %xmm0 + aesenclast 160(ctx), %xmm0 + xorl %eax, %eax + movups %xmm0, (out) +#ifdef KERNEL + movaps (r13), %xmm0 + add $LOCAL_SIZE, r13 +#endif + ret +0: // special case expanded key is not 16-byte aligned +#ifdef KERNEL + movaps %xmm1, 16(r13) // save xmm1 into stack +#endif + movups (ctx), %xmm1 + pxor %xmm1, %xmm0 + movups 16(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 32(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 48(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 64(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 80(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 96(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 112(ctx), %xmm1 + 
aesenc %xmm1, %xmm0 + movups 128(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 144(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 160(ctx), %xmm1 + aesenclast %xmm1, %xmm0 + xorl %eax, %eax + movups %xmm0, (out) +#ifdef KERNEL + movaps (r13), %xmm0 + movaps 16(r13), %xmm1 + add $LOCAL_SIZE, r13 +#endif + ret + +L_AES_192: + testb $15, %dl // check whether expanded key is 16-byte aligned + jne 0f // if not 16-byte aligned, aesenc xmm, m128 won't work + pxor (ctx), %xmm0 + aesenc 16(ctx), %xmm0 + aesenc 32(ctx), %xmm0 + aesenc 48(ctx), %xmm0 + aesenc 64(ctx), %xmm0 + aesenc 80(ctx), %xmm0 + aesenc 96(ctx), %xmm0 + aesenc 112(ctx), %xmm0 + aesenc 128(ctx), %xmm0 + aesenc 144(ctx), %xmm0 + aesenc 160(ctx), %xmm0 + aesenc 176(ctx), %xmm0 + aesenclast 192(ctx), %xmm0 + xorl %eax, %eax + movups %xmm0, (out) +#ifdef KERNEL + movaps (r13), %xmm0 + add $LOCAL_SIZE, r13 +#endif + ret +0: // special case expanded key is not 16-byte aligned +#ifdef KERNEL + movaps %xmm1, 16(r13) // save xmm1 into stack +#endif + movups (ctx), %xmm1 + pxor %xmm1, %xmm0 + movups 16(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 32(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 48(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 64(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 80(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 96(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 112(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 128(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 144(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 160(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 176(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 192(ctx), %xmm1 + aesenclast %xmm1, %xmm0 + xorl %eax, %eax + movups %xmm0, (out) +#ifdef KERNEL + movaps (r13), %xmm0 + movaps 16(r13), %xmm1 + add $LOCAL_SIZE, r13 +#endif + ret + +L_AES_256: + testb $15, %dl // check whether expanded key is 16-byte aligned + jne 0f // if not 16-byte aligned, aesenc xmm, m128 won't work + pxor (ctx), %xmm0 + aesenc 16(ctx), %xmm0 + aesenc 32(ctx), %xmm0 + aesenc 48(ctx), %xmm0 + aesenc 
64(ctx), %xmm0 + aesenc 80(ctx), %xmm0 + aesenc 96(ctx), %xmm0 + aesenc 112(ctx), %xmm0 + aesenc 128(ctx), %xmm0 + aesenc 144(ctx), %xmm0 + aesenc 160(ctx), %xmm0 + aesenc 176(ctx), %xmm0 + aesenc 192(ctx), %xmm0 + aesenc 208(ctx), %xmm0 + aesenclast 224(ctx), %xmm0 + xorl %eax, %eax + movups %xmm0, (out) +#ifdef KERNEL + movaps (r13), %xmm0 + add $LOCAL_SIZE, r13 +#endif + ret +0: // special case expanded key is not 16-byte aligned +#ifdef KERNEL + movaps %xmm1, 16(r13) // save xmm1 into stack +#endif + movups (ctx), %xmm1 + pxor %xmm1, %xmm0 + movups 16(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 32(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 48(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 64(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 80(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 96(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 112(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 128(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 144(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 160(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 176(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 192(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 208(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 224(ctx), %xmm1 + aesenclast %xmm1, %xmm0 + xorl %eax, %eax + movups %xmm0, (out) +#ifdef KERNEL + movaps (r13), %xmm0 + movaps 16(r13), %xmm1 + add $LOCAL_SIZE, r13 +#endif + ret + + + .text + .align 4,0x90 +.globl _aes_decrypt_hw +_aes_decrypt_hw: + +#if defined __i386__ + movl 4(%esp), %eax // in + movl 12(%esp), %edx // ctx + movl 8(%esp), %ecx // out + +#endif + +#ifdef KERNEL + sub $LOCAL_SIZE, r13 + movaps %xmm0, (r13) +#endif + movups (in), %xmm0 + + // key length identification + movl 240(ctx), %eax // key length + cmp $160, %eax + je 0f // AES-128 + cmp $192, %eax + je 1f // AES-192 + cmp $224, %eax + je 2f // AES-256 + mov $-1, %eax // return ERROR +#ifdef KERNEL + movaps (r13), %xmm0 + add $LOCAL_SIZE, r13 +#endif + ret + +0: // AES-128 + testb $15, %dl // check whether expanded key is 16-byte aligned + jne 9f 
// if not 16-byte aligned, aesenc xmm, m128 won't work + pxor 160(ctx), %xmm0 + aesdec 144(ctx), %xmm0 + aesdec 128(ctx), %xmm0 + aesdec 112(ctx), %xmm0 + aesdec 96(ctx), %xmm0 + aesdec 80(ctx), %xmm0 + aesdec 64(ctx), %xmm0 + aesdec 48(ctx), %xmm0 + aesdec 32(ctx), %xmm0 + aesdec 16(ctx), %xmm0 + aesdeclast (ctx), %xmm0 + xorl %eax, %eax + movups %xmm0, (out) +#ifdef KERNEL + movaps (r13), %xmm0 + add $LOCAL_SIZE, r13 +#endif + ret +9: // AES-128 Decrypt : special case expanded key is not 16-byte aligned +#ifdef KERNEL + movaps %xmm1, 16(r13) // save xmm1 into stack +#endif + movups 160(ctx), %xmm1 + pxor %xmm1, %xmm0 + movups 144(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 128(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 112(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 96(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 80(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 64(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 48(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 32(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 16(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups (ctx), %xmm1 + aesdeclast %xmm1, %xmm0 + xorl %eax, %eax + movups %xmm0, (out) +#ifdef KERNEL + movaps (r13), %xmm0 + movaps 16(r13), %xmm1 + add $LOCAL_SIZE, r13 +#endif + ret + +1: // AES-192 + testb $15, %dl // check whether expanded key is 16-byte aligned + jne 9f // if not 16-byte aligned, aesenc xmm, m128 won't work + pxor 192(ctx), %xmm0 + aesdec 176(ctx), %xmm0 + aesdec 160(ctx), %xmm0 + aesdec 144(ctx), %xmm0 + aesdec 128(ctx), %xmm0 + aesdec 112(ctx), %xmm0 + aesdec 96(ctx), %xmm0 + aesdec 80(ctx), %xmm0 + aesdec 64(ctx), %xmm0 + aesdec 48(ctx), %xmm0 + aesdec 32(ctx), %xmm0 + aesdec 16(ctx), %xmm0 + aesdeclast (ctx), %xmm0 + xorl %eax, %eax + movups %xmm0, (out) +#ifdef KERNEL + movaps (r13), %xmm0 + add $LOCAL_SIZE, r13 +#endif + ret +9: // AES-192 Decrypt : special case expanded key is not 16-byte aligned +#ifdef KERNEL + movaps %xmm1, 16(r13) // save xmm1 into stack +#endif + movups 192(ctx), %xmm1 + pxor %xmm1, %xmm0 
+ movups 176(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 160(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 144(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 128(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 112(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 96(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 80(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 64(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 48(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 32(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 16(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups (ctx), %xmm1 + aesdeclast %xmm1, %xmm0 + xorl %eax, %eax + movups %xmm0, (out) +#ifdef KERNEL + movaps (r13), %xmm0 + movaps 16(r13), %xmm1 + add $LOCAL_SIZE, r13 +#endif + ret + +2: // AES-256 + testb $15, %dl // check whether expanded key is 16-byte aligned + jne 9f // if not 16-byte aligned, aesenc xmm, m128 won't work + pxor 224(ctx), %xmm0 + aesdec 208(ctx), %xmm0 + aesdec 192(ctx), %xmm0 + aesdec 176(ctx), %xmm0 + aesdec 160(ctx), %xmm0 + aesdec 144(ctx), %xmm0 + aesdec 128(ctx), %xmm0 + aesdec 112(ctx), %xmm0 + aesdec 96(ctx), %xmm0 + aesdec 80(ctx), %xmm0 + aesdec 64(ctx), %xmm0 + aesdec 48(ctx), %xmm0 + aesdec 32(ctx), %xmm0 + aesdec 16(ctx), %xmm0 + aesdeclast (ctx), %xmm0 + xorl %eax, %eax + movups %xmm0, (out) +#ifdef KERNEL + movaps (r13), %xmm0 + add $LOCAL_SIZE, r13 +#endif + ret +9: // AES-256 Decrypt : special case expanded key is not 16-byte aligned +#ifdef KERNEL + movaps %xmm1, 16(r13) // save xmm1 into stack +#endif + movups 224(ctx), %xmm1 + pxor %xmm1, %xmm0 + movups 208(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 192(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 176(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 160(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 144(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 128(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 112(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 96(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 80(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 64(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 
48(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 32(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 16(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups (ctx), %xmm1 + aesdeclast %xmm1, %xmm0 + xorl %eax, %eax + movups %xmm0, (out) +#ifdef KERNEL + movaps (r13), %xmm0 + movaps 16(r13), %xmm1 + add $LOCAL_SIZE, r13 +#endif + ret + diff --git a/bsd/crypto/aes/i386/aes_key_hw.s b/bsd/crypto/aes/i386/aes_key_hw.s new file mode 100644 index 000000000..434fa553c --- /dev/null +++ b/bsd/crypto/aes/i386/aes_key_hw.s @@ -0,0 +1,405 @@ +/* This files defines _aes_encrypt_key_hw and _aes_decrypt_key_hw --- Intel Westmere HW AES-based implementation + of _aes_encrypt_key and _aes_decrypt_key. + + These 2 functions SHOULD BE entried ONLY after the AES HW is verified to be available. + They SHOULD NOT be called without AES HW detection. It might cause xnu to crash. + + The AES HW is detected 1st thing in + _aes_encrypt_key (ExpandKeyForEncryption.s) + _aes_decrypt_key (ExpandKeyForDecryption.s) + and, if AES HW is detected, branch without link (ie, jump) to the functions here. + + The implementation here follows the examples in an Intel White Paper + "Intel Advanced Encryption Standard (AES) Instruction Set" Rev.2 01 + + Note: Rev. 03 Final 2010 01 26 is available. 
Looks like some code change from Rev.2 01 + + cclee 3-13-10 +*/ + + .text + .align 4,0x90 + + // hw_aes_encrypt_key(key, klen, hwectx); + // klen = 16, 24, or 32, or (128/192/256) + + .globl _aes_encrypt_key_hw +_aes_encrypt_key_hw: + +#ifdef __i386__ + push %ebp + mov %esp, %ebp + push %ebx + push %edi + mov 8(%ebp), %eax // pointer to key + mov 12(%ebp), %ebx // klen + mov 16(%ebp), %edi // ctx + #define pkey %eax + #define klen %ebx + #define ctx %edi + #define sp %esp + #define cx %ecx +#else + #define pkey %rdi + #define klen %rsi + #define ctx %rdx + #define sp %rsp + #define cx %rcx + push %rbp + mov %rsp, %rbp +#endif + +#ifdef KERNEL + // for xmm registers save and restore + sub $(16*4), sp +#endif + + cmp $32, klen + jg 0f // klen>32 + shl $3, klen // convert 16/24/32 to 128/192/256 +0: + + cmp $128, klen // AES-128 ? + je L_AES_128_Encrypt_Key + cmp $192, klen // AES-192 ? + je L_AES_192_Encrypt_Key + cmp $256, klen // AES-256 ? + je L_AES_256_Encrypt_Key + mov $1, %eax // return error for wrong klen +L_Encrypt_Key_2_return: +#ifdef KERNEL + add $(16*4), sp +#endif +#ifdef __i386__ + pop %edi + pop %ebx +#endif + leave + ret + +L_AES_128_Encrypt_Key: +#ifdef KERNEL + // save xmm registers + movaps %xmm1, (sp) + movaps %xmm2, 16(sp) + movaps %xmm3, 32(sp) +#endif // KERNEL + + movl $160, 240(ctx) // write expanded key length to ctx + xor cx, cx + + movups (pkey), %xmm1 + movups %xmm1, (ctx) + aeskeygenassist $1, %xmm1, %xmm2 + call L_key_expansion_128 + aeskeygenassist $2, %xmm1, %xmm2 + call L_key_expansion_128 + aeskeygenassist $4, %xmm1, %xmm2 + call L_key_expansion_128 + aeskeygenassist $8, %xmm1, %xmm2 + call L_key_expansion_128 + aeskeygenassist $0x10, %xmm1, %xmm2 + call L_key_expansion_128 + aeskeygenassist $0x20, %xmm1, %xmm2 + call L_key_expansion_128 + aeskeygenassist $0x40, %xmm1, %xmm2 + call L_key_expansion_128 + aeskeygenassist $0x80, %xmm1, %xmm2 + call L_key_expansion_128 + aeskeygenassist $0x1b, %xmm1, %xmm2 + call L_key_expansion_128 + 
aeskeygenassist $0x36, %xmm1, %xmm2 + call L_key_expansion_128 + +#ifdef KERNEL + // restore xmm registers + movaps (sp), %xmm1 + movaps 16(sp), %xmm2 + movaps 32(sp), %xmm3 +#endif // KERNEL + xor %eax, %eax // return 0 for success + jmp L_Encrypt_Key_2_return + + .align 4, 0x90 +L_key_expansion_128: + pshufd $0xff, %xmm2, %xmm2 + movaps %xmm1, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + movaps %xmm1, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + movaps %xmm1, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pxor %xmm2, %xmm1 + add $16, cx + movups %xmm1, (ctx, cx) + ret + +L_AES_192_Encrypt_Key: +#ifdef KERNEL + // save xmm registers + movaps %xmm1, (sp) + movaps %xmm2, 16(sp) + movaps %xmm3, 32(sp) + movaps %xmm4, 48(sp) +#endif // KERNEL + movl $192, 240(ctx) // write expanded key length to ctx + + movups (pkey), %xmm1 + movq 16(pkey), %xmm3 + + movups %xmm1, (ctx) + movq %xmm3, 16(ctx) + + lea 24(ctx), cx + + aeskeygenassist $1, %xmm3, %xmm2 + call L_key_expansion_192 + aeskeygenassist $2, %xmm3, %xmm2 + call L_key_expansion_192 + aeskeygenassist $4, %xmm3, %xmm2 + call L_key_expansion_192 + aeskeygenassist $8, %xmm3, %xmm2 + call L_key_expansion_192 + aeskeygenassist $0x10, %xmm3, %xmm2 + call L_key_expansion_192 + aeskeygenassist $0x20, %xmm3, %xmm2 + call L_key_expansion_192 + aeskeygenassist $0x40, %xmm3, %xmm2 + call L_key_expansion_192 + aeskeygenassist $0x80, %xmm3, %xmm2 + call L_key_expansion_192 + +#ifdef KERNEL + // restore xmm registers + movaps (sp), %xmm1 + movaps 16(sp), %xmm2 + movaps 32(sp), %xmm3 + movaps 48(sp), %xmm4 +#endif // KERNEL + xor %eax, %eax // return 0 for success + jmp L_Encrypt_Key_2_return + + .align 4, 0x90 +L_key_expansion_192: + pshufd $0x55, %xmm2, %xmm2 + + movaps %xmm1, %xmm4 + pslldq $4, %xmm4 + + pxor %xmm4, %xmm1 + pslldq $4, %xmm4 + + pxor %xmm4, %xmm1 + pslldq $4, %xmm4 + + pxor %xmm4, %xmm1 + pxor %xmm2, %xmm1 + + pshufd $0xff, %xmm1, %xmm2 + + movaps %xmm3, %xmm4 + pslldq $4, %xmm4 + + pxor %xmm4, %xmm3 + pxor 
%xmm2, %xmm3 + + movups %xmm1, (cx) + movq %xmm3, 16(cx) + + add $24, cx + ret + +L_AES_256_Encrypt_Key: +#ifdef KERNEL + // save xmm registers + movaps %xmm1, (sp) + movaps %xmm2, 16(sp) + movaps %xmm3, 32(sp) + movaps %xmm4, 48(sp) +#endif // KERNEL + movl $224, 240(ctx) // write expanded key length to ctx + + movups (pkey), %xmm1 + movups 16(pkey), %xmm3 + movups %xmm1, (ctx) + movups %xmm3, 16(ctx) + + lea 32(ctx), cx + + aeskeygenassist $1, %xmm3, %xmm2 + call L_key_expansion_256 + aeskeygenassist $2, %xmm3, %xmm2 + call L_key_expansion_256 + aeskeygenassist $4, %xmm3, %xmm2 + call L_key_expansion_256 + aeskeygenassist $8, %xmm3, %xmm2 + call L_key_expansion_256 + aeskeygenassist $0x10, %xmm3, %xmm2 + call L_key_expansion_256 + aeskeygenassist $0x20, %xmm3, %xmm2 + call L_key_expansion_256 + aeskeygenassist $0x40, %xmm3, %xmm2 + call L_key_expansion_256_final + +#ifdef KERNEL + // restore xmm registers + movaps (sp), %xmm1 + movaps 16(sp), %xmm2 + movaps 32(sp), %xmm3 + movaps 48(sp), %xmm4 +#endif // KERNEL + xor %eax, %eax // return 0 for success + jmp L_Encrypt_Key_2_return + + .align 4, 0x90 +L_key_expansion_256: + + pshufd $0xff, %xmm2, %xmm2 + + movaps %xmm1, %xmm4 + pslldq $4, %xmm4 + + pxor %xmm4, %xmm1 + pslldq $4, %xmm4 + + pxor %xmm4, %xmm1 + pslldq $4, %xmm4 + + pxor %xmm4, %xmm1 + pxor %xmm2, %xmm1 + + movups %xmm1, (cx) + + aeskeygenassist $0, %xmm1, %xmm4 + + pshufd $0xaa, %xmm4, %xmm2 + + movaps %xmm3, %xmm4 + pslldq $4, %xmm4 + + pxor %xmm4, %xmm3 + pslldq $4, %xmm4 + + pxor %xmm4, %xmm3 + pslldq $4, %xmm4 + + pxor %xmm4, %xmm3 + pxor %xmm2, %xmm3 + + movups %xmm3, 16(cx) + + add $32, cx + ret + + .align 4, 0x90 +L_key_expansion_256_final: + + pshufd $0xff, %xmm2, %xmm2 + + movaps %xmm1, %xmm4 + pslldq $4, %xmm4 + + pxor %xmm4, %xmm1 + pslldq $4, %xmm4 + + pxor %xmm4, %xmm1 + pslldq $4, %xmm4 + + pxor %xmm4, %xmm1 + pxor %xmm2, %xmm1 + + movups %xmm1, (cx) + ret + +// _aes_decrypt_key_hw is implemented as +// 1. call _aes_encrypt_key_hw +// 2. 
use aesimc to convert the expanded round keys (except the 1st and last round keys) + + .text + .align 4, 0x90 + .globl _aes_decrypt_key_hw +_aes_decrypt_key_hw: + +#ifdef __i386__ + + push %ebp + mov %esp, %ebp + sub $(8+16), %esp + + // copy input arguments for calling aes_encrypt_key_hw + + mov 8(%ebp), %eax + mov %eax, (%esp) + mov 12(%ebp), %eax + mov %eax, 4(%esp) + mov 16(%ebp), %eax + mov %eax, 8(%esp) + +#else + + push %rbp + mov %rsp, %rbp + sub $16, %rsp + + // calling arguments %rdi/%rsi/%rdx will be used for encrypt_key + // %rdx (ctx) will return unchanged + // %rsi (klen) will (<<3) if <= 32 + +#endif + call _aes_encrypt_key_hw + cmp $0, %eax + je L_decrypt_inv +L_decrypt_almost_done: +#ifdef __i386__ + add $(8+16), %esp +#else + add $16, %rsp +#endif + leave + ret + +L_decrypt_inv: +#ifdef KERNEL + movaps %xmm0, (sp) +#endif + +#ifdef __i386__ + #undef klen + #undef ctx + mov 12(%ebp), %eax // klen + mov 16(%ebp), %edx // ctx + #define klen %eax + #define ctx %edx + cmp $32, klen + jg 0f // klen>32 + shl $3, klen // convert 16/24/32 to 128/192/256 +0: +#endif + + mov $9, cx // default is AES-128 + cmp $128, klen + je L_Decrypt_Key + add $2, cx + cmp $192, klen + je L_Decrypt_Key + add $2, cx + +L_Decrypt_Key: + add $16, ctx + movups (ctx), %xmm0 + aesimc %xmm0, %xmm0 + movups %xmm0, (ctx) + sub $1, cx + jg L_Decrypt_Key + +#ifdef KERNEL + movaps (sp), %xmm0 +#endif +#ifdef __i386__ + xor %eax, %eax +#endif + jmp L_decrypt_almost_done + diff --git a/bsd/crypto/aes/i386/aes_modes.c b/bsd/crypto/aes/i386/aes_modes.c deleted file mode 100644 index fd8b1401b..000000000 --- a/bsd/crypto/aes/i386/aes_modes.c +++ /dev/null @@ -1,471 +0,0 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The free distribution and use of this software in both source and binary - form is allowed (with or without changes) provided that: - - 1. 
distributions of this source code include the above copyright - notice, this list of conditions and the following disclaimer; - - 2. distributions in binary form include the above copyright - notice, this list of conditions and the following disclaimer - in the documentation and/or other associated materials; - - 3. the copyright holder's name is not used to endorse products - built using this software without specific written permission. - - ALTERNATIVELY, provided that this notice is retained in full, this product - may be distributed under the terms of the GNU General Public License (GPL), - in which case the provisions of the GPL apply INSTEAD OF those given above. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. - --------------------------------------------------------------------------- - Issue 31/01/2006 - - These subroutines implement multiple block AES modes for ECB, CBC, CFB, - OFB and CTR encryption, The code provides support for the VIA Advanced - Cryptography Engine (ACE). - - NOTE: In the following subroutines, the AES contexts (ctx) must be - 16 byte aligned if VIA ACE is being used -*/ - -//#include -#include - -#include "aesopt.h" - -#if defined( AES_MODES ) -#if defined(__cplusplus) -extern "C" -{ -#endif - -#if defined( _MSC_VER ) && ( _MSC_VER > 800 ) -#pragma intrinsic(memcpy) -#define in_line __inline -#else -#define in_line -#endif - -#define BFR_BLOCKS 8 - -/* These values are used to detect long word alignment in order to */ -/* speed up some buffer operations. 
This facility may not work on */ -/* some machines so this define can be commented out if necessary */ - -#define FAST_BUFFER_OPERATIONS -#pragma warning( disable : 4311 4312 ) - -#define lp08(x) ((uint_8t*)(x)) -#define lp32(x) ((uint_32t*)(x)) -#define addr_mod_04(x) ((unsigned long)(x) & 3) -#define addr_mod_16(x) ((unsigned long)(x) & 15) - -#if defined( USE_VIA_ACE_IF_PRESENT ) - -#include "via_ace.h" - -#pragma pack(16) - -aligned_array(unsigned long, enc_gen_table, 12, 16) = NEH_ENC_GEN_DATA; -aligned_array(unsigned long, enc_load_table, 12, 16) = NEH_ENC_LOAD_DATA; -aligned_array(unsigned long, enc_hybrid_table, 12, 16) = NEH_ENC_HYBRID_DATA; -aligned_array(unsigned long, dec_gen_table, 12, 16) = NEH_DEC_GEN_DATA; -aligned_array(unsigned long, dec_load_table, 12, 16) = NEH_DEC_LOAD_DATA; -aligned_array(unsigned long, dec_hybrid_table, 12, 16) = NEH_DEC_HYBRID_DATA; - -/* NOTE: These control word macros must only be used after */ -/* a key has been set up because they depend on key size */ - -#if NEH_KEY_TYPE == NEH_LOAD -#define kd_adr(c) ((uint_8t*)(c)->ks) -#elif NEH_KEY_TYPE == NEH_GENERATE -#define kd_adr(c) ((uint_8t*)(c)->ks + (c)->inf.b[0]) -#else -#define kd_adr(c) ((uint_8t*)(c)->ks + ((c)->inf.b[0] == 160 ? 
160 : 0)) -#endif - -#else - -#define aligned_array(type, name, no, stride) type name[no] -#define aligned_auto(type, name, no, stride) type name[no] - -#endif - -#if defined( _MSC_VER ) && _MSC_VER > 1200 - -#define via_cwd(cwd, ty, dir, len) unsigned long* cwd = (dir##_##ty##_table + ((len - 128) >> 4)) - -#else - -#define via_cwd(cwd, ty, dir, len) \ - aligned_auto(unsigned long, cwd, 4, 16); \ - cwd[1] = cwd[2] = cwd[3] = 0; \ - cwd[0] = neh_##dir##_##ty##_key(len) - -#endif - -/* implemented in case of wrong call for fixed tables */ -void gen_tabs(void) -{ -} - -aes_rval aes_mode_reset(aes_encrypt_ctx ctx[1]) -{ - ctx->inf.b[2] = 0; - return 0; -} - -aes_rval aes_ecb_encrypt(const unsigned char *ibuf, unsigned char *obuf, - int len, const aes_encrypt_ctx ctx[1]) -{ int nb = len >> 4; - - if(len & (AES_BLOCK_SIZE - 1)) - return 1; - -#if defined( USE_VIA_ACE_IF_PRESENT ) - - if(ctx->inf.b[1] == 0xff) - { uint_8t *ksp = (uint_8t*)(ctx->ks); - via_cwd(cwd, hybrid, enc, 2* ctx->inf.b[0] - 192); - - if(addr_mod_16(ctx)) - return 1; - - if(!addr_mod_16(ibuf) && !addr_mod_16(obuf)) - { - via_ecb_op5(ksp,cwd,ibuf,obuf,nb); - } - else - { aligned_auto(uint_8t, buf, BFR_BLOCKS * AES_BLOCK_SIZE, 16); - uint_8t *ip, *op; - - while(nb) - { - int m = (nb > BFR_BLOCKS ? BFR_BLOCKS : nb); - - ip = (addr_mod_16(ibuf) ? buf : (uint_8t*)ibuf); - op = (addr_mod_16(obuf) ? 
buf : obuf); - - if(ip != ibuf) - memcpy(buf, ibuf, m * AES_BLOCK_SIZE); - - via_ecb_op5(ksp,cwd,ip,op,m); - - if(op != obuf) - memcpy(obuf, buf, m * AES_BLOCK_SIZE); - - ibuf += m * AES_BLOCK_SIZE; - obuf += m * AES_BLOCK_SIZE; - nb -= m; - } - } - - return 0; - } - -#endif - -#if !defined( ASSUME_VIA_ACE_PRESENT ) - while(nb--) - { - aes_encrypt(ibuf, obuf, ctx); - ibuf += AES_BLOCK_SIZE; - obuf += AES_BLOCK_SIZE; - } -#endif - return 0; -} - -aes_rval aes_ecb_decrypt(const unsigned char *ibuf, unsigned char *obuf, - int len, const aes_decrypt_ctx ctx[1]) -{ int nb = len >> 4; - - if(len & (AES_BLOCK_SIZE - 1)) - return 1; - -#if defined( USE_VIA_ACE_IF_PRESENT ) - - if(ctx->inf.b[1] == 0xff) - { uint_8t *ksp = kd_adr(ctx); - via_cwd(cwd, hybrid, dec, 2* ctx->inf.b[0] - 192); - - if(addr_mod_16(ctx)) - return 1; - - if(!addr_mod_16(ibuf) && !addr_mod_16(obuf)) - { - via_ecb_op5(ksp,cwd,ibuf,obuf,nb); - } - else - { aligned_auto(uint_8t, buf, BFR_BLOCKS * AES_BLOCK_SIZE, 16); - uint_8t *ip, *op; - - while(nb) - { - int m = (nb > BFR_BLOCKS ? BFR_BLOCKS : nb); - - ip = (addr_mod_16(ibuf) ? buf : (uint_8t*)ibuf); - op = (addr_mod_16(obuf) ? 
buf : obuf); - - if(ip != ibuf) - memcpy(buf, ibuf, m * AES_BLOCK_SIZE); - - via_ecb_op5(ksp,cwd,ip,op,m); - - if(op != obuf) - memcpy(obuf, buf, m * AES_BLOCK_SIZE); - - ibuf += m * AES_BLOCK_SIZE; - obuf += m * AES_BLOCK_SIZE; - nb -= m; - } - } - - return 0; - } - -#endif - -#if !defined( ASSUME_VIA_ACE_PRESENT ) - while(nb--) - { - aes_decrypt(ibuf, obuf, ctx); - ibuf += AES_BLOCK_SIZE; - obuf += AES_BLOCK_SIZE; - } -#endif - return 0; -} - -aes_rval aes_cbc_encrypt(const unsigned char *ibuf, unsigned char *obuf, - int len, unsigned char *iv, const aes_encrypt_ctx ctx[1]) -{ int nb = len >> 4; - - if(len & (AES_BLOCK_SIZE - 1)) - return 1; - -#if defined( USE_VIA_ACE_IF_PRESENT ) - - if(ctx->inf.b[1] == 0xff) - { uint_8t *ksp = (uint_8t*)(ctx->ks), *ivp = iv; - aligned_auto(uint_8t, liv, AES_BLOCK_SIZE, 16); - via_cwd(cwd, hybrid, enc, 2* ctx->inf.b[0] - 192); - - if(addr_mod_16(ctx)) - return 1; - - if(addr_mod_16(iv)) /* ensure an aligned iv */ - { - ivp = liv; - memcpy(liv, iv, AES_BLOCK_SIZE); - } - - if(!addr_mod_16(ibuf) && !addr_mod_16(obuf) && !addr_mod_16(iv)) - { - via_cbc_op7(ksp,cwd,ibuf,obuf,nb,ivp,ivp); - } - else - { aligned_auto(uint_8t, buf, BFR_BLOCKS * AES_BLOCK_SIZE, 16); - uint_8t *ip, *op; - - while(nb) - { - int m = (nb > BFR_BLOCKS ? BFR_BLOCKS : nb); - - ip = (addr_mod_16(ibuf) ? buf : (uint_8t*)ibuf); - op = (addr_mod_16(obuf) ? 
buf : obuf); - - if(ip != ibuf) - memcpy(buf, ibuf, m * AES_BLOCK_SIZE); - - via_cbc_op7(ksp,cwd,ip,op,m,ivp,ivp); - - if(op != obuf) - memcpy(obuf, buf, m * AES_BLOCK_SIZE); - - ibuf += m * AES_BLOCK_SIZE; - obuf += m * AES_BLOCK_SIZE; - nb -= m; - } - } - - if(iv != ivp) - memcpy(iv, ivp, AES_BLOCK_SIZE); - - return 0; - } - -#endif - -#if !defined( ASSUME_VIA_ACE_PRESENT ) -# ifdef FAST_BUFFER_OPERATIONS - if(!addr_mod_04(ibuf) && !addr_mod_04(iv)) - while(nb--) - { - lp32(iv)[0] ^= lp32(ibuf)[0]; - lp32(iv)[1] ^= lp32(ibuf)[1]; - lp32(iv)[2] ^= lp32(ibuf)[2]; - lp32(iv)[3] ^= lp32(ibuf)[3]; - aes_encrypt(iv, iv, ctx); - memcpy(obuf, iv, AES_BLOCK_SIZE); - ibuf += AES_BLOCK_SIZE; - obuf += AES_BLOCK_SIZE; - } - else -# endif - while(nb--) - { - iv[ 0] ^= ibuf[ 0]; iv[ 1] ^= ibuf[ 1]; - iv[ 2] ^= ibuf[ 2]; iv[ 3] ^= ibuf[ 3]; - iv[ 4] ^= ibuf[ 4]; iv[ 5] ^= ibuf[ 5]; - iv[ 6] ^= ibuf[ 6]; iv[ 7] ^= ibuf[ 7]; - iv[ 8] ^= ibuf[ 8]; iv[ 9] ^= ibuf[ 9]; - iv[10] ^= ibuf[10]; iv[11] ^= ibuf[11]; - iv[12] ^= ibuf[12]; iv[13] ^= ibuf[13]; - iv[14] ^= ibuf[14]; iv[15] ^= ibuf[15]; - aes_encrypt(iv, iv, ctx); - memcpy(obuf, iv, AES_BLOCK_SIZE); - ibuf += AES_BLOCK_SIZE; - obuf += AES_BLOCK_SIZE; - } -#endif - return 0; -} - -aes_rval aes_encrypt_cbc(const unsigned char *in_blk, const unsigned char *in_iv, unsigned int num_blk, - unsigned char *out_blk, const aes_encrypt_ctx cx[1]) -{ - unsigned char tmp_iv[16]; - int i; - - for (i = 0; i < 16; i++) - tmp_iv[i] = *(in_iv + i); - - return aes_cbc_encrypt(in_blk, out_blk, num_blk<<4, tmp_iv, cx); - -} - -aes_rval aes_cbc_decrypt(const unsigned char *ibuf, unsigned char *obuf, - int len, unsigned char *iv, const aes_decrypt_ctx ctx[1]) -{ unsigned char tmp[AES_BLOCK_SIZE]; - int nb = len >> 4; - - if(len & (AES_BLOCK_SIZE - 1)) - return 1; - -#if defined( USE_VIA_ACE_IF_PRESENT ) - - if(ctx->inf.b[1] == 0xff) - { uint_8t *ksp = kd_adr(ctx), *ivp = iv; - aligned_auto(uint_8t, liv, AES_BLOCK_SIZE, 16); - via_cwd(cwd, hybrid, 
dec, 2* ctx->inf.b[0] - 192); - - if(addr_mod_16(ctx)) - return 1; - - if(addr_mod_16(iv)) /* ensure an aligned iv */ - { - ivp = liv; - memcpy(liv, iv, AES_BLOCK_SIZE); - } - - if(!addr_mod_16(ibuf) && !addr_mod_16(obuf) && !addr_mod_16(iv)) - { - via_cbc_op6(ksp,cwd,ibuf,obuf,nb,ivp); - } - else - { aligned_auto(uint_8t, buf, BFR_BLOCKS * AES_BLOCK_SIZE, 16); - uint_8t *ip, *op; - - while(nb) - { - int m = (nb > BFR_BLOCKS ? BFR_BLOCKS : nb); - - ip = (addr_mod_16(ibuf) ? buf : (uint_8t*)ibuf); - op = (addr_mod_16(obuf) ? buf : obuf); - - if(ip != ibuf) - memcpy(buf, ibuf, m * AES_BLOCK_SIZE); - - via_cbc_op6(ksp,cwd,ip,op,m,ivp); - - if(op != obuf) - memcpy(obuf, buf, m * AES_BLOCK_SIZE); - - ibuf += m * AES_BLOCK_SIZE; - obuf += m * AES_BLOCK_SIZE; - nb -= m; - } - } - - if(iv != ivp) - memcpy(iv, ivp, AES_BLOCK_SIZE); - - return 0; - } -#endif - -#if !defined( ASSUME_VIA_ACE_PRESENT ) -# ifdef FAST_BUFFER_OPERATIONS - if(!addr_mod_04(obuf) && !addr_mod_04(iv)) - while(nb--) - { - memcpy(tmp, ibuf, AES_BLOCK_SIZE); - aes_decrypt(ibuf, obuf, ctx); - lp32(obuf)[0] ^= lp32(iv)[0]; - lp32(obuf)[1] ^= lp32(iv)[1]; - lp32(obuf)[2] ^= lp32(iv)[2]; - lp32(obuf)[3] ^= lp32(iv)[3]; - memcpy(iv, tmp, AES_BLOCK_SIZE); - ibuf += AES_BLOCK_SIZE; - obuf += AES_BLOCK_SIZE; - } - else -# endif - while(nb--) - { - memcpy(tmp, ibuf, AES_BLOCK_SIZE); - aes_decrypt(ibuf, obuf, ctx); - obuf[ 0] ^= iv[ 0]; obuf[ 1] ^= iv[ 1]; - obuf[ 2] ^= iv[ 2]; obuf[ 3] ^= iv[ 3]; - obuf[ 4] ^= iv[ 4]; obuf[ 5] ^= iv[ 5]; - obuf[ 6] ^= iv[ 6]; obuf[ 7] ^= iv[ 7]; - obuf[ 8] ^= iv[ 8]; obuf[ 9] ^= iv[ 9]; - obuf[10] ^= iv[10]; obuf[11] ^= iv[11]; - obuf[12] ^= iv[12]; obuf[13] ^= iv[13]; - obuf[14] ^= iv[14]; obuf[15] ^= iv[15]; - memcpy(iv, tmp, AES_BLOCK_SIZE); - ibuf += AES_BLOCK_SIZE; - obuf += AES_BLOCK_SIZE; - } -#endif - return 0; -} - -aes_rval aes_decrypt_cbc(const unsigned char *in_blk, const unsigned char *in_iv, unsigned int num_blk, - unsigned char *out_blk, const aes_decrypt_ctx 
cx[1]) -{ - unsigned char tmp_iv[16]; - int i; - - for (i = 0; i < 16; i++) - tmp_iv[i] = *(in_iv + i); - - return aes_cbc_decrypt(in_blk, out_blk, num_blk<<4, tmp_iv, cx); - -} - - -#if defined(__cplusplus) -} -#endif -#endif diff --git a/bsd/crypto/aes/i386/aes_modes_asm.s b/bsd/crypto/aes/i386/aes_modes_asm.s new file mode 100644 index 000000000..3b0f29aa1 --- /dev/null +++ b/bsd/crypto/aes/i386/aes_modes_asm.s @@ -0,0 +1,420 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue 31/01/2006 + + These subroutines implement multiple block AES modes for ECB, CBC, CFB, + OFB and CTR encryption, The code provides support for the VIA Advanced + Cryptography Engine (ACE). 
+ + NOTE: In the following subroutines, the AES contexts (ctx) must be + 16 byte aligned if VIA ACE is being used +*/ + +/* modified 3/5/10 cclee */ +/* Clean up those related to VIA ACE and hand optimize aes_cbc_encrypt and aes_cbc_decrypt */ +/* move the xmm registers save/restore originally inside the callee functions into these 2 caller functions */ + +/* add code comments/description and HW AES detection and execution branch cclee 3-13-10 */ + +#ifdef KERNEL +#include // to use __cpu_capabilities&kHasAES to detect Intel Westmere AES HW +#else +#include // to use __cpu_capabilities&kHasAES to detect Intel Westmere AES HW +#endif + +#if 0 + +// TODO: +// aes_ecb_encrypt and aes_ecb_decrypt are not present in gen/aescrypt.c +// would add the implementation if needed +// they are now compiled from aes_modes.c + +aes_rval aes_ecb_encrypt(const unsigned char *ibuf, unsigned char *obuf, + int len, const aes_encrypt_ctx ctx[1]) +{ int nb = len >> 4; + + if(len & (AES_BLOCK_SIZE - 1)) return 1; + while(nb--) { + aes_encrypt(ibuf, obuf, ctx); + ibuf += AES_BLOCK_SIZE; + obuf += AES_BLOCK_SIZE; + } + return 0; +} + +aes_rval aes_ecb_decrypt(const unsigned char *ibuf, unsigned char *obuf, + int len, const aes_decrypt_ctx ctx[1]) +{ int nb = len >> 4; + + if(len & (AES_BLOCK_SIZE - 1)) return 1; + while(nb--) { + aes_decrypt(ibuf, obuf, ctx); + ibuf += AES_BLOCK_SIZE; + obuf += AES_BLOCK_SIZE; + } + return 0; +} +#endif + +#if 0 +aes_rval aes_encrypt_cbc(const unsigned char *ibuf, const unsigned char *in_iv, unsigned int num_blk, + unsigned char *obuf, const aes_encrypt_ctx ctx[1]) +{ + unsigned char iv[16]; + int i; + + for (i = 0; i < 16; i++) iv[i] = *(in_iv + i); + + while (num_blk--) { + iv ^= ibuf; // 128-bit + aes_encrypt(iv, iv, ctx); + memcpy(obuf, iv, AES_BLOCK_SIZE); + ibuf += AES_BLOCK_SIZE; + obuf += AES_BLOCK_SIZE; + + } + + return 0; +} +#endif + + .text + .align 4,0x90 + .globl _aes_encrypt_cbc +_aes_encrypt_cbc: + + // detect AES HW + // if AES HW 
detected, branch to AES-HW-specific function _aes_encrypt_cbc_hw (aes_modes_hw.s) + // otherwise, fall through to the original AES-SW function + +#if defined __x86_64__ + movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities + mov (%rax), %eax // %eax = __cpu_capabilities +#else +#ifdef KERNEL + leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities + mov (%eax), %eax // %eax = __cpu_capabilities +#else + mov _COMM_PAGE_CPU_CAPABILITIES, %eax +#endif +#endif + test $(kHasAES), %eax // kHasAES & __cpu_capabilities + jne _aes_encrypt_cbc_hw // if AES HW detected, branch to HW-specific code + + // save registers and allocate stack memory for xmm registers and calling arguments (i386 only) +#if defined __i386__ + push %ebp + mov %esp, %ebp + push %ebx // to be used as ibuf + push %edi // to be used as obuf + sub $(16+16+7*16), %esp // 12 (calling arguments) + 4 (%esi) + 16 (iv) + 7*16 (xmm) + mov %esi, 12(%esp) // save %esi in the unused 4-bytes, %esi to be used as num_blk + + #define sp %esp +#else // __x86_64__ + push %rbp + mov %rsp, %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + sub $(8+16+5*16+16), %rsp // 8 (align) + 16 (dummy iv) + 5*16 (xmm) + 16 (for i386-x86_64 consistency) + + #define sp %rsp +#endif + + // save xmm registers for kernel use + // xmm6-xmm7 will be used locally + // xmm0-xmm2 (x86_64) or xmm0-xmm4 (i386) will be used inside _aes_encrypt_xmm_no_save (non-restored) + // there is a hole not used for xmm, which is 48(sp). 
+ // it has been used to store iv (16-bytes) in i386 code + // for consistency between i386 and x86_64, this hole is dummied in x86_64 code + // also the 1st 16 bytes (sp) is dummied in x86_64 code + +#ifdef KERNEL + movaps %xmm7, 16(sp) + movaps %xmm6, 32(sp) + movaps %xmm0, 64(sp) + movaps %xmm1, 80(sp) + movaps %xmm2, 96(sp) +#if defined __i386__ + movaps %xmm3, 112(sp) + movaps %xmm4, 128(sp) +#endif +#endif + + // set up registers from calling arguments + +#if defined __i386__ + + mov 12(%ebp), %eax // in_iv + mov 24(%ebp), %edx // ctx + movups (%eax), %xmm7 // in_iv + lea 48(%esp), %eax // &iv[0] + mov %eax, (%esp) // 1st iv for aes_encrypt + mov %eax, 4(%esp) // 2nd iv for aes_encrypt + mov %edx, 8(%esp) // ctx for aes_encrypt + mov 8(%ebp), %ebx // ibuf + mov 16(%ebp), %esi // num_blk + mov 20(%ebp), %edi // obuf + + #define ibuf %ebx + #define obuf %edi + #define num_blk %esi + +#else // __x86_64__, calling arguments order : rdi/rsi/rdx/rcx/r8 + + mov %rdi, %rbx // ibuf + lea 48(sp), %r12 // &iv + movups (%rsi), %xmm7 // in_iv + mov %rdx, %r13 // num_blk + mov %rcx, %r14 // obuf + mov %r8, %r15 // ctx + + #define ibuf %rbx + #define iv %r12 + #define num_blk %r13d + #define obuf %r14 + #define ctx %r15 + +#endif + + cmp $1, num_blk // num_blk vs 1 + jl 9f // if num_blk < 1, branch to bypass the main loop +0: + movups (ibuf), %xmm6 // ibuf +#if defined __i386__ + lea 48(sp), %eax // &iv[0] + pxor %xmm6, %xmm7 // iv ^= ibuf + movups %xmm7, (%eax) // save iv +#else + pxor %xmm6, %xmm7 // iv ^= ibuf + movups %xmm7, (iv) // save iv + mov iv, %rdi // 1st calling argument for aes_encrypt + mov iv, %rsi // 2nd calling argument for aes_encrypt + mov ctx, %rdx // 3rd calling argument for aes_encrypt +#endif + call _aes_encrypt_xmm_no_save // aes_encrypt(iv, iv, ctx) +#if defined __i386__ + leal 48(%esp), %eax // &iv[0] + movups (%eax), %xmm7 // read iv +#else + movups (iv), %xmm7 // read iv +#endif + movups %xmm7, (obuf) // memcpy(obuf, iv, AES_BLOCK_SIZE); + add 
$16, ibuf // ibuf += AES_BLOCK_SIZE; + add $16, obuf // obuf += AES_BLOCK_SIZE; + sub $1, num_blk // num_blk -- + jg 0b // if num_blk > 0, repeat the loop +9: + +L_crypt_cbc_done: + + // restore xmm registers due to kernel use +#ifdef KERNEL + movaps 16(sp), %xmm7 + movaps 32(sp), %xmm6 + movaps 64(sp), %xmm0 + movaps 80(sp), %xmm1 + movaps 96(sp), %xmm2 +#if defined __i386__ + movaps 112(sp), %xmm3 + movaps 128(sp), %xmm4 +#endif +#endif + + xor %eax, %eax // to return 0 for SUCCESS + +#if defined __i386__ + mov 12(%esp), %esi // restore %esi + add $(16+16+7*16), %esp // 12 (calling arguments) + 4 (%esi) + 16 (iv) + 7*16 (xmm) + pop %edi + pop %ebx +#else + add $(8+16+5*16+16), %rsp // 8 (align) + 16 (dummy iv) + 5*16 (xmm) + 16 (for i386-x86_64 consistency) + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx +#endif + leave + ret + +#if 0 +aes_rval aes_decrypt_cbc(const unsigned char *ibuf, const unsigned char *in_iv, unsigned int num_blk, + unsigned char *obuf, const aes_decrypt_ctx cx[1]) +{ + unsigned char iv[16], tmp[16]; + int i; + + for (i = 0; i < 16; i++) iv[i] = *(in_iv + i); + + while (num_blk--) { + + memcpy(tmp, ibuf, AES_BLOCK_SIZE); + aes_decrypt(ibuf, obuf, ctx); + obuf ^= iv; + memcpy(iv, tmp, AES_BLOCK_SIZE); + ibuf += AES_BLOCK_SIZE; + obuf += AES_BLOCK_SIZE; + } + + return 0; +} +#endif + + .text + .align 4,0x90 + .globl _aes_decrypt_cbc +_aes_decrypt_cbc: + + // detect AES HW + // if AES HW detected, branch to AES-HW-specific function _aes_decrypt_cbc_hw (aes_modes_hw.s) + // otherwise, fall through to the original AES-SW function + +#if defined __x86_64__ + movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities + mov (%rax), %eax // %eax = __cpu_capabilities +#else +#ifdef KERNEL + leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities + mov (%eax), %eax // %eax = __cpu_capabilities +#else + mov _COMM_PAGE_CPU_CAPABILITIES, %eax +#endif +#endif + test $(kHasAES), %eax // kHasAES & __cpu_capabilities + jne 
_aes_decrypt_cbc_hw + + // save registers and allocate stack memory for xmm registers and calling arguments (i386 only) +#if defined __i386__ + push %ebp + mov %esp, %ebp + push %ebx // to be used as ibuf + push %edi // to be used as obuf + sub $(16+16+7*16), %esp // 12 (calling arguments) + 4 (%esi) + 16 (iv) + 7*16 (xmm) + mov %esi, 12(%esp) // save %esi in the unused 4-bytes, %esi to be used as num_blk + + #define sp %esp +#else // __x86_64__ + push %rbp + mov %rsp, %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + sub $(8+16+5*16+16), %rsp // 8 (align) + 16 (dummy iv) + 5*16 (xmm) + 16 (for i386-x86_64 consistency) + + #define sp %rsp +#endif + + // save xmm registers for kernel use + // xmm6-xmm7 will be used locally + // xmm0-xmm2 (x86_64) or xmm0-xmm4 (i386) will be used inside _aes_decrypt_xmm_no_save (non-restored) + // there is a hole not used for xmm, which is 48(sp). + // it has been used to store iv (16-bytes) in i386 code + // for consistency between i386 and x86_64, this hole is dummied in x86_64 code + // also the 1st 16 bytes (sp) is dummied in x86_64 code + +#ifdef KERNEL + movaps %xmm7, 16(sp) + movaps %xmm6, 32(sp) + movaps %xmm0, 64(sp) + movaps %xmm1, 80(sp) + movaps %xmm2, 96(sp) +#if defined __i386__ + movaps %xmm3, 112(sp) + movaps %xmm4, 128(sp) +#endif +#endif + + // set up registers from calling arguments + +#if defined __i386__ + mov 12(%ebp), %eax // in_iv + mov 24(%ebp), %edx // ctx + movups (%eax), %xmm7 // in_iv + mov %edx, 8(%esp) // ctx for aes_decrypt + mov 8(%ebp), %ebx // ibuf + mov 16(%ebp), %esi // num_blk + mov 20(%ebp), %edi // obuf + + #define ibuf %ebx + #define obuf %edi + #define num_blk %esi +#else // __x86_64__, rdi/rsi/rdx/rcx/r8 + mov %rdi, %rbx // ibuf + movups (%rsi), %xmm7 // in_iv + mov %rdx, %r13 // num_blk + mov %rcx, %r14 // obuf + mov %r8, %r15 // ctx + + #define ibuf %rbx + #define num_blk %r13d + #define obuf %r14 + #define ctx %r15 + +#endif + // memcpy(tmp, ibuf, AES_BLOCK_SIZE); + // 
aes_decrypt(ibuf, obuf, ctx); + // obuf ^= iv; + // memcpy(iv, tmp, AES_BLOCK_SIZE); + // ibuf += AES_BLOCK_SIZE; + // obuf += AES_BLOCK_SIZE; + + cmp $1, num_blk // num_blk vs 1 + jl L_crypt_cbc_done // if num_blk < 1, bypass the main loop, jump to finishing code +0: + movups (ibuf), %xmm6 // tmp = *ibuf (copied into iv after this block is decrypted) +#if defined __i386__ + mov ibuf, (sp) // ibuf + mov obuf, 4(sp) // obuf +#else + mov ibuf, %rdi // ibuf + mov obuf, %rsi // obuf + mov ctx, %rdx // ctx +#endif + call _aes_decrypt_xmm_no_save // aes_decrypt(ibuf, obuf, ctx) + movups (obuf), %xmm0 // obuf + pxor %xmm7, %xmm0 // obuf ^= iv; + movaps %xmm6, %xmm7 // memcpy(iv, tmp, AES_BLOCK_SIZE); + movups %xmm0, (obuf) // update obuf + add $16, ibuf // ibuf += AES_BLOCK_SIZE; + add $16, obuf // obuf += AES_BLOCK_SIZE; + sub $1, num_blk // num_blk -- + jg 0b // if num_blk > 0, repeat the loop +9: + + // we are done here, the finishing code is identical to that in aes_encrypt_cbc, so just jump there + jmp L_crypt_cbc_done + diff --git a/bsd/crypto/aes/i386/aes_modes_hw.s b/bsd/crypto/aes/i386/aes_modes_hw.s new file mode 100644 index 000000000..401fd3dd9 --- /dev/null +++ b/bsd/crypto/aes/i386/aes_modes_hw.s @@ -0,0 +1,1669 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. 
+ + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue 31/01/2006 + + These subroutines implement multiple block AES modes for ECB, CBC, CFB, + OFB and CTR encryption, The code provides support for the VIA Advanced + Cryptography Engine (ACE). + + NOTE: In the following subroutines, the AES contexts (ctx) must be + 16 byte aligned if VIA ACE is being used +*/ + +/* modified 3/5/10 cclee */ +/* Clean up those related to VIA ACE and hand optimize aes_cbc_encrypt and aes_cbc_decrypt */ +/* move the xmm registers save/restore originally inside the callee functions into these 2 caller functions */ + +/* HW-AES specific implementation cclee 3-12-10 */ +/* In aes_encrypt_cbc and aes_decrypt_cbc, __cpu_capabilities is polled, + and if kHasAES is detected, branch to the hw-specific functions here */ + + +/* + This file defines _aes_encrypt_cbc_hw and _aes_decrypt_cbc_hw --- Intel Westmere HW AES-based implementation + of _aes_encrypt_cbc and _aes_decrypt_cbc. + + These 2 functions SHOULD BE entered ONLY after the AES HW is verified to be available. + They SHOULD NOT be called without AES HW detection. Doing so might cause xnu to crash. + + The AES HW is detected 1st thing in + _aes_encrypt_cbc (aes_modes_asm.s) + _aes_decrypt_cbc (aes_modes_asm.s) + and, if AES HW is detected, branch without link (i.e., jump) to the functions here. + + The implementation here follows the examples in an Intel White Paper + "Intel Advanced Encryption Standard (AES) Instruction Set" Rev.2 01 + + Note: Rev. 03 Final 2010 01 26 is available. 
Looks like some code changed from Rev.2 01 + + cclee 3-13-10 +*/ + +/* + The function _aes_decrypt_cbc_hw previously simply decrypted serially, block by block. + In our group meeting, Eric/Ali suggested that I perhaps should take a look at combining multiple blocks + in a loop and interleaving multiple aesdec instructions to absorb/hide stalls to improve the decrypt throughput. + + The idea was actually described in the Intel AES Instruction Set White Paper (Rev. 2.0 page 53-55) + + This modification interleaves the aesdec/aesdeclast instructions for 4 blocks in cbc mode. + On a K18 (2.4GHz core-i5/2.66GHz core-i7), the x86_64 decrypt throughput (in xnu-iokit) has been improved + from 1180/1332 to 1667/1858 MBytes/sec. This is approximately a 1.40 times speedup in the decryption. + The encrypt throughput is not changed. + + I also enhanced the assembly code comments. + + cclee-4-30-10 (Do you know 4-30 is National Honesty Day in the US? No need to know. I've been honest all the time.) + +*/ + +/* ---------------------------------------------------------------------------------------------------------------- + + aes_encrypt_cbc function (see aes_modes.c or aes_modes_asm.s) : + + For simplicity, I am assuming all variables are in 128-bit data type. + + aes_rval aes_encrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_encrypt_ctx *ctx) + { + while(num_blk--) { + *iv ^= *ibuf++; + aes_encrypt(iv, iv, ctx); + *obuf++ = *iv; + } + return 0; + } + + The following is an implementation of this function using Intel AESNI. + This function _aes_encrypt_cbc_hw SHOULD NOT be called directly. + Developer should still call _aes_encrypt_cbc (in aes_modes_asm.s) which will poll cpu_capabilities and branch + to this aesni-based function should it detect that aesni is available. + Blindly calling this function will SURELY cause a CRASH on systems with no aesni support. + + Note that each block starts with *iv, which is the output of the previous block. 
Therefore, the cbc blocks + are serially chained. This prevents us from arranging several blocks for encryption in parallel. + + ----------------------------------------------------------------------------------------------------------------*/ + + .text + .align 4,0x90 + .globl _aes_encrypt_cbc_hw +_aes_encrypt_cbc_hw: + + // push/save registers for local use +#if defined __i386__ + + push %ebp + movl %esp, %ebp + push %ebx + push %edi + + #define sp %esp + +#else // __x86_64__ + + push %rbp + mov %rsp, %rbp + push %rbx + push %r13 + push %r14 + push %r15 + + #define sp %rsp + +#endif + + // if this is kernel code, need to save used xmm registers +#ifdef KERNEL + +#if defined __i386__ + sub $(8*16), %esp // for possible xmm0-xmm7 save/restore +#else + sub $(16*16), %rsp // xmm0-xmm15 save/restore +#endif + + movaps %xmm0, (sp) + movaps %xmm1, 16(sp) + movaps %xmm2, 32(sp) + movaps %xmm3, 48(sp) + movaps %xmm4, 64(sp) + movaps %xmm5, 80(sp) + movaps %xmm6, 96(sp) + movaps %xmm7, 112(sp) +#if defined __x86_64__ + movaps %xmm8, 16*8(sp) + movaps %xmm9, 16*9(sp) + movaps %xmm10, 16*10(sp) + movaps %xmm11, 16*11(sp) + movaps %xmm12, 16*12(sp) + movaps %xmm13, 16*13(sp) + movaps %xmm14, 16*14(sp) + movaps %xmm15, 16*15(sp) +#endif // __x86_64__ + +#endif // KERNEL + + #define iv %xmm0 + +#ifdef __i386__ + + mov 12(%ebp), %eax // in_iv + mov 24(%ebp), %edx // ctx + movups (%eax), iv // iv = in_iv + mov 8(%ebp), %ebx // ibuf + mov 16(%ebp), %ecx // num_blk + mov 20(%ebp), %edi // obuf + + #define ibuf %ebx + #define obuf %edi + #define num_blk %ecx + #define ctx %edx + +#else + + mov %rdi, %rbx // ibuf + movups (%rsi), iv // iv = in_iv + mov %rdx, %r13 // num_blk + mov %rcx, %r14 // obuf + mov %r8, %r15 // ctx + + #define ibuf %rbx + #define num_blk %r13d + #define obuf %r14 + #define ctx %r15 + +#endif + + mov 240(ctx), %eax // aes length (160/192/224 selects aes-128/192/256 below) + cmp $160, %eax // aes-128 encrypt ? + je L_encrypt_128 + cmp $192, %eax // aes-192 encrypt ? 
+ je L_encrypt_192 + cmp $224, %eax // aes-256 encrypt ? + je L_encrypt_256 + mov $-1, %eax // return error + jmp L_error + + // + // aes-128 encrypt_cbc operation, up to L_HW_cbc_done + // + +L_encrypt_128: + + cmp $1, num_blk // check number of blocks + jl L_HW_cbc_done // should it be less than 1, nothing to do + + movups (ctx), %xmm2 // key0 + movups 16(ctx), %xmm3 // key1 + movups 32(ctx), %xmm4 // key2 + movups 48(ctx), %xmm5 // key3 + movups 64(ctx), %xmm6 // key4 + movups 80(ctx), %xmm7 // key5 +#if defined __x86_64__ + movups 96(ctx), %xmm8 // key6 + movups 112(ctx), %xmm9 // key7 + movups 128(ctx), %xmm10 // key8 + movups 144(ctx), %xmm11 // key9 + movups 160(ctx), %xmm12 // keyA +#endif + + // while (num_blk--) { + // *iv ^= *ibuf++; + // aes_encrypt(iv, iv, ctx); + // *obuf++ = *iv; + // } +0: + movups (ibuf), %xmm1 // *ibuf + pxor %xmm2, iv // 1st instruction inside aes_encrypt + pxor %xmm1, iv // *iv ^= *ibuf + + // finishing up the rest of aes_encrypt + aesenc %xmm3, iv + aesenc %xmm4, iv + aesenc %xmm5, iv + aesenc %xmm6, iv + aesenc %xmm7, iv +#if defined __x86_64__ + aesenc %xmm8, iv + aesenc %xmm9, iv + aesenc %xmm10, iv + aesenc %xmm11, iv + aesenclast %xmm12, iv +#else + movups 96(ctx), %xmm1 // key6 + aesenc %xmm1, iv + movups 112(ctx), %xmm1 // key7 + aesenc %xmm1, iv + movups 128(ctx), %xmm1 // key8 + aesenc %xmm1, iv + movups 144(ctx), %xmm1 // key9 + aesenc %xmm1, iv + movups 160(ctx), %xmm1 // keyA + aesenclast %xmm1, iv +#endif + + movups iv, (obuf) // *obuf = *iv; + add $16, obuf // obuf++; + add $16, ibuf // ibuf++; + sub $1, num_blk // num_blk -- + jg 0b // if num_blk > 0, repeat the loop + + // the following will be branched to from all other cases (encrypt/decrypt 128/192/256); NOTE: the chained iv lives only in %xmm0, this code never writes it back through in_iv + +L_HW_cbc_done: + + xor %eax, %eax // to return CRYPT_OK + +L_error: + + // if kernel, restore xmm registers +#ifdef KERNEL + movaps 0(sp), %xmm0 + movaps 16(sp), %xmm1 + movaps 32(sp), %xmm2 + movaps 48(sp), %xmm3 + movaps 64(sp), %xmm4 + movaps 80(sp), 
%xmm5 + movaps 96(sp), %xmm6 + movaps 112(sp), %xmm7 +#if defined __x86_64__ + movaps 16*8(sp), %xmm8 + movaps 16*9(sp), %xmm9 + movaps 16*10(sp), %xmm10 + movaps 16*11(sp), %xmm11 + movaps 16*12(sp), %xmm12 + movaps 16*13(sp), %xmm13 + movaps 16*14(sp), %xmm14 + movaps 16*15(sp), %xmm15 +#endif // __x86_64__ +#endif // KERNEL + + // release used stack memory, restore used callee-saved registers, and return +#if defined __i386__ +#ifdef KERNEL + add $(8*16), %esp +#endif + pop %edi + pop %ebx +#else +#ifdef KERNEL + add $(16*16), %rsp +#endif + pop %r15 + pop %r14 + pop %r13 + pop %rbx +#endif + leave + ret + + // + // aes-192 encrypt_cbc operation, after completion, branch to L_HW_cbc_done + // + +L_encrypt_192: + + cmp $1, num_blk // check number of blocks + jl L_HW_cbc_done // should it be less than 1, nothing to do + + movups (ctx), %xmm2 // key0 + movups 16(ctx), %xmm3 // key1 + movups 32(ctx), %xmm4 // key2 + movups 48(ctx), %xmm5 // key3 + movups 64(ctx), %xmm6 // key4 + movups 80(ctx), %xmm7 // key5 +#if defined __x86_64__ + movups 96(ctx), %xmm8 // key6 + movups 112(ctx), %xmm9 // key7 + movups 128(ctx), %xmm10 // key8 + movups 144(ctx), %xmm11 // key9 + movups 160(ctx), %xmm12 // keyA + movups 176(ctx), %xmm13 // keyB + movups 192(ctx), %xmm14 // keyC +#endif + + // while (num_blk--) { + // *iv ^= *ibuf++; + // aes_encrypt(iv, iv, ctx); + // *obuf++ = *iv; + // } +0: + movups (ibuf), %xmm1 // *ibuf + pxor %xmm1, iv // *iv ^= *ibuf + + // aes_encrypt(iv, iv, ctx); + + pxor %xmm2, iv + aesenc %xmm3, iv + aesenc %xmm4, iv + aesenc %xmm5, iv + aesenc %xmm6, iv + aesenc %xmm7, iv +#if defined __x86_64__ + aesenc %xmm8, iv + aesenc %xmm9, iv + aesenc %xmm10, iv + aesenc %xmm11, iv + aesenc %xmm12, iv + aesenc %xmm13, iv + aesenclast %xmm14, iv +#else + movups 96(ctx), %xmm1 + aesenc %xmm1, iv + movups 112(ctx), %xmm1 + aesenc %xmm1, iv + movups 128(ctx), %xmm1 + aesenc %xmm1, iv + movups 144(ctx), %xmm1 + aesenc %xmm1, iv + movups 160(ctx), %xmm1 + aesenc %xmm1, 
iv + movups 176(ctx), %xmm1 + aesenc %xmm1, iv + movups 192(ctx), %xmm1 + aesenclast %xmm1, iv +#endif + + movups iv, (obuf) // *obuf = *iv; + add $16, ibuf // ibuf++ + add $16, obuf // obuf++ + + sub $1, num_blk // num_blk -- + jg 0b // if num_blk > 0, repeat the loop + + jmp L_HW_cbc_done // share with the common exit code + + // + // aes-256 encrypt_cbc operation, after completion, branch to L_HW_cbc_done + // + +L_encrypt_256: + + cmp $1, num_blk // check number of blocks + jl L_HW_cbc_done // should it be less than 1, nothing to do + + movups (ctx), %xmm2 // key0 + movups 16(ctx), %xmm3 // key1 + movups 32(ctx), %xmm4 // key2 + movups 48(ctx), %xmm5 // key3 + movups 64(ctx), %xmm6 // key4 + movups 80(ctx), %xmm7 // key5 +#if defined __x86_64__ + movups 96(ctx), %xmm8 // key6 + movups 112(ctx), %xmm9 // key7 + movups 128(ctx), %xmm10 // key8 + movups 144(ctx), %xmm11 // key9 + movups 160(ctx), %xmm12 // keyA + movups 176(ctx), %xmm13 // keyB + movups 192(ctx), %xmm14 // keyC + movups 208(ctx), %xmm15 // keyD + // movups 224(ctx), %xmm1 // keyE +#endif + + // while (num_blk--) { + // *iv ^= *ibuf++; + // aes_encrypt(iv, iv, ctx); + // *obuf++ = *iv; + // } +0: + movups (ibuf), %xmm1 // *ibuf + pxor %xmm1, iv // *iv ^= *ibuf + + // aes_encrypt(iv, iv, ctx); + pxor %xmm2, iv + aesenc %xmm3, iv + aesenc %xmm4, iv + aesenc %xmm5, iv + aesenc %xmm6, iv + aesenc %xmm7, iv +#if defined __x86_64__ + movups 224(ctx), %xmm1 // keyE + aesenc %xmm8, iv + aesenc %xmm9, iv + aesenc %xmm10, iv + aesenc %xmm11, iv + aesenc %xmm12, iv + aesenc %xmm13, iv + aesenc %xmm14, iv + aesenc %xmm15, iv + aesenclast %xmm1, iv +#else + movups 96(ctx), %xmm1 // key6 + aesenc %xmm1, iv + movups 112(ctx), %xmm1 // key7 + aesenc %xmm1, iv + movups 128(ctx), %xmm1 // key8 + aesenc %xmm1, iv + movups 144(ctx), %xmm1 // key9 + aesenc %xmm1, iv + movups 160(ctx), %xmm1 // keyA + aesenc %xmm1, iv + movups 176(ctx), %xmm1 // keyB + aesenc %xmm1, iv + movups 192(ctx), %xmm1 // keyC + aesenc %xmm1, iv + 
movups 208(ctx), %xmm1 // keyD + aesenc %xmm1, iv + movups 224(ctx), %xmm1 // keyE + aesenclast %xmm1, iv +#endif + + movups iv, (obuf) // *obuf = *iv; + add $16, ibuf // ibuf++ + add $16, obuf // obuf++ + + sub $1, num_blk // num_blk -- + jg 0b // if num_blk > 0, repeat the loop + + jmp L_HW_cbc_done // share with the common exit code + + + + // + // --------- END of aes_encrypt_cbc_hw ------------------- + // + + +/* ---------------------------------------------------------------------------------------------------------------- + + aes_decrypt_cbc function (see aes_modes.c or aes_modes_asm.s) : + + For simplicity, I am assuming all variables are in 128-bit data type. + + aes_rval aes_decrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_decrypt_ctx *ctx) + { + while(num_blk--) { + aes_decrypt(ibuf, obuf, ctx); + *obuf++ ^= *iv; + *iv = *ibuf++; + } + return 0; + } + + The following is an implementation of this function using Intel AESNI. + This function _aes_decrypt_cbc_hw SHOULD NOT be called directly. + Developer should still call _aes_decrypt_cbc (in aes_modes_asm.s) which will poll cpu_capabilities and branch + to this aesni-based function should it detect that aesni is available. + Blindly calling this function will SURELY cause a CRASH on systems with no aesni support. + + Note that the decryption operation is not related over blocks. + This gives opportunity of arranging aes_decrypt operations in parallel to speed up code. + This is equivalent to what has been described in the Intel AES Instruction Set White Paper (Rev. 2.0 page 53-55) + The following assembly code exploits this idea to achieve ~ 1.4 speed up in aes_decrypt_cbc. 
+ + Example C code for packing 4 blocks in an iteration is shown as follows: + + while ((num_blk-=4)>=0) { + + // the following 4 functions can be interleaved to exploit parallelism + aes_decrypt(ibuf, obuf, ctx); + aes_decrypt(ibuf+1, obuf+1, ctx); + aes_decrypt(ibuf+2, obuf+2, ctx); + aes_decrypt(ibuf+3, obuf+3, ctx); + + obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2]; + *iv = ibuf[3]; ibuf += 4; obuf += 4; + } + num_blk+=4; + + ----------------------------------------------------------------------------------------------------------------*/ + + .text + .align 4,0x90 + .globl _aes_decrypt_cbc_hw +_aes_decrypt_cbc_hw: + + // push/save registers for local use +#if defined __i386__ + + push %ebp + movl %esp, %ebp + push %ebx // ibuf + push %edi // obuf + + #define sp %esp + +#else // __x86_64__ + + push %rbp + mov %rsp, %rbp + push %rbx + push %r13 + push %r14 + push %r15 + + #define sp %rsp + +#endif + + + // if kernel, allocate stack space to save xmm registers +#ifdef KERNEL +#if defined __i386__ + sub $(8*16), %esp +#else + sub $(16*16), %rsp +#endif + movaps %xmm0, (sp) + movaps %xmm1, 16(sp) + movaps %xmm2, 32(sp) + movaps %xmm3, 48(sp) + movaps %xmm4, 64(sp) + movaps %xmm5, 80(sp) + movaps %xmm6, 96(sp) + movaps %xmm7, 112(sp) +#if defined __x86_64__ + movaps %xmm8, 16*8(sp) + movaps %xmm9, 16*9(sp) + movaps %xmm10, 16*10(sp) + movaps %xmm11, 16*11(sp) + movaps %xmm12, 16*12(sp) + movaps %xmm13, 16*13(sp) + movaps %xmm14, 16*14(sp) + movaps %xmm15, 16*15(sp) +#endif // __x86_64__ +#endif + + #undef iv + #define iv %xmm0 + +#if defined __i386__ + mov 12(%ebp), %eax // in_iv + mov 24(%ebp), %edx // ctx + movups (%eax), iv // iv = in_iv + mov 8(%ebp), %ebx // ibuf + mov 16(%ebp), %ecx // num_blk + mov 20(%ebp), %edi // obuf + + #define ibuf %ebx + #define obuf %edi + #define num_blk %ecx + #define ctx %edx + +#else // __x86_64__, rdi/rsi/rdx/rcx/r8 + + mov %rdi, %rbx // ibuf + movups (%rsi), iv // iv = in_iv + mov %rdx, %r13 // 
num_blk + mov %rcx, %r14 // obuf + mov %r8, %r15 // ctx + + #define ibuf %rbx + #define num_blk %r13d + #define obuf %r14 + #define ctx %r15 + +#endif + + mov 240(ctx), %eax // aes length (160/192/224 selects aes-128/192/256 below) + cmp $160, %eax // aes-128 decrypt + je L_decrypt_128 + cmp $192, %eax // aes-192 decrypt + je L_decrypt_192 + cmp $224, %eax // aes-256 decrypt + je L_decrypt_256 + + mov $-1, %eax // wrong aes length, to return -1 + jmp L_error // early exit due to wrong aes length + + + // + // aes-128 decrypt_cbc operation, after completion, branch to L_HW_cbc_done + // + +L_decrypt_128: + + cmp $1, num_blk + jl L_HW_cbc_done // if num_blk < 1, early return + + // aes-128 decrypt expanded keys + movups 160(ctx), %xmm3 + movups 144(ctx), %xmm4 + movups 128(ctx), %xmm5 + movups 112(ctx), %xmm6 + movups 96(ctx), %xmm7 +#if defined __x86_64__ + movups 80(ctx), %xmm8 + movups 64(ctx), %xmm9 + movups 48(ctx), %xmm10 + movups 32(ctx), %xmm11 + movups 16(ctx), %xmm12 + movups 0(ctx), %xmm13 +#endif + + // performs 4 block decryption in an iteration to exploit decrypt in parallel + + // while ((num_blk-=4)>=0) { + // aes_decrypt(ibuf, obuf, ctx); + // aes_decrypt(ibuf+1, obuf+1, ctx); + // aes_decrypt(ibuf+2, obuf+2, ctx); + // aes_decrypt(ibuf+3, obuf+3, ctx); + // obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2]; + // *iv = ibuf[3]; ibuf += 4; obuf += 4; + // } + + sub $4, num_blk // pre decrement num_blk by 4 + jl 9f // if num_blk < 4, skip the per-4-blocks processing code + +0: + + +#if defined __x86_64__ + + movups (ibuf), %xmm1 // tmp = 1st ibuf + movups 16(ibuf), %xmm2 // tmp = 2nd ibuf + movups 32(ibuf), %xmm14 // tmp = 3rd ibuf + movups 48(ibuf), %xmm15 // tmp = 4th ibuf + + // for x86_64, the expanded keys are already stored in xmm3-xmm13 + + // aes-128 decrypt round 0 per 4 blocks + pxor %xmm3, %xmm1 + pxor %xmm3, %xmm2 + pxor %xmm3, %xmm14 + pxor %xmm3, %xmm15 + + // aes-128 decrypt round 1 per 4 blocks + aesdec %xmm4, %xmm1 + aesdec %xmm4, %xmm2 + aesdec %xmm4, 
%xmm14 + aesdec %xmm4, %xmm15 + + // aes-128 decrypt round 2 per 4 blocks + aesdec %xmm5, %xmm1 + aesdec %xmm5, %xmm2 + aesdec %xmm5, %xmm14 + aesdec %xmm5, %xmm15 + + // aes-128 decrypt round 3 per 4 blocks + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm14 + aesdec %xmm6, %xmm15 + + // aes-128 decrypt round 4 per 4 blocks + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm14 + aesdec %xmm7, %xmm15 + + // aes-128 decrypt round 5 per 4 blocks + aesdec %xmm8, %xmm1 + aesdec %xmm8, %xmm2 + aesdec %xmm8, %xmm14 + aesdec %xmm8, %xmm15 + + // aes-128 decrypt round 6 per 4 blocks + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm9, %xmm14 + aesdec %xmm9, %xmm15 + + // aes-128 decrypt round 7 per 4 blocks + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm14 + aesdec %xmm10, %xmm15 + + // aes-128 decrypt round 8 per 4 blocks + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm11, %xmm14 + aesdec %xmm11, %xmm15 + + // aes-128 decrypt round 9 per 4 blocks + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm14 + aesdec %xmm12, %xmm15 + + // aes-128 decrypt round 10 (last) per 4 blocks + aesdeclast %xmm13, %xmm1 + aesdeclast %xmm13, %xmm2 + aesdeclast %xmm13, %xmm14 + aesdeclast %xmm13, %xmm15 + + pxor iv, %xmm1 // obuf[0] ^= *iv; + movups (ibuf), iv // ibuf[0] + pxor iv, %xmm2 // obuf[1] ^= ibuf[0]; + movups 16(ibuf), iv // ibuf[1] + pxor iv, %xmm14 // obuf[2] ^= ibuf[1]; + movups 32(ibuf), iv // ibuf[2] + pxor iv, %xmm15 // obuf[3] ^= ibuf[2]; + movups 48(ibuf), iv // *iv = ibuf[3] + + movups %xmm1, (obuf) // write 1st obuf + movups %xmm2, 16(obuf) // write 2nd obuf + movups %xmm14, 32(obuf) // write 3rd obuf + movups %xmm15, 48(obuf) // write 4th obuf + + +#else + + // aes_decrypt_cbc per 4 blocks using aes-128 for i386 + // xmm1/xmm2/xmm4/xmm5 used for obuf per block + // xmm3 = key0 + // xmm0 = iv + // xmm6/xmm7 dynamically load with other expanded keys + + movups (ibuf), %xmm1 // tmp = 1st ibuf + 
movups 16(ibuf), %xmm2 // tmp = 2nd ibuf + movups 32(ibuf), %xmm4 // tmp = 3rd ibuf + movups 48(ibuf), %xmm5 // tmp = 4th ibuf + + // aes_decrypt + // for i386, sequentially load expanded keys into xmm6/xmm7 + + movups 144(ctx), %xmm6 // key1 + + // aes-128 decrypt round 0 per 4 blocks + pxor %xmm3, %xmm1 + pxor %xmm3, %xmm2 + pxor %xmm3, %xmm4 + pxor %xmm3, %xmm5 + + movups 128(ctx), %xmm7 // key2 + + // aes-128 decrypt round 1 per 4 blocks + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 112(ctx), %xmm6 // key3 + + // aes-128 decrypt round 2 per 4 blocks + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 96(ctx), %xmm7 // key4 + + // aes-128 decrypt round 3 per 4 blocks + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 80(ctx), %xmm6 // key5 + + // aes-128 decrypt round 4 per 4 blocks + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 64(ctx), %xmm7 // key6 + + // aes-128 decrypt round 5 per 4 blocks + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 48(ctx), %xmm6 // key7 + + // aes-128 decrypt round 6 per 4 blocks + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 32(ctx), %xmm7 // key8 + + // aes-128 decrypt round 7 per 4 blocks + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 16(ctx), %xmm6 // key9 + + // aes-128 decrypt round 8 per 4 blocks + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 0(ctx), %xmm7 // keyA + + // aes-128 decrypt round 9 per 4 blocks + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + // aes-128 decrypt round 10 (last) per 4 blocks + aesdeclast %xmm7, %xmm1 + aesdeclast %xmm7, %xmm2 + aesdeclast %xmm7, %xmm4 + 
aesdeclast %xmm7, %xmm5 + + pxor iv, %xmm1 // 1st obuf ^= iv; + movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm2 // 2nd obuf ^= iv; + movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm4 // 3rd obuf ^= iv; + movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm5 // 4th obuf ^= iv; + movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE); + + movups %xmm1, (obuf) // write 1st obuf + movups %xmm2, 16(obuf) // write 2nd obuf + movups %xmm4, 32(obuf) // write 3rd obuf + movups %xmm5, 48(obuf) // write 4th obuf +#endif + + add $64, ibuf // ibuf += 4; + add $64, obuf // obuf += 4; + + sub $4, num_blk // num_blk -= 4 + jge 0b // if num_blk >= 0 (at least 4 more blocks), repeat the loop + +9: add $4, num_blk // post increment num_blk by 4 + je L_HW_cbc_done // if num_blk == 0, no need for further processing code + +#if defined __i386__ + // updated as they might be needed as expanded keys in the remaining + movups 144(ctx), %xmm4 + movups 128(ctx), %xmm5 + movups 112(ctx), %xmm6 + movups 96(ctx), %xmm7 +#endif + + test $2, num_blk // check whether num_blk has 2 blocks + je 9f // if num_blk & 2 == 0, skip the per-pair processing code + + // do the remaining 2 blocks together + + movups (ibuf), %xmm1 // tmp = 1st ibuf + movups 16(ibuf), %xmm2 // tmp = 2nd ibuf + + // aes_decrypt + pxor %xmm3, %xmm1 + pxor %xmm3, %xmm2 + aesdec %xmm4, %xmm1 + aesdec %xmm4, %xmm2 + aesdec %xmm5, %xmm1 + aesdec %xmm5, %xmm2 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 +#if defined __x86_64__ + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm8, %xmm1 + aesdec %xmm8, %xmm2 + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdeclast %xmm13, %xmm1 + aesdeclast %xmm13, %xmm2 +#else + movups 80(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + movups 64(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, 
%xmm2 + movups 48(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + movups 32(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + movups 16(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + movups 0(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdeclast %xmm7, %xmm1 + aesdeclast %xmm7, %xmm2 + movups 112(ctx), %xmm6 + movups 96(ctx), %xmm7 +#endif + + pxor iv, %xmm1 // obuf[0] ^= *iv; + movups (ibuf), iv // ibuf[0] + pxor iv, %xmm2 // obuf[1] ^= ibuf[0] + movups 16(ibuf), iv // *iv = ibuf[1] + + movups %xmm1, (obuf) // write obuf[0] + movups %xmm2, 16(obuf) // write obuf[1] + + add $32, ibuf // ibuf += 2 + add $32, obuf // obuf += 2 + +9: + test $1, num_blk // check whether num_blk has residual 1 block + je L_HW_cbc_done // if num_blk == 0, no need for residual processing code + + movups (ibuf), %xmm2 // tmp = ibuf + // aes_decrypt + pxor %xmm3, %xmm2 + aesdec %xmm4, %xmm2 + aesdec %xmm5, %xmm2 + aesdec %xmm6, %xmm2 + aesdec %xmm7, %xmm2 +#if defined __x86_64__ + aesdec %xmm8, %xmm2 + aesdec %xmm9, %xmm2 + aesdec %xmm10, %xmm2 + aesdec %xmm11, %xmm2 + aesdec %xmm12, %xmm2 + aesdeclast %xmm13, %xmm2 +#else + movups 80(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 64(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 48(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 32(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 16(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups (ctx), %xmm1 + aesdeclast %xmm1, %xmm2 +#endif + + pxor iv, %xmm2 // *obuf ^= *iv; + movups (ibuf), iv // *iv = *ibuf; + movups %xmm2, (obuf) // write *obuf + + jmp L_HW_cbc_done + + // + // aes-192 decrypt_cbc operation, after completion, branch to L_HW_cbc_done + // + +L_decrypt_192: + + cmp $1, num_blk + jl L_HW_cbc_done // if num_blk < 1, early return + + // aes-192 decrypt expanded keys + movups 192(ctx), %xmm3 + movups 176(ctx), %xmm4 + movups 160(ctx), %xmm5 + movups 144(ctx), %xmm6 + movups 128(ctx), %xmm7 +#if defined __x86_64__ + movups 112(ctx), %xmm8 + movups 96(ctx), %xmm9 + movups 
80(ctx), %xmm10 + movups 64(ctx), %xmm11 + movups 48(ctx), %xmm12 + movups 32(ctx), %xmm13 + movups 16(ctx), %xmm14 + movups (ctx), %xmm15 +#endif + + // performs 4 block decryption in an iteration to exploit decrypt in parallel + + // while ((num_blk-=4)>=0) { + // aes_decrypt(ibuf, obuf, ctx); + // aes_decrypt(ibuf+1, obuf+1, ctx); + // aes_decrypt(ibuf+2, obuf+2, ctx); + // aes_decrypt(ibuf+3, obuf+3, ctx); + // obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2]; + // *iv = ibuf[3]; ibuf += 4; obuf += 4; + // } + + sub $4, num_blk // pre decrement num_blk by 4 + jl 9f // if num_blk < 4, skip the per-4-blocks processing code +0: + +#if defined __x86_64__ + + movups (ibuf), %xmm1 // tmp = 1st ibuf + movups 16(ibuf), %xmm2 // tmp = 2nd ibuf + movups 32(ibuf), %xmm14 // tmp = 3rd ibuf + movups 48(ibuf), %xmm15 // tmp = 4th ibuf + + // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13 + // use %xmm12/%xmm13 as dynamic keys in the middle, restored afterwards + + // round 0 for 4 blocks + pxor %xmm3, %xmm1 + pxor %xmm3, %xmm2 + pxor %xmm3, %xmm14 + pxor %xmm3, %xmm15 + + // round 1 for 4 blocks + aesdec %xmm4, %xmm1 + aesdec %xmm4, %xmm2 + aesdec %xmm4, %xmm14 + aesdec %xmm4, %xmm15 + + // round 2 for 4 blocks + aesdec %xmm5, %xmm1 + aesdec %xmm5, %xmm2 + aesdec %xmm5, %xmm14 + aesdec %xmm5, %xmm15 + + // round 3 for 4 blocks + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm14 + aesdec %xmm6, %xmm15 + + // round 4 for 4 blocks + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm14 + aesdec %xmm7, %xmm15 + + // round 5 for 4 blocks + aesdec %xmm8, %xmm1 + aesdec %xmm8, %xmm2 + aesdec %xmm8, %xmm14 + aesdec %xmm8, %xmm15 + + // round 6 for 4 blocks + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm9, %xmm14 + aesdec %xmm9, %xmm15 + + // round 7 for 4 blocks + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm14 + aesdec %xmm10, %xmm15 + + // round 8 for 4 blocks + aesdec 
%xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm11, %xmm14 + aesdec %xmm11, %xmm15 + + // round 9 for 4 blocks + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm14 + aesdec %xmm12, %xmm15 + + movups 16(ctx), %xmm12 + + // round A for 4 blocks + aesdec %xmm13, %xmm1 + aesdec %xmm13, %xmm2 + aesdec %xmm13, %xmm14 + aesdec %xmm13, %xmm15 + + movups (ctx), %xmm13 + + // round B for 4 blocks + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm14 + aesdec %xmm12, %xmm15 + + movups 48(ctx), %xmm12 // restore %xmm12 to its original key + + // round C (last) for 4 blocks + aesdeclast %xmm13, %xmm1 + aesdeclast %xmm13, %xmm2 + aesdeclast %xmm13, %xmm14 + aesdeclast %xmm13, %xmm15 + + movups 32(ctx), %xmm13 // restore %xmm13 to its original key + + pxor iv, %xmm1 // obuf[0] ^= *iv; + movups (ibuf), iv // ibuf[0] + pxor iv, %xmm2 // obuf[1] ^= ibuf[0] + movups 16(ibuf), iv // ibuf[1] + pxor iv, %xmm14 // obuf[2] ^= ibuf[1] + movups 32(ibuf), iv // ibuf[2] + pxor iv, %xmm15 // obuf[3] ^= ibuf[2] + movups 48(ibuf), iv // *iv = ibuf[3] + + movups %xmm1, (obuf) // write 1st obuf + movups %xmm2, 16(obuf) // write 2nd obuf + movups %xmm14, 32(obuf) // write 3rd obuf + movups %xmm15, 48(obuf) // write 4th obuf + + add $64, ibuf // ibuf += 4; + add $64, obuf // obuf += 4; + + sub $4, num_blk // num_blk -= 4 + jge 0b // if num_blk >= 0 (at least 4 more blocks), repeat the loop + +9: add $4, num_blk // post increment num_blk by 4 + je L_HW_cbc_done // if num_blk == 0, prepare to return + + movups 16(ctx), %xmm14 // restore %xmm14 to its key + movups (ctx), %xmm15 // restore %xmm15 to its key + +#else + + movups (ibuf), %xmm1 // tmp = 1st ibuf + movups 16(ibuf), %xmm2 // tmp = 2nd ibuf + movups 32(ibuf), %xmm4 // tmp = 3rd ibuf + movups 48(ibuf), %xmm5 // tmp = 4th ibuf + + // aes_decrypt + // for i386, sequentially load expanded keys into xmm6/xmm7 + movups 176(ctx), %xmm6 + pxor %xmm3, %xmm1 + pxor %xmm3, %xmm2 + pxor %xmm3, %xmm4 + pxor %xmm3, %xmm5 + + movups 160(ctx), %xmm7 + 
aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 144(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 128(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 112(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 96(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 80(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 64(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 48(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 32(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 16(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 0(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + aesdeclast %xmm7, %xmm1 + aesdeclast %xmm7, %xmm2 + aesdeclast %xmm7, %xmm4 + aesdeclast %xmm7, %xmm5 + + pxor iv, %xmm1 // 1st obuf ^= iv; + movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm2 // 2nd obuf ^= iv; + movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm4 // 3rd obuf ^= iv; + movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm5 // 4th obuf ^= iv; + movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE); + movups %xmm1, (obuf) // write 1st obuf + movups %xmm2, 16(obuf) // write 2nd obuf + movups %xmm4, 32(obuf) // write 3rd obuf + movups %xmm5, 48(obuf) // write 4th obuf + + add $64, ibuf // ibuf += AES_BLOCK_SIZE * 4; + add $64, obuf // obuf += AES_BLOCK_SIZE * 4; + + sub 
$4, num_blk // num_blk -= 4 + jge 0b // if num_blk >= 0 (at least 4 more blocks), repeat the loop + + +9: add $4, num_blk // post increment num_blk by 4 + je L_HW_cbc_done // if num_blk == 0, no need for further processing code + + movups 176(ctx), %xmm4 + movups 160(ctx), %xmm5 + movups 144(ctx), %xmm6 + movups 128(ctx), %xmm7 + +#endif + + // per-block aes_decrypt_cbc loop + +0: + movups (ibuf), %xmm2 // tmp = ibuf + + // aes_decrypt + pxor %xmm3, %xmm2 + aesdec %xmm4, %xmm2 + aesdec %xmm5, %xmm2 + aesdec %xmm6, %xmm2 + aesdec %xmm7, %xmm2 +#if defined __x86_64__ + aesdec %xmm8, %xmm2 + aesdec %xmm9, %xmm2 + aesdec %xmm10, %xmm2 + aesdec %xmm11, %xmm2 + aesdec %xmm12, %xmm2 + aesdec %xmm13, %xmm2 + aesdec %xmm14, %xmm2 + aesdeclast %xmm15, %xmm2 +#else + movups 112(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 96(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 80(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 64(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 48(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 32(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 16(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups (ctx), %xmm1 + aesdeclast %xmm1, %xmm2 +#endif + + pxor iv, %xmm2 // obuf ^= iv; + movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); + + movups %xmm2, (obuf) // write obuf + + add $16, ibuf // ibuf += AES_BLOCK_SIZE; + add $16, obuf // obuf += AES_BLOCK_SIZE; + sub $1, num_blk // num_blk -- + jg 0b // if num_blk > 0, repeat the loop + + jmp L_HW_cbc_done + + // + // aes-256 decrypt_cbc operation, after completion, branch to L_HW_cbc_done + // + +L_decrypt_256: + + cmp $1, num_blk + jl L_HW_cbc_done + + movups 224(ctx), %xmm3 + movups 208(ctx), %xmm4 + movups 192(ctx), %xmm5 + movups 176(ctx), %xmm6 + movups 160(ctx), %xmm7 +#if defined __x86_64__ + movups 144(ctx), %xmm8 + movups 128(ctx), %xmm9 + movups 112(ctx), %xmm10 + movups 96(ctx), %xmm11 + movups 80(ctx), %xmm12 + movups 64(ctx), %xmm13 + movups 48(ctx), %xmm14 + movups 32(ctx), %xmm15 +// movups 16(ctx), %xmm14 +// movups (ctx), %xmm15 +#endif + +#if 
defined __x86_64__ + + sub $4, num_blk // pre decrement num_blk by 4 + jl 9f // if num_blk < 4, skip the per-4-blocks processing code +0: + movups (ibuf), %xmm1 // tmp = 1st ibuf + movups 16(ibuf), %xmm2 // tmp = 2nd ibuf + movups 32(ibuf), %xmm14 // tmp = 3rd ibuf + movups 48(ibuf), %xmm15 // tmp = 4th ibuf + + // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13 + pxor %xmm3, %xmm1 + pxor %xmm3, %xmm2 + pxor %xmm3, %xmm14 + pxor %xmm3, %xmm15 + + aesdec %xmm4, %xmm1 + aesdec %xmm4, %xmm2 + aesdec %xmm4, %xmm14 + aesdec %xmm4, %xmm15 + + aesdec %xmm5, %xmm1 + aesdec %xmm5, %xmm2 + aesdec %xmm5, %xmm14 + aesdec %xmm5, %xmm15 + + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm14 + aesdec %xmm6, %xmm15 + + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm14 + aesdec %xmm7, %xmm15 + + aesdec %xmm8, %xmm1 + aesdec %xmm8, %xmm2 + aesdec %xmm8, %xmm14 + aesdec %xmm8, %xmm15 + + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm9, %xmm14 + aesdec %xmm9, %xmm15 + + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm14 + aesdec %xmm10, %xmm15 + + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm11, %xmm14 + aesdec %xmm11, %xmm15 + + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm14 + aesdec %xmm12, %xmm15 + movups 48(ctx), %xmm12 + + aesdec %xmm13, %xmm1 + aesdec %xmm13, %xmm2 + aesdec %xmm13, %xmm14 + aesdec %xmm13, %xmm15 + movups 32(ctx), %xmm13 + + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm14 + aesdec %xmm12, %xmm15 + movups 16(ctx), %xmm12 + + aesdec %xmm13, %xmm1 + aesdec %xmm13, %xmm2 + aesdec %xmm13, %xmm14 + aesdec %xmm13, %xmm15 + movups (ctx), %xmm13 + + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm14 + aesdec %xmm12, %xmm15 + movups 80(ctx), %xmm12 + + aesdeclast %xmm13, %xmm1 + aesdeclast %xmm13, %xmm2 + aesdeclast %xmm13, %xmm14 + aesdeclast %xmm13, %xmm15 + movups 64(ctx), %xmm13 + + pxor iv, %xmm1 // obuf ^= iv; + 
movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm2 // obuf ^= iv; + movups 16(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm14 // obuf ^= iv; + movups 32(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm15 // obuf ^= iv; + movups 48(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); + + movups %xmm1, (obuf) // write 1st obuf + movups %xmm2, 16(obuf) // write 2nd obuf + movups %xmm14, 32(obuf) // write 3rd obuf + movups %xmm15, 48(obuf) // write 4th obuf + + add $64, ibuf // ibuf += AES_BLOCK_SIZE*4; + add $64, obuf // obuf += AES_BLOCK_SIZE*4; + + sub $4, num_blk // num_blk -= 4 + jge 0b // if num_blk >= 0 (at least 4 more blocks), repeat the loop + +9: add $4, num_blk // post increment num_blk by 4 + je L_HW_cbc_done // if num_blk == 0, no need for further processing code + + movups 48(ctx), %xmm14 + movups 32(ctx), %xmm15 + +#else + + sub $4, num_blk // pre decrement num_blk by 4 + jl 9f // if num_blk < 4, skip the per-4-blocks processing code +0: + movups (ibuf), %xmm1 // tmp = 1st ibuf + movups 16(ibuf), %xmm2 // tmp = 2nd ibuf + movups 32(ibuf), %xmm4 // tmp = 3rd ibuf + movups 48(ibuf), %xmm5 // tmp = 4th ibuf + + // aes_decrypt + // for i386, sequentially load expanded keys into xmm6/xmm7 + movups 208(ctx), %xmm6 + pxor %xmm3, %xmm1 + pxor %xmm3, %xmm2 + pxor %xmm3, %xmm4 + pxor %xmm3, %xmm5 + + movups 192(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 176(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 160(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 144(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 128(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 112(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 
+ aesdec %xmm7, %xmm5 + + movups 96(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 80(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 64(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 48(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 32(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + movups 16(ctx), %xmm6 + aesdec %xmm7, %xmm1 + aesdec %xmm7, %xmm2 + aesdec %xmm7, %xmm4 + aesdec %xmm7, %xmm5 + + movups 0(ctx), %xmm7 + aesdec %xmm6, %xmm1 + aesdec %xmm6, %xmm2 + aesdec %xmm6, %xmm4 + aesdec %xmm6, %xmm5 + + aesdeclast %xmm7, %xmm1 + aesdeclast %xmm7, %xmm2 + aesdeclast %xmm7, %xmm4 + aesdeclast %xmm7, %xmm5 + + pxor iv, %xmm1 // 1st obuf ^= iv; + movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm2 // 2nd obuf ^= iv; + movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm4 // 3rd obuf ^= iv; + movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE); + pxor iv, %xmm5 // 4th obuf ^= iv; + movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE); + movups %xmm1, (obuf) // write 1st obuf + movups %xmm2, 16(obuf) // write 2nd obuf + movups %xmm4, 32(obuf) // write 3rd obuf + movups %xmm5, 48(obuf) // write 4th obuf + + add $64, ibuf // ibuf += AES_BLOCK_SIZE * 4; + add $64, obuf // obuf += AES_BLOCK_SIZE * 4; + + sub $4, num_blk // num_blk -= 4 + jge 0b // if num_blk >= 0 (at least 4 more blocks), repeat the loop + + +9: add $4, num_blk // post increment num_blk by 4 + je L_HW_cbc_done // if num_blk == 0, no need for further processing code + + movups 208(ctx), %xmm4 + movups 192(ctx), %xmm5 + movups 176(ctx), %xmm6 + movups 160(ctx), %xmm7 + +#endif + +0: + movups (ibuf), %xmm2 // tmp = ibuf + + // aes_decrypt + pxor %xmm3, %xmm2 + aesdec 
%xmm4, %xmm2 + aesdec %xmm5, %xmm2 + aesdec %xmm6, %xmm2 + aesdec %xmm7, %xmm2 +#if defined __x86_64__ + aesdec %xmm8, %xmm2 + aesdec %xmm9, %xmm2 + aesdec %xmm10, %xmm2 + aesdec %xmm11, %xmm2 + aesdec %xmm12, %xmm2 + aesdec %xmm13, %xmm2 + aesdec %xmm14, %xmm2 + aesdec %xmm15, %xmm2 +#else + movups 144(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 128(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 112(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 96(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 80(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 64(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 48(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups 32(ctx), %xmm1 + aesdec %xmm1, %xmm2 +#endif + movups 16(ctx), %xmm1 + aesdec %xmm1, %xmm2 + movups (ctx), %xmm1 + aesdeclast %xmm1, %xmm2 + + pxor iv, %xmm2 // obuf ^= iv; + movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); + + movups %xmm2, (obuf) // write obuf + + add $16, ibuf // ibuf += AES_BLOCK_SIZE; + add $16, obuf // obuf += AES_BLOCK_SIZE; + sub $1, num_blk // num_blk -- + jg 0b // if num_blk > 0, repeat the loop + + jmp L_HW_cbc_done + + // + // --------- END of aes_decrypt_cbc_hw ------------------- + // diff --git a/bsd/crypto/aes/i386/aes_x86_v2.s b/bsd/crypto/aes/i386/aes_x86_v2.s deleted file mode 100644 index 7ed98adb8..000000000 --- a/bsd/crypto/aes/i386/aes_x86_v2.s +++ /dev/null @@ -1,1298 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - * --------------------------------------------------------------------------- - * Copyright (c) 2002, Dr Brian Gladman, Worcester, UK. All rights reserved. - * - * LICENSE TERMS - * - * The free distribution and use of this software in both source and binary - * form is allowed (with or without changes) provided that: - * - * 1. distributions of this source code include the above copyright - * notice, this list of conditions and the following disclaimer; - * - * 2. distributions in binary form include the above copyright - * notice, this list of conditions and the following disclaimer - * in the documentation and/or other associated materials; - * - * 3. the copyright holder's name is not used to endorse products - * built using this software without specific written permission. 
- * - * ALTERNATIVELY, provided that this notice is retained in full, this product - * may be distributed under the terms of the GNU General Public License (GPL), - * in which case the provisions of the GPL apply INSTEAD OF those given above. - * - * DISCLAIMER - * - * This software is provided 'as is' with no explicit or implied warranties - * in respect of its properties, including, but not limited to, correctness - * and/or fitness for purpose. - * --------------------------------------------------------------------------- - * Issue 31/01/2006 - * - * This code requires either ASM_X86_V2 or ASM_X86_V2C to be set in aesopt.h - * and the same define to be set here as well. If AES_V2C is set this file - * requires the C files aeskey.c and aestab.c for support. - * - * This is a full assembler implementation covering encryption, decryption and - * key scheduling. It uses 2k bytes of tables but its encryption and decryption - * performance is very close to that obtained using large tables. Key schedule - * expansion is slower for both encryption and decryption but this is likely to - * be offset by the much smaller load that this version places on the processor - * cache. I acknowledge the contribution made by Daniel Bernstein to aspects of - * the design of the AES round function used here. - * - * This code provides the standard AES block size (128 bits, 16 bytes) and the - * three standard AES key sizes (128, 192 and 256 bits). It has the same call - * interface as my C implementation. The ebx, esi, edi and ebp registers are - * preserved across calls but eax, ecx and edx and the artihmetic status flags - * are not. 
- */ - -#include - -#define AES_128 /* define if AES with 128 bit keys is needed */ -#define AES_192 /* define if AES with 192 bit keys is needed */ -#define AES_256 /* define if AES with 256 bit keys is needed */ -#define AES_VAR /* define if a variable key size is needed */ -#define ENCRYPTION /* define if encryption is needed */ -#define DECRYPTION /* define if decryption is needed */ -#define AES_REV_DKS /* define if key decryption schedule is reversed */ - -#ifndef ASM_X86_V2C -#define ENCRYPTION_KEY_SCHEDULE /* define if enc. key expansion is needed */ -#define DECRYPTION_KEY_SCHEDULE /* define if dec. key expansion is needed */ -#endif - -/* - * The encryption key schedule has the following in memory layout where N is the - * number of rounds (10, 12 or 14): - * - * lo: | input key (round 0) | ; each round is four 32-bit words - * | encryption round 1 | - * | encryption round 2 | - * .... - * | encryption round N-1 | - * hi: | encryption round N | - * - * The decryption key schedule is normally set up so that it has the same - * layout as above by actually reversing the order of the encryption key - * schedule in memory (this happens when AES_REV_DKS is set): - * - * lo: | decryption round 0 | = | encryption round N | - * | decryption round 1 | = INV_MIX_COL[ | encryption round N-1 | ] - * | decryption round 2 | = INV_MIX_COL[ | encryption round N-2 | ] - * .... .... 
- * | decryption round N-1 | = INV_MIX_COL[ | encryption round 1 | ] - * hi: | decryption round N | = | input key (round 0) | - * - * with rounds except the first and last modified using inv_mix_column() - * But if AES_REV_DKS is NOT set the order of keys is left as it is for - * encryption so that it has to be accessed in reverse when used for - * decryption (although the inverse mix column modifications are done) - * - * lo: | decryption round 0 | = | input key (round 0) | - * | decryption round 1 | = INV_MIX_COL[ | encryption round 1 | ] - * | decryption round 2 | = INV_MIX_COL[ | encryption round 2 | ] - * .... .... - * | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ] - * hi: | decryption round N | = | encryption round N | - * - * This layout is faster when the assembler key scheduling provided here - * is used. - */ - -/* End of user defines */ - -#ifdef AES_VAR -#ifndef AES_128 -#define AES_128 -#endif -#ifndef AES_192 -#define AES_192 -#endif -#ifndef AES_256 -#define AES_256 -#endif -#endif - -#ifdef AES_VAR -#define KS_LENGTH 60 -#else -#ifdef AES_256 -#define KS_LENGTH 60 -#else -#ifdef AES_192 -#define KS_LENGTH 52 -#else -#define KS_LENGTH 44 -#endif -#endif -#endif - -/* - * These macros implement stack based local variables - */ -#define save(r1) \ - movl %r1, (%esp); - -#define restore(r1) \ - movl (%esp), %r1; - -#define do_call(f, n) \ - call EXT(f); \ - addl $(n), %esp; - -/* - * finite field multiplies by {02}, {04} and {08} - */ -#define f2(x) ((x<<1)^(((x>>7)&1)*0x11b)) -#define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b)) -#define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b)) - -/* - * finite field multiplies required in table generation - */ -#define f3(x) (f2(x) ^ x) -#define f9(x) (f8(x) ^ x) -#define fb(x) (f8(x) ^ f2(x) ^ x) -#define fd(x) (f8(x) ^ f4(x) ^ x) -#define fe(x) (f8(x) ^ f4(x) ^ f2(x)) - -#define etab_0(x) enc_tab+4(,x,8) -#define etab_1(x) enc_tab+3(,x,8) -#define etab_2(x) 
enc_tab+2(,x,8) -#define etab_3(x) enc_tab+1(,x,8) - -#define etab_b(x) etab_3(x) - -#define btab_0(x) enc_tab+6(,x,8) -#define btab_1(x) enc_tab+5(,x,8) -#define btab_2(x) enc_tab+4(,x,8) -#define btab_3(x) enc_tab+3(,x,8) - -/* - * ROUND FUNCTION. Build column[2] on ESI and column[3] on EDI that have the - * round keys pre-loaded. Build column[0] in EBP and column[1] in EBX. - * - * Input: - * - * EAX column[0] - * EBX column[1] - * ECX column[2] - * EDX column[3] - * ESI column key[round][2] - * EDI column key[round][3] - * EBP scratch - * - * Output: - * - * EBP column[0] unkeyed - * EBX column[1] unkeyed - * ESI column[2] keyed - * EDI column[3] keyed - * EAX scratch - * ECX scratch - * EDX scratch - */ -#define rnd_fun(m1, m2) \ - roll $16, %ebx; \ - \ - ## m1 ## _zo(esi, cl, 0, ebp); \ - m1(esi, dh, 1, ebp); \ - m1(esi, bh, 3, ebp); \ - ## m1 ## _zo(edi, dl, 0, ebp); \ - m1(edi, ah, 1, ebp); \ - m1(edi, bl, 2, ebp); \ - ## m2 ## _zo(ebp, al, 0, ebp); \ - \ - shrl $16, %ebx; \ - andl $0xffff0000, %eax; \ - orl %ebx, %eax; \ - shrl $16, %edx; \ - \ - m1(ebp, ah, 1, ebx); \ - m1(ebp, dh, 3, ebx); \ - m2(ebx, dl, 2, ebx); \ - m1(ebx, ch, 1, edx); \ - ## m1 ## _zo(ebx, al, 0, edx); \ - \ - shrl $16, %eax; \ - shrl $16, %ecx; \ - \ - m1(ebp, cl, 2, edx); \ - m1(edi, ch, 3, edx); \ - m1(esi, al, 2, edx); \ - m1(ebx, ah, 3, edx) - -/* - * Basic MOV and XOR Operations for normal rounds - */ -#define nr_xor_zo nr_xor -#define nr_xor(r1, r2, r3, r4) \ - movzbl %r2, %r4; \ - xorl etab_ ## r3(%r4), %r1; - -#define nr_mov_zo nr_mov -#define nr_mov(r1, r2, r3, r4) \ - movzbl %r2, %r4; \ - movl etab_ ## r3(%r4), %r1; - -/* - * Basic MOV and XOR Operations for last round - */ - -#if 1 - -#define lr_xor_zo(r1, r2, r3, r4) \ - movzbl %r2, %r4; \ - movzbl etab_b(%r4), %r4; \ - xor %r4, %r1; - -#define lr_xor(r1, r2, r3, r4) \ - movzbl %r2, %r4; \ - movzbl etab_b(%r4), %r4; \ - shll $(8*r3), %r4; \ - xor %r4, %r1; - -#define lr_mov_zo(r1, r2, r3, r4) \ - movzbl %r2, %r4; \ - 
movzbl etab_b(%r4), %r1; - -#define lr_mov(r1, r2, r3, r4) \ - movzbl %r2, %r4; \ - movzbl etab_b(%r4), %r1; \ - shll $(8*r3), %r1; - -#else /* less effective but worth leaving as an option */ - -#define lr_xor_zo lr_xor -#define lr_xor(r1, r2, r3, r4) \ - movzbl %r2, %r4; \ - mov btab_ ## r3(%r4), %r4; \ - andl $(0x000000ff << 8 * r3), %r4; \ - xor %r4, %r1; - -#define lr_mov_zo lr_mov -#define lr_mov(r1, r2, r3, r4) \ - movzbl %r2, %r4; \ - mov btab_ ## r3(%r4), %r1; \ - andl $(0x000000ff << 8 * r3), %r1; - -#endif - -/* - * Apply S-Box to the 4 bytes in a 32-bit word and rotate left 3 byte positions - * - * r1 : output is xored into this register - * r2 : input: a => eax, b => ebx, c => ecx, d => edx - * r3 : scratch register - */ - -#define l3s_col(r1, r2, r3) \ - lr_xor_zo(r1, ## r2 ## h, 0, r3); \ - lr_xor(r1, ## r2 ## l, 3, r3); \ - shrl $16, %e ## r2 ## x; \ - lr_xor(r1, ## r2 ## h, 2, r3); \ - lr_xor(r1, ## r2 ## l, 1, r3); - -/* - * offsets to parameters - */ -#define in_blk 4 /* input byte array address parameter */ -#define out_blk 8 /* output byte array address parameter */ -#define ctx 12 /* AES context structure */ -#define stk_spc 20 /* stack space */ - -#ifdef ENCRYPTION - -#define ENCRYPTION_TABLE - -#define enc_round \ - addl $16, %ebp; \ - save(ebp); \ - movl 8(%ebp), %esi; \ - movl 12(%ebp), %edi; \ - \ - rnd_fun(nr_xor, nr_mov); \ - \ - movl %ebp, %eax; \ - movl %esi, %ecx; \ - movl %edi, %edx; \ - restore(ebp); \ - xorl (%ebp), %eax; \ - xorl 4(%ebp), %ebx; - -#define enc_last_round \ - addl $16, %ebp; \ - save(ebp); \ - movl 8(%ebp), %esi; \ - movl 12(%ebp), %edi; \ - \ - rnd_fun(lr_xor, lr_mov); \ - \ - movl %ebp, %eax; \ - restore(ebp); \ - xorl (%ebp), %eax; \ - xorl 4(%ebp), %ebx; - - .section __TEXT, __text - -/* - * AES Encryption Subroutine - */ -Entry(aes_encrypt) - - subl $stk_spc, %esp - movl %ebp, 16(%esp) - movl %ebx, 12(%esp) - movl %esi, 8(%esp) - movl %edi, 4(%esp) - - movl in_blk+stk_spc(%esp), %esi /* input pointer */ - movl 
(%esi), %eax - movl 4(%esi), %ebx - movl 8(%esi), %ecx - movl 12(%esi), %edx - - movl ctx+stk_spc(%esp), %ebp /* key pointer */ - movzbl 4*KS_LENGTH(%ebp), %edi - xorl (%ebp), %eax - xorl 4(%ebp), %ebx - xorl 8(%ebp), %ecx - xorl 12(%ebp), %edx - - /* - * determine the number of rounds - */ - cmpl $10*16, %edi - je aes_encrypt.3 - cmpl $12*16, %edi - je aes_encrypt.2 - cmpl $14*16, %edi - je aes_encrypt.1 - movl $-1, %eax - jmp aes_encrypt.5 - -aes_encrypt.1: - enc_round - enc_round -aes_encrypt.2: - enc_round - enc_round -aes_encrypt.3: - enc_round - enc_round - enc_round - enc_round - enc_round - enc_round - enc_round - enc_round - enc_round - enc_last_round - - movl out_blk+stk_spc(%esp), %edx - movl %eax, (%edx) - movl %ebx, 4(%edx) - movl %esi, 8(%edx) - movl %edi, 12(%edx) - xorl %eax, %eax - -aes_encrypt.5: - movl 16(%esp), %ebp - movl 12(%esp), %ebx - movl 8(%esp), %esi - movl 4(%esp), %edi - addl $stk_spc, %esp - ret - -#endif - -/* - * For r2 == 16, or r2 == 24 && r1 == 7, or r2 ==32 && r1 == 6 - */ -#define f_key(r1, r2, rc_val) \ - l3s_col(esi, a, ebx); \ - xorl $rc_val, %esi; \ - \ - movl %esi, r1*r2(%ebp); \ - xorl %esi, %edi; \ - movl %edi, r1*r2+4(%ebp); \ - xorl %edi, %ecx; \ - movl %ecx, r1*r2+8(%ebp); \ - xorl %ecx, %edx; \ - movl %edx, r1*r2+12(%ebp); \ - movl %edx, %eax; - -/* - * For r2 == 24 && r1 == 0 to 6 - */ -#define f_key_24(r1, r2, rc_val) \ - f_key(r1, r2, rc_val); \ - \ - xorl r1*r2+16-r2(%ebp), %eax; \ - movl %eax, r1*r2+16(%ebp); \ - xorl r1*r2+20-r2(%ebp), %eax; \ - movl %eax, r1*r2+20(%ebp); - -/* - * For r2 ==32 && r1 == 0 to 5 - */ -#define f_key_32(r1, r2, rc_val) \ - f_key(r1, r2, rc_val); \ - \ - roll $8, %eax; \ - pushl %edx; \ - movl r1*r2+16-r2(%ebp), %edx; \ - l3s_col(edx, a, ebx); \ - movl %edx, %eax; \ - popl %edx; \ - movl %eax, r1*r2+16(%ebp); \ - xorl r1*r2+20-r2(%ebp), %eax; \ - movl %eax, r1*r2+20(%ebp); \ - xorl r1*r2+24-r2(%ebp), %eax; \ - movl %eax, r1*r2+24(%ebp); \ - xorl r1*r2+28-r2(%ebp), %eax; \ - movl 
%eax, r1*r2+28(%ebp); - -#ifdef ENCRYPTION_KEY_SCHEDULE - -#ifdef AES_128 - -#ifndef ENCRYPTION_TABLE -#define ENCRYPTION_TABLE -#endif - -Entry(aes_encrypt_key128) - - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - - movl 24(%esp), %ebp - movl $10*16, 4*KS_LENGTH(%ebp) - movl 20(%esp), %ebx - - movl (%ebx), %esi - movl %esi, (%ebp) - movl 4(%ebx), %edi - movl %edi, 4(%ebp) - movl 8(%ebx), %ecx - movl %ecx, 8(%ebp) - movl 12(%ebx), %edx - movl %edx, 12(%ebp) - addl $16, %ebp - movl %edx, %eax - - f_key(0, 16, 1) - f_key(1, 16, 2) - f_key(2, 16, 4) - f_key(3, 16, 8) - f_key(4, 16, 16) - f_key(5, 16, 32) - f_key(6, 16, 64) - f_key(7, 16, 128) - f_key(8, 16, 27) - f_key(9, 16, 54) - - popl %edi - popl %esi - popl %ebx - popl %ebp - xorl %eax, %eax - ret - -#endif - -#ifdef AES_192 - -#ifndef ENCRYPTION_TABLE -#define ENCRYPTION_TABLE -#endif - -Entry(aes_encrypt_key192) - - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - - movl 24(%esp), %ebp - movl $12*16, 4*KS_LENGTH(%ebp) - movl 20(%esp), %ebx - - movl (%ebx), %esi - movl %esi, (%ebp) - movl 4(%ebx), %edi - movl %edi, 4(%ebp) - movl 8(%ebx), %ecx - movl %ecx, 8(%ebp) - movl 12(%ebx), %edx - movl %edx, 12(%ebp) - movl 16(%ebx), %eax - movl %eax, 16(%ebp) - movl 20(%ebx), %eax - movl %eax, 20(%ebp) - addl $24, %ebp - - f_key_24(0, 24, 1) - f_key_24(1, 24, 2) - f_key_24(2, 24, 4) - f_key_24(3, 24, 8) - f_key_24(4, 24, 16) - f_key_24(5, 24, 32) - f_key_24(6, 24, 64) - f_key(7, 24, 128) - - popl %edi - popl %esi - popl %ebx - popl %ebp - xorl %eax, %eax - ret - -#endif - -#ifdef AES_256 - -#ifndef ENCRYPTION_TABLE -#define ENCRYPTION_TABLE -#endif - -Entry(aes_encrypt_key256) - - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - - movl 24(%esp), %ebp - movl $14*16, 4*KS_LENGTH(%ebp) - movl 20(%esp), %ebx - - movl (%ebx), %esi - movl %esi, (%ebp) - movl 4(%ebx), %edi - movl %edi, 4(%ebp) - movl 8(%ebx), %ecx - movl %ecx, 8(%ebp) - movl 12(%ebx), %edx - movl %edx, 12(%ebp) - movl 16(%ebx), %eax - movl %eax, 
16(%ebp) - movl 20(%ebx), %eax - movl %eax, 20(%ebp) - movl 24(%ebx), %eax - movl %eax, 24(%ebp) - movl 28(%ebx), %eax - movl %eax, 28(%ebp) - addl $32, %ebp - - f_key_32(0, 32, 1) - f_key_32(1, 32, 2) - f_key_32(2, 32, 4) - f_key_32(3, 32, 8) - f_key_32(4, 32, 16) - f_key_32(5, 32, 32) - f_key(6, 32, 64) - - popl %edi - popl %esi - popl %ebx - popl %ebp - xorl %eax, %eax - ret - -#endif - -#ifdef AES_VAR - -#ifndef ENCRYPTION_TABLE -#define ENCRYPTION_TABLE -#endif - -Entry(aes_encrypt_key) - - movl 4(%esp), %ecx - movl 8(%esp), %eax - movl 12(%esp), %edx - pushl %edx - pushl %ecx - - cmpl $16, %eax - je aes_encrypt_key.1 - cmpl $128, %eax - je aes_encrypt_key.1 - - cmpl $24, %eax - je aes_encrypt_key.2 - cmpl $192, %eax - je aes_encrypt_key.2 - - cmpl $32, %eax - je aes_encrypt_key.3 - cmpl $256, %eax - je aes_encrypt_key.3 - movl $-1, %eax - addl $8, %esp - ret - -aes_encrypt_key.1: - do_call(aes_encrypt_key128, 8) - ret -aes_encrypt_key.2: - do_call(aes_encrypt_key192, 8) - ret -aes_encrypt_key.3: - do_call(aes_encrypt_key256, 8) - ret - -#endif - -#endif - -#ifdef ENCRYPTION_TABLE - -# S-box data - 256 entries - - .section __DATA, __data - .align ALIGN - -#define u8(x) 0, x, x, f3(x), f2(x), x, x, f3(x) - -enc_tab: - .byte u8(0x63),u8(0x7c),u8(0x77),u8(0x7b),u8(0xf2),u8(0x6b),u8(0x6f),u8(0xc5) - .byte u8(0x30),u8(0x01),u8(0x67),u8(0x2b),u8(0xfe),u8(0xd7),u8(0xab),u8(0x76) - .byte u8(0xca),u8(0x82),u8(0xc9),u8(0x7d),u8(0xfa),u8(0x59),u8(0x47),u8(0xf0) - .byte u8(0xad),u8(0xd4),u8(0xa2),u8(0xaf),u8(0x9c),u8(0xa4),u8(0x72),u8(0xc0) - .byte u8(0xb7),u8(0xfd),u8(0x93),u8(0x26),u8(0x36),u8(0x3f),u8(0xf7),u8(0xcc) - .byte u8(0x34),u8(0xa5),u8(0xe5),u8(0xf1),u8(0x71),u8(0xd8),u8(0x31),u8(0x15) - .byte u8(0x04),u8(0xc7),u8(0x23),u8(0xc3),u8(0x18),u8(0x96),u8(0x05),u8(0x9a) - .byte u8(0x07),u8(0x12),u8(0x80),u8(0xe2),u8(0xeb),u8(0x27),u8(0xb2),u8(0x75) - .byte u8(0x09),u8(0x83),u8(0x2c),u8(0x1a),u8(0x1b),u8(0x6e),u8(0x5a),u8(0xa0) - .byte 
u8(0x52),u8(0x3b),u8(0xd6),u8(0xb3),u8(0x29),u8(0xe3),u8(0x2f),u8(0x84) - .byte u8(0x53),u8(0xd1),u8(0x00),u8(0xed),u8(0x20),u8(0xfc),u8(0xb1),u8(0x5b) - .byte u8(0x6a),u8(0xcb),u8(0xbe),u8(0x39),u8(0x4a),u8(0x4c),u8(0x58),u8(0xcf) - .byte u8(0xd0),u8(0xef),u8(0xaa),u8(0xfb),u8(0x43),u8(0x4d),u8(0x33),u8(0x85) - .byte u8(0x45),u8(0xf9),u8(0x02),u8(0x7f),u8(0x50),u8(0x3c),u8(0x9f),u8(0xa8) - .byte u8(0x51),u8(0xa3),u8(0x40),u8(0x8f),u8(0x92),u8(0x9d),u8(0x38),u8(0xf5) - .byte u8(0xbc),u8(0xb6),u8(0xda),u8(0x21),u8(0x10),u8(0xff),u8(0xf3),u8(0xd2) - .byte u8(0xcd),u8(0x0c),u8(0x13),u8(0xec),u8(0x5f),u8(0x97),u8(0x44),u8(0x17) - .byte u8(0xc4),u8(0xa7),u8(0x7e),u8(0x3d),u8(0x64),u8(0x5d),u8(0x19),u8(0x73) - .byte u8(0x60),u8(0x81),u8(0x4f),u8(0xdc),u8(0x22),u8(0x2a),u8(0x90),u8(0x88) - .byte u8(0x46),u8(0xee),u8(0xb8),u8(0x14),u8(0xde),u8(0x5e),u8(0x0b),u8(0xdb) - .byte u8(0xe0),u8(0x32),u8(0x3a),u8(0x0a),u8(0x49),u8(0x06),u8(0x24),u8(0x5c) - .byte u8(0xc2),u8(0xd3),u8(0xac),u8(0x62),u8(0x91),u8(0x95),u8(0xe4),u8(0x79) - .byte u8(0xe7),u8(0xc8),u8(0x37),u8(0x6d),u8(0x8d),u8(0xd5),u8(0x4e),u8(0xa9) - .byte u8(0x6c),u8(0x56),u8(0xf4),u8(0xea),u8(0x65),u8(0x7a),u8(0xae),u8(0x08) - .byte u8(0xba),u8(0x78),u8(0x25),u8(0x2e),u8(0x1c),u8(0xa6),u8(0xb4),u8(0xc6) - .byte u8(0xe8),u8(0xdd),u8(0x74),u8(0x1f),u8(0x4b),u8(0xbd),u8(0x8b),u8(0x8a) - .byte u8(0x70),u8(0x3e),u8(0xb5),u8(0x66),u8(0x48),u8(0x03),u8(0xf6),u8(0x0e) - .byte u8(0x61),u8(0x35),u8(0x57),u8(0xb9),u8(0x86),u8(0xc1),u8(0x1d),u8(0x9e) - .byte u8(0xe1),u8(0xf8),u8(0x98),u8(0x11),u8(0x69),u8(0xd9),u8(0x8e),u8(0x94) - .byte u8(0x9b),u8(0x1e),u8(0x87),u8(0xe9),u8(0xce),u8(0x55),u8(0x28),u8(0xdf) - .byte u8(0x8c),u8(0xa1),u8(0x89),u8(0x0d),u8(0xbf),u8(0xe6),u8(0x42),u8(0x68) - .byte u8(0x41),u8(0x99),u8(0x2d),u8(0x0f),u8(0xb0),u8(0x54),u8(0xbb),u8(0x16) - -#endif - -#ifdef DECRYPTION - -#define DECRYPTION_TABLE - -#define dtab_0(x) dec_tab(,x,8) -#define dtab_1(x) dec_tab+3(,x,8) -#define dtab_2(x) dec_tab+2(,x,8) 
-#define dtab_3(x) dec_tab+1(,x,8) -#define dtab_x(x) dec_tab+7(,x,8) - -#define irn_fun(m1, m2) \ - roll $16, %eax; \ - \ - ## m1 ## _zo(esi, cl, 0, ebp); \ - m1(esi, bh, 1, ebp); \ - m1(esi, al, 2, ebp); \ - ## m1 ## _zo(edi, dl, 0, ebp); \ - m1(edi, ch, 1, ebp); \ - m1(edi, ah, 3, ebp); \ - ## m2 ## _zo(ebp, bl, 0, ebp); \ - \ - shrl $16, %eax; \ - andl $0xffff0000, %ebx; \ - orl %eax, %ebx; \ - shrl $16, %ecx; \ - \ - m1(ebp, bh, 1, eax); \ - m1(ebp, ch, 3, eax); \ - m2(eax, cl, 2, ecx); \ - ## m1 ## _zo(eax, bl, 0, ecx); \ - m1(eax, dh, 1, ecx); \ - \ - shrl $16, %ebx; \ - shrl $16, %edx; \ - \ - m1(esi, dh, 3, ecx); \ - m1(ebp, dl, 2, ecx); \ - m1(eax, bh, 3, ecx); \ - m1(edi, bl, 2, ecx); - -/* - * Basic MOV and XOR Operations for normal rounds - */ -#define ni_xor_zo ni_xor -#define ni_xor(r1, r2, r3, r4) \ - movzbl %r2, %r4; \ - xorl dtab_ ## r3 ## (%r4), %r1; - -#define ni_mov_zo ni_mov -#define ni_mov(r1, r2, r3, r4) \ - movzbl %r2, %r4; \ - movl dtab_ ## r3 ## (%r4), %r1; - -/* - * Basic MOV and XOR Operations for last round - */ - -#define li_xor_zo(r1, r2, r3, r4) \ - movzbl %r2, %r4; \ - movzbl dtab_x(%r4), %r4; \ - xor %r4, %r1; - -#define li_xor(r1, r2, r3, r4) \ - movzbl %r2, %r4; \ - movzbl dtab_x(%r4), %r4; \ - shll $(8*r3), %r4; \ - xor %r4, %r1; - -#define li_mov_zo(r1, r2, r3, r4) \ - movzbl %r2, %r4; \ - movzbl dtab_x(%r4), %r1; - -#define li_mov(r1, r2, r3, r4) \ - movzbl %r2, %r4; \ - movzbl dtab_x(%r4), %r1; \ - shl $(8*r3), %r1; - -#ifdef AES_REV_DKS - -#define dec_round \ - addl $16, %ebp; \ - save(ebp); \ - movl 8(%ebp), %esi; \ - movl 12(%ebp), %edi; \ - \ - irn_fun(ni_xor, ni_mov); \ - \ - movl %ebp, %ebx; \ - movl %esi, %ecx; \ - movl %edi, %edx; \ - restore(ebp); \ - xorl (%ebp), %eax; \ - xorl 4(%ebp), %ebx; - -#define dec_last_round \ - addl $16, %ebp; \ - save(ebp); \ - movl 8(%ebp), %esi; \ - movl 12(%ebp), %edi; \ - \ - irn_fun(li_xor, li_mov); \ - \ - movl %ebp, %ebx; \ - restore(ebp); \ - xorl (%ebp), %eax; \ - xorl 4(%ebp), 
%ebx; - -#else - -#define dec_round \ - subl $16, %ebp; \ - save(ebp); \ - movl 8(%ebp), %esi; \ - movl 12(%ebp), %edi; \ - \ - irn_fun(ni_xor, ni_mov); \ - \ - movl %ebp, %ebx; \ - movl %esi, %ecx; \ - movl %edi, %edx; \ - restore(ebp); \ - xorl (%ebp), %eax; \ - xorl 4(%ebp), %ebx; - -#define dec_last_round \ - subl $16, %ebp; \ - save(ebp); \ - movl 8(%ebp), %esi; \ - movl 12(%ebp), %edi; \ - \ - irn_fun(li_xor, li_mov); \ - \ - movl %ebp, %ebx; \ - restore(ebp); \ - xorl (%ebp), %eax; \ - xorl 4(%ebp), %ebx; - -#endif /* AES_REV_DKS */ - - .section __TEXT, __text - -/* - * AES Decryption Subroutine - */ -Entry(aes_decrypt) - - subl $stk_spc, %esp - movl %ebp, 16(%esp) - movl %ebx, 12(%esp) - movl %esi, 8(%esp) - movl %edi, 4(%esp) - - /* - * input four columns and xor in first round key - */ - movl in_blk+stk_spc(%esp), %esi /* input pointer */ - movl (%esi), %eax - movl 4(%esi), %ebx - movl 8(%esi), %ecx - movl 12(%esi), %edx - leal 16(%esi), %esi - - movl ctx+stk_spc(%esp), %ebp /* key pointer */ - movzbl 4*KS_LENGTH(%ebp), %edi -#ifndef AES_REV_DKS /* if decryption key schedule is not reversed */ - leal (%ebp,%edi), %ebp /* we have to access it from the top down */ -#endif - xorl (%ebp), %eax /* key schedule */ - xorl 4(%ebp), %ebx - xorl 8(%ebp), %ecx - xorl 12(%ebp), %edx - - /* - * determine the number of rounds - */ - cmpl $10*16, %edi - je aes_decrypt.3 - cmpl $12*16, %edi - je aes_decrypt.2 - cmpl $14*16, %edi - je aes_decrypt.1 - movl $-1, %eax - jmp aes_decrypt.5 - -aes_decrypt.1: - dec_round - dec_round -aes_decrypt.2: - dec_round - dec_round -aes_decrypt.3: - dec_round - dec_round - dec_round - dec_round - dec_round - dec_round - dec_round - dec_round - dec_round - dec_last_round - - /* - * move final values to the output array. 
- */ - movl out_blk+stk_spc(%esp), %ebp - movl %eax, (%ebp) - movl %ebx, 4(%ebp) - movl %esi, 8(%ebp) - movl %edi, 12(%ebp) - xorl %eax, %eax - -aes_decrypt.5: - movl 16(%esp), %ebp - movl 12(%esp), %ebx - movl 8(%esp), %esi - movl 4(%esp), %edi - addl $stk_spc, %esp - ret - -#endif - -#define inv_mix_col \ - movzbl %dl, %ebx; \ - movzbl etab_b(%ebx), %ebx; \ - movl dtab_0(%ebx), %eax; \ - movzbl %dh, %ebx; \ - shrl $16, %edx; \ - movzbl etab_b(%ebx), %ebx; \ - xorl dtab_1(%ebx), %eax; \ - movzbl %dl, %ebx; \ - movzbl etab_b(%ebx), %ebx; \ - xorl dtab_2(%ebx), %eax; \ - movzbl %dh, %ebx; \ - movzbl etab_b(%ebx), %ebx; \ - xorl dtab_3(%ebx), %eax; - -#ifdef DECRYPTION_KEY_SCHEDULE - -#ifdef AES_128 - -#ifndef DECRYPTION_TABLE -#define DECRYPTION_TABLE -#endif - -Entry(aes_decrypt_key128) - - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 24(%esp), %eax /* context */ - movl 20(%esp), %edx /* key */ - pushl %eax - pushl %edx - do_call(aes_encrypt_key128, 8) - movl $10*16, %eax - movl 24(%esp), %esi /* pointer to first round key */ - leal (%esi,%eax), %edi /* pointer to last round key */ - addl $32, %esi - /* the inverse mix column transformation */ - movl -16(%esi), %edx /* needs to be applied to all round keys */ - inv_mix_col - movl %eax, -16(%esi) /* transforming the four sub-keys in the */ - movl -12(%esi), %edx /* second round key */ - inv_mix_col - movl %eax, -12(%esi) /* transformations for subsequent rounds */ - movl -8(%esi), %edx /* can then be made more efficient by */ - inv_mix_col - movl %eax, -8(%esi) /* in the encryption round key ek[r]: */ - movl -4(%esi), %edx - inv_mix_col - movl %eax, -4(%esi) /* where n is 1..3. 
Hence the corresponding */ - -aes_decrypt_key128.0: - movl (%esi), %edx /* subkeys in the decryption round key dk[r] */ - inv_mix_col - movl %eax, (%esi) /* GF(256): */ - xorl -12(%esi), %eax - movl %eax, 4(%esi) /* dk[r][n] = dk[r][n-1] ^ dk[r-1][n] */ - xorl -8(%esi), %eax - movl %eax, 8(%esi) /* So we only need one inverse mix column */ - xorl -4(%esi), %eax /* operation (n = 0) for each four word cycle */ - movl %eax, 12(%esi) /* in the expanded key. */ - addl $16, %esi - cmpl %esi, %edi - jg aes_decrypt_key128.0 - jmp dec_end - -#endif - -#ifdef AES_192 - -#ifndef DECRYPTION_TABLE -#define DECRYPTION_TABLE -#endif - -Entry(aes_decrypt_key192) - - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 24(%esp), %eax /* context */ - movl 20(%esp), %edx /* key */ - pushl %eax - pushl %edx - do_call(aes_encrypt_key192, 8) - movl $12*16, %eax - movl 24(%esp), %esi /* first round key */ - leal (%esi,%eax), %edi /* last round key */ - addl $48, %esi /* the first 6 words are the key, of */ - /* which the top 2 words are part of */ - movl -32(%esi), %edx /* the second round key and hence */ - inv_mix_col - movl %eax, -32(%esi) /* need to do a further six values prior */ - movl -28(%esi), %edx /* to using a more efficient technique */ - inv_mix_col - movl %eax, -28(%esi) - /* dk[r][n] = dk[r][n-1] ^ dk[r-1][n] */ - movl -24(%esi), %edx - inv_mix_col - movl %eax, -24(%esi) /* cycle is now 6 words long */ - movl -20(%esi), %edx - inv_mix_col - movl %eax, -20(%esi) - movl -16(%esi), %edx - inv_mix_col - movl %eax, -16(%esi) - movl -12(%esi), %edx - inv_mix_col - movl %eax, -12(%esi) - movl -8(%esi), %edx - inv_mix_col - movl %eax, -8(%esi) - movl -4(%esi), %edx - inv_mix_col - movl %eax, -4(%esi) - -aes_decrypt_key192.0: - movl (%esi), %edx /* expanded key is 13 * 4 = 44 32-bit words */ - inv_mix_col - movl %eax, (%esi) /* using inv_mix_col. 
We have already done 8 */ - xorl -20(%esi), %eax /* of these so 36 are left - hence we need */ - movl %eax, 4(%esi) /* exactly 6 loops of six here */ - xorl -16(%esi), %eax - movl %eax, 8(%esi) - xorl -12(%esi), %eax - movl %eax, 12(%esi) - xorl -8(%esi), %eax - movl %eax, 16(%esi) - xorl -4(%esi), %eax - movl %eax, 20(%esi) - addl $24, %esi - cmpl %esi, %edi - jg aes_decrypt_key192.0 - jmp dec_end - -#endif - -#ifdef AES_256 - -#ifndef DECRYPTION_TABLE -#define DECRYPTION_TABLE -#endif - -Entry(aes_decrypt_key256) - - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 24(%esp), %eax - movl 20(%esp), %edx - pushl %eax - pushl %edx - do_call(aes_encrypt_key256, 8) - movl $14*16, %eax - movl 24(%esp), %esi - leal (%esi,%eax), %edi - addl $64, %esi - - movl -48(%esi), %edx /* the primary key is 8 words, of which */ - inv_mix_col - movl %eax, -48(%esi) - movl -44(%esi), %edx - inv_mix_col - movl %eax, -44(%esi) - movl -40(%esi), %edx - inv_mix_col - movl %eax, -40(%esi) - movl -36(%esi), %edx - inv_mix_col - movl %eax, -36(%esi) - - movl -32(%esi), %edx /* the encryption key expansion cycle is */ - inv_mix_col - movl %eax, -32(%esi) /* start by doing one complete block */ - movl -28(%esi), %edx - inv_mix_col - movl %eax, -28(%esi) - movl -24(%esi), %edx - inv_mix_col - movl %eax, -24(%esi) - movl -20(%esi), %edx - inv_mix_col - movl %eax, -20(%esi) - movl -16(%esi), %edx - inv_mix_col - movl %eax, -16(%esi) - movl -12(%esi), %edx - inv_mix_col - movl %eax, -12(%esi) - movl -8(%esi), %edx - inv_mix_col - movl %eax, -8(%esi) - movl -4(%esi), %edx - inv_mix_col - movl %eax, -4(%esi) - -aes_decrypt_key256.0: - movl (%esi), %edx /* we can now speed up the remaining */ - inv_mix_col - movl %eax, (%esi) /* outlined earlier. But note that */ - xorl -28(%esi), %eax /* there is one extra inverse mix */ - movl %eax, 4(%esi) /* column operation as the 256 bit */ - xorl -24(%esi), %eax /* key has an extra non-linear step */ - movl %eax, 8(%esi) /* for the midway element. 
*/ - xorl -20(%esi), %eax - movl %eax, 12(%esi) /* the expanded key is 15 * 4 = 60 */ - movl 16(%esi), %edx /* 32-bit words of which 52 need to */ - inv_mix_col - movl %eax, 16(%esi) /* 12 so 40 are left - which means */ - xorl -12(%esi), %eax /* that we need exactly 5 loops of 8 */ - movl %eax, 20(%esi) - xorl -8(%esi), %eax - movl %eax, 24(%esi) - xorl -4(%esi), %eax - movl %eax, 28(%esi) - addl $32, %esi - cmpl %esi, %edi - jg aes_decrypt_key256.0 - -#endif - -dec_end: - -#ifdef AES_REV_DKS - - movl 24(%esp), %esi /* this reverses the order of the */ -dec_end.1: - movl (%esi), %eax /* round keys if required */ - movl 4(%esi), %ebx - movl (%edi), %ebp - movl 4(%edi), %edx - movl %ebp, (%esi) - movl %edx, 4(%esi) - movl %eax, (%edi) - movl %ebx, 4(%edi) - - movl 8(%esi), %eax - movl 12(%esi), %ebx - movl 8(%edi), %ebp - movl 12(%edi), %edx - movl %ebp, 8(%esi) - movl %edx, 12(%esi) - movl %eax, 8(%edi) - movl %ebx, 12(%edi) - - addl $16, %esi - subl $16, %edi - cmpl %esi, %edi - jg dec_end.1 - -#endif - - popl %edi - popl %esi - popl %ebx - popl %ebp - xorl %eax, %eax - ret - -#ifdef AES_VAR - -Entry(aes_decrypt_key) - - movl 4(%esp), %ecx - movl 8(%esp), %eax - movl 12(%esp), %edx - pushl %edx - pushl %ecx - - cmpl $16, %eax - je aes_decrypt_key.1 - cmpl $128, %eax - je aes_decrypt_key.1 - - cmpl $24, %eax - je aes_decrypt_key.2 - cmpl $192, %eax - je aes_decrypt_key.2 - - cmpl $32, %eax - je aes_decrypt_key.3 - cmpl $256, %eax - je aes_decrypt_key.3 - movl $-1, %eax - addl $8, %esp - ret - -aes_decrypt_key.1: - do_call(aes_decrypt_key128, 8) - ret -aes_decrypt_key.2: - do_call(aes_decrypt_key192, 8) - ret -aes_decrypt_key.3: - do_call(aes_decrypt_key256, 8) - ret - -#endif - -#endif - -#ifdef DECRYPTION_TABLE - -/* - * Inverse S-box data - 256 entries - */ - - .section __DATA, __data - .align ALIGN - -#define v8(x) fe(x), f9(x), fd(x), fb(x), fe(x), f9(x), fd(x), x - -dec_tab: - .byte v8(0x52),v8(0x09),v8(0x6a),v8(0xd5),v8(0x30),v8(0x36),v8(0xa5),v8(0x38) - 
.byte v8(0xbf),v8(0x40),v8(0xa3),v8(0x9e),v8(0x81),v8(0xf3),v8(0xd7),v8(0xfb) - .byte v8(0x7c),v8(0xe3),v8(0x39),v8(0x82),v8(0x9b),v8(0x2f),v8(0xff),v8(0x87) - .byte v8(0x34),v8(0x8e),v8(0x43),v8(0x44),v8(0xc4),v8(0xde),v8(0xe9),v8(0xcb) - .byte v8(0x54),v8(0x7b),v8(0x94),v8(0x32),v8(0xa6),v8(0xc2),v8(0x23),v8(0x3d) - .byte v8(0xee),v8(0x4c),v8(0x95),v8(0x0b),v8(0x42),v8(0xfa),v8(0xc3),v8(0x4e) - .byte v8(0x08),v8(0x2e),v8(0xa1),v8(0x66),v8(0x28),v8(0xd9),v8(0x24),v8(0xb2) - .byte v8(0x76),v8(0x5b),v8(0xa2),v8(0x49),v8(0x6d),v8(0x8b),v8(0xd1),v8(0x25) - .byte v8(0x72),v8(0xf8),v8(0xf6),v8(0x64),v8(0x86),v8(0x68),v8(0x98),v8(0x16) - .byte v8(0xd4),v8(0xa4),v8(0x5c),v8(0xcc),v8(0x5d),v8(0x65),v8(0xb6),v8(0x92) - .byte v8(0x6c),v8(0x70),v8(0x48),v8(0x50),v8(0xfd),v8(0xed),v8(0xb9),v8(0xda) - .byte v8(0x5e),v8(0x15),v8(0x46),v8(0x57),v8(0xa7),v8(0x8d),v8(0x9d),v8(0x84) - .byte v8(0x90),v8(0xd8),v8(0xab),v8(0x00),v8(0x8c),v8(0xbc),v8(0xd3),v8(0x0a) - .byte v8(0xf7),v8(0xe4),v8(0x58),v8(0x05),v8(0xb8),v8(0xb3),v8(0x45),v8(0x06) - .byte v8(0xd0),v8(0x2c),v8(0x1e),v8(0x8f),v8(0xca),v8(0x3f),v8(0x0f),v8(0x02) - .byte v8(0xc1),v8(0xaf),v8(0xbd),v8(0x03),v8(0x01),v8(0x13),v8(0x8a),v8(0x6b) - .byte v8(0x3a),v8(0x91),v8(0x11),v8(0x41),v8(0x4f),v8(0x67),v8(0xdc),v8(0xea) - .byte v8(0x97),v8(0xf2),v8(0xcf),v8(0xce),v8(0xf0),v8(0xb4),v8(0xe6),v8(0x73) - .byte v8(0x96),v8(0xac),v8(0x74),v8(0x22),v8(0xe7),v8(0xad),v8(0x35),v8(0x85) - .byte v8(0xe2),v8(0xf9),v8(0x37),v8(0xe8),v8(0x1c),v8(0x75),v8(0xdf),v8(0x6e) - .byte v8(0x47),v8(0xf1),v8(0x1a),v8(0x71),v8(0x1d),v8(0x29),v8(0xc5),v8(0x89) - .byte v8(0x6f),v8(0xb7),v8(0x62),v8(0x0e),v8(0xaa),v8(0x18),v8(0xbe),v8(0x1b) - .byte v8(0xfc),v8(0x56),v8(0x3e),v8(0x4b),v8(0xc6),v8(0xd2),v8(0x79),v8(0x20) - .byte v8(0x9a),v8(0xdb),v8(0xc0),v8(0xfe),v8(0x78),v8(0xcd),v8(0x5a),v8(0xf4) - .byte v8(0x1f),v8(0xdd),v8(0xa8),v8(0x33),v8(0x88),v8(0x07),v8(0xc7),v8(0x31) - .byte v8(0xb1),v8(0x12),v8(0x10),v8(0x59),v8(0x27),v8(0x80),v8(0xec),v8(0x5f) - 
.byte v8(0x60),v8(0x51),v8(0x7f),v8(0xa9),v8(0x19),v8(0xb5),v8(0x4a),v8(0x0d) - .byte v8(0x2d),v8(0xe5),v8(0x7a),v8(0x9f),v8(0x93),v8(0xc9),v8(0x9c),v8(0xef) - .byte v8(0xa0),v8(0xe0),v8(0x3b),v8(0x4d),v8(0xae),v8(0x2a),v8(0xf5),v8(0xb0) - .byte v8(0xc8),v8(0xeb),v8(0xbb),v8(0x3c),v8(0x83),v8(0x53),v8(0x99),v8(0x61) - .byte v8(0x17),v8(0x2b),v8(0x04),v8(0x7e),v8(0xba),v8(0x77),v8(0xd6),v8(0x26) - .byte v8(0xe1),v8(0x69),v8(0x14),v8(0x63),v8(0x55),v8(0x21),v8(0x0c),v8(0x7d) - -#endif diff --git a/bsd/crypto/aes/i386/aesopt.h b/bsd/crypto/aes/i386/aesopt.h deleted file mode 100644 index 025eb5fcf..000000000 --- a/bsd/crypto/aes/i386/aesopt.h +++ /dev/null @@ -1,719 +0,0 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The free distribution and use of this software in both source and binary - form is allowed (with or without changes) provided that: - - 1. distributions of this source code include the above copyright - notice, this list of conditions and the following disclaimer; - - 2. distributions in binary form include the above copyright - notice, this list of conditions and the following disclaimer - in the documentation and/or other associated materials; - - 3. the copyright holder's name is not used to endorse products - built using this software without specific written permission. - - ALTERNATIVELY, provided that this notice is retained in full, this product - may be distributed under the terms of the GNU General Public License (GPL), - in which case the provisions of the GPL apply INSTEAD OF those given above. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. 
- --------------------------------------------------------------------------- - Issue 31/01/2006 - - This file contains the compilation options for AES (Rijndael) and code - that is common across encryption, key scheduling and table generation. - - OPERATION - - These source code files implement the AES algorithm Rijndael designed by - Joan Daemen and Vincent Rijmen. This version is designed for the standard - block size of 16 bytes and for key sizes of 128, 192 and 256 bits (16, 24 - and 32 bytes). - - This version is designed for flexibility and speed using operations on - 32-bit words rather than operations on bytes. It can be compiled with - either big or little endian internal byte order but is faster when the - native byte order for the processor is used. - - THE CIPHER INTERFACE - - The cipher interface is implemented as an array of bytes in which lower - AES bit sequence indexes map to higher numeric significance within bytes. - - uint_8t (an unsigned 8-bit type) - uint_32t (an unsigned 32-bit type) - struct aes_encrypt_ctx (structure for the cipher encryption context) - struct aes_decrypt_ctx (structure for the cipher decryption context) - aes_rval the function return type - - C subroutine calls: - - aes_rval aes_encrypt_key128(const unsigned char *key, aes_encrypt_ctx cx[1]); - aes_rval aes_encrypt_key192(const unsigned char *key, aes_encrypt_ctx cx[1]); - aes_rval aes_encrypt_key256(const unsigned char *key, aes_encrypt_ctx cx[1]); - aes_rval aes_encrypt(const unsigned char *in, unsigned char *out, - const aes_encrypt_ctx cx[1]); - - aes_rval aes_decrypt_key128(const unsigned char *key, aes_decrypt_ctx cx[1]); - aes_rval aes_decrypt_key192(const unsigned char *key, aes_decrypt_ctx cx[1]); - aes_rval aes_decrypt_key256(const unsigned char *key, aes_decrypt_ctx cx[1]); - aes_rval aes_decrypt(const unsigned char *in, unsigned char *out, - const aes_decrypt_ctx cx[1]); - - IMPORTANT NOTE: If you are using this C interface with dynamic tables make sure that - 
you call gen_tabs() before AES is used so that the tables are initialised. - - C++ aes class subroutines: - - Class AESencrypt for encryption - - Construtors: - AESencrypt(void) - AESencrypt(const unsigned char *key) - 128 bit key - Members: - aes_rval key128(const unsigned char *key) - aes_rval key192(const unsigned char *key) - aes_rval key256(const unsigned char *key) - aes_rval encrypt(const unsigned char *in, unsigned char *out) const - - Class AESdecrypt for encryption - Construtors: - AESdecrypt(void) - AESdecrypt(const unsigned char *key) - 128 bit key - Members: - aes_rval key128(const unsigned char *key) - aes_rval key192(const unsigned char *key) - aes_rval key256(const unsigned char *key) - aes_rval decrypt(const unsigned char *in, unsigned char *out) const -*/ - -#if !defined( _AESOPT_H ) -#define _AESOPT_H - -#if defined( __cplusplus ) -#include "aescpp.h" -#else -#include "crypto/aes.h" -#endif - -/* PLATFORM SPECIFIC INCLUDES */ - -#include "edefs.h" - -/* CONFIGURATION - THE USE OF DEFINES - - Later in this section there are a number of defines that control the - operation of the code. In each section, the purpose of each define is - explained so that the relevant form can be included or excluded by - setting either 1's or 0's respectively on the branches of the related - #if clauses. The following local defines should not be changed. -*/ - -#define ENCRYPTION_IN_C 1 -#define DECRYPTION_IN_C 2 -#define ENC_KEYING_IN_C 4 -#define DEC_KEYING_IN_C 8 - -#define NO_TABLES 0 -#define ONE_TABLE 1 -#define FOUR_TABLES 4 -#define NONE 0 -#define PARTIAL 1 -#define FULL 2 - -/* --- START OF USER CONFIGURED OPTIONS --- */ - -/* 1. BYTE ORDER WITHIN 32 BIT WORDS - - The fundamental data processing units in Rijndael are 8-bit bytes. The - input, output and key input are all enumerated arrays of bytes in which - bytes are numbered starting at zero and increasing to one less than the - number of bytes in the array in question. 
This enumeration is only used - for naming bytes and does not imply any adjacency or order relationship - from one byte to another. When these inputs and outputs are considered - as bit sequences, bits 8*n to 8*n+7 of the bit sequence are mapped to - byte[n] with bit 8n+i in the sequence mapped to bit 7-i within the byte. - In this implementation bits are numbered from 0 to 7 starting at the - numerically least significant end of each byte (bit n represents 2^n). - - However, Rijndael can be implemented more efficiently using 32-bit - words by packing bytes into words so that bytes 4*n to 4*n+3 are placed - into word[n]. While in principle these bytes can be assembled into words - in any positions, this implementation only supports the two formats in - which bytes in adjacent positions within words also have adjacent byte - numbers. This order is called big-endian if the lowest numbered bytes - in words have the highest numeric significance and little-endian if the - opposite applies. - - This code can work in either order irrespective of the order used by the - machine on which it runs. Normally the internal byte order will be set - to the order of the processor on which the code is to be run but this - define can be used to reverse this in special situations - - WARNING: Assembler code versions rely on PLATFORM_BYTE_ORDER being set. - This define will hence be redefined later (in section 4) if necessary -*/ - -#if 1 -#define ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER -#elif 0 -#define ALGORITHM_BYTE_ORDER IS_LITTLE_ENDIAN -#elif 0 -#define ALGORITHM_BYTE_ORDER IS_BIG_ENDIAN -#else -#error The algorithm byte order is not defined -#endif - -/* 2. VIA ACE SUPPORT - - Define this option if support for the VIA ACE is required. This uses - inline assembler instructions and is only implemented for the Microsoft, - Intel and GCC compilers. If VIA ACE is known to be present, then defining - ASSUME_VIA_ACE_PRESENT will remove the ordinary encryption/decryption - code. 
If USE_VIA_ACE_IF_PRESENT is defined then VIA ACE will be used if - it is detected (both present and enabled) but the normal AES code will - also be present. - - When VIA ACE is to be used, all AES encryption contexts MUST be 16 byte - aligned; other input/output buffers do not need to be 16 byte aligned - but there are very large performance gains if this can be arranged. - VIA ACE also requires the decryption key schedule to be in reverse - order (which the following defines ensure). -*/ - -#if 0 && !defined( _WIN64 ) && !defined( USE_VIA_ACE_IF_PRESENT ) -#define USE_VIA_ACE_IF_PRESENT -#endif - -#if 0 && !defined( _WIN64 ) && !defined( ASSUME_VIA_ACE_PRESENT ) -#define ASSUME_VIA_ACE_PRESENT -#endif - -/* 3. ASSEMBLER SUPPORT - - This define (which can be on the command line) enables the use of the - assembler code routines for encryption, decryption and key scheduling - as follows: - - ASM_X86_V1C uses the assembler (aes_x86_v1.asm) with large tables for - encryption and decryption and but with key scheduling in C - ASM_X86_V2 uses assembler (aes_x86_v2.asm) with compressed tables for - encryption, decryption and key scheduling - ASM_X86_V2C uses assembler (aes_x86_v2.asm) with compressed tables for - encryption and decryption and but with key scheduling in C - ASM_AMD64_C uses assembler (aes_amd64.asm) with compressed tables for - encryption and decryption and but with key scheduling in C - - Change one 'if 0' below to 'if 1' to select the version or define - as a compilation option. 
-*/ - -#if defined ( ASM_X86_V1C ) || defined( ASM_X86_V2 ) || defined( ASM_X86_V2C ) -# if defined( _M_IX86 ) -# if 0 && !defined( ASM_X86_V1C ) -# define ASM_X86_V1C -# elif 0 && !defined( ASM_X86_V2 ) -# define ASM_X86_V2 -# elif 0 && !defined( ASM_X86_V2C ) -# define ASM_X86_V2C -# endif -# else -# error Assembler code is only available for x86 and AMD64 systems -# endif -#elif defined( ASM_AMD64_C ) -# if defined( _M_X64 ) -# if 0 && !defined( ASM_AMD64_C ) -# define ASM_AMD64_C -# endif -# else -# error Assembler code is only available for x86 and AMD64 systems -# endif -#endif - -/* 4. FAST INPUT/OUTPUT OPERATIONS. - - On some machines it is possible to improve speed by transferring the - bytes in the input and output arrays to and from the internal 32-bit - variables by addressing these arrays as if they are arrays of 32-bit - words. On some machines this will always be possible but there may - be a large performance penalty if the byte arrays are not aligned on - the normal word boundaries. On other machines this technique will - lead to memory access errors when such 32-bit word accesses are not - properly aligned. The option SAFE_IO avoids such problems but will - often be slower on those machines that support misaligned access - (especially so if care is taken to align the input and output byte - arrays on 32-bit word boundaries). If SAFE_IO is not defined it is - assumed that access to byte arrays as if they are arrays of 32-bit - words will not cause problems when such accesses are misaligned. -*/ -#if 1 && !defined( _MSC_VER ) -#define SAFE_IO -#endif - -/* 5. LOOP UNROLLING - - The code for encryption and decrytpion cycles through a number of rounds - that can be implemented either in a loop or by expanding the code into a - long sequence of instructions, the latter producing a larger program but - one that will often be much faster. The latter is called loop unrolling. 
- There are also potential speed advantages in expanding two iterations in - a loop with half the number of iterations, which is called partial loop - unrolling. The following options allow partial or full loop unrolling - to be set independently for encryption and decryption -*/ -#if 1 -#define ENC_UNROLL FULL -#elif 0 -#define ENC_UNROLL PARTIAL -#else -#define ENC_UNROLL NONE -#endif - -#if 1 -#define DEC_UNROLL FULL -#elif 0 -#define DEC_UNROLL PARTIAL -#else -#define DEC_UNROLL NONE -#endif - -/* 6. FAST FINITE FIELD OPERATIONS - - If this section is included, tables are used to provide faster finite - field arithmetic (this has no effect if FIXED_TABLES is defined). -*/ -#if 1 -#define FF_TABLES -#endif - -/* 7. INTERNAL STATE VARIABLE FORMAT - - The internal state of Rijndael is stored in a number of local 32-bit - word varaibles which can be defined either as an array or as individual - names variables. Include this section if you want to store these local - varaibles in arrays. Otherwise individual local variables will be used. -*/ -#if 1 -#define ARRAYS -#endif - -/* 8. FIXED OR DYNAMIC TABLES - - When this section is included the tables used by the code are compiled - statically into the binary file. Otherwise the subroutine gen_tabs() - must be called to compute them before the code is first used. -*/ -#if 0 && !(defined( _MSC_VER ) && ( _MSC_VER <= 800 )) -#define FIXED_TABLES -#endif - -/* 9. TABLE ALIGNMENT - - On some sytsems speed will be improved by aligning the AES large lookup - tables on particular boundaries. This define should be set to a power of - two giving the desired alignment. It can be left undefined if alignment - is not needed. This option is specific to the Microsft VC++ compiler - - it seems to sometimes cause trouble for the VC++ version 6 compiler. -*/ - -#if 1 && defined( _MSC_VER ) && ( _MSC_VER >= 1300 ) -#define TABLE_ALIGN 32 -#endif - -/* 10. 
TABLE OPTIONS - - This cipher proceeds by repeating in a number of cycles known as 'rounds' - which are implemented by a round function which can optionally be speeded - up using tables. The basic tables are each 256 32-bit words, with either - one or four tables being required for each round function depending on - how much speed is required. The encryption and decryption round functions - are different and the last encryption and decrytpion round functions are - different again making four different round functions in all. - - This means that: - 1. Normal encryption and decryption rounds can each use either 0, 1 - or 4 tables and table spaces of 0, 1024 or 4096 bytes each. - 2. The last encryption and decryption rounds can also use either 0, 1 - or 4 tables and table spaces of 0, 1024 or 4096 bytes each. - - Include or exclude the appropriate definitions below to set the number - of tables used by this implementation. -*/ - -#if 1 /* set tables for the normal encryption round */ -#define ENC_ROUND FOUR_TABLES -#elif 0 -#define ENC_ROUND ONE_TABLE -#else -#define ENC_ROUND NO_TABLES -#endif - -#if 1 /* set tables for the last encryption round */ -#define LAST_ENC_ROUND FOUR_TABLES -#elif 0 -#define LAST_ENC_ROUND ONE_TABLE -#else -#define LAST_ENC_ROUND NO_TABLES -#endif - -#if 1 /* set tables for the normal decryption round */ -#define DEC_ROUND FOUR_TABLES -#elif 0 -#define DEC_ROUND ONE_TABLE -#else -#define DEC_ROUND NO_TABLES -#endif - -#if 1 /* set tables for the last decryption round */ -#define LAST_DEC_ROUND FOUR_TABLES -#elif 0 -#define LAST_DEC_ROUND ONE_TABLE -#else -#define LAST_DEC_ROUND NO_TABLES -#endif - -/* The decryption key schedule can be speeded up with tables in the same - way that the round functions can. Include or exclude the following - defines to set this requirement. 
-*/ -#if 1 -#define KEY_SCHED FOUR_TABLES -#elif 0 -#define KEY_SCHED ONE_TABLE -#else -#define KEY_SCHED NO_TABLES -#endif - -/* ---- END OF USER CONFIGURED OPTIONS ---- */ - -/* VIA ACE support is only available for VC++ and GCC */ - -#if !defined( _MSC_VER ) && !defined( __GNUC__ ) -# if defined( ASSUME_VIA_ACE_PRESENT ) -# undef ASSUME_VIA_ACE_PRESENT -# endif -# if defined( USE_VIA_ACE_IF_PRESENT ) -# undef USE_VIA_ACE_IF_PRESENT -# endif -#endif - -#if defined( ASSUME_VIA_ACE_PRESENT ) && !defined( USE_VIA_ACE_IF_PRESENT ) -#define USE_VIA_ACE_IF_PRESENT -#endif - -#if defined( USE_VIA_ACE_IF_PRESENT ) && !defined ( AES_REV_DKS ) -#define AES_REV_DKS -#endif - -/* Assembler support requires the use of platform byte order */ - -#if ( defined( ASM_X86_V1C ) || defined( ASM_X86_V2C ) || defined( ASM_AMD64_C ) ) && (ALGORITHM_BYTE_ORDER != PLATFORM_BYTE_ORDER) -#undef ALGORITHM_BYTE_ORDER -#define ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER -#endif - -/* In this implementation the columns of the state array are each held in - 32-bit words. The state array can be held in various ways: in an array - of words, in a number of individual word variables or in a number of - processor registers. The following define maps a variable name x and - a column number c to the way the state array variable is to be held. - The first define below maps the state into an array x[c] whereas the - second form maps the state into a number of individual variables x0, - x1, etc. Another form could map individual state colums to machine - register names. -*/ - -#if defined( ARRAYS ) -#define s(x,c) x[c] -#else -#define s(x,c) x##c -#endif - -/* This implementation provides subroutines for encryption, decryption - and for setting the three key lengths (separately) for encryption - and decryption. 
Since not all functions are needed, masks are set - up here to determine which will be implemented in C -*/ - -#if !defined( AES_ENCRYPT ) -# define EFUNCS_IN_C 0 -#elif defined( ASSUME_VIA_ACE_PRESENT ) || defined( ASM_X86_V1C ) - || defined( ASM_X86_V2C ) || defined( ASM_AMD64_C ) -# define EFUNCS_IN_C ENC_KEYING_IN_C -#elif !defined( ASM_X86_V2 ) -# define EFUNCS_IN_C ( ENCRYPTION_IN_C | ENC_KEYING_IN_C ) -#else -# define EFUNCS_IN_C 0 -#endif - -#if !defined( AES_DECRYPT ) -# define DFUNCS_IN_C 0 -#elif defined( ASSUME_VIA_ACE_PRESENT ) || defined( ASM_X86_V1C ) - || defined( ASM_X86_V2C ) || defined( ASM_AMD64_C ) -# define DFUNCS_IN_C DEC_KEYING_IN_C -#elif !defined( ASM_X86_V2 ) -# define DFUNCS_IN_C ( DECRYPTION_IN_C | DEC_KEYING_IN_C ) -#else -# define DFUNCS_IN_C 0 -#endif - -#define FUNCS_IN_C ( EFUNCS_IN_C | DFUNCS_IN_C ) - -/* END OF CONFIGURATION OPTIONS */ - -#define RC_LENGTH (5 * (AES_BLOCK_SIZE / 4 - 2)) - -/* Disable or report errors on some combinations of options */ - -#if ENC_ROUND == NO_TABLES && LAST_ENC_ROUND != NO_TABLES -#undef LAST_ENC_ROUND -#define LAST_ENC_ROUND NO_TABLES -#elif ENC_ROUND == ONE_TABLE && LAST_ENC_ROUND == FOUR_TABLES -#undef LAST_ENC_ROUND -#define LAST_ENC_ROUND ONE_TABLE -#endif - -#if ENC_ROUND == NO_TABLES && ENC_UNROLL != NONE -#undef ENC_UNROLL -#define ENC_UNROLL NONE -#endif - -#if DEC_ROUND == NO_TABLES && LAST_DEC_ROUND != NO_TABLES -#undef LAST_DEC_ROUND -#define LAST_DEC_ROUND NO_TABLES -#elif DEC_ROUND == ONE_TABLE && LAST_DEC_ROUND == FOUR_TABLES -#undef LAST_DEC_ROUND -#define LAST_DEC_ROUND ONE_TABLE -#endif - -#if DEC_ROUND == NO_TABLES && DEC_UNROLL != NONE -#undef DEC_UNROLL -#define DEC_UNROLL NONE -#endif - -#if defined( bswap32 ) -#define aes_sw32 bswap32 -#elif defined( bswap_32 ) -#define aes_sw32 bswap_32 -#else -#define brot(x,n) (((uint_32t)(x) << n) | ((uint_32t)(x) >> (32 - n))) -#define aes_sw32(x) ((brot((x),8) & 0x00ff00ff) | (brot((x),24) & 0xff00ff00)) -#endif - -/* upr(x,n): rotates 
bytes within words by n positions, moving bytes to - higher index positions with wrap around into low positions - ups(x,n): moves bytes by n positions to higher index positions in - words but without wrap around - bval(x,n): extracts a byte from a word - - WARNING: The definitions given here are intended only for use with - unsigned variables and with shift counts that are compile - time constants -*/ - -#if ( ALGORITHM_BYTE_ORDER == IS_LITTLE_ENDIAN ) -#define upr(x,n) (((uint_32t)(x) << (8 * (n))) | ((uint_32t)(x) >> (32 - 8 * (n)))) -#define ups(x,n) ((uint_32t) (x) << (8 * (n))) -#define bval(x,n) ((uint_8t)((x) >> (8 * (n)))) -#define bytes2word(b0, b1, b2, b3) \ - (((uint_32t)(b3) << 24) | ((uint_32t)(b2) << 16) | ((uint_32t)(b1) << 8) | (b0)) -#endif - -#if ( ALGORITHM_BYTE_ORDER == IS_BIG_ENDIAN ) -#define upr(x,n) (((uint_32t)(x) >> (8 * (n))) | ((uint_32t)(x) << (32 - 8 * (n)))) -#define ups(x,n) ((uint_32t) (x) >> (8 * (n))) -#define bval(x,n) ((uint_8t)((x) >> (24 - 8 * (n)))) -#define bytes2word(b0, b1, b2, b3) \ - (((uint_32t)(b0) << 24) | ((uint_32t)(b1) << 16) | ((uint_32t)(b2) << 8) | (b3)) -#endif - -#if defined( SAFE_IO ) - -#define word_in(x,c) bytes2word(((const uint_8t*)(x)+4*c)[0], ((const uint_8t*)(x)+4*c)[1], \ - ((const uint_8t*)(x)+4*c)[2], ((const uint_8t*)(x)+4*c)[3]) -#define word_out(x,c,v) { ((uint_8t*)(x)+4*c)[0] = bval(v,0); ((uint_8t*)(x)+4*c)[1] = bval(v,1); \ - ((uint_8t*)(x)+4*c)[2] = bval(v,2); ((uint_8t*)(x)+4*c)[3] = bval(v,3); } - -#elif ( ALGORITHM_BYTE_ORDER == PLATFORM_BYTE_ORDER ) - -#define word_in(x,c) (*((uint_32t*)(x)+(c))) -#define word_out(x,c,v) (*((uint_32t*)(x)+(c)) = (v)) - -#else - -#define word_in(x,c) aes_sw32(*((uint_32t*)(x)+(c))) -#define word_out(x,c,v) (*((uint_32t*)(x)+(c)) = aes_sw32(v)) - -#endif - -/* the finite field modular polynomial and elements */ - -#define WPOLY 0x011b -#define BPOLY 0x1b - -/* multiply four bytes in GF(2^8) by 'x' {02} in parallel */ - -#define m1 0x80808080 -#define m2 
0x7f7f7f7f -#define gf_mulx(x) ((((x) & m2) << 1) ^ ((((x) & m1) >> 7) * BPOLY)) - -/* The following defines provide alternative definitions of gf_mulx that might - give improved performance if a fast 32-bit multiply is not available. Note - that a temporary variable u needs to be defined where gf_mulx is used. - -#define gf_mulx(x) (u = (x) & m1, u |= (u >> 1), ((x) & m2) << 1) ^ ((u >> 3) | (u >> 6)) -#define m4 (0x01010101 * BPOLY) -#define gf_mulx(x) (u = (x) & m1, ((x) & m2) << 1) ^ ((u - (u >> 7)) & m4) -*/ - -/* Work out which tables are needed for the different options */ - -#if defined( ASM_X86_V1C ) -#if defined( ENC_ROUND ) -#undef ENC_ROUND -#endif -#define ENC_ROUND FOUR_TABLES -#if defined( LAST_ENC_ROUND ) -#undef LAST_ENC_ROUND -#endif -#define LAST_ENC_ROUND FOUR_TABLES -#if defined( DEC_ROUND ) -#undef DEC_ROUND -#endif -#define DEC_ROUND FOUR_TABLES -#if defined( LAST_DEC_ROUND ) -#undef LAST_DEC_ROUND -#endif -#define LAST_DEC_ROUND FOUR_TABLES -#if defined( KEY_SCHED ) -#undef KEY_SCHED -#define KEY_SCHED FOUR_TABLES -#endif -#endif - -#if ( FUNCS_IN_C & ENCRYPTION_IN_C ) || defined( ASM_X86_V1C ) -#if ENC_ROUND == ONE_TABLE -#define FT1_SET -#elif ENC_ROUND == FOUR_TABLES -#define FT4_SET -#else -#define SBX_SET -#endif -#if LAST_ENC_ROUND == ONE_TABLE -#define FL1_SET -#elif LAST_ENC_ROUND == FOUR_TABLES -#define FL4_SET -#elif !defined( SBX_SET ) -#define SBX_SET -#endif -#endif - -#if ( FUNCS_IN_C & DECRYPTION_IN_C ) || defined( ASM_X86_V1C ) -#if DEC_ROUND == ONE_TABLE -#define IT1_SET -#elif DEC_ROUND == FOUR_TABLES -#define IT4_SET -#else -#define ISB_SET -#endif -#if LAST_DEC_ROUND == ONE_TABLE -#define IL1_SET -#elif LAST_DEC_ROUND == FOUR_TABLES -#define IL4_SET -#elif !defined(ISB_SET) -#define ISB_SET -#endif -#endif - -#if (FUNCS_IN_C & ENC_KEYING_IN_C) || (FUNCS_IN_C & DEC_KEYING_IN_C) -#if KEY_SCHED == ONE_TABLE -#define LS1_SET -#elif KEY_SCHED == FOUR_TABLES -#define LS4_SET -#elif !defined( SBX_SET ) -#define SBX_SET -#endif 
-#endif - -#if (FUNCS_IN_C & DEC_KEYING_IN_C) -#if KEY_SCHED == ONE_TABLE -#define IM1_SET -#elif KEY_SCHED == FOUR_TABLES -#define IM4_SET -#elif !defined( SBX_SET ) -#define SBX_SET -#endif -#endif - -/* generic definitions of Rijndael macros that use tables */ - -#define no_table(x,box,vf,rf,c) bytes2word( \ - box[bval(vf(x,0,c),rf(0,c))], \ - box[bval(vf(x,1,c),rf(1,c))], \ - box[bval(vf(x,2,c),rf(2,c))], \ - box[bval(vf(x,3,c),rf(3,c))]) - -#define one_table(x,op,tab,vf,rf,c) \ - ( tab[bval(vf(x,0,c),rf(0,c))] \ - ^ op(tab[bval(vf(x,1,c),rf(1,c))],1) \ - ^ op(tab[bval(vf(x,2,c),rf(2,c))],2) \ - ^ op(tab[bval(vf(x,3,c),rf(3,c))],3)) - -#define four_tables(x,tab,vf,rf,c) \ - ( tab[0][bval(vf(x,0,c),rf(0,c))] \ - ^ tab[1][bval(vf(x,1,c),rf(1,c))] \ - ^ tab[2][bval(vf(x,2,c),rf(2,c))] \ - ^ tab[3][bval(vf(x,3,c),rf(3,c))]) - -#define vf1(x,r,c) (x) -#define rf1(r,c) (r) -#define rf2(r,c) ((8+r-c)&3) - -/* perform forward and inverse column mix operation on four bytes in long word x in */ -/* parallel. NOTE: x must be a simple variable, NOT an expression in these macros. 
*/ - -#if defined( FM4_SET ) /* not currently used */ -#define fwd_mcol(x) four_tables(x,t_use(f,m),vf1,rf1,0) -#elif defined( FM1_SET ) /* not currently used */ -#define fwd_mcol(x) one_table(x,upr,t_use(f,m),vf1,rf1,0) -#else -#define dec_fmvars uint_32t g2 -#define fwd_mcol(x) (g2 = gf_mulx(x), g2 ^ upr((x) ^ g2, 3) ^ upr((x), 2) ^ upr((x), 1)) -#endif - -#if defined( IM4_SET ) -#define inv_mcol(x) four_tables(x,t_use(i,m),vf1,rf1,0) -#elif defined( IM1_SET ) -#define inv_mcol(x) one_table(x,upr,t_use(i,m),vf1,rf1,0) -#else -#define dec_imvars uint_32t g2, g4, g9 -#define inv_mcol(x) (g2 = gf_mulx(x), g4 = gf_mulx(g2), g9 = (x) ^ gf_mulx(g4), g4 ^= g9, \ - (x) ^ g2 ^ g4 ^ upr(g2 ^ g9, 3) ^ upr(g4, 2) ^ upr(g9, 1)) -#endif - -#if defined( FL4_SET ) -#define ls_box(x,c) four_tables(x,t_use(f,l),vf1,rf2,c) -#elif defined( LS4_SET ) -#define ls_box(x,c) four_tables(x,t_use(l,s),vf1,rf2,c) -#elif defined( FL1_SET ) -#define ls_box(x,c) one_table(x,upr,t_use(f,l),vf1,rf2,c) -#elif defined( LS1_SET ) -#define ls_box(x,c) one_table(x,upr,t_use(l,s),vf1,rf2,c) -#else -#define ls_box(x,c) no_table(x,t_use(s,box),vf1,rf2,c) -#endif - -#if defined( ASM_X86_V1C ) && defined( AES_DECRYPT ) && !defined( ISB_SET ) -#define ISB_SET -#endif - -#endif diff --git a/bsd/crypto/aes/i386/aesxts.c b/bsd/crypto/aes/i386/aesxts.c new file mode 100644 index 000000000..c0eaaa609 --- /dev/null +++ b/bsd/crypto/aes/i386/aesxts.c @@ -0,0 +1,392 @@ +/* + * Copyright (c) 2010 Apple Inc. All Rights Reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include "aesxts.h" +#include +#include +#include + +int +aes_encrypt_key(const uint8_t *key, int key_len, aesedp_encrypt_ctx cx[1]); + +int +aes_decrypt_key(const uint8_t *key, int key_len, aesedp_decrypt_ctx cx[1]); + +int +aes_encrypt(const uint8_t *Plaintext, uint8_t *Ciphertext, aesedp_encrypt_ctx *ctx); + +int +aes_decrypt(const uint8_t *Ciphertext, uint8_t *Plaintext, aesedp_decrypt_ctx *ctx); + + +/* error codes [will be expanded in future releases] */ +enum { + CRYPT_OK=0, /* Result OK */ + CRYPT_ERROR=1, /* Generic Error */ + CRYPT_INVALID_KEYSIZE=3, /* Invalid key size given */ + CRYPT_INVALID_ARG=16, /* Generic invalid argument */ +}; + +static int +aesedp_keysize(int *keysize) +{ + switch (*keysize) { + case 16: + case 24: + case 32: + return CRYPT_OK; + default: + return CRYPT_INVALID_KEYSIZE; + } +} + +static int +aesedp_setup(const uint8_t *key, int keylen, int num_rounds __unused, aesedp_ctx *skey) +{ + aesedp_ctx *ctx = (aesedp_ctx *) skey; + int retval; + + if((retval = aesedp_keysize(&keylen)) != CRYPT_OK) return retval; + if((retval = aes_encrypt_key(key, keylen, &ctx->encrypt)) != CRYPT_OK) return CRYPT_ERROR; + if((retval = aes_decrypt_key(key, keylen, &ctx->decrypt)) != CRYPT_OK) return CRYPT_ERROR; + return CRYPT_OK; +} + +#ifdef ZZZNEVER +static int +aesedp_ecb_encrypt(const uint8_t *pt, uint8_t *ct, aesedp_ctx *skey) +{ + aesedp_ctx *ctx = (aesedp_ctx *) skey; + return aes_encrypt(pt, ct, &ctx->encrypt); +} + + + +static int 
+aesedp_ecb_decrypt(const uint8_t *ct, uint8_t *pt, aesedp_ctx *skey) +{ + return aes_decrypt(ct, pt, &skey->decrypt); +} +#endif + + +static void +aesedp_done(aesedp_ctx *skey __unused) +{ +} + +/** Start XTS mode + @param cipher The index of the cipher to use + @param key1 The encrypt key + @param key2 The tweak encrypt key + @param keylen The length of the keys (each) in octets + @param num_rounds The number of rounds for the cipher (0 == default) + @param xts [out] XTS structure + Returns CRYPT_OK upon success. +*/ + +uint32_t +xts_start(uint32_t cipher, // ignored - we're doing this for xts-aes only + const uint8_t *IV __unused, // ignored + const uint8_t *key1, int keylen, + const uint8_t *key2, int tweaklen __unused, // both keys are the same size for xts + uint32_t num_rounds, // ignored + uint32_t options __unused, // ignored + symmetric_xts *xts) +{ + uint32_t err; + + /* check inputs */ + if((key1 == NULL)|| (key2 == NULL) || (xts == NULL)) return CRYPT_INVALID_ARG; + + /* schedule the two ciphers */ + if ((err = aesedp_setup(key1, keylen, num_rounds, &xts->key1)) != 0) { + return err; + } + if ((err = aesedp_setup(key2, keylen, num_rounds, &xts->key2)) != 0) { + return err; + } + xts->cipher = cipher; + + return err; +} + + + + +/** multiply by x + @param I The value to multiply by x (LFSR shift) +*/ +#if defined __x86_64__ || defined __i386__ +extern void xts_mult_x(uint8_t *I); +#else +static void xts_mult_x(uint8_t *I) +{ + uint32_t x; + uint8_t t, tt; + + for (x = t = 0; x < 16; x++) { + tt = I[x] >> 7; + I[x] = ((I[x] << 1) | t) & 0xFF; + t = tt; + } + if (tt) { + I[0] ^= 0x87; + } +} +#endif + +#if defined __x86_64__ || defined __i386__ +extern int tweak_crypt(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx); +extern int tweak_crypt_group(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx, uint32_t lim); +#else +static int tweak_crypt(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx) +{ + uint32_t 
x; + uint32_t err; + + /* tweak encrypt block i */ + for (x = 0; x < 16; x += sizeof(uint64_t)) { + *((uint64_t*)&C[x]) = *((uint64_t*)&P[x]) ^ *((uint64_t*)&T[x]); + } + + if ((err = aes_encrypt(C, C, ctx)) != CRYPT_OK) { + return CRYPT_INVALID_KEYSIZE; + } + + for (x = 0; x < 16; x += sizeof(uint64_t)) { + *((uint64_t*)&C[x]) ^= *((uint64_t*)&T[x]); + } + + /* LFSR the tweak */ + xts_mult_x(T); + + return CRYPT_OK; +} +#endif + +/** XTS Encryption + @param pt [in] Plaintext + @param ptlen Length of plaintext (and ciphertext) + @param ct [out] Ciphertext + @param tweak [in] The 128-bit encryption tweak (e.g. sector number) + @param xts The XTS structure + Returns CRYPT_OK upon success, otherwise the first error reported while encrypting +*/ +int xts_encrypt( + const uint8_t *pt, unsigned long ptlen, + uint8_t *ct, + const uint8_t *tweak, + symmetric_xts *xts) +{ + aesedp_encrypt_ctx *encrypt_ctx = &xts->key1.encrypt; + uint8_t PP[16], CC[16], T[16]; + uint32_t i, m, mo, lim; + uint32_t err; + + /* check inputs */ + if((pt == NULL) || (ct == NULL)|| (tweak == NULL) || (xts == NULL)) return 1; + + /* get number of blocks */ + m = ptlen >> 4; + mo = ptlen & 15; + + /* must have at least one full block */ + if (m == 0) { + return CRYPT_INVALID_ARG; + } + + /* encrypt the tweak */ + if ((err = aes_encrypt(tweak, T, &xts->key2.encrypt)) != 0) { + return CRYPT_INVALID_KEYSIZE; + } + + /* for i = 0 to m-2 do (all m blocks when ptlen is a multiple of 16) */ + if (mo == 0) { + lim = m; + } else { + lim = m - 1; + } + +#if defined __x86_64__ || defined __i386__ + if (lim>0) { + /* bail out on failure so the ciphertext-stealing step below cannot overwrite err */ + if ((err = tweak_crypt_group(pt, ct, T, encrypt_ctx, lim)) != CRYPT_OK) return err; + ct += (lim<<4); + pt += (lim<<4); + } +#else + for (i = 0; i < lim; i++) { + /* stop at the first failing block instead of letting the next iteration overwrite err */ + if ((err = tweak_crypt(pt, ct, T, encrypt_ctx)) != CRYPT_OK) return err; + ct += 16; + pt += 16; + } +#endif + + /* if ptlen not divide 16 then */ + if (mo > 0) { + /* CC = tweak encrypt block m-1 */ + if ((err = tweak_crypt(pt, CC, T, encrypt_ctx)) != 0) { + return err; + } + + /* Cm = first ptlen % 16 bytes of CC */ + for (i = 0; i < mo; i++) { + PP[i] = pt[16+i]; + ct[16+i] = CC[i]; + } + +
for (; i < 16; i++) { + PP[i] = CC[i]; + } + + /* Cm-1 = Tweak encrypt PP */ + if ((err = tweak_crypt(PP, ct, T, encrypt_ctx)) != 0) { + return err; + } + } + + return err; +} + +#if defined __x86_64__ || defined __i386__ +extern int tweak_uncrypt(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx); +extern int tweak_uncrypt_group(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx, uint32_t lim); +#else +static int tweak_uncrypt(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx) +{ + uint32_t x; + uint32_t err; + + /* tweak encrypt block i */ + for (x = 0; x < 16; x += sizeof(uint64_t)) { + *((uint64_t*)&P[x]) = *((uint64_t*)&C[x]) ^ *((uint64_t*)&T[x]); + } + + err = aes_decrypt(P, P, ctx); + + for (x = 0; x < 16; x += sizeof(uint64_t)) { + *((uint64_t*)&P[x]) ^= *((uint64_t*)&T[x]); + } + + /* LFSR the tweak */ + xts_mult_x(T); + + return err; +} +#endif + +/** XTS Decryption + @param ct [in] Ciphertext + @param ptlen Length of plaintext (and ciphertext) + @param pt [out] Plaintext + @param tweak [in] The 128--bit encryption tweak (e.g. 
sector number) + @param xts The XTS structure + Returns CRYPT_OK upon success +*/ + +int xts_decrypt( + const uint8_t *ct, unsigned long ptlen, + uint8_t *pt, + const uint8_t *tweak, + symmetric_xts *xts) +{ + aesedp_decrypt_ctx *decrypt_ctx = &xts->key1.decrypt; + uint8_t PP[16], CC[16], T[16]; + uint32_t i, m, mo, lim; + uint32_t err; + + /* check inputs */ + if((pt == NULL) || (ct == NULL)|| (tweak == NULL) || (xts == NULL)) return 1; + + /* get number of blocks */ + m = ptlen >> 4; + mo = ptlen & 15; + + /* must have at least one full block */ + if (m == 0) { + return CRYPT_INVALID_ARG; + } + + /* encrypt the tweak , yes - encrypt */ + if ((err = aes_encrypt(tweak, T, &xts->key2.encrypt)) != 0) { + return CRYPT_INVALID_KEYSIZE; + } + + /* for i = 0 to m-2 do */ + if (mo == 0) { + lim = m; + } else { + lim = m - 1; + } + +#if defined __x86_64__ || defined __i386__ + if (lim>0) { + err = tweak_uncrypt_group(ct, pt, T, decrypt_ctx, lim); + ct += (lim<<4); + pt += (lim<<4); + } +#else + for (i = 0; i < lim; i++) { + err = tweak_uncrypt(ct, pt, T, decrypt_ctx); + ct += 16; + pt += 16; + } +#endif + + /* if ptlen not divide 16 then */ + if (mo > 0) { + memcpy(CC, T, 16); + xts_mult_x(CC); + + /* PP = tweak decrypt block m-1 */ + if ((err = tweak_uncrypt(ct, PP, CC, decrypt_ctx)) != CRYPT_OK) { + return err; + } + + /* Pm = first ptlen % 16 bytes of PP */ + for (i = 0; i < mo; i++) { + CC[i] = ct[16+i]; + pt[16+i] = PP[i]; + } + for (; i < 16; i++) { + CC[i] = PP[i]; + } + + /* Pm-1 = Tweak uncrypt CC */ + if ((err = tweak_uncrypt(CC, pt, T, decrypt_ctx)) != CRYPT_OK) { + return err; + } + } + + return CRYPT_OK; +} + + + +void xts_done(symmetric_xts *xts) +{ + if(xts == NULL) return; + aesedp_done(&xts->key1); + aesedp_done(&xts->key2); +} + diff --git a/bsd/crypto/aes/i386/aesxts.h b/bsd/crypto/aes/i386/aesxts.h new file mode 100644 index 000000000..fe7618066 --- /dev/null +++ b/bsd/crypto/aes/i386/aesxts.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2010 Apple Inc. 
All Rights Reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +/* + * aesxts.h + * + * + */ + +#include "stdint.h" + + +#ifndef _AESXTS_H +#define _AESXTS_H + +#if defined(__cplusplus) +extern "C" +{ +#endif + +/* + * The context for XTS-AES + */ + + +#define KS_LENGTH 60 + +typedef struct { + uint32_t ks[KS_LENGTH]; + uint32_t rn; +} aesedp_encrypt_ctx; + +typedef struct { + uint32_t ks[KS_LENGTH]; + uint32_t rn; +} aesedp_decrypt_ctx; + +typedef struct { + aesedp_decrypt_ctx decrypt; + aesedp_encrypt_ctx encrypt; +} aesedp_ctx; + +// xts mode context + +typedef struct { + aesedp_ctx key1, key2; + uint32_t cipher; // ignore - this is to fit with the library, but in this case we're only using aes +} symmetric_xts; + + +/* + * These are the interfaces required for XTS-AES support + */ + +uint32_t +xts_start(uint32_t cipher, // ignored - we're doing this for xts-aes only + const uint8_t *IV, // ignored + const uint8_t *key1, int keylen, + const uint8_t *key2, int tweaklen, // both keys are the same size for xts + uint32_t num_rounds, // ignored + uint32_t options, // ignored + symmetric_xts 
*xts); + +int xts_encrypt( + const uint8_t *pt, unsigned long ptlen, + uint8_t *ct, + const uint8_t *tweak, // this can be considered the sector IV for this use + symmetric_xts *xts); + +int xts_decrypt( + const uint8_t *ct, unsigned long ptlen, + uint8_t *pt, + const uint8_t *tweak, // this can be considered the sector IV for this use + symmetric_xts *xts); + + +void xts_done(symmetric_xts *xts); + +#if defined(__cplusplus) +} +#endif + +#endif /* _AESXTS_H */ \ No newline at end of file diff --git a/bsd/crypto/aes/i386/aesxts_asm.s b/bsd/crypto/aes/i386/aesxts_asm.s new file mode 100644 index 000000000..ec6b924b7 --- /dev/null +++ b/bsd/crypto/aes/i386/aesxts_asm.s @@ -0,0 +1,1305 @@ +/* + This file "aesxts.s" provides x86_64 / i386 optimization of the following functions + + 0. xts_mult_x_on_xmm7 : a code macro that is used throughout all other functions + 1. void xts_mult_x(uint8_t *I); + 2. int tweak_crypt(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx); + 3. int tweak_crypt_group(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx, uint32_t lim); + 4. int tweak_uncrypt(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx); + 5. int tweak_uncrypt_group(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx, uint32_t lim); + + This file should be compiled together with xtsClearC.c + + functions 1,2,4 are supposed to replace the C functions in xtsClearC.c for x86_64/i386 architectures + functions 3,5 are only given here, no C code is available, they are called in xts_encrypt/xts_decrypt (xtsClearC.c) + - we can possibly add C code for functions 3 and 5 for future porting to other architectures + + cclee 4-29-10 + +*/ + +#ifdef KERNEL +#include +#else +#include +#endif +#define CRYPT_OK 0 // can not include "crypt.h" in which CRYPT_OK is from enum + +/* + The following macro is used throughout the functions in this file. 
+ It is the core function within the function xts_mult_x defined in (xtsClearC.c) + + upon entry, %xmm7 = the input tweak (128-bit), + on return, %xmm7 = the updated tweak (128-bit) + the macro uses %xmm1/%xmm2/%ecx in the computation + the operation can be described as follows : + 0. let x = %xmm7; // 128-bit little-endian input + 1. x = rotate_left(x,1); // rotate left by 1 bit + 2. if (x&1) x ^= 0x0000...0086; // if least significant bit = 1, least significant byte ^= 0x86; + 3. return x; + + It's a pity that SSE does not support shifting of the whole 128-bit xmm registers. + The workaround is + 1. using parallel dual quad (8-byte) shifting, 1 for the 2 bottom 63-bits, 1 for the 2 leading bits + 2. manipulating the shifted quad words to form the 128-bit shifted result. + + Input : %xmm7 + Output : %xmm7 + Used : %xmm1/%xmm2/%ecx + + The macro is good for both x86_64 and i386. + +*/ + + .macro xts_mult_x_on_xmm7 // input : x = %xmm7, MS = most significant, LS = least significant + movaps %xmm7, %xmm1 // %xmm1 = a copy of x + movaps %xmm7, %xmm2 // %xmm2 = a copy of x + psllq $$1, %xmm7 // 1-bit left shift of 2 quad words (x1<<1, x0<<1), zero-filled + psrlq $$63, %xmm1 // 2 leading bits, each in the least significant bit of a quad word + psrad $$31, %xmm2 // the MS 32-bit will be either 0 or -1, depending on the MS bit of x + pshufd $$0xc6, %xmm1, %xmm1 // switch the positions of the 2 leading bits + pshufd $$0x03, %xmm2, %xmm2 // the LS 32-bit will be either 0 or -1, depending on the MS bit of x + por %xmm1, %xmm7 // we finally have %xmm7 = rotate_left(x,1); + movl $$0x86, %ecx // a potential byte to xor the bottom byte + movd %ecx, %xmm1 // copy it to %xmm1, the other is 0 + pand %xmm2, %xmm1 // %xmm1 = 0 or 0x86, depending on the MS bit of x + pxor %xmm1, %xmm7 // rotate_left(x,1) ^= 0 or 0x86 depending on the MS bit of x; net effect is (x << 1) ^ (MS bit of x ? 0x87 : 0) + .endm + + +/* + function : void xts_mult_x(uint8_t *I); + + 1. load (__m128*) (I) into xmm7 + 2. 
macro xts_mult_x_on_xmm7 (i/o @ xmm7, used xmm1/xmm2/ecx) + 3. save output (%xmm7) to memory pointed by I + + input : 16-byte memory pointed by I + output : same 16-byte memory pointed by I + + if kernel code, xmm1/xmm2/xmm7 saved and restored + other used registers : eax/ecx + + */ + .text + .align 4,0x90 + .globl _xts_mult_x +_xts_mult_x: + +#if defined __x86_64__ + #define I %rdi // 1st argument at %rdi for x86_64 + #define sp %rsp +#else + mov 4(%esp), %eax // 1st argument at stack, offset 4 for ret_addr for i386 + #define I %eax + #define sp %esp +#endif + + // if KERNEL code, allocate memory and save xmm1/xmm2/xmm7 +#ifdef KERNEL +#if defined __x86_64__ + sub $0x38, sp // 8-bytes alignment + 3 * 16 bytes +#else + sub $0x3c, sp // 12-bytes alignment + 3 * 16 bytes +#endif + movaps %xmm1, (sp) + movaps %xmm2, 16(sp) + movaps %xmm7, 32(sp) +#endif + + // load, compute, and save + movups (I), %xmm7 // load input tweak 128-bit into %xmm7 + xts_mult_x_on_xmm7 // the macro (also used elsewhere) will update %xmm7 as the output + movups %xmm7, (I) // save the xts_mult_x output + + // if KERNEL code, restore xmm1/xmm2/xmm7 and deallocate stack memory +#ifdef KERNEL + movaps (sp), %xmm1 + movaps 16(sp), %xmm2 + movaps 32(sp), %xmm7 +#if defined __x86_64__ + add $0x38, sp // 8-bytes alignment + 3 * 16 bytes +#else + add $0x3c, sp // 12-bytes alignment + 3 * 16 bytes +#endif +#endif + + ret // return + + #undef I + #undef sp + +/* + The following is x86_64/i386 assembly implementation of + + int tweak_crypt(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx); + + Its C code implementation is given in xtsClearC.c + + all pointers P/C/T point to a block of 16 bytes. In the following description, P/C/T represent 128-bit data. + + The operation of tweak_crypt + + 1. C = P ^ T + 2. err = aes_encrypt(C, C, ctx); if (err != CRYPT_OK) return err; + 3. C = C ^ T + 4. xts_mult_x(T) + 5. return CRYPT_OK; + + The following is the assembly implementation flow + + 1. 
save used xmm registers (xmm1/xmm7) if kernel code + 2. load xmm1 = P, xmm7 = T + 3. xmm1 = C = P ^ T + 4. write xmm1 to C + 5. call aes_encryp(C,C,ctx); note that it will use aesni if available, also xmm will return intact + 6. load xmm1 = C + 7. xmm1 = C = C^T = xmm1 ^ xmm7 + 8. write xmm1 to C + 9. update T (in xmm7) via xts_mult_x macro + a. restore xmm registers (xmm1/xmm7) if kernel code + b. return CRYPT_OK (in eax) + + Note: used xmm registers : xmm1/xmm2/xmm7, xmm2 in xts_mult_x macro + +*/ + + .text + .align 4,0x90 + .globl _tweak_crypt +_tweak_crypt: +#if defined __i386__ + + // push into stack for local use + push %ebp + mov %esp, %ebp + push %ebx + push %edi + push %esi + + // allocate stack memory for local use + sub $12+16*4, %esp // 12 (alignment) + 3*16 (xmm save/restore) + 16 (aes_crypt calling arguments) + + // load with called arguments + mov 8(%ebp), %eax // P, we need this only briefly, so eax is fine + mov 12(%ebp), %edi // C + mov 16(%ebp), %ebx // T + mov 20(%ebp), %esi // ctx + + #define P %eax + #define C %edi + #define T %ebx + #define ctx %esi + #define sp %esp + +#else + // x86_64 calling argument order : rdi/rsi/rdx/rcx/r8 + + // push into stack for local use + push %rbp + mov %rsp, %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + // allocate stack memory for local use, if kernel code, need to save/restore xmm registers +#ifdef KERNEL + sub $4*16, %rsp // only need 3*16, add 16 extra so to make save/restore xmm common to i386 +#endif + + // load with called arguments, release rdi/rsi/rdx/rcx/r8, as need to call aes_encrypt + mov %rsi, %r13 + mov %rdx, %r14 + mov %rcx, %r15 + + #define P %rdi + #define C %r13 + #define T %r14 + #define ctx %r15 + #define sp %rsp + +#endif + + // if kernel, save used xmm registers +#ifdef KERNEL + movaps %xmm1, 16(sp) + movaps %xmm2, 32(sp) + movaps %xmm7, 48(sp) +#endif + + movups (P), %xmm1 // P + movups (T), %xmm7 // T + + // setup calling arguments for aes_encrypt +#if defined __i386__ + 
mov C, (%esp) // C + mov C, 4(%esp) // C + mov ctx, 8(%esp) // ctx +#else + mov C, %rdi // C + mov C, %rsi // C + mov ctx, %rdx // ctx +#endif + + pxor %xmm7, %xmm1 // C = P ^ T + movups %xmm1, (C) // save C into memory + + call _aes_encrypt // err = aes_encrypt(C,C,ctx); + + cmp $CRYPT_OK, %eax // check err == CRYPT_OK + jne 9f // if err != CRYPT_OK, exit + + movups (C), %xmm1 // load xmm1 = C + pxor %xmm7, %xmm1 // C ^= T + movups %xmm1, (C) // write C with xmm1, xmm1 is freed now, will be changed in the following macro + + xts_mult_x_on_xmm7 // update T (on xmm7) + + movups %xmm7, (T) // write xmm7 to T +9: + + // restore used xmm registers if this is for kernel +#ifdef KERNEL + movaps 16(sp), %xmm1 + movaps 32(sp), %xmm2 + movaps 48(sp), %xmm7 +#endif + + // free stack memory and restore callee registers +#if defined __i386__ + add $12+16*4, %esp // 12 (alignment) + 3*16 (xmm save/restore) + 16 (aes_crypt calling arguments) + pop %esi + pop %edi + pop %ebx +#else +#ifdef KERNEL + add $4*16, %rsp // only need 3*16, add 16 extra so make save/restore xmm common to i386 +#endif + pop %r15 + pop %r14 + pop %r13 + pop %r12 +#endif + + // return, eax/rax already has the return val + leave + ret + + #undef P + #undef C + #undef T + #undef ctx + #undef sp + +/* + The following is x86_64/i386 assembly implementation of + + int tweak_crypt_group(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx, uint32_t lim); + + TODO : Its C code implementation is YET to be provided in xtsClearC.c (for the benefit of porting to other ISAs) + This function is a grouped version of the above function tweak_crypt(), so xmm registers save/restore only need + to happen once for all grouped blocks. + + The implementation here probes __cpu_capabilities to detect whether aesni (or hw-aes instruction) is available. + If aesni is available, the code branches to optimized code that uses aesni. 
+ + The optimized aesni code operates as follows: + + while (more than 4 consecutive blocks available) { + + do xts_mult_x macro 4 times and write the 4 tweaks on stack (16-byte aligned) + + perform 4 C = P ^ T; // T is on 16-byte aligned stack + + perform 4 aes_encrypt (all aes_encrypt instructions interleaved to achieve better throughput) + + perform 4 C = C ^ T // T is on 16-byte aligned stack + + } + + The code then falls through to the scalar code, that sequentially performs what tweak_crypt does + + 1. C = P ^ T + 2. err = aes_encrypt(C, C, ctx); if (err != CRYPT_OK) return err; + 3. C = C ^ T + 4. xts_mult_x(T) + + Note: used xmm registers : + xmm0-xmm5, xmm7 if aesni is available + xmm0-xmm4, xmm7 if aesni is not available. + +*/ + + .text + .align 4,0x90 + .globl _tweak_crypt_group +_tweak_crypt_group: + +#if defined __i386__ + + // push callee-saved registers for local use + push %ebp + mov %esp, %ebp + push %ebx + push %edi + push %esi + + // allocate stack memory for local use and/or xmm register save for kernel code + sub $(12+8*16+16*4), %esp // 12 (alignment) + 8*16 (xmm) + 4*16 (pre-computed tweaks) aesni + // 12 (alignment) + 8*16 (xmm) + 4*16 (only 12 used for aes_encrypt) no aesni + // transfer calling arguments + mov 20(%ebp), %eax // ctx + mov 12(%ebp), %edi // C + mov 16(%ebp), %ebx // T + mov 8(%ebp), %esi // P + mov %eax, 8(%esp) // ctx as the 3rd parameter to aes_encrypt + + #define P %esi + #define C %edi + #define T %ebx + #define lim 24(%ebp) + #define sp %esp + +#else + + // push callee-saved registers for local use + push %rbp + mov %rsp, %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + // allocate stack memory for local use and/or xmm register save for kernel code + sub $(8+8*16+16*5), %rsp // 8 (alignment) + 8*16 (xmm) + 4*16 (pre-computed tweaks) + 16 (common to i386) + + // rdi/rsi/rdx/rcx/r8 + // transfer calling arguments + mov %rdi, %r12 + mov %rsi, %r13 + mov %rdx, %r14 + mov %rcx, %r15 + mov %r8, %rbx + 
#define P %r12 + #define C %r13 + #define T %r14 + #define ctx %r15 + #define lim %ebx + #define sp %rsp +#endif + +#ifdef KERNEL + movaps %xmm0, 0x50(sp) + movaps %xmm1, 0x60(sp) + movaps %xmm2, 0x70(sp) + movaps %xmm3, 0x80(sp) + movaps %xmm4, 0x90(sp) + movaps %xmm7, 0xa0(sp) +#endif + + // probe __cpu_capabilities to detect aesni +#if defined __x86_64__ + movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities + mov (%rax), %eax // %eax = __cpu_capabilities +#else // i386 +#if defined KERNEL + leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities + mov (%eax), %eax // %eax = __cpu_capabilities +#else + movl _COMM_PAGE_CPU_CAPABILITIES, %eax +#endif +#endif + test $(kHasAES), %eax + je L_crypt_group_sw // if aesni not available, jump to sw-based implementation + + // aesni-based implementation + + sub $4, lim // pre-decrement lim by 4 + jl 9f // if lim < 4, skip the following code + + movups (T), %xmm7 // xmm7 is the tweak before encrypting every 4 blocks +#ifdef KERNEL + movaps %xmm5, 0xb0(sp) // hw-aes-based uses extra xmm5 +#endif + +0: + // derive 4 tweaks using xts_mult_x macro, and save on aligned stack space + // xmm7 will be the tweak for next 4-blocks iteration + + #define tweak1 16(sp) + #define tweak2 32(sp) + #define tweak3 48(sp) + #define tweak4 64(sp) + + movaps %xmm7, tweak1 // save 1st tweak on stack + xts_mult_x_on_xmm7 // compute 2nd tweak + movaps %xmm7, tweak2 // save 2nd tweak on stack + xts_mult_x_on_xmm7 // compute 3rd tweak + movaps %xmm7, tweak3 // save 3rd tweak on stack + xts_mult_x_on_xmm7 // compute 4th tweak + movaps %xmm7, tweak4 // save 4th tweak on stack + xts_mult_x_on_xmm7 // compute 1st tweak for next iteration + + // read 4 Ps + movups (P), %xmm0 + movups 16(P), %xmm1 + movups 32(P), %xmm2 + movups 48(P), %xmm3 + + // 4 C = P ^ T + pxor tweak1, %xmm0 + pxor tweak2, %xmm1 + pxor tweak3, %xmm2 + pxor tweak4, %xmm3 + + // 4 interleaved aes_encrypt + +#if defined __i386__ + mov 8(sp), %ecx // ctx + 
#undef ctx + #define ctx %ecx +#endif + + mov 240(ctx), %eax // aes length: the 32-bit word at offset 240 of ctx, compared against 160/192/224 below + + cmp $160, %eax // AES-128 ? + je 160f + cmp $192, %eax // AES-192 ? + je 192f + cmp $224, %eax // AES-256 ? + je 224f + mov $-1, %eax // error : non-supported aes length +#ifdef KERNEL + movaps 0xb0(sp), %xmm5 // hw-aes-based uses extra xmm5 +#endif + jmp L_error_crypt + + // definitions, macros, and constructs for 4 blocks hw-aes-encrypt + + // the following key definitions will also be used in tweak_uncrypt_group + #define key0 0(ctx) + #define key1 16(ctx) + #define key2 32(ctx) + #define key3 48(ctx) + #define key4 64(ctx) + #define key5 80(ctx) + #define key6 96(ctx) + #define key7 112(ctx) + #define key8 128(ctx) + #define key9 144(ctx) + #define keyA 160(ctx) + #define keyB 176(ctx) + #define keyC 192(ctx) + #define keyD 208(ctx) + #define keyE 224(ctx) + + #define aes aesenc + #define aeslast aesenclast + + // all aes encrypt operations start with the following sequence + .macro aes_common_part + movups key0, %xmm4 + movups key1, %xmm5 + pxor %xmm4, %xmm0 + pxor %xmm4, %xmm1 + pxor %xmm4, %xmm2 + pxor %xmm4, %xmm3 + movups key2, %xmm4 + aes %xmm5, %xmm0 + aes %xmm5, %xmm1 + aes %xmm5, %xmm2 + aes %xmm5, %xmm3 + movups key3, %xmm5 + aes %xmm4, %xmm0 + aes %xmm4, %xmm1 + aes %xmm4, %xmm2 + aes %xmm4, %xmm3 + movups key4, %xmm4 + aes %xmm5, %xmm0 + aes %xmm5, %xmm1 + aes %xmm5, %xmm2 + aes %xmm5, %xmm3 + movups key5, %xmm5 + aes %xmm4, %xmm0 + aes %xmm4, %xmm1 + aes %xmm4, %xmm2 + aes %xmm4, %xmm3 + movups key6, %xmm4 + aes %xmm5, %xmm0 + aes %xmm5, %xmm1 + aes %xmm5, %xmm2 + aes %xmm5, %xmm3 + movups key7, %xmm5 + aes %xmm4, %xmm0 + aes %xmm4, %xmm1 + aes %xmm4, %xmm2 + aes %xmm4, %xmm3 + movups key8, %xmm4 + aes %xmm5, %xmm0 + aes %xmm5, %xmm1 + aes %xmm5, %xmm2 + aes %xmm5, %xmm3 + movups key9, %xmm5 + aes %xmm4, %xmm0 + aes %xmm4, %xmm1 + aes %xmm4, %xmm2 + aes %xmm4, %xmm3 + movups keyA, %xmm4 + aes %xmm5, %xmm0 + aes %xmm5, %xmm1 + aes %xmm5, %xmm2 + aes %xmm5, %xmm3 + .endm + 
+ // all aes encrypt operations end with the following 4 instructions + .macro aes_last + aeslast %xmm4, %xmm0 + aeslast %xmm4, %xmm1 + aeslast %xmm4, %xmm2 + aeslast %xmm4, %xmm3 + .endm + + .macro aes_128 + aes_common_part // encrypt common part + aes_last // encrypt ending part + .endm + + .macro aes_192 + aes_common_part // encrypt common part + + // 10 extra instructions in between common and ending + movups keyB, %xmm5 + aes %xmm4, %xmm0 + aes %xmm4, %xmm1 + aes %xmm4, %xmm2 + aes %xmm4, %xmm3 + movups keyC, %xmm4 + aes %xmm5, %xmm0 + aes %xmm5, %xmm1 + aes %xmm5, %xmm2 + aes %xmm5, %xmm3 + + aes_last // encrypt ending part + .endm + + .macro aes_256 + aes_common_part // encrypt common part + + // 20 extra instructions in between common and ending + movups keyB, %xmm5 + aes %xmm4, %xmm0 + aes %xmm4, %xmm1 + aes %xmm4, %xmm2 + aes %xmm4, %xmm3 + movups keyC, %xmm4 + aes %xmm5, %xmm0 + aes %xmm5, %xmm1 + aes %xmm5, %xmm2 + aes %xmm5, %xmm3 + movups keyD, %xmm5 + aes %xmm4, %xmm0 + aes %xmm4, %xmm1 + aes %xmm4, %xmm2 + aes %xmm4, %xmm3 + movups keyE, %xmm4 + aes %xmm5, %xmm0 + aes %xmm5, %xmm1 + aes %xmm5, %xmm2 + aes %xmm5, %xmm3 + + aes_last // encrypt ending part + .endm + +160: // AES-128 encrypt + aes_128 + jmp 8f + +192: // AES-192 encrypt + aes_192 + jmp 8f + +224: // AES-256 encrypt + aes_256 + +8: + + // 4 C = C ^ T + pxor tweak1, %xmm0 + pxor tweak2, %xmm1 + pxor tweak3, %xmm2 + pxor tweak4, %xmm3 + + // write 4 Cs + movups %xmm0, (C) + movups %xmm1, 16(C) + movups %xmm2, 32(C) + movups %xmm3, 48(C) + + add $64, P + add $64, C + + sub $4, lim + jge 0b + +#ifdef KERNEL + movaps 0xb0(sp), %xmm5 // hw-aes-based uses extra xmm5 +#endif + movups %xmm7, (T) + +9: + xor %eax, %eax // to return CRYPT_OK + add $4, lim // post-increment lim by 4 + je 9f // if lim==0, branch to prepare to return + +L_crypt_group_sw: + + movups (T), %xmm7 // T, xmm7 will be used as T (128-bit) throughout the loop + + sub $1, lim // pre-decrement lim by 1 + jl 1f // if lim < 1, 
branch to prepare to return +0: + movups (P), %xmm0 // P + + // prepare for calling aes_encrypt +#if defined __i386__ + mov C, (%esp) // C + mov C, 4(%esp) // C + // ctx was prepared previously in preamble +#else + mov C, %rdi // C + mov C, %rsi // C + mov ctx, %rdx // ctx +#endif + + pxor %xmm7, %xmm0 // C = P ^ T + movups %xmm0, (C) // save C into memory + + call _aes_encrypt_xmm_no_save // err = aes_encrypt(C,C,ctx); + + cmp $CRYPT_OK, %eax // err == CRYPT_OK ? + jne 9f // if err != CRYPT_OK, branch to exit with error + + movups (C), %xmm0 // load xmm0 with C + pxor %xmm7, %xmm0 // C ^= T + movups %xmm0, (C) // save output C + + xts_mult_x_on_xmm7 + + add $16, C // next C + add $16, P // next P + sub $1, lim // lim-- + jge 0b // if (lim>=0 after the decrement) repeat the scalar loop + +1: movups %xmm7, (T) // save final tweak +L_error_crypt: +9: + // if kernel, restore used xmm registers +#ifdef KERNEL + movaps 0x50(sp), %xmm0 + movaps 0x60(sp), %xmm1 + movaps 0x70(sp), %xmm2 + movaps 0x80(sp), %xmm3 + movaps 0x90(sp), %xmm4 + movaps 0xa0(sp), %xmm7 +#endif + +#if defined __i386__ + add $(12+16*8+16*4), %esp + pop %esi + pop %edi + pop %ebx +#else + add $(8+16*8+16*5), %rsp + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx +#endif + leave + ret + + #undef P + #undef C + #undef T + #undef ctx + #undef sp + +/* + The following is x86_64/i386 assembly implementation of + + int tweak_uncrypt(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx); + + Its C code implementation is given in xtsClearC.c + + all pointers C/P/T point to a block of 16 bytes. In the following description, C/P/T represent 128-bit data. + + The operation of tweak_uncrypt + + 1. P = C ^ T + 2. err = aes_decrypt(P, P, ctx); if (err != CRYPT_OK) return err; + 3. P = P ^ T + 4. xts_mult_x(T) + 5. return CRYPT_OK; + + The following is the assembly implementation flow + + 1. save used xmm registers (xmm1/xmm7) if kernel code + 2. load xmm1 = C, xmm7 = T + 3. xmm1 = P = C ^ T + 4. write xmm1 to P + 5. 
call aes_decryp(P,P,ctx); note that it will use aesni if available, also xmm will return intact + 6. load xmm1 = P + 7. xmm1 = P = P^T = xmm1 ^ xmm7 + 8. write xmm1 to P + 9. update T (in xmm7) via xts_mult_x macro + a. restore xmm registers (xmm1/xmm7) if kernel code + b. return CRYPT_OK (in eax) + + Note: used xmm registers : xmm1/xmm2/xmm7, xmm2 in xts_mult_x macro + +*/ + + .text + .align 4,0x90 + .globl _tweak_uncrypt +_tweak_uncrypt: +#if defined __i386__ + + // push into stack for local use + push %ebp + mov %esp, %ebp + push %ebx + push %edi + push %esi + + // allocate stack memory for local use + sub $12+16*4, %esp // 12 (alignment) + 3*16 (xmm save/restore) + 16 (aes_crypt calling arguments) + + // load with called arguments + mov 8(%ebp), %eax // C, we need this only briefly, so eax is fine + mov 12(%ebp), %edi // P + mov 16(%ebp), %ebx // T + mov 20(%ebp), %esi // ctx + + #define C %eax + #define P %edi + #define T %ebx + #define ctx %esi + #define sp %esp + +#else + // x86_64 calling argument order : rdi/rsi/rdx/rcx/r8 + + // push into stack for local use + push %rbp + mov %rsp, %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + // allocate stack memory for local use, if kernel code, need to save/restore xmm registers +#ifdef KERNEL + sub $4*16, %rsp // only need 3*16, add 16 extra so to make save/restore xmm common to i386 +#endif + + // load with called arguments, release rdi/rsi/rdx/rcx/r8, as need to call aes_decrypt + mov %rsi, %r13 + mov %rdx, %r14 + mov %rcx, %r15 + + #define C %rdi + #define P %r13 + #define T %r14 + #define ctx %r15 + #define sp %rsp + +#endif + + // if kernel, save used xmm registers +#ifdef KERNEL + movaps %xmm1, 16(sp) + movaps %xmm2, 32(sp) + movaps %xmm7, 48(sp) +#endif + + movups (C), %xmm1 // C + movups (T), %xmm7 // T + + // setup calling arguments for aes_decrypt +#if defined __i386__ + mov P, (%esp) // P + mov P, 4(%esp) // P + mov ctx, 8(%esp) // ctx +#else + mov P, %rdi // P + mov P, %rsi // P + mov ctx, 
%rdx // ctx +#endif + + pxor %xmm7, %xmm1 // P = C ^ T + movups %xmm1, (P) // save P into memory + + call _aes_decrypt // err = aes_decrypt(P,P,ctx); + + cmp $CRYPT_OK, %eax // check err == CRYPT_OK + jne 9f // if err != CRYPT_OK, exit + + movups (P), %xmm1 // load xmm1 = P + pxor %xmm7, %xmm1 // P ^= T + movups %xmm1, (P) // write P with xmm1, xmm1 is freed now, will be changed in the following macro + + xts_mult_x_on_xmm7 // update T (on xmm7) + + movups %xmm7, (T) // write xmm7 to T +9: + + // restore used xmm registers if this is for kernel +#ifdef KERNEL + movaps 16(sp), %xmm1 + movaps 32(sp), %xmm2 + movaps 48(sp), %xmm7 +#endif + + // free stack memory and restore callee registers +#if defined __i386__ + add $12+16*4, %esp // 12 (alignment) + 3*16 (xmm save/restore) + 16 (aes_crypt calling arguments) + pop %esi + pop %edi + pop %ebx +#else +#ifdef KERNEL + add $4*16, %rsp // only need 3*16, add 16 extra so make save/restore xmm common to i386 +#endif + pop %r15 + pop %r14 + pop %r13 + pop %r12 +#endif + + // return, eax/rax already has the return val + leave + ret + + #undef P + #undef C + #undef T + #undef ctx + #undef sp + +/* + The following is x86_64/i386 assembly implementation of + + int tweak_uncrypt_group(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx, uint32_t lim); + + TODO : Its C code implementation is YET to be provided in xtsClearC.c (for the benefit of porting to other ISAs) + This function is a grouped version of the above function tweak_uncrypt(), so xmm registers save/restore only need + to happen once for all grouped blocks. + + The implementation here probes __cpu_capabilities to detect whether aesni (or hw-aes instruction) is available. + If aesni is available, the code branches to optimized code that uses aesni. 
+ + The optimized aesni code operates as follows: + + while (more than 4 consecutive blocks available) { + + do xts_mult_x macro 4 times and write the 4 tweaks on stack (16-byte aligned) + + perform 4 P = C ^ T; // T is on 16-byte aligned stack + + perform 4 aes_decrypt (all aes_decrypt instruction interleaved to achieve better throughput) + + perform 4 P = P ^ T // T is on 16-byte aligned stack + + } + + The code then falls through to the scalar code, that sequentially performs what tweak_crypt does + + 1. P = C ^ T + 2. err = aes_decrypt(P, P, ctx); if (err != CRYPT_OK) return err; + 3. P = P ^ T + 4. xts_mult_x(T) + + Note: used xmm registers : + xmm0-xmm5, xmm7 if aesni is available + xmm0-xmm4, xmm7 if aesni is not available. + +*/ + + .text + .align 4,0x90 + .globl _tweak_uncrypt_group +_tweak_uncrypt_group: + +#if defined __i386__ + + // push callee-saved registers for local use + push %ebp + mov %esp, %ebp + push %ebx + push %edi + push %esi + + // allocate stack memory for local use and/or xmm register save for kernel code + sub $(12+8*16+16*4), %esp // 12 (alignment) + 8*16 (xmm) + 4*16 (pre-computed tweaks) aesni + // 12 (alignment) + 8*16 (xmm) + 4*16 (only 12 used for aes_decrypt) no aesni + // transfer calling arguments + mov 20(%ebp), %eax // ctx + mov 12(%ebp), %edi // P + mov 16(%ebp), %ebx // T + mov 8(%ebp), %esi // C + mov %eax, 8(%esp) // ctx as the 3rd parameter to aes_decrypt + + #define C %esi + #define P %edi + #define T %ebx + #define lim 24(%ebp) + #define sp %esp + +#else + + // push callee-saved registers for local use + push %rbp + mov %rsp, %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + // allocate stack memory for local use and/or xmm register save for kernel code + sub $(8+8*16+16*5), %rsp // 8 (alignment) + 8*16 (xmm) + 4*16 (pre-computed tweaks) + 16 (common to i386) + + // rdi/rsi/rdx/rcx/r8 + // transfer calling arguments + mov %rdi, %r12 + mov %rsi, %r13 + mov %rdx, %r14 + mov %rcx, %r15 + mov %r8, %rbx +
+ #define C %r12 + #define P %r13 + #define T %r14 + #define ctx %r15 + #define lim %ebx + #define sp %rsp +#endif + +#ifdef KERNEL + movaps %xmm0, 0x50(sp) + movaps %xmm1, 0x60(sp) + movaps %xmm2, 0x70(sp) + movaps %xmm3, 0x80(sp) + movaps %xmm4, 0x90(sp) + movaps %xmm7, 0xa0(sp) +#endif + + // probe __cpu_capabilities to detect aesni +#if defined __x86_64__ + movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities + mov (%rax), %eax // %eax = __cpu_capabilities +#else // i386 +#if defined KERNEL + leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities + mov (%eax), %eax // %eax = __cpu_capabilities +#else + movl _COMM_PAGE_CPU_CAPABILITIES, %eax +#endif +#endif + test $(kHasAES), %eax + je L_uncrypt_group_sw // if aesni not available, jump to sw-based implementation + + // aesni-based implementation + + sub $4, lim // pre-decrement lim by 4 + jl 9f // if lim < 4, skip the following code + + movups (T), %xmm7 // xmm7 is the tweak before decrypting every 4 blocks +#ifdef KERNEL + movaps %xmm5, 0xb0(sp) // hw-aes-based uses extra xmm5 +#endif + +0: + // derive 4 tweaks using xts_mult_x macro, and save on aligned stack space + // xmm7 will be the tweak for next 4-blocks iteration + + #define tweak1 16(sp) + #define tweak2 32(sp) + #define tweak3 48(sp) + #define tweak4 64(sp) + + movaps %xmm7, tweak1 // save 1st tweak on stack + xts_mult_x_on_xmm7 // compute 2nd tweak + movaps %xmm7, tweak2 // save 2nd tweak on stack + xts_mult_x_on_xmm7 // compute 3rd tweak + movaps %xmm7, tweak3 // save 3rd tweak on stack + xts_mult_x_on_xmm7 // compute 4th tweak + movaps %xmm7, tweak4 // save 4th tweak on stack + xts_mult_x_on_xmm7 // compute 1st tweak for next iteration + + // read 4 Cs + movups (C), %xmm0 + movups 16(C), %xmm1 + movups 32(C), %xmm2 + movups 48(C), %xmm3 + + // 4 P = C ^ T + pxor tweak1, %xmm0 + pxor tweak2, %xmm1 + pxor tweak3, %xmm2 + pxor tweak4, %xmm3 + + // 4 interleaved aes_decrypt + +#if defined __i386__ + mov 8(sp), %ecx // ctx +
#undef ctx + #define ctx %ecx +#endif + + mov 240(ctx), %eax // aes length + + cmp $160, %eax // AES-128 ? + je 160f + cmp $192, %eax // AES-192 ? + je 192f + cmp $224, %eax // AES-256 ? + je 224f + mov $-1, %eax // error : non-supported aes length +#ifdef KERNEL + movaps 0xb0(sp), %xmm5 // hw-aes-based uses extra xmm5 +#endif + jmp L_error_uncrypt + + // definitions, macros to construct hw-aes-decrypt + // will reuse previously defined key0 = (ctx), key1 = 16(ctx), .... + #undef aes + #undef aeslast + #define aes aesdec + #define aeslast aesdeclast + + .macro aes_decrypt_common + movups key8, %xmm4 + aes %xmm5, %xmm0 + aes %xmm5, %xmm1 + aes %xmm5, %xmm2 + aes %xmm5, %xmm3 + movups key7, %xmm5 + aes %xmm4, %xmm0 + aes %xmm4, %xmm1 + aes %xmm4, %xmm2 + aes %xmm4, %xmm3 + movups key6, %xmm4 + aes %xmm5, %xmm0 + aes %xmm5, %xmm1 + aes %xmm5, %xmm2 + aes %xmm5, %xmm3 + movups key5, %xmm5 + aes %xmm4, %xmm0 + aes %xmm4, %xmm1 + aes %xmm4, %xmm2 + aes %xmm4, %xmm3 + movups key4, %xmm4 + aes %xmm5, %xmm0 + aes %xmm5, %xmm1 + aes %xmm5, %xmm2 + aes %xmm5, %xmm3 + movups key3, %xmm5 + aes %xmm4, %xmm0 + aes %xmm4, %xmm1 + aes %xmm4, %xmm2 + aes %xmm4, %xmm3 + movups key2, %xmm4 + aes %xmm5, %xmm0 + aes %xmm5, %xmm1 + aes %xmm5, %xmm2 + aes %xmm5, %xmm3 + movups key1, %xmm5 + aes %xmm4, %xmm0 + aes %xmm4, %xmm1 + aes %xmm4, %xmm2 + aes %xmm4, %xmm3 + movups key0, %xmm4 + aes %xmm5, %xmm0 + aes %xmm5, %xmm1 + aes %xmm5, %xmm2 + aes %xmm5, %xmm3 + aeslast %xmm4, %xmm0 + aeslast %xmm4, %xmm1 + aeslast %xmm4, %xmm2 + aeslast %xmm4, %xmm3 + .endm + + .macro aes_dec_128 + movups keyA, %xmm4 + movups key9, %xmm5 + pxor %xmm4, %xmm0 + pxor %xmm4, %xmm1 + pxor %xmm4, %xmm2 + pxor %xmm4, %xmm3 + aes_decrypt_common + .endm + + .macro aes_dec_192 + movups keyC, %xmm4 + movups keyB, %xmm5 + pxor %xmm4, %xmm0 + pxor %xmm4, %xmm1 + pxor %xmm4, %xmm2 + pxor %xmm4, %xmm3 + movups keyA, %xmm4 + aes %xmm5, %xmm0 + aes %xmm5, %xmm1 + aes %xmm5, %xmm2 + aes %xmm5, %xmm3 + movups key9, %xmm5 +
aes %xmm4, %xmm0 + aes %xmm4, %xmm1 + aes %xmm4, %xmm2 + aes %xmm4, %xmm3 + aes_decrypt_common + .endm + + .macro aes_dec_256 + movups keyE, %xmm4 + movups keyD, %xmm5 + pxor %xmm4, %xmm0 + pxor %xmm4, %xmm1 + pxor %xmm4, %xmm2 + pxor %xmm4, %xmm3 + movups keyC, %xmm4 + aes %xmm5, %xmm0 + aes %xmm5, %xmm1 + aes %xmm5, %xmm2 + aes %xmm5, %xmm3 + movups keyB, %xmm5 + aes %xmm4, %xmm0 + aes %xmm4, %xmm1 + aes %xmm4, %xmm2 + aes %xmm4, %xmm3 + movups keyA, %xmm4 + aes %xmm5, %xmm0 + aes %xmm5, %xmm1 + aes %xmm5, %xmm2 + aes %xmm5, %xmm3 + movups key9, %xmm5 + aes %xmm4, %xmm0 + aes %xmm4, %xmm1 + aes %xmm4, %xmm2 + aes %xmm4, %xmm3 + aes_decrypt_common + .endm + +160: // AES-128 decrypt + aes_dec_128 + jmp 8f + +192: // AES-192 decrypt + aes_dec_192 + jmp 8f + +224: // AES-256 decrypt + aes_dec_256 + +8: + + // 4 P = P ^ T + pxor tweak1, %xmm0 + pxor tweak2, %xmm1 + pxor tweak3, %xmm2 + pxor tweak4, %xmm3 + + // write 4 Ps + movups %xmm0, (P) + movups %xmm1, 16(P) + movups %xmm2, 32(P) + movups %xmm3, 48(P) + + add $64, C + add $64, P + + sub $4, lim + jge 0b + +#ifdef KERNEL + movaps 0xb0(sp), %xmm5 // hw-aes-based uses extra xmm5 +#endif + movups %xmm7, (T) + +9: + xor %eax, %eax // to return CRYPT_OK + add $4, lim // post-increment lim by 4 + je 9f // if lim==0, branch to prepare to return + +L_uncrypt_group_sw: + + movups (T), %xmm7 // T, xmm7 will be used as T (128-bit) throughout the loop + + sub $1, lim // pre-decrement lim by 1 + jl 1f // if lim < 1, branch to prepare to return -- NOTE(review): when entered via the no-aesni branch with lim == 0, %eax still holds the __cpu_capabilities value rather than CRYPT_OK at this point; confirm callers never pass lim == 0 +0: + movups (C), %xmm0 // C + + // prepare for calling aes_decrypt +#if defined __i386__ + mov P, (%esp) // P + mov P, 4(%esp) // P + // ctx was prepared previously in preamble +#else + mov P, %rdi // P + mov P, %rsi // P + mov ctx, %rdx // ctx +#endif + + pxor %xmm7, %xmm0 // P = C ^ T + movups %xmm0, (P) // save P into memory + + call _aes_decrypt_xmm_no_save // err = aes_decrypt(P,P,ctx); + + cmp $CRYPT_OK, %eax // err == CRYPT_OK ? 
+ jne 9f // if err != CRYPT_OK, branch to exit with error + + movups (P), %xmm0 // load xmm0 with P + pxor %xmm7, %xmm0 // P ^= T + movups %xmm0, (P) // save output P + + xts_mult_x_on_xmm7 + + add $16, C // next C + add $16, P // next P + sub $1, lim // lim-- + jge 0b // if (lim>0) repeat the scalar loop + +1: movups %xmm7, (T) // save final tweak +L_error_uncrypt: +9: + // if kernel, restore used xmm registers +#ifdef KERNEL + movaps 0x50(sp), %xmm0 + movaps 0x60(sp), %xmm1 + movaps 0x70(sp), %xmm2 + movaps 0x80(sp), %xmm3 + movaps 0x90(sp), %xmm4 + movaps 0xa0(sp), %xmm7 +#endif + +#if defined __i386__ + add $(12+16*8+16*4), %esp + pop %esi + pop %edi + pop %ebx +#else + add $(8+16*8+16*5), %rsp + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx +#endif + leave + ret diff --git a/bsd/crypto/aes/i386/edefs.h b/bsd/crypto/aes/i386/edefs.h deleted file mode 100644 index d25bef89c..000000000 --- a/bsd/crypto/aes/i386/edefs.h +++ /dev/null @@ -1,130 +0,0 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The free distribution and use of this software in both source and binary - form is allowed (with or without changes) provided that: - - 1. distributions of this source code include the above copyright - notice, this list of conditions and the following disclaimer; - - 2. distributions in binary form include the above copyright - notice, this list of conditions and the following disclaimer - in the documentation and/or other associated materials; - - 3. the copyright holder's name is not used to endorse products - built using this software without specific written permission. - - ALTERNATIVELY, provided that this notice is retained in full, this product - may be distributed under the terms of the GNU General Public License (GPL), - in which case the provisions of the GPL apply INSTEAD OF those given above. 
- - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. - --------------------------------------------------------------------------- - Issue 31/01/2006 -*/ - -#ifndef EDEFS_H -#define EDEFS_H -#if defined(__cplusplus) -extern "C" -{ -#endif - -#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ -#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ - -#if defined(__GNUC__) || defined(__GNU_LIBRARY__) -# if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) -# include -# elif defined( BSD ) && ( BSD >= 199103 ) || defined( __DJGPP__ ) || defined( __CYGWIN32__ ) -# include -# elif defined(__APPLE__) -# if defined(__BIG_ENDIAN__) && !defined( BIG_ENDIAN ) -# define BIG_ENDIAN -# elif defined(__LITTLE_ENDIAN__) && !defined( LITTLE_ENDIAN ) -# define LITTLE_ENDIAN -# endif -# elif !defined( __MINGW32__ ) -# include -# if !defined(__BEOS__) -# include -# endif -# endif -#endif - -#if !defined(PLATFORM_BYTE_ORDER) -# if defined(LITTLE_ENDIAN) || defined(BIG_ENDIAN) -# if defined(LITTLE_ENDIAN) && !defined(BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# elif !defined(LITTLE_ENDIAN) && defined(BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined(BYTE_ORDER) && (BYTE_ORDER == LITTLE_ENDIAN) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# elif defined(BYTE_ORDER) && (BYTE_ORDER == BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# endif -# elif defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN) -# if defined(_LITTLE_ENDIAN) && !defined(_BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# elif !defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined(_BYTE_ORDER) && (_BYTE_ORDER == _LITTLE_ENDIAN) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# elif defined(_BYTE_ORDER) && (_BYTE_ORDER 
== _BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# endif -# elif defined(__LITTLE_ENDIAN__) || defined(__BIG_ENDIAN__) -# if defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# elif !defined(__LITTLE_ENDIAN__) && defined(__BIG_ENDIAN__) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __LITTLE_ENDIAN__) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __BIG_ENDIAN__) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# endif -# endif -#endif - -/* if the platform is still unknown, try to find its byte order */ -/* from commonly used machine defines */ - -#if !defined(PLATFORM_BYTE_ORDER) - -#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \ - defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \ - defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \ - defined( vax ) || defined( vms ) || defined( VMS ) || \ - defined( __VMS ) || defined( _M_X64 ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN - -#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \ - defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \ - defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \ - defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \ - defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \ - defined( __TANDEM ) || defined( THINK_C ) || defined( __VMCMS__ ) || \ - defined( __VOS__ ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN - -#elif 0 /* **** EDIT HERE IF NECESSARY **** */ -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#elif 0 /* **** EDIT HERE IF NECESSARY **** */ -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#else -# error Please edit edefs.h (lines 117 or 119) to set the platform byte order -#endif - -#endif - -#if defined(__cplusplus) -} -#endif -#endif diff --git a/bsd/crypto/aes/ppc/Makefile 
b/bsd/crypto/aes/ppc/Makefile deleted file mode 100644 index 99755ad2e..000000000 --- a/bsd/crypto/aes/ppc/Makefile +++ /dev/null @@ -1,36 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -INSTINC_SUBDIRS = \ - -INSTINC_SUBDIRS_PPC = \ - -INSTINC_SUBDIRS_I386 = \ - -EXPINC_SUBDIRS = \ - -EXPINC_SUBDIRS_PPC = \ - -EXPINC_SUBDIRS_I386 = \ - -PRIVATE_DATAFILES = \ - aestab.h aesopt.h - -INSTALL_MI_DIR = crypto - -EXPORT_MI_DIR = ${INSTALL_MI_DIR} - -INSTALL_KF_MI_LIST = - -INSTALL_KF_MI_LCL_LIST = ${PRIVATE_DATAFILES} - -include $(MakeInc_rule) -include $(MakeInc_dir) - - diff --git a/bsd/crypto/aes/ppc/aescrypt.c b/bsd/crypto/aes/ppc/aescrypt.c deleted file mode 100644 index 31d4c81af..000000000 --- a/bsd/crypto/aes/ppc/aescrypt.c +++ /dev/null @@ -1,411 +0,0 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The free distribution and use of this software in both source and binary - form is allowed (with or without changes) provided that: - - 1. distributions of this source code include the above copyright - notice, this list of conditions and the following disclaimer; - - 2. distributions in binary form include the above copyright - notice, this list of conditions and the following disclaimer - in the documentation and/or other associated materials; - - 3. the copyright holder's name is not used to endorse products - built using this software without specific written permission. - - ALTERNATIVELY, provided that this notice is retained in full, this product - may be distributed under the terms of the GNU General Public License (GPL), - in which case the provisions of the GPL apply INSTEAD OF those given above. 
- - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. - --------------------------------------------------------------------------- - Issue 28/01/2004 - - This file contains the code for implementing encryption and decryption - for AES (Rijndael) for block and key sizes of 16, 24 and 32 bytes. It - can optionally be replaced by code written in assembler using NASM. For - further details see the file aesopt.h -*/ - -#include "aesopt.h" -#include "aestab.h" - -#if defined(__cplusplus) -extern "C" -{ -#endif - -#define ki(y,x,k,c) (s(y,c) = s(x, c) ^ (k)[c]) -#define xo(y,x,c) (s(y,c) ^= s(x, c)) -#define si(y,x,c) (s(y,c) = word_in(x, c)) -#define so(y,x,c) word_out(y, c, s(x,c)) - -#if defined(ARRAYS) -#define locals(y,x) x[4],y[4] -#else -#define locals(y,x) x##0,x##1,x##2,x##3,y##0,y##1,y##2,y##3 -#endif - -#define dtables(tab) const aes_32t *tab##0, *tab##1, *tab##2, *tab##3 -#define itables(tab) tab##0 = tab[0]; tab##1 = tab[1]; tab##2 = tab[2]; tab##3 = tab[3] - -#define l_copy(y, x) s(y,0) = s(x,0); s(y,1) = s(x,1); \ - s(y,2) = s(x,2); s(y,3) = s(x,3); - -#define key_in(y,x,k) ki(y,x,k,0); ki(y,x,k,1); ki(y,x,k,2); ki(y,x,k,3) -#define cbc(y,x) xo(y,x,0); xo(y,x,1); xo(y,x,2); xo(y,x,3) -#define state_in(y,x) si(y,x,0); si(y,x,1); si(y,x,2); si(y,x,3) -#define state_out(y,x) so(y,x,0); so(y,x,1); so(y,x,2); so(y,x,3) -#define round(rm,y,x,k) rm(y,x,k,0); rm(y,x,k,1); rm(y,x,k,2); rm(y,x,k,3) - -#if defined(ENCRYPTION) && !defined(AES_ASM) - -/* Visual C++ .Net v7.1 provides the fastest encryption code when using - Pentium optimiation with small code but this is poor for decryption - so we need to control this with the following VC++ pragmas -*/ - -#if defined(_MSC_VER) -#pragma optimize( "s", on ) -#endif - -/* Given the column (c) of the output state variable, the following - macros give the input state variables 
which are needed in its - computation for each row (r) of the state. All the alternative - macros give the same end values but expand into different ways - of calculating these values. In particular the complex macro - used for dynamically variable block sizes is designed to expand - to a compile time constant whenever possible but will expand to - conditional clauses on some branches (I am grateful to Frank - Yellin for this construction) -*/ - -#define fwd_var(x,r,c)\ - ( r == 0 ? ( c == 0 ? s(x,0) : c == 1 ? s(x,1) : c == 2 ? s(x,2) : s(x,3))\ - : r == 1 ? ( c == 0 ? s(x,1) : c == 1 ? s(x,2) : c == 2 ? s(x,3) : s(x,0))\ - : r == 2 ? ( c == 0 ? s(x,2) : c == 1 ? s(x,3) : c == 2 ? s(x,0) : s(x,1))\ - : ( c == 0 ? s(x,3) : c == 1 ? s(x,0) : c == 2 ? s(x,1) : s(x,2))) - -#if defined(FT4_SET) -#undef dec_fmvars -# if defined(ENC_ROUND_CACHE_TABLES) -#define fwd_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_cached_tables(x,t_fn,fwd_var,rf1,c)) -# else -#define fwd_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_tables(x,t_fn,fwd_var,rf1,c)) -# endif -#elif defined(FT1_SET) -#undef dec_fmvars -#define fwd_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ one_table(x,upr,t_fn,fwd_var,rf1,c)) -#else -#define fwd_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ fwd_mcol(no_table(x,t_sbox,fwd_var,rf1,c))) -#endif - -#if defined(FL4_SET) -# if defined(LAST_ENC_ROUND_CACHE_TABLES) -#define fwd_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_cached_tables(x,t_fl,fwd_var,rf1,c)) -# else -#define fwd_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_tables(x,t_fl,fwd_var,rf1,c)) -# endif -#elif defined(FL1_SET) -#define fwd_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ one_table(x,ups,t_fl,fwd_var,rf1,c)) -#else -#define fwd_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ no_table(x,t_sbox,fwd_var,rf1,c)) -#endif - -aes_rval aes_encrypt_cbc(const unsigned char *in, const unsigned char *in_iv, unsigned int num_blk, - unsigned char *out, const aes_encrypt_ctx cx[1]) -{ aes_32t locals(b0, b1); - const aes_32t *kp; - const aes_32t *kptr = cx->ks; -#if defined(ENC_ROUND_CACHE_TABLES) - 
dtables(t_fn); -#endif -#if defined(LAST_ENC_ROUND_CACHE_TABLES) - dtables(t_fl); -#endif - -#if defined( dec_fmvars ) - dec_fmvars; /* declare variables for fwd_mcol() if needed */ -#endif - -#if defined( AES_ERR_CHK ) - if( cx->rn != 10 && cx->rn != 12 && cx->rn != 14 ) - return aes_error; -#endif - - // Load IV into b0. - state_in(b0, in_iv); - - for (;num_blk; in += AES_BLOCK_SIZE, out += AES_BLOCK_SIZE, --num_blk) - { - kp = kptr; -#if 0 - // Read the plaintext into b1 - state_in(b1, in); - // Do the CBC with b0 which is either the iv or the ciphertext of the previous block. - cbc(b1, b0); - - // Xor b1 with the key schedule to get things started. - key_in(b0, b1, kp); -#else - // Since xor is associative we mess with the ordering here to get the loads started early - key_in(b1, b0, kp); // Xor b0(IV) with the key schedule and assign to b1 - state_in(b0, in); // Load block into b0 - cbc(b0, b1); // Xor b0 with b1 and store in b0 -#endif - -#if defined(ENC_ROUND_CACHE_TABLES) - itables(t_fn); -#endif - -#if (ENC_UNROLL == FULL) - - switch(cx->rn) - { - case 14: - round(fwd_rnd, b1, b0, kp + 1 * N_COLS); - round(fwd_rnd, b0, b1, kp + 2 * N_COLS); - kp += 2 * N_COLS; - case 12: - round(fwd_rnd, b1, b0, kp + 1 * N_COLS); - round(fwd_rnd, b0, b1, kp + 2 * N_COLS); - kp += 2 * N_COLS; - case 10: - default: - round(fwd_rnd, b1, b0, kp + 1 * N_COLS); - round(fwd_rnd, b0, b1, kp + 2 * N_COLS); - round(fwd_rnd, b1, b0, kp + 3 * N_COLS); - round(fwd_rnd, b0, b1, kp + 4 * N_COLS); - round(fwd_rnd, b1, b0, kp + 5 * N_COLS); - round(fwd_rnd, b0, b1, kp + 6 * N_COLS); - round(fwd_rnd, b1, b0, kp + 7 * N_COLS); - round(fwd_rnd, b0, b1, kp + 8 * N_COLS); - round(fwd_rnd, b1, b0, kp + 9 * N_COLS); -#if defined(LAST_ENC_ROUND_CACHE_TABLES) - itables(t_fl); -#endif - round(fwd_lrnd, b0, b1, kp +10 * N_COLS); - } - -#else - - { aes_32t rnd; -#if (ENC_UNROLL == PARTIAL) - for(rnd = 0; rnd < (cx->rn >> 1) - 1; ++rnd) - { - kp += N_COLS; - round(fwd_rnd, b1, b0, kp); - kp += N_COLS; 
- round(fwd_rnd, b0, b1, kp); - } - kp += N_COLS; - round(fwd_rnd, b1, b0, kp); -#else - for(rnd = 0; rnd < cx->rn - 1; ++rnd) - { - kp += N_COLS; - round(fwd_rnd, b1, b0, kp); - l_copy(b0, b1); - } -#endif -#if defined(LAST_ENC_ROUND_CACHE_TABLES) - itables(t_fl); -#endif - kp += N_COLS; - round(fwd_lrnd, b0, b1, kp); - } -#endif - - state_out(out, b0); - } - -#if defined( AES_ERR_CHK ) - return aes_good; -#endif -} - -#endif - -#if defined(DECRYPTION) && !defined(AES_ASM) - -/* Visual C++ .Net v7.1 provides the fastest encryption code when using - Pentium optimiation with small code but this is poor for decryption - so we need to control this with the following VC++ pragmas -*/ - -#if defined(_MSC_VER) -#pragma optimize( "t", on ) -#endif - -/* Given the column (c) of the output state variable, the following - macros give the input state variables which are needed in its - computation for each row (r) of the state. All the alternative - macros give the same end values but expand into different ways - of calculating these values. In particular the complex macro - used for dynamically variable block sizes is designed to expand - to a compile time constant whenever possible but will expand to - conditional clauses on some branches (I am grateful to Frank - Yellin for this construction) -*/ - -#define inv_var(x,r,c)\ - ( r == 0 ? ( c == 0 ? s(x,0) : c == 1 ? s(x,1) : c == 2 ? s(x,2) : s(x,3))\ - : r == 1 ? ( c == 0 ? s(x,3) : c == 1 ? s(x,0) : c == 2 ? s(x,1) : s(x,2))\ - : r == 2 ? ( c == 0 ? s(x,2) : c == 1 ? s(x,3) : c == 2 ? s(x,0) : s(x,1))\ - : ( c == 0 ? s(x,1) : c == 1 ? s(x,2) : c == 2 ? 
s(x,3) : s(x,0))) - -#if defined(IT4_SET) -#undef dec_imvars -# if defined(DEC_ROUND_CACHE_TABLES) -#define inv_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_cached_tables(x,t_in,inv_var,rf1,c)) -# else -#define inv_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_tables(x,t_in,inv_var,rf1,c)) -# endif -#elif defined(IT1_SET) -#undef dec_imvars -#define inv_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ one_table(x,upr,t_in,inv_var,rf1,c)) -#else -#define inv_rnd(y,x,k,c) (s(y,c) = inv_mcol((k)[c] ^ no_table(x,t_ibox,inv_var,rf1,c))) -#endif - -#if defined(IL4_SET) -# if defined(LAST_DEC_ROUND_CACHE_TABLES) -#define inv_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_cached_tables(x,t_il,inv_var,rf1,c)) -# else -#define inv_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_tables(x,t_il,inv_var,rf1,c)) -# endif -#elif defined(IL1_SET) -#define inv_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ one_table(x,ups,t_il,inv_var,rf1,c)) -#else -#define inv_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ no_table(x,t_ibox,inv_var,rf1,c)) -#endif - -aes_rval aes_decrypt_cbc(const unsigned char *in, const unsigned char *in_iv, unsigned int num_blk, - unsigned char *out, const aes_decrypt_ctx cx[1]) -{ aes_32t locals(b0, b1); - const aes_32t *kptr = cx->ks + cx->rn * N_COLS; - const aes_32t *kp; -#if defined(DEC_ROUND_CACHE_TABLES) - dtables(t_in); -#endif -#if defined(LAST_DEC_ROUND_CACHE_TABLES) - dtables(t_il); -#endif - -#if defined( dec_imvars ) - dec_imvars; /* declare variables for inv_mcol() if needed */ -#endif - -#if defined( AES_ERR_CHK ) - if( cx->rn != 10 && cx->rn != 12 && cx->rn != 14 ) - return aes_error; -#endif - -#if defined(DEC_ROUND_CACHE_TABLES) - itables(t_in); -#endif - - in += AES_BLOCK_SIZE * (num_blk - 1); - out += AES_BLOCK_SIZE * (num_blk - 1); - // Load the last block's ciphertext into b1 - state_in(b1, in); - - for (;num_blk; out -= AES_BLOCK_SIZE, --num_blk) - { - kp = kptr; - // Do the xor part of state_in, where b1 is the previous block's ciphertext. 
- key_in(b0, b1, kp); - -#if (DEC_UNROLL == FULL) - - switch(cx->rn) - { - case 14: - round(inv_rnd, b1, b0, kp - 1 * N_COLS); - round(inv_rnd, b0, b1, kp - 2 * N_COLS); - kp -= 2 * N_COLS; - case 12: - round(inv_rnd, b1, b0, kp - 1 * N_COLS); - round(inv_rnd, b0, b1, kp - 2 * N_COLS); - kp -= 2 * N_COLS; - case 10: - default: - round(inv_rnd, b1, b0, kp - 1 * N_COLS); - round(inv_rnd, b0, b1, kp - 2 * N_COLS); - round(inv_rnd, b1, b0, kp - 3 * N_COLS); - round(inv_rnd, b0, b1, kp - 4 * N_COLS); - round(inv_rnd, b1, b0, kp - 5 * N_COLS); - round(inv_rnd, b0, b1, kp - 6 * N_COLS); - round(inv_rnd, b1, b0, kp - 7 * N_COLS); - round(inv_rnd, b0, b1, kp - 8 * N_COLS); - round(inv_rnd, b1, b0, kp - 9 * N_COLS); -#if defined(LAST_DEC_ROUND_CACHE_TABLES) - itables(t_il); -#endif - round(inv_lrnd, b0, b1, kp - 10 * N_COLS); - } - -#else - - { aes_32t rnd; -#if (DEC_UNROLL == PARTIAL) - for(rnd = 0; rnd < (cx->rn >> 1) - 1; ++rnd) - { - kp -= N_COLS; - round(inv_rnd, b1, b0, kp); - kp -= N_COLS; - round(inv_rnd, b0, b1, kp); - } - kp -= N_COLS; - round(inv_rnd, b1, b0, kp); -#else - for(rnd = 0; rnd < cx->rn - 1; ++rnd) - { - kp -= N_COLS; - round(inv_rnd, b1, b0, kp); - l_copy(b0, b1); - } -#endif -#if defined(LAST_DEC_ROUND_CACHE_TABLES) - itables(t_il); -#endif - kp -= N_COLS; - round(inv_lrnd, b0, b1, kp); - } -#endif - - if (num_blk == 1) - { - // We are doing the first block so we need the IV rather than the previous - // block for CBC (there is no previous block) - state_in(b1, in_iv); - } - else - { - in -= AES_BLOCK_SIZE; - state_in(b1, in); - } - - // Do the CBC with b1 which is either the IV or the ciphertext of the previous block. 
- cbc(b0, b1); - - state_out(out, b0); - } -#if defined( AES_ERR_CHK ) - return aes_good; -#endif -} - -#endif - -#if defined(__cplusplus) -} -#endif diff --git a/bsd/crypto/aes/ppc/aeskey.c b/bsd/crypto/aes/ppc/aeskey.c deleted file mode 100644 index 5e0a6453c..000000000 --- a/bsd/crypto/aes/ppc/aeskey.c +++ /dev/null @@ -1,455 +0,0 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The free distribution and use of this software in both source and binary - form is allowed (with or without changes) provided that: - - 1. distributions of this source code include the above copyright - notice, this list of conditions and the following disclaimer; - - 2. distributions in binary form include the above copyright - notice, this list of conditions and the following disclaimer - in the documentation and/or other associated materials; - - 3. the copyright holder's name is not used to endorse products - built using this software without specific written permission. - - ALTERNATIVELY, provided that this notice is retained in full, this product - may be distributed under the terms of the GNU General Public License (GPL), - in which case the provisions of the GPL apply INSTEAD OF those given above. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. - --------------------------------------------------------------------------- - Issue Date: 26/08/2003 - - This file contains the code for implementing the key schedule for AES - (Rijndael) for block and key sizes of 16, 24, and 32 bytes. See aesopt.h - for further details including optimisation. -*/ - -#include "aesopt.h" -#include "aestab.h" - -#if defined(__cplusplus) -extern "C" -{ -#endif - -/* Initialise the key schedule from the user supplied key. 
The key - length can be specified in bytes, with legal values of 16, 24 - and 32, or in bits, with legal values of 128, 192 and 256. These - values correspond with Nk values of 4, 6 and 8 respectively. - - The following macros implement a single cycle in the key - schedule generation process. The number of cycles needed - for each cx->n_col and nk value is: - - nk = 4 5 6 7 8 - ------------------------------ - cx->n_col = 4 10 9 8 7 7 - cx->n_col = 5 14 11 10 9 9 - cx->n_col = 6 19 15 12 11 11 - cx->n_col = 7 21 19 16 13 14 - cx->n_col = 8 29 23 19 17 14 -*/ - -#define ke4(k,i) \ -{ k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ t_use(r,c)[i]; k[4*(i)+5] = ss[1] ^= ss[0]; \ - k[4*(i)+6] = ss[2] ^= ss[1]; k[4*(i)+7] = ss[3] ^= ss[2]; \ -} -#define kel4(k,i) \ -{ k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ t_use(r,c)[i]; k[4*(i)+5] = ss[1] ^= ss[0]; \ - k[4*(i)+6] = ss[2] ^= ss[1]; k[4*(i)+7] = ss[3] ^= ss[2]; \ -} - -#define ke6(k,i) \ -{ k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ t_use(r,c)[i]; k[6*(i)+ 7] = ss[1] ^= ss[0]; \ - k[6*(i)+ 8] = ss[2] ^= ss[1]; k[6*(i)+ 9] = ss[3] ^= ss[2]; \ - k[6*(i)+10] = ss[4] ^= ss[3]; k[6*(i)+11] = ss[5] ^= ss[4]; \ -} -#define kel6(k,i) \ -{ k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ t_use(r,c)[i]; k[6*(i)+ 7] = ss[1] ^= ss[0]; \ - k[6*(i)+ 8] = ss[2] ^= ss[1]; k[6*(i)+ 9] = ss[3] ^= ss[2]; \ -} - -#define ke8(k,i) \ -{ k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ t_use(r,c)[i]; k[8*(i)+ 9] = ss[1] ^= ss[0]; \ - k[8*(i)+10] = ss[2] ^= ss[1]; k[8*(i)+11] = ss[3] ^= ss[2]; \ - k[8*(i)+12] = ss[4] ^= ls_box(ss[3],0); k[8*(i)+13] = ss[5] ^= ss[4]; \ - k[8*(i)+14] = ss[6] ^= ss[5]; k[8*(i)+15] = ss[7] ^= ss[6]; \ -} -#define kel8(k,i) \ -{ k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ t_use(r,c)[i]; k[8*(i)+ 9] = ss[1] ^= ss[0]; \ - k[8*(i)+10] = ss[2] ^= ss[1]; k[8*(i)+11] = ss[3] ^= ss[2]; \ -} - -#if defined(ENCRYPTION_KEY_SCHEDULE) - -#if defined(AES_128) || defined(AES_VAR) - -aes_rval aes_encrypt_key128(const unsigned char *key, aes_encrypt_ctx cx[1]) 
-{ aes_32t ss[4]; - - cx->ks[0] = ss[0] = word_in(key, 0); - cx->ks[1] = ss[1] = word_in(key, 1); - cx->ks[2] = ss[2] = word_in(key, 2); - cx->ks[3] = ss[3] = word_in(key, 3); - -#if ENC_UNROLL == NONE - { aes_32t i; - - for(i = 0; i < ((11 * N_COLS - 5) / 4); ++i) - ke4(cx->ks, i); - } -#else - ke4(cx->ks, 0); ke4(cx->ks, 1); - ke4(cx->ks, 2); ke4(cx->ks, 3); - ke4(cx->ks, 4); ke4(cx->ks, 5); - ke4(cx->ks, 6); ke4(cx->ks, 7); - ke4(cx->ks, 8); -#endif - kel4(cx->ks, 9); - cx->rn = 10; -#if defined( AES_ERR_CHK ) - return aes_good; -#endif -} - -#endif - -#if defined(AES_192) || defined(AES_VAR) - -aes_rval aes_encrypt_key192(const unsigned char *key, aes_encrypt_ctx cx[1]) -{ aes_32t ss[6]; - - cx->ks[0] = ss[0] = word_in(key, 0); - cx->ks[1] = ss[1] = word_in(key, 1); - cx->ks[2] = ss[2] = word_in(key, 2); - cx->ks[3] = ss[3] = word_in(key, 3); - cx->ks[4] = ss[4] = word_in(key, 4); - cx->ks[5] = ss[5] = word_in(key, 5); - -#if ENC_UNROLL == NONE - { aes_32t i; - - for(i = 0; i < (13 * N_COLS - 7) / 6; ++i) - ke6(cx->ks, i); - } -#else - ke6(cx->ks, 0); ke6(cx->ks, 1); - ke6(cx->ks, 2); ke6(cx->ks, 3); - ke6(cx->ks, 4); ke6(cx->ks, 5); - ke6(cx->ks, 6); -#endif - kel6(cx->ks, 7); - cx->rn = 12; -#if defined( AES_ERR_CHK ) - return aes_good; -#endif -} - -#endif - -#if defined(AES_256) || defined(AES_VAR) - -aes_rval aes_encrypt_key256(const unsigned char *key, aes_encrypt_ctx cx[1]) -{ aes_32t ss[8]; - - cx->ks[0] = ss[0] = word_in(key, 0); - cx->ks[1] = ss[1] = word_in(key, 1); - cx->ks[2] = ss[2] = word_in(key, 2); - cx->ks[3] = ss[3] = word_in(key, 3); - cx->ks[4] = ss[4] = word_in(key, 4); - cx->ks[5] = ss[5] = word_in(key, 5); - cx->ks[6] = ss[6] = word_in(key, 6); - cx->ks[7] = ss[7] = word_in(key, 7); - -#if ENC_UNROLL == NONE - { aes_32t i; - - for(i = 0; i < (15 * N_COLS - 9) / 8; ++i) - ke8(cx->ks, i); - } -#else - ke8(cx->ks, 0); ke8(cx->ks, 1); - ke8(cx->ks, 2); ke8(cx->ks, 3); - ke8(cx->ks, 4); ke8(cx->ks, 5); -#endif - kel8(cx->ks, 6); - cx->rn = 
14; -#if defined( AES_ERR_CHK ) - return aes_good; -#endif -} - -#endif - -#if defined(AES_VAR) - -aes_rval aes_encrypt_key(const unsigned char *key, int key_len, aes_encrypt_ctx cx[1]) -{ - switch(key_len) - { -#if defined( AES_ERR_CHK ) - case 16: case 128: return aes_encrypt_key128(key, cx); - case 24: case 192: return aes_encrypt_key192(key, cx); - case 32: case 256: return aes_encrypt_key256(key, cx); - default: return aes_error; -#else - case 16: case 128: aes_encrypt_key128(key, cx); return; - case 24: case 192: aes_encrypt_key192(key, cx); return; - case 32: case 256: aes_encrypt_key256(key, cx); return; -#endif - } -} - -#endif - -#endif - -#if defined(DECRYPTION_KEY_SCHEDULE) - -#if DEC_ROUND == NO_TABLES -#define ff(x) (x) -#else -#define ff(x) inv_mcol(x) -#if defined( dec_imvars ) -#define d_vars dec_imvars -#endif -#endif - -#if 1 -#define kdf4(k,i) \ -{ ss[0] = ss[0] ^ ss[2] ^ ss[1] ^ ss[3]; ss[1] = ss[1] ^ ss[3]; ss[2] = ss[2] ^ ss[3]; ss[3] = ss[3]; \ - ss[4] = ls_box(ss[(i+3) % 4], 3) ^ t_use(r,c)[i]; ss[i % 4] ^= ss[4]; \ - ss[4] ^= k[4*(i)]; k[4*(i)+4] = ff(ss[4]); ss[4] ^= k[4*(i)+1]; k[4*(i)+5] = ff(ss[4]); \ - ss[4] ^= k[4*(i)+2]; k[4*(i)+6] = ff(ss[4]); ss[4] ^= k[4*(i)+3]; k[4*(i)+7] = ff(ss[4]); \ -} -#define kd4(k,i) \ -{ ss[4] = ls_box(ss[(i+3) % 4], 3) ^ t_use(r,c)[i]; ss[i % 4] ^= ss[4]; ss[4] = ff(ss[4]); \ - k[4*(i)+4] = ss[4] ^= k[4*(i)]; k[4*(i)+5] = ss[4] ^= k[4*(i)+1]; \ - k[4*(i)+6] = ss[4] ^= k[4*(i)+2]; k[4*(i)+7] = ss[4] ^= k[4*(i)+3]; \ -} -#define kdl4(k,i) \ -{ ss[4] = ls_box(ss[(i+3) % 4], 3) ^ t_use(r,c)[i]; ss[i % 4] ^= ss[4]; \ - k[4*(i)+4] = (ss[0] ^= ss[1]) ^ ss[2] ^ ss[3]; k[4*(i)+5] = ss[1] ^ ss[3]; \ - k[4*(i)+6] = ss[0]; k[4*(i)+7] = ss[1]; \ -} -#else -#define kdf4(k,i) \ -{ ss[0] ^= ls_box(ss[3],3) ^ t_use(r,c)[i]; k[4*(i)+ 4] = ff(ss[0]); ss[1] ^= ss[0]; k[4*(i)+ 5] = ff(ss[1]); \ - ss[2] ^= ss[1]; k[4*(i)+ 6] = ff(ss[2]); ss[3] ^= ss[2]; k[4*(i)+ 7] = ff(ss[3]); \ -} -#define kd4(k,i) \ -{ ss[4] = 
ls_box(ss[3],3) ^ t_use(r,c)[i]; \ - ss[0] ^= ss[4]; ss[4] = ff(ss[4]); k[4*(i)+ 4] = ss[4] ^= k[4*(i)]; \ - ss[1] ^= ss[0]; k[4*(i)+ 5] = ss[4] ^= k[4*(i)+ 1]; \ - ss[2] ^= ss[1]; k[4*(i)+ 6] = ss[4] ^= k[4*(i)+ 2]; \ - ss[3] ^= ss[2]; k[4*(i)+ 7] = ss[4] ^= k[4*(i)+ 3]; \ -} -#define kdl4(k,i) \ -{ ss[0] ^= ls_box(ss[3],3) ^ t_use(r,c)[i]; k[4*(i)+ 4] = ss[0]; ss[1] ^= ss[0]; k[4*(i)+ 5] = ss[1]; \ - ss[2] ^= ss[1]; k[4*(i)+ 6] = ss[2]; ss[3] ^= ss[2]; k[4*(i)+ 7] = ss[3]; \ -} -#endif - -#define kdf6(k,i) \ -{ ss[0] ^= ls_box(ss[5],3) ^ t_use(r,c)[i]; k[6*(i)+ 6] = ff(ss[0]); ss[1] ^= ss[0]; k[6*(i)+ 7] = ff(ss[1]); \ - ss[2] ^= ss[1]; k[6*(i)+ 8] = ff(ss[2]); ss[3] ^= ss[2]; k[6*(i)+ 9] = ff(ss[3]); \ - ss[4] ^= ss[3]; k[6*(i)+10] = ff(ss[4]); ss[5] ^= ss[4]; k[6*(i)+11] = ff(ss[5]); \ -} -#define kd6(k,i) \ -{ ss[6] = ls_box(ss[5],3) ^ t_use(r,c)[i]; \ - ss[0] ^= ss[6]; ss[6] = ff(ss[6]); k[6*(i)+ 6] = ss[6] ^= k[6*(i)]; \ - ss[1] ^= ss[0]; k[6*(i)+ 7] = ss[6] ^= k[6*(i)+ 1]; \ - ss[2] ^= ss[1]; k[6*(i)+ 8] = ss[6] ^= k[6*(i)+ 2]; \ - ss[3] ^= ss[2]; k[6*(i)+ 9] = ss[6] ^= k[6*(i)+ 3]; \ - ss[4] ^= ss[3]; k[6*(i)+10] = ss[6] ^= k[6*(i)+ 4]; \ - ss[5] ^= ss[4]; k[6*(i)+11] = ss[6] ^= k[6*(i)+ 5]; \ -} -#define kdl6(k,i) \ -{ ss[0] ^= ls_box(ss[5],3) ^ t_use(r,c)[i]; k[6*(i)+ 6] = ss[0]; ss[1] ^= ss[0]; k[6*(i)+ 7] = ss[1]; \ - ss[2] ^= ss[1]; k[6*(i)+ 8] = ss[2]; ss[3] ^= ss[2]; k[6*(i)+ 9] = ss[3]; \ -} - -#define kdf8(k,i) \ -{ ss[0] ^= ls_box(ss[7],3) ^ t_use(r,c)[i]; k[8*(i)+ 8] = ff(ss[0]); ss[1] ^= ss[0]; k[8*(i)+ 9] = ff(ss[1]); \ - ss[2] ^= ss[1]; k[8*(i)+10] = ff(ss[2]); ss[3] ^= ss[2]; k[8*(i)+11] = ff(ss[3]); \ - ss[4] ^= ls_box(ss[3],0); k[8*(i)+12] = ff(ss[4]); ss[5] ^= ss[4]; k[8*(i)+13] = ff(ss[5]); \ - ss[6] ^= ss[5]; k[8*(i)+14] = ff(ss[6]); ss[7] ^= ss[6]; k[8*(i)+15] = ff(ss[7]); \ -} -#define kd8(k,i) \ -{ aes_32t g = ls_box(ss[7],3) ^ t_use(r,c)[i]; \ - ss[0] ^= g; g = ff(g); k[8*(i)+ 8] = g ^= k[8*(i)]; \ - ss[1] ^= ss[0]; k[8*(i)+ 9] = g 
^= k[8*(i)+ 1]; \ - ss[2] ^= ss[1]; k[8*(i)+10] = g ^= k[8*(i)+ 2]; \ - ss[3] ^= ss[2]; k[8*(i)+11] = g ^= k[8*(i)+ 3]; \ - g = ls_box(ss[3],0); \ - ss[4] ^= g; g = ff(g); k[8*(i)+12] = g ^= k[8*(i)+ 4]; \ - ss[5] ^= ss[4]; k[8*(i)+13] = g ^= k[8*(i)+ 5]; \ - ss[6] ^= ss[5]; k[8*(i)+14] = g ^= k[8*(i)+ 6]; \ - ss[7] ^= ss[6]; k[8*(i)+15] = g ^= k[8*(i)+ 7]; \ -} -#define kdl8(k,i) \ -{ ss[0] ^= ls_box(ss[7],3) ^ t_use(r,c)[i]; k[8*(i)+ 8] = ss[0]; ss[1] ^= ss[0]; k[8*(i)+ 9] = ss[1]; \ - ss[2] ^= ss[1]; k[8*(i)+10] = ss[2]; ss[3] ^= ss[2]; k[8*(i)+11] = ss[3]; \ -} - -#if defined(AES_128) || defined(AES_VAR) - -aes_rval aes_decrypt_key128(const unsigned char *key, aes_decrypt_ctx cx[1]) -{ aes_32t ss[5]; -#if defined( d_vars ) - d_vars; -#endif - cx->ks[0] = ss[0] = word_in(key, 0); - cx->ks[1] = ss[1] = word_in(key, 1); - cx->ks[2] = ss[2] = word_in(key, 2); - cx->ks[3] = ss[3] = word_in(key, 3); - -#if DEC_UNROLL == NONE - { aes_32t i; - - for(i = 0; i < (11 * N_COLS - 5) / 4; ++i) - ke4(cx->ks, i); - kel4(cx->ks, 9); -#if !(DEC_ROUND == NO_TABLES) - for(i = N_COLS; i < 10 * N_COLS; ++i) - cx->ks[i] = inv_mcol(cx->ks[i]); -#endif - } -#else - kdf4(cx->ks, 0); kd4(cx->ks, 1); - kd4(cx->ks, 2); kd4(cx->ks, 3); - kd4(cx->ks, 4); kd4(cx->ks, 5); - kd4(cx->ks, 6); kd4(cx->ks, 7); - kd4(cx->ks, 8); kdl4(cx->ks, 9); -#endif - cx->rn = 10; -#if defined( AES_ERR_CHK ) - return aes_good; -#endif -} - -#endif - -#if defined(AES_192) || defined(AES_VAR) - -aes_rval aes_decrypt_key192(const unsigned char *key, aes_decrypt_ctx cx[1]) -{ aes_32t ss[7]; -#if defined( d_vars ) - d_vars; -#endif - cx->ks[0] = ss[0] = word_in(key, 0); - cx->ks[1] = ss[1] = word_in(key, 1); - cx->ks[2] = ss[2] = word_in(key, 2); - cx->ks[3] = ss[3] = word_in(key, 3); - -#if DEC_UNROLL == NONE - cx->ks[4] = ss[4] = word_in(key, 4); - cx->ks[5] = ss[5] = word_in(key, 5); - { aes_32t i; - - for(i = 0; i < (13 * N_COLS - 7) / 6; ++i) - ke6(cx->ks, i); - kel6(cx->ks, 7); -#if !(DEC_ROUND == NO_TABLES) - 
for(i = N_COLS; i < 12 * N_COLS; ++i) - cx->ks[i] = inv_mcol(cx->ks[i]); -#endif - } -#else - cx->ks[4] = ff(ss[4] = word_in(key, 4)); - cx->ks[5] = ff(ss[5] = word_in(key, 5)); - kdf6(cx->ks, 0); kd6(cx->ks, 1); - kd6(cx->ks, 2); kd6(cx->ks, 3); - kd6(cx->ks, 4); kd6(cx->ks, 5); - kd6(cx->ks, 6); kdl6(cx->ks, 7); -#endif - cx->rn = 12; -#if defined( AES_ERR_CHK ) - return aes_good; -#endif -} - -#endif - -#if defined(AES_256) || defined(AES_VAR) - -aes_rval aes_decrypt_key256(const unsigned char *key, aes_decrypt_ctx cx[1]) -{ aes_32t ss[8]; -#if defined( d_vars ) - d_vars; -#endif - cx->ks[0] = ss[0] = word_in(key, 0); - cx->ks[1] = ss[1] = word_in(key, 1); - cx->ks[2] = ss[2] = word_in(key, 2); - cx->ks[3] = ss[3] = word_in(key, 3); - -#if DEC_UNROLL == NONE - cx->ks[4] = ss[4] = word_in(key, 4); - cx->ks[5] = ss[5] = word_in(key, 5); - cx->ks[6] = ss[6] = word_in(key, 6); - cx->ks[7] = ss[7] = word_in(key, 7); - { aes_32t i; - - for(i = 0; i < (15 * N_COLS - 9) / 8; ++i) - ke8(cx->ks, i); - kel8(cx->ks, i); -#if !(DEC_ROUND == NO_TABLES) - for(i = N_COLS; i < 14 * N_COLS; ++i) - cx->ks[i] = inv_mcol(cx->ks[i]); - -#endif - } -#else - cx->ks[4] = ff(ss[4] = word_in(key, 4)); - cx->ks[5] = ff(ss[5] = word_in(key, 5)); - cx->ks[6] = ff(ss[6] = word_in(key, 6)); - cx->ks[7] = ff(ss[7] = word_in(key, 7)); - kdf8(cx->ks, 0); kd8(cx->ks, 1); - kd8(cx->ks, 2); kd8(cx->ks, 3); - kd8(cx->ks, 4); kd8(cx->ks, 5); - kdl8(cx->ks, 6); -#endif - cx->rn = 14; -#if defined( AES_ERR_CHK ) - return aes_good; -#endif -} - -#endif - -#if defined(AES_VAR) - -aes_rval aes_decrypt_key(const unsigned char *key, int key_len, aes_decrypt_ctx cx[1]) -{ - switch(key_len) - { -#if defined( AES_ERR_CHK ) - case 16: case 128: return aes_decrypt_key128(key, cx); - case 24: case 192: return aes_decrypt_key192(key, cx); - case 32: case 256: return aes_decrypt_key256(key, cx); - default: return aes_error; -#else - case 16: case 128: aes_decrypt_key128(key, cx); return; - case 24: case 192: 
aes_decrypt_key192(key, cx); return; - case 32: case 256: aes_decrypt_key256(key, cx); return; -#endif - } -} - -#endif - -#endif - -#if defined(__cplusplus) -} -#endif diff --git a/bsd/crypto/aes/ppc/aesopt.h b/bsd/crypto/aes/ppc/aesopt.h deleted file mode 100644 index 2b78eb920..000000000 --- a/bsd/crypto/aes/ppc/aesopt.h +++ /dev/null @@ -1,753 +0,0 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The free distribution and use of this software in both source and binary - form is allowed (with or without changes) provided that: - - 1. distributions of this source code include the above copyright - notice, this list of conditions and the following disclaimer; - - 2. distributions in binary form include the above copyright - notice, this list of conditions and the following disclaimer - in the documentation and/or other associated materials; - - 3. the copyright holder's name is not used to endorse products - built using this software without specific written permission. - - ALTERNATIVELY, provided that this notice is retained in full, this product - may be distributed under the terms of the GNU General Public License (GPL), - in which case the provisions of the GPL apply INSTEAD OF those given above. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. - --------------------------------------------------------------------------- - Issue 28/01/2004 - - My thanks go to Dag Arne Osvik for devising the schemes used here for key - length derivation from the form of the key schedule - - This file contains the compilation options for AES (Rijndael) and code - that is common across encryption, key scheduling and table generation. 
- - OPERATION - - These source code files implement the AES algorithm Rijndael designed by - Joan Daemen and Vincent Rijmen. This version is designed for the standard - block size of 16 bytes and for key sizes of 128, 192 and 256 bits (16, 24 - and 32 bytes). - - This version is designed for flexibility and speed using operations on - 32-bit words rather than operations on bytes. It can be compiled with - either big or little endian internal byte order but is faster when the - native byte order for the processor is used. - - THE CIPHER INTERFACE - - The cipher interface is implemented as an array of bytes in which lower - AES bit sequence indexes map to higher numeric significance within bytes. - - aes_08t (an unsigned 8-bit type) - aes_32t (an unsigned 32-bit type) - struct aes_encrypt_ctx (structure for the cipher encryption context) - struct aes_decrypt_ctx (structure for the cipher decryption context) - aes_rval the function return type - - C subroutine calls: - - aes_rval aes_encrypt_key128(const unsigned char *key, aes_encrypt_ctx cx[1]); - aes_rval aes_encrypt_key192(const unsigned char *key, aes_encrypt_ctx cx[1]); - aes_rval aes_encrypt_key256(const unsigned char *key, aes_encrypt_ctx cx[1]); - aes_rval aes_encrypt(const unsigned char *in, unsigned char *out, - const aes_encrypt_ctx cx[1]); - - aes_rval aes_decrypt_key128(const unsigned char *key, aes_decrypt_ctx cx[1]); - aes_rval aes_decrypt_key192(const unsigned char *key, aes_decrypt_ctx cx[1]); - aes_rval aes_decrypt_key256(const unsigned char *key, aes_decrypt_ctx cx[1]); - aes_rval aes_decrypt(const unsigned char *in, unsigned char *out, - const aes_decrypt_ctx cx[1]); - - IMPORTANT NOTE: If you are using this C interface with dynamic tables make sure that - you call genTabs() before AES is used so that the tables are initialised. 
- - C++ aes class subroutines: - - Class AESencrypt for encryption - - Construtors: - AESencrypt(void) - AESencrypt(const unsigned char *key) - 128 bit key - Members: - aes_rval key128(const unsigned char *key) - aes_rval key192(const unsigned char *key) - aes_rval key256(const unsigned char *key) - aes_rval encrypt(const unsigned char *in, unsigned char *out) const - - Class AESdecrypt for encryption - Construtors: - AESdecrypt(void) - AESdecrypt(const unsigned char *key) - 128 bit key - Members: - aes_rval key128(const unsigned char *key) - aes_rval key192(const unsigned char *key) - aes_rval key256(const unsigned char *key) - aes_rval decrypt(const unsigned char *in, unsigned char *out) const - - COMPILATION - - The files used to provide AES (Rijndael) are - - a. aes.h for the definitions needed for use in C. - b. aescpp.h for the definitions needed for use in C++. - c. aesopt.h for setting compilation options (also includes common code). - d. aescrypt.c for encryption and decrytpion, or - e. aeskey.c for key scheduling. - f. aestab.c for table loading or generation. - g. aescrypt.asm for encryption and decryption using assembler code. - h. aescrypt.mmx.asm for encryption and decryption using MMX assembler. - - To compile AES (Rijndael) for use in C code use aes.h and set the - defines here for the facilities you need (key lengths, encryption - and/or decryption). Do not define AES_DLL or AES_CPP. Set the options - for optimisations and table sizes here. - - To compile AES (Rijndael) for use in in C++ code use aescpp.h but do - not define AES_DLL - - To compile AES (Rijndael) in C as a Dynamic Link Library DLL) use - aes.h and include the AES_DLL define. - - CONFIGURATION OPTIONS (here and in aes.h) - - a. set AES_DLL in aes.h if AES (Rijndael) is to be compiled as a DLL - b. You may need to set PLATFORM_BYTE_ORDER to define the byte order. - c. If you want the code to run in a specific internal byte order, then - ALGORITHM_BYTE_ORDER must be set accordingly. 
- d. set other configuration options decribed below. -*/ - -#if !defined( _AESOPT_H ) -#define _AESOPT_H - -#include - -/* CONFIGURATION - USE OF DEFINES - - Later in this section there are a number of defines that control the - operation of the code. In each section, the purpose of each define is - explained so that the relevant form can be included or excluded by - setting either 1's or 0's respectively on the branches of the related - #if clauses. - - PLATFORM SPECIFIC INCLUDES AND BYTE ORDER IN 32-BIT WORDS - - To obtain the highest speed on processors with 32-bit words, this code - needs to determine the byte order of the target machine. The following - block of code is an attempt to capture the most obvious ways in which - various environemnts define byte order. It may well fail, in which case - the definitions will need to be set by editing at the points marked - **** EDIT HERE IF NECESSARY **** below. My thanks go to Peter Gutmann - for his assistance with this endian detection nightmare. 
-*/ - -#define BRG_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ -#define BRG_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ - -#if defined(__GNUC__) || defined(__GNU_LIBRARY__) -# if defined(__FreeBSD__) || defined(__OpenBSD__) -# include -# elif defined( BSD ) && BSD >= 199103 -# include -# elif defined(__APPLE__) -# if defined(__BIG_ENDIAN__) && !defined( BIG_ENDIAN ) -# define BIG_ENDIAN -# elif defined(__LITTLE_ENDIAN__) && !defined( LITTLE_ENDIAN ) -# define LITTLE_ENDIAN -# endif -# else -# include -# if defined(__BEOS__) -# include -# endif -# endif -#endif - -#if !defined(PLATFORM_BYTE_ORDER) -# if defined(LITTLE_ENDIAN) || defined(BIG_ENDIAN) -# if defined(LITTLE_ENDIAN) && !defined(BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN -# elif !defined(LITTLE_ENDIAN) && defined(BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN -# elif defined(BYTE_ORDER) && (BYTE_ORDER == LITTLE_ENDIAN) -# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN -# elif defined(BYTE_ORDER) && (BYTE_ORDER == BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN -# endif -# elif defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN) -# if defined(_LITTLE_ENDIAN) && !defined(_BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN -# elif !defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN -# elif defined(_BYTE_ORDER) && (_BYTE_ORDER == _LITTLE_ENDIAN) -# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN -# elif defined(_BYTE_ORDER) && (_BYTE_ORDER == _BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN -# endif -# elif defined(__LITTLE_ENDIAN__) || defined(__BIG_ENDIAN__) -# if defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__) -# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN -# elif !defined(__LITTLE_ENDIAN__) && defined(__BIG_ENDIAN__) -# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN -# elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __LITTLE_ENDIAN__) -# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN -# 
elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __BIG_ENDIAN__) -# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN -# endif -# endif -#endif - -/* if the platform is still unknown, try to find its byte order */ -/* from commonly used machine defines */ - -#if !defined(PLATFORM_BYTE_ORDER) - -#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \ - defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \ - defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \ - defined( vax ) || defined( vms ) || defined( VMS ) || \ - defined( __VMS ) -# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN - -#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \ - defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \ - defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \ - defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \ - defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \ - defined( __TANDEM ) || defined( THINK_C ) || defined( __VMCMS__ ) -# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN - -#elif 0 /* **** EDIT HERE IF NECESSARY **** */ -# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN -#elif 0 /* **** EDIT HERE IF NECESSARY **** */ -# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN -#else -# error Please edit aesopt.h (line 234 or 236) to set the platform byte order -#endif - -#endif - -/* SOME LOCAL DEFINITIONS */ - -#define NO_TABLES 0 -#define ONE_TABLE 1 -#define FOUR_TABLES 4 -#define NONE 0 -#define PARTIAL 1 -#define FULL 2 - -#if defined(bswap32) -#define aes_sw32 bswap32 -#elif defined(bswap_32) -#define aes_sw32 bswap_32 -#else -#define brot(x,n) (((aes_32t)(x) << n) | ((aes_32t)(x) >> (32 - n))) -#define aes_sw32(x) ((brot((x),8) & 0x00ff00ff) | (brot((x),24) & 0xff00ff00)) -#endif - -/* 1. FUNCTIONS REQUIRED - - This implementation provides subroutines for encryption, decryption - and for setting the three key lengths (separately) for encryption - and decryption. 
When the assembler code is not being used the following - definition blocks allow the selection of the routines that are to be - included in the compilation. -*/ -#if defined( AES_ENCRYPT ) -#define ENCRYPTION -#define ENCRYPTION_KEY_SCHEDULE -#endif - -#if defined( AES_DECRYPT ) -#define DECRYPTION -#define DECRYPTION_KEY_SCHEDULE -#endif - -/* 2. ASSEMBLER SUPPORT - - This define (which can be on the command line) enables the use of the - assembler code routines for encryption and decryption with the C code - only providing key scheduling -*/ -#if 0 && !defined(AES_ASM) -#define AES_ASM -#endif - -/* 3. BYTE ORDER WITHIN 32 BIT WORDS - - The fundamental data processing units in Rijndael are 8-bit bytes. The - input, output and key input are all enumerated arrays of bytes in which - bytes are numbered starting at zero and increasing to one less than the - number of bytes in the array in question. This enumeration is only used - for naming bytes and does not imply any adjacency or order relationship - from one byte to another. When these inputs and outputs are considered - as bit sequences, bits 8*n to 8*n+7 of the bit sequence are mapped to - byte[n] with bit 8n+i in the sequence mapped to bit 7-i within the byte. - In this implementation bits are numbered from 0 to 7 starting at the - numerically least significant end of each byte (bit n represents 2^n). - - However, Rijndael can be implemented more efficiently using 32-bit - words by packing bytes into words so that bytes 4*n to 4*n+3 are placed - into word[n]. While in principle these bytes can be assembled into words - in any positions, this implementation only supports the two formats in - which bytes in adjacent positions within words also have adjacent byte - numbers. This order is called big-endian if the lowest numbered bytes - in words have the highest numeric significance and little-endian if the - opposite applies. 
- - This code can work in either order irrespective of the order used by the - machine on which it runs. Normally the internal byte order will be set - to the order of the processor on which the code is to be run but this - define can be used to reverse this in special situations - - NOTE: Assembler code versions rely on PLATFORM_BYTE_ORDER being set -*/ -#if 1 || defined(AES_ASM) -#define ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER -#elif 0 -#define ALGORITHM_BYTE_ORDER BRG_LITTLE_ENDIAN -#elif 0 -#define ALGORITHM_BYTE_ORDER BRG_BIG_ENDIAN -#else -#error The algorithm byte order is not defined -#endif - -/* 4. FAST INPUT/OUTPUT OPERATIONS. - - On some machines it is possible to improve speed by transferring the - bytes in the input and output arrays to and from the internal 32-bit - variables by addressing these arrays as if they are arrays of 32-bit - words. On some machines this will always be possible but there may - be a large performance penalty if the byte arrays are not aligned on - the normal word boundaries. On other machines this technique will - lead to memory access errors when such 32-bit word accesses are not - properly aligned. The option SAFE_IO avoids such problems but will - often be slower on those machines that support misaligned access - (especially so if care is taken to align the input and output byte - arrays on 32-bit word boundaries). If SAFE_IO is not defined it is - assumed that access to byte arrays as if they are arrays of 32-bit - words will not cause problems when such accesses are misaligned. -*/ -#if 0 && !defined(_MSC_VER) -#define SAFE_IO -#endif - -/* 5. LOOP UNROLLING - - The code for encryption and decrytpion cycles through a number of rounds - that can be implemented either in a loop or by expanding the code into a - long sequence of instructions, the latter producing a larger program but - one that will often be much faster. The latter is called loop unrolling. 
- There are also potential speed advantages in expanding two iterations in - a loop with half the number of iterations, which is called partial loop - unrolling. The following options allow partial or full loop unrolling - to be set independently for encryption and decryption -*/ -#if 1 -#define ENC_UNROLL FULL -#elif 0 -#define ENC_UNROLL PARTIAL -#else -#define ENC_UNROLL NONE -#endif - -#if 1 -#define DEC_UNROLL FULL -#elif 0 -#define DEC_UNROLL PARTIAL -#else -#define DEC_UNROLL NONE -#endif - -/* 6. FAST FINITE FIELD OPERATIONS - - If this section is included, tables are used to provide faster finite - field arithmetic (this has no effect if FIXED_TABLES is defined). -*/ -#if 1 -#define FF_TABLES -#endif - -/* 7. INTERNAL STATE VARIABLE FORMAT - - The internal state of Rijndael is stored in a number of local 32-bit - word varaibles which can be defined either as an array or as individual - names variables. Include this section if you want to store these local - varaibles in arrays. Otherwise individual local variables will be used. -*/ -#if 0 -#define ARRAYS -#endif - -/* In this implementation the columns of the state array are each held in - 32-bit words. The state array can be held in various ways: in an array - of words, in a number of individual word variables or in a number of - processor registers. The following define maps a variable name x and - a column number c to the way the state array variable is to be held. - The first define below maps the state into an array x[c] whereas the - second form maps the state into a number of individual variables x0, - x1, etc. Another form could map individual state colums to machine - register names. -*/ - -#if defined(ARRAYS) -#define s(x,c) x[c] -#else -#define s(x,c) x##c -#endif - -/* 8. FIXED OR DYNAMIC TABLES - - When this section is included the tables used by the code are compiled - statically into the binary file. 
Otherwise the subroutine gen_tabs() - must be called to compute them before the code is first used. -*/ -#if 1 -#define FIXED_TABLES -#endif - -/* 9. TABLE ALIGNMENT - - On some sytsems speed will be improved by aligning the AES large lookup - tables on particular boundaries. This define should be set to a power of - two giving the desired alignment. It can be left undefined if alignment - is not needed. This option is specific to the Microsft VC++ compiler - - it seems to sometimes cause trouble for the VC++ version 6 compiler. -*/ - -#if 0 && defined(_MSC_VER) && (_MSC_VER >= 1300) -#define TABLE_ALIGN 64 -#endif - -/* 10. INTERNAL TABLE CONFIGURATION - - This cipher proceeds by repeating in a number of cycles known as 'rounds' - which are implemented by a round function which can optionally be speeded - up using tables. The basic tables are each 256 32-bit words, with either - one or four tables being required for each round function depending on - how much speed is required. The encryption and decryption round functions - are different and the last encryption and decrytpion round functions are - different again making four different round functions in all. - - This means that: - 1. Normal encryption and decryption rounds can each use either 0, 1 - or 4 tables and table spaces of 0, 1024 or 4096 bytes each. - 2. The last encryption and decryption rounds can also use either 0, 1 - or 4 tables and table spaces of 0, 1024 or 4096 bytes each. - - Include or exclude the appropriate definitions below to set the number - of tables used by this implementation. 
-*/ - -#if 1 /* set tables for the normal encryption round */ -#define ENC_ROUND FOUR_TABLES -#elif 0 -#define ENC_ROUND ONE_TABLE -#else -#define ENC_ROUND NO_TABLES -#endif - -#if 1 /* set tables for the last encryption round */ -#define LAST_ENC_ROUND FOUR_TABLES -#elif 0 -#define LAST_ENC_ROUND ONE_TABLE -#else -#define LAST_ENC_ROUND NO_TABLES -#endif - -#if 1 /* set tables for the normal decryption round */ -#define DEC_ROUND FOUR_TABLES -#elif 0 -#define DEC_ROUND ONE_TABLE -#else -#define DEC_ROUND NO_TABLES -#endif - -#if 1 /* set tables for the last decryption round */ -#define LAST_DEC_ROUND FOUR_TABLES -#elif 0 -#define LAST_DEC_ROUND ONE_TABLE -#else -#define LAST_DEC_ROUND NO_TABLES -#endif - -/* The decryption key schedule can be speeded up with tables in the same - way that the round functions can. Include or exclude the following - defines to set this requirement. -*/ -#if 1 -#define KEY_SCHED FOUR_TABLES -#elif 0 -#define KEY_SCHED ONE_TABLE -#else -#define KEY_SCHED NO_TABLES -#endif - -/* 11. TABLE POINTER CACHING - - Normally tables are referenced directly, Enable this option if you wish to - cache pointers to the tables in the encrypt/decrypt code. Note that this - only works if you are using FOUR_TABLES for the ROUND you enable this for. 
-*/ -#if 1 -#define ENC_ROUND_CACHE_TABLES -#endif -#if 1 -#define LAST_ENC_ROUND_CACHE_TABLES -#endif -#if 1 -#define DEC_ROUND_CACHE_TABLES -#endif -#if 1 -#define LAST_DEC_ROUND_CACHE_TABLES -#endif - - -/* END OF CONFIGURATION OPTIONS */ - -#define RC_LENGTH (5 * (AES_BLOCK_SIZE / 4 - 2)) - -/* Disable or report errors on some combinations of options */ - -#if ENC_ROUND == NO_TABLES && LAST_ENC_ROUND != NO_TABLES -#undef LAST_ENC_ROUND -#define LAST_ENC_ROUND NO_TABLES -#elif ENC_ROUND == ONE_TABLE && LAST_ENC_ROUND == FOUR_TABLES -#undef LAST_ENC_ROUND -#define LAST_ENC_ROUND ONE_TABLE -#endif - -#if ENC_ROUND == NO_TABLES && ENC_UNROLL != NONE -#undef ENC_UNROLL -#define ENC_UNROLL NONE -#endif - -#if DEC_ROUND == NO_TABLES && LAST_DEC_ROUND != NO_TABLES -#undef LAST_DEC_ROUND -#define LAST_DEC_ROUND NO_TABLES -#elif DEC_ROUND == ONE_TABLE && LAST_DEC_ROUND == FOUR_TABLES -#undef LAST_DEC_ROUND -#define LAST_DEC_ROUND ONE_TABLE -#endif - -#if DEC_ROUND == NO_TABLES && DEC_UNROLL != NONE -#undef DEC_UNROLL -#define DEC_UNROLL NONE -#endif - -/* upr(x,n): rotates bytes within words by n positions, moving bytes to - higher index positions with wrap around into low positions - ups(x,n): moves bytes by n positions to higher index positions in - words but without wrap around - bval(x,n): extracts a byte from a word - - NOTE: The definitions given here are intended only for use with - unsigned variables and with shift counts that are compile - time constants -*/ - -#if (ALGORITHM_BYTE_ORDER == BRG_LITTLE_ENDIAN) -#define upr(x,n) (((aes_32t)(x) << (8 * (n))) | ((aes_32t)(x) >> (32 - 8 * (n)))) -#define ups(x,n) ((aes_32t) (x) << (8 * (n))) -#define bval(x,n) ((aes_08t)((x) >> (8 * (n)))) -#define bytes2word(b0, b1, b2, b3) \ - (((aes_32t)(b3) << 24) | ((aes_32t)(b2) << 16) | ((aes_32t)(b1) << 8) | (b0)) -#endif - -#if (ALGORITHM_BYTE_ORDER == BRG_BIG_ENDIAN) -#define upr(x,n) (((aes_32t)(x) >> (8 * (n))) | ((aes_32t)(x) << (32 - 8 * (n)))) -#define ups(x,n) 
((aes_32t) (x) >> (8 * (n)))) -#define bval(x,n) ((aes_08t)((x) >> (24 - 8 * (n)))) -#define bytes2word(b0, b1, b2, b3) \ - (((aes_32t)(b0) << 24) | ((aes_32t)(b1) << 16) | ((aes_32t)(b2) << 8) | (b3)) -#endif - -#if defined(SAFE_IO) - -#define word_in(x,c) bytes2word(((aes_08t*)(x)+4*c)[0], ((aes_08t*)(x)+4*c)[1], \ - ((aes_08t*)(x)+4*c)[2], ((aes_08t*)(x)+4*c)[3]) -#define word_out(x,c,v) { ((aes_08t*)(x)+4*c)[0] = bval(v,0); ((aes_08t*)(x)+4*c)[1] = bval(v,1); \ - ((aes_08t*)(x)+4*c)[2] = bval(v,2); ((aes_08t*)(x)+4*c)[3] = bval(v,3); } - -#elif (ALGORITHM_BYTE_ORDER == PLATFORM_BYTE_ORDER) - -#define word_in(x,c) (*((aes_32t*)(x)+(c))) -#define word_out(x,c,v) (*((aes_32t*)(x)+(c)) = (v)) - -#else - -#define word_in(x,c) aes_sw32(*((aes_32t*)(x)+(c))) -#define word_out(x,c,v) (*((aes_32t*)(x)+(c)) = aes_sw32(v)) - -#endif - -/* the finite field modular polynomial and elements */ - -#define WPOLY 0x011b -#define BPOLY 0x1b - -/* multiply four bytes in GF(2^8) by 'x' {02} in parallel */ - -#define m1 0x80808080 -#define m2 0x7f7f7f7f -#define gf_mulx(x) ((((x) & m2) << 1) ^ ((((x) & m1) >> 7) * BPOLY)) - -/* The following defines provide alternative definitions of gf_mulx that might - give improved performance if a fast 32-bit multiply is not available. Note - that a temporary variable u needs to be defined where gf_mulx is used. 
- -#define gf_mulx(x) (u = (x) & m1, u |= (u >> 1), ((x) & m2) << 1) ^ ((u >> 3) | (u >> 6)) -#define m4 (0x01010101 * BPOLY) -#define gf_mulx(x) (u = (x) & m1, ((x) & m2) << 1) ^ ((u - (u >> 7)) & m4) -*/ - -/* Work out which tables are needed for the different options */ - -#if defined( AES_ASM ) -#if defined( ENC_ROUND ) -#undef ENC_ROUND -#endif -#define ENC_ROUND FOUR_TABLES -#if defined( LAST_ENC_ROUND ) -#undef LAST_ENC_ROUND -#endif -#define LAST_ENC_ROUND FOUR_TABLES -#if defined( DEC_ROUND ) -#undef DEC_ROUND -#endif -#define DEC_ROUND FOUR_TABLES -#if defined( LAST_DEC_ROUND ) -#undef LAST_DEC_ROUND -#endif -#define LAST_DEC_ROUND FOUR_TABLES -#if defined( KEY_SCHED ) -#undef KEY_SCHED -#define KEY_SCHED FOUR_TABLES -#endif -#endif - -#if defined(ENCRYPTION) || defined(AES_ASM) -#if ENC_ROUND == ONE_TABLE -#define FT1_SET -#elif ENC_ROUND == FOUR_TABLES -#define FT4_SET -#else -#define SBX_SET -#endif -#if LAST_ENC_ROUND == ONE_TABLE -#define FL1_SET -#elif LAST_ENC_ROUND == FOUR_TABLES -#define FL4_SET -#elif !defined(SBX_SET) -#define SBX_SET -#endif -#endif - -#if defined(DECRYPTION) || defined(AES_ASM) -#if DEC_ROUND == ONE_TABLE -#define IT1_SET -#elif DEC_ROUND == FOUR_TABLES -#define IT4_SET -#else -#define ISB_SET -#endif -#if LAST_DEC_ROUND == ONE_TABLE -#define IL1_SET -#elif LAST_DEC_ROUND == FOUR_TABLES -#define IL4_SET -#elif !defined(ISB_SET) -#define ISB_SET -#endif -#endif - -#if defined(ENCRYPTION_KEY_SCHEDULE) || defined(DECRYPTION_KEY_SCHEDULE) -#if KEY_SCHED == ONE_TABLE -#define LS1_SET -#define IM1_SET -#elif KEY_SCHED == FOUR_TABLES -#define LS4_SET -#define IM4_SET -#elif !defined(SBX_SET) -#define SBX_SET -#endif -#endif - -/* generic definitions of Rijndael macros that use tables */ - -#define no_table(x,box,vf,rf,c) bytes2word( \ - box[bval(vf(x,0,c),rf(0,c))], \ - box[bval(vf(x,1,c),rf(1,c))], \ - box[bval(vf(x,2,c),rf(2,c))], \ - box[bval(vf(x,3,c),rf(3,c))]) - -#define one_table(x,op,tab,vf,rf,c) \ - ( 
tab[bval(vf(x,0,c),rf(0,c))] \ - ^ op(tab[bval(vf(x,1,c),rf(1,c))],1) \ - ^ op(tab[bval(vf(x,2,c),rf(2,c))],2) \ - ^ op(tab[bval(vf(x,3,c),rf(3,c))],3)) - -#define four_tables(x,tab,vf,rf,c) \ - ( tab[0][bval(vf(x,0,c),rf(0,c))] \ - ^ tab[1][bval(vf(x,1,c),rf(1,c))] \ - ^ tab[2][bval(vf(x,2,c),rf(2,c))] \ - ^ tab[3][bval(vf(x,3,c),rf(3,c))]) - -#define four_cached_tables(x,tab,vf,rf,c) \ -( tab##0[bval(vf(x,0,c),rf(0,c))] \ - ^ tab##1[bval(vf(x,1,c),rf(1,c))] \ - ^ tab##2[bval(vf(x,2,c),rf(2,c))] \ - ^ tab##3[bval(vf(x,3,c),rf(3,c))]) - -#define vf1(x,r,c) (x) -#define rf1(r,c) (r) -#define rf2(r,c) ((8+r-c)&3) - -/* perform forward and inverse column mix operation on four bytes in long word x in */ -/* parallel. NOTE: x must be a simple variable, NOT an expression in these macros. */ - -#if defined(FM4_SET) /* not currently used */ -#define fwd_mcol(x) four_tables(x,t_use(f,m),vf1,rf1,0) -#elif defined(FM1_SET) /* not currently used */ -#define fwd_mcol(x) one_table(x,upr,t_use(f,m),vf1,rf1,0) -#else -#define dec_fmvars aes_32t g2 -#define fwd_mcol(x) (g2 = gf_mulx(x), g2 ^ upr((x) ^ g2, 3) ^ upr((x), 2) ^ upr((x), 1)) -#endif - -#if defined(IM4_SET) -#define inv_mcol(x) four_tables(x,t_use(i,m),vf1,rf1,0) -#elif defined(IM1_SET) -#define inv_mcol(x) one_table(x,upr,t_use(i,m),vf1,rf1,0) -#else -#define dec_imvars aes_32t g2, g4, g9 -#define inv_mcol(x) (g2 = gf_mulx(x), g4 = gf_mulx(g2), g9 = (x) ^ gf_mulx(g4), g4 ^= g9, \ - (x) ^ g2 ^ g4 ^ upr(g2 ^ g9, 3) ^ upr(g4, 2) ^ upr(g9, 1)) -#endif - -#if defined(FL4_SET) -#define ls_box(x,c) four_tables(x,t_use(f,l),vf1,rf2,c) -#elif defined(LS4_SET) -#define ls_box(x,c) four_tables(x,t_use(l,s),vf1,rf2,c) -#elif defined(FL1_SET) -#define ls_box(x,c) one_table(x,upr,t_use(f,l),vf1,rf2,c) -#elif defined(LS1_SET) -#define ls_box(x,c) one_table(x,upr,t_use(l,s),vf1,rf2,c) -#else -#define ls_box(x,c) no_table(x,t_use(s,box),vf1,rf2,c) -#endif - -#endif diff --git a/bsd/crypto/aes/ppc/aestab.c b/bsd/crypto/aes/ppc/aestab.c 
deleted file mode 100644 index dfd2ee969..000000000 --- a/bsd/crypto/aes/ppc/aestab.c +++ /dev/null @@ -1,384 +0,0 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The free distribution and use of this software in both source and binary - form is allowed (with or without changes) provided that: - - 1. distributions of this source code include the above copyright - notice, this list of conditions and the following disclaimer; - - 2. distributions in binary form include the above copyright - notice, this list of conditions and the following disclaimer - in the documentation and/or other associated materials; - - 3. the copyright holder's name is not used to endorse products - built using this software without specific written permission. - - ALTERNATIVELY, provided that this notice is retained in full, this product - may be distributed under the terms of the GNU General Public License (GPL), - in which case the provisions of the GPL apply INSTEAD OF those given above. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. 
- --------------------------------------------------------------------------- - Issue 28/01/2004 - -*/ - -#if defined(__cplusplus) -extern "C" -{ -#endif - -#define DO_TABLES - -#include "aesopt.h" - -#if defined(FIXED_TABLES) - -#define sb_data(w) {\ - w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\ - w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\ - w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\ - w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\ - w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\ - w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\ - w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\ - w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\ - w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\ - w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\ - w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\ - w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\ - w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\ - w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\ - w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\ - w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\ - w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\ - w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\ - w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\ - w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\ - w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\ - w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\ - w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\ - w(0x6c), w(0x56), 
w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\ - w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\ - w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\ - w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\ - w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),\ - w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\ - w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\ - w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\ - w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) } - -#define isb_data(w) {\ - w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), w(0x38),\ - w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), w(0xd7), w(0xfb),\ - w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), w(0x2f), w(0xff), w(0x87),\ - w(0x34), w(0x8e), w(0x43), w(0x44), w(0xc4), w(0xde), w(0xe9), w(0xcb),\ - w(0x54), w(0x7b), w(0x94), w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d),\ - w(0xee), w(0x4c), w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e),\ - w(0x08), w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2),\ - w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), w(0x25),\ - w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), w(0x98), w(0x16),\ - w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), w(0x65), w(0xb6), w(0x92),\ - w(0x6c), w(0x70), w(0x48), w(0x50), w(0xfd), w(0xed), w(0xb9), w(0xda),\ - w(0x5e), w(0x15), w(0x46), w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84),\ - w(0x90), w(0xd8), w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a),\ - w(0xf7), w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06),\ - w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), w(0x02),\ - w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), w(0x8a), w(0x6b),\ - w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), w(0x67), w(0xdc), w(0xea),\ - w(0x97), w(0xf2), w(0xcf), w(0xce), 
w(0xf0), w(0xb4), w(0xe6), w(0x73),\ - w(0x96), w(0xac), w(0x74), w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85),\ - w(0xe2), w(0xf9), w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e),\ - w(0x47), w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89),\ - w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), w(0x1b),\ - w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), w(0x79), w(0x20),\ - w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), w(0xcd), w(0x5a), w(0xf4),\ - w(0x1f), w(0xdd), w(0xa8), w(0x33), w(0x88), w(0x07), w(0xc7), w(0x31),\ - w(0xb1), w(0x12), w(0x10), w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f),\ - w(0x60), w(0x51), w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d),\ - w(0x2d), w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef),\ - w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), w(0xb0),\ - w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), w(0x99), w(0x61),\ - w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), w(0x77), w(0xd6), w(0x26),\ - w(0xe1), w(0x69), w(0x14), w(0x63), w(0x55), w(0x21), w(0x0c), w(0x7d) } - -#define mm_data(w) {\ - w(0x00), w(0x01), w(0x02), w(0x03), w(0x04), w(0x05), w(0x06), w(0x07),\ - w(0x08), w(0x09), w(0x0a), w(0x0b), w(0x0c), w(0x0d), w(0x0e), w(0x0f),\ - w(0x10), w(0x11), w(0x12), w(0x13), w(0x14), w(0x15), w(0x16), w(0x17),\ - w(0x18), w(0x19), w(0x1a), w(0x1b), w(0x1c), w(0x1d), w(0x1e), w(0x1f),\ - w(0x20), w(0x21), w(0x22), w(0x23), w(0x24), w(0x25), w(0x26), w(0x27),\ - w(0x28), w(0x29), w(0x2a), w(0x2b), w(0x2c), w(0x2d), w(0x2e), w(0x2f),\ - w(0x30), w(0x31), w(0x32), w(0x33), w(0x34), w(0x35), w(0x36), w(0x37),\ - w(0x38), w(0x39), w(0x3a), w(0x3b), w(0x3c), w(0x3d), w(0x3e), w(0x3f),\ - w(0x40), w(0x41), w(0x42), w(0x43), w(0x44), w(0x45), w(0x46), w(0x47),\ - w(0x48), w(0x49), w(0x4a), w(0x4b), w(0x4c), w(0x4d), w(0x4e), w(0x4f),\ - w(0x50), w(0x51), w(0x52), w(0x53), w(0x54), w(0x55), w(0x56), w(0x57),\ - w(0x58), w(0x59), w(0x5a), w(0x5b), w(0x5c), w(0x5d), 
w(0x5e), w(0x5f),\ - w(0x60), w(0x61), w(0x62), w(0x63), w(0x64), w(0x65), w(0x66), w(0x67),\ - w(0x68), w(0x69), w(0x6a), w(0x6b), w(0x6c), w(0x6d), w(0x6e), w(0x6f),\ - w(0x70), w(0x71), w(0x72), w(0x73), w(0x74), w(0x75), w(0x76), w(0x77),\ - w(0x78), w(0x79), w(0x7a), w(0x7b), w(0x7c), w(0x7d), w(0x7e), w(0x7f),\ - w(0x80), w(0x81), w(0x82), w(0x83), w(0x84), w(0x85), w(0x86), w(0x87),\ - w(0x88), w(0x89), w(0x8a), w(0x8b), w(0x8c), w(0x8d), w(0x8e), w(0x8f),\ - w(0x90), w(0x91), w(0x92), w(0x93), w(0x94), w(0x95), w(0x96), w(0x97),\ - w(0x98), w(0x99), w(0x9a), w(0x9b), w(0x9c), w(0x9d), w(0x9e), w(0x9f),\ - w(0xa0), w(0xa1), w(0xa2), w(0xa3), w(0xa4), w(0xa5), w(0xa6), w(0xa7),\ - w(0xa8), w(0xa9), w(0xaa), w(0xab), w(0xac), w(0xad), w(0xae), w(0xaf),\ - w(0xb0), w(0xb1), w(0xb2), w(0xb3), w(0xb4), w(0xb5), w(0xb6), w(0xb7),\ - w(0xb8), w(0xb9), w(0xba), w(0xbb), w(0xbc), w(0xbd), w(0xbe), w(0xbf),\ - w(0xc0), w(0xc1), w(0xc2), w(0xc3), w(0xc4), w(0xc5), w(0xc6), w(0xc7),\ - w(0xc8), w(0xc9), w(0xca), w(0xcb), w(0xcc), w(0xcd), w(0xce), w(0xcf),\ - w(0xd0), w(0xd1), w(0xd2), w(0xd3), w(0xd4), w(0xd5), w(0xd6), w(0xd7),\ - w(0xd8), w(0xd9), w(0xda), w(0xdb), w(0xdc), w(0xdd), w(0xde), w(0xdf),\ - w(0xe0), w(0xe1), w(0xe2), w(0xe3), w(0xe4), w(0xe5), w(0xe6), w(0xe7),\ - w(0xe8), w(0xe9), w(0xea), w(0xeb), w(0xec), w(0xed), w(0xee), w(0xef),\ - w(0xf0), w(0xf1), w(0xf2), w(0xf3), w(0xf4), w(0xf5), w(0xf6), w(0xf7),\ - w(0xf8), w(0xf9), w(0xfa), w(0xfb), w(0xfc), w(0xfd), w(0xfe), w(0xff) } - -#define rc_data(w) {\ - w(0x01), w(0x02), w(0x04), w(0x08), w(0x10),w(0x20), w(0x40), w(0x80),\ - w(0x1b), w(0x36) } - -#define h0(x) (x) - -#define w0(p) bytes2word(p, 0, 0, 0) -#define w1(p) bytes2word(0, p, 0, 0) -#define w2(p) bytes2word(0, 0, p, 0) -#define w3(p) bytes2word(0, 0, 0, p) - -#define u0(p) bytes2word(f2(p), p, p, f3(p)) -#define u1(p) bytes2word(f3(p), f2(p), p, p) -#define u2(p) bytes2word(p, f3(p), f2(p), p) -#define u3(p) bytes2word(p, p, f3(p), f2(p)) 
- -#define v0(p) bytes2word(fe(p), f9(p), fd(p), fb(p)) -#define v1(p) bytes2word(fb(p), fe(p), f9(p), fd(p)) -#define v2(p) bytes2word(fd(p), fb(p), fe(p), f9(p)) -#define v3(p) bytes2word(f9(p), fd(p), fb(p), fe(p)) - -#endif - -#if defined(FIXED_TABLES) || !defined(FF_TABLES) - -#define f2(x) ((x<<1) ^ (((x>>7) & 1) * WPOLY)) -#define f4(x) ((x<<2) ^ (((x>>6) & 1) * WPOLY) ^ (((x>>6) & 2) * WPOLY)) -#define f8(x) ((x<<3) ^ (((x>>5) & 1) * WPOLY) ^ (((x>>5) & 2) * WPOLY) \ - ^ (((x>>5) & 4) * WPOLY)) -#define f3(x) (f2(x) ^ x) -#define f9(x) (f8(x) ^ x) -#define fb(x) (f8(x) ^ f2(x) ^ x) -#define fd(x) (f8(x) ^ f4(x) ^ x) -#define fe(x) (f8(x) ^ f4(x) ^ f2(x)) - -#else - -#define f2(x) ((x) ? pow[log[x] + 0x19] : 0) -#define f3(x) ((x) ? pow[log[x] + 0x01] : 0) -#define f9(x) ((x) ? pow[log[x] + 0xc7] : 0) -#define fb(x) ((x) ? pow[log[x] + 0x68] : 0) -#define fd(x) ((x) ? pow[log[x] + 0xee] : 0) -#define fe(x) ((x) ? pow[log[x] + 0xdf] : 0) -#define fi(x) ((x) ? pow[ 255 - log[x]] : 0) - -#endif - -#include "aestab.h" - -#if defined(FIXED_TABLES) - -/* implemented in case of wrong call for fixed tables */ - -void gen_tabs(void) -{ -} - -#else /* dynamic table generation */ - -#if !defined(FF_TABLES) - -/* Generate the tables for the dynamic table option - - It will generally be sensible to use tables to compute finite - field multiplies and inverses but where memory is scarse this - code might sometimes be better. But it only has effect during - initialisation so its pretty unimportant in overall terms. -*/ - -/* return 2 ^ (n - 1) where n is the bit number of the highest bit - set in x with x in the range 1 < x < 0x00000200. 
This form is - used so that locals within fi can be bytes rather than words -*/ - -static aes_08t hibit(const aes_32t x) -{ aes_08t r = (aes_08t)((x >> 1) | (x >> 2)); - - r |= (r >> 2); - r |= (r >> 4); - return (r + 1) >> 1; -} - -/* return the inverse of the finite field element x */ - -static aes_08t fi(const aes_08t x) -{ aes_08t p1 = x, p2 = BPOLY, n1 = hibit(x), n2 = 0x80, v1 = 1, v2 = 0; - - if(x < 2) return x; - - for(;;) - { - if(!n1) return v1; - - while(n2 >= n1) - { - n2 /= n1; p2 ^= p1 * n2; v2 ^= v1 * n2; n2 = hibit(p2); - } - - if(!n2) return v2; - - while(n1 >= n2) - { - n1 /= n2; p1 ^= p2 * n1; v1 ^= v2 * n1; n1 = hibit(p1); - } - } -} - -#endif - -/* The forward and inverse affine transformations used in the S-box */ - -#define fwd_affine(x) \ - (w = (aes_32t)x, w ^= (w<<1)^(w<<2)^(w<<3)^(w<<4), 0x63^(aes_08t)(w^(w>>8))) - -#define inv_affine(x) \ - (w = (aes_32t)x, w = (w<<1)^(w<<3)^(w<<6), 0x05^(aes_08t)(w^(w>>8))) - -static int init = 0; - -void gen_tabs(void) -{ aes_32t i, w; - -#if defined(FF_TABLES) - - aes_08t pow[512], log[256]; - - if(init) return; - /* log and power tables for GF(2^8) finite field with - WPOLY as modular polynomial - the simplest primitive - root is 0x03, used here to generate the tables - */ - - i = 0; w = 1; - do - { - pow[i] = (aes_08t)w; - pow[i + 255] = (aes_08t)w; - log[w] = (aes_08t)i++; - w ^= (w << 1) ^ (w & 0x80 ? 
WPOLY : 0); - } - while (w != 1); - -#else - if(init) return; -#endif - - for(i = 0, w = 1; i < RC_LENGTH; ++i) - { - t_set(r,c)[i] = bytes2word(w, 0, 0, 0); - w = f2(w); - } - - for(i = 0; i < 256; ++i) - { aes_08t b; - - b = fwd_affine(fi((aes_08t)i)); - w = bytes2word(f2(b), b, b, f3(b)); - -#if defined( SBX_SET ) - t_set(s,box)[i] = b; -#endif - -#if defined( FT1_SET ) /* tables for a normal encryption round */ - t_set(f,n)[i] = w; -#endif -#if defined( FT4_SET ) - t_set(f,n)[0][i] = w; - t_set(f,n)[1][i] = upr(w,1); - t_set(f,n)[2][i] = upr(w,2); - t_set(f,n)[3][i] = upr(w,3); -#endif - w = bytes2word(b, 0, 0, 0); - -#if defined( FL1_SET ) /* tables for last encryption round (may also */ - t_set(f,l)[i] = w; /* be used in the key schedule) */ -#endif -#if defined( FL4_SET ) - t_set(f,l)[0][i] = w; - t_set(f,l)[1][i] = upr(w,1); - t_set(f,l)[2][i] = upr(w,2); - t_set(f,l)[3][i] = upr(w,3); -#endif - -#if defined( LS1_SET ) /* table for key schedule if t_set(f,l) above is */ - t_set(l,s)[i] = w; /* not of the required form */ -#endif -#if defined( LS4_SET ) - t_set(l,s)[0][i] = w; - t_set(l,s)[1][i] = upr(w,1); - t_set(l,s)[2][i] = upr(w,2); - t_set(l,s)[3][i] = upr(w,3); -#endif - - b = fi(inv_affine((aes_08t)i)); - w = bytes2word(fe(b), f9(b), fd(b), fb(b)); - -#if defined( IM1_SET ) /* tables for the inverse mix column operation */ - t_set(i,m)[b] = w; -#endif -#if defined( IM4_SET ) - t_set(i,m)[0][b] = w; - t_set(i,m)[1][b] = upr(w,1); - t_set(i,m)[2][b] = upr(w,2); - t_set(i,m)[3][b] = upr(w,3); -#endif - -#if defined( ISB_SET ) - t_set(i,box)[i] = b; -#endif -#if defined( IT1_SET ) /* tables for a normal decryption round */ - t_set(i,n)[i] = w; -#endif -#if defined( IT4_SET ) - t_set(i,n)[0][i] = w; - t_set(i,n)[1][i] = upr(w,1); - t_set(i,n)[2][i] = upr(w,2); - t_set(i,n)[3][i] = upr(w,3); -#endif - w = bytes2word(b, 0, 0, 0); -#if defined( IL1_SET ) /* tables for last decryption round */ - t_set(i,l)[i] = w; -#endif -#if defined( IL4_SET ) - 
t_set(i,l)[0][i] = w; - t_set(i,l)[1][i] = upr(w,1); - t_set(i,l)[2][i] = upr(w,2); - t_set(i,l)[3][i] = upr(w,3); -#endif - } - init = 1; -} - -#endif - -#if defined(__cplusplus) -} -#endif - diff --git a/bsd/crypto/aes/ppc/aestab.h b/bsd/crypto/aes/ppc/aestab.h deleted file mode 100644 index 004ef9e74..000000000 --- a/bsd/crypto/aes/ppc/aestab.h +++ /dev/null @@ -1,175 +0,0 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The free distribution and use of this software in both source and binary - form is allowed (with or without changes) provided that: - - 1. distributions of this source code include the above copyright - notice, this list of conditions and the following disclaimer; - - 2. distributions in binary form include the above copyright - notice, this list of conditions and the following disclaimer - in the documentation and/or other associated materials; - - 3. the copyright holder's name is not used to endorse products - built using this software without specific written permission. - - ALTERNATIVELY, provided that this notice is retained in full, this product - may be distributed under the terms of the GNU General Public License (GPL), - in which case the provisions of the GPL apply INSTEAD OF those given above. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. - --------------------------------------------------------------------------- - Issue 28/01/2004 - - This file contains the code for declaring the tables needed to implement - AES. The file aesopt.h is assumed to be included before this header file. 
- If there are no global variables, the definitions here can be used to put - the AES tables in a structure so that a pointer can then be added to the - AES context to pass them to the AES routines that need them. If this - facility is used, the calling program has to ensure that this pointer is - managed appropriately. In particular, the value of the t_dec(in,it) item - in the table structure must be set to zero in order to ensure that the - tables are initialised. In practice the three code sequences in aeskey.c - that control the calls to gen_tabs() and the gen_tabs() routine itself will - have to be changed for a specific implementation. If global variables are - available it will generally be preferable to use them with the precomputed - FIXED_TABLES option that uses static global tables. - - The following defines can be used to control the way the tables - are defined, initialised and used in embedded environments that - require special features for these purposes - - the 't_dec' construction is used to declare fixed table arrays - the 't_set' construction is used to set fixed table values - the 't_use' construction is used to access fixed table values - - 256 byte tables: - - t_xxx(s,box) => forward S box - t_xxx(i,box) => inverse S box - - 256 32-bit word OR 4 x 256 32-bit word tables: - - t_xxx(f,n) => forward normal round - t_xxx(f,l) => forward last round - t_xxx(i,n) => inverse normal round - t_xxx(i,l) => inverse last round - t_xxx(l,s) => key schedule table - t_xxx(i,m) => key schedule table - - Other variables and tables: - - t_xxx(r,c) => the rcon table -*/ - -#if !defined( _AESTAB_H ) -#define _AESTAB_H - -#define t_dec(m,n) t_##m##n -#define t_set(m,n) t_##m##n -#define t_use(m,n) t_##m##n - -#if defined(FIXED_TABLES) -#define Const const -#else -#define Const -#endif - -#if defined(DO_TABLES) -#define Extern -#else -#define Extern extern -#endif - -#if defined(_MSC_VER) && defined(TABLE_ALIGN) -#define Align __declspec(align(TABLE_ALIGN)) -#else 
-#define Align -#endif - -#if defined(__cplusplus) -extern "C" -{ -#endif - -#if defined(DO_TABLES) && defined(FIXED_TABLES) -#define d_1(t,n,b,e) Align Const t n[256] = b(e) -#define d_4(t,n,b,e,f,g,h) Align Const t n[4][256] = { b(e), b(f), b(g), b(h) } -Extern Align Const aes_32t t_dec(r,c)[RC_LENGTH] = rc_data(w0); -#else -#define d_1(t,n,b,e) Extern Align Const t n[256] -#define d_4(t,n,b,e,f,g,h) Extern Align Const t n[4][256] -Extern Align Const aes_32t t_dec(r,c)[RC_LENGTH]; -#endif - -#if defined( SBX_SET ) - d_1(aes_08t, t_dec(s,box), sb_data, h0); -#endif -#if defined( ISB_SET ) - d_1(aes_08t, t_dec(i,box), isb_data, h0); -#endif - -#if defined( FT1_SET ) - d_1(aes_32t, t_dec(f,n), sb_data, u0); -#endif -#if defined( FT4_SET ) - d_4(aes_32t, t_dec(f,n), sb_data, u0, u1, u2, u3); -#endif - -#if defined( FL1_SET ) - d_1(aes_32t, t_dec(f,l), sb_data, w0); -#endif -#if defined( FL4_SET ) - d_4(aes_32t, t_dec(f,l), sb_data, w0, w1, w2, w3); -#endif - -#if defined( IT1_SET ) - d_1(aes_32t, t_dec(i,n), isb_data, v0); -#endif -#if defined( IT4_SET ) - d_4(aes_32t, t_dec(i,n), isb_data, v0, v1, v2, v3); -#endif - -#if defined( IL1_SET ) - d_1(aes_32t, t_dec(i,l), isb_data, w0); -#endif -#if defined( IL4_SET ) - d_4(aes_32t, t_dec(i,l), isb_data, w0, w1, w2, w3); -#endif - -#if defined( LS1_SET ) -#if defined( FL1_SET ) -#undef LS1_SET -#else - d_1(aes_32t, t_dec(l,s), sb_data, w0); -#endif -#endif - -#if defined( LS4_SET ) -#if defined( FL4_SET ) -#undef LS4_SET -#else - d_4(aes_32t, t_dec(l,s), sb_data, w0, w1, w2, w3); -#endif -#endif - -#if defined( IM1_SET ) - d_1(aes_32t, t_dec(i,m), mm_data, v0); -#endif -#if defined( IM4_SET ) - d_4(aes_32t, t_dec(i,m), mm_data, v0, v1, v2, v3); -#endif - -#if defined(__cplusplus) -} -#endif - -#endif diff --git a/bsd/crypto/aes/test/ReadMe.txt b/bsd/crypto/aes/test/ReadMe.txt new file mode 100644 index 000000000..1329e84be --- /dev/null +++ b/bsd/crypto/aes/test/ReadMe.txt @@ -0,0 +1,97 @@ +This directory contains file 
and shell scripts + + tstaes.c + makegenarm.sh + makegenx86.sh + makeoptx86.sh + +that can be used to build executables. These executables are used to validate the implementation +and to benchmark the performance of the aes functions in the kernel. This directory also serves +as a development environment for porting the aes functions to new architectures. + +On xnu-1699.20.6 (from which we add this work), the generic aes source code sits at bsd/crypto/aes/gen. The x86_64 +and i386 architectural optimization is given in bsd/crypto/aes/i386. + +After making some code corrections (aes.h and most assembly code in i386), you can now build a test executable +that is functionally equivalent to aes in the kernel code. + +To generate a test executable for the aes in x86_64/i386 kernel, + + $ makeoptx86.sh + +This will build a test executable tstaesoptx86 (x86_64/i386). The executable will automatically detect the +CPU clock rate. You specify the number of iterations and the number of 16-byte blocks for simulation. +The executable generates random test data, and calls aes_encrypt_cbc to encrypt the plain data +into cipher data, and then calls aes_decrypt_cbc to decrypt the cipher data into decrypted data. Afterwards, it compares +the decrypted data against the plain data. Should there be a mismatch, the code breaks and exits. +Otherwise, it measures the time the system spends in the 2 functions under test. Afterwards, it prints out +the performance profiling data. 
+ +On K5, + +$ tstaesoptx86 1000 2560 +device max CPU clock rate = 2659.00 MHz +40960 bytes per cbc call + aes_encrypt_cbc : time elapsed = 220.24 usecs, 177.37 MBytes/sec, 14.30 cycles/byte + best iteration : time elapsed = 218.30 usecs, 178.94 MBytes/sec, 14.17 cycles/byte + worst iteration : time elapsed = 286.14 usecs, 136.51 MBytes/sec, 18.58 cycles/byte + + aes_decrypt_cbc : time elapsed = 199.85 usecs, 195.46 MBytes/sec, 12.97 cycles/byte + best iteration : time elapsed = 198.17 usecs, 197.12 MBytes/sec, 12.86 cycles/byte + worst iteration : time elapsed = 228.12 usecs, 171.23 MBytes/sec, 14.81 cycles/byte + +On K5B (with aesni) + +$ tstaesoptx86 1000 256 +device max CPU clock rate = 2400.00 MHz +4096 bytes per cbc call + aes_encrypt_cbc : time elapsed = 6.69 usecs, 583.67 MBytes/sec, 3.92 cycles/byte + best iteration : time elapsed = 6.38 usecs, 612.46 MBytes/sec, 3.74 cycles/byte + worst iteration : time elapsed = 9.72 usecs, 401.96 MBytes/sec, 5.69 cycles/byte + + aes_decrypt_cbc : time elapsed = 2.05 usecs, 1902.65 MBytes/sec, 1.20 cycles/byte + best iteration : time elapsed = 1.96 usecs, 1997.06 MBytes/sec, 1.15 cycles/byte + worst iteration : time elapsed = 4.60 usecs, 849.00 MBytes/sec, 2.70 cycles/byte + +You can also build a test executable using the generic source code for the i386/x86_64 architecture. 
+ + $ makegenx86.sh + +When run on K5, + +$ tstaesgenx86 1000 2560 +device max CPU clock rate = 2659.00 MHz +40960 bytes per cbc call + aes_encrypt_cbc : time elapsed = 278.05 usecs, 140.49 MBytes/sec, 18.05 cycles/byte + best iteration : time elapsed = 274.63 usecs, 142.24 MBytes/sec, 17.83 cycles/byte + worst iteration : time elapsed = 309.70 usecs, 126.13 MBytes/sec, 20.10 cycles/byte + + aes_decrypt_cbc : time elapsed = 265.43 usecs, 147.17 MBytes/sec, 17.23 cycles/byte + best iteration : time elapsed = 262.20 usecs, 148.98 MBytes/sec, 17.02 cycles/byte + worst iteration : time elapsed = 296.19 usecs, 131.88 MBytes/sec, 19.23 cycles/byte + +We can see the current AES implementation in the x86_64 kernel has been improved from 17.83/17.02 +down to 14.17/12.86 cycles/byte (best iteration) for aes_encrypt_cbc and aes_decrypt_cbc, respectively. + + + --------- iOS --------- + +Similarly, you can build a test executable for the aes in the armv7 kernel (which uses the generic source code) + + $ makegenarm.sh + +Note that you need the iOS SDK installed. We can then copy this executable to iOS devices for simulation. 
+ +On N88, + +iPhone:~ root# ./tstaesgenarm 1000 2560 +device max CPU clock rate = 600.00 MHz +40960 bytes per cbc call + aes_encrypt_cbc : time elapsed = 2890.18 usecs, 13.52 MBytes/sec, 42.34 cycles/byte + best iteration : time elapsed = 2692.00 usecs, 14.51 MBytes/sec, 39.43 cycles/byte + worst iteration : time elapsed = 18248.33 usecs, 2.14 MBytes/sec, 267.31 cycles/byte + + aes_decrypt_cbc : time elapsed = 3078.20 usecs, 12.69 MBytes/sec, 45.09 cycles/byte + best iteration : time elapsed = 2873.33 usecs, 13.59 MBytes/sec, 42.09 cycles/byte + worst iteration : time elapsed = 9664.79 usecs, 4.04 MBytes/sec, 141.57 cycles/byte + diff --git a/bsd/crypto/aes/test/makegenx86.sh b/bsd/crypto/aes/test/makegenx86.sh new file mode 100755 index 000000000..ea4de6f63 --- /dev/null +++ b/bsd/crypto/aes/test/makegenx86.sh @@ -0,0 +1,8 @@ +#!/bin/ksh + +cc -Os -c -arch i386 -arch x86_64 -I ../../../ ../gen/aescrypt.c -o aescrypt.o +cc -Os -c -arch i386 -arch x86_64 -I ../../../ ../gen/aeskey.c -o aeskey.o +cc -Os -c -arch i386 -arch x86_64 -I ../../../ ../gen/aestab.c -o aestab.o + +cc -arch i386 -arch x86_64 -Os tstaes.c aescrypt.o aeskey.o aestab.o -o tstaesgenx86 +rm -fr aescrypt.o aeskey.o aestab.o diff --git a/bsd/crypto/aes/test/makeoptx86.sh b/bsd/crypto/aes/test/makeoptx86.sh new file mode 100755 index 000000000..3732e037f --- /dev/null +++ b/bsd/crypto/aes/test/makeoptx86.sh @@ -0,0 +1,10 @@ +#!/bin/ksh + +cc -c -Os -arch i386 -arch x86_64 ../i386/AES.s -o AES.o +cc -c -Os -arch i386 -arch x86_64 ../i386/aes_crypt_hw.s -o aes_crypt_hw.o +cc -c -Os -arch i386 -arch x86_64 ../i386/aes_key_hw.s -o aes_key_hw.o +cc -c -Os -arch i386 -arch x86_64 ../i386/aes_modes_asm.s -o aes_modes_asm.o +cc -c -Os -arch i386 -arch x86_64 ../i386/aes_modes_hw.s -o aes_modes_hw.o + +cc -Os -arch i386 -arch x86_64 tstaes.c AES.o aes_crypt_hw.o aes_key_hw.o aes_modes_asm.o aes_modes_hw.o -o tstaesoptx86 +rm -fr AES.o aes_crypt_hw.o aes_key_hw.o aes_modes_asm.o aes_modes_hw.o diff --git 
a/bsd/crypto/aes/test/tstaes.c b/bsd/crypto/aes/test/tstaes.c new file mode 100644 index 000000000..cbe364ed7 --- /dev/null +++ b/bsd/crypto/aes/test/tstaes.c @@ -0,0 +1,131 @@ + +#include +#include +#include "../aes.h" +#include +#include + + +aes_encrypt_ctx encrypt_ctx; +aes_decrypt_ctx decrypt_ctx; + +size_t getFreq() +{ + int mib[2]; + size_t cpufreq, len; + mib[0] = CTL_HW; + mib[1] = HW_CPU_FREQ; + len = sizeof(cpufreq); + + sysctl(mib, 2, &cpufreq, &len, NULL, 0); + + return cpufreq; +} + + +uint32_t cpu_freq; + +main(int argc, char **argv) +{ + + char *plain; + char *cipher; + char *decrypt; + +uint32_t ITERATIONS; +uint32_t NUM_BLOCKS; +uint32_t data_size; + + char key[32]; + char iv[16]; + int checksum=0; + int i, j, iterations; + uint64_t t0, t1, t2, sum=0, max_time=0, min_time=-1, sum1=0, max_time1=0, min_time1=-1; + float time, time_max, time_min, time1, time_max1, time_min1; + + cpu_freq = getFreq(); + + if (cpu_freq == 0) { + fprintf(stderr, "this appears to be an N90 device, where cpu_freq can not be detected. 
set to 800MHz.\n"); + cpu_freq = 800000000; + } else { + fprintf(stderr, "device max CPU clock rate = %.2f MHz\n", cpu_freq/1.e6); + } + + mach_timebase_info_data_t info; + kern_return_t err = mach_timebase_info( &info ); + + if (argc!=3) { + fprintf(stderr, "usage : %s iterations num_16bytes_block\n", argv[0]); + exit(1); + } + ITERATIONS = atoi(argv[1]); + NUM_BLOCKS = atoi(argv[2]); + data_size = 16*NUM_BLOCKS; + + plain = malloc(data_size); + cipher = malloc(data_size); + decrypt = malloc(data_size); + + if ((plain==NULL) || (cipher==NULL) || (decrypt==NULL)) { + fprintf(stderr,"malloc error.\n"); + exit(1); + } + + for (i=0;imax_time) max_time = t1; + if (t1max_time1) max_time1 = t2; + if (t2 + + + + + OpenSourceLicense + Other + OpenSourceLicenseFile + KernelCrypto.txt + OpenSourceModifications + Extensive customization for OS X + OpenSourceProject + openssl + OpenSourceURL + http://www.openssl.org/source/openssl-0.9.6.tar.gz + OpenSourceVersion + openssl-0.9.6 + OpenSourceWebsiteURL + http://www.openssl.org/ + + + OpenSourceImportDate + 2004-04-07 + OpenSourceLicense + Other + OpenSourceLicenseFile + KernelCrypto.txt + OpenSourceModifications + Customization for OS X + OpenSourceProject + Gladman AES + OpenSourceURL + http://fp.gladman.plus.com/AES/aesfull.zip + OpenSourceVersion + aes-src-26-08-05 + OpenSourceWebsiteURL + http://fp.gladman.plus.com/AES/index.htm + + + OpenSourceImportDate + 2005-09-02 + OpenSourceLicense + Other + OpenSourceLicenseFile + KernelCrypto.txt + OpenSourceModifications + Customization for OS X + OpenSourceProject + Gladman SHA2 + OpenSourceURL + http://fp.gladman.plus.com/cryptography_technology/sha/sha-26-08-05.zip + OpenSourceVersion + sha-26-08-05 + OpenSourceWebsiteURL + http://fp.gladman.plus.com/cryptography_technology/sha/index.htm + + + OpenSourceImportDate + 2010-04-14 + OpenSourceLicense + Other + OpenSourceLicenseFile + KernelCrypto.txt + OpenSourceModifications + Customization for OS X + OpenSourceProject + Gladman 
XTS-AES + OpenSourceURL + http://gladman.plushost.co.uk/oldsite/AES/xts-vs2008-17-07-09.zip + OpenSourceVersion + xts-vs2008-17-07-09 + OpenSourceWebsiteURL + http://gladman.plushost.co.uk/oldsite/AES/index.php + + + diff --git a/bsd/crypto/doc/KernelCrypto.txt b/bsd/crypto/doc/KernelCrypto.txt new file mode 100644 index 000000000..611542795 --- /dev/null +++ b/bsd/crypto/doc/KernelCrypto.txt @@ -0,0 +1,149 @@ + Original SSLeay License + ----------------------- + +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ + + --------------------------------------------------------------------------- + License for Dr. 
Brian Gladman's SHA2 implementation + --------------------------------------------------------------------------- + + Copyright (c) 2002, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + + --------------------------------------------------------------------------- + License for Dr. Brian Gladman's AES implementation + --------------------------------------------------------------------------- + Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. 
the copyright holder's name is not used to endorse products + built using this software without specific written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + + +--------------------------------------------------------------------------- + License for Dr. Brian Gladman's XTS implementation + --------------------------------------------------------------------------- + +Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved. + +LICENSE TERMS + +The free distribution and use of this software is allowed (with or without +changes) provided that: + +1. source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + +2. binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation; + +3. the name of the copyright holder is not used to endorse products + built using this software without specific written permission. + +DISCLAIMER + +This software is provided 'as is' with no explicit or implied warranties +in respect of its properties, including, but not limited to, correctness +and/or fitness for purpose. 
+ + diff --git a/bsd/crypto/rc4/Makefile b/bsd/crypto/rc4/Makefile index 4de505de8..9aad66e3a 100644 --- a/bsd/crypto/rc4/Makefile +++ b/bsd/crypto/rc4/Makefile @@ -9,8 +9,6 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ INSTINC_SUBDIRS_X86_64 = \ @@ -19,8 +17,6 @@ INSTINC_SUBDIRS_ARM = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS_X86_64 = \ diff --git a/bsd/crypto/sha2/Makefile b/bsd/crypto/sha2/Makefile index 8e85f612c..4cc93fb76 100644 --- a/bsd/crypto/sha2/Makefile +++ b/bsd/crypto/sha2/Makefile @@ -9,8 +9,6 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ INSTINC_SUBDIRS_X86_64 = \ @@ -19,8 +17,6 @@ INSTINC_SUBDIRS_ARM = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS_X86_64 = \ diff --git a/bsd/crypto/sha2/intel/sha256.s b/bsd/crypto/sha2/intel/sha256.s new file mode 100644 index 000000000..59353ff4b --- /dev/null +++ b/bsd/crypto/sha2/intel/sha256.s @@ -0,0 +1,617 @@ +/* + This file provides x86_64/i386 hand implementation of the following function + + void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks); + + which is a C function in sha2.c (from xnu). + + The code 1st probes cpu_capabilities to detect whether ssse3 is supported. If not, it branches to + SHA256_Transform_nossse3 (in a separate source file sha256nossse3.s) that was cloned from this file + with all ssse3 instructions replaced with sse3 or below instructions. + + sha256 algorithm per block description: + + 1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 byte) + 2. load 8 digests a-h from ctx->state + 3. for r = 0:15 + T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r]; + d += T1; + h = T1 + Sigma0(a) + Maj(a,b,c) + permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g + 4. 
for r = 16:63 + W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]); + T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r]; + d += T1; + h = T1 + Sigma0(a) + Maj(a,b,c) + permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g + + In the assembly implementation: + - a circular window of message schedule W(r:r+15) is updated and stored in xmm0-xmm3 + - its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer + - the 8 digests (a-h) will be stored in GPR or m32 (all in GPR for x86_64, and some in m32 for i386) + + the implementation per block looks like + + ---------------------------------------------------------------------------- + + load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3 + pre_calculate and store W+K(0:15) in stack + + load digests a-h from ctx->state; + + for (r=0;r<48;r+=4) { + digests a-h update and permute round r:r+3 + update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration + } + + for (r=48;r<64;r+=4) { + digests a-h update and permute round r:r+3 + } + + ctx->states += digests a-h; + + ---------------------------------------------------------------------------- + + our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block + into the last 16 rounds of its previous block: + + ---------------------------------------------------------------------------- + + load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3 + pre_calculate and store W+K(0:15) in stack + +L_loop: + + load digests a-h from ctx->state; + + for (r=0;r<48;r+=4) { + digests a-h update and permute round r:r+3 + update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration + } + + num_block--; + if (num_block==0) jmp L_last_block; + + for (r=48;r<64;r+=4) { + digests a-h update and permute round r:r+3 + load W([r:r+3]%16) (big-endian per 4 bytes) into xmm0:xmm3 + pre_calculate and store W+K([r:r+3]%16) in stack + } + + ctx->states += digests a-h; + + jmp L_loop; + +L_last_block: + + for (r=48;r<64;r+=4) { + 
digests a-h update and permute round r:r+3 + } + + ctx->states += digests a-h; + + ------------------------------------------------------------------------ + + Apple CoreOS vector & numerics + cclee 8-3-10 +*/ + +#if defined KERNEL +#include +#else +#include +#endif + + // associate variables with registers or memory + +#if defined (__x86_64__) + #define sp %rsp + #define ctx %rdi + #define data %rsi + #define num_blocks %rdx + + #define a %r8d + #define b %r9d + #define c %r10d + #define d %r11d + #define e %r12d + #define f %r13d + #define g %r14d + #define h %r15d + + #define K %rbx + #define stack_size (8+16*8+16+64) // 8 (align) + xmm0:xmm7 + L_aligned_bswap + WK(0:15) + + #define L_aligned_bswap 64(sp) // bswap : big-endian loading of 4-byte words + #define xmm_save 80(sp) // starting address for xmm save/restore +#else + #define sp %esp + #define stack_size (12+16*8+16+16+64) // 12 (align) + xmm0:xmm7 + 16 (c,f,h,K) + L_aligned_bswap + WK(0:15) + #define ctx_addr 20+stack_size(sp) // ret_addr + 4 registers = 20, 1st caller argument + #define data_addr 24+stack_size(sp) // 2nd caller argument + #define num_blocks 28+stack_size(sp) // 3rd caller argument + + #define a %ebx + #define b %edx + #define c 64(sp) + #define d %ebp + #define e %esi + #define f 68(sp) + #define g %edi + #define h 72(sp) + + #define K 76(sp) // pointer to K256[] table + #define L_aligned_bswap 80(sp) // bswap : big-endian loading of 4-byte words + #define xmm_save 96(sp) // starting address for xmm save/restore +#endif + + // 2 local variables + #define t %eax + #define s %ecx + + // a window (16 words) of message scheule + #define W0 %xmm0 + #define W1 %xmm1 + #define W2 %xmm2 + #define W3 %xmm3 + + // circular buffer for WK[(r:r+15)%16] + #define WK(x) (x&15)*4(sp) + +// #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z))) + + .macro Ch + mov $0, t // x + mov $0, s // x + not t // ~x + and $1, s // x & y + and $2, t // ~x & z + xor s, t // t = ((x) & (y)) ^ ((~(x)) & (z)); + .endm + +// 
#define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) + + .macro Maj + mov $0, t // x + mov $1, s // y + and s, t // x&y + and $2, s // y&z + xor s, t // (x&y) ^ (y&z) + mov $2, s // z + and $0, s // (x&z) + xor s, t // t = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) + .endm + +/* Shift-right (used in SHA-256, SHA-384, and SHA-512): */ +// #define R(b,x) ((x) >> (b)) +/* 32-bit Rotate-right (used in SHA-256): */ +// #define S32(b,x) (((x) >> (b)) | ((x) << (32 - (b)))) + +// #define sigma0_256(x) (S32(7, (x)) ^ S32(18, (x)) ^ R(3 , (x))) + + // performs sigma0_256 on 4 words on an xmm registers + // use xmm6/xmm7 as intermediate registers + .macro sigma0 + movdqa $0, %xmm6 + movdqa $0, %xmm7 + psrld $$3, $0 // SHR3(x) + psrld $$7, %xmm6 // part of ROTR7 + pslld $$14, %xmm7 // part of ROTR18 + pxor %xmm6, $0 + pxor %xmm7, $0 + psrld $$11, %xmm6 // part of ROTR18 + pslld $$11, %xmm7 // part of ROTR7 + pxor %xmm6, $0 + pxor %xmm7, $0 + .endm + +// #define sigma1_256(x) (S32(17, (x)) ^ S32(19, (x)) ^ R(10, (x))) + + // performs sigma1_256 on 4 words on an xmm registers + // use xmm6/xmm7 as intermediate registers + .macro sigma1 + movdqa $0, %xmm6 + movdqa $0, %xmm7 + psrld $$10, $0 // SHR10(x) + psrld $$17, %xmm6 // part of ROTR17 + pxor %xmm6, $0 + pslld $$13, %xmm7 // part of ROTR19 + pxor %xmm7, $0 + psrld $$2, %xmm6 // part of ROTR19 + pxor %xmm6, $0 + pslld $$2, %xmm7 // part of ROTR17 + pxor %xmm7, $0 + .endm + +// #define Sigma0_256(x) (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) + + .macro Sigma0 + mov $0, t // x + mov $0, s // x + ror $$2, t // S32(2, (x)) + ror $$13, s // S32(13, (x)) + xor s, t // S32(2, (x)) ^ S32(13, (x)) + ror $$9, s // S32(22, (x)) + xor s, t // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) + .endm + +// #define Sigma1_256(x) (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x))) + + .macro Sigma1 + mov $0, s // x + ror $$6, s // S32(6, (x)) + mov s, t // S32(6, (x)) + ror $$5, s // S32(11, (x)) + xor s, t // S32(6, (x)) ^ S32(11, (x)) + ror 
$$14, s // S32(25, (x)) + xor s, t // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x))) + .endm + + // per round digests update + .macro round + Sigma1 $4 // t = T1 + add t, $7 // use h to store h+Sigma1(e) + Ch $4, $5, $6 // t = Ch (e, f, g); + add $7, t // t = h+Sigma1(e)+Ch(e,f,g); + add WK($8), t // h = T1 + add t, $3 // d += T1; + mov t, $7 // h = T1 + Sigma0 $0 // t = Sigma0(a); + add t, $7 // h = T1 + Sigma0(a); + Maj $0, $1, $2 // t = Maj(a,b,c) + add t, $7 // h = T1 + Sigma0(a) + Maj(a,b,c); + .endm + + // per 4 rounds digests update and permutation + // permutation is absorbed by rotating the roles of digests a-h + .macro rounds + round $0, $1, $2, $3, $4, $5, $6, $7, 0+$8 + round $7, $0, $1, $2, $3, $4, $5, $6, 1+$8 + round $6, $7, $0, $1, $2, $3, $4, $5, 2+$8 + round $5, $6, $7, $0, $1, $2, $3, $4, 3+$8 + .endm + + // update the message schedule W and W+K (4 rounds) 16 rounds ahead in the future + .macro message_schedule + + // 4 32-bit K256 words in xmm5 +#if defined (__x86_64__) + movdqu (K), %xmm5 +#else + mov K, t + movdqu (t), %xmm5 +#endif + add $$16, K // K points to next K256 word for next iteration + movdqa $1, %xmm4 // W7:W4 + palignr $$4, $0, %xmm4 // W4:W1 + sigma0 %xmm4 // sigma0(W4:W1) + movdqa $3, %xmm6 // W15:W12 + paddd %xmm4, $0 // $0 = W3:W0 + sigma0(W4:W1) + palignr $$4, $2, %xmm6 // W12:W9 + paddd %xmm6, $0 // $0 = W12:W9 + sigma0(W4:W1) + W3:W0 + movdqa $3, %xmm4 // W15:W12 + psrldq $$8, %xmm4 // 0,0,W15,W14 + sigma1 %xmm4 // sigma1(0,0,W15,W14) + paddd %xmm4, $0 // sigma1(0,0,W15,W14) + W12:W9 + sigma0(W4:W1) + W3:W0 + movdqa $0, %xmm4 // W19-sigma1(W17), W18-sigma1(W16), W17, W16 + pslldq $$8, %xmm4 // W17, W16, 0, 0 + sigma1 %xmm4 // sigma1(W17,W16,0,0) + paddd %xmm4, $0 // W19:W16 + paddd $0, %xmm5 // WK + movdqa %xmm5, WK($4) + .endm + + // this macro is used in the last 16 rounds of a current block + // it reads the next message (16 4-byte words), load it into 4 words W[r:r+3], computes WK[r:r+3] + // and save into stack to 
prepare for next block + + .macro update_W_WK +#if defined (__x86_64__) + movdqu $0*16(data), $1 // read 4 4-byte words + pshufb L_aligned_bswap, $1 // big-endian of each 4-byte word, W[r:r+3] + movdqu $0*16(K), %xmm4 // K[r:r+3] +#else + mov data_addr, t + movdqu $0*16(t), $1 // read 4 4-byte words + pshufb L_aligned_bswap, $1 // big-endian of each 4-byte word, W[r:r+3] + mov K, t + movdqu $0*16(t), %xmm4 // K[r:r+3] +#endif + paddd $1, %xmm4 // WK[r:r+3] + movdqa %xmm4, WK($0*4) // save WK[r:r+3] into stack circular buffer + .endm + + .text + +#if defined (__x86_64__) || defined (__i386__) + + .globl _SHA256_Transform + +_SHA256_Transform: + + + // detect SSSE3 and dispatch appropriate code branch + #if defined __x86_64__ + movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities + mov (%rax), %eax // %eax = __cpu_capabilities + #else // i386 + #if defined KERNEL + leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities + mov (%eax), %eax // %eax = __cpu_capabilities + #else + mov _COMM_PAGE_CPU_CAPABILITIES, %eax + #endif + #endif + test $(kHasSupplementalSSE3), %eax + je _SHA256_Transform_nossse3 // branch to no-ssse3 code + + // push callee-saved registers +#if defined (__x86_64__) + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 +#else + push %ebp + push %ebx + push %esi + push %edi +#endif + + // allocate stack space + sub $stack_size, sp + + // if kernel code, save used xmm registers +#if KERNEL + movdqa %xmm0, 0*16+xmm_save + movdqa %xmm1, 1*16+xmm_save + movdqa %xmm2, 2*16+xmm_save + movdqa %xmm3, 3*16+xmm_save + movdqa %xmm4, 4*16+xmm_save + movdqa %xmm5, 5*16+xmm_save + movdqa %xmm6, 6*16+xmm_save + movdqa %xmm7, 7*16+xmm_save +#endif + + // set up bswap parameters in the aligned stack space and pointer to table K256[] +#if defined (__x86_64__) + lea _K256(%rip), K + lea L_bswap(%rip), %rax + movdqa (%rax), %xmm0 +#else + lea _K256, t + mov t, K + lea L_bswap, %eax + movdqa (%eax), %xmm0 +#endif + movdqa 
%xmm0, L_aligned_bswap + + // load W[0:15] into xmm0-xmm3 +#if defined (__x86_64__) + movdqu 0*16(data), W0 + movdqu 1*16(data), W1 + movdqu 2*16(data), W2 + movdqu 3*16(data), W3 + add $64, data +#else + mov data_addr, t + movdqu 0*16(t), W0 + movdqu 1*16(t), W1 + movdqu 2*16(t), W2 + movdqu 3*16(t), W3 + add $64, data_addr +#endif + pshufb L_aligned_bswap, W0 + pshufb L_aligned_bswap, W1 + pshufb L_aligned_bswap, W2 + pshufb L_aligned_bswap, W3 + + // compute WK[0:15] and save in stack +#if defined (__x86_64__) + movdqu 0*16(K), %xmm4 + movdqu 1*16(K), %xmm5 + movdqu 2*16(K), %xmm6 + movdqu 3*16(K), %xmm7 +#else + mov K, t + movdqu 0*16(t), %xmm4 + movdqu 1*16(t), %xmm5 + movdqu 2*16(t), %xmm6 + movdqu 3*16(t), %xmm7 +#endif + add $64, K + paddd %xmm0, %xmm4 + paddd %xmm1, %xmm5 + paddd %xmm2, %xmm6 + paddd %xmm3, %xmm7 + movdqa %xmm4, WK(0) + movdqa %xmm5, WK(4) + movdqa %xmm6, WK(8) + movdqa %xmm7, WK(12) + +L_loop: + + // digests a-h = ctx->states; +#if defined (__x86_64__) + mov 0*4(ctx), a + mov 1*4(ctx), b + mov 2*4(ctx), c + mov 3*4(ctx), d + mov 4*4(ctx), e + mov 5*4(ctx), f + mov 6*4(ctx), g + mov 7*4(ctx), h +#else + mov ctx_addr, t + mov 0*4(t), a + mov 1*4(t), b + mov 2*4(t), s + mov s, c + mov 3*4(t), d + mov 4*4(t), e + mov 5*4(t), s + mov s, f + mov 6*4(t), g + mov 7*4(t), s + mov s, h +#endif + + // rounds 0:47 interleaved with W/WK update for rounds 16:63 + rounds a, b, c, d, e, f, g, h, 0 + message_schedule W0,W1,W2,W3,16 + rounds e, f, g, h, a, b, c, d, 4 + message_schedule W1,W2,W3,W0,20 + rounds a, b, c, d, e, f, g, h, 8 + message_schedule W2,W3,W0,W1,24 + rounds e, f, g, h, a, b, c, d, 12 + message_schedule W3,W0,W1,W2,28 + rounds a, b, c, d, e, f, g, h, 16 + message_schedule W0,W1,W2,W3,32 + rounds e, f, g, h, a, b, c, d, 20 + message_schedule W1,W2,W3,W0,36 + rounds a, b, c, d, e, f, g, h, 24 + message_schedule W2,W3,W0,W1,40 + rounds e, f, g, h, a, b, c, d, 28 + message_schedule W3,W0,W1,W2,44 + rounds a, b, c, d, e, f, g, h, 32 + 
message_schedule W0,W1,W2,W3,48 + rounds e, f, g, h, a, b, c, d, 36 + message_schedule W1,W2,W3,W0,52 + rounds a, b, c, d, e, f, g, h, 40 + message_schedule W2,W3,W0,W1,56 + rounds e, f, g, h, a, b, c, d, 44 + message_schedule W3,W0,W1,W2,60 + + // revert K to the beginning of K256[] +#if defined __x86_64__ + sub $256, K +#else + subl $256, K +#endif + + sub $1, num_blocks // num_blocks-- + je L_final_block // if final block, wrap up final rounds + + // rounds 48:63 interleaved with W/WK initialization for next block rounds 0:15 + rounds a, b, c, d, e, f, g, h, 48 + update_W_WK 0, W0 + rounds e, f, g, h, a, b, c, d, 52 + update_W_WK 1, W1 + rounds a, b, c, d, e, f, g, h, 56 + update_W_WK 2, W2 + rounds e, f, g, h, a, b, c, d, 60 + update_W_WK 3, W3 + + add $64, K +#if defined (__x86_64__) + add $64, data +#else + add $64, data_addr +#endif + + // ctx->states += digests a-h +#if defined (__x86_64__) + add a, 0*4(ctx) + add b, 1*4(ctx) + add c, 2*4(ctx) + add d, 3*4(ctx) + add e, 4*4(ctx) + add f, 5*4(ctx) + add g, 6*4(ctx) + add h, 7*4(ctx) +#else + mov ctx_addr, t + add a, 0*4(t) + add b, 1*4(t) + mov c, s + add s, 2*4(t) + add d, 3*4(t) + add e, 4*4(t) + mov f, s + add s, 5*4(t) + add g, 6*4(t) + mov h, s + add s, 7*4(t) +#endif + + jmp L_loop // branch for next block + + // wrap up digest update round 48:63 for final block +L_final_block: + rounds a, b, c, d, e, f, g, h, 48 + rounds e, f, g, h, a, b, c, d, 52 + rounds a, b, c, d, e, f, g, h, 56 + rounds e, f, g, h, a, b, c, d, 60 + + // ctx->states += digests a-h +#if defined (__x86_64__) + add a, 0*4(ctx) + add b, 1*4(ctx) + add c, 2*4(ctx) + add d, 3*4(ctx) + add e, 4*4(ctx) + add f, 5*4(ctx) + add g, 6*4(ctx) + add h, 7*4(ctx) +#else + mov ctx_addr, t + add a, 0*4(t) + add b, 1*4(t) + mov c, s + add s, 2*4(t) + add d, 3*4(t) + add e, 4*4(t) + mov f, s + add s, 5*4(t) + add g, 6*4(t) + mov h, s + add s, 7*4(t) +#endif + + // if kernel, restore xmm0-xmm7 +#if KERNEL + movdqa 0*16+xmm_save, %xmm0 + movdqa 
1*16+xmm_save, %xmm1 + movdqa 2*16+xmm_save, %xmm2 + movdqa 3*16+xmm_save, %xmm3 + movdqa 4*16+xmm_save, %xmm4 + movdqa 5*16+xmm_save, %xmm5 + movdqa 6*16+xmm_save, %xmm6 + movdqa 7*16+xmm_save, %xmm7 +#endif + + // free allocated stack memory + add $stack_size, sp + + // restore callee-saved registers +#if defined (__x86_64__) + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + pop %rbp +#else + pop %edi + pop %esi + pop %ebx + pop %ebp +#endif + + // return + ret + + + .const + .align 4, 0x90 + +L_bswap: + .long 0x00010203 + .long 0x04050607 + .long 0x08090a0b + .long 0x0c0d0e0f + +#endif // x86_64/i386 + diff --git a/bsd/crypto/sha2/intel/sha256nossse3.s b/bsd/crypto/sha2/intel/sha256nossse3.s new file mode 100644 index 000000000..b4dd0a035 --- /dev/null +++ b/bsd/crypto/sha2/intel/sha256nossse3.s @@ -0,0 +1,649 @@ +/* + This file provides x86_64/i386 hand implementation of the following function + + void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks); + + which is a C function in sha2.c (from xnu). + + The code SHA256_Transform_nossse3 is a clone of SHA256_Transform + with all ssse3 instructions replaced with sse3 or below instructions. + + For performance reason, this function should not be called directly. This file should be working + together with the one that implements SHA256_Transform. There, cpu_capabilities is probed to detect + ssse3. If ssse3 is not supported, the execution will be branched to this no-ssse3-specific function. + + sha256 algorithm per block description: + + 1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 byte) + 2. load 8 digests a-h from ctx->state + 3. for r = 0:15 + T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r]; + d += T1; + h = T1 + Sigma0(a) + Maj(a,b,c) + permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g + 4. 
for r = 16:63 + W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]); + T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r]; + d += T1; + h = T1 + Sigma0(a) + Maj(a,b,c) + permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g + + In the assembly implementation: + - a circular window of message schedule W(r:r+15) is updated and stored in xmm0-xmm3 + - its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer + - the 8 digests (a-h) will be stored in GPR or m32 (all in GPR for x86_64, and some in m32 for i386) + + the implementation per block looks like + + ---------------------------------------------------------------------------- + + load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3 + pre_calculate and store W+K(0:15) in stack + + load digests a-h from ctx->state; + + for (r=0;r<48;r+=4) { + digests a-h update and permute round r:r+3 + update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration + } + + for (r=48;r<64;r+=4) { + digests a-h update and permute round r:r+3 + } + + ctx->states += digests a-h; + + ---------------------------------------------------------------------------- + + our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block + into the last 16 rounds of its previous block: + + ---------------------------------------------------------------------------- + + load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3 + pre_calculate and store W+K(0:15) in stack + +L_loop: + + load digests a-h from ctx->state; + + for (r=0;r<48;r+=4) { + digests a-h update and permute round r:r+3 + update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration + } + + num_block--; + if (num_block==0) jmp L_last_block; + + for (r=48;r<64;r+=4) { + digests a-h update and permute round r:r+3 + load W([r:r+3]%16) (big-endian per 4 bytes) into xmm0:xmm3 + pre_calculate and store W+K([r:r+3]%16) in stack + } + + ctx->states += digests a-h; + + jmp L_loop; + +L_last_block: + + for (r=48;r<64;r+=4) { + 
digests a-h update and permute round r:r+3 + } + + ctx->states += digests a-h; + + ------------------------------------------------------------------------ + + Apple CoreOS vector & numerics + cclee 8-3-10 +*/ + +#if defined KERNEL +#include +#else +#include +#endif + + // associate variables with registers or memory + +#if defined (__x86_64__) + #define sp %rsp + #define ctx %rdi + #define data %rsi + #define num_blocks %rdx + + #define a %r8d + #define b %r9d + #define c %r10d + #define d %r11d + #define e %r12d + #define f %r13d + #define g %r14d + #define h %r15d + + #define K %rbx + #define stack_size (8+16*8+16+64) // 8 (align) + xmm0:xmm7 + L_aligned_bswap + WK(0:15) + + #define xmm_save 80(sp) // starting address for xmm save/restore +#else + #define sp %esp + #define stack_size (12+16*8+16+16+64) // 12 (align) + xmm0:xmm7 + 16 (c,f,h,K) + L_aligned_bswap + WK(0:15) + #define ctx_addr 20+stack_size(sp) // ret_addr + 4 registers = 20, 1st caller argument + #define data_addr 24+stack_size(sp) // 2nd caller argument + #define num_blocks 28+stack_size(sp) // 3rd caller argument + + #define a %ebx + #define b %edx + #define c 64(sp) + #define d %ebp + #define e %esi + #define f 68(sp) + #define g %edi + #define h 72(sp) + + #define K 76(sp) // pointer to K256[] table + #define xmm_save 96(sp) // starting address for xmm save/restore +#endif + + // 2 local variables + #define t %eax + #define s %ecx + + // a window (16 words) of message scheule + #define W0 %xmm0 + #define W1 %xmm1 + #define W2 %xmm2 + #define W3 %xmm3 + + // circular buffer for WK[(r:r+15)%16] + #define WK(x) (x&15)*4(sp) + +// #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z))) + + .macro Ch + mov $0, t // x + mov $0, s // x + not t // ~x + and $1, s // x & y + and $2, t // ~x & z + xor s, t // t = ((x) & (y)) ^ ((~(x)) & (z)); + .endm + +// #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) + + .macro Maj + mov $0, t // x + mov $1, s // y + and s, t // x&y + and $2, s // y&z + xor s, t 
// (x&y) ^ (y&z) + mov $2, s // z + and $0, s // (x&z) + xor s, t // t = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) + .endm + +/* Shift-right (used in SHA-256, SHA-384, and SHA-512): */ +// #define R(b,x) ((x) >> (b)) +/* 32-bit Rotate-right (used in SHA-256): */ +// #define S32(b,x) (((x) >> (b)) | ((x) << (32 - (b)))) + +// #define sigma0_256(x) (S32(7, (x)) ^ S32(18, (x)) ^ R(3 , (x))) + + // performs sigma0_256 on 4 words on an xmm registers + // use xmm6/xmm7 as intermediate registers + .macro sigma0 + movdqa $0, %xmm6 + movdqa $0, %xmm7 + psrld $$3, $0 // SHR3(x) + psrld $$7, %xmm6 // part of ROTR7 + pslld $$14, %xmm7 // part of ROTR18 + pxor %xmm6, $0 + pxor %xmm7, $0 + psrld $$11, %xmm6 // part of ROTR18 + pslld $$11, %xmm7 // part of ROTR7 + pxor %xmm6, $0 + pxor %xmm7, $0 + .endm + +// #define sigma1_256(x) (S32(17, (x)) ^ S32(19, (x)) ^ R(10, (x))) + + // performs sigma1_256 on 4 words on an xmm registers + // use xmm6/xmm7 as intermediate registers + .macro sigma1 + movdqa $0, %xmm6 + movdqa $0, %xmm7 + psrld $$10, $0 // SHR10(x) + psrld $$17, %xmm6 // part of ROTR17 + pxor %xmm6, $0 + pslld $$13, %xmm7 // part of ROTR19 + pxor %xmm7, $0 + psrld $$2, %xmm6 // part of ROTR19 + pxor %xmm6, $0 + pslld $$2, %xmm7 // part of ROTR17 + pxor %xmm7, $0 + .endm + +// #define Sigma0_256(x) (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) + + .macro Sigma0 + mov $0, t // x + mov $0, s // x + ror $$2, t // S32(2, (x)) + ror $$13, s // S32(13, (x)) + xor s, t // S32(2, (x)) ^ S32(13, (x)) + ror $$9, s // S32(22, (x)) + xor s, t // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) + .endm + +// #define Sigma1_256(x) (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x))) + + .macro Sigma1 + mov $0, s // x + ror $$6, s // S32(6, (x)) + mov s, t // S32(6, (x)) + ror $$5, s // S32(11, (x)) + xor s, t // S32(6, (x)) ^ S32(11, (x)) + ror $$14, s // S32(25, (x)) + xor s, t // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x))) + .endm + + // per round digests update + .macro round + Sigma1 $4 // t 
= T1 + add t, $7 // use h to store h+Sigma1(e) + Ch $4, $5, $6 // t = Ch (e, f, g); + add $7, t // t = h+Sigma1(e)+Ch(e,f,g); + add WK($8), t // h = T1 + add t, $3 // d += T1; + mov t, $7 // h = T1 + Sigma0 $0 // t = Sigma0(a); + add t, $7 // h = T1 + Sigma0(a); + Maj $0, $1, $2 // t = Maj(a,b,c) + add t, $7 // h = T1 + Sigma0(a) + Maj(a,b,c); + .endm + + // per 4 rounds digests update and permutation + // permutation is absorbed by rotating the roles of digests a-h + .macro rounds + round $0, $1, $2, $3, $4, $5, $6, $7, 0+$8 + round $7, $0, $1, $2, $3, $4, $5, $6, 1+$8 + round $6, $7, $0, $1, $2, $3, $4, $5, 2+$8 + round $5, $6, $7, $0, $1, $2, $3, $4, 3+$8 + .endm + + // update the message schedule W and W+K (4 rounds) 16 rounds ahead in the future + .macro message_schedule + + // 4 32-bit K256 words in xmm5 +#if defined (__x86_64__) + movdqu (K), %xmm5 +#else + mov K, t + movdqu (t), %xmm5 +#endif + add $$16, K // K points to next K256 word for next iteration + movdqa $1, %xmm4 // W7:W4 +#if 0 + palignr $$4, $0, %xmm4 // W4:W1 +#else // no-ssse3 implementation of palignr + movdqa $0, %xmm7 + pslldq $$12, %xmm4 + psrldq $$4, %xmm7 + por %xmm7, %xmm4 +#endif + sigma0 %xmm4 // sigma0(W4:W1) + movdqa $3, %xmm6 // W15:W12 + paddd %xmm4, $0 // $0 = W3:W0 + sigma0(W4:W1) +#if 0 + palignr $$4, $2, %xmm6 // W12:W9 +#else // no-ssse3 implementation of palignr + movdqa $2, %xmm7 + pslldq $$12, %xmm6 + psrldq $$4, %xmm7 + por %xmm7, %xmm6 +#endif + paddd %xmm6, $0 // $0 = W12:W9 + sigma0(W4:W1) + W3:W0 + movdqa $3, %xmm4 // W15:W12 + psrldq $$8, %xmm4 // 0,0,W15,W14 + sigma1 %xmm4 // sigma1(0,0,W15,W14) + paddd %xmm4, $0 // sigma1(0,0,W15,W14) + W12:W9 + sigma0(W4:W1) + W3:W0 + movdqa $0, %xmm4 // W19-sigma1(W17), W18-sigma1(W16), W17, W16 + pslldq $$8, %xmm4 // W17, W16, 0, 0 + sigma1 %xmm4 // sigma1(W17,W16,0,0) + paddd %xmm4, $0 // W19:W16 + paddd $0, %xmm5 // WK + movdqa %xmm5, WK($4) + .endm + + // this macro is used in the last 16 rounds of a current block + // it 
reads the next message (16 4-byte words), load it into 4 words W[r:r+3], computes WK[r:r+3] + // and save into stack to prepare for next block + + .macro update_W_WK +#if defined (__x86_64__) +#if 0 + movdqu $0*16(data), $1 // read 4 4-byte words + pshufb L_aligned_bswap, $1 // big-endian of each 4-byte word, W[r:r+3] +#else // no-ssse3 implementation + mov 0+$0*16(data), s + bswap s + mov s, 0+WK($0*4) + mov 4+$0*16(data), s + bswap s + mov s, 4+WK($0*4) + mov 8+$0*16(data), s + bswap s + mov s, 8+WK($0*4) + mov 12+$0*16(data), s + bswap s + mov s, 12+WK($0*4) + movdqa WK($0*4), $1 +#endif + movdqu $0*16(K), %xmm4 // K[r:r+3] +#else + mov data_addr, t +#if 0 + movdqu $0*16(t), $1 // read 4 4-byte words + pshufb L_aligned_bswap, $1 // big-endian of each 4-byte word, W[r:r+3] +#else // no-ssse3 implementation + mov 0+$0*16(t), s + bswap s + mov s, 0+WK($0*4) + mov 4+$0*16(t), s + bswap s + mov s, 4+WK($0*4) + mov 8+$0*16(t), s + bswap s + mov s, 8+WK($0*4) + mov 12+$0*16(t), s + bswap s + mov s, 12+WK($0*4) + movdqa WK($0*4), $1 +#endif + mov K, t + movdqu $0*16(t), %xmm4 // K[r:r+3] +#endif + paddd $1, %xmm4 // WK[r:r+3] + movdqa %xmm4, WK($0*4) // save WK[r:r+3] into stack circular buffer + .endm + + .text + +#if defined (__x86_64__) || defined (__i386__) + + .globl _SHA256_Transform_nossse3 + +_SHA256_Transform_nossse3: + + // push callee-saved registers +#if defined (__x86_64__) + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 +#else + push %ebp + push %ebx + push %esi + push %edi +#endif + + // allocate stack space + sub $stack_size, sp + + // if kernel code, save used xmm registers +#if KERNEL + movdqa %xmm0, 0*16+xmm_save + movdqa %xmm1, 1*16+xmm_save + movdqa %xmm2, 2*16+xmm_save + movdqa %xmm3, 3*16+xmm_save + movdqa %xmm4, 4*16+xmm_save + movdqa %xmm5, 5*16+xmm_save + movdqa %xmm6, 6*16+xmm_save + movdqa %xmm7, 7*16+xmm_save +#endif + + // set up pointer to table K256[] +#if defined (__x86_64__) + lea _K256(%rip), K +#else + lea 
_K256, t + mov t, K +#endif + + // load W[0:15] into xmm0-xmm3 + .macro mybswap + movl 0+$0*16($1), a + movl 4+$0*16($1), b + movl 8+$0*16($1), e + movl 12+$0*16($1), d + bswap a + bswap b + bswap e + bswap d + movl a, $0*16(sp) + movl b, 4+$0*16(sp) + movl e, 8+$0*16(sp) + movl d, 12+$0*16(sp) + .endm + +#if defined (__x86_64__) + mybswap 0, data + mybswap 1, data + mybswap 2, data + mybswap 3, data + add $64, data +#else + mov data_addr, t + mybswap 0, t + mybswap 1, t + mybswap 2, t + mybswap 3, t + add $64, data_addr +#endif + movdqa 0*16(sp), W0 + movdqa 1*16(sp), W1 + movdqa 2*16(sp), W2 + movdqa 3*16(sp), W3 + + // compute WK[0:15] and save in stack +#if defined (__x86_64__) + movdqu 0*16(K), %xmm4 + movdqu 1*16(K), %xmm5 + movdqu 2*16(K), %xmm6 + movdqu 3*16(K), %xmm7 +#else + mov K, t + movdqu 0*16(t), %xmm4 + movdqu 1*16(t), %xmm5 + movdqu 2*16(t), %xmm6 + movdqu 3*16(t), %xmm7 +#endif + add $64, K + paddd %xmm0, %xmm4 + paddd %xmm1, %xmm5 + paddd %xmm2, %xmm6 + paddd %xmm3, %xmm7 + movdqa %xmm4, WK(0) + movdqa %xmm5, WK(4) + movdqa %xmm6, WK(8) + movdqa %xmm7, WK(12) + +L_loop: + + // digests a-h = ctx->states; +#if defined (__x86_64__) + mov 0*4(ctx), a + mov 1*4(ctx), b + mov 2*4(ctx), c + mov 3*4(ctx), d + mov 4*4(ctx), e + mov 5*4(ctx), f + mov 6*4(ctx), g + mov 7*4(ctx), h +#else + mov ctx_addr, t + mov 0*4(t), a + mov 1*4(t), b + mov 2*4(t), s + mov s, c + mov 3*4(t), d + mov 4*4(t), e + mov 5*4(t), s + mov s, f + mov 6*4(t), g + mov 7*4(t), s + mov s, h +#endif + + // rounds 0:47 interleaved with W/WK update for rounds 16:63 + rounds a, b, c, d, e, f, g, h, 0 + message_schedule W0,W1,W2,W3,16 + rounds e, f, g, h, a, b, c, d, 4 + message_schedule W1,W2,W3,W0,20 + rounds a, b, c, d, e, f, g, h, 8 + message_schedule W2,W3,W0,W1,24 + rounds e, f, g, h, a, b, c, d, 12 + message_schedule W3,W0,W1,W2,28 + rounds a, b, c, d, e, f, g, h, 16 + message_schedule W0,W1,W2,W3,32 + rounds e, f, g, h, a, b, c, d, 20 + message_schedule W1,W2,W3,W0,36 + rounds a, 
b, c, d, e, f, g, h, 24 + message_schedule W2,W3,W0,W1,40 + rounds e, f, g, h, a, b, c, d, 28 + message_schedule W3,W0,W1,W2,44 + rounds a, b, c, d, e, f, g, h, 32 + message_schedule W0,W1,W2,W3,48 + rounds e, f, g, h, a, b, c, d, 36 + message_schedule W1,W2,W3,W0,52 + rounds a, b, c, d, e, f, g, h, 40 + message_schedule W2,W3,W0,W1,56 + rounds e, f, g, h, a, b, c, d, 44 + message_schedule W3,W0,W1,W2,60 + + // revert K to the beginning of K256[] +#if defined __x86_64__ + sub $256, K +#else + subl $256, K +#endif + + sub $1, num_blocks // num_blocks-- + je L_final_block // if final block, wrap up final rounds + + // rounds 48:63 interleaved with W/WK initialization for next block rounds 0:15 + rounds a, b, c, d, e, f, g, h, 48 + update_W_WK 0, W0 + rounds e, f, g, h, a, b, c, d, 52 + update_W_WK 1, W1 + rounds a, b, c, d, e, f, g, h, 56 + update_W_WK 2, W2 + rounds e, f, g, h, a, b, c, d, 60 + update_W_WK 3, W3 + + add $64, K +#if defined (__x86_64__) + add $64, data +#else + add $64, data_addr +#endif + + // ctx->states += digests a-h +#if defined (__x86_64__) + add a, 0*4(ctx) + add b, 1*4(ctx) + add c, 2*4(ctx) + add d, 3*4(ctx) + add e, 4*4(ctx) + add f, 5*4(ctx) + add g, 6*4(ctx) + add h, 7*4(ctx) +#else + mov ctx_addr, t + add a, 0*4(t) + add b, 1*4(t) + mov c, s + add s, 2*4(t) + add d, 3*4(t) + add e, 4*4(t) + mov f, s + add s, 5*4(t) + add g, 6*4(t) + mov h, s + add s, 7*4(t) +#endif + + jmp L_loop // branch for next block + + // wrap up digest update round 48:63 for final block +L_final_block: + rounds a, b, c, d, e, f, g, h, 48 + rounds e, f, g, h, a, b, c, d, 52 + rounds a, b, c, d, e, f, g, h, 56 + rounds e, f, g, h, a, b, c, d, 60 + + // ctx->states += digests a-h +#if defined (__x86_64__) + add a, 0*4(ctx) + add b, 1*4(ctx) + add c, 2*4(ctx) + add d, 3*4(ctx) + add e, 4*4(ctx) + add f, 5*4(ctx) + add g, 6*4(ctx) + add h, 7*4(ctx) +#else + mov ctx_addr, t + add a, 0*4(t) + add b, 1*4(t) + mov c, s + add s, 2*4(t) + add d, 3*4(t) + add e, 4*4(t) + mov 
f, s + add s, 5*4(t) + add g, 6*4(t) + mov h, s + add s, 7*4(t) +#endif + + // if kernel, restore xmm0-xmm7 +#if KERNEL + movdqa 0*16+xmm_save, %xmm0 + movdqa 1*16+xmm_save, %xmm1 + movdqa 2*16+xmm_save, %xmm2 + movdqa 3*16+xmm_save, %xmm3 + movdqa 4*16+xmm_save, %xmm4 + movdqa 5*16+xmm_save, %xmm5 + movdqa 6*16+xmm_save, %xmm6 + movdqa 7*16+xmm_save, %xmm7 +#endif + + // free allocated stack memory + add $stack_size, sp + + // restore callee-saved registers +#if defined (__x86_64__) + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + pop %rbp +#else + pop %edi + pop %esi + pop %ebx + pop %ebp +#endif + + // return + ret + + +#endif // x86_64/i386 + diff --git a/bsd/crypto/sha2/sha2.c b/bsd/crypto/sha2/sha2.c index c306068dc..603d32834 100644 --- a/bsd/crypto/sha2/sha2.c +++ b/bsd/crypto/sha2/sha2.c @@ -63,7 +63,7 @@ * */ -#ifndef assert(x) +#ifndef assert #define assert(x) do {} while(0) #endif @@ -202,13 +202,21 @@ typedef u_int64_t sha2_word64; /* Exactly 8 bytes */ * only. */ void SHA512_Last(SHA512_CTX*); +#if defined (SHA256_USE_ASSEMBLY) && (defined(__x86_64__)||defined(__i386__)) +void SHA256_Transform(SHA256_CTX*, const sha2_word32*, unsigned int num_blocks); +#else void SHA256_Transform(SHA256_CTX*, const sha2_word32*); +#endif void SHA512_Transform(SHA512_CTX*, const sha2_word64*); /*** SHA-XYZ INITIAL HASH VALUES AND CONSTANTS ************************/ /* Hash constant words K for SHA-256: */ +#if defined (SHA256_USE_ASSEMBLY) && (defined(__x86_64__)||defined(__i386__)) +const sha2_word32 K256[64] = { // assembly code will need to read this table +#else static const sha2_word32 K256[64] = { +#endif 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, 0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, 0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL, @@ -324,6 +332,8 @@ void SHA256_Init(SHA256_CTX* context) { context->bitcount = 0; } +#if !(defined (SHA256_USE_ASSEMBLY) && (defined(__x86_64__)||defined(__i386__))) + #ifdef 
SHA2_UNROLL_TRANSFORM /* Unrolled SHA-256 round macros: */ @@ -499,6 +509,8 @@ void SHA256_Transform(SHA256_CTX* context, const sha2_word32* data) { #endif /* SHA2_UNROLL_TRANSFORM */ +#endif // defined (SHA256_USE_ASSEMBLY) && (defined(__x86_64__)||defined(__i386__)) + void SHA256_Update(SHA256_CTX* context, const sha2_byte *data, size_t len) { unsigned int freespace, usedspace; @@ -521,7 +533,11 @@ void SHA256_Update(SHA256_CTX* context, const sha2_byte *data, size_t len) { context->bitcount += freespace << 3; len -= freespace; data += freespace; +#if defined (SHA256_USE_ASSEMBLY) && (defined(__x86_64__)||defined(__i386__)) + SHA256_Transform(context, (sha2_word32*)context->buffer, 1); +#else SHA256_Transform(context, (sha2_word32*)context->buffer); +#endif } else { /* The buffer is not yet full */ bcopy(data, &context->buffer[usedspace], len); @@ -531,6 +547,17 @@ void SHA256_Update(SHA256_CTX* context, const sha2_byte *data, size_t len) { return; } } +#if defined (SHA256_USE_ASSEMBLY) && (defined(__x86_64__)||defined(__i386__)) + { + unsigned int kk = len/SHA256_BLOCK_LENGTH; + if (kk>0) { + SHA256_Transform(context, (const sha2_word32*)data, kk); + context->bitcount += (SHA256_BLOCK_LENGTH << 3)*kk; + len -= SHA256_BLOCK_LENGTH*kk; + data += SHA256_BLOCK_LENGTH*kk; + } + } +#else while (len >= SHA256_BLOCK_LENGTH) { /* Process as many complete blocks as we can */ SHA256_Transform(context, (const sha2_word32*)data); @@ -538,6 +565,7 @@ void SHA256_Update(SHA256_CTX* context, const sha2_byte *data, size_t len) { len -= SHA256_BLOCK_LENGTH; data += SHA256_BLOCK_LENGTH; } +#endif if (len > 0) { /* There's left-overs, so save 'em */ bcopy(data, context->buffer, len); @@ -573,7 +601,11 @@ void SHA256_Final(sha2_byte digest[], SHA256_CTX* context) { bzero(&context->buffer[usedspace], SHA256_BLOCK_LENGTH - usedspace); } /* Do second-to-last transform: */ +#if defined (SHA256_USE_ASSEMBLY) && (defined(__x86_64__)||defined(__i386__)) + SHA256_Transform(context, 
(sha2_word32*)context->buffer, 1); +#else SHA256_Transform(context, (sha2_word32*)context->buffer); +#endif /* And set-up for the last transform: */ bzero(context->buffer, SHA256_SHORT_BLOCK_LENGTH); @@ -589,7 +621,11 @@ void SHA256_Final(sha2_byte digest[], SHA256_CTX* context) { *(sha2_word64*)&context->buffer[SHA256_SHORT_BLOCK_LENGTH] = context->bitcount; /* Final transform: */ +#if defined (SHA256_USE_ASSEMBLY) && (defined(__x86_64__)||defined(__i386__)) + SHA256_Transform(context, (sha2_word32*)context->buffer, 1); +#else SHA256_Transform(context, (sha2_word32*)context->buffer); +#endif #if BYTE_ORDER == LITTLE_ENDIAN { diff --git a/bsd/dev/Makefile b/bsd/dev/Makefile index b2f00140a..01f00592f 100644 --- a/bsd/dev/Makefile +++ b/bsd/dev/Makefile @@ -9,16 +9,12 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ INSTINC_SUBDIRS_X86_64 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS_X86_64 = \ diff --git a/bsd/dev/chud/chud_bsd_callback.c b/bsd/dev/chud/chud_bsd_callback.c index 6fad80050..a28bebf46 100644 --- a/bsd/dev/chud/chud_bsd_callback.c +++ b/bsd/dev/chud/chud_bsd_callback.c @@ -36,15 +36,10 @@ #include /* struct sysent */ #include #include /* KDEBUG_ENABLE_CHUD */ +#include /* kauth_cred_get */ #include - -#ifdef __ppc__ -#include - -#define FM_ARG0 0x38ULL // offset from r1 to first argument -#define SPILLED_WORD_COUNT 7 // number of 32-bit words spilled to the stack - -extern struct savearea * find_user_regs( thread_t act); +#if CONFIG_MACF +#include /* mac_system_check_chud */ #endif #pragma mark **** kern debug **** @@ -87,8 +82,6 @@ chudxnu_kdebug_callback_enter(chudxnu_kdebug_callback_func_t func) (void * volatile *)&kdebug_callback_fn)) { kdbg_control_chud(TRUE, (void *)chudxnu_private_kdebug_callback); - OSBitOrAtomic((UInt32)KDEBUG_ENABLE_CHUD, (volatile UInt32 *)&kdebug_enable); - return KERN_SUCCESS; } return KERN_FAILURE; @@ -97,7 +90,6 @@ 
chudxnu_kdebug_callback_enter(chudxnu_kdebug_callback_func_t func) __private_extern__ kern_return_t chudxnu_kdebug_callback_cancel(void) { - OSBitAndAtomic((UInt32)~(KDEBUG_ENABLE_CHUD), (volatile UInt32 *)&kdebug_enable); kdbg_control_chud(FALSE, NULL); chudxnu_kdebug_callback_func_t old = kdebug_callback_fn; @@ -175,40 +167,18 @@ static kern_return_t chud_null_syscall(uint64_t code __unused, int chud(__unused proc_t p, struct chud_args *uap, int32_t *retval) { +#if CONFIG_MACF + int error = mac_system_check_chud(kauth_cred_get()); + if (error) + return error; +#endif + chudxnu_syscall_callback_func_t fn = syscall_callback_fn; if(!fn) { return EINVAL; } -#ifdef __ppc__ - // ppc32 user land spills 2.5 64-bit args (5 x 32-bit) to the stack - // here we have to copy them out. r1 is the stack pointer in this world. - // the offset is calculated according to the PPC32 ABI - // Important: this only happens for 32-bit user threads - - if(!IS_64BIT_PROCESS(p)) { - struct savearea *regs = find_user_regs(current_thread()); - if(!regs) { - return EINVAL; - } - - // %r1 is the stack pointer on ppc32 - uint32_t stackPointer = regs->save_r1; - - // calculate number of bytes spilled to the stack - uint32_t spilledSize = sizeof(struct chud_args) - (sizeof(uint32_t) * SPILLED_WORD_COUNT); - - // obtain offset to arguments spilled onto user-thread stack - user_addr_t incomingAddr = (user_addr_t)stackPointer + FM_ARG0; - - // destination is halfway through arg3 - uint8_t *dstAddr = (uint8_t*)(&(uap->arg3)) + sizeof(uint32_t); - - copyin(incomingAddr, dstAddr, spilledSize); - } -#endif - *retval = fn(uap->code, uap->arg1, uap->arg2, uap->arg3, uap->arg4, uap->arg5); return 0; diff --git a/bsd/dev/dtrace/dtrace.c b/bsd/dev/dtrace/dtrace.c index 081f70dc3..745a0fa01 100644 --- a/bsd/dev/dtrace/dtrace.c +++ b/bsd/dev/dtrace/dtrace.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. */ @@ -100,6 +100,7 @@ #include #include #include +#include #include #include #include @@ -112,13 +113,17 @@ #include #include #include +#include #include #include #include #if defined(__APPLE__) +#include extern uint32_t pmap_find_phys(void *, uint64_t); extern boolean_t pmap_valid_page(uint32_t); +extern void OSKextRegisterKextsWithDTrace(void); +extern kmod_info_t g_kernel_kmod_info; #endif /* __APPLE__ */ @@ -140,6 +145,7 @@ extern void dtrace_postinit(void); extern kern_return_t chudxnu_dtrace_callback (uint64_t selector, uint64_t *args, uint32_t count); + #endif /* __APPLE__ */ /* @@ -170,7 +176,7 @@ size_t dtrace_global_maxsize = (16 * 1024); size_t dtrace_actions_max = (16 * 1024); size_t dtrace_retain_max = 1024; dtrace_optval_t dtrace_helper_actions_max = 32; -dtrace_optval_t dtrace_helper_providers_max = 32; +dtrace_optval_t dtrace_helper_providers_max = 64; dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024); size_t dtrace_strsize_default = 256; dtrace_optval_t dtrace_cleanrate_default = 9900990; /* 101 hz */ @@ -238,6 +244,12 @@ static dtrace_genid_t dtrace_retained_gen; /* current retained enab gen */ static dtrace_dynvar_t dtrace_dynhash_sink; /* end of dynamic hash chains */ #if defined(__APPLE__) static int dtrace_dof_mode; /* See dtrace_impl.h for a description of Darwin's dof modes. */ + + /* + * This does't quite fit as an internal variable, as it must be accessed in + * fbt_provide and sdt_provide. Its clearly not a dtrace tunable variable either... + */ +int dtrace_kernel_symbol_mode; /* See dtrace_impl.h for a description of Darwin's kernel symbol modes. 
*/ #endif #if defined(__APPLE__) @@ -249,6 +261,8 @@ static int dtrace_dof_mode; /* See dtrace_impl.h for a description of Darwin's */ struct zone *dtrace_probe_t_zone; + +static int dtrace_module_unloaded(struct kmod_info *kmod); #endif /* __APPLE__ */ /* @@ -328,10 +342,16 @@ static void dtrace_nullop(void) {} +static int +dtrace_enable_nullop(void) +{ + return (0); +} + static dtrace_pops_t dtrace_provider_ops = { (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop, (void (*)(void *, struct modctl *))dtrace_nullop, - (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, + (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop, (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, @@ -429,8 +449,8 @@ static lck_mtx_t dtrace_errlock; (where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \ (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \ } -#else -#if (defined(__x86_64__) || defined(__ppc64__)) +#else +#if defined(__x86_64__) /* FIXME: two function calls!! */ #define DTRACE_TLS_THRKEY(where) { \ uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \ @@ -542,12 +562,11 @@ dtrace_load##bits(uintptr_t addr) \ return (!(*flags & CPU_DTRACE_FAULT) ? 
rval : 0); \ } #else /* __APPLE__ */ -#define RECOVER_LABEL(bits) __asm__ volatile("_dtraceLoadRecover" #bits ":" ); +#define RECOVER_LABEL(bits) dtraceLoadRecover##bits: #if (defined(__i386__) || defined (__x86_64__)) #define DTRACE_LOADFUNC(bits) \ /*CSTYLED*/ \ -extern vm_offset_t dtraceLoadRecover##bits; \ uint##bits##_t dtrace_load##bits(uintptr_t addr); \ \ uint##bits##_t \ @@ -578,7 +597,7 @@ dtrace_load##bits(uintptr_t addr) \ } \ \ { \ - volatile vm_offset_t recover = (vm_offset_t)&dtraceLoadRecover##bits; \ + volatile vm_offset_t recover = (vm_offset_t)&&dtraceLoadRecover##bits; \ *flags |= CPU_DTRACE_NOFAULT; \ recover = dtrace_set_thread_recover(current_thread(), recover); \ /*CSTYLED*/ \ @@ -598,7 +617,6 @@ dtrace_load##bits(uintptr_t addr) \ #else /* all other architectures */ #define DTRACE_LOADFUNC(bits) \ /*CSTYLED*/ \ -extern vm_offset_t dtraceLoadRecover##bits; \ uint##bits##_t dtrace_load##bits(uintptr_t addr); \ \ uint##bits##_t \ @@ -629,7 +647,7 @@ dtrace_load##bits(uintptr_t addr) \ } \ \ { \ - volatile vm_offset_t recover = (vm_offset_t)&dtraceLoadRecover##bits; \ + volatile vm_offset_t recover = (vm_offset_t)&&dtraceLoadRecover##bits; \ *flags |= CPU_DTRACE_NOFAULT; \ recover = dtrace_set_thread_recover(current_thread(), recover); \ /*CSTYLED*/ \ @@ -654,6 +672,7 @@ dtrace_load##bits(uintptr_t addr) \ #define DTRACE_DYNHASH_SINK 1 #define DTRACE_DYNHASH_VALID 2 +#define DTRACE_MATCH_FAIL -1 #define DTRACE_MATCH_NEXT 0 #define DTRACE_MATCH_DONE 1 #define DTRACE_ANCHORED(probe) ((probe)->dtpr_func[0] != '\0') @@ -1291,12 +1310,12 @@ dtrace_priv_proc_common_user(dtrace_state_t *state) #else if ((cr = dtrace_CRED()) != NULL && #endif /* __APPLE__ */ - s_cr->cr_uid == cr->cr_uid && - s_cr->cr_uid == cr->cr_ruid && - s_cr->cr_uid == cr->cr_suid && - s_cr->cr_gid == cr->cr_gid && - s_cr->cr_gid == cr->cr_rgid && - s_cr->cr_gid == cr->cr_sgid) + posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_uid && + posix_cred_get(s_cr)->cr_uid == 
posix_cred_get(cr)->cr_ruid && + posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_suid && + posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_gid && + posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_rgid && + posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_sgid) return (1); return (0); @@ -4946,15 +4965,20 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, #if !defined(__APPLE__) ipaddr_t ip4; #else - in_addr_t ip4; + uint32_t ip4; #endif /* __APPLE__ */ uint8_t *ptr8, val; /* * Safely load the IPv4 address. */ +#if !defined(__APPLE__) ip4 = dtrace_load32(tupregs[argi].dttk_value); - +#else + dtrace_bcopy( + (void *)(uintptr_t)tupregs[argi].dttk_value, + (void *)(uintptr_t)&ip4, sizeof (ip4)); +#endif /* __APPLE__ */ /* * Check an IPv4 string will fit in scratch. */ @@ -6180,7 +6204,7 @@ dtrace_action_raise(uint64_t sig) if (uthread && uthread->t_dtrace_sig == 0) { uthread->t_dtrace_sig = sig; - astbsd_on(); + act_set_astbsd(current_thread()); } #endif /* __APPLE__ */ } @@ -6198,21 +6222,55 @@ dtrace_action_stop(void) aston(curthread); } #else - uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread()); - - if (uthread && uthread->t_dtrace_stop == 0) { + uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread()); + if (uthread) { + /* + * The currently running process will be set to task_suspend + * when it next leaves the kernel. 
+ */ uthread->t_dtrace_stop = 1; - astbsd_on(); + act_set_astbsd(current_thread()); } + #endif /* __APPLE__ */ } +#if defined(__APPLE__) +static void +dtrace_action_pidresume(uint64_t pid) +{ + if (dtrace_destructive_disallow) + return; + + if (kauth_cred_issuser(kauth_cred_get()) == 0) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); + return; + } + + uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread()); + + /* + * When the currently running process leaves the kernel, it attempts to + * task_resume the process (denoted by pid), if that pid appears to have + * been stopped by dtrace_action_stop(). + * The currently running process has a pidresume() queue depth of 1 -- + * subsequent invocations of the pidresume() action are ignored. + */ + + if (pid != 0 && uthread && uthread->t_dtrace_resumepid == 0) { + uthread->t_dtrace_resumepid = pid; + act_set_astbsd(current_thread()); + } +} +#endif /* __APPLE__ */ + + static void dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val) { hrtime_t now; volatile uint16_t *flags; - cpu_t *cpu = CPU; + dtrace_cpu_t *cpu = CPU; if (dtrace_destructive_disallow) return; @@ -6601,17 +6659,21 @@ __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1, ASSERT(s_cr != NULL); + /* + * XXX this is hackish, but so is setting a variable + * XXX in a McCarthy OR... 
+ */ #if !defined(__APPLE__) if ((cr = CRED()) == NULL || #else if ((cr = dtrace_CRED()) == NULL || #endif /* __APPLE__ */ - s_cr->cr_uid != cr->cr_uid || - s_cr->cr_uid != cr->cr_ruid || - s_cr->cr_uid != cr->cr_suid || - s_cr->cr_gid != cr->cr_gid || - s_cr->cr_gid != cr->cr_rgid || - s_cr->cr_gid != cr->cr_sgid || + posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_uid || + posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_ruid || + posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_suid || + posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_gid || + posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_rgid || + posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_sgid || #if !defined(__APPLE__) (proc = ttoproc(curthread)) == NULL || (proc->p_flag & SNOCD)) @@ -6868,6 +6930,13 @@ __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1, dtrace_action_raise(val); continue; +#if defined(__APPLE__) + case DTRACEACT_PIDRESUME: + if (dtrace_priv_proc_destructive(state)) + dtrace_action_pidresume(val); + continue; +#endif /* __APPLE__ */ + case DTRACEACT_COMMIT: ASSERT(!committed); @@ -7126,12 +7195,13 @@ __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1, on some function in the transitive closure of the call to dtrace_probe(). Solaris has some strong guarantees that this won't happen, the Darwin implementation is not so mature as to make those guarantees. 
*/ + void dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1, uint64_t arg2, uint64_t arg3, uint64_t arg4) { thread_t thread = current_thread(); - + disable_preemption(); if (id == dtrace_probeid_error) { __dtrace_probe(id, arg0, arg1, arg2, arg3, arg4); dtrace_getipl(); /* Defeat tail-call optimization of __dtrace_probe() */ @@ -7143,6 +7213,7 @@ dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1, #if DEBUG else __dtrace_probe(dtrace_probeid_error, 0, id, 1, -1, DTRACEFLT_UNKNOWN); #endif + enable_preemption(); } #endif /* __APPLE__ */ @@ -7733,7 +7804,7 @@ dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid, { dtrace_probe_t template, *probe; dtrace_hash_t *hash = NULL; - int len, best = INT_MAX, nmatched = 0; + int len, rc, best = INT_MAX, nmatched = 0; dtrace_id_t i; lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED); @@ -7745,7 +7816,8 @@ dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid, if (pkp->dtpk_id != DTRACE_IDNONE) { if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL && dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) { - (void) (*matched)(probe, arg); + if ((*matched)(probe, arg) == DTRACE_MATCH_FAIL) + return (DTRACE_MATCH_FAIL); nmatched++; } return (nmatched); @@ -7802,8 +7874,11 @@ dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid, nmatched++; - if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT) - break; + if ((rc = (*matched)(probe, arg)) != DTRACE_MATCH_NEXT) { + if (rc == DTRACE_MATCH_FAIL) + return (DTRACE_MATCH_FAIL); + break; + } } return (nmatched); @@ -7822,8 +7897,11 @@ dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid, nmatched++; - if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT) - break; + if ((rc = (*matched)(probe, arg)) != DTRACE_MATCH_NEXT) { + if (rc == DTRACE_MATCH_FAIL) + return (DTRACE_MATCH_FAIL); + break; + } } return (nmatched); @@ -8051,7 +8129,7 @@ dtrace_unregister(dtrace_provider_id_t id) dtrace_probe_t *probe, *first = NULL; 
if (old->dtpv_pops.dtps_enable == - (void (*)(void *, dtrace_id_t, void *))dtrace_nullop) { + (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop) { /* * If DTrace itself is the provider, we're called with locks * already held. @@ -8201,7 +8279,7 @@ dtrace_invalidate(dtrace_provider_id_t id) dtrace_provider_t *pvp = (dtrace_provider_t *)id; ASSERT(pvp->dtpv_pops.dtps_enable != - (void (*)(void *, dtrace_id_t, void *))dtrace_nullop); + (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop); lck_mtx_lock(&dtrace_provider_lock); lck_mtx_lock(&dtrace_lock); @@ -8242,7 +8320,7 @@ dtrace_condense(dtrace_provider_id_t id) * Make sure this isn't the dtrace provider itself. */ ASSERT(prov->dtpv_pops.dtps_enable != - (void (*)(void *, dtrace_id_t, void *))dtrace_nullop); + (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop); lck_mtx_lock(&dtrace_provider_lock); lck_mtx_lock(&dtrace_lock); @@ -8508,7 +8586,6 @@ dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv) { struct modctl *ctl; int all = 0; -#pragma unused(ctl) /* __APPLE__ */ lck_mtx_assert(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED); @@ -8516,22 +8593,22 @@ dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv) all = 1; prv = dtrace_provider; } - + do { /* * First, call the blanket provide operation. */ prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc); - -#if !defined(__APPLE__) + /* * Now call the per-module provide operation. We will grab * mod_lock to prevent the list from being modified. Note * that this also prevents the mod_busy bits from changing. * (mod_busy can only be changed with mod_lock held.) 
*/ - mutex_enter(&mod_lock); - + lck_mtx_lock(&mod_lock); + +#if !defined(__APPLE__) ctl = &modules; do { if (ctl->mod_busy || ctl->mod_mp == NULL) @@ -8540,29 +8617,15 @@ dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv) prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl); } while ((ctl = ctl->mod_next) != &modules); - - mutex_exit(&mod_lock); #else -#if 0 /* FIXME: Workaround for PR_4643546 */ - /* NOTE: kmod_lock has been removed. */ - simple_lock(&kmod_lock); - - kmod_info_t *ktl = kmod; - while (ktl) { - prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ktl); - ktl = ktl->next; + ctl = dtrace_modctl_list; + while (ctl) { + prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl); + ctl = ctl->mod_next; } - - simple_unlock(&kmod_lock); -#else - /* - * Don't bother to iterate over the kmod list. At present only fbt - * offers a provide_module in its dtpv_pops, and then it ignores the - * module anyway. - */ - prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, NULL); #endif -#endif /* __APPLE__ */ + + lck_mtx_unlock(&mod_lock); } while (all && (prv = prv->dtpv_next) != NULL); } @@ -9295,7 +9358,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs, break; default: - err += efunc(dp->dtdo_len - 1, "bad return size"); + err += efunc(dp->dtdo_len - 1, "bad return size\n"); } } @@ -10356,7 +10419,7 @@ dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe) return (ecb); } -static void +static int dtrace_ecb_enable(dtrace_ecb_t *ecb) { dtrace_probe_t *probe = ecb->dte_probe; @@ -10369,7 +10432,7 @@ dtrace_ecb_enable(dtrace_ecb_t *ecb) /* * This is the NULL probe -- there's nothing to do. 
*/ - return; + return(0); } if (probe->dtpr_ecb == NULL) { @@ -10383,8 +10446,8 @@ dtrace_ecb_enable(dtrace_ecb_t *ecb) if (ecb->dte_predicate != NULL) probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid; - prov->dtpv_pops.dtps_enable(prov->dtpv_arg, - probe->dtpr_id, probe->dtpr_arg); + return (prov->dtpv_pops.dtps_enable(prov->dtpv_arg, + probe->dtpr_id, probe->dtpr_arg)); } else { /* * This probe is already active. Swing the last pointer to @@ -10397,6 +10460,7 @@ dtrace_ecb_enable(dtrace_ecb_t *ecb) probe->dtpr_predcache = 0; dtrace_sync(); + return(0); } } @@ -10860,6 +10924,9 @@ dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc) case DTRACEACT_CHILL: case DTRACEACT_DISCARD: case DTRACEACT_RAISE: +#if defined(__APPLE__) + case DTRACEACT_PIDRESUME: +#endif /* __APPLE__ */ if (dp == NULL) return (EINVAL); break; @@ -11196,7 +11263,9 @@ dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg) if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL) return (DTRACE_MATCH_DONE); - dtrace_ecb_enable(ecb); + if (dtrace_ecb_enable(ecb) < 0) + return (DTRACE_MATCH_FAIL); + return (DTRACE_MATCH_NEXT); } @@ -11313,7 +11382,7 @@ static int dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags, processorid_t cpu) { - cpu_t *cp; + dtrace_cpu_t *cp; dtrace_buffer_t *buf; lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED); @@ -12052,7 +12121,7 @@ static int dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched) { int i = 0; - int matched = 0; + int total_matched = 0, matched = 0; lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED); lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED); @@ -12063,7 +12132,14 @@ dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched) enab->dten_current = ep; enab->dten_error = 0; - matched += dtrace_probe_enable(&ep->dted_probe, enab); + /* + * If a provider failed to enable a probe then get out and + * let the consumer know we failed. 
+ */ + if ((matched = dtrace_probe_enable(&ep->dted_probe, enab)) < 0) + return (EBUSY); + + total_matched += matched; if (enab->dten_error != 0) { /* @@ -12091,7 +12167,7 @@ dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched) enab->dten_probegen = dtrace_probegen; if (nmatched != NULL) - *nmatched = matched; + *nmatched = total_matched; return (0); } @@ -12351,16 +12427,22 @@ dtrace_dof_copyin(user_addr_t uarg, int *errp) #if !defined(__APPLE__) dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP); - if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0) { + if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0 || + dof->dofh_loadsz != hdr.dofh_loadsz) { + kmem_free(dof, hdr.dofh_loadsz); + *errp = EFAULT; + return (NULL); + } #else dof = dt_kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP); - if (copyin(uarg, dof, hdr.dofh_loadsz) != 0) { + if (copyin(uarg, dof, hdr.dofh_loadsz) != 0 || + dof->dofh_loadsz != hdr.dofh_loadsz) { + dt_kmem_free_aligned(dof, hdr.dofh_loadsz); + *errp = EFAULT; + return (NULL); + } #endif - dt_kmem_free_aligned(dof, hdr.dofh_loadsz); - *errp = EFAULT; - return (NULL); - } return (dof); } @@ -16079,30 +16161,257 @@ dtrace_helpers_duplicate(proc_t *from, proc_t *to) /* * DTrace Hook Functions */ + +#if defined(__APPLE__) +/* + * Routines to manipulate the modctl list within dtrace + */ + +modctl_t *dtrace_modctl_list; + +static void +dtrace_modctl_add(struct modctl * newctl) +{ + struct modctl *nextp, *prevp; + + ASSERT(newctl != NULL); + lck_mtx_assert(&mod_lock, LCK_MTX_ASSERT_OWNED); + + // Insert new module at the front of the list, + + newctl->mod_next = dtrace_modctl_list; + dtrace_modctl_list = newctl; + + /* + * If a module exists with the same name, then that module + * must have been unloaded with enabled probes. We will move + * the unloaded module to the new module's stale chain and + * then stop traversing the list. 
+ */ + + prevp = newctl; + nextp = newctl->mod_next; + + while (nextp != NULL) { + if (nextp->mod_loaded) { + /* This is a loaded module. Keep traversing. */ + prevp = nextp; + nextp = nextp->mod_next; + continue; + } + else { + /* Found an unloaded module */ + if (strncmp (newctl->mod_modname, nextp->mod_modname, KMOD_MAX_NAME)) { + /* Names don't match. Keep traversing. */ + prevp = nextp; + nextp = nextp->mod_next; + continue; + } + else { + /* We found a stale entry, move it. We're done. */ + prevp->mod_next = nextp->mod_next; + newctl->mod_stale = nextp; + nextp->mod_next = NULL; + break; + } + } + } +} + +static modctl_t * +dtrace_modctl_lookup(struct kmod_info * kmod) +{ + lck_mtx_assert(&mod_lock, LCK_MTX_ASSERT_OWNED); + + struct modctl * ctl; + + for (ctl = dtrace_modctl_list; ctl; ctl=ctl->mod_next) { + if (ctl->mod_id == kmod->id) + return(ctl); + } + return (NULL); +} + +/* + * This routine is called from dtrace_module_unloaded(). + * It removes a modctl structure and its stale chain + * from the kext shadow list. 
+ */ +static void +dtrace_modctl_remove(struct modctl * ctl) +{ + ASSERT(ctl != NULL); + lck_mtx_assert(&mod_lock, LCK_MTX_ASSERT_OWNED); + modctl_t *prevp, *nextp, *curp; + + // Remove stale chain first + for (curp=ctl->mod_stale; curp != NULL; curp=nextp) { + nextp = curp->mod_stale; + /* There should NEVER be user symbols allocated at this point */ + ASSERT(curp->mod_user_symbols == NULL); + kmem_free(curp, sizeof(modctl_t)); + } + + prevp = NULL; + curp = dtrace_modctl_list; + + while (curp != ctl) { + prevp = curp; + curp = curp->mod_next; + } + + if (prevp != NULL) { + prevp->mod_next = ctl->mod_next; + } + else { + dtrace_modctl_list = ctl->mod_next; + } + + /* There should NEVER be user symbols allocated at this point */ + ASSERT(ctl->mod_user_symbols == NULL); + + kmem_free (ctl, sizeof(modctl_t)); +} + +#endif /* __APPLE__ */ + +/* + * APPLE NOTE: The kext loader will call dtrace_module_loaded + * when the kext is loaded in memory, but before calling the + * kext's start routine. + * + * Return 0 on success + * Return -1 on failure + */ + +#if !defined (__APPLE__) static void dtrace_module_loaded(struct modctl *ctl) +#else +static int +dtrace_module_loaded(struct kmod_info *kmod) +#endif /* __APPLE__ */ { dtrace_provider_t *prv; - lck_mtx_lock(&dtrace_provider_lock); - lck_mtx_lock(&mod_lock); - #if !defined(__APPLE__) + mutex_enter(&dtrace_provider_lock); + mutex_enter(&mod_lock); + ASSERT(ctl->mod_busy); #else - /* FIXME: awaits kmod awareness PR_4648477. */ -#endif /* __APPLE__ */ + + /* + * If kernel symbols have been disabled, return immediately + * DTRACE_KERNEL_SYMBOLS_NEVER is a permanent mode, it is safe to test without holding locks + */ + if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER) + return 0; + + struct modctl *ctl = NULL; + if (!kmod || kmod->address == 0 || kmod->size == 0) + return(-1); + + lck_mtx_lock(&dtrace_provider_lock); + lck_mtx_lock(&mod_lock); + + /* + * Have we seen this kext before? 
+ */ + ctl = dtrace_modctl_lookup(kmod); + + if (ctl != NULL) { + /* bail... we already have this kext in the modctl list */ + lck_mtx_unlock(&mod_lock); + lck_mtx_unlock(&dtrace_provider_lock); + if (dtrace_err_verbose) + cmn_err(CE_WARN, "dtrace load module already exists '%s %u' is failing against '%s %u'", kmod->name, (uint_t)kmod->id, ctl->mod_modname, ctl->mod_id); + return(-1); + } + else { + ctl = kmem_alloc(sizeof(struct modctl), KM_SLEEP); + if (ctl == NULL) { + if (dtrace_err_verbose) + cmn_err(CE_WARN, "dtrace module load '%s %u' is failing ", kmod->name, (uint_t)kmod->id); + lck_mtx_unlock(&mod_lock); + lck_mtx_unlock(&dtrace_provider_lock); + return (-1); + } + ctl->mod_next = NULL; + ctl->mod_stale = NULL; + strlcpy (ctl->mod_modname, kmod->name, sizeof(ctl->mod_modname)); + ctl->mod_loadcnt = kmod->id; + ctl->mod_nenabled = 0; + ctl->mod_address = kmod->address; + ctl->mod_size = kmod->size; + ctl->mod_id = kmod->id; + ctl->mod_loaded = 1; + ctl->mod_flags = 0; + ctl->mod_user_symbols = NULL; + + /* + * Find the UUID for this module, if it has one + */ + kernel_mach_header_t* header = (kernel_mach_header_t *)ctl->mod_address; + struct load_command* load_cmd = (struct load_command *)&header[1]; + uint32_t i; + for (i = 0; i < header->ncmds; i++) { + if (load_cmd->cmd == LC_UUID) { + struct uuid_command* uuid_cmd = (struct uuid_command *)load_cmd; + memcpy(ctl->mod_uuid, uuid_cmd->uuid, sizeof(uuid_cmd->uuid)); + ctl->mod_flags |= MODCTL_HAS_UUID; + break; + } + load_cmd = (struct load_command *)((caddr_t)load_cmd + load_cmd->cmdsize); + } + + if (ctl->mod_address == g_kernel_kmod_info.address) { + ctl->mod_flags |= MODCTL_IS_MACH_KERNEL; + } + } + dtrace_modctl_add(ctl); + + /* + * We must hold the dtrace_lock to safely test non permanent dtrace_fbt_symbol_mode(s) + */ + lck_mtx_lock(&dtrace_lock); + + /* + * If the module does not have a valid UUID, we will not be able to find symbols for it from + * userspace. Go ahead and instrument it now. 
+ */ + if (MOD_HAS_UUID(ctl) && (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE)) { + lck_mtx_unlock(&dtrace_lock); + lck_mtx_unlock(&mod_lock); + lck_mtx_unlock(&dtrace_provider_lock); + return 0; + } + + ctl->mod_flags |= MODCTL_HAS_KERNEL_SYMBOLS; + + lck_mtx_unlock(&dtrace_lock); +#endif /* __APPLE__ */ + /* * We're going to call each providers per-module provide operation * specifying only this module. */ for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next) - prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl); - + prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl); + +#if defined(__APPLE__) + /* + * The contract with the kext loader is that once this function has completed, + * it may delete kernel symbols at will. We must set this while still holding + * the mod_lock. + */ + ctl->mod_flags &= ~MODCTL_HAS_KERNEL_SYMBOLS; +#endif + lck_mtx_unlock(&mod_lock); lck_mtx_unlock(&dtrace_provider_lock); - + /* * If we have any retained enablings, we need to match against them. * Enabling probes requires that cpu_lock be held, and we cannot hold @@ -16112,17 +16421,22 @@ dtrace_module_loaded(struct modctl *ctl) * our task queue to do the match for us. */ lck_mtx_lock(&dtrace_lock); - + if (dtrace_retained == NULL) { lck_mtx_unlock(&dtrace_lock); +#if !defined(__APPLE__) return; +#else + return 0; +#endif } - + +#if !defined(__APPLE__) (void) taskq_dispatch(dtrace_taskq, - (task_func_t *)dtrace_enabling_matchall, NULL, TQ_SLEEP); - - lck_mtx_unlock(&dtrace_lock); - + (task_func_t *)dtrace_enabling_matchall, NULL, TQ_SLEEP); + + mutex_exit(&dtrace_lock); + /* * And now, for a little heuristic sleaze: in general, we want to * match modules as soon as they load. However, we cannot guarantee @@ -16134,8 +16448,23 @@ dtrace_module_loaded(struct modctl *ctl) * just loaded may not be immediately instrumentable. */ delay(1); +#else + /* APPLE NOTE! 
+ * + * The cpu_lock mentioned above is only held by dtrace code, Apple's xnu never actually + * holds it for any reason. Thus the comment above is invalid, we can directly invoke + * dtrace_enabling_matchall without jumping through all the hoops, and we can avoid + * the delay call as well. + */ + lck_mtx_unlock(&dtrace_lock); + + dtrace_enabling_matchall(); + + return 0; +#endif /* __APPLE__ */ } - + +#if !defined(__APPLE__) static void dtrace_module_unloaded(struct modctl *ctl) { @@ -16144,27 +16473,27 @@ dtrace_module_unloaded(struct modctl *ctl) template.dtpr_mod = ctl->mod_modname; - lck_mtx_lock(&dtrace_provider_lock); - lck_mtx_lock(&mod_lock); - lck_mtx_lock(&dtrace_lock); + mutex_enter(&dtrace_provider_lock); + mutex_enter(&mod_lock); + mutex_enter(&dtrace_lock); if (dtrace_bymod == NULL) { /* * The DTrace module is loaded (obviously) but not attached; * we don't have any work to do. */ - lck_mtx_unlock(&dtrace_provider_lock); - lck_mtx_unlock(&mod_lock); - lck_mtx_unlock(&dtrace_lock); + mutex_exit(&dtrace_provider_lock); + mutex_exit(&mod_lock); + mutex_exit(&dtrace_lock); return; } for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template); probe != NULL; probe = probe->dtpr_nextmod) { if (probe->dtpr_ecb != NULL) { - lck_mtx_unlock(&dtrace_provider_lock); - lck_mtx_unlock(&mod_lock); - lck_mtx_unlock(&dtrace_lock); + mutex_exit(&dtrace_provider_lock); + mutex_exit(&mod_lock); + mutex_exit(&dtrace_lock); /* * This shouldn't _actually_ be possible -- we're @@ -16222,25 +16551,185 @@ dtrace_module_unloaded(struct modctl *ctl) kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1); kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1); vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1); -#if !defined(__APPLE__) kmem_free(probe, sizeof (dtrace_probe_t)); -#else - zfree(dtrace_probe_t_zone, probe); -#endif /* __APPLE__ */ } - lck_mtx_unlock(&dtrace_lock); - lck_mtx_unlock(&mod_lock); - lck_mtx_unlock(&dtrace_provider_lock); + 
mutex_exit(&dtrace_lock); + mutex_exit(&mod_lock); + mutex_exit(&dtrace_provider_lock); } +#else /* __APPLE__ */ -void -dtrace_suspend(void) +/* + * Return 0 on success + * Return -1 on failure + */ +static int +dtrace_module_unloaded(struct kmod_info *kmod) { - dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend)); -} + dtrace_probe_t template, *probe, *first, *next; + dtrace_provider_t *prov; + struct modctl *ctl = NULL; + struct modctl *syncctl = NULL; + struct modctl *nextsyncctl = NULL; + int syncmode = 0; + + lck_mtx_lock(&dtrace_provider_lock); + lck_mtx_lock(&mod_lock); + lck_mtx_lock(&dtrace_lock); -void + if (kmod == NULL) { + syncmode = 1; + } + else { + ctl = dtrace_modctl_lookup(kmod); + if (ctl == NULL) + { + lck_mtx_unlock(&dtrace_lock); + lck_mtx_unlock(&mod_lock); + lck_mtx_unlock(&dtrace_provider_lock); + return (-1); + } + ctl->mod_loaded = 0; + ctl->mod_address = 0; + ctl->mod_size = 0; + } + + if (dtrace_bymod == NULL) { + /* + * The DTrace module is loaded (obviously) but not attached; + * we don't have any work to do. + */ + if (ctl != NULL) + (void)dtrace_modctl_remove(ctl); + lck_mtx_unlock(&dtrace_provider_lock); + lck_mtx_unlock(&mod_lock); + lck_mtx_unlock(&dtrace_lock); + return(0); + } + + /* Syncmode set means we target and traverse entire modctl list. 
*/ + if (syncmode) + nextsyncctl = dtrace_modctl_list; + +syncloop: + if (syncmode) + { + /* find a stale modctl struct */ + for (syncctl = nextsyncctl; syncctl != NULL; syncctl=syncctl->mod_next) { + if (syncctl->mod_address == 0) + break; + } + if (syncctl==NULL) + { + /* We have no more work to do */ + lck_mtx_unlock(&dtrace_provider_lock); + lck_mtx_unlock(&mod_lock); + lck_mtx_unlock(&dtrace_lock); + return(0); + } + else { + /* keep track of next syncctl in case this one is removed */ + nextsyncctl = syncctl->mod_next; + ctl = syncctl; + } + } + + template.dtpr_mod = ctl->mod_modname; + + for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template); + probe != NULL; probe = probe->dtpr_nextmod) { + if (probe->dtpr_ecb != NULL) { + /* + * This shouldn't _actually_ be possible -- we're + * unloading a module that has an enabled probe in it. + * (It's normally up to the provider to make sure that + * this can't happen.) However, because dtps_enable() + * doesn't have a failure mode, there can be an + * enable/unload race. Upshot: we don't want to + * assert, but we're not going to disable the + * probe, either. 
+ */ + + + if (syncmode) { + /* We're syncing, let's look at next in list */ + goto syncloop; + } + + lck_mtx_unlock(&dtrace_provider_lock); + lck_mtx_unlock(&mod_lock); + lck_mtx_unlock(&dtrace_lock); + + if (dtrace_err_verbose) { + cmn_err(CE_WARN, "unloaded module '%s' had " + "enabled probes", ctl->mod_modname); + } + return(-1); + } + } + + probe = first; + + for (first = NULL; probe != NULL; probe = next) { + ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe); + + dtrace_probes[probe->dtpr_id - 1] = NULL; + + next = probe->dtpr_nextmod; + dtrace_hash_remove(dtrace_bymod, probe); + dtrace_hash_remove(dtrace_byfunc, probe); + dtrace_hash_remove(dtrace_byname, probe); + + if (first == NULL) { + first = probe; + probe->dtpr_nextmod = NULL; + } else { + probe->dtpr_nextmod = first; + first = probe; + } + } + + /* + * We've removed all of the module's probes from the hash chains and + * from the probe array. Now issue a dtrace_sync() to be sure that + * everyone has cleared out from any probe array processing. 
+ */ + dtrace_sync(); + + for (probe = first; probe != NULL; probe = first) { + first = probe->dtpr_nextmod; + prov = probe->dtpr_provider; + prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id, + probe->dtpr_arg); + kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1); + kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1); + kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1); + vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1); + + zfree(dtrace_probe_t_zone, probe); + } + + dtrace_modctl_remove(ctl); + + if (syncmode) + goto syncloop; + + lck_mtx_unlock(&dtrace_lock); + lck_mtx_unlock(&mod_lock); + lck_mtx_unlock(&dtrace_provider_lock); + + return(0); +} +#endif /* __APPLE__ */ + +void +dtrace_suspend(void) +{ + dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend)); +} + +void dtrace_resume(void) { dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume)); @@ -16463,13 +16952,6 @@ dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) dtrace_provider, NULL, NULL, "END", 0, NULL); dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t) dtrace_provider, NULL, NULL, "ERROR", 1, NULL); -#elif defined(__ppc__) || defined(__ppc64__) - dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t) - dtrace_provider, NULL, NULL, "BEGIN", 2, NULL); - dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t) - dtrace_provider, NULL, NULL, "END", 1, NULL); - dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t) - dtrace_provider, NULL, NULL, "ERROR", 4, NULL); #elif (defined(__i386__) || defined (__x86_64__)) dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t) dtrace_provider, NULL, NULL, "BEGIN", 1, NULL); @@ -16505,6 +16987,15 @@ dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) if (dtrace_anon.dta_enabling != NULL) { ASSERT(dtrace_retained == dtrace_anon.dta_enabling); +#if defined(__APPLE__) + /* + * If there is anonymous dof, we should switch symbol modes. 
+ */ + if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) { + dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL; + } +#endif + dtrace_enabling_provide(NULL); state = dtrace_anon.dta_state; @@ -16612,7 +17103,7 @@ dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) lck_mtx_unlock(&cpu_lock); if (state == NULL) { - if (--dtrace_opens == 0) + if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE); lck_mtx_unlock(&dtrace_lock); return (EAGAIN); @@ -16624,7 +17115,7 @@ dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) lck_mtx_unlock(&cpu_lock); if (rv != 0 || state == NULL) { - if (--dtrace_opens == 0) + if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE); lck_mtx_unlock(&dtrace_lock); /* propagate EAGAIN or ERESTART */ @@ -16656,6 +17147,27 @@ dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) } lck_rw_unlock_exclusive(&dtrace_dof_mode_lock); + + /* + * Update kernel symbol state. + * + * We must own the provider and dtrace locks. + * + * NOTE! It may appear there is a race by setting this value so late + * after dtrace_probe_provide. However, any kext loaded after the + * call to probe provide and before we set LAZY_OFF will be marked as + * eligible for symbols from userspace. The same dtrace that is currently + * calling dtrace_open() (this call!) will get a list of kexts needing + * symbols and fill them in, thus closing the race window. + * + * We want to set this value only after it certain it will succeed, as + * this significantly reduces the complexity of error exits. 
+ */ + lck_mtx_lock(&dtrace_lock); + if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) { + dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL; + } + lck_mtx_unlock(&dtrace_lock); #endif /* __APPLE__ */ return (0); @@ -16691,31 +17203,52 @@ dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p) dtrace_state_destroy(state); ASSERT(dtrace_opens > 0); - if (--dtrace_opens == 0) - (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE); + /* + * Only relinquish control of the kernel debugger interface when there + * are no consumers and no anonymous enablings. + */ + if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) + (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE); + lck_mtx_unlock(&dtrace_lock); lck_mtx_unlock(&cpu_lock); #if defined(__APPLE__) - /* * Lock ordering requires the dof mode lock be taken before * the dtrace_lock. */ lck_rw_lock_exclusive(&dtrace_dof_mode_lock); lck_mtx_lock(&dtrace_lock); + + if (dtrace_opens == 0) { + /* + * If we are currently lazy-off, and this is the last close, transition to + * lazy state. + */ + if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF) { + dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON; + } - /* - * If we are currently lazy-off, and this is the last close, transition to - * lazy state. - */ - if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF && dtrace_opens == 0) { - dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON; + /* + * If we are the last dtrace client, switch back to lazy (from userspace) symbols + */ + if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_KERNEL) { + dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE; + } } - + lck_mtx_unlock(&dtrace_lock); lck_rw_unlock_exclusive(&dtrace_dof_mode_lock); + + /* + * Kext probes may be retained past the end of the kext's lifespan. The + * probes are kept until the last reference to them has been removed. 
+ * Since closing an active dtrace context is likely to drop that last reference, + * lets take a shot at cleaning out the orphaned probes now. + */ + dtrace_module_unloaded(NULL); #endif /* __APPLE__ */ return (0); @@ -18437,8 +18970,254 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv return (0); } - default: - break; + case DTRACEIOC_MODUUIDSLIST: { + size_t module_uuids_list_size; + dtrace_module_uuids_list_t* uuids_list; + uint64_t dtmul_count; + + /* + * Fail if the kernel symbol mode makes this operation illegal. + * Both NEVER & ALWAYS_FROM_KERNEL are permanent states, it is legal to check + * for them without holding the dtrace_lock. + */ + if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER || + dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) { + cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_MODUUIDSLIST", dtrace_kernel_symbol_mode); + return (EPERM); + } + + /* + * Read the number of symbolsdesc structs being passed in. + */ + if (copyin(arg + offsetof(dtrace_module_uuids_list_t, dtmul_count), + &dtmul_count, + sizeof(dtmul_count))) { + cmn_err(CE_WARN, "failed to copyin dtmul_count"); + return (EFAULT); + } + + /* + * Range check the count. More than 2k kexts is probably an error. + */ + if (dtmul_count > 2048) { + cmn_err(CE_WARN, "dtmul_count is not valid"); + return (EINVAL); + } + + /* + * For all queries, we return EINVAL when the user specified + * count does not match the actual number of modules we find + * available. + * + * If the user specified count is zero, then this serves as a + * simple query to count the available modules in need of symbols. 
+ */ + + rval = 0; + + if (dtmul_count == 0) + { + lck_mtx_lock(&mod_lock); + struct modctl* ctl = dtrace_modctl_list; + while (ctl) { + ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl)); + if (!MOD_SYMBOLS_DONE(ctl)) { + dtmul_count++; + rval = EINVAL; + } + ctl = ctl->mod_next; + } + lck_mtx_unlock(&mod_lock); + + if (copyout(&dtmul_count, arg, sizeof (dtmul_count)) != 0) + return (EFAULT); + else + return (rval); + } + + /* + * If we reach this point, then we have a request for full list data. + * Allocate a correctly sized structure and copyin the data. + */ + module_uuids_list_size = DTRACE_MODULE_UUIDS_LIST_SIZE(dtmul_count); + if ((uuids_list = kmem_alloc(module_uuids_list_size, KM_SLEEP)) == NULL) + return (ENOMEM); + + /* NOTE! We can no longer exit this method via return */ + if (copyin(arg, uuids_list, module_uuids_list_size) != 0) { + cmn_err(CE_WARN, "failed copyin of dtrace_module_uuids_list_t"); + rval = EFAULT; + goto moduuidslist_cleanup; + } + + /* + * Check that the count didn't change between the first copyin and the second. + */ + if (uuids_list->dtmul_count != dtmul_count) { + rval = EINVAL; + goto moduuidslist_cleanup; + } + + /* + * Build the list of UUID's that need symbols + */ + lck_mtx_lock(&mod_lock); + + dtmul_count = 0; + + struct modctl* ctl = dtrace_modctl_list; + while (ctl) { + /* + * We assume that userspace symbols will be "better" than kernel level symbols, + * as userspace can search for dSYM(s) and symbol'd binaries. Even if kernel syms + * are available, add user syms if the module might use them. 
+ */ + ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl)); + if (!MOD_SYMBOLS_DONE(ctl)) { + UUID* uuid = &uuids_list->dtmul_uuid[dtmul_count]; + if (dtmul_count++ < uuids_list->dtmul_count) { + memcpy(uuid, ctl->mod_uuid, sizeof(UUID)); + } + } + ctl = ctl->mod_next; + } + + lck_mtx_unlock(&mod_lock); + + if (uuids_list->dtmul_count < dtmul_count) + rval = EINVAL; + + uuids_list->dtmul_count = dtmul_count; + + /* + * Copyout the symbols list (or at least the count!) + */ + if (copyout(uuids_list, arg, module_uuids_list_size) != 0) { + cmn_err(CE_WARN, "failed copyout of dtrace_symbolsdesc_list_t"); + rval = EFAULT; + } + + moduuidslist_cleanup: + /* + * If we had to allocate struct memory, free it. + */ + if (uuids_list != NULL) { + kmem_free(uuids_list, module_uuids_list_size); + } + + return rval; + } + + case DTRACEIOC_PROVMODSYMS: { + size_t module_symbols_size; + dtrace_module_symbols_t* module_symbols; + uint64_t dtmodsyms_count; + + /* + * Fail if the kernel symbol mode makes this operation illegal. + * Both NEVER & ALWAYS_FROM_KERNEL are permanent states, it is legal to check + * for them without holding the dtrace_lock. + */ + if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER || + dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) { + cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_PROVMODSYMS", dtrace_kernel_symbol_mode); + return (EPERM); + } + + /* + * Read the number of module symbols structs being passed in. + */ + if (copyin(arg + offsetof(dtrace_module_symbols_t, dtmodsyms_count), + &dtmodsyms_count, + sizeof(dtmodsyms_count))) { + cmn_err(CE_WARN, "failed to copyin dtmodsyms_count"); + return (EFAULT); + } + + /* + * Range check the count. How much data can we pass around? + * FIX ME! + */ + if (dtmodsyms_count == 0 || (dtmodsyms_count > 100 * 1024)) { + cmn_err(CE_WARN, "dtmodsyms_count is not valid"); + return (EINVAL); + } + + /* + * Allocate a correctly sized structure and copyin the data. 
+ */ + module_symbols_size = DTRACE_MODULE_SYMBOLS_SIZE(dtmodsyms_count); + if ((module_symbols = kmem_alloc(module_symbols_size, KM_SLEEP)) == NULL) + return (ENOMEM); + + rval = 0; + + /* NOTE! We can no longer exit this method via return */ + if (copyin(arg, module_symbols, module_symbols_size) != 0) { + cmn_err(CE_WARN, "failed copyin of dtrace_module_symbols_t, symbol count %llu", module_symbols->dtmodsyms_count); + rval = EFAULT; + goto module_symbols_cleanup; + } + + /* + * Check that the count didn't change between the first copyin and the second. + */ + if (module_symbols->dtmodsyms_count != dtmodsyms_count) { + rval = EINVAL; + goto module_symbols_cleanup; + } + + /* + * Find the modctl to add symbols to. + */ + lck_mtx_lock(&dtrace_provider_lock); + lck_mtx_lock(&mod_lock); + + struct modctl* ctl = dtrace_modctl_list; + while (ctl) { + ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl)); + if (MOD_HAS_UUID(ctl) && !MOD_SYMBOLS_DONE(ctl)) { + if (memcmp(module_symbols->dtmodsyms_uuid, ctl->mod_uuid, sizeof(UUID)) == 0) { + /* BINGO! */ + ctl->mod_user_symbols = module_symbols; + break; + } + } + ctl = ctl->mod_next; + } + + if (ctl) { + dtrace_provider_t *prv; + + /* + * We're going to call each providers per-module provide operation + * specifying only this module. + */ + for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next) + prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl); + + /* + * We gave every provider a chance to provide with the user syms, go ahead and clear them + */ + ctl->mod_user_symbols = NULL; /* MUST reset this to clear HAS_USERSPACE_SYMBOLS */ + } + + lck_mtx_unlock(&mod_lock); + lck_mtx_unlock(&dtrace_provider_lock); + + module_symbols_cleanup: + /* + * If we had to allocate struct memory, free it. 
+ */ + if (module_symbols != NULL) { + kmem_free(module_symbols, module_symbols_size); + } + + return rval; + } + + default: + break; } return (ENOTTY); @@ -18912,12 +19691,14 @@ dtrace_init( void ) lck_mtx_init(&cpu_lock, dtrace_lck_grp, dtrace_lck_attr); lck_mtx_init(&mod_lock, dtrace_lck_grp, dtrace_lck_attr); + dtrace_modctl_list = NULL; + cpu_core = (cpu_core_t *)kmem_zalloc( ncpu * sizeof(cpu_core_t), KM_SLEEP ); for (i = 0; i < ncpu; ++i) { lck_mtx_init(&cpu_core[i].cpuc_pid_lock, dtrace_lck_grp, dtrace_lck_attr); } - cpu_list = (cpu_t *)kmem_zalloc( ncpu * sizeof(cpu_t), KM_SLEEP ); + cpu_list = (dtrace_cpu_t *)kmem_zalloc( ncpu * sizeof(dtrace_cpu_t), KM_SLEEP ); for (i = 0; i < ncpu; ++i) { cpu_list[i].cpu_id = (processorid_t)i; cpu_list[i].cpu_next = &(cpu_list[(i+1) % ncpu]); @@ -18965,6 +19746,14 @@ dtrace_init( void ) break; } + /* + * See dtrace_impl.h for a description of kernel symbol modes. + * The default is to wait for symbols from userspace (lazy symbols). + */ + if (!PE_parse_boot_argn("dtrace_kernel_symbol_mode", &dtrace_kernel_symbol_mode, sizeof (dtrace_kernel_symbol_mode))) { + dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE; + } + gDTraceInited = 1; } else @@ -18974,12 +19763,29 @@ dtrace_init( void ) void dtrace_postinit(void) { - /* - * Called from bsd_init after all provider's *_init() routines have been - * run. That way, anonymous DOF enabled under dtrace_attach() is safe - * to go. - */ - dtrace_attach( (dev_info_t *)(uintptr_t)makedev(gMajDevNo, 0), 0 ); /* Punning a dev_t to a dev_info_t* */ + /* + * Called from bsd_init after all provider's *_init() routines have been + * run. That way, anonymous DOF enabled under dtrace_attach() is safe + * to go. 
+ */ + dtrace_attach( (dev_info_t *)(uintptr_t)makedev(gMajDevNo, 0), 0 ); /* Punning a dev_t to a dev_info_t* */ + + /* + * Add the mach_kernel to the module list for lazy processing + */ + struct kmod_info fake_kernel_kmod; + memset(&fake_kernel_kmod, 0, sizeof(fake_kernel_kmod)); + + strlcpy(fake_kernel_kmod.name, "mach_kernel", sizeof(fake_kernel_kmod.name)); + fake_kernel_kmod.id = 1; + fake_kernel_kmod.address = g_kernel_kmod_info.address; + fake_kernel_kmod.size = g_kernel_kmod_info.size; + + if (dtrace_module_loaded(&fake_kernel_kmod) != 0) { + printf("dtrace_postinit: Could not register mach_kernel modctl\n"); + } + + (void)OSKextRegisterKextsWithDTrace(); } #undef DTRACE_MAJOR diff --git a/bsd/dev/dtrace/dtrace_glue.c b/bsd/dev/dtrace/dtrace_glue.c index 6d4586e2c..a046e3eac 100644 --- a/bsd/dev/dtrace/dtrace_glue.c +++ b/bsd/dev/dtrace/dtrace_glue.c @@ -227,7 +227,7 @@ uwrite(proc_t *p, void *buf, user_size_t len, user_addr_t a) lck_mtx_t cpu_lock; lck_mtx_t mod_lock; -cpu_t *cpu_list; +dtrace_cpu_t *cpu_list; cpu_core_t *cpu_core; /* XXX TLB lockdown? 
*/ /* @@ -267,41 +267,38 @@ PRIV_POLICY_ONLY(void *cr, int priv, int boolean) return kauth_cred_issuser(cr); /* XXX TODO: HAS_PRIVILEGE(cr, priv); */ } +/* XXX Get around const poisoning using structure assigns */ gid_t -crgetgid(const cred_t *cr) { return cr->cr_groups[0]; } +crgetgid(const cred_t *cr) { cred_t copy_cr = *cr; return kauth_cred_getgid(©_cr); } uid_t -crgetuid(const cred_t *cr) { return cr->cr_uid; } +crgetuid(const cred_t *cr) { cred_t copy_cr = *cr; return kauth_cred_getuid(©_cr); } /* * "cyclic" */ /* osfmk/kern/timer_call.h */ -typedef void *call_entry_param_t; -typedef void (*call_entry_func_t)( - call_entry_param_t param0, - call_entry_param_t param1); - -typedef struct call_entry { - queue_chain_t q_link; - call_entry_func_t func; - call_entry_param_t param0; - call_entry_param_t param1; - uint64_t deadline; - enum { - IDLE, - PENDING, - DELAYED } state; -} call_entry_data_t; - - -typedef struct call_entry *timer_call_t; typedef void *timer_call_param_t; typedef void (*timer_call_func_t)( timer_call_param_t param0, timer_call_param_t param1); +typedef struct timer_call { + queue_chain_t q_link; + queue_t queue; + timer_call_func_t func; + timer_call_param_t param0; + timer_call_param_t param1; + decl_simple_lock_data(,lock); + uint64_t deadline; + uint64_t soft_deadline; + uint32_t flags; + boolean_t async_dequeue; +} timer_call_data_t; + +typedef struct timer_call *timer_call_t; + extern void timer_call_setup( timer_call_t call, @@ -312,7 +309,13 @@ extern boolean_t timer_call_enter1( timer_call_t call, timer_call_param_t param1, - uint64_t deadline); + uint64_t deadline, + uint32_t flags); + +#ifndef TIMER_CALL_CRITICAL +#define TIMER_CALL_CRITICAL 0x1 +#define TIMER_CALL_LOCAL 0x2 +#endif /* TIMER_CALL_CRITICAL */ extern boolean_t timer_call_cancel( @@ -322,7 +325,7 @@ typedef struct wrap_timer_call { cyc_handler_t hdlr; cyc_time_t when; uint64_t deadline; - struct call_entry call; + struct timer_call call; } wrap_timer_call_t; #define 
WAKEUP_REAPER 0x7FFFFFFFFFFFFFFFLL @@ -337,7 +340,7 @@ _timer_call_apply_cyclic( void *ignore, void *vTChdl ) (*(wrapTC->hdlr.cyh_func))( wrapTC->hdlr.cyh_arg ); clock_deadline_for_periodic_event( wrapTC->when.cyt_interval, mach_absolute_time(), &(wrapTC->deadline) ); - timer_call_enter1( &(wrapTC->call), (void *)wrapTC, wrapTC->deadline ); + timer_call_enter1( &(wrapTC->call), (void *)wrapTC, wrapTC->deadline, TIMER_CALL_CRITICAL | TIMER_CALL_LOCAL ); /* Did timer_call_remove_cyclic request a wakeup call when this timer call was re-armed? */ if (wrapTC->when.cyt_interval == WAKEUP_REAPER) @@ -359,7 +362,7 @@ timer_call_add_cyclic(wrap_timer_call_t *wrapTC, cyc_handler_t *handler, cyc_tim wrapTC->deadline = now; clock_deadline_for_periodic_event( wrapTC->when.cyt_interval, now, &(wrapTC->deadline) ); - timer_call_enter1( &(wrapTC->call), (void *)wrapTC, wrapTC->deadline ); + timer_call_enter1( &(wrapTC->call), (void *)wrapTC, wrapTC->deadline, TIMER_CALL_CRITICAL | TIMER_CALL_LOCAL ); return (cyclic_id_t)wrapTC; } diff --git a/bsd/dev/dtrace/dtrace_subr.c b/bsd/dev/dtrace/dtrace_subr.c index 3d8e65309..c3a69c48f 100644 --- a/bsd/dev/dtrace/dtrace_subr.c +++ b/bsd/dev/dtrace/dtrace_subr.c @@ -49,11 +49,14 @@ int (*dtrace_fasttrap_probe_ptr)(struct regs *); * They're assigned in dtrace.c but Darwin never calls them. 
*/ void (*dtrace_cpu_init)(processorid_t); +#if !defined(__APPLE__) void (*dtrace_modload)(struct modctl *); void (*dtrace_modunload)(struct modctl *); -#if defined(__APPLE__) +#else +int (*dtrace_modload)(struct kmod_info *); +int (*dtrace_modunload)(struct kmod_info *); void (*dtrace_helpers_cleanup)(proc_t *); -#endif +#endif /*__APPLE__*/ void (*dtrace_helpers_fork)(proc_t *, proc_t *); void (*dtrace_cpustart_init)(void); void (*dtrace_cpustart_fini)(void); diff --git a/bsd/dev/dtrace/fasttrap.c b/bsd/dev/dtrace/fasttrap.c index 814778290..f75e9df72 100644 --- a/bsd/dev/dtrace/fasttrap.c +++ b/bsd/dev/dtrace/fasttrap.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -961,7 +961,7 @@ fasttrap_disable_callbacks(void) ASSERT(fasttrap_pid_count > 0); fasttrap_pid_count--; if (fasttrap_pid_count == 0) { - cpu_t *cur, *cpu = CPU; + dtrace_cpu_t *cur, *cpu = CPU; /* * APPLE NOTE: This loop seems broken, it touches every CPU @@ -987,7 +987,7 @@ fasttrap_disable_callbacks(void) } /*ARGSUSED*/ -static void +static int fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) { #pragma unused(arg, id) @@ -1016,7 +1016,7 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) * provider can't go away while we're in this code path. */ if (probe->ftp_prov->ftp_retired) - return; + return(0); /* * If we can't find the process, it may be that we're in the context of @@ -1030,11 +1030,11 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) * does not return process's with SIDL set, but we always return * the child process. */ - return; + return(0); #else if ((curproc->p_flag & SFORKING) == 0) - return; + return(0); lck_mtx_lock(&pidlock); p = prfind(probe->ftp_pid); @@ -1109,7 +1109,7 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) * drop our reference on the trap table entry. 
*/ fasttrap_disable_callbacks(); - return; + return(0); } } @@ -1117,6 +1117,7 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) sprunlock(p); probe->ftp_enabled = 1; + return (0); } /*ARGSUSED*/ @@ -2155,9 +2156,6 @@ fasttrap_meta_create_probe(void *arg, void *parg, * Both 32 & 64 bit want to go back one byte, to point at the first NOP */ tp->ftt_pc = dhpb->dthpb_base + (int64_t)dhpb->dthpb_offs[i] - 1; -#elif defined(__ppc__) - /* All PPC probes are zero offset. */ - tp->ftt_pc = dhpb->dthpb_base + (int64_t)dhpb->dthpb_offs[i]; #else #error "Architecture not supported" #endif @@ -2199,9 +2197,6 @@ fasttrap_meta_create_probe(void *arg, void *parg, * Both 32 & 64 bit want to go forward two bytes, to point at a single byte nop. */ tp->ftt_pc = dhpb->dthpb_base + (int64_t)dhpb->dthpb_enoffs[j] + 2; -#elif defined(__ppc__) - /* All PPC is-enabled probes are zero offset. */ - tp->ftt_pc = dhpb->dthpb_base + (int64_t)dhpb->dthpb_enoffs[j]; #else #error "Architecture not supported" #endif @@ -2294,7 +2289,8 @@ fasttrap_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int * probe = kmem_alloc(size, KM_SLEEP); - if (copyin(arg, probe, size) != 0) { + if (copyin(arg, probe, size) != 0 || + probe->ftps_noffs != noffs) { kmem_free(probe, size); return (EFAULT); } diff --git a/bsd/dev/dtrace/fbt.c b/bsd/dev/dtrace/fbt.c index 94e15da00..5a6570ed1 100644 --- a/bsd/dev/dtrace/fbt.c +++ b/bsd/dev/dtrace/fbt.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -52,16 +52,10 @@ /* #include */ struct savearea_t; /* Used anonymously */ -typedef kern_return_t (*perfCallback)(int, struct savearea_t *, int, int); +typedef kern_return_t (*perfCallback)(int, struct savearea_t *, uintptr_t *, int); -#if defined (__ppc__) || defined (__ppc64__) -extern perfCallback tempDTraceTrapHook, tempDTraceIntHook; -extern kern_return_t fbt_perfCallback(int, struct savearea_t *, int, int); -extern kern_return_t fbt_perfIntCallback(int, struct savearea_t *, int, int); -#else extern perfCallback tempDTraceTrapHook; -extern kern_return_t fbt_perfCallback(int, struct savearea_t *, int, int); -#endif +extern kern_return_t fbt_perfCallback(int, struct savearea_t *, uintptr_t *); #define FBT_ADDR2NDX(addr) ((((uintptr_t)(addr)) >> 4) & fbt_probetab_mask) #define FBT_PROBETAB_SIZE 0x8000 /* 32k entries -- 128K total */ @@ -111,25 +105,42 @@ fbt_destroy(void *arg, dtrace_id_t id, void *parg) } /*ARGSUSED*/ -static void +int fbt_enable(void *arg, dtrace_id_t id, void *parg) { #pragma unused(arg,id) fbt_probe_t *fbt = parg; - struct modctl *ctl = fbt->fbtp_ctl; + struct modctl *ctl = NULL; + + for (; fbt != NULL; fbt = fbt->fbtp_next) { -#if defined (__ppc__) || defined (__ppc64__) - dtrace_casptr(&tempDTraceIntHook, NULL, fbt_perfIntCallback); - if (tempDTraceIntHook != (perfCallback)fbt_perfIntCallback) { + ctl = fbt->fbtp_ctl; + + if (!ctl->mod_loaded) { if (fbt_verbose) { - cmn_err(CE_NOTE, "fbt_enable is failing for probe %s " - "in module %s: tempDTraceIntHook already occupied.", + cmn_err(CE_NOTE, "fbt is failing for probe %s " + "(module %s unloaded)", fbt->fbtp_name, ctl->mod_modname); } - return; + + continue; } -#endif - + + /* + * Now check that our modctl has the expected load count. If it + * doesn't, this module must have been unloaded and reloaded -- and + * we're not going to touch it. 
+ */ + if (ctl->mod_loadcnt != fbt->fbtp_loadcnt) { + if (fbt_verbose) { + cmn_err(CE_NOTE, "fbt is failing for probe %s " + "(module %s reloaded)", + fbt->fbtp_name, ctl->mod_modname); + } + + continue; + } + dtrace_casptr(&tempDTraceTrapHook, NULL, fbt_perfCallback); if (tempDTraceTrapHook != (perfCallback)fbt_perfCallback) { if (fbt_verbose) { @@ -137,14 +148,21 @@ fbt_enable(void *arg, dtrace_id_t id, void *parg) "in module %s: tempDTraceTrapHook already occupied.", fbt->fbtp_name, ctl->mod_modname); } - return; + continue; } - for (; fbt != NULL; fbt = fbt->fbtp_next) + if (fbt->fbtp_currentval != fbt->fbtp_patchval) { (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_patchval, (vm_offset_t)fbt->fbtp_patchpoint, sizeof(fbt->fbtp_patchval)); - - dtrace_membar_consumer(); + fbt->fbtp_currentval = fbt->fbtp_patchval; + ctl->mod_nenabled++; + } + + } + + dtrace_membar_consumer(); + + return (0); } /*ARGSUSED*/ @@ -153,11 +171,22 @@ fbt_disable(void *arg, dtrace_id_t id, void *parg) { #pragma unused(arg,id) fbt_probe_t *fbt = parg; + struct modctl *ctl = NULL; + + for (; fbt != NULL; fbt = fbt->fbtp_next) { + ctl = fbt->fbtp_ctl; + + if (!ctl->mod_loaded || (ctl->mod_loadcnt != fbt->fbtp_loadcnt)) + continue; - for (; fbt != NULL; fbt = fbt->fbtp_next) + if (fbt->fbtp_currentval != fbt->fbtp_savedval) { (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_savedval, (vm_offset_t)fbt->fbtp_patchpoint, sizeof(fbt->fbtp_savedval)); - + fbt->fbtp_currentval = fbt->fbtp_savedval; + ASSERT(ctl->mod_nenabled > 0); + ctl->mod_nenabled--; + } + } dtrace_membar_consumer(); } @@ -167,11 +196,20 @@ fbt_suspend(void *arg, dtrace_id_t id, void *parg) { #pragma unused(arg,id) fbt_probe_t *fbt = parg; + struct modctl *ctl = NULL; - for (; fbt != NULL; fbt = fbt->fbtp_next) - (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_savedval, (vm_offset_t)fbt->fbtp_patchpoint, + for (; fbt != NULL; fbt = fbt->fbtp_next) { + ctl = fbt->fbtp_ctl; + + ASSERT(ctl->mod_nenabled > 0); + if (!ctl->mod_loaded 
|| (ctl->mod_loadcnt != fbt->fbtp_loadcnt)) + continue; + + (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_savedval, (vm_offset_t)fbt->fbtp_patchpoint, sizeof(fbt->fbtp_savedval)); - + fbt->fbtp_currentval = fbt->fbtp_savedval; + } + dtrace_membar_consumer(); } @@ -181,34 +219,30 @@ fbt_resume(void *arg, dtrace_id_t id, void *parg) { #pragma unused(arg,id) fbt_probe_t *fbt = parg; - struct modctl *ctl = fbt->fbtp_ctl; + struct modctl *ctl = NULL; -#if defined (__ppc__) || defined (__ppc64__) - dtrace_casptr(&tempDTraceIntHook, NULL, fbt_perfIntCallback); - if (tempDTraceIntHook != (perfCallback)fbt_perfIntCallback) { - if (fbt_verbose) { - cmn_err(CE_NOTE, "fbt_enable is failing for probe %s " - "in module %s: tempDTraceIntHook already occupied.", - fbt->fbtp_name, ctl->mod_modname); - } - return; - } -#endif + for (; fbt != NULL; fbt = fbt->fbtp_next) { + ctl = fbt->fbtp_ctl; + + ASSERT(ctl->mod_nenabled > 0); + if (!ctl->mod_loaded || (ctl->mod_loadcnt != fbt->fbtp_loadcnt)) + continue; - dtrace_casptr(&tempDTraceTrapHook, NULL, fbt_perfCallback); - if (tempDTraceTrapHook != (perfCallback)fbt_perfCallback) { + dtrace_casptr(&tempDTraceTrapHook, NULL, fbt_perfCallback); + if (tempDTraceTrapHook != (perfCallback)fbt_perfCallback) { if (fbt_verbose) { cmn_err(CE_NOTE, "fbt_resume is failing for probe %s " "in module %s: tempDTraceTrapHook already occupied.", fbt->fbtp_name, ctl->mod_modname); } return; - } + } - for (; fbt != NULL; fbt = fbt->fbtp_next) - (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_patchval, (vm_offset_t)fbt->fbtp_patchpoint, + (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_patchval, (vm_offset_t)fbt->fbtp_patchpoint, sizeof(fbt->fbtp_patchval)); - + fbt->fbtp_currentval = fbt->fbtp_patchval; + } + dtrace_membar_consumer(); } @@ -422,8 +456,8 @@ static struct cdevsw fbt_cdevsw = 0 /* type */ }; -static int gDisableFBT = 0; -struct modctl g_fbt_kernctl; +int gIgnoreFBTBlacklist = 0; +static int gFBTInited = 0; #undef kmem_alloc /* from its binding 
to dt_kmem_alloc glue */ #undef kmem_free /* from its binding to dt_kmem_free glue */ #include @@ -431,66 +465,22 @@ struct modctl g_fbt_kernctl; void fbt_init( void ) { - - PE_parse_boot_argn("DisableFBT", &gDisableFBT, sizeof (gDisableFBT)); - - if (0 == gDisableFBT) + if (0 == gFBTInited) { int majdevno = cdevsw_add(FBT_MAJOR, &fbt_cdevsw); - unsigned long size = 0, header_size, round_size; - kern_return_t ret; - void *p, *q; if (majdevno < 0) { printf("fbt_init: failed to allocate a major number!\n"); return; } - - /* - * Capture the kernel's mach_header in its entirety and the contents of - * its LINKEDIT segment (and only that segment). This is sufficient to - * build all the fbt probes lazily the first time a client looks to - * the fbt provider. Remeber these on the global struct modctl g_fbt_kernctl. - */ - header_size = sizeof(kernel_mach_header_t) + _mh_execute_header.sizeofcmds; - p = getsegdatafromheader(&_mh_execute_header, SEG_LINKEDIT, &size); - - round_size = round_page(header_size + size); - /* "q" will accomodate copied kernel_mach_header_t, its load commands, and LINKEIT segment. 
*/ - ret = kmem_alloc_pageable(kernel_map, (vm_offset_t *)&q, round_size); - - if (p && (ret == KERN_SUCCESS)) { - kernel_segment_command_t *sgp; - - bcopy( (void *)&_mh_execute_header, q, header_size); - bcopy( p, (char *)q + header_size, size); - - sgp = getsegbynamefromheader(q, SEG_LINKEDIT); - - if (sgp) { - sgp->vmaddr = (uintptr_t)((char *)q + header_size); - g_fbt_kernctl.address = (vm_address_t)q; - g_fbt_kernctl.size = header_size + size; - } else { - kmem_free(kernel_map, (vm_offset_t)q, round_size); - g_fbt_kernctl.address = (vm_address_t)NULL; - g_fbt_kernctl.size = 0; - } - } else { - if (ret == KERN_SUCCESS) - kmem_free(kernel_map, (vm_offset_t)q, round_size); - g_fbt_kernctl.address = (vm_address_t)NULL; - g_fbt_kernctl.size = 0; - } - - strncpy((char *)&(g_fbt_kernctl.mod_modname), "mach_kernel", KMOD_MAX_NAME); - ((char *)&(g_fbt_kernctl.mod_modname))[KMOD_MAX_NAME -1] = '\0'; + + PE_parse_boot_argn("IgnoreFBTBlacklist", &gIgnoreFBTBlacklist, sizeof (gIgnoreFBTBlacklist)); fbt_attach( (dev_info_t *)(uintptr_t)majdevno, DDI_ATTACH ); - - gDisableFBT = 1; /* Ensure this initialization occurs just one time. */ + + gFBTInited = 1; /* Ensure this initialization occurs just one time. */ } else - printf("fbt_init: DisableFBT non-zero, no FBT probes will be provided.\n"); + panic("fbt_init: called twice!\n"); } #undef FBT_MAJOR diff --git a/bsd/dev/dtrace/lockstat.c b/bsd/dev/dtrace/lockstat.c index 0f9d6d4ff..a9f003e65 100644 --- a/bsd/dev/dtrace/lockstat.c +++ b/bsd/dev/dtrace/lockstat.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -59,10 +59,6 @@ #define NOP 0x90 #define RET 0xc3 #define LOCKSTAT_AFRAMES 1 -#elif __ppc__ -#define NOP 0x60000000 -#define RET 0x4e800020 /* blr */ -#define LOCKSTAT_AFRAMES 2 #else #error "not ported to this architecture" #endif @@ -188,11 +184,6 @@ void lockstat_hot_patch(boolean_t active) instr = (active ? NOP : RET ); (void) ml_nofault_copy( (vm_offset_t)&instr, *(assembly_probes[i]), sizeof(instr)); -#endif -#ifdef __ppc__ - uint32_t instr; - instr = (active ? NOP : RET ); - (void) ml_nofault_copy( (vm_offset_t)&instr, *(assembly_probes[i]), sizeof(instr)); #endif } } @@ -206,7 +197,7 @@ static dev_info_t *lockstat_devi; /* saved in xxattach() for xxinfo() */ static dtrace_provider_id_t lockstat_id; /*ARGSUSED*/ -static void +static int lockstat_enable(void *arg, dtrace_id_t id, void *parg) { #pragma unused(arg) /* __APPLE__ */ @@ -220,6 +211,7 @@ lockstat_enable(void *arg, dtrace_id_t id, void *parg) lockstat_hot_patch(TRUE); membar_producer(); + return(0); } diff --git a/bsd/dev/dtrace/profile_prvd.c b/bsd/dev/dtrace/profile_prvd.c index a74254c5c..69f3aadd5 100644 --- a/bsd/dev/dtrace/profile_prvd.c +++ b/bsd/dev/dtrace/profile_prvd.c @@ -49,6 +49,7 @@ #define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from mach/ppc/thread_status.h */ #include #include +#include #include #include @@ -65,9 +66,9 @@ #include -#if defined(__ppc__) || defined(__ppc64__) -extern struct savearea *find_kern_regs(thread_t); -#elif defined(__i386__) || defined(__x86_64__) +#include + +#if defined(__i386__) || defined(__x86_64__) extern x86_saved_state_t *find_kern_regs(thread_t); #else #error Unknown architecture @@ -127,9 +128,7 @@ static dtrace_provider_id_t profile_id; #else /* is Mac OS X */ -#if defined(__ppc__) || defined(__ppc64__) -#define PROF_ARTIFICIAL_FRAMES 8 -#elif defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) #define PROF_ARTIFICIAL_FRAMES 9 #else #error Unknown architecture @@ -185,7 +184,6 @@ static 
int profile_ticks[] = { static uint32_t profile_max; /* maximum number of profile probes */ static uint32_t profile_total; /* current number of profile probes */ - static void profile_fire(void *arg) { @@ -200,22 +198,7 @@ profile_fire(void *arg) dtrace_probe(prof->prof_id, CPU->cpu_profile_pc, CPU->cpu_profile_upc, late, 0, 0); #else -#if defined(__ppc__) || defined(__ppc64__) - { - struct savearea *sv = find_kern_regs(current_thread()); - - if (sv) { - if (USERMODE(sv->save_srr1)) { - dtrace_probe(prof->prof_id, 0x0, sv->save_srr0, late, 0, 0); - } else { - dtrace_probe(prof->prof_id, sv->save_srr0, 0x0, late, 0, 0); - } - } else { - dtrace_probe(prof->prof_id, 0xcafebabe, - 0x0, late, 0, 0); /* XXX_BOGUS also see profile_usermode() below. */ - } - } -#elif defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) x86_saved_state_t *kern_regs = find_kern_regs(current_thread()); if (NULL != kern_regs) { @@ -228,6 +211,7 @@ profile_fire(void *arg) #error Unknown arch #endif } else { + pal_register_cache_state(current_thread(), VALID); /* Possibly a user interrupt */ x86_saved_state_t *tagged_regs = (x86_saved_state_t *)find_user_regs(current_thread()); @@ -260,22 +244,7 @@ profile_tick(void *arg) dtrace_probe(prof->prof_id, CPU->cpu_profile_pc, CPU->cpu_profile_upc, 0, 0, 0); #else -#if defined(__ppc__) || defined(__ppc64__) - { - struct savearea *sv = find_kern_regs(current_thread()); - - if (sv) { - if (USERMODE(sv->save_srr1)) { - dtrace_probe(prof->prof_id, 0x0, sv->save_srr0, 0, 0, 0); - } else { - dtrace_probe(prof->prof_id, sv->save_srr0, 0x0, 0, 0, 0); - } - } else { - dtrace_probe(prof->prof_id, 0xcafebabe, - 0x0, 0, 0, 0); /* XXX_BOGUS also see profile_usermode() below. 
*/ - } - } -#elif defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) x86_saved_state_t *kern_regs = find_kern_regs(current_thread()); if (NULL != kern_regs) { @@ -288,6 +257,7 @@ profile_tick(void *arg) #error Unknown arch #endif } else { + pal_register_cache_state(current_thread(), VALID); /* Possibly a user interrupt */ x86_saved_state_t *tagged_regs = (x86_saved_state_t *)find_user_regs(current_thread()); @@ -550,7 +520,7 @@ profile_destroy(void *arg, dtrace_id_t id, void *parg) /*ARGSUSED*/ static void -profile_online(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when) +profile_online(void *arg, dtrace_cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when) { #pragma unused(cpu) /* __APPLE__ */ profile_probe_t *prof = arg; @@ -580,7 +550,7 @@ profile_online(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when) /*ARGSUSED*/ static void -profile_offline(void *arg, cpu_t *cpu, void *oarg) +profile_offline(void *arg, dtrace_cpu_t *cpu, void *oarg) { profile_probe_percpu_t *pcpu = oarg; @@ -593,7 +563,7 @@ profile_offline(void *arg, cpu_t *cpu, void *oarg) } /*ARGSUSED*/ -static void +static int profile_enable(void *arg, dtrace_id_t id, void *parg) { #pragma unused(arg,id) /* __APPLE__ */ @@ -636,6 +606,7 @@ profile_enable(void *arg, dtrace_id_t id, void *parg) prof->prof_cyclic = (cyclic_id_t)cyclic_add_omni(&omni); /* cast puns cyclic_id_list_t with cyclic_id_t */ } #endif /* __APPLE__ */ + return(0); } /*ARGSUSED*/ diff --git a/bsd/dev/dtrace/sdt.c b/bsd/dev/dtrace/sdt.c index 725ab5585..bca167f01 100644 --- a/bsd/dev/dtrace/sdt.c +++ b/bsd/dev/dtrace/sdt.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -46,18 +46,12 @@ #include #include +extern int dtrace_kernel_symbol_mode; struct savearea_t; /* Used anonymously */ -typedef kern_return_t (*perfCallback)(int, struct savearea_t *, int, int); +typedef kern_return_t (*perfCallback)(int, struct savearea_t *, uintptr_t *, int); -#if defined (__ppc__) || defined (__ppc64__) -extern perfCallback tempDTraceTrapHook, tempDTraceIntHook; -extern kern_return_t fbt_perfCallback(int, struct savearea_t *, int, int); -extern kern_return_t fbt_perfIntCallback(int, struct savearea_t *, int, int); - -#define SDT_PATCHVAL 0x7c810808 -#define SDT_AFRAMES 6 -#elif defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) extern perfCallback tempDTraceTrapHook; extern kern_return_t fbt_perfCallback(int, struct savearea_t *, int, int); @@ -86,7 +80,7 @@ static void __sdt_provide_module(void *arg, struct modctl *ctl) { #pragma unused(arg) - struct module *mp = (struct module *)ctl->address; + struct module *mp = (struct module *)ctl->mod_address; char *modname = ctl->mod_modname; sdt_probedesc_t *sdpd; sdt_probe_t *sdp, *old; @@ -220,14 +214,13 @@ sdt_destroy(void *arg, dtrace_id_t id, void *parg) } /*ARGSUSED*/ -static void +static int sdt_enable(void *arg, dtrace_id_t id, void *parg) { #pragma unused(arg,id) sdt_probe_t *sdp = parg; struct modctl *ctl = sdp->sdp_ctl; -#if !defined(__APPLE__) ctl->mod_nenabled++; /* @@ -256,20 +249,7 @@ sdt_enable(void *arg, dtrace_id_t id, void *parg) } goto err; } -#endif /* __APPLE__ */ -#if defined (__ppc__) || defined (__ppc64__) - dtrace_casptr(&tempDTraceIntHook, NULL, fbt_perfIntCallback); - if (tempDTraceIntHook != (perfCallback)fbt_perfIntCallback) { - if (sdt_verbose) { - cmn_err(CE_NOTE, "sdt_enable is failing for probe %s " - "in module %s: tempDTraceIntHook already occupied.", - sdp->sdp_name, ctl->mod_modname); - } - return; - } -#endif - dtrace_casptr(&tempDTraceTrapHook, NULL, fbt_perfCallback); if (tempDTraceTrapHook != (perfCallback)fbt_perfCallback) 
{ if (sdt_verbose) { @@ -277,7 +257,7 @@ sdt_enable(void *arg, dtrace_id_t id, void *parg) "in module %s: tempDTraceTrapHook already occupied.", sdp->sdp_name, ctl->mod_modname); } - return; + return (0); } while (sdp != NULL) { @@ -285,10 +265,9 @@ sdt_enable(void *arg, dtrace_id_t id, void *parg) (vm_size_t)sizeof(sdp->sdp_patchval)); sdp = sdp->sdp_next; } -#if !defined(__APPLE__) + err: -#endif /* __APPLE__ */ - ; + return (0); } /*ARGSUSED*/ @@ -297,14 +276,12 @@ sdt_disable(void *arg, dtrace_id_t id, void *parg) { #pragma unused(arg,id) sdt_probe_t *sdp = parg; -#if !defined(__APPLE__) struct modctl *ctl = sdp->sdp_ctl; ctl->mod_nenabled--; if (!ctl->mod_loaded || ctl->mod_loadcnt != sdp->sdp_loadcnt) goto err; -#endif /* __APPLE__ */ while (sdp != NULL) { (void)ml_nofault_copy( (vm_offset_t)&sdp->sdp_savedval, (vm_offset_t)sdp->sdp_patchpoint, @@ -312,19 +289,10 @@ sdt_disable(void *arg, dtrace_id_t id, void *parg) sdp = sdp->sdp_next; } -#if !defined(__APPLE__) err: -#endif /* __APPLE__ */ ; } -static uint64_t -sdt_getarg(void *arg, dtrace_id_t id, void *parg, int argno, int aframes) -{ -#pragma unused(arg,id,parg) /* __APPLE__ */ - return dtrace_getarg(argno, aframes); -} - static dtrace_pops_t sdt_pops = { NULL, sdt_provide_module, @@ -561,107 +529,116 @@ void sdt_init( void ) } if (KERNEL_MAGIC != _mh_execute_header.magic) { - g_sdt_kernctl.address = (vm_address_t)NULL; - g_sdt_kernctl.size = 0; + g_sdt_kernctl.mod_address = (vm_address_t)NULL; + g_sdt_kernctl.mod_size = 0; } else { - kernel_mach_header_t *mh; - struct load_command *cmd; - kernel_segment_command_t *orig_ts = NULL, *orig_le = NULL; - struct symtab_command *orig_st = NULL; - kernel_nlist_t *sym = NULL; - char *strings; - unsigned int i; - - g_sdt_mach_module.sdt_nprobes = 0; - g_sdt_mach_module.sdt_probes = NULL; - - g_sdt_kernctl.address = (vm_address_t)&g_sdt_mach_module; - g_sdt_kernctl.size = 0; - strncpy((char *)&(g_sdt_kernctl.mod_modname), "mach_kernel", KMOD_MAX_NAME); - - mh = 
&_mh_execute_header; - cmd = (struct load_command*) &mh[1]; - for (i = 0; i < mh->ncmds; i++) { - if (cmd->cmd == LC_SEGMENT_KERNEL) { - kernel_segment_command_t *orig_sg = (kernel_segment_command_t *) cmd; - - if (LIT_STRNEQL(orig_sg->segname, SEG_TEXT)) - orig_ts = orig_sg; - else if (LIT_STRNEQL(orig_sg->segname, SEG_LINKEDIT)) - orig_le = orig_sg; - else if (LIT_STRNEQL(orig_sg->segname, "")) - orig_ts = orig_sg; /* kexts have a single unnamed segment */ - } - else if (cmd->cmd == LC_SYMTAB) - orig_st = (struct symtab_command *) cmd; - - cmd = (struct load_command *) ((uintptr_t) cmd + cmd->cmdsize); - } - - if ((orig_ts == NULL) || (orig_st == NULL) || (orig_le == NULL)) - return; - - sym = (kernel_nlist_t *)(orig_le->vmaddr + orig_st->symoff - orig_le->fileoff); - strings = (char *)(orig_le->vmaddr + orig_st->stroff - orig_le->fileoff); - - for (i = 0; i < orig_st->nsyms; i++) { - uint8_t n_type = sym[i].n_type & (N_TYPE | N_EXT); - char *name = strings + sym[i].n_un.n_strx; + kernel_mach_header_t *mh; + struct load_command *cmd; + kernel_segment_command_t *orig_ts = NULL, *orig_le = NULL; + struct symtab_command *orig_st = NULL; + kernel_nlist_t *sym = NULL; + char *strings; + unsigned int i; + + g_sdt_mach_module.sdt_nprobes = 0; + g_sdt_mach_module.sdt_probes = NULL; + + g_sdt_kernctl.mod_address = (vm_address_t)&g_sdt_mach_module; + g_sdt_kernctl.mod_size = 0; + strncpy((char *)&(g_sdt_kernctl.mod_modname), "mach_kernel", KMOD_MAX_NAME); + + g_sdt_kernctl.mod_next = NULL; + g_sdt_kernctl.mod_stale = NULL; + g_sdt_kernctl.mod_id = 0; + g_sdt_kernctl.mod_loadcnt = 1; + g_sdt_kernctl.mod_loaded = 1; + g_sdt_kernctl.mod_flags = 0; + g_sdt_kernctl.mod_nenabled = 0; + + mh = &_mh_execute_header; + cmd = (struct load_command*) &mh[1]; + for (i = 0; i < mh->ncmds; i++) { + if (cmd->cmd == LC_SEGMENT_KERNEL) { + kernel_segment_command_t *orig_sg = (kernel_segment_command_t *) cmd; + + if (LIT_STRNEQL(orig_sg->segname, SEG_TEXT)) + orig_ts = orig_sg; + else if 
(LIT_STRNEQL(orig_sg->segname, SEG_LINKEDIT)) + orig_le = orig_sg; + else if (LIT_STRNEQL(orig_sg->segname, "")) + orig_ts = orig_sg; /* kexts have a single unnamed segment */ + } + else if (cmd->cmd == LC_SYMTAB) + orig_st = (struct symtab_command *) cmd; + + cmd = (struct load_command *) ((uintptr_t) cmd + cmd->cmdsize); + } + + if ((orig_ts == NULL) || (orig_st == NULL) || (orig_le == NULL)) + return; + + sym = (kernel_nlist_t *)(orig_le->vmaddr + orig_st->symoff - orig_le->fileoff); + strings = (char *)(orig_le->vmaddr + orig_st->stroff - orig_le->fileoff); + + for (i = 0; i < orig_st->nsyms; i++) { + uint8_t n_type = sym[i].n_type & (N_TYPE | N_EXT); + char *name = strings + sym[i].n_un.n_strx; const char *prev_name; unsigned long best; unsigned int j; - - /* Check that the symbol is a global and that it has a name. */ - if (((N_SECT | N_EXT) != n_type && (N_ABS | N_EXT) != n_type)) - continue; - - if (0 == sym[i].n_un.n_strx) /* iff a null, "", name. */ - continue; - - /* Lop off omnipresent leading underscore. */ - if (*name == '_') - name += 1; - - if (strstr(name, DTRACE_PROBE_PREFIX)) { + + /* Check that the symbol is a global and that it has a name. */ + if (((N_SECT | N_EXT) != n_type && (N_ABS | N_EXT) != n_type)) + continue; + + if (0 == sym[i].n_un.n_strx) /* iff a null, "", name. */ + continue; + + /* Lop off omnipresent leading underscore. */ + if (*name == '_') + name += 1; + + if (strncmp(name, DTRACE_PROBE_PREFIX, sizeof(DTRACE_PROBE_PREFIX) - 1) == 0) { sdt_probedesc_t *sdpd = kmem_alloc(sizeof(sdt_probedesc_t), KM_SLEEP); int len = strlen(name) + 1; - + sdpd->sdpd_name = kmem_alloc(len, KM_SLEEP); strncpy(sdpd->sdpd_name, name, len); /* NUL termination is ensured. */ - + prev_name = ""; best = 0; - /* Avoid shadow build warnings */ + /* + * Find the symbol immediately preceding the sdt probe site just discovered, + * that symbol names the function containing the sdt probe. 
+ */ for (j = 0; j < orig_st->nsyms; j++) { uint8_t jn_type = sym[j].n_type & (N_TYPE | N_EXT); char *jname = strings + sym[j].n_un.n_strx; - + if (((N_SECT | N_EXT) != jn_type && (N_ABS | N_EXT) != jn_type)) continue; - + if (0 == sym[j].n_un.n_strx) /* iff a null, "", name. */ continue; - + if (*jname == '_') jname += 1; - if (strstr(jname, DTRACE_PROBE_PREFIX)) - continue; - + if (*(unsigned long *)sym[i].n_value <= (unsigned long)sym[j].n_value) continue; - + if ((unsigned long)sym[j].n_value > best) { best = (unsigned long)sym[j].n_value; prev_name = jname; } } - + sdpd->sdpd_func = kmem_alloc((len = strlen(prev_name) + 1), KM_SLEEP); strncpy(sdpd->sdpd_func, prev_name, len); /* NUL termination is ensured. */ - + sdpd->sdpd_offset = *(unsigned long *)sym[i].n_value; - + sdpd->sdpd_next = g_sdt_mach_module.sdt_probes; g_sdt_mach_module.sdt_probes = sdpd; } else { @@ -669,9 +646,9 @@ void sdt_init( void ) } } } - + sdt_attach( (dev_info_t *)(uintptr_t)majdevno, DDI_ATTACH ); - + gSDTInited = 1; } else panic("sdt_init: called twice!\n"); @@ -683,19 +660,32 @@ void sdt_init( void ) void sdt_provide_module(void *arg, struct modctl *ctl) { -#pragma unused(ctl) #pragma unused(arg) - __sdt_provide_module(arg, &g_sdt_kernctl); - - sdt_probedesc_t *sdpd = g_sdt_mach_module.sdt_probes; - while (sdpd) { - sdt_probedesc_t *this_sdpd = sdpd; - kmem_free((void *)sdpd->sdpd_name, strlen(sdpd->sdpd_name) + 1); - kmem_free((void *)sdpd->sdpd_func, strlen(sdpd->sdpd_func) + 1); - sdpd = sdpd->sdpd_next; - kmem_free((void *)this_sdpd, sizeof(sdt_probedesc_t)); + ASSERT(ctl != NULL); + ASSERT(dtrace_kernel_symbol_mode != DTRACE_KERNEL_SYMBOLS_NEVER); + lck_mtx_assert(&mod_lock, LCK_MTX_ASSERT_OWNED); + + if (MOD_SDT_DONE(ctl)) + return; + + if (MOD_IS_MACH_KERNEL(ctl)) { + __sdt_provide_module(arg, &g_sdt_kernctl); + + sdt_probedesc_t *sdpd = g_sdt_mach_module.sdt_probes; + while (sdpd) { + sdt_probedesc_t *this_sdpd = sdpd; + kmem_free((void *)sdpd->sdpd_name, 
strlen(sdpd->sdpd_name) + 1); + kmem_free((void *)sdpd->sdpd_func, strlen(sdpd->sdpd_func) + 1); + sdpd = sdpd->sdpd_next; + kmem_free((void *)this_sdpd, sizeof(sdt_probedesc_t)); + } + g_sdt_mach_module.sdt_probes = NULL; + } else { + /* FIXME -- sdt in kext not yet supported */ } - g_sdt_mach_module.sdt_probes = NULL; + + /* Need to mark this module as completed */ + ctl->mod_flags |= MODCTL_SDT_PROBES_PROVIDED; } #endif /* __APPLE__ */ diff --git a/bsd/dev/dtrace/sdt_subr.c b/bsd/dev/dtrace/sdt_subr.c index 90ea1331a..891207713 100644 --- a/bsd/dev/dtrace/sdt_subr.c +++ b/bsd/dev/dtrace/sdt_subr.c @@ -92,6 +92,7 @@ sdt_provider_t sdt_providers[] = { { "proc", "__proc____", &stab_attr, 0 }, { "io", "__io____", &stab_attr, 0 }, { "ip", "__ip____", &stab_attr, 0 }, + { "tcp", "__tcp____", &stab_attr, 0 }, { "mib", "__mib____", &stab_attr, 0 }, { "fsinfo", "__fsinfo____", &fsinfo_attr, 0 }, { "nfsv3", "__nfsv3____", &stab_attr, 0 }, @@ -808,21 +809,66 @@ sdt_argdesc_t sdt_args[] = { "nfsv4cbinfo_t *" }, { "nfsv4", "cb-recall-done", 2, 2, "CB_RECALL4res *", NULL }, - { "ip", "send", 0, 0, "mblk_t *", "pktinfo_t *" }, - { "ip", "send", 1, 1, "conn_t *", "csinfo_t *" }, + { "ip", "send", 0, 0, "struct mbuf *", "pktinfo_t *" }, + { "ip", "send", 1, 1, "struct inpcb *", "csinfo_t *" }, { "ip", "send", 2, 2, "void_ip_t *", "ipinfo_t *" }, - { "ip", "send", 3, 3, "__dtrace_ipsr_ill_t *", "ifinfo_t *" }, - { "ip", "send", 4, 4, "ipha_t *", "ipv4info_t *" }, - { "ip", "send", 5, 5, "ip6_t *", "ipv6info_t *" }, - { "ip", "send", 6, 6, "int", NULL }, /* used by __dtrace_ipsr_ill_t */ - { "ip", "receive", 0, 0, "mblk_t *", "pktinfo_t *" }, - { "ip", "receive", 1, 1, "conn_t *", "csinfo_t *" }, + { "ip", "send", 3, 3, "struct ifnet *", "ifinfo_t *" }, + { "ip", "send", 4, 4, "struct ip *", "ipv4info_t *" }, + { "ip", "send", 5, 5, "struct ip6_hdr *", "ipv6info_t *" }, + { "ip", "receive", 0, 0, "struct mbuf *", "pktinfo_t *" }, + { "ip", "receive", 1, 1, "struct inpcb *", 
"csinfo_t *" }, { "ip", "receive", 2, 2, "void_ip_t *", "ipinfo_t *" }, - { "ip", "receive", 3, 3, "__dtrace_ipsr_ill_t *", "ifinfo_t *" }, - { "ip", "receive", 4, 4, "ipha_t *", "ipv4info_t *" }, - { "ip", "receive", 5, 5, "ip6_t *", "ipv6info_t *" }, - { "ip", "receive", 6, 6, "int", NULL }, /* used by __dtrace_ipsr_ill_t */ - + { "ip", "receive", 3, 3, "struct ifnet *", "ifinfo_t *" }, + { "ip", "receive", 4, 4, "struct ip *", "ipv4info_t *" }, + { "ip", "receive", 5, 5, "struct ip6_hdr *", "ipv6info_t *" }, + + { "tcp", "connect-established", 0, 0, "struct mbuf *", "pktinfo_t *" }, + { "tcp", "connect-established", 1, 1, "struct inpcb *", "csinfo_t *" }, + { "tcp", "connect-established", 2, 2, "void_ip_t *", "ipinfo_t *" }, + { "tcp", "connect-established", 3, 3, "struct tcpcb *", "tcpsinfo_t *" }, + { "tcp", "connect-established", 4, 4, "struct tcphdr *", "tcpinfo_t *" }, + { "tcp", "connect-refused", 0, 0, "struct mbuf *", "pktinfo_t *" }, + { "tcp", "connect-refused", 1, 1, "struct inpcb *", "csinfo_t *" }, + { "tcp", "connect-refused", 2, 2, "void_ip_t *", "ipinfo_t *" }, + { "tcp", "connect-refused", 3, 3, "struct tcpcb *", "tcpsinfo_t *" }, + { "tcp", "connect-refused", 4, 4, "struct tcphdr *", "tcpinfo_t *" }, + { "tcp", "connect-request", 0, 0, "struct mbuf *", "pktinfo_t *" }, + { "tcp", "connect-request", 1, 1, "struct inpcb *", "csinfo_t *" }, + { "tcp", "connect-request", 2, 2, "void_ip_t *", "ipinfo_t *" }, + { "tcp", "connect-request", 3, 3, "struct tcpcb *", "tcpsinfo_t *" }, + { "tcp", "connect-request", 4, 4, "struct tcphdr *", "tcpinfo_t *" }, + { "tcp", "accept-established", 0, 0, "struct mbuf *", "pktinfo_t *" }, + { "tcp", "accept-established", 1, 1, "struct inpcb *", "csinfo_t *" }, + { "tcp", "accept-established", 2, 2, "void_ip_t *", "ipinfo_t *" }, + { "tcp", "accept-established", 3, 3, "struct tcpcb *", "tcpsinfo_t *" }, + { "tcp", "accept-established", 4, 4, "struct tcphdr *", "tcpinfo_t *" }, + { "tcp", "accept-refused", 0, 0, 
"struct mbuf *", "pktinfo_t *" }, + { "tcp", "accept-refused", 1, 1, "struct inpcb *", "csinfo_t *" }, + { "tcp", "accept-refused", 2, 2, "void_ip_t *", "ipinfo_t *" }, + { "tcp", "accept-refused", 3, 3, "struct tcpcb *", "tcpsinfo_t *" }, + { "tcp", "accept-refused", 4, 4, "struct tcphdr *", "tcpinfo_t *" }, + { "tcp", "state-change", 0, 0, "void", "void" }, + { "tcp", "state-change", 1, 1, "struct inpcb *", "csinfo_t *" }, + { "tcp", "state-change", 2, 2, "struct tcpcb *", "tcpsinfo_t *" }, + { "tcp", "state-change", 3, 3, "int32_t", "tcpnsinfo_t *" }, + { "tcp", "send", 0, 0, "struct mbuf *", "pktinfo_t *" }, + { "tcp", "send", 1, 1, "struct inpcb *", "csinfo_t *" }, + { "tcp", "send", 2, 2, "void_ip_t *", "ipinfo_t *" }, + { "tcp", "send", 3, 3, "struct tcpcb *", "tcpsinfo_t *" }, + { "tcp", "send", 4, 4, "struct tcphdr *", "tcpinfo_t *" }, + { "tcp", "receive", 0, 0, "struct mbuf *", "pktinfo_t *" }, + { "tcp", "receive", 1, 1, "struct inpcb *", "csinfo_t *" }, + { "tcp", "receive", 2, 2, "void_ip_t *", "ipinfo_t *" }, + { "tcp", "receive", 3, 3, "struct tcpcb *", "tcpsinfo_t *" }, + { "tcp", "receive", 4, 4, "struct tcphdr *", "tcpinfo_t *" }, + { "tcp", "cc", 0, 0, "struct mbuf *", "pktinfo_t *"}, + { "tcp", "cc", 1, 1, "struct inpcb *", "csinfo_t *"}, + { "tcp", "cc", 2, 2, "struct tcpcb *", "tcpsinfo_t *"}, + { "tcp", "cc", 3, 3, "struct tcphdr *", "tcpinfo_t *"}, + { "tcp", "cc", 4, 4, "int32_t", "tcpccevent_t *"}, + { "tcp", "iaj", 0, 0, "struct tcpcb *", "tcpsinfo_t *"}, + { "tcp", "iaj", 1, 1, "uint32_t", NULL}, + { "tcp", "iaj", 2, 2, "uint32_t", NULL}, { "sysevent", "post", 0, 0, "evch_bind_t *", "syseventchaninfo_t *" }, { "sysevent", "post", 1, 1, "sysevent_impl_t *", "syseventinfo_t *" }, diff --git a/bsd/dev/dtrace/systrace.c b/bsd/dev/dtrace/systrace.c index 74ab8a105..271b2a0e1 100644 --- a/bsd/dev/dtrace/systrace.c +++ b/bsd/dev/dtrace/systrace.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. 
All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -73,10 +73,9 @@ typedef x86_saved_state_t savearea_t; #include #include -#if defined (__ppc__) || defined (__ppc64__) -#define SYSTRACE_ARTIFICIAL_FRAMES 3 -#define MACHTRACE_ARTIFICIAL_FRAMES 4 -#elif defined(__i386__) || defined (__x86_64__) +#include + +#if defined(__i386__) || defined (__x86_64__) #define SYSTRACE_ARTIFICIAL_FRAMES 2 #define MACHTRACE_ARTIFICIAL_FRAMES 3 #else @@ -107,7 +106,6 @@ systrace_stub(dtrace_id_t id, uint64_t arg0, uint64_t arg1, #pragma unused(id,arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7) } - int32_t dtrace_systrace_syscall(struct proc *pp, void *uap, int *rv) { @@ -122,24 +120,10 @@ dtrace_systrace_syscall(struct proc *pp, void *uap, int *rv) #endif syscall_arg_t *ip = (syscall_arg_t *)uap; -#if defined (__ppc__) || defined (__ppc64__) - { - savearea_t *regs = (savearea_t *)find_user_regs(current_thread()); - - flavor = (((unsigned int)regs->save_r0) == 0)? 1: 0; - - if (flavor) - code = regs->save_r3; - else - code = regs->save_r0; - - /* - * FIXME: unix_syscall screens for "unsafe calls" and instead calls nosys(), *not* sysent[code] ! 
- */ - } -#elif defined(__i386__) || defined (__x86_64__) +#if defined(__i386__) || defined (__x86_64__) #pragma unused(flavor) { + pal_register_cache_state(current_thread(), VALID); x86_saved_state_t *tagged_regs = (x86_saved_state_t *)find_user_regs(current_thread()); if (is_saved_state64(tagged_regs)) { @@ -482,7 +466,7 @@ systrace_destroy(void *arg, dtrace_id_t id, void *parg) } /*ARGSUSED*/ -static void +static int systrace_enable(void *arg, dtrace_id_t id, void *parg) { #pragma unused(arg) /* __APPLE__ */ @@ -505,7 +489,7 @@ systrace_enable(void *arg, dtrace_id_t id, void *parg) if (enabled) { ASSERT(sysent[sysnum].sy_callc == dtrace_systrace_syscall); - return; + return(0); } (void) casptr(&sysent[sysnum].sy_callc, @@ -516,6 +500,7 @@ systrace_enable(void *arg, dtrace_id_t id, void *parg) (void *)systrace_sysent32[sysnum].stsy_underlying, (void *)dtrace_systrace_syscall32); #endif + return (0); } /*ARGSUSED*/ @@ -740,17 +725,13 @@ typedef void mach_munge_t(const void *, void *); typedef struct { int mach_trap_arg_count; int (*mach_trap_function)(void); -#if defined(__i386__) - boolean_t mach_trap_stack; -#else +#if 0 /* no active architectures use mungers for mach traps */ mach_munge_t *mach_trap_arg_munge32; /* system call arguments for 32-bit */ mach_munge_t *mach_trap_arg_munge64; /* system call arguments for 64-bit */ #endif -#if !MACH_ASSERT - int mach_trap_unused; -#else +#if MACH_ASSERT const char* mach_trap_name; -#endif /* !MACH_ASSERT */ +#endif /* MACH_ASSERT */ } mach_trap_t; extern mach_trap_t mach_trap_table[]; @@ -803,20 +784,10 @@ dtrace_machtrace_syscall(struct mach_call_args *args) syscall_arg_t *ip = (syscall_arg_t *)args; mach_call_t mach_call; -#if defined (__ppc__) || defined (__ppc64__) - { - savearea_t *regs = (savearea_t *)find_user_regs(current_thread()); - - flavor = (((unsigned int)regs->save_r0) == 0)? 
1: 0; - - if (flavor) - code = -regs->save_r3; - else - code = -regs->save_r0; - } -#elif defined(__i386__) || defined (__x86_64__) +#if defined(__i386__) || defined (__x86_64__) #pragma unused(flavor) { + pal_register_cache_state(current_thread(), VALID); x86_saved_state_t *tagged_regs = (x86_saved_state_t *)find_user_regs(current_thread()); if (is_saved_state64(tagged_regs)) { @@ -937,7 +908,7 @@ machtrace_destroy(void *arg, dtrace_id_t id, void *parg) } /*ARGSUSED*/ -static void +static int machtrace_enable(void *arg, dtrace_id_t id, void *parg) { #pragma unused(arg) /* __APPLE__ */ @@ -954,12 +925,13 @@ machtrace_enable(void *arg, dtrace_id_t id, void *parg) if (enabled) { ASSERT(sysent[sysnum].sy_callc == (void *)dtrace_machtrace_syscall); - return; + return(0); } (void) casptr(&mach_trap_table[sysnum].mach_trap_function, (void *)machtrace_sysent[sysnum].stsy_underlying, (void *)dtrace_machtrace_syscall); + return(0); } /*ARGSUSED*/ diff --git a/bsd/dev/i386/conf.c b/bsd/dev/i386/conf.c index 964f945bf..b7de69df7 100644 --- a/bsd/dev/i386/conf.c +++ b/bsd/dev/i386/conf.c @@ -288,6 +288,7 @@ struct cdevsw cdevsw[] = }; int nchrdev = sizeof (cdevsw) / sizeof (cdevsw[0]); +uint64_t cdevsw_flags[sizeof (cdevsw) / sizeof (cdevsw[0])]; #include /* for VCHR and VBLK */ /* diff --git a/bsd/dev/i386/dtrace_isa.c b/bsd/dev/i386/dtrace_isa.c index 65749f9df..88e789fce 100644 --- a/bsd/dev/i386/dtrace_isa.c +++ b/bsd/dev/i386/dtrace_isa.c @@ -48,6 +48,8 @@ typedef x86_saved_state_t savearea_t; #include #include #include +#include +#include /* * APPLE NOTE: The regmap is used to decode which 64bit uregs[] register @@ -126,11 +128,6 @@ dtrace_getipl(void) /* * MP coordination */ - -extern void mp_broadcast( - void (*action_func)(void *), - void *arg); - typedef struct xcArg { processorid_t cpu; dtrace_xcall_t f; @@ -147,6 +144,7 @@ xcRemote( void *foo ) } } + /* * dtrace_xcall() is not called from probe context. 
*/ @@ -159,13 +157,17 @@ dtrace_xcall(processorid_t cpu, dtrace_xcall_t f, void *arg) xcArg.f = f; xcArg.arg = arg; - mp_broadcast( xcRemote, (void *)&xcArg); + if (cpu == DTRACE_CPUALL) { + mp_cpus_call (CPUMASK_ALL, SYNC, xcRemote, (void*)&xcArg); + } + else { + mp_cpus_call (cpu_to_cpumask((cpu_t)cpu), SYNC, xcRemote, (void*)&xcArg); + } } /* * Runtime and ABI */ - uint64_t dtrace_getreg(struct regs *savearea, uint_t reg) { @@ -420,6 +422,7 @@ dtrace_getupcstack(uint64_t *pcstack, int pcstack_limit) if (thread == NULL) goto zero; + pal_register_cache_state(thread, VALID); regs = (x86_saved_state_t *)find_user_regs(thread); if (regs == NULL) goto zero; @@ -483,6 +486,7 @@ dtrace_getustackdepth(void) if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT)) return (-1); + pal_register_cache_state(thread, VALID); regs = (x86_saved_state_t *)find_user_regs(thread); if (regs == NULL) return 0; @@ -746,7 +750,9 @@ dtrace_getarg(int arg, int aframes) fp = fp->backchain; pc = fp->retaddr; - if (pc == (uintptr_t)dtrace_invop_callsite) { + if (dtrace_invop_callsite_pre != NULL + && pc > (uintptr_t)dtrace_invop_callsite_pre + && pc <= (uintptr_t)dtrace_invop_callsite_post) { #if defined(__i386__) /* * If we pass through the invalid op handler, we will @@ -783,8 +789,10 @@ dtrace_getarg(int arg, int aframes) if (arg <= inreg) { stack = (uintptr_t *)&saved_state->rdi; } else { - stack = (uintptr_t *)(saved_state->isf.rsp); - arg -= inreg; + fp = (struct frame *)(saved_state->isf.rsp); + stack = (uintptr_t *)&fp[1]; /* Find marshalled + arguments */ + arg -= inreg + 1; } #else #error Unknown arch @@ -794,7 +802,11 @@ dtrace_getarg(int arg, int aframes) } /* - * Arrive here when provider has called dtrace_probe directly. + * We know that we did not come through a trap to get into + * dtrace_probe() -- We arrive here when the provider has + * called dtrace_probe() directly. + * The probe ID is the first argument to dtrace_probe(). + * We must advance beyond that to get the argX. 
*/ arg++; /* Advance past probeID */ @@ -815,7 +827,8 @@ dtrace_getarg(int arg, int aframes) load: DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); - val = *(((uint64_t *)stack) + arg); /* dtrace_probe arguments arg0 .. arg4 are 64bits wide */ + /* dtrace_probe arguments arg0 ... arg4 are 64bits wide */ + val = (uint64_t)(*(((uintptr_t *)stack) + arg)); DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); return (val); diff --git a/bsd/dev/i386/fasttrap_isa.c b/bsd/dev/i386/fasttrap_isa.c index be620b517..e3bfb9402 100644 --- a/bsd/dev/i386/fasttrap_isa.c +++ b/bsd/dev/i386/fasttrap_isa.c @@ -45,6 +45,8 @@ extern dtrace_id_t dtrace_probeid_error; #include #include +#include + /* Solaris proc_t is the struct. Darwin's proc_t is a pointer to it. */ #define proc_t struct proc /* Steer clear of the Darwin typedef for proc_t */ @@ -2207,11 +2209,11 @@ fasttrap_return_probe(x86_saved_state_t *regs) return (0); } - uint64_t fasttrap_pid_getarg(void *arg, dtrace_id_t id, void *parg, int argno, int aframes) { + pal_register_cache_state(current_thread(), VALID); #pragma unused(arg, id, parg, aframes) return (fasttrap_anarg((x86_saved_state_t *)find_user_regs(current_thread()), 1, argno)); } @@ -2220,6 +2222,7 @@ uint64_t fasttrap_usdt_getarg(void *arg, dtrace_id_t id, void *parg, int argno, int aframes) { + pal_register_cache_state(current_thread(), VALID); #pragma unused(arg, id, parg, aframes) return (fasttrap_anarg((x86_saved_state_t *)find_user_regs(current_thread()), 0, argno)); } diff --git a/bsd/dev/i386/fbt_x86.c b/bsd/dev/i386/fbt_x86.c index 19d461ac2..baec24f83 100644 --- a/bsd/dev/i386/fbt_x86.c +++ b/bsd/dev/i386/fbt_x86.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -101,7 +102,9 @@ extern dtrace_provider_id_t fbt_id; extern fbt_probe_t **fbt_probetab; extern int fbt_probetab_mask; -kern_return_t fbt_perfCallback(int, x86_saved_state_t *, __unused int, __unused int); +extern int gIgnoreFBTBlacklist; /* From fbt_init */ + +kern_return_t 
fbt_perfCallback(int, x86_saved_state_t *, uintptr_t *, __unused int); /* * Critical routines that must not be probed. PR_5221096, PR_5379018. @@ -144,6 +147,7 @@ static const char * critical_blacklist[] = "cpu_topology_start_cpu", "cpu_type", "cpuid_cpu_display", + "cpuid_extfeatures", "handle_pending_TLB_flushes", "hw_compare_and_store", "machine_idle_cstate", @@ -171,8 +175,8 @@ static const char * probe_ctx_closure[] = "IS_64BIT_PROCESS", "OSCompareAndSwap", "absolutetime_to_microtime", + "act_set_astbsd", "ast_pending", - "astbsd_on", "clock_get_calendar_nanotime_nowait", "copyin", "copyin_user", @@ -257,6 +261,238 @@ static const void * bsearch( return (NULL); } +/* + * Module validation + */ +static int +is_module_valid(struct modctl* ctl) +{ + ASSERT(!MOD_FBT_PROBES_PROVIDED(ctl)); + ASSERT(!MOD_FBT_INVALID(ctl)); + + if (0 == ctl->mod_address || 0 == ctl->mod_size) { + return FALSE; + } + + if (0 == ctl->mod_loaded) { + return FALSE; + } + + if (strstr(ctl->mod_modname, "CHUD") != NULL) + return FALSE; + + /* + * If the user sets this, trust they know what they are doing. + */ + if (gIgnoreFBTBlacklist) /* per boot-arg set in fbt_init() */ + return TRUE; + + /* + * These drivers control low level functions that when traced + * cause problems, especially in the sleep/wake paths. + * If somebody really wants to drill in on one of these kexts, then + * they can override blacklisting using the boot-arg above. 
+ */ + + if (strstr(ctl->mod_modname, "AppleACPIEC") != NULL) + return FALSE; + + if (strstr(ctl->mod_modname, "AppleACPIPlatform") != NULL) + return FALSE; + + if (strstr(ctl->mod_modname, "AppleRTC") != NULL) + return FALSE; + + if (strstr(ctl->mod_modname, "IOACPIFamily") != NULL) + return FALSE; + + if (strstr(ctl->mod_modname, "AppleIntelCPUPowerManagement") != NULL) + return FALSE; + + if (strstr(ctl->mod_modname, "AppleProfile") != NULL) + return FALSE; + + if (strstr(ctl->mod_modname, "AppleIntelProfile") != NULL) + return FALSE; + + + + return TRUE; +} + +/* + * FBT probe name validation + */ +static int +is_symbol_valid(const char* name) +{ + /* + * If the user set this, trust they know what they are doing. + */ + if (gIgnoreFBTBlacklist) + return TRUE; + + if (LIT_STRNSTART(name, "dtrace_") && !LIT_STRNSTART(name, "dtrace_safe_")) { + /* + * Anything beginning with "dtrace_" may be called + * from probe context unless it explitly indicates + * that it won't be called from probe context by + * using the prefix "dtrace_safe_". + */ + return FALSE; + } + + if (LIT_STRNSTART(name, "fasttrap_") || + LIT_STRNSTART(name, "fuword") || + LIT_STRNSTART(name, "suword") || + LIT_STRNEQL(name, "sprlock") || + LIT_STRNEQL(name, "sprunlock") || + LIT_STRNEQL(name, "uread") || + LIT_STRNEQL(name, "uwrite")) { + return FALSE; /* Fasttrap inner-workings. */ + } + + if (LIT_STRNSTART(name, "dsmos_")) + return FALSE; /* Don't Steal Mac OS X! */ + + if (LIT_STRNSTART(name, "_dtrace")) + return FALSE; /* Shims in dtrace.c */ + + if (LIT_STRNSTART(name, "chud")) + return FALSE; /* Professional courtesy. */ + + if (LIT_STRNSTART(name, "hibernate_")) + return FALSE; /* Let sleeping dogs lie. 
*/ + + if (LIT_STRNEQL(name, "_ZNK6OSData14getBytesNoCopyEv")) + return FALSE; /* Data::getBytesNoCopy, IOHibernateSystemWake path */ + + if (LIT_STRNEQL(name, "_ZN9IOService14newTemperatureElPS_") || /* IOService::newTemperature */ + LIT_STRNEQL(name, "_ZN9IOService26temperatureCriticalForZoneEPS_")) { /* IOService::temperatureCriticalForZone */ + return FALSE; /* Per the fire code */ + } + + /* + * Place no probes (illegal instructions) in the exception handling path! + */ + if (LIT_STRNEQL(name, "t_invop") || + LIT_STRNEQL(name, "enter_lohandler") || + LIT_STRNEQL(name, "lo_alltraps") || + LIT_STRNEQL(name, "kernel_trap") || + LIT_STRNEQL(name, "interrupt") || + LIT_STRNEQL(name, "i386_astintr")) { + return FALSE; + } + + if (LIT_STRNEQL(name, "current_thread") || + LIT_STRNEQL(name, "ast_pending") || + LIT_STRNEQL(name, "fbt_perfCallback") || + LIT_STRNEQL(name, "machine_thread_get_kern_state") || + LIT_STRNEQL(name, "get_threadtask") || + LIT_STRNEQL(name, "ml_set_interrupts_enabled") || + LIT_STRNEQL(name, "dtrace_invop") || + LIT_STRNEQL(name, "fbt_invop") || + LIT_STRNEQL(name, "sdt_invop") || + LIT_STRNEQL(name, "max_valid_stack_address")) { + return FALSE; + } + + /* + * Voodoo. 
+ */ + if (LIT_STRNSTART(name, "machine_stack_") || + LIT_STRNSTART(name, "mapping_") || + LIT_STRNEQL(name, "tmrCvt") || + + LIT_STRNSTART(name, "tsc_") || + + LIT_STRNSTART(name, "pmCPU") || + LIT_STRNEQL(name, "pmKextRegister") || + LIT_STRNEQL(name, "pmMarkAllCPUsOff") || + LIT_STRNEQL(name, "pmSafeMode") || + LIT_STRNEQL(name, "pmTimerSave") || + LIT_STRNEQL(name, "pmTimerRestore") || + LIT_STRNEQL(name, "pmUnRegister") || + LIT_STRNSTART(name, "pms") || + LIT_STRNEQL(name, "power_management_init") || + LIT_STRNSTART(name, "usimple_") || + LIT_STRNSTART(name, "lck_spin_lock") || + LIT_STRNSTART(name, "lck_spin_unlock") || + + LIT_STRNSTART(name, "rtc_") || + LIT_STRNSTART(name, "_rtc_") || + LIT_STRNSTART(name, "rtclock_") || + LIT_STRNSTART(name, "clock_") || + LIT_STRNSTART(name, "absolutetime_to_") || + LIT_STRNEQL(name, "setPop") || + LIT_STRNEQL(name, "nanoseconds_to_absolutetime") || + LIT_STRNEQL(name, "nanotime_to_absolutetime") || + + LIT_STRNSTART(name, "etimer_") || + + LIT_STRNSTART(name, "commpage_") || + LIT_STRNSTART(name, "pmap_") || + LIT_STRNSTART(name, "ml_") || + LIT_STRNSTART(name, "PE_") || + LIT_STRNEQL(name, "kprintf") || + LIT_STRNSTART(name, "lapic_") || + LIT_STRNSTART(name, "act_machine") || + LIT_STRNSTART(name, "acpi_") || + LIT_STRNSTART(name, "pal_")){ + return FALSE; + } + + /* + * Avoid machine_ routines. PR_5346750. + */ + if (LIT_STRNSTART(name, "machine_")) + return FALSE; + + if (LIT_STRNEQL(name, "handle_pending_TLB_flushes")) + return FALSE; + + /* + * Place no probes on critical routines. PR_5221096 + */ + if (bsearch( name, critical_blacklist, CRITICAL_BLACKLIST_COUNT, sizeof(name), _cmp ) != NULL) + return FALSE; + + /* + * Place no probes that could be hit in probe context. + */ + if (bsearch( name, probe_ctx_closure, PROBE_CTX_CLOSURE_COUNT, sizeof(name), _cmp ) != NULL) { + return FALSE; + } + + /* + * Place no probes that could be hit on the way to the debugger. 
+ */ + if (LIT_STRNSTART(name, "kdp_") || + LIT_STRNSTART(name, "kdb_") || + LIT_STRNSTART(name, "kdbg_") || + LIT_STRNSTART(name, "kdebug_") || + LIT_STRNSTART(name, "kernel_debug") || + LIT_STRNEQL(name, "Debugger") || + LIT_STRNEQL(name, "Call_DebuggerC") || + LIT_STRNEQL(name, "lock_debugger") || + LIT_STRNEQL(name, "unlock_debugger") || + LIT_STRNEQL(name, "SysChoked")) { + return FALSE; + } + + + /* + * Place no probes that could be hit on the way to a panic. + */ + if (NULL != strstr(name, "panic_") || + LIT_STRNEQL(name, "panic") || + LIT_STRNEQL(name, "preemption_underflow_panic")) { + return FALSE; + } + + return TRUE; +} + #if defined(__i386__) int fbt_invop(uintptr_t addr, uintptr_t *stack, uintptr_t rval) @@ -313,8 +549,8 @@ kern_return_t fbt_perfCallback( int trapno, x86_saved_state_t *tagged_regs, - __unused int unused1, - __unused int unused2) + uintptr_t *lo_spp, + __unused int unused ) { kern_return_t retval = KERN_FAILURE; x86_saved_state32_t *saved_state = saved_state32(tagged_regs); @@ -322,7 +558,8 @@ fbt_perfCallback( if (FBT_EXCEPTION_CODE == trapno && !IS_USER_TRAP(saved_state)) { boolean_t oldlevel, cpu_64bit; - uint32_t esp_probe, *ebp, edi, fp, *pDst, delta = 0; + uint32_t esp_probe, fp, *pDst, delta = 0; + uintptr_t old_sp; int emul; cpu_64bit = ml_is64bit(); @@ -335,10 +572,26 @@ fbt_perfCallback( esp_probe = (uint32_t)&(regs[1]); /* Nasty, infer the location above the save area */ } + __asm__ volatile( + "Ldtrace_invop_callsite_pre_label:\n" + ".data\n" + ".private_extern _dtrace_invop_callsite_pre\n" + "_dtrace_invop_callsite_pre:\n" + " .long Ldtrace_invop_callsite_pre_label\n" + ".text\n" + ); + emul = dtrace_invop( saved_state->eip, (uintptr_t *)esp_probe, saved_state->eax ); - __asm__ volatile(".globl _dtrace_invop_callsite"); - __asm__ volatile("_dtrace_invop_callsite:"); + __asm__ volatile( + "Ldtrace_invop_callsite_post_label:\n" + ".data\n" + ".private_extern _dtrace_invop_callsite_post\n" + "_dtrace_invop_callsite_post:\n" + 
" .long Ldtrace_invop_callsite_post_label\n" + ".text\n" + ); + switch (emul) { case DTRACE_INVOP_NOP: saved_state->eip += DTRACE_INVOP_NOP_SKIP; /* Skip over the patched NOP (planted by sdt.) */ @@ -379,27 +632,18 @@ fbt_perfCallback( if (cpu_64bit) saved_state->uesp += (delta << 2); - -/* XXX Fragile in the extreme. Obtain the value of %edi that our caller pushed - * (on behalf of its caller -- trap_from_kernel()). Ultimately, - * trap_from_kernel's stack pointer is restored from this slot. - * This is sensitive to the manner in which the compiler preserves %edi, - * and trap_from_kernel()'s internals. - */ - ebp = (uint32_t *)__builtin_frame_address(0); - ebp = (uint32_t *)*ebp; - edi = *(ebp - 1); +/* Obtain the stack pointer recorded by the trampolines */ + old_sp = *lo_spp; /* Shift contents of stack */ for (pDst = (uint32_t *)fp; - pDst > (((uint32_t *)edi)); + pDst > (((uint32_t *)old_sp)); pDst--) *pDst = pDst[-delta]; /* Track the stack lift in "saved_state". */ saved_state = (x86_saved_state32_t *) (((uintptr_t)saved_state) + (delta << 2)); - -/* Now adjust the value of %edi in our caller (kernel_trap)'s frame */ - *(ebp - 1) = edi + (delta << 2); +/* Adjust the stack pointer utilized by the trampolines */ + *lo_spp = old_sp + (delta << 2); retval = KERN_SUCCESS; break; @@ -418,47 +662,299 @@ fbt_perfCallback( /*ARGSUSED*/ static void -__fbt_provide_module(void *arg, struct modctl *ctl) +__provide_probe_32(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, char *modname, char* symbolName, machine_inst_t* symbolStart) { -#pragma unused(arg) - kernel_mach_header_t *mh; - struct load_command *cmd; - kernel_segment_command_t *orig_ts = NULL, *orig_le = NULL; - struct symtab_command *orig_st = NULL; - struct nlist *sym = NULL; - char *strings; - uintptr_t instrLow, instrHigh; - char *modname; - unsigned int i, j; + unsigned int j; + unsigned int doenable = 0; + dtrace_id_t thisid; - int gIgnoreFBTBlacklist = 0; - 
PE_parse_boot_argn("IgnoreFBTBlacklist", &gIgnoreFBTBlacklist, sizeof (gIgnoreFBTBlacklist)); - - mh = (kernel_mach_header_t *)(ctl->address); - modname = ctl->mod_modname; + fbt_probe_t *newfbt, *retfbt, *entryfbt; + machine_inst_t *instr, *limit, theInstr, i1, i2; + int size; - if (0 == ctl->address || 0 == ctl->size) /* Has the linker been jettisoned? */ + for (j = 0, instr = symbolStart, theInstr = 0; + (j < 4) && ((uintptr_t)instr >= instrLow) && (instrHigh > (uintptr_t)(instr + 2)); + j++) { + theInstr = instr[0]; + if (theInstr == FBT_PUSHL_EBP || theInstr == FBT_RET || theInstr == FBT_RET_IMM16) + break; + + if ((size = dtrace_instr_size(instr)) <= 0) + break; + + instr += size; + } + + if (theInstr != FBT_PUSHL_EBP) return; - + + i1 = instr[1]; + i2 = instr[2]; + + limit = (machine_inst_t *)instrHigh; + + if ((i1 == FBT_MOVL_ESP_EBP0_V0 && i2 == FBT_MOVL_ESP_EBP1_V0) || + (i1 == FBT_MOVL_ESP_EBP0_V1 && i2 == FBT_MOVL_ESP_EBP1_V1)) { + instr += 1; /* Advance to the movl %esp,%ebp */ + theInstr = i1; + } else { + /* + * Sometimes, the compiler will schedule an intervening instruction + * in the function prologue. 
Example: + * + * _mach_vm_read: + * 000006d8 pushl %ebp + * 000006d9 movl $0x00000004,%edx + * 000006de movl %esp,%ebp + * + * Try the next instruction, to see if it is a movl %esp,%ebp + */ + + instr += 1; /* Advance past the pushl %ebp */ + if ((size = dtrace_instr_size(instr)) <= 0) + return; + + instr += size; + + if ((instr + 1) >= limit) + return; + + i1 = instr[0]; + i2 = instr[1]; + + if (!(i1 == FBT_MOVL_ESP_EBP0_V0 && i2 == FBT_MOVL_ESP_EBP1_V0) && + !(i1 == FBT_MOVL_ESP_EBP0_V1 && i2 == FBT_MOVL_ESP_EBP1_V1)) + return; + + /* instr already points at the movl %esp,%ebp */ + theInstr = i1; + } + + thisid = dtrace_probe_lookup(fbt_id, modname, symbolName, FBT_ENTRY); + newfbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP); + strlcpy( (char *)&(newfbt->fbtp_name), symbolName, MAX_FBTP_NAME_CHARS ); + + if (thisid != 0) { + /* + * The dtrace_probe previously existed, so we have to hook + * the newfbt entry onto the end of the existing fbt's chain. + * If we find an fbt entry that was previously patched to + * fire, (as indicated by the current patched value), then + * we want to enable this newfbt on the spot. + */ + entryfbt = dtrace_probe_arg (fbt_id, thisid); + ASSERT (entryfbt != NULL); + for(; entryfbt != NULL; entryfbt = entryfbt->fbtp_next) { + if (entryfbt->fbtp_currentval == entryfbt->fbtp_patchval) + doenable++; + + if (entryfbt->fbtp_next == NULL) { + entryfbt->fbtp_next = newfbt; + newfbt->fbtp_id = entryfbt->fbtp_id; + break; + } + } + } + else { + /* + * The dtrace_probe did not previously exist, so we + * create it and hook in the newfbt. Since the probe is + * new, we obviously do not need to enable it on the spot. 
+ */ + newfbt->fbtp_id = dtrace_probe_create(fbt_id, modname, symbolName, FBT_ENTRY, FBT_AFRAMES_ENTRY, newfbt); + doenable = 0; + } + + + newfbt->fbtp_patchpoint = instr; + newfbt->fbtp_ctl = ctl; + newfbt->fbtp_loadcnt = ctl->mod_loadcnt; + newfbt->fbtp_rval = DTRACE_INVOP_MOVL_ESP_EBP; + newfbt->fbtp_savedval = theInstr; + newfbt->fbtp_patchval = FBT_PATCHVAL; + newfbt->fbtp_currentval = 0; + newfbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)]; + fbt_probetab[FBT_ADDR2NDX(instr)] = newfbt; + + if (doenable) + fbt_enable(NULL, newfbt->fbtp_id, newfbt); + /* - * Employees of dtrace and their families are ineligible. Void - * where prohibited. + * The fbt entry chain is in place, one entry point per symbol. + * The fbt return chain can have multiple return points per symbol. + * Here we find the end of the fbt return chain. */ - - if (LIT_STRNEQL(modname, "com.apple.driver.dtrace")) + + doenable=0; + + thisid = dtrace_probe_lookup(fbt_id, modname, symbolName, FBT_RETURN); + if (thisid != 0) { + /* The dtrace_probe previously existed, so we have to + * find the end of the existing fbt chain. If we find + * an fbt return that was previously patched to fire, + * (as indicated by the currrent patched value), then + * we want to enable any new fbts on the spot. + */ + retfbt = dtrace_probe_arg (fbt_id, thisid); + ASSERT(retfbt != NULL); + for (; retfbt != NULL; retfbt = retfbt->fbtp_next) { + if (retfbt->fbtp_currentval == retfbt->fbtp_patchval) + doenable++; + if(retfbt->fbtp_next == NULL) + break; + } + } + else { + doenable = 0; + retfbt = NULL; + } + +again: + if (instr >= limit) return; - - if (strstr(modname, "CHUD") != NULL) + + /* + * If this disassembly fails, then we've likely walked off into + * a jump table or some other unsuitable area. Bail out of the + * disassembly now. 
+ */ + if ((size = dtrace_instr_size(instr)) <= 0) + return; + + /* + * We (desperately) want to avoid erroneously instrumenting a + * jump table, especially given that our markers are pretty + * short: two bytes on x86, and just one byte on amd64. To + * determine if we're looking at a true instruction sequence + * or an inline jump table that happens to contain the same + * byte sequences, we resort to some heuristic sleeze: we + * treat this instruction as being contained within a pointer, + * and see if that pointer points to within the body of the + * function. If it does, we refuse to instrument it. + */ + for (j = 0; j < sizeof (uintptr_t); j++) { + uintptr_t check = (uintptr_t)instr - j; + uint8_t *ptr; + + if (check < (uintptr_t)symbolStart) + break; + + if (check + sizeof (uintptr_t) > (uintptr_t)limit) + continue; + + ptr = *(uint8_t **)check; + + if (ptr >= (uint8_t *)symbolStart && ptr < limit) { + instr += size; + goto again; + } + } + + /* + * OK, it's an instruction. + */ + theInstr = instr[0]; + + /* Walked onto the start of the next routine? If so, bail out of this function. */ + if (theInstr == FBT_PUSHL_EBP) + return; + + if (!(size == 1 && (theInstr == FBT_POPL_EBP || theInstr == FBT_LEAVE))) { + instr += size; + goto again; + } + + /* + * Found the popl %ebp; or leave. + */ + machine_inst_t *patch_instr = instr; + + /* + * Scan forward for a "ret", or "jmp". + */ + instr += size; + if (instr >= limit) + return; + + size = dtrace_instr_size(instr); + if (size <= 0) /* Failed instruction decode? 
*/ + return; + + theInstr = instr[0]; + + if (!(size == FBT_RET_LEN && (theInstr == FBT_RET)) && + !(size == FBT_RET_IMM16_LEN && (theInstr == FBT_RET_IMM16)) && + !(size == FBT_JMP_SHORT_REL_LEN && (theInstr == FBT_JMP_SHORT_REL)) && + !(size == FBT_JMP_NEAR_REL_LEN && (theInstr == FBT_JMP_NEAR_REL)) && + !(size == FBT_JMP_FAR_ABS_LEN && (theInstr == FBT_JMP_FAR_ABS))) return; + + /* + * popl %ebp; ret; or leave; ret; or leave; jmp tailCalledFun; -- We have a winner! + */ + newfbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP); + strlcpy( (char *)&(newfbt->fbtp_name), symbolName, MAX_FBTP_NAME_CHARS ); + + if (retfbt == NULL) { + newfbt->fbtp_id = dtrace_probe_create(fbt_id, modname, + symbolName, FBT_RETURN, FBT_AFRAMES_RETURN, newfbt); + } else { + retfbt->fbtp_next = newfbt; + newfbt->fbtp_id = retfbt->fbtp_id; + } + + retfbt = newfbt; + newfbt->fbtp_patchpoint = patch_instr; + newfbt->fbtp_ctl = ctl; + newfbt->fbtp_loadcnt = ctl->mod_loadcnt; + + if (*patch_instr == FBT_POPL_EBP) { + newfbt->fbtp_rval = DTRACE_INVOP_POPL_EBP; + } else { + ASSERT(*patch_instr == FBT_LEAVE); + newfbt->fbtp_rval = DTRACE_INVOP_LEAVE; + } + newfbt->fbtp_roffset = + (uintptr_t)(patch_instr - (uint8_t *)symbolStart); + + newfbt->fbtp_savedval = *patch_instr; + newfbt->fbtp_patchval = FBT_PATCHVAL; + newfbt->fbtp_currentval = 0; + newfbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(patch_instr)]; + fbt_probetab[FBT_ADDR2NDX(patch_instr)] = newfbt; + + if (doenable) + fbt_enable(NULL, newfbt->fbtp_id, newfbt); + + instr += size; + goto again; +} +static void +__kernel_syms_provide_module(void *arg, struct modctl *ctl) +{ +#pragma unused(arg) + kernel_mach_header_t *mh; + struct load_command *cmd; + kernel_segment_command_t *orig_ts = NULL, *orig_le = NULL; + struct symtab_command *orig_st = NULL; + struct nlist *sym = NULL; + char *strings; + uintptr_t instrLow, instrHigh; + char *modname; + unsigned int i; + + mh = (kernel_mach_header_t *)(ctl->mod_address); + modname = 
ctl->mod_modname; + if (mh->magic != MH_MAGIC) return; - + cmd = (struct load_command *) &mh[1]; for (i = 0; i < mh->ncmds; i++) { if (cmd->cmd == LC_SEGMENT_KERNEL) { kernel_segment_command_t *orig_sg = (kernel_segment_command_t *) cmd; - + if (LIT_STRNEQL(orig_sg->segname, SEG_TEXT)) orig_ts = orig_sg; else if (LIT_STRNEQL(orig_sg->segname, SEG_LINKEDIT)) @@ -468,370 +964,75 @@ __fbt_provide_module(void *arg, struct modctl *ctl) } else if (cmd->cmd == LC_SYMTAB) orig_st = (struct symtab_command *) cmd; - + cmd = (struct load_command *) ((caddr_t) cmd + cmd->cmdsize); } - + if ((orig_ts == NULL) || (orig_st == NULL) || (orig_le == NULL)) return; - + sym = (struct nlist *)(orig_le->vmaddr + orig_st->symoff - orig_le->fileoff); strings = (char *)(orig_le->vmaddr + orig_st->stroff - orig_le->fileoff); - + /* Find extent of the TEXT section */ instrLow = (uintptr_t)orig_ts->vmaddr; instrHigh = (uintptr_t)(orig_ts->vmaddr + orig_ts->vmsize); - + for (i = 0; i < orig_st->nsyms; i++) { - fbt_probe_t *fbt, *retfbt; - machine_inst_t *instr, *limit, theInstr, i1, i2; uint8_t n_type = sym[i].n_type & (N_TYPE | N_EXT); char *name = strings + sym[i].n_un.n_strx; - int size; - + /* Check that the symbol is a global and that it has a name. */ if (((N_SECT | N_EXT) != n_type && (N_ABS | N_EXT) != n_type)) continue; - + if (0 == sym[i].n_un.n_strx) /* iff a null, "", name. */ continue; /* Lop off omnipresent leading underscore. */ if (*name == '_') name += 1; - - if (LIT_STRNSTART(name, "dtrace_") && !LIT_STRNSTART(name, "dtrace_safe_")) { - /* - * Anything beginning with "dtrace_" may be called - * from probe context unless it explitly indicates - * that it won't be called from probe context by - * using the prefix "dtrace_safe_". - */ - continue; - } - - if (LIT_STRNSTART(name, "dsmos_")) - continue; /* Don't Steal Mac OS X! */ - - if (LIT_STRNSTART(name, "_dtrace")) - continue; /* Shims in dtrace.c */ - - if (LIT_STRNSTART(name, "chud")) - continue; /* Professional courtesy. 
*/ - - if (LIT_STRNSTART(name, "hibernate_")) - continue; /* Let sleeping dogs lie. */ - if (LIT_STRNEQL(name, "_ZN9IOService14newTemperatureElPS_") || /* IOService::newTemperature */ - LIT_STRNEQL(name, "_ZN9IOService26temperatureCriticalForZoneEPS_")) /* IOService::temperatureCriticalForZone */ - continue; /* Per the fire code */ - /* - * Place no probes (illegal instructions) in the exception handling path! + * We're only blacklisting functions in the kernel for now. */ - if (LIT_STRNEQL(name, "t_invop") || - LIT_STRNEQL(name, "enter_lohandler") || - LIT_STRNEQL(name, "lo_alltraps") || - LIT_STRNEQL(name, "kernel_trap") || - LIT_STRNEQL(name, "interrupt") || - LIT_STRNEQL(name, "i386_astintr")) - continue; - - if (LIT_STRNEQL(name, "current_thread") || - LIT_STRNEQL(name, "ast_pending") || - LIT_STRNEQL(name, "fbt_perfCallback") || - LIT_STRNEQL(name, "machine_thread_get_kern_state") || - LIT_STRNEQL(name, "get_threadtask") || - LIT_STRNEQL(name, "ml_set_interrupts_enabled") || - LIT_STRNEQL(name, "dtrace_invop") || - LIT_STRNEQL(name, "fbt_invop") || - LIT_STRNEQL(name, "sdt_invop") || - LIT_STRNEQL(name, "max_valid_stack_address")) + if (MOD_IS_MACH_KERNEL(ctl) && !is_symbol_valid(name)) continue; + + __provide_probe_32(ctl, instrLow, instrHigh, modname, name, (machine_inst_t*)sym[i].n_value); + } +} - /* - * Voodoo. 
- */ - if (LIT_STRNSTART(name, "machine_stack_") || - LIT_STRNSTART(name, "mapping_") || - LIT_STRNEQL(name, "tmrCvt") || - - LIT_STRNSTART(name, "tsc_") || - - LIT_STRNSTART(name, "pmCPU") || - LIT_STRNEQL(name, "pmKextRegister") || - LIT_STRNEQL(name, "pmMarkAllCPUsOff") || - LIT_STRNEQL(name, "pmSafeMode") || - LIT_STRNEQL(name, "pmTimerSave") || - LIT_STRNEQL(name, "pmTimerRestore") || - LIT_STRNEQL(name, "pmUnRegister") || - LIT_STRNSTART(name, "pms") || - LIT_STRNEQL(name, "power_management_init") || - LIT_STRNSTART(name, "usimple_") || - LIT_STRNEQL(name, "lck_spin_lock") || - LIT_STRNEQL(name, "lck_spin_unlock") || - - LIT_STRNSTART(name, "rtc_") || - LIT_STRNSTART(name, "_rtc_") || - LIT_STRNSTART(name, "rtclock_") || - LIT_STRNSTART(name, "clock_") || - LIT_STRNSTART(name, "absolutetime_to_") || - LIT_STRNEQL(name, "setPop") || - LIT_STRNEQL(name, "nanoseconds_to_absolutetime") || - LIT_STRNEQL(name, "nanotime_to_absolutetime") || - - LIT_STRNSTART(name, "etimer_") || - - LIT_STRNSTART(name, "commpage_") || - LIT_STRNSTART(name, "pmap_") || - LIT_STRNSTART(name, "ml_") || - LIT_STRNSTART(name, "PE_") || - LIT_STRNEQL(name, "kprintf") || - LIT_STRNSTART(name, "lapic_") || - LIT_STRNSTART(name, "acpi_")) - continue; +static void +__user_syms_provide_module(void *arg, struct modctl *ctl) +{ +#pragma unused(arg) + char *modname; + unsigned int i; + + modname = ctl->mod_modname; + + dtrace_module_symbols_t* module_symbols = ctl->mod_user_symbols; + if (module_symbols) { + for (i=0; i<module_symbols->dtmodsyms_count; i++) { + dtrace_symbol_t* symbol = &module_symbols->dtmodsyms_symbols[i]; + char* name = symbol->dtsym_name; + + /* Lop off omnipresent leading underscore. */ + if (*name == '_') + name += 1; - /* - * Avoid machine_ routines. PR_5346750. - */ - if (LIT_STRNSTART(name, "machine_")) - continue; + /* + * We're only blacklisting functions in the kernel for now.
+ */ + if (MOD_IS_MACH_KERNEL(ctl) && !is_symbol_valid(name)) + continue; - if (LIT_STRNEQL(name, "handle_pending_TLB_flushes")) - continue; + __provide_probe_32(ctl, (uintptr_t)symbol->dtsym_addr, (uintptr_t)(symbol->dtsym_addr + symbol->dtsym_size), modname, name, (machine_inst_t*)(uintptr_t)symbol->dtsym_addr); + } + } +} - /* - * Place no probes on critical routines. PR_5221096 - */ - if (!gIgnoreFBTBlacklist && - bsearch( name, critical_blacklist, CRITICAL_BLACKLIST_COUNT, sizeof(name), _cmp ) != NULL) - continue; - - /* - * Place no probes that could be hit in probe context. - */ - if (!gIgnoreFBTBlacklist && - bsearch( name, probe_ctx_closure, PROBE_CTX_CLOSURE_COUNT, sizeof(name), _cmp ) != NULL) - continue; - - /* - * Place no probes that could be hit on the way to the debugger. - */ - if (LIT_STRNSTART(name, "kdp_") || - LIT_STRNSTART(name, "kdb_") || - LIT_STRNSTART(name, "kdbg_") || - LIT_STRNSTART(name, "kdebug_") || - LIT_STRNEQL(name, "kernel_debug") || - LIT_STRNEQL(name, "Debugger") || - LIT_STRNEQL(name, "Call_DebuggerC") || - LIT_STRNEQL(name, "lock_debugger") || - LIT_STRNEQL(name, "unlock_debugger") || - LIT_STRNEQL(name, "SysChoked")) - continue; - - /* - * Place no probes that could be hit on the way to a panic. 
- */ - if (NULL != strstr(name, "panic_") || - LIT_STRNEQL(name, "panic") || - LIT_STRNEQL(name, "handleMck") || - LIT_STRNEQL(name, "unresolved_kernel_trap")) - continue; - - if (dtrace_probe_lookup(fbt_id, modname, name, NULL) != 0) - continue; - - for (j = 0, instr = (machine_inst_t *)sym[i].n_value, theInstr = 0; - (j < 4) && ((uintptr_t)instr >= instrLow) && (instrHigh > (uintptr_t)(instr + 2)); - j++) { - theInstr = instr[0]; - if (theInstr == FBT_PUSHL_EBP || theInstr == FBT_RET || theInstr == FBT_RET_IMM16) - break; - - if ((size = dtrace_instr_size(instr)) <= 0) - break; - - instr += size; - } - - if (theInstr != FBT_PUSHL_EBP) - continue; - - i1 = instr[1]; - i2 = instr[2]; - - limit = (machine_inst_t *)instrHigh; - - if ((i1 == FBT_MOVL_ESP_EBP0_V0 && i2 == FBT_MOVL_ESP_EBP1_V0) || - (i1 == FBT_MOVL_ESP_EBP0_V1 && i2 == FBT_MOVL_ESP_EBP1_V1)) { - instr += 1; /* Advance to the movl %esp,%ebp */ - theInstr = i1; - } else { - /* - * Sometimes, the compiler will schedule an intervening instruction - * in the function prologue. 
Example: - * - * _mach_vm_read: - * 000006d8 pushl %ebp - * 000006d9 movl $0x00000004,%edx - * 000006de movl %esp,%ebp - * - * Try the next instruction, to see if it is a movl %esp,%ebp - */ - - instr += 1; /* Advance past the pushl %ebp */ - if ((size = dtrace_instr_size(instr)) <= 0) - continue; - - instr += size; - - if ((instr + 1) >= limit) - continue; - - i1 = instr[0]; - i2 = instr[1]; - - if (!(i1 == FBT_MOVL_ESP_EBP0_V0 && i2 == FBT_MOVL_ESP_EBP1_V0) && - !(i1 == FBT_MOVL_ESP_EBP0_V1 && i2 == FBT_MOVL_ESP_EBP1_V1)) - continue; - - /* instr already points at the movl %esp,%ebp */ - theInstr = i1; - } - - fbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP); - strlcpy( (char *)&(fbt->fbtp_name), name, MAX_FBTP_NAME_CHARS ); - fbt->fbtp_id = dtrace_probe_create(fbt_id, modname, name, FBT_ENTRY, FBT_AFRAMES_ENTRY, fbt); - fbt->fbtp_patchpoint = instr; - fbt->fbtp_ctl = ctl; - fbt->fbtp_loadcnt = ctl->mod_loadcnt; - fbt->fbtp_rval = DTRACE_INVOP_MOVL_ESP_EBP; - fbt->fbtp_savedval = theInstr; - fbt->fbtp_patchval = FBT_PATCHVAL; - - fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)]; - fbt->fbtp_symndx = i; - fbt_probetab[FBT_ADDR2NDX(instr)] = fbt; - - retfbt = NULL; -again: - if (instr >= limit) - continue; - - /* - * If this disassembly fails, then we've likely walked off into - * a jump table or some other unsuitable area. Bail out of the - * disassembly now. - */ - if ((size = dtrace_instr_size(instr)) <= 0) - continue; - - /* - * We (desperately) want to avoid erroneously instrumenting a - * jump table, especially given that our markers are pretty - * short: two bytes on x86, and just one byte on amd64. To - * determine if we're looking at a true instruction sequence - * or an inline jump table that happens to contain the same - * byte sequences, we resort to some heuristic sleeze: we - * treat this instruction as being contained within a pointer, - * and see if that pointer points to within the body of the - * function. 
If it does, we refuse to instrument it. - */ - for (j = 0; j < sizeof (uintptr_t); j++) { - uintptr_t check = (uintptr_t)instr - j; - uint8_t *ptr; - - if (check < sym[i].n_value) - break; - - if (check + sizeof (uintptr_t) > (uintptr_t)limit) - continue; - - ptr = *(uint8_t **)check; - - if (ptr >= (uint8_t *)sym[i].n_value && ptr < limit) { - instr += size; - goto again; - } - } - - /* - * OK, it's an instruction. - */ - theInstr = instr[0]; - - /* Walked onto the start of the next routine? If so, bail out of this function. */ - if (theInstr == FBT_PUSHL_EBP) - continue; - - if (!(size == 1 && (theInstr == FBT_POPL_EBP || theInstr == FBT_LEAVE))) { - instr += size; - goto again; - } - - /* - * Found the popl %ebp; or leave. - */ - machine_inst_t *patch_instr = instr; - - /* - * Scan forward for a "ret", or "jmp". - */ - instr += size; - if (instr >= limit) - continue; - - size = dtrace_instr_size(instr); - if (size <= 0) /* Failed instruction decode? */ - continue; - - theInstr = instr[0]; - - if (!(size == FBT_RET_LEN && (theInstr == FBT_RET)) && - !(size == FBT_RET_IMM16_LEN && (theInstr == FBT_RET_IMM16)) && - !(size == FBT_JMP_SHORT_REL_LEN && (theInstr == FBT_JMP_SHORT_REL)) && - !(size == FBT_JMP_NEAR_REL_LEN && (theInstr == FBT_JMP_NEAR_REL)) && - !(size == FBT_JMP_FAR_ABS_LEN && (theInstr == FBT_JMP_FAR_ABS))) - continue; - - /* - * popl %ebp; ret; or leave; ret; or leave; jmp tailCalledFun; -- We have a winner! 
- */ - fbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP); - strlcpy( (char *)&(fbt->fbtp_name), name, MAX_FBTP_NAME_CHARS ); - - if (retfbt == NULL) { - fbt->fbtp_id = dtrace_probe_create(fbt_id, modname, - name, FBT_RETURN, FBT_AFRAMES_RETURN, fbt); - } else { - retfbt->fbtp_next = fbt; - fbt->fbtp_id = retfbt->fbtp_id; - } - - retfbt = fbt; - fbt->fbtp_patchpoint = patch_instr; - fbt->fbtp_ctl = ctl; - fbt->fbtp_loadcnt = ctl->mod_loadcnt; - - if (*patch_instr == FBT_POPL_EBP) { - fbt->fbtp_rval = DTRACE_INVOP_POPL_EBP; - } else { - ASSERT(*patch_instr == FBT_LEAVE); - fbt->fbtp_rval = DTRACE_INVOP_LEAVE; - } - fbt->fbtp_roffset = - (uintptr_t)(patch_instr - (uint8_t *)sym[i].n_value); - - fbt->fbtp_savedval = *patch_instr; - fbt->fbtp_patchval = FBT_PATCHVAL; - fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(patch_instr)]; - fbt->fbtp_symndx = i; - fbt_probetab[FBT_ADDR2NDX(patch_instr)] = fbt; - - instr += size; - goto again; - } -} #elif defined(__x86_64__) int fbt_invop(uintptr_t addr, uintptr_t *state, uintptr_t rval) @@ -870,7 +1071,7 @@ kern_return_t fbt_perfCallback( int trapno, x86_saved_state_t *tagged_regs, - __unused int unused1, + uintptr_t *lo_spp, __unused int unused2) { kern_return_t retval = KERN_FAILURE; @@ -878,18 +1079,36 @@ fbt_perfCallback( if (FBT_EXCEPTION_CODE == trapno && !IS_USER_TRAP(saved_state)) { boolean_t oldlevel; - uint64_t rsp_probe, *rbp, r12, fp, delta = 0; + uint64_t rsp_probe, fp, delta = 0; + uintptr_t old_sp; uint32_t *pDst; int emul; + oldlevel = ml_set_interrupts_enabled(FALSE); /* Calculate where the stack pointer was when the probe instruction "fired." 
*/ rsp_probe = saved_state->isf.rsp; /* Easy, x86_64 establishes this value in idt64.s */ + __asm__ volatile( + "Ldtrace_invop_callsite_pre_label:\n" + ".data\n" + ".private_extern _dtrace_invop_callsite_pre\n" + "_dtrace_invop_callsite_pre:\n" + " .quad Ldtrace_invop_callsite_pre_label\n" + ".text\n" + ); + emul = dtrace_invop( saved_state->isf.rip, (uintptr_t *)saved_state, saved_state->rax ); - __asm__ volatile(".globl _dtrace_invop_callsite"); - __asm__ volatile("_dtrace_invop_callsite:"); + + __asm__ volatile( + "Ldtrace_invop_callsite_post_label:\n" + ".data\n" + ".private_extern _dtrace_invop_callsite_post\n" + "_dtrace_invop_callsite_post:\n" + " .quad Ldtrace_invop_callsite_post_label\n" + ".text\n" + ); switch (emul) { case DTRACE_INVOP_NOP: @@ -929,25 +1148,18 @@ fbt_perfCallback( */ delta += 2; saved_state->isf.rsp += (delta << 2); - -/* XXX Fragile in the extreme. - * This is sensitive to trap_from_kernel()'s internals. - */ - rbp = (uint64_t *)__builtin_frame_address(0); - rbp = (uint64_t *)*rbp; - r12 = *(rbp - 4); - +/* Obtain the stack pointer recorded by the trampolines */ + old_sp = *lo_spp; /* Shift contents of stack */ for (pDst = (uint32_t *)fp; - pDst > (((uint32_t *)r12)); + pDst > (((uint32_t *)old_sp)); pDst--) *pDst = pDst[-delta]; /* Track the stack lift in "saved_state". 
*/ saved_state = (x86_saved_state64_t *) (((uintptr_t)saved_state) + (delta << 2)); - -/* Now adjust the value of %r12 in our caller (kernel_trap)'s frame */ - *(rbp - 4) = r12 + (delta << 2); +/* Adjust the stack pointer utilized by the trampolines */ + *lo_spp = old_sp + (delta << 2); retval = KERN_SUCCESS; break; @@ -966,47 +1178,301 @@ fbt_perfCallback( /*ARGSUSED*/ static void -__fbt_provide_module(void *arg, struct modctl *ctl) +__provide_probe_64(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, char *modname, char* symbolName, machine_inst_t* symbolStart) { -#pragma unused(arg) - kernel_mach_header_t *mh; - struct load_command *cmd; - kernel_segment_command_t *orig_ts = NULL, *orig_le = NULL; - struct symtab_command *orig_st = NULL; - struct nlist_64 *sym = NULL; - char *strings; - uintptr_t instrLow, instrHigh; - char *modname; - unsigned int i, j; - - int gIgnoreFBTBlacklist = 0; - PE_parse_boot_argn("IgnoreFBTBlacklist", &gIgnoreFBTBlacklist, sizeof (gIgnoreFBTBlacklist)); - - mh = (kernel_mach_header_t *)(ctl->address); - modname = ctl->mod_modname; + unsigned int j; + unsigned int doenable = 0; + dtrace_id_t thisid; - if (0 == ctl->address || 0 == ctl->size) /* Has the linker been jettisoned? 
*/ + fbt_probe_t *newfbt, *retfbt, *entryfbt; + machine_inst_t *instr, *limit, theInstr, i1, i2, i3; + int size; + + for (j = 0, instr = symbolStart, theInstr = 0; + (j < 4) && ((uintptr_t)instr >= instrLow) && (instrHigh > (uintptr_t)(instr + 2)); + j++) { + theInstr = instr[0]; + if (theInstr == FBT_PUSH_RBP || theInstr == FBT_RET || theInstr == FBT_RET_IMM16) + break; + + if ((size = dtrace_instr_size(instr)) <= 0) + break; + + instr += size; + } + + if (theInstr != FBT_PUSH_RBP) return; - + + i1 = instr[1]; + i2 = instr[2]; + i3 = instr[3]; + + limit = (machine_inst_t *)instrHigh; + + if (i1 == FBT_REX_RSP_RBP && i2 == FBT_MOV_RSP_RBP0 && i3 == FBT_MOV_RSP_RBP1) { + instr += 1; /* Advance to the mov %rsp,%rbp */ + theInstr = i1; + } else { + return; + } +#if 0 + else { + /* + * Sometimes, the compiler will schedule an intervening instruction + * in the function prologue. Example: + * + * _mach_vm_read: + * 000006d8 pushl %ebp + * 000006d9 movl $0x00000004,%edx + * 000006de movl %esp,%ebp + * + * Try the next instruction, to see if it is a movl %esp,%ebp + */ + + instr += 1; /* Advance past the pushl %ebp */ + if ((size = dtrace_instr_size(instr)) <= 0) + return; + + instr += size; + + if ((instr + 1) >= limit) + return; + + i1 = instr[0]; + i2 = instr[1]; + + if (!(i1 == FBT_MOVL_ESP_EBP0_V0 && i2 == FBT_MOVL_ESP_EBP1_V0) && + !(i1 == FBT_MOVL_ESP_EBP0_V1 && i2 == FBT_MOVL_ESP_EBP1_V1)) + return; + + /* instr already points at the movl %esp,%ebp */ + theInstr = i1; + } +#endif + thisid = dtrace_probe_lookup(fbt_id, modname, symbolName, FBT_ENTRY); + newfbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP); + strlcpy( (char *)&(newfbt->fbtp_name), symbolName, MAX_FBTP_NAME_CHARS ); + + if (thisid != 0) { + /* + * The dtrace_probe previously existed, so we have to hook + * the newfbt entry onto the end of the existing fbt's chain. 
+ * If we find an fbt entry that was previously patched to + * fire, (as indicated by the current patched value), then + * we want to enable this newfbt on the spot. + */ + entryfbt = dtrace_probe_arg (fbt_id, thisid); + ASSERT (entryfbt != NULL); + for(; entryfbt != NULL; entryfbt = entryfbt->fbtp_next) { + if (entryfbt->fbtp_currentval == entryfbt->fbtp_patchval) + doenable++; + + if (entryfbt->fbtp_next == NULL) { + entryfbt->fbtp_next = newfbt; + newfbt->fbtp_id = entryfbt->fbtp_id; + break; + } + } + } + else { + /* + * The dtrace_probe did not previously exist, so we + * create it and hook in the newfbt. Since the probe is + * new, we obviously do not need to enable it on the spot. + */ + newfbt->fbtp_id = dtrace_probe_create(fbt_id, modname, symbolName, FBT_ENTRY, FBT_AFRAMES_ENTRY, newfbt); + doenable = 0; + } + + newfbt->fbtp_patchpoint = instr; + newfbt->fbtp_ctl = ctl; + newfbt->fbtp_loadcnt = ctl->mod_loadcnt; + newfbt->fbtp_rval = DTRACE_INVOP_MOV_RSP_RBP; + newfbt->fbtp_savedval = theInstr; + newfbt->fbtp_patchval = FBT_PATCHVAL; + newfbt->fbtp_currentval = 0; + newfbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)]; + fbt_probetab[FBT_ADDR2NDX(instr)] = newfbt; + + if (doenable) + fbt_enable(NULL, newfbt->fbtp_id, newfbt); + /* - * Employees of dtrace and their families are ineligible. Void - * where prohibited. + * The fbt entry chain is in place, one entry point per symbol. + * The fbt return chain can have multiple return points per symbol. + * Here we find the end of the fbt return chain. */ - - if (LIT_STRNEQL(modname, "com.apple.driver.dtrace")) + + doenable=0; + + thisid = dtrace_probe_lookup(fbt_id, modname, symbolName, FBT_RETURN); + if (thisid != 0) { + /* The dtrace_probe previously existed, so we have to + * find the end of the existing fbt chain. If we find + * an fbt return that was previously patched to fire, + * (as indicated by the current patched value), then + * we want to enable any new fbts on the spot. 
+ */ + retfbt = dtrace_probe_arg (fbt_id, thisid); + ASSERT(retfbt != NULL); + for (; retfbt != NULL; retfbt = retfbt->fbtp_next) { + if (retfbt->fbtp_currentval == retfbt->fbtp_patchval) + doenable++; + if(retfbt->fbtp_next == NULL) + break; + } + } + else { + doenable = 0; + retfbt = NULL; + } + +again: + if (instr >= limit) return; - - if (strstr(modname, "CHUD") != NULL) + + /* + * If this disassembly fails, then we've likely walked off into + * a jump table or some other unsuitable area. Bail out of the + * disassembly now. + */ + if ((size = dtrace_instr_size(instr)) <= 0) + return; + + /* + * We (desperately) want to avoid erroneously instrumenting a + * jump table, especially given that our markers are pretty + * short: two bytes on x86, and just one byte on amd64. To + * determine if we're looking at a true instruction sequence + * or an inline jump table that happens to contain the same + * byte sequences, we resort to some heuristic sleeze: we + * treat this instruction as being contained within a pointer, + * and see if that pointer points to within the body of the + * function. If it does, we refuse to instrument it. + */ + for (j = 0; j < sizeof (uintptr_t); j++) { + uintptr_t check = (uintptr_t)instr - j; + uint8_t *ptr; + + if (check < (uintptr_t)symbolStart) + break; + + if (check + sizeof (uintptr_t) > (uintptr_t)limit) + continue; + + ptr = *(uint8_t **)check; + + if (ptr >= (uint8_t *)symbolStart && ptr < limit) { + instr += size; + goto again; + } + } + + /* + * OK, it's an instruction. + */ + theInstr = instr[0]; + + /* Walked onto the start of the next routine? If so, bail out of this function. */ + if (theInstr == FBT_PUSH_RBP) return; + + if (!(size == 1 && (theInstr == FBT_POP_RBP || theInstr == FBT_LEAVE))) { + instr += size; + goto again; + } + + /* + * Found the pop %rbp; or leave. + */ + machine_inst_t *patch_instr = instr; + + /* + * Scan forward for a "ret", or "jmp". 
+ */ + instr += size; + if (instr >= limit) + return; + + size = dtrace_instr_size(instr); + if (size <= 0) /* Failed instruction decode? */ + return; + + theInstr = instr[0]; + + if (!(size == FBT_RET_LEN && (theInstr == FBT_RET)) && + !(size == FBT_RET_IMM16_LEN && (theInstr == FBT_RET_IMM16)) && + !(size == FBT_JMP_SHORT_REL_LEN && (theInstr == FBT_JMP_SHORT_REL)) && + !(size == FBT_JMP_NEAR_REL_LEN && (theInstr == FBT_JMP_NEAR_REL)) && + !(size == FBT_JMP_FAR_ABS_LEN && (theInstr == FBT_JMP_FAR_ABS))) + return; + + /* + * pop %rbp; ret; or leave; ret; or leave; jmp tailCalledFun; -- We have a winner! + */ + newfbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP); + strlcpy( (char *)&(newfbt->fbtp_name), symbolName, MAX_FBTP_NAME_CHARS ); + + if (retfbt == NULL) { + newfbt->fbtp_id = dtrace_probe_create(fbt_id, modname, + symbolName, FBT_RETURN, FBT_AFRAMES_RETURN, newfbt); + } else { + retfbt->fbtp_next = newfbt; + newfbt->fbtp_id = retfbt->fbtp_id; + } + + retfbt = newfbt; + newfbt->fbtp_patchpoint = patch_instr; + newfbt->fbtp_ctl = ctl; + newfbt->fbtp_loadcnt = ctl->mod_loadcnt; + + if (*patch_instr == FBT_POP_RBP) { + newfbt->fbtp_rval = DTRACE_INVOP_POP_RBP; + } else { + ASSERT(*patch_instr == FBT_LEAVE); + newfbt->fbtp_rval = DTRACE_INVOP_LEAVE; + } + newfbt->fbtp_roffset = + (uintptr_t)(patch_instr - (uint8_t *)symbolStart); + + newfbt->fbtp_savedval = *patch_instr; + newfbt->fbtp_patchval = FBT_PATCHVAL; + newfbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(patch_instr)]; + fbt_probetab[FBT_ADDR2NDX(patch_instr)] = newfbt; + + if (doenable) + fbt_enable(NULL, newfbt->fbtp_id, newfbt); + + instr += size; + goto again; +} +static void +__kernel_syms_provide_module(void *arg, struct modctl *ctl) +{ +#pragma unused(arg) + kernel_mach_header_t *mh; + struct load_command *cmd; + kernel_segment_command_t *orig_ts = NULL, *orig_le = NULL; + struct symtab_command *orig_st = NULL; + struct nlist_64 *sym = NULL; + char *strings; + uintptr_t instrLow, instrHigh; + char 
*modname; + unsigned int i; + + mh = (kernel_mach_header_t *)(ctl->mod_address); + modname = ctl->mod_modname; + if (mh->magic != MH_MAGIC_64) return; - + cmd = (struct load_command *) &mh[1]; for (i = 0; i < mh->ncmds; i++) { if (cmd->cmd == LC_SEGMENT_KERNEL) { kernel_segment_command_t *orig_sg = (kernel_segment_command_t *) cmd; - + if (LIT_STRNEQL(orig_sg->segname, SEG_TEXT)) orig_ts = orig_sg; else if (LIT_STRNEQL(orig_sg->segname, SEG_LINKEDIT)) @@ -1016,402 +1482,105 @@ __fbt_provide_module(void *arg, struct modctl *ctl) } else if (cmd->cmd == LC_SYMTAB) orig_st = (struct symtab_command *) cmd; - + cmd = (struct load_command *) ((caddr_t) cmd + cmd->cmdsize); } - + if ((orig_ts == NULL) || (orig_st == NULL) || (orig_le == NULL)) return; - + sym = (struct nlist_64 *)(orig_le->vmaddr + orig_st->symoff - orig_le->fileoff); strings = (char *)(orig_le->vmaddr + orig_st->stroff - orig_le->fileoff); - + /* Find extent of the TEXT section */ instrLow = (uintptr_t)orig_ts->vmaddr; instrHigh = (uintptr_t)(orig_ts->vmaddr + orig_ts->vmsize); - + for (i = 0; i < orig_st->nsyms; i++) { - fbt_probe_t *fbt, *retfbt; - machine_inst_t *instr, *limit, theInstr, i1, i2, i3; uint8_t n_type = sym[i].n_type & (N_TYPE | N_EXT); char *name = strings + sym[i].n_un.n_strx; - int size; - + /* Check that the symbol is a global and that it has a name. */ if (((N_SECT | N_EXT) != n_type && (N_ABS | N_EXT) != n_type)) continue; - + if (0 == sym[i].n_un.n_strx) /* iff a null, "", name. */ continue; /* Lop off omnipresent leading underscore. */ if (*name == '_') name += 1; - - if (LIT_STRNSTART(name, "dtrace_") && !LIT_STRNSTART(name, "dtrace_safe_")) { - /* - * Anything beginning with "dtrace_" may be called - * from probe context unless it explitly indicates - * that it won't be called from probe context by - * using the prefix "dtrace_safe_". 
- */ - continue; - } - - if (LIT_STRNSTART(name, "fasttrap_") || - LIT_STRNSTART(name, "fuword") || - LIT_STRNSTART(name, "suword") || - LIT_STRNEQL(name, "sprlock") || - LIT_STRNEQL(name, "sprunlock") || - LIT_STRNEQL(name, "uread") || - LIT_STRNEQL(name, "uwrite")) - continue; /* Fasttrap inner-workings. */ - - if (LIT_STRNSTART(name, "dsmos_")) - continue; /* Don't Steal Mac OS X! */ - - if (LIT_STRNSTART(name, "_dtrace")) - continue; /* Shims in dtrace.c */ - - if (LIT_STRNSTART(name, "chud")) - continue; /* Professional courtesy. */ - - if (LIT_STRNSTART(name, "hibernate_")) - continue; /* Let sleeping dogs lie. */ - if (LIT_STRNEQL(name, "ZN9IOService14newTemperatureElPS_") || /* IOService::newTemperature */ - LIT_STRNEQL(name, "ZN9IOService26temperatureCriticalForZoneEPS_")) /* IOService::temperatureCriticalForZone */ - continue; /* Per the fire code */ - - /* - * Place no probes (illegal instructions) in the exception handling path! - */ - if (LIT_STRNEQL(name, "t_invop") || - LIT_STRNEQL(name, "enter_lohandler") || - LIT_STRNEQL(name, "lo_alltraps") || - LIT_STRNEQL(name, "kernel_trap") || - LIT_STRNEQL(name, "interrupt") || - LIT_STRNEQL(name, "i386_astintr")) - continue; - - if (LIT_STRNEQL(name, "current_thread") || - LIT_STRNEQL(name, "ast_pending") || - LIT_STRNEQL(name, "fbt_perfCallback") || - LIT_STRNEQL(name, "machine_thread_get_kern_state") || - LIT_STRNEQL(name, "get_threadtask") || - LIT_STRNEQL(name, "ml_set_interrupts_enabled") || - LIT_STRNEQL(name, "dtrace_invop") || - LIT_STRNEQL(name, "fbt_invop") || - LIT_STRNEQL(name, "sdt_invop") || - LIT_STRNEQL(name, "max_valid_stack_address")) - continue; - - /* - * Voodoo. 
- */ - if (LIT_STRNSTART(name, "machine_stack_") || - LIT_STRNSTART(name, "mapping_") || - LIT_STRNEQL(name, "tmrCvt") || - - LIT_STRNSTART(name, "tsc_") || - - LIT_STRNSTART(name, "pmCPU") || - LIT_STRNEQL(name, "pmKextRegister") || - LIT_STRNEQL(name, "pmMarkAllCPUsOff") || - LIT_STRNEQL(name, "pmSafeMode") || - LIT_STRNEQL(name, "pmTimerSave") || - LIT_STRNEQL(name, "pmTimerRestore") || - LIT_STRNEQL(name, "pmUnRegister") || - LIT_STRNSTART(name, "pms") || - LIT_STRNEQL(name, "power_management_init") || - LIT_STRNSTART(name, "usimple_") || - LIT_STRNSTART(name, "lck_spin_lock") || - LIT_STRNSTART(name, "lck_spin_unlock") || - - LIT_STRNSTART(name, "rtc_") || - LIT_STRNSTART(name, "_rtc_") || - LIT_STRNSTART(name, "rtclock_") || - LIT_STRNSTART(name, "clock_") || - LIT_STRNSTART(name, "absolutetime_to_") || - LIT_STRNEQL(name, "setPop") || - LIT_STRNEQL(name, "nanoseconds_to_absolutetime") || - LIT_STRNEQL(name, "nanotime_to_absolutetime") || - - LIT_STRNSTART(name, "etimer_") || - - LIT_STRNSTART(name, "commpage_") || - LIT_STRNSTART(name, "pmap_") || - LIT_STRNSTART(name, "ml_") || - LIT_STRNSTART(name, "PE_") || - LIT_STRNEQL(name, "kprintf") || - LIT_STRNSTART(name, "lapic_") || - LIT_STRNSTART(name, "acpi_")) - continue; - - /* - * Avoid machine_ routines. PR_5346750. - */ - if (LIT_STRNSTART(name, "machine_")) - continue; - - if (LIT_STRNEQL(name, "handle_pending_TLB_flushes")) - continue; - - /* - * Place no probes on critical routines. PR_5221096 - */ - if (!gIgnoreFBTBlacklist && - bsearch( name, critical_blacklist, CRITICAL_BLACKLIST_COUNT, sizeof(name), _cmp ) != NULL) - continue; - - /* - * Place no probes that could be hit in probe context. - */ - if (!gIgnoreFBTBlacklist && - bsearch( name, probe_ctx_closure, PROBE_CTX_CLOSURE_COUNT, sizeof(name), _cmp ) != NULL) - continue; - - /* - * Place no probes that could be hit on the way to the debugger. 
- */ - if (LIT_STRNSTART(name, "kdp_") || - LIT_STRNSTART(name, "kdb_") || - LIT_STRNSTART(name, "kdbg_") || - LIT_STRNSTART(name, "kdebug_") || - LIT_STRNEQL(name, "kernel_debug") || - LIT_STRNEQL(name, "Debugger") || - LIT_STRNEQL(name, "Call_DebuggerC") || - LIT_STRNEQL(name, "lock_debugger") || - LIT_STRNEQL(name, "unlock_debugger") || - LIT_STRNEQL(name, "SysChoked")) - continue; - /* - * Place no probes that could be hit on the way to a panic. + * We're only blacklisting functions in the kernel for now. */ - if (NULL != strstr(name, "panic_") || - LIT_STRNEQL(name, "panic") || - LIT_STRNEQL(name, "handleMck") || - LIT_STRNEQL(name, "unresolved_kernel_trap")) + if (MOD_IS_MACH_KERNEL(ctl) && !is_symbol_valid(name)) continue; - if (dtrace_probe_lookup(fbt_id, modname, name, NULL) != 0) - continue; - - for (j = 0, instr = (machine_inst_t *)sym[i].n_value, theInstr = 0; - (j < 4) && ((uintptr_t)instr >= instrLow) && (instrHigh > (uintptr_t)(instr + 2)); - j++) { - theInstr = instr[0]; - if (theInstr == FBT_PUSH_RBP || theInstr == FBT_RET || theInstr == FBT_RET_IMM16) - break; - - if ((size = dtrace_instr_size(instr)) <= 0) - break; - - instr += size; - } - - if (theInstr != FBT_PUSH_RBP) - continue; - - i1 = instr[1]; - i2 = instr[2]; - i3 = instr[3]; - - limit = (machine_inst_t *)instrHigh; + __provide_probe_64(ctl, instrLow, instrHigh, modname, name, (machine_inst_t*)sym[i].n_value); + } +} - if (i1 == FBT_REX_RSP_RBP && i2 == FBT_MOV_RSP_RBP0 && i3 == FBT_MOV_RSP_RBP1) { - instr += 1; /* Advance to the mov %rsp,%rbp */ - theInstr = i1; - } else { - continue; - } -#if 0 - else { +static void +__user_syms_provide_module(void *arg, struct modctl *ctl) +{ +#pragma unused(arg) + char *modname; + unsigned int i; + + modname = ctl->mod_modname; + + dtrace_module_symbols_t* module_symbols = ctl->mod_user_symbols; + if (module_symbols) { + for (i=0; i<module_symbols->dtmodsyms_count; i++) { + dtrace_symbol_t* symbol = &module_symbols->dtmodsyms_symbols[i]; + char* name = 
symbol->dtsym_name; + + /* Lop off omnipresent leading underscore. */ + if (*name == '_') + name += 1; + /* - * Sometimes, the compiler will schedule an intervening instruction - * in the function prologue. Example: - * - * _mach_vm_read: - * 000006d8 pushl %ebp - * 000006d9 movl $0x00000004,%edx - * 000006de movl %esp,%ebp - * - * Try the next instruction, to see if it is a movl %esp,%ebp + * We're only blacklisting functions in the kernel for now. */ - - instr += 1; /* Advance past the pushl %ebp */ - if ((size = dtrace_instr_size(instr)) <= 0) - continue; - - instr += size; - - if ((instr + 1) >= limit) - continue; - - i1 = instr[0]; - i2 = instr[1]; - - if (!(i1 == FBT_MOVL_ESP_EBP0_V0 && i2 == FBT_MOVL_ESP_EBP1_V0) && - !(i1 == FBT_MOVL_ESP_EBP0_V1 && i2 == FBT_MOVL_ESP_EBP1_V1)) + if (MOD_IS_MACH_KERNEL(ctl) && !is_symbol_valid(name)) continue; - - /* instr already points at the movl %esp,%ebp */ - theInstr = i1; - } -#endif - - fbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP); - strlcpy( (char *)&(fbt->fbtp_name), name, MAX_FBTP_NAME_CHARS ); - fbt->fbtp_id = dtrace_probe_create(fbt_id, modname, name, FBT_ENTRY, FBT_AFRAMES_ENTRY, fbt); - fbt->fbtp_patchpoint = instr; - fbt->fbtp_ctl = ctl; - fbt->fbtp_loadcnt = ctl->mod_loadcnt; - fbt->fbtp_rval = DTRACE_INVOP_MOV_RSP_RBP; - fbt->fbtp_savedval = theInstr; - fbt->fbtp_patchval = FBT_PATCHVAL; - - fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)]; - fbt->fbtp_symndx = i; - fbt_probetab[FBT_ADDR2NDX(instr)] = fbt; - - retfbt = NULL; -again: - if (instr >= limit) - continue; - - /* - * If this disassembly fails, then we've likely walked off into - * a jump table or some other unsuitable area. Bail out of the - * disassembly now. - */ - if ((size = dtrace_instr_size(instr)) <= 0) - continue; - - /* - * We (desperately) want to avoid erroneously instrumenting a - * jump table, especially given that our markers are pretty - * short: two bytes on x86, and just one byte on amd64. 
To - * determine if we're looking at a true instruction sequence - * or an inline jump table that happens to contain the same - * byte sequences, we resort to some heuristic sleeze: we - * treat this instruction as being contained within a pointer, - * and see if that pointer points to within the body of the - * function. If it does, we refuse to instrument it. - */ - for (j = 0; j < sizeof (uintptr_t); j++) { - uintptr_t check = (uintptr_t)instr - j; - uint8_t *ptr; - - if (check < sym[i].n_value) - break; - - if (check + sizeof (uintptr_t) > (uintptr_t)limit) - continue; - - ptr = *(uint8_t **)check; - - if (ptr >= (uint8_t *)sym[i].n_value && ptr < limit) { - instr += size; - goto again; - } - } - - /* - * OK, it's an instruction. - */ - theInstr = instr[0]; - - /* Walked onto the start of the next routine? If so, bail out of this function. */ - if (theInstr == FBT_PUSH_RBP) - continue; - - if (!(size == 1 && (theInstr == FBT_POP_RBP || theInstr == FBT_LEAVE))) { - instr += size; - goto again; - } - - /* - * Found the pop %rbp; or leave. - */ - machine_inst_t *patch_instr = instr; - - /* - * Scan forward for a "ret", or "jmp". - */ - instr += size; - if (instr >= limit) - continue; - - size = dtrace_instr_size(instr); - if (size <= 0) /* Failed instruction decode? */ - continue; - - theInstr = instr[0]; - - if (!(size == FBT_RET_LEN && (theInstr == FBT_RET)) && - !(size == FBT_RET_IMM16_LEN && (theInstr == FBT_RET_IMM16)) && - !(size == FBT_JMP_SHORT_REL_LEN && (theInstr == FBT_JMP_SHORT_REL)) && - !(size == FBT_JMP_NEAR_REL_LEN && (theInstr == FBT_JMP_NEAR_REL)) && - !(size == FBT_JMP_FAR_ABS_LEN && (theInstr == FBT_JMP_FAR_ABS))) - continue; - - /* - * pop %rbp; ret; or leave; ret; or leave; jmp tailCalledFun; -- We have a winner! 
- */ - fbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP); - strlcpy( (char *)&(fbt->fbtp_name), name, MAX_FBTP_NAME_CHARS ); - - if (retfbt == NULL) { - fbt->fbtp_id = dtrace_probe_create(fbt_id, modname, - name, FBT_RETURN, FBT_AFRAMES_RETURN, fbt); - } else { - retfbt->fbtp_next = fbt; - fbt->fbtp_id = retfbt->fbtp_id; - } - - retfbt = fbt; - fbt->fbtp_patchpoint = patch_instr; - fbt->fbtp_ctl = ctl; - fbt->fbtp_loadcnt = ctl->mod_loadcnt; - - if (*patch_instr == FBT_POP_RBP) { - fbt->fbtp_rval = DTRACE_INVOP_POP_RBP; - } else { - ASSERT(*patch_instr == FBT_LEAVE); - fbt->fbtp_rval = DTRACE_INVOP_LEAVE; + + __provide_probe_64(ctl, (uintptr_t)symbol->dtsym_addr, (uintptr_t)(symbol->dtsym_addr + symbol->dtsym_size), modname, name, (machine_inst_t*)(uintptr_t)symbol->dtsym_addr); } - fbt->fbtp_roffset = - (uintptr_t)(patch_instr - (uint8_t *)sym[i].n_value); - - fbt->fbtp_savedval = *patch_instr; - fbt->fbtp_patchval = FBT_PATCHVAL; - fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(patch_instr)]; - fbt->fbtp_symndx = i; - fbt_probetab[FBT_ADDR2NDX(patch_instr)] = fbt; - - instr += size; - goto again; } } #else #error Unknown arch #endif -extern struct modctl g_fbt_kernctl; -#undef kmem_alloc /* from its binding to dt_kmem_alloc glue */ -#undef kmem_free /* from its binding to dt_kmem_free glue */ -#include +extern int dtrace_kernel_symbol_mode; /*ARGSUSED*/ void fbt_provide_module(void *arg, struct modctl *ctl) { -#pragma unused(ctl) - __fbt_provide_module(arg, &g_fbt_kernctl); + ASSERT(ctl != NULL); + ASSERT(dtrace_kernel_symbol_mode != DTRACE_KERNEL_SYMBOLS_NEVER); + lck_mtx_assert(&mod_lock, LCK_MTX_ASSERT_OWNED); - if ( (vm_offset_t)g_fbt_kernctl.address != (vm_offset_t )NULL ) - kmem_free(kernel_map, (vm_offset_t)g_fbt_kernctl.address, round_page(g_fbt_kernctl.size)); - g_fbt_kernctl.address = 0; - g_fbt_kernctl.size = 0; + if (MOD_FBT_DONE(ctl)) + return; + + if (!is_module_valid(ctl)) { + ctl->mod_flags |= MODCTL_FBT_INVALID; + return; + } + + if 
(MOD_HAS_KERNEL_SYMBOLS(ctl)) { + __kernel_syms_provide_module(arg, ctl); + ctl->mod_flags |= MODCTL_FBT_PROBES_PROVIDED; + return; + } + + if (MOD_HAS_USERSPACE_SYMBOLS(ctl)) { + __user_syms_provide_module(arg, ctl); + ctl->mod_flags |= MODCTL_FBT_PROBES_PROVIDED; + return; + } } diff --git a/bsd/dev/i386/mem.c b/bsd/dev/i386/mem.c index e598cdf67..4b4589295 100644 --- a/bsd/dev/i386/mem.c +++ b/bsd/dev/i386/mem.c @@ -88,7 +88,9 @@ extern addr64_t kvtophys(vm_offset_t va); extern boolean_t kernacc(off_t, size_t ); +#if !defined(SECURE_KERNEL) extern int setup_kmem; +#endif static caddr_t devzerobuf; @@ -117,8 +119,11 @@ mmioctl(dev_t dev, u_long cmd, __unused caddr_t data, { int minnum = minor(dev); - if ((setup_kmem == 0) && ((minnum == 0) || (minnum == 1))) - return(EINVAL); + if ((minnum == 0) || (minnum == 1)) +#if !defined(SECURE_KERNEL) + if (setup_kmem == 0) + return(EINVAL); +#endif switch (cmd) { case FIONBIO: @@ -149,8 +154,12 @@ mmrw(dev_t dev, struct uio *uio, enum uio_rw rw) /* minor device 0 is physical memory */ case 0: +#if defined(SECURE_KERNEL) + return(ENODEV); +#else if (setup_kmem == 0) return(ENODEV); +#endif v = trunc_page(uio->uio_offset); if (uio->uio_offset >= (off_t)mem_size) @@ -169,8 +178,12 @@ mmrw(dev_t dev, struct uio *uio, enum uio_rw rw) /* minor device 1 is kernel memory */ case 1: +#if defined(SECURE_KERNEL) + return(ENODEV); +#else if (setup_kmem == 0) return(ENODEV); +#endif /* Do some sanity checking */ if (((vm_address_t)uio->uio_offset >= VM_MAX_KERNEL_ADDRESS) || ((vm_address_t)uio->uio_offset <= VM_MIN_KERNEL_AND_KEXT_ADDRESS)) diff --git a/bsd/dev/i386/munge.s b/bsd/dev/i386/munge.s index d174c06e3..9df397097 100644 --- a/bsd/dev/i386/munge.s +++ b/bsd/dev/i386/munge.s @@ -140,16 +140,92 @@ Entry(munge_wl) /* Costs an extra w move to do this */ ENTRY(munge_wlw) movl 8(%esp),%ecx // get &uu_args xorl %edx,%edx - movl 12(%ecx),%eax +Lwlw: + movl 12(%ecx),%eax //l movl %eax,16(%ecx) movl %edx,20(%ecx) - movl 8(%ecx),%eax 
+Lwl: + movl 8(%ecx),%eax //l movl %eax,12(%ecx) movl 4(%ecx),%eax movl %eax,8(%ecx) - movl %edx,4(%ecx) + movl %edx,4(%ecx) //w ret +ENTRY(munge_wlwwwll) + movl 8(%esp),%ecx // get &uu_args + xorl %edx,%edx +Lwlwwwll: + movl 36(%ecx),%eax + movl %eax,52(%ecx) + movl 32(%ecx),%eax + movl %eax,48(%ecx) + movl 28(%ecx),%eax + movl %eax,44(%ecx) + movl 24(%ecx),%eax + movl %eax,40(%ecx) + movl 20(%ecx),%eax + movl %eax,32(%ecx) + movl %edx,36(%ecx) +Lwlww: + movl 16(%ecx),%eax + movl %eax,24(%ecx) + movl %edx,28(%ecx) + jmp Lwlw + +ENTRY(munge_wlwwwllw) + movl 8(%esp),%ecx // get &uu_args + xorl %edx,%edx + movl 40(%ecx),%eax + movl %eax,56(%ecx) + movl %edx,60(%ecx) + jmp Lwlwwwll + +ENTRY(munge_wlwwlwlw) + movl 8(%esp),%ecx // get &uu_args + xorl %edx,%edx + movl 40(%ecx),%eax + movl %eax,56(%ecx) + movl %edx,60(%ecx) + movl 36(%ecx),%eax + movl %eax,52(%ecx) + movl 32(%ecx),%eax + movl %eax,48(%ecx) + movl 28(%ecx),%eax + movl %eax,40(%ecx) + movl %edx,44(%ecx) + movl 24(%ecx),%eax + movl %eax,36(%ecx) + movl 20(%ecx),%eax + movl %eax,32(%ecx) + jmp Lwlww + +ENTRY(munge_wllwwll) + movl 8(%esp),%ecx // get &uu_args + xorl %edx,%edx + + movl 40(%ecx),%eax // l + movl %eax,52(%ecx) + movl 36(%ecx),%eax + movl %eax,48(%ecx) + movl 32(%ecx),%eax // l + movl %eax,44(%ecx) + movl 28(%ecx),%eax + movl %eax,40(%ecx) + + movl 24(%ecx),%eax //w + movl %eax,32(%ecx) + movl %edx,36(%ecx) + movl 20(%ecx),%eax //w + movl %eax,24(%ecx) + movl %edx,28(%ecx) + + movl 16(%ecx),%eax //l + movl %eax,20(%ecx) + movl 12(%ecx),%eax + movl %eax,16(%ecx) + jmp Lwl + Entry(munge_wwwlw) movl 8(%esp),%ecx // get &uu_args xorl %edx,%edx @@ -195,6 +271,63 @@ ENTRY(munge_wwwwwl) movl %eax,44(%ecx) jmp Lw5 +ENTRY(munge_wwwwwlww) + movl 8(%esp),%ecx // get &uu_args + xorl %edx,%edx + movl 32(%ecx),%eax + movl %eax,56(%ecx) + movl %edx,60(%ecx) + movl 28(%ecx),%eax + movl %eax,48(%ecx) + movl %edx,52(%ecx) + movl 20(%ecx),%eax + movl %eax,40(%ecx) + movl 24(%ecx),%eax + movl %eax,44(%ecx) + jmp Lw5 
+ +ENTRY(munge_wwwwwllw) + movl 8(%esp),%ecx // get &uu_args + xorl %edx,%edx + movl 36(%ecx),%eax + movl %eax,56(%ecx) + movl %edx,60(%ecx) + movl 28(%ecx),%eax + movl %eax,48(%ecx) + movl 32(%ecx),%eax + movl %eax,52(%ecx) + movl 20(%ecx),%eax + movl %eax,40(%ecx) + movl 24(%ecx),%eax + movl %eax,44(%ecx) + jmp Lw5 + +ENTRY(munge_wwwwwlll) + movl 8(%esp),%ecx // get &uu_args + xorl %edx,%edx + movl 36(%ecx),%eax + movl %eax,56(%ecx) + movl 40(%ecx),%eax + movl %eax,60(%ecx) + movl 28(%ecx),%eax + movl %eax,48(%ecx) + movl 32(%ecx),%eax + movl %eax,52(%ecx) + movl 20(%ecx),%eax + movl %eax,40(%ecx) + movl 24(%ecx),%eax + movl %eax,44(%ecx) + jmp Lw5 + +ENTRY(munge_wwwwwwl) + movl 8(%esp),%ecx // get &uu_args + xorl %edx,%edx + movl 24(%ecx),%eax + movl %eax,48(%ecx) + movl 28(%ecx),%eax + movl %eax,52(%ecx) + jmp Lw6 + ENTRY(munge_wwwwwwlw) movl 8(%esp),%ecx // get &uu_args xorl %edx,%edx diff --git a/bsd/dev/i386/sdt_x86.c b/bsd/dev/i386/sdt_x86.c index c354b303e..680ed779b 100644 --- a/bsd/dev/i386/sdt_x86.c +++ b/bsd/dev/i386/sdt_x86.c @@ -107,3 +107,115 @@ sdt_invop(uintptr_t addr, uintptr_t *stack, uintptr_t eax) #endif +struct frame { + struct frame *backchain; + uintptr_t retaddr; +}; + +/*ARGSUSED*/ +uint64_t +sdt_getarg(void *arg, dtrace_id_t id, void *parg, int argno, int aframes) +{ +#pragma unused(arg, id, parg) + uint64_t val; + struct frame *fp = (struct frame *)__builtin_frame_address(0); + uintptr_t *stack; + uintptr_t pc; + int i; + +#if defined(__x86_64__) + /* + * A total of 6 arguments are passed via registers; any argument with + * index of 5 or lower is therefore in a register. 
+ */ + int inreg = 5; +#endif + + for (i = 1; i <= aframes; i++) { + fp = fp->backchain; + pc = fp->retaddr; + + if (dtrace_invop_callsite_pre != NULL + && pc > (uintptr_t)dtrace_invop_callsite_pre + && pc <= (uintptr_t)dtrace_invop_callsite_post) { +#if defined(__i386__) + /* + * If we pass through the invalid op handler, we will + * use the pointer that it passed to the stack as the + * second argument to dtrace_invop() as the pointer to + * the frame we're hunting for. + */ + + stack = (uintptr_t *)&fp[1]; /* Find marshalled arguments */ + fp = (struct frame *)stack[1]; /* Grab *second* argument */ + stack = (uintptr_t *)&fp[0]; /* Find marshalled arguments */ +#elif defined(__x86_64__) + /* + * In the case of x86_64, we will use the pointer to the + * save area structure that was pushed when we took the + * trap. To get this structure, we must increment + * beyond the frame structure. If the + * argument that we're seeking is passed on the stack, + * we'll pull the true stack pointer out of the saved + * registers and decrement our argument by the number + * of arguments passed in registers; if the argument + * we're seeking is passed in registers, we can just + * load it directly. + */ + + /* fp points to frame of dtrace_invop() activation. */ + fp = fp->backchain; /* to fbt_perfCallback() activation. */ + fp = fp->backchain; /* to kernel_trap() activation. */ + fp = fp->backchain; /* to trap_from_kernel() activation. 
*/ + + x86_saved_state_t *tagged_regs = (x86_saved_state_t *)&fp[1]; + x86_saved_state64_t *saved_state = saved_state64(tagged_regs); + + if (argno <= inreg) { + stack = (uintptr_t *)&saved_state->rdi; + } else { + fp = (struct frame *)(saved_state->isf.rsp); + stack = (uintptr_t *)&fp[0]; /* Find marshalled + arguments */ + argno -= (inreg +1); + } +#else +#error Unknown arch +#endif + goto load; + } + } + + /* + * We know that we did not come through a trap to get into + * dtrace_probe() -- We arrive here when the provider has + * called dtrace_probe() directly. + * The probe ID is the first argument to dtrace_probe(). + * We must advance beyond that to get the argX. + */ + argno++; /* Advance past probeID */ + +#if defined(__x86_64__) + if (argno <= inreg) { + /* + * This shouldn't happen. If the argument is passed in a + * register then it should have been, well, passed in a + * register... + */ + DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); + return (0); + } + + argno -= (inreg + 1); +#endif + stack = (uintptr_t *)&fp[1]; /* Find marshalled arguments */ + +load: + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + /* dtrace_probe arguments arg0 ... 
arg4 are 64bits wide */ + val = (uint64_t)(*(((uintptr_t *)stack) + argno)); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + + return (val); +} + diff --git a/bsd/dev/i386/sysctl.c b/bsd/dev/i386/sysctl.c index c255529a1..ba3bfc1ee 100644 --- a/bsd/dev/i386/sysctl.c +++ b/bsd/dev/i386/sysctl.c @@ -33,6 +33,9 @@ #include #include #include +#include +#include +#include static int _i386_cpu_info SYSCTL_HANDLER_ARGS @@ -201,6 +204,42 @@ cpu_flex_ratio_max SYSCTL_HANDLER_ARGS return SYSCTL_OUT(req, &flex_ratio_max, sizeof(flex_ratio_max)); } +static int +cpu_ucode_update SYSCTL_HANDLER_ARGS +{ + __unused struct sysctl_oid *unused_oidp = oidp; + __unused void *unused_arg1 = arg1; + __unused int unused_arg2 = arg2; + uint64_t addr; + int error; + + error = SYSCTL_IN(req, &addr, sizeof(addr)); + if (error) + return error; + + int ret = ucode_interface(addr); + return ret; +} + +extern uint64_t panic_restart_timeout; +static int +panic_set_restart_timeout(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + int new_value = 0, old_value = 0, changed = 0, error; + uint64_t nstime; + + if (panic_restart_timeout) { + absolutetime_to_nanoseconds(panic_restart_timeout, &nstime); + old_value = nstime / NSEC_PER_SEC; + } + + error = sysctl_io_number(req, old_value, sizeof(int), &new_value, &changed); + if (error == 0 && changed) { + nanoseconds_to_absolutetime(((uint64_t)new_value) * NSEC_PER_SEC, &panic_restart_timeout); + } + return error; +} + /* * Populates the {CPU, vector, latency} triple for the maximum observed primary * interrupt latency @@ -226,107 +265,113 @@ misc_interrupt_latency_max(__unused struct sysctl_oid *oidp, __unused void *arg1 SYSCTL_NODE(_machdep, OID_AUTO, cpu, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "CPU info"); -SYSCTL_PROC(_machdep_cpu, OID_AUTO, max_basic, CTLTYPE_INT | CTLFLAG_RD, +SYSCTL_PROC(_machdep_cpu, OID_AUTO, max_basic, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, 
cpuid_max_basic),sizeof(uint32_t), i386_cpu_info, "IU", "Max Basic Information value"); -SYSCTL_PROC(_machdep_cpu, OID_AUTO, max_ext, CTLTYPE_INT | CTLFLAG_RD, +SYSCTL_PROC(_machdep_cpu, OID_AUTO, max_ext, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_max_ext), sizeof(uint32_t), i386_cpu_info, "IU", "Max Extended Function Information value"); -SYSCTL_PROC(_machdep_cpu, OID_AUTO, vendor, CTLTYPE_STRING | CTLFLAG_RD, +SYSCTL_PROC(_machdep_cpu, OID_AUTO, vendor, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_vendor), 0, i386_cpu_info, "A", "CPU vendor"); -SYSCTL_PROC(_machdep_cpu, OID_AUTO, brand_string, CTLTYPE_STRING | CTLFLAG_RD, +SYSCTL_PROC(_machdep_cpu, OID_AUTO, brand_string, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_brand_string), 0, i386_cpu_info, "A", "CPU brand string"); -SYSCTL_PROC(_machdep_cpu, OID_AUTO, family, CTLTYPE_INT | CTLFLAG_RD, +SYSCTL_PROC(_machdep_cpu, OID_AUTO, family, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_family), sizeof(uint8_t), i386_cpu_info, "I", "CPU family"); -SYSCTL_PROC(_machdep_cpu, OID_AUTO, model, CTLTYPE_INT | CTLFLAG_RD, +SYSCTL_PROC(_machdep_cpu, OID_AUTO, model, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_model), sizeof(uint8_t), i386_cpu_info, "I", "CPU model"); -SYSCTL_PROC(_machdep_cpu, OID_AUTO, extmodel, CTLTYPE_INT | CTLFLAG_RD, +SYSCTL_PROC(_machdep_cpu, OID_AUTO, extmodel, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_extmodel), sizeof(uint8_t), i386_cpu_info, "I", "CPU extended model"); -SYSCTL_PROC(_machdep_cpu, OID_AUTO, extfamily, CTLTYPE_INT | CTLFLAG_RD, +SYSCTL_PROC(_machdep_cpu, OID_AUTO, extfamily, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_extfamily), sizeof(uint8_t), i386_cpu_info, "I", "CPU extended family"); 
-SYSCTL_PROC(_machdep_cpu, OID_AUTO, stepping, CTLTYPE_INT | CTLFLAG_RD, +SYSCTL_PROC(_machdep_cpu, OID_AUTO, stepping, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_stepping), sizeof(uint8_t), i386_cpu_info, "I", "CPU stepping"); -SYSCTL_PROC(_machdep_cpu, OID_AUTO, feature_bits, CTLTYPE_QUAD | CTLFLAG_RD, +SYSCTL_PROC(_machdep_cpu, OID_AUTO, feature_bits, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_features), sizeof(uint64_t), i386_cpu_info, "IU", "CPU features"); -SYSCTL_PROC(_machdep_cpu, OID_AUTO, extfeature_bits, CTLTYPE_QUAD | CTLFLAG_RD, +SYSCTL_PROC(_machdep_cpu, OID_AUTO, extfeature_bits, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_extfeatures), sizeof(uint64_t), i386_cpu_info, "IU", "CPU extended features"); -SYSCTL_PROC(_machdep_cpu, OID_AUTO, signature, CTLTYPE_INT | CTLFLAG_RD, +SYSCTL_PROC(_machdep_cpu, OID_AUTO, signature, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_signature), sizeof(uint32_t), i386_cpu_info, "I", "CPU signature"); -SYSCTL_PROC(_machdep_cpu, OID_AUTO, brand, CTLTYPE_INT | CTLFLAG_RD, +SYSCTL_PROC(_machdep_cpu, OID_AUTO, brand, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_brand), sizeof(uint8_t), i386_cpu_info, "I", "CPU brand"); -SYSCTL_PROC(_machdep_cpu, OID_AUTO, features, CTLTYPE_STRING | CTLFLAG_RD, +SYSCTL_PROC(_machdep_cpu, OID_AUTO, features, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, cpu_features, "A", "CPU feature names"); -SYSCTL_PROC(_machdep_cpu, OID_AUTO, extfeatures, CTLTYPE_STRING | CTLFLAG_RD, +SYSCTL_PROC(_machdep_cpu, OID_AUTO, extfeatures, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, cpu_extfeatures, "A", "CPU extended feature names"); SYSCTL_PROC(_machdep_cpu, OID_AUTO, logical_per_package, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, cpu_logical_per_package, "I", "CPU 
logical cpus per package"); SYSCTL_PROC(_machdep_cpu, OID_AUTO, cores_per_package, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_cores_per_package), sizeof(uint32_t), i386_cpu_info, "I", "CPU cores per package"); SYSCTL_PROC(_machdep_cpu, OID_AUTO, microcode_version, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_microcode_version), sizeof(uint32_t), i386_cpu_info, "I", "Microcode version number"); +SYSCTL_PROC(_machdep_cpu, OID_AUTO, processor_flag, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, + (void *)offsetof(i386_cpu_info_t, cpuid_processor_flag), + sizeof(uint32_t), + i386_cpu_info, "I", "CPU processor flag"); + SYSCTL_NODE(_machdep_cpu, OID_AUTO, mwait, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "mwait"); SYSCTL_PROC(_machdep_cpu_mwait, OID_AUTO, linesize_min, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_mwait_leaf_t, linesize_min), sizeof(uint32_t), cpu_mwait, "I", "Monitor/mwait minimum line size"); SYSCTL_PROC(_machdep_cpu_mwait, OID_AUTO, linesize_max, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_mwait_leaf_t, linesize_max), sizeof(uint32_t), cpu_mwait, "I", "Monitor/mwait maximum line size"); SYSCTL_PROC(_machdep_cpu_mwait, OID_AUTO, extensions, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_mwait_leaf_t, extensions), sizeof(uint32_t), cpu_mwait, "I", "Monitor/mwait extensions"); SYSCTL_PROC(_machdep_cpu_mwait, OID_AUTO, sub_Cstates, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_mwait_leaf_t, sub_Cstates), sizeof(uint32_t), cpu_mwait, "I", "Monitor/mwait sub C-states"); @@ -336,31 +381,31 @@ SYSCTL_NODE(_machdep_cpu, OID_AUTO, thermal, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "thermal"); SYSCTL_PROC(_machdep_cpu_thermal, OID_AUTO, sensor, - 
CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_thermal_leaf_t, sensor), sizeof(boolean_t), cpu_thermal, "I", "Thermal sensor present"); SYSCTL_PROC(_machdep_cpu_thermal, OID_AUTO, dynamic_acceleration, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_thermal_leaf_t, dynamic_acceleration), sizeof(boolean_t), cpu_thermal, "I", "Dynamic Acceleration Technology (Turbo Mode)"); SYSCTL_PROC(_machdep_cpu_thermal, OID_AUTO, invariant_APIC_timer, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_thermal_leaf_t, invariant_APIC_timer), sizeof(boolean_t), cpu_thermal, "I", "Invariant APIC Timer"); SYSCTL_PROC(_machdep_cpu_thermal, OID_AUTO, thresholds, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_thermal_leaf_t, thresholds), sizeof(uint32_t), cpu_thermal, "I", "Number of interrupt thresholds"); SYSCTL_PROC(_machdep_cpu_thermal, OID_AUTO, ACNT_MCNT, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_thermal_leaf_t, ACNT_MCNT), sizeof(boolean_t), cpu_thermal, "I", "ACNT_MCNT capability"); @@ -410,43 +455,43 @@ SYSCTL_NODE(_machdep_cpu, OID_AUTO, arch_perf, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "arch_perf"); SYSCTL_PROC(_machdep_cpu_arch_perf, OID_AUTO, version, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_arch_perf_leaf_t, version), sizeof(uint8_t), cpu_arch_perf, "I", "Architectural Performance Version Number"); SYSCTL_PROC(_machdep_cpu_arch_perf, OID_AUTO, number, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_arch_perf_leaf_t, number), sizeof(uint8_t), cpu_arch_perf, "I", "Number of counters per logical cpu"); SYSCTL_PROC(_machdep_cpu_arch_perf, OID_AUTO, width, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void 
*)offsetof(cpuid_arch_perf_leaf_t, width), sizeof(uint8_t), cpu_arch_perf, "I", "Bit width of counters"); SYSCTL_PROC(_machdep_cpu_arch_perf, OID_AUTO, events_number, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_arch_perf_leaf_t, events_number), sizeof(uint8_t), cpu_arch_perf, "I", "Number of monitoring events"); SYSCTL_PROC(_machdep_cpu_arch_perf, OID_AUTO, events, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_arch_perf_leaf_t, events), sizeof(uint32_t), cpu_arch_perf, "I", "Bit vector of events"); SYSCTL_PROC(_machdep_cpu_arch_perf, OID_AUTO, fixed_number, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_arch_perf_leaf_t, fixed_number), sizeof(uint8_t), cpu_arch_perf, "I", "Number of fixed-function counters"); SYSCTL_PROC(_machdep_cpu_arch_perf, OID_AUTO, fixed_width, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_arch_perf_leaf_t, fixed_width), sizeof(uint8_t), cpu_arch_perf, "I", "Bit-width of fixed-function counters"); @@ -456,19 +501,19 @@ SYSCTL_NODE(_machdep_cpu, OID_AUTO, cache, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "cache"); SYSCTL_PROC(_machdep_cpu_cache, OID_AUTO, linesize, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_cache_linesize), sizeof(uint32_t), i386_cpu_info, "I", "Cacheline size"); SYSCTL_PROC(_machdep_cpu_cache, OID_AUTO, L2_associativity, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_cache_L2_associativity), sizeof(uint32_t), i386_cpu_info, "I", "L2 cache associativity"); SYSCTL_PROC(_machdep_cpu_cache, OID_AUTO, size, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_cache_size), sizeof(uint32_t), i386_cpu_info, "I", "Cache size (in Kbytes)"); @@ -482,7 +527,7 
@@ SYSCTL_NODE(_machdep_cpu_tlb, OID_AUTO, data, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "data"); SYSCTL_PROC(_machdep_cpu_tlb_inst, OID_AUTO, small, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_tlb[TLB_INST][TLB_SMALL][0]), sizeof(uint32_t), @@ -490,7 +535,7 @@ SYSCTL_PROC(_machdep_cpu_tlb_inst, OID_AUTO, small, "Number of small page instruction TLBs"); SYSCTL_PROC(_machdep_cpu_tlb_data, OID_AUTO, small, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_tlb[TLB_DATA][TLB_SMALL][0]), sizeof(uint32_t), @@ -498,7 +543,7 @@ SYSCTL_PROC(_machdep_cpu_tlb_data, OID_AUTO, small, "Number of small page data TLBs (1st level)"); SYSCTL_PROC(_machdep_cpu_tlb_data, OID_AUTO, small_level1, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_tlb[TLB_DATA][TLB_SMALL][1]), sizeof(uint32_t), @@ -506,7 +551,7 @@ SYSCTL_PROC(_machdep_cpu_tlb_data, OID_AUTO, small_level1, "Number of small page data TLBs (2nd level)"); SYSCTL_PROC(_machdep_cpu_tlb_inst, OID_AUTO, large, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_tlb[TLB_INST][TLB_LARGE][0]), sizeof(uint32_t), @@ -514,7 +559,7 @@ SYSCTL_PROC(_machdep_cpu_tlb_inst, OID_AUTO, large, "Number of large page instruction TLBs"); SYSCTL_PROC(_machdep_cpu_tlb_data, OID_AUTO, large, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_tlb[TLB_DATA][TLB_LARGE][0]), sizeof(uint32_t), @@ -522,7 +567,7 @@ SYSCTL_PROC(_machdep_cpu_tlb_data, OID_AUTO, large, "Number of large page data TLBs (1st level)"); SYSCTL_PROC(_machdep_cpu_tlb_data, OID_AUTO, large_level1, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_tlb[TLB_DATA][TLB_LARGE][1]), sizeof(uint32_t), @@ -530,7 +575,7 @@ 
SYSCTL_PROC(_machdep_cpu_tlb_data, OID_AUTO, large_level1, "Number of large page data TLBs (2nd level)"); SYSCTL_PROC(_machdep_cpu_tlb, OID_AUTO, shared, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_stlb), sizeof(uint32_t), i386_cpu_info_nonzero, "I", @@ -541,26 +586,26 @@ SYSCTL_NODE(_machdep_cpu, OID_AUTO, address_bits, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "address_bits"); SYSCTL_PROC(_machdep_cpu_address_bits, OID_AUTO, physical, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_address_bits_physical), sizeof(uint32_t), i386_cpu_info, "I", "Number of physical address bits"); SYSCTL_PROC(_machdep_cpu_address_bits, OID_AUTO, virtual, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_address_bits_virtual), sizeof(uint32_t), i386_cpu_info, "I", "Number of virtual address bits"); SYSCTL_PROC(_machdep_cpu, OID_AUTO, core_count, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, core_count), sizeof(uint32_t), i386_cpu_info, "I", "Number of enabled cores per package"); SYSCTL_PROC(_machdep_cpu, OID_AUTO, thread_count, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, thread_count), sizeof(uint32_t), i386_cpu_info, "I", "Number of enabled threads per package"); @@ -569,34 +614,40 @@ SYSCTL_NODE(_machdep_cpu, OID_AUTO, flex_ratio, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Flex ratio"); SYSCTL_PROC(_machdep_cpu_flex_ratio, OID_AUTO, desired, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, cpu_flex_ratio_desired, "I", "Flex ratio desired (0 disabled)"); SYSCTL_PROC(_machdep_cpu_flex_ratio, OID_AUTO, min, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, cpu_flex_ratio_min, "I", "Flex ratio min (efficiency)"); 
SYSCTL_PROC(_machdep_cpu_flex_ratio, OID_AUTO, max, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, cpu_flex_ratio_max, "I", "Flex ratio max (non-turbo)"); +SYSCTL_PROC(_machdep_cpu, OID_AUTO, ucupdate, + CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED, 0, 0, + cpu_ucode_update, "S", "Microcode update interface"); + uint64_t pmap_pv_hashlist_walks; uint64_t pmap_pv_hashlist_cnts; uint32_t pmap_pv_hashlist_max; uint32_t pmap_kernel_text_ps = PAGE_SIZE; +extern uint32_t pv_hashed_kern_low_water_mark; /*extern struct sysctl_oid_list sysctl__machdep_pmap_children;*/ SYSCTL_NODE(_machdep, OID_AUTO, pmap, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "PMAP info"); -SYSCTL_QUAD (_machdep_pmap, OID_AUTO, hashwalks, CTLFLAG_RD | CTLFLAG_KERN, &pmap_pv_hashlist_walks, ""); -SYSCTL_QUAD (_machdep_pmap, OID_AUTO, hashcnts, CTLFLAG_RD | CTLFLAG_KERN, &pmap_pv_hashlist_cnts, ""); -SYSCTL_INT (_machdep_pmap, OID_AUTO, hashmax, CTLFLAG_RD | CTLFLAG_KERN, &pmap_pv_hashlist_max, 0, ""); -SYSCTL_INT (_machdep_pmap, OID_AUTO, kernel_text_ps, CTLFLAG_RD | CTLFLAG_KERN, &pmap_kernel_text_ps, 0, ""); +SYSCTL_QUAD (_machdep_pmap, OID_AUTO, hashwalks, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &pmap_pv_hashlist_walks, ""); +SYSCTL_QUAD (_machdep_pmap, OID_AUTO, hashcnts, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &pmap_pv_hashlist_cnts, ""); +SYSCTL_INT (_machdep_pmap, OID_AUTO, hashmax, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &pmap_pv_hashlist_max, 0, ""); +SYSCTL_INT (_machdep_pmap, OID_AUTO, kernel_text_ps, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &pmap_kernel_text_ps, 0, ""); +SYSCTL_INT (_machdep_pmap, OID_AUTO, kern_pv_reserve, CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, &pv_hashed_kern_low_water_mark, 0, ""); SYSCTL_NODE(_machdep, OID_AUTO, memmap, CTLFLAG_RD|CTLFLAG_LOCKED, NULL, "physical memory map"); @@ -621,9 +672,15 @@ SYSCTL_QUAD(_machdep_memmap, OID_AUTO, Other, CTLFLAG_RD|CTLFLAG_LOCKED, &firmwa SYSCTL_NODE(_machdep, OID_AUTO, tsc, 
CTLFLAG_RD|CTLFLAG_LOCKED, NULL, "Timestamp counter parameters"); SYSCTL_QUAD(_machdep_tsc, OID_AUTO, frequency, CTLFLAG_RD|CTLFLAG_LOCKED, &tscFreq, ""); + SYSCTL_NODE(_machdep, OID_AUTO, misc, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Miscellaneous x86 kernel parameters"); +SYSCTL_PROC(_machdep_misc, OID_AUTO, panic_restart_timeout, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, 0, + panic_set_restart_timeout, "I", "Panic restart timeout in seconds"); + SYSCTL_PROC(_machdep_misc, OID_AUTO, interrupt_latency_max, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, misc_interrupt_latency_max, "A", "Maximum Interrupt latency"); diff --git a/bsd/dev/i386/systemcalls.c b/bsd/dev/i386/systemcalls.c index 660d0d1aa..7a849ca31 100644 --- a/bsd/dev/i386/systemcalls.c +++ b/bsd/dev/i386/systemcalls.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -54,6 +55,8 @@ #include #include +#include + #if CONFIG_DTRACE extern int32_t dtrace_systrace_syscall(struct proc *, void *, int *); extern void dtrace_systrace_syscall_return(unsigned short, int, int *); @@ -69,6 +72,15 @@ extern boolean_t x86_sysenter_arg_store_isvalid(thread_t thread); /* dynamically generated at build time based on syscalls.master */ extern const char *syscallnames[]; +/* + * This needs to be a single switch so that it's "all on" or "all off", + * rather than being turned on for some code paths and not others, as this + * has a tendency to introduce "blame the next guy" bugs. 
+ */ +#if DEBUG +#define FUNNEL_DEBUG 1 /* Check for funnel held on exit */ +#endif + /* * Function: unix_syscall * @@ -90,6 +102,7 @@ unix_syscall(x86_saved_state_t *state) struct uthread *uthread; x86_saved_state32_t *regs; boolean_t args_in_uthread; + boolean_t is_vfork; assert(is_saved_state32(state)); regs = saved_state32(state); @@ -100,15 +113,15 @@ unix_syscall(x86_saved_state_t *state) thread = current_thread(); uthread = get_bsdthread_info(thread); - /* Get the approriate proc; may be different from task's for vfork() */ - if (!(uthread->uu_flag & UT_VFORK)) - p = (struct proc *)get_bsdtask_info(current_task()); - else + is_vfork = uthread->uu_flag & UT_VFORK; + if (__improbable(is_vfork != 0)) p = current_proc(); + else + p = (struct proc *)get_bsdtask_info(current_task()); /* Verify that we are not being called from a task without a proc */ - if (p == NULL) { + if (__improbable(p == NULL)) { regs->eax = EPERM; regs->efl |= EFL_CF; task_terminate_internal(current_task()); @@ -126,7 +139,7 @@ unix_syscall(x86_saved_state_t *state) callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; - if (callp == sysent) { + if (__improbable(callp == sysent)) { code = fuword(params); params += sizeof(int); callp = (code >= NUM_SYSENT) ? 
&sysent[63] : &sysent[code]; @@ -151,7 +164,7 @@ unix_syscall(x86_saved_state_t *state) } } - if (code != 180) { + if (__probable(code != 180)) { int *ip = (int *)vt; KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, @@ -191,9 +204,6 @@ unix_syscall(x86_saved_state_t *state) AUDIT_SYSCALL_ENTER(code, p, uthread); error = (*(callp->sy_call))((void *) p, (void *) vt, &(uthread->uu_rval[0])); AUDIT_SYSCALL_EXIT(code, p, uthread, error); -#if CONFIG_MACF - mac_thread_userret(code, error, thread); -#endif #ifdef JOE_DEBUG if (uthread->uu_iocount) @@ -203,7 +213,7 @@ unix_syscall(x86_saved_state_t *state) uthread->t_dtrace_errno = error; #endif /* CONFIG_DTRACE */ - if (error == ERESTART) { + if (__improbable(error == ERESTART)) { /* * Move the user's pc back to repeat the syscall: * 5 bytes for a sysenter, or 2 for an int 8x. @@ -211,14 +221,10 @@ unix_syscall(x86_saved_state_t *state) * - see debug trap handler in idt.s/idt64.s */ - if (regs->cs == SYSENTER_CS || regs->cs == SYSENTER_TF_CS) { - regs->eip -= 5; - } - else - regs->eip -= 2; + pal_syscall_restart(thread, state); } else if (error != EJUSTRETURN) { - if (error) { + if (__improbable(error)) { regs->eax = error; regs->efl |= EFL_CF; /* carry bit */ } else { /* (not error) */ @@ -232,13 +238,14 @@ unix_syscall(x86_saved_state_t *state) error, regs->eax, regs->edx); uthread->uu_flag &= ~UT_NOTCANCELPT; -#if DEBUG +#if FUNNEL_DEBUG /* * if we're holding the funnel panic */ syscall_exit_funnelcheck(); -#endif /* DEBUG */ - if (uthread->uu_lowpri_window) { +#endif /* FUNNEL_DEBUG */ + + if (__improbable(uthread->uu_lowpri_window)) { /* * task is marked as a low priority I/O type * and the I/O we issued while in this system call @@ -248,10 +255,13 @@ unix_syscall(x86_saved_state_t *state) */ throttle_lowpri_io(TRUE); } - if (code != 180) + if (__probable(code != 180)) KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, error, uthread->uu_rval[0], uthread->uu_rval[1], 
p->p_pid, 0); + if (__improbable(!is_vfork && callp->sy_call == (sy_call_t *)execve && !error)) { + pal_execve_return(thread); + } thread_exception_return(); /* NOTREACHED */ @@ -273,21 +283,21 @@ unix_syscall64(x86_saved_state_t *state) assert(is_saved_state64(state)); regs = saved_state64(state); - +#if DEBUG if (regs->rax == 0x2000800) thread_exception_return(); - +#endif thread = current_thread(); uthread = get_bsdthread_info(thread); /* Get the approriate proc; may be different from task's for vfork() */ - if (!(uthread->uu_flag & UT_VFORK)) + if (__probable(!(uthread->uu_flag & UT_VFORK))) p = (struct proc *)get_bsdtask_info(current_task()); else p = current_proc(); /* Verify that we are not being called from a task without a proc */ - if (p == NULL) { + if (__improbable(p == NULL)) { regs->rax = EPERM; regs->isf.rflags |= EFL_CF; task_terminate_internal(current_task()); @@ -303,7 +313,7 @@ unix_syscall64(x86_saved_state_t *state) callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; uargp = (void *)(®s->rdi); - if (callp == sysent) { + if (__improbable(callp == sysent)) { /* * indirect system call... 
system call number * passed as 'arg0' @@ -323,7 +333,7 @@ unix_syscall64(x86_saved_state_t *state) } assert(callp->sy_narg <= 8); - if (callp->sy_narg > args_in_regs) { + if (__improbable(callp->sy_narg > args_in_regs)) { int copyin_count; copyin_count = (callp->sy_narg - args_in_regs) * sizeof(uint64_t); @@ -339,7 +349,7 @@ unix_syscall64(x86_saved_state_t *state) /* * XXX Turn 64 bit unsafe calls into nosys() */ - if (callp->sy_flags & UNSAFE_64BIT) { + if (__improbable(callp->sy_flags & UNSAFE_64BIT)) { callp = &sysent[63]; goto unsafe; } @@ -360,25 +370,34 @@ unix_syscall64(x86_saved_state_t *state) uthread->uu_flag |= UT_NOTCANCELPT; +#ifdef JOE_DEBUG + uthread->uu_iocount = 0; + uthread->uu_vpindex = 0; +#endif AUDIT_SYSCALL_ENTER(code, p, uthread); error = (*(callp->sy_call))((void *) p, uargp, &(uthread->uu_rval[0])); AUDIT_SYSCALL_EXIT(code, p, uthread, error); +#ifdef JOE_DEBUG + if (uthread->uu_iocount) + printf("system call returned with uu_iocount != 0\n"); +#endif + #if CONFIG_DTRACE uthread->t_dtrace_errno = error; #endif /* CONFIG_DTRACE */ - if (error == ERESTART) { + if (__improbable(error == ERESTART)) { /* * all system calls come through via the syscall instruction * in 64 bit mode... 
its 2 bytes in length * move the user's pc back to repeat the syscall: */ - regs->isf.rip -= 2; + pal_syscall_restart( thread, state ); } else if (error != EJUSTRETURN) { - if (error) { + if (__improbable(error)) { regs->rax = error; regs->isf.rflags |= EFL_CF; /* carry bit */ } else { /* (not error) */ @@ -416,12 +435,14 @@ unix_syscall64(x86_saved_state_t *state) uthread->uu_flag &= ~UT_NOTCANCELPT; +#if FUNNEL_DEBUG /* * if we're holding the funnel panic */ syscall_exit_funnelcheck(); +#endif /* FUNNEL_DEBUG */ - if (uthread->uu_lowpri_window) { + if (__improbable(uthread->uu_lowpri_window)) { /* * task is marked as a low priority I/O type * and the I/O we issued while in this system call @@ -431,7 +452,7 @@ unix_syscall64(x86_saved_state_t *state) */ throttle_lowpri_io(TRUE); } - if (code != 180) + if (__probable(code != 180)) KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0); @@ -453,6 +474,7 @@ unix_syscall_return(int error) thread = current_thread(); uthread = get_bsdthread_info(thread); + pal_register_cache_state(thread, DIRTY); p = current_proc(); @@ -480,11 +502,9 @@ unix_syscall_return(int error) if (error == ERESTART) { /* - * all system calls come through via the syscall instruction - * in 64 bit mode... its 2 bytes in length - * move the user's pc back to repeat the syscall: + * repeat the syscall */ - regs->isf.rip -= 2; + pal_syscall_restart( thread, find_user_regs(thread) ); } else if (error != EJUSTRETURN) { if (error) { @@ -542,7 +562,7 @@ unix_syscall_return(int error) code = fuword(params); } if (error == ERESTART) { - regs->eip -= ((regs->cs & 0xffff) == SYSENTER_CS) ? 
5 : 2; + pal_syscall_restart( thread, find_user_regs(thread) ); } else if (error != EJUSTRETURN) { if (error) { @@ -561,10 +581,12 @@ unix_syscall_return(int error) uthread->uu_flag &= ~UT_NOTCANCELPT; +#if FUNNEL_DEBUG /* * if we're holding the funnel panic */ syscall_exit_funnelcheck(); +#endif /* FUNNEL_DEBUG */ if (uthread->uu_lowpri_window) { /* diff --git a/bsd/dev/i386/unix_signal.c b/bsd/dev/i386/unix_signal.c index 06ed4172c..4292d6515 100644 --- a/bsd/dev/i386/unix_signal.c +++ b/bsd/dev/i386/unix_signal.c @@ -54,12 +54,12 @@ #include #include -#include +#include +#include #include - /* Forward: */ extern boolean_t machine_exception(int, mach_exception_code_t, mach_exception_subcode_t, int *, mach_exception_subcode_t *); @@ -610,6 +610,8 @@ sendsig(struct proc *p, user_addr_t ua_catcher, int sig, int mask, __unused uint goto bad; ml_fp_setvalid(FALSE); + /* Tell the PAL layer about the signal */ + pal_set_signal_delivery( thread ); proc_lock(p); diff --git a/bsd/dev/memdev.c b/bsd/dev/memdev.c index fe07f5e53..c425c7e08 100644 --- a/bsd/dev/memdev.c +++ b/bsd/dev/memdev.c @@ -175,7 +175,7 @@ int mdevBMajor = -1; int mdevCMajor = -1; static int mdevioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p, int is_char); -dev_t mdevadd(int devid, ppnum_t base, unsigned int size, int phys); +dev_t mdevadd(int devid, uint64_t base, unsigned int size, int phys); dev_t mdevlookup(int devid); void mdevremoveall(void); @@ -543,7 +543,7 @@ char *cvtnum(char *pos, char *end, unsigned int *num) { /* Convert to a number #endif /* CONFIG_MEMDEV_INSECURE */ -dev_t mdevadd(int devid, ppnum_t base, unsigned int size, int phys) { +dev_t mdevadd(int devid, uint64_t base, unsigned int size, int phys) { int i; @@ -556,7 +556,7 @@ dev_t mdevadd(int devid, ppnum_t base, unsigned int size, int phys) { continue; /* Skip check */ } if(!(((base + size -1 ) < mdev[i].mdBase) || ((mdev[i].mdBase + mdev[i].mdSize - 1) < base))) { /* Is there any overlap? 
*/ - panic("mdevadd: attempt to add overlapping memory device at %08lX-%08lX\n", (long) mdev[i].mdBase, (long) mdev[i].mdBase + mdev[i].mdSize - 1); + panic("mdevadd: attempt to add overlapping memory device at %016llX-%016llX\n", mdev[i].mdBase, mdev[i].mdBase + mdev[i].mdSize - 1); } } if(devid < 0) { /* Do we have free slots? */ @@ -567,7 +567,7 @@ dev_t mdevadd(int devid, ppnum_t base, unsigned int size, int phys) { if(devid >= 16) { /* Giving us something bogus? */ panic("mdevadd: attempt to explicitly add a bogus memory device: %08X\n", devid); } - if(mdev[devid].mdFlags &mdInited) { /* Already there? */ + if(mdev[devid].mdFlags & mdInited) { /* Already there? */ panic("mdevadd: attempt to explicitly add a previously defined memory device: %08X\n", devid); } } @@ -611,8 +611,8 @@ dev_t mdevadd(int devid, ppnum_t base, unsigned int size, int phys) { mdev[devid].mdSecsize = DEV_BSIZE; /* Set starting block size */ if(phys) mdev[devid].mdFlags |= mdPhys; /* Show that we are in physical memory */ mdev[devid].mdFlags |= mdInited; /* Show we are all set up */ - printf("Added memory device md%x/rmd%x (%08X/%08X) at %08X for %08X\n", - devid, devid, mdev[devid].mdBDev, mdev[devid].mdCDev, base << 12, size << 12); + printf("Added memory device md%x/rmd%x (%08X/%08X) at %016llX for %016llX\n", + devid, devid, mdev[devid].mdBDev, mdev[devid].mdCDev, base << 12, (uint64_t)size << 12); return mdev[devid].mdBDev; } diff --git a/bsd/dev/ppc/conf.c b/bsd/dev/ppc/conf.c deleted file mode 100644 index acc9a8545..000000000 --- a/bsd/dev/ppc/conf.c +++ /dev/null @@ -1,354 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1997 by Apple Computer, Inc., all rights reserved - * Copyright (c) 1993 NeXT Computer, Inc. - * - * UNIX Device switch tables. - * - * HISTORY - * - * 30 July 1997 Umesh Vaishampayan (umeshv@apple.com) - * enabled file descriptor pseudo-device. - * 18 June 1993 ? at NeXT - * Cleaned up a lot of stuff in this file. - */ - -#include -#include -#include -#include -#include -#include - - -struct bdevsw bdevsw[] = -{ - /* - * For block devices, every other block of 8 slots is - * reserved to NeXT. The other slots are available for - * the user. This way we can both add new entries without - * running into each other. Be sure to fill in NeXT's - * 8 reserved slots when you jump over us -- we'll do the - * same for you. 
- */ - - /* 0 - 7 are reserved to NeXT */ - - NO_BDEVICE, /* 0*/ - NO_BDEVICE, /* 1*/ - NO_BDEVICE, /* 2*/ - NO_BDEVICE, /* 3*/ - NO_BDEVICE, /* 4*/ - NO_BDEVICE, /* 5*/ - NO_BDEVICE, /* 6*/ - NO_BDEVICE, /* 7*/ - - /* 8 - 15 are reserved to the user */ - NO_BDEVICE, /* 8*/ - NO_BDEVICE, /* 9*/ - NO_BDEVICE, /*10*/ - NO_BDEVICE, /*11*/ - NO_BDEVICE, /*12*/ - NO_BDEVICE, /*13*/ - NO_BDEVICE, /*14*/ - NO_BDEVICE, /*15*/ - - /* 16 - 23 are reserved to NeXT */ - NO_BDEVICE, /*16*/ - NO_BDEVICE, /*17*/ - NO_BDEVICE, /*18*/ - NO_BDEVICE, /*18*/ - NO_BDEVICE, /*20*/ - NO_BDEVICE, /*21*/ - NO_BDEVICE, /*22*/ - NO_BDEVICE, /*23*/ -}; - -int nblkdev = sizeof (bdevsw) / sizeof (bdevsw[0]); - -extern struct tty *km_tty[]; - -dev_t chrtoblk(dev_t dev); -int chrtoblk_set(int cdev, int bdev); -int iskmemdev(dev_t dev); - - -/* XXX No support for linker sets, so must declare here */ -int cttyopen(dev_t dev, int flag, int mode, struct proc *p); -int cttyread(dev_t dev, struct uio *uio, int flag); -int cttywrite(dev_t dev, struct uio *uio, int flag); -int cttyioctl(dev_t dev, u_long cmd, caddr_t addr, int flag, struct proc *p); -int cttyselect(dev_t dev, int flag, void* wql, struct proc *p); - -/* XXX bsd/dev/ppc/mem.c */ -int mmread(dev_t dev, struct uio *uio, int flag); -int mmioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p); -int mmwrite(dev_t dev, struct uio *uio, int flag); - -#define mmselect (select_fcn_t *)seltrue - -#if 1 -#ifdef NPTY -#undef NPTY -#endif /* NPTY */ -#define NPTY 32 -#else /* 1 */ -#include -#endif /* 1 */ -#if NPTY > 0 -extern struct tty *pt_tty[]; -extern d_open_t ptsopen; -extern d_close_t ptsclose; -extern d_read_t ptsread; -extern d_write_t ptswrite; -extern d_stop_t ptsstop; -extern d_open_t ptcopen; -extern d_close_t ptcclose; -extern d_read_t ptcread; -extern d_write_t ptcwrite; -extern d_select_t ptcselect; -extern d_ioctl_t ptyioctl; -#else -#define ptsopen eno_opcl -#define ptsclose eno_opcl -#define ptsread eno_rdwrt -#define 
ptswrite eno_rdwrt -#define ptsstop nulldev - -#define ptcopen eno_opcl -#define ptcclose eno_opcl -#define ptcread eno_rdwrt -#define ptcwrite eno_rdwrt -#define ptcselect eno_select -#define ptyioctl eno_ioctl -#endif - -extern d_open_t logopen; -extern d_close_t logclose; -extern d_read_t logread; -extern d_ioctl_t logioctl; -extern d_select_t logselect; - -struct cdevsw cdevsw[] = -{ - /* - * For character devices, every other block of 16 slots is - * reserved to NeXT. The other slots are available for - * the user. This way we can both add new entries without - * running into each other. Be sure to fill in NeXT's - * 16 reserved slots when you jump over us -- we'll do the - * same for you. - */ - - /* 0 - 15 are reserved to NeXT */ - - { - consopen, consclose, consread, conswrite, /* 0*/ - consioctl, ((stop_fcn_t *)&nulldev), - ((reset_fcn_t *)&nulldev), - 0, consselect, - eno_mmap, eno_strat, eno_getc, eno_putc, D_TTY - }, - NO_CDEVICE, /* 1*/ - { - cttyopen, ((open_close_fcn_t *)&nulldev), - cttyread, cttywrite, /* 2*/ - cttyioctl, ((stop_fcn_t *)&nulldev), - ((reset_fcn_t *)&nulldev), - 0, cttyselect, - eno_mmap, eno_strat, eno_getc, eno_putc, D_TTY - }, - { - ((open_close_fcn_t *)&nulldev), - ((open_close_fcn_t *)&nulldev), - mmread, mmwrite, /* 3*/ - mmioctl, ((stop_fcn_t *)&nulldev), - ((reset_fcn_t *)&nulldev), - 0, mmselect, - eno_mmap, eno_strat, eno_getc, eno_putc, D_DISK - }, - { - ptsopen, ptsclose, ptsread, ptswrite, /* 4*/ - ptyioctl, ptsstop, ((reset_fcn_t *)&nulldev), - pt_tty, ttselect, - eno_mmap, eno_strat, eno_getc, eno_putc, D_TTY - }, - { - ptcopen, ptcclose, ptcread, ptcwrite, /* 5*/ - ptyioctl, ((stop_fcn_t *)&nulldev), - ((reset_fcn_t *)&nulldev), - 0, ptcselect, - eno_mmap, eno_strat, eno_getc, eno_putc, D_TTY - }, - { - logopen, logclose, logread, eno_rdwrt, /* 6*/ - logioctl, eno_stop, ((reset_fcn_t *)&nulldev), - 0, logselect, - eno_mmap, eno_strat, eno_getc, eno_putc, 0 - }, - NO_CDEVICE, /* 7*/ - NO_CDEVICE, /* 8*/ - NO_CDEVICE, 
/* 9*/ - NO_CDEVICE, /*10*/ - NO_CDEVICE, /*11*/ - { - kmopen, kmclose, kmread, kmwrite, /*12*/ - kmioctl, ((stop_fcn_t *)&nulldev), - ((reset_fcn_t *)&nulldev), - km_tty, ttselect, - eno_mmap, eno_strat, eno_getc, eno_putc, 0 - }, - NO_CDEVICE, /*13*/ - NO_CDEVICE, /*14*/ - NO_CDEVICE, /*15*/ - - /* 16 - 31 are reserved to the user */ - NO_CDEVICE, /*16*/ - NO_CDEVICE, /*17*/ - NO_CDEVICE, /*18*/ - NO_CDEVICE, /*19*/ - NO_CDEVICE, /*20*/ - NO_CDEVICE, /*21*/ - NO_CDEVICE, /*22*/ - NO_CDEVICE, /*23*/ - NO_CDEVICE, /*24*/ - NO_CDEVICE, /*25*/ - NO_CDEVICE, /*26*/ - NO_CDEVICE, /*27*/ - NO_CDEVICE, /*28*/ - NO_CDEVICE, /*29*/ - NO_CDEVICE, /*30*/ - NO_CDEVICE, /*31*/ - - /* 32 - 47 are reserved to NeXT */ - NO_CDEVICE, /*32*/ - NO_CDEVICE, /*33*/ - NO_CDEVICE, /*34*/ - NO_CDEVICE, /*35*/ - NO_CDEVICE, /*36*/ - /* 37 used to be for nvram */ - NO_CDEVICE, /*37*/ - NO_CDEVICE, /*38*/ - NO_CDEVICE, /*39*/ - NO_CDEVICE, /*40*/ - /* 41 used to be for fd */ - NO_CDEVICE, /*41*/ - NO_CDEVICE, /*42*/ -}; -int nchrdev = sizeof (cdevsw) / sizeof (cdevsw[0]); - - -#include /* for VCHR and VBLK */ -/* - * return true if a disk - */ -int -isdisk(dev_t dev, int type) -{ - dev_t maj = major(dev); - - switch (type) { - case VCHR: - maj = chrtoblk(maj); - if (maj == NODEV) { - break; - } - /* FALL THROUGH */ - case VBLK: - if (bdevsw[maj].d_type == D_DISK) { - return (1); - } - break; - } - return(0); -} - -static int chrtoblktab[] = { - /* CHR*/ /* BLK*/ /* CHR*/ /* BLK*/ - /* 0 */ NODEV, /* 1 */ NODEV, - /* 2 */ NODEV, /* 3 */ NODEV, - /* 4 */ NODEV, /* 5 */ NODEV, - /* 6 */ NODEV, /* 7 */ NODEV, - /* 8 */ NODEV, /* 9 */ NODEV, - /* 10 */ NODEV, /* 11 */ NODEV, - /* 12 */ NODEV, /* 13 */ NODEV, - /* 14 */ 6, /* 15 */ NODEV, - /* 16 */ NODEV, /* 17 */ NODEV, - /* 18 */ NODEV, /* 19 */ NODEV, - /* 20 */ NODEV, /* 21 */ NODEV, - /* 22 */ NODEV, /* 23 */ NODEV, - /* 24 */ NODEV, /* 25 */ NODEV, - /* 26 */ NODEV, /* 27 */ NODEV, - /* 28 */ NODEV, /* 29 */ NODEV, - /* 30 */ NODEV, /* 31 
*/ NODEV, - /* 32 */ NODEV, /* 33 */ NODEV, - /* 34 */ NODEV, /* 35 */ NODEV, - /* 36 */ NODEV, /* 37 */ NODEV, - /* 38 */ NODEV, /* 39 */ NODEV, - /* 40 */ NODEV, /* 41 */ 1, - /* 42 */ NODEV, /* 43 */ NODEV, - /* 44 */ NODEV, -}; - -/* - * convert chr dev to blk dev - */ -dev_t -chrtoblk(dev_t dev) -{ - int blkmaj; - - if (major(dev) >= nchrdev) - return(NODEV); - blkmaj = chrtoblktab[major(dev)]; - if (blkmaj == NODEV) - return(NODEV); - return(makedev(blkmaj, minor(dev))); -} - -int -chrtoblk_set(int cdev, int bdev) -{ - if (cdev >= nchrdev) - return (NODEV); - if (bdev != NODEV && bdev >= nblkdev) - return (NODEV); - chrtoblktab[cdev] = bdev; - return 0; -} - -/* - * Returns true if dev is /dev/mem or /dev/kmem. - */ -int -iskmemdev(dev_t dev) -{ - - return (major(dev) == 3 && minor(dev) < 2); -} diff --git a/bsd/dev/ppc/cons.c b/bsd/dev/ppc/cons.c deleted file mode 100644 index 207ee03ae..000000000 --- a/bsd/dev/ppc/cons.c +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1987, 1988 NeXT, Inc. - * - * HISTORY - * 7-Jan-93 Mac Gillon (mgillon) at NeXT - * Integrated POSIX support - * - * 12-Aug-87 John Seamons (jks) at NeXT - * Ported to NeXT. - */ - -/* - * Indirect driver for console. - */ -#include -#include -#include -#include -#include -#include -#include -#include - -struct tty *constty; /* current console device */ - -/* - * The km driver supplied the default console device for the systems - * (usually a raw frame buffer driver, but potentially a serial driver). 
- */ -extern struct tty *km_tty[1]; - -static dev_t -cndev(void) -{ - if (constty) - return constty->t_dev; - else - return km_tty[0]->t_dev; -} - -/*ARGSUSED*/ -int -consopen(__unused dev_t dev, int flag, int devtype, struct proc *pp) -{ - dev = cndev(); - return ((*cdevsw[major(dev)].d_open)(dev, flag, devtype, pp)); -} - - -/*ARGSUSED*/ -int -consclose(__unused dev_t dev, int flag, int mode, struct proc *pp) -{ - dev = cndev(); - return ((*cdevsw[major(dev)].d_close)(dev, flag, mode, pp)); -} - - -/*ARGSUSED*/ -int -consread(__unused dev_t dev, struct uio *uio, int ioflag) -{ - dev = cndev(); - return ((*cdevsw[major(dev)].d_read)(dev, uio, ioflag)); -} - - -/*ARGSUSED*/ -int -conswrite(__unused dev_t dev, struct uio *uio, int ioflag) -{ - dev = cndev(); - return ((*cdevsw[major(dev)].d_write)(dev, uio, ioflag)); -} - - -/*ARGSUSED*/ -int -consioctl(__unused dev_t dev, u_long cmd, caddr_t addr, int flag, struct proc *p) -{ - dev = cndev(); -#if 0 - /* - * Superuser can always use this to wrest control of console - * output from the "virtual" console. - * - * XXX Unfortunately, this code doesn't do what the author thougt - * XXX it did; use of the console device, a TIOCCONS would always - * XXX disassociate the console from a virtual terminal and send - * XXX it back to the fake tty. - */ - if ((unsigned) cmd == TIOCCONS && constty) { - int error = proc_suser(p); - if (!error) { - constty = NULL; - } - return(error); - } -#endif /* 0 */ - - return ((*cdevsw[major(dev)].d_ioctl)(dev, cmd, addr, flag, p)); -} - - -/*ARGSUSED*/ -/* called with funnel held */ -int -consselect(__unused dev_t dev, int flag, void *wql, struct proc *p) -{ - dev = cndev(); - return ((*cdevsw[major(dev)].d_select)(dev, flag, wql, p)); -} diff --git a/bsd/dev/ppc/dtrace_isa.c b/bsd/dev/ppc/dtrace_isa.c deleted file mode 100644 index 21b49bdc4..000000000 --- a/bsd/dev/ppc/dtrace_isa.c +++ /dev/null @@ -1,589 +0,0 @@ -/* - * Copyright (c) 2005-2006 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from mach/ppc/thread_status.h */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -extern dtrace_id_t dtrace_probeid_error; /* special ERROR probe */ - -void -dtrace_probe_error(dtrace_state_t *state, dtrace_epid_t epid, int which, - int fltoffs, int fault, uint64_t illval) -{ - /* - * dtrace_getarg() is a lost cause on PPC. For the case of the error probe firing lets - * stash away "illval" here, and special-case retrieving it in DIF_VARIABLE_ARG. 
- */ - state->dts_arg_error_illval = illval; - dtrace_probe( dtrace_probeid_error, (uint64_t)(uintptr_t)state, epid, which, fltoffs, fault ); -} - -/* - * Atomicity and synchronization - */ -void -dtrace_membar_producer(void) -{ - __asm__ volatile("sync"); -} - -void -dtrace_membar_consumer(void) -{ - __asm__ volatile("isync"); -} - -/* - * Interrupt manipulation - * XXX dtrace_getipl() can be called from probe context. - */ -int -dtrace_getipl(void) -{ - return (ml_at_interrupt_context() ? 1: 0); -} - -/* - * MP coordination - */ -typedef void (*broadcastFunc) (uint32_t); - -int32_t cpu_broadcast(uint32_t *, broadcastFunc, uint32_t); /* osfmk/ppc/machine_cpu.h */ - -typedef struct xcArg { - processorid_t cpu; - dtrace_xcall_t f; - void *arg; - uint32_t waitVar; -} xcArg_t; - -static void -xcRemote( uint32_t foo ) -{ - xcArg_t *pArg = (xcArg_t *)foo; - - if ( pArg->cpu == CPU->cpu_id || pArg->cpu == DTRACE_CPUALL ) { - (pArg->f)(pArg->arg); - } - - if(!hw_atomic_sub(&(pArg->waitVar), 1)) { /* Drop the wait count */ - thread_wakeup((event_t)&(pArg->waitVar)); /* If we were the last, wake up the signaller */ - } -} - -/* - * dtrace_xcall() is not called from probe context. - */ -void -dtrace_xcall(processorid_t cpu, dtrace_xcall_t f, void *arg) -{ - xcArg_t xcArg; - - /* Talking to ourselves, are we? */ - if ( cpu == CPU->cpu_id ) { - (*f)(arg); - return; - } - - if ( cpu == DTRACE_CPUALL ) { - (*f)(arg); - } - - xcArg.cpu = cpu; - xcArg.f = f; - xcArg.arg = arg; - xcArg.waitVar = 0; - - (void)cpu_broadcast(&(xcArg.waitVar), xcRemote, (uint32_t)&xcArg); -} - -/* - * Runtime and ABI - */ -uint64_t -dtrace_getreg(struct regs *savearea, uint_t reg) -{ - ppc_saved_state_t *regs = (ppc_saved_state_t *)savearea; - uint64_t mask = (_cpu_capabilities & k64Bit) ? 
0xffffffffffffffffULL : 0x00000000ffffffffULL; - - /* See osfmk/ppc/savearea.h */ - if (reg > 68) { /* beyond mmcr2 */ - DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); - return (0); - } - - switch (reg) { - /* First 38 registers are saved to 64 bits r0-r31, srr0, srr1, xer, lr, ctr, dar. */ - default: - return (((uint64_t *)(&(regs->save_r0)))[reg]) & mask; - - /* Handle the 32-bit registers */ - case 38: case 39: case 40: case 41: /* cr, dsisr, exception, vrsave */ - case 42: case 43: case 44: case 45: /* vscr[4] */ - case 46: case 47: case 48: case 49: /* fpscrpad, fpscr, save_1d8[2] */ - case 50: case 51: case 52: case 53: /* save_1E0[8] */ - case 54: case 55: case 56: case 57: - case 58: case 59: case 60: case 61: /* save_pmc[8] */ - case 62: case 63: case 64: case 65: - return (uint64_t)(((unsigned int *)(&(regs->save_cr)))[reg - 38]); - - case 66: - return regs->save_mmcr0 & mask; - case 67: - return regs->save_mmcr1 & mask; - case 68: - return regs->save_mmcr2 & mask; - } -} - -#define RETURN_OFFSET 8 -#define RETURN_OFFSET64 16 -#define REGPC save_srr0 -#define REGSP save_r1 - -/* - * XXX dtrace_getustack_common() can be called from probe context. 
- */ -static int -dtrace_getustack_common(uint64_t *pcstack, int pcstack_limit, user_addr_t pc, - user_addr_t sp) -{ -#if 0 - volatile uint16_t *flags = - (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags; - - uintptr_t oldcontext = lwp->lwp_oldcontext; /* XXX signal stack crawl*/ - size_t s1, s2; -#endif - int ret = 0; - boolean_t is64Bit = proc_is64bit(current_proc()); - - ASSERT(pcstack == NULL || pcstack_limit > 0); - -#if 0 /* XXX signal stack crawl*/ - if (p->p_model == DATAMODEL_NATIVE) { - s1 = sizeof (struct frame) + 2 * sizeof (long); - s2 = s1 + sizeof (siginfo_t); - } else { - s1 = sizeof (struct frame32) + 3 * sizeof (int); - s2 = s1 + sizeof (siginfo32_t); - } -#endif - - while (pc != 0) { - ret++; - if (pcstack != NULL) { - *pcstack++ = (uint64_t)pc; - pcstack_limit--; - if (pcstack_limit <= 0) - break; - } - - if (sp == 0) - break; - -#if 0 /* XXX signal stack crawl*/ - if (oldcontext == sp + s1 || oldcontext == sp + s2) { - if (p->p_model == DATAMODEL_NATIVE) { - ucontext_t *ucp = (ucontext_t *)oldcontext; - greg_t *gregs = ucp->uc_mcontext.gregs; - - sp = dtrace_fulword(&gregs[REG_FP]); - pc = dtrace_fulword(&gregs[REG_PC]); - - oldcontext = dtrace_fulword(&ucp->uc_link); - } else { - ucontext32_t *ucp = (ucontext32_t *)oldcontext; - greg32_t *gregs = ucp->uc_mcontext.gregs; - - sp = dtrace_fuword32(&gregs[EBP]); - pc = dtrace_fuword32(&gregs[EIP]); - - oldcontext = dtrace_fuword32(&ucp->uc_link); - } - } - else -#endif - { - if (is64Bit) { - pc = dtrace_fuword64((sp + RETURN_OFFSET64)); - sp = dtrace_fuword64(sp); - } else { - pc = dtrace_fuword32((sp + RETURN_OFFSET)); - sp = dtrace_fuword32(sp); - } - } - } - - return (ret); -} - -void -dtrace_getupcstack(uint64_t *pcstack, int pcstack_limit) -{ - thread_t thread = current_thread(); - ppc_saved_state_t *regs; - user_addr_t pc, sp; - volatile uint16_t *flags = - (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags; - int n; - boolean_t is64Bit = proc_is64bit(current_proc()); 
- - if (*flags & CPU_DTRACE_FAULT) - return; - - if (pcstack_limit <= 0) - return; - - /* - * If there's no user context we still need to zero the stack. - */ - if (thread == NULL) - goto zero; - - regs = (ppc_saved_state_t *)find_user_regs(thread); - if (regs == NULL) - goto zero; - - *pcstack++ = (uint64_t)proc_selfpid(); - pcstack_limit--; - - if (pcstack_limit <= 0) - return; - - pc = regs->REGPC; - sp = regs->REGSP; - - if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_ENTRY)) { - *pcstack++ = (uint64_t)pc; - pcstack_limit--; - if (pcstack_limit <= 0) - return; - - pc = regs->save_lr; - } - - if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_USTACK_FP)) { - /* - * If the ustack fp flag is set, the stack frame from sp to - * fp contains no valid call information. Start with the fp. - */ - if (is64Bit) - sp = dtrace_fuword64(sp); - else - sp = (user_addr_t)dtrace_fuword32(sp); - } - - n = dtrace_getustack_common(pcstack, pcstack_limit, pc, sp); - ASSERT(n >= 0); - ASSERT(n <= pcstack_limit); - - pcstack += n; - pcstack_limit -= n; - -zero: - while (pcstack_limit-- > 0) - *pcstack++ = 0; -} - -int -dtrace_getustackdepth(void) -{ - thread_t thread = current_thread(); - ppc_saved_state_t *regs; - user_addr_t pc, sp; - int n = 0; - boolean_t is64Bit = proc_is64bit(current_proc()); - - if (thread == NULL) - return 0; - - if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT)) - return (-1); - - regs = (ppc_saved_state_t *)find_user_regs(thread); - if (regs == NULL) - return 0; - - pc = regs->REGPC; - sp = regs->REGSP; - - if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_ENTRY)) { - n++; - pc = regs->save_lr; - } - - if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_USTACK_FP)) { - /* - * If the ustack fp flag is set, the stack frame from sp to - * fp contains no valid call information. Start with the fp. 
- */ - if (is64Bit) - sp = dtrace_fuword64(sp); - else - sp = (user_addr_t)dtrace_fuword32(sp); - } - - n += dtrace_getustack_common(NULL, 0, pc, sp); - - return (n); -} - -void -dtrace_getufpstack(uint64_t *pcstack, uint64_t *fpstack, int pcstack_limit) -{ - thread_t thread = current_thread(); - ppc_saved_state_t *regs; - user_addr_t pc, sp; - volatile uint16_t *flags = - (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags; -#if 0 - uintptr_t oldcontext; - size_t s1, s2; -#endif - boolean_t is64Bit = proc_is64bit(current_proc()); - - if (*flags & CPU_DTRACE_FAULT) - return; - - if (pcstack_limit <= 0) - return; - - /* - * If there's no user context we still need to zero the stack. - */ - if (thread == NULL) - goto zero; - - regs = (ppc_saved_state_t *)find_user_regs(thread); - if (regs == NULL) - goto zero; - - *pcstack++ = (uint64_t)proc_selfpid(); - pcstack_limit--; - - if (pcstack_limit <= 0) - return; - - pc = regs->REGPC; - sp = regs->REGSP; - -#if 0 /* XXX signal stack crawl*/ - oldcontext = lwp->lwp_oldcontext; - - if (p->p_model == DATAMODEL_NATIVE) { - s1 = sizeof (struct frame) + 2 * sizeof (long); - s2 = s1 + sizeof (siginfo_t); - } else { - s1 = sizeof (struct frame32) + 3 * sizeof (int); - s2 = s1 + sizeof (siginfo32_t); - } -#endif - - if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_ENTRY)) { - *pcstack++ = (uint64_t)pc; - *fpstack++ = 0; - pcstack_limit--; - if (pcstack_limit <= 0) - return; - - /* - * XXX This is wrong, but we do not yet support stack helpers. 
- */ - if (is64Bit) - pc = dtrace_fuword64(sp); - else - pc = dtrace_fuword32(sp); - } - - while (pc != 0) { - *pcstack++ = (uint64_t)pc; - *fpstack++ = sp; - pcstack_limit--; - if (pcstack_limit <= 0) - break; - - if (sp == 0) - break; - -#if 0 /* XXX signal stack crawl*/ - if (oldcontext == sp + s1 || oldcontext == sp + s2) { - if (p->p_model == DATAMODEL_NATIVE) { - ucontext_t *ucp = (ucontext_t *)oldcontext; - greg_t *gregs = ucp->uc_mcontext.gregs; - - sp = dtrace_fulword(&gregs[REG_FP]); - pc = dtrace_fulword(&gregs[REG_PC]); - - oldcontext = dtrace_fulword(&ucp->uc_link); - } else { - ucontext_t *ucp = (ucontext_t *)oldcontext; - greg_t *gregs = ucp->uc_mcontext.gregs; - - sp = dtrace_fuword32(&gregs[EBP]); - pc = dtrace_fuword32(&gregs[EIP]); - - oldcontext = dtrace_fuword32(&ucp->uc_link); - } - } - else -#endif - { - if (is64Bit) { - pc = dtrace_fuword64((sp + RETURN_OFFSET64)); - sp = dtrace_fuword64(sp); - } else { - pc = dtrace_fuword32((sp + RETURN_OFFSET)); - sp = dtrace_fuword32(sp); - } - } - } - -zero: - while (pcstack_limit-- > 0) - *pcstack++ = 0; -} - -void -dtrace_getpcstack(pc_t *pcstack, int pcstack_limit, int aframes, - uint32_t *intrpc) -{ - struct frame *fp = (struct frame *)__builtin_frame_address(0); - struct frame *nextfp, *minfp, *stacktop; - int depth = 0; - int last = 0; - uintptr_t pc; - uintptr_t caller = CPU->cpu_dtrace_caller; - int on_intr; - - if ((on_intr = CPU_ON_INTR(CPU)) != 0) - stacktop = (struct frame *)dtrace_get_cpu_int_stack_top(); - else - stacktop = (struct frame *)(dtrace_get_kernel_stack(current_thread()) + kernel_stack_size); - - minfp = fp; - - aframes++; - - if (intrpc != NULL && depth < pcstack_limit) - pcstack[depth++] = (pc_t)intrpc; - - while (depth < pcstack_limit) { - nextfp = *(struct frame **)fp; - pc = *(uintptr_t *)(((uintptr_t)fp) + RETURN_OFFSET); - - if (nextfp <= minfp || nextfp >= stacktop) { - if (on_intr) { - /* - * Hop from interrupt stack to thread stack. 
- */ - vm_offset_t kstack_base = dtrace_get_kernel_stack(current_thread()); - - minfp = (struct frame *)kstack_base; - stacktop = (struct frame *)(kstack_base + kernel_stack_size); - - on_intr = 0; - continue; - } - /* - * This is the last frame we can process; indicate - * that we should return after processing this frame. - */ - last = 1; - } - - if (aframes > 0) { - if (--aframes == 0 && caller != 0) { - /* - * We've just run out of artificial frames, - * and we have a valid caller -- fill it in - * now. - */ - ASSERT(depth < pcstack_limit); - pcstack[depth++] = (pc_t)caller; - caller = 0; - } - } else { - if (depth < pcstack_limit) - pcstack[depth++] = (pc_t)pc; - } - - if (last) { - while (depth < pcstack_limit) - pcstack[depth++] = 0; - return; - } - - fp = nextfp; - minfp = fp; - } -} - -uint64_t -dtrace_getarg(int arg, int aframes) -{ -#pragma unused(arg,aframes) - return 0xfeedfacedeafbeadLL; /* XXX Only called for arg >= 5 */ -} - -/* - * Load/Store Safety - */ - -void -dtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit)) -{ - /* - * "base" is the smallest toxic address in the range, "limit" is the first - * VALID address greater than "base". - */ - func(0x0, VM_MIN_KERNEL_ADDRESS); - if (VM_MAX_KERNEL_ADDRESS < ~(uintptr_t)0) - func(VM_MAX_KERNEL_ADDRESS + 1, ~(uintptr_t)0); -} - -extern void *mapping_phys_lookup(ppnum_t, unsigned int *); - diff --git a/bsd/dev/ppc/dtrace_subr_ppc.c b/bsd/dev/ppc/dtrace_subr_ppc.c deleted file mode 100644 index 5040a9183..000000000 --- a/bsd/dev/ppc/dtrace_subr_ppc.c +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright (c) 2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -/* - * #pragma ident "@(#)dtrace_subr.c 1.12 05/06/08 SMI" - */ - -#define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from mach/ppc/thread_status.h */ -#include -#include -#include -#include -#include -#include -#include -#include - -int (*dtrace_pid_probe_ptr)(ppc_saved_state_t *); -int (*dtrace_return_probe_ptr)(ppc_saved_state_t *); -kern_return_t dtrace_user_probe(ppc_saved_state_t *sv); - -kern_return_t -dtrace_user_probe(ppc_saved_state_t *sv) -{ - - lck_rw_t *rwp; - struct proc *p = current_proc(); - - uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread()); - /* - * DTrace accesses t_cred in probe context. t_cred - * must always be either NULL, or point to a valid, - * allocated cred structure. - */ - kauth_cred_uthread_update(uthread, p); - - if (sv->save_exception == T_DTRACE_RET) { - -/* - * T_DTRACE_RET is generated by the kernel when an emulation sequence - * ends. Unlike the x86 implementation, this can not be caused by - * a user state trap instruction. It is a system error if it occurs - * when not stepping and is, therefore, a panickable offence. - */ - - if(uthread->t_dtrace_step == 0) { /* Are we supposed to be tracing? */ - panic("dtrace_user_probe: T_DTRACE_RET when not stepping\n"); - } - - if (uthread->t_dtrace_ast) { - printf("dtrace_user_probe() should be calling aston()\n"); - // aston(uthread); - // uthread->t_sig_check = 1; - } - - /* - * Clear all user tracing flags. - */ - uthread->t_dtrace_ft = 0; - - /* - * We need to wait until after we've called the - * dtrace_return_probe_ptr function pointer to step the pc. 
- */ - rwp = &CPU->cpu_ft_lock; - lck_rw_lock_shared(rwp); - - if (dtrace_return_probe_ptr != NULL) (void)(*dtrace_return_probe_ptr)(sv); - lck_rw_unlock_shared(rwp); - - sv->save_srr0 = sv->save_srr0 + 4; /* Step to next instruction */ - if(!(sv->save_srr1 & 0x8000000000000000ULL)) sv->save_srr0 &= 0x00000000FFFFFFFF; /* Trim if in 32-bit mode */ - - return KERN_SUCCESS; - - } else { - -/* - * We have taken our normal trap to get here. Make sure we expect it - */ - uint32_t instr; - rwp = &CPU->cpu_ft_lock; - - /* - * The DTrace fasttrap provider uses a trap, "twi 31,r31,0xDDDD". - * We will only be here if dtrace (or someone pretending to be us) - * sets the trap. - * We let DTrace take the first crack at handling - * this trap; if it's not a probe that DTrace knowns about, - * we call into the trap() routine to handle it like a - * breakpoint placed by a conventional debugger. - */ - - /* - * APPLE NOTE: I believe the purpose of the reader/writers lock - * is thus: There are times which dtrace needs to prevent calling - * dtrace_pid_probe_ptr(). Sun's original impl grabbed a plain - * mutex here. However, that serialized all probe calls, and - * destroyed MP behavior. So now they use a RW lock, with probes - * as readers, and the top level synchronization as a writer. - */ - lck_rw_lock_shared(rwp); - if (dtrace_pid_probe_ptr != NULL && - (*dtrace_pid_probe_ptr)(sv) == 0) { - lck_rw_unlock_shared(rwp); - return KERN_SUCCESS; - } - lck_rw_unlock_shared(rwp); - - /* - * If the instruction that caused the breakpoint trap doesn't - * look like our trap anymore, it may be that this tracepoint - * was removed just after the user thread executed it. In - * that case, return to user land to retry the instuction. - * - * Note that the PC is correct because we do not advance it until after emulation. 
- */ - if (fuword32(sv->save_srr0, &instr) == 0 && instr != FASTTRAP_INSTR) { - return KERN_SUCCESS; - } - - } - -/* - * If we get here, we go back to throw an exception - */ - - return KERN_FAILURE; -} - -void -dtrace_safe_synchronous_signal(void) -{ -// This is commented out of the x86 code and is never called. -} - -int -dtrace_safe_defer_signal(void) -{ -// This is commented out of the x86 code and is never called. - return 0; -} diff --git a/bsd/dev/ppc/fasttrap_isa.c b/bsd/dev/ppc/fasttrap_isa.c deleted file mode 100644 index 10e2edd08..000000000 --- a/bsd/dev/ppc/fasttrap_isa.c +++ /dev/null @@ -1,734 +0,0 @@ -/* - * Copyright (c) 2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * #pragma ident "@(#)fasttrap_isa.c 1.27 08/04/09 SMI" - */ - -#ifdef KERNEL -#ifndef _KERNEL -#define _KERNEL /* Solaris vs. Darwin */ -#endif -#endif - -#define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from mach/ppc/thread_status.h */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* All the bits we care about are guarded by MACH_KERNEL_PRIVATE :-( */ -extern dtrace_id_t dtrace_probeid_error; - -/* Solaris proc_t is the struct. Darwin's proc_t is a pointer to it. 
*/ -#define proc_t struct proc /* Steer clear of the Darwin typedef for proc_t */ - -static int32_t branchtaken(int32_t bo, int32_t bi, ppc_saved_state_t *sv); -static int32_t dtrace_decode_ppc(uint32_t inst); -int patchInst(task_t task, addr64_t vaddr, uint32_t inst); -kern_return_t dtrace_user_probe(ppc_saved_state_t *sv); - -/* - * Lossless User-Land Tracing on PPC - * --------------------------------- - * - * PPC uses a different technique to emulate user-land instruction replaces by a probe - * trap than x86. - * - * Like x86, it will emulate all forms of branch instructions. We will not attempt - * to emulate any instruction that we know will cause an interruption or exception - * (system call, trap, privileged instruction, instruction that uses a privileged - * register). - * - * NOTE: I am thinking that we should punish tight loopers, e.g., branch-to-dot. - * Depending upon clock resolution and how fast we can process these guys, it is - * possible that its quantum will never decrease. Maybe we could just manually - * end the guy's quantum and let the next guy go... - * - * When fasttrap_tracepoint_init is called, we fetch the instruction and decode it. - * If we don't recognize it or find it is a "banned" instruction, we return -1, - * telling our caller to forget it. Otherwise we save the instruction image and - * enough of the decode to quickly handle it at probe time. We cram it into - * the fasttrap_machtp_t structure. - * - * When the probe hits, we verify that the PC is still a probe point and if not, - * we bail. Otherwise we have a bit more to do. - * - * If DTFTP_ENTRY is set, we have an entry probe and need to call dtrace_probe. - * - * If DTFTP_IS_ENABLED is set, all we need to do is to return a 1. 
- * - * If ftp_argmap is NULL, we call dtrace_probe - * - * Otherwise, we figure out what the arguments are and pass them to dtrace_probe - * - * Next, we need to set up to emulate the probed instruction and here is where we are - * the most different than the x86 code. - * - * Like x86, we first check to see if the instruction is any form of branch. If so, - * we emulate it completely within the kernel and are done. - * - * If it is anything else, we build a code stream within the kernel to execute the - * instruction. Note that this is very different from x86 which build the code in - * userland. - * - * The generated stream needs to be executed within the kernel's code space but with - * the user address space and registers. Because PPC allows different translation modes - * for instruction fetch and data fetch, this is not too difficult. - * - * There are two kinds streams needed: execute and continue, and execute and return, - * which are used for entry/offset and exit probes respectivily. - * - * The probe code will copy the instruction image into the current user savearea (which - * also contains the complete user state register context). A flag that requests either - * execute/continue or execute/return is also set in the savearea. - * - * We now exit the dtrace code and the marked context makes its way back to the point - * where it will be dispatched on the processor. - * - * The exception return code will start to restore the user context, including registers - * and address space. However, before dispatching the user, it will notice that the - * emulate flags are set. At this point the code will build a code stream - * in an area in the per_proc that consists of - * the original instruction followed by a trap instruction. It will set the new MSR (in - * SRR1) to have address translation enable for data, translation disabled for instruction - * fetches, interruptions disabled, and supervisor state. 
- * - * The new PC and MSR are loaded via a RFID and the generated stream is executed. If a - * synchronous fault occurs, it is either handled (PTE miss, FPU or vector unavailable), - * emulated (alignment or denorm), or passed on to the user. - * - * Assuming the emulated instruction completes, the trap will execute. When that happens, - * low-level trap handler will check its flags. If the trap corresponds to an - * execute/continue stream, the trap handler will adjust the PC and complete the - * transition into user space. - * - * If the trap corresponds to an execute/return stream, the handler will generate - * a T_DTRACE_RET exception and let the trap handler pass it along to dtrace_user_probe. - * - */ - - -static uint64_t -fasttrap_anarg(ppc_saved_state_t *sv, int function_entry, int argno) -{ -#pragma unused(function_entry) - uint32_t farg; - uint64_t value; - - /* The first 8 arguments (argno 0-7) are in registers */ - if (argno < 8) { - value = (&sv->save_r3)[argno]; - } else { - if (sv->save_srr1 & 0x8000000000000000ULL) { - /* 64-bit */ - /* Grab argument >= 8 from stack */ - fasttrap_fuword64_noerr(sv->save_r1 + 48 + ((argno)* sizeof(uint64_t)), &value); - } else { - /* 32-bit */ - /* Grab argument >= 8 from stack */ - fasttrap_fuword32_noerr(sv->save_r1 + 24 + ((argno) * sizeof(uint32_t)), &farg); - value = (uint64_t)farg; - } - } - - return (value); -} - -/*ARGSUSED*/ -int -fasttrap_tracepoint_init(proc_t *p, fasttrap_tracepoint_t *tp, user_addr_t pc, - fasttrap_probe_type_t type) -{ -#pragma unused(type) - - uint32_t instr, testr1, testr2, testr3; - user_addr_t targpc; - int32_t target, optype; - - /* - * Read the instruction at the given address out of the process's - * address space. We don't have to worry about a debugger - * changing this instruction before we overwrite it with our trap - * instruction since P_PR_LOCK is set. Since instructions can span - * pages, we potentially read the instruction in two parts. 
If the - * second part fails, we just zero out that part of the instruction. - */ - /* - * APPLE NOTE: Of course, we do not have a P_PR_LOCK, so this is racey... - */ - - if (uread(p, &instr, 4, pc) != 0) return (-1); /* Grab instruction, return suddenly if read fails... */ - - optype = dtrace_decode_ppc(instr); /* See if we have an instruction we can probe */ - - tp->ftt_instr = instr; /* Save the instruction image */ - testr1 = tp->ftt_bo = (uint8_t)((instr >> (31 - 10)) & 0x1F); /* Extract branch options */ - testr2 = tp->ftt_bi = (uint8_t)((instr >> (31 - 15)) & 0x1F); /* Extract condition register bit */ - testr3 = (instr >> (31 - 20)) & 0x1F; /* Get that last register */ - tp->ftt_flgs = (uint8_t)(instr & 3); /* Set the absolute address and link flags */ - - switch(optype) { /* Do instruction specific decode */ - - case diCMN: /* Common instruction */ - tp->ftt_type = ftmtCommon; /* Mark as common instruction */ - break; - - case diINV: /* Invalid */ - case diTRP: /* Trap */ - case diSC: /* System Call */ - case diRFI: /* Return from interrupt */ - case diPRV: /* Priviliged instruction */ - return (-1); /* We will not emulate these... */ - break; - - case diB: /* Branch */ - tp->ftt_type = ftmtB; /* Mark as branch instruction */ - target = instr & 0x03FFFFFC; /* Extract address or offset */ - if(target & 0x02000000) target |= 0xFC000000; /* Sign extend */ - tp->ftt_trgt = target; /* Trim back down and save */ - - targpc = (user_addr_t)((int64_t)target); /* Generate a target address, hopefully we sign extend... */ - if(!(tp->ftt_flgs & ftmtAbs)) { /* Are we dealing with an offset here? */ - targpc = targpc + pc; /* Apply offset to get target address */ - } - - if(targpc == pc) return -1; /* Branching to self is a sin and is forbidden... 
*/ - break; - - case diBC: /* Branch conditional */ - tp->ftt_type = ftmtBC; /* Mark as branch conditional */ - target = instr & 0x0000FFFC; /* Extract address or offset */ - if(target & 0x00008000) target |= 0xFFFF0000; /* Sign extend */ - tp->ftt_trgt = target; /* Trim back down and save */ - - targpc = (user_addr_t)((int64_t)target); /* Generate a target address, hopefully we sign extend... */ - if(!(tp->ftt_flgs & ftmtAbs)) { /* Are we dealing with an offset here? */ - targpc = targpc + pc; /* Apply offset to get target address */ - } - - if(targpc == pc) return -1; /* Branching to self is a sin and is forbidden... */ - break; - - case diBLR: /* Branch conditional to link register */ - tp->ftt_type = ftmtBLR; /* Mark as branch conditional to link register */ - break; - - case diBCTR: /* Branch conditional to count register */ - tp->ftt_type = ftmtBCTR; /* Mark as branch conditional to count register */ - break; - - case diOR: /* OR */ - if((instr >> 26) == 24) { /* Is this the ORI nop? 
*/ - if((testr1 == testr2) && ((instr & 0x0000FFFF) == 0)) tp->ftt_type = ftmtNOP; /* Remember if this is a NOP instruction */ - else tp->ftt_type = ftmtCommon; /* Otherwise it is a common ORI instruction */ - } - else if((testr1 == testr2) && (testr1 == testr3)) tp->ftt_type = ftmtNOP; /* If all three registers are the same, this is a NOP */ - else tp->ftt_type = ftmtCommon; /* Otherwise it is a common OR instruction */ - - break; - - default: - panic("fasttrap_tracepoint_init: invalid branch decode, inst = %08X, optype = %d\n", instr, optype); - break; - - } - - return (0); -} - -int -fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp) -{ - return patchInst(p->task, tp->ftt_pc, FASTTRAP_INSTR); /* Patch the instruction and flush it */ -} - -extern void dbgTrace(uint32_t, uint32_t, uint32_t, uint32_t, uint32_t); - -int -fasttrap_tracepoint_remove(proc_t *p, fasttrap_tracepoint_t *tp) -{ - uint32_t instr; - - /* - * Distinguish between read or write failures and a changed - * instruction. - */ - if (uread(p, &instr, 4, tp->ftt_pc) != 0) return (0); /* Get the instruction, but exit if not mapped */ - -// dbgTrace(0x99999999, (uint32_t)tp->ftt_pc, tp->ftt_instr, instr, 0); /* (TRACE/DEBUG) */ - - if (instr != FASTTRAP_INSTR) return (0); /* Did someone change it? 
If so, just leave */ - - return patchInst(p->task, tp->ftt_pc, tp->ftt_instr); /* Patch the old instruction back in and flush it */ -} - -static void -fasttrap_return_common(ppc_saved_state_t *sv, user_addr_t pc, pid_t pid, user_addr_t new_pc) -{ - - fasttrap_tracepoint_t *tp; - fasttrap_bucket_t *bucket; - fasttrap_id_t *id; - lck_mtx_t *pid_mtx; - - pid_mtx = &cpu_core[CPU->cpu_id].cpuc_pid_lock; - lck_mtx_lock(pid_mtx); - bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)]; - - for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) { - if (pid == tp->ftt_pid && pc == tp->ftt_pc && - tp->ftt_proc->ftpc_acount != 0) - break; - } - - /* - * Don't sweat it if we can't find the tracepoint again. Unlike - * when we're in fasttrap_pid_probe(), finding the tracepoint here - * is not essential to the correct execution of the process. - */ - if (tp == NULL) { - lck_mtx_unlock(pid_mtx); - return; - } - - for (id = tp->ftt_retids; id != NULL; id = id->fti_next) { - /* - * If there's a branch that could act as a return site, we - * need to trace it, and check here if the program counter is - * external to the function. - */ - if((new_pc - id->fti_probe->ftp_faddr) < id->fti_probe->ftp_fsize) /* Is target within the function? */ - continue; /* Yeah, skip this one... 
*/ - - DTRACE_CPUFLAG_SET(CPU_DTRACE_USTACK_FP); - if (ISSET(current_proc()->p_lflag, P_LNOATTACH)) { - dtrace_probe(dtrace_probeid_error, 0 /* state */, - id->fti_probe->ftp_id, 1 /* ndx */, -1 /* offset */, - DTRACEFLT_UPRIV); - } else { - dtrace_probe(id->fti_probe->ftp_id, - pc - id->fti_probe->ftp_faddr, - sv->save_r3, sv->save_r4, 0, 0); - } - DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_USTACK_FP); - } - - lck_mtx_unlock(pid_mtx); -} - -static void -fasttrap_usdt_args(fasttrap_probe_t *probe, ppc_saved_state_t *sv, int argc, - uint64_t *argv) -{ - int i, x, cap = MIN(argc, probe->ftp_nargs); - uint32_t farg; - - for (i = 0; i < cap; i++) { - x = probe->ftp_argmap[i]; - - if (x <= 8) { /* Is this argument in a register? */ - argv[i] = (&sv->save_r0)[x]; - } else { - if(sv->save_srr1 & 0x8000000000000000ULL) { /* Are we running in 64-bit? */ - fasttrap_fuword64_noerr(sv->save_r1 + 48 + (x * sizeof(uint64_t)), &argv[i]); /* Grab argument > 8 from stack */ - } - else { - fasttrap_fuword32_noerr(sv->save_r1 + 24 + (x * sizeof(uint32_t)), &farg); /* Grab argument > 8 from stack */ - argv[i] = (uint64_t)farg; /* Convert to 64-bit */ - } - } - } - - for (; i < argc; i++) { - argv[i] = 0; - } -} - -int -fasttrap_pid_probe(ppc_saved_state_t *sv) -{ - proc_t *p = current_proc(); - fasttrap_bucket_t *bucket; - lck_mtx_t *pid_mtx; - fasttrap_tracepoint_t *tp, tp_local; - pid_t pid; - dtrace_icookie_t cookie; - uint_t is_enabled = 0; - user_addr_t new_pc = 0; - user_addr_t pc; - user_addr_t addrmask; - - pc = sv->save_srr0; /* Remember the PC for later */ - if(sv->save_srr1 & 0x8000000000000000ULL) addrmask = 0xFFFFFFFFFFFFFFFFULL; /* Set 64-bit addressing if enabled */ - else addrmask = 0x00000000FFFFFFFFULL; /* Otherwise set 32-bit */ - - uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread()); - - /* - * Clear all user tracing flags. - */ - uthread->t_dtrace_ft = 0; - - /* - * Treat a child created by a call to vfork(2) as if it were its - * parent. 
We know that there's only one thread of control in such a - * process: this one. - */ - /* - * APPLE NOTE: Terry says: "You need to hold the process locks (currently: kernel funnel) for this traversal" - * FIXME: How do we assert this? - */ - while (p->p_lflag & P_LINVFORK) p = p->p_pptr; /* Search the end */ - - pid = p->p_pid; - pid_mtx = &cpu_core[CPU->cpu_id].cpuc_pid_lock; - lck_mtx_lock(pid_mtx); - bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, sv->save_srr0)]; /* Get the bucket that corresponds to out PC */ - - /* - * Lookup the tracepoint that the process just hit. - */ - for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) { - if (pid == tp->ftt_pid && (sv->save_srr0 == tp->ftt_pc) && - tp->ftt_proc->ftpc_acount != 0) - break; - } - - /* - * If we couldn't find a matching tracepoint, either a tracepoint has - * been inserted without using the pid ioctl interface (see - * fasttrap_ioctl), or somehow we have mislaid this tracepoint. - */ - if (tp == NULL) { - lck_mtx_unlock(pid_mtx); - return (-1); - } - - if (tp->ftt_ids != NULL) { - fasttrap_id_t *id; - - for (id = tp->ftt_ids; id != NULL; id = id->fti_next) { - fasttrap_probe_t *probe = id->fti_probe; - - if (ISSET(current_proc()->p_lflag, P_LNOATTACH)) { - dtrace_probe(dtrace_probeid_error, 0 /* state */, - id->fti_probe->ftp_id, 1 /* ndx */, -1 /* offset */, - DTRACEFLT_UPRIV); - } else if (id->fti_ptype == DTFTP_ENTRY) { - /* - * We note that this was an entry - * probe to help ustack() find the - * first caller. 
- */ - cookie = dtrace_interrupt_disable(); - DTRACE_CPUFLAG_SET(CPU_DTRACE_USTACK_FP | CPU_DTRACE_ENTRY); - dtrace_probe(probe->ftp_id, sv->save_r3, sv->save_r4, /* Call the main probe routine with the first 5 args */ - sv->save_r5, sv->save_r6, sv->save_r7); - DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_USTACK_FP | CPU_DTRACE_ENTRY); - dtrace_interrupt_enable(cookie); - - } else if (id->fti_ptype == DTFTP_IS_ENABLED) { - /* - * Note that in this case, we don't - * call dtrace_probe() since it's only - * an artificial probe meant to change - * the flow of control so that it - * encounters the true probe. - */ - is_enabled = 1; - - } else if (probe->ftp_argmap == NULL) { - DTRACE_CPUFLAG_SET(CPU_DTRACE_USTACK_FP); - dtrace_probe(probe->ftp_id, sv->save_r3, sv->save_r4, /* Call the main probe routine with the first 5 args */ - sv->save_r5, sv->save_r6, sv->save_r7); - DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_USTACK_FP); - - } else { - uint64_t t[5]; - - fasttrap_usdt_args(probe, sv, 5, t); /* Grab 5 arguments */ - - DTRACE_CPUFLAG_SET(CPU_DTRACE_USTACK_FP); - dtrace_probe(probe->ftp_id, t[0], t[1], - t[2], t[3], t[4]); - DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_USTACK_FP); - } - - /* APPLE NOTE: Oneshot probes get one and only one chance... */ - if (probe->ftp_prov->ftp_provider_type == DTFTP_PROVIDER_ONESHOT) { - fasttrap_tracepoint_remove(p, tp); - } - } - } - - /* - * We're about to do a bunch of work so we cache a local copy of - * the tracepoint to emulate the instruction, and then find the - * tracepoint again later if we need to light up any return probes. - */ - tp_local = *tp; - lck_mtx_unlock(pid_mtx); - tp = &tp_local; - - /* - * If there's an is-enabled probe connected to this tracepoint it - * means that there was a 'xor r3,r3,r3' - * instruction that was placed there by DTrace when the binary was - * linked. As this probe is, in fact, enabled, we need to stuff 1 - * into R3. Accordingly, we can bypass all the instruction - * emulation logic since we know the inevitable result. 
It's possible - * that a user could construct a scenario where the 'is-enabled' - * probe was on some other instruction, but that would be a rather - * exotic way to shoot oneself in the foot. - */ - if (is_enabled) { - sv->save_r3 = 1; /* Set condition to true */ - new_pc = (sv->save_srr0 + 4) & addrmask; /* Just fall through to the next instruction */ - goto done; - } - - /* - * We emulate certain types of instructions to ensure correctness - * (in the case of position dependent instructions) or optimize - * common cases. The rest we execute in the kernel, but with - * most of the user's context active. - */ - switch (tp->ftt_type) { - - case ftmtNOP: /* NOP */ - new_pc = (sv->save_srr0 + 4) & addrmask; /* Just fall through to the next instruction */ - break; - - case ftmtB: /* Plain unconditional branch */ - new_pc = (user_addr_t)((int64_t)tp->ftt_trgt); /* Assume target is absolute address for the moment */ - if(!(tp->ftt_flgs & ftmtAbs)) new_pc = (new_pc + sv->save_srr0) & addrmask; /* We don't have absolute address, use as offset from instruction address */ - - if(tp->ftt_flgs & ftmtLink) sv->save_lr = (sv->save_srr0 + 4) & addrmask; /* Set the LR to the next instruction if needed */ - break; - - case ftmtBC: /* Conditional PC relative or absolute branch */ - new_pc = (user_addr_t)((int64_t)tp->ftt_trgt); /* Assume target is absolute address for the moment */ - if(!(tp->ftt_flgs & ftmtAbs)) new_pc = new_pc + sv->save_srr0; /* We don't have absolute address, use as offset from instruction address */ - - if(tp->ftt_flgs & ftmtLink) sv->save_lr = (sv->save_srr0 + 4) & addrmask; /* Set the LR to the next instruction if needed */ - if(!branchtaken(tp->ftt_bo, tp->ftt_bi, sv)) new_pc = (sv->save_srr0 + 4) & addrmask; /* If branch was not taken, set PC to next address */ - break; - - case ftmtBLR: /* Conditional branch to LR */ - new_pc = sv->save_lr; /* Branch target comes from the LR */ - - if(tp->ftt_flgs & ftmtLink) sv->save_lr = (sv->save_srr0 + 4) & addrmask; 
/* Set the LR to the next instruction if needed */ - if(!branchtaken(tp->ftt_bo, tp->ftt_bi, sv)) new_pc = (sv->save_srr0 + 4) & addrmask; /* If branch was not taken, set PC to next address */ - break; - - case ftmtBCTR: /* Conditional branch to CTR */ - new_pc = sv->save_ctr; /* Branch target comes from the CTR */ - - if(tp->ftt_flgs & ftmtLink) sv->save_lr = (sv->save_srr0 + 4) & addrmask; /* Set the LR to the next instruction if needed */ - if(!branchtaken(tp->ftt_bo, tp->ftt_bi, sv)) new_pc = (sv->save_srr0 + 4) & addrmask; /* If branch was not taken, set PC to next address */ - break; - - case ftmtCommon: /* Common, non-in-kernel emulated instruction */ - sv->save_instr[0] = 1; /* We only have one instruction to inject */ - sv->save_instr[1] = tp->ftt_instr; /* Set the instruction */ - sv->save_hdr.save_flags = sv->save_hdr.save_flags | SAVinject; /* Tell low-level exception return to inject the instruction */ - uthread->t_dtrace_step = 1; /* Let it be known that a trace return is imminent */ - return 0; /* Go and don't dome back until you are done... */ - - default: - panic("fasttrap_pid_probe: invalid ftt_type = %08X\n", tp->ftt_type); /* Huh, wha happened? */ - break; - } - - -done: - - /* - * If there were no return probes when we first found the tracepoint, - * we should feel no obligation to honor any return probes that were - * subsequently enabled -- they'll just have to wait until the next - * time around. - */ - sv->save_srr0 = new_pc; /* Set the new PC */ - if (tp->ftt_retids != NULL) fasttrap_return_common(sv, pc, pid, new_pc); - - return (0); -} - - -int -fasttrap_return_probe(ppc_saved_state_t *sv) -{ - - user_addr_t pc, npc; - - proc_t *p = current_proc(); - - - /* - * Treat a child created by a call to vfork(2) as if it were its - * parent. We know that there's only one thread of control in such a - * process: this one. 
- */ - /* - * APPLE NOTE: Terry says: "You need to hold the process locks (currently: kernel funnel) for this traversal" - * How do we assert this? - */ - while (p->p_lflag & P_LINVFORK) { - p = p->p_pptr; - } - - pc = sv->save_srr0; /* Get the PC of the probed instruction */ - npc = pc + 4; /* Get next PC */ - if(!(sv->save_srr1 & 0x8000000000000000ULL)) npc &= 0x00000000FFFFFFFF; /* Wrap new PC if running 32-bit */ - fasttrap_return_common(sv, pc, p->p_pid, npc); - - return (0); -} - -uint64_t -fasttrap_pid_getarg(void *arg, dtrace_id_t id, void *parg, int argno, - int aframes) -{ -#pragma unused(arg, id, parg, aframes) - return (fasttrap_anarg((ppc_saved_state_t *)find_user_regs(current_thread()), 1, argno)); -} - -uint64_t -fasttrap_usdt_getarg(void *arg, dtrace_id_t id, void *parg, int argno, - int aframes) -{ -#pragma unused(arg, id, parg, aframes) - return (fasttrap_anarg((ppc_saved_state_t *)find_user_regs(current_thread()), 0, argno)); -} - - -static int32_t branchtaken(int32_t bo, int32_t bi, ppc_saved_state_t *sv) { - int32_t bcond, czero, crmatch; - uint64_t ctr; - - if((bo & 0x14) == 0x14) return 1; /* If this is a branch always, exit with true... */ - - czero = 0; /* Assume that we have not just decremented the CTR to 0 */ - - if(!(bo & 4)) { /* Skip the next bit if we do NOT muck with the CTR */ - ctr = sv->save_ctr = sv->save_ctr - 1; /* Decrement the CTR */ - if(!(sv->save_srr1 & 0x8000000000000000ULL)) ctr &= 0x00000000FFFFFFFF; /* Only look at the bottom 32 bits if 32-bit mode */ - czero = (ctr == 0); /* Remember if we just hit zero */ - } - - bcond = (bo >> 3); /* If 1, branch if CR flag is 1. If 0, branch if 0 */ - crmatch = bo >> 4; /* If bo[0] is set, do not check CR flag */ - crmatch = crmatch | (((sv->save_cr >> (31 - bi)) ^ bcond) ^ 1); /* Low bit is now set if CR flag matches or CR is not checked. Other bits are trash. 
*/ - -// dbgTrace(0x77777777, bo, bi, sv->save_cr, ((czero | crmatch) & 1)); /* (TRACE/DEBUG) */ - - return ((czero | crmatch) & 1); /* Return 1 if branch taken, 0 if not... */ -} - -static int32_t dtrace_decode_ppc(uint32_t inst) { - - int32_t curdcd, lastmask, newmask, spr, bit, bito, word; - uint16_t xop = 0; - dcdtab *dcd; - - curdcd = inst >> 26; /* Isolate major op code to start decode */ - lastmask = 99; /* Always force a new xop at the start */ - - while(1) { /* Loop until we find instruction or fail */ - dcd = &insts[curdcd]; /* Point to the current decode table entry */ - if(dcd->dcdFlgs & dcdJump) { /* Should we jump to a new spot in the decode table? */ - curdcd = dcd->dcdMatch; /* Jump */ - continue; - } - - newmask = dcd->dcdFlgs & dcdMask; /* Isolate the mask index */ - if(lastmask != newmask) { /* Are we changing masks? */ - if(!newmask) break; /* If the mask is 0, we match everything and succeed... (note: lastmask can never be 0) */ - xop = inst & masktab[newmask]; /* Clear all extra bits to make match */ - lastmask = newmask; /* Remember */ - } - - if(xop == dcd->dcdMatch) break; /* We found our guy! */ - - if(!(dcd->dcdFlgs & dcdStep)) { /* No stepping, we failed */ - dcd = &dcdfail; /* Point to a failure entry */ - break; /* Leave... */ - } - - curdcd = curdcd + 1; /* Step to the next decode entry */ - } - - if(dcd->dcdType != diSPR) return (int32_t)(dcd->dcdType); /* Return what we found */ - - spr = (inst >> (31 - 20)) & 0x3FF; /* Get the source */ - spr = ((spr << 5) & 0x3E0) | ((spr >> 5) & 0x1F); /* Flip to right order */ - - word = spr >> 5; /* Get word index into table */ - bito = spr & 0x1F; /* Get bit offset into entry */ - bit = 0x80000000 >> bito; /* Position bit for a test */ - - if(!(sprtbl[word] & bit)) return (diINV); /* Bogus SPR so whole instruction is invalid... */ - - if(spr & 0x10) return (diPRV); /* This is a priviliged SPR so instruction is priviliged... 
*/ - return (diCMN); /* Just a common SPR so instruction is the same... */ -} diff --git a/bsd/dev/ppc/fbt_ppc.c b/bsd/dev/ppc/fbt_ppc.c deleted file mode 100644 index 0a505d23e..000000000 --- a/bsd/dev/ppc/fbt_ppc.c +++ /dev/null @@ -1,694 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* #pragma ident "@(#)fbt.c 1.15 05/09/19 SMI" */ - -#ifdef KERNEL -#ifndef _KERNEL -#define _KERNEL /* Solaris vs. 
Darwin */ -#endif -#endif - -#define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from mach/ppc/thread_status.h */ -#include -#include -#include - -#include -#include - -extern struct mach_header _mh_execute_header; /* the kernel's mach header */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include - -#define DTRACE_INVOP_NOP_SKIP 4 - -#define DTRACE_INVOP_MFLR_R0 11 -#define DTRACE_INVOP_MFLR_R0_SKIP 4 - -#define FBT_MFLR_R0 0x7c0802a6 - -#define FBT_MTLR_R0 0x7c0803a6 -#define FBT_BLR 0x4e800020 -#define FBT_BCTR 0x4e800420 - -#define FBT_LI_MASK 0x03fffffc -#define FBT_JUMP 0x48000000 -#define IS_JUMP(instr) (((instr) & ~FBT_LI_MASK) == FBT_JUMP) /* Relative, No LR update -- AA == 0b, LK == 0b */ -#define FBT_LI_EXTD64(instr) \ - (((instr) & 0x02000000) ? \ - (((uint64_t)((instr) & FBT_LI_MASK)) | 0xfffffffffc000000ULL) : \ - ((uint64_t)((instr) & FBT_LI_MASK))) - -#define FBT_PATCHVAL 0x7c810808 -#define FBT_AFRAMES_ENTRY 6 -#define FBT_AFRAMES_RETURN 6 - -#define FBT_ENTRY "entry" -#define FBT_RETURN "return" -#define FBT_ADDR2NDX(addr) ((((uintptr_t)(addr)) >> 4) & fbt_probetab_mask) - -extern dtrace_provider_id_t fbt_id; -extern fbt_probe_t **fbt_probetab; -extern int fbt_probetab_mask; - -kern_return_t fbt_perfCallback(int, ppc_saved_state_t *, int, int); -kern_return_t fbt_perfIntCallback(int, ppc_saved_state_t *, int, int); - -/* - * Critical routines that must not be probed. PR_5221096, PR_5379018. 
- */ - -static const char * critical_blacklist[] = -{ - "bcopy_phys", - "bcopy_physvir_32", - "cpu_control", - "cpu_exit_wait", - "cpu_info", - "cpu_info_count", - "cpu_init", - "cpu_machine_init", - "cpu_per_proc_alloc", - "cpu_per_proc_free", - "cpu_signal_handler", - "cpu_sleep", - "cpu_start", - "cpu_subtype", - "cpu_threadtype", - "cpu_to_processor", - "cpu_type", - "mapSkipListVerifyC", - "ml_nofault_copy", - "register_cpu_setup_func", - "unregister_cpu_setup_func" -}; -#define CRITICAL_BLACKLIST_COUNT (sizeof(critical_blacklist)/sizeof(critical_blacklist[0])) - -/* - * The transitive closure of entry points that can be reached from probe context. - * (Apart from routines whose names begin with dtrace_). - */ -static const char * probe_ctx_closure[] = -{ - "Debugger", - "MapUserMemoryWindow", - "OSCompareAndSwap", - "absolutetime_to_microtime", - "bcopy", - "clock_get_calendar_nanotime_nowait", - "copyin", - "copyinstr", - "copyout", - "copyoutstr", - "cpu_number", - "current_proc", - "current_processor", - "current_task", - "current_thread", - "debug_enter", - "find_user_regs", - "getPerProc", - "get_bsdtask_info", - "get_bsdthread_info", - "get_threadtask", - "hw_atomic_and", - "hw_compare_and_store", - "hw_find_map", - "kauth_cred_get", - "kauth_getgid", - "kauth_getuid", - "mach_absolute_time", - "mapping_drop_busy", - "mapping_find", - "mapping_phys_lookup", - "max_valid_stack_address", - "ml_at_interrupt_context", - "ml_phys_write_byte_64", - "ml_phys_write_half_64", - "ml_phys_write_word_64", - "ml_set_interrupts_enabled", - "panic", - "pmap_find_phys", - "prf", - "proc_is64bit", - "proc_selfname", - "proc_selfpid", - "proc_selfppid", - "psignal_lock", - "sdt_getargdesc", - "splhigh", - "splx", - "strlcpy", - "systrace_stub", - "timer_grab" -}; -#define PROBE_CTX_CLOSURE_COUNT (sizeof(probe_ctx_closure)/sizeof(probe_ctx_closure[0])) - -static int _cmp(const void *a, const void *b) -{ - return strncmp((const char *)a, *(const char **)b, strlen((const 
char *)a) + 1); -} - -static const void * bsearch( - register const void *key, - const void *base0, - size_t nmemb, - register size_t size, - register int (*compar)(const void *, const void *)) { - - register const char *base = base0; - register size_t lim; - register int cmp; - register const void *p; - - for (lim = nmemb; lim != 0; lim >>= 1) { - p = base + (lim >> 1) * size; - cmp = (*compar)(key, p); - if (cmp == 0) - return p; - if (cmp > 0) { /* key > p: move right */ - base = (const char *)p + size; - lim--; - } /* else move left */ - } - return (NULL); -} - -int -fbt_invop(uintptr_t addr, uintptr_t *stack, uintptr_t rval) -{ - fbt_probe_t *fbt = fbt_probetab[FBT_ADDR2NDX(addr)]; - uint64_t mask = (_cpu_capabilities & k64Bit) ? 0xffffffffffffffffULL : 0x00000000ffffffffULL; - - for (; fbt != NULL; fbt = fbt->fbtp_hashnext) { - if ((uintptr_t)fbt->fbtp_patchpoint == addr) { - - if (fbt->fbtp_roffset == 0) { - ppc_saved_state_t *regs = (ppc_saved_state_t *)stack; - - CPU->cpu_dtrace_caller = regs->save_lr; - - dtrace_probe(fbt->fbtp_id, regs->save_r3 & mask, regs->save_r4 & mask, - regs->save_r5 & mask, regs->save_r6 & mask, regs->save_r7 & mask); - - CPU->cpu_dtrace_caller = (uintptr_t)NULL; - } else { - - dtrace_probe(fbt->fbtp_id, fbt->fbtp_roffset, rval, 0, 0, 0); - - if (fbt->fbtp_rval == DTRACE_INVOP_TAILJUMP) { - ppc_saved_state_t *regs = (ppc_saved_state_t *)stack; - - regs->save_srr0 = (uint64_t)fbt->fbtp_patchpoint + FBT_LI_EXTD64(fbt->fbtp_savedval); - regs->save_srr0 &= mask; - } - - CPU->cpu_dtrace_caller = (uintptr_t)NULL; - } - - return (fbt->fbtp_rval); - } - } - - return (0); -} - -#include /* For USER_MODE */ -#define IS_USER_TRAP(regs) USER_MODE((regs)->save_srr1) -#define T_VECTOR_SIZE 4 /* function pointer size */ -#define T_PROGRAM (0x07 * T_VECTOR_SIZE) -#define FBT_EXCEPTION_CODE T_PROGRAM - -kern_return_t -fbt_perfCallback( - int trapno, - ppc_saved_state_t *regs, - int unused1, - int unused2) -{ -#pragma unused (unused1) -#pragma 
unused (unused2) - kern_return_t retval = KERN_FAILURE; - - if (!IS_USER_TRAP(regs) && FBT_EXCEPTION_CODE == trapno) { - boolean_t oldlevel; - - oldlevel = ml_set_interrupts_enabled(FALSE); - - switch (dtrace_invop( regs->save_srr0, (uintptr_t *)regs, regs->save_r3 )) { - case DTRACE_INVOP_NOP: - regs->save_srr0 += DTRACE_INVOP_NOP_SKIP; /* Skip over the bytes of the patched NOP */ - retval = KERN_SUCCESS; - break; - - case DTRACE_INVOP_MFLR_R0: - regs->save_r0 = regs->save_lr; /* Emulate patched mflr r0 */ - regs->save_srr0 += DTRACE_INVOP_MFLR_R0_SKIP; /* Skip over the bytes of the patched mflr r0 */ - retval = KERN_SUCCESS; - break; - - case DTRACE_INVOP_RET: - regs->save_srr0 = regs->save_lr; /* Emulate patched blr by resuming execution at the LR */ - retval = KERN_SUCCESS; - break; - - case DTRACE_INVOP_BCTR: - regs->save_srr0 = regs->save_ctr; /* Emulate patched bctr by resuming execution at the CTR */ - retval = KERN_SUCCESS; - break; - - case DTRACE_INVOP_TAILJUMP: - retval = KERN_SUCCESS; - break; - - default: - retval = KERN_FAILURE; - break; - } - ml_set_interrupts_enabled(oldlevel); - } - - return retval; -} - -kern_return_t -fbt_perfIntCallback( - int trapno, - ppc_saved_state_t *regs, - int unused1, - int unused2) -{ - kern_return_t retval = KERN_FAILURE; - - if (KERN_SUCCESS == (retval = fbt_perfCallback(trapno, regs, unused1, unused2))) - enable_preemption(); - - return retval; -} - -/*ARGSUSED*/ -static void -__fbt_provide_module(void *arg, struct modctl *ctl) -{ -#pragma unused(arg) - struct mach_header *mh; - struct load_command *cmd; - struct segment_command *orig_ts = NULL, *orig_le = NULL; - struct symtab_command *orig_st = NULL; - struct nlist *sym = NULL; - char *strings; - uintptr_t instrLow, instrHigh; - char *modname; - unsigned int i; - - int gIgnoreFBTBlacklist = 0; - PE_parse_boot_argn("IgnoreFBTBlacklist", &gIgnoreFBTBlacklist, sizeof (gIgnoreFBTBlacklist)); - - mh = (struct mach_header *)(ctl->address); - modname = ctl->mod_modname; 
- - if (0 == ctl->address || 0 == ctl->size) /* Has the linker been jettisoned? */ - return; - - /* - * Employees of dtrace and their families are ineligible. Void - * where prohibited. - */ - - if (LIT_STRNEQL(modname, "com.apple.driver.dtrace")) - return; - - if (strstr(modname, "CHUD") != NULL) - return; - - if (mh->magic != MH_MAGIC) - return; - - cmd = (struct load_command *) &mh[1]; - for (i = 0; i < mh->ncmds; i++) { - if (cmd->cmd == LC_SEGMENT) { - struct segment_command *orig_sg = (struct segment_command *) cmd; - - if (LIT_STRNEQL(orig_sg->segname, SEG_TEXT)) - orig_ts = orig_sg; - else if (LIT_STRNEQL(orig_sg->segname, SEG_LINKEDIT)) - orig_le = orig_sg; - else if (LIT_STRNEQL(orig_sg->segname, "")) - orig_ts = orig_sg; /* kexts have a single unnamed segment */ - } - else if (cmd->cmd == LC_SYMTAB) - orig_st = (struct symtab_command *) cmd; - - cmd = (struct load_command *) ((caddr_t) cmd + cmd->cmdsize); - } - - if ((orig_ts == NULL) || (orig_st == NULL) || (orig_le == NULL)) - return; - - sym = (struct nlist *)(orig_le->vmaddr + orig_st->symoff - orig_le->fileoff); - strings = (char *)(orig_le->vmaddr + orig_st->stroff - orig_le->fileoff); - - /* Find extent of the TEXT section */ - instrLow = (uintptr_t)orig_ts->vmaddr; - instrHigh = (uintptr_t)(orig_ts->vmaddr + orig_ts->vmsize); - - for (i = 0; i < orig_st->nsyms; i++) { - fbt_probe_t *fbt, *retfbt; - machine_inst_t *instr, *limit, theInstr; - uint8_t n_type = sym[i].n_type & (N_TYPE | N_EXT); - char *name = strings + sym[i].n_un.n_strx; - int j; - - /* Check that the symbol is a global and that it has a name. */ - if (((N_SECT | N_EXT) != n_type && (N_ABS | N_EXT) != n_type)) - continue; - - if (0 == sym[i].n_un.n_strx) /* iff a null, "", name. */ - continue; - - /* Lop off omnipresent leading underscore. 
*/ - if (*name == '_') - name += 1; - - if (LIT_STRNSTART(name, "dtrace_") && !LIT_STRNSTART(name, "dtrace_safe_")) { - /* - * Anything beginning with "dtrace_" may be called - * from probe context unless it explitly indicates - * that it won't be called from probe context by - * using the prefix "dtrace_safe_". - */ - continue; - } - - if (LIT_STRNSTART(name, "fasttrap_") || - LIT_STRNSTART(name, "fuword") || - LIT_STRNSTART(name, "suword") || - LIT_STRNEQL(name, "sprlock") || - LIT_STRNEQL(name, "sprunlock") || - LIT_STRNEQL(name, "uread") || - LIT_STRNEQL(name, "uwrite")) - continue; /* Fasttrap inner-workings. */ - - if (LIT_STRNSTART(name, "dsmos_")) - continue; /* Don't Steal Mac OS X! */ - - if (LIT_STRNSTART(name, "_dtrace")) - continue; /* Shims in dtrace.c */ - - if (LIT_STRNSTART(name, "chud")) - continue; /* Professional courtesy. */ - - if (LIT_STRNSTART(name, "hibernate_")) - continue; /* Let sleeping dogs lie. */ - - if (LIT_STRNEQL(name, "_ZN9IOService14newTemperatureElPS_") || /* IOService::newTemperature */ - LIT_STRNEQL(name, "_ZN9IOService26temperatureCriticalForZoneEPS_")) /* IOService::temperatureCriticalForZone */ - continue; /* Per the fire code */ - - /* - * Place no probes (illegal instructions) in the exception handling path! 
- */ - if (LIT_STRNEQL(name, "L_handler700") || - LIT_STRNEQL(name, "save_get_phys_64") || - LIT_STRNEQL(name, "save_get_phys_32") || - LIT_STRNEQL(name, "EmulExit") || - LIT_STRNEQL(name, "Emulate") || - LIT_STRNEQL(name, "Emulate64") || - LIT_STRNEQL(name, "switchSegs") || - LIT_STRNEQL(name, "save_ret_phys")) - continue; - - if (LIT_STRNEQL(name, "thandler") || - LIT_STRNEQL(name, "versave") || - LIT_STRNEQL(name, "timer_event") || - LIT_STRNEQL(name, "hw_atomic_or") || - LIT_STRNEQL(name, "trap")) - continue; - - if (LIT_STRNEQL(name, "fbt_perfCallback") || - LIT_STRNEQL(name, "fbt_perfIntCallback") || - LIT_STRNEQL(name, "ml_set_interrupts_enabled") || - LIT_STRNEQL(name, "dtrace_invop") || - LIT_STRNEQL(name, "fbt_invop") || - LIT_STRNEQL(name, "sdt_invop") || - LIT_STRNEQL(name, "max_valid_stack_address")) - continue; - - /* - * Probes encountered while we're on the interrupt stack are routed along - * the interrupt handling path. No probes allowed there either! - */ - if (LIT_STRNEQL(name, "ihandler") || - LIT_STRNEQL(name, "interrupt") || - LIT_STRNEQL(name, "disable_preemption")) - continue; - - /* - * Avoid weird stack voodoo in and under machine_stack_handoff et al - */ - if (LIT_STRNSTART(name, "machine_stack") || - LIT_STRNEQL(name, "getPerProc") || /* Called in machine_stack_handoff with weird stack state */ - LIT_STRNEQL(name, "fpu_save") || /* Called in machine_stack_handoff with weird stack state */ - LIT_STRNEQL(name, "vec_save") || /* Called in machine_stack_handoff with weird stack state */ - LIT_STRNEQL(name, "pmap_switch")) /* Called in machine_stack_handoff with weird stack state */ - continue; - - /* - * Avoid machine_ routines. PR_5346750. - */ - if (LIT_STRNSTART(name, "machine_")) - continue; - - /* - * Avoid low level pmap and virtual machine monitor PowerPC routines. See PR_5379018. 
- */ - - if (LIT_STRNSTART(name, "hw_") || - LIT_STRNSTART(name, "mapping_") || - LIT_STRNSTART(name, "commpage_") || - LIT_STRNSTART(name, "pmap_") || - LIT_STRNSTART(name, "vmm_")) - continue; - /* - * Place no probes on critical routines. PR_5221096 - */ - if (!gIgnoreFBTBlacklist && - bsearch( name, critical_blacklist, CRITICAL_BLACKLIST_COUNT, sizeof(name), _cmp ) != NULL) - continue; - - /* - * Place no probes that could be hit in probe context. - */ - if (!gIgnoreFBTBlacklist && - bsearch( name, probe_ctx_closure, PROBE_CTX_CLOSURE_COUNT, sizeof(name), _cmp ) != NULL) - continue; - - /* - * Place no probes that could be hit on the way to the debugger. - */ - if (LIT_STRNSTART(name, "kdp_") || - LIT_STRNSTART(name, "kdb_") || - LIT_STRNSTART(name, "kdbg_") || - LIT_STRNSTART(name, "kdebug_") || - LIT_STRNEQL(name, "kernel_debug") || - LIT_STRNEQL(name, "Debugger") || - LIT_STRNEQL(name, "Call_DebuggerC") || - LIT_STRNEQL(name, "lock_debugger") || - LIT_STRNEQL(name, "unlock_debugger") || - LIT_STRNEQL(name, "SysChoked")) - continue; - - /* - * Place no probes that could be hit on the way to a panic. - */ - if (NULL != strstr(name, "panic_") || - LIT_STRNEQL(name, "panic") || - LIT_STRNEQL(name, "handleMck") || - LIT_STRNEQL(name, "unresolved_kernel_trap")) - continue; - - if (dtrace_probe_lookup(fbt_id, modname, name, NULL) != 0) - continue; - - /* - * Scan forward for mflr r0. - */ - for (j = 0, instr = (machine_inst_t *)sym[i].n_value, theInstr = 0; - (j < 4) && ((uintptr_t)instr >= instrLow) && (instrHigh > (uintptr_t)instr); - j++, instr++) - { - theInstr = *instr; - if (theInstr == FBT_MFLR_R0) /* Place the entry probe here. */ - break; - if (theInstr == FBT_MTLR_R0) /* We've gone too far, bail. */ - break; - if (theInstr == FBT_BLR) /* We've gone too far, bail. 
*/ - break; - } - - if (theInstr != FBT_MFLR_R0) - continue; - - limit = (machine_inst_t *)instrHigh; - - fbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP); - strlcpy( (char *)&(fbt->fbtp_name), name, MAX_FBTP_NAME_CHARS ); - fbt->fbtp_id = dtrace_probe_create(fbt_id, modname, name, FBT_ENTRY, FBT_AFRAMES_ENTRY, fbt); - fbt->fbtp_patchpoint = instr; - fbt->fbtp_ctl = ctl; - fbt->fbtp_loadcnt = ctl->mod_loadcnt; - fbt->fbtp_rval = DTRACE_INVOP_MFLR_R0; - fbt->fbtp_savedval = theInstr; - fbt->fbtp_patchval = FBT_PATCHVAL; - - fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)]; - fbt->fbtp_symndx = i; - fbt_probetab[FBT_ADDR2NDX(instr)] = fbt; - - instr++; /* Move on down the line */ - retfbt = NULL; -again: - if (instr >= limit) - continue; - - /* - * We (desperately) want to avoid erroneously instrumenting a - * jump table. To determine if we're looking at a true instruction - * or an inline jump table that happens to contain the same - * byte sequences, we resort to some heuristic sleeze: we - * treat this instruction as being contained within a pointer, - * and see if that pointer points to within the body of the - * function. If it does, we refuse to instrument it. - */ - { - machine_inst_t *ptr = *(machine_inst_t **)instr; - - if (ptr >= (machine_inst_t *)sym[i].n_value && ptr < limit) { - instr++; - goto again; - } - } - - /* - * OK, it's an instruction. - */ - theInstr = *instr; - - /* Walked onto the start of the next routine? If so, bail out from this function. */ - if (theInstr == FBT_MFLR_R0) - continue; - - if (theInstr != FBT_MTLR_R0) { - instr++; - goto again; - } - - /* - * Found mtlr r0; - * Scan forward for a blr, bctr, or a jump (relative, no LR change). 
- */ - instr++; - for (j = 0; (j < 12) && (instr < limit); j++, instr++) { - theInstr = *instr; - if (theInstr == FBT_BLR || theInstr == FBT_BCTR || IS_JUMP(theInstr) || - theInstr == FBT_MFLR_R0 || theInstr == FBT_MTLR_R0) - break; - } - - if (!(theInstr == FBT_BLR || theInstr == FBT_BCTR || IS_JUMP(theInstr))) - goto again; - - /* - * We have a winner: "mtlr r0; ... ; {blr, bctr, j}" ! - */ - fbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP); - strlcpy( (char *)&(fbt->fbtp_name), name, MAX_FBTP_NAME_CHARS ); - - if (retfbt == NULL) { - fbt->fbtp_id = dtrace_probe_create(fbt_id, modname, - name, FBT_RETURN, FBT_AFRAMES_RETURN, fbt); - } else { - retfbt->fbtp_next = fbt; - fbt->fbtp_id = retfbt->fbtp_id; - } - - retfbt = fbt; - fbt->fbtp_patchpoint = instr; - fbt->fbtp_ctl = ctl; - fbt->fbtp_loadcnt = ctl->mod_loadcnt; - - if (theInstr == FBT_BLR) - fbt->fbtp_rval = DTRACE_INVOP_RET; - else if (theInstr == FBT_BCTR) - fbt->fbtp_rval = DTRACE_INVOP_BCTR; - else - fbt->fbtp_rval = DTRACE_INVOP_TAILJUMP; - - fbt->fbtp_roffset = - (uintptr_t)((uint8_t *)instr - (uint8_t *)sym[i].n_value); - - fbt->fbtp_savedval = *instr; - fbt->fbtp_patchval = FBT_PATCHVAL; - fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)]; - fbt->fbtp_symndx = i; - fbt_probetab[FBT_ADDR2NDX(instr)] = fbt; - instr++; - goto again; - } -} - -extern struct modctl g_fbt_kernctl; -#undef kmem_alloc /* from its binding to dt_kmem_alloc glue */ -#undef kmem_free /* from its binding to dt_kmem_free glue */ -#include - -/*ARGSUSED*/ -void -fbt_provide_module(void *arg, struct modctl *ctl) -{ -#pragma unused(ctl) - __fbt_provide_module(arg, &g_fbt_kernctl); - - if ( (vm_offset_t)g_fbt_kernctl.address != (vm_offset_t )NULL ) - kmem_free(kernel_map, (vm_offset_t)g_fbt_kernctl.address, round_page(g_fbt_kernctl.size)); - g_fbt_kernctl.address = 0; - g_fbt_kernctl.size = 0; -} diff --git a/bsd/dev/ppc/ffs.c b/bsd/dev/ppc/ffs.c deleted file mode 100644 index c3f06a74f..000000000 --- a/bsd/dev/ppc/ffs.c +++ 
/dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1991 NeXT Computer, Inc. All rights reserved. - * - * File: machdep/i386/libc/ffs.c - * Author: Bruce Martin, NeXT Computer, Inc. - * - * This file contains machine dependent code for the ffs function - * on NeXT i386-based products. Currently tuned for the i486. - * - * HISTORY - * 27-Sep-92 Bruce Martin (Bruce_Martin@NeXT.COM) - * Created: stolen from Mike's code. 
- */ - -unsigned -ffs(unsigned mask) -{ - unsigned bitpos; - - if (mask == 0) - return 0; - - bitpos = 1; - while ((mask & 0xff) == 0) { - bitpos += 8; - mask >>= 8; - } - while ((mask & 1) == 0) { - bitpos += 1; - mask >>= 1; - } - return bitpos; -} diff --git a/bsd/dev/ppc/ffs.s b/bsd/dev/ppc/ffs.s deleted file mode 100644 index 290053a82..000000000 --- a/bsd/dev/ppc/ffs.s +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1992, 1997-1998 Apple Computer, Inc. All rights reserved. 
- * - * File: machdep/ppc/libc/ffs.s - * - * int ffs(int value) - * - * DESCRIPTION - * The ffs() function finds the first bit set in value and returns the - * index of that bit. Bits are numbered starting from 1, starting at - * the right-most bit. A return value of 0 means that the argument was - * - * HISTORY - * 14-Aug-1998 Umesh Vaishampayan (umeshv@apple.com) - * Optimized! - * - * 10-Mar-1998 Matt Watson (mwatson@apple.com) - * Correctified - * - * 19-Jan-1998 Matt Watson (mwatson@apple.com) - * Simplified - * - * 24-Jan-1997 Umesh Vaishampayan (umeshv@NeXT.com) - * Ported to PPC. - */ - -.text -.align 4 -.globl _ffs -_ffs: /* Cycles */ - neg r0,r3 /* 0 */ - and r3,r0,r3 /* 1 */ - li r4, 32 /* 1 */ - cntlzw r3,r3 /* 2 */ - subf r3,r3,r4 /* 3 */ - blr - - .globl _abs -_abs: - srawi r0,r3,31 - xor r3,r0,r3 - subf r3,r0,r3 - blr - diff --git a/bsd/dev/ppc/kern_machdep.c b/bsd/dev/ppc/kern_machdep.c deleted file mode 100644 index 1f45bd131..000000000 --- a/bsd/dev/ppc/kern_machdep.c +++ /dev/null @@ -1,263 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (C) 1990, 1993 NeXT, Inc. - * Copyright (C) 1997 Apple Computer, Inc. - * - * File: next/kern_machdep.c - * Author: John Seamons - * - * Machine-specific kernel routines. - */ - -#include -#include -#include -#include -#include -#include -#include - -boolean_t kernacc(off_t, size_t ); - - -/* - * Routine: grade_binary() - * - * Function: - * Return a relative preference for exectypes and execsubtypes in fat - * executable files. The higher the grade, the higher the preference. - * A grade of 0 means not acceptable. - * - * Note: We really don't care about the real cpu_type() here, - * because machines can only have one type. - */ -int -grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype) -{ - int cpusubtype = cpu_subtype(); - - /* - * This code should match cpusubtype_findbestarch() in best_arch.c - * in the cctools project. As of 2/16/98 this is what has been - * agreed upon for the PowerPC subtypes. If an exact match is not - * found the subtype will be picked from the following order: - * 970(but only on 970), 7450, 7400, 750, ALL - * Note the 601 is NOT in the list above. It is only picked via - * an exact match. For details see Radar 2213821. 
- */ - - switch (cpusubtype) { - case CPU_SUBTYPE_POWERPC_970: - switch(exectype) { - case CPU_TYPE_POWERPC64: /* CPU_IS64BIT | CPU_POWERPC */ - switch(execsubtype) { - /* - * Prefer 64 bit architecture specific binaries; note - * that this value does not mean the same thing here - * as it does below. - */ - case CPU_SUBTYPE_POWERPC_970: - return 8; - /* Prefer generic binaries */ - case CPU_SUBTYPE_POWERPC_ALL: - return 7; - default: - return 0; - } - /* NOTREACHED */ - - case CPU_TYPE_POWERPC: - switch(execsubtype) { - /* - * Prefer 32 bit binaries with 64 bit leaf functions; - * this is actually bogus use of the subtype to encode - * CPU feature bits. - */ - case CPU_SUBTYPE_POWERPC_970: - return 6; - case CPU_SUBTYPE_POWERPC_7450: - return 4; - case CPU_SUBTYPE_POWERPC_7400: - return 3; - case CPU_SUBTYPE_POWERPC_750: - return 2; - case CPU_SUBTYPE_POWERPC_ALL: - return 1; - default: - return 0; - } - /* NOTREACHED */ - - default: - return 0; - } - /* NOTREACHED */ - - case CPU_SUBTYPE_POWERPC_7450: - switch(exectype) { - case CPU_TYPE_POWERPC64: /* CPU_IS64BIT | CPU_POWERPC */ - return 0; - - case CPU_TYPE_POWERPC: - switch(execsubtype) { - case CPU_SUBTYPE_POWERPC_7450: - return 6; - case CPU_SUBTYPE_POWERPC_7400: - return 4; - case CPU_SUBTYPE_POWERPC_750: - return 3; - case CPU_SUBTYPE_POWERPC_ALL: - return 1; - default: - return 0; - } - /* NOTREACHED */ - - default: - return 0; - } - /* NOTREACHED */ - - case CPU_SUBTYPE_POWERPC_7400: - switch(exectype) { - case CPU_TYPE_POWERPC64: /* CPU_IS64BIT | CPU_POWERPC */ - return 0; - - case CPU_TYPE_POWERPC: - switch(execsubtype) { - case CPU_SUBTYPE_POWERPC_7400: - return 6; - case CPU_SUBTYPE_POWERPC_7450: - return 4; - case CPU_SUBTYPE_POWERPC_750: - return 3; - case CPU_SUBTYPE_POWERPC_ALL: - return 1; - default: - return 0; - } - /* NOTREACHED */ - - default: - return 0; - } - /* NOTREACHED */ - - case CPU_SUBTYPE_POWERPC_750: - switch(exectype) { - case CPU_TYPE_POWERPC64: /* CPU_IS64BIT | CPU_POWERPC */ - 
return 0; - - case CPU_TYPE_POWERPC: - switch(execsubtype) { - case CPU_SUBTYPE_POWERPC_750: - return 6; -#ifndef ADDRESS_RADAR_2678019 - /* - * Currently implemented because dropping this would - * turn the executable subtype into a "has Altivec" - * flag, which we do not want to permit. It could - * also break working third party applications - * already in use in the field. - */ - case CPU_SUBTYPE_POWERPC_7400: - return 4; - case CPU_SUBTYPE_POWERPC_7450: - return 3; -#endif /* ADDRESS_RADAR_2678019 */ - case CPU_SUBTYPE_POWERPC_ALL: - return 1; - default: - return 0; - } - /* NOTREACHED */ - - default: - return 0; - } - /* NOTREACHED */ - - default: - switch(exectype) { - case CPU_TYPE_POWERPC64: /* CPU_IS64BIT | CPU_POWERPC */ - return 0; - - case CPU_TYPE_POWERPC: - /* Special case for PPC601 */ - if (cpusubtype == execsubtype) - return 6; - /* - * If we get here it is because it is a cpusubtype we - * don't support or a new cpusubtype that was added - * since this code was written. Both will be - * considered unacceptable. - */ - return 0; - /* NOTREACHED */ - - default: - return 0; - } - /* NOTREACHED */ - } - /* NOTREACHED */ -} - -extern vm_map_offset_t kvtophys64(vm_map_offset_t); - -boolean_t -kernacc( - off_t start, - size_t len -) -{ - off_t base; - off_t end; - - base = trunc_page_64(start); - end = start + len; - - while (base < end) { - if(kvtophys64((vm_map_offset_t)base) == (vm_map_offset_t)0) - return(FALSE); - base += page_size; - } - - return (TRUE); -} - -void -md_prepare_for_shutdown(int paniced, int howto, char * command); - -void -md_prepare_for_shutdown(__unused int paniced, __unused int howto, - __unused char * command) -{ - return; -} diff --git a/bsd/dev/ppc/km.c b/bsd/dev/ppc/km.c deleted file mode 100644 index e82d6be27..000000000 --- a/bsd/dev/ppc/km.c +++ /dev/null @@ -1,392 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1992 NeXT Computer, Inc. All rights reserved. - * - * km.m - kernel keyboard/monitor module, procedural interface. - * - * HISTORY - */ - -#include -#include - -#include -#include -#include -#include -#include /* for kmopen */ -#include -#include /* for kmopen */ -#include -#include -#include -#include - -/* - * 'Global' variables, shared only by this file and conf.c. - */ -struct tty *km_tty[1] = { 0 }; - -/* - * this works early on, after initialize_screen() but before autoconf (and thus - * before we have a kmDevice). 
- */ -int disableConsoleOutput; - -static int initialized = 0; - -extern void kminit(void); - -// used by or implemented in the osfmk project -extern void cnputcusr(char); // From osfmk -extern int cngetc(void); // From osfmk -extern void cons_cinput(char ch); // Used by osfmk - -static int kmoutput(struct tty *tp); -static void kmtimeout(void *tp); -static void kmstart(struct tty *tp); - -extern void KeyboardOpen(void); - -void -kminit(void) -{ - km_tty[0] = ttymalloc(); - km_tty[0]->t_dev = makedev(12, 0); - initialized = 1; -} - -/* - * cdevsw interface to km driver. - */ -int -kmopen(dev_t dev, int flag, __unused int devtype, proc_t pp) -{ - int unit; - struct tty *tp; - struct winsize *wp; - int ret; - - unit = minor(dev); - if(unit >= 1) - return (ENXIO); - - tp = km_tty[unit]; - - tty_lock(tp); - - tp->t_oproc = kmstart; - tp->t_param = NULL; - tp->t_dev = dev; - - if ( !(tp->t_state & TS_ISOPEN) ) { - tp->t_iflag = TTYDEF_IFLAG; - tp->t_oflag = TTYDEF_OFLAG; - tp->t_cflag = (CREAD | CS8 | CLOCAL); - tp->t_lflag = TTYDEF_LFLAG; - tp->t_ispeed = tp->t_ospeed = TTYDEF_SPEED; - termioschars(&tp->t_termios); - ttsetwater(tp); - } else if ((tp->t_state & TS_XCLUDE) && proc_suser(pp)) { - ret = EBUSY; - goto out; - } - - tp->t_state |= TS_CARR_ON; /* lie and say carrier exists and is on. */ - - ret = ((*linesw[tp->t_line].l_open)(dev, tp)); - { - PE_Video video; - wp = &tp->t_winsize; - /* - * Magic numbers. 
These are CHARWIDTH and CHARHEIGHT - * from osfmk/ppc/POWERMAC/video_console.c - */ - wp->ws_xpixel = 8; - wp->ws_ypixel = 16; - - tty_unlock(tp); /* XXX race window */ - - if (flag & O_POPUP) - PE_initialize_console(0, kPETextScreen); - - bzero(&video, sizeof(video)); - PE_current_console(&video); - - tty_lock(tp); - - if( video.v_width != 0 && video.v_height != 0 ) { - wp->ws_col = video.v_width / wp->ws_xpixel; - wp->ws_row = video.v_height / wp->ws_ypixel; - } else { - wp->ws_col = 100; - wp->ws_row = 36; - } - } - -out: - tty_unlock(tp); - - return ret; -} - -int -kmclose(dev_t dev, __unused int flag, __unused int mode, __unused proc_t p) -{ - int ret; - struct tty *tp = km_tty[minor(dev)]; - - tty_lock(tp); - ret = (*linesw[tp->t_line].l_close)(tp,flag); - ttyclose(tp); - tty_unlock(tp); - - return (ret); -} - -int -kmread(dev_t dev, struct uio *uio, int ioflag) -{ - int ret; - struct tty *tp = km_tty[minor(dev)]; - - tty_lock(tp); - ret = (*linesw[tp->t_line].l_read)(tp, uio, ioflag); - tty_unlock(tp); - - return (ret); -} - -int -kmwrite(dev_t dev, struct uio *uio, int ioflag) -{ - int ret; - struct tty *tp = km_tty[minor(dev)]; - - tty_lock(tp); - ret = (*linesw[tp->t_line].l_write)(tp, uio, ioflag); - tty_unlock(tp); - - return (ret); -} - -int -kmioctl(dev_t dev, u_long cmd, caddr_t data, int flag, proc_t p) -{ - int error = 0; - struct tty *tp = km_tty[minor(dev)]; - struct winsize *wp; - - tty_lock(tp); - - switch (cmd) { - case KMIOCSIZE: - wp = (struct winsize *)data; - *wp = tp->t_winsize; - break; - - case TIOCSWINSZ: - /* Prevent changing of console size -- - * this ensures that login doesn't revert to the - * termcap-defined size - */ - error = EINVAL; - break; - - /* Bodge in the CLOCAL flag as the km device is always local */ - case TIOCSETA_32: - case TIOCSETAW_32: - case TIOCSETAF_32: - { - struct termios32 *t = (struct termios32 *)data; - t->c_cflag |= CLOCAL; - /* No Break */ - } - goto fallthrough; - case TIOCSETA_64: - case TIOCSETAW_64: 
- case TIOCSETAF_64: - { - struct user_termios *t = (struct user_termios *)data; - t->c_cflag |= CLOCAL; - /* No Break */ - } -fallthrough: - default: - error = (*linesw[tp->t_line].l_ioctl)(tp, cmd, data, flag, p); - if (ENOTTY != error) - break; - error = ttioctl_locked(tp, cmd, data, flag, p); - break; - } - - tty_unlock(tp); - - return (error); -} - -/* - * kmputc - * - * Output a character to the serial console driver via cnputcusr(), - * which is exported by that driver. - * - * Locks: Assumes tp in the calling tty driver code is locked on - * entry, remains locked on exit - * - * Notes: Called from kmoutput(); giving the locking output - * assumptions here, this routine should be static (and - * inlined, given there is only one call site). - */ -int -kmputc(__unused dev_t dev, char c) -{ - if(!disableConsoleOutput && initialized) { - /* OCRNL */ - if(c == '\n') - cnputcusr('\r'); - cnputcusr(c); - } - - return (0); -} - - -/* - * Callouts from linesw. - */ - -#define KM_LOWAT_DELAY ((ns_time_t)1000) - -/* - * t_oproc for this driver; called from within the line discipline - * - * Locks: Assumes tp is locked on entry, remains locked on exit - */ -static void -kmstart(struct tty *tp) -{ - if (tp->t_state & (TS_TIMEOUT | TS_BUSY | TS_TTSTOP)) - goto out; - if (tp->t_outq.c_cc == 0) - goto out; - tp->t_state |= TS_BUSY; - kmoutput(tp); - return; - -out: - (*linesw[tp->t_line].l_start)(tp); - return; -} - -/* - * One-shot output retry timeout from kmoutput(); re-calls kmoutput() at - * intervals until the output queue for the tty is empty, at which point - * the timeout is not rescheduled by kmoutput() - * - * This function must take the tty_lock() around the kmoutput() call; it - * ignores the return value. 
- */ -static void -kmtimeout(void *arg) -{ - struct tty *tp = (struct tty *)arg; - - tty_lock(tp); - (void)kmoutput(tp); - tty_unlock(tp); -} - -/* - * kmoutput - * - * Locks: Assumes tp is locked on entry, remains locked on exit - * - * Notes: Called from kmstart() and kmtimeout(); kmtimeout() is a - * timer initiated by this routine to deal with pending - * output not yet flushed (output is flushed at a maximum - * of sizeof(buf) charatcers at a time before dropping into - * the timeout code). - */ -static int -kmoutput(struct tty *tp) -{ - char buf[80]; /* buffer; limits output per call */ - char *cp; - int cc = -1; - - - /* While there is data available to be output... */ - while (tp->t_outq.c_cc > 0) { - cc = ndqb(&tp->t_outq, 0); - if (cc == 0) - break; - /* - * attempt to output as many characters as are available, - * up to the available transfer buffer size. - */ - cc = min(cc, sizeof buf); - /* copy the output queue contents to the buffer */ - (void) q_to_b(&tp->t_outq, (unsigned char *)buf, cc); - for (cp = buf; cp < &buf[cc]; cp++) { - /* output the buffer one charatcer at a time */ - kmputc(tp->t_dev, *cp & 0x7f); - } - } - if (tp->t_outq.c_cc > 0) { - timeout((timeout_fcn_t)kmtimeout, tp, hz); - } - tp->t_state &= ~TS_BUSY; - (*linesw[tp->t_line].l_start)(tp); - - return 0; -} - -/* - * cons_cinput - * - * Driver character input from the polled mode serial console driver calls - * this routine to input a character from the serial driver into the tty - * line discipline specific input processing receiv interrupt routine, - * l_rint(). - * - * Locks: Assumes that the tty_lock() is NOT held on the tp, so a - * serial driver should NOT call this function as a result - * of being called from a function which already holds the - * lock; ECHOE will be handled at the line discipline, if - * output echo processing is going to occur. 
- */ -void -cons_cinput(char ch) -{ - struct tty *tp = km_tty[0]; /* XXX */ - - tty_lock(tp); - (*linesw[tp->t_line].l_rint) (ch, tp); - tty_unlock(tp); -} diff --git a/bsd/dev/ppc/mem.c b/bsd/dev/ppc/mem.c deleted file mode 100644 index fc2d39efb..000000000 --- a/bsd/dev/ppc/mem.c +++ /dev/null @@ -1,241 +0,0 @@ -/* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/*- - * Copyright (c) 1988 University of Utah. - * Copyright (c) 1982, 1986, 1990, 1993 - * The Regents of the University of California. All rights reserved. 
- * - * This code is derived from software contributed to Berkeley by - * the Systems Programming Group of the University of Utah Computer - * Science Department, and code derived from software contributed to - * Berkeley by William Jolitz. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * from: Utah $Hdr: mem.c 1.13 89/10/08$ - * @(#)mem.c 8.1 (Berkeley) 6/11/93 - */ - -#include - -/* - * Memory special file - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include - -static caddr_t devzerobuf; - -extern boolean_t kernacc(off_t, size_t ); -extern int setup_kmem; - -int mmread(dev_t dev, struct uio *uio, int flag); -int mmrw(dev_t dev, struct uio *uio, enum uio_rw rw); -int mmioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p); -int mmwrite(dev_t dev, struct uio *uio, int flag); - -int -mmread(dev_t dev, struct uio *uio, __unused int flag) -{ - - return (mmrw(dev, uio, UIO_READ)); -} - -int -mmwrite(dev_t dev, struct uio *uio, __unused int flag) -{ - - return (mmrw(dev, uio, UIO_WRITE)); -} - -int -mmioctl(dev_t dev, u_long cmd, __unused caddr_t data, - __unused int flag, __unused struct proc *p) -{ - int minnum = minor(dev); - - if ((setup_kmem == 0) && ((minnum == 0) || (minnum == 1))) - return(EINVAL); - - switch (cmd) { - case FIONBIO: - case FIOASYNC: - /* OK to do nothing: we always return immediately */ - break; - default: - return ENODEV; - } - - return (0); -} - -int -mmrw(dev, uio, rw) - dev_t dev; - struct uio *uio; - enum uio_rw rw; -{ - register int o; -#if LP64KERN - register uint64_t c; -#else - register uint c; -#endif - addr64_t vll; - int error = 0; - vm_offset_t where; - - while (uio_resid(uio) > 0 && error == 0) { - uio_update(uio, 0); - - switch (minor(dev)) { - -/* minor device 0 is physical memory */ - case 0: - if (setup_kmem == 0) - return(ENODEV); - vll = trunc_page_64(uio->uio_offset); - if(((vll >> 31) == 1) || vll >= ((dgWork.dgFlags & enaDiagDM) ? mem_actual : max_mem)) - goto fault; - - if(dgWork.dgFlags & enaDiagDM) { /* Can we really get all memory? 
*/ - if (kmem_alloc_pageable(kernel_map, &where, PAGE_SIZE) != KERN_SUCCESS) { - goto fault; - } - else { - addr64_t collad; - - collad = mapping_make(kernel_pmap, (addr64_t)where, (ppnum_t)(vll >> 12), 0, 1, VM_PROT_READ); /* Map it in for the moment */ - if(collad) { /* See if it failed (shouldn't happen) */ - kmem_free(kernel_map, where, PAGE_SIZE); /* Toss the page */ - goto fault; /* Kill the transfer */ - } - } - } - else { - if (kmem_alloc(kernel_map, &where, 4096) - != KERN_SUCCESS) { - goto fault; - } - } - o = uio->uio_offset - vll; - c = min(PAGE_SIZE - o, uio_curriovlen(uio)); - error = uiomove((caddr_t)(where + o), c, uio); - - if(dgWork.dgFlags & enaDiagDM) (void)mapping_remove(kernel_pmap, (addr64_t)where); /* Unmap it */ - kmem_free(kernel_map, where, PAGE_SIZE); - continue; - - /* minor device 1 is kernel memory */ - case 1: - if (setup_kmem == 0) - return(ENODEV); - /* Do some sanity checking */ - if (((addr64_t)uio->uio_offset > vm_last_addr) || - ((addr64_t)uio->uio_offset < VM_MIN_KERNEL_ADDRESS)) - goto fault; - c = uio_curriovlen(uio); - if (!kernacc(uio->uio_offset, c)) - goto fault; - error = uiomove64(uio->uio_offset, c, uio); - continue; - - /* minor device 2 is EOF/RATHOLE */ - case 2: - if (rw == UIO_READ) - return (0); - c = uio_curriovlen(uio); - break; - /* minor device 3 is ZERO/RATHOLE */ - case 3: - if(devzerobuf == NULL) { - MALLOC(devzerobuf, caddr_t,PAGE_SIZE, M_TEMP, M_WAITOK); - bzero(devzerobuf, PAGE_SIZE); - } - if(uio->uio_rw == UIO_WRITE) { - c = uio_curriovlen(uio); - break; - } - c = min(uio_curriovlen(uio), PAGE_SIZE); - error = uiomove(devzerobuf, c, uio); - continue; - default: - goto fault; - break; - } - - if (error) - break; - uio_update(uio, c); - } - return (error); -fault: - return (EFAULT); -} - diff --git a/bsd/dev/ppc/munge.s b/bsd/dev/ppc/munge.s deleted file mode 100644 index 9e33bc326..000000000 --- a/bsd/dev/ppc/munge.s +++ /dev/null @@ -1,477 +0,0 @@ -/* - * Copyright (c) 2004 Apple Computer, Inc. 
All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - * Syscall argument mungers. - * - * Passed a pointer to the users register array in the savearea, we copy args into - * the uu_arg[] array, padding etc as appropriate. The issue is that parameters - * passed in registers from a 32-bit address space do not map directly into the uu_args. - * For example, a 32-bit long-long comes in two registers, but we need to combine - * them into one 64-bit long-long in the uu_args. - * - * There are several functions in this file. 
Each takes two parameters: - * - * void munge_XXXX( const void *regs, void *uu_args); - * - * The name of the function encodes the number and type of the parameters, as follows: - * - * w = a 32-bit value such as an int or a 32-bit ptr, that does not require - * sign extension. These are handled by skipping a word in the input, - * zeroing a word of output, and copying a word from input to output. - * - * s = a 32-bit value such as a long, which must be sign-extended to a 64-bit - * long-long in the uu_args. These are handled by skipping a word of - * input, loading a word of input and sign extending it to a double, - * and storing two words of output. - * - * l = a 64-bit long-long, passed in two registers. These are handled by skipping - * a word of input, copying a word, skipping another word of input, and - * copying another word. - * - * d = a 32-bit int or a 64-bit ptr or long, passed in via a 64-bit GPR - * from a 64-bit process. We copy two words from input to output. - * - * For example, "munge_wls" takes a word, a long-long, and a word. This takes - * four registers: the first word is in one, the long-long takes two, and the - * final word is in the fourth. We store six words: a 0, the low words of the - * first three registers, and the two words resulting from sign-extending the - * low word of the fourth register. - * - * As you can see, we save a lot of code by collapsing mungers that are prefixes - * of each other, into the more general routine. This ends up copying a few extra - * bytes of parameters, but big deal. The old kernel copied all eight words for - * every system call. - * - * These routines assume explicit pad words in the uu_arg structures, that fill out - * int parameters to 64 bits. Having pad words makes munging args for 64-bit - * processes the equivalent of a simple bcopy(), though it does introduce an - * endian dependency. 
- */ - - .align 5 - .globl _munge_dddddddd // that is 8 'd's -_munge_dddddddd: - .globl _munge_ddddddd -_munge_ddddddd: - .globl _munge_dddddd -_munge_dddddd: - .globl _munge_ddddd -_munge_ddddd: - ld r5,0*8+0(r3) - ld r6,1*8+0(r3) - ld r7,2*8+0(r3) - ld r8,3*8+0(r3) - ld r9,4*8+0(r3) - ld r10,5*8+0(r3) - ld r11,6*8+0(r3) - ld r12,7*8+0(r3) - - std r5,0*8+0(r4) - std r6,1*8+0(r4) - std r7,2*8+0(r4) - std r8,3*8+0(r4) - std r9,4*8+0(r4) - std r10,5*8+0(r4) - std r11,6*8+0(r4) - std r12,7*8+0(r4) - - blr - - - .align 5 - .globl _munge_dddd -_munge_dddd: - .globl _munge_ddd -_munge_ddd: - .globl _munge_dd -_munge_dd: - .globl _munge_d -_munge_d: - ld r5,0*8+0(r3) - ld r6,1*8+0(r3) - ld r7,2*8+0(r3) - ld r8,3*8+0(r3) - - std r5,0*8+0(r4) - std r6,1*8+0(r4) - std r7,2*8+0(r4) - std r8,3*8+0(r4) - - blr - - - .align 5 - .globl _munge_wwwwwwww // that is 8 'w's -_munge_wwwwwwww: - .globl _munge_wwwwwww -_munge_wwwwwww: - .globl _munge_wwwwww -_munge_wwwwww: - .globl _munge_wwwww -_munge_wwwww: - li r0,0 - lwz r5,0*8+4(r3) - lwz r6,1*8+4(r3) - lwz r7,2*8+4(r3) - lwz r8,3*8+4(r3) - lwz r9,4*8+4(r3) - lwz r10,5*8+4(r3) - lwz r11,6*8+4(r3) - lwz r12,7*8+4(r3) - - stw r0,0*8+0(r4) - stw r5,0*8+4(r4) - stw r0,1*8+0(r4) - stw r6,1*8+4(r4) - stw r0,2*8+0(r4) - stw r7,2*8+4(r4) - stw r0,3*8+0(r4) - stw r8,3*8+4(r4) - stw r0,4*8+0(r4) - stw r9,4*8+4(r4) - stw r0,5*8+0(r4) - stw r10,5*8+4(r4) - stw r0,6*8+0(r4) - stw r11,6*8+4(r4) - stw r0,7*8+0(r4) - stw r12,7*8+4(r4) - - blr - - - .align 5 - .globl _munge_wwww -_munge_wwww: - .globl _munge_www -_munge_www: - .globl _munge_ww -_munge_ww: - .globl _munge_w -_munge_w: - li r0,0 - lwz r5,0*8+4(r3) - lwz r6,1*8+4(r3) - lwz r7,2*8+4(r3) - lwz r8,3*8+4(r3) - - stw r0,0*8+0(r4) - stw r5,0*8+4(r4) - stw r0,1*8+0(r4) - stw r6,1*8+4(r4) - stw r0,2*8+0(r4) - stw r7,2*8+4(r4) - stw r0,3*8+0(r4) - stw r8,3*8+4(r4) - - blr - - .align 5 - .globl _munge_l -_munge_l: - li r0,0 - lwz r5,0*8+4(r3) - lwz r6,1*8+4(r3) - - stw r5,0*8+0(r4) - stw 
r6,0*8+4(r4) - - blr - - .align 5 - .globl _munge_wlw -_munge_wlw: - .globl _munge_wl -_munge_wl: - li r0,0 - lwz r5,0*8+4(r3) - lwz r6,1*8+4(r3) - lwz r7,2*8+4(r3) - lwz r8,3*8+4(r3) - - stw r0,0*8+0(r4) - stw r5,0*8+4(r4) - stw r6,1*8+0(r4) - stw r7,1*8+4(r4) - stw r0,2*8+0(r4) - stw r8,2*8+4(r4) - - blr - - - .align 5 - .globl _munge_wwwl -_munge_wwwl: - li r0,0 - lwz r5,0*8+4(r3) - lwz r6,1*8+4(r3) - lwz r7,2*8+4(r3) - lwz r8,3*8+4(r3) - lwz r9,4*8+4(r3) - - stw r0,0*8+0(r4) - stw r5,0*8+4(r4) - stw r0,1*8+0(r4) - stw r6,1*8+4(r4) - stw r0,2*8+0(r4) - stw r7,2*8+4(r4) - stw r8,3*8+0(r4) - stw r9,3*8+4(r4) - - blr - - - .align 5 - .globl _munge_wwwlww -_munge_wwwlww: - li r0,0 - lwz r5,0*8+4(r3) - lwz r6,1*8+4(r3) - lwz r7,2*8+4(r3) - lwz r8,3*8+4(r3) - lwz r9,4*8+4(r3) - lwz r10,5*8+4(r3) - lwz r11,6*8+4(r3) - - stw r0,0*8+0(r4) - stw r5,0*8+4(r4) - stw r0,1*8+0(r4) - stw r6,1*8+4(r4) - stw r0,2*8+0(r4) - stw r7,2*8+4(r4) - stw r8,3*8+0(r4) - stw r9,3*8+4(r4) - stw r0,4*8+0(r4) - stw r10,4*8+4(r4) - stw r0,5*8+0(r4) - stw r11,5*8+4(r4) - - blr - - - .align 5 - .globl _munge_wwlwww -_munge_wwlwww: - li r0,0 - lwz r5,0*8+4(r3) // Wwlwww - lwz r6,1*8+4(r3) // wWlwww - lwz r7,2*8+4(r3) // wwLwww (hi) - lwz r8,3*8+4(r3) // wwLwww (lo) - lwz r9,4*8+4(r3) // wwlWww - lwz r10,5*8+4(r3) // wwlwWw - lwz r11,6*8+4(r3) // wwlwwW - - stw r0,0*8+0(r4) // 0wlwww - stw r5,0*8+4(r4) // Wwlwww - stw r0,1*8+0(r4) // w0lwww - stw r6,1*8+4(r4) // wWlwww - stw r7,2*8+0(r4) // wwLwww (hi) - stw r8,2*8+4(r4) // wwLwww (lo) - stw r0,3*8+0(r4) // wwl0ww - stw r9,3*8+4(r4) // wwlwww - stw r0, 4*8+0(r4) // wwlw0w - stw r10,4*8+4(r4) // wwlwWw - stw r0, 5*8+0(r4) // wwlww0 - stw r11,5*8+4(r4) // wwlwwW - - blr - - .align 5 - .globl _munge_wwwwlw // 4 'w's and an l an w -_munge_wwwwlw: - li r0,0 - lwz r5,0*8+4(r3) - lwz r6,1*8+4(r3) - lwz r7,2*8+4(r3) - lwz r8,3*8+4(r3) - lwz r9,4*8+4(r3) - lwz r10,5*8+4(r3) - lwz r11,6*8+4(r3) - - stw r0,0*8+0(r4) - stw r5,0*8+4(r4) - stw r0,1*8+0(r4) - 
stw r6,1*8+4(r4) - stw r0,2*8+0(r4) - stw r7,2*8+4(r4) - stw r0,3*8+0(r4) - stw r8,3*8+4(r4) - stw r9,4*8+0(r4) - stw r10,4*8+4(r4) - stw r0,5*8+0(r4) - stw r11,5*8+4(r4) - - blr - - - .align 5 - .globl _munge_wwwwl // 4 'w's and an l -_munge_wwwwl: - li r0,0 - lwz r5,0*8+4(r3) - lwz r6,1*8+4(r3) - lwz r7,2*8+4(r3) - lwz r8,3*8+4(r3) - lwz r9,4*8+4(r3) - lwz r10,5*8+4(r3) - - stw r0,0*8+0(r4) - stw r5,0*8+4(r4) - stw r0,1*8+0(r4) - stw r6,1*8+4(r4) - stw r0,2*8+0(r4) - stw r7,2*8+4(r4) - stw r0,3*8+0(r4) - stw r8,3*8+4(r4) - stw r9,4*8+0(r4) - stw r10,4*8+4(r4) - - blr - - - .align 5 - .globl _munge_wwwwwl // 5 'w's and an l -_munge_wwwwwl: - li r0,0 - lwz r5,0*8+4(r3) - lwz r6,1*8+4(r3) - lwz r7,2*8+4(r3) - lwz r8,3*8+4(r3) - lwz r9,4*8+4(r3) - lwz r10,5*8+4(r3) - lwz r11,6*8+4(r3) - - stw r0,0*8+0(r4) - stw r5,0*8+4(r4) - stw r0,1*8+0(r4) - stw r6,1*8+4(r4) - stw r0,2*8+0(r4) - stw r7,2*8+4(r4) - stw r0,3*8+0(r4) - stw r8,3*8+4(r4) - stw r0,4*8+0(r4) - stw r9,4*8+4(r4) - stw r10,5*8+0(r4) - stw r11,5*8+4(r4) - - blr - - - .align 5 - .globl _munge_wsw -_munge_wsw: - li r0,0 - lwz r5,0*8+4(r3) - lwz r6,1*8+4(r3) - lwz r7,2*8+4(r3) - - stw r0,0*8+0(r4) - srawi r2,r6,31 - stw r5,0*8+4(r4) - stw r2,1*8+0(r4) - stw r6,1*8+4(r4) - stw r0,2*8+0(r4) - stw r7,2*8+4(r4) - - blr - - - .align 5 - .globl _munge_wws -_munge_wws: - li r0,0 - lwz r5,0*8+4(r3) - lwz r6,1*8+4(r3) - lwz r7,2*8+4(r3) - - stw r0,0*8+0(r4) - stw r5,0*8+4(r4) - stw r0,1*8+0(r4) - srawi r2,r7,31 - stw r6,1*8+4(r4) - stw r2,2*8+0(r4) - stw r7,2*8+4(r4) - - blr - - - .align 5 - .globl _munge_wwwsw -_munge_wwwsw: - li r0,0 - lwz r5,0*8+4(r3) - lwz r6,1*8+4(r3) - lwz r7,2*8+4(r3) - lwz r8,3*8+4(r3) - lwz r9,4*8+4(r3) - - stw r0,0*8+0(r4) - stw r5,0*8+4(r4) - stw r0,1*8+0(r4) - stw r6,1*8+4(r4) - srawi r2,r8,31 - stw r0,2*8+0(r4) - stw r7,2*8+4(r4) - stw r2,3*8+0(r4) - stw r8,3*8+4(r4) - stw r0,4*8+0(r4) - stw r9,4*8+4(r4) - - blr - - .align 5 - .globl _munge_llllll -_munge_llllll: - li r0,0 - lwz 
r5,0*8+4(r3) // l1 - lwz r6,1*8+4(r3) - lwz r7,2*8+4(r3) // l2 - lwz r8,3*8+4(r3) - lwz r9,4*8+4(r3) // l3 - lwz r10,5*8+4(r3) - lwz r11,6*8+4(r3) // l4 - - stw r5,0*8+0(r4) - stw r6,0*8+4(r4) - stw r7,1*8+0(r4) - stw r8,1*8+4(r4) - stw r9,2*8+0(r4) - stw r10,2*8+4(r4) - stw r11,3*8+0(r4) - - // the rest spill to the stack (r1) - // we'll zero fill for now - // and make the syscall handler - // do the copyin from the user stack - stw r0,3*8+4(r4) - stw r0,4*8+0(r4) - stw r0,4*8+4(r4) - stw r0,5*8+0(r4) - stw r0,5*8+4(r4) - - blr diff --git a/bsd/dev/ppc/ppc_init.c b/bsd/dev/ppc/ppc_init.c deleted file mode 100644 index 545cfe5ae..000000000 --- a/bsd/dev/ppc/ppc_init.c +++ /dev/null @@ -1,276 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __MACHO__ -#include -#endif - -/* External references */ - -extern unsigned int intstack[]; /* declared in start.s */ -extern unsigned int intstack_top_ss; /* declared in start.s */ -#if MACH_KGDB -extern unsigned int gdbstackptr; /* declared in start.s */ -extern unsigned int gdbstack_top_ss; /* declared in start.s */ -#endif /* MACH_KGDB */ - -/* Stuff declared in kern/bootstrap.c which we may need to initialise */ - -extern vm_offset_t boot_start; -extern vm_size_t boot_size; -extern vm_offset_t boot_region_desc; -extern vm_size_t boot_region_count; -extern int boot_thread_state_flavor; -extern thread_state_t boot_thread_state; -extern unsigned int boot_thread_state_count; - -/* Trap handling function prototypes */ - -extern void thandler(void); /* trap handler */ -extern void ihandler(void); /* interrupt handler */ -extern void shandler(void); /* syscall handler */ -extern void gdbhandler(void); /* debugger handler */ -extern void fpu_switch(void); /* fp handler */ -extern void atomic_switch_trap(void); /* fast path atomic thread switch */ - -/* definitions */ - -struct ppc_thread_state boot_task_thread_state; - - - - - -#if 1 /* TODO NMGS - vm_map_steal_memory shouldn't use these - remove */ -vm_offset_t avail_start; -vm_offset_t avail_end; -#endif -unsigned int avail_remaining = 0; -vm_offset_t first_avail; - -/* - * Mach-O Support - */ - - -#ifdef __MACHO__ -void *sectTEXTB; -unsigned long sectSizeTEXT; -void *sectDATAB; -unsigned long sectSizeDATA; -void *sectOBJCB; -unsigned long sectSizeOBJC; -void *sectLINKB; -unsigned long sectSizeLINK; - -vm_offset_t end, etext, edata; -#define ETEXT etext -#endif - - - -void 
ppc_vm_init(unsigned int memory_size, boot_args *args) -{ - unsigned int htabmask; - unsigned int i; - vm_offset_t addr; - int boot_task_end_offset; - - printf("mem_size = %d M\n",memory_size / (1024 * 1024)); - -#ifdef __MACHO__ - /* Now retrieve addresses for end, edata, and etext - * from MACH-O headers. - */ - - - etext = (vm_offset_t) sectTEXTB + sectSizeTEXT; - edata = (vm_offset_t) sectDATAB + sectSizeDATA; - end = getlastaddr(); -#endif - - /* Stitch valid memory regions together - they may be contiguous - * even though they're not already glued together - */ - - /* Go through the list of memory regions passed in via the args - * and copy valid entries into the pmap_mem_regions table, adding - * further calculated entries. - */ - - - /* Initialise the pmap system, using space above `first_avail'*/ - -#ifndef __MACHO__ - free_regions[free_regions_count].start = - round_page((unsigned int)&_ExceptionVectorsEnd - - (unsigned int)&_ExceptionVectorsStart); -#else - /* On MACH-O generated kernels, the Exception Vectors - * are already mapped and loaded at 0 -- no relocation - * or freeing of memory is needed - */ - - free_regions[free_regions_count].start = round_page((unsigned int)&_ExceptionVectorsEnd) + 4096; -#endif - - /* If we are on a PDM machine memory at 1M might be used - * for video. TODO NMGS call video driver to do this - * somehow - */ - - - /* For PowerMac, first_avail is set to above the bootstrap task. - * TODO NMGS - different screen modes - might free mem? - */ - - first_avail = round_page(args->first_avail); - - - /* map in the exception vectors */ - /* - * map the kernel text, data and bss. 
Don't forget other regions too - */ - for (i = 0; i < args->kern_info.region_count; i++) { -#if MACH_KDB - if (args->kern_info.regions[i].prot == VM_PROT_NONE && - i == args->kern_info.region_count - 1) { - /* assume that's the kernel symbol table */ - kern_sym_start = args->kern_info.regions[i].addr; - kern_sym_size = args->kern_info.regions[i].size; - printf("kernel symbol table at 0x%x size 0x%x\n", - kern_sym_start, kern_sym_size); - args->kern_info.regions[i].prot |= - (VM_PROT_WRITE|VM_PROT_READ); - } -#endif /* MACH_KDB */ - -#ifdef __MACHO__ - /* Skip the VECTORS segment */ - if (args->kern_info.regions[i].addr == 0) - continue; -#endif - - boot_region_count = args->task_info.region_count; - boot_size = 0; - boot_task_end_offset = 0; - /* Map bootstrap task pages 1-1 so that user_bootstrap can find it */ - for (i = 0; i < boot_region_count; i++) { - if (args->task_info.regions[i].mapped) { - /* kernel requires everything page aligned */ -#if DEBUG - printf("mapping virt 0x%08x to phys 0x%08x end 0x%x, prot=0x%b\n", - ppc_trunc_page(args->task_info.base_addr + - args->task_info.regions[i].offset), - ppc_trunc_page(args->task_info.base_addr + - args->task_info.regions[i].offset), - ppc_round_page(args->task_info.base_addr + - args->task_info.regions[i].offset + - args->task_info.regions[i].size), - args->task_info.regions[i].prot, - "\x10\1READ\2WRITE\3EXEC"); -#endif /* DEBUG */ - - (void)pmap_map( - ppc_trunc_page(args->task_info.base_addr + - args->task_info.regions[i].offset), - ppc_trunc_page(args->task_info.base_addr + - args->task_info.regions[i].offset), - ppc_round_page(args->task_info.base_addr + - args->task_info.regions[i].offset + - args->task_info.regions[i].size), - args->task_info.regions[i].prot); - - /* Count the size of mapped space */ - boot_size += args->task_info.regions[i].size; - - /* There may be an overlapping physical page - * mapped to two different virtual addresses - */ - if (boot_task_end_offset > - 
args->task_info.regions[i].offset) { - boot_size -= boot_task_end_offset - - args->task_info.regions[i].offset; -#if DEBUG - printf("WARNING - bootstrap overlaps regions\n"); -#endif /* DEBUG */ - } - - boot_task_end_offset = - args->task_info.regions[i].offset + - args->task_info.regions[i].size; - } - } - - if (boot_region_count) { - - /* Add a new region to the bootstrap task for it's stack */ - args->task_info.regions[boot_region_count].addr = - BOOT_STACK_BASE; - args->task_info.regions[boot_region_count].size = - BOOT_STACK_SIZE; - args->task_info.regions[boot_region_count].mapped = FALSE; - boot_region_count++; - - boot_start = args->task_info.base_addr; - boot_region_desc = (vm_offset_t) args->task_info.regions; - /* TODO NMGS need to put param info onto top of boot stack */ - boot_task_thread_state.r1 = BOOT_STACK_PTR-0x100; - boot_task_thread_state.srr0 = args->task_info.entry; - boot_task_thread_state.srr1 = - MSR_MARK_SYSCALL(MSR_EXPORT_MASK_SET); - - boot_thread_state_flavor = PPC_THREAD_STATE; - boot_thread_state_count = PPC_THREAD_STATE_COUNT; - boot_thread_state = - (thread_state_t)&boot_task_thread_state; - } - - - -} - diff --git a/bsd/dev/ppc/sdt_ppc.c b/bsd/dev/ppc/sdt_ppc.c deleted file mode 100644 index bcd10c967..000000000 --- a/bsd/dev/ppc/sdt_ppc.c +++ /dev/null @@ -1,71 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* #pragma ident "@(#)sdt.c 1.6 06/03/24 SMI" */ - -#ifdef KERNEL -#ifndef _KERNEL -#define _KERNEL /* Solaris vs. Darwin */ -#endif -#endif - -#define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from mach/ppc/thread_status.h */ -#include -#include -#include - -#include -#include - -#include - -#include -#include - -extern sdt_probe_t **sdt_probetab; - -/*ARGSUSED*/ -int -sdt_invop(uintptr_t addr, uintptr_t *stack, uintptr_t eax) -{ - uint64_t mask = (_cpu_capabilities & k64Bit) ? 0xffffffffffffffffULL : 0x00000000ffffffffULL; - -#pragma unused(eax) - sdt_probe_t *sdt = sdt_probetab[SDT_ADDR2NDX(addr)]; - - for (; sdt != NULL; sdt = sdt->sdp_hashnext) { - if ((uintptr_t)sdt->sdp_patchpoint == addr) { - ppc_saved_state_t *regs = (ppc_saved_state_t *)stack; - - dtrace_probe(sdt->sdp_id, regs->save_r3 & mask, regs->save_r4 & mask, - regs->save_r5 & mask, regs->save_r6 & mask, regs->save_r7 & mask); - - return (DTRACE_INVOP_NOP); - } - } - - return (0); -} - diff --git a/bsd/dev/ppc/stubs.c b/bsd/dev/ppc/stubs.c deleted file mode 100644 index 55e2f0170..000000000 --- a/bsd/dev/ppc/stubs.c +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1997 by Apple Computer, Inc., all rights reserved - * Copyright (c) 1993 NeXT Computer, Inc. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -extern void *get_bsduthreadarg(thread_t); -extern int *get_bsduthreadrval(thread_t); - -/* - * copy a null terminated string from one point to another in - * the kernel address space. - * - no access checks are performed. - * - if the end of string isn't found before - * maxlen bytes are copied, return ENAMETOOLONG, - * indicating an incomplete copy. - * - otherwise, return 0, indicating success. - * the number of bytes copied is always returned in lencopied. 
- */ -/* from ppc/fault_copy.c -Titan1T4 VERSION */ -int -copystr(const void *vfrom, void *vto, size_t maxlen, size_t *lencopied) -{ - register unsigned l; - const char *from; - char *to; - - from = vfrom; - to = vto; - for (l = 0; l < maxlen; l++) - if ((*to++ = *from++) == '\0') { - if (lencopied) - *lencopied = l + 1; - return 0; - } - if (lencopied) - *lencopied = maxlen; - return ENAMETOOLONG; -} - -int copywithin(src, dst, count) -void * src, *dst; -size_t count; -{ - bcopy(src,dst,count); - return 0; -} - -void * -get_bsduthreadarg(thread_t th) -{ -struct uthread *ut; - ut = get_bsdthread_info(th); - return((void *)(ut->uu_arg)); -} - -int * -get_bsduthreadrval(thread_t th) -{ -struct uthread *ut; - ut = get_bsdthread_info(th); - return(&ut->uu_rval[0]); -} - diff --git a/bsd/dev/ppc/systemcalls.c b/bsd/dev/ppc/systemcalls.c deleted file mode 100644 index a8fd2dcfd..000000000 --- a/bsd/dev/ppc/systemcalls.c +++ /dev/null @@ -1,435 +0,0 @@ -/* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * NOTICE: This file was modified by McAfee Research in 2004 to introduce - * support for mandatory and extensible security protections. This notice - * is included in support of clause 2.2 (b) of the Apple Public License, - * Version 2.0. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#if CONFIG_DTRACE -extern int32_t dtrace_systrace_syscall(struct proc *, void *, int *); -extern void dtrace_systrace_syscall_return(unsigned short, int, int *); -#endif - -extern void -unix_syscall(struct savearea *regs); - -extern struct savearea * -find_user_regs( - thread_t act); - -extern lck_spin_t * tz_slock; - -/* - * Function: unix_syscall - * - * Inputs: regs - pointer to Process Control Block - * - * Outputs: none - */ -void -unix_syscall(struct savearea *regs) -{ - thread_t thread_act; - struct uthread *uthread; - struct proc *proc; - struct sysent *callp; - int error; - unsigned int code; - boolean_t flavor; - - flavor = (((unsigned int)regs->save_r0) == 0)? 
1: 0; - - if (flavor) - code = regs->save_r3; - else - code = regs->save_r0; - - if (kdebug_enable && (code != 180)) { - if (flavor) - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, - regs->save_r4, regs->save_r5, regs->save_r6, regs->save_r7, 0); - else - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, - regs->save_r3, regs->save_r4, regs->save_r5, regs->save_r6, 0); - } - thread_act = current_thread(); - uthread = get_bsdthread_info(thread_act); - - if (!(uthread->uu_flag & UT_VFORK)) - proc = (struct proc *)get_bsdtask_info(current_task()); - else - proc = current_proc(); - - /* Make sure there is a process associated with this task */ - if (proc == NULL) { - regs->save_r3 = (long long)EPERM; - /* set the "pc" to execute cerror routine */ - regs->save_srr0 -= 4; - task_terminate_internal(current_task()); - thread_exception_return(); - /* NOTREACHED */ - } - - /* - * Delayed binding of thread credential to process credential, if we - * are not running with an explicitly set thread credential. - */ - kauth_cred_uthread_update(uthread, proc); - - callp = (code >= NUM_SYSENT) ? 
&sysent[63] : &sysent[code]; - - if (callp->sy_narg != 0) { - void *regsp; - sy_munge_t *mungerp; - - if (IS_64BIT_PROCESS(proc)) { - /* XXX Turn 64 bit unsafe calls into nosys() */ - if (callp->sy_flags & UNSAFE_64BIT) { - callp = &sysent[63]; - goto unsafe; - } - mungerp = callp->sy_arg_munge64; - } - else { - mungerp = callp->sy_arg_munge32; - } - if ( !flavor) { - regsp = (void *) &regs->save_r3; - } else { - /* indirect system call consumes an argument so only 7 are supported */ - if (callp->sy_narg > 7) { - callp = &sysent[63]; - goto unsafe; - } - regsp = (void *) &regs->save_r4; - } - /* call syscall argument munger to copy in arguments (see xnu/bsd/dev/ppc/munge.s) */ - (*mungerp)(regsp, (void *) &uthread->uu_arg[0]); - } - -unsafe: - - uthread->uu_flag |= UT_NOTCANCELPT; - - uthread->uu_rval[0] = 0; - - /* - * r4 is volatile, if we set it to regs->save_r4 here the child - * will have parents r4 after execve - */ - uthread->uu_rval[1] = 0; - - error = 0; - - /* - * PPC runtime calls cerror after every unix system call, so - * assume no error and adjust the "pc" to skip this call. - * It will be set back to the cerror call if an error is detected. 
- */ - regs->save_srr0 += 4; - -#ifdef JOE_DEBUG - uthread->uu_iocount = 0; - uthread->uu_vpindex = 0; -#endif - AUDIT_SYSCALL_ENTER(code, proc, uthread); - error = (*(callp->sy_call))(proc, (void *)uthread->uu_arg, &(uthread->uu_rval[0])); - AUDIT_SYSCALL_EXIT(code, proc, uthread, error); -#if CONFIG_MACF - mac_thread_userret(code, error, thread_act); -#endif - - -#ifdef JOE_DEBUG - if (uthread->uu_iocount) - printf("system call returned with uu_iocount != 0\n"); -#endif -#if CONFIG_DTRACE - uthread->t_dtrace_errno = error; -#endif /* CONFIG_DTRACE */ - - regs = find_user_regs(thread_act); - - if (error == ERESTART) { - regs->save_srr0 -= 8; - } else if (error != EJUSTRETURN) { - if (error) { - regs->save_r3 = (long long)error; - /* set the "pc" to execute cerror routine */ - regs->save_srr0 -= 4; - } else { /* (not error) */ - switch (callp->sy_return_type) { - case _SYSCALL_RET_INT_T: - regs->save_r3 = uthread->uu_rval[0]; - regs->save_r4 = uthread->uu_rval[1]; - break; - case _SYSCALL_RET_UINT_T: - regs->save_r3 = ((u_int)uthread->uu_rval[0]); - regs->save_r4 = ((u_int)uthread->uu_rval[1]); - break; - case _SYSCALL_RET_OFF_T: - case _SYSCALL_RET_UINT64_T: - /* return 64 bits split across two registers for 32 bit */ - /* process and in one register for 64 bit process */ - if (IS_64BIT_PROCESS(proc)) { - u_int64_t *retp = (u_int64_t *)&uthread->uu_rval[0]; - regs->save_r3 = *retp; - regs->save_r4 = 0; - } - else { - regs->save_r3 = uthread->uu_rval[0]; - regs->save_r4 = uthread->uu_rval[1]; - } - break; - case _SYSCALL_RET_ADDR_T: - case _SYSCALL_RET_SIZE_T: - case _SYSCALL_RET_SSIZE_T: - /* the variable length return types (user_addr_t, user_ssize_t, - * and user_size_t) are always the largest possible size in the - * kernel (we use uu_rval[0] and [1] as one 64 bit value). 
- */ - { - user_addr_t *retp = (user_addr_t *)&uthread->uu_rval[0]; - regs->save_r3 = *retp; - regs->save_r4 = 0; - } - break; - case _SYSCALL_RET_NONE: - break; - default: - panic("unix_syscall: unknown return type"); - break; - } - } - } - /* else (error == EJUSTRETURN) { nothing } */ - - - uthread->uu_flag &= ~UT_NOTCANCELPT; - - /* panic if funnel is held */ - syscall_exit_funnelcheck(); - - if (uthread->uu_lowpri_window) { - /* - * task is marked as a low priority I/O type - * and the I/O we issued while in this system call - * collided with normal I/O operations... we'll - * delay in order to mitigate the impact of this - * task on the normal operation of the system - */ - throttle_lowpri_io(TRUE); - } - if (kdebug_enable && (code != 180)) { - - if (callp->sy_return_type == _SYSCALL_RET_SSIZE_T) - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, - error, uthread->uu_rval[1], 0, proc->p_pid, 0); - else - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, - error, uthread->uu_rval[0], uthread->uu_rval[1], proc->p_pid, 0); - } - - thread_exception_return(); - /* NOTREACHED */ -} - -void -unix_syscall_return(int error) -{ - thread_t thread_act; - struct uthread *uthread; - struct proc *proc; - struct savearea *regs; - unsigned int code; - struct sysent *callp; - - thread_act = current_thread(); - proc = current_proc(); - uthread = get_bsdthread_info(thread_act); - - regs = find_user_regs(thread_act); - - if (regs->save_r0 != 0) - code = regs->save_r0; - else - code = regs->save_r3; - - callp = (code >= NUM_SYSENT) ? 
&sysent[63] : &sysent[code]; - -#if CONFIG_DTRACE - if (callp->sy_call == dtrace_systrace_syscall) - dtrace_systrace_syscall_return( code, error, uthread->uu_rval ); -#endif /* CONFIG_DTRACE */ - AUDIT_SYSCALL_EXIT(code, proc, uthread, error); - - /* - * Get index into sysent table - */ - if (error == ERESTART) { - regs->save_srr0 -= 8; - } else if (error != EJUSTRETURN) { - if (error) { - regs->save_r3 = (long long)error; - /* set the "pc" to execute cerror routine */ - regs->save_srr0 -= 4; - } else { /* (not error) */ - switch (callp->sy_return_type) { - case _SYSCALL_RET_INT_T: - regs->save_r3 = uthread->uu_rval[0]; - regs->save_r4 = uthread->uu_rval[1]; - break; - case _SYSCALL_RET_UINT_T: - regs->save_r3 = ((u_int)uthread->uu_rval[0]); - regs->save_r4 = ((u_int)uthread->uu_rval[1]); - break; - case _SYSCALL_RET_OFF_T: - case _SYSCALL_RET_UINT64_T: - /* return 64 bits split across two registers for 32 bit */ - /* process and in one register for 64 bit process */ - if (IS_64BIT_PROCESS(proc)) { - u_int64_t *retp = (u_int64_t *)&uthread->uu_rval[0]; - regs->save_r3 = *retp; - } - else { - regs->save_r3 = uthread->uu_rval[0]; - regs->save_r4 = uthread->uu_rval[1]; - } - break; - case _SYSCALL_RET_ADDR_T: - case _SYSCALL_RET_SIZE_T: - case _SYSCALL_RET_SSIZE_T: - /* the variable length return types (user_addr_t, user_ssize_t, - * and user_size_t) are always the largest possible size in the - * kernel (we use uu_rval[0] and [1] as one 64 bit value). 
- */ - { - u_int64_t *retp = (u_int64_t *)&uthread->uu_rval[0]; - regs->save_r3 = *retp; - } - break; - case _SYSCALL_RET_NONE: - break; - default: - panic("unix_syscall: unknown return type"); - break; - } - } - } - /* else (error == EJUSTRETURN) { nothing } */ - - - uthread->uu_flag &= ~UT_NOTCANCELPT; - - /* panic if funnel is held */ - syscall_exit_funnelcheck(); - - if (uthread->uu_lowpri_window) { - /* - * task is marked as a low priority I/O type - * and the I/O we issued while in this system call - * collided with normal I/O operations... we'll - * delay in order to mitigate the impact of this - * task on the normal operation of the system - */ - throttle_lowpri_io(TRUE); - } - if (kdebug_enable && (code != 180)) { - if (callp->sy_return_type == _SYSCALL_RET_SSIZE_T) - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, - error, uthread->uu_rval[1], 0, proc->p_pid, 0); - else - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, - error, uthread->uu_rval[0], uthread->uu_rval[1], proc->p_pid, 0); - } - - thread_exception_return(); - /* NOTREACHED */ -} - -void -munge_lwww( - const void *in32, - void *out64) -{ - const uint32_t *arg32; - uint64_t *arg64; - - arg32 = (const uint32_t *) in32; - arg64 = (uint64_t *) out64; - - arg64[3] = arg32[9]; /* lwwW */ - arg64[2] = arg32[7]; /* lwWw */ - arg64[1] = arg32[5]; /* lWww */ - arg64[0] = ((uint64_t) arg32[1]) << 32; /* Lwww (hi) */ - arg64[0] |= (uint64_t) arg32[3]; /* Lwww (lo) */ -} - -void -munge_lw( - const void *in32, - void *out64) -{ - const uint32_t *arg32; - uint64_t *arg64; - - arg32 = (const uint32_t *) in32; - arg64 = (uint64_t *) out64; - - arg64[1] = arg32[5]; /* lW */ - arg64[0] = ((uint64_t) arg32[1]) << 32; /* Lw (hi) */ - arg64[0] |= (uint64_t) arg32[3]; /* Lw (lo) */ -} diff --git a/bsd/dev/ppc/unix_signal.c b/bsd/dev/ppc/unix_signal.c deleted file mode 100644 index 4ca48b0b7..000000000 --- a/bsd/dev/ppc/unix_signal.c +++ /dev/null @@ -1,953 +0,0 @@ 
-/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. - */ - -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include - -// #include XXX include path messed up for some reason... 
- -/* XXX functions not in a Mach headers */ -extern kern_return_t thread_getstatus(register thread_t act, int flavor, - thread_state_t tstate, mach_msg_type_number_t *count); -extern unsigned int get_msr_exportmask(void); -extern kern_return_t thread_setstatus(thread_t thread, int flavor, - thread_state_t tstate, mach_msg_type_number_t count); -extern void ppc_checkthreadstate(void *, int); -extern struct savearea_vec *find_user_vec_curr(void); -extern int thread_enable_fpe(thread_t act, int onoff); - - - -#define C_32_REDZONE_LEN 224 -#define C_32_STK_ALIGN 16 -#define C_32_PARAMSAVE_LEN 64 -#define C_32_LINKAGE_LEN 48 - -#define C_64_REDZONE_LEN 320 -#define C_64_STK_ALIGN 32 -#define C_64_PARAMSAVE_LEN 64 -#define C_64_LINKAGE_LEN 48 - -#define TRUNC_DOWN32(a,b,c) ((((uint32_t)a)-(b)) & ((uint32_t)(-(c)))) -#define TRUNC_DOWN64(a,b,c) ((((uint64_t)a)-(b)) & ((uint64_t)(-(c)))) - -/* - * The stack layout possibilities (info style); This needs to mach with signal trampoline code - * - * Traditional: 1 - * Traditional64: 20 - * Traditional64with vec: 25 - * 32bit context 30 - * 32bit context with vector 35 - * 64bit context 40 - * 64bit context with vector 45 - * Dual context 50 - * Dual context with vector 55 - * - */ - -#define UC_TRAD 1 -#define UC_TRAD_VEC 6 -#define UC_TRAD64 20 -#define UC_TRAD64_VEC 25 -#define UC_FLAVOR 30 -#define UC_FLAVOR_VEC 35 -#define UC_FLAVOR64 40 -#define UC_FLAVOR64_VEC 45 -#define UC_DUAL 50 -#define UC_DUAL_VEC 55 -#define UC_SET_ALT_STACK 0x40000000 -#define UC_RESET_ALT_STACK 0x80000000 - - /* The following are valid mcontext sizes */ -#define UC_FLAVOR_SIZE ((PPC_THREAD_STATE_COUNT + PPC_EXCEPTION_STATE_COUNT + PPC_FLOAT_STATE_COUNT) * sizeof(int)) - -#define UC_FLAVOR_VEC_SIZE ((PPC_THREAD_STATE_COUNT + PPC_EXCEPTION_STATE_COUNT + PPC_FLOAT_STATE_COUNT + PPC_VECTOR_STATE_COUNT) * sizeof(int)) - -#define UC_FLAVOR64_SIZE ((PPC_THREAD_STATE64_COUNT + PPC_EXCEPTION_STATE64_COUNT + PPC_FLOAT_STATE_COUNT) * sizeof(int)) - 
-#define UC_FLAVOR64_VEC_SIZE ((PPC_THREAD_STATE64_COUNT + PPC_EXCEPTION_STATE64_COUNT + PPC_FLOAT_STATE_COUNT + PPC_VECTOR_STATE_COUNT) * sizeof(int)) - - -/* - * NOTE: Source and target may *NOT* overlap! - */ -static void -ucontext_32to64(struct ucontext64 *in, struct user_ucontext64 *out) -{ - out->uc_onstack = in->uc_onstack; - out->uc_sigmask = in->uc_sigmask; - - /* internal "structure assign" */ - out->uc_stack.ss_sp = CAST_USER_ADDR_T(in->uc_stack.ss_sp); - out->uc_stack.ss_size = in->uc_stack.ss_size; - out->uc_stack.ss_flags = in->uc_stack.ss_flags; - - out->uc_link = CAST_USER_ADDR_T(in->uc_link); - out->uc_mcsize = in->uc_mcsize; - out->uc_mcontext64 = CAST_USER_ADDR_T(in->uc_mcontext64); -} - -/* - * This conversion is safe, since if we are converting for a 32 bit process, - * then it's values of uc-stack.ss_size and uc_mcsize will never exceed 4G. - * - * NOTE: Source and target may *NOT* overlap! - */ -static void -ucontext_64to32(struct user_ucontext64 *in, struct ucontext64 *out) -{ - out->uc_onstack = in->uc_onstack; - out->uc_sigmask = in->uc_sigmask; - - /* internal "structure assign" */ - out->uc_stack.ss_sp = CAST_DOWN(void *,in->uc_stack.ss_sp); - out->uc_stack.ss_size = in->uc_stack.ss_size; /* range reduction */ - out->uc_stack.ss_flags = in->uc_stack.ss_flags; - - out->uc_link = CAST_DOWN(void *,in->uc_link); - out->uc_mcsize = in->uc_mcsize; /* range reduction */ - out->uc_mcontext64 = CAST_DOWN(void *,in->uc_mcontext64); -} - -/* - * NOTE: Source and target may *NOT* overlap! 
- */ -static void -siginfo_user_to_user32(user_siginfo_t *in, user32_siginfo_t *out) -{ - out->si_signo = in->si_signo; - out->si_errno = in->si_errno; - out->si_code = in->si_code; - out->si_pid = in->si_pid; - out->si_uid = in->si_uid; - out->si_status = in->si_status; - out->si_addr = CAST_DOWN_EXPLICIT(user32_addr_t,in->si_addr); - /* following cast works for sival_int because of padding */ - out->si_value.sival_ptr = CAST_DOWN_EXPLICIT(user32_addr_t,in->si_value.sival_ptr); - out->si_band = in->si_band; /* range reduction */ - out->__pad[0] = in->pad[0]; /* mcontext.ss.r1 */ -} - -static void -siginfo_user_to_user64(user_siginfo_t *in, user64_siginfo_t *out) -{ - out->si_signo = in->si_signo; - out->si_errno = in->si_errno; - out->si_code = in->si_code; - out->si_pid = in->si_pid; - out->si_uid = in->si_uid; - out->si_status = in->si_status; - out->si_addr = in->si_addr; - out->si_value.sival_ptr = in->si_value.sival_ptr; - out->si_band = in->si_band; /* range reduction */ - out->__pad[0] = in->pad[0]; /* mcontext.ss.r1 */ -} - - -/* - * Arrange for this process to run a signal handler - */ - -void -sendsig(struct proc *p, user_addr_t catcher, int sig, int mask, __unused uint32_t code) -{ - kern_return_t kretn; - struct mcontext mctx; - user_addr_t p_mctx = USER_ADDR_NULL; /* mcontext dest. */ - struct mcontext64 mctx64; - user_addr_t p_mctx64 = USER_ADDR_NULL; /* mcontext dest. 
*/ - struct user_ucontext64 uctx; - user_addr_t p_uctx; /* user stack addr top copy ucontext */ - user_siginfo_t sinfo; - user_addr_t p_sinfo; /* user stack addr top copy siginfo */ - struct sigacts *ps = p->p_sigacts; - int oonstack; - user_addr_t sp; - mach_msg_type_number_t state_count; - thread_t th_act; - struct uthread *ut; - int infostyle = UC_TRAD; - int dualcontext =0; - user_addr_t trampact; - int vec_used = 0; - int stack_size = 0; - void * tstate; - int flavor; - int ctx32 = 1; - - th_act = current_thread(); - ut = get_bsdthread_info(th_act); - - /* - * XXX We conditionalize type passed here based on SA_SIGINFO, but - * XXX we always send up all the information, regardless; perhaps - * XXX this should not be conditionalized? Defer making this change - * XXX now, due to possible tools impact. - */ - if (p->p_sigacts->ps_siginfo & sigmask(sig)) { - /* - * If SA_SIGINFO is set, then we must provide the user - * process both a siginfo_t and a context argument. We call - * this "FLAVORED", as opposed to "TRADITIONAL", which doesn't - * expect a context. "DUAL" is a type of "FLAVORED". - */ - if (is_64signalregset()) { - /* - * If this is a 64 bit CPU, we must include a 64 bit - * context in the data we pass to user space; we may - * or may not also include a 32 bit context at the - * same time, for non-leaf functions. - * - * The user may also explicitly choose to not receive - * a 32 bit context, at their option; we only allow - * this to happen on 64 bit processors, for obvious - * reasons. - */ - if (IS_64BIT_PROCESS(p) || - (p->p_sigacts->ps_64regset & sigmask(sig))) { - /* - * For a 64 bit process, there is no 32 bit - * context. - */ - ctx32 = 0; - infostyle = UC_FLAVOR64; - } else { - /* - * For a 32 bit process on a 64 bit CPU, we - * may have 64 bit leaf functions, so we need - * both contexts. - */ - dualcontext = 1; - infostyle = UC_DUAL; - } - } else { - /* - * If this is a 32 bit CPU, then we only have a 32 bit - * context to contend with. 
- */ - infostyle = UC_FLAVOR; - } - } else { - /* - * If SA_SIGINFO is not set, then we have a traditional style - * call which does not need additional context passed. The - * default is 32 bit traditional. - * - * XXX The second check is redundant on PPC32; keep it anyway. - */ - if (is_64signalregset() || IS_64BIT_PROCESS(p)) { - /* - * However, if this is a 64 bit CPU, we need to change - * this to 64 bit traditional, and drop the 32 bit - * context. - */ - ctx32 = 0; - infostyle = UC_TRAD64; - } - } - - proc_unlock(p); - - /* I need this for SIGINFO anyway */ - flavor = PPC_THREAD_STATE; - tstate = (void *)&mctx.ss; - state_count = PPC_THREAD_STATE_COUNT; - if (thread_getstatus(th_act, flavor, (thread_state_t)tstate, &state_count) != KERN_SUCCESS) - goto bad; - - if ((ctx32 == 0) || dualcontext) { - flavor = PPC_THREAD_STATE64; - tstate = (void *)&mctx64.ss; - state_count = PPC_THREAD_STATE64_COUNT; - if (thread_getstatus(th_act, flavor, (thread_state_t)tstate, &state_count) != KERN_SUCCESS) - goto bad; - } - - if ((ctx32 == 1) || dualcontext) { - flavor = PPC_EXCEPTION_STATE; - tstate = (void *)&mctx.es; - state_count = PPC_EXCEPTION_STATE_COUNT; - if (thread_getstatus(th_act, flavor, (thread_state_t)tstate, &state_count) != KERN_SUCCESS) - goto bad; - } - - if ((ctx32 == 0) || dualcontext) { - flavor = PPC_EXCEPTION_STATE64; - tstate = (void *)&mctx64.es; - state_count = PPC_EXCEPTION_STATE64_COUNT; - - if (thread_getstatus(th_act, flavor, (thread_state_t)tstate, &state_count) != KERN_SUCCESS) - goto bad; - - } - - - if ((ctx32 == 1) || dualcontext) { - flavor = PPC_FLOAT_STATE; - tstate = (void *)&mctx.fs; - state_count = PPC_FLOAT_STATE_COUNT; - if (thread_getstatus(th_act, flavor, (thread_state_t)tstate, &state_count) != KERN_SUCCESS) - goto bad; - } - - if ((ctx32 == 0) || dualcontext) { - flavor = PPC_FLOAT_STATE; - tstate = (void *)&mctx64.fs; - state_count = PPC_FLOAT_STATE_COUNT; - if (thread_getstatus(th_act, flavor, (thread_state_t)tstate, 
&state_count) != KERN_SUCCESS) - goto bad; - - } - - - if (find_user_vec_curr()) { - vec_used = 1; - - if ((ctx32 == 1) || dualcontext) { - flavor = PPC_VECTOR_STATE; - tstate = (void *)&mctx.vs; - state_count = PPC_VECTOR_STATE_COUNT; - if (thread_getstatus(th_act, flavor, (thread_state_t)tstate, &state_count) != KERN_SUCCESS) - goto bad; - infostyle += 5; - } - - if ((ctx32 == 0) || dualcontext) { - flavor = PPC_VECTOR_STATE; - tstate = (void *)&mctx64.vs; - state_count = PPC_VECTOR_STATE_COUNT; - if (thread_getstatus(th_act, flavor, (thread_state_t)tstate, &state_count) != KERN_SUCCESS) - goto bad; - infostyle += 5; - } - } - - trampact = ps->ps_trampact[sig]; - oonstack = ut->uu_sigstk.ss_flags & SA_ONSTACK; - - /* figure out where our new stack lives */ - if ((ut->uu_flag & UT_ALTSTACK) && !oonstack && - (ps->ps_sigonstack & sigmask(sig))) { - sp = ut->uu_sigstk.ss_sp; - sp += ut->uu_sigstk.ss_size; - stack_size = ut->uu_sigstk.ss_size; - ut->uu_sigstk.ss_flags |= SA_ONSTACK; - } - else { - if (ctx32 == 0) - sp = mctx64.ss.r1; - else - sp = CAST_USER_ADDR_T(mctx.ss.r1); - } - - - /* put siginfo on top */ - - /* preserve RED ZONE area */ - if (IS_64BIT_PROCESS(p)) - sp = TRUNC_DOWN64(sp, C_64_REDZONE_LEN, C_64_STK_ALIGN); - else - sp = TRUNC_DOWN32(sp, C_32_REDZONE_LEN, C_32_STK_ALIGN); - - /* next are the saved registers */ - if ((ctx32 == 0) || dualcontext) { - sp -= sizeof(struct mcontext64); - p_mctx64 = sp; - } - if ((ctx32 == 1) || dualcontext) { - sp -= sizeof(struct mcontext); - p_mctx = sp; - } - - if (IS_64BIT_PROCESS(p)) { - /* context goes first on stack */ - sp -= sizeof(struct user_ucontext64); - p_uctx = sp; - - /* this is where siginfo goes on stack */ - sp -= sizeof(user64_siginfo_t); - p_sinfo = sp; - - sp = TRUNC_DOWN64(sp, C_64_PARAMSAVE_LEN+C_64_LINKAGE_LEN, C_64_STK_ALIGN); - } else { - /* - * struct ucontext and struct ucontext64 are identical in - * size and content; the only difference is the internal - * pointer type for the last 
element, which makes no - * difference for the copyout(). - */ - - /* context goes first on stack */ - sp -= sizeof(struct ucontext64); - p_uctx = sp; - - /* this is where siginfo goes on stack */ - sp -= sizeof(user32_siginfo_t); - p_sinfo = sp; - - sp = TRUNC_DOWN32(sp, C_32_PARAMSAVE_LEN+C_32_LINKAGE_LEN, C_32_STK_ALIGN); - } - - uctx.uc_onstack = oonstack; - uctx.uc_sigmask = mask; - uctx.uc_stack.ss_sp = sp; - uctx.uc_stack.ss_size = stack_size; - if (oonstack) - uctx.uc_stack.ss_flags |= SS_ONSTACK; - - uctx.uc_link = 0; - if (ctx32 == 0) - uctx.uc_mcsize = (size_t)((PPC_EXCEPTION_STATE64_COUNT + PPC_THREAD_STATE64_COUNT + PPC_FLOAT_STATE_COUNT) * sizeof(int)); - else - uctx.uc_mcsize = (size_t)((PPC_EXCEPTION_STATE_COUNT + PPC_THREAD_STATE_COUNT + PPC_FLOAT_STATE_COUNT) * sizeof(int)); - - if (vec_used) - uctx.uc_mcsize += (size_t)(PPC_VECTOR_STATE_COUNT * sizeof(int)); - - if (ctx32 == 0) - uctx.uc_mcontext64 = p_mctx64; - else - uctx.uc_mcontext64 = p_mctx; - - /* setup siginfo */ - bzero((caddr_t)&sinfo, sizeof(sinfo)); - sinfo.si_signo = sig; - if (ctx32 == 0) { - sinfo.si_addr = mctx64.ss.srr0; - sinfo.pad[0] = mctx64.ss.r1; - } else { - sinfo.si_addr = CAST_USER_ADDR_T(mctx.ss.srr0); - sinfo.pad[0] = CAST_USER_ADDR_T(mctx.ss.r1); - } - - switch (sig) { - case SIGILL: - /* - * If it's 64 bit and not a dual context, mctx will - * contain uninitialized data, so we have to use - * mctx64 here. 
- */ - if(ctx32 == 0) { - if (mctx64.ss.srr1 & (1 << (31 - SRR1_PRG_ILL_INS_BIT))) - sinfo.si_code = ILL_ILLOPC; - else if (mctx64.ss.srr1 & (1 << (31 - SRR1_PRG_PRV_INS_BIT))) - sinfo.si_code = ILL_PRVOPC; - else if (mctx64.ss.srr1 & (1 << (31 - SRR1_PRG_TRAP_BIT))) - sinfo.si_code = ILL_ILLTRP; - else - sinfo.si_code = ILL_NOOP; - } else { - if (mctx.ss.srr1 & (1 << (31 - SRR1_PRG_ILL_INS_BIT))) - sinfo.si_code = ILL_ILLOPC; - else if (mctx.ss.srr1 & (1 << (31 - SRR1_PRG_PRV_INS_BIT))) - sinfo.si_code = ILL_PRVOPC; - else if (mctx.ss.srr1 & (1 << (31 - SRR1_PRG_TRAP_BIT))) - sinfo.si_code = ILL_ILLTRP; - else - sinfo.si_code = ILL_NOOP; - } - break; - case SIGFPE: -#define FPSCR_VX 2 -#define FPSCR_OX 3 -#define FPSCR_UX 4 -#define FPSCR_ZX 5 -#define FPSCR_XX 6 - /* - * If it's 64 bit and not a dual context, mctx will - * contain uninitialized data, so we have to use - * mctx64 here. - */ - if(ctx32 == 0) { - if (mctx64.fs.fpscr & (1 << (31 - FPSCR_VX))) - sinfo.si_code = FPE_FLTINV; - else if (mctx64.fs.fpscr & (1 << (31 - FPSCR_OX))) - sinfo.si_code = FPE_FLTOVF; - else if (mctx64.fs.fpscr & (1 << (31 - FPSCR_UX))) - sinfo.si_code = FPE_FLTUND; - else if (mctx64.fs.fpscr & (1 << (31 - FPSCR_ZX))) - sinfo.si_code = FPE_FLTDIV; - else if (mctx64.fs.fpscr & (1 << (31 - FPSCR_XX))) - sinfo.si_code = FPE_FLTRES; - else - sinfo.si_code = FPE_NOOP; - } else { - if (mctx.fs.fpscr & (1 << (31 - FPSCR_VX))) - sinfo.si_code = FPE_FLTINV; - else if (mctx.fs.fpscr & (1 << (31 - FPSCR_OX))) - sinfo.si_code = FPE_FLTOVF; - else if (mctx.fs.fpscr & (1 << (31 - FPSCR_UX))) - sinfo.si_code = FPE_FLTUND; - else if (mctx.fs.fpscr & (1 << (31 - FPSCR_ZX))) - sinfo.si_code = FPE_FLTDIV; - else if (mctx.fs.fpscr & (1 << (31 - FPSCR_XX))) - sinfo.si_code = FPE_FLTRES; - else - sinfo.si_code = FPE_NOOP; - } - break; - - case SIGBUS: - if (ctx32 == 0) { - sinfo.si_addr = mctx64.es.dar; - } else { - sinfo.si_addr = CAST_USER_ADDR_T(mctx.es.dar); - } - /* on ppc we generate only if 
EXC_PPC_UNALIGNED */ - sinfo.si_code = BUS_ADRALN; - break; - - case SIGSEGV: - /* - * If it's 64 bit and not a dual context, mctx will - * contain uninitialized data, so we have to use - * mctx64 here. - */ - if (ctx32 == 0) { - sinfo.si_addr = mctx64.es.dar; - /* First check in srr1 and then in dsisr */ - if (mctx64.ss.srr1 & (1 << (31 - DSISR_PROT_BIT))) - sinfo.si_code = SEGV_ACCERR; - else if (mctx64.es.dsisr & (1 << (31 - DSISR_PROT_BIT))) - sinfo.si_code = SEGV_ACCERR; - else - sinfo.si_code = SEGV_MAPERR; - } else { - sinfo.si_addr = CAST_USER_ADDR_T(mctx.es.dar); - /* First check in srr1 and then in dsisr */ - if (mctx.ss.srr1 & (1 << (31 - DSISR_PROT_BIT))) - sinfo.si_code = SEGV_ACCERR; - else if (mctx.es.dsisr & (1 << (31 - DSISR_PROT_BIT))) - sinfo.si_code = SEGV_ACCERR; - else - sinfo.si_code = SEGV_MAPERR; - } - break; - default: - { - int status_and_exitcode; - - /* - * All other signals need to fill out a minimum set of - * information for the siginfo structure passed into - * the signal handler, if SA_SIGINFO was specified. - * - * p->si_status actually contains both the status and - * the exit code; we save it off in its own variable - * for later breakdown. 
- */ - proc_lock(p); - sinfo.si_pid = p->si_pid; - p->si_pid = 0; - status_and_exitcode = p->si_status; - p->si_status = 0; - sinfo.si_uid = p->si_uid; - p->si_uid = 0; - sinfo.si_code = p->si_code; - p->si_code = 0; - proc_unlock(p); - if (sinfo.si_code == CLD_EXITED) { - if (WIFEXITED(status_and_exitcode)) - sinfo.si_code = CLD_EXITED; - else if (WIFSIGNALED(status_and_exitcode)) { - if (WCOREDUMP(status_and_exitcode)) { - sinfo.si_code = CLD_DUMPED; - status_and_exitcode = W_EXITCODE(status_and_exitcode,status_and_exitcode); - } else { - sinfo.si_code = CLD_KILLED; - status_and_exitcode = W_EXITCODE(status_and_exitcode,status_and_exitcode); - } - } - } - /* - * The recorded status contains the exit code and the - * signal information, but the information to be passed - * in the siginfo to the handler is supposed to only - * contain the status, so we have to shift it out. - */ - sinfo.si_status = WEXITSTATUS(status_and_exitcode); - break; - } - } - - - /* copy info out to user space */ - if (IS_64BIT_PROCESS(p)) { - user64_siginfo_t sinfo64; - - siginfo_user_to_user64(&sinfo,&sinfo64); - -#if CONFIG_DTRACE - bzero((caddr_t)&(ut->t_dtrace_siginfo), sizeof(ut->t_dtrace_siginfo)); - - ut->t_dtrace_siginfo.si_signo = sinfo.si_signo; - ut->t_dtrace_siginfo.si_code = sinfo.si_code; - ut->t_dtrace_siginfo.si_pid = sinfo.si_pid; - ut->t_dtrace_siginfo.si_uid = sinfo.si_uid; - ut->t_dtrace_siginfo.si_status = sinfo.si_status; - /* XXX truncates faulting address to void * on K32 */ - ut->t_dtrace_siginfo.si_addr = CAST_DOWN(void *, sinfo.si_addr); - - - /* Fire DTrace proc:::fault probe when signal is generated by hardware. 
*/ - switch (sig) { - case SIGILL: case SIGBUS: case SIGSEGV: case SIGFPE: case SIGTRAP: - DTRACE_PROC2(fault, int, (int)(ut->uu_code), siginfo_t *, &(ut->t_dtrace_siginfo)); - break; - default: - break; - } - - /* XXX truncates catcher address to uintptr_t */ - DTRACE_PROC3(signal__handle, int, sig, siginfo_t *, &(ut->t_dtrace_siginfo), - void (*)(void), CAST_DOWN(sig_t, catcher)); -#endif /* CONFIG_DTRACE */ - - if (copyout(&uctx, p_uctx, sizeof(struct user_ucontext64))) - goto bad; - if (copyout(&sinfo64, p_sinfo, sizeof(sinfo64))) - goto bad; - } else { - struct ucontext64 uctx32; - user32_siginfo_t sinfo32; - - ucontext_64to32(&uctx, &uctx32); - siginfo_user_to_user32(&sinfo,&sinfo32); - -#if CONFIG_DTRACE - bzero((caddr_t)&(ut->t_dtrace_siginfo), sizeof(ut->t_dtrace_siginfo)); - - ut->t_dtrace_siginfo.si_signo = sinfo.si_signo; - ut->t_dtrace_siginfo.si_code = sinfo.si_code; - ut->t_dtrace_siginfo.si_pid = sinfo.si_pid; - ut->t_dtrace_siginfo.si_uid = sinfo.si_uid; - ut->t_dtrace_siginfo.si_status = sinfo.si_status; - ut->t_dtrace_siginfo.si_addr = CAST_DOWN(void *, sinfo.si_addr); - - - /* Fire DTrace proc:::fault probe when signal is generated by hardware. */ - switch (sig) { - case SIGILL: case SIGBUS: case SIGSEGV: case SIGFPE: case SIGTRAP: - DTRACE_PROC2(fault, int, (int)(ut->uu_code), siginfo_t *, &(ut->t_dtrace_siginfo)); - break; - default: - break; - } - - DTRACE_PROC3(signal__handle, int, sig, siginfo_t *, &(ut->t_dtrace_siginfo), - void (*)(void), CAST_DOWN(sig_t, catcher)); -#endif /* CONFIG_DTRACE */ - - if (copyout(&uctx32, p_uctx, sizeof(struct ucontext64))) - goto bad; - - if (copyout(&sinfo32, p_sinfo, sizeof(sinfo32))) - goto bad; - } - if ((ctx32 == 0) || dualcontext) { - /* - * NOTE: Size of mcontext is not variant between 64bit and - * 32bit programs usng 64bit registers. - */ - if (copyout(&mctx64, p_mctx64, (vec_used? 
UC_FLAVOR64_VEC_SIZE: UC_FLAVOR64_SIZE))) - goto bad; - } - if ((ctx32 == 1) || dualcontext) { - if (copyout(&mctx, p_mctx, uctx.uc_mcsize)) - goto bad; - } - - - /* Place our arguments in arg registers: rtm dependent */ - if(IS_64BIT_PROCESS(p)) { - mctx64.ss.r3 = catcher; - mctx64.ss.r4 = CAST_USER_ADDR_T(infostyle); - mctx64.ss.r5 = CAST_USER_ADDR_T(sig); - mctx64.ss.r6 = p_sinfo; - mctx64.ss.r7 = p_uctx; - - mctx64.ss.srr0 = trampact; - /* MSR_EXPORT_MASK_SET */ - mctx64.ss.srr1 = CAST_USER_ADDR_T(get_msr_exportmask()); - mctx64.ss.r1 = sp; - state_count = PPC_THREAD_STATE64_COUNT; - if ((kretn = thread_setstatus(th_act, PPC_THREAD_STATE64, (void *)&mctx64.ss, state_count)) != KERN_SUCCESS) { - panic("sendsig: thread_setstatus failed, ret = %08X\n", kretn); - } - } else { - mctx.ss.r3 = CAST_DOWN(uint32_t,catcher); - mctx.ss.r4 = (uint32_t)infostyle; - mctx.ss.r5 = (uint32_t)sig; - mctx.ss.r6 = CAST_DOWN(uint32_t,p_sinfo); - mctx.ss.r7 = CAST_DOWN(uint32_t,p_uctx); - - mctx.ss.srr0 = CAST_DOWN(uint32_t,trampact); - /* MSR_EXPORT_MASK_SET */ - mctx.ss.srr1 = get_msr_exportmask(); - mctx.ss.r1 = CAST_DOWN(uint32_t,sp); - state_count = PPC_THREAD_STATE_COUNT; - if ((kretn = thread_setstatus(th_act, PPC_THREAD_STATE, (void *)&mctx.ss, state_count)) != KERN_SUCCESS) { - panic("sendsig: thread_setstatus failed, ret = %08X\n", kretn); - } - } - - proc_lock(p); - return; - -bad: - proc_lock(p); - SIGACTION(p, SIGILL) = SIG_DFL; - sig = sigmask(SIGILL); - p->p_sigignore &= ~sig; - p->p_sigcatch &= ~sig; - ut->uu_sigmask &= ~sig; - /* sendsig is called with signal lock held */ - proc_unlock(p); - psignal_locked(p, SIGILL); - proc_lock(p); - return; -} - -/* - * System call to cleanup state after a signal - * has been taken. Reset signal mask and - * stack state from context left by sendsig (above). - * Return to previous pc and psl as specified by - * context left by sendsig. 
Check carefully to - * make sure that the user has not modified the - * psl to gain improper priviledges or to cause - * a machine fault. - */ - -/* ARGSUSED */ -int -sigreturn(struct proc *p, struct sigreturn_args *uap, __unused int *retval) -{ - struct user_ucontext64 uctx; - - char mactx[sizeof(struct mcontext64)]; - struct mcontext *p_mctx; - struct mcontext64 *p_64mctx; - int error; - thread_t th_act; - struct sigacts *ps = p->p_sigacts; - sigset_t mask; - user_addr_t action; - uint32_t state_count; - unsigned int state_flavor; - struct uthread * ut; - int vec_used = 0; - void *tsptr, *fptr, *vptr; - int infostyle = uap->infostyle; - - th_act = current_thread(); - - ut = (struct uthread *)get_bsdthread_info(th_act); - - /* - * If we are being asked to change the altstack flag on the thread, we - * just rest it and return (the uap->uctx is not used). - */ - if (infostyle == UC_SET_ALT_STACK) { - ut->uu_sigstk.ss_flags |= SA_ONSTACK; - return (0); - } else if ((unsigned int)infostyle == UC_RESET_ALT_STACK) { - ut->uu_sigstk.ss_flags &= ~SA_ONSTACK; - return (0); - } - - if (IS_64BIT_PROCESS(p)) { - error = copyin(uap->uctx, &uctx, sizeof(struct user_ucontext64)); - if (error) - return(error); - } else { - struct ucontext64 uctx32; - - /* - * struct ucontext and struct ucontext64 are identical in - * size and content; the only difference is the internal - * pointer type for the last element, which makes no - * difference for the copyin(). - */ - error = copyin(uap->uctx, &uctx32, sizeof(struct ucontext)); - if (error) - return(error); - ucontext_32to64(&uctx32, &uctx); - } - - - /* validate the machine context size */ - switch (uctx.uc_mcsize) { - case UC_FLAVOR64_VEC_SIZE: - case UC_FLAVOR64_SIZE: - case UC_FLAVOR_VEC_SIZE: - case UC_FLAVOR_SIZE: - break; - default: - return(EINVAL); - } - - /* - * The 64 bit process mcontext is identical to the mcontext64, so - * there is no conversion necessary. 
- */ - error = copyin(uctx.uc_mcontext64, mactx, uctx.uc_mcsize); - if (error) - return(error); - - if ((uctx.uc_onstack & 01)) - ut->uu_sigstk.ss_flags |= SA_ONSTACK; - else - ut->uu_sigstk.ss_flags &= ~SA_ONSTACK; - - ut->uu_sigmask = uctx.uc_sigmask & ~sigcantmask; - if (ut->uu_siglist & ~ut->uu_sigmask) - signal_setast(current_thread()); - - vec_used = 0; - switch (infostyle) { - case UC_FLAVOR64_VEC: - case UC_TRAD64_VEC: - vec_used = 1; - case UC_TRAD64: - case UC_FLAVOR64: { - p_64mctx = (struct mcontext64 *)mactx; - tsptr = (void *)&p_64mctx->ss; - fptr = (void *)&p_64mctx->fs; - vptr = (void *)&p_64mctx->vs; - state_flavor = PPC_THREAD_STATE64; - state_count = PPC_THREAD_STATE64_COUNT; - } - break; - case UC_FLAVOR_VEC : - case UC_TRAD_VEC : - vec_used = 1; - case UC_FLAVOR : - case UC_TRAD : - default: { - p_mctx = (struct mcontext *)mactx; - tsptr = (void *)&p_mctx->ss; - fptr = (void *)&p_mctx->fs; - vptr = (void *)&p_mctx->vs; - state_flavor = PPC_THREAD_STATE; - state_count = PPC_THREAD_STATE_COUNT; - } - break; - } /* switch () */ - - /* validate the thread state, set/reset appropriate mode bits in srr1 */ - (void)ppc_checkthreadstate(tsptr, state_flavor); - - if (thread_setstatus(th_act, state_flavor, tsptr, state_count) != KERN_SUCCESS) { - return(EINVAL); - } - - state_count = PPC_FLOAT_STATE_COUNT; - if (thread_setstatus(th_act, PPC_FLOAT_STATE, fptr, state_count) != KERN_SUCCESS) { - return(EINVAL); - } - - mask = sigmask(SIGFPE); - if (((ut->uu_sigmask & mask) == 0) && (p->p_sigcatch & mask) && ((p->p_sigignore & mask) == 0)) { - action = ps->ps_sigact[SIGFPE]; - if((action != SIG_DFL) && (action != SIG_IGN)) { - thread_enable_fpe(th_act, 1); - } - } - - if (vec_used) { - state_count = PPC_VECTOR_STATE_COUNT; - if (thread_setstatus(th_act, PPC_VECTOR_STATE, vptr, state_count) != KERN_SUCCESS) { - return(EINVAL); - } - } - return (EJUSTRETURN); -} - -/* - * machine_exception() performs MD translation - * of a mach exception to a unix signal and 
code. - */ - -boolean_t -machine_exception( - int exception, - mach_exception_code_t code, - __unused mach_exception_subcode_t subcode, - int *unix_signal, - mach_exception_code_t *unix_code) -{ - switch(exception) { - - case EXC_BAD_INSTRUCTION: - *unix_signal = SIGILL; - *unix_code = code; - break; - - case EXC_ARITHMETIC: - *unix_signal = SIGFPE; - *unix_code = code; - break; - - case EXC_SOFTWARE: - if (code == EXC_PPC_TRAP) { - *unix_signal = SIGTRAP; - *unix_code = code; - break; - } else - return(FALSE); - - default: - return(FALSE); - } - - return(TRUE); -} - diff --git a/bsd/dev/ppc/xsumas.s b/bsd/dev/ppc/xsumas.s deleted file mode 100644 index 6ac06e947..000000000 --- a/bsd/dev/ppc/xsumas.s +++ /dev/null @@ -1,401 +0,0 @@ -/* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#define kShort 11 -#define cr1_gt 5 // bit 1 of cr1 - -/* - * short xsum_assym( short *p, int len, short xsum, boolean odd); - * - * r3 - Pointer to data - * r4 - Length of data - * r5 - Accumulated sum value - * r6 -"Starting on odd address" flag (relative to byte 0 of the checksumed data) - * - * Note: If the "odd" flag is set, the address in r3 will be even. Nonetheless, we - * correctly handle the case where the flag is set and the address is odd. - * - * This is the internet (IP, TCP) checksum algorithm, which is the 1s-complement sum - * of the data, treated as an array of 16-bit integers. 1s-complement sums are done - * via "add with carry" operations on a 2s-complement machine like PPC. Note that - * the adds can be done in parallel on 32-bit (or 64-bit) registers, as long as the - * final sum is folded down to 16 bits. On 32-bit machines we use "adde", which is - * perfect except that it serializes the adds on the carry bit. On 64-bit machines - * we avoid this serialization by adding 32-bit words into 64-bit sums, then folding - * all 64-bits into a 16-bit sum at the end. We cannot use "adde" on 64-bit sums, - * because the kernel runs in 32-bit mode even on 64-bit machines (so the carry bit - * is set on the low 32-bits of the sum.) - * - * Using Altivec is tempting, but the performance impact of the greatly increased - * number of exceptions and register save/restore traffic probably make it impractical - * for now. - */ - .globl _xsum_assym - .globl _xsum_nop_if_32bit - .text - .align 5 -_xsum_assym: - cmplwi cr0,r4,kShort ; too short to word align? - rlwinm r2,r3,0,0x3 ; get byte offset in word - dcbt 0,r3 ; touch in 1st cache line - cmpwi cr6,r2,0 ; is address word aligned? 
- ble cr0,Lshort ; skip if too short to bother aligning - - subfic r0,r2,4 ; get #bytes in partial word - cmplwi cr1,r6,0 ; set cr1_gt if "starting on odd address" flag is set - addic r0,r0,0 ; turn off carry - beq cr6,Laligned ; skip if already word aligned (r2==0 if aligned) - -; Partial word at start: zero filled on left, it becomes initial checksum. - - rlwinm r3,r3,0,0,29 ; word align address - mtcrf 0x01,r2 ; move byte offset to cr7 - lwz r6,0(r3) ; get partial word - li r7,-1 ; start of mask for partial fill - slwi r8,r2,3 ; multiply byte offset by 8 - sub r4,r4,r0 ; adjust length for bytes in partial word - crxor cr1_gt,31,cr1_gt; set flag if byte-lane swap will be necessary - srw r7,r7,r8 ; get mask for bytes to keep in partial word - addi r3,r3,4 ; point to next word of input - and r2,r6,r7 ; zero fill on left - -; Address is now word aligned. Prepare for inner loop over 32-byte chunks. -; r2 = initial checksum -; r3 = word aligned address -; r4 = length remaining -; r5 = accumulated sum parameter -; carry = off -; cr1_gt = "starting on odd address" flag - -Laligned: - srwi. r0,r4,5 ; get count of 32-byte chunks - mtcrf 0x02,r4 ; move residual length to cr6 and cr7 - mtcrf 0x01,r4 - beq cr0,Lleftovers ; no chunks - - mtctr r0 ; set up loop count - li r4,32 ; offset to next chunk -_xsum_nop_if_32bit: - b L64BitPath ; use the 64-bit path (patched to nop on 32-bit machine) - dcbt r4,r3 ; touch in 2nd cache line - li r0,96 ; get touch offset - b LInnerLoop32 ; enter 32-bit loop - -; Inner loop for 32-bit machines. - - .align 4 -LInnerLoop32: - lwz r4,0(r3) - lwz r6,4(r3) - lwz r7,8(r3) - lwz r8,12(r3) - adde r2,r2,r4 - lwz r9,16(r3) - adde r2,r2,r6 - lwz r10,20(r3) - adde r2,r2,r7 - lwz r11,24(r3) - adde r2,r2,r8 - lwz r12,28(r3) - adde r2,r2,r9 - dcbt r3,r0 - adde r2,r2,r10 - addi r3,r3,32 - adde r2,r2,r11 - adde r2,r2,r12 - bdnz+ LInnerLoop32 - -; Handle leftover bytes. 
-; r2 = checksum so far -; r3 = word aligned address -; r5 = accumulated sum parameter -; carry = live -; cr1_gt = "starting on odd address" flag -; cr6,cr7 = residual length - -Lleftovers: - bf 27,Lleftover8 ; test 0x10 bit of residual length - lwz r4,0(r3) - lwz r6,4(r3) - lwz r7,8(r3) - lwz r8,12(r3) - addi r3,r3,16 - adde r2,r2,r4 - adde r2,r2,r6 - adde r2,r2,r7 - adde r2,r2,r8 -Lleftover8: - bf 28,Lleftover4 - lwz r4,0(r3) - lwz r6,4(r3) - addi r3,r3,8 - adde r2,r2,r4 - adde r2,r2,r6 -Lleftover4: - bf 29,Lleftover2 - lwz r4,0(r3) - addi r3,r3,4 - adde r2,r2,r4 -Lleftover2: - bf 30,Lleftover1 - lhz r4,0(r3) - addi r3,r3,2 - adde r2,r2,r4 -Lleftover1: - bf 31,Lwrapup - lbz r4,0(r3) - slwi r4,r4,8 ; shift last byte into proper lane - adde r2,r2,r4 - -; All data bytes checksummed. Wrap up. -; r2 = checksum so far (word parallel) -; r5 = accumulated sum parameter -; carry = live -; cr1_gt = "starting on odd address" flag - -Lwrapup: - addze r2,r2 ; add in last carry - addze r2,r2 ; in case the "addze" carries -Lwrapupx: ; here from short-operand case, with xer(ca) undefined - srwi r6,r2,16 ; top half of 32-bit checksum - rlwinm r7,r2,0,0xFFFF ; lower half - add r2,r6,r7 ; add them together - srwi r6,r2,16 ; then do it again, in case first carried - rlwinm r7,r2,0,0xFFFF - add r2,r6,r7 - bf cr1_gt,Lswapped ; test "starting on odd address" flag - -; The checksum began on an odd address, so swap bytes. - - rlwinm r6,r2,24,0x00FF ; move top byte to bottom - rlwinm r7,r2,8,0xFF00 ; bottom to top - or r2,r6,r7 ; rejoin - -; Finally, add in checksum passed in as a parameter. - -Lswapped: - add r2,r2,r5 ; add passed-in checksum - srwi r6,r2,16 ; top half of 32-bit checksum - rlwinm r7,r2,0,0xFFFF ; lower half - add r2,r6,r7 ; add them together - srwi r6,r2,16 ; then do it again, in case first carried - rlwinm r7,r2,0,0xFFFF - add r3,r6,r7 ; steer result into r3 - blr - -; Handle short operands. Do a halfword at a time. 
-; r3 = address -; r4 = length (<= kShort) -; r5 = accumulated sum parameter -; r6 = "starting on odd byte" flag - -Lshort: - cmpwi cr6,r4,2 ; at least two bytes? - andi. r0,r4,1 ; odd length? - li r2,0 ; initialize checksum - cmplwi cr1,r6,0 ; set cr1_gt if "starting on odd address" flag is set - blt cr6,Lshort2 ; fewer than two bytes, so skip -Lshort1: - cmpwi cr6,r4,4 ; two more bytes (after we decrement)? - lhz r7,0(r3) - subi r4,r4,2 - addi r3,r3,2 - add r2,r2,r7 ; note no need for "adde" - bge cr6,Lshort1 ; loop for 2 more bytes -Lshort2: - beq Lwrapupx ; no byte at end, proceed to checkout with carry undefined - lbz r7,0(r3) - slwi r7,r7,8 ; shift last byte into proper lane - add r2,r2,r7 - b Lwrapupx - -; Handle 64-bit machine. The major improvement over the 32-bit path is that we use -; four parallel 32-bit accumulators, which carry into the upper half naturally so we -; do not have to use "adde", which serializes on the carry bit. Note that we cannot -; do 64-bit "adde"s, because we run in 32-bit mode so carry would not be set correctly. 
-; r2 = checksum so far (ie, the zero-filled partial first word) -; r3 = word aligned address -; r5 = accumulated sum parameter -; ctr = number of 32-byte chunks of input -; carry = unused in this code -; cr1_gt = "starting on odd address" flag -; cr6,cr7 = residual length - -L64BitPath: - stw r13,-4(r1) ; save a few nonvolatile regs in red zone so we can use them - stw r14,-8(r1) - stw r15,-12(r1) - stw r16,-16(r1) - li r0,128 ; to touch next line - li r13,0 ; r13-r15 are the accumulators, so initialize them - dcbt r3,r0 ; touch in next cache line, and keep loads away from the above stores - lwz r4,0(r3) ; start pipeline by loading first 32 bytes into r4, r6-r12 - lwz r6,4(r3) - lwz r7,8(r3) - mr r14,r2 ; just copy incoming partial word into one of the accumulators - li r15,0 - lwz r8,12(r3) - lwz r9,16(r3) - li r16,0 - li r0,256 ; get touch offset - lwz r10,20(r3) - lwz r11,24(r3) - lwz r12,28(r3) ; load last word of previous chunk - addi r3,r3,32 ; skip past the chunk - bdnz++ LInnerLoop64 ; enter loop if another chunk to go - - b LAddLastChunk ; only one chunk - -; Inner loop for 64-bit processors. This loop is scheduled for the 970. -; It is pipelined (loads are one iteration ahead of adds), and unrolled. -; It should take 9-10 cycles per iteration, which consumes 64 bytes of input. 
- - .align 5 -LInnerLoop64: ; 64 bytes/iteration - add r13,r13,r4 ; cycle 1 - add r14,r14,r6 - dcbt r3,r0 ; touch in 2 lines ahead - lwz r4,0(r3) - - add r15,r15,r7 ; cycle 2, etc - lwz r6,4(r3) - lwz r7,8(r3) - add r16,r16,r8 - - lwz r8,12(r3) - add r13,r13,r9 - add r14,r14,r10 - lwz r9,16(r3) - - add r15,r15,r11 - lwz r10,20(r3) - lwz r11,24(r3) - add r16,r16,r12 - bdz-- LEarlyExit ; early exit if no more chunks - - lwz r12,28(r3) - add r13,r13,r4 - add r14,r14,r6 - lwz r4,32(r3) - - add r15,r15,r7 - lwz r6,36(r3) - lwz r7,40(r3) - add r16,r16,r8 - - lwz r8,44(r3) - add r13,r13,r9 - add r14,r14,r10 - lwz r9,48(r3) - - add r15,r15,r11 - lwz r10,52(r3) - lwz r11,56(r3) - add r16,r16,r12 - - nop ; position last load in 2nd dispatch slot - lwz r12,60(r3) - addi r3,r3,64 - bdnz++ LInnerLoop64 - - b LAddLastChunk - -; Add in the last 32-byte chunk, and any leftover bytes. -; r3 = word aligned address of next byte of data -; r5 = accumulated sum parameter -; r13-r16 = the four accumulators -; cr1_gt = "starting on odd address" flag -; cr6,cr7 = residual length - -LEarlyExit: ; here from middle of inner loop - lwz r12,28(r3) ; load last word of last chunk - addi r3,r3,32 -LAddLastChunk: ; last 32-byte chunk of input is in r4,r6-r12 - add r13,r13,r4 ; add in last chunk - add r14,r14,r6 ; these are 64-bit adds - add r15,r15,r7 - add r16,r16,r8 - add r13,r13,r9 - add r14,r14,r10 - add r15,r15,r11 - add r16,r16,r12 - -; Handle leftover bytes, if any. 
- - bf 27,Lleft1 ; test 0x10 bit of residual length - lwz r4,0(r3) - lwz r6,4(r3) - lwz r7,8(r3) - lwz r8,12(r3) - addi r3,r3,16 - add r13,r13,r4 - add r14,r14,r6 - add r15,r15,r7 - add r16,r16,r8 -Lleft1: - bf 28,Lleft2 - lwz r4,0(r3) - lwz r6,4(r3) - addi r3,r3,8 - add r13,r13,r4 - add r14,r14,r6 -Lleft2: - bf 29,Lleft3 - lwz r4,0(r3) - addi r3,r3,4 - add r14,r14,r4 -Lleft3: - bf 30,Lleft4 - lhz r4,0(r3) - addi r3,r3,2 - add r15,r15,r4 -Lleft4: - bf 31,Lleft5 - lbz r4,0(r3) - slwi r4,r4,8 ; shift last byte into proper lane - add r16,r16,r4 - -; All data bytes have been checksummed. Now we must add together the four -; accumulators and restore the regs from the red zone. -; r3 = word aligned address of next byte of data -; r5 = accumulated sum parameter -; r13-r16 = the four accumulators -; carry = not used so far -; cr1_gt = "starting on odd address" flag - -Lleft5: - add r8,r13,r14 ; add the four accumulators together - add r9,r15,r16 - lwz r13,-4(r1) ; start to restore nonvolatiles from red zone - lwz r14,-8(r1) - add r8,r8,r9 ; now r8 is 64-bit sum of the four accumulators - lwz r15,-12(r1) - lwz r16,-16(r1) - srdi r7,r8,32 ; get upper half of 64-bit sum - addc r2,r7,r8 ; finally, do a 32-bit add of the two halves of r8 (setting carry) - b Lwrapup ; merge r2, r5, and carry into a 16-bit checksum diff --git a/bsd/dev/random/Makefile b/bsd/dev/random/Makefile index 1190bc1ff..7a07200d9 100644 --- a/bsd/dev/random/Makefile +++ b/bsd/dev/random/Makefile @@ -9,16 +9,12 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ INSTINC_SUBDIRS_X86_64 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS_X86_64 = \ diff --git a/bsd/dev/unix_startup.c b/bsd/dev/unix_startup.c index 4ec548794..f167a1752 100644 --- a/bsd/dev/unix_startup.c +++ b/bsd/dev/unix_startup.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. 
All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -50,7 +51,9 @@ #include #include #include +#include +extern uint32_t kern_maxvnodes; extern vm_map_t mb_map; #if INET || INET6 @@ -62,7 +65,7 @@ void bsd_bufferinit(void) __attribute__((section("__TEXT, initcode")) extern void md_prepare_for_shutdown(int, int, char *); unsigned int bsd_mbuf_cluster_reserve(boolean_t *); -void bsd_srv_setup(int); +void bsd_scale_setup(int); void bsd_exec_setup(int); /* @@ -71,7 +74,7 @@ void bsd_exec_setup(int); #ifdef NBUF int max_nbuf_headers = NBUF; -int niobuf_headers = NBUF / 2; +int niobuf_headers = (NBUF / 2) + 2048; int nbuf_hashelements = NBUF; int nbuf_headers = NBUF; #else @@ -81,11 +84,11 @@ int nbuf_hashelements = 0; int nbuf_headers = 0; #endif -SYSCTL_INT (_kern, OID_AUTO, nbuf, CTLFLAG_RD, &nbuf_headers, 0, ""); -SYSCTL_INT (_kern, OID_AUTO, maxnbuf, CTLFLAG_RW, &max_nbuf_headers, 0, ""); +SYSCTL_INT (_kern, OID_AUTO, nbuf, CTLFLAG_RD | CTLFLAG_LOCKED, &nbuf_headers, 0, ""); +SYSCTL_INT (_kern, OID_AUTO, maxnbuf, CTLFLAG_RW | CTLFLAG_LOCKED, &max_nbuf_headers, 0, ""); __private_extern__ int customnbuf = 0; -int srv = 0; /* Flag indicates a server boot when set */ +int serverperfmode = 0; /* Flag indicates a server boot when set */ int ncl = 0; static unsigned int mbuf_poolsz; @@ -118,10 +121,12 @@ bsd_startupearly(void) } else nbuf_hashelements = max_nbuf_headers; - if (niobuf_headers == 0) - niobuf_headers = max_nbuf_headers; - if (niobuf_headers > 4096) - niobuf_headers = 4096; + if (niobuf_headers == 0) { + if (max_nbuf_headers < 4096) + niobuf_headers = max_nbuf_headers; + else + niobuf_headers = (max_nbuf_headers / 2) + 2048; + } if (niobuf_headers < CONFIG_MIN_NIOBUF) niobuf_headers = CONFIG_MIN_NIOBUF; @@ -176,18 +181,23 @@ bsd_startupearly(void) #endif /* SOCKETS */ if (vnodes_sized == 0) { - /* - * Size vnodes based on memory - * Number vnodes is 
(memsize/64k) + 1024 - * This is the calculation that is used by launchd in tiger - * we are clipping the max based on 16G - * ie ((16*1024*1024*1024)/(64 *1024)) + 1024 = 263168; - * CONFIG_VNODES is set to 263168 for "medium" configurations (the default) - * but can be smaller or larger. - */ - desiredvnodes = (sane_size/65536) + 1024; - if (desiredvnodes > CONFIG_VNODES) - desiredvnodes = CONFIG_VNODES; + if (!PE_get_default("kern.maxvnodes", &desiredvnodes, sizeof(desiredvnodes))) { + /* + * Size vnodes based on memory + * Number vnodes is (memsize/64k) + 1024 + * This is the calculation that is used by launchd in tiger + * we are clipping the max based on 16G + * ie ((16*1024*1024*1024)/(64 *1024)) + 1024 = 263168; + * CONFIG_VNODES is set to 263168 for "medium" configurations (the default) + * but can be smaller or larger. + */ + desiredvnodes = (sane_size/65536) + 1024; +#ifdef CONFIG_VNODES + if (desiredvnodes > CONFIG_VNODES) + desiredvnodes = CONFIG_VNODES; +#endif + } + vnodes_sized = 1; } } @@ -252,7 +262,6 @@ bsd_mbuf_cluster_reserve(boolean_t *overridden) * to correctly compute the size of the low-memory VM pool. It is * redundant but rather harmless. 
*/ - //(void) PE_parse_boot_argn("srv", &srv, sizeof (srv)); (void) PE_parse_boot_argn("ncl", &ncl, sizeof (ncl)); (void) PE_parse_boot_argn("mbuf_pool", &mbuf_pool, sizeof (mbuf_pool)); @@ -265,12 +274,12 @@ bsd_mbuf_cluster_reserve(boolean_t *overridden) if (sane_size > (64 * 1024 * 1024) || ncl != 0) { - if (ncl || srv) + if (ncl || serverperfmode) was_overridden = TRUE; if ((nmbclusters = ncl) == 0) { /* Auto-configure the mbuf pool size */ - nmbclusters = mbuf_default_ncl(srv, sane_size); + nmbclusters = mbuf_default_ncl(serverperfmode, sane_size); } else { /* Make sure it's not odd in case ncl is manually set */ if (nmbclusters & 0x1) @@ -280,6 +289,9 @@ bsd_mbuf_cluster_reserve(boolean_t *overridden) if (nmbclusters > MAX_NCL) nmbclusters = MAX_NCL; } + + /* Round it down to nearest multiple of 4KB clusters */ + nmbclusters = P2ROUNDDOWN(nmbclusters, NCLPBG); } mbuf_poolsz = nmbclusters << MCLSHIFT; done: @@ -296,11 +308,15 @@ void IOSleep(int); void -bsd_srv_setup(int scale) +bsd_scale_setup(int scale) { #if defined(__LP64__) - /* if memory is more than 16G, then apply rules for processes */ - if (scale > 0) { + if ((scale > 0) && (serverperfmode == 0)) { + maxproc *= scale; + maxprocperuid = (maxproc * 2) / 3; + } + /* Apply server scaling rules */ + if ((scale > 0) && (serverperfmode !=0)) { maxproc = 2500 * scale; hard_maxproc = maxproc; /* no fp usage */ diff --git a/bsd/dev/vn/Makefile b/bsd/dev/vn/Makefile index 64ae209ac..b4e415a16 100644 --- a/bsd/dev/vn/Makefile +++ b/bsd/dev/vn/Makefile @@ -3,23 +3,18 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir -CFLAGS+=$(WERROR) include $(MakeInc_cmd) include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ INSTINC_SUBDIRS_X86_64 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS_X86_64 = \ diff --git a/bsd/dev/vn/vn.c 
b/bsd/dev/vn/vn.c index bac913331..2a0001d48 100644 --- a/bsd/dev/vn/vn.c +++ b/bsd/dev/vn/vn.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1141,10 +1141,10 @@ vniocattach_file(struct vn_softc *vn, flags = FREAD|FWRITE; if (in_kernel) { - NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, vniop->vn_file, ctx); + NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW, UIO_SYSSPACE, vniop->vn_file, ctx); } else { - NDINIT(&nd, LOOKUP, FOLLOW, + NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW, (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32), vniop->vn_file, ctx); } @@ -1156,11 +1156,11 @@ vniocattach_file(struct vn_softc *vn, } flags &= ~FWRITE; if (in_kernel) { - NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, + NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW, UIO_SYSSPACE, vniop->vn_file, ctx); } else { - NDINIT(&nd, LOOKUP, FOLLOW, + NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW, (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32), vniop->vn_file, ctx); } @@ -1221,10 +1221,10 @@ vniocattach_shadow(struct vn_softc *vn, struct vn_ioctl_64 *vniop, flags = FREAD|FWRITE; if (in_kernel) { - NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, vniop->vn_file, ctx); + NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW, UIO_SYSSPACE, vniop->vn_file, ctx); } else { - NDINIT(&nd, LOOKUP, FOLLOW, + NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW, (IS_64BIT_PROCESS(p) ? 
UIO_USERSPACE64 : UIO_USERSPACE32), vniop->vn_file, ctx); } diff --git a/bsd/dev/x86_64/munge.s b/bsd/dev/x86_64/munge.s index ec5b6123b..cb2f6dfc0 100644 --- a/bsd/dev/x86_64/munge.s +++ b/bsd/dev/x86_64/munge.s @@ -132,16 +132,91 @@ Lw2: Entry(munge_wl) /* Costs an extra w move to do this */ ENTRY(munge_wlw) xorl %edx,%edx +Lwlw: movl 12(%rsi),%eax movl %eax,16(%rsi) movl %edx,20(%rsi) +Lwl: movl 8(%rsi),%eax movl %eax,12(%rsi) movl 4(%rsi),%eax movl %eax,8(%rsi) + movl %edx,4(%rsi) ret +ENTRY(munge_wlwwwll) + xorl %edx,%edx +Lwlwwwll: + movl 36(%rsi),%eax + movl %eax,52(%rsi) + movl 32(%rsi),%eax + movl %eax,48(%rsi) + + movl 28(%rsi),%eax + movl %eax,44(%rsi) + movl 24(%rsi),%eax + movl %eax,40(%rsi) + + movl 20(%rsi),%eax + movl %eax,32(%rsi) + movl %edx,36(%rsi) +Lwlww: + movl 16(%rsi),%eax + movl %eax,24(%rsi) + movl %edx,28(%rsi) + jmp Lwlw + +ENTRY(munge_wlwwwllw) + xorl %edx,%edx + movl 40(%rsi),%eax + movl %eax,56(%rsi) + movl %edx,60(%rsi) + jmp Lwlwwwll + +ENTRY(munge_wlwwlwlw) + xorl %edx,%edx + movl 40(%rsi),%eax + movl %eax,56(%rsi) + movl %edx,60(%rsi) + movl 36(%rsi),%eax + movl %eax,52(%rsi) + movl 32(%rsi),%eax + movl %eax,48(%rsi) + movl 28(%rsi),%eax + movl %eax,40(%rsi) + movl %edx,44(%rsi) + movl 24(%rsi),%eax + movl %eax,36(%rsi) + movl 20(%rsi),%eax + movl %eax,32(%rsi) + jmp Lwlww + + +ENTRY(munge_wllwwll) + xorl %edx,%edx + + movl 40(%rsi),%eax //l + movl %eax,52(%rsi) + movl 36(%rsi),%eax + movl %eax,48(%rsi) + movl 32(%rsi),%eax //l + movl %eax,44(%rsi) + movl 28(%rsi),%eax + movl %eax,40(%rsi) + movl 24(%rsi),%eax //w + movl %eax,32(%rsi) + movl %edx,36(%rsi) + movl 20(%rsi),%eax //w + movl %eax,24(%rsi) + movl %edx,28(%rsi) + movl 16(%rsi),%eax //l + movl %eax,20(%rsi) + movl 12(%rsi),%eax + movl %eax,16(%rsi) + + jmp Lwl + Entry(munge_wwwlw) xorl %edx,%edx movl 20(%rsi),%eax @@ -183,6 +258,61 @@ ENTRY(munge_wwwwwl) movl %eax,44(%rsi) jmp Lw5 + +ENTRY(munge_wwwwwlww) + xorl %edx,%edx + movl 32(%rsi),%eax + movl %eax,56(%rsi) + movl 
%edx,60(%rsi) + movl 28(%rsi),%eax + movl %eax,48(%rsi) + movl %edx,52(%rsi) + movl 20(%rsi),%eax + movl %eax,40(%rsi) + movl 24(%rsi),%eax + movl %eax,44(%rsi) + + jmp Lw5 + +ENTRY(munge_wwwwwllw) + xorl %edx,%edx + movl 36(%rsi),%eax + movl %eax,56(%rsi) + movl %edx,60(%rsi) + movl 28(%rsi),%eax + movl %eax,48(%rsi) + movl 32(%rsi),%eax + movl %eax,52(%rsi) + movl 20(%rsi),%eax + movl %eax,40(%rsi) + movl 24(%rsi),%eax + movl %eax,44(%rsi) + jmp Lw5 + +ENTRY(munge_wwwwwlll) + xorl %edx,%edx + movl 36(%rsi),%eax + movl %eax,56(%rsi) + movl 40(%rsi),%eax + movl %eax,60(%rsi) + movl 28(%rsi),%eax + movl %eax,48(%rsi) + movl 32(%rsi),%eax + movl %eax,52(%rsi) + movl 20(%rsi),%eax + movl %eax,40(%rsi) + movl 24(%rsi),%eax + movl %eax,44(%rsi) + jmp Lw5 + +ENTRY(munge_wwwwwwl) + xorl %edx,%edx + movl 24(%rsi),%eax + movl %eax,48(%rsi) + movl 28(%rsi),%eax + movl %eax,52(%rsi) + jmp Lw6 + ENTRY(munge_wwwwwwlw) xorl %edx,%edx movl 32(%rsi),%eax diff --git a/bsd/hfs/Makefile b/bsd/hfs/Makefile index 814b9184d..27705308f 100644 --- a/bsd/hfs/Makefile +++ b/bsd/hfs/Makefile @@ -9,16 +9,12 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ INSTINC_SUBDIRS_X86_64 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS_X86_64 = \ diff --git a/bsd/hfs/hfs.h b/bsd/hfs/hfs.h index 24807f7f3..114fcecc7 100644 --- a/bsd/hfs/hfs.h +++ b/bsd/hfs/hfs.h @@ -29,6 +29,14 @@ #ifndef __HFS__ #define __HFS__ +/* If set to 1, enables the code to allocate blocks from the start + * of the disk instead of the nextAllocation for sparse devices like + * sparse disk images or sparsebundle images. The free extent cache + * for such volumes is also maintained based on the start block instead + * of number of contiguous allocation blocks. These devices prefer + * allocation of blocks near the start of the disk to avoid the + * increasing the image size, but it can also result in file fragmentation. 
+ */ #define HFS_SPARSE_DEV 1 #if DEBUG @@ -60,6 +68,10 @@ #include #include +#if CONFIG_HFS_ALLOC_RBTREE +#include +#endif + /* * Just reported via MIG interface. */ @@ -110,11 +122,11 @@ extern struct timezone gTimeZone; */ #define HFS_ROOTVERYLOWDISKTRIGGERFRACTION 5 -#define HFS_ROOTVERYLOWDISKTRIGGERLEVEL ((u_int64_t)(125*1024*1024)) +#define HFS_ROOTVERYLOWDISKTRIGGERLEVEL ((u_int64_t)(512*1024*1024)) #define HFS_ROOTLOWDISKTRIGGERFRACTION 10 -#define HFS_ROOTLOWDISKTRIGGERLEVEL ((u_int64_t)(250*1024*1024)) +#define HFS_ROOTLOWDISKTRIGGERLEVEL ((u_int64_t)(1024*1024*1024)) #define HFS_ROOTLOWDISKSHUTOFFFRACTION 11 -#define HFS_ROOTLOWDISKSHUTOFFLEVEL ((u_int64_t)(375*1024*1024)) +#define HFS_ROOTLOWDISKSHUTOFFLEVEL ((u_int64_t)(1024*1024*1024 + 250*1024*1024)) #define HFS_VERYLOWDISKTRIGGERFRACTION 1 #define HFS_VERYLOWDISKTRIGGERLEVEL ((u_int64_t)(100*1024*1024)) @@ -178,8 +190,9 @@ typedef struct hfsmount { int16_t vcbFlags; /* Runtime flag to indicate if volume is dirty/clean */ u_int32_t vcbAtrb; u_int32_t vcbJinfoBlock; - time_t hfs_itime; /* file system creation time */ - time_t hfs_btime; /* file system last backup time */ + u_int32_t localCreateDate;/* volume create time from volume header (For HFS+, value is in local time) */ + time_t hfs_itime; /* file system creation time (creation date of the root folder) */ + time_t hfs_btime; /* file system last backup time */ u_int32_t blockSize; /* size of allocation blocks */ u_int32_t totalBlocks; /* total allocation blocks */ u_int32_t allocLimit; /* Do not allocate this block or beyond */ @@ -204,11 +217,33 @@ typedef struct hfsmount { /* cache of largest known free extents */ u_int32_t vcbFreeExtCnt; HFSPlusExtentDescriptor vcbFreeExt[kMaxFreeExtents]; + lck_spin_t vcbFreeExtLock; + +#if CONFIG_HFS_ALLOC_RBTREE + /* + * Access to these fields should only be done + * after acquiring the bitmap lock. 
Note that the + * "offset_block_end" field indicates the portion of + * the bitmap that is currently managed by the red-black tree. + */ + + /* Normal Allocation Tree */ + extent_tree_offset_t offset_tree; + u_int32_t offset_free_extents; /* number of free extents managed by tree */ + u_int32_t offset_block_end; +#endif + /* + * For setting persistent in-mount fields that relate + * to the use of the extent trees. See HFS Red-Black + * Tree Allocator Flags below. + */ + u_int32_t extent_tree_flags; + + u_int32_t reserveBlocks; /* free block reserve */ u_int32_t loanedBlocks; /* blocks on loan for delayed allocations */ - u_int32_t localCreateDate; /* creation times for HFS+ volumes are in local time */ /* * HFS+ Private system directories (two). Any access @@ -232,8 +267,9 @@ typedef struct hfsmount { u_int32_t jnl_size; u_int32_t hfs_jnlfileid; u_int32_t hfs_jnlinfoblkid; - lck_rw_t hfs_global_lock; + lck_rw_t hfs_global_lock; u_int32_t hfs_global_lock_nesting; + void* hfs_global_lockowner; /* Notification variables: */ u_int32_t hfs_notification_conditions; @@ -266,7 +302,7 @@ typedef struct hfsmount { int hfc_maxfiles; /* maximum files to track */ struct vnode * hfc_filevp; -#ifdef HFS_SPARSE_DEV +#if HFS_SPARSE_DEV /* Sparse device variables: */ struct vnode * hfs_backingfs_rootvp; u_int32_t hfs_last_backingstatfs; @@ -281,8 +317,9 @@ typedef struct hfsmount { lck_rw_t hfs_insync; /* protects sync/freeze interaction */ /* Resize variables: */ - u_int32_t hfs_resize_filesmoved; - u_int32_t hfs_resize_totalfiles; + u_int32_t hfs_resize_blocksmoved; + u_int32_t hfs_resize_totalblocks; + u_int32_t hfs_resize_progress; /* Per mount cnode hash variables: */ lck_mtx_t hfs_chash_mutex; /* protects access to cnode hash table */ @@ -313,6 +350,7 @@ typedef struct hfsmount { u_int64_t hfs_max_pending_io; thread_call_t hfs_syncer; // removeable devices get sync'ed by this guy + } hfsmount_t; #define HFS_META_DELAY (100) @@ -321,7 +359,6 @@ typedef struct hfsmount { typedef 
hfsmount_t ExtendedVCB; /* Aliases for legacy (Mac OS 9) field names */ -#define vcbCrDate hfs_itime #define vcbLsMod hfs_mtime #define vcbVolBkUp hfs_btime #define extentsRefNum hfs_extents_vp @@ -362,6 +399,15 @@ static __inline__ Boolean IsVCBDirty(ExtendedVCB *vcb) */ enum privdirtype {FILE_HARDLINKS, DIR_HARDLINKS}; +/* HFS Red-Black Tree Allocator Flags */ +#define HFS_ALLOC_RB_ENABLED 0x000001 /* trees in use */ +#define HFS_ALLOC_RB_ERRORED 0x000002 /* tree hit error; disabled for the mount */ +#define HFS_ALLOC_RB_MZACTIVE 0x000004 /* metazone tree has finished building */ +#define HFS_ALLOC_RB_ACTIVE 0x000008 /* normalzone tree finished building */ + +/* HFS Red-Black Unmount Synch. Flags */ +#define HFS_ALLOC_TREEBUILD_INFLIGHT 0x000010 +#define HFS_ALLOC_TEARDOWN_INFLIGHT 0x000020 /* HFS mount point flags */ #define HFS_READ_ONLY 0x00001 @@ -380,6 +426,7 @@ enum privdirtype {FILE_HARDLINKS, DIR_HARDLINKS}; #define HFS_CREATING_BTREE 0x02000 /* When set, do not update nextAllocation in the mount structure */ #define HFS_SKIP_UPDATE_NEXT_ALLOCATION 0x04000 +/* When set, the file system supports extent-based extended attributes */ #define HFS_XATTR_EXTENTS 0x08000 #define HFS_FOLDERCOUNT 0x10000 /* When set, the file system exists on a virtual device, like disk image */ @@ -391,7 +438,7 @@ enum privdirtype {FILE_HARDLINKS, DIR_HARDLINKS}; */ #define HFS_RDONLY_DOWNGRADE 0x80000 #define HFS_DID_CONTIG_SCAN 0x100000 -#define HFS_UNMAP 0x200000 +#define HFS_SSD 0x400000 /* Macro to update next allocation block in the HFS mount structure. 
If @@ -416,9 +463,6 @@ enum privdirtype {FILE_HARDLINKS, DIR_HARDLINKS}; lck_mtx_unlock(&(hfsmp)->hfs_mutex); \ } \ -#define hfs_global_exclusive_lock_acquire(hfsmp) lck_rw_lock_exclusive(&(hfsmp)->hfs_global_lock) -#define hfs_global_exclusive_lock_release(hfsmp) lck_rw_unlock_exclusive(&(hfsmp)->hfs_global_lock) - /* Macro for incrementing and decrementing the folder count in a cnode * attribute only if the HFS_FOLDERCOUNT bit is set in the mount flags * and kHFSHasFolderCount bit is set in the cnode flags. Currently these @@ -517,12 +561,10 @@ enum { kHFSPlusMaxFileNameBytes = kHFSPlusMaxFileNameChars * 3 }; /* * HFS specific fcntl()'s */ -#define HFS_BULKACCESS (FCNTL_FS_SPECIFIC_BASE + 0x00001) -#define HFS_GET_MOUNT_TIME (FCNTL_FS_SPECIFIC_BASE + 0x00002) -#define HFS_GET_LAST_MTIME (FCNTL_FS_SPECIFIC_BASE + 0x00003) #define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004) #define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005) -#define HFS_EXT_BULKACCESS (FCNTL_FS_SPECIFIC_BASE + 0x00006) +/* See HFSIOC_EXT_BULKACCESS and friends for HFS specific fsctls*/ + /* @@ -537,7 +579,6 @@ enum { kHFSPlusMaxFileNameBytes = kHFSPlusMaxFileNameChars * 3 }; FUNCTION PROTOTYPES ******************************************************************************/ - /***************************************************************************** hfs_vnop_xxx functions from different files ******************************************************************************/ @@ -545,6 +586,9 @@ int hfs_vnop_readdirattr(struct vnop_readdirattr_args *); /* in hfs_attrlist.c int hfs_vnop_inactive(struct vnop_inactive_args *); /* in hfs_cnode.c */ int hfs_vnop_reclaim(struct vnop_reclaim_args *); /* in hfs_cnode.c */ +int hfs_set_backingstore (struct vnode *vp, int val); /* in hfs_cnode.c */ +int hfs_is_backingstore (struct vnode *vp, int *val); /* in hfs_cnode.c */ + int hfs_vnop_link(struct vnop_link_args *); /* in hfs_link.c */ @@ -633,6 +677,11 @@ extern int hfs_relocate(struct 
vnode *, u_int32_t, kauth_cred_t, struct proc extern int hfs_truncate(struct vnode *, off_t, int, int, int, vfs_context_t); +extern int hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork, + struct filefork *rsrcfork, u_int32_t fileid); + +extern int hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp); + extern int hfs_bmap(struct vnode *, daddr_t, struct vnode **, daddr64_t *, unsigned int *); extern int hfs_fsync(struct vnode *, int, int, struct proc *); @@ -643,9 +692,12 @@ extern int hfs_removeallattr(struct hfsmount *hfsmp, u_int32_t fileid); extern int hfs_set_volxattr(struct hfsmount *hfsmp, unsigned int xattrtype, int state); -extern void hfs_check_volxattr(struct hfsmount *hfsmp, unsigned int xattrtype); +extern int hfs_isallocated(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t numBlocks); -extern int hfs_isallocated(struct hfsmount *, u_int32_t, u_int32_t); +extern int hfs_count_allocated(struct hfsmount *hfsmp, u_int32_t startBlock, + u_int32_t numBlocks, u_int32_t *alloc_count); + +extern int hfs_isrbtree_active (struct hfsmount *hfsmp); /***************************************************************************** @@ -656,7 +708,7 @@ int hfs_mountroot(mount_t mp, vnode_t rvp, vfs_context_t context); /* used as a callback by the journaling code */ extern void hfs_sync_metadata(void *arg); -extern int hfs_vget(struct hfsmount *, cnid_t, struct vnode **, int); +extern int hfs_vget(struct hfsmount *, cnid_t, struct vnode **, int, int); extern void hfs_setencodingbits(struct hfsmount *hfsmp, u_int32_t encoding); @@ -694,6 +746,15 @@ extern int overflow_extents(struct filefork *fp); extern int hfs_owner_rights(struct hfsmount *hfsmp, uid_t cnode_uid, kauth_cred_t cred, struct proc *p, int invokesuperuserstatus); +extern int check_for_tracked_file(struct vnode *vp, time_t ctime, uint64_t op_type, void *arg); +extern int check_for_dataless_file(struct vnode *vp, uint64_t op_type); + +/* + * Journal lock 
function prototypes + */ +int hfs_lock_global (struct hfsmount *hfsmp, enum hfslocktype locktype); +void hfs_unlock_global (struct hfsmount *hfsmp); + /* HFS System file locking */ #define SFL_CATALOG 0x0001 @@ -717,7 +778,7 @@ extern u_int32_t hfs_freeblks(struct hfsmount * hfsmp, int wantreserve); short MacToVFSError(OSErr err); -void hfs_metadatazone_init(struct hfsmount *hfsmp); +void hfs_metadatazone_init(struct hfsmount *hfsmp, int disable); /* HFS directory hint functions. */ extern directoryhint_t * hfs_getdirhint(struct cnode *, int, int); @@ -735,9 +796,11 @@ extern int hfs_virtualmetafile(struct cnode *); extern int hfs_start_transaction(struct hfsmount *hfsmp); extern int hfs_end_transaction(struct hfsmount *hfsmp); -extern int hfs_journal_flush(struct hfsmount *hfsmp); +extern int hfs_journal_flush(struct hfsmount *hfsmp, boolean_t wait_for_IO); extern void hfs_sync_ejectable(struct hfsmount *hfsmp); +extern void hfs_trim_callback(void *arg, uint32_t extent_count, const dk_extent_t *extents); + /* Erase unused Catalog nodes due to . */ extern int hfs_erase_unused_nodes(struct hfsmount *hfsmp); @@ -758,7 +821,7 @@ extern int hfs_btsync(struct vnode *vp, int sync_transaction); extern void replace_desc(struct cnode *cp, struct cat_desc *cdp); extern int hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, - struct vnode **rvpp, int can_drop_lock, int error_on_unlinked); + struct vnode **rvpp, int can_drop_lock, int error_on_unlink); extern int hfs_update(struct vnode *, int); @@ -766,10 +829,24 @@ extern int hfs_update(struct vnode *, int); /***************************************************************************** Functions from hfs_xattr.c ******************************************************************************/ + +/* Maximum extended attribute size supported for all extended attributes except + * resource fork and finder info. 
+ */ +#define HFS_XATTR_MAXSIZE (128 * 1024) + +/* Number of bits used to represent maximum extended attribute size */ +#define HFS_XATTR_SIZE_BITS 18 + int hfs_attrkeycompare(HFSPlusAttrKey *searchKey, HFSPlusAttrKey *trialKey); int hfs_buildattrkey(u_int32_t fileID, const char *attrname, HFSPlusAttrKey *key); void hfs_xattr_init(struct hfsmount * hfsmp); int file_attribute_exist(struct hfsmount *hfsmp, uint32_t fileID); +int init_attrdata_vnode(struct hfsmount *hfsmp); +int hfs_getxattr_internal(struct cnode *, struct vnop_getxattr_args *, + struct hfsmount *, u_int32_t); +int hfs_setxattr_internal(struct cnode *, caddr_t, size_t, + struct vnop_setxattr_args *, struct hfsmount *, u_int32_t); @@ -779,7 +856,7 @@ int file_attribute_exist(struct hfsmount *hfsmp, uint32_t fileID); extern int hfs_unlink(struct hfsmount *hfsmp, struct vnode *dvp, struct vnode *vp, struct componentname *cnp, int skip_reserve); -extern int hfs_lookuplink(struct hfsmount *hfsmp, cnid_t linkfileid, +extern int hfs_lookup_siblinglinks(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t *prevlinkid, cnid_t *nextlinkid); extern void hfs_privatedir_init(struct hfsmount *, enum privdirtype); diff --git a/bsd/hfs/hfs_attrlist.c b/bsd/hfs/hfs_attrlist.c index dc24f99a7..3f1e6da64 100644 --- a/bsd/hfs/hfs_attrlist.c +++ b/bsd/hfs/hfs_attrlist.c @@ -41,6 +41,7 @@ #include #include #include +#include #include @@ -80,7 +81,6 @@ static u_int32_t hfs_real_user_access(vnode_t vp, vfs_context_t ctx); * apply for the file system you are doing the readdirattr on. To make life * simpler, this call will only return entries in its directory, hfs like. 
*/ -__private_extern__ int hfs_vnop_readdirattr(ap) struct vnop_readdirattr_args /* { @@ -138,6 +138,19 @@ hfs_vnop_readdirattr(ap) (alist->forkattr != 0)) { return (EINVAL); } + + if (VTOC(dvp)->c_flags & UF_COMPRESSED) { + int compressed = hfs_file_is_compressed(VTOC(dvp), 0); /* 0 == take the cnode lock */ + + if (!compressed) { + error = check_for_dataless_file(dvp, NAMESPACE_HANDLER_READ_OP); + if (error) { + return error; + } + } + } + + /* * Take an exclusive directory lock since we manipulate the directory hints */ @@ -256,12 +269,12 @@ hfs_vnop_readdirattr(ap) /* * Obtain vnode for our vnode_authorize() calls. */ - if (hfs_vget(hfsmp, cattrp->ca_fileid, &vp, 0) != 0) { + if (hfs_vget(hfsmp, cattrp->ca_fileid, &vp, 0, 0) != 0) { vp = NULL; } } else if (!(ap->a_options & FSOPT_NOINMEMUPDATE)) { /* Get in-memory cnode data (if any). */ - vp = hfs_chash_getvnode(hfsmp, cattrp->ca_fileid, 0, 0); + vp = hfs_chash_getvnode(hfsmp, cattrp->ca_fileid, 0, 0, 0); } if (vp != NULL) { cp = VTOC(vp); @@ -405,7 +418,7 @@ hfs_vnop_readdirattr(ap) /* * Pack cnode attributes into an attribute block. */ - __private_extern__ +__private_extern__ void hfs_packattrblk(struct attrblock *abp, struct hfsmount *hfsmp, @@ -654,7 +667,10 @@ packcommonattr( } } if (ATTR_CMN_FNDRINFO & attr) { + u_int8_t *finfo = NULL; bcopy(&cap->ca_finderinfo, attrbufptr, sizeof(u_int8_t) * 32); + finfo = (u_int8_t*)attrbufptr; + /* Don't expose a symlink's private type/creator. 
*/ if (S_ISLNK(cap->ca_mode)) { struct FndrFileInfo *fip; @@ -663,6 +679,18 @@ packcommonattr( fip->fdType = 0; fip->fdCreator = 0; } + + /* advance 16 bytes into the attrbuf */ + finfo = finfo + 16; + if (S_ISREG(cap->ca_mode)) { + struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)finfo; + extinfo->date_added = 0; + } + else if (S_ISDIR(cap->ca_mode)) { + struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)finfo; + extinfo->date_added = 0; + } + attrbufptr = (char *)attrbufptr + sizeof(u_int8_t) * 32; } if (ATTR_CMN_OWNERID & attr) { @@ -814,7 +842,10 @@ packfileattr( off_t datasize = datafork->cf_size; off_t totalsize = datasize + rsrcfork->cf_size; #if HFS_COMPRESSION - if ( cattrp->ca_flags & UF_COMPRESSED ) { + int handle_compressed; + handle_compressed = (cattrp->ca_flags & UF_COMPRESSED);// && hfs_file_is_compressed(VTOC(vp), 1); + + if (handle_compressed) { if (attr & (ATTR_FILE_DATALENGTH|ATTR_FILE_TOTALSIZE)) { if ( 0 == hfs_uncompressed_size_of_compressed_file(hfsmp, vp, cattrp->ca_fileid, &datasize, 1) ) { /* 1 == don't take the cnode lock */ /* total size of a compressed file is just the data size */ @@ -865,7 +896,7 @@ packfileattr( * passed by hfs_vnop_readdirattr() may be null. */ - if ( cattrp->ca_flags & UF_COMPRESSED ) { + if ( handle_compressed ) { if (attr & ATTR_FILE_DATAALLOCSIZE) { *((off_t *)attrbufptr) = (off_t)rsrcfork->cf_blocks * (off_t)allocblksize; attrbufptr = ((off_t *)attrbufptr) + 1; @@ -902,7 +933,7 @@ packfileattr( /* * Calculate the total size of an attribute block. 
*/ - __private_extern__ +__private_extern__ int hfs_attrblksize(struct attrlist *attrlist) { @@ -1015,7 +1046,6 @@ hfs_real_user_access(vnode_t vp, vfs_context_t ctx) } -__private_extern__ u_int32_t DerivePermissionSummary(uid_t obj_uid, gid_t obj_gid, mode_t obj_mode, struct mount *mp, kauth_cred_t cred, __unused struct proc *p) diff --git a/bsd/hfs/hfs_btreeio.c b/bsd/hfs/hfs_btreeio.c index ddcb91277..7295cee54 100644 --- a/bsd/hfs/hfs_btreeio.c +++ b/bsd/hfs/hfs_btreeio.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -53,8 +53,8 @@ extern int bdwrite_internal(struct buf *, int); static int ClearBTNodes(struct vnode *vp, long blksize, off_t offset, off_t amount); static int btree_journal_modify_block_end(struct hfsmount *hfsmp, struct buf *bp); +void btree_swap_node(struct buf *bp, __unused void *arg); -__private_extern__ OSStatus SetBTreeBlockSize(FileReference vp, ByteCount blockSize, __unused ItemCount minBlockCount) { BTreeControlBlockPtr bTreePtr; @@ -71,7 +71,6 @@ OSStatus SetBTreeBlockSize(FileReference vp, ByteCount blockSize, __unused ItemC } -__private_extern__ OSStatus GetBTreeBlock(FileReference vp, u_int32_t blockNum, GetBlockOptions options, BlockDescriptor *block) { OSStatus retval = E_NONE; @@ -165,7 +164,6 @@ OSStatus GetBTreeBlock(FileReference vp, u_int32_t blockNum, GetBlockOptions opt } -__private_extern__ void ModifyBlockStart(FileReference vp, BlockDescPtr blockPtr) { struct hfsmount *hfsmp = VTOHFS(vp); @@ -185,7 +183,7 @@ void ModifyBlockStart(FileReference vp, BlockDescPtr blockPtr) blockPtr->isModified = 1; } -static void +void btree_swap_node(struct buf *bp, __unused void *arg) { // struct hfsmount *hfsmp = (struct hfsmount *)arg; @@ -218,7 +216,6 @@ btree_journal_modify_block_end(struct hfsmount *hfsmp, struct buf *bp) } -__private_extern__ OSStatus ReleaseBTreeBlock(FileReference vp, 
BlockDescPtr blockPtr, ReleaseBlockOptions options) { struct hfsmount *hfsmp = VTOHFS(vp); @@ -331,7 +328,6 @@ OSStatus ReleaseBTreeBlock(FileReference vp, BlockDescPtr blockPtr, ReleaseBlock } -__private_extern__ OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF) { #pragma unused (maxEOF) @@ -467,7 +463,7 @@ OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF) trim = ((filePtr->fcbEOF - origSize) % btInfo.nodeSize); } - ret = TruncateFileC(vcb, filePtr, filePtr->fcbEOF - trim, 0); + ret = TruncateFileC(vcb, filePtr, filePtr->fcbEOF - trim, 0, 0, FTOC(filePtr)->c_fileid, 0); filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize; // XXXdbg - panic if the file didn't get trimmed back properly @@ -611,6 +607,8 @@ hfs_create_attr_btree(struct hfsmount *hfsmp, u_int32_t nodesize, u_int32_t node u_int16_t offset; int intrans = 0; int result; + int newvnode_flags = 0; + again: /* * Serialize creation using HFS_CREATING_BTREE flag. @@ -654,7 +652,8 @@ hfs_create_attr_btree(struct hfsmount *hfsmp, u_int32_t nodesize, u_int32_t node bzero(&cfork, sizeof(cfork)); cfork.cf_clump = nodesize * nodecnt; - result = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cfork, &vp); + result = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, + &cfork, &vp, &newvnode_flags); if (result) { goto exit; } diff --git a/bsd/hfs/hfs_catalog.c b/bsd/hfs/hfs_catalog.c index 0ae0e2600..0ab6f4585 100644 --- a/bsd/hfs/hfs_catalog.c +++ b/bsd/hfs/hfs_catalog.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -90,10 +90,12 @@ u_char modetodirtype[16] = { #define MODE_TO_DT(mode) (modetodirtype[((mode) & S_IFMT) >> 12]) -static int cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, int allow_system_files, u_int32_t hint, int wantrsrc, +#define HFS_LOOKUP_SYSFILE 0x1 /* If set, allow lookup of system files */ +#define HFS_LOOKUP_HARDLINK 0x2 /* If set, allow lookup of hard link records and not resolve the hard links */ +static int cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, int flags, u_int32_t hint, int wantrsrc, struct cat_desc *descp, struct cat_attr *attrp, struct cat_fork *forkp, cnid_t *desc_cnid); -static int cat_lookupmangled(struct hfsmount *hfsmp, struct cat_desc *descp, int wantrsrc, +int cat_lookupmangled(struct hfsmount *hfsmp, struct cat_desc *descp, int wantrsrc, struct cat_desc *outdescp, struct cat_attr *attrp, struct cat_fork *forkp); /* Internal catalog support routines */ @@ -133,8 +135,9 @@ static int buildthread(void *keyp, void *recp, int std_hfs, int directory); static int cat_makealias(struct hfsmount *hfsmp, u_int32_t inode_num, struct HFSPlusCatalogFile *crp); +static int cat_update_internal(struct hfsmount *hfsmp, int update_hardlink, struct cat_desc *descp, struct cat_attr *attrp, + struct cat_fork *dataforkp, struct cat_fork *rsrcforkp); -__private_extern__ int cat_preflight(struct hfsmount *hfsmp, catops_t ops, cat_cookie_t *cookie, __unused proc_t p) { @@ -152,7 +155,6 @@ cat_preflight(struct hfsmount *hfsmp, catops_t ops, cat_cookie_t *cookie, __unus return MacToVFSError(result); } -__private_extern__ void cat_postflight(struct hfsmount *hfsmp, cat_cookie_t *cookie, __unused proc_t p) { @@ -167,8 +169,7 @@ cat_postflight(struct hfsmount *hfsmp, cat_cookie_t *cookie, __unused proc_t p) hfs_systemfile_unlock(hfsmp, lockflags); } - -__private_extern__ +__private_extern__ void cat_convertattr( struct hfsmount *hfsmp, @@ -297,7 +298,6 @@ cat_releasedesc(struct cat_desc 
*descp) * Note: The caller is responsible for releasing the output * catalog descriptor (when supplied outdescp is non-null). */ -__private_extern__ int cat_lookup(struct hfsmount *hfsmp, struct cat_desc *descp, int wantrsrc, struct cat_desc *outdescp, struct cat_attr *attrp, @@ -344,7 +344,6 @@ cat_lookup(struct hfsmount *hfsmp, struct cat_desc *descp, int wantrsrc, return (result); } -__private_extern__ int cat_insertfilethread(struct hfsmount *hfsmp, struct cat_desc *descp) { @@ -409,7 +408,6 @@ cat_insertfilethread(struct hfsmount *hfsmp, struct cat_desc *descp) * catalog descriptor (when supplied outdescp is non-null). */ -__private_extern__ int cat_findname(struct hfsmount *hfsmp, cnid_t cnid, struct cat_desc *outdescp) { @@ -482,7 +480,6 @@ cat_findname(struct hfsmount *hfsmp, cnid_t cnid, struct cat_desc *outdescp) * Note: The caller is responsible for releasing the output * catalog descriptor (when supplied outdescp is non-null). */ -__private_extern__ int cat_idlookup(struct hfsmount *hfsmp, cnid_t cnid, int allow_system_files, struct cat_desc *outdescp, struct cat_attr *attrp, struct cat_fork *forkp) @@ -543,7 +540,9 @@ cat_idlookup(struct hfsmount *hfsmp, cnid_t cnid, int allow_system_files, goto exit; } - result = cat_lookupbykey(hfsmp, keyp, allow_system_files, 0, 0, outdescp, attrp, forkp, NULL); + result = cat_lookupbykey(hfsmp, keyp, + ((allow_system_files != 0) ? HFS_LOOKUP_SYSFILE : 0), + 0, 0, outdescp, attrp, forkp, NULL); /* No corresponding file/folder record found for a thread record, * mark the volume inconsistent. 
*/ @@ -569,7 +568,7 @@ cat_idlookup(struct hfsmount *hfsmp, cnid_t cnid, int allow_system_files, /* * cat_lookupmangled - lookup a catalog node using a mangled name */ -static int +int cat_lookupmangled(struct hfsmount *hfsmp, struct cat_desc *descp, int wantrsrc, struct cat_desc *outdescp, struct cat_attr *attrp, struct cat_fork *forkp) { @@ -625,7 +624,7 @@ cat_lookupmangled(struct hfsmount *hfsmp, struct cat_desc *descp, int wantrsrc, * cat_lookupbykey - lookup a catalog node using a cnode key */ static int -cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, int allow_system_files, u_int32_t hint, int wantrsrc, +cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, int flags, u_int32_t hint, int wantrsrc, struct cat_desc *descp, struct cat_attr *attrp, struct cat_fork *forkp, cnid_t *desc_cnid) { struct BTreeIterator * iterator; @@ -637,6 +636,7 @@ cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, int allow_system_files u_int32_t ilink = 0; cnid_t cnid = 0; u_int32_t encoding = 0; + cnid_t parentid = 0; std_hfs = (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord); @@ -652,16 +652,18 @@ cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, int allow_system_files if (result) goto exit; - /* Save the cnid and encoding now in case there's a hard link */ + /* Save the cnid, parentid, and encoding now in case there's a hard link or inode */ cnid = getcnid(recp); + if (!std_hfs) { + parentid = keyp->hfsPlus.parentID; + } encoding = getencoding(recp); hint = iterator->hint.nodeNum; /* Hide the journal files (if any) */ if ((hfsmp->jnl || ((HFSTOVCB(hfsmp)->vcbAtrb & kHFSVolumeJournaledMask) && (hfsmp->hfs_flags & HFS_READ_ONLY))) && ((cnid == hfsmp->hfs_jnlfileid) || (cnid == hfsmp->hfs_jnlinfoblkid)) && - !allow_system_files) { - + !(flags & HFS_LOOKUP_SYSFILE)) { result = ENOENT; goto exit; } @@ -674,7 +676,7 @@ cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, int allow_system_files if (!std_hfs && (attrp || forkp) && (recp->recordType == 
kHFSPlusFileRecord) - && ((to_bsd_time(recp->hfsPlusFile.createDate) == (time_t)hfsmp->vcbCrDate) || + && ((to_bsd_time(recp->hfsPlusFile.createDate) == (time_t)hfsmp->hfs_itime) || (to_bsd_time(recp->hfsPlusFile.createDate) == (time_t)hfsmp->hfs_metadata_createdate))) { int isdirlink = 0; int isfilelink = 0; @@ -687,7 +689,7 @@ cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, int allow_system_files (SWAP_BE32(recp->hfsPlusFile.userInfo.fdCreator) == kHFSAliasCreator)) { isdirlink = 1; } - if (isfilelink || isdirlink) { + if ((isfilelink || isdirlink) && !(flags & HFS_LOOKUP_HARDLINK)) { ilink = recp->hfsPlusFile.hl_linkReference; (void) cat_resolvelink(hfsmp, ilink, isdirlink, (struct HFSPlusCatalogFile *)recp); } @@ -701,8 +703,50 @@ cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, int allow_system_files getbsdattr(hfsmp, &cnoderec, attrp); } else { getbsdattr(hfsmp, (struct HFSPlusCatalogFile *)recp, attrp); - if (ilink) + if (ilink) { + /* Update the inode number for this hard link */ attrp->ca_linkref = ilink; + } + + /* + * Set kHFSHasLinkChainBit for hard links, and reset it for all + * other items. Also set linkCount to 1 for regular files. + * + * Due to some bug (rdar://8505977), some regular files can have + * kHFSHasLinkChainBit set and linkCount more than 1 even if they + * are not really hard links. The runtime code should not consider + * these files has hard links. Therefore we reset the kHFSHasLinkChainBit + * and linkCount for regular file before we vend it out. This might + * also result in repairing the bad files on disk, if the corresponding + * file is modified and updated on disk. 
+ */ + if (ilink) { + /* This is a hard link and the link count bit was not set */ + if (!(attrp->ca_recflags & kHFSHasLinkChainMask)) { + printf ("hfs: set hardlink bit on vol=%s cnid=%u inoid=%u\n", hfsmp->vcbVN, cnid, ilink); + attrp->ca_recflags |= kHFSHasLinkChainMask; + } + } else { + /* Make sure that this non-hard link (regular) record is not + * an inode record or a valid hard link being that is not + * resolved for volume resize purposes. We do not want to + * reset the hard link bit or reset link count on these records. + */ + if (!(flags & HFS_LOOKUP_HARDLINK) && + (parentid != hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) && + (parentid != hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid)) { + /* This is not a hard link or inode and the link count bit was set */ + if (attrp->ca_recflags & kHFSHasLinkChainMask) { + printf ("hfs: clear hardlink bit on vol=%s cnid=%u\n", hfsmp->vcbVN, cnid); + attrp->ca_recflags &= ~kHFSHasLinkChainMask; + } + /* This is a regular file and the link count was more than 1 */ + if (S_ISREG(attrp->ca_mode) && (attrp->ca_linkcount > 1)) { + printf ("hfs: set linkcount=1 on vol=%s cnid=%u old=%u\n", hfsmp->vcbVN, cnid, attrp->ca_linkcount); + attrp->ca_linkcount = 1; + } + } + } } } if (forkp != NULL) { @@ -765,6 +809,22 @@ cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, int allow_system_files if ((validblks < forkp->cf_blocks) && (forkp->cf_extents[7].blockCount == 0)) { off_t psize; + /* + * This is technically a volume corruption. + * If the total number of blocks calculated by iterating + summing + * the extents in the resident extent records, is less than that + * which is reported in the catalog entry, we should force a fsck. + * Only modifying ca_blocks here is not guaranteed to make it out + * to disk; it is a runtime-only field. + * + * Note that we could have gotten into this state if we had invalid ranges + * that existed in borrowed blocks that somehow made it out to disk. 
+ * The cnode's on disk block count should never be greater + * than that which is in its extent records. + */ + + (void) hfs_mark_volume_inconsistent (hfsmp); + forkp->cf_blocks = validblks; if (attrp != NULL) { attrp->ca_blocks = validblks + recp->hfsPlusFile.resourceFork.totalBlocks; @@ -813,7 +873,6 @@ cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, int allow_system_files * The caller is responsible for releasing the output * catalog descriptor (when supplied outdescp is non-null). */ -__private_extern__ int cat_create(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attrp, struct cat_desc *out_descp) @@ -988,7 +1047,6 @@ cat_create(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr * Note: The caller is responsible for releasing the output * catalog descriptor (when supplied out_cdp is non-null). */ -__private_extern__ int cat_rename ( struct hfsmount * hfsmp, @@ -1287,7 +1345,6 @@ cat_rename ( * 2. BTDeleteRecord(thread); * 3. BTUpdateRecord(parent); */ -__private_extern__ int cat_delete(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attrp) { @@ -1378,12 +1435,13 @@ cat_delete(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr /* - * cnode_update - update the catalog node described by descp - * using the data from attrp and forkp. + * cat_update_internal - update the catalog node described by descp + * using the data from attrp and forkp. + * If update_hardlink is true, the hard link catalog record is updated + * and not the inode catalog record. 
*/ -__private_extern__ -int -cat_update(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attrp, +static int +cat_update_internal(struct hfsmount *hfsmp, int update_hardlink, struct cat_desc *descp, struct cat_attr *attrp, struct cat_fork *dataforkp, struct cat_fork *rsrcforkp) { FCB * fcb; @@ -1408,13 +1466,14 @@ cat_update(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr * For open-deleted files we need to do a lookup by cnid * (using thread rec). * - * For hard links, the target of the update is the inode - * itself (not the link record) so a lookup by fileid - * (i.e. thread rec) is needed. + * For hard links and if not requested by caller, the target + * of the update is the inode itself (not the link record) + * so a lookup by fileid (i.e. thread rec) is needed. */ - if ((descp->cd_cnid != attrp->ca_fileid) || - (descp->cd_namelen == 0) || - (attrp->ca_recflags & kHFSHasLinkChainMask)) { + if ((update_hardlink == false) && + ((descp->cd_cnid != attrp->ca_fileid) || + (descp->cd_namelen == 0) || + (attrp->ca_recflags & kHFSHasLinkChainMask))) { result = getkey(hfsmp, attrp->ca_fileid, (CatalogKey *)&iterator->key); } else { result = buildkey(hfsmp, descp, (HFSPlusCatalogKey *)&iterator->key, 0); @@ -1439,6 +1498,17 @@ cat_update(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr return MacToVFSError(result); } +/* + * cat_update - update the catalog node described by descp + * using the data from attrp and forkp. + */ +int +cat_update(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attrp, + struct cat_fork *dataforkp, struct cat_fork *rsrcforkp) +{ + return cat_update_internal(hfsmp, false, descp, attrp, dataforkp, rsrcforkp); +} + /* * catrec_update - Update the fields of a catalog record * This is called from within BTUpdateRecord. 
@@ -1585,6 +1655,7 @@ catrec_update(const CatalogKey *ckp, CatalogRecord *crp, struct update_state *st } case kHFSPlusFileRecord: { HFSPlusCatalogFile *file; + int is_dirlink; file = (struct HFSPlusCatalogFile *)crp; /* Do a quick sanity check */ @@ -1627,13 +1698,22 @@ catrec_update(const CatalogKey *ckp, CatalogRecord *crp, struct update_state *st * supplied values (which will be default), which has the * same effect as creating a new file while * MNT_UNKNOWNPERMISSIONS is set. + * + * Do not modify bsdInfo for directory hard link records. + * They are set during creation and are not modifiable, so just + * leave them alone. */ - if ((file->bsdInfo.fileMode != 0) || - (attrp->ca_flags != 0) || - (attrp->ca_uid != hfsmp->hfs_uid) || - (attrp->ca_gid != hfsmp->hfs_gid) || - ((attrp->ca_mode & ALLPERMS) != - (hfsmp->hfs_file_mask & ACCESSPERMS))) { + is_dirlink = (file->flags & kHFSHasLinkChainMask) && + (SWAP_BE32(file->userInfo.fdType) == kHFSAliasType) && + (SWAP_BE32(file->userInfo.fdCreator) == kHFSAliasCreator); + + if (!is_dirlink && + ((file->bsdInfo.fileMode != 0) || + (attrp->ca_flags != 0) || + (attrp->ca_uid != hfsmp->hfs_uid) || + (attrp->ca_gid != hfsmp->hfs_gid) || + ((attrp->ca_mode & ALLPERMS) != + (hfsmp->hfs_file_mask & ACCESSPERMS)))) { if ((file->bsdInfo.fileMode == 0) || (((unsigned int)vfs_flags(HFSTOVFS(hfsmp))) & MNT_UNKNOWNPERMISSIONS) == 0) { file->bsdInfo.ownerID = attrp->ca_uid; @@ -1679,8 +1759,18 @@ catrec_update(const CatalogKey *ckp, CatalogRecord *crp, struct update_state *st /* Push out special field if necessary */ if (S_ISBLK(attrp->ca_mode) || S_ISCHR(attrp->ca_mode)) { file->bsdInfo.special.rawDevice = attrp->ca_rdev; - } else if (descp->cd_cnid != attrp->ca_fileid || attrp->ca_linkcount == 2) { - file->hl_linkCount = attrp->ca_linkcount; + } + else { + /* + * Protect against the degenerate case where the descriptor contains the + * raw inode ID in its CNID field. 
If the HFSPlusCatalogFile record indicates + * the linkcount was greater than 1 (the default value), then it must have become + * a hardlink. In this case, update the linkcount from the cat_attr passed in. + */ + if ((descp->cd_cnid != attrp->ca_fileid) || (attrp->ca_linkcount > 1 ) || + (file->hl_linkCount > 1)) { + file->hl_linkCount = attrp->ca_linkcount; + } } break; } @@ -1809,7 +1899,7 @@ cat_check_link_ancestry(struct hfsmount *hfsmp, cnid_t cnid, cnid_t pointed_at_c /* - * updatelink_callback - update a link's chain + * update_siblinglinks_callback - update a link's chain */ struct linkupdate_state { @@ -1819,12 +1909,12 @@ struct linkupdate_state { }; static int -updatelink_callback(__unused const CatalogKey *ckp, CatalogRecord *crp, struct linkupdate_state *state) +update_siblinglinks_callback(__unused const CatalogKey *ckp, CatalogRecord *crp, struct linkupdate_state *state) { HFSPlusCatalogFile *file; if (crp->recordType != kHFSPlusFileRecord) { - printf("hfs: updatelink_callback: unexpected rec type %d\n", crp->recordType); + printf("hfs: update_siblinglinks_callback: unexpected rec type %d\n", crp->recordType); return (btNotFound); } @@ -1837,17 +1927,16 @@ updatelink_callback(__unused const CatalogKey *ckp, CatalogRecord *crp, struct l file->hl_nextLinkID = state->nextlinkid; } } else { - printf("hfs: updatelink_callback: file %d isn't a chain\n", file->fileID); + printf("hfs: update_siblinglinks_callback: file %d isn't a chain\n", file->fileID); } return (0); } /* - * cat_updatelink - update a link's chain + * cat_update_siblinglinks - update a link's chain */ -__private_extern__ int -cat_updatelink(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t prevlinkid, cnid_t nextlinkid) +cat_update_siblinglinks(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t prevlinkid, cnid_t nextlinkid) { FCB * fcb; BTreeIterator * iterator; @@ -1859,24 +1948,25 @@ cat_updatelink(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t prevlinkid, cni state.prevlinkid = 
prevlinkid; state.nextlinkid = nextlinkid; - /* Borrow the btcb iterator since we have an exclusive catalog lock. */ - iterator = &((BTreeControlBlockPtr)(fcb->ff_sysfileinfo))->iterator; - iterator->hint.nodeNum = 0; + /* Create an iterator for use by us temporarily */ + MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); + bzero(iterator, sizeof(*iterator)); result = getkey(hfsmp, linkfileid, (CatalogKey *)&iterator->key); if (result == 0) { - result = BTUpdateRecord(fcb, iterator, (IterateCallBackProcPtr)updatelink_callback, &state); + result = BTUpdateRecord(fcb, iterator, (IterateCallBackProcPtr)update_siblinglinks_callback, &state); (void) BTFlushPath(fcb); } else { - printf("hfs: cat_updatelink: couldn't resolve cnid %d\n", linkfileid); + printf("hfs: cat_update_siblinglinks: couldn't resolve cnid %d\n", linkfileid); } + + FREE (iterator, M_TEMP); return MacToVFSError(result); } /* * cat_lookuplink - lookup a link by it's name */ -__private_extern__ int cat_lookuplink(struct hfsmount *hfsmp, struct cat_desc *descp, cnid_t *linkfileid, cnid_t *prevlinkid, cnid_t *nextlinkid) { @@ -1888,9 +1978,9 @@ cat_lookuplink(struct hfsmount *hfsmp, struct cat_desc *descp, cnid_t *linkfilei fcb = hfsmp->hfs_catalog_cp->c_datafork; - /* Borrow the btcb iterator since we have an exclusive catalog lock. 
*/ - iterator = &((BTreeControlBlockPtr)(fcb->ff_sysfileinfo))->iterator; - iterator->hint.nodeNum = 0; + /* Create an iterator for use by us temporarily */ + MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); + bzero(iterator, sizeof(*iterator)); if ((result = buildkey(hfsmp, descp, (HFSPlusCatalogKey *)&iterator->key, 0))) { goto exit; @@ -1914,16 +2004,16 @@ cat_lookuplink(struct hfsmount *hfsmp, struct cat_desc *descp, cnid_t *linkfilei *nextlinkid = 0; } exit: + FREE(iterator, M_TEMP); return MacToVFSError(result); } /* - * cat_lookuplink - lookup a link by its cnid + * cat_lookup_siblinglinks - lookup previous and next link ID for link using its cnid */ -__private_extern__ int -cat_lookuplinkbyid(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t *prevlinkid, cnid_t *nextlinkid) +cat_lookup_siblinglinks(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t *prevlinkid, cnid_t *nextlinkid) { FCB * fcb; BTreeIterator * iterator; @@ -1933,18 +2023,19 @@ cat_lookuplinkbyid(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t *prevlinkid fcb = hfsmp->hfs_catalog_cp->c_datafork; - /* Borrow the btcb iterator since we have an exclusive catalog lock. 
*/ - iterator = &((BTreeControlBlockPtr)(fcb->ff_sysfileinfo))->iterator; - iterator->hint.nodeNum = 0; + /* Create an iterator for use by us temporarily */ + MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); + bzero(iterator, sizeof(*iterator)); + if ((result = getkey(hfsmp, linkfileid, (CatalogKey *)&iterator->key))) { - printf("hfs: cat_lookuplinkbyid: getkey for %d failed %d\n", linkfileid, result); + printf("hfs: cat_lookup_siblinglinks: getkey for %d failed %d\n", linkfileid, result); goto exit; } BDINIT(btdata, &file); if ((result = BTSearchRecord(fcb, iterator, &btdata, NULL, NULL))) { - printf("hfs: cat_lookuplinkbyid: cannot find %d\n", linkfileid); + printf("hfs: cat_lookup_siblinglinks: cannot find %d\n", linkfileid); goto exit; } /* The prev/next chain is only valid when kHFSHasLinkChainMask is set. */ @@ -1953,7 +2044,7 @@ cat_lookuplinkbyid(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t *prevlinkid parent = ((HFSPlusCatalogKey *)&iterator->key)->parentID; - /* ADL inodes don't have a chain (its in an EA) */ + /* directory inodes don't have a chain (its in an EA) */ if (parent == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { result = ENOLINK; /* signal to caller to get head of list */ } else if (parent == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) { @@ -1968,6 +2059,7 @@ cat_lookuplinkbyid(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t *prevlinkid *nextlinkid = 0; } exit: + FREE(iterator, M_TEMP); return MacToVFSError(result); } @@ -1983,7 +2075,6 @@ cat_lookuplinkbyid(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t *prevlinkid * ca_flags * ca_finderinfo (type and creator) */ -__private_extern__ int cat_createlink(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attrp, cnid_t nextlinkid, cnid_t *linkfileid) @@ -2278,7 +2369,6 @@ cat_makealias(struct hfsmount *hfsmp, u_int32_t inode_num, struct HFSPlusCatalog /* * cat_deletelink - delete a link from the catalog */ -__private_extern__ int 
cat_deletelink(struct hfsmount *hfsmp, struct cat_desc *descp) { @@ -2455,7 +2545,6 @@ getentriesattr_callback(const CatalogKey *key, const CatalogRecord *rec, * * Note: index is zero relative */ -__private_extern__ int cat_getentriesattr(struct hfsmount *hfsmp, directoryhint_t *dirhint, struct cat_entrylist *ce_list) { @@ -2690,7 +2779,7 @@ getdirentries_callback(const CatalogKey *ckp, const CatalogRecord *crp, * regardless, so it's slightly safer to let that logic mark the boolean, * especially since it's closer to the return of this function. */ - + if (state->cbs_extended) { /* The last record has not been returned yet, so we * want to stop after packing the last item @@ -3043,7 +3132,6 @@ getdirentries_std_callback(const CatalogKey *ckp, const CatalogRecord *crp, /* * Pack a uio buffer with directory entries from the catalog */ -__private_extern__ int cat_getdirentries(struct hfsmount *hfsmp, int entrycnt, directoryhint_t *dirhint, uio_t uio, int extended, int * items, int * eofflag) @@ -3087,7 +3175,7 @@ cat_getdirentries(struct hfsmount *hfsmp, int entrycnt, directoryhint_t *dirhint * field to track whether or not we've returned EOF from the iterator function. */ state.cbs_eof = false; - + iterator = (BTreeIterator *) ((char *)state.cbs_linkinfo + (maxlinks * sizeof(linkinfo_t))); key = (CatalogKey *)&iterator->key; have_key = 0; @@ -3215,12 +3303,13 @@ cat_getdirentries(struct hfsmount *hfsmp, int entrycnt, directoryhint_t *dirhint /* Note that state.cbs_index is still valid on errors */ *items = state.cbs_index - index; index = state.cbs_index; - + /* * Also note that cbs_eof is set in all cases if we ever hit EOF * during the enumeration by the catalog callback. Mark the directory's hint * descriptor as having hit EOF. */ + if (state.cbs_eof) { dirhint->dh_desc.cd_flags |= CD_EOF; *eofflag = 1; @@ -3335,7 +3424,6 @@ cat_findposition(const CatalogKey *ckp, const CatalogRecord *crp, * The name portion of the key is compared using a 16-bit binary comparison. 
* This is called from the b-tree code. */ -__private_extern__ int cat_binarykeycompare(HFSPlusCatalogKey *searchKey, HFSPlusCatalogKey *trialKey) { @@ -3517,7 +3605,6 @@ buildkey(struct hfsmount *hfsmp, struct cat_desc *descp, /* * Resolve hard link reference to obtain the inode record. */ -__private_extern__ int cat_resolvelink(struct hfsmount *hfsmp, u_int32_t linkref, int isdirlink, struct HFSPlusCatalogFile *recp) { @@ -3657,7 +3744,6 @@ getkey(struct hfsmount *hfsmp, cnid_t cnid, CatalogKey * key) * The key's parent id is the only part of the key expected to be used by the caller. * The name portion of the key may not always be valid (ie in the case of a hard link). */ -__private_extern__ int cat_getkeyplusattr(struct hfsmount *hfsmp, cnid_t cnid, CatalogKey * key, struct cat_attr *attrp) { @@ -3684,7 +3770,7 @@ cat_getkeyplusattr(struct hfsmount *hfsmp, cnid_t cnid, CatalogKey * key, struct * Pick up the first link in the chain and get a descriptor for it. * This allows blind bulk access checks to work for hardlinks. */ - if ((cat_lookuplinkbyid(hfsmp, cnid, &prevlinkid, &nextlinkid) == 0) && + if ((cat_lookup_siblinglinks(hfsmp, cnid, &prevlinkid, &nextlinkid) == 0) && (nextlinkid != 0)) { if (cat_findname(hfsmp, nextlinkid, &linkdesc) == 0) { key->hfsPlus.parentID = linkdesc.cd_parentcnid; @@ -4203,3 +4289,99 @@ isadir(const CatalogRecord *crp) crp->recordType == kHFSPlusFolderRecord); } +/* + * cat_lookup_dirlink - lookup a catalog record for directory hard link + * (not inode) using catalog record id. Note that this function does + * NOT resolve directory hard link to its directory inode and return + * the link record. + * + * Note: The caller is responsible for releasing the output catalog + * descriptor (when supplied outdescp is non-null). 
+ */ +int +cat_lookup_dirlink(struct hfsmount *hfsmp, cnid_t dirlink_id, + u_int8_t forktype, struct cat_desc *outdescp, + struct cat_attr *attrp, struct cat_fork *forkp) +{ + struct BTreeIterator *iterator = NULL; + FSBufferDescriptor btdata; + u_int16_t datasize; + CatalogKey *keyp; + CatalogRecord *recp = NULL; + int error; + + /* No directory hard links on standard HFS */ + if (hfsmp->vcbSigWord == kHFSSigWord) { + return ENOTSUP; + } + + MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); + if (iterator == NULL) { + return ENOMEM; + } + bzero(iterator, sizeof(*iterator)); + buildthreadkey(dirlink_id, 1, (CatalogKey *)&iterator->key); + + MALLOC(recp, CatalogRecord *, sizeof(CatalogRecord), M_TEMP, M_WAITOK); + if (recp == NULL) { + error = ENOMEM; + goto out; + } + BDINIT(btdata, recp); + + error = BTSearchRecord(VTOF(HFSTOVCB(hfsmp)->catalogRefNum), iterator, + &btdata, &datasize, iterator); + if (error) { + goto out; + } + /* Directory hard links are catalog file record */ + if (recp->recordType != kHFSPlusFileThreadRecord) { + error = ENOENT; + goto out; + } + + keyp = (CatalogKey *)&recp->hfsPlusThread.reserved; + keyp->hfsPlus.keyLength = kHFSPlusCatalogKeyMinimumLength + + (keyp->hfsPlus.nodeName.length * 2); + if (forktype == kHFSResourceForkType) { + /* Lookup resource fork for directory hard link */ + error = cat_lookupbykey(hfsmp, keyp, HFS_LOOKUP_HARDLINK, 0, true, outdescp, attrp, forkp, NULL); + } else { + /* Lookup data fork, if any, for directory hard link */ + error = cat_lookupbykey(hfsmp, keyp, HFS_LOOKUP_HARDLINK, 0, false, outdescp, attrp, forkp, NULL); + } + if (error) { + printf ("hfs: cat_lookup_dirlink(): Error looking up file record for id=%u (error=%d)\n", dirlink_id, error); + hfs_mark_volume_inconsistent(hfsmp); + goto out; + } + /* Just for sanity, make sure that id in catalog record and thread record match */ + if ((outdescp != NULL) && (dirlink_id != outdescp->cd_cnid)) { + printf ("hfs: cat_lookup_dirlink(): 
Requested cnid=%u != found_cnid=%u\n", dirlink_id, outdescp->cd_cnid); + hfs_mark_volume_inconsistent(hfsmp); + error = ENOENT; + } + +out: + if (recp) { + FREE(recp, M_TEMP); + } + FREE(iterator, M_TEMP); + + return MacToVFSError(error); +} + +/* + * cat_update_dirlink - update the catalog node for directory hard link + * described by descp using the data from attrp and forkp. + */ +int +cat_update_dirlink(struct hfsmount *hfsmp, u_int8_t forktype, + struct cat_desc *descp, struct cat_attr *attrp, struct cat_fork *forkp) +{ + if (forktype == kHFSResourceForkType) { + return cat_update_internal(hfsmp, true, descp, attrp, NULL, forkp); + } else { + return cat_update_internal(hfsmp, true, descp, attrp, forkp, NULL); + } +} diff --git a/bsd/hfs/hfs_catalog.h b/bsd/hfs/hfs_catalog.h index 6c1eaa130..e8574e17d 100644 --- a/bsd/hfs/hfs_catalog.h +++ b/bsd/hfs/hfs_catalog.h @@ -63,7 +63,7 @@ struct cat_desc { const u_int8_t * cd_nameptr; /* pointer to cnode name */ }; -/* cd_flags +/* cd_flags * * CD_EOF is used by hfs_vnop_readdir / cat_getdirentries to indicate EOF was * encountered during a directory enumeration. 
When this flag is observed @@ -258,6 +258,11 @@ union CatalogRecord { }; typedef union CatalogRecord CatalogRecord; +/* Constants for HFS fork types */ +enum { + kHFSDataForkType = 0x0, /* data fork */ + kHFSResourceForkType = 0xff /* resource fork */ +}; /* * Catalog Interface @@ -404,7 +409,7 @@ enum { extern int cat_deletelink( struct hfsmount *hfsmp, struct cat_desc *descp); -extern int cat_updatelink( struct hfsmount *hfsmp, +extern int cat_update_siblinglinks( struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t prevlinkid, cnid_t nextlinkid); @@ -415,11 +420,23 @@ extern int cat_lookuplink( struct hfsmount *hfsmp, cnid_t *prevlinkid, cnid_t *nextlinkid); -extern int cat_lookuplinkbyid( struct hfsmount *hfsmp, +extern int cat_lookup_siblinglinks( struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t *prevlinkid, cnid_t *nextlinkid); +extern int cat_lookup_dirlink(struct hfsmount *hfsmp, + cnid_t dirlink_id, + u_int8_t forktype, + struct cat_desc *outdescp, + struct cat_attr *attrp, + struct cat_fork *forkp); + +extern int cat_update_dirlink(struct hfsmount *hfsmp, + u_int8_t forktype, + struct cat_desc *descp, + struct cat_attr *attrp, + struct cat_fork *rsrcforkp); #endif /* __APPLE_API_PRIVATE */ #endif /* KERNEL */ diff --git a/bsd/hfs/hfs_chash.c b/bsd/hfs/hfs_chash.c index b2db58e75..997d247ae 100644 --- a/bsd/hfs/hfs_chash.c +++ b/bsd/hfs/hfs_chash.c @@ -146,9 +146,8 @@ hfs_delete_chash(struct hfsmount *hfsmp) * * If it is in core, but locked, wait for it. 
*/ -__private_extern__ struct vnode * -hfs_chash_getvnode(struct hfsmount *hfsmp, ino_t inum, int wantrsrc, int skiplock) +hfs_chash_getvnode(struct hfsmount *hfsmp, ino_t inum, int wantrsrc, int skiplock, int allow_deleted) { struct cnode *cp; struct vnode *vp; @@ -201,13 +200,15 @@ hfs_chash_getvnode(struct hfsmount *hfsmp, ino_t inum, int wantrsrc, int skiploc * lock on the cnode which would allow the node to be * unlinked */ - if (cp->c_flag & (C_NOEXISTS | C_DELETED)) { - if (!skiplock) - hfs_unlock(cp); - vnode_put(vp); - - return (NULL); - } + if (!allow_deleted) { + if (cp->c_flag & (C_NOEXISTS | C_DELETED)) { + if (!skiplock) + hfs_unlock(cp); + vnode_put(vp); + + return (NULL); + } + } return (vp); } exit: @@ -218,8 +219,12 @@ hfs_chash_getvnode(struct hfsmount *hfsmp, ino_t inum, int wantrsrc, int skiploc /* * Use the device, fileid pair to snoop an incore cnode. + * + * A cnode can exist in chash even after it has been + * deleted from the catalog, so this function returns + * ENOENT if C_NOEXISTS or C_DELETED is set in the cnode's flags. + * */ -__private_extern__ int hfs_chash_snoop(struct hfsmount *hfsmp, ino_t inum, int (*callout)(const struct cat_desc *, const struct cat_attr *, void *), void * arg) @@ -237,6 +242,10 @@ hfs_chash_snoop(struct hfsmount *hfsmp, ino_t inum, int (*callout)(const struct for (cp = CNODEHASH(hfsmp, inum)->lh_first; cp; cp = cp->c_hash.le_next) { if (cp->c_fileid != inum) continue; + /* Skip cnodes that have been removed from the catalog */ + if (cp->c_flag & (C_NOEXISTS | C_DELETED)) { + break; + } /* Skip cnodes being created or reclaimed. */ if (!ISSET(cp->c_hflag, H_ALLOC | H_TRANSIT | H_ATTACH)) { result = callout(&cp->c_desc, &cp->c_attr, arg); @@ -257,10 +266,16 @@ hfs_chash_snoop(struct hfsmount *hfsmp, ino_t inum, int (*callout)(const struct * * If the cnode is C_DELETED, then return NULL since that * inum is no longer valid for lookups (open-unlinked file). 
+ * + * If the cnode is C_DELETED but also marked C_RENAMED, then that means + * the cnode was renamed over and a new entry exists in its place. The caller + * should re-drive the lookup to get the newer entry. In that case, we'll still + * return NULL for the cnode, but also return GNV_CHASH_RENAMED in the output flags + * of this function to indicate to the caller that they should re-drive. */ -__private_extern__ struct cnode * -hfs_chash_getcnode(struct hfsmount *hfsmp, ino_t inum, struct vnode **vpp, int wantrsrc, int skiplock) +hfs_chash_getcnode(struct hfsmount *hfsmp, ino_t inum, struct vnode **vpp, + int wantrsrc, int skiplock, int *out_flags, int *hflags) { struct cnode *cp; struct cnode *ncp = NULL; @@ -295,6 +310,7 @@ hfs_chash_getcnode(struct hfsmount *hfsmp, ino_t inum, struct vnode **vpp, int w * The desired vnode isn't there so tag the cnode. */ SET(cp->c_hflag, H_ATTACH); + *hflags |= H_ATTACH; hfs_chash_unlock(hfsmp); } else { @@ -311,7 +327,7 @@ hfs_chash_getcnode(struct hfsmount *hfsmp, ino_t inum, struct vnode **vpp, int w * this cnode and add it to the hash * just dump our allocation */ - FREE_ZONE(ncp, sizeof(struct cnode), M_HFSNODE); + FREE_ZONE(ncp, sizeof(struct cnode), M_HFSNODE); ncp = NULL; } @@ -330,13 +346,19 @@ hfs_chash_getcnode(struct hfsmount *hfsmp, ino_t inum, struct vnode **vpp, int w * is no longer valid for lookups. 
*/ if ((cp->c_flag & (C_NOEXISTS | C_DELETED)) && !wantrsrc) { + int renamed = 0; + if (cp->c_flag & C_RENAMED) { + renamed = 1; + } if (!skiplock) hfs_unlock(cp); if (vp != NULLVP) { vnode_put(vp); } else { hfs_chash_lock_spin(hfsmp); - CLR(cp->c_hflag, H_ATTACH); + CLR(cp->c_hflag, H_ATTACH); + *hflags &= ~H_ATTACH; + if (ISSET(cp->c_hflag, H_WAITING)) { CLR(cp->c_hflag, H_WAITING); wakeup((caddr_t)cp); @@ -345,6 +367,9 @@ hfs_chash_getcnode(struct hfsmount *hfsmp, ino_t inum, struct vnode **vpp, int w } vp = NULL; cp = NULL; + if (renamed) { + *out_flags = GNV_CHASH_RENAMED; + } } *vpp = vp; return (cp); @@ -358,8 +383,7 @@ hfs_chash_getcnode(struct hfsmount *hfsmp, ino_t inum, struct vnode **vpp, int w if (ncp == NULL) { hfs_chash_unlock(hfsmp); - - MALLOC_ZONE(ncp, struct cnode *, sizeof(struct cnode), M_HFSNODE, M_WAITOK); + MALLOC_ZONE(ncp, struct cnode *, sizeof(struct cnode), M_HFSNODE, M_WAITOK); /* * since we dropped the chash lock, * we need to go back and re-verify @@ -372,6 +396,7 @@ hfs_chash_getcnode(struct hfsmount *hfsmp, ino_t inum, struct vnode **vpp, int w bzero(ncp, sizeof(struct cnode)); SET(ncp->c_hflag, H_ALLOC); + *hflags |= H_ALLOC; ncp->c_fileid = inum; TAILQ_INIT(&ncp->c_hintlist); /* make the list empty */ TAILQ_INIT(&ncp->c_originlist); diff --git a/bsd/hfs/hfs_cnode.c b/bsd/hfs/hfs_cnode.c index c17c8d4dd..8703ecb9b 100644 --- a/bsd/hfs/hfs_cnode.c +++ b/bsd/hfs/hfs_cnode.c @@ -36,6 +36,7 @@ #include #include #include +#include #include @@ -46,6 +47,7 @@ #include #include #include +#include extern int prtactive; @@ -53,88 +55,192 @@ extern lck_attr_t * hfs_lock_attr; extern lck_grp_t * hfs_mutex_group; extern lck_grp_t * hfs_rwlock_group; -static int hfs_filedone(struct vnode *vp, vfs_context_t context); - static void hfs_reclaim_cnode(struct cnode *); - +static int hfs_cnode_teardown (struct vnode *vp, vfs_context_t ctx, int reclaim); static int hfs_isordered(struct cnode *, struct cnode *); -inline int hfs_checkdeleted (struct 
cnode *cp) { - return ((cp->c_flag & (C_DELETED | C_NOEXISTS)) ? ENOENT : 0); +__inline__ int hfs_checkdeleted (struct cnode *cp) { + return ((cp->c_flag & (C_DELETED | C_NOEXISTS)) ? ENOENT : 0); } /* - * Last reference to an cnode. If necessary, write or delete it. + * Function used by a special fcntl() that decorates a cnode/vnode that + * indicates it is backing another filesystem, like a disk image. + * + * the argument 'val' indicates whether or not to set the bit in the cnode flags + * + * Returns non-zero on failure. 0 on success */ -__private_extern__ -int -hfs_vnop_inactive(struct vnop_inactive_args *ap) -{ - struct vnode *vp = ap->a_vp; - struct cnode *cp; - struct hfsmount *hfsmp = VTOHFS(vp); - struct proc *p = vfs_context_proc(ap->a_context); - int error = 0; - int recycle = 0; - int forkcount = 0; - int truncated = 0; - int started_tr = 0; - int took_trunc_lock = 0; - cat_cookie_t cookie; - int cat_reserve = 0; - int lockflags; - enum vtype v_type; - - v_type = vnode_vtype(vp); +int hfs_set_backingstore (struct vnode *vp, int val) { + struct cnode *cp = NULL; + int err = 0; + cp = VTOC(vp); - - if ((hfsmp->hfs_flags & HFS_READ_ONLY) || vnode_issystem(vp) || - (hfsmp->hfs_freezing_proc == p)) { - return (0); + if (vnode_isdir(vp)) { + return EINVAL; } - /* - * Ignore nodes related to stale file handles. 
- * We are peeking at the cnode flag without the lock, but if C_NOEXISTS - * is set, that means the cnode doesn't have any backing store in the - * catalog anymore, and is otherwise safe to force a recycle - */ + /* lock the cnode */ + err = hfs_lock (cp, HFS_EXCLUSIVE_LOCK); + if (err) { + return err; + } - if (cp->c_flag & C_NOEXISTS) { - vnode_recycle(vp); - return (0); + if (val) { + cp->c_flag |= C_BACKINGSTORE; + } + else { + cp->c_flag &= ~C_BACKINGSTORE; } - if ((v_type == VREG || v_type == VLNK)) { - hfs_lock_truncate(cp, TRUE); - took_trunc_lock = 1; + /* unlock everything */ + hfs_unlock (cp); + + return err; +} + +/* + * Function used by a special fcntl() that check to see if a cnode/vnode + * indicates it is backing another filesystem, like a disk image. + * + * the argument 'val' is an output argument for whether or not the bit is set + * + * Returns non-zero on failure. 0 on success + */ + +int hfs_is_backingstore (struct vnode *vp, int *val) { + struct cnode *cp = NULL; + int err = 0; + + if (!vnode_isreg(vp)) { + *val = 0; + return 0; } - (void) hfs_lock(cp, HFS_FORCE_LOCK); + cp = VTOC(vp); + + /* lock the cnode */ + err = hfs_lock (cp, HFS_SHARED_LOCK); + if (err) { + return err; + } + + if (cp->c_flag & C_BACKINGSTORE) { + *val = 1; + } + else { + *val = 0; + } + + /* unlock everything */ + hfs_unlock (cp); - if (cp->c_datafork) + return err; +} + + +/* + * hfs_cnode_teardown + * + * This is an internal function that is invoked from both hfs_vnop_inactive + * and hfs_vnop_reclaim. As VNOP_INACTIVE is not necessarily called from vnodes + * being recycled and reclaimed, it is important that we do any post-processing + * necessary for the cnode in both places. Important tasks include things such as + * releasing the blocks from an open-unlinked file when all references to it have dropped, + * and handling resource forks separately from data forks. + * + * Note that we take only the vnode as an argument here (rather than the cnode). 
+ * Recall that each cnode supports two forks (rsrc/data), and we can always get the right + * cnode from either of the vnodes, but the reverse is not true -- we can't determine which + * vnode we need to reclaim if only the cnode is supplied. + * + * This function is idempotent and safe to call from both hfs_vnop_inactive and hfs_vnop_reclaim + * if both are invoked right after the other. In the second call, most of this function's if() + * conditions will fail, since they apply generally to cnodes still marked with C_DELETED. + * As a quick check to see if this function is necessary, determine if the cnode is already + * marked C_NOEXISTS. If it is, then it is safe to skip this function. The only tasks that + * remain for cnodes marked in such a fashion is to teardown their fork references and + * release all directory hints and hardlink origins. However, both of those are done + * in hfs_vnop_reclaim. hfs_update, by definition, is not necessary if the cnode's catalog + * entry is no longer there. + * + * 'reclaim' argument specifies whether or not we were called from hfs_vnop_reclaim. If we are + * invoked from hfs_vnop_reclaim, we can not call functions that cluster_push since the UBC info + * is totally gone by that point. + * + * Assumes that both truncate and cnode locks for 'cp' are held. 
+ */ +static +int hfs_cnode_teardown (struct vnode *vp, vfs_context_t ctx, int reclaim) { + + int forkcount = 0; + enum vtype v_type; + struct cnode *cp; + int error = 0; + int started_tr = 0; + struct hfsmount *hfsmp = VTOHFS(vp); + struct proc *p = vfs_context_proc(ctx); + int truncated = 0; + cat_cookie_t cookie; + int cat_reserve = 0; + int lockflags; + int ea_error = 0; + + v_type = vnode_vtype(vp); + cp = VTOC(vp); + + if (cp->c_datafork) { ++forkcount; - if (cp->c_rsrcfork) + } + if (cp->c_rsrcfork) { ++forkcount; - + } + + /* - * We should lock cnode before checking the flags in the - * condition below and should unlock the cnode before calling - * ubc_setsize() as cluster code can call other HFS vnops which - * will try to acquire the same cnode lock and cause deadlock. - * Only call ubc_setsize to 0 if we are the last fork. - */ - if ((v_type == VREG || v_type == VLNK) && + * Skip the call to ubc_setsize if we're being invoked on behalf of reclaim. + * The dirty regions would have already been synced to disk, so informing UBC + * that they can toss the pages doesn't help anyone at this point. + * + * Note that this is a performance problem if the vnode goes straight to reclaim + * (and skips inactive), since there would be no way for anyone to notify the UBC + * that all pages in this file are basically useless. + */ + if (reclaim == 0) { + /* + * Check whether we are tearing down a cnode with only one remaining fork. + * If there are blocks in its filefork, then we need to unlock the cnode + * before calling ubc_setsize. The cluster layer may re-enter the filesystem + * (i.e. VNOP_BLOCKMAP), and if we retain the cnode lock, we could double-lock + * panic. 
+ */ + + if ((v_type == VREG || v_type == VLNK) && (cp->c_flag & C_DELETED) && (VTOF(vp)->ff_blocks != 0) && (forkcount == 1)) { - hfs_unlock(cp); - ubc_setsize(vp, 0); - (void) hfs_lock(cp, HFS_FORCE_LOCK); + hfs_unlock(cp); + /* ubc_setsize just fails if we were to call this from VNOP_RECLAIM */ + ubc_setsize(vp, 0); + (void) hfs_lock(cp, HFS_FORCE_LOCK); + } } - - if (v_type == VREG && !ISSET(cp->c_flag, C_DELETED) && VTOF(vp)->ff_blocks) { - hfs_filedone(vp, ap->a_context); + + /* + * Push file data out for normal files that haven't been evicted from + * the namespace. We only do this if this function was not called from reclaim, + * because by that point the UBC information has been totally torn down. + * + * There should also be no way that a normal file that has NOT been deleted from + * the namespace to skip INACTIVE and go straight to RECLAIM. That race only happens + * when the file becomes open-unlinked. + */ + if ((v_type == VREG) && + (!ISSET(cp->c_flag, C_DELETED)) && + (!ISSET(cp->c_flag, C_NOEXISTS)) && + (VTOF(vp)->ff_blocks) && + (reclaim == 0)) { + hfs_filedone(vp, ctx); } /* * Remove any directory hints or cached origins @@ -145,12 +251,7 @@ hfs_vnop_inactive(struct vnop_inactive_args *ap) if (cp->c_flag & C_HARDLINK) { hfs_relorigins(cp); } - - /* Hurry the recycling process along if we're an open-unlinked file */ - if((v_type == VREG || v_type == VLNK) && (cp->c_flag & C_DELETED)) { - recycle = 1; - } - + /* * This check is slightly complicated. We should only truncate data * in very specific cases for open-unlinked files. This is because @@ -162,7 +263,7 @@ hfs_vnop_inactive(struct vnop_inactive_args *ap) * If we're the last fork, then we have cleaning up to do. * * A) last fork, and vp == c_vp - * Truncate away own fork dat. If rsrc fork is not in core, truncate it too. + * Truncate away own fork data. If rsrc fork is not in core, truncate it too. 
* * B) last fork, and vp == c_rsrc_vp * Truncate ourselves, assume data fork has been cleaned due to C). @@ -177,192 +278,320 @@ hfs_vnop_inactive(struct vnop_inactive_args *ap) * D) not the last fork, vp == c_rsrc_vp * Don't enter the block below, just clean up vnode and push it out of core. */ - - if ((v_type == VREG || v_type == VLNK) && (cp->c_flag & C_DELETED) && - ((forkcount == 1) || (!VNODE_IS_RSRC(vp)))) { + + if ((v_type == VREG || v_type == VLNK) && + (cp->c_flag & C_DELETED) && + ((forkcount == 1) || (!VNODE_IS_RSRC(vp)))) { + + /* Truncate away our own fork data. (Case A, B, C above) */ if (VTOF(vp)->ff_blocks != 0) { /* * Since we're already inside a transaction, * tell hfs_truncate to skip the ubc_setsize. + * + * This truncate call (and the one below) is fine from VNOP_RECLAIM's + * context because we're only removing blocks, not zero-filling new + * ones. The C_DELETED check above makes things much simpler. */ - error = hfs_truncate(vp, (off_t)0, IO_NDELAY, 1, 0, ap->a_context); - if (error) + error = hfs_truncate(vp, (off_t)0, IO_NDELAY, 1, 0, ctx); + if (error) { goto out; + } truncated = 1; } - + /* - * If c_blocks > 0 and we are the last fork (data fork), then - * we can go and and truncate away the rsrc fork blocks if - * they were not in core. + * Truncate away the resource fork, if we represent the data fork and + * it is the last fork. That means, by definition, the rsrc fork is not in + * core. So we bring it into core, and then truncate it away. + * + * This is invoked via case A above only. */ if ((cp->c_blocks > 0) && (forkcount == 1) && (vp != cp->c_rsrc_vp)) { struct vnode *rvp = NULLVP; - + + /* + * It is safe for us to pass FALSE to the argument can_drop_lock + * on this call to hfs_vgetrsrc. We know that the resource fork does not + * exist in core, so we'll have to go to the catalog to retrieve its + * information. That will attach the resource fork vnode to our cnode. 
+ */ error = hfs_vgetrsrc(hfsmp, vp, &rvp, FALSE, FALSE); - if (error) + if (error) { goto out; + } /* * Defer the vnode_put and ubc_setsize on rvp until hfs_unlock(). + * + * By bringing the vnode into core above, we may force hfs_vnop_reclaim + * to only partially finish if that's what called us. Bringing the + * resource fork into core results in a new rsrc vnode that will get + * immediately marked for termination below. It will get recycled/reclaimed + * as soon as possible, but that could cause another round of inactive and reclaim. */ cp->c_flag |= C_NEED_RVNODE_PUT | C_NEED_RSRC_SETSIZE; - error = hfs_truncate(rvp, (off_t)0, IO_NDELAY, 1, 0, ap->a_context); - if (error) + error = hfs_truncate(rvp, (off_t)0, IO_NDELAY, 1, 0, ctx); + if (error) { goto out; + } + + /* + * Note that the following call to vnode_recycle is safe from within the + * context of hfs_vnop_inactive or hfs_vnop_reclaim. It is being invoked + * on the RSRC fork vp (which is not our current vnode) As such, we hold + * an iocount on it and vnode_recycle will just add the MARKTERM bit at this + * point. + */ vnode_recycle(rvp); /* all done with this vnode */ } } - - // If needed, get rid of any xattrs that this file (or directory) may have. - // Note that this must happen outside of any other transactions - // because it starts/ends its own transactions and grabs its - // own locks. This is to prevent a file with a lot of attributes - // from creating a transaction that is too large (which panics). - // - if ((cp->c_attr.ca_recflags & kHFSHasAttributesMask) != 0 && - (cp->c_flag & C_DELETED) && (forkcount <= 1)) { - hfs_removeallattr(hfsmp, cp->c_fileid); - } - + /* - * Check for a postponed deletion. - * (only delete cnode when the last fork goes inactive) + * If we represent the last fork (or none in the case of a dir), + * and the cnode has become open-unlinked, + * AND it has EA's, then we need to get rid of them. 
+ * + * Note that this must happen outside of any other transactions + * because it starts/ends its own transactions and grabs its + * own locks. This is to prevent a file with a lot of attributes + * from creating a transaction that is too large (which panics). */ - if ((cp->c_flag & C_DELETED) && (forkcount <= 1)) { - /* - * Mark cnode in transit so that no one can get this - * cnode from cnode hash. - */ - // hfs_chash_mark_in_transit(hfsmp, cp); - // XXXdbg - remove the cnode from the hash table since it's deleted - // otherwise someone could go to sleep on the cnode and not - // be woken up until this vnode gets recycled which could be - // a very long time... - hfs_chashremove(hfsmp, cp); - - cp->c_flag |= C_NOEXISTS; // XXXdbg - cp->c_rdev = 0; - - if (started_tr == 0) { - if (hfs_start_transaction(hfsmp) != 0) { - error = EINVAL; - goto out; - } - started_tr = 1; - } + if ((cp->c_attr.ca_recflags & kHFSHasAttributesMask) != 0 && + (cp->c_flag & C_DELETED) && + (forkcount <= 1)) { - /* - * Reserve some space in the Catalog file. - */ - if ((error = cat_preflight(hfsmp, CAT_DELETE, &cookie, p))) { + ea_error = hfs_removeallattr(hfsmp, cp->c_fileid); + } + + + /* + * If the cnode represented an open-unlinked file, then now + * actually remove the cnode's catalog entry and release all blocks + * it may have been using. + */ + if ((cp->c_flag & C_DELETED) && (forkcount <= 1)) { + /* + * Mark cnode in transit so that no one can get this + * cnode from cnode hash. + */ + // hfs_chash_mark_in_transit(hfsmp, cp); + // XXXdbg - remove the cnode from the hash table since it's deleted + // otherwise someone could go to sleep on the cnode and not + // be woken up until this vnode gets recycled which could be + // a very long time... 
+ hfs_chashremove(hfsmp, cp); + + cp->c_flag |= C_NOEXISTS; // XXXdbg + cp->c_rdev = 0; + + if (started_tr == 0) { + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + goto out; + } + started_tr = 1; + } + + /* + * Reserve some space in the Catalog file. + */ + if ((error = cat_preflight(hfsmp, CAT_DELETE, &cookie, p))) { + goto out; + } + cat_reserve = 1; + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); + + if (cp->c_blocks > 0) { + printf("hfs_inactive: deleting non-empty%sfile %d, " + "blks %d\n", VNODE_IS_RSRC(vp) ? " rsrc " : " ", + (int)cp->c_fileid, (int)cp->c_blocks); + } + + // + // release the name pointer in the descriptor so that + // cat_delete() will use the file-id to do the deletion. + // in the case of hard links this is imperative (in the + // case of regular files the fileid and cnid are the + // same so it doesn't matter). + // + cat_releasedesc(&cp->c_desc); + + /* + * The descriptor name may be zero, + * in which case the fileid is used. + */ + error = cat_delete(hfsmp, &cp->c_desc, &cp->c_attr); + + if (error && truncated && (error != ENXIO)) + printf("hfs_inactive: couldn't delete a truncated file!"); + + /* Update HFS Private Data dir */ + if (error == 0) { + hfsmp->hfs_private_attr[FILE_HARDLINKS].ca_entries--; + if (vnode_isdir(vp)) { + DEC_FOLDERCOUNT(hfsmp, hfsmp->hfs_private_attr[FILE_HARDLINKS]); + } + (void)cat_update(hfsmp, &hfsmp->hfs_private_desc[FILE_HARDLINKS], + &hfsmp->hfs_private_attr[FILE_HARDLINKS], NULL, NULL); + } + + hfs_systemfile_unlock(hfsmp, lockflags); + + if (error) { goto out; } - cat_reserve = 1; - - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); - - if (cp->c_blocks > 0) { - printf("hfs_inactive: deleting non-empty%sfile %d, " - "blks %d\n", VNODE_IS_RSRC(vp) ? 
" rsrc " : " ", - (int)cp->c_fileid, (int)cp->c_blocks); - } - - // - // release the name pointer in the descriptor so that - // cat_delete() will use the file-id to do the deletion. - // in the case of hard links this is imperative (in the - // case of regular files the fileid and cnid are the - // same so it doesn't matter). - // - cat_releasedesc(&cp->c_desc); - /* - * The descriptor name may be zero, - * in which case the fileid is used. - */ - error = cat_delete(hfsmp, &cp->c_desc, &cp->c_attr); +#if QUOTA + if (hfsmp->hfs_flags & HFS_QUOTAS) + (void)hfs_chkiq(cp, -1, NOCRED, 0); +#endif /* QUOTA */ - if (error && truncated && (error != ENXIO)) - printf("hfs_inactive: couldn't delete a truncated file!"); - - /* Update HFS Private Data dir */ - if (error == 0) { - hfsmp->hfs_private_attr[FILE_HARDLINKS].ca_entries--; - if (vnode_isdir(vp)) { - DEC_FOLDERCOUNT(hfsmp, hfsmp->hfs_private_attr[FILE_HARDLINKS]); + /* Already set C_NOEXISTS at the beginning of this block */ + cp->c_flag &= ~C_DELETED; + cp->c_touch_chgtime = TRUE; + cp->c_touch_modtime = TRUE; + + if (error == 0) + hfs_volupdate(hfsmp, (v_type == VDIR) ? VOL_RMDIR : VOL_RMFILE, 0); + } + + /* + * A file may have had delayed allocations, in which case hfs_update + * would not have updated the catalog record (cat_update). We need + * to do that now, before we lose our fork data. We also need to + * force the update, or hfs_update will again skip the cat_update. + * + * If the file has C_NOEXISTS set, then we can skip the hfs_update call + * because the catalog entry has already been removed. 
There would be no point + * to looking up the entry in the catalog to modify it when we already know it's gone + */ + if ((!ISSET(cp->c_flag, C_NOEXISTS)) && + ((cp->c_flag & C_MODIFIED) || cp->c_touch_acctime || + cp->c_touch_chgtime || cp->c_touch_modtime)) { + + if ((cp->c_flag & C_MODIFIED) || cp->c_touch_modtime){ + cp->c_flag |= C_FORCEUPDATE; } - (void)cat_update(hfsmp, &hfsmp->hfs_private_desc[FILE_HARDLINKS], - &hfsmp->hfs_private_attr[FILE_HARDLINKS], NULL, NULL); + hfs_update(vp, 0); } + +out: + if (cat_reserve) + cat_postflight(hfsmp, &cookie, p); + + // XXXdbg - have to do this because a goto could have come here + if (started_tr) { + hfs_end_transaction(hfsmp); + started_tr = 0; + } + + + return error; +} - hfs_systemfile_unlock(hfsmp, lockflags); - - if (error) - goto out; -#if QUOTA - if (hfsmp->hfs_flags & HFS_QUOTAS) - (void)hfs_chkiq(cp, -1, NOCRED, 0); -#endif /* QUOTA */ - - /* Already set C_NOEXISTS at the beginning of this block */ - cp->c_flag &= ~C_DELETED; - cp->c_touch_chgtime = TRUE; - cp->c_touch_modtime = TRUE; - if (error == 0) - hfs_volupdate(hfsmp, (v_type == VDIR) ? VOL_RMDIR : VOL_RMFILE, 0); - } +/* + * hfs_vnop_inactive + * + * The last usecount on the vnode has gone away, so we need to tear down + * any remaining data still residing in the cnode. If necessary, write out + * remaining blocks or delete the cnode's entry in the catalog. + */ +int +hfs_vnop_inactive(struct vnop_inactive_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct cnode *cp; + struct hfsmount *hfsmp = VTOHFS(vp); + struct proc *p = vfs_context_proc(ap->a_context); + int error = 0; + int took_trunc_lock = 0; + enum vtype v_type; + + v_type = vnode_vtype(vp); + cp = VTOC(vp); + if ((hfsmp->hfs_flags & HFS_READ_ONLY) || vnode_issystem(vp) || + (hfsmp->hfs_freezing_proc == p)) { + error = 0; + goto inactive_done; + } + /* - * A file may have had delayed allocations, in which case hfs_update - * would not have updated the catalog record (cat_update). 
We need - * to do that now, before we lose our fork data. We also need to - * force the update, or hfs_update will again skip the cat_update. + * For safety, do NOT call vnode_recycle from inside this function. This can cause + * problems in the following scenario: + * + * vnode_create -> vnode_reclaim_internal -> vclean -> VNOP_INACTIVE + * + * If we're being invoked as a result of a reclaim that was already in-flight, then we + * cannot call vnode_recycle again. Being in reclaim means that there are no usecounts or + * iocounts by definition. As a result, if we were to call vnode_recycle, it would immediately + * try to re-enter reclaim again and panic. + * + * Currently, there are three things that can cause us (VNOP_INACTIVE) to get called. + * 1) last usecount goes away on the vnode (vnode_rele) + * 2) last iocount goes away on a vnode that previously had usecounts but didn't have + * vnode_recycle called (vnode_put) + * 3) vclean by way of reclaim + * + * In this function we would generally want to call vnode_recycle to speed things + * along to ensure that we don't leak blocks due to open-unlinked files. However, by + * virtue of being in this function already, we can call hfs_cnode_teardown, which + * will release blocks held by open-unlinked files, and mark them C_NOEXISTS so that + * there's no entry in the catalog and no backing store anymore. If that's the case, + * then we really don't care all that much when the vnode actually goes through reclaim. + * Further, the HFS VNOPs that manipulated the namespace in order to create the open- + * unlinked file in the first place should have already called vnode_recycle on the vnode + * to guarantee that it would go through reclaim in a speedy way. 
*/ - if ((cp->c_flag & C_MODIFIED) || - cp->c_touch_acctime || cp->c_touch_chgtime || cp->c_touch_modtime) { - if ((cp->c_flag & C_MODIFIED) || cp->c_touch_modtime){ - cp->c_flag |= C_FORCEUPDATE; - } - hfs_update(vp, 0); + + if (cp->c_flag & C_NOEXISTS) { + /* + * If the cnode has already had its cat entry removed, then + * just skip to the end. We don't need to do anything here. + */ + error = 0; + goto inactive_done; } -out: - if (cat_reserve) - cat_postflight(hfsmp, &cookie, p); - - // XXXdbg - have to do this because a goto could have come here - if (started_tr) { - hfs_end_transaction(hfsmp); - started_tr = 0; + + if ((v_type == VREG || v_type == VLNK)) { + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK); + took_trunc_lock = 1; } + + (void) hfs_lock(cp, HFS_FORCE_LOCK); + /* - * This has been removed from the namespace and has no backing store - * in the catalog, so we should force a reclaim as soon as possible. - * Also, we want to check the flag while we still have the cnode lock. + * Call cnode_teardown to push out dirty blocks to disk, release open-unlinked + * files' blocks from being in use, and move the cnode from C_DELETED to C_NOEXISTS. */ - if (cp->c_flag & C_NOEXISTS) - recycle = 1; + error = hfs_cnode_teardown (vp, ap->a_context, 0); + /* + * Drop the truncate lock before unlocking the cnode + * (which can potentially perform a vnode_put and + * recycle the vnode which in turn might require the + * truncate lock) + */ + if (took_trunc_lock) { + hfs_unlock_truncate(cp, 0); + } + hfs_unlock(cp); - - if (took_trunc_lock) - hfs_unlock_truncate(cp, TRUE); - - /* - * If we are done with the vnode, reclaim it - * so that it can be reused immediately. - */ - if (recycle) - vnode_recycle(vp); - - return (error); + +inactive_done: + + return error; } + /* * File clean-up (zero fill and shrink peof). 
*/ -static int + +int hfs_filedone(struct vnode *vp, vfs_context_t context) { struct cnode *cp; @@ -371,6 +600,8 @@ hfs_filedone(struct vnode *vp, vfs_context_t context) struct rl_entry *invalid_range; off_t leof; u_int32_t blks, blocksize; + int cluster_flags = IO_CLOSE; + int cluster_zero_flags = IO_HEADZEROFILL | IO_NOZERODIRTY | IO_NOCACHE; cp = VTOC(vp); fp = VTOF(vp); @@ -380,8 +611,18 @@ hfs_filedone(struct vnode *vp, vfs_context_t context) if ((hfsmp->hfs_flags & HFS_READ_ONLY) || (fp->ff_blocks == 0)) return (0); + /* + * If we are being invoked from F_SWAPDATAEXTENTS, then we + * need to issue synchronous IO; Unless we are sure that all + * of the data has been written to the disk, we won't know + * that all of the blocks have been allocated properly. + */ + if (cp->c_flag & C_SWAPINPROGRESS) { + cluster_flags |= IO_SYNC; + } + hfs_unlock(cp); - (void) cluster_push(vp, IO_CLOSE); + (void) cluster_push(vp, cluster_flags); hfs_lock(cp, HFS_FORCE_LOCK); /* @@ -400,8 +641,7 @@ hfs_filedone(struct vnode *vp, vfs_context_t context) hfs_unlock(cp); (void) cluster_write(vp, (struct uio *) 0, - leof, end + 1, start, (off_t)0, - IO_HEADZEROFILL | IO_NOZERODIRTY | IO_NOCACHE); + leof, end + 1, start, (off_t)0, cluster_zero_flags); hfs_lock(cp, HFS_FORCE_LOCK); cp->c_flag |= C_MODIFIED; } @@ -417,7 +657,7 @@ hfs_filedone(struct vnode *vp, vfs_context_t context) if (blks < fp->ff_blocks) (void) hfs_truncate(vp, leof, IO_NDELAY, 0, 0, context); hfs_unlock(cp); - (void) cluster_push(vp, IO_CLOSE); + (void) cluster_push(vp, cluster_flags); hfs_lock(cp, HFS_FORCE_LOCK); /* @@ -435,7 +675,6 @@ hfs_filedone(struct vnode *vp, vfs_context_t context) /* * Reclaim a cnode so that it can be used for other purposes. 
*/ -__private_extern__ int hfs_vnop_reclaim(struct vnop_reclaim_args *ap) { @@ -444,23 +683,30 @@ hfs_vnop_reclaim(struct vnop_reclaim_args *ap) struct filefork *fp = NULL; struct filefork *altfp = NULL; struct hfsmount *hfsmp = VTOHFS(vp); + vfs_context_t ctx = ap->a_context; int reclaim_cnode = 0; - - (void) hfs_lock(VTOC(vp), HFS_FORCE_LOCK); + int err = 0; + enum vtype v_type; + + v_type = vnode_vtype(vp); cp = VTOC(vp); - /* - * A file may have had delayed allocations, in which case hfs_update - * would not have updated the catalog record (cat_update). We need - * to do that now, before we lose our fork data. We also need to - * force the update, or hfs_update will again skip the cat_update. + /* + * We don't take the truncate lock since by the time reclaim comes along, + * all dirty pages have been synced and nobody should be competing + * with us for this thread. */ - if ((cp->c_flag & C_MODIFIED) || - cp->c_touch_acctime || cp->c_touch_chgtime || cp->c_touch_modtime) { - if ((cp->c_flag & C_MODIFIED) || cp->c_touch_modtime){ - cp->c_flag |= C_FORCEUPDATE; - } - hfs_update(vp, 0); + (void) hfs_lock (cp, HFS_FORCE_LOCK); + + /* + * Sync to disk any remaining data in the cnode/vnode. This includes + * a call to hfs_update if the cnode has outbound data. + * + * If C_NOEXISTS is set on the cnode, then there's nothing teardown needs to do + * because the catalog entry for this cnode is already gone. + */ + if (!ISSET(cp->c_flag, C_NOEXISTS)) { + err = hfs_cnode_teardown(vp, ctx, 1); } /* @@ -525,7 +771,12 @@ hfs_vnop_reclaim(struct vnop_reclaim_args *ap) if (reclaim_cnode) { hfs_chashwakeup(hfsmp, cp, H_ALLOC | H_TRANSIT); hfs_reclaim_cnode(cp); - } else /* cnode in use */ { + } + else { + /* + * cnode in use. If it is a directory, it could have + * no live forks. Just release the lock. 
+ */ hfs_unlock(cp); } @@ -546,7 +797,6 @@ extern int (**hfs_fifoop_p) (void *); * * The vnode is returned with an iocount and the cnode locked */ -__private_extern__ int hfs_getnewvnode( struct hfsmount *hfsmp, @@ -556,7 +806,8 @@ hfs_getnewvnode( int flags, struct cat_attr *attrp, struct cat_fork *forkp, - struct vnode **vpp) + struct vnode **vpp, + int *out_flags) { struct mount *mp = HFSTOVFS(hfsmp); struct vnode *vp = NULL; @@ -568,12 +819,13 @@ hfs_getnewvnode( int retval; int issystemfile; int wantrsrc; + int hflags = 0; struct vnode_fsparam vfsp; enum vtype vtype; #if QUOTA int i; #endif /* QUOTA */ - + hfs_standard = (hfsmp->hfs_flags & HFS_STANDARD); if (attrp->ca_fileid == 0) { @@ -591,6 +843,9 @@ hfs_getnewvnode( issystemfile = (descp->cd_flags & CD_ISMETA) && (vtype == VREG); wantrsrc = flags & GNV_WANTRSRC; + /* Zero out the out_flags */ + *out_flags = 0; + #ifdef HFS_CHECK_LOCK_ORDER /* * The only case were its permissible to hold the parent cnode @@ -607,7 +862,8 @@ hfs_getnewvnode( /* * Get a cnode (new or existing) */ - cp = hfs_chash_getcnode(hfsmp, attrp->ca_fileid, vpp, wantrsrc, (flags & GNV_SKIPLOCK)); + cp = hfs_chash_getcnode(hfsmp, attrp->ca_fileid, vpp, wantrsrc, + (flags & GNV_SKIPLOCK), out_flags, &hflags); /* * If the id is no longer valid for lookups we'll get back a NULL cp. @@ -615,20 +871,76 @@ hfs_getnewvnode( if (cp == NULL) { return (ENOENT); } - + /* - * Hardlinks may need an updated catalog descriptor. However, if - * the cnode has already been marked as open-unlinked (C_DELETED), then don't - * replace its descriptor. + * If we get a cnode/vnode pair out of hfs_chash_getcnode, then update the + * descriptor in the cnode as needed if the cnode represents a hardlink. + * We want the caller to get the most up-to-date copy of the descriptor + * as possible. However, we only do anything here if there was a valid vnode. 
+ * If there isn't a vnode, then the cnode is brand new and needs to be initialized + * as it doesn't have a descriptor or cat_attr yet. + * + * If we are about to replace the descriptor with the user-supplied one, then validate + * that the descriptor correctly acknowledges this item is a hardlink. We could be + * subject to a race where the calling thread invoked cat_lookup, got a valid lookup + * result but the file was not yet a hardlink. With sufficient delay between there + * and here, we might accidentally copy in the raw inode ID into the descriptor in the + * call below. If the descriptor's CNID is the same as the fileID then it must + * not yet have been a hardlink when the lookup occurred. */ + if (!(hfs_checkdeleted(cp))) { if ((cp->c_flag & C_HARDLINK) && descp->cd_nameptr && descp->cd_namelen > 0) { - replace_desc(cp, descp); + /* If cnode is uninitialized, its c_attr will be zeroed out; cnids wont match. */ + if ((descp->cd_cnid == cp->c_attr.ca_fileid) && + (attrp->ca_linkcount != cp->c_attr.ca_linkcount)){ + if ((flags & GNV_SKIPLOCK) == 0) { + /* + * Then we took the lock. Drop it before calling + * vnode_put, which may invoke hfs_vnop_inactive and need to take + * the cnode lock again. + */ + hfs_unlock(cp); + } + + /* + * Emit ERECYCLE and GNV_CAT_ATTRCHANGED to + * force a re-drive in the lookup routine. + * Drop the iocount on the vnode obtained from + * chash_getcnode if needed. + */ + if (*vpp != NULL) { + vnode_put (*vpp); + *vpp = NULL; + } + + /* + * If we raced with VNOP_RECLAIM for this vnode, the hash code could + * have observed it after the c_vp or c_rsrc_vp fields had been torn down; + * the hash code peeks at those fields without holding the cnode lock because + * it needs to be fast. As a result, we may have set H_ATTACH in the chash + * call above. Since we're bailing out, unset whatever flags we just set, and + * wake up all waiters for this cnode. 
+ */ + if (hflags) { + hfs_chashwakeup(hfsmp, cp, hflags); + } + + *out_flags = GNV_CAT_ATTRCHANGED; + return ERECYCLE; + } + else { + /* Otherwise, CNID != fileid. Go ahead and copy in the new descriptor */ + replace_desc(cp, descp); + } } } + + /* Check if we found a matching vnode */ - if (*vpp != NULL) + if (*vpp != NULL) { return (0); + } /* * If this is a new cnode then initialize it. @@ -640,12 +952,38 @@ hfs_getnewvnode( #endif /* Make sure its still valid (ie exists on disk). */ - if (!(flags & GNV_CREATE) && - !hfs_valid_cnode(hfsmp, dvp, (wantrsrc ? NULL : cnp), cp->c_fileid)) { - hfs_chash_abort(hfsmp, cp); - hfs_reclaim_cnode(cp); - *vpp = NULL; - return (ENOENT); + if (!(flags & GNV_CREATE)) { + int error = 0; + if (!hfs_valid_cnode (hfsmp, dvp, (wantrsrc ? NULL : cnp), cp->c_fileid, attrp, &error)) { + hfs_chash_abort(hfsmp, cp); + hfs_reclaim_cnode(cp); + *vpp = NULL; + /* + * If we hit this case, that means that the entry was there in the catalog when + * we did a cat_lookup earlier. Think hfs_lookup. However, in between the time + * that we checked the catalog and the time we went to get a vnode/cnode for it, + * it had been removed from the namespace and the vnode totally reclaimed. As a result, + * it's not there in the catalog during the check in hfs_valid_cnode and we bubble out + * an ENOENT. To indicate to the caller that they should really double-check the + * entry (it could have been renamed over and gotten a new fileid), we mark a bit + * in the output flags. + */ + if (error == ENOENT) { + *out_flags = GNV_CAT_DELETED; + return ENOENT; + } + + /* + * Also, we need to protect the cat_attr acquired during hfs_lookup and passed into + * this function as an argument because the catalog may have changed w.r.t hardlink + * link counts and the firstlink field. If that validation check fails, then let + * lookup re-drive itself to get valid/consistent data with the same failure condition below. 
+ */ + if (error == ERECYCLE) { + *out_flags = GNV_CAT_ATTRCHANGED; + return (ERECYCLE); + } + } } bcopy(attrp, &cp->c_attr, sizeof(struct cat_attr)); bcopy(descp, &cp->c_desc, sizeof(struct cat_desc)); @@ -695,6 +1033,8 @@ hfs_getnewvnode( cp->c_dquot[i] = NODQUOT; } #endif /* QUOTA */ + /* Mark the output flag that we're vending a new cnode */ + *out_flags |= GNV_NEW_CNODE; } if (vtype == VDIR) { @@ -802,7 +1142,7 @@ hfs_getnewvnode( vfsp.vnfs_filesize = 0; vfsp.vnfs_flags = VNFS_ADDFSREF; - if (dvp == NULLVP || cnp == NULL || !(cnp->cn_flags & MAKEENTRY)) + if (dvp == NULLVP || cnp == NULL || !(cnp->cn_flags & MAKEENTRY) || (flags & GNV_NOCACHE)) vfsp.vnfs_flags |= VNFS_NOCACHE; /* Tag system files */ @@ -868,6 +1208,11 @@ hfs_getnewvnode( if (!(flags & GNV_CREATE) && (vtype != VDIR) && !issystemfile) { (void) hfs_removehotfile(vp); } + +#if CONFIG_PROTECT + if (!issystemfile && (*out_flags & GNV_NEW_CNODE)) + cp_entry_init(cp, mp); +#endif *vpp = vp; return (0); @@ -900,7 +1245,16 @@ hfs_reclaim_cnode(struct cnode *cp) cp->c_desc.cd_namelen = 0; vfs_removename(nameptr); } - + + /* + * We only call this function if we are in hfs_vnop_reclaim and + * attempting to reclaim a cnode with only one live fork. Because the vnode + * went through reclaim, any future attempts to use this item will have to + * go through lookup again, which will need to create a new vnode. Thus, + * destroying the locks below (while they were still held during our parent + * function hfs_vnop_reclaim) is safe. 
+ */ + lck_rw_destroy(&cp->c_rwlock, hfs_rwlock_group); lck_rw_destroy(&cp->c_truncatelock, hfs_rwlock_group); #if HFS_COMPRESSION @@ -909,14 +1263,27 @@ hfs_reclaim_cnode(struct cnode *cp) FREE_ZONE(cp->c_decmp, sizeof(*(cp->c_decmp)), M_DECMPFS_CNODE); } #endif +#if CONFIG_PROTECT + cp_entry_destroy(cp); +#endif + + bzero(cp, sizeof(struct cnode)); FREE_ZONE(cp, sizeof(struct cnode), M_HFSNODE); } -__private_extern__ +/* + * hfs_valid_cnode + * + * This function is used to validate data that is stored in-core against what is contained + * in the catalog. Common uses include validating that the parent-child relationship still exist + * for a specific directory entry (guaranteeing it has not been renamed into a different spot) at + * the point of the check. + */ int -hfs_valid_cnode(struct hfsmount *hfsmp, struct vnode *dvp, struct componentname *cnp, cnid_t cnid) +hfs_valid_cnode(struct hfsmount *hfsmp, struct vnode *dvp, struct componentname *cnp, + cnid_t cnid, struct cat_attr *cattr, int *error) { struct cat_attr attr; struct cat_desc cndesc; @@ -924,34 +1291,181 @@ hfs_valid_cnode(struct hfsmount *hfsmp, struct vnode *dvp, struct componentname int lockflags; /* System files are always valid */ - if (cnid < kHFSFirstUserCatalogNodeID) + if (cnid < kHFSFirstUserCatalogNodeID) { + *error = 0; return (1); + } /* XXX optimization: check write count in dvp */ lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); if (dvp && cnp) { + int lookup = 0; + struct cat_fork fork; + bzero(&cndesc, sizeof(cndesc)); cndesc.cd_nameptr = (const u_int8_t *)cnp->cn_nameptr; cndesc.cd_namelen = cnp->cn_namelen; cndesc.cd_parentcnid = VTOC(dvp)->c_fileid; cndesc.cd_hint = VTOC(dvp)->c_childhint; - if ((cat_lookup(hfsmp, &cndesc, 0, NULL, &attr, NULL, NULL) == 0) && - (cnid == attr.ca_fileid)) { + /* + * We have to be careful when calling cat_lookup. 
The result argument + * 'attr' may get different results based on whether or not you ask + * for the filefork to be supplied as output. This is because cat_lookupbykey + * will attempt to do basic validation/smoke tests against the resident + * extents if there are no overflow extent records, but it needs someplace + * in memory to store the on-disk fork structures. + * + * Since hfs_lookup calls cat_lookup with a filefork argument, we should + * do the same here, to verify that block count differences are not + * due to calling the function with different styles. cat_lookupbykey + * will request the volume be fsck'd if there is true on-disk corruption + * where the number of blocks does not match the number generated by + * summing the number of blocks in the resident extents. + */ + + lookup = cat_lookup (hfsmp, &cndesc, 0, NULL, &attr, &fork, NULL); + if ((lookup == 0) && (cnid == attr.ca_fileid)) { stillvalid = 1; + *error = 0; + } + else { + *error = ENOENT; + } + + /* + * In hfs_getnewvnode, we may encounter a time-of-check vs. time-of-vnode creation + * race. Specifically, if there is no vnode/cnode pair for the directory entry + * being looked up, we have to go to the catalog. But since we don't hold any locks (aside + * from the dvp in 'shared' mode) there is nothing to protect us against the catalog record + * changing in between the time we do the cat_lookup there and the time we re-grab the + * catalog lock above to do another cat_lookup. + * + * However, we need to check more than just the CNID and parent-child name relationships above. + * Hardlinks can suffer the same race in the following scenario: Suppose we do a + * cat_lookup, and find a leaf record and a raw inode for a hardlink. Now, we have + * the cat_attr in hand (passed in above). But in between then and now, the vnode was + * created by a competing hfs_getnewvnode call, and is manipulated and reclaimed before we get + * a chance to do anything. 
This is possible if there are a lot of threads thrashing around + * with the cnode hash. In this case, if we don't check/validate the cat_attr in-hand, we will + * blindly stuff it into the cnode, which will make the in-core data inconsistent with what is + * on disk. So validate the cat_attr below, if required. This race cannot happen if the cnode/vnode + * already exists, as it does in the case of rename and delete. + */ + if (stillvalid && cattr != NULL) { + if (cattr->ca_linkcount != attr.ca_linkcount) { + stillvalid = 0; + *error = ERECYCLE; + goto notvalid; + } + + if (cattr->ca_union1.cau_linkref != attr.ca_union1.cau_linkref) { + stillvalid = 0; + *error = ERECYCLE; + goto notvalid; + } + + if (cattr->ca_union3.cau_firstlink != attr.ca_union3.cau_firstlink) { + stillvalid = 0; + *error = ERECYCLE; + goto notvalid; + } + + if (cattr->ca_union2.cau_blocks != attr.ca_union2.cau_blocks) { + stillvalid = 0; + *error = ERECYCLE; + goto notvalid; + } } } else { if (cat_idlookup(hfsmp, cnid, 0, NULL, NULL, NULL) == 0) { stillvalid = 1; + *error = 0; + } + else { + *error = ENOENT; } } +notvalid: hfs_systemfile_unlock(hfsmp, lockflags); return (stillvalid); } +/* + * Per HI and Finder requirements, HFS should add in the + * date/time that a particular directory entry was added + * to the containing directory. + * This is stored in the extended Finder Info for the + * item in question. + * + * Note that this field is also set explicitly in the hfs_vnop_setxattr code. + * We must ignore user attempts to set this part of the finderinfo, and + * so we need to save a local copy of the date added, write in the user + * finderinfo, then stuff the value back in. 
+ */ +void hfs_write_dateadded (struct cat_attr *attrp, u_int32_t dateadded) { + u_int8_t *finfo = NULL; + + /* overlay the FinderInfo to the correct pointer, and advance */ + finfo = (u_int8_t*)attrp->ca_finderinfo; + finfo = finfo + 16; + + /* + * Make sure to write it out as big endian, since that's how + * finder info is defined. + * + * NOTE: This is a Unix-epoch timestamp, not a HFS/Traditional Mac timestamp. + */ + if (S_ISREG(attrp->ca_mode)) { + struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)finfo; + extinfo->date_added = OSSwapHostToBigInt32(dateadded); + attrp->ca_recflags |= kHFSHasDateAddedMask; + } + else if (S_ISDIR(attrp->ca_mode)) { + struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)finfo; + extinfo->date_added = OSSwapHostToBigInt32(dateadded); + attrp->ca_recflags |= kHFSHasDateAddedMask; + } + + /* If it were neither directory/file, then we'd bail out */ + return; +} + +u_int32_t hfs_get_dateadded (struct cnode *cp) { + u_int8_t *finfo = NULL; + u_int32_t dateadded = 0; + + if ((cp->c_attr.ca_recflags & kHFSHasDateAddedMask) == 0) { + /* Date added was never set. Return 0. */ + return dateadded; + } + + + /* overlay the FinderInfo to the correct pointer, and advance */ + finfo = (u_int8_t*)cp->c_finderinfo; + finfo = finfo + 16; + + /* + * FinderInfo is written out in big endian... make sure to convert it to host + * native before we use it. 
+ */ + if (S_ISREG(cp->c_attr.ca_mode)) { + struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)finfo; + dateadded = OSSwapBigToHostInt32 (extinfo->date_added); + } + else if (S_ISDIR(cp->c_attr.ca_mode)) { + struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)finfo; + dateadded = OSSwapBigToHostInt32 (extinfo->date_added); + } + + return dateadded; +} + + + /* * Touch cnode times based on c_touch_xxx flags * @@ -959,21 +1473,23 @@ hfs_valid_cnode(struct hfsmount *hfsmp, struct vnode *dvp, struct componentname * * This will also update the volume modify time */ -__private_extern__ void hfs_touchtimes(struct hfsmount *hfsmp, struct cnode* cp) { + vfs_context_t ctx; /* don't modify times if volume is read-only */ if (hfsmp->hfs_flags & HFS_READ_ONLY) { cp->c_touch_acctime = FALSE; cp->c_touch_chgtime = FALSE; cp->c_touch_modtime = FALSE; + return; } else if (hfsmp->hfs_flags & HFS_STANDARD) { /* HFS Standard doesn't support access times */ cp->c_touch_acctime = FALSE; } + ctx = vfs_context_current(); /* * Skip access time updates if: * . 
MNT_NOATIME is set @@ -985,10 +1501,13 @@ hfs_touchtimes(struct hfsmount *hfsmp, struct cnode* cp) if ((vfs_flags(hfsmp->hfs_mp) & MNT_NOATIME) || (hfsmp->hfs_freezing_proc != NULL) || (hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) || - (cp->c_vp && vnode_israge(cp->c_vp))) + (cp->c_vp && ((vnode_israge(cp->c_vp) || (vfs_ctx_skipatime(ctx)))))) { + cp->c_touch_acctime = FALSE; + } } - if (cp->c_touch_acctime || cp->c_touch_chgtime || cp->c_touch_modtime) { + if (cp->c_touch_acctime || cp->c_touch_chgtime || + cp->c_touch_modtime || (cp->c_flag & C_NEEDS_DATEADDED)) { struct timeval tv; int touchvol = 0; @@ -1027,6 +1546,14 @@ hfs_touchtimes(struct hfsmount *hfsmp, struct cnode* cp) cp->c_flag |= C_MODIFIED; touchvol = 1; } + + if (cp->c_flag & C_NEEDS_DATEADDED) { + hfs_write_dateadded (&(cp->c_attr), tv.tv_sec); + cp->c_flag |= C_MODIFIED; + /* untwiddle the bit */ + cp->c_flag &= ~C_NEEDS_DATEADDED; + touchvol = 1; + } /* Touch the volume modtime if needed */ if (touchvol) { @@ -1039,7 +1566,6 @@ hfs_touchtimes(struct hfsmount *hfsmp, struct cnode* cp) /* * Lock a cnode. */ -__private_extern__ int hfs_lock(struct cnode *cp, enum hfslocktype locktype) { @@ -1122,7 +1648,6 @@ hfs_lock(struct cnode *cp, enum hfslocktype locktype) /* * Lock a pair of cnodes. */ -__private_extern__ int hfs_lockpair(struct cnode *cp1, struct cnode *cp2, enum hfslocktype locktype) { @@ -1182,7 +1707,6 @@ hfs_isordered(struct cnode *cp1, struct cnode *cp2) * - only one lock taken per cnode (dup cnodes are skipped) * - some of the cnode pointers may be null */ -__private_extern__ int hfs_lockfour(struct cnode *cp1, struct cnode *cp2, struct cnode *cp3, struct cnode *cp4, enum hfslocktype locktype, struct cnode **error_cnode) @@ -1245,7 +1769,6 @@ hfs_lockfour(struct cnode *cp1, struct cnode *cp2, struct cnode *cp3, /* * Unlock a cnode. */ -__private_extern__ void hfs_unlock(struct cnode *cp) { @@ -1299,7 +1822,6 @@ hfs_unlock(struct cnode *cp) /* * Unlock a pair of cnodes. 
*/ -__private_extern__ void hfs_unlockpair(struct cnode *cp1, struct cnode *cp2) { @@ -1311,7 +1833,6 @@ hfs_unlockpair(struct cnode *cp1, struct cnode *cp2) /* * Unlock a group of cnodes. */ -__private_extern__ void hfs_unlockfour(struct cnode *cp1, struct cnode *cp2, struct cnode *cp3, struct cnode *cp4) { @@ -1358,34 +1879,119 @@ hfs_unlockfour(struct cnode *cp1, struct cnode *cp2, struct cnode *cp3, struct c * * The process doing a truncation must take the lock * exclusive. The read/write processes can take it - * non-exclusive. + * shared. The locktype argument is the same as supplied to + * hfs_lock. */ -__private_extern__ void -hfs_lock_truncate(struct cnode *cp, int exclusive) +hfs_lock_truncate(struct cnode *cp, enum hfslocktype locktype) { -#ifdef HFS_CHECK_LOCK_ORDER - if (cp->c_lockowner == current_thread()) - panic("hfs_lock_truncate: cnode %p locked!", cp); -#endif /* HFS_CHECK_LOCK_ORDER */ + void * thread = current_thread(); - if (exclusive) - lck_rw_lock_exclusive(&cp->c_truncatelock); - else + if (cp->c_truncatelockowner == thread) { + /* + * Only HFS_RECURSE_TRUNCLOCK is allowed to recurse. + * + * This is needed on the hfs_vnop_pagein path where we need to ensure + * the file does not change sizes while we are paging in. However, + * we may already hold the lock exclusive due to another + * VNOP from earlier in the call stack. So if we already hold + * the truncate lock exclusive, allow it to proceed, but ONLY if + * it's in the recursive case. 
+ */ + if (locktype != HFS_RECURSE_TRUNCLOCK) { + panic("hfs_lock_truncate: cnode %p locked!", cp); + } + } + /* HFS_RECURSE_TRUNCLOCK takes a shared lock if it is not already locked */ + else if ((locktype == HFS_SHARED_LOCK) || (locktype == HFS_RECURSE_TRUNCLOCK)) { lck_rw_lock_shared(&cp->c_truncatelock); + cp->c_truncatelockowner = HFS_SHARED_OWNER; + } + else { /* must be an HFS_EXCLUSIVE_LOCK */ + lck_rw_lock_exclusive(&cp->c_truncatelock); + cp->c_truncatelockowner = thread; + } } -__private_extern__ -void -hfs_unlock_truncate(struct cnode *cp, int exclusive) -{ - if (exclusive) { - lck_rw_unlock_exclusive(&cp->c_truncatelock); - } else { - lck_rw_unlock_shared(&cp->c_truncatelock); - } + +/* + * Attempt to get the truncate lock. If it cannot be acquired, error out. + * This function is needed in the degenerate hfs_vnop_pagein during force unmount + * case. To prevent deadlocks while a VM copy object is moving pages, HFS vnop pagein will + * temporarily need to disable V2 semantics. + */ +int hfs_try_trunclock (struct cnode *cp, enum hfslocktype locktype) { + void * thread = current_thread(); + boolean_t didlock = false; + + if (cp->c_truncatelockowner == thread) { + /* + * Only HFS_RECURSE_TRUNCLOCK is allowed to recurse. + * + * This is needed on the hfs_vnop_pagein path where we need to ensure + * the file does not change sizes while we are paging in. However, + * we may already hold the lock exclusive due to another + * VNOP from earlier in the call stack. So if we already hold + * the truncate lock exclusive, allow it to proceed, but ONLY if + * it's in the recursive case. 
+ */ + if (locktype != HFS_RECURSE_TRUNCLOCK) { + panic("hfs_lock_truncate: cnode %p locked!", cp); + } + } + /* HFS_RECURSE_TRUNCLOCK takes a shared lock if it is not already locked */ + else if ((locktype == HFS_SHARED_LOCK) || (locktype == HFS_RECURSE_TRUNCLOCK)) { + didlock = lck_rw_try_lock(&cp->c_truncatelock, LCK_RW_TYPE_SHARED); + if (didlock) { + cp->c_truncatelockowner = HFS_SHARED_OWNER; + } + } + else { /* must be an HFS_EXCLUSIVE_LOCK */ + didlock = lck_rw_try_lock (&cp->c_truncatelock, LCK_RW_TYPE_EXCLUSIVE); + if (didlock) { + cp->c_truncatelockowner = thread; + } + } + + return didlock; } +/* + * Unlock the truncate lock, which protects against size changes. + * + * The been_recursed argument is used when we may need to return + * from this function without actually unlocking the truncate lock. + */ +void +hfs_unlock_truncate(struct cnode *cp, int been_recursed) +{ + void *thread = current_thread(); + /* + * If been_recursed is nonzero AND the current lock owner of the + * truncate lock is our current thread, then we must have recursively + * taken the lock earlier on. If the lock were unlocked, + * HFS_RECURSE_TRUNCLOCK took a shared lock and it would fall through + * to the SHARED case below. + * + * If been_recursed is zero (most of the time) then we check the + * lockowner field to infer whether the lock was taken exclusively or + * shared in order to know what underlying lock routine to call. 
+ */ + if (been_recursed) { + if (cp->c_truncatelockowner == thread) { + return; + } + } + /* HFS_LOCK_EXCLUSIVE */ + if (thread == cp->c_truncatelockowner) { + cp->c_truncatelockowner = NULL; + lck_rw_unlock_exclusive(&cp->c_truncatelock); + } + /* HFS_LOCK_SHARED */ + else { + lck_rw_unlock_shared(&cp->c_truncatelock); + } +} diff --git a/bsd/hfs/hfs_cnode.h b/bsd/hfs/hfs_cnode.h index 9ffb9a8ca..73c2f664a 100644 --- a/bsd/hfs/hfs_cnode.h +++ b/bsd/hfs/hfs_cnode.h @@ -45,6 +45,10 @@ #if HFS_COMPRESSION #include #endif +#if CONFIG_PROTECT +#include +#endif + /* * The filefork is used to represent an HFS file fork (data or resource). @@ -106,6 +110,7 @@ struct cnode { lck_rw_t c_rwlock; /* cnode's lock */ void * c_lockowner; /* cnode's lock owner (exclusive case only) */ lck_rw_t c_truncatelock; /* protects file from truncation during read/write */ + void * c_truncatelockowner; /* truncate lock owner (exclusive case only) */ LIST_ENTRY(cnode) c_hash; /* cnode's hash chain */ u_int32_t c_flag; /* cnode's runtime flags */ u_int32_t c_hflag; /* cnode's flags for maintaining hash - protected by global hash lock */ @@ -132,6 +137,10 @@ struct cnode { #if HFS_COMPRESSION decmpfs_cnode *c_decmp; #endif /* HFS_COMPRESSION */ +#if CONFIG_PROTECT + cprotect_t c_cpentry; /* content protection data */ +#endif + }; typedef struct cnode cnode_t; @@ -183,13 +192,16 @@ typedef struct cnode cnode_t; #define C_FORCEUPDATE 0x00100 /* force the catalog entry update */ #define C_HASXATTRS 0x00200 /* cnode has extended attributes */ #define C_NEG_ENTRIES 0x00400 /* directory has negative name entries */ -#define C_WARNED_RSRC 0x00800 /* cnode lookup warning has been issued */ +#define C_SWAPINPROGRESS 0x00800 /* cnode's data is about to be swapped. 
Issue synchronous cluster io */ #define C_NEED_DATA_SETSIZE 0x01000 /* Do a ubc_setsize(0) on c_rsrc_vp after the unlock */ #define C_NEED_RSRC_SETSIZE 0x02000 /* Do a ubc_setsize(0) on c_vp after the unlock */ #define C_DIR_MODIFICATION 0x04000 /* Directory is being modified, wait for lookups */ #define C_ALWAYS_ZEROFILL 0x08000 /* Always zero-fill the file on an fsync */ +#define C_RENAMED 0x10000 /* cnode was deleted as part of rename; C_DELETED should also be set */ +#define C_NEEDS_DATEADDED 0x20000 /* cnode needs date-added written to the finderinfo bit */ +#define C_BACKINGSTORE 0x40000 /* cnode is a backing store for an existing or currently-mounting filesystem */ #define ZFTIMELIMIT (5 * 60) /* @@ -236,7 +248,7 @@ enum { kFinderInvisibleMask = 1 << 14 }; * upon the VNOP in question. Sometimes it is OK to use an open-unlinked file, for example, in, * reading. But other times, such as on the source of a VNOP_RENAME, it should be disallowed. */ -int hfs_checkdeleted (struct cnode *cp); +int hfs_checkdeleted(struct cnode *cp); /* * Test for a resource fork @@ -271,16 +283,28 @@ struct hfsfid { /* Get new default vnode */ extern int hfs_getnewvnode(struct hfsmount *hfsmp, struct vnode *dvp, struct componentname *cnp, struct cat_desc *descp, int flags, struct cat_attr *attrp, - struct cat_fork *forkp, struct vnode **vpp); + struct cat_fork *forkp, struct vnode **vpp, int *out_flags); +/* Input flags for hfs_getnewvnode */ #define GNV_WANTRSRC 0x01 /* Request the resource fork vnode. */ #define GNV_SKIPLOCK 0x02 /* Skip taking the cnode lock (when getting resource fork). */ #define GNV_CREATE 0x04 /* The vnode is for a newly created item. 
*/ +#define GNV_NOCACHE 0x08 /* Delay entering this item in the name cache */ +/* Output flags for hfs_getnewvnode */ +#define GNV_CHASH_RENAMED 0x01 /* The cnode was renamed in-flight */ +#define GNV_CAT_DELETED 0x02 /* The cnode was deleted from the catalog */ +#define GNV_NEW_CNODE 0x04 /* We are vending out a newly initialized cnode */ +#define GNV_CAT_ATTRCHANGED 0x08 /* Something in struct cat_attr changed in between cat_lookups */ /* Touch cnode times based on c_touch_xxx flags */ extern void hfs_touchtimes(struct hfsmount *, struct cnode *); +extern void hfs_write_dateadded (struct cat_attr *cattrp, u_int32_t dateadded); +extern u_int32_t hfs_get_dateadded (struct cnode *cp); + +/* Zero-fill file and push regions out to disk */ +extern int hfs_filedone(struct vnode *vp, vfs_context_t context); /* * HFS cnode hash functions. @@ -294,11 +318,14 @@ extern void hfs_chash_rehash(struct hfsmount *hfsmp, struct cnode *cp1, struct extern void hfs_chashwakeup(struct hfsmount *hfsmp, struct cnode *cp, int flags); extern void hfs_chash_mark_in_transit(struct hfsmount *hfsmp, struct cnode *cp); -extern struct vnode * hfs_chash_getvnode(struct hfsmount *hfsmp, ino_t inum, int wantrsrc, int skiplock); -extern struct cnode * hfs_chash_getcnode(struct hfsmount *hfsmp, ino_t inum, struct vnode **vpp, int wantrsrc, int skiplock); +extern struct vnode * hfs_chash_getvnode(struct hfsmount *hfsmp, ino_t inum, int wantrsrc, + int skiplock, int allow_deleted); +extern struct cnode * hfs_chash_getcnode(struct hfsmount *hfsmp, ino_t inum, struct vnode **vpp, + int wantrsrc, int skiplock, int *out_flags, int *hflags); extern int hfs_chash_snoop(struct hfsmount *, ino_t, int (*)(const struct cat_desc *, const struct cat_attr *, void *), void *); -extern int hfs_valid_cnode(struct hfsmount *hfsmp, struct vnode *dvp, struct componentname *cnp, cnid_t cnid); +extern int hfs_valid_cnode(struct hfsmount *hfsmp, struct vnode *dvp, struct componentname *cnp, + cnid_t cnid, struct cat_attr 
*cattr, int *error); extern int hfs_chash_set_childlinkbit(struct hfsmount *hfsmp, cnid_t cnid); @@ -319,20 +346,22 @@ extern int hfs_chash_set_childlinkbit(struct hfsmount *hfsmp, cnid_t cnid); * 5. hfs mount point (always last) * */ -enum hfslocktype {HFS_SHARED_LOCK = 1, HFS_EXCLUSIVE_LOCK = 2, HFS_FORCE_LOCK = 3}; +enum hfslocktype {HFS_SHARED_LOCK = 1, HFS_EXCLUSIVE_LOCK = 2, HFS_FORCE_LOCK = 3, HFS_RECURSE_TRUNCLOCK = 4}; #define HFS_SHARED_OWNER (void *)0xffffffff -extern int hfs_lock(struct cnode *, enum hfslocktype); -extern int hfs_lockpair(struct cnode *, struct cnode *, enum hfslocktype); -extern int hfs_lockfour(struct cnode *, struct cnode *, struct cnode *, struct cnode *, +int hfs_lock(struct cnode *, enum hfslocktype); +int hfs_lockpair(struct cnode *, struct cnode *, enum hfslocktype); +int hfs_lockfour(struct cnode *, struct cnode *, struct cnode *, struct cnode *, enum hfslocktype, struct cnode **); -extern void hfs_unlock(struct cnode *); -extern void hfs_unlockpair(struct cnode *, struct cnode *); -extern void hfs_unlockfour(struct cnode *, struct cnode *, struct cnode *, struct cnode *); +void hfs_unlock(struct cnode *); +void hfs_unlockpair(struct cnode *, struct cnode *); +void hfs_unlockfour(struct cnode *, struct cnode *, struct cnode *, struct cnode *); + +void hfs_lock_truncate(struct cnode *, enum hfslocktype); +void hfs_unlock_truncate(struct cnode *, int been_recursed); -extern void hfs_lock_truncate(struct cnode *, int); -extern void hfs_unlock_truncate(struct cnode *, int); +int hfs_try_trunclock(struct cnode *, enum hfslocktype); #endif /* __APPLE_API_PRIVATE */ #endif /* KERNEL */ diff --git a/bsd/hfs/hfs_cprotect.c b/bsd/hfs/hfs_cprotect.c new file mode 100644 index 000000000..0345e4d9e --- /dev/null +++ b/bsd/hfs/hfs_cprotect.c @@ -0,0 +1,908 @@ +/* + * Copyright (c) 2000-2009 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hfs.h" +#include "hfs_cnode.h" + +#ifdef CONFIG_PROTECT +static struct cp_wrap_func g_cp_wrap_func = {NULL, NULL}; +static struct cp_global_state g_cp_state = {0, 0}; + +extern int (**hfs_vnodeop_p) (void *); + +/* + * CP private functions + */ +static int cp_is_valid_class(int); +static int cp_getxattr(cnode_t *, struct cp_xattr *); +static int cp_setxattr(cnode_t *, struct cp_xattr *, int); +static struct cprotect *cp_entry_alloc(void); +static int cp_make_keys (struct cprotect *); +static int cp_restore_keys(struct cprotect *); +static int cp_lock_vfs_callback(mount_t, void *); +static int cp_lock_vnode_callback(vnode_t, void *); +static int cp_vnode_is_eligible (vnode_t); +static int cp_check_access (cnode_t *, int); +static int cp_wrap(int, void *, void *); +static int cp_unwrap(int, void *, void *); + + + +#if DEVELOPMENT || DEBUG +#define CP_ASSERT(x) \ + if ((x) == 0) { \ + panic("CP: failed assertion in %s", __FUNCTION__); \ + } +#else +#define CP_ASSERT(x) +#endif + +int +cp_key_store_action(int action) +{ + g_cp_state.lock_state = action; + if (action == CP_LOCKED_STATE) + return vfs_iterate(0, cp_lock_vfs_callback, (void *)action); + else + return 0; +} + + +int +cp_register_wraps(cp_wrap_func_t key_store_func) +{ + g_cp_wrap_func.wrapper = key_store_func->wrapper; + g_cp_wrap_func.unwrapper = key_store_func->unwrapper; + + g_cp_state.wrap_functions_set = 1; + + return 0; +} + +/* + * Allocate and initialize a cprotect blob for a new cnode. + * Called from hfs_getnewcnode: cnode is locked exclusive. + * Read xattr data off the cnode. Then, if conditions permit, + * unwrap the file key and cache it in the cprotect blob. 
+ */ +int +cp_entry_init(cnode_t *cnode, struct mount *mp) +{ + struct cprotect *entry; + struct cp_xattr xattr; + int error = 0; + + if (!cp_fs_protected (mp)) { + cnode->c_cpentry = NULL; + return 0; + } + + if (!S_ISREG(cnode->c_mode)) { + cnode->c_cpentry = NULL; + return 0; + } + + if (!g_cp_state.wrap_functions_set) { + printf("hfs: cp_update_entry: wrap functions not yet set\n"); + return ENXIO; + } + + CP_ASSERT (cnode->c_cpentry == NULL); + + entry = cp_entry_alloc(); + if (!entry) + return ENOMEM; + + entry->cp_flags |= CP_KEY_FLUSHED; + cnode->c_cpentry = entry; + + error = cp_getxattr(cnode, &xattr); + if (error == ENOATTR) { + /* + * Can't tell if the file is new, or was previously created but never + * written to or set-classed. In either case, it'll need a fresh + * per-file key. + */ + entry->cp_flags |= CP_NEEDS_KEYS; + error = 0; + } else { + if (xattr.xattr_major_version != CP_CURRENT_MAJOR_VERS) { + printf("hfs: cp_entry_init: bad xattr version\n"); + error = EINVAL; + goto out; + } + + /* set up entry with information from xattr */ + entry->cp_pclass = xattr.persistent_class; + bcopy(&xattr.persistent_key, &entry->cp_persistent_key, CP_WRAPPEDKEYSIZE); + } + +out: + if (error) { + cp_entry_destroy (cnode); + } + return error; +} + +/* + * Set up initial key/class pair on cnode. The cnode is locked exclusive. + */ +int +cp_entry_create_keys(cnode_t *cnode) +{ + struct cprotect *entry = cnode->c_cpentry; + + if (!entry) { + //unprotected file: continue + return 0; + } + + CP_ASSERT((entry->cp_flags & CP_NEEDS_KEYS)); + + return cp_make_keys(entry); +} + +/* + * Tear down and clear a cprotect blob for a closing file. + * Called at hfs_reclaim_cnode: cnode is locked exclusive. 
+ */ +void +cp_entry_destroy(cnode_t *cnode) +{ + struct cprotect *entry = cnode->c_cpentry; + if (!entry) { + /* nothing to clean up */ + return; + } + cnode->c_cpentry = NULL; + bzero(entry, sizeof(*entry)); + FREE(entry, M_TEMP); +} + +int +cp_fs_protected (mount_t mnt) { + return (vfs_flags(mnt) & MNT_CPROTECT); +} + + +/* + * Return a pointer to underlying cnode if there is one for this vnode. + * Done without taking cnode lock, inspecting only vnode state. + */ +cnode_t * +cp_get_protected_cnode(vnode_t vp) +{ + if (!cp_vnode_is_eligible(vp)) { + return NULL; + } + + if (!cp_fs_protected(VTOVFS(vp))) { + /* mount point doesn't support it */ + return NULL; + } + + return (cnode_t *) vp->v_data; +} + + +/* + * Sets *class to persistent class associated with vnode, + * or returns error. + */ +int +cp_vnode_getclass(vnode_t vp, int *class) +{ + struct cp_xattr xattr; + int error = 0; + struct cnode *cnode; + + if (!cp_vnode_is_eligible (vp)) { + return EBADF; + } + + cnode = VTOC(vp); + + hfs_lock(cnode, HFS_SHARED_LOCK); + + if (cp_fs_protected(VTOVFS(vp))) { + /* pull the class from the live entry */ + struct cprotect *entry = cnode->c_cpentry; + if (!entry) { + panic("Content Protection: uninitialized cnode %p", cnode); + } + + if ((entry->cp_flags & CP_NEEDS_KEYS)) { + error = cp_make_keys(entry); + } + *class = entry->cp_pclass; + + } else { + /* + * Mount point is not formatted for content protection. If a class + * has been specified anyway, report it. Otherwise, report D. + */ + error = cp_getxattr(cnode, &xattr); + if (error == ENOATTR) { + *class = PROTECTION_CLASS_D; + error = 0; + } else if (error == 0) { + *class = xattr.persistent_class; + } + } + + hfs_unlock(cnode); + return error; +} + + +/* + * Sets persistent class for this file. + * If vnode cannot be protected (system file, non-regular file, non-hfs), EBADF. + * If the new class can't be accessed now, EPERM. + * Otherwise, record class and re-wrap key if the mount point is content-protected. 
+ */ +int +cp_vnode_setclass(vnode_t vp, uint32_t newclass) +{ + struct cnode *cnode; + struct cp_xattr xattr; + struct cprotect *entry = 0; + int error = 0; + + if (!cp_is_valid_class(newclass)) { + printf("hfs: CP: cp_setclass called with invalid class %d\n", newclass); + return EINVAL; + } + + /* is this an interesting file? */ + if (!cp_vnode_is_eligible(vp)) { + return EBADF; + } + + cnode = VTOC(vp); + + if (hfs_lock(cnode, HFS_EXCLUSIVE_LOCK)) { + return EINVAL; + } + + /* is the volume formatted for content protection? */ + if (cp_fs_protected(VTOVFS(vp))) { + entry = cnode->c_cpentry; + if (entry == NULL) { + error = EINVAL; + goto out; + } + + if ((entry->cp_flags & CP_NEEDS_KEYS)) { + if ((error = cp_make_keys(entry)) != 0) { + goto out; + } + } + + if (entry->cp_flags & CP_KEY_FLUSHED) { + error = cp_restore_keys(entry); + if (error) + goto out; + } + + /* re-wrap per-file key with new class */ + error = cp_wrap(newclass, + &entry->cp_cache_key[0], + &entry->cp_persistent_key[0]); + if (error) { + /* we didn't have perms to set this class. leave file as-is and error out */ + goto out; + } + + entry->cp_pclass = newclass; + + /* prepare to write the xattr out */ + bcopy(&entry->cp_persistent_key, &xattr.persistent_key, CP_WRAPPEDKEYSIZE); + } else { + /* no live keys for this file. just remember intended class */ + bzero(&xattr.persistent_key, CP_WRAPPEDKEYSIZE); + } + + xattr.xattr_major_version = CP_CURRENT_MAJOR_VERS; + xattr.xattr_minor_version = CP_CURRENT_MINOR_VERS; + xattr.key_size = CP_WRAPPEDKEYSIZE; + xattr.flags = 0; + xattr.persistent_class = newclass; + error = cp_setxattr(cnode, &xattr, XATTR_REPLACE); + + if (error == ENOATTR) { + error = cp_setxattr (cnode, &xattr, XATTR_CREATE); + } + +out: + hfs_unlock(cnode); + return error; +} + +/* + * Check permission for the given operation (read, write, page in) on this node. 
+ * Additionally, if the node needs work, do it: + * - create a new key for the file if one hasn't been set before + * - write out the xattr if it hasn't already been saved + * - unwrap the key if needed + * + * Takes cnode lock, and upgrades to exclusive if modifying cprotect. + */ + int +cp_handle_vnop(cnode_t *cnode, int vnop) +{ + struct cprotect *entry; + int error = 0; + struct cp_xattr xattr; + + if ((error = hfs_lock(cnode, HFS_SHARED_LOCK)) != KERN_SUCCESS) { + return error; + } + + entry = cnode->c_cpentry; + if (!entry) + goto out; + + if ((error = cp_check_access(cnode, vnop)) != KERN_SUCCESS) { + goto out; + } + + if (entry->cp_flags == 0) { + /* no more work to do */ + goto out; + } + + /* upgrade to exclusive lock */ + if (lck_rw_lock_shared_to_exclusive(&cnode->c_rwlock) == FALSE) { + if ((error = hfs_lock(cnode, HFS_EXCLUSIVE_LOCK)) != KERN_SUCCESS) { + return error; + } + } else { + cnode->c_lockowner = current_thread(); + } + + /* generate new keys if none have ever been saved */ + if ((entry->cp_flags & CP_NEEDS_KEYS)) { + if ((error = cp_make_keys(entry)) != 0) { + goto out; + } + } + + /* unwrap keys if needed */ + if (entry->cp_flags & CP_KEY_FLUSHED) { + error = cp_restore_keys(entry); + if (error) + goto out; + } + + /* write out the xattr if it's new */ + if (entry->cp_flags & CP_NO_XATTR) { + bcopy(&entry->cp_persistent_key[0], &xattr.persistent_key, CP_WRAPPEDKEYSIZE); + xattr.xattr_major_version = CP_CURRENT_MAJOR_VERS; + xattr.xattr_minor_version = CP_CURRENT_MINOR_VERS; + xattr.key_size = CP_WRAPPEDKEYSIZE; + xattr.persistent_class = entry->cp_pclass; + error = cp_setxattr(cnode, &xattr, XATTR_CREATE); + } + +out: + hfs_unlock(cnode); + return error; +} + +/* + * During hfs resize operations, we have slightly different constraints than during + * normal VNOPS that read/write data to files. 
Specifically, we already have the cnode + * locked (so nobody else can modify it), and we are doing the IO with root privileges, since + * we are moving the data behind the user's back. So, we skip access checks here (for unlock + * vs. lock), and don't worry about non-existing keys. If the file exists on-disk with valid + * payload, then it must have keys set up already by definition. + */ +int cp_handle_relocate (cnode_t *cp) { + struct cprotect *entry; + int error = -1; + + /* cp is already locked */ + entry = cp->c_cpentry; + if (!entry) + goto out; + + /* + * Still need to validate whether to permit access to the file or not + * based on lock status + */ + if ((error = cp_check_access(cp, CP_READ_ACCESS | CP_WRITE_ACCESS)) != KERN_SUCCESS) { + goto out; + } + + if (entry->cp_flags == 0) { + /* no more work to do */ + error = 0; + goto out; + } + + /* it must have keys since it is an existing file with actual payload */ + + /* unwrap keys if needed */ + if (entry->cp_flags & CP_KEY_FLUSHED) { + error = cp_restore_keys(entry); + } + + /* don't need to write out the EA since the file is extant */ +out: + + /* return the cp still locked */ + return error; +} + + + +/* + * cp_getrootxattr: + * Gets the EA we set on the root folder (fileid 1) to get information about the + * version of Content Protection that was used to write to this filesystem. + * Note that all multi-byte fields are written to disk little endian so they must be + * converted to native endian-ness as needed. 
+ */ + +int cp_getrootxattr(struct hfsmount* hfsmp, struct cp_root_xattr *outxattr) { + uio_t auio; + char uio_buf[UIO_SIZEOF(1)]; + size_t attrsize = sizeof(struct cp_root_xattr); + int error = 0; + struct vnop_getxattr_args args; + + if (!outxattr) { + panic("cp_xattr called with xattr == NULL"); + } + + auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf)); + uio_addiov(auio, CAST_USER_ADDR_T(outxattr), attrsize); + + args.a_desc = NULL; // unused + args.a_vp = NULL; //unused since we're reading the EA from the root folder. + args.a_name = CONTENT_PROTECTION_XATTR_NAME; + args.a_uio = auio; + args.a_size = &attrsize; + args.a_options = XATTR_REPLACE; + args.a_context = NULL; // unused + + error = hfs_getxattr_internal(NULL, &args, hfsmp, 1); + + /* Now convert the multi-byte fields to native endianness */ + outxattr->major_version = OSSwapLittleToHostInt16(outxattr->major_version); + outxattr->minor_version = OSSwapLittleToHostInt16(outxattr->minor_version); + outxattr->flags = OSSwapLittleToHostInt64(outxattr->flags); + + if (error != KERN_SUCCESS) { + goto out; + } + +out: + uio_free(auio); + return error; +} + +/* + * cp_setrootxattr: + * Sets the EA we set on the root folder (fileid 1) to get information about the + * version of Content Protection that was used to write to this filesystem. + * Note that all multi-byte fields are written to disk little endian so they must be + * converted to little endian as needed. + * + * This will be written to the disk when it detects the EA is not there, or when we need + * to make a modification to the on-disk version that can be done in-place. 
+ */ + int +cp_setrootxattr(struct hfsmount *hfsmp, struct cp_root_xattr *newxattr) +{ + int error = 0; + struct vnop_setxattr_args args; + + args.a_desc = NULL; + args.a_vp = NULL; + args.a_name = CONTENT_PROTECTION_XATTR_NAME; + args.a_uio = NULL; //pass data ptr instead + args.a_options = 0; + args.a_context = NULL; //no context needed, only done from mount. + + /* Now convert the multi-byte fields to little endian before writing to disk. */ + newxattr->major_version = OSSwapHostToLittleInt16(newxattr->major_version); + newxattr->minor_version = OSSwapHostToLittleInt16(newxattr->minor_version); + newxattr->flags = OSSwapHostToLittleInt64(newxattr->flags); + + error = hfs_setxattr_internal(NULL, (caddr_t)newxattr, + sizeof(struct cp_root_xattr), &args, hfsmp, 1); + return error; +} + + + + +/******************** + * Private Functions + *******************/ + +static int +cp_vnode_is_eligible(vnode_t vp) +{ + return ((vp->v_op == hfs_vnodeop_p) && + (!vnode_issystem(vp)) && + (vnode_isreg(vp))); +} + + + +static int +cp_is_valid_class(int class) +{ + return ((class >= PROTECTION_CLASS_A) && + (class <= PROTECTION_CLASS_F)); +} + + +static struct cprotect * +cp_entry_alloc(void) +{ + struct cprotect *cp_entry; + + MALLOC(cp_entry, struct cprotect *, sizeof(struct cprotect), + M_TEMP, M_WAITOK); + if (cp_entry == NULL) + return (NULL); + + bzero(cp_entry, sizeof(*cp_entry)); + return (cp_entry); +} + + +/* + * Reads xattr data off the cnode and into provided xattr. 
+ * cnode lock held shared + */ +static int +cp_getxattr(cnode_t *cnode, struct cp_xattr *outxattr) +{ + uio_t auio; + char uio_buf[UIO_SIZEOF(1)]; + size_t attrsize = sizeof(struct cp_xattr); + int error = 0; + struct vnop_getxattr_args args; + + auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf)); + uio_addiov(auio, CAST_USER_ADDR_T(outxattr), attrsize); + + args.a_desc = NULL; // unused + args.a_vp = cnode->c_vp; + args.a_name = CONTENT_PROTECTION_XATTR_NAME; + args.a_uio = auio; + args.a_size = &attrsize; + args.a_options = XATTR_REPLACE; + args.a_context = vfs_context_current(); // unused + error = hfs_getxattr_internal(cnode, &args, VTOHFS(cnode->c_vp), 0); + if (error != KERN_SUCCESS) { + goto out; + } + + /* Endian swap the multi-byte fields into host endianness from L.E. */ + outxattr->xattr_major_version = OSSwapLittleToHostInt16(outxattr->xattr_major_version); + outxattr->xattr_minor_version = OSSwapLittleToHostInt16(outxattr->xattr_minor_version); + outxattr->key_size = OSSwapLittleToHostInt32(outxattr->key_size); + outxattr->flags = OSSwapLittleToHostInt32(outxattr->flags); + outxattr->persistent_class = OSSwapLittleToHostInt32(outxattr->persistent_class); + +out: + uio_free(auio); + return error; +} + +/* + * Stores new xattr data on the cnode. + * cnode lock held exclusive + */ +static int +cp_setxattr(cnode_t *cnode, struct cp_xattr *newxattr, int options) +{ + int error = 0; + struct vnop_setxattr_args args; + + args.a_desc = NULL; + args.a_vp = cnode->c_vp; + args.a_name = CONTENT_PROTECTION_XATTR_NAME; + args.a_uio = NULL; //pass data ptr instead + args.a_options = options; + args.a_context = vfs_context_current(); + + /* Endian swap the multi-byte fields into L.E from host. 
*/ + newxattr->xattr_major_version = OSSwapHostToLittleInt16(newxattr->xattr_major_version); + newxattr->xattr_minor_version = OSSwapHostToLittleInt16(newxattr->xattr_minor_version); + newxattr->key_size = OSSwapHostToLittleInt32(newxattr->key_size); + newxattr->flags = OSSwapHostToLittleInt32(newxattr->flags); + newxattr->persistent_class = OSSwapHostToLittleInt32(newxattr->persistent_class); + + error = hfs_setxattr_internal(cnode, (caddr_t)newxattr, + sizeof(struct cp_xattr), &args, VTOHFS(cnode->c_vp), 0); + + if ((error == KERN_SUCCESS) && (cnode->c_cpentry)) { + cnode->c_cpentry->cp_flags &= ~CP_NO_XATTR; + } + + return error; +} + + +/* + * Make a new random per-file key and wrap it. + */ +static int +cp_make_keys(struct cprotect *entry) +{ + int error = 0; + + if (g_cp_state.wrap_functions_set != 1) { + printf("hfs: CP: could not create keys: no wrappers set\n"); + return ENXIO; + } + + /* create new cp data: key and class */ + read_random(&entry->cp_cache_key[0], CP_KEYSIZE); + entry->cp_pclass = PROTECTION_CLASS_D; + + /* wrap the new key in the class key */ + error = cp_wrap(PROTECTION_CLASS_D, + &entry->cp_cache_key[0], + &entry->cp_persistent_key[0]); + + if (error) { + panic("could not wrap new key in class D\n"); + } + + /* ready for business */ + entry->cp_flags &= ~CP_NEEDS_KEYS; + entry->cp_flags |= CP_NO_XATTR; + + return error; +} + +/* + * If permitted, restore entry's unwrapped key from the persistent key. + * If not, clear key and set CP_KEY_FLUSHED. 
+ * cnode lock held exclusive + */ +static int +cp_restore_keys(struct cprotect *entry) +{ + int error = 0; + + error = cp_unwrap(entry->cp_pclass, + &entry->cp_persistent_key[0], + &entry->cp_cache_key[0]); + + if (error) { + entry->cp_flags |= CP_KEY_FLUSHED; + bzero(entry->cp_cache_key, CP_KEYSIZE); + error = EPERM; + } + else { + entry->cp_flags &= ~CP_KEY_FLUSHED; + } + return error; +} + +static int +cp_lock_vfs_callback(mount_t mp, void *arg) +{ + if (!cp_fs_protected(mp)) { + /* not interested in this mount point */ + return 0; + } + + return vnode_iterate(mp, 0, cp_lock_vnode_callback, arg); +} + + +/* + * Deny access to protected files if keys have been locked. + * + * cnode lock is taken shared. + */ + static int +cp_check_access(cnode_t *cnode, int vnop) +{ + int error = 0; + + if (g_cp_state.lock_state == CP_UNLOCKED_STATE) { + return KERN_SUCCESS; + } + + if (!cnode->c_cpentry) { + /* unprotected node */ + return KERN_SUCCESS; + } + + /* Deny all access for class A files, and read access for class B */ + switch (cnode->c_cpentry->cp_pclass) { + case PROTECTION_CLASS_A: { + error = EPERM; + break; + } + case PROTECTION_CLASS_B: { + if (vnop & CP_READ_ACCESS) + error = EPERM; + else + error = 0; + break; + } + default: + error = 0; + break; + } + + return error; +} + + + +/* + * Respond to a lock or unlock event. + * On lock: clear out keys from memory, then flush file contents. + * On unlock: nothing (function not called). 
+ */ +static int +cp_lock_vnode_callback(vnode_t vp, void *arg) +{ + cnode_t *cp = NULL; + struct cprotect *entry = NULL; + int error = 0; + int locked = 1; + int action = 0; + + error = vnode_getwithref (vp); + if (error) { + return error; + } + + cp = VTOC(vp); + hfs_lock(cp, HFS_FORCE_LOCK); + + entry = cp->c_cpentry; + if (!entry) { + /* unprotected vnode: not a regular file */ + goto out; + } + + action = (int)((uintptr_t) arg); + switch (action) { + case CP_LOCKED_STATE: { + vfs_context_t ctx; + if (entry->cp_pclass != PROTECTION_CLASS_A) { + /* no change at lock for other classes */ + goto out; + } + + /* Before doing anything else, zero-fill sparse ranges as needed */ + ctx = vfs_context_current(); + (void) hfs_filedone (vp, ctx); + + /* first, sync back dirty pages */ + hfs_unlock (cp); + ubc_msync (vp, 0, ubc_getsize(vp), NULL, UBC_PUSHALL | UBC_INVALIDATE | UBC_SYNC); + hfs_lock (cp, HFS_FORCE_LOCK); + + /* flush keys */ + entry->cp_flags |= CP_KEY_FLUSHED; + bzero(&entry->cp_cache_key, CP_KEYSIZE); + /* some write may have arrived in the mean time. 
dump those pages */ + hfs_unlock(cp); + locked = 0; + + ubc_msync (vp, 0, ubc_getsize(vp), NULL, UBC_INVALIDATE | UBC_SYNC); + break; + } + case CP_UNLOCKED_STATE: { + /* no-op */ + break; + } + default: + panic("unknown lock action %d\n", action); + } + +out: + if (locked) + hfs_unlock(cp); + vnode_put (vp); + return error; +} + +static int +cp_wrap(int class, void *inkey, void *outkey) +{ + int error = 0; + size_t keyln = CP_WRAPPEDKEYSIZE; + + if (class == PROTECTION_CLASS_F) { + bzero(outkey, CP_WRAPPEDKEYSIZE); + return 0; + } + + error = g_cp_wrap_func.wrapper(class, + inkey, + CP_KEYSIZE, + outkey, + &keyln); + + return error; +} + + +static int +cp_unwrap(int class, void *inkey, void *outkey) +{ + int error = 0; + size_t keyln = CP_KEYSIZE; + + if (class == PROTECTION_CLASS_F) { + /* we didn't save a wrapped key, so nothing to unwrap */ + return EPERM; + } + + error = g_cp_wrap_func.unwrapper(class, + inkey, + CP_WRAPPEDKEYSIZE, + outkey, + &keyln); + + return error; + +} + + +#else + +int cp_key_store_action(int action __unused) +{ + return ENOTSUP; +} + + +int cp_register_wraps(cp_wrap_func_t key_store_func __unused) +{ + return ENOTSUP; +} + +#endif /* CONFIG_PROTECT */ diff --git a/bsd/hfs/hfs_dbg.h b/bsd/hfs/hfs_dbg.h index ef0423083..f39271fe4 100644 --- a/bsd/hfs/hfs_dbg.h +++ b/bsd/hfs/hfs_dbg.h @@ -94,7 +94,7 @@ extern int hfs_dbg_err; #if (HFS_DEBUG_STAGE == 4) char gDebugAssertStr[255]; #define DBG_ASSERT(a) { if (!(a)) { \ - sprintf(gDebugAssertStr,"Oops - File "__FILE__", line %d: assertion '%s' failed.\n", __LINE__, #a); \ + snprintf(gDebugAssertStr, sizeof (gDebugAssertStr), "Oops - File "__FILE__", line %d: assertion '%s' failed.\n", __LINE__, #a); \ Debugger(gDebugAssertStr); } } #else #define DBG_ASSERT(a) { if (!(a)) { panic("File "__FILE__", line %d: assertion '%s' failed.\n", __LINE__, #a); } } diff --git a/bsd/hfs/hfs_encodings.c b/bsd/hfs/hfs_encodings.c index 4a67567b2..13c9781f8 100644 --- a/bsd/hfs/hfs_encodings.c +++ 
b/bsd/hfs/hfs_encodings.c @@ -239,8 +239,8 @@ hfs_to_utf8(ExtendedVCB *vcb, const Str31 hfs_str, ByteCount maxDstLen, ByteCoun UniChar uniStr[MAX_HFS_UNICODE_CHARS]; ItemCount uniCount; size_t utf8len; - u_int8_t pascal_length = 0; hfs_to_unicode_func_t hfs_get_unicode = VCBTOHFS(vcb)->hfs_get_unicode; + u_int8_t pascal_length = 0; /* * Validate the length of the Pascal-style string before passing it @@ -252,7 +252,7 @@ hfs_to_utf8(ExtendedVCB *vcb, const Str31 hfs_str, ByteCount maxDstLen, ByteCoun error = EINVAL; return error; } - + error = hfs_get_unicode(hfs_str, uniStr, MAX_HFS_UNICODE_CHARS, &uniCount); if (uniCount == 0) @@ -292,7 +292,7 @@ mac_roman_to_utf8(const Str31 hfs_str, ByteCount maxDstLen, ByteCount *actualDst /* invalid string; longer than 31 bytes */ error = EINVAL; return error; - } + } error = mac_roman_to_unicode(hfs_str, uniStr, MAX_HFS_UNICODE_CHARS, &uniCount); diff --git a/bsd/hfs/hfs_endian.c b/bsd/hfs/hfs_endian.c index 6f840045d..367785b29 100644 --- a/bsd/hfs/hfs_endian.c +++ b/bsd/hfs/hfs_endian.c @@ -49,13 +49,14 @@ * The direction parameter must be kSwapBTNodeBigToHost or kSwapBTNodeHostToBig. * The kSwapBTNodeHeaderRecordOnly "direction" is not valid for these routines. */ -static int hfs_swap_HFSPlusBTInternalNode (BlockDescriptor *src, HFSCatalogNodeID fileID, enum HFSBTSwapDirection direction); -static int hfs_swap_HFSBTInternalNode (BlockDescriptor *src, HFSCatalogNodeID fileID, enum HFSBTSwapDirection direction); +int hfs_swap_HFSPlusBTInternalNode (BlockDescriptor *src, HFSCatalogNodeID fileID, enum HFSBTSwapDirection direction); +int hfs_swap_HFSBTInternalNode (BlockDescriptor *src, HFSCatalogNodeID fileID, enum HFSBTSwapDirection direction); +void hfs_swap_HFSPlusForkData (HFSPlusForkData *src); /* * hfs_swap_HFSPlusForkData */ -static void +void hfs_swap_HFSPlusForkData ( HFSPlusForkData *src ) @@ -160,7 +161,7 @@ hfs_swap_BTNode ( /* * Check srcDesc->height. Don't swap it because it's only one byte. 
*/ - if (srcDesc->height > btcb->treeDepth) { + if (srcDesc->height > kMaxTreeDepth) { printf("hfs_swap_BTNode: invalid node height (%d)\n", srcDesc->height); error = fsBTInvalidHeaderErr; goto fail; @@ -314,7 +315,7 @@ hfs_swap_BTNode ( /* * Check srcDesc->height. Don't swap it because it's only one byte. */ - if (srcDesc->height > btcb->treeDepth) { + if (srcDesc->height > kMaxTreeDepth) { panic("hfs_UNswap_BTNode: invalid node height (%d)\n", srcDesc->height); error = fsBTInvalidHeaderErr; goto fail; @@ -389,7 +390,7 @@ hfs_swap_BTNode ( return (error); } -static int +int hfs_swap_HFSPlusBTInternalNode ( BlockDescriptor *src, HFSCatalogNodeID fileID, @@ -925,7 +926,7 @@ hfs_swap_HFSPlusBTInternalNode ( return (0); } -static int +int hfs_swap_HFSBTInternalNode ( BlockDescriptor *src, HFSCatalogNodeID fileID, diff --git a/bsd/hfs/hfs_format.h b/bsd/hfs/hfs_format.h index 151cadde7..ae1039a3e 100644 --- a/bsd/hfs/hfs_format.h +++ b/bsd/hfs/hfs_format.h @@ -232,6 +232,21 @@ struct FndrOpaqueInfo { } __attribute__((aligned(2), packed)); typedef struct FndrOpaqueInfo FndrOpaqueInfo; +struct FndrExtendedDirInfo { + u_int32_t point; + u_int32_t date_added; + u_int16_t extended_flags; + u_int16_t reserved3; + u_int32_t reserved4; +} __attribute__((aligned(2), packed)); + +struct FndrExtendedFileInfo { + u_int32_t reserved1; + u_int32_t date_added; + u_int16_t extended_flags; + u_int16_t reserved2; + u_int32_t reserved3; +} __attribute__((aligned(2), packed)); /* HFS Plus Fork data info - 80 bytes */ struct HFSPlusForkData { @@ -354,7 +369,11 @@ enum { kHFSHasLinkChainMask = 0x0020, kHFSHasChildLinkBit = 0x0006, /* folder has a child that's a dir link */ - kHFSHasChildLinkMask = 0x0040 + kHFSHasChildLinkMask = 0x0040, + + kHFSHasDateAddedBit = 0x0007, /* File/Folder has the date-added stored in the finder info. */ + kHFSHasDateAddedMask = 0x0080 + }; @@ -577,7 +596,8 @@ enum { * Therefore, bits 16-31 can only be used on HFS Plus. 
*/ kHFSUnusedNodeFixBit = 31, /* Unused nodes in the Catalog B-tree have been zero-filled. See Radar #6947811. */ - + kHFSContentProtectionBit = 30, /* Volume has per-file content protection */ + kHFSVolumeHardwareLockMask = 1 << kHFSVolumeHardwareLockBit, kHFSVolumeUnmountedMask = 1 << kHFSVolumeUnmountedBit, kHFSVolumeSparedBlocksMask = 1 << kHFSVolumeSparedBlocksBit, @@ -588,6 +608,7 @@ enum { kHFSVolumeInconsistentMask = 1 << kHFSVolumeInconsistentBit, kHFSVolumeSoftwareLockMask = 1 << kHFSVolumeSoftwareLockBit, kHFSUnusedNodeFixMask = 1 << kHFSUnusedNodeFixBit, + kHFSContentProtectionMask = 1 << kHFSContentProtectionBit, kHFSMDBAttributesMask = 0x8380 }; diff --git a/bsd/hfs/hfs_fsctl.h b/bsd/hfs/hfs_fsctl.h index 7759e799a..7bebee3fb 100644 --- a/bsd/hfs/hfs_fsctl.h +++ b/bsd/hfs/hfs_fsctl.h @@ -81,8 +81,8 @@ struct hfs_journal_info { #define HFSIOC_BULKACCESS _IOW('h', 9, struct user32_access_t) #define HFS_BULKACCESS_FSCTL IOCBASECMD(HFSIOC_BULKACCESS) -#define HFSIOC_SETACLSTATE _IOW('h', 10, int32_t) -#define HFS_SETACLSTATE IOCBASECMD(HFSIOC_SETACLSTATE) +/* Unsupported - Previously used to enable/disable ACLs */ +#define HFSIOC_UNSUPPORTED _IOW('h', 10, int32_t) #define HFSIOC_PREV_LINK _IOWR('h', 11, u_int32_t) #define HFS_PREV_LINK IOCBASECMD(HFSIOC_PREV_LINK) @@ -121,7 +121,10 @@ struct hfs_journal_info { #define HFSIOC_VOLUME_STATUS _IOR('h', 24, u_int32_t) #define HFS_VOLUME_STATUS IOCBASECMD(HFSIOC_VOLUME_STATUS) -#endif /* __APPLE_API_UNSTABLE */ +/* Disable metadata zone for given volume */ +#define HFSIOC_DISABLE_METAZONE _IO('h', 25) +#define HFS_DISABLE_METAZONE IOCBASECMD(HFSIOC_DISABLE_METAZONE) +#endif /* __APPLE_API_UNSTABLE */ #endif /* ! _HFS_FSCTL_H_ */ diff --git a/bsd/hfs/hfs_hotfiles.c b/bsd/hfs/hfs_hotfiles.c index ce0fe4dcf..66e273b5d 100644 --- a/bsd/hfs/hfs_hotfiles.c +++ b/bsd/hfs/hfs_hotfiles.c @@ -428,7 +428,6 @@ hfs_recording_stop(struct hfsmount *hfsmp) /* * Suspend recording the hotest files on a file system. 
*/ -__private_extern__ int hfs_recording_suspend(struct hfsmount *hfsmp) { @@ -511,7 +510,6 @@ hfs_recording_suspend(struct hfsmount *hfsmp) /* * */ -__private_extern__ int hfs_recording_init(struct hfsmount *hfsmp) { @@ -559,12 +557,17 @@ hfs_recording_init(struct hfsmount *hfsmp) hfsmp->hfc_stage = HFC_IDLE; return (0); } + + if (hfs_start_transaction(hfsmp) != 0) { + return EINVAL; + } + error = hfc_btree_create(hfsmp, HFSTOVCB(hfsmp)->blockSize, HFC_DEFAULT_FILE_COUNT); if (error) { #if HFC_VERBOSE printf("hfs: Error %d creating hot file b-tree on %s \n", error, hfsmp->vcbVN); #endif - return (error); + goto out2; } /* * Open the Hot File B-tree file for writing. @@ -576,7 +579,7 @@ hfs_recording_init(struct hfsmount *hfsmp) #if HFC_VERBOSE printf("hfs: Error %d opening hot file b-tree on %s \n", error, hfsmp->vcbVN); #endif - return (error); + goto out2; } MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); if (iterator == NULL) { @@ -697,6 +700,7 @@ hfs_recording_init(struct hfsmount *hfsmp) out1: (void) BTScanTerminate(&scanstate, &data, &data, &data); out2: + hfs_end_transaction(hfsmp); if (iterator) FREE(iterator, M_TEMP); if (hfsmp->hfc_filevp) { @@ -712,7 +716,6 @@ hfs_recording_init(struct hfsmount *hfsmp) /* * Use sync to perform ocassional background work. */ -__private_extern__ int hfs_hotfilesync(struct hfsmount *hfsmp, vfs_context_t ctx) { @@ -759,7 +762,6 @@ hfs_hotfilesync(struct hfsmount *hfsmp, vfs_context_t ctx) * * Note: the cnode is locked on entry. */ -__private_extern__ int hfs_addhotfile(struct vnode *vp) { @@ -847,7 +849,6 @@ hfs_addhotfile_internal(struct vnode *vp) * * Note: the cnode is locked on entry. */ -__private_extern__ int hfs_removehotfile(struct vnode *vp) { @@ -1128,7 +1129,7 @@ hotfiles_adopt(struct hfsmount *hfsmp) /* * Acquire a vnode for this file. 
*/ - error = hfs_vget(hfsmp, listp->hfl_hotfile[i].hf_fileid, &vp, 0); + error = hfs_vget(hfsmp, listp->hfl_hotfile[i].hf_fileid, &vp, 0, 0); if (error) { if (error == ENOENT) { error = 0; @@ -1350,7 +1351,7 @@ hotfiles_evict(struct hfsmount *hfsmp, vfs_context_t ctx) /* * Aquire the vnode for this file. */ - error = hfs_vget(hfsmp, key->fileID, &vp, 0); + error = hfs_vget(hfsmp, key->fileID, &vp, 0, 0); if (error) { if (error == ENOENT) { goto delete; /* stale entry, go to next */ @@ -1684,6 +1685,7 @@ hfc_btree_open(struct hfsmount *hfsmp, struct vnode **vpp) int error; int retry = 0; int lockflags; + int newvnode_flags = 0; *vpp = NULL; p = current_proc(); @@ -1705,7 +1707,8 @@ hfc_btree_open(struct hfsmount *hfsmp, struct vnode **vpp) } again: cdesc.cd_flags |= CD_ISMETA; - error = hfs_getnewvnode(hfsmp, NULL, NULL, &cdesc, 0, &cattr, &cfork, &vp); + error = hfs_getnewvnode(hfsmp, NULL, NULL, &cdesc, 0, &cattr, + &cfork, &vp, &newvnode_flags); if (error) { printf("hfs: hfc_btree_open: hfs_getnewvnode error %d\n", error); cat_releasedesc(&cdesc); @@ -1757,7 +1760,7 @@ hfc_btree_close(struct hfsmount *hfsmp, struct vnode *vp) if (hfsmp->jnl) { - hfs_journal_flush(hfsmp); + hfs_journal_flush(hfsmp, FALSE); } if (vnode_get(vp) == 0) { @@ -1814,6 +1817,11 @@ hfc_btree_create(struct hfsmount *hfsmp, unsigned int nodesize, unsigned int ent VATTR_SET(&va, va_uid, 0); VATTR_SET(&va, va_gid, 0); + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + goto out; + } + /* call ourselves directly, ignore the higher-level VFS file creation code */ error = VNOP_CREATE(dvp, &vp, &cname, &va, ctx); if (error) { @@ -1941,6 +1949,7 @@ hfc_btree_create(struct hfsmount *hfsmp, unsigned int nodesize, unsigned int ent kmem_free(kernel_map, (vm_offset_t)buffer, nodesize); } out: + hfs_end_transaction(hfsmp); if (dvp) { vnode_put(dvp); } diff --git a/bsd/hfs/hfs_kdebug.h b/bsd/hfs/hfs_kdebug.h new file mode 100644 index 000000000..5dd5d6a9c --- /dev/null +++ 
b/bsd/hfs/hfs_kdebug.h @@ -0,0 +1,54 @@ +#include + +/* + * KERNEL_DEBUG related definitions for HFS. + * + * NOTE: The Class DBG_FSYSTEM = 3, and Subclass DBG_HFS = 8, so these + * debug codes are of the form 0x0308nnnn. + */ +#define HFSDBG_CODE(code) FSDBG_CODE(DBG_HFS, code) + +enum { + HFSDBG_UNMAP_FREE = HFSDBG_CODE(0), /* 0x03080000 */ + HFSDBG_UNMAP_ALLOC = HFSDBG_CODE(1), /* 0x03080004 */ + HFSDBG_UNMAP_CALLBACK = HFSDBG_CODE(2), /* 0x03080008 */ + /* 0x0308000C is unused */ + HFSDBG_BLOCK_ALLOCATE = HFSDBG_CODE(4), /* 0x03080010 */ + HFSDBG_BLOCK_DEALLOCATE = HFSDBG_CODE(5), /* 0x03080014 */ + HFSDBG_READ_BITMAP_BLOCK = HFSDBG_CODE(6), /* 0x03080018 */ + HFSDBG_RELEASE_BITMAP_BLOCK = HFSDBG_CODE(7), /* 0x0308001C */ + HFSDBG_ALLOC_CONTIG_BITMAP = HFSDBG_CODE(8), /* 0x03080020 */ + HFSDBG_ALLOC_ANY_BITMAP = HFSDBG_CODE(9), /* 0x03080024 */ + HFSDBG_ALLOC_KNOWN_BITMAP = HFSDBG_CODE(10), /* 0x03080028 */ + HFSDBG_MARK_ALLOC_BITMAP = HFSDBG_CODE(11), /* 0x0308002C */ + HFSDBG_MARK_FREE_BITMAP = HFSDBG_CODE(12), /* 0x03080030 */ + HFSDBG_BLOCK_FIND_CONTIG = HFSDBG_CODE(13), /* 0x03080034 */ + HFSDBG_IS_ALLOCATED = HFSDBG_CODE(14), /* 0x03080038 */ + /* 0x0308003C is unused */ + HFSDBG_RESET_EXTENT_CACHE = HFSDBG_CODE(16), /* 0x03080040 */ + HFSDBG_REMOVE_EXTENT_CACHE = HFSDBG_CODE(17), /* 0x03080044 */ + HFSDBG_ADD_EXTENT_CACHE = HFSDBG_CODE(18), /* 0x03080048 */ +}; + +/* + Parameters logged by the above + EVENT CODE DBG_FUNC_START arg1, arg2, arg3, arg4 ... DBG_FUNC_END arg1, arg2, arg3, arg4 + --------------------------- + HFSDBG_UNMAP_CALLBACK 0, extentCount, 0, 0 ... 0, 0, 0, 0 + HFSDBG_UNMAP_FREE startBlock, blockCount, 0, 0 ... err, 0, 0, 0 + HFSDBG_UNMAP_ALLOC startBlock, blockCount, 0, 0 ... err, 0, 0, 0 + HFSDBG_REMOVE_EXTENT_CACHE startBlock, blockCount, 0, 0 ... 0, 0, 0, 0 + HFSDBG_ADD_EXTENT_CACHE startBlock, blockCount, 0, 0 ... err, 0, 0, 0 + HFSDBG_MARK_ALLOC_BITMAP startBlock, blockCount, 0, 0 ... 
err, 0, 0, 0 + HFSDBG_MARK_FREE_BITMAP startBlock, blockCount, valid, 0 ... err, 0, 0, 0 + HFSDBG_BLOCK_DEALLOCATE startBlock, blockCount, flags, 0 ... err, 0, 0, 0 + HFSDBG_IS_ALLOCATED startBlock, blockCount, stop, 0 ... err, 0, actualBlockCount, 0 + HFSDBG_BLOCK_ALLOCATE startBlock, minBlocks, maxBlocks, flags ... err, actualStartBlock, actualBlockCount, 0 + HFSDBG_ALLOC_CONTIG_BITMAP startBlock, minBlocks, maxBlocks, useMeta ... err, actualStartBlock, actualBlockCount, 0 + HFSDBG_ALLOC_ANY_BITMAP startBlock, endBlock, maxBlocks, useMeta ... err, actualStartBlock, actualBlockCount, 0 + HFSDBG_ALLOC_KNOWN_BITMAP 0, 0, maxBlocks, 0 ... err, actualStartBlock, actualBlockCount, 0 + HFSDBG_BLOCK_FIND_CONTIG startBlock, endBlock, minBlocks, maxBlocks ... err, actualStartBlock, actualBlockCount, 0 + HFSDBG_READ_BITMAP_BLOCK startBlock, 0, 0, 0 ... err, 0, 0, 0 + HFSDBG_RELEASE_BITMAP_BLOCK dirty, 0, 0, 0 ... 0, 0, 0, 0 + HFSDBG_RESET_EXTENT_CACHE 0, 0, 0, 0 ... 0, 0, 0, 0 +*/ diff --git a/bsd/hfs/hfs_link.c b/bsd/hfs/hfs_link.c index 878c9def0..d24a92011 100644 --- a/bsd/hfs/hfs_link.c +++ b/bsd/hfs/hfs_link.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2008 Apple Inc. All rights reserved. + * Copyright (c) 1999-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -35,6 +35,7 @@ #include #include #include +#include #include "hfs.h" #include "hfs_catalog.h" @@ -61,6 +62,8 @@ const char *hfs_private_names[] = { static int setfirstlink(struct hfsmount * hfsmp, cnid_t fileid, cnid_t firstlink); static int getfirstlink(struct hfsmount * hfsmp, cnid_t fileid, cnid_t *firstlink); +int hfs_makelink(struct hfsmount *hfsmp, struct vnode *src_vp, struct cnode *cp, + struct cnode *dcp, struct componentname *cnp); /* * Create a new catalog link record * @@ -92,7 +95,7 @@ createindirectlink(struct hfsmount *hfsmp, u_int32_t linknum, struct cat_desc *d /* Links are matched to inodes by link ID and to volumes by create date */ attr.ca_linkref = linknum; - attr.ca_itime = hfsmp->hfs_itime; + attr.ca_itime = hfsmp->hfs_metadata_createdate; attr.ca_mode = S_IFREG | S_IRUSR | S_IRGRP | S_IROTH; attr.ca_recflags = kHFSHasLinkChainMask | kHFSThreadExistsMask; attr.ca_flags = UF_IMMUTABLE; @@ -121,13 +124,15 @@ createindirectlink(struct hfsmount *hfsmp, u_int32_t linknum, struct cat_desc *d /* * Make a link to the cnode cp in the directory dp - * using the name in cnp. + * using the name in cnp. src_vp is the vnode that + * corresponds to 'cp' which was part of the arguments to + * hfs_vnop_link. * * The cnodes cp and dcp must be locked. */ -static int -hfs_makelink(struct hfsmount *hfsmp, struct cnode *cp, struct cnode *dcp, - struct componentname *cnp) +int +hfs_makelink(struct hfsmount *hfsmp, struct vnode *src_vp, struct cnode *cp, + struct cnode *dcp, struct componentname *cnp) { vfs_context_t ctx = cnp->cn_context; struct proc *p = vfs_context_proc(ctx); @@ -291,7 +296,7 @@ hfs_makelink(struct hfsmount *hfsmp, struct cnode *cp, struct cnode *dcp, /* Update the original first link to point back to the new first link. 
*/ if (cp->c_attr.ca_recflags & kHFSHasLinkChainMask) { - (void) cat_updatelink(hfsmp, orig_firstlink, linkcnid, HFS_IGNORABLE_LINK); + (void) cat_update_siblinglinks(hfsmp, orig_firstlink, linkcnid, HFS_IGNORABLE_LINK); /* Update the inode's first link value. */ if (type == DIR_HARDLINKS) { @@ -327,17 +332,46 @@ hfs_makelink(struct hfsmount *hfsmp, struct cnode *cp, struct cnode *dcp, panic("hfs_makelink: cat_update of privdir failed! (%d)\n", retval); } cp->c_flag |= C_HARDLINK; + + /* + * Now we need to mark the vnodes as being hardlinks via the vnode_setmultipath call. + * Note that we're calling vnode_get here, which should simply add an iocount if possible, without + * doing much checking. It's safe to call this because we are protected by the cnode lock, which + * ensures that anyone trying to reclaim it will block until we release it. vnode_get will usually + * give us an extra iocount, unless the vnode is about to be reclaimed (and has no iocounts). + * In that case, we'd error out, but we'd also not care if we added the VISHARDLINK bit to the vnode. + * + * As for the iocount we're about to add, we can't necessarily always call vnode_put here. + * If the one we add is the only iocount on the vnode, and there was + * sufficient vnode pressure, it could go through VNOP_INACTIVE immediately, which would + * require the cnode lock and cause us to double-lock panic. We can only call vnode_put if we know + * that the vnode we're operating on is the one with which we came into hfs_vnop_link, because + * that means VFS took an iocount on it for us. If it's *not* the one that we came into the call + * with, then mark it as NEED_VNODE_PUT to have hfs_unlock drop it for us. hfs_vnop_link will + * unlock the cnode when it is finished. 
+ */ if ((vp = cp->c_vp) != NULLVP) { - if (vnode_get(vp) == 0) { - vnode_setmultipath(vp); - vnode_put(vp); - } + if (vnode_get(vp) == 0) { + vnode_setmultipath(vp); + if (vp == src_vp) { + /* we have an iocount on data fork vnode already. */ + vnode_put(vp); + } + else { + cp->c_flag |= C_NEED_DVNODE_PUT; + } + } } if ((vp = cp->c_rsrc_vp) != NULLVP) { - if (vnode_get(vp) == 0) { - vnode_setmultipath(vp); - vnode_put(vp); - } + if (vnode_get(vp) == 0) { + vnode_setmultipath(vp); + if (vp == src_vp) { + vnode_put(vp); + } + else { + cp->c_flag |= C_NEED_RVNODE_PUT; + } + } } cp->c_touch_chgtime = TRUE; cp->c_flag |= C_FORCEUPDATE; @@ -364,7 +398,6 @@ hfs_makelink(struct hfsmount *hfsmp, struct cnode *cp, struct cnode *dcp, * IN struct componentname *a_cnp; * IN vfs_context_t a_context; */ -__private_extern__ int hfs_vnop_link(struct vnop_link_args *ap) { @@ -408,7 +441,7 @@ hfs_vnop_link(struct vnop_link_args *ap) return (EPERM); } /* Directory hardlinks also need the parent of the original directory. */ - if ((error = hfs_vget(hfsmp, hfs_currentparent(VTOC(vp)), &fdvp, 1))) { + if ((error = hfs_vget(hfsmp, hfs_currentparent(VTOC(vp)), &fdvp, 1, 0))) { return (error); } } else { @@ -423,6 +456,10 @@ hfs_vnop_link(struct vnop_link_args *ap) } return (ENOSPC); } + + check_for_tracked_file(vp, VTOC(vp)->c_ctime, NAMESPACE_HANDLER_LINK_CREATE, NULL); + + /* Lock the cnodes. */ if (fdvp) { if ((error = hfs_lockfour(VTOC(tdvp), VTOC(vp), VTOC(fdvp), NULL, HFS_EXCLUSIVE_LOCK, NULL))) { @@ -543,7 +580,7 @@ hfs_vnop_link(struct vnop_link_args *ap) cp->c_linkcount++; cp->c_touch_chgtime = TRUE; - error = hfs_makelink(hfsmp, cp, tdcp, cnp); + error = hfs_makelink(hfsmp, vp, cp, tdcp, cnp); if (error) { cp->c_linkcount--; hfs_volupdate(hfsmp, VOL_UPDATE, 0); @@ -634,7 +671,6 @@ hfs_vnop_link(struct vnop_link_args *ap) * * Note: dvp and vp cnodes are already locked. 
*/ -__private_extern__ int hfs_unlink(struct hfsmount *hfsmp, struct vnode *dvp, struct vnode *vp, struct componentname *cnp, int skip_reserve) { @@ -806,11 +842,11 @@ hfs_unlink(struct hfsmount *hfsmp, struct vnode *dvp, struct vnode *vp, struct c } /* Update previous link. */ if (prevlinkid) { - (void) cat_updatelink(hfsmp, prevlinkid, HFS_IGNORABLE_LINK, nextlinkid); + (void) cat_update_siblinglinks(hfsmp, prevlinkid, HFS_IGNORABLE_LINK, nextlinkid); } /* Update next link. */ if (nextlinkid) { - (void) cat_updatelink(hfsmp, nextlinkid, prevlinkid, HFS_IGNORABLE_LINK); + (void) cat_update_siblinglinks(hfsmp, nextlinkid, prevlinkid, HFS_IGNORABLE_LINK); } } @@ -860,7 +896,6 @@ hfs_unlink(struct hfsmount *hfsmp, struct vnode *dvp, struct vnode *vp, struct c * * This call is assumed to be made during mount. */ -__private_extern__ void hfs_privatedir_init(struct hfsmount * hfsmp, enum privdirtype type) { @@ -909,7 +944,7 @@ hfs_privatedir_init(struct hfsmount * hfsmp, enum privdirtype type) } /* Grab the root directory so we can update it later. 
*/ - if (hfs_vget(hfsmp, kRootDirID, &dvp, 0) != 0) { + if (hfs_vget(hfsmp, kRootDirID, &dvp, 0, 0) != 0) { goto exit; } dcp = VTOC(dvp); @@ -965,7 +1000,7 @@ hfs_privatedir_init(struct hfsmount * hfsmp, enum privdirtype type) goto exit; } if (type == FILE_HARDLINKS) { - hfsmp->hfs_metadata_createdate = hfsmp->hfs_itime; + hfsmp->hfs_metadata_createdate = priv_attrp->ca_itime; } hfs_volupdate(hfsmp, VOL_MKDIR, 1); exit: @@ -985,9 +1020,8 @@ hfs_privatedir_init(struct hfsmount * hfsmp, enum privdirtype type) /* * Lookup a hardlink link (from chain) */ -__private_extern__ int -hfs_lookuplink(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t *prevlinkid, cnid_t *nextlinkid) +hfs_lookup_siblinglinks(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t *prevlinkid, cnid_t *nextlinkid) { int lockflags; int error; @@ -997,7 +1031,7 @@ hfs_lookuplink(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t *prevlinkid, c lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - error = cat_lookuplinkbyid(hfsmp, linkfileid, prevlinkid, nextlinkid); + error = cat_lookup_siblinglinks(hfsmp, linkfileid, prevlinkid, nextlinkid); if (error == ENOLINK) { hfs_systemfile_unlock(hfsmp, lockflags); lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK); diff --git a/bsd/hfs/hfs_lookup.c b/bsd/hfs/hfs_lookup.c index c82e68cb4..13cb1aa48 100644 --- a/bsd/hfs/hfs_lookup.c +++ b/bsd/hfs/hfs_lookup.c @@ -164,8 +164,10 @@ hfs_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, int struct cat_attr attr; struct cat_fork fork; int lockflags; + int newvnode_flags; retry: + newvnode_flags = 0; dcp = NULL; hfsmp = VTOHFS(dvp); *vpp = NULL; @@ -227,8 +229,16 @@ hfs_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, int * Note: We must drop the parent lock here before calling * hfs_getnewvnode (which takes the child lock). 
*/ - hfs_unlock(dcp); - dcp = NULL; + hfs_unlock(dcp); + dcp = NULL; + + /* Verify that the item just looked up isn't one of the hidden directories. */ + if (desc.cd_cnid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid || + desc.cd_cnid == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { + retval = ENOENT; + goto exit; + } + goto found; } notfound: @@ -301,37 +311,14 @@ hfs_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, int * Directory hard links can have multiple parents so * find the appropriate parent for the current thread. */ - if ((retval = hfs_vget(hfsmp, hfs_currentparent(VTOC(dvp)), &tvp, 0))) { + if ((retval = hfs_vget(hfsmp, hfs_currentparent(VTOC(dvp)), &tvp, 0, 0))) { goto exit; } *cnode_locked = 1; *vpp = tvp; } else { int type = (attr.ca_mode & S_IFMT); -#if NAMEDRSRCFORK - int rsrc_warn = 0; - /* - * Check if caller wants the resource fork but utilized - * the legacy "file/rsrc" access path. - * - * This is deprecated behavior and support for it will not - * be allowed beyond case insensitive HFS+ and even that - * support will be removed in the next major OS release. 
- */ - if ((type == S_IFREG) && - ((flags & ISLASTCN) == 0) && - (cnp->cn_nameptr[cnp->cn_namelen] == '/') && - (bcmp(&cnp->cn_nameptr[cnp->cn_namelen+1], "rsrc", 5) == 0) && - ((hfsmp->hfs_flags & (HFS_STANDARD | HFS_CASE_SENSITIVE)) == 0)) { - - cnp->cn_consume = 5; - cnp->cn_flags |= CN_WANTSRSRCFORK | ISLASTCN | NOCACHE; - cnp->cn_flags &= ~MAKEENTRY; - flags |= ISLASTCN; - rsrc_warn = 1; - } -#endif if (!(flags & ISLASTCN) && (type != S_IFDIR) && (type != S_IFLNK)) { retval = ENOTDIR; goto exit; @@ -344,22 +331,65 @@ hfs_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, int if (cnp->cn_namelen != desc.cd_namelen) cnp->cn_flags &= ~MAKEENTRY; - retval = hfs_getnewvnode(hfsmp, dvp, cnp, &desc, 0, &attr, &fork, &tvp); + retval = hfs_getnewvnode(hfsmp, dvp, cnp, &desc, 0, &attr, &fork, &tvp, &newvnode_flags); if (retval) { /* - * If this was a create operation lookup and another - * process removed the object before we had a chance - * to create the vnode, then just treat it as the not - * found case above and return EJUSTRETURN. - * We should do the same for the RENAME operation since we are - * going to write it in regardless. - */ + * If this was a create/rename operation lookup, then by this point + * we expected to see the item returned from hfs_getnewvnode above. + * In the create case, it would probably eventually bubble out an EEXIST + * because the item existed when we were trying to create it. In the + * rename case, it would let us know that we need to go ahead and + * delete it as part of the rename. However, if we hit the condition below + * then it means that we found the element during cat_lookup above, but + * it is now no longer there. We simply behave as though we never found + * the element at all and return EJUSTRETURN. 
+ */ if ((retval == ENOENT) && - ((cnp->cn_nameiop == CREATE) || (cnp->cn_nameiop == RENAME)) && - (flags & ISLASTCN)) { + ((cnp->cn_nameiop == CREATE) || (cnp->cn_nameiop == RENAME)) && + (flags & ISLASTCN)) { retval = EJUSTRETURN; } + + /* + * If this was a straight lookup operation, we may need to redrive the entire + * lookup starting from cat_lookup if the element was deleted as the result of + * a rename operation. Since rename is supposed to guarantee atomicity, then + * lookups cannot fail because the underlying element is deleted as a result of + * the rename call -- either they returned the looked up element prior to rename + * or return the newer element. If we are in this region, then all we can do is add + * workarounds to guarantee the latter case. The element has already been deleted, so + * we just re-try the lookup to ensure the caller gets the most recent element. + */ + if ((retval == ENOENT) && (cnp->cn_nameiop == LOOKUP) && + (newvnode_flags & (GNV_CHASH_RENAMED | GNV_CAT_DELETED))) { + if (dcp) { + hfs_unlock (dcp); + } + /* get rid of any name buffers that may have lingered from the cat_lookup call */ + cat_releasedesc (&desc); + goto retry; + } + + /* Also, re-drive the lookup if the item we looked up was a hardlink, and the number + * or name of hardlinks has changed in the interim between the cat_lookup above, and + * our call to hfs_getnewvnode. hfs_getnewvnode will validate the cattr we passed it + * against what is actually in the catalog after the cnode is created. If there were + * any issues, it will bubble out ERECYCLE, which we need to swallow and use as the + * key to redrive as well. We need to special case this below because in this case, + * it needs to occur regardless of the type of lookup we're doing here. 
+ */ + if ((retval == ERECYCLE) && (newvnode_flags & GNV_CAT_ATTRCHANGED)) { + if (dcp) { + hfs_unlock (dcp); + } + /* get rid of any name buffers that may have lingered from the cat_lookup call */ + cat_releasedesc (&desc); + retval = 0; + goto retry; + } + + /* skip to the error-handling code if we can't retry */ goto exit; } @@ -375,15 +405,6 @@ hfs_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, int } *cnode_locked = 1; *vpp = tvp; -#if NAMEDRSRCFORK - if (rsrc_warn) { - if ((VTOC(tvp)->c_flag & C_WARNED_RSRC) == 0) { - VTOC(tvp)->c_flag |= C_WARNED_RSRC; - printf("hfs: %.200s: file access by '/rsrc' was deprecated in 10.4\n", - cnp->cn_nameptr); - } - } -#endif } exit: if (dcp) { @@ -415,7 +436,6 @@ hfs_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, int #define S_IXALL 0000111 -__private_extern__ int hfs_vnop_lookup(struct vnop_lookup_args *ap) { @@ -423,6 +443,7 @@ hfs_vnop_lookup(struct vnop_lookup_args *ap) struct vnode *vp; struct cnode *cp; struct cnode *dcp; + struct hfsmount *hfsmp; int error; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; @@ -431,6 +452,8 @@ hfs_vnop_lookup(struct vnop_lookup_args *ap) *vpp = NULL; dcp = VTOC(dvp); + + hfsmp = VTOHFS(dvp); /* * Lookup an entry in the cache @@ -455,14 +478,24 @@ hfs_vnop_lookup(struct vnop_lookup_args *ap) */ error = 0; vp = *vpp; - + cp = VTOC(vp); + + /* We aren't allowed to vend out vp's via lookup to the hidden directory */ + if (cp->c_cnid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid || + cp->c_cnid == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { + /* Drop the iocount from cache_lookup */ + vnode_put (vp); + error = ENOENT; + goto exit; + } + + /* * If this is a hard-link vnode then we need to update * the name (of the link), the parent ID, the cnid, the * text encoding and the catalog hint. This enables * getattrlist calls to return the correct link info. 
*/ - cp = VTOC(vp); if ((flags & ISLASTCN) && (cp->c_flag & C_HARDLINK)) { hfs_lock(cp, HFS_FORCE_LOCK); @@ -501,33 +534,7 @@ hfs_vnop_lookup(struct vnop_lookup_args *ap) } hfs_unlock(cp); } -#if NAMEDRSRCFORK - /* - * Check if caller wants the resource fork but utilized - * the legacy "file/rsrc" access path. - * - * This is deprecated behavior and support for it will not - * be allowed beyond case insensitive HFS+ and even that - * support will be removed in the next major OS release. - */ - if ((dvp != vp) && - ((flags & ISLASTCN) == 0) && - vnode_isreg(vp) && - (cnp->cn_nameptr[cnp->cn_namelen] == '/') && - (bcmp(&cnp->cn_nameptr[cnp->cn_namelen+1], "rsrc", 5) == 0) && - ((VTOHFS(vp)->hfs_flags & (HFS_STANDARD | HFS_CASE_SENSITIVE)) == 0)) { - cnp->cn_consume = 5; - cnp->cn_flags |= CN_WANTSRSRCFORK | ISLASTCN | NOCACHE; - cnp->cn_flags &= ~MAKEENTRY; - hfs_lock(cp, HFS_FORCE_LOCK); - if ((cp->c_flag & C_WARNED_RSRC) == 0) { - cp->c_flag |= C_WARNED_RSRC; - printf("hfs: %.200s: file access by '/rsrc' was deprecated in 10.4\n", cnp->cn_nameptr); - } - hfs_unlock(cp); - } -#endif return (error); lookup: diff --git a/bsd/hfs/hfs_mount.h b/bsd/hfs/hfs_mount.h index 5782bd6f6..ca4f8703f 100644 --- a/bsd/hfs/hfs_mount.h +++ b/bsd/hfs/hfs_mount.h @@ -79,6 +79,7 @@ struct hfs_mount_args { #define HFS_GET_JOURNAL_INFO 0x6a6e6c69 #define HFS_SET_PKG_EXTENSIONS 0x121031 #define HFS_REPLAY_JOURNAL 0x6a6e6c72 +#define HFS_ENABLE_RESIZE_DEBUG 4 /* enable debug code for volume resizing */ #endif /* __APPLE_API_UNSTABLE */ diff --git a/bsd/hfs/hfs_notification.c b/bsd/hfs/hfs_notification.c index 517c8ecdc..227e744b2 100644 --- a/bsd/hfs/hfs_notification.c +++ b/bsd/hfs/hfs_notification.c @@ -32,7 +32,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -71,10 +73,24 @@ void hfs_generate_volume_notifications(struct hfsmount *hfsmp) } if (state == 2 && !(hfsmp->hfs_notification_conditions & VQ_VERYLOWDISK)) { + /* Dump some logging to track 
down intermittent issues */ + printf("HFS: Very Low Disk: freeblks: %d, dangerlimit: %d\n", freeblks, hfsmp->hfs_freespace_notify_dangerlimit); +#if HFS_SPARSE_DEV + if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { + if (hfsmp->hfs_backingfs_rootvp) { + struct mount *mp = vnode_mount (hfsmp->hfs_backingfs_rootvp); + /* If we're a sparse device, dump some info about the backing store... */ + if (mp) { + printf("HFS: Very Low Disk: backingstore b_avail %lld, tag %d\n", mp->mnt_vfsstat.f_bavail, hfsmp->hfs_backingfs_rootvp->v_tag); + } + } + } +#endif hfsmp->hfs_notification_conditions |= (VQ_VERYLOWDISK|VQ_LOWDISK); vfs_event_signal(&fsid, hfsmp->hfs_notification_conditions, (intptr_t)NULL); } else if (state == 1) { if (!(hfsmp->hfs_notification_conditions & VQ_LOWDISK)) { + printf("HFS: Low Disk: freeblks: %d, warninglimit: %d\n", freeblks, hfsmp->hfs_freespace_notify_warninglimit); hfsmp->hfs_notification_conditions |= VQ_LOWDISK; vfs_event_signal(&fsid, hfsmp->hfs_notification_conditions, (intptr_t)NULL); } else if (hfsmp->hfs_notification_conditions & VQ_VERYLOWDISK) { diff --git a/bsd/hfs/hfs_readwrite.c b/bsd/hfs/hfs_readwrite.c index 9fcd6a02d..27901f5de 100644 --- a/bsd/hfs/hfs_readwrite.c +++ b/bsd/hfs/hfs_readwrite.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -90,8 +90,7 @@ static int do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skip, int flush_cache_on_write = 0; -SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW, &flush_cache_on_write, 0, "always flush the drive cache on writes to uncached files"); - +SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW | CTLFLAG_LOCKED, &flush_cache_on_write, 0, "always flush the drive cache on writes to uncached files"); /* * Read data from a file. 
@@ -109,6 +108,7 @@ hfs_vnop_read(struct vnop_read_args *ap) off_t start_resid = uio_resid(uio); off_t offset = uio_offset(uio); int retval = 0; + int took_truncate_lock = 0; /* Preflight checks */ if (!vnode_isreg(vp)) { @@ -147,6 +147,14 @@ hfs_vnop_read(struct vnop_read_args *ap) } /* otherwise the file was converted back to a regular file while we were reading it */ retval = 0; + } else if ((VTOC(vp)->c_flags & UF_COMPRESSED)) { + int error; + + error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP); + if (error) { + return error; + } + } } #endif /* HFS_COMPRESSION */ @@ -155,8 +163,15 @@ hfs_vnop_read(struct vnop_read_args *ap) fp = VTOF(vp); hfsmp = VTOHFS(vp); +#if CONFIG_PROTECT + if ((retval = cp_handle_vnop (cp, CP_READ_ACCESS)) != 0) { + goto exit; + } +#endif + /* Protect against a size change. */ - hfs_lock_truncate(cp, 0); + hfs_lock_truncate(cp, HFS_SHARED_LOCK); + took_truncate_lock = 1; filesize = fp->ff_size; filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize; @@ -209,7 +224,10 @@ hfs_vnop_read(struct vnop_read_args *ap) hfs_unlock(cp); } exit: - hfs_unlock_truncate(cp, 0); + if (took_truncate_lock) { + hfs_unlock_truncate(cp, 0); + } + return (retval); } @@ -238,7 +256,9 @@ hfs_vnop_write(struct vnop_write_args *ap) int lockflags; int cnode_locked = 0; int partialwrite = 0; - int exclusive_lock = 0; + int do_snapshot = 1; + time_t orig_ctime=VTOC(vp)->c_ctime; + int took_truncate_lock = 0; #if HFS_COMPRESSION if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */ @@ -247,23 +267,34 @@ hfs_vnop_write(struct vnop_write_args *ap) case FILE_IS_COMPRESSED: return EACCES; case FILE_IS_CONVERTING: - /* if FILE_IS_CONVERTING, we allow writes */ + /* if FILE_IS_CONVERTING, we allow writes but do not + bother with snapshots or else we will deadlock. 
+ */ + do_snapshot = 0; break; default: printf("invalid state %d for compressed file\n", state); /* fall through */ } + } else if ((VTOC(vp)->c_flags & UF_COMPRESSED)) { + int error; + + error = check_for_dataless_file(vp, NAMESPACE_HANDLER_WRITE_OP); + if (error != 0) { + return error; + } } + + if (do_snapshot) { + check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_WRITE_OP, uio); + } + #endif // LP64todo - fix this! uio_resid may be 64-bit value resid = uio_resid(uio); offset = uio_offset(uio); - if (ioflag & IO_APPEND) { - exclusive_lock = 1; - } - if (offset < 0) return (EINVAL); if (resid == 0) @@ -275,8 +306,14 @@ hfs_vnop_write(struct vnop_write_args *ap) fp = VTOF(vp); hfsmp = VTOHFS(vp); +#if CONFIG_PROTECT + if ((retval = cp_handle_vnop (cp, CP_WRITE_ACCESS)) != 0) { + goto exit; + } +#endif + eflags = kEFDeferMask; /* defer file block allocations */ -#ifdef HFS_SPARSE_DEV +#if HFS_SPARSE_DEV /* * When the underlying device is sparse and space * is low (< 8MB), stop doing delayed allocations @@ -291,8 +328,15 @@ hfs_vnop_write(struct vnop_write_args *ap) again: /* Protect against a size change. */ - hfs_lock_truncate(cp, exclusive_lock); + if (ioflag & IO_APPEND) { + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK); + } + else { + hfs_lock_truncate(cp, HFS_SHARED_LOCK); + } + took_truncate_lock = 1; + /* Update UIO */ if (ioflag & IO_APPEND) { uio_setoffset(uio, fp->ff_size); offset = fp->ff_size; @@ -313,13 +357,16 @@ hfs_vnop_write(struct vnop_write_args *ap) * grab the truncate lock exclusive even if we're not allocating new blocks * because we could still be growing past the LEOF. 
*/ - if ((exclusive_lock == 0) && + if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) && ((fp->ff_unallocblocks != 0) || (writelimit > origFileSize))) { - exclusive_lock = 1; /* Lock upgrade failed and we lost our shared lock, try again */ if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock) == FALSE) { goto again; } + else { + /* Store the owner in the c_truncatelockowner field if we successfully upgrade */ + cp->c_truncatelockowner = current_thread(); + } } if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) { @@ -327,7 +374,7 @@ hfs_vnop_write(struct vnop_write_args *ap) } cnode_locked = 1; - if (!exclusive_lock) { + if (cp->c_truncatelockowner == HFS_SHARED_OWNER) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_START, (int)offset, uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0); @@ -632,7 +679,10 @@ hfs_vnop_write(struct vnop_write_args *ap) exit: if (cnode_locked) hfs_unlock(cp); - hfs_unlock_truncate(cp, exclusive_lock); + + if (took_truncate_lock) { + hfs_unlock_truncate(cp, 0); + } return (retval); } @@ -1004,7 +1054,7 @@ do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HF struct vnode *vp; /* get the vnode for this cnid */ - myErr = hfs_vget(hfsmp, thisNodeID, &vp, 0); + myErr = hfs_vget(hfsmp, thisNodeID, &vp, 0, 0); if ( myErr ) { myResult = 0; goto ExitThisRoutine; @@ -1027,21 +1077,19 @@ do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HF } } else { unsigned int flags; - - myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid, - cnattr.ca_mode, hfsmp->hfs_mp, - myp_ucred, theProcPtr); + int mode = cnattr.ca_mode & S_IFMT; + myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid, cnattr.ca_mode, hfsmp->hfs_mp,myp_ucred, theProcPtr); - if (cnattr.ca_mode & S_IFDIR) { - flags = R_OK | X_OK; - } else { - flags = R_OK; - } - if ( (myPerms & flags) != flags) { - myResult = 0; - myErr = EACCES; - goto ExitThisRoutine; /* no access */ - } + if (mode == S_IFDIR) { + 
flags = R_OK | X_OK; + } else { + flags = R_OK; + } + if ( (myPerms & flags) != flags) { + myResult = 0; + myErr = EACCES; + goto ExitThisRoutine; /* no access */ + } /* up the hierarchy we go */ thisNodeID = catkey.hfsPlus.parentID; @@ -1284,7 +1332,7 @@ do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp, struct vnode *cvp; int myErr = 0; /* get the vnode for this cnid */ - myErr = hfs_vget(hfsmp, cnid, &cvp, 0); + myErr = hfs_vget(hfsmp, cnid, &cvp, 0, 0); if ( myErr ) { access[i] = myErr; continue; @@ -1432,6 +1480,15 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { is64bit = proc_is64bit(p); +#if CONFIG_PROTECT + { + int error = 0; + if ((error = cp_handle_vnop(VTOC(vp), CP_WRITE_ACCESS)) != 0) { + return error; + } + } +#endif /* CONFIG_PROTECT */ + switch (ap->a_command) { case HFS_GETPATH: @@ -1491,7 +1548,7 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { if (linkfileid < kHFSFirstUserCatalogNodeID) { return (EINVAL); } - if ((error = hfs_lookuplink(hfsmp, linkfileid, &prevlinkid, &nextlinkid))) { + if ((error = hfs_lookup_siblinglinks(hfsmp, linkfileid, &prevlinkid, &nextlinkid))) { return (error); } if (ap->a_command == HFS_NEXT_LINK) { @@ -1591,7 +1648,7 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { return (error); } -#ifdef HFS_SPARSE_DEV +#if HFS_SPARSE_DEV case HFS_SETBACKINGSTOREINFO: { struct vnode * bsfs_rootvp; struct vnode * di_vp; @@ -1641,7 +1698,17 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { vnode_put(bsfs_rootvp); hfsmp->hfs_backingfs_rootvp = bsfs_rootvp; + hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE; + /* The free extent cache is managed differently for sparse devices. + * There is a window between which the volume is mounted and the + * device is marked as sparse, so the free extent cache for this + * volume is currently initialized as normal volume (sorted by block + * count). Reset the cache so that it will be rebuilt again + * for sparse device (sorted by start block). 
+ */ + ResetVCBFreeExtCache(hfsmp); + hfsmp->hfs_sparsebandblks = bsdata->bandsize / HFSTOVCB(hfsmp)->blockSize; hfsmp->hfs_sparsebandblks *= 4; @@ -1717,14 +1784,18 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { // note: can't do this after taking the lock as it will // deadlock against ourselves. vnode_iterate(mp, 0, hfs_freezewrite_callback, NULL); - hfs_global_exclusive_lock_acquire(hfsmp); + hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK); // DO NOT call hfs_journal_flush() because that takes a // shared lock on the global exclusive lock! - journal_flush(hfsmp->jnl); + journal_flush(hfsmp->jnl, TRUE); // don't need to iterate on all vnodes, we just need to // wait for writes to the system files and the device vnode + // + // Now that journal flush waits for all metadata blocks to + // be written out, waiting for btree writes is probably no + // longer required. if (HFSTOVCB(hfsmp)->extentsRefNum) vnode_waitforwrites(HFSTOVCB(hfsmp)->extentsRefNum, 0, 0, 0, "hfs freeze"); if (HFSTOVCB(hfsmp)->catalogRefNum) @@ -1756,7 +1827,7 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { // code that "thaws" the fs in hfs_vnop_close() // hfsmp->hfs_freezing_proc = NULL; - hfs_global_exclusive_lock_release(hfsmp); + hfs_unlock_global (hfsmp); lck_rw_unlock_exclusive(&hfsmp->hfs_insync); return (0); @@ -1794,30 +1865,6 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { return do_bulk_access_check(hfsmp, vp, ap, size, context); } - case HFS_SETACLSTATE: { - int state; - - if (ap->a_data == NULL) { - return (EINVAL); - } - - vfsp = vfs_statfs(HFSTOVFS(hfsmp)); - state = *(int *)ap->a_data; - - if (hfsmp->hfs_flags & HFS_READ_ONLY) { - return (EROFS); - } - // super-user can enable or disable acl's on a volume. 
- // the volume owner can only enable acl's - if (!is_suser() && (state == 0 || kauth_cred_getuid(cred) != vfsp->f_owner)) { - return (EPERM); - } - if (state == 0 || state == 1) - return hfs_set_volxattr(hfsmp, HFS_SETACLSTATE, state); - else - return (EINVAL); - } - case HFS_SET_XATTREXTENTS_STATE: { int state; @@ -1833,6 +1880,9 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { /* Super-user can enable or disable extent-based extended * attribute support on a volume + * Note: Starting Mac OS X 10.7, extent-based extended attributes + * are enabled by default, so any change will be transient only + * till the volume is remounted. */ if (!is_suser()) { return (EPERM); @@ -1891,7 +1941,7 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { fp = VTOF(vp); /* Protect against a size change. */ - hfs_lock_truncate(VTOC(vp), TRUE); + hfs_lock_truncate(VTOC(vp), HFS_EXCLUSIVE_LOCK); #if HFS_COMPRESSION if (compressed && (uncompressed_size == -1)) { @@ -1910,7 +1960,7 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count); } - hfs_unlock_truncate(VTOC(vp), TRUE); + hfs_unlock_truncate(VTOC(vp), 0); return (error); } @@ -1934,21 +1984,22 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { * to a user_fbootstraptransfer_t else we get a pointer to a * fbootstraptransfer_t which we munge into a user_fbootstraptransfer_t */ - if (hfsmp->hfs_flags & HFS_READ_ONLY) { + if ((hfsmp->hfs_flags & HFS_READ_ONLY) + && (ap->a_command == F_WRITEBOOTSTRAP)) { return (EROFS); } if (is64bit) { user_bootstrapp = (user_fbootstraptransfer_t *)ap->a_data; } else { - user32_fbootstraptransfer_t *bootstrapp = (user32_fbootstraptransfer_t *)ap->a_data; + user32_fbootstraptransfer_t *bootstrapp = (user32_fbootstraptransfer_t *)ap->a_data; user_bootstrapp = &user_bootstrap; user_bootstrap.fbt_offset = bootstrapp->fbt_offset; user_bootstrap.fbt_length = bootstrapp->fbt_length; user_bootstrap.fbt_buffer = CAST_USER_ADDR_T(bootstrapp->fbt_buffer); } - if 
((user_bootstrapp->fbt_offset < 0) || (user_bootstrapp->fbt_offset > 1024) || + if ((user_bootstrapp->fbt_offset < 0) || (user_bootstrapp->fbt_offset > 1024) || (user_bootstrapp->fbt_length > 1024)) { return EINVAL; } @@ -1956,7 +2007,7 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { if (user_bootstrapp->fbt_offset + user_bootstrapp->fbt_length > 1024) return EINVAL; - devvp = VTOHFS(vp)->hfs_devvp; + devvp = VTOHFS(vp)->hfs_devvp; auio = uio_create(1, user_bootstrapp->fbt_offset, is64bit ? UIO_USERSPACE64 : UIO_USERSPACE32, (ap->a_command == F_WRITEBOOTSTRAP) ? UIO_WRITE : UIO_READ); @@ -2116,6 +2167,21 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { break; } + case HFS_DISABLE_METAZONE: { + /* Only root can disable metadata zone */ + if (!is_suser()) { + return EACCES; + } + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } + + /* Disable metadata zone now */ + (void) hfs_metadatazone_init(hfsmp, true); + printf ("hfs: Disabling metadata zone on %s\n", hfsmp->vcbVN); + break; + } + default: return (ENOTTY); } @@ -2541,8 +2607,32 @@ hfs_vnop_strategy(struct vnop_strategy_args *ap) { buf_t bp = ap->a_bp; vnode_t vp = buf_vnode(bp); + int error = 0; + +#if CONFIG_PROTECT + cnode_t *cp = NULL; + + if ((cp = cp_get_protected_cnode(vp)) != NULL) { + /* + * Some paths to hfs_vnop_strategy will take the cnode lock, + * and some won't. But since content protection is only enabled + * for files that (a) aren't system files and (b) are regular + * files, any valid cnode here will be unlocked. 
+ */ + hfs_lock(cp, HFS_SHARED_LOCK); + buf_setcpaddr(bp, cp->c_cpentry); + } +#endif /* CONFIG_PROTECT */ + + error = buf_strategy(VTOHFS(vp)->hfs_devvp, ap); - return (buf_strategy(VTOHFS(vp)->hfs_devvp, ap)); +#if CONFIG_PROTECT + if (cp) { + hfs_unlock(cp); + } +#endif + + return error; } static int @@ -2556,7 +2646,7 @@ hfs_minorupdate(struct vnode *vp) { return 0; } -static int +int do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_context_t context) { register struct cnode *cp = VTOC(vp); @@ -2801,8 +2891,8 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_c lockflags |= SFL_EXTENTS; lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); - retval = MacToVFSError(TruncateFileC(VTOVCB(vp), - (FCB*)fp, length, false)); + retval = MacToVFSError(TruncateFileC(VTOVCB(vp), (FCB*)fp, length, 0, + FORK_IS_RSRC (fp), FTOC(fp)->c_fileid, false)); hfs_systemfile_unlock(hfsmp, lockflags); } @@ -2860,13 +2950,201 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_c return (retval); } +/* + * Preparation which must be done prior to deleting the catalog record + * of a file or directory. In order to make the on-disk as safe as possible, + * we remove the catalog entry before releasing the bitmap blocks and the + * overflow extent records. However, some work must be done prior to deleting + * the catalog record. + * + * When calling this function, the cnode must exist both in memory and on-disk. + * If there are both resource fork and data fork vnodes, this function should + * be called on both. + */ + +int +hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) { + + struct filefork *fp = VTOF(vp); + struct cnode *cp = VTOC(vp); + int retval = 0; + + /* Cannot truncate an HFS directory! */ + if (vnode_isdir(vp)) { + return (EISDIR); + } + + /* + * See the comment below in hfs_truncate for why we need to call + * setsize here. 
Essentially we want to avoid pending IO if we + * already know that the blocks are going to be released here. + * This function is only called when totally removing all storage for a file, so + * we can take a shortcut and immediately setsize (0); + */ + ubc_setsize(vp, 0); + + /* This should only happen with a corrupt filesystem */ + if ((off_t)fp->ff_size < 0) + return (EINVAL); + + /* + * We cannot just check if fp->ff_size == length (as an optimization) + * since there may be extra physical blocks that also need truncation. + */ +#if QUOTA + if ((retval = hfs_getinoquota(cp))) { + return(retval); + } +#endif /* QUOTA */ + + /* Wipe out any invalid ranges which have yet to be backed by disk */ + rl_remove(0, fp->ff_size - 1, &fp->ff_invalidranges); + + /* + * Account for any unmapped blocks. Since we're deleting the + * entire file, we don't have to worry about just shrinking + * to a smaller number of borrowed blocks. + */ + if (fp->ff_unallocblocks > 0) { + u_int32_t loanedBlocks; + + HFS_MOUNT_LOCK(hfsmp, TRUE); + + loanedBlocks = fp->ff_unallocblocks; + cp->c_blocks -= loanedBlocks; + fp->ff_blocks -= loanedBlocks; + fp->ff_unallocblocks = 0; + + hfsmp->loanedBlocks -= loanedBlocks; + + HFS_MOUNT_UNLOCK(hfsmp, TRUE); + } + + return 0; +} + + +/* + * Special wrapper around calling TruncateFileC. This function is useable + * even when the catalog record does not exist any longer, making it ideal + * for use when deleting a file. The simplification here is that we know + * that we are releasing all blocks. + * + * The caller is responsible for saving off a copy of the filefork(s) + * embedded within the cnode prior to calling this function. The pointers + * supplied as arguments must be valid even if the cnode is no longer valid. 
+ */ + +int +hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork, + struct filefork *rsrcfork, u_int32_t fileid) { + + off_t filebytes; + u_int32_t fileblocks; + int blksize = 0; + int error = 0; + int lockflags; + + blksize = hfsmp->blockSize; + + /* Data Fork */ + if (datafork->ff_blocks > 0) { + fileblocks = datafork->ff_blocks; + filebytes = (off_t)fileblocks * (off_t)blksize; + + /* We killed invalid ranges and loaned blocks before we removed the catalog entry */ + + while (filebytes > 0) { + if (filebytes > HFS_BIGFILE_SIZE && overflow_extents(datafork)) { + filebytes -= HFS_BIGFILE_SIZE; + } else { + filebytes = 0; + } + + /* Start a transaction, and wipe out as many blocks as we can in this iteration */ + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + break; + } + + if (datafork->ff_unallocblocks == 0) { + /* Protect extents b-tree and allocation bitmap */ + lockflags = SFL_BITMAP; + if (overflow_extents(datafork)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + + error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), datafork, filebytes, 1, 0, fileid, false)); + + hfs_systemfile_unlock(hfsmp, lockflags); + } + if (error == 0) { + datafork->ff_size = filebytes; + } + (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); + + /* Finish the transaction and start over if necessary */ + hfs_end_transaction(hfsmp); + + if (error) { + break; + } + } + } + + /* Resource fork */ + if (error == 0 && (rsrcfork != NULL) && rsrcfork->ff_blocks > 0) { + fileblocks = rsrcfork->ff_blocks; + filebytes = (off_t)fileblocks * (off_t)blksize; + + /* We killed invalid ranges and loaned blocks before we removed the catalog entry */ + + while (filebytes > 0) { + if (filebytes > HFS_BIGFILE_SIZE && overflow_extents(rsrcfork)) { + filebytes -= HFS_BIGFILE_SIZE; + } else { + filebytes = 0; + } + + /* Start a transaction, and wipe out as many blocks as we can in this iteration */ + if 
(hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + break; + } + + if (rsrcfork->ff_unallocblocks == 0) { + /* Protect extents b-tree and allocation bitmap */ + lockflags = SFL_BITMAP; + if (overflow_extents(rsrcfork)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + + error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), rsrcfork, filebytes, 1, 1, fileid, false)); + + hfs_systemfile_unlock(hfsmp, lockflags); + } + if (error == 0) { + rsrcfork->ff_size = filebytes; + } + (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); + + /* Finish the transaction and start over if necessary */ + hfs_end_transaction(hfsmp); + + if (error) { + break; + } + } + } + + return error; +} /* * Truncate a cnode to at most length size, freeing (or adding) the * disk blocks. */ -__private_extern__ int hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize, int skipupdate, vfs_context_t context) @@ -2980,6 +3258,7 @@ hfs_vnop_allocate(struct vnop_allocate_args /* { struct hfsmount *hfsmp; kauth_cred_t cred = vfs_context_ucred(ap->a_context); int lockflags; + time_t orig_ctime; *(ap->a_bytesallocated) = 0; @@ -2990,7 +3269,11 @@ hfs_vnop_allocate(struct vnop_allocate_args /* { cp = VTOC(vp); - hfs_lock_truncate(cp, TRUE); + orig_ctime = VTOC(vp)->c_ctime; + + check_for_tracked_file(vp, orig_ctime, ap->a_length == 0 ? 
NAMESPACE_HANDLER_TRUNCATE_OP|NAMESPACE_HANDLER_DELETE_OP : NAMESPACE_HANDLER_TRUNCATE_OP, NULL); + + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK); if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) { goto Err_Exit; @@ -3181,7 +3464,7 @@ hfs_vnop_allocate(struct vnop_allocate_args /* { if (retval == 0) retval = retval2; Err_Exit: - hfs_unlock_truncate(cp, TRUE); + hfs_unlock_truncate(cp, 0); hfs_unlock(cp); return (retval); } @@ -3204,74 +3487,298 @@ hfs_vnop_pagein(struct vnop_pagein_args *ap) }; */ { - vnode_t vp = ap->a_vp; - int error; + vnode_t vp; + struct cnode *cp; + struct filefork *fp; + int error = 0; + upl_t upl; + upl_page_info_t *pl; + off_t f_offset; + int offset; + int isize; + int pg_index; + boolean_t truncate_lock_held = FALSE; + boolean_t file_converted = FALSE; + kern_return_t kret; + + vp = ap->a_vp; + cp = VTOC(vp); + fp = VTOF(vp); + +#if CONFIG_PROTECT + if ((error = cp_handle_vnop(cp, CP_READ_ACCESS | CP_WRITE_ACCESS)) != 0) { + return error; + } +#endif /* CONFIG_PROTECT */ + + if (ap->a_pl != NULL) { + /* + * this can only happen for swap files now that + * we're asking for V2 paging behavior... + * so don't need to worry about decompression, or + * keeping track of blocks read or taking the truncate lock + */ + error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset, + ap->a_size, (off_t)fp->ff_size, ap->a_flags); + goto pagein_done; + } + +retry_pagein: + /* + * take truncate lock (shared/recursive) to guard against + * zero-fill thru fsync interfering, but only for v2 + * + * the HFS_RECURSE_TRUNCLOCK arg indicates that we want the + * lock shared and we are allowed to recurse 1 level if this thread already + * owns the lock exclusively... this can legally occur + * if we are doing a shrinking ftruncate against a file + * that is mapped private, and the pages being truncated + * do not currently exist in the cache... 
in that case + * we will have to page-in the missing pages in order + * to provide them to the private mapping... we must + * also call hfs_unlock_truncate with a postive been_recursed + * arg to indicate that if we have recursed, there is no need to drop + * the lock. Allowing this simple recursion is necessary + * in order to avoid a certain deadlock... since the ftruncate + * already holds the truncate lock exclusively, if we try + * to acquire it shared to protect the pagein path, we will + * hang this thread + * + * NOTE: The if () block below is a workaround in order to prevent a + * VM deadlock. See rdar://7853471. + * + * If we are in a forced unmount, then launchd will still have the + * dyld_shared_cache file mapped as it is trying to reboot. If we + * take the truncate lock here to service a page fault, then our + * thread could deadlock with the forced-unmount. The forced unmount + * thread will try to reclaim the dyld_shared_cache vnode, but since it's + * marked C_DELETED, it will call ubc_setsize(0). As a result, the unmount + * thread will think it needs to copy all of the data out of the file + * and into a VM copy object. If we hold the cnode lock here, then that + * VM operation will not be able to proceed, because we'll set a busy page + * before attempting to grab the lock. Note that this isn't as simple as "don't + * call ubc_setsize" because doing that would just shift the problem to the + * ubc_msync done before the vnode is reclaimed. + * + * So, if a forced unmount on this volume is in flight AND the cnode is + * marked C_DELETED, then just go ahead and do the page in without taking + * the lock (thus suspending pagein_v2 semantics temporarily). Since it's on a file + * that is not going to be available on the next mount, this seems like a + * OK solution from a correctness point of view, even though it is hacky. 
+ */ + if (vfs_isforce(vp->v_mount)) { + if (cp->c_flag & C_DELETED) { + /* If we don't get it, then just go ahead and operate without the lock */ + truncate_lock_held = hfs_try_trunclock(cp, HFS_RECURSE_TRUNCLOCK); + } + } + else { + hfs_lock_truncate(cp, HFS_RECURSE_TRUNCLOCK); + truncate_lock_held = TRUE; + } + + kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT); + + if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) { + error = EINVAL; + goto pagein_done; + } + isize = ap->a_size; + + /* + * Scan from the back to find the last page in the UPL, so that we + * aren't looking at a UPL that may have already been freed by the + * preceding aborts/completions. + */ + for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) { + if (upl_page_present(pl, --pg_index)) + break; + if (pg_index == 0) { + /* + * no absent pages were found in the range specified + * just abort the UPL to get rid of it and then we're done + */ + ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY); + goto pagein_done; + } + } + /* + * initialize the offset variables before we touch the UPL. + * f_offset is the position into the file, in bytes + * offset is the position into the UPL, in bytes + * pg_index is the pg# of the UPL we're operating on + * isize is the offset into the UPL of the last page that is present. + */ + isize = ((pg_index + 1) * PAGE_SIZE); + pg_index = 0; + offset = 0; + f_offset = ap->a_f_offset; + + while (isize) { + int xsize; + int num_of_pages; + + if ( !upl_page_present(pl, pg_index)) { + /* + * we asked for RET_ONLY_ABSENT, so it's possible + * to get back empty slots in the UPL. + * just skip over them + */ + f_offset += PAGE_SIZE; + offset += PAGE_SIZE; + isize -= PAGE_SIZE; + pg_index++; + + continue; + } + /* + * We know that we have at least one absent page. 
+ * Now checking to see how many in a row we have + */ + num_of_pages = 1; + xsize = isize - PAGE_SIZE; + + while (xsize) { + if ( !upl_page_present(pl, pg_index + num_of_pages)) + break; + num_of_pages++; + xsize -= PAGE_SIZE; + } + xsize = num_of_pages * PAGE_SIZE; #if HFS_COMPRESSION - if (VNODE_IS_RSRC(vp)) { - /* allow pageins of the resource fork */ - } else { - int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */ - if (compressed) { - error = decmpfs_pagein_compressed(ap, &compressed, VTOCMP(vp)); + if (VNODE_IS_RSRC(vp)) { + /* allow pageins of the resource fork */ + } else { + int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */ + if (compressed) { - if (error == 0) { - /* successful page-in, update the access time */ - VTOC(vp)->c_touch_acctime = TRUE; + if (truncate_lock_held) { + /* + * can't hold the truncate lock when calling into the decmpfs layer + * since it calls back into this layer... even though we're only + * holding the lock in shared mode, and the re-entrant path only + * takes the lock shared, we can deadlock if some other thread + * tries to grab the lock exclusively in between. + */ + hfs_unlock_truncate(cp, 1); + truncate_lock_held = FALSE; + } + ap->a_pl = upl; + ap->a_pl_offset = offset; + ap->a_f_offset = f_offset; + ap->a_size = xsize; + + error = decmpfs_pagein_compressed(ap, &compressed, VTOCMP(vp)); + /* + * note that decpfs_pagein_compressed can change the state of + * 'compressed'... it will set it to 0 if the file is no longer + * compressed once the compression lock is successfully taken + * i.e. 
we would block on that lock while the file is being inflated + */ + if (compressed) { + if (error == 0) { + /* successful page-in, update the access time */ + VTOC(vp)->c_touch_acctime = TRUE; - /* compressed files are not hot file candidates */ - if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) { - VTOF(vp)->ff_bytesread = 0; + /* compressed files are not hot file candidates */ + if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) { + fp->ff_bytesread = 0; + } + } else if (error == EAGAIN) { + /* + * EAGAIN indicates someone else already holds the compression lock... + * to avoid deadlocking, we'll abort this range of pages with an + * indication that the pagein needs to be redriven + */ + ubc_upl_abort_range(upl, (upl_offset_t) offset, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_RESTART); } + goto pagein_next_range; + } + else { + /* + * Set file_converted only if the file became decompressed while we were + * paging in. If it were still compressed, we would re-start the loop using the goto + * in the above block. This avoid us overloading truncate_lock_held as our retry_pagein + * condition below, since we could have avoided taking the truncate lock to prevent + * a deadlock in the force unmount case. + */ + file_converted = TRUE; } - return error; } - /* otherwise the file was converted back to a regular file while we were reading it */ + if (file_converted == TRUE) { + /* + * the file was converted back to a regular file after we first saw it as compressed + * we need to abort the upl, retake the truncate lock, recreate the UPL and start over + * reset a_size so that we consider what remains of the original request + * and null out a_upl and a_pl_offset. + * + * We should only be able to get into this block if the decmpfs_pagein_compressed + * successfully decompressed the range in question for this file. 
+ */ + ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY); + + ap->a_size = isize; + ap->a_pl = NULL; + ap->a_pl_offset = 0; + + /* Reset file_converted back to false so that we don't infinite-loop. */ + file_converted = FALSE; + goto retry_pagein; + } } - } #endif + error = cluster_pagein(vp, upl, offset, f_offset, xsize, (off_t)fp->ff_size, ap->a_flags); - error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset, - ap->a_size, (off_t)VTOF(vp)->ff_size, ap->a_flags); - /* - * Keep track of blocks read. - */ - if (!vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) { - struct cnode *cp; - struct filefork *fp; - int bytesread; - int took_cnode_lock = 0; + /* + * Keep track of blocks read. + */ + if ( !vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) { + int bytesread; + int took_cnode_lock = 0; - cp = VTOC(vp); - fp = VTOF(vp); + if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE) + bytesread = fp->ff_size; + else + bytesread = xsize; - if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE) - bytesread = fp->ff_size; - else - bytesread = ap->a_size; + /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */ + if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) { + hfs_lock(cp, HFS_FORCE_LOCK); + took_cnode_lock = 1; + } + /* + * If this file hasn't been seen since the start of + * the current sampling period then start over. + */ + if (cp->c_atime < VTOHFS(vp)->hfc_timebase) { + struct timeval tv; - /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. 
*/ - if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) { - hfs_lock(cp, HFS_FORCE_LOCK); - took_cnode_lock = 1; + fp->ff_bytesread = bytesread; + microtime(&tv); + cp->c_atime = tv.tv_sec; + } else { + fp->ff_bytesread += bytesread; + } + cp->c_touch_acctime = TRUE; + if (took_cnode_lock) + hfs_unlock(cp); } - /* - * If this file hasn't been seen since the start of - * the current sampling period then start over. - */ - if (cp->c_atime < VTOHFS(vp)->hfc_timebase) { - struct timeval tv; +pagein_next_range: + f_offset += xsize; + offset += xsize; + isize -= xsize; + pg_index += num_of_pages; - fp->ff_bytesread = bytesread; - microtime(&tv); - cp->c_atime = tv.tv_sec; - } else { - fp->ff_bytesread += bytesread; - } - cp->c_touch_acctime = TRUE; - if (took_cnode_lock) - hfs_unlock(cp); + error = 0; } + +pagein_done: + if (truncate_lock_held == TRUE) { + /* Note 1 is passed to hfs_unlock_truncate in been_recursed argument */ + hfs_unlock_truncate(cp, 1); + } + return (error); } @@ -3338,7 +3845,7 @@ hfs_vnop_pageout(struct vnop_pageout_args *ap) * take truncate lock (shared) to guard against * zero-fill thru fsync interfering, but only for v2 */ - hfs_lock_truncate(cp, 0); + hfs_lock_truncate(cp, HFS_SHARED_LOCK); if (a_flags & UPL_MSYNC) { request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY; @@ -3346,6 +3853,7 @@ hfs_vnop_pageout(struct vnop_pageout_args *ap) else { request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY; } + kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, request_flags); if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) { @@ -3649,7 +4157,6 @@ hfs_vnop_bwrite(struct vnop_bwrite_args *ap) * * During step 3 page-ins to the file get suspended. 
*/ -__private_extern__ int hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, struct proc *p) @@ -3685,6 +4192,22 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, fp = VTOF(vp); if (fp->ff_unallocblocks) return (EINVAL); + +#if CONFIG_PROTECT + /* + * + * Disable HFS file relocation on content-protected filesystems + */ + if (cp_fs_protected (hfsmp->hfs_mp)) { + return EINVAL; + } +#endif + + /* If it's an SSD, also disable HFS relocation */ + if (hfsmp->hfs_flags & HFS_SSD) { + return EINVAL; + } + blksize = hfsmp->blockSize; if (blockHint == 0) blockHint = hfsmp->nextAllocation; @@ -3707,15 +4230,15 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, if (!vnode_issystem(vp) && (vnodetype != VLNK)) { hfs_unlock(cp); - hfs_lock_truncate(cp, TRUE); + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK); /* Force lock since callers expects lock to be held. */ if ((retval = hfs_lock(cp, HFS_FORCE_LOCK))) { - hfs_unlock_truncate(cp, TRUE); + hfs_unlock_truncate(cp, 0); return (retval); } /* No need to continue if file was removed. 
*/ if (cp->c_flag & C_NOEXISTS) { - hfs_unlock_truncate(cp, TRUE); + hfs_unlock_truncate(cp, 0); return (ENOENT); } took_trunc_lock = 1; @@ -3730,7 +4253,7 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, if (hfs_start_transaction(hfsmp) != 0) { if (took_trunc_lock) - hfs_unlock_truncate(cp, TRUE); + hfs_unlock_truncate(cp, 0); return (EINVAL); } started_tr = 1; @@ -3850,7 +4373,7 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, goto restore; out: if (took_trunc_lock) - hfs_unlock_truncate(cp, TRUE); + hfs_unlock_truncate(cp, 0); if (lockflags) { hfs_systemfile_unlock(hfsmp, lockflags); @@ -3876,7 +4399,7 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, restore: if (fp->ff_blocks == headblks) { if (took_trunc_lock) - hfs_unlock_truncate(cp, TRUE); + hfs_unlock_truncate(cp, 0); goto exit; } /* @@ -3889,13 +4412,14 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); } - (void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, false); + (void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, 0, FORK_IS_RSRC(fp), + FTOC(fp)->c_fileid, false); hfs_systemfile_unlock(hfsmp, lockflags); lockflags = 0; if (took_trunc_lock) - hfs_unlock_truncate(cp, TRUE); + hfs_unlock_truncate(cp, 0); goto exit; } @@ -3954,10 +4478,19 @@ hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize) iosize = bufsize = MIN(copysize, 128 * 1024); offset = 0; + hfs_unlock(VTOC(vp)); + +#if CONFIG_PROTECT + if ((error = cp_handle_vnop(VTOC(vp), CP_WRITE_ACCESS)) != 0) { + hfs_lock(VTOC(vp), HFS_FORCE_LOCK); + return (error); + } +#endif /* CONFIG_PROTECT */ + if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) { + hfs_lock(VTOC(vp), HFS_FORCE_LOCK); return (ENOMEM); - } - hfs_unlock(VTOC(vp)); + } auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ); diff --git a/bsd/hfs/hfs_search.c b/bsd/hfs/hfs_search.c index 
6a8a8b74f..878c70dc5 100644 --- a/bsd/hfs/hfs_search.c +++ b/bsd/hfs/hfs_search.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997-2007 Apple Inc. All rights reserved. + * Copyright (c) 1997-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -48,6 +48,7 @@ #include #include #include +#include #if CONFIG_MACF #include @@ -154,7 +155,6 @@ vnop_searchfs { }; */ -__private_extern__ int hfs_vnop_search(ap) struct vnop_searchfs_args *ap; /* @@ -186,7 +186,6 @@ hfs_vnop_search(ap) struct proc *p = current_proc(); int err = E_NONE; int isHFSPlus; - int timerExpired = false; CatalogKey * myCurrentKeyPtr; CatalogRecord * myCurrentDataPtr; CatPosition * myCatPositionPtr; @@ -195,6 +194,9 @@ hfs_vnop_search(ap) user_size_t user_len = 0; int32_t searchTime; int lockflags; + struct uthread *ut; + boolean_t timerExpired = FALSE; + boolean_t needThrottle = FALSE; /* XXX Parameter check a_searchattrs? */ @@ -307,7 +309,7 @@ hfs_vnop_search(ap) (void) hfs_fsync(vcb->catalogRefNum, MNT_WAIT, 0, p); if (hfsmp->jnl) { hfs_systemfile_unlock(hfsmp, lockflags); - hfs_journal_flush(hfsmp); + hfs_journal_flush(hfsmp, FALSE); lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); } @@ -336,6 +338,8 @@ hfs_vnop_search(ap) if (err) goto ExitThisRoutine; + if (throttle_get_io_policy(&ut) == IOPOL_THROTTLE) + needThrottle = TRUE; /* * Check all the catalog btree records... * return the attributes for matching items @@ -373,18 +377,24 @@ hfs_vnop_search(ap) if (*(ap->a_nummatches) >= ap->a_maxmatches) break; } - - /* - * Check our elapsed time and bail if we've hit the max. - * The idea here is to throttle the amount of time we - * spend in the kernel. 
- */ - microuptime(&myCurrentTime); - timersub(&myCurrentTime, &myBTScanState.startTime, &myElapsedTime); - /* Note: assumes kMaxMicroSecsInKernel is less than 1,000,000 */ - if (myElapsedTime.tv_sec > 0 - || myElapsedTime.tv_usec >= searchTime) { - timerExpired = true; + if (timerExpired == FALSE) { + /* + * Check our elapsed time and bail if we've hit the max. + * The idea here is to throttle the amount of time we + * spend in the kernel. + */ + microuptime(&myCurrentTime); + timersub(&myCurrentTime, &myBTScanState.startTime, &myElapsedTime); + /* + * Note: assumes kMaxMicroSecsInKernel is less than 1,000,000 + */ + if (myElapsedTime.tv_sec > 0 + || myElapsedTime.tv_usec >= searchTime) { + timerExpired = TRUE; + } else if (needThrottle == TRUE) { + if (throttle_io_will_be_throttled(ut->uu_lowpri_window, HFSTOVFS(hfsmp))) + timerExpired = TRUE; + } } } @@ -436,12 +446,12 @@ ResolveHardlink(struct hfsmount *hfsmp, HFSPlusCatalogFile *recp) filecreatedate = to_bsd_time(recp->createDate); if ((type == kHardLinkFileType && creator == kHFSPlusCreator) && - (filecreatedate == (time_t)hfsmp->vcbCrDate || + (filecreatedate == (time_t)hfsmp->hfs_itime || filecreatedate == (time_t)hfsmp->hfs_metadata_createdate)) { isfilelink = 1; } else if ((type == kHFSAliasType && creator == kHFSAliasCreator) && (recp->flags & kHFSHasLinkChainMask) && - (filecreatedate == (time_t)hfsmp->vcbCrDate || + (filecreatedate == (time_t)hfsmp->hfs_itime || filecreatedate == (time_t)hfsmp->hfs_metadata_createdate)) { isdirlink = 1; } @@ -556,7 +566,7 @@ CheckAccess(ExtendedVCB *theVCBPtr, u_long searchBits, CatalogKey *theKeyPtr, st cnode_t * cp; /* now go get catalog data for this directory */ - myErr = hfs_vget(hfsmp, myNodeID, &vp, 0); + myErr = hfs_vget(hfsmp, myNodeID, &vp, 0, 0); if ( myErr ) { goto ExitThisRoutine; /* no access */ } diff --git a/bsd/hfs/hfs_vfsops.c b/bsd/hfs/hfs_vfsops.c index 7a049916f..1a1ca2aa4 100644 --- a/bsd/hfs/hfs_vfsops.c +++ b/bsd/hfs/hfs_vfsops.c @@ -87,6 +87,7 
@@ #include #include #include +#include #include @@ -109,6 +110,16 @@ #include "hfscommon/headers/FileMgrInternal.h" #include "hfscommon/headers/BTreesInternal.h" +#if CONFIG_PROTECT +#include +#endif + +#if CONFIG_HFS_ALLOC_RBTREE +#include "hfscommon/headers/HybridAllocator.h" +#endif + +#define HFS_MOUNT_DEBUG 1 + #if HFS_DIAGNOSTIC int hfs_dbg_all = 0; int hfs_dbg_err = 0; @@ -121,6 +132,7 @@ lck_grp_attr_t * hfs_group_attr; lck_attr_t * hfs_lock_attr; lck_grp_t * hfs_mutex_group; lck_grp_t * hfs_rwlock_group; +lck_grp_t * hfs_spinlock_group; extern struct vnodeopv_desc hfs_vnodeop_opv_desc; extern struct vnodeopv_desc hfs_std_vnodeop_opv_desc; @@ -134,29 +146,30 @@ static int hfs_flushfiles(struct mount *, int, struct proc *); static int hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush); static int hfs_getmountpoint(struct vnode *vp, struct hfsmount **hfsmpp); static int hfs_init(struct vfsconf *vfsp); -static int hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t context); -static int hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, int journal_replay_only, vfs_context_t context); -static int hfs_reload(struct mount *mp); static int hfs_vfs_root(struct mount *mp, struct vnode **vpp, vfs_context_t context); static int hfs_quotactl(struct mount *, int, uid_t, caddr_t, vfs_context_t context); static int hfs_start(struct mount *mp, int flags, vfs_context_t context); -static int hfs_statfs(struct mount *mp, register struct vfsstatfs *sbp, vfs_context_t context); -static int hfs_sync(struct mount *mp, int waitfor, vfs_context_t context); -static int hfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, - user_addr_t newp, size_t newlen, vfs_context_t context); -static int hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context); static int hfs_vptofh(struct vnode *vp, int *fhlenp, unsigned char *fhp, vfs_context_t context); - -static int hfs_reclaimspace(struct hfsmount 
*hfsmp, u_int32_t startblk, u_int32_t reclaimblks, vfs_context_t context); -static int hfs_overlapped_overflow_extents(struct hfsmount *hfsmp, u_int32_t startblk, u_int32_t fileID); +static int hfs_file_extent_overlaps(struct hfsmount *hfsmp, u_int32_t allocLimit, struct HFSPlusCatalogFile *filerec); static int hfs_journal_replay(vnode_t devvp, vfs_context_t context); +static int hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t allocLimit, u_int32_t reclaimblks, vfs_context_t context); +void hfs_initialize_allocator (struct hfsmount *hfsmp); +int hfs_teardown_allocator (struct hfsmount *hfsmp); + +int hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t context); +int hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, int journal_replay_only, vfs_context_t context); +int hfs_reload(struct mount *mp); +int hfs_statfs(struct mount *mp, register struct vfsstatfs *sbp, vfs_context_t context); +int hfs_sync(struct mount *mp, int waitfor, vfs_context_t context); +int hfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, + user_addr_t newp, size_t newlen, vfs_context_t context); +int hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context); /* * Called by vfs_mountroot when mounting HFS Plus as root. */ -__private_extern__ int hfs_mountroot(mount_t mp, vnode_t rvp, vfs_context_t context) { @@ -165,8 +178,13 @@ hfs_mountroot(mount_t mp, vnode_t rvp, vfs_context_t context) struct vfsstatfs *vfsp; int error; - if ((error = hfs_mountfs(rvp, mp, NULL, 0, context))) + if ((error = hfs_mountfs(rvp, mp, NULL, 0, context))) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountroot: hfs_mountfs returned %d, rvp (%p) name (%s) \n", + error, rvp, (rvp->v_name ? 
rvp->v_name : "unknown device")); + } return (error); + } /* Init hfsmp */ hfsmp = VFSTOHFS(mp); @@ -194,7 +212,7 @@ hfs_mountroot(mount_t mp, vnode_t rvp, vfs_context_t context) * mount system call */ -static int +int hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t context) { struct proc *p = vfs_context_proc(context); @@ -204,6 +222,9 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte u_int32_t cmdflags; if ((retval = copyin(data, (caddr_t)&args, sizeof(args)))) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mount: copyin returned %d for fs\n", retval); + } return (retval); } cmdflags = (u_int32_t)vfs_flags(mp) & MNT_CMDFLAGS; @@ -212,10 +233,19 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte /* Reload incore data after an fsck. */ if (cmdflags & MNT_RELOAD) { - if (vfs_isrdonly(mp)) - return hfs_reload(mp); - else + if (vfs_isrdonly(mp)) { + int error = hfs_reload(mp); + if (error && HFS_MOUNT_DEBUG) { + printf("hfs_mount: hfs_reload returned %d on %s \n", error, hfsmp->vcbVN); + } + return error; + } + else { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mount: MNT_RELOAD not supported on rdwr filesystem %s\n", hfsmp->vcbVN); + } return (EINVAL); + } } /* Change to a read-only file system. */ @@ -227,16 +257,19 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte * is in progress and therefore block any further * modifications to the file system. 
*/ - hfs_global_exclusive_lock_acquire(hfsmp); + hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK); hfsmp->hfs_flags |= HFS_RDONLY_DOWNGRADE; hfsmp->hfs_downgrading_proc = current_thread(); - hfs_global_exclusive_lock_release(hfsmp); + hfs_unlock_global (hfsmp); /* use VFS_SYNC to push out System (btree) files */ retval = VFS_SYNC(mp, MNT_WAIT, context); if (retval && ((cmdflags & MNT_FORCE) == 0)) { hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE; hfsmp->hfs_downgrading_proc = NULL; + if (HFS_MOUNT_DEBUG) { + printf("hfs_mount: VFS_SYNC returned %d during b-tree sync of %s \n", retval, hfsmp->vcbVN); + } goto out; } @@ -247,6 +280,9 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte if ((retval = hfs_flushfiles(mp, flags, p))) { hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE; hfsmp->hfs_downgrading_proc = NULL; + if (HFS_MOUNT_DEBUG) { + printf("hfs_mount: hfs_flushfiles returned %d on %s \n", retval, hfsmp->vcbVN); + } goto out; } @@ -266,13 +302,16 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte } } if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mount: FSYNC on devvp returned %d for fs %s\n", retval, hfsmp->vcbVN); + } hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE; hfsmp->hfs_downgrading_proc = NULL; hfsmp->hfs_flags &= ~HFS_READ_ONLY; goto out; } if (hfsmp->jnl) { - hfs_global_exclusive_lock_acquire(hfsmp); + hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK); journal_close(hfsmp->jnl); hfsmp->jnl = NULL; @@ -281,14 +320,20 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte // access to the jvp because we may need // it later if we go back to being read-write. - hfs_global_exclusive_lock_release(hfsmp); + hfs_unlock_global (hfsmp); } +#if CONFIG_HFS_ALLOC_RBTREE + (void) hfs_teardown_allocator(hfsmp); +#endif hfsmp->hfs_downgrading_proc = NULL; } /* Change to a writable file system. 
*/ if (vfs_iswriteupgrade(mp)) { +#if CONFIG_HFS_ALLOC_RBTREE + thread_t allocator_thread; +#endif /* * On inconsistent disks, do not allow read-write mount @@ -296,6 +341,9 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte */ if (!(vfs_flags(mp) & MNT_ROOTFS) && (hfsmp->vcbAtrb & kHFSVolumeInconsistentMask)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mount: attempting to mount inconsistent non-root volume %s\n", (hfsmp->vcbVN)); + } retval = EINVAL; goto out; } @@ -310,39 +358,52 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte if (hfsmp->hfs_flags & HFS_NEED_JNL_RESET) { jflags = JOURNAL_RESET; - } else { + } else { jflags = 0; - } - - hfs_global_exclusive_lock_acquire(hfsmp); - - hfsmp->jnl = journal_open(hfsmp->jvp, - (hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset, - hfsmp->jnl_size, - hfsmp->hfs_devvp, - hfsmp->hfs_logical_block_size, - jflags, - 0, - hfs_sync_metadata, hfsmp->hfs_mp); - - hfs_global_exclusive_lock_release(hfsmp); - - if (hfsmp->jnl == NULL) { - retval = EINVAL; - goto out; - } else { - hfsmp->hfs_flags &= ~HFS_NEED_JNL_RESET; - } + } + + hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK); + + hfsmp->jnl = journal_open(hfsmp->jvp, + (hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset, + hfsmp->jnl_size, + hfsmp->hfs_devvp, + hfsmp->hfs_logical_block_size, + jflags, + 0, + hfs_sync_metadata, hfsmp->hfs_mp); + + /* + * Set up the trim callback function so that we can add + * recently freed extents to the free extent cache once + * the transaction that freed them is written to the + * journal on disk. 
+ */ + if (hfsmp->jnl) + journal_trim_set_callback(hfsmp->jnl, hfs_trim_callback, hfsmp); + + hfs_unlock_global (hfsmp); + + if (hfsmp->jnl == NULL) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mount: journal_open == NULL; couldn't be opened on %s \n", (hfsmp->vcbVN)); + } + retval = EINVAL; + goto out; + } else { + hfsmp->hfs_flags &= ~HFS_NEED_JNL_RESET; + } } /* See if we need to erase unused Catalog nodes due to . */ retval = hfs_erase_unused_nodes(hfsmp); - if (retval != E_NONE) + if (retval != E_NONE) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mount: hfs_erase_unused_nodes returned %d for fs %s\n", retval, hfsmp->vcbVN); + } goto out; - - /* Only clear HFS_READ_ONLY after a successful write */ - hfsmp->hfs_flags &= ~HFS_READ_ONLY; + } /* If this mount point was downgraded from read-write * to read-only, clear that information as we are now @@ -355,8 +416,16 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte hfsmp->vcbAtrb &= ~kHFSVolumeUnmountedMask; retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0); - if (retval != E_NONE) + if (retval != E_NONE) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mount: hfs_flushvolumeheader returned %d for fs %s\n", retval, hfsmp->vcbVN); + } goto out; + } + + /* Only clear HFS_READ_ONLY after a successful write */ + hfsmp->hfs_flags &= ~HFS_READ_ONLY; + if (!(hfsmp->hfs_flags & (HFS_READ_ONLY | HFS_STANDARD))) { /* Setup private/hidden directories for hardlinks. */ @@ -368,8 +437,8 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte /* * Allow hot file clustering if conditions allow. */ - if ((hfsmp->hfs_flags & HFS_METADATA_ZONE) && - ((hfsmp->hfs_mp->mnt_kern_flag & MNTK_SSD) == 0)) { + if ((hfsmp->hfs_flags & HFS_METADATA_ZONE) && + ((hfsmp->hfs_flags & HFS_SSD) == 0)) { (void) hfs_recording_init(hfsmp); } /* Force ACLs on HFS+ file systems. 
*/ @@ -377,10 +446,45 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte vfs_setextendedsecurity(HFSTOVFS(hfsmp)); } } + +#if CONFIG_HFS_ALLOC_RBTREE + /* + * Like the normal mount case, we need to handle creation of the allocation red-black tree + * if we're upgrading from read-only to read-write. + * + * We spawn a thread to create the pair of red-black trees for this volume. + * However, in so doing, we must be careful to ensure that if this thread is still + * running after mount has finished, it doesn't interfere with an unmount. Specifically, + * we'll need to set a bit that indicates we're in progress building the trees here. + * Unmount will check for this bit, and then if it's set, mark a corresponding bit that + * notifies the tree generation code that an unmount is waiting. Also, mark the extent + * tree flags that the allocator is enabled for use before we spawn the thread that will start + * scanning the RB tree. + * + * Only do this if we're operating on a read-write mount (we wouldn't care for read-only), + * which has not previously encountered a bad error on the red-black tree code. Also, don't + * try to re-build a tree that already exists. + */ + + if (hfsmp->extent_tree_flags == 0) { + hfsmp->extent_tree_flags |= (HFS_ALLOC_TREEBUILD_INFLIGHT | HFS_ALLOC_RB_ENABLED); + /* Initialize EOF counter so that the thread can assume it started at initial values */ + hfsmp->offset_block_end = 0; + + InitTree(hfsmp); + + kernel_thread_start ((thread_continue_t) hfs_initialize_allocator , hfsmp, &allocator_thread); + thread_deallocate(allocator_thread); + } + +#endif } /* Update file system parameters. 
*/ retval = hfs_changefs(mp, &args); + if (retval && HFS_MOUNT_DEBUG) { + printf("hfs_mount: hfs_changefs returned %d for %s\n", retval, hfsmp->vcbVN); + } } else /* not an update request */ { @@ -388,6 +492,44 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte vfs_setflags(mp, (u_int64_t)((unsigned int)MNT_DOVOLFS)); retval = hfs_mountfs(devvp, mp, &args, 0, context); + if (retval && HFS_MOUNT_DEBUG) { + printf("hfs_mount: hfs_mountfs returned %d\n", retval); + } +#if CONFIG_PROTECT + /* + * If above mount call was successful, and this mount is content protection + * enabled, then verify the on-disk EA on the root to ensure that the filesystem + * is of a suitable vintage to allow the mount to proceed. + */ + if ((retval == 0) && (cp_fs_protected (mp))) { + int err = 0; + struct cp_root_xattr xattr; + bzero (&xattr, sizeof(struct cp_root_xattr)); + hfsmp = vfs_fsprivate(mp); + + /* go get the EA to get the version information */ + err = cp_getrootxattr (hfsmp, &xattr); + /* If there was no EA there, then write one out. */ + if (err == ENOATTR) { + bzero(&xattr, sizeof(struct cp_root_xattr)); + xattr.major_version = CP_CURRENT_MAJOR_VERS; + xattr.minor_version = CP_CURRENT_MINOR_VERS; + xattr.flags = 0; + + err = cp_setrootxattr (hfsmp, &xattr); + } + /* + * For any other error, including having an out of date CP version in the + * EA, or for an error out of cp_setrootxattr, deny the mount + * and do not proceed further. + */ + if (err || xattr.major_version != CP_CURRENT_MAJOR_VERS) { + /* Deny the mount and tear down. */ + retval = EPERM; + (void) hfs_unmount (mp, MNT_FORCE, context); + } + } +#endif } out: if (retval == 0) { @@ -629,7 +771,7 @@ hfs_reload_callback(struct vnode *vp, void *cargs) /* * Re-read cnode data for all active vnodes (non-metadata files). 
*/ - if (!vnode_issystem(vp) && !VNODE_IS_RSRC(vp)) { + if (!vnode_issystem(vp) && !VNODE_IS_RSRC(vp) && (cp->c_fileid >= kHFSFirstUserCatalogNodeID)) { struct cat_fork *datafork; struct cat_desc desc; @@ -663,7 +805,7 @@ hfs_reload_callback(struct vnode *vp, void *cargs) * re-load B-tree header data. * re-read cnode data for all active vnodes. */ -static int +int hfs_reload(struct mount *mountp) { register struct vnode *devvp; @@ -877,7 +1019,7 @@ hfs_syncer(void *arg0, void *unused) } if (hfsmp->jnl) { - journal_flush(hfsmp->jnl); + journal_flush(hfsmp->jnl, FALSE); } else { hfs_sync(hfsmp->hfs_mp, MNT_WAIT, vfs_context_kernel()); } @@ -918,11 +1060,11 @@ hfs_syncer(void *arg0, void *unused) // now. Else we defer the sync and reschedule it. // if (hfsmp->jnl) { - lck_rw_lock_shared(&hfsmp->hfs_global_lock); + hfs_lock_global (hfsmp, HFS_SHARED_LOCK); - journal_flush(hfsmp->jnl); + journal_flush(hfsmp->jnl, FALSE); - lck_rw_unlock_shared(&hfsmp->hfs_global_lock); + hfs_unlock_global (hfsmp); } else { hfs_sync(hfsmp->hfs_mp, MNT_WAIT, vfs_context_kernel()); } @@ -957,10 +1099,119 @@ hfs_syncer(void *arg0, void *unused) extern int IOBSDIsMediaEjectable( const char *cdev_name ); +/* + * Initialization code for Red-Black Tree Allocator + * + * This function will build the two red-black trees necessary for allocating space + * from the metadata zone as well as normal allocations. Currently, we use + * an advisory read to get most of the data into the buffer cache. + * This function is intended to be run in a separate thread so as not to slow down mount. + * + */ + +void +hfs_initialize_allocator (struct hfsmount *hfsmp) { + +#if CONFIG_HFS_ALLOC_RBTREE + u_int32_t err; + + /* + * Take the allocation file lock. Journal transactions will block until + * we're done here. + */ + int flags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + + /* + * GenerateTree assumes that the bitmap lock is held when you call the function. 
+ * It will drop and re-acquire the lock periodically as needed to let other allocations + * through. It returns with the bitmap lock held. Since we only maintain one tree, + * we don't need to specify a start block (always starts at 0). + */ + err = GenerateTree(hfsmp, hfsmp->totalBlocks, &flags, 1); + if (err) { + goto bailout; + } + /* Mark offset tree as built */ + hfsmp->extent_tree_flags |= HFS_ALLOC_RB_ACTIVE; + +bailout: + /* + * GenerateTree may drop the bitmap lock during operation in order to give other + * threads a chance to allocate blocks, but it will always return with the lock held, so + * we don't need to re-grab the lock in order to update the TREEBUILD_INFLIGHT bit. + */ + hfsmp->extent_tree_flags &= ~HFS_ALLOC_TREEBUILD_INFLIGHT; + if (err != 0) { + /* Wakeup any waiters on the allocation bitmap lock */ + wakeup((caddr_t)&hfsmp->extent_tree_flags); + } + + hfs_systemfile_unlock(hfsmp, flags); +#else +#pragma unused (hfsmp) +#endif +} + + +/* + * Teardown code for the Red-Black Tree allocator. + * This function consolidates the code which serializes with respect + * to a thread that may be potentially still building the tree when we need to begin + * tearing it down. Since the red-black tree may not be live when we enter this function + * we return: + * 1 -> Tree was live. + * 0 -> Tree was not active at time of call. + */ + +int +hfs_teardown_allocator (struct hfsmount *hfsmp) { + int rb_used = 0; + +#if CONFIG_HFS_ALLOC_RBTREE + + int flags = 0; + + /* + * Check to see if the tree-generation is still on-going. + * If it is, then block until it's done. 
+ */ + + flags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + + + while (hfsmp->extent_tree_flags & HFS_ALLOC_TREEBUILD_INFLIGHT) { + hfsmp->extent_tree_flags |= HFS_ALLOC_TEARDOWN_INFLIGHT; + + lck_rw_sleep(&(VTOC(hfsmp->hfs_allocation_vp))->c_rwlock, LCK_SLEEP_EXCLUSIVE, + &hfsmp->extent_tree_flags, THREAD_UNINT); + } + + if (hfs_isrbtree_active (hfsmp)) { + rb_used = 1; + + /* Tear down the RB Trees while we have the bitmap locked */ + DestroyTrees(hfsmp); + + } + + hfs_systemfile_unlock(hfsmp, flags); +#else + #pragma unused (hfsmp) +#endif + return rb_used; + +} + + +static int hfs_root_unmounted_cleanly = 0; + +SYSCTL_DECL(_vfs_generic); +SYSCTL_INT(_vfs_generic, OID_AUTO, root_unmounted_cleanly, CTLFLAG_RD, &hfs_root_unmounted_cleanly, 0, "Root filesystem was unmounted cleanly"); + /* * Common code for mount and mountroot */ -static int +int hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, int journal_replay_only, vfs_context_t context) { @@ -985,7 +1236,10 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, daddr64_t mdb_offset; int isvirtual = 0; int isroot = 0; - u_int32_t device_features = 0; + int isssd; +#if CONFIG_HFS_ALLOC_RBTREE + thread_t allocator_thread; +#endif if (args == NULL) { /* only hfs_mountroot passes us NULL as the 'args' argument */ @@ -1007,6 +1261,9 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, /* Get the logical block size (treated as physical block size everywhere) */ if (VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&log_blksize, 0, context)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: DKIOCGETBLOCKSIZE failed\n"); + } retval = ENXIO; goto error_exit; } @@ -1020,6 +1277,9 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, retval = VNOP_IOCTL(devvp, DKIOCGETPHYSICALBLOCKSIZE, (caddr_t)&phys_blksize, 0, context); if (retval) { if ((retval != ENOTSUP) && (retval != ENOTTY)) { + if 
(HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: DKIOCGETPHYSICALBLOCKSIZE failed\n"); + } retval = ENXIO; goto error_exit; } @@ -1039,6 +1299,9 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, u_int32_t size512 = 512; if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, context)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: DKIOCSETBLOCKSIZE failed \n"); + } retval = ENXIO; goto error_exit; } @@ -1047,7 +1310,9 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) { /* resetting block size may fail if getting block count did */ (void)VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context); - + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: DKIOCGETBLOCKCOUNT failed\n"); + } retval = ENXIO; goto error_exit; } @@ -1083,11 +1348,17 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, /* Now switch to our preferred physical block size. */ if (log_blksize > 512) { if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: DKIOCSETBLOCKSIZE (2) failed\n"); + } retval = ENXIO; goto error_exit; } /* Get the count of physical blocks. 
*/ if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: DKIOCGETBLOCKCOUNT (2) failed\n"); + } retval = ENXIO; goto error_exit; } @@ -1103,11 +1374,17 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, if ((retval = (int)buf_meta_bread(devvp, HFS_PHYSBLK_ROUNDDOWN(mdb_offset, (phys_blksize/log_blksize)), phys_blksize, cred, &bp))) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: buf_meta_bread failed with %d\n", retval); + } goto error_exit; } MALLOC(mdbp, HFSMasterDirectoryBlock *, kMDBSize, M_TEMP, M_WAITOK); if (mdbp == NULL) { retval = ENOMEM; + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: MALLOC failed\n"); + } goto error_exit; } bcopy((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize), mdbp, kMDBSize); @@ -1116,25 +1393,27 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, MALLOC(hfsmp, struct hfsmount *, sizeof(struct hfsmount), M_HFSMNT, M_WAITOK); if (hfsmp == NULL) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: MALLOC (2) failed\n"); + } retval = ENOMEM; goto error_exit; } bzero(hfsmp, sizeof(struct hfsmount)); hfs_chashinit_finish(hfsmp); - + /* - * See if the disk supports unmap (trim). - * - * NOTE: vfs_init_io_attributes has not been called yet, so we can't use the io_flags field - * returned by vfs_ioattr. We need to call VNOP_IOCTL ourselves. + * See if the disk is a solid state device. We need this to decide what to do about + * hotfiles. 
*/ - if (VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&device_features, 0, context) == 0) { - if (device_features & DK_FEATURE_UNMAP) { - hfsmp->hfs_flags |= HFS_UNMAP; + if (VNOP_IOCTL(devvp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, context) == 0) { + if (isssd) { + hfsmp->hfs_flags |= HFS_SSD; } } - + + /* * Init the volume information structure */ @@ -1143,7 +1422,8 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, lck_mtx_init(&hfsmp->hfc_mutex, hfs_mutex_group, hfs_lock_attr); lck_rw_init(&hfsmp->hfs_global_lock, hfs_rwlock_group, hfs_lock_attr); lck_rw_init(&hfsmp->hfs_insync, hfs_rwlock_group, hfs_lock_attr); - + lck_spin_init(&hfsmp->vcbFreeExtLock, hfs_spinlock_group, hfs_lock_attr); + vfs_setfsprivate(mp, hfsmp); hfsmp->hfs_mp = mp; /* Make VFSTOHFS work */ hfsmp->hfs_raw_dev = vnode_specrdev(devvp); @@ -1216,6 +1496,9 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, retval = EROFS; goto error_exit; } + + printf("hfs_mountfs: Mounting HFS Standard volumes was deprecated in Mac OS 10.7 \n"); + /* Treat it as if it's read-only and not writeable */ hfsmp->hfs_flags |= HFS_READ_ONLY; hfsmp->hfs_flags &= ~HFS_WRITEABLE_MEDIA; @@ -1287,11 +1570,18 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, log_blksize = 512; if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) { + + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: DKIOCSETBLOCKSIZE (3) failed\n"); + } retval = ENXIO; goto error_exit; } if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: DKIOCGETBLOCKCOUNT (3) failed\n"); + } retval = ENXIO; goto error_exit; } @@ -1314,8 +1604,12 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, mdb_offset = (daddr64_t)((embeddedOffset / log_blksize) + HFS_PRI_SECTOR(log_blksize)); retval = (int)buf_meta_bread(devvp, 
HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys), phys_blksize, cred, &bp); - if (retval) + if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: buf_meta_bread (2) failed with %d\n", retval); + } goto error_exit; + } bcopy((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize), mdbp, 512); buf_brelse(bp); bp = NULL; @@ -1326,6 +1620,10 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, vhp = (HFSPlusVolumeHeader*) mdbp; } + if (isroot) { + hfs_root_unmounted_cleanly = (SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) != 0; + } + /* * On inconsistent disks, do not allow read-write mount * unless it is the boot volume being mounted. We also @@ -1338,6 +1636,10 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, && (SWAP_BE32(vhp->attributes) & kHFSVolumeInconsistentMask) && !journal_replay_only && !(hfsmp->hfs_flags & HFS_READ_ONLY)) { + + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: failed to mount non-root inconsistent disk\n"); + } retval = EINVAL; goto error_exit; } @@ -1375,6 +1677,9 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, // EROFS is a special error code that means the volume has an external // journal which we couldn't find. in that case we do not want to // rewrite the volume header - we'll just refuse to mount the volume. + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: hfs_early_journal_init indicated external jnl \n"); + } retval = EINVAL; goto error_exit; } @@ -1383,7 +1688,11 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, // to be "FSK!" which fsck_hfs will see and force the fsck instead // of just bailing out because the volume is journaled. 
if (!ronly) { - HFSPlusVolumeHeader *jvhp; + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: hfs_early_journal_init failed, setting to FSK \n"); + } + + HFSPlusVolumeHeader *jvhp; hfsmp->hfs_flags |= HFS_NEED_JNL_RESET; @@ -1418,6 +1727,9 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, // in the hopes that fsck_hfs will be able to // fix any damage that exists on the volume. if ( !(vfs_flags(mp) & MNT_ROOTFS)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: hfs_early_journal_init failed, erroring out \n"); + } retval = EINVAL; goto error_exit; } @@ -1446,10 +1758,16 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, "(%d) switching to 512\n", log_blksize); log_blksize = 512; if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: DKIOCSETBLOCKSIZE (4) failed \n"); + } retval = ENXIO; goto error_exit; } if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: DKIOCGETBLOCKCOUNT (4) failed \n"); + } retval = ENXIO; goto error_exit; } @@ -1470,6 +1788,9 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, // to be "FSK!" which fsck_hfs will see and force the fsck instead // of just bailing out because the volume is journaled. if (!ronly) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: hfs_early_journal_init (2) resetting.. \n"); + } HFSPlusVolumeHeader *jvhp; hfsmp->hfs_flags |= HFS_NEED_JNL_RESET; @@ -1504,6 +1825,9 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, // in the hopes that fsck_hfs will be able to // fix any damage that exists on the volume. 
if ( !(vfs_flags(mp) & MNT_ROOTFS)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: hfs_early_journal_init (2) failed \n"); + } retval = EINVAL; goto error_exit; } @@ -1512,6 +1836,9 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, /* Try again with a smaller block size... */ retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args, cred); + if (retval && HFS_MOUNT_DEBUG) { + printf("hfs_MountHFSPlusVolume (late) returned %d\n",retval); + } } if (retval) (void) hfs_relconverter(0); @@ -1522,6 +1849,9 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, hfsmp->hfs_last_mounted_mtime = hfsmp->hfs_mtime; if ( retval ) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: encountered failure %d \n", retval); + } goto error_exit; } @@ -1538,7 +1868,7 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, mp->mnt_vtable->vfc_vfsflags |= VFC_VFSDIRLINKS; } else { /* HFS standard doesn't support extended readdir! */ - mp->mnt_vtable->vfc_vfsflags &= ~VFC_VFSREADDIR_EXTENDED; + mount_set_noreaddirext (mp); } if (args) { @@ -1563,10 +1893,10 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, /* * Set the free space warning levels for the root volume: * - * Set the "danger" limit to 5% of the volume size or 125MB, whichever - * is less. Set the "warning" limit to 10% of the volume size or 250MB, + * Set the "danger" limit to 5% of the volume size or 512MB, whichever + * is less. Set the "warning" limit to 10% of the volume size or 1GB, * whichever is less. And last, set the "desired" freespace level to - * to 11% of the volume size or 375MB, whichever is less. + * to 11% of the volume size or 1.25GB, whichever is less. 
*/ hfsmp->hfs_freespace_notify_dangerlimit = MIN(HFS_ROOTVERYLOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize, @@ -1598,6 +1928,32 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, } } } + +#if CONFIG_HFS_ALLOC_RBTREE + /* + * We spawn a thread to create the pair of red-black trees for this volume. + * However, in so doing, we must be careful to ensure that if this thread is still + * running after mount has finished, it doesn't interfere with an unmount. Specifically, + * we'll need to set a bit that indicates we're in progress building the trees here. + * Unmount will check for this bit, and then if it's set, mark a corresponding bit that + * notifies the tree generation code that an unmount is waiting. Also mark the bit that + * indicates the tree is live and operating. + * + * Only do this if we're operating on a read-write mount (we wouldn't care for read-only). + */ + + if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) { + hfsmp->extent_tree_flags |= (HFS_ALLOC_TREEBUILD_INFLIGHT | HFS_ALLOC_RB_ENABLED); + + /* Initialize EOF counter so that the thread can assume it started at initial values */ + hfsmp->offset_block_end = 0; + InitTree(hfsmp); + + kernel_thread_start ((thread_continue_t) hfs_initialize_allocator , hfsmp, &allocator_thread); + thread_deallocate(allocator_thread); + } + +#endif /* * Start looking for free space to drop below this level and generate a @@ -1628,7 +1984,7 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, vnode_rele(hfsmp->hfs_devvp); } hfs_delete_chash(hfsmp); - + FREE(hfsmp, M_HFSMNT); vfs_setfsprivate(mp, NULL); } @@ -1651,7 +2007,7 @@ hfs_start(__unused struct mount *mp, __unused int flags, __unused vfs_context_t /* * unmount system call */ -static int +int hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context) { struct proc *p = vfs_context_proc(context); @@ -1660,6 +2016,7 @@ hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context) int flags; 
int force; int started_tr = 0; + int rb_used = 0; flags = 0; force = 0; @@ -1706,6 +2063,10 @@ hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context) panic("hfs_unmount: pm_sync_incomplete underflow!\n"); } +#if CONFIG_HFS_ALLOC_RBTREE + rb_used = hfs_teardown_allocator(hfsmp); +#endif + /* * Flush out the b-trees, volume bitmap and Volume Header */ @@ -1768,22 +2129,31 @@ hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context) HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeUnmountedMask; } - if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { - int i; - u_int32_t min_start = hfsmp->totalBlocks; - - // set the nextAllocation pointer to the smallest free block number - // we've seen so on the next mount we won't rescan unnecessarily - for(i=0; i < (int)hfsmp->vcbFreeExtCnt; i++) { - if (hfsmp->vcbFreeExt[i].startBlock < min_start) { - min_start = hfsmp->vcbFreeExt[i].startBlock; + + if (rb_used) { + /* If the rb-tree was live, just set min_start to 0 */ + hfsmp->nextAllocation = 0; + } + else { + if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { + int i; + u_int32_t min_start = hfsmp->totalBlocks; + + // set the nextAllocation pointer to the smallest free block number + // we've seen so on the next mount we won't rescan unnecessarily + lck_spin_lock(&hfsmp->vcbFreeExtLock); + for(i=0; i < (int)hfsmp->vcbFreeExtCnt; i++) { + if (hfsmp->vcbFreeExt[i].startBlock < min_start) { + min_start = hfsmp->vcbFreeExt[i].startBlock; + } + } + lck_spin_unlock(&hfsmp->vcbFreeExtLock); + if (min_start < hfsmp->nextAllocation) { + hfsmp->nextAllocation = min_start; } - } - if (min_start < hfsmp->nextAllocation) { - hfsmp->nextAllocation = min_start; } } - + retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0); if (retval) { @@ -1799,7 +2169,7 @@ hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context) } if (hfsmp->jnl) { - hfs_journal_flush(hfsmp); + hfs_journal_flush(hfsmp, FALSE); } /* @@ -1807,11 +2177,6 @@ hfs_unmount(struct mount *mp, int mntflags, vfs_context_t 
context) */ (void) hfsUnmount(hfsmp, p); - /* - * Last chance to dump unreferenced system files. - */ - (void) vflush(mp, NULLVP, FORCECLOSE); - if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord) (void) hfs_relconverter(hfsmp->hfs_encoding); @@ -1833,7 +2198,12 @@ hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context) } // XXXdbg -#ifdef HFS_SPARSE_DEV + /* + * Last chance to dump unreferenced system files. + */ + (void) vflush(mp, NULLVP, FORCECLOSE); + +#if HFS_SPARSE_DEV /* Drop our reference on the backing fs (if any). */ if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) && hfsmp->hfs_backingfs_rootvp) { struct vnode * tmpvp; @@ -1845,6 +2215,7 @@ hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context) } #endif /* HFS_SPARSE_DEV */ lck_mtx_destroy(&hfsmp->hfc_mutex, hfs_mutex_group); + lck_spin_destroy(&hfsmp->vcbFreeExtLock, hfs_spinlock_group); vnode_rele(hfsmp->hfs_devvp); hfs_delete_chash(hfsmp); @@ -1866,7 +2237,7 @@ hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context) static int hfs_vfs_root(struct mount *mp, struct vnode **vpp, __unused vfs_context_t context) { - return hfs_vget(VFSTOHFS(mp), (cnid_t)kHFSRootFolderID, vpp, 1); + return hfs_vget(VFSTOHFS(mp), (cnid_t)kHFSRootFolderID, vpp, 1, 0); } @@ -1887,7 +2258,7 @@ hfs_quotactl(struct mount *mp, int cmds, uid_t uid, caddr_t datap, vfs_context_t int cmd, type, error; if (uid == ~0U) - uid = vfs_context_ucred(context)->cr_ruid; + uid = kauth_cred_getuid(vfs_context_ucred(context)); cmd = cmds >> SUBCMDSHIFT; switch (cmd) { @@ -1895,7 +2266,7 @@ hfs_quotactl(struct mount *mp, int cmds, uid_t uid, caddr_t datap, vfs_context_t case Q_QUOTASTAT: break; case Q_GETQUOTA: - if (uid == vfs_context_ucred(context)->cr_ruid) + if (uid == kauth_cred_getuid(vfs_context_ucred(context))) break; /* fall through */ default: @@ -1958,7 +2329,7 @@ hfs_quotactl(struct mount *mp, int cmds, uid_t uid, caddr_t datap, vfs_context_t /* * Get file system statistics. 
*/ -static int +int hfs_statfs(struct mount *mp, register struct vfsstatfs *sbp, __unused vfs_context_t context) { ExtendedVCB *vcb = VFSTOVCB(mp); @@ -2099,7 +2470,7 @@ hfs_sync_callback(struct vnode *vp, void *cargs) * * Note: we are always called with the filesystem marked `MPBUSY'. */ -static int +int hfs_sync(struct mount *mp, int waitfor, vfs_context_t context) { struct proc *p = vfs_context_proc(context); @@ -2203,7 +2574,7 @@ hfs_sync(struct mount *mp, int waitfor, vfs_context_t context) } if (hfsmp->jnl) { - hfs_journal_flush(hfsmp); + hfs_journal_flush(hfsmp, FALSE); } { @@ -2244,7 +2615,7 @@ hfs_fhtovp(struct mount *mp, int fhlen, unsigned char *fhp, struct vnode **vpp, if (fhlen < (int)sizeof(struct hfsfid)) return (EINVAL); - result = hfs_vget(VFSTOHFS(mp), ntohl(hfsfhp->hfsfid_cnid), &nvp, 0); + result = hfs_vget(VFSTOHFS(mp), ntohl(hfsfhp->hfsfid_cnid), &nvp, 0, 0); if (result) { if (result == ENOENT) result = ESTALE; @@ -2319,6 +2690,7 @@ hfs_init(__unused struct vfsconf *vfsp) hfs_group_attr = lck_grp_attr_alloc_init(); hfs_mutex_group = lck_grp_alloc_init("hfs-mutex", hfs_group_attr); hfs_rwlock_group = lck_grp_alloc_init("hfs-rwlock", hfs_group_attr); + hfs_spinlock_group = lck_grp_alloc_init("hfs-spinlock", hfs_group_attr); #if HFS_COMPRESSION decmpfs_init(); @@ -2359,7 +2731,7 @@ hfs_getmountpoint(struct vnode *vp, struct hfsmount **hfsmpp) /* * HFS filesystem related variables. */ -static int +int hfs_sysctl(int *name, __unused u_int namelen, user_addr_t oldp, size_t *oldlenp, user_addr_t newp, size_t newlen, vfs_context_t context) { @@ -2505,6 +2877,15 @@ hfs_sysctl(int *name, __unused u_int namelen, user_addr_t oldp, size_t *oldlenp, 0, hfs_sync_metadata, hfsmp->hfs_mp); + /* + * Set up the trim callback function so that we can add + * recently freed extents to the free extent cache once + * the transaction that freed them is written to the + * journal on disk. 
+ */ + if (jnl) + journal_trim_set_callback(jnl, hfs_trim_callback, hfsmp); + if (jnl == NULL) { printf("hfs: FAILED to create the journal!\n"); if (jvp && jvp != hfsmp->hfs_devvp) { @@ -2516,17 +2897,17 @@ hfs_sysctl(int *name, __unused u_int namelen, user_addr_t oldp, size_t *oldlenp, return EINVAL; } - hfs_global_exclusive_lock_acquire(hfsmp); - + hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK); + /* * Flush all dirty metadata buffers. */ - buf_flushdirtyblks(hfsmp->hfs_devvp, MNT_WAIT, 0, "hfs_sysctl"); - buf_flushdirtyblks(hfsmp->hfs_extents_vp, MNT_WAIT, 0, "hfs_sysctl"); - buf_flushdirtyblks(hfsmp->hfs_catalog_vp, MNT_WAIT, 0, "hfs_sysctl"); - buf_flushdirtyblks(hfsmp->hfs_allocation_vp, MNT_WAIT, 0, "hfs_sysctl"); + buf_flushdirtyblks(hfsmp->hfs_devvp, TRUE, 0, "hfs_sysctl"); + buf_flushdirtyblks(hfsmp->hfs_extents_vp, TRUE, 0, "hfs_sysctl"); + buf_flushdirtyblks(hfsmp->hfs_catalog_vp, TRUE, 0, "hfs_sysctl"); + buf_flushdirtyblks(hfsmp->hfs_allocation_vp, TRUE, 0, "hfs_sysctl"); if (hfsmp->hfs_attribute_vp) - buf_flushdirtyblks(hfsmp->hfs_attribute_vp, MNT_WAIT, 0, "hfs_sysctl"); + buf_flushdirtyblks(hfsmp->hfs_attribute_vp, TRUE, 0, "hfs_sysctl"); HFSTOVCB(hfsmp)->vcbJinfoBlock = name[1]; HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeJournaledMask; @@ -2541,7 +2922,7 @@ hfs_sysctl(int *name, __unused u_int namelen, user_addr_t oldp, size_t *oldlenp, vfs_setflags(hfsmp->hfs_mp, (u_int64_t)((unsigned int)MNT_JOURNALED)); - hfs_global_exclusive_lock_release(hfsmp); + hfs_unlock_global (hfsmp); hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1); { @@ -2576,7 +2957,7 @@ hfs_sysctl(int *name, __unused u_int namelen, user_addr_t oldp, size_t *oldlenp, printf("hfs: disabling journaling for mount @ %p\n", vnode_mount(vp)); - hfs_global_exclusive_lock_acquire(hfsmp); + hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK); // Lights out for you buddy! 
journal_close(hfsmp->jnl); @@ -2595,7 +2976,8 @@ hfs_sysctl(int *name, __unused u_int namelen, user_addr_t oldp, size_t *oldlenp, HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeJournaledMask; - hfs_global_exclusive_lock_release(hfsmp); + hfs_unlock_global (hfsmp); + hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1); { @@ -2676,6 +3058,10 @@ hfs_sysctl(int *name, __unused u_int namelen, user_addr_t oldp, size_t *oldlenp, file_drop(device_fd); vnode_put(devvp); return error; + } else if (name[0] == HFS_ENABLE_RESIZE_DEBUG) { + hfs_resize_debug = 1; + printf ("hfs_sysctl: Enabled volume resize debugging.\n"); + return 0; } return (ENOTSUP); @@ -2696,7 +3082,7 @@ hfs_vfs_vget(struct mount *mp, ino64_t ino, struct vnode **vpp, __unused vfs_con hfsmp = VFSTOHFS(mp); - error = hfs_vget(hfsmp, (cnid_t)ino, vpp, 1); + error = hfs_vget(hfsmp, (cnid_t)ino, vpp, 1, 0); if (error) return (error); @@ -2737,9 +3123,8 @@ hfs_vfs_vget(struct mount *mp, ino64_t ino, struct vnode **vpp, __unused vfs_con * * If the object is a file then it will represent the data fork. */ -__private_extern__ int -hfs_vget(struct hfsmount *hfsmp, cnid_t cnid, struct vnode **vpp, int skiplock) +hfs_vget(struct hfsmount *hfsmp, cnid_t cnid, struct vnode **vpp, int skiplock, int allow_deleted) { struct vnode *vp = NULLVP; struct cat_desc cndesc; @@ -2761,7 +3146,7 @@ hfs_vget(struct hfsmount *hfsmp, cnid_t cnid, struct vnode **vpp, int skiplock) /* * Check the hash first */ - vp = hfs_chash_getvnode(hfsmp, cnid, 0, skiplock); + vp = hfs_chash_getvnode(hfsmp, cnid, 0, skiplock, allow_deleted); if (vp) { *vpp = vp; return(0); @@ -2841,7 +3226,7 @@ hfs_vget(struct hfsmount *hfsmp, cnid_t cnid, struct vnode **vpp, int skiplock) * Pick up the first link in the chain and get a descriptor for it. * This allows blind volfs paths to work for hardlinks. 
*/ - if ((hfs_lookuplink(hfsmp, linkref, &prevlinkid, &nextlinkid) == 0) && + if ((hfs_lookup_siblinglinks(hfsmp, linkref, &prevlinkid, &nextlinkid) == 0) && (nextlinkid != 0)) { lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); error = cat_findname(hfsmp, nextlinkid, &linkdesc); @@ -2854,13 +3239,17 @@ hfs_vget(struct hfsmount *hfsmp, cnid_t cnid, struct vnode **vpp, int skiplock) } if (linkref) { - error = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cnfork, &vp); + int newvnode_flags = 0; + + error = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, + &cnfork, &vp, &newvnode_flags); if (error == 0) { VTOC(vp)->c_flag |= C_HARDLINK; vnode_setmultipath(vp); } } else { struct componentname cn; + int newvnode_flags = 0; /* Supply hfs_getnewvnode with a component name. */ MALLOC_ZONE(cn.cn_pnbuf, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); @@ -2874,7 +3263,8 @@ hfs_vget(struct hfsmount *hfsmp, cnid_t cnid, struct vnode **vpp, int skiplock) cn.cn_consume = 0; bcopy(cndesc.cd_nameptr, cn.cn_nameptr, cndesc.cd_namelen + 1); - error = hfs_getnewvnode(hfsmp, NULLVP, &cn, &cndesc, 0, &cnattr, &cnfork, &vp); + error = hfs_getnewvnode(hfsmp, NULLVP, &cn, &cndesc, 0, &cnattr, + &cnfork, &vp, &newvnode_flags); if (error == 0 && (VTOC(vp)->c_flag & C_HARDLINK)) { hfs_savelinkorigin(VTOC(vp), cndesc.cd_parentcnid); @@ -2927,7 +3317,7 @@ hfs_flushfiles(struct mount *mp, int flags, __unused struct proc *p) } /* Obtain the root vnode so we can skip over it. 
*/ - skipvp = hfs_chash_getvnode(hfsmp, kHFSRootFolderID, 0, 0); + skipvp = hfs_chash_getvnode(hfsmp, kHFSRootFolderID, 0, 0, 0); } #endif /* QUOTA */ @@ -3004,7 +3394,6 @@ hfs_setencodingbits(struct hfsmount *hfsmp, u_int32_t encoding) * * On journal volumes this will cause a volume header flush */ -__private_extern__ int hfs_volupdate(struct hfsmount *hfsmp, enum volop op, int inroot) { @@ -3079,7 +3468,7 @@ hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush) mdb = (HFSMasterDirectoryBlock *)(buf_dataptr(bp) + HFS_PRI_OFFSET(sectorsize)); - mdb->drCrDate = SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->vcbCrDate))); + mdb->drCrDate = SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->hfs_itime))); mdb->drLsMod = SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->vcbLsMod))); mdb->drAtrb = SWAP_BE16 (vcb->vcbAtrb); mdb->drNmFls = SWAP_BE16 (vcb->vcbNmFls); @@ -3156,7 +3545,6 @@ hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush) * not flushed since the on-disk "H+" and "HX" signatures * are always stored in-memory as "H+". */ -__private_extern__ int hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush) { @@ -3464,7 +3852,6 @@ hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush) /* * Extend a file system. */ -__private_extern__ int hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) { @@ -3509,7 +3896,7 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) * ownership and check permissions. 
*/ if (suser(cred, NULL)) { - error = hfs_vget(hfsmp, kHFSRootFolderID, &vp, 0); + error = hfs_vget(hfsmp, kHFSRootFolderID, &vp, 0, 0); if (error) return (error); @@ -3562,7 +3949,11 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) addblks = newblkcnt - vcb->totalBlocks; - printf("hfs_extendfs: growing %s by %d blocks\n", vcb->vcbVN, addblks); + if (hfs_resize_debug) { + printf ("hfs_extendfs: old: size=%qu, blkcnt=%u\n", oldsize, hfsmp->totalBlocks); + printf ("hfs_extendfs: new: size=%qu, blkcnt=%u, addblks=%u\n", newsize, (u_int32_t)newblkcnt, addblks); + } + printf("hfs_extendfs: will extend \"%s\" by %d blocks\n", vcb->vcbVN, addblks); HFS_MOUNT_LOCK(hfsmp, TRUE); if (hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) { @@ -3573,9 +3964,6 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) hfsmp->hfs_flags |= HFS_RESIZE_IN_PROGRESS; HFS_MOUNT_UNLOCK(hfsmp, TRUE); - /* Invalidate the current free extent cache */ - invalidate_free_extent_cache(hfsmp); - /* * Enclose changes inside a transaction. */ @@ -3604,6 +3992,17 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) else bitmapblks = 0; + /* + * The allocation bitmap can contain unused bits that are beyond end of + * current volume's allocation blocks. Usually they are supposed to be + * zero'ed out but there can be cases where they might be marked as used. + * After extending the file system, those bits can represent valid + * allocation blocks, so we mark all the bits from the end of current + * volume to end of allocation bitmap as "free". + */ + BlockMarkFreeUnused(vcb, vcb->totalBlocks, + (fp->ff_blocks * vcb->blockSize * 8) - vcb->totalBlocks); + if (bitmapblks > 0) { daddr64_t blkno; daddr_t blkcnt; @@ -3623,8 +4022,8 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) * zone. 
*/ error = ExtendFileC(vcb, fp, bitmapblks * vcb->blockSize, 0, - kEFAllMask | kEFNoClumpMask | kEFReserveMask | kEFMetadataMask, - &bytesAdded); + kEFAllMask | kEFNoClumpMask | kEFReserveMask + | kEFMetadataMask | kEFContigMask, &bytesAdded); if (error == 0) { usedExtendFileC = true; @@ -3736,7 +4135,8 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) * Restore to old state. */ if (usedExtendFileC) { - (void) TruncateFileC(vcb, fp, oldBitmapSize, false); + (void) TruncateFileC(vcb, fp, oldBitmapSize, 0, FORK_IS_RSRC(fp), + FTOC(fp)->c_fileid, false); } else { fp->ff_blocks -= bitmapblks; fp->ff_size -= (u_int64_t)bitmapblks * (u_int64_t)vcb->blockSize; @@ -3752,10 +4152,15 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) hfsmp->hfs_logical_block_count = prev_phys_block_count; hfsmp->hfs_alt_id_sector = prev_alt_sector; MarkVCBDirty(vcb); - if (vcb->blockSize == 512) - (void) BlockMarkAllocated(vcb, vcb->totalBlocks - 2, 2); - else - (void) BlockMarkAllocated(vcb, vcb->totalBlocks - 1, 1); + if (vcb->blockSize == 512) { + if (BlockMarkAllocated(vcb, vcb->totalBlocks - 2, 2)) { + hfs_mark_volume_inconsistent(hfsmp); + } + } else { + if (BlockMarkAllocated(vcb, vcb->totalBlocks - 1, 1)) { + hfs_mark_volume_inconsistent(hfsmp); + } + } goto out; } /* @@ -3779,7 +4184,7 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) /* * Update the metadata zone size based on current volume size */ - hfs_metadatazone_init(hfsmp); + hfs_metadatazone_init(hfsmp, false); /* * Adjust the size of hfsmp->hfs_attrdata_vp @@ -3801,21 +4206,36 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) } } + /* + * Update the R/B Tree if necessary. 
Since we don't have to drop the systemfile + * locks in the middle of these operations like we do in the truncate case + * where we have to relocate files, we can only update the red-black tree + * if there were actual changes made to the bitmap. Also, we can't really scan the + * new portion of the bitmap before it has been allocated. The BlockMarkAllocated + * routines are smart enough to avoid the r/b tree if the portion they are manipulating is + * not currently controlled by the tree. + * + * We only update hfsmp->allocLimit if totalBlocks actually increased. + */ + + if (error == 0) { + UpdateAllocLimit(hfsmp, hfsmp->totalBlocks); + } + + /* Log successful extending */ + printf("hfs_extendfs: extended \"%s\" to %d blocks (was %d blocks)\n", + hfsmp->vcbVN, hfsmp->totalBlocks, (u_int32_t)(oldsize/hfsmp->blockSize)); + out: if (error && fp) { /* Restore allocation fork. */ bcopy(&forkdata, &fp->ff_data, sizeof(forkdata)); VTOC(vp)->c_blocks = fp->ff_blocks; - + } - /* - Regardless of whether or not the totalblocks actually increased, - we should reset the allocLimit field. If it changed, it will - get updated; if not, it will remain the same. - */ + HFS_MOUNT_LOCK(hfsmp, TRUE); hfsmp->hfs_flags &= ~HFS_RESIZE_IN_PROGRESS; - hfsmp->allocLimit = vcb->totalBlocks; HFS_MOUNT_UNLOCK(hfsmp, TRUE); if (lockflags) { hfs_systemfile_unlock(hfsmp, lockflags); @@ -3824,7 +4244,7 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) hfs_end_transaction(hfsmp); } - return (error); + return MacToVFSError(error); } #define HFS_MIN_SIZE (32LL * 1024LL * 1024LL) @@ -3832,7 +4252,6 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) /* * Truncate a file system (while still mounted). 
*/ -__private_extern__ int hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) { @@ -3843,17 +4262,19 @@ hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) int lockflags = 0; int transaction_begun = 0; Boolean updateFreeBlocks = false; - int error; + Boolean disable_sparse = false; + int error = 0; - HFS_MOUNT_LOCK(hfsmp, TRUE); + lck_mtx_lock(&hfsmp->hfs_mutex); if (hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) { - HFS_MOUNT_UNLOCK(hfsmp, TRUE); + lck_mtx_unlock(&hfsmp->hfs_mutex); return (EALREADY); } hfsmp->hfs_flags |= HFS_RESIZE_IN_PROGRESS; - hfsmp->hfs_resize_filesmoved = 0; - hfsmp->hfs_resize_totalfiles = 0; - HFS_MOUNT_UNLOCK(hfsmp, TRUE); + hfsmp->hfs_resize_blocksmoved = 0; + hfsmp->hfs_resize_totalblocks = 0; + hfsmp->hfs_resize_progress = 0; + lck_mtx_unlock(&hfsmp->hfs_mutex); /* * - Journaled HFS Plus volumes only. @@ -3882,25 +4303,66 @@ hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) error = EINVAL; goto out; } - /* Make sure that the file system has enough free blocks reclaim */ - if (reclaimblks >= hfs_freeblks(hfsmp, 1)) { - printf("hfs_truncatefs: insufficient space (need %u blocks; have %u free blocks)\n", reclaimblks, hfs_freeblks(hfsmp, 1)); - error = ENOSPC; - goto out; - } - - /* Invalidate the current free extent cache */ - invalidate_free_extent_cache(hfsmp); - - /* Start with a clean journal. */ - hfs_journal_flush(hfsmp); + + /* + * Make sure that the file system has enough free blocks reclaim. + * + * Before resize, the disk is divided into four zones - + * A. Allocated_Stationary - These are allocated blocks that exist + * before the new end of disk. These blocks will not be + * relocated or modified during resize. + * B. Free_Stationary - These are free blocks that exist before the + * new end of disk. 
These blocks can be used for any new + * allocations during resize, including allocation for relocating + * data from the area of disk being reclaimed. + * C. Allocated_To-Reclaim - These are allocated blocks that exist + * beyond the new end of disk. These blocks need to be reclaimed + * during resize by allocating equal number of blocks in Free + * Stationary zone and copying the data. + * D. Free_To-Reclaim - These are free blocks that exist beyond the + * new end of disk. Nothing special needs to be done to reclaim + * them. + * + * Total number of blocks on the disk before resize: + * ------------------------------------------------ + * Total Blocks = Allocated_Stationary + Free_Stationary + + * Allocated_To-Reclaim + Free_To-Reclaim + * + * Total number of blocks that need to be reclaimed: + * ------------------------------------------------ + * Blocks to Reclaim = Allocated_To-Reclaim + Free_To-Reclaim + * + * Note that the check below also makes sure that we have enough space + * to relocate data from Allocated_To-Reclaim to Free_Stationary. + * Therefore we do not need to check total number of blocks to relocate + * later in the code. + * + * The condition below gets converted to: + * + * Allocated To-Reclaim + Free To-Reclaim >= Free Stationary + Free To-Reclaim + * + * which is equivalent to: + * + * Allocated To-Reclaim >= Free Stationary + */ + if (reclaimblks >= hfs_freeblks(hfsmp, 1)) { + printf("hfs_truncatefs: insufficient space (need %u blocks; have %u free blocks)\n", reclaimblks, hfs_freeblks(hfsmp, 1)); + error = ENOSPC; + goto out; + } + + /* Start with a clean journal. */ + hfs_journal_flush(hfsmp, TRUE); if (hfs_start_transaction(hfsmp) != 0) { error = EINVAL; goto out; } transaction_begun = 1; - + + /* Take the bitmap lock to update the alloc limit field */ + lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + /* * Prevent new allocations from using the part we're trying to truncate. 
* @@ -3909,12 +4371,36 @@ hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) * interfere with allocating the new alternate volume header, and no files * in the allocation blocks beyond (i.e. the blocks we're trying to * truncate away. + * + * Also shrink the red-black tree if needed. + */ + if (hfsmp->blockSize == 512) { + error = UpdateAllocLimit (hfsmp, newblkcnt - 2); + } + else { + error = UpdateAllocLimit (hfsmp, newblkcnt - 1); + } + + /* Sparse devices use first fit allocation which is not ideal + * for volume resize which requires best fit allocation. If a + * sparse device is being truncated, disable the sparse device + * property temporarily for the duration of resize. Also reset + * the free extent cache so that it is rebuilt as sorted by + * totalBlocks instead of startBlock. + * + * Note that this will affect all allocations on the volume and + * ideal fix would be just to modify resize-related allocations, + * but it will result in complexity like handling of two free + * extent caches sorted differently, etc. So we stick to this + * solution for now. */ HFS_MOUNT_LOCK(hfsmp, TRUE); - if (hfsmp->blockSize == 512) - hfsmp->allocLimit = newblkcnt - 2; - else - hfsmp->allocLimit = newblkcnt - 1; + if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { + hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE; + ResetVCBFreeExtCache(hfsmp); + disable_sparse = true; + } + /* * Update the volume free block count to reflect the total number * of free blocks that will exist after a successful resize. 
@@ -3928,16 +4414,28 @@ hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) */ hfsmp->freeBlocks -= reclaimblks; updateFreeBlocks = true; - HFS_MOUNT_UNLOCK(hfsmp, TRUE); - + HFS_MOUNT_UNLOCK(hfsmp, TRUE); + + if (lockflags) { + hfs_systemfile_unlock(hfsmp, lockflags); + lockflags = 0; + } + /* - * Update the metadata zone size, and, if required, disable it + * Update the metadata zone size to match the new volume size, + * and, if that size is too small, the metadata zone might be disabled. */ - hfs_metadatazone_init(hfsmp); + hfs_metadatazone_init(hfsmp, false); /* - * Look for files that have blocks at or beyond the location of the - * new alternate volume header + * If some files have blocks at or beyond the location of the + * new alternate volume header, recalculate free blocks and + * reclaim blocks. Otherwise just update free blocks count. + * + * The current allocLimit is set to the location of new alternate + * volume header, and reclaimblks are the total number of blocks + * that need to be reclaimed. So the check below is really + * ignoring the blocks allocated for old alternate volume header. */ if (hfs_isallocated(hfsmp, hfsmp->allocLimit, reclaimblks)) { /* @@ -3967,23 +4465,14 @@ hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) error = EAGAIN; /* tell client to try again */ goto out; } - } - + } + /* * Note: we take the attributes lock in case we have an attribute data vnode * which needs to change size. */ lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE | SFL_EXTENTS | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); - /* - * Mark the old alternate volume header as free. - * We don't bother shrinking allocation bitmap file. - */ - if (hfsmp->blockSize == 512) - (void) BlockMarkFree(hfsmp, hfsmp->totalBlocks - 2, 2); - else - (void) BlockMarkFree(hfsmp, hfsmp->totalBlocks - 1, 1); - /* * Allocate last 1KB for alternate volume header.
*/ @@ -3993,6 +4482,15 @@ hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) goto out; } + /* + * Mark the old alternate volume header as free. + * We don't bother shrinking allocation bitmap file. + */ + if (hfsmp->blockSize == 512) + (void) BlockMarkFree(hfsmp, hfsmp->totalBlocks - 2, 2); + else + (void) BlockMarkFree(hfsmp, hfsmp->totalBlocks - 1, 1); + /* * Invalidate the existing alternate volume header. * @@ -4028,7 +4526,7 @@ hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH); if (error) panic("hfs_truncatefs: unexpected error flushing volume header (%d)\n", error); - + /* * Adjust the size of hfsmp->hfs_attrdata_vp */ @@ -4050,17 +4548,36 @@ hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) } out: - lck_mtx_lock(&hfsmp->hfs_mutex); - if (error && (updateFreeBlocks == true)) + /* + * Update the allocLimit to acknowledge the last one or two blocks now. + * Add it to the tree as well if necessary. + */ + UpdateAllocLimit (hfsmp, hfsmp->totalBlocks); + + HFS_MOUNT_LOCK(hfsmp, TRUE); + if (disable_sparse == true) { + /* Now that resize is completed, set the volume to be sparse + * device again so that all further allocations will be first + * fit instead of best fit. Reset free extent cache so that + * it is rebuilt. 
+ */ + hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE; + ResetVCBFreeExtCache(hfsmp); + } + + if (error && (updateFreeBlocks == true)) { hfsmp->freeBlocks += reclaimblks; - hfsmp->allocLimit = hfsmp->totalBlocks; - if (hfsmp->nextAllocation >= hfsmp->allocLimit) + } + + if (hfsmp->nextAllocation >= hfsmp->allocLimit) { hfsmp->nextAllocation = hfsmp->hfs_metazone_end + 1; + } hfsmp->hfs_flags &= ~HFS_RESIZE_IN_PROGRESS; - HFS_MOUNT_UNLOCK(hfsmp, TRUE); + HFS_MOUNT_UNLOCK(hfsmp, TRUE); + /* On error, reset the metadata zone for original volume size */ if (error && (updateFreeBlocks == true)) { - hfs_metadatazone_init(hfsmp); + hfs_metadatazone_init(hfsmp, false); } if (lockflags) { @@ -4068,12 +4585,12 @@ hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) } if (transaction_begun) { hfs_end_transaction(hfsmp); - hfs_journal_flush(hfsmp); + hfs_journal_flush(hfsmp, FALSE); /* Just to be sure, sync all data to the disk */ (void) VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context); } - return (error); + return MacToVFSError(error); } @@ -4135,6 +4652,9 @@ hfs_copy_extent( u_int32_t ioSizeSectors; /* Device sectors in this I/O */ daddr64_t srcSector, destSector; u_int32_t sectorsPerBlock = hfsmp->blockSize / hfsmp->hfs_logical_block_size; +#if CONFIG_PROTECT + int cpenabled = 0; +#endif /* * Sanity check that we have locked the vnode of the file we're copying. @@ -4147,6 +4667,25 @@ hfs_copy_extent( if (cp != hfsmp->hfs_allocation_cp && cp->c_lockowner != current_thread()) panic("hfs_copy_extent: vp=%p (cp=%p) not owned?\n", vp, cp); +#if CONFIG_PROTECT + /* Prepare the CP blob and get it ready for use */ + if (!vnode_issystem (vp) && vnode_isreg(vp) && + cp_fs_protected (hfsmp->hfs_mp)) { + int cp_err = 0; + cp_err = cp_handle_relocate (cp); + if (cp_err) { + /* + * can't copy the file because we couldn't set up keys. 
+ * bail out + */ + return cp_err; + } + else { + cpenabled = 1; + } + } +#endif + /* * Determine the I/O size to use * @@ -4176,7 +4715,14 @@ hfs_copy_extent( buf_setcount(bp, ioSize); buf_setblkno(bp, srcSector); buf_setlblkno(bp, srcSector); - + + /* Attach the CP to the buffer */ +#if CONFIG_PROTECT + if (cpenabled) { + buf_setcpaddr (bp, cp->c_cpentry); + } +#endif + /* Do the read */ err = VNOP_STRATEGY(bp); if (!err) @@ -4194,6 +4740,13 @@ hfs_copy_extent( buf_setlblkno(bp, destSector); if (vnode_issystem(vp) && journal_uses_fua(hfsmp->jnl)) buf_markfua(bp); + +#if CONFIG_PROTECT + /* Attach the CP to the buffer */ + if (cpenabled) { + buf_setcpaddr (bp, cp->c_cpentry); + } +#endif /* Do the write */ vnode_startwrite(hfsmp->hfs_devvp); @@ -4230,342 +4783,941 @@ hfs_copy_extent( } -static int -hfs_relocate_callback(__unused HFSPlusExtentKey *key, HFSPlusExtentRecord *record, HFSPlusExtentRecord *state) -{ - bcopy(state, record, sizeof(HFSPlusExtentRecord)); - return 0; -} +/* Structure to store state of reclaiming extents from a + * given file. hfs_reclaim_file()/hfs_reclaim_xattr() + * initializes the values in this structure which are then + * used by code that reclaims and splits the extents. 
+ */ +struct hfs_reclaim_extent_info { + struct vnode *vp; + u_int32_t fileID; + u_int8_t forkType; + u_int8_t is_dirlink; /* Extent belongs to directory hard link */ + u_int8_t is_sysfile; /* Extent belongs to system file */ + u_int8_t is_xattr; /* Extent belongs to extent-based xattr */ + u_int8_t extent_index; + int lockflags; /* Locks that reclaim and split code should grab before modifying the extent record */ + u_int32_t blocks_relocated; /* Total blocks relocated for this file till now */ + u_int32_t recStartBlock; /* File allocation block number (FABN) for current extent record */ + u_int32_t cur_blockCount; /* Number of allocation blocks that have been checked for reclaim */ + struct filefork *catalog_fp; /* If non-NULL, extent is from catalog record */ + union record { + HFSPlusExtentRecord overflow;/* Extent record from overflow extents btree */ + HFSPlusAttrRecord xattr; /* Attribute record for large EAs */ + } record; + HFSPlusExtentDescriptor *extents; /* Pointer to current extent record being processed. + * For catalog extent record, points to the correct + * extent information in filefork. For overflow extent + * record, or xattr record, points to extent record + * in the structure above + */ + struct cat_desc *dirlink_desc; + struct cat_attr *dirlink_attr; + struct filefork *dirlink_fork; /* For directory hard links, fp points actually to this */ + struct BTreeIterator *iterator; /* Shared read/write iterator, hfs_reclaim_file/xattr() + * use it for reading and hfs_reclaim_extent()/hfs_split_extent() + * use it for writing updated extent record + */ + struct FSBufferDescriptor btdata; /* Shared btdata for reading/writing extent record, same as iterator above */ + u_int16_t recordlen; + int overflow_count; /* For debugging, counter for overflow extent record */ + FCB *fcb; /* Pointer to the current btree being traversed */ +}; -/* - * Reclaim space at the end of a volume, used by a given file. 
+/* + * Split the current extent into two extents, with first extent + * to contain given number of allocation blocks. Splitting of + * extent creates one new extent entry which can result in + * shifting of many entries through all the extent records of a + * file, and/or creating a new extent record in the overflow + * extent btree. * - * This routine attempts to move any extent which contains allocation blocks - * at or after "startblk." A separate transaction is used to do the move. - * The contents of any moved extents are read and written via the volume's - * device vnode -- NOT via "vp." During the move, moved blocks which are part - * of a transaction have their physical block numbers invalidated so they will - * eventually be written to their new locations. + * Example: + * The diagram below represents two consecutive extent records, + * for simplicity, lets call them record X and X+1 respectively. + * Interesting extent entries have been denoted by letters. + * If the letter is unchanged before and after split, it means + * that the extent entry was not modified during the split. + * A '.' means that the entry remains unchanged after the split + * and is not relevant for our example. A '0' means that the + * extent entry is empty. * - * Inputs: - * hfsmp The volume being resized. - * startblk Blocks >= this allocation block need to be moved. - * locks Which locks need to be taken for the given system file. - * vp The vnode for the system file. + * If there isn't sufficient contiguous free space to relocate + * an extent (extent "C" below), we will have to break the one + * extent into multiple smaller extents, and relocate each of + * the smaller extents individually. The way we do this is by + * finding the largest contiguous free space that is currently + * available (N allocation blocks), and then convert extent "C" + * into two extents, C1 and C2, that occupy exactly the same + * allocation blocks as extent C. 
Extent C1 is the first + * N allocation blocks of extent C, and extent C2 is the remainder + * of extent C. Then we can relocate extent C1 since we know + * we have enough contiguous free space to relocate it in its + * entirety. We then repeat the process starting with extent C2. + * + * In record X, only the entries following entry C are shifted, and + * the original entry C is replaced with two entries C1 and C2 which + * are actually two extent entries for contiguous allocation blocks. + * + * Note that the entry E from record X is shifted into record X+1 as + * the new first entry. Since the first entry of record X+1 is updated, + * the FABN will also get updated with the blockCount of entry E. + * This also results in shifting of all extent entries in record X+1. + * Note that the number of empty entries after the split has been + * changed from 3 to 2. + * + * Before: + * record X record X+1 + * ---------------------===--------- --------------------------------- + * | A | . | . | . | B | C | D | E | | F | . | . | . | G | 0 | 0 | 0 | + * ---------------------===--------- --------------------------------- * - * The caller of this function, hfs_reclaimspace(), grabs cnode lock - * for non-system files before calling this function. + * After: + * ---------------------=======----- --------------------------------- + * | A | . | . | . | B | C1| C2| D | | E | F | . | . | . | G | 0 | 0 | + * ---------------------=======----- --------------------------------- * - * Outputs: - * blks_moved Total number of allocation blocks moved by this routine. + * C1.startBlock = C.startBlock + * C1.blockCount = N + * + * C2.startBlock = C.startBlock + N + * C2.blockCount = C.blockCount - N + * + * FABN = old FABN - E.blockCount + * + * Inputs: + * extent_info - This is the structure that contains state about + * the current file, extent, and extent record that + * is being relocated. 
This structure is shared + * among code that traverses through all the extents + * of the file, code that relocates extents, and + * code that splits the extent. + * Output: + * Zero on success, non-zero on failure. */ -static int -hfs_reclaim_file(struct hfsmount *hfsmp, struct vnode *vp, u_long startblk, int locks, u_int32_t *blks_moved, vfs_context_t context) +static int +hfs_split_extent(struct hfs_reclaim_extent_info *extent_info, uint32_t newBlockCount) { - int error; - int lockflags; + int error = 0; + int index = extent_info->extent_index; int i; - u_long datablks; - u_long end_block; - u_int32_t oldStartBlock; - u_int32_t newStartBlock; - u_int32_t oldBlockCount; - u_int32_t newBlockCount; - struct filefork *fp; - struct cnode *cp; - int is_sysfile; - int took_truncate_lock = 0; - struct BTreeIterator *iterator = NULL; - u_int8_t forktype; - u_int32_t fileID; - u_int32_t alloc_flags; - - /* If there is no vnode for this file, then there's nothing to do. */ - if (vp == NULL) - return 0; + HFSPlusExtentDescriptor shift_extent; + HFSPlusExtentDescriptor last_extent; + HFSPlusExtentDescriptor *extents; /* Pointer to current extent record being manipulated */ + HFSPlusExtentRecord *extents_rec = NULL; + HFSPlusExtentKey *extents_key = NULL; + HFSPlusAttrRecord *xattr_rec = NULL; + HFSPlusAttrKey *xattr_key = NULL; + struct BTreeIterator iterator; + struct FSBufferDescriptor btdata; + uint16_t reclen; + uint32_t read_recStartBlock; /* Starting allocation block number to read old extent record */ + uint32_t write_recStartBlock; /* Starting allocation block number to insert newly updated extent record */ + Boolean create_record = false; + Boolean is_xattr; + + is_xattr = extent_info->is_xattr; + extents = extent_info->extents; - cp = VTOC(vp); - fileID = cp->c_cnid; - is_sysfile = vnode_issystem(vp); - forktype = VNODE_IS_RSRC(vp) ? 
0xFF : 0; + if (hfs_resize_debug) { + printf ("hfs_split_extent: Split record:%u recStartBlock=%u %u:(%u,%u) for %u blocks\n", extent_info->overflow_count, extent_info->recStartBlock, index, extents[index].startBlock, extents[index].blockCount, newBlockCount); + } - /* Flush all the buffer cache blocks and cluster pages associated with - * this vnode. - * - * If the current vnode is a system vnode, all the buffer cache blocks - * associated with it should already be sync'ed to the disk as part of - * journal flush in hfs_truncatefs(). Normally there should not be - * buffer cache blocks for regular files, but for objects like symlinks, - * we can have buffer cache blocks associated with the vnode. Therefore - * we call buf_flushdirtyblks() always. Resource fork data for directory - * hard links are directly written using buffer cache for device vnode, - * which should also be sync'ed as part of journal flush in hfs_truncatefs(). - * - * Flushing cluster pages should be the normal case for regular files, - * and really should not do anything for system files. But just to be - * sure that all blocks associated with this vnode is sync'ed to the - * disk, we call both buffer cache and cluster layer functions. + /* Determine the starting allocation block number for the following + * overflow extent record, if any, before the current record + * gets modified. */ - buf_flushdirtyblks(vp, MNT_NOWAIT, 0, "hfs_reclaim_file"); - - if (!is_sysfile) { - /* The caller grabs cnode lock for non-system files only, therefore - * we unlock only non-system files before calling cluster layer. 
- */ - hfs_unlock(cp); - hfs_lock_truncate(cp, TRUE); - took_truncate_lock = 1; + read_recStartBlock = extent_info->recStartBlock; + for (i = 0; i < kHFSPlusExtentDensity; i++) { + if (extents[i].blockCount == 0) { + break; + } + read_recStartBlock += extents[i].blockCount; } - (void) cluster_push(vp, 0); - if (!is_sysfile) { - error = hfs_lock(cp, HFS_FORCE_LOCK); - if (error) { - hfs_unlock_truncate(cp, TRUE); - return error; + + /* Shift and split */ + if (index == kHFSPlusExtentDensity-1) { + /* The new extent created after split will go into following overflow extent record */ + shift_extent.startBlock = extents[index].startBlock + newBlockCount; + shift_extent.blockCount = extents[index].blockCount - newBlockCount; + + /* Last extent in the record will be split, so nothing to shift */ + } else { + /* Splitting of extents can result in at most of one + * extent entry to be shifted into following overflow extent + * record. So, store the last extent entry for later. + */ + shift_extent = extents[kHFSPlusExtentDensity-1]; + + /* Start shifting extent information from the end of the extent + * record to the index where we want to insert the new extent. + * Note that kHFSPlusExtentDensity-1 is already saved above, and + * does not need to be shifted. The extent entry that is being + * split does not get shifted. + */ + for (i = kHFSPlusExtentDensity-2; i > index; i--) { + if (hfs_resize_debug) { + if (extents[i].blockCount) { + printf ("hfs_split_extent: Shift %u:(%u,%u) to %u:(%u,%u)\n", i, extents[i].startBlock, extents[i].blockCount, i+1, extents[i].startBlock, extents[i].blockCount); + } + } + extents[i+1] = extents[i]; } + } - /* If the file no longer exists, nothing left to do */ - if (cp->c_flag & C_NOEXISTS) { - hfs_unlock_truncate(cp, TRUE); - return 0; + if (index == kHFSPlusExtentDensity-1) { + /* The second half of the extent being split will be the overflow + * entry that will go into following overflow extent record. 
The + * value has been stored in 'shift_extent' above, so there is + * nothing to be done here. + */ + } else { + /* Update the values in the second half of the extent being split + * before updating the first half of the split. Note that the + * extent to split or first half of the split is at index 'index' + * and a new extent or second half of the split will be inserted at + * 'index+1' or into following overflow extent record. + */ + extents[index+1].startBlock = extents[index].startBlock + newBlockCount; + extents[index+1].blockCount = extents[index].blockCount - newBlockCount; + } + /* Update the extent being split, only the block count will change */ + extents[index].blockCount = newBlockCount; + + if (hfs_resize_debug) { + printf ("hfs_split_extent: Split %u:(%u,%u) and ", index, extents[index].startBlock, extents[index].blockCount); + if (index != kHFSPlusExtentDensity-1) { + printf ("%u:(%u,%u)\n", index+1, extents[index+1].startBlock, extents[index+1].blockCount); + } else { + printf ("overflow:(%u,%u)\n", shift_extent.startBlock, shift_extent.blockCount); } } - /* Wait for any in-progress writes to this vnode to complete, so that we'll - * be copying consistent bits. (Otherwise, it's possible that an async - * write will complete to the old extent after we read from it. That - * could lead to corruption.) 
+ /* If the newly split extent is for large EAs or is in an overflow extent + * record, update it directly in the btree using the iterator + * information from the shared extent_info structure */ - error = vnode_waitforwrites(vp, 0, 0, 0, "hfs_reclaim_file"); - if (error) { - printf("hfs_reclaim_file: Error %d from vnode_waitforwrites\n", error); - return error; + if (extent_info->catalog_fp == NULL) { + error = BTReplaceRecord(extent_info->fcb, extent_info->iterator, + &(extent_info->btdata), extent_info->recordlen); + if (error) { + printf ("hfs_split_extent: fileID=%u BTReplaceRecord returned error=%d\n", extent_info->fileID, error); + goto out; + } } - - if (hfs_resize_debug) { - printf("hfs_reclaim_file: Start relocating %sfork for fileid=%u name=%.*s\n", (forktype ? "rsrc" : "data"), fileID, cp->c_desc.cd_namelen, cp->c_desc.cd_nameptr); + + /* No extent entry to be shifted into another extent overflow record */ + if (shift_extent.blockCount == 0) { + if (hfs_resize_debug) { + printf ("hfs_split_extent: No extent entry to be shifted into overflow records\n"); + } + error = 0; + goto out; } - /* We always need the allocation bitmap and extents B-tree */ - locks |= SFL_BITMAP | SFL_EXTENTS; + /* The overflow extent entry has to be shifted into an extent + * overflow record. This would mean that we have to shift + * extent entries from all overflow records by one. We will + * start iteration from the first record to the last record, + * and shift the extent entry from one record to another. + * We might have to create a new record for the last extent + * entry for the file. + */ - error = hfs_start_transaction(hfsmp); - if (error) { - printf("hfs_reclaim_file: hfs_start_transaction returned %d\n", error); - if (took_truncate_lock) { - hfs_unlock_truncate(cp, TRUE); + /* Initialize iterator to search the next record */ + bzero(&iterator, sizeof(iterator)); + if (is_xattr) { + /* Copy the key from the iterator that was used to update the modified attribute record.
*/ + xattr_key = (HFSPlusAttrKey *)&(iterator.key); + bcopy((HFSPlusAttrKey *)&(extent_info->iterator->key), xattr_key, sizeof(HFSPlusAttrKey)); + /* Note: xattr_key->startBlock will be initialized later in the iteration loop */ + + MALLOC(xattr_rec, HFSPlusAttrRecord *, + sizeof(HFSPlusAttrRecord), M_TEMP, M_WAITOK); + if (xattr_rec == NULL) { + error = ENOMEM; + goto out; } - return error; + btdata.bufferAddress = xattr_rec; + btdata.itemSize = sizeof(HFSPlusAttrRecord); + btdata.itemCount = 1; + extents = xattr_rec->overflowExtents.extents; + } else { + extents_key = (HFSPlusExtentKey *) &(iterator.key); + extents_key->keyLength = kHFSPlusExtentKeyMaximumLength; + extents_key->forkType = extent_info->forkType; + extents_key->fileID = extent_info->fileID; + /* Note: extents_key->startBlock will be initialized later in the iteration loop */ + + MALLOC(extents_rec, HFSPlusExtentRecord *, + sizeof(HFSPlusExtentRecord), M_TEMP, M_WAITOK); + if (extents_rec == NULL) { + error = ENOMEM; + goto out; + } + btdata.bufferAddress = extents_rec; + btdata.itemSize = sizeof(HFSPlusExtentRecord); + btdata.itemCount = 1; + extents = extents_rec[0]; } - lockflags = hfs_systemfile_lock(hfsmp, locks, HFS_EXCLUSIVE_LOCK); - fp = VTOF(vp); - datablks = 0; - *blks_moved = 0; - /* Relocate non-overflow extents */ - for (i = 0; i < kHFSPlusExtentDensity; ++i) { - if (fp->ff_extents[i].blockCount == 0) - break; - oldStartBlock = fp->ff_extents[i].startBlock; - oldBlockCount = fp->ff_extents[i].blockCount; - datablks += oldBlockCount; - end_block = oldStartBlock + oldBlockCount; - /* Check if the file overlaps the target space */ - if (end_block > startblk) { - alloc_flags = HFS_ALLOC_FORCECONTIG | HFS_ALLOC_SKIPFREEBLKS; - if (is_sysfile) { - alloc_flags |= HFS_ALLOC_METAZONE; + /* An extent entry still needs to be shifted into following overflow + * extent record. 
This will result in the starting allocation block + * number of the extent record being changed which is part of the key + * for the extent record. Since the extent record key is changing, + * the record can not be updated, instead has to be deleted and + * inserted again. + */ + while (shift_extent.blockCount) { + if (hfs_resize_debug) { + printf ("hfs_split_extent: Will shift (%u,%u) into record with startBlock=%u\n", shift_extent.startBlock, shift_extent.blockCount, read_recStartBlock); + } + + /* Search if there is any existing overflow extent record. + * For this, the logical start block number in the key is + * the value calculated based on the logical start block + * number of the current extent record and the total number + * of blocks existing in the current extent record. + */ + if (is_xattr) { + xattr_key->startBlock = read_recStartBlock; + } else { + extents_key->startBlock = read_recStartBlock; + } + error = BTSearchRecord(extent_info->fcb, &iterator, &btdata, &reclen, &iterator); + if (error) { + if (error != btNotFound) { + printf ("hfs_split_extent: fileID=%u startBlock=%u BTSearchRecord error=%d\n", extent_info->fileID, read_recStartBlock, error); + goto out; } - error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount, alloc_flags, &newStartBlock, &newBlockCount); - if (error) { - if (!is_sysfile && ((error == dskFulErr) || (error == ENOSPC))) { - /* Try allocating again using the metadata zone */ - alloc_flags |= HFS_ALLOC_METAZONE; - error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount, alloc_flags, &newStartBlock, &newBlockCount); - } - if (error) { - printf("hfs_reclaim_file: BlockAllocate(metazone) (error=%d) for fileID=%u %u:(%u,%u)\n", error, fileID, i, oldStartBlock, oldBlockCount); - goto fail; - } else { - if (hfs_resize_debug) { - printf("hfs_reclaim_file: BlockAllocate(metazone) success for fileID=%u %u:(%u,%u)\n", fileID, i, newStartBlock, newBlockCount); - } - } + create_record = true; + } + + /* The extra extent entry 
from the previous record is being inserted + * as the first entry in the current extent record. This will change + * the file allocation block number (FABN) of the current extent + * record, which is the startBlock value from the extent record key. + * Since one extra entry is being inserted in the record, the new + * FABN for the record will be less than the old FABN by the number of blocks + * in the new extent entry being inserted at the start. We have to + * do this before we update read_recStartBlock to point at the + * startBlock of the following record. + */ + write_recStartBlock = read_recStartBlock - shift_extent.blockCount; + if (hfs_resize_debug) { + if (create_record) { + printf ("hfs_split_extent: No records found for startBlock=%u, will create new with startBlock=%u\n", read_recStartBlock, write_recStartBlock); } + } - /* Copy data from old location to new location */ - error = hfs_copy_extent(hfsmp, vp, oldStartBlock, newStartBlock, newBlockCount, context); - if (error) { - printf("hfs_reclaim_file: hfs_copy_extent error=%d for fileID=%u %u:(%u,%u) to %u:(%u,%u)\n", error, fileID, i, oldStartBlock, oldBlockCount, i, newStartBlock, newBlockCount); - if (BlockDeallocate(hfsmp, newStartBlock, newBlockCount, HFS_ALLOC_SKIPFREEBLKS)) { - hfs_mark_volume_inconsistent(hfsmp); - } - goto fail; + /* Now update the read_recStartBlock to account for total number + * of blocks in this extent record. It will now point to the + * starting allocation block number for the next extent record.
+ */ + for (i = 0; i < kHFSPlusExtentDensity; i++) { + if (extents[i].blockCount == 0) { + break; } - fp->ff_extents[i].startBlock = newStartBlock; - cp->c_flag |= C_MODIFIED; - *blks_moved += newBlockCount; + read_recStartBlock += extents[i].blockCount; + } - /* Deallocate the old extent */ - error = BlockDeallocate(hfsmp, oldStartBlock, oldBlockCount, HFS_ALLOC_SKIPFREEBLKS); - if (error) { - printf("hfs_reclaim_file: BlockDeallocate returned %d\n", error); - hfs_mark_volume_inconsistent(hfsmp); - goto fail; + if (create_record == true) { + /* Initialize new record content with only one extent entry */ + bzero(extents, sizeof(HFSPlusExtentRecord)); + /* The new record will contain only one extent entry */ + extents[0] = shift_extent; + /* There are no more overflow extents to be shifted */ + shift_extent.startBlock = shift_extent.blockCount = 0; + + if (is_xattr) { + xattr_rec->recordType = kHFSPlusAttrExtents; + xattr_rec->overflowExtents.reserved = 0; + reclen = sizeof(HFSPlusAttrExtents); + } else { + reclen = sizeof(HFSPlusExtentRecord); } - - /* If this is a system file, sync the volume header on disk */ - if (is_sysfile) { - error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH); - if (error) { - printf("hfs_reclaim_file: hfs_flushvolumeheader returned %d\n", error); - hfs_mark_volume_inconsistent(hfsmp); - goto fail; - } + } else { + /* The overflow extent entry from previous record will be + * the first entry in this extent record. If the last + * extent entry in this record is valid, it will be shifted + * into the following extent record as its first entry. So + * save the last entry before shifting entries in current + * record. + */ + last_extent = extents[kHFSPlusExtentDensity-1]; + + /* Shift all entries by one index towards the end */ + for (i = kHFSPlusExtentDensity-2; i >= 0; i--) { + extents[i+1] = extents[i]; } + /* Overflow extent entry saved from previous record + * is now the first entry in the current record. 
+ */ + extents[0] = shift_extent; + if (hfs_resize_debug) { - printf ("hfs_reclaim_file: Relocated %u:(%u,%u) to %u:(%u,%u)\n", i, oldStartBlock, oldBlockCount, i, newStartBlock, newBlockCount); + printf ("hfs_split_extent: Shift overflow=(%u,%u) to record with updated startBlock=%u\n", shift_extent.startBlock, shift_extent.blockCount, write_recStartBlock); } - } - } - - /* Relocate overflow extents (if any) */ - if (i == kHFSPlusExtentDensity && fp->ff_blocks > datablks) { - struct FSBufferDescriptor btdata; - HFSPlusExtentRecord record; - HFSPlusExtentKey *key; - FCB *fcb; - int overflow_count = 0; - - if (kmem_alloc(kernel_map, (vm_offset_t*) &iterator, sizeof(*iterator))) { - printf("hfs_reclaim_file: kmem_alloc failed!\n"); - error = ENOMEM; - goto fail; - } - bzero(iterator, sizeof(*iterator)); - key = (HFSPlusExtentKey *) &iterator->key; - key->keyLength = kHFSPlusExtentKeyMaximumLength; - key->forkType = forktype; - key->fileID = fileID; - key->startBlock = datablks; - - btdata.bufferAddress = &record; - btdata.itemSize = sizeof(record); - btdata.itemCount = 1; - - fcb = VTOF(hfsmp->hfs_extents_vp); + /* The last entry from current record will be the + * overflow entry which will be the first entry for + * the following extent record. + */ + shift_extent = last_extent; - error = BTSearchRecord(fcb, iterator, &btdata, NULL, iterator); - while (error == 0) { - /* Stop when we encounter a different file or fork. */ - if ((key->fileID != fileID) || - (key->forkType != forktype)) { - break; + /* Since the key->startBlock is being changed for this record, + * it should be deleted and inserted with the new key. + */ + error = BTDeleteRecord(extent_info->fcb, &iterator); + if (error) { + printf ("hfs_split_extent: fileID=%u startBlock=%u BTDeleteRecord error=%d\n", extent_info->fileID, read_recStartBlock, error); + goto out; } - - /* Just track the overflow extent record number for debugging... 
*/ if (hfs_resize_debug) { - overflow_count++; + printf ("hfs_split_extent: Deleted record with startBlock=%u\n", (is_xattr ? xattr_key->startBlock : extents_key->startBlock)); } + } - /* - * Check if the file overlaps target space. - */ - for (i = 0; i < kHFSPlusExtentDensity; ++i) { - if (record[i].blockCount == 0) { - goto fail; - } - oldStartBlock = record[i].startBlock; - oldBlockCount = record[i].blockCount; - end_block = oldStartBlock + oldBlockCount; - if (end_block > startblk) { - alloc_flags = HFS_ALLOC_FORCECONTIG | HFS_ALLOC_SKIPFREEBLKS; - if (is_sysfile) { - alloc_flags |= HFS_ALLOC_METAZONE; - } - error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount, alloc_flags, &newStartBlock, &newBlockCount); - if (error) { - if (!is_sysfile && ((error == dskFulErr) || (error == ENOSPC))) { - /* Try allocating again using the metadata zone */ - alloc_flags |= HFS_ALLOC_METAZONE; - error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount, alloc_flags, &newStartBlock, &newBlockCount); - } - if (error) { - printf("hfs_reclaim_file: BlockAllocate(metazone) (error=%d) for fileID=%u %u:(%u,%u)\n", error, fileID, i, oldStartBlock, oldBlockCount); - goto fail; - } else { - if (hfs_resize_debug) { - printf("hfs_reclaim_file: BlockAllocate(metazone) success for fileID=%u %u:(%u,%u)\n", fileID, i, newStartBlock, newBlockCount); - } - } - } - error = hfs_copy_extent(hfsmp, vp, oldStartBlock, newStartBlock, newBlockCount, context); - if (error) { - printf("hfs_reclaim_file: hfs_copy_extent error=%d for fileID=%u (%u,%u) to (%u,%u)\n", error, fileID, oldStartBlock, oldBlockCount, newStartBlock, newBlockCount); - if (BlockDeallocate(hfsmp, newStartBlock, newBlockCount, HFS_ALLOC_SKIPFREEBLKS)) { - hfs_mark_volume_inconsistent(hfsmp); - } - goto fail; - } - record[i].startBlock = newStartBlock; - cp->c_flag |= C_MODIFIED; - *blks_moved += newBlockCount; - - /* - * NOTE: To support relocating overflow extents of the - * allocation file, we must update the BTree record 
BEFORE - * deallocating the old extent so that BlockDeallocate will - * use the extent's new location to calculate physical block - * numbers. (This is for the case where the old extent's - * bitmap bits actually reside in the extent being moved.) - */ - error = BTUpdateRecord(fcb, iterator, (IterateCallBackProcPtr) hfs_relocate_callback, &record); - if (error) { - printf("hfs_reclaim_file: BTUpdateRecord returned %d\n", error); - hfs_mark_volume_inconsistent(hfsmp); - goto fail; - } - error = BlockDeallocate(hfsmp, oldStartBlock, oldBlockCount, HFS_ALLOC_SKIPFREEBLKS); - if (error) { - printf("hfs_reclaim_file: BlockDeallocate returned %d\n", error); - hfs_mark_volume_inconsistent(hfsmp); - goto fail; - } - if (hfs_resize_debug) { - printf ("hfs_reclaim_file: Relocated overflow#%d %u:(%u,%u) to %u:(%u,%u)\n", overflow_count, i, oldStartBlock, oldBlockCount, i, newStartBlock, newBlockCount); - } - } - } - /* Look for more records. */ - error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL); - if (error == btNotFound) { - error = 0; - break; + /* Insert the newly created or modified extent record */ + bzero(&iterator.hint, sizeof(iterator.hint)); + if (is_xattr) { + xattr_key->startBlock = write_recStartBlock; + } else { + extents_key->startBlock = write_recStartBlock; + } + error = BTInsertRecord(extent_info->fcb, &iterator, &btdata, reclen); + if (error) { + printf ("hfs_split_extent: fileID=%u, startBlock=%u BTInsertRecord error=%d\n", extent_info->fileID, write_recStartBlock, error); + goto out; + } + if (hfs_resize_debug) { + printf ("hfs_split_extent: Inserted extent record with startBlock=%u\n", write_recStartBlock); + } + } + BTFlushPath(extent_info->fcb); +out: + if (extents_rec) { + FREE (extents_rec, M_TEMP); + } + if (xattr_rec) { + FREE (xattr_rec, M_TEMP); + } + return error; +} + + +/* + * Relocate an extent if it lies beyond the expected end of volume. + * + * This function is called for every extent of the file being relocated. 
+ * It allocates space for relocation, copies the data, deallocates + * the old extent, and updates the corresponding on-disk extent. If the function + * does not find contiguous space to relocate an extent, it splits the + * extent into smaller pieces to be able to relocate it out of the area of + * disk being reclaimed. As an optimization, if an extent lies partially + * in the area of the disk being reclaimed, it is split so that we only + * have to relocate the area that was overlapping with the area of disk + * being reclaimed. + * + * Note that every extent is relocated in its own transaction so that + * they do not overwhelm the journal. This function handles the extent + * record that exists in the catalog record, extent record from overflow + * extents btree, and extents for large EAs. + * + * Inputs: + * extent_info - This is the structure that contains state about + * the current file, extent, and extent record that + * is being relocated. This structure is shared + * among code that traverses through all the extents + * of the file, code that relocates extents, and + * code that splits the extent. 
+ */ +static int +hfs_reclaim_extent(struct hfsmount *hfsmp, const u_long allocLimit, struct hfs_reclaim_extent_info *extent_info, vfs_context_t context) +{ + int error = 0; + int index; + struct cnode *cp; + u_int32_t oldStartBlock; + u_int32_t oldBlockCount; + u_int32_t newStartBlock; + u_int32_t newBlockCount; + u_int32_t alloc_flags; + int blocks_allocated = false; + + index = extent_info->extent_index; + cp = VTOC(extent_info->vp); + + oldStartBlock = extent_info->extents[index].startBlock; + oldBlockCount = extent_info->extents[index].blockCount; + + if (0 && hfs_resize_debug) { + printf ("hfs_reclaim_extent: Examine record:%u recStartBlock=%u, %u:(%u,%u)\n", extent_info->overflow_count, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount); + } + + /* Check if the current extent lies completely within allocLimit */ + if ((oldStartBlock + oldBlockCount) <= allocLimit) { + extent_info->cur_blockCount += oldBlockCount; + return error; + } + + /* Every extent should be relocated in its own transaction + * to make sure that we don't overflow the journal buffer. + */ + error = hfs_start_transaction(hfsmp); + if (error) { + return error; + } + extent_info->lockflags = hfs_systemfile_lock(hfsmp, extent_info->lockflags, HFS_EXCLUSIVE_LOCK); + + /* Check if the extent lies partially in the area to reclaim, + * i.e. it starts before allocLimit and ends beyond allocLimit. + * We have already skipped extents that lie completely within + * allocLimit in the check above, so we only check for the + * startBlock. If it lies partially, split it so that we + * only relocate part of the extent. + */ + if (oldStartBlock < allocLimit) { + newBlockCount = allocLimit - oldStartBlock; + error = hfs_split_extent(extent_info, newBlockCount); + if (error == 0) { + /* After successful split, the current extent does not + * need relocation, so just return back. 
+ */ + goto out; + } + /* Ignore error and try relocating the entire extent instead */ + } + + alloc_flags = HFS_ALLOC_FORCECONTIG | HFS_ALLOC_SKIPFREEBLKS; + if (extent_info->is_sysfile) { + alloc_flags |= HFS_ALLOC_METAZONE; + } + + error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount, alloc_flags, + &newStartBlock, &newBlockCount); + if ((extent_info->is_sysfile == false) && + ((error == dskFulErr) || (error == ENOSPC))) { + /* For non-system files, try reallocating space in metadata zone */ + alloc_flags |= HFS_ALLOC_METAZONE; + error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount, + alloc_flags, &newStartBlock, &newBlockCount); + } + if ((error == dskFulErr) || (error == ENOSPC)) { + /* We did not find desired contiguous space for this extent. + * So try to allocate the maximum contiguous space available. + */ + alloc_flags &= ~HFS_ALLOC_FORCECONTIG; + + error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount, + alloc_flags, &newStartBlock, &newBlockCount); + if (error) { + printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) BlockAllocate error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error); + goto out; + } + blocks_allocated = true; + + error = hfs_split_extent(extent_info, newBlockCount); + if (error) { + printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) split error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error); + goto out; + } + oldBlockCount = newBlockCount; + } + if (error) { + printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) contig BlockAllocate error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error); + goto out; + } + blocks_allocated = true; + + /* Copy data from old location to new location */ + error = hfs_copy_extent(hfsmp, extent_info->vp, oldStartBlock, + newStartBlock, newBlockCount, context); + if (error) { + printf ("hfs_reclaim_extent: 
fileID=%u start=%u, %u:(%u,%u)=>(%u,%u) hfs_copy_extent error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, newStartBlock, newBlockCount, error); + goto out; + } + + /* Update the extent record with the new start block information */ + extent_info->extents[index].startBlock = newStartBlock; + + /* Sync the content back to the disk */ + if (extent_info->catalog_fp) { + /* Update the extents in catalog record */ + if (extent_info->is_dirlink) { + error = cat_update_dirlink(hfsmp, extent_info->forkType, + extent_info->dirlink_desc, extent_info->dirlink_attr, + &(extent_info->dirlink_fork->ff_data)); + } else { + cp->c_flag |= C_MODIFIED; + /* If this is a system file, sync volume headers on disk */ + if (extent_info->is_sysfile) { + error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH); } } + } else { + /* Replace record for extents overflow or extents-based xattrs */ + error = BTReplaceRecord(extent_info->fcb, extent_info->iterator, + &(extent_info->btdata), extent_info->recordlen); } - -fail: - if (iterator) { - kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator)); + if (error) { + printf ("hfs_reclaim_extent: fileID=%u, update record error=%u\n", extent_info->fileID, error); + goto out; + } + + /* Deallocate the old extent */ + error = BlockDeallocate(hfsmp, oldStartBlock, oldBlockCount, HFS_ALLOC_SKIPFREEBLKS); + if (error) { + printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) BlockDeallocate error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error); + goto out; } + extent_info->blocks_relocated += newBlockCount; - (void) hfs_systemfile_unlock(hfsmp, lockflags); + if (hfs_resize_debug) { + printf ("hfs_reclaim_extent: Relocated record:%u %u:(%u,%u) to (%u,%u)\n", extent_info->overflow_count, index, oldStartBlock, oldBlockCount, newStartBlock, newBlockCount); + } - if ((*blks_moved != 0) && (is_sysfile == false)) { - (void) hfs_update(vp, 
MNT_WAIT); +out: + if (error != 0) { + if (blocks_allocated == true) { + BlockDeallocate(hfsmp, newStartBlock, newBlockCount, HFS_ALLOC_SKIPFREEBLKS); + } + } else { + /* On success, increment the total allocation blocks processed */ + extent_info->cur_blockCount += newBlockCount; } - (void) hfs_end_transaction(hfsmp); + hfs_systemfile_unlock(hfsmp, extent_info->lockflags); - if (took_truncate_lock) { - hfs_unlock_truncate(cp, TRUE); + /* For a non-system file, if an extent entry from catalog record + * was modified, sync the in-memory changes to the catalog record + * on disk before ending the transaction. + */ + if ((error == 0) && + (extent_info->overflow_count < kHFSPlusExtentDensity) && + (extent_info->is_sysfile == false)) { + (void) hfs_update(extent_info->vp, MNT_WAIT); + } + + hfs_end_transaction(hfsmp); + + return error; +} + +/* Report intermediate progress during volume resize */ +static void +hfs_truncatefs_progress(struct hfsmount *hfsmp) +{ + u_int32_t cur_progress; + + hfs_resize_progress(hfsmp, &cur_progress); + if (cur_progress > (hfsmp->hfs_resize_progress + 9)) { + printf("hfs_truncatefs: %d%% done...\n", cur_progress); + hfsmp->hfs_resize_progress = cur_progress; + } + return; +} + +/* + * Reclaim space at the end of a volume for given file and forktype. + * + * This routine attempts to move any extent which contains allocation blocks + * at or after "allocLimit." A separate transaction is used for every extent + * that needs to be moved. If there is not contiguous space available for + * moving an extent, it can be split into smaller extents. The contents of + * any moved extents are read and written via the volume's device vnode -- + * NOT via "vp." During the move, moved blocks which are part of a transaction + * have their physical block numbers invalidated so they will eventually be + * written to their new locations. + * + * This function is also called for directory hard links. 
Directory hard links + * are regular files with no data fork and resource fork that contains alias + * information for backward compatibility with pre-Leopard systems. However + * non-Mac OS X implementation can add/modify data fork or resource fork + * information to directory hard links, so we check, and if required, relocate + * both data fork and resource fork. + * + * Inputs: + * hfsmp The volume being resized. + * vp The vnode for the system file. + * fileID ID of the catalog record that needs to be relocated + * forktype The type of fork that needs relocated, + * kHFSResourceForkType for resource fork, + * kHFSDataForkType for data fork + * allocLimit Allocation limit for the new volume size, + * do not use this block or beyond. All extents + * that use this block or any blocks beyond this limit + * will be relocated. + * + * Side Effects: + * hfsmp->hfs_resize_blocksmoved is incremented by the number of allocation + * blocks that were relocated. + */ +static int +hfs_reclaim_file(struct hfsmount *hfsmp, struct vnode *vp, u_int32_t fileID, + u_int8_t forktype, u_long allocLimit, vfs_context_t context) +{ + int error = 0; + struct hfs_reclaim_extent_info *extent_info; + int i; + int lockflags = 0; + struct cnode *cp; + struct filefork *fp; + int took_truncate_lock = false; + int release_desc = false; + HFSPlusExtentKey *key; + + /* If there is no vnode for this file, then there's nothing to do. 
*/ + if (vp == NULL) { + return 0; + } + + cp = VTOC(vp); + + MALLOC(extent_info, struct hfs_reclaim_extent_info *, + sizeof(struct hfs_reclaim_extent_info), M_TEMP, M_WAITOK); + if (extent_info == NULL) { + return ENOMEM; + } + bzero(extent_info, sizeof(struct hfs_reclaim_extent_info)); + extent_info->vp = vp; + extent_info->fileID = fileID; + extent_info->forkType = forktype; + extent_info->is_sysfile = vnode_issystem(vp); + if (vnode_isdir(vp) && (cp->c_flag & C_HARDLINK)) { + extent_info->is_dirlink = true; + } + /* We always need allocation bitmap and extent btree lock */ + lockflags = SFL_BITMAP | SFL_EXTENTS; + if ((fileID == kHFSCatalogFileID) || (extent_info->is_dirlink == true)) { + lockflags |= SFL_CATALOG; + } else if (fileID == kHFSAttributesFileID) { + lockflags |= SFL_ATTRIBUTE; + } else if (fileID == kHFSStartupFileID) { + lockflags |= SFL_STARTUP; + } + extent_info->lockflags = lockflags; + extent_info->fcb = VTOF(hfsmp->hfs_extents_vp); + + /* Flush data associated with current file on disk. + * + * If the current vnode is directory hard link, no flushing of + * journal or vnode is required. The current kernel does not + * modify data/resource fork of directory hard links, so nothing + * will be in the cache. If a directory hard link is newly created, + * the resource fork data is written directly using devvp and + * the code that actually relocates data (hfs_copy_extent()) also + * uses devvp for its I/O --- so they will see a consistent copy. + */ + if (extent_info->is_sysfile) { + /* If the current vnode is system vnode, flush journal + * to make sure that all data is written to the disk. + */ + error = hfs_journal_flush(hfsmp, TRUE); + if (error) { + printf ("hfs_reclaim_file: journal_flush returned %d\n", error); + goto out; + } + } else if (extent_info->is_dirlink == false) { + /* Flush all blocks associated with this regular file vnode. 
+ * Normally there should not be buffer cache blocks for regular + * files, but for objects like symlinks, we can have buffer cache + * blocks associated with the vnode. Therefore we call + * buf_flushdirtyblks() also. + */ + buf_flushdirtyblks(vp, 0, BUF_SKIP_LOCKED, "hfs_reclaim_file"); + + hfs_unlock(cp); + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK); + took_truncate_lock = true; + (void) cluster_push(vp, 0); + error = hfs_lock(cp, HFS_FORCE_LOCK); + if (error) { + goto out; + } + + /* If the file no longer exists, nothing left to do */ + if (cp->c_flag & C_NOEXISTS) { + error = 0; + goto out; + } + + /* Wait for any in-progress writes to this vnode to complete, so that we'll + * be copying consistent bits. (Otherwise, it's possible that an async + * write will complete to the old extent after we read from it. That + * could lead to corruption.) + */ + error = vnode_waitforwrites(vp, 0, 0, 0, "hfs_reclaim_file"); + if (error) { + goto out; + } + } + + if (hfs_resize_debug) { + printf("hfs_reclaim_file: === Start reclaiming %sfork for %sid=%u ===\n", (forktype ? "rsrc" : "data"), (extent_info->is_dirlink ? "dirlink" : "file"), fileID); + } + + if (extent_info->is_dirlink) { + MALLOC(extent_info->dirlink_desc, struct cat_desc *, + sizeof(struct cat_desc), M_TEMP, M_WAITOK); + MALLOC(extent_info->dirlink_attr, struct cat_attr *, + sizeof(struct cat_attr), M_TEMP, M_WAITOK); + MALLOC(extent_info->dirlink_fork, struct filefork *, + sizeof(struct filefork), M_TEMP, M_WAITOK); + if ((extent_info->dirlink_desc == NULL) || + (extent_info->dirlink_attr == NULL) || + (extent_info->dirlink_fork == NULL)) { + error = ENOMEM; + goto out; + } + + /* Lookup catalog record for directory hard link and + * create a fake filefork for the value looked up from + * the disk. 
+ */ + fp = extent_info->dirlink_fork; + bzero(extent_info->dirlink_fork, sizeof(struct filefork)); + extent_info->dirlink_fork->ff_cp = cp; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + error = cat_lookup_dirlink(hfsmp, fileID, forktype, + extent_info->dirlink_desc, extent_info->dirlink_attr, + &(extent_info->dirlink_fork->ff_data)); + hfs_systemfile_unlock(hfsmp, lockflags); + if (error) { + printf ("hfs_reclaim_file: cat_lookup_dirlink for fileID=%u returned error=%u\n", fileID, error); + goto out; + } + release_desc = true; + } else { + fp = VTOF(vp); + } + + extent_info->catalog_fp = fp; + extent_info->recStartBlock = 0; + extent_info->extents = extent_info->catalog_fp->ff_extents; + /* Relocate extents from the catalog record */ + for (i = 0; i < kHFSPlusExtentDensity; ++i) { + if (fp->ff_extents[i].blockCount == 0) { + break; + } + extent_info->extent_index = i; + error = hfs_reclaim_extent(hfsmp, allocLimit, extent_info, context); + if (error) { + printf ("hfs_reclaim_file: fileID=%u #%d %u:(%u,%u) hfs_reclaim_extent error=%d\n", fileID, extent_info->overflow_count, i, fp->ff_extents[i].startBlock, fp->ff_extents[i].blockCount, error); + goto out; + } + } + + /* If the number of allocation blocks processed for reclaiming + * are less than total number of blocks for the file, continuing + * working on overflow extents record. 
+ */ + if (fp->ff_blocks <= extent_info->cur_blockCount) { + if (0 && hfs_resize_debug) { + printf ("hfs_reclaim_file: Nothing more to relocate, offset=%d, ff_blocks=%u, cur_blockCount=%u\n", i, fp->ff_blocks, extent_info->cur_blockCount); + } + goto out; + } + + if (hfs_resize_debug) { + printf ("hfs_reclaim_file: Will check overflow records, offset=%d, ff_blocks=%u, cur_blockCount=%u\n", i, fp->ff_blocks, extent_info->cur_blockCount); + } + + MALLOC(extent_info->iterator, struct BTreeIterator *, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK); + if (extent_info->iterator == NULL) { + error = ENOMEM; + goto out; } + bzero(extent_info->iterator, sizeof(struct BTreeIterator)); + key = (HFSPlusExtentKey *) &(extent_info->iterator->key); + key->keyLength = kHFSPlusExtentKeyMaximumLength; + key->forkType = forktype; + key->fileID = fileID; + key->startBlock = extent_info->cur_blockCount; + + extent_info->btdata.bufferAddress = extent_info->record.overflow; + extent_info->btdata.itemSize = sizeof(HFSPlusExtentRecord); + extent_info->btdata.itemCount = 1; + extent_info->catalog_fp = NULL; + + /* Search the first overflow extent with expected startBlock as 'cur_blockCount' */ + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + error = BTSearchRecord(extent_info->fcb, extent_info->iterator, + &(extent_info->btdata), &(extent_info->recordlen), + extent_info->iterator); + hfs_systemfile_unlock(hfsmp, lockflags); + while (error == 0) { + extent_info->overflow_count++; + extent_info->recStartBlock = key->startBlock; + extent_info->extents = extent_info->record.overflow; + for (i = 0; i < kHFSPlusExtentDensity; i++) { + if (extent_info->record.overflow[i].blockCount == 0) { + goto out; + } + extent_info->extent_index = i; + error = hfs_reclaim_extent(hfsmp, allocLimit, extent_info, context); + if (error) { + printf ("hfs_reclaim_file: fileID=%u #%d %u:(%u,%u) hfs_reclaim_extent error=%d\n", fileID, extent_info->overflow_count, i, 
extent_info->record.overflow[i].startBlock, extent_info->record.overflow[i].blockCount, error); + goto out; + } + } + + /* Look for more overflow records */ + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + error = BTIterateRecord(extent_info->fcb, kBTreeNextRecord, + extent_info->iterator, &(extent_info->btdata), + &(extent_info->recordlen)); + hfs_systemfile_unlock(hfsmp, lockflags); + if (error) { + break; + } + /* Stop when we encounter a different file or fork. */ + if ((key->fileID != fileID) || (key->forkType != forktype)) { + break; + } + } + if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) { + error = 0; + } + +out: + /* If any blocks were relocated, account them and report progress */ + if (extent_info->blocks_relocated) { + hfsmp->hfs_resize_blocksmoved += extent_info->blocks_relocated; + hfs_truncatefs_progress(hfsmp); + if (fileID < kHFSFirstUserCatalogNodeID) { + printf ("hfs_reclaim_file: Relocated %u blocks from fileID=%u on \"%s\"\n", + extent_info->blocks_relocated, fileID, hfsmp->vcbVN); + } + } + if (extent_info->iterator) { + FREE(extent_info->iterator, M_TEMP); + } + if (release_desc == true) { + cat_releasedesc(extent_info->dirlink_desc); + } + if (extent_info->dirlink_desc) { + FREE(extent_info->dirlink_desc, M_TEMP); + } + if (extent_info->dirlink_attr) { + FREE(extent_info->dirlink_attr, M_TEMP); + } + if (extent_info->dirlink_fork) { + FREE(extent_info->dirlink_fork, M_TEMP); + } + if ((extent_info->blocks_relocated != 0) && (extent_info->is_sysfile == false)) { + (void) hfs_update(vp, MNT_WAIT); + } + if (took_truncate_lock) { + hfs_unlock_truncate(cp, 0); + } + if (extent_info) { + FREE(extent_info, M_TEMP); + } if (hfs_resize_debug) { - printf("hfs_reclaim_file: Finished relocating %sfork for fileid=%u (error=%d)\n", (forktype ? "rsrc" : "data"), fileID, error); + printf("hfs_reclaim_file: === Finished relocating %sfork for fileid=%u (error=%d) ===\n", (forktype ? 
"rsrc" : "data"), fileID, error); } return error; @@ -4604,6 +5756,9 @@ hfs_journal_relocate_callback(void *_args) hfsmp->blockSize, vfs_context_ucred(args->context), &bp); if (error) { printf("hfs_reclaim_journal_file: failed to read JIB (%d)\n", error); + if (bp) { + buf_brelse(bp); + } return error; } jibp = (JournalInfoBlock*) buf_dataptr(bp); @@ -4629,9 +5784,10 @@ hfs_journal_relocate_callback(void *_args) static int -hfs_reclaim_journal_file(struct hfsmount *hfsmp, vfs_context_t context) +hfs_reclaim_journal_file(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context) { int error; + int journal_err; int lockflags; u_int32_t oldStartBlock; u_int32_t newStartBlock; @@ -4642,6 +5798,11 @@ hfs_reclaim_journal_file(struct hfsmount *hfsmp, vfs_context_t context) struct cat_fork journal_fork; struct hfs_journal_relocate_args callback_args; + if (hfsmp->jnl_start + (hfsmp->jnl_size / hfsmp->blockSize) <= allocLimit) { + /* The journal does not require relocation */ + return 0; + } + error = hfs_start_transaction(hfsmp); if (error) { printf("hfs_reclaim_journal_file: hfs_start_transaction returned %d\n", error); @@ -4708,13 +5869,24 @@ hfs_reclaim_journal_file(struct hfsmount *hfsmp, vfs_context_t context) printf("hfs_reclaim_journal_file: hfs_end_transaction returned %d\n", error); } - if (!error && hfs_resize_debug) { - printf ("hfs_reclaim_journal_file: Successfully relocated journal from (%u,%u) to (%u,%u)\n", oldStartBlock, oldBlockCount, newStartBlock, newBlockCount); + /* Account for the blocks relocated and print progress */ + hfsmp->hfs_resize_blocksmoved += oldBlockCount; + hfs_truncatefs_progress(hfsmp); + if (!error) { + printf ("hfs_reclaim_journal_file: Relocated %u blocks from journal on \"%s\"\n", + oldBlockCount, hfsmp->vcbVN); + if (hfs_resize_debug) { + printf ("hfs_reclaim_journal_file: Successfully relocated journal from (%u,%u) to (%u,%u)\n", oldStartBlock, oldBlockCount, newStartBlock, newBlockCount); + } } return error; free_fail: 
- (void) BlockDeallocate(hfsmp, newStartBlock, newBlockCount, HFS_ALLOC_SKIPFREEBLKS); + journal_err = BlockDeallocate(hfsmp, newStartBlock, newBlockCount, HFS_ALLOC_SKIPFREEBLKS); + if (journal_err) { + printf("hfs_reclaim_journal_file: BlockDeallocate returned %d\n", error); + hfs_mark_volume_inconsistent(hfsmp); + } fail: hfs_systemfile_unlock(hfsmp, lockflags); (void) hfs_end_transaction(hfsmp); @@ -4731,9 +5903,10 @@ hfs_reclaim_journal_file(struct hfsmount *hfsmp, vfs_context_t context) * the field in the volume header and the catalog record. */ static int -hfs_reclaim_journal_info_block(struct hfsmount *hfsmp, vfs_context_t context) +hfs_reclaim_journal_info_block(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context) { int error; + int journal_err; int lockflags; u_int32_t oldBlock; u_int32_t newBlock; @@ -4742,6 +5915,11 @@ hfs_reclaim_journal_info_block(struct hfsmount *hfsmp, vfs_context_t context) struct cat_attr jib_attr; struct cat_fork jib_fork; buf_t old_bp, new_bp; + + if (hfsmp->vcbJinfoBlock <= allocLimit) { + /* The journal info block does not require relocation */ + return 0; + } error = hfs_start_transaction(hfsmp); if (error) { @@ -4773,6 +5951,9 @@ hfs_reclaim_journal_info_block(struct hfsmount *hfsmp, vfs_context_t context) hfsmp->blockSize, vfs_context_ucred(context), &old_bp); if (error) { printf("hfs_reclaim_journal_info_block: failed to read JIB (%d)\n", error); + if (old_bp) { + buf_brelse(old_bp); + } goto free_fail; } new_bp = buf_getblk(hfsmp->hfs_devvp, @@ -4820,101 +6001,537 @@ hfs_reclaim_journal_info_block(struct hfsmount *hfsmp, vfs_context_t context) printf("hfs_reclaim_journal_info_block: hfs_flushvolumeheader returned %d\n", error); goto fail; } - hfs_systemfile_unlock(hfsmp, lockflags); - error = hfs_end_transaction(hfsmp); - if (error) { - printf("hfs_reclaim_journal_info_block: hfs_end_transaction returned %d\n", error); + hfs_systemfile_unlock(hfsmp, lockflags); + error = hfs_end_transaction(hfsmp); + if 
(error) { + printf("hfs_reclaim_journal_info_block: hfs_end_transaction returned %d\n", error); + } + error = hfs_journal_flush(hfsmp, FALSE); + if (error) { + printf("hfs_reclaim_journal_info_block: journal_flush returned %d\n", error); + } + + /* Account for the block relocated and print progress */ + hfsmp->hfs_resize_blocksmoved += 1; + hfs_truncatefs_progress(hfsmp); + if (!error) { + printf ("hfs_reclaim_journal_info: Relocated 1 block from journal info on \"%s\"\n", + hfsmp->vcbVN); + if (hfs_resize_debug) { + printf ("hfs_reclaim_journal_info_block: Successfully relocated journal info block from (%u,%u) to (%u,%u)\n", oldBlock, blockCount, newBlock, blockCount); + } + } + return error; + +free_fail: + journal_err = BlockDeallocate(hfsmp, newBlock, blockCount, HFS_ALLOC_SKIPFREEBLKS); + if (journal_err) { + printf("hfs_reclaim_journal_info_block: BlockDeallocate returned %d\n", error); + hfs_mark_volume_inconsistent(hfsmp); + } + +fail: + hfs_systemfile_unlock(hfsmp, lockflags); + (void) hfs_end_transaction(hfsmp); + if (hfs_resize_debug) { + printf ("hfs_reclaim_journal_info_block: Error relocating journal info block (error=%d)\n", error); + } + return error; +} + + +/* + * This function traverses through all extended attribute records for a given + * fileID, and calls function that reclaims data blocks that exist in the + * area of the disk being reclaimed which in turn is responsible for allocating + * new space, copying extent data, deallocating new space, and if required, + * splitting the extent. + * + * Note: The caller has already acquired the cnode lock on the file. Therefore + * we are assured that no other thread would be creating/deleting/modifying + * extended attributes for this file. + * + * Side Effects: + * hfsmp->hfs_resize_blocksmoved is incremented by the number of allocation + * blocks that were relocated. + * + * Returns: + * 0 on success, non-zero on failure. 
+ */ +static int +hfs_reclaim_xattr(struct hfsmount *hfsmp, struct vnode *vp, u_int32_t fileID, u_int32_t allocLimit, vfs_context_t context) +{ + int error = 0; + struct hfs_reclaim_extent_info *extent_info; + int i; + HFSPlusAttrKey *key; + int *lockflags; + + if (hfs_resize_debug) { + printf("hfs_reclaim_xattr: === Start reclaiming xattr for id=%u ===\n", fileID); + } + + MALLOC(extent_info, struct hfs_reclaim_extent_info *, + sizeof(struct hfs_reclaim_extent_info), M_TEMP, M_WAITOK); + if (extent_info == NULL) { + return ENOMEM; + } + bzero(extent_info, sizeof(struct hfs_reclaim_extent_info)); + extent_info->vp = vp; + extent_info->fileID = fileID; + extent_info->is_xattr = true; + extent_info->is_sysfile = vnode_issystem(vp); + extent_info->fcb = VTOF(hfsmp->hfs_attribute_vp); + lockflags = &(extent_info->lockflags); + *lockflags = SFL_ATTRIBUTE | SFL_BITMAP; + + /* Initialize iterator from the extent_info structure */ + MALLOC(extent_info->iterator, struct BTreeIterator *, + sizeof(struct BTreeIterator), M_TEMP, M_WAITOK); + if (extent_info->iterator == NULL) { + error = ENOMEM; + goto out; + } + bzero(extent_info->iterator, sizeof(struct BTreeIterator)); + + /* Build attribute key */ + key = (HFSPlusAttrKey *)&(extent_info->iterator->key); + error = hfs_buildattrkey(fileID, NULL, key); + if (error) { + goto out; + } + + /* Initialize btdata from extent_info structure. Note that the + * buffer pointer actually points to the xattr record from the + * extent_info structure itself. + */ + extent_info->btdata.bufferAddress = &(extent_info->record.xattr); + extent_info->btdata.itemSize = sizeof(HFSPlusAttrRecord); + extent_info->btdata.itemCount = 1; + + /* + * Sync all extent-based attribute data to the disk. + * + * All extent-based attribute data I/O is performed via cluster + * I/O using a virtual file that spans across entire file system + * space. 
+ */ + hfs_lock_truncate(VTOC(hfsmp->hfs_attrdata_vp), HFS_EXCLUSIVE_LOCK); + (void)cluster_push(hfsmp->hfs_attrdata_vp, 0); + error = vnode_waitforwrites(hfsmp->hfs_attrdata_vp, 0, 0, 0, "hfs_reclaim_xattr"); + hfs_unlock_truncate(VTOC(hfsmp->hfs_attrdata_vp), 0); + if (error) { + goto out; + } + + /* Search for extended attribute for current file. This + * will place the iterator before the first matching record. + */ + *lockflags = hfs_systemfile_lock(hfsmp, *lockflags, HFS_EXCLUSIVE_LOCK); + error = BTSearchRecord(extent_info->fcb, extent_info->iterator, + &(extent_info->btdata), &(extent_info->recordlen), + extent_info->iterator); + hfs_systemfile_unlock(hfsmp, *lockflags); + if (error) { + if (error != btNotFound) { + goto out; + } + /* btNotFound is expected here, so just mask it */ + error = 0; + } + + while (1) { + /* Iterate to the next record */ + *lockflags = hfs_systemfile_lock(hfsmp, *lockflags, HFS_EXCLUSIVE_LOCK); + error = BTIterateRecord(extent_info->fcb, kBTreeNextRecord, + extent_info->iterator, &(extent_info->btdata), + &(extent_info->recordlen)); + hfs_systemfile_unlock(hfsmp, *lockflags); + + /* Stop the iteration if we encounter end of btree or xattr with different fileID */ + if (error || key->fileID != fileID) { + if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) { + error = 0; + } + break; + } + + /* We only care about extent-based EAs */ + if ((extent_info->record.xattr.recordType != kHFSPlusAttrForkData) && + (extent_info->record.xattr.recordType != kHFSPlusAttrExtents)) { + continue; + } + + if (extent_info->record.xattr.recordType == kHFSPlusAttrForkData) { + extent_info->overflow_count = 0; + extent_info->extents = extent_info->record.xattr.forkData.theFork.extents; + } else if (extent_info->record.xattr.recordType == kHFSPlusAttrExtents) { + extent_info->overflow_count++; + extent_info->extents = extent_info->record.xattr.overflowExtents.extents; + } + + extent_info->recStartBlock = key->startBlock; + for (i = 0; 
i < kHFSPlusExtentDensity; i++) { + if (extent_info->extents[i].blockCount == 0) { + break; + } + extent_info->extent_index = i; + error = hfs_reclaim_extent(hfsmp, allocLimit, extent_info, context); + if (error) { + printf ("hfs_reclaim_xattr: fileID=%u hfs_reclaim_extent error=%d\n", fileID, error); + goto out; + } + } + } + +out: + /* If any blocks were relocated, account them and report progress */ + if (extent_info->blocks_relocated) { + hfsmp->hfs_resize_blocksmoved += extent_info->blocks_relocated; + hfs_truncatefs_progress(hfsmp); + } + if (extent_info->iterator) { + FREE(extent_info->iterator, M_TEMP); + } + if (extent_info) { + FREE(extent_info, M_TEMP); + } + if (hfs_resize_debug) { + printf("hfs_reclaim_xattr: === Finished relocating xattr for fileid=%u (error=%d) ===\n", fileID, error); + } + return error; +} + +/* + * Reclaim any extent-based extended attributes allocation blocks from + * the area of the disk that is being truncated. + * + * The function traverses the attribute btree to find out the fileIDs + * of the extended attributes that need to be relocated. For every + * file whose large EA requires relocation, it looks up the cnode and + * calls hfs_reclaim_xattr() to do all the work for allocating + * new space, copying data, deallocating old space, and if required, + * splitting the extents. + * + * Inputs: + * allocLimit - starting block of the area being reclaimed + * + * Returns: + * returns 0 on success, non-zero on failure. 
+ */
+static int
+hfs_reclaim_xattrspace(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context)
+{
+	int error = 0;
+	FCB *fcb;				/* VTOF() of hfsmp->hfs_attribute_vp, the B-tree iterated below */
+	struct BTreeIterator *iterator = NULL;	/* kmem_alloc()ed here, kmem_free()d before the final return */
+	struct FSBufferDescriptor btdata;	/* describes rec as the single-item record buffer */
+	HFSPlusAttrKey *key;			/* points at iterator->key; read after every BTIterateRecord() */
+	HFSPlusAttrRecord rec;
+	int lockflags = 0;
+	cnid_t prev_fileid = 0;			/* fileID last relocated successfully; its later records are skipped */
+	struct vnode *vp;
+	int need_relocate;
+	int btree_operation;			/* kBTreeFirstRecord for the first call, kBTreeNextRecord afterwards */
+	u_int32_t files_moved = 0;		/* number of files for which hfs_reclaim_xattr() returned 0 */
+	u_int32_t prev_blocksmoved;
+	int i;
+
+	fcb = VTOF(hfsmp->hfs_attribute_vp);
+	/* Snapshot the volume-wide moved-block counter on entry so that the
+	 * summary printf at the end reports only the difference accumulated
+	 * between entry and exit of this function.
+	 */
+	prev_blocksmoved = hfsmp->hfs_resize_blocksmoved;
+
+	if (kmem_alloc(kernel_map, (vm_offset_t *)&iterator, sizeof(*iterator))) {
+		return ENOMEM;	/* nothing else has been acquired yet, so return directly */
+	}
+	bzero(iterator, sizeof(*iterator));
+	key = (HFSPlusAttrKey *)&iterator->key;
+	btdata.bufferAddress = &rec;
+	btdata.itemSize = sizeof(rec);
+	btdata.itemCount = 1;
+
+	need_relocate = false;
+	btree_operation = kBTreeFirstRecord;
+	/* Traverse the attribute btree to find extent-based EAs to reclaim.
+	 * The attribute file lock (SFL_ATTRIBUTE, shared) is held only around
+	 * each BTIterateRecord() call and is dropped again before the record
+	 * is examined or any relocation is attempted.
+	 */
+	while (1) {
+		lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK);
+		error = BTIterateRecord(fcb, btree_operation, iterator, &btdata, NULL);
+		hfs_systemfile_unlock(hfsmp, lockflags);
+		if (error) {
+			if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
+				error = 0;	/* these two codes end the walk without being returned as a failure */
+			}
+			break;
+		}
+		btree_operation = kBTreeNextRecord;
+
+		/* If the extents of current fileID were already relocated, skip it */
+		if (prev_fileid == key->fileID) {
+			continue;
+		}
+
+		/* Check if any of the extents in the current record need to be relocated.
+		 * An extent qualifies when startBlock + blockCount is greater than
+		 * allocLimit.  Each scan stops at the first extent whose blockCount
+		 * is zero.  Record types other than kHFSPlusAttrForkData and
+		 * kHFSPlusAttrExtents match no case, so need_relocate stays false.
+		 */
+		need_relocate = false;
+		switch(rec.recordType) {
+		case kHFSPlusAttrForkData:
+			for (i = 0; i < kHFSPlusExtentDensity; i++) {
+				if (rec.forkData.theFork.extents[i].blockCount == 0) {
+					break;
+				}
+				if ((rec.forkData.theFork.extents[i].startBlock +
+				     rec.forkData.theFork.extents[i].blockCount) > allocLimit) {
+					need_relocate = true;
+					break;
+				}
+			}
+			break;
+
+		case kHFSPlusAttrExtents:
+			for (i = 0; i < kHFSPlusExtentDensity; i++) {
+				if (rec.overflowExtents.extents[i].blockCount == 0) {
+					break;
+				}
+				if ((rec.overflowExtents.extents[i].startBlock +
+				     rec.overflowExtents.extents[i].blockCount) > allocLimit) {
+					need_relocate = true;
+					break;
+				}
+			}
+			break;
+		};
+
+		/* Continue iterating to next attribute record */
+		if (need_relocate == false) {
+			continue;
+		}
+
+		/* Look up the vnode for corresponding file.  The cnode
+		 * will be locked which will ensure that no one modifies
+		 * the xattrs when we are relocating them.
+		 *
+		 * We want to allow open-unlinked files to be moved,
+		 * so provide allow_deleted == 1 for hfs_vget().
+		 *
+		 * A failed lookup is not treated as an error: the record
+		 * is skipped and the walk continues.
+		 */
+		if (hfs_vget(hfsmp, key->fileID, &vp, 0, 1) != 0) {
+			continue;
+		}
+
+		error = hfs_reclaim_xattr(hfsmp, vp, key->fileID, allocLimit, context);
+		hfs_unlock(VTOC(vp));	/* lock and vnode reference are dropped on both the success and the error path */
+		vnode_put(vp);
+		if (error) {
+			printf ("hfs_reclaim_xattrspace: Error relocating xattrs for fileid=%u (error=%d)\n", key->fileID, error);
+			break;
+		}
+		prev_fileid = key->fileID;	/* later records of this file hit the skip check at the top of the loop */
+		files_moved++;
+	}
+
+	if (files_moved) {
+		printf("hfs_reclaim_xattrspace: Relocated %u xattr blocks from %u files on \"%s\"\n",
+		       (hfsmp->hfs_resize_blocksmoved - prev_blocksmoved),
+		       files_moved, hfsmp->vcbVN);
 	}
-	error = hfs_journal_flush(hfsmp);
-	if (error) {
-		printf("hfs_reclaim_journal_info_block: journal_flush returned %d\n", error);
+
+	kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator));	/* single exit point for every path after the allocation */
+	return error;
+}
+
+/*
+ * Reclaim blocks from regular files.
+ *
+ * This function iterates over all the record in catalog btree looking
+ * for files with extents that overlap into the space we're trying to
+ * free up. If a file extent requires relocation, it looks up the vnode
+ * and calls function to relocate the data.
+ *
+ * Returns:
+ * 	Zero on success, non-zero on failure.
+ */
+static int
+hfs_reclaim_filespace(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context)
+{
+	int error;				/* first assigned by BTIterateRecord() at the top of the loop */
+	FCB *fcb;				/* VTOF() of hfsmp->hfs_catalog_vp, the B-tree iterated below */
+	struct BTreeIterator *iterator = NULL;	/* kmem_alloc()ed here, kmem_free()d before the final return */
+	struct FSBufferDescriptor btdata;	/* describes filerec as the single-item record buffer */
+	int btree_operation;			/* kBTreeFirstRecord for the first call, kBTreeNextRecord afterwards */
+	int lockflags;
+	struct HFSPlusCatalogFile filerec;
+	struct vnode *vp;
+	struct vnode *rvp;			/* resource fork vnode, or vp itself when vnode_isdir(vp) is true */
+	struct filefork *datafork;
+	u_int32_t files_moved = 0;		/* files that reached the end of a loop iteration without error */
+	u_int32_t prev_blocksmoved;
+
+	fcb = VTOF(hfsmp->hfs_catalog_vp);
+	/* Store the value to print total blocks moved by this function at the end.
+	 * The summary printf reports hfs_resize_blocksmoved minus this snapshot.
+	 */
+	prev_blocksmoved = hfsmp->hfs_resize_blocksmoved;
+
+	if (kmem_alloc(kernel_map, (vm_offset_t *)&iterator, sizeof(*iterator))) {
+		return ENOMEM;	/* nothing else has been acquired yet, so return directly */
 	}
+	bzero(iterator, sizeof(*iterator));
+
+	btdata.bufferAddress = &filerec;
+	btdata.itemSize = sizeof(filerec);
+	btdata.itemCount = 1;
+
+	btree_operation = kBTreeFirstRecord;
+	while (1) {
+		lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);	/* held only around the iterate call */
+		error = BTIterateRecord(fcb, btree_operation, iterator, &btdata, NULL);
+		hfs_systemfile_unlock(hfsmp, lockflags);
+		if (error) {
+			if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
+				error = 0;	/* these two codes end the walk without being returned as a failure */
+			}
+			break;
+		}
+		btree_operation = kBTreeNextRecord;
+
+		if (filerec.recordType != kHFSPlusFileRecord) {
+			continue;	/* only file records are examined */
+		}
+
+		/* Check if any of the extents require relocation */
+		if (hfs_file_extent_overlaps(hfsmp, allocLimit, &filerec) == false) {
+			continue;
+		}
-	if (!error && hfs_resize_debug) {
-		printf ("hfs_reclaim_journal_info_block: Successfully relocated journal info block from (%u,%u) to (%u,%u)\n", oldBlock, blockCount, newBlock, blockCount);
+		/* We want to allow open-unlinked files to be moved, so allow_deleted == 1.
+		 * A failed lookup is not treated as an error: the record is skipped.
+		 */
+		if (hfs_vget(hfsmp, filerec.fileID, &vp, 0, 1) != 0) {
+			continue;
+		}
+
+		/* If data fork exists or item is a directory hard link, relocate blocks.
+		 *
+		 * Every error path below drops the cnode lock and the vnode
+		 * reference on vp before breaking out of the loop.
+		 *
+		 * NOTE(review): the three error printfs in this loop are prefixed
+		 * hfs_reclaimspace although they are issued from
+		 * hfs_reclaim_filespace.
+		 */
+		datafork = VTOF(vp);
+		if ((datafork && datafork->ff_blocks > 0) || vnode_isdir(vp)) {
+			error = hfs_reclaim_file(hfsmp, vp, filerec.fileID,
+						 kHFSDataForkType, allocLimit, context);
+			if (error) {
+				printf ("hfs_reclaimspace: Error reclaiming datafork blocks of fileid=%u (error=%d)\n", filerec.fileID, error);
+				hfs_unlock(VTOC(vp));
+				vnode_put(vp);
+				break;
+			}
+		}
+
+		/* If resource fork exists or item is a directory hard link, relocate blocks.
+		 * The resource fork is considered present when the cnode block count
+		 * exceeds the data fork block count (zero when datafork is NULL).
+		 */
+		if (((VTOC(vp)->c_blocks - (datafork ? datafork->ff_blocks : 0)) > 0) || vnode_isdir(vp)) {
+			if (vnode_isdir(vp)) {
+				/* Resource fork vnode lookup is invalid for directory hard link.
+				 * So we fake data fork vnode as resource fork vnode.
+				 */
+				rvp = vp;
+			} else {
+				error = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE, FALSE);
+				if (error) {
+					printf ("hfs_reclaimspace: Error looking up rvp for fileid=%u (error=%d)\n", filerec.fileID, error);
+					hfs_unlock(VTOC(vp));
+					vnode_put(vp);
+					break;
+				}
+				VTOC(rvp)->c_flag |= C_NEED_RVNODE_PUT;	/* no vnode_put(rvp) appears in this function; presumably this flag defers it -- TODO confirm */
+			}
+
+			error = hfs_reclaim_file(hfsmp, rvp, filerec.fileID,
+						 kHFSResourceForkType, allocLimit, context);
+			if (error) {
+				printf ("hfs_reclaimspace: Error reclaiming rsrcfork blocks of fileid=%u (error=%d)\n", filerec.fileID, error);
+				hfs_unlock(VTOC(vp));
+				vnode_put(vp);
+				break;
+			}
+		}
+
+		/* The file forks were relocated successfully, now drop the
+		 * cnode lock and vnode reference, and continue iterating to
+		 * next catalog record.
+		 */
+		hfs_unlock(VTOC(vp));
+		vnode_put(vp);
+		files_moved++;
 	}
-	return error;
-free_fail:
-	(void) BlockDeallocate(hfsmp, newBlock, blockCount, HFS_ALLOC_SKIPFREEBLKS);
-fail:
-	hfs_systemfile_unlock(hfsmp, lockflags);
-	(void) hfs_end_transaction(hfsmp);
-	if (hfs_resize_debug) {
-		printf ("hfs_reclaim_journal_info_block: Error relocating journal info block (error=%d)\n", error);
+	if (files_moved) {
+		printf("hfs_reclaim_filespace: Relocated %u blocks from %u files on \"%s\"\n",
+		       (hfsmp->hfs_resize_blocksmoved - prev_blocksmoved),
+		       files_moved, hfsmp->vcbVN);
 	}
+
+	kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator));	/* single exit point for every path after the allocation */
 	return error;
 }
-
 /*
  * Reclaim space at the end of a file system.
  *
  * Inputs - 
- *    startblk    - start block of the space being reclaimed
+ *    allocLimit  - start block of the space being reclaimed
  *    reclaimblks - number of allocation blocks to reclaim
  */
 static int
-hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t startblk, u_int32_t reclaimblks, vfs_context_t context)
+hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t allocLimit, u_int32_t reclaimblks, vfs_context_t context)
 {
-	struct vnode *vp = NULL;
-	FCB *fcb;
-	struct BTreeIterator * iterator = NULL;
-	struct FSBufferDescriptor btdata;
-	struct HFSPlusCatalogFile filerec;
-	u_int32_t saved_next_allocation;
-	cnid_t * cnidbufp;
-	size_t cnidbufsize;
-	int filecnt = 0;
-	int maxfilecnt;
-	u_int32_t block;
-	int lockflags;
-	int i, j;
-	int error;
-	int lastprogress = 0;
-	u_int32_t blks_moved = 0;
-	u_int32_t total_blks_moved = 0;
-	Boolean need_relocate;
+	int error = 0;
+
+	/*
+	 * Preflight the bitmap to find out total number of blocks that need
+	 * relocation.
+	 *
+	 * Note: Since allocLimit is set to the location of new alternate volume
+	 * header, the check below does not account for blocks allocated for old
+	 * alternate volume header.
+ */ + error = hfs_count_allocated(hfsmp, allocLimit, reclaimblks, &(hfsmp->hfs_resize_totalblocks)); + if (error) { + printf ("hfs_reclaimspace: Unable to determine total blocks to reclaim error=%d\n", error); + return error; + } + if (hfs_resize_debug) { + printf ("hfs_reclaimspace: Total number of blocks to reclaim = %u\n", hfsmp->hfs_resize_totalblocks); + } /* Relocate extents of the Allocation file if they're in the way. */ - error = hfs_reclaim_file(hfsmp, hfsmp->hfs_allocation_vp, startblk, SFL_BITMAP, &blks_moved, context); + error = hfs_reclaim_file(hfsmp, hfsmp->hfs_allocation_vp, kHFSAllocationFileID, + kHFSDataForkType, allocLimit, context); if (error) { printf("hfs_reclaimspace: reclaim allocation file returned %d\n", error); return error; } - total_blks_moved += blks_moved; /* Relocate extents of the Extents B-tree if they're in the way. */ - error = hfs_reclaim_file(hfsmp, hfsmp->hfs_extents_vp, startblk, SFL_EXTENTS, &blks_moved, context); + error = hfs_reclaim_file(hfsmp, hfsmp->hfs_extents_vp, kHFSExtentsFileID, + kHFSDataForkType, allocLimit, context); if (error) { printf("hfs_reclaimspace: reclaim extents b-tree returned %d\n", error); return error; } - total_blks_moved += blks_moved; /* Relocate extents of the Catalog B-tree if they're in the way. */ - error = hfs_reclaim_file(hfsmp, hfsmp->hfs_catalog_vp, startblk, SFL_CATALOG, &blks_moved, context); + error = hfs_reclaim_file(hfsmp, hfsmp->hfs_catalog_vp, kHFSCatalogFileID, + kHFSDataForkType, allocLimit, context); if (error) { printf("hfs_reclaimspace: reclaim catalog b-tree returned %d\n", error); return error; } - total_blks_moved += blks_moved; /* Relocate extents of the Attributes B-tree if they're in the way. 
*/ - error = hfs_reclaim_file(hfsmp, hfsmp->hfs_attribute_vp, startblk, SFL_ATTRIBUTE, &blks_moved, context); + error = hfs_reclaim_file(hfsmp, hfsmp->hfs_attribute_vp, kHFSAttributesFileID, + kHFSDataForkType, allocLimit, context); if (error) { printf("hfs_reclaimspace: reclaim attribute b-tree returned %d\n", error); return error; } - total_blks_moved += blks_moved; /* Relocate extents of the Startup File if there is one and they're in the way. */ - error = hfs_reclaim_file(hfsmp, hfsmp->hfs_startup_vp, startblk, SFL_STARTUP, &blks_moved, context); + error = hfs_reclaim_file(hfsmp, hfsmp->hfs_startup_vp, kHFSStartupFileID, + kHFSDataForkType, allocLimit, context); if (error) { printf("hfs_reclaimspace: reclaim startup file returned %d\n", error); return error; } - total_blks_moved += blks_moved; /* * We need to make sure the alternate volume header gets flushed if we moved @@ -4922,249 +6539,98 @@ hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t startblk, u_int32_t reclaimbl * shrinking the size of the volume, or else the journal code will panic * with an invalid (too large) block number. * - * Note that total_blks_moved will be set if ANY extent was moved, even + * Note that hfsmp->hfs_resize_blocksmoved will be nonzero if ANY extent was moved, even * if it was just an overflow extent. In this case, the journal_flush isn't * strictly required, but shouldn't hurt. 
*/ - if (total_blks_moved) { - hfs_journal_flush(hfsmp); + if (hfsmp->hfs_resize_blocksmoved) { + hfs_journal_flush(hfsmp, FALSE); } - if (hfsmp->jnl_start + (hfsmp->jnl_size / hfsmp->blockSize) > startblk) { - error = hfs_reclaim_journal_file(hfsmp, context); - if (error) { - printf("hfs_reclaimspace: hfs_reclaim_journal_file failed (%d)\n", error); - return error; - } - } - - if (hfsmp->vcbJinfoBlock >= startblk) { - error = hfs_reclaim_journal_info_block(hfsmp, context); - if (error) { - printf("hfs_reclaimspace: hfs_reclaim_journal_info_block failed (%d)\n", error); - return error; - } + /* Relocate journal file blocks if they're in the way. */ + error = hfs_reclaim_journal_file(hfsmp, allocLimit, context); + if (error) { + printf("hfs_reclaimspace: hfs_reclaim_journal_file failed (%d)\n", error); + return error; } - /* For now move a maximum of 250,000 files. */ - maxfilecnt = MIN(hfsmp->hfs_filecount, 250000); - maxfilecnt = MIN((u_int32_t)maxfilecnt, reclaimblks); - cnidbufsize = maxfilecnt * sizeof(cnid_t); - if (kmem_alloc(kernel_map, (vm_offset_t *)&cnidbufp, cnidbufsize)) { - return (ENOMEM); - } - if (kmem_alloc(kernel_map, (vm_offset_t *)&iterator, sizeof(*iterator))) { - kmem_free(kernel_map, (vm_offset_t)cnidbufp, cnidbufsize); - return (ENOMEM); - } - - saved_next_allocation = hfsmp->nextAllocation; - /* Always try allocating new blocks after the metadata zone */ - HFS_UPDATE_NEXT_ALLOCATION(hfsmp, hfsmp->hfs_metazone_start); - - fcb = VTOF(hfsmp->hfs_catalog_vp); - bzero(iterator, sizeof(*iterator)); - - btdata.bufferAddress = &filerec; - btdata.itemSize = sizeof(filerec); - btdata.itemCount = 1; - - /* Keep the Catalog and extents files locked during iteration. */ - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_EXTENTS, HFS_SHARED_LOCK); - - error = BTIterateRecord(fcb, kBTreeFirstRecord, iterator, NULL, NULL); + /* Relocate journal info block blocks if they're in the way. 
*/ + error = hfs_reclaim_journal_info_block(hfsmp, allocLimit, context); if (error) { - goto end_iteration; - } - /* - * Iterate over all the catalog records looking for files - * that overlap into the space we're trying to free up and - * the total number of blocks that will require relocation. - */ - for (filecnt = 0; filecnt < maxfilecnt; ) { - error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL); - if (error) { - if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) { - error = 0; - } - break; - } - if (filerec.recordType != kHFSPlusFileRecord) { - continue; - } - - need_relocate = false; - /* Check if data fork overlaps the target space */ - for (i = 0; i < kHFSPlusExtentDensity; ++i) { - if (filerec.dataFork.extents[i].blockCount == 0) { - break; - } - block = filerec.dataFork.extents[i].startBlock + - filerec.dataFork.extents[i].blockCount; - if (block >= startblk) { - if ((filerec.fileID == hfsmp->hfs_jnlfileid) || - (filerec.fileID == hfsmp->hfs_jnlinfoblkid)) { - printf("hfs_reclaimspace: cannot move active journal\n"); - error = EPERM; - goto end_iteration; - } - need_relocate = true; - goto save_fileid; - } - } - - /* Check if resource fork overlaps the target space */ - for (j = 0; j < kHFSPlusExtentDensity; ++j) { - if (filerec.resourceFork.extents[j].blockCount == 0) { - break; - } - block = filerec.resourceFork.extents[j].startBlock + - filerec.resourceFork.extents[j].blockCount; - if (block >= startblk) { - need_relocate = true; - goto save_fileid; - } - } - - /* Check if any forks' overflow extents overlap the target space */ - if ((i == kHFSPlusExtentDensity) || (j == kHFSPlusExtentDensity)) { - if (hfs_overlapped_overflow_extents(hfsmp, startblk, filerec.fileID)) { - need_relocate = true; - goto save_fileid; - } - } - -save_fileid: - if (need_relocate == true) { - cnidbufp[filecnt++] = filerec.fileID; - if (hfs_resize_debug) { - printf ("hfs_reclaimspace: Will relocate extents for fileID=%u\n", filerec.fileID); 
- } - } + printf("hfs_reclaimspace: hfs_reclaim_journal_info_block failed (%d)\n", error); + return error; } -end_iteration: - /* If no regular file was found to be relocated and - * no system file was moved, we probably do not have - * enough space to relocate the system files, or - * something else went wrong. - */ - if ((filecnt == 0) && (total_blks_moved == 0)) { - printf("hfs_reclaimspace: no files moved\n"); - error = ENOSPC; + /* Reclaim extents from catalog file records */ + error = hfs_reclaim_filespace(hfsmp, allocLimit, context); + if (error) { + printf ("hfs_reclaimspace: hfs_reclaim_filespace returned error=%d\n", error); + return error; } - /* All done with catalog. */ - hfs_systemfile_unlock(hfsmp, lockflags); - if (error || filecnt == 0) - goto out; - - hfsmp->hfs_resize_filesmoved = 0; - hfsmp->hfs_resize_totalfiles = filecnt; - - /* Now move any files that are in the way. */ - for (i = 0; i < filecnt; ++i) { - struct vnode *rvp; - struct cnode *cp; - struct filefork *datafork; - - if (hfs_vget(hfsmp, cnidbufp[i], &vp, 0) != 0) - continue; - - cp = VTOC(vp); - datafork = VTOF(vp); - - /* Relocating directory hard links is not supported, so we punt (see radar 6217026). */ - if ((cp->c_flag & C_HARDLINK) && vnode_isdir(vp)) { - printf("hfs_reclaimspace: Unable to relocate directory hard link id=%d\n", cp->c_cnid); - error = EINVAL; - goto out; - } - - /* Relocate any overlapping data fork blocks. */ - if (datafork && datafork->ff_blocks > 0) { - error = hfs_reclaim_file(hfsmp, vp, startblk, 0, &blks_moved, context); - if (error) { - printf ("hfs_reclaimspace: Error reclaiming datafork blocks of fileid=%u (error=%d)\n", cnidbufp[i], error); - break; - } - total_blks_moved += blks_moved; - } - - /* Relocate any overlapping resource fork blocks. */ - if ((cp->c_blocks - (datafork ? 
datafork->ff_blocks : 0)) > 0) { - error = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE, TRUE); - if (error) { - printf ("hfs_reclaimspace: Error looking up rvp for fileid=%u (error=%d)\n", cnidbufp[i], error); - break; - } - error = hfs_reclaim_file(hfsmp, rvp, startblk, 0, &blks_moved, context); - VTOC(rvp)->c_flag |= C_NEED_RVNODE_PUT; - if (error) { - printf ("hfs_reclaimspace: Error reclaiming rsrcfork blocks of fileid=%u (error=%d)\n", cnidbufp[i], error); - break; - } - total_blks_moved += blks_moved; - } - hfs_unlock(cp); - vnode_put(vp); - vp = NULL; - - ++hfsmp->hfs_resize_filesmoved; - /* Report intermediate progress. */ - if (filecnt > 100) { - int progress; - - progress = (i * 100) / filecnt; - if (progress > (lastprogress + 9)) { - printf("hfs_reclaimspace: %d%% done...\n", progress); - lastprogress = progress; - } - } - } - if (vp) { - hfs_unlock(VTOC(vp)); - vnode_put(vp); - vp = NULL; - } - if (hfsmp->hfs_resize_filesmoved != 0) { - printf("hfs_reclaimspace: relocated %u blocks from %d files on \"%s\"\n", - total_blks_moved, (int)hfsmp->hfs_resize_filesmoved, hfsmp->vcbVN); + /* Reclaim extents from extent-based extended attributes, if any */ + error = hfs_reclaim_xattrspace(hfsmp, allocLimit, context); + if (error) { + printf ("hfs_reclaimspace: hfs_reclaim_xattrspace returned error=%d\n", error); + return error; } -out: - kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator)); - kmem_free(kernel_map, (vm_offset_t)cnidbufp, cnidbufsize); - /* - * Restore the roving allocation pointer on errors. - * (but only if we didn't move any files) - */ - if (error && hfsmp->hfs_resize_filesmoved == 0) { - HFS_UPDATE_NEXT_ALLOCATION(hfsmp, saved_next_allocation); - } - return (error); + return error; } /* - * Check if there are any overflow data or resource fork extents that overlap + * Check if there are any extents (including overflow extents) that overlap * into the disk space that is being reclaimed. 
* * Output - - * 1 - One of the overflow extents need to be relocated - * 0 - No overflow extents need to be relocated, or there was an error + * true - One of the extents need to be relocated + * false - No overflow extents need to be relocated, or there was an error */ static int -hfs_overlapped_overflow_extents(struct hfsmount *hfsmp, u_int32_t startblk, u_int32_t fileID) +hfs_file_extent_overlaps(struct hfsmount *hfsmp, u_int32_t allocLimit, struct HFSPlusCatalogFile *filerec) { struct BTreeIterator * iterator = NULL; struct FSBufferDescriptor btdata; HFSPlusExtentRecord extrec; HFSPlusExtentKey *extkeyptr; FCB *fcb; - int overlapped = 0; - int i; + int overlapped = false; + int i, j; int error; + int lockflags = 0; + u_int32_t endblock; + + /* Check if data fork overlaps the target space */ + for (i = 0; i < kHFSPlusExtentDensity; ++i) { + if (filerec->dataFork.extents[i].blockCount == 0) { + break; + } + endblock = filerec->dataFork.extents[i].startBlock + + filerec->dataFork.extents[i].blockCount; + if (endblock > allocLimit) { + overlapped = true; + goto out; + } + } + + /* Check if resource fork overlaps the target space */ + for (j = 0; j < kHFSPlusExtentDensity; ++j) { + if (filerec->resourceFork.extents[j].blockCount == 0) { + break; + } + endblock = filerec->resourceFork.extents[j].startBlock + + filerec->resourceFork.extents[j].blockCount; + if (endblock > allocLimit) { + overlapped = true; + goto out; + } + } + + /* Return back if there are no overflow extents for this file */ + if ((i < kHFSPlusExtentDensity) && (j < kHFSPlusExtentDensity)) { + goto out; + } if (kmem_alloc(kernel_map, (vm_offset_t *)&iterator, sizeof(*iterator))) { return 0; @@ -5173,7 +6639,7 @@ hfs_overlapped_overflow_extents(struct hfsmount *hfsmp, u_int32_t startblk, u_in extkeyptr = (HFSPlusExtentKey *)&iterator->key; extkeyptr->keyLength = kHFSPlusExtentKeyMaximumLength; extkeyptr->forkType = 0; - extkeyptr->fileID = fileID; + extkeyptr->fileID = filerec->fileID; 
extkeyptr->startBlock = 0; btdata.bufferAddress = &extrec; @@ -5182,6 +6648,8 @@ hfs_overlapped_overflow_extents(struct hfsmount *hfsmp, u_int32_t startblk, u_in fcb = VTOF(hfsmp->hfs_extents_vp); + lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_SHARED_LOCK); + /* This will position the iterator just before the first overflow * extent record for given fileID. It will always return btNotFound, * so we special case the error code. @@ -5197,7 +6665,7 @@ hfs_overlapped_overflow_extents(struct hfsmount *hfsmp, u_int32_t startblk, u_in error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL); while (error == 0) { /* Stop when we encounter a different file. */ - if (extkeyptr->fileID != fileID) { + if (extkeyptr->fileID != filerec->fileID) { break; } /* Check if any of the forks exist in the target space. */ @@ -5205,8 +6673,9 @@ hfs_overlapped_overflow_extents(struct hfsmount *hfsmp, u_int32_t startblk, u_in if (extrec[i].blockCount == 0) { break; } - if ((extrec[i].startBlock + extrec[i].blockCount) >= startblk) { - overlapped = 1; + endblock = extrec[i].startBlock + extrec[i].blockCount; + if (endblock > allocLimit) { + overlapped = true; goto out; } } @@ -5215,7 +6684,12 @@ hfs_overlapped_overflow_extents(struct hfsmount *hfsmp, u_int32_t startblk, u_in } out: - kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator)); + if (lockflags) { + hfs_systemfile_unlock(hfsmp, lockflags); + } + if (iterator) { + kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator)); + } return overlapped; } @@ -5231,10 +6705,11 @@ hfs_resize_progress(struct hfsmount *hfsmp, u_int32_t *progress) return (ENXIO); } - if (hfsmp->hfs_resize_totalfiles > 0) - *progress = (hfsmp->hfs_resize_filesmoved * 100) / hfsmp->hfs_resize_totalfiles; - else + if (hfsmp->hfs_resize_totalblocks > 0) { + *progress = (u_int32_t)((hfsmp->hfs_resize_blocksmoved * 100ULL) / hfsmp->hfs_resize_totalblocks); + } else { *progress = 0; + } return (0); } @@ -5270,6 +6745,7 @@ 
hfs_vfs_getattr(struct mount *mp, struct vfs_attr *fsap, __unused vfs_context_t { #define HFS_ATTR_CMN_VALIDMASK (ATTR_CMN_VALIDMASK & ~(ATTR_CMN_NAMEDATTRCOUNT | ATTR_CMN_NAMEDATTRLIST)) #define HFS_ATTR_FILE_VALIDMASK (ATTR_FILE_VALIDMASK & ~(ATTR_FILE_FILETYPE | ATTR_FILE_FORKCOUNT | ATTR_FILE_FORKLIST)) +#define HFS_ATTR_CMN_VOL_VALIDMASK (ATTR_CMN_VALIDMASK & ~(ATTR_CMN_NAMEDATTRCOUNT | ATTR_CMN_NAMEDATTRLIST | ATTR_CMN_ACCTIME)) ExtendedVCB *vcb = VFSTOVCB(mp); struct hfsmount *hfsmp = VFSTOHFS(mp); @@ -5396,20 +6872,20 @@ hfs_vfs_getattr(struct mount *mp, struct vfs_attr *fsap, __unused vfs_context_t if (VFSATTR_IS_ACTIVE(fsap, f_attributes)) { vol_attributes_attr_t *attrp = &fsap->f_attributes; - attrp->validattr.commonattr = HFS_ATTR_CMN_VALIDMASK; + attrp->validattr.commonattr = HFS_ATTR_CMN_VOL_VALIDMASK; attrp->validattr.volattr = ATTR_VOL_VALIDMASK & ~ATTR_VOL_INFO; attrp->validattr.dirattr = ATTR_DIR_VALIDMASK; attrp->validattr.fileattr = HFS_ATTR_FILE_VALIDMASK; attrp->validattr.forkattr = 0; - attrp->nativeattr.commonattr = HFS_ATTR_CMN_VALIDMASK; + attrp->nativeattr.commonattr = HFS_ATTR_CMN_VOL_VALIDMASK; attrp->nativeattr.volattr = ATTR_VOL_VALIDMASK & ~ATTR_VOL_INFO; attrp->nativeattr.dirattr = ATTR_DIR_VALIDMASK; attrp->nativeattr.fileattr = HFS_ATTR_FILE_VALIDMASK; attrp->nativeattr.forkattr = 0; VFSATTR_SET_SUPPORTED(fsap, f_attributes); } - fsap->f_create_time.tv_sec = hfsmp->vcbCrDate; + fsap->f_create_time.tv_sec = hfsmp->hfs_itime; fsap->f_create_time.tv_nsec = 0; VFSATTR_SET_SUPPORTED(fsap, f_create_time); fsap->f_modify_time.tv_sec = hfsmp->vcbLsMod; @@ -5470,6 +6946,10 @@ hfs_rename_volume(struct vnode *vp, const char *name, proc_t p) cat_cookie_t cookie; int lockflags; int error = 0; + char converted_volname[256]; + size_t volname_length = 0; + size_t conv_volname_length = 0; + /* * Ignore attempts to rename a volume to a zero-length name. 
@@ -5504,8 +6984,16 @@ hfs_rename_volume(struct vnode *vp, const char *name, proc_t p) */ if (!error) { strlcpy((char *)vcb->vcbVN, name, sizeof(vcb->vcbVN)); + volname_length = strlen ((const char*)vcb->vcbVN); +#define DKIOCCSSETLVNAME _IOW('d', 198, char[1024]) + /* Send the volume name down to CoreStorage if necessary */ + error = utf8_normalizestr(vcb->vcbVN, volname_length, (u_int8_t*)converted_volname, &conv_volname_length, 256, UTF_PRECOMPOSED); + if (error == 0) { + (void) VNOP_IOCTL (hfsmp->hfs_devvp, DKIOCCSSETLVNAME, converted_volname, 0, vfs_context_current()); + } + error = 0; } - + hfs_systemfile_unlock(hfsmp, lockflags); cat_postflight(hfsmp, &cookie, p); @@ -5604,7 +7092,7 @@ static int hfs_journal_replay(vnode_t devvp, vfs_context_t context) struct hfs_mount_args *args = NULL; /* Replay allowed only on raw devices */ - if (!vnode_ischr(devvp)) { + if (!vnode_ischr(devvp) && !vnode_isblk(devvp)) { retval = EINVAL; goto out; } @@ -5626,7 +7114,10 @@ static int hfs_journal_replay(vnode_t devvp, vfs_context_t context) bzero(args, sizeof(struct hfs_mount_args)); retval = hfs_mountfs(devvp, mp, args, 1, context); - buf_flushdirtyblks(devvp, MNT_WAIT, 0, "hfs_journal_replay"); + buf_flushdirtyblks(devvp, TRUE, 0, "hfs_journal_replay"); + + /* FSYNC the devnode to be sure all data has been flushed */ + retval = VNOP_FSYNC(devvp, MNT_WAIT, context); out: if (mp) { diff --git a/bsd/hfs/hfs_vfsutils.c b/bsd/hfs/hfs_vfsutils.c index 97559487f..103232431 100644 --- a/bsd/hfs/hfs_vfsutils.c +++ b/bsd/hfs/hfs_vfsutils.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -68,6 +69,8 @@ static int hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *v static u_int32_t hfs_hotfile_freeblocks(struct hfsmount *); +#define HFS_MOUNT_DEBUG 1 + //******************************************************************************* // Note: Finder information in the HFS/HFS+ metadata are considered opaque and @@ -87,7 +90,6 @@ unsigned 
char hfs_attrname[] = "Attribute B-tree"; unsigned char hfs_startupname[] = "Startup File"; -__private_extern__ OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb, __unused struct proc *p) { @@ -97,6 +99,7 @@ OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb, struct cat_desc cndesc; struct cat_attr cnattr; struct cat_fork fork; + int newvnode_flags = 0; /* Block size must be a multiple of 512 */ if (SWAP_BE32(mdb->drAlBlkSiz) == 0 || @@ -115,7 +118,7 @@ OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb, * */ vcb->vcbSigWord = SWAP_BE16 (mdb->drSigWord); - vcb->vcbCrDate = to_bsd_time(LocalToUTC(SWAP_BE32(mdb->drCrDate))); + vcb->hfs_itime = to_bsd_time(LocalToUTC(SWAP_BE32(mdb->drCrDate))); vcb->localCreateDate = SWAP_BE32 (mdb->drCrDate); vcb->vcbLsMod = to_bsd_time(LocalToUTC(SWAP_BE32(mdb->drLsMod))); vcb->vcbAtrb = SWAP_BE16 (mdb->drAtrb); @@ -145,12 +148,13 @@ OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb, * volume encoding we use MacRoman as a fallback. */ if (error || (utf8chars == 0)) { - (void) mac_roman_to_utf8(mdb->drVN, NAME_MAX, &utf8chars, vcb->vcbVN); - /* If we fail to encode to UTF8 from Mac Roman, the name is bad. Deny mount */ + error = mac_roman_to_utf8(mdb->drVN, NAME_MAX, &utf8chars, vcb->vcbVN); + /* If we fail to encode to UTF8 from Mac Roman, the name is bad. 
Deny the mount */ if (error) { goto MtVolErr; } } + hfsmp->hfs_logBlockSize = BestBlockSizeFit(vcb->blockSize, MAXBSIZE, hfsmp->hfs_logical_block_size); vcb->vcbVBMIOSize = kHFSBlockSize; @@ -184,11 +188,19 @@ OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb, cnattr.ca_blocks = fork.cf_blocks; error = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &fork, - &hfsmp->hfs_extents_vp); - if (error) goto MtVolErr; + &hfsmp->hfs_extents_vp, &newvnode_flags); + if (error) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfs (std): error creating Ext Vnode (%d) \n", error); + } + goto MtVolErr; + } error = MacToVFSError(BTOpenPath(VTOF(hfsmp->hfs_extents_vp), (KeyCompareProcPtr)CompareExtentKeys)); if (error) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfs (std): error opening Ext Vnode (%d) \n", error); + } hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); goto MtVolErr; } @@ -213,14 +225,20 @@ OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb, cnattr.ca_blocks = fork.cf_blocks; error = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &fork, - &hfsmp->hfs_catalog_vp); + &hfsmp->hfs_catalog_vp, &newvnode_flags); if (error) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfs (std): error creating catalog Vnode (%d) \n", error); + } hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); goto MtVolErr; } error = MacToVFSError(BTOpenPath(VTOF(hfsmp->hfs_catalog_vp), (KeyCompareProcPtr)CompareCatalogKeys)); if (error) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfs (std): error opening catalog Vnode (%d) \n", error); + } hfs_unlock(VTOC(hfsmp->hfs_catalog_vp)); hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); goto MtVolErr; @@ -237,37 +255,41 @@ OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb, cnattr.ca_blocks = 0; error = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &fork, - &hfsmp->hfs_allocation_vp); + &hfsmp->hfs_allocation_vp, &newvnode_flags); if (error) { + if (HFS_MOUNT_DEBUG) { + 
printf("hfs_mounthfs (std): error creating bitmap Vnode (%d) \n", error); + } hfs_unlock(VTOC(hfsmp->hfs_catalog_vp)); hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); goto MtVolErr; } hfsmp->hfs_allocation_cp = VTOC(hfsmp->hfs_allocation_vp); - /* mark the volume dirty (clear clean unmount bit) */ + /* mark the volume dirty (clear clean unmount bit) */ vcb->vcbAtrb &= ~kHFSVolumeUnmountedMask; - if (error == noErr) - { + if (error == noErr) { error = cat_idlookup(hfsmp, kHFSRootFolderID, 0, NULL, NULL, NULL); - } - - if ( error == noErr ) - { - if ( !(vcb->vcbAtrb & kHFSVolumeHardwareLockMask) ) // if the disk is not write protected - { - MarkVCBDirty( vcb ); // mark VCB dirty so it will be written - } - } - + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfs (std): error looking up root folder (%d) \n", error); + } + } + + if (error == noErr) { + /* If the disk isn't write protected.. */ + if ( !(vcb->vcbAtrb & kHFSVolumeHardwareLockMask)) { + MarkVCBDirty (vcb); // mark VCB dirty so it will be written + } + } + /* * all done with system files so we can unlock now... 
*/ hfs_unlock(VTOC(hfsmp->hfs_allocation_vp)); hfs_unlock(VTOC(hfsmp->hfs_catalog_vp)); hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); - + if (error == noErr) { /* If successful, then we can just return once we've unlocked the cnodes */ return error; @@ -275,9 +297,7 @@ OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb, //-- Release any resources allocated so far before exiting with an error: MtVolErr: - ReleaseMetaFileVNode(hfsmp->hfs_catalog_vp); - ReleaseMetaFileVNode(hfsmp->hfs_extents_vp); - ReleaseMetaFileVNode(hfsmp->hfs_allocation_vp); + hfsUnmount(hfsmp, NULL); return (error); } @@ -288,7 +308,6 @@ OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb, // //******************************************************************************* -__private_extern__ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, off_t embeddedOffset, u_int64_t disksize, __unused struct proc *p, void *args, kauth_cred_t cred) { @@ -301,8 +320,12 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, struct BTreeInfoRec btinfo; u_int16_t signature; u_int16_t hfs_version; + int newvnode_flags = 0; int i; OSErr retval; + char converted_volname[256]; + size_t volname_length = 0; + size_t conv_volname_length = 0; signature = SWAP_BE16(vhp->signature); hfs_version = SWAP_BE16(vhp->version); @@ -324,23 +347,38 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, /* Removed printf for invalid HFS+ signature because it gives * false error for UFS root volume */ + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: unknown Volume Signature\n"); + } return (EINVAL); } /* Block size must be at least 512 and a power of 2 */ blockSize = SWAP_BE32(vhp->blockSize); - if (blockSize < 512 || !powerof2(blockSize)) + if (blockSize < 512 || !powerof2(blockSize)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: invalid blocksize (%d) \n", blockSize); + } return 
(EINVAL); + } /* don't mount a writable volume if its dirty, it must be cleaned by fsck_hfs */ if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0 && hfsmp->jnl == NULL && - (SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) == 0) + (SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) == 0) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: cannot mount dirty non-journaled volumes\n"); + } return (EINVAL); + } /* Make sure we can live with the physical block size. */ if ((disksize & (hfsmp->hfs_logical_block_size - 1)) || (embeddedOffset & (hfsmp->hfs_logical_block_size - 1)) || (blockSize < hfsmp->hfs_logical_block_size)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: invalid physical blocksize (%d), hfs_logical_blocksize (%d) \n", + blockSize, hfsmp->hfs_logical_block_size); + } return (ENXIO); } @@ -445,9 +483,12 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, SWAP_BE32 (vhp->extentsFile.extents[i].blockCount); } retval = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cfork, - &hfsmp->hfs_extents_vp); + &hfsmp->hfs_extents_vp, &newvnode_flags); if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: hfs_getnewvnode returned (%d) getting extentoverflow BT\n", retval); + } goto ErrorExit; } hfsmp->hfs_extents_cp = VTOC(hfsmp->hfs_extents_vp); @@ -457,6 +498,9 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, (KeyCompareProcPtr) CompareExtentKeysPlus)); if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: BTOpenPath returned (%d) getting extentoverflow BT\n", retval); + } goto ErrorExit; } /* @@ -478,8 +522,11 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, SWAP_BE32 (vhp->catalogFile.extents[i].blockCount); } retval = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cfork, - &hfsmp->hfs_catalog_vp); + &hfsmp->hfs_catalog_vp, &newvnode_flags); if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: 
hfs_getnewvnode returned (%d) getting catalog BT\n", retval); + } goto ErrorExit; } hfsmp->hfs_catalog_cp = VTOC(hfsmp->hfs_catalog_vp); @@ -488,6 +535,9 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, retval = MacToVFSError(BTOpenPath(VTOF(hfsmp->hfs_catalog_vp), (KeyCompareProcPtr) CompareExtendedCatalogKeys)); if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: BTOpenPath returned (%d) getting catalog BT\n", retval); + } goto ErrorExit; } if ((hfsmp->hfs_flags & HFS_X) && @@ -519,8 +569,11 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, SWAP_BE32 (vhp->allocationFile.extents[i].blockCount); } retval = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cfork, - &hfsmp->hfs_allocation_vp); + &hfsmp->hfs_allocation_vp, &newvnode_flags); if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: hfs_getnewvnode returned (%d) getting bitmap\n", retval); + } goto ErrorExit; } hfsmp->hfs_allocation_cp = VTOC(hfsmp->hfs_allocation_vp); @@ -546,8 +599,11 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, SWAP_BE32 (vhp->attributesFile.extents[i].blockCount); } retval = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cfork, - &hfsmp->hfs_attribute_vp); + &hfsmp->hfs_attribute_vp, &newvnode_flags); if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: hfs_getnewvnode returned (%d) getting EA BT\n", retval); + } goto ErrorExit; } hfsmp->hfs_attribute_cp = VTOC(hfsmp->hfs_attribute_vp); @@ -555,6 +611,22 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, retval = MacToVFSError(BTOpenPath(VTOF(hfsmp->hfs_attribute_vp), (KeyCompareProcPtr) hfs_attrkeycompare)); if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: BTOpenPath returned (%d) getting EA BT\n", retval); + } + goto ErrorExit; + } + + /* Initialize vnode for virtual attribute data file that spans the + * entire file system 
space for performing I/O to attribute btree + * We hold iocount on the attrdata vnode for the entire duration + * of mount (similar to btree vnodes) + */ + retval = init_attrdata_vnode(hfsmp); + if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: init_attrdata_vnode returned (%d) for virtual EA file\n", retval); + } goto ErrorExit; } } @@ -579,8 +651,11 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, SWAP_BE32 (vhp->startupFile.extents[i].blockCount); } retval = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cfork, - &hfsmp->hfs_startup_vp); + &hfsmp->hfs_startup_vp, &newvnode_flags); if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: hfs_getnewvnode returned (%d) getting startup file\n", retval); + } goto ErrorExit; } hfsmp->hfs_startup_cp = VTOC(hfsmp->hfs_startup_vp); @@ -590,13 +665,29 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, /* Pick up volume name and create date */ retval = cat_idlookup(hfsmp, kHFSRootFolderID, 0, &cndesc, &cnattr, NULL); if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: cat_idlookup returned (%d) getting rootfolder \n", retval); + } goto ErrorExit; } - vcb->vcbCrDate = cnattr.ca_itime; + vcb->hfs_itime = cnattr.ca_itime; vcb->volumeNameEncodingHint = cndesc.cd_encoding; bcopy(cndesc.cd_nameptr, vcb->vcbVN, min(255, cndesc.cd_namelen)); + volname_length = strlen ((const char*)vcb->vcbVN); cat_releasedesc(&cndesc); + +#define DKIOCCSSETLVNAME _IOW('d', 198, char[1024]) + + /* Send the volume name down to CoreStorage if necessary */ + retval = utf8_normalizestr(vcb->vcbVN, volname_length, (u_int8_t*)converted_volname, &conv_volname_length, 256, UTF_PRECOMPOSED); + if (retval == 0) { + (void) VNOP_IOCTL (hfsmp->hfs_devvp, DKIOCCSSETLVNAME, converted_volname, 0, vfs_context_current()); + } + + /* reset retval == 0. 
we don't care about errors in volname conversion */ + retval = 0; + /* mark the volume dirty (clear clean unmount bit) */ vcb->vcbAtrb &= ~kHFSVolumeUnmountedMask; if (hfsmp->jnl && (hfsmp->hfs_flags & HFS_READ_ONLY) == 0) { @@ -624,6 +715,9 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, // EROFS is a special error code that means the volume has an external // journal which we couldn't find. in that case we do not want to // rewrite the volume header - we'll just refuse to mount the volume. + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: hfs_late_journal_init returned (%d), maybe an external jnl?\n", retval); + } retval = EINVAL; goto ErrorExit; } @@ -663,7 +757,10 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, bp = NULL; } } - + + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: hfs_late_journal_init returned (%d)\n", retval); + } retval = EINVAL; goto ErrorExit; } else if (hfsmp->jnl) { @@ -697,7 +794,7 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, /* * Establish a metadata allocation zone. */ - hfs_metadatazone_init(hfsmp); + hfs_metadatazone_init(hfsmp, false); /* * Make any metadata zone adjustments. @@ -726,8 +823,13 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) { retval = hfs_erase_unused_nodes(hfsmp); - if (retval) + if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: hfs_erase_unused_nodes returned (%d) for %s \n", retval, hfsmp->vcbVN); + } + goto ErrorExit; + } } if ( !(vcb->vcbAtrb & kHFSVolumeHardwareLockMask) ) // if the disk is not write protected @@ -739,30 +841,33 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, * Allow hot file clustering if conditions allow. 
*/ if ((hfsmp->hfs_flags & HFS_METADATA_ZONE) && - ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) && - ((hfsmp->hfs_mp->mnt_kern_flag & MNTK_SSD) == 0)) { + ((hfsmp->hfs_flags & (HFS_READ_ONLY | HFS_SSD)) == 0)) { (void) hfs_recording_init(hfsmp); } /* Force ACLs on HFS+ file systems. */ vfs_setextendedsecurity(HFSTOVFS(hfsmp)); - /* Check if volume supports writing of extent-based extended attributes */ - hfs_check_volxattr(hfsmp, HFS_SET_XATTREXTENTS_STATE); + /* Enable extent-based extended attributes by default */ + hfsmp->hfs_flags |= HFS_XATTR_EXTENTS; + + /* See if this volume should have per-file content protection enabled */ + if (vcb->vcbAtrb & kHFSContentProtectionMask) { + vfs_setflags (hfsmp->hfs_mp, MNT_CPROTECT); + } return (0); ErrorExit: /* - * A fatal error occurred and the volume cannot be mounted - * release any resources that we aquired... + * A fatal error occurred and the volume cannot be mounted, so + * release any resources that we acquired... */ - if (hfsmp->hfs_attribute_vp) - ReleaseMetaFileVNode(hfsmp->hfs_attribute_vp); - ReleaseMetaFileVNode(hfsmp->hfs_allocation_vp); - ReleaseMetaFileVNode(hfsmp->hfs_catalog_vp); - ReleaseMetaFileVNode(hfsmp->hfs_extents_vp); - + hfsUnmount(hfsmp, NULL); + + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: encountered errorr (%d)\n", retval); + } return (retval); } @@ -797,44 +902,47 @@ static void ReleaseMetaFileVNode(struct vnode *vp) * *************************************************************/ -__private_extern__ int hfsUnmount( register struct hfsmount *hfsmp, __unused struct proc *p) { - /* Get rid of our attribute data vnode (if any). */ + /* Get rid of our attribute data vnode (if any). This is done + * after the vflush() during mount, so we don't need to worry + * about any locks. 
+ */ if (hfsmp->hfs_attrdata_vp) { - vnode_t advp = hfsmp->hfs_attrdata_vp; - - if (vnode_get(advp) == 0) { - vnode_rele_ext(advp, O_EVTONLY, 0); - vnode_put(advp); - } + ReleaseMetaFileVNode(hfsmp->hfs_attrdata_vp); hfsmp->hfs_attrdata_vp = NULLVP; } - if (hfsmp->hfs_startup_vp) + if (hfsmp->hfs_startup_vp) { ReleaseMetaFileVNode(hfsmp->hfs_startup_vp); - - if (hfsmp->hfs_allocation_vp) - ReleaseMetaFileVNode(hfsmp->hfs_allocation_vp); - - if (hfsmp->hfs_attribute_vp) + hfsmp->hfs_startup_cp = NULL; + hfsmp->hfs_startup_vp = NULL; + } + + if (hfsmp->hfs_attribute_vp) { ReleaseMetaFileVNode(hfsmp->hfs_attribute_vp); + hfsmp->hfs_attribute_cp = NULL; + hfsmp->hfs_attribute_vp = NULL; + } - ReleaseMetaFileVNode(hfsmp->hfs_catalog_vp); - ReleaseMetaFileVNode(hfsmp->hfs_extents_vp); + if (hfsmp->hfs_catalog_vp) { + ReleaseMetaFileVNode(hfsmp->hfs_catalog_vp); + hfsmp->hfs_catalog_cp = NULL; + hfsmp->hfs_catalog_vp = NULL; + } - /* - * Setting these pointers to NULL so that any references - * past this point will fail, and tell us the point of failure. 
- * Also, facilitates a check in hfs_update for a null catalog - * vp - */ - hfsmp->hfs_allocation_vp = NULL; - hfsmp->hfs_attribute_vp = NULL; - hfsmp->hfs_catalog_vp = NULL; - hfsmp->hfs_extents_vp = NULL; - hfsmp->hfs_startup_vp = NULL; + if (hfsmp->hfs_extents_vp) { + ReleaseMetaFileVNode(hfsmp->hfs_extents_vp); + hfsmp->hfs_extents_cp = NULL; + hfsmp->hfs_extents_vp = NULL; + } + + if (hfsmp->hfs_allocation_vp) { + ReleaseMetaFileVNode(hfsmp->hfs_allocation_vp); + hfsmp->hfs_allocation_cp = NULL; + hfsmp->hfs_allocation_vp = NULL; + } return (0); } @@ -880,11 +988,56 @@ overflow_extents(struct filefork *fp) return (fp->ff_blocks > blocks); } +/* + * Lock the HFS global journal lock + */ +int +hfs_lock_global (struct hfsmount *hfsmp, enum hfslocktype locktype) { + + void *thread = current_thread(); + + if (hfsmp->hfs_global_lockowner == thread) { + panic ("hfs_lock_global: locking against myself!"); + } + + /* HFS_SHARED_LOCK */ + if (locktype == HFS_SHARED_LOCK) { + lck_rw_lock_shared (&hfsmp->hfs_global_lock); + hfsmp->hfs_global_lockowner = HFS_SHARED_OWNER; + } + /* HFS_EXCLUSIVE_LOCK */ + else { + lck_rw_lock_exclusive (&hfsmp->hfs_global_lock); + hfsmp->hfs_global_lockowner = thread; + } + + return 0; +} + + +/* + * Unlock the HFS global journal lock + */ +void +hfs_unlock_global (struct hfsmount *hfsmp) { + + void *thread = current_thread(); + + /* HFS_LOCK_EXCLUSIVE */ + if (hfsmp->hfs_global_lockowner == thread) { + hfsmp->hfs_global_lockowner = NULL; + lck_rw_unlock_exclusive (&hfsmp->hfs_global_lock); + } + /* HFS_LOCK_SHARED */ + else { + lck_rw_unlock_shared (&hfsmp->hfs_global_lock); + } +} + /* * Lock HFS system file(s). 
*/ -__private_extern__ int hfs_systemfile_lock(struct hfsmount *hfsmp, int flags, enum hfslocktype locktype) { @@ -905,7 +1058,12 @@ hfs_systemfile_lock(struct hfsmount *hfsmp, int flags, enum hfslocktype locktype } #endif /* HFS_CHECK_LOCK_ORDER */ - (void) hfs_lock(hfsmp->hfs_catalog_cp, locktype); + if (hfsmp->hfs_catalog_cp) { + (void) hfs_lock(hfsmp->hfs_catalog_cp, locktype); + } else { + flags &= ~SFL_CATALOG; + } + /* * When the catalog file has overflow extents then * also acquire the extents b-tree lock if its not @@ -949,7 +1107,12 @@ hfs_systemfile_lock(struct hfsmount *hfsmp, int flags, enum hfslocktype locktype } #endif /* HFS_CHECK_LOCK_ORDER */ - (void) hfs_lock(hfsmp->hfs_startup_cp, locktype); + if (hfsmp->hfs_startup_cp) { + (void) hfs_lock(hfsmp->hfs_startup_cp, locktype); + } else { + flags &= ~SFL_STARTUP; + } + /* * When the startup file has overflow extents then * also acquire the extents b-tree lock if its not @@ -966,17 +1129,14 @@ hfs_systemfile_lock(struct hfsmount *hfsmp, int flags, enum hfslocktype locktype */ if (flags & (SFL_BITMAP | SFL_EXTENTS)) { /* - * Since the only bitmap operations are clearing and - * setting bits we always need exclusive access. And - * when we have a journal, we can "hide" behind that - * lock since we can only change the bitmap from - * within a transaction. + * If there's no bitmap cnode, ignore the bitmap lock. */ - if (hfsmp->jnl || (hfsmp->hfs_allocation_cp == NULL)) { + if (hfsmp->hfs_allocation_cp == NULL) { flags &= ~SFL_BITMAP; } else { (void) hfs_lock(hfsmp->hfs_allocation_cp, HFS_EXCLUSIVE_LOCK); - /* The bitmap lock is also grabbed when only extent lock + /* + * The bitmap lock is also grabbed when only extent lock * was requested. Set the bitmap lock bit in the lock * flags which callers will use during unlock. */ @@ -988,7 +1148,11 @@ hfs_systemfile_lock(struct hfsmount *hfsmp, int flags, enum hfslocktype locktype * Since the extents btree lock is recursive we always * need exclusive access. 
*/ - (void) hfs_lock(hfsmp->hfs_extents_cp, HFS_EXCLUSIVE_LOCK); + if (hfsmp->hfs_extents_cp) { + (void) hfs_lock(hfsmp->hfs_extents_cp, HFS_EXCLUSIVE_LOCK); + } else { + flags &= ~SFL_EXTENTS; + } } return (flags); } @@ -996,7 +1160,6 @@ hfs_systemfile_lock(struct hfsmount *hfsmp, int flags, enum hfslocktype locktype /* * unlock HFS system file(s). */ -__private_extern__ void hfs_systemfile_unlock(struct hfsmount *hfsmp, int flags) { @@ -1023,7 +1186,7 @@ hfs_systemfile_unlock(struct hfsmount *hfsmp, int flags) } hfs_unlock(hfsmp->hfs_attribute_cp); } - if (flags & SFL_CATALOG) { + if (flags & SFL_CATALOG && hfsmp->hfs_catalog_cp) { if (hfsmp->jnl == NULL) { BTGetLastSync((FCB*)VTOF(hfsmp->hfs_catalog_vp), &lastfsync); numOfLockedBuffs = count_lock_queue(); @@ -1035,10 +1198,10 @@ hfs_systemfile_unlock(struct hfsmount *hfsmp, int flags) } hfs_unlock(hfsmp->hfs_catalog_cp); } - if (flags & SFL_BITMAP) { + if (flags & SFL_BITMAP && hfsmp->hfs_allocation_cp) { hfs_unlock(hfsmp->hfs_allocation_cp); } - if (flags & SFL_EXTENTS) { + if (flags & SFL_EXTENTS && hfsmp->hfs_extents_cp) { if (hfsmp->jnl == NULL) { BTGetLastSync((FCB*)VTOF(hfsmp->hfs_extents_vp), &lastfsync); numOfLockedBuffs = count_lock_queue(); @@ -1168,7 +1331,6 @@ u_int32_t BestBlockSizeFit(u_int32_t allocationBlockSize, } -__private_extern__ u_int32_t GetFileInfo(ExtendedVCB *vcb, __unused u_int32_t dirid, const char *name, struct cat_attr *fattr, struct cat_fork *forkinfo) @@ -1208,7 +1370,6 @@ GetFileInfo(ExtendedVCB *vcb, __unused u_int32_t dirid, const char *name, * If the volume was not cleanly unmounted then some of these may * have persisted and need to be removed. 
*/ -__private_extern__ void hfs_remove_orphans(struct hfsmount * hfsmp) { @@ -1286,8 +1447,9 @@ hfs_remove_orphans(struct hfsmount * hfsmp) */ if (bcmp(tempname, filename, namelen) == 0) { struct filefork dfork; - struct filefork rfork; + struct filefork rfork; struct cnode cnode; + int mode = 0; bzero(&dfork, sizeof(dfork)); bzero(&rfork, sizeof(rfork)); @@ -1344,8 +1506,10 @@ hfs_remove_orphans(struct hfsmount * hfsmp) fsize = 0; } - if (TruncateFileC(vcb, (FCB*)&dfork, fsize, false) != 0) { - printf("hfs: error truncting data fork!\n"); + if (TruncateFileC(vcb, (FCB*)&dfork, fsize, 1, 0, + cnode.c_attr.ca_fileid, false) != 0) { + printf("hfs: error truncating data fork!\n"); + break; } @@ -1376,8 +1540,8 @@ hfs_remove_orphans(struct hfsmount * hfsmp) rfork.ff_cp = &cnode; cnode.c_datafork = NULL; cnode.c_rsrcfork = &rfork; - if (TruncateFileC(vcb, (FCB*)&rfork, 0, false) != 0) { - printf("hfs: error truncting rsrc fork!\n"); + if (TruncateFileC(vcb, (FCB*)&rfork, 0, 1, 1, cnode.c_attr.ca_fileid, false) != 0) { + printf("hfs: error truncating rsrc fork!\n"); break; } } @@ -1391,7 +1555,9 @@ hfs_remove_orphans(struct hfsmount * hfsmp) break; } - if (cnode.c_attr.ca_mode & S_IFDIR) { + mode = cnode.c_attr.ca_mode & S_IFMT; + + if (mode == S_IFDIR) { orphaned_dirs++; } else { @@ -1400,7 +1566,7 @@ hfs_remove_orphans(struct hfsmount * hfsmp) /* Update parent and volume counts */ hfsmp->hfs_private_attr[FILE_HARDLINKS].ca_entries--; - if (cnode.c_attr.ca_mode & S_IFDIR) { + if (mode == S_IFDIR) { DEC_FOLDERCOUNT(hfsmp, hfsmp->hfs_private_attr[FILE_HARDLINKS]); } @@ -1416,7 +1582,7 @@ hfs_remove_orphans(struct hfsmount * hfsmp) Now that Catalog is unlocked, update the volume info, making sure to differentiate between files and directories */ - if (cnode.c_attr.ca_mode & S_IFDIR) { + if (mode == S_IFDIR) { hfs_volupdate(hfsmp, VOL_RMDIR, 0); } else{ @@ -1489,7 +1655,6 @@ u_int32_t logBlockSize; return logBlockSize; } -__private_extern__ u_int32_t hfs_freeblks(struct 
hfsmount * hfsmp, int wantreserve) { @@ -1517,7 +1682,7 @@ hfs_freeblks(struct hfsmount * hfsmp, int wantreserve) else freeblks = 0; -#ifdef HFS_SPARSE_DEV +#if HFS_SPARSE_DEV /* * When the underlying device is sparse, check the * available space on the backing store volume. @@ -1841,7 +2006,7 @@ journal_open_cb(const char *bsd_dev_name, const char *uuid_str, void *arg) // desired uuid so let's try to open the device for writing and // see if it works. if it does, we'll use it. - NDINIT(&nd, LOOKUP, LOCKLEAF, UIO_SYSSPACE32, CAST_USER_ADDR_T(bsd_name), vfs_context_kernel()); + NDINIT(&nd, LOOKUP, OP_LOOKUP, LOCKLEAF, UIO_SYSSPACE32, CAST_USER_ADDR_T(bsd_name), vfs_context_kernel()); if ((error = namei(&nd))) { printf("hfs: journal open cb: error %d looking up device %s (dev uuid %s)\n", error, bsd_name, uuid_str); return 1; // keep iterating @@ -1888,7 +2053,6 @@ journal_open_cb(const char *bsd_dev_name, const char *uuid_str, void *arg) extern dev_t IOBSDGetMediaWithUUID(const char *uuid_cstring, char *bsd_name, int bsd_name_len, int timeout); extern void IOBSDIterateMediaWithContent(const char *uuid_cstring, int (*func)(const char *bsd_dev_name, const char *uuid_str, void *arg), void *arg); -extern kern_return_t IOBSDGetPlatformUUID(__darwin_uuid_t uuid, mach_timespec_t timeoutp); kern_return_t IOBSDGetPlatformSerialNumber(char *serial_number_str, u_int32_t len); @@ -1940,7 +2104,6 @@ open_journal_dev(const char *vol_device, } -__private_extern__ int hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void *_args, off_t embeddedOffset, daddr64_t mdb_offset, @@ -2063,6 +2226,8 @@ hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, arg_flags, arg_tbufsz, hfs_sync_metadata, hfsmp->hfs_mp); + if (hfsmp->jnl) + journal_trim_set_callback(hfsmp->jnl, hfs_trim_callback, hfsmp); // no need to start a transaction here... if this were to fail // we'd just re-init it on the next mount. 
@@ -2084,6 +2249,8 @@ hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, arg_flags, arg_tbufsz, hfs_sync_metadata, hfsmp->hfs_mp); + if (hfsmp->jnl) + journal_trim_set_callback(hfsmp->jnl, hfs_trim_callback, hfsmp); if (write_jibp) { buf_bwrite(jinfo_bp); @@ -2323,6 +2490,8 @@ hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void *_a arg_flags, arg_tbufsz, hfs_sync_metadata, hfsmp->hfs_mp); + if (hfsmp->jnl) + journal_trim_set_callback(hfsmp->jnl, hfs_trim_callback, hfsmp); // no need to start a transaction here... if this were to fail // we'd just re-init it on the next mount. @@ -2352,6 +2521,8 @@ hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void *_a arg_flags, arg_tbufsz, hfs_sync_metadata, hfsmp->hfs_mp); + if (hfsmp->jnl) + journal_trim_set_callback(hfsmp->jnl, hfs_trim_callback, hfsmp); } @@ -2408,8 +2579,15 @@ hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void *_a #define HOTBAND_MINIMUM_SIZE (10*1024*1024) #define HOTBAND_MAXIMUM_SIZE (512*1024*1024) +/* Initialize the metadata zone. + * + * If the size of the volume is less than the minimum size for + * metadata zone, metadata zone is disabled. + * + * If disable is true, disable metadata zone unconditionally. + */ void -hfs_metadatazone_init(struct hfsmount *hfsmp) +hfs_metadatazone_init(struct hfsmount *hfsmp, int disable) { ExtendedVCB *vcb; u_int64_t fs_size; @@ -2436,6 +2614,11 @@ hfs_metadatazone_init(struct hfsmount *hfsmp) really_do_it = 0; } + /* If caller wants to disable metadata zone, do it */ + if (disable == true) { + really_do_it = 0; + } + /* * Start with space for the boot blocks and Volume Header. * 1536 = byte offset from start of volume to end of volume header: @@ -2626,7 +2809,6 @@ hfs_hotfile_freeblocks(struct hfsmount *hfsmp) * Determine if a file is a "virtual" metadata file. * This includes journal and quota files. 
*/ -__private_extern__ int hfs_virtualmetafile(struct cnode *cp) { @@ -2698,7 +2880,6 @@ hfs_sync_ejectable(struct hfsmount *hfsmp) } -__private_extern__ int hfs_start_transaction(struct hfsmount *hfsmp) { @@ -2723,11 +2904,11 @@ hfs_start_transaction(struct hfsmount *hfsmp) } #endif /* HFS_CHECK_LOCK_ORDER */ - if (hfsmp->jnl == NULL || journal_owner(hfsmp->jnl) != thread) { - lck_rw_lock_shared(&hfsmp->hfs_global_lock); - OSAddAtomic(1, (SInt32 *)&hfsmp->hfs_active_threads); - unlock_on_err = 1; - } + if (hfsmp->jnl == NULL || journal_owner(hfsmp->jnl) != thread) { + hfs_lock_global (hfsmp, HFS_SHARED_LOCK); + OSAddAtomic(1, (SInt32 *)&hfsmp->hfs_active_threads); + unlock_on_err = 1; + } /* If a downgrade to read-only mount is in progress, no other * process than the downgrade process is allowed to modify @@ -2739,67 +2920,89 @@ hfs_start_transaction(struct hfsmount *hfsmp) goto out; } - if (hfsmp->jnl) { - ret = journal_start_transaction(hfsmp->jnl); - if (ret == 0) { - OSAddAtomic(1, &hfsmp->hfs_global_lock_nesting); + if (hfsmp->jnl) { + ret = journal_start_transaction(hfsmp->jnl); + if (ret == 0) { + OSAddAtomic(1, &hfsmp->hfs_global_lock_nesting); + } + } else { + ret = 0; } - } else { - ret = 0; - } out: - if (ret != 0 && unlock_on_err) { - lck_rw_unlock_shared(&hfsmp->hfs_global_lock); - OSAddAtomic(-1, (SInt32 *)&hfsmp->hfs_active_threads); - } + if (ret != 0 && unlock_on_err) { + hfs_unlock_global (hfsmp); + OSAddAtomic(-1, (SInt32 *)&hfsmp->hfs_active_threads); + } return ret; } -__private_extern__ int hfs_end_transaction(struct hfsmount *hfsmp) { int need_unlock=0, ret; - if ( hfsmp->jnl == NULL - || ( journal_owner(hfsmp->jnl) == current_thread() + if ((hfsmp->jnl == NULL) || ( journal_owner(hfsmp->jnl) == current_thread() && (OSAddAtomic(-1, &hfsmp->hfs_global_lock_nesting) == 1)) ) { - need_unlock = 1; } - if (hfsmp->jnl) { - ret = journal_end_transaction(hfsmp->jnl); - } else { - ret = 0; - } + if (hfsmp->jnl) { + ret = 
journal_end_transaction(hfsmp->jnl); + } else { + ret = 0; + } - if (need_unlock) { - OSAddAtomic(-1, (SInt32 *)&hfsmp->hfs_active_threads); - lck_rw_unlock_shared(&hfsmp->hfs_global_lock); - hfs_sync_ejectable(hfsmp); - } + if (need_unlock) { + OSAddAtomic(-1, (SInt32 *)&hfsmp->hfs_active_threads); + hfs_unlock_global (hfsmp); + hfs_sync_ejectable(hfsmp); + } return ret; } -__private_extern__ +/* + * Flush the contents of the journal to the disk. + * + * Input: + * wait_for_IO - + * If TRUE, wait to write in-memory journal to the disk + * consistently, and also wait to write all asynchronous + * metadata blocks to its corresponding locations + * consistently on the disk. This means that the journal + * is empty at this point and does not contain any + * transactions. This is overkill in normal scenarios + * but is useful whenever the metadata blocks are required + * to be consistent on-disk instead of just the journal + * being consistent; like before live verification + * and live volume resizing. + * + * If FALSE, only wait to write in-memory journal to the + * disk consistently. This means that the journal still + * contains uncommitted transactions and the file system + * metadata blocks in the journal transactions might be + * written asynchronously to the disk. But there is no + * guarantee that they are written to the disk before + * returning to the caller. Note that this option is + * sufficient for file system data integrity as it + * guarantees consistent journal content on the disk. 
+ */ int -hfs_journal_flush(struct hfsmount *hfsmp) +hfs_journal_flush(struct hfsmount *hfsmp, boolean_t wait_for_IO) { int ret; - + /* Only peek at hfsmp->jnl while holding the global lock */ - lck_rw_lock_shared(&hfsmp->hfs_global_lock); + hfs_lock_global (hfsmp, HFS_SHARED_LOCK); if (hfsmp->jnl) { - ret = journal_flush(hfsmp->jnl); + ret = journal_flush(hfsmp->jnl, wait_for_IO); } else { ret = 0; } - lck_rw_unlock_shared(&hfsmp->hfs_global_lock); + hfs_unlock_global (hfsmp); return ret; } @@ -2824,7 +3027,6 @@ hfs_journal_flush(struct hfsmount *hfsmp) * unused nodes have been repaired. A newer newfs_hfs will set this bit. * As will fsck_hfs when it repairs the unused nodes. */ -__private_extern__ int hfs_erase_unused_nodes(struct hfsmount *hfsmp) { int result; @@ -2877,3 +3079,92 @@ int hfs_erase_unused_nodes(struct hfsmount *hfsmp) done: return result; } + + +extern time_t snapshot_timestamp; + +int +check_for_tracked_file(struct vnode *vp, time_t ctime, uint64_t op_type, void *arg) +{ + int tracked_error = 0, snapshot_error = 0; + + if (vp == NULL) { + return 0; + } + + if (VTOC(vp)->c_flags & UF_TRACKED) { + // the file has the tracked bit set, so send an event to the tracked-file handler + int error; + + // printf("hfs: tracked-file: encountered a file with the tracked bit set! 
(vp %p)\n", vp); + error = resolve_nspace_item(vp, op_type | NAMESPACE_HANDLER_TRACK_EVENT); + if (error) { + if (error == EAGAIN) { + printf("hfs: tracked-file: timed out waiting for namespace handler...\n"); + + } else if (error == EINTR) { + // printf("hfs: tracked-file: got a signal while waiting for namespace handler...\n"); + tracked_error = EINTR; + } + } + } + + if (ctime != 0 && snapshot_timestamp != 0 && (ctime <= snapshot_timestamp || vnode_needssnapshots(vp))) { + // the change time is within this epoch + int error; + + error = resolve_nspace_item_ext(vp, op_type | NAMESPACE_HANDLER_SNAPSHOT_EVENT, arg); + if (error == EDEADLK) { + snapshot_error = 0; + } else if (error) { + if (error == EAGAIN) { + printf("hfs: cow-snapshot: timed out waiting for namespace handler...\n"); + } else if (error == EINTR) { + // printf("hfs: cow-snapshot: got a signal while waiting for namespace handler...\n"); + snapshot_error = EINTR; + } + } + } + + if (tracked_error) return tracked_error; + if (snapshot_error) return snapshot_error; + + return 0; +} + +int +check_for_dataless_file(struct vnode *vp, uint64_t op_type) +{ + int error; + + if (vp == NULL || (VTOC(vp)->c_flags & UF_COMPRESSED) == 0 || VTOCMP(vp) == NULL || VTOCMP(vp)->cmp_type != DATALESS_CMPFS_TYPE) { + // there's nothing to do, it's not dataless + return 0; + } + + // printf("hfs: dataless: encountered a file with the dataless bit set! (vp %p)\n", vp); + error = resolve_nspace_item(vp, op_type | NAMESPACE_HANDLER_NSPACE_EVENT); + if (error == EDEADLK && op_type == NAMESPACE_HANDLER_WRITE_OP) { + error = 0; + } else if (error) { + if (error == EAGAIN) { + printf("hfs: dataless: timed out waiting for namespace handler...\n"); + // XXXdbg - return the fabled ENOTPRESENT (i.e. EJUKEBOX)? 
+ return 0; + } else if (error == EINTR) { + // printf("hfs: dataless: got a signal while waiting for namespace handler...\n"); + return EINTR; + } + } else if (VTOC(vp)->c_flags & UF_COMPRESSED) { + // + // if we're here, the dataless bit is still set on the file + // which means it didn't get handled. we return an error + // but it's presently ignored by all callers of this function. + // + // XXXdbg - EDATANOTPRESENT is what we really need... + // + return EBADF; + } + + return error; +} diff --git a/bsd/hfs/hfs_vnops.c b/bsd/hfs/hfs_vnops.c index 49973d29c..4c526f77b 100644 --- a/bsd/hfs/hfs_vnops.c +++ b/bsd/hfs/hfs_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -45,6 +45,10 @@ #include #include #include +#include +#include + +#include #include #include @@ -72,20 +76,23 @@ /* Always F_FULLFSYNC? 1=yes,0=no (default due to "various" reasons is 'no') */ int always_do_fullfsync = 0; SYSCTL_DECL(_vfs_generic); -SYSCTL_INT (_vfs_generic, OID_AUTO, always_do_fullfsync, CTLFLAG_RW, &always_do_fullfsync, 0, "always F_FULLFSYNC when fsync is called"); +SYSCTL_INT (_vfs_generic, OID_AUTO, always_do_fullfsync, CTLFLAG_RW | CTLFLAG_LOCKED, &always_do_fullfsync, 0, "always F_FULLFSYNC when fsync is called"); -static int hfs_makenode(struct vnode *dvp, struct vnode **vpp, +int hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct vnode_attr *vap, vfs_context_t ctx); +int hfs_metasync(struct hfsmount *hfsmp, daddr64_t node, __unused struct proc *p); +int hfs_metasync_all(struct hfsmount *hfsmp); -static int hfs_metasync(struct hfsmount *hfsmp, daddr64_t node, __unused struct proc *p); -static int hfs_metasync_all(struct hfsmount *hfsmp); +int hfs_removedir(struct vnode *, struct vnode *, struct componentname *, + int, int); +int hfs_removefile(struct vnode *, struct vnode *, struct componentname 
*, + int, int, int, struct vnode *, int); -static int hfs_removedir(struct vnode *, struct vnode *, struct componentname *, - int); +int hfs_movedata (struct vnode *, struct vnode*); +static int hfs_move_fork (struct filefork *srcfork, struct cnode *src, + struct filefork *dstfork, struct cnode *dst); -static int hfs_removefile(struct vnode *, struct vnode *, struct componentname *, - int, int, int, struct vnode *); #if FIFO static int hfsfifo_read(struct vnop_read_args *); @@ -95,26 +102,27 @@ static int hfsfifo_close(struct vnop_close_args *); extern int (**fifo_vnodeop_p)(void *); #endif /* FIFO */ -static int hfs_vnop_close(struct vnop_close_args*); -static int hfs_vnop_create(struct vnop_create_args*); -static int hfs_vnop_exchange(struct vnop_exchange_args*); -static int hfs_vnop_fsync(struct vnop_fsync_args*); -static int hfs_vnop_mkdir(struct vnop_mkdir_args*); -static int hfs_vnop_mknod(struct vnop_mknod_args*); -static int hfs_vnop_getattr(struct vnop_getattr_args*); -static int hfs_vnop_open(struct vnop_open_args*); -static int hfs_vnop_readdir(struct vnop_readdir_args*); -static int hfs_vnop_remove(struct vnop_remove_args*); -static int hfs_vnop_rename(struct vnop_rename_args*); -static int hfs_vnop_rmdir(struct vnop_rmdir_args*); -static int hfs_vnop_symlink(struct vnop_symlink_args*); -static int hfs_vnop_setattr(struct vnop_setattr_args*); -static int hfs_vnop_readlink(struct vnop_readlink_args *); -static int hfs_vnop_pathconf(struct vnop_pathconf_args *); -static int hfs_vnop_whiteout(struct vnop_whiteout_args *); -static int hfsspec_read(struct vnop_read_args *); -static int hfsspec_write(struct vnop_write_args *); -static int hfsspec_close(struct vnop_close_args *); +int hfs_vnop_close(struct vnop_close_args*); +int hfs_vnop_create(struct vnop_create_args*); +int hfs_vnop_exchange(struct vnop_exchange_args*); +int hfs_vnop_fsync(struct vnop_fsync_args*); +int hfs_vnop_mkdir(struct vnop_mkdir_args*); +int hfs_vnop_mknod(struct vnop_mknod_args*); 
+int hfs_vnop_getattr(struct vnop_getattr_args*); +int hfs_vnop_open(struct vnop_open_args*); +int hfs_vnop_readdir(struct vnop_readdir_args*); +int hfs_vnop_remove(struct vnop_remove_args*); +int hfs_vnop_rename(struct vnop_rename_args*); +int hfs_vnop_rmdir(struct vnop_rmdir_args*); +int hfs_vnop_symlink(struct vnop_symlink_args*); +int hfs_vnop_setattr(struct vnop_setattr_args*); +int hfs_vnop_readlink(struct vnop_readlink_args *); +int hfs_vnop_pathconf(struct vnop_pathconf_args *); +int hfs_vnop_whiteout(struct vnop_whiteout_args *); +int hfs_vnop_mmap(struct vnop_mmap_args *ap); +int hfsspec_read(struct vnop_read_args *); +int hfsspec_write(struct vnop_write_args *); +int hfsspec_close(struct vnop_close_args *); /* Options for hfs_removedir and hfs_removefile */ #define HFSRM_SKIP_RESERVE 0x01 @@ -131,7 +139,7 @@ static int hfsspec_close(struct vnop_close_args *); /* * Create a regular file. */ -static int +int hfs_vnop_create(struct vnop_create_args *ap) { int error; @@ -164,6 +172,7 @@ hfs_vnop_create(struct vnop_create_args *ap) /* Make sure it was file. */ if ((error == 0) && !vnode_isreg(*args.a_vpp)) { vnode_put(*args.a_vpp); + *args.a_vpp = NULLVP; error = EEXIST; } args.a_cnp->cn_nameiop = CREATE; @@ -174,7 +183,7 @@ hfs_vnop_create(struct vnop_create_args *ap) /* * Make device special file. */ -static int +int hfs_vnop_mknod(struct vnop_mknod_args *ap) { struct vnode_attr *vap = ap->a_vap; @@ -245,7 +254,7 @@ hfs_ref_data_vp(struct cnode *cp, struct vnode **data_vp, int skiplock) return EINVAL; } - if (0 == hfs_vget(VTOHFS(cp->c_rsrc_vp), cp->c_cnid, data_vp, 1) && + if (0 == hfs_vget(VTOHFS(cp->c_rsrc_vp), cp->c_cnid, data_vp, 1, 0) && 0 != data_vp) { vref = vnode_ref(*data_vp); vnode_put(*data_vp); @@ -334,6 +343,8 @@ hfs_file_is_compressed(struct cnode *cp, int skiplock) * if the caller has passed a valid vnode (has a ref count > 0), then hfsmp and fid are not required. 
* if the caller doesn't have a vnode, pass NULL in vp, and pass valid hfsmp and fid. * files size is returned in size (required) + * if the indicated file is a directory (or something that doesn't have a data fork), then this call + * will return an error and the caller should fall back to treating the item as an uncompressed file */ int hfs_uncompressed_size_of_compressed_file(struct hfsmount *hfsmp, struct vnode *vp, cnid_t fid, off_t *size, int skiplock) @@ -349,7 +360,7 @@ hfs_uncompressed_size_of_compressed_file(struct hfsmount *hfsmp, struct vnode *v if (!hfsmp || !fid) { /* make sure we have the required parameters */ return EINVAL; } - if (0 != hfs_vget(hfsmp, fid, &vp, skiplock)) { /* vnode is null, use hfs_vget() to get it */ + if (0 != hfs_vget(hfsmp, fid, &vp, skiplock, 0)) { /* vnode is null, use hfs_vget() to get it */ vp = NULL; } else { putaway = 1; /* note that hfs_vget() was used to aquire the vnode */ @@ -359,10 +370,27 @@ hfs_uncompressed_size_of_compressed_file(struct hfsmount *hfsmp, struct vnode *v * ensures the cached size is present in case decmpfs hasn't * encountered this node yet. 
*/ - if ( ( NULL != vp ) && hfs_file_is_compressed(VTOC(vp), skiplock) ) { - *size = decmpfs_cnode_get_vnode_cached_size(VTOCMP(vp)); /* file info will be cached now, so get size */ - } else { - ret = EINVAL; + if (vp) { + if (hfs_file_is_compressed(VTOC(vp), skiplock) ) { + *size = decmpfs_cnode_get_vnode_cached_size(VTOCMP(vp)); /* file info will be cached now, so get size */ + } else { + if (VTOCMP(vp) && VTOCMP(vp)->cmp_type >= CMP_MAX) { + if (VTOCMP(vp)->cmp_type != DATALESS_CMPFS_TYPE) { + // if we don't recognize this type, just use the real data fork size + if (VTOC(vp)->c_datafork) { + *size = VTOC(vp)->c_datafork->ff_size; + ret = 0; + } else { + ret = EINVAL; + } + } else { + *size = decmpfs_cnode_get_vnode_cached_size(VTOCMP(vp)); /* file info will be cached now, so get size */ + ret = 0; + } + } else { + ret = EINVAL; + } + } } if (putaway) { /* did we use hfs_vget() to get this vnode? */ @@ -396,7 +424,7 @@ hfs_hides_xattr(vfs_context_t ctx, struct cnode *cp, const char *name, int skipl /* * Open a file/directory. */ -static int +int hfs_vnop_open(struct vnop_open_args *ap) { struct vnode *vp = ap->a_vp; @@ -516,7 +544,7 @@ hfs_vnop_open(struct vnop_open_args *ap) /* * Close a file/directory. 
*/ -static int +int hfs_vnop_close(ap) struct vnop_close_args /* { struct vnode *a_vp; @@ -559,11 +587,11 @@ hfs_vnop_close(ap) // release cnode lock; must acquire truncate lock BEFORE cnode lock hfs_unlock(cp); - hfs_lock_truncate(cp, TRUE); + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK); tooktrunclock = 1; if (hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK) != 0) { - hfs_unlock_truncate(cp, TRUE); + hfs_unlock_truncate(cp, 0); // bail out if we can't re-acquire cnode lock return 0; } @@ -585,8 +613,8 @@ hfs_vnop_close(ap) // if we froze the fs and we're exiting, then "thaw" the fs if (hfsmp->hfs_freezing_proc == p && proc_exiting(p)) { hfsmp->hfs_freezing_proc = NULL; - hfs_global_exclusive_lock_release(hfsmp); - lck_rw_unlock_exclusive(&hfsmp->hfs_insync); + hfs_unlock_global (hfsmp); + lck_rw_unlock_exclusive(&hfsmp->hfs_insync); } busy = vnode_isinuse(vp, 1); @@ -601,7 +629,7 @@ hfs_vnop_close(ap) } if (tooktrunclock){ - hfs_unlock_truncate(cp, TRUE); + hfs_unlock_truncate(cp, 0); } hfs_unlock(cp); @@ -615,7 +643,7 @@ hfs_vnop_close(ap) /* * Get basic attributes. 
*/ -static int +int hfs_vnop_getattr(struct vnop_getattr_args *ap) { #define VNODE_ATTR_TIMES \ @@ -648,10 +676,16 @@ hfs_vnop_getattr(struct vnop_getattr_args *ap) /* if it's a data fork, we need to know if it was compressed so we can report the uncompressed size */ compressed = hfs_file_is_compressed(cp, 0); } - if (compressed && (VATTR_IS_ACTIVE(vap, va_data_size) || VATTR_IS_ACTIVE(vap, va_total_size))) { - if (0 != hfs_uncompressed_size_of_compressed_file(NULL, vp, 0, &uncompressed_size, 0)) { - /* failed to get the uncompressed size, we'll check for this later */ - uncompressed_size = -1; + if ((VATTR_IS_ACTIVE(vap, va_data_size) || VATTR_IS_ACTIVE(vap, va_total_size))) { + // if it's compressed + if (compressed || (!VNODE_IS_RSRC(vp) && cp->c_decmp && cp->c_decmp->cmp_type >= CMP_MAX)) { + if (0 != hfs_uncompressed_size_of_compressed_file(NULL, vp, 0, &uncompressed_size, 0)) { + /* failed to get the uncompressed size, we'll check for this later */ + uncompressed_size = -1; + } else { + // fake that it's compressed + compressed = 1; + } } } } @@ -812,13 +846,17 @@ hfs_vnop_getattr(struct vnop_getattr_args *ap) if (cp->c_blocks - VTOF(vp)->ff_blocks) { /* We deal with rsrc fork vnode iocount at the end of the function */ - error = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE, TRUE); + error = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE, FALSE); if (error) { - /* - * hfs_vgetrsrc may have returned a vnode in rvp even though - * we got an error, because we specified error_on_unlinked. - * We need to drop the iocount after we release the cnode lock, so - * it will be taken care of at the end of the function if it's needed. + /* + * Note that we call hfs_vgetrsrc with error_on_unlinked + * set to FALSE. This is because we may be invoked via + * fstat() on an open-unlinked file descriptor and we must + * continue to support access to the rsrc fork until it disappears. 
+ * The code at the end of this function will be + * responsible for releasing the iocount generated by + * hfs_vgetrsrc. This is because we can't drop the iocount + * without unlocking the cnode first. */ goto out; } @@ -876,6 +914,17 @@ hfs_vnop_getattr(struct vnop_getattr_args *ap) vap->va_backup_time.tv_sec = cp->c_btime; vap->va_backup_time.tv_nsec = 0; + /* See if we need to emit the date added field to the user */ + if (VATTR_IS_ACTIVE(vap, va_addedtime)) { + u_int32_t dateadded = hfs_get_dateadded (cp); + if (dateadded) { + vap->va_addedtime.tv_sec = dateadded; + vap->va_addedtime.tv_nsec = 0; + VATTR_SET_SUPPORTED (vap, va_addedtime); + } + } + + /* XXX is this really a good 'optimal I/O size'? */ vap->va_iosize = hfsmp->hfs_logBlockSize; vap->va_uid = cp->c_uid; @@ -972,7 +1021,7 @@ hfs_vnop_getattr(struct vnop_getattr_args *ap) * have an open-unlinked file. Go to the next link in this case. */ if ((cp->c_desc.cd_namelen == 0) && (vap->va_linkid == cp->c_fileid)) { - if ((error = hfs_lookuplink(hfsmp, vap->va_linkid, &prevlinkid, &nextlinkid))){ + if ((error = hfs_lookup_siblinglinks(hfsmp, vap->va_linkid, &prevlinkid, &nextlinkid))){ goto out; } } @@ -1029,7 +1078,7 @@ hfs_vnop_getattr(struct vnop_getattr_args *ap) return (error); } -static int +int hfs_vnop_setattr(ap) struct vnop_setattr_args /* { struct vnode *a_vp; @@ -1046,7 +1095,10 @@ hfs_vnop_setattr(ap) int error = 0; uid_t nuid; gid_t ngid; + time_t orig_ctime; + orig_ctime = VTOC(vp)->c_ctime; + #if HFS_COMPRESSION int decmpfs_reset_state = 0; /* @@ -1056,8 +1108,23 @@ hfs_vnop_setattr(ap) error = decmpfs_update_attributes(vp, vap); if (error) return error; + + // + // if this is not a size-changing setattr and it is not just + // an atime update, then check for a snapshot. 
+ // + if (!VATTR_IS_ACTIVE(vap, va_data_size) && !(vap->va_active == VNODE_ATTR_va_access_time)) { + check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_METADATA_MOD, NULL); + } #endif + +#if CONFIG_PROTECT + if ((error = cp_handle_vnop(VTOC(vp), CP_WRITE_ACCESS)) != 0) { + return (error); + } +#endif /* CONFIG_PROTECT */ + hfsmp = VTOHFS(vp); /* Don't allow modification of the journal file. */ @@ -1090,6 +1157,8 @@ hfs_vnop_setattr(ap) } } + check_for_tracked_file(vp, orig_ctime, vap->va_data_size == 0 ? NAMESPACE_HANDLER_TRUNCATE_OP|NAMESPACE_HANDLER_DELETE_OP : NAMESPACE_HANDLER_TRUNCATE_OP, NULL); + decmpfs_lock_compressed_data(dp, 1); if (hfs_file_is_compressed(VTOC(vp), 1)) { error = decmpfs_decompress_file(vp, dp, -1/*vap->va_data_size*/, 0, 1); @@ -1101,13 +1170,13 @@ hfs_vnop_setattr(ap) #endif /* Take truncate lock before taking cnode lock. */ - hfs_lock_truncate(VTOC(vp), TRUE); + hfs_lock_truncate(VTOC(vp), HFS_EXCLUSIVE_LOCK); /* Perform the ubc_setsize before taking the cnode lock. */ ubc_setsize(vp, vap->va_data_size); if ((error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) { - hfs_unlock_truncate(VTOC(vp), TRUE); + hfs_unlock_truncate(VTOC(vp), 0); #if HFS_COMPRESSION decmpfs_unlock_compressed_data(dp, 1); #endif @@ -1117,7 +1186,7 @@ hfs_vnop_setattr(ap) error = hfs_truncate(vp, vap->va_data_size, vap->va_vaflags & 0xffff, 1, 0, ap->a_context); - hfs_unlock_truncate(cp, TRUE); + hfs_unlock_truncate(cp, 0); #if HFS_COMPRESSION decmpfs_unlock_compressed_data(dp, 1); #endif @@ -1297,7 +1366,6 @@ hfs_vnop_setattr(ap) * Change the mode on a file. * cnode must be locked before calling. 
*/ -__private_extern__ int hfs_chmod(struct vnode *vp, int mode, __unused kauth_cred_t cred, __unused struct proc *p) { @@ -1328,7 +1396,6 @@ hfs_chmod(struct vnode *vp, int mode, __unused kauth_cred_t cred, __unused struc } -__private_extern__ int hfs_write_access(struct vnode *vp, kauth_cred_t cred, struct proc *p, Boolean considerFlags) { @@ -1378,7 +1445,6 @@ hfs_write_access(struct vnode *vp, kauth_cred_t cred, struct proc *p, Boolean co * Perform chown operation on cnode cp; * code must be locked prior to call. */ -__private_extern__ int #if !QUOTA hfs_chown(struct vnode *vp, uid_t uid, gid_t gid, __unused kauth_cred_t cred, @@ -1512,7 +1578,7 @@ hfs_chown(struct vnode *vp, uid_t uid, gid_t gid, kauth_cred_t cred, * case the file is being tracked through its file ID. Typically * its used after creating a new file during a safe-save. */ -static int +int hfs_vnop_exchange(ap) struct vnop_exchange_args /* { struct vnode *a_fvp; @@ -1538,6 +1604,7 @@ hfs_vnop_exchange(ap) int lockflags; int error = 0, started_tr = 0, got_cookie = 0; cat_cookie_t cookie; + time_t orig_from_ctime, orig_to_ctime; /* The files must be on the same volume. */ if (vnode_mount(from_vp) != vnode_mount(to_vp)) @@ -1546,6 +1613,9 @@ hfs_vnop_exchange(ap) if (from_vp == to_vp) return (EINVAL); + orig_from_ctime = VTOC(from_vp)->c_ctime; + orig_to_ctime = VTOC(to_vp)->c_ctime; + #if HFS_COMPRESSION if ( hfs_file_is_compressed(VTOC(from_vp), 0) ) { if ( 0 != ( error = decmpfs_decompress_file(from_vp, VTOCMP(from_vp), -1, 0, 1) ) ) { @@ -1560,6 +1630,50 @@ hfs_vnop_exchange(ap) } #endif // HFS_COMPRESSION + /* + * Normally, we want to notify the user handlers about the event, + * except if it's a handler driving the event. + */ + if ((ap->a_options & FSOPT_EXCHANGE_DATA_ONLY) == 0) { + check_for_tracked_file(from_vp, orig_from_ctime, NAMESPACE_HANDLER_WRITE_OP, NULL); + check_for_tracked_file(to_vp, orig_to_ctime, NAMESPACE_HANDLER_WRITE_OP, NULL); + } + else { + /* + * We're doing a data-swap. 
+ * Take the truncate lock/cnode lock, then verify there are no mmap references. + * Issue a hfs_filedone to flush out all of the remaining state for this file. + * Allow the rest of the codeflow to re-acquire the cnode locks in order. + */ + + hfs_lock_truncate (VTOC(from_vp), HFS_SHARED_LOCK); + + if ((error = hfs_lock(VTOC(from_vp), HFS_EXCLUSIVE_LOCK))) { + hfs_unlock_truncate (VTOC(from_vp), 0); + return error; + } + + /* Verify the source file is not in use by anyone besides us (including mmap refs) */ + if (vnode_isinuse(from_vp, 1)) { + error = EBUSY; + hfs_unlock(VTOC(from_vp)); + hfs_unlock_truncate (VTOC(from_vp), 0); + return error; + } + + /* Flush out the data in the source file */ + VTOC(from_vp)->c_flag |= C_SWAPINPROGRESS; + error = hfs_filedone (from_vp, ap->a_context); + VTOC(from_vp)->c_flag &= ~C_SWAPINPROGRESS; + hfs_unlock(VTOC(from_vp)); + hfs_unlock_truncate(VTOC(from_vp), 0); + + if (error) { + return error; + } + } + + if ((error = hfs_lockpair(VTOC(from_vp), VTOC(to_vp), HFS_EXCLUSIVE_LOCK))) return (error); @@ -1595,6 +1709,16 @@ hfs_vnop_exchange(ap) } } + /* + * Ok, now that all of the pre-flighting is done, call the underlying + * function if needed. 
+ */ + if (ap->a_options & FSOPT_EXCHANGE_DATA_ONLY) { + error = hfs_movedata(from_vp, to_vp); + goto exit; + } + + if ((error = hfs_start_transaction(hfsmp)) != 0) { goto exit; } @@ -1729,11 +1853,338 @@ hfs_vnop_exchange(ap) return (error); } +int +hfs_vnop_mmap(struct vnop_mmap_args *ap) +{ + struct vnode *vp = ap->a_vp; + int error; + + if (VNODE_IS_RSRC(vp)) { + /* allow pageins of the resource fork */ + } else { + int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */ + time_t orig_ctime = VTOC(vp)->c_ctime; + + if (!compressed && (VTOC(vp)->c_flags & UF_COMPRESSED)) { + error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP); + if (error != 0) { + return error; + } + } + + if (ap->a_fflags & PROT_WRITE) { + check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_WRITE_OP, NULL); + } + } + + // + // NOTE: we return ENOTSUP because we want the cluster layer + // to actually do all the real work. + // + return (ENOTSUP); +} + +/* + * hfs_movedata + * + * This is a non-symmetric variant of exchangedata. In this function, + * the contents of the fork in from_vp are moved to the fork + * specified by to_vp. + * + * The cnodes pointed to by 'from_vp' and 'to_vp' must be locked. + * + * The vnode pointed to by 'to_vp' *must* be empty prior to invoking this function. + * We impose this restriction because we may not be able to fully delete the entire + * file's contents in a single transaction, particularly if it has a lot of extents. + * In the normal file deletion codepath, the file is screened for two conditions: + * 1) bigger than 400MB, and 2) more than 8 extents. If so, the file is relocated to + * the hidden directory and the deletion is broken up into multiple truncates. We can't + * do that here because both files need to exist in the namespace. The main reason this + * is imposed is that we may have to touch a whole lot of bitmap blocks if there are + * many extents. 
+ * + * Any data written to 'from_vp' after this call completes is not guaranteed + * to be moved. + * + * Arguments: + * vnode from_vp: source file + * vnode to_vp: destination file; must be empty + * + * Returns: + * EFBIG - Destination file was not empty + * 0 - success + * + * + */ +int hfs_movedata (struct vnode *from_vp, struct vnode *to_vp) { + + struct cnode *from_cp; + struct cnode *to_cp; + struct hfsmount *hfsmp = NULL; + int error = 0; + int started_tr = 0; + int lockflags = 0; + int overflow_blocks; + int rsrc = 0; + + + /* Get the HFS pointers */ + from_cp = VTOC(from_vp); + to_cp = VTOC(to_vp); + hfsmp = VTOHFS(from_vp); + + /* Verify that neither source/dest file is open-unlinked */ + if (from_cp->c_flag & (C_DELETED | C_NOEXISTS)) { + error = EBUSY; + goto movedata_exit; + } + + if (to_cp->c_flag & (C_DELETED | C_NOEXISTS)) { + error = EBUSY; + goto movedata_exit; + } + + /* + * Verify the source file is not in use by anyone besides us. + * + * This function is typically invoked by a namespace handler + * process responding to a temporarily stalled system call. + * The FD that it is working off of is opened O_EVTONLY, so + * it really has no active usecounts (the kusecount from O_EVTONLY + * is subtracted from the total usecounts). + * + * As a result, we shouldn't have any active usecounts against + * this vnode when we go to check it below. + */ + if (vnode_isinuse(from_vp, 0)) { + error = EBUSY; + goto movedata_exit; + } + + if (from_cp->c_rsrc_vp == from_vp) { + rsrc = 1; + } + + /* + * We assume that the destination file is already empty. + * Verify that it is. 
+ */ + if (rsrc) { + if (to_cp->c_rsrcfork->ff_size > 0) { + error = EFBIG; + goto movedata_exit; + } + } + else { + if (to_cp->c_datafork->ff_size > 0) { + error = EFBIG; + goto movedata_exit; + } + } + + /* If the source has the rsrc open, make sure the destination is also the rsrc */ + if (rsrc) { + if (to_vp != to_cp->c_rsrc_vp) { + error = EINVAL; + goto movedata_exit; + } + } + else { + /* Verify that both forks are data forks */ + if (to_vp != to_cp->c_vp) { + error = EINVAL; + goto movedata_exit; + } + } + + /* + * See if the source file has overflow extents. If it doesn't, we don't + * need to call into MoveData, and the catalog will be enough. + */ + if (rsrc) { + overflow_blocks = overflow_extents(from_cp->c_rsrcfork); + } + else { + overflow_blocks = overflow_extents(from_cp->c_datafork); + } + + if ((error = hfs_start_transaction (hfsmp)) != 0) { + goto movedata_exit; + } + started_tr = 1; + + /* Lock the system files: catalog, extents, attributes */ + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_EXTENTS | SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); + + /* Copy over any catalog allocation data into the new spot. */ + if (rsrc) { + if ((error = hfs_move_fork (from_cp->c_rsrcfork, from_cp, to_cp->c_rsrcfork, to_cp))){ + hfs_systemfile_unlock(hfsmp, lockflags); + goto movedata_exit; + } + } + else { + if ((error = hfs_move_fork (from_cp->c_datafork, from_cp, to_cp->c_datafork, to_cp))) { + hfs_systemfile_unlock(hfsmp, lockflags); + goto movedata_exit; + } + } + + /* + * Note that because all we're doing is moving the extents around, we can + * probably do this in a single transaction: Each extent record (group of 8) + * is 64 bytes. A extent overflow B-Tree node is typically 4k. This means + * each node can hold roughly ~60 extent records == (480 extents). + * + * If a file was massively fragmented and had 20k extents, this means we'd + * roughly touch 20k/480 == 41 to 42 nodes, plus the index nodes, for half + * of the operation. 
(inserting or deleting). So if we're manipulating 80-100 + * nodes, this is basically 320k of data to write to the journal in + * a bad case. + */ + if (overflow_blocks != 0) { + if (rsrc) { + error = MoveData(hfsmp, from_cp->c_cnid, to_cp->c_cnid, 1); + } + else { + error = MoveData (hfsmp, from_cp->c_cnid, to_cp->c_cnid, 0); + } + } + + if (error) { + /* Reverse the operation. Copy the fork data back into the source */ + if (rsrc) { + hfs_move_fork (to_cp->c_rsrcfork, to_cp, from_cp->c_rsrcfork, from_cp); + } + else { + hfs_move_fork (to_cp->c_datafork, to_cp, from_cp->c_datafork, from_cp); + } + } + else { + struct cat_fork *src_data = NULL; + struct cat_fork *src_rsrc = NULL; + struct cat_fork *dst_data = NULL; + struct cat_fork *dst_rsrc = NULL; + + /* Touch the times*/ + to_cp->c_touch_acctime = TRUE; + to_cp->c_touch_chgtime = TRUE; + to_cp->c_touch_modtime = TRUE; + + from_cp->c_touch_acctime = TRUE; + from_cp->c_touch_chgtime = TRUE; + from_cp->c_touch_modtime = TRUE; + + hfs_touchtimes(hfsmp, to_cp); + hfs_touchtimes(hfsmp, from_cp); + + if (from_cp->c_datafork) { + src_data = &from_cp->c_datafork->ff_data; + } + if (from_cp->c_rsrcfork) { + src_rsrc = &from_cp->c_rsrcfork->ff_data; + } + + if (to_cp->c_datafork) { + dst_data = &to_cp->c_datafork->ff_data; + } + if (to_cp->c_rsrcfork) { + dst_rsrc = &to_cp->c_rsrcfork->ff_data; + } + + /* Update the catalog nodes */ + (void) cat_update(hfsmp, &from_cp->c_desc, &from_cp->c_attr, + src_data, src_rsrc); + + (void) cat_update(hfsmp, &to_cp->c_desc, &to_cp->c_attr, + dst_data, dst_rsrc); + + } + /* unlock the system files */ + hfs_systemfile_unlock(hfsmp, lockflags); + + +movedata_exit: + if (started_tr) { + hfs_end_transaction(hfsmp); + } + + return error; + +} + +/* + * Copy all of the catalog and runtime data in srcfork to dstfork. + * + * This allows us to maintain the invalid ranges across the movedata operation so + * we don't need to force all of the pending IO right now. 
In addition, we move all + * non overflow-extent extents into the destination here. + */ +static int hfs_move_fork (struct filefork *srcfork, struct cnode *src_cp, + struct filefork *dstfork, struct cnode *dst_cp) { + struct rl_entry *invalid_range; + int size = sizeof(struct HFSPlusExtentDescriptor); + size = size * kHFSPlusExtentDensity; + + /* If the dstfork has any invalid ranges, bail out */ + invalid_range = TAILQ_FIRST(&dstfork->ff_invalidranges); + if (invalid_range != NULL) { + return EFBIG; + } + + if (dstfork->ff_data.cf_size != 0 || dstfork->ff_data.cf_new_size != 0) { + return EFBIG; + } + + /* First copy the invalid ranges */ + while ((invalid_range = TAILQ_FIRST(&srcfork->ff_invalidranges))) { + off_t start = invalid_range->rl_start; + off_t end = invalid_range->rl_end; + + /* Remove it from the srcfork and add it to dstfork */ + rl_remove(start, end, &srcfork->ff_invalidranges); + rl_add(start, end, &dstfork->ff_invalidranges); + } + + /* + * Ignore the ff_union. We don't move symlinks or system files. + * Now copy the in-catalog extent information + */ + dstfork->ff_data.cf_size = srcfork->ff_data.cf_size; + dstfork->ff_data.cf_new_size = srcfork->ff_data.cf_new_size; + dstfork->ff_data.cf_vblocks = srcfork->ff_data.cf_vblocks; + dstfork->ff_data.cf_blocks = srcfork->ff_data.cf_blocks; + + /* just memcpy the whole array of extents to the new location. */ + memcpy (dstfork->ff_data.cf_extents, srcfork->ff_data.cf_extents, size); + + /* + * Copy the cnode attribute data. 
+ * + */ + src_cp->c_blocks -= srcfork->ff_data.cf_vblocks; + src_cp->c_blocks -= srcfork->ff_data.cf_blocks; + + dst_cp->c_blocks += srcfork->ff_data.cf_vblocks; + dst_cp->c_blocks += srcfork->ff_data.cf_blocks; + + /* Now delete the entries in the source fork */ + srcfork->ff_data.cf_size = 0; + srcfork->ff_data.cf_new_size = 0; + srcfork->ff_data.cf_union.cfu_bytesread = 0; + srcfork->ff_data.cf_vblocks = 0; + srcfork->ff_data.cf_blocks = 0; + + /* Zero out the old extents */ + bzero (srcfork->ff_data.cf_extents, size); + return 0; +} + + /* * cnode must be locked */ -__private_extern__ int hfs_fsync(struct vnode *vp, int waitfor, int fullsync, struct proc *p) { @@ -1747,7 +2198,6 @@ hfs_fsync(struct vnode *vp, int waitfor, int fullsync, struct proc *p) int wait; /* all other attributes (e.g. atime, etc.) */ int lockflag; int took_trunc_lock = 0; - boolean_t trunc_lock_exclusive = FALSE; /* * Applications which only care about data integrity rather than full @@ -1777,14 +2227,13 @@ hfs_fsync(struct vnode *vp, int waitfor, int fullsync, struct proc *p) } } else if (UBCINFOEXISTS(vp)) { hfs_unlock(cp); - hfs_lock_truncate(cp, trunc_lock_exclusive); + hfs_lock_truncate(cp, HFS_SHARED_LOCK); took_trunc_lock = 1; if (fp->ff_unallocblocks != 0) { - hfs_unlock_truncate(cp, trunc_lock_exclusive); + hfs_unlock_truncate(cp, 0); - trunc_lock_exclusive = TRUE; - hfs_lock_truncate(cp, trunc_lock_exclusive); + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK); } /* Don't hold cnode lock when calling into cluster layer. */ (void) cluster_push(vp, waitdata ? 
IO_SYNC : 0); @@ -1811,13 +2260,12 @@ hfs_fsync(struct vnode *vp, int waitfor, int fullsync, struct proc *p) goto datasync; } if (!TAILQ_EMPTY(&fp->ff_invalidranges)) { - if (!took_trunc_lock || trunc_lock_exclusive == FALSE) { + if (!took_trunc_lock || (cp->c_truncatelockowner == HFS_SHARED_OWNER)) { hfs_unlock(cp); - if (took_trunc_lock) - hfs_unlock_truncate(cp, trunc_lock_exclusive); - - trunc_lock_exclusive = TRUE; - hfs_lock_truncate(cp, trunc_lock_exclusive); + if (took_trunc_lock) { + hfs_unlock_truncate(cp, 0); + } + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK); hfs_lock(cp, HFS_FORCE_LOCK); took_trunc_lock = 1; } @@ -1848,7 +2296,7 @@ hfs_fsync(struct vnode *vp, int waitfor, int fullsync, struct proc *p) } datasync: if (took_trunc_lock) { - hfs_unlock_truncate(cp, trunc_lock_exclusive); + hfs_unlock_truncate(cp, 0); took_trunc_lock = 0; } /* @@ -1899,13 +2347,23 @@ hfs_fsync(struct vnode *vp, int waitfor, int fullsync, struct proc *p) * changes get to stable storage. */ if (fullsync) { - if (hfsmp->jnl) { - hfs_journal_flush(hfsmp); - } else { - retval = hfs_metasync_all(hfsmp); - /* XXX need to pass context! */ - VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NULL); - } + if (hfsmp->jnl) { + hfs_journal_flush(hfsmp, FALSE); + + if (journal_uses_fua(hfsmp->jnl)) { + /* + * the journal_flush did NOT issue a sync track cache command, + * and the fullsync indicates we are supposed to flush all cached + * data to the media, so issue the sync track cache command + * explicitly + */ + VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NULL); + } + } else { + retval = hfs_metasync_all(hfsmp); + /* XXX need to pass context! 
*/ + VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NULL); + } } } @@ -1914,7 +2372,7 @@ hfs_fsync(struct vnode *vp, int waitfor, int fullsync, struct proc *p) /* Sync an hfs catalog b-tree node */ -static int +int hfs_metasync(struct hfsmount *hfsmp, daddr64_t node, __unused struct proc *p) { vnode_t vp; @@ -1960,7 +2418,7 @@ hfs_metasync(struct hfsmount *hfsmp, daddr64_t node, __unused struct proc *p) * we rely on fsck_hfs to fix that up (which it can do without any loss * of data). */ -static int +int hfs_metasync_all(struct hfsmount *hfsmp) { int lockflags; @@ -2002,7 +2460,6 @@ hfs_btsync_callback(struct buf *bp, __unused void *dummy) } -__private_extern__ int hfs_btsync(struct vnode *vp, int sync_transaction) { @@ -2030,7 +2487,7 @@ hfs_btsync(struct vnode *vp, int sync_transaction) /* * Remove a directory. */ -static int +int hfs_vnop_rmdir(ap) struct vnop_rmdir_args /* { struct vnode *a_dvp; @@ -2044,6 +2501,9 @@ hfs_vnop_rmdir(ap) struct cnode *dcp = VTOC(dvp); struct cnode *cp = VTOC(vp); int error; + time_t orig_ctime; + + orig_ctime = VTOC(vp)->c_ctime; if (!S_ISDIR(cp->c_mode)) { return (ENOTDIR); @@ -2051,6 +2511,10 @@ hfs_vnop_rmdir(ap) if (dvp == vp) { return (EINVAL); } + + check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_DELETE_OP, NULL); + cp = VTOC(vp); + if ((error = hfs_lockpair(dcp, cp, HFS_EXCLUSIVE_LOCK))) { return (error); } @@ -2060,7 +2524,7 @@ hfs_vnop_rmdir(ap) hfs_unlockpair (dcp, cp); return ENOENT; } - error = hfs_removedir(dvp, vp, ap->a_cnp, 0); + error = hfs_removedir(dvp, vp, ap->a_cnp, 0, 0); hfs_unlockpair(dcp, cp); @@ -2072,9 +2536,9 @@ hfs_vnop_rmdir(ap) * * Both dvp and vp cnodes are locked */ -static int +int hfs_removedir(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, - int skip_reserve) + int skip_reserve, int only_unlink) { struct cnode *cp; struct cnode *dcp; @@ -2096,24 +2560,77 @@ hfs_removedir(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, if 
(cp->c_entries != 0) { return (ENOTEMPTY); } + + /* + * If the directory is open or in use (e.g. opendir() or current working + * directory for some process); wait for inactive/reclaim to actually + * remove cnode from the catalog. Both inactive and reclaim codepaths are capable + * of removing open-unlinked directories from the catalog, as well as getting rid + * of EAs still on the element. So change only_unlink to true, so that it will get + * cleaned up below. + * + * Otherwise, we can get into a weird old mess where the directory has C_DELETED, + * but it really means C_NOEXISTS because the item was actually removed from the + * catalog. Then when we try to remove the entry from the catalog later on, it won't + * really be there anymore. + */ + if (vnode_isinuse(vp, 0)) { + only_unlink = 1; + } - /* Check if we're removing the last link to an empty directory. */ + /* Deal with directory hardlinks */ if (cp->c_flag & C_HARDLINK) { - /* We could also return EBUSY here */ + /* + * Note that if we have a directory which was a hardlink at any point, + * its actual directory data is stored in the directory inode in the hidden + * directory rather than the leaf element(s) present in the namespace. + * + * If there are still other hardlinks to this directory, + * then we'll just eliminate this particular link and the vnode will still exist. + * If this is the last link to an empty directory, then we'll open-unlink the + * directory and it will be only tagged with C_DELETED (as opposed to C_NOEXISTS). + * + * We could also return EBUSY here. + */ + return hfs_unlink(hfsmp, dvp, vp, cnp, skip_reserve); } /* - * We want to make sure that if the directory has a lot of attributes, we process them - * in separate transactions to ensure we don't panic in the journal with a gigantic - * transaction. This means we'll let hfs_removefile deal with the directory, which generally - * follows the same codepath as open-unlinked files. 
Note that the last argument to - * hfs_removefile specifies that it is supposed to handle directories for this case. - */ - if ((hfsmp->hfs_attribute_vp != NULL) && - (cp->c_attr.ca_recflags & kHFSHasAttributesMask) != 0) { - - return hfs_removefile(dvp, vp, cnp, 0, 0, 1, NULL); + * In a few cases, we may want to allow the directory to persist in an + * open-unlinked state. If the directory is being open-unlinked (still has usecount + * references), or if it has EAs, or if it was being deleted as part of a rename, + * then we go ahead and move it to the hidden directory. + * + * If the directory is being open-unlinked, then we want to keep the catalog entry + * alive so that future EA calls and fchmod/fstat etc. do not cause issues later. + * + * If the directory had EAs, then we want to use the open-unlink trick so that the + * EA removal is not done in one giant transaction. Otherwise, it could cause a panic + * due to overflowing the journal. + * + * Finally, if it was deleted as part of a rename, we move it to the hidden directory + * in order to maintain rename atomicity. + * + * Note that the allow_dirs argument to hfs_removefile specifies that it is + * supposed to handle directories for this case. + */ + + if (((hfsmp->hfs_attribute_vp != NULL) && + ((cp->c_attr.ca_recflags & kHFSHasAttributesMask) != 0)) || + (only_unlink != 0)) { + + int ret = hfs_removefile(dvp, vp, cnp, 0, 0, 1, NULL, only_unlink); + /* + * Even though hfs_vnop_rename calls vnode_recycle for us on tvp we call + * it here just in case we were invoked by rmdir() on a directory that had + * EAs. To ensure that we start reclaiming the space as soon as possible, + * we call vnode_recycle on the directory. 
+ */ + vnode_recycle(vp); + + return ret; + } dcp->c_flag |= C_DIR_MODIFICATION; @@ -2155,7 +2672,7 @@ hfs_removedir(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, desc.cd_encoding = cp->c_encoding; desc.cd_hint = 0; - if (!hfs_valid_cnode(hfsmp, dvp, cnp, cp->c_fileid)) { + if (!hfs_valid_cnode(hfsmp, dvp, cnp, cp->c_fileid, NULL, &error)) { error = 0; goto out; } @@ -2199,16 +2716,8 @@ hfs_removedir(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, hfs_volupdate(hfsmp, VOL_RMDIR, (dcp->c_cnid == kHFSRootFolderID)); - /* - * directory open or in use (e.g. opendir() or current working - * directory for some process); wait for inactive to actually - * remove catalog entry - */ - if (vnode_isinuse(vp, 0)) { - cp->c_flag |= C_DELETED; - } else { - cp->c_flag |= C_NOEXISTS; - } + /* Mark C_NOEXISTS since the catalog entry is now gone */ + cp->c_flag |= C_NOEXISTS; out: dcp->c_flag &= ~C_DIR_MODIFICATION; wakeup((caddr_t)&dcp->c_flag); @@ -2224,7 +2733,7 @@ hfs_removedir(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, /* * Remove a file or link. */ -static int +int hfs_vnop_remove(ap) struct vnop_remove_args /* { struct vnode *a_dvp; @@ -2237,17 +2746,29 @@ hfs_vnop_remove(ap) struct vnode *dvp = ap->a_dvp; struct vnode *vp = ap->a_vp; struct cnode *dcp = VTOC(dvp); - struct cnode *cp = VTOC(vp); + struct cnode *cp; struct vnode *rvp = NULL; struct hfsmount *hfsmp = VTOHFS(vp); int error=0, recycle_rsrc=0; int drop_rsrc_vnode = 0; - int vref; + time_t orig_ctime; if (dvp == vp) { return (EINVAL); } + orig_ctime = VTOC(vp)->c_ctime; + if (!vnode_isnamedstream(vp)) { + error = check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_DELETE_OP, NULL); + if (error) { + // XXXdbg - decide on a policy for handling namespace handler failures! + // for now we just let them proceed. 
+ } + } + error = 0; + + cp = VTOC(vp); + /* * We need to grab the cnode lock on 'cp' before the lockpair() * to get an iocount on the rsrc fork BEFORE we enter hfs_removefile. @@ -2269,23 +2790,25 @@ hfs_vnop_remove(ap) if ((error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK))) { return (error); } + error = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE, TRUE); hfs_unlock(cp); if (error) { - /* We may have gotten a rsrc vp out even though we got an error back. */ + /* we may have gotten an rsrc vp even though we got an error */ if (rvp) { vnode_put(rvp); rvp = NULL; } - return error; + return (error); } drop_rsrc_vnode = 1; } /* Now that we may have an iocount on rvp, do the lock pair */ - hfs_lock_truncate(cp, TRUE); + + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK); if ((error = hfs_lockpair(dcp, cp, HFS_EXCLUSIVE_LOCK))) { - hfs_unlock_truncate(cp, TRUE); + hfs_unlock_truncate(cp, 0); /* drop the iocount on rvp if necessary */ if (drop_rsrc_vnode) { vnode_put (rvp); @@ -2302,20 +2825,27 @@ hfs_vnop_remove(ap) goto rm_done; } - error = hfs_removefile(dvp, vp, ap->a_cnp, ap->a_flags, 0, 0, rvp); - - // - // If the remove succeeded and it's an open-unlinked file that has - // a resource fork vnode that's not in use, we will want to recycle - // the rvp *after* we're done unlocking everything. Otherwise the - // resource vnode will keep a v_parent reference on this vnode which - // prevents it from going through inactive/reclaim which means that - // the disk space associated with this file won't get free'd until - // something forces the resource vnode to get recycled (and that can - // take a very long time). - // - if (error == 0 && (cp->c_flag & C_DELETED) && - (rvp) && !vnode_isinuse(rvp, 0)) { + error = hfs_removefile(dvp, vp, ap->a_cnp, ap->a_flags, 0, 0, rvp, 0); + + /* + * If the remove succeeded in deleting the file, then we may need to mark + * the resource fork for recycle so that it is reclaimed as quickly + * as possible. 
If it were not recycled quickly, then this resource fork + * vnode could keep a v_parent reference on the data fork, which prevents it + * from going through reclaim (by giving it extra usecounts), except in the force- + * unmount case. + * + * However, a caveat: we need to continue to supply resource fork + * access to open-unlinked files even if the resource fork is not open. This is + * a requirement for the compressed files work. Luckily, hfs_vgetrsrc will handle + * this already if the data fork has been re-parented to the hidden directory. + * + * As a result, all we really need to do here is mark the resource fork vnode + * for recycle. If it goes out of core, it can be brought in again if needed. + * If the cnode was instead marked C_NOEXISTS, then there wouldn't be any + * more work. + */ + if ((error == 0) && (rvp)) { recycle_rsrc = 1; } @@ -2326,15 +2856,11 @@ hfs_vnop_remove(ap) * truncate lock) */ rm_done: - hfs_unlock_truncate(cp, TRUE); + hfs_unlock_truncate(cp, 0); hfs_unlockpair(dcp, cp); if (recycle_rsrc) { - vref = vnode_ref(rvp); - if (vref == 0) { - /* vnode_ref could return an error, only release if we got a ref */ - vnode_rele(rvp); - } + /* inactive or reclaim on rvp will clean up the blocks from the rsrc fork */ vnode_recycle(rvp); } @@ -2376,24 +2902,24 @@ hfs_removefile_callback(struct buf *bp, void *hfsmp) { * * Requires cnode and truncate locks to be held. 
*/ -static int +int hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, - int flags, int skip_reserve, int allow_dirs, struct vnode *rvp) + int flags, int skip_reserve, int allow_dirs, + struct vnode *rvp, int only_unlink) { struct cnode *cp; struct cnode *dcp; struct hfsmount *hfsmp; struct cat_desc desc; struct timeval tv; - vfs_context_t ctx = cnp->cn_context; int dataforkbusy = 0; int rsrcforkbusy = 0; - int truncated = 0; int lockflags; int error = 0; int started_tr = 0; int isbigfile = 0, defer_remove=0, isdir=0; - + int update_vh = 0; + cp = VTOC(vp); dcp = VTOC(dvp); hfsmp = VTOHFS(vp); @@ -2403,7 +2929,7 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, return (0); } - if (!hfs_valid_cnode(hfsmp, dvp, cnp, cp->c_fileid)) { + if (!hfs_valid_cnode(hfsmp, dvp, cnp, cp->c_fileid, NULL, &error)) { return 0; } @@ -2485,6 +3011,11 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, (cp->c_attr.ca_recflags & kHFSHasAttributesMask) != 0) { defer_remove = 1; } + + /* If we are explicitly told to only unlink item and move to hidden dir, then do it */ + if (only_unlink) { + defer_remove = 1; + } /* * Carbon semantics prohibit deleting busy files. @@ -2502,9 +3033,16 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, if (hfsmp->hfs_flags & HFS_QUOTAS) (void)hfs_getinoquota(cp); #endif /* QUOTA */ - - /* Check if we need a ubc_setsize. */ - if (isdir == 0 && (!dataforkbusy || !rsrcforkbusy)) { + + /* + * Do a ubc_setsize to indicate we need to wipe contents if: + * 1) item is a regular file. + * 2) Neither fork is busy AND we are not told to unlink this. + * + * We need to check for the defer_remove since it can be set without + * having a busy data or rsrc fork + */ + if (isdir == 0 && (!dataforkbusy || !rsrcforkbusy) && (defer_remove == 0)) { /* * A ubc_setsize can cause a pagein so defer it * until after the cnode lock is dropped. 
The @@ -2525,40 +3063,46 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, started_tr = 1; // XXXdbg - if we're journaled, kill any dirty symlink buffers - if (hfsmp->jnl && vnode_islnk(vp)) + if (hfsmp->jnl && vnode_islnk(vp) && (defer_remove == 0)) { buf_iterate(vp, hfs_removefile_callback, BUF_SKIP_NONLOCKED, (void *)hfsmp); + } /* - * Truncate any non-busy forks. Busy forks will + * Prepare to truncate any non-busy forks. Busy forks will * get truncated when their vnode goes inactive. * Note that we will only enter this region if we * can avoid creating an open-unlinked file. If * either region is busy, we will have to create an open * unlinked file. - * Since we're already inside a transaction, - * tell hfs_truncate to skip the ubc_setsize. + * + * Since we are deleting the file, we need to stagger the runtime + * modifications to do things in such a way that a crash won't + * result in us getting overlapped extents or any other + * bad inconsistencies. As such, we call prepare_release_storage + * which updates the UBC, updates quota information, and releases + * any loaned blocks that belong to this file. No actual + * truncation or bitmap manipulation is done until *AFTER* + * the catalog record is removed. 
*/ - if (isdir == 0 && (!dataforkbusy && !rsrcforkbusy)) { - /* - * Note that 5th argument to hfs_truncate indicates whether or not - * hfs_update calls should be suppressed in call to do_hfs_truncate - */ + if (isdir == 0 && (!dataforkbusy && !rsrcforkbusy) && (only_unlink == 0)) { + if (!dataforkbusy && !isbigfile && cp->c_datafork->ff_blocks != 0) { - /* skip update in hfs_truncate */ - error = hfs_truncate(vp, (off_t)0, IO_NDELAY, 1, 1, ctx); - if (error) + + error = hfs_prepare_release_storage (hfsmp, vp); + if (error) { goto out; - truncated = 1; + } + update_vh = 1; } if (!rsrcforkbusy && rvp) { - /* skip update in hfs_truncate */ - error = hfs_truncate(rvp, (off_t)0, IO_NDELAY, 1, 1, ctx); - if (error) + error = hfs_prepare_release_storage (hfsmp, rvp); + if (error) { goto out; - truncated = 1; + } + update_vh = 1; } } - + /* * Protect against a race with rename by using the component * name passed in and parent id from dvp (instead of using @@ -2658,15 +3202,15 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, if (error) goto out; - } else /* Not busy */ { - - if (cp->c_blocks > 0) { - printf("hfs_remove: attempting to delete a non-empty file %s\n", - cp->c_desc.cd_nameptr); - error = EBUSY; - goto out; - } - + } + else /* Not busy */ { + +#if QUOTA + off_t savedbytes; + int blksize = hfsmp->blockSize; +#endif + u_int32_t fileid = cp->c_fileid; + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_ATTRIBUTE | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); if (!skip_reserve) { if ((error = cat_preflight(hfsmp, CAT_DELETE, NULL, 0))) { @@ -2674,30 +3218,14 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, goto out; } } - + error = cat_delete(hfsmp, &desc, &cp->c_attr); - - if (error && error != ENXIO && error != ENOENT && truncated) { - if ((cp->c_datafork && cp->c_datafork->ff_size != 0) || - (cp->c_rsrcfork && cp->c_rsrcfork->ff_size != 0)) { - off_t data_size = 0; - off_t rsrc_size = 0; - if 
(cp->c_datafork) { - data_size = cp->c_datafork->ff_size; - } - if (cp->c_rsrcfork) { - rsrc_size = cp->c_rsrcfork->ff_size; - } - printf("hfs: remove: couldn't delete a truncated file (%s)" - "(error %d, data sz %lld; rsrc sz %lld)", - cp->c_desc.cd_nameptr, error, data_size, rsrc_size); - hfs_mark_volume_inconsistent(hfsmp); - } else { - printf("hfs: remove: strangely enough, deleting truncated file %s (%d) got err %d\n", - cp->c_desc.cd_nameptr, cp->c_attr.ca_fileid, error); - } + + if (error && error != ENXIO && error != ENOENT) { + printf("hfs_removefile: deleting file %s (%d), err: %d\n", + cp->c_desc.cd_nameptr, cp->c_attr.ca_fileid, error); } - + if (error == 0) { /* Update the parent directory */ if (dcp->c_entries > 0) @@ -2708,26 +3236,65 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, (void) cat_update(hfsmp, &dcp->c_desc, &dcp->c_attr, NULL, NULL); } hfs_systemfile_unlock(hfsmp, lockflags); - if (error) + if (error) { goto out; - + } + + /* + * Now that we've wiped out the catalog record, the file effectively doesn't + * exist anymore. So update the quota records to reflect the loss of the + * data fork and the resource fork. + */ #if QUOTA - if (hfsmp->hfs_flags & HFS_QUOTAS) + if (cp->c_datafork->ff_blocks > 0) { + savedbytes = ((off_t)cp->c_datafork->ff_blocks * (off_t)blksize); + (void) hfs_chkdq(cp, (int64_t)-(savedbytes), NOCRED, 0); + } + + if (cp->c_rsrcfork && (cp->c_rsrcfork->ff_blocks > 0)) { + savedbytes = ((off_t)cp->c_rsrcfork->ff_blocks * (off_t)blksize); + (void) hfs_chkdq(cp, (int64_t)-(savedbytes), NOCRED, 0); + } + + if (hfsmp->hfs_flags & HFS_QUOTAS) { (void)hfs_chkiq(cp, -1, NOCRED, 0); -#endif /* QUOTA */ - + } +#endif + + + /* + * If we didn't get any errors deleting the catalog entry, then go ahead + * and release the backing store now. The filefork pointers are still valid. 
+ */ + error = hfs_release_storage (hfsmp, cp->c_datafork, cp->c_rsrcfork, fileid); + + if (error) { + /* + * If we encountered an error updating the extents and bitmap, + * mark the volume inconsistent. At this point, the catalog record has + * already been deleted, so we can't recover it at this point. We need + * to proceed and update the volume header and mark the cnode C_NOEXISTS. + * The subsequent fsck should be able to recover the free space for us. + */ + hfs_mark_volume_inconsistent(hfsmp); + } + else { + /* reset update_vh to 0, since hfs_release_storage should have done it for us */ + update_vh = 0; + } + cp->c_flag |= C_NOEXISTS; cp->c_flag &= ~C_DELETED; - truncated = 0; // because the catalog entry is gone - + cp->c_touch_chgtime = TRUE; /* XXX needed ? */ --cp->c_linkcount; - + /* * We must never get a directory if we're in this else block. We could * accidentally drop the number of files in the volume header if we did. */ hfs_volupdate(hfsmp, VOL_RMFILE, (dcp->c_cnid == kHFSRootFolderID)); + } /* @@ -2744,14 +3311,14 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, if (error) { cp->c_flag &= ~C_DELETED; } - - /* Commit the truncation to the catalog record */ - if (truncated) { - cp->c_flag |= C_FORCEUPDATE; - cp->c_touch_chgtime = TRUE; - cp->c_touch_modtime = TRUE; - (void) hfs_update(vp, 0); - } + + if (update_vh) { + /* + * If we bailed out earlier, we may need to update the volume header + * to deal with the borrowed blocks accounting. + */ + hfs_volupdate (hfsmp, VOL_UPDATE, 0); + } if (started_tr) { hfs_end_transaction(hfsmp); @@ -2789,7 +3356,6 @@ replace_desc(struct cnode *cp, struct cat_desc *cdp) cdp->cd_flags &= ~CD_HASBUF; } - /* * Rename a cnode. * @@ -2813,7 +3379,7 @@ replace_desc(struct cnode *cp, struct cat_desc *cdp) * been locked. By taking the rsrc fork vnodes up front we ensure that they * cannot be recycled, and that the situation mentioned above cannot happen. 
*/ -static int +int hfs_vnop_rename(ap) struct vnop_rename_args /* { struct vnode *a_fdvp; @@ -2849,9 +3415,21 @@ hfs_vnop_rename(ap) int took_trunc_lock = 0; int lockflags; int error; - int recycle_rsrc = 0; + time_t orig_from_ctime, orig_to_ctime; + + orig_from_ctime = VTOC(fvp)->c_ctime; + if (tvp && VTOC(tvp)) { + orig_to_ctime = VTOC(tvp)->c_ctime; + } else { + orig_to_ctime = ~0; + } + check_for_tracked_file(fvp, orig_from_ctime, NAMESPACE_HANDLER_RENAME_OP, NULL); + if (tvp && VTOC(tvp)) { + check_for_tracked_file(tvp, orig_to_ctime, NAMESPACE_HANDLER_DELETE_OP, NULL); + } + /* * Before grabbing the four locks, we may need to get an iocount on the resource fork * vnodes in question, just like hfs_vnop_remove. If fvp and tvp are not @@ -2867,16 +3445,15 @@ hfs_vnop_rename(ap) if ((error = hfs_lock (VTOC(fvp), HFS_EXCLUSIVE_LOCK))) { return (error); } - /* - * We care if we race against rename/delete with this cnode, so we'll - * error out if this file becomes open-unlinked during this call. + * We care if we race against rename/delete with this cp, so we'll error out + * if the file becomes open-unlinked during this call. */ error = hfs_vgetrsrc(VTOHFS(fvp), fvp, &fvp_rsrc, TRUE, TRUE); hfs_unlock (VTOC(fvp)); if (error) { if (fvp_rsrc) { - vnode_put (fvp_rsrc); + vnode_put(fvp_rsrc); } return error; } @@ -2890,7 +3467,6 @@ hfs_vnop_rename(ap) */ if (hfs_lock (VTOC(tvp), HFS_EXCLUSIVE_LOCK) == 0) { tcp = VTOC(tvp); - /* * We only care if we get an open-unlinked file on the dst so we * know to null out tvp/tcp to make the rename operation act @@ -2898,18 +3474,19 @@ hfs_vnop_rename(ap) * namespace already it's fine to do this. If this is true, then * make sure to unlock the cnode and drop the iocount only after the unlock. */ + error = hfs_vgetrsrc(VTOHFS(tvp), tvp, &tvp_rsrc, TRUE, TRUE); hfs_unlock (tcp); if (error) { /* - * Since we specify TRUE for error-on-unlinked in hfs_vgetrsrc, - * we can get a rsrc fork vp even if it returns an error. 
+ * Since we specify TRUE for error_on_unlinked in hfs_vgetrsrc, + * we can get a rsrc fork vnode even if it returns an error. */ tcp = NULL; tvp = NULL; if (tvp_rsrc) { vnode_put (tvp_rsrc); - tvp_rsrc = NULLVP; + tvp_rsrc = NULL; } /* just bypass truncate lock and act as if we never got tcp/tvp */ goto retry; @@ -2919,7 +3496,7 @@ hfs_vnop_rename(ap) /* When tvp exists, take the truncate lock for hfs_removefile(). */ if (tvp && (vnode_isreg(tvp) || vnode_islnk(tvp))) { - hfs_lock_truncate(VTOC(tvp), TRUE); + hfs_lock_truncate(VTOC(tvp), HFS_EXCLUSIVE_LOCK); took_trunc_lock = 1; } @@ -2928,7 +3505,7 @@ hfs_vnop_rename(ap) HFS_EXCLUSIVE_LOCK, &error_cnode); if (error) { if (took_trunc_lock) { - hfs_unlock_truncate(VTOC(tvp), TRUE); + hfs_unlock_truncate(VTOC(tvp), 0); took_trunc_lock = 0; } /* @@ -2974,21 +3551,22 @@ hfs_vnop_rename(ap) * the parent/child relationship with fdcp and tdcp, as well as the * component name of the target cnodes. */ - if ((fcp->c_flag & (C_NOEXISTS | C_DELETED)) || !hfs_valid_cnode(hfsmp, fdvp, fcnp, fcp->c_fileid)) { + if ((fcp->c_flag & (C_NOEXISTS | C_DELETED)) || !hfs_valid_cnode(hfsmp, fdvp, fcnp, fcp->c_fileid, NULL, &error)) { error = ENOENT; goto out; } - if (tcp && ((tcp->c_flag & (C_NOEXISTS | C_DELETED)) || !hfs_valid_cnode(hfsmp, tdvp, tcnp, tcp->c_fileid))) { + if (tcp && ((tcp->c_flag & (C_NOEXISTS | C_DELETED)) || !hfs_valid_cnode(hfsmp, tdvp, tcnp, tcp->c_fileid, NULL, &error))) { // // hmm, the destination vnode isn't valid any more. // in this case we can just drop him and pretend he // never existed in the first place. // if (took_trunc_lock) { - hfs_unlock_truncate(VTOC(tvp), TRUE); - took_trunc_lock = 0; + hfs_unlock_truncate(VTOC(tvp), 0); + took_trunc_lock = 0; } + error = 0; hfs_unlockfour(fdcp, fcp, tdcp, tcp); @@ -3186,7 +3764,33 @@ hfs_vnop_rename(ap) got_cookie = 1; /* - * If the destination exists then it may need to be removed. + * If the destination exists then it may need to be removed. 
+ * + * Due to HFS's locking system, we should always move the + * existing 'tvp' element to the hidden directory in hfs_vnop_rename. + * Because the VNOP_LOOKUP call enters and exits the filesystem independently + * of the actual vnop that it was trying to do (stat, link, readlink), + * we must release the cnode lock of that element during the interim to + * do MAC checking, vnode authorization, and other calls. In that time, + * the item can be deleted (or renamed over). However, only in the rename + * case is it inappropriate to return ENOENT from any of those calls. Either + * the call should return information about the old element (stale), or get + * information about the newer element that we are about to write in its place. + * + * HFS lookup has been modified to detect a rename and re-drive its + * lookup internally. For other calls that have already succeeded in + * their lookup call and are waiting to acquire the cnode lock in order + * to proceed, that cnode lock will not fail due to the cnode being marked + * C_NOEXISTS, because it won't have been marked as such. It will only + * have C_DELETED. Thus, they will simply act on the stale open-unlinked + * element. All future callers will get the new element. + * + * To implement this behavior, we pass the "only_unlink" argument to + * hfs_removefile and hfs_removedir. This will result in the vnode acting + * as though it is open-unlinked. Additionally, when we are done moving the + * element to the hidden directory, we vnode_recycle the target so that it is + * reclaimed as soon as possible. Reclaim and inactive are both + * capable of clearing out unused blocks for an open-unlinked file or dir. 
*/ if (tvp) { /* @@ -3209,28 +3813,54 @@ hfs_vnop_rename(ap) } } - if (vnode_isdir(tvp)) - error = hfs_removedir(tdvp, tvp, tcnp, HFSRM_SKIP_RESERVE); - else { - error = hfs_removefile(tdvp, tvp, tcnp, 0, HFSRM_SKIP_RESERVE, 0, tvp_rsrc); - - /* - * If the destination file had a rsrc fork vnode, it may have been cleaned up - * in hfs_removefile if it was not busy (had no usecounts). This is possible - * because we grabbed the iocount on the rsrc fork safely at the beginning - * of the function before we did the lockfour. However, we may still need - * to take action to prevent block leaks, so aggressively recycle the vnode - * if possible. The vnode cannot be recycled because we hold an iocount on it. + + if (vnode_isdir(tvp)) { + /* + * hfs_removedir will eventually call hfs_removefile on the directory + * we're working on, because only hfs_removefile does the renaming of the + * item to the hidden directory. The directory will stay around in the + * hidden directory with C_DELETED until it gets an inactive or a reclaim. + * That way, we can destroy all of the EAs as needed and allow new ones to be + * written. */ - - if ((error == 0) && (tcp->c_flag & C_DELETED) && tvp_rsrc && !vnode_isinuse(tvp_rsrc, 0)) { - recycle_rsrc = 1; - } + error = hfs_removedir(tdvp, tvp, tcnp, HFSRM_SKIP_RESERVE, 1); + } + else { + error = hfs_removefile(tdvp, tvp, tcnp, 0, HFSRM_SKIP_RESERVE, 0, tvp_rsrc, 1); + + /* + * If the destination file had a resource fork vnode, then we need to get rid of + * its blocks when there are no more references to it. Because the call to + * hfs_removefile above always open-unlinks things, we need to force an inactive/reclaim + * on the resource fork vnode, in order to prevent block leaks. Otherwise, + * the resource fork vnode could prevent the data fork vnode from going out of scope + * because it holds a v_parent reference on it. So we mark it for termination + * with a call to vnode_recycle. 
hfs_vnop_reclaim has been modified so that it + * can clean up the blocks of open-unlinked files and resource forks. + * + * We can safely call vnode_recycle on the resource fork because we took an iocount + * reference on it at the beginning of the function. + */ + + if ((error == 0) && (tcp->c_flag & C_DELETED) && (tvp_rsrc)) { + vnode_recycle(tvp_rsrc); + } } - if (error) + if (error) { goto out; + } + tvp_deleted = 1; + + /* Mark 'tcp' as being deleted due to a rename */ + tcp->c_flag |= C_RENAMED; + + /* + * Aggressively mark tvp/tcp for termination to ensure that we recover all blocks + * as quickly as possible. + */ + vnode_recycle(tvp); } skip_rm: /* @@ -3268,6 +3898,11 @@ hfs_vnop_rename(ap) replace_desc(fcp, &out_desc); fcp->c_parentcnid = tdcp->c_fileid; fcp->c_hint = 0; + + /* Now indicate this cnode needs to have date-added written to the finderinfo */ + fcp->c_flag |= C_NEEDS_DATEADDED; + (void) hfs_update (fvp, 0); + hfs_volupdate(hfsmp, vnode_isdir(fvp) ? VOL_RMDIR : VOL_RMFILE, (fdcp->c_cnid == kHFSRootFolderID)); @@ -3327,28 +3962,12 @@ hfs_vnop_rename(ap) wakeup((caddr_t)&tdcp->c_flag); } - if (took_trunc_lock) - hfs_unlock_truncate(VTOC(tvp), TRUE); + if (took_trunc_lock) { + hfs_unlock_truncate(VTOC(tvp), 0); + } hfs_unlockfour(fdcp, fcp, tdcp, tcp); - /* - * Now that we've dropped all of the locks, we need to force an inactive and a recycle - * on the old destination's rsrc fork to prevent a leak of its blocks. Note that - * doing the ref/rele is to twiddle the VL_NEEDINACTIVE bit of the vnode's flags, so that - * on the last vnode_put for this vnode, we will force inactive to get triggered. - * We hold an iocount from the beginning of this function so we know it couldn't have been - * recycled already. 
- */ - if (recycle_rsrc) { - int vref; - vref = vnode_ref(tvp_rsrc); - if (vref == 0) { - vnode_rele(tvp_rsrc); - } - vnode_recycle(tvp_rsrc); - } - /* Now vnode_put the resource forks vnodes if necessary */ if (tvp_rsrc) { vnode_put(tvp_rsrc); @@ -3368,7 +3987,7 @@ hfs_vnop_rename(ap) /* * Make a directory. */ -static int +int hfs_vnop_mkdir(struct vnop_mkdir_args *ap) { /***** HACK ALERT ********/ @@ -3380,7 +3999,7 @@ hfs_vnop_mkdir(struct vnop_mkdir_args *ap) /* * Create a symbolic link. */ -static int +int hfs_vnop_symlink(struct vnop_symlink_args *ap) { struct vnode **vpp = ap->a_vpp; @@ -3456,7 +4075,7 @@ hfs_vnop_symlink(struct vnop_symlink_args *ap) /* hfs_removefile() requires holding the truncate lock */ hfs_unlock(cp); - hfs_lock_truncate(cp, TRUE); + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK); hfs_lock(cp, HFS_FORCE_LOCK); if (hfs_start_transaction(hfsmp) != 0) { @@ -3465,8 +4084,8 @@ hfs_vnop_symlink(struct vnop_symlink_args *ap) goto out; } - (void) hfs_removefile(dvp, vp, ap->a_cnp, 0, 0, 0, NULL); - hfs_unlock_truncate(cp, TRUE); + (void) hfs_removefile(dvp, vp, ap->a_cnp, 0, 0, 0, NULL, 0); + hfs_unlock_truncate(cp, 0); goto out; } @@ -3562,7 +4181,7 @@ typedef union { * If the directory is marked as deleted-but-in-use (cp->c_flag & C_DELETED), * do NOT synthesize entries for "." and "..". */ -static int +int hfs_vnop_readdir(ap) struct vnop_readdir_args /* { vnode_t a_vp; @@ -3601,11 +4220,23 @@ hfs_vnop_readdir(ap) /* Sanity check the uio data. */ if (uio_iovcnt(uio) > 1) return (EINVAL); + + if (VTOC(vp)->c_flags & UF_COMPRESSED) { + int compressed = hfs_file_is_compressed(VTOC(vp), 0); /* 0 == take the cnode lock */ + if (VTOCMP(vp) != NULL && !compressed) { + error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP); + if (error) { + return error; + } + } + } + + cp = VTOC(vp); + hfsmp = VTOHFS(vp); + /* Note that the dirhint calls require an exclusive lock. 
*/ if ((error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) return (error); - cp = VTOC(vp); - hfsmp = VTOHFS(vp); /* Pick up cnid hint (if any). */ if (nfs_cookies) { @@ -3741,7 +4372,7 @@ hfs_vnop_readdir(ap) if (index == 0) { dirhint->dh_threadhint = cp->c_dirthreadhint; - } + } else { /* * If we have a non-zero index, there is a possibility that during the last @@ -3822,7 +4453,7 @@ hfs_vnop_readdir(ap) /* * Read contents of a symbolic link. */ -static int +int hfs_vnop_readlink(ap) struct vnop_readlink_args /* { struct vnode *a_vp; @@ -3845,7 +4476,6 @@ hfs_vnop_readlink(ap) /* Zero length sym links are not allowed */ if (fp->ff_size == 0 || fp->ff_size > MAXPATHLEN) { - printf("hfs: zero length symlink on fileid %d\n", cp->c_fileid); error = EINVAL; goto exit; } @@ -3907,7 +4537,7 @@ hfs_vnop_readlink(ap) /* * Get configurable pathname variables. */ -static int +int hfs_vnop_pathconf(ap) struct vnop_pathconf_args /* { struct vnode *a_vp; @@ -3925,9 +4555,9 @@ hfs_vnop_pathconf(ap) break; case _PC_NAME_MAX: if (VTOHFS(ap->a_vp)->hfs_flags & HFS_STANDARD) - *ap->a_retval = kHFSMaxFileNameChars; /* 255 */ + *ap->a_retval = kHFSMaxFileNameChars; /* 31 */ else - *ap->a_retval = kHFSPlusMaxFileNameChars; /* 31 */ + *ap->a_retval = kHFSPlusMaxFileNameChars; /* 255 */ break; case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; /* 1024 */ @@ -3942,7 +4572,10 @@ hfs_vnop_pathconf(ap) *ap->a_retval = 200112; /* _POSIX_NO_TRUNC */ break; case _PC_NAME_CHARS_MAX: - *ap->a_retval = kHFSPlusMaxFileNameChars; + if (VTOHFS(ap->a_vp)->hfs_flags & HFS_STANDARD) + *ap->a_retval = kHFSMaxFileNameChars; /* 31 */ + else + *ap->a_retval = kHFSPlusMaxFileNameChars; /* 255 */ break; case _PC_CASE_SENSITIVE: if (VTOHFS(ap->a_vp)->hfs_flags & HFS_CASE_SENSITIVE) @@ -3959,6 +4592,10 @@ hfs_vnop_pathconf(ap) else *ap->a_retval = 64; /* number of bits to store max file size */ break; + case _PC_XATTR_SIZE_BITS: + /* Number of bits to store maximum extended attribute size */ + *ap->a_retval = 
HFS_XATTR_SIZE_BITS; + break; default: return (EINVAL); } @@ -3975,7 +4612,6 @@ hfs_vnop_pathconf(ap) * * The cnode must be locked exclusive */ -__private_extern__ int hfs_update(struct vnode *vp, __unused int waitfor) { @@ -4040,28 +4676,50 @@ hfs_update(struct vnode *vp, __unused int waitfor) return error; } - /* - * For files with invalid ranges (holes) the on-disk - * field representing the size of the file (cf_size) - * must be no larger than the start of the first hole. + /* + * Modify the values passed to cat_update based on whether or not + * the file has invalid ranges or borrowed blocks. */ - if (dataforkp && !TAILQ_EMPTY(&cp->c_datafork->ff_invalidranges)) { + if (dataforkp) { + off_t numbytes = 0; + + /* copy the datafork into a temporary copy so we don't pollute the cnode's */ bcopy(dataforkp, &datafork, sizeof(datafork)); - datafork.cf_size = TAILQ_FIRST(&cp->c_datafork->ff_invalidranges)->rl_start; dataforkp = &datafork; - } else if (dataforkp && (cp->c_datafork->ff_unallocblocks != 0)) { - // always make sure the block count and the size - // of the file match the number of blocks actually - // allocated to the file on disk - bcopy(dataforkp, &datafork, sizeof(datafork)); - // make sure that we don't assign a negative block count - if (cp->c_datafork->ff_blocks < cp->c_datafork->ff_unallocblocks) { - panic("hfs: ff_blocks %d is less than unalloc blocks %d\n", - cp->c_datafork->ff_blocks, cp->c_datafork->ff_unallocblocks); + + /* + * If there are borrowed blocks, ensure that they are subtracted + * from the total block count before writing the cnode entry to disk. + * Only extents that have actually been marked allocated in the bitmap + * should be reflected in the total block count for this fork. 
+ */ + if (cp->c_datafork->ff_unallocblocks != 0) { + // make sure that we don't assign a negative block count + if (cp->c_datafork->ff_blocks < cp->c_datafork->ff_unallocblocks) { + panic("hfs: ff_blocks %d is less than unalloc blocks %d\n", + cp->c_datafork->ff_blocks, cp->c_datafork->ff_unallocblocks); + } + + /* Also cap the LEOF to the total number of bytes that are allocated. */ + datafork.cf_blocks = (cp->c_datafork->ff_blocks - cp->c_datafork->ff_unallocblocks); + datafork.cf_size = datafork.cf_blocks * HFSTOVCB(hfsmp)->blockSize; + } + + /* + * For files with invalid ranges (holes) the on-disk + * field representing the size of the file (cf_size) + * must be no larger than the start of the first hole. + * However, note that if the first invalid range exists + * solely within borrowed blocks, then our LEOF and block + * count should both be zero. As a result, set it to the + * min of the current cf_size and the start of the first + * invalid range, because it may have already been reduced + * to zero by the borrowed blocks check above. + */ + if (!TAILQ_EMPTY(&cp->c_datafork->ff_invalidranges)) { + numbytes = TAILQ_FIRST(&cp->c_datafork->ff_invalidranges)->rl_start; + datafork.cf_size = MIN((numbytes), (datafork.cf_size)); } - datafork.cf_blocks = (cp->c_datafork->ff_blocks - cp->c_datafork->ff_unallocblocks); - datafork.cf_size = datafork.cf_blocks * HFSTOVCB(hfsmp)->blockSize; - dataforkp = &datafork; } /* @@ -4098,7 +4756,7 @@ hfs_update(struct vnode *vp, __unused int waitfor) * Allocate a new node * Note - Function does not create and return a vnode for whiteout creation. 
*/ -static int +int hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct vnode_attr *vap, vfs_context_t ctx) { @@ -4113,16 +4771,19 @@ hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, int error, started_tr = 0; enum vtype vnodetype; int mode; + int newvnode_flags = 0; + int nocache = 0; + u_int32_t gnv_flags = 0; if ((error = hfs_lock(VTOC(dvp), HFS_EXCLUSIVE_LOCK))) return (error); /* set the cnode pointer only after successfully acquiring lock */ dcp = VTOC(dvp); - + /* Don't allow creation of new entries in open-unlinked directories */ - if ((error = hfs_checkdeleted (dcp))) { - hfs_unlock (dcp); + if ((error = hfs_checkdeleted(dcp))) { + hfs_unlock(dcp); return error; } @@ -4139,6 +4800,13 @@ hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, vnodetype = VREG; mode = MAKEIMODE(vnodetype, vap->va_mode); +#if CONFIG_PROTECT + /* If we're creating a regular file on a CP filesystem, then delay caching */ + if ((vnodetype == VREG ) && (cp_fs_protected (VTOVFS(dvp)))) { + nocache = 1; + } +#endif + /* Check if were out of usable disk space. 
*/ if ((hfs_freeblks(hfsmp, 1) == 0) && (vfs_context_suser(ctx) != 0)) { error = ENOSPC; @@ -4169,7 +4837,7 @@ hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, VATTR_SET_SUPPORTED(vap, va_flags); attr.ca_flags = vap->va_flags; } - + /* * HFS+ only: all files get ThreadExists * HFSX only: dirs get HasFolderCount @@ -4183,6 +4851,9 @@ hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, } } + /* Add the date added to the item */ + hfs_write_dateadded (&attr, attr.ca_atime); + attr.ca_uid = vap->va_uid; attr.ca_gid = vap->va_gid; VATTR_SET_SUPPORTED(vap, va_mode); @@ -4282,6 +4953,11 @@ hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, /* Do not create vnode for whiteouts */ if (S_ISWHT(mode)) { goto exit; + } + + gnv_flags |= GNV_CREATE; + if (nocache) { + gnv_flags |= GNV_NOCACHE; } /* @@ -4297,15 +4973,72 @@ hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, * try to create a new vnode, and then end up reclaiming another shadow vnode to * create the new one. However, if everything is working properly, this should * be a non-issue as we would never enter that reclaim codepath. - * + * * The cnode is locked on successful return. */ - error = hfs_getnewvnode(hfsmp, dvp, cnp, &out_desc, GNV_CREATE, &attr, NULL, &tvp); + error = hfs_getnewvnode(hfsmp, dvp, cnp, &out_desc, gnv_flags, &attr, + NULL, &tvp, &newvnode_flags); if (error) goto exit; cp = VTOC(tvp); *vpp = tvp; + +#if CONFIG_PROTECT + error = cp_entry_create_keys(cp); + /* + * If we fail to create keys, then do NOT allow this vnode to percolate out into the + * namespace. Delete it and return the errno that cp_entry_create_keys generated. + * Luckily, we can do this without issues because the entry was newly created + * and we're still holding the directory cnode lock. 
Because we prevented it from + * getting inserted into the namecache upon vnode creation, all accesses to this file + * would have to go through the directory, whose lock we are still holding. + */ + if (error) { + /* + * If we fail to remove/recycle the item here, we can't do much about it. Log + * a message to the console and then we can backtrack it. The ultimate error + * that will get emitted to userland will be from the failure to create the EA blob. + */ + int err = hfs_removefile (dvp, tvp, cnp, 0, 0, 0, NULL, 0); + if (err) { + printf("hfs_makenode: removefile failed (%d) for CP file %p\n", err, tvp); + } + hfs_unlock (cp); + err = vnode_recycle (tvp); + if (err) { + printf("hfs_makenode: vnode_recycle failed (%d) for CP file %p\n", err, tvp); + } + /* Drop the iocount on the new vnode to force reclamation/recycling */ + vnode_put (tvp); + cp = NULL; + *vpp = NULL; + } + else { + /* insert item into name cache if it wasn't already inserted.*/ + if (nocache) { + cache_enter (dvp, tvp, cnp); + } + } + +#endif +/* + * If CONFIG_PROTECT is not enabled, then all items will get automatically added into + * the namecache, as nocache will be set to 0. + */ + +#if QUOTA + /* + * Once we create this vnode, we need to initialize its quota data + * structures, if necessary. We know that it is OK to just go ahead and + * initialize because we've already validated earlier (through the hfs_quotacheck + * function) to see if creating this cnode/vnode would cause us to go over quota. + */ + if (hfsmp->hfs_flags & HFS_QUOTAS) { + (void) hfs_getinoquota(cp); + } +#endif + exit: cat_releasedesc(&out_desc); @@ -4330,8 +5063,8 @@ hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, } - -/* hfs_vgetrsrc acquires a resource fork vnode corresponding to the cnode that is +/* + * hfs_vgetrsrc acquires a resource fork vnode corresponding to the cnode that is * found in 'vp'. 
The rsrc fork vnode is returned with the cnode locked and iocount * on the rsrc vnode. * @@ -4351,10 +5084,9 @@ hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, * there's really no reason to double-check for errors on the cnode. */ -__private_extern__ int -hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, - struct vnode **rvpp, int can_drop_lock, int error_on_unlinked) +hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, struct vnode **rvpp, + int can_drop_lock, int error_on_unlinked) { struct vnode *rvp; struct vnode *dvp = NULLVP; @@ -4363,18 +5095,21 @@ hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, int vid; int delete_status = 0; - + if (vnode_vtype(vp) == VDIR) { + return EINVAL; + } + /* - * Need to check the status of the cnode to validate it hasn't - * gone open-unlinked on us before we can actually do work with it. + * Need to check the status of the cnode to validate it hasn't gone + * open-unlinked on us before we can actually do work with it. */ - delete_status = hfs_checkdeleted (cp); + delete_status = hfs_checkdeleted(cp); if ((delete_status) && (error_on_unlinked)) { return delete_status; } restart: - /* Attempt to use exising vnode */ + /* Attempt to use existing vnode */ if ((rvp = cp->c_rsrc_vp)) { vid = vnode_vid(rvp); @@ -4410,11 +5145,10 @@ hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, if ((delete_status = hfs_checkdeleted(cp))) { /* * If error == 0, this means that we succeeded in acquiring an iocount on the - * rsrc fork vnode. However, if we're in this block of code, that - * means that we noticed that the cnode has gone open-unlinked. In - * this case, the caller requested that we not do any other work and - * return an errno. The caller will be responsible for dropping the - * iocount we just acquired because we can't do it until we've released + * rsrc fork vnode. However, if we're in this block of code, that means that we noticed + * that the cnode has gone open-unlinked. 
In this case, the caller requested that we + * not do any other work and return an errno. The caller will be responsible for + * dropping the iocount we just acquired because we can't do it until we've released * the cnode lock. */ if (error == 0) { @@ -4447,7 +5181,8 @@ hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, struct cat_desc to_desc; char delname[32]; int lockflags; - + int newvnode_flags = 0; + /* * Make sure cnode lock is exclusive, if not upgrade it. * @@ -4478,7 +5213,7 @@ hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, */ if ((error_on_unlinked) && (can_drop_lock)) { - if ((error = hfs_checkdeleted (cp))) { + if ((error = hfs_checkdeleted(cp))) { return error; } } @@ -4530,7 +5265,7 @@ hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, dvp = vnode_getparent(vp); error = hfs_getnewvnode(hfsmp, dvp, cn.cn_pnbuf ? &cn : NULL, descptr, GNV_WANTRSRC | GNV_SKIPLOCK, &cp->c_attr, - &rsrcfork, &rvp); + &rsrcfork, &rvp, &newvnode_flags); if (dvp) vnode_put(dvp); if (cn.cn_pnbuf) @@ -4546,7 +5281,7 @@ hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, /* * Wrapper for special device reads */ -static int +int hfsspec_read(ap) struct vnop_read_args /* { struct vnode *a_vp; @@ -4565,7 +5300,7 @@ hfsspec_read(ap) /* * Wrapper for special device writes */ -static int +int hfsspec_write(ap) struct vnop_write_args /* { struct vnode *a_vp; @@ -4587,7 +5322,7 @@ hfsspec_write(ap) * * Update the times on the cnode then do device close. */ -static int +int hfsspec_close(ap) struct vnop_close_args /* { struct vnode *a_vp; @@ -4680,7 +5415,7 @@ hfsfifo_close(ap) /* * Synchronize a file's in-core state with that on disk. 
*/ -static int +int hfs_vnop_fsync(ap) struct vnop_fsync_args /* { struct vnode *a_vp; @@ -4691,6 +5426,21 @@ hfs_vnop_fsync(ap) struct vnode* vp = ap->a_vp; int error; + /* Note: We check hfs flags instead of vfs mount flag because during + * read-write update, hfs marks itself read-write much earlier than + * the vfs, and hence won't result in skipping of certain writes like + * zero'ing out of unused nodes, creation of hotfiles btree, etc. + */ + if (VTOHFS(vp)->hfs_flags & HFS_READ_ONLY) { + return 0; + } + +#if CONFIG_PROTECT + if ((error = cp_handle_vnop(VTOC(vp), CP_WRITE_ACCESS)) != 0) { + return (error); + } +#endif /* CONFIG_PROTECT */ + /* * We need to allow ENOENT lock errors since unlink * systenm call can call VNOP_FSYNC during vclean. @@ -4706,7 +5456,7 @@ hfs_vnop_fsync(ap) } -static int +int hfs_vnop_whiteout(ap) struct vnop_whiteout_args /* { struct vnode *a_dvp; @@ -4858,7 +5608,7 @@ struct vnodeopv_entry_desc hfs_vnodeop_entries[] = { { &vnop_select_desc, (VOPFUNC)hfs_vnop_select }, /* select */ { &vnop_revoke_desc, (VOPFUNC)nop_revoke }, /* revoke */ { &vnop_exchange_desc, (VOPFUNC)hfs_vnop_exchange }, /* exchange */ - { &vnop_mmap_desc, (VOPFUNC)err_mmap }, /* mmap */ + { &vnop_mmap_desc, (VOPFUNC)hfs_vnop_mmap }, /* mmap */ { &vnop_fsync_desc, (VOPFUNC)hfs_vnop_fsync }, /* fsync */ { &vnop_remove_desc, (VOPFUNC)hfs_vnop_remove }, /* remove */ { &vnop_link_desc, (VOPFUNC)hfs_vnop_link }, /* link */ diff --git a/bsd/hfs/hfs_xattr.c b/bsd/hfs/hfs_xattr.c index 6eec7028b..8091dfaa2 100644 --- a/bsd/hfs/hfs_xattr.c +++ b/bsd/hfs/hfs_xattr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2008 Apple Inc. All rights reserved. + * Copyright (c) 2004-2009 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -66,7 +67,6 @@ struct listattr_callback_state { #endif /* HFS_COMPRESSION */ }; -#define HFS_MAXATTRIBUTESIZE (128 * 1024) #define HFS_MAXATTRBLKS (32 * 1024) @@ -80,6 +80,8 @@ struct listattr_callback_state { static u_int32_t emptyfinfo[8] = {0}; +static int hfs_zero_dateadded (struct cnode *cp, u_int8_t *finderinfo); + const char hfs_attrdatafilename[] = "Attribute Data"; static int listattr_callback(const HFSPlusAttrKey *key, const HFSPlusAttrData *data, @@ -216,7 +218,7 @@ hfs_vnop_removenamedstream(struct vnop_removenamedstream_args* ap) scp = VTOC(svp); /* Take truncate lock before taking cnode lock. */ - hfs_lock_truncate(scp, TRUE); + hfs_lock_truncate(scp, HFS_EXCLUSIVE_LOCK); if ((error = hfs_lock(scp, HFS_EXCLUSIVE_LOCK))) { goto out; } @@ -225,16 +227,38 @@ hfs_vnop_removenamedstream(struct vnop_removenamedstream_args* ap) } hfs_unlock(scp); out: - hfs_unlock_truncate(scp, TRUE); + hfs_unlock_truncate(scp, 0); return (error); } #endif +/* Zero out the date added field for the specified cnode */ +static int hfs_zero_dateadded (struct cnode *cp, u_int8_t *finderinfo) { + u_int8_t *finfo = finderinfo; + + /* Advance finfo by 16 bytes to the 2nd half of the finderinfo */ + finfo = finfo + 16; + + if (S_ISREG(cp->c_attr.ca_mode)) { + struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)finfo; + extinfo->date_added = 0; + } + else if (S_ISDIR(cp->c_attr.ca_mode)) { + struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)finfo; + extinfo->date_added = 0; + } + else { + /* Return an error */ + return -1; + } + return 0; + +} + /* * Retrieve the data of an extended attribute. 
*/ -__private_extern__ int hfs_vnop_getxattr(struct vnop_getxattr_args *ap) /* @@ -253,13 +277,7 @@ hfs_vnop_getxattr(struct vnop_getxattr_args *ap) struct cnode *cp; struct hfsmount *hfsmp; uio_t uio = ap->a_uio; - struct BTreeIterator * iterator = NULL; - struct filefork *btfile; - FSBufferDescriptor btdata; - HFSPlusAttrRecord * recp = NULL; size_t bufsize; - u_int16_t datasize; - int lockflags; int result; cp = VTOC(vp); @@ -281,6 +299,9 @@ hfs_vnop_getxattr(struct vnop_getxattr_args *ap) /* Make a copy since we may not export all of it. */ bcopy(cp->c_finderinfo, finderinfo, sizeof(finderinfo)); hfs_unlock(cp); + + /* Zero out the date added field in the local copy */ + hfs_zero_dateadded (cp, finderinfo); /* Don't expose a symlink's private type/creator. */ if (vnode_islnk(vp)) { @@ -347,17 +368,17 @@ hfs_vnop_getxattr(struct vnop_getxattr_args *ap) (result == 0) && (uio_resid(uio) == uio_size)) { /* - we intentionally make the above call to VNOP_READ so that - it can return an authorization/permission/etc. error - based on ap->a_context and thus deny this operation; - in that case, result != 0 and we won't proceed - - however, if result == 0, it will have returned no data - because hfs_vnop_read hid the resource fork - (hence uio_resid(uio) == uio_size, i.e. the uio is untouched) - - in that case, we try again with the decmpfs_ctx context - to get the actual data + * We intentionally make the above call to VNOP_READ so that + * it can return an authorization/permission/etc. Error + * based on ap->a_context and thus deny this operation; + * in that case, result != 0 and we won't proceed. + * + * However, if result == 0, it will have returned no data + * because hfs_vnop_read hid the resource fork + * (hence uio_resid(uio) == uio_size, i.e. 
the uio is untouched) + * + * In that case, we try again with the decmpfs_ctx context + * to get the actual data */ result = VNOP_READ(rvp, uio, 0, decmpfs_ctx); } @@ -387,24 +408,78 @@ hfs_vnop_getxattr(struct vnop_getxattr_args *ap) if ((result = hfs_lock(cp, HFS_SHARED_LOCK))) { return (result); } - /* Bail if we don't have any extended attributes. */ + + /* Check for non-rsrc, non-finderinfo EAs */ + result = hfs_getxattr_internal (cp, ap, VTOHFS(cp->c_vp), 0); + + hfs_unlock(cp); + + return MacToVFSError(result); +} + + + +/* + * hfs_getxattr_internal + * + * We break out this internal function which searches the attributes B-Tree and the + * overflow extents file to find non-resource, non-finderinfo EAs. There may be cases + * where we need to get EAs in contexts where we are already holding the cnode lock, + * and to re-enter hfs_vnop_getxattr would cause us to double-lock the cnode. Instead, + * we can just directly call this function. + * + * We pass the hfsmp argument directly here because we may not necessarily have a cnode to + * operate on. Under normal conditions, we have a file or directory to query, but if we + * are operating on the root directory (id 1), then we may not have a cnode. In this case, if the + * 'cp' argument is NULL, then we need to use the 'fileid' argument as the entry to manipulate + * + * NOTE: This function assumes the cnode lock for 'cp' is held exclusive or shared. + */ + + +int hfs_getxattr_internal (struct cnode *cp, struct vnop_getxattr_args *ap, + struct hfsmount *hfsmp, u_int32_t fileid) { + + struct filefork *btfile; + struct BTreeIterator * iterator = NULL; + size_t bufsize = 0; + HFSPlusAttrRecord *recp = NULL; + FSBufferDescriptor btdata; + int lockflags = 0; + int result = 0; + u_int16_t datasize = 0; + uio_t uio = ap->a_uio; + u_int32_t target_id = 0; + + if (cp) { + target_id = cp->c_fileid; + } + else { + target_id = fileid; + } + + + /* Bail if we don't have an EA B-Tree. 
*/ if ((hfsmp->hfs_attribute_vp == NULL) || - (cp->c_attr.ca_recflags & kHFSHasAttributesMask) == 0) { - result = ENOATTR; + ((cp) && (cp->c_attr.ca_recflags & kHFSHasAttributesMask) == 0)) { + result = ENOATTR; goto exit; } + + /* Initialize the B-Tree iterator for searching for the proper EA */ btfile = VTOF(hfsmp->hfs_attribute_vp); - + MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); if (iterator == NULL) { result = ENOMEM; goto exit; } bzero(iterator, sizeof(*iterator)); - + bufsize = sizeof(HFSPlusAttrData) - 2; - if (uio) + if (uio) { bufsize += uio_resid(uio); + } bufsize = MAX(bufsize, sizeof(HFSPlusAttrRecord)); MALLOC(recp, HFSPlusAttrRecord *, bufsize, M_TEMP, M_WAITOK); if (recp == NULL) { @@ -414,132 +489,146 @@ hfs_vnop_getxattr(struct vnop_getxattr_args *ap) btdata.bufferAddress = recp; btdata.itemSize = bufsize; btdata.itemCount = 1; + + result = hfs_buildattrkey(target_id, ap->a_name, (HFSPlusAttrKey *)&iterator->key); + if (result) { + goto exit; + } - result = hfs_buildattrkey(VTOC(vp)->c_fileid, ap->a_name, (HFSPlusAttrKey *)&iterator->key); - if (result) - goto exit; - - /* Lookup the attribute. */ + /* Lookup the attribute in the Attribute B-Tree */ lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK); result = BTSearchRecord(btfile, iterator, &btdata, &datasize, NULL); hfs_systemfile_unlock(hfsmp, lockflags); - + if (result) { - if (result == btNotFound) + if (result == btNotFound) { result = ENOATTR; + } goto exit; } - + + /* + * Operate differently if we have inline EAs that can fit in the attribute B-Tree or if + * we have extent based EAs. + */ switch (recp->recordType) { - case kHFSPlusAttrInlineData: - /* - * Sanity check record size. It's not required to have any - * user data, so the minimum size is 2 bytes less that the - * size of HFSPlusAttrData (since HFSPlusAttrData struct - * has 2 bytes set aside for attribute data). 
- */ - if (datasize < (sizeof(HFSPlusAttrData) - 2)) { - printf("hfs_getxattr: %d,%s invalid record size %d (expecting %lu)\n", - VTOC(vp)->c_fileid, ap->a_name, datasize, sizeof(HFSPlusAttrData)); - result = ENOATTR; - break; - } - *ap->a_size = recp->attrData.attrSize; - if (uio && recp->attrData.attrSize != 0) { - if (*ap->a_size > (user_size_t)uio_resid(uio)) - result = ERANGE; - else - result = uiomove((caddr_t) &recp->attrData.attrData , recp->attrData.attrSize, uio); - } - break; - - case kHFSPlusAttrForkData: - if (datasize < sizeof(HFSPlusAttrForkData)) { - printf("hfs_getxattr: %d,%s invalid record size %d (expecting %lu)\n", - VTOC(vp)->c_fileid, ap->a_name, datasize, sizeof(HFSPlusAttrForkData)); - result = ENOATTR; - break; - } - *ap->a_size = recp->forkData.theFork.logicalSize; - if (uio == NULL) { - break; - } - if (*ap->a_size > (user_size_t)uio_resid(uio)) { - result = ERANGE; + /* Attribute fits in the Attribute B-Tree */ + case kHFSPlusAttrInlineData: + /* + * Sanity check record size. It's not required to have any + * user data, so the minimum size is 2 bytes less that the + * size of HFSPlusAttrData (since HFSPlusAttrData struct + * has 2 bytes set aside for attribute data). + */ + if (datasize < (sizeof(HFSPlusAttrData) - 2)) { + printf("hfs_getxattr: %d,%s invalid record size %d (expecting %lu)\n", + target_id, ap->a_name, datasize, sizeof(HFSPlusAttrData)); + result = ENOATTR; + break; + } + *ap->a_size = recp->attrData.attrSize; + if (uio && recp->attrData.attrSize != 0) { + if (*ap->a_size > (user_size_t)uio_resid(uio)) { + result = ERANGE; + } + else { + result = uiomove((caddr_t) &recp->attrData.attrData , recp->attrData.attrSize, uio); + } + } break; - } - /* Process overflow extents if necessary. 
*/ - if (has_overflow_extents(&recp->forkData.theFork)) { - HFSPlusExtentDescriptor *extentbuf; - HFSPlusExtentDescriptor *extentptr; - size_t extentbufsize; - u_int32_t totalblocks; - u_int32_t blkcnt; - u_int32_t attrlen; - - totalblocks = recp->forkData.theFork.totalBlocks; - /* Ignore bogus block counts. */ - if (totalblocks > HFS_MAXATTRBLKS) { - result = ERANGE; + /* Extent-Based EAs */ + case kHFSPlusAttrForkData: { + if (datasize < sizeof(HFSPlusAttrForkData)) { + printf("hfs_getxattr: %d,%s invalid record size %d (expecting %lu)\n", + target_id, ap->a_name, datasize, sizeof(HFSPlusAttrForkData)); + result = ENOATTR; break; } - attrlen = recp->forkData.theFork.logicalSize; - - /* Get a buffer to hold the worst case amount of extents. */ - extentbufsize = totalblocks * sizeof(HFSPlusExtentDescriptor); - extentbufsize = roundup(extentbufsize, sizeof(HFSPlusExtentRecord)); - MALLOC(extentbuf, HFSPlusExtentDescriptor *, extentbufsize, M_TEMP, M_WAITOK); - if (extentbuf == NULL) { - result = ENOMEM; + *ap->a_size = recp->forkData.theFork.logicalSize; + if (uio == NULL) { break; } - bzero(extentbuf, extentbufsize); - extentptr = extentbuf; - - /* Grab the first 8 extents. */ - bcopy(&recp->forkData.theFork.extents[0], extentptr, sizeof(HFSPlusExtentRecord)); - extentptr += kHFSPlusExtentDensity; - blkcnt = count_extent_blocks(totalblocks, recp->forkData.theFork.extents); - - /* Now lookup the overflow extents. 
*/ - lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK); - while (blkcnt < totalblocks) { - ((HFSPlusAttrKey *)&iterator->key)->startBlock = blkcnt; - result = BTSearchRecord(btfile, iterator, &btdata, &datasize, NULL); - if (result || - (recp->recordType != kHFSPlusAttrExtents) || - (datasize < sizeof(HFSPlusAttrExtents))) { - printf("hfs_getxattr: %s missing extents, only %d blks of %d found\n", - ap->a_name, blkcnt, totalblocks); - result = ENOATTR; - break; /* break from while */ + if (*ap->a_size > (user_size_t)uio_resid(uio)) { + result = ERANGE; + break; + } + /* Process overflow extents if necessary. */ + if (has_overflow_extents(&recp->forkData.theFork)) { + HFSPlusExtentDescriptor *extentbuf; + HFSPlusExtentDescriptor *extentptr; + size_t extentbufsize; + u_int32_t totalblocks; + u_int32_t blkcnt; + u_int32_t attrlen; + + totalblocks = recp->forkData.theFork.totalBlocks; + /* Ignore bogus block counts. */ + if (totalblocks > HFS_MAXATTRBLKS) { + result = ERANGE; + break; + } + attrlen = recp->forkData.theFork.logicalSize; + + /* Get a buffer to hold the worst case amount of extents. */ + extentbufsize = totalblocks * sizeof(HFSPlusExtentDescriptor); + extentbufsize = roundup(extentbufsize, sizeof(HFSPlusExtentRecord)); + MALLOC(extentbuf, HFSPlusExtentDescriptor *, extentbufsize, M_TEMP, M_WAITOK); + if (extentbuf == NULL) { + result = ENOMEM; + break; } - /* Grab the next 8 extents. */ - bcopy(&recp->overflowExtents.extents[0], extentptr, sizeof(HFSPlusExtentRecord)); + bzero(extentbuf, extentbufsize); + extentptr = extentbuf; + + /* Grab the first 8 extents. 
*/ + bcopy(&recp->forkData.theFork.extents[0], extentptr, sizeof(HFSPlusExtentRecord)); extentptr += kHFSPlusExtentDensity; - blkcnt += count_extent_blocks(totalblocks, recp->overflowExtents.extents); - } - hfs_systemfile_unlock(hfsmp, lockflags); - - if (blkcnt < totalblocks) { - result = ENOATTR; - } else { - result = read_attr_data(hfsmp, uio, attrlen, extentbuf); + blkcnt = count_extent_blocks(totalblocks, recp->forkData.theFork.extents); + + /* Now lookup the overflow extents. */ + lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK); + while (blkcnt < totalblocks) { + ((HFSPlusAttrKey *)&iterator->key)->startBlock = blkcnt; + result = BTSearchRecord(btfile, iterator, &btdata, &datasize, NULL); + if (result || + (recp->recordType != kHFSPlusAttrExtents) || + (datasize < sizeof(HFSPlusAttrExtents))) { + printf("hfs_getxattr: %s missing extents, only %d blks of %d found\n", + ap->a_name, blkcnt, totalblocks); + result = ENOATTR; + break; /* break from while */ + } + /* Grab the next 8 extents. */ + bcopy(&recp->overflowExtents.extents[0], extentptr, sizeof(HFSPlusExtentRecord)); + extentptr += kHFSPlusExtentDensity; + blkcnt += count_extent_blocks(totalblocks, recp->overflowExtents.extents); + } + + /* Release Attr B-Tree lock */ + hfs_systemfile_unlock(hfsmp, lockflags); + + if (blkcnt < totalblocks) { + result = ENOATTR; + } + else { + result = read_attr_data(hfsmp, uio, attrlen, extentbuf); + } + FREE(extentbuf, M_TEMP); + + } + else /* No overflow extents. */ { + result = read_attr_data(hfsmp, uio, recp->forkData.theFork.logicalSize, recp->forkData.theFork.extents); } - FREE(extentbuf, M_TEMP); - - } else /* No overflow extents. */ { - result = read_attr_data(hfsmp, uio, recp->forkData.theFork.logicalSize, recp->forkData.theFork.extents); + break; } - break; - - default: - result = ENOATTR; - break; + + default: + /* We only support Extent or inline EAs. 
Default to ENOATTR for anything else */ + result = ENOATTR; + break; } -exit: - hfs_unlock(cp); - + +exit: if (iterator) { FREE(iterator, M_TEMP); } @@ -547,13 +636,14 @@ hfs_vnop_getxattr(struct vnop_getxattr_args *ap) FREE(recp, M_TEMP); } - return MacToVFSError(result); + return result; + } + /* * Set the data of an extended attribute. */ -__private_extern__ int hfs_vnop_setxattr(struct vnop_setxattr_args *ap) /* @@ -571,19 +661,10 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) struct cnode *cp = NULL; struct hfsmount *hfsmp; uio_t uio = ap->a_uio; - struct BTreeIterator * iterator = NULL; - struct filefork *btfile = NULL; size_t attrsize; - FSBufferDescriptor btdata; - HFSPlusAttrRecord *recp = NULL; - HFSPlusExtentDescriptor *extentptr = NULL; - HFSPlusAttrRecord attrdata; /* 90 bytes */ void * user_data_ptr = NULL; - int started_transaction = 0; - int lockflags = 0; - int exists; - int allocatedblks = 0; int result; + time_t orig_ctime=VTOC(vp)->c_ctime; if (ap->a_name == NULL || ap->a_name[0] == '\0') { return (EINVAL); /* invalid name */ @@ -599,6 +680,8 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) if (result != 0) return result; } + + check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_METADATA_WRITE_OP, NULL); #endif /* HFS_COMPRESSION */ /* Set the Finder Info. */ @@ -606,7 +689,9 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) u_int8_t finderinfo[32]; struct FndrFileInfo *fip; void * finderinfo_start; + u_int8_t *finfo = NULL; u_int16_t fdFlags; + u_int32_t dateadded = 0; attrsize = sizeof(VTOC(vp)->c_finderinfo); @@ -641,6 +726,12 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) } } + /* Grab the current date added from the cnode */ + dateadded = hfs_get_dateadded (cp); + + /* Zero out the date added field to ignore user's attempts to set it */ + hfs_zero_dateadded(cp, finderinfo); + if (bcmp(finderinfo_start, emptyfinfo, attrsize)) { /* attr exists and "create" was specified. 
*/ if (ap->a_options & XATTR_CREATE) { @@ -654,12 +745,33 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) return (ENOATTR); } } + + /* + * Now restore the date added to the finderinfo to be written out. + * Advance to the 2nd half of the finderinfo to write out the date added + * into the buffer. + * + * Make sure to endian swap the date added back into big endian. When we used + * hfs_get_dateadded above to retrieve it, it swapped into local endianness + * for us. But now that we're writing it out, put it back into big endian. + */ + finfo = &finderinfo[16]; + + if (S_ISREG(cp->c_attr.ca_mode)) { + struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)finfo; + extinfo->date_added = OSSwapHostToBigInt32(dateadded); + } + else if (S_ISDIR(cp->c_attr.ca_mode)) { + struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)finfo; + extinfo->date_added = OSSwapHostToBigInt32(dateadded); + } + /* Set the cnode's Finder Info. */ if (attrsize == sizeof(cp->c_finderinfo)) bcopy(&finderinfo[0], finderinfo_start, attrsize); else bcopy(&finderinfo[8], finderinfo_start, attrsize); - + /* Updating finderInfo updates change time and modified time */ cp->c_touch_chgtime = TRUE; cp->c_flag |= C_MODIFIED; @@ -724,32 +836,29 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) if (result) { return (result); } - /* - * VNOP_WRITE marks the vnode as needing a modtime update. 
- */ + /* VNOP_WRITE marks cnode as needing a modtime update */ result = VNOP_WRITE(rvp, uio, 0, ap->a_context); - /* if open unlinked, force it inactive and recycle */ + /* if open unlinked, force it inactive */ if (openunlinked) { int vref; vref = vnode_ref (rvp); if (vref == 0) { vnode_rele(rvp); } - vnode_recycle (rvp); + vnode_recycle (rvp); } else { - /* re-lock the cnode so we can update the modtimes */ - if ((result = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) { - vnode_recycle(rvp); + /* cnode is not open-unlinked, so re-lock cnode to sync */ + if ((result = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) { + vnode_recycle (rvp); vnode_put(rvp); - return (result); + return result; } - - /* HFS fsync the resource fork to force it out to disk */ - result = hfs_fsync (rvp, MNT_NOWAIT, 0, vfs_context_proc(ap->a_context)); - - hfs_unlock(cp); + + /* hfs fsync rsrc fork to force to disk and update modtime */ + result = hfs_fsync (rvp, MNT_NOWAIT, 0, vfs_context_proc (ap->a_context)); + hfs_unlock (cp); } vnode_put(rvp); @@ -764,8 +873,9 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) attrsize = uio_resid(uio); /* Enforce an upper limit. */ - if (attrsize > HFS_MAXATTRIBUTESIZE) { - return (E2BIG); + if (attrsize > HFS_XATTR_MAXSIZE) { + result = E2BIG; + goto exit; } /* @@ -791,23 +901,82 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) goto exit; } cp = VTOC(vp); + + /* + * If we're trying to set a non-finderinfo, non-resourcefork EA, then + * call the breakout function. + */ + result = hfs_setxattr_internal (cp, user_data_ptr, attrsize, ap, VTOHFS(vp), 0); + exit: + if (cp) { + hfs_unlock(cp); + } + if (user_data_ptr) { + FREE(user_data_ptr, M_TEMP); + } + + return (result == btNotFound ? ENOATTR : MacToVFSError(result)); +} + + +/* + * hfs_setxattr_internal + * + * Internal function to set non-rsrc, non-finderinfo EAs to either the attribute B-Tree or + * extent-based EAs. + * + * See comments from hfs_getxattr_internal on why we need to pass 'hfsmp' and fileid here. 
+ * The gist is that we could end up writing to the root folder which may not have a cnode. + * + * Assumptions: + * 1. cnode 'cp' is locked EXCLUSIVE before calling this function. + * 2. data_ptr contains data to be written. If gathering data from userland, this must be + * done before calling this function. + * 3. If data originates entirely in-kernel, use a null UIO, and ensure the size is less than + * hfsmp->hfs_max_inline_attrsize bytes long. + */ +int hfs_setxattr_internal (struct cnode *cp, caddr_t data_ptr, size_t attrsize, + struct vnop_setxattr_args *ap, struct hfsmount *hfsmp, + u_int32_t fileid) { + uio_t uio = ap->a_uio; + struct vnode *vp = ap->a_vp; + int started_transaction = 0; + struct BTreeIterator * iterator = NULL; + struct filefork *btfile = NULL; + FSBufferDescriptor btdata; + HFSPlusAttrRecord attrdata; /* 90 bytes */ + HFSPlusAttrRecord *recp = NULL; + HFSPlusExtentDescriptor *extentptr = NULL; + int result = 0; + int lockflags = 0; + int exists = 0; + int allocatedblks = 0; + u_int32_t target_id; + + if (cp) { + target_id = cp->c_fileid; + } + else { + target_id = fileid; + } + /* Start a transaction for our changes. */ if (hfs_start_transaction(hfsmp) != 0) { result = EINVAL; goto exit; } started_transaction = 1; - + /* * Once we started the transaction, nobody can compete * with us, so make sure this file is still there. */ - if (cp->c_flag & C_NOEXISTS) { + if ((cp) && (cp->c_flag & C_NOEXISTS)) { result = ENOENT; goto exit; } - + /* * If there isn't an attributes b-tree then create one. */ @@ -821,10 +990,10 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) if (hfsmp->hfs_max_inline_attrsize == 0) { hfsmp->hfs_max_inline_attrsize = getmaxinlineattrsize(hfsmp->hfs_attribute_vp); } - + /* Take exclusive access to the attributes b-tree. */ lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); - + /* Build the b-tree key. 
*/ MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); if (iterator == NULL) { @@ -832,18 +1001,18 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) goto exit; } bzero(iterator, sizeof(*iterator)); - result = hfs_buildattrkey(VTOC(vp)->c_fileid, ap->a_name, (HFSPlusAttrKey *)&iterator->key); + result = hfs_buildattrkey(target_id, ap->a_name, (HFSPlusAttrKey *)&iterator->key); if (result) { goto exit; } - + /* Preflight for replace/create semantics. */ btfile = VTOF(hfsmp->hfs_attribute_vp); btdata.bufferAddress = &attrdata; btdata.itemSize = sizeof(attrdata); btdata.itemCount = 1; exists = BTSearchRecord(btfile, iterator, &btdata, NULL, NULL) == 0; - + /* Replace requires that the attribute already exists. */ if ((ap->a_options & XATTR_REPLACE) && !exists) { result = ENOATTR; @@ -854,6 +1023,7 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) result = EEXIST; goto exit; } + /* If it won't fit inline then use extent-based attributes. */ if (attrsize > hfsmp->hfs_max_inline_attrsize) { size_t extentbufsize; @@ -861,13 +1031,17 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) int extentblks; u_int32_t *keystartblk; int i; - - /* Check if volume supports extent-based attributes */ - if ((hfsmp->hfs_flags & HFS_XATTR_EXTENTS) == 0) { - result = E2BIG; + + if (uio == NULL) { + /* + * setxattrs originating from in-kernel are not supported if they are bigger + * than the inline max size. Just return ENOATTR and force them to do it with a + * smaller EA. + */ + result = EPERM; goto exit; - } - + } + /* Get some blocks. */ blkcnt = howmany(attrsize, hfsmp->blockSize); extentbufsize = blkcnt * sizeof(HFSPlusExtentDescriptor); @@ -886,11 +1060,13 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) /* Copy data into the blocks. */ result = write_attr_data(hfsmp, uio, attrsize, extentptr); if (result) { - const char *name = vnode_getname(vp); - printf("hfs_setxattr: write_attr_data err (%d) %s:%s\n", - result, name ? 
name : "", ap->a_name); - if (name) - vnode_putname(name); + if (vp) { + const char *name = vnode_getname(vp); + printf("hfs_setxattr: write_attr_data err (%d) %s:%s\n", + result, name ? name : "", ap->a_name); + if (name) + vnode_putname(name); + } goto exit; } @@ -898,15 +1074,16 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) if (exists) { result = remove_attribute_records(hfsmp, iterator); if (result) { - const char *name = vnode_getname(vp); - printf("hfs_setxattr: remove_attribute_records err (%d) %s:%s\n", - result, name ? name : "", ap->a_name); - if (name) - vnode_putname(name); - goto exit; + if (vp) { + const char *name = vnode_getname(vp); + printf("hfs_setxattr: remove_attribute_records err (%d) %s:%s\n", + result, name ? name : "", ap->a_name); + if (name) + vnode_putname(name); + } + goto exit; } } - /* Create attribute fork data record. */ MALLOC(recp, HFSPlusAttrRecord *, sizeof(HFSPlusAttrRecord), M_TEMP, M_WAITOK); if (recp == NULL) { @@ -916,32 +1093,27 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) btdata.bufferAddress = recp; btdata.itemCount = 1; btdata.itemSize = sizeof(HFSPlusAttrForkData); - + recp->recordType = kHFSPlusAttrForkData; recp->forkData.reserved = 0; recp->forkData.theFork.logicalSize = attrsize; recp->forkData.theFork.clumpSize = 0; recp->forkData.theFork.totalBlocks = blkcnt; bcopy(extentptr, recp->forkData.theFork.extents, sizeof(HFSPlusExtentRecord)); - - (void) hfs_buildattrkey(VTOC(vp)->c_fileid, ap->a_name, (HFSPlusAttrKey *)&iterator->key); - + + (void) hfs_buildattrkey(target_id, ap->a_name, (HFSPlusAttrKey *)&iterator->key); + result = BTInsertRecord(btfile, iterator, &btdata, btdata.itemSize); if (result) { -#if HFS_XATTR_VERBOSE - const char *name = vnode_getname(vp); - printf("hfs_setxattr: BTInsertRecord err (%d) %s:%s\n", - MacToVFSError(result), name ? 
name : "", ap->a_name); - if (name) - vnode_putname(name); -#endif + printf ("hfs_setxattr: BTInsertRecord() - %d,%s err=%d\n", + target_id, ap->a_name, result); goto exit; } extentblks = count_extent_blocks(blkcnt, recp->forkData.theFork.extents); blkcnt -= extentblks; keystartblk = &((HFSPlusAttrKey *)&iterator->key)->startBlock; i = 0; - + /* Create overflow extents as needed. */ while (blkcnt > 0) { /* Initialize the key and record. */ @@ -949,31 +1121,29 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) btdata.itemSize = sizeof(HFSPlusAttrExtents); recp->recordType = kHFSPlusAttrExtents; recp->overflowExtents.reserved = 0; - + /* Copy the next set of extents. */ i += kHFSPlusExtentDensity; bcopy(&extentptr[i], recp->overflowExtents.extents, sizeof(HFSPlusExtentRecord)); - + result = BTInsertRecord(btfile, iterator, &btdata, btdata.itemSize); if (result) { - const char *name = vnode_getname(vp); - printf("hfs_setxattr: BTInsertRecord err (%d) %s:%s\n", - MacToVFSError(result), name ? name : "", ap->a_name); - if (name) - vnode_putname(name); + printf ("hfs_setxattr: BTInsertRecord() overflow - %d,%s err=%d\n", + target_id, ap->a_name, result); goto exit; } extentblks = count_extent_blocks(blkcnt, recp->overflowExtents.extents); blkcnt -= extentblks; } - } else /* Inline data */ { + } + else { /* Inline data */ if (exists) { result = remove_attribute_records(hfsmp, iterator); if (result) { goto exit; } } - + /* Calculate size of record rounded up to multiple of 2 bytes. */ btdata.itemSize = sizeof(HFSPlusAttrData) - 2 + attrsize + ((attrsize & 1) ? 1 : 0); MALLOC(recp, HFSPlusAttrRecord *, btdata.itemSize, M_TEMP, M_WAITOK); @@ -985,24 +1155,36 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) recp->attrData.reserved[0] = 0; recp->attrData.reserved[1] = 0; recp->attrData.attrSize = attrsize; - + /* Copy in the attribute data (if any). 
*/ if (attrsize > 0) { - if (user_data_ptr) - bcopy(user_data_ptr, &recp->attrData.attrData, attrsize); - else + if (data_ptr) { + bcopy(data_ptr, &recp->attrData.attrData, attrsize); + } + else { + /* + * A null UIO meant it originated in-kernel. If they didn't supply data_ptr + * then deny the copy operation. + */ + if (uio == NULL) { + result = EPERM; + goto exit; + } result = uiomove((caddr_t)&recp->attrData.attrData, attrsize, uio); + } + if (result) { goto exit; } } - - (void) hfs_buildattrkey(VTOC(vp)->c_fileid, ap->a_name, (HFSPlusAttrKey *)&iterator->key); - + + (void) hfs_buildattrkey(target_id, ap->a_name, (HFSPlusAttrKey *)&iterator->key); + btdata.bufferAddress = recp; btdata.itemCount = 1; result = BTInsertRecord(btfile, iterator, &btdata, btdata.itemSize); } + exit: if (btfile && started_transaction) { (void) BTFlushPath(btfile); @@ -1011,16 +1193,18 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) hfs_systemfile_unlock(hfsmp, lockflags); } if (result == 0) { - cp = VTOC(vp); - /* Setting an attribute only updates change time and not - * modified time of the file. - */ - cp->c_touch_chgtime = TRUE; - cp->c_attr.ca_recflags |= kHFSHasAttributesMask; - if ((bcmp(ap->a_name, KAUTH_FILESEC_XATTR, sizeof(KAUTH_FILESEC_XATTR)) == 0)) { - cp->c_attr.ca_recflags |= kHFSHasSecurityMask; + if (vp) { + cp = VTOC(vp); + /* Setting an attribute only updates change time and not + * modified time of the file. 
+ */ + cp->c_touch_chgtime = TRUE; + cp->c_attr.ca_recflags |= kHFSHasAttributesMask; + if ((bcmp(ap->a_name, KAUTH_FILESEC_XATTR, sizeof(KAUTH_FILESEC_XATTR)) == 0)) { + cp->c_attr.ca_recflags |= kHFSHasSecurityMask; + } + (void) hfs_update(vp, 0); } - (void) hfs_update(vp, 0); } if (started_transaction) { if (result && allocatedblks) { @@ -1028,12 +1212,7 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) } hfs_end_transaction(hfsmp); } - if (cp) { - hfs_unlock(cp); - } - if (user_data_ptr) { - FREE(user_data_ptr, M_TEMP); - } + if (recp) { FREE(recp, M_TEMP); } @@ -1043,13 +1222,16 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) if (iterator) { FREE(iterator, M_TEMP); } - return (result == btNotFound ? ENOATTR : MacToVFSError(result)); + + return result; } + + + /* * Remove an extended attribute. */ -__private_extern__ int hfs_vnop_removexattr(struct vnop_removexattr_args *ap) /* @@ -1068,6 +1250,7 @@ hfs_vnop_removexattr(struct vnop_removexattr_args *ap) struct BTreeIterator * iterator = NULL; int lockflags; int result; + time_t orig_ctime=VTOC(vp)->c_ctime; if (ap->a_name == NULL || ap->a_name[0] == '\0') { return (EINVAL); /* invalid name */ @@ -1078,8 +1261,11 @@ hfs_vnop_removexattr(struct vnop_removexattr_args *ap) } #if HFS_COMPRESSION - if (hfs_hides_xattr(ap->a_context, VTOC(vp), ap->a_name, 1) && !(ap->a_options & XATTR_SHOWCOMPRESSION)) + if (hfs_hides_xattr(ap->a_context, VTOC(vp), ap->a_name, 1) && !(ap->a_options & XATTR_SHOWCOMPRESSION)) { return ENOATTR; + } + + check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_METADATA_DELETE_OP, NULL); #endif /* HFS_COMPRESSION */ /* If Resource Fork is non-empty then truncate it. 
*/ @@ -1102,9 +1288,9 @@ hfs_vnop_removexattr(struct vnop_removexattr_args *ap) return (result); } - hfs_lock_truncate(VTOC(rvp), TRUE); + hfs_lock_truncate(VTOC(rvp), HFS_EXCLUSIVE_LOCK); if ((result = hfs_lock(VTOC(rvp), HFS_EXCLUSIVE_LOCK))) { - hfs_unlock_truncate(cp, TRUE); + hfs_unlock_truncate(cp, 0); vnode_put(rvp); return (result); } @@ -1113,7 +1299,7 @@ hfs_vnop_removexattr(struct vnop_removexattr_args *ap) * hfs_truncate() and hfs_update() */ if ((result = hfs_start_transaction(hfsmp))) { - hfs_unlock_truncate(cp, TRUE); + hfs_unlock_truncate(cp, 0); hfs_unlock(cp); vnode_put(rvp); return (result); @@ -1127,7 +1313,7 @@ hfs_vnop_removexattr(struct vnop_removexattr_args *ap) } hfs_end_transaction(hfsmp); - hfs_unlock_truncate(VTOC(rvp), TRUE); + hfs_unlock_truncate(VTOC(rvp), 0); hfs_unlock(VTOC(rvp)); vnode_put(rvp); @@ -1137,34 +1323,80 @@ hfs_vnop_removexattr(struct vnop_removexattr_args *ap) if (bcmp(ap->a_name, XATTR_FINDERINFO_NAME, sizeof(XATTR_FINDERINFO_NAME)) == 0) { void * finderinfo_start; int finderinfo_size; - + u_int8_t finderinfo[32]; + u_int32_t date_added; + u_int8_t *finfo = NULL; + if ((result = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) { return (result); } - - /* Symlink's don't have an external type/creator. */ + + /* Use the local copy to store our temporary changes. */ + bcopy(cp->c_finderinfo, finderinfo, sizeof(finderinfo)); + + + /* Zero out the date added field in the local copy */ + hfs_zero_dateadded (cp, finderinfo); + + /* Don't expose a symlink's private type/creator. */ if (vnode_islnk(vp)) { - /* Skip over type/creator fields. */ + struct FndrFileInfo *fip; + + fip = (struct FndrFileInfo *)&finderinfo; + fip->fdType = 0; + fip->fdCreator = 0; + } + + /* Do the byte compare against the local copy */ + if (bcmp(finderinfo, emptyfinfo, sizeof(emptyfinfo)) == 0) { + hfs_unlock (cp); + return (ENOATTR); + } + + /* + * If there was other content, zero out everything except + * type/creator and date added. First, save the date added. 
+ */ + finfo = cp->c_finderinfo; + finfo = finfo + 16; + if (S_ISREG(cp->c_attr.ca_mode)) { + struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)finfo; + date_added = extinfo->date_added; + } + else if (S_ISDIR(cp->c_attr.ca_mode)) { + struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)finfo; + date_added = extinfo->date_added; + } + + if (vnode_islnk(vp)) { + /* Ignore type/creator */ finderinfo_start = &cp->c_finderinfo[8]; finderinfo_size = sizeof(cp->c_finderinfo) - 8; - } else { + } + else { finderinfo_start = &cp->c_finderinfo[0]; finderinfo_size = sizeof(cp->c_finderinfo); } - if (bcmp(finderinfo_start, emptyfinfo, finderinfo_size) == 0) { - hfs_unlock(cp); - return (ENOATTR); - } - bzero(finderinfo_start, finderinfo_size); - + + + /* Now restore the date added */ + if (S_ISREG(cp->c_attr.ca_mode)) { + struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)finfo; + extinfo->date_added = date_added; + } + else if (S_ISDIR(cp->c_attr.ca_mode)) { + struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)finfo; + extinfo->date_added = date_added; + } + /* Updating finderInfo updates change time and modified time */ cp->c_touch_chgtime = TRUE; cp->c_flag |= C_MODIFIED; hfs_update(vp, FALSE); - + hfs_unlock(cp); - + return (0); } /* @@ -1305,7 +1537,7 @@ file_attribute_exist(struct hfsmount *hfsmp, uint32_t fileID) * - The Allocation Bitmap file must be locked exclusive. * - The iterator key must be initialized. 
*/ -static int +int remove_attribute_records(struct hfsmount *hfsmp, BTreeIterator * iterator) { struct filefork *btfile; @@ -1334,11 +1566,9 @@ remove_attribute_records(struct hfsmount *hfsmp, BTreeIterator * iterator) int extentblks; u_int32_t *keystartblk; -#if HFS_XATTR_VERBOSE if (datasize < sizeof(HFSPlusAttrForkData)) { - printf("hfs: remove_attribute_records: bad record size %d (expecting %d)\n", datasize, sizeof(HFSPlusAttrForkData)); + printf("hfs: remove_attribute_records: bad record size %d (expecting %lu)\n", datasize, sizeof(HFSPlusAttrForkData)); } -#endif totalblks = attrdata.forkData.theFork.totalBlocks; /* Process the first 8 extents. */ @@ -1385,7 +1615,6 @@ remove_attribute_records(struct hfsmount *hfsmp, BTreeIterator * iterator) /* * Retrieve the list of extended attribute names. */ -__private_extern__ int hfs_vnop_listxattr(struct vnop_listxattr_args *ap) /* @@ -1405,12 +1634,11 @@ hfs_vnop_listxattr(struct vnop_listxattr_args *ap) struct BTreeIterator * iterator = NULL; struct filefork *btfile; struct listattr_callback_state state; - void * finderinfo_start; - int finderinfo_size; user_addr_t user_start = 0; user_size_t user_len = 0; int lockflags; int result; + u_int8_t finderinfo[32]; if (VNODE_IS_RSRC(vp)) { return (EPERM); @@ -1427,17 +1655,26 @@ hfs_vnop_listxattr(struct vnop_listxattr_args *ap) return (result); } + /* + * Make a copy of the cnode's finderinfo to a local so we can + * zero out the date added field. Also zero out the private type/creator + * for symlinks. + */ + bcopy(cp->c_finderinfo, finderinfo, sizeof(finderinfo)); + hfs_zero_dateadded (cp, finderinfo); + /* Don't expose a symlink's private type/creator. */ if (vnode_islnk(vp)) { - /* Skip over type/creator fields. 
*/ - finderinfo_start = &cp->c_finderinfo[8]; - finderinfo_size = sizeof(cp->c_finderinfo) - 8; - } else { - finderinfo_start = &cp->c_finderinfo[0]; - finderinfo_size = sizeof(cp->c_finderinfo); - } + struct FndrFileInfo *fip; + + fip = (struct FndrFileInfo *)&finderinfo; + fip->fdType = 0; + fip->fdCreator = 0; + } + + /* If Finder Info is non-empty then export it's name. */ - if (bcmp(finderinfo_start, emptyfinfo, finderinfo_size) != 0) { + if (bcmp(finderinfo, emptyfinfo, sizeof(emptyfinfo)) != 0) { if (uio == NULL) { *ap->a_size += sizeof(XATTR_FINDERINFO_NAME); } else if ((user_size_t)uio_resid(uio) < sizeof(XATTR_FINDERINFO_NAME)) { @@ -1546,11 +1783,9 @@ hfs_vnop_listxattr(struct vnop_listxattr_args *ap) if (user_start) { vsunlock(user_start, user_len, TRUE); } - if (iterator) { FREE(iterator, M_TEMP); } - hfs_unlock(cp); return MacToVFSError(result); @@ -1558,7 +1793,7 @@ hfs_vnop_listxattr(struct vnop_listxattr_args *ap) /* - * Callback - called for each attribute + * Callback - called for each attribute record */ static int listattr_callback(const HFSPlusAttrKey *key, __unused const HFSPlusAttrData *data, struct listattr_callback_state *state) @@ -1621,7 +1856,6 @@ listattr_callback(const HFSPlusAttrKey *key, __unused const HFSPlusAttrData *dat * This function takes the necessary locks on the attribute * b-tree file and the allocation (bitmap) file. */ -__private_extern__ int hfs_removeallattr(struct hfsmount *hfsmp, u_int32_t fileid) { @@ -1699,10 +1933,8 @@ hfs_xattr_init(struct hfsmount * hfsmp) /* * Enable/Disable volume attributes stored as EA for root file system. * Supported attributes are - - * 1. ACLs - * 2. Extent-based Extended Attributes + * 1. 
Extent-based Extended Attributes */ -__private_extern__ int hfs_set_volxattr(struct hfsmount *hfsmp, unsigned int xattrtype, int state) { @@ -1714,6 +1946,9 @@ hfs_set_volxattr(struct hfsmount *hfsmp, unsigned int xattrtype, int state) if (hfsmp->hfs_flags & HFS_STANDARD) { return (ENOTSUP); } + if (xattrtype != HFS_SET_XATTREXTENTS_STATE) { + return EINVAL; + } /* * If there isn't an attributes b-tree then create one. @@ -1736,18 +1971,8 @@ hfs_set_volxattr(struct hfsmount *hfsmp, unsigned int xattrtype, int state) * Build a b-tree key. * We use the root's parent id (1) to hold this volume attribute. */ - if (xattrtype == HFS_SETACLSTATE) { - /* ACL */ - (void) hfs_buildattrkey(kHFSRootParentID, XATTR_EXTENDEDSECURITY_NAME, - (HFSPlusAttrKey *)&iterator->key); - } else if (xattrtype == HFS_SET_XATTREXTENTS_STATE) { - /* Extent-based extended attributes */ - (void) hfs_buildattrkey(kHFSRootParentID, XATTR_XATTREXTENTS_NAME, - (HFSPlusAttrKey *)&iterator->key); - } else { - result = EINVAL; - goto exit; - } + (void) hfs_buildattrkey(kHFSRootParentID, XATTR_XATTREXTENTS_NAME, + (HFSPlusAttrKey *)&iterator->key); /* Start a transaction for our changes. */ if (hfs_start_transaction(hfsmp) != 0) { @@ -1790,91 +2015,21 @@ hfs_set_volxattr(struct hfsmount *hfsmp, unsigned int xattrtype, int state) /* Finish the transaction of our changes. */ hfs_end_transaction(hfsmp); -exit: - if (iterator) { - FREE(iterator, M_TEMP); - } - if (result == 0) { - if (xattrtype == HFS_SETACLSTATE) { - if (state == 0) { - vfs_clearextendedsecurity(HFSTOVFS(hfsmp)); - } else { - vfs_setextendedsecurity(HFSTOVFS(hfsmp)); - } - } else { - /* HFS_SET_XATTREXTENTS_STATE */ - HFS_MOUNT_LOCK(hfsmp, TRUE); - if (state == 0) { - hfsmp->hfs_flags &= ~HFS_XATTR_EXTENTS; - } else { - hfsmp->hfs_flags |= HFS_XATTR_EXTENTS; - } - HFS_MOUNT_UNLOCK(hfsmp, TRUE); - } - } - - return MacToVFSError(result); -} - - /* - * Check for volume attributes stored as EA for root file system. 
- * Supported attributes are - - * 1. ACLs - * 2. Extent-based Extended Attributes - */ -__private_extern__ -void -hfs_check_volxattr(struct hfsmount *hfsmp, unsigned int xattrtype) -{ - struct BTreeIterator * iterator; - struct filefork *btfile; - int lockflags; - int result; - - if (hfsmp->hfs_flags & HFS_STANDARD || - hfsmp->hfs_attribute_vp == NULL) { - return; - } - - MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - if (iterator == NULL) { - return; - } - bzero(iterator, sizeof(*iterator)); - - /* - * Build a b-tree key. - * We use the root's parent id (1) to hold this volume attribute. - */ - if (xattrtype == HFS_SETACLSTATE) { - /* ACLs */ - (void) hfs_buildattrkey(kHFSRootParentID, XATTR_EXTENDEDSECURITY_NAME, - (HFSPlusAttrKey *)&iterator->key); + /* Update the state in the mount point */ + HFS_MOUNT_LOCK(hfsmp, TRUE); + if (state == 0) { + hfsmp->hfs_flags &= ~HFS_XATTR_EXTENTS; } else { - /* Extent-based extended attributes */ - (void) hfs_buildattrkey(kHFSRootParentID, XATTR_XATTREXTENTS_NAME, - (HFSPlusAttrKey *)&iterator->key); + hfsmp->hfs_flags |= HFS_XATTR_EXTENTS; } - btfile = VTOF(hfsmp->hfs_attribute_vp); - - lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); - - /* Check for our attribute. */ - result = BTSearchRecord(btfile, iterator, NULL, NULL, NULL); + HFS_MOUNT_UNLOCK(hfsmp, TRUE); - hfs_systemfile_unlock(hfsmp, lockflags); - FREE(iterator, M_TEMP); - - if (result == 0) { - if (xattrtype == HFS_SETACLSTATE) { - vfs_setextendedsecurity(HFSTOVFS(hfsmp)); - } else { - HFS_MOUNT_LOCK(hfsmp, TRUE); - hfsmp->hfs_flags |= HFS_XATTR_EXTENTS; - HFS_MOUNT_UNLOCK(hfsmp, TRUE); - } +exit: + if (iterator) { + FREE(iterator, M_TEMP); } + return MacToVFSError(result); } @@ -2036,76 +2191,70 @@ getmaxinlineattrsize(struct vnode * attrvp) } /* - * Get a referenced vnode for attribute data I/O. + * Initialize vnode for attribute data I/O. 
+ * + * On success, + * - returns zero + * - the attrdata vnode is initialized as hfsmp->hfs_attrdata_vp + * - an iocount is taken on the attrdata vnode which exists + * for the entire duration of the mount. It is only dropped + * during unmount + * - the attrdata cnode is not locked + * + * On failure, + * - returns non-zero value + * - the caller does not have to worry about any locks or references */ -static int -get_attr_data_vnode(struct hfsmount *hfsmp, vnode_t *vpp) +int init_attrdata_vnode(struct hfsmount *hfsmp) { vnode_t vp; int result = 0; + struct cat_desc cat_desc; + struct cat_attr cat_attr; + struct cat_fork cat_fork; + int newvnode_flags = 0; + + bzero(&cat_desc, sizeof(cat_desc)); + cat_desc.cd_parentcnid = kHFSRootParentID; + cat_desc.cd_nameptr = (const u_int8_t *)hfs_attrdatafilename; + cat_desc.cd_namelen = strlen(hfs_attrdatafilename); + cat_desc.cd_cnid = kHFSAttributeDataFileID; + /* Tag vnode as system file, note that we can still use cluster I/O */ + cat_desc.cd_flags |= CD_ISMETA; + + bzero(&cat_attr, sizeof(cat_attr)); + cat_attr.ca_linkcount = 1; + cat_attr.ca_mode = S_IFREG; + cat_attr.ca_fileid = cat_desc.cd_cnid; + cat_attr.ca_blocks = hfsmp->totalBlocks; - vp = hfsmp->hfs_attrdata_vp; - if (vp == NULLVP) { - struct cat_desc cat_desc; - struct cat_attr cat_attr; - struct cat_fork cat_fork; - - /* We don't tag it as a system file since we intend to use cluster I/O. */ - bzero(&cat_desc, sizeof(cat_desc)); - cat_desc.cd_parentcnid = kHFSRootParentID; - cat_desc.cd_nameptr = (const u_int8_t *)hfs_attrdatafilename; - cat_desc.cd_namelen = strlen(hfs_attrdatafilename); - cat_desc.cd_cnid = kHFSAttributeDataFileID; - - bzero(&cat_attr, sizeof(cat_attr)); - cat_attr.ca_linkcount = 1; - cat_attr.ca_mode = S_IFREG; - cat_attr.ca_fileid = cat_desc.cd_cnid; - cat_attr.ca_blocks = hfsmp->totalBlocks; - - /* - * The attribute data file is a virtual file that spans the - * entire file system space. 
- * - * Each extent-based attribute occupies a unique portion of - * in this virtual file. The cluster I/O is done using actual - * allocation block offsets so no additional mapping is needed - * for the VNOP_BLOCKMAP call. - * - * This approach allows the attribute data to be cached without - * incurring the high cost of using a separate vnode per attribute. - * - * Since we need to acquire the attribute b-tree file lock anyways, - * the virtual file doesn't introduce any additional serialization. - */ - bzero(&cat_fork, sizeof(cat_fork)); - cat_fork.cf_size = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize; - cat_fork.cf_blocks = hfsmp->totalBlocks; - cat_fork.cf_extents[0].startBlock = 0; - cat_fork.cf_extents[0].blockCount = cat_fork.cf_blocks; - - result = hfs_getnewvnode(hfsmp, NULL, NULL, &cat_desc, 0, &cat_attr, &cat_fork, &vp); - if (result == 0) { - HFS_MOUNT_LOCK(hfsmp, 1); - /* Check if someone raced us for creating this vnode. */ - if (hfsmp->hfs_attrdata_vp != NULLVP) { - HFS_MOUNT_UNLOCK(hfsmp, 1); - vnode_put(vp); - vnode_recycle(vp); - vp = hfsmp->hfs_attrdata_vp; - } else { - hfsmp->hfs_attrdata_vp = vp; - HFS_MOUNT_UNLOCK(hfsmp, 1); - /* Keep a reference on this vnode until unmount */ - vnode_ref_ext(vp, O_EVTONLY); - hfs_unlock(VTOC(vp)); - } - } - } else { - if ((result = vnode_get(vp))) - vp = NULLVP; + /* + * The attribute data file is a virtual file that spans the + * entire file system space. + * + * Each extent-based attribute occupies a unique portion of + * in this virtual file. The cluster I/O is done using actual + * allocation block offsets so no additional mapping is needed + * for the VNOP_BLOCKMAP call. + * + * This approach allows the attribute data to be cached without + * incurring the high cost of using a separate vnode per attribute. + * + * Since we need to acquire the attribute b-tree file lock anyways, + * the virtual file doesn't introduce any additional serialization. 
+ */ + bzero(&cat_fork, sizeof(cat_fork)); + cat_fork.cf_size = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize; + cat_fork.cf_blocks = hfsmp->totalBlocks; + cat_fork.cf_extents[0].startBlock = 0; + cat_fork.cf_extents[0].blockCount = cat_fork.cf_blocks; + + result = hfs_getnewvnode(hfsmp, NULL, NULL, &cat_desc, 0, &cat_attr, + &cat_fork, &vp, &newvnode_flags); + if (result == 0) { + hfsmp->hfs_attrdata_vp = vp; + hfs_unlock(VTOC(vp)); } - *vpp = vp; return (result); } @@ -2115,7 +2264,7 @@ get_attr_data_vnode(struct hfsmount *hfsmp, vnode_t *vpp) static int read_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExtentDescriptor *extents) { - vnode_t evp = NULLVP; + vnode_t evp = hfsmp->hfs_attrdata_vp; int bufsize; int iosize; int attrsize; @@ -2123,10 +2272,7 @@ read_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExtent int i; int result = 0; - if ((result = get_attr_data_vnode(hfsmp, &evp))) { - return (result); - } - hfs_lock_truncate(VTOC(evp), 0); + hfs_lock_truncate(VTOC(evp), HFS_SHARED_LOCK); bufsize = (int)uio_resid(uio); attrsize = (int)datasize; @@ -2158,7 +2304,6 @@ read_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExtent uio_setoffset(uio, datasize); hfs_unlock_truncate(VTOC(evp), 0); - vnode_put(evp); return (result); } @@ -2168,7 +2313,7 @@ read_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExtent static int write_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExtentDescriptor *extents) { - vnode_t evp = NULLVP; + vnode_t evp = hfsmp->hfs_attrdata_vp; off_t filesize; int bufsize; int attrsize; @@ -2177,11 +2322,7 @@ write_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExten int i; int result = 0; - /* Get exclusive use of attribute data vnode. 
*/ - if ((result = get_attr_data_vnode(hfsmp, &evp))) { - return (result); - } - hfs_lock_truncate(VTOC(evp), 0); + hfs_lock_truncate(VTOC(evp), HFS_SHARED_LOCK); bufsize = uio_resid(uio); attrsize = (int) datasize; @@ -2213,7 +2354,6 @@ write_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExten uio_setoffset(uio, datasize); hfs_unlock_truncate(VTOC(evp), 0); - vnode_put(evp); return (result); } @@ -2264,7 +2404,7 @@ alloc_attr_blks(struct hfsmount *hfsmp, size_t attrsize, size_t extentbufsize, H #if HFS_XATTR_VERBOSE printf("hfs: alloc_attr_blks: unexpected failure, %d blocks unallocated\n", blkcnt); #endif - for (; i <= 0; i--) { + for (; i >= 0; i--) { if ((blkcnt = extents[i].blockCount) != 0) { (void) BlockDeallocate(hfsmp, extents[i].startBlock, blkcnt, 0); extents[i].startBlock = 0; @@ -2283,14 +2423,11 @@ alloc_attr_blks(struct hfsmount *hfsmp, size_t attrsize, size_t extentbufsize, H static void free_attr_blks(struct hfsmount *hfsmp, int blkcnt, HFSPlusExtentDescriptor *extents) { - vnode_t evp = NULLVP; + vnode_t evp = hfsmp->hfs_attrdata_vp; int remblks = blkcnt; int lockflags; int i; - if (get_attr_data_vnode(hfsmp, &evp) != 0) { - evp = NULLVP; - } lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK); for (i = 0; (remblks > 0) && (extents[i].blockCount != 0); i++) { @@ -2325,9 +2462,6 @@ free_attr_blks(struct hfsmount *hfsmp, int blkcnt, HFSPlusExtentDescriptor *exte } hfs_systemfile_unlock(hfsmp, lockflags); - if (evp) { - vnode_put(evp); - } } static int diff --git a/bsd/hfs/hfscommon/BTree/BTree.c b/bsd/hfs/hfscommon/BTree/BTree.c index 73a521c8f..be52f5900 100644 --- a/bsd/hfs/hfscommon/BTree/BTree.c +++ b/bsd/hfs/hfscommon/BTree/BTree.c @@ -1583,6 +1583,7 @@ BTUpdateRecord(FCB *filePtr, BTreeIterator *iterator, u_int16_t index; Boolean validHint; + ////////////////////////// Priliminary Checks /////////////////////////////// nodeRec.buffer = nil; // so we can call ReleaseNode @@ -1666,9 +1667,9 @@ 
BTUpdateRecord(FCB *filePtr, BTreeIterator *iterator, ////////////////////////////// Error Exit /////////////////////////////////// ErrorExit: - + (void) ReleaseNode (btreePtr, &nodeRec); - + iterator->hint.writeCount = 0; iterator->hint.nodeNum = 0; iterator->hint.index = 0; @@ -1996,7 +1997,6 @@ OSStatus BTSetLastSync (FCB *filePtr, return noErr; } - __private_extern__ OSStatus BTHasContiguousNodes (FCB *filePtr) { @@ -2021,7 +2021,6 @@ Routine: BTGetUserData Function: Read the user data area of the b-tree header node. -------------------------------------------------------------------------------*/ -__private_extern__ OSStatus BTGetUserData(FCB *filePtr, void * dataPtr, int dataSize) { @@ -2059,7 +2058,6 @@ Routine: BTSetUserData Function: Write the user data area of the b-tree header node. -------------------------------------------------------------------------------*/ -__private_extern__ OSStatus BTSetUserData(FCB *filePtr, void * dataPtr, int dataSize) { diff --git a/bsd/hfs/hfscommon/BTree/BTreeAllocate.c b/bsd/hfs/hfscommon/BTree/BTreeAllocate.c index 99d586408..fe2f91714 100644 --- a/bsd/hfs/hfscommon/BTree/BTreeAllocate.c +++ b/bsd/hfs/hfscommon/BTree/BTreeAllocate.c @@ -621,7 +621,6 @@ Routine: BTZeroUnusedNodes Function: Write zeros to all nodes in the B-tree that are not currently in use. -------------------------------------------------------------------------------*/ -__private_extern__ int BTZeroUnusedNodes(FCB *filePtr) { @@ -695,31 +694,39 @@ BTZeroUnusedNodes(FCB *filePtr) err = EIO; goto ErrorExit; } - + if (buf_flags(bp) & B_LOCKED) { /* - * This node is already part of a transaction and will be - * written when the transaction is committed so don't write it here. - * If we did, then we'd hit a panic in hfs_vnop_bwrite since - * B_LOCKED is still set + * This node is already part of a transaction and will be written when + * the transaction is committed, so don't write it here. 
If we did, then + * we'd hit a panic in hfs_vnop_bwrite because the B_LOCKED bit is still set. */ buf_brelse(bp); continue; } - buf_clear(bp); buf_markaged(bp); /* * Try not to hog the buffer cache. Wait for the write - * every 32 nodes. + * every 32 nodes. If VNOP_BWRITE reports an error, bail out and bubble + * it up to the function calling us. If we tried to update a read-only + * mount on read-only media, for example, catching the error will let + * us alert the callers of this function that they should maintain + * the mount in read-only mode. + */ ++numWritten; - if (numWritten % 32 == 0) - VNOP_BWRITE(bp); - else + if (numWritten % 32 == 0) { + err = VNOP_BWRITE(bp); + if (err) { + goto ErrorExit; + } + } + else { buf_bawrite(bp); + } } } diff --git a/bsd/hfs/hfscommon/BTree/BTreeScanner.c b/bsd/hfs/hfscommon/BTree/BTreeScanner.c index 1ce08e385..ea549278d 100644 --- a/bsd/hfs/hfscommon/BTree/BTreeScanner.c +++ b/bsd/hfs/hfscommon/BTree/BTreeScanner.c @@ -272,7 +272,7 @@ static int ReadMultipleNodes( BTScanState *theScanStatePtr ) } // now read blocks from the device - myErr = (int)buf_bread(myDevPtr, + myErr = (int)buf_meta_bread(myDevPtr, myPhyBlockNum, myBufferSize, NOCRED, diff --git a/bsd/hfs/hfscommon/Catalog/FileIDsServices.c b/bsd/hfs/hfscommon/Catalog/FileIDsServices.c index dbbc33b58..fee50fe6d 100644 --- a/bsd/hfs/hfscommon/Catalog/FileIDsServices.c +++ b/bsd/hfs/hfscommon/Catalog/FileIDsServices.c @@ -42,12 +42,63 @@ typedef struct ExtentsRecBuffer ExtentsRecBuffer; static u_int32_t CheckExtents( void *extents, u_int32_t blocks, Boolean isHFSPlus ); -static OSErr DeleteExtents( ExtendedVCB *vcb, u_int32_t fileNumber, Boolean isHFSPlus ); -static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t destFileID, Boolean isHFSPlus ); +static OSErr DeleteExtents( ExtendedVCB *vcb, u_int32_t fileNumber, int quitEarly, u_int8_t forkType, Boolean isHFSPlus ); +static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t 
destFileID, int quitEarly, u_int8_t forkType, Boolean isHFSPlus ); static void CopyCatalogNodeInfo( CatalogRecord *src, CatalogRecord *dest ); static void CopyBigCatalogNodeInfo( CatalogRecord *src, CatalogRecord *dest ); static void CopyExtentInfo( ExtentKey *key, ExtentRecord *data, ExtentsRecBuffer *buffer, u_int16_t bufferCount ); +/* + * This function moves the overflow extents associated with srcID into the file associated with destID. + * We should have already verified that 'srcID' has overflow extents. So now we move all of the overflow + * extent records. A nonzero 'rsrc' passes fork type 0xff to MoveExtents/DeleteExtents; zero passes fork type 0. + */ +OSErr MoveData( ExtendedVCB *vcb, HFSCatalogNodeID srcID, HFSCatalogNodeID destID, int rsrc) { + + OSErr err; + + /* + * Only the source file should have extents, so we just track those. + * We operate on the fork represented by the open FD that was used to call into this + * function + */ + if (rsrc) { + /* Copy the extent overflow blocks. */ + err = MoveExtents( vcb, srcID, destID, 1, (u_int8_t)0xff, 1); + if ( err != noErr ) { + if ( err != dskFulErr ) { + return( err ); + } + /* + * In case of error, we would have probably run into problems + * growing the extents b-tree. Since the move is actually a copy + delete + * just delete the new entries. Same for below. + */ + err = DeleteExtents( vcb, destID, 1, (u_int8_t)0xff, 1); + ReturnIfError( err ); // we are doomed. Just QUIT! 
+ goto FlushAndReturn; + } + } + +FlushAndReturn: + /* Write out the catalog and extent overflow B-Tree changes. NOTE(review): err is reassigned by each flush call below, so a FlushCatalog failure is dropped and the dskFulErr rollback paths return only the FlushExtentFile result -- confirm this is intended. */ + err = FlushCatalog( vcb ); + err = FlushExtentFile( vcb ); + + return( err ); +} OSErr ExchangeFileIDs( ExtendedVCB *vcb, ConstUTF8Param srcName, ConstUTF8Param destName, HFSCatalogNodeID srcID, HFSCatalogNodeID destID, u_int32_t srcHint, u_int32_t destHint ) @@ -61,13 +112,13 @@ OSErr ExchangeFileIDs( ExtendedVCB *vcb, ConstUTF8Param srcName, ConstUTF8Param int16_t numDestExtentBlocks; OSErr err; Boolean isHFSPlus = ( vcb->vcbSigWord == kHFSPlusSigWord ); - + err = BuildCatalogKeyUTF8(vcb, srcID, srcName, kUndefinedStrLen, &srcKey, NULL); ReturnIfError(err); - + err = BuildCatalogKeyUTF8(vcb, destID, destName, kUndefinedStrLen, &destKey, NULL); ReturnIfError(err); - + if ( isHFSPlus ) { //-- Step 1: Check the catalog nodes for extents @@ -75,37 +126,37 @@ OSErr ExchangeFileIDs( ExtendedVCB *vcb, ConstUTF8Param srcName, ConstUTF8Param //-- locate the source file, test for extents in extent file, and copy the cat record for later err = LocateCatalogNodeByKey( vcb, srcHint, &srcKey, &srcData, &srcHint ); ReturnIfError( err ); - + if ( srcData.recordType != kHFSPlusFileRecord ) return( cmFThdDirErr ); // Error "cmFThdDirErr = it is a directory" - + //-- Check if there are any extents in the source file //�� I am only checling the extents in the low 32 bits, routine will fail if files extents after 2 gig are in overflow numSrcExtentBlocks = CheckExtents( srcData.hfsPlusFile.dataFork.extents, srcData.hfsPlusFile.dataFork.totalBlocks, isHFSPlus ); if ( numSrcExtentBlocks == 0 ) // then check the resource fork extents numSrcExtentBlocks = CheckExtents( srcData.hfsPlusFile.resourceFork.extents, srcData.hfsPlusFile.resourceFork.totalBlocks, isHFSPlus ); - + //-- Check if there 
are any extents in the destination file err = LocateCatalogNodeByKey( vcb, destHint, &destKey, &destData, &destHint ); ReturnIfError( err ); - + if ( destData.recordType != 
kHFSPlusFileRecord ) return( cmFThdDirErr ); // Error "cmFThdDirErr = it is a directory" - + numDestExtentBlocks = CheckExtents( destData.hfsPlusFile.dataFork.extents, destData.hfsPlusFile.dataFork.totalBlocks, isHFSPlus ); if ( numDestExtentBlocks == 0 ) // then check the resource fork extents numDestExtentBlocks = CheckExtents( destData.hfsPlusFile.resourceFork.extents, destData.hfsPlusFile.resourceFork.totalBlocks, isHFSPlus ); - + //-- Step 2: Exchange the Extent key in the extent file //-- Exchange the extents key in the extent file - err = DeleteExtents( vcb, kHFSBogusExtentFileID, isHFSPlus ); + err = DeleteExtents( vcb, kHFSBogusExtentFileID, 0, 0, isHFSPlus ); ReturnIfError( err ); if ( numSrcExtentBlocks && numDestExtentBlocks ) // if both files have extents { //-- Change the source extents file ids to our known bogus value - err = MoveExtents( vcb, srcData.hfsPlusFile.fileID, kHFSBogusExtentFileID, isHFSPlus ); + err = MoveExtents( vcb, srcData.hfsPlusFile.fileID, kHFSBogusExtentFileID, 0, 0, isHFSPlus ); if ( err != noErr ) { if ( err != dskFulErr ) @@ -115,67 +166,67 @@ OSErr ExchangeFileIDs( ExtendedVCB *vcb, ConstUTF8Param srcName, ConstUTF8Param } //-- Change the destination extents file id's to the source id's - err = MoveExtents( vcb, destData.hfsPlusFile.fileID, srcData.hfsPlusFile.fileID, isHFSPlus ); + err = MoveExtents( vcb, destData.hfsPlusFile.fileID, srcData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); if ( err != noErr ) { if ( err != dskFulErr ) return( err ); - -ExUndo2aPlus: err = DeleteExtents( vcb, srcData.hfsPlusFile.fileID, isHFSPlus ); + + ExUndo2aPlus: err = DeleteExtents( vcb, srcData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); ReturnIfError( err ); // we are doomed. Just QUIT! 
- - err = MoveExtents( vcb, kHFSBogusExtentFileID, srcData.hfsPlusFile.fileID, isHFSPlus ); // Move the extents back + + err = MoveExtents( vcb, kHFSBogusExtentFileID, srcData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); // Move the extents back ReturnIfError( err ); // we are doomed. Just QUIT! - + goto ExUndo1a; } //-- Change the bogus extents file id's to the dest id's - err = MoveExtents( vcb, kHFSBogusExtentFileID, destData.hfsPlusFile.fileID, isHFSPlus ); + err = MoveExtents( vcb, kHFSBogusExtentFileID, destData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); if ( err != noErr ) { if ( err != dskFulErr ) return( err ); - - err = DeleteExtents( vcb, destData.hfsPlusFile.fileID, isHFSPlus ); + + err = DeleteExtents( vcb, destData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); ReturnIfError( err ); // we are doomed. Just QUIT! - - err = MoveExtents( vcb, srcData.hfsPlusFile.fileID, destData.hfsPlusFile.fileID, isHFSPlus ); // Move the extents back + + err = MoveExtents( vcb, srcData.hfsPlusFile.fileID, destData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); // Move the extents back ReturnIfError( err ); // we are doomed. Just QUIT! - + goto ExUndo2aPlus; } } else if ( numSrcExtentBlocks ) // just the source file has extents { - err = MoveExtents( vcb, srcData.hfsPlusFile.fileID, destData.hfsPlusFile.fileID, isHFSPlus ); + err = MoveExtents( vcb, srcData.hfsPlusFile.fileID, destData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); if ( err != noErr ) { if ( err != dskFulErr ) return( err ); - - err = DeleteExtents( vcb, srcData.hfsPlusFile.fileID, isHFSPlus ); + + err = DeleteExtents( vcb, srcData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); ReturnIfError( err ); // we are doomed. Just QUIT! 
- + goto FlushAndReturn; } } else if ( numDestExtentBlocks ) // just the destination file has extents { - err = MoveExtents( vcb, destData.hfsPlusFile.fileID, srcData.hfsPlusFile.fileID, isHFSPlus ); + err = MoveExtents( vcb, destData.hfsPlusFile.fileID, srcData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); if ( err != noErr ) { if ( err != dskFulErr ) return( err ); - - err = DeleteExtents( vcb, destData.hfsPlusFile.fileID, isHFSPlus ); + + err = DeleteExtents( vcb, destData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); ReturnIfError( err ); // we are doomed. Just QUIT! - + goto FlushAndReturn; } } - + //-- Step 3: Change the data in the catalog nodes //-- find the source cnode and put dest info in it @@ -188,12 +239,12 @@ ExUndo2aPlus: err = DeleteExtents( vcb, srcData.hfsPlusFile.fileID, isHFSPlus ); err = ReplaceBTreeRecord( vcb->catalogRefNum, &srcKey, srcHint, &srcData, sizeof(HFSPlusCatalogFile), &srcHint ); ReturnIfError( err ); - + // find the destination cnode and put source info in it err = LocateCatalogNodeByKey( vcb, destHint, &destKey, &destData, &destHint ); if ( err != noErr ) return( cmBadNews ); - + CopyBigCatalogNodeInfo( &swapData, &destData ); err = ReplaceBTreeRecord( vcb->catalogRefNum, &destKey, destHint, &destData, sizeof(HFSPlusCatalogFile), &destHint ); ReturnIfError( err ); @@ -205,10 +256,10 @@ ExUndo2aPlus: err = DeleteExtents( vcb, srcData.hfsPlusFile.fileID, isHFSPlus ); //-- locate the source file, test for extents in extent file, and copy the cat record for later err = LocateCatalogNodeByKey( vcb, srcHint, &srcKey, &srcData, &srcHint ); ReturnIfError( err ); - + if ( srcData.recordType != kHFSFileRecord ) return( cmFThdDirErr ); // Error "cmFThdDirErr = it is a directory" - + //-- Check if there are any extents in the source file numSrcExtentBlocks = CheckExtents( srcData.hfsFile.dataExtents, srcData.hfsFile.dataPhysicalSize / vcb->blockSize, isHFSPlus ); if ( numSrcExtentBlocks == 0 ) // then check the resource fork extents @@ -217,106 +268,106 
@@ ExUndo2aPlus: err = DeleteExtents( vcb, srcData.hfsPlusFile.fileID, isHFSPlus ); //�� Do we save the found source node for later use? - + //-- Check if there are any extents in the destination file err = LocateCatalogNodeByKey( vcb, destHint, &destKey, &destData, &destHint ); ReturnIfError( err ); - + if ( destData.recordType != kHFSFileRecord ) return( cmFThdDirErr ); // Error "cmFThdDirErr = it is a directory" - + numDestExtentBlocks = CheckExtents( destData.hfsFile.dataExtents, destData.hfsFile.dataPhysicalSize / vcb->blockSize, isHFSPlus ); if ( numDestExtentBlocks == 0 ) // then check the resource fork extents numDestExtentBlocks = CheckExtents( destData.hfsFile.rsrcExtents, destData.hfsFile.rsrcPhysicalSize / vcb->blockSize, isHFSPlus ); - + //�� Do we save the found destination node for later use? - - + + //-- Step 2: Exchange the Extent key in the extent file //-- Exchange the extents key in the extent file - err = DeleteExtents( vcb, kHFSBogusExtentFileID, isHFSPlus ); + err = DeleteExtents( vcb, kHFSBogusExtentFileID, 0, 0, isHFSPlus ); ReturnIfError( err ); if ( numSrcExtentBlocks && numDestExtentBlocks ) // if both files have extents { //-- Change the source extents file ids to our known bogus value - err = MoveExtents( vcb, srcData.hfsFile.fileID, kHFSBogusExtentFileID, isHFSPlus ); + err = MoveExtents( vcb, srcData.hfsFile.fileID, kHFSBogusExtentFileID, 0, 0, isHFSPlus ); if ( err != noErr ) { if ( err != dskFulErr ) return( err ); - -ExUndo1a: err = DeleteExtents( vcb, kHFSBogusExtentFileID, isHFSPlus ); + + ExUndo1a: err = DeleteExtents( vcb, kHFSBogusExtentFileID, 0, 0, isHFSPlus ); ReturnIfError( err ); // we are doomed. Just QUIT! 
- + err = FlushCatalog( vcb ); // flush the catalog err = FlushExtentFile( vcb ); // flush the extent file (unneeded for common case, but it's cheap) return( dskFulErr ); } //-- Change the destination extents file id's to the source id's - err = MoveExtents( vcb, destData.hfsFile.fileID, srcData.hfsFile.fileID, isHFSPlus ); + err = MoveExtents( vcb, destData.hfsFile.fileID, srcData.hfsFile.fileID, 0, 0, isHFSPlus ); if ( err != noErr ) { if ( err != dskFulErr ) return( err ); - -ExUndo2a: err = DeleteExtents( vcb, srcData.hfsFile.fileID, isHFSPlus ); + + ExUndo2a: err = DeleteExtents( vcb, srcData.hfsFile.fileID, 0, 0, isHFSPlus ); ReturnIfError( err ); // we are doomed. Just QUIT! - - err = MoveExtents( vcb, kHFSBogusExtentFileID, srcData.hfsFile.fileID, isHFSPlus ); // Move the extents back + + err = MoveExtents( vcb, kHFSBogusExtentFileID, srcData.hfsFile.fileID, 0, 0, isHFSPlus ); // Move the extents back ReturnIfError( err ); // we are doomed. Just QUIT! - + goto ExUndo1a; } //-- Change the bogus extents file id's to the dest id's - err = MoveExtents( vcb, kHFSBogusExtentFileID, destData.hfsFile.fileID, isHFSPlus ); + err = MoveExtents( vcb, kHFSBogusExtentFileID, destData.hfsFile.fileID, 0, 0, isHFSPlus ); if ( err != noErr ) { if ( err != dskFulErr ) return( err ); - - err = DeleteExtents( vcb, destData.hfsFile.fileID, isHFSPlus ); + + err = DeleteExtents( vcb, destData.hfsFile.fileID, 0, 0, isHFSPlus ); ReturnIfError( err ); // we are doomed. Just QUIT! - - err = MoveExtents( vcb, srcData.hfsFile.fileID, destData.hfsFile.fileID, isHFSPlus ); // Move the extents back + + err = MoveExtents( vcb, srcData.hfsFile.fileID, destData.hfsFile.fileID, 0, 0, isHFSPlus ); // Move the extents back ReturnIfError( err ); // we are doomed. Just QUIT! 
- + goto ExUndo2a; } } else if ( numSrcExtentBlocks ) // just the source file has extents { - err = MoveExtents( vcb, srcData.hfsFile.fileID, destData.hfsFile.fileID, isHFSPlus ); + err = MoveExtents( vcb, srcData.hfsFile.fileID, destData.hfsFile.fileID, 0, 0, isHFSPlus ); if ( err != noErr ) { if ( err != dskFulErr ) return( err ); - - err = DeleteExtents( vcb, srcData.hfsFile.fileID, isHFSPlus ); + + err = DeleteExtents( vcb, srcData.hfsFile.fileID, 0, 0, isHFSPlus ); ReturnIfError( err ); // we are doomed. Just QUIT! - + goto FlushAndReturn; } } else if ( numDestExtentBlocks ) // just the destination file has extents { - err = MoveExtents( vcb, destData.hfsFile.fileID, srcData.hfsFile.fileID, isHFSPlus ); + err = MoveExtents( vcb, destData.hfsFile.fileID, srcData.hfsFile.fileID, 0, 0, isHFSPlus ); if ( err != noErr ) { if ( err != dskFulErr ) return( err ); - - err = DeleteExtents( vcb, destData.hfsFile.fileID, isHFSPlus ); + + err = DeleteExtents( vcb, destData.hfsFile.fileID, 0, 0, isHFSPlus ); ReturnIfError( err ); // we are doomed. Just QUIT! 
- + goto FlushAndReturn; } } - + //-- Step 3: Change the data in the catalog nodes //-- find the source cnode and put dest info in it @@ -330,23 +381,23 @@ ExUndo2a: err = DeleteExtents( vcb, srcData.hfsFile.fileID, isHFSPlus ); err = ReplaceBTreeRecord( vcb->catalogRefNum, &srcKey, srcHint, &srcData, sizeof(HFSCatalogFile), &srcHint ); ReturnIfError( err ); - + // find the destination cnode and put source info in it err = LocateCatalogNodeByKey( vcb, destHint, &destKey, &destData, &destHint ); if ( err != noErr ) return( cmBadNews ); - + CopyCatalogNodeInfo( &swapData, &destData ); err = ReplaceBTreeRecord( vcb->catalogRefNum, &destKey, destHint, &destData, sizeof(HFSCatalogFile), &destHint ); ReturnIfError( err ); } err = noErr; - + //-- Step 4: Error Handling section - - + + FlushAndReturn: err = FlushCatalog( vcb ); // flush the catalog err = FlushExtentFile( vcb ); // flush the extent file (unneeded for common case, but it's cheap) @@ -373,7 +424,7 @@ static void CopyBigCatalogNodeInfo( CatalogRecord *src, CatalogRecord *dest ) } -static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t destFileID, Boolean isHFSPlus ) +static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t destFileID, int quitEarly, u_int8_t forkType, Boolean isHFSPlus ) { FCB * fcb; ExtentsRecBuffer extentsBuffer[kNumExtentsToCache]; @@ -386,16 +437,16 @@ static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t dest int16_t i, j; OSErr err; - + fcb = GetFileControlBlock(vcb->extentsRefNum); (void) BTInvalidateHint(&btIterator); extentKeyPtr = (ExtentKey*) &btIterator.key; btRecord.bufferAddress = &extentData; btRecord.itemCount = 1; - + //-- Collect the extent records - + // // A search on the following key will cause the BTree to be positioned immediately // before the first extent record for file #srcFileID, but not actually positioned @@ -408,9 +459,9 @@ static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t dest 
if (isHFSPlus) { btRecord.itemSize = sizeof(HFSPlusExtentRecord); btKeySize = sizeof(HFSPlusExtentKey); - + extentKeyPtr->hfsPlus.keyLength = kHFSPlusExtentKeyMaximumLength; - extentKeyPtr->hfsPlus.forkType = 0; + extentKeyPtr->hfsPlus.forkType = forkType; extentKeyPtr->hfsPlus.pad = 0; extentKeyPtr->hfsPlus.fileID = srcFileID; extentKeyPtr->hfsPlus.startBlock = 0; @@ -418,7 +469,7 @@ static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t dest else { btRecord.itemSize = sizeof(HFSExtentRecord); btKeySize = sizeof(HFSExtentKey); - + extentKeyPtr->hfs.keyLength = kHFSExtentKeyMaximumLength; extentKeyPtr->hfs.forkType = 0; extentKeyPtr->hfs.fileID = srcFileID; @@ -440,7 +491,7 @@ static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t dest // of BTIterateRecord. We'd need to set up the key for BTSearchRecord to find the last record // we found, so that BTIterateRecord would get the next one (the first we haven't processed). // - + err = BTSearchRecord(fcb, &btIterator, &btRecord, &btRecordSize, &btIterator); // We expect a btNotFound here, since there shouldn't be an extent record with FABN = 0. @@ -454,16 +505,16 @@ static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t dest return err; } - + do { btRecord.bufferAddress = &extentData; btRecord.itemCount = 1; - + for ( i=0 ; ihfsPlus.fileID : extentKeyPtr->hfs.fileID; - if ( foundFileID == srcFileID ) - { + if ( foundFileID == srcFileID ) { + /* Check if we need to quit early. */ + if (quitEarly && isHFSPlus) { + if (extentKeyPtr->hfsPlus.forkType != forkType) { + break; + } + } CopyExtentInfo(extentKeyPtr, &extentData, extentsBuffer, i); } - else - { + else{ + /* The fileID's are of a different file. We're done here. 
*/ break; } } @@ -486,21 +542,20 @@ static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t dest btRecordSize = sizeof(HFSPlusExtentRecord); else btRecordSize = sizeof(HFSExtentRecord); - - for ( j=0 ; jextentsRefNum); - + (void) BTInvalidateHint(&btIterator); extentKeyPtr = (ExtentKey*) &btIterator.key; btRecord.bufferAddress = &extentData; btRecord.itemCount = 1; - + // The algorithm is to position the BTree just before any extent records for fileID. // Then just keep getting successive records. If the record is still for fileID, // then delete it. if (isHFSPlus) { btRecord.itemSize = sizeof(HFSPlusExtentRecord); - + extentKeyPtr->hfsPlus.keyLength = kHFSPlusExtentKeyMaximumLength; - extentKeyPtr->hfsPlus.forkType = 0; + extentKeyPtr->hfsPlus.forkType = forkType; extentKeyPtr->hfsPlus.pad = 0; extentKeyPtr->hfsPlus.fileID = fileID; extentKeyPtr->hfsPlus.startBlock = 0; } else { btRecord.itemSize = sizeof(HFSExtentRecord); - + extentKeyPtr->hfs.keyLength = kHFSExtentKeyMaximumLength; - extentKeyPtr->hfs.forkType = 0; + extentKeyPtr->hfs.forkType = forkType; extentKeyPtr->hfs.fileID = fileID; extentKeyPtr->hfs.startBlock = 0; } - + err = BTSearchRecord(fcb, &btIterator, &btRecord, &btRecordSize, &btIterator); if ( err != btNotFound ) { @@ -587,25 +644,32 @@ static OSErr DeleteExtents( ExtendedVCB *vcb, u_int32_t fileID, Boolean isHFSPl return err; // Got some unexpected error, so return it } - + do { BTreeIterator tmpIterator; HFSCatalogNodeID foundFileID; - + err = BTIterateRecord(fcb, kBTreeNextRecord, &btIterator, &btRecord, &btRecordSize); if ( err != noErr ) { if (err == btNotFound) // If we hit the end of the BTree err = noErr; // then it's OK - + break; // We're done now. } foundFileID = isHFSPlus ? 
extentKeyPtr->hfsPlus.fileID : extentKeyPtr->hfs.fileID; - if ( foundFileID != fileID ) + if ( foundFileID != fileID ) { break; // numbers don't match, we must be done - + } + if (quitEarly && isHFSPlus) { + /* If we're only deleting one type of fork, then quit early if it doesn't match */ + if (extentKeyPtr->hfsPlus.forkType != forkType) { + break; + } + } + tmpIterator = btIterator; err = BTDeleteRecord( fcb, &tmpIterator ); if (err != noErr) diff --git a/bsd/hfs/hfscommon/Misc/FileExtentMapping.c b/bsd/hfs/hfscommon/Misc/FileExtentMapping.c index 5d037026b..ec9881da8 100644 --- a/bsd/hfs/hfscommon/Misc/FileExtentMapping.c +++ b/bsd/hfs/hfscommon/Misc/FileExtentMapping.c @@ -180,7 +180,8 @@ static OSErr TruncateExtents( static OSErr UpdateExtentRecord ( ExtendedVCB *vcb, - FCB *fcb, + FCB *fcb, + int deleted, const HFSPlusExtentKey *extentFileKey, const HFSPlusExtentRecord extentData, u_int32_t extentBTreeHint); @@ -456,7 +457,6 @@ static OSErr DeleteExtentRecord( // //_________________________________________________________________________________ -__private_extern__ OSErr MapFileBlockC ( ExtendedVCB *vcb, // volume that file resides on FCB *fcb, // FCB of file @@ -682,7 +682,6 @@ static OSErr DeallocateFork( // Function: Flushes the extent file for a specified volume //������������������������������������������������������������������������������� -__private_extern__ OSErr FlushExtentFile( ExtendedVCB *vcb ) { FCB * fcb; @@ -842,7 +841,6 @@ int32_t CompareExtentKeysPlus( const HFSPlusExtentKey *searchKey, const HFSPlusE * Used by hfs_extendfs to extend the volume allocation bitmap file. 
* */ -__private_extern__ int AddFileExtent(ExtendedVCB *vcb, FCB *fcb, u_int32_t startBlock, u_int32_t blockCount) { @@ -896,7 +894,7 @@ AddFileExtent(ExtendedVCB *vcb, FCB *fcb, u_int32_t startBlock, u_int32_t blockC */ foundData[foundIndex].startBlock = startBlock; foundData[foundIndex].blockCount = blockCount; - error = UpdateExtentRecord(vcb, fcb, &foundKey, foundData, hint); + error = UpdateExtentRecord(vcb, fcb, 0, &foundKey, foundData, hint); } (void) FlushExtentFile(vcb); @@ -912,7 +910,6 @@ AddFileExtent(ExtendedVCB *vcb, FCB *fcb, u_int32_t startBlock, u_int32_t blockC // //_________________________________________________________________________________ -__private_extern__ OSErr ExtendFileC ( ExtendedVCB *vcb, // volume that file resides on FCB *fcb, // FCB of file to truncate @@ -1087,21 +1084,44 @@ OSErr ExtendFileC ( * should only be aggressive with re-using once-allocated pieces * if we're not dealing with system files. If we're trying to operate * on behalf of a system file, we need the maximum contiguous amount - * possible. + * possible. For non-system files we favor locality and fragmentation over + * contiguity as it can result in fewer blocks being needed from the underlying + * filesystem that the sparse image resides upon. */ err = noErr; if ( (vcb->hfs_flags & HFS_HAS_SPARSE_DEVICE) - && (fcb->ff_cp->c_fileid >= kHFSFirstUserCatalogNodeID) - && (flags & kEFMetadataMask) == 0) { - if (vcb->hfs_flags & HFS_DID_CONTIG_SCAN) { - wantContig = false; - } else { - // we only want to do this once to scan the bitmap to - // fill in the vcbFreeExt table of free blocks - vcb->hfs_flags |= HFS_DID_CONTIG_SCAN; - wantContig = true; + && (fcb->ff_cp->c_fileid >= kHFSFirstUserCatalogNodeID) + && (flags & kEFMetadataMask) == 0) { + /* + * We want locality over contiguity so by default we set wantContig to + * false unless we hit one of the circumstances below. 
+ */ + wantContig = false; + if (hfs_isrbtree_active(VCBTOHFS(vcb))) { + /* + * If the red-black tree is active, we can always find a suitable contiguous + * chunk. So if the user specifically requests contiguous files, we should + * honor that no matter what kind of device it is. + */ + if (forceContig) { + wantContig = true; + } } - } else { + else { + /* + * If the red-black tree is not active, then only set wantContig to true + * if we have never done a contig scan on the device, which would populate + * the free extent cache. Note that the caller may explicitly unset the + * DID_CONTIG_SCAN bit in order to force us to vend a contiguous extent here + * if the caller wants to get a contiguous chunk. + */ + if ((vcb->hfs_flags & HFS_DID_CONTIG_SCAN) == 0) { + vcb->hfs_flags |= HFS_DID_CONTIG_SCAN; + wantContig = true; + } + } + } + else { wantContig = true; } useMetaZone = flags & kEFMetadataMask; @@ -1163,7 +1183,7 @@ OSErr ExtendFileC ( if ((actualStartBlock == startBlock) && (blockHint == 0)) { // We grew the file's last extent, so just adjust the number of blocks. foundData[foundIndex].blockCount += actualNumBlocks; - err = UpdateExtentRecord(vcb, fcb, &foundKey, foundData, hint); + err = UpdateExtentRecord(vcb, fcb, 0, &foundKey, foundData, hint); if (err != noErr) break; } else { @@ -1217,7 +1237,7 @@ OSErr ExtendFileC ( // Add a new extent into this record and update. 
foundData[foundIndex].startBlock = actualStartBlock; foundData[foundIndex].blockCount = actualNumBlocks; - err = UpdateExtentRecord(vcb, fcb, &foundKey, foundData, hint); + err = UpdateExtentRecord(vcb, fcb, 0, &foundKey, foundData, hint); if (err != noErr) break; } } @@ -1289,12 +1309,15 @@ OSErr ExtendFileC ( // //_________________________________________________________________________________ -__private_extern__ OSErr TruncateFileC ( ExtendedVCB *vcb, // volume that file resides on FCB *fcb, // FCB of file to truncate int64_t peof, // new physical size for file + int deleted, // if nonzero, the file's catalog record has already been deleted. + int rsrc, // does this represent a resource fork or not? + uint32_t fileid, // the fileid of the file we're manipulating. Boolean truncateToExtent) // if true, truncate to end of extent containing newPEOF + { OSErr err; u_int32_t nextBlock; // next file allocation block to consider @@ -1314,16 +1337,20 @@ OSErr TruncateFileC ( recordDeleted = false; - if (vcb->vcbSigWord == kHFSPlusSigWord) + if (vcb->vcbSigWord == kHFSPlusSigWord) { numExtentsPerRecord = kHFSPlusExtentDensity; - else + } + else { numExtentsPerRecord = kHFSExtentDensity; - - if (FORK_IS_RSRC(fcb)) + } + + if (rsrc) { forkType = kResourceForkType; - else + } + else { forkType = kDataForkType; - + } + temp64 = fcb->ff_blocks; physNumBlocks = (u_int32_t)temp64; @@ -1349,13 +1376,21 @@ OSErr TruncateFileC ( * XXX Any errors could cause ff_blocks and c_blocks to get out of sync... */ numBlocks = peof / vcb->blockSize; - FTOC(fcb)->c_blocks -= (fcb->ff_blocks - numBlocks); + if (!deleted) { + FTOC(fcb)->c_blocks -= (fcb->ff_blocks - numBlocks); + } fcb->ff_blocks = numBlocks; - + // this catalog entry is modified and *must* get forced // to disk when hfs_update() is called - FTOC(fcb)->c_flag |= C_MODIFIED | C_FORCEUPDATE; - + if (!deleted) { + /* + * If the file is already C_NOEXISTS, then the catalog record + * has been removed from disk already. 
We wouldn't need to force + * another update + */ + FTOC(fcb)->c_flag |= (C_MODIFIED | C_FORCEUPDATE); + } // // If the new PEOF is 0, then truncateToExtent has no meaning (we should always deallocate // all storage). @@ -1364,7 +1399,7 @@ OSErr TruncateFileC ( int i; // Deallocate all the extents for this fork - err = DeallocateFork(vcb, FTOC(fcb)->c_fileid, forkType, fcb->fcbExtents, &recordDeleted); + err = DeallocateFork(vcb, fileid, forkType, fcb->fcbExtents, &recordDeleted); if (err != noErr) goto ErrorExit; // got some error, so return it // Update the catalog extent record (making sure it's zeroed out) @@ -1440,7 +1475,7 @@ OSErr TruncateFileC ( // record (in the FCB, or extents file). // if (extentChanged) { - err = UpdateExtentRecord(vcb, fcb, &key, extentRecord, hint); + err = UpdateExtentRecord(vcb, fcb, deleted, &key, extentRecord, hint); if (err != noErr) goto ErrorExit; } @@ -1450,7 +1485,7 @@ OSErr TruncateFileC ( // blocks. // if (nextBlock < physNumBlocks) - err = TruncateExtents(vcb, forkType, FTOC(fcb)->c_fileid, nextBlock, &recordDeleted); + err = TruncateExtents(vcb, forkType, fileid, nextBlock, &recordDeleted); Done: ErrorExit: @@ -1465,7 +1500,6 @@ OSErr TruncateFileC ( * HFS Plus only * */ -__private_extern__ OSErr HeadTruncateFile ( ExtendedVCB *vcb, FCB *fcb, @@ -1824,6 +1858,7 @@ static OSErr SearchExtentFile( // // Input: vcb - the volume containing the extents // fcb - the file that owns the extents +// deleted - whether or not the file is already deleted // extentFileKey - pointer to extent key record (xkr) // If the key length is 0, then the extents are actually part // of the catalog record, stored in the FCB. 
@@ -1834,18 +1869,18 @@ static OSErr SearchExtentFile( // (other) = error from BTree //============================================================================ -static OSErr UpdateExtentRecord ( - ExtendedVCB *vcb, - FCB *fcb, - const HFSPlusExtentKey *extentFileKey, - const HFSPlusExtentRecord extentData, - u_int32_t extentBTreeHint) +static OSErr UpdateExtentRecord (ExtendedVCB *vcb, FCB *fcb, int deleted, + const HFSPlusExtentKey *extentFileKey, + const HFSPlusExtentRecord extentData, + u_int32_t extentBTreeHint) { OSErr err = noErr; if (extentFileKey->keyLength == 0) { // keyLength == 0 means the FCB's extent record BlockMoveData(extentData, fcb->fcbExtents, sizeof(HFSPlusExtentRecord)); - FTOC(fcb)->c_flag |= C_MODIFIED; + if (!deleted) { + FTOC(fcb)->c_flag |= C_MODIFIED; + } } else { BTreeIterator btIterator; @@ -2013,7 +2048,6 @@ static Boolean ExtentsAreIntegral( // Called by BTOpenPath during volume mount //_________________________________________________________________________________ -__private_extern__ Boolean NodesAreContiguous( ExtendedVCB *vcb, FCB *fcb, diff --git a/bsd/hfs/hfscommon/Misc/HybridAllocator.c b/bsd/hfs/hfscommon/Misc/HybridAllocator.c new file mode 100644 index 000000000..6e0e1f23a --- /dev/null +++ b/bsd/hfs/hfscommon/Misc/HybridAllocator.c @@ -0,0 +1,533 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#if CONFIG_HFS_ALLOC_RBTREE + +#define assert(a) { if (!(a)) { panic("File "__FILE__", line %d: assertion '%s' failed.\n", __LINE__, #a); } } + +//#include +#include "../../hfs_macos_defs.h" +#include "../headers/HybridAllocator.h" + +#define bool Boolean + +#define ALLOC_DEBUG 0 + +/* + * The rb_wrap macro in RedBlackTree.h automatically generates the source for a variety of functions that + * operate on the red-black trees. The bodies of these automatically generated functions are the corresponding + * macro from RedBlackTree.h. For example, the extent_tree_length_new() function invokes the rb_new() macro. + * We re-define actual wrapper functions around them so that we can re-name them and adjust the functions + * that are available to the allocator in VolumeAllocation.c. 
+ * + * Here are the functions that get automatically generated: + * Offset-Tree Functions: + * + * initialize the tree + * static void extent_tree_offset_new(extent_tree_offset_t * tree) + * + * Get the first node in the tree. If it is empty, return NULL + * static extent_node_t* extent_tree_offset_first (extent_tree_offset_t * tree) + * + * Get the last node in the tree. If it is empty, return NULL + * static extent_node_t* extent_tree_offset_last (extent_tree_offset_t * tree) + * + * From a given extent_node_t, grab the next one. If no next exists, return NULL + * static extent_node_t* extent_tree_offset_next (extent_tree_offset_t * tree, extent_node_t * node) + * + * From a given extent_node_t, grab the previous. If no prev exists, return NULL + * static extent_node_t* extent_tree_offset_prev(extent_tree_offset_t * tree, extent_node_t * node) + * + * Find an extent_node_t with the specified key (search by offset). If it does not exist, return NULL + * static extent_node_t* extent_tree_offset_search(extent_tree_offset_t * tree, extent_node_t * key) + * + * Find an extent_node_t with the specified key (offset). If it does not exist, + * either grab the next node, if possible, or return NULL + * static extent_node_t* extent_tree_offset_nsearch(extent_tree_offset_t * tree, extent_node_t * key) + * + * Find an extent_node_t with the specified key (offset). If it does not exist, + * either grab the previous node, if possible, or return NULL + * static extent_node_t* extent_tree_offset_psearch(extent_tree_offset_t * tree, extent_node_t * key) + * + * Insert the specified node into the tree. + * static void extent_tree_offset_insert(extent_tree_offset_t * tree, extent_node_t * node) + * + * Remove the specified node from the tree. 
+ * static void extent_tree_offset_remove(extent_tree_offset_t * tree, extent_node_t * node) + * + */ + + +/* Static Functions only used in this file */ +static int32_t +extent_tree_internal_alloc_space(extent_tree_offset_t *offset_tree, + u_int32_t size, u_int32_t offset, extent_node_t *node); + +/* + * cmp_offset_node + * + * Compare the extents in two nodes by offset. + * + * Returns: + * -1 if node 1's offset < node 2's offset. + * 1 if node 1's offset > node 2's offset; 0 if the two offsets are equal. + */ + +__private_extern__ int +cmp_offset_node(extent_node_t *node_1, extent_node_t *node_2) { + u_int32_t addr_1 = node_1->offset; + u_int32_t addr_2 = node_2->offset; + + return ((addr_1 > addr_2) - (addr_1 < addr_2)); +} + +/* + * Allocate a new red-black tree node holding the given length and offset (offset_next starts out NULL). + * + * Currently, we get memory from the M_TEMP zone. + * TODO: Need to get our own zone to avoid bloating the M_TEMP zone. + */ +__private_extern__ extent_node_t * +alloc_node(u_int32_t length, u_int32_t offset) { + extent_node_t *node; + MALLOC(node, extent_node_t *, sizeof(extent_node_t), M_TEMP, M_WAITOK); + + if (node) { + node->offset = offset; + node->length = length; + node->offset_next = NULL; + } + return node; +} + +/* + * De-allocate a red-black tree node. + * + * Currently, this goes back to the M_TEMP zone. + * TODO: May need to adjust this if we pull memory out of our own zone. + */ +__private_extern__ void +free_node(extent_node_t *node) { + FREE(node, M_TEMP); +} + +/* + * rb_wrap is a macro found in the rb.h header file. It builds functions that operate on + * the red-black tree based upon the types specified here. This code will build red-black tree + * search functions that operate on extent_node_t's and use cmp_length_node to do length searches. + * It uses cmp_offset_node to do offset searches. Ties are broken by offset. This will generate + * the functions specified above. NOTE(review): only cmp_offset_node is passed to the rb_wrap invocation that follows, so the cmp_length_node mention looks stale -- confirm. 
+ */ + +rb_wrap(__attribute__ ((unused)) static, extent_tree_offset_, extent_tree_offset_t, extent_node_t, offset_link, cmp_offset_node) + + +/* + * Create a new extent tree, composed of links sorted by offset. + */ +__private_extern__ void +extent_tree_init(extent_tree_offset_t *offset_tree) +{ + extent_node_t *node = NULL; + extent_tree_offset_new(offset_tree); + + node = extent_tree_off_first (offset_tree); + if (node) { + node->offset_next = NULL; + } +} + +/* + * Destroy an extent tree + * + * This function finds the first node in the specified red-black tree, then + * uses the embedded linked list to walk through the tree in O(n) time and destroy + * all of its nodes. + */ +__private_extern__ void +extent_tree_destroy(extent_tree_offset_t *off_tree) { + extent_node_t *node = NULL; + extent_node_t *next = NULL; + + node = extent_tree_offset_first (off_tree); + + while (node) { + next = node->offset_next; + extent_tree_offset_remove (off_tree, node); + free_node (node); + node = next; + } +} + +/* + * Search the extent tree by offset. The "key" argument is only used to extract + * the offset and length information. Its link fields are not used in the underlying + * tree code. + */ +__private_extern__ extent_node_t * +extent_tree_off_search(extent_tree_offset_t *tree, extent_node_t *key) { + return extent_tree_offset_search(tree, key); +} + +/* + * Search the extent tree by offset, finding the next node in the tree + * if the specified one does not exist. The "key" argument is only used to extract + * the offset and length information. Its link fields are not used in the underlying + * tree code. + */ +__private_extern__ extent_node_t * +extent_tree_off_search_next(extent_tree_offset_t *offset_tree, extent_node_t *key) { + + return extent_tree_offset_nsearch (offset_tree, key); +} + +/* + * Search the extent tree by offset to find a starting position. 
Then, do a linear search + * through the list of free extents to find the first free extent in the tree that has size + * greater than or equal to the specified size. The "key" argument is only used to extract + * the offset and length information. Its link fields are not used in the underlying + * tree code. + */ +__private_extern__ extent_node_t * +extent_tree_off_search_nextWithSize (extent_tree_offset_t *offset_tree, extent_node_t *key) { + + extent_node_t *current; + + u_int32_t min_size = key->length; + + current = extent_tree_offset_nsearch (offset_tree, key); + + while (current) { + if (current->length >= min_size) { + return current; + } + current = current->offset_next; + } + + /* return NULL if no free extent of suitable size could be found. */ + return NULL; +} + + +/* + * Search the extent tree by offset, finding the previous node in the tree + * if the specified one does not exist. The "key" argument is only used to extract + * the offset and length information. Its link fields are not used in the underlying + * tree code. + */ +__private_extern__ extent_node_t * +extent_tree_off_search_prev(extent_tree_offset_t *offset_tree, extent_node_t *key) { + + return extent_tree_offset_psearch (offset_tree, key); +} + + +/* + * Find the first node in the extent tree, by offset. This will be the first + * free space region relative to the start of the disk. + */ +__private_extern__ extent_node_t * +extent_tree_off_first (extent_tree_offset_t *offset_tree) { + return extent_tree_offset_first(offset_tree); +} + +/* + * From a given tree node (sorted by offset), get the next node in the tree. + */ +__private_extern__ extent_node_t * +extent_tree_off_next(extent_tree_offset_t * tree, extent_node_t *node) +{ + return extent_tree_offset_next(tree, node); +} + +/* + * From a given tree node (sorted by offset), get the previous node in the tree. 
+ */ +__private_extern__ extent_node_t * +extent_tree_off_prev(extent_tree_offset_t * tree, extent_node_t *node) +{ + return extent_tree_offset_prev(tree, node); +} + + +/* + * For a node of a given offset and size, remove it from the extent tree and + * insert a new node that: + * + * A) increase its offset by that of the node we just removed + * B) decreases its size by that of the node we just removed. + * + * NOTE: Callers must ensure that the 'size' specified is less than or equal to the + * length of the extent represented by node. The node pointer must point to an + * extant node in the tree, as it will be removed from the tree. + */ +static int32_t +extent_tree_internal_alloc_space(extent_tree_offset_t *offset_tree, u_int32_t size, + u_int32_t offset, extent_node_t *node) +{ + if (node) { + extent_node_t *prev = NULL; + extent_node_t *next = NULL; + + if( ALLOC_DEBUG ) { + assert ((size <= node->length)); + assert ((offset == node->offset)); + } + + prev = extent_tree_offset_prev(offset_tree, node); + + /* + * Note that, unless the node is exactly the size of the amount of space + * requested, we do not need to remove it from the offset tree, now matter + * how much space we remove from the node. Remember that the offset tree is + * sorting the extents based on their offsets, and that each node is a discrete + * chunk of free space. + * + * If node A has offset B, with length C, in the offset tree, by definition, there + * can be no other node in the extent tree within the range {B, B+C}. If there were, + * we'd have overlapped extents. + * + * So in the normal case, we'll just update the offset node in place with the new offset + * and size. + * + * Otherwise, if we have an exact match, then just remove the node altogether. Don't forget + * to update the next pointer for the linked list if applicable. 
+ */ + if (node->length == size) { + next = node->offset_next; + extent_tree_offset_remove(offset_tree, node); + free_node(node); + if (prev) { + prev->offset_next = next; + } + } + else { + node->offset = node->offset + size; + node->length -= size; + /* The next pointer does not change since we keep the node in place */ + } + return 0; + } + return -1; +} + +/* + * Search the extent tree for a region of free space after the specified + * offset and attempt to allocate it. + * + * This is expected to be used by attempts to grow a file contiguously. If we + * start at a file's EOF, then we can try to allocate space immediately after it + * if it's available. This function specifies a tail (the offset), and then passes it + * into extent_tree_offset_search. Note that this is not the search_prev or search_next + * variant, so if no node exists at the specified offset we'll fail out. + * + */ + +__private_extern__ int32_t +extent_tree_offset_alloc_space(extent_tree_offset_t *offset_tree, u_int32_t size, u_int32_t offset) { + extent_node_t search_sentinel = { .offset = offset }; + extent_node_t *node = extent_tree_offset_search(offset_tree, &search_sentinel); + if (node && (node->length < size)) { + /* It's too small. Fail the allocation */ + if ( ALLOC_DEBUG ) { + printf("HFS Allocator: internal_alloc_space, ptr (%p) node->length (%d), node->offset (%d), off(%d), size (%d) \n", + node, node->length, node->offset, offset, size); + } + return -1; + } + return extent_tree_internal_alloc_space(offset_tree, size, offset, node); +} + + +/* + * Search the extent tree for a region of free space at the specified + * offset and attempt to allocate it. + * + * This is a little bit more involved than the previous function. It is intended for use when + * we may be allocating space from the middle of an existing extent node. 
+ *
+ */
+
+/*
+ * extent_tree_offset_alloc_unaligned
+ *
+ * The free extent is located with a "previous-or-equal" search on 'offset'.
+ * Depending on where [offset, offset + size) falls inside that extent:
+ *
+ *   - at the very front: handled by extent_tree_internal_alloc_space.
+ *   - in the middle or at the tail: the existing node is shortened so that it
+ *     ends at 'offset', and, if free blocks remain past offset + size, a new
+ *     node is created for that remainder and linked in after the existing one.
+ *
+ * Returns 0 on success.  Returns -1, leaving the tree untouched, when:
+ *   - no free extent starts at or before 'offset',
+ *   - the extent found is shorter than 'size',
+ *   - the requested range is not wholly contained in the extent found, or
+ *   - a node for the trailing remainder could not be allocated.
+ */
+__private_extern__ int32_t
+extent_tree_offset_alloc_unaligned(extent_tree_offset_t *offset_tree, u_int32_t size, u_int32_t offset) {
+	extent_node_t search_sentinel = { .offset = offset };
+	extent_node_t *node = NULL;
+	extent_node_t *newnode = NULL;
+	u_int32_t lead;		/* free blocks that stay in front of the allocation */
+	u_int32_t end;		/* first block past the extent being carved */
+
+	node = extent_tree_off_search_prev(offset_tree, &search_sentinel);
+
+	if (node == NULL) {
+		return -1;
+	}
+
+	if (node->length < size) {
+		/* It's too small.  Fail the allocation */
+		if ( ALLOC_DEBUG ) {
+			printf("HFS Allocator: internal_alloc_space, ptr (%p) node->length (%d), node->offset (%d), off(%d), size (%d) \n",
+				   node, node->length, node->offset, offset, size);
+		}
+		return -1;
+	}
+
+	/* Allocating from the front of the extent needs no split. */
+	if (offset == node->offset) {
+		return extent_tree_internal_alloc_space(offset_tree, size, offset, node);
+	}
+
+	/*
+	 * The search hands back the nearest extent at or before 'offset', which
+	 * does not by itself mean 'offset' lies inside that extent.  Reject any
+	 * request that is not fully contained; otherwise the length update below
+	 * would grow the free extent instead of shrinking it.  Written as a
+	 * subtraction so it cannot overflow (node->length >= size was checked
+	 * above); a wrapped 'lead' is also rejected by this test.
+	 */
+	lead = offset - node->offset;
+	if (lead > (node->length - size)) {
+		return -1;
+	}
+
+	if (ALLOC_DEBUG) {
+		assert ((offset + size) <= (node->offset + node->length));
+		if (node->offset_next) {
+			assert ((offset > node->offset) && (offset < node->offset_next->offset));
+		}
+	}
+
+	end = node->offset + node->length;
+
+	/*
+	 * Do we need to create a new node?  If our extent we're carving away ends earlier than
+	 * the current extent's length, then yes - we do.  Allocate it before touching the
+	 * existing node so that a failed allocation leaves the tree exactly as it was.
+	 */
+	if ((offset + size) < (end)) {
+		u_int32_t newoff = offset + size;
+		u_int32_t newlen = end - newoff;
+
+		newnode = alloc_node(newlen, newoff);
+		if (newnode == NULL) {
+			return -1;
+		}
+	}
+
+	/* The existing node keeps only the blocks in front of the allocation. */
+	node->length = lead;
+
+	if (newnode != NULL) {
+		extent_tree_offset_insert(offset_tree, newnode);
+
+		/* Splice the remainder into the embedded list right after 'node'. */
+		newnode->offset_next = extent_tree_offset_next(offset_tree, newnode);
+		node->offset_next = newnode;
+	}
+
+	return 0;
+}
+
+
+
+/*
+ * Mark an extent of space as being free.  This means we need to insert
+ * this extent into our tree.
+ *
+ * Search the offset tree, based on the new offset that we construct by adding
+ * the length of our extent to be freed to its offset.  If something exists at
+ * that offset, then we coalesce the nodes.
In this case, we do not need to adjust
+ * the offset tree because our extent we wanted to add could not have been in the tree.
+ *
+ * If no node existed at the specified offset, then create a new one and insert it
+ * into the tree.
+ *
+ * Finally, search based on the node that would precede our newly created/inserted one.
+ * If possible, coalesce the previous node into our new one.
+ *
+ * We return the node which we are modifying in this function.
+ *
+ * Parameters:
+ *	offset_tree - tree to add the free extent to
+ *	size        - length of the extent being freed
+ *	offset      - start of the extent being freed
+ *
+ * NOTE(review): the result of alloc_node is only checked with assert(); confirm
+ * that assert is active in the builds that matter, since the node is
+ * dereferenced immediately afterwards.
+ */
+
+__private_extern__ extent_node_t *
+extent_tree_free_space(extent_tree_offset_t *offset_tree, u_int32_t size, u_int32_t offset)
+{
+	extent_node_t *prev = NULL;
+	extent_node_t *node = NULL;
+	extent_node_t *next = NULL;
+	/* Key for the first block past the extent being freed. */
+	extent_node_t search_sentinel = { .offset = size + offset };
+
+	/*
+	 * nsearch returns the node at the key or, failing that, the next one, so the
+	 * offset has to be compared below to tell an exact hit from a later node.
+	 */
+	node = extent_tree_offset_nsearch(offset_tree, &search_sentinel);
+	/* Insert our node into the tree, and coalesce with the next one if necessary */
+
+	if ((node) && (node->offset == search_sentinel.offset)) {
+		/*
+		 * A free extent starts right where ours ends: extend it backwards to
+		 * cover the freed range instead of creating a new node.
+		 */
+		node->offset = offset;
+		node->length += size;
+		next = node->offset_next;
+	}
+	else {
+		/* Nothing adjacent after us: the freed range becomes a node of its own. */
+		node = alloc_node(size, offset);
+		assert(node);
+		extent_tree_offset_insert(offset_tree, node);
+
+		/* Find the next entry in the tree, if applicable. */
+		next = extent_tree_offset_next(offset_tree, node);
+		node->offset_next = next;
+	}
+
+	/* Coalesce with the previous if necessary */
+	prev = extent_tree_offset_prev(offset_tree, node);
+	if (prev && (prev->offset + prev->length) == offset) {
+		/*
+		 * The predecessor ends exactly where the freed range begins: fold it
+		 * into 'node' and drop it.  prev's fields are read before it is freed.
+		 */
+		extent_tree_offset_remove(offset_tree, prev);
+		node->offset = prev->offset;
+		node->length += prev->length;
+		free_node(prev);
+		/* Look up the new predecessor so its list pointer can be fixed below. */
+		prev = extent_tree_offset_prev(offset_tree, node);
+	}
+
+	/* Update the next pointer for the previous entry (if necessary) */
+	if (prev) {
+		prev->offset_next = node;
+	}
+
+	return node;
+}
+
+/*
+ * Remove the specified node from the offset_tree.  Note that the parameter node
+ * must be an extant node in the tree.
This function is used by the allocator when
+ * we are resizing a volume and need to directly manipulate the contents of the red-black
+ * tree without going through the normal allocation and deallocation routines.
+ *
+ * A NULL node is ignored.  The node is only unlinked from the tree; it is not freed here.
+ *
+ * NOTE(review): the embedded offset_next list is not patched up in this function --
+ * presumably the caller takes care of the predecessor's pointer; confirm against callers.
+ */
+__private_extern__ void
+extent_tree_remove_node (extent_tree_offset_t *offset_tree, extent_node_t * node) {
+
+	if (node) {
+		/* Just remove the entry from the tree */
+		extent_tree_offset_remove(offset_tree, node);
+	}
+	return;
+
+}
+
+
+
+#if ALLOC_DEBUG
+/*
+ * For each node in the tree, print out its length and block offset.
+ * The walk starts at the first node and follows the embedded offset_next list.
+ * Only compiled when ALLOC_DEBUG is non-zero.
+ */
+__private_extern__ void
+extent_tree_offset_print(extent_tree_offset_t *offset_tree)
+{
+	extent_node_t *node = NULL;
+
+	node = extent_tree_offset_first(offset_tree);
+	while (node) {
+		printf("length: %u, offset: %u\n", node->length, node->offset);
+		node = node->offset_next;
+	}
+}
+#endif
+
+#endif
diff --git a/bsd/hfs/hfscommon/Misc/VolumeAllocation.c b/bsd/hfs/hfscommon/Misc/VolumeAllocation.c index bc58bd947..de2858418 100644 --- a/bsd/hfs/hfscommon/Misc/VolumeAllocation.c +++ b/bsd/hfs/hfscommon/Misc/VolumeAllocation.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -32,7 +32,7 @@ Version: HFS Plus 1.0 - Copyright: � 1996-2001 by Apple Computer, Inc., all rights reserved. + Copyright: ÔøΩ 1996-2009 by Apple Computer, Inc., all rights reserved. */ @@ -45,31 +45,92 @@ Public routines: blocks. (Will only do a single extent???) BlockDeallocate Deallocate a contiguous run of allocation blocks. - - invalidate_free_extent_cache Invalidate free extent cache for a given volume. - -Internal routines: + + BlockMarkAllocated + Exported wrapper to mark blocks as in-use. This will correctly determine + whether or not the red-black tree is enabled and call the appropriate function + if applicable. BlockMarkFree + Exported wrapper to mark blocks as freed. 
This will correctly determine whether or + not the red-black tree is enabled and call the appropriate function if applicable. + + + ResetVCBFreeExtCache + Since the red-black tree obviates the need to maintain the free extent cache, we do + not update it if the tree is also live. As a result, if we ever need to destroy the trees + we should reset the free extent cache so it doesn't confuse us when we need to fall back to the + bitmap scanning allocator. + We also reset and disable the free extent cache when volume resizing is + in flight. + + UpdateAllocLimit + Adjusts the AllocLimit field in the hfs mount point. This is used when we need to prevent + allocations from occupying space in the region we are modifying during a filesystem resize. + At other times, it should be consistent with the total number of allocation blocks in the + filesystem. It is also used to shrink or grow the number of blocks that the red-black tree should + know about. If growing, scan the new range of bitmap, and if shrinking, reduce the + number of items in the tree that we can allocate from. + +Internal routines: + Note that the RBTree routines are guarded by a cpp check for CONFIG_HFS_ALLOC_RBTREE. This + is to cut down on space for functions that could not possibly be used if they are not planning to + use the red-black tree code. + + BlockMarkFreeRBTree + Make an internal call to BlockMarkFree and then update + and/or create Red-Black Tree allocation tree nodes to correspond + to the free space being generated. + BlockMarkFreeInternal Mark a contiguous range of blocks as free. The corresponding - bits in the volume bitmap will be cleared. - BlockMarkAllocated + bits in the volume bitmap will be cleared. This will actually do the work + of modifying the bitmap for us. + + BlockMarkAllocatedRBTree + Make an internal call to BlockAllocateMarked, which will update the + bitmap on-disk when we allocate blocks. 
If that is successful, then + we'll remove the appropriate entries from the red-black tree. + BlockMarkAllocatedInternal Mark a contiguous range of blocks as allocated. The cor- responding bits in the volume bitmap are set. Also tests to see - if any of the blocks were previously unallocated. - FindContiguous + if any of the blocks were previously unallocated. + BlockFindContiguous Find a contiguous range of blocks of a given size. The caller specifies where to begin the search (by block number). The - block number of the first block in the range is returned. + block number of the first block in the range is returned. This is only + called by the bitmap scanning logic as the red-black tree should be able + to do this internally by searching its tree. BlockAllocateAny Find and allocate a contiguous range of blocks up to a given size. The first range of contiguous free blocks found are allocated, even if there are fewer blocks than requested (and even if a contiguous range of blocks of the given size exists elsewhere). + BlockAllocateAnyBitmap + Finds a range of blocks per the above requirements without using the + Allocation RB Tree. This relies on the bitmap-scanning logic in order to find + any valid range of free space needed. + BlockAllocateAnyRBTree + Finds a valid range of blocks per the above requirements by searching + the red-black tree. We can just make an internal call to + BlockAllocateContigRBTree to find the valid range. BlockAllocateContig Find and allocate a contiguous range of blocks of a given size. If a contiguous range of free blocks of the given size isn't found, then - the allocation fails (i.e. it is "all or nothing"). - + the allocation fails (i.e. it is "all or nothing"). 
This routine is + essentially a wrapper function around its related sub-functions, + BlockAllocateContigBitmap and BlockAllocateContigRBTree, which use, + respectively, the original HFS+ bitmap scanning logic and the new + Red-Black Tree to search and manage free-space decisions. This function + contains logic for when to use which of the allocation algorithms, + depending on the free space contained in the volume. + BlockAllocateContigBitmap + Finds and allocates a range of blocks specified by the size parameters + using the original HFS+ bitmap scanning logic. The red-black tree + will not be updated if this function is used. + BlockAllocateContigRBTree + Finds and allocates a range of blocks specified by the size parameters + using the new red/black tree data structure and search algorithms + provided by the tree library. Updates the red/black tree nodes after + the on-disk data structure (bitmap) has been updated. BlockAllocateKnown Try to allocate space from known free space in the volume's free extent cache. @@ -80,6 +141,57 @@ Internal routines: ReleaseBitmapBlock Release a bitmap block back into the buffer cache. + + +Debug/Test Routines + hfs_isallocated + Test to see if any blocks in a range are allocated. Journal or + allocation file lock must be held. + + hfs_isallocated_scan + Test to see if any blocks in a range are allocated. Releases and + invalidates the block used when finished. + + hfs_isrbtree_active + Test to see if the allocation red-black tree is live. This function + requires either an exclusive or shared lock on the allocation bitmap file + in the HFS mount structure, to prevent red-black tree pointers from disappearing. + + hfs_isrbtree_allocated + Test to see if the specified extent is marked as allocated in the red-black tree. + Multiplexes between the metadata zone trees and the normal allocation zone trees + depending on the offset of the extent specified. 
+ + check_rbtree_extents + Void function that wraps around the above function (hfs_isrbtree_allocated) + and checks to see that the return value was appropriate based on the assertion we're + trying to validate (whether or not the specified extent should be marked as free + or allocated). + + hfs_validate_rbtree + Exhaustive search function that will check every allocation block for its status in the + red-black tree and then check the corresponding status in the bitmap file. If the two are out + of sync, it will panic. Note that this function is extremely expensive and must NEVER + be run outside of debug code. + + hfs_checktreelinks + Checks the embedded linked list structure of the red black tree for integrity. The next pointer + should always point to whatever extent_tree_offset_next returns. + + +Red Black Tree Specific Routines + GenerateTree + Build a red-black tree for the given filesystem's bitmap. + + DestroyTrees + Destroy the tree on the given filesystem + + + hfs_alloc_scan_block + Given a starting allocation block number, figures out which physical block contains that + allocation block's bit, and scans it from the starting bit until either the ending bit or + the end of the block. Free space extents are inserted into the appropriate red-black tree. + */ #include "../../hfs_macos_defs.h" @@ -89,19 +201,54 @@ Internal routines: #include <sys/types.h> #include <sys/buf.h> #include <sys/systm.h> +#include <sys/sysctl.h> +#include <sys/disk.h> #include <kern/kalloc.h> #include "../../hfs.h" #include "../../hfs_dbg.h" #include "../../hfs_format.h" #include "../../hfs_endian.h" - +#include "../../hfs_macos_defs.h" #include "../headers/FileMgrInternal.h" +#include "../headers/HybridAllocator.h" +#include "../../hfs_kdebug.h" #ifndef CONFIG_HFS_TRIM #define CONFIG_HFS_TRIM 0 #endif +/* + * Use sysctl vfs.generic.hfs.kdebug.allocation to control which + * KERNEL_DEBUG_CONSTANT events are enabled at runtime. (They're + * disabled by default because there can be a lot of these events, + * and we don't want to overwhelm the kernel debug buffer. 
If you + * want to watch these events in particular, just set the sysctl.) + */ +static int hfs_kdebug_allocation = 0; +SYSCTL_DECL(_vfs_generic); +SYSCTL_NODE(_vfs_generic, OID_AUTO, hfs, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "HFS file system"); +SYSCTL_NODE(_vfs_generic_hfs, OID_AUTO, kdebug, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "HFS kdebug"); +SYSCTL_INT(_vfs_generic_hfs_kdebug, OID_AUTO, allocation, CTLFLAG_RW|CTLFLAG_LOCKED, &hfs_kdebug_allocation, 0, "Enable kdebug logging for HFS allocations"); +enum { + /* + * HFSDBG_ALLOC_ENABLED: Log calls to BlockAllocate and + * BlockDeallocate, including the internal BlockAllocateXxx + * routines so we can see how an allocation was satisfied. + * + * HFSDBG_EXT_CACHE_ENABLED: Log routines that read or write the + * free extent cache. + * + * HFSDBG_UNMAP_ENABLED: Log events involving the trim list. + * + * HFSDBG_BITMAP_ENABLED: Log accesses to the volume bitmap (setting + * or clearing bits, scanning the bitmap). + */ + HFSDBG_ALLOC_ENABLED = 1, + HFSDBG_EXT_CACHE_ENABLED = 2, + HFSDBG_UNMAP_ENABLED = 4, + HFSDBG_BITMAP_ENABLED = 8 +}; enum { kBytesPerWord = 4, @@ -116,6 +263,8 @@ enum { #define kAllBitsSetInWord 0xFFFFFFFFul +#define ALLOC_DEBUG 0 + static OSErr ReadBitmapBlock( ExtendedVCB *vcb, u_int32_t bit, @@ -136,6 +285,15 @@ static OSErr BlockAllocateAny( u_int32_t *actualStartBlock, u_int32_t *actualNumBlocks); +static OSErr BlockAllocateAnyBitmap( + ExtendedVCB *vcb, + u_int32_t startingBlock, + u_int32_t endingBlock, + u_int32_t maxBlocks, + Boolean useMetaZone, + u_int32_t *actualStartBlock, + u_int32_t *actualNumBlocks); + static OSErr BlockAllocateContig( ExtendedVCB *vcb, u_int32_t startingBlock, @@ -145,6 +303,15 @@ static OSErr BlockAllocateContig( u_int32_t *actualStartBlock, u_int32_t *actualNumBlocks); +static OSErr BlockAllocateContigBitmap( + ExtendedVCB *vcb, + u_int32_t startingBlock, + u_int32_t minBlocks, + u_int32_t maxBlocks, + Boolean useMetaZone, + u_int32_t *actualStartBlock, + u_int32_t 
*actualNumBlocks); + static OSErr BlockFindContiguous( ExtendedVCB *vcb, u_int32_t startingBlock, @@ -161,9 +328,136 @@ static OSErr BlockAllocateKnown( u_int32_t *actualStartBlock, u_int32_t *actualNumBlocks); -static int free_extent_cache_active( - ExtendedVCB *vcb); +static OSErr BlockMarkAllocatedInternal ( + ExtendedVCB *vcb, + u_int32_t startingBlock, + register u_int32_t numBlocks); + +static OSErr BlockMarkFreeInternal( + ExtendedVCB *vcb, + u_int32_t startingBlock, + u_int32_t numBlocks, + Boolean do_validate); + +#if CONFIG_HFS_ALLOC_RBTREE + +static OSErr ReleaseRBScanBitmapBlock( struct buf *bp ); + +static OSErr BlockAllocateAnyRBTree( + ExtendedVCB *vcb, + u_int32_t startingBlock, + u_int32_t maxBlocks, + Boolean useMetaZone, + u_int32_t *actualStartBlock, + u_int32_t *actualNumBlocks); + +static OSErr BlockAllocateContigRBTree( + ExtendedVCB *vcb, + u_int32_t startingBlock, + u_int32_t minBlocks, + u_int32_t maxBlocks, + Boolean useMetaZone, + u_int32_t *actualStartBlock, + u_int32_t *actualNumBlocks, + u_int32_t forceContig); + +static OSErr BlockMarkAllocatedRBTree( + ExtendedVCB *vcb, + u_int32_t startingBlock, + u_int32_t numBlocks); + +static OSErr BlockMarkFreeRBTree( + ExtendedVCB *vcb, + u_int32_t startingBlock, + u_int32_t numBlocks); + +static int +hfs_isrbtree_allocated (struct hfsmount * hfsmp, + u_int32_t startBlock, + u_int32_t numBlocks, + extent_node_t** node1); + +extern void +hfs_validate_rbtree (struct hfsmount *hfsmp, + u_int32_t start, + u_int32_t end); + +static void hfs_checktreelinks (struct hfsmount *hfsmp); + + +void check_rbtree_extents (struct hfsmount *hfsmp, + u_int32_t start, + u_int32_t numBlocks, + int shouldBeFree); + +int hfs_isallocated_scan (struct hfsmount *hfsmp, + u_int32_t startingBlock, + u_int32_t *bp_buf); + +static int hfs_alloc_scan_block(struct hfsmount *hfsmp, + u_int32_t startbit, + u_int32_t endBit, + u_int32_t *bitToScan); + +#define ASSERT_FREE 1 +#define ASSERT_ALLOC 0 + +#endif /* 
CONFIG_HFS_ALLOC_RBTREE */ + +/* Functions for manipulating free extent cache */ +static void remove_free_extent_cache(struct hfsmount *hfsmp, u_int32_t startBlock, u_int32_t blockCount); +static Boolean add_free_extent_cache(struct hfsmount *hfsmp, u_int32_t startBlock, u_int32_t blockCount); +static void sanity_check_free_ext(struct hfsmount *hfsmp, int check_allocated); + +#if ALLOC_DEBUG +/* + * Extra #includes for the debug function below. These are not normally #included because + * they would constitute a layering violation + */ +#include <vfs/vfs_journal.h> +#include <sys/disk.h> + +/* + * Validation Routine to verify that the TRIM list maintained by the journal + * is in good shape relative to what we think the bitmap should have. We should + * never encounter allocated blocks in the TRIM list, so if we ever encounter them, + * we panic. + * + * Each byte extent in the active transaction's trim list is converted back to an + * allocation block range and counted with hfs_count_allocated. Always returns 0. + */ +int trim_validate_bitmap (struct hfsmount *hfsmp) { + u_int64_t blockno_offset; + u_int64_t numblocks; + int i; + int count; + u_int32_t startblk; + u_int32_t blks; + int err = 0; + uint32_t alloccount = 0; + + if (hfsmp->jnl) { + struct journal *jnl = (struct journal*)hfsmp->jnl; + if (jnl->active_tr) { + struct jnl_trim_list *trim = &(jnl->active_tr->trim); + count = trim->extent_count; + for (i = 0; i < count; i++) { + blockno_offset = trim->extents[i].offset; + blockno_offset = blockno_offset - (uint64_t)hfsmp->hfsPlusIOPosOffset; + blockno_offset = blockno_offset / hfsmp->blockSize; + numblocks = trim->extents[i].length / hfsmp->blockSize; + + startblk = (u_int32_t)blockno_offset; + blks = (u_int32_t) numblocks; + err = hfs_count_allocated (hfsmp, startblk, blks, &alloccount); + + if (err == 0 && alloccount != 0) { + panic ("trim_validate_bitmap: %d blocks @ ABN %d are allocated!", alloccount, startblk); + } + } + } + } + return 0; +} + +#endif /* ;________________________________________________________________________________ @@ -188,22 +482,25 @@ static int free_extent_cache_active( */ static void hfs_unmap_free_extent(struct hfsmount *hfsmp, 
u_int32_t startingBlock, u_int32_t numBlocks) { - if (CONFIG_HFS_TRIM) { - u_int64_t offset; - u_int64_t length; - int err; - - if ((hfsmp->hfs_flags & HFS_UNMAP) && (hfsmp->jnl != NULL)) { - offset = (u_int64_t) startingBlock * hfsmp->blockSize + (u_int64_t) hfsmp->hfsPlusIOPosOffset; - length = (u_int64_t) numBlocks * hfsmp->blockSize; - - err = journal_trim_add_extent(hfsmp->jnl, offset, length); - if (err) { - printf("hfs_unmap_free_extent: error %d from journal_trim_add_extent", err); - hfsmp->hfs_flags &= ~HFS_UNMAP; - } + u_int64_t offset; + u_int64_t length; + /* Stays 0 when the volume has no journal, so the DBG_FUNC_END event below never logs an uninitialized value. */ + int err = 0; + + if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_FREE | DBG_FUNC_START, startingBlock, numBlocks, 0, 0, 0); + + if (hfsmp->jnl != NULL) { + /* The journal's trim list is kept in bytes, so convert the block range. */ + offset = (u_int64_t) startingBlock * hfsmp->blockSize + (u_int64_t) hfsmp->hfsPlusIOPosOffset; + length = (u_int64_t) numBlocks * hfsmp->blockSize; + + err = journal_trim_add_extent(hfsmp->jnl, offset, length); + if (err) { + printf("hfs_unmap_free_extent: error %d from journal_trim_add_extent", err); } } + + if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_FREE | DBG_FUNC_END, err, 0, 0, 0, 0); } @@ -225,100 +522,117 @@ static void hfs_unmap_free_extent(struct hfsmount *hfsmp, u_int32_t startingBloc */ static void hfs_unmap_alloc_extent(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t numBlocks) { - if (CONFIG_HFS_TRIM) { - u_int64_t offset; - u_int64_t length; - int err; + u_int64_t offset; + u_int64_t length; + /* Stays 0 when the volume has no journal, so the DBG_FUNC_END event below never logs an uninitialized value. */ + int err = 0; + + if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_ALLOC | DBG_FUNC_START, startingBlock, numBlocks, 0, 0, 0); + + if (hfsmp->jnl != NULL) { + /* The journal's trim list is kept in bytes, so convert the block range. */ + offset = (u_int64_t) startingBlock * hfsmp->blockSize + (u_int64_t) hfsmp->hfsPlusIOPosOffset; + length = (u_int64_t) numBlocks * hfsmp->blockSize; - if ((hfsmp->hfs_flags & HFS_UNMAP) && (hfsmp->jnl != NULL)) { - offset = (u_int64_t) startingBlock * hfsmp->blockSize + 
(u_int64_t) hfsmp->hfsPlusIOPosOffset; - length = (u_int64_t) numBlocks * hfsmp->blockSize; - - err = journal_trim_remove_extent(hfsmp->jnl, offset, length); - if (err) { - printf("hfs_unmap_alloc_extent: error %d from journal_trim_remove_extent", err); - hfsmp->hfs_flags &= ~HFS_UNMAP; - } + err = journal_trim_remove_extent(hfsmp->jnl, offset, length); + if (err) { + printf("hfs_unmap_alloc_extent: error %d from journal_trim_remove_extent", err); } } + + if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_ALLOC | DBG_FUNC_END, err, 0, 0, 0, 0); } /* ;________________________________________________________________________________ ; -; Routine: BlkAlloc -; -; Function: Allocate space on a volume. If contiguous allocation is requested, -; at least the requested number of bytes will be allocated or an -; error will be returned. If contiguous allocation is not forced, -; the space will be allocated at the first free fragment following -; the requested starting allocation block. If there is not enough -; room there, a block of less than the requested size will be -; allocated. +; Routine: hfs_trim_callback ; -; If the requested starting block is 0 (for new file allocations), -; the volume's allocation block pointer will be used as a starting -; point. +; Function: This function is called when a transaction that freed extents +; (via hfs_unmap_free_extent/journal_trim_add_extent) has been +; written to the on-disk journal. This routine will add those +; extents to the free extent cache so that they can be reused. ; -; Input Arguments: -; vcb - Pointer to ExtendedVCB for the volume to allocate space on -; fcb - Pointer to FCB for the file for which storage is being allocated -; startingBlock - Preferred starting allocation block, 0 = no preference -; forceContiguous - Force contiguous flag - if bit 0 set (NE), allocation is contiguous -; or an error is returned -; useMetaZone - -; minBlocks - Number of blocks requested. 
If the allocation is non-contiguous, -; less than this may actually be allocated -; maxBlocks - The maximum number of blocks to allocate. If there is additional free -; space after bytesRequested, then up to maxBlocks bytes should really -; be allocated. (Used by ExtendFileC to round up allocations to a multiple -; of the file's clump size.) +; CAUTION: This routine is called while the journal's trim lock +; is held shared, so that no other thread can reuse any portion +; of those extents. We must be very careful about which locks +; we take from within this callback, to avoid deadlock. The +; call to add_free_extent_cache will end up taking the cache's +; lock (just long enough to add these extents to the cache). ; -; Output: -; (result) - Error code, zero for successful allocation -; *startBlock - Actual starting allocation block -; *actualBlocks - Actual number of allocation blocks allocated +; CAUTION: If the journal becomes invalid (eg., due to an I/O +; error when trying to write to the journal), this callback +; will stop getting called, even if extents got freed before +; the journal became invalid! ; -; Side effects: -; The volume bitmap is read and updated; the volume bitmap cache may be changed. +; Input Arguments: +; arg - The hfsmount of the volume containing the extents. +; extent_count - The number of extents freed in the transaction. +; extents - An array of extents (byte ranges) that were freed. 
;________________________________________________________________________________ */ -static void -sanity_check_free_ext(__unused ExtendedVCB *vcb, __unused int check_allocated) +__private_extern__ void +hfs_trim_callback(void *arg, uint32_t extent_count, const dk_extent_t *extents) { -#if DEBUG - u_int32_t i, j; - - for(i=0; i < vcb->vcbFreeExtCnt; i++) { - u_int32_t start, nblocks; - - start = vcb->vcbFreeExt[i].startBlock; - nblocks = vcb->vcbFreeExt[i].blockCount; - - - if (nblocks == 0) { - panic("hfs: %p: slot %d in the free extent array had a zero count (%d)\n", vcb, i, start); - } - - if (check_allocated && hfs_isallocated(vcb, start, nblocks)) { - panic("hfs: %p: slot %d in the free extent array is bad (%d / %d)\n", - vcb, i, start, nblocks); - } - - for(j=i+1; j < vcb->vcbFreeExtCnt; j++) { - if (start == vcb->vcbFreeExt[j].startBlock) { - panic("hfs: %p: slot %d/%d are dups?! (%d / %d ; %d / %d)\n", - vcb, i, j, start, nblocks, vcb->vcbFreeExt[i].startBlock, - vcb->vcbFreeExt[i].blockCount); - } - } + uint32_t i; + uint32_t startBlock, numBlocks; + struct hfsmount *hfsmp = arg; + + if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_CALLBACK | DBG_FUNC_START, 0, extent_count, 0, 0, 0); + + for (i=0; i<extent_count; ++i) { + startBlock = (extents[i].offset - hfsmp->hfsPlusIOPosOffset) / hfsmp->blockSize; + numBlocks = extents[i].length / hfsmp->blockSize; + (void) add_free_extent_cache(hfsmp, startBlock, numBlocks); } -#endif + + if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_CALLBACK | DBG_FUNC_END, 0, 0, 0, 0, 0); } -__private_extern__ +/* + ;________________________________________________________________________________ + ; + ; Routine: BlockAllocate + ; + ; Function: Allocate space on a volume. If contiguous allocation is requested, + ; at least the requested number of bytes will be allocated or an + ; error will be returned.
If contiguous allocation is not forced, + ; the space will be allocated with the first largest extent available + ; at the requested starting allocation block. If there is not enough + ; room there, a block allocation of less than the requested size will be + ; allocated. + ; + ; If the requested starting block is 0 (for new file allocations), + ; the volume's allocation block pointer will be used as a starting + ; point. + ; + ; Input Arguments: + ; vcb - Pointer to ExtendedVCB for the volume to allocate space on + ; fcb - Pointer to FCB for the file for which storage is being allocated + ; startingBlock - Preferred starting allocation block, 0 = no preference + ; minBlocks - Number of blocks requested. If the allocation is non-contiguous, + ; less than this may actually be allocated + ; maxBlocks - The maximum number of blocks to allocate. If there is additional free + ; space after bytesRequested, then up to maxBlocks bytes should really + ; be allocated. (Used by ExtendFileC to round up allocations to a multiple + ; of the file's clump size.) + ; flags - Flags to specify options like contiguous, use metadata zone, + ; skip free block check, etc. + ; + ; Output: + ; (result) - Error code, zero for successful allocation + ; *startBlock - Actual starting allocation block + ; *actualBlocks - Actual number of allocation blocks allocated + ; + ; Side effects: + ; The volume bitmap is read and updated; the volume bitmap cache may be changed. 
+ ;________________________________________________________________________________ + */ OSErr BlockAllocate ( ExtendedVCB *vcb, /* which volume to allocate space on */ u_int32_t startingBlock, /* preferred starting block, or 0 for no preference */ @@ -332,9 +646,13 @@ OSErr BlockAllocate ( u_int32_t freeBlocks; OSErr err; Boolean updateAllocPtr = false; // true if nextAllocation needs to be updated + struct hfsmount *hfsmp; Boolean useMetaZone; Boolean forceContiguous; + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_BLOCK_ALLOCATE | DBG_FUNC_START, startingBlock, minBlocks, maxBlocks, flags, 0); + if (flags & HFS_ALLOC_FORCECONTIG) { forceContiguous = true; } else { @@ -347,12 +665,27 @@ OSErr BlockAllocate ( useMetaZone = false; } + //TODO: Figure out when we need to re-enable the RB-Tree. + + + //TODO: Make sure we use allocLimit when appropriate. + + /* + * TODO: Update BlockAllocate and its sub-functions to do cooperative allocation and bitmap scanning + * in conjunction with the Generate Tree function. If the red-black tree does not currently contain + * an allocation block of appropriate size, then start scanning blocks FOR the tree generation function until + * we find what we need. We'll update the tree fields when we're done, indicating that we've advanced the + * high water mark for the tree. + */ + // // Initialize outputs in case we get an error // *actualStartBlock = 0; *actualNumBlocks = 0; - freeBlocks = hfs_freeblks(VCBTOHFS(vcb), 0); + hfsmp = VCBTOHFS (vcb); + freeBlocks = hfs_freeblks(hfsmp, 0); + /* Skip free block check if blocks are being allocated for relocating * data during truncating a volume. 
@@ -394,14 +727,19 @@ OSErr BlockAllocate ( // if (startingBlock == 0) { HFS_MOUNT_LOCK(vcb, TRUE); + + /* Sparse Allocation and nextAllocation are both used even if the R/B Tree is on */ if (vcb->hfs_flags & HFS_HAS_SPARSE_DEVICE) { startingBlock = vcb->sparseAllocation; - } else { + } + else { startingBlock = vcb->nextAllocation; } HFS_MOUNT_UNLOCK(vcb, TRUE); updateAllocPtr = true; } + + if (startingBlock >= vcb->allocLimit) { startingBlock = 0; /* overflow so start at beginning */ } @@ -414,33 +752,91 @@ OSErr BlockAllocate ( err = BlockAllocateContig(vcb, startingBlock, minBlocks, maxBlocks, useMetaZone, actualStartBlock, actualNumBlocks); /* - * If we allocated from a new position then - * also update the roving allocator. + * If we allocated from a new position then also update the roving allocator. + * This will keep the roving allocation pointer up-to-date even + * if we are using the new R/B tree allocator, since + * it doesn't matter to us here, how the underlying allocator found + * the block to vend out. */ if ((err == noErr) && (*actualStartBlock > startingBlock) && ((*actualStartBlock < VCBTOHFS(vcb)->hfs_metazone_start) || (*actualStartBlock > VCBTOHFS(vcb)->hfs_metazone_end))) { - updateAllocPtr = true; } } else { +#if CONFIG_HFS_ALLOC_RBTREE + /* + * If the RB-Tree Allocator is live, just go straight for a + * BlockAllocateAny call and return the result. Otherwise, + * resort to the bitmap scanner. + */ + if (hfs_isrbtree_active(VCBTOHFS(vcb))) { + /* Start by trying to allocate from the starting block forward */ + err = BlockAllocateAny(vcb, startingBlock, vcb->allocLimit, + maxBlocks, useMetaZone, actualStartBlock, + actualNumBlocks); + + /* + * Because the RB-Tree is live, the previous call to BlockAllocateAny + * will use the rbtree variant. As a result, it will automatically search the + * metadata zone for a valid extent if needed. If we get a return value of + * noErr, we found a valid extent and we can skip to the end. 
If the error indicates + * the disk is full, that's an equally valid return code and we can skip to the end, too. + */ + if (err == noErr || err == dskFulErr) { + goto Exit; + } + else { + //TODO: only tear down tree if the tree is finished building. + //Make sure to handle the ENOSPC condition properly. We shouldn't error out in that case. + /* Tear down tree if we encounter an error */ + if (hfsmp->extent_tree_flags & HFS_ALLOC_RB_ACTIVE) { + hfsmp->extent_tree_flags |= HFS_ALLOC_RB_ERRORED; + DestroyTrees(hfsmp); + ResetVCBFreeExtCache(hfsmp); + } + else { + goto Exit; + } + // fall through to the normal allocation since the rb-tree allocation failed. + } + } +#endif + /* * Scan the bitmap once, gather the N largest free extents, then * allocate from these largest extents. Repeat as needed until * we get all the space we needed. We could probably build up * that list when the higher level caller tried (and failed) a * contiguous allocation first. + * + * Note that the free-extent cache will cease to be updated if + * we are using the red-black tree for allocations. If we jettison + * the tree, then we will reset the free-extent cache and start over. */ + err = BlockAllocateKnown(vcb, maxBlocks, actualStartBlock, actualNumBlocks); - if (err == dskFulErr) + /* dskFulErr out of BlockAllocateKnown indicates an empty Free Extent Cache */ + + if (err == dskFulErr) { + /* + * Now we have to do a bigger scan. Start at startingBlock and go up until the + * allocation limit. + */ err = BlockAllocateAny(vcb, startingBlock, vcb->allocLimit, maxBlocks, useMetaZone, actualStartBlock, actualNumBlocks); - if (err == dskFulErr) + } + if (err == dskFulErr) { + /* + * We may be out of space in the normal zone; go up to the starting block from + * the start of the volume.
+ */ err = BlockAllocateAny(vcb, 1, startingBlock, maxBlocks, useMetaZone, actualStartBlock, actualNumBlocks); + } } Exit: @@ -450,8 +846,6 @@ OSErr BlockAllocate ( // still need to update things like the free block count). // if (*actualNumBlocks != 0) { - int i,j; - // // If we used the volume's roving allocation pointer, then we need to update it. // Adding in the length of the current allocation might reduce the next allocate @@ -462,41 +856,24 @@ OSErr BlockAllocate ( // HFS_MOUNT_LOCK(vcb, TRUE); + lck_spin_lock(&hfsmp->vcbFreeExtLock); if (vcb->vcbFreeExtCnt == 0 && vcb->hfs_freed_block_count == 0) { vcb->sparseAllocation = *actualStartBlock; } + lck_spin_unlock(&hfsmp->vcbFreeExtLock); if (*actualNumBlocks < vcb->hfs_freed_block_count) { vcb->hfs_freed_block_count -= *actualNumBlocks; } else { vcb->hfs_freed_block_count = 0; } - + if (updateAllocPtr && - ((*actualStartBlock < VCBTOHFS(vcb)->hfs_metazone_start) || - (*actualStartBlock > VCBTOHFS(vcb)->hfs_metazone_end))) { + ((*actualStartBlock < VCBTOHFS(vcb)->hfs_metazone_start) || + (*actualStartBlock > VCBTOHFS(vcb)->hfs_metazone_end))) { HFS_UPDATE_NEXT_ALLOCATION(vcb, *actualStartBlock); } - for(i=0; i < (int)vcb->vcbFreeExtCnt; i++) { - u_int32_t start, end; - - start = vcb->vcbFreeExt[i].startBlock; - end = start + vcb->vcbFreeExt[i].blockCount; - - if ( (*actualStartBlock >= start && *actualStartBlock < end) - || ((*actualStartBlock + *actualNumBlocks) > start && *actualStartBlock < start)) { - - for(j=i; j < (int)vcb->vcbFreeExtCnt-1; j++) { - vcb->vcbFreeExt[j] = vcb->vcbFreeExt[j+1]; - } - - vcb->vcbFreeExtCnt--; - i--; // so we'll check the guy we just copied down... 
- - // keep looping because we may have invalidated more - // than one entry in the array - } - } + (void) remove_free_extent_cache(hfsmp, *actualStartBlock, *actualNumBlocks); /* * Update the number of free blocks on the volume @@ -510,11 +887,31 @@ OSErr BlockAllocate ( MarkVCBDirty(vcb); HFS_MOUNT_UNLOCK(vcb, TRUE); - sanity_check_free_ext(vcb, 1); - hfs_generate_volume_notifications(VCBTOHFS(vcb)); } + if (ALLOC_DEBUG) { + if (err == noErr) { + if (*actualStartBlock >= hfsmp->totalBlocks) { + panic ("BlockAllocate: vending invalid blocks!"); + } + if (*actualStartBlock >= hfsmp->allocLimit) { + panic ("BlockAllocate: vending block past allocLimit!"); + } + + if ((*actualStartBlock + *actualNumBlocks) >= hfsmp->totalBlocks) { + panic ("BlockAllocate: vending too many invalid blocks!"); + } + + if ((*actualStartBlock + *actualNumBlocks) >= hfsmp->allocLimit) { + panic ("BlockAllocate: vending too many invalid blocks past allocLimit!"); + } + } + } + + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_BLOCK_ALLOCATE | DBG_FUNC_END, err, *actualStartBlock, *actualNumBlocks, 0, 0); + return err; } @@ -522,7 +919,7 @@ OSErr BlockAllocate ( /* ;________________________________________________________________________________ ; -; Routine: BlkDealloc +; Routine: BlockDeallocate ; ; Function: Update the bitmap to deallocate a run of disk allocation blocks ; @@ -536,10 +933,10 @@ OSErr BlockAllocate ( ; ; Side effects: ; The volume bitmap is read and updated; the volume bitmap cache may be changed. +; The Allocator's red-black trees may also be modified as a result. 
;________________________________________________________________________________ */ -__private_extern__ OSErr BlockDeallocate ( ExtendedVCB *vcb, // Which volume to deallocate space on u_int32_t firstBlock, // First block in range to deallocate @@ -547,8 +944,12 @@ OSErr BlockDeallocate ( u_int32_t flags) { OSErr err; - u_int32_t tempWord; + struct hfsmount *hfsmp; + hfsmp = VCBTOHFS(vcb); + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_BLOCK_DEALLOCATE | DBG_FUNC_START, firstBlock, numBlocks, flags, 0, 0); + // // If no blocks to deallocate, then exit early // @@ -556,11 +957,51 @@ OSErr BlockDeallocate ( err = noErr; goto Exit; } + + + if (ALLOC_DEBUG) { + if (firstBlock >= hfsmp->totalBlocks) { + panic ("BlockDeallocate: freeing invalid blocks!"); + } + + if ((firstBlock + numBlocks) >= hfsmp->totalBlocks) { + panic ("BlockDeallocate: freeing too many invalid blocks!"); + } + } + + + + + /* + * If we're using the red-black tree code, then try to free the + * blocks by marking them in the red-black tree first. If the tree + * is not active for whatever reason (or we're not using the + * R/B Tree code at all), then go straight for the BlockMarkFree + * function. + * + * Remember that we can get into this function if the tree isn't finished + * building. In that case, check to see if the block we're de-allocating is + * past the high watermark + */ +#if CONFIG_HFS_ALLOC_RBTREE + if (hfs_isrbtree_active(VCBTOHFS(vcb))) { + /* + * BlockMarkFreeRBTree deals with the case where we are resizing the + * filesystem (shrinking), and we need to manipulate the bitmap beyond the portion + * that is currenly controlled by the r/b tree. + */ + + //TODO: Update multiplexing code for the half-finished case. 
+ err = BlockMarkFreeRBTree(vcb, firstBlock, numBlocks); + adjustFreeExtCache = 0; + } + else { + err = BlockMarkFreeInternal(vcb, firstBlock, numBlocks, true); + } - // - // Call internal routine to free the sequence of blocks - // - err = BlockMarkFree(vcb, firstBlock, numBlocks); +#else + err = BlockMarkFreeInternal(vcb, firstBlock, numBlocks, true); +#endif if (err) goto Exit; @@ -578,75 +1019,39 @@ OSErr BlockDeallocate ( } vcb->hfs_freed_block_count += numBlocks; - if (firstBlock < vcb->sparseAllocation) { - vcb->sparseAllocation = firstBlock; - } if (vcb->nextAllocation == (firstBlock + numBlocks)) { HFS_UPDATE_NEXT_ALLOCATION(vcb, (vcb->nextAllocation - numBlocks)); } - if (free_extent_cache_active(vcb) == 0) { - goto skip_cache; - } - - tempWord = vcb->vcbFreeExtCnt; - // Add this free chunk to the free extent list - if (vcb->hfs_flags & HFS_HAS_SPARSE_DEVICE) { - // Sorted by start block - if (tempWord == kMaxFreeExtents && vcb->vcbFreeExt[kMaxFreeExtents-1].startBlock > firstBlock) - --tempWord; - if (tempWord < kMaxFreeExtents) - { - // We're going to add this extent. Bubble any smaller extents down in the list. - while (tempWord && vcb->vcbFreeExt[tempWord-1].startBlock > firstBlock) - { - vcb->vcbFreeExt[tempWord] = vcb->vcbFreeExt[tempWord-1]; - if (vcb->vcbFreeExt[tempWord].startBlock < vcb->sparseAllocation) { - vcb->sparseAllocation = vcb->vcbFreeExt[tempWord].startBlock; - } - --tempWord; - } - vcb->vcbFreeExt[tempWord].startBlock = firstBlock; - vcb->vcbFreeExt[tempWord].blockCount = numBlocks; - - if (vcb->vcbFreeExtCnt < kMaxFreeExtents) { - ++vcb->vcbFreeExtCnt; - } - } - } else { - // Sorted by num blocks - if (tempWord == kMaxFreeExtents && vcb->vcbFreeExt[kMaxFreeExtents-1].blockCount < numBlocks) - --tempWord; - if (tempWord < kMaxFreeExtents) - { - // We're going to add this extent. Bubble any smaller extents down in the list. 
- while (tempWord && vcb->vcbFreeExt[tempWord-1].blockCount < numBlocks) - { - vcb->vcbFreeExt[tempWord] = vcb->vcbFreeExt[tempWord-1]; - if (vcb->vcbFreeExt[tempWord].startBlock < vcb->sparseAllocation) { - vcb->sparseAllocation = vcb->vcbFreeExt[tempWord].startBlock; - } - --tempWord; - } - vcb->vcbFreeExt[tempWord].startBlock = firstBlock; - vcb->vcbFreeExt[tempWord].blockCount = numBlocks; - - if (vcb->vcbFreeExtCnt < kMaxFreeExtents) { - ++vcb->vcbFreeExtCnt; - } + if (hfsmp->jnl == NULL) { + /* + * In the journal case, we'll add the free extent once the journal + * calls us back to tell us it wrote the transaction to disk. + */ + (void) add_free_extent_cache(vcb, firstBlock, numBlocks); + + /* + * If the journal case, we'll only update sparseAllocation once the + * free extent cache becomes empty (when we remove the last entry + * from the cache). Skipping it here means we're less likely to + * find a recently freed extent via the bitmap before it gets added + * to the free extent cache. 
+ */ + if (firstBlock < vcb->sparseAllocation) { + vcb->sparseAllocation = firstBlock; } } - -skip_cache: + MarkVCBDirty(vcb); HFS_MOUNT_UNLOCK(vcb, TRUE); - sanity_check_free_ext(vcb, 1); - hfs_generate_volume_notifications(VCBTOHFS(vcb)); Exit: + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_BLOCK_DEALLOCATE | DBG_FUNC_END, err, 0, 0, 0, 0); + return err; } @@ -656,7 +1061,6 @@ u_int8_t freebitcount[16] = { 3, 2, 2, 1, 2, 1, 1, 0, /* 8 9 A B C D E F */ }; -__private_extern__ u_int32_t MetaZoneFreeBlocks(ExtendedVCB *vcb) { @@ -763,6 +1167,9 @@ static OSErr ReadBitmapBlock( daddr64_t block; u_int32_t blockSize; + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_READ_BITMAP_BLOCK | DBG_FUNC_START, bit, 0, 0, 0, 0); + /* * volume bitmap blocks are protected by the allocation file lock */ @@ -792,6 +1199,9 @@ static OSErr ReadBitmapBlock( } } + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_READ_BITMAP_BLOCK | DBG_FUNC_END, err, 0, 0, 0, 0); + return err; } @@ -816,6 +1226,9 @@ static OSErr ReleaseBitmapBlock( { struct buf *bp = (struct buf *)blockRef; + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_RELEASE_BITMAP_BLOCK | DBG_FUNC_START, dirty, 0, 0, 0, 0); + if (blockRef == 0) { if (dirty) panic("hfs: ReleaseBitmapBlock: missing bp"); @@ -837,9 +1250,42 @@ static OSErr ReleaseBitmapBlock( } } + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_RELEASE_BITMAP_BLOCK | DBG_FUNC_END, 0, 0, 0, 0, 0); + + return (0); +} + +#if CONFIG_HFS_ALLOC_RBTREE +/* + * ReleaseRBScanBitmapBlock is used to release struct bufs that were + * created for use by the Red-Black tree generation code. We want to force + * them to be purged out of the buffer cache ASAP, so we'll release them differently + * than in the ReleaseBitmapBlock case. 
Alternately, we know that we're only reading + * the blocks, so we will never dirty them as part of the tree building scan. + */ + +static OSErr ReleaseRBScanBitmapBlock(struct buf *bp ) { + + if (bp == NULL) { + return (0); + } + + if (bp) { + /* Mark the buffer invalid if it isn't locked, then release it */ + if ((buf_flags(bp) & B_LOCKED) == 0) { + buf_markinvalid(bp); + } + buf_brelse(bp); + } + return (0); + + } +#endif + /* _______________________________________________________________________ @@ -872,21 +1318,48 @@ static OSErr BlockAllocateContig( u_int32_t *actualStartBlock, u_int32_t *actualNumBlocks) { - OSErr err; - // - // Find a contiguous group of blocks at least minBlocks long. - // Determine the number of contiguous blocks available (up - // to maxBlocks). - // +#if CONFIG_HFS_ALLOC_RBTREE + if (hfs_isrbtree_active(VCBTOHFS(vcb))) { + return BlockAllocateContigRBTree(vcb, startingBlock, minBlocks, maxBlocks, useMetaZone, + actualStartBlock, actualNumBlocks, 1); + } +#endif + return BlockAllocateContigBitmap(vcb, startingBlock, minBlocks, + maxBlocks, useMetaZone, actualStartBlock, actualNumBlocks); +} - /* - * NOTE: If the only contiguous free extent of at least minBlocks - * crosses startingBlock (i.e. starts before, ends after), then we - * won't find it. Earlier versions *did* find this case by letting - * the second search look past startingBlock by minBlocks. But - * with the free extent cache, this can lead to duplicate entries - * in the cache, causing the same blocks to be allocated twice. 
+/* + * Variant of BlockAllocateContig that uses the original bitmap-searching logic + */ + +static OSErr BlockAllocateContigBitmap( + ExtendedVCB *vcb, + u_int32_t startingBlock, + u_int32_t minBlocks, + u_int32_t maxBlocks, + Boolean useMetaZone, + u_int32_t *actualStartBlock, + u_int32_t *actualNumBlocks) +{ + OSErr err; + + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_ALLOC_CONTIG_BITMAP | DBG_FUNC_START, startingBlock, minBlocks, maxBlocks, useMetaZone, 0); + + // + // Find a contiguous group of blocks at least minBlocks long. + // Determine the number of contiguous blocks available (up + // to maxBlocks). + // + + /* + * NOTE: If the only contiguous free extent of at least minBlocks + * crosses startingBlock (i.e. starts before, ends after), then we + * won't find it. Earlier versions *did* find this case by letting + * the second search look past startingBlock by minBlocks. But + * with the free extent cache, this can lead to duplicate entries + * in the cache, causing the same blocks to be allocated twice. */ err = BlockFindContiguous(vcb, startingBlock, vcb->allocLimit, minBlocks, maxBlocks, useMetaZone, actualStartBlock, actualNumBlocks); @@ -902,10 +1375,234 @@ static OSErr BlockAllocateContig( // Now mark those blocks allocated. // if (err == noErr) - err = BlockMarkAllocated(vcb, *actualStartBlock, *actualNumBlocks); + err = BlockMarkAllocatedInternal(vcb, *actualStartBlock, *actualNumBlocks); + + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_ALLOC_CONTIG_BITMAP | DBG_FUNC_END, err, *actualStartBlock, *actualNumBlocks, 0, 0); + + return err; +} + +#if CONFIG_HFS_ALLOC_RBTREE +/* + * Variant of BlockAllocateContig that uses the newer red-black tree library + * in order to manage free space extents. This will search the red-black tree + * and return results in the same fashion as BlockAllocateContigBitmap. 
+ * + * Note that this function is invoked from both the red-black tree variant of BlockAllocateAny + * as well as BlockAllocateContig. In order to determine when we should vend contiguous chunks over + * locality-based-searches, we use the forceContig argument to determine who called us. + */ + +static OSErr BlockAllocateContigRBTree( + ExtendedVCB *vcb, + u_int32_t startingBlock, + u_int32_t minBlocks, + u_int32_t maxBlocks, + Boolean useMetaZone, + u_int32_t *actualStartBlock, + u_int32_t *actualNumBlocks, + u_int32_t forceContig) +{ + OSErr err; + struct hfsmount *hfsmp = VCBTOHFS(vcb); + extent_node_t search_sentinel; + extent_node_t *node = NULL; + extent_node_t tempnode; + + bzero (&tempnode, sizeof(extent_node_t)); + + /* Begin search at the end of the file, via startingBlock */ + memset (&search_sentinel, 0, sizeof(extent_node_t)); + search_sentinel.offset = startingBlock; + + *actualStartBlock = 0; + *actualNumBlocks = 0; + + /* + * Find the first available extent that satisfies the allocation by searching + * from the starting point and moving forward + */ + node = extent_tree_off_search_next(&hfsmp->offset_tree, &search_sentinel); + + if (node) { + *actualStartBlock = node->offset; + *actualNumBlocks = node->length; + } + + /* If we managed to grab at least minBlocks of space, then we're done.
*/ + + if (*actualNumBlocks >= minBlocks) { + if (*actualNumBlocks > maxBlocks) { + *actualNumBlocks = maxBlocks; + } + + + /* Check to see if blocks are already marked as in-use */ + if (ALLOC_DEBUG) { + REQUIRE_FILE_LOCK(vcb->hfs_allocation_vp, false); + if (hfs_isallocated(hfsmp, *actualStartBlock, *actualNumBlocks)) { + printf("bad node: %p, offset %d, length %d\n", node, node->offset,node->length); + panic ("HFS RBTree Allocator: Blocks starting @ %x for %x blocks in use already\n", + *actualStartBlock, *actualNumBlocks); + } + } + + /* + * BlockMarkAllocatedRBTree is responsible for removing the nodes + * from the red-black tree after the bitmap has been updated on-disk. + */ + err = BlockMarkAllocatedRBTree(vcb, *actualStartBlock, *actualNumBlocks); + if (err == noErr) { + + if ( ALLOC_DEBUG ) { + REQUIRE_FILE_LOCK(vcb->hfs_allocation_vp, false); + if (!hfs_isallocated(hfsmp, *actualStartBlock, *actualNumBlocks)) { + panic ("HFS RBTree Allocator: Blocks starting @ %x for %x blocks not in use yet\n", + *actualStartBlock, *actualNumBlocks); + } + check_rbtree_extents (VCBTOHFS(vcb), *actualStartBlock, *actualNumBlocks, ASSERT_ALLOC); + } + + return err; + } + } + + /* + * We may have failed to grow at the end of the file. We'll try to find + * appropriate free extents, searching by size in the normal allocation zone. + * + * However, if we're allocating on behalf of a sparse device that hasn't explicitly + * requested a contiguous chunk, then we try to search by offset, even if it + * means fragmenting the file. We want all available entries starting + * from the front of the disk to avoid creating new bandfiles. As a result, + * we'll start by searching the offset tree rather than the normal length + * tree. Note that this function can be invoked from BlockAllocateAny, in + * which the minimum block size is 1 block, making it easy to succeed. 
+ */ + search_sentinel.offset = hfsmp->hfs_metazone_end; + search_sentinel.length = minBlocks; + + if ((vcb->hfs_flags & HFS_HAS_SPARSE_DEVICE) && (forceContig == 0)) { + /* just start with the first offset node */ + node = extent_tree_off_search_next(&hfsmp->offset_tree, &search_sentinel); + } + else { + /* + * Otherwise, start from the end of the metadata zone or our next allocation pointer, + * and try to find the first chunk of size >= min. + */ + node = extent_tree_off_search_nextWithSize (&hfsmp->offset_tree, &search_sentinel); + + if (node == NULL) { + extent_node_t *metaend_node; + /* + * Maybe there's a free extent coalesced with the space still in the metadata + * zone. If there is, find it and allocate from the middle of it, starting at + * the end of the metadata zone. + * + * If search_prev yields a result that is not offset == metazone_end, then that + * means no node existed at that offset. If the previous node's offset + length crosses + * the metazone boundary, then allocate from there. If it is too small to + * cross the metazone boundary, then it is of no importance and we'd have to + * report ENOSPC. + */ + metaend_node = extent_tree_off_search_prev(&hfsmp->offset_tree, &search_sentinel); + + if ((metaend_node) && (metaend_node->offset < hfsmp->hfs_metazone_end)) { + u_int32_t node_end = metaend_node->offset + metaend_node->length; + if (node_end > hfsmp->hfs_metazone_end) { + u_int32_t modified_length = node_end - hfsmp->hfs_metazone_end; + if (modified_length >= minBlocks) { + /* + * Then we can allocate it. Fill in the contents into tempnode, + * and BlockMarkAllocatedRBTree below will take care of the rest. + */ + tempnode.offset = hfsmp->hfs_metazone_end; + tempnode.length = MIN(minBlocks, node_end - tempnode.offset); + node = &tempnode; + } + } + } + } + } + + /* If we can't find anything useful, search the metadata zone as a last resort. 
*/ + + if ((!node) && useMetaZone) { + search_sentinel.offset = 0; + search_sentinel.length = minBlocks; + node = extent_tree_off_search_nextWithSize (&hfsmp->offset_tree, &search_sentinel); + } + + /* If we found something useful, then go ahead and update the bitmap */ + if ((node) && (node->length >= minBlocks)) { + *actualStartBlock = node->offset; + if (node->length >= maxBlocks) { + *actualNumBlocks = maxBlocks; + } + else { + *actualNumBlocks = node->length; + } + + err = BlockMarkAllocatedRBTree(vcb, *actualStartBlock, *actualNumBlocks); + + if (err == noErr) { + if ( ALLOC_DEBUG ) { + REQUIRE_FILE_LOCK(vcb->hfs_allocation_vp, false); + if (!hfs_isallocated(hfsmp, *actualStartBlock, *actualNumBlocks)) { + panic ("HFS RBTree Allocator: Blocks starting @ %x for %x blocks not in use yet\n", + *actualStartBlock, *actualNumBlocks); + } + check_rbtree_extents (VCBTOHFS(vcb), *actualStartBlock, *actualNumBlocks, ASSERT_ALLOC); + } + } + } + else { + int destroy_trees = 0; + /* + * TODO: Add High-water mark check here. If we couldn't find anything useful, + * when do we tear down the tree? Or should the logic be in BlockAllocateContig?? + */ + if (destroy_trees) { + DestroyTrees(VCBTOHFS(vcb)); + /* Reset the Free Ext Cache since we'll be using it now. */ + ResetVCBFreeExtCache(VCBTOHFS(vcb)); + } + + if (ALLOC_DEBUG) { + printf("HFS allocator: No space on FS (%s). Node %p Start %d Min %d, Max %d, Tree still alive.\n", + hfsmp->vcbVN, node, startingBlock, minBlocks, maxBlocks); + + /* Dump the list ? */ + extent_tree_offset_print(&hfsmp->offset_tree); + + printf("HFS allocator: Done printing list on FS (%s). 
Min %d, Max %d, Tree still alive.\n", + hfsmp->vcbVN, minBlocks, maxBlocks); + + + + } + err = dskFulErr; + } + + if (err == noErr) { + if (ALLOC_DEBUG) { + if ((*actualStartBlock + *actualNumBlocks) > vcb->allocLimit) + panic("hfs: BlockAllocateAny: allocation overflow on \"%s\"", vcb->vcbVN); + } + } + else { + *actualStartBlock = 0; + *actualNumBlocks = 0; + } return err; + } +#endif + + /* _______________________________________________________________________ @@ -929,6 +1626,12 @@ Function: Allocate one or more allocation blocks. If there are fewer actualNumBlocks Number of blocks allocated, or 0 if error _______________________________________________________________________ */ + +/* + * BlockAllocateAny acts as a multiplexer between BlockAllocateAnyRBTree + * and BlockAllocateAnyBitmap, which uses the bitmap scanning logic. + */ + static OSErr BlockAllocateAny( ExtendedVCB *vcb, u_int32_t startingBlock, @@ -938,6 +1641,60 @@ static OSErr BlockAllocateAny( u_int32_t *actualStartBlock, u_int32_t *actualNumBlocks) { + +#if CONFIG_HFS_ALLOC_RBTREE + if (hfs_isrbtree_active(VCBTOHFS(vcb))) { + return BlockAllocateAnyRBTree(vcb, startingBlock, maxBlocks, useMetaZone, actualStartBlock, actualNumBlocks); + } +#endif + return BlockAllocateAnyBitmap(vcb, startingBlock, endingBlock, maxBlocks, useMetaZone, actualStartBlock, actualNumBlocks); + +} + + +#if CONFIG_HFS_ALLOC_RBTREE +/* + * BlockAllocateAnyRBTree finds one or more allocation blocks by using + * the red-black allocation tree to figure out where the free ranges are. + * This function is typically used as a last resort because we were unable to + * find the right ranges. Outputs are the same as BlockAllocateAnyBitmap.
+ */ +static OSErr BlockAllocateAnyRBTree( + ExtendedVCB *vcb, + u_int32_t startingBlock, + u_int32_t maxBlocks, + Boolean useMetaZone, + u_int32_t *actualStartBlock, + u_int32_t *actualNumBlocks) +{ + OSErr err; + + /* + * BlockAllocateContig + */ + /* If we're using the red-black tree, try searching at the specified offsets. */ + err = BlockAllocateContigRBTree(vcb, startingBlock, 1, maxBlocks, useMetaZone, + actualStartBlock, actualNumBlocks, 0); + return err; + +} +#endif + +/* + * BlockAllocateAnyBitmap finds free ranges by scanning the bitmap to figure out + * where the free allocation blocks are. Inputs and outputs are the same as for + * BlockAllocateAny and BlockAllocateAnyRBTree + */ + +static OSErr BlockAllocateAnyBitmap( + ExtendedVCB *vcb, + u_int32_t startingBlock, + register u_int32_t endingBlock, + u_int32_t maxBlocks, + Boolean useMetaZone, + u_int32_t *actualStartBlock, + u_int32_t *actualNumBlocks) +{ OSErr err; register u_int32_t block; // current block number register u_int32_t currentWord; // Pointer to current word within bitmap block @@ -951,6 +1708,9 @@ static OSErr BlockAllocateAny( Boolean dirty = false; struct hfsmount *hfsmp = VCBTOHFS(vcb); + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_ALLOC_ANY_BITMAP | DBG_FUNC_START, startingBlock, endingBlock, maxBlocks, useMetaZone, 0); + /* * When we're skipping the metadata zone and the start/end * range overlaps with the metadata zone then adjust the @@ -1128,11 +1888,19 @@ static OSErr BlockAllocateAny( if ((*actualStartBlock + *actualNumBlocks) > vcb->allocLimit) { panic("hfs: BlockAllocateAny: allocation overflow on \"%s\"", vcb->vcbVN); } - - /* Remove these blocks from the TRIM list if applicable */ - if (CONFIG_HFS_TRIM) { - hfs_unmap_alloc_extent(vcb, *actualStartBlock, *actualNumBlocks); - } + + /* + * Beware! 
+ * Because this function directly manipulates the bitmap to mark the + * blocks it came across as allocated, we must inform the journal (and + * subsequently, the journal's trim list) that we are allocating these + * blocks, just like in BlockMarkAllocatedInternal. hfs_unmap_alloc_extent + * and the functions it calls will serialize behind the journal trim list lock + * to ensure that either the asynchronous flush/TRIM/UNMAP happens prior to + * us manipulating the trim list, or we get there first and successfully remove + * these bitmap blocks before the TRIM happens. + */ + hfs_unmap_alloc_extent (vcb, *actualStartBlock, *actualNumBlocks); } else { *actualStartBlock = 0; @@ -1142,6 +1910,9 @@ static OSErr BlockAllocateAny( if (currCache) (void) ReleaseBitmapBlock(vcb, blockRef, dirty); + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_ALLOC_ANY_BITMAP | DBG_FUNC_END, err, *actualStartBlock, *actualNumBlocks, 0, 0); + return err; } @@ -1178,15 +1949,25 @@ static OSErr BlockAllocateKnown( u_int32_t foundBlocks; u_int32_t newStartBlock, newBlockCount; + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_ALLOC_KNOWN_BITMAP | DBG_FUNC_START, 0, 0, maxBlocks, 0, 0); + HFS_MOUNT_LOCK(vcb, TRUE); - if (free_extent_cache_active(vcb) == 0 || - vcb->vcbFreeExtCnt == 0 || + lck_spin_lock(&vcb->vcbFreeExtLock); + if ((hfs_isrbtree_active(vcb) == true) || + vcb->vcbFreeExtCnt == 0 || vcb->vcbFreeExt[0].blockCount == 0) { + lck_spin_unlock(&vcb->vcbFreeExtLock); HFS_MOUNT_UNLOCK(vcb, TRUE); + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_ALLOC_KNOWN_BITMAP | DBG_FUNC_END, dskFulErr, *actualStartBlock, *actualNumBlocks, 0, 0); return dskFulErr; } + lck_spin_unlock(&vcb->vcbFreeExtLock); HFS_MOUNT_UNLOCK(vcb, TRUE); + lck_spin_lock(&vcb->vcbFreeExtLock); + // Just grab up to maxBlocks of the first (largest) free exent. 
*actualStartBlock = vcb->vcbFreeExt[0].startBlock; foundBlocks = vcb->vcbFreeExt[0].blockCount; @@ -1246,6 +2027,7 @@ static OSErr BlockAllocateKnown( } done: + lck_spin_unlock(&vcb->vcbFreeExtLock); // sanity check if ((*actualStartBlock + *actualNumBlocks) > vcb->allocLimit) { @@ -1260,24 +2042,83 @@ static OSErr BlockAllocateKnown( // // Now mark the found extent in the bitmap // - err = BlockMarkAllocated(vcb, *actualStartBlock, *actualNumBlocks); + err = BlockMarkAllocatedInternal(vcb, *actualStartBlock, *actualNumBlocks); } - sanity_check_free_ext(vcb, 1); + sanity_check_free_ext(vcb, 0); + + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_ALLOC_KNOWN_BITMAP | DBG_FUNC_END, err, *actualStartBlock, *actualNumBlocks, 0, 0); return err; } +/* + * BlockMarkAllocated + * + * This is a wrapper function around the internal calls which will actually mark the blocks + * as in-use. It will mark the blocks in the red-black tree if appropriate. We need to do + * this logic here to avoid callers having to deal with whether or not the red-black tree + * is enabled. + */ + + +OSErr BlockMarkAllocated( + ExtendedVCB *vcb, + u_int32_t startingBlock, + register u_int32_t numBlocks) +{ + struct hfsmount *hfsmp; + + hfsmp = VCBTOHFS(vcb); +#if CONFIG_HFS_ALLOC_RBTREE + if (hfs_isrbtree_active(hfsmp)) { + int err; + + if ((startingBlock >= hfsmp->offset_block_end) && + (hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS)) { + /* + * We're manipulating a portion of the bitmap that is not controlled by the + * red-black tree. 
Just update the bitmap and don't bother manipulating the tree + */ + goto justbitmap; + } + + err = BlockMarkAllocatedRBTree(vcb, startingBlock, numBlocks); + if (err == noErr) { + if ( ALLOC_DEBUG ) { + REQUIRE_FILE_LOCK(hfsmp->hfs_allocation_vp, false); + if (!hfs_isallocated(hfsmp, startingBlock, numBlocks)) { + panic ("HFS RBTree Allocator: Blocks starting @ %x for %x blocks not in use yet\n", + startingBlock, numBlocks); + } + check_rbtree_extents (hfsmp, startingBlock, numBlocks, ASSERT_ALLOC); + } + } + return err; + + } +justbitmap: +#endif + + return BlockMarkAllocatedInternal(vcb, startingBlock, numBlocks); + +} + /* _______________________________________________________________________ -Routine: BlockMarkAllocated +Routine: BlockMarkAllocatedInternal Function: Mark a contiguous group of blocks as allocated (set in the bitmap). It assumes those bits are currently marked - deallocated (clear in the bitmap). + deallocated (clear in the bitmap). Note that this function + must be called regardless of whether or not the bitmap or + tree-based allocator is used, as all allocations must correctly + be marked on-disk. If the tree-based approach is running, then + this will be done before the node is removed from the tree. 
Inputs: vcb Pointer to volume where space is to be allocated @@ -1285,8 +2126,8 @@ Function: Mark a contiguous group of blocks as allocated (set in the numBlocks Number of blocks to mark as allocated _______________________________________________________________________ */ -__private_extern__ -OSErr BlockMarkAllocated( +static +OSErr BlockMarkAllocatedInternal ( ExtendedVCB *vcb, u_int32_t startingBlock, register u_int32_t numBlocks) @@ -1304,9 +2145,10 @@ OSErr BlockMarkAllocated( // XXXdbg struct hfsmount *hfsmp = VCBTOHFS(vcb); - if (CONFIG_HFS_TRIM) { - hfs_unmap_alloc_extent(vcb, startingBlock, numBlocks); - } + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_MARK_ALLOC_BITMAP | DBG_FUNC_START, startingBlock, numBlocks, 0, 0, 0); + + hfs_unmap_alloc_extent(vcb, startingBlock, numBlocks); // // Pre-read the bitmap block containing the first word of allocation @@ -1349,7 +2191,7 @@ OSErr BlockMarkAllocated( } #if DEBUG_BUILD if ((*currentWord & SWAP_BE32 (bitMask)) != 0) { - panic("hfs: BlockMarkAllocated: blocks already allocated!"); + panic("hfs: BlockMarkAllocatedInternal: blocks already allocated!"); } #endif *currentWord |= SWAP_BE32 (bitMask); // set the bits in the bitmap @@ -1387,7 +2229,7 @@ OSErr BlockMarkAllocated( } #if DEBUG_BUILD if (*currentWord != 0) { - panic("hfs: BlockMarkAllocated: blocks already allocated!"); + panic("hfs: BlockMarkAllocatedInternal: blocks already allocated!"); } #endif *currentWord = SWAP_BE32 (bitMask); @@ -1425,7 +2267,7 @@ OSErr BlockMarkAllocated( } #if DEBUG_BUILD if ((*currentWord & SWAP_BE32 (bitMask)) != 0) { - panic("hfs: BlockMarkAllocated: blocks already allocated!"); + panic("hfs: BlockMarkAllocatedInternal: blocks already allocated!"); } #endif *currentWord |= SWAP_BE32 (bitMask); // set the bits in the bitmap @@ -1438,73 +2280,322 @@ OSErr BlockMarkAllocated( if (buffer) (void)ReleaseBitmapBlock(vcb, blockRef, true); + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) + 
KERNEL_DEBUG_CONSTANT(HFSDBG_MARK_ALLOC_BITMAP | DBG_FUNC_END, err, 0, 0, 0, 0); + return err; } - +#if CONFIG_HFS_ALLOC_RBTREE /* -_______________________________________________________________________ - -Routine: BlockMarkFree - -Function: Mark a contiguous group of blocks as free (clear in the - bitmap). It assumes those bits are currently marked - allocated (set in the bitmap). + * This is a wrapper function around BlockMarkAllocated. This function is + * called when the RB Tree-based allocator needs to mark a block as in-use. + * This function should take the locks that would not normally be + * necessary for the normal bitmap allocator, and then call the function. Once + * the on-disk data structures are updated properly, then this will remove the + * appropriate node from the tree. + */ -Inputs: - vcb Pointer to volume where space is to be freed - startingBlock First block number to mark as freed - numBlocks Number of blocks to mark as freed -_______________________________________________________________________ -*/ -__private_extern__ -OSErr BlockMarkFree( +static OSErr BlockMarkAllocatedRBTree( ExtendedVCB *vcb, - u_int32_t startingBlock_in, - register u_int32_t numBlocks_in) + u_int32_t startingBlock, + u_int32_t numBlocks) { - OSErr err; - u_int32_t startingBlock = startingBlock_in; - u_int32_t numBlocks = numBlocks_in; - register u_int32_t *currentWord; // Pointer to current word within bitmap block - register u_int32_t wordsLeft; // Number of words left in this bitmap block - register u_int32_t bitMask; // Word with given bits already set (ready to OR in) - u_int32_t firstBit; // Bit index within word of first bit to allocate - u_int32_t numBits; // Number of bits in word to allocate - u_int32_t *buffer = NULL; - uintptr_t blockRef; - u_int32_t bitsPerBlock; - u_int32_t wordsPerBlock; - // XXXdbg - struct hfsmount *hfsmp = VCBTOHFS(vcb); - - /* - * NOTE: We use vcb->totalBlocks instead of vcb->allocLimit because we - * need to be able to free blocks 
being relocated during hfs_truncatefs. - */ - if (startingBlock + numBlocks > vcb->totalBlocks) { - printf ("hfs: BlockMarkFree() trying to free non-existent blocks starting at %u (numBlock=%u) on volume %s\n", startingBlock, numBlocks, vcb->vcbVN); - hfs_mark_volume_inconsistent(vcb); - err = EIO; - goto Exit; - } - - // - // Pre-read the bitmap block containing the first word of allocation - // + OSErr err; + struct hfsmount *hfsmp = VCBTOHFS(vcb); + int rb_err = 0; - err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef); - if (err != noErr) goto Exit; - // XXXdbg - if (hfsmp->jnl) { - journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); + + if (ALLOC_DEBUG) { + REQUIRE_FILE_LOCK(vcb->hfs_allocation_vp, false); + if (hfs_isallocated(hfsmp, startingBlock, numBlocks)) { + panic ("HFS RBTree Allocator: Blocks starting @ %x for %x blocks in use already\n", + startingBlock, numBlocks); + } + check_rbtree_extents (VCBTOHFS(vcb), startingBlock, numBlocks, ASSERT_FREE); } + + err = BlockMarkAllocatedInternal (vcb, startingBlock, numBlocks); + + if (err == noErr) { - // - // Initialize currentWord, and wordsLeft. - // - { - u_int32_t wordIndexInBlock; + if (ALLOC_DEBUG) { + if (!hfs_isallocated(hfsmp, startingBlock, numBlocks)) { + panic ("HFS RBTree Allocator: Blocks starting @ %x for %x blocks not in use yet!\n", + startingBlock, numBlocks); + } + } + + /* + * Mark the blocks in the offset tree. + */ + rb_err = extent_tree_offset_alloc_space(&hfsmp->offset_tree, numBlocks, startingBlock); + if (rb_err) { + if (ALLOC_DEBUG) { + printf("HFS RBTree Allocator: Could not mark blocks as in-use! %d \n", rb_err); + } + + /* + * We may be called from the BlockMarkAllocated interface, in which case, they would + * not be picking extents from their start. Do a check here, find if the specified + * extent is free, and if it is, then find the containing node. 
+ */ + extent_node_t *node = NULL; + extent_node_t search_sentinel; + search_sentinel.offset = startingBlock; + + node = extent_tree_off_search_prev(&hfsmp->offset_tree, &search_sentinel); + + if (node) { + rb_err = extent_tree_offset_alloc_unaligned (&hfsmp->offset_tree, numBlocks, startingBlock); + } + + if (ALLOC_DEBUG) { + if (rb_err) { + printf ("HFS RBTree Allocator: Still Couldn't mark blocks as in-use! %d\n", rb_err); + } + } + } + if (ALLOC_DEBUG) { + check_rbtree_extents (VCBTOHFS(vcb), startingBlock, numBlocks, ASSERT_ALLOC); + } + } + + /* + * If we encountered a red-black tree error, for now, we immediately back off and force + * destruction of rb-tree. Set the persistent error-detected bit in the mount point. + * That will ensure that even if we reach a low-water-mark in the future we will still + * not allow the rb-tree to be used. On next mount, we will force a re-construction from + * on-disk state. As a fallback, we will now resort to the bitmap-scanning behavior. + */ + if (rb_err) { + /* Mark RB-Trees with error */ + hfsmp->extent_tree_flags |= HFS_ALLOC_RB_ERRORED; + DestroyTrees(hfsmp); + /* Reset the Free Ext Cache since we'll be using it now. */ + ResetVCBFreeExtCache(hfsmp); + printf("HFS: Red-Black Allocator Tree BlockMarkAllocated error\n"); + } + + return err; +} +#endif + + + +/* + * BlockMarkFree + * + * This is a wrapper function around the internal calls which will actually mark the blocks + * as freed. It will mark the blocks in the red-black tree if appropriate. We need to do + * this logic here to avoid callers having to deal with whether or not the red-black tree + * is enabled. 
+ * + */ +OSErr BlockMarkFree( + ExtendedVCB *vcb, + u_int32_t startingBlock, + register u_int32_t numBlocks) +{ + struct hfsmount *hfsmp; + hfsmp = VCBTOHFS(vcb); +#if CONFIG_HFS_ALLOC_RBTREE + if (hfs_isrbtree_active(hfsmp)) { + int err; + + if ((startingBlock >= hfsmp->offset_block_end) && + (hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS)) { + /* + * We're manipulating a portion of the bitmap that is not controlled by the + * red-black tree. Just update the bitmap and don't bother manipulating the tree + */ + goto justbitmap; + } + + err = BlockMarkFreeRBTree(vcb, startingBlock, numBlocks); + if (err == noErr) { + if ( ALLOC_DEBUG ) { + REQUIRE_FILE_LOCK(hfsmp->hfs_allocation_vp, false); + if (hfs_isallocated(hfsmp, startingBlock, numBlocks)) { + panic ("HFS RBTree Allocator: Blocks starting @ %x for %x blocks in use!\n", + startingBlock, numBlocks); + } + check_rbtree_extents (hfsmp, startingBlock, numBlocks, ASSERT_FREE); + } + } + return err; + } +justbitmap: +#endif + return BlockMarkFreeInternal(vcb, startingBlock, numBlocks, true); + +} + + +/* + * BlockMarkFreeUnused + * + * Scan the bitmap block beyond end of current file system for bits + * that are marked as used. If any of the bits are marked as used, + * this function marks them free. + * + * Note: This was specifically written to mark all bits beyond + * end of current file system during hfs_extendfs(), which makes + * sure that all the new blocks added to the file system are + * marked as free. We expect that all the blocks beyond end of + * current file system are always marked as free, but there might + * be cases where they are marked as used. This function assumes that + * the number of blocks marked as used incorrectly is relatively + * small, otherwise this can overflow journal transaction size + * on certain file system configurations (example, large unused + * bitmap with relatively small journal). 
+ * + * Input: + * startingBlock: First block of the range to mark unused + * numBlocks: Number of blocks in the range to mark unused + * + * Returns: zero on success, non-zero on error. + */ +OSErr BlockMarkFreeUnused(ExtendedVCB *vcb, u_int32_t startingBlock, register u_int32_t numBlocks) +{ + int error = 0; + struct hfsmount *hfsmp = VCBTOHFS(vcb); + u_int32_t curNumBlocks; + u_int32_t bitsPerBlock; + u_int32_t lastBit; + + /* Use the optimal bitmap I/O size instead of bitmap block size */ + bitsPerBlock = hfsmp->vcbVBMIOSize * kBitsPerByte; + + /* + * First clear any non bitmap allocation block aligned bits + * + * Calculate the first bit in the bitmap block next to + * the bitmap block containing the bit for startingBlock. + * Using this value, we calculate the total number of + * bits to be marked unused from startingBlock to the + * end of bitmap block containing startingBlock. + */ + lastBit = ((startingBlock + (bitsPerBlock - 1))/bitsPerBlock) * bitsPerBlock; + curNumBlocks = lastBit - startingBlock; + if (curNumBlocks > numBlocks) { + curNumBlocks = numBlocks; + } + error = BlockMarkFreeInternal(vcb, startingBlock, curNumBlocks, false); + if (error) { + return error; + } + startingBlock += curNumBlocks; + numBlocks -= curNumBlocks; + + /* + * Check a full bitmap block for any 'used' bit. If any bit is used, + * mark all the bits only in that bitmap block as free. This ensures + * that we do not write unmodified bitmap blocks and do not + * overwhelm the journal. + * + * The code starts by checking full bitmap block at a time, and + * marks entire bitmap block as free only if any bit in that bitmap + * block is marked as used. In the end, it handles the last bitmap + * block which might be partially full by only checking till the + * caller-specified last bit and if any bit is set, only mark that + * range as free. 
+ */ + while (numBlocks) { + if (numBlocks >= bitsPerBlock) { + curNumBlocks = bitsPerBlock; + } else { + curNumBlocks = numBlocks; + } + if (hfs_isallocated(hfsmp, startingBlock, curNumBlocks) == true) { + error = BlockMarkFreeInternal(vcb, startingBlock, curNumBlocks, false); + if (error) { + return error; + } + } + startingBlock += curNumBlocks; + numBlocks -= curNumBlocks; + } + + return error; +} + +/* +_______________________________________________________________________ + +Routine: BlockMarkFreeInternal + +Function: Mark a contiguous group of blocks as free (clear in the + bitmap). It assumes those bits are currently marked + allocated (set in the bitmap). + +Inputs: + vcb Pointer to volume where space is to be freed + startingBlock First block number to mark as freed + numBlocks Number of blocks to mark as freed + do_validate If true, validate that the blocks being + deallocated to check if they are within totalBlocks + for current volume and whether they were allocated + before they are marked free. 
+_______________________________________________________________________ +*/ +static +OSErr BlockMarkFreeInternal( + ExtendedVCB *vcb, + u_int32_t startingBlock_in, + register u_int32_t numBlocks_in, + Boolean do_validate) +{ + OSErr err; + u_int32_t startingBlock = startingBlock_in; + u_int32_t numBlocks = numBlocks_in; + register u_int32_t *currentWord; // Pointer to current word within bitmap block + register u_int32_t wordsLeft; // Number of words left in this bitmap block + register u_int32_t bitMask; // Word with given bits already set (ready to OR in) + u_int32_t firstBit; // Bit index within word of first bit to allocate + u_int32_t numBits; // Number of bits in word to allocate + u_int32_t *buffer = NULL; + uintptr_t blockRef; + u_int32_t bitsPerBlock; + u_int32_t wordsPerBlock; + // XXXdbg + struct hfsmount *hfsmp = VCBTOHFS(vcb); + + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_MARK_FREE_BITMAP | DBG_FUNC_START, startingBlock_in, numBlocks_in, do_validate, 0, 0); + + /* + * NOTE: We use vcb->totalBlocks instead of vcb->allocLimit because we + * need to be able to free blocks being relocated during hfs_truncatefs. + */ + if ((do_validate == true) && + (startingBlock + numBlocks > vcb->totalBlocks)) { + if (ALLOC_DEBUG) { + panic ("BlockMarkFreeInternal() free non-existent blocks at %u (numBlock=%u) on vol %s\n", startingBlock, numBlocks, vcb->vcbVN); + } + + printf ("hfs: BlockMarkFreeInternal() trying to free non-existent blocks starting at %u (numBlock=%u) on volume %s\n", startingBlock, numBlocks, vcb->vcbVN); + hfs_mark_volume_inconsistent(vcb); + err = EIO; + goto Exit; + } + + // + // Pre-read the bitmap block containing the first word of allocation + // + + err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef); + if (err != noErr) goto Exit; + // XXXdbg + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); + } + + // + // Initialize currentWord, and wordsLeft. 
+ // + { + u_int32_t wordIndexInBlock; bitsPerBlock = vcb->vcbVBMIOSize * kBitsPerByte; wordsPerBlock = vcb->vcbVBMIOSize / kBytesPerWord; @@ -1528,7 +2619,8 @@ OSErr BlockMarkFree( numBits = numBlocks; // entire allocation is inside this one word bitMask &= ~(kAllBitsSetInWord >> (firstBit + numBits)); // turn off bits after last } - if ((*currentWord & SWAP_BE32 (bitMask)) != SWAP_BE32 (bitMask)) { + if ((do_validate == true) && + (*currentWord & SWAP_BE32 (bitMask)) != SWAP_BE32 (bitMask)) { goto Corruption; } *currentWord &= SWAP_BE32 (~bitMask); // clear the bits in the bitmap @@ -1563,7 +2655,8 @@ OSErr BlockMarkFree( currentWord = buffer; wordsLeft = wordsPerBlock; } - if (*currentWord != SWAP_BE32 (kAllBitsSetInWord)) { + if ((do_validate == true) && + (*currentWord != SWAP_BE32 (kAllBitsSetInWord))) { goto Corruption; } *currentWord = 0; // clear the entire word @@ -1599,7 +2692,8 @@ OSErr BlockMarkFree( currentWord = buffer; wordsLeft = wordsPerBlock; } - if ((*currentWord & SWAP_BE32 (bitMask)) != SWAP_BE32 (bitMask)) { + if ((do_validate == true) && + (*currentWord & SWAP_BE32 (bitMask)) != SWAP_BE32 (bitMask)) { goto Corruption; } *currentWord &= SWAP_BE32 (~bitMask); // clear the bits in the bitmap @@ -1612,24 +2706,128 @@ OSErr BlockMarkFree( if (buffer) (void)ReleaseBitmapBlock(vcb, blockRef, true); - if (CONFIG_HFS_TRIM && err == noErr) { + if (err == noErr) { hfs_unmap_free_extent(vcb, startingBlock_in, numBlocks_in); } + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_MARK_FREE_BITMAP | DBG_FUNC_END, err, 0, 0, 0, 0); return err; Corruption: #if DEBUG_BUILD - panic("hfs: BlockMarkFree: blocks not allocated!"); + panic("hfs: BlockMarkFreeInternal: blocks not allocated!"); #else - printf ("hfs: BlockMarkFree() trying to free unallocated blocks on volume %s\n", vcb->vcbVN); + printf ("hfs: BlockMarkFreeInternal() trying to free unallocated blocks (%u,%u) on volume %s\n", startingBlock, numBlocks, vcb->vcbVN); 
hfs_mark_volume_inconsistent(vcb); err = EIO; goto Exit; #endif } +#if CONFIG_HFS_ALLOC_RBTREE +/* + * This is a wrapper function around BlockMarkFree. This function is + * called when the RB Tree-based allocator needs to mark a block as no longer + * in use. This function should take the locks that would not normally be + * necessary for the normal bitmap deallocator, and then call the function. Once + * the on-disk data structures are updated properly, then this will update an + * existing rb-tree node if possible, or else create a new one. + */ + +OSErr BlockMarkFreeRBTree( + ExtendedVCB *vcb, + u_int32_t startingBlock, + register u_int32_t numBlocks) +{ + OSErr err; + struct hfsmount *hfsmp = VCBTOHFS(vcb); + int rb_err = 0; + + if (ALLOC_DEBUG) { + REQUIRE_FILE_LOCK(vcb->hfs_allocation_vp, false); + if (!hfs_isallocated(hfsmp, startingBlock, numBlocks)) { + panic ("HFS RBTree Allocator: Trying to free blocks starting @ %x for %x but blocks not in use! \n", + startingBlock, numBlocks); + } + check_rbtree_extents (VCBTOHFS(vcb), startingBlock, numBlocks, ASSERT_ALLOC); + } + + err = BlockMarkFreeInternal(vcb, startingBlock, numBlocks, true); + + if (err == noErr) { + + /* + * During a filesystem truncation, we may need to relocate files out of the + * portion of the bitmap that is no longer controlled by the r/b tree. + * In this case, just update the bitmap and do not attempt to manipulate the tree. 
+ */ + if ((startingBlock >= hfsmp->offset_block_end) && + (hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS)) { + goto free_error; + } + + extent_node_t *newnode; + + if (ALLOC_DEBUG) { + /* + * Validate that the blocks in question are not allocated in the bitmap, and that they're + * not in the offset tree, since it should be tracking free extents, rather than allocated + * extents + */ + if (hfs_isallocated(hfsmp, startingBlock, numBlocks)) { + panic ("HFS RBTree Allocator: Blocks starting @ %x for %x blocks still marked in-use!\n", + startingBlock, numBlocks); + } + } + + if ((hfsmp->extent_tree_flags & HFS_ALLOC_RB_ACTIVE) == 0) { + if (startingBlock >= hfsmp->offset_block_end) { + /* + * If the tree generation code has not yet finished scanning the + * bitmap region containing this extent, do nothing. If the start + * of the range to be deallocated is greater than the current high + * watermark on the offset tree, just bail out and let the scanner catch up with us. + */ + rb_err = 0; + goto free_error; + } + } + + newnode = extent_tree_free_space(&hfsmp->offset_tree, numBlocks, startingBlock); + if (newnode == NULL) { + rb_err = 1; + goto free_error; + } + + if (ALLOC_DEBUG) { + check_rbtree_extents (VCBTOHFS(vcb), startingBlock, numBlocks, ASSERT_FREE); + } + + } + +free_error: + /* + * We follow the same principle as in BlockMarkAllocatedRBTree. + * If we encounter an error in adding the extents to the rb-tree, then immediately + * back off, destroy the trees, and persistently set a bit in the runtime hfsmp flags + * to indicate we should not use the rb-tree until next mount, when we can force a rebuild. + */ + if (rb_err) { + /* Mark RB-Trees with error */ + hfsmp->extent_tree_flags |= HFS_ALLOC_RB_ERRORED; + DestroyTrees(hfsmp); + /* Reset the Free Ext Cache since we'll be using it now. 
*/ + ResetVCBFreeExtCache(hfsmp); + printf("HFS: Red-Black Allocator Tree BlockMarkFree error\n"); + } + + + return err; + +} +#endif /* _______________________________________________________________________ @@ -1639,6 +2837,9 @@ Routine: BlockFindContiguous Function: Find a contiguous range of blocks that are free (bits clear in the bitmap). If a contiguous range of the minimum size can't be found, an error will be returned. + This is only needed to support the bitmap-scanning logic, + as the red-black tree should be able to do this by internally + searching its tree. Inputs: vcb Pointer to volume where space is to be allocated @@ -1680,7 +2881,10 @@ static OSErr BlockFindContiguous( register u_int32_t tempWord; uintptr_t blockRef; u_int32_t wordsPerBlock; - u_int32_t j, updated_free_extents = 0, really_add; + u_int32_t updated_free_extent = 0; + + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_BLOCK_FIND_CONTIG | DBG_FUNC_START, startingBlock, endingBlock, minBlocks, maxBlocks, 0); /* * When we're skipping the metadata zone and the start/end @@ -1916,79 +3120,10 @@ static OSErr BlockFindContiguous( if (foundBlocks >= minBlocks) break; // Found what we needed! - HFS_MOUNT_LOCK(vcb, TRUE); - if (free_extent_cache_active(vcb) == 0) { - HFS_MOUNT_UNLOCK(vcb, TRUE); - goto skip_cache; - } - HFS_MOUNT_UNLOCK(vcb, TRUE); - - // This free chunk wasn't big enough. Try inserting it into the free extent cache in case - // the allocation wasn't forced contiguous. 
- really_add = 0; - for(j=0; j < vcb->vcbFreeExtCnt; j++) { - u_int32_t start, end; - - start = vcb->vcbFreeExt[j].startBlock; - end = start + vcb->vcbFreeExt[j].blockCount; - - if ( (firstBlock >= start && firstBlock < end) - || ((firstBlock + foundBlocks) > start && firstBlock < start)) { - - // there's overlap with an existing entry so do not add this - break; - } - - } - - if (j >= vcb->vcbFreeExtCnt) { - really_add = 1; - } - - tempWord = vcb->vcbFreeExtCnt; - if (really_add && (vcb->hfs_flags & HFS_HAS_SPARSE_DEVICE)) { - // Sorted by starting block - if (tempWord == kMaxFreeExtents && vcb->vcbFreeExt[kMaxFreeExtents-1].startBlock > firstBlock) - --tempWord; - if (tempWord < kMaxFreeExtents) - { - // We're going to add this extent. Bubble any smaller extents down in the list. - while (tempWord && vcb->vcbFreeExt[tempWord-1].startBlock > firstBlock) - { - vcb->vcbFreeExt[tempWord] = vcb->vcbFreeExt[tempWord-1]; - --tempWord; - } - vcb->vcbFreeExt[tempWord].startBlock = firstBlock; - vcb->vcbFreeExt[tempWord].blockCount = foundBlocks; - - if (vcb->vcbFreeExtCnt < kMaxFreeExtents) { - ++vcb->vcbFreeExtCnt; - } - updated_free_extents = 1; - } - } else if (really_add) { - // Sorted by blockCount - if (tempWord == kMaxFreeExtents && vcb->vcbFreeExt[kMaxFreeExtents-1].blockCount < foundBlocks) - --tempWord; - if (tempWord < kMaxFreeExtents) - { - // We're going to add this extent. Bubble any smaller extents down in the list. 
- while (tempWord && vcb->vcbFreeExt[tempWord-1].blockCount < foundBlocks) - { - vcb->vcbFreeExt[tempWord] = vcb->vcbFreeExt[tempWord-1]; - --tempWord; - } - vcb->vcbFreeExt[tempWord].startBlock = firstBlock; - vcb->vcbFreeExt[tempWord].blockCount = foundBlocks; - - if (vcb->vcbFreeExtCnt < kMaxFreeExtents) { - ++vcb->vcbFreeExtCnt; - } - updated_free_extents = 1; - } - } -skip_cache: - sanity_check_free_ext(vcb, 0); + /* We did not find the total blocks we were looking for, but + * let's add this free block run to our free extent cache list + */ + updated_free_extent = add_free_extent_cache(vcb, firstBlock, foundBlocks); } while (currentBlock < stopBlock); LoopExit: @@ -2017,17 +3152,19 @@ static OSErr BlockFindContiguous( } } - if (updated_free_extents && (vcb->hfs_flags & HFS_HAS_SPARSE_DEVICE)) { + if (updated_free_extent && (vcb->hfs_flags & HFS_HAS_SPARSE_DEVICE)) { int i; u_int32_t min_start = vcb->totalBlocks; // set the nextAllocation pointer to the smallest free block number // we've seen so on the next mount we won't rescan unnecessarily + lck_spin_lock(&vcb->vcbFreeExtLock); for(i=0; i < (int)vcb->vcbFreeExtCnt; i++) { if (vcb->vcbFreeExt[i].startBlock < min_start) { min_start = vcb->vcbFreeExt[i].startBlock; } } + lck_spin_unlock(&vcb->vcbFreeExtLock); if (min_start != vcb->totalBlocks) { if (min_start < vcb->nextAllocation) { vcb->nextAllocation = min_start; @@ -2041,71 +3178,296 @@ static OSErr BlockFindContiguous( if (buffer) (void) ReleaseBitmapBlock(vcb, blockRef, false); - sanity_check_free_ext(vcb, 1); + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_BLOCK_FIND_CONTIG | DBG_FUNC_END, err, *actualStartBlock, *actualNumBlocks, 0, 0); return err; } + +#if CONFIG_HFS_ALLOC_RBTREE /* - * Test to see if any blocks in a range are allocated. + * Wrapper function around hfs_isrbtree_allocated. 
This just takes the start offset, + * and the number of blocks, and whether or not we should check if the blocks are + * free or not. This function is designed to be used primarily with the debug #ifdef + * enabled, so it results in a panic if anything unexpected occurs. * - * The journal or allocation file lock must be held. + * shouldBeFree will be nonzero if the caller expects the zone to be free. */ -__private_extern__ -int -hfs_isallocated(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t numBlocks) -{ - u_int32_t *currentWord; // Pointer to current word within bitmap block - u_int32_t wordsLeft; // Number of words left in this bitmap block - u_int32_t bitMask; // Word with given bits already set (ready to test) - u_int32_t firstBit; // Bit index within word of first bit to allocate - u_int32_t numBits; // Number of bits in word to allocate - u_int32_t *buffer = NULL; - uintptr_t blockRef; - u_int32_t bitsPerBlock; - u_int32_t wordsPerBlock; - int inuse = 0; - int error; - - /* - * Pre-read the bitmap block containing the first word of allocation - */ - error = ReadBitmapBlock(hfsmp, startingBlock, &buffer, &blockRef); - if (error) - return (error); - - /* - * Initialize currentWord, and wordsLeft. - */ - { - u_int32_t wordIndexInBlock; - - bitsPerBlock = hfsmp->vcbVBMIOSize * kBitsPerByte; - wordsPerBlock = hfsmp->vcbVBMIOSize / kBytesPerWord; - wordIndexInBlock = (startingBlock & (bitsPerBlock-1)) / kBitsPerWord; - currentWord = buffer + wordIndexInBlock; - wordsLeft = wordsPerBlock - wordIndexInBlock; +void check_rbtree_extents (struct hfsmount *hfsmp, u_int32_t startBlocks, + u_int32_t numBlocks, int shouldBeFree) { + int alloc; + extent_node_t *node1 = NULL; + u_int32_t off1 = 0; + u_int32_t len1 = 0; + alloc = hfs_isrbtree_allocated (hfsmp, startBlocks, numBlocks, &node1); + + if (node1) { + off1 = node1->offset; + len1 = node1->length; } - /* - * First test any non word aligned bits. 
- */ - firstBit = startingBlock % kBitsPerWord; - if (firstBit != 0) { - bitMask = kAllBitsSetInWord >> firstBit; - numBits = kBitsPerWord - firstBit; - if (numBits > numBlocks) { - numBits = numBlocks; - bitMask &= ~(kAllBitsSetInWord >> (firstBit + numBits)); - } - if ((*currentWord & SWAP_BE32 (bitMask)) != 0) { - inuse = 1; - goto Exit; + if (shouldBeFree) { + /* + * If the region should be free, then we expect to see extents in the tree + * matching this start and length. Alloc != 0 means some portion of the extent + * specified was allocated. + */ + if (alloc != 0){ + panic ("HFS check_rbtree_extents: Node (%p) do not exist! " + "node1 off (%d),len(%d),, start(%d) end(%d)\n", + node1, off1, len1, startBlocks, numBlocks); } - numBlocks -= numBits; - ++currentWord; - --wordsLeft; + } + else { + /* + * Otherwise, this means that the region should be allocated, and if we find + * an extent matching it, that's bad. + */ + if (alloc == 0){ + panic ("HFS check_rbtree_extents: Node (%p) exists! " + "node1 off (%d),len(%d), start(%d) end(%d)\n", + node1, off1, len1, startBlocks, numBlocks); + } + } +} +#endif + +#if CONFIG_HFS_ALLOC_RBTREE +/* + * Exhaustive validation search. This function iterates over all allocation blocks and + * compares their status in the red-black tree vs. the allocation bitmap. If the two are out of sync + * then it will panic. Bitmap lock must be held while this function is run. + * + * Because this function requires a red-black tree search to validate every allocation block, it is + * very expensive and should ONLY be run in debug mode, and even then, infrequently. + * + * 'end' is non-inclusive, so it should represent the total number of blocks in the volume. 
+ * + */ +void +hfs_validate_rbtree (struct hfsmount *hfsmp, u_int32_t start, u_int32_t end){ + + u_int32_t current; + extent_node_t* node1; + + hfs_checktreelinks (hfsmp); + + for (current = start; current < end; current++) { + node1 = NULL; + int rbtree = hfs_isrbtree_allocated(hfsmp, current, 1, &node1); + int bitmap = hfs_isallocated(hfsmp, current, 1); + + if (bitmap != rbtree){ + panic("HFS: Allocator mismatch @ block %d -- bitmap %d : rbtree %d\n", + current, bitmap, rbtree); + } + } +} + +/* + * Exhaustive Red-Black Tree Linked List verification routine. + * + * This function iterates through the red-black tree's nodes, and then verifies that the linked list + * embedded within each of the nodes accurately points to the correct node as its "next" pointer. + * The bitmap lock must be held while this function is run. + */ + +void +hfs_checktreelinks (struct hfsmount *hfsmp) { + extent_tree_offset_t *tree = &hfsmp->offset_tree; + + extent_node_t *current = NULL; + extent_node_t *next = NULL; + extent_node_t *treenext; + + current = extent_tree_off_first (tree); + + while (current) { + next = current->offset_next; + treenext = extent_tree_off_next (tree, current); + if (next != treenext) { + panic("hfs_checktreelinks: mismatch for node (%p), next: %p , treenext %p !\n", current, next, treenext); + } + current = treenext; + } +} + +#endif + + +#if CONFIG_HFS_ALLOC_RBTREE +/* + * Test to see if any free blocks exist at a given offset. + * If there exists a node at the specified offset, it will return the appropriate + * node. + * + * NULL indicates allocated blocks exist at that offset. + * + * Allocation file lock must be held. + * + * Returns: + * 1 if blocks in the range are allocated. + * 0 if all blocks in the range are free. 
+ */ + +static int +hfs_isrbtree_allocated (struct hfsmount *hfsmp, u_int32_t startBlock, + u_int32_t numBlocks, extent_node_t **ret_node) { + + extent_node_t search_sentinel; + extent_node_t *node = NULL; + extent_node_t *nextnode = NULL; + + /* + * With only one tree, then we just have to validate that there are entries + * in the R/B tree at the specified offset if it really is free. + */ + search_sentinel.offset = startBlock; + search_sentinel.length = numBlocks; + + node = extent_tree_off_search_prev(&hfsmp->offset_tree, &search_sentinel); + if (node) { + + *ret_node = node; + nextnode = extent_tree_off_next (&hfsmp->offset_tree, node); + if (nextnode != node->offset_next) { + panic ("hfs_rbtree_isallocated: Next pointers out of sync!\n"); + } + + /* + * Check to see if it is a superset of our target range. Because we started + * with the offset or some offset prior to it, then we know the node's offset is + * at least <= startBlock. So, if the end of the node is greater than the end of + * our target range, then the whole range is free. + */ + + if ((node->offset + node->length) >= (startBlock + numBlocks)) { + if (node->offset > startBlock) { + panic ("hfs_rbtree_isallocated: bad node ordering!"); + } + return 0; + } + } + /* + * We got here if either our node search resulted in a node whose extent + * was strictly before our target offset, or we couldnt' find a previous node + * at all (the beginning of the volume). If the former, then we can infer that + * at least one block in the target range is allocated since the next node's offset + * must be greater than startBlock. 
+ * + * Either way, this means that the target node is unavailable to allocate, so + * just return 1; + */ + return 1; +} + + +#endif + +/* + * Count number of bits set in the given 32-bit unsigned number + * + * Returns: + * Number of bits set + */ +static int num_bits_set(u_int32_t num) +{ + int count; + + for (count = 0; num; count++) { + num &= num - 1; + } + + return count; +} + +/* + * For a given range of blocks, find the total number of blocks + * allocated. If 'stop_on_first' is true, it stops as soon as it + * encounters the first allocated block. This option is useful + * to determine if any block is allocated or not. + * + * Inputs: + * startingBlock First allocation block number of the range to be scanned. + * numBlocks Total number of blocks that need to be scanned. + * stop_on_first Stop the search after the first allocated block is found. + * + * Output: + * allocCount Total number of allocation blocks allocated in the given range. + * + * On error, it is the number of allocated blocks found + * before the function got an error. + * + * If 'stop_on_first' is set, + * allocCount = 1 if any allocated block was found. + * allocCount = 0 if no allocated block was found. + * + * Returns: + * 0 on success, non-zero on failure. 
+ */ +static int +hfs_isallocated_internal(struct hfsmount *hfsmp, u_int32_t startingBlock, + u_int32_t numBlocks, Boolean stop_on_first, u_int32_t *allocCount) +{ + u_int32_t *currentWord; // Pointer to current word within bitmap block + u_int32_t wordsLeft; // Number of words left in this bitmap block + u_int32_t bitMask; // Word with given bits already set (ready to test) + u_int32_t firstBit; // Bit index within word of first bit to allocate + u_int32_t numBits; // Number of bits in word to allocate + u_int32_t *buffer = NULL; + uintptr_t blockRef; + u_int32_t bitsPerBlock; + u_int32_t wordsPerBlock; + u_int32_t blockCount = 0; + int error; + + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_IS_ALLOCATED | DBG_FUNC_START, startingBlock, numBlocks, stop_on_first, 0, 0); + + /* + * Pre-read the bitmap block containing the first word of allocation + */ + error = ReadBitmapBlock(hfsmp, startingBlock, &buffer, &blockRef); + if (error) + goto JustReturn; + + /* + * Initialize currentWord, and wordsLeft. + */ + { + u_int32_t wordIndexInBlock; + + bitsPerBlock = hfsmp->vcbVBMIOSize * kBitsPerByte; + wordsPerBlock = hfsmp->vcbVBMIOSize / kBytesPerWord; + + wordIndexInBlock = (startingBlock & (bitsPerBlock-1)) / kBitsPerWord; + currentWord = buffer + wordIndexInBlock; + wordsLeft = wordsPerBlock - wordIndexInBlock; + } + + /* + * First test any non word aligned bits. 
+ */ + firstBit = startingBlock % kBitsPerWord; + if (firstBit != 0) { + bitMask = kAllBitsSetInWord >> firstBit; + numBits = kBitsPerWord - firstBit; + if (numBits > numBlocks) { + numBits = numBlocks; + bitMask &= ~(kAllBitsSetInWord >> (firstBit + numBits)); + } + if ((*currentWord & SWAP_BE32 (bitMask)) != 0) { + if (stop_on_first) { + blockCount = 1; + goto Exit; + } + blockCount += num_bits_set(*currentWord & SWAP_BE32 (bitMask)); + } + numBlocks -= numBits; + ++currentWord; + --wordsLeft; } /* @@ -2128,8 +3490,11 @@ hfs_isallocated(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t numBl wordsLeft = wordsPerBlock; } if (*currentWord != 0) { - inuse = 1; - goto Exit; + if (stop_on_first) { + blockCount = 1; + goto Exit; + } + blockCount += num_bits_set(*currentWord); } numBlocks -= kBitsPerWord; ++currentWord; @@ -2156,54 +3521,922 @@ hfs_isallocated(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t numBl wordsLeft = wordsPerBlock; } if ((*currentWord & SWAP_BE32 (bitMask)) != 0) { - inuse = 1; - goto Exit; + if (stop_on_first) { + blockCount = 1; + goto Exit; + } + blockCount += num_bits_set(*currentWord & SWAP_BE32 (bitMask)); } } Exit: if (buffer) { (void)ReleaseBitmapBlock(hfsmp, blockRef, false); } + if (allocCount) { + *allocCount = blockCount; + } + +JustReturn: + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_IS_ALLOCATED | DBG_FUNC_END, error, 0, blockCount, 0, 0); + + return (error); +} + +/* + * Count total number of blocks that are allocated in the given + * range from the bitmap. This is used to preflight total blocks + * that need to be relocated during volume resize. + * + * The journal or allocation file lock must be held. + * + * Returns: + * 0 on success, non-zero on failure. + * On failure, allocCount is zero. 
+ */ +int +hfs_count_allocated(struct hfsmount *hfsmp, u_int32_t startBlock, + u_int32_t numBlocks, u_int32_t *allocCount) +{ + return hfs_isallocated_internal(hfsmp, startBlock, numBlocks, false, allocCount); +} + +/* + * Test to see if any blocks in a range are allocated. + * + * Note: On error, this function returns 1, which means that + * one or more blocks in the range are allocated. This function + * is primarily used for volume resize and we do not want + * to report to the caller that the blocks are free when we + * were not able to deterministically find it out. So on error, + * we always report that the blocks are allocated. + * + * The journal or allocation file lock must be held. + * + * Returns + * 0 if all blocks in the range are free. + * 1 if blocks in the range are allocated, or there was an error. + */ +int +hfs_isallocated(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t numBlocks) +{ + int error; + u_int32_t allocCount; + + error = hfs_isallocated_internal(hfsmp, startingBlock, numBlocks, true, &allocCount); + if (error) { + /* On error, we always say that the blocks are allocated + * so that volume resize does not return false success. + */ + return 1; + } else { + /* The function was deterministically able to find out + * if there was any block allocated or not. In that case, + * the value in allocCount is good enough to be returned + * back to the caller. + */ + return allocCount; + } +} + +/* + * Check to see if the red-black tree is live. Allocation file lock must be held + * shared or exclusive to call this function. Note that we may call this even if + * HFS is built without activating the red-black tree code. + */ +__private_extern__ +int +hfs_isrbtree_active(struct hfsmount *hfsmp){ + + //TODO: Update this function to deal with a truncate/resize coming in when the tree + //isn't fully finished. maybe we need to check the flags for something other than ENABLED? 
+ +#if CONFIG_HFS_ALLOC_RBTREE + if (ALLOC_DEBUG) { + REQUIRE_FILE_LOCK(hfsmp->hfs_allocation_vp, false); + } + if (hfsmp){ + + if (hfsmp->extent_tree_flags & HFS_ALLOC_RB_ENABLED) { + return 1; + } + } +#else + #pragma unused (hfsmp) +#endif + /* If the RB Tree code is not enabled, then just always return 0 */ + return 0; +} + +#if CONFIG_HFS_ALLOC_RBTREE +/* + * This function is basically the same as hfs_isallocated, except it's designed for + * use with the red-black tree validation code. It assumes we're only checking whether + * one bit is active, and that we're going to pass in the buf to use, since GenerateTree + * calls ReadBitmapBlock and will have that buf locked down for the duration of its operation. + * + * This should not be called in general purpose scanning code. + */ +int hfs_isallocated_scan(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t *bp_buf) { + + u_int32_t *currentWord; // Pointer to current word within bitmap block + u_int32_t bitMask; // Word with given bits already set (ready to test) + u_int32_t firstBit; // Bit index within word of first bit to allocate + u_int32_t numBits; // Number of bits in word to allocate + u_int32_t bitsPerBlock; + uintptr_t blockRef; + u_int32_t wordsPerBlock; + u_int32_t numBlocks = 1; + u_int32_t *buffer = NULL; + + int inuse = 0; + int error; + + + if (bp_buf) { + /* just use passed-in buffer if avail. */ + buffer = bp_buf; + } + else { + /* + * Pre-read the bitmap block containing the first word of allocation + */ + error = ReadBitmapBlock(hfsmp, startingBlock, &buffer, &blockRef); + if (error) + return (error); + } + + /* + * Initialize currentWord, and wordsLeft. + */ + u_int32_t wordIndexInBlock; + + bitsPerBlock = hfsmp->vcbVBMIOSize * kBitsPerByte; + wordsPerBlock = hfsmp->vcbVBMIOSize / kBytesPerWord; + + wordIndexInBlock = (startingBlock & (bitsPerBlock-1)) / kBitsPerWord; + currentWord = buffer + wordIndexInBlock; + + /* + * First test any non word aligned bits. 
+ */ + firstBit = startingBlock % kBitsPerWord; + bitMask = kAllBitsSetInWord >> firstBit; + numBits = kBitsPerWord - firstBit; + if (numBits > numBlocks) { + numBits = numBlocks; + bitMask &= ~(kAllBitsSetInWord >> (firstBit + numBits)); + } + if ((*currentWord & SWAP_BE32 (bitMask)) != 0) { + inuse = 1; + goto Exit; + } + numBlocks -= numBits; + ++currentWord; + +Exit: + if(bp_buf == NULL) { + if (buffer) { + (void)ReleaseBitmapBlock(hfsmp, blockRef, false); + } + } return (inuse); + + + +} + +/* + * This function scans the specified block and adds it to the pair of trees specified + * in its arguments. We break this behavior out of GenerateTree so that an allocating + * thread can invoke this if the tree does not have enough extents to satisfy + * an allocation request. + * + * startbit - the allocation block represented by a bit in 'allocblock' where we need to + * start our scan. For instance, we may need to start the normal allocation scan + * in the middle of an existing allocation block. + * endBit - the allocation block where we should end this search (inclusive). + * bitToScan - output argument for this function to specify the next bit to scan. + * + * Returns: + * 0 on success + * nonzero on failure. + */ + +static int hfs_alloc_scan_block(struct hfsmount *hfsmp, u_int32_t startbit, + u_int32_t endBit, u_int32_t *bitToScan) { + + int error; + u_int32_t curAllocBlock; + struct buf *blockRef = NULL; + u_int32_t *buffer = NULL; + u_int32_t wordIndexInBlock; + u_int32_t blockSize = (u_int32_t)hfsmp->vcbVBMIOSize; + u_int32_t wordsPerBlock = blockSize / kBytesPerWord; + u_int32_t offset = 0; + u_int32_t size = 0; + + /* + * Read the appropriate block from the bitmap file. ReadBitmapBlock + * figures out which actual on-disk block corresponds to the bit we're + * looking at. + */ + error = ReadBitmapBlock(hfsmp, startbit, &buffer, (uintptr_t*)&blockRef); + if (error) { + return error; + } + + /* curAllocBlock represents the logical block we're analyzing. 
*/ + curAllocBlock = startbit; + + /* Figure out which word curAllocBlock corresponds to in the block we read */ + wordIndexInBlock = (curAllocBlock / kBitsPerWord) % wordsPerBlock; + + /* Scan a word at a time */ + while (wordIndexInBlock < wordsPerBlock) { + u_int32_t currentWord = SWAP_BE32(buffer[wordIndexInBlock]); + u_int32_t curBit; + + /* modulate curBit because it may start in the middle of a word */ + for (curBit = curAllocBlock % kBitsPerWord; curBit < kBitsPerWord; curBit++) { + + u_int32_t is_allocated = currentWord & (1 << (kBitsWithinWordMask - curBit)); + if (ALLOC_DEBUG) { + u_int32_t res = hfs_isallocated_scan (hfsmp, curAllocBlock, buffer); + if ( ((res) && (!is_allocated)) || ((!res) && (is_allocated))) { + panic("hfs_alloc_scan: curAllocBit %u, curBit (%d), word (0x%x), is_allocated (0x%x) res(0x%x) \n", + curAllocBlock, curBit, currentWord, is_allocated, res); + } + } + /* + * If curBit is not allocated, keep track of the start of the free range. + * Increment a running tally on how many free blocks in a row we've seen. + */ + if (!is_allocated) { + size++; + if (offset == 0) { + offset = curAllocBlock; + } + } + else { + /* + * If we hit an allocated block, insert the extent that tracked the range + * we saw, and reset our tally counter. + */ + if (size != 0) { + extent_tree_free_space(&hfsmp->offset_tree, size, offset); + size = 0; + offset = 0; + } + } + curAllocBlock++; + /* + * Exit early if the next bit we'd analyze would take us beyond the end of the + * range that we're supposed to scan. + */ + if (curAllocBlock >= endBit) { + goto DoneScanning; + } + } + wordIndexInBlock++; + } +DoneScanning: + + /* We may have been tracking a range of free blocks that hasn't been inserted yet. */ + if (size != 0) { + extent_tree_free_space(&hfsmp->offset_tree, size, offset); + } + /* + * curAllocBlock represents the next block we need to scan while we're in this + * function. 
+ */ + *bitToScan = curAllocBlock; + + ReleaseRBScanBitmapBlock(blockRef); + + return 0; +} + +/* + * Extern function that is called from mount and upgrade mount routines + * that enable us to initialize the tree. + */ + +__private_extern__ +u_int32_t InitTree(struct hfsmount *hfsmp) { + extent_tree_init (&(hfsmp->offset_tree)); + return 0; +} + + +/* + * This function builds the trees specified in its arguments. It uses + * buf_meta_breads to scan through the bitmap and re-build the tree state. + * It is very important to use buf_meta_bread because we need to ensure that we + * read the most current version of the blocks that we're scanning. If we used + * cluster_io, then journaled transactions could still be sitting in RAM since they are + * written to disk in the proper location asynchronously. + * + * Because this could be still running when mount has finished, we need to check + * after every allocation block that we're working on if an unmount or some other + * operation that would cause us to teardown has come in. (think downgrade mount). + * If an unmount has come in, then abort whatever we're doing and return -1 + * to indicate we hit an error. If we don't do this, we'd hold up unmount for + * a very long time. + * + * This function assumes that the bitmap lock is acquired exclusively before being + * called. It will drop the lock and then re-acquire it during operation, but + * will always return with the lock held. + */ +__private_extern__ +u_int32_t GenerateTree(struct hfsmount *hfsmp, u_int32_t endBlock, int *flags, int initialscan) { + + REQUIRE_FILE_LOCK(hfsmp->hfs_allocation_vp, false); + + u_int32_t *cur_block_eof; + int error = 0; + + int USE_FINE_GRAINED_LOCKING = 0; + + /* Initialize the block counter while we hold the bitmap lock */ + cur_block_eof = &hfsmp->offset_block_end; + + /* + * This loop advances over all allocation bitmap blocks of the current region + * to scan them and add the results into the red-black tree. 
We use the mount point + * variable offset_block_end as our loop counter. This gives us flexibility + * because we can release the allocation bitmap lock and allow a thread that wants + * to make an allocation to grab the lock and do some scanning on our behalf while we're + * waiting to re-acquire the lock. Then, the allocating thread will only do as much bitmap + * scanning as needed to fulfill its allocation. + * + * If the other thread does IO for us, then it will update the offset_block_end + * variable as well, since it will use the same hfs_alloc_scan_block function to do its bit + * scanning. So when we re-grab the lock, our current EOF/loop will immediately skip us to the next + * block that needs scanning. + */ + + while (*cur_block_eof < endBlock) { + + /* + * If the filesystem is being resized before the bitmap has been fully scanned, we'll + * update our endBlock to match the current allocation limit in the hfsmp struct. + * The allocLimit field would only be be updated while holding the bitmap lock, so we won't + * be executing this code at the same time that the resize is going on. + */ + if ((initialscan) && (endBlock != hfsmp->allocLimit)) { + + /* If we're past the new/modified allocLimit, then just stop immediately.*/ + if (*cur_block_eof >= hfsmp->allocLimit ) { + break; + } + endBlock = hfsmp->allocLimit; + } + + /* + * TODO: fix unmount stuff! + * See rdar://7391404 + * + * Once the RB allocator is checked in, we'll want to augment it to not hold the + * allocation bitmap lock for the entire duration of the tree scan. For a first check-in + * it's ok to do that but we can't leave it like that forever. + * + * The gist of the new algorithm will work as follows: + * if an unmount is in flight and has been detected: + * abort tree-build. + * unset tree-in-progress bit. + * wakeup unmount thread + * unlock allocation bitmap lock, fail out. + * + * The corresponding code in the unmount side should already be in place. 
+ */ + + error = hfs_alloc_scan_block (hfsmp, *cur_block_eof, endBlock, cur_block_eof); + + //TODO: Fix this below! + if (USE_FINE_GRAINED_LOCKING){ + hfs_systemfile_unlock(hfsmp, *flags); + *flags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + } + //TODO: Infer that if *flags == 0, we don't actually need to lock/unlock. + } + + return error; } -/* Invalidate free extent cache for a given volume. - * This cache is invalidated and disabled when a volume is being resized - * (via hfs_trucatefs() or hfs_extendefs()). +/* + * This function destroys the specified rb-trees associated with the mount point. + */ +__private_extern__ +void DestroyTrees(struct hfsmount *hfsmp) { + + if (ALLOC_DEBUG) { + REQUIRE_FILE_LOCK(hfsmp->hfs_allocation_vp, false); + printf("DestroyTrees: Validating red/black tree for vol %s\n", (char*) hfsmp->vcbVN); + hfs_validate_rbtree (hfsmp, 0, hfsmp->offset_block_end ); + } + + /* + * extent_tree_destroy will start with the first entry in the tree (by offset), then + * iterate through the tree quickly using its embedded linked list. This results in tree + * destruction in O(n) time. + */ + + if (hfsmp->extent_tree_flags & HFS_ALLOC_RB_ENABLED) { + extent_tree_destroy(&hfsmp->offset_tree); + + /* Mark Trees as disabled */ + hfsmp->extent_tree_flags &= ~HFS_ALLOC_RB_ENABLED; + } + + return; +} + +#endif + +/* + * This function resets all of the data structures relevant to the + * free extent cache stored in the hfsmount struct. + * + * If we are using the red-black tree code then we need to account for the fact that + * we may encounter situations where we need to jettison the tree. If that is the + * case, then we fail-over to the bitmap scanning logic, but we need to ensure that + * the free ext cache is zeroed before we start using it. * - * Returns: Nothing + * We also reset and disable the cache when allocLimit is updated... which + * is when a volume is being resized (via hfs_truncatefs() or hfs_extendfs()). 
+ * It is independent of the type of allocator being used currently. */ -void invalidate_free_extent_cache(ExtendedVCB *vcb) +void ResetVCBFreeExtCache(struct hfsmount *hfsmp) { - u_int32_t i; + int bytes; + void *freeExt; - HFS_MOUNT_LOCK(vcb, TRUE); - for (i = 0; i < vcb->vcbFreeExtCnt; i++) { - vcb->vcbFreeExt[i].startBlock = 0; - vcb->vcbFreeExt[i].blockCount = 0; + if (hfs_kdebug_allocation & HFSDBG_EXT_CACHE_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_RESET_EXTENT_CACHE | DBG_FUNC_START, 0, 0, 0, 0, 0); + + lck_spin_lock(&hfsmp->vcbFreeExtLock); + + /* reset Free Extent Count */ + hfsmp->vcbFreeExtCnt = 0; + + /* reset the actual array */ + bytes = kMaxFreeExtents * sizeof(HFSPlusExtentDescriptor); + freeExt = (void*)(hfsmp->vcbFreeExt); + + bzero (freeExt, bytes); + + lck_spin_unlock(&hfsmp->vcbFreeExtLock); + + if (hfs_kdebug_allocation & HFSDBG_EXT_CACHE_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_RESET_EXTENT_CACHE | DBG_FUNC_END, 0, 0, 0, 0, 0); + + return; +} + +/* + * This function is used to inform the allocator if we have to effectively shrink + * or grow the total number of allocation blocks via hfs_truncatefs or hfs_extendfs. + * + * The bitmap lock must be held when calling this function. This function also modifies the + * allocLimit field in the hfs mount point structure in the general case. + * + * In the shrinking case, we'll have to remove all free extents from the red-black + * tree past the specified offset new_end_block. In the growth case, we'll have to force + * a re-scan of the new allocation blocks from our current allocLimit to the new end block. + * + * new_end_block represents the total number of blocks available for allocation in the resized + * filesystem. Block #new_end_block should not be allocatable in the resized filesystem since it + * will be out of the (0, n-1) range that are indexable in the bitmap. 
+ * + * Returns 0 on success + * errno on failure + */ +__private_extern__ +u_int32_t UpdateAllocLimit (struct hfsmount *hfsmp, u_int32_t new_end_block) { + + /* + * Update allocLimit to the argument specified, but don't do anything else + * if the red/black tree is not enabled. + */ + hfsmp->allocLimit = new_end_block; + + /* Invalidate the free extent cache completely so that + * it does not have any extents beyond end of current + * volume. + */ + ResetVCBFreeExtCache(hfsmp); + +#if CONFIG_HFS_ALLOC_RBTREE + /* Shrinking the existing filesystem */ + if ((new_end_block < hfsmp->offset_block_end) && + (hfsmp->extent_tree_flags & HFS_ALLOC_RB_ACTIVE)) { + extent_node_t search_sentinel; + extent_node_t *node = NULL; + /* Remover points to the current item to free/remove from the tree */ + extent_node_t *remover = NULL; + + /* Begin search at the specified offset */ + memset (&search_sentinel, 0, sizeof(extent_node_t)); + search_sentinel.offset = new_end_block; + + /* + * Find the first available extent that satifies the allocation by searching + * from the starting point or 1 earlier. We may need to split apart an existing node + * if it straddles the new alloc limit. + */ + node = extent_tree_off_search_prev(&hfsmp->offset_tree, &search_sentinel); + if (node) { + /* If it's an exact match, then just remove them all from this point forward */ + if (node->offset == new_end_block) { + /* + * Find the previous entry and update its next pointer to NULL + * since this entry is biting the dust. Update remover to node. + */ + extent_node_t *prev = NULL; + prev = extent_tree_off_prev (&hfsmp->offset_tree, node); + if (prev) { + prev->offset_next = NULL; + } + remover = node; + } + else { + /* See if we need to split this node */ + if ((node->offset + node->length) > new_end_block) { + /* + * Update node to reflect its new size up until new_end_block. 
+ */ + remover = node->offset_next; + node->length = new_end_block - node->offset; + /* node is becoming the last free extent in the volume. */ + node->offset_next = NULL; + } + else { + if (node->offset_next == NULL) { + /* + * 'node' points to the last free extent in the volume. + * Coincidentally, it is also before the new cut-off point at which + * we will stop representing bitmap values in the tree. Just bail out now. + */ + return 0; + } + /* + * Otherwise, point our temp variable 'remover' to the node where + * we'll need to start yanking things out of the tree, and make 'node' + * the last element in the tree in the linked list. + */ + remover = node->offset_next; + if (remover->offset <= new_end_block) { + panic ("UpdateAllocLimit: Invalid RBTree node next ptr!"); + } + node->offset_next = NULL; + } + } + + /* + * Remover is our "temp" pointer that points to the current node to remove from + * the offset tree. We'll simply iterate through the tree linked list, removing the current + * element from the tree, freeing them as we come across them. + */ + while (remover) { + extent_node_t *next = remover->offset_next; + extent_tree_remove_node (&hfsmp->offset_tree, remover); + free_node (remover); + remover = next; + } + + if (ALLOC_DEBUG) { + printf ("UpdateAllocLimit: Validating rbtree after truncation\n"); + hfs_validate_rbtree (hfsmp, 0, new_end_block-1); + } + + /* + * Don't forget to shrink offset_block_end after a successful truncation + * new_end_block should represent the number of blocks available on the + * truncated volume. 
+ */ + + hfsmp->offset_block_end = new_end_block; + + return 0; + } + else { + if (ALLOC_DEBUG) { + panic ("UpdateAllocLimit: no prev!"); + } + return ENOSPC; + } } - vcb->vcbFreeExtCnt = 0; - HFS_MOUNT_UNLOCK(vcb, TRUE); + /* Growing the existing filesystem */ + else if ((new_end_block > hfsmp->offset_block_end) && + (hfsmp->extent_tree_flags & HFS_ALLOC_RB_ACTIVE)) { + int flags = 0; + int retval = 0; + + if (ALLOC_DEBUG) { + printf ("UpdateAllocLimit: Validating rbtree prior to growth\n"); + hfs_validate_rbtree (hfsmp, 0, hfsmp->offset_block_end); + } + + + retval = GenerateTree (hfsmp, new_end_block, &flags, 0); + + /* + * Don't forget to update offset_block_end after a successful tree extension. + */ + if (retval == 0) { + + if (ALLOC_DEBUG) { + printf ("UpdateAllocLimit: Validating rbtree after growth\n"); + hfs_validate_rbtree (hfsmp, 0, new_end_block); + } + + hfsmp->offset_block_end = new_end_block; + } + + return retval; + } + /* Otherwise, do nothing. fall through to the code below. */ + printf ("error : off_block_end: %d, alloclimit: %d, new_end_block: %d\n", + hfsmp->offset_block_end,hfsmp->allocLimit, new_end_block); +#endif + + return 0; + +} + + +/* + * Remove an entry from free extent cache after it has been allocated. + * + * This function does not split extents to remove them from the allocated list. + * + * Inputs: + * hfsmp - mount point structure + * startBlock - starting block of the extent to be removed. + * blockCount - number of blocks of the extent to be removed. 
+ */ +static void remove_free_extent_cache(struct hfsmount *hfsmp, u_int32_t startBlock, u_int32_t blockCount) +{ + int i, j; + int extentsRemoved = 0; + u_int32_t start, end; + +#if CONFIG_HFS_ALLOC_RBTREE + /* If red-black tree is enabled, no free extent cache is necessary */ + if (hfs_isrbtree_active(hfsmp) == true) { + return; + } +#endif + + if (hfs_kdebug_allocation & HFSDBG_EXT_CACHE_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_REMOVE_EXTENT_CACHE | DBG_FUNC_START, startBlock, blockCount, 0, 0, 0); + + lck_spin_lock(&hfsmp->vcbFreeExtLock); + + for (i = 0; i < (int)hfsmp->vcbFreeExtCnt; i++) { + start = hfsmp->vcbFreeExt[i].startBlock; + end = start + hfsmp->vcbFreeExt[i].blockCount; + + /* If the extent to remove from free extent list starts within + * this free extent, or, if it starts before this free extent + * but ends in this free extent, remove it by shifting all other + * extents. + */ + if (((startBlock >= start) && (startBlock < end)) || + ((startBlock < start) && (startBlock + blockCount) > start)) { + for (j = i; j < (int)hfsmp->vcbFreeExtCnt - 1; j++) { + hfsmp->vcbFreeExt[j] = hfsmp->vcbFreeExt[j+1]; + } + hfsmp->vcbFreeExtCnt--; + /* Decrement the index so that we check the extent + * that just got shifted to the current index. + */ + i--; + extentsRemoved++; + } + /* Continue looping as we might have to invalidate multiple extents, + * probably not possible in normal case, but does not hurt. + */ + } + + lck_spin_unlock(&hfsmp->vcbFreeExtLock); + + sanity_check_free_ext(hfsmp, 0); + + if (hfs_kdebug_allocation & HFSDBG_EXT_CACHE_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_REMOVE_EXTENT_CACHE | DBG_FUNC_END, 0, 0, 0, extentsRemoved, 0); return; } -/* Check whether free extent cache is active or not. - * This cache is invalidated and disabled when a volume is being resized - * (via hfs_trucatefs() or hfs_extendefs()). +/* + * Add an entry to free extent cache after it has been deallocated. 
* - * This function assumes that the caller is holding the lock on - * the mount point. + * If the extent provided has blocks beyond current allocLimit, it + * is clipped to allocLimit. This function does not merge contiguous + * extents, if they already exist in the list. * - * Returns: 0 if the cache is not active, - * 1 if the cache is active. + * Inputs: + * hfsmp - mount point structure + * startBlock - starting block of the extent to be removed. + * blockCount - number of blocks of the extent to be removed. + * + * Returns: + * true - if the extent was added successfully to the list + * false - if the extent was no added to the list, maybe because + * the extent was beyond allocLimit, or is not best + * candidate to be put in the cache. */ -static int free_extent_cache_active(ExtendedVCB *vcb) +static Boolean add_free_extent_cache(struct hfsmount *hfsmp, u_int32_t startBlock, u_int32_t blockCount) { - int retval = 1; + Boolean retval = false; + u_int32_t start, end; + int i; + + if (hfs_kdebug_allocation & HFSDBG_EXT_CACHE_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_ADD_EXTENT_CACHE | DBG_FUNC_START, startBlock, blockCount, 0, 0, 0); + + /* + * If using the red-black tree allocator, then there's no need to special case + * for the sparse device case. We'll simply add the region we've recently freed + * to the red-black tree, where it will get sorted by offset and length. The only special + * casing will need to be done on the allocation side, where we may favor free extents + * based on offset even if it will cause fragmentation. This may be true, for example, if + * we are trying to reduce the number of bandfiles created in a sparse bundle disk image. 
+ */ +#if CONFIG_HFS_ALLOC_RBTREE + if (hfs_isrbtree_active(hfsmp) == true) { + goto out_not_locked; + } +#endif + + /* No need to add extent that is beyond current allocLimit */ + if (startBlock >= hfsmp->allocLimit) { + goto out_not_locked; + } + + /* If end of the free extent is beyond current allocLimit, clip the extent */ + if ((startBlock + blockCount) > hfsmp->allocLimit) { + blockCount = hfsmp->allocLimit - startBlock; + } + + lck_spin_lock(&hfsmp->vcbFreeExtLock); - if (vcb->hfs_flags & HFS_RESIZE_IN_PROGRESS) { - retval = 0; + /* If the free extent cache is full and the new extent fails to + * compare with the last extent, skip adding it to the list. + */ + if (hfsmp->vcbFreeExtCnt == kMaxFreeExtents) { + if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { + /* For sparse disks, free extent cache list is sorted by start block, lowest first */ + if (startBlock > hfsmp->vcbFreeExt[kMaxFreeExtents-1].startBlock) { + goto out; + } + } else { + /* For normal mounts, free extent cache list is sorted by total blocks, highest first */ + if (blockCount <= hfsmp->vcbFreeExt[kMaxFreeExtents-1].blockCount) { + goto out; + } + } } + + /* Check if the current extent overlaps with any of the existing + * extents. If yes, just skip adding it to the list. We have + * to do this check before shifting the extent records. + */ + for (i = 0; i < (int)hfsmp->vcbFreeExtCnt; i++) { + + start = hfsmp->vcbFreeExt[i].startBlock; + end = start + hfsmp->vcbFreeExt[i].blockCount; + + if (((startBlock >= start) && (startBlock < end)) || + ((startBlock < start) && (startBlock + blockCount) > start)) { + goto out; + } + } + + /* Scan the free extent cache array from tail to head till + * we find the entry after which our new entry should be + * inserted. After we break out of this loop, the new entry + * will be inserted at 'i+1'. 
+ */ + for (i = (int)hfsmp->vcbFreeExtCnt-1; i >= 0; i--) { + if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { + /* For sparse devices, find entry with smaller start block than ours */ + if (hfsmp->vcbFreeExt[i].startBlock < startBlock) { + break; + } + } else { + /* For normal devices, find entry with greater block count than ours */ + if (hfsmp->vcbFreeExt[i].blockCount >= blockCount) { + break; + } + } + + /* If this is not the right spot to insert, and this is + * not the last entry in the array, just shift it and + * continue check another one. + */ + if ((i+1) < kMaxFreeExtents) { + hfsmp->vcbFreeExt[i+1] = hfsmp->vcbFreeExt[i]; + } + } + /* 'i' points to one index offset before which the new extent should be inserted */ + hfsmp->vcbFreeExt[i+1].startBlock = startBlock; + hfsmp->vcbFreeExt[i+1].blockCount = blockCount; + if (hfsmp->vcbFreeExtCnt < kMaxFreeExtents) { + hfsmp->vcbFreeExtCnt++; + } + retval = true; + +out: + lck_spin_unlock(&hfsmp->vcbFreeExtLock); +out_not_locked: + sanity_check_free_ext(hfsmp, 0); + + if (hfs_kdebug_allocation & HFSDBG_EXT_CACHE_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_ADD_EXTENT_CACHE | DBG_FUNC_END, 0, 0, 0, retval, 0); + return retval; } + +/* Debug function to check if the free extent cache is good or not */ +static void sanity_check_free_ext(struct hfsmount *hfsmp, int check_allocated) +{ + u_int32_t i, j; + + /* Do not do anything if debug is not on, or if we're using the red-black tree */ + if ((ALLOC_DEBUG == 0) || (hfs_isrbtree_active(hfsmp) == true)) { + return; + } + + lck_spin_lock(&hfsmp->vcbFreeExtLock); + + /* + * Iterate the Free extent cache and ensure no entries are bogus or refer to + * allocated blocks. + */ + for(i=0; i < hfsmp->vcbFreeExtCnt; i++) { + u_int32_t start, nblocks; + + start = hfsmp->vcbFreeExt[i].startBlock; + nblocks = hfsmp->vcbFreeExt[i].blockCount; + + //printf ("hfs: %p: slot:%d (%u,%u)\n", hfsmp, i, start, nblocks); + + /* Check if any of the blocks in free extent cache are allocated. 
+ * This should not be enabled always because it might take + * very long for large extents that get added to the list. + * + * We have to drop vcbFreeExtLock while we call hfs_isallocated + * because it is going to do I/O. Note that the free extent + * cache could change. That's a risk we take when using this + * debugging code. (Another alternative would be to try to + * detect when the free extent cache changed, and perhaps + * restart if the list changed while we dropped the lock.) + */ + if (check_allocated) { + lck_spin_unlock(&hfsmp->vcbFreeExtLock); + if (hfs_isallocated(hfsmp, start, nblocks)) { + panic("hfs: %p: slot %d:(%u,%u) in the free extent array is allocated\n", + hfsmp, i, start, nblocks); + } + lck_spin_lock(&hfsmp->vcbFreeExtLock); + } + + /* Check if any part of the extent is beyond allocLimit */ + if ((start > hfsmp->allocLimit) || ((start + nblocks) > hfsmp->allocLimit)) { + panic ("hfs: %p: slot %d:(%u,%u) in the free extent array is beyond allocLimit=%u\n", + hfsmp, i, start, nblocks, hfsmp->allocLimit); + } + + /* Check if there are any duplicate start blocks */ + for(j=i+1; j < hfsmp->vcbFreeExtCnt; j++) { + if (start == hfsmp->vcbFreeExt[j].startBlock) { + panic("hfs: %p: slot %d:(%u,%u) and %d:(%u,%u) are duplicate\n", + hfsmp, i, start, nblocks, j, hfsmp->vcbFreeExt[j].startBlock, + hfsmp->vcbFreeExt[j].blockCount); + } + } + + /* Check if the entries are out of order */ + if ((i+1) != hfsmp->vcbFreeExtCnt) { + if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { + /* sparse devices are sorted by starting block number (ascending) */ + if (hfsmp->vcbFreeExt[i].startBlock > hfsmp->vcbFreeExt[i+1].startBlock) { + panic ("hfs: %p: SPARSE %d:(%u,%u) and %d:(%u,%u) are out of order\n", + hfsmp, i, start, nblocks, i+1, hfsmp->vcbFreeExt[i+1].startBlock, + hfsmp->vcbFreeExt[i+1].blockCount); + } + } else { + /* normally sorted by block count (descending) */ + if (hfsmp->vcbFreeExt[i].blockCount < hfsmp->vcbFreeExt[i+1].blockCount) { + panic ("hfs: 
%p: %d:(%u,%u) and %d:(%u,%u) are out of order\n", + hfsmp, i, start, nblocks, i+1, hfsmp->vcbFreeExt[i+1].startBlock, + hfsmp->vcbFreeExt[i+1].blockCount); + } + } + } + } + lck_spin_unlock(&hfsmp->vcbFreeExtLock); +} diff --git a/bsd/hfs/hfscommon/headers/FileMgrInternal.h b/bsd/hfs/hfscommon/headers/FileMgrInternal.h index 307178907..7276daa26 100644 --- a/bsd/hfs/hfscommon/headers/FileMgrInternal.h +++ b/bsd/hfs/hfscommon/headers/FileMgrInternal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -32,7 +32,7 @@ Version: HFS Plus 1.0 - Copyright: � 1996-2001 by Apple Computer, Inc., all rights reserved. + Copyright: � 1996-2001 by Apple Computer, Inc., all rights reserved. */ #ifndef __FILEMGRINTERNAL__ @@ -189,6 +189,8 @@ ExchangeFileIDs (ExtendedVCB * volume, u_int32_t srcHint, u_int32_t destHint ); +EXTERN_API_C( OSErr ) +MoveData( ExtendedVCB *vcb, HFSCatalogNodeID srcID, HFSCatalogNodeID destID, int rsrc); /* BTree Manager Routines*/ @@ -232,7 +234,7 @@ BlockDeallocate (ExtendedVCB * vcb, u_int32_t flags); EXTERN_API_C ( void ) -invalidate_free_extent_cache (ExtendedVCB * vcb); +ResetVCBFreeExtCache(struct hfsmount *hfsmp); EXTERN_API_C( OSErr ) BlockMarkAllocated(ExtendedVCB *vcb, u_int32_t startingBlock, u_int32_t numBlocks); @@ -240,8 +242,28 @@ BlockMarkAllocated(ExtendedVCB *vcb, u_int32_t startingBlock, u_int32_t numBlock EXTERN_API_C( OSErr ) BlockMarkFree( ExtendedVCB *vcb, u_int32_t startingBlock, u_int32_t numBlocks); +EXTERN_API_C( OSErr ) +BlockMarkFreeUnused( ExtendedVCB *vcb, u_int32_t startingBlock, u_int32_t numBlocks); + EXTERN_API_C( u_int32_t ) MetaZoneFreeBlocks(ExtendedVCB *vcb); + +EXTERN_API_C( u_int32_t ) +UpdateAllocLimit (struct hfsmount *hfsmp, u_int32_t new_end_block); + +#if CONFIG_HFS_ALLOC_RBTREE +EXTERN_API_C( u_int32_t ) +GenerateTree( struct hfsmount *hfsmp, u_int32_t 
end_block, int *flags, int initialscan); + +EXTERN_API_C( void ) +DestroyTrees( struct hfsmount *hfsmp); + +EXTERN_API_C( u_int32_t ) +InitTree(struct hfsmount *hfsmp); +#endif + + + /* File Extent Mapping routines*/ EXTERN_API_C( OSErr ) @@ -256,11 +278,9 @@ CompareExtentKeysPlus (const HFSPlusExtentKey *searchKey, const HFSPlusExtentKey *trialKey); EXTERN_API_C( OSErr ) -TruncateFileC (ExtendedVCB * vcb, - FCB * fcb, - int64_t peof, - Boolean truncateToExtent); - +TruncateFileC (ExtendedVCB *vcb, FCB *fcb, int64_t peof, int deleted, + int rsrc, uint32_t fileid, Boolean truncateToExtent); + EXTERN_API_C( OSErr ) ExtendFileC (ExtendedVCB * vcb, FCB * fcb, diff --git a/bsd/hfs/hfscommon/headers/HybridAllocator.h b/bsd/hfs/hfscommon/headers/HybridAllocator.h new file mode 100644 index 000000000..4add9daee --- /dev/null +++ b/bsd/hfs/hfscommon/headers/HybridAllocator.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + + +#ifndef __HYBRID_ALLOC__ +#define __HYBRID_ALLOC__ + +#include +#include "RedBlackTree.h" + +typedef struct extent_node extent_node_t; + +struct extent_node +{ + u_int32_t length; + u_int32_t offset; + struct extent_node *offset_next; + rb_node(extent_node_t) offset_link; +}; + +typedef rb_tree(extent_node_t) extent_tree_offset_t; + +extern extent_node_t * +alloc_node(u_int32_t length, u_int32_t offset); + +extern void +free_node(extent_node_t *node); + +extern extent_node_t * +extent_tree_free_space( extent_tree_offset_t *offset_tree, u_int32_t size, u_int32_t offset); + +extern void +extent_tree_offset_print(extent_tree_offset_t *offset_tree); + +extern int32_t +extent_tree_offset_alloc_space(extent_tree_offset_t *offset_tree, u_int32_t size, u_int32_t offset); + +extern int32_t +extent_tree_offset_alloc_unaligned(extent_tree_offset_t *tree, u_int32_t size, u_int32_t offset); + + +extern void +extent_tree_remove_node (extent_tree_offset_t *offset_tree, extent_node_t * node); + +extern extent_node_t * +extent_tree_off_first (extent_tree_offset_t *offset_tree); + +extern extent_node_t * +extent_tree_off_search(extent_tree_offset_t *offset_tree, extent_node_t *node); + +extern extent_node_t * +extent_tree_off_search_next(extent_tree_offset_t *offset_tree, extent_node_t *node); + +extern extent_node_t* +extent_tree_off_search_nextWithSize (extent_tree_offset_t *offset_tree, extent_node_t *node); + +extern extent_node_t * 
+extent_tree_off_search_prev(extent_tree_offset_t *offset_tree, extent_node_t *node); + +extern extent_node_t * +extent_tree_off_next(extent_tree_offset_t *offset_tree, extent_node_t *node); + +extern extent_node_t * +extent_tree_off_prev(extent_tree_offset_t *offset_tree, extent_node_t *node); + +extern void +extent_tree_init(extent_tree_offset_t *offset_tree); + +extern void +extent_tree_destroy(extent_tree_offset_t *offset_tree); + +extern int +cmp_offset_node(extent_node_t *node_1, extent_node_t *node_2); + + +#endif diff --git a/bsd/hfs/hfscommon/headers/RedBlackTree.h b/bsd/hfs/hfscommon/headers/RedBlackTree.h new file mode 100644 index 000000000..21342296c --- /dev/null +++ b/bsd/hfs/hfscommon/headers/RedBlackTree.h @@ -0,0 +1,969 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/****************************************************************************** + * + * Copyright (C) 2008 Jason Evans . + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice(s), this list of conditions and the following disclaimer + * unmodified other than the allowable addition of one or more + * copyright notices. + * 2. Redistributions in binary form must reproduce the above copyright + * notice(s), this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ****************************************************************************** + * + * cpp macro implementation of left-leaning red-black trees. + * + * Usage: + * + * (Optional, see assert(3).) + * #define NDEBUG + * + * (Required.) + * #include + * #include + * ... 
+ * + * All operations are done non-recursively. Parent pointers are not used, and + * color bits are stored in the least significant bit of right-child pointers, + * thus making node linkage as compact as is possible for red-black trees. + * + * Some macros use a comparison function pointer, which is expected to have the + * following prototype: + * + * int (a_cmp *)(a_type *a_node, a_type *a_other); + * ^^^^^^ + * or a_key + * + * Interpretation of comparison function return values: + * + * -1 : a_node < a_other + * 0 : a_node == a_other + * 1 : a_node > a_other + * + * In all cases, the a_node or a_key macro argument is the first argument to the + * comparison function, which makes it possible to write comparison functions + * that treat the first argument specially. + * + ******************************************************************************/ + +#ifndef RB_H_ +#define RB_H_ + +#define RB_COMPACT +#ifdef RB_COMPACT +/* Node structure. */ +#define rb_node(a_type) \ +struct { \ + a_type *rbn_left; \ + a_type *rbn_right_red; \ +} +#else +#define rb_node(a_type) \ +struct { \ + a_type *rbn_left; \ + a_type *rbn_right; \ + bool rbn_red; \ +} +#endif + +/* Root structure. */ +#define rb_tree(a_type) \ +struct { \ + a_type *rbt_root; \ + a_type rbt_nil; \ +} + +/* Left accessors. */ +#define rbp_left_get(a_type, a_field, a_node) \ + ((a_node)->a_field.rbn_left) +#define rbp_left_set(a_type, a_field, a_node, a_left) do { \ + (a_node)->a_field.rbn_left = a_left; \ +} while (0) + +#ifdef RB_COMPACT +/* Right accessors. */ +#define rbp_right_get(a_type, a_field, a_node) \ + ((a_type *) (((intptr_t) (a_node)->a_field.rbn_right_red) \ + & ((ssize_t)-2))) +#define rbp_right_set(a_type, a_field, a_node, a_right) do { \ + (a_node)->a_field.rbn_right_red = (a_type *) (((uintptr_t) a_right) \ + | (((uintptr_t) (a_node)->a_field.rbn_right_red) & ((size_t)1))); \ +} while (0) + +/* Color accessors.
*/ +#define rbp_red_get(a_type, a_field, a_node) \ + ((bool) (((uintptr_t) (a_node)->a_field.rbn_right_red) \ + & ((size_t)1))) +#define rbp_color_set(a_type, a_field, a_node, a_red) do { \ + (a_node)->a_field.rbn_right_red = (a_type *) ((((intptr_t) \ + (a_node)->a_field.rbn_right_red) & ((ssize_t)-2)) \ + | ((ssize_t)a_red)); \ +} while (0) +#define rbp_red_set(a_type, a_field, a_node) do { \ + (a_node)->a_field.rbn_right_red = (a_type *) (((uintptr_t) \ + (a_node)->a_field.rbn_right_red) | ((size_t)1)); \ +} while (0) +#define rbp_black_set(a_type, a_field, a_node) do { \ + (a_node)->a_field.rbn_right_red = (a_type *) (((intptr_t) \ + (a_node)->a_field.rbn_right_red) & ((ssize_t)-2)); \ +} while (0) +#else +/* Right accessors. */ +#define rbp_right_get(a_type, a_field, a_node) \ + ((a_node)->a_field.rbn_right) +#define rbp_right_set(a_type, a_field, a_node, a_right) do { \ + (a_node)->a_field.rbn_right = a_right; \ +} while (0) + +/* Color accessors. */ +#define rbp_red_get(a_type, a_field, a_node) \ + ((a_node)->a_field.rbn_red) +#define rbp_color_set(a_type, a_field, a_node, a_red) do { \ + (a_node)->a_field.rbn_red = (a_red); \ +} while (0) +#define rbp_red_set(a_type, a_field, a_node) do { \ + (a_node)->a_field.rbn_red = true; \ +} while (0) +#define rbp_black_set(a_type, a_field, a_node) do { \ + (a_node)->a_field.rbn_red = false; \ +} while (0) +#endif + +/* Node initializer. */ +#define rbp_node_new(a_type, a_field, a_tree, a_node) do { \ + rbp_left_set(a_type, a_field, (a_node), &(a_tree)->rbt_nil); \ + rbp_right_set(a_type, a_field, (a_node), &(a_tree)->rbt_nil); \ + rbp_red_set(a_type, a_field, (a_node)); \ +} while (0) + +/* Tree initializer. */ +#define rb_new(a_type, a_field, a_tree) do { \ + (a_tree)->rbt_root = &(a_tree)->rbt_nil; \ + rbp_node_new(a_type, a_field, a_tree, &(a_tree)->rbt_nil); \ + rbp_black_set(a_type, a_field, &(a_tree)->rbt_nil); \ +} while (0) + +/* Tree operations. 
*/ +#define rbp_black_height(a_type, a_field, a_tree, r_height) do { \ + a_type *rbp_bh_t; \ + for (rbp_bh_t = (a_tree)->rbt_root, (r_height) = 0; \ + rbp_bh_t != &(a_tree)->rbt_nil; \ + rbp_bh_t = rbp_left_get(a_type, a_field, rbp_bh_t)) { \ + if (rbp_red_get(a_type, a_field, rbp_bh_t) == false) { \ + (r_height)++; \ + } \ + } \ +} while (0) + +#define rbp_first(a_type, a_field, a_tree, a_root, r_node) do { \ + for ((r_node) = (a_root); \ + rbp_left_get(a_type, a_field, (r_node)) != &(a_tree)->rbt_nil; \ + (r_node) = rbp_left_get(a_type, a_field, (r_node))) { \ + } \ +} while (0) + +#define rbp_last(a_type, a_field, a_tree, a_root, r_node) do { \ + for ((r_node) = (a_root); \ + rbp_right_get(a_type, a_field, (r_node)) != &(a_tree)->rbt_nil; \ + (r_node) = rbp_right_get(a_type, a_field, (r_node))) { \ + } \ +} while (0) + +#define rbp_next(a_type, a_field, a_cmp, a_tree, a_node, r_node) do { \ + if (rbp_right_get(a_type, a_field, (a_node)) != &(a_tree)->rbt_nil) { \ + rbp_first(a_type, a_field, a_tree, rbp_right_get(a_type, \ + a_field, (a_node)), (r_node)); \ + } else { \ + a_type *rbp_n_t = (a_tree)->rbt_root; \ + assert(rbp_n_t != &(a_tree)->rbt_nil); \ + (r_node) = &(a_tree)->rbt_nil; \ + while (true) { \ + int rbp_n_cmp = (a_cmp)((a_node), rbp_n_t); \ + if (rbp_n_cmp < 0) { \ + (r_node) = rbp_n_t; \ + rbp_n_t = rbp_left_get(a_type, a_field, rbp_n_t); \ + } else if (rbp_n_cmp > 0) { \ + rbp_n_t = rbp_right_get(a_type, a_field, rbp_n_t); \ + } else { \ + break; \ + } \ + assert(rbp_n_t != &(a_tree)->rbt_nil); \ + } \ + } \ +} while (0) + +#define rbp_prev(a_type, a_field, a_cmp, a_tree, a_node, r_node) do { \ + if (rbp_left_get(a_type, a_field, (a_node)) != &(a_tree)->rbt_nil) {\ + rbp_last(a_type, a_field, a_tree, rbp_left_get(a_type, \ + a_field, (a_node)), (r_node)); \ + } else { \ + a_type *rbp_p_t = (a_tree)->rbt_root; \ + assert(rbp_p_t != &(a_tree)->rbt_nil); \ + (r_node) = &(a_tree)->rbt_nil; \ + while (true) { \ + int rbp_p_cmp = (a_cmp)((a_node), 
rbp_p_t); \ + if (rbp_p_cmp < 0) { \ + rbp_p_t = rbp_left_get(a_type, a_field, rbp_p_t); \ + } else if (rbp_p_cmp > 0) { \ + (r_node) = rbp_p_t; \ + rbp_p_t = rbp_right_get(a_type, a_field, rbp_p_t); \ + } else { \ + break; \ + } \ + assert(rbp_p_t != &(a_tree)->rbt_nil); \ + } \ + } \ +} while (0) + +#define rb_first(a_type, a_field, a_tree, r_node) do { \ + rbp_first(a_type, a_field, a_tree, (a_tree)->rbt_root, (r_node)); \ + if ((r_node) == &(a_tree)->rbt_nil) { \ + (r_node) = NULL; \ + } \ +} while (0) + +#define rb_last(a_type, a_field, a_tree, r_node) do { \ + rbp_last(a_type, a_field, a_tree, (a_tree)->rbt_root, r_node); \ + if ((r_node) == &(a_tree)->rbt_nil) { \ + (r_node) = NULL; \ + } \ +} while (0) + +#define rb_next(a_type, a_field, a_cmp, a_tree, a_node, r_node) do { \ + rbp_next(a_type, a_field, a_cmp, a_tree, (a_node), (r_node)); \ + if ((r_node) == &(a_tree)->rbt_nil) { \ + (r_node) = NULL; \ + } \ +} while (0) + +#define rb_prev(a_type, a_field, a_cmp, a_tree, a_node, r_node) do { \ + rbp_prev(a_type, a_field, a_cmp, a_tree, (a_node), (r_node)); \ + if ((r_node) == &(a_tree)->rbt_nil) { \ + (r_node) = NULL; \ + } \ +} while (0) + +#define rb_search(a_type, a_field, a_cmp, a_tree, a_key, r_node) do { \ + int rbp_se_cmp; \ + (r_node) = (a_tree)->rbt_root; \ + while ((r_node) != &(a_tree)->rbt_nil && (rbp_se_cmp = (a_cmp)((a_key), (r_node))) != 0) { \ + if (rbp_se_cmp < 0) { \ + (r_node) = rbp_left_get(a_type, a_field, (r_node)); \ + } else { \ + (r_node) = rbp_right_get(a_type, a_field, (r_node)); \ + } \ + } \ + if ((r_node) == &(a_tree)->rbt_nil) { \ + (r_node) = NULL; \ + } \ +} while (0) + +/* + * Find a match if it exists. Otherwise, find the next greater node, if one + * exists. 
+ */ +#define rb_nsearch(a_type, a_field, a_cmp, a_tree, a_key, r_node) do { \ + a_type *rbp_ns_t = (a_tree)->rbt_root; \ + (r_node) = NULL; \ + while (rbp_ns_t != &(a_tree)->rbt_nil) { \ + int rbp_ns_cmp = (a_cmp)((a_key), rbp_ns_t); \ + if (rbp_ns_cmp < 0) { \ + (r_node) = rbp_ns_t; \ + rbp_ns_t = rbp_left_get(a_type, a_field, rbp_ns_t); \ + } else if (rbp_ns_cmp > 0) { \ + rbp_ns_t = rbp_right_get(a_type, a_field, rbp_ns_t); \ + } else { \ + (r_node) = rbp_ns_t; \ + break; \ + } \ + } \ +} while (0) + +/* + * Find a match if it exists. Otherwise, find the previous lesser node, if one + * exists. + */ +#define rb_psearch(a_type, a_field, a_cmp, a_tree, a_key, r_node) do { \ + a_type *rbp_ps_t = (a_tree)->rbt_root; \ + (r_node) = NULL; \ + while (rbp_ps_t != &(a_tree)->rbt_nil) { \ + int rbp_ps_cmp = (a_cmp)((a_key), rbp_ps_t); \ + if (rbp_ps_cmp < 0) { \ + rbp_ps_t = rbp_left_get(a_type, a_field, rbp_ps_t); \ + } else if (rbp_ps_cmp > 0) { \ + (r_node) = rbp_ps_t; \ + rbp_ps_t = rbp_right_get(a_type, a_field, rbp_ps_t); \ + } else { \ + (r_node) = rbp_ps_t; \ + break; \ + } \ + } \ +} while (0) + +#define rbp_rotate_left(a_type, a_field, a_node, r_node) do { \ + (r_node) = rbp_right_get(a_type, a_field, (a_node)); \ + rbp_right_set(a_type, a_field, (a_node), rbp_left_get(a_type, a_field, (r_node))); \ + rbp_left_set(a_type, a_field, (r_node), (a_node)); \ +} while (0) + +#define rbp_rotate_right(a_type, a_field, a_node, r_node) do { \ + (r_node) = rbp_left_get(a_type, a_field, (a_node)); \ + rbp_left_set(a_type, a_field, (a_node), rbp_right_get(a_type, a_field, (r_node))); \ + rbp_right_set(a_type, a_field, (r_node), (a_node)); \ +} while (0) + +#define rbp_lean_left(a_type, a_field, a_node, r_node) do { \ + bool rbp_ll_red; \ + rbp_rotate_left(a_type, a_field, (a_node), (r_node)); \ + rbp_ll_red = rbp_red_get(a_type, a_field, (a_node)); \ + rbp_color_set(a_type, a_field, (r_node), rbp_ll_red); \ + rbp_red_set(a_type, a_field, (a_node)); \ +} while (0) + +#define 
rbp_lean_right(a_type, a_field, a_node, r_node) do { \ + bool rbp_lr_red; \ + rbp_rotate_right(a_type, a_field, (a_node), (r_node)); \ + rbp_lr_red = rbp_red_get(a_type, a_field, (a_node)); \ + rbp_color_set(a_type, a_field, (r_node), rbp_lr_red); \ + rbp_red_set(a_type, a_field, (a_node)); \ +} while (0) + +#define rbp_move_red_left(a_type, a_field, a_node, r_node) do { \ + a_type *rbp_mrl_t, *rbp_mrl_u; \ + rbp_mrl_t = rbp_left_get(a_type, a_field, (a_node)); \ + rbp_red_set(a_type, a_field, rbp_mrl_t); \ + rbp_mrl_t = rbp_right_get(a_type, a_field, (a_node)); \ + rbp_mrl_u = rbp_left_get(a_type, a_field, rbp_mrl_t); \ + if (rbp_red_get(a_type, a_field, rbp_mrl_u)) { \ + rbp_rotate_right(a_type, a_field, rbp_mrl_t, rbp_mrl_u); \ + rbp_right_set(a_type, a_field, (a_node), rbp_mrl_u); \ + rbp_rotate_left(a_type, a_field, (a_node), (r_node)); \ + rbp_mrl_t = rbp_right_get(a_type, a_field, (a_node)); \ + if (rbp_red_get(a_type, a_field, rbp_mrl_t)) { \ + rbp_black_set(a_type, a_field, rbp_mrl_t); \ + rbp_red_set(a_type, a_field, (a_node)); \ + rbp_rotate_left(a_type, a_field, (a_node), rbp_mrl_t); \ + rbp_left_set(a_type, a_field, (r_node), rbp_mrl_t); \ + } else { \ + rbp_black_set(a_type, a_field, (a_node)); \ + } \ + } else { \ + rbp_red_set(a_type, a_field, (a_node)); \ + rbp_rotate_left(a_type, a_field, (a_node), (r_node)); \ + } \ +} while (0) + +#define rbp_move_red_right(a_type, a_field, a_node, r_node) do { \ + a_type *rbp_mrr_t; \ + rbp_mrr_t = rbp_left_get(a_type, a_field, (a_node)); \ + if (rbp_red_get(a_type, a_field, rbp_mrr_t)) { \ + a_type *rbp_mrr_u, *rbp_mrr_v; \ + rbp_mrr_u = rbp_right_get(a_type, a_field, rbp_mrr_t); \ + rbp_mrr_v = rbp_left_get(a_type, a_field, rbp_mrr_u); \ + if (rbp_red_get(a_type, a_field, rbp_mrr_v)) { \ + rbp_color_set(a_type, a_field, rbp_mrr_u, rbp_red_get(a_type, a_field, (a_node))); \ + rbp_black_set(a_type, a_field, rbp_mrr_v); \ + rbp_rotate_left(a_type, a_field, rbp_mrr_t, rbp_mrr_u); \ + rbp_left_set(a_type, a_field, 
(a_node), rbp_mrr_u); \ + rbp_rotate_right(a_type, a_field, (a_node), (r_node)); \ + rbp_rotate_left(a_type, a_field, (a_node), rbp_mrr_t); \ + rbp_right_set(a_type, a_field, (r_node), rbp_mrr_t); \ + } else { \ + rbp_color_set(a_type, a_field, rbp_mrr_t, rbp_red_get(a_type, a_field, (a_node))); \ + rbp_red_set(a_type, a_field, rbp_mrr_u); \ + rbp_rotate_right(a_type, a_field, (a_node), (r_node)); \ + rbp_rotate_left(a_type, a_field, (a_node), rbp_mrr_t); \ + rbp_right_set(a_type, a_field, (r_node), rbp_mrr_t); \ + } \ + rbp_red_set(a_type, a_field, (a_node)); \ + } else { \ + rbp_red_set(a_type, a_field, rbp_mrr_t); \ + rbp_mrr_t = rbp_left_get(a_type, a_field, rbp_mrr_t); \ + if (rbp_red_get(a_type, a_field, rbp_mrr_t)) { \ + rbp_black_set(a_type, a_field, rbp_mrr_t); \ + rbp_rotate_right(a_type, a_field, (a_node), (r_node)); \ + rbp_rotate_left(a_type, a_field, (a_node), rbp_mrr_t); \ + rbp_right_set(a_type, a_field, (r_node), rbp_mrr_t); \ + } else { \ + rbp_rotate_left(a_type, a_field, (a_node), (r_node)); \ + } \ + } \ +} while (0) + +#define rb_insert(a_type, a_field, a_cmp, a_tree, a_node) do { \ + a_type rbp_i_s; \ + a_type *rbp_i_g, *rbp_i_p, *rbp_i_c, *rbp_i_t, *rbp_i_u; \ + int rbp_i_cmp = 0; \ + rbp_i_g = &(a_tree)->rbt_nil; \ + rbp_left_set(a_type, a_field, &rbp_i_s, (a_tree)->rbt_root); \ + rbp_right_set(a_type, a_field, &rbp_i_s, &(a_tree)->rbt_nil); \ + rbp_black_set(a_type, a_field, &rbp_i_s); \ + rbp_i_p = &rbp_i_s; \ + rbp_i_c = (a_tree)->rbt_root; \ + /* Iteratively search down the tree for the insertion point, */\ + /* splitting 4-nodes as they are encountered. At the end of each */\ + /* iteration, rbp_i_g->rbp_i_p->rbp_i_c is a 3-level path down */\ + /* the tree, assuming a sufficiently deep tree. 
*/\ + while (rbp_i_c != &(a_tree)->rbt_nil) { \ + rbp_i_t = rbp_left_get(a_type, a_field, rbp_i_c); \ + rbp_i_u = rbp_left_get(a_type, a_field, rbp_i_t); \ + if (rbp_red_get(a_type, a_field, rbp_i_t) \ + && rbp_red_get(a_type, a_field, rbp_i_u)) { \ + /* rbp_i_c is the top of a logical 4-node, so split it. */\ + /* This iteration does not move down the tree, due to the */\ + /* disruptiveness of node splitting. */\ + /* */\ + /* Rotate right. */\ + rbp_rotate_right(a_type, a_field, rbp_i_c, rbp_i_t); \ + /* Pass red links up one level. */\ + rbp_i_u = rbp_left_get(a_type, a_field, rbp_i_t); \ + rbp_black_set(a_type, a_field, rbp_i_u); \ + if (rbp_left_get(a_type, a_field, rbp_i_p) == rbp_i_c) { \ + rbp_left_set(a_type, a_field, rbp_i_p, rbp_i_t); \ + rbp_i_c = rbp_i_t; \ + } else { \ + /* rbp_i_c was the right child of rbp_i_p, so rotate */\ + /* left in order to maintain the left-leaning */\ + /* invariant. */\ + assert(rbp_right_get(a_type, a_field, rbp_i_p) == rbp_i_c); \ + rbp_right_set(a_type, a_field, rbp_i_p, rbp_i_t); \ + rbp_lean_left(a_type, a_field, rbp_i_p, rbp_i_u); \ + if (rbp_left_get(a_type, a_field, rbp_i_g) == rbp_i_p) {\ + rbp_left_set(a_type, a_field, rbp_i_g, rbp_i_u); \ + } else { \ + assert(rbp_right_get(a_type, a_field, rbp_i_g) == rbp_i_p); \ + rbp_right_set(a_type, a_field, rbp_i_g, rbp_i_u); \ + } \ + rbp_i_p = rbp_i_u; \ + rbp_i_cmp = (a_cmp)((a_node), rbp_i_p); \ + if (rbp_i_cmp < 0) { \ + rbp_i_c = rbp_left_get(a_type, a_field, rbp_i_p); \ + } else { \ + assert(rbp_i_cmp > 0); \ + rbp_i_c = rbp_right_get(a_type, a_field, rbp_i_p); \ + } \ + continue; \ + } \ + } \ + rbp_i_g = rbp_i_p; \ + rbp_i_p = rbp_i_c; \ + rbp_i_cmp = (a_cmp)((a_node), rbp_i_c); \ + if (rbp_i_cmp < 0) { \ + rbp_i_c = rbp_left_get(a_type, a_field, rbp_i_c); \ + } else { \ + assert(rbp_i_cmp > 0); \ + rbp_i_c = rbp_right_get(a_type, a_field, rbp_i_c); \ + } \ + } \ + /* rbp_i_p now refers to the node under which to insert. 
*/\ + rbp_node_new(a_type, a_field, a_tree, (a_node)); \ + if (rbp_i_cmp > 0) { \ + rbp_right_set(a_type, a_field, rbp_i_p, (a_node)); \ + rbp_lean_left(a_type, a_field, rbp_i_p, rbp_i_t); \ + if (rbp_left_get(a_type, a_field, rbp_i_g) == rbp_i_p) { \ + rbp_left_set(a_type, a_field, rbp_i_g, rbp_i_t); \ + } else if (rbp_right_get(a_type, a_field, rbp_i_g) == rbp_i_p) {\ + rbp_right_set(a_type, a_field, rbp_i_g, rbp_i_t); \ + } \ + } else { \ + rbp_left_set(a_type, a_field, rbp_i_p, (a_node)); \ + } \ + /* Update the root and make sure that it is black. */\ + (a_tree)->rbt_root = rbp_left_get(a_type, a_field, &rbp_i_s); \ + rbp_black_set(a_type, a_field, (a_tree)->rbt_root); \ +} while (0) + +#define rb_remove(a_type, a_field, a_cmp, a_tree, a_node) do { \ + a_type rbp_r_s; \ + a_type *rbp_r_p, *rbp_r_c, *rbp_r_xp, *rbp_r_t, *rbp_r_u; \ + int rbp_r_cmp; \ + rbp_left_set(a_type, a_field, &rbp_r_s, (a_tree)->rbt_root); \ + rbp_right_set(a_type, a_field, &rbp_r_s, &(a_tree)->rbt_nil); \ + rbp_black_set(a_type, a_field, &rbp_r_s); \ + rbp_r_p = &rbp_r_s; \ + rbp_r_c = (a_tree)->rbt_root; \ + rbp_r_xp = &(a_tree)->rbt_nil; \ + /* Iterate down the tree, but always transform 2-nodes to 3- or */\ + /* 4-nodes in order to maintain the invariant that the current */\ + /* node is not a 2-node. This allows simple deletion once a leaf */\ + /* is reached. Handle the root specially though, since there may */\ + /* be no way to convert it from a 2-node to a 3-node. */\ + rbp_r_cmp = (a_cmp)((a_node), rbp_r_c); \ + if (rbp_r_cmp < 0) { \ + rbp_r_t = rbp_left_get(a_type, a_field, rbp_r_c); \ + rbp_r_u = rbp_left_get(a_type, a_field, rbp_r_t); \ + if (rbp_red_get(a_type, a_field, rbp_r_t) == false \ + && rbp_red_get(a_type, a_field, rbp_r_u) == false) { \ + /* Apply standard transform to prepare for left move. 
*/\ + rbp_move_red_left(a_type, a_field, rbp_r_c, rbp_r_t); \ + rbp_black_set(a_type, a_field, rbp_r_t); \ + rbp_left_set(a_type, a_field, rbp_r_p, rbp_r_t); \ + rbp_r_c = rbp_r_t; \ + } else { \ + /* Move left. */\ + rbp_r_p = rbp_r_c; \ + rbp_r_c = rbp_left_get(a_type, a_field, rbp_r_c); \ + } \ + } else { \ + if (rbp_r_cmp == 0) { \ + assert((a_node) == rbp_r_c); \ + if (rbp_right_get(a_type, a_field, rbp_r_c) == &(a_tree)->rbt_nil) { \ + /* Delete root node (which is also a leaf node). */\ + if (rbp_left_get(a_type, a_field, rbp_r_c) != &(a_tree)->rbt_nil) { \ + rbp_lean_right(a_type, a_field, rbp_r_c, rbp_r_t); \ + rbp_right_set(a_type, a_field, rbp_r_t, &(a_tree)->rbt_nil); \ + } else { \ + rbp_r_t = &(a_tree)->rbt_nil; \ + } \ + rbp_left_set(a_type, a_field, rbp_r_p, rbp_r_t); \ + } else { \ + /* This is the node we want to delete, but we will */\ + /* instead swap it with its successor and delete the */\ + /* successor. Record enough information to do the */\ + /* swap later. rbp_r_xp is the a_node's parent. */\ + rbp_r_xp = rbp_r_p; \ + rbp_r_cmp = 1; /* Note that deletion is incomplete. */\ + } \ + } \ + if (rbp_r_cmp == 1) { \ + if (rbp_red_get(a_type, a_field, rbp_left_get(a_type, \ + a_field, rbp_right_get(a_type, a_field, rbp_r_c))) == false) { \ + rbp_r_t = rbp_left_get(a_type, a_field, rbp_r_c); \ + if (rbp_red_get(a_type, a_field, rbp_r_t)) { \ + /* Standard transform. */\ + rbp_move_red_right(a_type, a_field, rbp_r_c, rbp_r_t); \ + } else { \ + /* Root-specific transform. 
*/\ + rbp_red_set(a_type, a_field, rbp_r_c); \ + rbp_r_u = rbp_left_get(a_type, a_field, rbp_r_t); \ + if (rbp_red_get(a_type, a_field, rbp_r_u)) { \ + rbp_black_set(a_type, a_field, rbp_r_u); \ + rbp_rotate_right(a_type, a_field, rbp_r_c, rbp_r_t); \ + rbp_rotate_left(a_type, a_field, rbp_r_c, rbp_r_u); \ + rbp_right_set(a_type, a_field, rbp_r_t, rbp_r_u); \ + } else { \ + rbp_red_set(a_type, a_field, rbp_r_t); \ + rbp_rotate_left(a_type, a_field, rbp_r_c, rbp_r_t); \ + } \ + } \ + rbp_left_set(a_type, a_field, rbp_r_p, rbp_r_t); \ + rbp_r_c = rbp_r_t; \ + } else { \ + /* Move right */\ + rbp_r_p = rbp_r_c; \ + rbp_r_c = rbp_right_get(a_type, a_field, rbp_r_c); \ + } \ + } \ + } \ + if (rbp_r_cmp != 0) { \ + while (true) { \ + assert(rbp_r_p != &(a_tree)->rbt_nil); \ + rbp_r_cmp = (a_cmp)((a_node), rbp_r_c); \ + if (rbp_r_cmp < 0) { \ + rbp_r_t = rbp_left_get(a_type, a_field, rbp_r_c); \ + if (rbp_r_t == &(a_tree)->rbt_nil) { \ + /* rbp_r_c now refers to the successor node to */\ + /* relocate, and rbp_r_xp/a_node refer to the */\ + /* context for the relocation. 
*/\ + if (rbp_left_get(a_type, a_field, rbp_r_xp) == (a_node)) { \ + rbp_left_set(a_type, a_field, rbp_r_xp, rbp_r_c); \ + } else { \ + assert(rbp_right_get(a_type, a_field, rbp_r_xp) == (a_node)); \ + rbp_right_set(a_type, a_field, rbp_r_xp, rbp_r_c); \ + } \ + rbp_left_set(a_type, a_field, rbp_r_c, rbp_left_get(a_type, a_field, (a_node))); \ + rbp_right_set(a_type, a_field, rbp_r_c, rbp_right_get(a_type, a_field, (a_node))); \ + rbp_color_set(a_type, a_field, rbp_r_c, rbp_red_get(a_type, a_field, (a_node))); \ + if (rbp_left_get(a_type, a_field, rbp_r_p) == rbp_r_c) { \ + rbp_left_set(a_type, a_field, rbp_r_p, &(a_tree)->rbt_nil); \ + } else { \ + assert(rbp_right_get(a_type, a_field, rbp_r_p) == rbp_r_c); \ + rbp_right_set(a_type, a_field, rbp_r_p, &(a_tree)->rbt_nil); \ + } \ + break; \ + } \ + rbp_r_u = rbp_left_get(a_type, a_field, rbp_r_t); \ + if (rbp_red_get(a_type, a_field, rbp_r_t) == false \ + && rbp_red_get(a_type, a_field, rbp_r_u) == false) { \ + rbp_move_red_left(a_type, a_field, rbp_r_c, rbp_r_t); \ + if (rbp_left_get(a_type, a_field, rbp_r_p) == rbp_r_c) { \ + rbp_left_set(a_type, a_field, rbp_r_p, rbp_r_t);\ + } else { \ + rbp_right_set(a_type, a_field, rbp_r_p, rbp_r_t); \ + } \ + rbp_r_c = rbp_r_t; \ + } else { \ + rbp_r_p = rbp_r_c; \ + rbp_r_c = rbp_left_get(a_type, a_field, rbp_r_c); \ + } \ + } else { \ + /* Check whether to delete this node (it has to be */\ + /* the correct node and a leaf node). */\ + if (rbp_r_cmp == 0) { \ + assert((a_node) == rbp_r_c); \ + if (rbp_right_get(a_type, a_field, rbp_r_c) == &(a_tree)->rbt_nil) { \ + /* Delete leaf node. 
*/\ + if (rbp_left_get(a_type, a_field, rbp_r_c) != &(a_tree)->rbt_nil) { \ + rbp_lean_right(a_type, a_field, rbp_r_c, rbp_r_t); \ + rbp_right_set(a_type, a_field, rbp_r_t, &(a_tree)->rbt_nil); \ + } else { \ + rbp_r_t = &(a_tree)->rbt_nil; \ + } \ + if (rbp_left_get(a_type, a_field, rbp_r_p) == rbp_r_c) { \ + rbp_left_set(a_type, a_field, rbp_r_p, rbp_r_t); \ + } else { \ + rbp_right_set(a_type, a_field, rbp_r_p, rbp_r_t); \ + } \ + break; \ + } else { \ + /* This is the node we want to delete, but we */\ + /* will instead swap it with its successor */\ + /* and delete the successor. Record enough */\ + /* information to do the swap later. */\ + /* rbp_r_xp is a_node's parent. */\ + rbp_r_xp = rbp_r_p; \ + } \ + } \ + rbp_r_t = rbp_right_get(a_type, a_field, rbp_r_c); \ + rbp_r_u = rbp_left_get(a_type, a_field, rbp_r_t); \ + if (rbp_red_get(a_type, a_field, rbp_r_u) == false) { \ + rbp_move_red_right(a_type, a_field, rbp_r_c, \ + rbp_r_t); \ + if (rbp_left_get(a_type, a_field, rbp_r_p) == rbp_r_c) { \ + rbp_left_set(a_type, a_field, rbp_r_p, rbp_r_t);\ + } else { \ + rbp_right_set(a_type, a_field, rbp_r_p, rbp_r_t); \ + } \ + rbp_r_c = rbp_r_t; \ + } else { \ + rbp_r_p = rbp_r_c; \ + rbp_r_c = rbp_right_get(a_type, a_field, rbp_r_c); \ + } \ + } \ + } \ + } \ + /* Update root. */\ + (a_tree)->rbt_root = rbp_left_get(a_type, a_field, &rbp_r_s); \ +} while (0) + +/* + * The rb_wrap() macro provides a convenient way to wrap functions around the + * cpp macros. The main benefits of wrapping are that 1) repeated macro + * expansion can cause code bloat, especially for rb_{insert,remove}(), and + * 2) type, linkage, comparison functions, etc. need not be specified at every + * call point. 
+ */ + +#define rb_wrap(a_attr, a_prefix, a_tree_type, a_type, a_field, a_cmp) \ +a_attr void \ +a_prefix##new(a_tree_type *tree) { \ + rb_new(a_type, a_field, tree); \ +} \ +a_attr a_type * \ +a_prefix##first(a_tree_type *tree) { \ + a_type *ret; \ + rb_first(a_type, a_field, tree, ret); \ + return (ret); \ +} \ +a_attr a_type * \ +a_prefix##last(a_tree_type *tree) { \ + a_type *ret; \ + rb_last(a_type, a_field, tree, ret); \ + return (ret); \ +} \ +a_attr a_type * \ +a_prefix##next(a_tree_type *tree, a_type *node) { \ + a_type *ret; \ + rb_next(a_type, a_field, a_cmp, tree, node, ret); \ + return (ret); \ +} \ +a_attr a_type * \ +a_prefix##prev(a_tree_type *tree, a_type *node) { \ + a_type *ret; \ + rb_prev(a_type, a_field, a_cmp, tree, node, ret); \ + return (ret); \ +} \ +a_attr a_type * \ +a_prefix##search(a_tree_type *tree, a_type *key) { \ + a_type *ret; \ + rb_search(a_type, a_field, a_cmp, tree, key, ret); \ + return (ret); \ +} \ +a_attr a_type * \ +a_prefix##nsearch(a_tree_type *tree, a_type *key) { \ + a_type *ret; \ + rb_nsearch(a_type, a_field, a_cmp, tree, key, ret); \ + return (ret); \ +} \ +a_attr a_type * \ +a_prefix##psearch(a_tree_type *tree, a_type *key) { \ + a_type *ret; \ + rb_psearch(a_type, a_field, a_cmp, tree, key, ret); \ + return (ret); \ +} \ +a_attr void \ +a_prefix##insert(a_tree_type *tree, a_type *node) { \ + rb_insert(a_type, a_field, a_cmp, tree, node); \ +} \ +a_attr void \ +a_prefix##remove(a_tree_type *tree, a_type *node) { \ + rb_remove(a_type, a_field, a_cmp, tree, node); \ +} + +/* + * The iterators simulate recursion via an array of pointers that store the + * current path. This is critical to performance, since a series of calls to + * rb_{next,prev}() would require time proportional to (n lg n), whereas this + * implementation only requires time proportional to (n). + * + * Since the iterators cache a path down the tree, any tree modification may + * cause the cached path to become invalid. 
In order to continue iteration, + * use something like the following sequence: + * + * { + * a_type *node, *tnode; + * + * rb_foreach_begin(a_type, a_field, a_tree, node) { + * ... + * rb_next(a_type, a_field, a_cmp, a_tree, node, tnode); + * rb_remove(a_type, a_field, a_cmp, a_tree, node); + * rb_foreach_next(a_type, a_field, a_cmp, a_tree, tnode); + * ... + * } rb_foreach_end(a_type, a_field, a_tree, node) + * } + * + * Note that this idiom is not advised if every iteration modifies the tree, + * since in that case there is no algorithmic complexity improvement over a + * series of rb_{next,prev}() calls, thus making the setup overhead wasted + * effort. + */ + +#define rb_foreach_begin(a_type, a_field, a_tree, a_var) { /* brace A */ \ + /* Compute the maximum possible tree depth (3X the black height). */\ + unsigned rbp_f_height; \ + rbp_black_height(a_type, a_field, a_tree, rbp_f_height); \ + rbp_f_height *= 3; \ + { /* brace B */ \ + /* Initialize the path to contain the left spine. */\ + a_type *rbp_f_path[rbp_f_height]; \ + a_type *rbp_f_node; \ + bool rbp_f_synced = false; \ + unsigned rbp_f_depth = 0; \ + if ((a_tree)->rbt_root != &(a_tree)->rbt_nil) { \ + rbp_f_path[rbp_f_depth] = (a_tree)->rbt_root; \ + rbp_f_depth++; \ + while ((rbp_f_node = rbp_left_get(a_type, a_field, \ + rbp_f_path[rbp_f_depth-1])) != &(a_tree)->rbt_nil) { \ + rbp_f_path[rbp_f_depth] = rbp_f_node; \ + rbp_f_depth++; \ + } \ + } \ + /* While the path is non-empty, iterate. */\ + while (rbp_f_depth > 0) { /* brace C */ \ + (a_var) = rbp_f_path[rbp_f_depth-1]; + +/* + * Note that rb_foreach_begin omits closing }'s because + * it expects that it will be succeeded by a call to + * rb_foreach_end which will have the closing } + */ + +/* Only use if modifying the tree during iteration. */ +#define rb_foreach_next(a_type, a_field, a_cmp, a_tree, a_node) \ + /* Re-initialize the path to contain the path to a_node. 
*/\ + rbp_f_depth = 0; \ + if (a_node != NULL) { \ + if ((a_tree)->rbt_root != &(a_tree)->rbt_nil) { \ + rbp_f_path[rbp_f_depth] = (a_tree)->rbt_root; \ + rbp_f_depth++; \ + rbp_f_node = rbp_f_path[0]; \ + while (true) { \ + int rbp_f_cmp = (a_cmp)((a_node), \ + rbp_f_path[rbp_f_depth-1]); \ + if (rbp_f_cmp < 0) { \ + rbp_f_node = rbp_left_get(a_type, a_field, \ + rbp_f_path[rbp_f_depth-1]); \ + } else if (rbp_f_cmp > 0) { \ + rbp_f_node = rbp_right_get(a_type, a_field, \ + rbp_f_path[rbp_f_depth-1]); \ + } else { \ + break; \ + } \ + assert(rbp_f_node != &(a_tree)->rbt_nil); \ + rbp_f_path[rbp_f_depth] = rbp_f_node; \ + rbp_f_depth++; \ + } \ + } \ + } \ + rbp_f_synced = true; + +#define rb_foreach_end(a_type, a_field, a_tree, a_var) \ + if (rbp_f_synced) { \ + rbp_f_synced = false; \ + continue; \ + } \ + /* Find the successor. */\ + if ((rbp_f_node = rbp_right_get(a_type, a_field, \ + rbp_f_path[rbp_f_depth-1])) != &(a_tree)->rbt_nil) { \ + /* The successor is the left-most node in the right */\ + /* subtree. */\ + rbp_f_path[rbp_f_depth] = rbp_f_node; \ + rbp_f_depth++; \ + while ((rbp_f_node = rbp_left_get(a_type, a_field, \ + rbp_f_path[rbp_f_depth-1])) != &(a_tree)->rbt_nil) { \ + rbp_f_path[rbp_f_depth] = rbp_f_node; \ + rbp_f_depth++; \ + } \ + } else { \ + /* The successor is above the current node. Unwind */\ + /* until a left-leaning edge is removed from the */\ + /* path, or the path is empty. */\ + for (rbp_f_depth--; rbp_f_depth > 0; rbp_f_depth--) { \ + if (rbp_left_get(a_type, a_field, rbp_f_path[rbp_f_depth-1]) \ + == rbp_f_path[rbp_f_depth]) { \ + break; \ + } \ + } \ + } \ + } /* close brace C */ \ + } /* close brace B */ \ +} /* close brace A */ + + + +#define rb_foreach_reverse_begin(a_type, a_field, a_tree, a_var) { /* brace A */ \ + /* Compute the maximum possible tree depth (3X the black height). 
*/\ + unsigned rbp_fr_height; \ + rbp_black_height(a_type, a_field, a_tree, rbp_fr_height); \ + rbp_fr_height *= 3; \ + { /* brace B */ \ + /* Initialize the path to contain the right spine. */\ + a_type *rbp_fr_path[rbp_fr_height]; \ + a_type *rbp_fr_node; \ + bool rbp_fr_synced = false; \ + unsigned rbp_fr_depth = 0; \ + if ((a_tree)->rbt_root != &(a_tree)->rbt_nil) { \ + rbp_fr_path[rbp_fr_depth] = (a_tree)->rbt_root; \ + rbp_fr_depth++; \ + while ((rbp_fr_node = rbp_right_get(a_type, a_field, \ + rbp_fr_path[rbp_fr_depth-1])) != &(a_tree)->rbt_nil) { \ + rbp_fr_path[rbp_fr_depth] = rbp_fr_node; \ + rbp_fr_depth++; \ + } \ + } \ + /* While the path is non-empty, iterate. */\ + while (rbp_fr_depth > 0) { /* brace C */ \ + (a_var) = rbp_fr_path[rbp_fr_depth-1]; + + +/* Only use if modifying the tree during iteration. */ +#define rb_foreach_reverse_prev(a_type, a_field, a_cmp, a_tree, a_node) \ + /* Re-initialize the path to contain the path to a_node. */\ + rbp_fr_depth = 0; \ + if (a_node != NULL) { \ + if ((a_tree)->rbt_root != &(a_tree)->rbt_nil) { \ + rbp_fr_path[rbp_fr_depth] = (a_tree)->rbt_root; \ + rbp_fr_depth++; \ + rbp_fr_node = rbp_fr_path[0]; \ + while (true) { \ + int rbp_fr_cmp = (a_cmp)((a_node), rbp_fr_path[rbp_fr_depth-1]); \ + if (rbp_fr_cmp < 0) { \ + rbp_fr_node = rbp_left_get(a_type, a_field, \ + rbp_fr_path[rbp_fr_depth-1]); \ + } else if (rbp_fr_cmp > 0) { \ + rbp_fr_node = rbp_right_get(a_type, a_field, rbp_fr_path[rbp_fr_depth-1]); \ + } else { \ + break; \ + } \ + assert(rbp_fr_node != &(a_tree)->rbt_nil); \ + rbp_fr_path[rbp_fr_depth] = rbp_fr_node; \ + rbp_fr_depth++; \ + } \ + } \ + } \ + rbp_fr_synced = true; + +#define rb_foreach_reverse_end(a_type, a_field, a_tree, a_var) \ + if (rbp_fr_synced) { \ + rbp_fr_synced = false; \ + continue; \ + } \ + if (rbp_fr_depth == 0) { \ + /* rb_foreach_reverse_prev() was called with a NULL */\ + /* a_node. */\ + break; \ + } \ + /* Find the predecessor. 
*/\ + if ((rbp_fr_node = rbp_left_get(a_type, a_field, \ + rbp_fr_path[rbp_fr_depth-1])) != &(a_tree)->rbt_nil) { \ + /* The predecessor is the right-most node in the left */\ + /* subtree. */\ + rbp_fr_path[rbp_fr_depth] = rbp_fr_node; \ + rbp_fr_depth++; \ + while ((rbp_fr_node = rbp_right_get(a_type, a_field, \ + rbp_fr_path[rbp_fr_depth-1])) != &(a_tree)->rbt_nil) {\ + rbp_fr_path[rbp_fr_depth] = rbp_fr_node; \ + rbp_fr_depth++; \ + } \ + } else { \ + /* The predecessor is above the current node. Unwind */\ + /* until a right-leaning edge is removed from the */\ + /* path, or the path is empty. */\ + for (rbp_fr_depth--; rbp_fr_depth > 0; rbp_fr_depth--) {\ + if (rbp_right_get(a_type, a_field, rbp_fr_path[rbp_fr_depth-1]) \ + == rbp_fr_path[rbp_fr_depth]) { \ + break; \ + } \ + } \ + } \ + } /* Close brace C */ \ + } /* close brace B */ \ +} /* close brace A*/ + +#endif /* RB_H_ */ diff --git a/bsd/i386/param.h b/bsd/i386/param.h index 03a38d2ce..0eae0fea5 100644 --- a/bsd/i386/param.h +++ b/bsd/i386/param.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -101,9 +101,15 @@ * clusters (MAPPED_MBUFS), MCLBYTES must also be an integral multiple * of the hardware page size. 
*/ -#define MSIZE 256 /* size of an mbuf */ -#define MCLBYTES 2048 /* large enough for ether MTU */ -#define MCLSHIFT 11 +#define MSIZESHIFT 8 /* 256 */ +#define MSIZE (1 << MSIZESHIFT) /* size of an mbuf */ +#define MCLSHIFT 11 /* 2048 */ +#define MCLBYTES (1 << MCLSHIFT) /* size of an mbuf cluster */ +#define MBIGCLSHIFT 12 /* 4096 */ +#define MBIGCLBYTES (1 << MBIGCLSHIFT) /* size of a big cluster */ +#define M16KCLSHIFT 14 /* 16384 */ +#define M16KCLBYTES (1 << M16KCLSHIFT) /* size of a jumbo cluster */ + #define MCLOFSET (MCLBYTES - 1) #ifndef NMBCLUSTERS #ifdef GATEWAY diff --git a/bsd/kern/Makefile b/bsd/kern/Makefile new file mode 100644 index 000000000..c7eecbb12 --- /dev/null +++ b/bsd/kern/Makefile @@ -0,0 +1,26 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + +include $(MakeInc_cmd) +include $(MakeInc_def) + +INSTALL_SHARE_MISC_LIST = \ + trace.codes + +include $(MakeInc_rule) +include $(MakeInc_dir) + +SHARE_MISC_DIR = usr/share/misc + +INSTALL_SHARE_MISC_FILES = \ + $(addprefix $(DSTROOT)/$(SHARE_MISC_DIR)/, $(INSTALL_SHARE_MISC_LIST)) + +$(INSTALL_SHARE_MISC_FILES): $(DSTROOT)/$(SHARE_MISC_DIR)/% : % + @echo Installing $< in $(dir $@) + $(_v) $(MKDIR) $(DSTROOT)/$(SHARE_MISC_DIR); \ + $(RM) $(RMFLAGS) $@; \ + $(INSTALL) $(INSTALL_FLAGS) $< $(dir $@); + +do_build_install: $(INSTALL_SHARE_MISC_FILES) diff --git a/bsd/kern/bsd_init.c b/bsd/kern/bsd_init.c index e2868a40c..dac2c94d8 100644 --- a/bsd/kern/bsd_init.c +++ b/bsd/kern/bsd_init.c @@ -106,6 +106,7 @@ #include #include #include +#include #include @@ -132,6 +133,7 @@ #include /* for mcache_init() */ #include /* for mbinit() */ #include /* for knote_init() */ +#include /* for kern_memorystatus_init() */ #include /* for aio_init() */ #include /* for psem_cache_init() */ #include /* for dlil_init() */ @@ -151,6 +153,8 @@ #include /* for 
tty_init() */ #include /* for utun_register_control() */ #include /* for net_str_id_init() */ +#include /* for netsrc_init() */ +#include /* for assert() */ #include @@ -162,6 +166,10 @@ #include +#if NFSCLIENT +#include +#endif + #if CONFIG_IMAGEBOOT #include #endif @@ -171,6 +179,7 @@ #endif #include +#include void * get_user_regs(thread_t); /* XXX kludge for */ void IOKitInitializeTime(void); /* XXX */ @@ -216,9 +225,8 @@ char domainname[MAXDOMNAMELEN]; int domainnamelen; #if defined(__i386__) || defined(__x86_64__) struct exec_archhandler exec_archhandler_ppc = { - .path = "/usr/libexec/oah/translate", + .path = "/usr/libexec/oah/RosettaNonGrata", }; -const char * const kRosettaStandIn_str = "/usr/libexec/oah/RosettaNonGrata"; #else /* __i386__ */ struct exec_archhandler exec_archhandler_ppc; #endif /* __i386__ */ @@ -243,16 +251,16 @@ extern void file_lock_init(void); extern void kmeminit(void); extern void bsd_bufferinit(void); -extern int srv; +extern int serverperfmode; extern int ncl; vm_map_t bsd_pageable_map; vm_map_t mb_map; -static int bsd_simul_execs = BSD_SIMUL_EXECS; -static int bsd_pageable_map_size = BSD_PAGABLE_MAP_SIZE; -__private_extern__ int execargs_cache_size = BSD_SIMUL_EXECS; -__private_extern__ int execargs_free_count = BSD_SIMUL_EXECS; +static int bsd_simul_execs; +static int bsd_pageable_map_size; +__private_extern__ int execargs_cache_size = 0; +__private_extern__ int execargs_free_count = 0; __private_extern__ vm_offset_t * execargs_cache = NULL; void bsd_exec_setup(int); @@ -262,6 +270,14 @@ void bsd_exec_setup(int); * Intel only. */ __private_extern__ int bootarg_no64exec = 0; +__private_extern__ int bootarg_vnode_cache_defeat = 0; + +/* + * Prevent kernel-based ASLR from being used, for testing. 
+ */ +#if DEVELOPMENT || DEBUG +__private_extern__ int bootarg_disable_aslr = 0; +#endif int cmask = CMASK; extern int customnbuf; @@ -274,6 +290,7 @@ static void parse_bsd_args(void); extern task_t bsd_init_task; extern char init_task_failure_data[]; extern void time_zone_slock_init(void); +extern void select_wait_queue_init(void); static void process_name(const char *, proc_t); static void setconf(void); @@ -289,17 +306,21 @@ extern void sysv_sem_lock_init(void); #if SYSV_MSG extern void sysv_msg_lock_init(void); #endif -extern void pthread_init(void); +#if !defined(SECURE_KERNEL) /* kmem access not enabled by default; can be changed with boot-args */ +/* We don't need to keep this symbol around in RELEASE kernel */ int setup_kmem = 0; +#endif -/* size of kernel trace buffer, disabled by default */ -unsigned int new_nkdbufs = 0; +#if CONFIG_MACF +#if defined (__i386__) || defined (__x86_64__) +/* MACF policy_check configuration flags; see policy_check.c for details */ +int policy_check_flags = 0; -/* mach leak logging */ -int log_leaks = 0; -int turn_on_log_leaks = 0; +extern int check_policy_init(int); +#endif +#endif /* CONFIG_MACF */ extern void stackshot_lock_init(void); @@ -343,8 +364,6 @@ struct rlimit vm_initial_limit_core = { DFLCSIZ, MAXCSIZ }; extern thread_t cloneproc(task_t, proc_t, int); extern int (*mountroot)(void); -extern int netboot_mountroot(void); /* netboot.c */ -extern int netboot_setup(void); lck_grp_t * proc_lck_grp; lck_grp_t * proc_slock_grp; @@ -386,6 +405,10 @@ bsd_init(void) struct vfs_context context; kern_return_t ret; struct ucred temp_cred; + struct posix_cred temp_pcred; +#if NFSCLIENT || CONFIG_IMAGEBOOT + boolean_t netboot = FALSE; +#endif #define bsd_init_kprintf(x...) 
/* kprintf("bsd_init: " x) */ @@ -427,7 +450,7 @@ bsd_init(void) proc_lck_grp_attr= lck_grp_attr_alloc_init(); proc_lck_grp = lck_grp_alloc_init("proc", proc_lck_grp_attr); -#ifndef CONFIG_EMBEDDED +#if CONFIG_FINE_LOCK_GROUPS proc_slock_grp = lck_grp_alloc_init("proc-slock", proc_lck_grp_attr); proc_fdmlock_grp = lck_grp_alloc_init("proc-fdmlock", proc_lck_grp_attr); proc_mlock_grp = lck_grp_alloc_init("proc-mlock", proc_lck_grp_attr); @@ -440,20 +463,21 @@ bsd_init(void) #endif #endif -#ifdef CONFIG_EMBEDDED - proc_list_mlock = lck_mtx_alloc_init(proc_lck_grp, proc_lck_attr); - proc_klist_mlock = lck_mtx_alloc_init(proc_lck_grp, proc_lck_attr); - lck_mtx_init(&kernproc->p_mlock, proc_lck_grp, proc_lck_attr); - lck_mtx_init(&kernproc->p_fdmlock, proc_lck_grp, proc_lck_attr); - lck_spin_init(&kernproc->p_slock, proc_lck_grp, proc_lck_attr); -#else +#if CONFIG_FINE_LOCK_GROUPS proc_list_mlock = lck_mtx_alloc_init(proc_mlock_grp, proc_lck_attr); proc_klist_mlock = lck_mtx_alloc_init(proc_mlock_grp, proc_lck_attr); lck_mtx_init(&kernproc->p_mlock, proc_mlock_grp, proc_lck_attr); lck_mtx_init(&kernproc->p_fdmlock, proc_fdmlock_grp, proc_lck_attr); lck_spin_init(&kernproc->p_slock, proc_slock_grp, proc_lck_attr); +#else + proc_list_mlock = lck_mtx_alloc_init(proc_lck_grp, proc_lck_attr); + proc_klist_mlock = lck_mtx_alloc_init(proc_lck_grp, proc_lck_attr); + lck_mtx_init(&kernproc->p_mlock, proc_lck_grp, proc_lck_attr); + lck_mtx_init(&kernproc->p_fdmlock, proc_lck_grp, proc_lck_attr); + lck_spin_init(&kernproc->p_slock, proc_lck_grp, proc_lck_attr); #endif + assert(bsd_simul_execs != 0); execargs_cache_lock = lck_mtx_alloc_init(proc_lck_grp, proc_lck_attr); execargs_cache_size = bsd_simul_execs; execargs_free_count = bsd_simul_execs; @@ -473,6 +497,14 @@ bsd_init(void) */ mac_policy_initbsd(); kernproc->p_mac_enforce = 0; + +#if defined (__i386__) || defined (__x86_64__) + /* + * We currently only support this on i386/x86_64, as that is the + * only lock code we have 
instrumented so far. + */ + check_policy_init(policy_check_flags); +#endif #endif /* MAC */ /* @@ -483,15 +515,16 @@ bsd_init(void) kernproc->p_pgrp = &pgrp0; LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash); LIST_INIT(&pgrp0.pg_members); -#ifdef CONFIG_EMBEDDED - lck_mtx_init(&pgrp0.pg_mlock, proc_lck_grp, proc_lck_attr); -#else +#ifdef CONFIG_FINE_LOCK_GROUPS lck_mtx_init(&pgrp0.pg_mlock, proc_mlock_grp, proc_lck_attr); +#else + lck_mtx_init(&pgrp0.pg_mlock, proc_lck_grp, proc_lck_attr); #endif /* There is no other bsd thread this point and is safe without pgrp lock */ LIST_INSERT_HEAD(&pgrp0.pg_members, kernproc, p_pglist); kernproc->p_listflag |= P_LIST_INPGRP; kernproc->p_pgrpid = 0; + kernproc->p_uniqueid = 0; pgrp0.pg_session = &session0; pgrp0.pg_membercnt = 1; @@ -499,10 +532,10 @@ bsd_init(void) session0.s_count = 1; session0.s_leader = kernproc; session0.s_listflags = 0; -#ifdef CONFIG_EMBEDDED - lck_mtx_init(&session0.s_mlock, proc_lck_grp, proc_lck_attr); -#else +#ifdef CONFIG_FINE_LOCK_GROUPS lck_mtx_init(&session0.s_mlock, proc_mlock_grp, proc_lck_attr); +#else + lck_mtx_init(&session0.s_mlock, proc_lck_grp, proc_lck_attr); #endif LIST_INSERT_HEAD(SESSHASH(0), &session0, s_hash); proc_list_unlock(); @@ -515,6 +548,14 @@ bsd_init(void) kernproc->p_stat = SRUN; kernproc->p_flag = P_SYSTEM; + kernproc->p_lflag = 0; + kernproc->p_ladvflag = 0; + +#if DEVELOPMENT || DEBUG + if (bootarg_disable_aslr) + kernproc->p_flag |= P_DISABLE_ASLR; +#endif + kernproc->p_nice = NZERO; kernproc->p_pptr = kernproc; @@ -531,15 +572,22 @@ bsd_init(void) */ bsd_init_kprintf("calling bzero\n"); bzero(&temp_cred, sizeof(temp_cred)); - temp_cred.cr_ngroups = 1; + bzero(&temp_pcred, sizeof(temp_pcred)); + temp_pcred.cr_ngroups = 1; - temp_cred.cr_audit.as_aia_p = &audit_default_aia; - /* XXX the following will go away with cr_au */ - temp_cred.cr_au.ai_auid = AU_DEFAUDITID; + temp_cred.cr_audit.as_aia_p = audit_default_aia_p; bsd_init_kprintf("calling kauth_cred_create\n"); + /* 
+ * We have to label the temp cred before we create from it to + * properly set cr_ngroups, or the create will fail. + */ + posix_cred_label(&temp_cred, &temp_pcred); kernproc->p_ucred = kauth_cred_create(&temp_cred); + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(kernproc); + /* give the (already exisiting) initial thread a reference on it */ bsd_init_kprintf("calling kauth_cred_ref\n"); kauth_cred_ref(kernproc->p_ucred); @@ -598,6 +646,7 @@ bsd_init(void) vm_offset_t minimum; bsd_init_kprintf("calling kmem_suballoc\n"); + assert(bsd_pageable_map_size != 0); ret = kmem_suballoc(kernel_map, &minimum, (vm_size_t)bsd_pageable_map_size, @@ -630,15 +679,15 @@ bsd_init(void) bsd_init_kprintf("calling IOKitInitializeTime\n"); IOKitInitializeTime(); - if (turn_on_log_leaks && !new_nkdbufs) - new_nkdbufs = 200000; - start_kern_tracing(new_nkdbufs); - if (turn_on_log_leaks) - log_leaks = 1; - bsd_init_kprintf("calling ubc_init\n"); ubc_init(); + /* + * Initialize device-switches. + */ + bsd_init_kprintf("calling devsw_init() \n"); + devsw_init(); + /* Initialize the file systems. 
*/ bsd_init_kprintf("calling vfsinit\n"); vfsinit(); @@ -702,6 +751,8 @@ bsd_init(void) psem_cache_init(); bsd_init_kprintf("calling time_zone_slock_init\n"); time_zone_slock_init(); + bsd_init_kprintf("calling select_wait_queue_init\n"); + select_wait_queue_init(); /* Stack snapshot facility lock */ stackshot_lock_init(); @@ -729,6 +780,12 @@ bsd_init(void) kernproc->p_fd->fd_cdir = NULL; kernproc->p_fd->fd_rdir = NULL; +#if CONFIG_FREEZE + /* Initialise background hibernation */ + bsd_init_kprintf("calling kern_hibernation_init\n"); + kern_hibernation_init(); +#endif + #if CONFIG_EMBEDDED /* Initialize kernel memory status notifications */ bsd_init_kprintf("calling kern_memorystatus_init\n"); @@ -780,6 +837,10 @@ bsd_init(void) /* register user tunnel kernel control handler */ utun_register_control(); + netsrc_init(); + + /* wait for network domain to finish */ + domainfin(); #endif /* NETWORKING */ bsd_init_kprintf("calling vnode_pager_bootstrap\n"); @@ -794,61 +855,22 @@ bsd_init(void) bsd_init_kprintf("calling inittodr\n"); inittodr(0); -#if CONFIG_EMBEDDED - { - /* print out early VM statistics */ - kern_return_t kr1; - vm_statistics_data_t stat; - mach_msg_type_number_t count; - - count = HOST_VM_INFO_COUNT; - kr1 = host_statistics(host_self(), - HOST_VM_INFO, - (host_info_t)&stat, - &count); - kprintf("Mach Virtual Memory Statistics (page size of 4096) bytes\n" - "Pages free:\t\t\t%u.\n" - "Pages active:\t\t\t%u.\n" - "Pages inactive:\t\t\t%u.\n" - "Pages wired down:\t\t%u.\n" - "\"Translation faults\":\t\t%u.\n" - "Pages copy-on-write:\t\t%u.\n" - "Pages zero filled:\t\t%u.\n" - "Pages reactivated:\t\t%u.\n" - "Pageins:\t\t\t%u.\n" - "Pageouts:\t\t\t%u.\n" - "Object cache: %u hits of %u lookups (%d%% hit rate)\n", - - stat.free_count, - stat.active_count, - stat.inactive_count, - stat.wire_count, - stat.faults, - stat.cow_faults, - stat.zero_fill_count, - stat.reactivations, - stat.pageins, - stat.pageouts, - stat.hits, - stat.lookups, - (stat.hits == 0) ? 
100 : - ((stat.lookups * 100) / stat.hits)); - } -#endif /* CONFIG_EMBEDDED */ - /* Mount the root file system. */ while( TRUE) { int err; bsd_init_kprintf("calling setconf\n"); setconf(); +#if NFSCLIENT + netboot = (mountroot == netboot_mountroot); +#endif bsd_init_kprintf("vfs_mountroot\n"); if (0 == (err = vfs_mountroot())) break; rootdevice[0] = '\0'; #if NFSCLIENT - if (mountroot == netboot_mountroot) { + if (netboot) { PE_display_icon( 0, "noroot"); /* XXX a netboot-specific icon would be nicer */ vc_progress_set(FALSE, 0); for (i=1; 1; i*=2) { @@ -880,8 +902,10 @@ bsd_init(void) filedesc0.fd_cdir = rootvnode; #if NFSCLIENT - if (mountroot == netboot_mountroot) { + if (netboot) { int err; + + netboot = TRUE; /* post mount setup */ if ((err = netboot_setup()) != 0) { PE_display_icon( 0, "noroot"); /* XXX a netboot-specific icon would be nicer */ @@ -903,19 +927,12 @@ bsd_init(void) * See if a system disk image is present. If so, mount it and * switch the root vnode to point to it */ - - if(imageboot_needed()) { - int err; - - /* An image was found */ - if((err = imageboot_setup())) { - /* - * this is not fatal. Keep trying to root - * off the original media - */ - printf("%s: imageboot could not find root, %d\n", - __FUNCTION__, err); - } + if (netboot == FALSE && imageboot_needed()) { + /* + * An image was found. No turning back: we're booted + * with a kernel from the disk image. 
+ */ + imageboot_setup(); } #endif /* CONFIG_IMAGEBOOT */ @@ -943,15 +960,12 @@ bsd_init(void) kernproc->p_flag |= P_LP64; printf("Kernel is LP64\n"); #endif + + pal_kernel_announce(); + #if __i386__ || __x86_64__ /* this should be done after the root filesystem is mounted */ error = set_archhandler(kernproc, CPU_TYPE_POWERPC); - // 10/30/08 - gab: - // if default 'translate' can't be found, see if the understudy is available - if (ENOENT == error) { - strlcpy(exec_archhandler_ppc.path, kRosettaStandIn_str, MAXPATHLEN); - error = set_archhandler(kernproc, CPU_TYPE_POWERPC); - } if (error) /* XXX make more generic */ exec_archhandler_ppc.path[0] = 0; #endif @@ -1117,13 +1131,19 @@ parse_bsd_args(void) if (PE_parse_boot_argn("-x", namep, sizeof (namep))) /* safe boot */ boothowto |= RB_SAFEBOOT; - if (PE_parse_boot_argn("-l", namep, sizeof (namep))) /* leaks logging */ - turn_on_log_leaks = 1; - /* disable 64 bit grading */ if (PE_parse_boot_argn("-no64exec", namep, sizeof (namep))) bootarg_no64exec = 1; + /* disable vnode_cache_is_authorized() by setting vnode_cache_defeat */ + if (PE_parse_boot_argn("-vnode_cache_defeat", namep, sizeof (namep))) + bootarg_vnode_cache_defeat = 1; + +#if DEVELOPMENT || DEBUG + if (PE_parse_boot_argn("-disable_aslr", namep, sizeof (namep))) + bootarg_disable_aslr = 1; +#endif + PE_parse_boot_argn("ncl", &ncl, sizeof (ncl)); if (PE_parse_boot_argn("nbuf", &max_nbuf_headers, sizeof (max_nbuf_headers))) { @@ -1132,11 +1152,20 @@ parse_bsd_args(void) #if !defined(SECURE_KERNEL) PE_parse_boot_argn("kmem", &setup_kmem, sizeof (setup_kmem)); #endif - PE_parse_boot_argn("trace", &new_nkdbufs, sizeof (new_nkdbufs)); + +#if CONFIG_MACF +#if defined (__i386__) || defined (__x86_64__) + PE_parse_boot_argn("policy_check", &policy_check_flags, sizeof (policy_check_flags)); +#endif +#endif /* CONFIG_MACF */ if (PE_parse_boot_argn("msgbuf", &msgbuf, sizeof (msgbuf))) { log_setsize(msgbuf); } + + if (PE_parse_boot_argn("-novfscache", namep, 
sizeof(namep))) { + nc_disabled = 1; + } } void @@ -1165,10 +1194,13 @@ bsd_exec_setup(int scale) break; } - bsd_pageable_map_size = (bsd_simul_execs * (NCARGS + PAGE_SIZE)); + bsd_pageable_map_size = (bsd_simul_execs * BSD_PAGEABLE_SIZE_PER_EXEC); } #if !NFSCLIENT +int +netboot_root(void); + int netboot_root(void) { diff --git a/bsd/kern/bsd_stubs.c b/bsd/kern/bsd_stubs.c index 64127d32a..19da61270 100644 --- a/bsd/kern/bsd_stubs.c +++ b/bsd/kern/bsd_stubs.c @@ -31,8 +31,10 @@ #include #include #include +#include #include #include +#include #include #include #include /* for SET */ @@ -49,6 +51,9 @@ extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int); void pcb_synch(void); void tbeproc(void *); +TAILQ_HEAD(,devsw_lock) devsw_locks; +lck_mtx_t devsw_lock_list_mtx; +lck_grp_t *devsw_lock_grp; /* Just to satisfy pstat command */ int dmmin, dmmax, dmtext; @@ -280,6 +285,7 @@ cdevsw_remove(int index, struct cdevsw * csw) return(-1); } cdevsw[index] = nocdev; + cdevsw_flags[index] = 0; return(index); } @@ -303,6 +309,28 @@ cdevsw_add_with_bdev(int index, struct cdevsw * csw, int bdev) return (index); } +int +cdevsw_setkqueueok(int index, struct cdevsw *csw, int use_offset) +{ + struct cdevsw *devsw; + uint64_t flags = CDEVSW_SELECT_KQUEUE; + + devsw = &cdevsw[index]; + if ((index < 0) || (index >= nchrdev) || + (memcmp((char *)devsw, + (char *)csw, + sizeof(struct cdevsw)) != 0)) { + return(-1); + } + + if (use_offset) { + flags |= CDEVSW_USE_OFFSET; + } + + cdevsw_flags[index] = flags; + return 0; +} + #include /* for PE_parse_boot_arg */ void @@ -336,3 +364,71 @@ bsd_hostname(char *buf, int bufsize, int *len) } } +void +devsw_lock(dev_t dev, int mode) +{ + devsw_lock_t newlock, tmplock; + int res; + + assert(0 <= major(dev) && major(dev) < nchrdev); + assert(mode == S_IFCHR || mode == S_IFBLK); + + MALLOC(newlock, devsw_lock_t, sizeof(struct devsw_lock), M_TEMP, M_WAITOK | M_ZERO); + newlock->dl_dev = dev; + newlock->dl_thread = current_thread(); + newlock->dl_mode = 
mode; + + lck_mtx_lock_spin(&devsw_lock_list_mtx); +retry: + TAILQ_FOREACH(tmplock, &devsw_locks, dl_list) { + if (tmplock->dl_dev == dev && tmplock->dl_mode == mode) { + res = msleep(tmplock, &devsw_lock_list_mtx, PVFS, "devsw_lock", NULL); + assert(res == 0); + goto retry; + } + } + + TAILQ_INSERT_TAIL(&devsw_locks, newlock, dl_list); + lck_mtx_unlock(&devsw_lock_list_mtx); + +} +void +devsw_unlock(dev_t dev, int mode) +{ + devsw_lock_t tmplock; + + assert(0 <= major(dev) && major(dev) < nchrdev); + + lck_mtx_lock_spin(&devsw_lock_list_mtx); + + TAILQ_FOREACH(tmplock, &devsw_locks, dl_list) { + if (tmplock->dl_dev == dev && tmplock->dl_mode == mode) { + break; + } + } + + if (tmplock == NULL) { + panic("Trying to unlock, and couldn't find lock."); + } + + if (tmplock->dl_thread != current_thread()) { + panic("Trying to unlock, but I don't hold the lock."); + } + + wakeup(tmplock); + TAILQ_REMOVE(&devsw_locks, tmplock, dl_list); + + lck_mtx_unlock(&devsw_lock_list_mtx); + + FREE(tmplock, M_TEMP); +} + +void +devsw_init() +{ + devsw_lock_grp = lck_grp_alloc_init("devsw", NULL); + assert(devsw_lock_grp != NULL); + + lck_mtx_init(&devsw_lock_list_mtx, devsw_lock_grp, NULL); + TAILQ_INIT(&devsw_locks); +} diff --git a/bsd/kern/decmpfs.c b/bsd/kern/decmpfs.c index d0483c0e4..33e3b3040 100644 --- a/bsd/kern/decmpfs.c +++ b/bsd/kern/decmpfs.c @@ -204,11 +204,12 @@ _decmp_get_func(uint32_t type, int offset) if (IOCatalogueMatchingDriversPresent(providesName)) { // there is a kext that says it will register for this type, so let's wait for it char resourceName[80]; + uint64_t delay = 10000000ULL; // 10 milliseconds. 
snprintf(resourceName, sizeof(resourceName), "com.apple.AppleFSCompression.Type%u", type); printf("waiting for %s\n", resourceName); while(decompressors[type] == NULL) { lck_rw_done(decompressorsLock); // we have to unlock to allow the kext to register - if (IOServiceWaitForMatchingResource(resourceName, 60)) { + if (IOServiceWaitForMatchingResource(resourceName, delay)) { break; } if (!IOCatalogueMatchingDriversPresent(providesName)) { @@ -217,6 +218,7 @@ _decmp_get_func(uint32_t type, int offset) break; } printf("still waiting for %s\n", resourceName); + delay *= 2; lck_rw_lock_shared(decompressorsLock); } // IOKit says the kext is loaded, so it should be registered too! @@ -659,11 +661,11 @@ decmpfs_file_is_compressed(vnode_t vp, decmpfs_cnode *cp) return 0; } - if (!vnode_isreg(vp)) { - /* only regular files can be compressed */ - ret = FILE_IS_NOT_COMPRESSED; - goto done; - } +// if (!vnode_isreg(vp)) { +// /* only regular files can be compressed */ +// ret = FILE_IS_NOT_COMPRESSED; +// goto done; +// } mp = vnode_mount(vp); if (mp == NULL) { @@ -1137,7 +1139,7 @@ decmpfs_pagein_compressed(struct vnop_pagein_args *ap, int *is_compressed, decmp else { if (!abort_pagein) { /* commit our pages */ - kr = commit_upl(pl, pl_offset, total_size, UPL_COMMIT_FREE_ON_EMPTY | UPL_COMMIT_INACTIVATE, 0); + kr = commit_upl(pl, pl_offset, total_size, UPL_COMMIT_FREE_ON_EMPTY, 0); } } diff --git a/bsd/kern/imageboot.c b/bsd/kern/imageboot.c index 0ed79dc69..8bc4ede36 100644 --- a/bsd/kern/imageboot.c +++ b/bsd/kern/imageboot.c @@ -35,6 +35,7 @@ #include #include #include +#include #include @@ -52,33 +53,68 @@ extern char rootdevice[]; #endif extern int di_root_image(const char *path, char devname[], dev_t *dev_p); +static boolean_t imageboot_setup_new(void); #define kIBFilePrefix "file://" -int +__private_extern__ int +imageboot_format_is_valid(const char *root_path) +{ + return (strncmp(root_path, kIBFilePrefix, + strlen(kIBFilePrefix)) == 0); +} + +static void 
+vnode_get_and_drop_always(vnode_t vp) +{ + vnode_getalways(vp); + vnode_rele(vp); + vnode_put(vp); +} + +__private_extern__ int imageboot_needed(void) { int result = 0; char *root_path = NULL; - + DBG_TRACE("%s: checking for presence of root path\n", __FUNCTION__); MALLOC_ZONE(root_path, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); if (root_path == NULL) panic("%s: M_NAMEI zone exhausted", __FUNCTION__); - if(PE_parse_boot_argn("rp", root_path, MAXPATHLEN) == TRUE) { - /* Got it, now verify scheme */ + /* Check for first layer */ + if (!(PE_parse_boot_argn("rp0", root_path, MAXPATHLEN) || + PE_parse_boot_argn("rp", root_path, MAXPATHLEN) || + PE_parse_boot_argn(IMAGEBOOT_ROOT_ARG, root_path, MAXPATHLEN))) { + goto out; + } + + /* Sanity-check first layer */ + if (imageboot_format_is_valid(root_path)) { + DBG_TRACE("%s: Found %s\n", __FUNCTION__, root_path); + } else { + goto out; + } - if (strncmp(root_path, kIBFilePrefix, - strlen(kIBFilePrefix)) == 0) { - DBG_TRACE("%s: Found %s\n", __FUNCTION__, root_path); - result = 1; - } else { - DBG_TRACE("%s: Invalid URL scheme for %s\n", - __FUNCTION__, root_path); - } + result = 1; + + /* Check for second layer */ + if (!(PE_parse_boot_argn("rp1", root_path, MAXPATHLEN) || + PE_parse_boot_argn(IMAGEBOOT_CONTAINER_ARG, root_path, MAXPATHLEN))) { + goto out; + } + + /* Sanity-check second layer */ + if (imageboot_format_is_valid(root_path)) { + DBG_TRACE("%s: Found %s\n", __FUNCTION__, root_path); + } else { + panic("%s: Invalid URL scheme for %s\n", + __FUNCTION__, root_path); } + +out: FREE_ZONE(root_path, MAXPATHLEN, M_NAMEI); return (result); @@ -86,97 +122,193 @@ imageboot_needed(void) /* - * We know there's an image. Attach it, and - * switch over to root off it - * - * NB: p is always kernproc + * Swaps in new root filesystem based on image path. + * Current root filesystem is removed from mount list and + * tagged MNTK_BACKS_ROOT, MNT_ROOTFS is cleared on it, and + * "rootvnode" is reset. 
Root vnode of currentroot filesystem + * is returned with usecount (no iocount). */ - -int -imageboot_setup() +__private_extern__ int +imageboot_mount_image(const char *root_path, int height) { - dev_t dev; - int error = 0; - char *root_path = NULL; + dev_t dev; + int error; + vnode_t old_rootvnode = NULL; + vnode_t newdp; + mount_t new_rootfs; - DBG_TRACE("%s: entry\n", __FUNCTION__); - - MALLOC_ZONE(root_path, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); - if (root_path == NULL) - return (ENOMEM); - - if(PE_parse_boot_argn("rp", root_path, MAXPATHLEN) == FALSE) { - error = ENOENT; - goto done; - } - - printf("%s: root image url is %s\n", __FUNCTION__, root_path); error = di_root_image(root_path, rootdevice, &dev); - if(error) { - printf("%s: di_root_image failed: %d\n", __FUNCTION__, error); - goto done; + if (error) { + panic("%s: di_root_image failed: %d\n", __FUNCTION__, error); } rootdev = dev; mountroot = NULL; printf("%s: root device 0x%x\n", __FUNCTION__, rootdev); error = vfs_mountroot(); + if (error != 0) { + panic("vfs_mountroot() failed.\n"); + } - if (error == 0 && rootvnode != NULL) { - vnode_t newdp, old_rootvnode; - mount_t new_rootfs, old_rootfs; + /* + * Get the vnode for '/'. + * Set fdp->fd_fd.fd_cdir to reference it. + */ + if (VFS_ROOT(TAILQ_LAST(&mountlist,mntlist), &newdp, vfs_context_kernel())) + panic("%s: cannot find root vnode", __FUNCTION__); - /* - * Get the vnode for '/'. - * Set fdp->fd_fd.fd_cdir to reference it. 
- */ - if (VFS_ROOT(TAILQ_LAST(&mountlist,mntlist), &newdp, vfs_context_kernel())) - panic("%s: cannot find root vnode", __FUNCTION__); + if (rootvnode != NULL) { + /* remember the old rootvnode, but remove it from mountlist */ + mount_t old_rootfs; old_rootvnode = rootvnode; old_rootfs = rootvnode->v_mount; - + mount_list_remove(old_rootfs); - + mount_lock(old_rootfs); #ifdef CONFIG_IMGSRC_ACCESS old_rootfs->mnt_kern_flag |= MNTK_BACKS_ROOT; #endif /* CONFIG_IMGSRC_ACCESS */ old_rootfs->mnt_flag &= ~MNT_ROOTFS; mount_unlock(old_rootfs); + } - rootvnode = newdp; + /* switch to the new rootvnode */ + rootvnode = newdp; - new_rootfs = rootvnode->v_mount; - mount_lock(new_rootfs); - new_rootfs->mnt_flag |= MNT_ROOTFS; - mount_unlock(new_rootfs); + new_rootfs = rootvnode->v_mount; + mount_lock(new_rootfs); + new_rootfs->mnt_flag |= MNT_ROOTFS; + mount_unlock(new_rootfs); - vnode_ref(newdp); - vnode_put(newdp); - filedesc0.fd_cdir = newdp; - DBG_TRACE("%s: root switched\n", __FUNCTION__); + vnode_ref(newdp); + vnode_put(newdp); + filedesc0.fd_cdir = newdp; + DBG_TRACE("%s: root switched\n", __FUNCTION__); + if (old_rootvnode != NULL) { #ifdef CONFIG_IMGSRC_ACCESS - if (PE_imgsrc_mount_supported()) { - imgsrc_rootvnode = old_rootvnode; - } else { - vnode_getalways(old_rootvnode); - vnode_rele(old_rootvnode); - vnode_put(old_rootvnode); - } + if (height >= 0 && PE_imgsrc_mount_supported()) { + imgsrc_rootvnodes[height] = old_rootvnode; + } else { + vnode_get_and_drop_always(old_rootvnode); + } #else - vnode_getalways(old_rootvnode); - vnode_rele(old_rootvnode); - vnode_put(old_rootvnode); + vnode_get_and_drop_always(old_rootvnode); #endif /* CONFIG_IMGSRC_ACCESS */ + } + return 0; +} +static boolean_t +imageboot_setup_new() +{ + int error; + char *root_path = NULL; + int height = 0; + boolean_t done = FALSE; + + MALLOC_ZONE(root_path, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); + assert(root_path != NULL); + + if(PE_parse_boot_argn(IMAGEBOOT_CONTAINER_ARG, root_path, 
MAXPATHLEN) == TRUE) { + printf("%s: container image url is %s\n", __FUNCTION__, root_path); + error = imageboot_mount_image(root_path, height); + if (error != 0) { + panic("Failed to mount container image."); + } + + height++; + } + + if (PE_parse_boot_argn(IMAGEBOOT_ROOT_ARG, root_path, MAXPATHLEN) == FALSE) { + if (height > 0) { + panic("%s specified without %s?\n", IMAGEBOOT_CONTAINER_ARG, IMAGEBOOT_ROOT_ARG); + } + goto out; } + + printf("%s: root image url is %s\n", __FUNCTION__, root_path); + + error = imageboot_mount_image(root_path, height); + if (error != 0) { + panic("Failed to mount root image."); + } + + done = TRUE; + +out: + FREE_ZONE(root_path, MAXPATHLEN, M_NAMEI); + return done; +} + +__private_extern__ void +imageboot_setup() +{ + int error = 0; + char *root_path = NULL; + + DBG_TRACE("%s: entry\n", __FUNCTION__); + + if (rootvnode == NULL) { + panic("imageboot_setup: rootvnode is NULL."); + } + + /* + * New boot-arg scheme: + * root-dmg : the dmg that will be the root filesystem. + * container-dmg : an optional dmg that contains the root-dmg. + */ + if (imageboot_setup_new()) { + return; + } + + MALLOC_ZONE(root_path, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); + assert(root_path != NULL); + + /* + * Look for outermost disk image to root from. If we're doing a nested boot, + * there's some sense in which the outer image never needs to be the root filesystem, + * but it does need very similar treatment: it must not be unmounted, needs a fake + * device vnode created for it, and should not show up in getfsstat() until exposed + * with MNT_IMGSRC. We just make it the temporary root. 
+ */ + if((PE_parse_boot_argn("rp", root_path, MAXPATHLEN) == FALSE) && + (PE_parse_boot_argn("rp0", root_path, MAXPATHLEN) == FALSE)) { + panic("%s: no valid path to image.\n", __FUNCTION__); + } + + printf("%s: root image url is %s\n", __FUNCTION__, root_path); + + error = imageboot_mount_image(root_path, 0); + if (error) { + panic("Failed on first stage of imageboot."); + } + + /* + * See if we are rooting from a nested image + */ + if(PE_parse_boot_argn("rp1", root_path, MAXPATHLEN) == FALSE) { + goto done; + } + + printf("%s: second level root image url is %s\n", __FUNCTION__, root_path); + + /* + * If we fail to set up second image, it's not a given that we + * can safely root off the first. + */ + error = imageboot_mount_image(root_path, 1); + if (error) { + panic("Failed on second stage of imageboot."); + } + done: FREE_ZONE(root_path, MAXPATHLEN, M_NAMEI); DBG_TRACE("%s: exit\n", __FUNCTION__); - return (error); + return; } diff --git a/bsd/kern/kdebug.c b/bsd/kern/kdebug.c index 3eb9043dd..f7c7fa73a 100644 --- a/bsd/kern/kdebug.c +++ b/bsd/kern/kdebug.c @@ -20,6 +20,7 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ + #include #include @@ -30,6 +31,7 @@ #include #include #include +#include #define HZ 100 #include @@ -38,11 +40,18 @@ #include #if defined(__i386__) || defined(__x86_64__) -#include +#include +#include +#include #endif + +#include + #include #include #include +#include +#include #include #include #include @@ -54,10 +63,14 @@ #include #include #include +#include +#include #include /* for host_info() */ #include +#include + /* XXX should have prototypes, but Mach does not provide one */ void task_act_iterate_wth_args(task_t, void(*)(thread_t, void *), void *); int cpu_number(void); /* XXX include path broken */ @@ -74,18 +87,14 @@ int kdbg_setrtcdec(kd_regtype *); int kdbg_setpidex(kd_regtype *); int kdbg_setpid(kd_regtype *); void kdbg_mapinit(void); -int kdbg_reinit(void); -int kdbg_bootstrap(void); +int kdbg_reinit(boolean_t); +int 
kdbg_bootstrap(boolean_t); -static int create_buffers(void); +static int create_buffers(boolean_t); static void delete_buffers(void); extern void IOSleep(int); -#ifdef ppc -extern uint32_t maxDec; -#endif - /* trace enable status */ unsigned int kdebug_enable = 0; @@ -96,23 +105,38 @@ unsigned int kd_entropy_count = 0; unsigned int kd_entropy_indx = 0; vm_offset_t kd_entropy_buftomem = 0; +#define MAX_ENTROPY_COUNT (128 * 1024) + #define SLOW_NOLOG 0x01 #define SLOW_CHECKS 0x02 #define SLOW_ENTROPY 0x04 - -unsigned int kdebug_slowcheck = SLOW_NOLOG; +#define SLOW_CHUD 0x08 unsigned int kd_cpus; #define EVENTS_PER_STORAGE_UNIT 2048 #define MIN_STORAGE_UNITS_PER_CPU 4 +#define POINTER_FROM_KDS_PTR(x) (&kd_bufs[x.buffer_index].kdsb_addr[x.offset]) + +#define NATIVE_TRACE_FACILITY + +union kds_ptr { + struct { + uint32_t buffer_index:21; + uint16_t offset:11; + }; + uint32_t raw; +}; + struct kd_storage { - struct kd_storage *kds_next; - kd_buf *kds_bufptr; - kd_buf *kds_buflast; - kd_buf *kds_readlast; + union kds_ptr kds_next; + uint32_t kds_bufindx; + uint32_t kds_bufcnt; + uint32_t kds_readlast; + boolean_t kds_lostevents; + uint64_t kds_timestamp; kd_buf kds_records[EVENTS_PER_STORAGE_UNIT]; }; @@ -120,34 +144,52 @@ struct kd_storage { #define MAX_BUFFER_SIZE (1024 * 1024 * 128) #define N_STORAGE_UNITS_PER_BUFFER (MAX_BUFFER_SIZE / sizeof(struct kd_storage)) - struct kd_storage_buffers { struct kd_storage *kdsb_addr; uint32_t kdsb_size; }; - -struct kd_storage *kds_free_list = NULL; +#define KDS_PTR_NULL 0xffffffff struct kd_storage_buffers *kd_bufs = NULL; int n_storage_units = 0; int n_storage_buffers = 0; +int n_storage_threshold = 0; +int kds_waiter = 0; +int kde_waiter = 0; +#pragma pack(0) struct kd_bufinfo { - struct kd_storage *kd_list_head; - struct kd_storage *kd_list_tail; - struct kd_storage *kd_active; - uint64_t kd_prev_timebase; + union kds_ptr kd_list_head; + union kds_ptr kd_list_tail; + boolean_t kd_lostevents; + uint32_t _pad; + uint64_t 
kd_prev_timebase; + uint32_t num_bufs; } __attribute__(( aligned(CPU_CACHE_SIZE) )); +struct kd_ctrl_page_t { + union kds_ptr kds_free_list; + uint32_t enabled :1; + uint32_t _pad0 :31; + int kds_inuse_count; + uint32_t kdebug_flags; + uint32_t kdebug_slowcheck; + uint32_t _pad1; + struct { + uint64_t tsc_base; + uint64_t ns_base; + } cpu_timebase[32]; // should be max number of actual logical cpus +} kd_ctrl_page = {.kds_free_list = {.raw = KDS_PTR_NULL}, .enabled = 0, .kds_inuse_count = 0, .kdebug_flags = 0, .kdebug_slowcheck = SLOW_NOLOG}; +#pragma pack() + struct kd_bufinfo *kdbip = NULL; -#define KDCOPYBUF_COUNT 2048 +#define KDCOPYBUF_COUNT 8192 #define KDCOPYBUF_SIZE (KDCOPYBUF_COUNT * sizeof(kd_buf)) kd_buf *kdcopybuf = NULL; unsigned int nkdbufs = 8192; -unsigned int kdebug_flags = 0; unsigned int kdlog_beg=0; unsigned int kdlog_end=0; unsigned int kdlog_value1=0; @@ -155,6 +197,7 @@ unsigned int kdlog_value2=0; unsigned int kdlog_value3=0; unsigned int kdlog_value4=0; +static lck_spin_t * kdw_spin_lock; static lck_spin_t * kds_spin_lock; static lck_mtx_t * kd_trace_mtx_sysctl; static lck_grp_t * kd_trace_mtx_sysctl_grp; @@ -185,10 +228,21 @@ unsigned int kd_mapcount = 0; vm_offset_t kd_maptomem = 0; off_t RAW_file_offset = 0; +int RAW_file_written = 0; + +#define RAW_FLUSH_SIZE (2 * 1024 * 1024) + pid_t global_state_pid = -1; /* Used to control exclusive use of kd_buffer */ -#define DBG_FUNC_MASK 0xfffffffc +#define DBG_FUNC_MASK 0xfffffffc + +#define INTERRUPT 0x01050000 +#define MACH_vmfault 0x01300008 +#define BSC_SysCall 0x040c0000 +#define MACH_SysCall 0x010c0000 +#define DBG_SCALL_MASK 0xffff0000 + /* task to string structure */ struct tts @@ -202,10 +256,10 @@ typedef struct tts tts_t; struct krt { - kd_threadmap *map; /* pointer to the map buffer */ - int count; - int maxcount; - struct tts *atts; + kd_threadmap *map; /* pointer to the map buffer */ + int count; + int maxcount; + struct tts *atts; }; typedef struct krt krt_t; @@ -215,24 +269,102 
@@ typedef void (*kd_chudhook_fn) (uint32_t debugid, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5); -kd_chudhook_fn kdebug_chudhook = 0; /* pointer to CHUD toolkit function */ +volatile kd_chudhook_fn kdebug_chudhook = 0; /* pointer to CHUD toolkit function */ __private_extern__ void stackshot_lock_init( void ) __attribute__((section("__TEXT, initcode"))); -/* Support syscall SYS_kdebug_trace */ -int -kdebug_trace(__unused struct proc *p, struct kdebug_trace_args *uap, __unused int32_t *retval) +static void +kdbg_set_tracing_enabled(boolean_t enabled) { - if ( (kdebug_enable == 0) ) - return(EINVAL); - - kernel_debug(uap->code, uap->arg1, uap->arg2, uap->arg3, uap->arg4, 0); - return(0); + int s = ml_set_interrupts_enabled(FALSE); + lck_spin_lock(kds_spin_lock); + + if (enabled) { + kdebug_enable |= KDEBUG_ENABLE_TRACE; + kd_ctrl_page.kdebug_slowcheck &= ~SLOW_NOLOG; + kd_ctrl_page.enabled = 1; + } else { + kdebug_enable &= ~KDEBUG_ENABLE_TRACE; + kd_ctrl_page.kdebug_slowcheck |= SLOW_NOLOG; + kd_ctrl_page.enabled = 0; + } + lck_spin_unlock(kds_spin_lock); + ml_set_interrupts_enabled(s); } +static void +kdbg_set_flags(int slowflag, int enableflag, boolean_t enabled) +{ + int s = ml_set_interrupts_enabled(FALSE); + lck_spin_lock(kds_spin_lock); + + if (enabled) { + kd_ctrl_page.kdebug_slowcheck |= slowflag; + kdebug_enable |= enableflag; + } else { + kd_ctrl_page.kdebug_slowcheck &= ~slowflag; + kdebug_enable &= ~enableflag; + } + lck_spin_unlock(kds_spin_lock); + ml_set_interrupts_enabled(s); +} + + +#ifdef NATIVE_TRACE_FACILITY +void +disable_wrap(uint32_t *old_slowcheck, uint32_t *old_flags) +{ + int s = ml_set_interrupts_enabled(FALSE); + lck_spin_lock(kds_spin_lock); + + *old_slowcheck = kd_ctrl_page.kdebug_slowcheck; + *old_flags = kd_ctrl_page.kdebug_flags; + + kd_ctrl_page.kdebug_flags &= ~KDBG_WRAPPED; + kd_ctrl_page.kdebug_flags |= KDBG_NOWRAP; + + lck_spin_unlock(kds_spin_lock); + ml_set_interrupts_enabled(s); +} + +void 
+enable_wrap(uint32_t old_slowcheck, boolean_t lostevents) +{ + int s = ml_set_interrupts_enabled(FALSE); + lck_spin_lock(kds_spin_lock); + + kd_ctrl_page.kdebug_flags &= ~KDBG_NOWRAP; + + if ( !(old_slowcheck & SLOW_NOLOG)) + kd_ctrl_page.kdebug_slowcheck &= ~SLOW_NOLOG; + + if (lostevents == TRUE) + kd_ctrl_page.kdebug_flags |= KDBG_WRAPPED; + + lck_spin_unlock(kds_spin_lock); + ml_set_interrupts_enabled(s); +} + +void trace_set_timebases(__unused uint64_t tsc, __unused uint64_t ns) +{ +} +#else +/* Begin functions that are defined twice */ +void trace_set_timebases(uint64_t tsc, uint64_t ns) +{ + int cpu = cpu_number(); + kd_ctrl_page.cpu_timebase[cpu].tsc_base = tsc; + kd_ctrl_page.cpu_timebase[cpu].ns_base = ns; +} + +#endif static int -create_buffers(void) +#if defined(__i386__) || defined(__x86_64__) +create_buffers(boolean_t early_trace) +#else +create_buffers(__unused boolean_t early_trace) +#endif { int i; int p_buffer_size; @@ -240,6 +372,42 @@ create_buffers(void) int f_buffers; int error = 0; + /* + * get the number of cpus and cache it + */ +#if defined(__i386__) || defined(__x86_64__) + if (early_trace == TRUE) { + /* + * we've started tracing before the + * IOKit has even started running... 
just + * use the static max value + */ + kd_cpus = max_ncpus; + } else +#endif + { + host_basic_info_data_t hinfo; + mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT; + +#define BSD_HOST 1 + host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count); + kd_cpus = hinfo.logical_cpu_max; + } + if (kmem_alloc(kernel_map, (vm_offset_t *)&kdbip, sizeof(struct kd_bufinfo) * kd_cpus) != KERN_SUCCESS) { + error = ENOSPC; + goto out; + } + + trace_handler_map_bufinfo((uintptr_t)kdbip, sizeof(struct kd_bufinfo) * kd_cpus); + +#if !defined(NATIVE_TRACE_FACILITY) + for(i=0;i<(int)kd_cpus;i++) { + get_nanotime_timebases(i, + &kd_ctrl_page.cpu_timebase[i].tsc_base, + &kd_ctrl_page.cpu_timebase[i].ns_base); + } +#endif + if (nkdbufs < (kd_cpus * EVENTS_PER_STORAGE_UNIT * MIN_STORAGE_UNITS_PER_CPU)) n_storage_units = kd_cpus * MIN_STORAGE_UNITS_PER_CPU; else @@ -275,6 +443,8 @@ create_buffers(void) error = ENOSPC; goto out; } + bzero(kd_bufs[i].kdsb_addr, f_buffer_size); + kd_bufs[i].kdsb_size = f_buffer_size; } if (p_buffer_size) { @@ -282,8 +452,11 @@ create_buffers(void) error = ENOSPC; goto out; } + bzero(kd_bufs[i].kdsb_addr, p_buffer_size); + kd_bufs[i].kdsb_size = p_buffer_size; } + n_storage_units = 0; for (i = 0; i < n_storage_buffers; i++) { struct kd_storage *kds; @@ -293,16 +466,31 @@ create_buffers(void) n_elements = kd_bufs[i].kdsb_size / sizeof(struct kd_storage); kds = kd_bufs[i].kdsb_addr; + trace_handler_map_buffer(i, (uintptr_t)kd_bufs[i].kdsb_addr, kd_bufs[i].kdsb_size); + for (n = 0; n < n_elements; n++) { - kds[n].kds_next = kds_free_list; - kds_free_list = &kds[n]; + kds[n].kds_next.buffer_index = kd_ctrl_page.kds_free_list.buffer_index; + kds[n].kds_next.offset = kd_ctrl_page.kds_free_list.offset; - kds[n].kds_buflast = &kds[n].kds_records[EVENTS_PER_STORAGE_UNIT]; + kd_ctrl_page.kds_free_list.buffer_index = i; + kd_ctrl_page.kds_free_list.offset = n; } + n_storage_units += n_elements; } + bzero((char *)kdbip, sizeof(struct kd_bufinfo) * 
kd_cpus); - kdebug_flags |= KDBG_BUFINIT; + for (i = 0; i < (int)kd_cpus; i++) { + kdbip[i].kd_list_head.raw = KDS_PTR_NULL; + kdbip[i].kd_list_tail.raw = KDS_PTR_NULL; + kdbip[i].kd_lostevents = FALSE; + kdbip[i].num_bufs = 0; + } + + kd_ctrl_page.kdebug_flags |= KDBG_BUFINIT; + + kd_ctrl_page.kds_inuse_count = 0; + n_storage_threshold = n_storage_units / 2; out: if (error) delete_buffers(); @@ -318,8 +506,10 @@ delete_buffers(void) if (kd_bufs) { for (i = 0; i < n_storage_buffers; i++) { - if (kd_bufs[i].kdsb_addr) + if (kd_bufs[i].kdsb_addr) { kmem_free(kernel_map, (vm_offset_t)kd_bufs[i].kdsb_addr, (vm_size_t)kd_bufs[i].kdsb_size); + trace_handler_unmap_buffer(i); + } } kmem_free(kernel_map, (vm_offset_t)kd_bufs, (vm_size_t)(n_storage_buffers * sizeof(struct kd_storage_buffers))); @@ -331,58 +521,92 @@ delete_buffers(void) kdcopybuf = NULL; } - kds_free_list = NULL; + kd_ctrl_page.kds_free_list.raw = KDS_PTR_NULL; - kdebug_flags &= ~KDBG_BUFINIT; + if (kdbip) { + trace_handler_unmap_bufinfo(); + + kmem_free(kernel_map, (vm_offset_t)kdbip, sizeof(struct kd_bufinfo) * kd_cpus); + + kdbip = NULL; + } + kd_ctrl_page.kdebug_flags &= ~KDBG_BUFINIT; } -static void -release_storage_unit(struct kd_bufinfo *kdbp, struct kd_storage *kdsp) +#ifdef NATIVE_TRACE_FACILITY +void +release_storage_unit(int cpu, uint32_t kdsp_raw) { - int s = 0; + struct kd_storage *kdsp_actual; + struct kd_bufinfo *kdbp; + union kds_ptr kdsp; + + kdsp.raw = kdsp_raw; + s = ml_set_interrupts_enabled(FALSE); lck_spin_lock(kds_spin_lock); - if (kdsp == kdbp->kd_list_head) { + kdbp = &kdbip[cpu]; + + if (kdsp.raw == kdbp->kd_list_head.raw) { /* - * its possible for the storage unit pointed to + * it's possible for the storage unit pointed to * by kdsp to have already been stolen... so - * check to see if its still the head of the list + * check to see if it's still the head of the list * now that we're behind the lock that protects * adding and removing from the queue... 
* since we only ever release and steal units from - * that position, if its no longer the head + * that position, if it's no longer the head * we having nothing to do in this context */ - kdbp->kd_list_head = kdsp->kds_next; + kdsp_actual = POINTER_FROM_KDS_PTR(kdsp); + kdbp->kd_list_head = kdsp_actual->kds_next; - kdsp->kds_next = kds_free_list; - kds_free_list = kdsp; + kdsp_actual->kds_next = kd_ctrl_page.kds_free_list; + kd_ctrl_page.kds_free_list = kdsp; + + kd_ctrl_page.kds_inuse_count--; } lck_spin_unlock(kds_spin_lock); ml_set_interrupts_enabled(s); } -/* - * Interrupts are disabled when we enter this routine. - */ -static struct kd_storage * -allocate_storage_unit(struct kd_bufinfo *kdbp) +boolean_t +allocate_storage_unit(int cpu) { - struct kd_storage *kdsp; - struct kd_bufinfo *kdbp_vict, *kdbp_try; + union kds_ptr kdsp; + struct kd_storage *kdsp_actual; + struct kd_bufinfo *kdbp, *kdbp_vict, *kdbp_try; uint64_t oldest_ts, ts; + boolean_t retval = TRUE; + int s = 0; + s = ml_set_interrupts_enabled(FALSE); lck_spin_lock(kds_spin_lock); - if ((kdsp = kds_free_list)) - kds_free_list = kdsp->kds_next; - else { - if (kdebug_flags & KDBG_NOWRAP) { - kdebug_slowcheck |= SLOW_NOLOG; + kdbp = &kdbip[cpu]; + + /* If someone beat us to the allocate, return success */ + if (kdbp->kd_list_tail.raw != KDS_PTR_NULL) { + kdsp_actual = POINTER_FROM_KDS_PTR(kdbp->kd_list_tail); + + if (kdsp_actual->kds_bufindx < EVENTS_PER_STORAGE_UNIT) + goto out; + } + + if ((kdsp = kd_ctrl_page.kds_free_list).raw != KDS_PTR_NULL) { + kdsp_actual = POINTER_FROM_KDS_PTR(kdsp); + kd_ctrl_page.kds_free_list = kdsp_actual->kds_next; + + kd_ctrl_page.kds_inuse_count++; + } else { + if (kd_ctrl_page.kdebug_flags & KDBG_NOWRAP) { + kd_ctrl_page.kdebug_slowcheck |= SLOW_NOLOG; + kdbp->kd_lostevents = TRUE; + retval = FALSE; goto out; } kdbp_vict = NULL; @@ -390,22 +614,25 @@ allocate_storage_unit(struct kd_bufinfo *kdbp) for (kdbp_try = &kdbip[0]; kdbp_try < &kdbip[kd_cpus]; kdbp_try++) { - if 
((kdsp = kdbp_try->kd_list_head) == NULL) { + if (kdbp_try->kd_list_head.raw == KDS_PTR_NULL) { /* * no storage unit to steal */ continue; } - if (kdsp == kdbp_try->kd_active) { + + kdsp_actual = POINTER_FROM_KDS_PTR(kdbp_try->kd_list_head); + + if (kdsp_actual->kds_bufcnt < EVENTS_PER_STORAGE_UNIT) { /* * make sure we don't steal the storage unit - * being actively recorded to... this state - * also implies that this is the only unit assigned - * to this CPU, so we can immediately move on + * being actively recorded to... need to + * move on because we don't want an out-of-order + * set of events showing up later */ continue; } - ts = kdbg_get_timestamp(&(kdbp_try->kd_list_head->kds_records[0])); + ts = kdbg_get_timestamp(&kdsp_actual->kds_records[0]); if (ts < oldest_ts) { /* @@ -417,37 +644,52 @@ allocate_storage_unit(struct kd_bufinfo *kdbp) kdbp_vict = kdbp_try; } } -#if 1 if (kdbp_vict == NULL) { kdebug_enable = 0; - - panic("allocate_storage_unit: no storage units available\n"); + kd_ctrl_page.enabled = 0; + retval = FALSE; + goto out; } -#endif kdsp = kdbp_vict->kd_list_head; + kdsp_actual = POINTER_FROM_KDS_PTR(kdsp); - kdbp_vict->kd_list_head = kdsp->kds_next; + kdbp_vict->kd_list_head = kdsp_actual->kds_next; - kdebug_flags |= KDBG_WRAPPED; + kd_ctrl_page.kdebug_flags |= KDBG_WRAPPED; } - kdsp->kds_next = NULL; - kdsp->kds_bufptr = &kdsp->kds_records[0]; - kdsp->kds_readlast = kdsp->kds_bufptr; + kdsp_actual->kds_timestamp = mach_absolute_time(); + kdsp_actual->kds_next.raw = KDS_PTR_NULL; + kdsp_actual->kds_bufcnt = 0; + kdsp_actual->kds_readlast = 0; + + kdsp_actual->kds_lostevents = kdbp->kd_lostevents; + kdbp->kd_lostevents = FALSE; + kdsp_actual->kds_bufindx = 0; - if (kdbp->kd_list_head == NULL) + if (kdbp->kd_list_head.raw == KDS_PTR_NULL) kdbp->kd_list_head = kdsp; else - kdbp->kd_list_tail->kds_next = kdsp; + POINTER_FROM_KDS_PTR(kdbp->kd_list_tail)->kds_next = kdsp; kdbp->kd_list_tail = kdsp; out: lck_spin_unlock(kds_spin_lock); + 
ml_set_interrupts_enabled(s); - return (kdsp); + return (retval); } +#endif +void +kernel_debug_internal( + uint32_t debugid, + uintptr_t arg1, + uintptr_t arg2, + uintptr_t arg3, + uintptr_t arg4, + uintptr_t arg5, + int entropy_flag); - -static void +__attribute__((always_inline)) void kernel_debug_internal( uint32_t debugid, uintptr_t arg1, @@ -459,92 +701,118 @@ kernel_debug_internal( { struct proc *curproc; uint64_t now; - int s; + uint32_t bindx; + boolean_t s; kd_buf *kd; int cpu; struct kd_bufinfo *kdbp; - struct kd_storage *kdsp; + struct kd_storage *kdsp_actual; - s = ml_set_interrupts_enabled(FALSE); - now = mach_absolute_time() & KDBG_TIMESTAMP_MASK; - cpu = cpu_number(); - - if (kdebug_enable & KDEBUG_ENABLE_CHUD) { - if (kdebug_chudhook) - kdebug_chudhook(debugid, arg1, arg2, arg3, arg4, arg5); - - if ( !(kdebug_enable & (KDEBUG_ENABLE_ENTROPY | KDEBUG_ENABLE_TRACE))) - goto out; - } - if (kdebug_slowcheck == 0) - goto record_trace; + if (kd_ctrl_page.kdebug_slowcheck) { - if (entropy_flag && (kdebug_enable & KDEBUG_ENABLE_ENTROPY)) { - if (kd_entropy_indx < kd_entropy_count) { - kd_entropy_buffer [ kd_entropy_indx] = mach_absolute_time(); - kd_entropy_indx++; - } - - if (kd_entropy_indx == kd_entropy_count) { + if (kdebug_enable & KDEBUG_ENABLE_CHUD) { + kd_chudhook_fn chudhook; /* - * Disable entropy collection + * Mask interrupts to minimize the interval across + * which the driver providing the hook could be + * unloaded. 
*/ - kdebug_enable &= ~KDEBUG_ENABLE_ENTROPY; - kdebug_slowcheck &= ~SLOW_ENTROPY; + s = ml_set_interrupts_enabled(FALSE); + chudhook = kdebug_chudhook; + if (chudhook) + chudhook(debugid, arg1, arg2, arg3, arg4, arg5); + ml_set_interrupts_enabled(s); } - } - if ( (kdebug_slowcheck & SLOW_NOLOG) ) - goto out; + if ((kdebug_enable & KDEBUG_ENABLE_ENTROPY) && entropy_flag) { + + now = mach_absolute_time(); + + s = ml_set_interrupts_enabled(FALSE); + lck_spin_lock(kds_spin_lock); + + if (kdebug_enable & KDEBUG_ENABLE_ENTROPY) { + + if (kd_entropy_indx < kd_entropy_count) { + kd_entropy_buffer[kd_entropy_indx] = now; + kd_entropy_indx++; + } + if (kd_entropy_indx == kd_entropy_count) { + /* + * Disable entropy collection + */ + kdebug_enable &= ~KDEBUG_ENABLE_ENTROPY; + kd_ctrl_page.kdebug_slowcheck &= ~SLOW_ENTROPY; + } + } + lck_spin_unlock(kds_spin_lock); + ml_set_interrupts_enabled(s); + } + if ( (kd_ctrl_page.kdebug_slowcheck & SLOW_NOLOG) || !(kdebug_enable & KDEBUG_ENABLE_TRACE)) + goto out1; - if (kdebug_flags & KDBG_PIDCHECK) { - /* - * If kdebug flag is not set for current proc, return - */ - curproc = current_proc(); + if ( !ml_at_interrupt_context()) { + if (kd_ctrl_page.kdebug_flags & KDBG_PIDCHECK) { + /* + * If kdebug flag is not set for current proc, return + */ + curproc = current_proc(); - if ((curproc && !(curproc->p_kdebug)) && - ((debugid & 0xffff0000) != (MACHDBG_CODE(DBG_MACH_SCHED, 0) | DBG_FUNC_NONE))) - goto out; - } - else if (kdebug_flags & KDBG_PIDEXCLUDE) { - /* - * If kdebug flag is set for current proc, return - */ - curproc = current_proc(); + if ((curproc && !(curproc->p_kdebug)) && + ((debugid & 0xffff0000) != (MACHDBG_CODE(DBG_MACH_SCHED, 0) | DBG_FUNC_NONE))) + goto out1; + } + else if (kd_ctrl_page.kdebug_flags & KDBG_PIDEXCLUDE) { + /* + * If kdebug flag is set for current proc, return + */ + curproc = current_proc(); - if ((curproc && curproc->p_kdebug) && - ((debugid & 0xffff0000) != (MACHDBG_CODE(DBG_MACH_SCHED, 0) | 
DBG_FUNC_NONE))) - goto out; - } - if (kdebug_flags & KDBG_RANGECHECK) { - if ((debugid < kdlog_beg) - || ((debugid >= kdlog_end) && (debugid >> 24 != DBG_TRACE))) - goto out; - } - else if (kdebug_flags & KDBG_VALCHECK) { - if ((debugid & DBG_FUNC_MASK) != kdlog_value1 && - (debugid & DBG_FUNC_MASK) != kdlog_value2 && - (debugid & DBG_FUNC_MASK) != kdlog_value3 && - (debugid & DBG_FUNC_MASK) != kdlog_value4 && - (debugid >> 24 != DBG_TRACE)) - goto out; + if ((curproc && curproc->p_kdebug) && + ((debugid & 0xffff0000) != (MACHDBG_CODE(DBG_MACH_SCHED, 0) | DBG_FUNC_NONE))) + goto out1; + } + } + if (kd_ctrl_page.kdebug_flags & KDBG_RANGECHECK) { + if ((debugid < kdlog_beg) + || ((debugid >= kdlog_end) && (debugid >> 24 != DBG_TRACE))) + goto out1; + } + else if (kd_ctrl_page.kdebug_flags & KDBG_VALCHECK) { + if ((debugid & DBG_FUNC_MASK) != kdlog_value1 && + (debugid & DBG_FUNC_MASK) != kdlog_value2 && + (debugid & DBG_FUNC_MASK) != kdlog_value3 && + (debugid & DBG_FUNC_MASK) != kdlog_value4 && + (debugid >> 24 != DBG_TRACE)) + goto out1; + } } - -record_trace: + disable_preemption(); + cpu = cpu_number(); kdbp = &kdbip[cpu]; - - if ((kdsp = kdbp->kd_active) == NULL) { - if ((kdsp = allocate_storage_unit(kdbp)) == NULL) { +retry_q: + if (kdbp->kd_list_tail.raw != KDS_PTR_NULL) { + kdsp_actual = POINTER_FROM_KDS_PTR(kdbp->kd_list_tail); + bindx = kdsp_actual->kds_bufindx; + } else + kdsp_actual = NULL; + + if (kdsp_actual == NULL || bindx >= EVENTS_PER_STORAGE_UNIT) { + if (allocate_storage_unit(cpu) == FALSE) { /* * this can only happen if wrapping * has been disabled */ goto out; } - kdbp->kd_active = kdsp; + goto retry_q; } - kd = kdsp->kds_bufptr; + now = mach_absolute_time() & KDBG_TIMESTAMP_MASK; + + if ( !OSCompareAndSwap(bindx, bindx + 1, &kdsp_actual->kds_bufindx)) + goto retry_q; + + kd = &kdsp_actual->kds_records[bindx]; kd->debugid = debugid; kd->arg1 = arg1; @@ -555,12 +823,56 @@ kernel_debug_internal( kdbg_set_timestamp_and_cpu(kd, now, cpu); - 
kdsp->kds_bufptr++; - - if (kdsp->kds_bufptr >= kdsp->kds_buflast) - kdbp->kd_active = NULL; + OSAddAtomic(1, &kdsp_actual->kds_bufcnt); out: - ml_set_interrupts_enabled(s); + enable_preemption(); +out1: + if ((kds_waiter && kd_ctrl_page.kds_inuse_count >= n_storage_threshold) || + (kde_waiter && kd_entropy_indx >= kd_entropy_count)) { + uint32_t etype; + uint32_t stype; + + etype = debugid & DBG_FUNC_MASK; + stype = debugid & DBG_SCALL_MASK; + + if (etype == INTERRUPT || etype == MACH_vmfault || + stype == BSC_SysCall || stype == MACH_SysCall) { + + boolean_t need_kds_wakeup = FALSE; + boolean_t need_kde_wakeup = FALSE; + + /* + * try to take the lock here to synchronize with the + * waiter entering the blocked state... use the try + * mode to prevent deadlocks caused by re-entering this + * routine due to various trace points triggered in the + * lck_spin_sleep_xxxx routines used to actually enter + * one of our 2 wait conditions... no problem if we fail, + * there will be lots of additional events coming in that + * will eventually succeed in grabbing this lock + */ + s = ml_set_interrupts_enabled(FALSE); + + if (lck_spin_try_lock(kdw_spin_lock)) { + + if (kds_waiter && kd_ctrl_page.kds_inuse_count >= n_storage_threshold) { + kds_waiter = 0; + need_kds_wakeup = TRUE; + } + if (kde_waiter && kd_entropy_indx >= kd_entropy_count) { + kde_waiter = 0; + need_kde_wakeup = TRUE; + } + lck_spin_unlock(kdw_spin_lock); + } + ml_set_interrupts_enabled(s); + + if (need_kds_wakeup == TRUE) + wakeup(&kds_waiter); + if (need_kde_wakeup == TRUE) + wakeup(&kde_waiter); + } + } } void @@ -584,27 +896,32 @@ kernel_debug1( uintptr_t arg4, uintptr_t arg5) { - kernel_debug_internal(debugid, arg1, arg2, arg3, arg4, arg5, 0); + kernel_debug_internal(debugid, arg1, arg2, arg3, arg4, arg5, 1); } -static void -kdbg_lock_init(void) +/* + * Support syscall SYS_kdebug_trace + */ +int +kdebug_trace(__unused struct proc *p, struct kdebug_trace_args *uap, __unused int32_t *retval) { - 
host_basic_info_data_t hinfo; - mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT; + if ( __probable(kdebug_enable == 0) ) + return(EINVAL); + + kernel_debug_internal(uap->code, uap->arg1, uap->arg2, uap->arg3, uap->arg4, (uintptr_t)thread_tid(current_thread()), 0); - if (kdebug_flags & KDBG_LOCKINIT) - return; + return(0); +} - /* get the number of cpus and cache it */ -#define BSD_HOST 1 - host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count); - kd_cpus = hinfo.logical_cpu_max; - if (kmem_alloc(kernel_map, (vm_offset_t *)&kdbip, - sizeof(struct kd_bufinfo) * kd_cpus) != KERN_SUCCESS) - return; +static void +kdbg_lock_init(void) +{ + if (kd_ctrl_page.kdebug_flags & KDBG_LOCKINIT) + return; + trace_handler_map_ctrl_page((uintptr_t)&kd_ctrl_page, sizeof(kd_ctrl_page), sizeof(struct kd_storage), sizeof(union kds_ptr)); + /* * allocate lock group attribute and group */ @@ -618,25 +935,26 @@ kdbg_lock_init(void) /* - * allocate and initialize spin lock and mutex + * allocate and initialize mutex's */ kd_trace_mtx_sysctl = lck_mtx_alloc_init(kd_trace_mtx_sysctl_grp, kd_trace_mtx_sysctl_attr); kds_spin_lock = lck_spin_alloc_init(kd_trace_mtx_sysctl_grp, kd_trace_mtx_sysctl_attr); + kdw_spin_lock = lck_spin_alloc_init(kd_trace_mtx_sysctl_grp, kd_trace_mtx_sysctl_attr); - kdebug_flags |= KDBG_LOCKINIT; + kd_ctrl_page.kdebug_flags |= KDBG_LOCKINIT; } int -kdbg_bootstrap(void) +kdbg_bootstrap(boolean_t early_trace) { - kdebug_flags &= ~KDBG_WRAPPED; + kd_ctrl_page.kdebug_flags &= ~KDBG_WRAPPED; - return (create_buffers()); + return (create_buffers(early_trace)); } int -kdbg_reinit(void) +kdbg_reinit(boolean_t early_trace) { int ret = 0; @@ -645,8 +963,7 @@ kdbg_reinit(void) * First make sure we're not in * the middle of cutting a trace */ - kdebug_enable &= ~KDEBUG_ENABLE_TRACE; - kdebug_slowcheck |= SLOW_NOLOG; + kdbg_set_tracing_enabled(FALSE); /* * make sure the SLOW_NOLOG is seen @@ -657,14 +974,17 @@ kdbg_reinit(void) delete_buffers(); - if 
((kdebug_flags & KDBG_MAPINIT) && kd_mapsize && kd_mapptr) { + if ((kd_ctrl_page.kdebug_flags & KDBG_MAPINIT) && kd_mapsize && kd_mapptr) { kmem_free(kernel_map, (vm_offset_t)kd_mapptr, kd_mapsize); - kdebug_flags &= ~KDBG_MAPINIT; + kd_ctrl_page.kdebug_flags &= ~KDBG_MAPINIT; kd_mapsize = 0; kd_mapptr = (kd_threadmap *) 0; kd_mapcount = 0; } - ret = kdbg_bootstrap(); + ret = kdbg_bootstrap(early_trace); + + RAW_file_offset = 0; + RAW_file_written = 0; return(ret); } @@ -750,7 +1070,7 @@ kdbg_mapinit(void) vm_offset_t tts_maptomem=0; int i; - if (kdebug_flags & KDBG_MAPINIT) + if (kd_ctrl_page.kdebug_flags & KDBG_MAPINIT) return; /* @@ -821,7 +1141,7 @@ kdbg_mapinit(void) } if (kd_mapptr && tts_mapptr) { - kdebug_flags |= KDBG_MAPINIT; + kd_ctrl_page.kdebug_flags |= KDBG_MAPINIT; /* * Initialize thread map data @@ -847,9 +1167,7 @@ kdbg_clear(void) * First make sure we're not in * the middle of cutting a trace */ - - kdebug_enable &= ~KDEBUG_ENABLE_TRACE; - kdebug_slowcheck = SLOW_NOLOG; + kdbg_set_tracing_enabled(FALSE); /* * make sure the SLOW_NOLOG is seen @@ -858,24 +1176,24 @@ kdbg_clear(void) */ IOSleep(100); - if (kdebug_enable & KDEBUG_ENABLE_ENTROPY) - kdebug_slowcheck |= SLOW_ENTROPY; - global_state_pid = -1; - kdebug_flags &= (unsigned int)~KDBG_CKTYPES; - kdebug_flags &= ~(KDBG_NOWRAP | KDBG_RANGECHECK | KDBG_VALCHECK); - kdebug_flags &= ~(KDBG_PIDCHECK | KDBG_PIDEXCLUDE); + kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; + kd_ctrl_page.kdebug_flags &= ~(KDBG_NOWRAP | KDBG_RANGECHECK | KDBG_VALCHECK); + kd_ctrl_page.kdebug_flags &= ~(KDBG_PIDCHECK | KDBG_PIDEXCLUDE); delete_buffers(); /* Clean up the thread map buffer */ - kdebug_flags &= ~KDBG_MAPINIT; + kd_ctrl_page.kdebug_flags &= ~KDBG_MAPINIT; if (kd_mapptr) { kmem_free(kernel_map, (vm_offset_t)kd_mapptr, kd_mapsize); kd_mapptr = (kd_threadmap *) 0; } kd_mapsize = 0; kd_mapcount = 0; + + RAW_file_offset = 0; + RAW_file_written = 0; } int @@ -896,17 +1214,17 @@ kdbg_setpid(kd_regtype *kdr) 
/* * turn on pid check for this and all pids */ - kdebug_flags |= KDBG_PIDCHECK; - kdebug_flags &= ~KDBG_PIDEXCLUDE; - kdebug_slowcheck |= SLOW_CHECKS; - + kd_ctrl_page.kdebug_flags |= KDBG_PIDCHECK; + kd_ctrl_page.kdebug_flags &= ~KDBG_PIDEXCLUDE; + kdbg_set_flags(SLOW_CHECKS, 0, TRUE); + p->p_kdebug = 1; } else { /* * turn off pid check for this pid value * Don't turn off all pid checking though * - * kdebug_flags &= ~KDBG_PIDCHECK; + * kd_ctrl_page.kdebug_flags &= ~KDBG_PIDCHECK; */ p->p_kdebug = 0; } @@ -938,9 +1256,9 @@ kdbg_setpidex(kd_regtype *kdr) /* * turn on pid exclusion */ - kdebug_flags |= KDBG_PIDEXCLUDE; - kdebug_flags &= ~KDBG_PIDCHECK; - kdebug_slowcheck |= SLOW_CHECKS; + kd_ctrl_page.kdebug_flags |= KDBG_PIDEXCLUDE; + kd_ctrl_page.kdebug_flags &= ~KDBG_PIDCHECK; + kdbg_set_flags(SLOW_CHECKS, 0, TRUE); p->p_kdebug = 1; } @@ -949,7 +1267,7 @@ kdbg_setpidex(kd_regtype *kdr) * turn off pid exclusion for this pid value * Don't turn off all pid exclusion though * - * kdebug_flags &= ~KDBG_PIDEXCLUDE; + * kd_ctrl_page.kdebug_flags &= ~KDBG_PIDEXCLUDE; */ p->p_kdebug = 0; } @@ -975,14 +1293,8 @@ kdbg_setrtcdec(kd_regtype *kdr) if (decval && decval < KDBG_MINRTCDEC) ret = EINVAL; -#ifdef ppc - else { - maxDec = decval ? 
decval : 0x7FFFFFFF; /* Set or reset the max decrementer */ - } -#else else ret = ENOTSUP; -#endif /* ppc */ return(ret); } @@ -999,10 +1311,10 @@ kdbg_setreg(kd_regtype * kdr) val_2 = (kdr->value2 & 0xff); kdlog_beg = (val_1<<24); kdlog_end = (val_2<<24); - kdebug_flags &= (unsigned int)~KDBG_CKTYPES; - kdebug_flags &= ~KDBG_VALCHECK; /* Turn off specific value check */ - kdebug_flags |= (KDBG_RANGECHECK | KDBG_CLASSTYPE); - kdebug_slowcheck |= SLOW_CHECKS; + kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; + kd_ctrl_page.kdebug_flags &= ~KDBG_VALCHECK; /* Turn off specific value check */ + kd_ctrl_page.kdebug_flags |= (KDBG_RANGECHECK | KDBG_CLASSTYPE); + kdbg_set_flags(SLOW_CHECKS, 0, TRUE); break; case KDBG_SUBCLSTYPE : val_1 = (kdr->value1 & 0xff); @@ -1010,36 +1322,36 @@ kdbg_setreg(kd_regtype * kdr) val = val_2 + 1; kdlog_beg = ((val_1<<24) | (val_2 << 16)); kdlog_end = ((val_1<<24) | (val << 16)); - kdebug_flags &= (unsigned int)~KDBG_CKTYPES; - kdebug_flags &= ~KDBG_VALCHECK; /* Turn off specific value check */ - kdebug_flags |= (KDBG_RANGECHECK | KDBG_SUBCLSTYPE); - kdebug_slowcheck |= SLOW_CHECKS; + kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; + kd_ctrl_page.kdebug_flags &= ~KDBG_VALCHECK; /* Turn off specific value check */ + kd_ctrl_page.kdebug_flags |= (KDBG_RANGECHECK | KDBG_SUBCLSTYPE); + kdbg_set_flags(SLOW_CHECKS, 0, TRUE); break; case KDBG_RANGETYPE : kdlog_beg = (kdr->value1); kdlog_end = (kdr->value2); - kdebug_flags &= (unsigned int)~KDBG_CKTYPES; - kdebug_flags &= ~KDBG_VALCHECK; /* Turn off specific value check */ - kdebug_flags |= (KDBG_RANGECHECK | KDBG_RANGETYPE); - kdebug_slowcheck |= SLOW_CHECKS; + kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; + kd_ctrl_page.kdebug_flags &= ~KDBG_VALCHECK; /* Turn off specific value check */ + kd_ctrl_page.kdebug_flags |= (KDBG_RANGECHECK | KDBG_RANGETYPE); + kdbg_set_flags(SLOW_CHECKS, 0, TRUE); break; case KDBG_VALCHECK: kdlog_value1 = (kdr->value1); kdlog_value2 = 
(kdr->value2); kdlog_value3 = (kdr->value3); kdlog_value4 = (kdr->value4); - kdebug_flags &= (unsigned int)~KDBG_CKTYPES; - kdebug_flags &= ~KDBG_RANGECHECK; /* Turn off range check */ - kdebug_flags |= KDBG_VALCHECK; /* Turn on specific value check */ - kdebug_slowcheck |= SLOW_CHECKS; + kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; + kd_ctrl_page.kdebug_flags &= ~KDBG_RANGECHECK; /* Turn off range check */ + kd_ctrl_page.kdebug_flags |= KDBG_VALCHECK; /* Turn on specific value check */ + kdbg_set_flags(SLOW_CHECKS, 0, TRUE); break; case KDBG_TYPENONE : - kdebug_flags &= (unsigned int)~KDBG_CKTYPES; + kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; - if ( (kdebug_flags & (KDBG_RANGECHECK | KDBG_VALCHECK | KDBG_PIDCHECK | KDBG_PIDEXCLUDE)) ) - kdebug_slowcheck |= SLOW_CHECKS; + if ( (kd_ctrl_page.kdebug_flags & (KDBG_RANGECHECK | KDBG_VALCHECK | KDBG_PIDCHECK | KDBG_PIDEXCLUDE)) ) + kdbg_set_flags(SLOW_CHECKS, 0, TRUE); else - kdebug_slowcheck &= ~SLOW_CHECKS; + kdbg_set_flags(SLOW_CHECKS, 0, FALSE); kdlog_beg = 0; kdlog_end = 0; @@ -1064,8 +1376,8 @@ kdbg_getreg(__unused kd_regtype * kdr) val_2 = val_1 + 1; kdlog_beg = (val_1<<24); kdlog_end = (val_2<<24); - kdebug_flags &= (unsigned int)~KDBG_CKTYPES; - kdebug_flags |= (KDBG_RANGECHECK | KDBG_CLASSTYPE); + kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; + kd_ctrl_page.kdebug_flags |= (KDBG_RANGECHECK | KDBG_CLASSTYPE); break; case KDBG_SUBCLSTYPE : val_1 = (kdr->value1 & 0xff); @@ -1073,17 +1385,17 @@ kdbg_getreg(__unused kd_regtype * kdr) val = val_2 + 1; kdlog_beg = ((val_1<<24) | (val_2 << 16)); kdlog_end = ((val_1<<24) | (val << 16)); - kdebug_flags &= (unsigned int)~KDBG_CKTYPES; - kdebug_flags |= (KDBG_RANGECHECK | KDBG_SUBCLSTYPE); + kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; + kd_ctrl_page.kdebug_flags |= (KDBG_RANGECHECK | KDBG_SUBCLSTYPE); break; case KDBG_RANGETYPE : kdlog_beg = (kdr->value1); kdlog_end = (kdr->value2); - kdebug_flags &= (unsigned 
int)~KDBG_CKTYPES; - kdebug_flags |= (KDBG_RANGECHECK | KDBG_RANGETYPE); + kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; + kd_ctrl_page.kdebug_flags |= (KDBG_RANGECHECK | KDBG_RANGETYPE); break; case KDBG_TYPENONE : - kdebug_flags &= (unsigned int)~KDBG_CKTYPES; + kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; kdlog_beg = 0; kdlog_end = 0; break; @@ -1107,21 +1419,56 @@ kdbg_readmap(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx) if (count && (count <= kd_mapcount)) { - if ((kdebug_flags & KDBG_MAPINIT) && kd_mapsize && kd_mapptr) + if ((kd_ctrl_page.kdebug_flags & KDBG_MAPINIT) && kd_mapsize && kd_mapptr) { if (*number < kd_mapsize) ret = EINVAL; else { - if (vp) { - vn_rdwr(UIO_WRITE, vp, (caddr_t)&count, sizeof(uint32_t), RAW_file_offset, - UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx)); - RAW_file_offset += sizeof(uint32_t); - - vn_rdwr(UIO_WRITE, vp, (caddr_t)kd_mapptr, kd_mapsize, RAW_file_offset, - UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx)); + if (vp) + { + RAW_header header; + clock_sec_t secs; + clock_usec_t usecs; + char *pad_buf; + int pad_size; + + header.version_no = RAW_VERSION1; + header.thread_count = count; + + clock_get_calendar_microtime(&secs, &usecs); + header.TOD_secs = secs; + header.TOD_usecs = usecs; + + ret = vn_rdwr(UIO_WRITE, vp, (caddr_t)&header, sizeof(RAW_header), RAW_file_offset, + UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx)); + if (ret) + goto write_error; + RAW_file_offset += sizeof(RAW_header); + + ret = vn_rdwr(UIO_WRITE, vp, (caddr_t)kd_mapptr, kd_mapsize, RAW_file_offset, + UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx)); + if (ret) + goto write_error; RAW_file_offset += kd_mapsize; + pad_size = PAGE_SIZE - (RAW_file_offset & PAGE_MASK_64); + + if (pad_size) + { + pad_buf = (char 
*)kalloc(pad_size); + memset(pad_buf, 0, pad_size); + + ret = vn_rdwr(UIO_WRITE, vp, (caddr_t)pad_buf, pad_size, RAW_file_offset, + UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx)); + kfree(pad_buf, pad_size); + + if (ret) + goto write_error; + RAW_file_offset += pad_size; + } + RAW_file_written += sizeof(RAW_header) + kd_mapsize + pad_size; + } else { if (copyout(kd_mapptr, buffer, kd_mapsize)) ret = EINVAL; @@ -1134,22 +1481,24 @@ kdbg_readmap(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx) else ret = EINVAL; - if (ret && vp) { + if (ret && vp) + { count = 0; vn_rdwr(UIO_WRITE, vp, (caddr_t)&count, sizeof(uint32_t), RAW_file_offset, UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx)); RAW_file_offset += sizeof(uint32_t); + RAW_file_written += sizeof(uint32_t); } - if ((kdebug_flags & KDBG_MAPINIT) && kd_mapsize && kd_mapptr) +write_error: + if ((kd_ctrl_page.kdebug_flags & KDBG_MAPINIT) && kd_mapsize && kd_mapptr) { kmem_free(kernel_map, (vm_offset_t)kd_mapptr, kd_mapsize); - kdebug_flags &= ~KDBG_MAPINIT; + kd_ctrl_page.kdebug_flags &= ~KDBG_MAPINIT; kd_mapsize = 0; kd_mapptr = (kd_threadmap *) 0; kd_mapcount = 0; } - return(ret); } @@ -1158,44 +1507,85 @@ kdbg_getentropy (user_addr_t buffer, size_t *number, int ms_timeout) { int avail = *number; int ret = 0; + int s; + u_int64_t abstime; + u_int64_t ns; + int wait_result = THREAD_AWAKENED; + if (kd_entropy_buffer) return(EBUSY); - kd_entropy_count = avail/sizeof(mach_timespec_t); - kd_entropy_bufsize = kd_entropy_count * sizeof(mach_timespec_t); - kd_entropy_indx = 0; + if (ms_timeout < 0) + return(EINVAL); + + kd_entropy_count = avail/sizeof(uint64_t); + + if (kd_entropy_count > MAX_ENTROPY_COUNT || kd_entropy_count == 0) { + /* + * Enforce maximum entropy entries + */ + return(EINVAL); + } + kd_entropy_bufsize = kd_entropy_count * sizeof(uint64_t); /* - * Enforce maximum entropy entries here if needed * 
allocate entropy buffer */ - if (kmem_alloc(kernel_map, &kd_entropy_buftomem, - (vm_size_t)kd_entropy_bufsize) == KERN_SUCCESS) { + if (kmem_alloc(kernel_map, &kd_entropy_buftomem, (vm_size_t)kd_entropy_bufsize) == KERN_SUCCESS) { kd_entropy_buffer = (uint64_t *) kd_entropy_buftomem; } else { kd_entropy_buffer = (uint64_t *) 0; kd_entropy_count = 0; - kd_entropy_indx = 0; - return (EINVAL); + + return (ENOMEM); } + kd_entropy_indx = 0; - if (ms_timeout < 10) - ms_timeout = 10; + KERNEL_DEBUG_CONSTANT(0xbbbbf000 | DBG_FUNC_START, ms_timeout, kd_entropy_count, 0, 0, 0); /* * Enable entropy sampling */ - kdebug_enable |= KDEBUG_ENABLE_ENTROPY; - kdebug_slowcheck |= SLOW_ENTROPY; + kdbg_set_flags(SLOW_ENTROPY, KDEBUG_ENABLE_ENTROPY, TRUE); - ret = tsleep (kdbg_getentropy, PRIBIO | PCATCH, "kd_entropy", (ms_timeout/(1000/HZ))); + if (ms_timeout) { + ns = (u_int64_t)ms_timeout * (u_int64_t)(1000 * 1000); + nanoseconds_to_absolutetime(ns, &abstime ); + clock_absolutetime_interval_to_deadline( abstime, &abstime ); + } else + abstime = 0; + + s = ml_set_interrupts_enabled(FALSE); + lck_spin_lock(kdw_spin_lock); + + while (wait_result == THREAD_AWAKENED && kd_entropy_indx < kd_entropy_count) { + + kde_waiter = 1; + + if (abstime) { + /* + * wait for the specified timeout or + * until we've hit our sample limit + */ + wait_result = lck_spin_sleep_deadline(kdw_spin_lock, 0, &kde_waiter, THREAD_ABORTSAFE, abstime); + } else { + /* + * wait until we've hit our sample limit + */ + wait_result = lck_spin_sleep(kdw_spin_lock, 0, &kde_waiter, THREAD_ABORTSAFE); + } + kde_waiter = 0; + } + lck_spin_unlock(kdw_spin_lock); + ml_set_interrupts_enabled(s); /* * Disable entropy sampling */ - kdebug_enable &= ~KDEBUG_ENABLE_ENTROPY; - kdebug_slowcheck &= ~SLOW_ENTROPY; + kdbg_set_flags(SLOW_ENTROPY, KDEBUG_ENABLE_ENTROPY, FALSE); + + KERNEL_DEBUG_CONSTANT(0xbbbbf000 | DBG_FUNC_END, ms_timeout, kd_entropy_indx, 0, 0, 0); *number = 0; ret = 0; @@ -1204,10 +1594,10 @@ kdbg_getentropy 
(user_addr_t buffer, size_t *number, int ms_timeout) /* * copyout the buffer */ - if (copyout(kd_entropy_buffer, buffer, kd_entropy_indx * sizeof(mach_timespec_t))) + if (copyout(kd_entropy_buffer, buffer, kd_entropy_indx * sizeof(uint64_t))) ret = EINVAL; else - *number = kd_entropy_indx; + *number = kd_entropy_indx * sizeof(uint64_t); } /* * Always cleanup @@ -1250,14 +1640,16 @@ kdbg_set_nkdbufs(unsigned int value) void kdbg_control_chud(int val, void *fn) { - if (val) { - /* enable chudhook */ + kdbg_lock_init(); + + if (val) { + /* enable chudhook */ kdebug_chudhook = fn; - kdebug_enable |= KDEBUG_ENABLE_CHUD; + kdbg_set_flags(SLOW_CHUD, KDEBUG_ENABLE_CHUD, TRUE); } else { - /* disable chudhook */ - kdebug_enable &= ~KDEBUG_ENABLE_CHUD; + /* disable chudhook */ + kdbg_set_flags(SLOW_CHUD, KDEBUG_ENABLE_CHUD, FALSE); kdebug_chudhook = 0; } } @@ -1272,22 +1664,24 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) kd_regtype kd_Reg; kbufinfo_t kd_bufinfo; pid_t curpid; - struct proc *p, *curproc; + proc_t p, curproc; if (name[0] == KERN_KDGETENTROPY || + name[0] == KERN_KDWRITETR || + name[0] == KERN_KDWRITEMAP || name[0] == KERN_KDEFLAGS || name[0] == KERN_KDDFLAGS || name[0] == KERN_KDENABLE || name[0] == KERN_KDSETBUF) { if ( namelen < 2 ) - return(EINVAL); + return(EINVAL); value = name[1]; } kdbg_lock_init(); - if ( !(kdebug_flags & KDBG_LOCKINIT)) + if ( !(kd_ctrl_page.kdebug_flags & KDBG_LOCKINIT)) return(ENOSPC); lck_mtx_lock(kd_trace_mtx_sysctl); @@ -1308,12 +1702,12 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) kd_bufinfo.nkdbufs = nkdbufs; kd_bufinfo.nkdthreads = kd_mapsize / sizeof(kd_threadmap); - if ( (kdebug_slowcheck & SLOW_NOLOG) ) + if ( (kd_ctrl_page.kdebug_slowcheck & SLOW_NOLOG) ) kd_bufinfo.nolog = 1; else kd_bufinfo.nolog = 0; - kd_bufinfo.flags = kdebug_flags; + kd_bufinfo.flags = kd_ctrl_page.kdebug_flags; #if defined(__LP64__) kd_bufinfo.flags |= KDBG_LP64; #endif @@ -1371,11 
+1765,11 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) switch(name[0]) { case KERN_KDEFLAGS: value &= KDBG_USERFLAGS; - kdebug_flags |= value; + kd_ctrl_page.kdebug_flags |= value; break; case KERN_KDDFLAGS: value &= KDBG_USERFLAGS; - kdebug_flags &= ~value; + kd_ctrl_page.kdebug_flags &= ~value; break; case KERN_KDENABLE: /* @@ -1385,25 +1779,22 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) /* * enable only if buffer is initialized */ - if (!(kdebug_flags & KDBG_BUFINIT)) { + if (!(kd_ctrl_page.kdebug_flags & KDBG_BUFINIT)) { ret = EINVAL; break; } kdbg_mapinit(); - kdebug_enable |= KDEBUG_ENABLE_TRACE; - kdebug_slowcheck &= ~SLOW_NOLOG; - } - else { - kdebug_enable &= ~KDEBUG_ENABLE_TRACE; - kdebug_slowcheck |= SLOW_NOLOG; + kdbg_set_tracing_enabled(TRUE); } + else + kdbg_set_tracing_enabled(FALSE); break; case KERN_KDSETBUF: kdbg_set_nkdbufs(value); break; case KERN_KDSETUP: - ret = kdbg_reinit(); + ret = kdbg_reinit(FALSE); break; case KERN_KDREMOVE: kdbg_clear(); @@ -1432,6 +1823,86 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) case KERN_KDREADTR: ret = kdbg_read(where, sizep, NULL, NULL); break; + case KERN_KDWRITETR: + case KERN_KDWRITEMAP: + { + struct vfs_context context; + struct fileproc *fp; + size_t number; + vnode_t vp; + int fd; + + if (name[0] == KERN_KDWRITETR) { + int s; + int wait_result = THREAD_AWAKENED; + u_int64_t abstime; + u_int64_t ns; + + if (*sizep) { + ns = ((u_int64_t)*sizep) * (u_int64_t)(1000 * 1000); + nanoseconds_to_absolutetime(ns, &abstime ); + clock_absolutetime_interval_to_deadline( abstime, &abstime ); + } else + abstime = 0; + + s = ml_set_interrupts_enabled(FALSE); + lck_spin_lock(kdw_spin_lock); + + while (wait_result == THREAD_AWAKENED && kd_ctrl_page.kds_inuse_count < n_storage_threshold) { + + kds_waiter = 1; + + if (abstime) + wait_result = lck_spin_sleep_deadline(kdw_spin_lock, 0, &kds_waiter, THREAD_ABORTSAFE, abstime); + else + 
wait_result = lck_spin_sleep(kdw_spin_lock, 0, &kds_waiter, THREAD_ABORTSAFE); + + kds_waiter = 0; + } + lck_spin_unlock(kdw_spin_lock); + ml_set_interrupts_enabled(s); + } + p = current_proc(); + fd = value; + + proc_fdlock(p); + if ( (ret = fp_lookup(p, fd, &fp, 1)) ) { + proc_fdunlock(p); + break; + } + context.vc_thread = current_thread(); + context.vc_ucred = fp->f_fglob->fg_cred; + + if (fp->f_fglob->fg_type != DTYPE_VNODE) { + fp_drop(p, fd, fp, 1); + proc_fdunlock(p); + + ret = EBADF; + break; + } + vp = (struct vnode *)fp->f_fglob->fg_data; + proc_fdunlock(p); + + if ((ret = vnode_getwithref(vp)) == 0) { + + if (name[0] == KERN_KDWRITETR) { + number = nkdbufs * sizeof(kd_buf); + + KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_INFO, 3)) | DBG_FUNC_START, 0, 0, 0, 0, 0); + ret = kdbg_read(0, &number, vp, &context); + KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_INFO, 3)) | DBG_FUNC_END, number, 0, 0, 0, 0); + + *sizep = number; + } else { + number = kd_mapsize; + kdbg_readmap(0, &number, vp, &context); + } + vnode_put(vp); + } + fp_drop(p, fd, fp, 0); + + break; + } case KERN_KDPIDTR: if (size < sizeof(kd_regtype)) { ret = EINVAL; @@ -1489,25 +1960,32 @@ int kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx) { unsigned int count; - unsigned int cpu, mincpu; + unsigned int cpu, min_cpu; uint64_t mintime, t; - int error = 0,s = 0; + int error = 0; kd_buf *tempbuf; - kd_buf *rcursor; - kd_buf *min_rcursor; - struct kd_storage *kdsp; + uint32_t rcursor; + kd_buf lostevent; + union kds_ptr kdsp; + struct kd_storage *kdsp_actual; struct kd_bufinfo *kdbp; + struct kd_bufinfo *min_kdbp; uint32_t tempbuf_count; uint32_t tempbuf_number; uint32_t old_kdebug_flags; uint32_t old_kdebug_slowcheck; + boolean_t lostevents = FALSE; + boolean_t out_of_events = FALSE; count = *number/sizeof(kd_buf); *number = 0; - if (count == 0 || !(kdebug_flags & KDBG_BUFINIT) || kdcopybuf == 0) + if (count == 0 || !(kd_ctrl_page.kdebug_flags & KDBG_BUFINIT) || 
kdcopybuf == 0) return EINVAL; + memset(&lostevent, 0, sizeof(lostevent)); + lostevent.debugid = TRACEDBG_CODE(DBG_TRACE_INFO, 2); + /* * because we hold kd_trace_mtx_sysctl, no other control threads can * be playing with kdebug_flags... the code that cuts new events could @@ -1515,17 +1993,8 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx) * storage chunk which is where it examines kdebug_flags... it its adding * to the same chunk we're reading from, no problem... */ - s = ml_set_interrupts_enabled(FALSE); - lck_spin_lock(kds_spin_lock); - old_kdebug_slowcheck = kdebug_slowcheck; - old_kdebug_flags = kdebug_flags; - - kdebug_flags &= ~KDBG_WRAPPED; - kdebug_flags |= KDBG_NOWRAP; - - lck_spin_unlock(kds_spin_lock); - ml_set_interrupts_enabled(s); + disable_wrap(&old_kdebug_slowcheck, &old_kdebug_flags); if (count > nkdbufs) count = nkdbufs; @@ -1538,66 +2007,86 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx) tempbuf_number = 0; while (tempbuf_count) { - mintime = 0xffffffffffffffffULL; /* all actual timestamps are below */ - mincpu = -1; - min_rcursor = NULL; + mintime = 0xffffffffffffffffULL; + min_kdbp = NULL; + min_cpu = 0; for (cpu = 0, kdbp = &kdbip[0]; cpu < kd_cpus; cpu++, kdbp++) { - if ((kdsp = kdbp->kd_list_head) == NULL) + if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) continue; - rcursor = kdsp->kds_readlast; + kdsp_actual = POINTER_FROM_KDS_PTR(kdsp); + + rcursor = kdsp_actual->kds_readlast; - if (rcursor == kdsp->kds_bufptr) + if (rcursor == kdsp_actual->kds_bufindx) continue; - t = kdbg_get_timestamp(rcursor); + t = kdbg_get_timestamp(&kdsp_actual->kds_records[rcursor]); + + if (t < kdsp_actual->kds_timestamp) { + /* + * indicates we've not yet completed filling + * in this event... 
+ * this should only occur when we're looking + * at the buf that the record head is utilizing + * we'll pick these events up on the next + * call to kdbg_read + * we bail at this point so that we don't + * get an out-of-order timestream by continuing + * to read events from the other CPUs' timestream(s) + */ + out_of_events = TRUE; + break; + } if (t < mintime) { - mincpu = cpu; mintime = t; - min_rcursor = rcursor; + min_kdbp = kdbp; + min_cpu = cpu; } } - if (mincpu == (unsigned int)-1) - /* + if (min_kdbp == NULL || out_of_events == TRUE) { + /* * all buffers ran empty */ - break; - - kdbp = &kdbip[mincpu]; - kdsp = kdbp->kd_list_head; + out_of_events = TRUE; + break; + } + kdsp = min_kdbp->kd_list_head; + kdsp_actual = POINTER_FROM_KDS_PTR(kdsp); - *tempbuf = *min_rcursor; + if (kdsp_actual->kds_lostevents == TRUE) { + lostevent.timestamp = kdsp_actual->kds_records[kdsp_actual->kds_readlast].timestamp; + *tempbuf = lostevent; + + kdsp_actual->kds_lostevents = FALSE; + lostevents = TRUE; - if (mintime != kdbg_get_timestamp(tempbuf)) { - /* - * we stole this storage unit and used it - * before we could slurp the selected event out - * so we need to re-evaluate - */ - continue; + goto nextevent; } + *tempbuf = kdsp_actual->kds_records[kdsp_actual->kds_readlast++]; + + if (kdsp_actual->kds_readlast == EVENTS_PER_STORAGE_UNIT) + release_storage_unit(min_cpu, kdsp.raw); + /* * Watch for out of order timestamps */ - if (mintime < kdbp->kd_prev_timebase) { + if (mintime < min_kdbp->kd_prev_timebase) { /* * if so, use the previous timestamp + 1 cycle */ - kdbp->kd_prev_timebase++; - kdbg_set_timestamp_and_cpu(tempbuf, kdbp->kd_prev_timebase, mincpu); + min_kdbp->kd_prev_timebase++; + kdbg_set_timestamp_and_cpu(tempbuf, min_kdbp->kd_prev_timebase, kdbg_get_cpu(tempbuf)); } else - kdbp->kd_prev_timebase = mintime; - - if (min_rcursor == kdsp->kds_readlast) - kdsp->kds_readlast++; - - if (kdsp->kds_readlast == kdsp->kds_buflast) - release_storage_unit(kdbp, kdsp); - + 
min_kdbp->kd_prev_timebase = mintime; +nextevent: tempbuf_count--; tempbuf_number++; tempbuf++; + + if ((RAW_file_written += sizeof(kd_buf)) >= RAW_FLUSH_SIZE) + break; } if (tempbuf_number) { @@ -1606,6 +2095,12 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx) UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx)); RAW_file_offset += (tempbuf_number * sizeof(kd_buf)); + + if (RAW_file_written >= RAW_FLUSH_SIZE) { + cluster_push(vp, 0); + + RAW_file_written = 0; + } } else { error = copyout(kdcopybuf, buffer, tempbuf_number * sizeof(kd_buf)); buffer += (tempbuf_number * sizeof(kd_buf)); @@ -1618,7 +2113,7 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx) count -= tempbuf_number; *number += tempbuf_number; } - if (tempbuf_count) + if (out_of_events == TRUE) /* * all trace buffers are empty */ @@ -1628,17 +2123,7 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx) tempbuf_count = KDCOPYBUF_COUNT; } if ( !(old_kdebug_flags & KDBG_NOWRAP)) { - - s = ml_set_interrupts_enabled(FALSE); - lck_spin_lock(kds_spin_lock); - - kdebug_flags &= ~KDBG_NOWRAP; - - if ( !(old_kdebug_slowcheck & SLOW_NOLOG)) - kdebug_slowcheck &= ~SLOW_NOLOG; - - lck_spin_unlock(kds_spin_lock); - ml_set_interrupts_enabled(s); + enable_wrap(old_kdebug_slowcheck, lostevents); } return (error); } @@ -1656,9 +2141,6 @@ unsigned char *getProcName(struct proc *proc) { #if defined(__i386__) || defined (__x86_64__) #define TRAP_DEBUGGER __asm__ volatile("int3"); #endif -#ifdef __ppc__ -#define TRAP_DEBUGGER __asm__ volatile("tw 4,r3,r3"); -#endif #define SANE_TRACEBUF_SIZE (8 * 1024 * 1024) @@ -1701,7 +2183,6 @@ int stack_snapshot(struct proc *p, register struct stack_snapshot_args *uap, int32_t *retval) { int error = 0; - if ((error = suser(kauth_cred_get(), &p->p_acflag))) return(error); @@ -1779,14 +2260,13 @@ stack_snapshot2(pid_t pid, user_addr_t tracebuf, uint32_t 
tracebuf_size, uint32_ void start_kern_tracing(unsigned int new_nkdbufs) { + if (!new_nkdbufs) return; kdbg_set_nkdbufs(new_nkdbufs); kdbg_lock_init(); - kdbg_reinit(); - kdebug_enable |= KDEBUG_ENABLE_TRACE; - kdebug_slowcheck &= ~SLOW_NOLOG; - kdbg_mapinit(); + kdbg_reinit(TRUE); + kdbg_set_tracing_enabled(TRUE); #if defined(__i386__) || defined(__x86_64__) uint64_t now = mach_absolute_time(); @@ -1808,7 +2288,7 @@ kdbg_dump_trace_to_file(const char *filename) size_t number; - if (kdebug_enable & (KDEBUG_ENABLE_CHUD | KDEBUG_ENABLE_ENTROPY)) + if ( !(kdebug_enable & KDEBUG_ENABLE_TRACE)) return; if (global_state_pid != -1) { @@ -1824,6 +2304,7 @@ kdbg_dump_trace_to_file(const char *filename) KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_INFO, 0)) | DBG_FUNC_NONE, 0, 0, 0, 0, 0); kdebug_enable = 0; + kd_ctrl_page.enabled = 0; ctx = vfs_context_kernel(); @@ -1840,3 +2321,44 @@ kdbg_dump_trace_to_file(const char *filename) sync(current_proc(), (void *)NULL, (int *)NULL); } + +/* Helper function for filling in the BSD name for an address space + * Defined here because the machine bindings know only Mach threads + * and nothing about BSD processes. + * + * FIXME: need to grab a lock during this? + */ +void kdbg_get_task_name(char* name_buf, int len, task_t task) +{ + proc_t proc; + + /* Note: we can't use thread->task (and functions that rely on it) here + * because it hasn't been initialized yet when this function is called. + * We use the explicitly-passed task parameter instead. 
+ */ + proc = get_bsdtask_info(task); + if (proc != PROC_NULL) + snprintf(name_buf, len, "%s/%d", proc->p_comm, proc->p_pid); + else + snprintf(name_buf, len, "%p [!bsd]", task); +} + + + +#if defined(NATIVE_TRACE_FACILITY) +void trace_handler_map_ctrl_page(__unused uintptr_t addr, __unused size_t ctrl_page_size, __unused size_t storage_size, __unused size_t kds_ptr_size) +{ +} +void trace_handler_map_bufinfo(__unused uintptr_t addr, __unused size_t size) +{ +} +void trace_handler_unmap_bufinfo(void) +{ +} +void trace_handler_map_buffer(__unused int index, __unused uintptr_t addr, __unused size_t size) +{ +} +void trace_handler_unmap_buffer(__unused int index) +{ +} +#endif diff --git a/bsd/kern/kern_acct.c b/bsd/kern/kern_acct.c index 747f09221..516de08dc 100644 --- a/bsd/kern/kern_acct.c +++ b/bsd/kern/kern_acct.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -161,7 +161,7 @@ acct(proc_t p, struct acct_args *uap, __unused int *retval) * writing and make sure it's a 'normal'. 
*/ if (uap->path != USER_ADDR_NULL) { - NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->path, ctx); + NDINIT(&nd, LOOKUP, OP_OPEN, NOFOLLOW, UIO_USERSPACE, uap->path, ctx); if ((error = vn_open(&nd, FWRITE, 0))) return (error); #if CONFIG_MACF @@ -271,8 +271,8 @@ acct_process(proc_t p) /* (6) The UID and GID of the process */ safecred = kauth_cred_proc_ref(p); - an_acct.ac_uid = safecred->cr_ruid; - an_acct.ac_gid = safecred->cr_rgid; + an_acct.ac_uid = kauth_cred_getruid(safecred); + an_acct.ac_gid = kauth_cred_getrgid(safecred); /* (7) The terminal from which the process was started */ diff --git a/bsd/kern/kern_aio.c b/bsd/kern/kern_aio.c index a2983db0a..89a2ba012 100644 --- a/bsd/kern/kern_aio.c +++ b/bsd/kern/kern_aio.c @@ -1394,7 +1394,7 @@ aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked) /* And work queue */ aio_workq_lock_spin(queue); aio_workq_add_entry_locked(queue, entryp); - wait_queue_wakeup_one(queue->aioq_waitq, queue, THREAD_AWAKENED); + wait_queue_wakeup_one(queue->aioq_waitq, queue, THREAD_AWAKENED, -1); aio_workq_unlock(queue); if (proc_locked == 0) { diff --git a/bsd/kern/kern_authorization.c b/bsd/kern/kern_authorization.c index 263fc28b4..f30df6d2d 100644 --- a/bsd/kern/kern_authorization.c +++ b/bsd/kern/kern_authorization.c @@ -185,10 +185,9 @@ kauth_alloc_scope(const char *identifier, kauth_scope_callback_t callback, void /* * Allocate and populate the scope structure. 
*/ - MALLOC(sp, kauth_scope_t, sizeof(*sp), M_KAUTH, M_WAITOK); + MALLOC(sp, kauth_scope_t, sizeof(*sp), M_KAUTH, M_WAITOK | M_ZERO); if (sp == NULL) return(NULL); - bzero(&sp->ks_listeners, sizeof(sp->ks_listeners)); sp->ks_flags = 0; sp->ks_identifier = identifier; sp->ks_idata = idata; @@ -613,7 +612,7 @@ kauth_authorize_generic_callback(kauth_cred_t credential, __unused void *idata, int kauth_acl_evaluate(kauth_cred_t cred, kauth_acl_eval_t eval) { - int applies, error, i; + int applies, error, i, gotguid; kauth_ace_t ace; guid_t guid; uint32_t rights; @@ -632,9 +631,11 @@ kauth_acl_evaluate(kauth_cred_t cred, kauth_acl_eval_t eval) * Get our guid for comparison purposes. */ if ((error = kauth_cred_getguid(cred, &guid)) != 0) { - eval->ae_result = KAUTH_RESULT_DENY; - KAUTH_DEBUG(" ACL - can't get credential GUID (%d), ACL denied", error); - return(error); + KAUTH_DEBUG(" ACL - can't get credential GUID (%d)", error); + error = 0; + gotguid = 0; + } else { + gotguid = 1; } KAUTH_DEBUG(" ACL - %d entries, initial residual %x", eval->ae_count, eval->ae_residual); @@ -678,7 +679,7 @@ kauth_acl_evaluate(kauth_cred_t cred, kauth_acl_eval_t eval) /* we don't recognise this ACE, skip it */ continue; } - + /* * Verify whether this entry applies to the credential. 
*/ @@ -688,7 +689,10 @@ kauth_acl_evaluate(kauth_cred_t cred, kauth_acl_eval_t eval) applies = eval->ae_options & KAUTH_AEVAL_IS_OWNER; break; case KAUTH_WKG_GROUP: - applies = eval->ae_options & KAUTH_AEVAL_IN_GROUP; + if (!gotguid || (eval->ae_options & KAUTH_AEVAL_IN_GROUP_UNKNOWN)) + applies = ((ace->ace_flags & KAUTH_ACE_KINDMASK) == KAUTH_ACE_DENY); + else + applies = eval->ae_options & KAUTH_AEVAL_IN_GROUP; break; /* we short-circuit these here rather than wasting time calling the group membership code */ case KAUTH_WKG_EVERYBODY: @@ -700,12 +704,12 @@ kauth_acl_evaluate(kauth_cred_t cred, kauth_acl_eval_t eval) default: /* check to see whether it's exactly us, or a group we are a member of */ - applies = kauth_guid_equal(&guid, &ace->ace_applicable); + applies = !gotguid ? 0 : kauth_guid_equal(&guid, &ace->ace_applicable); KAUTH_DEBUG(" ACL - ACE applicable " K_UUID_FMT " caller " K_UUID_FMT " %smatched", K_UUID_ARG(ace->ace_applicable), K_UUID_ARG(guid), applies ? "" : "not "); if (!applies) { - error = kauth_cred_ismember_guid(cred, &ace->ace_applicable, &applies); + error = !gotguid ? ENOENT : kauth_cred_ismember_guid(cred, &ace->ace_applicable, &applies); /* * If we can't resolve group membership, we have to limit misbehaviour. * If the ACE is an 'allow' ACE, assume the cred is not a member (avoid @@ -791,15 +795,37 @@ kauth_acl_inherit(vnode_t dvp, kauth_acl_t initial, kauth_acl_t *product, int is * XXX TODO: wants a "umask ACL" from the process. */ inherit = NULL; - if ((dvp != NULL) && !vfs_authopaque(vnode_mount(dvp))) { + /* + * If there is no initial ACL, or there is, and the initial ACL's + * flags do not request "no inheritance", then we inherit. This allows + * initial object creation via open_extended() and mkdir_extended() + * to reject inheritance for themselves and for inferior nodes by + * specifying a non-NULL initial ACL which has the KAUTH_ACL_NO_INHERIT + * flag set in the flags field. 
+ */ + if ((initial == NULL || !(initial->acl_flags & KAUTH_ACL_NO_INHERIT)) && + (dvp != NULL) && !vfs_authopaque(vnode_mount(dvp))) { VATTR_INIT(&dva); VATTR_WANTED(&dva, va_acl); if ((error = vnode_getattr(dvp, &dva, ctx)) != 0) { KAUTH_DEBUG(" ERROR - could not get parent directory ACL for inheritance"); return(error); } - if (VATTR_IS_SUPPORTED(&dva, va_acl)) + if (VATTR_IS_SUPPORTED(&dva, va_acl)) { inherit = dva.va_acl; + /* + * If there is an ACL on the parent directory, then + * there are potentially inheritable ACE entries, but + * if the flags on the directory ACL say not to + * inherit, then we don't inherit. This allows for + * per directory rerooting of the inheritable ACL + * hierarchy. + */ + if (inherit != NULL && inherit->acl_flags & KAUTH_ACL_NO_INHERIT) { + kauth_acl_free(inherit); + inherit = NULL; + } + } } /* @@ -852,14 +878,17 @@ kauth_acl_inherit(vnode_t dvp, kauth_acl_t initial, kauth_acl_t *product, int is /* * Composition is simply: - * - initial - * - inherited + * - initial direct ACEs + * - inherited ACEs from new parent */ index = 0; if (initial != NULL) { - for (i = 0; i < initial->acl_entrycount; i++) - result->acl_ace[index++] = initial->acl_ace[i]; - KAUTH_DEBUG(" INHERIT - applied %d initial entries", index); + for (i = 0; i < initial->acl_entrycount; i++) { + if (!(initial->acl_ace[i].ace_flags & KAUTH_ACE_INHERITED)) { + result->acl_ace[index++] = initial->acl_ace[i]; + } + } + KAUTH_DEBUG(" INHERIT - applied %d of %d initial entries", index, initial->acl_entrycount); } if (inherit != NULL) { for (i = 0; i < inherit->acl_entrycount; i++) { diff --git a/bsd/kern/kern_clock.c b/bsd/kern/kern_clock.c index 0cbd41e1b..1aae2df47 100644 --- a/bsd/kern/kern_clock.c +++ b/bsd/kern/kern_clock.c @@ -241,7 +241,7 @@ sysctl_clockrate } SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, - CTLTYPE_STRUCT | CTLFLAG_RD, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_clockrate, "S,clockinfo", ""); diff --git a/bsd/kern/kern_control.c 
b/bsd/kern/kern_control.c index a76088eb4..92b15bc40 100644 --- a/bsd/kern/kern_control.c +++ b/bsd/kern/kern_control.c @@ -91,6 +91,7 @@ static int ctl_peeraddr(struct socket *so, struct sockaddr **nam); static struct kctl *ctl_find_by_name(const char *); static struct kctl *ctl_find_by_id_unit(u_int32_t id, u_int32_t unit); +static struct socket *kcb_find_socket(struct kctl *, u_int32_t unit); static struct ctl_cb *kcb_find(struct kctl *, u_int32_t unit); static void ctl_post_msg(u_int32_t event_code, u_int32_t id); @@ -255,7 +256,7 @@ ctl_sofreelastref(struct socket *so) if ((kctl = kcb->kctl) != 0) { lck_mtx_lock(ctl_mtx); TAILQ_REMOVE(&kctl->kcb_head, kcb, next); - lck_mtx_lock(ctl_mtx); + lck_mtx_unlock(ctl_mtx); } kcb_delete(kcb); } @@ -364,10 +365,16 @@ ctl_connect(struct socket *so, struct sockaddr *nam, __unused struct proc *p) error = (*kctl->connect)(kctl, &sa, &kcb->userdata); socket_lock(so, 0); if (error) - goto done; + goto end; soisconnected(so); +end: + if (error && kctl->disconnect) { + socket_unlock(so, 0); + (*kctl->disconnect)(kctl, kcb->unit, kcb->userdata); + socket_lock(so, 0); + } done: if (error) { soisdisconnected(so); @@ -393,12 +400,19 @@ ctl_disconnect(struct socket *so) (*kctl->disconnect)(kctl, kcb->unit, kcb->userdata); socket_lock(so, 0); } + + soisdisconnected(so); + + socket_unlock(so, 0); lck_mtx_lock(ctl_mtx); kcb->kctl = 0; kcb->unit = 0; + while (kcb->usecount != 0) { + msleep(&kcb->usecount, ctl_mtx, 0, "kcb->usecount", 0); + } TAILQ_REMOVE(&kctl->kcb_head, kcb, next); - soisdisconnected(so); lck_mtx_unlock(ctl_mtx); + socket_lock(so, 0); } return 0; } @@ -430,23 +444,29 @@ ctl_peeraddr(struct socket *so, struct sockaddr **nam) static int ctl_send(struct socket *so, int flags, struct mbuf *m, - __unused struct sockaddr *addr, __unused struct mbuf *control, + __unused struct sockaddr *addr, struct mbuf *control, __unused struct proc *p) { int error = 0; struct ctl_cb *kcb = (struct ctl_cb *)so->so_pcb; struct kctl *kctl; + 
if (control) m_freem(control); + if (kcb == NULL) /* sanity check */ - return(ENOTCONN); + error = ENOTCONN; - if ((kctl = kcb->kctl) == NULL) - return(EINVAL); + if (error == 0 && (kctl = kcb->kctl) == NULL) + error = EINVAL; - if (kctl->send) { + if (error == 0 && kctl->send) { socket_unlock(so, 0); error = (*kctl->send)(kctl, kcb->unit, kcb->userdata, m, flags); socket_lock(so, 0); + } else { + m_freem(m); + if (error == 0) + error = ENOTSUP; } return error; } @@ -454,23 +474,18 @@ ctl_send(struct socket *so, int flags, struct mbuf *m, errno_t ctl_enqueuembuf(void *kctlref, u_int32_t unit, struct mbuf *m, u_int32_t flags) { - struct ctl_cb *kcb; struct socket *so; errno_t error = 0; struct kctl *kctl = (struct kctl *)kctlref; if (kctl == NULL) return EINVAL; - - kcb = kcb_find(kctl, unit); - if (kcb == NULL) - return EINVAL; - so = (struct socket *)kcb->so; - if (so == NULL) + so = kcb_find_socket(kctl, unit); + + if (so == NULL) return EINVAL; - socket_lock(so, 1); if (sbspace(&so->so_rcv) < m->m_pkthdr.len) { error = ENOBUFS; goto bye; @@ -487,7 +502,6 @@ ctl_enqueuembuf(void *kctlref, u_int32_t unit, struct mbuf *m, u_int32_t flags) errno_t ctl_enqueuedata(void *kctlref, u_int32_t unit, void *data, size_t len, u_int32_t flags) { - struct ctl_cb *kcb; struct socket *so; struct mbuf *m; errno_t error = 0; @@ -499,15 +513,10 @@ ctl_enqueuedata(void *kctlref, u_int32_t unit, void *data, size_t len, u_int32_t if (kctlref == NULL) return EINVAL; - kcb = kcb_find(kctl, unit); - if (kcb == NULL) + so = kcb_find_socket(kctl, unit); + if (so == NULL) return EINVAL; - so = (struct socket *)kcb->so; - if (so == NULL) - return EINVAL; - - socket_lock(so, 1); if (sbspace(&so->so_rcv) < (int)len) { error = ENOBUFS; goto bye; @@ -545,27 +554,21 @@ ctl_enqueuedata(void *kctlref, u_int32_t unit, void *data, size_t len, u_int32_t errno_t ctl_getenqueuespace(kern_ctl_ref kctlref, u_int32_t unit, size_t *space) { - struct ctl_cb *kcb; struct kctl *kctl = (struct kctl *)kctlref; 
struct socket *so; long avail; if (kctlref == NULL || space == NULL) return EINVAL; - - kcb = kcb_find(kctl, unit); - if (kcb == NULL) - return EINVAL; - so = (struct socket *)kcb->so; - if (so == NULL) + so = kcb_find_socket(kctl, unit); + if (so == NULL) return EINVAL; - socket_lock(so, 1); avail = sbspace(&so->so_rcv); *space = (avail < 0) ? 0 : avail; socket_unlock(so, 1); - + return 0; } @@ -624,6 +627,9 @@ ctl_ctloutput(struct socket *so, struct sockopt *sopt) socket_unlock(so, 0); error = (*kctl->getopt)(kcb->kctl, kcb->unit, kcb->userdata, sopt->sopt_name, data, &len); + if (data != NULL && len > sopt->sopt_valsize) + panic_plain("ctl_ctloutput: ctl %s returned len (%lu) > sopt_valsize (%lu)\n", + kcb->kctl->name, len, sopt->sopt_valsize); socket_lock(so, 0); if (error == 0) { if (data != NULL) @@ -858,6 +864,46 @@ ctl_find_by_name(const char *name) return NULL; } +u_int32_t +ctl_id_by_name(const char *name) +{ + u_int32_t ctl_id = 0; + + lck_mtx_lock(ctl_mtx); + struct kctl *kctl = ctl_find_by_name(name); + if (kctl) ctl_id = kctl->id; + lck_mtx_unlock(ctl_mtx); + + return ctl_id; +} + +errno_t +ctl_name_by_id( + u_int32_t id, + char *out_name, + size_t maxsize) +{ + int found = 0; + + lck_mtx_lock(ctl_mtx); + struct kctl *kctl; + TAILQ_FOREACH(kctl, &ctl_head, next) { + if (kctl->id == id) + break; + } + + if (kctl && kctl->name) + { + if (maxsize > MAX_KCTL_NAME) + maxsize = MAX_KCTL_NAME; + strlcpy(out_name, kctl->name, maxsize); + found = 1; + } + lck_mtx_unlock(ctl_mtx); + + return found ? 
0 : ENOENT; +} + /* * Must be called with global ctl_mtx lock taked * @@ -885,21 +931,58 @@ kcb_find(struct kctl *kctl, u_int32_t unit) struct ctl_cb *kcb; TAILQ_FOREACH(kcb, &kctl->kcb_head, next) - if ((kcb->unit == unit)) + if (kcb->unit == unit) return kcb; return NULL; } -/* - * Must be called witout lock - */ +static struct socket * +kcb_find_socket(struct kctl *kctl, u_int32_t unit) +{ + struct socket *so = NULL; + + lck_mtx_lock(ctl_mtx); + struct ctl_cb *kcb = kcb_find(kctl, unit); + if (kcb && kcb->kctl == kctl) { + so = kcb->so; + if (so) { + kcb->usecount++; + } + } + lck_mtx_unlock(ctl_mtx); + + if (so == NULL) { + return NULL; + } + + socket_lock(so, 1); + + lck_mtx_lock(ctl_mtx); + if (kcb->kctl == NULL) + { + lck_mtx_unlock(ctl_mtx); + socket_unlock(so, 1); + so = NULL; + lck_mtx_lock(ctl_mtx); + } + kcb->usecount--; + if (kcb->usecount == 0) + wakeup((event_t)&kcb->usecount); + lck_mtx_unlock(ctl_mtx); + + return so; +} + static void ctl_post_msg(u_int32_t event_code, u_int32_t id) { struct ctl_event_data ctl_ev_data; struct kev_msg ev_msg; + lck_mtx_assert(ctl_mtx, LCK_MTX_ASSERT_NOTOWNED); + + bzero(&ev_msg, sizeof(struct kev_msg)); ev_msg.vendor_code = KEV_VENDOR_APPLE; ev_msg.kev_class = KEV_SYSTEM_CLASS; diff --git a/bsd/kern/kern_core.c b/bsd/kern/kern_core.c index 52c0a3095..cf63621d9 100644 --- a/bsd/kern/kern_core.c +++ b/bsd/kern/kern_core.c @@ -70,24 +70,7 @@ typedef struct { mach_msg_type_number_t count; /* count of ints in this flavor */ } mythread_state_flavor_t; -#if defined (__ppc__) -/* 64 bit */ -mythread_state_flavor_t thread_flavor_array64[]={ - {PPC_THREAD_STATE64 , PPC_THREAD_STATE64_COUNT}, - {PPC_FLOAT_STATE, PPC_FLOAT_STATE_COUNT}, - {PPC_EXCEPTION_STATE64, PPC_EXCEPTION_STATE64_COUNT}, - {PPC_VECTOR_STATE, PPC_VECTOR_STATE_COUNT} - }; - -/* 32 bit */ -mythread_state_flavor_t thread_flavor_array[]={ - {PPC_THREAD_STATE , PPC_THREAD_STATE_COUNT}, - {PPC_FLOAT_STATE, PPC_FLOAT_STATE_COUNT}, - {PPC_EXCEPTION_STATE, 
PPC_EXCEPTION_STATE_COUNT}, - {PPC_VECTOR_STATE, PPC_VECTOR_STATE_COUNT} - }; - -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) mythread_state_flavor_t thread_flavor_array [] = { {x86_THREAD_STATE, x86_THREAD_STATE_COUNT}, {x86_FLOAT_STATE, x86_FLOAT_STATE_COUNT}, @@ -139,9 +122,6 @@ process_cpu_type(proc_t core_proc) } else { what_we_think = CPU_TYPE_I386; } -#elif defined (__ppc__) - #pragma unused(core_proc) - what_we_think = CPU_TYPE_POWERPC; #endif return what_we_think; } @@ -156,9 +136,6 @@ process_cpu_subtype(proc_t core_proc) } else { what_we_think = CPU_SUBTYPE_I386_ALL; } -#elif defined (__ppc__) - #pragma unused(core_proc) - what_we_think = CPU_SUBTYPE_POWERPC_ALL; #endif return what_we_think; } @@ -261,8 +238,8 @@ coredump(proc_t core_proc) if (do_coredump == 0 || /* Not dumping at all */ ( (sugid_coredump == 0) && /* Not dumping SUID/SGID binaries */ - ( (cred->cr_svuid != cred->cr_ruid) || - (cred->cr_svgid != cred->cr_rgid)))) { + ( (kauth_cred_getsvuid(cred) != kauth_cred_getruid(cred)) || + (kauth_cred_getsvgid(cred) != kauth_cred_getrgid(cred))))) { #if CONFIG_AUDIT audit_proc_coredump(core_proc, NULL, EFAULT); @@ -320,17 +297,8 @@ coredump(proc_t core_proc) thread_count = get_task_numacts(task); segment_count = get_vmmap_entries(map); /* XXX */ -#if defined (__ppc__) - if (is_64) { - tir1.flavor_count = sizeof(thread_flavor_array64)/sizeof(mythread_state_flavor_t); - bcopy(thread_flavor_array64, flavors,sizeof(thread_flavor_array64)); - } else { -#endif /* __ppc __ */ - tir1.flavor_count = sizeof(thread_flavor_array)/sizeof(mythread_state_flavor_t); - bcopy(thread_flavor_array, flavors,sizeof(thread_flavor_array)); -#if defined (__ppc__) - } -#endif /* __ppc __ */ + tir1.flavor_count = sizeof(thread_flavor_array)/sizeof(mythread_state_flavor_t); + bcopy(thread_flavor_array, flavors,sizeof(thread_flavor_array)); tstate_size = 0; for (i = 0; i < tir1.flavor_count; i++) tstate_size += 
sizeof(mythread_state_flavor_t) + diff --git a/bsd/kern/kern_credential.c b/bsd/kern/kern_credential.c index 4ab7311ae..484c86fff 100644 --- a/bsd/kern/kern_credential.c +++ b/bsd/kern/kern_credential.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2007 Apple Inc. All rights reserved. + * Copyright (c) 2004-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -52,6 +52,7 @@ #include #include +#include /* For manifest constants in posix_cred_access */ #include #include #include @@ -150,6 +151,7 @@ static int kauth_resolver_timeout = 30; /* default: 30 seconds */ struct kauth_resolver_work { TAILQ_ENTRY(kauth_resolver_work) kr_link; struct kauth_identity_extlookup kr_work; + uint64_t kr_extend; uint32_t kr_seqno; uint64_t kr_subtime; /* submission time */ int kr_refs; @@ -164,7 +166,7 @@ TAILQ_HEAD(kauth_resolver_unsubmitted_head, kauth_resolver_work) kauth_resolver_ TAILQ_HEAD(kauth_resolver_submitted_head, kauth_resolver_work) kauth_resolver_submitted; TAILQ_HEAD(kauth_resolver_done_head, kauth_resolver_work) kauth_resolver_done; -static int kauth_resolver_submit(struct kauth_identity_extlookup *lkp); +static int kauth_resolver_submit(struct kauth_identity_extlookup *lkp, uint64_t extend_data); static int kauth_resolver_complete(user_addr_t message); static int kauth_resolver_getwork(user_addr_t message); static int kauth_resolver_getwork2(user_addr_t message); @@ -246,21 +248,37 @@ kauth_resolver_init(void) * * Parameters: lkp A pointer to an external * lookup request + * extend_data extended data for kr_extend * * Returns: 0 Success * EWOULDBLOCK No resolver registered * EINTR Operation interrupted (e.g. by * a signal) * ENOMEM Could not allocate work item + * copyinstr:EFAULT Bad message from user space * workp->kr_result:??? An error from the user space * daemon (includes ENOENT!) 
* + * Implicit returns: + * *lkp Modified + * * Notes: Allocate a work queue entry, submit the work and wait for * the operation to either complete or time out. Outstanding * operations may also be cancelled. + * + * Submission is by means of placing the item on a work queue + * which is serviced by an external resolver thread calling + * into the kernel. The caller then sleeps until timeout, + * cancellation, or an external resolver thread calls in with + * a result message to kauth_resolver_complete(). All of these + * events wake the caller back up. + * + * This code is called from either kauth_cred_ismember_gid() + * for a group membership request, or it is called from + * kauth_cred_cache_lookup() when we get a cache miss. */ static int -kauth_resolver_submit(struct kauth_identity_extlookup *lkp) +kauth_resolver_submit(struct kauth_identity_extlookup *lkp, uint64_t extend_data) { struct kauth_resolver_work *workp, *killp; struct timespec ts; @@ -294,6 +312,7 @@ kauth_resolver_submit(struct kauth_identity_extlookup *lkp) return(ENOMEM); workp->kr_work = *lkp; + workp->kr_extend = extend_data; workp->kr_refs = 1; workp->kr_flags = KAUTH_REQUEST_UNSUBMITTED; workp->kr_result = 0; @@ -307,11 +326,19 @@ kauth_resolver_submit(struct kauth_identity_extlookup *lkp) workp->kr_work.el_result = KAUTH_EXTLOOKUP_INPROG; /* - * XXX As an optimisation, we could check the queue for identical - * XXX items and coalesce them + * XXX We *MUST NOT* attempt to coalesce identical work items due to + * XXX the inability to ensure order of update of the request item + * XXX extended data vs. the wakeup; instead, we let whoever is waiting + * XXX for each item repeat the update when they wake up. */ TAILQ_INSERT_TAIL(&kauth_resolver_unsubmitted, workp, kr_link); + /* + * Wake up an external resolver thread to deal with the new work; one + * may not be available, and if not, then the request will be grabbed + * when a resolver thread comes back into the kernel to request new + * work. 
+ */ wakeup_one((caddr_t)&kauth_resolver_unsubmitted); for (;;) { /* we could compute a better timeout here */ @@ -332,8 +359,9 @@ kauth_resolver_submit(struct kauth_identity_extlookup *lkp) } /* - * Update the moving average of how long it took; if it took longer - * than the time threshold, then we complain about it being slow. + * Update the moving average of how long the request took; if it + * took longer than the time threshold, then we complain about it + * being slow. */ duration = mach_absolute_time() - workp->kr_subtime; if (kco_ma_addsample(&resolver_ma, duration)) { @@ -401,15 +429,19 @@ kauth_resolver_submit(struct kauth_identity_extlookup *lkp) /* someone else still has a reference on this request */ shouldfree = 0; } + /* collect request result */ - if (error == 0) + if (error == 0) { error = workp->kr_result; + } KAUTH_RESOLVER_UNLOCK(); + /* * If we dropped the last reference, free the request. */ - if (shouldfree) + if (shouldfree) { FREE(workp, M_KAUTH); + } KAUTH_DEBUG("RESOLVER - returning %d", error); return(error); @@ -473,7 +505,7 @@ identitysvc(__unused struct proc *p, struct identitysvc_args *uap, __unused int3 * Allow user space resolver to override the * external resolution timeout */ - if (message >= 30 && message <= 10000) { + if (message > 30 && message < 10000) { kauth_resolver_timeout = message; KAUTH_DEBUG("RESOLVER - new resolver changes timeout to %d seconds\n", (int)message); } @@ -625,10 +657,54 @@ kauth_resolver_getwork2(user_addr_t message) */ workp = TAILQ_FIRST(&kauth_resolver_unsubmitted); - if ((error = copyout(&workp->kr_work, message, sizeof(workp->kr_work))) != 0) { + /* + * Copy out the external lookup structure for the request, not + * including the el_extend field, which contains the address of the + * external buffer provided by the external resolver into which we + * copy the extension request information. 
+ */ + /* BEFORE FIELD */ + if ((error = copyout(&workp->kr_work, message, offsetof(struct kauth_identity_extlookup, el_extend))) != 0) { + KAUTH_DEBUG("RESOLVER - error submitting work to resolve"); + goto out; + } + /* AFTER FIELD */ + if ((error = copyout(&workp->kr_work.el_info_reserved_1, + message + offsetof(struct kauth_identity_extlookup, el_info_reserved_1), + sizeof(struct kauth_identity_extlookup) - offsetof(struct kauth_identity_extlookup, el_info_reserved_1))) != 0) { KAUTH_DEBUG("RESOLVER - error submitting work to resolve"); goto out; } + + /* + * Handle extended requests here; if we have a request of a type where + * the kernel wants a translation of extended information, then we need + * to copy it out into the extended buffer, assuming the buffer is + * valid; we only attempt to get the buffer address if we have request + * data to copy into it. + */ + + /* + * translate a user@domain string into a uid/gid/whatever + */ + if (workp->kr_work.el_flags & (KAUTH_EXTLOOKUP_VALID_PWNAM | KAUTH_EXTLOOKUP_VALID_GRNAM)) { + uint64_t uaddr; + + error = copyin(message + offsetof(struct kauth_identity_extlookup, el_extend), &uaddr, sizeof(uaddr)); + if (!error) { + size_t actual; /* not used */ + /* + * Use copyoutstr() to reduce the copy size; we let + * this catch a NULL uaddr because we shouldn't be + * asking in that case anyway. + */ + error = copyoutstr(CAST_DOWN(void *,workp->kr_extend), uaddr, MAXPATHLEN, &actual); + } + if (error) { + KAUTH_DEBUG("RESOLVER - error submitting work to resolve"); + goto out; + } + } TAILQ_REMOVE(&kauth_resolver_unsubmitted, workp, kr_link); workp->kr_flags &= ~KAUTH_REQUEST_UNSUBMITTED; workp->kr_flags |= KAUTH_REQUEST_SUBMITTED; @@ -706,6 +782,10 @@ kauth_resolver_complete(user_addr_t message) struct kauth_resolver_work *killp; int error, result; + /* + * Copy in the message, including the extension field, since we are + * copying into a local variable. 
+ */ if ((error = copyin(message, &extl, sizeof(extl))) != 0) { KAUTH_DEBUG("RESOLVER - error getting completed work\n"); return(error); @@ -771,22 +851,66 @@ kauth_resolver_complete(user_addr_t message) } /* - * In the case of a fatal error, we assume that the resolver will restart - * quickly and re-collect all of the outstanding requests. Thus, we don't - * complete the request which returned the fatal error status. + * In the case of a fatal error, we assume that the resolver will + * restart quickly and re-collect all of the outstanding requests. + * Thus, we don't complete the request which returned the fatal + * error status. */ if (extl.el_result != KAUTH_EXTLOOKUP_FATAL) { /* scan our list for this request */ TAILQ_FOREACH(workp, &kauth_resolver_submitted, kr_link) { /* found it? */ if (workp->kr_seqno == extl.el_seqno) { - /* copy result */ - workp->kr_work = extl; - /* move onto completed list and wake up requester(s) */ + + /* + * Get the request off the submitted queue so + * that it is not cleaned up out from under + * us by a timeout. + */ TAILQ_REMOVE(&kauth_resolver_submitted, workp, kr_link); workp->kr_flags &= ~KAUTH_REQUEST_SUBMITTED; workp->kr_flags |= KAUTH_REQUEST_DONE; workp->kr_result = result; + + /* Copy the result message to the work item. */ + memcpy(&workp->kr_work, &extl, sizeof(struct kauth_identity_extlookup)); + + /* + * Check if we have a result in the extension + * field; if we do, then we need to separately + * copy the data from the message el_extend + * into the request buffer that's in the work + * item. We have to do it here because we do + * not want to wake up the waiter until the + * data is in their buffer, and because the + * actual request response may be destroyed + * by the time the requester wakes up, and they + * do not have access to the user space buffer + * address. 
+ * + * It is safe to drop and reacquire the lock + * here because we've already removed the item + * from the submission queue, but have not yet + * moved it to the completion queue. Note that + * near simultaneous requests may result in + * duplication of requests for items in this + * window. This should not be a performance + * issue and is easily detectable by comparing + * time to live on last response vs. time of + * next request in the resolver logs. + */ + if (extl.el_flags & (KAUTH_EXTLOOKUP_VALID_PWNAM|KAUTH_EXTLOOKUP_VALID_GRNAM)) { + size_t actual; /* notused */ + + KAUTH_RESOLVER_UNLOCK(); + error = copyinstr(extl.el_extend, CAST_DOWN(void *, workp->kr_extend), MAXPATHLEN, &actual); + KAUTH_RESOLVER_LOCK(); + } + + /* + * Move the completed work item to the + * completion queue and wake up requester(s) + */ TAILQ_INSERT_TAIL(&kauth_resolver_done, workp, kr_link); wakeup(workp); break; @@ -814,14 +938,18 @@ struct kauth_identity { #define KI_VALID_GID (1<<1) #define KI_VALID_GUID (1<<2) #define KI_VALID_NTSID (1<<3) +#define KI_VALID_PWNAM (1<<4) /* Used for translation */ +#define KI_VALID_GRNAM (1<<5) /* Used for translation */ uid_t ki_uid; gid_t ki_gid; guid_t ki_guid; ntsid_t ki_ntsid; + const char *ki_name; /* string name from string cache */ /* - * Expiry times are the earliest time at which we will disregard the cached state and go to - * userland. Before then if the valid bit is set, we will return the cached value. If it's - * not set, we will not go to userland to resolve, just assume that there is no answer + * Expiry times are the earliest time at which we will disregard the + * cached state and go to userland. Before then if the valid bit is + * set, we will return the cached value. If it's not set, we will + * not go to userland to resolve, just assume that there is no answer * available. 
*/ time_t ki_guid_expiry; @@ -838,16 +966,17 @@ static lck_mtx_t *kauth_identity_mtx; static struct kauth_identity *kauth_identity_alloc(uid_t uid, gid_t gid, guid_t *guidp, time_t guid_expiry, - ntsid_t *ntsidp, time_t ntsid_expiry); + ntsid_t *ntsidp, time_t ntsid_expiry, const char *name, int nametype); static void kauth_identity_register_and_free(struct kauth_identity *kip); -static void kauth_identity_updatecache(struct kauth_identity_extlookup *elp, struct kauth_identity *kip); +static void kauth_identity_updatecache(struct kauth_identity_extlookup *elp, struct kauth_identity *kip, uint64_t extend_data); static void kauth_identity_lru(struct kauth_identity *kip); static int kauth_identity_guid_expired(struct kauth_identity *kip); static int kauth_identity_ntsid_expired(struct kauth_identity *kip); -static int kauth_identity_find_uid(uid_t uid, struct kauth_identity *kir); -static int kauth_identity_find_gid(gid_t gid, struct kauth_identity *kir); -static int kauth_identity_find_guid(guid_t *guidp, struct kauth_identity *kir); -static int kauth_identity_find_ntsid(ntsid_t *ntsid, struct kauth_identity *kir); +static int kauth_identity_find_uid(uid_t uid, struct kauth_identity *kir, char *getname); +static int kauth_identity_find_gid(gid_t gid, struct kauth_identity *kir, char *getname); +static int kauth_identity_find_guid(guid_t *guidp, struct kauth_identity *kir, char *getname); +static int kauth_identity_find_ntsid(ntsid_t *ntsid, struct kauth_identity *kir, char *getname); +static int kauth_identity_find_nam(char *name, int valid, struct kauth_identity *kir); /* @@ -888,11 +1017,11 @@ kauth_identity_init(void) * structure, filled in * * Notes: It is illegal to translate between UID and GID; any given UUID - * or NTSID can oly refer to an NTSIDE or UUID (respectively), + * or NTSID can only refer to an NTSID or UUID (respectively), * and *either* a UID *or* a GID, but not both. 
*/ static struct kauth_identity * -kauth_identity_alloc(uid_t uid, gid_t gid, guid_t *guidp, time_t guid_expiry, ntsid_t *ntsidp, time_t ntsid_expiry) +kauth_identity_alloc(uid_t uid, gid_t gid, guid_t *guidp, time_t guid_expiry, ntsid_t *ntsidp, time_t ntsid_expiry, const char *name, int nametype) { struct kauth_identity *kip; @@ -919,6 +1048,10 @@ kauth_identity_alloc(uid_t uid, gid_t gid, guid_t *guidp, time_t guid_expiry, nt kip->ki_valid |= KI_VALID_NTSID; } kip->ki_ntsid_expiry = ntsid_expiry; + if (name != NULL) { + kip->ki_name = name; + kip->ki_valid |= nametype; + } } return(kip); } @@ -928,7 +1061,7 @@ kauth_identity_alloc(uid_t uid, gid_t gid, guid_t *guidp, time_t guid_expiry, nt * kauth_identity_register_and_free * * Description: Register an association between identity tokens. The passed - * 'kip' is freed by this function. + * 'kip' is consumed by this function. * * Parameters: kip Pointer to kauth_identity * structure to register @@ -975,11 +1108,22 @@ kauth_identity_register_and_free(struct kauth_identity *kip) ip->ki_valid |= KI_VALID_NTSID; } ip->ki_ntsid_expiry = kip->ki_ntsid_expiry; - /* and discard the incoming identity */ - FREE(kip, M_KAUTH); - ip = NULL; + /* a valid ki_name field overwrites the previous name field */ + if (kip->ki_valid & (KI_VALID_PWNAM | KI_VALID_GRNAM)) { + /* if there's an old one, discard it */ + const char *oname = NULL; + if (ip->ki_valid & (KI_VALID_PWNAM | KI_VALID_GRNAM)) + oname = ip->ki_name; + ip->ki_name = kip->ki_name; + kip->ki_name = oname; + } + /* and discard the incoming entry */ + ip = kip; } else { - /* don't have any information on this identity, so just add it */ + /* + * if we don't have any information on this identity, add it; + * if it pushes us over our limit, discard the oldest one. 
+ */ TAILQ_INSERT_HEAD(&kauth_identities, kip, ki_link); if (++kauth_identity_count > KAUTH_IDENTITY_CACHEMAX) { ip = TAILQ_LAST(&kauth_identities, kauth_identity_head); @@ -988,9 +1132,14 @@ kauth_identity_register_and_free(struct kauth_identity *kip) } } KAUTH_IDENTITY_UNLOCK(); - /* have to drop lock before freeing expired entry */ - if (ip != NULL) + /* have to drop lock before freeing expired entry (it may be in use) */ + if (ip != NULL) { + /* if the ki_name field is used, clear it first */ + if (ip->ki_valid & (KI_VALID_PWNAM | KI_VALID_GRNAM)) + vfs_removename(ip->ki_name); + /* free the expired entry */ FREE(ip, M_KAUTH); + } } @@ -998,25 +1147,51 @@ kauth_identity_register_and_free(struct kauth_identity *kip) * kauth_identity_updatecache * * Description: Given a lookup result, add any associations that we don't - * currently have. + * currently have; replace ones which have changed. * * Parameters: elp External lookup result from * user space daemon to kernel * rkip pointer to returned kauth * identity, or NULL + * extend_data Extended data (can vary) * * Returns: (void) * * Implicit returns: * *rkip Modified (if non-NULL) + * + * Notes: For extended information requests, this code relies on the fact + * that elp->el_flags is never used as an rvalue, and is only + * ever bit-tested for valid lookup information we are willing + * to cache. + * + * XXX: We may have to do the same in the case that extended data was + * passed out to user space to ensure that the request string + * gets cached; we may also be able to use the rkip as an + * input to avoid this. The jury is still out. + * + * XXX: This code's performance could be improved for multiple valid + * results by combining the loop iteration in a single loop. 
*/ static void -kauth_identity_updatecache(struct kauth_identity_extlookup *elp, struct kauth_identity *rkip) +kauth_identity_updatecache(struct kauth_identity_extlookup *elp, struct kauth_identity *rkip, uint64_t extend_data) { struct timeval tv; struct kauth_identity *kip; + const char *speculative_name = NULL; microuptime(&tv); + + /* + * If there is extended data, and that data represents a name rather + * than something else, speculatively create an entry for it in the + * string cache. We do this to avoid holding the KAUTH_IDENTITY_LOCK + * over the allocation later. + */ + if (elp->el_flags & (KAUTH_EXTLOOKUP_VALID_PWNAM | KAUTH_EXTLOOKUP_VALID_GRNAM)) { + const char *tmp = CAST_DOWN(const char *,extend_data); + speculative_name = vfs_addname(tmp, strnlen(tmp, MAXPATHLEN - 1), 0, 0); + } /* user identity? */ if (elp->el_flags & KAUTH_EXTLOOKUP_VALID_UID) { @@ -1034,6 +1209,19 @@ kauth_identity_updatecache(struct kauth_identity_extlookup *elp, struct kauth_id kip->ki_valid |= KI_VALID_NTSID; } kip->ki_ntsid_expiry = tv.tv_sec + elp->el_usid_valid; + if (elp->el_flags & KAUTH_EXTLOOKUP_VALID_PWNAM) { + const char *oname = kip->ki_name; + kip->ki_name = speculative_name; + speculative_name = NULL; + kip->ki_valid |= KI_VALID_PWNAM; + if (oname) { + /* + * free oname (if any) outside + * the lock + */ + speculative_name = oname; + } + } kauth_identity_lru(kip); if (rkip != NULL) *rkip = *kip; @@ -1048,18 +1236,22 @@ kauth_identity_updatecache(struct kauth_identity_extlookup *elp, struct kauth_id (elp->el_flags & KAUTH_EXTLOOKUP_VALID_UGUID) ? &elp->el_uguid : NULL, tv.tv_sec + elp->el_uguid_valid, (elp->el_flags & KAUTH_EXTLOOKUP_VALID_USID) ? &elp->el_usid : NULL, - tv.tv_sec + elp->el_usid_valid); + tv.tv_sec + elp->el_usid_valid, + (elp->el_flags & KAUTH_EXTLOOKUP_VALID_PWNAM) ? 
speculative_name : NULL, + KI_VALID_PWNAM); if (kip != NULL) { if (rkip != NULL) *rkip = *kip; + if (elp->el_flags & KAUTH_EXTLOOKUP_VALID_PWNAM) + speculative_name = NULL; KAUTH_DEBUG("CACHE - learned %d is " K_UUID_FMT, kip->ki_uid, K_UUID_ARG(kip->ki_guid)); kauth_identity_register_and_free(kip); } } } - /* group identity? */ - if (elp->el_flags & KAUTH_EXTLOOKUP_VALID_GID) { + /* group identity? (ignore, if we already processed it as a user) */ + if (elp->el_flags & KAUTH_EXTLOOKUP_VALID_GID && !(elp->el_flags & KAUTH_EXTLOOKUP_VALID_UID)) { KAUTH_IDENTITY_LOCK(); TAILQ_FOREACH(kip, &kauth_identities, ki_link) { /* matching record */ @@ -1074,6 +1266,19 @@ kauth_identity_updatecache(struct kauth_identity_extlookup *elp, struct kauth_id kip->ki_valid |= KI_VALID_NTSID; } kip->ki_ntsid_expiry = tv.tv_sec + elp->el_gsid_valid; + if (elp->el_flags & KAUTH_EXTLOOKUP_VALID_GRNAM) { + const char *oname = kip->ki_name; + kip->ki_name = speculative_name; + speculative_name = NULL; + kip->ki_valid |= KI_VALID_GRNAM; + if (oname) { + /* + * free oname (if any) outside + * the lock + */ + speculative_name = oname; + } + } kauth_identity_lru(kip); if (rkip != NULL) *rkip = *kip; @@ -1088,16 +1293,24 @@ kauth_identity_updatecache(struct kauth_identity_extlookup *elp, struct kauth_id (elp->el_flags & KAUTH_EXTLOOKUP_VALID_GGUID) ? &elp->el_gguid : NULL, tv.tv_sec + elp->el_gguid_valid, (elp->el_flags & KAUTH_EXTLOOKUP_VALID_GSID) ? &elp->el_gsid : NULL, - tv.tv_sec + elp->el_gsid_valid); + tv.tv_sec + elp->el_gsid_valid, + (elp->el_flags & KAUTH_EXTLOOKUP_VALID_GRNAM) ? 
speculative_name : NULL, + KI_VALID_GRNAM); if (kip != NULL) { if (rkip != NULL) *rkip = *kip; + if (elp->el_flags & KAUTH_EXTLOOKUP_VALID_GRNAM) + speculative_name = NULL; KAUTH_DEBUG("CACHE - learned %d is " K_UUID_FMT, kip->ki_uid, K_UUID_ARG(kip->ki_guid)); kauth_identity_register_and_free(kip); } } } + /* If we have a name reference to drop, drop it here */ + if (speculative_name != NULL) { + vfs_removename(speculative_name); + } } @@ -1179,6 +1392,7 @@ kauth_identity_ntsid_expired(struct kauth_identity *kip) * * Parameters: uid UID to find * kir Pointer to return area + * getname Name buffer, if ki_name wanted * * Returns: 0 Found * ENOENT Not found @@ -1187,7 +1401,7 @@ kauth_identity_ntsid_expired(struct kauth_identity *kip) * *klr Modified, if found */ static int -kauth_identity_find_uid(uid_t uid, struct kauth_identity *kir) +kauth_identity_find_uid(uid_t uid, struct kauth_identity *kir, char *getname) { struct kauth_identity *kip; @@ -1197,6 +1411,9 @@ kauth_identity_find_uid(uid_t uid, struct kauth_identity *kir) kauth_identity_lru(kip); /* Copy via structure assignment */ *kir = *kip; + /* If a name is wanted and one exists, copy it out */ + if (getname != NULL && (kip->ki_valid & (KI_VALID_PWNAM | KI_VALID_GRNAM))) + strlcpy(getname, kip->ki_name, MAXPATHLEN); break; } } @@ -1206,12 +1423,13 @@ kauth_identity_find_uid(uid_t uid, struct kauth_identity *kir) /* - * kauth_identity_find_uid + * kauth_identity_find_gid * * Description: Search for an entry by GID * * Parameters: gid GID to find * kir Pointer to return area + * getname Name buffer, if ki_name wanted * * Returns: 0 Found * ENOENT Not found @@ -1220,7 +1438,7 @@ kauth_identity_find_uid(uid_t uid, struct kauth_identity *kir) * *klr Modified, if found */ static int -kauth_identity_find_gid(uid_t gid, struct kauth_identity *kir) +kauth_identity_find_gid(uid_t gid, struct kauth_identity *kir, char *getname) { struct kauth_identity *kip; @@ -1230,6 +1448,9 @@ kauth_identity_find_gid(uid_t gid, 
struct kauth_identity *kir) kauth_identity_lru(kip); /* Copy via structure assignment */ *kir = *kip; + /* If a name is wanted and one exists, copy it out */ + if (getname != NULL && (kip->ki_valid & (KI_VALID_PWNAM | KI_VALID_GRNAM))) + strlcpy(getname, kip->ki_name, MAXPATHLEN); break; } } @@ -1245,6 +1466,7 @@ kauth_identity_find_gid(uid_t gid, struct kauth_identity *kir) * * Parameters: guidp Pointer to GUID to find * kir Pointer to return area + * getname Name buffer, if ki_name wanted * * Returns: 0 Found * ENOENT Not found @@ -1256,13 +1478,49 @@ kauth_identity_find_gid(uid_t gid, struct kauth_identity *kir) * may elect to call out to userland to revalidate. */ static int -kauth_identity_find_guid(guid_t *guidp, struct kauth_identity *kir) +kauth_identity_find_guid(guid_t *guidp, struct kauth_identity *kir, char *getname) { struct kauth_identity *kip; KAUTH_IDENTITY_LOCK(); TAILQ_FOREACH(kip, &kauth_identities, ki_link) { if ((kip->ki_valid & KI_VALID_GUID) && (kauth_guid_equal(guidp, &kip->ki_guid))) { + kauth_identity_lru(kip); + /* Copy via structure assignment */ + *kir = *kip; + /* If a name is wanted and one exists, copy it out */ + if (getname != NULL && (kip->ki_valid & (KI_VALID_PWNAM | KI_VALID_GRNAM))) + strlcpy(getname, kip->ki_name, MAXPATHLEN); + break; + } + } + KAUTH_IDENTITY_UNLOCK(); + return((kip == NULL) ? 
ENOENT : 0); +} + +/* + * kauth_identity_find_nam + * + * Description: Search for an entry by name + * + * Parameters: name Pointer to name to find + * valid KI_VALID_PWNAM or KI_VALID_GRNAM + * kir Pointer to return area + * + * Returns: 0 Found + * ENOENT Not found + * + * Implicit returns: + * *kir Modified, if found + */ +static int +kauth_identity_find_nam(char *name, int valid, struct kauth_identity *kir) +{ + struct kauth_identity *kip; + + KAUTH_IDENTITY_LOCK(); + TAILQ_FOREACH(kip, &kauth_identities, ki_link) { + if ((kip->ki_valid & valid) && !strcmp(name, kip->ki_name)) { kauth_identity_lru(kip); /* Copy via structure assignment */ *kir = *kip; @@ -1281,6 +1539,7 @@ kauth_identity_find_guid(guid_t *guidp, struct kauth_identity *kir) * * Parameters: ntsid Pointer to NTSID to find * kir Pointer to return area + * getname Name buffer, if ki_name wanted * * Returns: 0 Found * ENOENT Not found @@ -1292,7 +1551,7 @@ kauth_identity_find_guid(guid_t *guidp, struct kauth_identity *kir) * may elect to call out to userland to revalidate. */ static int -kauth_identity_find_ntsid(ntsid_t *ntsid, struct kauth_identity *kir) +kauth_identity_find_ntsid(ntsid_t *ntsid, struct kauth_identity *kir, char *getname) { struct kauth_identity *kip; @@ -1302,6 +1561,9 @@ kauth_identity_find_ntsid(ntsid_t *ntsid, struct kauth_identity *kir) kauth_identity_lru(kip); /* Copy via structure assignment */ *kir = *kip; + /* If a name is wanted and one exists, copy it out */ + if (getname != NULL && (kip->ki_valid & (KI_VALID_PWNAM | KI_VALID_GRNAM))) + strlcpy(getname, kip->ki_name, MAXPATHLEN); break; } } @@ -1351,7 +1613,7 @@ int kauth_wellknown_guid(guid_t *guid) { static char fingerprint[] = {0xab, 0xcd, 0xef, 0xab, 0xcd, 0xef, 0xab, 0xcd, 0xef, 0xab, 0xcd, 0xef}; - int code; + uint32_t code; /* * All WKGs begin with the same 12 bytes. */ @@ -1359,7 +1621,7 @@ kauth_wellknown_guid(guid_t *guid) /* * The final 4 bytes are our code (in network byte order). 
*/ - code = OSSwapHostToBigInt32(*(u_int32_t *)&guid->g_guid[12]); + code = OSSwapHostToBigInt32(*(uint32_t *)&guid->g_guid[12]); switch(code) { case 0x0000000c: return(KAUTH_WKG_EVERYBODY); @@ -1445,16 +1707,17 @@ kauth_cred_change_egid(kauth_cred_t cred, gid_t new_egid) #if radar_4600026 int is_member; #endif /* radar_4600026 */ - gid_t old_egid = cred->cr_groups[0]; + gid_t old_egid = kauth_cred_getgid(cred); + posix_cred_t pcred = posix_cred_get(cred); /* Ignoring the first entry, scan for a match for the new egid */ - for (i = 1; i < cred->cr_ngroups; i++) { + for (i = 1; i < pcred->cr_ngroups; i++) { /* * If we find a match, swap them so we don't lose overall * group information */ - if (cred->cr_groups[i] == new_egid) { - cred->cr_groups[i] = old_egid; + if (pcred->cr_groups[i] == new_egid) { + pcred->cr_groups[i] = old_egid; DEBUG_CRED_CHANGE("kauth_cred_change_egid: unset displaced\n"); displaced = 0; break; @@ -1480,7 +1743,7 @@ conservative approach (i.e. less likely to cause things to break). * * NB: This is typically a cold code path. */ - if (displaced && !(cred->cr_flags & CRF_NOMEMBERD) && + if (displaced && !(pcred->cr_flags & CRF_NOMEMBERD) && kauth_cred_ismember_gid(cred, new_egid, &is_member) == 0 && is_member) { displaced = 0; @@ -1489,7 +1752,7 @@ conservative approach (i.e. less likely to cause things to break). 
#endif /* radar_4600026 */ /* set the new EGID into the old spot */ - cred->cr_groups[0] = new_egid; + pcred->cr_groups[0] = new_egid; return (displaced); } @@ -1508,7 +1771,41 @@ uid_t kauth_cred_getuid(kauth_cred_t cred) { NULLCRED_CHECK(cred); - return(cred->cr_uid); + return(posix_cred_get(cred)->cr_uid); +} + + +/* + * kauth_cred_getruid + * + * Description: Fetch RUID from credential + * + * Parameters: cred Credential to examine + * + * Returns: (uid_t) RUID associated with credential + */ +uid_t +kauth_cred_getruid(kauth_cred_t cred) +{ + NULLCRED_CHECK(cred); + return(posix_cred_get(cred)->cr_ruid); +} + + +/* + * kauth_cred_getsvuid + * + * Description: Fetch SVUID from credential + * + * Parameters: cred Credential to examine + * + * Returns: (uid_t) SVUID associated with credential + */ +uid_t +kauth_cred_getsvuid(kauth_cred_t cred) +{ + NULLCRED_CHECK(cred); + return(posix_cred_get(cred)->cr_svuid); } @@ -1521,11 +1818,139 @@ kauth_cred_getuid(kauth_cred_t cred) * * Returns: (gid_t) GID associated with credential */ -uid_t +gid_t kauth_cred_getgid(kauth_cred_t cred) { NULLCRED_CHECK(cred); - return(cred->cr_gid); + return(posix_cred_get(cred)->cr_gid); +} + + +/* + * kauth_cred_getrgid + * + * Description: Fetch RGID from credential + * + * Parameters: cred Credential to examine + * + * Returns: (gid_t) RGID associated with credential + */ +gid_t +kauth_cred_getrgid(kauth_cred_t cred) +{ + NULLCRED_CHECK(cred); + return(posix_cred_get(cred)->cr_rgid); +} + + +/* + * kauth_cred_getsvgid + * + * Description: Fetch SVGID from credential + * + * Parameters: cred Credential to examine + * + * Returns: (gid_t) SVGID associated with credential + */ +gid_t +kauth_cred_getsvgid(kauth_cred_t cred) +{ + NULLCRED_CHECK(cred); + return(posix_cred_get(cred)->cr_svgid); +} + + +/* + * kauth_cred_guid2pwnam + * + * Description: Fetch PWNAM from GUID + * + * Parameters: guidp Pointer to GUID to examine + * pwnam Pointer to user@domain buffer + * + * Returns: 0 Success 
+ * kauth_cred_cache_lookup:EINVAL + * + * Implicit returns: + * *pwnam Modified, if successful + * + * Notes: pwnam is assumed to point to a buffer of MAXPATHLEN in size + */ +int +kauth_cred_guid2pwnam(guid_t *guidp, char *pwnam) +{ + return(kauth_cred_cache_lookup(KI_VALID_GUID, KI_VALID_PWNAM, guidp, pwnam)); +} + + +/* + * kauth_cred_guid2grnam + * + * Description: Fetch GRNAM from GUID + * + * Parameters: guidp Pointer to GUID to examine + * grnam Pointer to group@domain buffer + * + * Returns: 0 Success + * kauth_cred_cache_lookup:EINVAL + * + * Implicit returns: + * *grnam Modified, if successful + * + * Notes: grnam is assumed to point to a buffer of MAXPATHLEN in size + */ +int +kauth_cred_guid2grnam(guid_t *guidp, char *grnam) +{ + return(kauth_cred_cache_lookup(KI_VALID_GUID, KI_VALID_GRNAM, guidp, grnam)); +} + + +/* + * kauth_cred_pwnam2guid + * + * Description: Fetch GUID from PWNAM + * + * Parameters: pwnam String containing user@domain + * guidp Pointer to buffer for GUID + * + * Returns: 0 Success + * kauth_cred_cache_lookup:EINVAL + * + * Implicit returns: + * *guidp Modified, if successful + * + * Notes: pwnam should not point to a request larger than MAXPATHLEN + * bytes in size, including the NUL termination of the string. + */ +int +kauth_cred_pwnam2guid(char *pwnam, guid_t *guidp) +{ + return(kauth_cred_cache_lookup(KI_VALID_PWNAM, KI_VALID_GUID, pwnam, guidp)); +} + + +/* + * kauth_cred_grnam2guid + * + * Description: Fetch GUID from GRNAM + * + * Parameters: grnam String containing group@domain + * guidp Pointer to buffer for GUID + * + * Returns: 0 Success + * kauth_cred_cache_lookup:EINVAL + * + * Implicit returns: + * *guidp Modified, if successful + * + * Notes: grnam should not point to a request larger than MAXPATHLEN + * bytes in size, including the NUL termination of the string. 
+ */ +int +kauth_cred_grnam2guid(char *grnam, guid_t *guidp) +{ + return(kauth_cred_cache_lookup(KI_VALID_GRNAM, KI_VALID_GUID, grnam, guidp)); } @@ -1806,27 +2231,40 @@ kauth_cred_cache_lookup(int from, int to, void *src, void *dst) struct kauth_identity ki; struct kauth_identity_extlookup el; int error; + uint64_t extend_data = 0ULL; int (* expired)(struct kauth_identity *kip); + char *namebuf = NULL; KAUTH_DEBUG("CACHE - translate %d to %d", from, to); /* * Look for an existing cache entry for this association. * If the entry has not expired, return the cached information. + * We do not cache user@domain translations here; they use too + * much memory to hold onto forever, and can not be updated + * atomically. */ + if (to == KI_VALID_PWNAM || to == KI_VALID_GRNAM) { + namebuf = dst; + } ki.ki_valid = 0; switch(from) { case KI_VALID_UID: - error = kauth_identity_find_uid(*(uid_t *)src, &ki); + error = kauth_identity_find_uid(*(uid_t *)src, &ki, namebuf); break; case KI_VALID_GID: - error = kauth_identity_find_gid(*(gid_t *)src, &ki); + error = kauth_identity_find_gid(*(gid_t *)src, &ki, namebuf); break; case KI_VALID_GUID: - error = kauth_identity_find_guid((guid_t *)src, &ki); + error = kauth_identity_find_guid((guid_t *)src, &ki, namebuf); break; case KI_VALID_NTSID: - error = kauth_identity_find_ntsid((ntsid_t *)src, &ki); + error = kauth_identity_find_ntsid((ntsid_t *)src, &ki, namebuf); + break; + case KI_VALID_PWNAM: + case KI_VALID_GRNAM: + /* Names are unique in their 'from' space */ + error = kauth_identity_find_nam((char *)src, from, &ki); break; default: return(EINVAL); @@ -1862,7 +2300,7 @@ kauth_cred_cache_lookup(int from, int to, void *src, void *dst) expired = NULL; } } - KAUTH_DEBUG("CACHE - found matching entry with valid %d", ki.ki_valid); + KAUTH_DEBUG("CACHE - found matching entry with valid 0x%08x", ki.ki_valid); /* * If no expiry function, or not expired, we have found * a hit. 
@@ -1882,13 +2320,33 @@ kauth_cred_cache_lookup(int from, int to, void *src, void *dst) * a better-than nothing alternative. */ KAUTH_DEBUG("CACHE - expired entry found"); + } else { + /* + * A guid can't both match a uid and a gid, so if we + * found a cache entry while looking for one or the + * other from a guid, the 'from' is KI_VALID_GUID, + * and the 'to' is one, and the other one is valid, + * then we immediately return ENOENT without calling + * the resolver again. + */ + if (from == KI_VALID_GUID && + (((ki.ki_valid & KI_VALID_UID) && + to == KI_VALID_GID) || + ((ki.ki_valid & KI_VALID_GID) && + to == KI_VALID_UID))) { + return (ENOENT); + } } } /* * We failed to find a cache entry; call the resolver. * - * Note: We ask for as much data as we can get. + * Note: We ask for as much non-extended data as we can get, + * and only provide (or ask for) extended information if + * we have a 'from' (or 'to') which requires it. This + * way we don't pay for the extra transfer overhead for + * data we don't need. 
*/ bzero(&el, sizeof(el)); el.el_info_pid = current_proc()->p_pid; @@ -1911,6 +2369,16 @@ kauth_cred_cache_lookup(int from, int to, void *src, void *dst) el.el_usid = *(ntsid_t *)src; el.el_gsid = *(ntsid_t *)src; break; + case KI_VALID_PWNAM: + /* extra overhead */ + el.el_flags = KAUTH_EXTLOOKUP_VALID_PWNAM; + extend_data = CAST_USER_ADDR_T(src); + break; + case KI_VALID_GRNAM: + /* extra overhead */ + el.el_flags = KAUTH_EXTLOOKUP_VALID_GRNAM; + extend_data = CAST_USER_ADDR_T(src); + break; default: return(EINVAL); } @@ -1926,25 +2394,53 @@ kauth_cred_cache_lookup(int from, int to, void *src, void *dst) el.el_flags |= KAUTH_EXTLOOKUP_WANT_UID | KAUTH_EXTLOOKUP_WANT_GID | KAUTH_EXTLOOKUP_WANT_UGUID | KAUTH_EXTLOOKUP_WANT_GGUID | KAUTH_EXTLOOKUP_WANT_USID | KAUTH_EXTLOOKUP_WANT_GSID; + if (to == KI_VALID_PWNAM) { + /* extra overhead */ + el.el_flags |= KAUTH_EXTLOOKUP_WANT_PWNAM; + extend_data = CAST_USER_ADDR_T(dst); + } + if (to == KI_VALID_GRNAM) { + /* extra overhead */ + el.el_flags |= KAUTH_EXTLOOKUP_WANT_GRNAM; + extend_data = CAST_USER_ADDR_T(dst); + } + + /* Call resolver */ KAUTH_DEBUG("CACHE - calling resolver for %x", el.el_flags); - error = kauth_resolver_submit(&el); + error = kauth_resolver_submit(&el, extend_data); KAUTH_DEBUG("CACHE - resolver returned %d", error); - /* was the lookup successful? */ + + /* was the external lookup successful? */ if (error == 0) { /* - * Save the results from the lookup - may have other - * information even if we didn't get a guid. + * Save the results from the lookup - we may have other + * information, even if we didn't get a guid or the + * extended data. + * + * If we came from a name, we know the extend_data is valid. 
+ */ + if (from == KI_VALID_PWNAM) + el.el_flags |= KAUTH_EXTLOOKUP_VALID_PWNAM; + else if (from == KI_VALID_GRNAM) + el.el_flags |= KAUTH_EXTLOOKUP_VALID_GRNAM; + + kauth_identity_updatecache(&el, &ki, extend_data); + + /* + * Check to see if we have a valid cache entry + * originating from the result. */ - kauth_identity_updatecache(&el, &ki); + if (!(ki.ki_valid & to)) { + error = ENOENT; + } } - /* - * Check to see if we have a valid result. - */ - if (!error && !(ki.ki_valid & to)) - error = ENOENT; if (error) return(error); found: + /* + * Copy from the appropriate struct kauth_identity cache entry + * structure into the destination buffer area. + */ switch(to) { case KI_VALID_UID: *(uid_t *)dst = ki.ki_uid; @@ -1958,6 +2454,10 @@ kauth_cred_cache_lookup(int from, int to, void *src, void *dst) case KI_VALID_NTSID: *(ntsid_t *)dst = ki.ki_ntsid; break; + case KI_VALID_PWNAM: + case KI_VALID_GRNAM: + /* handled in kauth_resolver_complete() */ + break; default: return(EINVAL); } @@ -2190,6 +2690,7 @@ kauth_groups_updatecache(struct kauth_identity_extlookup *el) int kauth_cred_ismember_gid(kauth_cred_t cred, gid_t gid, int *resultp) { + posix_cred_t pcred = posix_cred_get(cred); struct kauth_group_membership *gm; struct kauth_identity_extlookup el; int i, error; @@ -2200,8 +2701,8 @@ kauth_cred_ismember_gid(kauth_cred_t cred, gid_t gid, int *resultp) * We can conditionalise this on cred->cr_gmuid == KAUTH_UID_NONE since * the cache should be used for that case. */ - for (i = 0; i < cred->cr_ngroups; i++) { - if (gid == cred->cr_groups[i]) { + for (i = 0; i < pcred->cr_ngroups; i++) { + if (gid == pcred->cr_groups[i]) { *resultp = 1; return(0); } @@ -2211,7 +2712,7 @@ kauth_cred_ismember_gid(kauth_cred_t cred, gid_t gid, int *resultp) * If we don't have a UID for group membership checks, the in-cred list * was authoritative and we can stop here. 
*/ - if (cred->cr_gmuid == KAUTH_UID_NONE) { + if (pcred->cr_gmuid == KAUTH_UID_NONE) { *resultp = 0; return(0); } @@ -2236,7 +2737,7 @@ kauth_cred_ismember_gid(kauth_cred_t cred, gid_t gid, int *resultp) */ KAUTH_GROUPS_LOCK(); TAILQ_FOREACH(gm, &kauth_groups, gm_link) { - if ((gm->gm_uid == cred->cr_gmuid) && (gm->gm_gid == gid) && !kauth_groups_expired(gm)) { + if ((gm->gm_uid == pcred->cr_gmuid) && (gm->gm_gid == gid) && !kauth_groups_expired(gm)) { kauth_groups_lru(gm); break; } @@ -2255,10 +2756,10 @@ kauth_cred_ismember_gid(kauth_cred_t cred, gid_t gid, int *resultp) bzero(&el, sizeof(el)); el.el_info_pid = current_proc()->p_pid; el.el_flags = KAUTH_EXTLOOKUP_VALID_UID | KAUTH_EXTLOOKUP_VALID_GID | KAUTH_EXTLOOKUP_WANT_MEMBERSHIP; - el.el_uid = cred->cr_gmuid; + el.el_uid = pcred->cr_gmuid; el.el_gid = gid; el.el_member_valid = 0; /* XXX set by resolver? */ - error = kauth_resolver_submit(&el); + error = kauth_resolver_submit(&el, 0ULL); if (error != 0) return(error); /* save the results from the lookup */ @@ -2332,7 +2833,7 @@ kauth_cred_ismember_guid(kauth_cred_t cred, guid_t *guidp, int *resultp) * this is expected to be a common case. */ ki.ki_valid = 0; - if ((error = kauth_identity_find_guid(guidp, &ki)) == 0 && + if ((error = kauth_identity_find_guid(guidp, &ki, NULL)) == 0 && !kauth_identity_guid_expired(&ki)) { if (ki.ki_valid & KI_VALID_GID) { /* It's a group after all... 
*/ @@ -2395,38 +2896,40 @@ kauth_cred_gid_subset(kauth_cred_t cred1, kauth_cred_t cred2, int *resultp) { int i, err, res = 1; gid_t gid; + posix_cred_t pcred1 = posix_cred_get(cred1); + posix_cred_t pcred2 = posix_cred_get(cred2); /* First, check the local list of groups */ - for (i = 0; i < cred1->cr_ngroups; i++) { - gid = cred1->cr_groups[i]; + for (i = 0; i < pcred1->cr_ngroups; i++) { + gid = pcred1->cr_groups[i]; if ((err = kauth_cred_ismember_gid(cred2, gid, &res)) != 0) { return err; } - if (!res && gid != cred2->cr_rgid && gid != cred2->cr_svgid) { + if (!res && gid != pcred2->cr_rgid && gid != pcred2->cr_svgid) { *resultp = 0; return 0; } } /* Check real gid */ - if ((err = kauth_cred_ismember_gid(cred2, cred1->cr_rgid, &res)) != 0) { + if ((err = kauth_cred_ismember_gid(cred2, pcred1->cr_rgid, &res)) != 0) { return err; } - if (!res && cred1->cr_rgid != cred2->cr_rgid && - cred1->cr_rgid != cred2->cr_svgid) { + if (!res && pcred1->cr_rgid != pcred2->cr_rgid && + pcred1->cr_rgid != pcred2->cr_svgid) { *resultp = 0; return 0; } /* Finally, check saved gid */ - if ((err = kauth_cred_ismember_gid(cred2, cred1->cr_svgid, &res)) != 0){ + if ((err = kauth_cred_ismember_gid(cred2, pcred1->cr_svgid, &res)) != 0){ return err; } - if (!res && cred1->cr_svgid != cred2->cr_rgid && - cred1->cr_svgid != cred2->cr_svgid) { + if (!res && pcred1->cr_svgid != pcred2->cr_rgid && + pcred1->cr_svgid != pcred2->cr_svgid) { *resultp = 0; return 0; } @@ -2453,7 +2956,7 @@ kauth_cred_gid_subset(kauth_cred_t cred1, kauth_cred_t cred2, int *resultp) int kauth_cred_issuser(kauth_cred_t cred) { - return(cred->cr_uid == 0); + return(kauth_cred_getuid(cred) == 0); } @@ -2536,7 +3039,7 @@ kauth_cred_init(void) uid_t kauth_getuid(void) { - return(kauth_cred_get()->cr_uid); + return(kauth_cred_getuid(kauth_cred_get())); } @@ -2553,7 +3056,7 @@ kauth_getuid(void) uid_t kauth_getruid(void) { - return(kauth_cred_get()->cr_ruid); + return(kauth_cred_getruid(kauth_cred_get())); } @@ -2570,7 
+3073,7 @@ kauth_getruid(void) gid_t kauth_getgid(void) { - return(kauth_cred_get()->cr_groups[0]); + return(kauth_cred_getgid(kauth_cred_get())); } @@ -2587,7 +3090,7 @@ kauth_getgid(void) gid_t kauth_getrgid(void) { - return(kauth_cred_get()->cr_rgid); + return(kauth_cred_getrgid(kauth_cred_get())); } @@ -2823,13 +3326,12 @@ kauth_cred_alloc(void) MALLOC_ZONE(newcred, kauth_cred_t, sizeof(*newcred), M_CRED, M_WAITOK); if (newcred != 0) { + posix_cred_t newpcred = posix_cred_get(newcred); bzero(newcred, sizeof(*newcred)); newcred->cr_ref = 1; - newcred->cr_audit.as_aia_p = &audit_default_aia; - /* XXX the following will go away with cr_au */ - newcred->cr_au.ai_auid = AU_DEFAUDITID; + newcred->cr_audit.as_aia_p = audit_default_aia_p; /* must do this, or cred has same group membership as uid 0 */ - newcred->cr_gmuid = KAUTH_UID_NONE; + newpcred->cr_gmuid = KAUTH_UID_NONE; #if CRED_DIAGNOSTIC } else { panic("kauth_cred_alloc: couldn't allocate credential"); @@ -2878,12 +3380,13 @@ kauth_cred_t kauth_cred_create(kauth_cred_t cred) { kauth_cred_t found_cred, new_cred = NULL; + posix_cred_t pcred = posix_cred_get(cred); int is_member = 0; KAUTH_CRED_HASH_LOCK_ASSERT(); - if (cred->cr_flags & CRF_NOMEMBERD) { - cred->cr_gmuid = KAUTH_UID_NONE; + if (pcred->cr_flags & CRF_NOMEMBERD) { + pcred->cr_gmuid = KAUTH_UID_NONE; } else { /* * If the template credential is not opting out of external @@ -2902,7 +3405,7 @@ kauth_cred_create(kauth_cred_t cred) * the answer, so long as it's something the external * resolver could have vended. */ - cred->cr_gmuid = cred->cr_uid; + pcred->cr_gmuid = pcred->cr_uid; } else { /* * It's not something the external resolver could @@ -2913,13 +3416,13 @@ kauth_cred_create(kauth_cred_t cred) * cost. Since most credentials are used multiple * times, we still get some performance win from this. 
*/ - cred->cr_gmuid = KAUTH_UID_NONE; - cred->cr_flags |= CRF_NOMEMBERD; + pcred->cr_gmuid = KAUTH_UID_NONE; + pcred->cr_flags |= CRF_NOMEMBERD; } } /* Caller *must* specify at least the egid in cr_groups[0] */ - if (cred->cr_ngroups < 1) + if (pcred->cr_ngroups < 1) return(NULL); for (;;) { @@ -2943,22 +3446,20 @@ kauth_cred_create(kauth_cred_t cred) new_cred = kauth_cred_alloc(); if (new_cred != NULL) { int err; - new_cred->cr_uid = cred->cr_uid; - new_cred->cr_ruid = cred->cr_ruid; - new_cred->cr_svuid = cred->cr_svuid; - new_cred->cr_rgid = cred->cr_rgid; - new_cred->cr_svgid = cred->cr_svgid; - new_cred->cr_gmuid = cred->cr_gmuid; - new_cred->cr_ngroups = cred->cr_ngroups; - bcopy(&cred->cr_groups[0], &new_cred->cr_groups[0], sizeof(new_cred->cr_groups)); + posix_cred_t new_pcred = posix_cred_get(new_cred); + new_pcred->cr_uid = pcred->cr_uid; + new_pcred->cr_ruid = pcred->cr_ruid; + new_pcred->cr_svuid = pcred->cr_svuid; + new_pcred->cr_rgid = pcred->cr_rgid; + new_pcred->cr_svgid = pcred->cr_svgid; + new_pcred->cr_gmuid = pcred->cr_gmuid; + new_pcred->cr_ngroups = pcred->cr_ngroups; + bcopy(&pcred->cr_groups[0], &new_pcred->cr_groups[0], sizeof(new_pcred->cr_groups)); #if CONFIG_AUDIT bcopy(&cred->cr_audit, &new_cred->cr_audit, sizeof(new_cred->cr_audit)); - /* XXX the following bcopy() will go away with cr_au */ - bcopy(&cred->cr_au, &new_cred->cr_au, - sizeof(new_cred->cr_au)); #endif - new_cred->cr_flags = cred->cr_flags; + new_pcred->cr_flags = pcred->cr_flags; KAUTH_CRED_HASH_LOCK(); err = kauth_cred_add(new_cred); @@ -3017,6 +3518,8 @@ kauth_cred_t kauth_cred_setresuid(kauth_cred_t cred, uid_t ruid, uid_t euid, uid_t svuid, uid_t gmuid) { struct ucred temp_cred; + posix_cred_t temp_pcred = posix_cred_get(&temp_cred); + posix_cred_t pcred = posix_cred_get(cred); NULLCRED_CHECK(cred); @@ -3024,10 +3527,10 @@ kauth_cred_setresuid(kauth_cred_t cred, uid_t ruid, uid_t euid, uid_t svuid, uid * We don't need to do anything if the UIDs we are changing are * 
already the same as the UIDs passed in */ - if ((euid == KAUTH_UID_NONE || cred->cr_uid == euid) && - (ruid == KAUTH_UID_NONE || cred->cr_ruid == ruid) && - (svuid == KAUTH_UID_NONE || cred->cr_svuid == svuid) && - (cred->cr_gmuid == gmuid)) { + if ((euid == KAUTH_UID_NONE || pcred->cr_uid == euid) && + (ruid == KAUTH_UID_NONE || pcred->cr_ruid == ruid) && + (svuid == KAUTH_UID_NONE || pcred->cr_svuid == svuid) && + (pcred->cr_gmuid == gmuid)) { /* no change needed */ return(cred); } @@ -3038,13 +3541,13 @@ kauth_cred_setresuid(kauth_cred_t cred, uid_t ruid, uid_t euid, uid_t svuid, uid */ bcopy(cred, &temp_cred, sizeof(temp_cred)); if (euid != KAUTH_UID_NONE) { - temp_cred.cr_uid = euid; + temp_pcred->cr_uid = euid; } if (ruid != KAUTH_UID_NONE) { - temp_cred.cr_ruid = ruid; + temp_pcred->cr_ruid = ruid; } if (svuid != KAUTH_UID_NONE) { - temp_cred.cr_svuid = svuid; + temp_pcred->cr_svuid = svuid; } /* @@ -3052,8 +3555,8 @@ kauth_cred_setresuid(kauth_cred_t cred, uid_t ruid, uid_t euid, uid_t svuid, uid * opt out of participation in external group resolution, unless we * unless we explicitly opt back in later. */ - if ((temp_cred.cr_gmuid = gmuid) == KAUTH_UID_NONE) { - temp_cred.cr_flags |= CRF_NOMEMBERD; + if ((temp_pcred->cr_gmuid = gmuid) == KAUTH_UID_NONE) { + temp_pcred->cr_flags |= CRF_NOMEMBERD; } return(kauth_cred_update(cred, &temp_cred, TRUE)); @@ -3090,6 +3593,8 @@ kauth_cred_t kauth_cred_setresgid(kauth_cred_t cred, gid_t rgid, gid_t egid, gid_t svgid) { struct ucred temp_cred; + posix_cred_t temp_pcred = posix_cred_get(&temp_cred); + posix_cred_t pcred = posix_cred_get(cred); NULLCRED_CHECK(cred); DEBUG_CRED_ENTER("kauth_cred_setresgid %p %d %d %d\n", cred, rgid, egid, svgid); @@ -3098,9 +3603,9 @@ kauth_cred_setresgid(kauth_cred_t cred, gid_t rgid, gid_t egid, gid_t svgid) * We don't need to do anything if the given GID are already the * same as the GIDs in the credential. 
*/ - if (cred->cr_groups[0] == egid && - cred->cr_rgid == rgid && - cred->cr_svgid == svgid) { + if (pcred->cr_groups[0] == egid && + pcred->cr_rgid == rgid && + pcred->cr_svgid == svgid) { /* no change needed */ return(cred); } @@ -3114,17 +3619,17 @@ kauth_cred_setresgid(kauth_cred_t cred, gid_t rgid, gid_t egid, gid_t svgid) /* displacing a supplementary group opts us out of memberd */ if (kauth_cred_change_egid(&temp_cred, egid)) { DEBUG_CRED_CHANGE("displaced!\n"); - temp_cred.cr_flags |= CRF_NOMEMBERD; - temp_cred.cr_gmuid = KAUTH_UID_NONE; + temp_pcred->cr_flags |= CRF_NOMEMBERD; + temp_pcred->cr_gmuid = KAUTH_UID_NONE; } else { DEBUG_CRED_CHANGE("not displaced\n"); } } if (rgid != KAUTH_GID_NONE) { - temp_cred.cr_rgid = rgid; + temp_pcred->cr_rgid = rgid; } if (svgid != KAUTH_GID_NONE) { - temp_cred.cr_svgid = svgid; + temp_pcred->cr_svgid = svgid; } return(kauth_cred_update(cred, &temp_cred, TRUE)); @@ -3185,16 +3690,20 @@ kauth_cred_setgroups(kauth_cred_t cred, gid_t *groups, int groupcount, uid_t gmu { int i; struct ucred temp_cred; + posix_cred_t temp_pcred = posix_cred_get(&temp_cred); + posix_cred_t pcred; NULLCRED_CHECK(cred); + pcred = posix_cred_get(cred); + /* * We don't need to do anything if the given list of groups does not * change. */ - if ((cred->cr_gmuid == gmuid) && (cred->cr_ngroups == groupcount)) { + if ((pcred->cr_gmuid == gmuid) && (pcred->cr_ngroups == groupcount)) { for (i = 0; i < groupcount; i++) { - if (cred->cr_groups[i] != groups[i]) + if (pcred->cr_groups[i] != groups[i]) break; } if (i == groupcount) { @@ -3211,17 +3720,46 @@ kauth_cred_setgroups(kauth_cred_t cred, gid_t *groups, int groupcount, uid_t gmu * using initgroups(). This is required for POSIX conformance. 
*/ bcopy(cred, &temp_cred, sizeof(temp_cred)); - temp_cred.cr_ngroups = groupcount; - bcopy(groups, temp_cred.cr_groups, sizeof(temp_cred.cr_groups)); - temp_cred.cr_gmuid = gmuid; + temp_pcred->cr_ngroups = groupcount; + bcopy(groups, temp_pcred->cr_groups, sizeof(temp_pcred->cr_groups)); + temp_pcred->cr_gmuid = gmuid; if (gmuid == KAUTH_UID_NONE) - temp_cred.cr_flags |= CRF_NOMEMBERD; + temp_pcred->cr_flags |= CRF_NOMEMBERD; else - temp_cred.cr_flags &= ~CRF_NOMEMBERD; + temp_pcred->cr_flags &= ~CRF_NOMEMBERD; return(kauth_cred_update(cred, &temp_cred, TRUE)); } +/* + * XXX temporary, for NFS support until we can come up with a better + * XXX enumeration/comparison mechanism + * + * Notes: The return value exists to account for the possibility of a + * kauth_cred_t without a POSIX label. This will be the case in + * the future (see posix_cred_get() below, for more details). + */ +int +kauth_cred_getgroups(kauth_cred_t cred, gid_t *grouplist, int *countp) +{ + int limit = NGROUPS; + + /* + * If they just want a copy of the groups list, they may not care + * about the actual count. If they specify an input count, however, + * treat it as an indicator of the buffer size available in grouplist, + * and limit the returned list to that size. + */ + if (countp) { + limit = MIN(*countp, cred->cr_posix.cr_ngroups); + *countp = limit; + } + + memcpy(grouplist, cred->cr_posix.cr_groups, sizeof(gid_t) * limit); + + return 0; +} + /* * kauth_cred_setuidgid @@ -3262,15 +3800,19 @@ kauth_cred_t kauth_cred_setuidgid(kauth_cred_t cred, uid_t uid, gid_t gid) { struct ucred temp_cred; + posix_cred_t temp_pcred = posix_cred_get(&temp_cred); + posix_cred_t pcred; NULLCRED_CHECK(cred); + pcred = posix_cred_get(cred); + /* * We don't need to do anything if the effective, real and saved * user IDs are already the same as the user ID passed into us. 
*/ - if (cred->cr_uid == uid && cred->cr_ruid == uid && cred->cr_svuid == uid && - cred->cr_groups[0] == gid && cred->cr_rgid == gid && cred->cr_svgid == gid) { + if (pcred->cr_uid == uid && pcred->cr_ruid == uid && pcred->cr_svuid == uid && + pcred->cr_gid == gid && pcred->cr_rgid == gid && pcred->cr_svgid == gid) { /* no change needed */ return(cred); } @@ -3280,26 +3822,26 @@ kauth_cred_setuidgid(kauth_cred_t cred, uid_t uid, gid_t gid) * with the new values. */ bzero(&temp_cred, sizeof(temp_cred)); - temp_cred.cr_uid = uid; - temp_cred.cr_ruid = uid; - temp_cred.cr_svuid = uid; - temp_cred.cr_flags = cred->cr_flags; + temp_pcred->cr_uid = uid; + temp_pcred->cr_ruid = uid; + temp_pcred->cr_svuid = uid; + temp_pcred->cr_flags = pcred->cr_flags; /* inherit the opt-out of memberd */ - if (cred->cr_flags & CRF_NOMEMBERD) { - temp_cred.cr_gmuid = KAUTH_UID_NONE; - temp_cred.cr_flags |= CRF_NOMEMBERD; + if (pcred->cr_flags & CRF_NOMEMBERD) { + temp_pcred->cr_gmuid = KAUTH_UID_NONE; + temp_pcred->cr_flags |= CRF_NOMEMBERD; } else { - temp_cred.cr_gmuid = uid; - temp_cred.cr_flags &= ~CRF_NOMEMBERD; + temp_pcred->cr_gmuid = uid; + temp_pcred->cr_flags &= ~CRF_NOMEMBERD; } - temp_cred.cr_ngroups = 1; + temp_pcred->cr_ngroups = 1; /* displacing a supplementary group opts us out of memberd */ if (kauth_cred_change_egid(&temp_cred, gid)) { - temp_cred.cr_gmuid = KAUTH_UID_NONE; - temp_cred.cr_flags |= CRF_NOMEMBERD; + temp_pcred->cr_gmuid = KAUTH_UID_NONE; + temp_pcred->cr_flags |= CRF_NOMEMBERD; } - temp_cred.cr_rgid = gid; - temp_cred.cr_svgid = gid; + temp_pcred->cr_rgid = gid; + temp_pcred->cr_svgid = gid; #if CONFIG_MACF temp_cred.cr_label = cred->cr_label; #endif @@ -3336,8 +3878,13 @@ kauth_cred_t kauth_cred_setsvuidgid(kauth_cred_t cred, uid_t uid, gid_t gid) { struct ucred temp_cred; + posix_cred_t temp_pcred = posix_cred_get(&temp_cred); + posix_cred_t pcred; NULLCRED_CHECK(cred); + + pcred = posix_cred_get(cred); + DEBUG_CRED_ENTER("kauth_cred_setsvuidgid: %p 
u%d->%d g%d->%d\n", cred, cred->cr_svuid, uid, cred->cr_svgid, gid); /* @@ -3345,7 +3892,7 @@ kauth_cred_setsvuidgid(kauth_cred_t cred, uid_t uid, gid_t gid) * uids are already the same as the uid provided. This check is * likely insufficient. */ - if (cred->cr_svuid == uid && cred->cr_svgid == gid) { + if (pcred->cr_svuid == uid && pcred->cr_svgid == gid) { /* no change needed */ return(cred); } @@ -3355,8 +3902,8 @@ kauth_cred_setsvuidgid(kauth_cred_t cred, uid_t uid, gid_t gid) * with new values. */ bcopy(cred, &temp_cred, sizeof(temp_cred)); - temp_cred.cr_svuid = uid; - temp_cred.cr_svgid = gid; + temp_pcred->cr_svuid = uid; + temp_pcred->cr_svgid = gid; return(kauth_cred_update(cred, &temp_cred, TRUE)); } @@ -3402,18 +3949,6 @@ kauth_cred_setauditinfo(kauth_cred_t cred, au_session_t *auditinfo_p) bcopy(cred, &temp_cred, sizeof(temp_cred)); bcopy(auditinfo_p, &temp_cred.cr_audit, sizeof(temp_cred.cr_audit)); - /* XXX the following will go away with cr_au */ - temp_cred.cr_au.ai_auid = auditinfo_p->as_aia_p->ai_auid; - temp_cred.cr_au.ai_mask.am_success = - auditinfo_p->as_mask.am_success; - temp_cred.cr_au.ai_mask.am_failure = - auditinfo_p->as_mask.am_failure; - temp_cred.cr_au.ai_termid.port = - auditinfo_p->as_aia_p->ai_termid.at_port; - temp_cred.cr_au.ai_termid.machine = - auditinfo_p->as_aia_p->ai_termid.at_addr[0]; - temp_cred.cr_au.ai_asid = auditinfo_p->as_aia_p->ai_asid; - /* XXX */ return(kauth_cred_update(cred, &temp_cred, FALSE)); } @@ -3560,6 +4095,9 @@ int kauth_proc_label_update(struct proc *p, struct label *label) continue; } p->p_ucred = my_new_cred; + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(p); + mac_proc_set_enforce(p, MAC_ALL_ENFORCE); proc_unlock(p); } @@ -3635,6 +4173,8 @@ kauth_proc_label_update_execve(struct proc *p, vfs_context_t ctx, continue; } p->p_ucred = my_new_cred; + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(p); mac_proc_set_enforce(p, MAC_ALL_ENFORCE); proc_unlock(p); } @@ -3951,10 +4491,12 @@ 
kauth_cred_copy_real(kauth_cred_t cred) { kauth_cred_t newcred = NULL, found_cred; struct ucred temp_cred; + posix_cred_t temp_pcred = posix_cred_get(&temp_cred); + posix_cred_t pcred = posix_cred_get(cred); /* if the credential is already 'real', just take a reference */ - if ((cred->cr_ruid == cred->cr_uid) && - (cred->cr_rgid == cred->cr_gid)) { + if ((pcred->cr_ruid == pcred->cr_uid) && + (pcred->cr_rgid == pcred->cr_gid)) { kauth_cred_ref(cred); return(cred); } @@ -3964,18 +4506,18 @@ kauth_cred_copy_real(kauth_cred_t cred) * with the new values. */ bcopy(cred, &temp_cred, sizeof(temp_cred)); - temp_cred.cr_uid = cred->cr_ruid; + temp_pcred->cr_uid = pcred->cr_ruid; /* displacing a supplementary group opts us out of memberd */ - if (kauth_cred_change_egid(&temp_cred, cred->cr_rgid)) { - temp_cred.cr_flags |= CRF_NOMEMBERD; - temp_cred.cr_gmuid = KAUTH_UID_NONE; + if (kauth_cred_change_egid(&temp_cred, pcred->cr_rgid)) { + temp_pcred->cr_flags |= CRF_NOMEMBERD; + temp_pcred->cr_gmuid = KAUTH_UID_NONE; } /* * If the cred is not opted out, make sure we are using the r/euid * for group checks */ - if (temp_cred.cr_gmuid != KAUTH_UID_NONE) - temp_cred.cr_gmuid = cred->cr_ruid; + if (temp_pcred->cr_gmuid != KAUTH_UID_NONE) + temp_pcred->cr_gmuid = pcred->cr_ruid; for (;;) { int err; @@ -4063,9 +4605,6 @@ kauth_cred_update(kauth_cred_t old_cred, kauth_cred_t model_cred, if (retain_auditinfo) { bcopy(&old_cred->cr_audit, &model_cred->cr_audit, sizeof(model_cred->cr_audit)); - /* XXX following bcopy will go away with cr_au */ - bcopy(&old_cred->cr_au, &model_cred->cr_au, - sizeof(model_cred->cr_au)); } for (;;) { @@ -4240,6 +4779,7 @@ kauth_cred_find(kauth_cred_t cred) { u_long hash_key; kauth_cred_t found_cred; + posix_cred_t pcred = posix_cred_get(cred); KAUTH_CRED_HASH_LOCK_ASSERT(); @@ -4258,23 +4798,26 @@ kauth_cred_find(kauth_cred_t cred) /* Find cred in the credential hash table */ TAILQ_FOREACH(found_cred, &kauth_cred_table_anchor[hash_key], cr_link) { 
boolean_t match; + posix_cred_t found_pcred = posix_cred_get(found_cred); /* * don't worry about the label unless the flags in * either credential tell us to. */ - if ((found_cred->cr_flags & CRF_MAC_ENFORCE) != 0 || - (cred->cr_flags & CRF_MAC_ENFORCE) != 0) { + if ((found_pcred->cr_flags & CRF_MAC_ENFORCE) != 0 || + (pcred->cr_flags & CRF_MAC_ENFORCE) != 0) { /* include the label pointer in the compare */ - match = (bcmp(&found_cred->cr_uid, &cred->cr_uid, + match = (bcmp(&found_pcred->cr_uid, &pcred->cr_uid, (sizeof(struct ucred) - - offsetof(struct ucred, cr_uid))) == 0); + offsetof(struct ucred, cr_posix))) == 0); } else { /* flags have to match, but skip the label in bcmp */ - match = (found_cred->cr_flags == cred->cr_flags && - bcmp(&found_cred->cr_uid, &cred->cr_uid, - (offsetof(struct ucred, cr_label) - - offsetof(struct ucred, cr_uid))) == 0); + match = (found_pcred->cr_flags == pcred->cr_flags && + bcmp(&found_pcred->cr_uid, &pcred->cr_uid, + sizeof(struct posix_cred)) == 0 && + bcmp(&found_cred->cr_audit, &cred->cr_audit, + sizeof(cred->cr_audit)) == 0); + } if (match) { /* found a match */ @@ -4326,24 +4869,33 @@ kauth_cred_hash(const uint8_t *datap, int data_len, u_long start_key) * not including the ref count or the TAILQ, which are mutable; * everything else isn't. * - * We also avoid the label (if the flag is not set saying the - * label is actually enforced). - * * Parameters: cred Credential for which hash is * desired * * Returns: (u_long) Returned hash key + * + * Notes: When actually moving the POSIX credential into a real label, + * remember to update this hash computation. */ static u_long kauth_cred_get_hashkey(kauth_cred_t cred) { + posix_cred_t pcred = posix_cred_get(cred); u_long hash_key = 0; - - hash_key = kauth_cred_hash((uint8_t *)&cred->cr_uid, - ((cred->cr_flags & CRF_MAC_ENFORCE) ? 
- sizeof(struct ucred) : offsetof(struct ucred, cr_label)) - - offsetof(struct ucred, cr_uid), - hash_key); + + if (pcred->cr_flags & CRF_MAC_ENFORCE) { + hash_key = kauth_cred_hash((uint8_t *)&cred->cr_posix, + sizeof(struct ucred) - offsetof(struct ucred, cr_posix), + hash_key); + } else { + /* skip label */ + hash_key = kauth_cred_hash((uint8_t *)&cred->cr_posix, + sizeof(struct posix_cred), + hash_key); + hash_key = kauth_cred_hash((uint8_t *)&cred->cr_audit, + sizeof(struct au_session), + hash_key); + } return(hash_key); } @@ -4691,3 +5243,226 @@ sysctl_dump_cred_backtraces( __unused struct sysctl_oid *oidp, __unused void *ar } #endif /* KAUTH_CRED_HASH_DEBUG || DEBUG_CRED */ + + +/* + ********************************************************************** + * The following routines will be moved to a policy_posix.c module at + * some future point. + ********************************************************************** + */ + +/* + * posix_cred_create + * + * Description: Helper function to create a kauth_cred_t credential that is + * initially labelled with a specific POSIX credential label + * + * Parameters: pcred The posix_cred_t to use as the initial + * label value + * + * Returns: (kauth_cred_t) The credential that was found in the + * hash or created + * NULL kauth_cred_add() failed, or there was + * no egid specified, or we failed to + * attach a label to the new credential + * + * Notes: This function currently wraps kauth_cred_create(), and is the + * only consumer of that ill-fated function, apart from bsd_init(). + * It exists solely to support the NFS server code creation of + * credentials based on the over-the-wire RPC calls containing + * traditional POSIX credential information being tunneled to + * the server host from the client machine. + * + * In the future, we hope this function goes away.
+ * + * In the short term, it creates a temporary credential, puts + * the POSIX information from NFS into it, and then calls + * kauth_cred_create(), as an internal implementation detail. + * + * If we have to keep it around in the medium term, it will + * create a new kauth_cred_t, then label it with a POSIX label + * corresponding to the contents of the kauth_cred_t. If the + * policy_posix MACF module is not loaded, it will instead + * substitute a posix_cred_t which GRANTS all access (effectively + * a "root" credential) in order to not prevent NFS from working + * in the case that we are not supporting POSIX credentials. + */ +kauth_cred_t +posix_cred_create(posix_cred_t pcred) +{ + struct ucred temp_cred; + + bzero(&temp_cred, sizeof(temp_cred)); + temp_cred.cr_posix = *pcred; + + return kauth_cred_create(&temp_cred); +} + + +/* + * posix_cred_get + * + * Description: Given a kauth_cred_t, return the POSIX credential label, if + * any, which is associated with it. + * + * Parameters: cred The credential to obtain the label from + * + * Returns: posix_cred_t The POSIX credential label + * + * Notes: In the event that the policy_posix MACF module IS NOT loaded, + * this function will return a pointer to a posix_cred_t which + * GRANTS all access (effectively, a "root" credential). This is + * necessary to support legacy code which insists on tightly + * integrating POSIX credentials into its APIs, including, but + * not limited to, System V IPC mechanisms, POSIX IPC mechanisms, + * NFSv3, signals, dtrace, and a large number of kauth routines + * used to implement POSIX permissions related system calls. + * + * In the event that the policy_posix MACF module IS loaded, and + * there is no POSIX label on the kauth_cred_t credential, this + * function will return a pointer to a posix_cred_t which DENIES + * all access (effectively, a "deny rights granted by POSIX" + * credential).
This is necessary to support the concept of a + * transiently loaded POSIX policy, or kauth_cred_t credentials + * which can not be used in conjunction with POSIX permissions + * checks. + * + * This function currently returns the address of the cr_posix + * field of the supplied kauth_cred_t credential, and as such + * currently can not fail. In the future, this will not be the + * case. + */ +posix_cred_t +posix_cred_get(kauth_cred_t cred) +{ + return(&cred->cr_posix); +} + + +/* + * posix_cred_label + * + * Description: Label a kauth_cred_t with a POSIX credential label + * + * Parameters: cred The credential to label + * pcred The POSIX credential to label it with + * + * Returns: (void) + * + * Notes: This function is currently void in order to permit it to fit + * in with the current MACF framework label methods which allow + * labelling to fail silently. This is likely acceptable for + * mandatory access controls, but not for POSIX, since those + * access controls are advisory. We will need to consider a + * return value in a future version of the MACF API. + * + * This operation currently can not fail, as currently the POSIX + * credential is a subfield of the kauth_cred_t (ucred), which + * MUST be valid. In the future, this will not be the case. + */ +void +posix_cred_label(kauth_cred_t cred, posix_cred_t pcred) +{ + cred->cr_posix = *pcred; /* structure assign for now */ +} + + +/* + * posix_cred_access + * + * Description: Perform a POSIX access check for a protected object + * + * Parameters: cred The credential to check + * object_uid The POSIX UID of the protected object + * object_gid The POSIX GID of the protected object + * object_mode The POSIX mode of the protected object + * mode_req The requested POSIX access rights + * + * Returns 0 Access is granted + * EACCES Access is denied + * + * Notes: This code optimizes the case where the world and group rights + * would both grant the requested rights to avoid making a group + * membership query.
This is a big performance win in the case + * where this is true. + */ +int +posix_cred_access(kauth_cred_t cred, id_t object_uid, id_t object_gid, mode_t object_mode, mode_t mode_req) +{ + int is_member; + mode_t mode_owner = (object_mode & S_IRWXU); + mode_t mode_group = (object_mode & S_IRWXG) << 3; + mode_t mode_world = (object_mode & S_IRWXO) << 6; + + /* + * Check first for owner rights + */ + if (kauth_cred_getuid(cred) == object_uid && (mode_req & mode_owner) == mode_req) + return (0); + + /* + * Combined group and world rights check, if we don't have owner rights + * + * OPTIMIZED: If group and world rights would grant the same bits, and + * the set of requested bits is in both, then we can simply check the + * world rights, avoiding a group membership check, which is expensive. + */ + if ((mode_req & mode_group & mode_world) == mode_req) { + return (0); + } else { + /* + * NON-OPTIMIZED: requires group membership check. + */ + if ((mode_req & mode_group) != mode_req) { + /* + * exclusion group : treat errors as "is a member" + * + * NON-OPTIMIZED: +group would deny; must check group + */ + if (kauth_cred_ismember_gid(cred, object_gid, &is_member) || is_member) { + /* + * DENY: +group denies + */ + return (EACCES); + } else { + if ((mode_req & mode_world) != mode_req) { + /* + * DENY: both -group & world would deny + */ + return (EACCES); + } else { + /* + * ALLOW: allowed by -group and +world + */ + return (0); + } + } + } else { + /* + * inclusion group; treat errors as "not a member" + * + * NON-OPTIMIZED: +group allows, world denies; must + * check group + */ + if (!kauth_cred_ismember_gid(cred, object_gid, &is_member) && is_member) { + /* + * ALLOW: allowed by +group + */ + return (0); + } else { + if ((mode_req & mode_world) != mode_req) { + /* + * DENY: both -group & world would deny + */ + return (EACCES); + } else { + /* + * ALLOW: allowed by -group and +world + */ + return (0); + } + } + } + } +} diff --git a/bsd/kern/kern_descrip.c
b/bsd/kern/kern_descrip.c index d3f483d24..3283ee3c0 100644 --- a/bsd/kern/kern_descrip.c +++ b/bsd/kern/kern_descrip.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -101,6 +101,7 @@ #include #include #include +#include #include #include #include @@ -112,6 +113,11 @@ #include +#if CONFIG_PROTECT +#include +#endif +#include + kern_return_t ipc_object_copyin(ipc_space_t, mach_port_name_t, mach_msg_type_name_t, ipc_port_t *); void ipc_port_release_send(ipc_port_t); @@ -119,16 +125,14 @@ void ipc_port_release_send(ipc_port_t); struct psemnode; struct pshmnode; -int fdopen(dev_t dev, int mode, int type, proc_t p); -int finishdup(proc_t p, struct filedesc *fdp, int old, int new, int32_t *retval); +static int finishdup(proc_t p, + struct filedesc *fdp, int old, int new, int flags, int32_t *retval); int falloc_locked(proc_t p, struct fileproc **resultfp, int *resultfd, vfs_context_t ctx, int locked); void fg_drop(struct fileproc * fp); void fg_free(struct fileglob *fg); void fg_ref(struct fileproc * fp); -#if CONFIG_EMBEDDED void fileport_releasefg(struct fileglob *fg); -#endif /* CONFIG_EMBEDDED */ /* flags for close_internal_locked */ #define FD_DUP2RESV 1 @@ -156,6 +160,9 @@ extern kauth_scope_t kauth_scope_fileop; extern int cs_debug; +/* Conflict wait queue for when selects collide (opaque type) */ +extern struct wait_queue select_conflict_queue; + #define f_flag f_fglob->fg_flag #define f_type f_fglob->fg_type #define f_msgcount f_fglob->fg_msgcount @@ -474,21 +481,20 @@ dup(proc_t p, struct dup_args *uap, int32_t *retval) proc_fdunlock(p); return (error); } - error = finishdup(p, fdp, old, new, retval); + error = finishdup(p, fdp, old, new, 0, retval); fp_drop(p, old, fp, 1); proc_fdunlock(p); return (error); } - /* * dup2 * * Description: Duplicate a file descriptor to a particular value. 
* * Parameters: p Process performing the dup - * uap->fd The fd to dup + * uap->from The fd to dup * uap->to The fd to dup it to * retval Pointer to the call return area * @@ -547,7 +553,8 @@ dup2(proc_t p, struct dup2_args *uap, int32_t *retval) goto startover; } - if ((fdp->fd_ofiles[new] != NULL) && ((error = fp_lookup(p, new, &nfp, 1)) == 0)) { + if ((fdp->fd_ofiles[new] != NULL) && + ((error = fp_lookup(p, new, &nfp, 1)) == 0)) { fp_drop(p, old, fp, 1); (void)close_internal_locked(p, new, nfp, FD_DUP2RESV); #if DIAGNOSTIC @@ -558,7 +565,7 @@ dup2(proc_t p, struct dup2_args *uap, int32_t *retval) } else { #if DIAGNOSTIC if (fdp->fd_ofiles[new] != NULL) - panic("dup2: unable to get ref on a fileproc %d\n", new); + panic("dup2: no ref on fileproc %d", new); #endif procfdtbl_reservefd(p, new); } @@ -570,11 +577,11 @@ dup2(proc_t p, struct dup2_args *uap, int32_t *retval) } #if DIAGNOSTIC if (fdp->fd_ofiles[new] != 0) - panic("dup2-1: overwriting fd_ofiles with new %d\n", new); + panic("dup2: overwriting fd_ofiles with new %d", new); if ((fdp->fd_ofileflags[new] & UF_RESERVED) == 0) - panic("dup2-1: unreserved fileflags with new %d\n", new); + panic("dup2: unreserved fileflags with new %d", new); #endif - error = finishdup(p, fdp, old, new, retval); + error = finishdup(p, fdp, old, new, 0, retval); fp_drop(p, old, fp, 1); proc_fdunlock(p); @@ -678,7 +685,6 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) off_t offset; int newmin; daddr64_t lbn, bn; - int devBlockSize = 0; unsigned int fflag; user_addr_t argp; boolean_t is64bit; @@ -723,6 +729,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) switch (uap->cmd) { case F_DUPFD: + case F_DUPFD_CLOEXEC: newmin = CAST_DOWN_EXPLICIT(int, uap->arg); /* arg is an int, so we won't lose bits */ AUDIT_ARG(value32, newmin); if ((u_int)newmin >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur || @@ -732,7 +739,8 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t 
*retval) } if ( (error = fdalloc(p, newmin, &i)) ) goto out; - error = finishdup(p, fdp, fd, i, retval); + error = finishdup(p, fdp, fd, i, + uap->cmd == F_DUPFD_CLOEXEC ? UF_EXCLOSE : 0, retval); goto out; case F_GETFD: @@ -807,6 +815,36 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, &context); goto out; + case F_SETNOSIGPIPE: + tmp = CAST_DOWN_EXPLICIT(int, uap->arg); + if (fp->f_type == DTYPE_SOCKET) { + error = sock_setsockopt((struct socket *)fp->f_data, + SOL_SOCKET, SO_NOSIGPIPE, &tmp, sizeof (tmp)); + } else { + struct fileglob *fg = fp->f_fglob; + + lck_mtx_lock_spin(&fg->fg_lock); + if (tmp) + fg->fg_lflags |= FG_NOSIGPIPE; + else + fg->fg_lflags &= ~FG_NOSIGPIPE; + lck_mtx_unlock(&fg->fg_lock); + error = 0; + } + goto out; + + case F_GETNOSIGPIPE: + if (fp->f_type == DTYPE_SOCKET) { + int retsize = sizeof (*retval); + error = sock_getsockopt((struct socket *)fp->f_data, + SOL_SOCKET, SO_NOSIGPIPE, retval, &retsize); + } else { + *retval = (fp->f_fglob->fg_lflags & FG_NOSIGPIPE) ?
+ 1 : 0; + error = 0; + } + goto out; + case F_SETLKW: flg |= F_WAIT; /* Fall into F_SETLK */ @@ -886,6 +924,9 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) } case F_GETLK: +#if CONFIG_EMBEDDED + case F_GETLKPID: +#endif if (fp->f_type != DTYPE_VNODE) { error = EBADF; goto out; @@ -943,10 +984,10 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) #if CONFIG_MACF error = mac_file_check_lock(proc_ucred(p), fp->f_fglob, - F_GETLK, &fl); + uap->cmd, &fl); if (error == 0) #endif - error = VNOP_ADVLOCK(vp, (caddr_t)p, F_GETLK, &fl, F_POSIX, &context); + error = VNOP_ADVLOCK(vp, (caddr_t)p, uap->cmd, &fl, F_POSIX, &context); (void)vnode_put(vp); @@ -1108,6 +1149,18 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) goto out; + case F_NODIRECT: + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } + if (uap->arg) + fp->f_fglob->fg_flag |= FNODIRECT; + else + fp->f_fglob->fg_flag &= ~FNODIRECT; + + goto out; + case F_GLOBAL_NOCACHE: if (fp->f_type != DTYPE_VNODE) { error = EBADF; @@ -1170,6 +1223,23 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) goto outdrop; } + case F_FLUSH_DATA: + + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } + vp = (struct vnode *)fp->f_data; + proc_fdunlock(p); + + if ( (error = vnode_getwithref(vp)) == 0 ) { + error = cluster_push(vp, 0); + + (void)vnode_put(vp); + } + goto outdrop; + + case F_READBOOTSTRAP: case F_WRITEBOOTSTRAP: { user32_fbootstraptransfer_t user32_fbt_struct; @@ -1221,9 +1291,23 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) (void)vnode_put(vp); goto outdrop; } - case F_LOG2PHYS: { + case F_LOG2PHYS: + case F_LOG2PHYS_EXT: { struct log2phys l2p_struct; /* structure for allocate command */ + int devBlockSize; + off_t file_offset = 0; + size_t a_size = 0; + size_t run = 0; + + if (uap->cmd == F_LOG2PHYS_EXT) { + error = copyin(argp, 
(caddr_t)&l2p_struct, sizeof(l2p_struct)); + if (error) + goto out; + file_offset = l2p_struct.l2p_devoffset; + } else { + file_offset = fp->f_offset; + } if (fp->f_type != DTYPE_VNODE) { error = EBADF; goto out; @@ -1233,7 +1317,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) if ( (error = vnode_getwithref(vp)) ) { goto outdrop; } - error = VNOP_OFFTOBLK(vp, fp->f_offset, &lbn); + error = VNOP_OFFTOBLK(vp, file_offset, &lbn); if (error) { (void)vnode_put(vp); goto outdrop; @@ -1244,16 +1328,25 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) goto outdrop; } devBlockSize = vfs_devblocksize(vnode_mount(vp)); - - error = VNOP_BLOCKMAP(vp, offset, devBlockSize, &bn, NULL, NULL, 0, &context); + if (uap->cmd == F_LOG2PHYS_EXT) { + a_size = l2p_struct.l2p_contigbytes; + } else { + a_size = devBlockSize; + } + + error = VNOP_BLOCKMAP(vp, offset, a_size, &bn, &run, NULL, 0, &context); (void)vnode_put(vp); if (!error) { l2p_struct.l2p_flags = 0; /* for now */ - l2p_struct.l2p_contigbytes = 0; /* for now */ + if (uap->cmd == F_LOG2PHYS_EXT) { + l2p_struct.l2p_contigbytes = run - (file_offset - offset); + } else { + l2p_struct.l2p_contigbytes = 0; /* for now */ + } l2p_struct.l2p_devoffset = bn * devBlockSize; - l2p_struct.l2p_devoffset += fp->f_offset - offset; + l2p_struct.l2p_devoffset += file_offset - offset; error = copyout((caddr_t)&l2p_struct, argp, sizeof(l2p_struct)); } goto outdrop; @@ -1384,7 +1477,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) VATTR_SET(&va, va_mode, cmode & ACCESSPERMS); /* Start the lookup relative to the file descriptor's vnode. 
*/ - NDINIT(&nd, LOOKUP, USEDVP | FOLLOW | AUDITVNPATH1, UIO_USERSPACE, + NDINIT(&nd, LOOKUP, OP_OPEN, USEDVP | FOLLOW | AUDITVNPATH1, UIO_USERSPACE, fopen.o_pathname, &context); nd.ni_dvp = vp; @@ -1429,7 +1522,8 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) } /* Start the lookup relative to the file descriptor's vnode. */ - NDINIT(&nd, DELETE, USEDVP | AUDITVNPATH1, UIO_USERSPACE, pathname, &context); + NDINIT(&nd, DELETE, OP_UNLINK, USEDVP | AUDITVNPATH1, UIO_USERSPACE, + pathname, &context); nd.ni_dvp = vp; error = unlink1(&context, &nd, 0); @@ -1533,6 +1627,9 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) kernel_blob_size); } else { /* ubc_blob_add() has consumed "kernel_blob_addr" */ +#if CHECK_CS_VALIDATION_BITMAP + ubc_cs_validation_bitmap_allocate( vp ); +#endif } (void) vnode_put(vp); @@ -1540,7 +1637,6 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) } case F_MARKDEPENDENCY: { - struct vnode *root_vp; struct vnode_attr va; vfs_context_t ctx = vfs_context_current(); kauth_cred_t cred; @@ -1563,13 +1659,11 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) goto outdrop; } - // the passed in vnode must be the root dir of the file system - if (VFS_ROOT(vp->v_mount, &root_vp, ctx) != 0 || vp != root_vp) { + if (!vnode_isvroot(vp)) { error = EINVAL; vnode_put(vp); goto outdrop; } - vnode_put(root_vp); // get the owner of the root dir VATTR_INIT(&va); @@ -1592,24 +1686,291 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) // if all those checks pass then we can mark the dependency vfs_markdependency(vp->v_mount); error = 0; + + vnode_put(vp); + + break; + } + +#ifdef CONFIG_PROTECT + case F_GETPROTECTIONCLASS: { + int class = 0; + + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } + vp = (struct vnode *)fp->f_data; + + proc_fdunlock(p); + + if (vnode_getwithref(vp)) { + error = ENOENT; + goto 
outdrop; + } + + error = cp_vnode_getclass (vp, &class); + if (error == 0) { + *retval = class; + } vnode_put(vp); + break; + } + + case F_SETPROTECTIONCLASS: { + /* tmp must be a valid PROTECTION_CLASS_* */ + tmp = CAST_DOWN_EXPLICIT(uint32_t, uap->arg); + + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } + vp = (struct vnode *)fp->f_data; + + proc_fdunlock(p); + + if (vnode_getwithref(vp)) { + error = ENOENT; + goto outdrop; + } + /* Only go forward if you have write access */ + vfs_context_t ctx = vfs_context_current(); + if(vnode_authorize(vp, NULLVP, (KAUTH_VNODE_ACCESS | KAUTH_VNODE_WRITE_DATA), ctx) != 0) { + vnode_put(vp); + error = EBADF; + goto outdrop; + } + error = cp_vnode_setclass (vp, tmp); + vnode_put(vp); + break; + } +#endif /* CONFIG_PROTECT */ + + case F_MOVEDATAEXTENTS: { + struct fileproc *fp2 = NULL; + struct vnode *src_vp = NULLVP; + struct vnode *dst_vp = NULLVP; + /* We need to grab the 2nd FD out of the argments before moving on. */ + int fd2 = CAST_DOWN_EXPLICIT(int32_t, uap->arg); + + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } + vp = src_vp = (struct vnode *)fp->f_data; + + /* For now, special case HFS+ only, since this is SPI. */ + if (src_vp->v_tag != VT_HFS) { + error = EINVAL; + goto out; + } + + /* We're still holding the proc FD lock */ + if ( (error = fp_lookup(p, fd2, &fp2, 1)) ) { + error = EBADF; + goto out; + } + if (fp2->f_type != DTYPE_VNODE) { + fp_drop(p, fd2, fp2, 1); + error = EBADF; + goto out; + } + dst_vp = (struct vnode *)fp2->f_data; + + /* For now, special case HFS+ only, since this is SPI. 
*/ + if (dst_vp->v_tag != VT_HFS) { + fp_drop(p, fd2, fp2, 1); + error = EINVAL; + goto out; + } + +#if CONFIG_MACF + /* Re-do MAC checks against the new FD, pass in a fake argument */ + error = mac_file_check_fcntl(proc_ucred(p), fp2->f_fglob, uap->cmd, 0); + if (error) { + fp_drop(p, fd2, fp2, 1); + goto out; + } +#endif + /* Audit the 2nd FD */ + AUDIT_ARG(fd, fd2); + + proc_fdunlock(p); + + /* Proc lock dropped; now we have a legit pair of FDs. Go to work */ + + if (vnode_getwithref(src_vp)) { + fp_drop(p, fd2, fp2, 0); + error = ENOENT; + goto outdrop; + } + if (vnode_getwithref(dst_vp)) { + vnode_put (src_vp); + fp_drop(p, fd2, fp2, 0); + error = ENOENT; + goto outdrop; + } + + /* + * Basic asserts; validate they are not the same and that + * both live on the same filesystem. + */ + + if (dst_vp == src_vp) { + vnode_put (src_vp); + vnode_put (dst_vp); + fp_drop (p, fd2, fp2, 0); + error = EINVAL; + goto outdrop; + } + + if (dst_vp->v_mount != src_vp->v_mount) { + vnode_put (src_vp); + vnode_put (dst_vp); + fp_drop (p, fd2, fp2, 0); + error = EXDEV; + goto outdrop; + } + + /* Now check for write access to the target files */ + if(vnode_authorize(src_vp, NULLVP, + (KAUTH_VNODE_ACCESS | KAUTH_VNODE_WRITE_DATA), &context) != 0) { + vnode_put(src_vp); + vnode_put(dst_vp); + fp_drop(p, fd2, fp2, 0); + error = EBADF; + goto outdrop; + } + + if(vnode_authorize(dst_vp, NULLVP, + (KAUTH_VNODE_ACCESS | KAUTH_VNODE_WRITE_DATA), &context) != 0) { + vnode_put(src_vp); + vnode_put(dst_vp); + fp_drop(p, fd2, fp2, 0); + error = EBADF; + goto outdrop; + } + + /* Verify that both vps point to files and not directories */ + if (!vnode_isreg(src_vp) || !vnode_isreg(dst_vp)) { + vnode_put(src_vp); + vnode_put(dst_vp); + fp_drop(p, fd2, fp2, 0); + error = EINVAL; + goto outdrop; + } + + /* + * The exchangedata syscall handler passes in 0 for the flags to VNOP_EXCHANGE. 
+ * We'll pass in our special bit indicating that the new behavior is expected + */ + + error = VNOP_EXCHANGE(src_vp, dst_vp, FSOPT_EXCHANGE_DATA_ONLY, &context); + + vnode_put (src_vp); + vnode_put (dst_vp); + fp_drop(p, fd2, fp2, 0); break; } - case F_GETPROTECTIONCLASS: { - // stub to make the API work - printf("Reached F_GETPROTECTIONCLASS, returning without action\n"); + /* + * Set the vnode pointed to by 'fd' + * and tag it as the (potentially future) backing store + * for another filesystem + */ + case F_SETBACKINGSTORE: { + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } + vp = (struct vnode *)fp->f_data; + + if (vp->v_tag != VT_HFS) { + error = EINVAL; + goto out; + + } + proc_fdunlock(p); + + if (vnode_getwithref(vp)) { + error = ENOENT; + goto outdrop; + } + + /* only proceed if you have write access */ + vfs_context_t ctx = vfs_context_current(); + if(vnode_authorize(vp, NULLVP, (KAUTH_VNODE_ACCESS | KAUTH_VNODE_WRITE_DATA), ctx) != 0) { + vnode_put(vp); + error = EBADF; + goto outdrop; + } + + + /* If arg != 0, set, otherwise unset */ + if (uap->arg) { + error = hfs_set_backingstore (vp, 1); + } + else { + error = hfs_set_backingstore (vp, 0); + } + /* Success. explicitly set error to 0. */ error = 0; - goto out; + + vnode_put(vp); + break; } - case F_SETPROTECTIONCLASS: { - // stub to make the API work - printf("Reached F_SETPROTECTIONCLASS, returning without action\n"); - error = 0; - goto out; + /* + * like F_GETPATH, but special semantics for + * the mobile time machine handler. 
+ */ + case F_GETPATH_MTMINFO: { + char *pathbufp; + int pathlen; + + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } + vp = (struct vnode *)fp->f_data; + proc_fdunlock(p); + + pathlen = MAXPATHLEN; + MALLOC(pathbufp, char *, pathlen, M_TEMP, M_WAITOK); + if (pathbufp == NULL) { + error = ENOMEM; + goto outdrop; + } + if ( (error = vnode_getwithref(vp)) == 0 ) { + int backingstore = 0; + + /* Check for error from vn_getpath before moving on */ + if ((error = vn_getpath(vp, pathbufp, &pathlen)) == 0) { + if (vp->v_tag == VT_HFS) { + error = hfs_is_backingstore (vp, &backingstore); + } + (void)vnode_put(vp); + + if (error == 0) { + error = copyout((caddr_t)pathbufp, argp, pathlen); + } + if (error == 0) { + /* + * If the copyout was successful, now check to ensure + * that this vnode is not a BACKINGSTORE vnode. mtmd + * wants the path regardless. + */ + if (backingstore) { + error = EBUSY; + } + } + } else + (void)vnode_put(vp); + } + FREE(pathbufp, M_TEMP); + goto outdrop; } @@ -1730,6 +2091,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) * Parameters: p Process performing the dup * old The fd to dup * new The fd to dup it to + * fd_flags Flags to augment the new fd * retval Pointer to the call return area * * Returns: 0 Success @@ -1744,10 +2106,11 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) * * Notes: This function may drop and reacquire this lock; it is unsafe * for a caller to assume that other state protected by the lock - * has not been subsequently changes out from under it. + * has not been subsequently changed out from under it. 
*/ int -finishdup(proc_t p, struct filedesc *fdp, int old, int new, int32_t *retval) +finishdup(proc_t p, + struct filedesc *fdp, int old, int new, int fd_flags, int32_t *retval) { struct fileproc *nfp; struct fileproc *ofp; @@ -1758,9 +2121,8 @@ finishdup(proc_t p, struct filedesc *fdp, int old, int new, int32_t *retval) #if DIAGNOSTIC proc_fdlock_assert(p, LCK_MTX_ASSERT_OWNED); #endif - if ((ofp = fdp->fd_ofiles[old]) == NULL || - (fdp->fd_ofileflags[old] & UF_RESERVED)) { + (fdp->fd_ofileflags[old] & UF_RESERVED)) { fdrelse(p, new); return (EBADF); } @@ -1796,13 +2158,14 @@ finishdup(proc_t p, struct filedesc *fdp, int old, int new, int32_t *retval) #if DIAGNOSTIC if (fdp->fd_ofiles[new] != 0) - panic("finishdup: overwriting fd_ofiles with new %d\n", new); + panic("finishdup: overwriting fd_ofiles with new %d", new); if ((fdp->fd_ofileflags[new] & UF_RESERVED) == 0) - panic("finishdup: unreserved fileflags with new %d\n", new); + panic("finishdup: unreserved fileflags with new %d", new); #endif if (new > fdp->fd_lastfile) fdp->fd_lastfile = new; + *fdflags(p, new) |= fd_flags; procfdtbl_releasefd(p, new, nfp); *retval = new; return (0); @@ -1897,13 +2260,13 @@ close_internal_locked(proc_t p, int fd, struct fileproc *fp, int flags) if ((fp->f_flags & FP_CLOSING) == FP_CLOSING) { - panic("close_internal_locked: being called on already closing fd\n"); + panic("close_internal_locked: being called on already closing fd"); } #if DIAGNOSTIC if ((fdp->fd_ofileflags[fd] & UF_RESERVED) == 0) - panic("close_internal: unreserved fileflags with fd %d\n", fd); + panic("close_internal: unreserved fileflags with fd %d", fd); #endif fp->f_flags |= FP_CLOSING; @@ -1961,7 +2324,7 @@ close_internal_locked(proc_t p, int fd, struct fileproc *fp, int flags) #if DIAGNOSTIC if (resvfd != 0) { if ((fdp->fd_ofileflags[fd] & UF_RESERVED) == 0) - panic("close with reserved fd returns with freed fd:%d: proc: %x\n", fd, (unsigned int)p); + panic("close with reserved fd returns with freed 
fd:%d: proc: %p", fd, p); } #endif @@ -3150,9 +3513,14 @@ fp_drop(proc_t p, int fd, struct fileproc *fp, int locked) } fp->f_iocount--; - if (p->p_fpdrainwait && fp->f_iocount == 0) { - p->p_fpdrainwait = 0; - needwakeup = 1; + if (fp->f_iocount == 0) { + if (fp->f_flags & FP_SELCONFLICT) + fp->f_flags &= ~FP_SELCONFLICT; + + if (p->p_fpdrainwait) { + p->p_fpdrainwait = 0; + needwakeup = 1; + } } if (!locked) proc_fdunlock(p); @@ -3188,7 +3556,7 @@ fp_drop(proc_t p, int fd, struct fileproc *fp, int locked) * * The fileproc referenced is not returned; because of this, care * must be taken to not drop the last reference (e.g. by closing - * the file). This is inhernely unsafe, since the reference may + * the file). This is inherently unsafe, since the reference may * not be recoverable from the vnode, if there is a subsequent * close that destroys the associate fileproc. The caller should * therefore retain their own reference on the fileproc so that @@ -3249,7 +3617,7 @@ file_vnode(int fd, struct vnode **vpp) * * The fileproc referenced is not returned; because of this, care * must be taken to not drop the last reference (e.g. by closing - * the file). This is inhernely unsafe, since the reference may + * the file). This is inherently unsafe, since the reference may * not be recoverable from the vnode, if there is a subsequent * close that destroys the associate fileproc. The caller should * therefore retain their own reference on the fileproc so that @@ -3314,7 +3682,7 @@ file_vnode_withvid(int fd, struct vnode **vpp, uint32_t * vidp) * * The fileproc referenced is not returned; because of this, care * must be taken to not drop the last reference (e.g. by closing - * the file). This is inhernely unsafe, since the reference may + * the file). This is inherently unsafe, since the reference may * not be recoverable from the socket, if there is a subsequent * close that destroys the associate fileproc. 
The caller should * therefore retain their own reference on the fileproc so that @@ -3445,9 +3813,14 @@ file_drop(int fd) } fp->f_iocount --; - if (p->p_fpdrainwait && fp->f_iocount == 0) { - p->p_fpdrainwait = 0; - needwakeup = 1; + if (fp->f_iocount == 0) { + if (fp->f_flags & FP_SELCONFLICT) + fp->f_flags &= ~FP_SELCONFLICT; + + if (p->p_fpdrainwait) { + p->p_fpdrainwait = 0; + needwakeup = 1; + } } proc_fdunlock(p); @@ -3481,7 +3854,7 @@ file_drop(int fd) * *resultfd (modified) Returned fd * * Locks: This function takes and drops the proc_fdlock; if this lock - * is alread held, use falloc_locked() instead. + * is already held, use falloc_locked() instead. * * Notes: This function takes separate process and context arguments * solely to support kern_exec.c; otherwise, it would take @@ -3505,7 +3878,7 @@ falloc(proc_t p, struct fileproc **resultfp, int *resultfd, vfs_context_t ctx) * falloc_locked * * Create a new open file structure and allocate - * a file decriptor for the process that refers to it. + * a file descriptor for the process that refers to it. * * Returns: 0 Success * @@ -3679,6 +4052,10 @@ fg_free(struct fileglob *fg) * that are either marked as close-on-exec, or which were in the * process of being opened at the time of the execve * + * Also handles the case (via posix_spawn()) where -all- + * files except those marked with "inherit" as treated as + * close-on-exec. + * * Parameters: p Pointer to process calling * execve * @@ -3693,27 +4070,39 @@ fg_free(struct fileglob *fg) * XXX: We should likely reverse the lock and funnel drop/acquire * order to avoid the small race window; it's also possible that * if the program doing the exec has an outstanding listen socket - * and a network connection is completed asyncrhonously that we + * and a network connection is completed asynchronously that we * will end up with a "ghost" socket reference in the new process. 
* * This needs reworking to make it safe to remove the funnel from * the execve and posix_spawn system calls. */ void -fdexec(proc_t p) +fdexec(proc_t p, short flags) { struct filedesc *fdp = p->p_fd; int i; - struct fileproc *fp; + boolean_t cloexec_default = (flags & POSIX_SPAWN_CLOEXEC_DEFAULT) != 0; proc_fdlock(p); - i = fdp->fd_lastfile; + for (i = fdp->fd_lastfile; i >= 0; i--) { + + struct fileproc *fp = fdp->fd_ofiles[i]; + char *flagp = &fdp->fd_ofileflags[i]; - while (i >= 0) { + if (cloexec_default) { + /* + * Reverse the usual semantics of file descriptor + * inheritance - all of them should be closed + * except files marked explicitly as "inherit" and + * not marked close-on-exec. + */ + if ((*flagp & (UF_EXCLOSE|UF_INHERIT)) != UF_INHERIT) + *flagp |= UF_EXCLOSE; + *flagp &= ~UF_INHERIT; + } - fp = fdp->fd_ofiles[i]; if ( - ((fdp->fd_ofileflags[i] & (UF_RESERVED|UF_EXCLOSE)) == UF_EXCLOSE) + ((*flagp & (UF_RESERVED|UF_EXCLOSE)) == UF_EXCLOSE) #if CONFIG_MACF || (fp && mac_file_check_inherit(proc_ucred(p), fp->f_fglob)) #endif @@ -3725,10 +4114,21 @@ fdexec(proc_t p) fdp->fd_lastfile--; if (i < fdp->fd_freefile) fdp->fd_freefile = i; + + /* + * Wait for any third party viewers (e.g., lsof) + * to release their references to this fileproc. + */ + while (fp->f_iocount > 0) { + p->p_fpdrainwait = 1; + msleep(&p->p_fpdrainwait, &p->p_fdmlock, PRIBIO, + "fpdrain", NULL); + } + closef_locked(fp, fp->f_fglob, p); + FREE_ZONE(fp, sizeof(*fp), M_FILEPROC); } - i--; } proc_fdunlock(p); } @@ -3764,7 +4164,7 @@ fdexec(proc_t p) * thread making the call, rather than from the process. * * In the case of a failure to obtain a reference, for most cases, - * the file entry will be silently droppped. There's an exception + * the file entry will be silently dropped. There's an exception * for the case of a chroot dir, since a failure to to obtain a * reference there would constitute an "escape" from the chroot * environment, which must not be allowed. 
In that case, we will @@ -3822,7 +4222,7 @@ fdcopy(proc_t p, vnode_t uth_cdir) * our reference from the parent also * since the vnode has gone DEAD making * it useless... by dropping it we'll - * be that much closer to recyling it + * be that much closer to recycling it */ vnode_rele(fdp->fd_cdir); fdp->fd_cdir = NULL; @@ -3994,7 +4394,7 @@ fdfree(proc_t p) if ((fp = fdp->fd_ofiles[i]) != NULL) { if (fdp->fd_ofileflags[i] & UF_RESERVED) - panic("fdfree: found fp with UF_RESERVED\n"); + panic("fdfree: found fp with UF_RESERVED"); /* closef drops the iocount ... */ if ((fp->f_flags & FP_INCHRREAD) != 0) @@ -4186,7 +4586,7 @@ closef_locked(struct fileproc *fp, struct fileglob *fg, proc_t p) * Locks: Assumes the caller holds the proc_fdlock * * Notes: For character devices, this occurs on the last close of the - * device; for all other file descriptos, this occurs on each + * device; for all other file descriptors, this occurs on each * close to prevent fd's from being closed out from under * operations currently in progress and blocked * @@ -4210,14 +4610,25 @@ fileproc_drain(proc_t p, struct fileproc * fp) if (fp->f_fglob->fg_ops->fo_drain) { (*fp->f_fglob->fg_ops->fo_drain)(fp, &context); } - if (((fp->f_flags & FP_INSELECT)== FP_INSELECT)) { - wait_queue_wakeup_all((wait_queue_t)fp->f_waddr, NULL, THREAD_INTERRUPTED); + if ((fp->f_flags & FP_INSELECT) == FP_INSELECT) { + if (wait_queue_wakeup_all((wait_queue_t)fp->f_waddr, NULL, THREAD_INTERRUPTED) == KERN_INVALID_ARGUMENT) + panic("bad wait queue for wait_queue_wakeup_all %p", fp->f_waddr); } + if ((fp->f_flags & FP_SELCONFLICT) == FP_SELCONFLICT) { + if (wait_queue_wakeup_all(&select_conflict_queue, NULL, THREAD_INTERRUPTED) == KERN_INVALID_ARGUMENT) + panic("bad select_conflict_queue"); + } p->p_fpdrainwait = 1; msleep(&p->p_fpdrainwait, &p->p_fdmlock, PRIBIO, "fpdrain", NULL); } +#if DIAGNOSTIC + if ((fp->f_flags & FP_INSELECT) != 0) + panic("FP_INSELECT set on drained fp"); +#endif + if ((fp->f_flags & 
FP_SELCONFLICT) == FP_SELCONFLICT) + fp->f_flags &= ~FP_SELCONFLICT; } @@ -4329,7 +4740,6 @@ flock(proc_t p, struct flock_args *uap, __unused int32_t *retval) } -#if CONFIG_EMBEDDED /* * fileport_makeport * @@ -4465,7 +4875,7 @@ fileport_makefd(proc_t p, struct fileport_makefd_args *uap, int32_t *retval) err = EINVAL; goto out; } - + MALLOC_ZONE(fp, struct fileproc *, sizeof(*fp), M_FILEPROC, M_WAITOK); if (fp == FILEPROC_NULL) { err = ENOMEM; @@ -4483,6 +4893,7 @@ fileport_makefd(proc_t p, struct fileport_makefd_args *uap, int32_t *retval) proc_fdunlock(p); goto out; } + *fdflags(p, fd) |= UF_EXCLOSE; procfdtbl_releasefd(p, fd, fp); proc_fdunlock(p); @@ -4500,7 +4911,6 @@ fileport_makefd(proc_t p, struct fileport_makefd_args *uap, int32_t *retval) return err; } -#endif /* CONFIG_EMBEDDED */ /* @@ -4524,7 +4934,7 @@ fileport_makefd(proc_t p, struct fileport_makefd_args *uap, int32_t *retval) * Notes: XXX This is not thread safe; see fdopen() above */ int -dupfdopen(struct filedesc *fdp, int indx, int dfd, int mode, int error) +dupfdopen(struct filedesc *fdp, int indx, int dfd, int flags, int error) { struct fileproc *wfp; struct fileproc *fp; @@ -4575,7 +4985,7 @@ dupfdopen(struct filedesc *fdp, int indx, int dfd, int mode, int error) * Check that the mode the file is being opened for is a * subset of the mode of the existing descriptor. */ - if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) { + if (((flags & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) { proc_fdunlock(p); return (EACCES); } @@ -4587,7 +4997,8 @@ dupfdopen(struct filedesc *fdp, int indx, int dfd, int mode, int error) fg_free(fp->f_fglob); fp->f_fglob = wfp->f_fglob; - fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; + fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd] | + (flags & O_CLOEXEC) ? 
UF_EXCLOSE : 0; proc_fdunlock(p); return (0); @@ -4623,10 +5034,11 @@ fg_ref(struct fileproc * fp) #if DIAGNOSTIC if ((fp->f_flags & ~((unsigned int)FP_VALID_FLAGS)) != 0) - panic("fg_ref: invalid bits on fp%x\n", (unsigned int)fp); + panic("fg_ref: invalid bits on fp %p", fp); if (fg->fg_count == 0) - panic("fg_ref: adding fgcount to zeroed fg :fp %x, fg%x\n ", (unsigned int)fp, (unsigned int)fg); + panic("fg_ref: adding fgcount to zeroed fg: fp %p fg %p", + fp, fg); #endif fg->fg_count++; lck_mtx_unlock(&fg->fg_lock); diff --git a/bsd/kern/kern_event.c b/bsd/kern/kern_event.c index 5d195dcf0..632501473 100644 --- a/bsd/kern/kern_event.c +++ b/bsd/kern/kern_event.c @@ -92,6 +92,9 @@ #include #include "net/net_str_id.h" +#include +#include + MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); #define KQ_EVENT NULL @@ -140,6 +143,8 @@ static void kevent_continue(struct kqueue *kq, void *data, int error); static void kqueue_scan_continue(void *contp, wait_result_t wait_result); static int kqueue_process(struct kqueue *kq, kevent_callback_t callback, void *data, int *countp, struct proc *p); +static int kqueue_begin_processing(struct kqueue *kq); +static void kqueue_end_processing(struct kqueue *kq); static int knote_process(struct knote *kn, kevent_callback_t callback, void *data, struct kqtailq *inprocessp, struct proc *p); static void knote_put(struct knote *kn); @@ -183,6 +188,15 @@ static struct filterops proc_filtops = { .f_event = filt_proc, }; +static int filt_vmattach(struct knote *kn); +static void filt_vmdetach(struct knote *kn); +static int filt_vm(struct knote *kn, long hint); +static struct filterops vm_filtops = { + .f_attach = filt_vmattach, + .f_detach = filt_vmdetach, + .f_event = filt_vm, +}; + extern struct filterops fs_filtops; extern struct filterops sig_filtops; @@ -238,11 +252,6 @@ static struct filterops user_filtops = { .f_touch = filt_usertouch, }; -#if CONFIG_AUDIT -/* Audit session filter */ -extern struct filterops 
audit_session_filtops; -#endif - /* * Table for for all system-defined filters. */ @@ -261,11 +270,8 @@ static struct filterops *sysfilt_ops[] = { &machport_filtops, /* EVFILT_MACHPORT */ &fs_filtops, /* EVFILT_FS */ &user_filtops, /* EVFILT_USER */ -#if CONFIG_AUDIT - &audit_session_filtops, /* EVFILT_SESSION */ -#else - &bad_filtops, -#endif + &bad_filtops, /* unused */ + &vm_filtops, /* EVFILT_VM */ }; /* @@ -455,6 +461,7 @@ static int filt_procattach(struct knote *kn) { struct proc *p; + pid_t selfpid = (pid_t)0; assert(PID_MAX < NOTE_PDATAMASK); @@ -466,6 +473,16 @@ filt_procattach(struct knote *kn) return (ESRCH); } + if ((kn->kn_sfflags & NOTE_EXIT) != 0) { + selfpid = proc_selfpid(); + /* check for validity of NOTE_EXISTATUS */ + if (((kn->kn_sfflags & NOTE_EXITSTATUS) != 0) && + ((p->p_ppid != selfpid) && (((p->p_lflag & P_LTRACED) == 0) || (p->p_oppid != selfpid)))) { + proc_rele(p); + return(EACCES); + } + } + proc_klist_lock(); kn->kn_flags |= EV_CLEAR; /* automatically set */ @@ -524,12 +541,57 @@ filt_proc(struct knote *kn, long hint) if (event == NOTE_REAP || (event == NOTE_EXIT && !(kn->kn_sfflags & NOTE_REAP))) { kn->kn_flags |= (EV_EOF | EV_ONESHOT); } + if ((event == NOTE_EXIT) && ((kn->kn_sfflags & NOTE_EXITSTATUS) != 0)) { + kn->kn_fflags |= NOTE_EXITSTATUS; + kn->kn_data = (hint & NOTE_PDATAMASK); + } + if ((event == NOTE_RESOURCEEND) && ((kn->kn_sfflags & NOTE_RESOURCEEND) != 0)) { + kn->kn_fflags |= NOTE_RESOURCEEND; + kn->kn_data = (hint & NOTE_PDATAMASK); + } } /* atomic check, no locking need when called from above */ return (kn->kn_fflags != 0); } +/* + * Virtual memory kevents + * + * author: Matt Jacobson [matthew_jacobson@apple.com] + */ + +static int +filt_vmattach(struct knote *kn) +{ + /* + * The note will be cleared once the information has been flushed to the client. + * If there is still pressure, we will be re-alerted. 
+ */ + kn->kn_flags |= EV_CLEAR; + + return vm_knote_register(kn); +} + +static void +filt_vmdetach(struct knote *kn) +{ + vm_knote_unregister(kn); +} + +static int +filt_vm(struct knote *kn, long hint) +{ + /* hint == 0 means this is just an alive? check (always true) */ + if (hint != 0) { + /* If this knote is interested in the event specified in hint... */ + if ((kn->kn_sfflags & hint) != 0) { + kn->kn_fflags |= hint; + } + } + + return (kn->kn_fflags != 0); +} /* * filt_timervalidate - process data from user @@ -872,7 +934,7 @@ filt_userattach(struct knote *kn) { /* EVFILT_USER knotes are not attached to anything in the kernel */ kn->kn_hook = NULL; - if (kn->kn_fflags & NOTE_TRIGGER || kn->kn_flags & EV_TRIGGER) { + if (kn->kn_fflags & NOTE_TRIGGER) { kn->kn_hookid = 1; } else { kn->kn_hookid = 0; @@ -895,10 +957,10 @@ filt_user(struct knote *kn, __unused long hint) static void filt_usertouch(struct knote *kn, struct kevent64_s *kev, long type) { - int ffctrl; + uint32_t ffctrl; switch (type) { case EVENT_REGISTER: - if (kev->fflags & NOTE_TRIGGER || kev->flags & EV_TRIGGER) { + if (kev->fflags & NOTE_TRIGGER) { kn->kn_hookid = 1; } @@ -1511,6 +1573,7 @@ kevent_register(struct kqueue *kq, struct kevent64_s *kev, __unused struct proc error = fops->f_attach(kn); kqlock(kq); + if (error != 0) { /* * Failed to attach correctly, so drop. @@ -1594,11 +1657,6 @@ kevent_register(struct kqueue *kq, struct kevent64_s *kev, __unused struct proc */ if (!fops->f_isfd && fops->f_touch != NULL) fops->f_touch(kn, kev, EVENT_REGISTER); - - /* We may need to push some info down to a networked filesystem */ - if (kn->kn_filter == EVFILT_VNODE) { - vnode_knoteupdate(kn); - } } /* still have use ref on knote */ @@ -1770,6 +1828,47 @@ knote_process(struct knote *kn, return error; } +/* + * Return 0 to indicate that processing should proceed, + * -1 if there is nothing to process. + * + * Called with kqueue locked and returns the same way, + * but may drop lock temporarily. 
+ */ +static int +kqueue_begin_processing(struct kqueue *kq) +{ + for (;;) { + if (kq->kq_count == 0) { + return -1; + } + + /* if someone else is processing the queue, wait */ + if (kq->kq_nprocess != 0) { + wait_queue_assert_wait((wait_queue_t)kq->kq_wqs, &kq->kq_nprocess, THREAD_UNINT, 0); + kq->kq_state |= KQ_PROCWAIT; + kqunlock(kq); + thread_block(THREAD_CONTINUE_NULL); + kqlock(kq); + } else { + kq->kq_nprocess = 1; + return 0; + } + } +} + +/* + * Called with kqueue lock held. + */ +static void +kqueue_end_processing(struct kqueue *kq) +{ + kq->kq_nprocess = 0; + if (kq->kq_state & KQ_PROCWAIT) { + kq->kq_state &= ~KQ_PROCWAIT; + wait_queue_wakeup_all((wait_queue_t)kq->kq_wqs, &kq->kq_nprocess, THREAD_AWAKENED); + } +} /* * kqueue_process - process the triggered events in a kqueue @@ -1799,23 +1898,13 @@ kqueue_process(struct kqueue *kq, int error; TAILQ_INIT(&inprocess); - restart: - if (kq->kq_count == 0) { + + if (kqueue_begin_processing(kq) == -1) { *countp = 0; + /* Nothing to process */ return 0; } - /* if someone else is processing the queue, wait */ - if (hw_atomic_add(&kq->kq_nprocess, 1) != 1) { - hw_atomic_sub(&kq->kq_nprocess, 1); - wait_queue_assert_wait((wait_queue_t)kq->kq_wqs, &kq->kq_nprocess, THREAD_UNINT, 0); - kq->kq_state |= KQ_PROCWAIT; - kqunlock(kq); - thread_block(THREAD_CONTINUE_NULL); - kqlock(kq); - goto restart; - } - /* * Clear any pre-posted status from previous runs, so we only * detect events that occur during this run. 
@@ -1850,11 +1939,8 @@ kqueue_process(struct kqueue *kq, kn->kn_tq = &kq->kq_head; TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); } - hw_atomic_sub(&kq->kq_nprocess, 1); - if (kq->kq_state & KQ_PROCWAIT) { - kq->kq_state &= ~KQ_PROCWAIT; - wait_queue_wakeup_all((wait_queue_t)kq->kq_wqs, &kq->kq_nprocess, THREAD_AWAKENED); - } + + kqueue_end_processing(kq); *countp = nevents; return error; @@ -2044,11 +2130,15 @@ static int kqueue_select(struct fileproc *fp, int which, void *wql, __unused vfs_context_t ctx) { struct kqueue *kq = (struct kqueue *)fp->f_data; - int again; - + struct knote *kn; + struct kqtailq inprocessq; + int retnum = 0; + if (which != FREAD) return 0; + TAILQ_INIT(&inprocessq); + kqlock(kq); /* * If this is the first pass, link the wait queue associated with the @@ -2067,11 +2157,12 @@ kqueue_select(struct fileproc *fp, int which, void *wql, __unused vfs_context_t (wait_queue_link_t)wql); } - retry: - again = 0; - if (kq->kq_count != 0) { - struct knote *kn; + if (kqueue_begin_processing(kq) == -1) { + kqunlock(kq); + return 0; + } + if (kq->kq_count != 0) { /* * there is something queued - but it might be a * KN_STAYQUEUED knote, which may or may not have @@ -2079,31 +2170,42 @@ kqueue_select(struct fileproc *fp, int which, void *wql, __unused vfs_context_t * list of knotes to see, and peek at the stay- * queued ones to be really sure. 
*/ - TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) { - int retnum = 0; - if ((kn->kn_status & KN_STAYQUEUED) == 0 || - (retnum = kn->kn_fop->f_peek(kn)) > 0) { - kqunlock(kq); - return 1; + while ((kn = (struct knote*)TAILQ_FIRST(&kq->kq_head)) != NULL) { + if ((kn->kn_status & KN_STAYQUEUED) == 0) { + retnum = 1; + goto out; } - if (retnum < 0) - again++; + + TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); + TAILQ_INSERT_TAIL(&inprocessq, kn, kn_tqe); + + if (kqlock2knoteuse(kq, kn)) { + unsigned peek; + + peek = kn->kn_fop->f_peek(kn); + if (knoteuse2kqlock(kq, kn)) { + if (peek > 0) { + retnum = 1; + goto out; + } + } else { + retnum = 0; + } + } } } - /* - * If we stumbled across a knote that couldn't be peeked at, - * we have to drop the kq lock and try again. - */ - if (again > 0) { - kqunlock(kq); - mutex_pause(0); - kqlock(kq); - goto retry; +out: + /* Return knotes to active queue */ + while ((kn = TAILQ_FIRST(&inprocessq)) != NULL) { + TAILQ_REMOVE(&inprocessq, kn, kn_tqe); + kn->kn_tq = &kq->kq_head; + TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); } + kqueue_end_processing(kq); kqunlock(kq); - return 0; + return retnum; } /* @@ -2312,10 +2414,7 @@ knote_link_wait_queue(struct knote *kn, struct wait_queue *wq) kr = wait_queue_link(wq, kq->kq_wqs); if (kr == KERN_SUCCESS) { - kqlock(kq); - kn->kn_status |= KN_STAYQUEUED; - knote_enqueue(kn); - kqunlock(kq); + knote_markstayqueued(kn); return 0; } else { return ENOMEM; @@ -2531,6 +2630,7 @@ knote_init(void) /* Initialize the timer filter lock */ lck_mtx_init(&_filt_timerlock, kq_lck_grp, kq_lck_attr); + lck_mtx_init(&vm_pressure_klist_mutex, kq_lck_grp, kq_lck_attr); } SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL) @@ -2843,3 +2943,12 @@ fill_kqueueinfo(struct kqueue *kq, struct kqueue_info * kinfo) return(0); } + +void +knote_markstayqueued(struct knote *kn) +{ + kqlock(kn->kn_kq); + kn->kn_status |= KN_STAYQUEUED; + knote_enqueue(kn); + kqunlock(kn->kn_kq); +} diff --git a/bsd/kern/kern_exec.c 
b/bsd/kern/kern_exec.c index 722415a70..3de273b36 100644 --- a/bsd/kern/kern_exec.c +++ b/bsd/kern/kern_exec.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -139,6 +139,7 @@ #include #include +#include #if CONFIG_DTRACE /* Do not include dtrace.h, it redefines kmem_[alloc/free] */ @@ -153,8 +154,7 @@ extern void dtrace_lazy_dofs_destroy(proc_t); thread_t fork_create_child(task_t parent_task, proc_t child_proc, int inherit_memory, int is64bit); void vfork_exit(proc_t p, int rv); int setsigvec(proc_t, thread_t, int, struct __kern_sigaction *, boolean_t in_sigstart); -void workqueue_exit(struct proc *); - +extern void proc_apply_task_networkbg_internal(proc_t); /* * Mach things for which prototypes are unavailable from Mach headers @@ -186,16 +186,6 @@ extern struct savearea *get_user_regs(thread_t); #include -/* - * SIZE_MAXPTR The maximum size of a user space pointer, in bytes - * SIZE_IMG_STRSPACE The available string space, minus two pointers; we - * define it interms of the maximum, since we don't - * know the pointer size going in, until after we've - * parsed the executable image. - */ -#define SIZE_MAXPTR 8 /* 64 bits */ -#define SIZE_IMG_STRSPACE (NCARGS - 2 * SIZE_MAXPTR) - /* * EAI_ITERLIMIT The maximum number of times to iterate an image * activator in exec_activate_image() before treating @@ -203,6 +193,12 @@ extern struct savearea *get_user_regs(thread_t); */ #define EAI_ITERLIMIT 10 +/* + * For #! 
interpreter parsing + */ +#define IS_WHITESPACE(ch) ((ch == ' ') || (ch == '\t')) +#define IS_EOL(ch) ((ch == '#') || (ch == '\n')) + extern vm_map_t bsd_pageable_map; extern struct fileops vnops; @@ -218,9 +214,10 @@ static int execargs_alloc(struct image_params *imgp); static int execargs_free(struct image_params *imgp); static int exec_check_permissions(struct image_params *imgp); static int exec_extract_strings(struct image_params *imgp); +static int exec_add_apple_strings(struct image_params *imgp); static int exec_handle_sugid(struct image_params *imgp); static int sugid_scripts = 0; -SYSCTL_INT (_kern, OID_AUTO, sugid_scripts, CTLFLAG_RW, &sugid_scripts, 0, ""); +SYSCTL_INT (_kern, OID_AUTO, sugid_scripts, CTLFLAG_RW | CTLFLAG_LOCKED, &sugid_scripts, 0, ""); static kern_return_t create_unix_stack(vm_map_t map, user_addr_t user_stack, int customstack, proc_t p); static int copyoutptr(user_addr_t ua, user_addr_t ptr, int ptr_size); @@ -232,12 +229,14 @@ __private_extern__ int open1(vfs_context_t, struct nameidata *, int, struct vnode_attr *, int32_t *); /* - * exec_add_string + * exec_add_user_string * * Add the requested string to the string space area. 
* * Parameters; struct image_params * image parameter block * user_addr_t string to add to strings area + * int segment from which string comes + * boolean_t TRUE if string contributes to NCARGS * * Returns: 0 Success * !0 Failure errno from copyinstr() @@ -245,29 +244,41 @@ int open1(vfs_context_t, struct nameidata *, int, struct vnode_attr *, int32_t * Implicit returns: * (imgp->ip_strendp) updated location of next add, if any * (imgp->ip_strspace) updated byte count of space remaining + * (imgp->ip_argspace) updated byte count of space in NCARGS */ static int -exec_add_string(struct image_params *imgp, user_addr_t str) +exec_add_user_string(struct image_params *imgp, user_addr_t str, int seg, boolean_t is_ncargs) { - int error = 0; - - do { - size_t len = 0; - if (imgp->ip_strspace <= 0) { + int error = 0; + + do { + size_t len = 0; + int space; + + if (is_ncargs) + space = imgp->ip_argspace; /* by definition smaller than ip_strspace */ + else + space = imgp->ip_strspace; + + if (space <= 0) { error = E2BIG; break; } - if (!UIO_SEG_IS_USER_SPACE(imgp->ip_seg)) { + + if (!UIO_SEG_IS_USER_SPACE(seg)) { char *kstr = CAST_DOWN(char *,str); /* SAFE */ - error = copystr(kstr, imgp->ip_strendp, imgp->ip_strspace, &len); + error = copystr(kstr, imgp->ip_strendp, space, &len); } else { - error = copyinstr(str, imgp->ip_strendp, imgp->ip_strspace, - &len); + error = copyinstr(str, imgp->ip_strendp, space, &len); } + imgp->ip_strendp += len; imgp->ip_strspace -= len; + if (is_ncargs) + imgp->ip_argspace -= len; + } while (error == ENAMETOOLONG); - + return error; } @@ -277,11 +288,10 @@ exec_add_string(struct image_params *imgp, user_addr_t str) * To support new app package launching for Mac OS X, the dyld needs the * first argument to execve() stored on the user stack. 
* - * Save the executable path name at the top of the strings area and set + * Save the executable path name at the bottom of the strings area and set * the argument vector pointer to the location following that to indicate * the start of the argument and environment tuples, setting the remaining - * string space count to the size of the string area minus the path length - * and a reserve for two pointers. + * string space count to the size of the string area minus the path length. * * Parameters; struct image_params * image parameter block * char * path used to invoke program @@ -295,8 +305,9 @@ exec_add_string(struct image_params *imgp, user_addr_t str) * Implicit returns: * (imgp->ip_strings) saved path * (imgp->ip_strspace) space remaining in ip_strings - * (imgp->ip_argv) beginning of argument list * (imgp->ip_strendp) start of remaining copy area + * (imgp->ip_argspace) space remaining of NCARGS + * (imgp->ip_applec) Initial applev[0] * * Note: We have to do this before the initial namei() since in the * path contains symbolic links, namei() will overwrite the @@ -310,10 +321,7 @@ exec_save_path(struct image_params *imgp, user_addr_t path, int seg) { int error; size_t len; - char *kpath = CAST_DOWN(char *,path); /* SAFE */ - - imgp->ip_strendp = imgp->ip_strings; - imgp->ip_strspace = SIZE_IMG_STRSPACE; + char *kpath; len = MIN(MAXPATHLEN, imgp->ip_strspace); @@ -323,6 +331,7 @@ exec_save_path(struct image_params *imgp, user_addr_t path, int seg) error = copyinstr(path, imgp->ip_strings, len, &len); break; case UIO_SYSSPACE: + kpath = CAST_DOWN(char *,path); /* SAFE */ error = copystr(kpath, imgp->ip_strings, len, &len); break; default: @@ -333,12 +342,38 @@ exec_save_path(struct image_params *imgp, user_addr_t path, int seg) if (!error) { imgp->ip_strendp += len; imgp->ip_strspace -= len; - imgp->ip_argv = imgp->ip_strendp; } return(error); } +/* + * exec_reset_save_path + * + * If we detect a shell script, we need to reset the string area + * state so that 
the interpreter can be saved onto the stack. + + * Parameters; struct image_params * image parameter block + * + * Returns: int 0 Success + * + * Implicit returns: + * (imgp->ip_strings) saved path + * (imgp->ip_strspace) space remaining in ip_strings + * (imgp->ip_strendp) start of remaining copy area + * (imgp->ip_argspace) space remaining of NCARGS + * + */ +static int +exec_reset_save_path(struct image_params *imgp) +{ + imgp->ip_strendp = imgp->ip_strings; + imgp->ip_argspace = NCARGS; + imgp->ip_strspace = ( NCARGS + PAGE_SIZE ); + + return (0); +} + #ifdef IMGPF_POWERPC /* * exec_powerpc32_imgact @@ -406,11 +441,15 @@ exec_powerpc32_imgact(struct image_params *imgp) imgp->ip_flags |= IMGPF_POWERPC; /* impute an interpreter */ - error = copystr(exec_archhandler_ppc.path, imgp->ip_interp_name, + error = copystr(exec_archhandler_ppc.path, imgp->ip_interp_buffer, IMG_SHSIZE, &len); if (error) return (error); + exec_reset_save_path(imgp); + exec_save_path(imgp, CAST_USER_ADDR_T(imgp->ip_interp_buffer), + UIO_SYSSPACE); + /* * provide a replacement string for p->p_comm; we have to use an * alternate buffer for this, rather than replacing it directly, @@ -451,14 +490,12 @@ exec_shell_imgact(struct image_params *imgp) { char *vdata = imgp->ip_vdata; char *ihp; - char *line_endp; + char *line_startp, *line_endp; char *interp; - char temp[16]; proc_t p; struct fileproc *fp; int fd; int error; - size_t len; /* * Make sure it's a shell script. If we've already redirected @@ -480,65 +517,82 @@ exec_shell_imgact(struct image_params *imgp) #endif /* IMGPF_POWERPC */ imgp->ip_flags |= IMGPF_INTERPRET; + imgp->ip_interp_sugid_fd = -1; + imgp->ip_interp_buffer[0] = '\0'; - /* Check to see if SUGID scripts are permitted. If they aren't then + /* Check to see if SUGID scripts are permitted. If they aren't then * clear the SUGID bits. * imgp->ip_vattr is known to be valid. 
- */ - if (sugid_scripts == 0) { - imgp->ip_origvattr->va_mode &= ~(VSUID | VSGID); + */ + if (sugid_scripts == 0) { + imgp->ip_origvattr->va_mode &= ~(VSUID | VSGID); } - /* Find the nominal end of the interpreter line */ - for( ihp = &vdata[2]; *ihp != '\n' && *ihp != '#'; ihp++) { - if (ihp >= &vdata[IMG_SHSIZE]) + /* Try to find the first non-whitespace character */ + for( ihp = &vdata[2]; ihp < &vdata[IMG_SHSIZE]; ihp++ ) { + if (IS_EOL(*ihp)) { + /* Did not find interpreter, "#!\n" */ return (ENOEXEC); + } else if (IS_WHITESPACE(*ihp)) { + /* Whitespace, like "#! /bin/sh\n", keep going. */ + } else { + /* Found start of interpreter */ + break; + } } - line_endp = ihp; - ihp = &vdata[2]; - /* Skip over leading spaces - until the interpreter name */ - while ( ihp < line_endp && ((*ihp == ' ') || (*ihp == '\t'))) - ihp++; + if (ihp == &vdata[IMG_SHSIZE]) { + /* All whitespace, like "#! " */ + return (ENOEXEC); + } - /* - * Find the last non-whitespace character before the end of line or - * the beginning of a comment; this is our new end of line. - */ - for (;line_endp > ihp && ((*line_endp == ' ') || (*line_endp == '\t')); line_endp--) - continue; + line_startp = ihp; + + /* Try to find the end of the interpreter+args string */ + for ( ; ihp < &vdata[IMG_SHSIZE]; ihp++ ) { + if (IS_EOL(*ihp)) { + /* Got it */ + break; + } else { + /* Still part of interpreter or args */ + } + } - /* Empty? */ - if (line_endp == ihp) + if (ihp == &vdata[IMG_SHSIZE]) { + /* A long line, like "#! blah blah blah" without end */ return (ENOEXEC); + } + + /* Backtrack until we find the last non-whitespace */ + while (IS_EOL(*ihp) || IS_WHITESPACE(*ihp)) { + ihp--; + } + + /* The character after the last non-whitespace is our logical end of line */ + line_endp = ihp + 1; + + /* + * Now we have pointers to the usable part of: + * + * "#! 
/usr/bin/int first second third \n" + * ^ line_startp ^ line_endp + */ /* copy the interpreter name */ - interp = imgp->ip_interp_name; - while ((ihp < line_endp) && (*ihp != ' ') && (*ihp != '\t')) - *interp++ = *ihp++; + interp = imgp->ip_interp_buffer; + for ( ihp = line_startp; (ihp < line_endp) && !IS_WHITESPACE(*ihp); ihp++) + *interp++ = *ihp; *interp = '\0'; - exec_save_path(imgp, CAST_USER_ADDR_T(imgp->ip_interp_name), + exec_reset_save_path(imgp); + exec_save_path(imgp, CAST_USER_ADDR_T(imgp->ip_interp_buffer), UIO_SYSSPACE); - ihp = &vdata[2]; - while (ihp < line_endp) { - /* Skip leading whitespace before each argument */ - while ((*ihp == ' ') || (*ihp == '\t')) - ihp++; - - if (ihp >= line_endp) - break; - - /* We have an argument; copy it */ - while ((ihp < line_endp) && (*ihp != ' ') && (*ihp != '\t')) { - *imgp->ip_strendp++ = *ihp++; - imgp->ip_strspace--; - } - *imgp->ip_strendp++ = 0; - imgp->ip_strspace--; - imgp->ip_argc++; - } + /* Copy the entire interpreter + args for later processing into argv[] */ + interp = imgp->ip_interp_buffer; + for ( ihp = line_startp; (ihp < line_endp); ihp++) + *interp++ = *ihp; + *interp = '\0'; /* * If we have a SUID oder SGID script, create a file descriptor @@ -562,10 +616,7 @@ exec_shell_imgact(struct image_params *imgp) proc_fdunlock(p); vnode_ref(imgp->ip_vp); - snprintf(temp, sizeof(temp), "/dev/fd/%d", fd); - error = copyoutstr(temp, imgp->ip_user_fname, sizeof(temp), &len); - if (error) - return(error); + imgp->ip_interp_sugid_fd = fd; } return (-3); @@ -736,6 +787,7 @@ exec_mach_imgact(struct image_params *imgp) load_result_t load_result; struct _posix_spawnattr *psa = NULL; int spawn = (imgp->ip_flags & IMGPF_SPAWN); + int apptype = 0; /* * make sure it's a Mach-O 1.0 or Mach-O 2.0 binary; the difference @@ -766,7 +818,7 @@ exec_mach_imgact(struct image_params *imgp) /* * Save off the vfexec state up front; we have to do this, because - * we need to know if we were in this state initally subsequent to 
+ * we need to know if we were in this state initially subsequent to * creating the backing task, thread, and uthread for the child * process (from the vfs_context_t from in img_parms). */ @@ -813,20 +865,14 @@ exec_mach_imgact(struct image_params *imgp) if (error) goto bad; - AUDIT_ARG(argv, imgp->ip_argv, imgp->ip_argc, - imgp->ip_strendargvp - imgp->ip_argv); - AUDIT_ARG(envv, imgp->ip_strendargvp, imgp->ip_envc, - imgp->ip_strendp - imgp->ip_strendargvp); + error = exec_add_apple_strings(imgp); + if (error) + goto bad; - /* - * Hack for binary compatability; put three NULs on the end of the - * string area, and round it up to the next word boundary. This - * ensures padding with NULs to the boundary. - */ - imgp->ip_strendp[0] = 0; - imgp->ip_strendp[1] = 0; - imgp->ip_strendp[2] = 0; - imgp->ip_strendp += (((imgp->ip_strendp - imgp->ip_strings) + NBPW-1) & ~(NBPW-1)); + AUDIT_ARG(argv, imgp->ip_startargv, imgp->ip_argc, + imgp->ip_endargv - imgp->ip_startargv); + AUDIT_ARG(envv, imgp->ip_endargv, imgp->ip_envc, + imgp->ip_endenvv - imgp->ip_endargv); #ifdef IMGPF_POWERPC /* @@ -838,7 +884,7 @@ exec_mach_imgact(struct image_params *imgp) * to the "encapsulated_binary:" label in exec_activate_image(). */ if (imgp->ip_vattr->va_fsid == exec_archhandler_ppc.fsid && - imgp->ip_vattr->va_fileid == (uint64_t)((u_long)exec_archhandler_ppc.fileid)) { + imgp->ip_vattr->va_fileid == exec_archhandler_ppc.fileid) { imgp->ip_flags |= IMGPF_POWERPC; } #endif /* IMGPF_POWERPC */ @@ -846,7 +892,7 @@ exec_mach_imgact(struct image_params *imgp) /* * We are being called to activate an image subsequent to a vfork() * operation; in this case, we know that our task, thread, and - * uthread are actualy those of our parent, and our proc, which we + * uthread are actually those of our parent, and our proc, which we * obtained indirectly from the image_params vfs_context_t, is the * new child process. 
*/ @@ -885,7 +931,7 @@ exec_mach_imgact(struct image_params *imgp) * Load the Mach-O file. * * NOTE: An error after this point indicates we have potentially - * destroyed or overwrote some process state while attempting an + * destroyed or overwritten some process state while attempting an * execve() following a vfork(), which is an unrecoverable condition. */ @@ -932,10 +978,9 @@ exec_mach_imgact(struct image_params *imgp) cpu_type()); /* - * Close file descriptors - * which specify close-on-exec. + * Close file descriptors which specify close-on-exec. */ - fdexec(p); + fdexec(p, psa != NULL ? psa->psa_flags : 0); /* * deal with set[ug]id. @@ -959,14 +1004,6 @@ exec_mach_imgact(struct image_params *imgp) goto badtoolate; } - /* - * There is no continuing workq context during - * vfork exec. So no need to reset then. Otherwise - * clear the workqueue context. - */ - if (vfexec == 0 && spawn == 0) { - (void)workqueue_exit(p); - } if (vfexec || spawn) { old_map = vm_map_switch(get_task_map(task)); } @@ -991,15 +1028,12 @@ exec_mach_imgact(struct image_params *imgp) if (load_result.dynlinker) { uint64_t ap; + int new_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT) ? 
8 : 4; /* Adjust the stack */ - if (imgp->ip_flags & IMGPF_IS_64BIT) { - ap = thread_adjuserstack(thread, -8); - error = copyoutptr(load_result.mach_header, ap, 8); - } else { - ap = thread_adjuserstack(thread, -4); - error = suword(ap, load_result.mach_header); - } + ap = thread_adjuserstack(thread, -new_ptr_size); + error = copyoutptr(load_result.mach_header, ap, new_ptr_size); + if (error) { if (vfexec || spawn) vm_map_switch(old_map); @@ -1058,6 +1092,8 @@ exec_mach_imgact(struct image_params *imgp) p->p_comm[imgp->ip_ndp->ni_cnd.cn_namelen] = '\0'; } + pal_dbg_set_task_name( p->task ); + memcpy(&p->p_uuid[0], &load_result.uuid[0], sizeof(p->p_uuid)); // dtrace code cleanup needed @@ -1143,6 +1179,22 @@ exec_mach_imgact(struct image_params *imgp) proc_unlock(p); (void) task_suspend(p->task); } + if ((psa->psa_flags & POSIX_SPAWN_OSX_TALAPP_START) || (psa->psa_flags & POSIX_SPAWN_OSX_DBCLIENT_START) || (psa->psa_flags & POSIX_SPAWN_IOS_APP_START)) { + if ((psa->psa_flags & POSIX_SPAWN_OSX_TALAPP_START)) + apptype = PROC_POLICY_OSX_APPTYPE_TAL; + else if (psa->psa_flags & POSIX_SPAWN_OSX_DBCLIENT_START) + apptype = PROC_POLICY_OSX_APPTYPE_DBCLIENT; + else if (psa->psa_flags & POSIX_SPAWN_IOS_APP_START) + apptype = PROC_POLICY_IOS_APPTYPE; + else + apptype = 0; + proc_set_task_apptype(p->task, apptype); + if ((apptype == PROC_POLICY_OSX_APPTYPE_TAL) || + (apptype == PROC_POLICY_OSX_APPTYPE_DBCLIENT)) { + + proc_apply_task_networkbg_internal(p); + } + } } /* @@ -1245,21 +1297,16 @@ exec_activate_image(struct image_params *imgp) if (error) goto bad; - /* - * XXXAUDIT: Note: the double copyin introduces an audit - * race. To correct this race, we must use a single - * copyin(), e.g. by passing a flag to namei to indicate an - * external path buffer is being used. 
- */ error = exec_save_path(imgp, imgp->ip_user_fname, imgp->ip_seg); if (error) { goto bad_notrans; } + /* Use imgp->ip_strings, which contains the copyin-ed exec path */ DTRACE_PROC1(exec, uintptr_t, imgp->ip_strings); - NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, - imgp->ip_seg, imgp->ip_user_fname, imgp->ip_vfs_context); + NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, + UIO_SYSSPACE, CAST_USER_ADDR_T(imgp->ip_strings), imgp->ip_vfs_context); again: error = namei(&nd); @@ -1268,7 +1315,20 @@ exec_activate_image(struct image_params *imgp) imgp->ip_ndp = &nd; /* successful namei(); call nameidone() later */ imgp->ip_vp = nd.ni_vp; /* if set, need to vnode_put() at some point */ - error = proc_transstart(p, 0); + /* + * Before we start the transition from binary A to binary B, make + * sure another thread hasn't started exiting the process. We grab + * the proc lock to check p_lflag initially, and the transition + * mechanism ensures that the value doesn't change after we release + * the lock. 
+ */ + proc_lock(p); + if (p->p_lflag & P_LEXIT) { + proc_unlock(p); + goto bad_notrans; + } + error = proc_transstart(p, 1); + proc_unlock(p); if (error) goto bad_notrans; @@ -1322,11 +1382,16 @@ exec_activate_image(struct image_params *imgp) mac_vnode_label_copy(imgp->ip_vp->v_label, imgp->ip_scriptlabelp); #endif + + nameidone(&nd); + vnode_put(imgp->ip_vp); imgp->ip_vp = NULL; /* already put */ - - NDINIT(&nd, LOOKUP, (nd.ni_cnd.cn_flags & HASBUF) | (FOLLOW | LOCKLEAF), - UIO_SYSSPACE, CAST_USER_ADDR_T(imgp->ip_interp_name), imgp->ip_vfs_context); + imgp->ip_ndp = NULL; /* already nameidone */ + + /* Use imgp->ip_strings, which exec_shell_imgact reset to the interpreter */ + NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF, + UIO_SYSSPACE, CAST_USER_ADDR_T(imgp->ip_strings), imgp->ip_vfs_context); #ifdef IMGPF_POWERPC /* @@ -1379,10 +1444,10 @@ exec_activate_image(struct image_params *imgp) * short psa_flags posix spawn attribute flags * * Returns: 0 Success - * KERN_FAILURE Failure + * EINVAL Failure * ENOTSUP Illegal posix_spawn attr flag was set */ -static int +static errno_t exec_handle_port_actions(struct image_params *imgp, short psa_flags) { _posix_spawn_port_actions_t pacts = imgp->ip_px_spa; @@ -1390,16 +1455,17 @@ exec_handle_port_actions(struct image_params *imgp, short psa_flags) _ps_port_action_t *act = NULL; task_t task = p->task; ipc_port_t port = NULL; - kern_return_t ret = KERN_SUCCESS; + errno_t ret = KERN_SUCCESS; int i; for (i = 0; i < pacts->pspa_count; i++) { act = &pacts->pspa_actions[i]; - ret = ipc_object_copyin(get_task_ipcspace(current_task()), + if (ipc_object_copyin(get_task_ipcspace(current_task()), CAST_MACH_PORT_TO_NAME(act->new_port), MACH_MSG_TYPE_COPY_SEND, - (ipc_object_t *) &port); + (ipc_object_t *) &port) != KERN_SUCCESS) + return EINVAL; if (ret) return ret; @@ -1409,19 +1475,19 @@ exec_handle_port_actions(struct image_params *imgp, short psa_flags) /* Only allowed when not under vfork */ if (!(psa_flags & 
POSIX_SPAWN_SETEXEC)) return ENOTSUP; - ret = task_set_special_port(task, + ret = (task_set_special_port(task, act->which, - port); + port) == KERN_SUCCESS) ? 0 : EINVAL; break; case PSPA_EXCEPTION: /* Only allowed when not under vfork */ if (!(psa_flags & POSIX_SPAWN_SETEXEC)) return ENOTSUP; - ret = task_set_exception_ports(task, + ret = (task_set_exception_ports(task, act->mask, port, act->behavior, - act->flavor); + act->flavor) == KERN_SUCCESS) ? 0 : EINVAL; break; #if CONFIG_AUDIT case PSPA_AU_SESSION: @@ -1430,7 +1496,7 @@ exec_handle_port_actions(struct image_params *imgp, short psa_flags) break; #endif default: - ret = KERN_FAILURE; + ret = EINVAL; } /* action failed, so release port resources */ if (ret) { @@ -1461,7 +1527,7 @@ exec_handle_port_actions(struct image_params *imgp, short psa_flags) * normally permitted to perform. */ static int -exec_handle_file_actions(struct image_params *imgp) +exec_handle_file_actions(struct image_params *imgp, short psa_flags) { int error = 0; int action; @@ -1479,7 +1545,7 @@ exec_handle_file_actions(struct image_params *imgp) * a path argument, which is normally copied in from * user space; because of this, we have to support an * open from kernel space that passes an address space - * context oof UIO_SYSSPACE, and casts the address + * context of UIO_SYSSPACE, and casts the address * argument to a user_addr_t. */ struct vnode_attr va; @@ -1494,7 +1560,7 @@ exec_handle_file_actions(struct image_params *imgp) mode = ((mode &~ p->p_fd->fd_cmask) & ALLPERMS) & ~S_ISTXT; VATTR_SET(&va, va_mode, mode & ACCESSPERMS); - NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1, UIO_SYSSPACE, + NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_SYSSPACE, CAST_USER_ADDR_T(psfa->psfaa_openargs.psfao_path), imgp->ip_vfs_context); @@ -1506,8 +1572,8 @@ exec_handle_file_actions(struct image_params *imgp) /* * If there's an error, or we get the right fd by - * accident, then drop out here. 
This is easier that - * rearchitecting all the open code to preallocate fd + * accident, then drop out here. This is easier than + * reworking all the open code to preallocate fd * slots, and internally taking one as an argument. */ if (error || ival[0] == psfa->psfaa_filedes) @@ -1566,16 +1632,68 @@ exec_handle_file_actions(struct image_params *imgp) } break; + case PSFA_INHERIT: { + struct fileproc *fp; + int fd = psfa->psfaa_filedes; + + /* + * Check to see if the descriptor exists, and + * ensure it's -not- marked as close-on-exec. + * [Less code than the equivalent F_GETFD/F_SETFD.] + */ + proc_fdlock(p); + if ((error = fp_lookup(p, fd, &fp, 1)) == 0) { + *fdflags(p, fd) &= ~UF_EXCLOSE; + (void) fp_drop(p, fd, fp, 1); + } + proc_fdunlock(p); + } + break; + default: error = EINVAL; break; } + /* All file actions failures are considered fatal, per POSIX */ + if (error) break; } - return (error); + if (error != 0 || (psa_flags & POSIX_SPAWN_CLOEXEC_DEFAULT) == 0) + return (error); + + /* + * If POSIX_SPAWN_CLOEXEC_DEFAULT is set, behave (during + * this spawn only) as if "close on exec" is the default + * disposition of all pre-existing file descriptors. In this case, + * the list of file descriptors mentioned in the file actions + * are the only ones that can be inherited, so mark them now. + * + * The actual closing part comes later, in fdexec(). 
+ */ + proc_fdlock(p); + for (action = 0; action < px_sfap->psfa_act_count; action++) { + _psfa_action_t *psfa = &px_sfap->psfa_act_acts[action]; + int fd = psfa->psfaa_filedes; + + switch (psfa->psfaa_type) { + case PSFA_DUP2: + fd = psfa->psfaa_openargs.psfao_oflag; + /*FALLTHROUGH*/ + case PSFA_OPEN: + case PSFA_INHERIT: + *fdflags(p, fd) |= UF_INHERIT; + break; + + case PSFA_CLOSE: + break; + } + } + proc_fdunlock(p); + + return (0); } @@ -1628,10 +1746,12 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) _posix_spawn_port_actions_t px_spap = NULL; struct __kern_sigaction vec; boolean_t spawn_no_exec = FALSE; + boolean_t proc_transit_set = TRUE; + boolean_t exec_done = FALSE; /* * Allocate a big chunk for locals instead of using stack since these - * structures a pretty big. + * structures are pretty big. */ MALLOC(bufp, char *, (sizeof(*imgp) + sizeof(*vap) + sizeof(*origvap)), M_TEMP, M_WAITOK | M_ZERO); imgp = (struct image_params *) bufp; @@ -1740,7 +1860,7 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) } /* - * If we don't have the extention flag that turns "posix_spawn()" + * If we don't have the extension flag that turns "posix_spawn()" * into "execve() with options", then we will be creating a new * process which does not inherit memory from the parent process, * which is one of the most expensive things about using fork() @@ -1755,7 +1875,7 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) if (spawn_no_exec) p = (proc_t)get_bsdthreadtask_info(imgp->ip_new_thread); - + assert(p != NULL); /* By default, the thread everyone plays with is the parent */ context.vc_thread = current_thread(); @@ -1768,17 +1888,22 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) if (spawn_no_exec) context.vc_thread = imgp->ip_new_thread; - /* * Post fdcopy(), pre exec_handle_sugid() - this is where we want * to handle the file_actions. 
Since vfork() also ends up setting * us into the parent process group, and saved off the signal flags, * this is also where we want to handle the spawn flags. */ + /* Has spawn file actions? */ - if (imgp->ip_px_sfa != NULL && - (error = exec_handle_file_actions(imgp)) != 0) { - goto bad; + if (imgp->ip_px_sfa != NULL) { + /* + * The POSIX_SPAWN_CLOEXEC_DEFAULT flag + * is handled in exec_handle_file_actions(). + */ + if ((error = exec_handle_file_actions(imgp, + imgp->ip_px_sa != NULL ? px_sa.psa_flags : 0)) != 0) + goto bad; } /* Has spawn port actions? */ @@ -1787,7 +1912,8 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) * The check for the POSIX_SPAWN_SETEXEC flag is done in * exec_handle_port_actions(). */ - if((error = exec_handle_port_actions(imgp, px_sa.psa_flags)) != 0) + if ((error = exec_handle_port_actions(imgp, + imgp->ip_px_sa != NULL ? px_sa.psa_flags : 0)) != 0) goto bad; } @@ -1824,12 +1950,36 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) */ if (px_sa.psa_flags & POSIX_SPAWN_RESETIDS) { kauth_cred_t my_cred = p->p_ucred; - kauth_cred_t my_new_cred = kauth_cred_setuidgid(my_cred, my_cred->cr_ruid, my_cred->cr_rgid); - if (my_new_cred != my_cred) + kauth_cred_t my_new_cred = kauth_cred_setuidgid(my_cred, kauth_cred_getruid(my_cred), kauth_cred_getrgid(my_cred)); + if (my_new_cred != my_cred) { p->p_ucred = my_new_cred; + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(p); + } } + + /* + * Disable ASLR for the spawned process. + */ + if (px_sa.psa_flags & _POSIX_SPAWN_DISABLE_ASLR) + OSBitOrAtomic(P_DISABLE_ASLR, &p->p_flag); + + /* + * Propagate _POSIX_SPAWN_ALLOW_DATA_EXEC to the image parameters as + * IMGPF_ALLOW_DATA_EXEC; the flag requests that data execution be allowed. + */ + if (px_sa.psa_flags & _POSIX_SPAWN_ALLOW_DATA_EXEC) + imgp->ip_flags |= IMGPF_ALLOW_DATA_EXEC; } + /* + * Disable ASLR during image activation.
This occurs either if the + * _POSIX_SPAWN_DISABLE_ASLR attribute was found above or if + * P_DISABLE_ASLR was inherited from the parent process. + */ + if (p->p_flag & P_DISABLE_ASLR) + imgp->ip_flags |= IMGPF_DISABLE_ASLR; + /* * Clear transition flag so we won't hang if exec_activate_image() causes * an automount (and launchd does a proc sysctl to service it). @@ -1838,6 +1988,7 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) */ if (spawn_no_exec) { proc_transend(p, 0); + proc_transit_set = 0; } #if MAC_SPAWN /* XXX */ @@ -1853,9 +2004,13 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) */ error = exec_activate_image(imgp); - /* Image not claimed by any activator? */ - if (error == -1) + if (error == 0) { + /* process completed the exec */ + exec_done = TRUE; + } else if (error == -1) { + /* Image not claimed by any activator? */ error = ENOEXEC; + } /* * If we have a spawn attr, and it contains signal related flags, @@ -1938,6 +2093,9 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) * before check_for_signature(), which uses psignal. */ if (spawn_no_exec) { + if (proc_transit_set) + proc_transend(p, 0); + /* * Drop the signal lock on the child which was taken on our * behalf by forkproc()/cloneproc() to prevent signals being @@ -2040,8 +2198,10 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) p->exit_thread = current_thread(); proc_unlock(p); exit1(p, 1, (int *)NULL); - task_deallocate(get_threadtask(imgp->ip_new_thread)); - thread_deallocate(imgp->ip_new_thread); + if (exec_done == FALSE) { + task_deallocate(get_threadtask(imgp->ip_new_thread)); + thread_deallocate(imgp->ip_new_thread); + } } else { /* someone is doing it for us; just skip it */ proc_unlock(p); @@ -2165,7 +2325,7 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval) imgp->ip_vattr = vap; imgp->ip_origvattr = origvap; imgp->ip_vfs_context = &context; - imgp->ip_flags = (is_64 ? 
IMGPF_WAS_64BIT : IMGPF_NONE); + imgp->ip_flags = (is_64 ? IMGPF_WAS_64BIT : IMGPF_NONE) | ((p->p_flag & P_DISABLE_ASLR) ? IMGPF_DISABLE_ASLR : IMGPF_NONE); imgp->ip_p_comm = alt_p_comm; /* for PowerPC */ imgp->ip_seg = (is_64 ? UIO_USERSPACE64 : UIO_USERSPACE32); @@ -2273,8 +2433,6 @@ copyinptr(user_addr_t froma, user_addr_t *toptr, int ptr_size) * Returns: 0 Success * EFAULT Bad 'ua' * - * Implicit returns: - * *ptr_size Modified */ static int copyoutptr(user_addr_t ua, user_addr_t ptr, int ptr_size) @@ -2311,85 +2469,156 @@ copyoutptr(user_addr_t ua, user_addr_t ptr, int ptr_size) * Note: The strings segment layout is backward, from the beginning * of the top of the stack to consume the minimal amount of * space possible; the returned stack pointer points to the - * end of the area consumed (stacks grow upward). + * end of the area consumed (stacks grow downward). * * argc is an int; arg[i] are pointers; env[i] are pointers; - * exec_path is a pointer; the 0's are (void *)NULL's + * the 0's are (void *)NULL's * * The stack frame layout is: * - * +-------------+ - * sp-> | argc | - * +-------------+ - * | arg[0] | - * +-------------+ - * : - * : - * +-------------+ - * | arg[argc-1] | - * +-------------+ - * | 0 | - * +-------------+ - * | env[0] | - * +-------------+ - * : - * : - * +-------------+ - * | env[n] | - * +-------------+ - * | 0 | - * +-------------+ - * | exec_path | In MacOS X PR2 Beaker2E the path passed to exec() is - * +-------------+ passed on the stack just after the trailing 0 of the - * | 0 | the envp[] array as a pointer to a string. 
- * +-------------+ - * | PATH AREA | - * +-------------+ - * | STRING AREA | - * : - * : - * | | <- p->user_stack - * +-------------+ + * +-------------+ <- p->user_stack + * | 16b | + * +-------------+ + * | STRING AREA | + * | : | + * | : | + * | : | + * +- -- -- -- --+ + * | PATH AREA | + * +-------------+ + * | 0 | + * +-------------+ + * | applev[n] | + * +-------------+ + * : + * : + * +-------------+ + * | applev[1] | + * +-------------+ + * | exec_path / | + * | applev[0] | + * +-------------+ + * | 0 | + * +-------------+ + * | env[n] | + * +-------------+ + * : + * : + * +-------------+ + * | env[0] | + * +-------------+ + * | 0 | + * +-------------+ + * | arg[argc-1] | + * +-------------+ + * : + * : + * +-------------+ + * | arg[0] | + * +-------------+ + * | argc | + * sp-> +-------------+ * * Although technically a part of the STRING AREA, we treat the PATH AREA as * a separate entity. This allows us to align the beginning of the PATH AREA * to a pointer boundary so that the exec_path, env[i], and argv[i] pointers * which preceed it on the stack are properly aligned. - * - * TODO: argc copied with suword(), which takes a 64 bit address */ + static int exec_copyout_strings(struct image_params *imgp, user_addr_t *stackp) { proc_t p = vfs_context_proc(imgp->ip_vfs_context); int ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT) ? 
8 : 4; - char *argv = imgp->ip_argv; /* modifiable copy of argv */ + int ptr_area_size; + void *ptr_buffer_start, *ptr_buffer; + int string_size; + user_addr_t string_area; /* *argv[], *env[] */ - user_addr_t path_area; /* package launch path */ - user_addr_t ptr_area; /* argv[], env[], exec_path */ + user_addr_t ptr_area; /* argv[], env[], applev[] */ + user_addr_t argc_area; /* argc */ user_addr_t stack; - int stringc = imgp->ip_argc + imgp->ip_envc; - size_t len; int error; - ssize_t strspace; + + unsigned i; + struct copyout_desc { + char *start_string; + int count; +#if CONFIG_DTRACE + user_addr_t *dtrace_cookie; +#endif + boolean_t null_term; + } descriptors[] = { + { + .start_string = imgp->ip_startargv, + .count = imgp->ip_argc, +#if CONFIG_DTRACE + .dtrace_cookie = &p->p_dtrace_argv, +#endif + .null_term = TRUE + }, + { + .start_string = imgp->ip_endargv, + .count = imgp->ip_envc, +#if CONFIG_DTRACE + .dtrace_cookie = &p->p_dtrace_envp, +#endif + .null_term = TRUE + }, + { + .start_string = imgp->ip_strings, + .count = 1, +#if CONFIG_DTRACE + .dtrace_cookie = NULL, +#endif + .null_term = FALSE + }, + { + .start_string = imgp->ip_endenvv, + .count = imgp->ip_applec - 1, /* exec_path handled above */ +#if CONFIG_DTRACE + .dtrace_cookie = NULL, +#endif + .null_term = TRUE + } + }; stack = *stackp; - size_t patharea_len = imgp->ip_argv - imgp->ip_strings; - int envc_add = 0; - /* - * Set up pointers to the beginning of the string area, the beginning - * of the path area, and the beginning of the pointer area (actually, - * the location of argc, an int, which may be smaller than a pointer, - * but we use ptr_size worth of space for it, for alignment). 
+ * All previous contributors to the string area + * should have aligned their sub-area */ - string_area = stack - (((imgp->ip_strendp - imgp->ip_strings) + ptr_size-1) & ~(ptr_size-1)) - ptr_size; - path_area = string_area - ((patharea_len + ptr_size-1) & ~(ptr_size-1)); - ptr_area = path_area - ((imgp->ip_argc + imgp->ip_envc + 4 + envc_add) * ptr_size) - ptr_size /*argc*/; + if (imgp->ip_strspace % ptr_size != 0) { + error = EINVAL; + goto bad; + } - /* Return the initial stack address: the location of argc */ - *stackp = ptr_area; + /* Grow the stack down for the strings we've been building up */ + string_size = imgp->ip_strendp - imgp->ip_strings; + stack -= string_size; + string_area = stack; + + /* + * Need room for one pointer for each string, plus + * one for the NULLs terminating the argv, envv, and apple areas. + */ + ptr_area_size = (imgp->ip_argc + imgp->ip_envc + imgp->ip_applec + 3) * + ptr_size; + stack -= ptr_area_size; + ptr_area = stack; + + /* We'll construct all the pointer arrays in our string buffer, + * which we already know is aligned properly, and ip_argspace + * was used to verify we have enough space. + */ + ptr_buffer_start = ptr_buffer = (void *)imgp->ip_strendp; + + /* + * Need room for pointer-aligned argc slot. + */ + stack -= ptr_size; + argc_area = stack; /* * Record the size of the arguments area so that sysctl_procargs() @@ -2397,92 +2626,73 @@ exec_copyout_strings(struct image_params *imgp, user_addr_t *stackp) */ proc_lock(p); p->p_argc = imgp->ip_argc; - p->p_argslen = (int)(stack - path_area); + p->p_argslen = (int)(*stackp - string_area); proc_unlock(p); + /* Return the initial stack address: the location of argc */ + *stackp = stack; /* - * Support for new app package launching for Mac OS X allocates - * the "path" at the begining of the imgp->ip_strings buffer. - * copy it just before the string area. 
- */ - len = 0; - error = copyoutstr(imgp->ip_strings, path_area, - patharea_len, - &len); + * Copy out the entire strings area. + */ + error = copyout(imgp->ip_strings, string_area, + string_size); if (error) goto bad; - - /* Save a NULL pointer below it */ - (void)copyoutptr(0LL, path_area - ptr_size, ptr_size); - - /* Save the pointer to "path" just below it */ - (void)copyoutptr(path_area, path_area - 2*ptr_size, ptr_size); - - /* - * ptr_size for 2 NULL one each ofter arg[argc -1] and env[n] - * ptr_size for argc - * skip over saved path, ptr_size for pointer to path, - * and ptr_size for the NULL after pointer to path. - */ - - /* argc (int32, stored in a ptr_size area) */ - (void)suword(ptr_area, imgp->ip_argc); - ptr_area += sizeof(int); - /* pad to ptr_size, if 64 bit image, to ensure user stack alignment */ - if (imgp->ip_flags & IMGPF_IS_64BIT) { - (void)suword(ptr_area, 0); /* int, not long: ignored */ - ptr_area += sizeof(int); - } + for (i = 0; i < sizeof(descriptors)/sizeof(descriptors[0]); i++) { + char *cur_string = descriptors[i].start_string; + int j; #if CONFIG_DTRACE - p->p_dtrace_argv = ptr_area; /* user_addr_t &argv[0] for dtrace convenience */ + if (descriptors[i].dtrace_cookie) { + proc_lock(p); + *descriptors[i].dtrace_cookie = ptr_area + ((uintptr_t)ptr_buffer - (uintptr_t)ptr_buffer_start); /* dtrace convenience */ + proc_unlock(p); + } #endif /* CONFIG_DTRACE */ - /* - * We use (string_area - path_area) here rather than the more - * intuitive (imgp->ip_argv - imgp->ip_strings) because we are - * interested in the length of the PATH_AREA in user space, - * rather than the actual length of the execution path, since - * it includes alignment padding of the PATH_AREA + STRING_AREA - * to a ptr_size boundary. 
- */ - strspace = SIZE_IMG_STRSPACE - (string_area - path_area); - for (;;) { - if (stringc == imgp->ip_envc) { - /* argv[n] = NULL */ - (void)copyoutptr(0LL, ptr_area, ptr_size); - ptr_area += ptr_size; -#if CONFIG_DTRACE - p->p_dtrace_envp = ptr_area; /* user_addr_t &env[0] for dtrace convenience */ -#endif /* CONFIG_DTRACE */ + /* + * For each segment (argv, envv, applev), copy as many pointers as requested + * to our pointer buffer. + */ + for (j = 0; j < descriptors[i].count; j++) { + user_addr_t cur_address = string_area + (cur_string - imgp->ip_strings); + + /* Copy out the pointer to the current string. Alignment has been verified */ + if (ptr_size == 8) { + *(uint64_t *)ptr_buffer = (uint64_t)cur_address; + } else { + *(uint32_t *)ptr_buffer = (uint32_t)cur_address; + } + + ptr_buffer = (void *)((uintptr_t)ptr_buffer + ptr_size); + cur_string += strlen(cur_string) + 1; /* Only a NUL between strings in the same area */ } - if (--stringc < 0) - break; - /* pointer: argv[n]/env[n] */ - (void)copyoutptr(string_area, ptr_area, ptr_size); - - /* string : argv[n][]/env[n][] */ - do { - if (strspace <= 0) { - error = E2BIG; - break; + if (descriptors[i].null_term) { + if (ptr_size == 8) { + *(uint64_t *)ptr_buffer = 0ULL; + } else { + *(uint32_t *)ptr_buffer = 0; } - error = copyoutstr(argv, string_area, - strspace, - &len); - string_area += len; - argv += len; - strspace -= len; - } while (error == ENAMETOOLONG); - if (error == EFAULT || error == E2BIG) - break; /* bad stack - user's problem */ - ptr_area += ptr_size; - } - /* env[n] = NULL */ - (void)copyoutptr(0LL, ptr_area, ptr_size); + + ptr_buffer = (void *)((uintptr_t)ptr_buffer + ptr_size); + } + } + + /* + * Copy out all our pointer arrays in bulk. 
+ */ + error = copyout(ptr_buffer_start, ptr_area, + ptr_area_size); + if (error) + goto bad; + + /* argc (int32, stored in a ptr_size area) */ + error = copyoutptr((user_addr_t)imgp->ip_argc, argc_area, ptr_size); + if (error) + goto bad; bad: return(error); @@ -2495,6 +2705,11 @@ exec_copyout_strings(struct image_params *imgp, user_addr_t *stackp) * Copy arguments and environment from user space into work area; we may * have already copied some early arguments into the work area, and if * so, any arguments opied in are appended to those already there. + * This function is the primary manipulator of ip_argspace, since + * these are the arguments the client of execve(2) knows about. After + * each argv[]/envv[] string is copied, we charge the string length + * and argv[]/envv[] pointer slot to ip_argspace, so that we can + * fully preflight the arg list size. * * Parameters: struct image_params * the image parameter block * @@ -2504,6 +2719,8 @@ exec_copyout_strings(struct image_params *imgp, user_addr_t *stackp) * Implicit returns; * (imgp->ip_argc) Count of arguments, updated * (imgp->ip_envc) Count of environment strings, updated + * (imgp->ip_argspace) Remaining NCARGS space, updated + * (imgp->ip_interp_buffer) Interpreter and args (mutated in place) * * * Note: The argument and environment vectors are user space pointers @@ -2513,47 +2730,101 @@ static int exec_extract_strings(struct image_params *imgp) { int error = 0; - int strsz = 0; int ptr_size = (imgp->ip_flags & IMGPF_WAS_64BIT) ? 8 : 4; + int new_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT) ? 8 : 4; user_addr_t argv = imgp->ip_user_argv; user_addr_t envv = imgp->ip_user_envv; - /* - * If the argument vector is NULL, this is the system startup - * bootstrap from load_init_program(), and there's nothing to do - */ - if (imgp->ip_user_argv == 0LL) - goto bad; - - /* Now, get rest of arguments */ - /* * Adjust space reserved for the path name by however much padding it * needs.
Doing this here since we didn't know if this would be a 32- * or 64-bit process back in exec_save_path. */ - strsz = strlen(imgp->ip_strings) + 1; - imgp->ip_strspace -= ((strsz + ptr_size-1) & ~(ptr_size-1)) - strsz; + while (imgp->ip_strspace % new_ptr_size != 0) { + *imgp->ip_strendp++ = '\0'; + imgp->ip_strspace--; + /* imgp->ip_argspace--; not counted towards exec args total */ + } /* - * If we are running an interpreter, replace the av[0] that was - * passed to execve() with the fully qualified path name that was - * passed to execve() for interpreters which do not use the PATH - * to locate their script arguments. + * From now on, we start attributing string space to ip_argspace */ - if((imgp->ip_flags & IMGPF_INTERPRET) != 0 && argv != 0LL) { + imgp->ip_startargv = imgp->ip_strendp; + imgp->ip_argc = 0; + + if((imgp->ip_flags & IMGPF_INTERPRET) != 0) { user_addr_t arg; + char *argstart, *ch; + + /* First, the arguments in the "#!" string are tokenized and extracted. */ + argstart = imgp->ip_interp_buffer; + while (argstart) { + ch = argstart; + while (*ch && !IS_WHITESPACE(*ch)) { + ch++; + } - error = copyinptr(argv, &arg, ptr_size); - if (error) - goto bad; - if (arg != 0LL && arg != (user_addr_t)-1) { - argv += ptr_size; - error = exec_add_string(imgp, imgp->ip_user_fname); + if (*ch == '\0') { + /* last argument, no need to NUL-terminate */ + error = exec_add_user_string(imgp, CAST_USER_ADDR_T(argstart), UIO_SYSSPACE, TRUE); + argstart = NULL; + } else { + /* NUL-terminate */ + *ch = '\0'; + error = exec_add_user_string(imgp, CAST_USER_ADDR_T(argstart), UIO_SYSSPACE, TRUE); + + /* + * Find the next string. We know spaces at the end of the string have already + * been stripped. 
+ */ + argstart = ch + 1; + while (IS_WHITESPACE(*argstart)) { + argstart++; + } + } + + /* Error-check, regardless of whether this is the last interpreter arg or not */ if (error) goto bad; + if (imgp->ip_argspace < new_ptr_size) { + error = E2BIG; + goto bad; + } + imgp->ip_argspace -= new_ptr_size; /* to hold argv[] entry */ imgp->ip_argc++; } + + if (argv != 0LL) { + /* + * If we are running an interpreter, replace the av[0] that was + * passed to execve() with the path name that was + * passed to execve() for interpreters which do not use the PATH + * to locate their script arguments. + */ + error = copyinptr(argv, &arg, ptr_size); + if (error) + goto bad; + if (arg != 0LL) { + argv += ptr_size; /* consume without using */ + } + } + + if (imgp->ip_interp_sugid_fd != -1) { + char temp[19]; /* "/dev/fd/" + 10 digits + NUL */ + snprintf(temp, sizeof(temp), "/dev/fd/%d", imgp->ip_interp_sugid_fd); + error = exec_add_user_string(imgp, CAST_USER_ADDR_T(temp), UIO_SYSSPACE, TRUE); + } else { + error = exec_add_user_string(imgp, imgp->ip_user_fname, imgp->ip_seg, TRUE); + } + + if (error) + goto bad; + if (imgp->ip_argspace < new_ptr_size) { + error = E2BIG; + goto bad; + } + imgp->ip_argspace -= new_ptr_size; /* to hold argv[] entry */ + imgp->ip_argc++; } while (argv != 0LL) { @@ -2563,25 +2834,36 @@ exec_extract_strings(struct image_params *imgp) if (error) goto bad; - argv += ptr_size; if (arg == 0LL) { break; - } else if (arg == (user_addr_t)-1) { - /* Um... why would it be -1? */ - error = EFAULT; - goto bad; } + + argv += ptr_size; + /* * av[n...] 
= arg[n] */ - error = exec_add_string(imgp, arg); + error = exec_add_user_string(imgp, arg, imgp->ip_seg, TRUE); if (error) goto bad; + if (imgp->ip_argspace < new_ptr_size) { + error = E2BIG; + goto bad; + } + imgp->ip_argspace -= new_ptr_size; /* to hold argv[] entry */ imgp->ip_argc++; } + + /* Save space for argv[] NULL terminator */ + if (imgp->ip_argspace < new_ptr_size) { + error = E2BIG; + goto bad; + } + imgp->ip_argspace -= new_ptr_size; - /* Note where the args end and env begins. */ - imgp->ip_strendargvp = imgp->ip_strendp; + /* Note where the args ends and env begins. */ + imgp->ip_endargv = imgp->ip_strendp; + imgp->ip_envc = 0; /* Now, get the environment */ while (envv != 0LL) { @@ -2594,29 +2876,165 @@ exec_extract_strings(struct image_params *imgp) envv += ptr_size; if (env == 0LL) { break; - } else if (env == (user_addr_t)-1) { - error = EFAULT; - goto bad; } /* * av[n...] = env[n] */ - error = exec_add_string(imgp, env); + error = exec_add_user_string(imgp, env, imgp->ip_seg, TRUE); if (error) goto bad; + if (imgp->ip_argspace < new_ptr_size) { + error = E2BIG; + goto bad; + } + imgp->ip_argspace -= new_ptr_size; /* to hold envv[] entry */ imgp->ip_envc++; } + + /* Save space for envv[] NULL terminator */ + if (imgp->ip_argspace < new_ptr_size) { + error = E2BIG; + goto bad; + } + imgp->ip_argspace -= new_ptr_size; + + /* Align the tail of the combined argv+envv area */ + while (imgp->ip_strspace % new_ptr_size != 0) { + if (imgp->ip_argspace < 1) { + error = E2BIG; + goto bad; + } + *imgp->ip_strendp++ = '\0'; + imgp->ip_strspace--; + imgp->ip_argspace--; + } + + /* Note where the envv ends and applev begins. */ + imgp->ip_endenvv = imgp->ip_strendp; + + /* + * From now on, we are no longer charging argument + * space to ip_argspace. + */ + bad: return error; } +static char * +random_hex_str(char *str, int len) +{ + uint64_t low, high, value; + int idx; + char digit; + + /* A 64-bit value will only take 16 characters, plus '0x' and NULL. 
*/ + if (len > 19) + len = 19; + + /* We need enough room for at least 1 digit */ + if (len < 4) + return (NULL); + + low = random(); + high = random(); + value = high << 32 | low; + + str[0] = '0'; + str[1] = 'x'; + for (idx = 2; idx < len - 1; idx++) { + digit = value & 0xf; + value = value >> 4; + if (digit < 10) + str[idx] = '0' + digit; + else + str[idx] = 'a' + (digit - 10); + } + str[idx] = '\0'; + return (str); +} + +/* + * Libc has an 8-element array set up for stack guard values. It only fills + * in one of those entries, and both gcc and llvm seem to use only a single + * 8-byte guard. Until somebody needs more than an 8-byte guard value, don't + * do the work to construct them. + */ +#define GUARD_VALUES 1 +#define GUARD_KEY "stack_guard=" + +/* + * System malloc needs some entropy when it is initialized. + */ +#define ENTROPY_VALUES 2 +#define ENTROPY_KEY "malloc_entropy=" + +/* + * Build up the contents of the apple[] string vector + */ +static int +exec_add_apple_strings(struct image_params *imgp) +{ + int i, error; + int new_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT) ? 8 : 4; + char guard[19]; + char guard_vec[strlen(GUARD_KEY) + 19 * GUARD_VALUES + 1]; + + char entropy[19]; + char entropy_vec[strlen(ENTROPY_KEY) + 19 * ENTROPY_VALUES + 1]; + + /* exec_save_path stored the first string */ + imgp->ip_applec = 1; + + /* + * Supply libc with a collection of random values to use when + * implementing -fstack-protector. + */ + (void)strlcpy(guard_vec, GUARD_KEY, sizeof (guard_vec)); + for (i = 0; i < GUARD_VALUES; i++) { + random_hex_str(guard, sizeof (guard)); + if (i) + (void)strlcat(guard_vec, ",", sizeof (guard_vec)); + (void)strlcat(guard_vec, guard, sizeof (guard_vec)); + } + + error = exec_add_user_string(imgp, CAST_USER_ADDR_T(guard_vec), UIO_SYSSPACE, FALSE); + if (error) + goto bad; + imgp->ip_applec++; + + /* + * Supply libc with entropy for system malloc. 
+ */ + (void)strlcpy(entropy_vec, ENTROPY_KEY, sizeof(entropy_vec)); + for (i = 0; i < ENTROPY_VALUES; i++) { + random_hex_str(entropy, sizeof (entropy)); + if (i) + (void)strlcat(entropy_vec, ",", sizeof (entropy_vec)); + (void)strlcat(entropy_vec, entropy, sizeof (entropy_vec)); + } + + error = exec_add_user_string(imgp, CAST_USER_ADDR_T(entropy_vec), UIO_SYSSPACE, FALSE); + if (error) + goto bad; + imgp->ip_applec++; + + /* Align the tail of the combined applev area */ + while (imgp->ip_strspace % new_ptr_size != 0) { + *imgp->ip_strendp++ = '\0'; + imgp->ip_strspace--; + } + +bad: + return error; +} #define unix_stack_size(p) (p->p_rlimit[RLIMIT_STACK].rlim_cur) /* * exec_check_permissions * - * Decription: Verify that the file that is being attempted to be executed + * Description: Verify that the file that is being attempted to be executed * is in fact allowed to be executed based on it POSIX file * permissions and other access control criteria * @@ -2658,7 +3076,7 @@ exec_check_permissions(struct image_params *imgp) * will always succeed, and we don't want to happen unless the * file really is executable. */ - if ((vap->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) + if (!vfs_authopaque(vnode_mount(vp)) && ((vap->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)) return (EACCES); /* Disallow zero length files */ @@ -2705,7 +3123,7 @@ exec_check_permissions(struct image_params *imgp) * cached values, then we set the PowerPC environment flag. 
*/ if (vap->va_fsid == exec_archhandler_ppc.fsid && - vap->va_fileid == (uint64_t)((uint32_t)exec_archhandler_ppc.fileid)) { + vap->va_fileid == exec_archhandler_ppc.fileid) { imgp->ip_flags |= IMGPF_POWERPC; } #endif /* IMGPF_POWERPC */ @@ -2790,7 +3208,7 @@ exec_handle_sugid(struct image_params *imgp) kauth_cred_getuid(cred) != imgp->ip_origvattr->va_uid) || ((imgp->ip_origvattr->va_mode & VSGID) != 0 && ((kauth_cred_ismember_gid(cred, imgp->ip_origvattr->va_gid, &leave_sugid_clear) || !leave_sugid_clear) || - (cred->cr_gid != imgp->ip_origvattr->va_gid)))) { + (kauth_cred_getgid(cred) != imgp->ip_origvattr->va_gid)))) { #if CONFIG_MACF /* label for MAC transition and neither VSUID nor VSGID */ @@ -2815,9 +3233,13 @@ exec_handle_sugid(struct image_params *imgp) */ if (imgp->ip_origvattr->va_mode & VSUID) { p->p_ucred = kauth_cred_setresuid(p->p_ucred, KAUTH_UID_NONE, imgp->ip_origvattr->va_uid, imgp->ip_origvattr->va_uid, KAUTH_UID_NONE); + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(p); } if (imgp->ip_origvattr->va_mode & VSGID) { p->p_ucred = kauth_cred_setresgid(p->p_ucred, KAUTH_GID_NONE, imgp->ip_origvattr->va_gid, imgp->ip_origvattr->va_gid); + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(p); } #if CONFIG_MACF @@ -2878,7 +3300,7 @@ exec_handle_sugid(struct image_params *imgp) if (dev_null == NULLVP) { struct nameidata nd1; - NDINIT(&nd1, LOOKUP, FOLLOW, UIO_SYSSPACE, + NDINIT(&nd1, LOOKUP, OP_OPEN, FOLLOW, UIO_SYSSPACE, CAST_USER_ADDR_T("/dev/null"), imgp->ip_vfs_context); @@ -2893,9 +3315,8 @@ exec_handle_sugid(struct image_params *imgp) } } - /* Radar 2261856; setuid security hole fix */ - /* Patch from OpenBSD: A. Ramesh */ /* + * Radar 2261856; setuid security hole fix * XXX For setuid processes, attempt to ensure that * stdin, stdout, and stderr are already allocated. 
* We do not want userland to accidentally allocate @@ -2913,7 +3334,7 @@ exec_handle_sugid(struct image_params *imgp) if ((error = falloc(p, &fp, &indx, imgp->ip_vfs_context)) != 0) continue; - if ((error = vnode_ref_ext(dev_null, FREAD)) != 0) { + if ((error = vnode_ref_ext(dev_null, FREAD, 0)) != 0) { fp_free(p, indx, fp); break; } @@ -2958,7 +3379,9 @@ exec_handle_sugid(struct image_params *imgp) * Implement the semantic where the effective user and group become * the saved user and group in exec'ed programs. */ - p->p_ucred = kauth_cred_setsvuidgid(p->p_ucred, kauth_cred_getuid(p->p_ucred), p->p_ucred->cr_gid); + p->p_ucred = kauth_cred_setsvuidgid(p->p_ucred, kauth_cred_getuid(p->p_ucred), kauth_cred_getgid(p->p_ucred)); + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(p); /* Update the process' identity version and set the security token */ p->p_idversion++; @@ -3131,7 +3554,7 @@ load_init_program(proc_t p) error = execve(p,&init_exec_args,retval); if (error) - panic("Process 1 exec of %s failed, errno %d\n", + panic("Process 1 exec of %s failed, errno %d", init_program_name, error); } @@ -3188,8 +3611,6 @@ load_return_to_errno(load_return_t lrtn) #include #include -extern semaphore_t execve_semaphore; - /* * execargs_alloc * @@ -3244,7 +3665,7 @@ execargs_lock_sleep(void) { static kern_return_t execargs_purgeable_allocate(char **execarg_address) { - kern_return_t kr = vm_allocate(bsd_pageable_map, (vm_offset_t *)execarg_address, NCARGS + PAGE_SIZE, VM_FLAGS_ANYWHERE | VM_FLAGS_PURGABLE); + kern_return_t kr = vm_allocate(bsd_pageable_map, (vm_offset_t *)execarg_address, BSD_PAGEABLE_SIZE_PER_EXEC, VM_FLAGS_ANYWHERE | VM_FLAGS_PURGABLE); assert(kr == KERN_SUCCESS); return kr; } @@ -3315,7 +3736,11 @@ execargs_alloc(struct image_params *imgp) return (ENOMEM); } - imgp->ip_vdata = imgp->ip_strings + NCARGS; + /* last page used to read in file headers */ + imgp->ip_vdata = imgp->ip_strings + ( NCARGS + PAGE_SIZE ); + imgp->ip_strendp = imgp->ip_strings; + 
imgp->ip_argspace = NCARGS; + imgp->ip_strspace = ( NCARGS + PAGE_SIZE ); return (0); } @@ -3404,8 +3829,11 @@ exec_resettextvp(proc_t p, struct image_params *imgp) static int check_for_signature(proc_t p, struct image_params *imgp) { + void *blob = NULL; + size_t length = 0; mach_port_t port = NULL; - kern_return_t error = 0; + kern_return_t kr = KERN_FAILURE; + int error = EACCES; unsigned char hash[SHA1_RESULTLEN]; /* @@ -3422,35 +3850,56 @@ check_for_signature(proc_t p, struct image_params *imgp) vm_map_switch_protect(get_task_map(p->task), TRUE); } - /* - * If the task_access_port is set and the proc isn't signed, - * ask for a code signature from user space. Fail the exec - * if permission is denied. - */ - error = task_get_task_access_port(p->task, &port); - if (error == 0 && IPC_PORT_VALID(port) && !(p->p_csflags & CS_VALID)) { - error = find_code_signature(port, p->p_pid); - if (error == KERN_FAILURE) { - /* Make very sure execution fails */ - psignal(p, SIGKILL); - return EACCES; - } + /* If the process is not signed or if it contains + * entitlements, we need to communicate through the + * task_access_port to taskgated. taskgated will provide a + * detached code signature if present, and will enforce any + * restrictions on entitlements. taskgated returns + * KERN_SUCCESS if it has completed its work and the exec + * should continue, or KERN_FAILURE if the exec should fail. + */ + error = cs_entitlements_blob_get(p, &blob, &length); - /* Only do this if exec_resettextvp() did not fail */ - if (p->p_textvp != NULLVP) { - /* - * If there's a new code directory, mark this process - * as signed. 
- */ - error = ubc_cs_getcdhash(p->p_textvp, p->p_textoff, hash); - if (error == 0) { - proc_lock(p); - p->p_csflags |= CS_VALID; - proc_unlock(p); - } + /* if signed and no entitlements, then we're done here */ + if ((p->p_csflags & CS_VALID) && NULL == blob) { + error = 0; + goto done; + } + + kr = task_get_task_access_port(p->task, &port); + if (KERN_SUCCESS != kr || !IPC_PORT_VALID(port)) { + error = 0; +#if !CONFIG_EMBEDDED + /* fatal on the desktop when entitlements are present */ + if (NULL != blob) + error = EACCES; +#endif + goto done; + } + + kr = find_code_signature(port, p->p_pid); + if (KERN_SUCCESS != kr) { + error = EACCES; + goto done; + } + + /* Only do this if exec_resettextvp() did not fail */ + if (p->p_textvp != NULLVP) { + /* + * If there's a new code directory, mark this process + * as signed. + */ + if (0 == ubc_cs_getcdhash(p->p_textvp, p->p_textoff, hash)) { + proc_lock(p); + p->p_csflags |= CS_VALID; + proc_unlock(p); } } - return KERN_SUCCESS; +done: + if (0 != error) + /* make very sure execution fails */ + psignal(p, SIGKILL); + return error; } diff --git a/bsd/kern/kern_exit.c b/bsd/kern/kern_exit.c index 811b4fb7d..7d5ddd37b 100644 --- a/bsd/kern/kern_exit.c +++ b/bsd/kern/kern_exit.c @@ -158,7 +158,6 @@ int *get_bsduthreadrval(thread_t); kern_return_t sys_perf_notify(thread_t thread, int pid); kern_return_t abnormal_exit_notify(mach_exception_data_type_t code, mach_exception_data_type_t subcode); -void workqueue_exit(struct proc *); void delay(int); /* @@ -256,8 +255,10 @@ exit1(proc_t p, int rv, int *retval) DTRACE_PROC1(exit, int, CLD_EXITED); proc_lock(p); + proc_transstart(p, 1); while (p->exit_thread != self) { if (sig_try_locked(p) <= 0) { + proc_transend(p, 1); if (get_threadtask(self) != task) { proc_unlock(p); return(0); @@ -283,11 +284,12 @@ exit1(proc_t p, int rv, int *retval) p->p_lflag |= P_LEXIT; p->p_xstat = rv; + proc_transend(p, 1); proc_unlock(p); proc_prepareexit(p, rv); - /* task terminate will call 
proc_terminate and that cleans it up */ + /* Last thread to terminate will call proc_exit() */ task_terminate_internal(task); return(0); @@ -372,21 +374,39 @@ proc_exit(proc_t p) pid_t pid; int exitval; - /* This can happen if thread_terminate of the single thread - * process - */ - uth = (struct uthread *)get_bsdthread_info(current_thread()); proc_lock(p); + proc_transstart(p, 1); if( !(p->p_lflag & P_LEXIT)) { + /* + * This can happen if a thread_terminate() occurs + * in a single-threaded process. + */ p->p_lflag |= P_LEXIT; + proc_transend(p, 1); proc_unlock(p); proc_prepareexit(p, 0); + (void) task_terminate_internal(task); proc_lock(p); + } else { + proc_transend(p, 1); } p->p_lflag |= P_LPEXIT; + + /* + * Other kernel threads may be in the middle of signalling this process. + * Wait for those threads to wrap it up before making the process + * disappear on them. + */ + if ((p->p_lflag & P_LINSIGNAL) || (p->p_sigwaitcnt > 0)) { + p->p_sigwaitcnt++; + while ((p->p_lflag & P_LINSIGNAL) || (p->p_sigwaitcnt > 1)) + msleep(&p->p_sigmask, &p->p_mlock, PWAIT, "proc_sigdrain", NULL); + p->p_sigwaitcnt--; + } + proc_unlock(p); pid = p->p_pid; exitval = p->p_xstat; @@ -429,6 +449,8 @@ proc_exit(proc_t p) MALLOC_ZONE(p->p_ru, struct rusage *, sizeof (*p->p_ru), M_ZOMBIE, M_WAITOK); + nspace_proc_exit(p); + /* * need to cancel async IO requests that can be cancelled and wait for those * already active. MAY BLOCK! @@ -575,7 +597,7 @@ proc_exit(proc_t p) * if the reap is already in progress. So we get * the reference here exclusively and their can be * no waiters. So there is no need for a wakeup - * after we are done. AlsO the reap frees the structure + * after we are done. Also the reap frees the structure * and the proc struct cannot be used for wakeups as well. * It is safe to use q here as this is system reap */ @@ -587,10 +609,21 @@ proc_exit(proc_t p) * since their existence means someone is messing up. 
*/ if (q->p_lflag & P_LTRACED) { + /* + * Take a reference on the child process to + * ensure it doesn't exit and disappear between + * the time we drop the list_lock and attempt + * to acquire its proc_lock. + */ + if (proc_ref_locked(q) != q) + continue; + proc_list_unlock(); proc_lock(q); q->p_lflag &= ~P_LTRACED; if (q->sigwait_thread) { + thread_t thread = q->sigwait_thread; + proc_unlock(q); /* * The sigwait_thread could be stopped at a @@ -599,13 +632,16 @@ proc_exit(proc_t p) * the first thread in the task. So any attempts to kill * the process would result into a deadlock on q->sigwait. */ - thread_resume((thread_t)q->sigwait_thread); - clear_wait(q->sigwait_thread, THREAD_INTERRUPTED); - threadsignal((thread_t)q->sigwait_thread, SIGKILL, 0); - } else + thread_resume(thread); + clear_wait(thread, THREAD_INTERRUPTED); + threadsignal(thread, SIGKILL, 0); + } else { proc_unlock(q); + } + psignal(q, SIGKILL); proc_list_lock(); + proc_rele_locked(q); } } } @@ -629,10 +665,9 @@ proc_exit(proc_t p) */ /* No need for locking here as no one than this thread can access this */ if (p->p_ru != NULL) { + calcru(p, &p->p_stats->p_ru.ru_utime, &p->p_stats->p_ru.ru_stime, NULL); *p->p_ru = p->p_stats->p_ru; - calcru(p, &p->p_ru->ru_utime, &p->p_ru->ru_stime, NULL); - ruadd(p->p_ru, &p->p_stats->p_cru); } @@ -689,7 +724,8 @@ proc_exit(proc_t p) p->task = TASK_NULL; set_bsdtask_info(task, NULL); - proc_knote(p, NOTE_EXIT); + /* exit status will be seen by parent process */ + proc_knote(p, NOTE_EXIT | (p->p_xstat & 0xffff)); /* mark the thread as the one that is doing proc_exit * no need to hold proc lock in uthread_free @@ -737,7 +773,7 @@ proc_exit(proc_t p) * p_ucred usage is safe as it is an exiting process * and reference is dropped in reap */ - pp->si_uid = p->p_ucred->cr_ruid; + pp->si_uid = kauth_cred_getruid(p->p_ucred); proc_unlock(pp); } /* mark as a zombie */ @@ -855,7 +891,7 @@ reap_child_locked(proc_t parent, proc_t child, int deadparent, int locked, int d 
trace_parent->si_pid = child->p_pid; trace_parent->si_status = child->p_xstat; trace_parent->si_code = CLD_CONTINUED; - trace_parent->si_uid = child->p_ucred->cr_ruid; + trace_parent->si_uid = kauth_cred_getruid(child->p_ucred); proc_unlock(trace_parent); } proc_reparentlocked(child, trace_parent, 1, 0); @@ -899,7 +935,7 @@ reap_child_locked(proc_t parent, proc_t child, int deadparent, int locked, int d printf("Warning : lost p_ru for %s\n", child->p_comm); } - AUDIT_SESSION_PROCEXIT(child->p_ucred); + AUDIT_SESSION_PROCEXIT(child); /* * Decrement the count of procs running with this uid. @@ -907,7 +943,7 @@ reap_child_locked(proc_t parent, proc_t child, int deadparent, int locked, int d * and refernce is dropped after these calls down below * (locking protection is provided by list lock held in chgproccnt) */ - (void)chgproccnt(child->p_ucred->cr_ruid, -1); + (void)chgproccnt(kauth_cred_getruid(child->p_ucred), -1); #if CONFIG_LCTX ALLLCTX_LOCK; @@ -948,22 +984,21 @@ reap_child_locked(proc_t parent, proc_t child, int deadparent, int locked, int d proc_list_unlock(); -#ifdef CONFIG_EMBEDDED - lck_mtx_destroy(&child->p_mlock, proc_lck_grp); - lck_mtx_destroy(&child->p_fdmlock, proc_lck_grp); -#if CONFIG_DTRACE - lck_mtx_destroy(&child->p_dtrace_sprlock, proc_lck_grp); -#endif - lck_spin_destroy(&child->p_slock, proc_lck_grp); - -#else +#if CONFIG_FINE_LOCK_GROUPS lck_mtx_destroy(&child->p_mlock, proc_mlock_grp); lck_mtx_destroy(&child->p_fdmlock, proc_fdmlock_grp); #if CONFIG_DTRACE lck_mtx_destroy(&child->p_dtrace_sprlock, proc_lck_grp); #endif lck_spin_destroy(&child->p_slock, proc_slock_grp); +#else /* CONFIG_FINE_LOCK_GROUPS */ + lck_mtx_destroy(&child->p_mlock, proc_lck_grp); + lck_mtx_destroy(&child->p_fdmlock, proc_lck_grp); +#if CONFIG_DTRACE + lck_mtx_destroy(&child->p_dtrace_sprlock, proc_lck_grp); #endif + lck_spin_destroy(&child->p_slock, proc_lck_grp); +#endif /* CONFIG_FINE_LOCK_GROUPS */ workqueue_destroy_lock(child); FREE_ZONE(child, sizeof *child, 
M_PROC); @@ -1754,6 +1789,8 @@ vproc_exit(proc_t p) proc_lock(q); q->p_lflag &= ~P_LTRACED; if (q->sigwait_thread) { + thread_t thread = q->sigwait_thread; + proc_unlock(q); /* * The sigwait_thread could be stopped at a @@ -1762,12 +1799,13 @@ vproc_exit(proc_t p) * the first thread in the task. So any attempts to kill * the process would result into a deadlock on q->sigwait. */ - thread_resume((thread_t)q->sigwait_thread); - clear_wait(q->sigwait_thread, THREAD_INTERRUPTED); - threadsignal((thread_t)q->sigwait_thread, SIGKILL, 0); - } else + thread_resume(thread); + clear_wait(thread, THREAD_INTERRUPTED); + threadsignal(thread, SIGKILL, 0); + } else { proc_unlock(q); - + } + psignal(q, SIGKILL); proc_list_lock(); } @@ -1844,6 +1882,10 @@ vproc_exit(proc_t p) } } +#if PSYNCH + pth_proc_hashdelete(p); +#endif /* PSYNCH */ + /* * Other substructures are freed from wait(). */ @@ -1877,7 +1919,7 @@ vproc_exit(proc_t p) * p_ucred usage is safe as it is an exiting process * and reference is dropped in reap */ - pp->si_uid = p->p_ucred->cr_ruid; + pp->si_uid = kauth_cred_getruid(p->p_ucred); proc_unlock(pp); } /* mark as a zombie */ diff --git a/bsd/kern/kern_fork.c b/bsd/kern/kern_fork.c index a5b1350d3..7746398bf 100644 --- a/bsd/kern/kern_fork.c +++ b/bsd/kern/kern_fork.c @@ -129,7 +129,6 @@ extern void dtrace_lazy_dofs_duplicate(proc_t, proc_t); #include - /* XXX routines which should have Mach prototypes, but don't */ void thread_set_parent(thread_t parent, int pid); extern void act_thread_catt(void *ctx); @@ -365,7 +364,7 @@ fork1(proc_t parent_proc, thread_t *child_threadp, int kind) * exceed the limit. The variable nprocs is the current number of * processes, maxproc is the limit. 
*/ - uid = kauth_cred_get()->cr_ruid; + uid = kauth_getruid(); proc_list_lock(); if ((nprocs >= maxproc - 1 && uid != 0) || nprocs >= maxproc) { proc_list_unlock(); @@ -466,7 +465,6 @@ fork1(proc_t parent_proc, thread_t *child_threadp, int kind) AUDIT_ARG(pid, child_proc->p_pid); - AUDIT_SESSION_PROCNEW(child_proc->p_ucred); // XXX END: wants to move to be common code (and safe) /* @@ -570,7 +568,6 @@ fork1(proc_t parent_proc, thread_t *child_threadp, int kind) AUDIT_ARG(pid, child_proc->p_pid); - AUDIT_SESSION_PROCNEW(child_proc->p_ucred); // XXX END: wants to move to be common code (and safe) /* @@ -690,7 +687,6 @@ vfork_return(proc_t child_proc, int32_t *retval, int rval) thread_t parent_thread = (thread_t)current_thread(); uthread_t parent_uthread = (uthread_t)get_bsdthread_info(parent_thread); - act_thread_catt(parent_uthread->uu_userstate); /* end vfork in parent */ @@ -948,14 +944,6 @@ cloneproc(task_t parent_task, proc_t parent_proc, int inherit_memory) if (parent_proc->p_flag & P_LP64) { task_set_64bit(child_task, TRUE); OSBitOrAtomic(P_LP64, (UInt32 *)&child_proc->p_flag); -#ifdef __ppc__ - /* - * PPC51: ppc64 is limited to 51-bit addresses. - * Memory above that limit is handled specially at - * the pmap level. 
- */ - pmap_map_sharedpage(child_task, get_map_pmap(get_task_map(child_task))); -#endif /* __ppc__ */ } else { task_set_64bit(child_task, FALSE); OSBitAndAtomic(~((uint32_t)P_LP64), (UInt32 *)&child_proc->p_flag); @@ -1031,6 +1019,9 @@ forkproc_free(proc_t p) /* Stop the profiling clock */ stopprofclock(p); + /* Update the audit session proc count */ + AUDIT_SESSION_PROCEXIT(p); + /* Release the credential reference */ kauth_cred_unref(&p->p_ucred); @@ -1069,6 +1060,7 @@ forkproc(proc_t parent_proc) { proc_t child_proc; /* Our new process */ static int nextpid = 0, pidwrap = 0, nextpidversion = 0; + static uint64_t nextuniqueid = 0; int error = 0; struct session *sessp; uthread_t parent_uthread = (uthread_t)get_bsdthread_info(current_thread()); @@ -1147,6 +1139,8 @@ forkproc(proc_t parent_proc) nprocs++; child_proc->p_pid = nextpid; child_proc->p_idversion = nextpidversion++; + /* kernel process is handcrafted and not from fork, so start from 1 */ + child_proc->p_uniqueid = ++nextuniqueid; #if 1 if (child_proc->p_pid != 0) { if (pfind_locked(child_proc->p_pid) != PROC_NULL) @@ -1180,7 +1174,7 @@ forkproc(proc_t parent_proc) * Increase reference counts on shared objects. * The p_stats and p_sigacts substructs are set in vm_fork. */ - child_proc->p_flag = (parent_proc->p_flag & (P_LP64 | P_TRANSLATED | P_AFFINITY)); + child_proc->p_flag = (parent_proc->p_flag & (P_LP64 | P_TRANSLATED | P_AFFINITY | P_DISABLE_ASLR)); if (parent_proc->p_flag & P_PROFIL) startprofclock(child_proc); /* @@ -1188,22 +1182,26 @@ forkproc(proc_t parent_proc) * credential will be granted to the new process. 
*/ child_proc->p_ucred = kauth_cred_get_with_ref(); + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(child_proc); + /* update audit session proc count */ + AUDIT_SESSION_PROCNEW(child_proc); -#ifdef CONFIG_EMBEDDED - lck_mtx_init(&child_proc->p_mlock, proc_lck_grp, proc_lck_attr); - lck_mtx_init(&child_proc->p_fdmlock, proc_lck_grp, proc_lck_attr); -#if CONFIG_DTRACE - lck_mtx_init(&child_proc->p_dtrace_sprlock, proc_lck_grp, proc_lck_attr); -#endif - lck_spin_init(&child_proc->p_slock, proc_lck_grp, proc_lck_attr); -#else /* !CONFIG_EMBEDDED */ +#if CONFIG_FINE_LOCK_GROUPS lck_mtx_init(&child_proc->p_mlock, proc_mlock_grp, proc_lck_attr); lck_mtx_init(&child_proc->p_fdmlock, proc_fdmlock_grp, proc_lck_attr); #if CONFIG_DTRACE lck_mtx_init(&child_proc->p_dtrace_sprlock, proc_lck_grp, proc_lck_attr); #endif lck_spin_init(&child_proc->p_slock, proc_slock_grp, proc_lck_attr); -#endif /* !CONFIG_EMBEDDED */ +#else /* !CONFIG_FINE_LOCK_GROUPS */ + lck_mtx_init(&child_proc->p_mlock, proc_lck_grp, proc_lck_attr); + lck_mtx_init(&child_proc->p_fdmlock, proc_lck_grp, proc_lck_attr); +#if CONFIG_DTRACE + lck_mtx_init(&child_proc->p_dtrace_sprlock, proc_lck_grp, proc_lck_attr); +#endif + lck_spin_init(&child_proc->p_slock, proc_lck_grp, proc_lck_attr); +#endif /* !CONFIG_FINE_LOCK_GROUPS */ klist_init(&child_proc->p_klist); if (child_proc->p_textvp != NULLVP) { @@ -1396,6 +1394,7 @@ uthread_alloc(task_t task, thread_t thread, int noinherit) p = (proc_t) get_bsdtask_info(task); uth = (uthread_t)ut; + uth->uu_kwe.kwe_uth = uth; /* * Thread inherits credential from the creating thread, if both diff --git a/bsd/kern/kern_lockf.c b/bsd/kern/kern_lockf.c index 31b25d885..b7775864c 100644 --- a/bsd/kern/kern_lockf.c +++ b/bsd/kern/kern_lockf.c @@ -89,7 +89,7 @@ static int maxlockdepth = MAXDEPTH; void lf_print(const char *tag, struct lockf *lock); void lf_printlist(const char *tag, struct lockf *lock); static int lockf_debug = 2; -SYSCTL_INT(_debug, OID_AUTO, lockf_debug, 
CTLFLAG_RW, &lockf_debug, 0, ""); +SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &lockf_debug, 0, ""); /* * If there is no mask bit selector, or there is on, and the selector is @@ -129,11 +129,13 @@ static overlap_t lf_findoverlap(struct lockf *, struct lockf *, int, struct lockf ***, struct lockf **); static struct lockf *lf_getblock(struct lockf *); static int lf_getlock(struct lockf *, struct flock *); +#if CONFIG_EMBEDDED +static int lf_getlockpid(struct vnode *, struct flock *); +#endif static int lf_setlock(struct lockf *); static int lf_split(struct lockf *, struct lockf *); static void lf_wakelock(struct lockf *, boolean_t); - /* * lf_advlock * @@ -172,6 +174,11 @@ lf_advlock(struct vnop_advlock_args *ap) /* XXX HFS may need a !vnode_isreg(vp) EISDIR error here */ +#if CONFIG_EMBEDDED + if (ap->a_op == F_GETLKPID) + return lf_getlockpid(vp, fl); +#endif + /* * Avoid the common case of unlocking when inode has no locks. */ @@ -289,7 +296,7 @@ lf_advlock(struct vnop_advlock_args *ap) error = EINVAL; break; } - lck_mtx_unlock(&vp->v_lock); /* done maniplulating the list */ + lck_mtx_unlock(&vp->v_lock); /* done manipulating the list */ LOCKF_DEBUG(0, "lf_advlock: normal exit: %d\n\n", error); return (error); @@ -297,25 +304,42 @@ lf_advlock(struct vnop_advlock_args *ap) /* - * lf_coelesce_adjacent + * Take any lock attempts which are currently blocked by a given lock ("from") + * and mark them as blocked by a different lock ("to"). Used in the case + * where a byte range currently occupied by "from" is to be occupied by "to." 
+ */ +static void +lf_move_blocked(struct lockf *to, struct lockf *from) +{ + struct lockf *tlock; + + TAILQ_FOREACH(tlock, &from->lf_blkhd, lf_block) { + tlock->lf_next = to; + } + + TAILQ_CONCAT(&to->lf_blkhd, &from->lf_blkhd, lf_block); +} + +/* + * lf_coalesce_adjacent * - * Description: Helper function: when setting a lock, coelesce adjacent + * Description: Helper function: when setting a lock, coalesce adjacent * locks. Needed because adjacent locks are not overlapping, - * but POSIX requires that they be coelesced. + * but POSIX requires that they be coalesced. * * Parameters: lock The new lock which may be adjacent - * to already locked reagions, and which - * should therefore be coelesced with them + * to already locked regions, and which + * should therefore be coalesced with them * * Returns: */ static void -lf_coelesce_adjacent(struct lockf *lock) +lf_coalesce_adjacent(struct lockf *lock) { struct lockf **lf = lock->lf_head; while (*lf != NOLOCKF) { - /* reject locks that obviously could not be coelesced */ + /* reject locks that obviously could not be coalesced */ if ((*lf == lock) || ((*lf)->lf_id != lock->lf_id) || ((*lf)->lf_type != lock->lf_type)) { @@ -323,27 +347,38 @@ lf_coelesce_adjacent(struct lockf *lock) continue; } + /* + * NOTE: Assumes that if two locks are adjacent on the number line + * and belong to the same owner, then they are adjacent on the list. 
+ */ + /* If the lock ends adjacent to us, we can coelesce it */ if ((*lf)->lf_end != -1 && ((*lf)->lf_end + 1) == lock->lf_start) { struct lockf *adjacent = *lf; - LOCKF_DEBUG(0, "lf_coelesce_adjacent: coelesce adjacent previous\n"); + LOCKF_DEBUG(0, "lf_coalesce_adjacent: coalesce adjacent previous\n"); lock->lf_start = (*lf)->lf_start; *lf = lock; lf = &(*lf)->lf_next; + + lf_move_blocked(lock, adjacent); + FREE(adjacent, M_LOCKF); continue; } - /* If the lock starts adjacent to us, we can coelesce it */ + /* If the lock starts adjacent to us, we can coalesce it */ if (lock->lf_end != -1 && (lock->lf_end + 1) == (*lf)->lf_start) { struct lockf *adjacent = *lf; - LOCKF_DEBUG(0, "lf_coelesce_adjacent: coelesce adjacent following\n"); + LOCKF_DEBUG(0, "lf_coalesce_adjacent: coalesce adjacent following\n"); lock->lf_end = (*lf)->lf_end; lock->lf_next = (*lf)->lf_next; lf = &lock->lf_next; + + lf_move_blocked(lock, adjacent); + FREE(adjacent, M_LOCKF); continue; } @@ -373,7 +408,7 @@ lf_coelesce_adjacent(struct lockf *lock) * msleep:EINTR * * Notes: We add the lock to the provisional lock list. We do not - * coelesce at this time; this has implications for other lock + * coalesce at this time; this has implications for other lock * requestors in the blocker search mechanism. 
*/ static int @@ -518,13 +553,8 @@ lf_setlock(struct lockf *lock) error = msleep(lock, &vp->v_lock, priority, lockstr, 0); if (!TAILQ_EMPTY(&lock->lf_blkhd)) { - struct lockf *tlock; - if ((block = lf_getblock(lock))) { - TAILQ_FOREACH(tlock, &lock->lf_blkhd, lf_block) { - tlock->lf_next = block; - } - TAILQ_CONCAT(&block->lf_blkhd, &lock->lf_blkhd, lf_block); + lf_move_blocked(block, lock); } } if (error) { /* XXX */ @@ -589,7 +619,7 @@ lf_setlock(struct lockf *lock) lf_wakelock(overlap, TRUE); overlap->lf_type = lock->lf_type; FREE(lock, M_LOCKF); - lock = overlap; /* for lf_coelesce_adjacent() */ + lock = overlap; /* for lf_coalesce_adjacent() */ break; case OVERLAP_CONTAINS_LOCK: @@ -598,7 +628,7 @@ lf_setlock(struct lockf *lock) */ if (overlap->lf_type == lock->lf_type) { FREE(lock, M_LOCKF); - lock = overlap; /* for lf_coelesce_adjacent() */ + lock = overlap; /* for lf_coalesce_adjacent() */ break; } if (overlap->lf_start == lock->lf_start) { @@ -676,8 +706,8 @@ lf_setlock(struct lockf *lock) } break; } - /* Coelesce adjacent locks with identical attributes */ - lf_coelesce_adjacent(lock); + /* Coalesce adjacent locks with identical attributes */ + lf_coalesce_adjacent(lock); #ifdef LOCKF_DEBUGGING if (lockf_debug & 1) { lf_print("lf_setlock: got the lock", lock); @@ -825,6 +855,55 @@ lf_getlock(struct lockf *lock, struct flock *fl) return (0); } +#if CONFIG_EMBEDDED +int lf_getlockpid(struct vnode *vp, struct flock *fl) +{ + struct lockf *lf, *blk; + + if (vp == 0) + return EINVAL; + + fl->l_type = F_UNLCK; + + lck_mtx_lock(&vp->v_lock); + + for (lf = vp->v_lockf; lf; lf = lf->lf_next) { + + if (lf->lf_flags & F_POSIX) { + if ((((struct proc *)lf->lf_id)->p_pid) == fl->l_pid) { + fl->l_type = lf->lf_type; + fl->l_whence = SEEK_SET; + fl->l_start = lf->lf_start; + if (lf->lf_end == -1) + fl->l_len = 0; + else + fl->l_len = lf->lf_end - lf->lf_start + 1; + + break; + } + } + + TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) { + if (blk->lf_flags & F_POSIX) { + if 
((((struct proc *)blk->lf_id)->p_pid) == fl->l_pid) { + fl->l_type = blk->lf_type; + fl->l_whence = SEEK_SET; + fl->l_start = blk->lf_start; + if (blk->lf_end == -1) + fl->l_len = 0; + else + fl->l_len = blk->lf_end - blk->lf_start + 1; + + break; + } + } + } + } + + lck_mtx_unlock(&vp->v_lock); + return (0); +} +#endif /* * lf_getblock @@ -901,7 +980,7 @@ lf_getblock(struct lockf *lock) * while lf_setlock will iterate over all overlapping locks to * * The check parameter can be SELF, meaning we are looking for - * overelapping locks owned by us, or it can be OTHERS, meaning + * overlapping locks owned by us, or it can be OTHERS, meaning * we are looking for overlapping locks owned by someone else so * we can report a blocking lock on an F_GETLK request. * @@ -913,6 +992,7 @@ lf_findoverlap(struct lockf *lf, struct lockf *lock, int type, struct lockf ***prev, struct lockf **overlap) { off_t start, end; + int found_self = 0; *overlap = lf; if (lf == NOLOCKF) @@ -926,10 +1006,28 @@ lf_findoverlap(struct lockf *lf, struct lockf *lock, int type, while (lf != NOLOCKF) { if (((type & SELF) && lf->lf_id != lock->lf_id) || ((type & OTHERS) && lf->lf_id == lock->lf_id)) { + /* + * Locks belonging to one process are adjacent on the + * list, so if we've found any locks belonging to us, + * and we're now seeing something else, then we've + * examined all "self" locks. Note that bailing out + * here is quite important; for coalescing, we assume + * numerically adjacent locks from the same owner to + * be adjacent on the list. 
+ */ + if ((type & SELF) && found_self) { + return OVERLAP_NONE; + } + *prev = &lf->lf_next; *overlap = lf = lf->lf_next; continue; } + + if ((type & SELF)) { + found_self = 1; + } + #ifdef LOCKF_DEBUGGING if (lockf_debug & 2) lf_print("\tchecking", lf); @@ -941,6 +1039,11 @@ lf_findoverlap(struct lockf *lf, struct lockf *lock, int type, (end != -1 && lf->lf_start > end)) { /* Case 0 */ LOCKF_DEBUG(2, "no overlap\n"); + + /* + * NOTE: assumes that locks for the same process are + * nonintersecting and ordered. + */ if ((type & SELF) && end != -1 && lf->lf_start > end) return (OVERLAP_NONE); *prev = &lf->lf_next; diff --git a/bsd/kern/kern_malloc.c b/bsd/kern/kern_malloc.c index ff86bfff6..c1700ee51 100644 --- a/bsd/kern/kern_malloc.c +++ b/bsd/kern/kern_malloc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -293,9 +293,15 @@ const char *memname[] = { #else "", /* 109 M_DECMPFS_CNODE */ #endif /* HFS_COMPRESSION */ + "ipmfilter", /* 110 M_INMFILTER */ + "ipmsource", /* 111 M_IPMSOURCE */ + "in6mfilter", /* 112 M_IN6MFILTER */ + "ip6mopts", /* 113 M_IP6MOPTS */ + "ip6msource", /* 114 M_IP6MSOURCE */ }; /* for use with kmzones.kz_zalloczone */ +#define KMZ_CREATEZONE_ACCT ((void *)-3) #define KMZ_CREATEZONE ((void *)-2) #define KMZ_LOOKUPZONE ((void *)-1) #define KMZ_MALLOC ((void *)0) @@ -332,7 +338,7 @@ struct kmzones { { 0, KMZ_MALLOC, FALSE }, /* 21 M_FHANDLE */ #if (NFSCLIENT || NFSSERVER) { SOS(nfsreq), KMZ_CREATEZONE, FALSE }, /* 22 M_NFSREQ */ - { SOS(nfsmount), KMZ_CREATEZONE, FALSE },/* 23 M_NFSMNT */ + { SOS(nfsmount),KMZ_CREATEZONE, FALSE }, /* 23 M_NFSMNT */ { SOS(nfsnode), KMZ_CREATEZONE, FALSE }, /* 24 M_NFSNODE */ #else { 0, KMZ_MALLOC, FALSE }, /* 22 M_NFSREQ */ @@ -340,25 +346,25 @@ struct kmzones { { 0, KMZ_MALLOC, FALSE }, /* 24 M_NFSNODE */ #endif { SOS(vnode), KMZ_CREATEZONE, TRUE }, /* 25 
M_VNODE */ - { SOS(namecache),KMZ_CREATEZONE, FALSE }, /* 26 M_CACHE */ + { SOS(namecache), KMZ_CREATEZONE, FALSE }, /* 26 M_CACHE */ #if QUOTA { SOX(dquot), KMZ_LOOKUPZONE, FALSE }, /* 27 M_DQUOT */ #else { 0, KMZ_MALLOC, FALSE }, /* 27 M_DQUOT */ #endif { 0, KMZ_MALLOC, FALSE }, /* 28 M_UFSMNT */ - { 0, KMZ_MALLOC, FALSE }, /* 29 M_CGSUM */ + { 0, KMZ_MALLOC, FALSE }, /* 29 M_SHM */ { SOS(plimit), KMZ_CREATEZONE, TRUE }, /* 30 M_PLIMIT */ - { SOS(sigacts), KMZ_CREATEZONE, TRUE }, /* 31 M_SIGACTS */ + { SOS(sigacts), KMZ_CREATEZONE_ACCT, TRUE }, /* 31 M_SIGACTS */ { 0, KMZ_MALLOC, FALSE }, /* 32 M_VMOBJ */ { 0, KMZ_MALLOC, FALSE }, /* 33 M_VMOBJHASH */ { 0, KMZ_MALLOC, FALSE }, /* 34 M_VMPMAP */ { 0, KMZ_MALLOC, FALSE }, /* 35 M_VMPVENT */ { 0, KMZ_MALLOC, FALSE }, /* 36 M_VMPAGER */ { 0, KMZ_MALLOC, FALSE }, /* 37 M_VMPGDATA */ - { SOS(fileproc),KMZ_CREATEZONE, TRUE }, /* 38 M_FILEPROC */ - { SOS(filedesc),KMZ_CREATEZONE, TRUE }, /* 39 M_FILEDESC */ - { SOX(lockf), KMZ_CREATEZONE, TRUE }, /* 40 M_LOCKF */ + { SOS(fileproc),KMZ_CREATEZONE_ACCT, TRUE }, /* 38 M_FILEPROC */ + { SOS(filedesc),KMZ_CREATEZONE_ACCT, TRUE }, /* 39 M_FILEDESC */ + { SOX(lockf), KMZ_CREATEZONE_ACCT, TRUE }, /* 40 M_LOCKF */ { SOS(proc), KMZ_CREATEZONE, FALSE }, /* 41 M_PROC */ { SOS(pstats), KMZ_CREATEZONE, TRUE }, /* 42 M_PSTATS */ { 0, KMZ_MALLOC, FALSE }, /* 43 M_SEGMENT */ @@ -370,10 +376,10 @@ struct kmzones { { 0, KMZ_MALLOC, FALSE }, /* 49 M_NETADDR */ #if (NFSCLIENT || NFSSERVER) { SOX(nfsrv_sock), - KMZ_CREATEZONE, FALSE }, /* 50 M_NFSSVC */ + KMZ_CREATEZONE_ACCT, FALSE }, /* 50 M_NFSSVC */ { 0, KMZ_MALLOC, FALSE }, /* 51 M_NFSUID */ { SOX(nfsrvcache), - KMZ_CREATEZONE, FALSE }, /* 52 M_NFSD */ + KMZ_CREATEZONE_ACCT, FALSE }, /* 52 M_NFSD */ #else { 0, KMZ_MALLOC, FALSE }, /* 50 M_NFSSVC */ { 0, KMZ_MALLOC, FALSE }, /* 51 M_NFSUID */ @@ -389,7 +395,7 @@ struct kmzones { { 0, KMZ_MALLOC, FALSE }, /* 58 unused entry */ #if (NFSCLIENT || NFSSERVER) { SOS(nfsrv_descript), - 
KMZ_CREATEZONE, FALSE }, /* 59 M_NFSRVDESC */ + KMZ_CREATEZONE_ACCT, FALSE }, /* 59 M_NFSRVDESC */ { SOS(nfsdmap), KMZ_CREATEZONE, FALSE }, /* 60 M_NFSDIROFF */ { SOS(fhandle), KMZ_LOOKUPZONE, FALSE }, /* 61 M_NFSBIGFH */ #else @@ -407,9 +413,9 @@ struct kmzones { { 0, KMZ_MALLOC, FALSE }, /* 69 M_ADOSFSMNT */ { 0, KMZ_MALLOC, FALSE }, /* 70 M_ADOSFSNODE */ { 0, KMZ_MALLOC, FALSE }, /* 71 M_ANODE */ - { SOX(buf), KMZ_CREATEZONE, TRUE }, /* 72 M_BUFHDR */ + { 0, KMZ_MALLOC, TRUE }, /* 72 M_BUFHDR */ { (NDFILE * OFILESIZE), - KMZ_CREATEZONE, FALSE }, /* 73 M_OFILETABL */ + KMZ_CREATEZONE_ACCT, FALSE }, /* 73 M_OFILETABL */ { MCLBYTES, KMZ_CREATEZONE, FALSE }, /* 74 M_MCLUST */ #if HFS { SOX(hfsmount),KMZ_LOOKUPZONE, FALSE }, /* 75 M_HFSMNT */ @@ -437,15 +443,15 @@ struct kmzones { { SOS(journal), KMZ_CREATEZONE, FALSE }, /* 91 M_JNL_JNL */ { SOS(transaction), KMZ_CREATEZONE, FALSE }, /* 92 M_JNL_TR */ #else - { 0, KMZ_MALLOC, FALSE }, /* 91 M_JNL_JNL */ - { 0, KMZ_MALLOC, FALSE }, /* 92 M_JNL_TR */ + { 0, KMZ_MALLOC, FALSE }, /* 91 M_JNL_JNL */ + { 0, KMZ_MALLOC, FALSE }, /* 92 M_JNL_TR */ #endif - { SOS(specinfo), KMZ_CREATEZONE, TRUE }, /* 93 M_SPECINFO */ - { SOS(kqueue), KMZ_CREATEZONE, FALSE }, /* 94 M_KQUEUE */ + { SOS(specinfo),KMZ_CREATEZONE, TRUE }, /* 93 M_SPECINFO */ + { SOS(kqueue), KMZ_CREATEZONE, FALSE }, /* 94 M_KQUEUE */ #if HFS - { SOS(directoryhint), KMZ_CREATEZONE, FALSE }, /* 95 M_HFSDIRHINT */ + { SOS(directoryhint), KMZ_CREATEZONE, TRUE }, /* 95 M_HFSDIRHINT */ #else - { 0, KMZ_MALLOC, FALSE }, /* 95 M_HFSDIRHINT */ + { 0, KMZ_MALLOC, FALSE }, /* 95 M_HFSDIRHINT */ #endif { SOS(cl_readahead), KMZ_CREATEZONE, TRUE }, /* 96 M_CLRDAHEAD */ { SOS(cl_writebehind),KMZ_CREATEZONE, TRUE }, /* 97 M_CLWRBEHIND */ @@ -454,7 +460,7 @@ struct kmzones { { 0, KMZ_MALLOC, FALSE }, /* 100 M_KAUTH */ { 0, KMZ_MALLOC, FALSE }, /* 101 M_DUMMYNET */ #ifndef __LP64__ - { SOS(unsafe_fsnode),KMZ_CREATEZONE, FALSE }, /* 102 M_UNSAFEFS */ + { 
SOS(unsafe_fsnode),KMZ_CREATEZONE, TRUE }, /* 102 M_UNSAFEFS */ #else { 0, KMZ_MALLOC, FALSE }, /* 102 M_UNSAFEFS */ #endif /* __LP64__ */ @@ -465,10 +471,15 @@ struct kmzones { { 0, KMZ_MALLOC, FALSE }, /* 107 M_LCTX */ { 0, KMZ_MALLOC, FALSE }, /* 108 M_TRAFFIC_MGT */ #if HFS_COMPRESSION - { SOS(decmpfs_cnode),KMZ_CREATEZONE, FALSE }, /* 109 M_DECMPFS_CNODE */ + { SOS(decmpfs_cnode),KMZ_CREATEZONE , FALSE}, /* 109 M_DECMPFS_CNODE */ #else { 0, KMZ_MALLOC, FALSE }, /* 109 M_DECMPFS_CNODE */ #endif /* HFS_COMPRESSION */ + { 0, KMZ_MALLOC, FALSE }, /* 110 M_INMFILTER */ + { 0, KMZ_MALLOC, FALSE }, /* 111 M_IPMSOURCE */ + { 0, KMZ_MALLOC, FALSE }, /* 112 M_IN6MFILTER */ + { 0, KMZ_MALLOC, FALSE }, /* 113 M_IP6MOPTS */ + { 0, KMZ_MALLOC, FALSE }, /* 114 M_IP6MSOURCE */ #undef SOS #undef SOX }; @@ -495,10 +506,14 @@ kmeminit(void) ; else /* XXX */ - if (kmz->kz_zalloczone == KMZ_CREATEZONE) { + if (kmz->kz_zalloczone == KMZ_CREATEZONE || + kmz->kz_zalloczone == KMZ_CREATEZONE_ACCT) { kmz->kz_zalloczone = zinit(kmz->kz_elemsize, 1024 * 1024, PAGE_SIZE, memname[kmz - kmzones]); + zone_change(kmz->kz_zalloczone, Z_CALLERACCT, + (kmz->kz_zalloczone == KMZ_CREATEZONE_ACCT)); + if (kmz->kz_noencrypt == TRUE) zone_change(kmz->kz_zalloczone, Z_NOENCRYPT, TRUE); } @@ -526,12 +541,6 @@ kmeminit(void) } } -#define MDECL(reqlen) \ -union { \ - struct _mhead hdr; \ - char _m[(reqlen) + sizeof (struct _mhead)]; \ -} - struct _mhead { size_t mlen; char dat[0]; @@ -543,8 +552,8 @@ _MALLOC( int type, int flags) { - MDECL(size) *mem; - size_t memsize = sizeof (*mem); + struct _mhead *hdr; + size_t memsize = sizeof (*hdr) + size; if (type >= M_LAST) panic("_malloc TYPE"); @@ -553,11 +562,11 @@ _MALLOC( return (NULL); if (flags & M_NOWAIT) { - mem = (void *)kalloc_noblock(memsize); + hdr = (void *)kalloc_noblock(memsize); } else { - mem = (void *)kalloc(memsize); + hdr = (void *)kalloc(memsize); - if (mem == NULL) { + if (hdr == NULL) { /* * We get here when the caller told us to block 
waiting for memory, but @@ -572,15 +581,15 @@ _MALLOC( panic("_MALLOC: kalloc returned NULL (potential leak), size %llu", (uint64_t) size); } } - if (!mem) + if (!hdr) return (0); - mem->hdr.mlen = memsize; + hdr->mlen = memsize; if (flags & M_ZERO) - bzero(mem->hdr.dat, size); + bzero(hdr->dat, size); - return (mem->hdr.dat); + return (hdr->dat); } void @@ -600,6 +609,36 @@ _FREE( kfree(hdr, hdr->mlen); } +void * +_REALLOC( + void *addr, + size_t size, + int type, + int flags) +{ + struct _mhead *hdr; + void *newaddr; + size_t alloc; + + /* realloc(NULL, ...) is equivalent to malloc(...) */ + if (addr == NULL) + return (_MALLOC(size, type, flags)); + + /* Allocate a new, bigger (or smaller) block */ + if ((newaddr = _MALLOC(size, type, flags)) == NULL) + return (NULL); + + hdr = addr; + --hdr; + alloc = hdr->mlen - sizeof (*hdr); + + /* Copy over original contents */ + bcopy(addr, newaddr, MIN(size, alloc)); + _FREE(addr, type); + + return (newaddr); +} + void * _MALLOC_ZONE( size_t size, @@ -660,3 +699,116 @@ _FREE_ZONE( else kfree(elem, size); } + +#if CONFIG_ZLEAKS + +SYSCTL_DECL(_kern_zleak); +SYSCTL_NODE(_kern, OID_AUTO, zleak, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "zleak"); + +/* + * kern.zleak.active + * + * Show the status of the zleak subsystem (0 = enabled, 1 = active, + * and -1 = failed), and if enabled, allow it to be activated immediately. + */ +static int +sysctl_zleak_active SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int oldval, val, error; + + val = oldval = get_zleak_state(); + error = sysctl_handle_int(oidp, &val, 0, req); + if (error || !req->newptr) + return (error); + /* + * Can only be activated if it's off (and not failed.) + * Cannot be deactivated once it's on. 
+ */ + if (val == 1 && oldval == 0) { + kern_return_t kr = zleak_activate(); + + if (KERN_SUCCESS != kr) + printf("zleak_active: failed to activate " + "live zone leak debugging (%d).\n", kr); + } if (val == 0 && oldval == 1) { + printf("zleak_active: active, cannot be disabled.\n"); + return (EINVAL); + } + return (0); +} + +SYSCTL_PROC(_kern_zleak, OID_AUTO, active, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, 0, sysctl_zleak_active, "I", "zleak activity"); + +/* + * kern.zleak.max_zonemap_size + * + * Read the value of the maximum zonemap size in bytes; useful + * as the maximum size that zleak.global_threshold and + * zleak.zone_threshold should be set to. + */ +static int +sysctl_zleak_max_zonemap_size SYSCTL_HANDLER_ARGS +{ + uint64_t zmap_max_size = *(vm_size_t *)arg1; + + return sysctl_handle_quad(oidp, &zmap_max_size, arg2, req); +} + +SYSCTL_PROC(_kern_zleak, OID_AUTO, max_zonemap_size, + CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED, + &zleak_max_zonemap_size, 0, + sysctl_zleak_max_zonemap_size, "Q", "zleak max zonemap size"); + + +static int +sysctl_zleak_threshold SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg2) + int error; + uint64_t value = *(vm_size_t *)arg1; + + error = sysctl_io_number(req, value, sizeof (value), &value, NULL); + + if (error || !req->newptr) + return (error); + + if (value > (uint64_t)zleak_max_zonemap_size) + return (ERANGE); + + *(vm_size_t *)arg1 = value; + return (0); +} + +/* + * kern.zleak.global_threshold + * + * Set the global zleak threshold size (in bytes). If the zone map + * grows larger than this value, zleaks are automatically activated. + * + * The default value is set in zleak_init(). 
+ */ +SYSCTL_PROC(_kern_zleak, OID_AUTO, global_threshold, + CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, + &zleak_global_tracking_threshold, 0, + sysctl_zleak_threshold, "Q", "zleak global threshold"); + +/* + * kern.zleak.zone_threshold + * + * Set the per-zone threshold size (in bytes) above which any + * zone will automatically start zleak tracking. + * + * The default value is set in zleak_init(). + * + * Setting this variable will have no effect until zleak tracking is + * activated (See above.) + */ +SYSCTL_PROC(_kern_zleak, OID_AUTO, zone_threshold, + CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, + &zleak_per_zone_tracking_threshold, 0, + sysctl_zleak_threshold, "Q", "zleak per-zone threshold"); + +#endif /* CONFIG_ZLEAKS */ diff --git a/bsd/kern/kern_memorystatus.c b/bsd/kern/kern_memorystatus.c index dfbaa794d..489ddd2be 100644 --- a/bsd/kern/kern_memorystatus.c +++ b/bsd/kern/kern_memorystatus.c @@ -31,6 +31,8 @@ #include #include +#include +#include #include #include #include @@ -42,6 +44,126 @@ #include #include #include +#include +#include + +#if CONFIG_FREEZE +#include +#include + +enum { + kProcessSuspended = (1 << 0), + kProcessHibernated = (1 << 1), + kProcessNoReclaimWorth = (1 << 2), + kProcessIgnored = (1 << 3), + kProcessBusy = (1 << 4) +}; + +static lck_mtx_t * hibernation_mlock; +static lck_attr_t * hibernation_lck_attr; +static lck_grp_t * hibernation_lck_grp; +static lck_grp_attr_t * hibernation_lck_grp_attr; + +typedef struct hibernation_node { + RB_ENTRY(hibernation_node) link; + pid_t pid; + uint32_t state; + mach_timespec_t hibernation_ts; +} hibernation_node; + +static int hibernation_tree_compare(hibernation_node *n1, hibernation_node *n2) { + if (n1->pid < n2->pid) + return -1; + else if (n1->pid > n2->pid) + return 1; + else + return 0; +} + +static RB_HEAD(hibernation_tree, hibernation_node) hibernation_tree_head; +RB_PROTOTYPE_SC(static, hibernation_tree, hibernation_node, link, hibernation_tree_compare); + 
+RB_GENERATE(hibernation_tree, hibernation_node, link, hibernation_tree_compare); + +static inline boolean_t kern_hibernation_can_hibernate_processes(void); +static boolean_t kern_hibernation_can_hibernate(void); + +static void kern_hibernation_add_node(hibernation_node *node); +static hibernation_node *kern_hibernation_get_node(pid_t pid); +static void kern_hibernation_release_node(hibernation_node *node); +static void kern_hibernation_free_node(hibernation_node *node, boolean_t unlock); + +static void kern_hibernation_register_pid(pid_t pid); +static void kern_hibernation_unregister_pid(pid_t pid); + +static int kern_hibernation_get_process_state(pid_t pid, uint32_t *state, mach_timespec_t *ts); +static int kern_hibernation_set_process_state(pid_t pid, uint32_t state); + +static void kern_hibernation_cull(void); + +static void kern_hibernation_thread(void); + +extern boolean_t vm_freeze_enabled; + +int kern_hibernation_wakeup = 0; + +static int jetsam_priority_list_hibernation_index = 0; + +/* Thresholds */ +static int kern_memorystatus_level_hibernate = 50; + +#define HIBERNATION_PAGES_MIN ( 1 * 1024 * 1024 / PAGE_SIZE) +#define HIBERNATION_PAGES_MAX (16 * 1024 * 1024 / PAGE_SIZE) + +static unsigned int kern_memorystatus_hibernation_pages_min = HIBERNATION_PAGES_MIN; +static unsigned int kern_memorystatus_hibernation_pages_max = HIBERNATION_PAGES_MAX; + +static unsigned int kern_memorystatus_suspended_count = 0; +static unsigned int kern_memorystatus_hibernated_count = 0; + +static unsigned int kern_memorystatus_hibernation_suspended_minimum = 4; + +static unsigned int kern_memorystatus_low_swap_pages = 0; + +/* Throttling */ +#define HIBERNATION_DAILY_MB_MAX 1024 +#define HIBERNATION_DAILY_PAGEOUTS_MAX (HIBERNATION_DAILY_MB_MAX * (1024 * 1024 / PAGE_SIZE)) + +static struct throttle_interval_t { + uint32_t mins; + uint32_t burst_multiple; + uint32_t pageouts; + uint32_t max_pageouts; + mach_timespec_t ts; + boolean_t throttle; +} throttle_intervals[] = { + { 60, 
8, 0, 0, { 0, 0 }, FALSE }, /* 1 hour intermediate interval, 8x burst */ + { 24 * 60, 1, 0, 0, { 0, 0 }, FALSE }, /* 24 hour long interval, no burst */ +}; + +/* Stats */ +static uint64_t kern_memorystatus_hibernation_count = 0; +SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_hibernation_count, CTLFLAG_RD, &kern_memorystatus_hibernation_count, ""); + +static uint64_t kern_memorystatus_hibernation_pageouts = 0; +SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_hibernation_pageouts, CTLFLAG_RD, &kern_memorystatus_hibernation_pageouts, ""); + +static uint64_t kern_memorystatus_hibernation_throttle_count = 0; +SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_hibernation_throttle_count, CTLFLAG_RD, &kern_memorystatus_hibernation_throttle_count, ""); + +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_hibernation_min_processes, CTLFLAG_RW, &kern_memorystatus_hibernation_suspended_minimum, 0, ""); + +#if DEVELOPMENT || DEBUG +/* Allow parameter tweaking in these builds */ +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_level_hibernate, CTLFLAG_RW, &kern_memorystatus_level_hibernate, 0, ""); + +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_hibernation_pages_min, CTLFLAG_RW, &kern_memorystatus_hibernation_pages_min, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_hibernation_pages_max, CTLFLAG_RW, &kern_memorystatus_hibernation_pages_max, 0, ""); + +boolean_t kern_memorystatus_hibernation_throttle_enabled = TRUE; +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_hibernation_throttle_enabled, CTLFLAG_RW, &kern_memorystatus_hibernation_throttle_enabled, 0, ""); +#endif /* DEVELOPMENT || DEBUG */ +#endif /* CONFIG_FREEZE */ extern unsigned int vm_page_free_count; extern unsigned int vm_page_active_count; @@ -54,6 +176,8 @@ static void kern_memorystatus_thread(void); int kern_memorystatus_wakeup = 0; int kern_memorystatus_level = 0; int kern_memorystatus_last_level = 0; +unsigned int kern_memorystatus_delta; + unsigned int kern_memorystatus_kev_failure_count = 0; int kern_memorystatus_level_critical = 5; #define 
kern_memorystatus_level_highwater (kern_memorystatus_level_critical + 5) @@ -76,16 +200,66 @@ static lck_attr_t * jetsam_lck_attr; static lck_grp_t * jetsam_lck_grp; static lck_grp_attr_t * jetsam_lck_grp_attr; -SYSCTL_INT(_kern, OID_AUTO, memorystatus_level, CTLFLAG_RD, &kern_memorystatus_level, 0, ""); -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_kev_failure_count, CTLFLAG_RD, &kern_memorystatus_kev_failure_count, 0, ""); +SYSCTL_INT(_kern, OID_AUTO, memorystatus_level, CTLFLAG_RD | CTLFLAG_LOCKED, &kern_memorystatus_level, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_kev_failure_count, CTLFLAG_RD | CTLFLAG_LOCKED, &kern_memorystatus_kev_failure_count, 0, ""); + +#if DEVELOPMENT || DEBUG + +enum { + kJetsamDiagnosticModeNone = 0, + kJetsamDiagnosticModeAll = 1, + kJetsamDiagnosticModeStopAtFirstActive = 2 +} jetsam_diagnostic_mode = kJetsamDiagnosticModeNone; + +static int jetsam_diagnostic_suspended_one_active_proc = 0; + +static int +sysctl_jetsam_diagnostic_mode SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int error, val = jetsam_diagnostic_mode; + boolean_t disabled; + + error = sysctl_handle_int(oidp, &val, 0, req); + if (error || !req->newptr) + return (error); + if ((val < 0) || (val > 2)) { + printf("jetsam: diagnostic mode: invalid value - %d\n", val); + return (0); + } + + /* + * If jetsam_diagnostic_mode is set, we need to lower memory threshold for jetsam + */ + disabled = (val == 0) && (jetsam_diagnostic_mode != kJetsamDiagnosticModeNone); + + jetsam_diagnostic_mode = val; + + if (disabled) { + kern_memorystatus_level_critical = 5; + printf("jetsam: diagnostic mode: resetting critical level to %d\n", kern_memorystatus_level_critical); + } else { + kern_memorystatus_level_critical = 10; + printf("jetsam: diagnostic mode: %d: increasing critical level to %d\n", (int) jetsam_diagnostic_mode, kern_memorystatus_level_critical); + if (jetsam_diagnostic_mode == kJetsamDiagnosticModeStopAtFirstActive) + printf("jetsam: diagnostic mode: will stop 
at first active app\n"); + } + + return (0); +} + +SYSCTL_PROC(_debug, OID_AUTO, jetsam_diagnostic_mode, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY, + &jetsam_diagnostic_mode, 0, sysctl_jetsam_diagnostic_mode, "I", "Jetsam Diagnostic Mode"); +#endif /* DEVELOPMENT || DEBUG */ __private_extern__ void kern_memorystatus_init(void) { - jetsam_lck_attr = lck_attr_alloc_init(); - jetsam_lck_grp_attr= lck_grp_attr_alloc_init(); - jetsam_lck_grp = lck_grp_alloc_init("jetsam", jetsam_lck_grp_attr); - jetsam_list_mlock = lck_mtx_alloc_init(jetsam_lck_grp, jetsam_lck_attr); + jetsam_lck_attr = lck_attr_alloc_init(); + jetsam_lck_grp_attr= lck_grp_attr_alloc_init(); + jetsam_lck_grp = lck_grp_alloc_init("jetsam", jetsam_lck_grp_attr); + jetsam_list_mlock = lck_mtx_alloc_init(jetsam_lck_grp, jetsam_lck_attr); + kern_memorystatus_delta = 5 * atop_64(max_mem) / 100; (void)kernel_thread(kernel_task, kern_memorystatus_thread); } @@ -153,48 +327,107 @@ jetsam_snapshot_procs(void) } static void -jetsam_mark_pid_in_snapshot(pid_t pid, int flag) +jetsam_mark_pid_in_snapshot(pid_t pid, int flags) { int i = 0; for (i = 0; i < jetsam_snapshot_list_count; i++) { if (jetsam_snapshot_list[i].pid == pid) { - jetsam_snapshot_list[i].flags |= flag; + jetsam_snapshot_list[i].flags |= flags; return; } } } int -jetsam_kill_top_proc(void) +jetsam_kill_top_proc(boolean_t any, uint32_t cause) { proc_t p; +#ifndef CONFIG_FREEZE +#pragma unused(any) +#endif + if (jetsam_snapshot_list_count == 0) { jetsam_snapshot_procs(); } lck_mtx_lock(jetsam_list_mlock); while (jetsam_priority_list_index < jetsam_priority_list_count) { - pid_t aPid; - aPid = jetsam_priority_list[jetsam_priority_list_index].pid; + jetsam_priority_entry_t* jetsam_priority_entry = &jetsam_priority_list[jetsam_priority_list_index]; + pid_t aPid = jetsam_priority_entry->pid; +#if DEVELOPMENT || DEBUG + int activeProcess = jetsam_priority_entry->flags & kJetsamFlagsFrontmost; + int procSuspendedForDiagnosis = jetsam_priority_entry->flags & 
kJetsamFlagsSuspForDiagnosis; +#endif /* DEVELOPMENT || DEBUG */ jetsam_priority_list_index++; /* skip empty slots in the list */ if (aPid == 0) { continue; // with lock held } lck_mtx_unlock(jetsam_list_mlock); - jetsam_mark_pid_in_snapshot(aPid, kJetsamFlagsKilled); p = proc_find(aPid); if (p != NULL) { - printf("jetsam: killing pid %d [%s] - memory_status_level: %d - ", - aPid, (p->p_comm ? p->p_comm : "(unknown)"), kern_memorystatus_level); - exit1(p, W_EXITCODE(0, SIGKILL), (int *)NULL); - proc_rele(p); + int flags = cause; +#if DEVELOPMENT || DEBUG + if ((jetsam_diagnostic_mode != kJetsamDiagnosticModeNone) && procSuspendedForDiagnosis) { + printf("jetsam: continuing after ignoring proc suspended already for diagnosis - %d\n", aPid); + proc_rele(p); + lck_mtx_lock(jetsam_list_mlock); + continue; + } +#endif /* DEVELOPMENT || DEBUG */ +#if CONFIG_FREEZE + hibernation_node *node; + boolean_t skip; + if ((node = kern_hibernation_get_node(aPid))) { + boolean_t reclaim_proc = !(node->state & (kProcessBusy | kProcessNoReclaimWorth)); + if (any || reclaim_proc) { + if (node->state & kProcessHibernated) { + flags |= kJetsamFlagsHibernated; + } + skip = FALSE; + } else { + skip = TRUE; + } + kern_hibernation_release_node(node); + } else { + skip = FALSE; + } + if (skip) { + proc_rele(p); + } else +#endif + { +#if DEVELOPMENT || DEBUG + if ((jetsam_diagnostic_mode != kJetsamDiagnosticModeNone) && activeProcess) { #if DEBUG - printf("jetsam: pid %d killed - memory_status_level: %d\n", aPid, kern_memorystatus_level); + printf("jetsam: suspending pid %d [%s] (active) for diagnosis - memory_status_level: %d\n", + aPid, (p->p_comm ? 
p->p_comm: "(unknown)"), kern_memorystatus_level); #endif /* DEBUG */ - return 0; + jetsam_mark_pid_in_snapshot(aPid, kJetsamFlagsSuspForDiagnosis); + jetsam_priority_entry->flags |= kJetsamFlagsSuspForDiagnosis; + task_suspend(p->task); + proc_rele(p); + if (jetsam_diagnostic_mode == kJetsamDiagnosticModeStopAtFirstActive) { + jetsam_diagnostic_suspended_one_active_proc = 1; + printf("jetsam: returning after suspending first active proc - %d\n", aPid); + } + return 0; + } else +#endif /* DEVELOPMENT || DEBUG */ + { + printf("jetsam: killing pid %d [%s] - memory_status_level: %d\n", + aPid, (p->p_comm ? p->p_comm : "(unknown)"), kern_memorystatus_level); + jetsam_mark_pid_in_snapshot(aPid, flags); + exit1(p, W_EXITCODE(0, SIGKILL), (int *)NULL); + proc_rele(p); +#if DEBUG + printf("jetsam: pid %d killed - memory_status_level: %d\n", aPid, kern_memorystatus_level); +#endif /* DEBUG */ + return 0; + } + } } lck_mtx_lock(jetsam_list_mlock); } @@ -220,54 +453,235 @@ jetsam_kill_hiwat_proc(void) if (aPid == 0 || (hiwat < 0)) { continue; // with lock held } - lck_mtx_unlock(jetsam_list_mlock); p = proc_find(aPid); if (p != NULL) { int32_t pages = (int32_t)jetsam_task_page_count(p->task); - if (pages > hiwat) { + boolean_t skip = (pages <= hiwat); +#if DEVELOPMENT || DEBUG + if (!skip && (jetsam_diagnostic_mode != kJetsamDiagnosticModeNone)) { + if (jetsam_priority_list[i].flags & kJetsamFlagsSuspForDiagnosis) { + proc_rele(p); + continue; + } + } +#endif /* DEVELOPMENT || DEBUG */ +#if CONFIG_FREEZE + if (!skip) { + hibernation_node *node; + if ((node = kern_hibernation_get_node(aPid))) { + if (node->state & kProcessBusy) { + kern_hibernation_release_node(node); + skip = TRUE; + } else { + kern_hibernation_free_node(node, TRUE); + skip = FALSE; + } + } + } +#endif + if (!skip) { #if DEBUG - printf("jetsam: killing pid %d [%s] - %d pages > hiwat (%d)\n", aPid, p->p_comm, pages, hiwat); + printf("jetsam: %s pid %d [%s] - %d pages > hiwat (%d)\n", + (jetsam_diagnostic_mode 
!= kJetsamDiagnosticModeNone)?"suspending": "killing", aPid, p->p_comm, pages, hiwat); #endif /* DEBUG */ - exit1(p, W_EXITCODE(0, SIGKILL), (int *)NULL); - proc_rele(p); +#if DEVELOPMENT || DEBUG + if (jetsam_diagnostic_mode != kJetsamDiagnosticModeNone) { + lck_mtx_unlock(jetsam_list_mlock); + task_suspend(p->task); + proc_rele(p); #if DEBUG - printf("jetsam: pid %d killed - memory_status_level: %d\n", aPid, kern_memorystatus_level); + printf("jetsam: pid %d suspended for diagnosis - memory_status_level: %d\n", aPid, kern_memorystatus_level); #endif /* DEBUG */ - jetsam_mark_pid_in_snapshot(aPid, kJetsamFlagsKilledHiwat); - jetsam_priority_list[i].pid = 0; + jetsam_mark_pid_in_snapshot(aPid, kJetsamFlagsSuspForDiagnosis); + jetsam_priority_list[i].flags |= kJetsamFlagsSuspForDiagnosis; + } else +#endif /* DEVELOPMENT || DEBUG */ + { + jetsam_priority_list[i].pid = 0; + lck_mtx_unlock(jetsam_list_mlock); + exit1(p, W_EXITCODE(0, SIGKILL), (int *)NULL); + proc_rele(p); +#if DEBUG + printf("jetsam: pid %d killed - memory_status_level: %d\n", aPid, kern_memorystatus_level); +#endif /* DEBUG */ + jetsam_mark_pid_in_snapshot(aPid, kJetsamFlagsKilledHiwat); + } return 0; } else { proc_rele(p); } } - lck_mtx_lock(jetsam_list_mlock); } lck_mtx_unlock(jetsam_list_mlock); return -1; } +#if CONFIG_FREEZE +static void +jetsam_send_hibernation_note(uint32_t flags, pid_t pid, uint32_t pages) { + int ret; + struct kev_msg ev_msg; + jetsam_hibernation_entry_t data; + + ev_msg.vendor_code = KEV_VENDOR_APPLE; + ev_msg.kev_class = KEV_SYSTEM_CLASS; + ev_msg.kev_subclass = KEV_MEMORYSTATUS_SUBCLASS; + + ev_msg.event_code = kMemoryStatusHibernationNote; + + ev_msg.dv[0].data_length = sizeof data; + ev_msg.dv[0].data_ptr = &data; + ev_msg.dv[1].data_length = 0; + + data.pid = pid; + data.flags = flags; + data.pages = pages; + + ret = kev_post_msg(&ev_msg); + if (ret) { + kern_memorystatus_kev_failure_count++; + printf("%s: kev_post_msg() failed, err %d\n", __func__, ret); + } +} + 
+static int +jetsam_hibernate_top_proc(void) +{ + int hibernate_index; + proc_t p; + uint32_t i; + + lck_mtx_lock(jetsam_list_mlock); + + for (hibernate_index = jetsam_priority_list_index; hibernate_index < jetsam_priority_list_count; hibernate_index++) { + pid_t aPid; + uint32_t state = 0; + + aPid = jetsam_priority_list[hibernate_index].pid; + + /* skip empty slots in the list */ + if (aPid == 0) { + continue; // with lock held + } + + if (kern_hibernation_get_process_state(aPid, &state, NULL) != 0) { + continue; // with lock held + } + + /* ensure the process isn't marked as busy and is suspended */ + if ((state & kProcessBusy) || !(state & kProcessSuspended)) { + continue; // with lock held + } + + p = proc_find(aPid); + if (p != NULL) { + hibernation_node *node; + boolean_t skip; + uint32_t purgeable, wired, clean, dirty; + boolean_t shared; + + lck_mtx_unlock(jetsam_list_mlock); + + if ((node = kern_hibernation_get_node(aPid))) { + if (node->state & kProcessBusy) { + skip = TRUE; + } else { + node->state |= kProcessBusy; + /* Whether we hibernate or not, increase the count so we can maintain the gap between hibernated and suspended processes. */ + kern_memorystatus_hibernated_count++; + skip = FALSE; + } + kern_hibernation_release_node(node); + } else { + skip = TRUE; + } + + if (!skip) { + /* Only hibernate processes meeting our size criteria. If not met, mark it as such and return. */ + task_freeze(p->task, &purgeable, &wired, &clean, &dirty, &shared, TRUE); + skip = (dirty < kern_memorystatus_hibernation_pages_min) || (dirty > kern_memorystatus_hibernation_pages_max); + } + + if (!skip) { + unsigned int swap_pages_free = default_pager_swap_pages_free(); + + /* Ensure there's actually enough space free to hibernate this process.
*/ + if (dirty > swap_pages_free) { + kern_memorystatus_low_swap_pages = swap_pages_free; + skip = TRUE; + } + } + + if (skip) { + kern_hibernation_set_process_state(aPid, kProcessIgnored); + proc_rele(p); + return 0; + } + +#if DEBUG + printf("jetsam: pid %d [%s] hibernating - memory_status_level: %d, purgeable: %d, wired: %d, clean: %d, dirty: %d, shared %d, free swap: %d\n", + aPid, (p->p_comm ? p->p_comm : "(unknown)"), kern_memorystatus_level, purgeable, wired, clean, dirty, shared, default_pager_swap_pages_free()); +#endif + + task_freeze(p->task, &purgeable, &wired, &clean, &dirty, &shared, FALSE); + proc_rele(p); + + kern_hibernation_set_process_state(aPid, kProcessHibernated | (shared ? 0: kProcessNoReclaimWorth)); + + /* Update stats */ + for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) { + throttle_intervals[i].pageouts += dirty; + } + kern_memorystatus_hibernation_pageouts += dirty; + kern_memorystatus_hibernation_count++; + + jetsam_send_hibernation_note(kJetsamFlagsHibernated, aPid, dirty); + + return dirty; + } + } + lck_mtx_unlock(jetsam_list_mlock); + return -1; +} +#endif /* CONFIG_FREEZE */ + static void kern_memorystatus_thread(void) { struct kev_msg ev_msg; jetsam_kernel_stats_t data; + boolean_t post_memorystatus_snapshot = FALSE; int ret; + bzero(&data, sizeof(jetsam_kernel_stats_t)); + bzero(&ev_msg, sizeof(struct kev_msg)); while(1) { - while (kern_memorystatus_level <= kern_memorystatus_level_critical) { - if (jetsam_kill_top_proc() < 0) { +#if DEVELOPMENT || DEBUG + jetsam_diagnostic_suspended_one_active_proc = 0; +#endif /* DEVELOPMENT || DEBUG */ + + while (kern_memorystatus_level <= kern_memorystatus_level_highwater) { + if (jetsam_kill_hiwat_proc() < 0) { break; } + post_memorystatus_snapshot = TRUE; } - while (kern_memorystatus_level <= kern_memorystatus_level_highwater) { - if (jetsam_kill_hiwat_proc() < 0) { + while (kern_memorystatus_level <= kern_memorystatus_level_critical) { + if 
(jetsam_kill_top_proc(FALSE, kJetsamFlagsKilled) < 0) { break; } + post_memorystatus_snapshot = TRUE; +#if DEVELOPMENT || DEBUG + if ((jetsam_diagnostic_mode == kJetsamDiagnosticModeStopAtFirstActive) && jetsam_diagnostic_suspended_one_active_proc) { + printf("jetsam: stopping killing since 1 active proc suspended already for diagnosis\n"); + break; // we found first active proc, let's not kill any more + } +#endif /* DEVELOPMENT || DEBUG */ } - + kern_memorystatus_last_level = kern_memorystatus_level; ev_msg.vendor_code = KEV_VENDOR_APPLE; @@ -295,7 +709,7 @@ kern_memorystatus_thread(void) printf("%s: kev_post_msg() failed, err %d\n", __func__, ret); } - if (jetsam_snapshot_list_count) { + if (post_memorystatus_snapshot) { size_t snapshot_size = sizeof(jetsam_kernel_stats_t) + sizeof(size_t) + sizeof(jetsam_snapshot_entry_t) * jetsam_snapshot_list_count; ev_msg.event_code = kMemoryStatusSnapshotNote; ev_msg.dv[0].data_length = sizeof snapshot_size; @@ -318,6 +732,349 @@ kern_memorystatus_thread(void) } } +#if CONFIG_FREEZE + +__private_extern__ void +kern_hibernation_init(void) +{ + hibernation_lck_attr = lck_attr_alloc_init(); + hibernation_lck_grp_attr = lck_grp_attr_alloc_init(); + hibernation_lck_grp = lck_grp_alloc_init("hibernation", hibernation_lck_grp_attr); + hibernation_mlock = lck_mtx_alloc_init(hibernation_lck_grp, hibernation_lck_attr); + + RB_INIT(&hibernation_tree_head); + + (void)kernel_thread(kernel_task, kern_hibernation_thread); +} + +static inline boolean_t +kern_hibernation_can_hibernate_processes(void) +{ + boolean_t ret; + + lck_mtx_lock_spin(hibernation_mlock); + ret = (kern_memorystatus_suspended_count - kern_memorystatus_hibernated_count) > + kern_memorystatus_hibernation_suspended_minimum ? 
TRUE : FALSE; + lck_mtx_unlock(hibernation_mlock); + + return ret; +} + +static boolean_t +kern_hibernation_can_hibernate(void) +{ + /* Only hibernate if we're sufficiently low on memory; this holds off hibernation right after boot, + and is generally a no-op once we've reached steady state. */ + if (kern_memorystatus_level > kern_memorystatus_level_hibernate) { + return FALSE; + } + + /* Check minimum suspended process threshold. */ + if (!kern_hibernation_can_hibernate_processes()) { + return FALSE; + } + + /* Is swap running low? */ + if (kern_memorystatus_low_swap_pages) { + /* If there's been no movement in free swap pages since we last attempted hibernation, return. */ + if (default_pager_swap_pages_free() <= kern_memorystatus_low_swap_pages) { + return FALSE; + } + + /* Pages have been freed, so we can retry. */ + kern_memorystatus_low_swap_pages = 0; + } + + /* OK */ + return TRUE; +} + +static void +kern_hibernation_add_node(hibernation_node *node) +{ + lck_mtx_lock_spin(hibernation_mlock); + + RB_INSERT(hibernation_tree, &hibernation_tree_head, node); + kern_memorystatus_suspended_count++; + + lck_mtx_unlock(hibernation_mlock); +} + +/* Returns with the hibernation lock taken */ +static hibernation_node * +kern_hibernation_get_node(pid_t pid) +{ + hibernation_node sought, *found; + sought.pid = pid; + lck_mtx_lock_spin(hibernation_mlock); + found = RB_FIND(hibernation_tree, &hibernation_tree_head, &sought); + if (!found) { + lck_mtx_unlock(hibernation_mlock); + } + return found; +} + +static void +kern_hibernation_release_node(hibernation_node *node) +{ +#pragma unused(node) + lck_mtx_unlock(hibernation_mlock); +} + +static void +kern_hibernation_free_node(hibernation_node *node, boolean_t unlock) +{ + /* make sure we're called with the hibernation_mlock held */ + lck_mtx_assert(hibernation_mlock, LCK_MTX_ASSERT_OWNED); + + if (node->state & (kProcessHibernated | kProcessIgnored)) { + kern_memorystatus_hibernated_count--; + } + + 
kern_memorystatus_suspended_count--; + + RB_REMOVE(hibernation_tree, &hibernation_tree_head, node); + kfree(node, sizeof(hibernation_node)); + + if (unlock) { + lck_mtx_unlock(hibernation_mlock); + } +} + +static void +kern_hibernation_register_pid(pid_t pid) +{ + hibernation_node *node; + +#if DEVELOPMENT || DEBUG + node = kern_hibernation_get_node(pid); + if (node) { + printf("kern_hibernation_register_pid: pid %d already registered!\n", pid); + kern_hibernation_release_node(node); + return; + } +#endif + + /* Register as a candidate for hibernation */ + node = (hibernation_node *)kalloc(sizeof(hibernation_node)); + if (node) { + clock_sec_t sec; + clock_nsec_t nsec; + mach_timespec_t ts; + + memset(node, 0, sizeof(hibernation_node)); + + node->pid = pid; + node->state = kProcessSuspended; + + clock_get_system_nanotime(&sec, &nsec); + ts.tv_sec = sec; + ts.tv_nsec = nsec; + + node->hibernation_ts = ts; + + kern_hibernation_add_node(node); + } +} + +static void +kern_hibernation_unregister_pid(pid_t pid) +{ + hibernation_node *node; + + node = kern_hibernation_get_node(pid); + if (node) { + kern_hibernation_free_node(node, TRUE); + } +} + +void +kern_hibernation_on_pid_suspend(pid_t pid) +{ + kern_hibernation_register_pid(pid); +} + +/* If enabled, we bring all the hibernated pages back prior to resumption; otherwise, they're faulted back in on demand */ +#define THAW_ON_RESUME 1 + +void +kern_hibernation_on_pid_resume(pid_t pid, task_t task) +{ +#if THAW_ON_RESUME + hibernation_node *node; + if ((node = kern_hibernation_get_node(pid))) { + if (node->state & kProcessHibernated) { + node->state |= kProcessBusy; + kern_hibernation_release_node(node); + task_thaw(task); + jetsam_send_hibernation_note(kJetsamFlagsThawed, pid, 0); + } else { + kern_hibernation_release_node(node); + } + } +#else +#pragma unused(task) +#endif + kern_hibernation_unregister_pid(pid); +} + +void +kern_hibernation_on_pid_hibernate(pid_t pid) +{ +#pragma unused(pid) + + /* Wake the hibernation 
thread */ + thread_wakeup((event_t)&kern_hibernation_wakeup); +} + +static int +kern_hibernation_get_process_state(pid_t pid, uint32_t *state, mach_timespec_t *ts) +{ + hibernation_node *found; + int err = ESRCH; + + *state = 0; + + found = kern_hibernation_get_node(pid); + if (found) { + *state = found->state; + if (ts) { + *ts = found->hibernation_ts; + } + err = 0; + kern_hibernation_release_node(found); + } + + return err; +} + +static int +kern_hibernation_set_process_state(pid_t pid, uint32_t state) +{ + hibernation_node *found; + int err = ESRCH; + + found = kern_hibernation_get_node(pid); + if (found) { + found->state = state; + err = 0; + kern_hibernation_release_node(found); + } + + return err; +} + +static void +kern_hibernation_update_throttle_interval(mach_timespec_t *ts, struct throttle_interval_t *interval) +{ + if (CMP_MACH_TIMESPEC(ts, &interval->ts) >= 0) { + if (!interval->max_pageouts) { + interval->max_pageouts = (interval->burst_multiple * (((uint64_t)interval->mins * HIBERNATION_DAILY_PAGEOUTS_MAX) / (24 * 60))); + } else { + printf("jetsam: %d minute throttle timeout, resetting\n", interval->mins); + } + interval->ts.tv_sec = interval->mins * 60; + interval->ts.tv_nsec = 0; + ADD_MACH_TIMESPEC(&interval->ts, ts); + /* Since we update the throttle stats pre-hibernation, adjust for overshoot here */ + if (interval->pageouts > interval->max_pageouts) { + interval->pageouts -= interval->max_pageouts; + } else { + interval->pageouts = 0; + } + interval->throttle = FALSE; + } else if (!interval->throttle && interval->pageouts >= interval->max_pageouts) { + printf("jetsam: %d minute pageout limit exceeded; enabling throttle\n", interval->mins); + interval->throttle = TRUE; + } +#ifdef DEBUG + printf("jetsam: throttle updated - %d frozen (%d max) within %dm; %dm remaining; throttle %s\n", + interval->pageouts, interval->max_pageouts, interval->mins, (interval->ts.tv_sec - ts->tv_sec) / 60, + interval->throttle ? 
"on" : "off"); +#endif +} + +static boolean_t +kern_hibernation_throttle_update(void) +{ + clock_sec_t sec; + clock_nsec_t nsec; + mach_timespec_t ts; + uint32_t i; + boolean_t throttled = FALSE; + +#if DEVELOPMENT || DEBUG + if (!kern_memorystatus_hibernation_throttle_enabled) + return FALSE; +#endif + + clock_get_system_nanotime(&sec, &nsec); + ts.tv_sec = sec; + ts.tv_nsec = nsec; + + /* Check hibernation pageouts over multiple intervals and throttle if we've exceeded our budget. + * + * This ensures that periods of inactivity can't be used as 'credit' towards hibernation if the device has + * remained dormant for a long period. We do, however, allow increased thresholds for shorter intervals in + * order to allow for bursts of activity. + */ + for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) { + kern_hibernation_update_throttle_interval(&ts, &throttle_intervals[i]); + if (throttle_intervals[i].throttle == TRUE) + throttled = TRUE; + } + + return throttled; +} + +static void +kern_hibernation_cull(void) +{ + hibernation_node *node, *next; + lck_mtx_lock(hibernation_mlock); + + for (node = RB_MIN(hibernation_tree, &hibernation_tree_head); node != NULL; node = next) { + proc_t p; + + next = RB_NEXT(hibernation_tree, &hibernation_tree_head, node); + + /* TODO: probably suboptimal, so revisit should it cause a performance issue */ + p = proc_find(node->pid); + if (p) { + proc_rele(p); + } else { + kern_hibernation_free_node(node, FALSE); + } + } + + lck_mtx_unlock(hibernation_mlock); +} + +static void +kern_hibernation_thread(void) +{ + if (vm_freeze_enabled) { + if (kern_hibernation_can_hibernate()) { + + /* Cull dead processes */ + kern_hibernation_cull(); + + /* Only hibernate if we've not exceeded our pageout budgets */ + if (!kern_hibernation_throttle_update()) { + jetsam_hibernate_top_proc(); + } else { + printf("kern_hibernation_thread: in throttle, ignoring hibernation\n"); + 
kern_memorystatus_hibernation_throttle_count++; /* Throttled, update stats */ + } + } + } + + assert_wait((event_t) &kern_hibernation_wakeup, THREAD_UNINT); + thread_block((thread_continue_t) kern_hibernation_thread); +} + +#endif /* CONFIG_FREEZE */ + static int sysctl_io_variable(struct sysctl_req *req, void *pValue, size_t currentsize, size_t maxsize, size_t *newsize) { @@ -362,19 +1119,24 @@ sysctl_handle_kern_memorystatus_priority_list(__unused struct sysctl_oid *oid, _ ret = sysctl_io_variable(req, &temp_list[0], currentsize, sizeof(temp_list), &newsize); if (!ret && req->newptr) { - jetsam_priority_list_count = newsize / sizeof(jetsam_priority_list[0]); + int temp_list_count = newsize / sizeof(jetsam_priority_list[0]); #if DEBUG printf("set jetsam priority pids = { "); - for (i = 0; i < jetsam_priority_list_count; i++) { + for (i = 0; i < temp_list_count; i++) { printf("(%d, 0x%08x, %d) ", temp_list[i].pid, temp_list[i].flags, temp_list[i].hiwat_pages); } printf("}\n"); #endif /* DEBUG */ lck_mtx_lock(jetsam_list_mlock); - for (i = 0; i < jetsam_priority_list_count; i++) { +#if CONFIG_FREEZE + jetsam_priority_list_hibernation_index = 0; +#endif + jetsam_priority_list_index = 0; + jetsam_priority_list_count = temp_list_count; + for (i = 0; i < temp_list_count; i++) { jetsam_priority_list[i] = temp_list[i]; } - for (i = jetsam_priority_list_count; i < kMaxPriorityEntries; i++) { + for (i = temp_list_count; i < kMaxPriorityEntries; i++) { jetsam_priority_list[i].pid = 0; jetsam_priority_list[i].flags = 0; jetsam_priority_list[i].hiwat_pages = -1; @@ -382,7 +1144,6 @@ sysctl_handle_kern_memorystatus_priority_list(__unused struct sysctl_oid *oid, _ jetsam_priority_list[i].hiwat_reserved2 = -1; jetsam_priority_list[i].hiwat_reserved3 = -1; } - jetsam_priority_list_index = 0; lck_mtx_unlock(jetsam_list_mlock); } return ret; @@ -421,5 +1182,5 @@ sysctl_handle_kern_memorystatus_snapshot(__unused struct sysctl_oid *oid, __unus return ret; } -SYSCTL_PROC(_kern, 
OID_AUTO, memorystatus_priority_list, CTLTYPE_OPAQUE|CTLFLAG_RW, 0, 0, sysctl_handle_kern_memorystatus_priority_list, "S,jetsam_priorities", ""); +SYSCTL_PROC(_kern, OID_AUTO, memorystatus_priority_list, CTLTYPE_OPAQUE|CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_handle_kern_memorystatus_priority_list, "S,jetsam_priorities", ""); SYSCTL_PROC(_kern, OID_AUTO, memorystatus_snapshot, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_handle_kern_memorystatus_snapshot, "S,jetsam_snapshot", ""); diff --git a/bsd/kern/kern_mib.c b/bsd/kern/kern_mib.c index 658f13860..f82717429 100644 --- a/bsd/kern/kern_mib.c +++ b/bsd/kern/kern_mib.c @@ -333,34 +333,34 @@ sysctl_tbfrequency /* * hw.* MIB variables. */ -SYSCTL_PROC (_hw, HW_NCPU, ncpu, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN, 0, HW_NCPU, sysctl_hw_generic, "I", ""); -SYSCTL_PROC (_hw, HW_AVAILCPU, activecpu, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN, 0, HW_AVAILCPU, sysctl_hw_generic, "I", ""); -SYSCTL_PROC (_hw, OID_AUTO, physicalcpu, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN, 0, HW_LOCAL_PHYSICALCPU, sysctl_hw_generic, "I", ""); -SYSCTL_PROC (_hw, OID_AUTO, physicalcpu_max, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN, 0, HW_LOCAL_PHYSICALCPUMAX, sysctl_hw_generic, "I", ""); -SYSCTL_PROC (_hw, OID_AUTO, logicalcpu, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN, 0, HW_LOCAL_LOGICALCPU, sysctl_hw_generic, "I", ""); -SYSCTL_PROC (_hw, OID_AUTO, logicalcpu_max, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN, 0, HW_LOCAL_LOGICALCPUMAX, sysctl_hw_generic, "I", ""); -SYSCTL_INT (_hw, HW_BYTEORDER, byteorder, CTLFLAG_RD | CTLFLAG_KERN, (int *)NULL, BYTE_ORDER, ""); -SYSCTL_INT (_hw, OID_AUTO, cputype, CTLFLAG_RD | CTLFLAG_KERN, &cputype, 0, ""); -SYSCTL_INT (_hw, OID_AUTO, cpusubtype, CTLFLAG_RD | CTLFLAG_KERN, &cpusubtype, 0, ""); -SYSCTL_INT (_hw, OID_AUTO, cpu64bit_capable, CTLFLAG_RD | CTLFLAG_KERN, &cpu64bit, 0, ""); -SYSCTL_INT (_hw, OID_AUTO, cpufamily, CTLFLAG_RD | CTLFLAG_KERN, &cpufamily, 0, ""); -SYSCTL_OPAQUE (_hw, OID_AUTO, cacheconfig, CTLFLAG_RD, 
&cacheconfig, sizeof(cacheconfig), "Q", ""); -SYSCTL_OPAQUE (_hw, OID_AUTO, cachesize, CTLFLAG_RD, &cachesize, sizeof(cachesize), "Q", ""); -SYSCTL_PROC (_hw, OID_AUTO, pagesize, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN, 0, 0, sysctl_pagesize, "Q", ""); -SYSCTL_QUAD (_hw, OID_AUTO, busfrequency, CTLFLAG_RD | CTLFLAG_KERN, &gPEClockFrequencyInfo.bus_frequency_hz, ""); -SYSCTL_QUAD (_hw, OID_AUTO, busfrequency_min, CTLFLAG_RD | CTLFLAG_KERN, &gPEClockFrequencyInfo.bus_frequency_min_hz, ""); -SYSCTL_QUAD (_hw, OID_AUTO, busfrequency_max, CTLFLAG_RD | CTLFLAG_KERN, &gPEClockFrequencyInfo.bus_frequency_max_hz, ""); -SYSCTL_QUAD (_hw, OID_AUTO, cpufrequency, CTLFLAG_RD | CTLFLAG_KERN, &gPEClockFrequencyInfo.cpu_frequency_hz, ""); -SYSCTL_QUAD (_hw, OID_AUTO, cpufrequency_min, CTLFLAG_RD | CTLFLAG_KERN, &gPEClockFrequencyInfo.cpu_frequency_min_hz, ""); -SYSCTL_QUAD (_hw, OID_AUTO, cpufrequency_max, CTLFLAG_RD | CTLFLAG_KERN, &gPEClockFrequencyInfo.cpu_frequency_max_hz, ""); -SYSCTL_PROC (_hw, OID_AUTO, cachelinesize, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN, 0, HW_CACHELINE | CTLHW_RETQUAD, sysctl_hw_generic, "Q", ""); -SYSCTL_PROC (_hw, OID_AUTO, l1icachesize, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN, 0, HW_L1ICACHESIZE | CTLHW_RETQUAD, sysctl_hw_generic, "Q", ""); -SYSCTL_PROC (_hw, OID_AUTO, l1dcachesize, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN, 0, HW_L1DCACHESIZE | CTLHW_RETQUAD, sysctl_hw_generic, "Q", ""); -SYSCTL_PROC (_hw, OID_AUTO, l2cachesize, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN, 0, HW_L2CACHESIZE | CTLHW_RETQUAD, sysctl_hw_generic, "Q", ""); -SYSCTL_PROC (_hw, OID_AUTO, l3cachesize, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN, 0, HW_L3CACHESIZE | CTLHW_RETQUAD, sysctl_hw_generic, "Q", ""); -SYSCTL_PROC(_hw, OID_AUTO, tbfrequency, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN, 0, 0, sysctl_tbfrequency, "Q", ""); -SYSCTL_QUAD (_hw, HW_MEMSIZE, memsize, CTLFLAG_RD | CTLFLAG_KERN, &max_mem, ""); -SYSCTL_INT (_hw, OID_AUTO, packages, CTLFLAG_RD | CTLFLAG_KERN, 
&packages, 0, ""); +SYSCTL_PROC (_hw, HW_NCPU, ncpu, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, HW_NCPU, sysctl_hw_generic, "I", ""); +SYSCTL_PROC (_hw, HW_AVAILCPU, activecpu, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, HW_AVAILCPU, sysctl_hw_generic, "I", ""); +SYSCTL_PROC (_hw, OID_AUTO, physicalcpu, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, HW_LOCAL_PHYSICALCPU, sysctl_hw_generic, "I", ""); +SYSCTL_PROC (_hw, OID_AUTO, physicalcpu_max, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, HW_LOCAL_PHYSICALCPUMAX, sysctl_hw_generic, "I", ""); +SYSCTL_PROC (_hw, OID_AUTO, logicalcpu, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, HW_LOCAL_LOGICALCPU, sysctl_hw_generic, "I", ""); +SYSCTL_PROC (_hw, OID_AUTO, logicalcpu_max, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, HW_LOCAL_LOGICALCPUMAX, sysctl_hw_generic, "I", ""); +SYSCTL_INT (_hw, HW_BYTEORDER, byteorder, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (int *)NULL, BYTE_ORDER, ""); +SYSCTL_INT (_hw, OID_AUTO, cputype, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &cputype, 0, ""); +SYSCTL_INT (_hw, OID_AUTO, cpusubtype, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &cpusubtype, 0, ""); +SYSCTL_INT (_hw, OID_AUTO, cpu64bit_capable, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &cpu64bit, 0, ""); +SYSCTL_INT (_hw, OID_AUTO, cpufamily, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &cpufamily, 0, ""); +SYSCTL_OPAQUE (_hw, OID_AUTO, cacheconfig, CTLFLAG_RD | CTLFLAG_LOCKED, &cacheconfig, sizeof(cacheconfig), "Q", ""); +SYSCTL_OPAQUE (_hw, OID_AUTO, cachesize, CTLFLAG_RD | CTLFLAG_LOCKED, &cachesize, sizeof(cachesize), "Q", ""); +SYSCTL_PROC (_hw, OID_AUTO, pagesize, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, 0, sysctl_pagesize, "Q", ""); +SYSCTL_QUAD (_hw, OID_AUTO, busfrequency, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gPEClockFrequencyInfo.bus_frequency_hz, ""); +SYSCTL_QUAD (_hw, OID_AUTO, busfrequency_min, 
CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gPEClockFrequencyInfo.bus_frequency_min_hz, ""); +SYSCTL_QUAD (_hw, OID_AUTO, busfrequency_max, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gPEClockFrequencyInfo.bus_frequency_max_hz, ""); +SYSCTL_QUAD (_hw, OID_AUTO, cpufrequency, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gPEClockFrequencyInfo.cpu_frequency_hz, ""); +SYSCTL_QUAD (_hw, OID_AUTO, cpufrequency_min, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gPEClockFrequencyInfo.cpu_frequency_min_hz, ""); +SYSCTL_QUAD (_hw, OID_AUTO, cpufrequency_max, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gPEClockFrequencyInfo.cpu_frequency_max_hz, ""); +SYSCTL_PROC (_hw, OID_AUTO, cachelinesize, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, HW_CACHELINE | CTLHW_RETQUAD, sysctl_hw_generic, "Q", ""); +SYSCTL_PROC (_hw, OID_AUTO, l1icachesize, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, HW_L1ICACHESIZE | CTLHW_RETQUAD, sysctl_hw_generic, "Q", ""); +SYSCTL_PROC (_hw, OID_AUTO, l1dcachesize, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, HW_L1DCACHESIZE | CTLHW_RETQUAD, sysctl_hw_generic, "Q", ""); +SYSCTL_PROC (_hw, OID_AUTO, l2cachesize, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, HW_L2CACHESIZE | CTLHW_RETQUAD, sysctl_hw_generic, "Q", ""); +SYSCTL_PROC (_hw, OID_AUTO, l3cachesize, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, HW_L3CACHESIZE | CTLHW_RETQUAD, sysctl_hw_generic, "Q", ""); +SYSCTL_PROC(_hw, OID_AUTO, tbfrequency, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, 0, sysctl_tbfrequency, "Q", ""); +SYSCTL_QUAD (_hw, HW_MEMSIZE, memsize, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &max_mem, ""); +SYSCTL_INT (_hw, OID_AUTO, packages, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &packages, 0, ""); /* * Optional features can register nodes below hw.optional. 
@@ -372,7 +372,7 @@ SYSCTL_INT (_hw, OID_AUTO, packages, CTLFLAG_RD | CTLFLAG_KERN, &packages, 0 */ SYSCTL_NODE(_hw, OID_AUTO, optional, CTLFLAG_RW|CTLFLAG_LOCKED, NULL, "optional features"); -SYSCTL_INT(_hw_optional, OID_AUTO, floatingpoint, CTLFLAG_RD | CTLFLAG_KERN, (int *)NULL, 1, ""); /* always set */ +SYSCTL_INT(_hw_optional, OID_AUTO, floatingpoint, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (int *)NULL, 1, ""); /* always set */ /* * Deprecated variables. These are supported for backwards compatibility @@ -386,44 +386,26 @@ SYSCTL_INT(_hw_optional, OID_AUTO, floatingpoint, CTLFLAG_RD | CTLFLAG_KERN, (in * * The *_compat nodes are *NOT* visible within the kernel. */ -SYSCTL_COMPAT_INT (_hw, HW_PAGESIZE, pagesize_compat, CTLFLAG_RD | CTLFLAG_MASKED, &page_size, 0, ""); -SYSCTL_COMPAT_INT (_hw, HW_BUS_FREQ, busfrequency_compat, CTLFLAG_RD | CTLFLAG_MASKED, &gPEClockFrequencyInfo.bus_clock_rate_hz, 0, ""); -SYSCTL_COMPAT_INT (_hw, HW_CPU_FREQ, cpufrequency_compat, CTLFLAG_RD | CTLFLAG_MASKED, &gPEClockFrequencyInfo.cpu_clock_rate_hz, 0, ""); -SYSCTL_PROC(_hw, HW_CACHELINE, cachelinesize_compat, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED, 0, HW_CACHELINE, sysctl_hw_generic, "I", ""); -SYSCTL_PROC(_hw, HW_L1ICACHESIZE, l1icachesize_compat, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED, 0, HW_L1ICACHESIZE, sysctl_hw_generic, "I", ""); -SYSCTL_PROC(_hw, HW_L1DCACHESIZE, l1dcachesize_compat, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED, 0, HW_L1DCACHESIZE, sysctl_hw_generic, "I", ""); -SYSCTL_PROC(_hw, HW_L2CACHESIZE, l2cachesize_compat, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED, 0, HW_L2CACHESIZE, sysctl_hw_generic, "I", ""); -SYSCTL_PROC(_hw, HW_L3CACHESIZE, l3cachesize_compat, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED, 0, HW_L3CACHESIZE, sysctl_hw_generic, "I", ""); -SYSCTL_COMPAT_INT (_hw, HW_TB_FREQ, tbfrequency_compat, CTLFLAG_RD | CTLFLAG_MASKED, &gPEClockFrequencyInfo.timebase_frequency_hz, 0, ""); -SYSCTL_PROC(_hw, HW_MACHINE, machine, CTLTYPE_STRING | CTLFLAG_RD | 
CTLFLAG_MASKED, 0, HW_MACHINE, sysctl_hw_generic, "A", ""); -SYSCTL_PROC(_hw, HW_MODEL, model, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MASKED, 0, HW_MODEL, sysctl_hw_generic, "A", ""); -SYSCTL_COMPAT_UINT(_hw, HW_PHYSMEM, physmem, CTLFLAG_RD | CTLFLAG_MASKED, &mem_size, 0, ""); -SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED, 0, HW_USERMEM, sysctl_hw_generic, "I", ""); -SYSCTL_PROC(_hw, HW_EPOCH, epoch, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED, 0, HW_EPOCH, sysctl_hw_generic, "I", ""); -SYSCTL_PROC(_hw, HW_VECTORUNIT, vectorunit, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED, 0, HW_VECTORUNIT, sysctl_hw_generic, "I", ""); -SYSCTL_PROC(_hw, HW_L2SETTINGS, l2settings, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED, 0, HW_L2SETTINGS, sysctl_hw_generic, "I", ""); -SYSCTL_PROC(_hw, HW_L3SETTINGS, l3settings, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED, 0, HW_L3SETTINGS, sysctl_hw_generic, "I", ""); -SYSCTL_INT (_hw, OID_AUTO, cputhreadtype, CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN, &cputhreadtype, 0, ""); - -#ifdef __ppc__ -int altivec_flag = -1; -int graphicsops_flag = -1; -int x64bitops_flag = -1; -int fsqrt_flag = -1; -int stfiwx_flag = -1; -int dcba_flag = -1; -int datastreams_flag = -1; -int dcbtstreams_flag = -1; - -SYSCTL_INT(_hw_optional, OID_AUTO, altivec, CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN, &altivec_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, graphicsops, CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN, &graphicsops_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, 64bitops, CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN, &x64bitops_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, fsqrt, CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN, &fsqrt_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, stfiwx, CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN, &stfiwx_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, dcba, CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN, &dcba_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, datastreams, CTLFLAG_RD | 
CTLFLAG_NOAUTO | CTLFLAG_KERN, &datastreams_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, dcbtstreams, CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN, &dcbtstreams_flag, 0, ""); -#elif defined (__i386__) || defined (__x86_64__) +SYSCTL_COMPAT_INT (_hw, HW_PAGESIZE, pagesize_compat, CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, &page_size, 0, ""); +SYSCTL_COMPAT_INT (_hw, HW_BUS_FREQ, busfrequency_compat, CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, &gPEClockFrequencyInfo.bus_clock_rate_hz, 0, ""); +SYSCTL_COMPAT_INT (_hw, HW_CPU_FREQ, cpufrequency_compat, CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, &gPEClockFrequencyInfo.cpu_clock_rate_hz, 0, ""); +SYSCTL_PROC(_hw, HW_CACHELINE, cachelinesize_compat, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, 0, HW_CACHELINE, sysctl_hw_generic, "I", ""); +SYSCTL_PROC(_hw, HW_L1ICACHESIZE, l1icachesize_compat, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, 0, HW_L1ICACHESIZE, sysctl_hw_generic, "I", ""); +SYSCTL_PROC(_hw, HW_L1DCACHESIZE, l1dcachesize_compat, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, 0, HW_L1DCACHESIZE, sysctl_hw_generic, "I", ""); +SYSCTL_PROC(_hw, HW_L2CACHESIZE, l2cachesize_compat, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, 0, HW_L2CACHESIZE, sysctl_hw_generic, "I", ""); +SYSCTL_PROC(_hw, HW_L3CACHESIZE, l3cachesize_compat, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, 0, HW_L3CACHESIZE, sysctl_hw_generic, "I", ""); +SYSCTL_COMPAT_INT (_hw, HW_TB_FREQ, tbfrequency_compat, CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, &gPEClockFrequencyInfo.timebase_frequency_hz, 0, ""); +SYSCTL_PROC(_hw, HW_MACHINE, machine, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, 0, HW_MACHINE, sysctl_hw_generic, "A", ""); +SYSCTL_PROC(_hw, HW_MODEL, model, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, 0, HW_MODEL, sysctl_hw_generic, "A", ""); +SYSCTL_COMPAT_UINT(_hw, HW_PHYSMEM, physmem, CTLFLAG_RD | CTLFLAG_MASKED | 
CTLFLAG_LOCKED, &mem_size, 0, ""); +SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, 0, HW_USERMEM, sysctl_hw_generic, "I", ""); +SYSCTL_PROC(_hw, HW_EPOCH, epoch, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, 0, HW_EPOCH, sysctl_hw_generic, "I", ""); +SYSCTL_PROC(_hw, HW_VECTORUNIT, vectorunit, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, 0, HW_VECTORUNIT, sysctl_hw_generic, "I", ""); +SYSCTL_PROC(_hw, HW_L2SETTINGS, l2settings, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, 0, HW_L2SETTINGS, sysctl_hw_generic, "I", ""); +SYSCTL_PROC(_hw, HW_L3SETTINGS, l3settings, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, 0, HW_L3SETTINGS, sysctl_hw_generic, "I", ""); +SYSCTL_INT (_hw, OID_AUTO, cputhreadtype, CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, &cputhreadtype, 0, ""); + +#if defined (__i386__) || defined (__x86_64__) int mmx_flag = -1; int sse_flag = -1; int sse2_flag = -1; @@ -433,22 +415,27 @@ int sse4_2_flag = -1; int x86_64_flag = -1; int supplementalsse3_flag = -1; int aes_flag = -1; - -SYSCTL_INT(_hw_optional, OID_AUTO, mmx, CTLFLAG_RD | CTLFLAG_KERN, &mmx_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, sse, CTLFLAG_RD | CTLFLAG_KERN, &sse_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, sse2, CTLFLAG_RD | CTLFLAG_KERN, &sse2_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, sse3, CTLFLAG_RD | CTLFLAG_KERN, &sse3_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, supplementalsse3, CTLFLAG_RD | CTLFLAG_KERN, &supplementalsse3_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, sse4_1, CTLFLAG_RD | CTLFLAG_KERN, &sse4_1_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, sse4_2, CTLFLAG_RD | CTLFLAG_KERN, &sse4_2_flag, 0, ""); +int avx1_0_flag = -1; + +SYSCTL_INT(_hw_optional, OID_AUTO, mmx, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &mmx_flag, 0, ""); +SYSCTL_INT(_hw_optional, OID_AUTO, sse, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 
&sse_flag, 0, ""); +SYSCTL_INT(_hw_optional, OID_AUTO, sse2, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &sse2_flag, 0, ""); +SYSCTL_INT(_hw_optional, OID_AUTO, sse3, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &sse3_flag, 0, ""); +SYSCTL_INT(_hw_optional, OID_AUTO, supplementalsse3, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &supplementalsse3_flag, 0, ""); +SYSCTL_INT(_hw_optional, OID_AUTO, sse4_1, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &sse4_1_flag, 0, ""); +SYSCTL_INT(_hw_optional, OID_AUTO, sse4_2, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &sse4_2_flag, 0, ""); /* "x86_64" is actually a preprocessor symbol on the x86_64 kernel, so we have to hack this */ #undef x86_64 -SYSCTL_INT(_hw_optional, OID_AUTO, x86_64, CTLFLAG_RD | CTLFLAG_KERN, &x86_64_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, aes, CTLFLAG_RD | CTLFLAG_KERN, &aes_flag, 0, ""); -#endif /* __ppc__ */ +SYSCTL_INT(_hw_optional, OID_AUTO, x86_64, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &x86_64_flag, 0, ""); +SYSCTL_INT(_hw_optional, OID_AUTO, aes, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &aes_flag, 0, ""); +SYSCTL_INT(_hw_optional, OID_AUTO, avx1_0, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &avx1_0_flag, 0, ""); +#endif /* !__i386__ && !__x86_64 && !__arm__ */ /* * Debugging interface to the CPU power management code. + * + * Note: Does not need locks because it disables interrupts over + * the call. 
*/ static int pmsSysctl(__unused struct sysctl_oid *oidp, __unused void *arg1, @@ -468,7 +455,7 @@ pmsSysctl(__unused struct sysctl_oid *oidp, __unused void *arg1, return(error); } -SYSCTL_PROC(_hw, OID_AUTO, pms, CTLTYPE_STRUCT | CTLFLAG_WR, 0, 0, pmsSysctl, "S", "Processor Power Management"); +SYSCTL_PROC(_hw, OID_AUTO, pms, CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_LOCKED, 0, 0, pmsSysctl, "S", "Processor Power Management"); @@ -484,9 +471,7 @@ sysctl_mib_init(void) cputype = cpu_type(); cpusubtype = cpu_subtype(); cputhreadtype = cpu_threadtype(); -#if defined(__ppc__) - cpu64bit = (_cpu_capabilities & k64Bit) == k64Bit; -#elif defined(__i386__) || defined (__x86_64__) +#if defined(__i386__) || defined (__x86_64__) cpu64bit = (_get_cpu_capabilities() & k64Bit) == k64Bit; #endif @@ -502,114 +487,18 @@ sysctl_mib_init(void) sysctl_register_oid(&sysctl__hw_cputhreadtype); } -#ifdef __ppc__ -/* - * The convention for these is as follows: - * If the sysctl does not exist, the functionality is not present in the CPU. - * If the sysctl exists, it will not crash, and should otherwise function - * corectly. - * If the sysctl exists and returns 0, we advise against using this feature. - * If the sysctl exists and returns 1, we advise it's use. 
- */ - - if (_cpu_capabilities & kHasAltivec) { - altivec_flag = 1; - sysctl_register_oid(&sysctl__hw_optional_altivec); - } - if (_cpu_capabilities & kHasGraphicsOps) { - graphicsops_flag = 1; - sysctl_register_oid(&sysctl__hw_optional_graphicsops); - } - if (_cpu_capabilities & k64Bit) { - x64bitops_flag = 1; - sysctl_register_oid(&sysctl__hw_optional_64bitops); - } - if (_cpu_capabilities & kHasFsqrt) { - fsqrt_flag = 1; - sysctl_register_oid(&sysctl__hw_optional_fsqrt); - } - if (_cpu_capabilities & kHasStfiwx) { - stfiwx_flag = 1; - sysctl_register_oid(&sysctl__hw_optional_stfiwx); - } - if (_cpu_capabilities & kDcbaAvailable) - dcba_flag = 0; - if (_cpu_capabilities & kDcbaRecommended) - dcba_flag = 1; - if (dcba_flag >= 0) - sysctl_register_oid(&sysctl__hw_optional_dcba); - if (_cpu_capabilities & kDataStreamsAvailable) - datastreams_flag = 0; - if (_cpu_capabilities & kDataStreamsRecommended) - datastreams_flag = 1; - if (datastreams_flag >= 0) - sysctl_register_oid(&sysctl__hw_optional_datastreams); - if (_cpu_capabilities & kDcbtStreamsAvailable) - dcbtstreams_flag = 0; - if (_cpu_capabilities & kDcbtStreamsRecommended) - dcbtstreams_flag = 1; - if (dcbtstreams_flag >= 0) - sysctl_register_oid(&sysctl__hw_optional_dcbtstreams); - - /* hw.cpufamily */ - switch (cpusubtype) { - case CPU_SUBTYPE_POWERPC_750: - cpufamily = CPUFAMILY_POWERPC_G3; - break; - case CPU_SUBTYPE_POWERPC_7400: - case CPU_SUBTYPE_POWERPC_7450: - cpufamily = CPUFAMILY_POWERPC_G4; - break; - case CPU_SUBTYPE_POWERPC_970: - cpufamily = CPUFAMILY_POWERPC_G5; - break; - default: - cpufamily = CPUFAMILY_UNKNOWN; - } - - ml_cpu_info_t cpu_info; - ml_cpu_get_info(&cpu_info); - - host_basic_info_data_t hinfo; - mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT; - kern_return_t kret = host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count); - if(kret != KERN_SUCCESS) - { - hinfo.max_cpus = 1; - } - - /* hw.cachesize */ - cachesize[0] = max_mem; - cachesize[1] = 
cpu_info.l1_dcache_size; - cachesize[2] = cpu_info.l2_settings ? cpu_info.l2_cache_size : 0; - cachesize[3] = cpu_info.l3_settings ? cpu_info.l3_cache_size : 0; - cachesize[4] = 0; - - /* hw.cacheconfig */ - cacheconfig[0] = hinfo.max_cpus; - cacheconfig[1] = 1; - cacheconfig[2] = cachesize[2] ? 1 : 0; - cacheconfig[3] = cachesize[3] ? 1 : 0; - cacheconfig[4] = 0; - - /* hw.packages */ - if (cpusubtype == CPU_SUBTYPE_POWERPC_970 && - cpu_info.l2_cache_size == 1 * 1024 * 1024) - /* The signature of the dual-core G5 */ - packages = roundup(hinfo.max_cpus, 2) / 2; - else - packages = hinfo.max_cpus; - -#elif defined (__i386__) || defined (__x86_64__) - mmx_flag = ((_get_cpu_capabilities() & kHasMMX) == kHasMMX)? 1 : 0; - sse_flag = ((_get_cpu_capabilities() & kHasSSE) == kHasSSE)? 1 : 0; - sse2_flag = ((_get_cpu_capabilities() & kHasSSE2) == kHasSSE2)? 1 : 0; - sse3_flag = ((_get_cpu_capabilities() & kHasSSE3) == kHasSSE3)? 1 : 0; - supplementalsse3_flag = ((_get_cpu_capabilities() & kHasSupplementalSSE3) == kHasSupplementalSSE3)? 1 : 0; - sse4_1_flag = ((_get_cpu_capabilities() & kHasSSE4_1) == kHasSSE4_1)? 1 : 0; - sse4_2_flag = ((_get_cpu_capabilities() & kHasSSE4_2) == kHasSSE4_2)? 1 : 0; - x86_64_flag = ((_get_cpu_capabilities() & k64Bit) == k64Bit)? 1 : 0; - aes_flag = ((_get_cpu_capabilities() & kHasAES) == kHasAES)? 1 : 0; +#if defined (__i386__) || defined (__x86_64__) +#define is_capability_set(k) (((_get_cpu_capabilities() & (k)) == (k)) ? 
1 : 0) + mmx_flag = is_capability_set(kHasMMX); + sse_flag = is_capability_set(kHasSSE); + sse2_flag = is_capability_set(kHasSSE2); + sse3_flag = is_capability_set(kHasSSE3); + supplementalsse3_flag = is_capability_set(kHasSupplementalSSE3); + sse4_1_flag = is_capability_set(kHasSSE4_1); + sse4_2_flag = is_capability_set(kHasSSE4_2); + x86_64_flag = is_capability_set(k64Bit); + aes_flag = is_capability_set(kHasAES); + avx1_0_flag = is_capability_set(kHasAVX1_0); /* hw.cpufamily */ cpufamily = cpuid_cpufamily(); @@ -633,7 +522,7 @@ sysctl_mib_init(void) / cpuid_info()->thread_count; #else /* end __arm__ */ -# warning we do not support this platform yet -#endif /* __ppc__ */ +# error unknown architecture +#endif /* !__i386__ && !__x86_64 && !__arm__ */ } diff --git a/bsd/kern/kern_mman.c b/bsd/kern/kern_mman.c index 6da43d2fd..979af3e5d 100644 --- a/bsd/kern/kern_mman.c +++ b/bsd/kern/kern_mman.c @@ -95,6 +95,9 @@ #include #include #include +#if CONFIG_PROTECT +#include +#endif #include #include @@ -156,6 +159,7 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) int fpref=0; int error =0; int fd = uap->fd; + int num_retries = 0; user_addr = (mach_vm_offset_t)uap->addr; user_size = (mach_vm_size_t) uap->len; @@ -203,7 +207,9 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) user_size += pageoff; /* low end... */ user_size = mach_vm_round_page(user_size); /* hi end */ - + if ((flags & MAP_JIT) && ((flags & MAP_FIXED) || (flags & MAP_SHARED) || (flags & MAP_FILE))){ + return EINVAL; + } /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). 
@@ -216,7 +222,7 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) */ user_addr -= pageoff; if (user_addr & PAGE_MASK) - return (EINVAL); + return (EINVAL); } #ifdef notyet /* DO not have apis to get this info, need to wait till then*/ @@ -236,6 +242,19 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) alloc_flags = 0; if (flags & MAP_ANON) { + + maxprot = VM_PROT_ALL; +#if CONFIG_MACF + /* + * Entitlement check. + * Re-enable once mac* is implemented. + */ + /*error = mac_proc_check_map_anon(p, user_addr, user_size, prot, flags, &maxprot); + if (error) { + return EINVAL; + }*/ +#endif /* MAC */ + /* * Mapping blank space is trivial. Use positive fds as the alias * value for memory tracking. @@ -245,7 +264,7 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) * Use "fd" to pass (some) Mach VM allocation flags, * (see the VM_FLAGS_* definitions). */ - alloc_flags = fd & (VM_FLAGS_ALIAS_MASK | + alloc_flags = fd & (VM_FLAGS_ALIAS_MASK | VM_FLAGS_SUPERPAGE_MASK | VM_FLAGS_PURGABLE); if (alloc_flags != fd) { /* reject if there are any extra flags */ @@ -254,7 +273,6 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) } handle = NULL; - maxprot = VM_PROT_ALL; file_pos = 0; mapanon = 1; } else { @@ -382,6 +400,21 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) goto bad; } #endif /* MAC */ + +#if CONFIG_PROTECT + { + void *cnode; + if ((cnode = cp_get_protected_cnode(vp)) != NULL) { + error = cp_handle_vnop(cnode, CP_READ_ACCESS | CP_WRITE_ACCESS); + if (error) { + (void) vnode_put(vp); + goto bad; + } + } + } +#endif /* CONFIG_PROTECT */ + + } } @@ -434,6 +467,9 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) if (flags & MAP_NOCACHE) alloc_flags |= VM_FLAGS_NO_CACHE; + if (flags & MAP_JIT){ + alloc_flags |= VM_FLAGS_MAP_JIT; + } /* * Lookup/allocate object. 
*/ @@ -455,7 +491,7 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) if (maxprot & (VM_PROT_EXECUTE | VM_PROT_WRITE)) maxprot |= VM_PROT_READ; #endif /* radar 3777787 */ - +map_anon_retry: result = vm_map_enter_mem_object(user_map, &user_addr, user_size, 0, alloc_flags, @@ -464,6 +500,16 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) (flags & MAP_SHARED) ? VM_INHERIT_SHARE : VM_INHERIT_DEFAULT); + + /* If a non-binding address was specified for this anonymous + * mapping, retry the mapping with a zero base + * in the event the mapping operation failed due to + * lack of space between the address and the map's maximum. + */ + if ((result == KERN_NO_SPACE) && ((flags & MAP_FIXED) == 0) && user_addr && (num_retries++ == 0)) { + user_addr = PAGE_SIZE; + goto map_anon_retry; + } } else { if (vnode_isswap(vp)) { /* @@ -514,7 +560,7 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) if (maxprot & (VM_PROT_EXECUTE | VM_PROT_WRITE)) maxprot |= VM_PROT_READ; #endif /* radar 3777787 */ - +map_file_retry: result = vm_map_enter_mem_object_control(user_map, &user_addr, user_size, 0, alloc_flags, @@ -523,6 +569,16 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) (flags & MAP_SHARED) ? VM_INHERIT_SHARE : VM_INHERIT_DEFAULT); + + /* If a non-binding address was specified for this file backed + * mapping, retry the mapping with a zero base + * in the event the mapping operation failed due to + * lack of space between the address and the map's maximum. 
+ */ + if ((result == KERN_NO_SPACE) && ((flags & MAP_FIXED) == 0) && user_addr && (num_retries++ == 0)) { + user_addr = PAGE_SIZE; + goto map_file_retry; + } } if (!mapanon) { @@ -855,13 +911,15 @@ madvise(__unused proc_t p, struct madvise_args *uap, __unused int32_t *retval) result = mach_vm_behavior_set(user_map, start, size, new_behavior); switch (result) { - case KERN_SUCCESS: - return (0); - case KERN_INVALID_ADDRESS: - return (ENOMEM); + case KERN_SUCCESS: + return 0; + case KERN_INVALID_ADDRESS: + return EINVAL; + case KERN_NO_SPACE: + return ENOMEM; } - return (EINVAL); + return EINVAL; } int @@ -1034,6 +1092,7 @@ munlockall(__unused proc_t p, __unused struct munlockall_args *uap, __unused int return(ENOSYS); } +#if !defined(CONFIG_EMBEDDED) /* USV: No! need to obsolete map_fd()! mmap() already supports 64 bits */ kern_return_t map_fd(struct map_fd_args *args) @@ -1070,6 +1129,7 @@ map_fd_funneled( vm_offset_t map_addr=0; vm_size_t map_size; int err=0; + vm_prot_t maxprot = VM_PROT_ALL; vm_map_t my_map; proc_t p = current_proc(); struct vnode_attr vattr; @@ -1103,6 +1163,29 @@ map_fd_funneled( goto bad; } +#if CONFIG_MACF + err = mac_file_check_mmap(vfs_context_ucred(vfs_context_current()), + fp->f_fglob, VM_PROT_DEFAULT, MAP_FILE, &maxprot); + if (err) { + (void)vnode_put(vp); + goto bad; + } +#endif /* MAC */ + +#if CONFIG_PROTECT + /* check for content protection access */ + { + void *cnode; + if ((cnode = cp_get_protected_cnode(vp)) != NULL) { + err = cp_handle_vnop(cnode, CP_READ_ACCESS | CP_WRITE_ACCESS); + if (err != 0) { + (void)vnode_put(vp); + goto bad; + } + } + } +#endif /* CONFIG_PROTECT */ + AUDIT_ARG(vnpath, vp, ARG_VNODE1); /* @@ -1148,7 +1231,7 @@ map_fd_funneled( my_map, &map_addr, map_size, (vm_offset_t)0, VM_FLAGS_ANYWHERE, pager, offset, TRUE, - VM_PROT_DEFAULT, VM_PROT_ALL, + VM_PROT_DEFAULT, maxprot, VM_INHERIT_DEFAULT); if (result != KERN_SUCCESS) { (void)vnode_put(vp); @@ -1213,4 +1296,5 @@ map_fd_funneled( fp_drop(p, fd, fp, 0); 
return (err); } +#endif /* !defined(CONFIG_EMBEDDED) */ diff --git a/bsd/kern/kern_newsysctl.c b/bsd/kern/kern_newsysctl.c index de083965e..2d872d54c 100644 --- a/bsd/kern/kern_newsysctl.c +++ b/bsd/kern/kern_newsysctl.c @@ -86,20 +86,51 @@ struct sysctl_oid_list sysctl__sysctl_children; lck_rw_t * sysctl_geometry_lock = NULL; -static void -sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i); +/* + * Conditionally allow dtrace to see these functions for debugging purposes. + */ +#ifdef STATIC +#undef STATIC +#endif +#if 0 +#define STATIC +#else +#define STATIC static +#endif + +/* forward declarations of static functions */ +STATIC funnel_t *spl_kernel_funnel(void); +STATIC void sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i); +STATIC int sysctl_sysctl_debug(struct sysctl_oid *oidp, void *arg1, + int arg2, struct sysctl_req *req); +STATIC int sysctl_sysctl_name(struct sysctl_oid *oidp, void *arg1, + int arg2, struct sysctl_req *req); +STATIC int sysctl_sysctl_next_ls (struct sysctl_oid_list *lsp, + int *name, u_int namelen, int *next, int *len, int level, + struct sysctl_oid **oidpp); +STATIC int sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l); +STATIC int sysctl_new_kernel(struct sysctl_req *req, void *p, size_t l); +STATIC int name2oid (char *name, int *oid, int *len); +STATIC int sysctl_sysctl_name2oid(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_sysctl_next(struct sysctl_oid *oidp, void *arg1, int arg2, + struct sysctl_req *req); +STATIC int sysctl_sysctl_oidfmt(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC void splx_kernel_funnel(funnel_t *saved); +STATIC int sysctl_old_user(struct sysctl_req *req, const void *p, size_t l); +STATIC int sysctl_new_user(struct sysctl_req *req, void *p, size_t l); +STATIC int sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, size_t newlen); +STATIC int sysctlnametomib(const char 
*name, int *mibp, size_t *sizep); /* * Locking and stats */ -static struct sysctl_lock memlock; /* * XXX this does not belong here */ -static funnel_t * +STATIC funnel_t * spl_kernel_funnel(void) { funnel_t *cfunnel; @@ -113,7 +144,7 @@ spl_kernel_funnel(void) return(cfunnel); } -static void +STATIC void splx_kernel_funnel(funnel_t *saved) { if (saved != kernel_flock) { @@ -123,7 +154,7 @@ splx_kernel_funnel(funnel_t *saved) } } -static int sysctl_root SYSCTL_HANDLER_ARGS; +STATIC int sysctl_root SYSCTL_HANDLER_ARGS; struct sysctl_oid_list sysctl__children; /* root list */ @@ -133,21 +164,65 @@ struct sysctl_oid_list sysctl__children; /* root list */ * Order by number in each list. */ -void sysctl_register_oid(struct sysctl_oid *oidp) +void +sysctl_register_oid(struct sysctl_oid *new_oidp) { - struct sysctl_oid_list *parent = oidp->oid_parent; + struct sysctl_oid *oidp = NULL; + struct sysctl_oid_list *parent = new_oidp->oid_parent; struct sysctl_oid *p; struct sysctl_oid *q; int n; - funnel_t *fnl; + funnel_t *fnl = NULL; /* compiler doesn't notice CTLFLAG_LOCKED */ + + /* + * The OID can be old-style (needs copy), new style without an earlier + * version (also needs copy), or new style with a matching version (no + * copy needed). Later versions are rejected (presumably, the OID + * structure was changed for a necessary reason). + */ + if (!(new_oidp->oid_kind & CTLFLAG_OID2)) { + /* + * XXX: M_TEMP is perhaps not the most appropriate zone, as it + * XXX: will subject us to use-after-free by other consumers. + */ + MALLOC(oidp, struct sysctl_oid *, sizeof(*oidp), M_TEMP, M_WAITOK | M_ZERO); + if (oidp == NULL) + return; /* reject: no memory */ + + /* + * Copy the structure only through the oid_fmt field, which + * is the last field in a non-OID2 OID structure. + * + * Note: We may want to set the oid_descr to the + * oid_name (or "") at some future date. 
+ */ + memcpy(oidp, new_oidp, offsetof(struct sysctl_oid, oid_descr)); + } else { + /* It's a later version; handle the versions we know about */ + switch (new_oidp->oid_version) { + case SYSCTL_OID_VERSION: + /* current version */ + oidp = new_oidp; + break; + default: + return; /* rejects unknown version */ + } + } - fnl = spl_kernel_funnel(); + /* + * If it's a locked OID being registered, we can assume that the + * caller is doing their own reentrancy locking before calling us. + */ + if (!(oidp->oid_kind & CTLFLAG_LOCKED)) + fnl = spl_kernel_funnel(); if(sysctl_geometry_lock == NULL) { - /* Initialise the geometry lock for reading/modifying the sysctl tree - * This is done here because IOKit registers some sysctls before bsd_init() - * calls sysctl_register_fixed(). + /* + * Initialise the geometry lock for reading/modifying the + * sysctl tree. This is done here because IOKit registers + * some sysctl's before bsd_init() calls + * sysctl_register_fixed(). */ lck_grp_t* lck_grp = lck_grp_alloc_init("sysctl", NULL); @@ -169,6 +244,12 @@ void sysctl_register_oid(struct sysctl_oid *oidp) n = p->oid_number; } oidp->oid_number = n + 1; + /* + * Reflect the number in an allocated OID into the template + * of the caller for sysctl_unregister_oid() compares. 
+ */ + if (oidp != new_oidp) + new_oidp->oid_number = oidp->oid_number; } /* @@ -188,30 +269,83 @@ void sysctl_register_oid(struct sysctl_oid *oidp) /* Release the write lock */ lck_rw_unlock_exclusive(sysctl_geometry_lock); - splx_kernel_funnel(fnl); + if (!(oidp->oid_kind & CTLFLAG_LOCKED)) + splx_kernel_funnel(fnl); } -void sysctl_unregister_oid(struct sysctl_oid *oidp) +void +sysctl_unregister_oid(struct sysctl_oid *oidp) { - funnel_t *fnl; + struct sysctl_oid *removed_oidp = NULL; /* OID removed from tree */ + struct sysctl_oid *old_oidp = NULL; /* OID compatibility copy */ + funnel_t *fnl = NULL; /* compiler doesn't notice CTLFLAG_LOCKED */ - fnl = spl_kernel_funnel(); + if (!(oidp->oid_kind & CTLFLAG_LOCKED)) + fnl = spl_kernel_funnel(); /* Get the write lock to modify the geometry */ lck_rw_lock_exclusive(sysctl_geometry_lock); - SLIST_REMOVE(oidp->oid_parent, oidp, sysctl_oid, oid_link); + if (!(oidp->oid_kind & CTLFLAG_OID2)) { + /* + * We're using a copy so we can get the new fields in an + * old structure, so we have to iterate to compare the + * partial structure; when we find a match, we remove it + * normally and free the memory. + */ + SLIST_FOREACH(old_oidp, oidp->oid_parent, oid_link) { + if (!memcmp(&oidp->oid_number, &old_oidp->oid_number, (offsetof(struct sysctl_oid, oid_descr)-offsetof(struct sysctl_oid, oid_number)))) { + break; + } + } + if (old_oidp != NULL) { + SLIST_REMOVE(old_oidp->oid_parent, old_oidp, sysctl_oid, oid_link); + removed_oidp = old_oidp; + } + } else { + /* It's a later version; handle the versions we know about */ + switch (oidp->oid_version) { + case SYSCTL_OID_VERSION: + /* We can just remove the OID directly... 
*/ + SLIST_REMOVE(oidp->oid_parent, oidp, sysctl_oid, oid_link); + removed_oidp = oidp; + break; + default: + /* XXX: Can't happen; probably tree corruption.*/ + break; /* rejects unknown version */ + } + } + + /* + * We've removed it from the list at this point, but we don't want + * to return to the caller until all handler references have drained + * out. Doing things in this order prevents other people coming in + * and starting new operations against the OID node we want removed. + * + * Note: removed_oidp could be NULL if it wasn't found. + */ + while(removed_oidp && removed_oidp->oid_refcnt) { + lck_rw_sleep(sysctl_geometry_lock, LCK_SLEEP_EXCLUSIVE, &removed_oidp->oid_refcnt, THREAD_UNINT); + } /* Release the write lock */ lck_rw_unlock_exclusive(sysctl_geometry_lock); - splx_kernel_funnel(fnl); + /* If it was allocated, free it after dropping the lock */ + if (old_oidp != NULL) { + FREE(old_oidp, M_TEMP); + } + + /* And drop the funnel interlock, if needed */ + if (!(oidp->oid_kind & CTLFLAG_LOCKED)) + splx_kernel_funnel(fnl); } /* * Bulk-register all the oids in a linker_set. */ -void sysctl_register_set(const char *set) +void +sysctl_register_set(const char *set) { struct sysctl_oid **oidpp, *oidp; @@ -223,7 +357,8 @@ void sysctl_register_set(const char *set) } } -void sysctl_unregister_set(const char *set) +void +sysctl_unregister_set(const char *set) { struct sysctl_oid **oidpp, *oidp; @@ -401,7 +536,32 @@ int sysctl_io_opaque(struct sysctl_req *req,void *pValue, size_t valueSize, int * {0,4,...} return the kind & format info for the "..." OID. 
*/ -static void +/* + * sysctl_sysctl_debug_dump_node + * + * Description: Dump debug information for a given sysctl_oid_list at the + * given oid depth out to the kernel log, via printf + * + * Parameters: l sysctl_oid_list pointer + * i current node depth + * + * Returns: (void) + * + * Implicit: kernel log, modified + * + * Locks: Assumes sysctl_geometry_lock is held prior to calling + * + * Notes: This function may call itself recursively to resolve Node + * values, which potentially have an inferior sysctl_oid_list + * + * This function is only callable indirectly via the function + * sysctl_sysctl_debug() + * + * Bugs: The node depth indentation does not work; this may be an + * artifact of leading space removal by the log daemon itself + * or some intermediate routine. + */ +STATIC void sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i) { int k; @@ -414,7 +574,8 @@ sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i) printf("%d %s ", oidp->oid_number, oidp->oid_name); - printf("%c%c", + printf("%c%c%c", + oidp->oid_kind & CTLFLAG_LOCKED ? 'L':' ', oidp->oid_kind & CTLFLAG_RD ? 'R':' ', oidp->oid_kind & CTLFLAG_WR ? 'W':' '); @@ -439,18 +600,83 @@ sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i) } } -static int +/* + * sysctl_sysctl_debug + * + * Description: This function implements the "sysctl.debug" portion of the + * OID space for sysctl. 
+ * + * OID: 0, 0 + * + * Parameters: __unused + * + * Returns: ENOENT + * + * Implicit: kernel log, modified + * + * Locks: Acquires and then releases a read lock on the + * sysctl_geometry_lock + */ +STATIC int sysctl_sysctl_debug(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, __unused struct sysctl_req *req) { + lck_rw_lock_shared(sysctl_geometry_lock); sysctl_sysctl_debug_dump_node(&sysctl__children, 0); + lck_rw_done(sysctl_geometry_lock); return ENOENT; } -SYSCTL_PROC(_sysctl, 0, debug, CTLTYPE_STRING|CTLFLAG_RD, +SYSCTL_PROC(_sysctl, 0, debug, CTLTYPE_STRING|CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_sysctl_debug, "-", ""); -static int +/* + * sysctl_sysctl_name + * + * Description: Convert an OID into a string name; this is used by the user + * space sysctl() command line utility; this is done in a purely + * advisory capacity (e.g. to provide node names for "sysctl -A" + * output). + * + * OID: 0, 1 + * + * Parameters: oidp __unused + * arg1 A pointer to the OID name list + * integer array, beginning at + * adjusted option base 2 + * arg2 The number of elements which + * remain in the name array + * + * Returns: 0 Success + * SYSCTL_OUT:EPERM Permission denied + * SYSCTL_OUT:EFAULT Bad user supplied buffer + * SYSCTL_OUT:??? Return value from user function + * for SYSCTL_PROC leaf node + * + * Implict: Contents of user request buffer, modified + * + * Locks: Acquires and then releases a read lock on the + * sysctl_geometry_lock + * + * Notes: SPI (System Programming Interface); this is subject to change + * and may not be relied upon by third party applications; use + * a subprocess to communicate with the "sysctl" command line + * command instead, if you believe you need this functionality. + * Preferrably, use sysctlbyname() instead. + * + * Setting of the NULL termination of the output string is + * delayed until after the geometry lock is dropped. 
If there + * are no Entries remaining in the OID name list when this + * function is called, it will still write out the termination + * byte. + * + * This function differs from other sysctl functions in that + * it can not take an output buffer length of 0 to determine the + * space which will be required. It is suggested that the buffer + * length be PATH_MAX, and that authors of new sysctl's refrain + * from exceeding this string length. + */ +STATIC int sysctl_sysctl_name(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req) { @@ -461,6 +687,7 @@ sysctl_sysctl_name(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_oid_list *lsp = &sysctl__children, *lsp2; char tempbuf[10]; + lck_rw_lock_shared(sysctl_geometry_lock); while (namelen) { if (!lsp) { snprintf(tempbuf,sizeof(tempbuf),"%d",*name); @@ -468,8 +695,10 @@ sysctl_sysctl_name(__unused struct sysctl_oid *oidp, void *arg1, int arg2, error = SYSCTL_OUT(req, ".", 1); if (!error) error = SYSCTL_OUT(req, tempbuf, strlen(tempbuf)); - if (error) + if (error) { + lck_rw_done(sysctl_geometry_lock); return (error); + } namelen--; name++; continue; @@ -484,8 +713,10 @@ sysctl_sysctl_name(__unused struct sysctl_oid *oidp, void *arg1, int arg2, if (!error) error = SYSCTL_OUT(req, oid->oid_name, strlen(oid->oid_name)); - if (error) + if (error) { + lck_rw_done(sysctl_geometry_lock); return (error); + } namelen--; name++; @@ -501,12 +732,45 @@ sysctl_sysctl_name(__unused struct sysctl_oid *oidp, void *arg1, int arg2, } lsp = lsp2; } + lck_rw_done(sysctl_geometry_lock); return (SYSCTL_OUT(req, "", 1)); } -SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD, sysctl_sysctl_name, ""); +SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_sysctl_name, ""); -static int +/* + * sysctl_sysctl_next_ls + * + * Description: For a given OID name value, return the next consecutive OID + * name value within the geometry tree + * + * Parameters: lsp The OID list to look in + * name The OID 
name to start from + * namelen The length of the OID name + * next Pointer to new oid storage to + * fill in + * len Pointer to receive new OID + * length value of storage written + * level OID tree depth (used to compute + * len value) + * oidpp Pointer to OID list entry + * pointer; used to walk the list + * forward across recursion + * + * Returns: 0 Returning a new entry + * 1 End of geometry list reached + * + * Implicit: *next Modified to contain the new OID + * *len Modified to contain new length + * + * Locks: Assumes sysctl_geometry_lock is held prior to calling + * + * Notes: This function will not return OID values that have special + * handlers, since we can not tell whether these handlers consume + * elements from the OID space as parameters. For this reason, + * we STRONGLY discourage these types of handlers + */ +STATIC int sysctl_sysctl_next_ls (struct sysctl_oid_list *lsp, int *name, u_int namelen, int *next, int *len, int level, struct sysctl_oid **oidpp) { @@ -566,7 +830,45 @@ sysctl_sysctl_next_ls (struct sysctl_oid_list *lsp, int *name, u_int namelen, return 1; } -static int +/* + * sysctl_sysctl_next + * + * Description: This is an iterator function designed to iterate the oid tree + * and provide a list of OIDs for use by the user space "sysctl" + * command line tool + * + * OID: 0, 2 + * + * Parameters: oidp __unused + * arg1 Pointer to start OID name + * arg2 Start OID name length + * req Pointer to user request buffer + * + * Returns: 0 Success + * ENOENT Reached end of OID space + * SYSCTL_OUT:EPERM Permission denied + * SYSCTL_OUT:EFAULT Bad user supplied buffer + * SYSCTL_OUT:??? 
Return value from user function + * for SYSCTL_PROC leaf node + * + * Implict: Contents of user request buffer, modified + * + * Locks: Acquires and then releases a read lock on the + * sysctl_geometry_lock + * + * Notes: SPI (System Programming Interface); this is subject to change + * and may not be relied upon by third party applications; use + * a subprocess to communicate with the "sysctl" command line + * command instead, if you believe you need this functionality. + * Preferrably, use sysctlbyname() instead. + * + * This function differs from other sysctl functions in that + * it can not take an output buffer length of 0 to determine the + * space which will be required. It is suggested that the buffer + * length be PATH_MAX, and that authors of new sysctl's refrain + * from exceeding this string length. + */ +STATIC int sysctl_sysctl_next(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req) { @@ -577,17 +879,38 @@ sysctl_sysctl_next(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_oid_list *lsp = &sysctl__children; int newoid[CTL_MAXNAME]; + lck_rw_lock_shared(sysctl_geometry_lock); i = sysctl_sysctl_next_ls (lsp, name, namelen, newoid, &j, 1, &oid); + lck_rw_done(sysctl_geometry_lock); if (i) return ENOENT; error = SYSCTL_OUT(req, newoid, j * sizeof (int)); return (error); } -SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD, sysctl_sysctl_next, ""); +SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_sysctl_next, ""); -static int -name2oid (char *name, int *oid, int *len, struct sysctl_oid **oidpp) +/* + * name2oid + * + * Description: Support function for use by sysctl_sysctl_name2oid(); looks + * up an OID name given a string name. 
+ * + * Parameters: name NULL terminated string name + * oid Pointer to receive OID name + * len Pointer to receive OID length + * pointer value (see "Notes") + * + * Returns: 0 Success + * ENOENT Entry not found + * + * Implicit: *oid Modified to contain OID value + * *len Modified to contain OID length + * + * Locks: Assumes sysctl_geometry_lock is held prior to calling + */ +STATIC int +name2oid (char *name, int *oid, int *len) { int i; struct sysctl_oid *oidp; @@ -620,8 +943,6 @@ name2oid (char *name, int *oid, int *len, struct sysctl_oid **oidpp) (*len)++; if (!i) { - if (oidpp) - *oidpp = oidp; return (0); } @@ -643,16 +964,54 @@ name2oid (char *name, int *oid, int *len, struct sysctl_oid **oidpp) return ENOENT; } -static int +/* + * sysctl_sysctl_name2oid + * + * Description: Translate a string name to an OID name value; this is used by + * the sysctlbyname() function as well as by the "sysctl" command + * line command. + * + * OID: 0, 3 + * + * Parameters: oidp __unused + * arg1 __unused + * arg2 __unused + * req Request structure + * + * Returns: ENOENT Input length too short + * ENAMETOOLONG Input length too long + * ENOMEM Could not allocate work area + * SYSCTL_IN/OUT:EPERM Permission denied + * SYSCTL_IN/OUT:EFAULT Bad user supplied buffer + * SYSCTL_IN/OUT:??? Return value from user function + * name2oid:ENOENT Not found + * + * Implicit: *req Contents of request, modified + * + * Locks: Acquires and then releases a read lock on the + * sysctl_geometry_lock + * + * Notes: SPI (System Programming Interface); this is subject to change + * and may not be relied upon by third party applications; use + * a subprocess to communicate with the "sysctl" command line + * command instead, if you believe you need this functionality. + * Preferrably, use sysctlbyname() instead. + * + * This function differs from other sysctl functions in that + * it can not take an output buffer length of 0 to determine the + * space which will be required. 
It is suggested that the buffer + * length be PATH_MAX, and that authors of new sysctl's refrain + * from exceeding this string length. + */ +STATIC int sysctl_sysctl_name2oid(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { char *p; int error, oid[CTL_MAXNAME]; int len = 0; /* set by name2oid() */ - struct sysctl_oid *op = 0; - if (!req->newlen) + if (req->newlen < 1) return ENOENT; if (req->newlen >= MAXPATHLEN) /* XXX arbitrary, undocumented */ return (ENAMETOOLONG); @@ -669,7 +1028,13 @@ sysctl_sysctl_name2oid(__unused struct sysctl_oid *oidp, __unused void *arg1, p [req->newlen] = '\0'; - error = name2oid(p, oid, &len, &op); + /* + * Note: We acquire and release the geometry lock here to + * avoid making name2oid needlessly complex. + */ + lck_rw_lock_shared(sysctl_geometry_lock); + error = name2oid(p, oid, &len); + lck_rw_done(sysctl_geometry_lock); FREE(p, M_TEMP); @@ -680,19 +1045,58 @@ sysctl_sysctl_name2oid(__unused struct sysctl_oid *oidp, __unused void *arg1, return (error); } -SYSCTL_PROC(_sysctl, 3, name2oid, CTLFLAG_RW|CTLFLAG_ANYBODY|CTLFLAG_KERN, 0, 0, +SYSCTL_PROC(_sysctl, 3, name2oid, CTLFLAG_RW|CTLFLAG_ANYBODY|CTLFLAG_KERN | CTLFLAG_LOCKED, 0, 0, sysctl_sysctl_name2oid, "I", ""); -static int +/* + * sysctl_sysctl_oidfmt + * + * Description: For a given OID name, determine the format of the data which + * is associated with it. This is used by the "sysctl" command + * line command. + * + * OID: 0, 4 + * + * Parameters: oidp __unused + * arg1 The OID name to look up + * arg2 The length of the OID name + * req Pointer to user request buffer + * + * Returns: 0 Success + * EISDIR Malformed request + * ENOENT No such OID name + * SYSCTL_OUT:EPERM Permission denied + * SYSCTL_OUT:EFAULT Bad user supplied buffer + * SYSCTL_OUT:??? 
Return value from user function + * + * Implict: Contents of user request buffer, modified + * + * Locks: Acquires and then releases a read lock on the + * sysctl_geometry_lock + * + * Notes: SPI (System Programming Interface); this is subject to change + * and may not be relied upon by third party applications; use + * a subprocess to communicate with the "sysctl" command line + * command instead, if you believe you need this functionality. + * + * This function differs from other sysctl functions in that + * it can not take an output buffer length of 0 to determine the + * space which will be required. It is suggested that the buffer + * length be PATH_MAX, and that authors of new sysctl's refrain + * from exceeding this string length. + */ +STATIC int sysctl_sysctl_oidfmt(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req) { - int *name = (int *) arg1, error; + int *name = (int *) arg1; + int error = ENOENT; /* default error: not found */ u_int namelen = arg2; u_int indx; struct sysctl_oid *oid; struct sysctl_oid_list *lsp = &sysctl__children; + lck_rw_lock_shared(sysctl_geometry_lock); oid = SLIST_FIRST(lsp); indx = 0; @@ -707,28 +1111,34 @@ sysctl_sysctl_oidfmt(__unused struct sysctl_oid *oidp, void *arg1, int arg2, lsp = (struct sysctl_oid_list *)oid->oid_arg1; oid = SLIST_FIRST(lsp); } else { - if (indx != namelen) - return EISDIR; + if (indx != namelen) { + error = EISDIR; + goto err; + } goto found; } } else { oid = SLIST_NEXT(oid, oid_link); } } - return ENOENT; + /* Not found */ + goto err; + found: if (!oid->oid_fmt) - return ENOENT; + goto err; error = SYSCTL_OUT(req, &oid->oid_kind, sizeof(oid->oid_kind)); if (!error) error = SYSCTL_OUT(req, oid->oid_fmt, strlen(oid->oid_fmt)+1); +err: + lck_rw_done(sysctl_geometry_lock); return (error); } +SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_sysctl_oidfmt, ""); -SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD, sysctl_sysctl_oidfmt, ""); /* * Default "handler" 
functions. @@ -842,7 +1252,7 @@ sysctl_handle_opaque(__unused struct sysctl_oid *oidp, void *arg1, int arg2, /* * Transfer functions to/from kernel space. */ -static int +STATIC int sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l) { size_t i = 0; @@ -860,7 +1270,7 @@ sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l) return (0); } -static int +STATIC int sysctl_new_kernel(struct sysctl_req *req, void *p, size_t l) { if (!req->newptr) @@ -914,7 +1324,7 @@ kernel_sysctl(struct proc *p, int *name, u_int namelen, void *old, size_t *oldle /* * Transfer function to/from user space. */ -static int +STATIC int sysctl_old_user(struct sysctl_req *req, const void *p, size_t l) { int error = 0; @@ -937,7 +1347,7 @@ sysctl_old_user(struct sysctl_req *req, const void *p, size_t l) return (0); } -static int +STATIC int sysctl_new_user(struct sysctl_req *req, void *p, size_t l) { int error; @@ -981,10 +1391,28 @@ sysctl_root(__unused struct sysctl_oid *oidp, void *arg1, int arg2, indx++; if (!(oid->oid_kind & CTLFLAG_LOCKED)) { +/* +printf("sysctl_root: missing CTLFLAG_LOCKED: "); +for(i = 0; i < (int)(indx - 1); i++) +printf("oid[%d] = %d ", i, name[i]); +printf("\n"); +*/ funnel_held = TRUE; } if (oid->oid_kind & CTLFLAG_NOLOCK) req->lock = 0; + /* + * For SYSCTL_PROC() functions which are for sysctl's + * which have parameters at the end of their OID + * space, you need to OR CTLTYPE_NODE into their + * access value. + * + * NOTE: For binary backward compatibility ONLY! Do + * NOT add new sysctl's that do this! Existing + * sysctl's which do this will eventually have + * compatibility code in user space, and this method + * will become unsupported. + */ if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { if (oid->oid_handler) goto found; @@ -1028,7 +1456,14 @@ sysctl_root(__unused struct sysctl_oid *oidp, void *arg1, int arg2, goto err; } - /* Most likely only root can write */ + /* + * This is where legacy enforcement of permissions occurs. 
If the + * flag does not say CTLFLAG_ANYBODY, then we prohibit anyone but + * root from writing new values down. If local enforcement happens + * at the leaf node, then it needs to be set as CTLFLAG_ANYBODY. In + * addition, if the leaf node is set this way, then in order to do + * specific enforcement, it has to be of type SYSCTL_PROC. + */ if (!(oid->oid_kind & CTLFLAG_ANYBODY) && req->newptr && req->p && (error = proc_suser(req->p))) @@ -1039,10 +1474,24 @@ sysctl_root(__unused struct sysctl_oid *oidp, void *arg1, int arg2, goto err; } + /* + * Reference the OID and drop the geometry lock; this prevents the + * OID from being deleted out from under the handler call, but does + * not prevent other calls into handlers or calls to manage the + * geometry elsewhere from blocking... + */ + OSAddAtomic(1, &oid->oid_refcnt); + + lck_rw_done(sysctl_geometry_lock); + + /* + * ...however, we still have to grab the funnel for those calls which + * may be into code whose reentrancy is protected by the funnel; a + * blocking operation should not prevent reentrancy, at this point. + */ if (funnel_held) { fnl = spl_kernel_funnel(); - MEMLOCK_LOCK(); } if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { @@ -1058,10 +1507,27 @@ sysctl_root(__unused struct sysctl_oid *oidp, void *arg1, int arg2, if (funnel_held) { - MEMLOCK_UNLOCK(); splx_kernel_funnel(fnl); } + /* + * This is tricky... we re-grab the geometry lock in order to drop + * the reference and wake on the address; since the geometry + * lock is a reader/writer lock rather than a mutex, we have to + * wake on all apparent 1->0 transitions. This abuses the drop + * after the reference decrement in order to wake any lck_rw_sleep() + * in progress in sysctl_unregister_oid() that slept because of a + * non-zero reference count. + * + * Note: OSAddAtomic() is defined to return the previous value; + * we use this and the fact that the lock itself is a + * barrier to avoid waking every time through on "hot" + * OIDs. 
+ */ + lck_rw_lock_shared(sysctl_geometry_lock); + if (OSAddAtomic(-1, &oid->oid_refcnt) == 1) + wakeup(&oid->oid_refcnt); + err: lck_rw_done(sysctl_geometry_lock); return (error); @@ -1170,14 +1636,14 @@ userland_sysctl(struct proc *p, int *name, u_int namelen, user_addr_t oldp, * may not work correctly. */ -static int +STATIC int sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { return(kernel_sysctl(current_proc(), name, namelen, oldp, oldlenp, newp, newlen)); } -static int +STATIC int sysctlnametomib(const char *name, int *mibp, size_t *sizep) { int oid[2]; diff --git a/bsd/kern/kern_panicinfo.c b/bsd/kern/kern_panicinfo.c index 024ec5220..1a949de7b 100644 --- a/bsd/kern/kern_panicinfo.c +++ b/bsd/kern/kern_panicinfo.c @@ -47,7 +47,7 @@ extern int panic_dialog_set_image( const unsigned char * ptr, unsigned int size extern void panic_dialog_get_image( unsigned char ** ptr, unsigned int * size ); /* make the compiler happy */ -extern int sysctl_dopanicinfo(int *, u_int, user_addr_t, size_t *, user_addr_t, size_t, struct proc *); +static int sysctl_dopanicinfo SYSCTL_HANDLER_ARGS; #define PANIC_IMAGE_SIZE_LIMIT (32 * 4096) /* 128K - Maximum amount of memory consumed for the panic UI */ @@ -56,11 +56,20 @@ extern int sysctl_dopanicinfo(int *, u_int, user_addr_t, size_t *, user_addr_t, /* Local data */ static int image_size_limit = PANIC_IMAGE_SIZE_LIMIT; -__private_extern__ int -sysctl_dopanicinfo(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, - user_addr_t newp, size_t newlen, struct proc *p) +/* XXX Should be STATIC for dtrace debugging.. 
*/ +static int +sysctl_dopanicinfo SYSCTL_HANDLER_ARGS { + __unused int cmd = oidp->oid_arg2; /* subcommand*/ + int *name = arg1; /* oid element argument vector */ + int namelen = arg2; /* number of oid element arguments */ + user_addr_t oldp = req->oldptr; /* user buffer copy out address */ + size_t *oldlenp = &req->oldlen; /* user buffer copy out size */ + user_addr_t newp = req->newptr; /* user buffer copy in address */ + size_t newlen = req->newlen; /* user buffer copy in size */ int error = 0; + proc_t p = current_proc(); + vm_offset_t newimage = (vm_offset_t )NULL; kern_return_t kret; unsigned char * prev_image_ptr; @@ -70,7 +79,8 @@ sysctl_dopanicinfo(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, if (namelen != 1) return (ENOTDIR); /* overloaded */ - if ( (error = proc_suser(p)) ) /* must be super user to muck with image */ + /* must be super user to muck with image */ + if ( (error = proc_suser(p)) ) return (error); switch (name[0]) { @@ -80,7 +90,7 @@ sysctl_dopanicinfo(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, case KERN_PANICINFO_TEST: panic_dialog_test(); - return (0); + break; case KERN_PANICINFO_MAXSIZE: @@ -91,7 +101,7 @@ sysctl_dopanicinfo(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, error = sysctl_int(oldp, oldlenp, newp, newlen, &image_size_limit); - return (error); + break; case KERN_PANICINFO_IMAGE: @@ -99,8 +109,10 @@ sysctl_dopanicinfo(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, if ( newp != USER_ADDR_NULL ) { /* check the length of the incoming image before allocating space for it. 
*/ - if ( newlen > (size_t)image_size_limit ) - return (ENOMEM); + if ( newlen > (size_t)image_size_limit ) { + error = ENOMEM; + break; + } /* allocate some kernel wired memory for the new image */ kret = kmem_alloc(kernel_map, &newimage, (vm_size_t)round_page(newlen)); @@ -118,8 +130,7 @@ sysctl_dopanicinfo(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, error = EPERM; break; } - - return (error); + break; } /* copy the image in from user space */ @@ -169,12 +180,24 @@ sysctl_dopanicinfo(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, } } - return (0); + break; errout: if ( newimage != (vm_offset_t )NULL ) (void)kmem_free(kernel_map, newimage, (vm_size_t)round_page(newlen)); - return (error); + break; } + + /* adjust index so we return the right required/consumed amount */ + if (!error) + req->oldidx += req->oldlen; + + return (error); } +SYSCTL_PROC(_kern, KERN_PANICINFO, panicinfo, CTLTYPE_NODE|CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, + 0, /* Pointer argument (arg1) */ + 0, /* Integer argument (arg2) */ + sysctl_dopanicinfo, /* Handler function */ + NULL, /* Data pointer */ + ""); diff --git a/bsd/kern/kern_priv.c b/bsd/kern/kern_priv.c new file mode 100644 index 000000000..e7ceb6075 --- /dev/null +++ b/bsd/kern/kern_priv.c @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. 
+ * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/*- + * Copyright (c) 2006 nCircle Network Security, Inc. + * Copyright (c) 2009 Robert N. M. Watson + * All rights reserved. + * + * This software was developed by Robert N. M. Watson for the TrustedBSD + * Project under contract to nCircle Network Security, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR, NCIRCLE NETWORK SECURITY, + * INC., OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include +#include +#include +#include +#include +#include + +#if CONFIG_MACF +#include +#endif + +/* + * Check a credential for privilege. Lots of good reasons to deny privilege; + * only a few to grant it. + */ +int +priv_check_cred(kauth_cred_t cred, int priv, __unused int flags) +{ + int error; + + /* + * We first evaluate policies that may deny the granting of + * privilege unilaterally. + */ +#if CONFIG_MACF + error = mac_priv_check(cred, priv); + if (error) + goto out; +#endif + + /* + * Having determined if privilege is restricted by various policies, + * now determine if privilege is granted. At this point, any policy + * may grant privilege. For now, we allow short-circuit boolean + * evaluation, so may not call all policies. Perhaps we should. + */ + if (kauth_cred_getuid(cred) == 0) { + error = 0; + goto out; + } + + /* + * Now check with MAC, if enabled, to see if a policy module grants + * privilege. + */ +#if CONFIG_MACF + if (mac_priv_grant(cred, priv) == 0) { + error = 0; + goto out; + } +#endif + + /* + * The default is deny, so if no policies have granted it, reject + * with a privilege error here. 
+ */ + error = EPERM; +out: + return (error); +} diff --git a/bsd/kern/kern_proc.c b/bsd/kern/kern_proc.c index ba7505008..042a3a864 100644 --- a/bsd/kern/kern_proc.c +++ b/bsd/kern/kern_proc.c @@ -96,6 +96,7 @@ #include #include #include +#include #include #include #include @@ -168,13 +169,10 @@ static void orphanpg(struct pgrp *pg); void proc_name_kdp(task_t t, char * buf, int size); char *proc_name_address(void *p); -static proc_t proc_refinternal_locked(proc_t p); static void pgrp_add(struct pgrp * pgrp, proc_t parent, proc_t child); static void pgrp_remove(proc_t p); static void pgrp_replace(proc_t p, struct pgrp *pgrp); static void pgdelete_dropref(struct pgrp *pgrp); -static proc_t proc_find_zombref(int pid); -static void proc_drop_zombref(proc_t p); extern void pg_rele_dropref(struct pgrp * pgrp); struct fixjob_iterargs { @@ -345,7 +343,7 @@ proc_findinternal(int pid, int locked) } p = pfind_locked(pid); - if ((p == PROC_NULL) || (p != proc_refinternal_locked(p))) + if ((p == PROC_NULL) || (p != proc_ref_locked(p))) p = PROC_NULL; if (locked == 0) { @@ -373,15 +371,15 @@ proc_self(void) p = current_proc(); proc_list_lock(); - if (p != proc_refinternal_locked(p)) + if (p != proc_ref_locked(p)) p = PROC_NULL; proc_list_unlock(); return(p); } -static proc_t -proc_refinternal_locked(proc_t p) +proc_t +proc_ref_locked(proc_t p) { proc_t p1 = p; @@ -412,7 +410,7 @@ proc_rele_locked(proc_t p) } -static proc_t +proc_t proc_find_zombref(int pid) { proc_t p1 = PROC_NULL; @@ -440,7 +438,7 @@ proc_find_zombref(int pid) return(p1); } -static void +void proc_drop_zombref(proc_t p) { proc_list_lock(); @@ -608,7 +606,7 @@ proc_parent(proc_t p) proc_list_lock(); loop: pp = p->p_pptr; - parent = proc_refinternal_locked(pp); + parent = proc_ref_locked(pp); if ((parent == PROC_NULL) && (pp != PROC_NULL) && (pp->p_stat != SZOMB) && ((pp->p_listflag & P_LIST_EXITED) != 0) && ((pp->p_listflag & P_LIST_CHILDDRAINED)== 0)){ pp->p_listflag |= P_LIST_CHILDLKWAIT; 
msleep(&pp->p_childrencnt, proc_list_mlock, 0, "proc_parent", 0); @@ -781,12 +779,34 @@ proc_pidversion(proc_t p) return(p->p_idversion); } +uint64_t +proc_uniqueid(proc_t p) +{ + return(p->p_uniqueid); +} + +uint64_t +proc_selfuniqueid(void) +{ + proc_t p = current_proc(); + return(p->p_uniqueid); +} + int proc_getcdhash(proc_t p, unsigned char *cdhash) { return vn_getcdhash(p->p_textvp, p->p_textoff, cdhash); } +void +proc_getexecutableuuid(proc_t p, unsigned char *uuidbuf, unsigned long size) +{ + if (size >= sizeof(p->p_uuid)) { + memcpy(uuidbuf, p->p_uuid, sizeof(p->p_uuid)); + } +} + + void bsd_set_dependency_capable(task_t task) { @@ -1029,10 +1049,10 @@ enterpgrp(proc_t p, pid_t pgid, int mksess) sess->s_flags = 0; sess->s_listflags = 0; sess->s_ttypgrpid = NO_PID; -#ifdef CONFIG_EMBEDDED - lck_mtx_init(&sess->s_mlock, proc_lck_grp, proc_lck_attr); -#else +#if CONFIG_FINE_LOCK_GROUPS lck_mtx_init(&sess->s_mlock, proc_mlock_grp, proc_lck_attr); +#else + lck_mtx_init(&sess->s_mlock, proc_lck_grp, proc_lck_attr); #endif bcopy(procsp->s_login, sess->s_login, sizeof(sess->s_login)); @@ -1055,10 +1075,10 @@ enterpgrp(proc_t p, pid_t pgid, int mksess) proc_list_unlock(); } pgrp->pg_id = pgid; -#ifdef CONFIG_EMBEDDED - lck_mtx_init(&pgrp->pg_mlock, proc_lck_grp, proc_lck_attr); -#else +#if CONFIG_FINE_LOCK_GROUPS lck_mtx_init(&pgrp->pg_mlock, proc_mlock_grp, proc_lck_attr); +#else + lck_mtx_init(&pgrp->pg_mlock, proc_lck_grp, proc_lck_attr); #endif LIST_INIT(&pgrp->pg_members); pgrp->pg_membercnt = 0; @@ -1178,18 +1198,18 @@ pgdelete_dropref(struct pgrp *pgrp) if (sessp->s_count != 0) panic("pg_deleteref: freeing session in use"); proc_list_unlock(); -#ifdef CONFIG_EMBEDDED - lck_mtx_destroy(&sessp->s_mlock, proc_lck_grp); -#else +#if CONFIG_FINE_LOCK_GROUPS lck_mtx_destroy(&sessp->s_mlock, proc_mlock_grp); +#else + lck_mtx_destroy(&sessp->s_mlock, proc_lck_grp); #endif FREE_ZONE(sessp, sizeof(struct session), M_SESSION); } else proc_list_unlock(); -#ifdef 
CONFIG_EMBEDDED - lck_mtx_destroy(&pgrp->pg_mlock, proc_lck_grp); -#else +#if CONFIG_FINE_LOCK_GROUPS lck_mtx_destroy(&pgrp->pg_mlock, proc_mlock_grp); +#else + lck_mtx_destroy(&pgrp->pg_mlock, proc_lck_grp); #endif FREE_ZONE(pgrp, sizeof(*pgrp), M_PGRP); } @@ -1650,14 +1670,14 @@ sysctl_kern_lctx SYSCTL_HANDLER_ARGS SYSCTL_NODE(_kern, KERN_LCTX, lctx, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Login Context"); -SYSCTL_PROC(_kern_lctx, KERN_LCTX_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT, +SYSCTL_PROC(_kern_lctx, KERN_LCTX_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT | CTLFLAG_LOCKED, 0, 0, sysctl_kern_lctx, "S,lctx", "Return entire login context table"); -SYSCTL_NODE(_kern_lctx, KERN_LCTX_LCID, lcid, CTLFLAG_RD, +SYSCTL_NODE(_kern_lctx, KERN_LCTX_LCID, lcid, CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_kern_lctx, "Login Context Table"); -SYSCTL_INT(_kern_lctx, OID_AUTO, last, CTLFLAG_RD, &lastlcid, 0, ""); -SYSCTL_INT(_kern_lctx, OID_AUTO, count, CTLFLAG_RD, &alllctx_cnt, 0, ""); -SYSCTL_INT(_kern_lctx, OID_AUTO, max, CTLFLAG_RW, &maxlcid, 0, ""); +SYSCTL_INT(_kern_lctx, OID_AUTO, last, CTLFLAG_RD | CTLFLAG_LOCKED, &lastlcid, 0, ""); +SYSCTL_INT(_kern_lctx, OID_AUTO, count, CTLFLAG_RD | CTLFLAG_LOCKED, &alllctx_cnt, 0, ""); +SYSCTL_INT(_kern_lctx, OID_AUTO, max, CTLFLAG_RW | CTLFLAG_LOCKED, &maxlcid, 0, ""); #endif /* LCTX */ @@ -1811,7 +1831,33 @@ csops(__unused proc_t p, struct csops_args *uap, __unused int32_t *retval) } return error; - + + case CS_OPS_ENTITLEMENTS_BLOB: { + char zeros[8] = { 0 }; + void *start; + size_t length; + + if (0 != (error = cs_entitlements_blob_get(pt, + &start, &length))) + break; + if (usize < sizeof(zeros) || usize < length) { + error = ERANGE; + break; + } + if (NULL == start) { + start = zeros; + length = sizeof(zeros); + } + error = copyout(start, uaddr, length); + break; + } + + case CS_OPS_MARKRESTRICT: + proc_lock(pt); + pt->p_csflags |= CS_RESTRICT; + proc_unlock(pt); + break; + default: error = EINVAL; break; @@ -1984,7 +2030,7 @@ proc_rebootscan(callout, arg, 
filterfn, filterarg) for (p = allproc.lh_first; (p != 0); p = p->p_list.le_next) { if ( (filterfn == 0 ) || (filterfn(p, filterarg) != 0)) { - p = proc_refinternal_locked(p); + p = proc_ref_locked(p); proc_list_unlock(); lockheld = 0; @@ -2449,10 +2495,10 @@ session_rele(struct session *sess) if (sess->s_count != 0) panic("session_rele: freeing session in use"); proc_list_unlock(); -#ifdef CONFIG_EMBEDDED - lck_mtx_destroy(&sess->s_mlock, proc_lck_grp); -#else +#if CONFIG_FINE_LOCK_GROUPS lck_mtx_destroy(&sess->s_mlock, proc_mlock_grp); +#else + lck_mtx_destroy(&sess->s_mlock, proc_lck_grp); #endif FREE_ZONE(sess, sizeof(struct session), M_SESSION); } else @@ -2575,9 +2621,9 @@ unsigned long cs_procs_invalidated = 0; int cs_force_kill = 0; int cs_force_hard = 0; int cs_debug = 0; -SYSCTL_INT(_vm, OID_AUTO, cs_force_kill, CTLFLAG_RW, &cs_force_kill, 0, ""); -SYSCTL_INT(_vm, OID_AUTO, cs_force_hard, CTLFLAG_RW, &cs_force_hard, 0, ""); -SYSCTL_INT(_vm, OID_AUTO, cs_debug, CTLFLAG_RW, &cs_debug, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, cs_force_kill, CTLFLAG_RW | CTLFLAG_LOCKED, &cs_force_kill, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, cs_force_hard, CTLFLAG_RW | CTLFLAG_LOCKED, &cs_force_hard, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, cs_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &cs_debug, 0, ""); int cs_allow_invalid(struct proc *p) @@ -2633,11 +2679,9 @@ cs_invalid_page( if (p->p_csflags & CS_KILL) { p->p_csflags |= CS_KILLED; proc_unlock(p); - if (cs_debug) { - printf("CODE SIGNING: cs_invalid_page(0x%llx): " - "p=%d[%s] honoring CS_KILL, final status 0x%x\n", - vaddr, p->p_pid, p->p_comm, p->p_csflags); - } + printf("CODE SIGNING: cs_invalid_page(0x%llx): " + "p=%d[%s] honoring CS_KILL, final status 0x%x\n", + vaddr, p->p_pid, p->p_comm, p->p_csflags); cs_procs_killed++; psignal(p, SIGKILL); proc_lock(p); @@ -2646,11 +2690,9 @@ cs_invalid_page( /* CS_HARD means fail the mapping operation so the process stays valid. 
*/ if (p->p_csflags & CS_HARD) { proc_unlock(p); - if (cs_debug) { - printf("CODE SIGNING: cs_invalid_page(0x%llx): " - "p=%d[%s] honoring CS_HARD\n", - vaddr, p->p_pid, p->p_comm); - } + printf("CODE SIGNING: cs_invalid_page(0x%llx): " + "p=%d[%s] honoring CS_HARD\n", + vaddr, p->p_pid, p->p_comm); retval = 1; } else { if (p->p_csflags & CS_VALID) { @@ -2773,9 +2815,12 @@ proc_resetpcontrol(int pid) proc_t p; int pcontrol; int error; + proc_t self = current_proc(); - if ((error = suser(kauth_cred_get(), 0))) + /* if the process has been validated to handle resource control or root is valid one */ + if (((self->p_lflag & P_LVMRSRCOWNER) == 0) && (error = suser(kauth_cred_get(), 0))) return error; + p = proc_find(pid); if (p == PROC_NULL) return(ESRCH); diff --git a/bsd/kern/kern_prot.c b/bsd/kern/kern_prot.c index a084ddf89..d2408a2f3 100644 --- a/bsd/kern/kern_prot.c +++ b/bsd/kern/kern_prot.c @@ -360,9 +360,9 @@ gettid(__unused proc_t p, struct gettid_args *uap, int32_t *retval) if (!(uthread->uu_flag & UT_SETUID)) return (ESRCH); - if ((error = suword(uap->uidp, uthread->uu_ucred->cr_ruid))) + if ((error = suword(uap->uidp, kauth_cred_getruid(uthread->uu_ucred)))) return (error); - if ((error = suword(uap->gidp, uthread->uu_ucred->cr_rgid))) + if ((error = suword(uap->gidp, kauth_cred_getrgid(uthread->uu_ucred)))) return (error); *retval = 0; @@ -448,21 +448,23 @@ getgroups(__unused proc_t p, struct getgroups_args *uap, int32_t *retval) int ngrp; int error; kauth_cred_t cred; + posix_cred_t pcred; /* grab reference while we muck around with the credential */ cred = kauth_cred_get_with_ref(); + pcred = posix_cred_get(cred); if ((ngrp = uap->gidsetsize) == 0) { - *retval = cred->cr_ngroups; + *retval = pcred->cr_ngroups; kauth_cred_unref(&cred); return (0); } - if (ngrp < cred->cr_ngroups) { + if (ngrp < pcred->cr_ngroups) { kauth_cred_unref(&cred); return (EINVAL); } - ngrp = cred->cr_ngroups; - if ((error = copyout((caddr_t)cred->cr_groups, + ngrp = 
pcred->cr_ngroups; + if ((error = copyout((caddr_t)pcred->cr_groups, uap->gidset, ngrp * sizeof(gid_t)))) { kauth_cred_unref(&cred); @@ -716,17 +718,19 @@ setuid(proc_t p, struct setuid_args *uap, __unused int32_t *retval) uid_t gmuid = KAUTH_UID_NONE; int error; kauth_cred_t my_cred, my_new_cred; + posix_cred_t my_pcred; uid = uap->uid; my_cred = kauth_cred_proc_ref(p); + my_pcred = posix_cred_get(my_cred); DEBUG_CRED_ENTER("setuid (%d/%d): %p %d\n", p->p_pid, (p->p_pptr ? p->p_pptr->p_pid : 0), my_cred, uap->uid); AUDIT_ARG(uid, uid); - if (uid != my_cred->cr_ruid && /* allow setuid(getuid()) */ - uid != my_cred->cr_svuid && /* allow setuid(saved uid) */ + if (uid != my_pcred->cr_ruid && /* allow setuid(getuid()) */ + uid != my_pcred->cr_svuid && /* allow setuid(saved uid) */ (error = suser(my_cred, &p->p_acflag))) { kauth_cred_unref(&my_cred); return (error); @@ -747,7 +751,7 @@ setuid(proc_t p, struct setuid_args *uap, __unused int32_t *retval) * chgproccnt uses list lock for protection */ (void)chgproccnt(uid, 1); - (void)chgproccnt(my_cred->cr_ruid, -1); + (void)chgproccnt(my_pcred->cr_ruid, -1); } /* get current credential and take a reference while we muck with it */ @@ -761,7 +765,7 @@ setuid(proc_t p, struct setuid_args *uap, __unused int32_t *retval) * to something other than the default list for the user, as * in entering a group or leaving an exclusion group). 
*/ - if (!(my_cred->cr_flags & CRF_NOMEMBERD)) + if (!(my_pcred->cr_flags & CRF_NOMEMBERD)) gmuid = uid; /* @@ -774,7 +778,7 @@ setuid(proc_t p, struct setuid_args *uap, __unused int32_t *retval) my_new_cred = kauth_cred_setresuid(my_cred, ruid, uid, svuid, gmuid); if (my_cred != my_new_cred) { - DEBUG_CRED_CHANGE("setuid CH(%d): %p/0x%08x -> %p/0x%08x\n", p->p_pid, my_cred, my_cred->cr_flags, my_new_cred, my_new_cred->cr_flags); + DEBUG_CRED_CHANGE("setuid CH(%d): %p/0x%08x -> %p/0x%08x\n", p->p_pid, my_cred, my_pcred->cr_flags, my_new_cred, posix_cred_get(my_new_cred)->cr_flags); proc_lock(p); /* @@ -791,6 +795,9 @@ setuid(proc_t p, struct setuid_args *uap, __unused int32_t *retval) continue; } p->p_ucred = my_new_cred; + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(p); + OSBitOrAtomic(P_SUGID, &p->p_flag); proc_unlock(p); } @@ -828,6 +835,7 @@ seteuid(proc_t p, struct seteuid_args *uap, __unused int32_t *retval) uid_t euid; int error; kauth_cred_t my_cred, my_new_cred; + posix_cred_t my_pcred; DEBUG_CRED_ENTER("seteuid: %d\n", uap->euid); @@ -835,8 +843,9 @@ seteuid(proc_t p, struct seteuid_args *uap, __unused int32_t *retval) AUDIT_ARG(euid, euid); my_cred = kauth_cred_proc_ref(p); + my_pcred = posix_cred_get(my_cred); - if (euid != my_cred->cr_ruid && euid != my_cred->cr_svuid && + if (euid != my_pcred->cr_ruid && euid != my_pcred->cr_svuid && (error = suser(my_cred, &p->p_acflag))) { kauth_cred_unref(&my_cred); return (error); @@ -855,11 +864,11 @@ seteuid(proc_t p, struct seteuid_args *uap, __unused int32_t *retval) * passed in. The subsequent compare is safe, because it is * a pointer compare rather than a contents compare. 
*/ - my_new_cred = kauth_cred_setresuid(my_cred, KAUTH_UID_NONE, euid, KAUTH_UID_NONE, my_cred->cr_gmuid); + my_new_cred = kauth_cred_setresuid(my_cred, KAUTH_UID_NONE, euid, KAUTH_UID_NONE, my_pcred->cr_gmuid); if (my_cred != my_new_cred) { - DEBUG_CRED_CHANGE("seteuid CH(%d): %p/0x%08x -> %p/0x%08x\n", p->p_pid, my_cred, my_cred->cr_flags, my_new_cred, my_new_cred->cr_flags); + DEBUG_CRED_CHANGE("seteuid CH(%d): %p/0x%08x -> %p/0x%08x\n", p->p_pid, my_cred, my_pcred->cr_flags, my_new_cred, posix_cred_get(my_new_cred)->cr_flags); proc_lock(p); /* @@ -876,6 +885,8 @@ seteuid(proc_t p, struct seteuid_args *uap, __unused int32_t *retval) continue; } p->p_ucred = my_new_cred; + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(p); OSBitOrAtomic(P_SUGID, &p->p_flag); proc_unlock(p); } @@ -926,6 +937,7 @@ setreuid(proc_t p, struct setreuid_args *uap, __unused int32_t *retval) uid_t ruid, euid; int error; kauth_cred_t my_cred, my_new_cred; + posix_cred_t my_pcred; DEBUG_CRED_ENTER("setreuid %d %d\n", uap->ruid, uap->euid); @@ -939,15 +951,16 @@ setreuid(proc_t p, struct setreuid_args *uap, __unused int32_t *retval) AUDIT_ARG(ruid, ruid); my_cred = kauth_cred_proc_ref(p); + my_pcred = posix_cred_get(my_cred); if (((ruid != KAUTH_UID_NONE && /* allow no change of ruid */ - ruid != my_cred->cr_ruid && /* allow ruid = ruid */ - ruid != my_cred->cr_uid && /* allow ruid = euid */ - ruid != my_cred->cr_svuid) || /* allow ruid = svuid */ + ruid != my_pcred->cr_ruid && /* allow ruid = ruid */ + ruid != my_pcred->cr_uid && /* allow ruid = euid */ + ruid != my_pcred->cr_svuid) || /* allow ruid = svuid */ (euid != KAUTH_UID_NONE && /* allow no change of euid */ - euid != my_cred->cr_uid && /* allow euid = euid */ - euid != my_cred->cr_ruid && /* allow euid = ruid */ - euid != my_cred->cr_svuid)) && /* allow euid = svui */ + euid != my_pcred->cr_uid && /* allow euid = euid */ + euid != my_pcred->cr_ruid && /* allow euid = ruid */ + euid != my_pcred->cr_svuid)) && /* allow euid = 
svui */ (error = suser(my_cred, &p->p_acflag))) { /* allow root user any */ kauth_cred_unref(&my_cred); return (error); @@ -963,8 +976,8 @@ setreuid(proc_t p, struct setreuid_args *uap, __unused int32_t *retval) uid_t new_ruid; uid_t svuid = KAUTH_UID_NONE; - new_euid = my_cred->cr_uid; - new_ruid = my_cred->cr_ruid; + new_euid = my_pcred->cr_uid; + new_ruid = my_pcred->cr_ruid; /* * Set the credential with new info. If there is no change, @@ -973,16 +986,16 @@ setreuid(proc_t p, struct setreuid_args *uap, __unused int32_t *retval) * passed in. The subsequent compare is safe, because it is * a pointer compare rather than a contents compare. */ - if (euid == KAUTH_UID_NONE && my_cred->cr_uid != euid) { + if (euid == KAUTH_UID_NONE && my_pcred->cr_uid != euid) { /* changing the effective UID */ new_euid = euid; OSBitOrAtomic(P_SUGID, &p->p_flag); } - if (ruid != KAUTH_UID_NONE && my_cred->cr_ruid != ruid) { + if (ruid != KAUTH_UID_NONE && my_pcred->cr_ruid != ruid) { /* changing the real UID; must do user accounting */ /* chgproccnt uses list lock for protection */ (void)chgproccnt(ruid, 1); - (void)chgproccnt(my_cred->cr_ruid, -1); + (void)chgproccnt(my_pcred->cr_ruid, -1); new_ruid = ruid; OSBitOrAtomic(P_SUGID, &p->p_flag); } @@ -992,17 +1005,17 @@ setreuid(proc_t p, struct setreuid_args *uap, __unused int32_t *retval) * new effective uid. We are protected from escalation * by the prechecking. 
*/ - if (my_cred->cr_svuid != uap->ruid && - my_cred->cr_svuid != uap->euid) { + if (my_pcred->cr_svuid != uap->ruid && + my_pcred->cr_svuid != uap->euid) { svuid = new_euid; OSBitOrAtomic(P_SUGID, &p->p_flag); } - my_new_cred = kauth_cred_setresuid(my_cred, ruid, euid, svuid, my_cred->cr_gmuid); + my_new_cred = kauth_cred_setresuid(my_cred, ruid, euid, svuid, my_pcred->cr_gmuid); if (my_cred != my_new_cred) { - DEBUG_CRED_CHANGE("setreuid CH(%d): %p/0x%08x -> %p/0x%08x\n", p->p_pid, my_cred, my_cred->cr_flags, my_new_cred, my_new_cred->cr_flags); + DEBUG_CRED_CHANGE("setreuid CH(%d): %p/0x%08x -> %p/0x%08x\n", p->p_pid, my_cred, my_pcred->cr_flags, my_new_cred, posix_cred_get(my_new_cred)->cr_flags); proc_lock(p); /* @@ -1019,6 +1032,8 @@ setreuid(proc_t p, struct setreuid_args *uap, __unused int32_t *retval) continue; } p->p_ucred = my_new_cred; + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(p); OSBitOrAtomic(P_SUGID, &p->p_flag); /* XXX redundant? */ proc_unlock(p); } @@ -1065,6 +1080,7 @@ setgid(proc_t p, struct setgid_args *uap, __unused int32_t *retval) gid_t svgid = KAUTH_GID_NONE; int error; kauth_cred_t my_cred, my_new_cred; + posix_cred_t my_pcred; DEBUG_CRED_ENTER("setgid(%d/%d): %d\n", p->p_pid, (p->p_pptr ? 
p->p_pptr->p_pid : 0), uap->gid); @@ -1072,9 +1088,10 @@ setgid(proc_t p, struct setgid_args *uap, __unused int32_t *retval) AUDIT_ARG(gid, gid); my_cred = kauth_cred_proc_ref(p); + my_pcred = posix_cred_get(my_cred); - if (gid != my_cred->cr_rgid && /* allow setgid(getgid()) */ - gid != my_cred->cr_svgid && /* allow setgid(saved gid) */ + if (gid != my_pcred->cr_rgid && /* allow setgid(getgid()) */ + gid != my_pcred->cr_svgid && /* allow setgid(saved gid) */ (error = suser(my_cred, &p->p_acflag))) { kauth_cred_unref(&my_cred); return (error); @@ -1119,6 +1136,8 @@ setgid(proc_t p, struct setgid_args *uap, __unused int32_t *retval) continue; } p->p_ucred = my_new_cred; + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(p); OSBitOrAtomic(P_SUGID, &p->p_flag); proc_unlock(p); } @@ -1161,6 +1180,7 @@ setegid(proc_t p, struct setegid_args *uap, __unused int32_t *retval) gid_t egid; int error; kauth_cred_t my_cred, my_new_cred; + posix_cred_t my_pcred; DEBUG_CRED_ENTER("setegid %d\n", uap->egid); @@ -1168,9 +1188,10 @@ setegid(proc_t p, struct setegid_args *uap, __unused int32_t *retval) AUDIT_ARG(egid, egid); my_cred = kauth_cred_proc_ref(p); + my_pcred = posix_cred_get(my_cred); - if (egid != my_cred->cr_rgid && - egid != my_cred->cr_svgid && + if (egid != my_pcred->cr_rgid && + egid != my_pcred->cr_svgid && (error = suser(my_cred, &p->p_acflag))) { kauth_cred_unref(&my_cred); return (error); @@ -1188,7 +1209,7 @@ setegid(proc_t p, struct setegid_args *uap, __unused int32_t *retval) my_new_cred = kauth_cred_setresgid(my_cred, KAUTH_GID_NONE, egid, KAUTH_GID_NONE); if (my_cred != my_new_cred) { - DEBUG_CRED_CHANGE("setegid(CH)%d: %p/0x%08x->%p/0x%08x\n", p->p_pid, my_cred, my_cred->cr_flags, my_new_cred, my_new_cred->cr_flags); + DEBUG_CRED_CHANGE("setegid(CH)%d: %p/0x%08x->%p/0x%08x\n", p->p_pid, my_cred, my_pcred->cr_flags, my_new_cred, posix_cred_get(my_new_cred)->cr_flags); proc_lock(p); /* @@ -1205,6 +1226,8 @@ setegid(proc_t p, struct setegid_args *uap, 
__unused int32_t *retval) continue; } p->p_ucred = my_new_cred; + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(p); OSBitOrAtomic(P_SUGID, &p->p_flag); proc_unlock(p); } @@ -1261,6 +1284,7 @@ setregid(proc_t p, struct setregid_args *uap, __unused int32_t *retval) gid_t rgid, egid; int error; kauth_cred_t my_cred, my_new_cred; + posix_cred_t my_pcred; DEBUG_CRED_ENTER("setregid %d %d\n", uap->rgid, uap->egid); @@ -1275,16 +1299,17 @@ setregid(proc_t p, struct setregid_args *uap, __unused int32_t *retval) AUDIT_ARG(rgid, rgid); my_cred = kauth_cred_proc_ref(p); + my_pcred = posix_cred_get(my_cred); if (((rgid != KAUTH_UID_NONE && /* allow no change of rgid */ - rgid != my_cred->cr_rgid && /* allow rgid = rgid */ - rgid != my_cred->cr_gid && /* allow rgid = egid */ - rgid != my_cred->cr_svgid) || /* allow rgid = svgid */ + rgid != my_pcred->cr_rgid && /* allow rgid = rgid */ + rgid != my_pcred->cr_gid && /* allow rgid = egid */ + rgid != my_pcred->cr_svgid) || /* allow rgid = svgid */ (egid != KAUTH_UID_NONE && /* allow no change of egid */ - egid != my_cred->cr_groups[0] && /* allow no change of egid */ - egid != my_cred->cr_gid && /* allow egid = egid */ - egid != my_cred->cr_rgid && /* allow egid = rgid */ - egid != my_cred->cr_svgid)) && /* allow egid = svgid */ + egid != my_pcred->cr_groups[0] && /* allow no change of egid */ + egid != my_pcred->cr_gid && /* allow egid = egid */ + egid != my_pcred->cr_rgid && /* allow egid = rgid */ + egid != my_pcred->cr_svgid)) && /* allow egid = svgid */ (error = suser(my_cred, &p->p_acflag))) { /* allow root user any */ kauth_cred_unref(&my_cred); return (error); @@ -1292,8 +1317,8 @@ setregid(proc_t p, struct setregid_args *uap, __unused int32_t *retval) /* get current credential and take a reference while we muck with it */ for (;;) { - uid_t new_egid = my_cred->cr_gid; - uid_t new_rgid = my_cred->cr_rgid; + uid_t new_egid = my_pcred->cr_gid; + uid_t new_rgid = my_pcred->cr_rgid; uid_t svgid = KAUTH_UID_NONE; @@ 
-1304,12 +1329,12 @@ setregid(proc_t p, struct setregid_args *uap, __unused int32_t *retval) * passed in. The subsequent compare is safe, because it is * a pointer compare rather than a contents compare. */ - if (egid == KAUTH_UID_NONE && my_cred->cr_groups[0] != egid) { + if (egid == KAUTH_UID_NONE && my_pcred->cr_gid != egid) { /* changing the effective GID */ new_egid = egid; OSBitOrAtomic(P_SUGID, &p->p_flag); } - if (rgid != KAUTH_UID_NONE && my_cred->cr_rgid != rgid) { + if (rgid != KAUTH_UID_NONE && my_pcred->cr_rgid != rgid) { /* changing the real GID */ new_rgid = rgid; OSBitOrAtomic(P_SUGID, &p->p_flag); @@ -1320,8 +1345,8 @@ setregid(proc_t p, struct setregid_args *uap, __unused int32_t *retval) * new effective gid. We are protected from escalation * by the prechecking. */ - if (my_cred->cr_svgid != uap->rgid && - my_cred->cr_svgid != uap->egid) { + if (my_pcred->cr_svgid != uap->rgid && + my_pcred->cr_svgid != uap->egid) { svgid = new_egid; OSBitOrAtomic(P_SUGID, &p->p_flag); } @@ -1329,7 +1354,7 @@ setregid(proc_t p, struct setregid_args *uap, __unused int32_t *retval) my_new_cred = kauth_cred_setresgid(my_cred, rgid, egid, svgid); if (my_cred != my_new_cred) { - DEBUG_CRED_CHANGE("setregid(CH)%d: %p/0x%08x->%p/0x%08x\n", p->p_pid, my_cred, my_cred->cr_flags, my_new_cred, my_new_cred->cr_flags); + DEBUG_CRED_CHANGE("setregid(CH)%d: %p/0x%08x->%p/0x%08x\n", p->p_pid, my_cred, my_pcred->cr_flags, my_new_cred, posix_cred_get(my_new_cred)->cr_flags); proc_lock(p); /* need to protect for a race where another thread @@ -1345,6 +1370,8 @@ setregid(proc_t p, struct setregid_args *uap, __unused int32_t *retval) continue; } p->p_ucred = my_new_cred; + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(p); OSBitOrAtomic(P_SUGID, &p->p_flag); /* XXX redundant? 
*/ proc_unlock(p); } @@ -1444,6 +1471,7 @@ settid_with_pid(proc_t p, struct settid_with_pid_args *uap, __unused int32_t *re proc_t target_proc; struct uthread *uthread = get_bsdthread_info(current_thread()); kauth_cred_t my_cred, my_target_cred, my_new_cred; + posix_cred_t my_target_pcred; AUDIT_ARG(pid, uap->pid); AUDIT_ARG(value32, uap->assume); @@ -1491,7 +1519,8 @@ settid_with_pid(proc_t p, struct settid_with_pid_args *uap, __unused int32_t *re kauth_cred_ref(uthread->uu_ucred); my_cred = uthread->uu_ucred; my_target_cred = kauth_cred_proc_ref(target_proc); - my_new_cred = kauth_cred_setuidgid(my_cred, my_target_cred->cr_uid, my_target_cred->cr_gid); + my_target_pcred = posix_cred_get(my_target_cred); + my_new_cred = kauth_cred_setuidgid(my_cred, my_target_pcred->cr_uid, my_target_pcred->cr_gid); if (my_cred != my_new_cred) uthread->uu_ucred = my_new_cred; @@ -1647,13 +1676,15 @@ setgroups1(proc_t p, u_int gidsetsize, user_addr_t gidset, uid_t gmuid, __unused continue; } p->p_ucred = my_new_cred; + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(p); OSBitOrAtomic(P_SUGID, &p->p_flag); proc_unlock(p); } break; } /* Drop old proc reference or our extra reference */ - AUDIT_ARG(groupset, my_cred->cr_groups, ngrp); + AUDIT_ARG(groupset, posix_cred_get(my_cred)->cr_groups, ngrp); kauth_cred_unref(&my_cred); @@ -1835,15 +1866,17 @@ is_suser1(void) { proc_t p = current_proc(); kauth_cred_t my_cred; + posix_cred_t my_pcred; int err; if (!p) return (0); my_cred = kauth_cred_proc_ref(p); + my_pcred = posix_cred_get(my_cred); err = (suser(my_cred, &p->p_acflag) == 0 || - my_cred->cr_ruid == 0 || my_cred->cr_svuid == 0); + my_pcred->cr_ruid == 0 || my_pcred->cr_svuid == 0); kauth_cred_unref(&my_cred); return(err); } @@ -1959,6 +1992,7 @@ set_security_token(proc_t p) security_token_t sec_token; audit_token_t audit_token; kauth_cred_t my_cred; + posix_cred_t my_pcred; host_priv_t host_priv; /* @@ -1975,10 +2009,12 @@ set_security_token(proc_t p) } my_cred = 
kauth_cred_proc_ref(p); + my_pcred = posix_cred_get(my_cred); + /* XXX mach_init doesn't have a p_ucred when it calls this function */ if (IS_VALID_CRED(my_cred)) { sec_token.val[0] = kauth_cred_getuid(my_cred); - sec_token.val[1] = my_cred->cr_gid; + sec_token.val[1] = kauth_cred_getgid(my_cred); } else { sec_token.val[0] = 0; sec_token.val[1] = 0; @@ -1994,10 +2030,10 @@ set_security_token(proc_t p) * changes. */ audit_token.val[0] = my_cred->cr_audit.as_aia_p->ai_auid; - audit_token.val[1] = my_cred->cr_uid; - audit_token.val[2] = my_cred->cr_gid; - audit_token.val[3] = my_cred->cr_ruid; - audit_token.val[4] = my_cred->cr_rgid; + audit_token.val[1] = my_pcred->cr_uid; + audit_token.val[2] = my_pcred->cr_gid; + audit_token.val[3] = my_pcred->cr_ruid; + audit_token.val[4] = my_pcred->cr_rgid; audit_token.val[5] = p->p_pid; audit_token.val[6] = my_cred->cr_audit.as_aia_p->ai_asid; audit_token.val[7] = p->p_idversion; @@ -2028,12 +2064,13 @@ __private_extern__ void cru2x(kauth_cred_t cr, struct xucred *xcr) { + posix_cred_t pcr = posix_cred_get(cr); bzero(xcr, sizeof(*xcr)); xcr->cr_version = XUCRED_VERSION; xcr->cr_uid = kauth_cred_getuid(cr); - xcr->cr_ngroups = cr->cr_ngroups; - bcopy(cr->cr_groups, xcr->cr_groups, sizeof(xcr->cr_groups)); + xcr->cr_ngroups = pcr->cr_ngroups; + bcopy(pcr->cr_groups, xcr->cr_groups, sizeof(xcr->cr_groups)); } #if CONFIG_LCTX diff --git a/bsd/kern/kern_resource.c b/bsd/kern/kern_resource.c index 13b124887..d2473dbf0 100644 --- a/bsd/kern/kern_resource.c +++ b/bsd/kern/kern_resource.c @@ -111,8 +111,9 @@ int donice(struct proc *curp, struct proc *chgp, int n); int dosetrlimit(struct proc *p, u_int which, struct rlimit *limp); int uthread_get_background_state(uthread_t); static void do_background_socket(struct proc *p, thread_t thread, int priority); -static int do_background_thread(struct proc *curp, int priority); -static int do_background_task(struct proc *curp, int priority); +static int do_background_thread(struct proc *curp, 
thread_t thread, int priority); +static int do_background_proc(struct proc *curp, struct proc *targetp, int priority); +void proc_apply_task_networkbg_internal(proc_t); rlim_t maxdmap = MAXDSIZ; /* XXX */ rlim_t maxsmap = MAXSSIZ - PAGE_SIZE; /* XXX */ @@ -125,10 +126,10 @@ rlim_t maxsmap = MAXSSIZ - PAGE_SIZE; /* XXX */ */ __private_extern__ int maxfilesperproc = OPEN_MAX; /* per-proc open files limit */ -SYSCTL_INT( _kern, KERN_MAXPROCPERUID, maxprocperuid, CTLFLAG_RW, +SYSCTL_INT(_kern, KERN_MAXPROCPERUID, maxprocperuid, CTLFLAG_RW | CTLFLAG_LOCKED, &maxprocperuid, 0, "Maximum processes allowed per userid" ); -SYSCTL_INT( _kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW, +SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW | CTLFLAG_LOCKED, &maxfilesperproc, 0, "Maximum files allowed open per process" ); /* Args and fn for proc_iteration callback used in setpriority */ @@ -371,8 +372,10 @@ setpriority(struct proc *curp, struct setpriority_args *uap, __unused int32_t *r if (uap->who != 0) { return (EINVAL); } - error = do_background_thread(curp, uap->prio); - (void) do_background_socket(curp, current_thread(), uap->prio); + error = do_background_thread(curp, current_thread(), uap->prio); + if (!error) { + (void) do_background_socket(curp, current_thread(), uap->prio); + } found++; break; } @@ -387,8 +390,10 @@ setpriority(struct proc *curp, struct setpriority_args *uap, __unused int32_t *r refheld = 1; } - error = do_background_task(p, uap->prio); - (void) do_background_socket(p, NULL, uap->prio); + error = do_background_proc(curp, p, uap->prio); + if (!error) { + (void) do_background_socket(p, NULL, uap->prio); + } found++; if (refheld != 0) @@ -421,9 +426,9 @@ donice(struct proc *curp, struct proc *chgp, int n) ucred = kauth_cred_proc_ref(curp); my_cred = kauth_cred_proc_ref(chgp); - if (suser(ucred, NULL) && ucred->cr_ruid && + if (suser(ucred, NULL) && kauth_cred_getruid(ucred) && kauth_cred_getuid(ucred) != kauth_cred_getuid(my_cred) 
&& - ucred->cr_ruid != kauth_cred_getuid(my_cred)) { + kauth_cred_getruid(ucred) != kauth_cred_getuid(my_cred)) { error = EPERM; goto out; } @@ -451,19 +456,53 @@ donice(struct proc *curp, struct proc *chgp, int n) } static int -do_background_task(struct proc *p, int priority) +do_background_proc(struct proc *curp, struct proc *targetp, int priority) { int error = 0; + kauth_cred_t ucred; + kauth_cred_t target_cred; +#if CONFIG_EMBEDDED task_category_policy_data_t info; +#endif + + ucred = kauth_cred_get(); + target_cred = kauth_cred_proc_ref(targetp); + + if (!kauth_cred_issuser(ucred) && kauth_cred_getruid(ucred) && + kauth_cred_getuid(ucred) != kauth_cred_getuid(target_cred) && + kauth_cred_getruid(ucred) != kauth_cred_getuid(target_cred)) + { + error = EPERM; + goto out; + } + +#if CONFIG_MACF + error = mac_proc_check_sched(curp, targetp); + if (error) + goto out; +#endif + +#if !CONFIG_EMBEDDED + if (priority == PRIO_DARWIN_NONUI) + error = proc_apply_task_gpuacc(targetp->task, TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS); + else + error = proc_set1_bgtaskpolicy(targetp->task, priority); + if (error) + goto out; +#else /* !CONFIG_EMBEDDED */ /* set the max scheduling priority on the task */ - if (priority & PRIO_DARWIN_BG) { + if (priority == PRIO_DARWIN_BG) { info.role = TASK_THROTTLE_APPLICATION; - } else { + } + else if (priority == PRIO_DARWIN_NONUI) { + info.role = TASK_NONUI_APPLICATION; + } + else { info.role = TASK_DEFAULT_APPLICATION; } - error = task_policy_set(p->task, + error = task_policy_set(targetp->task, TASK_CATEGORY_POLICY, (task_policy_t) &info, TASK_CATEGORY_POLICY_COUNT); @@ -471,22 +510,24 @@ do_background_task(struct proc *p, int priority) if (error) goto out; - proc_lock(p); + proc_lock(targetp); /* mark proc structure as backgrounded */ - if (priority & PRIO_DARWIN_BG) { - p->p_lflag |= P_LBACKGROUND; + if (priority == PRIO_DARWIN_BG) { + targetp->p_lflag |= P_LBACKGROUND; } else { - p->p_lflag &= ~P_LBACKGROUND; + targetp->p_lflag &= 
~P_LBACKGROUND; } /* set or reset the disk I/O priority */ - p->p_iopol_disk = (priority == PRIO_DARWIN_BG ? + targetp->p_iopol_disk = (priority == PRIO_DARWIN_BG ? IOPOL_THROTTLE : IOPOL_DEFAULT); - proc_unlock(p); + proc_unlock(targetp); +#endif /* !CONFIG_EMBEDDED */ out: + kauth_cred_unref(&target_cred); return (error); } @@ -497,7 +538,7 @@ do_background_socket(struct proc *p, thread_t thread, int priority) struct fileproc *fp; int i; - if (priority & PRIO_DARWIN_BG) { + if (priority == PRIO_DARWIN_BG) { /* * For PRIO_DARWIN_PROCESS (thread is NULL), simply mark * the sockets with the background flag. There's nothing @@ -523,12 +564,6 @@ do_background_socket(struct proc *p, thread_t thread, int priority) } } else { - u_int32_t traffic_mgt; - /* - * See comments on do_background_thread(). Deregulate network - * traffics only for setpriority(PRIO_DARWIN_THREAD). - */ - traffic_mgt = (thread == NULL) ? 0 : TRAFFIC_MGT_SO_BG_REGULATE; /* disable networking IO throttle. * NOTE - It is a known limitation of the current design that we @@ -550,7 +585,7 @@ do_background_socket(struct proc *p, thread_t thread, int priority) if ((thread) && (sockp->so_background_thread != thread)) { continue; } - socket_clear_traffic_mgt_flags(sockp, TRAFFIC_MGT_SO_BACKGROUND | traffic_mgt); + socket_clear_traffic_mgt_flags(sockp, TRAFFIC_MGT_SO_BACKGROUND); sockp->so_background_thread = NULL; } proc_fdunlock(p); @@ -572,15 +607,26 @@ do_background_socket(struct proc *p, thread_t thread, int priority) * and only TRAFFIC_MGT_SO_BACKGROUND is set via do_background_socket(). 
*/ static int -do_background_thread(struct proc *curp __unused, int priority) +do_background_thread(struct proc *curp __unused, thread_t thread, int priority) { - thread_t thread; struct uthread *ut; +#if !CONFIG_EMBEDDED + int error = 0; +#else /* !CONFIG_EMBEDDED */ thread_precedence_policy_data_t policy; +#endif /* !CONFIG_EMBEDDED */ - thread = current_thread(); ut = get_bsdthread_info(thread); + /* Backgrounding is unsupported for threads in vfork */ + if ( (ut->uu_flag & UT_VFORK) != 0) { + return(EPERM); + } + +#if !CONFIG_EMBEDDED + error = proc_set1_bgthreadpolicy(curp->task, thread_tid(thread), priority); + return(error); +#else /* !CONFIG_EMBEDDED */ if ( (priority & PRIO_DARWIN_BG) == 0 ) { /* turn off backgrounding of thread */ if ( (ut->uu_flag & UT_BACKGROUND) == 0 ) { @@ -630,9 +676,57 @@ do_background_thread(struct proc *curp __unused, int priority) * thread then TRAFFIC_MGT_SO_{BACKGROUND,BG_REGULATE} is set. * Existing sockets are taken care of by do_background_socket(). 
*/ +#endif /* !CONFIG_EMBEDDED */ return(0); } +#if CONFIG_EMBEDDED +int mach_do_background_thread(thread_t thread, int prio); + +int +mach_do_background_thread(thread_t thread, int prio) +{ + int error = 0; + struct proc *curp = NULL; + struct proc *targetp = NULL; + kauth_cred_t ucred; + + targetp = get_bsdtask_info(get_threadtask(thread)); + if (!targetp) { + return KERN_INVALID_ARGUMENT; + } + + curp = proc_self(); + if (curp == PROC_NULL) { + return KERN_FAILURE; + } + + ucred = kauth_cred_proc_ref(curp); + + if (suser(ucred, NULL) && curp != targetp) { + error = KERN_PROTECTION_FAILURE; + goto out; + } + + error = do_background_thread(curp, thread, prio); + if (!error) { + (void) do_background_socket(curp, thread, prio); + } else { + if (error == EPERM) { + error = KERN_PROTECTION_FAILURE; + } else { + error = KERN_FAILURE; + } + } + +out: + proc_rele(curp); + kauth_cred_unref(&ucred); + return error; +} +#endif /* CONFIG_EMBEDDED */ + +#if CONFIG_EMBEDDED /* * If the thread or its proc has been put into the background * with setpriority(PRIO_DARWIN_{THREAD,PROCESS}, *, PRIO_DARWIN_BG), @@ -653,6 +747,7 @@ uthread_get_background_state(uthread_t uth) return 0; } +#endif /* CONFIG_EMBEDDED */ /* * Returns: 0 Success @@ -1234,19 +1329,70 @@ int iopolicysys(__unused struct proc *p, __unused struct iopolicysys_args *uap, __unused int32_t *retval) { int error = 0; + struct _iopol_param_t iop_param; +#if !CONFIG_EMBEDDED + int processwide = 0; +#else /* !CONFIG_EMBEDDED */ thread_t thread = THREAD_NULL; - int *policy; struct uthread *ut = NULL; - struct _iopol_param_t iop_param; + int *policy; +#endif /* !CONFIG_EMBEDDED */ if ((error = copyin(uap->arg, &iop_param, sizeof(iop_param))) != 0) - goto exit; + goto out; if (iop_param.iop_iotype != IOPOL_TYPE_DISK) { error = EINVAL; - goto exit; + goto out; + } + +#if !CONFIG_EMBEDDED + switch (iop_param.iop_scope) { + case IOPOL_SCOPE_PROCESS: + processwide = 1; + break; + case IOPOL_SCOPE_THREAD: + processwide = 0; + 
break; + default: + error = EINVAL; + goto out; } + + switch(uap->cmd) { + case IOPOL_CMD_SET: + switch (iop_param.iop_policy) { + case IOPOL_DEFAULT: + case IOPOL_NORMAL: + case IOPOL_THROTTLE: + case IOPOL_PASSIVE: + if(processwide != 0) + proc_apply_task_diskacc(current_task(), iop_param.iop_policy); + else + proc_apply_thread_selfdiskacc(iop_param.iop_policy); + + break; + default: + error = EINVAL; + goto out; + } + break; + + case IOPOL_CMD_GET: + if(processwide != 0) + iop_param.iop_policy = proc_get_task_disacc(current_task()); + else + iop_param.iop_policy = proc_get_thread_selfdiskacc(); + + error = copyout((caddr_t)&iop_param, uap->arg, sizeof(iop_param)); + break; + default: + error = EINVAL; // unknown command + break; + } + +#else /* !CONFIG_EMBEDDED */ switch (iop_param.iop_scope) { case IOPOL_SCOPE_PROCESS: policy = &p->p_iopol_disk; @@ -1258,7 +1404,7 @@ iopolicysys(__unused struct proc *p, __unused struct iopolicysys_args *uap, __un break; default: error = EINVAL; - goto exit; + goto out; } switch(uap->cmd) { @@ -1274,7 +1420,7 @@ iopolicysys(__unused struct proc *p, __unused struct iopolicysys_args *uap, __un break; default: error = EINVAL; - goto exit; + goto out; } break; case IOPOL_CMD_GET: @@ -1300,7 +1446,8 @@ iopolicysys(__unused struct proc *p, __unused struct iopolicysys_args *uap, __un break; } - exit: +#endif /* !CONFIG_EMBEDDED */ +out: *retval = error; return (error); } @@ -1309,8 +1456,14 @@ iopolicysys(__unused struct proc *p, __unused struct iopolicysys_args *uap, __un boolean_t thread_is_io_throttled(void); boolean_t -thread_is_io_throttled(void) { +thread_is_io_throttled(void) +{ + +#if !CONFIG_EMBEDDED + return(proc_get_task_selfdiskacc() == IOPOL_THROTTLE); + +#else /* !CONFIG_EMBEDDED */ int policy; struct uthread *ut; @@ -1326,4 +1479,54 @@ thread_is_io_throttled(void) { return TRUE; } return FALSE; +#endif /* !CONFIG_EMBEDDED */ +} + +void +proc_apply_task_networkbg(void * bsd_info) +{ + proc_t p = PROC_NULL; + proc_t curp = 
(proc_t)bsd_info; + pid_t pid; + + pid = curp->p_pid; + p = proc_find(pid); + if (p != PROC_NULL) { + do_background_socket(p, NULL, PRIO_DARWIN_BG); + proc_rele(p); + } +} + +void +proc_restore_task_networkbg(void * bsd_info) +{ + proc_t p = PROC_NULL; + proc_t curp = (proc_t)bsd_info; + pid_t pid; + + pid = curp->p_pid; + p = proc_find(pid); + if (p != PROC_NULL) { + do_background_socket(p, NULL, 0); + proc_rele(p); + } + +} + +void +proc_set_task_networkbg(void * bsdinfo, int setbg) +{ + if (setbg != 0) + proc_apply_task_networkbg(bsdinfo); + else + proc_restore_task_networkbg(bsdinfo); } + +void +proc_apply_task_networkbg_internal(proc_t p) +{ + if (p != PROC_NULL) { + do_background_socket(p, NULL, PRIO_DARWIN_BG); + } +} + diff --git a/bsd/kern/kern_shutdown.c b/bsd/kern/kern_shutdown.c index f8984bb3c..4e231826d 100644 --- a/bsd/kern/kern_shutdown.c +++ b/bsd/kern/kern_shutdown.c @@ -67,16 +67,16 @@ #include /* for task_suspend() */ #include /* abused for sync() */ #include /* for delay_for_interval() */ +#include #include -int system_inshutdown = 0; +uint32_t system_inshutdown = 0; /* XXX should be in a header file somewhere, but isn't */ extern void md_prepare_for_shutdown(int, int, char *); extern void (*unmountroot_pre_hook)(void); -int waittime = -1; unsigned int proc_shutdown_exitcount = 0; static int sd_openlog(vfs_context_t); @@ -109,37 +109,34 @@ static int sd_callback1(proc_t p, void * arg); static int sd_callback2(proc_t p, void * arg); static int sd_callback3(proc_t p, void * arg); -void +int boot(int paniced, int howto, char *command) { struct proc *p = current_proc(); /* XXX */ int hostboot_option=0; - int funnel_state; - system_inshutdown = 1; - - funnel_state = thread_funnel_set(kernel_flock, TRUE); - - /* - * Temporary hack to notify the power management root domain - * that the system will shut down. 
- */ + if (!OSCompareAndSwap(0, 1, &system_inshutdown)) { + if ( (howto&RB_QUICK) == RB_QUICK) + goto force_reboot; + return (EBUSY); + } + /* + * Temporary hack to notify the power management root domain + * that the system will shut down. + */ IOSystemShutdownNotification(); md_prepare_for_shutdown(paniced, howto, command); - if ((howto&RB_QUICK)==RB_QUICK && waittime < 0) { - waittime = 0; + if ((howto&RB_QUICK)==RB_QUICK) { printf("Quick reboot...\n"); if ((howto&RB_NOSYNC)==0) { sync(p, (void *)NULL, (int *)NULL); } } - else if ((howto&RB_NOSYNC)==0 && waittime < 0) { + else if ((howto&RB_NOSYNC)==0) { int iter, nbusy; - waittime = 0; - printf("syncing disks... "); /* @@ -150,7 +147,7 @@ boot(int paniced, int howto, char *command) proc_shutdown(); #if CONFIG_AUDIT - audit_shutdown(); + audit_shutdown(); #endif if (unmountroot_pre_hook != NULL) @@ -162,7 +159,7 @@ boot(int paniced, int howto, char *command) * Now that all processes have been terminated and system is * sync'ed up, suspend init */ - + if (initproc && p != initproc) task_suspend(initproc->task); @@ -187,7 +184,6 @@ boot(int paniced, int howto, char *command) else printf("done\n"); } - #if NETWORKING /* * Can't just use an splnet() here to disable the network @@ -197,6 +193,7 @@ boot(int paniced, int howto, char *command) if_down_all(); #endif /* NETWORKING */ +force_reboot: if (howto & RB_POWERDOWN) hostboot_option = HOST_REBOOT_HALT; if (howto & RB_HALT) @@ -204,13 +201,15 @@ boot(int paniced, int howto, char *command) if (paniced == RB_PANIC) hostboot_option = HOST_REBOOT_HALT; - if (howto & RB_UPSDELAY) { - hostboot_option = HOST_REBOOT_UPSDELAY; - } + if (howto & RB_UPSDELAY) { + hostboot_option = HOST_REBOOT_UPSDELAY; + } host_reboot(host_priv_self(), hostboot_option); - - thread_funnel_set(kernel_flock, FALSE); + /* + * should not be reached + */ + return (0); } static int diff --git a/bsd/kern/kern_sig.c b/bsd/kern/kern_sig.c index e0ded6e4c..de5455812 100644 --- a/bsd/kern/kern_sig.c +++ 
b/bsd/kern/kern_sig.c @@ -334,10 +334,10 @@ cansignal(proc_t p, kauth_cred_t uc, proc_t q, int signum, int zombie) else my_cred = proc_ucred(q); - if (uc->cr_ruid == my_cred->cr_ruid || - uc->cr_ruid == my_cred->cr_svuid || - kauth_cred_getuid(uc) == my_cred->cr_ruid || - kauth_cred_getuid(uc) == my_cred->cr_svuid) { + if (kauth_cred_getruid(uc) == kauth_cred_getruid(my_cred) || + kauth_cred_getruid(uc) == kauth_cred_getsvuid(my_cred) || + kauth_cred_getuid(uc) == kauth_cred_getruid(my_cred) || + kauth_cred_getuid(uc) == kauth_cred_getsvuid(my_cred)) { if (zombie == 0) kauth_cred_unref(&my_cred); return (1); @@ -566,7 +566,7 @@ set_procsigmask(proc_t p, int bit) * process/thread pair. * * We mark thread as unused to alow compilation without warning - * onnon-PPC platforms. + * on non-PPC platforms. */ int setsigvec(proc_t p, __unused thread_t thread, int signum, struct __kern_sigaction *sa, boolean_t in_sigstart) @@ -623,14 +623,6 @@ setsigvec(proc_t p, __unused thread_t thread, int signum, struct __kern_sigactio OSBitAndAtomic(~((uint32_t)P_NOCLDWAIT), &p->p_flag); } -#ifdef __ppc__ - if (signum == SIGFPE) { - if (sa->sa_handler == SIG_DFL || sa->sa_handler == SIG_IGN) - thread_enable_fpe(thread, 0); - else - thread_enable_fpe(thread, 1); - } -#endif /* __ppc__ */ /* * Set bit in p_sigignore for signals that are set to SIG_IGN, * and for signals set to SIG_DFL where the default is to ignore. 
@@ -1749,34 +1741,35 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) if (flavor & PSIG_VFORK) { sig_task = task; sig_thread = thread; - sig_proc= p; + sig_proc = p; } else if (flavor & PSIG_THREAD) { sig_task = get_threadtask(thread); sig_thread = thread; sig_proc = (proc_t)get_bsdtask_info(sig_task); } else { sig_task = p->task; - sig_proc = p; sig_thread = (struct thread *)0; + sig_proc = p; } - if (((sig_task == TASK_NULL) || is_kerneltask(sig_task))) { + + if ((sig_task == TASK_NULL) || is_kerneltask(sig_task)) return; - } /* * do not send signals to the process that has the thread * doing a reboot(). Not doing so will mark that thread aborted - * and can cause IO failures wich will cause data loss. + * and can cause IO failures wich will cause data loss. There's + * also no need to send a signal to a process that is in the middle + * of being torn down. */ - if (ISSET(sig_proc->p_flag, P_REBOOT)) { + if (ISSET(sig_proc->p_flag, P_REBOOT) || + ISSET(sig_proc->p_lflag, P_LEXIT)) return; - } if( (flavor & (PSIG_VFORK | PSIG_THREAD)) == 0) { proc_knote(sig_proc, NOTE_SIGNAL | signum); } - if ((flavor & PSIG_LOCKED)== 0) proc_signalstart(sig_proc, 0); @@ -2027,7 +2020,7 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) if (( pp != PROC_NULL) && ((pp->p_flag & P_NOCLDSTOP) == 0)) { my_cred = kauth_cred_proc_ref(sig_proc); - r_uid = my_cred->cr_ruid; + r_uid = kauth_cred_getruid(my_cred); kauth_cred_unref(&my_cred); proc_lock(sig_proc); @@ -2077,6 +2070,14 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) */ sig_proc->p_stat = SRUN; proc_unlock(sig_proc); + /* + * In scenarios where suspend/resume are racing + * the signal we are missing AST_BSD by the time + * we get here, set again to avoid races. This + * was the scenario with spindump enabled shutdowns. + * We would need to cover this approp down the line. 
+ */ + act_set_astbsd(sig_thread); thread_abort(sig_thread); goto psigout; @@ -2281,7 +2282,7 @@ issignal(proc_t p) } else { proc_unlock(p); my_cred = kauth_cred_proc_ref(p); - r_uid = my_cred->cr_ruid; + r_uid = kauth_cred_getruid(my_cred); kauth_cred_unref(&my_cred); pp = proc_parentholdref(p); @@ -2445,7 +2446,7 @@ issignal(proc_t p) stop(p, pp); if ((pp != PROC_NULL) && ((pp->p_flag & P_NOCLDSTOP) == 0)) { my_cred = kauth_cred_proc_ref(p); - r_uid = my_cred->cr_ruid; + r_uid = kauth_cred_getruid(my_cred); kauth_cred_unref(&my_cred); proc_lock(pp); @@ -2501,7 +2502,7 @@ issignal(proc_t p) } /* NOTREACHED */ out: - proc_signalend(p,1); + proc_signalend(p, 1); proc_unlock(p); return(retval); } @@ -2538,6 +2539,7 @@ CURSIG(proc_t p) signum = ffs((long)sigbits); mask = sigmask(signum); prop = sigprop[signum]; + sigbits &= ~mask; /* take the signal out */ /* * We should see pending but ignored signals @@ -2546,14 +2548,8 @@ CURSIG(proc_t p) if (mask & p->p_sigignore && (p->p_lflag & P_LTRACED) == 0) { continue; } + if (p->p_lflag & P_LTRACED && (p->p_lflag & P_LPPWAIT) == 0) { - /* - * Put the new signal into p_siglist. If the - * signal is being masked, look for other signals. - */ - mask = sigmask(signum); - if (ut->uu_sigmask & mask) - continue; return(signum); } @@ -2631,7 +2627,6 @@ CURSIG(proc_t p) */ return (signum); } - sigbits &= ~mask; /* take the signal! 
*/ } /* NOTREACHED */ } @@ -2761,12 +2756,6 @@ postsig(int signum) ps->ps_siginfo &= ~mask; ps->ps_signodefer &= ~mask; } -#ifdef __ppc__ - /* Needs to disable to run in user mode */ - if (signum == SIGFPE) { - thread_enable_fpe(current_thread(), 0); - } -#endif /* __ppc__ */ if (ps->ps_sig != signum) { code = 0; @@ -2945,10 +2934,33 @@ bsd_ast(thread_t thread) ut->t_dtrace_sig = 0; psignal(p, dt_action_sig); } + if (ut->t_dtrace_stop) { - ut->t_dtrace_stop = 0; - psignal(p, SIGSTOP); + ut->t_dtrace_stop = 0; + proc_lock(p); + p->p_dtrace_stop = 1; + proc_unlock(p); + (void)task_suspend(p->task); + } + + if (ut->t_dtrace_resumepid) { + proc_t resumeproc = proc_find(ut->t_dtrace_resumepid); + ut->t_dtrace_resumepid = 0; + if (resumeproc != PROC_NULL) { + proc_lock(resumeproc); + /* We only act on processes stopped by dtrace */ + if (resumeproc->p_dtrace_stop) { + resumeproc->p_dtrace_stop = 0; + proc_unlock(resumeproc); + task_resume(resumeproc->task); + } + else { + proc_unlock(resumeproc); + } + proc_rele(resumeproc); + } } + #endif /* CONFIG_DTRACE */ if (CHECK_SIGNALS(p, current_thread(), ut)) { @@ -3066,79 +3078,37 @@ pgsigio(pid_t pgid, int sig) proc_rele(p); } - void proc_signalstart(proc_t p, int locked) { - if (locked == 0) + if (!locked) proc_lock(p); - while ((p->p_lflag & P_LINSIGNAL) == P_LINSIGNAL) { - p->p_lflag |= P_LSIGNALWAIT; + p->p_sigwaitcnt++; + while ((p->p_lflag & P_LINSIGNAL) == P_LINSIGNAL) msleep(&p->p_sigmask, &p->p_mlock, 0, "proc_signstart", NULL); - } + p->p_sigwaitcnt--; + p->p_lflag |= P_LINSIGNAL; -#if DIAGNOSTIC -#if SIGNAL_DEBUG -#ifdef __ppc__ - { - int sp, *fp, numsaved; - - __asm__ volatile("mr %0,r1" : "=r" (sp)); - - fp = (int *)*((int *)sp); - for (numsaved = 0; numsaved < 3; numsaved++) { - p->lockpc[numsaved] = fp[2]; - if ((int)fp <= 0) - break; - fp = (int *)*fp; - } - } -#endif /* __ppc__ */ -#endif /* SIGNAL_DEBUG */ -#endif /* DIAGNOSTIC */ p->p_signalholder = current_thread(); - if (locked == 0) + if (!locked) 
proc_unlock(p); - } void proc_signalend(proc_t p, int locked) { - if (locked == 0) + if (!locked) proc_lock(p); p->p_lflag &= ~P_LINSIGNAL; -#if DIAGNOSTIC -#if SIGNAL_DEBUG -#ifdef __ppc__ - { - int sp, *fp, numsaved; - - __asm__ volatile("mr %0,r1" : "=r" (sp)); - - fp = (int *)*((int *)sp); - for (numsaved = 0; numsaved < 3; numsaved++) { - p->unlockpc[numsaved] = fp[2]; - if ((int)fp <= 0) - break; - fp = (int *)*fp; - } - } -#endif /* __ppc__ */ -#endif /* SIGNAL_DEBUG */ -#endif /* DIAGNOSTIC */ - - if ((p->p_lflag & P_LSIGNALWAIT) == P_LSIGNALWAIT) { - p->p_lflag &= ~P_LSIGNALWAIT; + if (p->p_sigwaitcnt > 0) wakeup(&p->p_sigmask); - } + p->p_signalholder = NULL; - if (locked == 0) + if (!locked) proc_unlock(p); } - void sig_lock_to_exit(proc_t p) { diff --git a/bsd/kern/kern_symfile.c b/bsd/kern/kern_symfile.c index ffbff213f..dc6531b42 100644 --- a/bsd/kern/kern_symfile.c +++ b/bsd/kern/kern_symfile.c @@ -72,12 +72,13 @@ get_kernel_symfile(__unused proc_t p, __unused char const **symfile) struct kern_direct_file_io_ref_t { - vfs_context_t ctx; - struct vnode *vp; + vfs_context_t ctx; + struct vnode * vp; + dev_t device; }; -static int file_ioctl(void * p1, void * p2, int theIoctl, caddr_t result) +static int file_ioctl(void * p1, void * p2, u_long theIoctl, caddr_t result) { dev_t device = *(dev_t*) p1; @@ -85,7 +86,7 @@ static int file_ioctl(void * p1, void * p2, int theIoctl, caddr_t result) (device, theIoctl, result, S_IFBLK, p2)); } -static int device_ioctl(void * p1, __unused void * p2, int theIoctl, caddr_t result) +static int device_ioctl(void * p1, __unused void * p2, u_long theIoctl, caddr_t result) { return (VNOP_IOCTL(p1, theIoctl, result, 0, p2)); } @@ -94,10 +95,14 @@ struct kern_direct_file_io_ref_t * kern_open_file_for_direct_io(const char * name, kern_get_file_extents_callback_t callback, void * callback_ref, - dev_t * device_result, + dev_t * partition_device_result, + dev_t * image_device_result, uint64_t * partitionbase_result, uint64_t * 
maxiocount_result, - boolean_t * solid_state) + uint32_t * oflags, + off_t offset, + caddr_t addr, + vm_size_t len) { struct kern_direct_file_io_ref_t * ref; @@ -105,14 +110,21 @@ kern_open_file_for_direct_io(const char * name, struct vnode_attr va; int error; off_t f_offset; - uint32_t blksize; - uint64_t size; + off_t filelength; + uint64_t fileblk; + size_t filechunk; + uint64_t physoffset; dev_t device; + dev_t target = 0; + int isssd = 0; + uint32_t flags = 0; + uint32_t blksize; off_t maxiocount, count; + boolean_t locked = FALSE; - int (*do_ioctl)(void * p1, void * p2, int theIoctl, caddr_t result); - void * p1; - void * p2; + int (*do_ioctl)(void * p1, void * p2, u_long theIoctl, caddr_t result); + void * p1 = NULL; + void * p2 = NULL; error = EFAULT; @@ -124,12 +136,18 @@ kern_open_file_for_direct_io(const char * name, } ref->vp = NULL; - p = current_proc(); // kernproc; + p = kernproc; ref->ctx = vfs_context_create(vfs_context_current()); if ((error = vnode_open(name, (O_CREAT | FWRITE), (0), 0, &ref->vp, ref->ctx))) goto out; + if (addr && len) + { + if ((error = kern_write_file(ref, offset, addr, len))) + goto out; + } + VATTR_INIT(&va); VATTR_WANTED(&va, va_rdev); VATTR_WANTED(&va, va_fsid); @@ -169,6 +187,80 @@ kern_open_file_for_direct_io(const char * name, error = EFAULT; goto out; } + ref->device = device; + + // generate the block list + + error = do_ioctl(p1, p2, DKIOCLOCKPHYSICALEXTENTS, NULL); + if (error) + goto out; + locked = TRUE; + + // get block size + + error = do_ioctl(p1, p2, DKIOCGETBLOCKSIZE, (caddr_t) &blksize); + if (error) + goto out; + + if (ref->vp->v_type == VREG) + filelength = va.va_data_size; + else + { + error = do_ioctl(p1, p2, DKIOCGETBLOCKCOUNT, (caddr_t) &fileblk); + if (error) + goto out; + filelength = fileblk * blksize; + } + + f_offset = 0; + while (f_offset < filelength) + { + if (ref->vp->v_type == VREG) + { + filechunk = 1*1024*1024*1024; + daddr64_t blkno; + + error = VNOP_BLOCKMAP(ref->vp, f_offset, filechunk, 
&blkno, &filechunk, NULL, 0, NULL); + if (error) + goto out; + + fileblk = blkno * blksize; + } + else if ((ref->vp->v_type == VBLK) || (ref->vp->v_type == VCHR)) + { + fileblk = f_offset; + filechunk = f_offset ? 0 : filelength; + } + + physoffset = 0; + while (physoffset < filechunk) + { + dk_physical_extent_t getphysreq; + bzero(&getphysreq, sizeof(getphysreq)); + + getphysreq.offset = fileblk + physoffset; + getphysreq.length = (filechunk - physoffset); + error = do_ioctl(p1, p2, DKIOCGETPHYSICALEXTENT, (caddr_t) &getphysreq); + if (error) + goto out; + if (!target) + { + target = getphysreq.dev; + } + else if (target != getphysreq.dev) + { + error = ENOTSUP; + goto out; + } + callback(callback_ref, getphysreq.offset, getphysreq.length); + physoffset += getphysreq.length; + } + f_offset += filechunk; + } + callback(callback_ref, 0ULL, 0ULL); + + if (ref->vp->v_type == VREG) + p1 = ⌖ // get partition base @@ -226,62 +318,37 @@ kern_open_file_for_direct_io(const char * name, if (maxiocount_result) *maxiocount_result = maxiocount; - if (solid_state) - { - int isssd = 0; - error = do_ioctl(p1, p2, DKIOCISSOLIDSTATE, (caddr_t)&isssd); - if (error) - *solid_state = FALSE; - else - *solid_state = isssd; - } - - // generate the block list - - error = 0; - if (ref->vp->v_type == VREG) - { - f_offset = 0; - while(f_offset < (off_t) va.va_data_size) - { - size_t io_size = 1*1024*1024*1024; - daddr64_t blkno; - - error = VNOP_BLOCKMAP(ref->vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL, 0, NULL); - if (error) - goto out; - callback(callback_ref, ((uint64_t) blkno) * blksize, (uint64_t) io_size); - f_offset += io_size; - } - callback(callback_ref, 0ULL, 0ULL); - } - else if ((ref->vp->v_type == VBLK) || (ref->vp->v_type == VCHR)) - { - error = do_ioctl(p1, p2, DKIOCGETBLOCKCOUNT, (caddr_t) &size); - if (error) - goto out; - size *= blksize; - callback(callback_ref, 0ULL, size); - callback(callback_ref, size, 0ULL); - } + error = do_ioctl(p1, p2, DKIOCISSOLIDSTATE, 
(caddr_t)&isssd); + if (!error && isssd) + flags |= kIOHibernateOptionSSD; - if (device_result) - *device_result = device; + if (partition_device_result) + *partition_device_result = device; + if (image_device_result) + *image_device_result = target; + if (flags) + *oflags = flags; out: kprintf("kern_open_file_for_direct_io(%d)\n", error); - if (error && ref) { - if (ref->vp) { + if (error && locked) + { + p1 = &device; + (void) do_ioctl(p1, p2, DKIOCUNLOCKPHYSICALEXTENTS, NULL); + } + + if (error && ref) + { + if (ref->vp) + { vnode_close(ref->vp, FWRITE, ref->ctx); ref->vp = NULLVP; } - vfs_context_rele(ref->ctx); kfree(ref, sizeof(struct kern_direct_file_io_ref_t)); ref = NULL; } - return(ref); } @@ -296,21 +363,47 @@ kern_write_file(struct kern_direct_file_io_ref_t * ref, off_t offset, caddr_t ad } void -kern_close_file_for_direct_io(struct kern_direct_file_io_ref_t * ref) +kern_close_file_for_direct_io(struct kern_direct_file_io_ref_t * ref, + off_t offset, caddr_t addr, vm_size_t len) { + int error; kprintf("kern_close_file_for_direct_io\n"); - if (ref) { - int error; + if (!ref) return; - if (ref->vp) { - error = vnode_close(ref->vp, FWRITE, ref->ctx); - ref->vp = NULLVP; - kprintf("vnode_close(%d)\n", error); - } - vfs_context_rele(ref->ctx); - ref->ctx = NULL; - kfree(ref, sizeof(struct kern_direct_file_io_ref_t)); + if (ref->vp) + { + int (*do_ioctl)(void * p1, void * p2, u_long theIoctl, caddr_t result); + void * p1; + void * p2; + + if (ref->vp->v_type == VREG) + { + p1 = &ref->device; + p2 = kernproc; + do_ioctl = &file_ioctl; + } + else + { + /* Partition. 
*/ + p1 = ref->vp; + p2 = ref->ctx; + do_ioctl = &device_ioctl; + } + (void) do_ioctl(p1, p2, DKIOCUNLOCKPHYSICALEXTENTS, NULL); + + if (addr && len) + { + (void) kern_write_file(ref, offset, addr, len); + } + + error = vnode_close(ref->vp, FWRITE, ref->ctx); + + ref->vp = NULLVP; + kprintf("vnode_close(%d)\n", error); } + vfs_context_rele(ref->ctx); + ref->ctx = NULL; + kfree(ref, sizeof(struct kern_direct_file_io_ref_t)); } diff --git a/bsd/kern/kern_synch.c b/bsd/kern/kern_synch.c index 68a45824e..c6b4888c3 100644 --- a/bsd/kern/kern_synch.c +++ b/bsd/kern/kern_synch.c @@ -162,7 +162,7 @@ _sleep( struct proc *p; thread_t self = current_thread(); struct uthread * ut; - int sig, catch = pri & PCATCH; + int sig, catch; int dropmutex = pri & PDROP; int spinmutex = pri & PSPIN; int wait_result; @@ -175,26 +175,39 @@ _sleep( /* It can still block in proc_exit() after the teardown. */ if (p->p_stats != NULL) OSIncrementAtomicLong(&p->p_stats->p_ru.ru_nvcsw); + + if (pri & PCATCH) + catch = THREAD_ABORTSAFE; + else + catch = THREAD_UNINT; /* set wait message & channel */ ut->uu_wchan = chan; ut->uu_wmesg = wmsg ? wmsg : "unknown"; if (mtx != NULL && chan != NULL && (thread_continue_t)continuation == THREAD_CONTINUE_NULL) { + int flags; + + if (dropmutex) + flags = LCK_SLEEP_UNLOCK; + else + flags = LCK_SLEEP_DEFAULT; + + if (spinmutex) + flags |= LCK_SLEEP_SPIN; if (abstime) - wait_result = lck_mtx_sleep_deadline(mtx, (dropmutex) ? LCK_SLEEP_UNLOCK : 0, - chan, (catch) ? THREAD_ABORTSAFE : THREAD_UNINT, abstime); + wait_result = lck_mtx_sleep_deadline(mtx, flags, chan, catch, abstime); else - wait_result = lck_mtx_sleep(mtx, (dropmutex) ? LCK_SLEEP_UNLOCK : 0, - chan, (catch) ? THREAD_ABORTSAFE : THREAD_UNINT); + wait_result = lck_mtx_sleep(mtx, flags, chan, catch); } else { if (chan != NULL) - assert_wait_deadline(chan, (catch) ? 
THREAD_ABORTSAFE : THREAD_UNINT, abstime); + assert_wait_deadline(chan, catch, abstime); if (mtx) lck_mtx_unlock(mtx); - if (catch) { + + if (catch == THREAD_ABORTSAFE) { if (SHOULDissignal(p,ut)) { if ((sig = CURSIG(p)) != 0) { if (clear_wait(self, THREAD_INTERRUPTED) == KERN_FAILURE) @@ -258,11 +271,11 @@ _sleep( * first, regardless of whether awakened due * to receiving event. */ - if (!catch) + if (catch != THREAD_ABORTSAFE) break; /* else fall through */ case THREAD_INTERRUPTED: - if (catch) { + if (catch == THREAD_ABORTSAFE) { if (thread_should_abort(self)) { error = EINTR; } else if (SHOULDissignal(p, ut)) { @@ -392,7 +405,7 @@ tsleep1( void wakeup(void *chan) { - thread_wakeup_prim((caddr_t)chan, FALSE, THREAD_AWAKENED); + thread_wakeup((caddr_t)chan); } /* @@ -404,7 +417,7 @@ wakeup(void *chan) void wakeup_one(caddr_t chan) { - thread_wakeup_prim((caddr_t)chan, TRUE, THREAD_AWAKENED); + thread_wakeup_one((caddr_t)chan); } /* diff --git a/bsd/kern/kern_sysctl.c b/bsd/kern/kern_sysctl.c index 842a3e572..f2c9c8711 100644 --- a/bsd/kern/kern_sysctl.c +++ b/bsd/kern/kern_sysctl.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -107,11 +107,13 @@ #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -128,15 +130,12 @@ #include #include +#include #if defined(__i386__) || defined(__x86_64__) #include #endif -sysctlfn kern_sysctl; -#if DEBUG -sysctlfn debug_sysctl; -#endif extern sysctlfn net_sysctl; extern sysctlfn cpu_sysctl; extern int aio_max_requests; @@ -146,25 +145,47 @@ extern int lowpri_IO_window_msecs; extern int lowpri_IO_delay_msecs; extern int nx_enabled; extern int speculative_reads_disabled; +extern int ignore_is_ssd; +extern unsigned int speculative_prefetch_max; extern unsigned int preheat_pages_max; extern unsigned int preheat_pages_min; -extern unsigned int preheat_pages_mult; extern long numvnodes; -static void +extern unsigned int vm_max_delayed_work_limit; +extern unsigned int vm_max_batch; + +extern unsigned int vm_page_free_min; +extern unsigned int vm_page_free_target; +extern unsigned int vm_page_free_reserved; +extern unsigned int vm_page_speculative_percentage; +extern unsigned int vm_page_speculative_q_age_ms; + +/* + * Conditionally allow dtrace to see these functions for debugging purposes. 
+ */ +#ifdef STATIC +#undef STATIC +#endif +#if 0 +#define STATIC +#else +#define STATIC static +#endif + +extern boolean_t mach_timer_coalescing_enabled; + +STATIC void fill_user32_eproc(proc_t p, struct user32_eproc *ep); -static void +STATIC void fill_user32_externproc(proc_t p, struct user32_extern_proc *exp); -static void +STATIC void fill_user64_eproc(proc_t p, struct user64_eproc *ep); -static void +STATIC void fill_user64_proc(proc_t p, struct user64_kinfo_proc *kp); -static void +STATIC void fill_user64_externproc(proc_t p, struct user64_extern_proc *exp); extern int kdbg_control(int *name, u_int namelen, user_addr_t where, size_t * sizep); -int -kdebug_ops(int *name, u_int namelen, user_addr_t where, size_t *sizep, proc_t p); #if NFSCLIENT extern int netboot_root(void); @@ -174,41 +195,94 @@ pcsamples_ops(int *name, u_int namelen, user_addr_t where, size_t *sizep, proc_t p); __private_extern__ kern_return_t reset_vmobjectcache(unsigned int val1, unsigned int val2); -int -sysctl_doproc(int *name, u_int namelen, user_addr_t where, size_t *sizep); -int -sysctl_doprof(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, - user_addr_t newp, size_t newlen); -static void +STATIC void fill_user32_proc(proc_t p, struct user32_kinfo_proc *kp); int sysctl_procargs(int *name, u_int namelen, user_addr_t where, size_t *sizep, proc_t cur_proc); -static int -sysctl_procargs2(int *name, u_int namelen, user_addr_t where, size_t *sizep, - proc_t cur_proc); -static int +STATIC int sysctl_procargsx(int *name, u_int namelen, user_addr_t where, size_t *sizep, proc_t cur_proc, int argc_yes); int sysctl_struct(user_addr_t oldp, size_t *oldlenp, user_addr_t newp, size_t newlen, void *sp, int len); -static int sysdoproc_filt_KERN_PROC_PID(proc_t p, void * arg); -static int sysdoproc_filt_KERN_PROC_PGRP(proc_t p, void * arg); -static int sysdoproc_filt_KERN_PROC_TTY(proc_t p, void * arg); -static int sysdoproc_filt_KERN_PROC_UID(proc_t p, void * arg); -static int 
sysdoproc_filt_KERN_PROC_RUID(proc_t p, void * arg); +STATIC int sysdoproc_filt_KERN_PROC_PID(proc_t p, void * arg); +STATIC int sysdoproc_filt_KERN_PROC_PGRP(proc_t p, void * arg); +STATIC int sysdoproc_filt_KERN_PROC_TTY(proc_t p, void * arg); +STATIC int sysdoproc_filt_KERN_PROC_UID(proc_t p, void * arg); +STATIC int sysdoproc_filt_KERN_PROC_RUID(proc_t p, void * arg); #if CONFIG_LCTX -static int sysdoproc_filt_KERN_PROC_LCID(proc_t p, void * arg); +STATIC int sysdoproc_filt_KERN_PROC_LCID(proc_t p, void * arg); #endif int sysdoproc_callback(proc_t p, void *arg); -static int __sysctl_funneled(proc_t p, struct __sysctl_args *uap, int32_t *retval); + +/* forward declarations for non-static STATIC */ +STATIC void fill_loadavg64(struct loadavg *la, struct user64_loadavg *la64); +STATIC void fill_loadavg32(struct loadavg *la, struct user32_loadavg *la32); +STATIC int sysctl_handle_exec_archhandler_ppc(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_handle_kern_threadname(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_sched_stats(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_sched_stats_enable(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_file(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_kdebug_ops SYSCTL_HANDLER_ARGS; +STATIC int sysctl_dotranslate SYSCTL_HANDLER_ARGS; +STATIC int sysctl_doaffinity SYSCTL_HANDLER_ARGS; +#if COUNT_SYSCALLS +STATIC int sysctl_docountsyscalls SYSCTL_HANDLER_ARGS; +#endif /* COUNT_SYSCALLS */ +#if !CONFIG_EMBEDDED +STATIC int sysctl_doprocargs SYSCTL_HANDLER_ARGS; +#endif /* !CONFIG_EMBEDDED */ +STATIC int sysctl_doprocargs2 SYSCTL_HANDLER_ARGS; +STATIC int sysctl_prochandle SYSCTL_HANDLER_ARGS; +#if DEBUG +STATIC int sysctl_dodebug SYSCTL_HANDLER_ARGS; +#endif +STATIC int sysctl_aiomax(struct sysctl_oid *oidp, void 
*arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_aioprocmax(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_aiothreads(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_maxproc(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_osversion(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_sysctl_bootargs(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_maxvnodes(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_securelvl(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_domainname(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_hostname(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_procname(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_boottime(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_symfile(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +#if NFSCLIENT +STATIC int sysctl_netboot(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +#endif +#ifdef CONFIG_IMGSRC_ACCESS +STATIC int sysctl_imgsrcdev(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +#endif +STATIC int sysctl_usrstack(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_usrstack64(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_coredump(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_suid_coredump(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_delayterm(struct sysctl_oid *oidp, void *arg1, int arg2, 
struct sysctl_req *req); +STATIC int sysctl_rage_vnode(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_kern_check_openevt(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_nx(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_loadavg(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_vm_toggle_address_reuse(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_swapusage(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +#if defined(__i386__) || defined(__x86_64__) +STATIC int sysctl_sysctl_exec_affinity(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +#endif +STATIC int fetch_process_cputype( proc_t cur_proc, int *name, u_int namelen, cpu_type_t *cputype); +STATIC int sysctl_sysctl_native(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_sysctl_cputype(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_safeboot(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_singleuser(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); + extern void IORegistrySetOSBuildVersion(char * build_version); -static void +STATIC void fill_loadavg64(struct loadavg *la, struct user64_loadavg *la64) { la64->ldavg[0] = la->ldavg[0]; @@ -217,7 +291,7 @@ fill_loadavg64(struct loadavg *la, struct user64_loadavg *la64) la64->fscale = (user64_long_t)la->fscale; } -static void +STATIC void fill_loadavg32(struct loadavg *la, struct user32_loadavg *la32) { la32->ldavg[0] = la->ldavg[0]; @@ -226,34 +300,76 @@ fill_loadavg32(struct loadavg *la, struct user32_loadavg *la32) la32->fscale = (user32_long_t)la->fscale; } +/* + * sysctl_mem_hold + * + * Description: Wire down the callers address map on behalf of sysctl's + * 
that perform their own copy operations while holding + * locks e.g. in the paging path, which could lead to a + * deadlock, or while holding a spinlock. + * + * Parameters: addr User buffer address + * len User buffer length + * + * Returns: 0 Success + * vslock:ENOMEM Insufficient physical pages to wire + * vslock:EACCES Bad protection mode + * vslock:EINVAL Invalid parameters + * + * Notes: This code is invoked for the first OID element where the + * CTLFLAG_LOCKED is not specified for a given OID node + * element during OID traversal, and is held for all + * subsequent node traversals, and only released after the + * leaf node handler invocation is complete. + * + * Legacy: For legacy sysctl's provided by third party code which + * expect funnel protection for calls into their code, this + * routine will also take the funnel, which will also only + * be released after the leaf node handler is complete. + * + * This is to support legacy 32 bit BSD KEXTs and legacy 32 + * bit single threaded filesystem KEXTs and similar code + * which relies on funnel protection, e.g. for things like + * FSID based sysctl's. + * + * NEW CODE SHOULD NOT RELY ON THIS BEHAVIOUR! IT WILL BE + * REMOVED IN A FUTURE RELEASE OF Mac OS X! + * + * Bugs: This routine does nothing with the new_addr and new_len + * at present, but it should, since read from the user space + * process address space which could potentially trigger + * paging may also be occurring deep down. This is due to + * a current limitation of the vslock() routine, which will + * always request a wired mapping be read/write, due to not + * taking an access mode parameter. Note that this could + * also cause problems for output on architectures where + * write access does not require read access if the current + * mapping lacks read access.
+ * + * XXX: To be moved to kern_newsysctl.c to avoid __private_extern__ + */ +int sysctl_mem_lock(user_addr_t old_addr, user_size_t old_len, user_addr_t new_addr, user_size_t new_len); +int +sysctl_mem_lock(__unused user_addr_t old_addr, __unused user_size_t old_len, __unused user_addr_t new_addr, __unused user_size_t new_len) +{ + return 0; +} + /* * Locking and stats */ -static struct sysctl_lock memlock; /* sysctl() syscall */ int -__sysctl(proc_t p, struct __sysctl_args *uap, int32_t *retval) +__sysctl(proc_t p, struct __sysctl_args *uap, __unused int32_t *retval) { - boolean_t funnel_state; + boolean_t funnel_state = FALSE; /* not held if unknown */ int error; - - funnel_state = thread_funnel_set(kernel_flock, TRUE); - error = __sysctl_funneled(p, uap, retval); - thread_funnel_set(kernel_flock, funnel_state); - return(error); -} - -static int -__sysctl_funneled(proc_t p, struct __sysctl_args *uap, __unused int32_t *retval) -{ - int error, dolock = 1; size_t savelen = 0, oldlen = 0, newlen; - sysctlfn *fnp = NULL; int name[CTL_MAXNAME]; int error1; - boolean_t memlock_taken = FALSE; boolean_t vslock_taken = FALSE; + boolean_t funnel_taken = FALSE; #if CONFIG_MACF kauth_cred_t my_cred; #endif @@ -279,38 +395,49 @@ __sysctl_funneled(proc_t p, struct __sysctl_args *uap, __unused int32_t *retval) else { newlen = uap->newlen; } - + +/* + * XXX TODO: push down rights check for CTL_HW OIDs; most duplicate + * XXX it anyway, which is a performance sink, and requires use + * XXX of SUID root programs (see ). + * + * Note: Opt out of non-leaf node enforcement by removing this + * check for the top level OID value, and then adding + * CTLFLAG_ANYBODY to the leaf nodes in question. Enforce as + * suser for writed in leaf nodes by omitting this flag. + * Enforce with a higher granularity by making the leaf node + * of type SYSCTL_PROC() in order to provide a procedural + * enforcement call site. 
+ * + * NOTE: This function is called prior to any subfunctions being + * called with a fallback to userland_sysctl(); as such, this + * permissions check here will veto the fallback operation. + */ /* CTL_UNSPEC is used to get oid to AUTO_OID */ if (uap->new != USER_ADDR_NULL - && ((name[0] == CTL_KERN - && !(name[1] == KERN_IPC || name[1] == KERN_PANICINFO || name[1] == KERN_PROCDELAYTERM || - name[1] == KERN_PROCNAME || name[1] == KERN_RAGEVNODE || name[1] == KERN_CHECKOPENEVT || name[1] == KERN_THREADNAME)) - || (name[0] == CTL_HW) + && ((name[0] == CTL_HW) || (name[0] == CTL_VM)) && (error = suser(kauth_cred_get(), &p->p_acflag))) return (error); -/* XXX: KERN, VFS and DEBUG are handled by their respective functions, - * but there is a fallback for all sysctls other than VFS to - * userland_sysctl() - KILL THIS! */ - switch (name[0]) { - case CTL_KERN: - fnp = kern_sysctl; - if ((name[1] != KERN_VNODE) && (name[1] != KERN_FILE) - && (name[1] != KERN_PROC)) - dolock = 0; - break; - case CTL_VFS: - fnp = vfs_sysctl; - break; -#if DEBUG - case CTL_DEBUG: - fnp = debug_sysctl; - break; +// XXX need to relocate into each terminal instead of leaving this here... +// XXX macf preemptory check. 
+#if CONFIG_MACF + my_cred = kauth_cred_proc_ref(p); + error = mac_system_check_sysctl( + my_cred, + (int *) name, + uap->namelen, + uap->old, + uap->oldlenp, + 0, /* XXX 1 for CTL_KERN checks */ + uap->new, + newlen + ); + kauth_cred_unref(&my_cred); + if (error) + return (error); #endif - default: - fnp = NULL; - } if (uap->oldlenp != USER_ADDR_NULL) { uint64_t oldlen64 = fuulong(uap->oldlenp); @@ -324,79 +451,82 @@ __sysctl_funneled(proc_t p, struct __sysctl_args *uap, __unused int32_t *retval) oldlen = 0xffffffffUL; } - if (uap->old != USER_ADDR_NULL) { - if (!useracc(uap->old, (user_size_t)oldlen, B_WRITE)) - return (EFAULT); + if ((name[0] == CTL_VFS || name[0] == CTL_VM)) { /* - * The kernel debug mechanism does not need to take this lock, and - * we don't grab the memlock around calls to KERN_PROC because it is reentrant. - * Grabbing the lock for a KERN_PROC sysctl makes a deadlock possible 5024049. + * Always take the funnel for CTL_VFS and CTL_VM + * + * XXX We should also take it for any OID without the + * XXX CTLFLAG_LOCKED set on it; fix this later! + */ + funnel_state = thread_funnel_set(kernel_flock, TRUE); + funnel_taken = TRUE; + + /* + * XXX Take the vslock() only when we are copying out; this + * XXX erroneously assumes that the copy in will not cause + * XXX a fault if caled from the paging path due to the + * XXX having been recently touched in order to establish + * XXX the input data. This is a bad assumption. + * + * Note: This is overkill, but third parties might + * already call sysctl internally in KEXTs that + * implement mass storage drivers. If you are + * writing a new KEXT, don't do that. 
*/ - if (!((name[1] == KERN_KDEBUG) && (name[2] == KERN_KDGETENTROPY)) && - !(name[1] == KERN_PROC)) { - MEMLOCK_LOCK(); - memlock_taken = TRUE; - } - - if (dolock && oldlen) { - if ((error = vslock(uap->old, (user_size_t)oldlen))) { - if (memlock_taken == TRUE) - MEMLOCK_UNLOCK(); - return(error); + if(uap->old != USER_ADDR_NULL) { + if (!useracc(uap->old, (user_size_t)oldlen, B_WRITE)) { + thread_funnel_set(kernel_flock, funnel_state); + return (EFAULT); + } + + if (oldlen) { + if ((error = vslock(uap->old, (user_size_t)oldlen))) { + thread_funnel_set(kernel_flock, funnel_state); + return(error); + } + savelen = oldlen; + vslock_taken = TRUE; } - savelen = oldlen; - vslock_taken = TRUE; } } -#if CONFIG_MACF - my_cred = kauth_cred_proc_ref(p); - error = mac_system_check_sysctl( - my_cred, - (int *) name, - uap->namelen, - uap->old, - uap->oldlenp, - fnp == kern_sysctl ? 1 : 0, - uap->new, - newlen - ); - kauth_cred_unref(&my_cred); - if (!error) { -#endif - if (fnp) { - error = (*fnp)(name + 1, uap->namelen - 1, uap->old, + /* + * XXX convert vfs_sysctl subelements to newsysctl; this is hard + * XXX because of VFS_NUMMNTOPS being top level. + */ + error = ENOTSUP; + if (name[0] == CTL_VFS) { + error = vfs_sysctl(name + 1, uap->namelen - 1, uap->old, &oldlen, uap->new, newlen, p); } - else - error = ENOTSUP; -#if CONFIG_MACF - } -#endif if (vslock_taken == TRUE) { error1 = vsunlock(uap->old, (user_size_t)savelen, B_WRITE); if (!error) error = error1; } - if (memlock_taken == TRUE) - MEMLOCK_UNLOCK(); - if ( (name[0] != CTL_VFS) && (error == ENOTSUP)) { - size_t tmp = oldlen; - boolean_t funnel_state; - - /* - * Drop the funnel when calling new sysctl code, which will conditionally - * grab the funnel if it really needs to. 
- */ - funnel_state = thread_funnel_set(kernel_flock, FALSE); - + if ( (name[0] != CTL_VFS) && (error == ENOTSUP) ) { + size_t tmp = oldlen; error = userland_sysctl(p, name, uap->namelen, uap->old, &tmp, uap->new, newlen, &oldlen); + } + /* + * If we took the funnel, which we only do for CTL_VFS and CTL_VM on + * 32 bit architectures, then drop it. + * + * XXX the grabbing and dropping need to move into the leaf nodes, + * XXX for sysctl's that are not marked CTLFLAG_LOCKED, but this is + * XXX true for the vslock, as well. We have a start at a routine + * to wrapper this (above), but it's not turned on. The current code + * removed the funnel and the vslock() from all but these two top + * level OIDs. Note that VFS only needs to take the funnel if the FS + * against which it's operating is not thread safe (but since an FS + * can be in the paging path, it still needs to take the vslock()). + */ + if (funnel_taken) thread_funnel_set(kernel_flock, funnel_state); - } if ((error) && (error != ENOMEM)) return (error); @@ -424,21 +554,26 @@ int securelevel = -1; int securelevel; #endif -static int -sysctl_affinity( - int *name, - u_int namelen, - user_addr_t oldBuf, - size_t *oldSize, - user_addr_t newBuf, - __unused size_t newSize, - proc_t cur_proc) +STATIC int +sysctl_doaffinity SYSCTL_HANDLER_ARGS { + __unused int cmd = oidp->oid_arg2; /* subcommand*/ + int *name = arg1; /* oid element argument vector */ + int namelen = arg2; /* number of oid element arguments */ + user_addr_t oldp = req->oldptr; /* user buffer copy out address */ + size_t *oldlenp = &req->oldlen; /* user buffer copy out size */ + user_addr_t newp = req->newptr; /* user buffer copy in address */ +// size_t newlen = req->newlen; /* user buffer copy in size */ + + int error = ENOTSUP; /* Default to failure */ + + proc_t cur_proc = current_proc(); + if (namelen < 1) return (ENOTSUP); if (name[0] == 0 && 1 == namelen) { - return sysctl_rdint(oldBuf, oldSize, newBuf, + error = sysctl_rdint(oldp, oldlenp, 
newp, (cur_proc->p_flag & P_AFFINITY) ? 1 : 0); } else if (name[0] == 1 && 2 == namelen) { if (name[1] == 0) { @@ -446,21 +581,35 @@ sysctl_affinity( } else { OSBitOrAtomic(P_AFFINITY, &cur_proc->p_flag); } - return 0; + error = 0; } - return (ENOTSUP); + + /* adjust index so we return the right required/consumed amount */ + if (!error) + req->oldidx += req->oldlen; + + return (error); } +SYSCTL_PROC(_kern, KERN_AFFINITY, affinity, CTLTYPE_NODE|CTLFLAG_RD | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + 0, /* Integer argument (arg2) */ + sysctl_doaffinity, /* Handler function */ + NULL, /* Data pointer */ + ""); + +STATIC int +sysctl_dotranslate SYSCTL_HANDLER_ARGS +{ + __unused int cmd = oidp->oid_arg2; /* subcommand*/ + int *name = arg1; /* oid element argument vector */ + int namelen = arg2; /* number of oid element arguments */ + user_addr_t oldp = req->oldptr; /* user buffer copy out address */ + size_t *oldlenp = &req->oldlen; /* user buffer copy out size */ + user_addr_t newp = req->newptr; /* user buffer copy in address */ +// size_t newlen = req->newlen; /* user buffer copy in size */ + int error; -static int -sysctl_translate( - int *name, - u_int namelen, - user_addr_t oldBuf, - size_t *oldSize, - user_addr_t newBuf, - __unused size_t newSize, - proc_t cur_proc) -{ + proc_t cur_proc = current_proc(); proc_t p; int istranslated = 0; kauth_cred_t my_cred; @@ -484,9 +633,25 @@ sysctl_translate( istranslated = (p->p_flag & P_TRANSLATED); proc_rele(p); - return sysctl_rdint(oldBuf, oldSize, newBuf, + error = sysctl_rdint(oldp, oldlenp, newp, (istranslated != 0) ? 1 : 0); + + /* adjust index so we return the right required/consumed amount */ + if (!error) + req->oldidx += req->oldlen; + + return (error); } +/* + * XXX make CTLFLAG_RW so sysctl_rdint() will EPERM on attempts to write; + * XXX this may not be necessary. 
+ */ +SYSCTL_PROC(_kern, KERN_TRANSLATE, translate, CTLTYPE_NODE|CTLFLAG_RW | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + 0, /* Integer argument (arg2) */ + sysctl_dotranslate, /* Handler function */ + NULL, /* Data pointer */ + ""); int set_archhandler(__unused proc_t p, int arch) @@ -505,7 +670,7 @@ set_archhandler(__unused proc_t p, int arch) return (EBADARCH); } - NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, + NDINIT(&nd, LOOKUP, OP_GETATTR, FOLLOW | LOCKLEAF, UIO_SYSSPACE, CAST_USER_ADDR_T(archhandler->path), ctx); error = namei(&nd); if (error) @@ -530,63 +695,20 @@ set_archhandler(__unused proc_t p, int arch) vnode_put(nd.ni_vp); archhandler->fsid = va.va_fsid; - archhandler->fileid = (u_int32_t)va.va_fileid; + archhandler->fileid = va.va_fileid; return 0; } -/* XXX remove once Rosetta is rev'ed */ -/*****************************************************************************/ -static int -sysctl_exec_archhandler_ppc( - __unused int *name, - __unused u_int namelen, - user_addr_t oldBuf, - size_t *oldSize, - user_addr_t newBuf, - size_t newSize, - proc_t p) -{ - int error; - size_t len; - char handler[sizeof(exec_archhandler_ppc.path)]; - vfs_context_t ctx = vfs_context_current(); - if (oldSize) { - len = strlen(exec_archhandler_ppc.path) + 1; - if (oldBuf) { - if (*oldSize < len) - return (ENOMEM); - error = copyout(exec_archhandler_ppc.path, oldBuf, len); - if (error) - return (error); - } - *oldSize = len - 1; - } - if (newBuf) { - error = suser(vfs_context_ucred(ctx), &p->p_acflag); - if (error) - return (error); - if (newSize >= sizeof(exec_archhandler_ppc.path)) - return (ENAMETOOLONG); - error = copyin(newBuf, handler, newSize); - if (error) - return (error); - handler[newSize] = 0; - strlcpy(exec_archhandler_ppc.path, handler, MAXPATHLEN); - error = set_archhandler(p, CPU_TYPE_POWERPC); - if (error) - return (error); - } - return 0; -} -/*****************************************************************************/ - -static int 
+STATIC int sysctl_handle_exec_archhandler_ppc(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req) { int error = 0; + if (req->newptr && !kauth_cred_issuser(kauth_cred_get())) + return (EPERM); + error = sysctl_handle_string(oidp, arg1, arg2, req); if (error) @@ -600,7 +722,7 @@ sysctl_handle_exec_archhandler_ppc(struct sysctl_oid *oidp, void *arg1, } -static int +STATIC int sysctl_handle_kern_threadname( __unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -657,133 +779,153 @@ sysctl_handle_kern_threadname( __unused struct sysctl_oid *oidp, __unused void * return 0; } -SYSCTL_PROC(_kern, KERN_THREADNAME, threadname, CTLFLAG_ANYBODY | CTLTYPE_STRING | CTLFLAG_RW, 0, 0, sysctl_handle_kern_threadname,"A",""); +SYSCTL_PROC(_kern, KERN_THREADNAME, threadname, CTLFLAG_ANYBODY | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_handle_kern_threadname,"A",""); SYSCTL_NODE(_kern, KERN_EXEC, exec, CTLFLAG_RD|CTLFLAG_LOCKED, 0, ""); SYSCTL_NODE(_kern_exec, OID_AUTO, archhandler, CTLFLAG_RD|CTLFLAG_LOCKED, 0, ""); SYSCTL_PROC(_kern_exec_archhandler, OID_AUTO, powerpc, - CTLTYPE_STRING | CTLFLAG_RW, exec_archhandler_ppc.path, 0, - sysctl_handle_exec_archhandler_ppc, "A", ""); + CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED, + exec_archhandler_ppc.path, + sizeof(exec_archhandler_ppc.path), + sysctl_handle_exec_archhandler_ppc, "A", ""); + +#define BSD_HOST 1 +STATIC int +sysctl_sched_stats(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + host_basic_info_data_t hinfo; + kern_return_t kret; + uint32_t size; + int changed; + mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT; + struct _processor_statistics_np *buf; + int error; -extern int get_kernel_symfile(proc_t, char **); -__private_extern__ int -sysctl_dopanicinfo(int *, u_int, user_addr_t, size_t *, user_addr_t, - size_t, proc_t); + kret = host_info((host_t)BSD_HOST, HOST_BASIC_INFO, 
(host_info_t)&hinfo, &count); + if (kret != KERN_SUCCESS) { + return EINVAL; + } -/* - * kernel related system variables. - */ -int -kern_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, - user_addr_t newp, size_t newlen, proc_t p) -{ - /* all sysctl names not listed below are terminal at this level */ - if (namelen != 1 - && !(name[0] == KERN_PROC - || name[0] == KERN_PROF - || name[0] == KERN_KDEBUG -#if !CONFIG_EMBEDDED - || name[0] == KERN_PROCARGS -#endif - || name[0] == KERN_PROCARGS2 - || name[0] == KERN_IPC - || name[0] == KERN_SYSV - || name[0] == KERN_AFFINITY - || name[0] == KERN_TRANSLATE - || name[0] == KERN_EXEC - || name[0] == KERN_PANICINFO - || name[0] == KERN_POSIX - || name[0] == KERN_TFP - || name[0] == KERN_TTY -#if CONFIG_LCTX - || name[0] == KERN_LCTX -#endif - ) - ) - return (ENOTDIR); /* overloaded */ + size = sizeof(struct _processor_statistics_np) * (hinfo.logical_cpu_max + 2); /* One for RT Queue, One for Fair Share Queue */ + + if (req->oldlen < size) { + return EINVAL; + } + + MALLOC(buf, struct _processor_statistics_np*, size, M_TEMP, M_ZERO | M_WAITOK); + + kret = get_sched_statistics(buf, &size); + if (kret != KERN_SUCCESS) { + error = EINVAL; + goto out; + } + + error = sysctl_io_opaque(req, buf, size, &changed); + if (error) { + goto out; + } + + if (changed) { + panic("Sched info changed?!"); + } +out: + FREE(buf, M_TEMP); + return error; +} + +SYSCTL_PROC(_kern, OID_AUTO, sched_stats, CTLFLAG_LOCKED, 0, 0, sysctl_sched_stats, "-", ""); + +STATIC int +sysctl_sched_stats_enable(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, __unused struct sysctl_req *req) +{ + boolean_t active; + int res; + + if (req->newlen != sizeof(active)) { + return EINVAL; + } + + res = copyin(req->newptr, &active, sizeof(active)); + if (res != 0) { + return res; + } + + return set_sched_stats_active(active); +} + +SYSCTL_PROC(_kern, OID_AUTO, sched_stats_enable, CTLFLAG_LOCKED | CTLFLAG_WR, 0, 0, 
sysctl_sched_stats_enable, "-", ""); + +extern int get_kernel_symfile(proc_t, char **); - switch (name[0]) { - case KERN_PROC: - return (sysctl_doproc(name + 1, namelen - 1, oldp, oldlenp)); -#ifdef GPROF - case KERN_PROF: - return (sysctl_doprof(name + 1, namelen - 1, oldp, oldlenp, - newp, newlen)); -#endif - case KERN_KDEBUG: - return (kdebug_ops(name + 1, namelen - 1, oldp, oldlenp, p)); -#if !CONFIG_EMBEDDED - case KERN_PROCARGS: - /* new one as it does not use kinfo_proc */ - return (sysctl_procargs(name + 1, namelen - 1, oldp, oldlenp, p)); -#endif - case KERN_PROCARGS2: - /* new one as it does not use kinfo_proc */ - return (sysctl_procargs2(name + 1, namelen - 1, oldp, oldlenp, p)); -#if PANIC_INFO - case KERN_PANICINFO: - return(sysctl_dopanicinfo(name + 1, namelen - 1, oldp, oldlenp, - newp, newlen, p)); -#endif - case KERN_AFFINITY: - return sysctl_affinity(name+1, namelen-1, oldp, oldlenp, - newp, newlen, p); - case KERN_TRANSLATE: - return sysctl_translate(name+1, namelen-1, oldp, oldlenp, newp, - newlen, p); - - /* XXX remove once Rosetta has rev'ed */ - case KERN_EXEC: - return sysctl_exec_archhandler_ppc(name+1, namelen-1, oldp, - oldlenp, newp, newlen, p); #if COUNT_SYSCALLS - case KERN_COUNT_SYSCALLS: - { - /* valid values passed in: - * = 0 means don't keep called counts for each bsd syscall - * > 0 means keep called counts for each bsd syscall - * = 2 means dump current counts to the system log - * = 3 means reset all counts - * for example, to dump current counts: - * sysctl -w kern.count_calls=2 - */ - error = sysctl_int(oldp, oldlenp, newp, newlen, &tmp); - if ( error != 0 ) { - return (error); - } - - if ( tmp == 1 ) { - do_count_syscalls = 1; - } - else if ( tmp == 0 || tmp == 2 || tmp == 3 ) { - extern int nsysent; - extern int syscalls_log[]; - extern const char * syscallnames[]; - int i; - for ( i = 0; i < nsysent; i++ ) { - if ( syscalls_log[i] != 0 ) { - if ( tmp == 2 ) { - printf("%d calls - name %s \n", syscalls_log[i], 
syscallnames[i]); - } - else { - syscalls_log[i] = 0; - } +#define KERN_COUNT_SYSCALLS (KERN_OSTYPE + 1000) + +extern int nsysent; +extern int syscalls_log[]; +extern const char *syscallnames[]; + +STATIC int +sysctl_docountsyscalls SYSCTL_HANDLER_ARGS +{ + __unused int cmd = oidp->oid_arg2; /* subcommand*/ + __unused int *name = arg1; /* oid element argument vector */ + __unused int namelen = arg2; /* number of oid element arguments */ + user_addr_t oldp = req->oldptr; /* user buffer copy out address */ + size_t *oldlenp = &req->oldlen; /* user buffer copy out size */ + user_addr_t newp = req->newptr; /* user buffer copy in address */ + size_t newlen = req->newlen; /* user buffer copy in size */ + int error; + + int tmp; + + /* valid values passed in: + * = 0 means don't keep called counts for each bsd syscall + * > 0 means keep called counts for each bsd syscall + * = 2 means dump current counts to the system log + * = 3 means reset all counts + * for example, to dump current counts: + * sysctl -w kern.count_calls=2 + */ + error = sysctl_int(oldp, oldlenp, newp, newlen, &tmp); + if ( error != 0 ) { + return (error); + } + + if ( tmp == 1 ) { + do_count_syscalls = 1; + } + else if ( tmp == 0 || tmp == 2 || tmp == 3 ) { + int i; + for ( i = 0; i < nsysent; i++ ) { + if ( syscalls_log[i] != 0 ) { + if ( tmp == 2 ) { + printf("%d calls - name %s \n", syscalls_log[i], syscallnames[i]); + } + else { + syscalls_log[i] = 0; } - } - if ( tmp != 0 ) { - do_count_syscalls = 1; } } - return (0); - } -#endif - default: - return (ENOTSUP); + if ( tmp != 0 ) { + do_count_syscalls = 1; + } } - /* NOTREACHED */ + + /* adjust index so we return the right required/consumed amount */ + if (!error) + req->oldidx += req->oldlen; + + return (error); } +SYSCTL_PROC(_kern, KERN_COUNT_SYSCALLS, count_syscalls, CTLTYPE_NODE|CTLFLAG_RD | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + 0, /* Integer argument (arg2) */ + sysctl_docountsyscalls, /* Handler function */ + NULL, /* Data 
pointer */ + ""); +#endif /* COUNT_SYSCALLS */ #if DEBUG /* @@ -797,36 +939,68 @@ struct ctldebug debug2, debug3, debug4; struct ctldebug debug5, debug6, debug7, debug8, debug9; struct ctldebug debug10, debug11, debug12, debug13, debug14; struct ctldebug debug15, debug16, debug17, debug18, debug19; -static struct ctldebug *debugvars[CTL_DEBUG_MAXID] = { +STATIC struct ctldebug *debugvars[CTL_DEBUG_MAXID] = { &debug0, &debug1, &debug2, &debug3, &debug4, &debug5, &debug6, &debug7, &debug8, &debug9, &debug10, &debug11, &debug12, &debug13, &debug14, &debug15, &debug16, &debug17, &debug18, &debug19, }; -int -debug_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, - user_addr_t newp, size_t newlen, __unused proc_t p) -{ +STATIC int +sysctl_dodebug SYSCTL_HANDLER_ARGS +{ + int cmd = oidp->oid_arg2; /* subcommand*/ + int *name = arg1; /* oid element argument vector */ + int namelen = arg2; /* number of oid element arguments */ + user_addr_t oldp = req->oldptr; /* user buffer copy out address */ + size_t *oldlenp = &req->oldlen; /* user buffer copy out size */ + user_addr_t newp = req->newptr; /* user buffer copy in address */ + size_t newlen = req->newlen; /* user buffer copy in size */ + int error; + struct ctldebug *cdp; /* all sysctl names at this level are name and field */ - if (namelen != 2) + if (namelen != 1) return (ENOTSUP); /* overloaded */ - if (name[0] < 0 || name[0] >= CTL_DEBUG_MAXID) + if (cmd < 0 || cmd >= CTL_DEBUG_MAXID) return (ENOTSUP); - cdp = debugvars[name[0]]; + cdp = debugvars[cmd]; if (cdp->debugname == 0) return (ENOTSUP); - switch (name[1]) { + switch (name[0]) { case CTL_DEBUG_NAME: - return (sysctl_rdstring(oldp, oldlenp, newp, cdp->debugname)); + error = sysctl_rdstring(oldp, oldlenp, newp, cdp->debugname); + break; case CTL_DEBUG_VALUE: - return (sysctl_int(oldp, oldlenp, newp, newlen, cdp->debugvar)); + error = sysctl_int(oldp, oldlenp, newp, newlen, cdp->debugvar); + break; default: - return (ENOTSUP); + error = ENOTSUP; 
+ break; } - /* NOTREACHED */ + + /* adjust index so we return the right required/consumed amount */ + if (!error) + req->oldidx += req->oldlen; + + return (error); } +/* + * XXX We mark this RW instead of RD to let sysctl_rdstring() return the + * XXX historical error. + */ +SYSCTL_PROC(_debug, CTL_DEBUG_NAME, name, CTLTYPE_NODE|CTLFLAG_RW | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + CTL_DEBUG_NAME, /* Integer argument (arg2) */ + sysctl_dodebug, /* Handler function */ + NULL, /* Data pointer */ + "Debugging"); +SYSCTL_PROC(_debug, CTL_DEBUG_VALUE, value, CTLTYPE_NODE|CTLFLAG_RW | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + CTL_DEBUG_VALUE, /* Integer argument (arg2) */ + sysctl_dodebug, /* Handler function */ + NULL, /* Data pointer */ + "Debugging"); #endif /* DEBUG */ /* @@ -1073,7 +1247,7 @@ sysctl_rdstruct(user_addr_t oldp, size_t *oldlenp, /* * Get file structures. */ -static int +STATIC int sysctl_file (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -1118,10 +1292,10 @@ sysctl_file } SYSCTL_PROC(_kern, KERN_FILE, file, - CTLTYPE_STRUCT | CTLFLAG_RW, + CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_file, "S,filehead", ""); -static int +STATIC int sysdoproc_filt_KERN_PROC_PID(proc_t p, void * arg) { if (p->p_pid != (pid_t)*(int*)arg) @@ -1130,7 +1304,7 @@ sysdoproc_filt_KERN_PROC_PID(proc_t p, void * arg) return(1); } -static int +STATIC int sysdoproc_filt_KERN_PROC_PGRP(proc_t p, void * arg) { if (p->p_pgrpid != (pid_t)*(int*)arg) @@ -1139,7 +1313,7 @@ sysdoproc_filt_KERN_PROC_PGRP(proc_t p, void * arg) return(1); } -static int +STATIC int sysdoproc_filt_KERN_PROC_TTY(proc_t p, void * arg) { boolean_t funnel_state; @@ -1162,7 +1336,7 @@ sysdoproc_filt_KERN_PROC_TTY(proc_t p, void * arg) return(retval); } -static int +STATIC int sysdoproc_filt_KERN_PROC_UID(proc_t p, void * arg) { kauth_cred_t my_cred; @@ -1181,7 +1355,7 @@ sysdoproc_filt_KERN_PROC_UID(proc_t p, void * arg) 
} -static int +STATIC int sysdoproc_filt_KERN_PROC_RUID(proc_t p, void * arg) { kauth_cred_t my_cred; @@ -1190,7 +1364,7 @@ sysdoproc_filt_KERN_PROC_RUID(proc_t p, void * arg) if (p->p_ucred == NULL) return(0); my_cred = kauth_cred_proc_ref(p); - ruid = my_cred->cr_ruid; + ruid = kauth_cred_getruid(my_cred); kauth_cred_unref(&my_cred); if (ruid != (uid_t)*(int*)arg) @@ -1200,7 +1374,7 @@ sysdoproc_filt_KERN_PROC_RUID(proc_t p, void * arg) } #if CONFIG_LCTX -static int +STATIC int sysdoproc_filt_KERN_PROC_LCID(proc_t p, void * arg) { if ((p->p_lctx == NULL) || @@ -1263,12 +1437,18 @@ sysdoproc_callback(proc_t p, void * arg) return(PROC_RETURNED); } -int -sysctl_doproc(int *name, u_int namelen, user_addr_t where, size_t *sizep) +SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD | CTLFLAG_LOCKED, 0, ""); +STATIC int +sysctl_prochandle SYSCTL_HANDLER_ARGS { + int cmd = oidp->oid_arg2; /* subcommand for multiple nodes */ + int *name = arg1; /* oid element argument vector */ + int namelen = arg2; /* number of oid element arguments */ + user_addr_t where = req->oldptr;/* user buffer copy out address */ + user_addr_t dp = where; size_t needed = 0; - int buflen = where != USER_ADDR_NULL ? *sizep : 0; + int buflen = where != USER_ADDR_NULL ? 
req->oldlen : 0; int error = 0; boolean_t is_64_bit = FALSE; struct user32_kinfo_proc user32_kproc; @@ -1281,8 +1461,9 @@ sysctl_doproc(int *name, u_int namelen, user_addr_t where, size_t *sizep) int ruidcheck = 0; int ttycheck = 0; - if (namelen != 2 && !(namelen == 1 && name[0] == KERN_PROC_ALL)) + if (namelen != 1 && !(namelen == 0 && cmd == KERN_PROC_ALL)) return (EINVAL); + is_64_bit = proc_is64bit(current_proc()); if (is_64_bit) { sizeof_kproc = sizeof(user_kproc); @@ -1294,7 +1475,7 @@ sysctl_doproc(int *name, u_int namelen, user_addr_t where, size_t *sizep) } - switch (name[0]) { + switch (cmd) { case KERN_PROC_PID: filterfn = sysdoproc_filt_KERN_PROC_PID; @@ -1321,6 +1502,12 @@ sysctl_doproc(int *name, u_int namelen, user_addr_t where, size_t *sizep) filterfn = sysdoproc_filt_KERN_PROC_LCID; break; #endif + case KERN_PROC_ALL: + break; + + default: + /* must be kern.proc. */ + return (ENOTSUP); } error = 0; @@ -1334,9 +1521,10 @@ sysctl_doproc(int *name, u_int namelen, user_addr_t where, size_t *sizep) args.ruidcheck = ruidcheck; args.ttycheck = ttycheck; args.sizeof_kproc = sizeof_kproc; - args.uidval = name[1]; + if (namelen) + args.uidval = name[0]; - proc_iterate((PROC_ALLPROCLIST | PROC_ZOMBPROCLIST), sysdoproc_callback, &args, filterfn, &name[1]); + proc_iterate((PROC_ALLPROCLIST | PROC_ZOMBPROCLIST), sysdoproc_callback, &args, filterfn, name); if (error) return(error); @@ -1345,20 +1533,87 @@ sysctl_doproc(int *name, u_int namelen, user_addr_t where, size_t *sizep) needed = args.needed; if (where != USER_ADDR_NULL) { - *sizep = dp - where; - if (needed > *sizep) + req->oldlen = dp - where; + if (needed > req->oldlen) return (ENOMEM); } else { needed += KERN_PROCSLOP; - *sizep = needed; + req->oldlen = needed; } + /* adjust index so we return the right required/consumed amount */ + req->oldidx += req->oldlen; return (0); } +/* + * We specify the subcommand code for multiple nodes as the 'req->arg2' value + * in the sysctl declaration itself, which 
comes into the handler function + * as 'oidp->oid_arg2'. + * + * For these particular sysctls, since they have well known OIDs, we could + * have just obtained it from the '((int *)arg1)[0]' parameter, but that would + * not demonstrate how to handle multiple sysctls that used OID_AUTO instead + * of a well known value with a common handler function. This is desirable, + * because we want well known values to "go away" at some future date. + * + * It should be noted that the value of '((int *)arg1)[1]' is used as + * an integer parameter to the subcommand for many of these sysctls; we'd + * rather have used '((int *)arg1)[0]' for that, or even better, an element + * in a structure passed in as the 'newp' argument to sysctlbyname(3), + * and then use leaf-node permissions enforcement, but that would have + * necessitated modifying user space code to correspond to the interface + * change, and we are striving for binary backward compatibility here; even + * though these are SPI, and not intended for use by user space applications + * which are not themselves system tools or libraries, some applications + * have erroneously used them. 
+ */ +SYSCTL_PROC(_kern_proc, KERN_PROC_ALL, all, CTLTYPE_NODE|CTLFLAG_RD | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + KERN_PROC_ALL, /* Integer argument (arg2) */ + sysctl_prochandle, /* Handler function */ + NULL, /* Data is size variant on ILP32/LP64 */ + ""); +SYSCTL_PROC(_kern_proc, KERN_PROC_PID, pid, CTLTYPE_NODE|CTLFLAG_RD | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + KERN_PROC_PID, /* Integer argument (arg2) */ + sysctl_prochandle, /* Handler function */ + NULL, /* Data is size variant on ILP32/LP64 */ + ""); +SYSCTL_PROC(_kern_proc, KERN_PROC_TTY, tty, CTLTYPE_NODE|CTLFLAG_RD | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + KERN_PROC_TTY, /* Integer argument (arg2) */ + sysctl_prochandle, /* Handler function */ + NULL, /* Data is size variant on ILP32/LP64 */ + ""); +SYSCTL_PROC(_kern_proc, KERN_PROC_PGRP, pgrp, CTLTYPE_NODE|CTLFLAG_RD | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + KERN_PROC_PGRP, /* Integer argument (arg2) */ + sysctl_prochandle, /* Handler function */ + NULL, /* Data is size variant on ILP32/LP64 */ + ""); +SYSCTL_PROC(_kern_proc, KERN_PROC_UID, uid, CTLTYPE_NODE|CTLFLAG_RD | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + KERN_PROC_UID, /* Integer argument (arg2) */ + sysctl_prochandle, /* Handler function */ + NULL, /* Data is size variant on ILP32/LP64 */ + ""); +SYSCTL_PROC(_kern_proc, KERN_PROC_RUID, ruid, CTLTYPE_NODE|CTLFLAG_RD | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + KERN_PROC_RUID, /* Integer argument (arg2) */ + sysctl_prochandle, /* Handler function */ + NULL, /* Data is size variant on ILP32/LP64 */ + ""); +SYSCTL_PROC(_kern_proc, KERN_PROC_LCID, lcid, CTLTYPE_NODE|CTLFLAG_RD | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + KERN_PROC_LCID, /* Integer argument (arg2) */ + sysctl_prochandle, /* Handler function */ + NULL, /* Data is size variant on ILP32/LP64 */ + ""); + /* * Fill in an eproc structure for the specified process. 
*/ -static void +STATIC void fill_user32_eproc(proc_t p, struct user32_eproc *ep) { struct tty *tp; @@ -1396,15 +1651,15 @@ fill_user32_eproc(proc_t p, struct user32_eproc *ep) my_cred = kauth_cred_proc_ref(p); /* A fake historical pcred */ - ep->e_pcred.p_ruid = my_cred->cr_ruid; - ep->e_pcred.p_svuid = my_cred->cr_svuid; - ep->e_pcred.p_rgid = my_cred->cr_rgid; - ep->e_pcred.p_svgid = my_cred->cr_svgid; + ep->e_pcred.p_ruid = kauth_cred_getruid(my_cred); + ep->e_pcred.p_svuid = kauth_cred_getsvuid(my_cred); + ep->e_pcred.p_rgid = kauth_cred_getrgid(my_cred); + ep->e_pcred.p_svgid = kauth_cred_getsvgid(my_cred); /* A fake historical *kauth_cred_t */ ep->e_ucred.cr_ref = my_cred->cr_ref; ep->e_ucred.cr_uid = kauth_cred_getuid(my_cred); - ep->e_ucred.cr_ngroups = my_cred->cr_ngroups; - bcopy(my_cred->cr_groups, ep->e_ucred.cr_groups, NGROUPS*sizeof(gid_t)); + ep->e_ucred.cr_ngroups = posix_cred_get(my_cred)->cr_ngroups; + bcopy(posix_cred_get(my_cred)->cr_groups, ep->e_ucred.cr_groups, NGROUPS*sizeof(gid_t)); kauth_cred_unref(&my_cred); } @@ -1437,7 +1692,7 @@ fill_user32_eproc(proc_t p, struct user32_eproc *ep) /* * Fill in an LP64 version of eproc structure for the specified process. 
*/ -static void +STATIC void fill_user64_eproc(proc_t p, struct user64_eproc *ep) { struct tty *tp; @@ -1476,16 +1731,16 @@ fill_user64_eproc(proc_t p, struct user64_eproc *ep) my_cred = kauth_cred_proc_ref(p); /* A fake historical pcred */ - ep->e_pcred.p_ruid = my_cred->cr_ruid; - ep->e_pcred.p_svuid = my_cred->cr_svuid; - ep->e_pcred.p_rgid = my_cred->cr_rgid; - ep->e_pcred.p_svgid = my_cred->cr_svgid; + ep->e_pcred.p_ruid = kauth_cred_getruid(my_cred); + ep->e_pcred.p_svuid = kauth_cred_getsvuid(my_cred); + ep->e_pcred.p_rgid = kauth_cred_getrgid(my_cred); + ep->e_pcred.p_svgid = kauth_cred_getsvgid(my_cred); /* A fake historical *kauth_cred_t */ ep->e_ucred.cr_ref = my_cred->cr_ref; ep->e_ucred.cr_uid = kauth_cred_getuid(my_cred); - ep->e_ucred.cr_ngroups = my_cred->cr_ngroups; - bcopy(my_cred->cr_groups, ep->e_ucred.cr_groups, NGROUPS*sizeof(gid_t)); + ep->e_ucred.cr_ngroups = posix_cred_get(my_cred)->cr_ngroups; + bcopy(posix_cred_get(my_cred)->cr_groups, ep->e_ucred.cr_groups, NGROUPS*sizeof(gid_t)); kauth_cred_unref(&my_cred); } @@ -1518,7 +1773,7 @@ fill_user64_eproc(proc_t p, struct user64_eproc *ep) /* * Fill in an eproc structure for the specified process. */ -static void +STATIC void fill_user32_externproc(proc_t p, struct user32_extern_proc *exp) { exp->p_forw = exp->p_back = 0; @@ -1583,7 +1838,7 @@ fill_user32_externproc(proc_t p, struct user32_extern_proc *exp) /* * Fill in an LP64 version of extern_proc structure for the specified process. 
*/ -static void +STATIC void fill_user64_externproc(proc_t p, struct user64_extern_proc *exp) { exp->p_forw = exp->p_back = USER_ADDR_NULL; @@ -1649,7 +1904,7 @@ fill_user64_externproc(proc_t p, struct user64_extern_proc *exp) exp->p_ru = CAST_USER_ADDR_T(p->p_ru); /* XXX may be NULL */ } -static void +STATIC void fill_user32_proc(proc_t p, struct user32_kinfo_proc *kp) { /* on a 64 bit kernel, 32 bit users will get some truncated information */ @@ -1657,23 +1912,31 @@ fill_user32_proc(proc_t p, struct user32_kinfo_proc *kp) fill_user32_eproc(p, &kp->kp_eproc); } -static void +STATIC void fill_user64_proc(proc_t p, struct user64_kinfo_proc *kp) { fill_user64_externproc(p, &kp->kp_proc); fill_user64_eproc(p, &kp->kp_eproc); } -int -kdebug_ops(int *name, u_int namelen, user_addr_t where, - size_t *sizep, proc_t p) +STATIC int +sysctl_kdebug_ops SYSCTL_HANDLER_ARGS { + __unused int cmd = oidp->oid_arg2; /* subcommand*/ + int *name = arg1; /* oid element argument vector */ + int namelen = arg2; /* number of oid element arguments */ + user_addr_t oldp = req->oldptr; /* user buffer copy out address */ + size_t *oldlenp = &req->oldlen; /* user buffer copy out size */ +// user_addr_t newp = req->newptr; /* user buffer copy in address */ +// size_t newlen = req->newlen; /* user buffer copy in size */ + + proc_t p = current_proc(); int ret=0; if (namelen == 0) return(ENOTSUP); - ret = suser(kauth_cred_get(), &p->p_acflag); + ret = suser(kauth_cred_get(), &p->p_acflag); if (ret) return(ret); @@ -1687,41 +1950,96 @@ kdebug_ops(int *name, u_int namelen, user_addr_t where, case KERN_KDSETREG: case KERN_KDGETREG: case KERN_KDREADTR: + case KERN_KDWRITETR: + case KERN_KDWRITEMAP: case KERN_KDPIDTR: case KERN_KDTHRMAP: case KERN_KDPIDEX: case KERN_KDSETRTCDEC: case KERN_KDSETBUF: case KERN_KDGETENTROPY: - ret = kdbg_control(name, namelen, where, sizep); + ret = kdbg_control(name, namelen, oldp, oldlenp); break; default: ret= ENOTSUP; break; } - return(ret); + + /* adjust index so 
we return the right required/consumed amount */ + if (!ret) + req->oldidx += req->oldlen; + + return (ret); } +SYSCTL_PROC(_kern, KERN_KDEBUG, kdebug, CTLTYPE_NODE|CTLFLAG_RD | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + 0, /* Integer argument (arg2) */ + sysctl_kdebug_ops, /* Handler function */ + NULL, /* Data pointer */ + ""); +#if !CONFIG_EMBEDDED /* * Return the top *sizep bytes of the user stack, or the entire area of the * user stack down through the saved exec_path, whichever is smaller. */ -int -sysctl_procargs(int *name, u_int namelen, user_addr_t where, - size_t *sizep, proc_t cur_proc) -{ - return sysctl_procargsx( name, namelen, where, sizep, cur_proc, 0); +STATIC int +sysctl_doprocargs SYSCTL_HANDLER_ARGS +{ + __unused int cmd = oidp->oid_arg2; /* subcommand*/ + int *name = arg1; /* oid element argument vector */ + int namelen = arg2; /* number of oid element arguments */ + user_addr_t oldp = req->oldptr; /* user buffer copy out address */ + size_t *oldlenp = &req->oldlen; /* user buffer copy out size */ +// user_addr_t newp = req->newptr; /* user buffer copy in address */ +// size_t newlen = req->newlen; /* user buffer copy in size */ + int error; + + error = sysctl_procargsx( name, namelen, oldp, oldlenp, current_proc(), 0); + + /* adjust index so we return the right required/consumed amount */ + if (!error) + req->oldidx += req->oldlen; + + return (error); } +SYSCTL_PROC(_kern, KERN_PROCARGS, procargs, CTLTYPE_NODE|CTLFLAG_RD | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + 0, /* Integer argument (arg2) */ + sysctl_doprocargs, /* Handler function */ + NULL, /* Data pointer */ + ""); +#endif /* !CONFIG_EMBEDDED */ + +STATIC int +sysctl_doprocargs2 SYSCTL_HANDLER_ARGS +{ + __unused int cmd = oidp->oid_arg2; /* subcommand*/ + int *name = arg1; /* oid element argument vector */ + int namelen = arg2; /* number of oid element arguments */ + user_addr_t oldp = req->oldptr; /* user buffer copy out address */ + size_t *oldlenp = &req->oldlen; 
/* user buffer copy out size */ +// user_addr_t newp = req->newptr; /* user buffer copy in address */ +// size_t newlen = req->newlen; /* user buffer copy in size */ + int error; -static int -sysctl_procargs2(int *name, u_int namelen, user_addr_t where, - size_t *sizep, proc_t cur_proc) -{ - return sysctl_procargsx( name, namelen, where, sizep, cur_proc, 1); + error = sysctl_procargsx( name, namelen, oldp, oldlenp, current_proc(), 1); + + /* adjust index so we return the right required/consumed amount */ + if (!error) + req->oldidx += req->oldlen; + + return (error); } +SYSCTL_PROC(_kern, KERN_PROCARGS2, procargs2, CTLTYPE_NODE|CTLFLAG_RD | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + 0, /* Integer argument (arg2) */ + sysctl_doprocargs2, /* Handler function */ + NULL, /* Data pointer */ + ""); -static int +STATIC int sysctl_procargsx(int *name, u_int namelen, user_addr_t where, size_t *sizep, proc_t cur_proc, int argc_yes) { @@ -1977,7 +2295,7 @@ sysctl_procargsx(int *name, u_int namelen, user_addr_t where, /* * Max number of concurrent aio requests */ -static int +STATIC int sysctl_aiomax (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -1997,7 +2315,7 @@ sysctl_aiomax /* * Max number of concurrent aio requests per process */ -static int +STATIC int sysctl_aioprocmax (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2017,7 +2335,7 @@ sysctl_aioprocmax /* * Max number of async IO worker threads */ -static int +STATIC int sysctl_aiothreads (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2039,7 +2357,7 @@ sysctl_aiothreads /* * System-wide limit on the max number of processes */ -static int +STATIC int sysctl_maxproc (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2058,26 +2376,30 @@ sysctl_maxproc } SYSCTL_STRING(_kern, KERN_OSTYPE, 
ostype, - CTLFLAG_RD | CTLFLAG_KERN, + CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, ostype, 0, ""); SYSCTL_STRING(_kern, KERN_OSRELEASE, osrelease, - CTLFLAG_RD | CTLFLAG_KERN, + CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, osrelease, 0, ""); SYSCTL_INT(_kern, KERN_OSREV, osrevision, - CTLFLAG_RD | CTLFLAG_KERN, + CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (int *)NULL, BSD, ""); SYSCTL_STRING(_kern, KERN_VERSION, version, - CTLFLAG_RD | CTLFLAG_KERN, + CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, version, 0, ""); +SYSCTL_STRING(_kern, OID_AUTO, uuid, + CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, + &kernel_uuid[0], 0, ""); #if DEBUG int debug_kprint_syscall = 0; char debug_kprint_syscall_process[MAXCOMLEN+1]; +/* Thread safe: bits and string value are not used to reclaim state */ SYSCTL_INT (_debug, OID_AUTO, kprint_syscall, - CTLFLAG_RW, &debug_kprint_syscall, 0, "kprintf syscall tracing"); + CTLFLAG_RW | CTLFLAG_LOCKED, &debug_kprint_syscall, 0, "kprintf syscall tracing"); SYSCTL_STRING(_debug, OID_AUTO, kprint_syscall_process, - CTLFLAG_RW, debug_kprint_syscall_process, sizeof(debug_kprint_syscall_process), + CTLFLAG_RW | CTLFLAG_LOCKED, debug_kprint_syscall_process, sizeof(debug_kprint_syscall_process), "name of process for kprintf syscall tracing"); int debug_kprint_current_process(const char **namep) @@ -2113,7 +2435,7 @@ int debug_kprint_current_process(const char **namep) /* PR-5293665: need to use a callback function for kern.osversion to set * osversion in IORegistry */ -static int +STATIC int sysctl_osversion(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req) { int rval = 0; @@ -2128,11 +2450,11 @@ sysctl_osversion(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct } SYSCTL_PROC(_kern, KERN_OSVERSION, osversion, - CTLFLAG_RW | CTLFLAG_KERN | CTLTYPE_STRING, + CTLFLAG_RW | CTLFLAG_KERN | CTLTYPE_STRING | CTLFLAG_LOCKED, osversion, 256 /* OSVERSIZE*/, sysctl_osversion, "A", ""); -static int +STATIC int 
sysctl_sysctl_bootargs (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2150,46 +2472,46 @@ SYSCTL_PROC(_kern, OID_AUTO, bootargs, sysctl_sysctl_bootargs, "A", "bootargs"); SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, - CTLFLAG_RW | CTLFLAG_KERN, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, &maxfiles, 0, ""); SYSCTL_INT(_kern, KERN_ARGMAX, argmax, - CTLFLAG_RD | CTLFLAG_KERN, + CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (int *)NULL, ARG_MAX, ""); SYSCTL_INT(_kern, KERN_POSIX1, posix1version, - CTLFLAG_RD | CTLFLAG_KERN, + CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (int *)NULL, _POSIX_VERSION, ""); SYSCTL_INT(_kern, KERN_NGROUPS, ngroups, - CTLFLAG_RD | CTLFLAG_KERN, + CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (int *)NULL, NGROUPS_MAX, ""); SYSCTL_INT(_kern, KERN_JOB_CONTROL, job_control, - CTLFLAG_RD | CTLFLAG_KERN, + CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (int *)NULL, 1, ""); #if 1 /* _POSIX_SAVED_IDS from */ SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, - CTLFLAG_RD | CTLFLAG_KERN, + CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (int *)NULL, 1, ""); #else SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, - CTLFLAG_RD | CTLFLAG_KERN, + CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, NULL, 0, ""); #endif SYSCTL_INT(_kern, OID_AUTO, num_files, - CTLFLAG_RD, + CTLFLAG_RD | CTLFLAG_LOCKED, &nfiles, 0, ""); SYSCTL_COMPAT_INT(_kern, OID_AUTO, num_vnodes, - CTLFLAG_RD, + CTLFLAG_RD | CTLFLAG_LOCKED, &numvnodes, 0, ""); SYSCTL_INT(_kern, OID_AUTO, num_tasks, - CTLFLAG_RD, + CTLFLAG_RD | CTLFLAG_LOCKED, &task_max, 0, ""); SYSCTL_INT(_kern, OID_AUTO, num_threads, - CTLFLAG_RD, + CTLFLAG_RD | CTLFLAG_LOCKED, &thread_max, 0, ""); SYSCTL_INT(_kern, OID_AUTO, num_taskthreads, - CTLFLAG_RD, + CTLFLAG_RD | CTLFLAG_LOCKED, &task_threadmax, 0, ""); -static int +STATIC int sysctl_maxvnodes (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { int oldval = desiredvnodes; @@ -2203,27 
+2525,31 @@ sysctl_maxvnodes (__unused struct sysctl_oid *oidp, __unused void *arg1, __unuse return(error); } +SYSCTL_INT(_kern, OID_AUTO, namecache_disabled, + CTLFLAG_RW | CTLFLAG_LOCKED, + &nc_disabled, 0, ""); + SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, - CTLTYPE_INT | CTLFLAG_RW, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_maxvnodes, "I", ""); SYSCTL_PROC(_kern, KERN_MAXPROC, maxproc, - CTLTYPE_INT | CTLFLAG_RW, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_maxproc, "I", ""); SYSCTL_PROC(_kern, KERN_AIOMAX, aiomax, - CTLTYPE_INT | CTLFLAG_RW, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_aiomax, "I", ""); SYSCTL_PROC(_kern, KERN_AIOPROCMAX, aioprocmax, - CTLTYPE_INT | CTLFLAG_RW, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_aioprocmax, "I", ""); SYSCTL_PROC(_kern, KERN_AIOTHREADS, aiothreads, - CTLTYPE_INT | CTLFLAG_RW, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_aiothreads, "I", ""); -static int +STATIC int sysctl_securelvl (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2242,11 +2568,11 @@ sysctl_securelvl } SYSCTL_PROC(_kern, KERN_SECURELVL, securelevel, - CTLTYPE_INT | CTLFLAG_RW, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_securelvl, "I", ""); -static int +STATIC int sysctl_domainname (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2259,14 +2585,14 @@ sysctl_domainname } SYSCTL_PROC(_kern, KERN_DOMAINNAME, nisdomainname, - CTLTYPE_STRING | CTLFLAG_RW, + CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_domainname, "A", ""); SYSCTL_COMPAT_INT(_kern, KERN_HOSTID, hostid, - CTLFLAG_RW | CTLFLAG_KERN, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, &hostid, 0, ""); -static int +STATIC int sysctl_hostname (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2280,10 +2606,10 @@ sysctl_hostname 
SYSCTL_PROC(_kern, KERN_HOSTNAME, hostname, - CTLTYPE_STRING | CTLFLAG_RW, + CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_hostname, "A", ""); -static int +STATIC int sysctl_procname (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2293,26 +2619,59 @@ sysctl_procname } SYSCTL_PROC(_kern, KERN_PROCNAME, procname, - CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_ANYBODY, + CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, 0, 0, sysctl_procname, "A", ""); SYSCTL_INT(_kern, KERN_SPECULATIVE_READS, speculative_reads_disabled, - CTLFLAG_RW | CTLFLAG_KERN, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, &speculative_reads_disabled, 0, ""); +SYSCTL_INT(_kern, OID_AUTO, ignore_is_ssd, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, + &ignore_is_ssd, 0, ""); + SYSCTL_UINT(_kern, OID_AUTO, preheat_pages_max, - CTLFLAG_RW | CTLFLAG_KERN, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, &preheat_pages_max, 0, ""); SYSCTL_UINT(_kern, OID_AUTO, preheat_pages_min, - CTLFLAG_RW | CTLFLAG_KERN, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, &preheat_pages_min, 0, ""); -SYSCTL_UINT(_kern, OID_AUTO, preheat_pages_mult, - CTLFLAG_RW | CTLFLAG_KERN, - &preheat_pages_mult, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, speculative_prefetch_max, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, + &speculative_prefetch_max, 0, ""); -static int +SYSCTL_UINT(_kern, OID_AUTO, vm_page_free_target, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, + &vm_page_free_target, 0, ""); + +SYSCTL_UINT(_kern, OID_AUTO, vm_page_free_min, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, + &vm_page_free_min, 0, ""); + +SYSCTL_UINT(_kern, OID_AUTO, vm_page_free_reserved, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, + &vm_page_free_reserved, 0, ""); + +SYSCTL_UINT(_kern, OID_AUTO, vm_page_speculative_percentage, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, + &vm_page_speculative_percentage, 0, ""); + +SYSCTL_UINT(_kern, OID_AUTO, vm_page_speculative_q_age_ms, + 
CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, + &vm_page_speculative_q_age_ms, 0, ""); + +SYSCTL_UINT(_kern, OID_AUTO, vm_max_delayed_work_limit, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, + &vm_max_delayed_work_limit, 0, ""); + +SYSCTL_UINT(_kern, OID_AUTO, vm_max_batch, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, + &vm_max_batch, 0, ""); + + +STATIC int sysctl_boottime (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2333,10 +2692,10 @@ sysctl_boottime } SYSCTL_PROC(_kern, KERN_BOOTTIME, boottime, - CTLTYPE_STRUCT | CTLFLAG_RD, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_boottime, "S,timeval", ""); -static int +STATIC int sysctl_symfile (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2349,11 +2708,11 @@ sysctl_symfile SYSCTL_PROC(_kern, KERN_SYMFILE, symfile, - CTLTYPE_STRING | CTLFLAG_RD, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_symfile, "A", ""); #if NFSCLIENT -static int +STATIC int sysctl_netboot (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2361,12 +2720,15 @@ sysctl_netboot } SYSCTL_PROC(_kern, KERN_NETBOOT, netboot, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_netboot, "I", ""); #endif #ifdef CONFIG_IMGSRC_ACCESS -static int +/* + * Legacy--act as if only one layer of nesting is possible. 
+ */ +STATIC int sysctl_imgsrcdev (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2378,16 +2740,16 @@ sysctl_imgsrcdev return EPERM; } - if (imgsrc_rootvnode == NULL) { + if (imgsrc_rootvnodes[0] == NULL) { return ENOENT; } - result = vnode_getwithref(imgsrc_rootvnode); + result = vnode_getwithref(imgsrc_rootvnodes[0]); if (result != 0) { return result; } - devvp = vnode_mount(imgsrc_rootvnode)->mnt_devvp; + devvp = vnode_mount(imgsrc_rootvnodes[0])->mnt_devvp; result = vnode_getwithref(devvp); if (result != 0) { goto out; @@ -2397,16 +2759,82 @@ sysctl_imgsrcdev vnode_put(devvp); out: - vnode_put(imgsrc_rootvnode); + vnode_put(imgsrc_rootvnodes[0]); return result; } SYSCTL_PROC(_kern, OID_AUTO, imgsrcdev, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_imgsrcdev, "I", ""); + +STATIC int +sysctl_imgsrcinfo +(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + int error; + struct imgsrc_info info[MAX_IMAGEBOOT_NESTING]; /* 2 for now, no problem */ + uint32_t i; + vnode_t rvp, devvp; + + if (imgsrc_rootvnodes[0] == NULLVP) { + return ENXIO; + } + + for (i = 0; i < MAX_IMAGEBOOT_NESTING; i++) { + /* + * Go get the root vnode. + */ + rvp = imgsrc_rootvnodes[i]; + if (rvp == NULLVP) { + break; + } + + error = vnode_get(rvp); + if (error != 0) { + return error; + } + + /* + * For now, no getting at a non-local volume. + */ + devvp = vnode_mount(rvp)->mnt_devvp; + if (devvp == NULL) { + vnode_put(rvp); + return EINVAL; + } + + error = vnode_getwithref(devvp); + if (error != 0) { + vnode_put(rvp); + return error; + } + + /* + * Fill in info. 
+ */ + info[i].ii_dev = vnode_specrdev(devvp); + info[i].ii_flags = 0; + info[i].ii_height = i; + bzero(info[i].ii_reserved, sizeof(info[i].ii_reserved)); + + vnode_put(devvp); + vnode_put(rvp); + } + + return sysctl_io_opaque(req, info, i * sizeof(info[0]), NULL); +} + +SYSCTL_PROC(_kern, OID_AUTO, imgsrcinfo, + CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, sysctl_imgsrcinfo, "I", ""); + #endif /* CONFIG_IMGSRC_ACCESS */ -static int +SYSCTL_INT(_kern, OID_AUTO, timer_coalescing_enabled, + CTLFLAG_RW | CTLFLAG_LOCKED, + &mach_timer_coalescing_enabled, 0, ""); + +STATIC int sysctl_usrstack (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2414,10 +2842,10 @@ sysctl_usrstack } SYSCTL_PROC(_kern, KERN_USRSTACK32, usrstack, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_usrstack, "I", ""); -static int +STATIC int sysctl_usrstack64 (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2425,14 +2853,14 @@ sysctl_usrstack64 } SYSCTL_PROC(_kern, KERN_USRSTACK64, usrstack64, - CTLTYPE_QUAD | CTLFLAG_RD, + CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_usrstack64, "Q", ""); SYSCTL_STRING(_kern, KERN_COREFILE, corefile, - CTLFLAG_RW | CTLFLAG_KERN, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, corefilename, sizeof(corefilename), ""); -static int +STATIC int sysctl_coredump (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2451,10 +2879,10 @@ sysctl_coredump } SYSCTL_PROC(_kern, KERN_COREDUMP, coredump, - CTLTYPE_INT | CTLFLAG_RW, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_coredump, "I", ""); -static int +STATIC int sysctl_suid_coredump (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2473,10 +2901,10 @@ sysctl_suid_coredump } SYSCTL_PROC(_kern, KERN_SUGID_COREDUMP, sugid_coredump, - 
CTLTYPE_INT | CTLFLAG_RW, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_suid_coredump, "I", ""); -static int +STATIC int sysctl_delayterm (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2495,11 +2923,11 @@ sysctl_delayterm } SYSCTL_PROC(_kern, KERN_PROCDELAYTERM, delayterm, - CTLTYPE_INT | CTLFLAG_RW, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_delayterm, "I", ""); -static int +STATIC int sysctl_rage_vnode (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2545,11 +2973,11 @@ sysctl_rage_vnode } SYSCTL_PROC(_kern, KERN_RAGEVNODE, rage_vnode, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, 0, 0, sysctl_rage_vnode, "I", ""); -static int +STATIC int sysctl_kern_check_openevt (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2582,12 +3010,12 @@ sysctl_kern_check_openevt return(error); } -SYSCTL_PROC(_kern, KERN_CHECKOPENEVT, check_openevt, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY, +SYSCTL_PROC(_kern, KERN_CHECKOPENEVT, check_openevt, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, 0, 0, sysctl_kern_check_openevt, "I", "set the per-process check-open-evt flag"); -static int +STATIC int sysctl_nx (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2617,10 +3045,10 @@ sysctl_nx SYSCTL_PROC(_kern, KERN_NX_PROTECTION, nx, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_KERN, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, 0, sysctl_nx, "I", ""); -static int +STATIC int sysctl_loadavg (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2636,10 +3064,30 @@ sysctl_loadavg } SYSCTL_PROC(_vm, VM_LOADAVG, loadavg, - CTLTYPE_STRUCT | CTLFLAG_RD, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, 
sysctl_loadavg, "S,loadavg", ""); -static int +/* + * Note: Thread safe; vm_map_lock protects in vm_toggle_entry_reuse() + */ +STATIC int +sysctl_vm_toggle_address_reuse(__unused struct sysctl_oid *oidp, __unused void *arg1, + __unused int arg2, struct sysctl_req *req) +{ + int old_value=0, new_value=0, error=0; + + if(vm_toggle_entry_reuse( VM_TOGGLE_GETVALUE, &old_value )) + return(error); + error = sysctl_io_number(req, old_value, sizeof(int), &new_value, NULL); + if (!error) { + return (vm_toggle_entry_reuse(new_value, NULL)); + } + return(error); +} + +SYSCTL_PROC(_debug, OID_AUTO, toggle_address_reuse, CTLFLAG_ANYBODY | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_vm_toggle_address_reuse,"I",""); + +STATIC int sysctl_swapusage (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2668,17 +3116,53 @@ sysctl_swapusage SYSCTL_PROC(_vm, VM_SWAPUSAGE, swapusage, - CTLTYPE_STRUCT | CTLFLAG_RD, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_swapusage, "S,xsw_usage", ""); +#if CONFIG_EMBEDDED +/* */ +boolean_t vm_freeze_enabled = FALSE; +#endif /* CONFIG_EMBEDDED */ + + +#if CONFIG_FREEZE +extern void vm_page_reactivate_all_throttled(void); + +static int +sysctl_freeze_enabled SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int error, val = vm_freeze_enabled ? 1 : 0; + boolean_t disabled; + + error = sysctl_handle_int(oidp, &val, 0, req); + if (error || !req->newptr) + return (error); + + /* + * If freeze is being disabled, we need to move dirty pages out from the throttle to the active queue. + */ + disabled = (!val && vm_freeze_enabled); + + vm_freeze_enabled = val ? 
TRUE : FALSE; + + if (disabled) { + vm_page_reactivate_all_throttled(); + } + + return (0); +} + +SYSCTL_PROC(_vm, OID_AUTO, freeze_enabled, CTLTYPE_INT|CTLFLAG_RW, &vm_freeze_enabled, 0, sysctl_freeze_enabled, "I", ""); +#endif /* CONFIG_FREEZE */ /* this kernel does NOT implement shared_region_make_private_np() */ SYSCTL_INT(_kern, KERN_SHREG_PRIVATIZABLE, shreg_private, - CTLFLAG_RD, + CTLFLAG_RD | CTLFLAG_LOCKED, (int *)NULL, 0, ""); #if defined(__i386__) || defined(__x86_64__) -static int +STATIC int sysctl_sysctl_exec_affinity(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) @@ -2706,10 +3190,10 @@ sysctl_sysctl_exec_affinity(__unused struct sysctl_oid *oidp, return 0; } -SYSCTL_PROC(_sysctl, OID_AUTO, proc_exec_affinity, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY, 0, 0, sysctl_sysctl_exec_affinity ,"I","proc_exec_affinity"); +SYSCTL_PROC(_sysctl, OID_AUTO, proc_exec_affinity, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY | CTLFLAG_LOCKED, 0, 0, sysctl_sysctl_exec_affinity ,"I","proc_exec_affinity"); #endif -static int +STATIC int fetch_process_cputype( proc_t cur_proc, int *name, @@ -2752,7 +3236,7 @@ fetch_process_cputype( return (error); } -static int +STATIC int sysctl_sysctl_native(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req) { @@ -2765,9 +3249,9 @@ sysctl_sysctl_native(__unused struct sysctl_oid *oidp, void *arg1, int arg2, res = 0; return SYSCTL_OUT(req, &res, sizeof(res)); } -SYSCTL_PROC(_sysctl, OID_AUTO, proc_native, CTLTYPE_NODE|CTLFLAG_RD, 0, 0, sysctl_sysctl_native ,"I","proc_native"); +SYSCTL_PROC(_sysctl, OID_AUTO, proc_native, CTLTYPE_NODE|CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_sysctl_native ,"I","proc_native"); -static int +STATIC int sysctl_sysctl_cputype(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req) { @@ -2777,9 +3261,9 @@ sysctl_sysctl_cputype(__unused struct sysctl_oid *oidp, void *arg1, int arg2, return error; return 
SYSCTL_OUT(req, &proc_cputype, sizeof(proc_cputype)); } -SYSCTL_PROC(_sysctl, OID_AUTO, proc_cputype, CTLTYPE_NODE|CTLFLAG_RD, 0, 0, sysctl_sysctl_cputype ,"I","proc_cputype"); +SYSCTL_PROC(_sysctl, OID_AUTO, proc_cputype, CTLTYPE_NODE|CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_sysctl_cputype ,"I","proc_cputype"); -static int +STATIC int sysctl_safeboot (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2787,10 +3271,10 @@ sysctl_safeboot } SYSCTL_PROC(_kern, KERN_SAFEBOOT, safeboot, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_safeboot, "I", ""); -static int +STATIC int sysctl_singleuser (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2798,7 +3282,7 @@ sysctl_singleuser } SYSCTL_PROC(_kern, OID_AUTO, singleuser, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_singleuser, "I", ""); /* @@ -2808,9 +3292,9 @@ extern boolean_t affinity_sets_enabled; extern int affinity_sets_mapping; SYSCTL_INT (_kern, OID_AUTO, affinity_sets_enabled, - CTLFLAG_RW, (int *) &affinity_sets_enabled, 0, "hinting enabled"); + CTLFLAG_RW | CTLFLAG_LOCKED, (int *) &affinity_sets_enabled, 0, "hinting enabled"); SYSCTL_INT (_kern, OID_AUTO, affinity_sets_mapping, - CTLFLAG_RW, &affinity_sets_mapping, 0, "mapping policy"); + CTLFLAG_RW | CTLFLAG_LOCKED, &affinity_sets_mapping, 0, "mapping policy"); /* * Limit on total memory users can wire. 
@@ -2833,9 +3317,9 @@ vm_map_size_t vm_user_wire_limit; * There needs to be a more automatic/elegant way to do this */ -SYSCTL_QUAD(_vm, OID_AUTO, global_no_user_wire_amount, CTLFLAG_RW, &vm_global_no_user_wire_amount, ""); -SYSCTL_QUAD(_vm, OID_AUTO, global_user_wire_limit, CTLFLAG_RW, &vm_global_user_wire_limit, ""); -SYSCTL_QUAD(_vm, OID_AUTO, user_wire_limit, CTLFLAG_RW, &vm_user_wire_limit, ""); +SYSCTL_QUAD(_vm, OID_AUTO, global_no_user_wire_amount, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_global_no_user_wire_amount, ""); +SYSCTL_QUAD(_vm, OID_AUTO, global_user_wire_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_global_user_wire_limit, ""); +SYSCTL_QUAD(_vm, OID_AUTO, user_wire_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_user_wire_limit, ""); @@ -2846,15 +3330,15 @@ SYSCTL_QUAD(_vm, OID_AUTO, user_wire_limit, CTLFLAG_RW, &vm_user_wire_limit, "") extern uint32_t kdebug_thread_block; SYSCTL_INT (_kern, OID_AUTO, kdebug_thread_block, - CTLFLAG_RW, &kdebug_thread_block, 0, "kdebug thread_block"); + CTLFLAG_RW | CTLFLAG_LOCKED, &kdebug_thread_block, 0, "kdebug thread_block"); /* * Kernel stack size and depth */ SYSCTL_INT (_kern, OID_AUTO, stack_size, - CTLFLAG_RD, (int *) &kernel_stack_size, 0, "Kernel stack size"); + CTLFLAG_RD | CTLFLAG_LOCKED, (int *) &kernel_stack_size, 0, "Kernel stack size"); SYSCTL_INT (_kern, OID_AUTO, stack_depth_max, - CTLFLAG_RD, (int *) &kernel_stack_depth_max, 0, "Max kernel stack depth at interrupt or context switch"); + CTLFLAG_RD | CTLFLAG_LOCKED, (int *) &kernel_stack_depth_max, 0, "Max kernel stack depth at interrupt or context switch"); /* * enable back trace for port allocations @@ -2862,6 +3346,21 @@ SYSCTL_INT (_kern, OID_AUTO, stack_depth_max, extern int ipc_portbt; SYSCTL_INT(_kern, OID_AUTO, ipc_portbt, - CTLFLAG_RW | CTLFLAG_KERN, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, &ipc_portbt, 0, ""); +/* + * Scheduler sysctls + */ + +/* + * See osfmk/kern/sched_prim.c for the corresponding definition + * in osfmk/. 
If either version changes, update the other. + */ +#define SCHED_STRING_MAX_LENGTH (48) + +extern char sched_string[SCHED_STRING_MAX_LENGTH]; +SYSCTL_STRING(_kern, OID_AUTO, sched, + CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, + sched_string, sizeof(sched_string), + "Timeshare scheduler implementation"); diff --git a/bsd/kern/kern_time.c b/bsd/kern/kern_time.c index 6f392fb86..315863131 100644 --- a/bsd/kern/kern_time.c +++ b/bsd/kern/kern_time.c @@ -75,6 +75,7 @@ #include #include #include +#include #include #include @@ -215,7 +216,7 @@ adjtime(struct proc *p, struct adjtime_args *uap, __unused int32_t *retval) if (error) return (error); #endif - if ((error = suser(kauth_cred_get(), &p->p_acflag))) + if ((error = priv_check_cred(kauth_cred_get(), PRIV_ADJTIME, 0))) return (error); if (IS_64BIT_PROCESS(p)) { struct user64_timeval user_atv; diff --git a/bsd/kern/kern_xxx.c b/bsd/kern/kern_xxx.c index 0a080dd59..293808838 100644 --- a/bsd/kern/kern_xxx.c +++ b/bsd/kern/kern_xxx.c @@ -116,7 +116,7 @@ reboot(struct proc *p, register struct reboot_args *uap, __unused int32_t *retva #endif if (!error) { OSBitOrAtomic(P_REBOOT, &p->p_flag); /* No more signals for this proc */ - boot(RB_BOOT, uap->opt, command); + error = boot(RB_BOOT, uap->opt, command); } return(error); } diff --git a/bsd/kern/kpi_mbuf.c b/bsd/kern/kpi_mbuf.c index a89cfbef0..70ab53b31 100644 --- a/bsd/kern/kpi_mbuf.c +++ b/bsd/kern/kpi_mbuf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2010 Apple Inc. All rights reserved. + * Copyright (c) 2004-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -40,9 +40,9 @@ #include "net/net_str_id.h" -static const mbuf_flags_t mbuf_flags_mask = MBUF_EXT | MBUF_PKTHDR | MBUF_EOR | - MBUF_BCAST | MBUF_MCAST | MBUF_FRAG | MBUF_FIRSTFRAG | - MBUF_LASTFRAG | MBUF_PROMISC; +static const mbuf_flags_t mbuf_flags_mask = (MBUF_EXT | MBUF_PKTHDR | MBUF_EOR | + MBUF_LOOP | MBUF_BCAST | MBUF_MCAST | MBUF_FRAG | MBUF_FIRSTFRAG | + MBUF_LASTFRAG | MBUF_PROMISC | MBUF_HASFCS); void* mbuf_data(mbuf_t mbuf) { @@ -81,6 +81,10 @@ errno_t mbuf_align_32(mbuf_t mbuf, size_t len) return 0; } +/* This function is used to provide mcl_to_paddr via symbol indirection, + * please avoid any change in behavior or remove the indirection in + * config/Unsupported* + */ addr64_t mbuf_data_to_physical(void* ptr) { return (addr64_t)(uintptr_t)mcl_to_paddr(ptr); @@ -107,10 +111,10 @@ mbuf_attachcluster(mbuf_how_t how, mbuf_type_t type, mbuf_t *mbuf, caddr_t extbuf, void (*extfree)(caddr_t , u_int, caddr_t), size_t extsize, caddr_t extarg) { - if (extbuf == NULL || extfree == NULL || extsize == 0) + if (mbuf == NULL || extbuf == NULL || extfree == NULL || extsize == 0) return (EINVAL); - if ((*mbuf = m_clattach(mbuf != NULL ? *mbuf : NULL, type, extbuf, + if ((*mbuf = m_clattach(*mbuf, type, extbuf, extfree, extsize, extarg, how)) == NULL) return (ENOMEM); @@ -126,15 +130,15 @@ mbuf_alloccluster(mbuf_how_t how, size_t *size, caddr_t *addr) *addr = NULL; /* Jumbo cluster pool not available? 
*/ - if (*size > NBPG && njcl == 0) + if (*size > MBIGCLBYTES && njcl == 0) return (ENOTSUP); if (*size <= MCLBYTES && (*addr = m_mclalloc(how)) != NULL) *size = MCLBYTES; - else if (*size > MCLBYTES && *size <= NBPG && + else if (*size > MCLBYTES && *size <= MBIGCLBYTES && (*addr = m_bigalloc(how)) != NULL) - *size = NBPG; - else if (*size > NBPG && *size <= M16KCLBYTES && + *size = MBIGCLBYTES; + else if (*size > MBIGCLBYTES && *size <= M16KCLBYTES && (*addr = m_16kalloc(how)) != NULL) *size = M16KCLBYTES; else @@ -149,14 +153,14 @@ mbuf_alloccluster(mbuf_how_t how, size_t *size, caddr_t *addr) void mbuf_freecluster(caddr_t addr, size_t size) { - if (size != MCLBYTES && size != NBPG && size != M16KCLBYTES) + if (size != MCLBYTES && size != MBIGCLBYTES && size != M16KCLBYTES) panic("%s: invalid size (%ld) for cluster %p", __func__, size, (void *)addr); if (size == MCLBYTES) m_mclfree(addr); - else if (size == NBPG) - m_bigfree(addr, NBPG, NULL); + else if (size == MBIGCLBYTES) + m_bigfree(addr, MBIGCLBYTES, NULL); else if (njcl > 0) m_16kfree(addr, M16KCLBYTES, NULL); else @@ -184,7 +188,7 @@ mbuf_getcluster(mbuf_how_t how, mbuf_type_t type, size_t size, mbuf_t* mbuf) */ if (size == MCLBYTES) { *mbuf = m_mclget(*mbuf, how); - } else if (size == NBPG) { + } else if (size == MBIGCLBYTES) { *mbuf = m_mbigget(*mbuf, how); } else if (size == M16KCLBYTES) { if (njcl > 0) { @@ -254,11 +258,17 @@ errno_t mbuf_getpacket(mbuf_how_t how, mbuf_t *mbuf) return error; } +/* This function is used to provide m_free via symbol indirection, please avoid + * any change in behavior or remove the indirection in config/Unsupported* + */ mbuf_t mbuf_free(mbuf_t mbuf) { return m_free(mbuf); } +/* This function is used to provide m_freem via symbol indirection, please avoid + * any change in behavior or remove the indirection in config/Unsupported* + */ void mbuf_freem(mbuf_t mbuf) { m_freem(mbuf); @@ -274,6 +284,10 @@ size_t mbuf_leadingspace(const mbuf_t mbuf) return 
m_leadingspace(mbuf); } +/* This function is used to provide m_trailingspace via symbol indirection, + * please avoid any change in behavior or remove the indirection in + * config/Unsupported* + */ size_t mbuf_trailingspace(const mbuf_t mbuf) { return m_trailingspace(mbuf); @@ -332,6 +346,9 @@ errno_t mbuf_pulldown(mbuf_t src, size_t *offset, size_t len, mbuf_t *location) return (*location == NULL) ? ENOMEM : 0; } +/* This function is used to provide m_adj via symbol indirection, please avoid + * any change in behavior or remove the indirection in config/Unsupported* + */ void mbuf_adj(mbuf_t mbuf, int len) { m_adj(mbuf, len); @@ -544,7 +561,7 @@ void mbuf_outbound_finalize(mbuf_t mbuf, u_int32_t protocol_family, size_t protocol_offset) { if ((mbuf->m_pkthdr.csum_flags & - (CSUM_DELAY_DATA | CSUM_DELAY_IP | CSUM_TCP_SUM16)) == 0) + (CSUM_DELAY_DATA | CSUM_DELAY_IP | CSUM_TCP_SUM16 | CSUM_DELAY_IPV6_DATA)) == 0) return; /* Generate the packet in software, client needs it */ @@ -573,14 +590,23 @@ mbuf_outbound_finalize(mbuf_t mbuf, u_int32_t protocol_family, size_t protocol_o mbuf->m_pkthdr.csum_flags &= ~(CSUM_DELAY_DATA | CSUM_DELAY_IP); break; + + case PF_INET6: + + if (mbuf->m_pkthdr.csum_flags & CSUM_DELAY_IPV6_DATA) { + in_delayed_cksum_offset(mbuf, protocol_offset); + } + mbuf->m_pkthdr.csum_flags &= ~CSUM_DELAY_IPV6_DATA; + break; + default: /* * Not sure what to do here if anything. - * Hardware checksum code looked pretty IPv4 specific. + * Hardware checksum code looked pretty IPv4/IPv6 specific. 
*/ - if ((mbuf->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_IP)) != 0) - panic("mbuf_outbound_finalize - CSUM flags set for non-IPv4 packet (%u)!\n", protocol_family); + if ((mbuf->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_IP | CSUM_DELAY_IPV6_DATA)) != 0) + panic("mbuf_outbound_finalize - CSUM flags set for non-IPv4 or IPv6 packet (%u)!\n", protocol_family); } } @@ -619,7 +645,8 @@ mbuf_clear_vlan_tag( } static const mbuf_csum_request_flags_t mbuf_valid_csum_request_flags = - MBUF_CSUM_REQ_IP | MBUF_CSUM_REQ_TCP | MBUF_CSUM_REQ_UDP | MBUF_CSUM_REQ_SUM16; + MBUF_CSUM_REQ_IP | MBUF_CSUM_REQ_TCP | MBUF_CSUM_REQ_UDP | + MBUF_CSUM_REQ_SUM16 | MBUF_CSUM_REQ_TCPIPV6 | MBUF_CSUM_REQ_UDPIPV6; errno_t mbuf_set_csum_requested( @@ -827,7 +854,7 @@ mbuf_tag_allocate( } /* Allocate an mtag */ - tag = m_tag_alloc(id, type, length, how); + tag = m_tag_create(id, type, length, how, mbuf); if (tag == NULL) { return how == M_WAITOK ? ENOMEM : EWOULDBLOCK; } @@ -1072,34 +1099,16 @@ mbuf_get_mhlen(void) return (_MHLEN); } -mbuf_priority_t -mbuf_get_priority(struct mbuf *m) +u_int32_t +mbuf_get_minclsize(void) { -#if !PKT_PRIORITY -#pragma unused(m) - return (MBUF_PRIORITY_NORMAL); -#else /* PKT_PRIORITY */ - mbuf_priority_t prio = MBUF_PRIORITY_NORMAL; - - if (m == NULL || !(m->m_flags & M_PKTHDR)) - return (prio); - - /* Defaults to normal; ignore anything else but background */ - if (m->m_pkthdr.prio == MBUF_PRIORITY_BACKGROUND) - prio = MBUF_PRIORITY_BACKGROUND; - - return (prio); -#endif /* PKT_PRIORITY */ + return (MHLEN + MLEN); } mbuf_traffic_class_t mbuf_get_traffic_class(mbuf_t m) { -#if !PKT_PRIORITY -#pragma unused(m) - return (MBUF_TC_BE); -#else /* PKT_PRIORITY */ - mbuf_priority_t prio = MBUF_TC_BE; + mbuf_traffic_class_t prio = MBUF_TC_BE; if (m == NULL || !(m->m_flags & M_PKTHDR)) return (prio); @@ -1108,17 +1117,11 @@ mbuf_get_traffic_class(mbuf_t m) prio = m->m_pkthdr.prio; return (prio); -#endif /* PKT_PRIORITY */ } errno_t 
mbuf_set_traffic_class(mbuf_t m, mbuf_traffic_class_t tc) { -#if !PKT_PRIORITY -#pragma unused(m) -#pragma unused(tc) - return 0; -#else /* PKT_PRIORITY */ errno_t error = 0; if (m == NULL || !(m->m_flags & M_PKTHDR)) @@ -1136,5 +1139,4 @@ mbuf_set_traffic_class(mbuf_t m, mbuf_traffic_class_t tc) break; } return error; -#endif /* PKT_PRIORITY */ } diff --git a/bsd/kern/kpi_socket.c b/bsd/kern/kpi_socket.c index ee1fd5af0..70507beff 100644 --- a/bsd/kern/kpi_socket.c +++ b/bsd/kern/kpi_socket.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2010 Apple Inc. All rights reserved. + * Copyright (c) 2003-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -172,18 +172,13 @@ sock_accept( if (sa) FREE(sa, M_SONAME); /* - * If the socket has been marked as inactive by soacceptfilter(), - * disallow further operations on it. We explicitly call shutdown - * on both data directions to ensure that SS_CANT{RCV,SEND}MORE - * states are set for the socket. This would also flush out data - * hanging off the receive list of this socket. + * If the socket has been marked as inactive by sosetdefunct(), + * disallow further operations on it. 
*/ if (new_so->so_flags & SOF_DEFUNCT) { - (void) soshutdownlock(new_so, SHUT_RD); - (void) soshutdownlock(new_so, SHUT_WR); - (void) sodisconnectlocked(new_so); + (void) sodefunct(current_proc(), new_so, + SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL); } - *new_sock = new_so; if (dosocklock) socket_unlock(new_so, 1); @@ -195,9 +190,30 @@ sock_bind( socket_t sock, const struct sockaddr *to) { - if (sock == NULL || to == NULL) return EINVAL; + int error = 0; + struct sockaddr *sa = NULL; + struct sockaddr_storage ss; + boolean_t want_free = TRUE; + + if (sock == NULL || to == NULL) + return EINVAL; + + if (to->sa_len > sizeof(ss)) { + MALLOC(sa, struct sockaddr *, to->sa_len, M_SONAME, M_WAITOK); + if (sa == NULL) + return ENOBUFS; + } else { + sa = (struct sockaddr *)&ss; + want_free = FALSE; + } + memcpy(sa, to, to->sa_len); + + error = sobind(sock, sa); - return sobind(sock, (struct sockaddr*)(uintptr_t)to); + if (sa != NULL && want_free == TRUE) + FREE(sa, M_SONAME); + + return error; } errno_t @@ -208,23 +224,37 @@ sock_connect( { int error = 0; lck_mtx_t *mutex_held; + struct sockaddr *sa = NULL; + struct sockaddr_storage ss; + boolean_t want_free = TRUE; if (sock == NULL || to == NULL) return EINVAL; + + if (to->sa_len > sizeof(ss)) { + MALLOC(sa, struct sockaddr *, to->sa_len, M_SONAME, + (flags & MSG_DONTWAIT) ? 
M_NOWAIT : M_WAITOK); + if (sa == NULL) + return ENOBUFS; + } else { + sa = (struct sockaddr *)&ss; + want_free = FALSE; + } + memcpy(sa, to, to->sa_len); socket_lock(sock, 1); if ((sock->so_state & SS_ISCONNECTING) && ((sock->so_state & SS_NBIO) != 0 || (flags & MSG_DONTWAIT) != 0)) { - socket_unlock(sock, 1); - return EALREADY; + error = EALREADY; + goto out; } - error = soconnectlock(sock, (struct sockaddr*)(uintptr_t)to, 0); + error = soconnectlock(sock, sa, 0); if (!error) { if ((sock->so_state & SS_ISCONNECTING) && ((sock->so_state & SS_NBIO) != 0 || (flags & MSG_DONTWAIT) != 0)) { - socket_unlock(sock, 1); - return EINPROGRESS; + error = EINPROGRESS; + goto out; } if (sock->so_proto->pr_getlock != NULL) @@ -247,7 +277,12 @@ sock_connect( else { sock->so_state &= ~SS_ISCONNECTING; } +out: socket_unlock(sock, 1); + + if (sa != NULL && want_free == TRUE) + FREE(sa, M_SONAME); + return error; } @@ -476,6 +511,27 @@ sock_setsockopt( return sosetopt(sock, &sopt); /* will lock socket */ } +/* + * This follows the recommended mappings between DSCP code points and WMM access classes + */ +static u_int8_t so_tc_from_dscp(u_int8_t dscp); +static u_int8_t +so_tc_from_dscp(u_int8_t dscp) +{ + u_int8_t tc; + + if (dscp >= 0x30 && dscp <= 0x3f) + tc = SO_TC_VO; + else if (dscp >= 0x20 && dscp <= 0x2f) + tc = SO_TC_VI; + else if (dscp >= 0x08 && dscp <= 0x17) + tc = SO_TC_BK; + else + tc = SO_TC_BE; + + return tc; +} + errno_t sock_settclassopt( socket_t sock, @@ -484,13 +540,9 @@ sock_settclassopt( errno_t error = 0; struct sockopt sopt; + int sotc; - if (sock == NULL || optval == NULL || optlen == 0) return EINVAL; - - sopt.sopt_dir = SOPT_SET; - sopt.sopt_val = CAST_USER_ADDR_T(optval); - sopt.sopt_valsize = optlen; - sopt.sopt_p = kernproc; + if (sock == NULL || optval == NULL || optlen != sizeof(int)) return EINVAL; socket_lock(sock, 1); if (!(sock->so_state & SS_ISCONNECTED)) { @@ -507,6 +559,28 @@ sock_settclassopt( goto out; } + /* + * Set the socket traffic class 
based on the passed DSCP code point + * regardless of the scope of the destination + */ + sotc = so_tc_from_dscp((*(const int *)optval) >> 2); + + sopt.sopt_dir = SOPT_SET; + sopt.sopt_val = CAST_USER_ADDR_T(&sotc); + sopt.sopt_valsize = sizeof(sotc); + sopt.sopt_p = kernproc; + sopt.sopt_level = SOL_SOCKET; + sopt.sopt_name = SO_TRAFFIC_CLASS; + + socket_unlock(sock, 0); + error = sosetopt(sock, &sopt); + socket_lock(sock, 0); + + if (error != 0) { + printf("sock_settclassopt: sosetopt SO_TRAFFIC_CLASS failed %d\n", error); + goto out; + } + /* Check if the destination address is LAN or link local address. * We do not want to set traffic class bits if the destination * is not local @@ -515,6 +589,11 @@ sock_settclassopt( goto out; } + sopt.sopt_dir = SOPT_SET; + sopt.sopt_val = CAST_USER_ADDR_T(optval); + sopt.sopt_valsize = optlen; + sopt.sopt_p = kernproc; + switch (sock->so_proto->pr_domain->dom_family) { case AF_INET: sopt.sopt_level = IPPROTO_IP; @@ -989,59 +1068,114 @@ sock_getlistener(socket_t sock) return (sock->so_head); } +static inline void +sock_set_tcp_stream_priority(socket_t sock) +{ + if ((sock->so_proto->pr_domain->dom_family == AF_INET || + sock->so_proto->pr_domain->dom_family == AF_INET6) && + sock->so_proto->pr_type == SOCK_STREAM) { + + set_tcp_stream_priority(sock); + + } +} + /* * Caller must have ensured socket is valid and won't be going away. */ void -socket_set_traffic_mgt_flags(socket_t sock, u_int32_t flags) +socket_set_traffic_mgt_flags_locked(socket_t sock, u_int32_t flags) { (void) OSBitOrAtomic(flags, &sock->so_traffic_mgt_flags); + sock_set_tcp_stream_priority(sock); +} + +void +socket_set_traffic_mgt_flags(socket_t sock, u_int32_t flags) +{ + socket_lock(sock, 1); + socket_set_traffic_mgt_flags_locked(sock, flags); + socket_unlock(sock, 1); } /* * Caller must have ensured socket is valid and won't be going away. 
*/ void -socket_clear_traffic_mgt_flags(socket_t sock, u_int32_t flags) +socket_clear_traffic_mgt_flags_locked(socket_t sock, u_int32_t flags) { (void) OSBitAndAtomic(~flags, &sock->so_traffic_mgt_flags); + sock_set_tcp_stream_priority(sock); } -__private_extern__ void -set_traffic_class(struct mbuf *m, struct socket *so, int mtc) +void +socket_clear_traffic_mgt_flags(socket_t sock, u_int32_t flags) { -#if !PKT_PRIORITY -#pragma unused(m) -#pragma unused(so) -#pragma unused(mtc) - return; -#else /* PKT_PRIORITY */ - if (!(m->m_flags & M_PKTHDR)) - return; + socket_lock(sock, 1); + socket_clear_traffic_mgt_flags_locked(sock, flags); + socket_unlock(sock, 1); +} + - if (soisbackground(so)) { - m->m_pkthdr.prio = MBUF_TC_BK; - } else if (mtc != MBUF_TC_NONE) { - if (mtc >= MBUF_TC_BE && mtc <= MBUF_TC_VO) - m->m_pkthdr.prio = mtc; +/* + * Caller must have ensured socket is valid and won't be going away. + */ +errno_t +socket_defunct(struct proc *p, socket_t so, int level) +{ + errno_t retval; + + if (level != SHUTDOWN_SOCKET_LEVEL_DISCONNECT_SVC && + level != SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL) + return (EINVAL); + + socket_lock(so, 1); + /* + * SHUTDOWN_SOCKET_LEVEL_DISCONNECT_SVC level is meant to tear down + * all of mDNSResponder IPC sockets, currently those of AF_UNIX; note + * that this is an implementation artifact of mDNSResponder. We do + * a quick test against the socket buffers for SB_UNIX, since that + * would have been set by unp_attach() at socket creation time. 
+ */ + if (level == SHUTDOWN_SOCKET_LEVEL_DISCONNECT_SVC && + (so->so_rcv.sb_flags & so->so_snd.sb_flags & SB_UNIX) != SB_UNIX) { + socket_unlock(so, 1); + return (EOPNOTSUPP); + } + retval = sosetdefunct(p, so, level, TRUE); + if (retval == 0) + retval = sodefunct(p, so, level); + socket_unlock(so, 1); + return (retval); +} + +errno_t +sock_setupcall(socket_t sock, sock_upcall callback, void* context) +{ + if (sock == NULL) + return EINVAL; + + /* + * Note that we don't wait for any in progress upcall to complete. + */ + socket_lock(sock, 1); + + sock->so_upcall = (so_upcall) callback; + sock->so_upcallarg = context; + if (callback) { + sock->so_rcv.sb_flags |= SB_UPCALL; +#if CONFIG_SOWUPCALL + sock->so_snd.sb_flags |= SB_UPCALL; +#endif /* CONFIG_SOWUPCALL */ } else { - switch (so->so_traffic_class) { - case SO_TC_BE: - m->m_pkthdr.prio = MBUF_TC_BE; - break; - case SO_TC_BK: - m->m_pkthdr.prio = MBUF_TC_BK; - break; - case SO_TC_VI: - m->m_pkthdr.prio = MBUF_TC_VI; - break; - case SO_TC_VO: - m->m_pkthdr.prio = MBUF_TC_VO; - break; - default: - break; - } + sock->so_rcv.sb_flags &= ~SB_UPCALL; +#if CONFIG_SOWUPCALL + sock->so_snd.sb_flags &= ~SB_UPCALL; +#endif /* CONFIG_SOWUPCALL */ } - return; -#endif /* PKT_PRIORITY */ + + socket_unlock(sock, 1); + + return 0; } + diff --git a/bsd/kern/kpi_socketfilter.c b/bsd/kern/kpi_socketfilter.c index c8469ab40..67a944c2d 100644 --- a/bsd/kern/kpi_socketfilter.c +++ b/bsd/kern/kpi_socketfilter.c @@ -33,17 +33,56 @@ #include #include #include +#include #include +#include +#include #include #include +#include #include +#define SFEF_ATTACHED 0x1 /* SFE is on socket list */ +#define SFEF_NODETACH 0x2 /* Detach should not be called */ +#define SFEF_NOSOCKET 0x4 /* Socket is gone */ + +struct socket_filter_entry { + struct socket_filter_entry *sfe_next_onsocket; + struct socket_filter_entry *sfe_next_onfilter; + struct socket_filter_entry *sfe_next_oncleanup; + + struct socket_filter *sfe_filter; + struct socket *sfe_socket; 
+ void *sfe_cookie; + + uint32_t sfe_flags; + int32_t sfe_refcount; +}; + +struct socket_filter { + TAILQ_ENTRY(socket_filter) sf_protosw_next; + TAILQ_ENTRY(socket_filter) sf_global_next; + struct socket_filter_entry *sf_entry_head; + + struct protosw *sf_proto; + struct sflt_filter sf_filter; + u_int32_t sf_refcount; +}; + +TAILQ_HEAD(socket_filter_list, socket_filter); + static struct socket_filter_list sock_filter_head; -static lck_mtx_t *sock_filter_lock = 0; +static lck_rw_t *sock_filter_lock = NULL; +static lck_mtx_t *sock_filter_cleanup_lock = NULL; +static struct socket_filter_entry *sock_filter_cleanup_entries = NULL; +static thread_t sock_filter_cleanup_thread = NULL; -static void sflt_detach_private(struct socket_filter_entry *entry, int unregistering); +static void sflt_cleanup_thread(void *, wait_result_t); +static void sflt_detach_locked(struct socket_filter_entry *entry); + +#pragma mark -- Internal State Management -- __private_extern__ void sflt_init(void) @@ -54,70 +93,361 @@ sflt_init(void) TAILQ_INIT(&sock_filter_head); - /* Allocate a spin lock */ + /* Allocate a rw lock */ grp_attrib = lck_grp_attr_alloc_init(); lck_group = lck_grp_alloc_init("socket filter lock", grp_attrib); lck_grp_attr_free(grp_attrib); lck_attrib = lck_attr_alloc_init(); - sock_filter_lock = lck_mtx_alloc_init(lck_group, lck_attrib); + sock_filter_lock = lck_rw_alloc_init(lck_group, lck_attrib); + sock_filter_cleanup_lock = lck_mtx_alloc_init(lck_group, lck_attrib); lck_grp_free(lck_group); lck_attr_free(lck_attrib); } -__private_extern__ void -sflt_initsock( - struct socket *so) +static void +sflt_retain_locked( + struct socket_filter *filter) { - struct protosw *proto = so->so_proto; - struct socket_filter *filter; + filter->sf_refcount++; +} + +static void +sflt_release_locked( + struct socket_filter *filter) +{ + filter->sf_refcount--; + if (filter->sf_refcount == 0) + { + // Call the unregistered function + if (filter->sf_filter.sf_unregistered) { + 
lck_rw_unlock_exclusive(sock_filter_lock); + filter->sf_filter.sf_unregistered(filter->sf_filter.sf_handle); + lck_rw_lock_exclusive(sock_filter_lock); + } + + // Free the entry + FREE(filter, M_IFADDR); + } +} + +static void +sflt_entry_retain( + struct socket_filter_entry *entry) +{ + if (OSIncrementAtomic(&entry->sfe_refcount) <= 0) + panic("sflt_entry_retain - sfe_refcount <= 0\n"); +} + +static void +sflt_entry_release( + struct socket_filter_entry *entry) +{ + SInt32 old = OSDecrementAtomic(&entry->sfe_refcount); + if (old == 1) { + // That was the last reference + + // Take the cleanup lock + lck_mtx_lock(sock_filter_cleanup_lock); + + // Put this item on the cleanup list + entry->sfe_next_oncleanup = sock_filter_cleanup_entries; + sock_filter_cleanup_entries = entry; + + // If the item is the first item in the list + if (entry->sfe_next_oncleanup == NULL) { + if (sock_filter_cleanup_thread == NULL) { + // Create a thread + kernel_thread_start(sflt_cleanup_thread, NULL, &sock_filter_cleanup_thread); + } else { + // Wakeup the thread + wakeup(&sock_filter_cleanup_entries); + } + } + + // Drop the cleanup lock + lck_mtx_unlock(sock_filter_cleanup_lock); + } + else if (old <= 0) + { + panic("sflt_entry_release - sfe_refcount (%d) <= 0\n", (int)old); + } +} + +static void +sflt_cleanup_thread( + __unused void * blah, + __unused wait_result_t blah2) +{ + while (1) { + lck_mtx_lock(sock_filter_cleanup_lock); + while (sock_filter_cleanup_entries == NULL) { + // Sleep until we've got something better to do + msleep(&sock_filter_cleanup_entries, sock_filter_cleanup_lock, PWAIT, "sflt_cleanup", NULL); + } + + // Pull the current list of dead items + struct socket_filter_entry *dead = sock_filter_cleanup_entries; + sock_filter_cleanup_entries = NULL; + + // Drop the lock + lck_mtx_unlock(sock_filter_cleanup_lock); + + // Take the socket filter lock + lck_rw_lock_exclusive(sock_filter_lock); + + // Cleanup every dead item + struct socket_filter_entry *entry; + for 
(entry = dead; entry; entry = dead) { + struct socket_filter_entry **nextpp; + + dead = entry->sfe_next_oncleanup; + + // Call the detach function if necessary - drop the lock + if ((entry->sfe_flags & SFEF_NODETACH) == 0 && + entry->sfe_filter->sf_filter.sf_detach) { + entry->sfe_flags |= SFEF_NODETACH; + lck_rw_unlock_exclusive(sock_filter_lock); + + // Warning - passing a potentially dead socket may be bad + entry->sfe_filter->sf_filter. + sf_detach(entry->sfe_cookie, entry->sfe_socket); + + lck_rw_lock_exclusive(sock_filter_lock); + } + + // Pull entry off the socket list -- if the socket still exists + if ((entry->sfe_flags & SFEF_NOSOCKET) == 0) { + for (nextpp = &entry->sfe_socket->so_filt; *nextpp; + nextpp = &(*nextpp)->sfe_next_onsocket) { + if (*nextpp == entry) { + *nextpp = entry->sfe_next_onsocket; + break; + } + } + } + + // Pull entry off the filter list + for (nextpp = &entry->sfe_filter->sf_entry_head; *nextpp; + nextpp = &(*nextpp)->sfe_next_onfilter) { + if (*nextpp == entry) { + *nextpp = entry->sfe_next_onfilter; + break; + } + } + + // Release the filter -- may drop lock, but that's okay + sflt_release_locked(entry->sfe_filter); + entry->sfe_socket = NULL; + entry->sfe_filter = NULL; + FREE(entry, M_IFADDR); + } + + // Drop the socket filter lock + lck_rw_unlock_exclusive(sock_filter_lock); + } + // Not reached +} + +static int +sflt_attach_locked( + struct socket *so, + struct socket_filter *filter, + int socklocked) +{ + int error = 0; + struct socket_filter_entry *entry = NULL; - if (TAILQ_FIRST(&proto->pr_filter_head) != NULL) { - lck_mtx_lock(sock_filter_lock); - TAILQ_FOREACH(filter, &proto->pr_filter_head, sf_protosw_next) { - sflt_attach_private(so, filter, 0, 0); + if (filter == NULL) + error = ENOENT; + + if (error == 0) { + /* allocate the socket filter entry */ + MALLOC(entry, struct socket_filter_entry *, sizeof(*entry), M_IFADDR, M_WAITOK); + if (entry == NULL) { + error = ENOMEM; + } + } + + if (error == 0) { + /* Initialize 
the socket filter entry */ + entry->sfe_cookie = NULL; + entry->sfe_flags = SFEF_ATTACHED; + entry->sfe_refcount = 1; // corresponds to SFEF_ATTACHED flag set + + /* Put the entry in the filter list */ + sflt_retain_locked(filter); + entry->sfe_filter = filter; + entry->sfe_next_onfilter = filter->sf_entry_head; + filter->sf_entry_head = entry; + + /* Put the entry on the socket filter list */ + entry->sfe_socket = so; + entry->sfe_next_onsocket = so->so_filt; + so->so_filt = entry; + + if (entry->sfe_filter->sf_filter.sf_attach) { + // Retain the entry while we call attach + sflt_entry_retain(entry); + + // Release the filter lock -- callers must be aware we will do this + lck_rw_unlock_exclusive(sock_filter_lock); + + // Unlock the socket + if (socklocked) + socket_unlock(so, 0); + + // It's finally safe to call the filter function + error = entry->sfe_filter->sf_filter.sf_attach(&entry->sfe_cookie, so); + + // Lock the socket again + if (socklocked) + socket_lock(so, 0); + + // Lock the filters again + lck_rw_lock_exclusive(sock_filter_lock); + + // If the attach function returns an error, this filter must be detached + if (error) { + entry->sfe_flags |= SFEF_NODETACH; // don't call sf_detach + sflt_detach_locked(entry); + } + + // Release the retain we held through the attach call + sflt_entry_release(entry); } - lck_mtx_unlock(sock_filter_lock); } + + return error; } -__private_extern__ void -sflt_termsock( - struct socket *so) +errno_t +sflt_attach_internal( + socket_t socket, + sflt_handle handle) { - struct socket_filter_entry *filter; - struct socket_filter_entry *filter_next; + if (socket == NULL || handle == 0) + return EINVAL; + + int result = EINVAL; + + lck_rw_lock_exclusive(sock_filter_lock); + + struct socket_filter *filter = NULL; + TAILQ_FOREACH(filter, &sock_filter_head, sf_global_next) { + if (filter->sf_filter.sf_handle == handle) break; + } + + if (filter) { + result = sflt_attach_locked(socket, filter, 1); + } - for (filter = so->so_filt; 
filter; filter = filter_next) { - filter_next = filter->sfe_next_onsocket; - sflt_detach_private(filter, 0); + lck_rw_unlock_exclusive(sock_filter_lock); + + return result; +} + +static void +sflt_detach_locked( + struct socket_filter_entry *entry) +{ + if ((entry->sfe_flags & SFEF_ATTACHED) != 0) { + entry->sfe_flags &= ~SFEF_ATTACHED; + sflt_entry_release(entry); } - so->so_filt = NULL; } +#pragma mark -- Socket Layer Hooks -- + __private_extern__ void -sflt_use( +sflt_initsock( struct socket *so) { - so->so_filteruse++; + struct protosw *proto = so->so_proto; + + lck_rw_lock_shared(sock_filter_lock); + if (TAILQ_FIRST(&proto->pr_filter_head) != NULL) { + // Promote lock to exclusive + if (!lck_rw_lock_shared_to_exclusive(sock_filter_lock)) + lck_rw_lock_exclusive(sock_filter_lock); + + // Warning: A filter unregistering will be pulled out of the list. + // This could happen while we drop the lock in sftl_attach_locked + // or sflt_release_locked. For this reason we retain a reference + // on the filter (or next_filter) while calling this function + // + // This protects us from a panic, but it could result in a + // socket being created without all of the global filters if + // we're attaching a filter as it is removed, if that's possible. + struct socket_filter *filter = TAILQ_FIRST(&proto->pr_filter_head); + sflt_retain_locked(filter); + + while (filter) + { + struct socket_filter *filter_next; + + // Warning: sflt_attach_private_locked will drop the lock + sflt_attach_locked(so, filter, 0); + + filter_next = TAILQ_NEXT(filter, sf_protosw_next); + if (filter_next) + sflt_retain_locked(filter_next); + + // Warning: filt_release_locked may remove the filter from the queue + sflt_release_locked(filter); + filter = filter_next; + } + } + lck_rw_done(sock_filter_lock); } +/* + * sflt_termsock + * + * Detaches all filters from the socket. 
+ */ + __private_extern__ void -sflt_unuse( +sflt_termsock( struct socket *so) { - so->so_filteruse--; - if (so->so_filteruse == 0) { - struct socket_filter_entry *filter; - struct socket_filter_entry *next_filter; - // search for detaching filters - for (filter = so->so_filt; filter; filter = next_filter) { - next_filter = filter->sfe_next_onsocket; + lck_rw_lock_exclusive(sock_filter_lock); + + struct socket_filter_entry *entry; + + while ((entry = so->so_filt) != NULL) { + // Pull filter off the socket + so->so_filt = entry->sfe_next_onsocket; + entry->sfe_flags |= SFEF_NOSOCKET; + + // Call detach + sflt_detach_locked(entry); + + // On sflt_termsock, we can't return until the detach function has been called + // Call the detach function - this is gross because the socket filter + // entry could be freed when we drop the lock, so we make copies on + // the stack and retain everything we need before dropping the lock + if ((entry->sfe_flags & SFEF_NODETACH) == 0 && + entry->sfe_filter->sf_filter.sf_detach) { + void *sfe_cookie = entry->sfe_cookie; + struct socket_filter *sfe_filter = entry->sfe_filter; - if (filter->sfe_flags & SFEF_DETACHUSEZERO) { - sflt_detach_private(filter, 0); - } + // Retain the socket filter + sflt_retain_locked(sfe_filter); + + // Mark that we've called the detach function + entry->sfe_flags |= SFEF_NODETACH; + + // Drop the lock around the call to the detach function + lck_rw_unlock_exclusive(sock_filter_lock); + sfe_filter->sf_filter.sf_detach(sfe_cookie, so); + lck_rw_lock_exclusive(sock_filter_lock); + + // Release the filter + sflt_release_locked(sfe_filter); } } + + lck_rw_unlock_exclusive(sock_filter_lock); } __private_extern__ void @@ -126,280 +456,595 @@ sflt_notify( sflt_event_t event, void *param) { - struct socket_filter_entry *filter; - int filtered = 0; + if (so->so_filt == NULL) return; + + struct socket_filter_entry *entry; + int unlocked = 0; - for (filter = so->so_filt; filter; - filter = filter->sfe_next_onsocket) { - 
if (filter->sfe_filter->sf_filter.sf_notify) { - if (filtered == 0) { - filtered = 1; - sflt_use(so); + lck_rw_lock_shared(sock_filter_lock); + for (entry = so->so_filt; entry; entry = entry->sfe_next_onsocket) { + if ((entry->sfe_flags & SFEF_ATTACHED) + && entry->sfe_filter->sf_filter.sf_notify) { + // Retain the filter entry and release the socket filter lock + sflt_entry_retain(entry); + lck_rw_unlock_shared(sock_filter_lock); + + // If the socket isn't already unlocked, unlock it + if (unlocked == 0) { + unlocked = 1; socket_unlock(so, 0); } - filter->sfe_filter->sf_filter.sf_notify( - filter->sfe_cookie, so, event, param); + + // Finally call the filter + entry->sfe_filter->sf_filter. + sf_notify(entry->sfe_cookie, so, event, param); + + // Take the socket filter lock again and release the entry + lck_rw_lock_shared(sock_filter_lock); + sflt_entry_release(entry); } } + lck_rw_unlock_shared(sock_filter_lock); - if (filtered != 0) { + if (unlocked != 0) { socket_lock(so, 0); - sflt_unuse(so); } } __private_extern__ int -sflt_data_in( - struct socket *so, - const struct sockaddr *from, - mbuf_t *data, - mbuf_t *control, - sflt_data_flag_t flags, - int *filtered) +sflt_ioctl( + struct socket *so, + u_long cmd, + caddr_t data) { - struct socket_filter_entry *filter; + if (so->so_filt == NULL) return 0; + + struct socket_filter_entry *entry; + int unlocked = 0; int error = 0; - int filtered_storage; - - if (filtered == NULL) - filtered = &filtered_storage; - *filtered = 0; - - for (filter = so->so_filt; filter && (error == 0); - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_data_in) { - if (*filtered == 0) { - *filtered = 1; - sflt_use(so); + + lck_rw_lock_shared(sock_filter_lock); + for (entry = so->so_filt; entry && error == 0; + entry = entry->sfe_next_onsocket) { + if ((entry->sfe_flags & SFEF_ATTACHED) + && entry->sfe_filter->sf_filter.sf_ioctl) { + // Retain the filter entry and release the socket filter lock + 
sflt_entry_retain(entry); + lck_rw_unlock_shared(sock_filter_lock); + + // If the socket isn't already unlocked, unlock it + if (unlocked == 0) { socket_unlock(so, 0); + unlocked = 1; } - error = filter->sfe_filter->sf_filter.sf_data_in( - filter->sfe_cookie, so, from, data, control, flags); + + // Call the filter + error = entry->sfe_filter->sf_filter. + sf_ioctl(entry->sfe_cookie, so, cmd, data); + + // Take the socket filter lock again and release the entry + lck_rw_lock_shared(sock_filter_lock); + sflt_entry_release(entry); } } - - if (*filtered != 0) { + lck_rw_unlock_shared(sock_filter_lock); + + if (unlocked) { socket_lock(so, 0); - sflt_unuse(so); } return error; } -/* sflt_attach_private - * - * Assumptions: If filter is not NULL, socket_filter_lock is held. - */ - __private_extern__ int -sflt_attach_private( - struct socket *so, - struct socket_filter *filter, - sflt_handle handle, - int sock_locked) +sflt_bind( + struct socket *so, + const struct sockaddr *nam) { - struct socket_filter_entry *entry = NULL; - int didlock = 0; - int error = 0; + if (so->so_filt == NULL) return 0; - if (filter == NULL) { - /* Find the filter by the handle */ - lck_mtx_lock(sock_filter_lock); - didlock = 1; - - TAILQ_FOREACH(filter, &sock_filter_head, sf_global_next) { - if (filter->sf_filter.sf_handle == handle) - break; + struct socket_filter_entry *entry; + int unlocked = 0; + int error = 0; + + lck_rw_lock_shared(sock_filter_lock); + for (entry = so->so_filt; entry && error == 0; + entry = entry->sfe_next_onsocket) { + if ((entry->sfe_flags & SFEF_ATTACHED) + && entry->sfe_filter->sf_filter.sf_bind) { + // Retain the filter entry and release the socket filter lock + sflt_entry_retain(entry); + lck_rw_unlock_shared(sock_filter_lock); + + // If the socket isn't already unlocked, unlock it + if (unlocked == 0) { + socket_unlock(so, 0); + unlocked = 1; + } + + // Call the filter + error = entry->sfe_filter->sf_filter. 
+ sf_bind(entry->sfe_cookie, so, nam); + + // Take the socket filter lock again and release the entry + lck_rw_lock_shared(sock_filter_lock); + sflt_entry_release(entry); } } + lck_rw_unlock_shared(sock_filter_lock); + + if (unlocked) { + socket_lock(so, 0); + } - if (filter == NULL) - error = ENOENT; + return error; +} + +__private_extern__ int +sflt_listen( + struct socket *so) +{ + if (so->so_filt == NULL) return 0; - if (error == 0) { - /* allocate the socket filter entry */ - MALLOC(entry, struct socket_filter_entry *, sizeof(*entry), M_IFADDR, M_WAITOK); - if (entry == NULL) { - error = ENOMEM; + struct socket_filter_entry *entry; + int unlocked = 0; + int error = 0; + + lck_rw_lock_shared(sock_filter_lock); + for (entry = so->so_filt; entry && error == 0; + entry = entry->sfe_next_onsocket) { + if ((entry->sfe_flags & SFEF_ATTACHED) + && entry->sfe_filter->sf_filter.sf_listen) { + // Retain the filter entry and release the socket filter lock + sflt_entry_retain(entry); + lck_rw_unlock_shared(sock_filter_lock); + + // If the socket isn't already unlocked, unlock it + if (unlocked == 0) { + socket_unlock(so, 0); + unlocked = 1; + } + + // Call the filter + error = entry->sfe_filter->sf_filter. 
+ sf_listen(entry->sfe_cookie, so); + + // Take the socket filter lock again and release the entry + lck_rw_lock_shared(sock_filter_lock); + sflt_entry_release(entry); } } + lck_rw_unlock_shared(sock_filter_lock); + + if (unlocked) { + socket_lock(so, 0); + } - if (error == 0) { - /* Initialize the socket filter entry and call the attach function */ - entry->sfe_filter = filter; - entry->sfe_socket = so; - entry->sfe_cookie = NULL; - entry->sfe_flags = 0; - if (entry->sfe_filter->sf_filter.sf_attach) { - filter->sf_usecount++; - - if (sock_locked) - socket_unlock(so, 0); - error = entry->sfe_filter->sf_filter.sf_attach(&entry->sfe_cookie, so); - if (sock_locked) - socket_lock(so, 0); - - filter->sf_usecount--; + return error; +} + +__private_extern__ int +sflt_accept( + struct socket *head, + struct socket *so, + const struct sockaddr *local, + const struct sockaddr *remote) +{ + if (so->so_filt == NULL) return 0; + + struct socket_filter_entry *entry; + int unlocked = 0; + int error = 0; + + lck_rw_lock_shared(sock_filter_lock); + for (entry = so->so_filt; entry && error == 0; + entry = entry->sfe_next_onsocket) { + if ((entry->sfe_flags & SFEF_ATTACHED) + && entry->sfe_filter->sf_filter.sf_accept) { + // Retain the filter entry and release the socket filter lock + sflt_entry_retain(entry); + lck_rw_unlock_shared(sock_filter_lock); - /* If the attach function returns an error, this filter is not attached */ - if (error) { - FREE(entry, M_IFADDR); - entry = NULL; + // If the socket isn't already unlocked, unlock it + if (unlocked == 0) { + socket_unlock(so, 0); + unlocked = 1; } + + // Call the filter + error = entry->sfe_filter->sf_filter. 
+ sf_accept(entry->sfe_cookie, head, so, local, remote); + + // Take the socket filter lock again and release the entry + lck_rw_lock_shared(sock_filter_lock); + sflt_entry_release(entry); } } - - if (error == 0) { - /* Put the entry in the socket list */ - entry->sfe_next_onsocket = so->so_filt; - so->so_filt = entry; - - /* Put the entry in the filter list */ - entry->sfe_next_onfilter = filter->sf_entry_head; - filter->sf_entry_head = entry; - - /* Incremenet the parent filter's usecount */ - filter->sf_usecount++; + lck_rw_unlock_shared(sock_filter_lock); + + if (unlocked) { + socket_lock(so, 0); } - if (didlock) { - lck_mtx_unlock(sock_filter_lock); + return error; +} + +__private_extern__ int +sflt_getsockname( + struct socket *so, + struct sockaddr **local) +{ + if (so->so_filt == NULL) return 0; + + struct socket_filter_entry *entry; + int unlocked = 0; + int error = 0; + + lck_rw_lock_shared(sock_filter_lock); + for (entry = so->so_filt; entry && error == 0; + entry = entry->sfe_next_onsocket) { + if ((entry->sfe_flags & SFEF_ATTACHED) + && entry->sfe_filter->sf_filter.sf_getsockname) { + // Retain the filter entry and release the socket filter lock + sflt_entry_retain(entry); + lck_rw_unlock_shared(sock_filter_lock); + + // If the socket isn't already unlocked, unlock it + if (unlocked == 0) { + socket_unlock(so, 0); + unlocked = 1; + } + + // Call the filter + error = entry->sfe_filter->sf_filter. 
+ sf_getsockname(entry->sfe_cookie, so, local); + + // Take the socket filter lock again and release the entry + lck_rw_lock_shared(sock_filter_lock); + sflt_entry_release(entry); + } + } + lck_rw_unlock_shared(sock_filter_lock); + + if (unlocked) { + socket_lock(so, 0); } return error; } +__private_extern__ int +sflt_getpeername( + struct socket *so, + struct sockaddr **remote) +{ + if (so->so_filt == NULL) return 0; + + struct socket_filter_entry *entry; + int unlocked = 0; + int error = 0; + + lck_rw_lock_shared(sock_filter_lock); + for (entry = so->so_filt; entry && error == 0; + entry = entry->sfe_next_onsocket) { + if ((entry->sfe_flags & SFEF_ATTACHED) + && entry->sfe_filter->sf_filter.sf_getpeername) { + // Retain the filter entry and release the socket filter lock + sflt_entry_retain(entry); + lck_rw_unlock_shared(sock_filter_lock); + + // If the socket isn't already unlocked, unlock it + if (unlocked == 0) { + socket_unlock(so, 0); + unlocked = 1; + } + + // Call the filter + error = entry->sfe_filter->sf_filter. + sf_getpeername(entry->sfe_cookie, so, remote); + + // Take the socket filter lock again and release the entry + lck_rw_lock_shared(sock_filter_lock); + sflt_entry_release(entry); + } + } + lck_rw_unlock_shared(sock_filter_lock); -/* sflt_detach_private - * - * Assumptions: if you pass 0 in for the second parameter, you are holding the - * socket lock for the socket the entry is attached to. If you pass 1 in for - * the second parameter, it is assumed that the entry is not on the filter's - * list and the socket lock is not held. 
- */ + if (unlocked) { + socket_lock(so, 0); + } + + return error; +} -static void -sflt_detach_private( - struct socket_filter_entry *entry, - int unregistering) +__private_extern__ int +sflt_connectin( + struct socket *so, + const struct sockaddr *remote) { - struct socket_filter_entry **next_ptr; - int detached = 0; - int found = 0; + if (so->so_filt == NULL) return 0; - if (unregistering) { - socket_lock(entry->sfe_socket, 0); - } + struct socket_filter_entry *entry; + int unlocked = 0; + int error = 0; - /* - * Attempt to find the entry on the filter's list and - * remove it. This prevents a filter detaching at the - * same time from attempting to remove the same entry. - */ - lck_mtx_lock(sock_filter_lock); - if (!unregistering) { - if ((entry->sfe_flags & SFEF_UNREGISTERING) != 0) { - /* - * Another thread is unregistering the filter, we - * need to avoid detaching the filter here so the - * socket won't go away. Bump up the socket's - * usecount so that it won't be freed until after - * the filter unregistration has been completed; - * at this point the caller has already held the - * socket's lock, so we can directly modify the - * usecount. - */ - if (!(entry->sfe_flags & SFEF_DETACHXREF)) { - entry->sfe_socket->so_usecount++; - entry->sfe_flags |= SFEF_DETACHXREF; + lck_rw_lock_shared(sock_filter_lock); + for (entry = so->so_filt; entry && error == 0; + entry = entry->sfe_next_onsocket) { + if ((entry->sfe_flags & SFEF_ATTACHED) + && entry->sfe_filter->sf_filter.sf_connect_in) { + // Retain the filter entry and release the socket filter lock + sflt_entry_retain(entry); + lck_rw_unlock_shared(sock_filter_lock); + + // If the socket isn't already unlocked, unlock it + if (unlocked == 0) { + socket_unlock(so, 0); + unlocked = 1; } - lck_mtx_unlock(sock_filter_lock); - return; + + // Call the filter + error = entry->sfe_filter->sf_filter. 
+ sf_connect_in(entry->sfe_cookie, so, remote); + + // Take the socket filter lock again and release the entry + lck_rw_lock_shared(sock_filter_lock); + sflt_entry_release(entry); } - for (next_ptr = &entry->sfe_filter->sf_entry_head; *next_ptr; - next_ptr = &((*next_ptr)->sfe_next_onfilter)) { - if (*next_ptr == entry) { - found = 1; - *next_ptr = entry->sfe_next_onfilter; - break; + } + lck_rw_unlock_shared(sock_filter_lock); + + if (unlocked) { + socket_lock(so, 0); + } + + return error; +} + +__private_extern__ int +sflt_connectout( + struct socket *so, + const struct sockaddr *nam) +{ + if (so->so_filt == NULL) return 0; + + struct socket_filter_entry *entry; + int unlocked = 0; + int error = 0; + + lck_rw_lock_shared(sock_filter_lock); + for (entry = so->so_filt; entry && error == 0; + entry = entry->sfe_next_onsocket) { + if ((entry->sfe_flags & SFEF_ATTACHED) + && entry->sfe_filter->sf_filter.sf_connect_out) { + // Retain the filter entry and release the socket filter lock + sflt_entry_retain(entry); + lck_rw_unlock_shared(sock_filter_lock); + + // If the socket isn't already unlocked, unlock it + if (unlocked == 0) { + socket_unlock(so, 0); + unlocked = 1; } - } - - if (!found && (entry->sfe_flags & SFEF_DETACHUSEZERO) == 0) { - lck_mtx_unlock(sock_filter_lock); - return; - } - } else { - /* - * Clear the removing flag. We will perform the detach here or - * request a delayed detach. Since we do an extra ref release - * below, bump up the usecount if we haven't done so. - */ - entry->sfe_flags &= ~SFEF_UNREGISTERING; - if (!(entry->sfe_flags & SFEF_DETACHXREF)) { - entry->sfe_socket->so_usecount++; - entry->sfe_flags |= SFEF_DETACHXREF; + + // Call the filter + error = entry->sfe_filter->sf_filter. 
+ sf_connect_out(entry->sfe_cookie, so, nam); + + // Take the socket filter lock again and release the entry + lck_rw_lock_shared(sock_filter_lock); + sflt_entry_release(entry); } } + lck_rw_unlock_shared(sock_filter_lock); - if (entry->sfe_socket->so_filteruse != 0) { - entry->sfe_flags |= SFEF_DETACHUSEZERO; - lck_mtx_unlock(sock_filter_lock); + if (unlocked) { + socket_lock(so, 0); + } - if (unregistering) { -#if DEBUG - printf("sflt_detach_private unregistering SFEF_DETACHUSEZERO " - "so%p so_filteruse %u so_usecount %d\n", - entry->sfe_socket, entry->sfe_socket->so_filteruse, - entry->sfe_socket->so_usecount); -#endif - socket_unlock(entry->sfe_socket, 0); + return error; +} + +__private_extern__ int +sflt_setsockopt( + struct socket *so, + struct sockopt *sopt) +{ + if (so->so_filt == NULL) return 0; + + struct socket_filter_entry *entry; + int unlocked = 0; + int error = 0; + + lck_rw_lock_shared(sock_filter_lock); + for (entry = so->so_filt; entry && error == 0; + entry = entry->sfe_next_onsocket) { + if ((entry->sfe_flags & SFEF_ATTACHED) + && entry->sfe_filter->sf_filter.sf_setoption) { + // Retain the filter entry and release the socket filter lock + sflt_entry_retain(entry); + lck_rw_unlock_shared(sock_filter_lock); + + // If the socket isn't already unlocked, unlock it + if (unlocked == 0) { + socket_unlock(so, 0); + unlocked = 1; + } + + // Call the filter + error = entry->sfe_filter->sf_filter. + sf_setoption(entry->sfe_cookie, so, sopt); + + // Take the socket filter lock again and release the entry + lck_rw_lock_shared(sock_filter_lock); + sflt_entry_release(entry); } - return; - } else { - /* - * Check if we are removing the last attached filter and - * the parent filter is being unregistered. 
- */ - entry->sfe_filter->sf_usecount--; - if ((entry->sfe_filter->sf_usecount == 0) && - (entry->sfe_filter->sf_flags & SFF_DETACHING) != 0) - detached = 1; - } - lck_mtx_unlock(sock_filter_lock); - - /* Remove from the socket list */ - for (next_ptr = &entry->sfe_socket->so_filt; *next_ptr; - next_ptr = &((*next_ptr)->sfe_next_onsocket)) { - if (*next_ptr == entry) { - *next_ptr = entry->sfe_next_onsocket; - break; + } + lck_rw_unlock_shared(sock_filter_lock); + + if (unlocked) { + socket_lock(so, 0); + } + + return error; +} + +__private_extern__ int +sflt_getsockopt( + struct socket *so, + struct sockopt *sopt) +{ + if (so->so_filt == NULL) return 0; + + struct socket_filter_entry *entry; + int unlocked = 0; + int error = 0; + + lck_rw_lock_shared(sock_filter_lock); + for (entry = so->so_filt; entry && error == 0; + entry = entry->sfe_next_onsocket) { + if ((entry->sfe_flags & SFEF_ATTACHED) + && entry->sfe_filter->sf_filter.sf_getoption) { + // Retain the filter entry and release the socket filter lock + sflt_entry_retain(entry); + lck_rw_unlock_shared(sock_filter_lock); + + // If the socket isn't already unlocked, unlock it + if (unlocked == 0) { + socket_unlock(so, 0); + unlocked = 1; + } + + // Call the filter + error = entry->sfe_filter->sf_filter. 
+ sf_getoption(entry->sfe_cookie, so, sopt); + + // Take the socket filter lock again and release the entry + lck_rw_lock_shared(sock_filter_lock); + sflt_entry_release(entry); } } + lck_rw_unlock_shared(sock_filter_lock); + + if (unlocked) { + socket_lock(so, 0); + } + + return error; +} + +__private_extern__ int +sflt_data_out( + struct socket *so, + const struct sockaddr *to, + mbuf_t *data, + mbuf_t *control, + sflt_data_flag_t flags) +{ + if (so->so_filt == NULL) return 0; - if (entry->sfe_filter->sf_filter.sf_detach) - entry->sfe_filter->sf_filter.sf_detach(entry->sfe_cookie, entry->sfe_socket); + struct socket_filter_entry *entry; + int unlocked = 0; + int setsendthread = 0; + int error = 0; - if (detached && entry->sfe_filter->sf_filter.sf_unregistered) { - entry->sfe_filter->sf_filter.sf_unregistered(entry->sfe_filter->sf_filter.sf_handle); - FREE(entry->sfe_filter, M_IFADDR); + lck_rw_lock_shared(sock_filter_lock); + for (entry = so->so_filt; entry && error == 0; + entry = entry->sfe_next_onsocket) { + if ((entry->sfe_flags & SFEF_ATTACHED) + && entry->sfe_filter->sf_filter.sf_data_out) { + // Retain the filter entry and release the socket filter lock + sflt_entry_retain(entry); + lck_rw_unlock_shared(sock_filter_lock); + + // If the socket isn't already unlocked, unlock it + if (unlocked == 0) { + if (so->so_send_filt_thread == NULL) { + setsendthread = 1; + so->so_send_filt_thread = current_thread(); + } + socket_unlock(so, 0); + unlocked = 1; + } + + // Call the filter + error = entry->sfe_filter->sf_filter. 
+ sf_data_out(entry->sfe_cookie, so, to, data, control, flags); + + // Take the socket filter lock again and release the entry + lck_rw_lock_shared(sock_filter_lock); + sflt_entry_release(entry); + } } + lck_rw_unlock_shared(sock_filter_lock); - if (unregistering) - socket_unlock(entry->sfe_socket, 1); + if (unlocked) { + socket_lock(so, 0); + if (setsendthread) so->so_send_filt_thread = NULL; + } + + return error; +} - FREE(entry, M_IFADDR); +__private_extern__ int +sflt_data_in( + struct socket *so, + const struct sockaddr *from, + mbuf_t *data, + mbuf_t *control, + sflt_data_flag_t flags) +{ + if (so->so_filt == NULL) return 0; + + struct socket_filter_entry *entry; + int error = 0; + int unlocked = 0; + + lck_rw_lock_shared(sock_filter_lock); + + for (entry = so->so_filt; entry && (error == 0); + entry = entry->sfe_next_onsocket) { + if ((entry->sfe_flags & SFEF_ATTACHED) && + entry->sfe_filter->sf_filter.sf_data_in) { + // Retain the filter entry and release the socket filter lock + sflt_entry_retain(entry); + lck_rw_unlock_shared(sock_filter_lock); + + // If the socket isn't already unlocked, unlock it + if (unlocked == 0) { + unlocked = 1; + socket_unlock(so, 0); + } + + // Call the filter + error = entry->sfe_filter->sf_filter.sf_data_in( + entry->sfe_cookie, so, from, data, control, flags); + + // Take the socket filter lock again and release the entry + lck_rw_lock_shared(sock_filter_lock); + sflt_entry_release(entry); + } + } + lck_rw_unlock_shared(sock_filter_lock); + + if (unlocked) { + socket_lock(so, 0); + } + + return error; } +#pragma mark -- KPI -- + errno_t sflt_attach( socket_t socket, sflt_handle handle) { - if (socket == NULL || handle == 0) - return EINVAL; - - return sflt_attach_private(socket, NULL, handle, 0); + socket_lock(socket, 1); + errno_t result = sflt_attach_internal(socket, handle); + socket_unlock(socket, 1); + return result; } errno_t @@ -407,34 +1052,29 @@ sflt_detach( socket_t socket, sflt_handle handle) { - struct 
socket_filter_entry *filter; + struct socket_filter_entry *entry; errno_t result = 0; if (socket == NULL || handle == 0) return EINVAL; - socket_lock(socket, 1); - - for (filter = socket->so_filt; filter; - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_handle == handle) + lck_rw_lock_exclusive(sock_filter_lock); + for (entry = socket->so_filt; entry; + entry = entry->sfe_next_onsocket) { + if (entry->sfe_filter->sf_filter.sf_handle == handle && + (entry->sfe_flags & SFEF_ATTACHED) != 0) { break; + } } - if (filter != NULL) { - sflt_detach_private(filter, 0); + if (entry != NULL) { + sflt_detach_locked(entry); } - else { - socket->so_filt = NULL; - result = ENOENT; - } - - socket_unlock(socket, 1); + lck_rw_unlock_exclusive(sock_filter_lock); return result; } - errno_t sflt_register( const struct sflt_filter *filter, @@ -481,7 +1121,7 @@ sflt_register( } bcopy(filter, &sock_filt->sf_filter, len); - lck_mtx_lock(sock_filter_lock); + lck_rw_lock_exclusive(sock_filter_lock); /* Look for an existing entry */ TAILQ_FOREACH(match, &sock_filter_head, sf_global_next) { if (match->sf_filter.sf_handle == @@ -489,7 +1129,7 @@ sflt_register( break; } } - + /* Add the entry only if there was no existing entry */ if (match == NULL) { TAILQ_INSERT_TAIL(&sock_filter_head, sock_filt, sf_global_next); @@ -498,9 +1138,10 @@ sflt_register( sf_protosw_next); sock_filt->sf_proto = pr; } + sflt_retain_locked(sock_filt); } - lck_mtx_unlock(sock_filter_lock); - + lck_rw_unlock_exclusive(sock_filter_lock); + if (match != NULL) { FREE(sock_filt, M_IFADDR); return EEXIST; @@ -514,62 +1155,39 @@ sflt_unregister( sflt_handle handle) { struct socket_filter *filter; - struct socket_filter_entry *entry_head = NULL; - struct socket_filter_entry *next_entry = NULL; + lck_rw_lock_exclusive(sock_filter_lock); - /* Find the entry and remove it from the global and protosw lists */ - lck_mtx_lock(sock_filter_lock); + /* Find the entry by the handle */ TAILQ_FOREACH(filter, 
&sock_filter_head, sf_global_next) { if (filter->sf_filter.sf_handle == handle) break; } if (filter) { + // Remove it from the global list TAILQ_REMOVE(&sock_filter_head, filter, sf_global_next); + + // Remove it from the protosw list if ((filter->sf_filter.sf_flags & SFLT_GLOBAL) != 0) { TAILQ_REMOVE(&filter->sf_proto->pr_filter_head, filter, sf_protosw_next); } - entry_head = filter->sf_entry_head; - filter->sf_entry_head = NULL; - filter->sf_flags |= SFF_DETACHING; - - for (next_entry = entry_head; next_entry; - next_entry = next_entry->sfe_next_onfilter) { - /* - * Mark this as "unregistering"; upon dropping the - * lock, another thread may win the race and attempt - * to detach a socket from it (e.g. as part of close) - * before we get a chance to detach. Setting this - * flag practically tells the other thread to go away. - * If the other thread wins, this causes an extra - * reference hold on the socket so that it won't be - * deallocated until after we finish with the detach - * for it below. If we win the race, the extra - * reference hold is also taken to compensate for the - * extra reference release when detach is called - * with a "1" for its second parameter. 
- */ - next_entry->sfe_flags |= SFEF_UNREGISTERING; + + // Detach from any sockets + struct socket_filter_entry *entry = NULL; + + for (entry = filter->sf_entry_head; entry; entry = entry->sfe_next_onfilter) { + sflt_detach_locked(entry); } + + // Release the filter + sflt_release_locked(filter); } - lck_mtx_unlock(sock_filter_lock); + lck_rw_unlock_exclusive(sock_filter_lock); if (filter == NULL) return ENOENT; - /* We need to detach the filter from any sockets it's attached to */ - if (entry_head == 0) { - if (filter->sf_filter.sf_unregistered) - filter->sf_filter.sf_unregistered(filter->sf_filter.sf_handle); - } else { - while (entry_head) { - next_entry = entry_head->sfe_next_onfilter; - sflt_detach_private(entry_head, 1); - entry_head = next_entry; - } - } - return 0; } diff --git a/bsd/kern/mach_loader.c b/bsd/kern/mach_loader.c index a6a0ab766..ab26c40b9 100644 --- a/bsd/kern/mach_loader.c +++ b/bsd/kern/mach_loader.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -59,6 +59,7 @@ #include #include +#include #include #include @@ -79,7 +80,6 @@ #include #include - /* * XXX vm/pmap.h should not treat these prototypes as MACH_KERNEL_PRIVATE * when KERNEL is defined. 
@@ -100,10 +100,6 @@ extern kern_return_t thread_state_initialize(thread_t thread); /* XXX should have prototypes in a shared header file */ extern int get_map_nentries(vm_map_t); -extern kern_return_t thread_userstack(thread_t, int, thread_state_t, - unsigned int, mach_vm_offset_t *, int *); -extern kern_return_t thread_entrypoint(thread_t, int, thread_state_t, - unsigned int, mach_vm_offset_t *); extern kern_return_t memory_object_signed(memory_object_control_t control, boolean_t is_signed); @@ -119,8 +115,11 @@ static load_result_t load_result_null = { .unixproc = 0, .dynlinker = 0, .customstack = 0, + .validentry = 0, .csflags = 0, - .uuid = { 0 } + .uuid = { 0 }, + .min_vm_addr = MACH_VM_MAX_ADDRESS, + .max_vm_addr = MACH_VM_MIN_ADDRESS }; /* @@ -135,6 +134,7 @@ parse_machfile( off_t file_offset, off_t macho_size, int depth, + int64_t slide, load_result_t *result ); @@ -147,10 +147,12 @@ load_segment( off_t macho_size, struct vnode *vp, vm_map_t map, + int64_t slide, load_result_t *result ); -int load_code_signature( +static load_return_t +load_code_signature( struct linkedit_data_command *lcp, struct vnode *vp, off_t macho_offset, @@ -171,13 +173,7 @@ static load_return_t load_unixthread( struct thread_command *tcp, thread_t thread, - load_result_t *result -); - -static load_return_t -load_thread( - struct thread_command *tcp, - thread_t thread, + int64_t slide, load_result_t *result ); @@ -193,7 +189,7 @@ load_threadstack( thread_t thread, uint32_t *ts, uint32_t total_size, - user_addr_t *user_stack, + mach_vm_offset_t *user_stack, int *customstack ); @@ -212,10 +208,12 @@ load_dylinker( vm_map_t map, thread_t thread, int depth, - load_result_t *result, - boolean_t is_64bit + int64_t slide, + load_result_t *result ); +struct macho_data; + static load_return_t get_macho_vnode( char *path, @@ -223,6 +221,7 @@ get_macho_vnode( struct mach_header *mach_header, off_t *file_offset, off_t *macho_size, + struct macho_data *macho_data, struct vnode **vpp ); @@ -246,7 
+245,7 @@ widen_segment_command(const struct segment_command *scp32, static void note_all_image_info_section(const struct segment_command_64 *scp, boolean_t is64, size_t section_size, const void *sections, - load_result_t *result) + int64_t slide, load_result_t *result) { const union { struct section s32; @@ -263,6 +262,7 @@ note_all_image_info_section(const struct segment_command_64 *scp, sizeof(sectionp->s64.sectname))) { result->all_image_info_addr = is64 ? sectionp->s64.addr : sectionp->s32.addr; + result->all_image_info_addr += slide; result->all_image_info_size = is64 ? sectionp->s64.size : sectionp->s32.size; return; @@ -270,7 +270,6 @@ note_all_image_info_section(const struct segment_command_64 *scp, } } - load_return_t load_machfile( struct image_params *imgp, @@ -293,6 +292,8 @@ load_machfile( boolean_t create_map = FALSE; int spawn = (imgp->ip_flags & IMGPF_SPAWN); task_t task = current_task(); + mach_vm_offset_t aslr_offset = 0; + kern_return_t kret; if (new_map == VM_MAP_NULL) { create_map = TRUE; @@ -312,10 +313,12 @@ load_machfile( if (create_map) { pmap = pmap_create((vm_map_size_t) 0, (imgp->ip_flags & IMGPF_IS_64BIT)); + pal_switch_pmap(thread, pmap, imgp->ip_flags & IMGPF_IS_64BIT); map = vm_map_create(pmap, 0, vm_compute_max_offset((imgp->ip_flags & IMGPF_IS_64BIT)), TRUE); + } else map = new_map; @@ -325,6 +328,20 @@ load_machfile( if ( (header->flags & MH_ALLOW_STACK_EXECUTION) ) vm_map_disable_NX(map); #endif + + /* Forcibly disallow execution from data pages even if the arch + * normally permits it. */ + if ((header->flags & MH_NO_HEAP_EXECUTION) && !(imgp->ip_flags & IMGPF_ALLOW_DATA_EXEC)) + vm_map_disallow_data_exec(map); + + /* + * Compute a random offset for ASLR. + */ + if (!(imgp->ip_flags & IMGPF_DISABLE_ASLR)) { + aslr_offset = random(); + aslr_offset %= 1 << ((imgp->ip_flags & IMGPF_IS_64BIT) ? 
16 : 8); + aslr_offset <<= PAGE_SHIFT; + } if (!result) result = &myresult; @@ -332,7 +349,7 @@ load_machfile( *result = load_result_null; lret = parse_machfile(vp, map, thread, header, file_offset, macho_size, - 0, result); + 0, (int64_t)aslr_offset, result); if (lret != LOAD_SUCCESS) { if (create_map) { @@ -362,7 +379,7 @@ load_machfile( if (create_map) { /* - * If this is an exec, then we are going to destory the old + * If this is an exec, then we are going to destroy the old * task, and it's correct to halt it; if it's spawn, the * task is not yet running, and it makes no sense. */ @@ -376,15 +393,16 @@ load_machfile( * task halting (wait for threads and then cleanup * task resources). */ - task_start_halt(task); + kret = task_start_halt(task); + if (kret != KERN_SUCCESS) { + return(kret); + } proc_transcommit(current_proc(), 0); task_complete_halt(task); + workqueue_exit(current_proc()); } - old_map = swap_task_map(old_task, thread, map); + old_map = swap_task_map(old_task, thread, map, !spawn); vm_map_clear_4GB_pagezero(old_map); - /* XXX L4 : For spawn the current task isn't running... */ - if (!spawn) - pmap_switch(pmap); /* Make sure we are using the new pmap */ vm_map_deallocate(old_map); } return(LOAD_SUCCESS); @@ -397,7 +415,9 @@ load_machfile( * bits in the file format itself. We read into the kernel buffer the * commands section, and then parse it in order to parse the mach-o file * format load_command segment(s). We are only interested in a subset of - * the total set of possible commands. + * the total set of possible commands. If "map"==VM_MAP_NULL or + * "thread"==THREAD_NULL, do not make permanent VM modifications, + * just preflight the parse. 
*/ static load_return_t @@ -409,6 +429,7 @@ parse_machfile( off_t file_offset, off_t macho_size, int depth, + int64_t aslr_offset, load_result_t *result ) { @@ -428,10 +449,10 @@ parse_machfile( proc_t p = current_proc(); /* XXXX */ int error; int resid=0; - task_t task; size_t mach_header_sz = sizeof(struct mach_header); boolean_t abi64; boolean_t got_code_signatures = FALSE; + int64_t slide = 0; if (header->magic == MH_MAGIC_64 || header->magic == MH_CIGAM_64) { @@ -445,8 +466,6 @@ parse_machfile( return(LOAD_FAILURE); } - task = (task_t)get_threadtask(thread); - depth++; /* @@ -522,11 +541,30 @@ parse_machfile( kfree(kl_addr, kl_size); return(LOAD_IOERROR); } - + + /* + * For PIE and dyld, slide everything by the ASLR offset. + */ + if ((header->flags & MH_PIE) || (header->filetype == MH_DYLINKER)) { + slide = aslr_offset; + } + /* * Scan through the commands, processing each one as necessary. */ - for (pass = 1; pass <= 2; pass++) { + for (pass = 1; pass <= 3; pass++) { + +#if CONFIG_EMBEDDED + /* + * Check that the entry point is contained in an executable segment + */ + if ((pass == 3) && (result->validentry == 0)) { + thread_state_initialize(thread); + ret = LOAD_FAILURE; + break; + } +#endif + /* * Loop through each of the load_commands indicated by the * Mach-O header; if an absurd value is provided, we just @@ -535,6 +573,7 @@ parse_machfile( */ offset = mach_header_sz; ncmds = header->ncmds; + while (ncmds--) { /* * Get a pointer to the command. 
@@ -565,7 +604,7 @@ parse_machfile( switch(lcp->cmd) { case LC_SEGMENT: case LC_SEGMENT_64: - if (pass != 1) + if (pass != 2) break; ret = load_segment(lcp, header->filetype, @@ -574,25 +613,20 @@ parse_machfile( macho_size, vp, map, + slide, result); break; - case LC_THREAD: - if (pass != 2) - break; - ret = load_thread((struct thread_command *)lcp, - thread, - result); - break; case LC_UNIXTHREAD: - if (pass != 2) + if (pass != 1) break; ret = load_unixthread( (struct thread_command *) lcp, - thread, + thread, + slide, result); break; case LC_LOAD_DYLINKER: - if (pass != 2) + if (pass != 3) break; if ((depth == 1) && (dlp == 0)) { dlp = (struct dylinker_command *)lcp; @@ -602,14 +636,14 @@ parse_machfile( } break; case LC_UUID: - if (pass == 2 && depth == 1) { + if (pass == 1 && depth == 1) { uulp = (struct uuid_command *)lcp; memcpy(&result->uuid[0], &uulp->uuid[0], sizeof(result->uuid)); } break; case LC_CODE_SIGNATURE: /* CODE SIGNING */ - if (pass != 2) + if (pass != 1) break; /* pager -> uip -> load signatures & store in uip @@ -633,7 +667,7 @@ parse_machfile( break; #if CONFIG_CODE_DECRYPTION case LC_ENCRYPTION_INFO: - if (pass != 2) + if (pass != 3) break; ret = set_code_unprotect( (struct encryption_info_command *) lcp, @@ -671,24 +705,15 @@ parse_machfile( } } - if (dlp != 0) - ret = load_dylinker(dlp, dlarchbits, map, thread, depth, result, abi64); + if (dlp != 0) { + /* load the dylinker, and always slide it by the ASLR + * offset regardless of PIE */ + ret = load_dylinker(dlp, dlarchbits, map, thread, depth, aslr_offset, result); + } if(depth == 1) { if (result->thread_count == 0) { ret = LOAD_FAILURE; - } else if ( abi64 ) { -#ifdef __ppc__ - /* Map in 64-bit commpage */ - /* - * PPC51: ppc64 is limited to 51-bit addresses. - * Memory above that limit is handled specially - * at the pmap level. 
- * - * -- wrong task for vfork()/spawn() - */ - pmap_map_sharedpage(current_task(), get_map_pmap(map)); -#endif /* __ppc__ */ } } } @@ -780,6 +805,7 @@ load_segment( off_t macho_size, struct vnode *vp, vm_map_t map, + int64_t slide, load_result_t *result ) { @@ -795,17 +821,21 @@ load_segment( if (LC_SEGMENT_64 == lcp->cmd) { segment_command_size = sizeof(struct segment_command_64); single_section_size = sizeof(struct section_64); - scp = (struct segment_command_64 *)lcp; } else { segment_command_size = sizeof(struct segment_command); single_section_size = sizeof(struct section); - scp = &segment_command; - widen_segment_command((struct segment_command *)lcp, scp); } if (lcp->cmdsize < segment_command_size) return (LOAD_BADMACHO); total_section_size = lcp->cmdsize - segment_command_size; + if (LC_SEGMENT_64 == lcp->cmd) + scp = (struct segment_command_64 *)lcp; + else { + scp = &segment_command; + widen_segment_command((struct segment_command *)lcp, scp); + } + /* * Make sure what we get from the file is really ours (as specified * by macho_size). @@ -833,27 +863,48 @@ load_segment( map_addr = trunc_page_64(scp->vmaddr); /* JVXXX note that in XNU TOT this is round instead of trunc for 64 bits */ if (seg_size == 0) return (KERN_SUCCESS); - /* XXX (4596982) this interferes with Rosetta, so limit to 64-bit tasks */ if (map_addr == 0 && map_size == 0 && seg_size != 0 && - scp->cmd == LC_SEGMENT_64 && (scp->initprot & VM_PROT_ALL) == VM_PROT_NONE && (scp->maxprot & VM_PROT_ALL) == VM_PROT_NONE) { /* - * This is a "page zero" segment: it starts at address 0, - * is not mapped from the binary file and is not accessible. - * User-space should never be able to access that memory, so - * make it completely off limits by raising the VM map's - * minimum offset. + * For PIE, extend page zero rather than moving it. Extending + * page zero keeps early allocations from falling predictably + * between the end of page zero and the beginning of the first + * slid segment. 
*/ - ret = vm_map_raise_min_offset(map, seg_size); - if (ret != KERN_SUCCESS) { - return (LOAD_FAILURE); + seg_size += slide; + slide = 0; + + /* XXX (4596982) this interferes with Rosetta, so limit to 64-bit tasks */ + if (scp->cmd == LC_SEGMENT_64) { + /* + * This is a "page zero" segment: it starts at address 0, + * is not mapped from the binary file and is not accessible. + * User-space should never be able to access that memory, so + * make it completely off limits by raising the VM map's + * minimum offset. + */ + ret = vm_map_raise_min_offset(map, seg_size); + if (ret != KERN_SUCCESS) { + return (LOAD_FAILURE); + } + return (LOAD_SUCCESS); } - return (LOAD_SUCCESS); } + /* If a non-zero slide was specified by the caller, apply now */ + map_addr += slide; + + if (map_addr < result->min_vm_addr) + result->min_vm_addr = map_addr; + if (map_addr+seg_size > result->max_vm_addr) + result->max_vm_addr = map_addr+seg_size; + + if (map == VM_MAP_NULL) + return (LOAD_SUCCESS); + map_offset = pager_offset + scp->fileoff; /* limited to 32 bits */ if (map_size > 0) { @@ -930,77 +981,12 @@ load_segment( result->all_image_info_addr == MACH_VM_MIN_ADDRESS) note_all_image_info_section(scp, LC_SEGMENT_64 == lcp->cmd, single_section_size, - (const char *)lcp + segment_command_size, result); - - return ret; -} - -static -load_return_t -load_thread( - struct thread_command *tcp, - thread_t thread, - load_result_t *result -) -{ - kern_return_t kret; - load_return_t lret; - task_t task; - int customstack=0; + (const char *)lcp + segment_command_size, slide, result); - if (tcp->cmdsize < sizeof(*tcp)) - return (LOAD_BADMACHO); - task = get_threadtask(thread); + if ((result->entry_point >= map_addr) && (result->entry_point < (map_addr + map_size))) + result->validentry = 1; - /* if count is 0; same as thread */ - if (result->thread_count != 0) { - kret = thread_create(task, &thread); - if (kret != KERN_SUCCESS) - return(LOAD_RESOURCE); - thread_deallocate(thread); - } - - lret = 
load_threadstate(thread, - (uint32_t *)(((vm_offset_t)tcp) + - sizeof(struct thread_command)), - tcp->cmdsize - sizeof(struct thread_command)); - if (lret != LOAD_SUCCESS) - return (lret); - - if (result->thread_count == 0) { - lret = load_threadstack(thread, - (uint32_t *)(((vm_offset_t)tcp) + - sizeof(struct thread_command)), - tcp->cmdsize - sizeof(struct thread_command), - &result->user_stack, - &customstack); - if (customstack) - result->customstack = 1; - else - result->customstack = 0; - - if (lret != LOAD_SUCCESS) - return(lret); - - lret = load_threadentry(thread, - (uint32_t *)(((vm_offset_t)tcp) + - sizeof(struct thread_command)), - tcp->cmdsize - sizeof(struct thread_command), - &result->entry_point); - if (lret != LOAD_SUCCESS) - return(lret); - } - /* - * Resume thread now, note that this means that the thread - * commands should appear after all the load commands to - * be sure they don't reference anything not yet mapped. - */ - else - thread_resume(thread); - - result->thread_count++; - - return(LOAD_SUCCESS); + return ret; } static @@ -1008,6 +994,7 @@ load_return_t load_unixthread( struct thread_command *tcp, thread_t thread, + int64_t slide, load_result_t *result ) { @@ -1017,9 +1004,12 @@ load_unixthread( if (tcp->cmdsize < sizeof(*tcp)) return (LOAD_BADMACHO); if (result->thread_count != 0) { -printf("load_unixthread: already have a thread!"); + printf("load_unixthread: already have a thread!"); return (LOAD_FAILURE); } + + if (thread == THREAD_NULL) + return (LOAD_SUCCESS); ret = load_threadstack(thread, (uint32_t *)(((vm_offset_t)tcp) + @@ -1031,9 +1021,12 @@ printf("load_unixthread: already have a thread!"); return(ret); if (customstack) - result->customstack = 1; + result->customstack = 1; else - result->customstack = 0; + result->customstack = 0; + + result->user_stack += slide; + ret = load_threadentry(thread, (uint32_t *)(((vm_offset_t)tcp) + sizeof(struct thread_command)), @@ -1042,6 +1035,8 @@ printf("load_unixthread: already have a 
thread!"); if (ret != LOAD_SUCCESS) return(ret); + result->entry_point += slide; + ret = load_threadstate(thread, (uint32_t *)(((vm_offset_t)tcp) + sizeof(struct thread_command)), @@ -1107,7 +1102,7 @@ load_threadstack( thread_t thread, uint32_t *ts, uint32_t total_size, - user_addr_t *user_stack, + mach_vm_offset_t *user_stack, int *customstack ) { @@ -1183,31 +1178,40 @@ load_threadentry( return(LOAD_SUCCESS); } +struct macho_data { + struct nameidata __nid; + union macho_vnode_header { + struct mach_header mach_header; + struct fat_header fat_header; + char __pad[512]; + } __header; +}; -static -load_return_t +static load_return_t load_dylinker( struct dylinker_command *lcp, integer_t archbits, vm_map_t map, thread_t thread, int depth, - load_result_t *result, - boolean_t is_64bit + int64_t slide, + load_result_t *result ) { char *name; char *p; struct vnode *vp = NULLVP; /* set by get_macho_vnode() */ - struct mach_header header; + struct mach_header *header; off_t file_offset = 0; /* set by get_macho_vnode() */ off_t macho_size = 0; /* set by get_macho_vnode() */ - vm_map_t copy_map; - load_result_t myresult; + load_result_t *myresult; kern_return_t ret; - vm_map_copy_t tmp; - mach_vm_offset_t dyl_start, map_addr; - mach_vm_size_t dyl_length; + struct macho_data *macho_data; + struct { + struct mach_header __header; + load_result_t __myresult; + struct macho_data __macho_data; + } *dyld_data; if (lcp->cmdsize < sizeof(*lcp)) return (LOAD_BADMACHO); @@ -1222,11 +1226,19 @@ load_dylinker( return(LOAD_BADMACHO); } while (*p++); - ret = get_macho_vnode(name, archbits, &header, &file_offset, &macho_size, &vp); + /* Allocate wad-of-data from heap to reduce excessively deep stacks */ + + MALLOC(dyld_data, void *, sizeof (*dyld_data), M_TEMP, M_WAITOK); + header = &dyld_data->__header; + myresult = &dyld_data->__myresult; + macho_data = &dyld_data->__macho_data; + + ret = get_macho_vnode(name, archbits, header, + &file_offset, &macho_size, macho_data, &vp); if (ret) - 
return (ret); - - myresult = load_result_null; + goto novp_out; + + *myresult = load_result_null; /* * First try to map dyld in directly. This should work most of @@ -1234,106 +1246,85 @@ load_dylinker( * mapped to its address. */ - ret = parse_machfile(vp, map, thread, &header, file_offset, macho_size, - depth, &myresult); + ret = parse_machfile(vp, map, thread, header, file_offset, + macho_size, depth, slide, myresult); /* * If it turned out something was in the way, then we'll take - * take this longer path to map dyld into a temporary map and - * copy it into destination map at a different address. + * this longer path to preflight dyld's vm ranges, then + * map it at a free location in the address space. */ if (ret == LOAD_NOSPACE) { + mach_vm_offset_t dyl_start, map_addr; + mach_vm_size_t dyl_length; + int64_t slide_amount; + + *myresult = load_result_null; /* - * Load the Mach-O. - * Use a temporary map to do the work. + * Preflight parsing the Mach-O file with a NULL + * map, which will return the ranges needed for a + * subsequent map attempt (with a slide) in "myresult" */ - copy_map = vm_map_create(pmap_create(vm_map_round_page(macho_size), - is_64bit), - get_map_min(map), get_map_max(map), TRUE); - if (VM_MAP_NULL == copy_map) { - ret = LOAD_RESOURCE; + ret = parse_machfile(vp, VM_MAP_NULL, THREAD_NULL, header, + file_offset, macho_size, depth, 0 /* slide */, myresult); + + if (ret != LOAD_SUCCESS) { goto out; } - - myresult = load_result_null; - ret = parse_machfile(vp, copy_map, thread, &header, - file_offset, macho_size, - depth, &myresult); - - if (ret) { - vm_map_deallocate(copy_map); + dyl_start = myresult->min_vm_addr; + dyl_length = myresult->max_vm_addr - myresult->min_vm_addr; + + dyl_length += slide; + + /* To find an appropriate load address, do a quick allocation */ + map_addr = dyl_start; + ret = mach_vm_allocate(map, &map_addr, dyl_length, VM_FLAGS_ANYWHERE); + if (ret != KERN_SUCCESS) { + ret = LOAD_NOSPACE; goto out; } - - if 
(get_map_nentries(copy_map) > 0) { - - dyl_start = mach_get_vm_start(copy_map); - dyl_length = mach_get_vm_end(copy_map) - dyl_start; - - map_addr = dyl_start; - ret = mach_vm_allocate(map, &map_addr, dyl_length, VM_FLAGS_ANYWHERE); - - if (ret != KERN_SUCCESS) { - vm_map_deallocate(copy_map); - ret = LOAD_NOSPACE; - goto out; - - } - ret = vm_map_copyin(copy_map, - (vm_map_address_t)dyl_start, - (vm_map_size_t)dyl_length, - TRUE, &tmp); - if (ret != KERN_SUCCESS) { - (void) vm_map_remove(map, - vm_map_trunc_page(map_addr), - vm_map_round_page(map_addr + dyl_length), - VM_MAP_NO_FLAGS); - vm_map_deallocate(copy_map); - goto out; - } - - ret = vm_map_copy_overwrite(map, - (vm_map_address_t)map_addr, - tmp, FALSE); - if (ret != KERN_SUCCESS) { - vm_map_copy_discard(tmp); - (void) vm_map_remove(map, - vm_map_trunc_page(map_addr), - vm_map_round_page(map_addr + dyl_length), - VM_MAP_NO_FLAGS); - vm_map_deallocate(copy_map); - goto out; - } - - if (map_addr != dyl_start) { - myresult.entry_point += (map_addr - dyl_start); - myresult.all_image_info_addr += - (map_addr - dyl_start); - } - } else { - ret = LOAD_FAILURE; + ret = mach_vm_deallocate(map, map_addr, dyl_length); + if (ret != KERN_SUCCESS) { + ret = LOAD_NOSPACE; + goto out; } + + if (map_addr < dyl_start) + slide_amount = -(int64_t)(dyl_start - map_addr); + else + slide_amount = (int64_t)(map_addr - dyl_start); + + slide_amount += slide; - vm_map_deallocate(copy_map); + *myresult = load_result_null; + + ret = parse_machfile(vp, map, thread, header, + file_offset, macho_size, depth, slide_amount, myresult); + + if (ret) { + goto out; + } } - + if (ret == LOAD_SUCCESS) { result->dynlinker = TRUE; - result->entry_point = myresult.entry_point; - result->all_image_info_addr = myresult.all_image_info_addr; - result->all_image_info_size = myresult.all_image_info_size; + result->entry_point = myresult->entry_point; + result->all_image_info_addr = myresult->all_image_info_addr; + result->all_image_info_size = 
myresult->all_image_info_size; } out: vnode_put(vp); +novp_out: + FREE(dyld_data, M_TEMP); return (ret); } -int +static load_return_t load_code_signature( struct linkedit_data_command *lcp, struct vnode *vp, @@ -1408,6 +1399,10 @@ load_code_signature( /* ubc_cs_blob_add() has consumed "addr" */ addr = 0; } + +#if CHECK_CS_VALIDATION_BITMAP + ubc_cs_validation_bitmap_allocate( vp ); +#endif blob = ubc_cs_blob_get(vp, cputype, -1); @@ -1435,9 +1430,9 @@ set_code_unprotect( struct vnode *vp) { int result, len; - char vpath[MAXPATHLEN]; pager_crypt_info_t crypt_info; const char * cryptname = 0; + char *vpath; size_t offset; struct segment_command_64 *seg64; @@ -1445,8 +1440,7 @@ set_code_unprotect( vm_map_offset_t map_offset, map_size; kern_return_t kr; - if (eip->cmdsize < sizeof(*eip)) - return LOAD_BADMACHO; + if (eip->cmdsize < sizeof(*eip)) return LOAD_BADMACHO; switch(eip->cryptid) { case 0: @@ -1464,13 +1458,22 @@ set_code_unprotect( return LOAD_BADMACHO; } + if (map == VM_MAP_NULL) return (LOAD_SUCCESS); + if (NULL == text_crypter_create) return LOAD_FAILURE; + + MALLOC_ZONE(vpath, char *, MAXPATHLEN, M_NAMEI, M_WAITOK); + if(vpath == NULL) return LOAD_FAILURE; + len = MAXPATHLEN; result = vn_getpath(vp, vpath, &len); - if(result) return result; + if(result) { + FREE_ZONE(vpath, MAXPATHLEN, M_NAMEI); + return LOAD_FAILURE; + } /* set up decrypter first */ - if(NULL==text_crypter_create) return LOAD_FAILURE; kr=text_crypter_create(&crypt_info, cryptname, (void*)vpath); + FREE_ZONE(vpath, MAXPATHLEN, M_NAMEI); if(kr) { printf("set_code_unprotect: unable to create decrypter %s, kr=%d\n", @@ -1549,6 +1552,7 @@ get_macho_vnode( struct mach_header *mach_header, off_t *file_offset, off_t *macho_size, + struct macho_data *data, struct vnode **vpp ) { @@ -1556,19 +1560,14 @@ get_macho_vnode( vfs_context_t ctx = vfs_context_current(); proc_t p = vfs_context_proc(ctx); kauth_cred_t kerncred; - struct nameidata nid, *ndp; + struct nameidata *ndp = &data->__nid; boolean_t 
is_fat; struct fat_arch fat_arch; - int error = LOAD_SUCCESS; + int error; int resid; - union { - struct mach_header mach_header; - struct fat_header fat_header; - char pad[512]; - } header; + union macho_vnode_header *header = &data->__header; off_t fsize = (off_t)0; - int err2; - + /* * Capture the kernel credential for use in the actual read of the * file, since the user doing the execution may have execute rights @@ -1579,10 +1578,8 @@ get_macho_vnode( */ kerncred = vfs_context_ucred(vfs_context_kernel()); - ndp = &nid; - /* init the namei data to point the file user's program name */ - NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx); + NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | LOCKLEAF, UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx); if ((error = namei(ndp)) != 0) { if (error == ENOENT) { @@ -1594,7 +1591,7 @@ get_macho_vnode( } nameidone(ndp); vp = ndp->ni_vp; - + /* check for regular file */ if (vp->v_type != VREG) { error = LOAD_PROTECT; @@ -1625,41 +1622,42 @@ get_macho_vnode( goto bad1; } - if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)&header, sizeof(header), 0, + if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)header, sizeof (*header), 0, UIO_SYSSPACE, IO_NODELOCKED, kerncred, &resid, p)) != 0) { error = LOAD_IOERROR; goto bad2; } - - if (header.mach_header.magic == MH_MAGIC || - header.mach_header.magic == MH_MAGIC_64) - is_fat = FALSE; - else if (header.fat_header.magic == FAT_MAGIC || - header.fat_header.magic == FAT_CIGAM) - is_fat = TRUE; - else { - error = LOAD_BADMACHO; - goto bad2; + + if (header->mach_header.magic == MH_MAGIC || + header->mach_header.magic == MH_MAGIC_64) { + is_fat = FALSE; + } else if (header->fat_header.magic == FAT_MAGIC || + header->fat_header.magic == FAT_CIGAM) { + is_fat = TRUE; + } else { + error = LOAD_BADMACHO; + goto bad2; } if (is_fat) { /* Look up our architecture in the fat file. 
*/ - error = fatfile_getarch_with_bits(vp, archbits, (vm_offset_t)(&header.fat_header), &fat_arch); + error = fatfile_getarch_with_bits(vp, archbits, + (vm_offset_t)(&header->fat_header), &fat_arch); if (error != LOAD_SUCCESS) goto bad2; /* Read the Mach-O header out of it */ - error = vn_rdwr(UIO_READ, vp, (caddr_t)&header.mach_header, - sizeof(header.mach_header), fat_arch.offset, - UIO_SYSSPACE, IO_NODELOCKED, kerncred, &resid, p); + error = vn_rdwr(UIO_READ, vp, (caddr_t)&header->mach_header, + sizeof (header->mach_header), fat_arch.offset, + UIO_SYSSPACE, IO_NODELOCKED, kerncred, &resid, p); if (error) { error = LOAD_IOERROR; goto bad2; } /* Is this really a Mach-O? */ - if (header.mach_header.magic != MH_MAGIC && - header.mach_header.magic != MH_MAGIC_64) { + if (header->mach_header.magic != MH_MAGIC && + header->mach_header.magic != MH_MAGIC_64) { error = LOAD_BADMACHO; goto bad2; } @@ -1677,25 +1675,23 @@ get_macho_vnode( * required, since the dynamic linker might work, but we will * refuse to load it because of this check. 
*/ - if ((cpu_type_t)(header.mach_header.cputype & CPU_ARCH_MASK) != archbits) - return(LOAD_BADARCH); + if ((cpu_type_t)(header->mach_header.cputype & CPU_ARCH_MASK) != archbits) { + error = LOAD_BADARCH; + goto bad2; + } *file_offset = 0; *macho_size = fsize; } - *mach_header = header.mach_header; + *mach_header = header->mach_header; *vpp = vp; ubc_setsize(vp, fsize); - return (error); bad2: - err2 = VNOP_CLOSE(vp, FREAD, ctx); - vnode_put(vp); - return (error); - + (void) VNOP_CLOSE(vp, FREAD, ctx); bad1: vnode_put(vp); return(error); diff --git a/bsd/kern/mach_loader.h b/bsd/kern/mach_loader.h index 413d1a9a7..fd8e585db 100644 --- a/bsd/kern/mach_loader.h +++ b/bsd/kern/mach_loader.h @@ -60,9 +60,12 @@ typedef struct _load_result { /* boolean_t */ unixproc :1, dynlinker :1, customstack :1, + validentry :1, :0; unsigned int csflags; unsigned char uuid[16]; + mach_vm_address_t min_vm_addr; + mach_vm_address_t max_vm_addr; } load_result_t; struct image_params; diff --git a/bsd/kern/mach_process.c b/bsd/kern/mach_process.c index 0df8a49c8..9aba89b96 100644 --- a/bsd/kern/mach_process.c +++ b/bsd/kern/mach_process.c @@ -94,6 +94,8 @@ #include /* cs_allow_invalid() */ +#include + /* XXX ken/bsd_kern.c - prototype should be in common header */ int get_task_userstop(task_t); @@ -127,6 +129,10 @@ ptrace(struct proc *p, struct ptrace_args *uap, int32_t *retval) AUDIT_ARG(value32, uap->data); if (uap->req == PT_DENY_ATTACH) { +#if (DEVELOPMENT || DEBUG) && defined(__arm__) + if (PE_i_can_has_debugger(NULL)) + return(0); +#endif proc_lock(p); if (ISSET(p->p_lflag, P_LTRACED)) { proc_unlock(p); @@ -164,8 +170,10 @@ ptrace(struct proc *p, struct ptrace_args *uap, int32_t *retval) struct proc *pproc=proc_find(p->p_oppid); proc_unlock(p); cs_allow_invalid(p); - cs_allow_invalid(pproc); - proc_rele(pproc); + if(pproc) { + cs_allow_invalid(pproc); + proc_rele(pproc); + } return(0); } if (uap->req == PT_SIGEXC) { @@ -434,7 +442,7 @@ cantrace(proc_t cur_procp, kauth_cred_t creds, 
proc_t traced_procp, int *errp) * (3) it's not owned by you, or is set-id on exec * (unless you're root). */ - if ((creds->cr_ruid != proc_ucred(traced_procp)->cr_ruid || + if ((kauth_cred_getruid(creds) != kauth_cred_getruid(proc_ucred(traced_procp)) || ISSET(traced_procp->p_flag, P_SUGID)) && (my_err = suser(creds, &cur_procp->p_acflag)) != 0) { *errp = my_err; diff --git a/bsd/kern/makesyscalls.sh b/bsd/kern/makesyscalls.sh index d8b11ba6c..301905871 100755 --- a/bsd/kern/makesyscalls.sh +++ b/bsd/kern/makesyscalls.sh @@ -190,6 +190,11 @@ s/\$//g printf "#include \n" > sysarg printf "\n#ifdef KERNEL\n" > sysarg printf "#ifdef __APPLE_API_PRIVATE\n" > sysarg + printf "/*\n" > sysarg + printf " * The kernel may support multiple userspace ABIs, and must use\n" > sysarg + printf " * argument structures with elements large enough for any of them.\n" > sysarg + printf "*/\n" > sysarg + printf "\n" > sysarg printf "#ifndef __arm__\n" > sysarg printf "#define\tPAD_(t)\t(sizeof(uint64_t) <= sizeof(t) \\\n " > sysarg printf "\t\t? 
0 : sizeof(uint64_t) - sizeof(t))\n" > sysarg @@ -205,8 +210,6 @@ s/\$//g printf "#define\tPADR_(t)\t0\n" > sysarg printf "#endif\n" > sysarg printf "\n__BEGIN_DECLS\n" > sysarg - printf "#ifndef __MUNGE_ONCE\n" > sysarg - printf "#define __MUNGE_ONCE\n" > sysarg printf "#ifndef __arm__\n" > sysarg printf "void munge_w(const void *, void *); \n" > sysarg printf "void munge_ww(const void *, void *); \n" > sysarg @@ -218,6 +221,10 @@ s/\$//g printf "void munge_wwwwwwww(const void *, void *); \n" > sysarg printf "void munge_wl(const void *, void *); \n" > sysarg printf "void munge_wlw(const void *, void *); \n" > sysarg + printf "void munge_wlwwwll(const void *, void *); \n" > sysarg + printf "void munge_wlwwwllw(const void *, void *); \n" > sysarg + printf "void munge_wlwwlwlw(const void *, void *); \n" > sysarg + printf "void munge_wllwwll(const void *, void *); \n" > sysarg printf "void munge_wwwl(const void *, void *); \n" > sysarg printf "void munge_wwwlw(const void *, void *); \n" > sysarg printf "void munge_wwwlww(const void *, void *); \n" > sysarg @@ -225,13 +232,18 @@ s/\$//g printf "void munge_wwwwlw(const void *, void *); \n" > sysarg printf "void munge_wwwwl(const void *, void *); \n" > sysarg printf "void munge_wwwwwl(const void *, void *); \n" > sysarg + printf "void munge_wwwwwlww(const void *, void *); \n" > sysarg + printf "void munge_wwwwwllw(const void *, void *); \n" > sysarg + printf "void munge_wwwwwlll(const void *, void *); \n" > sysarg printf "void munge_wwwwwwll(const void *, void *); \n" > sysarg + printf "void munge_wwwwwwl(const void *, void *); \n" > sysarg printf "void munge_wwwwwwlw(const void *, void *); \n" > sysarg printf "void munge_wsw(const void *, void *); \n" > sysarg printf "void munge_wws(const void *, void *); \n" > sysarg printf "void munge_wwwsw(const void *, void *); \n" > sysarg printf "void munge_llllll(const void *, void *); \n" > sysarg printf "#else \n" > sysarg + printf "/* ARM does not need mungers for BSD system 
calls */\n" > sysarg printf "#define munge_w NULL \n" > sysarg printf "#define munge_ww NULL \n" > sysarg printf "#define munge_www NULL \n" > sysarg @@ -242,6 +254,10 @@ s/\$//g printf "#define munge_wwwwwwww NULL \n" > sysarg printf "#define munge_wl NULL \n" > sysarg printf "#define munge_wlw NULL \n" > sysarg + printf "#define munge_wlwwwll NULL \n" > sysarg + printf "#define munge_wlwwwllw NULL \n" > sysarg + printf "#define munge_wlwwlwlw NULL \n" > sysarg + printf "#define munge_wllwwll NULL \n" > sysarg printf "#define munge_wwwl NULL \n" > sysarg printf "#define munge_wwwlw NULL \n" > sysarg printf "#define munge_wwwlww NULL\n" > sysarg @@ -249,22 +265,18 @@ s/\$//g printf "#define munge_wwwwl NULL \n" > sysarg printf "#define munge_wwwwlw NULL \n" > sysarg printf "#define munge_wwwwwl NULL \n" > sysarg + printf "#define munge_wwwwwlww NULL \n" > sysarg + printf "#define munge_wwwwwllw NULL \n" > sysarg + printf "#define munge_wwwwwlll NULL \n" > sysarg + printf "#define munge_wwwwwwl NULL \n" > sysarg printf "#define munge_wwwwwwlw NULL \n" > sysarg printf "#define munge_wsw NULL \n" > sysarg printf "#define munge_wws NULL \n" > sysarg printf "#define munge_wwwsw NULL \n" > sysarg printf "#define munge_llllll NULL \n" > sysarg - printf "#endif // ! 
__arm__\n" > sysarg - printf "#ifdef __ppc__\n" > sysarg - printf "void munge_d(const void *, void *); \n" > sysarg - printf "void munge_dd(const void *, void *); \n" > sysarg - printf "void munge_ddd(const void *, void *); \n" > sysarg - printf "void munge_dddd(const void *, void *); \n" > sysarg - printf "void munge_ddddd(const void *, void *); \n" > sysarg - printf "void munge_dddddd(const void *, void *); \n" > sysarg - printf "void munge_ddddddd(const void *, void *); \n" > sysarg - printf "void munge_dddddddd(const void *, void *); \n" > sysarg - printf "#else \n" > sysarg + printf "#endif /* __arm__ */\n" > sysarg + printf "\n" > sysarg + printf "/* Active 64-bit user ABIs do not need munging */\n" > sysarg printf "#define munge_d NULL \n" > sysarg printf "#define munge_dd NULL \n" > sysarg printf "#define munge_ddd NULL \n" > sysarg @@ -273,8 +285,6 @@ s/\$//g printf "#define munge_dddddd NULL \n" > sysarg printf "#define munge_ddddddd NULL \n" > sysarg printf "#define munge_dddddddd NULL \n" > sysarg - printf "#endif // __ppc__\n" > sysarg - printf "#endif /* !__MUNGE_ONCE */\n" > sysarg printf "\n" > sysarg @@ -592,7 +602,7 @@ s/\$//g argtype[i] == "socklen_t" || argtype[i] == "uint32_t" || argtype[i] == "int32_t" || argtype[i] == "sigset_t" || argtype[i] == "gid_t" || argtype[i] == "unsigned int" || argtype[i] == "mode_t" || argtype[i] == "key_t" || - argtype[i] == "mach_port_name_t") { + argtype[i] == "mach_port_name_t" || argtype[i] == "au_asid_t") { munge32 = munge32 "w" munge64 = munge64 "d" size32 += 4 diff --git a/bsd/kern/mcache.c b/bsd/kern/mcache.c index 14416f34a..a0c6cfb69 100644 --- a/bsd/kern/mcache.c +++ b/bsd/kern/mcache.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006-2007 Apple Inc. All rights reserved. + * Copyright (c) 2006-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -98,9 +98,6 @@ #define MCACHE_UNLOCK(l) lck_mtx_unlock(l) #define MCACHE_LOCK_TRY(l) lck_mtx_try_lock(l) -/* This should be in a header file */ -#define atomic_add_32(a, n) ((void) OSAddAtomic(n, a)) - static int ncpu; static lck_mtx_t *mcache_llock; static struct thread *mcache_llock_owner; @@ -137,8 +134,8 @@ static mcache_bkttype_t mcache_bkttype[] = { }; static mcache_t *mcache_create_common(const char *, size_t, size_t, - mcache_allocfn_t, mcache_freefn_t, mcache_auditfn_t, mcache_notifyfn_t, - void *, u_int32_t, int, int); + mcache_allocfn_t, mcache_freefn_t, mcache_auditfn_t, mcache_logfn_t, + mcache_notifyfn_t, void *, u_int32_t, int, int); static unsigned int mcache_slab_alloc(void *, mcache_obj_t ***, unsigned int, int); static void mcache_slab_free(void *, mcache_obj_t *, boolean_t); @@ -192,6 +189,7 @@ mcache_init(void) PAGE_SIZE, "mcache"); if (mcache_zone == NULL) panic("mcache_init: failed to allocate mcache zone\n"); + zone_change(mcache_zone, Z_CALLERACCT, FALSE); LIST_INIT(&mcache_head); @@ -233,7 +231,8 @@ mcache_create(const char *name, size_t bufsize, size_t align, u_int32_t flags, int wait) { return (mcache_create_common(name, bufsize, align, mcache_slab_alloc, - mcache_slab_free, mcache_slab_audit, NULL, NULL, flags, 1, wait)); + mcache_slab_free, mcache_slab_audit, NULL, NULL, NULL, flags, 1, + wait)); } /* @@ -244,10 +243,11 @@ mcache_create(const char *name, size_t bufsize, size_t align, __private_extern__ mcache_t * mcache_create_ext(const char *name, size_t bufsize, mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn, - mcache_notifyfn_t notifyfn, void *arg, u_int32_t flags, int wait) + mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg, + u_int32_t flags, int wait) { return (mcache_create_common(name, bufsize, 0, allocfn, - freefn, auditfn, notifyfn, arg, flags, 0, wait)); + freefn, auditfn, logfn, notifyfn, arg, flags, 0, wait)); } /* @@ -256,8 +256,8 @@ 
mcache_create_ext(const char *name, size_t bufsize, static mcache_t * mcache_create_common(const char *name, size_t bufsize, size_t align, mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn, - mcache_notifyfn_t notifyfn, void *arg, u_int32_t flags, int need_zone, - int wait) + mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg, + u_int32_t flags, int need_zone, int wait) { mcache_bkttype_t *btp; mcache_t *cp = NULL; @@ -267,7 +267,7 @@ mcache_create_common(const char *name, size_t bufsize, size_t align, char lck_name[64]; /* If auditing is on and print buffer is NULL, allocate it now */ - if ((flags & MCF_AUDIT) && mca_dump_buf == NULL) { + if ((flags & MCF_DEBUG) && mca_dump_buf == NULL) { int malloc_wait = (wait & MCR_NOSLEEP) ? M_NOWAIT : M_WAITOK; MALLOC(mca_dump_buf, char *, DUMP_MCA_BUF_SIZE, M_TEMP, malloc_wait | M_ZERO); @@ -313,6 +313,7 @@ mcache_create_common(const char *name, size_t bufsize, size_t align, cp->mc_slab_alloc = allocfn; cp->mc_slab_free = freefn; cp->mc_slab_audit = auditfn; + cp->mc_slab_log = logfn; cp->mc_slab_notify = notifyfn; cp->mc_private = need_zone ? 
cp : arg; cp->mc_bufsize = bufsize; @@ -467,6 +468,11 @@ mcache_alloc_ext(mcache_t *cp, mcache_obj_t **list, unsigned int num, int wait) /* If we got them all, return to caller */ if ((need -= objs) == 0) { MCACHE_UNLOCK(&ccp->cc_lock); + + if (!(cp->mc_flags & MCF_NOLEAKLOG) && + cp->mc_slab_log != NULL) + (*cp->mc_slab_log)(num, *top, TRUE); + if (cp->mc_flags & MCF_DEBUG) goto debug_alloc; @@ -534,11 +540,14 @@ mcache_alloc_ext(mcache_t *cp, mcache_obj_t **list, unsigned int num, int wait) } } + if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL) + (*cp->mc_slab_log)((num - need), *top, TRUE); + if (!(cp->mc_flags & MCF_DEBUG)) return (num - need); debug_alloc: - if (cp->mc_flags & MCF_VERIFY) { + if (cp->mc_flags & MCF_DEBUG) { mcache_obj_t **o = top; unsigned int n; @@ -561,7 +570,7 @@ mcache_alloc_ext(mcache_t *cp, mcache_obj_t **list, unsigned int num, int wait) } /* Invoke the slab layer audit callback if auditing is enabled */ - if ((cp->mc_flags & MCF_AUDIT) && cp->mc_slab_audit != NULL) + if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL) (*cp->mc_slab_audit)(cp->mc_private, *top, TRUE); return (num - need); @@ -678,8 +687,11 @@ mcache_free_ext(mcache_t *cp, mcache_obj_t *list) mcache_obj_t *nlist; mcache_bkt_t *bkt; + if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL) + (*cp->mc_slab_log)(0, list, FALSE); + /* Invoke the slab layer audit callback if auditing is enabled */ - if ((cp->mc_flags & MCF_AUDIT) && cp->mc_slab_audit != NULL) + if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL) (*cp->mc_slab_audit)(cp->mc_private, list, FALSE); MCACHE_LOCK(&ccp->cc_lock); @@ -899,7 +911,7 @@ mcache_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait) * the nearest 64-bit multiply; this is because we use * 64-bit memory access to set/check the pattern. 
*/ - if (flags & MCF_AUDIT) { + if (flags & MCF_DEBUG) { VERIFY(((intptr_t)base + rsize) <= ((intptr_t)buf + cp->mc_chunksize)); mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize); @@ -958,7 +970,7 @@ mcache_slab_free(void *arg, mcache_obj_t *list, __unused boolean_t purged) /* Get the original address since we're about to free it */ pbuf = (void **)((intptr_t)base - sizeof (void *)); - if (flags & MCF_AUDIT) { + if (flags & MCF_DEBUG) { VERIFY(((intptr_t)base + rsize) <= ((intptr_t)*pbuf + cp->mc_chunksize)); mcache_audit_free_verify(NULL, base, offset, rsize); @@ -1156,7 +1168,7 @@ mcache_bkt_destroy(mcache_t *cp, mcache_bkttype_t *btp, mcache_bkt_t *bkt, if (nobjs > 0) { mcache_obj_t *top = bkt->bkt_obj[nobjs - 1]; - if (cp->mc_flags & MCF_VERIFY) { + if (cp->mc_flags & MCF_DEBUG) { mcache_obj_t *o = top; int cnt = 0; diff --git a/bsd/kern/netboot.c b/bsd/kern/netboot.c index 6c4b5437e..664f03ef7 100644 --- a/bsd/kern/netboot.c +++ b/bsd/kern/netboot.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2001-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -57,7 +57,8 @@ #include #include - +#include +#include #include //#include @@ -81,10 +82,6 @@ const void * IOBSDRegistryEntryGetData(void * entry, const char * property_name, int * packet_length); -extern int vndevice_root_image(const char * path, char devname[], - dev_t * dev_p); -extern int di_root_image(const char *path, char devname[], dev_t *dev_p); - #define BOOTP_RESPONSE "bootp-response" #define BSDP_RESPONSE "bsdp-response" #define DHCP_RESPONSE "dhcp-response" @@ -92,16 +89,6 @@ extern int di_root_image(const char *path, char devname[], dev_t *dev_p); /* forward declarations */ int inet_aton(char * cp, struct in_addr * pin); -boolean_t netboot_iaddr(struct in_addr * iaddr_p); -boolean_t netboot_rootpath(struct in_addr * server_ip, - char * name, int name_len, - char * path, int path_len); -int netboot_setup(void); -int netboot_mountroot(void); -int netboot_root(void); - - - #define IP_FORMAT "%d.%d.%d.%d" #define IP_CH(ip) ((u_char *)ip) #define IP_LIST(ip) IP_CH(ip)[0],IP_CH(ip)[1],IP_CH(ip)[2],IP_CH(ip)[3] @@ -125,29 +112,10 @@ struct netboot_info { char * image_path; int image_path_length; NetBootImageType image_type; - boolean_t use_hdix; + char * second_image_path; + int second_image_path_length; }; -int -inet_aton(char * cp, struct in_addr * pin) -{ - u_char * b = (u_char *)pin; - int i; - char * p; - - for (p = cp, i = 0; i < 4; i++) { - u_long l = strtoul(p, 0, 0); - if (l > 255) - return (FALSE); - b[i] = l; - p = strchr(p, '.'); - if (i < 3 && p == NULL) - return (FALSE); - p++; - } - return (TRUE); -} - /* * Function: parse_booter_path * Purpose: @@ -251,7 +219,7 @@ static __inline__ boolean_t parse_netboot_path(char * path, struct in_addr * iaddr_p, char const * * host, char * * mount_dir, char * * image_path) { - static char tmp[MAX_IPv4_STR_LEN]; /* Danger - not thread safe */ + static char tmp[MAX_IPv4_STR_LEN]; /* Danger - not thread safe */ char * start; char * colon; @@ -346,35 +314,46 @@ 
get_root_path(char * root_path) } +static void +save_path(char * * str_p, int * length_p, char * path) +{ + *length_p = strlen(path) + 1; + *str_p = (char *)kalloc(*length_p); + strlcpy(*str_p, path, *length_p); + return; +} + static struct netboot_info * netboot_info_init(struct in_addr iaddr) { - struct netboot_info * info; + boolean_t have_root_path = FALSE; + struct netboot_info * info = NULL; char * root_path = NULL; - boolean_t use_hdix = TRUE; - char * vndevice = NULL; - - MALLOC_ZONE(vndevice, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); - if (vndevice == NULL) - panic("netboot_info_init: M_NAMEI zone exhausted"); - if (PE_parse_boot_argn("vndevice", vndevice, MAXPATHLEN) == TRUE) { - use_hdix = FALSE; - } - FREE_ZONE(vndevice, MAXPATHLEN, M_NAMEI); info = (struct netboot_info *)kalloc(sizeof(*info)); bzero(info, sizeof(*info)); info->client_ip = iaddr; info->image_type = kNetBootImageTypeUnknown; - info->use_hdix = use_hdix; /* check for a booter-specified path then a NetBoot path */ MALLOC_ZONE(root_path, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); if (root_path == NULL) panic("netboot_info_init: M_NAMEI zone exhausted"); - if (PE_parse_boot_argn("rp", root_path, MAXPATHLEN) == TRUE - || PE_parse_boot_argn("rootpath", root_path, MAXPATHLEN) == TRUE - || get_root_path(root_path) == TRUE) { + if (PE_parse_boot_argn("rp0", root_path, MAXPATHLEN) == TRUE + || PE_parse_boot_argn("rp", root_path, MAXPATHLEN) == TRUE + || PE_parse_boot_argn("rootpath", root_path, MAXPATHLEN) == TRUE) { + if (imageboot_format_is_valid(root_path)) { + printf("netboot_info_init: rp0='%s' isn't a network path," + " ignoring\n", root_path); + } + else { + have_root_path = TRUE; + } + } + if (have_root_path == FALSE) { + have_root_path = get_root_path(root_path); + } + if (have_root_path) { const char * server_name = NULL; char * mount_point = NULL; char * image_path = NULL; @@ -391,11 +370,11 @@ netboot_info_init(struct in_addr iaddr) strlcpy(info->server_name, server_name, 
info->server_name_length); strlcpy(info->mount_point, mount_point, info->mount_point_length); - printf("Server %s Mount %s", + printf("netboot: NFS Server %s Mount %s", server_name, info->mount_point); if (image_path != NULL) { boolean_t needs_slash = FALSE; - + info->image_path_length = strlen(image_path) + 1; if (image_path[0] != '/') { needs_slash = TRUE; @@ -416,16 +395,27 @@ netboot_info_init(struct in_addr iaddr) } else if (strncmp(root_path, kNetBootRootPathPrefixHTTP, strlen(kNetBootRootPathPrefixHTTP)) == 0) { - /* only HDIX supports HTTP */ info->image_type = kNetBootImageTypeHTTP; - info->use_hdix = TRUE; - info->image_path_length = strlen(root_path) + 1; - info->image_path = (char *)kalloc(info->image_path_length); - strlcpy(info->image_path, root_path, info->image_path_length); + save_path(&info->image_path, &info->image_path_length, + root_path); + printf("netboot: HTTP URL %s\n", info->image_path); } else { printf("netboot: root path uses unrecognized format\n"); } + + /* check for image-within-image */ + if (info->image_path != NULL) { + if (PE_parse_boot_argn(IMAGEBOOT_ROOT_ARG, root_path, MAXPATHLEN) + || PE_parse_boot_argn("rp1", root_path, MAXPATHLEN)) { + /* rp1/root-dmg is the second-level image */ + save_path(&info->second_image_path, &info->second_image_path_length, + root_path); + } + } + if (info->second_image_path != NULL) { + printf("netboot: nested image %s\n", info->second_image_path); + } } FREE_ZONE(root_path, MAXPATHLEN, M_NAMEI); return (info); @@ -446,6 +436,9 @@ netboot_info_free(struct netboot_info * * info_p) if (info->image_path) { kfree(info->image_path, info->image_path_length); } + if (info->second_image_path) { + kfree(info->second_image_path, info->second_image_path_length); + } kfree(info, sizeof(*info)); } *info_p = NULL; @@ -565,13 +558,10 @@ route_cmd(int cmd, struct in_addr d, struct in_addr g, mask.sin_len = sizeof(mask); mask.sin_family = AF_INET; mask.sin_addr = m; - lck_mtx_assert(rnh_lock, 
LCK_MTX_ASSERT_NOTOWNED); - lck_mtx_lock(rnh_lock); - error = rtrequest_scoped_locked(cmd, (struct sockaddr *)&dst, - (struct sockaddr *)&gw, - (struct sockaddr *)&mask, - flags, NULL, ifscope); - lck_mtx_unlock(rnh_lock); + + error = rtrequest_scoped(cmd, (struct sockaddr *)&dst, + (struct sockaddr *)&gw, (struct sockaddr *)&mask, flags, NULL, ifscope); + return (error); } @@ -751,53 +741,24 @@ netboot_mountroot(void) int netboot_setup() { - dev_t dev; int error = 0; if (S_netboot_info_p == NULL || S_netboot_info_p->image_path == NULL) { goto done; } - if (S_netboot_info_p->use_hdix) { - printf("netboot_setup: calling di_root_image\n"); - error = di_root_image(S_netboot_info_p->image_path, - (char *)rootdevice, &dev); - if (error) { - printf("netboot_setup: di_root_image: failed %d\n", error); - goto done; - } + printf("netboot_setup: calling imageboot_mount_image\n"); + error = imageboot_mount_image(S_netboot_info_p->image_path, -1); + if (error != 0) { + printf("netboot: failed to mount root image, %d\n", error); } - else { - printf("netboot_setup: calling vndevice_root_image\n"); - error = vndevice_root_image(S_netboot_info_p->image_path, - (char *)rootdevice, &dev); - if (error) { - printf("netboot_setup: vndevice_root_image: failed %d\n", error); - goto done; + else if (S_netboot_info_p->second_image_path != NULL) { + error = imageboot_mount_image(S_netboot_info_p->second_image_path, 0); + if (error != 0) { + printf("netboot: failed to mount second root image, %d\n", error); } } - rootdev = dev; - mountroot = NULL; - printf("netboot: root device 0x%x\n", (int32_t)rootdev); - error = vfs_mountroot(); - if (error == 0 && rootvnode != NULL) { - struct vnode *tvp; - struct vnode *newdp; - - /* Get the vnode for '/'. Set fdp->fd_fd.fd_cdir to reference it. 
*/ - if (VFS_ROOT(TAILQ_LAST(&mountlist,mntlist), &newdp, vfs_context_kernel())) - panic("netboot_setup: cannot find root vnode"); - vnode_ref(newdp); - vnode_put(newdp); - tvp = rootvnode; - vnode_rele(tvp); - filedesc0.fd_cdir = newdp; - rootvnode = newdp; - mount_list_lock(); - TAILQ_REMOVE(&mountlist, TAILQ_FIRST(&mountlist), mnt_list); - mount_list_unlock(); - mountlist.tqh_first->mnt_flag |= MNT_ROOTFS; - } + done: netboot_info_free(&S_netboot_info_p); return (error); diff --git a/bsd/kern/policy_check.c b/bsd/kern/policy_check.c new file mode 100644 index 000000000..e5573a99f --- /dev/null +++ b/bsd/kern/policy_check.c @@ -0,0 +1,511 @@ +#include +#include /* XXX printf() */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include /* OSBPrintBacktrace */ + + +/* forward declaration; see bsd_init.c */ +errno_t check_policy_init(int); +int get_thread_lock_count(thread_t th); /* forced forward */ + +/* + * Policy flags used when the policy is enabled + * + * Note: CHECK_POLICY_CHECK is probably not very useful unless you + * are kernel debugging and set a breakpoint. + */ +#define CHECK_POLICY_CHECK 0x00000001 /* Check on calls */ +#define CHECK_POLICY_FAIL 0x00000002 /* EPERM on fails */ +#define CHECK_POLICY_BACKTRACE 0x00000004 /* Show call stack on fails */ +#define CHECK_POLICY_PANIC 0x00000008 /* Panic on fails */ +#define CHECK_POLICY_PERIODIC 0x00000010 /* Show fails periodically */ + +static int policy_flags = 0; + + +#define CHECK_SET_INT_HOOK(x) .mpo_##x = (mpo_##x##_t *)common_int_hook, +#define CHECK_SET_VOID_HOOK(x) .mpo_##x = (mpo_##x##_t *)common_void_hook, + + +/* + * Init; currently, we only print our arrival notice. 
+ */ +static void +hook_policy_init(struct mac_policy_conf *mpc) +{ + printf("Policy '%s' = '%s' ready\n", mpc->mpc_name, mpc->mpc_fullname); +} + +static void +hook_policy_initbsd(struct mac_policy_conf *mpc) +{ + /* called with policy_grab_exclusive mutex held; exempt */ + printf("hook_policy_initbsd: %s\n", mpc->mpc_name); +} + + +/* Implementation */ +#define CLASS_PERIOD_LIMIT 10000 +#define CLASS_PERIOD_MULT 20 + +static int policy_check_event = 1; +static int policy_check_period = 1; +static int policy_check_next = CLASS_PERIOD_MULT; + + +static int +common_int_hook(void) +{ + int i; + int rv = 0; + + if ((i = get_thread_lock_count(current_thread())) != 0) { + /* + * fail the MACF check if we hold a lock; this assumes a + * non-void (authorization) MACF hook. + */ + if (policy_flags & CHECK_POLICY_FAIL) + rv = EPERM; + + /* + * display a backtrace if we hold a lock and we are not + * going to panic + */ + if ((policy_flags & (CHECK_POLICY_BACKTRACE | CHECK_POLICY_PANIC)) == CHECK_POLICY_BACKTRACE) { + if (policy_flags & CHECK_POLICY_PERIODIC) { + /* at exponentially increasing intervals */ + if (!(policy_check_event % policy_check_period)) { + if (policy_check_event <= policy_check_next || policy_check_period == CLASS_PERIOD_LIMIT) { + /* + * According to Derek, we could + * technically get a symbolicated name + * here, if we refactored some code + * and set the "keepsyms=1" boot + * argument... 
+ */ + OSReportWithBacktrace("calling MACF hook with mutex count %d (event %d) ", i, policy_check_event); + } + } else { + if (policy_check_period < CLASS_PERIOD_LIMIT) { + policy_check_next *= CLASS_PERIOD_MULT; + policy_check_period *= CLASS_PERIOD_MULT; + } + } + } else { + /* always */ + OSReportWithBacktrace("calling MACF hook with mutex count %d (event %d) ", i, policy_check_event); + } + } + + /* Panic */ + if (policy_flags & CHECK_POLICY_PANIC) + panic("calling MACF hook with mutex count %d\n", i); + + /* count for non-fatal tracing */ + policy_check_event++; + } + + return rv; +} + +static void +common_void_hook(void) +{ + (void)common_int_hook(); + + return; +} + + +/* + * Policy hooks; one per possible hook + */ +static struct mac_policy_ops policy_ops = { + + /* separate init */ + .mpo_policy_init = hook_policy_init, + .mpo_policy_initbsd = hook_policy_initbsd, + + /* operations which return int */ + CHECK_SET_INT_HOOK(audit_check_postselect) + CHECK_SET_INT_HOOK(audit_check_preselect) + CHECK_SET_INT_HOOK(bpfdesc_check_receive) + CHECK_SET_INT_HOOK(cred_check_label_update_execve) + CHECK_SET_INT_HOOK(cred_check_label_update) + CHECK_SET_INT_HOOK(cred_check_visible) + CHECK_SET_INT_HOOK(cred_label_externalize_audit) + CHECK_SET_INT_HOOK(cred_label_externalize) + CHECK_SET_INT_HOOK(cred_label_internalize) + CHECK_SET_INT_HOOK(file_check_change_offset) + CHECK_SET_INT_HOOK(file_check_create) + CHECK_SET_INT_HOOK(file_check_dup) + CHECK_SET_INT_HOOK(file_check_fcntl) + CHECK_SET_INT_HOOK(file_check_get) + CHECK_SET_INT_HOOK(file_check_get_offset) + CHECK_SET_INT_HOOK(file_check_inherit) + CHECK_SET_INT_HOOK(file_check_ioctl) + CHECK_SET_INT_HOOK(file_check_lock) + CHECK_SET_INT_HOOK(file_check_mmap) + CHECK_SET_INT_HOOK(file_check_receive) + CHECK_SET_INT_HOOK(file_check_set) + CHECK_SET_INT_HOOK(ifnet_check_label_update) + CHECK_SET_INT_HOOK(ifnet_check_transmit) + CHECK_SET_INT_HOOK(ifnet_label_externalize) + CHECK_SET_INT_HOOK(ifnet_label_internalize) + 
CHECK_SET_INT_HOOK(inpcb_check_deliver) + CHECK_SET_INT_HOOK(inpcb_label_init) + CHECK_SET_INT_HOOK(iokit_check_device) + CHECK_SET_INT_HOOK(iokit_check_open) + CHECK_SET_INT_HOOK(iokit_check_set_properties) + CHECK_SET_INT_HOOK(iokit_check_hid_control) + CHECK_SET_INT_HOOK(ipq_label_compare) + CHECK_SET_INT_HOOK(ipq_label_init) + CHECK_SET_INT_HOOK(lctx_check_label_update) + CHECK_SET_INT_HOOK(lctx_label_externalize) + CHECK_SET_INT_HOOK(lctx_label_internalize) + CHECK_SET_INT_HOOK(mbuf_label_init) + CHECK_SET_INT_HOOK(mount_check_fsctl) + CHECK_SET_INT_HOOK(mount_check_getattr) + CHECK_SET_INT_HOOK(mount_check_label_update) + CHECK_SET_INT_HOOK(mount_check_mount) + CHECK_SET_INT_HOOK(mount_check_remount) + CHECK_SET_INT_HOOK(mount_check_setattr) + CHECK_SET_INT_HOOK(mount_check_stat) + CHECK_SET_INT_HOOK(mount_check_umount) + CHECK_SET_INT_HOOK(mount_label_externalize) + CHECK_SET_INT_HOOK(mount_label_internalize) + CHECK_SET_INT_HOOK(pipe_check_ioctl) + CHECK_SET_INT_HOOK(pipe_check_kqfilter) + CHECK_SET_INT_HOOK(pipe_check_label_update) + CHECK_SET_INT_HOOK(pipe_check_read) + CHECK_SET_INT_HOOK(pipe_check_select) + CHECK_SET_INT_HOOK(pipe_check_stat) + CHECK_SET_INT_HOOK(pipe_check_write) + CHECK_SET_INT_HOOK(pipe_label_externalize) + CHECK_SET_INT_HOOK(pipe_label_internalize) + CHECK_SET_INT_HOOK(policy_syscall) + CHECK_SET_INT_HOOK(port_check_copy_send) + CHECK_SET_INT_HOOK(port_check_hold_receive) + CHECK_SET_INT_HOOK(port_check_hold_send_once) + CHECK_SET_INT_HOOK(port_check_hold_send) + CHECK_SET_INT_HOOK(port_check_label_update) + CHECK_SET_INT_HOOK(port_check_make_send_once) + CHECK_SET_INT_HOOK(port_check_make_send) + CHECK_SET_INT_HOOK(port_check_method) + CHECK_SET_INT_HOOK(port_check_move_receive) + CHECK_SET_INT_HOOK(port_check_move_send_once) + CHECK_SET_INT_HOOK(port_check_move_send) + CHECK_SET_INT_HOOK(port_check_receive) + CHECK_SET_INT_HOOK(port_check_send) + CHECK_SET_INT_HOOK(port_check_service) + CHECK_SET_INT_HOOK(port_label_compute) + 
CHECK_SET_INT_HOOK(posixsem_check_create) + CHECK_SET_INT_HOOK(posixsem_check_open) + CHECK_SET_INT_HOOK(posixsem_check_post) + CHECK_SET_INT_HOOK(posixsem_check_unlink) + CHECK_SET_INT_HOOK(posixsem_check_wait) + CHECK_SET_INT_HOOK(posixshm_check_create) + CHECK_SET_INT_HOOK(posixshm_check_mmap) + CHECK_SET_INT_HOOK(posixshm_check_open) + CHECK_SET_INT_HOOK(posixshm_check_stat) + CHECK_SET_INT_HOOK(posixshm_check_truncate) + CHECK_SET_INT_HOOK(posixshm_check_unlink) + CHECK_SET_INT_HOOK(priv_check) + /* relative ordinal location of "priv_grant" */ + CHECK_SET_INT_HOOK(proc_check_debug) + CHECK_SET_INT_HOOK(proc_check_fork) + CHECK_SET_INT_HOOK(proc_check_getaudit) + CHECK_SET_INT_HOOK(proc_check_getauid) + CHECK_SET_INT_HOOK(proc_check_getlcid) + CHECK_SET_INT_HOOK(proc_check_map_anon) + CHECK_SET_INT_HOOK(proc_check_mprotect) + CHECK_SET_INT_HOOK(proc_check_sched) + CHECK_SET_INT_HOOK(proc_check_setaudit) + CHECK_SET_INT_HOOK(proc_check_setauid) + CHECK_SET_INT_HOOK(proc_check_setlcid) + CHECK_SET_INT_HOOK(proc_check_signal) + CHECK_SET_INT_HOOK(proc_check_suspend_resume) + CHECK_SET_INT_HOOK(proc_check_wait) + CHECK_SET_INT_HOOK(socket_check_accept) + CHECK_SET_INT_HOOK(socket_check_accepted) + CHECK_SET_INT_HOOK(socket_check_bind) + CHECK_SET_INT_HOOK(socket_check_connect) + CHECK_SET_INT_HOOK(socket_check_create) + CHECK_SET_INT_HOOK(socket_check_deliver) + CHECK_SET_INT_HOOK(socket_check_kqfilter) + CHECK_SET_INT_HOOK(socket_check_label_update) + CHECK_SET_INT_HOOK(socket_check_listen) + CHECK_SET_INT_HOOK(socket_check_receive) + CHECK_SET_INT_HOOK(socket_check_received) + CHECK_SET_INT_HOOK(socket_check_select) + CHECK_SET_INT_HOOK(socket_check_send) + CHECK_SET_INT_HOOK(socket_check_stat) + CHECK_SET_INT_HOOK(socket_check_setsockopt) + CHECK_SET_INT_HOOK(socket_check_getsockopt) + CHECK_SET_INT_HOOK(socket_label_externalize) + CHECK_SET_INT_HOOK(socket_label_init) + CHECK_SET_INT_HOOK(socket_label_internalize) + 
CHECK_SET_INT_HOOK(socketpeer_label_externalize) + CHECK_SET_INT_HOOK(socketpeer_label_init) + CHECK_SET_INT_HOOK(system_check_acct) + CHECK_SET_INT_HOOK(system_check_audit) + CHECK_SET_INT_HOOK(system_check_auditctl) + CHECK_SET_INT_HOOK(system_check_auditon) + CHECK_SET_INT_HOOK(system_check_chud) + CHECK_SET_INT_HOOK(system_check_host_priv) + CHECK_SET_INT_HOOK(system_check_nfsd) + CHECK_SET_INT_HOOK(system_check_reboot) + CHECK_SET_INT_HOOK(system_check_settime) + CHECK_SET_INT_HOOK(system_check_swapoff) + CHECK_SET_INT_HOOK(system_check_swapon) + CHECK_SET_INT_HOOK(system_check_sysctl) + CHECK_SET_INT_HOOK(sysvmsq_check_enqueue) + CHECK_SET_INT_HOOK(sysvmsq_check_msgrcv) + CHECK_SET_INT_HOOK(sysvmsq_check_msgrmid) + CHECK_SET_INT_HOOK(sysvmsq_check_msqctl) + CHECK_SET_INT_HOOK(sysvmsq_check_msqget) + CHECK_SET_INT_HOOK(sysvmsq_check_msqrcv) + CHECK_SET_INT_HOOK(sysvmsq_check_msqsnd) + CHECK_SET_INT_HOOK(sysvsem_check_semctl) + CHECK_SET_INT_HOOK(sysvsem_check_semget) + CHECK_SET_INT_HOOK(sysvsem_check_semop) + CHECK_SET_INT_HOOK(sysvshm_check_shmat) + CHECK_SET_INT_HOOK(sysvshm_check_shmctl) + CHECK_SET_INT_HOOK(sysvshm_check_shmdt) + CHECK_SET_INT_HOOK(sysvshm_check_shmget) + CHECK_SET_INT_HOOK(proc_check_get_task_name) + CHECK_SET_INT_HOOK(proc_check_get_task) + CHECK_SET_INT_HOOK(task_label_externalize) + CHECK_SET_INT_HOOK(task_label_internalize) + CHECK_SET_INT_HOOK(vnode_check_access) + CHECK_SET_INT_HOOK(vnode_check_chdir) + CHECK_SET_INT_HOOK(vnode_check_chroot) + CHECK_SET_INT_HOOK(vnode_check_create) + CHECK_SET_INT_HOOK(vnode_check_deleteextattr) + CHECK_SET_INT_HOOK(vnode_check_exchangedata) + CHECK_SET_INT_HOOK(vnode_check_exec) + CHECK_SET_INT_HOOK(vnode_check_fsgetpath) + CHECK_SET_INT_HOOK(vnode_check_signature) + CHECK_SET_INT_HOOK(vnode_check_getattrlist) + CHECK_SET_INT_HOOK(vnode_check_getextattr) + CHECK_SET_INT_HOOK(vnode_check_ioctl) + CHECK_SET_INT_HOOK(vnode_check_kqfilter) + CHECK_SET_INT_HOOK(vnode_check_label_update) + 
CHECK_SET_INT_HOOK(vnode_check_link) + CHECK_SET_INT_HOOK(vnode_check_listextattr) + CHECK_SET_INT_HOOK(vnode_check_lookup) + CHECK_SET_INT_HOOK(vnode_check_open) + CHECK_SET_INT_HOOK(vnode_check_read) + CHECK_SET_INT_HOOK(vnode_check_readdir) + CHECK_SET_INT_HOOK(vnode_check_readlink) + CHECK_SET_INT_HOOK(vnode_check_rename_from) + CHECK_SET_INT_HOOK(vnode_check_rename_to) + CHECK_SET_INT_HOOK(vnode_check_revoke) + CHECK_SET_INT_HOOK(vnode_check_searchfs) + CHECK_SET_INT_HOOK(vnode_check_select) + CHECK_SET_INT_HOOK(vnode_check_setattrlist) + CHECK_SET_INT_HOOK(vnode_check_setextattr) + CHECK_SET_INT_HOOK(vnode_check_setflags) + CHECK_SET_INT_HOOK(vnode_check_setmode) + CHECK_SET_INT_HOOK(vnode_check_setowner) + CHECK_SET_INT_HOOK(vnode_check_setutimes) + CHECK_SET_INT_HOOK(vnode_check_stat) + CHECK_SET_INT_HOOK(vnode_check_truncate) + CHECK_SET_INT_HOOK(vnode_check_uipc_bind) + CHECK_SET_INT_HOOK(vnode_check_uipc_connect) + CHECK_SET_INT_HOOK(vnode_check_unlink) + CHECK_SET_INT_HOOK(vnode_check_write) + CHECK_SET_INT_HOOK(vnode_label_associate_extattr) + CHECK_SET_INT_HOOK(vnode_label_externalize_audit) + CHECK_SET_INT_HOOK(vnode_label_externalize) + CHECK_SET_INT_HOOK(vnode_label_internalize) + CHECK_SET_INT_HOOK(vnode_label_store) + CHECK_SET_INT_HOOK(vnode_label_update_extattr) + CHECK_SET_INT_HOOK(vnode_notify_create) + + /* operations which return void */ + CHECK_SET_VOID_HOOK(bpfdesc_label_init) + CHECK_SET_VOID_HOOK(bpfdesc_label_destroy) + CHECK_SET_VOID_HOOK(bpfdesc_label_associate) + CHECK_SET_VOID_HOOK(cred_label_associate_fork) + CHECK_SET_VOID_HOOK(cred_label_associate_kernel) + CHECK_SET_VOID_HOOK(cred_label_associate) + CHECK_SET_VOID_HOOK(cred_label_associate_user) + CHECK_SET_VOID_HOOK(cred_label_destroy) + CHECK_SET_VOID_HOOK(cred_label_init) + CHECK_SET_VOID_HOOK(cred_label_update_execve) + CHECK_SET_VOID_HOOK(cred_label_update) + CHECK_SET_VOID_HOOK(devfs_label_associate_device) + CHECK_SET_VOID_HOOK(devfs_label_associate_directory) + 
CHECK_SET_VOID_HOOK(devfs_label_copy) + CHECK_SET_VOID_HOOK(devfs_label_destroy) + CHECK_SET_VOID_HOOK(devfs_label_init) + CHECK_SET_VOID_HOOK(devfs_label_update) + CHECK_SET_VOID_HOOK(file_check_mmap_downgrade) + CHECK_SET_VOID_HOOK(file_label_associate) + CHECK_SET_VOID_HOOK(file_label_destroy) + CHECK_SET_VOID_HOOK(file_label_init) + CHECK_SET_VOID_HOOK(ifnet_label_associate) + CHECK_SET_VOID_HOOK(ifnet_label_copy) + CHECK_SET_VOID_HOOK(ifnet_label_destroy) + CHECK_SET_VOID_HOOK(ifnet_label_init) + CHECK_SET_VOID_HOOK(ifnet_label_recycle) + CHECK_SET_VOID_HOOK(ifnet_label_update) + CHECK_SET_VOID_HOOK(inpcb_label_associate) + CHECK_SET_VOID_HOOK(inpcb_label_destroy) + CHECK_SET_VOID_HOOK(inpcb_label_recycle) + CHECK_SET_VOID_HOOK(inpcb_label_update) + CHECK_SET_VOID_HOOK(ipq_label_associate) + CHECK_SET_VOID_HOOK(ipq_label_destroy) + CHECK_SET_VOID_HOOK(ipq_label_update) + CHECK_SET_VOID_HOOK(lctx_label_destroy) + CHECK_SET_VOID_HOOK(lctx_label_init) + CHECK_SET_VOID_HOOK(lctx_label_update) + CHECK_SET_VOID_HOOK(lctx_notify_create) + CHECK_SET_VOID_HOOK(lctx_notify_join) + CHECK_SET_VOID_HOOK(lctx_notify_leave) + CHECK_SET_VOID_HOOK(mbuf_label_associate_bpfdesc) + CHECK_SET_VOID_HOOK(mbuf_label_associate_ifnet) + CHECK_SET_VOID_HOOK(mbuf_label_associate_inpcb) + CHECK_SET_VOID_HOOK(mbuf_label_associate_ipq) + CHECK_SET_VOID_HOOK(mbuf_label_associate_linklayer) + CHECK_SET_VOID_HOOK(mbuf_label_associate_multicast_encap) + CHECK_SET_VOID_HOOK(mbuf_label_associate_netlayer) + CHECK_SET_VOID_HOOK(mbuf_label_associate_socket) + CHECK_SET_VOID_HOOK(mbuf_label_copy) + CHECK_SET_VOID_HOOK(mbuf_label_destroy) + CHECK_SET_VOID_HOOK(mount_label_associate) + CHECK_SET_VOID_HOOK(mount_label_destroy) + CHECK_SET_VOID_HOOK(mount_label_init) + CHECK_SET_VOID_HOOK(netinet_fragment) + CHECK_SET_VOID_HOOK(netinet_icmp_reply) + CHECK_SET_VOID_HOOK(netinet_tcp_reply) + CHECK_SET_VOID_HOOK(pipe_label_associate) + CHECK_SET_VOID_HOOK(pipe_label_copy) + 
CHECK_SET_VOID_HOOK(pipe_label_destroy) + CHECK_SET_VOID_HOOK(pipe_label_init) + CHECK_SET_VOID_HOOK(pipe_label_update) + CHECK_SET_VOID_HOOK(policy_destroy) + /* relative ordinal location of "policy_init" */ + /* relative ordinal location of "policy_initbsd" */ + CHECK_SET_VOID_HOOK(port_label_associate_kernel) + CHECK_SET_VOID_HOOK(port_label_associate) + CHECK_SET_VOID_HOOK(port_label_copy) + CHECK_SET_VOID_HOOK(port_label_destroy) + CHECK_SET_VOID_HOOK(port_label_init) + CHECK_SET_VOID_HOOK(port_label_update_cred) + CHECK_SET_VOID_HOOK(port_label_update_kobject) + CHECK_SET_VOID_HOOK(posixsem_label_associate) + CHECK_SET_VOID_HOOK(posixsem_label_destroy) + CHECK_SET_VOID_HOOK(posixsem_label_init) + CHECK_SET_VOID_HOOK(posixshm_label_associate) + CHECK_SET_VOID_HOOK(posixshm_label_destroy) + CHECK_SET_VOID_HOOK(posixshm_label_init) + CHECK_SET_VOID_HOOK(proc_label_destroy) + CHECK_SET_VOID_HOOK(proc_label_init) + CHECK_SET_VOID_HOOK(socket_label_associate_accept) + CHECK_SET_VOID_HOOK(socket_label_associate) + CHECK_SET_VOID_HOOK(socket_label_copy) + CHECK_SET_VOID_HOOK(socket_label_destroy) + CHECK_SET_VOID_HOOK(socket_label_update) + CHECK_SET_VOID_HOOK(socketpeer_label_associate_mbuf) + CHECK_SET_VOID_HOOK(socketpeer_label_associate_socket) + CHECK_SET_VOID_HOOK(socketpeer_label_destroy) + CHECK_SET_VOID_HOOK(sysvmsg_label_associate) + CHECK_SET_VOID_HOOK(sysvmsg_label_destroy) + CHECK_SET_VOID_HOOK(sysvmsg_label_init) + CHECK_SET_VOID_HOOK(sysvmsg_label_recycle) + CHECK_SET_VOID_HOOK(sysvmsq_label_associate) + CHECK_SET_VOID_HOOK(sysvmsq_label_destroy) + CHECK_SET_VOID_HOOK(sysvmsq_label_init) + CHECK_SET_VOID_HOOK(sysvmsq_label_recycle) + CHECK_SET_VOID_HOOK(sysvsem_label_associate) + CHECK_SET_VOID_HOOK(sysvsem_label_destroy) + CHECK_SET_VOID_HOOK(sysvsem_label_init) + CHECK_SET_VOID_HOOK(sysvsem_label_recycle) + CHECK_SET_VOID_HOOK(sysvshm_label_associate) + CHECK_SET_VOID_HOOK(sysvshm_label_destroy) + CHECK_SET_VOID_HOOK(sysvshm_label_init) + 
CHECK_SET_VOID_HOOK(sysvshm_label_recycle) + CHECK_SET_VOID_HOOK(task_label_associate_kernel) + CHECK_SET_VOID_HOOK(task_label_associate) + CHECK_SET_VOID_HOOK(task_label_copy) + CHECK_SET_VOID_HOOK(task_label_destroy) + CHECK_SET_VOID_HOOK(task_label_init) + CHECK_SET_VOID_HOOK(task_label_update) + CHECK_SET_VOID_HOOK(vnode_label_associate_devfs) + CHECK_SET_VOID_HOOK(vnode_label_associate_file) + CHECK_SET_VOID_HOOK(vnode_label_associate_pipe) + CHECK_SET_VOID_HOOK(vnode_label_associate_posixsem) + CHECK_SET_VOID_HOOK(vnode_label_associate_posixshm) + CHECK_SET_VOID_HOOK(vnode_label_associate_singlelabel) + CHECK_SET_VOID_HOOK(vnode_label_associate_socket) + CHECK_SET_VOID_HOOK(vnode_label_copy) + CHECK_SET_VOID_HOOK(vnode_label_destroy) + CHECK_SET_VOID_HOOK(vnode_label_init) + CHECK_SET_VOID_HOOK(vnode_label_recycle) + CHECK_SET_VOID_HOOK(vnode_label_update) + CHECK_SET_VOID_HOOK(vnode_notify_rename) + .mpo_reserved12 = common_void_hook, + .mpo_reserved14 = common_void_hook, + .mpo_reserved15 = common_void_hook, + .mpo_reserved16 = common_void_hook, + .mpo_reserved17 = common_void_hook, + .mpo_reserved18 = common_void_hook, + .mpo_reserved19 = common_void_hook, + .mpo_reserved20 = common_void_hook, + .mpo_reserved21 = common_void_hook, + .mpo_reserved22 = common_void_hook, + .mpo_reserved23 = common_void_hook, + .mpo_reserved24 = common_void_hook, + .mpo_reserved25 = common_void_hook, + .mpo_reserved26 = common_void_hook, + .mpo_reserved27 = common_void_hook, + .mpo_reserved28 = common_void_hook, + .mpo_reserved29 = common_void_hook, +}; + +/* + * Policy definition + */ +static struct mac_policy_conf policy_conf = { + .mpc_name = "CHECK", + .mpc_fullname = "Check Assumptions Policy", + .mpc_field_off = NULL, /* no label slot */ + .mpc_labelnames = NULL, /* no policy label names */ + .mpc_labelname_count = 0, /* count of label names is 0 */ + .mpc_ops = &policy_ops, /* policy operations */ + .mpc_loadtime_flags = 0, + .mpc_runtime_flags = 0, +}; + +static 
mac_policy_handle_t policy_handle; + +/* + * Init routine; for a loadable policy, this would be called during the KEXT + * initialization; we're going to call this from bsd_init() if the boot + * argument for checking is present. + */ +errno_t +check_policy_init(int flags) +{ + /* Only instantiate the module if we have been asked to do checking */ + if (!flags) + return 0; + + policy_flags = flags; + + return mac_policy_register(&policy_conf, &policy_handle, NULL); +} diff --git a/bsd/kern/posix_sem.c b/bsd/kern/posix_sem.c index a2cd627f1..c312d1b84 100644 --- a/bsd/kern/posix_sem.c +++ b/bsd/kern/posix_sem.c @@ -30,7 +30,7 @@ * All Rights Reserved. */ /* - * posix_shm.c : Support for POSIX semaphore APIs + * posix_sem.c : Support for POSIX semaphore APIs * * File: posix_sem.c * Author: Ananthakrishna Ramesh @@ -155,9 +155,9 @@ u_long psemhash; /* size of hash table - 1 */ long psemnument; /* number of cache entries allocated */ long posix_sem_max = 10000; /* tunable for max POSIX semaphores */ /* 10000 limits to ~1M of memory */ -SYSCTL_NODE(_kern, KERN_POSIX, posix, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Posix"); -SYSCTL_NODE(_kern_posix, OID_AUTO, sem, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Semaphores"); -SYSCTL_LONG (_kern_posix_sem, OID_AUTO, max, CTLFLAG_RW, &posix_sem_max, "max"); +SYSCTL_NODE(_kern, KERN_POSIX, posix, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Posix"); +SYSCTL_NODE(_kern_posix, OID_AUTO, sem, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Semaphores"); +SYSCTL_LONG (_kern_posix_sem, OID_AUTO, max, CTLFLAG_RW | CTLFLAG_LOCKED, &posix_sem_max, "max"); struct psemstats psemstats; /* cache effectiveness statistics */ @@ -524,8 +524,8 @@ sem_open(proc_t p, struct sem_open_args *uap, user_addr_t *retval) pinfo->psem_flags = PSEM_DEFINED | PSEM_INCREATE; pinfo->psem_usecount = 1; pinfo->psem_mode = cmode; - pinfo->psem_uid = kauth_cred_getuid(kauth_cred_get()); - pinfo->psem_gid = kauth_cred_get()->cr_gid; + pinfo->psem_uid = kauth_getuid(); + pinfo->psem_gid = kauth_getgid(); 
bcopy(pnbuf, &pinfo->psem_name[0], PSEMNAMLEN); pinfo->psem_name[PSEMNAMLEN]= 0; pinfo->psem_flags &= ~PSEM_DEFINED; @@ -643,39 +643,14 @@ sem_open(proc_t p, struct sem_open_args *uap, user_addr_t *retval) static int psem_access(struct pseminfo *pinfo, int mode, kauth_cred_t cred) { - mode_t mask; - int is_member; + int mode_req = ((mode & FREAD) ? S_IRUSR : 0) | + ((mode & FWRITE) ? S_IWUSR : 0); /* Otherwise, user id 0 always gets access. */ if (!suser(cred, NULL)) return (0); - mask = 0; - - /* Otherwise, check the owner. */ - if (kauth_cred_getuid(cred) == pinfo->psem_uid) { - if (mode & FREAD) - mask |= S_IRUSR; - if (mode & FWRITE) - mask |= S_IWUSR; - return ((pinfo->psem_mode & mask) == mask ? 0 : EACCES); - } - - /* Otherwise, check the groups. */ - if (kauth_cred_ismember_gid(cred, pinfo->psem_gid, &is_member) == 0 && is_member) { - if (mode & FREAD) - mask |= S_IRGRP; - if (mode & FWRITE) - mask |= S_IWGRP; - return ((pinfo->psem_mode & mask) == mask ? 0 : EACCES); - } - - /* Otherwise, check everyone else. */ - if (mode & FREAD) - mask |= S_IROTH; - if (mode & FWRITE) - mask |= S_IWOTH; - return ((pinfo->psem_mode & mask) == mask ? 
0 : EACCES); + return(posix_cred_access(cred, pinfo->psem_uid, pinfo->psem_gid, pinfo->psem_mode, mode_req)); } int @@ -809,6 +784,7 @@ sem_close(proc_t p, struct sem_close_args *uap, __unused int32_t *retval) proc_fdunlock(p); return(error); } + procfdtbl_markclosefd(p, fd); fileproc_drain(p, fp); fdrelse(p, fd); error = closef_locked(fp, fp->f_fglob, p); diff --git a/bsd/kern/posix_shm.c b/bsd/kern/posix_shm.c index 985538e69..617d1dc9f 100644 --- a/bsd/kern/posix_shm.c +++ b/bsd/kern/posix_shm.c @@ -178,7 +178,7 @@ static int pshm_write (struct fileproc *fp, struct uio *uio, static int pshm_ioctl (struct fileproc *fp, u_long com, caddr_t data, vfs_context_t ctx); static int pshm_select (struct fileproc *fp, int which, void *wql, vfs_context_t ctx); -static int pshm_close(struct pshmnode *pnode); +static int pshm_close(struct pshminfo *pinfo, int dropref); static int pshm_closefile (struct fileglob *fg, vfs_context_t ctx); static int pshm_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx); @@ -190,7 +190,7 @@ static void pshm_cache_delete(struct pshmcache *pcp); static void pshm_cache_purge(void); #endif /* NOT_USED */ static int pshm_cache_search(struct pshminfo **pshmp, struct pshmname *pnp, - struct pshmcache **pcache); + struct pshmcache **pcache, int addref); struct fileops pshmops = { pshm_read, pshm_write, pshm_ioctl, pshm_select, pshm_closefile, pshm_kqfilter, 0 }; @@ -229,7 +229,7 @@ pshm_lock_init( void ) static int pshm_cache_search(struct pshminfo **pshmp, struct pshmname *pnp, - struct pshmcache **pcache) + struct pshmcache **pcache, int addref) { struct pshmcache *pcp, *nnp; struct pshmhashhead *pcpp; @@ -258,6 +258,8 @@ pshm_cache_search(struct pshminfo **pshmp, struct pshmname *pnp, /* TOUCH(ncp); */ *pshmp = pcp->pshminfo; *pcache = pcp; + if (addref) + pcp->pshminfo->pshm_usecount++; return (-1); } @@ -287,7 +289,7 @@ pshm_cache_add(struct pshminfo *pshmp, struct pshmname *pnp, struct pshmcache *p /* if the entry has already been 
added by some one else return */ - if (pshm_cache_search(&dpinfo, pnp, &dpcp) == -1) { + if (pshm_cache_search(&dpinfo, pnp, &dpcp, 0) == -1) { return(EEXIST); } pshmnument++; @@ -438,6 +440,14 @@ shm_open(proc_t p, struct shm_open_args *uap, int32_t *retval) if (error) goto bad; + cmode &= ALLPERMS; + + fmode = FFLAGS(uap->oflag); + if ((fmode & (FREAD | FWRITE)) == 0) { + error = EINVAL; + goto bad; + } + /* * We allocate a new entry if we are less than the maximum * allowed and the one at the front of the LRU list is in use. @@ -466,27 +476,42 @@ shm_open(proc_t p, struct shm_open_args *uap, int32_t *retval) PSHM_SUBSYS_LOCK(); - error = pshm_cache_search(&pinfo, &nd, &pcache); + /* + * If we find the entry in the cache, this will take a reference, + * allowing us to unlock it for the permissions check. + */ + error = pshm_cache_search(&pinfo, &nd, &pcache, 1); + + PSHM_SUBSYS_UNLOCK(); if (error == ENOENT) { error = EINVAL; - goto bad_locked; - + goto bad; } + if (!error) { incache = 0; - } else + if (fmode & O_CREAT) { + /* create a new one (commit the allocation) */ + pinfo = new_pinfo; + pinfo->pshm_flags = PSHM_DEFINED | PSHM_INCREATE; + pinfo->pshm_usecount = 1; /* existence reference */ + pinfo->pshm_mode = cmode; + pinfo->pshm_uid = kauth_getuid(); + pinfo->pshm_gid = kauth_getgid(); + bcopy(pnbuf, &pinfo->pshm_name[0], PSHMNAMLEN); + pinfo->pshm_name[PSHMNAMLEN]=0; +#if CONFIG_MACF + error = mac_posixshm_check_create(kauth_cred_get(), nameptr); + if (error) { + goto bad; + } + mac_posixshm_label_associate(kauth_cred_get(), pinfo, nameptr); +#endif + } + } else { incache = 1; - fmode = FFLAGS(uap->oflag); - if ((fmode & (FREAD | FWRITE))==0) { - error = EINVAL; - goto bad_locked; - } - - cmode &= ALLPERMS; - - if (fmode & O_CREAT) { - if (incache) { + if (fmode & O_CREAT) { /* already exists */ if ((fmode & O_EXCL)) { AUDIT_ARG(posix_ipc_perm, pinfo->pshm_uid, @@ -495,65 +520,53 @@ shm_open(proc_t p, struct shm_open_args *uap, int32_t *retval) /* shm obj 
exists and opened O_EXCL */ error = EEXIST; - goto bad_locked; + goto bad; } if( pinfo->pshm_flags & PSHM_INDELETE) { error = ENOENT; - goto bad_locked; + goto bad; } AUDIT_ARG(posix_ipc_perm, pinfo->pshm_uid, pinfo->pshm_gid, pinfo->pshm_mode); #if CONFIG_MACF if ((error = mac_posixshm_check_open(kauth_cred_get(), pinfo))) { - goto bad_locked; + goto bad; } #endif if ( (error = pshm_access(pinfo, fmode, kauth_cred_get(), p)) ) { - goto bad_locked; - } - } else { - /* create a new one (commit the allocation) */ - pinfo = new_pinfo; - pinfo->pshm_flags = PSHM_DEFINED | PSHM_INCREATE; - pinfo->pshm_usecount = 1; /* existence reference */ - pinfo->pshm_mode = cmode; - pinfo->pshm_uid = kauth_cred_getuid(kauth_cred_get()); - pinfo->pshm_gid = kauth_cred_get()->cr_gid; - bcopy(pnbuf, &pinfo->pshm_name[0], PSHMNAMLEN); - pinfo->pshm_name[PSHMNAMLEN]=0; -#if CONFIG_MACF - error = mac_posixshm_check_create(kauth_cred_get(), nameptr); - if (error) { - goto bad_locked; + goto bad; } - mac_posixshm_label_associate(kauth_cred_get(), pinfo, nameptr); -#endif } - } else { + } + if (!(fmode & O_CREAT)) { if (!incache) { /* O_CREAT is not set and the object does not exist */ error = ENOENT; - goto bad_locked; + goto bad; } if( pinfo->pshm_flags & PSHM_INDELETE) { error = ENOENT; - goto bad_locked; + goto bad; } #if CONFIG_MACF if ((error = mac_posixshm_check_open(kauth_cred_get(), pinfo))) { - goto bad_locked; + goto bad; } #endif if ((error = pshm_access(pinfo, fmode, kauth_cred_get(), p))) { - goto bad_locked; + goto bad; } } if (fmode & O_TRUNC) { error = EINVAL; - goto bad_locked; + goto bad; } + + + PSHM_SUBSYS_LOCK(); + #if DIAGNOSTIC if (fmode & FWRITE) pinfo->pshm_writecount++; @@ -565,9 +578,13 @@ shm_open(proc_t p, struct shm_open_args *uap, int32_t *retval) if ( (error = pshm_cache_add(pinfo, &nd, pcp)) ) { goto bad_locked; } + /* + * add reference for the new entry; otherwise, we obtained + * one from the cache hit earlier. 
+ */ + pinfo->pshm_usecount++; } pinfo->pshm_flags &= ~PSHM_INCREATE; - pinfo->pshm_usecount++; /* extra reference for the new fd */ new_pnode->pinfo = pinfo; PSHM_SUBSYS_UNLOCK(); @@ -604,6 +621,17 @@ shm_open(proc_t p, struct shm_open_args *uap, int32_t *retval) bad_locked: PSHM_SUBSYS_UNLOCK(); bad: + /* + * If we obtained the entry from the cache, we need to drop the + * reference; holding the reference may have prevented unlinking, + * so we need to call pshm_close() to get the full effect. + */ + if (incache) { + PSHM_SUBSYS_LOCK(); + pshm_close(pinfo, 1); + PSHM_SUBSYS_UNLOCK(); + } + if (pcp != NULL) FREE(pcp, M_SHM); @@ -633,7 +661,8 @@ pshm_truncate(__unused proc_t p, struct fileproc *fp, __unused int fd, struct pshmnode * pnode ; kern_return_t kret; mem_entry_name_port_t mem_object; - mach_vm_size_t size, total_size, alloc_size; + mach_vm_size_t total_size, alloc_size; + memory_object_size_t mosize; struct pshmobj *pshmobj, *pshmobj_next, **pshmobj_next_p; #if CONFIG_MACF int error; @@ -658,7 +687,7 @@ pshm_truncate(__unused proc_t p, struct fileproc *fp, __unused int fd, return(EINVAL); } #if CONFIG_MACF - error = mac_posixshm_check_truncate(kauth_cred_get(), pinfo, size); + error = mac_posixshm_check_truncate(kauth_cred_get(), pinfo, length); if (error) { PSHM_SUBSYS_UNLOCK(); return(error); @@ -671,14 +700,14 @@ pshm_truncate(__unused proc_t p, struct fileproc *fp, __unused int fd, for (alloc_size = 0; alloc_size < total_size; - alloc_size += size) { + alloc_size += mosize) { PSHM_SUBSYS_UNLOCK(); - size = MIN(total_size - alloc_size, ANON_MAX_SIZE); + mosize = MIN(total_size - alloc_size, ANON_MAX_SIZE); kret = mach_make_memory_entry_64( VM_MAP_NULL, - &size, + &mosize, 0, MAP_MEM_NAMED_CREATE | VM_PROT_DEFAULT, &mem_object, @@ -699,7 +728,7 @@ pshm_truncate(__unused proc_t p, struct fileproc *fp, __unused int fd, PSHM_SUBSYS_LOCK(); pshmobj->pshmo_memobject = (void *) mem_object; - pshmobj->pshmo_size = size; + pshmobj->pshmo_size = mosize; 
pshmobj->pshmo_next = NULL; *pshmobj_next_p = pshmobj; @@ -787,39 +816,14 @@ pshm_stat(struct pshmnode *pnode, void *ub, int isstat64) int pshm_access(struct pshminfo *pinfo, int mode, kauth_cred_t cred, __unused proc_t p) { - mode_t mask; - int is_member; + int mode_req = ((mode & FREAD) ? S_IRUSR : 0) | + ((mode & FWRITE) ? S_IWUSR : 0); /* Otherwise, user id 0 always gets access. */ if (!suser(cred, NULL)) return (0); - mask = 0; - - /* Otherwise, check the owner. */ - if (kauth_cred_getuid(cred) == pinfo->pshm_uid) { - if (mode & FREAD) - mask |= S_IRUSR; - if (mode & FWRITE) - mask |= S_IWUSR; - return ((pinfo->pshm_mode & mask) == mask ? 0 : EACCES); - } - - /* Otherwise, check the groups. */ - if (kauth_cred_ismember_gid(cred, pinfo->pshm_gid, &is_member) == 0 && is_member) { - if (mode & FREAD) - mask |= S_IRGRP; - if (mode & FWRITE) - mask |= S_IWGRP; - return ((pinfo->pshm_mode & mask) == mask ? 0 : EACCES); - } - - /* Otherwise, check everyone else. */ - if (mode & FREAD) - mask |= S_IROTH; - if (mode & FWRITE) - mask |= S_IWOTH; - return ((pinfo->pshm_mode & mask) == mask ? 
0 : EACCES); + return(posix_cred_access(cred, pinfo->pshm_uid, pinfo->pshm_gid, pinfo->pshm_mode, mode_req)); } int @@ -1051,7 +1055,7 @@ shm_unlink(__unused proc_t p, struct shm_unlink_args *uap, } PSHM_SUBSYS_LOCK(); - error = pshm_cache_search(&pinfo, &nd, &pcache); + error = pshm_cache_search(&pinfo, &nd, &pcache, 0); if (error == ENOENT) { PSHM_SUBSYS_UNLOCK(); @@ -1132,16 +1136,16 @@ shm_unlink(__unused proc_t p, struct shm_unlink_args *uap, /* already called locked */ static int -pshm_close(struct pshmnode *pnode) +pshm_close(struct pshminfo *pinfo, int dropref) { - int error=0; - struct pshminfo *pinfo; + int error = 0; struct pshmobj *pshmobj, *pshmobj_next; - if ((pinfo = pnode->pinfo) == PSHMINFO_NULL) - return(EINVAL); - - if ((pinfo->pshm_flags & PSHM_ALLOCATED) != PSHM_ALLOCATED) { + /* + * If we are dropping the reference we took on the cache object, don't + * enforce the allocation requirement. + */ + if ( !dropref && ((pinfo->pshm_flags & PSHM_ALLOCATED) != PSHM_ALLOCATED)) { return(EINVAL); } #if DIAGNOSTIC @@ -1170,7 +1174,6 @@ pshm_close(struct pshmnode *pnode) PSHM_SUBSYS_LOCK(); FREE(pinfo,M_SHM); } - FREE(pnode, M_SHM); return (error); } @@ -1178,11 +1181,20 @@ pshm_close(struct pshmnode *pnode) static int pshm_closefile(struct fileglob *fg, __unused vfs_context_t ctx) { - int error; + int error = EINVAL; + struct pshmnode *pnode; PSHM_SUBSYS_LOCK(); - error = pshm_close(((struct pshmnode *)fg->fg_data)); + + if ((pnode = (struct pshmnode *)fg->fg_data) != NULL) { + if (pnode->pinfo != PSHMINFO_NULL) { + error = pshm_close(pnode->pinfo, 0); + } + FREE(pnode, M_SHM); + } + PSHM_SUBSYS_UNLOCK(); + return(error); } diff --git a/bsd/kern/proc_info.c b/bsd/kern/proc_info.c index d13a2df81..a907fad59 100644 --- a/bsd/kern/proc_info.c +++ b/bsd/kern/proc_info.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2005, 2010 Apple Computer, Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -57,6 +57,7 @@ #include #include #include +#include #include #include #include @@ -75,6 +76,8 @@ #include +#include + #include struct pshmnode; @@ -92,10 +95,12 @@ int proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t int proc_pidfdinfo(int pid, int flavor,int fd, user_addr_t buffer, uint32_t buffersize, int32_t * retval); int proc_kernmsgbuf(user_addr_t buffer, uint32_t buffersize, int32_t * retval); int proc_setcontrol(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t buffersize, int32_t * retval); +int proc_pidfileportinfo(int pid, int flavor, mach_port_name_t name, user_addr_t buffer, uint32_t buffersize, int32_t *retval); /* protos for procpidinfo calls */ int proc_pidfdlist(proc_t p, user_addr_t buffer, uint32_t buffersize, int32_t *retval); int proc_pidbsdinfo(proc_t p, struct proc_bsdinfo *pbsd, int zombie); +int proc_pidshortbsdinfo(proc_t p, struct proc_bsdshortinfo *pbsd_shortp, int zombie); int proc_pidtaskinfo(proc_t p, struct proc_taskinfo *ptinfo); int proc_pidallinfo(proc_t p, int flavor, uint64_t arg, user_addr_t buffer, uint32_t buffersize, int32_t *retval); int proc_pidthreadinfo(proc_t p, uint64_t arg, struct proc_threadinfo *pthinfo); @@ -106,6 +111,7 @@ int proc_pidregionpathinfo(proc_t p, uint64_t arg, user_addr_t buffer, uint32_t int proc_pidvnodepathinfo(proc_t p, uint64_t arg, user_addr_t buffer, uint32_t buffersize, int32_t *retval); int proc_pidpathinfo(proc_t p, uint64_t arg, user_addr_t buffer, uint32_t buffersize, int32_t *retval); int proc_pidworkqueueinfo(proc_t p, struct proc_workqueueinfo *pwqinfo); +int proc_pidfileportlist(proc_t p, user_addr_t buffer, uint32_t buffersize, int32_t *retval); /* protos for proc_pidfdinfo calls */ @@ -161,7 +167,9 @@ proc_info_internal(int callnum, int pid, int flavor, uint64_t arg, user_addr_t b return(proc_kernmsgbuf(buffer, buffersize, retval)); case 5: /* set on self properties proc_setcontrol */ 
return(proc_setcontrol(pid, flavor, arg, buffer, buffersize, retval)); - + case 6: /* proc_pidfileportinfo */ + return(proc_pidfileportinfo(pid, flavor, (mach_port_name_t)arg, buffer, buffersize, retval)); + default: return(EINVAL); } @@ -180,6 +188,7 @@ proc_listpids(uint32_t type, uint32_t typeinfo, user_addr_t buffer, uint32_t bu struct proc * p; struct tty * tp; int error = 0; + struct proclist *current_list; /* if the buffer is null, return num of procs */ if (buffer == (user_addr_t)0) { @@ -205,13 +214,20 @@ proc_listpids(uint32_t type, uint32_t typeinfo, user_addr_t buffer, uint32_t bu n = 0; ptr = (int *)kbuf; - LIST_FOREACH(p, &allproc, p_list) { + current_list = &allproc; +proc_loop: + LIST_FOREACH(p, current_list, p_list) { skip = 0; switch (type) { case PROC_PGRP_ONLY: if (p->p_pgrpid != (pid_t)typeinfo) skip = 1; break; + case PROC_PPID_ONLY: + if ((p->p_ppid != (pid_t)typeinfo) && (((p->p_lflag & P_LTRACED) == 0) || (p->p_oppid != (pid_t)typeinfo))) + skip = 1; + break; + case PROC_ALL_PIDS: skip = 0; break; @@ -245,7 +261,7 @@ proc_listpids(uint32_t type, uint32_t typeinfo, user_addr_t buffer, uint32_t bu uid_t uid; my_cred = kauth_cred_proc_ref(p); - uid = my_cred->cr_ruid; + uid = kauth_cred_getruid(my_cred); kauth_cred_unref(&my_cred); if (uid != (uid_t)typeinfo) skip = 1; @@ -256,11 +272,6 @@ proc_listpids(uint32_t type, uint32_t typeinfo, user_addr_t buffer, uint32_t bu break; }; - /* Do we have permission to look into this ? 
*/ - if (proc_security_policy(p) != 0) { - skip = 1; - } - if(skip == 0) { *ptr++ = p->p_pid; n++; @@ -269,15 +280,10 @@ proc_listpids(uint32_t type, uint32_t typeinfo, user_addr_t buffer, uint32_t bu break; } - if (n < numprocs) { - LIST_FOREACH(p, &zombproc, p_list) { - *ptr++ = p->p_pid; - n++; - if (n >= numprocs) - break; - } + if ((n < numprocs) && (current_list == &allproc)) { + current_list = &zombproc; + goto proc_loop; } - proc_list_unlock(); @@ -345,6 +351,119 @@ proc_pidfdlist(proc_t p, user_addr_t buffer, uint32_t buffersize, int32_t *retv return(error); } +/* + * Helper functions for proc_pidfileportlist. + */ +static int +proc_fileport_count(__unused mach_port_name_t name, + __unused struct fileglob *fg, void *arg) +{ + uint32_t *counter = arg; + + *counter += 1; + return (0); +} + +struct fileport_fdtype_args { + struct proc_fileportinfo *ffa_pfi; + struct proc_fileportinfo *ffa_pfi_end; +}; + +static int +proc_fileport_fdtype(mach_port_name_t name, struct fileglob *fg, void *arg) +{ + struct fileport_fdtype_args *ffa = arg; + + if (ffa->ffa_pfi != ffa->ffa_pfi_end) { + ffa->ffa_pfi->proc_fdtype = fg->fg_type; + ffa->ffa_pfi->proc_fileport = name; + ffa->ffa_pfi++; + return (0); /* keep walking */ + } else + return (-1); /* stop the walk! */ +} + +int +proc_pidfileportlist(proc_t p, + user_addr_t buffer, uint32_t buffersize, int32_t *retval) +{ + void *kbuf; + vm_size_t kbufsize; + struct proc_fileportinfo *pfi; + uint32_t needfileports, numfileports; + struct fileport_fdtype_args ffa; + int error; + + needfileports = buffersize / sizeof (*pfi); + if ((user_addr_t)0 == buffer || needfileports > (uint32_t)maxfiles) { + /* + * Either (i) the user is asking for a fileport count, + * or (ii) the number of fileports they're asking for is + * larger than the maximum number of open files (!); count + * them to bound subsequent heap allocations. 
+ */ + numfileports = 0; + switch (fileport_walk(p->task, + proc_fileport_count, &numfileports)) { + case KERN_SUCCESS: + break; + case KERN_RESOURCE_SHORTAGE: + return (ENOMEM); + case KERN_INVALID_TASK: + return (ESRCH); + default: + return (EINVAL); + } + + if (numfileports == 0) { + *retval = 0; /* none at all, bail */ + return (0); + } + if ((user_addr_t)0 == buffer) { + numfileports += 20; /* accelerate convergence */ + *retval = numfileports * sizeof (*pfi); + return (0); + } + if (needfileports > numfileports) + needfileports = numfileports; + } + + assert(buffersize >= PROC_PIDLISTFILEPORTS_SIZE); + + kbufsize = (vm_size_t)needfileports * sizeof (*pfi); + pfi = kbuf = kalloc(kbufsize); + if (kbuf == NULL) + return (ENOMEM); + bzero(kbuf, kbufsize); + + ffa.ffa_pfi = pfi; + ffa.ffa_pfi_end = pfi + needfileports; + + switch (fileport_walk(p->task, proc_fileport_fdtype, &ffa)) { + case KERN_SUCCESS: + error = 0; + pfi = ffa.ffa_pfi; + if ((numfileports = pfi - (typeof(pfi))kbuf) == 0) + break; + if (numfileports > needfileports) + panic("more fileports returned than requested"); + error = copyout(kbuf, buffer, numfileports * sizeof (*pfi)); + break; + case KERN_RESOURCE_SHORTAGE: + error = ENOMEM; + break; + case KERN_INVALID_TASK: + error = ESRCH; + break; + default: + error = EINVAL; + break; + } + kfree(kbuf, kbufsize); + if (error == 0) + *retval = numfileports * sizeof (*pfi); + return (error); +} int proc_pidbsdinfo(proc_t p, struct proc_bsdinfo * pbsd, int zombie) @@ -363,19 +482,21 @@ proc_pidbsdinfo(proc_t p, struct proc_bsdinfo * pbsd, int zombie) pbsd->pbi_xstatus = p->p_xstat; pbsd->pbi_pid = p->p_pid; pbsd->pbi_ppid = p->p_ppid; - pbsd->pbi_uid = my_cred->cr_uid; - pbsd->pbi_gid = my_cred->cr_gid; - pbsd->pbi_ruid = my_cred->cr_ruid; - pbsd->pbi_rgid = my_cred->cr_rgid; - pbsd->pbi_svuid = my_cred->cr_svuid; - pbsd->pbi_svgid = my_cred->cr_svgid; + pbsd->pbi_uid = kauth_cred_getuid(my_cred); + pbsd->pbi_gid = kauth_cred_getgid(my_cred); + 
pbsd->pbi_ruid = kauth_cred_getruid(my_cred); + pbsd->pbi_rgid = kauth_cred_getrgid(my_cred); + pbsd->pbi_svuid = kauth_cred_getsvuid(my_cred); + pbsd->pbi_svgid = kauth_cred_getsvgid(my_cred); kauth_cred_unref(&my_cred); pbsd->pbi_nice = p->p_nice; pbsd->pbi_start_tvsec = p->p_start.tv_sec; pbsd->pbi_start_tvusec = p->p_start.tv_usec; - bcopy(&p->p_comm, &pbsd->pbi_comm[0], MAXCOMLEN-1); - bcopy(&p->p_name, &pbsd->pbi_name[0], 2*MAXCOMLEN-1); + bcopy(&p->p_comm, &pbsd->pbi_comm[0], MAXCOMLEN); + pbsd->pbi_comm[MAXCOMLEN - 1] = '\0'; + bcopy(&p->p_name, &pbsd->pbi_name[0], 2*MAXCOMLEN); + pbsd->pbi_name[(2*MAXCOMLEN) - 1] = '\0'; pbsd->pbi_flags = 0; if ((p->p_flag & P_SYSTEM) == P_SYSTEM) @@ -392,6 +513,10 @@ proc_pidbsdinfo(proc_t p, struct proc_bsdinfo * pbsd, int zombie) pbsd->pbi_flags |= PROC_FLAG_CONTROLT; if ((p->p_flag & P_THCWD) == P_THCWD) pbsd->pbi_flags |= PROC_FLAG_THCWD; + if ((p->p_flag & P_SUGID) == P_SUGID) + pbsd->pbi_flags |= PROC_FLAG_PSUGID; + if ((p->p_flag & P_EXEC) == P_EXEC) + pbsd->pbi_flags |= PROC_FLAG_EXEC; if (sessionp != SESSION_NULL) { if (SESS_LEADER(p, sessionp)) @@ -422,6 +547,10 @@ proc_pidbsdinfo(proc_t p, struct proc_bsdinfo * pbsd, int zombie) break; }; + /* if process is a zombie skip bg state */ + if ((zombie == 0) && (p->p_stat != SZOMB) && (p->task != TASK_NULL)) + proc_get_darwinbgstate(p->task, &pbsd->pbi_flags); + if (zombie == 0) pbsd->pbi_nfiles = p->p_fd->fd_nfiles; if (pg != PGRP_NULL) { @@ -441,6 +570,72 @@ proc_pidbsdinfo(proc_t p, struct proc_bsdinfo * pbsd, int zombie) } +int +proc_pidshortbsdinfo(proc_t p, struct proc_bsdshortinfo * pbsd_shortp, int zombie) +{ + bzero(pbsd_shortp, sizeof(struct proc_bsdshortinfo)); + pbsd_shortp->pbsi_pid = p->p_pid; + pbsd_shortp->pbsi_ppid = p->p_ppid; + pbsd_shortp->pbsi_pgid = p->p_pgrpid; + pbsd_shortp->pbsi_status = p->p_stat; + bcopy(&p->p_comm, &pbsd_shortp->pbsi_comm[0], MAXCOMLEN); + pbsd_shortp->pbsi_comm[MAXCOMLEN - 1] = '\0'; + + pbsd_shortp->pbsi_flags = 0; + if 
((p->p_flag & P_SYSTEM) == P_SYSTEM) + pbsd_shortp->pbsi_flags |= PROC_FLAG_SYSTEM; + if ((p->p_lflag & P_LTRACED) == P_LTRACED) + pbsd_shortp->pbsi_flags |= PROC_FLAG_TRACED; + if ((p->p_lflag & P_LEXIT) == P_LEXIT) + pbsd_shortp->pbsi_flags |= PROC_FLAG_INEXIT; + if ((p->p_lflag & P_LPPWAIT) == P_LPPWAIT) + pbsd_shortp->pbsi_flags |= PROC_FLAG_PPWAIT; + if ((p->p_flag & P_LP64) == P_LP64) + pbsd_shortp->pbsi_flags |= PROC_FLAG_LP64; + if ((p->p_flag & P_CONTROLT) == P_CONTROLT) + pbsd_shortp->pbsi_flags |= PROC_FLAG_CONTROLT; + if ((p->p_flag & P_THCWD) == P_THCWD) + pbsd_shortp->pbsi_flags |= PROC_FLAG_THCWD; + if ((p->p_flag & P_SUGID) == P_SUGID) + pbsd_shortp->pbsi_flags |= PROC_FLAG_PSUGID; + if ((p->p_flag & P_EXEC) == P_EXEC) + pbsd_shortp->pbsi_flags |= PROC_FLAG_EXEC; + + switch(PROC_CONTROL_STATE(p)) { + case P_PCTHROTTLE: + pbsd_shortp->pbsi_flags |= PROC_FLAG_PC_THROTTLE; + break; + case P_PCSUSP: + pbsd_shortp->pbsi_flags |= PROC_FLAG_PC_SUSP; + break; + case P_PCKILL: + pbsd_shortp->pbsi_flags |= PROC_FLAG_PC_KILL; + break; + }; + + switch(PROC_ACTION_STATE(p)) { + case P_PCTHROTTLE: + pbsd_shortp->pbsi_flags |= PROC_FLAG_PA_THROTTLE; + break; + case P_PCSUSP: + pbsd_shortp->pbsi_flags |= PROC_FLAG_PA_SUSP; + break; + }; + + /* if process is a zombie skip bg state */ + if ((zombie == 0) && (p->p_stat != SZOMB) && (p->task != TASK_NULL)) + proc_get_darwinbgstate(p->task, &pbsd_shortp->pbsi_flags); + + pbsd_shortp->pbsi_uid = p->p_uid; + pbsd_shortp->pbsi_gid = p->p_gid; + pbsd_shortp->pbsi_ruid = p->p_ruid; + pbsd_shortp->pbsi_rgid = p->p_rgid; + pbsd_shortp->pbsi_svuid = p->p_svuid; + pbsd_shortp->pbsi_svgid = p->p_svgid; + + return(0); +} + int proc_pidtaskinfo(proc_t p, struct proc_taskinfo * ptinfo) { @@ -739,7 +934,7 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t bu int error = ENOTSUP; int gotref = 0; int findzomb = 0; - int refheld = 0; + int refheld = 0, shortversion = 0; uint32_t size; int zombie = 0; @@ 
-786,6 +981,14 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t bu else size = PROC_PIDWORKQUEUEINFO_SIZE; break; + case PROC_PIDT_SHORTBSDINFO: + size = PROC_PIDT_SHORTBSDINFO_SIZE; + break; + case PROC_PIDLISTFILEPORTS: + size = PROC_PIDLISTFILEPORTS_SIZE; + if (buffer == (user_addr_t)0) + size = 0; + break; default: return(EINVAL); } @@ -797,7 +1000,7 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t bu return(EOVERFLOW); } - if ((flavor != PROC_PIDTBSDINFO) && (flavor != PROC_PIDPATHINFO)) { + if ((flavor != PROC_PIDTBSDINFO) && (flavor != PROC_PIDPATHINFO) && (flavor != PROC_PIDT_SHORTBSDINFO)) { if ((p = proc_find(pid)) == PROC_NULL) { error = ESRCH; goto out; @@ -816,8 +1019,11 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t bu } break; + case PROC_PIDT_SHORTBSDINFO: + shortversion = 1; case PROC_PIDTBSDINFO: { struct proc_bsdinfo pbsd; + struct proc_bsdshortinfo pbsd_short; zombie = 0; if (arg) @@ -825,27 +1031,45 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t bu p = proc_find(pid); if (p == PROC_NULL) { if (findzomb) - p = pzfind(pid); + p = proc_find_zombref(pid); if (p == NULL) { error = ESRCH; goto out; } zombie = 1; - } else - refheld = 1; + } + refheld = 1; /* Do we have permission to look into this ? 
*/ - if ((error = proc_security_policy(p)) != 0) { - if (refheld != 0) - proc_rele(p); + if ((flavor != PROC_PIDT_SHORTBSDINFO) && ((error = proc_security_policy(p)) != 0)) { + if (refheld != 0) { + if (zombie != 0) + proc_drop_zombref(p); + else + proc_rele(p); + } goto out; } - error = proc_pidbsdinfo(p, &pbsd, zombie); - if (refheld != 0) - proc_rele(p); + if (shortversion != 0) { + error = proc_pidshortbsdinfo(p, &pbsd_short, zombie); + } else { + error = proc_pidbsdinfo(p, &pbsd, zombie); + } + if (refheld != 0) { + if (zombie != 0) + proc_drop_zombref(p); + else + proc_rele(p); + } if (error == 0) { - error = copyout(&pbsd, buffer, sizeof(struct proc_bsdinfo)); - if (error == 0) - *retval = sizeof(struct proc_bsdinfo); + if (shortversion != 0) { + error = copyout(&pbsd_short, buffer, sizeof(struct proc_bsdshortinfo)); + if (error == 0) + *retval = sizeof(struct proc_bsdshortinfo); + } else { + error = copyout(&pbsd, buffer, sizeof(struct proc_bsdinfo)); + if (error == 0) + *retval = sizeof(struct proc_bsdinfo); + } } } break; @@ -945,6 +1169,12 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t bu } break; + case PROC_PIDLISTFILEPORTS: { + error = proc_pidfileportlist(p, buffer, buffersize, + retval); + } + break; + default: error = ENOTSUP; } @@ -1297,9 +1527,8 @@ proc_pidfdinfo(int pid, int flavor, int fd, user_addr_t buffer, uint32_t buffer #endif /* NETAT */ default: { error = EINVAL; + goto out1; } - break; - } fp_drop(p, fd, fp , 0); @@ -1309,6 +1538,137 @@ out1 : return(error); } +/* + * Helper function for proc_pidfileportinfo + */ + +struct fileport_info_args { + int fia_flavor; + user_addr_t fia_buffer; + uint32_t fia_buffersize; + int32_t *fia_retval; +}; + +static kern_return_t +proc_fileport_info(__unused mach_port_name_t name, + struct fileglob *fg, void *arg) +{ + struct fileport_info_args *fia = arg; + struct fileproc __fileproc, *fp = &__fileproc; + int error; + + bzero(fp, sizeof (*fp)); + fp->f_fglob = fg; + + 
switch (fia->fia_flavor) { + case PROC_PIDFILEPORTVNODEPATHINFO: { + vnode_t vp; + + if (fg->fg_type != DTYPE_VNODE) { + error = ENOTSUP; + break; + } + vp = (struct vnode *)fg->fg_data; + error = pid_vnodeinfopath(vp, vnode_vid(vp), fp, 0, + fia->fia_buffer, fia->fia_buffersize, fia->fia_retval); + } break; + + case PROC_PIDFILEPORTSOCKETINFO: { + socket_t so; + + if (fg->fg_type != DTYPE_SOCKET) { + error = EOPNOTSUPP; + break; + } + so = (socket_t)fg->fg_data; + error = pid_socketinfo(so, fp, 0, + fia->fia_buffer, fia->fia_buffersize, fia->fia_retval); + } break; + + case PROC_PIDFILEPORTPSHMINFO: { + struct pshmnode *pshm; + + if (fg->fg_type != DTYPE_PSXSHM) { + error = EBADF; /* ick - mirror fp_getfpshm */ + break; + } + pshm = (struct pshmnode *)fg->fg_data; + error = pid_pshminfo(pshm, fp, 0, + fia->fia_buffer, fia->fia_buffersize, fia->fia_retval); + } break; + + case PROC_PIDFILEPORTPIPEINFO: { + struct pipe *cpipe; + + if (fg->fg_type != DTYPE_PIPE) { + error = EBADF; /* ick - mirror fp_getfpipe */ + break; + } + cpipe = (struct pipe *)fg->fg_data; + error = pid_pipeinfo(cpipe, fp, 0, + fia->fia_buffer, fia->fia_buffersize, fia->fia_retval); + } break; + + default: + error = EINVAL; + break; + } + + return (error); +} + +/************************* proc_pidfileportinfo routine *********************/ +int +proc_pidfileportinfo(int pid, int flavor, mach_port_name_t name, + user_addr_t buffer, uint32_t buffersize, int32_t *retval) +{ + proc_t p; + int error = ENOTSUP; + uint32_t size; + struct fileport_info_args fia; + + /* fileport types are restricted by filetype_issendable() */ + + switch (flavor) { + case PROC_PIDFILEPORTVNODEPATHINFO: + size = PROC_PIDFILEPORTVNODEPATHINFO_SIZE; + break; + case PROC_PIDFILEPORTSOCKETINFO: + size = PROC_PIDFILEPORTSOCKETINFO_SIZE; + break; + case PROC_PIDFILEPORTPSHMINFO: + size = PROC_PIDFILEPORTPSHMINFO_SIZE; + break; + case PROC_PIDFILEPORTPIPEINFO: + size = PROC_PIDFILEPORTPIPEINFO_SIZE; + break; + default: + return 
(EINVAL); + } + + if (buffersize < size) + return (ENOMEM); + if ((p = proc_find(pid)) == PROC_NULL) { + error = ESRCH; + goto out; + } + if ((error = proc_security_policy(p)) != 0) { + goto out1; + } + + fia.fia_flavor = flavor; + fia.fia_buffer = buffer; + fia.fia_buffersize = buffersize; + fia.fia_retval = retval; + + if (fileport_invoke(p->task, name, + proc_fileport_info, &fia, &error) != KERN_SUCCESS) + error = EINVAL; +out1: + proc_rele(p); +out: + return (error); +} static int proc_security_policy(proc_t p) @@ -1339,22 +1699,23 @@ proc_kernmsgbuf(user_addr_t buffer, uint32_t buffersize, int32_t * retval) /* ********* process control sets on self only */ int -proc_setcontrol(int pid, int flavor, uint64_t arg, __unused user_addr_t buffer, __unused uint32_t buffersize, __unused int32_t * retval) +proc_setcontrol(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t buffersize, __unused int32_t * retval) { struct proc * pself = PROC_NULL; int error = 0; uint32_t pcontrol = (uint32_t)arg; + struct uthread *ut = NULL; pself = current_proc(); if (pid != pself->p_pid) return(EINVAL); - if (pcontrol > P_PCMAX) - return(EINVAL); switch (flavor) { case PROC_SELFSET_PCONTROL: { + if (pcontrol > P_PCMAX) + return(EINVAL); proc_lock(pself); /* reset existing control setting while retaining action state */ pself->p_pcaction &= PROC_ACTION_MASK; @@ -1364,10 +1725,42 @@ proc_setcontrol(int pid, int flavor, uint64_t arg, __unused user_addr_t buffer, } break; + case PROC_SELFSET_THREADNAME: { + /* PROC_SELFSET_THREADNAME_SIZE = (MAXTHREADNAMESIZE -1) */ + if(buffersize > PROC_SELFSET_THREADNAME_SIZE) + return ENAMETOOLONG; + ut = current_uthread(); + + if(!ut->pth_name) + { + ut->pth_name = (char*)kalloc(MAXTHREADNAMESIZE ); + if(!ut->pth_name) + return ENOMEM; + } + bzero(ut->pth_name, MAXTHREADNAMESIZE); + error = copyin(buffer, ut->pth_name, buffersize); + } + break; + + case PROC_SELFSET_VMRSRCOWNER: { + /* need to to be superuser */ + if 
(suser(kauth_cred_get(), (u_short *)0) != 0) { + error = EPERM; + goto out; + } + + proc_lock(pself); + /* reset existing control setting while retaining action state */ + pself->p_lflag |= P_LVMRSRCOWNER; + proc_unlock(pself); + } + break; + default: error = ENOTSUP; } +out: return(error); } diff --git a/bsd/kern/process_policy.c b/bsd/kern/process_policy.c new file mode 100644 index 000000000..e6596dad4 --- /dev/null +++ b/bsd/kern/process_policy.c @@ -0,0 +1,460 @@ +/* + * Copyright (c) 2005, 2010 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * process policy syscall implementation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +static int handle_background(int scope, int action, int policy, int policy_subtype, user_addr_t attrp, proc_t proc, uint64_t target_threadid); +static int handle_hwaccess(int scope, int action, int policy, int policy_subtype, user_addr_t attrp, proc_t proc, uint64_t target_threadid); +static int handle_lowresrouce(int scope, int action, int policy, int policy_subtype, user_addr_t attrp, proc_t proc, uint64_t target_threadid); +static int handle_resourceuse(int scope, int action, int policy, int policy_subtype, user_addr_t attrp, proc_t proc, uint64_t target_threadid); +static int handle_apptype(int scope, int action, int policy, int policy_subtype, user_addr_t attrp, proc_t proc, uint64_t target_threadid); + +extern kern_return_t task_suspend(task_t); +extern kern_return_t task_resume(task_t); + +/***************************** process_policy ********************/ + +/* + *int process_policy(int scope, int action, int policy, int policy_subtype, + * proc_policy_attribute_t * attrp, pid_t target_pid, + * uint64_t target_threadid) + *{ int process_policy(int scope, int action, int policy, int policy_subtype, + * user_addr_t attrp, pid_t target_pid, uint64_t target_threadid); } + */ + +/* system call implementaion */ +int +process_policy(struct proc *p, struct process_policy_args * uap, __unused int32_t *retval) +{ + int error = 0; + int scope = uap->scope; + int policy = uap->policy; + int action = uap->action; + int policy_subtype = uap->policy_subtype; + user_addr_t attrp = uap->attrp; + pid_t target_pid = 
uap->target_pid; + uint64_t target_threadid = uap->target_threadid; + proc_t proc = PROC_NULL; + proc_t curp = current_proc(); + kauth_cred_t my_cred; +#if CONFIG_EMBEDDED + kauth_cred_t target_cred; +#endif + + if ((scope != PROC_POLICY_SCOPE_PROCESS) && (scope != PROC_POLICY_SCOPE_THREAD)) { + return(EINVAL); + } + proc = proc_find(target_pid); + if (proc == PROC_NULL) { + return(EINVAL); + } + + my_cred = kauth_cred_proc_ref(curp); + +#if CONFIG_EMBEDDED + target_cred = kauth_cred_proc_ref(proc); + + if (suser(my_cred, NULL) && kauth_cred_getruid(my_cred) && + kauth_cred_getuid(my_cred) != kauth_cred_getuid(target_cred) && + kauth_cred_getruid(my_cred) != kauth_cred_getuid(target_cred)) +#else + /* + * Resoure starvation control can be used by unpriv resource owner but priv at the time of ownership claim. This is + * checked in low resource handle routine. So bypass the checks here. + */ + if ((policy != PROC_POLICY_RESOURCE_STARVATION) && + (policy != PROC_POLICY_APPTYPE) && + (suser(my_cred, NULL) && curp != p)) +#endif + { + error = EPERM; + goto out; + } + +#if CONFIG_MACF + error = mac_proc_check_sched(curp, p); + if (error) + goto out; +#endif + + + switch(policy) { + case PROC_POLICY_BACKGROUND: + error = handle_background(scope, action, policy, policy_subtype, attrp, proc, target_threadid); + break; + case PROC_POLICY_HARDWARE_ACCESS: + error = handle_hwaccess(scope, action, policy, policy_subtype, attrp, proc, target_threadid); + break; + case PROC_POLICY_RESOURCE_STARVATION: + error = handle_lowresrouce(scope, action, policy, policy_subtype, attrp, proc, target_threadid); + break; + case PROC_POLICY_RESOURCE_USAGE: + error = handle_resourceuse(scope, action, policy, policy_subtype, attrp, proc, target_threadid); + break; + case PROC_POLICY_APPTYPE: + error = handle_apptype(scope, action, policy, policy_subtype, attrp, proc, target_threadid); + break; + default: + error = EINVAL; + break; + } + +out: + proc_rele(proc); + kauth_cred_unref(&my_cred); +#if 
CONFIG_EMBEDDED + kauth_cred_unref(&target_cred); +#endif + return(error); +} + + +/* darwin background handling code */ +static int +handle_background(int scope, int action, __unused int policy, __unused int policy_subtype, user_addr_t attrp, proc_t proc, uint64_t target_threadid) +{ + int intval, error = 0; + + + switch (action) { + case PROC_POLICY_ACTION_GET: + if (scope == PROC_POLICY_SCOPE_PROCESS) { + intval = proc_get_task_bg_policy(proc->task); + } else { + /* thread scope */ + intval = proc_get_thread_bg_policy(proc->task, target_threadid); + } + error = copyout((int *)&intval, (user_addr_t)attrp, sizeof(int)); + break; + + case PROC_POLICY_ACTION_SET: + error = copyin((user_addr_t)attrp, (int *)&intval, sizeof(int)); + if (error != 0) + goto out; + if (intval > PROC_POLICY_BG_ALL) { + error = EINVAL; + goto out; + } + if (scope == PROC_POLICY_SCOPE_PROCESS) { + error = proc_set_bgtaskpolicy(proc->task, intval); + } else { + /* thread scope */ + error = proc_set_bgthreadpolicy(proc->task, target_threadid, intval); + } + break; + + case PROC_POLICY_ACTION_ADD: + error = copyin((user_addr_t)attrp, (int *)&intval, sizeof(int)); + if (error != 0) + goto out; + if (intval > PROC_POLICY_BG_ALL) { + error = EINVAL; + goto out; + } + if (scope == PROC_POLICY_SCOPE_PROCESS) { + error = proc_add_bgtaskpolicy(proc->task, intval); + } else { + /* thread scope */ + error = proc_add_bgthreadpolicy(proc->task, target_threadid, intval); + } + break; + + case PROC_POLICY_ACTION_REMOVE: + error = copyin((user_addr_t)attrp, (int *)&intval, sizeof(int)); + if (error != 0) + goto out; + if (intval > PROC_POLICY_BG_ALL) { + error = EINVAL; + goto out; + } + if (scope == PROC_POLICY_SCOPE_PROCESS) { + error = proc_remove_bgtaskpolicy(proc->task, intval); + } else { + /* thread scope */ + error = proc_remove_bgthreadpolicy(proc->task, target_threadid, intval); + } + break; + + case PROC_POLICY_ACTION_APPLY: + if (scope == PROC_POLICY_SCOPE_PROCESS) { + error = 
proc_apply_bgtaskpolicy(proc->task); + } else { + /* thread scope */ + error = proc_apply_bgthreadpolicy(proc->task, target_threadid); + } + break; + + case PROC_POLICY_ACTION_RESTORE: + if (scope == PROC_POLICY_SCOPE_PROCESS) { + error = proc_restore_bgtaskpolicy(proc->task); + } else { + /* thread scope */ + error = proc_restore_bgthreadpolicy(proc->task, target_threadid); + } + break; + + case PROC_POLICY_ACTION_DENYINHERIT: + error = proc_denyinherit_policy(proc->task); + break; + + case PROC_POLICY_ACTION_DENYSELFSET: + error = proc_denyselfset_policy(proc->task); + break; + + default: + return(EINVAL); + } + +out: + return(error); +} + +static int +handle_hwaccess(__unused int scope, __unused int action, __unused int policy, int policy_subtype, __unused user_addr_t attrp, __unused proc_t proc, __unused uint64_t target_threadid) +{ + switch(policy_subtype) { + case PROC_POLICY_HWACCESS_NONE: + case PROC_POLICY_HWACCESS_DISK: + case PROC_POLICY_HWACCESS_GPU: + case PROC_POLICY_HWACCESS_NETWORK: + case PROC_POLICY_HWACCESS_CPU: + break; + default: + return(EINVAL); + } + return(0); +} + +static int +handle_lowresrouce(__unused int scope, int action, __unused int policy, int policy_subtype, __unused user_addr_t attrp, proc_t proc, __unused uint64_t target_threadid) +{ + int error = 0; + + switch(policy_subtype) { + case PROC_POLICY_RS_NONE: + case PROC_POLICY_RS_VIRTUALMEM: + break; + default: + return(EINVAL); + } + + if (action == PROC_POLICY_ACTION_RESTORE) + error = proc_resetpcontrol(proc_pid(proc)); + else + error = EINVAL; + + return(error); +} + + +static int +handle_resourceuse(__unused int scope, __unused int action, __unused int policy, int policy_subtype, user_addr_t attrp, proc_t proc, __unused uint64_t target_threadid) +{ + proc_policy_cpuusage_attr_t cpuattr; + int error = 0; + + switch(policy_subtype) { + case PROC_POLICY_RUSAGE_NONE: + case PROC_POLICY_RUSAGE_WIREDMEM: + case PROC_POLICY_RUSAGE_VIRTMEM: + case PROC_POLICY_RUSAGE_DISK: + case 
PROC_POLICY_RUSAGE_NETWORK: + case PROC_POLICY_RUSAGE_POWER: + return(ENOTSUP); + break; + default: + return(EINVAL); + case PROC_POLICY_RUSAGE_CPU: + break; + } + + switch (action) { + case PROC_POLICY_ACTION_GET: + error = proc_get_task_ruse_cpu(proc->task, &cpuattr.ppattr_cpu_attr, + &cpuattr.ppattr_cpu_percentage, + &cpuattr.ppattr_cpu_attr_interval, + &cpuattr.ppattr_cpu_attr_deadline); + if (error == 0) + error = copyout((proc_policy_cpuusage_attr_t *)&cpuattr, (user_addr_t)attrp, sizeof(proc_policy_cpuusage_attr_t)); + break; + + case PROC_POLICY_ACTION_APPLY: + case PROC_POLICY_ACTION_SET: + error = copyin((user_addr_t)attrp, (proc_policy_cpuusage_attr_t *)&cpuattr, sizeof(proc_policy_cpuusage_attr_t)); + + if (error == 0) { + error = proc_set_task_ruse_cpu(proc->task, cpuattr.ppattr_cpu_attr, + cpuattr.ppattr_cpu_percentage, + cpuattr.ppattr_cpu_attr_interval, + cpuattr.ppattr_cpu_attr_deadline); + } + default: + error = EINVAL; + break; + + } + + return(error); +} + + +static int +handle_apptype(__unused int scope, int action, __unused int policy, int policy_subtype, __unused user_addr_t attrp, proc_t proc, __unused uint64_t target_threadid) +{ + int error = 0; + + switch(policy_subtype) { + case PROC_POLICY_OSX_APPTYPE_TAL: + /* need to be super user to do this */ + if (kauth_cred_issuser(kauth_cred_get()) == 0) { + error = EPERM; + goto out; + } + break; + case PROC_POLICY_OSX_APPTYPE_DASHCLIENT: + /* no special priv needed */ + break; + case PROC_POLICY_OSX_APPTYPE_NONE: + case PROC_POLICY_IOS_APPTYPE: + case PROC_POLICY_IOS_NONUITYPE: + return(ENOTSUP); + break; + default: + return(EINVAL); + } + + switch (action) { + case PROC_POLICY_ACTION_ENABLE: + /* reapply the app foreground/background policy */ + error = proc_enable_task_apptype(proc->task, policy_subtype); + break; + case PROC_POLICY_ACTION_DISABLE: + /* remove the app foreground/background policy */ + error = proc_disable_task_apptype(proc->task, policy_subtype); + break; + default: + error = 
EINVAL; + break; + } + +out: + return(error); +} + +int +proc_apply_resource_actions(void * bsdinfo, int type, int action) +{ + proc_t p = (proc_t)bsdinfo; + + switch(action) { + case PROC_POLICY_RSRCACT_THROTTLE: + /* no need to do anything */ + break; + + case PROC_POLICY_RSRCACT_SUSPEND: + task_suspend(p->task); + break; + + case PROC_POLICY_RSRCACT_TERMINATE: + psignal(p, SIGKILL); + break; + + case PROC_POLICY_RSRCACT_NOTIFY: + proc_lock(p); + proc_knote(p, NOTE_RESOURCEEND | (type & 0xff)); + proc_unlock(p); + break; + } + + return(0); +} + + +int +proc_restore_resource_actions(void * bsdinfo, __unused int type, int action) +{ + proc_t p = (proc_t)bsdinfo; + + switch(action) { + case PROC_POLICY_RSRCACT_THROTTLE: + case PROC_POLICY_RSRCACT_TERMINATE: + case PROC_POLICY_RSRCACT_NOTIFY: + /* no need to do anything */ + break; + + case PROC_POLICY_RSRCACT_SUSPEND: + task_resume(p->task); + break; + + } + + return(0); +} + diff --git a/bsd/kern/pthread_support.c b/bsd/kern/pthread_support.c index 2691813a4..e5626dfa2 100644 --- a/bsd/kern/pthread_support.c +++ b/bsd/kern/pthread_support.c @@ -67,10 +67,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -82,9 +84,35 @@ #include -#define _PSYNCH_TRACE_ 0 /* kdebug trace */ -#define __TESTPANICS__ 0 /* panics for error conditions */ -#define COND_MTX_WAITQUEUEMOVE 0 /* auto move from cvar wait queue to mutex waitqueue */ +#include + +#define __PSYNCH_DEBUG__ 0 /* debug panic actions */ +#define _PSYNCH_TRACE_ 1 /* kdebug trace */ + +#define __TESTMODE__ 2 /* 0 - return error on user error conditions */ + /* 1 - log error on user error conditions */ + /* 2 - abort caller on user error conditions */ + /* 3 - panic on user error conditions */ +static int __test_panics__; +static int __test_aborts__; +static int __test_prints__; + +static inline void __FAILEDUSERTEST__(const char *str) +{ + proc_t p; + + if (__test_panics__ != 0) + panic(str); + + if 
(__test_aborts__ != 0 || __test_prints__ != 0) + p = current_proc(); + + if (__test_prints__ != 0) + printf("PSYNCH: pid[%d]: %s\n", p->p_pid, str); + + if (__test_aborts__ != 0) + psignal(p, SIGABRT); +} #if _PSYNCH_TRACE_ #define _PSYNCH_TRACE_MLWAIT 0x9000000 @@ -103,6 +131,10 @@ #define _PSYNCH_TRACE_RWUNLOCK2 0x9000034 #define _PSYNCH_TRACE_RWHANDLEU 0x9000038 #define _PSYNCH_TRACE_FSEQTILL 0x9000040 +#define _PSYNCH_TRACE_CLRPRE 0x9000044 +#define _PSYNCH_TRACE_CVHBROAD 0x9000048 +#define _PSYNCH_TRACE_CVSEQ 0x900004c +#define _PSYNCH_TRACE_THWAKEUP 0x9000050 /* user side */ #define _PSYNCH_TRACE_UM_LOCK 0x9000060 #define _PSYNCH_TRACE_UM_UNLOCK 0x9000064 @@ -112,8 +144,24 @@ #define _PSYNCH_TRACE_UM_CVSIG 0x9000074 #define _PSYNCH_TRACE_UM_CVBRD 0x9000078 +proc_t pthread_debug_proc = PROC_NULL; +static inline void __PTHREAD_TRACE_DEBUG(uint32_t debugid, uintptr_t arg1, + uintptr_t arg2, + uintptr_t arg3, + uintptr_t arg4, + uintptr_t arg5) +{ + proc_t p = current_proc(); + + if ((pthread_debug_proc != NULL) && (p == pthread_debug_proc)) + KERNEL_DEBUG_CONSTANT(debugid, arg1, arg2, arg3, arg4, arg5); +} + #endif /* _PSYNCH_TRACE_ */ +#define ECVCERORR 256 +#define ECVPERORR 512 + lck_mtx_t * pthread_list_mlock; #define PTHHASH(addr) (&pthashtbl[(addr) & pthhash]) @@ -122,19 +170,28 @@ struct pthhashhead * pth_glob_hashtbl; u_long pthhash; LIST_HEAD(, ksyn_wait_queue) pth_free_list; +int num_total_kwq = 0; /* number of kwq in use currently */ +int num_infreekwq = 0; /* number of kwq in free list */ +int num_freekwq = 0; /* number of kwq actually freed from the free the list */ +int num_reusekwq = 0; /* number of kwq pulled back for reuse from free list */ +int num_addedfreekwq = 0; /* number of added free kwq from the last instance */ +int num_lastfreekwqcount = 0; /* the free count from the last time */ static int PTH_HASHSIZE = 100; +static zone_t kwq_zone; /* zone for allocation of ksyn_queue */ +static zone_t kwe_zone; /* zone for allocation of 
ksyn_waitq_element */ #define SEQFIT 0 #define FIRSTFIT 1 struct ksyn_queue { - TAILQ_HEAD(, uthread) ksynq_uthlist; + TAILQ_HEAD(ksynq_kwelist_head, ksyn_waitq_element) ksynq_kwelist; uint32_t ksynq_count; /* number of entries in queue */ uint32_t ksynq_firstnum; /* lowest seq in queue */ uint32_t ksynq_lastnum; /* highest seq in queue */ }; +typedef struct ksyn_queue * ksyn_queue_t; #define KSYN_QUEUE_READ 0 #define KSYN_QUEUE_LREAD 1 @@ -146,9 +203,6 @@ struct ksyn_queue { struct ksyn_wait_queue { LIST_ENTRY(ksyn_wait_queue) kw_hash; LIST_ENTRY(ksyn_wait_queue) kw_list; -#if USE_WAITQUEUE - struct wait_queue kw_wq; -#endif /* USE_WAITQUEUE */ user_addr_t kw_addr; uint64_t kw_owner; uint64_t kw_object; /* object backing in shared mode */ @@ -157,78 +211,113 @@ struct ksyn_wait_queue { int kw_pflags; /* flags under listlock protection */ struct timeval kw_ts; /* timeval need for upkeep before free */ int kw_iocount; /* inuse reference */ + int kw_dropcount; /* current users unlocking... 
*/ int kw_type; /* queue type like mutex, cvar, etc */ uint32_t kw_inqueue; /* num of waiters held */ + uint32_t kw_fakecount; /* number of error/prepost fakes */ uint32_t kw_highseq; /* highest seq in the queue */ uint32_t kw_lowseq; /* lowest seq in the queue */ + uint32_t kw_lword; /* L value from userland */ + uint32_t kw_uword; /* U world value from userland */ + uint32_t kw_sword; /* S word value from userland */ uint32_t kw_lastunlockseq; /* the last seq that unlocked */ +/* for CV to be used as the seq kernel has seen so far */ +#define kw_cvkernelseq kw_lastunlockseq + uint32_t kw_lastseqword; /* the last seq that unlocked */ +/* for mutex and cvar we need to track I bit values */ + uint32_t kw_nextseqword; /* the last seq that unlocked; with num of waiters */ +#define kw_initrecv kw_nextseqword /* number of incoming waiters with Ibit seen sofar */ + uint32_t kw_overlapwatch; /* chance for overlaps */ +#define kw_initcount kw_overlapwatch /* number of incoming waiters with Ibit expected */ + uint32_t kw_initcountseq; /* highest seq with Ibit on for mutex and cvar*/ uint32_t kw_pre_rwwc; /* prepost count */ uint32_t kw_pre_lockseq; /* prepost target seq */ - uint32_t kw_pre_cvretval; /* retval for cwait on prepost */ - uint32_t kw_pre_limrd; /* prepost read only(rwlock) */ - uint32_t kw_pre_limrdseq; /* prepost limit seq for reads(rwlock) */ - uint32_t kw_pre_limrdbits; /* seqbit needed for updates on prepost */ + uint32_t kw_pre_sseq; /* prepost target sword, in cvar used for mutexowned */ uint32_t kw_pre_intrcount; /* prepost of missed wakeup due to intrs */ uint32_t kw_pre_intrseq; /* prepost of missed wakeup limit seq */ uint32_t kw_pre_intrretbits; /* return bits value for missed wakeup threads */ uint32_t kw_pre_intrtype; /* type of failed wakueps*/ int kw_kflags; - TAILQ_HEAD(, uthread) kw_uthlist; /* List of uthreads */ struct ksyn_queue kw_ksynqueues[KSYN_QUEUE_MAX]; /* queues to hold threads */ lck_mtx_t kw_lock; /* mutex lock protecting this 
structure */ - struct ksyn_wait_queue * kw_attq; /* attached queue (cvar->mutex, need in prepost */ }; - -typedef struct ksyn_queue * ksyn_queue_t; typedef struct ksyn_wait_queue * ksyn_wait_queue_t; -#define PTHRW_EBIT 0x01 -#define PTHRW_LBIT 0x02 -#define PTHRW_YBIT 0x04 -#define PTHRW_WBIT 0x08 -#define PTHRW_UBIT 0x10 -#define PTHRW_RETRYBIT 0x20 -/* same as 0x20, shadow W bit for rwlock */ -#define PTHRW_SHADOW_W 0x20 - -#define PTHRW_TRYLKBIT 0x40 -#define PTHRW_RW_HUNLOCK 0x40 /* returning read thread responsible to handle unlock */ - -#define PTHRW_MTX_NONE 0x80 -#define PTHRW_RW_INIT 0x80 /* reset on the lock bits */ -/* same as 0x80, spurious rwlock unlock ret from kernel */ -#define PTHRW_RW_SPURIOUS 0x80 - #define PTHRW_INC 0x100 - -#define PTHRW_BIT_MASK 0x000000ff; +#define PTHRW_BIT_MASK 0x000000ff #define PTHRW_COUNT_SHIFT 8 #define PTHRW_COUNT_MASK 0xffffff00 #define PTHRW_MAX_READERS 0xffffff00 +/* New model bits on Lword */ +#define PTH_RWL_KBIT 0x01 /* users cannot acquire in user mode */ +#define PTH_RWL_EBIT 0x02 /* exclusive lock in progress */ +#define PTH_RWL_WBIT 0x04 /* write waiters pending in kernel */ +#define PTH_RWL_PBIT 0x04 /* prepost (cv) pending in kernel */ +#define PTH_RWL_YBIT 0x08 /* yielding write waiters pending in kernel */ +#define PTH_RWL_RETRYBIT 0x08 /* mutex retry wait */ +#define PTH_RWL_LBIT 0x10 /* long read in progress */ +#define PTH_RWL_MTXNONE 0x10 /* indicates the cvwait does not have mutex held */ +#define PTH_RWL_UBIT 0x20 /* upgrade request pending */ +#define PTH_RWL_MTX_WAIT 0x20 /* in cvar in mutex wait */ +#define PTH_RWL_RBIT 0x40 /* reader pending in kernel(not used) */ +#define PTH_RWL_MBIT 0x40 /* overlapping grants from kernel */ +#define PTH_RWL_TRYLKBIT 0x40 /* trylock attempt (mutex only) */ +#define PTH_RWL_IBIT 0x80 /* lcok reset, held untill first succeesful unlock */ + + +/* UBIT values for mutex, cvar */ +#define PTH_RWU_SBIT 0x01 +#define PTH_RWU_BBIT 0x02 + +#define PTHRW_RWL_INIT 
PTH_RWL_IBIT /* reset state on the lock bits (U)*/ + +/* New model bits on Sword */ +#define PTH_RWS_SBIT 0x01 /* kernel transition seq not set yet*/ +#define PTH_RWS_IBIT 0x02 /* Sequence is not set on return from kernel */ +#define PTH_RWS_CV_CBIT PTH_RWS_SBIT /* kernel has cleared all info w.r.s.t CV */ +#define PTH_RWS_CV_PBIT PTH_RWS_IBIT /* kernel has prepost/fake structs only,no waiters */ +#define PTH_RWS_CV_MBIT PTH_RWL_MBIT /* to indicate prepost return */ +#define PTH_RWS_WSVBIT 0x04 /* save W bit */ +#define PTH_RWS_USVBIT 0x08 /* save U bit */ +#define PTH_RWS_YSVBIT 0x10 /* save Y bit */ +#define PTHRW_RWS_INIT PTH_RWS_SBIT /* reset on the lock bits (U)*/ +#define PTHRW_RWS_SAVEMASK (PTH_RWS_WSVBIT|PTH_RWS_USVBIT|PTH_RWS_YSVBIT) /*save bits mask*/ +#define PTHRW_SW_Reset_BIT_MASK 0x000000fe /* remove S bit and get rest of the bits */ + +#define PTHRW_RWS_INIT PTH_RWS_SBIT /* reset on the lock bits (U)*/ + + +#define PTHRW_UN_BIT_MASK 0x000000bf /* remove overlap bit */ + + +#define PTHREAD_MTX_TID_SWITCHING (uint64_t)-1 + +/* new L word defns */ +#define is_rwl_readinuser(x) ((((x) & (PTH_RWL_UBIT | PTH_RWL_KBIT)) == 0)||(((x) & PTH_RWL_LBIT) != 0)) +#define is_rwl_ebit_set(x) (((x) & PTH_RWL_EBIT) != 0) +#define is_rwl_lbit_set(x) (((x) & PTH_RWL_LBIT) != 0) +#define is_rwl_readoverlap(x) (((x) & PTH_RWL_MBIT) != 0) +#define is_rw_ubit_set(x) (((x) & PTH_RWL_UBIT) != 0) + +/* S word checks */ +#define is_rws_setseq(x) (((x) & PTH_RWS_SBIT)) +#define is_rws_setunlockinit(x) (((x) & PTH_RWS_IBIT)) + /* first contended seq that kernel sees */ #define KW_MTXFIRST_KSEQ 0x200 #define KW_CVFIRST_KSEQ 1 #define KW_RWFIRST_KSEQ 0x200 -#define is_rw_ewubit_set(x) ((x & (PTHRW_EBIT | PTHRW_WBIT | PTHRW_UBIT)) != 0) -#define is_rw_lybit_set(x) ((x & (PTHRW_LBIT | PTHRW_YBIT)) != 0) -#define is_rw_ebit_set(x) ((x & PTHRW_EBIT) != 0) -#define is_rw_uebit_set(x) ((x & (PTHRW_EBIT | PTHRW_UBIT)) != 0) -#define is_rw_ubit_set(x) ((x & PTHRW_UBIT) != 0) -#define 
is_rw_either_ewyubit_set(x) ((x & (PTHRW_EBIT | PTHRW_WBIT | PTHRW_UBIT | PTHRW_YBIT)) != 0) - - -/* is x lower than Y */ -#define is_seqlower(x, y) ((x < y) || ((x - y) > (PTHRW_MAX_READERS/2))) -/* is x lower than or eq Y */ -#define is_seqlower_eq(x, y) ((x <= y) || ((x - y) > (PTHRW_MAX_READERS/2))) +int is_seqlower(uint32_t x, uint32_t y); +int is_seqlower_eq(uint32_t x, uint32_t y); +int is_seqhigher(uint32_t x, uint32_t y); +int is_seqhigher_eq(uint32_t x, uint32_t y); +int find_diff(uint32_t upto, uint32_t lowest); -/* is x greater than Y */ -#define is_seqhigher(x, y) ((x > y) || ((y - x) > (PTHRW_MAX_READERS/2))) static inline int diff_genseq(uint32_t x, uint32_t y) { if (x > y) { @@ -292,27 +381,39 @@ static inline int diff_genseq(uint32_t x, uint32_t y) { #define PTHREAD_POLICY_FLAGS_MASK 0x1c0 #define _PTHREAD_MTX_OPT_HOLDLOCK 0x200 -#define _PTHREAD_MTX_OPT_NOHOLDLOCK 0x400 -#define _PTHREAD_MTX_OPT_LASTDROP (_PTHREAD_MTX_OPT_HOLDLOCK | _PTHREAD_MTX_OPT_NOHOLDLOCK) +#define _PTHREAD_MTX_OPT_NOMTX 0x400 + +#define _PTHREAD_MTX_OPT_NOTIFY 0x1000 +#define _PTHREAD_MTX_OPT_MUTEX 0x2000 /* this is a mutex type */ +#define _PTHREAD_RWLOCK_UPGRADE_TRY 0x10000 + +/* pflags */ #define KSYN_WQ_INLIST 1 #define KSYN_WQ_INHASH 2 #define KSYN_WQ_SHARED 4 +#define KSYN_WQ_WAITING 8 /* threads waiting for this wq to be available */ #define KSYN_WQ_FLIST 0X10 /* in free list to be freed after a short delay */ +/* kflags */ +#define KSYN_KWF_INITCLEARED 1 /* the init status found and preposts cleared */ +#define KSYN_KWF_ZEROEDOUT 2 /* the lword, etc are inited to 0 */ + #define KSYN_CLEANUP_DEADLINE 10 int psynch_cleanupset; thread_call_t psynch_thcall; #define KSYN_WQTYPE_INWAIT 0x1000 +#define KSYN_WQTYPE_INDROP 0x2000 #define KSYN_WQTYPE_MTX 0x1 #define KSYN_WQTYPE_CVAR 0x2 #define KSYN_WQTYPE_RWLOCK 0x4 #define KSYN_WQTYPE_SEMA 0x8 #define KSYN_WQTYPE_BARR 0x10 -#define KSYN_WQTYPE_MASK 0xffff +#define KSYN_WQTYPE_MASK 0x00ff #define KSYN_MTX_MAX 0x0fffffff 
+#define KSYN_WQTYPE_MUTEXDROP (KSYN_WQTYPE_INDROP | KSYN_WQTYPE_MTX) #define KW_UNLOCK_PREPOST 0x01 #define KW_UNLOCK_PREPOST_UPGRADE 0x02 @@ -324,14 +425,14 @@ thread_call_t psynch_thcall; #define CLEAR_PREPOST_BITS(kwq) {\ kwq->kw_pre_lockseq = 0; \ + kwq->kw_pre_sseq = PTHRW_RWS_INIT; \ kwq->kw_pre_rwwc = 0; \ - kwq->kw_pre_cvretval = 0; \ } -#define CLEAR_READ_PREPOST_BITS(kwq) {\ - kwq->kw_pre_limrd = 0; \ - kwq->kw_pre_limrdseq = 0; \ - kwq->kw_pre_limrdbits = 0; \ +#define CLEAR_INITCOUNT_BITS(kwq) {\ + kwq->kw_initcount = 0; \ + kwq->kw_initrecv = 0; \ + kwq->kw_initcountseq = 0; \ } #define CLEAR_INTR_PREPOST_BITS(kwq) {\ @@ -340,7 +441,30 @@ thread_call_t psynch_thcall; kwq->kw_pre_intrretbits = 0; \ kwq->kw_pre_intrtype = 0; \ } - + +#define CLEAR_REINIT_BITS(kwq) {\ + if ((kwq->kw_type & KSYN_WQTYPE_MASK) == KSYN_WQTYPE_CVAR) { \ + if((kwq->kw_inqueue != 0) && (kwq->kw_inqueue != kwq->kw_fakecount)) \ + panic("CV:entries in queue durinmg reinit %d:%d\n",kwq->kw_inqueue, kwq->kw_fakecount); \ + };\ + if ((kwq->kw_type & KSYN_WQTYPE_MASK) == KSYN_WQTYPE_RWLOCK) { \ + kwq->kw_nextseqword = PTHRW_RWS_INIT; \ + kwq->kw_overlapwatch = 0; \ + }; \ + kwq->kw_pre_lockseq = 0; \ + kwq->kw_pre_rwwc = 0; \ + kwq->kw_pre_sseq = PTHRW_RWS_INIT; \ + kwq->kw_lastunlockseq = PTHRW_RWL_INIT; \ + kwq->kw_lastseqword = PTHRW_RWS_INIT; \ + kwq->kw_pre_intrcount = 0; \ + kwq->kw_pre_intrseq = 0; \ + kwq->kw_pre_intrretbits = 0; \ + kwq->kw_pre_intrtype = 0; \ + kwq->kw_lword = 0; \ + kwq->kw_uword = 0; \ + kwq->kw_sword = PTHRW_RWS_INIT; \ + } + void pthread_list_lock(void); void pthread_list_unlock(void); void pthread_list_lock_spin(void); @@ -349,41 +473,69 @@ void ksyn_wqlock(ksyn_wait_queue_t kwq); void ksyn_wqunlock(ksyn_wait_queue_t kwq); ksyn_wait_queue_t ksyn_wq_hash_lookup(user_addr_t mutex, proc_t p, int flags, uint64_t object, uint64_t offset); int ksyn_wqfind(user_addr_t mutex, uint32_t mgen, uint32_t ugen, uint32_t rw_wc, uint64_t tid, int flags, int wqtype , 
ksyn_wait_queue_t * wq); -void ksyn_wqrelease(ksyn_wait_queue_t mkwq, ksyn_wait_queue_t ckwq); -int ksyn_block_thread_locked(ksyn_wait_queue_t kwq, uint64_t abstime, uthread_t uth); -kern_return_t ksyn_wakeup_thread(ksyn_wait_queue_t kwq, uthread_t uth); -void ksyn_move_wqthread(ksyn_wait_queue_t ckwq, ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t updateval, int diffgen, int nomutex); -extern thread_t port_name_to_thread(mach_port_name_t port_name); +void ksyn_wqrelease(ksyn_wait_queue_t mkwq, ksyn_wait_queue_t ckwq, int qfreenow, int wqtype); extern int ksyn_findobj(uint64_t mutex, uint64_t * object, uint64_t * offset); -static void UPDATE_KWQ(ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t ugen, uint32_t rw_wc, uint64_t tid, int wqtype, int retry); -void psynch_mutexdrop_internal(ksyn_wait_queue_t kwq, uint32_t lkseq, uint32_t ugen, int flags); - -#if USE_WAITQUEUE -kern_return_t wait_queue_move_all(wait_queue_t from, event64_t eventfrom, wait_queue_t to, event64_t eventto); -kern_return_t wait_queue_move_thread(wait_queue_t from, event64_t eventfrom, thread_t th, wait_queue_t to, event64_t eventto, thread_t * mthp); -#endif /* USE_WAITQUEUE */ -int kwq_handle_unlock(ksyn_wait_queue_t, uint32_t mgen, uint32_t * updatep, int flags, int *blockp, uint32_t premgen); +static void UPDATE_CVKWQ(ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t ugen, uint32_t rw_wc, uint64_t tid, int wqtype); +extern thread_t port_name_to_thread(mach_port_name_t port_name); + +int ksyn_block_thread_locked(ksyn_wait_queue_t kwq, uint64_t abstime, ksyn_waitq_element_t kwe, int log); +kern_return_t ksyn_wakeup_thread(ksyn_wait_queue_t kwq, ksyn_waitq_element_t kwe); +void ksyn_freeallkwe(ksyn_queue_t kq); + +uint32_t psynch_mutexdrop_internal(ksyn_wait_queue_t kwq, uint32_t lkseq, uint32_t ugen, int flags); +int kwq_handle_unlock(ksyn_wait_queue_t, uint32_t mgen, uint32_t rw_wc, uint32_t * updatep, int flags, int *blockp, uint32_t premgen); + void ksyn_queue_init(ksyn_queue_t kq); -int 
ksyn_queue_insert(ksyn_wait_queue_t kwq, ksyn_queue_t kq, uint32_t mgen, struct uthread * uth, int firstfit); -struct uthread * ksyn_queue_removefirst(ksyn_queue_t kq, ksyn_wait_queue_t kwq); -void ksyn_queue_removeitem(ksyn_wait_queue_t kwq, ksyn_queue_t kq, uthread_t uth); +int ksyn_queue_insert(ksyn_wait_queue_t kwq, ksyn_queue_t kq, uint32_t mgen, struct uthread * uth, ksyn_waitq_element_t kwe, int firstfit); +ksyn_waitq_element_t ksyn_queue_removefirst(ksyn_queue_t kq, ksyn_wait_queue_t kwq); +void ksyn_queue_removeitem(ksyn_wait_queue_t kwq, ksyn_queue_t kq, ksyn_waitq_element_t kwe); +int ksyn_queue_move_tofree(ksyn_wait_queue_t kwq, ksyn_queue_t kq, uint32_t upto, ksyn_queue_t freeq, int all, int reease); void update_low_high(ksyn_wait_queue_t kwq, uint32_t lockseq); uint32_t find_nextlowseq(ksyn_wait_queue_t kwq); uint32_t find_nexthighseq(ksyn_wait_queue_t kwq); + int find_seq_till(ksyn_wait_queue_t kwq, uint32_t upto, uint32_t nwaiters, uint32_t *countp); -int find_diff(uint32_t upto, uint32_t lowest); uint32_t ksyn_queue_count_tolowest(ksyn_queue_t kq, uint32_t upto); + +ksyn_waitq_element_t ksyn_queue_find_cvpreposeq(ksyn_queue_t kq, uint32_t cgen); +uint32_t ksyn_queue_cvcount_entries(ksyn_queue_t kq, uint32_t upto, uint32_t from, int * numwaitersp, int * numintrp, int * numprepop); +void ksyn_handle_cvbroad(ksyn_wait_queue_t ckwq, uint32_t upto, uint32_t *updatep); +void ksyn_cvupdate_fixup(ksyn_wait_queue_t ckwq, uint32_t *updatep, ksyn_queue_t kfreeq, int release); +ksyn_waitq_element_t ksyn_queue_find_signalseq(ksyn_wait_queue_t kwq, ksyn_queue_t kq, uint32_t toseq, uint32_t lockseq); +ksyn_waitq_element_t ksyn_queue_find_threadseq(ksyn_wait_queue_t ckwq, ksyn_queue_t kq, thread_t th, uint32_t toseq); + int ksyn_wakeupreaders(ksyn_wait_queue_t kwq, uint32_t limitread, int longreadset, int allreaders, uint32_t updatebits, int * wokenp); int kwq_find_rw_lowest(ksyn_wait_queue_t kwq, int flags, uint32_t premgen, int * type, uint32_t lowest[]); 
-uthread_t ksyn_queue_find_seq(ksyn_wait_queue_t kwq, ksyn_queue_t kq, uint32_t seq); +ksyn_waitq_element_t ksyn_queue_find_seq(ksyn_wait_queue_t kwq, ksyn_queue_t kq, uint32_t seq, int remove); +int kwq_handle_overlap(ksyn_wait_queue_t kwq, uint32_t lgenval, uint32_t ugenval, uint32_t rw_wc, uint32_t *updatebitsp, int flags , int * blockp); int kwq_handle_downgrade(ksyn_wait_queue_t kwq, uint32_t mgen, int flags, uint32_t premgen, int * blockp); - static void -UPDATE_KWQ(__unused ksyn_wait_queue_t kwq, __unused uint32_t mgen, __unused uint32_t ugen, __unused uint32_t rw_wc, __unused uint64_t tid, __unused int wqtype, __unused int retry) +UPDATE_CVKWQ(ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t ugen, uint32_t rw_wc, __unused uint64_t tid, __unused int wqtype) { + if ((kwq->kw_type & KSYN_WQTYPE_MASK) == KSYN_WQTYPE_CVAR) { + if ((kwq->kw_kflags & KSYN_KWF_ZEROEDOUT) != 0) { + /* the values of L,U and S are cleared out due to L==S in previous transition */ + kwq->kw_lword = mgen; + kwq->kw_uword = ugen; + kwq->kw_sword = rw_wc; + kwq->kw_kflags &= ~KSYN_KWF_ZEROEDOUT; + } + if (is_seqhigher((mgen & PTHRW_COUNT_MASK), (kwq->kw_lword & PTHRW_COUNT_MASK)) != 0) + kwq->kw_lword = mgen; + if (is_seqhigher((ugen & PTHRW_COUNT_MASK), (kwq->kw_uword & PTHRW_COUNT_MASK)) != 0) + kwq->kw_uword = ugen; + if ((rw_wc & PTH_RWS_CV_CBIT) != 0) { + if(is_seqlower(kwq->kw_cvkernelseq, (rw_wc & PTHRW_COUNT_MASK)) != 0) { + kwq->kw_cvkernelseq = (rw_wc & PTHRW_COUNT_MASK); + } + if (is_seqhigher((rw_wc & PTHRW_COUNT_MASK), (kwq->kw_sword & PTHRW_COUNT_MASK)) != 0) + kwq->kw_sword = rw_wc; + } + } } + /* to protect the hashes, iocounts, freelist */ void pthread_list_lock(void) @@ -426,51 +578,43 @@ ksyn_wqunlock(ksyn_wait_queue_t kwq) /* routine to drop the mutex unlocks , used both for mutexunlock system call and drop during cond wait */ -void +uint32_t psynch_mutexdrop_internal(ksyn_wait_queue_t kwq, uint32_t lkseq, uint32_t ugen, int flags) { - uint32_t nextgen, low_writer, 
updatebits; + uint32_t nextgen, low_writer, updatebits, returnbits = 0; int firstfit = flags & _PTHREAD_MUTEX_POLICY_FIRSTFIT; - uthread_t uth; + ksyn_waitq_element_t kwe = NULL; kern_return_t kret = KERN_SUCCESS; - nextgen = (ugen + PTHRW_INC); #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_KMDROP | DBG_FUNC_START, kwq, lkseq, ugen, flags, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_KMDROP | DBG_FUNC_START, (uint32_t)kwq->kw_addr, lkseq, ugen, flags, 0); #endif /* _PSYNCH_TRACE_ */ ksyn_wqlock(kwq); redrive: - -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, kwq, 1, kwq->kw_inqueue, nextgen, 0); -#endif /* _PSYNCH_TRACE_ */ + if (kwq->kw_inqueue != 0) { - updatebits = (kwq->kw_highseq & PTHRW_COUNT_MASK) | PTHRW_EBIT; - kwq->kw_lastunlockseq = ugen; + updatebits = (kwq->kw_highseq & PTHRW_COUNT_MASK) | (PTH_RWL_EBIT | PTH_RWL_KBIT); + kwq->kw_lastunlockseq = (ugen & PTHRW_COUNT_MASK); if (firstfit != 0) { -#if __TESTPANICS__ - panic("psynch_mutexdrop_internal: first fit mutex arrives, not enabled yet \n"); -#endif /* __TESTPANICS__ */ /* first fit , pick any one */ - uth = ksyn_queue_removefirst(&kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], kwq); + kwe = ksyn_queue_removefirst(&kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], kwq); + kwe->kwe_psynchretval = updatebits; + kwe->kwe_kwqqueue = NULL; - if (kwq->kw_ksynqueues[KSYN_QUEUE_WRITER].ksynq_count != 0) - updatebits |= PTHRW_WBIT; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, kwq, 2, uth, updatebits, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 0xcafecaf1, (uint32_t)(thread_tid((struct thread *)(((struct uthread *)(kwe->kwe_uth))->uu_context.vc_thread))), kwe->kwe_psynchretval, 0); #endif /* _PSYNCH_TRACE_ */ - - uth->uu_psynchretval = updatebits; - uth->uu_kwqqueue = NULL; - - kret = ksyn_wakeup_thread(kwq, uth); + + kret = ksyn_wakeup_thread(kwq, kwe); +#if __TESTPANICS__ if ((kret != KERN_SUCCESS) && 
(kret != KERN_NOT_WAITING)) panic("psynch_mutexdrop_internal: panic unable to wakeup firstfit mutex thread\n"); +#endif /* __TESTPANICS__ */ if (kret == KERN_NOT_WAITING) goto redrive; } else { @@ -479,86 +623,124 @@ psynch_mutexdrop_internal(ksyn_wait_queue_t kwq, uint32_t lkseq, uint32_t ugen, low_writer &= PTHRW_COUNT_MASK; if (low_writer == nextgen) { -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, kwq, 3, low_writer, nextgen, 0); -#endif /* _PSYNCH_TRACE_ */ /* next seq to be granted found */ - uth = ksyn_queue_removefirst(&kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], kwq); - if (kwq->kw_ksynqueues[KSYN_QUEUE_WRITER].ksynq_count != 0) - updatebits |= PTHRW_WBIT; + kwe = ksyn_queue_removefirst(&kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], kwq); - uth->uu_psynchretval = updatebits; - uth->uu_kwqqueue = NULL; + /* since the grant could be cv, make sure mutex wait is set incase the thread interrupted out */ + kwe->kwe_psynchretval = updatebits | PTH_RWL_MTX_WAIT; + kwe->kwe_kwqqueue = NULL; - kret = ksyn_wakeup_thread(kwq, uth); +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 0xcafecaf2, (uint32_t)(thread_tid((struct thread *)(((struct uthread *)(kwe->kwe_uth))->uu_context.vc_thread))), kwe->kwe_psynchretval, 0); +#endif /* _PSYNCH_TRACE_ */ + + kret = ksyn_wakeup_thread(kwq, kwe); +#if __TESTPANICS__ if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING)) panic("psynch_mutexdrop_internal: panic unable to wakeup fairshare mutex thread\n"); - if (kret == KERN_NOT_WAITING) - goto redrive; +#endif /* __TESTPANICS__ */ + if (kret == KERN_NOT_WAITING) { + /* interrupt post */ + kwq->kw_pre_intrcount = 1; + kwq->kw_pre_intrseq = nextgen; + kwq->kw_pre_intrretbits = updatebits; + kwq->kw_pre_intrtype = PTH_RW_TYPE_WRITE; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 0xfafafaf1, nextgen, kwq->kw_pre_intrretbits, 0); +#endif /* 
_PSYNCH_TRACE_ */ + } } else if (is_seqhigher(low_writer, nextgen) != 0) { -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, kwq, 4, low_writer, nextgen, 0); -#endif /* _PSYNCH_TRACE_ */ kwq->kw_pre_rwwc++; + + if (kwq->kw_pre_rwwc > 1) { + __FAILEDUSERTEST__("psynch_mutexdrop_internal: prepost more than one (1)\n"); + goto out; + } + kwq->kw_pre_lockseq = (nextgen & PTHRW_COUNT_MASK); - } else { -#if __TESTPANICS__ - panic("psynch_mutexdrop_internal: FS mutex unlock sequence higher than the lowest one is queue\n"); -#endif /* __TESTPANICS__ */ #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, kwq, 5, low_writer, nextgen, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 0xfefefef1, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); #endif /* _PSYNCH_TRACE_ */ - uth = ksyn_queue_find_seq(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], nextgen); - if (uth != NULL) { - /* next seq to be granted found */ + } else { - if (kwq->kw_ksynqueues[KSYN_QUEUE_WRITER].ksynq_count != 0) - updatebits |= PTHRW_WBIT; - + //__FAILEDUSERTEST__("psynch_mutexdrop_internal: FS mutex unlock sequence higher than the lowest one is queue\n"); + + kwe = ksyn_queue_find_seq(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], (nextgen & PTHRW_COUNT_MASK), 1); + if (kwe != NULL) { + /* next seq to be granted found */ + /* since the grant could be cv, make sure mutex wait is set incase the thread interrupted out */ + kwe->kwe_psynchretval = updatebits | PTH_RWL_MTX_WAIT; + kwe->kwe_kwqqueue = NULL; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, kwq, 6, updatebits, 0, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 0xcafecaf3, (uint32_t)(thread_tid((struct thread *)(((struct uthread *)(kwe->kwe_uth))->uu_context.vc_thread))), kwe->kwe_psynchretval, 0); #endif /* _PSYNCH_TRACE_ */ - uth->uu_psynchretval = updatebits; - uth->uu_kwqqueue = 
NULL; - - kret = ksyn_wakeup_thread(kwq, uth); + kret = ksyn_wakeup_thread(kwq, kwe); +#if __TESTPANICS__ if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING)) panic("psynch_mutexdrop_internal: panic unable to wakeup fairshare mutex thread\n"); +#endif /* __TESTPANICS__ */ if (kret == KERN_NOT_WAITING) goto redrive; } else { /* next seq to be granted not found, prepost */ -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, kwq, 7, 0, 0, 0); -#endif /* _PSYNCH_TRACE_ */ kwq->kw_pre_rwwc++; + + if (kwq->kw_pre_rwwc > 1) { + __FAILEDUSERTEST__("psynch_mutexdrop_internal: prepost more than one (2)\n"); + goto out; + } + kwq->kw_pre_lockseq = (nextgen & PTHRW_COUNT_MASK); +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 0xfefefef2, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); +#endif /* _PSYNCH_TRACE_ */ } } } } else { + + /* if firstfit the last one could be spurious */ + if (firstfit == 0) { + kwq->kw_lastunlockseq = (ugen & PTHRW_COUNT_MASK); + kwq->kw_pre_rwwc++; + + if (kwq->kw_pre_rwwc > 1) { + __FAILEDUSERTEST__("psynch_mutexdrop_internal: prepost more than one (3)\n"); + goto out; + } + + kwq->kw_pre_lockseq = (nextgen & PTHRW_COUNT_MASK); #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, kwq, 8, 0, 0, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 0xfefefef3, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); #endif /* _PSYNCH_TRACE_ */ - /* if firstfit the last one could be spurious */ - if ((firstfit == 0) || ((lkseq & PTHRW_COUNT_MASK) != nextgen)) { + } else { + /* first fit case */ #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, kwq, 9, 0, 0, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 0xfefefef3, kwq->kw_lastunlockseq, kwq->kw_pre_lockseq, 0); #endif /* _PSYNCH_TRACE_ */ - kwq->kw_lastunlockseq = ugen; - kwq->kw_pre_rwwc++; - 
kwq->kw_pre_lockseq = (nextgen & PTHRW_COUNT_MASK); + kwq->kw_lastunlockseq = (ugen & PTHRW_COUNT_MASK); + /* not set or the new lkseq is higher */ + if ((kwq->kw_pre_rwwc == 0) || (is_seqlower(kwq->kw_pre_lockseq, lkseq) == 0)) + kwq->kw_pre_lockseq = (lkseq & PTHRW_COUNT_MASK); + kwq->kw_pre_rwwc = 1; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 0xfefefef3, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); +#endif /* _PSYNCH_TRACE_ */ + + /* indicate prepost content in kernel */ + returnbits = lkseq | PTH_RWL_PBIT; } } +out: ksyn_wqunlock(kwq); #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_KMDROP | DBG_FUNC_END, kwq, 0, 0, 0, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_KMDROP | DBG_FUNC_END, (uint32_t)kwq->kw_addr, 0xeeeeeeed, 0, 0, 0); #endif /* _PSYNCH_TRACE_ */ - ksyn_wqrelease(kwq, NULL); - return; + ksyn_wqrelease(kwq, NULL, 1, (KSYN_WQTYPE_INDROP | KSYN_WQTYPE_MTX)); + return(returnbits); } /* @@ -575,19 +757,24 @@ psynch_mutexwait(__unused proc_t p, struct psynch_mutexwait_args * uap, uint32_t int flags = uap->flags; ksyn_wait_queue_t kwq; int error=0; - int ins_flags; + int ins_flags, retry; uthread_t uth; int firstfit = flags & _PTHREAD_MUTEX_POLICY_FIRSTFIT; - uint32_t lockseq, updatebits; - + uint32_t lockseq, updatebits=0; + ksyn_waitq_element_t kwe; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_MLWAIT | DBG_FUNC_START, (uint32_t)mutex, mgen, ugen, flags, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_MLWAIT | DBG_FUNC_START, (uint32_t)mutex, mgen, ugen, flags, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_MLWAIT | DBG_FUNC_NONE, (uint32_t)mutex, mgen, ugen, (uint32_t)tid, 0); #endif /* _PSYNCH_TRACE_ */ uth = current_uthread(); - uth->uu_lockseq = uap->mgen; + kwe = &uth->uu_kwe; + kwe->kwe_lockseq = uap->mgen; + kwe->kwe_uth = uth; + kwe->kwe_psynchretval = 0; + kwe->kwe_kwqqueue = NULL; lockseq = (uap->mgen & PTHRW_COUNT_MASK); if (firstfit == 0) { @@ -600,67 +787,105 @@ 
psynch_mutexwait(__unused proc_t p, struct psynch_mutexwait_args * uap, uint32_t error = ksyn_wqfind(mutex, mgen, ugen, 0, tid, flags, (KSYN_WQTYPE_INWAIT|KSYN_WQTYPE_MTX), &kwq); if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_MLWAIT | DBG_FUNC_END, (uint32_t)mutex, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_MLWAIT | DBG_FUNC_END, (uint32_t)mutex, 1, 0xdeadbeef, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } ksyn_wqlock(kwq); - - if ((kwq->kw_pre_rwwc != 0) && ((ins_flags == FIRSTFIT) || (lockseq == kwq->kw_pre_lockseq ))) { + + if ((mgen & PTH_RWL_RETRYBIT) != 0) { + retry = 1; + mgen &= ~PTH_RWL_RETRYBIT; + } + + /* handle first the missed wakeups */ + if ((kwq->kw_pre_intrcount != 0) && + ((kwq->kw_pre_intrtype == PTH_RW_TYPE_WRITE)) && + (is_seqlower_eq(lockseq, (kwq->kw_pre_intrseq & PTHRW_COUNT_MASK)) != 0)) { + kwq->kw_pre_intrcount--; + kwe->kwe_psynchretval = kwq->kw_pre_intrretbits; + if (kwq->kw_pre_intrcount==0) + CLEAR_INTR_PREPOST_BITS(kwq); + ksyn_wqunlock(kwq); + *retval = kwe->kwe_psynchretval; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_MLWAIT | DBG_FUNC_NONE, (uint32_t)mutex, 0xfafafaf1, kwe->kwe_psynchretval, kwq->kw_pre_intrcount, 0); +#endif /* _PSYNCH_TRACE_ */ + goto out; + } + + if ((kwq->kw_pre_rwwc != 0) && ((ins_flags == FIRSTFIT) || ((lockseq & PTHRW_COUNT_MASK) == (kwq->kw_pre_lockseq & PTHRW_COUNT_MASK) ))) { /* got preposted lock */ kwq->kw_pre_rwwc--; if (kwq->kw_pre_rwwc == 0) { CLEAR_PREPOST_BITS(kwq); - kwq->kw_lastunlockseq = 0; + kwq->kw_lastunlockseq = PTHRW_RWL_INIT; + if (kwq->kw_inqueue == 0) { + updatebits = lockseq | (PTH_RWL_KBIT | PTH_RWL_EBIT); + } else { + updatebits = (kwq->kw_highseq & PTHRW_COUNT_MASK) | (PTH_RWL_KBIT | PTH_RWL_EBIT); + } + updatebits &= ~PTH_RWL_MTX_WAIT; + + kwe->kwe_psynchretval = updatebits; + + if (updatebits == 0) { + __FAILEDUSERTEST__("psynch_mutexwait(prepost): returning 0 lseq in mutexwait with no EBIT \n"); + } + 
ksyn_wqunlock(kwq); + *retval = updatebits; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_MLWAIT | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 0xfefefef1, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); +#endif /* _PSYNCH_TRACE_ */ + goto out; } else { - panic("psynch_mutexwait: more than one prepost %d\n", (kwq->kw_pre_rwwc + 1)); + __FAILEDUSERTEST__("psynch_mutexwait: more than one prepost\n"); kwq->kw_pre_lockseq += PTHRW_INC; /* look for next one */ + ksyn_wqunlock(kwq); + error = EINVAL; + goto out; } - if (kwq->kw_inqueue == 0) { - updatebits = lockseq | PTHRW_EBIT; - } else { - updatebits = (kwq->kw_highseq & PTHRW_COUNT_MASK) | (PTHRW_EBIT | PTHRW_WBIT); - } - - uth->uu_psynchretval = updatebits; -#if __TESTPANICS__ - if ((updatebits & PTHRW_COUNT_MASK) == 0) - panic("psynch_mutexwait: (prepost)returning 0 lseq in mutexwait with EBIT \n"); -#endif /* __TESTPANICS__ */ - ksyn_wqunlock(kwq); - *retval = updatebits; - goto out; } - error = ksyn_queue_insert(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], mgen, uth, ins_flags); - if (error != 0) - panic("psynch_mutexwait: failed to enqueue\n"); +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_MLWAIT | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 0xfeedfeed, mgen, ins_flags, 0); +#endif /* _PSYNCH_TRACE_ */ - error = ksyn_block_thread_locked(kwq, (uint64_t)0, uth); + error = ksyn_queue_insert(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], mgen, uth, kwe, ins_flags); + if (error != 0) { + ksyn_wqunlock(kwq); +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_MLWAIT | DBG_FUNC_END, (uint32_t)mutex, 2, 0xdeadbeef, error, 0); +#endif /* _PSYNCH_TRACE_ */ + goto out; + } + + error = ksyn_block_thread_locked(kwq, (uint64_t)0, kwe, 0); /* drops the wq lock */ if (error != 0) { ksyn_wqlock(kwq); + #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_MLWAIT | DBG_FUNC_NONE, (uint32_t)mutex, 2, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_MLWAIT | DBG_FUNC_NONE, (uint32_t)mutex, 3, 0xdeadbeef, error, 
0); #endif /* _PSYNCH_TRACE_ */ - if (uth->uu_kwqqueue != NULL) - ksyn_queue_removeitem(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], uth); + if (kwe->kwe_kwqqueue != NULL) + ksyn_queue_removeitem(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], kwe); ksyn_wqunlock(kwq); } else { - updatebits = uth->uu_psynchretval; + updatebits = kwe->kwe_psynchretval; + updatebits &= ~PTH_RWL_MTX_WAIT; *retval = updatebits; -#if __TESTPANICS__ - if ((updatebits & PTHRW_COUNT_MASK) == 0) - panic("psynch_mutexwait: returning 0 lseq in mutexwait with EBIT \n"); -#endif /* __TESTPANICS__ */ + + if (updatebits == 0) + __FAILEDUSERTEST__("psynch_mutexwait: returning 0 lseq in mutexwait with no EBIT \n"); } out: - ksyn_wqrelease(kwq, NULL); + ksyn_wqrelease(kwq, NULL, 1, (KSYN_WQTYPE_INWAIT|KSYN_WQTYPE_MTX)); #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_MLWAIT | DBG_FUNC_END, (uint32_t)mutex, 0, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_MLWAIT | DBG_FUNC_END, (uint32_t)mutex, 0xeeeeeeed, updatebits, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); @@ -670,32 +895,26 @@ psynch_mutexwait(__unused proc_t p, struct psynch_mutexwait_args * uap, uint32_t * psynch_mutexdrop: This system call is used for unlock postings on contended psynch mutexes. 
*/ int -psynch_mutexdrop(__unused proc_t p, struct psynch_mutexdrop_args * uap, __unused uint32_t * retval) +psynch_mutexdrop(__unused proc_t p, struct psynch_mutexdrop_args * uap, uint32_t * retval) { user_addr_t mutex = uap->mutex; uint32_t mgen = uap->mgen; - uint32_t lkseq = mgen & PTHRW_COUNT_MASK; uint32_t ugen = uap->ugen; uint64_t tid = uap->tid; int flags = uap->flags; ksyn_wait_queue_t kwq; + uint32_t updateval; int error=0; -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_MLDROP | DBG_FUNC_START, (uint32_t)mutex, mgen, ugen, flags, 0); -#endif /* _PSYNCH_TRACE_ */ - error = ksyn_wqfind(mutex, mgen, ugen, 0, tid, flags, KSYN_WQTYPE_MTX, &kwq); + error = ksyn_wqfind(mutex, mgen, ugen, 0, tid, flags, (KSYN_WQTYPE_INDROP | KSYN_WQTYPE_MTX), &kwq); if (error != 0) { -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_MLDROP | DBG_FUNC_END, (uint32_t)mutex, 1, 0, error, 0); -#endif /* _PSYNCH_TRACE_ */ return(error); } - psynch_mutexdrop_internal(kwq, lkseq, ugen, flags); + + updateval = psynch_mutexdrop_internal(kwq, mgen, ugen, flags); /* drops the kwq reference */ -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_MLDROP | DBG_FUNC_END, (uint32_t)mutex, 0, 0, error, 0); -#endif /* _PSYNCH_TRACE_ */ + + *retval = updateval; return(0); } @@ -704,350 +923,261 @@ psynch_mutexdrop(__unused proc_t p, struct psynch_mutexdrop_args * uap, __unused * psynch_cvbroad: This system call is used for broadcast posting on blocked waiters of psynch cvars. 
*/ int -psynch_cvbroad(__unused proc_t p, struct psynch_cvbroad_args * uap, int * retval) +psynch_cvbroad(__unused proc_t p, struct psynch_cvbroad_args * uap, uint32_t * retval) { user_addr_t cond = uap->cv; - uint32_t cgen = uap->cvgen; - uint32_t diffgen = uap->diffgen; - uint32_t mgen = uap->mgen; + uint64_t cvlsgen = uap->cvlsgen; + uint64_t cvudgen = uap->cvudgen; + uint32_t cgen, cugen, csgen, diffgen; + uint32_t uptoseq, fromseq; int flags = uap->flags; - ksyn_wait_queue_t kwq, ckwq; + ksyn_wait_queue_t ckwq; int error=0; -#if COND_MTX_WAITQUEUEMOVE - int mutexowned = flags & _PTHREAD_MTX_OPT_HOLDLOCK; - int nomutex = flags & _PTHREAD_MTX_OPT_NOHOLDLOCK; - user_addr_t mutex = uap->mutex; - uint32_t ugen = uap->ugen; - uint64_t tid = uap->tid; - uthread_t uth; - kern_return_t kret = KERN_SUCCESS; -#else /* COND_MTX_WAITQUEUEMOVE */ - int nomutex = _PTHREAD_MTX_OPT_NOHOLDLOCK; -#endif /* COND_MTX_WAITQUEUEMOVE */ - uint32_t nextgen, ngen; - int updatebits = 0; + uint32_t updatebits = 0; + uint32_t count; + struct ksyn_queue kfreeq; + + csgen = (uint32_t)((cvlsgen >> 32) & 0xffffffff); + cgen = ((uint32_t)(cvlsgen & 0xffffffff)); + cugen = (uint32_t)((cvudgen >> 32) & 0xffffffff); + diffgen = ((uint32_t)(cvudgen & 0xffffffff)); + count = (diffgen >> PTHRW_COUNT_SHIFT); #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVBROAD | DBG_FUNC_START, (uint32_t)cond, (uint32_t) 0, cgen, mgen, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVBROAD | DBG_FUNC_START, (uint32_t)cond, cgen, cugen, csgen, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVBROAD | DBG_FUNC_NONE, (uint32_t)cond, 0xcbcbcbc1, diffgen,flags, 0); #endif /* _PSYNCH_TRACE_ */ - error = ksyn_wqfind(cond, cgen, cgen, 0, 0, flags, KSYN_WQTYPE_CVAR, &ckwq); + + uptoseq = cgen & PTHRW_COUNT_MASK; + fromseq = (cugen & PTHRW_COUNT_MASK) + PTHRW_INC; + + if (is_seqhigher(fromseq, uptoseq) || is_seqhigher((csgen & PTHRW_COUNT_MASK), uptoseq)) { + __FAILEDUSERTEST__("cvbroad: invalid L, U and S values\n"); + 
return EINVAL; + } + if (count > (uint32_t)task_threadmax) { + __FAILEDUSERTEST__("cvbroad: difference greater than maximum possible thread count\n"); + return EBUSY; + } + + ckwq = NULL; + + error = ksyn_wqfind(cond, cgen, cugen, csgen, 0, flags, (KSYN_WQTYPE_CVAR | KSYN_WQTYPE_INDROP), &ckwq); if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVBROAD | DBG_FUNC_END, (uint32_t)cond, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVBROAD | DBG_FUNC_END, (uint32_t)cond, 0, 0xdeadbeef, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } -#if COND_MTX_WAITQUEUEMOVE - ngen = mgen + (PTHRW_INC * diffgen); - if (nomutex ==0) { - error = ksyn_wqfind(mutex, ngen, ugen, 0, tid, flags, KSYN_WQTYPE_MTX, &kwq); - if (error != 0) { - kwq = NULL; - goto out; - } - } -#else /* COND_MTX_WAITQUEUEMOVE */ - nomutex = _PTHREAD_MTX_OPT_NOHOLDLOCK; - kwq= NULL; - ngen = 0; -#endif /* COND_MTX_WAITQUEUEMOVE */ - + *retval = 0; ksyn_wqlock(ckwq); -#if COND_MTX_WAITQUEUEMOVE -redrive: -#endif /* COND_MTX_WAITQUEUEMOVE */ - if (diffgen > ckwq->kw_inqueue) { - ckwq->kw_pre_rwwc = diffgen - ckwq->kw_inqueue; - ckwq->kw_pre_lockseq = cgen & PTHRW_BIT_MASK; - updatebits = ckwq->kw_pre_rwwc; /* unused mutex refs */ - nextgen = (mgen + (ckwq->kw_pre_rwwc * PTHRW_INC)); - } else { - updatebits = 0; - nextgen = mgen + PTHRW_INC; - } - - if (ckwq->kw_inqueue != 0) { -#if COND_MTX_WAITQUEUEMOVE - if (mutexowned != 0) { -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVBROAD | DBG_FUNC_NONE, (uint32_t)cond, 0, 1, ckwq->kw_inqueue, 0); -#endif /* _PSYNCH_TRACE_ */ - uth = ksyn_queue_removefirst(&ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER],ckwq); - uth->uu_psynchretval = ngen; - uth->uu_kwqqueue = NULL; - kret = ksyn_wakeup_thread(ckwq, uth); - if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING)) - panic("cvbraoad: failed to remove\n"); - if (kret == KERN_NOT_WAITING) { - /* - * trying to wake one thread to return, so if - * failed to wakeup get the next 
one.. - */ - goto redrive; - } - nextgen = nextgen + PTHRW_INC; - diffgen -= 1; - } -#else /* COND_MTX_WAITQUEUEMOVE */ - updatebits = 0; -#endif /* COND_MTX_WAITQUEUEMOVE */ - - /* nomutex case or in mutexowned case after the first one */ - /* move them all to the mutex waitqueue */ - if ((ckwq->kw_inqueue != 0) && (diffgen > 0)) { - /* atleast one more posting needed and there are waiting threads */ - /* drops the ckwq lock */ -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVBROAD | DBG_FUNC_NONE, (uint32_t)cond, 0, 2, diffgen, 0); -#endif /* _PSYNCH_TRACE_ */ - /* move threads from ckwq to kwq if COND_MTX_WAITQUEUEMOVE, else wakeup */ - ksyn_move_wqthread(ckwq, kwq, nextgen, ngen, diffgen, nomutex); - } else - ksyn_wqunlock(ckwq); - } else { - /* no need for prepost as it is covered before */ - ksyn_wqunlock(ckwq); - } + /* update L, U and S... */ + UPDATE_CVKWQ(ckwq, cgen, cugen, csgen, 0, KSYN_WQTYPE_CVAR); - if (error == 0) { - *retval = updatebits; - } + /* broadcast wakeups/prepost handling */ + ksyn_handle_cvbroad(ckwq, uptoseq, &updatebits); -#if COND_MTX_WAITQUEUEMOVE -out: -#endif /* COND_MTX_WAITQUEUEMOVE */ - ksyn_wqrelease(ckwq, kwq); + /* set C or P bits and free if needed */ + ckwq->kw_sword += (updatebits & PTHRW_COUNT_MASK); + ksyn_cvupdate_fixup(ckwq, &updatebits, &kfreeq, 1); + ksyn_wqunlock(ckwq); + + *retval = updatebits; + + ksyn_wqrelease(ckwq, NULL, 1, (KSYN_WQTYPE_INDROP | KSYN_WQTYPE_CVAR)); #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVBROAD | DBG_FUNC_END, (uint32_t)cond, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVBROAD | DBG_FUNC_END, (uint32_t)cond, 0xeeeeeeed, (uint32_t)*retval, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } +ksyn_waitq_element_t +ksyn_queue_find_threadseq(ksyn_wait_queue_t ckwq, __unused ksyn_queue_t kq, thread_t th, uint32_t upto) +{ + uthread_t uth = get_bsdthread_info(th); + ksyn_waitq_element_t kwe = &uth->uu_kwe; + + if (kwe->kwe_kwqqueue != ckwq || + 
is_seqhigher((kwe->kwe_lockseq & PTHRW_COUNT_MASK), upto)) { + /* the thread is not waiting in the cv (or wasn't when the wakeup happened) */ + return NULL; + } + return kwe; +} + /* * psynch_cvsignal: This system call is used for signalling the blocked waiters of psynch cvars. */ int -psynch_cvsignal(__unused proc_t p, struct psynch_cvsignal_args * uap, int * retval) +psynch_cvsignal(__unused proc_t p, struct psynch_cvsignal_args * uap, uint32_t * retval) { user_addr_t cond = uap->cv; - uint32_t cgen = uap->cvgen; + uint64_t cvlsgen = uap->cvlsgen; + uint32_t cgen, csgen, signalseq, uptoseq; uint32_t cugen = uap->cvugen; - uint32_t mgen = uap->mgen; int threadport = uap->thread_port; int flags = uap->flags; - ksyn_wait_queue_t kwq, ckwq; - int error=0, kret; - uthread_t uth; -#if USE_WAITQUEUE - thread_t th = THREAD_NULL, mth; -#else /* USE_WAITQUEUE */ + ksyn_wait_queue_t ckwq = NULL; + ksyn_waitq_element_t kwe, nkwe = NULL; + ksyn_queue_t kq; + int error=0; thread_t th = THREAD_NULL; -#endif /* USE_WAITQUEUE */ -#if COND_MTX_WAITQUEUEMOVE - user_addr_t mutex = uap->mutex; - uint32_t ugen = uap->ugen; - int mutexowned = flags & _PTHREAD_MTX_OPT_HOLDLOCK; - int nomutex = flags & _PTHREAD_MTX_OPT_NOHOLDLOCK; -#else /* COND_MTX_WAITQUEUEMOVE */ - int nomutex = _PTHREAD_MTX_OPT_NOHOLDLOCK; -#endif /* COND_MTX_WAITQUEUEMOVE */ - uint32_t retbits, ngen, lockseq; + uint32_t updatebits = 0; + kern_return_t kret; + struct ksyn_queue kfreeq; - if (nomutex != 0) - retbits = 0; - else - retbits = 1; -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVSIGNAL | DBG_FUNC_START, (uint32_t)cond, (uint32_t) 0, cgen, mgen, 0); - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVSIGNAL | DBG_FUNC_NONE, (uint32_t)cond, (uint32_t)cugen , flags, mgen, 0); -#endif /* _PSYNCH_TRACE_ */ + csgen = (uint32_t)((cvlsgen >> 32) & 0xffffffff); + cgen = ((uint32_t)(cvlsgen & 0xffffffff)); - error = ksyn_wqfind(cond, cgen, cugen, 0, 0, flags, KSYN_WQTYPE_CVAR, &ckwq); - if (error != 0) { - *retval = 
retbits; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVSIGNAL | DBG_FUNC_END, (uint32_t)cond, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSIGNAL | DBG_FUNC_START, (uint32_t)cond, cgen, cugen, threadport, 0); #endif /* _PSYNCH_TRACE_ */ - return(error); - } - - - if ((flags & _PTHREAD_MTX_OPT_LASTDROP) == _PTHREAD_MTX_OPT_LASTDROP) { - - ksyn_wqlock(ckwq); - lockseq = cgen & PTHRW_COUNT_MASK; - /* do we need to check for lockseq as this is from last waiter, may be race ? */ - if ((ckwq->kw_pre_rwwc != 0) && (is_seqlower_eq(lockseq, ckwq->kw_pre_lockseq) != 0)) { - ckwq->kw_pre_rwwc--; - if (ckwq->kw_pre_rwwc == 0) - CLEAR_PREPOST_BITS(ckwq); - } - ksyn_wqunlock(ckwq); - /* no mutex or thread is associated with this, just notificaion */ - th = THREAD_NULL; - error = 0; - goto out; - } - ngen = mgen + PTHRW_INC; + uptoseq = cgen & PTHRW_COUNT_MASK; + signalseq = (cugen & PTHRW_COUNT_MASK) + PTHRW_INC; -#if COND_MTX_WAITQUEUEMOVE - if (nomutex == 0) { - /* mutex was not operated on, ignore it */ - error = ksyn_wqfind(mutex, ngen, ugen, 0, 0, flags, KSYN_WQTYPE_MTX, &kwq); - if (error != 0) { - *retval = retbits; - kwq = NULL; - goto out; - } - } else { -#endif /* COND_MTX_WAITQUEUEMOVE */ - kwq = NULL; -#if COND_MTX_WAITQUEUEMOVE + /* validate sane L, U, and S values */ + if (((threadport == 0) && (is_seqhigher(signalseq, uptoseq))) || is_seqhigher((csgen & PTHRW_COUNT_MASK), uptoseq)) { + __FAILEDUSERTEST__("psync_cvsignal; invalid sequence numbers\n"); + error = EINVAL; + goto out; } -#endif /* COND_MTX_WAITQUEUEMOVE */ - + /* If we are looking for a specific thread, grab a reference for it */ if (threadport != 0) { th = (thread_t)port_name_to_thread((mach_port_name_t)threadport); if (th == THREAD_NULL) { - *retval = retbits; error = ESRCH; goto out; } } - ksyn_wqlock(ckwq); -redrive: - if (ckwq->kw_inqueue != 0) { - *retval = 0; -#if COND_MTX_WAITQUEUEMOVE - if ((mutexowned != 0) || (nomutex != 0)) { + error = ksyn_wqfind(cond, cgen, cugen, 
csgen, 0, flags, (KSYN_WQTYPE_CVAR | KSYN_WQTYPE_INDROP), &ckwq); + if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVSIGNAL | DBG_FUNC_NONE, (uint32_t)cond, 0, 1, ckwq->kw_inqueue, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSIGNAL | DBG_FUNC_END, (uint32_t)cond, 0, 0xdeadbeef, error, 0); +#endif /* _PSYNCH_TRACE_ */ + goto out; + } + + ksyn_wqlock(ckwq); + + /* update L, U and S... */ + UPDATE_CVKWQ(ckwq, cgen, cugen, csgen, 0, KSYN_WQTYPE_CVAR); + + kq = &ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER]; + +retry: + /* Only bother if we aren't already balanced */ + if ((ckwq->kw_lword & PTHRW_COUNT_MASK) != (ckwq->kw_sword & PTHRW_COUNT_MASK)) { + + kwe = (th != NULL) ? ksyn_queue_find_threadseq(ckwq, kq, th, uptoseq) : + ksyn_queue_find_signalseq(ckwq, kq, uptoseq, signalseq); + if (kwe != NULL) { + switch (kwe->kwe_flags) { + + case KWE_THREAD_BROADCAST: + /* broadcasts swallow our signal */ + break; + + case KWE_THREAD_PREPOST: + /* merge in with existing prepost at our same uptoseq */ + kwe->kwe_count += 1; + break; + + case KWE_THREAD_INWAIT: + if (is_seqlower((kwe->kwe_lockseq & PTHRW_COUNT_MASK), signalseq)) { + /* + * A valid thread in our range, but lower than our signal. + * Matching it may leave our match with nobody to wake it if/when + * it arrives (the signal originally meant for this thread might + * not successfully wake it). + * + * Convert to broadcast - may cause some spurious wakeups + * (allowed by spec), but avoids starvation (better choice). 
+ */ +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSIGNAL | DBG_FUNC_NONE, (uint32_t)ckwq->kw_addr, 0xc1c1c1c1, uptoseq, 0, 0); #endif /* _PSYNCH_TRACE_ */ - if (th != THREAD_NULL) { - uth = get_bsdthread_info(th); - if (nomutex != 0) - ngen |= PTHRW_MTX_NONE; - uth->uu_psynchretval = ngen; - uth->uu_kwqqueue = NULL; - ksyn_queue_removeitem(ckwq, &ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER], uth); - kret = ksyn_wakeup_thread(ckwq, uth); - if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING)) - panic("psynch_cvsignal: panic waking in cvsignal\n"); - if (kret == KERN_NOT_WAITING) { - if (threadport != 0) { - error = 0; - } else - goto redrive; - } - } else { - uth = ksyn_queue_removefirst(&ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER],ckwq); - if (nomutex != 0) - ngen |= PTHRW_MTX_NONE; - uth->uu_psynchretval = ngen; - uth->uu_kwqqueue = NULL; - kret = ksyn_wakeup_thread(ckwq, uth); - if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING)) - panic("psynch_cvsignal: panic waking in cvsignal\n"); - if (kret == KERN_NOT_WAITING) { - if (threadport != 0) { - error = 0; - } else - goto redrive; - } - } - ksyn_wqunlock(ckwq); - } else { -#endif /* COND_MTX_WAITQUEUEMOVE */ - /* need to move a thread to another queue */ + ksyn_handle_cvbroad(ckwq, uptoseq, &updatebits); + } else { + ksyn_queue_removeitem(ckwq, kq, kwe); + kwe->kwe_psynchretval = PTH_RWL_MTX_WAIT; + kwe->kwe_kwqqueue = NULL; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVSIGNAL | DBG_FUNC_NONE, (uint32_t)cond, 0, 2, ckwq->kw_inqueue, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSIGNAL | DBG_FUNC_NONE, (uint32_t)ckwq->kw_addr, 0xcafecaf2, (uint32_t)(thread_tid((struct thread *)(((struct uthread *)(kwe->kwe_uth))->uu_context.vc_thread))), kwe->kwe_psynchretval, 0); #endif /* _PSYNCH_TRACE_ */ - if (th != THREAD_NULL) { - uth = get_bsdthread_info(th); - /* if given thread not blocked in cvwait , return error */ - if (uth->uu_kwqqueue != ckwq) { - error = EINVAL; - ksyn_wqunlock(ckwq); - goto 
out; + kret = ksyn_wakeup_thread(ckwq, kwe); +#if __TESTPANICS__ + if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING)) + panic("ksyn_wakeup_thread: panic waking up condition waiter\n"); +#endif /* __TESTPANICS__ */ + updatebits += PTHRW_INC; } - ksyn_queue_removeitem(ckwq, &ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER], uth); - } else { - uth = ksyn_queue_removefirst(&ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER],ckwq); - if (uth == NULL) - panic("cvsign: null uthread after rem"); - } -#if COND_MTX_WAITQUEUEMOVE - ksyn_wqunlock(ckwq); -#else /* COND_MTX_WAITQUEUEMOVE */ - uth->uu_psynchretval = 0; - uth->uu_kwqqueue = NULL; - kret = ksyn_wakeup_thread(ckwq, uth); - if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING)) - panic("psynch_cvsignal: panic waking in cvsignal\n"); - if (kret == KERN_NOT_WAITING) { - error = 0; - if (threadport == 0) - goto redrive; + + ckwq->kw_sword += (updatebits & PTHRW_COUNT_MASK); + break; + + default: + panic("unknown kweflags\n"); + break; } - + + } else if (th != NULL) { + /* + * Could not find the thread, post a broadcast, + * otherwise the waiter will be stuck. This used to send + * ESRCH here, which led to rare hangs. 
+ */ + ksyn_handle_cvbroad(ckwq, uptoseq, &updatebits); + ckwq->kw_sword += (updatebits & PTHRW_COUNT_MASK); + } else if (nkwe == NULL) { ksyn_wqunlock(ckwq); - error = 0; -#endif /* COND_MTX_WAITQUEUEMOVE */ - -#if COND_MTX_WAITQUEUEMOVE - ksyn_wqlock(kwq); - ksyn_queue_insert(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], ngen, uth, SEQFIT); -#if USE_WAITQUEUE - kret = wait_queue_move_thread(&ckwq->kw_wq, ckwq->kw_addr, th, &kwq->kw_wq, kwq->kw_addr, &mth); - if (kret == KERN_SUCCESS) { - if (mth != THREAD_NULL) { - uth = (struct uthread *)get_bsdthread_info(mth); - uth->uu_lockseq = ngen; - TAILQ_INSERT_TAIL(&kwq->kw_ksynqueues[KSYN_QUEUE_WRITER].ksynq_uthlist, uth, uu_mtxlist); - } - } -#else /* USE_WAITQUEUE */ - /* no need to move anything, just update the sequence */ - uth->uu_lockseq = ngen; - -#endif /* USE_WAITQUEUE */ - ksyn_wqunlock(kwq); - } -#endif /* COND_MTX_WAITQUEUEMOVE */ - } else { - /* prepost */ + nkwe = (ksyn_waitq_element_t)zalloc(kwe_zone); + ksyn_wqlock(ckwq); + goto retry; + + } else { + /* no eligible entries - add prepost */ + bzero(nkwe, sizeof(struct ksyn_waitq_element)); + nkwe->kwe_kwqqueue = ckwq; + nkwe->kwe_flags = KWE_THREAD_PREPOST; + nkwe->kwe_lockseq = uptoseq; + nkwe->kwe_count = 1; + nkwe->kwe_uth = NULL; + nkwe->kwe_psynchretval = 0; + #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVSIGNAL | DBG_FUNC_NONE, (uint32_t)cond, 0, 3, ckwq->kw_inqueue, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSIGNAL | DBG_FUNC_NONE, (uint32_t)ckwq->kw_addr, 0xfeedfefe, uptoseq, 0, 0); #endif /* _PSYNCH_TRACE_ */ - if (threadport != 0) { - error = EINVAL; - ksyn_wqunlock(ckwq); - goto out; + + (void)ksyn_queue_insert(ckwq, &ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER], uptoseq, NULL, nkwe, SEQFIT); + ckwq->kw_fakecount++; + nkwe = NULL; } - - ckwq->kw_pre_rwwc++; - ckwq->kw_attq = kwq; - ckwq->kw_pre_lockseq = cgen & PTHRW_BIT_MASK; - ckwq->kw_pre_cvretval = ngen; - *retval = retbits; - ksyn_wqunlock(ckwq); + + /* set C or P bits and free if 
needed */ + ksyn_cvupdate_fixup(ckwq, &updatebits, &kfreeq, 1); } - /* ckwq is unlocked here */ - + + ksyn_wqunlock(ckwq); + if (nkwe != NULL) + zfree(kwe_zone, nkwe); + + ksyn_wqrelease(ckwq, NULL, 1, (KSYN_WQTYPE_INDROP | KSYN_WQTYPE_CVAR)); + out: - ksyn_wqrelease(ckwq, kwq); - if (th != THREAD_NULL) + if (th != NULL) thread_deallocate(th); + if (error == 0) + *retval = updatebits; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVSIGNAL | DBG_FUNC_END, (uint32_t)cond, 0, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSIGNAL | DBG_FUNC_END, (uint32_t)cond, 0xeeeeeeed, updatebits, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); @@ -1060,112 +1190,318 @@ int psynch_cvwait(__unused proc_t p, struct psynch_cvwait_args * uap, uint32_t * retval) { user_addr_t cond = uap->cv; - uint32_t cgen = uap->cvgen; + uint64_t cvlsgen = uap->cvlsgen; + uint32_t cgen, csgen; uint32_t cugen = uap->cvugen; user_addr_t mutex = uap->mutex; - uint32_t mgen =0, ugen; - int flags = 0; + uint64_t mugen = uap->mugen; + uint32_t mgen, ugen; + int flags = uap->flags; ksyn_wait_queue_t kwq, ckwq; - int error=0; + int error=0, local_error = 0; uint64_t abstime = 0; - uint32_t lockseq, updatebits; + uint32_t lockseq, updatebits=0; struct timespec ts; uthread_t uth; - + ksyn_waitq_element_t kwe, nkwe = NULL; + struct ksyn_queue *kq, kfreeq; +#if __TESTPANICS__ + //int timeoutval = 3; /* 3 secs */ + //u_int64_t ntime = 0; +#endif /* __TESTPANICS__ */ + /* for conformance reasons */ __pthread_testcancel(0); + csgen = (uint32_t)((cvlsgen >> 32) & 0xffffffff); + cgen = ((uint32_t)(cvlsgen & 0xffffffff)); + ugen = (uint32_t)((mugen >> 32) & 0xffffffff); + mgen = ((uint32_t)(mugen & 0xffffffff)); + #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVWAIT | DBG_FUNC_START, (uint32_t)cond, (uint32_t) mutex, cgen, mgen, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVWAIT | DBG_FUNC_START, (uint32_t)cond, cgen, cugen, csgen, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVWAIT | 
DBG_FUNC_NONE, (uint32_t)mutex, mgen, ugen, flags, 0); #endif /* _PSYNCH_TRACE_ */ - flags = 0; - if ((uap->usec & 0xc0000000) != 0) { - if (uap->usec & 0x40000000) - flags |= PTHREAD_PROCESS_SHARED; - if (uap->usec & 0x80000000) - flags |= _PTHREAD_MUTEX_POLICY_FIRSTFIT; + + lockseq = (cgen & PTHRW_COUNT_MASK); + /* + * In cvwait U word can be out of range as cond could be used only for + * timeouts. However S word needs to be within bounds and validated at + * user level as well. + */ + if (is_seqhigher_eq((csgen & PTHRW_COUNT_MASK), lockseq) != 0) { + __FAILEDUSERTEST__("psync_cvwait; invalid sequence numbers\n"); + return EINVAL; } - - error = ksyn_wqfind(cond, cgen, cugen, 0, 0, flags, KSYN_WQTYPE_CVAR | KSYN_WQTYPE_INWAIT, &ckwq); + + ckwq = kwq = NULL; + error = ksyn_wqfind(cond, cgen, cugen, csgen, 0, flags, KSYN_WQTYPE_CVAR | KSYN_WQTYPE_INWAIT, &ckwq); if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVWAIT | DBG_FUNC_END, (uint32_t)cond, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVWAIT | DBG_FUNC_END, (uint32_t)cond, 1, 0xdeadbeef, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } - if (mutex != (user_addr_t)0) { - mgen = uap->mgen; - ugen = uap->ugen; +#if __TESTPANICS__ + //clock_interval_to_deadline(timeoutval, NSEC_PER_SEC, &ntime); +#endif /* __TESTPANICS__ */ - error = ksyn_wqfind(mutex, mgen, ugen, 0, 0, flags, KSYN_WQTYPE_MTX, &kwq); { - if (error != 0) + if (mutex != (user_addr_t)0) { + error = ksyn_wqfind(mutex, mgen, ugen, 0, 0, flags, (KSYN_WQTYPE_INDROP | KSYN_WQTYPE_MTX), &kwq); + if (error != 0) { + local_error = error; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVWAIT | DBG_FUNC_END, (uint32_t)mutex, 2, 0xdeadbeef, error, 0); +#endif /* _PSYNCH_TRACE_ */ goto out; } - psynch_mutexdrop_internal(kwq, mgen, ugen, flags); + (void)psynch_mutexdrop_internal(kwq, mgen, ugen, flags); /* drops kwq reference */ + kwq = NULL; } - uth = current_uthread(); - uth->uu_lockseq = cgen; - lockseq 
= (cgen & PTHRW_COUNT_MASK); - - if (uap->sec != 0 || (uap->usec & 0x3fffffff) != 0) { + if (uap->sec != 0 || (uap->nsec & 0x3fffffff) != 0) { ts.tv_sec = uap->sec; - ts.tv_nsec = (uap->usec & 0xc0000000); - nanoseconds_to_absolutetime((uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec, &abstime ); - clock_absolutetime_interval_to_deadline( abstime, &abstime ); + ts.tv_nsec = (uap->nsec & 0x3fffffff); + nanoseconds_to_absolutetime((uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec, &abstime ); + clock_absolutetime_interval_to_deadline( abstime, &abstime ); } + ksyn_wqlock(ckwq); - if ((ckwq->kw_pre_rwwc != 0) && (is_seqlower_eq(lockseq, ckwq->kw_pre_lockseq) != 0)) { + + /* update L, U and S... */ + UPDATE_CVKWQ(ckwq, cgen, cugen, csgen, 0, KSYN_WQTYPE_CVAR); + + /* Look for the sequence for prepost (or conflicting thread) */ + kq = &ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER]; + kwe = ksyn_queue_find_cvpreposeq(kq, lockseq); + + if (kwe != NULL) { + switch (kwe->kwe_flags) { + + case KWE_THREAD_INWAIT: + ksyn_wqunlock(ckwq); + __FAILEDUSERTEST__("cvwait: thread entry with same sequence already present\n"); + local_error = EBUSY; + goto out; + + case KWE_THREAD_BROADCAST: + break; + + case KWE_THREAD_PREPOST: + if ((kwe->kwe_lockseq & PTHRW_COUNT_MASK) == lockseq) { + /* we can safely consume a reference, so do so */ + if (--kwe->kwe_count == 0) { + ksyn_queue_removeitem(ckwq, kq, kwe); + ckwq->kw_fakecount--; + nkwe = kwe; + } + } else { + /* + * consuming a prepost higher than our lock sequence is valid, but + * can leave the higher thread without a match. Convert the entry + * to a broadcast to compensate for this. 
+ */ #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVWAIT | DBG_FUNC_NONE, (uint32_t)cond, 0, 1, 0, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVWAIT | DBG_FUNC_NONE, (uint32_t)ckwq->kw_addr, 0xc2c2c2c2, kwe->kwe_lockseq, 0, 0); #endif /* _PSYNCH_TRACE_ */ + + ksyn_handle_cvbroad(ckwq, kwe->kwe_lockseq, &updatebits); +#if __TESTPANICS__ + if (updatebits != 0) + panic("psync_cvwait: convert pre-post to broadcast: woke up %d threads that shouldn't be there\n", + updatebits); +#endif /* __TESTPANICS__ */ + } + + break; -#if COND_MTX_WAITQUEUEMOVE - updatebits = ckwq->kw_pre_cvretval | PTHRW_MTX_NONE; -#else /* COND_MTX_WAITQUEUEMOVE */ - updatebits = 0; -#endif /* COND_MTX_WAITQUEUEMOVE */ - ckwq->kw_pre_rwwc--; - if (ckwq->kw_pre_rwwc == 0) - CLEAR_PREPOST_BITS(ckwq); - *retval = updatebits; + default: + panic("psync_cvwait: unexpected wait queue element type\n"); + } + +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVWAIT | DBG_FUNC_NONE, (uint32_t)ckwq->kw_addr, 0xfefefefe, kwe->kwe_lockseq, 0, 0); +#endif /* _PSYNCH_TRACE_ */ + + + updatebits = PTHRW_INC; + ckwq->kw_sword += PTHRW_INC; + + /* set C or P bits and free if needed */ + ksyn_cvupdate_fixup(ckwq, &updatebits, &kfreeq, 1); + error = 0; + local_error = 0; + + *retval = updatebits; + ksyn_wqunlock(ckwq); + + if (nkwe != NULL) + zfree(kwe_zone, nkwe); + goto out; + + } - } else { + uth = current_uthread(); + kwe = &uth->uu_kwe; + kwe->kwe_kwqqueue = ckwq; + kwe->kwe_flags = KWE_THREAD_INWAIT; + kwe->kwe_lockseq = lockseq; + kwe->kwe_count = 1; + kwe->kwe_uth = uth; + kwe->kwe_psynchretval = 0; + #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVWAIT | DBG_FUNC_NONE, (uint32_t)cond, 0, 2, cgen, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVWAIT | DBG_FUNC_NONE, (uint32_t)ckwq->kw_addr, 0xfeedfeed, cgen, 0, 0); #endif /* _PSYNCH_TRACE_ */ - error = ksyn_queue_insert(ckwq, &ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER], cgen, uth, FIRSTFIT); - if (error != 0) - panic("psynch_cvwait: failed 
to enqueue\n"); - error = ksyn_block_thread_locked(ckwq, abstime, uth); - /* drops the lock */ + + error = ksyn_queue_insert(ckwq, kq, cgen, uth, kwe, SEQFIT); + if (error != 0) { + ksyn_wqunlock(ckwq); + local_error = error; + goto out; } + +#if 0 /* __TESTPANICS__ */ + /* if no timeout is passed, set 5 secs timeout to catch hangs */ + error = ksyn_block_thread_locked(ckwq, (abstime == 0) ? ntime : abstime, kwe, 1); +#else + error = ksyn_block_thread_locked(ckwq, abstime, kwe, 1); +#endif /* __TESTPANICS__ */ + /* lock dropped */ + + local_error = error; if (error != 0) { ksyn_wqlock(ckwq); + /* just in case it got woken up as we were granting */ + *retval = kwe->kwe_psynchretval; + +#if __TESTPANICS__ + if ((kwe->kwe_kwqqueue != NULL) && (kwe->kwe_kwqqueue != ckwq)) + panic("cvwait waiting on some other kwq\n"); + +#endif /* __TESTPANICS__ */ + + + if (kwe->kwe_kwqqueue != NULL) { + ksyn_queue_removeitem(ckwq, &ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER], kwe); + kwe->kwe_kwqqueue = NULL; + } + if ((kwe->kwe_psynchretval & PTH_RWL_MTX_WAIT) != 0) { + /* the condition var granted. + * reset the error so that the thread returns back. + */ + local_error = 0; + /* no need to set any bits just return as cvsig/broad covers this */ + ksyn_wqunlock(ckwq); + *retval = 0; + goto out; + } + + ckwq->kw_sword += PTHRW_INC; + + /* set C and P bits, in the local error as well as updatebits */ + if ((ckwq->kw_lword & PTHRW_COUNT_MASK) == (ckwq->kw_sword & PTHRW_COUNT_MASK)) { + updatebits |= PTH_RWS_CV_CBIT; + local_error |= ECVCERORR; + if (ckwq->kw_inqueue != 0) { + (void)ksyn_queue_move_tofree(ckwq, kq, (ckwq->kw_lword & PTHRW_COUNT_MASK), &kfreeq, 1, 1); + } + ckwq->kw_lword = ckwq->kw_uword = ckwq->kw_sword = 0; + ckwq->kw_kflags |= KSYN_KWF_ZEROEDOUT; + } else { + /* everything in the queue is a fake entry ? 
*/ + if ((ckwq->kw_inqueue != 0) && (ckwq->kw_fakecount == ckwq->kw_inqueue)) { + updatebits |= PTH_RWS_CV_PBIT; + local_error |= ECVPERORR; + } + } + ksyn_wqunlock(ckwq); + + } else { + /* PTH_RWL_MTX_WAIT is removed */ + if ((kwe->kwe_psynchretval & PTH_RWS_CV_MBIT) != 0) + *retval = PTHRW_INC | PTH_RWS_CV_CBIT; + else + *retval = 0; + local_error = 0; + } +out: +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVWAIT | DBG_FUNC_END, (uint32_t)cond, 0xeeeeeeed, (uint32_t)*retval, local_error, 0); +#endif /* _PSYNCH_TRACE_ */ + ksyn_wqrelease(ckwq, NULL, 1, (KSYN_WQTYPE_INWAIT | KSYN_WQTYPE_CVAR)); + return(local_error); +} + +/* + * psynch_cvclrprepost: This system call clears pending prepost if present. + */ +int +psynch_cvclrprepost(__unused proc_t p, struct psynch_cvclrprepost_args * uap, __unused int * retval) +{ + user_addr_t cond = uap->cv; + uint32_t cgen = uap->cvgen; + uint32_t cugen = uap->cvugen; + uint32_t csgen = uap->cvsgen; + uint32_t pseq = uap->preposeq; + uint32_t flags = uap->flags; + int error; + ksyn_wait_queue_t ckwq = NULL; + struct ksyn_queue kfreeq; + #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVWAIT | DBG_FUNC_NONE, (uint32_t)cond, 0, 3, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CLRPRE | DBG_FUNC_START, (uint32_t)cond, cgen, cugen, csgen, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CLRPRE | DBG_FUNC_NONE, (uint32_t)cond, 0xcececece, pseq, flags, 0); #endif /* _PSYNCH_TRACE_ */ - if (uth->uu_kwqqueue != NULL) { - ksyn_queue_removeitem(ckwq, &ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER], uth); + + if ((flags & _PTHREAD_MTX_OPT_MUTEX) == 0) { + error = ksyn_wqfind(cond, cgen, cugen, csgen, 0, flags, (KSYN_WQTYPE_CVAR | KSYN_WQTYPE_INDROP), &ckwq); + if (error != 0) { + *retval = 0; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CLRPRE | DBG_FUNC_END, (uint32_t)cond, 0, 0xdeadbeef, error, 0); +#endif /* _PSYNCH_TRACE_ */ + return(error); } + + ksyn_wqlock(ckwq); + (void)ksyn_queue_move_tofree(ckwq, 
&ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER], (pseq & PTHRW_COUNT_MASK), &kfreeq, 0, 1); ksyn_wqunlock(ckwq); - } else { - *retval = uth->uu_psynchretval; + ksyn_wqrelease(ckwq, NULL, 1, (KSYN_WQTYPE_CVAR | KSYN_WQTYPE_INDROP)); + } else { + /* mutex type */ + error = ksyn_wqfind(cond, cgen, cugen, 0, 0, flags, (KSYN_WQTYPE_MTX | KSYN_WQTYPE_INDROP), &ckwq); + if (error != 0) { + *retval = 0; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CLRPRE | DBG_FUNC_END, (uint32_t)cond, 0, 0xdeadbeef, error, 0); +#endif /* _PSYNCH_TRACE_ */ + return(error); + } + ksyn_wqlock(ckwq); + if (((flags & _PTHREAD_MUTEX_POLICY_FIRSTFIT) != 0) && (ckwq->kw_pre_rwwc != 0)) { + if (is_seqlower_eq(ckwq->kw_pre_lockseq, cgen) != 0) { + /* clear prepost */ + ckwq->kw_pre_rwwc = 0; + ckwq->kw_pre_lockseq = 0; + } + } + ksyn_wqunlock(ckwq); + ksyn_wqrelease(ckwq, NULL, 1, (KSYN_WQTYPE_MTX | KSYN_WQTYPE_INDROP)); } -out: + #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVWAIT | DBG_FUNC_END, (uint32_t)cond, 0, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CLRPRE | DBG_FUNC_END, (uint32_t)cond, 0xeeeeeeed, 0, 0, 0); #endif /* _PSYNCH_TRACE_ */ - ksyn_wqrelease(ckwq, NULL); - return(error); + return(0); } /* ***************** pthread_rwlock ************************ */ @@ -1182,67 +1518,106 @@ psynch_rw_rdlock(__unused proc_t p, struct psynch_rw_rdlock_args * uap, uint32_t //uint64_t tid = uap->tid; int flags = uap->flags; int error = 0, block; - uint32_t lockseq = 0, updatebits = 0, preseq = 0; + uint32_t lockseq = 0, updatebits = 0, preseq = 0, prerw_wc = 0; ksyn_wait_queue_t kwq; uthread_t uth; + int isinit = lgen & PTHRW_RWL_INIT; + uint32_t returnbits = 0; + ksyn_waitq_element_t kwe; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_START, (uint32_t)rwlock, lgen, ugen, rw_wc, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_START, (uint32_t)rwlock, lgen, ugen, rw_wc, 0); #endif /* _PSYNCH_TRACE_ */ uth = current_uthread(); /* 
preserve the seq number */ - uth->uu_lockseq = lgen; + kwe = &uth->uu_kwe; + kwe->kwe_lockseq = lgen; + kwe->kwe_uth = uth; + kwe->kwe_psynchretval = 0; + kwe->kwe_kwqqueue = NULL; + lockseq = lgen & PTHRW_COUNT_MASK; + error = ksyn_wqfind(rwlock, lgen, ugen, rw_wc, TID_ZERO, flags, (KSYN_WQTYPE_INWAIT|KSYN_WQTYPE_RWLOCK), &kwq); if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } ksyn_wqlock(kwq); + if (isinit != 0) { + lgen &= ~PTHRW_RWL_INIT; + if ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) == 0) { + /* first to notice the reset of the lock, clear preposts */ + CLEAR_REINIT_BITS(kwq); + kwq->kw_kflags |= KSYN_KWF_INITCLEARED; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSEQ | DBG_FUNC_NONE, lgen, ugen, rw_wc, 1, 0); +#endif /* _PSYNCH_TRACE_ */ + } + } + /* handle first the missed wakeups */ if ((kwq->kw_pre_intrcount != 0) && ((kwq->kw_pre_intrtype == PTH_RW_TYPE_READ) || (kwq->kw_pre_intrtype == PTH_RW_TYPE_LREAD)) && (is_seqlower_eq(lockseq, (kwq->kw_pre_intrseq & PTHRW_COUNT_MASK)) != 0)) { kwq->kw_pre_intrcount--; - uth->uu_psynchretval = kwq->kw_pre_intrretbits; + kwe->kwe_psynchretval = kwq->kw_pre_intrretbits; if (kwq->kw_pre_intrcount==0) CLEAR_INTR_PREPOST_BITS(kwq); ksyn_wqunlock(kwq); goto out; } - /* handle unlock2/downgrade first */ - if ((kwq->kw_pre_limrd != 0) && (is_seqlower_eq(lockseq, (kwq->kw_pre_limrdseq & PTHRW_COUNT_MASK)) != 0)) { + /* handle overlap first as they are not counted against pre_rwwc */ + + /* check for overlap and if no pending W bit (indicates writers) */ + if ((kwq->kw_overlapwatch != 0) && ((rw_wc & PTHRW_RWS_SAVEMASK) == 0) && ((lgen & PTH_RWL_WBIT) == 0)) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 1, kwq->kw_pre_limrd, 
kwq->kw_pre_limrdseq, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 10, kwq->kw_nextseqword, kwq->kw_lastseqword, 0); #endif /* _PSYNCH_TRACE_ */ - kwq->kw_pre_limrd--; - /* acquired the locks, so return */ - uth->uu_psynchretval = kwq->kw_pre_limrdbits; - if (kwq->kw_pre_limrd == 0) - CLEAR_READ_PREPOST_BITS(kwq); - ksyn_wqunlock(kwq); - goto out; + error = kwq_handle_overlap(kwq, lgen, ugen, rw_wc, &updatebits, (KW_UNLOCK_PREPOST_READLOCK|KW_UNLOCK_PREPOST), &block); +#if __TESTPANICS__ + if (error != 0) + panic("rw_rdlock: kwq_handle_overlap failed %d\n",error); +#endif /* __TESTPANICS__ */ + if (block == 0) { + error = 0; + kwe->kwe_psynchretval = updatebits; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 0xff, updatebits, 0xee, 0); +#endif /* _PSYNCH_TRACE_ */ + ksyn_wqunlock(kwq); + goto out; + } } if ((kwq->kw_pre_rwwc != 0) && (is_seqlower_eq(lockseq, (kwq->kw_pre_lockseq & PTHRW_COUNT_MASK)) != 0)) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 2, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 2, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); #endif /* _PSYNCH_TRACE_ */ kwq->kw_pre_rwwc--; if (kwq->kw_pre_rwwc == 0) { preseq = kwq->kw_pre_lockseq; + prerw_wc = kwq->kw_pre_sseq; CLEAR_PREPOST_BITS(kwq); - error = kwq_handle_unlock(kwq, preseq, &updatebits, (KW_UNLOCK_PREPOST_READLOCK|KW_UNLOCK_PREPOST), &block, lgen); + if ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) != 0){ + kwq->kw_kflags &= ~KSYN_KWF_INITCLEARED; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSEQ | DBG_FUNC_NONE, lgen, ugen, rw_wc, 0, 0); +#endif /* _PSYNCH_TRACE_ */ + } + error = kwq_handle_unlock(kwq, preseq, prerw_wc, &updatebits, (KW_UNLOCK_PREPOST_READLOCK|KW_UNLOCK_PREPOST), &block, lgen); +#if __TESTPANICS__ if (error != 0) - panic("kwq_handle_unlock 
failed %d\n",error); + panic("rw_rdlock: kwq_handle_unlock failed %d\n",error); +#endif /* __TESTPANICS__ */ if (block == 0) { ksyn_wqunlock(kwq); goto out; @@ -1251,31 +1626,35 @@ psynch_rw_rdlock(__unused proc_t p, struct psynch_rw_rdlock_args * uap, uint32_t } } + #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 3, 0, 0, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 3, 0, 0, 0); #endif /* _PSYNCH_TRACE_ */ - error = ksyn_queue_insert(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_READ], lgen, uth, SEQFIT); + error = ksyn_queue_insert(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_READ], lgen, uth, kwe, SEQFIT); +#if __TESTPANICS__ if (error != 0) panic("psynch_rw_rdlock: failed to enqueue\n"); - error = ksyn_block_thread_locked(kwq, (uint64_t)0, uth); +#endif /* __TESTPANICS__ */ + error = ksyn_block_thread_locked(kwq, (uint64_t)0, kwe, 0); /* drops the kwq lock */ out: if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 4, error, 0, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 4, error, 0, 0); #endif /* _PSYNCH_TRACE_ */ ksyn_wqlock(kwq); - if (uth->uu_kwqqueue != NULL) - ksyn_queue_removeitem(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_READ], uth); + if (kwe->kwe_kwqqueue != NULL) + ksyn_queue_removeitem(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_READ], kwe); ksyn_wqunlock(kwq); } else { /* update bits */ - *retval = uth->uu_psynchretval; + *retval = kwe->kwe_psynchretval; + returnbits = kwe->kwe_psynchretval; } - ksyn_wqrelease(kwq, NULL); + ksyn_wqrelease(kwq, NULL, 0, (KSYN_WQTYPE_INWAIT|KSYN_WQTYPE_RWLOCK)); #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, returnbits, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } @@ -1284,7 +1663,7 @@ 
psynch_rw_rdlock(__unused proc_t p, struct psynch_rw_rdlock_args * uap, uint32_t * psynch_rw_longrdlock: This system call is used for psync rwlock long readers to block. */ int -psynch_rw_longrdlock(__unused proc_t p, struct psynch_rw_longrdlock_args * uap, uint32_t * retval) +psynch_rw_longrdlock(__unused proc_t p, __unused struct psynch_rw_longrdlock_args * uap, __unused uint32_t * retval) { user_addr_t rwlock = uap->rwlock; uint32_t lgen = uap->lgenval; @@ -1292,65 +1671,82 @@ psynch_rw_longrdlock(__unused proc_t p, struct psynch_rw_longrdlock_args * uap, uint32_t rw_wc = uap->rw_wc; //uint64_t tid = uap->tid; int flags = uap->flags; + int isinit = lgen & PTHRW_RWL_INIT; + uint32_t returnbits=0; + ksyn_waitq_element_t kwe; ksyn_wait_queue_t kwq; int error=0, block = 0 ; uthread_t uth; - uint32_t lockseq = 0, updatebits = 0, preseq = 0; + uint32_t lockseq = 0, updatebits = 0, preseq = 0, prerw_wc = 0; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWLRDLOCK | DBG_FUNC_START, (uint32_t)rwlock, lgen, ugen, rw_wc, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWLRDLOCK | DBG_FUNC_START, (uint32_t)rwlock, lgen, ugen, rw_wc, 0); #endif /* _PSYNCH_TRACE_ */ uth = current_uthread(); - - uth->uu_lockseq = lgen; + kwe = &uth->uu_kwe; + kwe->kwe_lockseq = lgen; + kwe->kwe_uth = uth; + kwe->kwe_psynchretval = 0; + kwe->kwe_kwqqueue = NULL; lockseq = (lgen & PTHRW_COUNT_MASK); - + error = ksyn_wqfind(rwlock, lgen, ugen, rw_wc, TID_ZERO, flags, (KSYN_WQTYPE_INWAIT|KSYN_WQTYPE_RWLOCK), &kwq); if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWLRDLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWLRDLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } ksyn_wqlock(kwq); + if (isinit != 0) { + lgen &= ~PTHRW_RWL_INIT; + if ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) == 0) { + /* first to notice the reset of the lock, clear preposts */ + CLEAR_REINIT_BITS(kwq); 
+ kwq->kw_kflags |= KSYN_KWF_INITCLEARED; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSEQ | DBG_FUNC_NONE, lgen, ugen, rw_wc, 1, 0); +#endif /* _PSYNCH_TRACE_ */ + } + } + /* handle first the missed wakeups */ if ((kwq->kw_pre_intrcount != 0) && (kwq->kw_pre_intrtype == PTH_RW_TYPE_LREAD) && (is_seqlower_eq(lockseq, (kwq->kw_pre_intrseq & PTHRW_COUNT_MASK)) != 0)) { kwq->kw_pre_intrcount--; - uth->uu_psynchretval = kwq->kw_pre_intrretbits; + kwe->kwe_psynchretval = kwq->kw_pre_intrretbits; if (kwq->kw_pre_intrcount==0) CLEAR_INTR_PREPOST_BITS(kwq); ksyn_wqunlock(kwq); goto out; } - /* handle unlock2/downgrade first */ - if ((kwq->kw_pre_limrd != 0) && (is_seqlower_eq(lockseq, (kwq->kw_pre_limrdseq & PTHRW_COUNT_MASK)) != 0)) { -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWLRDLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 1, kwq->kw_pre_limrd, kwq->kw_pre_limrdseq, 0); -#endif /* _PSYNCH_TRACE_ */ - kwq->kw_pre_limrd--; - if (kwq->kw_pre_limrd == 0) - CLEAR_READ_PREPOST_BITS(kwq); - /* not a read proceed */ - } if ((kwq->kw_pre_rwwc != 0) && (is_seqlower_eq(lockseq, (kwq->kw_pre_lockseq & PTHRW_COUNT_MASK)) != 0)) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWLRDLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 2, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWLRDLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 2, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); #endif /* _PSYNCH_TRACE_ */ kwq->kw_pre_rwwc--; if (kwq->kw_pre_rwwc == 0) { preseq = kwq->kw_pre_lockseq; + prerw_wc = kwq->kw_pre_sseq; CLEAR_PREPOST_BITS(kwq); - error = kwq_handle_unlock(kwq, preseq, &updatebits, (KW_UNLOCK_PREPOST_LREADLOCK|KW_UNLOCK_PREPOST), &block, lgen); + if ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) != 0){ + kwq->kw_kflags &= ~KSYN_KWF_INITCLEARED; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSEQ | DBG_FUNC_NONE, lgen, ugen, rw_wc, 0, 0); +#endif /* _PSYNCH_TRACE_ */ + } + error = kwq_handle_unlock(kwq, preseq, 
prerw_wc, &updatebits, (KW_UNLOCK_PREPOST_LREADLOCK|KW_UNLOCK_PREPOST), &block, lgen); +#if __TESTPANICS__ if (error != 0) panic("kwq_handle_unlock failed %d\n",error); +#endif /* __TESTPANICS__ */ if (block == 0) { ksyn_wqunlock(kwq); goto out; @@ -1360,32 +1756,35 @@ psynch_rw_longrdlock(__unused proc_t p, struct psynch_rw_longrdlock_args * uap, } #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWLRDLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 3, 0, 0, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWLRDLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 3, 0, 0, 0); #endif /* _PSYNCH_TRACE_ */ - error = ksyn_queue_insert(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_LREAD], lgen, uth, SEQFIT); + error = ksyn_queue_insert(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_LREAD], lgen, uth, kwe, SEQFIT); +#if __TESTPANICS__ if (error != 0) panic("psynch_rw_longrdlock: failed to enqueue\n"); +#endif /* __TESTPANICS__ */ - error = ksyn_block_thread_locked(kwq, (uint64_t)0, uth); + error = ksyn_block_thread_locked(kwq, (uint64_t)0, kwe, 0); /* drops the kwq lock */ out: if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWLRDLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWLRDLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); #endif /* _PSYNCH_TRACE_ */ ksyn_wqlock(kwq); - if (uth->uu_kwqqueue != NULL) - ksyn_queue_removeitem(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_LREAD], uth); + if (kwe->kwe_kwqqueue != NULL) + ksyn_queue_removeitem(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_LREAD], kwe); ksyn_wqunlock(kwq); } else { /* update bits */ - *retval = uth->uu_psynchretval; + *retval = kwe->kwe_psynchretval; + returnbits = kwe->kwe_psynchretval; } - ksyn_wqrelease(kwq, NULL); + ksyn_wqrelease(kwq, NULL, 0, (KSYN_WQTYPE_INWAIT|KSYN_WQTYPE_RWLOCK)); #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWLRDLOCK | DBG_FUNC_END, (uint32_t)rwlock, 0, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWLRDLOCK | DBG_FUNC_END, (uint32_t)rwlock, 
0, returnbits, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } @@ -1406,97 +1805,122 @@ psynch_rw_wrlock(__unused proc_t p, struct psynch_rw_wrlock_args * uap, uint32_t ksyn_wait_queue_t kwq; int error=0; uthread_t uth; - uint32_t lockseq = 0, updatebits = 0, preseq = 0; + uint32_t lockseq = 0, updatebits = 0, preseq = 0, prerw_wc = 0; + int isinit = lgen & PTHRW_RWL_INIT; + uint32_t returnbits = 0; + ksyn_waitq_element_t kwe; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWWRLOCK | DBG_FUNC_START, (uint32_t)rwlock, lgen, ugen, rw_wc, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWWRLOCK | DBG_FUNC_START, (uint32_t)rwlock, lgen, ugen, rw_wc, 0); #endif /* _PSYNCH_TRACE_ */ uth = current_uthread(); - - uth->uu_lockseq = lgen; + kwe = &uth->uu_kwe; + kwe->kwe_lockseq = lgen; + kwe->kwe_uth = uth; + kwe->kwe_psynchretval = 0; + kwe->kwe_kwqqueue = NULL; lockseq = (lgen & PTHRW_COUNT_MASK); error = ksyn_wqfind(rwlock, lgen, ugen, rw_wc, TID_ZERO, flags, (KSYN_WQTYPE_INWAIT|KSYN_WQTYPE_RWLOCK), &kwq); if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWWRLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWWRLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } ksyn_wqlock(kwq); + + if (isinit != 0) { + lgen &= ~PTHRW_RWL_INIT; + if ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) == 0) { + /* first to notice the reset of the lock, clear preposts */ + CLEAR_REINIT_BITS(kwq); + kwq->kw_kflags |= KSYN_KWF_INITCLEARED; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSEQ | DBG_FUNC_NONE, lgen, ugen, rw_wc, 1, 0); +#endif /* _PSYNCH_TRACE_ */ + } + } + + /* handle first the missed wakeups */ if ((kwq->kw_pre_intrcount != 0) && (kwq->kw_pre_intrtype == PTH_RW_TYPE_WRITE) && (is_seqlower_eq(lockseq, (kwq->kw_pre_intrseq & PTHRW_COUNT_MASK)) != 0)) { kwq->kw_pre_intrcount--; - uth->uu_psynchretval = kwq->kw_pre_intrretbits; + 
kwe->kwe_psynchretval = kwq->kw_pre_intrretbits; if (kwq->kw_pre_intrcount==0) CLEAR_INTR_PREPOST_BITS(kwq); ksyn_wqunlock(kwq); goto out; } - /* handle unlock2/downgrade first */ - if ((kwq->kw_pre_limrd != 0) && (is_seqlower_eq(lockseq, (kwq->kw_pre_limrdseq & PTHRW_COUNT_MASK)) != 0)) { -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWWRLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 1, kwq->kw_pre_limrd, kwq->kw_pre_limrdseq, 0); -#endif /* _PSYNCH_TRACE_ */ - kwq->kw_pre_limrd--; - if (kwq->kw_pre_limrd == 0) - CLEAR_READ_PREPOST_BITS(kwq); - /* not a read proceed */ - } if ((kwq->kw_pre_rwwc != 0) && (is_seqlower_eq(lockseq, (kwq->kw_pre_lockseq & PTHRW_COUNT_MASK)) != 0)) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWWRLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 2, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWWRLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 2, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); #endif /* _PSYNCH_TRACE_ */ kwq->kw_pre_rwwc--; if (kwq->kw_pre_rwwc == 0) { preseq = kwq->kw_pre_lockseq; + prerw_wc = kwq->kw_pre_sseq; CLEAR_PREPOST_BITS(kwq); - error = kwq_handle_unlock(kwq, preseq, &updatebits, (KW_UNLOCK_PREPOST_WRLOCK|KW_UNLOCK_PREPOST), &block, lgen); + if ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) != 0){ + kwq->kw_kflags &= ~KSYN_KWF_INITCLEARED; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSEQ | DBG_FUNC_NONE, lgen, ugen, rw_wc, 0, 0); +#endif /* _PSYNCH_TRACE_ */ + } + error = kwq_handle_unlock(kwq, preseq, prerw_wc, &updatebits, (KW_UNLOCK_PREPOST_WRLOCK|KW_UNLOCK_PREPOST), &block, lgen); +#if __TESTPANICS__ if (error != 0) - panic("kwq_handle_unlock failed %d\n",error); + panic("rw_wrlock: kwq_handle_unlock failed %d\n",error); +#endif /* __TESTPANICS__ */ if (block == 0) { ksyn_wqunlock(kwq); - goto out; + *retval = updatebits; + goto out1; } /* insert to q and proceed as ususal */ } } + /* No overlap watch needed go ahead and block */ + #if _PSYNCH_TRACE_ - 
KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWWRLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 3, 0, 0, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWWRLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 3, 0, 0, 0); #endif /* _PSYNCH_TRACE_ */ - error = ksyn_queue_insert(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], lgen, uth, SEQFIT); + error = ksyn_queue_insert(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], lgen, uth, kwe, SEQFIT); +#if __TESTPANICS__ if (error != 0) panic("psynch_rw_wrlock: failed to enqueue\n"); +#endif /* __TESTPANICS__ */ - error = ksyn_block_thread_locked(kwq, (uint64_t)0, uth); + error = ksyn_block_thread_locked(kwq, (uint64_t)0, kwe, 0); /* drops the wq lock */ out: if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWWRLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 4, error, 0, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWWRLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 4, error, 0, 0); #endif /* _PSYNCH_TRACE_ */ ksyn_wqlock(kwq); - if (uth->uu_kwqqueue != NULL) - ksyn_queue_removeitem(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], uth); + if (kwe->kwe_kwqqueue != NULL) + ksyn_queue_removeitem(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], kwe); ksyn_wqunlock(kwq); } else { /* update bits */ - *retval = uth->uu_psynchretval; + *retval = kwe->kwe_psynchretval; + returnbits = kwe->kwe_psynchretval; } - - ksyn_wqrelease(kwq, NULL); +out1: + ksyn_wqrelease(kwq, NULL, 0, (KSYN_WQTYPE_INWAIT|KSYN_WQTYPE_RWLOCK)); #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWWRLOCK | DBG_FUNC_END, (uint32_t)rwlock, 0, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWWRLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, returnbits, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } @@ -1505,7 +1929,7 @@ psynch_rw_wrlock(__unused proc_t p, struct psynch_rw_wrlock_args * uap, uint32_t * psynch_rw_yieldwrlock: This system call is used for psync rwlock yielding writers to block. 
*/ int -psynch_rw_yieldwrlock(__unused proc_t p, struct psynch_rw_yieldwrlock_args * uap, uint32_t * retval) +psynch_rw_yieldwrlock(__unused proc_t p, __unused struct psynch_rw_yieldwrlock_args * uap, __unused uint32_t * retval) { user_addr_t rwlock = uap->rwlock; uint32_t lgen = uap->lgenval; @@ -1516,65 +1940,82 @@ psynch_rw_yieldwrlock(__unused proc_t p, struct psynch_rw_yieldwrlock_args * ua int block; ksyn_wait_queue_t kwq; int error=0; + int isinit = lgen & PTHRW_RWL_INIT; uthread_t uth; + uint32_t returnbits=0; + ksyn_waitq_element_t kwe; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWYWRLOCK | DBG_FUNC_START, (uint32_t)rwlock, lgen, ugen, rw_wc, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWYWRLOCK | DBG_FUNC_START, (uint32_t)rwlock, lgen, ugen, rw_wc, 0); #endif /* _PSYNCH_TRACE_ */ - uint32_t lockseq = 0, updatebits = 0, preseq = 0; + uint32_t lockseq = 0, updatebits = 0, preseq = 0, prerw_wc = 0; uth = current_uthread(); - - uth->uu_lockseq = lgen; + kwe = &uth->uu_kwe; + kwe->kwe_lockseq = lgen; + kwe->kwe_uth = uth; + kwe->kwe_psynchretval = 0; + kwe->kwe_kwqqueue = NULL; lockseq = (lgen & PTHRW_COUNT_MASK); error = ksyn_wqfind(rwlock, lgen, ugen, rw_wc, TID_ZERO, flags, (KSYN_WQTYPE_INWAIT|KSYN_WQTYPE_RWLOCK), &kwq); if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWYWRLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWYWRLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } ksyn_wqlock(kwq); + if (isinit != 0) { + lgen &= ~PTHRW_RWL_INIT; + if ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) == 0) { + /* first to notice the reset of the lock, clear preposts */ + CLEAR_REINIT_BITS(kwq); + kwq->kw_kflags |= KSYN_KWF_INITCLEARED; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSEQ | DBG_FUNC_NONE, lgen, ugen, rw_wc, 1, 0); +#endif /* _PSYNCH_TRACE_ */ + } + } + /* handle first the missed wakeups */ if 
((kwq->kw_pre_intrcount != 0) && (kwq->kw_pre_intrtype == PTH_RW_TYPE_YWRITE) && (is_seqlower_eq(lockseq, (kwq->kw_pre_intrseq & PTHRW_COUNT_MASK)) != 0)) { kwq->kw_pre_intrcount--; - uth->uu_psynchretval = kwq->kw_pre_intrretbits; + kwe->kwe_psynchretval = kwq->kw_pre_intrretbits; if (kwq->kw_pre_intrcount==0) CLEAR_INTR_PREPOST_BITS(kwq); ksyn_wqunlock(kwq); goto out; } - /* handle unlock2/downgrade first */ - if ((kwq->kw_pre_limrd != 0) && (is_seqlower_eq(lockseq, (kwq->kw_pre_limrdseq & PTHRW_COUNT_MASK)) != 0)) { -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWYWRLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 1, kwq->kw_pre_limrd, kwq->kw_pre_limrdseq, 0); -#endif /* _PSYNCH_TRACE_ */ - kwq->kw_pre_limrd--; - if (kwq->kw_pre_limrd == 0) - CLEAR_READ_PREPOST_BITS(kwq); - /* not a read proceed */ - } - if ((kwq->kw_pre_rwwc != 0) && (is_seqlower_eq(lockseq, (kwq->kw_pre_lockseq & PTHRW_COUNT_MASK)) != 0)) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWYWRLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 2, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWYWRLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 2, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); #endif /* _PSYNCH_TRACE_ */ kwq->kw_pre_rwwc--; if (kwq->kw_pre_rwwc == 0) { preseq = kwq->kw_pre_lockseq; + prerw_wc = kwq->kw_pre_sseq; CLEAR_PREPOST_BITS(kwq); - error = kwq_handle_unlock(kwq, preseq, &updatebits, (KW_UNLOCK_PREPOST_YWRLOCK|KW_UNLOCK_PREPOST), &block, lgen); + if ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) != 0){ + kwq->kw_kflags &= ~KSYN_KWF_INITCLEARED; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSEQ | DBG_FUNC_NONE, lgen, ugen, rw_wc, 0, 0); +#endif /* _PSYNCH_TRACE_ */ + } + error = kwq_handle_unlock(kwq, preseq, prerw_wc, &updatebits, (KW_UNLOCK_PREPOST_YWRLOCK|KW_UNLOCK_PREPOST), &block, lgen); +#if __TESTPANICS__ if (error != 0) panic("kwq_handle_unlock failed %d\n",error); +#endif /* __TESTPANICS__ */ if (block == 0) { ksyn_wqunlock(kwq); + 
*retval = updatebits; goto out; } /* insert to q and proceed as ususal */ @@ -1582,37 +2023,40 @@ psynch_rw_yieldwrlock(__unused proc_t p, struct psynch_rw_yieldwrlock_args * ua } #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWYWRLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 3, 0, 0, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWYWRLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 3, 0, 0, 0); #endif /* _PSYNCH_TRACE_ */ - error = ksyn_queue_insert(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_YWRITER], lgen, uth, SEQFIT); + error = ksyn_queue_insert(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_YWRITER], lgen, uth, kwe, SEQFIT); +#if __TESTPANICS__ if (error != 0) panic("psynch_rw_yieldwrlock: failed to enqueue\n"); +#endif /* __TESTPANICS__ */ - error = ksyn_block_thread_locked(kwq, (uint64_t)0, uth); + error = ksyn_block_thread_locked(kwq, (uint64_t)0, kwe, 0); out: if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWYWRLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 4, error, 0, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWYWRLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 4, error, 0, 0); #endif /* _PSYNCH_TRACE_ */ ksyn_wqlock(kwq); - if (uth->uu_kwqqueue != NULL) - ksyn_queue_removeitem(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_YWRITER], uth); + if (kwe->kwe_kwqqueue != NULL) + ksyn_queue_removeitem(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_YWRITER], kwe); ksyn_wqunlock(kwq); } else { /* update bits */ - *retval = uth->uu_psynchretval; + *retval = kwe->kwe_psynchretval; + returnbits = kwe->kwe_psynchretval; } - ksyn_wqrelease(kwq, NULL); + ksyn_wqrelease(kwq, NULL, 0, (KSYN_WQTYPE_INWAIT | KSYN_WQTYPE_RWLOCK)); #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWYWRLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWYWRLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, returnbits, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } - +#if NOTYET /* * psynch_rw_downgrade: This system call is used for wakeup blocked readers who are eligible to 
run due to downgrade. */ @@ -1626,72 +2070,93 @@ psynch_rw_downgrade(__unused proc_t p, struct psynch_rw_downgrade_args * uap, __ //uint64_t tid = uap->tid; int flags = uap->flags; uint32_t count = 0; - + int isinit = lgen & PTHRW_RWL_INIT; ksyn_wait_queue_t kwq; int error=0; uthread_t uth; uint32_t curgen = 0; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWDOWNGRADE | DBG_FUNC_START, (uint32_t)rwlock, lgen, ugen, rw_wc, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWDOWNGRADE | DBG_FUNC_START, (uint32_t)rwlock, lgen, ugen, rw_wc, 0); #endif /* _PSYNCH_TRACE_ */ uth = current_uthread(); curgen = (lgen & PTHRW_COUNT_MASK); - error = ksyn_wqfind(rwlock, lgen, ugen, rw_wc, TID_ZERO, flags, (KSYN_WQTYPE_INWAIT|KSYN_WQTYPE_RWLOCK), &kwq); + error = ksyn_wqfind(rwlock, lgen, ugen, rw_wc, TID_ZERO, flags, (KSYN_WQTYPE_INDROP | KSYN_WQTYPE_RWLOCK), &kwq); if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWDOWNGRADE | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWDOWNGRADE | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } ksyn_wqlock(kwq); - if (is_seqlower(ugen, kwq->kw_lastunlockseq)!= 0) { + if ((lgen & PTHRW_RWL_INIT) != 0) { + lgen &= ~PTHRW_RWL_INIT; + if ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) == 0){ + CLEAR_REINIT_BITS(kwq); + kwq->kw_kflags |= KSYN_KWF_INITCLEARED; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSEQ | DBG_FUNC_NONE, lgen, ugen, rw_wc, 1, 0); +#endif /* _PSYNCH_TRACE_ */ + } + isinit = 1; + } + + /* if lastunlock seq is set, ensure the current one is not lower than that, as it would be spurious */ + if ((kwq->kw_lastunlockseq != PTHRW_RWL_INIT) && (is_seqlower(ugen, kwq->kw_lastunlockseq)!= 0)) { /* spurious updatebits?? */ + error = 0; goto out; } - /* fast path for default case */ - if((rw_wc == kwq->kw_inqueue) && (kwq->kw_highseq == curgen)) - goto dounlock; - /* have we seen all the waiters? 
*/ - if(rw_wc > kwq->kw_inqueue) { - goto prepost; + + + /* If L-U != num of waiters, then it needs to be preposted or spr */ + diff = find_diff(lgen, ugen); + /* take count of the downgrade thread itself */ + diff--; + + +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 1, kwq->kw_inqueue, curgen, 0); +#endif /* _PSYNCH_TRACE_ */ + if (find_seq_till(kwq, curgen, diff, &count) == 0) { + if (count < (uint32_t)diff) + goto prepost; } - - if (is_seqhigher(curgen, kwq->kw_highseq) != 0) { - goto prepost; - } else { - if (find_seq_till(kwq, curgen, rw_wc, &count) == 0) { - if (count < rw_wc) { - kwq->kw_pre_limrd = rw_wc - count; - kwq->kw_pre_limrdseq = lgen; - kwq->kw_pre_limrdbits = lgen; - /* found none ? */ - if (count == 0) - goto out; - } - } + + /* no prepost and all threads are in place, reset the bit */ + if ((isinit != 0) && ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) != 0)){ + kwq->kw_kflags &= ~KSYN_KWF_INITCLEARED; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSEQ | DBG_FUNC_NONE, lgen, ugen, rw_wc, 0, 0); +#endif /* _PSYNCH_TRACE_ */ } + + /* can handle unlock now */ + CLEAR_PREPOST_BITS(kwq); + dounlock: #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWDOWNGRADE | DBG_FUNC_NONE, (uint32_t)rwlock, 3, 0, 0, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWDOWNGRADE | DBG_FUNC_NONE, (uint32_t)rwlock, 3, 0, 0, 0); #endif /* _PSYNCH_TRACE_ */ error = kwq_handle_downgrade(kwq, lgen, 0, 0, NULL); +#if __TESTPANICS__ if (error != 0) panic("psynch_rw_downgrade: failed to wakeup\n"); +#endif /* __TESTPANICS__ */ out: ksyn_wqunlock(kwq); #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWDOWNGRADE | DBG_FUNC_END, (uint32_t)rwlock, 0, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWDOWNGRADE | DBG_FUNC_END, (uint32_t)rwlock, 0, 0, error, 0); #endif /* _PSYNCH_TRACE_ */ - ksyn_wqrelease(kwq, NULL); + ksyn_wqrelease(kwq, NULL, 0, (KSYN_WQTYPE_INDROP | KSYN_WQTYPE_RWLOCK)); return(error); 
@@ -1699,7 +2164,7 @@ psynch_rw_downgrade(__unused proc_t p, struct psynch_rw_downgrade_args * uap, __ kwq->kw_pre_rwwc = (rw_wc - count); kwq->kw_pre_lockseq = lgen; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWDOWNGRADE | DBG_FUNC_NONE, (uint32_t)rwlock, 1, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWDOWNGRADE | DBG_FUNC_NONE, (uint32_t)rwlock, 1, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); #endif /* _PSYNCH_TRACE_ */ error = 0; goto out; @@ -1723,32 +2188,49 @@ psynch_rw_upgrade(__unused proc_t p, struct psynch_rw_upgrade_args * uap, uint32 int error=0; uthread_t uth; uint32_t lockseq = 0, updatebits = 0, preseq = 0; + int isinit = lgen & PTHRW_RWL_INIT; + ksyn_waitq_element_t kwe; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUPGRADE | DBG_FUNC_START, (uint32_t)rwlock, lgen, ugen, rw_wc, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWUPGRADE | DBG_FUNC_START, (uint32_t)rwlock, lgen, ugen, rw_wc, 0); #endif /* _PSYNCH_TRACE_ */ uth = current_uthread(); - - uth->uu_lockseq = lgen; + kwe = &uth->uu_kwe; + kwe->kwe_lockseq = lgen; + kwe->kwe_uth = uth; + kwe->kwe_psynchretval = 0; + kwe->kwe_kwqqueue = NULL; lockseq = (lgen & PTHRW_COUNT_MASK); - - error = ksyn_wqfind(rwlock, lgen, ugen, rw_wc, TID_ZERO, flags, (KSYN_WQTYPE_INWAIT|KSYN_WQTYPE_RWLOCK), &kwq); + + error = ksyn_wqfind(rwlock, lgen, ugen, rw_wc, TID_ZERO, flags, (KSYN_WQTYPE_INWAIT | KSYN_WQTYPE_RWLOCK), &kwq); if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUPGRADE | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWUPGRADE | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } ksyn_wqlock(kwq); - + + if (isinit != 0) { + lgen &= ~PTHRW_RWL_INIT; + if ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) == 0) { + /* first to notice the reset of the lock, clear preposts */ + CLEAR_REINIT_BITS(kwq); + kwq->kw_kflags |= KSYN_KWF_INITCLEARED; +#if 
_PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSEQ | DBG_FUNC_NONE, lgen, ugen, rw_wc, 1, 0); +#endif /* _PSYNCH_TRACE_ */ + } + } + /* handle first the missed wakeups */ if ((kwq->kw_pre_intrcount != 0) && - (kwq->kw_pre_intrtype == PTH_RW_TYPE_UPGRADE) && + ((kwq->kw_pre_intrtype == PTH_RW_TYPE_READ) || (kwq->kw_pre_intrtype == PTH_RW_TYPE_LREAD)) && (is_seqlower_eq(lockseq, (kwq->kw_pre_intrseq & PTHRW_COUNT_MASK)) != 0)) { kwq->kw_pre_intrcount--; - uth->uu_psynchretval = kwq->kw_pre_intrretbits; + kwe->kwe_psynchretval = kwq->kw_pre_intrretbits; if (kwq->kw_pre_intrcount==0) CLEAR_INTR_PREPOST_BITS(kwq); ksyn_wqunlock(kwq); @@ -1757,15 +2239,24 @@ psynch_rw_upgrade(__unused proc_t p, struct psynch_rw_upgrade_args * uap, uint32 if ((kwq->kw_pre_rwwc != 0) && (is_seqlower_eq(lockseq, (kwq->kw_pre_lockseq & PTHRW_COUNT_MASK)) != 0)) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUPGRADE | DBG_FUNC_NONE, (uint32_t)rwlock, 2, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 2, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); #endif /* _PSYNCH_TRACE_ */ kwq->kw_pre_rwwc--; if (kwq->kw_pre_rwwc == 0) { preseq = kwq->kw_pre_lockseq; + prerw_wc = kwq->kw_pre_sseq; CLEAR_PREPOST_BITS(kwq); - error = kwq_handle_unlock(kwq, preseq, &updatebits, (KW_UNLOCK_PREPOST_UPGRADE|KW_UNLOCK_PREPOST), &block, lgen); + if ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) != 0){ + kwq->kw_kflags &= ~KSYN_KWF_INITCLEARED; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSEQ | DBG_FUNC_NONE, lgen, ugen, rw_wc, 0, 0); +#endif /* _PSYNCH_TRACE_ */ + } + error = kwq_handle_unlock(kwq, preseq, prerw_wc, &updatebits, (KW_UNLOCK_PREPOST_UPGRADE|KW_UNLOCK_PREPOST), &block, lgen); +#if __TESTPANICS__ if (error != 0) - panic("kwq_handle_unlock failed %d\n",error); + panic("rw_rdlock: kwq_handle_unlock failed %d\n",error); +#endif /* __TESTPANICS__ */ if (block == 0) { ksyn_wqunlock(kwq); goto out; @@ 
-1776,37 +2267,52 @@ psynch_rw_upgrade(__unused proc_t p, struct psynch_rw_upgrade_args * uap, uint32 #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUPGRADE | DBG_FUNC_NONE, (uint32_t)rwlock, 3, 0, 0, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWUPGRADE | DBG_FUNC_NONE, (uint32_t)rwlock, 3, 0, 0, 0); #endif /* _PSYNCH_TRACE_ */ - error = ksyn_queue_insert(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_UPGRADE], lgen, uth, SEQFIT); + error = ksyn_queue_insert(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_UPGRADE], lgen, uth, kwe, SEQFIT); +#if __TESTPANICS__ if (error != 0) panic("psynch_rw_upgrade: failed to enqueue\n"); +#endif /* __TESTPANICS__ */ - error = ksyn_block_thread_locked(kwq, (uint64_t)0, uth); + error = ksyn_block_thread_locked(kwq, (uint64_t)0, kwe, 0); /* drops the lock */ out: if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUPGRADE | DBG_FUNC_NONE, (uint32_t)rwlock, 4, error, 0, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWUPGRADE | DBG_FUNC_NONE, (uint32_t)rwlock, 4, error, 0, 0); #endif /* _PSYNCH_TRACE_ */ ksyn_wqlock(kwq); - if (uth->uu_kwqqueue != NULL) - ksyn_queue_removeitem(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_UPGRADE], uth); + if (kwe->kwe_kwqqueue != NULL) + ksyn_queue_removeitem(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_UPGRADE], kwe); ksyn_wqunlock(kwq); } else { /* update bits */ - *retval = uth->uu_psynchretval; + *retval = kwe->kwe_psynchretval; } - ksyn_wqrelease(kwq, NULL); + ksyn_wqrelease(kwq, NULL, 0, (KSYN_WQTYPE_INWAIT | KSYN_WQTYPE_RWLOCK)); #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUPGRADE | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWUPGRADE | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); #endif /* _PSYNCH_TRACE_ */ + return(error); } +#else /* NOTYET */ +int +psynch_rw_upgrade(__unused proc_t p, __unused struct psynch_rw_upgrade_args * uap, __unused uint32_t * retval) +{ + return(0); +} +int +psynch_rw_downgrade(__unused proc_t p, __unused struct 
psynch_rw_downgrade_args * uap, __unused int * retval) +{ + return(0); +} +#endif /* NOTYET */ /* * psynch_rw_unlock: This system call is used for unlock state postings. This will grant appropriate * reader/writer variety lock. @@ -1825,19 +2331,20 @@ psynch_rw_unlock(__unused proc_t p, struct psynch_rw_unlock_args * uap, uint32_ uthread_t uth; ksyn_wait_queue_t kwq; uint32_t updatebits = 0; - int error=0; + int error=0, diff; uint32_t count = 0; + int isinit = 0; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_START, (uint32_t)rwlock, lgen, ugen, rw_wc, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_START, (uint32_t)rwlock, lgen, ugen, rw_wc, 0); #endif /* _PSYNCH_TRACE_ */ uth = current_uthread(); - error = ksyn_wqfind(rwlock, lgen, ugen, rw_wc, TID_ZERO, flags, (KSYN_WQTYPE_RWLOCK), &kwq); + error = ksyn_wqfind(rwlock, lgen, ugen, rw_wc, TID_ZERO, flags, (KSYN_WQTYPE_INDROP | KSYN_WQTYPE_RWLOCK), &kwq); if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } @@ -1846,59 +2353,87 @@ psynch_rw_unlock(__unused proc_t p, struct psynch_rw_unlock_args * uap, uint32_ ksyn_wqlock(kwq); - if ((lgen & PTHRW_RW_INIT) != 0) { - kwq->kw_lastunlockseq = 0; - lgen &= ~PTHRW_RW_INIT; - } else if (is_seqlower(ugen, kwq->kw_lastunlockseq) != 0) { - /* spurious updatebits set */ - updatebits = PTHRW_RW_SPURIOUS; + if ((lgen & PTHRW_RWL_INIT) != 0) { + lgen &= ~PTHRW_RWL_INIT; + if ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) == 0){ + CLEAR_REINIT_BITS(kwq); + kwq->kw_kflags |= KSYN_KWF_INITCLEARED; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSEQ | DBG_FUNC_NONE, lgen, ugen, rw_wc, 1, 0); +#endif /* _PSYNCH_TRACE_ */ + } + isinit = 1; + } + + /* if lastunlock seq is set, ensure the current one is not lower 
than that, as it would be spurious */ + if ((kwq->kw_lastunlockseq != PTHRW_RWL_INIT) && (is_seqlower(ugen, kwq->kw_lastunlockseq)!= 0)) { +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, (uint32_t)0xeeeeeeee, rw_wc, kwq->kw_lastunlockseq, 0); +#endif /* _PSYNCH_TRACE_ */ + error = 0; goto out; } + /* If L-U != num of waiters, then it needs to be preposted or spr */ + diff = find_diff(lgen, ugen); #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 1, kwq->kw_inqueue, curgen, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 1, kwq->kw_inqueue, curgen, 0); #endif /* _PSYNCH_TRACE_ */ - if (find_seq_till(kwq, curgen, rw_wc, &count) == 0) { - if (count < rw_wc) + if (find_seq_till(kwq, curgen, diff, &count) == 0) { + if ((count == 0) || (count < (uint32_t)diff)) goto prepost; } + /* no prepost and all threads are in place, reset the bit */ + if ((isinit != 0) && ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) != 0)){ + kwq->kw_kflags &= ~KSYN_KWF_INITCLEARED; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSEQ | DBG_FUNC_NONE, lgen, ugen, rw_wc, 0, 0); +#endif /* _PSYNCH_TRACE_ */ + } /* can handle unlock now */ CLEAR_PREPOST_BITS(kwq); - kwq->kw_lastunlockseq = ugen; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 2, 0, 0, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 2, 0, 0, 0); #endif /* _PSYNCH_TRACE_ */ - error = kwq_handle_unlock(kwq, lgen, &updatebits, 0, NULL, 0); + error = kwq_handle_unlock(kwq, lgen, rw_wc, &updatebits, 0, NULL, 0); +#if __TESTPANICS__ if (error != 0) panic("psynch_rw_unlock: kwq_handle_unlock failed %d\n",error); +#endif /* __TESTPANICS__ */ out: if (error == 0) { /* update bits?? 
*/ *retval = updatebits; } + + ksyn_wqunlock(kwq); - ksyn_wqrelease(kwq, NULL); + ksyn_wqrelease(kwq, NULL, 0, (KSYN_WQTYPE_INDROP | KSYN_WQTYPE_RWLOCK)); #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_END, (uint32_t)rwlock, 0, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_END, (uint32_t)rwlock, 0, updatebits, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); prepost: - kwq->kw_pre_rwwc = (rw_wc - count); - kwq->kw_pre_lockseq = curgen; - kwq->kw_lastunlockseq = ugen; + /* update if the new seq is higher than prev prepost, or first set */ + if ((is_rws_setseq(kwq->kw_pre_sseq) != 0) || + (is_seqhigher_eq((rw_wc & PTHRW_COUNT_MASK), (kwq->kw_pre_sseq & PTHRW_COUNT_MASK)) != 0)) { + kwq->kw_pre_rwwc = (diff - count); + kwq->kw_pre_lockseq = curgen; + kwq->kw_pre_sseq = rw_wc; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 3, rw_wc, count, 0); - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 4, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 3, rw_wc, count, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 4, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); #endif /* _PSYNCH_TRACE_ */ - updatebits = (lgen | PTHRW_RW_SPURIOUS);/* let this not do unlock handling */ + updatebits = lgen; /* let this not do unlock handling */ + } error = 0; goto out; } @@ -1909,86 +2444,9 @@ psynch_rw_unlock(__unused proc_t p, struct psynch_rw_unlock_args * uap, uint32_ * to new reader arrival races */ int -psynch_rw_unlock2(__unused proc_t p, struct psynch_rw_unlock2_args * uap, uint32_t * retval) +psynch_rw_unlock2(__unused proc_t p, __unused struct psynch_rw_unlock2_args * uap, __unused uint32_t * retval) { - user_addr_t rwlock = uap->rwlock; - uint32_t lgen = uap->lgenval; - uint32_t ugen = uap->ugenval; - uint32_t rw_wc = uap->rw_wc; 
- //uint64_t tid = uap->tid; - int flags = uap->flags; - uthread_t uth; - uint32_t num_lreader, limitread, curgen, updatebits; - ksyn_wait_queue_t kwq; - int error=0, longreadset = 0; - int diff; - uint32_t count=0; - -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUNLOCK2 | DBG_FUNC_START, (uint32_t)rwlock, lgen, ugen, rw_wc, 0); -#endif /* _PSYNCH_TRACE_ */ - uth = current_uthread(); - - error = ksyn_wqfind(rwlock, lgen, ugen, rw_wc, TID_ZERO, flags, (KSYN_WQTYPE_RWLOCK), &kwq); - if (error != 0) { -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUNLOCK2 | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); -#endif /* _PSYNCH_TRACE_ */ - return(error); - } - - ksyn_wqlock(kwq); - - curgen = (lgen & PTHRW_COUNT_MASK); - diff = find_diff(lgen, ugen); - - limitread = lgen & PTHRW_COUNT_MASK; - - if (find_seq_till(kwq, curgen, diff, &count) == 0) { - kwq->kw_pre_limrd = diff - count; - kwq->kw_pre_limrdseq = lgen; - kwq->kw_pre_limrdbits = lgen; - /* found none ? */ - if (count == 0) - goto out; - } - - if (kwq->kw_ksynqueues[KSYN_QUEUE_LREAD].ksynq_count != 0) { - num_lreader = kwq->kw_ksynqueues[KSYN_QUEUE_LREAD].ksynq_firstnum; - if (is_seqlower_eq(num_lreader, limitread) != 0) - longreadset = 1; - } - - updatebits = lgen; -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUNLOCK2 | DBG_FUNC_NONE, (uint32_t)rwlock, 3, 0, 0, 0); -#endif /* _PSYNCH_TRACE_ */ - count = ksyn_wakeupreaders(kwq, limitread, longreadset, 0, updatebits, NULL); - - if (count != 0) { - if (kwq->kw_pre_limrd != 0) { - kwq->kw_pre_limrd += count; - } else { - kwq->kw_pre_limrd = count; - kwq->kw_pre_limrdseq = lgen; - kwq->kw_pre_limrdbits = lgen; - } - } - error = 0; - -out: - if (error == 0) { - /* update bits?? 
*/ - *retval = uth->uu_psynchretval; - } - ksyn_wqunlock(kwq); - - ksyn_wqrelease(kwq, NULL); -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUNLOCK2 | DBG_FUNC_END, (uint32_t)rwlock, 0, 0, error, 0); -#endif /* _PSYNCH_TRACE_ */ - - return(error); + return(ENOTSUP); } @@ -1996,7 +2454,31 @@ psynch_rw_unlock2(__unused proc_t p, struct psynch_rw_unlock2_args * uap, uint3 void pth_global_hashinit() { + int arg; + pth_glob_hashtbl = hashinit(PTH_HASHSIZE * 4, M_PROC, &pthhash); + + /* + * pthtest={0,1,2,3} (override default aborting behavior on pthread sync failures) + * 0 - just return errors + * 1 - print and return errors + * 2 - abort user, print and return errors + * 3 - panic + */ + if (!PE_parse_boot_argn("pthtest", &arg, sizeof(arg))) + arg = __TESTMODE__; + + if (arg == 3) { + __test_panics__ = 1; + printf("Pthread support PANICS when sync kernel primitives misused\n"); + } else if (arg == 2) { + __test_aborts__ = 1; + __test_prints__ = 1; + printf("Pthread support ABORTS when sync kernel primitives misused\n"); + } else if (arg == 1) { + __test_prints__ = 1; + printf("Pthread support LOGS when sync kernel primitives misused\n"); + } } void @@ -2046,6 +2528,10 @@ pth_proc_hashdelete(proc_t p) int hashsize = pthhash + 1; int i; +#if _PSYNCH_TRACE_ + if ((pthread_debug_proc != NULL) && (p == pthread_debug_proc)) + pthread_debug_proc = PROC_NULL; +#endif /* _PSYNCH_TRACE_ */ hashptr = p->p_pthhash; if (hashptr == NULL) return; @@ -2060,16 +2546,39 @@ pth_proc_hashdelete(proc_t p) if ((kwq->kw_pflags & KSYN_WQ_FLIST) != 0) { kwq->kw_pflags &= ~KSYN_WQ_FLIST; LIST_REMOVE(kwq, kw_list); + num_infreekwq--; } + num_freekwq++; pthread_list_unlock(); + /* release fake entries if present for cvars */ + if (((kwq->kw_type & KSYN_WQTYPE_MASK) == KSYN_WQTYPE_CVAR) && (kwq->kw_inqueue != 0)) + ksyn_freeallkwe(&kwq->kw_ksynqueues[KSYN_QUEUE_WRITER]); lck_mtx_destroy(&kwq->kw_lock, pthread_lck_grp); - kfree(kwq, sizeof(struct ksyn_wait_queue)); + zfree(kwq_zone, 
kwq); } } FREE(p->p_pthhash, M_PROC); p->p_pthhash = NULL; } +/* no lock held for this as the waitqueue is getting freed */ +void +ksyn_freeallkwe(ksyn_queue_t kq) +{ + ksyn_waitq_element_t kwe; + + /* free all the fake entries, dequeue rest */ + kwe = TAILQ_FIRST(&kq->ksynq_kwelist); + while (kwe != NULL) { + if (kwe->kwe_flags != KWE_THREAD_INWAIT) { + TAILQ_REMOVE(&kq->ksynq_kwelist, kwe, kwe_list); + zfree(kwe_zone, kwe); + } else { + TAILQ_REMOVE(&kq->ksynq_kwelist, kwe, kwe_list); + } + kwe = TAILQ_FIRST(&kq->ksynq_kwelist); + } +} /* find kernel waitqueue, if not present create one. Grants a reference */ int @@ -2081,7 +2590,8 @@ ksyn_wqfind(user_addr_t mutex, uint32_t mgen, uint32_t ugen, uint32_t rw_wc, uin uint64_t object = 0, offset = 0; uint64_t hashhint; proc_t p = current_proc(); - int retry = mgen & PTHRW_RETRYBIT; + int retry = mgen & PTH_RWL_RETRYBIT; + struct ksyn_queue kfreeq; int i; if ((flags & PTHREAD_PSHARED_FLAGS_MASK) == PTHREAD_PROCESS_SHARED) @@ -2093,18 +2603,60 @@ ksyn_wqfind(user_addr_t mutex, uint32_t mgen, uint32_t ugen, uint32_t rw_wc, uin hashptr = p->p_pthhash; } + ksyn_queue_init(&kfreeq); + + if (((wqtype & KSYN_WQTYPE_MASK) == KSYN_WQTYPE_MTX) && (retry != 0)) + mgen &= ~PTH_RWL_RETRYBIT; + +loop: //pthread_list_lock_spin(); pthread_list_lock(); kwq = ksyn_wq_hash_lookup(mutex, p, flags, object, offset); if (kwq != NULL) { - kwq->kw_iocount++; if ((kwq->kw_pflags & KSYN_WQ_FLIST) != 0) { LIST_REMOVE(kwq, kw_list); kwq->kw_pflags &= ~KSYN_WQ_FLIST; + num_infreekwq--; + num_reusekwq++; + } + if ((kwq->kw_type & KSYN_WQTYPE_MASK) != (wqtype &KSYN_WQTYPE_MASK)) { + if ((kwq->kw_inqueue == 0) && (kwq->kw_pre_rwwc ==0) && (kwq->kw_pre_intrcount == 0)) { + if (kwq->kw_iocount == 0) { + kwq->kw_addr = mutex; + kwq->kw_flags = flags; + kwq->kw_object = object; + kwq->kw_offset = offset; + kwq->kw_type = (wqtype & KSYN_WQTYPE_MASK); + CLEAR_REINIT_BITS(kwq); + CLEAR_INTR_PREPOST_BITS(kwq); + CLEAR_PREPOST_BITS(kwq); + kwq->kw_lword = 
mgen; + kwq->kw_uword = ugen; + kwq->kw_sword = rw_wc; + kwq->kw_owner = tid; + } else if ((kwq->kw_iocount == 1) && (kwq->kw_dropcount == kwq->kw_iocount)) { + /* if all users are unlockers then wait for it to finish */ + kwq->kw_pflags |= KSYN_WQ_WAITING; + /* wait for the wq to be free */ + (void)msleep(&kwq->kw_pflags, pthread_list_mlock, PDROP, "ksyn_wqfind", 0); + /* does not have list lock */ + goto loop; + } else { + __FAILEDUSERTEST__("address already known to kernel for another (busy) synchronizer type\n"); + pthread_list_unlock(); + return EBUSY; + } + } else { + __FAILEDUSERTEST__("address already known to kernel for another (busy) synchronizer type(1)\n"); + pthread_list_unlock(); + return EBUSY; + } } - UPDATE_KWQ(kwq, mgen, ugen, rw_wc, tid, wqtype, retry); + kwq->kw_iocount++; + if (wqtype == KSYN_WQTYPE_MUTEXDROP) + kwq->kw_dropcount++; if (kwqp != NULL) *kwqp = kwq; pthread_list_unlock(); @@ -2113,23 +2665,34 @@ ksyn_wqfind(user_addr_t mutex, uint32_t mgen, uint32_t ugen, uint32_t rw_wc, uin pthread_list_unlock(); - nkwq = kalloc(sizeof(struct ksyn_wait_queue)); + nkwq = (ksyn_wait_queue_t)zalloc(kwq_zone); bzero(nkwq, sizeof(struct ksyn_wait_queue)); nkwq->kw_addr = mutex; nkwq->kw_flags = flags; nkwq->kw_iocount = 1; + if (wqtype == KSYN_WQTYPE_MUTEXDROP) + nkwq->kw_dropcount++; nkwq->kw_object = object; nkwq->kw_offset = offset; nkwq->kw_type = (wqtype & KSYN_WQTYPE_MASK); - TAILQ_INIT(&nkwq->kw_uthlist); + nkwq->kw_lastseqword = PTHRW_RWS_INIT; + if (nkwq->kw_type == KSYN_WQTYPE_RWLOCK) + nkwq->kw_nextseqword = PTHRW_RWS_INIT; + + nkwq->kw_pre_sseq = PTHRW_RWS_INIT; + + CLEAR_PREPOST_BITS(nkwq); + CLEAR_INTR_PREPOST_BITS(nkwq); + CLEAR_REINIT_BITS(nkwq); + nkwq->kw_lword = mgen; + nkwq->kw_uword = ugen; + nkwq->kw_sword = rw_wc; + nkwq->kw_owner = tid; + for (i=0; i< KSYN_QUEUE_MAX; i++) ksyn_queue_init(&nkwq->kw_ksynqueues[i]); - UPDATE_KWQ(nkwq, mgen, ugen, rw_wc, tid, wqtype, retry); -#if USE_WAITQUEUE - wait_queue_init(&nkwq->kw_wq, 
SYNC_POLICY_FIFO); -#endif /* USE_WAITQUEUE */ lck_mtx_init(&nkwq->kw_lock, pthread_lck_grp, pthread_lck_attr); //pthread_list_lock_spin(); @@ -2138,21 +2701,67 @@ ksyn_wqfind(user_addr_t mutex, uint32_t mgen, uint32_t ugen, uint32_t rw_wc, uin kwq = ksyn_wq_hash_lookup(mutex, p, flags, object, offset); if (kwq != NULL) { - kwq->kw_iocount++; if ((kwq->kw_pflags & KSYN_WQ_FLIST) != 0) { LIST_REMOVE(kwq, kw_list); kwq->kw_pflags &= ~KSYN_WQ_FLIST; + num_infreekwq--; + num_reusekwq++; + } + if ((kwq->kw_type & KSYN_WQTYPE_MASK) != (wqtype &KSYN_WQTYPE_MASK)) { + if ((kwq->kw_inqueue == 0) && (kwq->kw_pre_rwwc ==0) && (kwq->kw_pre_intrcount == 0)) { + if (kwq->kw_iocount == 0) { + kwq->kw_addr = mutex; + kwq->kw_flags = flags; + kwq->kw_object = object; + kwq->kw_offset = offset; + kwq->kw_type = (wqtype & KSYN_WQTYPE_MASK); + CLEAR_REINIT_BITS(kwq); + CLEAR_INTR_PREPOST_BITS(kwq); + CLEAR_PREPOST_BITS(kwq); + kwq->kw_lword = mgen; + kwq->kw_uword = ugen; + kwq->kw_sword = rw_wc; + kwq->kw_owner = tid; + } else if ((kwq->kw_iocount == 1) && (kwq->kw_dropcount == kwq->kw_iocount)) { + kwq->kw_pflags |= KSYN_WQ_WAITING; + /* wait for the wq to be free */ + (void)msleep(&kwq->kw_pflags, pthread_list_mlock, PDROP, "ksyn_wqfind", 0); + + lck_mtx_destroy(&nkwq->kw_lock, pthread_lck_grp); + zfree(kwq_zone, nkwq); + /* will acquire lock again */ + + goto loop; + } else { + __FAILEDUSERTEST__("address already known to kernel for another [busy] synchronizer type(2)\n"); + pthread_list_unlock(); + lck_mtx_destroy(&nkwq->kw_lock, pthread_lck_grp); + zfree(kwq_zone, nkwq); + return EBUSY; + } + } else { + __FAILEDUSERTEST__("address already known to kernel for another [busy] synchronizer type(3)\n"); + pthread_list_unlock(); + lck_mtx_destroy(&nkwq->kw_lock, pthread_lck_grp); + zfree(kwq_zone, nkwq); + return EBUSY; + } } - UPDATE_KWQ(kwq, mgen, ugen, rw_wc, tid, wqtype, retry); + kwq->kw_iocount++; + if (wqtype == KSYN_WQTYPE_MUTEXDROP) + kwq->kw_dropcount++; if (kwqp != NULL) 
*kwqp = kwq; pthread_list_unlock(); lck_mtx_destroy(&nkwq->kw_lock, pthread_lck_grp); - kfree(nkwq, sizeof(struct ksyn_wait_queue)); + zfree(kwq_zone, nkwq); return (0); } kwq = nkwq; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSEQ | DBG_FUNC_NONE, kwq->kw_lword, kwq->kw_uword, kwq->kw_sword, 0xffff, 0); +#endif /* _PSYNCH_TRACE_ */ if ((flags & PTHREAD_PSHARED_FLAGS_MASK) == PTHREAD_PROCESS_SHARED) { kwq->kw_pflags |= KSYN_WQ_SHARED; @@ -2161,6 +2770,7 @@ ksyn_wqfind(user_addr_t mutex, uint32_t mgen, uint32_t ugen, uint32_t rw_wc, uin LIST_INSERT_HEAD(&hashptr[mutex & pthhash], kwq, kw_hash); kwq->kw_pflags |= KSYN_WQ_INHASH; + num_total_kwq++; pthread_list_unlock(); @@ -2171,34 +2781,81 @@ ksyn_wqfind(user_addr_t mutex, uint32_t mgen, uint32_t ugen, uint32_t rw_wc, uin /* Reference from find is dropped here. Starts the free process if needed */ void -ksyn_wqrelease(ksyn_wait_queue_t kwq, ksyn_wait_queue_t ckwq) +ksyn_wqrelease(ksyn_wait_queue_t kwq, ksyn_wait_queue_t ckwq, int qfreenow, int wqtype) { uint64_t deadline; struct timeval t; int sched = 0; - + ksyn_wait_queue_t free_elem = NULL; + ksyn_wait_queue_t free_elem1 = NULL; //pthread_list_lock_spin(); pthread_list_lock(); kwq->kw_iocount--; + if (wqtype == KSYN_WQTYPE_MUTEXDROP) { + kwq->kw_dropcount--; + } if (kwq->kw_iocount == 0) { - if ((kwq->kw_pre_rwwc == 0) && (kwq->kw_inqueue == 0)) { - microuptime(&kwq->kw_ts); - LIST_INSERT_HEAD(&pth_free_list, kwq, kw_list); - kwq->kw_pflags |= KSYN_WQ_FLIST; + if ((kwq->kw_pflags & KSYN_WQ_WAITING) != 0) { + /* some one is waiting for the waitqueue, wake them up */ + kwq->kw_pflags &= ~KSYN_WQ_WAITING; + wakeup(&kwq->kw_pflags); } - sched = 1; + + if ((kwq->kw_pre_rwwc == 0) && (kwq->kw_inqueue == 0) && (kwq->kw_pre_intrcount == 0)) { + if (qfreenow == 0) { + microuptime(&kwq->kw_ts); + LIST_INSERT_HEAD(&pth_free_list, kwq, kw_list); + kwq->kw_pflags |= KSYN_WQ_FLIST; + num_infreekwq++; + free_elem = NULL; + } else { + /* remove from the only list 
it is in ie hash */ + kwq->kw_pflags &= ~(KSYN_WQ_FLIST | KSYN_WQ_INHASH); + LIST_REMOVE(kwq, kw_hash); + lck_mtx_destroy(&kwq->kw_lock, pthread_lck_grp); + num_total_kwq--; + num_freekwq++; + free_elem = kwq; + } + } else + free_elem = NULL; + if (qfreenow == 0) + sched = 1; } - if (ckwq != NULL){ + + if (ckwq != NULL) { ckwq->kw_iocount--; + if (wqtype == KSYN_WQTYPE_MUTEXDROP) { + kwq->kw_dropcount--; + } if ( ckwq->kw_iocount == 0) { - if ((ckwq->kw_pre_rwwc == 0) && (ckwq->kw_inqueue == 0)) { - /* mark for free if we can */ - microuptime(&ckwq->kw_ts); - LIST_INSERT_HEAD(&pth_free_list, ckwq, kw_list); - ckwq->kw_pflags |= KSYN_WQ_FLIST; + if ((kwq->kw_pflags & KSYN_WQ_WAITING) != 0) { + /* some one is waiting for the waitqueue, wake them up */ + kwq->kw_pflags &= ~KSYN_WQ_WAITING; + wakeup(&kwq->kw_pflags); } - sched = 1; + if ((ckwq->kw_pre_rwwc == 0) && (ckwq->kw_inqueue == 0) && (ckwq->kw_pre_intrcount == 0)) { + if (qfreenow == 0) { + /* mark for free if we can */ + microuptime(&ckwq->kw_ts); + LIST_INSERT_HEAD(&pth_free_list, ckwq, kw_list); + ckwq->kw_pflags |= KSYN_WQ_FLIST; + num_infreekwq++; + free_elem1 = NULL; + } else { + /* remove from the only list it is in ie hash */ + ckwq->kw_pflags &= ~(KSYN_WQ_FLIST | KSYN_WQ_INHASH); + LIST_REMOVE(ckwq, kw_hash); + lck_mtx_destroy(&ckwq->kw_lock, pthread_lck_grp); + num_total_kwq--; + num_freekwq++; + free_elem1 = ckwq; + } + } else + free_elem1 = NULL; + if (qfreenow == 0) + sched = 1; } } @@ -2211,6 +2868,10 @@ ksyn_wqrelease(ksyn_wait_queue_t kwq, ksyn_wait_queue_t ckwq) thread_call_enter_delayed(psynch_thcall, deadline); } pthread_list_unlock(); + if (free_elem != NULL) + zfree(kwq_zone, free_elem); + if (free_elem1 != NULL) + zfree(kwq_zone, free_elem1); } /* responsible to free the waitqueues */ @@ -2226,16 +2887,13 @@ psynch_wq_cleanup(__unused void * param, __unused void * param1) //pthread_list_lock_spin(); pthread_list_lock(); + num_addedfreekwq = num_infreekwq - num_lastfreekwqcount; + 
num_lastfreekwqcount = num_infreekwq; microuptime(&t); LIST_FOREACH(kwq, &pth_free_list, kw_list) { - - if (count > 100) { - delayed = 1; - break; - } - if ((kwq->kw_iocount != 0) && (kwq->kw_inqueue != 0)) { - /* still in freelist ??? */ + if ((kwq->kw_iocount != 0) || (kwq->kw_pre_rwwc != 0) || (kwq->kw_inqueue != 0) || (kwq->kw_pre_intrcount != 0)) { + /* still in use */ continue; } diff = t.tv_sec - kwq->kw_ts.tv_sec; @@ -2244,10 +2902,13 @@ psynch_wq_cleanup(__unused void * param, __unused void * param1) if (diff >= KSYN_CLEANUP_DEADLINE) { /* out of hash */ kwq->kw_pflags &= ~(KSYN_WQ_FLIST | KSYN_WQ_INHASH); + num_infreekwq--; + num_freekwq++; LIST_REMOVE(kwq, kw_hash); LIST_REMOVE(kwq, kw_list); LIST_INSERT_HEAD(&freelist, kwq, kw_list); count ++; + num_total_kwq--; } else { delayed = 1; } @@ -2268,23 +2929,22 @@ psynch_wq_cleanup(__unused void * param, __unused void * param1) while ((kwq = LIST_FIRST(&freelist)) != NULL) { LIST_REMOVE(kwq, kw_list); lck_mtx_destroy(&kwq->kw_lock, pthread_lck_grp); - kfree(kwq, sizeof(struct ksyn_wait_queue)); + zfree(kwq_zone, kwq); } } int -ksyn_block_thread_locked(ksyn_wait_queue_t kwq, uint64_t abstime, uthread_t uth) +ksyn_block_thread_locked(ksyn_wait_queue_t kwq, uint64_t abstime, ksyn_waitq_element_t kwe, int mylog) { kern_return_t kret; int error = 0; +#if _PSYNCH_TRACE_ + uthread_t uth = NULL; +#endif /* _PSYNCH_TRACE_ */ - uth->uu_kwqqueue = (void *)kwq; -#if USE_WAITQUEUE - kret = wait_queue_assert_wait64(&kwq->kw_wq, kwq->kw_addr, THREAD_ABORTSAFE, abstime); -#else /* USE_WAITQUEUE */ - assert_wait_deadline(&uth->uu_psynchretval, THREAD_ABORTSAFE, abstime); -#endif /* USE_WAITQUEUE */ + kwe->kwe_kwqqueue = (void *)kwq; + assert_wait_deadline(&kwe->kwe_psynchretval, THREAD_ABORTSAFE, abstime); ksyn_wqunlock(kwq); kret = thread_block(NULL); @@ -2296,116 +2956,42 @@ ksyn_block_thread_locked(ksyn_wait_queue_t kwq, uint64_t abstime, uthread_t uth) error = EINTR; break; } +#if _PSYNCH_TRACE_ + uth = 
current_uthread(); +#if defined(__i386__) + if (mylog != 0) + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_THWAKEUP | DBG_FUNC_NONE, 0xf4f3f2f1, (uint32_t)uth, kret, 0, 0); +#else + if (mylog != 0) + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_THWAKEUP | DBG_FUNC_NONE, 0xeeeeeeee, kret, error, 0xeeeeeeee, 0); +#endif +#endif /* _PSYNCH_TRACE_ */ + return(error); } kern_return_t -#if USE_WAITQUEUE -ksyn_wakeup_thread(ksyn_wait_queue_t kwq, uthread_t uth) -#else /* USE_WAITQUEUE */ -ksyn_wakeup_thread(__unused ksyn_wait_queue_t kwq, uthread_t uth) -#endif /* USE_WAITQUEUE */ +ksyn_wakeup_thread(__unused ksyn_wait_queue_t kwq, ksyn_waitq_element_t kwe) { - thread_t th; kern_return_t kret; - th = uth->uu_context.vc_thread; +#if _PSYNCH_TRACE_ + uthread_t uth = NULL; +#endif /* _PSYNCH_TRACE_ */ -#if USE_WAITQUEUE - kret = wait_queue_wakeup64_thread(&kwq->kw_wq, kwq->kw_addr, th, THREAD_AWAKENED); -#else /* USE_WAITQUEUE */ - kret = thread_wakeup_prim((caddr_t)&uth->uu_psynchretval, TRUE, THREAD_AWAKENED); -#endif /* USE_WAITQUEUE */ + kret = thread_wakeup_one((caddr_t)&kwe->kwe_psynchretval); if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING)) panic("ksyn_wakeup_thread: panic waking up thread %x\n", kret); +#if _PSYNCH_TRACE_ + uth = kwe->kwe_uth; +#if defined(__i386__) + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_THWAKEUP | DBG_FUNC_NONE, 0xf1f2f3f4, (uint32_t)uth, kret, 0, 0); +#endif +#endif /* _PSYNCH_TRACE_ */ - - return(kret); } -/* move from one waitqueue to another */ -#if COND_MTX_WAITQUEUEMOVE -void -ksyn_move_wqthread( ksyn_wait_queue_t ckwq, ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t updateval, int diffgen, int nomutex) -#else /* COND_MTX_WAITQUEUEMOVE */ -void -ksyn_move_wqthread( ksyn_wait_queue_t ckwq, __unused ksyn_wait_queue_t kwq, __unused uint32_t mgen, uint32_t updateval, __unused int diffgen, int nomutex) -#endif /* COND_MTX_WAITQUEUEMOVE */ -{ - kern_return_t kret; - uthread_t uth; -#if COND_MTX_WAITQUEUEMOVE - int count = 0, error, kret; - uint32_t nextgen = 
mgen; -#endif /* COND_MTX_WAITQUEUEMOVE */ - struct ksyn_queue kq; - uint32_t upgen; - - ksyn_queue_init(&kq); -#if USE_WAITQUEUE - /* TBD wq move */ - kret = wait_queue_move_all(&ckwq->kw_wq, ckwq->kw_addr, &kwq->kw_wq, kwq->kw_addr); -#else /* USE_WAITQUEUE */ - /* no need to move as the thread is blocked at uthread address */ - kret = KERN_SUCCESS; -#endif /* USE_WAITQUEUE */ - - if (nomutex != 0) - upgen = updateval | PTHRW_MTX_NONE; - else - upgen = updateval; - - if (kret== KERN_SUCCESS) { -redrive: - while ((uth = ksyn_queue_removefirst(&ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER], ckwq)) != NULL) { - if (nomutex != 0) { -#if COND_MTX_WAITQUEUEMOVE - uth->uu_psynchretval = upgen; -#else /* COND_MTX_WAITQUEUEMOVE */ - uth->uu_psynchretval = 0; - uth->uu_kwqqueue = NULL; - kret = ksyn_wakeup_thread(ckwq, uth); - if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING)) - panic("ksyn_move_wqthread: panic waking up \n"); - if (kret == KERN_NOT_WAITING) - goto redrive; -#endif /* COND_MTX_WAITQUEUEMOVE */ - } -#if COND_MTX_WAITQUEUEMOVE - else { - count++; - if (count >diffgen) - panic("movethread inserting more than expected\n"); - TAILQ_INSERT_TAIL(&kq.ksynq_uthlist, uth, uu_mtxlist); - } -#endif /* COND_MTX_WAITQUEUEMOVE */ - - } - ksyn_wqunlock(ckwq); - -#if COND_MTX_WAITQUEUEMOVE - if ( (nomutex == 0) && (count > 0)) { - ksyn_wqlock(kwq); - uth = TAILQ_FIRST(&kq.ksynq_uthlist); - while(uth != NULL) { - TAILQ_REMOVE(&kq.ksynq_uthlist, uth, uu_mtxlist); - error = ksyn_queue_insert(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], nextgen, uth, SEQFIT); - if (error != 0) { - panic("movethread insert failed\n"); - } - uth->uu_lockseq = nextgen; - nextgen += PTHRW_INC; - uth = TAILQ_FIRST(&kq.ksynq_uthlist); - } - ksyn_wqunlock(kwq); - } -#endif /* COND_MTX_WAITQUEUEMOVE */ - } else - panic("movethread : wq move all failed\n"); - return; -} - /* find the true shared obect/offset for shared mutexes */ int ksyn_findobj(uint64_t mutex, uint64_t * objectp, uint64_t * offsetp) 
@@ -2509,12 +3095,13 @@ kwq_find_rw_lowest(ksyn_wait_queue_t kwq, int flags, uint32_t premgen, int * typ } else lowest[KSYN_QUEUE_YWRITER] = 0; - +#if __TESTPANICS__ if (count == 0) panic("nothing in the queue???\n"); +#endif /* __TESTPANICS__ */ - low = numbers[0]; + low = numbers[0]; lowtype = typenum[0]; if (count > 1) { for (i = 1; i< count; i++) { @@ -2535,44 +3122,39 @@ kwq_find_rw_lowest(ksyn_wait_queue_t kwq, int flags, uint32_t premgen, int * typ int ksyn_wakeupreaders(ksyn_wait_queue_t kwq, uint32_t limitread, int longreadset, int allreaders, uint32_t updatebits, int * wokenp) { - uthread_t uth; + ksyn_waitq_element_t kwe = NULL; ksyn_queue_t kq; int failedwakeup = 0; int numwoken = 0; kern_return_t kret = KERN_SUCCESS; - int resetbit = updatebits & PTHRW_RW_HUNLOCK; uint32_t lbits = 0; lbits = updatebits; if (longreadset != 0) { /* clear all read and longreads */ - while ((uth = ksyn_queue_removefirst(&kwq->kw_ksynqueues[KSYN_QUEUE_READ], kwq)) != NULL) { - uth->uu_psynchretval = lbits; - /* set on one thread */ - if (resetbit != 0) { - lbits &= ~PTHRW_RW_HUNLOCK; - resetbit = 0; - } + while ((kwe = ksyn_queue_removefirst(&kwq->kw_ksynqueues[KSYN_QUEUE_READ], kwq)) != NULL) { + kwe->kwe_psynchretval = lbits; + kwe->kwe_kwqqueue = NULL; + numwoken++; - uth->uu_kwqqueue = NULL; - kret = ksyn_wakeup_thread(kwq, uth); + kret = ksyn_wakeup_thread(kwq, kwe); +#if __TESTPANICS__ if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING)) panic("ksyn_wakeupreaders: panic waking up readers\n"); +#endif /* __TESTPANICS__ */ if (kret == KERN_NOT_WAITING) { failedwakeup++; } } - while ((uth = ksyn_queue_removefirst(&kwq->kw_ksynqueues[KSYN_QUEUE_LREAD], kwq)) != NULL) { - uth->uu_psynchretval = lbits; - uth->uu_kwqqueue = NULL; - if (resetbit != 0) { - lbits &= ~PTHRW_RW_HUNLOCK; - resetbit = 0; - } + while ((kwe = ksyn_queue_removefirst(&kwq->kw_ksynqueues[KSYN_QUEUE_LREAD], kwq)) != NULL) { + kwe->kwe_psynchretval = lbits; + kwe->kwe_kwqqueue = NULL; numwoken++; - 
kret = ksyn_wakeup_thread(kwq, uth); + kret = ksyn_wakeup_thread(kwq, kwe); +#if __TESTPANICS__ if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING)) panic("ksyn_wakeupreaders: panic waking up lreaders\n"); +#endif /* __TESTPANICS__ */ if (kret == KERN_NOT_WAITING) { failedwakeup++; } @@ -2580,17 +3162,15 @@ ksyn_wakeupreaders(ksyn_wait_queue_t kwq, uint32_t limitread, int longreadset, i } else { kq = &kwq->kw_ksynqueues[KSYN_QUEUE_READ]; while ((kq->ksynq_count != 0) && (allreaders || (is_seqlower(kq->ksynq_firstnum, limitread) != 0))) { - uth = ksyn_queue_removefirst(kq, kwq); - uth->uu_psynchretval = lbits; - if (resetbit != 0) { - lbits &= ~PTHRW_RW_HUNLOCK; - resetbit = 0; - } + kwe = ksyn_queue_removefirst(kq, kwq); + kwe->kwe_psynchretval = lbits; + kwe->kwe_kwqqueue = NULL; numwoken++; - uth->uu_kwqqueue = NULL; - kret = ksyn_wakeup_thread(kwq, uth); + kret = ksyn_wakeup_thread(kwq, kwe); +#if __TESTPANICS__ if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING)) panic("ksyn_wakeupreaders: panic waking up readers\n"); +#endif /* __TESTPANICS__ */ if (kret == KERN_NOT_WAITING) { failedwakeup++; } @@ -2605,32 +3185,45 @@ ksyn_wakeupreaders(ksyn_wait_queue_t kwq, uint32_t limitread, int longreadset, i /* This handles the unlock grants for next set on rw_unlock() or on arrival of all preposted waiters */ int -kwq_handle_unlock(ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t * updatep, int flags, int * blockp, uint32_t premgen) +kwq_handle_unlock(ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t rw_wc, uint32_t * updatep, int flags, int * blockp, uint32_t premgen) { uint32_t low_reader, low_writer, low_ywriter, low_lreader,limitrdnum; int rwtype, error=0; int longreadset = 0, allreaders, failed; - uint32_t updatebits; + uint32_t updatebits=0, numneeded = 0;; int prepost = flags & KW_UNLOCK_PREPOST; thread_t preth = THREAD_NULL; + ksyn_waitq_element_t kwe; uthread_t uth; thread_t th; int woken = 0; int block = 1; - uint32_t lowest[KSYN_QUEUE_MAX]; /* np need 
for upgrade as it is handled separately */ + uint32_t lowest[KSYN_QUEUE_MAX]; /* np need for upgrade as it is handled separately */ kern_return_t kret = KERN_SUCCESS; + ksyn_queue_t kq; + int curthreturns = 0; #if _PSYNCH_TRACE_ -#if defined(__i386__) - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWHANDLEU | DBG_FUNC_START, (uint32_t)kwq, mgen, premgen, 0, 0); -#endif + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWHANDLEU | DBG_FUNC_START, (uint32_t)kwq->kw_addr, mgen, premgen, rw_wc, 0); #endif /* _PSYNCH_TRACE_ */ if (prepost != 0) { preth = current_thread(); } + kq = &kwq->kw_ksynqueues[KSYN_QUEUE_READ]; + kwq->kw_lastseqword = rw_wc; + kwq->kw_lastunlockseq = (rw_wc & PTHRW_COUNT_MASK); + kwq->kw_overlapwatch = 0; + /* upgrade pending */ if (is_rw_ubit_set(mgen)) { +#if __TESTPANICS__ + panic("NO UBIT SHOULD BE SET\n"); + updatebits = PTH_RWL_EBIT | PTH_RWL_KBIT; + if (kwq->kw_ksynqueues[KSYN_QUEUE_WRITER].ksynq_count != 0) + updatebits |= PTH_RWL_WBIT; + if (kwq->kw_ksynqueues[KSYN_QUEUE_YWRITER].ksynq_count != 0) + updatebits |= PTH_RWL_YBIT; if (prepost != 0) { if((flags & KW_UNLOCK_PREPOST_UPGRADE) != 0) { /* upgrade thread calling the prepost */ @@ -2641,34 +3234,37 @@ kwq_handle_unlock(ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t * updatep, int } if (kwq->kw_ksynqueues[KSYN_QUEUE_UPGRADE].ksynq_count > 0) { - uth = ksyn_queue_removefirst(&kwq->kw_ksynqueues[KSYN_QUEUE_UPGRADE], kwq); - uth->uu_psynchretval = (mgen | PTHRW_EBIT) & ~PTHRW_UBIT; - uth->uu_kwqqueue = NULL; - kret = ksyn_wakeup_thread(kwq, uth); + kwe = ksyn_queue_removefirst(&kwq->kw_ksynqueues[KSYN_QUEUE_UPGRADE], kwq); + + kwq->kw_nextseqword = (rw_wc & PTHRW_COUNT_MASK) + updatebits; + kwe->kwe_psynchretval = updatebits; + kwe->kwe_kwqqueue = NULL; + kret = ksyn_wakeup_thread(kwq, kwe); if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING)) panic("kwq_handle_unlock: panic waking up the upgrade thread \n"); if (kret == KERN_NOT_WAITING) { kwq->kw_pre_intrcount = 1; /* actually a count */ 
kwq->kw_pre_intrseq = mgen; - kwq->kw_pre_intrretbits = uth->uu_psynchretval; + kwq->kw_pre_intrretbits = kwe->kwe_psynchretval; kwq->kw_pre_intrtype = PTH_RW_TYPE_UPGRADE; } error = 0; } else { panic("panic unable to find the upgrade thread\n"); } +#endif /* __TESTPANICS__ */ ksyn_wqunlock(kwq); goto out; } error = kwq_find_rw_lowest(kwq, flags, premgen, &rwtype, lowest); +#if __TESTPANICS__ if (error != 0) panic("rwunlock: cannot fails to slot next round of threads"); +#endif /* __TESTPANICS__ */ #if _PSYNCH_TRACE_ -#if defined(__i386__) - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWHANDLEU | DBG_FUNC_NONE, (uint32_t)kwq, 1, rwtype, lowest, 0); -#endif + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWHANDLEU | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 1, rwtype, 0, 0); #endif /* _PSYNCH_TRACE_ */ low_reader = lowest[KSYN_QUEUE_READ]; low_lreader = lowest[KSYN_QUEUE_LREAD]; @@ -2676,24 +3272,36 @@ kwq_handle_unlock(ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t * updatep, int low_ywriter = lowest[KSYN_QUEUE_YWRITER]; - updatebits = mgen & ~( PTHRW_EBIT | PTHRW_WBIT |PTHRW_YBIT | PTHRW_UBIT | PTHRW_LBIT); - longreadset = 0; allreaders = 0; + updatebits = 0; + + switch (rwtype & PTH_RW_TYPE_MASK) { case PTH_RW_TYPE_LREAD: longreadset = 1; + case PTH_RW_TYPE_READ: { + /* what about the preflight which is LREAD or READ ?? 
*/ + if ((rwtype & PTH_RWSHFT_TYPE_MASK) != 0) { + if (rwtype & PTH_RWSHFT_TYPE_WRITE) + updatebits |= (PTH_RWL_WBIT | PTH_RWL_KBIT); + if (rwtype & PTH_RWSHFT_TYPE_YWRITE) + updatebits |= PTH_RWL_YBIT; + } limitrdnum = 0; if (longreadset == 0) { switch (rwtype & (PTH_RWSHFT_TYPE_WRITE | PTH_RWSHFT_TYPE_YWRITE)) { case PTH_RWSHFT_TYPE_WRITE: limitrdnum = low_writer; if (((rwtype & PTH_RWSHFT_TYPE_LREAD) != 0) && - (is_seqlower(low_lreader, low_writer) != 0)) { + (is_seqlower(low_lreader, limitrdnum) != 0)) { + longreadset = 1; + } + if (((flags & KW_UNLOCK_PREPOST_LREADLOCK) != 0) && + (is_seqlower(premgen, limitrdnum) != 0)) { longreadset = 1; } - break; case PTH_RWSHFT_TYPE_YWRITE: /* all read ? */ @@ -2702,11 +3310,25 @@ kwq_handle_unlock(ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t * updatep, int longreadset = 1; } else allreaders = 1; + if (((flags & KW_UNLOCK_PREPOST_LREADLOCK) != 0) && + (is_seqlower(premgen, low_ywriter) != 0)) { + longreadset = 1; + allreaders = 0; + } + + break; case (PTH_RWSHFT_TYPE_WRITE | PTH_RWSHFT_TYPE_YWRITE): - limitrdnum = low_writer; + if (is_seqlower(low_ywriter, low_writer) != 0) { + limitrdnum = low_ywriter; + } else + limitrdnum = low_writer; if (((rwtype & PTH_RWSHFT_TYPE_LREAD) != 0) && - (is_seqlower(low_lreader, low_ywriter) != 0)) { + (is_seqlower(low_lreader, limitrdnum) != 0)) { + longreadset = 1; + } + if (((flags & KW_UNLOCK_PREPOST_LREADLOCK) != 0) && + (is_seqlower(premgen, limitrdnum) != 0)) { longreadset = 1; } break; @@ -2718,35 +3340,71 @@ kwq_handle_unlock(ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t * updatep, int }; } + numneeded = 0; + if (longreadset != 0) { + updatebits |= PTH_RWL_LBIT; + updatebits &= ~PTH_RWL_KBIT; + if ((flags & (KW_UNLOCK_PREPOST_READLOCK | KW_UNLOCK_PREPOST_LREADLOCK)) != 0) + numneeded += 1; + numneeded += kwq->kw_ksynqueues[KSYN_QUEUE_READ].ksynq_count; + numneeded += kwq->kw_ksynqueues[KSYN_QUEUE_LREAD].ksynq_count; + updatebits += (numneeded << PTHRW_COUNT_SHIFT); + 
kwq->kw_overlapwatch = 1; + } else { + /* no longread, evaluate number of readers */ - if ((rwtype & PTH_RWSHFT_TYPE_WRITE) != 0) - updatebits |= PTHRW_WBIT; - else if ((rwtype & PTH_RWSHFT_TYPE_YWRITE) != 0) - updatebits |= PTHRW_YBIT; + switch (rwtype & (PTH_RWSHFT_TYPE_WRITE | PTH_RWSHFT_TYPE_YWRITE)) { + case PTH_RWSHFT_TYPE_WRITE: + limitrdnum = low_writer; + numneeded = ksyn_queue_count_tolowest(kq, limitrdnum); + if (((flags & KW_UNLOCK_PREPOST_READLOCK) != 0) && (is_seqlower(premgen, limitrdnum) != 0)) { + curthreturns = 1; + numneeded += 1; + } + break; + case PTH_RWSHFT_TYPE_YWRITE: + /* all read ? */ + numneeded += kwq->kw_ksynqueues[KSYN_QUEUE_READ].ksynq_count; + if ((flags & KW_UNLOCK_PREPOST_READLOCK) != 0) { + curthreturns = 1; + numneeded += 1; + } + break; + case (PTH_RWSHFT_TYPE_WRITE | PTH_RWSHFT_TYPE_YWRITE): + limitrdnum = low_writer; + numneeded = ksyn_queue_count_tolowest(kq, limitrdnum); + if (((flags & KW_UNLOCK_PREPOST_READLOCK) != 0) && (is_seqlower(premgen, limitrdnum) != 0)) { + curthreturns = 1; + numneeded += 1; + } + break; + default: /* no writers at all */ + /* no other waiters only readers */ + kwq->kw_overlapwatch = 1; + numneeded += kwq->kw_ksynqueues[KSYN_QUEUE_READ].ksynq_count; + if ((flags & KW_UNLOCK_PREPOST_READLOCK) != 0) { + curthreturns = 1; + numneeded += 1; + } + }; + + updatebits += (numneeded << PTHRW_COUNT_SHIFT); + } + kwq->kw_nextseqword = (rw_wc & PTHRW_COUNT_MASK) + updatebits; - if (longreadset == 0) { - if((prepost != 0) && - ((flags & KW_UNLOCK_PREPOST_READLOCK) != 0) && - ((allreaders != 0) || (is_seqlower(premgen, limitrdnum) != 0))) { - block = 0; - uth = current_uthread(); - uth->uu_psynchretval = updatebits; - } - } else { - updatebits |= PTHRW_LBIT; - if ((prepost != 0) && - ((flags & (KW_UNLOCK_PREPOST_READLOCK | KW_UNLOCK_PREPOST_LREADLOCK)) != 0)) { - block = 0; - uth = current_uthread(); - uth->uu_psynchretval = updatebits; - } + if (curthreturns != 0) { + block = 0; + uth = current_uthread(); + 
kwe = &uth->uu_kwe; + kwe->kwe_psynchretval = updatebits; } - if (prepost != 0) { - updatebits |= PTHRW_RW_HUNLOCK; - } failed = ksyn_wakeupreaders(kwq, limitrdnum, longreadset, allreaders, updatebits, &woken); +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWHANDLEU | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 2, woken, failed, 0); +#endif /* _PSYNCH_TRACE_ */ + if (failed != 0) { kwq->kw_pre_intrcount = failed; /* actually a count */ kwq->kw_pre_intrseq = limitrdnum; @@ -2757,43 +3415,49 @@ kwq_handle_unlock(ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t * updatep, int kwq->kw_pre_intrtype = PTH_RW_TYPE_READ; } - /* if we woken up no one and the current thread is returning, ensure it is doing unlock */ - if ((prepost != 0) && (woken == 0) && (block == 0)&& ((updatebits & PTHRW_RW_HUNLOCK) != 0)) { - uth = current_uthread(); - uth->uu_psynchretval = updatebits; - } - error = 0; + if ((kwq->kw_ksynqueues[KSYN_QUEUE_WRITER].ksynq_count != 0) && ((updatebits & PTH_RWL_WBIT) == 0)) + panic("kwq_handle_unlock: writer pending but no writebit set %x\n", updatebits); } break; case PTH_RW_TYPE_WRITE: { - updatebits |= PTHRW_EBIT; + + /* only one thread is goin to be granted */ + updatebits |= (PTHRW_INC); + updatebits |= PTH_RWL_KBIT| PTH_RWL_EBIT; + if (((flags & KW_UNLOCK_PREPOST_WRLOCK) != 0) && (low_writer == premgen)) { block = 0; if (kwq->kw_ksynqueues[KSYN_QUEUE_WRITER].ksynq_count != 0) - updatebits |= PTHRW_WBIT; - else if ((rwtype & PTH_RWSHFT_TYPE_YWRITE) != 0) - updatebits |= PTHRW_YBIT; + updatebits |= PTH_RWL_WBIT; + if ((rwtype & PTH_RWSHFT_TYPE_YWRITE) != 0) + updatebits |= PTH_RWL_YBIT; th = preth; uth = get_bsdthread_info(th); - uth->uu_psynchretval = updatebits; + kwe = &uth->uu_kwe; + kwe->kwe_psynchretval = updatebits; } else { /* we are not granting writelock to the preposting thread */ - uth = ksyn_queue_removefirst(&kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], kwq); + kwe = ksyn_queue_removefirst(&kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], kwq); /* if 
there are writers present or the preposting write thread then W bit is to be set */ if ((kwq->kw_ksynqueues[KSYN_QUEUE_WRITER].ksynq_count != 0) || ((flags & KW_UNLOCK_PREPOST_WRLOCK) != 0) ) - updatebits |= PTHRW_WBIT; - else if ((rwtype & PTH_RWSHFT_TYPE_YWRITE) != 0) - updatebits |= PTHRW_YBIT; - uth->uu_psynchretval = updatebits; - uth->uu_kwqqueue = NULL; + updatebits |= PTH_RWL_WBIT; + if ((rwtype & PTH_RWSHFT_TYPE_YWRITE) != 0) + updatebits |= PTH_RWL_YBIT; + kwe->kwe_psynchretval = updatebits; + kwe->kwe_kwqqueue = NULL; /* setup next in the queue */ - kret = ksyn_wakeup_thread(kwq, uth); + kret = ksyn_wakeup_thread(kwq, kwe); +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWHANDLEU | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 3, kret, 0, 0); +#endif /* _PSYNCH_TRACE_ */ +#if __TESTPANICS__ if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING)) panic("kwq_handle_unlock: panic waking up writer\n"); +#endif /* __TESTPANICS__ */ if (kret == KERN_NOT_WAITING) { kwq->kw_pre_intrcount = 1; /* actually a count */ kwq->kw_pre_intrseq = low_writer; @@ -2802,6 +3466,9 @@ kwq_handle_unlock(ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t * updatep, int } error = 0; } + kwq->kw_nextseqword = (rw_wc & PTHRW_COUNT_MASK) + updatebits; + if ((updatebits & (PTH_RWL_KBIT | PTH_RWL_EBIT)) != (PTH_RWL_KBIT | PTH_RWL_EBIT)) + panic("kwq_handle_unlock: writer lock granted but no ke set %x\n", updatebits); } break; @@ -2809,26 +3476,36 @@ kwq_handle_unlock(ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t * updatep, int case PTH_RW_TYPE_YWRITE: { /* can reader locks be granted ahead of this write? 
*/ if ((rwtype & PTH_RWSHFT_TYPE_READ) != 0) { - if ((rwtype & PTH_RWSHFT_TYPE_WRITE) != 0) - updatebits |= PTHRW_WBIT; - else if ((rwtype & PTH_RWSHFT_TYPE_WRITE) != 0) - updatebits |= PTHRW_YBIT; + if ((rwtype & PTH_RWSHFT_TYPE_MASK) != 0) { + if (rwtype & PTH_RWSHFT_TYPE_WRITE) + updatebits |= (PTH_RWL_WBIT | PTH_RWL_KBIT); + if (rwtype & PTH_RWSHFT_TYPE_YWRITE) + updatebits |= PTH_RWL_YBIT; + } if ((rwtype & PTH_RWSHFT_TYPE_WRITE) != 0) { /* is lowest reader less than the low writer? */ if (is_seqlower(low_reader,low_writer) == 0) goto yielditis; + + numneeded = ksyn_queue_count_tolowest(kq, low_writer); + updatebits += (numneeded << PTHRW_COUNT_SHIFT); if (((flags & KW_UNLOCK_PREPOST_READLOCK) != 0) && (is_seqlower(premgen, low_writer) != 0)) { uth = current_uthread(); - uth->uu_psynchretval = updatebits; + kwe = &uth->uu_kwe; + /* add one more */ + updatebits += PTHRW_INC; + kwe->kwe_psynchretval = updatebits; block = 0; } - if (prepost != 0) { - updatebits |= PTHRW_RW_HUNLOCK; - } + kwq->kw_nextseqword = (rw_wc & PTHRW_COUNT_MASK) + updatebits; + /* there will be readers to wakeup , no need to check for woken */ failed = ksyn_wakeupreaders(kwq, low_writer, 0, 0, updatebits, NULL); +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWHANDLEU | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 2, woken, failed, 0); +#endif /* _PSYNCH_TRACE_ */ if (failed != 0) { kwq->kw_pre_intrcount = failed; /* actually a count */ kwq->kw_pre_intrseq = low_writer; @@ -2838,32 +3515,33 @@ kwq_handle_unlock(ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t * updatep, int error = 0; } else { /* wakeup all readers */ + numneeded = kwq->kw_ksynqueues[KSYN_QUEUE_READ].ksynq_count; + updatebits += (numneeded << PTHRW_COUNT_SHIFT); if ((prepost != 0) && ((flags & KW_UNLOCK_PREPOST_READLOCK) != 0)) { uth = current_uthread(); - uth->uu_psynchretval = updatebits; + kwe = &uth->uu_kwe; + updatebits += PTHRW_INC; + kwe->kwe_psynchretval = updatebits; block = 0; } - if (prepost != 0) { - 
updatebits |= PTHRW_RW_HUNLOCK; - } + kwq->kw_nextseqword = (rw_wc & PTHRW_COUNT_MASK) + updatebits; failed = ksyn_wakeupreaders(kwq, low_writer, 0, 1, updatebits, &woken); +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWHANDLEU | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 2, woken, failed, 0); +#endif /* _PSYNCH_TRACE_ */ if (failed != 0) { kwq->kw_pre_intrcount = failed; /* actually a count */ kwq->kw_pre_intrseq = kwq->kw_highseq; kwq->kw_pre_intrretbits = updatebits; kwq->kw_pre_intrtype = PTH_RW_TYPE_READ; } - /* if we woken up no one and the current thread is returning, ensure it is doing unlock */ - if ((prepost != 0) && (woken ==0) && (block == 0)&& ((updatebits & PTHRW_RW_HUNLOCK) != 0)) { - uth = current_uthread(); - uth->uu_psynchretval = updatebits; - } error = 0; } } else { yielditis: /* no reads, so granting yeilding writes */ - updatebits |= PTHRW_EBIT; + updatebits |= PTHRW_INC; + updatebits |= PTH_RWL_KBIT| PTH_RWL_EBIT; if (((flags & KW_UNLOCK_PREPOST_YWRLOCK) != 0) && (low_writer == premgen)) { /* preposting yielding write thread is being granted exclusive lock */ @@ -2871,29 +3549,35 @@ kwq_handle_unlock(ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t * updatep, int block = 0; if ((rwtype & PTH_RWSHFT_TYPE_WRITE) != 0) - updatebits |= PTHRW_WBIT; + updatebits |= PTH_RWL_WBIT; else if (kwq->kw_ksynqueues[KSYN_QUEUE_YWRITER].ksynq_count != 0) - updatebits |= PTHRW_YBIT; + updatebits |= PTH_RWL_YBIT; th = preth; uth = get_bsdthread_info(th); - uth->uu_psynchretval = updatebits; + kwe = &uth->uu_kwe; + kwe->kwe_psynchretval = updatebits; } else { /* we are granting yield writelock to some other thread */ - uth = ksyn_queue_removefirst(&kwq->kw_ksynqueues[KSYN_QUEUE_YWRITER], kwq); + kwe = ksyn_queue_removefirst(&kwq->kw_ksynqueues[KSYN_QUEUE_YWRITER], kwq); if ((rwtype & PTH_RWSHFT_TYPE_WRITE) != 0) - updatebits |= PTHRW_WBIT; + updatebits |= PTH_RWL_WBIT; /* if there are ywriters present or the preposting ywrite thread then W bit is to be 
set */ else if ((kwq->kw_ksynqueues[KSYN_QUEUE_YWRITER].ksynq_count != 0) || ((flags & KW_UNLOCK_PREPOST_YWRLOCK) != 0) ) - updatebits |= PTHRW_YBIT; + updatebits |= PTH_RWL_YBIT; - uth->uu_psynchretval = updatebits; - uth->uu_kwqqueue = NULL; + kwe->kwe_psynchretval = updatebits; + kwe->kwe_kwqqueue = NULL; - kret = ksyn_wakeup_thread(kwq, uth); + kret = ksyn_wakeup_thread(kwq, kwe); +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWHANDLEU | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 3, kret, 0, 0); +#endif /* _PSYNCH_TRACE_ */ +#if __TESTPANICS__ if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING)) panic("kwq_handle_unlock : panic waking up readers\n"); +#endif /* __TESTPANICS__ */ if (kret == KERN_NOT_WAITING) { kwq->kw_pre_intrcount = 1; /* actually a count */ kwq->kw_pre_intrseq = low_ywriter; @@ -2902,6 +3586,7 @@ kwq_handle_unlock(ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t * updatep, int } error = 0; } + kwq->kw_nextseqword = (rw_wc & PTHRW_COUNT_MASK) + updatebits; } } break; @@ -2911,21 +3596,58 @@ kwq_handle_unlock(ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t * updatep, int }; - if (updatep != NULL) - *updatep = updatebits; out: + if (updatep != NULL) + *updatep = updatebits; if (blockp != NULL) *blockp = block; #if _PSYNCH_TRACE_ -#if defined(__i386__) - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWHANDLEU | DBG_FUNC_END, (uint32_t)kwq, 0, 0, block, 0); -#endif + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWHANDLEU | DBG_FUNC_END, (uint32_t)kwq->kw_addr, 0, updatebits, block, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } +int +kwq_handle_overlap(ksyn_wait_queue_t kwq, uint32_t lgenval, __unused uint32_t ugenval, uint32_t rw_wc, uint32_t *updatebitsp, __unused int flags , int * blockp) +{ + uint32_t highword = kwq->kw_nextseqword & PTHRW_COUNT_MASK; + uint32_t lowword = kwq->kw_lastseqword & PTHRW_COUNT_MASK; + uint32_t val=0; + int withinseq; + + + /* overlap is set, so no need to check for valid state for overlap */ + + withinseq = 
((is_seqlower_eq(rw_wc, highword) != 0) || (is_seqhigher_eq(lowword, rw_wc) != 0)); + + if (withinseq != 0) { + if ((kwq->kw_nextseqword & PTH_RWL_LBIT) == 0) { + /* if no writers ahead, overlap granted */ + if ((lgenval & PTH_RWL_WBIT) == 0) { + goto grantoverlap; + } + } else { + /* Lbit is set, and writers ahead does not count */ + goto grantoverlap; + } + } + + *blockp = 1; + return(0); + +grantoverlap: + /* increase the next expected seq by one */ + kwq->kw_nextseqword += PTHRW_INC; + /* set count by one & bits from the nextseq and add M bit */ + val = PTHRW_INC; + val |= ((kwq->kw_nextseqword & PTHRW_BIT_MASK) | PTH_RWL_MBIT); + *updatebitsp = val; + *blockp = 0; + return(0); +} +#if NOTYET /* handle downgrade actions */ int kwq_handle_downgrade(ksyn_wait_queue_t kwq, uint32_t mgen, __unused int flags, __unused uint32_t premgen, __unused int * blockp) @@ -2964,33 +3686,38 @@ kwq_handle_downgrade(ksyn_wait_queue_t kwq, uint32_t mgen, __unused int flags, _ } return(0); } + +#endif /* NOTYET */ + /************* Indiv queue support routines ************************/ void ksyn_queue_init(ksyn_queue_t kq) { - TAILQ_INIT(&kq->ksynq_uthlist); + TAILQ_INIT(&kq->ksynq_kwelist); kq->ksynq_count = 0; kq->ksynq_firstnum = 0; kq->ksynq_lastnum = 0; } - int -ksyn_queue_insert(ksyn_wait_queue_t kwq, ksyn_queue_t kq, uint32_t mgen, struct uthread * uth, int fit) +ksyn_queue_insert(ksyn_wait_queue_t kwq, ksyn_queue_t kq, uint32_t mgen, struct uthread * uth, ksyn_waitq_element_t kwe, int fit) { uint32_t lockseq = mgen & PTHRW_COUNT_MASK; - struct uthread * q_uth, * r_uth; + ksyn_waitq_element_t q_kwe, r_kwe; + int res = 0; + uthread_t nuth = NULL; if (kq->ksynq_count == 0) { - TAILQ_INSERT_HEAD(&kq->ksynq_uthlist, uth, uu_mtxlist); + TAILQ_INSERT_HEAD(&kq->ksynq_kwelist, kwe, kwe_list); kq->ksynq_firstnum = lockseq; kq->ksynq_lastnum = lockseq; goto out; } if (fit == FIRSTFIT) { + /* TBD: if retry bit is set for mutex, add it to the head */ /* firstfit, arriving order */ - 
TAILQ_INSERT_TAIL(&kq->ksynq_uthlist, uth, uu_mtxlist); + TAILQ_INSERT_TAIL(&kq->ksynq_kwelist, kwe, kwe_list); if (is_seqlower (lockseq, kq->ksynq_firstnum) != 0) kq->ksynq_firstnum = lockseq; if (is_seqhigher (lockseq, kq->ksynq_lastnum) != 0) @@ -2998,55 +3725,79 @@ ksyn_queue_insert(ksyn_wait_queue_t kwq, ksyn_queue_t kq, uint32_t mgen, struct goto out; } - if ((lockseq == kq->ksynq_firstnum) || (lockseq == kq->ksynq_lastnum)) - panic("ksyn_queue_insert: two threads with same lockseq "); + if ((lockseq == kq->ksynq_firstnum) || (lockseq == kq->ksynq_lastnum)) { + /* During prepost when a thread is getting cancelled, we could have two with same seq */ + if (kwe->kwe_flags == KWE_THREAD_PREPOST) { + q_kwe = ksyn_queue_find_seq(kwq, kq, lockseq, 0); + if ((q_kwe != NULL) && ((nuth = (uthread_t)q_kwe->kwe_uth) != NULL) && + ((nuth->uu_flag & (UT_CANCELDISABLE | UT_CANCEL | UT_CANCELED)) == UT_CANCEL)) { + TAILQ_INSERT_TAIL(&kq->ksynq_kwelist, kwe, kwe_list); + goto out; + + } else { + __FAILEDUSERTEST__("ksyn_queue_insert: two threads with same lockseq "); + res = EBUSY; + goto out1; + } + } else { + __FAILEDUSERTEST__("ksyn_queue_insert: two threads with same lockseq "); + res = EBUSY; + goto out1; + } + } /* check for next seq one */ if (is_seqlower(kq->ksynq_lastnum, lockseq) != 0) { - TAILQ_INSERT_TAIL(&kq->ksynq_uthlist, uth, uu_mtxlist); + TAILQ_INSERT_TAIL(&kq->ksynq_kwelist, kwe, kwe_list); kq->ksynq_lastnum = lockseq; goto out; } if (is_seqlower(lockseq, kq->ksynq_firstnum) != 0) { - TAILQ_INSERT_HEAD(&kq->ksynq_uthlist, uth, uu_mtxlist); + TAILQ_INSERT_HEAD(&kq->ksynq_kwelist, kwe, kwe_list); kq->ksynq_firstnum = lockseq; goto out; } /* goto slow insert mode */ - TAILQ_FOREACH_SAFE(q_uth, &kq->ksynq_uthlist, uu_mtxlist, r_uth) { - if (is_seqhigher(q_uth->uu_lockseq, lockseq) != 0) { - TAILQ_INSERT_BEFORE(q_uth, uth, uu_mtxlist); + TAILQ_FOREACH_SAFE(q_kwe, &kq->ksynq_kwelist, kwe_list, r_kwe) { + if (is_seqhigher(q_kwe->kwe_lockseq, lockseq) != 0) { + 
TAILQ_INSERT_BEFORE(q_kwe, kwe, kwe_list); goto out; } } +#if __TESTPANICS__ panic("failed to insert \n"); +#endif /* __TESTPANICS__ */ + out: + if (uth != NULL) + kwe->kwe_uth = uth; kq->ksynq_count++; kwq->kw_inqueue++; update_low_high(kwq, lockseq); - return(0); +out1: + return(res); } -struct uthread * +ksyn_waitq_element_t ksyn_queue_removefirst(ksyn_queue_t kq, ksyn_wait_queue_t kwq) { - uthread_t uth = NULL; - uthread_t q_uth; + ksyn_waitq_element_t kwe = NULL; + ksyn_waitq_element_t q_kwe; uint32_t curseq; if (kq->ksynq_count != 0) { - uth = TAILQ_FIRST(&kq->ksynq_uthlist); - TAILQ_REMOVE(&kq->ksynq_uthlist, uth, uu_mtxlist); - curseq = uth->uu_lockseq & PTHRW_COUNT_MASK; + kwe = TAILQ_FIRST(&kq->ksynq_kwelist); + TAILQ_REMOVE(&kq->ksynq_kwelist, kwe, kwe_list); + curseq = kwe->kwe_lockseq & PTHRW_COUNT_MASK; kq->ksynq_count--; kwq->kw_inqueue--; if(kq->ksynq_count != 0) { - q_uth = TAILQ_FIRST(&kq->ksynq_uthlist); - kq->ksynq_firstnum = (q_uth->uu_lockseq & PTHRW_COUNT_MASK); + q_kwe = TAILQ_FIRST(&kq->ksynq_kwelist); + kq->ksynq_firstnum = (q_kwe->kwe_lockseq & PTHRW_COUNT_MASK); } else { kq->ksynq_firstnum = 0; kq->ksynq_lastnum = 0; @@ -3062,28 +3813,30 @@ ksyn_queue_removefirst(ksyn_queue_t kq, ksyn_wait_queue_t kwq) kwq->kw_highseq = find_nexthighseq(kwq); } } - return(uth); + return(kwe); } void -ksyn_queue_removeitem(ksyn_wait_queue_t kwq, ksyn_queue_t kq, uthread_t uth) +ksyn_queue_removeitem(ksyn_wait_queue_t kwq, ksyn_queue_t kq, ksyn_waitq_element_t kwe) { - uthread_t q_uth; + ksyn_waitq_element_t q_kwe; uint32_t curseq; if (kq->ksynq_count > 0) { - TAILQ_REMOVE(&kq->ksynq_uthlist, uth, uu_mtxlist); + TAILQ_REMOVE(&kq->ksynq_kwelist, kwe, kwe_list); kq->ksynq_count--; if(kq->ksynq_count != 0) { - q_uth = TAILQ_FIRST(&kq->ksynq_uthlist); - kq->ksynq_firstnum = (q_uth->uu_lockseq & PTHRW_COUNT_MASK); + q_kwe = TAILQ_FIRST(&kq->ksynq_kwelist); + kq->ksynq_firstnum = (q_kwe->kwe_lockseq & PTHRW_COUNT_MASK); + q_kwe = TAILQ_LAST(&kq->ksynq_kwelist, 
ksynq_kwelist_head); + kq->ksynq_lastnum = (q_kwe->kwe_lockseq & PTHRW_COUNT_MASK); } else { kq->ksynq_firstnum = 0; kq->ksynq_lastnum = 0; } kwq->kw_inqueue--; - curseq = uth->uu_lockseq & PTHRW_COUNT_MASK; + curseq = kwe->kwe_lockseq & PTHRW_COUNT_MASK; if (kwq->kw_inqueue == 0) { kwq->kw_lowseq = 0; kwq->kw_highseq = 0; @@ -3096,6 +3849,168 @@ ksyn_queue_removeitem(ksyn_wait_queue_t kwq, ksyn_queue_t kq, uthread_t uth) } } +/* find the thread and removes from the queue */ +ksyn_waitq_element_t +ksyn_queue_find_seq(ksyn_wait_queue_t kwq, ksyn_queue_t kq, uint32_t seq, int remove) +{ + ksyn_waitq_element_t q_kwe, r_kwe; + + /* TBD: bail out if higher seq is seen */ + /* case where wrap in the tail of the queue exists */ + TAILQ_FOREACH_SAFE(q_kwe, &kq->ksynq_kwelist, kwe_list, r_kwe) { + if ((q_kwe->kwe_lockseq & PTHRW_COUNT_MASK) == seq) { + if (remove != 0) + ksyn_queue_removeitem(kwq, kq, q_kwe); + return(q_kwe); + } + } + return(NULL); +} + + +/* find the thread at the target sequence (or a broadcast/prepost at or above) */ +ksyn_waitq_element_t +ksyn_queue_find_cvpreposeq(ksyn_queue_t kq, uint32_t cgen) +{ + ksyn_waitq_element_t q_kwe, r_kwe; + uint32_t lgen = (cgen & PTHRW_COUNT_MASK); + + /* case where wrap in the tail of the queue exists */ + TAILQ_FOREACH_SAFE(q_kwe, &kq->ksynq_kwelist, kwe_list, r_kwe) { + + /* skip the lower entries */ + if (is_seqlower((q_kwe->kwe_lockseq & PTHRW_COUNT_MASK), cgen) != 0) + continue; + + switch (q_kwe->kwe_flags) { + + case KWE_THREAD_INWAIT: + if ((q_kwe->kwe_lockseq & PTHRW_COUNT_MASK) != lgen) + break; + /* fall thru */ + + case KWE_THREAD_BROADCAST: + case KWE_THREAD_PREPOST: + return (q_kwe); + } + } + return(NULL); +} + +/* look for a thread at lockseq, a */ +ksyn_waitq_element_t +ksyn_queue_find_signalseq(__unused ksyn_wait_queue_t kwq, ksyn_queue_t kq, uint32_t uptoseq, uint32_t signalseq) +{ + ksyn_waitq_element_t q_kwe, r_kwe, t_kwe = NULL; + + /* case where wrap in the tail of the queue exists */ + 
TAILQ_FOREACH_SAFE(q_kwe, &kq->ksynq_kwelist, kwe_list, r_kwe) { + + switch (q_kwe->kwe_flags) { + + case KWE_THREAD_PREPOST: + if (is_seqhigher((q_kwe->kwe_lockseq & PTHRW_COUNT_MASK), uptoseq)) + return t_kwe; + /* fall thru */ + + case KWE_THREAD_BROADCAST: + /* match any prepost at our same uptoseq or any broadcast above */ + if (is_seqlower((q_kwe->kwe_lockseq & PTHRW_COUNT_MASK), uptoseq)) + continue; + return q_kwe; + + case KWE_THREAD_INWAIT: + /* + * Match any (non-cancelled) thread at or below our upto sequence - + * but prefer an exact match to our signal sequence (if present) to + * keep exact matches happening. + */ + if (is_seqhigher((q_kwe->kwe_lockseq & PTHRW_COUNT_MASK), uptoseq)) + return t_kwe; + + if (q_kwe->kwe_kwqqueue == kwq) { + uthread_t ut = q_kwe->kwe_uth; + if ((ut->uu_flag & ( UT_CANCELDISABLE | UT_CANCEL | UT_CANCELED)) != UT_CANCEL) { + /* if equal or higher than our signal sequence, return this one */ + if (is_seqhigher_eq((q_kwe->kwe_lockseq & PTHRW_COUNT_MASK), signalseq)) + return q_kwe; + + /* otherwise, just remember this eligible thread and move on */ + if (t_kwe == NULL) + t_kwe = q_kwe; + } + } + break; + + default: + panic("ksyn_queue_find_signalseq(): unknow wait queue element type (%d)\n", q_kwe->kwe_flags); + break; + } + } + return t_kwe; +} + + +int +ksyn_queue_move_tofree(ksyn_wait_queue_t ckwq, ksyn_queue_t kq, uint32_t upto, ksyn_queue_t kfreeq, int all, int release) +{ + ksyn_waitq_element_t kwe; + int count = 0; + uint32_t tseq = upto & PTHRW_COUNT_MASK; +#if _PSYNCH_TRACE_ + uthread_t ut; +#endif /* _PSYNCH_TRACE_ */ + + ksyn_queue_init(kfreeq); + + /* free all the entries, must be only fakes.. */ + kwe = TAILQ_FIRST(&kq->ksynq_kwelist); + while (kwe != NULL) { + if ((all == 0) && (is_seqhigher((kwe->kwe_lockseq & PTHRW_COUNT_MASK), tseq) != 0)) + break; + if (kwe->kwe_flags == KWE_THREAD_INWAIT) { + /* + * This scenario is typically noticed when the cvar is + * reinited and the new waiters are waiting. 
We can + * return them as spurious wait so the cvar state gets + * reset correctly. + */ +#if _PSYNCH_TRACE_ + ut = (uthread_t)kwe->kwe_uth; +#endif /* _PSYNCH_TRACE_ */ + + /* skip canceled ones */ + /* wake the rest */ + ksyn_queue_removeitem(ckwq, kq, kwe); + /* set M bit to indicate to waking CV to retun Inc val */ + kwe->kwe_psynchretval = PTHRW_INC | (PTH_RWS_CV_MBIT | PTH_RWL_MTX_WAIT); + kwe->kwe_kwqqueue = NULL; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVHBROAD | DBG_FUNC_NONE, (uint32_t)ckwq->kw_addr, 0xcafecaf3, (uint32_t)(thread_tid((struct thread *)(((struct uthread *)(kwe->kwe_uth))->uu_context.vc_thread))), kwe->kwe_psynchretval, 0); +#endif /* _PSYNCH_TRACE_ */ + (void)ksyn_wakeup_thread(ckwq, kwe); + } else { + ksyn_queue_removeitem(ckwq, kq, kwe); + TAILQ_INSERT_TAIL(&kfreeq->ksynq_kwelist, kwe, kwe_list); + ckwq->kw_fakecount--; + count++; + } + kwe = TAILQ_FIRST(&kq->ksynq_kwelist); + } + + if ((release != 0) && (count != 0)) { + kwe = TAILQ_FIRST(&kfreeq->ksynq_kwelist); + while (kwe != NULL) { + TAILQ_REMOVE(&kfreeq->ksynq_kwelist, kwe, kwe_list); + zfree(kwe_zone, kwe); + kwe = TAILQ_FIRST(&kfreeq->ksynq_kwelist); + } + } + + return(count); +} + +/*************************************************************************/ void update_low_high(ksyn_wait_queue_t kwq, uint32_t lockseq) @@ -3167,6 +4082,51 @@ find_nexthighseq(ksyn_wait_queue_t kwq) return(highest); } +int +is_seqlower(uint32_t x, uint32_t y) +{ + if (x < y) { + if ((y-x) < (PTHRW_MAX_READERS/2)) + return(1); + } else { + if ((x-y) > (PTHRW_MAX_READERS/2)) + return(1); + } + return(0); +} + +int +is_seqlower_eq(uint32_t x, uint32_t y) +{ + if (x==y) + return(1); + else + return(is_seqlower(x,y)); +} + +int +is_seqhigher(uint32_t x, uint32_t y) +{ + if (x > y) { + if ((x-y) < (PTHRW_MAX_READERS/2)) + return(1); + } else { + if ((y-x) > (PTHRW_MAX_READERS/2)) + return(1); + } + return(0); +} + +int +is_seqhigher_eq(uint32_t x, uint32_t y) +{ + if (x==y) + return(1); 
+ else + return(is_seqhigher(x,y)); +} + + int find_diff(uint32_t upto, uint32_t lowest) { @@ -3174,7 +4134,14 @@ find_diff(uint32_t upto, uint32_t lowest) if (upto == lowest) return(0); +#if 0 diff = diff_genseq(upto, lowest); +#else + if (is_seqlower(upto, lowest) != 0) + diff = diff_genseq(lowest, upto); + else + diff = diff_genseq(upto, lowest); +#endif diff = (diff >> PTHRW_COUNT_SHIFT); return(diff); } @@ -3188,13 +4155,13 @@ find_seq_till(ksyn_wait_queue_t kwq, uint32_t upto, uint32_t nwaiters, uint32_t #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_FSEQTILL | DBG_FUNC_START, 0, 0, upto, nwaiters, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_FSEQTILL | DBG_FUNC_START, 0, 0, upto, nwaiters, 0); #endif /* _PSYNCH_TRACE_ */ for (i= 0; i< KSYN_QUEUE_MAX; i++) { count += ksyn_queue_count_tolowest(&kwq->kw_ksynqueues[i], upto); #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_FSEQTILL | DBG_FUNC_NONE, 0, 1, i, count, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_FSEQTILL | DBG_FUNC_NONE, 0, 1, i, count, 0); #endif /* _PSYNCH_TRACE_ */ if (count >= nwaiters) { break; @@ -3205,9 +4172,11 @@ find_seq_till(ksyn_wait_queue_t kwq, uint32_t upto, uint32_t nwaiters, uint32_t *countp = count; } #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_FSEQTILL | DBG_FUNC_END, 0, 0, count, nwaiters, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_FSEQTILL | DBG_FUNC_END, 0, 0, count, nwaiters, 0); #endif /* _PSYNCH_TRACE_ */ - if (count >= nwaiters) + if (count == 0) + return(0); + else if (count >= nwaiters) return(1); else return(0); @@ -3218,7 +4187,7 @@ uint32_t ksyn_queue_count_tolowest(ksyn_queue_t kq, uint32_t upto) { uint32_t i = 0; - uthread_t uth, newuth; + ksyn_waitq_element_t kwe, newkwe; uint32_t curval; /* if nothing or the first num is greater than upto, return none */ @@ -3227,8 +4196,8 @@ ksyn_queue_count_tolowest(ksyn_queue_t kq, uint32_t upto) if (upto == kq->ksynq_firstnum) return(1); - TAILQ_FOREACH_SAFE(uth, &kq->ksynq_uthlist, uu_mtxlist, newuth) 
{ - curval = (uth->uu_lockseq & PTHRW_COUNT_MASK); + TAILQ_FOREACH_SAFE(kwe, &kq->ksynq_kwelist, kwe_list, newkwe) { + curval = (kwe->kwe_lockseq & PTHRW_COUNT_MASK); if (upto == curval) { i++; break; @@ -3242,19 +4211,147 @@ ksyn_queue_count_tolowest(ksyn_queue_t kq, uint32_t upto) return(i); } -/* find the thread and removes from the queue */ -uthread_t -ksyn_queue_find_seq(ksyn_wait_queue_t kwq, ksyn_queue_t kq, uint32_t seq) -{ - uthread_t q_uth, r_uth; - /* case where wrap in the tail of the queue exists */ - TAILQ_FOREACH_SAFE(q_uth, &kq->ksynq_uthlist, uu_mtxlist, r_uth) { - if (q_uth->uu_lockseq == seq) { - ksyn_queue_removeitem(kwq, kq, q_uth); - return(q_uth); - } + +/* handles the cond broadcast of cvar and returns number of woken threads and bits for syscall return */ +void +ksyn_handle_cvbroad(ksyn_wait_queue_t ckwq, uint32_t upto, uint32_t * updatep) +{ + kern_return_t kret; + ksyn_queue_t kq; + ksyn_waitq_element_t kwe, newkwe; + uint32_t updatebits = 0; + struct ksyn_queue kfreeq; + uthread_t ut; + +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVHBROAD | DBG_FUNC_START, 0xcbcbcbc2, upto, 0, 0, 0); +#endif /* _PSYNCH_TRACE_ */ + + ksyn_queue_init(&kfreeq); + kq = &ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER]; + + retry: + TAILQ_FOREACH_SAFE(kwe, &kq->ksynq_kwelist, kwe_list, newkwe) { + + if (is_seqhigher((kwe->kwe_lockseq & PTHRW_COUNT_MASK), upto)) /* outside our range */ + break; + + /* now handle the one we found (inside the range) */ + switch (kwe->kwe_flags) { + + case KWE_THREAD_INWAIT: + ut = (uthread_t)kwe->kwe_uth; + + /* skip canceled ones */ + if (kwe->kwe_kwqqueue != ckwq || + (ut->uu_flag & (UT_CANCELDISABLE | UT_CANCEL | UT_CANCELED)) == UT_CANCEL) + break; + + /* wake the rest */ + ksyn_queue_removeitem(ckwq, kq, kwe); + kwe->kwe_psynchretval = PTH_RWL_MTX_WAIT; + kwe->kwe_kwqqueue = NULL; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVHBROAD | DBG_FUNC_NONE, (uint32_t)ckwq->kw_addr, 0xcafecaf2, 
(uint32_t)(thread_tid((struct thread *)(((struct uthread *)(kwe->kwe_uth))->uu_context.vc_thread))), kwe->kwe_psynchretval, 0); +#endif /* _PSYNCH_TRACE_ */ + kret = ksyn_wakeup_thread(ckwq, kwe); +#if __TESTPANICS__ + if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING)) + panic("ksyn_wakeupreaders: panic waking up readers\n"); +#endif /* __TESTPANICS__ */ + updatebits += PTHRW_INC; + break; + + case KWE_THREAD_BROADCAST: + case KWE_THREAD_PREPOST: + ksyn_queue_removeitem(ckwq, kq, kwe); + TAILQ_INSERT_TAIL(&kfreeq.ksynq_kwelist, kwe, kwe_list); + ckwq->kw_fakecount--; + break; + + default: + panic("unknown kweflags\n"); + break; } - return(NULL); + } + + /* Need to enter a broadcast in the queue (if not already at L == S) */ + + if ((ckwq->kw_lword & PTHRW_COUNT_MASK) != (ckwq->kw_sword & PTHRW_COUNT_MASK)) { + + newkwe = TAILQ_FIRST(&kfreeq.ksynq_kwelist); + if (newkwe == NULL) { + ksyn_wqunlock(ckwq); + newkwe = (ksyn_waitq_element_t)zalloc(kwe_zone); + TAILQ_INSERT_TAIL(&kfreeq.ksynq_kwelist, newkwe, kwe_list); + ksyn_wqlock(ckwq); + goto retry; + } + + TAILQ_REMOVE(&kfreeq.ksynq_kwelist, newkwe, kwe_list); + bzero(newkwe, sizeof(struct ksyn_waitq_element)); + newkwe->kwe_kwqqueue = ckwq; + newkwe->kwe_flags = KWE_THREAD_BROADCAST; + newkwe->kwe_lockseq = upto; + newkwe->kwe_count = 0; + newkwe->kwe_uth = NULL; + newkwe->kwe_psynchretval = 0; + +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVHBROAD | DBG_FUNC_NONE, (uint32_t)ckwq->kw_addr, 0xfeedfeed, upto, 0, 0); +#endif /* _PSYNCH_TRACE_ */ + + (void)ksyn_queue_insert(ckwq, &ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER], upto, NULL, newkwe, SEQFIT); + ckwq->kw_fakecount++; + } + + /* free up any remaining things stumbled across above */ + kwe = TAILQ_FIRST(&kfreeq.ksynq_kwelist); + while (kwe != NULL) { + TAILQ_REMOVE(&kfreeq.ksynq_kwelist, kwe, kwe_list); + zfree(kwe_zone, kwe); + kwe = TAILQ_FIRST(&kfreeq.ksynq_kwelist); + } + + if (updatep != NULL) + *updatep = updatebits; + +#if _PSYNCH_TRACE_ + 
__PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVHBROAD | DBG_FUNC_END, 0xeeeeeeed, updatebits, 0, 0, 0); +#endif /* _PSYNCH_TRACE_ */ } +void +ksyn_cvupdate_fixup(ksyn_wait_queue_t ckwq, uint32_t *updatep, ksyn_queue_t kfreeq, int release) +{ + uint32_t updatebits = 0; + + if (updatep != NULL) + updatebits = *updatep; + if ((ckwq->kw_lword & PTHRW_COUNT_MASK) == (ckwq->kw_sword & PTHRW_COUNT_MASK)) { + updatebits |= PTH_RWS_CV_CBIT; + if (ckwq->kw_inqueue != 0) { + /* FREE THE QUEUE */ + ksyn_queue_move_tofree(ckwq, &ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER], ckwq->kw_lword, kfreeq, 0, release); +#if __TESTPANICS__ + if (ckwq->kw_inqueue != 0) + panic("ksyn_cvupdate_fixup: L == S, but entries in queue beyond S"); +#endif /* __TESTPANICS__ */ + } + ckwq->kw_lword = ckwq->kw_uword = ckwq->kw_sword = 0; + ckwq->kw_kflags |= KSYN_KWF_ZEROEDOUT; + } else if ((ckwq->kw_inqueue != 0) && (ckwq->kw_fakecount == ckwq->kw_inqueue)) { + /* only fake entries are present in the queue */ + updatebits |= PTH_RWS_CV_PBIT; + } + if (updatep != NULL) + *updatep = updatebits; +} + +void +psynch_zoneinit(void) +{ + kwq_zone = (zone_t)zinit(sizeof(struct ksyn_wait_queue), 8192 * sizeof(struct ksyn_wait_queue), 4096, "ksyn_waitqueue zone"); + kwe_zone = (zone_t)zinit(sizeof(struct ksyn_waitq_element), 8192 * sizeof(struct ksyn_waitq_element), 4096, "ksyn_waitq_element zone"); +} #endif /* PSYNCH */ diff --git a/bsd/kern/pthread_synch.c b/bsd/kern/pthread_synch.c index 7a00399cc..3fbed7532 100644 --- a/bsd/kern/pthread_synch.c +++ b/bsd/kern/pthread_synch.c @@ -91,6 +91,7 @@ #include #include #include /* for current_map() */ +#include #include /* for thread_resume */ #include #if defined(__i386__) @@ -109,12 +110,6 @@ #define KERNEL_DEBUG1 KERNEL_DEBUG_CONSTANT1 #endif - -#if defined(__ppc__) || defined(__ppc64__) -#include -#endif - - lck_grp_attr_t *pthread_lck_grp_attr; lck_grp_t *pthread_lck_grp; lck_attr_t *pthread_lck_attr; @@ -130,7 +125,6 @@ extern kern_return_t 
semaphore_signal_internal_trap(mach_port_name_t); extern void workqueue_thread_yielded(void); static int workqueue_additem(struct workqueue *wq, int prio, user_addr_t item, int affinity); -static int workqueue_removeitem(struct workqueue *wq, int prio, user_addr_t item); static boolean_t workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t th, user_addr_t oc_item, int oc_prio, int oc_affinity); static void wq_runitem(proc_t p, user_addr_t item, thread_t th, struct threadlist *tl, @@ -138,7 +132,7 @@ static void wq_runitem(proc_t p, user_addr_t item, thread_t th, struct threadlis static void wq_unpark_continue(void); static void wq_unsuspend_continue(void); static int setup_wqthread(proc_t p, thread_t th, user_addr_t item, int reuse_thread, struct threadlist *tl); -static boolean_t workqueue_addnewthread(struct workqueue *wq); +static boolean_t workqueue_addnewthread(struct workqueue *wq, boolean_t oc_thread); static void workqueue_removethread(struct threadlist *tl); static void workqueue_lock_spin(proc_t); static void workqueue_unlock(proc_t); @@ -215,9 +209,7 @@ bsdthread_create(__unused struct proc *p, struct bsdthread_create_args *uap, us isLP64 = IS_64BIT_PROCESS(p); -#if defined(__ppc__) - stackaddr = 0xF0000000; -#elif defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) stackaddr = 0xB0000000; #else #error Need to define a stack address hint for this architecture @@ -266,6 +258,22 @@ bsdthread_create(__unused struct proc *p, struct bsdthread_create_args *uap, us th_stack = (stackaddr + th_stacksize + PTH_DEFAULT_GUARDSIZE); th_pthread = (stackaddr + th_stacksize + PTH_DEFAULT_GUARDSIZE); user_stacksize = th_stacksize; + + /* + * Pre-fault the first page of the new thread's stack and the page that will + * contain the pthread_t structure. 
+ */ + vm_fault( vmap, + vm_map_trunc_page(th_stack - PAGE_SIZE_64), + VM_PROT_READ | VM_PROT_WRITE, + FALSE, + THREAD_UNINT, NULL, 0); + + vm_fault( vmap, + vm_map_trunc_page(th_pthread), + VM_PROT_READ | VM_PROT_WRITE, + FALSE, + THREAD_UNINT, NULL, 0); } else { th_stack = user_stack; user_stacksize = user_stack; @@ -275,31 +283,7 @@ bsdthread_create(__unused struct proc *p, struct bsdthread_create_args *uap, us #endif } -#if defined(__ppc__) - /* - * Set up PowerPC registers... - * internally they are always kept as 64 bit and - * since the register set is the same between 32 and 64bit modes - * we don't need 2 different methods for setting the state - */ - { - ppc_thread_state64_t state64; - ppc_thread_state64_t *ts64 = &state64; - - ts64->srr0 = (uint64_t)p->p_threadstart; - ts64->r1 = (uint64_t)(th_stack - C_ARGSAVE_LEN - C_RED_ZONE); - ts64->r3 = (uint64_t)th_pthread; - ts64->r4 = (uint64_t)(th_thport); - ts64->r5 = (uint64_t)user_func; - ts64->r6 = (uint64_t)user_funcarg; - ts64->r7 = (uint64_t)user_stacksize; - ts64->r8 = (uint64_t)uap->flags; - - thread_set_wq_state64(th, (thread_state_t)ts64); - - thread_set_cthreadself(th, (uint64_t)th_pthread, isLP64); - } -#elif defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) { /* * Set up i386 registers & function call. 
@@ -453,26 +437,33 @@ uint32_t wq_stalled_window_usecs = WQ_STALLED_WINDOW_USECS; uint32_t wq_reduce_pool_window_usecs = WQ_REDUCE_POOL_WINDOW_USECS; uint32_t wq_max_timer_interval_usecs = WQ_MAX_TIMER_INTERVAL_USECS; uint32_t wq_max_threads = WORKQUEUE_MAXTHREADS; +uint32_t wq_max_constrained_threads = WORKQUEUE_MAXTHREADS / 8; -SYSCTL_INT(_kern, OID_AUTO, wq_yielded_threshold, CTLFLAG_RW, +SYSCTL_INT(_kern, OID_AUTO, wq_yielded_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &wq_yielded_threshold, 0, ""); -SYSCTL_INT(_kern, OID_AUTO, wq_yielded_window_usecs, CTLFLAG_RW, +SYSCTL_INT(_kern, OID_AUTO, wq_yielded_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED, &wq_yielded_window_usecs, 0, ""); -SYSCTL_INT(_kern, OID_AUTO, wq_stalled_window_usecs, CTLFLAG_RW, +SYSCTL_INT(_kern, OID_AUTO, wq_stalled_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED, &wq_stalled_window_usecs, 0, ""); -SYSCTL_INT(_kern, OID_AUTO, wq_reduce_pool_window_usecs, CTLFLAG_RW, +SYSCTL_INT(_kern, OID_AUTO, wq_reduce_pool_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED, &wq_reduce_pool_window_usecs, 0, ""); -SYSCTL_INT(_kern, OID_AUTO, wq_max_timer_interval_usecs, CTLFLAG_RW, +SYSCTL_INT(_kern, OID_AUTO, wq_max_timer_interval_usecs, CTLFLAG_RW | CTLFLAG_LOCKED, &wq_max_timer_interval_usecs, 0, ""); -SYSCTL_INT(_kern, OID_AUTO, wq_max_threads, CTLFLAG_RW, +SYSCTL_INT(_kern, OID_AUTO, wq_max_threads, CTLFLAG_RW | CTLFLAG_LOCKED, &wq_max_threads, 0, ""); +SYSCTL_INT(_kern, OID_AUTO, wq_max_constrained_threads, CTLFLAG_RW | CTLFLAG_LOCKED, + &wq_max_constrained_threads, 0, ""); + + +static uint32_t wq_init_constrained_limit = 1; + void workqueue_init_lock(proc_t p) @@ -542,11 +533,9 @@ wq_thread_is_busy(uint64_t cur_ts, uint64_t *lastblocked_tsp) */ lastblocked_ts = *lastblocked_tsp; -#if defined(__ppc__) -#else if ( !OSCompareAndSwap64((UInt64)lastblocked_ts, (UInt64)lastblocked_ts, lastblocked_tsp)) return (TRUE); -#endif + if (lastblocked_ts >= cur_ts) { /* * because the update of the timestamp when a thread blocks isn't @@ 
-682,7 +671,7 @@ workqueue_add_timer(struct workqueue *wq, __unused int param1) } } if (add_thread == TRUE) { - retval = workqueue_addnewthread(wq); + retval = workqueue_addnewthread(wq, FALSE); break; } } @@ -774,7 +763,7 @@ workqueue_thread_yielded(void) if (secs == 0 && usecs < wq_yielded_window_usecs) { if (wq->wq_thidlecount == 0) { - workqueue_addnewthread(wq); + workqueue_addnewthread(wq, TRUE); /* * 'workqueue_addnewthread' drops the workqueue lock * when creating the new thread and then retakes it before @@ -876,14 +865,9 @@ workqueue_callback(int type, thread_t thread) * since another thread would have to get scheduled and then block after we start down * this path), it's not a problem. Either timestamp is adequate, so no need to retry */ -#if defined(__ppc__) - /* - * this doesn't have to actually work reliablly for PPC, it just has to compile/link - */ - *lastblocked_ptr = (UInt64)curtime; -#else + OSCompareAndSwap64(*lastblocked_ptr, (UInt64)curtime, lastblocked_ptr); -#endif + if (wq->wq_itemcount) WQ_TIMER_NEEDED(wq, start_timer); @@ -963,9 +947,13 @@ workqueue_removethread(struct threadlist *tl) } - +/* + * called with workq lock held + * dropped and retaken around thread creation + * return with workq lock held + */ static boolean_t -workqueue_addnewthread(struct workqueue *wq) +workqueue_addnewthread(struct workqueue *wq, boolean_t oc_thread) { struct threadlist *tl; struct uthread *uth; @@ -975,8 +963,25 @@ workqueue_addnewthread(struct workqueue *wq) void *sright; mach_vm_offset_t stackaddr; - if (wq->wq_nthreads >= wq_max_threads || wq->wq_nthreads >= (CONFIG_THREAD_MAX - 20)) + if (wq->wq_nthreads >= wq_max_threads || wq->wq_nthreads >= (CONFIG_THREAD_MAX - 20)) { + wq->wq_lflags |= WQL_EXCEEDED_TOTAL_THREAD_LIMIT; return (FALSE); + } + wq->wq_lflags &= ~WQL_EXCEEDED_TOTAL_THREAD_LIMIT; + + if (oc_thread == FALSE && wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) { + /* + * if we're not creating this thread to service an 
overcommit request, + * then check the size of the constrained thread pool... if we've already + * reached our max for threads scheduled from this pool, don't create a new + * one... the callers of this function are prepared for failure. + */ + wq->wq_lflags |= WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT; + return (FALSE); + } + if (wq->wq_constrained_threads_scheduled < wq_max_constrained_threads) + wq->wq_lflags &= ~WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT; + wq->wq_nthreads++; p = wq->wq_proc; @@ -990,9 +995,7 @@ workqueue_addnewthread(struct workqueue *wq) tl = kalloc(sizeof(struct threadlist)); bzero(tl, sizeof(struct threadlist)); -#if defined(__ppc__) - stackaddr = 0xF0000000; -#elif defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) stackaddr = 0xB0000000; #else #error Need to define a stack address hint for this architecture @@ -1023,6 +1026,7 @@ workqueue_addnewthread(struct workqueue *wq) } if (kret != KERN_SUCCESS) { (void) thread_terminate(th); + thread_deallocate(th); kfree(tl, sizeof(struct threadlist)); goto failed; @@ -1043,11 +1047,6 @@ workqueue_addnewthread(struct workqueue *wq) tl->th_priority = WORKQUEUE_NUMPRIOS; tl->th_policy = -1; -#if defined(__ppc__) - //ml_fp_setvalid(FALSE); - thread_set_cthreadself(th, (uint64_t)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE), IS_64BIT_PROCESS(p)); -#endif /* __ppc__ */ - uth = get_bsdthread_info(tl->th_thread); uth->uu_threadlist = (void *)tl; @@ -1087,6 +1086,22 @@ workq_open(struct proc *p, __unused struct workq_open_args *uap, __unused int32 if ((p->p_lflag & P_LREGISTER) == 0) return(EINVAL); + num_cpus = ml_get_max_cpus(); + + if (wq_init_constrained_limit) { + uint32_t limit; + /* + * set up the limit for the constrained pool + * this is a virtual pool in that we don't + * maintain it on a separate idle and run list + */ + limit = num_cpus * (WORKQUEUE_NUMPRIOS + 1); + + if (limit > wq_max_constrained_threads) + wq_max_constrained_threads = limit; + 
+ wq_init_constrained_limit = 0; + } workqueue_lock_spin(p); if (p->p_wqptr == NULL) { @@ -1107,8 +1122,6 @@ workq_open(struct proc *p, __unused struct workq_open_args *uap, __unused int32 workqueue_unlock(p); - num_cpus = ml_get_max_cpus(); - wq_size = sizeof(struct workqueue) + (num_cpus * WORKQUEUE_NUMPRIOS * sizeof(uint32_t)) + (num_cpus * WORKQUEUE_NUMPRIOS * sizeof(uint32_t)) + @@ -1153,7 +1166,7 @@ workq_open(struct proc *p, __unused struct workq_open_args *uap, __unused int32 * the size for the allocation of the workqueue struct */ nptr += (sizeof(uint64_t) - 1); - nptr = (char *)((long)nptr & ~(sizeof(uint64_t) - 1)); + nptr = (char *)((uintptr_t)nptr & ~(sizeof(uint64_t) - 1)); for (i = 0; i < WORKQUEUE_NUMPRIOS; i++) { wq->wq_lastblocked_ts[i] = (uint64_t *)nptr; @@ -1217,9 +1230,9 @@ workq_kernreturn(struct proc *p, struct workq_kernreturn_args *uap, __unused in workqueue_unlock(p); return (EINVAL); } - if (wq->wq_thidlecount == 0 && (oc_item || (wq->wq_nthreads < wq->wq_affinity_max))) { + if (wq->wq_thidlecount == 0 && (oc_item || (wq->wq_constrained_threads_scheduled < wq->wq_affinity_max))) { - workqueue_addnewthread(wq); + workqueue_addnewthread(wq, oc_item ? 
TRUE : FALSE); if (wq->wq_thidlecount == 0) oc_item = 0; @@ -1230,20 +1243,6 @@ workq_kernreturn(struct proc *p, struct workq_kernreturn_args *uap, __unused in KERNEL_DEBUG(0xefffd008 | DBG_FUNC_NONE, wq, prio, affinity, oc_item, 0); } break; - case WQOPS_QUEUE_REMOVE: { - - if ((prio < 0) || (prio >= WORKQUEUE_NUMPRIOS)) - return (EINVAL); - - workqueue_lock_spin(p); - - if ((wq = (struct workqueue *)p->p_wqptr) == NULL) { - workqueue_unlock(p); - return (EINVAL); - } - error = workqueue_removeitem(wq, prio, item); - } - break; case WQOPS_THREAD_RETURN: { th = current_thread(); @@ -1423,42 +1422,16 @@ workqueue_additem(struct workqueue *wq, int prio, user_addr_t item, int affinity return (0); } -static int -workqueue_removeitem(struct workqueue *wq, int prio, user_addr_t item) -{ - struct workitem *witem; - struct workitemlist *wl; - int error = ESRCH; - - wl = (struct workitemlist *)&wq->wq_list[prio]; - - TAILQ_FOREACH(witem, &wl->wl_itemlist, wi_entry) { - if (witem->wi_item == item) { - TAILQ_REMOVE(&wl->wl_itemlist, witem, wi_entry); - - if (TAILQ_EMPTY(&wl->wl_itemlist)) - wq->wq_list_bitmap &= ~(1 << prio); - wq->wq_itemcount--; - - witem->wi_item = (user_addr_t)0; - witem->wi_affinity = 0; - TAILQ_INSERT_HEAD(&wl->wl_freelist, witem, wi_entry); - - error = 0; - break; - } - } - return (error); -} - static int workqueue_importance[WORKQUEUE_NUMPRIOS] = { - 2, 0, -2, + 2, 0, -2, INT_MIN, }; +#define WORKQ_POLICY_TIMESHARE 1 + static int workqueue_policy[WORKQUEUE_NUMPRIOS] = { - 1, 1, 1, + WORKQ_POLICY_TIMESHARE, WORKQ_POLICY_TIMESHARE, WORKQ_POLICY_TIMESHARE, WORKQ_POLICY_TIMESHARE }; @@ -1536,10 +1509,20 @@ workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t thread, user_add } goto grab_idle_thread; } - if (wq->wq_itemcount == 0) { + /* + * if we get here, the work should be handled by a constrained thread + */ + if (wq->wq_itemcount == 0 || wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) { + /* + * no work to do, or we're 
already at or over the scheduling limit for + * constrained threads... just return or park the thread... + * do not start the timer for this condition... if we don't have any work, + * we'll check again when new work arrives... if we're over the limit, we need 1 or more + * constrained threads to return to the kernel before we can dispatch work from our queue + */ if ((th_to_park = thread) == THREAD_NULL) goto out_of_work; - goto parkit; + goto parkit; } for (priority = 0; priority < WORKQUEUE_NUMPRIOS; priority++) { if (wq->wq_list_bitmap & (1 << priority)) { @@ -1727,6 +1710,16 @@ workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t thread, user_add witem->wi_item = (user_addr_t)0; witem->wi_affinity = 0; TAILQ_INSERT_HEAD(&wl->wl_freelist, witem, wi_entry); + + if ( !(tl->th_flags & TH_LIST_CONSTRAINED)) { + wq->wq_constrained_threads_scheduled++; + tl->th_flags |= TH_LIST_CONSTRAINED; + } + } else { + if (tl->th_flags & TH_LIST_CONSTRAINED) { + wq->wq_constrained_threads_scheduled--; + tl->th_flags &= ~TH_LIST_CONSTRAINED; + } } orig_priority = tl->th_priority; orig_affinity_tag = tl->th_affinity_tag; @@ -1775,16 +1768,47 @@ workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t thread, user_add KERNEL_DEBUG(0xefffd120 | DBG_FUNC_START, wq, orig_priority, tl->th_policy, 0, 0); - if (tl->th_policy != policy) { + if ((orig_priority == WORKQUEUE_BG_PRIOQUEUE) || (priority == WORKQUEUE_BG_PRIOQUEUE)) { + struct uthread *ut = NULL; + + ut = get_bsdthread_info(th_to_run); + if (orig_priority == WORKQUEUE_BG_PRIOQUEUE) { + /* remove the disk throttle, importance will be reset in anycase */ +#if !CONFIG_EMBEDDED + proc_restore_workq_bgthreadpolicy(th_to_run); +#else /* !CONFIG_EMBEDDED */ + if ((ut->uu_flag & UT_BACKGROUND) != 0) { + ut->uu_flag &= ~UT_BACKGROUND; + ut->uu_iopol_disk = IOPOL_NORMAL; + } +#endif /* !CONFIG_EMBEDDED */ + } + + if (priority == WORKQUEUE_BG_PRIOQUEUE) { +#if !CONFIG_EMBEDDED + proc_apply_workq_bgthreadpolicy(th_to_run); 
+#else /* !CONFIG_EMBEDDED */ + if ((ut->uu_flag & UT_BACKGROUND) == 0) { + /* set diskthrottling */ + ut->uu_flag |= UT_BACKGROUND; + ut->uu_iopol_disk = IOPOL_THROTTLE; + } +#endif /* !CONFIG_EMBEDDED */ + } + } + + if (tl->th_policy != policy) { extinfo.timeshare = policy; (void)thread_policy_set_internal(th_to_run, THREAD_EXTENDED_POLICY, (thread_policy_t)&extinfo, THREAD_EXTENDED_POLICY_COUNT); tl->th_policy = policy; } + precedinfo.importance = workqueue_importance[priority]; (void)thread_policy_set_internal(th_to_run, THREAD_PRECEDENCE_POLICY, (thread_policy_t)&precedinfo, THREAD_PRECEDENCE_POLICY_COUNT); + KERNEL_DEBUG(0xefffd120 | DBG_FUNC_END, wq, priority, policy, 0, 0); } if (kdebug_enable) { @@ -1858,12 +1882,18 @@ workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t thread, user_add wq->wq_thscheduled_count[tl->th_priority][tl->th_affinity_tag]--; wq->wq_threads_scheduled--; + if (tl->th_flags & TH_LIST_CONSTRAINED) { + wq->wq_constrained_threads_scheduled--; + wq->wq_lflags &= ~WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT; + tl->th_flags &= ~TH_LIST_CONSTRAINED; + } if (wq->wq_thidlecount < 100) us_to_wait = wq_reduce_pool_window_usecs - (wq->wq_thidlecount * (wq_reduce_pool_window_usecs / 100)); else us_to_wait = wq_reduce_pool_window_usecs / 100; wq->wq_thidlecount++; + wq->wq_lflags &= ~WQL_EXCEEDED_TOTAL_THREAD_LIMIT; assert_wait_timeout((caddr_t)tl, (THREAD_INTERRUPTIBLE), us_to_wait, NSEC_PER_USEC); @@ -2080,34 +2110,11 @@ wq_runitem(proc_t p, user_addr_t item, thread_t th, struct threadlist *tl, } } + int setup_wqthread(proc_t p, thread_t th, user_addr_t item, int reuse_thread, struct threadlist *tl) { -#if defined(__ppc__) - /* - * Set up PowerPC registers... 
- * internally they are always kept as 64 bit and - * since the register set is the same between 32 and 64bit modes - * we don't need 2 different methods for setting the state - */ - { - ppc_thread_state64_t state64; - ppc_thread_state64_t *ts64 = &state64; - - ts64->srr0 = (uint64_t)p->p_wqthread; - ts64->r1 = (uint64_t)((tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE) - C_ARGSAVE_LEN - C_RED_ZONE); - ts64->r3 = (uint64_t)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE); - ts64->r4 = (uint64_t)(tl->th_thport); - ts64->r5 = (uint64_t)(tl->th_stackaddr + PTH_DEFAULT_GUARDSIZE); - ts64->r6 = (uint64_t)item; - ts64->r7 = (uint64_t)reuse_thread; - ts64->r8 = (uint64_t)0; - - if ((reuse_thread != 0) && (ts64->r3 == (uint64_t)0)) - panic("setup_wqthread: setting reuse thread with null pthread\n"); - thread_set_wq_state64(th, (thread_state_t)ts64); - } -#elif defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) int isLP64 = 0; isLP64 = IS_64BIT_PROCESS(p); @@ -2183,6 +2190,14 @@ fill_procworkqueue(proc_t p, struct proc_workqueueinfo * pwqinfo) pwqinfo->pwq_nthreads = wq->wq_nthreads; pwqinfo->pwq_runthreads = activecount; pwqinfo->pwq_blockedthreads = wq->wq_threads_scheduled - activecount; + pwqinfo->pwq_state = 0; + + if (wq->wq_lflags & WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT) + pwqinfo->pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT; + + if (wq->wq_lflags & WQL_EXCEEDED_TOTAL_THREAD_LIMIT) + pwqinfo->pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT; + out: workqueue_unlock(p); return(error); @@ -2308,5 +2323,6 @@ pthread_init(void) pth_global_hashinit(); psynch_thcall = thread_call_allocate(psynch_wq_cleanup, NULL); + psynch_zoneinit(); #endif /* PSYNCH */ } diff --git a/bsd/kern/subr_log.c b/bsd/kern/subr_log.c index d39eccd5d..2cd5c3ac2 100644 --- a/bsd/kern/subr_log.c +++ b/bsd/kern/subr_log.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. 
+ * Copyright (c) 2000-2010 Apple, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -81,6 +81,7 @@ #include #include #include +#include /* XXX should be in a common header somewhere */ extern void klogwakeup(void); @@ -92,6 +93,9 @@ extern void logwakeup(void); #define LOG_ASYNC 0x04 #define LOG_RDWAIT 0x08 +/* All globals should be accessed under LOG_LOCK() */ + +/* logsoftc only valid while log_open=1 */ struct logsoftc { int sc_state; /* see above for possibilities */ struct selinfo sc_selp; /* thread waiting for select */ @@ -99,11 +103,11 @@ struct logsoftc { } logsoftc; int log_open; /* also used in log() */ -char smsg_bufc[MSG_BSIZE]; /* static buffer */ -struct msgbuf temp_msgbuf = {0,MSG_BSIZE,0,0,smsg_bufc}; -struct msgbuf *msgbufp; -static int _logentrypend = 0; -static int log_inited = 0; +char smsg_bufc[CONFIG_MSG_BSIZE]; /* static buffer */ +struct msgbuf msgbuf = {MSG_MAGIC,sizeof(smsg_bufc),0,0,smsg_bufc}; +struct msgbuf *msgbufp = &msgbuf; +static int logentrypend = 0; + /* the following are implemented in osfmk/kern/printf.c */ extern void bsd_log_lock(void); extern void bsd_log_unlock(void); @@ -125,6 +129,16 @@ extern d_select_t logselect; #define LOG_LOCK() bsd_log_lock() #define LOG_UNLOCK() bsd_log_unlock() +#if DEBUG +#define LOG_SETSIZE_DEBUG(x...) kprintf(x) +#else +#define LOG_SETSIZE_DEBUG(x...) do { } while(0) +#endif + +static int sysctl_kern_msgbuf(struct sysctl_oid *oidp, + void *arg1, + int arg2, + struct sysctl_req *req); /*ARGSUSED*/ int @@ -135,21 +149,9 @@ logopen(__unused dev_t dev, __unused int flags, __unused int mode, struct proc * LOG_UNLOCK(); return (EBUSY); } - log_open = 1; logsoftc.sc_pgid = p->p_pid; /* signal process only */ - /* - * Potential race here with putchar() but since putchar should be - * called by autoconf, msg_magic should be initialized by the time - * we get here. 
- */ - if (msgbufp->msg_magic != MSG_MAGIC) { - register int i; + log_open = 1; - msgbufp->msg_magic = MSG_MAGIC; - msgbufp->msg_bufx = msgbufp->msg_bufr = 0; - for (i=0; i < MSG_BSIZE; i++) - msgbufp->msg_bufc[i] = 0; - } LOG_UNLOCK(); return (0); @@ -160,9 +162,9 @@ int logclose(__unused dev_t dev, __unused int flag, __unused int devtype, __unused struct proc *p) { LOG_LOCK(); - log_open = 0; selwakeup(&logsoftc.sc_selp); selthreadclear(&logsoftc.sc_selp); + log_open = 0; LOG_UNLOCK(); return (0); } @@ -171,7 +173,7 @@ logclose(__unused dev_t dev, __unused int flag, __unused int devtype, __unused s int logread(__unused dev_t dev, struct uio *uio, int flag) { - register long l; + int l; int error = 0; LOG_LOCK(); @@ -202,20 +204,24 @@ logread(__unused dev_t dev, struct uio *uio, int flag) logsoftc.sc_state &= ~LOG_RDWAIT; while (uio_resid(uio) > 0) { + int readpos; + l = msgbufp->msg_bufx - msgbufp->msg_bufr; if (l < 0) l = msgbufp->msg_size - msgbufp->msg_bufr; l = min(l, uio_resid(uio)); if (l == 0) break; + + readpos = msgbufp->msg_bufr; LOG_UNLOCK(); - error = uiomove((caddr_t)&msgbufp->msg_bufc[msgbufp->msg_bufr], - (int)l, uio); + error = uiomove((caddr_t)&msgbufp->msg_bufc[readpos], + l, uio); LOG_LOCK(); if (error) break; - msgbufp->msg_bufr += l; - if (msgbufp->msg_bufr < 0 || msgbufp->msg_bufr >= msgbufp->msg_size) + msgbufp->msg_bufr = readpos + l; + if (msgbufp->msg_bufr >= msgbufp->msg_size) msgbufp->msg_bufr = 0; } out: @@ -272,9 +278,13 @@ logwakeup(void) void klogwakeup(void) { - if (_logentrypend) { - _logentrypend = 0; + LOG_LOCK(); + if (logentrypend && log_open) { + logentrypend = 0; /* only reset if someone will be reading */ + LOG_UNLOCK(); logwakeup(); + } else { + LOG_UNLOCK(); } } @@ -282,7 +292,7 @@ klogwakeup(void) int logioctl(__unused dev_t dev, u_long com, caddr_t data, __unused int flag, __unused struct proc *p) { - long l; + int l; LOG_LOCK(); switch (com) { @@ -328,10 +338,7 @@ logioctl(__unused dev_t dev, u_long com, caddr_t data, 
__unused int flag, __unus void bsd_log_init(void) { - if (!log_inited) { - msgbufp = &temp_msgbuf; - log_inited = 1; - } + /* After this point, we must be ready to accept characters */ } @@ -353,24 +360,12 @@ bsd_log_init(void) void log_putc_locked(char c) { - register struct msgbuf *mbp; - - if (!log_inited) { - panic("bsd log is not inited"); - } + struct msgbuf *mbp; mbp = msgbufp; - if (mbp-> msg_magic != MSG_MAGIC) { - register int i; - - mbp->msg_magic = MSG_MAGIC; - mbp->msg_bufx = mbp->msg_bufr = 0; - for (i=0; i < MSG_BSIZE; i++) - mbp->msg_bufc[i] = 0; - } mbp->msg_bufc[mbp->msg_bufx++] = c; - _logentrypend = 1; - if (mbp->msg_bufx < 0 || mbp->msg_bufx >= msgbufp->msg_size) + logentrypend = 1; + if (mbp->msg_bufx >= msgbufp->msg_size) mbp->msg_bufx = 0; } @@ -391,9 +386,6 @@ log_putc_locked(char c) void log_putc(char c) { - if (!log_inited) { - panic("bsd log is not inited"); - } LOG_LOCK(); log_putc_locked(c); LOG_UNLOCK(); @@ -406,59 +398,143 @@ log_putc(char c) * to the kernel command line, and to read the current size using * sysctl kern.msgbuf * If there is no parameter on the kernel command line, the buffer is - * allocated statically and is MSG_BSIZE characters in size, otherwise - * memory is dynamically allocated. - * This function may only be called once, during kernel initialization. - * Memory management must already be up. The buffer must not have - * overflown yet. + * allocated statically and is CONFIG_MSG_BSIZE characters in size, otherwise + * memory is dynamically allocated. Memory management must already be up. 
*/ -void -log_setsize(long size) { +int +log_setsize(int size) { char *new_logdata; - if (msgbufp->msg_size!=MSG_BSIZE) { - printf("log_setsize: attempt to change size more than once\n"); - return; - } - if (size==MSG_BSIZE) - return; - if (size MAX_MSG_BSIZE) + return (EINVAL); + + if (size <= 0) + return (EINVAL); + + new_logsize = size; if (!(new_logdata = (char*)kalloc(size))) { printf("log_setsize: unable to allocate memory\n"); - return; + return (ENOMEM); } + bzero(new_logdata, new_logsize); + LOG_LOCK(); - bcopy(smsg_bufc, new_logdata, MSG_BSIZE); - bzero(new_logdata+MSG_BSIZE, size - MSG_BSIZE); + + old_logsize = msgbufp->msg_size; + old_logdata = msgbufp->msg_bufc; + old_bufr = msgbufp->msg_bufr; + old_bufx = msgbufp->msg_bufx; + + LOG_SETSIZE_DEBUG("log_setsize(%d): old_logdata %p old_logsize %d old_bufr %d old_bufx %d\n", + size, old_logdata, old_logsize, old_bufr, old_bufx); + + /* start "new_logsize" bytes before the write pointer */ + if (new_logsize <= old_bufx) { + count = new_logsize; + p = old_logdata + old_bufx - count; + } else { + /* + * if new buffer is bigger, copy what we have and let the + * bzero above handle the difference + */ + count = MIN(new_logsize, old_logsize); + p = old_logdata + old_logsize - (count - old_bufx); + } + for (i = 0; i < count; i++) { + if (p >= old_logdata + old_logsize) + p = old_logdata; + + ch = *p++; + new_logdata[i] = ch; + } + + new_bufx = i; + if (new_bufx >= new_logsize) + new_bufx = 0; + msgbufp->msg_bufx = new_bufx; + + new_bufr = old_bufx - old_bufr; /* how much were we trailing bufx by? 
*/ + if (new_bufr < 0) + new_bufr += old_logsize; + new_bufr = new_bufx - new_bufr; /* now relative to oldest data in new buffer */ + if (new_bufr < 0) + new_bufr += new_logsize; + msgbufp->msg_bufr = new_bufr; + + msgbufp->msg_size = new_logsize; + msgbufp->msg_bufc = new_logdata; + + LOG_SETSIZE_DEBUG("log_setsize(%d): new_logdata %p new_logsize %d new_bufr %d new_bufx %d\n", + size, new_logdata, new_logsize, new_bufr, new_bufx); + + LOG_UNLOCK(); + /* this memory is now dead - clear it so that it compresses better in case of suspend to disk etc. */ - bzero(smsg_bufc, MSG_BSIZE); - msgbufp->msg_size = size; - msgbufp->msg_bufc = new_logdata; + bzero(old_logdata, old_logsize); + if (old_logdata != smsg_bufc) { + /* dynamic memory that must be freed */ + kfree(old_logdata, old_logsize); + } + + printf("set system log size to %d bytes\n", new_logsize); + + return 0; +} + +SYSCTL_PROC(_kern, OID_AUTO, msgbuf, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_kern_msgbuf, "I", ""); + +static int sysctl_kern_msgbuf(struct sysctl_oid *oidp __unused, + void *arg1 __unused, + int arg2 __unused, + struct sysctl_req *req) +{ + int old_bufsize, bufsize; + int error; + + LOG_LOCK(); + old_bufsize = bufsize = msgbufp->msg_size; LOG_UNLOCK(); - printf("set system log size to %ld bytes\n", msgbufp->msg_size); + + error = sysctl_io_number(req, bufsize, sizeof(bufsize), &bufsize, NULL); + if (error) + return (error); + + if (bufsize != old_bufsize) { + error = log_setsize(bufsize); + } + + return (error); } -SYSCTL_LONG(_kern, OID_AUTO, msgbuf, CTLFLAG_RD, &temp_msgbuf.msg_size, ""); /* - * This should be called by single user mode /sbin/dmesg only. + * This should be called by /sbin/dmesg only via libproc. * It returns as much data still in the buffer as possible. 
*/ int log_dmesg(user_addr_t buffer, uint32_t buffersize, int32_t * retval) { uint32_t i; - uint32_t localbuff_size = (msgbufp->msg_size + 2); + uint32_t localbuff_size; int error = 0, newl, skip; char *localbuff, *p, *copystart, ch; - long copysize; + size_t copysize; + LOG_LOCK(); + localbuff_size = (msgbufp->msg_size + 2); /* + '\n' + '\0' */ + LOG_UNLOCK(); + + /* Allocate a temporary non-circular buffer for copyout */ if (!(localbuff = (char *)kalloc(localbuff_size))) { printf("log_dmesg: unable to allocate memory\n"); return (ENOMEM); } + /* in between here, the log could become bigger, but that's fine */ LOG_LOCK(); @@ -483,7 +559,7 @@ log_dmesg(user_addr_t buffer, uint32_t buffersize, int32_t * retval) { } if (ch == '\0') continue; - newl = ch == '\n'; + newl = (ch == '\n'); localbuff[i++] = ch; /* The original version of this routine contained a buffer * overflow. At the time, a "small" targeted fix was desired diff --git a/bsd/kern/subr_prof.c b/bsd/kern/subr_prof.c index 5b1024141..4d07853d9 100644 --- a/bsd/kern/subr_prof.c +++ b/bsd/kern/subr_prof.c @@ -152,14 +152,28 @@ kmstartup(void) } /* - * Return kernel profiling information. + * XXX These should be broken out into per-argument OID values, + * XXX since there are no sub-OID parameter values, but unfortunately + * XXX there is barely enough time for an initial conversion. + * + * Note: These items appear to be read/write. 
*/ -int +STATIC int +sysctl_doprofhandle SYSCTL_HANDLER_ARGS +{ sysctl_doprof(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, user_addr_t newp, size_t newlen) { + __unused int cmd = oidp->oid_arg2; /* subcommand*/ + int *name = arg1; /* oid element argument vector */ + int namelen = arg2; /* number of oid element arguments */ + user_addr_t oldp = req->oldptr; /* user buffer copy out address */ + size_t *oldlenp = req->oldlen; /* user buffer copy out size */ + user_addr_t newp = req->newptr; /* user buffer copy in address */ + size_t newlen = req->newlen; /* user buffer copy in size */ + struct gmonparam *gp = &_gmonparam; - int error; + int error = 0; /* all sysctl names at this level are terminal */ if (namelen != 1) @@ -169,28 +183,44 @@ sysctl_doprof(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, case GPROF_STATE: error = sysctl_int(oldp, oldlenp, newp, newlen, &gp->state); if (error) - return (error); + break; if (gp->state == GMON_PROF_OFF) stopprofclock(kernproc); else startprofclock(kernproc); - return (0); + break; case GPROF_COUNT: - return (sysctl_struct(oldp, oldlenp, newp, newlen, - gp->kcount, gp->kcountsize)); + error = sysctl_struct(oldp, oldlenp, newp, newlen, + gp->kcount, gp->kcountsize); + break; case GPROF_FROMS: - return (sysctl_struct(oldp, oldlenp, newp, newlen, - gp->froms, gp->fromssize)); + error = sysctl_struct(oldp, oldlenp, newp, newlen, + gp->froms, gp->fromssize); + break; case GPROF_TOS: - return (sysctl_struct(oldp, oldlenp, newp, newlen, - gp->tos, gp->tossize)); + error = sysctl_struct(oldp, oldlenp, newp, newlen, + gp->tos, gp->tossize); + break; case GPROF_GMONPARAM: - return (sysctl_rdstruct(oldp, oldlenp, newp, gp, sizeof *gp)); + error = sysctl_rdstruct(oldp, oldlenp, newp, gp, sizeof *gp); + break; default: - return (ENOTSUP); + error = ENOTSUP; + break; } - /* NOTREACHED */ + + /* adjust index so we return the right required/consumed amount */ + if (!error) + req->oldidx += req->oldlen; + + 
return(error); } +SYSCTL_PROC(_kern, KERN_PROF, prof, STLFLAG_NODE|CTLFLAG_RW | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + 0, /* Integer argument (arg2) */ + sysctl_doprofhandle, /* Handler function */ + NULL, /* No explicit data */ + ""); /* diff --git a/bsd/kern/sys_generic.c b/bsd/kern/sys_generic.c index 11a276bbd..bacd02b79 100644 --- a/bsd/kern/sys_generic.c +++ b/bsd/kern/sys_generic.c @@ -152,6 +152,21 @@ __private_extern__ int dofilewrite(vfs_context_t ctx, struct fileproc *fp, __private_extern__ int preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_vnode); __private_extern__ void donefileread(struct proc *p, struct fileproc *fp_ret, int fd); + +/* Conflict wait queue for when selects collide (opaque type) */ +struct wait_queue select_conflict_queue; + +/* + * Init routine called from bsd_init.c + */ +void select_wait_queue_init(void); +void +select_wait_queue_init(void) +{ + wait_queue_init(&select_conflict_queue, SYNC_POLICY_FIFO); +} + + #if NETAT extern int appletalk_inited; #endif /* NETAT */ @@ -570,7 +585,8 @@ dofilewrite(vfs_context_t ctx, struct fileproc *fp, error == EINTR || error == EWOULDBLOCK)) error = 0; /* The socket layer handles SIGPIPE */ - if (error == EPIPE && fp->f_type != DTYPE_SOCKET) { + if (error == EPIPE && fp->f_type != DTYPE_SOCKET && + (fp->f_fglob->fg_lflags & FG_NOSIGPIPE) == 0) { /* XXX Raise the signal on the thread? 
*/ psignal(vfs_context_proc(ctx), SIGPIPE); } @@ -662,13 +678,14 @@ wr_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval) error == EINTR || error == EWOULDBLOCK)) error = 0; /* The socket layer handles SIGPIPE */ - if (error == EPIPE && fp->f_type != DTYPE_SOCKET) + if (error == EPIPE && fp->f_type != DTYPE_SOCKET && + (fp->f_fglob->fg_lflags & FG_NOSIGPIPE) == 0) psignal(p, SIGPIPE); } *retval = count - uio_resid(uio); out: - if ( (error == 0) ) + if (error == 0) fp_drop_written(p, fdes, fp); else fp_drop(p, fdes, fp, 0); @@ -937,8 +954,8 @@ extern int selcontinue(int error); extern int selprocess(int error, int sel_pass); static int selscan(struct proc *p, struct _select * sel, int nfd, int32_t *retval, int sel_pass, wait_queue_sub_t wqsub); -static int selcount(struct proc *p, u_int32_t *ibits, u_int32_t *obits, - int nfd, int * count, int *kfcount); +static int selcount(struct proc *p, u_int32_t *ibits, int nfd, int *count); +static int seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wakeup, int fromselcount); static int seldrop(struct proc *p, u_int32_t *ibits, int nfd); /* @@ -966,7 +983,6 @@ select_nocancel(struct proc *p, struct select_nocancel_args *uap, int32_t *retva struct _select *sel; int needzerofill = 1; int count = 0; - int kfcount = 0; th_act = current_thread(); uth = get_bsdthread_info(th_act); @@ -1070,13 +1086,11 @@ select_nocancel(struct proc *p, struct select_nocancel_args *uap, int32_t *retva else sel->abstime = 0; - sel->kfcount = 0; - if ( (error = selcount(p, sel->ibits, sel->obits, uap->nd, &count, &kfcount)) ) { + if ( (error = selcount(p, sel->ibits, uap->nd, &count)) ) { goto continuation; } sel->count = count; - sel->kfcount = kfcount; size = SIZEOF_WAITQUEUE_SET + (count * SIZEOF_WAITQUEUE_LINK); if (uth->uu_allocsize) { if (uth->uu_wqset == 0) @@ -1090,7 +1104,6 @@ select_nocancel(struct proc *p, struct select_nocancel_args *uap, int32_t *retva panic("failed to allocate memory for 
waitqueue\n"); } } else { - sel->count = count; uth->uu_allocsize = size; uth->uu_wqset = (wait_queue_set_t)kalloc(uth->uu_allocsize); if (uth->uu_wqset == (wait_queue_set_t)NULL) @@ -1101,7 +1114,18 @@ select_nocancel(struct proc *p, struct select_nocancel_args *uap, int32_t *retva wait_queue_set_init(uth->uu_wqset, (SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST)); continuation: - return selprocess(error, SEL_FIRSTPASS); + + if (error) { + /* + * We have already cleaned up any state we established, + * either locally or as a result of selcount(). We don't + * need to wait_subqueue_unlink_all(), since we haven't set + * anything at this point. + */ + return (error); + } + + return selprocess(0, SEL_FIRSTPASS); } int @@ -1110,6 +1134,13 @@ selcontinue(int error) return selprocess(error, SEL_SECONDPASS); } + +/* + * selprocess + * + * Parameters: error The error code from our caller + * sel_pass The pass we are on + */ int selprocess(int error, int sel_pass) { @@ -1134,20 +1165,24 @@ selprocess(int error, int sel_pass) uth = get_bsdthread_info(th_act); sel = &uth->uu_select; - /* if it is first pass wait queue is not setup yet */ if ((error != 0) && (sel_pass == SEL_FIRSTPASS)) unwind = 0; if (sel->count == 0) unwind = 0; retry: if (error != 0) { - goto done; + sel_pass = SEL_FIRSTPASS; /* Reset for seldrop */ + goto done; } ncoll = nselcoll; OSBitOrAtomic(P_SELECT, &p->p_flag); /* skip scans if the select is just for timeouts */ if (sel->count) { + /* + * Clear out any dangling refs from prior calls; technically + * there should not be any. 
+ */ if (sel_pass == SEL_FIRSTPASS) wait_queue_sub_clearrefs(uth->uu_wqset); @@ -1215,10 +1250,10 @@ selprocess(int error, int sel_pass) error = 0; } - sel_pass = SEL_SECONDPASS; if (error == 0) { + sel_pass = SEL_SECONDPASS; if (!prepost) - somewakeup =1; + somewakeup = 1; goto retry; } done: @@ -1253,6 +1288,23 @@ selprocess(int error, int sel_pass) return(error); } + +/* + * selscan + * + * Parameters: p Process performing the select + * sel The per-thread select context structure + * nfd The number of file descriptors to scan + * retval The per thread system call return area + * sel_pass Which pass this is; allowed values are + * SEL_FIRSTPASS and SEL_SECONDPASS + * wqsub The per thread wait queue set + * + * Returns: 0 Success + * EIO Invalid p->p_fd field XXX Obsolete? + * EBADF One of the files in the bit vector is + * invalid. + */ static int selscan(struct proc *p, struct _select *sel, int nfd, int32_t *retval, int sel_pass, wait_queue_sub_t wqsub) @@ -1261,16 +1313,15 @@ selscan(struct proc *p, struct _select *sel, int nfd, int32_t *retval, int msk, i, j, fd; u_int32_t bits; struct fileproc *fp; - int n = 0; - int nc = 0; + int n = 0; /* count of bits */ + int nc = 0; /* bit vector offset (nc'th bit) */ static int flag[3] = { FREAD, FWRITE, 0 }; u_int32_t *iptr, *optr; u_int nw; u_int32_t *ibits, *obits; char * wql; char * wql_ptr; - int count, kfcount; - vnode_t vp; + int count; struct vfs_context context = *vfs_context_current(); /* @@ -1288,57 +1339,9 @@ selscan(struct proc *p, struct _select *sel, int nfd, int32_t *retval, nw = howmany(nfd, NFDBITS); count = sel->count; - kfcount = sel->kfcount; - - if (kfcount > count) - panic("selscan: count < kfcount"); - - if (kfcount != 0) { - proc_fdlock(p); - for (msk = 0; msk < 3; msk++) { - iptr = (u_int32_t *)&ibits[msk * nw]; - optr = (u_int32_t *)&obits[msk * nw]; - - for (i = 0; i < nfd; i += NFDBITS) { - bits = iptr[i/NFDBITS]; - - while ((j = ffs(bits)) && (fd = i + --j) < nfd) { - bits &= ~(1 << j); - 
fp = fdp->fd_ofiles[fd]; - - if (fp == NULL || - (fdp->fd_ofileflags[fd] & UF_RESERVED)) { - proc_fdunlock(p); - return(EBADF); - } - if (sel_pass == SEL_SECONDPASS) { - wql_ptr = (char *)0; - fp->f_flags &= ~FP_INSELECT; - fp->f_waddr = (void *)0; - } else { - wql_ptr = (wql + nc * SIZEOF_WAITQUEUE_LINK); - fp->f_flags |= FP_INSELECT; - fp->f_waddr = (void *)wqsub; - } - - context.vc_ucred = fp->f_cred; - - if (fp->f_ops && (fp->f_type == DTYPE_VNODE) - && ((vp = (struct vnode *)fp->f_data) != NULLVP) - && (vp->v_type == VCHR) - && fo_select(fp, flag[msk], wql_ptr, &context)) { - optr[fd/NFDBITS] |= (1 << (fd % NFDBITS)); - n++; - } - nc++; - } - } - } - proc_fdunlock(p); - } nc = 0; - if (kfcount != count) { + if (count) { proc_fdlock(p); for (msk = 0; msk < 3; msk++) { iptr = (u_int32_t *)&ibits[msk * nw]; @@ -1351,29 +1354,37 @@ selscan(struct proc *p, struct _select *sel, int nfd, int32_t *retval, bits &= ~(1 << j); fp = fdp->fd_ofiles[fd]; - if (fp == NULL || - (fdp->fd_ofileflags[fd] & UF_RESERVED)) { + if (fp == NULL || (fdp->fd_ofileflags[fd] & UF_RESERVED)) { + /* + * If we abort because of a bad + * fd, let the caller unwind... 
+ */ proc_fdunlock(p); return(EBADF); } if (sel_pass == SEL_SECONDPASS) { wql_ptr = (char *)0; - fp->f_flags &= ~FP_INSELECT; - fp->f_waddr = (void *)0; + if ((fp->f_flags & FP_INSELECT) && (fp->f_waddr == (void *)wqsub)) { + fp->f_flags &= ~FP_INSELECT; + fp->f_waddr = (void *)0; + } } else { wql_ptr = (wql + nc * SIZEOF_WAITQUEUE_LINK); - fp->f_flags |= FP_INSELECT; - fp->f_waddr = (void *)wqsub; + if (fp->f_flags & FP_INSELECT) { + /* someone is already in select on this fp */ + fp->f_flags |= FP_SELCONFLICT; + wait_queue_link(&select_conflict_queue, (wait_queue_set_t)wqsub); + } else { + fp->f_flags |= FP_INSELECT; + fp->f_waddr = (void *)wqsub; + } } context.vc_ucred = fp->f_cred; - if ((fp->f_ops && - ((fp->f_type != DTYPE_VNODE) - || (((vp = (struct vnode *)fp->f_data) != NULLVP) - && (vp->v_type != VCHR)) - ) - && fo_select(fp, flag[msk], wql_ptr, &context))) { + /* The select; set the bit, if true */ + if (fp->f_ops + && fo_select(fp, flag[msk], wql_ptr, &context)) { optr[fd/NFDBITS] |= (1 << (fd % NFDBITS)); n++; } @@ -1476,9 +1487,9 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval) /* convert the poll event into a kqueue kevent */ kev.ident = fds[i].fd; kev.flags = EV_ADD | EV_ONESHOT | EV_POLL; - kev.fflags = NOTE_LOWAT; - kev.data = 1; /* efficiency be damned: any data should trigger */ kev.udata = CAST_USER_ADDR_T(&fds[i]); + kev.fflags = 0; + kev.data = 0; kev.ext[0] = 0; kev.ext[1] = 0; @@ -1608,9 +1619,32 @@ seltrue(__unused dev_t dev, __unused int flag, __unused struct proc *p) return (1); } +/* + * selcount + * + * Count the number of bits set in the input bit vector, and establish an + * outstanding fp->f_iocount for each of the descriptors which will be in + * use in the select operation. 
+ * + * Parameters: p The process doing the select + * ibits The input bit vector + * nfd The number of fd's in the vector + * countp Pointer to where to store the bit count + * + * Returns: 0 Success + * EIO Bad per process open file table + * EBADF One of the bits in the input bit vector + * references an invalid fd + * + * Implicit: *countp (modified) Count of fd's + * + * Notes: This function is the first pass under the proc_fdlock() that + * permits us to recognize invalid descriptors in the bit vector; + * they may, however, not remain valid through the drop and + * later reacquisition of the proc_fdlock(). + */ static int -selcount(struct proc *p, u_int32_t *ibits, __unused u_int32_t *obits, - int nfd, int *countp, int * kfcountp) +selcount(struct proc *p, u_int32_t *ibits, int nfd, int *countp) { struct filedesc *fdp = p->p_fd; int msk, i, j, fd; @@ -1620,9 +1654,8 @@ selcount(struct proc *p, u_int32_t *ibits, __unused u_int32_t *obits, u_int32_t *iptr; u_int nw; int error=0; - int kfc = 0; int dropcount; - vnode_t vp; + int need_wakeup = 0; /* * Problems when reboot; due to MacOSX signal probs @@ -1630,7 +1663,6 @@ selcount(struct proc *p, u_int32_t *ibits, __unused u_int32_t *obits, */ if (fdp == NULL) { *countp = 0; - *kfcountp = 0; return(EIO); } nw = howmany(nfd, NFDBITS); @@ -1646,16 +1678,10 @@ selcount(struct proc *p, u_int32_t *ibits, __unused u_int32_t *obits, if (fp == NULL || (fdp->fd_ofileflags[fd] & UF_RESERVED)) { *countp = 0; - *kfcountp = 0; error = EBADF; goto bad; } fp->f_iocount++; - if ((fp->f_type == DTYPE_VNODE) - && ((vp = (struct vnode *)fp->f_data) != NULLVP) - && (vp->v_type == VCHR) ) - kfc++; - n++; } } @@ -1663,48 +1689,64 @@ selcount(struct proc *p, u_int32_t *ibits, __unused u_int32_t *obits, proc_fdunlock(p); *countp = n; - *kfcountp = kfc; return (0); + bad: dropcount = 0; if (n== 0) goto out; - /* undo the iocounts */ - for (msk = 0; msk < 3; msk++) { - iptr = (u_int32_t *)&ibits[msk * nw]; - for (i = 0; i < nfd; i +=
NFDBITS) { - bits = iptr[i/NFDBITS]; - while ((j = ffs(bits)) && (fd = i + --j) < nfd) { - bits &= ~(1 << j); - fp = fdp->fd_ofiles[fd]; - if (dropcount >= n) - goto out; - fp->f_iocount--; + /* Ignore error return; it's already EBADF */ + (void)seldrop_locked(p, ibits, nfd, n, &need_wakeup, 1); - if (p->p_fpdrainwait && fp->f_iocount == 0) { - p->p_fpdrainwait = 0; - wakeup(&p->p_fpdrainwait); - } - dropcount++; - } - } - } out: proc_fdunlock(p); + if (need_wakeup) { + wakeup(&p->p_fpdrainwait); + } return(error); } + +/* + * seldrop_locked + * + * Drop outstanding wait queue references set up during selscan(); drop the + * outstanding per fileproc f_iocount() picked up during the selcount(). + * + * Parameters: p Process performing the select + * ibits Input bit vector of fd's + * nfd Number of fd's + * lim Limit to number of vector entries to + * consider, or -1 for "all" + * fromselcount Non-zero when called from selcount() + * need_wakeup Pointer to flag to set to do a wakeup + * if f_iocount on any descriptor goes to 0 + * + * Returns: 0 Success + * EBADF One or more fds in the bit vector + * were invalid, but the rest + * were successfully dropped + * + * Notes: An fd may become bad while the proc_fdlock() is not held, + * if a multithreaded application closes the fd out from under + * the in progress select. In this case, we still have to + * clean up after the set up on the remaining fds.
+ */ static int -seldrop(struct proc *p, u_int32_t *ibits, int nfd) +seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wakeup, int fromselcount) { struct filedesc *fdp = p->p_fd; int msk, i, j, fd; u_int32_t bits; struct fileproc *fp; - int n = 0; u_int32_t *iptr; u_int nw; + int error = 0; + int dropcount = 0; + uthread_t uth = get_bsdthread_info(current_thread()); + + *need_wakeup = 0; /* * Problems when reboot; due to MacOSX signal probs @@ -1716,8 +1758,6 @@ seldrop(struct proc *p, u_int32_t *ibits, int nfd) nw = howmany(nfd, NFDBITS); - - proc_fdlock(p); for (msk = 0; msk < 3; msk++) { iptr = (u_int32_t *)&ibits[msk * nw]; for (i = 0; i < nfd; i += NFDBITS) { @@ -1725,28 +1765,67 @@ seldrop(struct proc *p, u_int32_t *ibits, int nfd) while ((j = ffs(bits)) && (fd = i + --j) < nfd) { bits &= ~(1 << j); fp = fdp->fd_ofiles[fd]; - if (fp == NULL -#if 0 - /* if you are here then it is being closed */ - || (fdp->fd_ofileflags[fd] & UF_RESERVED) -#endif - ) { - proc_fdunlock(p); - return(EBADF); + /* + * If we've already dropped as many as were + * counted/scanned, then we are done. + */ + if ((fromselcount != 0) && (++dropcount > lim)) + goto done; + + if (fp == NULL) { + /* skip (now) bad fds */ + error = EBADF; + continue; + } + /* + * Only clear the flag if we set it. We'll + * only find that we set it if we had made + * at least one [partial] pass through selscan(). 
+ */ + if ((fp->f_flags & FP_INSELECT) && (fp->f_waddr == (void *)uth->uu_wqset)) { + fp->f_flags &= ~FP_INSELECT; + fp->f_waddr = (void *)0; } - n++; - fp->f_iocount--; - fp->f_flags &= ~FP_INSELECT; - if (p->p_fpdrainwait && fp->f_iocount == 0) { - p->p_fpdrainwait = 0; - wakeup(&p->p_fpdrainwait); + fp->f_iocount--; + if (fp->f_iocount < 0) + panic("f_iocount overdecrement!"); + + if (fp->f_iocount == 0) { + /* + * The last iocount is responsible for clearing + * selconfict flag - even if we didn't set it - + * and is also responsible for waking up anyone + * waiting on iocounts to drain. + */ + if (fp->f_flags & FP_SELCONFLICT) + fp->f_flags &= ~FP_SELCONFLICT; + if (p->p_fpdrainwait) { + p->p_fpdrainwait = 0; + *need_wakeup = 1; + } } } } } +done: + return (error); +} + + +static int +seldrop(struct proc *p, u_int32_t *ibits, int nfd) +{ + int error; + int need_wakeup = 0; + + proc_fdlock(p); + error = seldrop_locked(p, ibits, nfd, nfd, &need_wakeup, 0); proc_fdunlock(p); - return (0); + if (need_wakeup) { + wakeup(&p->p_fpdrainwait); + } + return (error); } /* @@ -1760,12 +1839,8 @@ selrecord(__unused struct proc *selector, struct selinfo *sip, void * p_wql) /* need to look at collisions */ - if ((p_wql == (void *)0) && ((sip->si_flags & SI_INITED) == 0)) { - return; - } - /*do not record if this is second pass of select */ - if((p_wql == (void *)0)) { + if(p_wql == (void *)0) { return; } diff --git a/bsd/kern/sys_pipe.c b/bsd/kern/sys_pipe.c index f86ad5a11..c374ea07e 100644 --- a/bsd/kern/sys_pipe.c +++ b/bsd/kern/sys_pipe.c @@ -231,17 +231,17 @@ int maxpipekva = 1024 * 1024 * 16; #if PIPE_SYSCTLS SYSCTL_DECL(_kern_ipc); -SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RD, +SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RD|CTLFLAG_LOCKED, &maxpipekva, 0, "Pipe KVA limit"); -SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekvawired, CTLFLAG_RW, +SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekvawired, CTLFLAG_RW|CTLFLAG_LOCKED, &maxpipekvawired, 0, "Pipe KVA wired 
limit"); -SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD, +SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD|CTLFLAG_LOCKED, &amountpipes, 0, "Current # of pipes"); -SYSCTL_INT(_kern_ipc, OID_AUTO, bigpipes, CTLFLAG_RD, +SYSCTL_INT(_kern_ipc, OID_AUTO, bigpipes, CTLFLAG_RD|CTLFLAG_LOCKED, &nbigpipe, 0, "Current # of big pipes"); -SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD, +SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD|CTLFLAG_LOCKED, &amountpipekva, 0, "Pipe KVA usage"); -SYSCTL_INT(_kern_ipc, OID_AUTO, pipekvawired, CTLFLAG_RD, +SYSCTL_INT(_kern_ipc, OID_AUTO, pipekvawired, CTLFLAG_RD|CTLFLAG_LOCKED, &amountpipekvawired, 0, "Pipe wired KVA usage"); #endif @@ -1332,6 +1332,16 @@ pipe_write(struct fileproc *fp, struct uio *uio, __unused int flags, error = EAGAIN; break; } + + /* + * If read side wants to go away, we just issue a signal + * to ourselves. + */ + if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) { + error = EPIPE; + break; + } + /* * We have no more space and have something to offer, * wake up select/poll. @@ -1344,14 +1354,6 @@ pipe_write(struct fileproc *fp, struct uio *uio, __unused int flags, if (error != 0) break; - /* - * If read side wants to go away, we just issue a signal - * to ourselves. - */ - if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) { - error = EPIPE; - break; - } } } --wpipe->pipe_busy; @@ -1741,8 +1743,14 @@ filt_piperead(struct knote *kn, long hint) kn->kn_flags |= EV_EOF; retval = 1; } else { - retval = (kn->kn_sfflags & NOTE_LOWAT) ? 
- (kn->kn_data >= kn->kn_sdata) : (kn->kn_data > 0); + int64_t lowwat = 1; + if (kn->kn_sfflags & NOTE_LOWAT) { + if (rpipe->pipe_buffer.size && kn->kn_sdata > rpipe->pipe_buffer.size) + lowwat = rpipe->pipe_buffer.size; + else if (kn->kn_sdata > lowwat) + lowwat = kn->kn_sdata; + } + retval = kn->kn_data >= lowwat; } if (hint == 0) @@ -1779,17 +1787,24 @@ filt_pipewrite(struct knote *kn, long hint) } kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; if (!kn->kn_data && wpipe->pipe_buffer.size == 0) - kn->kn_data = 1; /* unwritten pipe is ready for write */ + kn->kn_data = PIPE_BUF; /* unwritten pipe is ready for write */ #ifndef PIPE_NODIRECT if (wpipe->pipe_state & PIPE_DIRECTW) kn->kn_data = 0; #endif + int64_t lowwat = PIPE_BUF; + if (kn->kn_sfflags & NOTE_LOWAT) { + if (wpipe->pipe_buffer.size && kn->kn_sdata > wpipe->pipe_buffer.size) + lowwat = wpipe->pipe_buffer.size; + else if (kn->kn_sdata > lowwat) + lowwat = kn->kn_sdata; + } + if (hint == 0) PIPE_UNLOCK(rpipe); - return (kn->kn_data >= ((kn->kn_sfflags & NOTE_LOWAT) ? - kn->kn_sdata : PIPE_BUF)); + return (kn->kn_data >= lowwat); } int diff --git a/bsd/kern/sys_socket.c b/bsd/kern/sys_socket.c index 471cac76a..431e47658 100644 --- a/bsd/kern/sys_socket.c +++ b/bsd/kern/sys_socket.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -194,27 +194,7 @@ soioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) /* Call the socket filter's ioctl handler for most ioctls */ if (IOCGROUP(cmd) != 'i' && IOCGROUP(cmd) != 'r') { - int filtered = 0; - struct socket_filter_entry *filter; - - for (filter = so->so_filt; filter && error == 0; - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_ioctl) { - if (filtered == 0) { - sflt_use(so); - socket_unlock(so, 0); - filtered = 1; - } - error = filter->sfe_filter->sf_filter. 
- sf_ioctl(filter->sfe_cookie, so, cmd, data); - } - } - - if (filtered) { - socket_lock(so, 0); - sflt_unuse(so); - } - + error = sflt_ioctl(so, cmd, data); if (error != 0) goto out; } @@ -462,7 +442,7 @@ soo_stat(struct socket *so, void *ub, int isstat64) sb64->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH; sb64->st_size = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; sb64->st_uid = so->so_uid; - sb64->st_gid = -1; /* XXX -- what else to do? */ + sb64->st_gid = so->so_gid; } else { sb->st_mode = S_IFSOCK; if ((so->so_state & SS_CANTRCVMORE) == 0 || @@ -472,7 +452,7 @@ soo_stat(struct socket *so, void *ub, int isstat64) sb->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH; sb->st_size = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; sb->st_uid = so->so_uid; - sb->st_gid = -1; /* XXX -- what else to do? */ + sb->st_gid = so->so_gid; } ret = (*so->so_proto->pr_usrreqs->pru_sense)(so, ub, isstat64); diff --git a/bsd/kern/syscalls.master b/bsd/kern/syscalls.master index 00fb84082..009dd377b 100644 --- a/bsd/kern/syscalls.master +++ b/bsd/kern/syscalls.master @@ -39,21 +39,21 @@ #include 0 AUE_NULL ALL { int nosys(void); } { indirect syscall } -1 AUE_EXIT ALL { void exit(int rval); } -2 AUE_FORK ALL { int fork(void); } +1 AUE_EXIT ALL { void exit(int rval) NO_SYSCALL_STUB; } +2 AUE_FORK ALL { int fork(void) NO_SYSCALL_STUB; } 3 AUE_NULL ALL { user_ssize_t read(int fd, user_addr_t cbuf, user_size_t nbyte); } 4 AUE_NULL ALL { user_ssize_t write(int fd, user_addr_t cbuf, user_size_t nbyte); } -5 AUE_OPEN_RWTC ALL { int open(user_addr_t path, int flags, int mode); } +5 AUE_OPEN_RWTC ALL { int open(user_addr_t path, int flags, int mode) NO_SYSCALL_STUB; } 6 AUE_CLOSE ALL { int close(int fd); } -7 AUE_WAIT4 ALL { int wait4(int pid, user_addr_t status, int options, user_addr_t rusage); } +7 AUE_WAIT4 ALL { int wait4(int pid, user_addr_t status, int options, user_addr_t rusage) NO_SYSCALL_STUB; } 8 AUE_NULL ALL { int nosys(void); } { old creat } 9 AUE_LINK ALL { int link(user_addr_t path, user_addr_t link); } 
-10 AUE_UNLINK ALL { int unlink(user_addr_t path); } +10 AUE_UNLINK ALL { int unlink(user_addr_t path) NO_SYSCALL_STUB; } 11 AUE_NULL ALL { int nosys(void); } { old execv } 12 AUE_CHDIR ALL { int chdir(user_addr_t path); } 13 AUE_FCHDIR ALL { int fchdir(int fd); } 14 AUE_MKNOD ALL { int mknod(user_addr_t path, int mode, int dev); } -15 AUE_CHMOD ALL { int chmod(user_addr_t path, int mode); } +15 AUE_CHMOD ALL { int chmod(user_addr_t path, int mode) NO_SYSCALL_STUB; } 16 AUE_CHOWN ALL { int chown(user_addr_t path, int uid, int gid); } 17 AUE_NULL ALL { int nosys(void); } { old break } 18 AUE_GETFSSTAT ALL { int getfsstat(user_addr_t buf, int bufsize, int flags); } @@ -66,12 +66,12 @@ 25 AUE_GETEUID ALL { int geteuid(void); } 26 AUE_PTRACE ALL { int ptrace(int req, pid_t pid, caddr_t addr, int data); } #if SOCKETS -27 AUE_RECVMSG ALL { int recvmsg(int s, struct msghdr *msg, int flags); } -28 AUE_SENDMSG ALL { int sendmsg(int s, caddr_t msg, int flags); } -29 AUE_RECVFROM ALL { int recvfrom(int s, void *buf, size_t len, int flags, struct sockaddr *from, int *fromlenaddr); } -30 AUE_ACCEPT ALL { int accept(int s, caddr_t name, socklen_t *anamelen); } -31 AUE_GETPEERNAME ALL { int getpeername(int fdes, caddr_t asa, socklen_t *alen); } -32 AUE_GETSOCKNAME ALL { int getsockname(int fdes, caddr_t asa, socklen_t *alen); } +27 AUE_RECVMSG ALL { int recvmsg(int s, struct msghdr *msg, int flags) NO_SYSCALL_STUB; } +28 AUE_SENDMSG ALL { int sendmsg(int s, caddr_t msg, int flags) NO_SYSCALL_STUB; } +29 AUE_RECVFROM ALL { int recvfrom(int s, void *buf, size_t len, int flags, struct sockaddr *from, int *fromlenaddr) NO_SYSCALL_STUB; } +30 AUE_ACCEPT ALL { int accept(int s, caddr_t name, socklen_t *anamelen) NO_SYSCALL_STUB; } +31 AUE_GETPEERNAME ALL { int getpeername(int fdes, caddr_t asa, socklen_t *alen) NO_SYSCALL_STUB; } +32 AUE_GETSOCKNAME ALL { int getsockname(int fdes, caddr_t asa, socklen_t *alen) NO_SYSCALL_STUB; } #else 27 AUE_NULL ALL { int nosys(void); } 28 AUE_NULL 
ALL { int nosys(void); } @@ -84,7 +84,7 @@ 34 AUE_CHFLAGS ALL { int chflags(char *path, int flags); } 35 AUE_FCHFLAGS ALL { int fchflags(int fd, int flags); } 36 AUE_SYNC ALL { int sync(void); } -37 AUE_KILL ALL { int kill(int pid, int signum, int posix); } +37 AUE_KILL ALL { int kill(int pid, int signum, int posix) NO_SYSCALL_STUB; } 38 AUE_NULL ALL { int nosys(void); } { old stat } 39 AUE_GETPPID ALL { int getppid(void); } 40 AUE_NULL ALL { int nosys(void); } { old lstat } @@ -93,15 +93,15 @@ 43 AUE_GETEGID ALL { int getegid(void); } 44 AUE_PROFILE ALL { int profil(short *bufbase, size_t bufsize, u_long pcoffset, u_int pcscale); } 45 AUE_NULL ALL { int nosys(void); } { old ktrace } -46 AUE_SIGACTION ALL { int sigaction(int signum, struct __sigaction *nsa, struct sigaction *osa); } +46 AUE_SIGACTION ALL { int sigaction(int signum, struct __sigaction *nsa, struct sigaction *osa) NO_SYSCALL_STUB; } 47 AUE_GETGID ALL { int getgid(void); } 48 AUE_SIGPROCMASK ALL { int sigprocmask(int how, user_addr_t mask, user_addr_t omask); } -49 AUE_GETLOGIN ALL { int getlogin(char *namebuf, u_int namelen); } -50 AUE_SETLOGIN ALL { int setlogin(char *namebuf); } +49 AUE_GETLOGIN ALL { int getlogin(char *namebuf, u_int namelen) NO_SYSCALL_STUB; } +50 AUE_SETLOGIN ALL { int setlogin(char *namebuf) NO_SYSCALL_STUB; } 51 AUE_ACCT ALL { int acct(char *path); } 52 AUE_SIGPENDING ALL { int sigpending(struct sigvec *osv); } -53 AUE_SIGALTSTACK ALL { int sigaltstack(struct sigaltstack *nss, struct sigaltstack *oss); } -54 AUE_IOCTL ALL { int ioctl(int fd, u_long com, caddr_t data); } +53 AUE_SIGALTSTACK ALL { int sigaltstack(struct sigaltstack *nss, struct sigaltstack *oss) NO_SYSCALL_STUB ; } +54 AUE_IOCTL ALL { int ioctl(int fd, u_long com, caddr_t data) NO_SYSCALL_STUB; } 55 AUE_REBOOT ALL { int reboot(int opt, char *command); } 56 AUE_REVOKE ALL { int revoke(char *path); } 57 AUE_SYMLINK ALL { int symlink(char *path, char *link); } @@ -112,7 +112,7 @@ 62 AUE_NULL ALL { int nosys(void); 
} { old fstat } 63 AUE_NULL ALL { int nosys(void); } { used internally, reserved } 64 AUE_NULL ALL { int nosys(void); } { old getpagesize } -65 AUE_MSYNC ALL { int msync(caddr_t addr, size_t len, int flags); } +65 AUE_MSYNC ALL { int msync(caddr_t addr, size_t len, int flags) NO_SYSCALL_STUB; } 66 AUE_VFORK ALL { int vfork(void); } 67 AUE_NULL ALL { int nosys(void); } { old vread } 68 AUE_NULL ALL { int nosys(void); } { old vwrite } @@ -120,8 +120,8 @@ 70 AUE_NULL ALL { int nosys(void); } { old sstk } 71 AUE_NULL ALL { int nosys(void); } { old mmap } 72 AUE_NULL ALL { int nosys(void); } { old vadvise } -73 AUE_MUNMAP ALL { int munmap(caddr_t addr, size_t len); } -74 AUE_MPROTECT ALL { int mprotect(caddr_t addr, size_t len, int prot); } +73 AUE_MUNMAP ALL { int munmap(caddr_t addr, size_t len) NO_SYSCALL_STUB; } +74 AUE_MPROTECT ALL { int mprotect(caddr_t addr, size_t len, int prot) NO_SYSCALL_STUB; } 75 AUE_MADVISE ALL { int madvise(caddr_t addr, size_t len, int behav); } 76 AUE_NULL ALL { int nosys(void); } { old vhangup } 77 AUE_NULL ALL { int nosys(void); } { old vlimit } @@ -139,14 +139,14 @@ 89 AUE_GETDTABLESIZE ALL { int getdtablesize(void); } 90 AUE_DUP2 ALL { int dup2(u_int from, u_int to); } 91 AUE_NULL ALL { int nosys(void); } { old getdopt } -92 AUE_FCNTL ALL { int fcntl(int fd, int cmd, long arg); } -93 AUE_SELECT ALL { int select(int nd, u_int32_t *in, u_int32_t *ou, u_int32_t *ex, struct timeval *tv); } +92 AUE_FCNTL ALL { int fcntl(int fd, int cmd, long arg) NO_SYSCALL_STUB; } +93 AUE_SELECT ALL { int select(int nd, u_int32_t *in, u_int32_t *ou, u_int32_t *ex, struct timeval *tv) NO_SYSCALL_STUB; } 94 AUE_NULL ALL { int nosys(void); } { old setdopt } 95 AUE_FSYNC ALL { int fsync(int fd); } 96 AUE_SETPRIORITY ALL { int setpriority(int which, id_t who, int prio); } #if SOCKETS 97 AUE_SOCKET ALL { int socket(int domain, int type, int protocol); } -98 AUE_CONNECT ALL { int connect(int s, caddr_t name, socklen_t namelen); } +98 AUE_CONNECT ALL { int 
connect(int s, caddr_t name, socklen_t namelen) NO_SYSCALL_STUB; } #else 97 AUE_NULL ALL { int nosys(void); } 98 AUE_NULL ALL { int nosys(void); } @@ -157,9 +157,9 @@ 102 AUE_NULL ALL { int nosys(void); } { old recv } 103 AUE_NULL ALL { int nosys(void); } { old sigreturn } #if SOCKETS -104 AUE_BIND ALL { int bind(int s, caddr_t name, socklen_t namelen); } +104 AUE_BIND ALL { int bind(int s, caddr_t name, socklen_t namelen) NO_SYSCALL_STUB; } 105 AUE_SETSOCKOPT ALL { int setsockopt(int s, int level, int name, caddr_t val, socklen_t valsize); } -106 AUE_LISTEN ALL { int listen(int s, int backlog); } +106 AUE_LISTEN ALL { int listen(int s, int backlog) NO_SYSCALL_STUB; } #else 104 AUE_NULL ALL { int nosys(void); } 105 AUE_NULL ALL { int nosys(void); } @@ -169,7 +169,7 @@ 108 AUE_NULL ALL { int nosys(void); } { old sigvec } 109 AUE_NULL ALL { int nosys(void); } { old sigblock } 110 AUE_NULL ALL { int nosys(void); } { old sigsetmask } -111 AUE_NULL ALL { int sigsuspend(sigset_t mask); } +111 AUE_NULL ALL { int sigsuspend(sigset_t mask) NO_SYSCALL_STUB; } 112 AUE_NULL ALL { int nosys(void); } { old sigstack } #if SOCKETS 113 AUE_NULL ALL { int nosys(void); } { old recvmsg } @@ -179,7 +179,7 @@ 114 AUE_NULL ALL { int nosys(void); } #endif /* SOCKETS */ 115 AUE_NULL ALL { int nosys(void); } { old vtrace } -116 AUE_GETTIMEOFDAY ALL { int gettimeofday(struct timeval *tp, struct timezone *tzp); } +116 AUE_GETTIMEOFDAY ALL { int gettimeofday(struct timeval *tp, struct timezone *tzp) NO_SYSCALL_STUB; } 117 AUE_GETRUSAGE ALL { int getrusage(int who, struct rusage *rusage); } #if SOCKETS 118 AUE_GETSOCKOPT ALL { int getsockopt(int s, int level, int name, caddr_t val, socklen_t *avalsize); } @@ -189,28 +189,28 @@ 119 AUE_NULL ALL { int nosys(void); } { old resuba } 120 AUE_READV ALL { user_ssize_t readv(int fd, struct iovec *iovp, u_int iovcnt); } 121 AUE_WRITEV ALL { user_ssize_t writev(int fd, struct iovec *iovp, u_int iovcnt); } -122 AUE_SETTIMEOFDAY ALL { int 
settimeofday(struct timeval *tv, struct timezone *tzp); } +122 AUE_SETTIMEOFDAY ALL { int settimeofday(struct timeval *tv, struct timezone *tzp) NO_SYSCALL_STUB; } 123 AUE_FCHOWN ALL { int fchown(int fd, int uid, int gid); } -124 AUE_FCHMOD ALL { int fchmod(int fd, int mode); } +124 AUE_FCHMOD ALL { int fchmod(int fd, int mode) NO_SYSCALL_STUB; } 125 AUE_NULL ALL { int nosys(void); } { old recvfrom } -126 AUE_SETREUID ALL { int setreuid(uid_t ruid, uid_t euid); } -127 AUE_SETREGID ALL { int setregid(gid_t rgid, gid_t egid); } -128 AUE_RENAME ALL { int rename(char *from, char *to); } +126 AUE_SETREUID ALL { int setreuid(uid_t ruid, uid_t euid) NO_SYSCALL_STUB; } +127 AUE_SETREGID ALL { int setregid(gid_t rgid, gid_t egid) NO_SYSCALL_STUB; } +128 AUE_RENAME ALL { int rename(char *from, char *to) NO_SYSCALL_STUB; } 129 AUE_NULL ALL { int nosys(void); } { old truncate } 130 AUE_NULL ALL { int nosys(void); } { old ftruncate } 131 AUE_FLOCK ALL { int flock(int fd, int how); } 132 AUE_MKFIFO ALL { int mkfifo(user_addr_t path, int mode); } #if SOCKETS -133 AUE_SENDTO ALL { int sendto(int s, caddr_t buf, size_t len, int flags, caddr_t to, socklen_t tolen); } +133 AUE_SENDTO ALL { int sendto(int s, caddr_t buf, size_t len, int flags, caddr_t to, socklen_t tolen) NO_SYSCALL_STUB; } 134 AUE_SHUTDOWN ALL { int shutdown(int s, int how); } -135 AUE_SOCKETPAIR ALL { int socketpair(int domain, int type, int protocol, int *rsv); } +135 AUE_SOCKETPAIR ALL { int socketpair(int domain, int type, int protocol, int *rsv) NO_SYSCALL_STUB; } #else 133 AUE_NULL ALL { int nosys(void); } 134 AUE_NULL ALL { int nosys(void); } 135 AUE_NULL ALL { int nosys(void); } #endif /* SOCKETS */ 136 AUE_MKDIR ALL { int mkdir(user_addr_t path, int mode); } -137 AUE_RMDIR ALL { int rmdir(char *path); } +137 AUE_RMDIR ALL { int rmdir(char *path) NO_SYSCALL_STUB; } 138 AUE_UTIMES ALL { int utimes(char *path, struct timeval *tptr); } 139 AUE_FUTIMES ALL { int futimes(int fd, struct timeval *tptr); } 140 
AUE_ADJTIME ALL { int adjtime(struct timeval *delta, struct timeval *olddelta); } @@ -279,10 +279,10 @@ 191 AUE_PATHCONF ALL { int pathconf(char *path, int name); } 192 AUE_FPATHCONF ALL { int fpathconf(int fd, int name); } 193 AUE_NULL ALL { int nosys(void); } -194 AUE_GETRLIMIT ALL { int getrlimit(u_int which, struct rlimit *rlp); } -195 AUE_SETRLIMIT ALL { int setrlimit(u_int which, struct rlimit *rlp); } +194 AUE_GETRLIMIT ALL { int getrlimit(u_int which, struct rlimit *rlp) NO_SYSCALL_STUB; } +195 AUE_SETRLIMIT ALL { int setrlimit(u_int which, struct rlimit *rlp) NO_SYSCALL_STUB; } 196 AUE_GETDIRENTRIES ALL { int getdirentries(int fd, char *buf, u_int count, long *basep); } -197 AUE_MMAP ALL { user_addr_t mmap(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos); } +197 AUE_MMAP ALL { user_addr_t mmap(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos) NO_SYSCALL_STUB; } 198 AUE_NULL ALL { int nosys(void); } { __syscall } 199 AUE_LSEEK ALL { off_t lseek(int fd, off_t offset, int whence); } 200 AUE_TRUNCATE ALL { int truncate(char *path, off_t length); } @@ -326,8 +326,8 @@ 217 AUE_STATV UHN { int statv(const char *path, struct vstat *vsb); } { soon to be obsolete } 218 AUE_LSTATV UHN { int lstatv(const char *path, struct vstat *vsb); } { soon to be obsolete } 219 AUE_FSTATV UHN { int fstatv(int fd, struct vstat *vsb); } { soon to be obsolete } -220 AUE_GETATTRLIST ALL { int getattrlist(const char *path, struct attrlist *alist, void *attributeBuffer, size_t bufferSize, u_long options); } -221 AUE_SETATTRLIST ALL { int setattrlist(const char *path, struct attrlist *alist, void *attributeBuffer, size_t bufferSize, u_long options); } +220 AUE_GETATTRLIST ALL { int getattrlist(const char *path, struct attrlist *alist, void *attributeBuffer, size_t bufferSize, u_long options) NO_SYSCALL_STUB; } +221 AUE_SETATTRLIST ALL { int setattrlist(const char *path, struct attrlist *alist, void *attributeBuffer, size_t bufferSize, u_long options) 
NO_SYSCALL_STUB; } 222 AUE_GETDIRENTRIESATTR ALL { int getdirentriesattr(int fd, struct attrlist *alist, void *buffer, size_t buffersize, u_long *count, u_long *basep, u_long *newstate, u_long options); } 223 AUE_EXCHANGEDATA ALL { int exchangedata(const char *path1, const char *path2, u_long options); } 224 AUE_NULL ALL { int nosys(void); } { old checkuseraccess / fsgetpath (which moved to 427) } @@ -349,8 +349,8 @@ 240 AUE_LISTXATTR ALL { user_ssize_t listxattr(user_addr_t path, user_addr_t namebuf, size_t bufsize, int options); } 241 AUE_FLISTXATTR ALL { user_ssize_t flistxattr(int fd, user_addr_t namebuf, size_t bufsize, int options); } 242 AUE_FSCTL ALL { int fsctl(const char *path, u_long cmd, caddr_t data, u_int options); } -243 AUE_INITGROUPS ALL { int initgroups(u_int gidsetsize, gid_t *gidset, int gmuid); } -244 AUE_POSIX_SPAWN ALL { int posix_spawn(pid_t *pid, const char *path, const struct _posix_spawn_args_desc *adesc, char **argv, char **envp); } +243 AUE_INITGROUPS ALL { int initgroups(u_int gidsetsize, gid_t *gidset, int gmuid) NO_SYSCALL_STUB; } +244 AUE_POSIX_SPAWN ALL { int posix_spawn(pid_t *pid, const char *path, const struct _posix_spawn_args_desc *adesc, char **argv, char **envp) NO_SYSCALL_STUB; } 245 AUE_FFSCTL ALL { int ffsctl(int fd, u_long cmd, caddr_t data, u_int options); } 246 AUE_NULL ALL { int nosys(void); } @@ -383,7 +383,7 @@ 253 AUE_NULL ALL { int nosys(void); } #endif #if SYSV_SEM -254 AUE_SEMCTL ALL { int semctl(int semid, int semnum, int cmd, semun_t arg); } +254 AUE_SEMCTL ALL { int semctl(int semid, int semnum, int cmd, semun_t arg) NO_SYSCALL_STUB; } 255 AUE_SEMGET ALL { int semget(key_t key, int nsems, int semflg); } 256 AUE_SEMOP ALL { int semop(int semid, struct sembuf *sops, int nsops); } 257 AUE_NULL ALL { int nosys(void); } @@ -394,7 +394,7 @@ 257 AUE_NULL ALL { int nosys(void); } #endif #if SYSV_MSG -258 AUE_MSGCTL ALL { int msgctl(int msqid, int cmd, struct msqid_ds *buf); } +258 AUE_MSGCTL ALL { int msgctl(int 
msqid, int cmd, struct msqid_ds *buf) NO_SYSCALL_STUB; } 259 AUE_MSGGET ALL { int msgget(key_t key, int msgflg); } 260 AUE_MSGSND ALL { int msgsnd(int msqid, void *msgp, size_t msgsz, int msgflg); } 261 AUE_MSGRCV ALL { user_ssize_t msgrcv(int msqid, void *msgp, size_t msgsz, long msgtyp, int msgflg); } @@ -406,7 +406,7 @@ #endif #if SYSV_SHM 262 AUE_SHMAT ALL { user_addr_t shmat(int shmid, void *shmaddr, int shmflg); } -263 AUE_SHMCTL ALL { int shmctl(int shmid, int cmd, struct shmid_ds *buf); } +263 AUE_SHMCTL ALL { int shmctl(int shmid, int cmd, struct shmid_ds *buf) NO_SYSCALL_STUB; } 264 AUE_SHMDT ALL { int shmdt(void *shmaddr); } 265 AUE_SHMGET ALL { int shmget(key_t key, size_t size, int shmflg); } #else @@ -444,7 +444,7 @@ 292 AUE_MKDIR_EXTENDED ALL { int mkdir_extended(user_addr_t path, uid_t uid, gid_t gid, int mode, user_addr_t xsecurity) NO_SYSCALL_STUB; } 293 AUE_IDENTITYSVC ALL { int identitysvc(int opcode, user_addr_t message) NO_SYSCALL_STUB; } 294 AUE_NULL ALL { int shared_region_check_np(uint64_t *start_address) NO_SYSCALL_STUB; } -295 AUE_NULL ALL { int shared_region_map_np(int fd, uint32_t count, const struct shared_file_mapping_np *mappings) NO_SYSCALL_STUB; } +295 AUE_NULL ALL { int nosys(void); } { old shared_region_map_np } 296 AUE_NULL ALL { int vm_pressure_monitor(int wait_for_pressure, int nsecs_monitored, uint32_t *pages_reclaimed); } #if PSYNCH 297 AUE_NULL ALL { uint32_t psynch_rw_longrdlock(user_addr_t rwlock, uint32_t lgenval, uint32_t ugenval, uint32_t rw_wc, int flags) NO_SYSCALL_STUB; } @@ -453,9 +453,9 @@ 300 AUE_NULL ALL { uint32_t psynch_rw_upgrade(user_addr_t rwlock, uint32_t lgenval, uint32_t ugenval, uint32_t rw_wc, int flags) NO_SYSCALL_STUB; } 301 AUE_NULL ALL { uint32_t psynch_mutexwait(user_addr_t mutex, uint32_t mgen, uint32_t ugen, uint64_t tid, uint32_t flags) NO_SYSCALL_STUB; } 302 AUE_NULL ALL { uint32_t psynch_mutexdrop(user_addr_t mutex, uint32_t mgen, uint32_t ugen, uint64_t tid, uint32_t flags) NO_SYSCALL_STUB; 
} -303 AUE_NULL ALL { int psynch_cvbroad(user_addr_t cv, uint32_t cvgen, uint32_t diffgen, user_addr_t mutex, uint32_t mgen, uint32_t ugen, uint64_t tid, uint32_t flags) NO_SYSCALL_STUB; } -304 AUE_NULL ALL { int psynch_cvsignal(user_addr_t cv, uint32_t cvgen, uint32_t cvugen, user_addr_t mutex, uint32_t mgen, uint32_t ugen, int thread_port, uint32_t flags) NO_SYSCALL_STUB; } -305 AUE_NULL ALL { uint32_t psynch_cvwait(user_addr_t cv, uint32_t cvgen, uint32_t cvugen, user_addr_t mutex, uint32_t mgen, uint32_t ugen, uint64_t sec, uint64_t usec) NO_SYSCALL_STUB; } +303 AUE_NULL ALL { uint32_t psynch_cvbroad(user_addr_t cv, uint64_t cvlsgen, uint64_t cvudgen, uint32_t flags, user_addr_t mutex, uint64_t mugen, uint64_t tid) NO_SYSCALL_STUB; } +304 AUE_NULL ALL { uint32_t psynch_cvsignal(user_addr_t cv, uint64_t cvlsgen, uint32_t cvugen, int thread_port, user_addr_t mutex, uint64_t mugen, uint64_t tid, uint32_t flags) NO_SYSCALL_STUB; } +305 AUE_NULL ALL { uint32_t psynch_cvwait(user_addr_t cv, uint64_t cvlsgen, uint32_t cvugen, user_addr_t mutex, uint64_t mugen, uint32_t flags, int64_t sec, uint32_t nsec) NO_SYSCALL_STUB; } 306 AUE_NULL ALL { uint32_t psynch_rw_rdlock(user_addr_t rwlock, uint32_t lgenval, uint32_t ugenval, uint32_t rw_wc, int flags) NO_SYSCALL_STUB; } 307 AUE_NULL ALL { uint32_t psynch_rw_wrlock(user_addr_t rwlock, uint32_t lgenval, uint32_t ugenval, uint32_t rw_wc, int flags) NO_SYSCALL_STUB; } 308 AUE_NULL ALL { uint32_t psynch_rw_unlock(user_addr_t rwlock, uint32_t lgenval, uint32_t ugenval, uint32_t rw_wc, int flags) NO_SYSCALL_STUB; } @@ -477,7 +477,11 @@ #endif 310 AUE_GETSID ALL { int getsid(pid_t pid); } 311 AUE_SETTIDWITHPID ALL { int settid_with_pid(pid_t pid, int assume) NO_SYSCALL_STUB; } +#if PSYNCH +312 AUE_NULL ALL { int psynch_cvclrprepost(user_addr_t cv, uint32_t cvgen, uint32_t cvugen, uint32_t cvsgen, uint32_t prepocnt, uint32_t preposeq, uint32_t flags) NO_SYSCALL_STUB; } +#else 312 AUE_NULL ALL { int nosys(void); } { old 
__pthread_cond_timedwait } +#endif 313 AUE_NULL ALL { int aio_fsync(int op, user_addr_t aiocbp); } 314 AUE_NULL ALL { user_ssize_t aio_return(user_addr_t aiocbp); } 315 AUE_NULL ALL { int aio_suspend(user_addr_t aiocblist, int nent, user_addr_t timeoutp); } @@ -488,7 +492,7 @@ 320 AUE_LIOLISTIO ALL { int lio_listio(int mode, user_addr_t aiocblist, int nent, user_addr_t sigp); } 321 AUE_NULL ALL { int nosys(void); } { old __pthread_cond_wait } 322 AUE_IOPOLICYSYS ALL { int iopolicysys(int cmd, void *arg) NO_SYSCALL_STUB; } -323 AUE_NULL ALL { int nosys(void); } +323 AUE_NULL ALL { int process_policy(int scope, int action, int policy, int policy_subtype, user_addr_t attrp, pid_t target_pid, uint64_t target_threadid) NO_SYSCALL_STUB; } 324 AUE_MLOCKALL ALL { int mlockall(int how); } 325 AUE_MUNLOCKALL ALL { int munlockall(int how); } 326 AUE_NULL ALL { int nosys(void); } @@ -544,7 +548,7 @@ #endif /* CONFIG_WORKQUEUE */ 362 AUE_KQUEUE ALL { int kqueue(void); } 363 AUE_NULL ALL { int kevent(int fd, const struct kevent *changelist, int nchanges, struct kevent *eventlist, int nevents, const struct timespec *timeout); } -364 AUE_LCHOWN ALL { int lchown(user_addr_t path, uid_t owner, gid_t group); } +364 AUE_LCHOWN ALL { int lchown(user_addr_t path, uid_t owner, gid_t group) NO_SYSCALL_STUB; } 365 AUE_STACKSNAPSHOT ALL { int stack_snapshot(pid_t pid, user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t flags, uint32_t dispatch_offset) NO_SYSCALL_STUB; } #if CONFIG_WORKQUEUE 366 AUE_NULL ALL { int bsdthread_register(user_addr_t threadstart, user_addr_t wqthread, int pthsize,user_addr_t dummy_value, user_addr_t targetconc_ptr, uint64_t dispatchqueue_offset) NO_SYSCALL_STUB; } @@ -637,7 +641,7 @@ ;#if OLD_SEMWAIT_SIGNAL ;423 AUE_NULL ALL { int nosys(void); } { old __semwait_signal_nocancel } ;#else -423 AUE_SEMWAITSIGNAL ALL { int __semwait_signal_nocancel(int cond_sem, int mutex_sem, int timeout, int relative, int64_t tv_sec, int32_t tv_nsec) NO_SYSCALL_STUB;} +423 
AUE_SEMWAITSIGNAL ALL { int __semwait_signal_nocancel(int cond_sem, int mutex_sem, int timeout, int relative, int64_t tv_sec, int32_t tv_nsec); } ;#endif 424 AUE_MAC_MOUNT ALL { int __mac_mount(char *type, char *path, int flags, caddr_t data, struct mac *mac_p); } 425 AUE_MAC_GET_MOUNT ALL { int __mac_get_mount(char *path, struct mac *mac_p); } @@ -645,12 +649,18 @@ 427 AUE_FSGETPATH ALL { user_ssize_t fsgetpath(user_addr_t buf, size_t bufsize, user_addr_t fsid, uint64_t objid) NO_SYSCALL_STUB; } { private fsgetpath (File Manager SPI) } 428 AUE_NULL ALL { mach_port_name_t audit_session_self(void); } 429 AUE_NULL ALL { int audit_session_join(mach_port_name_t port); } -430 AUE_NULL ALL { int pid_suspend(int pid); } -431 AUE_NULL ALL { int pid_resume(int pid); } +430 AUE_NULL ALL { int fileport_makeport(int fd, user_addr_t portnamep); } +431 AUE_NULL ALL { int fileport_makefd(mach_port_name_t port); } +432 AUE_NULL ALL { int audit_session_port(au_asid_t asid, user_addr_t portnamep); } +433 AUE_NULL ALL { int pid_suspend(int pid); } +434 AUE_NULL ALL { int pid_resume(int pid); } #if CONFIG_EMBEDDED -432 AUE_NULL ALL { int fileport_makeport(int fd, user_addr_t portnamep); } -433 AUE_NULL ALL { int fileport_makefd(mach_port_name_t port); } +435 AUE_NULL ALL { int pid_hibernate(int pid); } +436 AUE_NULL ALL { int pid_shutdown_sockets(int pid, int level); } #else -432 AUE_NULL ALL { int nosys(void); } -433 AUE_NULL ALL { int nosys(void); } +435 AUE_NULL ALL { int nosys(void); } +436 AUE_NULL ALL { int nosys(void); } #endif +437 AUE_NULL ALL { int nosys(void); } { old shared_region_slide_np } +438 AUE_NULL ALL { int shared_region_map_and_slide_np(int fd, uint32_t count, const struct shared_file_mapping_np *mappings, uint32_t slide, uint64_t* slide_start, uint32_t slide_size) NO_SYSCALL_STUB; } + diff --git a/bsd/kern/sysv_ipc.c b/bsd/kern/sysv_ipc.c index 95c23d418..8f56757c4 100644 --- a/bsd/kern/sysv_ipc.c +++ b/bsd/kern/sysv_ipc.c @@ -60,46 +60,101 @@ #include #include 
+#include /* mode constants */ #include #include /* * Check for ipc permission - * - * XXX: Should pass proc argument so that we can pass - * XXX: proc->p_acflag to suser() */ + /* + * ipcperm + * + * perm->mode mode of the object + * mode_req mode bits we want to test + * * Returns: 0 Success * EPERM * EACCES + * + * Notes: The IPC_M bit is special, in that it may only be granted to + * root, the creating user, or the owning user. + * + * This code does not use posix_cred_access() because of the + * need to check both creator and owner separately when we are + * considering a rights grant. Because of this, we need to do + * two evaluations when the values are unequal, which can lead + * us to defeat the callout avoidance optimization. So we do + * the work here, inline. This is less than optimal for any + * future work involving opacity of POSIX credentials. + * + * Setting up the mode_owner / mode_group / mode_world implicitly + * masks the IPC_M bit off. This is intentional. + * + * See the posix_cred_access() implementation for algorithm + * information. */ int -ipcperm(kauth_cred_t cred, struct ipc_perm *perm, int mode) +ipcperm(kauth_cred_t cred, struct ipc_perm *perm, int mode_req) { + uid_t uid = kauth_cred_getuid(cred); /* avoid multiple calls */ + int want_mod_controlinfo = (mode_req & IPC_M); + int is_member; + mode_t mode_owner = (perm->mode & S_IRWXU); + mode_t mode_group = (perm->mode & S_IRWXG) << 3; + mode_t mode_world = (perm->mode & S_IRWXO) << 6; + /* Grant all rights to super user */ if (!suser(cred, (u_short *)NULL)) return (0); - /* Check for user match. */ - if (kauth_cred_getuid(cred) != perm->cuid && kauth_cred_getuid(cred) != perm->uid) { - int is_member; + /* Grant or deny rights based on ownership */ + if (uid == perm->cuid || uid == perm->uid) { + if (want_mod_controlinfo) + return (0); - if (mode & IPC_M) + return ((mode_req & mode_owner) == mode_req ? 
0 : EACCES); + } else { + /* everyone else who wants to modify control info is denied */ + if (want_mod_controlinfo) return (EPERM); - /* Check for group match. */ - mode >>= 3; - if ((kauth_cred_ismember_gid(cred, perm->gid, &is_member) || !is_member) && - (kauth_cred_ismember_gid(cred, perm->cgid, &is_member) || !is_member)) { - /* Check for `other' match. */ - mode >>= 3; - } } - if (mode & IPC_M) + /* + * Combined group and world rights check, if no owner rights; positive + * assertion of gid/cgid equality avoids an extra callout in the + * common case. + */ + if ((mode_req & mode_group & mode_world) == mode_req) { return (0); - - return ((mode & perm->mode) == mode ? 0 : EACCES); + } else { + if ((mode_req & mode_group) != mode_req) { + if ((!kauth_cred_ismember_gid(cred, perm->gid, &is_member) && is_member) && + ((perm->gid == perm->cgid) || + (!kauth_cred_ismember_gid(cred, perm->cgid, &is_member) && is_member))) { + return (EACCES); + } else { + if ((mode_req & mode_world) != mode_req) { + return (EACCES); + } else { + return (0); + } + } + } else { + if ((!kauth_cred_ismember_gid(cred, perm->gid, &is_member) && is_member) || + ((perm->gid != perm->cgid) && + (!kauth_cred_ismember_gid(cred, perm->cgid, &is_member) && is_member))) { + return (0); + } else { + if ((mode_req & mode_world) != mode_req) { + return (EACCES); + } else { + return (0); + } + } + } + } } diff --git a/bsd/kern/sysv_msg.c b/bsd/kern/sysv_msg.c index 7ed083eb9..daca44630 100644 --- a/bsd/kern/sysv_msg.c +++ b/bsd/kern/sysv_msg.c @@ -667,8 +667,8 @@ msgget(__unused struct proc *p, struct msgget_args *uap, int32_t *retval) msqptr->u.msg_perm._key = key; msqptr->u.msg_perm.cuid = kauth_cred_getuid(cred); msqptr->u.msg_perm.uid = kauth_cred_getuid(cred); - msqptr->u.msg_perm.cgid = cred->cr_gid; - msqptr->u.msg_perm.gid = cred->cr_gid; + msqptr->u.msg_perm.cgid = kauth_cred_getgid(cred); + msqptr->u.msg_perm.gid = kauth_cred_getgid(cred); msqptr->u.msg_perm.mode = (msgflg & 0777); /* Make 
sure that the returned msqid is unique */ msqptr->u.msg_perm._seq++; @@ -1576,7 +1576,7 @@ IPCS_msg_sysctl(__unused struct sysctl_oid *oidp, __unused void *arg1, } SYSCTL_DECL(_kern_sysv_ipcs); -SYSCTL_PROC(_kern_sysv_ipcs, OID_AUTO, msg, CTLFLAG_RW|CTLFLAG_ANYBODY, +SYSCTL_PROC(_kern_sysv_ipcs, OID_AUTO, msg, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, 0, 0, IPCS_msg_sysctl, "S,IPCS_msg_command", "ipcs msg command interface"); diff --git a/bsd/kern/sysv_sem.c b/bsd/kern/sysv_sem.c index 0e44029cf..ed43ec893 100644 --- a/bsd/kern/sysv_sem.c +++ b/bsd/kern/sysv_sem.c @@ -999,8 +999,8 @@ semget(__unused struct proc *p, struct semget_args *uap, int32_t *retval) sema[semid].u.sem_perm._key = key; sema[semid].u.sem_perm.cuid = kauth_cred_getuid(cred); sema[semid].u.sem_perm.uid = kauth_cred_getuid(cred); - sema[semid].u.sem_perm.cgid = cred->cr_gid; - sema[semid].u.sem_perm.gid = cred->cr_gid; + sema[semid].u.sem_perm.cgid = kauth_cred_getgid(cred); + sema[semid].u.sem_perm.gid = kauth_cred_getgid(cred); sema[semid].u.sem_perm.mode = (semflg & 0777) | SEM_ALLOC; sema[semid].u.sem_perm._seq = (sema[semid].u.sem_perm._seq + 1) & 0x7fff; @@ -1092,6 +1092,15 @@ semop(struct proc *p, struct semop_args *uap, int32_t *retval) goto semopout; } + /* OK for LP64, since sizeof(struct sembuf) is currently invariant */ + if ((eval = copyin(uap->sops, &sops, nsops * sizeof(struct sembuf))) != 0) { +#ifdef SEM_DEBUG + printf("eval = %d from copyin(%08x, %08x, %ld)\n", eval, + uap->sops, &sops, nsops * sizeof(struct sembuf)); +#endif + goto semopout; + } + #if CONFIG_MACF /* * Initial pass thru sops to see what permissions are needed. 
@@ -1110,15 +1119,6 @@ semop(struct proc *p, struct semop_args *uap, int32_t *retval) goto semopout; #endif - /* OK for LP64, since sizeof(struct sembuf) is currently invariant */ - if ((eval = copyin(uap->sops, &sops, nsops * sizeof(struct sembuf))) != 0) { -#ifdef SEM_DEBUG - printf("eval = %d from copyin(%08x, %08x, %ld)\n", eval, - uap->sops, &sops, nsops * sizeof(struct sembuf)); -#endif - goto semopout; - } - /* * Loop trying to satisfy the vector of requests. * If we reach a point where we must wait, any requests already @@ -1539,19 +1539,19 @@ sysctl_seminfo(__unused struct sysctl_oid *oidp, void *arg1, /* SYSCTL_NODE(_kern, KERN_SYSV, sysv, CTLFLAG_RW, 0, "SYSV"); */ extern struct sysctl_oid_list sysctl__kern_sysv_children; -SYSCTL_PROC(_kern_sysv, OID_AUTO, semmni, CTLTYPE_INT | CTLFLAG_RW, +SYSCTL_PROC(_kern_sysv, OID_AUTO, semmni, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &limitseminfo.semmni, 0, &sysctl_seminfo ,"I","semmni"); -SYSCTL_PROC(_kern_sysv, OID_AUTO, semmns, CTLTYPE_INT | CTLFLAG_RW, +SYSCTL_PROC(_kern_sysv, OID_AUTO, semmns, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &limitseminfo.semmns, 0, &sysctl_seminfo ,"I","semmns"); -SYSCTL_PROC(_kern_sysv, OID_AUTO, semmnu, CTLTYPE_INT | CTLFLAG_RW, +SYSCTL_PROC(_kern_sysv, OID_AUTO, semmnu, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &limitseminfo.semmnu, 0, &sysctl_seminfo ,"I","semmnu"); -SYSCTL_PROC(_kern_sysv, OID_AUTO, semmsl, CTLTYPE_INT | CTLFLAG_RW, +SYSCTL_PROC(_kern_sysv, OID_AUTO, semmsl, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &limitseminfo.semmsl, 0, &sysctl_seminfo ,"I","semmsl"); -SYSCTL_PROC(_kern_sysv, OID_AUTO, semume, CTLTYPE_INT | CTLFLAG_RW, +SYSCTL_PROC(_kern_sysv, OID_AUTO, semume, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &limitseminfo.semume, 0, &sysctl_seminfo ,"I","semume"); @@ -1662,7 +1662,7 @@ IPCS_sem_sysctl(__unused struct sysctl_oid *oidp, __unused void *arg1, } SYSCTL_DECL(_kern_sysv_ipcs); -SYSCTL_PROC(_kern_sysv_ipcs, OID_AUTO, sem, 
CTLFLAG_RW|CTLFLAG_ANYBODY, +SYSCTL_PROC(_kern_sysv_ipcs, OID_AUTO, sem, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, 0, 0, IPCS_sem_sysctl, "S,IPCS_sem_command", "ipcs sem command interface"); diff --git a/bsd/kern/sysv_shm.c b/bsd/kern/sysv_shm.c index 4a93dc597..25a484798 100644 --- a/bsd/kern/sysv_shm.c +++ b/bsd/kern/sysv_shm.c @@ -774,7 +774,7 @@ shmget_allocate_segment(struct proc *p, struct shmget_args *uap, int mode, shmid = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm); shmseg->u.shm_perm.cuid = shmseg->u.shm_perm.uid = kauth_cred_getuid(cred); - shmseg->u.shm_perm.cgid = shmseg->u.shm_perm.gid = cred->cr_gid; + shmseg->u.shm_perm.cgid = shmseg->u.shm_perm.gid = kauth_cred_getgid(cred); shmseg->u.shm_perm.mode = (shmseg->u.shm_perm.mode & SHMSEG_WANTED) | (mode & ACCESSPERMS) | SHMSEG_ALLOCATED; shmseg->u.shm_segsz = uap->size; @@ -1165,26 +1165,26 @@ IPCS_shm_sysctl(__unused struct sysctl_oid *oidp, __unused void *arg1, return(error); } -SYSCTL_NODE(_kern, KERN_SYSV, sysv, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "SYSV"); +SYSCTL_NODE(_kern, KERN_SYSV, sysv, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, 0, "SYSV"); -SYSCTL_PROC(_kern_sysv, OID_AUTO, shmmax, CTLTYPE_QUAD | CTLFLAG_RW, +SYSCTL_PROC(_kern_sysv, OID_AUTO, shmmax, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &shminfo.shmmax, 0, &sysctl_shminfo ,"Q","shmmax"); -SYSCTL_PROC(_kern_sysv, OID_AUTO, shmmin, CTLTYPE_QUAD | CTLFLAG_RW, +SYSCTL_PROC(_kern_sysv, OID_AUTO, shmmin, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &shminfo.shmmin, 0, &sysctl_shminfo ,"Q","shmmin"); -SYSCTL_PROC(_kern_sysv, OID_AUTO, shmmni, CTLTYPE_QUAD | CTLFLAG_RW, +SYSCTL_PROC(_kern_sysv, OID_AUTO, shmmni, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &shminfo.shmmni, 0, &sysctl_shminfo ,"Q","shmmni"); -SYSCTL_PROC(_kern_sysv, OID_AUTO, shmseg, CTLTYPE_QUAD | CTLFLAG_RW, +SYSCTL_PROC(_kern_sysv, OID_AUTO, shmseg, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &shminfo.shmseg, 0, &sysctl_shminfo ,"Q","shmseg"); -SYSCTL_PROC(_kern_sysv, 
OID_AUTO, shmall, CTLTYPE_QUAD | CTLFLAG_RW, +SYSCTL_PROC(_kern_sysv, OID_AUTO, shmall, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &shminfo.shmall, 0, &sysctl_shminfo ,"Q","shmall"); -SYSCTL_NODE(_kern_sysv, OID_AUTO, ipcs, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "SYSVIPCS"); +SYSCTL_NODE(_kern_sysv, OID_AUTO, ipcs, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, 0, "SYSVIPCS"); -SYSCTL_PROC(_kern_sysv_ipcs, OID_AUTO, shm, CTLFLAG_RW|CTLFLAG_ANYBODY, +SYSCTL_PROC(_kern_sysv_ipcs, OID_AUTO, shm, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, 0, 0, IPCS_shm_sysctl, "S,IPCS_shm_command", "ipcs shm command interface"); diff --git a/bsd/kern/trace.codes b/bsd/kern/trace.codes new file mode 100644 index 000000000..3792f3d37 --- /dev/null +++ b/bsd/kern/trace.codes @@ -0,0 +1,2149 @@ +0x1020000 KTrap_DivideError +0x1020004 KTrap_Debug +0x1020008 KTrap_NMI +0x102000c KTrap_Int3 +0x1020010 KTrap_Overflow +0x1020014 KTrap_BoundRange +0x1020018 KTrap_InvalidOpcode +0x102001c KTrap_DeviceNotAvail +0x1020020 KTrap_DoubleFault +0x1020024 KTrap_Coprocessor +0x1020028 KTrap_InvalidTSS +0x102002c KTrap_SegmentNotPresent +0x1020030 KTrap_StackFault +0x1020034 KTrap_GeneralProtection +0x1020038 KTrap_PageFault +0x102003c KTrap_unknown +0x1020040 KTrap_FloatPointError +0x1020044 KTrap_AlignmentCheck +0x1020048 KTrap_MachineCheck +0x102004c KTrap_SIMD_FP +0x10203fc KTrap_Preempt +0x1050000 INTERRUPT +0x1070000 UTrap_DivideError +0x1070004 UTrap_Debug +0x1070008 UTrap_NMI +0x107000c UTrap_Int3 +0x1070010 UTrap_Overflow +0x1070014 UTrap_BoundRange +0x1070018 UTrap_InvalidOpcode +0x107001c UTrap_DeviceNotAvail +0x1070020 UTrap_DoubleFault +0x1070024 UTrap_Coprocessor +0x1070028 UTrap_InvalidTSS +0x107002c UTrap_SegmentNotPresent +0x1070030 UTrap_StackFault +0x1070034 UTrap_GeneralProtection +0x1070038 UTrap_PageFault +0x107003c UTrap_unknown +0x1070040 UTrap_FloatPointError +0x1070044 UTrap_AlignmentCheck +0x1070048 UTrap_MachineCheck +0x107004c UTrap_SIMD_FP +0x1090000 DecrTrap +0x1090004 DecrSet 
+0x1090008 TimerCallIntr +0x109000c pmsStep +0x1090010 TimerMigration +0x1090014 rdHPET +0x1090018 set_tsc_deadline +0x10c0000 MACH_SysCall +0x10c0004 MSC_kern_invalid_#1 +0x10c0008 MSC_kern_invalid_#2 +0x10c000c MSC_kern_invalid_#3 +0x10c0010 MSC_kern_invalid_#4 +0x10c0014 MSC_kern_invalid_#5 +0x10c0018 MSC_kern_invalid_#6 +0x10c001c MSC_kern_invalid_#7 +0x10c0020 MSC_kern_invalid_#8 +0x10c0024 MSC_kern_invalid_#9 +0x10c0028 MSC_kern_invalid_#10 +0x10c002c MSC_kern_invalid_#11 +0x10c0030 MSC_kern_invalid_#12 +0x10c0034 MSC_kern_invalid_#13 +0x10c0038 MSC_kern_invalid_#14 +0x10c003c MSC_kern_invalid_#15 +0x10c0040 MSC_kern_invalid_#16 +0x10c0044 MSC_kern_invalid_#17 +0x10c0048 MSC_kern_invalid_#18 +0x10c004c MSC_kern_invalid_#19 +0x10c0050 MSC_kern_invalid_#20 +0x10c0054 MSC_kern_invalid_#21 +0x10c0058 MSC_kern_invalid_#22 +0x10c005c MSC_kern_invalid_#23 +0x10c0060 MSC_kern_invalid_#24 +0x10c0064 MSC_kern_invalid_#25 +0x10c0068 MSC_mach_reply_port +0x10c006c MSC_thread_self_trap +0x10c0070 MSC_task_self_trap +0x10c0074 MSC_host_self_trap +0x10c0078 MSC_kern_invalid_#30 +0x10c007c MSC_mach_msg_trap +0x10c0080 MSC_mach_msg_overwrite_trap +0x10c0084 MSC_semaphore_signal_trap +0x10c0088 MSC_semaphore_signal_all_trap +0x10c008c MSC_semaphore_signal_thread_trap +0x10c0090 MSC_semaphore_wait_trap +0x10c0094 MSC_semaphore_wait_signal_trap +0x10c0098 MSC_semaphore_timedwait_trap +0x10c009c MSC_semaphore_timedwait_signal_trap +0x10c00a0 MSC_kern_invalid_#40 +0x10c00a4 MSC_kern_invalid_#41 +0x10c00a8 MSC_kern_invalid_#42 +0x10c00ac MSC_map_fd +0x10c00b0 MSC_task_name_for_pid +0x10c00b4 MSC_task_for_pid +0x10c00b8 MSC_pid_for_task +0x10c00bc MSC_kern_invalid_#47 +0x10c00c0 MSC_macx_swapon +0x10c00c4 MSC_macx_swapoff +0x10c00c8 MSC_kern_invalid_#50 +0x10c00cc MSC_macx_triggers +0x10c00d0 MSC_macx_backing_store_suspend +0x10c00d4 MSC_macx_backing_store_recovery +0x10c00d8 MSC_kern_invalid_#54 +0x10c00dc MSC_kern_invalid_#55 +0x10c00e0 MSC_kern_invalid_#56 +0x10c00e4 
MSC_kern_invalid_#57 +0x10c00e8 MSC_pfz_exit +0x10c00ec MSC_swtch_pri +0x10c00f0 MSC_swtch +0x10c00f4 MSC_thread_switch +0x10c00f8 MSC_clock_sleep_trap +0x10c00fc MSC_kern_invalid_#63 +0x10c0100 MSC_kern_invalid_#64 +0x10c0104 MSC_kern_invalid_#65 +0x10c0108 MSC_kern_invalid_#66 +0x10c010c MSC_kern_invalid_#67 +0x10c0110 MSC_kern_invalid_#68 +0x10c0114 MSC_kern_invalid_#69 +0x10c0118 MSC_kern_invalid_#70 +0x10c011c MSC_kern_invalid_#71 +0x10c0120 MSC_kern_invalid_#72 +0x10c0124 MSC_kern_invalid_#73 +0x10c0128 MSC_kern_invalid_#74 +0x10c012c MSC_kern_invalid_#75 +0x10c0130 MSC_kern_invalid_#76 +0x10c0134 MSC_kern_invalid_#77 +0x10c0138 MSC_kern_invalid_#78 +0x10c013c MSC_kern_invalid_#79 +0x10c0140 MSC_kern_invalid_#80 +0x10c0144 MSC_kern_invalid_#81 +0x10c0148 MSC_kern_invalid_#82 +0x10c014c MSC_kern_invalid_#83 +0x10c0150 MSC_kern_invalid_#84 +0x10c0154 MSC_kern_invalid_#85 +0x10c0158 MSC_kern_invalid_#86 +0x10c015c MSC_kern_invalid_#87 +0x10c0160 MSC_kern_invalid_#88 +0x10c0164 MSC_mach_timebase_info +0x10c0168 MSC_mach_wait_until +0x10c016c MSC_mk_timer_create +0x10c0170 MSC_mk_timer_destroy +0x10c0174 MSC_mk_timer_arm +0x10c0178 MSC_mk_timer_cancel +0x10c017c MSC_kern_invalid_#95 +0x10c0180 MSC_kern_invalid_#96 +0x10c0184 MSC_kern_invalid_#97 +0x10c0188 MSC_kern_invalid_#98 +0x10c018c MSC_kern_invalid_#99 +0x10c0190 MSC_iokit_user_client +0x10c0194 MSC_kern_invalid_#101 +0x10c0198 MSC_kern_invalid_#102 +0x10c019c MSC_kern_invalid_#103 +0x10c01a0 MSC_kern_invalid_#104 +0x10c01a4 MSC_kern_invalid_#105 +0x10c01a8 MSC_kern_invalid_#106 +0x10c01ac MSC_kern_invalid_#107 +0x10c01b0 MSC_kern_invalid_#108 +0x10c01b4 MSC_kern_invalid_#109 +0x10c01b8 MSC_kern_invalid_#110 +0x10c01bc MSC_kern_invalid_#111 +0x10c01c0 MSC_kern_invalid_#112 +0x10c01c4 MSC_kern_invalid_#113 +0x10c01c8 MSC_kern_invalid_#114 +0x10c01cc MSC_kern_invalid_#115 +0x10c01d0 MSC_kern_invalid_#116 +0x10c01d4 MSC_kern_invalid_#117 +0x10c01d8 MSC_kern_invalid_#118 +0x10c01dc MSC_kern_invalid_#119 
+0x10c01e0 MSC_kern_invalid_#120 +0x10c01e4 MSC_kern_invalid_#121 +0x10c01e8 MSC_kern_invalid_#122 +0x10c01ec MSC_kern_invalid_#123 +0x10c01f0 MSC_kern_invalid_#124 +0x10c01f4 MSC_kern_invalid_#125 +0x10c01f8 MSC_kern_invalid_#126 +0x10c01fc MSC_kern_invalid_#127 +0x1300004 MACH_Pageout +0x1300008 MACH_vmfault +0x1300100 MACH_purgable_token_add +0x1300104 MACH_purgable_token_delete +0x1300108 MACH_purgable_token_ripened +0x130010c MACH_purgable_token_purged +0x1300400 MACH_vm_check_zf_delay +0x1300404 MACH_vm_cow_delay +0x1300408 MACH_vm_zf_delay +0x1300410 MACH_vm_pageout_scan +0x1300414 MACH_vm_pageout_balanceQ +0x1300418 MACH_vm_pageout_freelist +0x130041c MACH_vm_pageout_purge_one +0x1300420 MACH_vm_pageout_cache_evict +0x1300424 MACH_vm_pageout_thread_block +0x1300480 MACH_vm_upl_page_wait +0x1300484 MACH_vm_iopl_page_wait +0x1400000 MACH_SCHED +0x1400004 MACH_STKATTACH +0x1400008 MACH_STKHANDOFF +0x140000c MACH_CALLCONT +0x1400010 MACH_CALLOUT +0x1400014 MACH_ServiceT +0x1400018 MACH_MKRUNNABLE +0x140001c MACH_PROMOTE +0x1400020 MACH_DEMOTE +0x1400024 MACH_IDLE +0x1400028 MACH_STACK_DEPTH +0x140002c MACH_MOVED +0x1400030 MACH_FAIRSHARE_ENTER +0x1400034 MACH_FAIRSHARE_EXIT +0x1400038 MACH_FAILSAFE +0x1400040 MACH_STKHANDOFF_BT +0x1400044 MACH_SCHED_BT +0x1400048 MACH_IDLE_BT +0x1400050 MACH_SCHED_GET_URGENCY +0x1400054 MACH_SCHED_URGENCY +0x1400058 MACH_SCHED_REDISPATCH +0x140005C MACH_SCHED_REMOTE_AST +0x1400060 MACH_SCHED_LPA_BROKEN +0x1500000 MACH_MSGID_INVALID +0x1600000 MTX_SLEEP +0x1600004 MTX_SLEEP_DEADLINE +0x1600008 MTX_WAIT +0x160000c MTX_WAKEUP +0x1600010 MTX_LOCK +0x1600014 MTX_UNLOCK +0x1600080 MTX_x86_wait +0x1600084 MTX_x86_wakeup +0x1600088 MTX_x86_spin +0x160008c MTX_x86_acquire +0x1600090 MTX_x86_demote +0x1600200 MTX_full_lock +0x1600400 RW_EXCL_WaitForWriter +0x1600404 RW_EXCL_WaitForReaders +0x1600408 RW_SHRD_WaitForWriter +0x160040c RW_SHRDtoEXCL_FailedUpgrade +0x1600410 RW_SHRDtoEXCL_WaitForReaders +0x1600414 RW_EXCLtoSHRD +0x1600418 
RW_EXCL_SpinForWriter +0x160041c RW_EXCL_WaitForWriter +0x1600420 RW_EXCL_SpinForReaders +0x1600424 RW_EXCL_WaitForReaders +0x1600428 RW_SHRD_unlock +0x160042c RW_EXCL_unlock +0x1600440 RW_SHRD_SpinForWriter +0x1600444 RW_SHRD_WaitForWriter +0x1600448 RW_SHRDtoEXCL_SpinForReaders +0x160044c RW_SHRDtoEXCL_WaitForReaders +0x1700000 PMAP_create +0x1700004 PMAP_destroy +0x1700008 PMAP_protect +0x170000c PMAP_page_protect +0x1700010 PMAP_enter +0x1700014 PMAP_remove +0x1700018 PMAP_nest +0x170001c PMAP_unnest +0x1700020 PMAP_flush_TLBS +0x1700024 PMAP_update_interrupt +0x1700028 PMAP_attribute_clear +0x2010000 L_IP_In_Beg +0x2010004 L_IP_Out_Beg +0x2010008 L_IP_In_End +0x201000c L_IP_Out_End +0x2010404 F_IP_Output +0x2010800 F_IP_Input +0x2010c00 F_In_CkSum +0x2020000 L_ARP_Req +0x2020004 L_ARP_Resp +0x2020008 L_ARP_Reply +0x202000c L_ARP_Timo +0x2020010 L_ARP_Look +0x2020014 L_ARP_Input +0x2030000 L_UDP_In_Beg +0x2030004 L_UDP_Out_Beg +0x2030008 L_UDP_In_End +0x203000c L_UDP_Out_End +0x2031400 F_UDP_Input +0x2031804 F_UDP_Output +0x2040000 L_TCP_In_Beg +0x2040004 L_TCP_Out_Beg +0x2040008 L_TCP_In_End +0x204000c L_TCP_Out_End +0x2040c00 F_TCP_Input +0x2041004 F_TCP_Output +0x2041400 F_TCP_FastT +0x2041404 F_TCP_SlowT +0x2041408 F_TCP_Close +0x2041800 F_PCB_Lookup +0x2041804 F_PCB_HshLkup +0x2041c00 F_TCP_NewConn +0x2041d00 F_TCP_gotSync +0x20b0010 F_SBDrop +0x20b0014 F_SBAppend +0x20b0404 F_SendMsg +0x20b0804 F_SendTo +0x20b0c04 F_SendIt +0x20b1004 F_SoSend +0x20b1008 F_SoSend_CopyD +0x20b1400 F_RecvFrom +0x20b1800 F_RecvMsg +0x20b1c00 F_RecvIt +0x20b2000 F_SoReceive +0x20b2100 F_SoShutdown +0x20b2400 F_SoAccept +0x20b2800 F_sendfile +0x20b2804 F_sendfile_wait +0x20b2808 F_sendfile_read +0x20b280c F_sendfile_send +0x2650004 AT_DDPinput +0x2f00000 F_FreemList +0x2f00004 F_m_copym +0x2f00008 F_getpackets +0x2f0000c F_getpackethdrs +0x3010000 HFS_Write +0x3010004 HFS_Fsync +0x3010008 HFS_Close +0x301000c HFS_Remove +0x3010010 HFS_Create +0x3010014 HFS_Inactive +0x3010018 
HFS_Reclaim +0x301001C HFS_Truncate +0x3010028 vinvalbuf +0x3010030 HFS_Read +0x3010034 HFS_RL_ADD +0x3010038 HFS_RL_REMOVE +0x301003c MACH_copyiostr +0x3010040 UIO_copyout +0x3010044 UIO_copyin +0x3010048 MACH_copyio +0x301004c Cl_bp +0x3010050 Cl_iodone +0x3010054 Cl_ubc_dump +0x3010058 Cl_io +0x301005c Cl_zero +0x3010060 Cl_cmap +0x3010068 Cl_ioread +0x301006c Cl_iowrite +0x3010070 Cl_ioabort +0x3010074 Cl_zero_commit +0x3010078 Cl_wrdel_commit +0x301007c Cl_read_abort +0x3010080 Cl_read_copy +0x3010084 Cl_read_list_req +0x3010088 Cl_phys_uiomove +0x301008c Cl_read_commit +0x3010090 VFS_LOOKUP +0x3010094 Cl_read_uplmap +0x3010098 Cl_read_uplunmap +0x301009C VFS_LOOKUP_DONE +0x30100a0 Cl_write_copy +0x30100a4 Cl_write_list_req +0x30100a8 Cl_write_uiomove +0x30100ac Cl_write_zeros +0x30100b0 Cl_write_delayed +0x30100b4 Cl_write_abort +0x30100b8 Cl_zero_info +0x30100c0 Cl_rd_ahead +0x30100c4 Cl_rd_prefetch +0x30100c8 Cl_rd_prefabort +0x30100cc Cl_writepush +0x30100d0 Cl_pageout +0x30100d4 Cl_push +0x30100e0 Cl_pagein +0x30100f0 Cl_advisory_rd +0x30100f4 Cl_adv_fault_list +0x30100f8 Cl_adv_abort1 +0x30100fc Cl_adv_abort2 +0x3010118 Cl_read_direct +0x301011c Cl_ncpr_uiomv +0x3010120 Cl_ncpr_getupl +0x3010124 Cl_ncpr_clio +0x301012c Cl_write_direct +0x3010130 Cl_ncpw_getupl +0x3010134 Cl_ncpw_clio +0x3010138 Cl_sparse_collect +0x301013c Cl_sparse_push +0x3010140 Cl_sparse_add +0x3010144 Cl_release +0x3010148 Cl_drt_emptyfree +0x301014c Cl_drt_retcluster +0x3010150 Cl_drt_alloctable +0x3010154 Cl_drt_insert +0x3010158 Cl_drt_mark +0x301015c Cl_drt_6 +0x3010160 Cl_drt_freetable +0x3010170 Cl_read_contig_getupl +0x3010174 Cl_write_contig_getupl +0x3010178 Cl_io_type +0x301017c Cl_wait_IO +0x3010180 Vnode_Pagein +0x3010184 throttle_lowpri_io +0x3010200 Vnode_Pageout +0x3010280 Vnode_WaitForWrites +0x3010300 PageoutThrottle +0x3010340 SuperCluster +0x3010344 PS_Offsets +0x3010348 PS_Indexes +0x301034c Dirty_Indexes +0x3010350 PS_Write +0x3010354 PS_WriteComplete +0x3010380 
PageoutCollect +0x3010384 PagesOnInactive_Q +0x3010388 PagesOnActive_Q +0x301038c PageoutScan +0x3010390 PageoutWait +0x3010394 PageoutWakeup1 +0x3010398 PageoutWakeup2 +0x301039c PageoutWakeup3 +0x3010400 NFS_doio +0x3010404 NFS_doio_offsets +0x3010408 NFS_doio_zero_read +0x301040c NFS_doio_zero_write +0x3010410 NFS_doio_invalidate +0x3010414 NFS_doio_retry +0x3010418 NFS_doio_done +0x3010500 NFS_pagein_zero +0x3010504 NFS_pageout_zero +0x3010508 NFS_pagein +0x301050c NFS_pageout +0x3010600 BIO_write_list_req +0x3010604 BIO_getblk_list_req +0x3010608 BIO_getblk +0x301060c BIO_biodone +0x3010610 BIO_brelse +0x3010614 BIO_recovered_buf +0x3010618 BIO_dumped_buf +0x301061c BIO_write_delayed +0x3010620 BIO_acquire_error +0x3010624 BIO_write_async +0x3010628 BIO_write_sync +0x301062c BIO_flushdirty +0x3010630 BIO_getblk_msleep +0x3010700 VM_pageout_list_req +0x3010704 VM_pagein_list_req +0x3010800 NFS_setattr +0x3010804 NFS_getattr +0x3010808 NFS_read +0x301080c NFS_write +0x3010810 NFS_truncate +0x3010814 NFS_flush +0x3010818 NFS_flush_again +0x301081c NFS_flush_bvec +0x3010820 NFS_flush_upls +0x3010824 NFS_commit +0x3010828 NFS_flush_commit +0x301082c NFS_flush_done +0x3010830 NFS_flush_busy +0x3010834 NFS_flush_bwrite +0x3010838 NFS_flush_normal +0x301083c NFS_loadattrcache +0x3010840 NFS_getattrcache +0x3010844 NFS_connect +0x3010848 NFS_reply +0x301084c NFS_request +0x3010850 NFS_softterm +0x3010854 NFS_rcvunlock +0x3010858 NFS_rcvlock +0x301085c NFS_timer +0x3010860 NFS_vinvalbuf +0x3010864 NFS_srvcommit +0x3010868 NFS_srvfsync +0x301086c NFS_RdAhead +0x3010870 NFS_srvread +0x3010874 NFS_srvVOPREAD +0x3010900 UBC_setsize +0x3010904 UBC_sync_range +0x3010908 UBC_upl_abort_range +0x301090c UBC_upl_commit_range +0x3011000 UPL_iopl_req +0x3011004 UPL_upl_req +0x3011008 UPL_abort_range +0x301100c UPL_abort +0x3011010 UPL_commit_range +0x3011014 UPL_commit +0x3011018 UPL_destroy +0x301101c UPL_commit_range_active +0x3011020 UPL_commit_range_inactive +0x3011024 
UPL_map_enter_upl +0x3011028 UPL_map_remove_upl +0x301102c UPL_commit_range_speculative +0x3020000 P_WrData +0x3020004 P_WrDataDone +0x3020008 P_RdData +0x302000C P_RdDataDone +0x3020010 P_WrDataAsync +0x3020014 P_WrDataAsyncDone +0x3020018 P_RdDataAsync +0x302001C P_RdDataAsyncDone +0x3020020 P_WrMeta +0x3020024 P_WrMetaDone +0x3020028 P_RdMeta +0x302002C P_RdMetaDone +0x3020030 P_WrMetaAsync +0x3020034 P_WrMetaAsyncDone +0x3020038 P_RdMetaAsync +0x302003C P_RdMetaAsyncDone +0x3020040 P_PgOut +0x3020044 P_PgOutDone +0x3020048 P_PgIn +0x302004C P_PgInDone +0x3020050 P_PgOutAsync +0x3020054 P_PgOutAsyncDone +0x3020058 P_PgInAsync +0x302005C P_PgInAsyncDone +0x3020080 P_WrDataT +0x3020084 P_WrDataTDone +0x3020088 P_RdDataT +0x302008C P_RdDataTDone +0x3020090 P_WrDataAsyncT +0x3020094 P_WrDataAsyncTDone +0x3020098 P_RdDataAsyncT +0x302009C P_RdDataAsyncTDone +0x30200a0 P_WrMetaT +0x30200A4 P_WrMetaTDone +0x30200a8 P_RdMetaT +0x30200AC P_RdMetaTDone +0x30200b0 P_WrMetaAsyncT +0x30200B4 P_WrMetaAsyncTDone +0x30200b8 P_RdMetaAsyncT +0x30200BC P_RdMetaAsyncTDone +0x30200c0 P_PgOutT +0x30200C4 P_PgOutTDone +0x30200c8 P_PgInT +0x30200CC P_PgInTDone +0x30200d0 P_PgOutAsyncT +0x30200D4 P_PgOutAsyncTDone +0x30200d8 P_PgInAsyncT +0x30200DC P_PgInAsyncTDone +0x3020100 P_WrDataP +0x3020104 P_WrDataPDone +0x3020108 P_RdDataP +0x302010C P_RdDataPDone +0x3020110 P_WrDataAsyncP +0x3020114 P_WrDataAsyncPDone +0x3020118 P_RdDataAsyncP +0x302011C P_RdDataAsyncPDone +0x3020120 P_WrMetaP +0x3020124 P_WrMetaPDone +0x3020128 P_RdMetaP +0x302012C P_RdMetaPDone +0x3020130 P_WrMetaAsyncP +0x3020134 P_WrMetaAsyncPDone +0x3020138 P_RdMetaAsyncP +0x302013C P_RdMetaAsyncPDone +0x3020140 P_PgOutP +0x3020144 P_PgOutPDone +0x3020148 P_PgInP +0x302014C P_PgInPDone +0x3020150 P_PgOutAsyncP +0x3020154 P_PgOutAsyncPDone +0x3020158 P_PgInAsyncP +0x302015C P_PgInAsyncPDone +0x3050004 journal_flush +0x3070004 BootCache_tag +0x3070008 BootCache_batch +0x4010004 proc_exit +0x4010008 force_exit +0x40c0000 
BSC_SysCall +0x40c0004 BSC_exit +0x40c0008 BSC_fork +0x40c000c BSC_read +0x40c0010 BSC_write +0x40c0014 BSC_open +0x40c0018 BSC_close +0x40c001c BSC_wait4 +0x40c0020 BSC_obs_creat +0x40c0024 BSC_link +0x40c0028 BSC_unlink +0x40c002c BSC_obs_execv +0x40c0030 BSC_chdir +0x40c0034 BSC_fchdir +0x40c0038 BSC_mknod +0x40c003c BSC_chmod +0x40c0040 BSC_chown +0x40c0044 BSC_obs_break +0x40c0048 BSC_getfsstat +0x40c004c BSC_obs_lseek +0x40c0050 BSC_getpid +0x40c0054 BSC_obs_mount +0x40c0058 BSC_obs_unmount +0x40c005c BSC_setuid +0x40c0060 BSC_getuid +0x40c0064 BSC_geteuid +0x40c0068 BSC_ptrace +0x40c006c BSC_recvmsg +0x40c0070 BSC_sendmsg +0x40c0074 BSC_recvfrom +0x40c0078 BSC_accept +0x40c007c BSC_getpeername +0x40c0080 BSC_getsockname +0x40c0084 BSC_access +0x40c0088 BSC_chflags +0x40c008c BSC_fchflags +0x40c0090 BSC_sync +0x40c0094 BSC_kill +0x40c0098 BSC_obs_stat +0x40c009c BSC_getppid +0x40c00a0 BSC_obs_lstat +0x40c00a4 BSC_dup +0x40c00a8 BSC_pipe +0x40c00ac BSC_getegid +0x40c00b0 BSC_profil +0x40c00b4 BSC_obs_ktrace +0x40c00b8 BSC_sigaction +0x40c00bc BSC_getgid +0x40c00c0 BSC_sigprocmask +0x40c00c4 BSC_getlogin +0x40c00c8 BSC_setlogin +0x40c00cc BSC_acct +0x40c00d0 BSC_sigpending +0x40c00d4 BSC_sigaltstack +0x40c00d8 BSC_ioctl +0x40c00dc BSC_reboot +0x40c00e0 BSC_revoke +0x40c00e4 BSC_symlink +0x40c00e8 BSC_readlink +0x40c00ec BSC_execve +0x40c00f0 BSC_umask +0x40c00f4 BSC_chroot +0x40c00f8 BSC_obs_fstat +0x40c00fc BSC_#63 +0x40c0100 BSC_obs_getpagesize +0x40c0104 BSC_msync +0x40c0108 BSC_vfork +0x40c010c BSC_obs_vread +0x40c0110 BSC_obs_vwrite +0x40c0114 BSC_obs_sbrk +0x40c0118 BSC_obs_sstk +0x40c011c BSC_obs_mmap +0x40c0120 BSC_obs_vadvise +0x40c0124 BSC_munmap +0x40c0128 BSC_mprotect +0x40c012c BSC_madvise +0x40c0130 BSC_obs_vhangup +0x40c0134 BSC_obs_vlimit +0x40c0138 BSC_mincore +0x40c013c BSC_getgroups +0x40c0140 BSC_setgroups +0x40c0144 BSC_getpgrp +0x40c0148 BSC_setpgid +0x40c014c BSC_setitimer +0x40c0150 BSC_obs_wait +0x40c0154 BSC_swapon +0x40c0158 
BSC_getitimer +0x40c015c BSC_obs_gethostname +0x40c0160 BSC_obs_sethostname +0x40c0164 BSC_getdtablesize +0x40c0168 BSC_dup2 +0x40c016c BSC_obs_getdopt +0x40c0170 BSC_fcntl +0x40c0174 BSC_select +0x40c0178 BSC_obs_setdopt +0x40c017c BSC_fsync +0x40c0180 BSC_setpriority +0x40c0184 BSC_socket +0x40c0188 BSC_connect +0x40c018c BSC_obs_accept +0x40c0190 BSC_getpriority +0x40c0194 BSC_obs_send +0x40c0198 BSC_obs_recv +0x40c019c BSC_obs_sigreturn +0x40c01a0 BSC_bind +0x40c01a4 BSC_setsockopt +0x40c01a8 BSC_listen +0x40c01ac BSC_obs_vtimes +0x40c01b0 BSC_obs_sigvec +0x40c01b4 BSC_obs_sigblock +0x40c01b8 BSC_obs_sigsetmask +0x40c01bc BSC_sigsuspend +0x40c01c0 BSC_obs_sigstack +0x40c01c4 BSC_obs_recvmsg +0x40c01c8 BSC_obs_sendmsg +0x40c01cc BSC_obs_vtrace +0x40c01d0 BSC_gettimeofday +0x40c01d4 BSC_getrusage +0x40c01d8 BSC_getsockopt +0x40c01dc BSC_obs_resuba +0x40c01e0 BSC_readv +0x40c01e4 BSC_writev +0x40c01e8 BSC_settimeofday +0x40c01ec BSC_fchown +0x40c01f0 BSC_fchmod +0x40c01f4 BSC_obs_recvfrom +0x40c01f8 BSC_setreuid +0x40c01fc BSC_setregid +0x40c0200 BSC_rename +0x40c0204 BSC_obs_truncate +0x40c0208 BSC_obs_ftruncate +0x40c020c BSC_flock +0x40c0210 BSC_mkfifo +0x40c0214 BSC_sendto +0x40c0218 BSC_shutdown +0x40c021c BSC_socketpair +0x40c0220 BSC_mkdir +0x40c0224 BSC_rmdir +0x40c0228 BSC_utimes +0x40c022c BSC_futimes +0x40c0230 BSC_adjtime +0x40c0234 BSC_obs_getpeername +0x40c0238 BSC_gethostuuid +0x40c023c BSC_obs_sethostid +0x40c0240 BSC_obs_getrlimit +0x40c0244 BSC_obs_setrlimit +0x40c0248 BSC_obs_killpg +0x40c024c BSC_setsid +0x40c0250 BSC_obs_setquota +0x40c0254 BSC_obs_qquota +0x40c0258 BSC_obs_getsockname +0x40c025c BSC_getpgid +0x40c0260 BSC_setprivexec +0x40c0264 BSC_pread +0x40c0268 BSC_pwrite +0x40c026c BSC_nfssvc +0x40c0270 BSC_obs_getdirentries +0x40c0274 BSC_statfs +0x40c0278 BSC_fstatfs +0x40c027c BSC_unmount +0x40c0280 BSC_obs_async_daemon +0x40c0284 BSC_getfh +0x40c0288 BSC_obs_getdomainname +0x40c028c BSC_obs_setdomainname +0x40c0290 BSC_#164 
+0x40c0294 BSC_quotactl +0x40c0298 BSC_obs_exportfs +0x40c029c BSC_mount +0x40c02a0 BSC_obs_ustat +0x40c02a4 BSC_csops +0x40c02a8 BSC_obs_table +0x40c02ac BSC_obs_wait3 +0x40c02b0 BSC_obs_rpause +0x40c02b4 BSC_waitid +0x40c02b8 BSC_obs_getdents +0x40c02bc BSC_obs_gc_control +0x40c02c0 BSC_add_profil +0x40c02c4 BSC_#177 +0x40c02c8 BSC_#178 +0x40c02cc BSC_#179 +0x40c02d0 BSC_kdebug_trace +0x40c02d4 BSC_setgid +0x40c02d8 BSC_setegid +0x40c02dc BSC_seteuid +0x40c02e0 BSC_sigreturn +0x40c02e4 BSC_chud +0x40c02e8 BSC_#186 +0x40c02ec BSC_fdatasync +0x40c02f0 BSC_stat +0x40c02f4 BSC_fstat +0x40c02f8 BSC_lstat +0x40c02fc BSC_pathconf +0x40c0300 BSC_fpathconf +0x40c0304 BSC_#193 +0x40c0308 BSC_getrlimit +0x40c030c BSC_setrlimit +0x40c0310 BSC_getdirentries +0x40c0314 BSC_mmap +0x40c0318 BSC_obs__syscall +0x40c031c BSC_lseek +0x40c0320 BSC_truncate +0x40c0324 BSC_ftruncate +0x40c0328 BSC_sysctl +0x40c032c BSC_mlock +0x40c0330 BSC_munlock +0x40c0334 BSC_undelete +0x40c0338 BSC_ATsocket +0x40c033c BSC_ATgetmsg +0x40c0340 BSC_ATputmsg +0x40c0344 BSC_ATPsndreq +0x40c0348 BSC_ATPsndrsp +0x40c034c BSC_ATPgetreq +0x40c0350 BSC_ATPgetrsp +0x40c0354 BSC_#213 +0x40c0358 BSC_#214 +0x40c035c BSC_#215 +0x40c0360 BSC_mkcomplex +0x40c0364 BSC_statv +0x40c0368 BSC_lstatv +0x40c036c BSC_fstatv +0x40c0370 BSC_getattrlist +0x40c0374 BSC_setattrlist +0x40c0378 BSC_getdirentriesattr +0x40c037c BSC_exchangedata +0x40c0380 BSC_#224 +0x40c0384 BSC_searchfs +0x40c0388 BSC_delete_Carbon +0x40c038c BSC_copyfile +0x40c0390 BSC_fgetattrlist +0x40c0394 BSC_fsetattrlist +0x40c0398 BSC_poll +0x40c039c BSC_watchevent +0x40c03a0 BSC_waitevent +0x40c03a4 BSC_modwatch +0x40c03a8 BSC_getxattr +0x40c03ac BSC_fgetxattr +0x40c03b0 BSC_setxattr +0x40c03b4 BSC_fsetxattr +0x40c03b8 BSC_removexattr +0x40c03bc BSC_fremovexattr +0x40c03c0 BSC_listxattr +0x40c03c4 BSC_flistxattr +0x40c03c8 BSC_fsctl +0x40c03cc BSC_initgroups +0x40c03d0 BSC_posix_spawn +0x40c03d4 BSC_ffsctl +0x40c03d8 BSC_#246 +0x40c03dc BSC_nfsclnt 
+0x40c03e0 BSC_fhopen +0x40c03e4 BSC_#249 +0x40c03e8 BSC_minherit +0x40c03ec BSC_semsys +0x40c03f0 BSC_msgsys +0x40c03f4 BSC_shmsys +0x40c03f8 BSC_semctl +0x40c03fc BSC_semget +0x40c0400 BSC_semop +0x40c0404 BSC_#257 +0x40c0408 BSC_msgctl +0x40c040c BSC_msgget +0x40c0410 BSC_msgsnd +0x40c0414 BSC_msgrcv +0x40c0418 BSC_shmat +0x40c041c BSC_shmctl +0x40c0420 BSC_shmdt +0x40c0424 BSC_shmget +0x40c0428 BSC_shm_open +0x40c042c BSC_shm_unlink +0x40c0430 BSC_sem_open +0x40c0434 BSC_sem_close +0x40c0438 BSC_sem_unlink +0x40c043c BSC_sem_wait +0x40c0440 BSC_sem_trywait +0x40c0444 BSC_sem_post +0x40c0448 BSC_sem_getvalue +0x40c044c BSC_sem_init +0x40c0450 BSC_sem_destroy +0x40c0454 BSC_open_extended +0x40c0458 BSC_umask_extended +0x40c045c BSC_stat_extended +0x40c0460 BSC_lstat_extended +0x40c0464 BSC_fstat_extended +0x40c0468 BSC_chmod_extended +0x40c046c BSC_fchmod_extended +0x40c0470 BSC_access_extended +0x40c0474 BSC_settid +0x40c0478 BSC_gettid +0x40c047c BSC_setsgroups +0x40c0480 BSC_getsgroups +0x40c0484 BSC_setwgroups +0x40c0488 BSC_getwgroups +0x40c048c BSC_mkfifo_extended +0x40c0490 BSC_mkdir_extended +0x40c0494 BSC_identitysvc +0x40c0498 BSC_shared_region_chk_np +0x40c049c BSC_shared_region_map_np +0x40c04a0 BSC_vm_pressure_monitor +0x40c04a4 BSC_psynch_rw_longrdlock +0x40c04a8 BSC_psynch_rw_yieldwrlock +0x40c04ac BSC_psynch_rw_downgrade +0x40c04b0 BSC_psynch_rw_upgrade +0x40c04b4 BSC_psynch_mutexwait +0x40c04b8 BSC_psynch_mutexdrop +0x40c04bc BSC_psynch_cvbroad +0x40c04c0 BSC_psynch_cvsignal +0x40c04c4 BSC_psynch_cvwait +0x40c04c8 BSC_psynch_rw_rdlock +0x40c04cc BSC_psynch_rw_wrlock +0x40c04d0 BSC_psynch_rw_unlock +0x40c04d4 BSC_psynch_rw_unlock2 +0x40c04d8 BSC_getsid +0x40c04dc BSC_settid_with_pid +0x40c04e0 BSC_psynch_cvclrprepost +0x40c04e4 BSC_aio_fsync +0x40c04e8 BSC_aio_return +0x40c04ec BSC_aio_suspend +0x40c04f0 BSC_aio_cancel +0x40c04f4 BSC_aio_error +0x40c04f8 BSC_aio_read +0x40c04fc BSC_aio_write +0x40c0500 BSC_lio_listio +0x40c0504 
BSC_obs_pthread_cond_wait +0x40c0508 BSC_iopolicysys +0x40c050c BSC_process_policy +0x40c0510 BSC_mlockall +0x40c0514 BSC_munlockall +0x40c0518 BSC_#326 +0x40c051c BSC_issetugid +0x40c0520 BSC_pthread_kill +0x40c0524 BSC_pthread_sigmask +0x40c0528 BSC_sigwait +0x40c052c BSC_disable_threadsignal +0x40c0530 BSC_pthread_markcancel +0x40c0534 BSC_pthread_canceled +0x40c0538 BSC_semwait_signal +0x40c053c BSC_obs_utrace +0x40c0540 BSC_proc_info +0x40c0544 BSC_sendfile +0x40c0548 BSC_stat64 +0x40c054c BSC_fstat64 +0x40c0550 BSC_lstat64 +0x40c0554 BSC_stat64_extended +0x40c0558 BSC_lstat64_extended +0x40c055c BSC_fstat64_extended +0x40c0560 BSC_getdirentries64 +0x40c0564 BSC_statfs64 +0x40c0568 BSC_fstatfs64 +0x40c056c BSC_getfsstat64 +0x40c0570 BSC_pthread_chdir +0x40c0574 BSC_pthread_fchdir +0x40c0578 BSC_audit +0x40c057c BSC_auditon +0x40c0580 BSC_#352 +0x40c0584 BSC_getauid +0x40c0588 BSC_setauid +0x40c058c BSC_getaudit +0x40c0590 BSC_setaudit +0x40c0594 BSC_getaudit_addr +0x40c0598 BSC_setaudit_addr +0x40c059c BSC_auditctl +0x40c05a0 BSC_bsdthread_create +0x40c05a4 BSC_bsdthread_terminate +0x40c05a8 BSC_kqueue +0x40c05ac BSC_kevent +0x40c05b0 BSC_lchown +0x40c05b4 BSC_stack_snapshot +0x40c05b8 BSC_bsdthread_register +0x40c05bc BSC_workq_open +0x40c05c0 BSC_workq_kernreturn +0x40c05c4 BSC_kevent64 +0x40c05c8 BSC_obs_semwait_signal +0x40c05cc BSC_obs_semwait_signal_nocancel +0x40c05d0 BSC_thread_selfid +0x40c05d4 BSC_#373 +0x40c05d8 BSC_#374 +0x40c05dc BSC_#375 +0x40c05e0 BSC_#376 +0x40c05e4 BSC_#377 +0x40c05e8 BSC_#378 +0x40c05ec BSC_#379 +0x40c05f0 BSC_mac_execve +0x40c05f4 BSC_mac_syscall +0x40c05f8 BSC_mac_get_file +0x40c0600 BSC_mac_get_link +0x40c0604 BSC_mac_set_link +0x40c0608 BSC_mac_get_proc +0x40c060c BSC_mac_set_proc +0x40c0610 BSC_mac_get_fd +0x40c0614 BSC_mac_set_fd +0x40c0618 BSC_mac_get_pid +0x40c061c BSC_mac_get_lcid +0x40c0620 BSC_mac_get_lctx +0x40c0624 BSC_mac_set_lctx +0x40c0628 BSC_setlcid +0x40c062c BSC_getlcid +0x40c0630 BSC_read_nocancel 
+0x40c0634 BSC_write_nocancel +0x40c0638 BSC_open_nocancel +0x40c063c BSC_close_nocancel +0x40c0640 BSC_wait4_nocancel +0x40c0644 BSC_recvmsg_nocancel +0x40c0648 BSC_sendmsg_nocancel +0x40c064c BSC_recvfrom_nocancel +0x40c0650 BSC_accept_nocancel +0x40c0654 BSC_msync_nocancel +0x40c0658 BSC_fcntl_nocancel +0x40c065c BSC_select_nocancel +0x40c0660 BSC_fsync_nocancel +0x40c0664 BSC_connect_nocancel +0x40c0668 BSC_sigsuspend_nocancel +0x40c066c BSC_readv_nocancel +0x40c0670 BSC_writev_nocancel +0x40c0674 BSC_sendto_nocancel +0x40c0678 BSC_pread_nocancel +0x40c067c BSC_pwrite_nocancel +0x40c0680 BSC_waitid_nocancel +0x40c0684 BSC_poll_nocancel +0x40c0688 BSC_msgsnd_nocancel +0x40c068c BSC_msgrcv_nocancel +0x40c0690 BSC_sem_wait_nocancel +0x40c0694 BSC_aio_suspend_nocancel +0x40c0698 BSC_sigwait_nocancel +0x40c069c BSC_semwait_signal_nocancel +0x40c06a0 BSC_mac_mount +0x40c06a4 BSC_mac_get_mount +0x40c06a8 BSC_mac_getfsstat +0x40c06ac BSC_fsgetpath +0x40c06b0 BSC_audit_session +0x40c06b4 BSC_audit_session_join +0x40c06b8 BSC_fileport_makeport +0x40c06bc BSC_fileport_makefd +0x40c06c0 BSC_audit_session_port +0x40c06c4 BSC_pid_suspend +0x40c06c8 BSC_pid_resume +0x40c06cc BSC_pid_hibernate +0x40c06d0 BSC_pid_shutdown_sockets +0x40c06d4 BSC_shared_region_slide_np +0x40c06fc BSC_shared_region_map_and_slide_np +0x40e0104 BSC_msync_extended_info +0x40e0264 BSC_pread_extended_info +0x40e0268 BSC_pwrite_extended_info +0x40e0314 BSC_mmap_extended_info +0x40f0314 BSC_mmap_extended_info2 +0x5000004 INTC_Handler +0x5010004 WL_CheckForWork +0x5010008 WL_RunEventSources +0x5020004 IES_client +0x5020008 IES_latency +0x502000c IES_sema +0x5020010 IES_intctxt +0x5020018 IES_action +0x502001c IES_filter +0x5030004 TES_client +0x5030008 TES_latency +0x503000c TES_sema +0x5030010 TES_action +0x5040004 CQ_client +0x5040008 CQ_latency +0x504000c CQ_sema +0x5040010 CQ_psema +0x5040014 CQ_plock +0x5040018 CG_action +0x5080004 IOSERVICE_BUSY +0x5080008 IOSERVICE_NONBUSY +0x508000c 
IOSERVICE_MODULESTALL +0x5080010 IOSERVICE_MODULEUNSTALL +0x5080014 IOSERVICE_TERM_PHASE1 +0x5080018 IOSERVICE_TERM_REQUEST_OK +0x508001c IOSERVICE_TERM_REQUEST_FAIL +0x5080020 IOSERVICE_TERM_SCHEDULE_STOP +0x5080024 IOSERVICE_TERM_SCHEDULE_FINALIZE +0x5080028 IOSERVICE_TERM_WILL +0x508002c IOSERVICE_TERM_DID +0x5080030 IOSERVICE_TERM_DID_DEFER +0x5080034 IOSERVICE_TERM_FINALIZE +0x5080038 IOSERVICE_TERM_STOP +0x508003c IOSERVICE_TERM_STOP_NOP +0x5080040 IOSERVICE_TERM_STOP_DEFER +0x5080044 IOSERVICE_TERM_DONE +0x5080048 IOSERVICE_KEXTD_ALIVE +0x508004C IOSERVICE_KEXTD_READY +0x5080050 IOSERVICE_REGISTRY_QUIET +0x5100004 PM_SetParent +0x5100008 PM_AddChild +0x510000c PM_RemoveChild +0x5100010 PM_CtrlDriver +0x5100014 PM_CtrlDrvrE1 +0x5100018 PM_CtrlDrvrE2 +0x510001c PM_CtrlDrvrE3 +0x5100020 PM_CtrlDrvrE4 +0x5100024 PM_IntDriver +0x5100028 PM_AckE1 +0x510002c PM_ChildAck +0x5100030 PM_DriverAck +0x5100034 PM_AckE2 +0x5100038 PM_AckE3 +0x510003c PM_AckE4 +0x5100040 PM_DrvrAckSPwr +0x5100044 PM_WillChange +0x5100048 PM_DidChange +0x510004c PM_ReqstDomain +0x5100050 PM_MakeUsable +0x5100054 PM_ChangeTo +0x5100058 PM_ChngeToPriv +0x510005c PM_SetAggrssvs +0x5100060 PM_CritclTemp +0x5100064 PM_OverrideOn +0x5100068 PM_OverrideOff +0x510006c PM_EnqueueErr +0x5100070 PM_CollapseQ +0x5100074 PM_ChangeDone +0x5100078 PM_CtrlDrvTrdy +0x510007c PM_IntDrvrTrdy +0x5100080 PM_StartAckTmr +0x5100084 PM_ParentChnge +0x5100088 PM_AmndPrnChng +0x510008c PM_DeviceChnge +0x5100090 PM_ReqDenied +0x5100094 PM_CtrlDrvrE45 +0x5100098 PM_PrgrmHrdwre +0x510009c PM_InfDrvrPre +0x51000a0 PM_InfDrvrPost +0x51000a4 PM_RemoveDrivr +0x51000a8 PM_IdlTimerPrd +0x51000ac PM_SystemWake +0x51000b0 PM_AckE5 +0x51000b4 PM_ClientAck +0x51000b8 PM_ClientTardy +0x51000bc PM_ClientCancl +0x51000c0 PM_ClientNotfy +0x51000c4 PM_AppNotify +0x5230000 HID_Unexpected +0x5230004 HID_KeyboardLEDThreadTrigger +0x5230008 HID_KeyboardLEDThreadActive +0x523000c HID_KeyboardSetParam +0x5230010 
HID_KeyboardCapsThreadTrigger +0x5230014 HID_KeyboardCapsThreadActive +0x5230018 HID_PostEvent +0x523001c HID_NewUserClient +0x5230020 HID_InturruptReport +0x5230024 HID_DispatchScroll +0x5230028 HID_DispatchRelativePointer +0x523002c HID_DispatchAbsolutePointer +0x5230030 HID_DispatchKeyboard +0x5230034 HID_EjectCallback +0x5230038 HID_CapsCallback +0x523003c HID_#3c +0x523004c HID_#4c +0x5310004 CPUPM_PSTATE +0x5310008 CPUPM_IDLE_CSTATE +0x531000c CPUPM_IDLE_HALT +0x5310010 CPUPM_IDLE_LOOP +0x5310014 CPUPM_HPET_START +0x5310018 CPUPM_HPET_END +0x531001c CPUPM_HPET_INTR +0x5310020 CPUPM_PSTATE_HW +0x5310024 CPUPM_PSTATE_LIMIT +0x5310028 CPUPM_PSTATE_PARK +0x531002c CPUPM_PSTATE_START +0x5310030 CPUPM_PSTATE_PAUSE +0x5310034 CPUPM_PSTATE_RESUME +0x5310038 CPUPM_PSTATE_DOWN +0x531003c CPUPM_PSTATE_UP +0x5310040 CPUPM_PSTATE_NORM +0x5310044 CPUPM_PSTATE_FORCE +0x5310048 CPUPM_PSTATE_TIMEOUT +0x531004c CPUPM_PSTATE_SETTO +0x5310050 CPUPM_SET_DEADLINE +0x5310054 CPUPM_GET_DEADLINE +0x5310058 CPUPM_DEADLINE +0x531005c CPUPM_IDLE_SNOOP +0x5310060 CPUPM_IDLE_LATENCY +0x5310064 CPUPM_IDLE_WAKEUP +0x5310068 CPUPM_IDLE_SW_WAKEUP +0x531006c CPUPM_IDLE_SELECT +0x5310070 CPUPM_IDLE_SELECTED +0x5310074 CPUPM_IDLE_INTSKIP +0x5310078 CPUPM_IDLE_LOCK +0x531007c CPUPM_IDLE_UNLOCK +0x5310080 CPUPM_IDLE_NO_HPET +0x5310084 CPUPM_FI_UP +0x5310088 CPUPM_FI_UP_CPU +0x531008c CPUPM_FI_MP +0x5310090 CPUPM_FI_MP_CPU +0x5310094 CPUPM_FI_PAUSE +0x5310098 CPUPM_FI_RUN +0x531009c CPUPM_PROC_HALT +0x53100a0 CPUPM_TRACE_STOPPED +0x53100a4 CPUPM_HPET_INT_LOCK +0x53100a8 CPUPM_HPET_INT_UNLOCK +0x53100ac CPUPM_HPET_TRY_AGAIN +0x53100b0 CPUPM_HPET_SETDEADLINE +0x53100b4 CPUPM_LOCK_HELDBY +0x53100b8 CPUPM_HPET_DELTA +0x53100bc CPUPM_HPET_TOO_LATE +0x53100c0 CPUPM_HPET_NO_DEADLINE +0x53100c4 CPUPM_IDLE +0x53100c8 CPUPM_CORE_CHK_DEADLINE +0x53100cc CPUPM_SET_HPET_DEADLINE +0x53100d0 CPUPM_HPET_READ +0x53100d4 CPUPM_TIME_ADJUST +0x53100d8 CPUPM_IDLE_MWAIT +0x53100dc CPUPM_FI_SLAVE_IDLE +0x53100e0 
CPUPM_FI_SLAVE_BLOCK +0x53100e4 CPUPM_FI_MAST_SIGNAL +0x53100e8 CPUPM_CORE_DEADLINE +0x53100ec CPUPM_IDLE_FAST +0x53100f0 CPUPM_IDLE_PAUSE +0x53100f4 CPUPM_IDLE_SHORT +0x53100f8 CPUPM_IDLE_NORMAL +0x53100fc CPUPM_IDLE_SPURIOUS +0x5310100 CPUPM_PSTATE_INFO +0x5310104 CPUPM_PSTATE_INFO_HW +0x5310108 CPUPM_PSTATE_FSM +0x531010c CPUPM_PSTATE_FSM_STEP +0x5310110 CPUPM_PSTATE_FSM_EVAL +0x5310114 CPUPM_PSTATE_FSM_MAP +0x5310118 CPUPM_CPUSTEP_STEP +0x531011c CPUPM_CPUSTEP_STEP_UP +0x5310120 CPUPM_CPUSTEP_STEP_DOWN +0x5310124 CPUPM_CPUSTEP_AVAIL +0x5310128 CPUPM_CPUSTEP_AVAIL_STEP +0x531012c CPUPM_CPUSTEP_AVAIL_CHNG +0x5310130 CPUPM_CPUSTEP_LOAD +0x5310134 CPUPM_CPUSTEP_START +0x5310138 CPUPM_CPUSTEP_STOP +0x531013c CPUPM_CPUSTEP_COPY +0x5310140 CPUPM_CPUSTEP_CLEAR +0x5310144 CPUPM_CPUSTEP_RUNCOUNT +0x5310148 CPUPM_CPUSTEP_WAKEUP +0x531014c CPUPM_PSTATE_TRACE +0x5310150 CPUPM_PSTATE_EVENT +0x5310154 CPUPM_IDLE_RATE +0x5310158 CPUPM_PSTATE_FSM_RESUME +0x531015c CPUPM_PSTATE_FSM_PAUSE +0x5310160 CPUPM_PSTATE_INSTRUCTION +0x5310164 CPUPM_PSTATE_INST_ARG +0x5310168 CPUPM_PSTATE_STACK_PUSH +0x531016c CPUPM_PSTATE_STACK_POP +0x5310170 CPUPM_IDLE_PREFIRE +0x5310174 CPUPM_PSTATE_VERIFY +0x5310178 CPUPM_TIMER_MIGRATE +0x531017c CPUPM_RING_LIMIT +0x5310180 CPUPM_CONTEXT_PAUSE +0x5310184 CPUPM_CONTEXT_RESUME +0x5310188 CPUPM_CONTEXT_RESUME_INFO +0x531018c CPUPM_THREAD_RESUME +0x5310190 CPUPM_THREAD_PAUSE_INFO +0x5310194 CPUPM_THREAD_RESUME_INFO +0x5310198 CPUPM_TEST_MASTER_INFO +0x531019c CPUPM_TEST_SLAVE_INFO +0x53101a0 CPUPM_TEST_INFO +0x53101a4 CPUPM_TEST_RUN_INFO +0x53101a8 CPUPM_TEST_SLAVE_INFO +0x5330000 HIBERNATE +0x5330004 HIBERNATE_WRITE_IMAGE +0x5330008 HIBERNATE_MACHINE_INIT +0x533000c HIBERNATE_FLUSH_MEMORY +0x5330010 HIBERNATE_flush_queue +0x5330014 HIBERNATE_flush_wait +0x5330018 HIBERNATE_flush_in_progress +0x533001c HIBERNATE_flush_bufs +0x5330020 HIBERNATE_page_list_setall +0x5330024 HIBERNATE_aes_decrypt_cbc +0x7000004 TRACE_DATA_NEWTHREAD +0x7000008 TRACE_DATA_EXEC 
+0x7010004 TRACE_STRING_NEWTHREAD +0x7010008 TRACE_STRING_EXEC +0x7020000 TRACE_PANIC +0x7020004 TRACE_TIMESTAMPS +0x7020008 TRACE_LOST_EVENTS +0x702000c TRACE_WRITING_EVENTS +0x8000000 USER_TEST +0x8000004 USER_run +0x8000008 USER_join +0x800000c USER_create +0x8000010 USER_pthread_create +0x8000014 USER_pthread_exit +0x8000018 USER_pthread_join +0x800001c USER_pthread_run +0x8000020 USER_pthread_cleanup_push +0x8000100 FW_underrun +0x8000104 FW_interrupt +0x8000108 FW_workloop +0x8010400 F_DLIL_Input +0x8010800 F_DLIL_Output +0x8010c00 F_DLIL_IfOut +0x8040000 USER_STOP +0x9000084 wq_deallocate_stack +0x9000088 wq_allocate_stack +0x9008070 wq_run_item +0x9008074 wq_clean_thread +0x9008078 wq_post_done +0x900807c wq_stk_cleanup +0x9008080 wq_tsd_cleanup +0x9008084 wq_tsd_destructor +0x9008088 wq_pthread_exit +0x900808c wq_workqueue_exit +0xa000100 P_CS_Read +0xa000110 P_CS_Write +0xa000180 P_CS_ReadDone +0xa000190 P_CS_WriteDone +0xa000200 P_CS_ReadChunk +0xa000210 P_CS_WriteChunk +0xa000280 P_CS_ReadChunkDone +0xa000290 P_CS_WriteChunkDone +0xa000300 P_CS_ReadCrypto +0xa000310 P_CS_WriteCrypto +0xa000500 P_CS_Originated_Read +0xa000510 P_CS_Originated_Write +0xa000580 P_CS_Originated_ReadDone +0xa000590 P_CS_Originated_WriteDone +0xa000900 P_CS_MetaRead +0xa000910 P_CS_MetaWrite +0xa000980 P_CS_MetaReadDone +0xa000990 P_CS_MetaWriteDone +0xa008000 P_CS_SYNC_DISK +0xa008004 P_CS_WaitForBuffer +0xa008008 P_CS_NoBuffer +0xb000000 AFP_asp_tcp_usr_send +0xb000004 AFP_asp_tcp_usr_send_after_Request +0xb000008 AFP_asp_tcp_usr_send_after_FindDSIReq +0xb00000c AFP_asp_tcp_usr_send_after_Reply +0xb000010 AFP_asp_tcp_slowtimo +0xb000014 AFP_asp_tcp_usr_control +0xb000018 AFP_asp_tcp_fasttimo +0xb000020 AFP_Send +0xb000024 AFP_Send_before_sosend +0xb000028 AFP_Send_after_sosend +0xb00002c AFP_Send_before_write +0xb000030 AFP_Send_after_write +0xb000040 AFP_Reply +0xb000044 AFP_Reply_rcvdAlready +0xb000048 AFP_Reply_before_RcvLock +0xb00004c AFP_Reply_fail_RcvLock +0xb000050 
AFP_Reply_before_ReadDSIHdr +0xb000054 AFP_Reply_after_ReadDSIHdr +0xb000058 AFP_Reply_fail_ReadDSIHdr +0xb00005c AFP_Reply_after_FindDSIReqInfo +0xb000060 AFP_Reply_SetAFPCmd +0xb000064 AFP_Reply_before_ReadDSIPacket +0xb000068 AFP_Reply_setRcvdReplyLen +0xb000070 AFP_SendReply +0xb000080 AFP_CreateDSIHeader +0xb000084 AFP_CreateDSIHeader_after_GetReqID +0xb000090 AFP_Request +0xb0000a0 AFP_ReceiveLock +0xb0000b0 AFP_ReceiveWakeUp +0xb0000c0 AFP_ReceiveUnLock +0xb0000e0 AFP_SendLock +0xb0000e4 AFP_SendUnLock +0xb0000f0 AFP_SendQueueLock +0xb000100 AFP_SendQueueUnLock +0xb000110 AFP_ReadDSIHeader +0xb000120 AFP_Receive +0xb000124 AFP_Receive_before_sorcv +0xb000128 AFP_Receive_after_sorcv +0xb000130 AFP_ReadDSIPacket +0xb000140 AFP_DoCopyOut +0xb000150 AFP_DoCopyIn +0xb000160 AFP_CheckRcvTickle +0xb000164 AFP_CheckRcvTickleTO +0xb000170 AFP_CheckSendTickle +0xb000180 AFP_CheckIncomingPkts +0xb000190 AFP_ProcessOptions +0xb000200 AFP_FindDSIReqInfo +0xb000204 AFP_FindDSIReqInfo_foundReqInfo +0xb000208 AFP_FindDSIReqInfo_flags +0xb00020c AFP_FindDSIReqLeave +0xb000210 AFP_UsrDisconnect +0xc000000 AFPVFS_UserReply +0xc000004 AFPVFS_UserReplyGetMbuf +0xc000008 AFPVFS_UserReplysosend +0xc000010 AFPVFS_UserCommand +0xc000018 AFPVFS_UserCommandsosend +0xc000020 AFPVFS_ReadFork +0xc000024 AFPVFS_ReadForkFillQPB +0xc000028 AFPVFS_ReadForkNbrRequests +0xc00002c AFPVFS_ReadForkSendQPB +0xc000030 AFPVFS_ReadForkSendErr +0xc000040 AFPVFS_ReadForkGetReply +0xc000044 AFPVFS_ReadForkGetReplyResult +0xc000050 AFPVFS_WriteFork +0xc000054 AFPVFS_WriteForkFillQPB +0xc000058 AFPVFS_WriteForkNbrRequests +0xc00005c AFPVFS_WriteForkSendQPB +0xc000060 AFPVFS_WriteForkSendErr +0xc000064 AFPVFS_WriteForkGetReply +0xc000068 AFPVFS_WriteForkGetReplyResult +0xc000070 AFPVFS_GetAttr +0xc000080 AFPVFS_SetAttr +0xc000090 AFPVFS_GetAttrList +0xc0000a0 AFPVFS_SetAttrList +0xc0000b0 AFPVFS_FSCTL +0xc0000c0 AFPVFS_LookUp +0xc0000d0 AFPVFS_CacheLookUp +0xc0000e0 AFPVFS_Write +0xc0000e4 
AFPVFS_WriteNoCluster +0xc0000e8 AFPVFS_WriteDone +0xc0000f0 AFPVFS_DoWrite +0xc000100 AFPVFS_Lock +0xc000110 AFPVFS_Statfs +0xc000120 AFPVFS_Sync +0xc000130 AFPVFS_VGet +0xc000140 AFPVFS_FlushFiles +0xc000150 AFPVFS_Create +0xc000160 AFPVFS_Mknod +0xc000170 AFPVFS_Open +0xc000180 AFPVFS_Close +0xc000190 AFPVFS_Access +0xc000194 AFPVFS_AccessUID +0xc000198 AFPVFS_AccessGID +0xc00019c AFPVFS_AccessWID +0xc0001a0 AFPVFS_Writeperm +0xc0001b0 AFPVFS_Chmod +0xc0001c0 AFPVFS_Chflags +0xc0001d0 AFPVFS_Exchange +0xc0001e0 AFPVFS_Chid +0xc0001f0 AFPVFS_Fsync +0xc000200 AFPVFS_Remove +0xc000210 AFPVFS_Rename +0xc000220 AFPVFS_Copyfile +0xc000230 AFPVFS_Mkdir +0xc000240 AFPVFS_Symlink +0xc000250 AFPVFS_Readdir +0xc000260 AFPVFS_Readdirattr +0xc000264 AFPVFS_Readdirattr1 +0xc000268 AFPVFS_Readdirattr2 +0xc00026c AFPVFS_Readdirattr3 +0xc000270 AFPVFS_Readlink +0xc000280 AFPVFS_Abortop +0xc000290 AFPVFS_Inactive +0xc0002a0 AFPVFS_Reclaim +0xc0002b0 AFPVFS_Unlock +0xc0002c0 AFPVFS_Islocked +0xc0002d0 AFPVFS_Pathconf +0xc0002e0 AFPVFS_Update +0xc0002f0 AFPVFS_Makenode +0xc000300 AFPVFS_Allocate +0xc000310 AFPVFS_Search +0xc000320 AFPVFS_Reconnect +0xc0003e0 AFPVFS_Rmdir +0xc0003f0 AFPVFS_Vinit +0x11000000 DNC_PURGE1 +0x11000004 DNC_PURGE2 +0x11000008 DNC_FOUND +0x1100000c DNC_FAILED +0x11000010 DNC_ENTER +0x11000014 DNC_remove_name +0x11000018 DNC_ENTER_CREATE +0x1100001c DNC_update_identity +0x11000020 DNC_PURGE +0x11000030 DNC_LOOKUP_PATH +0x11000034 HFS_vnop_lookup +0x11000038 NAMEI +0x11000048 VFS_SUSPENDED +0x1100004C VFS_CACHEPURGE +0x11000050 VFS_CACHELOOKUP_SUCCESS +0x11000054 VFS_CACHELOOKUP_FAILED +0x11000058 VFS_CACHELOOKUP_ENTER +0x1100005c VFS_CACHELOOKUP +0x11000060 VFS_GETIOCOUNT +0x11000064 VFS_vnode_recycle +0x11000068 VFS_vnode_reclaim +0x11000070 HFS_getnewvnode1 +0x11000074 HFS_getnewvnode2 +0x11000078 HFS_chash_getcnode +0x1100007c HFS_vfs_getpath +0x11000080 VOLFS_lookup +0x11000084 lookup_mountedhere +0x11000088 VNOP_LOOKUP +0x1100008c HFS_chash_getvnode 
+0x11000090 VFS_vnode_rele +0x11000094 VFS_vnode_put +0x11004100 NC_lock_shared +0x11004104 NC_lock_exclusive +0x11004108 NC_unlock +0x1f000000 DYLD_initialize +0x1f010000 DYLD_CALL_image_init_routine +0x1f010004 DYLD_CALL_dependent_init_routine +0x1f010008 DYLD_CALL_lazy_init_routine +0x1f01000c DYLD_CALL_module_init_for_library +0x1f010010 DYLD_CALL_module_init_for_object +0x1f010014 DYLD_CALL_module_terminator_for_object +0x1f010018 DYLD_CALL_module_init_for_dylib +0x1f01001c DYLD_CALL_mod_term_func +0x1f010020 DYLD_CALL_object_func +0x1f010024 DYLD_CALL_library_func +0x1f010028 DYLD_CALL_add_image_func +0x1f01002c DYLD_CALL_remove_image_func +0x1f010030 DYLD_CALL_link_object_module_func +0x1f010034 DYLD_CALL_link_library_module_func +0x1f010038 DYLD_CALL_link_module_func +0x1f020000 DYLD_lookup_and_bind_with_hint +0x1f020004 DYLD_lookup_and_bind_fully +0x1f020008 DYLD_link_module +0x1f02000c DYLD_ulink_module +0x1f020010 DYLD_bind_objc_module +0x1f020014 DYLD_bind_fully_image_containing_address +0x1f020018 DYLD_make_delayed_module_initializer_calls +0x1f02001c DYLD_NSNameOfSymbol +0x1f020020 DYLD_NSAddressOfSymbol +0x1f020024 DYLD_NSModuleForSymbol +0x1f020028 DYLD_NSLookupAndBindSymbolWithHint +0x1f02002c DYLD_NSLookupSymbolInModule +0x1f020030 DYLD_NSLookupSymbolInImage +0x1f020034 DYLD_NSIsSymbolNameDefined +0x1f020038 DYLD_NSIsSymbolNameDefinedWithHint +0x1f02003c DYLD_NSIsSymbolNameDefinedInImage +0x1f020040 DYLD_NSNameOfModule +0x1f020044 DYLD_NSLibraryNameForModule +0x1f020048 DYLD_NSAddLibrary +0x1f02004c DYLD_NSAddLibraryWithSearching +0x1f020050 DYLD_NSAddImage +0x1f030000 DYLD_lookup_symbol +0x1f030004 DYLD_bind_lazy_symbol_reference +0x1f030008 DYLD_bind_symbol_by_name +0x1f03000c DYLD_link_in_need_modules +0x1f040000 DYLD_map_image +0x1f040004 DYLD_load_executable_image +0x1f040008 DYLD_load_library_image +0x1f04000c DYLD_map_library_image +0x1f040010 DYLD_map_bundle_image +0x1f040014 DYLD_load_dependent_libraries +0x1f040018 
DYLD_notify_prebinding_agent +0x1ff10000 SCROLL_BEGIN_obs +0x1ff10100 SCROLL_END_obs +0x1ff20000 BOOT_BEGIN_obs +0x1ff20100 BOOT_END_obs +0x1ff20400 APP_DidActivateWindow_obs +0x1ff20500 TOOL_PRIVATE_1_obs +0x1ff20504 TOOL_PRIVATE_2_obs +0x1ff20508 TOOL_PRIVATE_3_obs +0x1ff2050c TOOL_PRIVATE_4_obs +0x1fff0000 LAUNCH_START_FINDER +0x1fff0100 LAUNCH_START_DOCK +0x1fff0200 LAUNCH_LSOpen +0x1fff0204 LAUNCH_LSRegisterItem +0x1fff0208 LAUNCH_LSGetApplicationAndFlagsForInfo +0x1fff0300 LAUNCH_CPSLaunch +0x1fff0304 LAUNCH_CPSRegisterwithServer +0x1fff0308 LAUNCH_CGSCheckInNewProcess +0x1fff030c LAUNCH_CPSExecProcess +0x1fff0310 LAUNCH_APP_EnterEventLoop +0x1fff0314 LAUNCH_APP_WillOpenUntitled +0x1fff031c LAUNCH_APP_DidOpenUntitled +0x1fff1000 LAUNCH_END +0x1fffffff LAUNCH_END +0x20000004 RTC_sync_TBR +0x21010000 SCROLL_BEGIN +0x21020000 BOOT_BEGIN +0x21030200 LOGIN_BEGIN +0x21030204 LOGINWINDOW_LAUNCHED +0x21030208 LOGINWINDOW_LAUNCHES_SA +0x2103020c LOGINWINDOW_GUI_APPEARS +0x21030210 LOGINWINDOW_LOGIN_CLICKED +0x21030214 LOGINWINDOW_ASKS_AUTH +0x21030218 LOGINWINDOW_AUTH_SUCCEEDED +0x2103021c LOGINWINDOW_LAUNCHES_DOCK +0x21030220 LOGINWINDOW_LAUNCHES_SUIS +0x21030224 LOGINWINDOW_LAUNCHES_FINDER +0x21030228 LOGINWINDOW_DOCK_LAUNCHED +0x2103022c LOGINWINDOW_SUIS_LAUNCHED +0x21030230 LOGINWINDOW_FINDER_LAUNCHED +0x21030234 LOGINWINDOW_LOGOUT_CLICKED +0x21030238 LOGINWINDOW_QUIT_FGAPPS +0x2103023c LOGINWINDOW_FGAPPS_QUIT +0x21030240 LOGINWINDOW_QUIT_SUIS +0x21030244 LOGINWINDOW_SUIS_DIES +0x21030248 LOGINWINDOW_QUIT_FINDER +0x2103024c LOGINWINDOW_FINDER_DIES +0x21030250 LOGINWINDOW_QUIT_DOCK +0x21030254 LOGINWINDOW_DOCK_DIES +0x21030258 LOGINWINDOW_EXIT +0x2103025c LOGINWINDOW_FUS_SELUSERNAME +0x21030260 LOGINWINDOW_FUS_SELLOGINWIND +0x21030270 LOGIN_APPLICATION_EXECUTING +0x21030274 LOGIN_APPLICATION_USABLE +0x21030300 LOGIN_END +0x21030500 LOGINWINDOW_APP_TERMINATION_REQUEST +0x21030504 LOGINWINDOW_LOGOUT_START +0x21030508 LOGINWINDOW_DESKTOP_UP +0x2103050c 
LOGINWINDOW_DESKTOP_UP_NOTIFICATION +0x21040000 APP_DIDActivateWindow +0x21050000 TOOL_PRIVATE_1 +0x21050004 TOOL_PRIVATE_2 +0x21050008 TOOL_PRIVATE_3 +0x2105000c TOOL_PRIVATE_4 +0x21060000 LAUNCH_CPSTraceLineNum +0x21060004 LAUNCH_CPSLaunch +0x21060008 LAUNCH_CPSRegisterwithServer +0x2106000c LAUNCH_CPSCheckInNewProcess +0x21060010 LAUNCH_CPSServerSideLaunch +0x21060014 LAUNCH_CPSExecProcess +0x21070000 LAUNCH_LSOpen +0x21070004 LAUNCH_LSRegisterItem +0x21070008 LAUNCH_LSGetApplicationAndFlagsForInfo +0x21080000 MCX_DAEMON_START +0x21080004 MCX_DAEMON_FINISH +0x21080008 MCX_STARTMCX_START +0x2108000C MCX_STARTMCX_FINISH +0x21080010 MCX_POSTCMP_DOCK_START +0x21080014 MCX_POSTCMP_DOCK_FINISH +0x21080020 MCX_POSTCMP_ENERGYSVR_START +0x21080024 MCX_POSTCMP_ENERGYSVR_FINISH +0x21080030 MCX_POSTCMP_LOGINITMS_START +0x21080034 MCX_POSTCMP_LOGINITMS_FINISH +0x21080040 MCX_CMP_COMPUTERINFO_START +0x21080044 MCX_CMP_COMPUTERINFO_FINISH +0x21080050 MCX_CMP_USERINFO_START +0x21080054 MCX_CMP_USERINFO_FINISH +0x21080060 MCX_POSTCMP_USER_START +0x21080064 MCX_POSTCMP_USER_FINISH +0x210800A0 MCX_MECHANISM_START +0x210800A4 MCX_MECHANISM_FINISH +0x210800C0 MCX_MECHANISM_PICKER_START +0x210800C4 MCX_MECHANISM_PICKER_FINISH +0x21080100 MCX_APPITEMS_START +0x21080104 MCX_APPITEMS_FINISH +0x21080200 MCX_CACHER_START +0x21080204 MCX_CACHER_FINISH +0x21080300 MCX_COMPOSITOR_START +0x21080304 MCX_COMPOSITOR_FINISH +0x21080400 MCX_DISKSETUP_START +0x21080404 MCX_DISKSETUP_FINISH +0x21090000 PHD_DAEMON_START +0x21090004 PHD_DAEMON_FINISH +0x21090010 PHD_SYNCNOW_START +0x21090014 PHD_SYNCNOW_FINISH +0x210b0000 TAL_APP_LAUNCH_START +0x210b0004 TAL_APP_LAUNCH_UNSUSPENDED +0x210b0008 TAL_APP_LAUNCH_UNTHROTTLED +0x210b000c TAL_APP_LAUNCH_VISIBLE +0x210b0010 TAL_APP_LAUNCH_READY +0x210b0014 TAL_ALL_LAUNCH_READY +0x21800000 SMB_smbd_idle +0x21800004 SMB_syscall_opendir +0x21800008 SMB_syscall_readdir +0x2180000c SMB_syscall_seekdir +0x21800010 SMB_syscall_telldir +0x21800014 
SMB_syscall_rewinddir +0x21800018 SMB_syscall_mkdir +0x2180001c SMB_syscall_rmdir +0x21800020 SMB_syscall_closedir +0x21800024 SMB_syscall_open +0x21800028 SMB_syscall_close +0x2180002c SMB_syscall_read +0x21800030 SMB_syscall_pread +0x21800034 SMB_syscall_write +0x21800038 SMB_syscall_pwrite +0x2180003c SMB_syscall_lseek +0x21800040 SMB_syscall_sendfile +0x21800044 SMB_syscall_rename +0x21800048 SMB_syscall_fsync +0x2180004c SMB_syscall_stat +0x21800050 SMB_syscall_fstat +0x21800054 SMB_syscall_lstat +0x21800058 SMB_syscall_unlink +0x2180005c SMB_syscall_chmod +0x21800060 SMB_syscall_fchmod +0x21800064 SMB_syscall_chown +0x21800068 SMB_syscall_fchown +0x2180006c SMB_syscall_chdir +0x21800070 SMB_syscall_getwd +0x21800074 SMB_syscall_utime +0x21800078 SMB_syscall_ftruncate +0x2180007c SMB_syscall_fcntl_lock +0x21800080 SMB_syscall_kernel_flock +0x21800084 SMB_syscall_fcntl_getlock +0x21800088 SMB_syscall_readlink +0x2180008c SMB_syscall_symlink +0x21800090 SMB_syscall_link +0x21800094 SMB_syscall_mknod +0x21800098 SMB_syscall_realpath +0x2180009c SMB_syscall_get_quota +0x218000a0 SMB_syscall_set_quota +0x218000a4 SMB_smbmkdir +0x218000a8 SMB_smbrmdir +0x218000ac SMB_smbopen +0x218000b0 SMB_smbcreate +0x218000b4 SMB_smbclose +0x218000b8 SMB_smbflush +0x218000bc SMB_smbunlink +0x218000c0 SMB_smbmv +0x218000c4 SMB_smbgetatr +0x218000c8 SMB_smbsetatr +0x218000cc SMB_smbread +0x218000d0 SMB_smbwrite +0x218000d4 SMB_smblock +0x218000d8 SMB_smbunlock +0x218000dc SMB_smbctemp +0x218000e0 SMB_smbmknew +0x218000e4 SMB_smbcheckpath +0x218000e8 SMB_smbexit +0x218000ec SMB_smblseek +0x218000f0 SMB_smblockread +0x218000f4 SMB_smbwriteunlock +0x218000f8 SMB_smbreadbraw +0x218000fc SMB_smbreadbmpx +0x21800100 SMB_smbreadbs +0x21800104 SMB_smbwritebraw +0x21800108 SMB_smbwritebmpx +0x2180010c SMB_smbwritebs +0x21800110 SMB_smbwritec +0x21800114 SMB_smbsetattre +0x21800118 SMB_smbgetattre +0x2180011c SMB_smblockingx +0x21800120 SMB_smbtrans +0x21800124 SMB_smbtranss +0x21800128 
SMB_smbioctl +0x2180012c SMB_smbioctls +0x21800130 SMB_smbcopy +0x21800134 SMB_smbmove +0x21800138 SMB_smbecho +0x2180013c SMB_smbwriteclose +0x21800140 SMB_smbopenx +0x21800144 SMB_smbreadx +0x21800148 SMB_smbwritex +0x2180014c SMB_smbtrans2 +0x21800150 SMB_smbtranss2 +0x21800154 SMB_smbfindclose +0x21800158 SMB_smbfindnclose +0x2180015c SMB_smbtcon +0x21800160 SMB_smbtdis +0x21800164 SMB_smbnegprot +0x21800168 SMB_smbsesssetupx +0x2180016c SMB_smbulogoffx +0x21800170 SMB_smbtconx +0x21800174 SMB_smbdskattr +0x21800178 SMB_smbsearch +0x2180017c SMB_smbffirst +0x21800180 SMB_smbfunique +0x21800184 SMB_smbfclose +0x21800188 SMB_smbnttrans +0x2180018c SMB_smbnttranss +0x21800190 SMB_smbntcreatex +0x21800194 SMB_smbntcancel +0x21800198 SMB_smbntrename +0x2180019c SMB_smbsplopen +0x218001a0 SMB_smbsplwr +0x218001a4 SMB_smbsplclose +0x218001a8 SMB_smbsplretq +0x218001ac SMB_smbsends +0x218001b0 SMB_smbsendb +0x218001b4 SMB_smbfwdname +0x218001b8 SMB_smbcancelf +0x218001bc SMB_smbgetmac +0x218001c0 SMB_smbsendstrt +0x218001c4 SMB_smbsendend +0x218001c8 SMB_smbsendtxt +0x218001cc SMB_smbinvalid +0x218001d0 SMB_pathworks_setdir +0x218001d4 SMB_trans2_open +0x218001d8 SMB_trans2_findfirst +0x218001dc SMB_trans2_findnext +0x218001e0 SMB_trans2_qfsinfo +0x218001e4 SMB_trans2_setfsinfo +0x218001e8 SMB_trans2_qpathinfo +0x218001ec SMB_trans2_setpathinfo +0x218001f0 SMB_trans2_qfileinfo +0x218001f4 SMB_trans2_setfileinfo +0x218001f8 SMB_trans2_fsctl +0x218001fc SMB_trans2_ioctl +0x21800200 SMB_trans2_findnotifyfirst +0x21800204 SMB_trans2_findnotifynext +0x21800208 SMB_trans2_mkdir +0x2180020c SMB_trans2_session_setup +0x21800210 SMB_trans2_get_dfs_referral +0x21800214 SMB_trans2_report_dfs_inconsistancy +0x21800218 SMB_nt_transact_create +0x2180021c SMB_nt_transact_ioctl +0x21800220 SMB_nt_transact_set_security_desc +0x21800224 SMB_nt_transact_notify_change +0x21800228 SMB_nt_transact_rename +0x2180022c SMB_nt_transact_query_security_desc +0x21800230 
SMB_nt_transact_get_user_quota +0x21800234 SMB_nt_transact_set_user_quota +0x21800238 SMB_get_nt_acl +0x2180023c SMB_fget_nt_acl +0x21800240 SMB_set_nt_acl +0x21800244 SMB_fset_nt_acl +0x21800248 SMB_chmod_acl +0x2180024c SMB_fchmod_acl +0x21800250 SMB_name_release +0x21800254 SMB_name_refresh +0x21800258 SMB_name_registration +0x2180025c SMB_node_status +0x21800260 SMB_name_query +0x21800264 SMB_host_announce +0x21800268 SMB_workgroup_announce +0x2180026c SMB_local_master_announce +0x21800270 SMB_master_browser_announce +0x21800274 SMB_lm_host_announce +0x21800278 SMB_get_backup_list +0x2180027c SMB_reset_browser +0x21800280 SMB_announce_request +0x21800284 SMB_lm_announce_request +0x21800288 SMB_domain_logon +0x2180028c SMB_sync_browse_lists +0x21800290 SMB_run_elections +0x21800294 SMB_election +0x22000004 LAUNCHD_starting +0x22000008 LAUNCHD_exiting +0x2200000c LAUNCHD_finding_stray_pg +0x22000010 LAUNCHD_finding_all_strays +0x22000014 LAUNCHD_finding_execless +0x22000018 LAUNCHD_finding_weird_uids +0x2200001c LAUNCHD_data_pack +0x22000020 LAUNCHD_data_unpack +0x22000024 LAUNCHD_bug +0x22000028 LAUNCHD_mach_ipc +0x2200002c LAUNCHD_bsd_kevent +0x22000030 LAUNCHD_vproc_trans_incr +0x22000034 LAUNCHD_vproc_trans_decr +0xff000104 MSG_mach_notify_port_deleted +0xff000114 MSG_mach_notify_port_destroyed +0xff000118 MSG_mach_notify_no_senders +0xff00011c MSG_mach_notify_send_once +0xff000120 MSG_mach_notify_dead_name +0xff0001ec MSG_audit_triggers +0xff000320 MSG_host_info +0xff000324 MSG_host_kernel_version +0xff000328 MSG_host_page_size +0xff00032c MSG_mach_memory_object_memory_entry +0xff000330 MSG_host_processor_info +0xff000334 MSG_host_get_io_master +0xff000338 MSG_host_get_clock_service +0xff00033c MSG_kmod_get_info +0xff000340 MSG_host_zone_info +0xff000344 MSG_host_virtual_physical_table_info +0xff000348 MSG_host_ipc_hash_info +0xff00034c MSG_enable_bluebox +0xff000350 MSG_disable_bluebox +0xff000354 MSG_processor_set_default +0xff000358 
MSG_processor_set_create +0xff00035c MSG_mach_memory_object_memory_entry_64 +0xff000360 MSG_host_statistics +0xff000364 MSG_host_request_notification +0xff000368 MSG_host_lockgroup_info +0xff00036c MSG_host_statistics64 +0xff000370 MSG_mach_zone_info +0xff000640 MSG_host_get_boot_info +0xff000644 MSG_host_reboot +0xff000648 MSG_host_priv_statistics +0xff00064c MSG_host_default_memory_manager +0xff000650 MSG_vm_wire +0xff000654 MSG_thread_wire +0xff000658 MSG_vm_allocate_cpm +0xff00065c MSG_host_processors +0xff000660 MSG_host_get_clock_control +0xff000664 MSG_kmod_create +0xff000668 MSG_kmod_destroy +0xff00066c MSG_kmod_control +0xff000670 MSG_host_get_special_port +0xff000674 MSG_host_set_special_port +0xff000678 MSG_host_set_exception_ports +0xff00067c MSG_host_get_exception_ports +0xff000680 MSG_host_swap_exception_ports +0xff000684 MSG_host_load_symbol_table +0xff000688 MSG_mach_vm_wire +0xff00068c MSG_host_processor_sets +0xff000690 MSG_host_processor_set_priv +0xff000694 MSG_set_dp_control_port +0xff000698 MSG_get_dp_control_port +0xff00069c MSG_host_set_UNDServer +0xff0006a0 MSG_host_get_UNDServer +0xff0006a4 MSG_kext_request +0xff000960 MSG_host_security_create_task_token +0xff000964 MSG_host_security_set_task_token +0xff000f9c MSG_mach_gss_init_sec_context +0xff000fa0 MSG_clock_get_time +0xff000fa0 MSG_mach_gss_accept_sec_context +0xff000fa4 MSG_clock_get_attributes +0xff000fa4 MSG_mach_gss_log_error +0xff000fa8 MSG_clock_alarm +0xff000fa8 MSG_mach_gss_init_sec_context_v2 +0xff000fac MSG_mach_gss_accept_sec_context_v2 +0xff000fb0 MSG_mach_gss_hold_cred +0xff000fb4 MSG_mach_gss_unhold_cred +0xff000ffc MSG_lockd_request +0xff001000 MSG_lockd_ping +0xff001004 MSG_lockd_shutdown +0xff0012c0 MSG_clock_set_time +0xff0012c4 MSG_clock_set_attributes +0xff001f40 MSG_memory_object_get_attributes +0xff001f44 MSG_memory_object_change_attributes +0xff001f48 MSG_memory_object_synchronize_completed +0xff001f4c MSG_memory_object_lock_request +0xff001f50 
MSG_memory_object_destroy +0xff001f54 MSG_memory_object_upl_request +0xff001f58 MSG_memory_object_super_upl_request +0xff001f5c MSG_memory_object_cluster_size +0xff001f60 MSG_memory_object_page_op +0xff001f64 MSG_memory_object_recover_named +0xff001f68 MSG_memory_object_release_name +0xff001f6c MSG_memory_object_range_op +0xff002008 MSG_upl_abort +0xff00200c MSG_upl_abort_range +0xff002010 MSG_upl_commit +0xff002014 MSG_upl_commit_range +0xff002260 MSG_memory_object_init +0xff002264 MSG_memory_object_terminate +0xff002268 MSG_memory_object_data_request +0xff00226c MSG_memory_object_data_return +0xff002270 MSG_memory_object_data_initialize +0xff002274 MSG_memory_object_data_unlock +0xff002278 MSG_memory_object_synchronize +0xff00227c MSG_memory_object_map +0xff002280 MSG_memory_object_last_unmap +0xff002284 MSG_memory_object_data_reclaim +0xff002328 MSG_memory_object_create +0xff00238c MSG_default_pager_object_create +0xff002390 MSG_default_pager_info +0xff002394 MSG_default_pager_objects +0xff002398 MSG_default_pager_object_pages +0xff0023a0 MSG_default_pager_backing_store_create +0xff0023a4 MSG_default_pager_backing_store_delete +0xff0023a8 MSG_default_pager_backing_store_info +0xff0023ac MSG_default_pager_add_file +0xff0023b0 MSG_default_pager_triggers +0xff0023b4 MSG_default_pager_info_64 +0xff0023dc MSG_default_pager_space_alert +0xff002584 MSG_exception_raise +0xff002588 MSG_exception_raise_state +0xff00258c MSG_exception_raise_state_identity +0xff002594 MSG_mach_exception_raise +0xff002598 MSG_mach_exception_raise_state +0xff00259c MSG_mach_exception_raise_state_identity +0xff002bc0 MSG_io_object_get_class +0xff002bc4 MSG_io_object_conforms_to +0xff002bc8 MSG_io_iterator_next +0xff002bcc MSG_io_iterator_reset +0xff002bd0 MSG_io_service_get_matching_services +0xff002bd4 MSG_io_registry_entry_get_property +0xff002bd8 MSG_io_registry_create_iterator +0xff002bdc MSG_io_registry_iterator_enter_entry +0xff002be0 MSG_io_registry_iterator_exit_entry +0xff002be4 
MSG_io_registry_entry_from_path +0xff002be8 MSG_io_registry_entry_get_name +0xff002bec MSG_io_registry_entry_get_properties +0xff002bf0 MSG_io_registry_entry_get_property_bytes +0xff002bf4 MSG_io_registry_entry_get_child_iterator +0xff002bf8 MSG_io_registry_entry_get_parent_iterator +0xff002c00 MSG_io_service_close +0xff002c04 MSG_io_connect_get_service +0xff002c08 MSG_io_connect_set_notification_port +0xff002c0c MSG_io_connect_map_memory +0xff002c10 MSG_io_connect_add_client +0xff002c14 MSG_io_connect_set_properties +0xff002c18 MSG_io_connect_method_scalarI_scalarO +0xff002c1c MSG_io_connect_method_scalarI_structureO +0xff002c20 MSG_io_connect_method_scalarI_structureI +0xff002c24 MSG_io_connect_method_structureI_structureO +0xff002c28 MSG_io_registry_entry_get_path +0xff002c2c MSG_io_registry_get_root_entry +0xff002c30 MSG_io_registry_entry_set_properties +0xff002c34 MSG_io_registry_entry_in_plane +0xff002c38 MSG_io_object_get_retain_count +0xff002c3c MSG_io_service_get_busy_state +0xff002c40 MSG_io_service_wait_quiet +0xff002c44 MSG_io_registry_entry_create_iterator +0xff002c48 MSG_io_iterator_is_valid +0xff002c4c MSG_io_make_matching +0xff002c50 MSG_io_catalog_send_data +0xff002c54 MSG_io_catalog_terminate +0xff002c58 MSG_io_catalog_get_data +0xff002c5c MSG_io_catalog_get_gen_count +0xff002c60 MSG_io_catalog_module_loaded +0xff002c64 MSG_io_catalog_reset +0xff002c68 MSG_io_service_request_probe +0xff002c6c MSG_io_registry_entry_get_name_in_plane +0xff002c70 MSG_io_service_match_property_table +0xff002c74 MSG_io_async_method_scalarI_scalarO +0xff002c78 MSG_io_async_method_scalarI_structureO +0xff002c7c MSG_io_async_method_scalarI_structureI +0xff002c80 MSG_io_async_method_structureI_structureO +0xff002c84 MSG_io_service_add_notification +0xff002c88 MSG_io_service_add_interest_notification +0xff002c8c MSG_io_service_acknowledge_notification +0xff002c90 MSG_io_connect_get_notification_semaphore +0xff002c94 MSG_io_connect_unmap_memory +0xff002c98 
MSG_io_registry_entry_get_location_in_plane +0xff002c9c MSG_io_registry_entry_get_property_recursively +0xff002ca0 MSG_io_service_get_state +0xff002ca4 MSG_io_service_get_matching_services_ool +0xff002ca8 MSG_io_service_match_property_table_ool +0xff002cac MSG_io_service_add_notification_ool +0xff002cb0 MSG_io_object_get_superclass +0xff002cb4 MSG_io_object_get_bundle_identifier +0xff002cb8 MSG_io_service_open_extended +0xff002cbc MSG_io_connect_map_memory_into_task +0xff002cc0 MSG_io_connect_unmap_memory_from_task +0xff002cc4 MSG_io_connect_method +0xff002cc8 MSG_io_connect_async_method +0xff002ccc MSG_io_connect_set_notification_port_64 +0xff002cd0 MSG_io_service_add_notification_64 +0xff002cd4 MSG_io_service_add_interest_notification_64 +0xff002cd8 MSG_io_service_add_notification_ool_64 +0xff002cdc MSG_io_registry_entry_get_registry_entry_id +0xff002ee0 MSG_processor_start +0xff002ee4 MSG_processor_exit +0xff002ee8 MSG_processor_info +0xff002eec MSG_processor_control +0xff002ef0 MSG_processor_assign +0xff002ef4 MSG_processor_get_assignment +0xff003200 MSG_mach_port_names +0xff003204 MSG_mach_port_type +0xff003208 MSG_mach_port_rename +0xff00320c MSG_mach_port_allocate_name +0xff003210 MSG_mach_port_allocate +0xff003214 MSG_mach_port_destroy +0xff003218 MSG_mach_port_deallocate +0xff00321c MSG_mach_port_get_refs +0xff003220 MSG_mach_port_mod_refs +0xff003228 MSG_mach_port_set_mscount +0xff00322c MSG_mach_port_get_set_status +0xff003230 MSG_mach_port_move_member +0xff003234 MSG_mach_port_request_notification +0xff003238 MSG_mach_port_insert_right +0xff00323c MSG_mach_port_extract_right +0xff003240 MSG_mach_port_set_seqno +0xff003244 MSG_mach_port_get_attributes +0xff003248 MSG_mach_port_set_attributes +0xff00324c MSG_mach_port_allocate_qos +0xff003250 MSG_mach_port_allocate_full +0xff003254 MSG_task_set_port_space +0xff003258 MSG_mach_port_get_srights +0xff00325c MSG_mach_port_space_info +0xff003260 MSG_mach_port_dnrequest_info +0xff003264 
MSG_mach_port_kernel_object +0xff003268 MSG_mach_port_insert_member +0xff00326c MSG_mach_port_extract_member +0xff003270 MSG_mach_port_get_context +0xff003274 MSG_mach_port_set_context +0xff003278 MSG_mach_port_kobject +0xff003520 MSG_task_create +0xff003524 MSG_task_terminate +0xff003528 MSG_task_threads +0xff00352c MSG_mach_ports_register +0xff003530 MSG_mach_ports_lookup +0xff003534 MSG_task_info +0xff003538 MSG_task_set_info +0xff00353c MSG_task_suspend +0xff003540 MSG_task_resume +0xff003544 MSG_task_get_special_port +0xff003548 MSG_task_set_special_port +0xff00354c MSG_thread_create +0xff003550 MSG_thread_create_running +0xff003554 MSG_task_set_exception_ports +0xff003558 MSG_task_get_exception_ports +0xff00355c MSG_task_swap_exception_ports +0xff003560 MSG_lock_set_create +0xff003564 MSG_lock_set_destroy +0xff003568 MSG_semaphore_create +0xff00356c MSG_semaphore_destroy +0xff003570 MSG_task_policy_set +0xff003574 MSG_task_policy_get +0xff003578 MSG_task_sample +0xff00357c MSG_task_policy +0xff003580 MSG_task_set_emulation +0xff003584 MSG_task_get_emulation_vector +0xff003588 MSG_task_set_emulation_vector +0xff00358c MSG_task_set_ras_pc +0xff003590 MSG_task_zone_info +0xff003594 MSG_task_assign +0xff003598 MSG_task_assign_default +0xff00359c MSG_task_get_assignment +0xff0035a0 MSG_task_set_policy +0xff0035a4 MSG_task_get_state +0xff0035a8 MSG_task_set_state +0xff003840 MSG_thread_terminate +0xff003844 MSG_act_get_state +0xff003848 MSG_act_set_state +0xff00384c MSG_thread_get_state +0xff003850 MSG_thread_set_state +0xff003854 MSG_thread_suspend +0xff003858 MSG_thread_resume +0xff00385c MSG_thread_abort +0xff003860 MSG_thread_abort_safely +0xff003864 MSG_thread_depress_abort +0xff003868 MSG_thread_get_special_port +0xff00386c MSG_thread_set_special_port +0xff003870 MSG_thread_info +0xff003874 MSG_thread_set_exception_ports +0xff003878 MSG_thread_get_exception_ports +0xff00387c MSG_thread_swap_exception_ports +0xff003880 MSG_thread_policy +0xff003884 
MSG_thread_policy_set +0xff003888 MSG_thread_policy_get +0xff00388c MSG_thread_sample +0xff003890 MSG_etap_trace_thread +0xff003894 MSG_thread_assign +0xff003898 MSG_thread_assign_default +0xff00389c MSG_thread_get_assignment +0xff0038a0 MSG_thread_set_policy +0xff003b60 MSG_vm_region +0xff003b64 MSG_vm_allocate +0xff003b68 MSG_vm_deallocate +0xff003b6c MSG_vm_protect +0xff003b70 MSG_vm_inherit +0xff003b74 MSG_vm_read +0xff003b78 MSG_vm_read_list +0xff003b7c MSG_vm_write +0xff003b80 MSG_vm_copy +0xff003b84 MSG_vm_read_overwrite +0xff003b88 MSG_vm_msync +0xff003b8c MSG_vm_behavior_set +0xff003b90 MSG_vm_map +0xff003b94 MSG_vm_machine_attribute +0xff003b98 MSG_vm_remap +0xff003b9c MSG_task_wire +0xff003ba0 MSG_mach_make_memory_entry +0xff003ba4 MSG_vm_map_page_query +0xff003ba8 MSG_mach_vm_region_info +0xff003bac MSG_vm_mapped_pages_info +0xff003bb4 MSG_vm_region_recurse +0xff003bb8 MSG_vm_region_recurse_64 +0xff003bbc MSG_mach_vm_region_info_64 +0xff003bc0 MSG_vm_region_64 +0xff003bc4 MSG_mach_make_memory_entry_64 +0xff003bc8 MSG_vm_map_64 +0xff003bcc MSG_vm_map_get_upl +0xff003bd8 MSG_vm_purgable_control +0xff003e80 MSG_processor_set_statistics +0xff003e84 MSG_processor_set_destroy +0xff003e88 MSG_processor_set_max_priority +0xff003e8c MSG_processor_set_policy_enable +0xff003e90 MSG_processor_set_policy_disable +0xff003e94 MSG_processor_set_tasks +0xff003e98 MSG_processor_set_threads +0xff003e9c MSG_processor_set_policy_control +0xff003ea0 MSG_processor_set_stack_usage +0xff003ea4 MSG_processor_set_info +0xff004b00 MSG_mach_vm_allocate +0xff004b04 MSG_mach_vm_deallocate +0xff004b08 MSG_mach_vm_protect +0xff004b0c MSG_mach_vm_inherit +0xff004b10 MSG_mach_vm_read +0xff004b14 MSG_mach_vm_read_list +0xff004b18 MSG_mach_vm_write +0xff004b1c MSG_mach_vm_copy +0xff004b20 MSG_mach_vm_read_overwrite +0xff004b24 MSG_mach_vm_msync +0xff004b28 MSG_mach_vm_behavior_set +0xff004b2c MSG_mach_vm_map +0xff004b30 MSG_mach_vm_machine_attribute +0xff004b34 MSG_mach_vm_remap 
+0xff004b38 MSG_mach_vm_page_query +0xff004b3c MSG_mach_vm_region_recurse +0xff004b40 MSG_mach_vm_region +0xff004b44 MSG__mach_make_memory_entry +0xff004b48 MSG_mach_vm_purgable_control +0xff004b4c MSG_mach_vm_page_info +0xff004e20 MSG_ledger_create +0xff004e24 MSG_ledger_terminate +0xff004e28 MSG_ledger_transfer +0xff004e2c MSG_ledger_read +0xff005140 MSG_mach_get_task_label +0xff005144 MSG_mach_get_task_label_text +0xff005148 MSG_mach_get_label +0xff00514c MSG_mach_get_label_text +0xff005150 MSG_mach_set_port_label +0xff005154 MSG_mac_check_service +0xff005158 MSG_mac_port_check_service_obj +0xff00515c MSG_mac_port_check_access +0xff005160 MSG_mac_label_new +0xff005164 MSG_mac_request_label +0xff005dc0 MSG_UNDExecute_rpc +0xff005dc4 MSG_UNDDisplayNoticeFromBundle_rpc +0xff005dc8 MSG_UNDDisplayAlertFromBundle_rpc +0xff005dcc MSG_UNDDisplayCustomFromBundle_rpc +0xff005dd0 MSG_UNDDisplayCustomFromDictionary_rpc +0xff005dd4 MSG_UNDCancelNotification_rpc +0xff005dd8 MSG_UNDDisplayNoticeSimple_rpc +0xff005ddc MSG_UNDDisplayAlertSimple_rpc +0xff0060e0 MSG_UNDAlertCompletedWithResult_rpc +0xff0060e4 MSG_UNDNotificationCreated_rpc +0xff01a5e0 MSG_check_task_access +0xff01a5e4 MSG_find_code_signature +0xff04b320 MSG_kextd_ping +0xff25a8a0 MSG_lock_acquire +0xff25a8a4 MSG_lock_release +0xff25a8a8 MSG_lock_try +0xff25a8ac MSG_lock_make_stable +0xff25a8b0 MSG_lock_handoff +0xff25a8b4 MSG_lock_handoff_accept +0xff25abc0 MSG_semaphore_signal +0xff25abc4 MSG_semaphore_signal_all +0xff25abc8 MSG_semaphore_wait +0xff25abcc MSG_semaphore_signal_thread +0xff25abd0 MSG_semaphore_timedwait +0xff25abd4 MSG_semaphore_wait_signal +0xff25abd8 MSG_semaphore_timedwait_signal +0xffbebdcc MSG_clock_alarm_reply diff --git a/bsd/kern/tty.c b/bsd/kern/tty.c index 841781612..b97eb780e 100644 --- a/bsd/kern/tty.c +++ b/bsd/kern/tty.c @@ -1498,6 +1498,8 @@ ttioctl_locked(struct tty *tp, u_long cmd, caddr_t data, int flag, proc_t p) int ttyselect(struct tty *tp, int rw, void *wql, proc_t p) { + int 
retval = 0; + if (tp == NULL) return (ENXIO); @@ -1505,20 +1507,32 @@ ttyselect(struct tty *tp, int rw, void *wql, proc_t p) switch (rw) { case FREAD: - if (ttnread(tp) > 0 || ISSET(tp->t_state, TS_ZOMBIE)) + if (ISSET(tp->t_state, TS_ZOMBIE)) { return(1); + } + + retval = ttnread(tp); + if (retval > 0) { + break; + } + selrecord(p, &tp->t_rsel, wql); break; case FWRITE: - if ((tp->t_outq.c_cc <= tp->t_lowat && - ISSET(tp->t_state, TS_CONNECTED)) - || ISSET(tp->t_state, TS_ZOMBIE)) { - return (1); + if (ISSET(tp->t_state, TS_ZOMBIE)) { + return(1); } + + if ((tp->t_outq.c_cc <= tp->t_lowat) && + ISSET(tp->t_state, TS_CONNECTED)) { + retval = tp->t_hiwat - tp->t_outq.c_cc; + break; + } + selrecord(p, &tp->t_wsel, wql); break; } - return (0); + return retval; } @@ -3040,6 +3054,12 @@ ttyfree(struct tty *tp) { TTY_LOCK_NOTOWNED(tp); /* debug assert */ +#if DEBUG + if (!(SLIST_EMPTY(&tp->t_rsel.si_note) && SLIST_EMPTY(&tp->t_wsel.si_note))) { + panic("knotes hooked into a tty when the tty is freed.\n"); + } +#endif /* DEBUG */ + clfree(&tp->t_rawq); clfree(&tp->t_canq); clfree(&tp->t_outq); diff --git a/bsd/kern/tty_ptmx.c b/bsd/kern/tty_ptmx.c index 19c8e5bcc..a0be4feb5 100644 --- a/bsd/kern/tty_ptmx.c +++ b/bsd/kern/tty_ptmx.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1997-2010 Apple Computer, Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -178,7 +178,7 @@ _devfs_setattr(void * handle, unsigned short mode, uid_t uid, gid_t gid) char name[128]; snprintf(name, sizeof(name), "/dev/%s", direntp->de_name); - NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, CAST_USER_ADDR_T(name), ctx); + NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW, UIO_SYSSPACE, CAST_USER_ADDR_T(name), ctx); error = namei(&nd); if (error) goto out; @@ -229,7 +229,7 @@ sysctl_ptmx_max(__unused struct sysctl_oid *oidp, __unused void *arg1, SYSCTL_NODE(_kern, KERN_TTY, tty, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "TTY"); SYSCTL_PROC(_kern_tty, OID_AUTO, ptmx_max, - CTLTYPE_INT | CTLFLAG_RW, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &ptmx_max, 0, &sysctl_ptmx_max, "I", "ptmx_max"); @@ -259,6 +259,39 @@ struct ptmx_ioctl { static int ptmx_clone(dev_t dev, int minor); +/* + * Set of locks to keep the interaction between kevents and revoke + * from causing havoc. + */ + +#define LOG2_PTSD_KE_NLCK 2 +#define PTSD_KE_NLCK (1l << LOG2_PTSD_KE_NLCK) +#define PTSD_KE_LOCK_INDEX(x) ((x) & (PTSD_KE_NLCK - 1)) + +static lck_mtx_t ptsd_kevent_lock[PTSD_KE_NLCK]; + +static void +ptsd_kevent_lock_init(void) +{ + int i; + lck_grp_t *lgrp = lck_grp_alloc_init("ptsd kevent", LCK_GRP_ATTR_NULL); + + for (i = 0; i < PTSD_KE_NLCK; i++) + lck_mtx_init(&ptsd_kevent_lock[i], lgrp, LCK_ATTR_NULL); +} + +static void +ptsd_kevent_mtx_lock(int minor) +{ + lck_mtx_lock(&ptsd_kevent_lock[PTSD_KE_LOCK_INDEX(minor)]); +} + +static void +ptsd_kevent_mtx_unlock(int minor) +{ + lck_mtx_unlock(&ptsd_kevent_lock[PTSD_KE_LOCK_INDEX(minor)]); +} + int ptmx_init( __unused int config_count) { @@ -273,12 +306,25 @@ ptmx_init( __unused int config_count) return (ENOENT); } + if (cdevsw_setkqueueok(ptmx_major, &ptmx_cdev, 0) == -1) { + panic("Failed to set flags on ptmx cdevsw entry."); + } + /* Get a major number for /dev/pts/nnn */ if ((ptsd_major = cdevsw_add(-15, &ptsd_cdev)) == -1) { (void)cdevsw_remove(ptmx_major, &ptmx_cdev); printf("ptmx_init: 
failed to obtain /dev/ptmx major number\n"); return (ENOENT); } + + if (cdevsw_setkqueueok(ptsd_major, &ptsd_cdev, 0) == -1) { + panic("Failed to set flags on ptmx cdevsw entry."); + } + + /* + * Locks to guard against races between revoke and kevents + */ + ptsd_kevent_lock_init(); /* Create the /dev/ptmx device {,0} */ (void)devfs_make_node_clone(makedev(ptmx_major, 0), @@ -549,6 +595,7 @@ ptsd_open(dev_t dev, int flag, __unused int devtype, __unused proc_t p) error = (*linesw[tp->t_line].l_open)(dev, tp); /* Successful open; mark as open by the slave */ pti->pt_flags |= PF_OPEN_S; + CLR(tp->t_state, TS_IOCTL_NOT_OK); if (error == 0) ptmx_wakeup(tp, FREAD|FWRITE); out: @@ -556,6 +603,8 @@ ptsd_open(dev_t dev, int flag, __unused int devtype, __unused proc_t p) return (error); } +static void ptsd_revoke_knotes(dev_t, struct tty *); + FREE_BSDSTATIC int ptsd_close(dev_t dev, int flag, __unused int mode, __unused proc_t p) { @@ -587,9 +636,11 @@ ptsd_close(dev_t dev, int flag, __unused int mode, __unused proc_t p) #ifdef FIX_VSX_HANG tp->t_timeout = save_timeout; #endif - tty_unlock(tp); + if ((flag & IO_REVOKE) == IO_REVOKE) + ptsd_revoke_knotes(dev, tp); + /* unconditional, just like ttyclose() */ ptmx_free_ioctl(minor(dev), PF_OPEN_S); @@ -786,6 +837,7 @@ ptmx_open(dev_t dev, __unused int flag, __unused int devtype, __unused proc_t p) } tp->t_oproc = ptsd_start; CLR(tp->t_state, TS_ZOMBIE); + SET(tp->t_state, TS_IOCTL_NOT_OK); #ifdef sun4c tp->t_stop = ptsd_stop; #endif @@ -1000,19 +1052,30 @@ ptsd_select(dev_t dev, int rw, void *wql, proc_t p) switch (rw) { case FREAD: - if (ttnread(tp) > 0 || ISSET(tp->t_state, TS_ZOMBIE)) { + if (ISSET(tp->t_state, TS_ZOMBIE)) { retval = 1; break; } + + retval = ttnread(tp); + if (retval > 0) { + break; + } + selrecord(p, &tp->t_rsel, wql); break; case FWRITE: - if ((tp->t_outq.c_cc <= tp->t_lowat && - ISSET(tp->t_state, TS_CONNECTED)) - || ISSET(tp->t_state, TS_ZOMBIE)) { + if (ISSET(tp->t_state, TS_ZOMBIE)) { retval = 1; 
break; } + + if ((tp->t_outq.c_cc <= tp->t_lowat) && + ISSET(tp->t_state, TS_CONNECTED)) { + retval = tp->t_hiwat - tp->t_outq.c_cc; + break; + } + selrecord(p, &tp->t_wsel, wql); break; } @@ -1044,7 +1107,7 @@ ptmx_select(dev_t dev, int rw, void *wql, proc_t p) */ if ((tp->t_state&TS_ISOPEN) && tp->t_outq.c_cc && (tp->t_state&TS_TTSTOP) == 0) { - retval = 1; + retval = tp->t_outq.c_cc; break; } /* FALLTHROUGH */ @@ -1063,18 +1126,19 @@ ptmx_select(dev_t dev, int rw, void *wql, proc_t p) if (tp->t_state&TS_ISOPEN) { if (pti->pt_flags & PF_REMOTE) { if (tp->t_canq.c_cc == 0) { - retval = 1; + retval = (TTYHOG -1) ; break; } } else { - if (tp->t_rawq.c_cc + tp->t_canq.c_cc < TTYHOG-2) { - retval = 1; + retval = (TTYHOG - 2) - (tp->t_rawq.c_cc + tp->t_canq.c_cc); + if (retval > 0) { break; } if (tp->t_canq.c_cc == 0 && (tp->t_lflag&ICANON)) { retval = 1; break; } + retval = 0; } } selrecord(p, &pti->pt_selw, wql); @@ -1225,6 +1289,7 @@ cptyioctl(dev_t dev, u_long cmd, caddr_t data, int flag, proc_t p) struct ptmx_ioctl *pti; u_char *cc; int stop, error = 0; + int allow_ext_ioctl = 1; pti = ptmx_get_ioctl(minor(dev), 0); @@ -1233,11 +1298,18 @@ cptyioctl(dev_t dev, u_long cmd, caddr_t data, int flag, proc_t p) cc = tp->t_cc; + /* + * Do not permit extended ioctls on the master side of the pty unless + * the slave side has been successfully opened and initialized. + */ + if (cdevsw[major(dev)].d_open == ptmx_open && ISSET(tp->t_state, TS_IOCTL_NOT_OK)) + allow_ext_ioctl = 0; + /* * IF CONTROLLER STTY THEN MUST FLUSH TO PREVENT A HANG. * ttywflush(tp) will hang if there are characters in the outq. 
*/ - if (cmd == TIOCEXT) { + if (cmd == TIOCEXT && allow_ext_ioctl) { /* * When the EXTPROC bit is being toggled, we need * to send an TIOCPKT_IOCTL if the packet driver @@ -1259,7 +1331,7 @@ cptyioctl(dev_t dev, u_long cmd, caddr_t data, int flag, proc_t p) } goto out; } else - if (cdevsw[major(dev)].d_open == ptmx_open) + if (cdevsw[major(dev)].d_open == ptmx_open) { switch (cmd) { case TIOCGPGRP: @@ -1363,6 +1435,17 @@ cptyioctl(dev_t dev, u_long cmd, caddr_t data, int flag, proc_t p) error = 0; goto out; } + + /* + * Fail all other calls; pty masters are not serial devices; + * we only pretend they are when the slave side of the pty is + * already open. + */ + if (!allow_ext_ioctl) { + error = ENOTTY; + goto out; + } + } error = (*linesw[tp->t_line].l_ioctl)(tp, cmd, data, flag, p); if (error == ENOTTY) { error = ttioctl_locked(tp, cmd, data, flag, p); @@ -1440,127 +1523,110 @@ cptyioctl(dev_t dev, u_long cmd, caddr_t data, int flag, proc_t p) * kqueue support. */ int ptsd_kqfilter(dev_t, struct knote *); -static void ptsd_kqops_read_detach(struct knote *); -static int ptsd_kqops_read_event(struct knote *, long); -static void ptsd_kqops_write_detach(struct knote *); -static int ptsd_kqops_write_event(struct knote *, long); +static void ptsd_kqops_detach(struct knote *); +static int ptsd_kqops_event(struct knote *, long); -static struct filterops ptsd_kqops_read = { +static struct filterops ptsd_kqops = { .f_isfd = 1, - .f_detach = ptsd_kqops_read_detach, - .f_event = ptsd_kqops_read_event, + .f_detach = ptsd_kqops_detach, + .f_event = ptsd_kqops_event, }; -static struct filterops ptsd_kqops_write = { - .f_isfd = 1, - .f_detach = ptsd_kqops_write_detach, - .f_event = ptsd_kqops_write_event, -}; -static void -ptsd_kqops_read_detach(struct knote *kn) -{ - struct ptmx_ioctl *pti; - struct tty *tp; - dev_t dev = (dev_t) kn->kn_hookid; - - pti = ptmx_get_ioctl(minor(dev), 0); - tp = pti->pt_tty; +#define PTSD_KNOTE_VALID NULL +#define PTSD_KNOTE_REVOKED ((void 
*)-911l) - if (tp == NULL) - return; - - tty_lock(tp); - KNOTE_DETACH(&tp->t_rsel.si_note, kn); - tty_unlock(tp); - - kn->kn_hookid = 0; -} +/* + * In the normal case, by the time the driver_close() routine is called + * on the slave, all knotes have been detached. However in the revoke(2) + * case, the driver's close routine is called while there are knotes active + * that reference the handlers below. And we have no obvious means to + * reach from the driver out to the kqueue's that reference them to get + * them to stop. + */ -static int -ptsd_kqops_read_event(struct knote *kn, long hint) +static void +ptsd_kqops_detach(struct knote *kn) { struct ptmx_ioctl *pti; struct tty *tp; - dev_t dev = (dev_t) kn->kn_hookid; - int retval = 0; - - pti = ptmx_get_ioctl(minor(dev), 0); - tp = pti->pt_tty; - - if (tp == NULL) - return (ENXIO); - - if (hint == 0) - tty_lock(tp); + dev_t dev, lockdev = (dev_t)kn->kn_hookid; - kn->kn_data = ttnread(tp); - if (kn->kn_data > 0) { - retval = 1; - } + ptsd_kevent_mtx_lock(minor(lockdev)); - if (ISSET(tp->t_state, TS_ZOMBIE)) { - kn->kn_flags |= EV_EOF; - retval = 1; + if ((dev = (dev_t)kn->kn_hookid) != 0) { + pti = ptmx_get_ioctl(minor(dev), 0); + if (pti != NULL && (tp = pti->pt_tty) != NULL) { + tty_lock(tp); + if (kn->kn_filter == EVFILT_READ) + KNOTE_DETACH(&tp->t_rsel.si_note, kn); + else + KNOTE_DETACH(&tp->t_wsel.si_note, kn); + tty_unlock(tp); + kn->kn_hookid = 0; + } } - if (hint == 0) - tty_unlock(tp); - return (retval); -} -static void -ptsd_kqops_write_detach(struct knote *kn) -{ - struct ptmx_ioctl *pti; - struct tty *tp; - dev_t dev = (dev_t) kn->kn_hookid; - - pti = ptmx_get_ioctl(minor(dev), 0); - tp = pti->pt_tty; - - if (tp == NULL) - return; - - tty_lock(tp); - KNOTE_DETACH(&tp->t_wsel.si_note, kn); - tty_unlock(tp); - - kn->kn_hookid = 0; + ptsd_kevent_mtx_unlock(minor(lockdev)); } static int -ptsd_kqops_write_event(struct knote *kn, long hint) +ptsd_kqops_event(struct knote *kn, long hint) { struct ptmx_ioctl 
*pti; struct tty *tp; - dev_t dev = (dev_t) kn->kn_hookid; + dev_t dev = (dev_t)kn->kn_hookid; int retval = 0; - pti = ptmx_get_ioctl(minor(dev), 0); - tp = pti->pt_tty; + ptsd_kevent_mtx_lock(minor(dev)); - if (tp == NULL) - return (ENXIO); + do { + if (kn->kn_hook != PTSD_KNOTE_VALID ) { + /* We were revoked */ + kn->kn_data = 0; + kn->kn_flags |= EV_EOF; + retval = 1; + break; + } - if (hint == 0) - tty_lock(tp); + pti = ptmx_get_ioctl(minor(dev), 0); + if (pti == NULL || (tp = pti->pt_tty) == NULL) { + kn->kn_data = ENXIO; + kn->kn_flags |= EV_ERROR; + retval = 1; + break; + } - if ((tp->t_outq.c_cc <= tp->t_lowat) && - ISSET(tp->t_state, TS_CONNECTED)) { - kn->kn_data = tp->t_outq.c_cn - tp->t_outq.c_cc; - retval = 1; - } + if (hint == 0) + tty_lock(tp); - if (ISSET(tp->t_state, TS_ZOMBIE)) { - kn->kn_flags |= EV_EOF; - retval = 1; - } + if (kn->kn_filter == EVFILT_READ) { + kn->kn_data = ttnread(tp); + if (kn->kn_data > 0) + retval = 1; + if (ISSET(tp->t_state, TS_ZOMBIE)) { + kn->kn_flags |= EV_EOF; + retval = 1; + } + } else { /* EVFILT_WRITE */ + if ((tp->t_outq.c_cc <= tp->t_lowat) && + ISSET(tp->t_state, TS_CONNECTED)) { + kn->kn_data = tp->t_outq.c_cn - tp->t_outq.c_cc; + retval = 1; + } + if (ISSET(tp->t_state, TS_ZOMBIE)) { + kn->kn_flags |= EV_EOF; + retval = 1; + } + } - if (hint == 0) - tty_unlock(tp); - return (retval); + if (hint == 0) + tty_unlock(tp); + } while (0); -} + ptsd_kevent_mtx_unlock(minor(dev)); + return (retval); +} int ptsd_kqfilter(dev_t dev, struct knote *kn) { @@ -1581,14 +1647,14 @@ ptsd_kqfilter(dev_t dev, struct knote *kn) tty_lock(tp); kn->kn_hookid = dev; + kn->kn_hook = PTSD_KNOTE_VALID; + kn->kn_fop = &ptsd_kqops; switch (kn->kn_filter) { case EVFILT_READ: - kn->kn_fop = &ptsd_kqops_read; KNOTE_ATTACH(&tp->t_rsel.si_note, kn); break; case EVFILT_WRITE: - kn->kn_fop = &ptsd_kqops_write; KNOTE_ATTACH(&tp->t_wsel.si_note, kn); break; default: @@ -1600,3 +1666,59 @@ ptsd_kqfilter(dev_t dev, struct knote *kn) return (retval); 
} +/* + * Support for revoke(2). + * + * Mark all the kn_hook fields so that future invocations of the + * f_event op will just say "EOF" *without* looking at the + * ptmx_ioctl structure (which may disappear or be recycled at + * the end of ptsd_close). Issue wakeups to post that EOF to + * anyone listening. And finally remove the knotes from the + * tty's klists to keep ttyclose() happy, and set the hookid to + * zero to make the final detach passively successful. + */ +static void +ptsd_revoke_knotes(dev_t dev, struct tty *tp) +{ + struct klist *list; + struct knote *kn, *tkn; + + /* (Hold and drop the right locks in the right order.) */ + + ptsd_kevent_mtx_lock(minor(dev)); + tty_lock(tp); + + list = &tp->t_rsel.si_note; + SLIST_FOREACH(kn, list, kn_selnext) + kn->kn_hook = PTSD_KNOTE_REVOKED; + + list = &tp->t_wsel.si_note; + SLIST_FOREACH(kn, list, kn_selnext) + kn->kn_hook = PTSD_KNOTE_REVOKED; + + tty_unlock(tp); + ptsd_kevent_mtx_unlock(minor(dev)); + + tty_lock(tp); + ttwakeup(tp); + ttwwakeup(tp); + tty_unlock(tp); + + ptsd_kevent_mtx_lock(minor(dev)); + tty_lock(tp); + + list = &tp->t_rsel.si_note; + SLIST_FOREACH_SAFE(kn, list, kn_selnext, tkn) { + (void) KNOTE_DETACH(list, kn); + kn->kn_hookid = 0; + } + + list = &tp->t_wsel.si_note; + SLIST_FOREACH_SAFE(kn, list, kn_selnext, tkn) { + (void) KNOTE_DETACH(list, kn); + kn->kn_hookid = 0; + } + + tty_unlock(tp); + ptsd_kevent_mtx_unlock(minor(dev)); +} diff --git a/bsd/kern/tty_subr.c b/bsd/kern/tty_subr.c index c2abc010d..89bc09fe0 100644 --- a/bsd/kern/tty_subr.c +++ b/bsd/kern/tty_subr.c @@ -340,7 +340,9 @@ clrbits(u_char *cp, int off, int len) cp[sby++] &= mask; mask = (1<p_textvp) { + error = EINVAL; + goto out; + } + if (NULL == (blob_list_entry = ubc_cs_blob_get(p->p_textvp, -1, + p->p_textoff))) + goto out; + super_blob = (void *)blob_list_entry->csb_mem_kaddr; + if (CSMAGIC_EMBEDDED_SIGNATURE != ntohl(super_blob->magic)) { + error = EBADEXEC; + goto out; + } + count = ntohl(super_blob->count); + 
for (i = 0; i < count; ++i) { + blob_index = &super_blob->index[i]; + blob = (void *)((char *)super_blob + ntohl(blob_index->offset)); + switch (ntohl(blob_index->type)) { + case CSSLOT_CODEDIRECTORY: + if (CSMAGIC_CODEDIRECTORY != ntohl(blob->magic)) + break; + code_dir = (void *)blob; + hash_size = code_dir->hashSize; + if (CSSLOT_ENTITLEMENTS <= + ntohl(code_dir->nSpecialSlots)) { + embedded_hash = (void *)((char *)code_dir + + ntohl(code_dir->hashOffset) - + (hash_size * CSSLOT_ENTITLEMENTS)); + } + break; + case CSSLOT_ENTITLEMENTS: + if (CSMAGIC_EMBEDDED_ENTITLEMENTS != ntohl(blob->magic)) + break; + start = (void *)blob; + length = ntohl(blob->length); + break; + default: + break; + } + } + if (NULL == start && NULL == embedded_hash) { + error = 0; + goto out; + } else if (NULL == start || NULL == embedded_hash) { + error = EBADEXEC; + goto out; + } + if (NULL == (computed_hash = kalloc(hash_size))) { + error = ENOMEM; + goto out; + } + SHA1Init(&context); + SHA1Update(&context, start, length); + SHA1Final(computed_hash, &context); + if (0 != memcmp(computed_hash, embedded_hash, hash_size)) { + error = EBADEXEC; + goto out; + } + error = 0; +out: + if (NULL != computed_hash) + kfree(computed_hash, hash_size); + if (0 == error) { + *out_start = start; + *out_length = length; + } + return error; +} + +/* + * ENTITLEMENTS + * End of routines to navigate entitlements in the kernel. 
+ */ + + /* * ubc_init @@ -626,7 +743,10 @@ ubc_setsize(struct vnode *vp, off_t nsize) uip->ui_size = nsize; if (nsize >= osize) { /* Nothing more to do */ - lock_vnode_and_post(vp, NOTE_EXTEND); + if (nsize > osize) { + lock_vnode_and_post(vp, NOTE_EXTEND); + } + return (1); /* return success */ } @@ -986,6 +1106,16 @@ ubc_getobject(struct vnode *vp, __unused int flags) return (MEMORY_OBJECT_CONTROL_NULL); } +boolean_t +ubc_strict_uncached_IO(struct vnode *vp) +{ + boolean_t result = FALSE; + + if (UBCINFOEXISTS(vp)) { + result = memory_object_is_slid(vp->v_ubcinfo->ui_control); + } + return result; +} /* * ubc_blktooff @@ -1834,6 +1964,9 @@ ubc_create_upl( if (bufsize & 0xfff) return KERN_INVALID_ARGUMENT; + if (bufsize > MAX_UPL_SIZE * PAGE_SIZE) + return KERN_INVALID_ARGUMENT; + if (uplflags & (UPL_UBC_MSYNC | UPL_UBC_PAGEOUT | UPL_UBC_PAGEIN)) { if (uplflags & UPL_UBC_MSYNC) { @@ -2223,12 +2356,12 @@ static SInt32 cs_blob_count_peak = 0; int cs_validation = 1; -SYSCTL_INT(_vm, OID_AUTO, cs_validation, CTLFLAG_RW, &cs_validation, 0, "Do validate code signatures"); -SYSCTL_INT(_vm, OID_AUTO, cs_blob_count, CTLFLAG_RD, &cs_blob_count, 0, "Current number of code signature blobs"); -SYSCTL_INT(_vm, OID_AUTO, cs_blob_size, CTLFLAG_RD, &cs_blob_size, 0, "Current size of all code signature blobs"); -SYSCTL_INT(_vm, OID_AUTO, cs_blob_count_peak, CTLFLAG_RD, &cs_blob_count_peak, 0, "Peak number of code signature blobs"); -SYSCTL_INT(_vm, OID_AUTO, cs_blob_size_peak, CTLFLAG_RD, &cs_blob_size_peak, 0, "Peak size of code signature blobs"); -SYSCTL_INT(_vm, OID_AUTO, cs_blob_size_max, CTLFLAG_RD, &cs_blob_size_max, 0, "Size of biggest code signature blob"); +SYSCTL_INT(_vm, OID_AUTO, cs_validation, CTLFLAG_RW | CTLFLAG_LOCKED, &cs_validation, 0, "Do validate code signatures"); +SYSCTL_INT(_vm, OID_AUTO, cs_blob_count, CTLFLAG_RD | CTLFLAG_LOCKED, (int *)(uintptr_t)&cs_blob_count, 0, "Current number of code signature blobs"); +SYSCTL_INT(_vm, OID_AUTO, cs_blob_size, 
CTLFLAG_RD | CTLFLAG_LOCKED, (int *)(uintptr_t)&cs_blob_size, 0, "Current size of all code signature blobs"); +SYSCTL_INT(_vm, OID_AUTO, cs_blob_count_peak, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_blob_count_peak, 0, "Peak number of code signature blobs"); +SYSCTL_INT(_vm, OID_AUTO, cs_blob_size_peak, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_blob_size_peak, 0, "Peak size of code signature blobs"); +SYSCTL_INT(_vm, OID_AUTO, cs_blob_size_max, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_blob_size_max, 0, "Size of biggest code signature blob"); kern_return_t ubc_cs_blob_allocate( @@ -2335,7 +2468,7 @@ ubc_cs_blob_add( blob->csb_start_offset = 0; blob->csb_end_offset = 0; } else { - unsigned char *sha1_base; + const unsigned char *sha1_base; int sha1_size; blob->csb_flags = ntohl(cd->flags) | CS_VALID; @@ -2582,6 +2715,9 @@ ubc_cs_free( OSAddAtomic((SInt32) -blob->csb_mem_size, &cs_blob_size); kfree(blob, sizeof (*blob)); } +#if CHECK_CS_VALIDATION_BITMAP + ubc_cs_validation_bitmap_deallocate( uip->ui_vnode ); +#endif uip->cs_blobs = NULL; } @@ -2820,3 +2956,127 @@ ubc_cs_getcdhash( return ret; } + +#if CHECK_CS_VALIDATION_BITMAP +#define stob(s) ((atop_64((s)) + 07) >> 3) +extern boolean_t root_fs_upgrade_try; + +/* + * Should we use the code-sign bitmap to avoid repeated code-sign validation? + * Depends: + * a) Is the target vnode on the root filesystem? + * b) Has someone tried to mount the root filesystem read-write? + * If answers are (a) yes AND (b) no, then we can use the bitmap. + */ +#define USE_CODE_SIGN_BITMAP(vp) ( (vp != NULL) && (vp->v_mount != NULL) && (vp->v_mount->mnt_flag & MNT_ROOTFS) && !root_fs_upgrade_try) +kern_return_t +ubc_cs_validation_bitmap_allocate( + vnode_t vp) +{ + kern_return_t kr = KERN_SUCCESS; + struct ubc_info *uip; + char *target_bitmap; + vm_object_size_t bitmap_size; + + if ( ! USE_CODE_SIGN_BITMAP(vp) || (! 
UBCINFOEXISTS(vp))) { + kr = KERN_INVALID_ARGUMENT; + } else { + uip = vp->v_ubcinfo; + + if ( uip->cs_valid_bitmap == NULL ) { + bitmap_size = stob(uip->ui_size); + target_bitmap = (char*) kalloc( (vm_size_t)bitmap_size ); + if (target_bitmap == 0) { + kr = KERN_NO_SPACE; + } else { + kr = KERN_SUCCESS; + } + if( kr == KERN_SUCCESS ) { + memset( target_bitmap, 0, (size_t)bitmap_size); + uip->cs_valid_bitmap = (void*)target_bitmap; + uip->cs_valid_bitmap_size = bitmap_size; + } + } + } + return kr; +} + +kern_return_t +ubc_cs_check_validation_bitmap ( + vnode_t vp, + memory_object_offset_t offset, + int optype) +{ + kern_return_t kr = KERN_SUCCESS; + + if ( ! USE_CODE_SIGN_BITMAP(vp) || ! UBCINFOEXISTS(vp)) { + kr = KERN_INVALID_ARGUMENT; + } else { + struct ubc_info *uip = vp->v_ubcinfo; + char *target_bitmap = uip->cs_valid_bitmap; + + if ( target_bitmap == NULL ) { + kr = KERN_INVALID_ARGUMENT; + } else { + uint64_t bit, byte; + bit = atop_64( offset ); + byte = bit >> 3; + + if ( byte > uip->cs_valid_bitmap_size ) { + kr = KERN_INVALID_ARGUMENT; + } else { + + if (optype == CS_BITMAP_SET) { + target_bitmap[byte] |= (1 << (bit & 07)); + kr = KERN_SUCCESS; + } else if (optype == CS_BITMAP_CLEAR) { + target_bitmap[byte] &= ~(1 << (bit & 07)); + kr = KERN_SUCCESS; + } else if (optype == CS_BITMAP_CHECK) { + if ( target_bitmap[byte] & (1 << (bit & 07))) { + kr = KERN_SUCCESS; + } else { + kr = KERN_FAILURE; + } + } + } + } + } + return kr; +} + +void +ubc_cs_validation_bitmap_deallocate( + vnode_t vp) +{ + struct ubc_info *uip; + void *target_bitmap; + vm_object_size_t bitmap_size; + + if ( UBCINFOEXISTS(vp)) { + uip = vp->v_ubcinfo; + + if ( (target_bitmap = uip->cs_valid_bitmap) != NULL ) { + bitmap_size = uip->cs_valid_bitmap_size; + kfree( target_bitmap, (vm_size_t) bitmap_size ); + uip->cs_valid_bitmap = NULL; + } + } +} +#else +kern_return_t ubc_cs_validation_bitmap_allocate(__unused vnode_t vp){ + return KERN_INVALID_ARGUMENT; +} + +kern_return_t 
ubc_cs_check_validation_bitmap( + __unused struct vnode *vp, + __unused memory_object_offset_t offset, + __unused int optype){ + + return KERN_INVALID_ARGUMENT; +} + +void ubc_cs_validation_bitmap_deallocate(__unused vnode_t vp){ + return; +} +#endif /* CHECK_CS_VALIDATION_BITMAP */ diff --git a/bsd/kern/uipc_domain.c b/bsd/kern/uipc_domain.c index 37a040c86..1065d3683 100644 --- a/bsd/kern/uipc_domain.c +++ b/bsd/kern/uipc_domain.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2009 Apple Inc. All rights reserved. + * Copyright (c) 1998-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -77,15 +77,14 @@ #include void init_domain(struct domain *dp) __attribute__((section("__TEXT, initcode"))); -void concat_domain(struct domain *dp) __attribute__((section("__TEXT, initcode"))); +void prepend_domain(struct domain *dp) __attribute__((section("__TEXT, initcode"))); - -void pffasttimo(void *); void pfslowtimo(void *); struct protosw *pffindprotonotype(int, int); struct protosw *pffindprotonotype_locked(int , int , int); struct domain *pffinddomain(int); +static void net_update_uptime(void); /* * Add/delete 'domain': Link structure into system list, @@ -101,6 +100,12 @@ extern int do_reclaim; extern sysctlfn net_sysctl; +static u_int64_t uptime; + +#ifdef INET6 +extern void ip6_fin(void); +#endif + static void init_proto(struct protosw *pr) { @@ -133,6 +138,16 @@ init_domain(struct domain *dp) dp->dom_name, (int)(pr - dp->dom_protosw)); +#if __APPLE__ + /* + * Warn that pr_fasttimo (now pr_unused) is deprecated since rdar://7617868 + */ + if (pr->pr_unused != NULL) { + printf("init_domain: warning %s, proto %d: pr_fasttimo is deprecated and won't be called\n", + dp->dom_name, pr->pr_protocol); + } +#endif + init_proto(pr); } @@ -147,8 +162,8 @@ init_domain(struct domain *dp) } void -concat_domain(struct domain *dp) -{ +prepend_domain(struct domain *dp) +{ lck_mtx_assert(domain_proto_mtx, LCK_MTX_ASSERT_OWNED); dp->dom_next = domains; 
domains = dp; @@ -162,7 +177,7 @@ net_add_domain(struct domain *dp) /* First, link in the domain */ lck_mtx_lock(domain_proto_mtx); - concat_domain(dp); + prepend_domain(dp); init_domain(dp); lck_mtx_unlock(domain_proto_mtx); @@ -302,31 +317,32 @@ domaininit(void) lck_mtx_lock(domain_proto_mtx); - concat_domain(&localdomain); - concat_domain(&routedomain); - concat_domain(&inetdomain); + prepend_domain(&localdomain); + prepend_domain(&inetdomain); #if NETAT - concat_domain(&atalkdomain); + prepend_domain(&atalkdomain); #endif #if INET6 - concat_domain(&inet6domain); + prepend_domain(&inet6domain); #endif + prepend_domain(&routedomain); + #if IPSEC - concat_domain(&keydomain); + prepend_domain(&keydomain); #endif #if NS - concat_domain(&nsdomain); + prepend_domain(&nsdomain); #endif #if ISO - concat_domain(&isodomain); + prepend_domain(&isodomain); #endif #if CCITT - concat_domain(&ccittdomain); + prepend_domain(&ccittdomain); #endif - concat_domain(&ndrvdomain); + prepend_domain(&ndrvdomain); - concat_domain(&systemdomain); + prepend_domain(&systemdomain); /* * Now ask them all to init (XXX including the routing domain, @@ -336,10 +352,17 @@ domaininit(void) init_domain(dp); lck_mtx_unlock(domain_proto_mtx); - timeout(pffasttimo, NULL, 1); timeout(pfslowtimo, NULL, 1); } +void +domainfin(void) +{ +#ifdef INET6 + ip6_fin(); +#endif +} + static __inline__ struct domain * pffinddomain_locked(int pf) { @@ -525,6 +548,13 @@ pfslowtimo(__unused void *arg) register struct domain *dp; register struct protosw *pr; + /* + * Update coarse-grained networking timestamp (in sec.); the idea + * is to piggy-back on the periodic slow timeout callout to update + * the counter returnable via net_uptime(). 
+ */ + net_update_uptime(); + lck_mtx_lock(domain_proto_mtx); for (dp = domains; dp; dp = dp->dom_next) for (pr = dp->dom_protosw; pr; pr = pr->pr_next) { @@ -539,17 +569,26 @@ pfslowtimo(__unused void *arg) timeout(pfslowtimo, NULL, hz/PR_SLOWHZ); } -void -pffasttimo(__unused void *arg) +static void +net_update_uptime(void) { - register struct domain *dp; - register struct protosw *pr; + struct timeval tv; - lck_mtx_lock(domain_proto_mtx); - for (dp = domains; dp; dp = dp->dom_next) - for (pr = dp->dom_protosw; pr; pr = pr->pr_next) - if (pr->pr_fasttimo) - (*pr->pr_fasttimo)(); - lck_mtx_unlock(domain_proto_mtx); - timeout(pffasttimo, NULL, hz/PR_FASTHZ); + microuptime(&tv); + uptime = tv.tv_sec; +} + +/* + * An alternative way to obtain the coarse-grained uptime (in seconds) + * for networking code which do not require high-precision timestamp, + * as this is significantly cheaper than microuptime(). + */ +u_int64_t +net_uptime(void) +{ + /* If we get here before pfslowtimo() fires for the first time */ + if (uptime == 0) + net_update_uptime(); + + return (uptime); } diff --git a/bsd/kern/uipc_mbuf.c b/bsd/kern/uipc_mbuf.c index 627cd9926..d8d3ce857 100644 --- a/bsd/kern/uipc_mbuf.c +++ b/bsd/kern/uipc_mbuf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 1998-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -84,6 +84,7 @@ #include #include #include +#include #include #include @@ -115,7 +116,7 @@ * preserve the contents of the objects during its transactions. * * MC_BIGCL: - * This is a cache of rudimentary objects of NBPG in size; each + * This is a cache of rudimentary objects of MBIGCLBYTES in size; each * object represents a mbigcluster structure. This cache does not * preserve the contents of the objects during its transaction. 
* @@ -264,8 +265,9 @@ * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT). Additionally, * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag, - * i.e. modify the boot argument parameter to "mbuf_debug=0x13". Note - * that debugging consumes more CPU and memory. + * i.e. modify the boot argument parameter to "mbuf_debug=0x13". Leak + * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g. + * "mbuf_debug=0x113". Note that debugging consumes more CPU and memory. * * Each object is associated with exactly one mcache_audit_t structure that * contains the information related to its last buffer transaction. Given @@ -276,9 +278,9 @@ * | mbuf addr | | mclaudit[i] | * +------------+ +=============+ * | | cl_audit[0] | - * i = MTOCL(addr) +-------------+ + * i = MTOBG(addr) +-------------+ * | +-----> | cl_audit[1] | -----> mcache_audit_t - * b = CLTOM(i) | +-------------+ + * b = BGTOM(i) | +-------------+ * | | | ... | * x = MCLIDX(b, addr) | +-------------+ * | | | cl_audit[7] | @@ -286,12 +288,12 @@ * (e.g. x == 1) * * The mclaudit[] array is allocated at initialization time, but its contents - * get populated when the corresponding cluster is created. Because a cluster - * can be turned into NMBPCL number of mbufs, we preserve enough space for the - * mbufs so that there is a 1-to-1 mapping between them. A cluster that never + * get populated when the corresponding cluster is created. Because a page + * can be turned into NMBPBG number of mbufs, we preserve enough space for the + * mbufs so that there is a 1-to-1 mapping between them. A page that never * gets (or has not yet) turned into mbufs will use only cl_audit[0] with the - * remaining entries unused. For big clusters, only one entry is allocated - * and used for the entire cluster pair. + * remaining entries unused. 
For 16KB cluster, only one entry from the first + * page is allocated and used for the entire object. */ /* TODO: should be in header file */ @@ -311,7 +313,7 @@ static void *mbuf_worker_run; /* wait channel for worker thread */ static int mbuf_worker_ready; /* worker thread is runnable */ static int mbuf_expand_mcl; /* number of cluster creation requets */ static int mbuf_expand_big; /* number of big cluster creation requests */ -static int mbuf_expand_16k; /* number of 16K cluster creation requests */ +static int mbuf_expand_16k; /* number of 16KB cluster creation requests */ static int ncpu; /* number of CPUs */ static ppnum_t *mcl_paddr; /* Array of cluster physical addresses */ static ppnum_t mcl_pages; /* Size of array (# physical pages) */ @@ -320,19 +322,18 @@ static mcache_t *ref_cache; /* Cache of cluster reference & flags */ static mcache_t *mcl_audit_con_cache; /* Audit contents cache */ static unsigned int mbuf_debug; /* patchable mbuf mcache flags */ static unsigned int mb_normalized; /* number of packets "normalized" */ -static unsigned int mbuf_gscale; /* Power-of-two growth scale for m_howmany */ #define MB_GROWTH_AGGRESSIVE 1 /* Threshold: 1/2 of total */ -#define MB_GROWTH_NORMAL 4 /* Threshold: 15/16 of total */ +#define MB_GROWTH_NORMAL 2 /* Threshold: 3/4 of total */ typedef enum { MC_MBUF = 0, /* Regular mbuf */ MC_CL, /* Cluster */ - MC_BIGCL, /* Large (4K) cluster */ - MC_16KCL, /* Jumbo (16K) cluster */ + MC_BIGCL, /* Large (4KB) cluster */ + MC_16KCL, /* Jumbo (16KB) cluster */ MC_MBUF_CL, /* mbuf + cluster */ - MC_MBUF_BIGCL, /* mbuf + large (4K) cluster */ - MC_MBUF_16KCL /* mbuf + jumbo (16K) cluster */ + MC_MBUF_BIGCL, /* mbuf + large (4KB) cluster */ + MC_MBUF_16KCL /* mbuf + jumbo (16KB) cluster */ } mbuf_class_t; #define MBUF_CLASS_MIN MC_MBUF @@ -371,6 +372,8 @@ typedef enum { * a cluster's size. In this case, only the slab of the first cluster is * used. 
The rest of the slabs are marked with SLF_PARTIAL to indicate * that they are part of the larger slab. + * + * Each slab controls a page of memory. */ typedef struct mcl_slab { struct mcl_slab *sl_next; /* neighboring slab */ @@ -394,23 +397,24 @@ typedef struct mcl_slab { * whenever a new piece of memory mapped in from the VM crosses the 1MB * boundary. */ -#define NSLABSPMB ((1 << MBSHIFT) >> MCLSHIFT) /* 512 slabs/grp */ +#define NSLABSPMB ((1 << MBSHIFT) >> PGSHIFT) /* 256 slabs/grp */ typedef struct mcl_slabg { mcl_slab_t slg_slab[NSLABSPMB]; /* group of slabs */ } mcl_slabg_t; +/* + * Number of slabs needed to control a 16KB cluster object. + */ +#define NSLABSP16KB (M16KCLBYTES >> PGSHIFT) + /* * Per-cluster audit structure. */ typedef struct { - mcache_audit_t *cl_audit[NMBPCL]; /* array of audits */ + mcache_audit_t *cl_audit[NMBPBG]; /* array of audits */ } mcl_audit_t; -#if CONFIG_MBUF_NOEXPAND -static unsigned int maxmbufcl; -#endif /* CONFIG_MBUF_NOEXPAND */ - /* * Size of data from the beginning of an mbuf that covers m_hdr, pkthdr * and m_ext structures. If auditing is enabled, we allocate a shadow @@ -434,6 +438,7 @@ static unsigned int maxmbufcl; * Each of the following two arrays hold up to nmbclusters elements. 
*/ static mcl_audit_t *mclaudit; /* array of cluster audit information */ +static unsigned int maxclaudit; /* max # of entries in audit table */ static mcl_slabg_t **slabstbl; /* cluster slabs table */ static unsigned int maxslabgrp; /* max # of entries in slabs table */ static unsigned int slabgrp; /* # of entries in slabs table */ @@ -442,13 +447,68 @@ static unsigned int slabgrp; /* # of entries in slabs table */ int nclusters; /* # of clusters for non-jumbo (legacy) sizes */ int njcl; /* # of clusters for jumbo sizes */ int njclbytes; /* size of a jumbo cluster */ -union mcluster *mbutl; /* first mapped cluster address */ -union mcluster *embutl; /* ending virtual address of mclusters */ +union mbigcluster *mbutl; /* first mapped cluster address */ +union mbigcluster *embutl; /* ending virtual address of mclusters */ int max_linkhdr; /* largest link-level header */ int max_protohdr; /* largest protocol header */ int max_hdr; /* largest link+protocol header */ int max_datalen; /* MHLEN - max_hdr */ +static boolean_t mclverify; /* debug: pattern-checking */ +static boolean_t mcltrace; /* debug: stack tracing */ +static boolean_t mclfindleak; /* debug: leak detection */ + +/* mbuf leak detection variables */ +static struct mleak_table mleak_table; +static mleak_stat_t *mleak_stat; + +#define MLEAK_STAT_SIZE(n) \ + ((size_t)(&((mleak_stat_t *)0)->ml_trace[n])) + +struct mallocation { + mcache_obj_t *element; /* the alloc'ed element, NULL if unused */ + u_int32_t trace_index; /* mtrace index for corresponding backtrace */ + u_int32_t count; /* How many objects were requested */ + u_int64_t hitcount; /* for determining hash effectiveness */ +}; + +struct mtrace { + u_int64_t collisions; + u_int64_t hitcount; + u_int64_t allocs; + u_int64_t depth; + uintptr_t addr[MLEAK_STACK_DEPTH]; +}; + +/* Size must be a power of two for the zhash to be able to just mask off bits */ +#define MLEAK_ALLOCATION_MAP_NUM 512 +#define MLEAK_TRACE_MAP_NUM 256 + +/* + * Sample factor for 
how often to record a trace. This is overwritable + * by the boot-arg mleak_sample_factor. + */ +#define MLEAK_SAMPLE_FACTOR 500 + +/* + * Number of top leakers recorded. + */ +#define MLEAK_NUM_TRACES 5 + +static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM; +static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM; + +/* Hashmaps of allocations and their corresponding traces */ +static struct mallocation *mleak_allocations; +static struct mtrace *mleak_traces; +static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES]; + +/* Lock to protect mleak tables from concurrent modification */ +static lck_mtx_t *mleak_lock; +static lck_attr_t *mleak_lock_attr; +static lck_grp_t *mleak_lock_grp; +static lck_grp_attr_t *mleak_lock_grp_attr; + extern u_int32_t high_sb_max; /* TODO: should be in header file */ @@ -460,7 +520,6 @@ int do_reclaim = 0; #define MIN16KCL (MINCL >> 2) /* Low watermarks (only map in pages once free counts go below) */ -#define MCL_LOWAT MINCL #define MBIGCL_LOWAT MINBIGCL #define M16KCL_LOWAT MIN16KCL @@ -525,15 +584,34 @@ static mbuf_table_t mbuf_table[] = { #define NELEM(a) (sizeof (a) / sizeof ((a)[0])) static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */ -static int mb_waiters; /* number of sleepers */ +static int mb_waiters; /* number of waiters */ + +#define MB_WDT_MAXTIME 10 /* # of secs before watchdog panic */ +static struct timeval mb_wdtstart; /* watchdog start timestamp */ +static char mbuf_dump_buf[256]; + +/* + * mbuf watchdog is enabled by default on embedded platforms. It is + * also toggeable via the kern.ipc.mb_watchdog sysctl. 
+ */ +#if CONFIG_EMBEDDED +static unsigned int mb_watchdog = 1; +#else +static unsigned int mb_watchdog = 0; +#endif /* CONFIG_EMBEDDED */ /* The following are used to serialize m_clalloc() */ static boolean_t mb_clalloc_busy; static void *mb_clalloc_waitchan = &mb_clalloc_busy; static int mb_clalloc_waiters; +static void mbuf_mtypes_sync(boolean_t); static int mbstat_sysctl SYSCTL_HANDLER_ARGS; +static void mbuf_stat_sync(void); static int mb_stat_sysctl SYSCTL_HANDLER_ARGS; +static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS; +static int mleak_table_sysctl SYSCTL_HANDLER_ARGS; +static char *mbuf_dump(void); static void mbuf_table_init(void); static inline void m_incref(struct mbuf *); static inline u_int32_t m_decref(struct mbuf *); @@ -554,11 +632,13 @@ static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***, static void mbuf_cslab_free(void *, mcache_obj_t *, int); static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t); static int freelist_populate(mbuf_class_t, unsigned int, int); +static void freelist_init(mbuf_class_t); static boolean_t mbuf_cached_above(mbuf_class_t, int); static boolean_t mbuf_steal(mbuf_class_t, unsigned int); static void m_reclaim(mbuf_class_t, unsigned int, boolean_t); static int m_howmany(int, size_t); static void mbuf_worker_thread(void); +static void mbuf_watchdog(void); static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int); static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **, @@ -572,6 +652,11 @@ static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *); static void mcl_audit_mcheck_panic(struct mbuf *); static void mcl_audit_verify_nextptr(void *, mcache_audit_t *); +static void mleak_activate(void); +static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t); +static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int); +static void mleak_free(mcache_obj_t *); + static mcl_slab_t *slab_get(void *); static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t, void *, 
void *, unsigned int, int, int); @@ -582,7 +667,6 @@ static void slab_nextptr_panic(mcl_slab_t *, void *); static void slab_detach(mcl_slab_t *); static boolean_t slab_is_detached(mcl_slab_t *); -static unsigned int m_length(struct mbuf *); static int m_copyback0(struct mbuf **, int, int, const void *, int, int); static struct mbuf *m_split0(struct mbuf *, int, int, int); @@ -605,11 +689,19 @@ static struct mbuf *m_split0(struct mbuf *, int, int, int); */ #define EXTF_COMPOSITE 0x1 +/* + * This flag indicates that the external cluster is read-only, i.e. it is + * or was referred to by more than one mbufs. Once set, this flag is never + * cleared. + */ +#define EXTF_READONLY 0x2 +#define EXTF_MASK (EXTF_COMPOSITE | EXTF_READONLY) + #define MEXT_RFA(m) ((m)->m_ext.ext_refflags) #define MEXT_REF(m) (MEXT_RFA(m)->refcnt) #define MEXT_FLAGS(m) (MEXT_RFA(m)->flags) #define MBUF_IS_COMPOSITE(m) \ - (MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_COMPOSITE)) + (MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE) /* * Macros used to verify the integrity of the mbuf. @@ -638,15 +730,21 @@ static struct mbuf *m_split0(struct mbuf *, int, int, int); #define MTOD(m, t) ((t)((m)->m_data)) /* - * Macros to obtain cluster index and base cluster address. + * Macros to obtain (4KB) cluster index and base cluster address. + */ + +#define MTOBG(x) (((char *)(x) - (char *)mbutl) >> MBIGCLSHIFT) +#define BGTOM(x) ((union mbigcluster *)(mbutl + (x))) + +/* + * Macro to find the mbuf index relative to a base. */ -#define MTOCL(x) (((char *)(x) - (char *)mbutl) >> MCLSHIFT) -#define CLTOM(x) ((union mcluster *)(mbutl + (x))) +#define MCLIDX(c, m) (((char *)(m) - (char *)(c)) >> MSIZESHIFT) /* - * Macro to find the mbuf index relative to the cluster base. + * Same thing for 2KB cluster index. */ -#define MCLIDX(c, m) (((char *)(m) - (char *)(c)) >> 8) +#define CLBGIDX(c, m) (((char *)(m) - (char *)(c)) >> MCLSHIFT) /* * Macros used during mbuf and cluster initialization. 
@@ -670,6 +768,7 @@ static struct mbuf *m_split0(struct mbuf *, int, int, int); (m)->m_pkthdr.tso_segsz = 0; \ (m)->m_pkthdr.vlan_tag = 0; \ (m)->m_pkthdr.socket_id = 0; \ + (m)->m_pkthdr.vt_nrecs = 0; \ m_tag_init(m); \ m_prio_init(m); \ } \ @@ -759,16 +858,12 @@ static mbuf_mtypes_t *mbuf_mtypes; /* per-CPU statistics */ #define MTYPES_CPU(p) \ ((mtypes_cpu_t *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number()))) -/* This should be in a header file */ -#define atomic_add_16(a, n) ((void) OSAddAtomic16(n, a)) -#define atomic_add_32(a, n) ((void) OSAddAtomic(n, a)) - #define mtype_stat_add(type, n) { \ if ((unsigned)(type) < MT_MAX) { \ mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes); \ atomic_add_32(&mbs->cpu_mtypes[type], n); \ - } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) { \ - atomic_add_16((int16_t*)&mbstat.m_mtypes[type], n); \ + } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) { \ + atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n); \ } \ } @@ -776,13 +871,15 @@ static mbuf_mtypes_t *mbuf_mtypes; /* per-CPU statistics */ #define mtype_stat_inc(t) mtype_stat_add(t, 1) #define mtype_stat_dec(t) mtype_stat_sub(t, 1) -static int -mbstat_sysctl SYSCTL_HANDLER_ARGS +static void +mbuf_mtypes_sync(boolean_t locked) { -#pragma unused(oidp, arg1, arg2) int m, n; mtypes_cpu_t mtc; + if (locked) + lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + bzero(&mtc, sizeof (mtc)); for (m = 0; m < ncpu; m++) { mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m]; @@ -794,25 +891,33 @@ mbstat_sysctl SYSCTL_HANDLER_ARGS for (n = 0; n < MT_MAX; n++) mtc.cpu_mtypes[n] += temp.cpu_mtypes[n]; } - lck_mtx_lock(mbuf_mlock); + if (!locked) + lck_mtx_lock(mbuf_mlock); for (n = 0; n < MT_MAX; n++) mbstat.m_mtypes[n] = mtc.cpu_mtypes[n]; - lck_mtx_unlock(mbuf_mlock); - - return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat))); + if (!locked) + lck_mtx_unlock(mbuf_mlock); } static int -mb_stat_sysctl SYSCTL_HANDLER_ARGS +mbstat_sysctl SYSCTL_HANDLER_ARGS { #pragma unused(oidp, arg1, 
arg2) - mcache_t *cp; - mcache_cpu_t *ccp; + mbuf_mtypes_sync(FALSE); + + return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat))); +} + +static void +mbuf_stat_sync(void) +{ mb_class_stat_t *sp; - void *statp; - int k, m, bktsize, statsz, proc64 = proc_is64bit(req->p); + mcache_cpu_t *ccp; + mcache_t *cp; + int k, m, bktsize; + + lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); - lck_mtx_lock(mbuf_mlock); for (k = 0; k < NELEM(mbuf_table); k++) { cp = m_cache(k); ccp = &cp->mc_cpu[0]; @@ -854,9 +959,8 @@ mb_stat_sysctl SYSCTL_HANDLER_ARGS break; case MC_CL: - /* Deduct clusters used in composite cache and mbufs */ - sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) + - (P2ROUNDUP(m_total(MC_MBUF), NMBPCL)/NMBPCL)); + /* Deduct clusters used in composite cache */ + sp->mbcl_ctotal -= m_total(MC_MBUF_CL); break; case MC_BIGCL: @@ -873,6 +977,17 @@ mb_stat_sysctl SYSCTL_HANDLER_ARGS break; } } +} + +static int +mb_stat_sysctl SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + void *statp; + int k, statsz, proc64 = proc_is64bit(req->p); + + lck_mtx_lock(mbuf_mlock); + mbuf_stat_sync(); if (!proc64) { struct omb_class_stat *oc; @@ -913,6 +1028,69 @@ mb_stat_sysctl SYSCTL_HANDLER_ARGS return (SYSCTL_OUT(req, statp, statsz)); } +static int +mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + mleak_trace_stat_t *mltr; + int i; + + /* Ensure leak tracing turned on */ + if (!mclfindleak) + return (ENXIO); + + VERIFY(mleak_stat != NULL); +#ifdef __LP64__ + VERIFY(mleak_stat->ml_isaddr64); +#else + VERIFY(!mleak_stat->ml_isaddr64); +#endif /* !__LP64__ */ + VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES); + + lck_mtx_lock(mleak_lock); + mltr = &mleak_stat->ml_trace[0]; + bzero(mltr, sizeof (*mltr) * MLEAK_NUM_TRACES); + for (i = 0; i < MLEAK_NUM_TRACES; i++) { + int j; + + if (mleak_top_trace[i] == NULL || + mleak_top_trace[i]->allocs == 0) + continue; + + mltr->mltr_collisions = mleak_top_trace[i]->collisions; + mltr->mltr_hitcount = 
mleak_top_trace[i]->hitcount; + mltr->mltr_allocs = mleak_top_trace[i]->allocs; + mltr->mltr_depth = mleak_top_trace[i]->depth; + + VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH); + for (j = 0; j < mltr->mltr_depth; j++) + mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j]; + + mltr++; + } + i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES)); + lck_mtx_unlock(mleak_lock); + + return (i); +} + +static int +mleak_table_sysctl SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + int i = 0; + + /* Ensure leak tracing turned on */ + if (!mclfindleak) + return (ENXIO); + + lck_mtx_lock(mleak_lock); + i = SYSCTL_OUT(req, &mleak_table, sizeof (mleak_table)); + lck_mtx_unlock(mleak_lock); + + return (i); +} + static inline void m_incref(struct mbuf *m) { @@ -924,6 +1102,14 @@ m_incref(struct mbuf *m) new = old + 1; ASSERT(new != 0); } while (!OSCompareAndSwap(old, new, addr)); + + /* + * If cluster is shared, mark it with (sticky) EXTF_READONLY; + * we don't clear the flag when the refcount goes back to 1 + * to simplify code calling m_mclhasreference(). + */ + if (new > 1 && !(MEXT_FLAGS(m) & EXTF_READONLY)) + (void) OSBitOrAtomic(EXTF_READONLY, &MEXT_FLAGS(m)); } static inline u_int32_t @@ -944,6 +1130,7 @@ m_decref(struct mbuf *m) static void mbuf_table_init(void) { + unsigned int b, c, s; int m; MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)), @@ -968,66 +1155,78 @@ mbuf_table_init(void) #endif /* CONFIG_MBUF_JUMBO */ /* - * nclusters is going to be split in 2 to hold both the 2K - * and the 4K pools, so make sure each half is even. + * nclusters holds both the 2KB and 4KB pools, so ensure it's + * a multiple of 4KB clusters. */ - nclusters = P2ROUNDDOWN(nmbclusters - njcl, 4); + nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG); if (njcl > 0) { /* - * Each jumbo cluster takes 8 2K clusters, so make - * sure that the pool size is evenly divisible by 8. 
+ * Each jumbo cluster takes 8 2KB clusters, so make + * sure that the pool size is evenly divisible by 8; + * njcl is in 2KB unit, hence treated as such. */ njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8); - } -#if CONFIG_MBUF_NOEXPAND - /* Only use 4k clusters if we're setting aside more than 256k */ - if (nmbclusters <= 128) { - maxmbufcl = nmbclusters / 4; - } else { - /* Half to big clusters, half to small */ - maxmbufcl = (nmbclusters / 4) * 3; + /* Update nclusters with rounded down value of njcl */ + nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG); } -#endif /* CONFIG_MBUF_NOEXPAND */ /* - * 1/2 of the map is reserved for 2K clusters. Out of this, 1/16th - * of the total number of 2K clusters allocated is reserved and cannot - * be turned into mbufs. It can only be used for pure cluster objects. + * njcl is valid only on platforms with 16KB jumbo clusters, where + * it is configured to 1/3 of the pool size. On these platforms, + * the remaining is used for 2KB and 4KB clusters. On platforms + * without 16KB jumbo clusters, the entire pool is used for both + * 2KB and 4KB clusters. A 4KB cluster can either be splitted into + * 16 mbufs, or into 2 2KB clusters. + * + * +---+---+------------ ... -----------+------- ... -------+ + * | c | b | s | njcl | + * +---+---+------------ ... -----------+------- ... -------+ + * + * 1/32th of the shared region is reserved for pure 2KB and 4KB + * clusters (1/64th each.) + */ + c = P2ROUNDDOWN((nclusters >> 6), 2); /* in 2KB unit */ + b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), 2); /* in 4KB unit */ + s = nclusters - (c + (b << NCLPBGSHIFT)); /* in 2KB unit */ + + /* + * 1/64th (c) is reserved for 2KB clusters. 
*/ - m_minlimit(MC_CL) = (nclusters >> 5); - m_maxlimit(MC_CL) = (nclusters >> 1); + m_minlimit(MC_CL) = c; + m_maxlimit(MC_CL) = s + c; /* in 2KB unit */ m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES; (void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl"); /* - * The remaining (15/16th) can be turned into mbufs. + * Another 1/64th (b) of the map is reserved for 4KB clusters. + * It cannot be turned into 2KB clusters or mbufs. */ - m_minlimit(MC_MBUF) = 0; - m_maxlimit(MC_MBUF) = (m_maxlimit(MC_CL) - m_minlimit(MC_CL)) * NMBPCL; - m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE; - (void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf"); + m_minlimit(MC_BIGCL) = b; + m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b; /* in 4KB unit */ + m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES; + (void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl"); /* - * The other 1/2 of the map is reserved for 4K clusters. + * The remaining 31/32ths (s) are all-purpose (mbufs, 2KB, or 4KB) */ - m_minlimit(MC_BIGCL) = 0; - m_maxlimit(MC_BIGCL) = m_maxlimit(MC_CL) >> 1; - m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = NBPG; - (void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl"); + m_minlimit(MC_MBUF) = 0; + m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT); /* in mbuf unit */ + m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE; + (void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf"); /* * Set limits for the composite classes. 
*/ m_minlimit(MC_MBUF_CL) = 0; - m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL) - m_minlimit(MC_CL); + m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL); m_maxsize(MC_MBUF_CL) = MCLBYTES; m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL); (void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl"); m_minlimit(MC_MBUF_BIGCL) = 0; m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL); - m_maxsize(MC_MBUF_BIGCL) = NBPG; + m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES; m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL); (void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl"); @@ -1035,7 +1234,7 @@ mbuf_table_init(void) * And for jumbo classes. */ m_minlimit(MC_16KCL) = 0; - m_maxlimit(MC_16KCL) = (njcl >> 3); + m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT); /* in 16KB unit */ m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES; (void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl"); @@ -1084,19 +1283,19 @@ static ncl_tbl_t ncl_table_srv[] = { #endif /* __LP64__ */ __private_extern__ unsigned int -mbuf_default_ncl(int srv, uint64_t mem) +mbuf_default_ncl(int server, uint64_t mem) { #if !defined(__LP64__) -#pragma unused(srv) +#pragma unused(server) unsigned int n; /* * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM). */ - if ((n = ((mem / 16) / MCLBYTES)) > 32768) - n = 32768; + if ((n = ((mem / 16) / MCLBYTES)) > 32768) + n = 32768; #else unsigned int n, i; - ncl_tbl_t *tbl = (srv ? ncl_table_srv : ncl_table); + ncl_tbl_t *tbl = (server ? ncl_table_srv : ncl_table); /* * 64-bit kernel (mbuf pool size based on table). 
*/ @@ -1115,13 +1314,16 @@ __private_extern__ void mbinit(void) { unsigned int m; - int initmcl = MINCL; + unsigned int initmcl = 0; void *buf; thread_t thread = THREAD_NULL; if (nmbclusters == 0) nmbclusters = NMBCLUSTERS; + /* This should be a sane (at least even) value by now */ + VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1)); + /* Setup the mbuf table */ mbuf_table_init(); @@ -1131,25 +1333,51 @@ mbinit(void) mbuf_mlock_attr = lck_attr_alloc_init(); mbuf_mlock = lck_mtx_alloc_init(mbuf_mlock_grp, mbuf_mlock_attr); - /* Allocate cluster slabs table */ - maxslabgrp = P2ROUNDUP(nmbclusters, NSLABSPMB) / NSLABSPMB; + /* + * Allocate cluster slabs table: + * + * maxslabgrp = (N * 2048) / (1024 * 1024) + * + * Where N is nmbclusters rounded up to the nearest 512. This yields + * mcl_slab_g_t units, each one representing a MB of memory. + */ + maxslabgrp = + (P2ROUNDUP(nmbclusters, (MBSIZE >> 11)) << MCLSHIFT) >> MBSHIFT; MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *), M_TEMP, M_WAITOK | M_ZERO); VERIFY(slabstbl != NULL); - /* Allocate audit structures if needed */ + /* + * Allocate audit structures, if needed: + * + * maxclaudit = (maxslabgrp * 1024 * 1024) / 4096 + * + * This yields mcl_audit_t units, each one representing a page. 
+ */ PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug)); mbuf_debug |= mcache_getflags(); - if (mbuf_debug & MCF_AUDIT) { - MALLOC(mclaudit, mcl_audit_t *, - nmbclusters * sizeof (*mclaudit), M_TEMP, - M_WAITOK | M_ZERO); + if (mbuf_debug & MCF_DEBUG) { + maxclaudit = ((maxslabgrp << MBSHIFT) >> PGSHIFT); + MALLOC(mclaudit, mcl_audit_t *, maxclaudit * sizeof (*mclaudit), + M_TEMP, M_WAITOK | M_ZERO); VERIFY(mclaudit != NULL); mcl_audit_con_cache = mcache_create("mcl_audit_contents", AUDIT_CONTENTS_SIZE, 0, 0, MCR_SLEEP); VERIFY(mcl_audit_con_cache != NULL); } + mclverify = (mbuf_debug & MCF_VERIFY); + mcltrace = (mbuf_debug & MCF_TRACE); + mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG); + + /* Enable mbuf leak logging, with a lock to protect the tables */ + + mleak_lock_grp_attr = lck_grp_attr_alloc_init(); + mleak_lock_grp = lck_grp_alloc_init("mleak_lock", mleak_lock_grp_attr); + mleak_lock_attr = lck_attr_alloc_init(); + mleak_lock = lck_mtx_alloc_init(mleak_lock_grp, mleak_lock_attr); + + mleak_activate(); /* Calculate the number of pages assigned to the cluster pool */ mcl_pages = (nmbclusters * MCLBYTES) / CLBYTES; @@ -1161,19 +1389,41 @@ mbinit(void) mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages); bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t)); - embutl = (union mcluster *) + embutl = (union mbigcluster *) ((unsigned char *)mbutl + (nmbclusters * MCLBYTES)); + VERIFY((((char *)embutl - (char *)mbutl) % MBIGCLBYTES) == 0); + /* Prime up the freelist */ PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl)); + if (initmcl != 0) { + initmcl >>= NCLPBGSHIFT; /* become a 4K unit */ + if (initmcl > m_maxlimit(MC_BIGCL)) + initmcl = m_maxlimit(MC_BIGCL); + } + if (initmcl < m_minlimit(MC_BIGCL)) + initmcl = m_minlimit(MC_BIGCL); lck_mtx_lock(mbuf_mlock); - if (m_clalloc(MAX(NBPG/CLBYTES, 1) * initmcl, M_WAIT, MCLBYTES) == 0) - panic("mbinit: m_clalloc failed\n"); + /* + * For classes with non-zero minimum limits, populate their freelists + * 
so that m_total(class) is at least m_minlimit(class). + */ + VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0); + freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT); + VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL)); + freelist_init(m_class(MC_CL)); + + for (m = 0; m < NELEM(mbuf_table); m++) { + /* Make sure we didn't miss any */ + VERIFY(m_minlimit(m_class(m)) == 0 || + m_total(m_class(m)) >= m_minlimit(m_class(m))); + } lck_mtx_unlock(mbuf_mlock); - (void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init, NULL, &thread); + (void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init, + NULL, &thread); thread_deallocate(thread); ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref), @@ -1181,7 +1431,7 @@ mbinit(void) /* Create the cache for each class */ for (m = 0; m < NELEM(mbuf_table); m++) { - void *allocfunc, *freefunc, *auditfunc; + void *allocfunc, *freefunc, *auditfunc, *logfunc; u_int32_t flags; flags = mbuf_debug; @@ -1190,10 +1440,12 @@ mbinit(void) allocfunc = mbuf_cslab_alloc; freefunc = mbuf_cslab_free; auditfunc = mbuf_cslab_audit; + logfunc = mleak_logger; } else { allocfunc = mbuf_slab_alloc; freefunc = mbuf_slab_free; auditfunc = mbuf_slab_audit; + logfunc = mleak_logger; } /* @@ -1206,8 +1458,11 @@ mbinit(void) njcl == 0) flags |= MCF_NOCPUCACHE; + if (!mclfindleak) + flags |= MCF_NOLEAKLOG; + m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m), - allocfunc, freefunc, auditfunc, mbuf_slab_notify, + allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify, (void *)(uintptr_t)m, flags, MCR_SLEEP); } @@ -1225,30 +1480,31 @@ mbinit(void) mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf, CPU_CACHE_SIZE); bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu)); - mbuf_gscale = MB_GROWTH_NORMAL; - - /* - * Set the max limit on sb_max to be 1/16 th of the size of + /* + * Set the max limit on sb_max to be 1/16 th of the size of * memory allocated for mbuf clusters. 
*/ - high_sb_max = (nmbclusters << (MCLSHIFT - 4)); + high_sb_max = (nmbclusters << (MCLSHIFT - 4)); if (high_sb_max < sb_max) { /* sb_max is too large for this configuration, scale it down */ - if (high_sb_max > (1 << MBSHIFT)) { + if (high_sb_max > (1 << MBSHIFT)) { /* We have atleast 16 M of mbuf pool */ sb_max = high_sb_max; } else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) { - /* If we have more than 1M of mbufpool, cap the size of + /* + * If we have more than 1M of mbufpool, cap the size of * max sock buf at 1M - */ + */ sb_max = high_sb_max = (1 << MBSHIFT); } else { sb_max = high_sb_max; } } - printf("mbinit: done (%d MB memory set for mbuf pool)\n", - (nmbclusters << MCLSHIFT) >> MBSHIFT); + printf("mbinit: done [%d MB total pool size, (%d/%d) split]\n", + (nmbclusters << MCLSHIFT) >> MBSHIFT, + (nclusters << MCLSHIFT) >> MBSHIFT, + (njcl << MCLSHIFT) >> MBSHIFT); } /* @@ -1274,7 +1530,7 @@ slab_alloc(mbuf_class_t class, int wait) * more than one buffer chunks (e.g. mbuf slabs). For other * slabs, this probably doesn't make much of a difference. */ - if (class == MC_MBUF && (wait & MCR_COMP)) + if ((class == MC_MBUF || class == MC_CL) && (wait & MCR_COMP)) sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead); else sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class)); @@ -1294,7 +1550,10 @@ slab_alloc(mbuf_class_t class, int wait) if (class == MC_MBUF) { sp->sl_head = buf->obj_next; - VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPCL - 1)); + VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPBG - 1)); + } else if (class == MC_CL) { + sp->sl_head = buf->obj_next; + VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NCLPBG - 1)); } else { sp->sl_head = NULL; } @@ -1319,41 +1578,33 @@ slab_alloc(mbuf_class_t class, int wait) if (class == MC_CL) { mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL); /* - * A 2K cluster slab can have at most 1 reference. + * A 2K cluster slab can have at most NCLPBG references. 
*/ - VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 && - sp->sl_len == m_maxsize(MC_CL) && sp->sl_head == NULL); + VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPBG && + sp->sl_chunks == NCLPBG && + sp->sl_len == m_maxsize(MC_BIGCL)); + VERIFY(sp->sl_refcnt < NCLPBG || sp->sl_head == NULL); } else if (class == MC_BIGCL) { - mcl_slab_t *nsp = sp->sl_next; mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) + m_infree(MC_MBUF_BIGCL); /* - * Increment 2nd slab. A 4K big cluster takes - * 2 slabs, each having at most 1 reference. + * A 4K cluster slab can have at most 1 reference. */ VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 && - sp->sl_len == m_maxsize(MC_BIGCL) && sp->sl_head == NULL); - /* Next slab must already be present */ - VERIFY(nsp != NULL); - nsp->sl_refcnt++; - VERIFY(!slab_is_detached(nsp)); - VERIFY(nsp->sl_class == MC_BIGCL && - nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) && - nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 && - nsp->sl_len == 0 && nsp->sl_base == sp->sl_base && - nsp->sl_head == NULL); + sp->sl_len == m_maxsize(class) && sp->sl_head == NULL); } else if (class == MC_16KCL) { mcl_slab_t *nsp; int k; --m_infree(MC_16KCL); VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 && - sp->sl_len == m_maxsize(MC_16KCL) && sp->sl_head == NULL); + sp->sl_len == m_maxsize(class) && sp->sl_head == NULL); /* - * Increment 2nd-8th slab. A 16K big cluster takes - * 8 cluster slabs, each having at most 1 reference. + * Increment 2nd-Nth slab reference, where N is NSLABSP16KB. + * A 16KB big cluster takes NSLABSP16KB slabs, each having at + * most 1 reference. 
*/ - for (nsp = sp, k = 1; k < (M16KCLBYTES / MCLBYTES); k++) { + for (nsp = sp, k = 1; k < NSLABSP16KB; k++) { nsp = nsp->sl_next; /* Next slab must already be present */ VERIFY(nsp != NULL); @@ -1366,7 +1617,7 @@ slab_alloc(mbuf_class_t class, int wait) nsp->sl_head == NULL); } } else { - ASSERT(class == MC_MBUF); + VERIFY(class == MC_MBUF); --m_infree(MC_MBUF); /* * If auditing is turned on, this check is @@ -1376,20 +1627,20 @@ slab_alloc(mbuf_class_t class, int wait) _MCHECK((struct mbuf *)buf); /* * Since we have incremented the reference count above, - * an mbuf slab (formerly a 2K cluster slab that was cut + * an mbuf slab (formerly a 4KB cluster slab that was cut * up into mbufs) must have a reference count between 1 - * and NMBPCL at this point. + * and NMBPBG at this point. */ - VERIFY(sp->sl_refcnt >= 1 && - (unsigned short)sp->sl_refcnt <= NMBPCL && - sp->sl_chunks == NMBPCL && sp->sl_len == m_maxsize(MC_CL)); - VERIFY((unsigned short)sp->sl_refcnt < NMBPCL || - sp->sl_head == NULL); + VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPBG && + sp->sl_chunks == NMBPBG && + sp->sl_len == m_maxsize(MC_BIGCL)); + VERIFY(sp->sl_refcnt < NMBPBG || sp->sl_head == NULL); } /* If empty, remove this slab from the class's freelist */ if (sp->sl_head == NULL) { - VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPCL); + VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPBG); + VERIFY(class != MC_CL || sp->sl_refcnt == NCLPBG); slab_remove(sp, class); } @@ -1415,45 +1666,38 @@ slab_free(mbuf_class_t class, mcache_obj_t *buf) /* Decrement slab reference */ sp->sl_refcnt--; - if (class == MC_CL || class == MC_BIGCL) { + if (class == MC_CL) { VERIFY(IS_P2ALIGNED(buf, MCLBYTES)); /* - * A 2K cluster slab can have at most 1 reference + * A slab that has been splitted for 2KB clusters can have + * at most 1 outstanding reference at this point. 
+ */ + VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPBG - 1) && + sp->sl_chunks == NCLPBG && + sp->sl_len == m_maxsize(MC_BIGCL)); + VERIFY(sp->sl_refcnt < (NCLPBG - 1) || + (slab_is_detached(sp) && sp->sl_head == NULL)); + } else if (class == MC_BIGCL) { + VERIFY(IS_P2ALIGNED(buf, MCLBYTES)); + /* + * A 4KB cluster slab can have at most 1 reference * which must be 0 at this point. */ VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 && sp->sl_len == m_maxsize(class) && sp->sl_head == NULL); VERIFY(slab_is_detached(sp)); - if (class == MC_BIGCL) { - mcl_slab_t *nsp = sp->sl_next; - VERIFY(IS_P2ALIGNED(buf, NBPG)); - /* Next slab must already be present */ - VERIFY(nsp != NULL); - /* Decrement 2nd slab reference */ - nsp->sl_refcnt--; - /* - * A 4K big cluster takes 2 slabs, both - * must now have 0 reference. - */ - VERIFY(slab_is_detached(nsp)); - VERIFY(nsp->sl_class == MC_BIGCL && - (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) && - nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 && - nsp->sl_len == 0 && nsp->sl_base == sp->sl_base && - nsp->sl_head == NULL); - } } else if (class == MC_16KCL) { mcl_slab_t *nsp; int k; /* - * A 16K cluster takes 8 cluster slabs, all must + * A 16KB cluster takes NSLABSP16KB slabs, all must * now have 0 reference. */ - VERIFY(IS_P2ALIGNED(buf, NBPG)); + VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES)); VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 && - sp->sl_len == m_maxsize(MC_16KCL) && sp->sl_head == NULL); + sp->sl_len == m_maxsize(class) && sp->sl_head == NULL); VERIFY(slab_is_detached(sp)); - for (nsp = sp, k = 1; k < (M16KCLBYTES / MCLBYTES); k++) { + for (nsp = sp, k = 1; k < NSLABSP16KB; k++) { nsp = nsp->sl_next; /* Next slab must already be present */ VERIFY(nsp != NULL); @@ -1467,14 +1711,15 @@ slab_free(mbuf_class_t class, mcache_obj_t *buf) } } else { /* - * An mbuf slab has a total of NMBPL reference counts. - * Since we have decremented the reference above, it - * must now be between 0 and NMBPCL-1. 
+ * A slab that has been splitted for mbufs has at most NMBPBG + * reference counts. Since we have decremented one reference + * above, it must now be between 0 and NMBPBG-1. */ - VERIFY(sp->sl_refcnt >= 0 && - (unsigned short)sp->sl_refcnt <= (NMBPCL - 1) && - sp->sl_chunks == NMBPCL && sp->sl_len == m_maxsize(MC_CL)); - VERIFY(sp->sl_refcnt < (NMBPCL - 1) || + VERIFY(class == MC_MBUF); + VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NMBPBG - 1) && + sp->sl_chunks == NMBPBG && + sp->sl_len == m_maxsize(MC_BIGCL)); + VERIFY(sp->sl_refcnt < (NMBPBG - 1) || (slab_is_detached(sp) && sp->sl_head == NULL)); } @@ -1485,12 +1730,15 @@ slab_free(mbuf_class_t class, mcache_obj_t *buf) */ if (mclaudit != NULL) { mcache_audit_t *mca = mcl_audit_buf2mca(class, buf); - mcache_audit_free_verify(mca, buf, 0, m_maxsize(class)); + if (mclverify) { + mcache_audit_free_verify(mca, buf, 0, m_maxsize(class)); + } mca->mca_uflags &= ~MB_SCVALID; } if (class == MC_CL) { mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL); + buf->obj_next = sp->sl_head; } else if (class == MC_BIGCL) { mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) + m_infree(MC_MBUF_BIGCL); @@ -1502,14 +1750,25 @@ slab_free(mbuf_class_t class, mcache_obj_t *buf) } sp->sl_head = buf; - /* All mbufs are freed; return the cluster that we stole earlier */ - if (sp->sl_refcnt == 0 && class == MC_MBUF) { - int i = NMBPCL; - - m_total(MC_MBUF) -= NMBPCL; + /* + * If a slab has been splitted to either one which holds 2KB clusters, + * or one which holds mbufs, turn it back to one which holds a 4KB + * cluster. 
+ */ + if (class == MC_MBUF && sp->sl_refcnt == 0 && + m_total(class) > m_minlimit(class) && + m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) { + int i = NMBPBG; + + m_total(MC_BIGCL)++; + mbstat.m_bigclusters = m_total(MC_BIGCL); + m_total(MC_MBUF) -= NMBPBG; mbstat.m_mbufs = m_total(MC_MBUF); - m_infree(MC_MBUF) -= NMBPCL; - mtype_stat_add(MT_FREE, -((unsigned)NMBPCL)); + m_infree(MC_MBUF) -= NMBPBG; + mtype_stat_add(MT_FREE, -((unsigned)NMBPBG)); + + VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL)); + VERIFY(m_total(MC_MBUF) >= m_minlimit(MC_MBUF)); while (i--) { struct mbuf *m = sp->sl_head; @@ -1522,19 +1781,58 @@ slab_free(mbuf_class_t class, mcache_obj_t *buf) /* Remove the slab from the mbuf class's slab list */ slab_remove(sp, class); - /* Reinitialize it as a 2K cluster slab */ - slab_init(sp, MC_CL, sp->sl_flags, sp->sl_base, sp->sl_base, + /* Reinitialize it as a 4KB cluster slab */ + slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base, sp->sl_len, 0, 1); - if (mclaudit != NULL) + if (mclverify) { mcache_set_pattern(MCACHE_FREE_PATTERN, - (caddr_t)sp->sl_head, m_maxsize(MC_CL)); + (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL)); + } + mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) + + m_infree(MC_MBUF_BIGCL); - mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL); + VERIFY(slab_is_detached(sp)); + /* And finally switch class */ + class = MC_BIGCL; + } else if (class == MC_CL && sp->sl_refcnt == 0 && + m_total(class) > m_minlimit(class) && + m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) { + int i = NCLPBG; + + m_total(MC_BIGCL)++; + mbstat.m_bigclusters = m_total(MC_BIGCL); + m_total(MC_CL) -= NCLPBG; + mbstat.m_clusters = m_total(MC_CL); + m_infree(MC_CL) -= NCLPBG; + VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL)); + VERIFY(m_total(MC_CL) >= m_minlimit(MC_CL)); + + while (i--) { + union mcluster *c = sp->sl_head; + VERIFY(c != NULL); + sp->sl_head = c->mcl_next; + c->mcl_next = NULL; + } + VERIFY(sp->sl_head == NULL); + + /* Remove the slab from 
the 2KB cluster class's slab list */ + slab_remove(sp, class); + + /* Reinitialize it as a 4KB cluster slab */ + slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base, + sp->sl_len, 0, 1); + + if (mclverify) { + mcache_set_pattern(MCACHE_FREE_PATTERN, + (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL)); + } + mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) + + m_infree(MC_MBUF_BIGCL); VERIFY(slab_is_detached(sp)); /* And finally switch class */ - class = MC_CL; + class = MC_BIGCL; } /* Reinsert the slab to the class's slab list */ @@ -1593,6 +1891,9 @@ mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait) if (mbuf_cached_above(class, wait)) break; + /* watchdog checkpoint */ + mbuf_watchdog(); + /* We have nothing and cannot block; give up */ if (wait & MCR_NOSLEEP) { if (!(wait & MCR_TRYHARD)) { @@ -1689,7 +1990,9 @@ mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc) ASSERT(!(mca->mca_uflags & MB_SCVALID)); } /* Record this transaction */ - mcache_buffer_log(mca, list, m_cache(class)); + if (mcltrace) + mcache_buffer_log(mca, list, m_cache(class)); + if (alloc) mca->mca_uflags |= MB_INUSE; else @@ -1756,16 +2059,17 @@ cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num) clsp = slab_get(cl); VERIFY(m->m_flags == M_EXT && cl != NULL); VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m)); - VERIFY(clsp->sl_refcnt == 1); - if (class == MC_MBUF_BIGCL) { - nsp = clsp->sl_next; - /* Next slab must already be present */ - VERIFY(nsp != NULL); - VERIFY(nsp->sl_refcnt == 1); - } else if (class == MC_MBUF_16KCL) { + + if (class == MC_MBUF_CL) { + VERIFY(clsp->sl_refcnt >= 1 && + clsp->sl_refcnt <= NCLPBG); + } else { + VERIFY(clsp->sl_refcnt == 1); + } + + if (class == MC_MBUF_16KCL) { int k; - for (nsp = clsp, k = 1; - k < (M16KCLBYTES / MCLBYTES); k++) { + for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) { nsp = nsp->sl_next; /* Next slab must already be present */ VERIFY(nsp != NULL); @@ -1802,11 +2106,21 @@ 
cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged) mcache_obj_t *ref_list = NULL; mcl_slab_t *clsp, *nsp; void *cl; + mbuf_class_t cl_class; ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class)); VERIFY(class != MC_MBUF_16KCL || njcl > 0); lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + if (class == MC_MBUF_CL) { + cl_class = MC_CL; + } else if (class == MC_MBUF_BIGCL) { + cl_class = MC_BIGCL; + } else { + VERIFY(class == MC_MBUF_16KCL); + cl_class = MC_16KCL; + } + o = tail = list; while ((m = ms = (struct mbuf *)o) != NULL) { @@ -1815,37 +2129,33 @@ cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged) /* Do the mbuf sanity checks */ if (mclaudit != NULL) { mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); - mcache_audit_free_verify(mca, m, 0, m_maxsize(MC_MBUF)); + if (mclverify) { + mcache_audit_free_verify(mca, m, 0, + m_maxsize(MC_MBUF)); + } ms = (struct mbuf *)mca->mca_contents; } /* Do the cluster sanity checks */ cl = ms->m_ext.ext_buf; clsp = slab_get(cl); - if (mclaudit != NULL) { - size_t size; - if (class == MC_MBUF_CL) - size = m_maxsize(MC_CL); - else if (class == MC_MBUF_BIGCL) - size = m_maxsize(MC_BIGCL); - else - size = m_maxsize(MC_16KCL); - mcache_audit_free_verify(mcl_audit_buf2mca(MC_CL, + if (mclverify) { + size_t size = m_maxsize(cl_class); + mcache_audit_free_verify(mcl_audit_buf2mca(cl_class, (mcache_obj_t *)cl), cl, 0, size); } VERIFY(ms->m_type == MT_FREE); VERIFY(ms->m_flags == M_EXT); VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms)); - VERIFY(clsp->sl_refcnt == 1); - if (class == MC_MBUF_BIGCL) { - nsp = clsp->sl_next; - /* Next slab must already be present */ - VERIFY(nsp != NULL); - VERIFY(nsp->sl_refcnt == 1); - } else if (class == MC_MBUF_16KCL) { + if (cl_class == MC_CL) { + VERIFY(clsp->sl_refcnt >= 1 && + clsp->sl_refcnt <= NCLPBG); + } else { + VERIFY(clsp->sl_refcnt == 1); + } + if (cl_class == MC_16KCL) { int k; - for (nsp = clsp, k = 1; - k < (M16KCLBYTES / MCLBYTES); k++) { + for 
(nsp = clsp, k = 1; k < NSLABSP16KB; k++) { nsp = nsp->sl_next; /* Next slab must already be present */ VERIFY(nsp != NULL); @@ -1926,7 +2236,7 @@ mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed, int wait) { mbuf_class_t class = (mbuf_class_t)arg; - mcache_t *cp = NULL; + mbuf_class_t cl_class = 0; unsigned int num = 0, cnum = 0, want = needed; mcache_obj_t *ref_list = NULL; mcache_obj_t *mp_list = NULL; @@ -1977,22 +2287,28 @@ mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed, if (!(wait & MCR_NOSLEEP)) wait |= MCR_FAILOK; + /* allocate mbufs */ needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait); if (needed == 0) { ASSERT(mp_list == NULL); goto fail; } - if (class == MC_MBUF_CL) - cp = m_cache(MC_CL); - else if (class == MC_MBUF_BIGCL) - cp = m_cache(MC_BIGCL); - else - cp = m_cache(MC_16KCL); - needed = mcache_alloc_ext(cp, &clp_list, needed, wait); + + /* allocate clusters */ + if (class == MC_MBUF_CL) { + cl_class = MC_CL; + } else if (class == MC_MBUF_BIGCL) { + cl_class = MC_BIGCL; + } else { + VERIFY(class == MC_MBUF_16KCL); + cl_class = MC_16KCL; + } + needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait); if (needed == 0) { ASSERT(clp_list == NULL); goto fail; } + needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait); if (needed == 0) { ASSERT(ref_list == NULL); @@ -2025,7 +2341,6 @@ mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed, */ if (mclaudit != NULL) { mcache_audit_t *mca, *cl_mca; - size_t size; lck_mtx_lock(mbuf_mlock); mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); @@ -2048,15 +2363,22 @@ mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed, lck_mtx_unlock(mbuf_mlock); /* Technically, they are in the freelist */ - mcache_set_pattern(MCACHE_FREE_PATTERN, m, - m_maxsize(MC_MBUF)); - if (class == MC_MBUF_CL) - size = m_maxsize(MC_CL); - else if (class == MC_MBUF_BIGCL) - size = m_maxsize(MC_BIGCL); - else - size = 
m_maxsize(MC_16KCL); - mcache_set_pattern(MCACHE_FREE_PATTERN, cl, size); + if (mclverify) { + size_t size; + + mcache_set_pattern(MCACHE_FREE_PATTERN, m, + m_maxsize(MC_MBUF)); + + if (class == MC_MBUF_CL) + size = m_maxsize(MC_CL); + else if (class == MC_MBUF_BIGCL) + size = m_maxsize(MC_BIGCL); + else + size = m_maxsize(MC_16KCL); + + mcache_set_pattern(MCACHE_FREE_PATTERN, cl, + size); + } } MBUF_INIT(ms, 0, MT_FREE); @@ -2082,7 +2404,7 @@ mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed, if (mp_list != NULL) mcache_free_ext(m_cache(MC_MBUF), mp_list); if (clp_list != NULL) - mcache_free_ext(cp, clp_list); + mcache_free_ext(m_cache(cl_class), clp_list); if (ref_list != NULL) mcache_free_ext(ref_cache, ref_list); @@ -2152,7 +2474,9 @@ mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc) /* Do the mbuf sanity checks and record its transaction */ mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); mcl_audit_mbuf(mca, m, TRUE, alloc); - mcache_buffer_log(mca, m, m_cache(class)); + if (mcltrace) + mcache_buffer_log(mca, m, m_cache(class)); + if (alloc) mca->mca_uflags |= MB_COMP_INUSE; else @@ -2163,7 +2487,7 @@ mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc) * freeing, since the contents of the actual mbuf has been * pattern-filled by the above call to mcl_audit_mbuf(). 
*/ - if (!alloc) + if (!alloc && mclverify) ms = (struct mbuf *)mca->mca_contents; /* Do the cluster sanity checks and record its transaction */ @@ -2171,16 +2495,15 @@ mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc) clsp = slab_get(cl); VERIFY(ms->m_flags == M_EXT && cl != NULL); VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms)); - VERIFY(clsp->sl_refcnt == 1); - if (class == MC_MBUF_BIGCL) { - nsp = clsp->sl_next; - /* Next slab must already be present */ - VERIFY(nsp != NULL); - VERIFY(nsp->sl_refcnt == 1); - } else if (class == MC_MBUF_16KCL) { + if (class == MC_MBUF_CL) + VERIFY(clsp->sl_refcnt >= 1 && + clsp->sl_refcnt <= NCLPBG); + else + VERIFY(clsp->sl_refcnt == 1); + + if (class == MC_MBUF_16KCL) { int k; - for (nsp = clsp, k = 1; - k < (M16KCLBYTES / MCLBYTES); k++) { + for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) { nsp = nsp->sl_next; /* Next slab must already be present */ VERIFY(nsp != NULL); @@ -2196,7 +2519,9 @@ mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc) else size = m_maxsize(MC_16KCL); mcl_audit_cluster(mca, cl, size, alloc, FALSE); - mcache_buffer_log(mca, cl, m_cache(class)); + if (mcltrace) + mcache_buffer_log(mca, cl, m_cache(class)); + if (alloc) mca->mca_uflags |= MB_COMP_INUSE; else @@ -2221,8 +2546,8 @@ m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize) mcache_obj_t *con_list = NULL; mcl_slab_t *sp; - VERIFY(bufsize == m_maxsize(MC_CL) || - bufsize == m_maxsize(MC_BIGCL) || bufsize == m_maxsize(MC_16KCL)); + VERIFY(bufsize == m_maxsize(MC_BIGCL) || + bufsize == m_maxsize(MC_16KCL)); lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); @@ -2258,7 +2583,7 @@ m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize) page = kmem_mb_alloc(mb_map, size, large_buffer); /* - * If we did ask for "n" 16K physically contiguous chunks + * If we did ask for "n" 16KB physically contiguous chunks * and didn't get them, then please try again without this * restriction. 
*/ @@ -2266,8 +2591,8 @@ m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize) page = kmem_mb_alloc(mb_map, size, 0); if (page == 0) { - if (bufsize <= m_maxsize(MC_BIGCL)) { - /* Try for 1 page if failed, only for 2KB/4KB request */ + if (bufsize == m_maxsize(MC_BIGCL)) { + /* Try for 1 page if failed, only 4KB request */ size = NBPG; page = kmem_mb_alloc(mb_map, size, 0); } @@ -2288,24 +2613,20 @@ m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize) /* * Yes, I realize this is a waste of memory for clusters * that never get transformed into mbufs, as we may end - * up with NMBPCL-1 unused audit structures per cluster. + * up with NMBPBG-1 unused audit structures per cluster. * But doing so tremendously simplifies the allocation * strategy, since at this point we are not holding the - * mbuf lock and the caller is okay to be blocked. For - * the case of big clusters, we allocate one structure - * for each as we never turn them into mbufs. + * mbuf lock and the caller is okay to be blocked. 
*/ - if (bufsize == m_maxsize(MC_CL)) { - needed = numpages * 2 * NMBPCL; + if (bufsize == m_maxsize(MC_BIGCL)) { + needed = numpages * NMBPBG; i = mcache_alloc_ext(mcl_audit_con_cache, &con_list, needed, MCR_SLEEP); VERIFY(con_list != NULL && i == needed); - } else if (bufsize == m_maxsize(MC_BIGCL)) { - needed = numpages; } else { - needed = numpages / (M16KCLBYTES / NBPG); + needed = numpages / NSLABSP16KB; } i = mcache_alloc_ext(mcache_audit_cache, @@ -2331,68 +2652,23 @@ m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize) mcl_paddr[offset] = new_page << PGSHIFT; /* Pattern-fill this fresh page */ - if (mclaudit != NULL) + if (mclverify) { mcache_set_pattern(MCACHE_FREE_PATTERN, (caddr_t)page, NBPG); - - if (bufsize == m_maxsize(MC_CL)) { - union mcluster *mcl = (union mcluster *)page; - - /* 1st cluster in the page */ - sp = slab_get(mcl); - if (mclaudit != NULL) - mcl_audit_init(mcl, &mca_list, &con_list, - AUDIT_CONTENTS_SIZE, NMBPCL); - - VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0); - slab_init(sp, MC_CL, SLF_MAPPED, - mcl, mcl, bufsize, 0, 1); - - /* Insert this slab */ - slab_insert(sp, MC_CL); - - /* Update stats now since slab_get() drops the lock */ - mbstat.m_clfree = ++m_infree(MC_CL) + - m_infree(MC_MBUF_CL); - mbstat.m_clusters = ++m_total(MC_CL); - VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL)); - - /* 2nd cluster in the page */ - sp = slab_get(++mcl); - if (mclaudit != NULL) - mcl_audit_init(mcl, &mca_list, &con_list, - AUDIT_CONTENTS_SIZE, NMBPCL); - - VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0); - slab_init(sp, MC_CL, SLF_MAPPED, - mcl, mcl, bufsize, 0, 1); - - /* Insert this slab */ - slab_insert(sp, MC_CL); - - /* Update stats now since slab_get() drops the lock */ - mbstat.m_clfree = ++m_infree(MC_CL) + - m_infree(MC_MBUF_CL); - mbstat.m_clusters = ++m_total(MC_CL); - VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL)); - } else if (bufsize == m_maxsize(MC_BIGCL)) { + } + if (bufsize == m_maxsize(MC_BIGCL)) { union 
mbigcluster *mbc = (union mbigcluster *)page; - mcl_slab_t *nsp; /* One for the entire page */ sp = slab_get(mbc); - if (mclaudit != NULL) - mcl_audit_init(mbc, &mca_list, NULL, 0, 1); - + if (mclaudit != NULL) { + mcl_audit_init(mbc, &mca_list, &con_list, + AUDIT_CONTENTS_SIZE, NMBPBG); + } VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0); slab_init(sp, MC_BIGCL, SLF_MAPPED, mbc, mbc, bufsize, 0, 1); - /* 2nd cluster's slab is part of the previous one */ - nsp = slab_get(((union mcluster *)page) + 1); - slab_init(nsp, MC_BIGCL, SLF_MAPPED | SLF_PARTIAL, - mbc, NULL, 0, 0, 0); - /* Insert this slab */ slab_insert(sp, MC_BIGCL); @@ -2401,7 +2677,7 @@ m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize) m_infree(MC_MBUF_BIGCL); mbstat.m_bigclusters = ++m_total(MC_BIGCL); VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL)); - } else if ((i % (M16KCLBYTES / NBPG)) == 0) { + } else if ((i % NSLABSP16KB) == 0) { union m16kcluster *m16kcl = (union m16kcluster *)page; mcl_slab_t *nsp; int k; @@ -2416,9 +2692,12 @@ m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize) slab_init(sp, MC_16KCL, SLF_MAPPED, m16kcl, m16kcl, bufsize, 0, 1); - /* 2nd-8th cluster's slab is part of the first one */ - for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) { - nsp = slab_get(((union mcluster *)page) + k); + /* + * 2nd-Nth page's slab is part of the first one, + * where N is NSLABSP16KB. 
+ */ + for (k = 1; k < NSLABSP16KB; k++) { + nsp = slab_get(((union mbigcluster *)page) + k); VERIFY(nsp->sl_refcnt == 0 && nsp->sl_flags == 0); slab_init(nsp, MC_16KCL, @@ -2444,13 +2723,11 @@ m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize) wakeup(mb_clalloc_waitchan); } - if (bufsize == m_maxsize(MC_CL)) - return (numpages << 1); - else if (bufsize == m_maxsize(MC_BIGCL)) + if (bufsize == m_maxsize(MC_BIGCL)) return (numpages); VERIFY(bufsize == m_maxsize(MC_16KCL)); - return (numpages / (M16KCLBYTES / NBPG)); + return (numpages / NSLABSP16KB); out: lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); @@ -2466,23 +2743,7 @@ m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize) * When non-blocking we kick a thread if we have to grow the * pool or if the number of free clusters is less than requested. */ - if (bufsize == m_maxsize(MC_CL)) { - if (i > 0) { - /* - * Remember total number of clusters needed - * at this time. - */ - i += m_total(MC_CL); - if (i > mbuf_expand_mcl) { - mbuf_expand_mcl = i; - if (mbuf_worker_ready) - wakeup((caddr_t)&mbuf_worker_run); - } - } - - if (m_infree(MC_CL) >= num) - return (1); - } else if (bufsize == m_maxsize(MC_BIGCL)) { + if (bufsize == m_maxsize(MC_BIGCL)) { if (i > 0) { /* * Remember total number of 4KB clusters needed @@ -2525,44 +2786,30 @@ static int freelist_populate(mbuf_class_t class, unsigned int num, int wait) { mcache_obj_t *o = NULL; - int i; + int i, numpages = 0, count; VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL || class == MC_16KCL); -#if CONFIG_MBUF_NOEXPAND - if ((mbstat.m_mbufs / NMBPCL) >= maxmbufcl) { -#if DEBUG - static int printonce = 1; - if (printonce == 1) { - printonce = 0; - printf("m_expand failed, allocated %ld out of %d " - "clusters\n", mbstat.m_mbufs / NMBPCL, - nmbclusters); - } -#endif /* DEBUG */ - return (0); - } -#endif /* CONFIG_MBUF_NOEXPAND */ - lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); switch (class) { case MC_MBUF: 
case MC_CL: - i = m_clalloc(num, wait, m_maxsize(MC_CL)); + case MC_BIGCL: + numpages = (num * m_size(class) + NBPG - 1) / NBPG; + i = m_clalloc(numpages, wait, m_maxsize(MC_BIGCL)); - /* Respect the 2K clusters minimum limit */ - if (m_total(MC_CL) == m_maxlimit(MC_CL) && - m_infree(MC_CL) <= m_minlimit(MC_CL)) { - if (class != MC_CL || (wait & MCR_COMP)) + /* Respect the 4KB clusters minimum limit */ + if (m_total(MC_BIGCL) == m_maxlimit(MC_BIGCL) && + m_infree(MC_BIGCL) <= m_minlimit(MC_BIGCL)) { + if (class != MC_BIGCL || (wait & MCR_COMP)) return (0); } - if (class == MC_CL) + if (class == MC_BIGCL) return (i != 0); break; - case MC_BIGCL: case MC_16KCL: return (m_clalloc(num, wait, m_maxsize(class)) != 0); /* NOTREACHED */ @@ -2572,66 +2819,119 @@ freelist_populate(mbuf_class_t class, unsigned int num, int wait) /* NOTREACHED */ } - /* Steal a cluster and cut it up to create NMBPCL mbufs */ - if ((o = slab_alloc(MC_CL, wait)) != NULL) { + VERIFY(class == MC_MBUF || class == MC_CL); + + /* how many objects will we cut the page into? */ + int numobj = (class == MC_MBUF ? 
NMBPBG : NCLPBG); + + for (count = 0; count < numpages; count++) { + + /* respect totals, minlimit, maxlimit */ + if (m_total(MC_BIGCL) <= m_minlimit(MC_BIGCL) || + m_total(class) >= m_maxlimit(class)) + break; + + if ((o = slab_alloc(MC_BIGCL, wait)) == NULL) + break; + struct mbuf *m = (struct mbuf *)o; - mcache_audit_t *mca = NULL; + union mcluster *c = (union mcluster *)o; mcl_slab_t *sp = slab_get(o); + mcache_audit_t *mca = NULL; VERIFY(slab_is_detached(sp) && (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED); - /* Make sure that the cluster is unmolested while in freelist */ - if (mclaudit != NULL) { - mca = mcl_audit_buf2mca(MC_CL, o); - mcache_audit_free_verify(mca, o, 0, m_maxsize(MC_CL)); + /* + * Make sure that the cluster is unmolested + * while in freelist + */ + if (mclverify) { + mca = mcl_audit_buf2mca(MC_BIGCL, o); + mcache_audit_free_verify(mca, o, 0, + m_maxsize(MC_BIGCL)); } - /* Reinitialize it as an mbuf slab */ - slab_init(sp, MC_MBUF, sp->sl_flags, sp->sl_base, NULL, - sp->sl_len, 0, NMBPCL); + /* Reinitialize it as an mbuf or 2K slab */ + slab_init(sp, class, sp->sl_flags, + sp->sl_base, NULL, sp->sl_len, 0, numobj); - VERIFY(m == (struct mbuf *)sp->sl_base); + VERIFY(o == (mcache_obj_t *)sp->sl_base); VERIFY(sp->sl_head == NULL); - m_total(MC_MBUF) += NMBPCL; - mbstat.m_mbufs = m_total(MC_MBUF); - m_infree(MC_MBUF) += NMBPCL; - mtype_stat_add(MT_FREE, NMBPCL); + VERIFY(m_total(MC_BIGCL) > 0); + m_total(MC_BIGCL)--; + mbstat.m_bigclusters = m_total(MC_BIGCL); - i = NMBPCL; - while (i--) { - /* - * If auditing is enabled, construct the shadow mbuf - * in the audit structure instead of the actual one. - * mbuf_slab_audit() will take care of restoring the - * contents after the integrity check. 
- */ - if (mclaudit != NULL) { - struct mbuf *ms; - mca = mcl_audit_buf2mca(MC_MBUF, - (mcache_obj_t *)m); - ms = ((struct mbuf *)mca->mca_contents); - ms->m_type = MT_FREE; - } else { - m->m_type = MT_FREE; + m_total(class) += numobj; + m_infree(class) += numobj; + + VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL)); + VERIFY(m_total(class) <= m_maxlimit(class)); + + i = numobj; + if (class == MC_MBUF) { + mbstat.m_mbufs = m_total(MC_MBUF); + mtype_stat_add(MT_FREE, NMBPBG); + while (i--) { + /* + * If auditing is enabled, construct the + * shadow mbuf in the audit structure + * instead of the actual one. + * mbuf_slab_audit() will take care of + * restoring the contents after the + * integrity check. + */ + if (mclaudit != NULL) { + struct mbuf *ms; + mca = mcl_audit_buf2mca(MC_MBUF, + (mcache_obj_t *)m); + ms = ((struct mbuf *) + mca->mca_contents); + ms->m_type = MT_FREE; + } else { + m->m_type = MT_FREE; + } + m->m_next = sp->sl_head; + sp->sl_head = (void *)m++; + } + } else { /* MC_CL */ + mbstat.m_clfree = + m_infree(MC_CL) + m_infree(MC_MBUF_CL); + mbstat.m_clusters = m_total(MC_CL); + while (i--) { + c->mcl_next = sp->sl_head; + sp->sl_head = (void *)c++; } - m->m_next = sp->sl_head; - sp->sl_head = (void *)m++; } - /* Insert it into the mbuf class's slab list */ - slab_insert(sp, MC_MBUF); + /* Insert into the mbuf or 2k slab list */ + slab_insert(sp, class); if ((i = mb_waiters) > 0) mb_waiters = 0; if (i != 0) wakeup(mb_waitchan); - - return (1); } + return (count != 0); +} - return (0); +/* + * For each class, initialize the freelist to hold m_minlimit() objects. 
+ */ +static void +freelist_init(mbuf_class_t class) +{ + lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + + VERIFY(class == MC_CL || class == MC_BIGCL); + VERIFY(m_total(class) == 0); + VERIFY(m_minlimit(class) > 0); + + while (m_total(class) < m_minlimit(class)) + (void) freelist_populate(class, m_minlimit(class), M_WAIT); + + VERIFY(m_total(class) >= m_minlimit(class)); } /* @@ -2736,17 +3036,23 @@ m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp) switch (class) { case MC_MBUF: m_wantpurge(MC_CL)++; + m_wantpurge(MC_BIGCL)++; m_wantpurge(MC_MBUF_CL)++; m_wantpurge(MC_MBUF_BIGCL)++; break; case MC_CL: m_wantpurge(MC_MBUF)++; + m_wantpurge(MC_BIGCL)++; + m_wantpurge(MC_MBUF_BIGCL)++; if (!comp) m_wantpurge(MC_MBUF_CL)++; break; case MC_BIGCL: + m_wantpurge(MC_MBUF)++; + m_wantpurge(MC_CL)++; + m_wantpurge(MC_MBUF_CL)++; if (!comp) m_wantpurge(MC_MBUF_BIGCL)++; break; @@ -2894,11 +3200,11 @@ m_free(struct mbuf *m) if (m->m_flags & M_EXT) { u_int32_t refcnt; - u_int32_t flags; + u_int32_t composite; refcnt = m_decref(m); - flags = MEXT_FLAGS(m); - if (refcnt == 0 && flags == 0) { + composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE); + if (refcnt == 0 && !composite) { if (m->m_ext.ext_free == NULL) { mcache_free(m_cache(MC_CL), m->m_ext.ext_buf); } else if (m->m_ext.ext_free == m_bigfree) { @@ -2913,7 +3219,7 @@ m_free(struct mbuf *m) } mcache_free(ref_cache, MEXT_RFA(m)); MEXT_RFA(m) = NULL; - } else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) { + } else if (refcnt == 0 && composite) { VERIFY(m->m_type != MT_FREE); mtype_stat_dec(m->m_type); @@ -2924,6 +3230,8 @@ m_free(struct mbuf *m) m->m_len = 0; m->m_next = m->m_nextpkt = NULL; + MEXT_FLAGS(m) &= ~EXTF_READONLY; + /* "Free" into the intermediate cache */ if (m->m_ext.ext_free == NULL) { mcache_free(m_cache(MC_MBUF_CL), m); @@ -2963,11 +3271,11 @@ m_clattach(struct mbuf *m, int type, caddr_t extbuf, if (m->m_flags & M_EXT) { u_int32_t refcnt; - u_int32_t flags; + u_int32_t composite; refcnt = 
m_decref(m); - flags = MEXT_FLAGS(m); - if (refcnt == 0 && flags == 0) { + composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE); + if (refcnt == 0 && !composite) { if (m->m_ext.ext_free == NULL) { mcache_free(m_cache(MC_CL), m->m_ext.ext_buf); } else if (m->m_ext.ext_free == m_bigfree) { @@ -2982,7 +3290,7 @@ m_clattach(struct mbuf *m, int type, caddr_t extbuf, } /* Re-use the reference structure */ rfa = MEXT_RFA(m); - } else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) { + } else if (refcnt == 0 && composite) { VERIFY(m->m_type != MT_FREE); mtype_stat_dec(m->m_type); @@ -2992,6 +3300,9 @@ m_clattach(struct mbuf *m, int type, caddr_t extbuf, m->m_flags = M_EXT; m->m_len = 0; m->m_next = m->m_nextpkt = NULL; + + MEXT_FLAGS(m) &= ~EXTF_READONLY; + /* "Free" into the intermediate cache */ if (m->m_ext.ext_free == NULL) { mcache_free(m_cache(MC_MBUF_CL), m); @@ -3036,14 +3347,29 @@ m_getcl(int wait, int type, int flags) if (mcflags & MCR_NOSLEEP) mcflags |= MCR_TRYHARD; - m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags); - if (m != NULL) { + m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags); + if (m != NULL) { + u_int32_t flag; + struct ext_ref *rfa; + void *cl; + + VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT); + cl = m->m_ext.ext_buf; + rfa = MEXT_RFA(m); + + ASSERT(cl != NULL && rfa != NULL); + VERIFY(MBUF_IS_COMPOSITE(m) && m->m_ext.ext_free == NULL); + + flag = MEXT_FLAGS(m); + MBUF_INIT(m, hdr, type); + MBUF_CL_INIT(m, cl, rfa, 1, flag); + mtype_stat_inc(type); mtype_stat_dec(MT_FREE); #if CONFIG_MACF_NET if (hdr && mac_init_mbuf(m, wait) != 0) { - m_free(m); + m_freem(m); return (NULL); } #endif /* MAC_NET */ @@ -3091,7 +3417,7 @@ m_mclfree(caddr_t p) /* * mcl_hasreference() checks if a cluster of an mbuf is referenced by - * another mbuf + * another mbuf; see comments in m_incref() regarding EXTF_READONLY. 
*/ int m_mclhasreference(struct mbuf *m) @@ -3101,7 +3427,7 @@ m_mclhasreference(struct mbuf *m) ASSERT(MEXT_RFA(m) != NULL); - return (MEXT_REF(m) > 1); + return ((MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0); } __private_extern__ caddr_t @@ -3292,7 +3618,7 @@ m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs, --num_with_pkthdrs; #if CONFIG_MACF_NET if (mac_mbuf_label_init(m, wait) != 0) { - m_free(m); + m_freem(m); break; } #endif /* MAC_NET */ @@ -3608,7 +3934,7 @@ m_allocpacket_internal(unsigned int *numlist, size_t packetlen, #if CONFIG_MACF_NET if (pkthdr && mac_init_mbuf(m, wait) != 0) { --num; - m_free(m); + m_freem(m); break; } #endif /* MAC_NET */ @@ -3745,7 +4071,7 @@ m_freem_list(struct mbuf *m) while (m != NULL) { struct mbuf *next = m->m_next; mcache_obj_t *o, *rfa; - u_int32_t refcnt, flags; + u_int32_t refcnt, composite; if (m->m_type == MT_FREE) panic("m_free: freeing an already freed mbuf"); @@ -3762,8 +4088,8 @@ m_freem_list(struct mbuf *m) o = (mcache_obj_t *)m->m_ext.ext_buf; refcnt = m_decref(m); - flags = MEXT_FLAGS(m); - if (refcnt == 0 && flags == 0) { + composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE); + if (refcnt == 0 && !composite) { if (m->m_ext.ext_free == NULL) { o->obj_next = mcl_list; mcl_list = o; @@ -3782,7 +4108,7 @@ m_freem_list(struct mbuf *m) rfa->obj_next = ref_list; ref_list = rfa; MEXT_RFA(m) = NULL; - } else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) { + } else if (refcnt == 0 && composite) { VERIFY(m->m_type != MT_FREE); /* * Amortize the costs of atomic operations @@ -3804,6 +4130,8 @@ m_freem_list(struct mbuf *m) m->m_len = 0; m->m_next = m->m_nextpkt = NULL; + MEXT_FLAGS(m) &= ~EXTF_READONLY; + /* "Free" into the intermediate cache */ o = (mcache_obj_t *)m; if (m->m_ext.ext_free == NULL) { @@ -4067,7 +4395,7 @@ m_copym(struct mbuf *m, int off0, int len, int wait) */ struct mbuf * m_copym_with_hdrs(struct mbuf *m, int off0, int len0, int wait, - struct mbuf **m_last, int *m_off) + struct mbuf **m_lastm, int 
*m_off) { struct mbuf *n, **np = NULL; int off = off0, len = len0; @@ -4081,8 +4409,8 @@ m_copym_with_hdrs(struct mbuf *m, int off0, int len0, int wait, if (off == 0 && (m->m_flags & M_PKTHDR)) copyhdr = 1; - if (*m_last != NULL) { - m = *m_last; + if (*m_lastm != NULL) { + m = *m_lastm; off = *m_off; } else { while (off >= m->m_len) { @@ -4159,10 +4487,10 @@ m_copym_with_hdrs(struct mbuf *m, int off0, int len0, int wait, if (len == 0) { if ((off + n->m_len) == m->m_len) { - *m_last = m->m_next; + *m_lastm = m->m_next; *m_off = 0; } else { - *m_last = m; + *m_lastm = m; *m_off = off + n->m_len; } break; @@ -4385,6 +4713,56 @@ m_pullup(struct mbuf *n, int len) return (0); } +/* + * Like m_pullup(), except a new mbuf is always allocated, and we allow + * the amount of empty space before the data in the new mbuf to be specified + * (in the event that the caller expects to prepend later). + */ +__private_extern__ int MSFail = 0; + +__private_extern__ struct mbuf * +m_copyup(struct mbuf *n, int len, int dstoff) +{ + struct mbuf *m; + int count, space; + + if (len > (MHLEN - dstoff)) + goto bad; + MGET(m, M_DONTWAIT, n->m_type); + if (m == NULL) + goto bad; + m->m_len = 0; + if (n->m_flags & M_PKTHDR) { + m_copy_pkthdr(m, n); + n->m_flags &= ~M_PKTHDR; + } + m->m_data += dstoff; + space = &m->m_dat[MLEN] - (m->m_data + m->m_len); + do { + count = min(min(max(len, max_protohdr), space), n->m_len); + memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t), + (unsigned)count); + len -= count; + m->m_len += count; + n->m_len -= count; + space -= count; + if (n->m_len) + n->m_data += count; + else + n = m_free(n); + } while (len > 0 && n); + if (len > 0) { + (void) m_free(m); + goto bad; + } + m->m_next = n; + return (m); +bad: + m_freem(n); + MSFail++; + return (NULL); +} + /* * Partition an mbuf chain in two pieces, returning the tail -- * all but the first len0 bytes. 
In case of failure, it returns NULL and @@ -4531,29 +4909,9 @@ m_devget(char *buf, int totlen, int off0, struct ifnet *ifp, return (top); } -void -mbuf_growth_aggressive(void) -{ - lck_mtx_lock(mbuf_mlock); - /* - * Don't start to grow the pool until we are at least - * 1/2 (50%) of current total capacity. - */ - mbuf_gscale = MB_GROWTH_AGGRESSIVE; - lck_mtx_unlock(mbuf_mlock); -} - -void -mbuf_growth_normal(void) -{ - lck_mtx_lock(mbuf_mlock); - /* - * Don't start to grow the pool until we are at least - * 15/16 (93.75%) of current total capacity. - */ - mbuf_gscale = MB_GROWTH_NORMAL; - lck_mtx_unlock(mbuf_mlock); -} +#ifndef MBUF_GROWTH_NORMAL_THRESH +#define MBUF_GROWTH_NORMAL_THRESH 25 +#endif /* * Cluster freelist allocation check. @@ -4562,94 +4920,121 @@ static int m_howmany(int num, size_t bufsize) { int i = 0, j = 0; - u_int32_t m_clusters, m_bigclusters, m_16kclusters; - u_int32_t m_clfree, m_bigclfree, m_16kclfree; - u_int32_t s = mbuf_gscale; + u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters; + u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree; + u_int32_t sumclusters, freeclusters; + u_int32_t percent_pool, percent_kmem; + u_int32_t mb_growth, mb_growth_thresh; + + VERIFY(bufsize == m_maxsize(MC_BIGCL) || + bufsize == m_maxsize(MC_16KCL)); lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + /* Numbers in 2K cluster units */ + m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT; m_clusters = m_total(MC_CL); - m_bigclusters = m_total(MC_BIGCL); + m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT; m_16kclusters = m_total(MC_16KCL); + sumclusters = m_mbclusters + m_clusters + m_bigclusters; + + m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT; m_clfree = m_infree(MC_CL); - m_bigclfree = m_infree(MC_BIGCL); + m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT; m_16kclfree = m_infree(MC_16KCL); + freeclusters = m_mbfree + m_clfree + m_bigclfree; /* Bail if we've maxed out the mbuf memory map */ - if ((bufsize != m_maxsize(MC_16KCL) && - (m_clusters 
+ (m_bigclusters << 1) >= nclusters)) || + if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) || (njcl > 0 && bufsize == m_maxsize(MC_16KCL) && - (m_16kclusters << 3) >= njcl)) { -#if DEBUG - if (bufsize == MCLBYTES && num > m_clfree) { - printf("m_howmany - out of small clusters, " - "%d short\n", num - mbstat.m_clfree); - } -#endif /* DEBUG */ + (m_16kclusters << NCLPJCLSHIFT) >= njcl)) { return (0); } - if (bufsize == m_maxsize(MC_CL)) { + if (bufsize == m_maxsize(MC_BIGCL)) { /* Under minimum */ - if (m_clusters < MINCL) - return (MINCL - m_clusters); - /* Too few (free < threshold) and not over maximum */ - if (m_clusters < m_maxlimit(MC_CL)) { - if (m_clfree >= MCL_LOWAT) + if (m_bigclusters < m_minlimit(MC_BIGCL)) + return (m_minlimit(MC_BIGCL) - m_bigclusters); + + percent_pool = + ((sumclusters - freeclusters) * 100) / sumclusters; + percent_kmem = (sumclusters * 100) / nclusters; + + /* + * If a light/normal user, grow conservatively (75%) + * If a heavy user, grow aggressively (50%) + */ + if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH) + mb_growth = MB_GROWTH_NORMAL; + else + mb_growth = MB_GROWTH_AGGRESSIVE; + + if (percent_kmem < 5) { + /* For initial allocations */ + i = num; + } else { + /* Return if >= MBIGCL_LOWAT clusters available */ + if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT && + m_total(MC_BIGCL) >= + MBIGCL_LOWAT + m_minlimit(MC_BIGCL)) return (0); - if (num >= m_clfree) - i = num - m_clfree; - if (((m_clusters + num) >> s) > m_clfree) - j = ((m_clusters + num) >> s) - m_clfree; + + /* Ensure at least num clusters are accessible */ + if (num >= m_infree(MC_BIGCL)) + i = num - m_infree(MC_BIGCL); + if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL)) + j = num - (m_total(MC_BIGCL) - + m_minlimit(MC_BIGCL)); + i = MAX(i, j); - if (i + m_clusters >= m_maxlimit(MC_CL)) - i = m_maxlimit(MC_CL) - m_clusters; - } - VERIFY((m_total(MC_CL) + i) <= m_maxlimit(MC_CL)); - } else if (bufsize == m_maxsize(MC_BIGCL)) { - /* Under minimum */ - if 
(m_bigclusters < MINBIGCL) - return (MINBIGCL - m_bigclusters); - /* Too few (free < 1/16 total) and not over maximum */ - if (m_bigclusters < m_maxlimit(MC_BIGCL)) { - if (m_bigclfree >= MBIGCL_LOWAT) - return (0); - if (num >= m_bigclfree) - i = num - m_bigclfree; - if (((m_bigclusters + num) >> 4) > m_bigclfree) - j = ((m_bigclusters + num) >> 4) - m_bigclfree; + + /* + * Grow pool if percent_pool > 75 (normal growth) + * or percent_pool > 50 (aggressive growth). + */ + mb_growth_thresh = 100 - (100 / (1 << mb_growth)); + if (percent_pool > mb_growth_thresh) + j = ((sumclusters + num) >> mb_growth) - + freeclusters; i = MAX(i, j); - if (i + m_bigclusters >= m_maxlimit(MC_BIGCL)) - i = m_maxlimit(MC_BIGCL) - m_bigclusters; } + + /* Check to ensure we didn't go over limits */ + if (i + m_bigclusters >= m_maxlimit(MC_BIGCL)) + i = m_maxlimit(MC_BIGCL) - m_bigclusters; + if ((i << 1) + sumclusters >= nclusters) + i = (nclusters - sumclusters) >> 1; VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL)); - } else { + VERIFY(sumclusters + (i << 1) <= nclusters); + + } else { /* 16K CL */ VERIFY(njcl > 0); /* Under minimum */ if (m_16kclusters < MIN16KCL) return (MIN16KCL - m_16kclusters); - /* Too few (free < 1/16 total) and not over maximum */ - if (m_16kclusters < m_maxlimit(MC_16KCL)) { - if (m_16kclfree >= M16KCL_LOWAT) - return (0); - if (num >= m_16kclfree) - i = num - m_16kclfree; - if (((m_16kclusters + num) >> 4) > m_16kclfree) - j = ((m_16kclusters + num) >> 4) - m_16kclfree; - i = MAX(i, j); - if (i + m_16kclusters >= m_maxlimit(MC_16KCL)) - i = m_maxlimit(MC_16KCL) - m_16kclusters; - } + if (m_16kclfree >= M16KCL_LOWAT) + return (0); + + /* Ensure at least num clusters are available */ + if (num >= m_16kclfree) + i = num - m_16kclfree; + + /* Always grow 16KCL pool aggressively */ + if (((m_16kclusters + num) >> 1) > m_16kclfree) + j = ((m_16kclusters + num) >> 1) - m_16kclfree; + i = MAX(i, j); + + /* Check to ensure we don't go over limit */ + if (i + 
m_16kclusters >= m_maxlimit(MC_16KCL)) + i = m_maxlimit(MC_16KCL) - m_16kclusters; VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL)); } - return (i); } - /* * Return the number of bytes in the mbuf chain, m. - */ -static unsigned int + */ +unsigned int m_length(struct mbuf *m) { struct mbuf *m0; @@ -5157,6 +5542,61 @@ m_normalize(struct mbuf *m) return (top); } +/* + * Append the specified data to the indicated mbuf chain, + * Extend the mbuf chain if the new data does not fit in + * existing space. + * + * Return 1 if able to complete the job; otherwise 0. + */ +int +m_append(struct mbuf *m0, int len, caddr_t cp) +{ + struct mbuf *m, *n; + int remainder, space; + + for (m = m0; m->m_next != NULL; m = m->m_next) + ; + remainder = len; + space = M_TRAILINGSPACE(m); + if (space > 0) { + /* + * Copy into available space. + */ + if (space > remainder) + space = remainder; + bcopy(cp, mtod(m, caddr_t) + m->m_len, space); + m->m_len += space; + cp += space, remainder -= space; + } + while (remainder > 0) { + /* + * Allocate a new mbuf; could check space + * and allocate a cluster instead. + */ + n = m_get(M_WAITOK, m->m_type); + if (n == NULL) + break; + n->m_len = min(MLEN, remainder); + bcopy(cp, mtod(n, caddr_t), n->m_len); + cp += n->m_len; + remainder -= n->m_len; + m->m_next = n; + m = n; + } + if (m0->m_flags & M_PKTHDR) + m0->m_pkthdr.len += len - remainder; + return (remainder == 0); +} + +struct mbuf * +m_last(struct mbuf *m) +{ + while (m->m_next != NULL) + m = m->m_next; + return (m); +} + void m_mchtype(struct mbuf *m, int t) { @@ -5183,6 +5623,34 @@ m_mcheck(struct mbuf *m) _MCHECK(m); } +/* + * Return a pointer to mbuf/offset of location in mbuf chain. + */ +struct mbuf * +m_getptr(struct mbuf *m, int loc, int *off) +{ + + while (loc >= 0) { + /* Normal end of search. */ + if (m->m_len > loc) { + *off = loc; + return (m); + } else { + loc -= m->m_len; + if (m->m_next == NULL) { + if (loc == 0) { + /* Point at the end of valid data. 
*/ + *off = m->m_len; + return (m); + } + return (NULL); + } + m = m->m_next; + } + } + return (NULL); +} + /* * Inform the corresponding mcache(s) that there's a waiter below. */ @@ -5225,6 +5693,29 @@ mbuf_waiter_dec(mbuf_class_t class, boolean_t comp) } } +/* + * Called during slab (blocking and non-blocking) allocation. If there + * is at least one waiter, and the time since the first waiter is blocked + * is greater than the watchdog timeout, panic the system. + */ +static void +mbuf_watchdog(void) +{ + struct timeval now; + unsigned int since; + + if (mb_waiters == 0 || !mb_watchdog) + return; + + microuptime(&now); + since = now.tv_sec - mb_wdtstart.tv_sec; + if (since >= MB_WDT_MAXTIME) { + panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__, + mb_waiters, since, mbuf_dump()); + /* NOTREACHED */ + } +} + /* * Called during blocking allocation. Returns TRUE if one or more objects * are available at the per-CPU caches layer and that allocation should be @@ -5266,6 +5757,16 @@ mbuf_sleep(mbuf_class_t class, unsigned int num, int wait) mbuf_waiter_inc(class, (wait & MCR_COMP)); VERIFY(!(wait & MCR_NOSLEEP)); + + /* + * If this is the first waiter, arm the watchdog timer. Otherwise + * check if we need to panic the system due to watchdog timeout. 
+ */ + if (mb_waiters == 0) + microuptime(&mb_wdtstart); + else + mbuf_watchdog(); + mb_waiters++; (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL); @@ -5420,7 +5921,7 @@ slab_get(void *buf) } } - ix = MTOCL(buf) % NSLABSPMB; + ix = MTOBG(buf) % NSLABSPMB; VERIFY(ix < NSLABSPMB); return (&slg->slg_slab[ix]); @@ -5447,15 +5948,9 @@ slab_insert(mcl_slab_t *sp, mbuf_class_t class) m_slab_cnt(class)++; TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link); sp->sl_flags &= ~SLF_DETACHED; - if (class == MC_BIGCL) { - sp = sp->sl_next; - /* Next slab must already be present */ - VERIFY(sp != NULL); - VERIFY(slab_is_detached(sp)); - sp->sl_flags &= ~SLF_DETACHED; - } else if (class == MC_16KCL) { + if (class == MC_16KCL) { int k; - for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) { + for (k = 1; k < NSLABSP16KB; k++) { sp = sp->sl_next; /* Next slab must already be present */ VERIFY(sp != NULL); @@ -5473,15 +5968,9 @@ slab_remove(mcl_slab_t *sp, mbuf_class_t class) m_slab_cnt(class)--; TAILQ_REMOVE(&m_slablist(class), sp, sl_link); slab_detach(sp); - if (class == MC_BIGCL) { - sp = sp->sl_next; - /* Next slab must already be present */ - VERIFY(sp != NULL); - VERIFY(!slab_is_detached(sp)); - slab_detach(sp); - } else if (class == MC_16KCL) { + if (class == MC_16KCL) { int k; - for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) { + for (k = 1; k < NSLABSP16KB; k++) { sp = sp->sl_next; /* Next slab must already be present */ VERIFY(sp != NULL); @@ -5511,7 +6000,7 @@ slab_nextptr_panic(mcl_slab_t *sp, void *addr) void *next = ((mcache_obj_t *)buf)->obj_next; if (next != addr) continue; - if (mclaudit == NULL) { + if (!mclverify) { if (next != NULL && !MBUF_IN_MAP(next)) { mcache_t *cp = m_cache(sp->sl_class); panic("%s: %s buffer %p in slab %p modified " @@ -5553,12 +6042,14 @@ mcl_audit_init(void *buf, mcache_audit_t **mca_list, boolean_t save_contents = (con_list != NULL); unsigned int i, ix; - ASSERT(num <= NMBPCL); + ASSERT(num <= NMBPBG); ASSERT(con_list == 
NULL || con_size != 0); - ix = MTOCL(buf); + ix = MTOBG(buf); + VERIFY(ix < maxclaudit); + /* Make sure we haven't been here before */ - for (i = 0; i < NMBPCL; i++) + for (i = 0; i < NMBPBG; i++) VERIFY(mclaudit[ix].cl_audit[i] == NULL); mca = mca_tail = *mca_list; @@ -5594,31 +6085,39 @@ mcl_audit_init(void *buf, mcache_audit_t **mca_list, } /* - * Given an address of a buffer (mbuf/cluster/big cluster), return + * Given an address of a buffer (mbuf/2KB/4KB/16KB), return * the corresponding audit structure for that buffer. */ static mcache_audit_t * mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o) { mcache_audit_t *mca = NULL; - int ix = MTOCL(o); + int ix = MTOBG(o); + VERIFY(ix < maxclaudit); VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG))); switch (class) { case MC_MBUF: /* - * For the mbuf case, find the index of the cluster + * For the mbuf case, find the index of the page * used by the mbuf and use that index to locate the - * base address of the cluster. Then find out the - * mbuf index relative to the cluster base and use + * base address of the page. Then find out the + * mbuf index relative to the page base and use * it to locate the audit structure. */ - VERIFY(MCLIDX(CLTOM(ix), o) < (int)NMBPCL); - mca = mclaudit[ix].cl_audit[MCLIDX(CLTOM(ix), o)]; + VERIFY(MCLIDX(BGTOM(ix), o) < (int)NMBPBG); + mca = mclaudit[ix].cl_audit[MCLIDX(BGTOM(ix), o)]; break; case MC_CL: + /* + * Same thing as above, but for 2KB clusters in a page. 
+ */ + VERIFY(CLBGIDX(BGTOM(ix), o) < (int)NCLPBG); + mca = mclaudit[ix].cl_audit[CLBGIDX(BGTOM(ix), o)]; + break; + case MC_BIGCL: case MC_16KCL: /* @@ -5645,19 +6144,24 @@ mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite, VERIFY(mca->mca_contents != NULL && mca->mca_contents_size == AUDIT_CONTENTS_SIZE); - mcl_audit_verify_nextptr(next, mca); + if (mclverify) + mcl_audit_verify_nextptr(next, mca); if (!alloc) { /* Save constructed mbuf fields */ mcl_audit_save_mbuf(m, mca); - mcache_set_pattern(MCACHE_FREE_PATTERN, m, m_maxsize(MC_MBUF)); + if (mclverify) { + mcache_set_pattern(MCACHE_FREE_PATTERN, m, + m_maxsize(MC_MBUF)); + } ((mcache_obj_t *)m)->obj_next = next; return; } /* Check if the buffer has been corrupted while in freelist */ - mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF)); - + if (mclverify) { + mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF)); + } /* Restore constructed mbuf fields */ mcl_audit_restore_mbuf(m, mca, composite); } @@ -5704,12 +6208,14 @@ mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc, mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next; if (!alloc) { - mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size); + if (mclverify) { + mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size); + } if (save_next) { mcl_audit_verify_nextptr(next, mca); ((mcache_obj_t *)addr)->obj_next = next; } - } else { + } else if (mclverify) { /* Check if the buffer has been corrupted while in freelist */ mcl_audit_verify_nextptr(next, mca); mcache_audit_free_verify_set(mca, addr, 0, size); @@ -5732,8 +6238,8 @@ mcl_audit_mcheck_panic(struct mbuf *m) static void mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca) { - if (next != NULL && next != (void *)MCACHE_FREE_PATTERN && - !MBUF_IN_MAP(next)) { + if (next != NULL && !MBUF_IN_MAP(next) && + (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) { panic("mcl_audit: buffer %p modified after free at offset 0: " "%p out of range 
[%p-%p)\n%s\n", mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca)); @@ -5741,10 +6247,358 @@ mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca) } } +/* This function turns on mbuf leak detection */ +static void +mleak_activate(void) +{ + mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR; + PE_parse_boot_argn("mleak_sample_factor", + &mleak_table.mleak_sample_factor, + sizeof (mleak_table.mleak_sample_factor)); + + if (mleak_table.mleak_sample_factor == 0) + mclfindleak = 0; + + if (mclfindleak == 0) + return; + + vm_size_t alloc_size = + mleak_alloc_buckets * sizeof (struct mallocation); + vm_size_t trace_size = mleak_trace_buckets * sizeof (struct mtrace); + + MALLOC(mleak_allocations, struct mallocation *, alloc_size, + M_TEMP, M_WAITOK | M_ZERO); + VERIFY(mleak_allocations != NULL); + + MALLOC(mleak_traces, struct mtrace *, trace_size, + M_TEMP, M_WAITOK | M_ZERO); + VERIFY(mleak_traces != NULL); + + MALLOC(mleak_stat, mleak_stat_t *, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES), + M_TEMP, M_WAITOK | M_ZERO); + VERIFY(mleak_stat != NULL); + mleak_stat->ml_cnt = MLEAK_NUM_TRACES; +#ifdef __LP64__ + mleak_stat->ml_isaddr64 = 1; +#endif /* __LP64__ */ +} + +static void +mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc) +{ + int temp; + + if (mclfindleak == 0) + return; + + if (!alloc) + return (mleak_free(addr)); + + temp = atomic_add_32_ov(&mleak_table.mleak_capture, 1); + + if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) { + uintptr_t bt[MLEAK_STACK_DEPTH]; + int logged = fastbacktrace(bt, MLEAK_STACK_DEPTH); + mleak_log(bt, addr, logged, num); + } +} + +/* + * This function records the allocation in the mleak_allocations table + * and the backtrace in the mleak_traces table; if allocation slot is in use, + * replace old allocation with new one if the trace slot is in use, return + * (or increment refcount if same trace). 
+ */ +static boolean_t +mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num) +{ + struct mallocation *allocation; + struct mtrace *trace; + uint32_t trace_index; + int i; + + /* Quit if someone else modifying the tables */ + if (!lck_mtx_try_lock_spin(mleak_lock)) { + mleak_table.total_conflicts++; + return (FALSE); + } + + allocation = &mleak_allocations[hashaddr((uintptr_t)addr, + mleak_alloc_buckets)]; + trace_index = hashbacktrace(bt, depth, mleak_trace_buckets); + trace = &mleak_traces[trace_index]; + + VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]); + VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]); + + allocation->hitcount++; + trace->hitcount++; + + /* + * If the allocation bucket we want is occupied + * and the occupier has the same trace, just bail. + */ + if (allocation->element != NULL && + trace_index == allocation->trace_index) { + mleak_table.alloc_collisions++; + lck_mtx_unlock(mleak_lock); + return (TRUE); + } + + /* + * Store the backtrace in the traces array; + * Size of zero = trace bucket is free. + */ + if (trace->allocs > 0 && + bcmp(trace->addr, bt, (depth * sizeof (uintptr_t))) != 0) { + /* Different, unique trace, but the same hash! Bail out. */ + trace->collisions++; + mleak_table.trace_collisions++; + lck_mtx_unlock(mleak_lock); + return (TRUE); + } else if (trace->allocs > 0) { + /* Same trace, already added, so increment refcount */ + trace->allocs++; + } else { + /* Found an unused trace bucket, so record the trace here */ + if (trace->depth != 0) { + /* this slot previously used but not currently in use */ + mleak_table.trace_overwrites++; + } + mleak_table.trace_recorded++; + trace->allocs = 1; + memcpy(trace->addr, bt, (depth * sizeof (uintptr_t))); + trace->depth = depth; + trace->collisions = 0; + } + + /* Step 2: Store the allocation record in the allocations array */ + if (allocation->element != NULL) { + /* + * Replace an existing allocation. 
No need to preserve + * because only a subset of the allocations are being + * recorded anyway. + */ + mleak_table.alloc_collisions++; + } else if (allocation->trace_index != 0) { + mleak_table.alloc_overwrites++; + } + allocation->element = addr; + allocation->trace_index = trace_index; + allocation->count = num; + mleak_table.alloc_recorded++; + mleak_table.outstanding_allocs++; + + /* keep a log of the last 5 traces to be top trace, in order */ + for (i = 0; i < MLEAK_NUM_TRACES; i++) { + if (mleak_top_trace[i] == NULL || + mleak_top_trace[i]->allocs <= trace->allocs) { + if (mleak_top_trace[i] != trace) { + int j = MLEAK_NUM_TRACES; + while (--j > i) { + mleak_top_trace[j] = + mleak_top_trace[j - 1]; + } + mleak_top_trace[i] = trace; + } + break; + } + } + + lck_mtx_unlock(mleak_lock); + return (TRUE); +} + +static void +mleak_free(mcache_obj_t *addr) +{ + while (addr != NULL) { + struct mallocation *allocation = &mleak_allocations + [hashaddr((uintptr_t)addr, mleak_alloc_buckets)]; + + if (allocation->element == addr && + allocation->trace_index < mleak_trace_buckets) { + lck_mtx_lock_spin(mleak_lock); + if (allocation->element == addr && + allocation->trace_index < mleak_trace_buckets) { + struct mtrace *trace; + trace = &mleak_traces[allocation->trace_index]; + /* allocs = 0 means trace bucket is unused */ + if (trace->allocs > 0) + trace->allocs--; + if (trace->allocs == 0) + trace->depth = 0; + /* NULL element means alloc bucket is unused */ + allocation->element = NULL; + mleak_table.outstanding_allocs--; + } + lck_mtx_unlock(mleak_lock); + } + addr = addr->obj_next; + } +} + +static struct mbtypes { + int mt_type; + const char *mt_name; +} mbtypes[] = { + { MT_DATA, "data" }, + { MT_OOBDATA, "oob data" }, + { MT_CONTROL, "ancillary data" }, + { MT_HEADER, "packet headers" }, + { MT_SOCKET, "socket structures" }, + { MT_PCB, "protocol control blocks" }, + { MT_RTABLE, "routing table entries" }, + { MT_HTABLE, "IMP host table entries" }, + { MT_ATABLE, 
"address resolution tables" }, + { MT_FTABLE, "fragment reassembly queue headers" }, + { MT_SONAME, "socket names and addresses" }, + { MT_SOOPTS, "socket options" }, + { MT_RIGHTS, "access rights" }, + { MT_IFADDR, "interface addresses" }, + { MT_TAG, "packet tags" }, + { 0, NULL } +}; + +#define MBUF_DUMP_BUF_CHK() { \ + clen -= k; \ + if (clen < 1) \ + goto done; \ + c += k; \ +} + +static char * +mbuf_dump(void) +{ + unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct; + u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0; + u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0; + u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0; + int nmbtypes = sizeof (mbstat.m_mtypes) / sizeof (short); + uint8_t seen[256]; + struct mbtypes *mp; + mb_class_stat_t *sp; + char *c = mbuf_dump_buf; + int i, k, clen = sizeof (mbuf_dump_buf); + + mbuf_dump_buf[0] = '\0'; + + /* synchronize all statistics in the mbuf table */ + mbuf_stat_sync(); + mbuf_mtypes_sync(TRUE); + + sp = &mb_stat->mbs_class[0]; + for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) { + u_int32_t mem; + + if (m_class(i) == MC_MBUF) { + m_mbufs = sp->mbcl_active; + } else if (m_class(i) == MC_CL) { + m_clfree = sp->mbcl_total - sp->mbcl_active; + } else if (m_class(i) == MC_BIGCL) { + m_bigclfree = sp->mbcl_total - sp->mbcl_active; + } else if (njcl > 0 && m_class(i) == MC_16KCL) { + m_16kclfree = sp->mbcl_total - sp->mbcl_active; + m_16kclusters = sp->mbcl_total; + } else if (m_class(i) == MC_MBUF_CL) { + m_mbufclfree = sp->mbcl_total - sp->mbcl_active; + } else if (m_class(i) == MC_MBUF_BIGCL) { + m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active; + } else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) { + m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active; + } + + mem = sp->mbcl_ctotal * sp->mbcl_size; + totmem += mem; + totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) * + sp->mbcl_size; + + } + + /* adjust free counts to include composite caches */ + m_clfree += m_mbufclfree; + 
m_bigclfree += m_mbufbigclfree; + m_16kclfree += m_mbuf16kclfree; + + totmbufs = 0; + for (mp = mbtypes; mp->mt_name != NULL; mp++) + totmbufs += mbstat.m_mtypes[mp->mt_type]; + if (totmbufs > m_mbufs) + totmbufs = m_mbufs; + k = snprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs); + MBUF_DUMP_BUF_CHK(); + + bzero(&seen, sizeof (seen)); + for (mp = mbtypes; mp->mt_name != NULL; mp++) { + if (mbstat.m_mtypes[mp->mt_type] != 0) { + seen[mp->mt_type] = 1; + k = snprintf(c, clen, "\t%u mbufs allocated to %s\n", + mbstat.m_mtypes[mp->mt_type], mp->mt_name); + MBUF_DUMP_BUF_CHK(); + } + } + seen[MT_FREE] = 1; + for (i = 0; i < nmbtypes; i++) + if (!seen[i] && mbstat.m_mtypes[i] != 0) { + k = snprintf(c, clen, "\t%u mbufs allocated to " + "\n", mbstat.m_mtypes[i], i); + MBUF_DUMP_BUF_CHK(); + } + if ((m_mbufs - totmbufs) > 0) { + k = snprintf(c, clen, "\t%lu mbufs allocated to caches\n", + m_mbufs - totmbufs); + MBUF_DUMP_BUF_CHK(); + } + k = snprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n" + "%u/%u mbuf 4KB clusters in use\n", + (unsigned int)(mbstat.m_clusters - m_clfree), + (unsigned int)mbstat.m_clusters, + (unsigned int)(mbstat.m_bigclusters - m_bigclfree), + (unsigned int)mbstat.m_bigclusters); + MBUF_DUMP_BUF_CHK(); + + if (njcl > 0) { + k = snprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n", + m_16kclusters - m_16kclfree, m_16kclusters, + njclbytes / 1024); + MBUF_DUMP_BUF_CHK(); + } + totused = totmem - totfree; + if (totmem == 0) { + totpct = 0; + } else if (totused < (ULONG_MAX / 100)) { + totpct = (totused * 100) / totmem; + } else { + u_long totmem1 = totmem / 100; + u_long totused1 = totused / 100; + totpct = (totused1 * 100) / totmem1; + } + k = snprintf(c, clen, "%lu KB allocated to network (approx. 
%lu%% " + "in use)\n", totmem / 1024, totpct); + MBUF_DUMP_BUF_CHK(); + +done: + return (mbuf_dump_buf); +} + +#undef MBUF_DUMP_BUF_CHK + SYSCTL_DECL(_kern_ipc); -SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RD | CTLFLAG_LOCKED, +SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat, + CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, mbstat_sysctl, "S,mbstat", ""); -SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat, CTLFLAG_RD | CTLFLAG_LOCKED, +SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat, + CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, mb_stat_sysctl, "S,mb_stat", ""); -SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized, CTLFLAG_RD | CTLFLAG_LOCKED, - &mb_normalized, 0, ""); +SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace, + CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", ""); +SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table, + CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, mleak_table_sysctl, "S,mleak_table", ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor, + CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized, + CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog, + CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, ""); diff --git a/bsd/kern/uipc_mbuf2.c b/bsd/kern/uipc_mbuf2.c index 5276ce659..386238460 100644 --- a/bsd/kern/uipc_mbuf2.c +++ b/bsd/kern/uipc_mbuf2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -105,11 +105,12 @@ #include #include #include -#if defined(PULLDOWN_STAT) && defined(INET6) +#include +#if INET6 #include #include #include -#endif +#endif /* INET6 */ #if CONFIG_MACF_NET #include @@ -131,7 +132,7 @@ m_pulldown(struct mbuf *m, int off, int len, int *offp) struct mbuf *n, *o; int hlen, tlen, olen; int sharedcluster; -#if defined(PULLDOWN_STAT) && defined(INET6) +#if defined(PULLDOWN_STAT) && INET6 static struct mbuf *prev = NULL; int prevlen = 0, prevmlen = 0; #endif @@ -144,11 +145,11 @@ m_pulldown(struct mbuf *m, int off, int len, int *offp) return NULL; /* impossible */ } -#if defined(PULLDOWN_STAT) && defined(INET6) +#if defined(PULLDOWN_STAT) && INET6 ip6stat.ip6s_pulldown++; #endif -#if defined(PULLDOWN_STAT) && defined(INET6) +#if defined(PULLDOWN_STAT) && INET6 /* statistics for m_pullup */ ip6stat.ip6s_pullup++; if (off + len > MHLEN) @@ -241,7 +242,7 @@ m_pulldown(struct mbuf *m, int off, int len, int *offp) if ((off == 0 || offp) && len <= n->m_len - off) goto ok; -#if defined(PULLDOWN_STAT) && defined(INET6) +#if defined(PULLDOWN_STAT) && INET6 ip6stat.ip6s_pulldown_copy++; #endif @@ -321,7 +322,7 @@ m_pulldown(struct mbuf *m, int off, int len, int *offp) * now, we need to do the hard way. don't m_copy as there's no room * on both end. 
*/ -#if defined(PULLDOWN_STAT) && defined(INET6) +#if defined(PULLDOWN_STAT) && INET6 ip6stat.ip6s_pulldown_alloc++; #endif MGET(o, M_DONTWAIT, m->m_type); @@ -365,6 +366,67 @@ m_pulldown(struct mbuf *m, int off, int len, int *offp) return n; } +/* + * Create and return an m_tag, either by re-using space in a previous tag + * or by allocating a new mbuf/cluster + */ +struct m_tag * +m_tag_create(u_int32_t id, u_int16_t type, int len, int wait, struct mbuf *buf) +{ + struct m_tag *t = NULL; + struct m_tag *p; + + if (len < 0) + return (NULL); + + if (len + sizeof (struct m_tag) + sizeof (struct m_taghdr) > MLEN) + return (m_tag_alloc(id, type, len, wait)); + + /* + * We've exhausted all external cases. Now, go through the m_tag + * chain and see if we can fit it in any of them. + * If not (t == NULL), call m_tag_alloc to store it in a new mbuf. + */ + p = SLIST_FIRST(&buf->m_pkthdr.tags); + while(p != NULL) { + /* 2KCL m_tag */ + if (M_TAG_ALIGN(p->m_tag_len) + + sizeof (struct m_taghdr) > MLEN) { + p = SLIST_NEXT(p, m_tag_link); + continue; + } + + VERIFY(p->m_tag_cookie == M_TAG_VALID_PATTERN); + + struct mbuf *m = m_dtom(p); + struct m_taghdr *hdr = (struct m_taghdr *)m->m_data; + + VERIFY(m->m_flags & M_TAGHDR && !(m->m_flags & M_EXT)); + + /* The mbuf can store this m_tag */ + if (M_TAG_ALIGN(len) <= MLEN - m->m_len) { + t = (struct m_tag *)(m->m_data + m->m_len); + hdr->refcnt++; + m->m_len += M_TAG_ALIGN(len); + VERIFY(m->m_len <= MLEN); + break; + } + + p = SLIST_NEXT(p, m_tag_link); + } + + if (t == NULL) + return (m_tag_alloc(id, type, len, wait)); + + t->m_tag_cookie = M_TAG_VALID_PATTERN; + t->m_tag_type = type; + t->m_tag_len = len; + t->m_tag_id = id; + if (len > 0) + bzero(t + 1, len); + return (t); +} + /* Get a packet tag structure along with specified data following. 
*/ struct m_tag * m_tag_alloc(u_int32_t id, u_int16_t type, int len, int wait) @@ -372,26 +434,39 @@ m_tag_alloc(u_int32_t id, u_int16_t type, int len, int wait) struct m_tag *t; if (len < 0) - return NULL; -#if CONFIG_MBUF_TAGS_MALLOC - t = _MALLOC(len + sizeof (struct m_tag), M_TEMP, wait); -#else - if (len + sizeof(struct m_tag) <= MLEN) { + return (NULL); + + if (M_TAG_ALIGN(len) + sizeof (struct m_taghdr) <= MLEN) { struct mbuf *m = m_get(wait, MT_TAG); + struct m_taghdr *hdr; + if (m == NULL) - return NULL; - t = mtod(m, struct m_tag *); - } else if (len + sizeof(struct m_tag) <= MCLBYTES) { - t = (struct m_tag *) m_mclalloc(wait); - } else + return (NULL); + + m->m_flags |= M_TAGHDR; + + hdr = (struct m_taghdr *)m->m_data; + hdr->refcnt = 1; + m->m_len += sizeof (struct m_taghdr); + t = (struct m_tag *)(m->m_data + m->m_len); + m->m_len += M_TAG_ALIGN(len); + VERIFY(m->m_len <= MLEN); + } else if (len + sizeof (struct m_tag) <= MCLBYTES) { + t = (struct m_tag *)m_mclalloc(wait); + } else { t = NULL; -#endif + } + if (t == NULL) - return NULL; + return (NULL); + + t->m_tag_cookie = M_TAG_VALID_PATTERN; t->m_tag_type = type; t->m_tag_len = len; t->m_tag_id = id; - return t; + if (len > 0) + bzero(t + 1, len); + return (t); } @@ -405,25 +480,44 @@ m_tag_free(struct m_tag *t) t->m_tag_type == KERNEL_TAG_TYPE_MACLABEL) mac_mbuf_tag_destroy(t); #endif -#if CONFIG_MBUF_TAGS_MALLOC - _FREE(t, M_TEMP); -#else +#if INET6 + if (t != NULL && + t->m_tag_id == KERNEL_MODULE_TAG_ID && + t->m_tag_type == KERNEL_TAG_TYPE_INET6 && + t->m_tag_len == sizeof (struct ip6aux)) + ip6_destroyaux((struct ip6aux *)(t + 1)); +#endif /* INET6 */ if (t == NULL) return; - if (t->m_tag_len + sizeof(struct m_tag) <= MLEN) { + if (M_TAG_ALIGN(t->m_tag_len) + sizeof (struct m_taghdr) <= MLEN) { struct mbuf * m = m_dtom(t); - m_free(m); + VERIFY(m->m_flags & M_TAGHDR); + struct m_taghdr *hdr = (struct m_taghdr *)m->m_data; + + /* No other tags in this mbuf */ + if(--hdr->refcnt == 0) { + 
m_free(m); + return; + } + + /* Pattern-fill the header */ + u_int64_t *fill_ptr = (u_int64_t *)t; + u_int64_t *end_ptr = (u_int64_t *)(t + 1); + while (fill_ptr < end_ptr) { + *fill_ptr = M_TAG_FREE_PATTERN; + fill_ptr++; + } } else { - MCLFREE((caddr_t)t); + m_mclfree((caddr_t)t); } -#endif } /* Prepend a packet tag. */ void m_tag_prepend(struct mbuf *m, struct m_tag *t) { - KASSERT(m && t, ("m_tag_prepend: null argument, m %p t %p", m, t)); + VERIFY(m != NULL && t != NULL); + SLIST_INSERT_HEAD(&m->m_pkthdr.tags, t, m_tag_link); } @@ -431,7 +525,9 @@ m_tag_prepend(struct mbuf *m, struct m_tag *t) void m_tag_unlink(struct mbuf *m, struct m_tag *t) { - KASSERT(m && t, ("m_tag_unlink: null argument, m %p t %p", m, t)); + VERIFY(m != NULL && t != NULL); + VERIFY(t->m_tag_cookie == M_TAG_VALID_PATTERN); + SLIST_REMOVE(&m->m_pkthdr.tags, t, m_tag, m_tag_link); } @@ -439,7 +535,8 @@ m_tag_unlink(struct mbuf *m, struct m_tag *t) void m_tag_delete(struct mbuf *m, struct m_tag *t) { - KASSERT(m && t, ("m_tag_delete: null argument, m %p t %p", m, t)); + VERIFY(m != NULL && t != NULL); + m_tag_unlink(m, t); m_tag_free(t); } @@ -450,15 +547,21 @@ m_tag_delete_chain(struct mbuf *m, struct m_tag *t) { struct m_tag *p, *q; - KASSERT(m, ("m_tag_delete_chain: null mbuf")); - if (t != NULL) + VERIFY(m != NULL); + + if (t != NULL) { p = t; - else + } else { p = SLIST_FIRST(&m->m_pkthdr.tags); + } if (p == NULL) return; - while ((q = SLIST_NEXT(p, m_tag_link)) != NULL) + + VERIFY(p->m_tag_cookie == M_TAG_VALID_PATTERN); + while ((q = SLIST_NEXT(p, m_tag_link)) != NULL) { + VERIFY(q->m_tag_cookie == M_TAG_VALID_PATTERN); m_tag_delete(m, q); + } m_tag_delete(m, p); } @@ -468,17 +571,21 @@ m_tag_locate(struct mbuf *m, u_int32_t id, u_int16_t type, struct m_tag *t) { struct m_tag *p; - KASSERT(m, ("m_tag_find: null mbuf")); - if (t == NULL) + VERIFY(m != NULL); + + if (t == NULL) { p = SLIST_FIRST(&m->m_pkthdr.tags); - else + } else { + VERIFY(t->m_tag_cookie == M_TAG_VALID_PATTERN); p = 
SLIST_NEXT(t, m_tag_link); + } while (p != NULL) { + VERIFY(p->m_tag_cookie == M_TAG_VALID_PATTERN); if (p->m_tag_id == id && p->m_tag_type == type) - return p; + return (p); p = SLIST_NEXT(p, m_tag_link); } - return NULL; + return (NULL); } /* Copy a single tag. */ @@ -487,7 +594,8 @@ m_tag_copy(struct m_tag *t, int how) { struct m_tag *p; - KASSERT(t, ("m_tag_copy: null tag")); + VERIFY(t != NULL); + p = m_tag_alloc(t->m_tag_id, t->m_tag_type, t->m_tag_len, how); if (p == NULL) return (NULL); @@ -507,8 +615,16 @@ m_tag_copy(struct m_tag *t, int how) mac_mbuf_tag_copy(t, p); } else #endif +#if INET6 + if (t != NULL && + t->m_tag_id == KERNEL_MODULE_TAG_ID && + t->m_tag_type == KERNEL_TAG_TYPE_INET6 && + t->m_tag_len == sizeof (struct ip6aux)) { + ip6_copyaux((struct ip6aux *)(t + 1), (struct ip6aux *)(p + 1)); + } else +#endif /* INET6 */ bcopy(t + 1, p + 1, t->m_tag_len); /* Copy the data */ - return p; + return (p); } /* @@ -522,29 +638,32 @@ m_tag_copy_chain(struct mbuf *to, struct mbuf *from, int how) { struct m_tag *p, *t, *tprev = NULL; - KASSERT(to && from, - ("m_tag_copy: null argument, to %p from %p", to, from)); + VERIFY(to != NULL && from != NULL); + m_tag_delete_chain(to, NULL); SLIST_FOREACH(p, &from->m_pkthdr.tags, m_tag_link) { + VERIFY(p->m_tag_cookie == M_TAG_VALID_PATTERN); t = m_tag_copy(p, how); if (t == NULL) { m_tag_delete_chain(to, NULL); - return 0; + return (0); } - if (tprev == NULL) + if (tprev == NULL) { SLIST_INSERT_HEAD(&to->m_pkthdr.tags, t, m_tag_link); - else { + } else { SLIST_INSERT_AFTER(tprev, t, m_tag_link); tprev = t; } } - return 1; + return (1); } /* Initialize tags on an mbuf. 
*/ void m_tag_init(struct mbuf *m) { + VERIFY(m != NULL); + SLIST_INIT(&m->m_pkthdr.tags); #if PF_PKTHDR bzero(&m->m_pkthdr.pf_mtag, sizeof (m->m_pkthdr.pf_mtag)); @@ -555,34 +674,25 @@ m_tag_init(struct mbuf *m) struct m_tag * m_tag_first(struct mbuf *m) { - return SLIST_FIRST(&m->m_pkthdr.tags); + VERIFY(m != NULL); + + return (SLIST_FIRST(&m->m_pkthdr.tags)); } /* Get next tag in chain. */ struct m_tag * -m_tag_next(__unused struct mbuf *m, struct m_tag *t) +m_tag_next(struct mbuf *m, struct m_tag *t) { - return SLIST_NEXT(t, m_tag_link); -} - -void -m_prio_init(struct mbuf *m) -{ -#if !PKT_PRIORITY #pragma unused(m) -#else /* PKT_PRIORITY */ - if (m->m_flags & M_PKTHDR) - m->m_pkthdr.prio = MBUF_PRIORITY_NORMAL; -#endif /* PKT_PRIORITY */ + VERIFY(t != NULL); + VERIFY(t->m_tag_cookie == M_TAG_VALID_PATTERN); + + return (SLIST_NEXT(t, m_tag_link)); } void -m_prio_background(struct mbuf *m) +m_prio_init(struct mbuf *m) { -#if !PKT_PRIORITY -#pragma unused(m) -#else /* PKT_PRIORITY */ if (m->m_flags & M_PKTHDR) - m->m_pkthdr.prio = MBUF_PRIORITY_BACKGROUND; -#endif /* PKT_PRIORITY */ + m->m_pkthdr.prio = MBUF_TC_BE; } diff --git a/bsd/kern/uipc_socket.c b/bsd/kern/uipc_socket.c index 4b2c8a79b..b496895f6 100644 --- a/bsd/kern/uipc_socket.c +++ b/bsd/kern/uipc_socket.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2008 Apple Inc. All rights reserved. + * Copyright (c) 1998-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -96,18 +96,25 @@ #include #include #include +#include +#include #include #include #include #include #include #include +#include + +#include #if CONFIG_MACF #include #include #endif /* MAC */ +extern int in6_init_done; + int so_cache_hw = 0; int so_cache_timeouts = 0; int so_cache_max_freed = 0; @@ -170,15 +177,15 @@ MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); SYSCTL_DECL(_kern_ipc); int somaxconn = SOMAXCONN; -SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, ""); /* Should we get a maximum also ??? */ static int sosendmaxchain = 65536; static int sosendminchain = 16384; static int sorecvmincopy = 16384; -SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain, CTLFLAG_RW, &sosendminchain, +SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain, CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, ""); -SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy, CTLFLAG_RW, &sorecvmincopy, +SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy, CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, ""); /* @@ -186,7 +193,7 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy, CTLFLAG_RW, &sorecvmincopy, * the socket is marked with SOF_MULTIPAGES; see below. */ int sosendjcl = 1; -SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl, CTLFLAG_RW, &sosendjcl, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl, CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, ""); /* * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large @@ -200,9 +207,13 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl, CTLFLAG_RW, &sosendjcl, 0, ""); * capable. Set this to 1 only for testing/debugging purposes. 
*/ int sosendjcl_ignore_capab = 0; -SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab, CTLFLAG_RW, +SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab, CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, ""); +int sodefunctlog = 0; +SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED, + &sodefunctlog, 0, ""); + /* * Socket operation routines. * These routines are called by the routines in @@ -223,7 +234,9 @@ extern struct protosw *pffindprotonotype(int, int); extern int soclose_locked(struct socket *); extern int soo_kqfilter(struct fileproc *, struct knote *, struct proc *); +#if CONFIG_EMBEDDED extern int uthread_get_background_state(uthread_t); +#endif /*CONFIG_EMBEDDED */ #ifdef __APPLE__ @@ -237,6 +250,9 @@ static void so_cache_timer(void *); void soclose_wait_locked(struct socket *so); int so_isdstlocal(struct socket *so); +__private_extern__ u_int32_t sotcdb = 0; +SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED, + &sotcdb, 0, ""); void socketinit(void) @@ -275,6 +291,7 @@ socketinit(void) get_inpcb_str_size() + 4 + get_tcp_str_size()); so_cache_zone = zinit(str_size, 120000*str_size, 8192, "socache zone"); + zone_change(so_cache_zone, Z_CALLERACCT, FALSE); zone_change(so_cache_zone, Z_NOENCRYPT, TRUE); #if TEMPDEBUG printf("cached_sock_alloc -- so_cache_zone size is %x\n", str_size); @@ -284,6 +301,10 @@ socketinit(void) so_cache_zone_element_size = str_size; sflt_init(); + + VERIFY(SO_TC_MAX == SO_TC_STATS_MAX); + + socket_tclass_init(); } static void @@ -398,6 +419,21 @@ cached_sock_free(struct socket *so) #endif } +static void +so_update_last_owner_locked( + struct socket *so, + proc_t self) +{ + if (self == NULL) + self = current_proc(); + + if (self) + { + so->last_upid = proc_uniqueid(self); + so->last_pid = proc_pid(self); + } +} + static void so_cache_timer(__unused void *dummy) { @@ -464,6 +500,7 @@ soalloc(int waitok, int dom, int type) return (NULL); } #endif /* MAC_SOCKET */ + 
so_update_last_owner_locked(so, NULL); } return (so); @@ -488,8 +525,10 @@ socreate(int dom, struct socket **aso, int type, int proto) register struct protosw *prp; register struct socket *so; register int error = 0; +#if CONFIG_EMBEDDED thread_t thread; struct uthread *ut; +#endif /* CONFIG_EMBEDDED */ #if TCPDEBUG extern int tcpconsdebug; @@ -521,6 +560,7 @@ socreate(int dom, struct socket **aso, int type, int proto) so->so_type = type; so->so_uid = kauth_cred_getuid(kauth_cred_get()); + so->so_gid = kauth_cred_getgid(kauth_cred_get()); if (!suser(kauth_cred_get(), NULL)) so->so_state = SS_PRIV; @@ -566,22 +606,42 @@ socreate(int dom, struct socket **aso, int type, int proto) so->so_options |= SO_DEBUG; #endif #endif + so_set_default_traffic_class(so); /* * If this is a background thread/task, mark the socket as such. */ +#if !CONFIG_EMBEDDED + if (proc_get_self_isbackground() != 0) +#else /* !CONFIG_EMBEDDED */ thread = current_thread(); ut = get_bsdthread_info(thread); - if (uthread_get_background_state(ut)) { + if (uthread_get_background_state(ut)) +#endif /* !CONFIG_EMBEDDED */ + { socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND); - so->so_background_thread = thread; - /* - * In case setpriority(PRIO_DARWIN_THREAD) was called - * on this thread, regulate network (TCP) traffics. - */ - if (ut->uu_flag & UT_BACKGROUND_TRAFFIC_MGT) { - socket_set_traffic_mgt_flags(so, - TRAFFIC_MGT_SO_BG_REGULATE); - } + so->so_background_thread = current_thread(); + } + + switch (dom) { + /* + * Don't mark Unix domain sockets as eligible for defunct by default. + */ + case PF_LOCAL: + so->so_flags |= SOF_NODEFUNCT; + break; + /* + * Radar 9119053 + * Since v6 initialization is asynchronous and we can't hold + * up the main boot path, we need to at least hold off any + * sockets attempting to be created until the v6 stack is + * up and ready. 
+ */ + case PF_INET6: + if (in6_init_done == 0) + ip6_fin(); + break; + default: + break; } *aso = so; @@ -615,40 +675,25 @@ sobind(struct socket *so, struct sockaddr *nam) { struct proc *p = current_proc(); int error = 0; - struct socket_filter_entry *filter; - int filtered = 0; socket_lock(so, 1); + + so_update_last_owner_locked(so, p); /* - * If this is a bind request on a previously-accepted socket - * that has been marked as inactive, reject it now before - * we go any further. + * If this is a bind request on a socket that has been marked + * as inactive, reject it now before we go any further. */ if (so->so_flags & SOF_DEFUNCT) { error = EINVAL; + SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n", + __func__, proc_pid(p), so, INP_SOCKAF(so), INP_SOCKTYPE(so), + error)); goto out; } /* Socket filter */ - error = 0; - for (filter = so->so_filt; filter && (error == 0); - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_bind) { - if (filtered == 0) { - filtered = 1; - sflt_use(so); - socket_unlock(so, 0); - } - error = filter->sfe_filter->sf_filter. 
- sf_bind(filter->sfe_cookie, so, nam); - } - } - if (filtered != 0) { - socket_lock(so, 0); - sflt_unuse(so); - } - /* End socket filter */ + error = sflt_bind(so, nam); if (error == 0) error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p); @@ -664,6 +709,9 @@ sobind(struct socket *so, struct sockaddr *nam) void sodealloc(struct socket *so) { + /* Remove any filters */ + sflt_termsock(so); + so->so_gencnt = ++so_gencnt; #if CONFIG_MACF_SOCKET @@ -703,10 +751,11 @@ solisten(struct socket *so, int backlog) { struct proc *p = current_proc(); int error = 0; - struct socket_filter_entry *filter; - int filtered = 0; socket_lock(so, 1); + + so_update_last_owner_locked(so, p); + if (so->so_proto == NULL) { error = EINVAL; goto out; @@ -718,13 +767,18 @@ solisten(struct socket *so, int backlog) /* * If the listen request is made on a socket that is not fully - * disconnected, or on a previously-accepted socket that has - * been marked as inactive, reject the request now. + * disconnected, or on a socket that has been marked as inactive, + * reject the request now. */ if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) || (so->so_flags & SOF_DEFUNCT)) { error = EINVAL; + if (so->so_flags & SOF_DEFUNCT) { + SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n", + __func__, proc_pid(p), so, INP_SOCKAF(so), + INP_SOCKTYPE(so), error)); + } goto out; } @@ -733,23 +787,7 @@ solisten(struct socket *so, int backlog) goto out; } - error = 0; - for (filter = so->so_filt; filter && (error == 0); - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_listen) { - if (filtered == 0) { - filtered = 1; - sflt_use(so); - socket_unlock(so, 0); - } - error = filter->sfe_filter->sf_filter. 
- sf_listen(filter->sfe_cookie, so); - } - } - if (filtered != 0) { - socket_lock(so, 0); - sflt_unuse(so); - } + error = sflt_listen(so); if (error == 0) { error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p); @@ -793,9 +831,6 @@ sofreelastref(struct socket *so, int dealloc) /* Assume socket is locked */ - /* Remove any filters - may be called more than once */ - sflt_termsock(so); - if ((!(so->so_flags & SOF_PCBCLEARING)) || ((so->so_state & SS_NOFDREF) == 0)) { #ifdef __APPLE__ @@ -1104,8 +1139,7 @@ int soacceptfilter(struct socket *so) { struct sockaddr *local = NULL, *remote = NULL; - struct socket_filter_entry *filter; - int error = 0, filtered = 0; + int error = 0; struct socket *head = so->so_head; /* @@ -1126,29 +1160,7 @@ soacceptfilter(struct socket *so) goto done; } - /* - * At this point, we have a reference on the listening socket - * so we know it won't be going away. Do the same for the newly - * accepted socket while we invoke the accept callback routine. - */ - for (filter = so->so_filt; filter != NULL && error == 0; - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_accept != NULL) { - if (!filtered) { - filtered = 1; - sflt_use(so); - socket_unlock(so, 0); - } - error = filter->sfe_filter->sf_filter. 
- sf_accept(filter->sfe_cookie, - head, so, local, remote); - } - } - - if (filtered) { - socket_lock(so, 0); - sflt_unuse(so); - } + error = sflt_accept(head, so, local, remote); /* * If we get EJUSTRETURN from one of the filters, mark this socket @@ -1157,10 +1169,8 @@ soacceptfilter(struct socket *so) */ if (error == EJUSTRETURN) { error = 0; - so->so_flags |= SOF_DEFUNCT; - /* Prevent data from being appended to the socket buffers */ - so->so_snd.sb_flags |= SB_DROP; - so->so_rcv.sb_flags |= SB_DROP; + (void) sosetdefunct(current_proc(), so, + SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE); } if (error != 0) { @@ -1207,14 +1217,22 @@ soconnectlock(struct socket *so, struct sockaddr *nam, int dolock) if (dolock) socket_lock(so, 1); + so_update_last_owner_locked(so, p); + /* * If this is a listening socket or if this is a previously-accepted * socket that has been marked as inactive, reject the connect request. */ if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) { + error = EOPNOTSUPP; + if (so->so_flags & SOF_DEFUNCT) { + SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n", + __func__, proc_pid(p), so, INP_SOCKAF(so), + INP_SOCKTYPE(so), error)); + } if (dolock) socket_unlock(so, 1); - return (EOPNOTSUPP); + return (error); } if ((so->so_restrictions & SO_RESTRICT_DENYOUT) != 0) { @@ -1238,36 +1256,14 @@ soconnectlock(struct socket *so, struct sockaddr *nam, int dolock) * Run connect filter before calling protocol: * - non-blocking connect returns before completion; */ - struct socket_filter_entry *filter; - int filtered = 0; - - error = 0; - for (filter = so->so_filt; filter && (error == 0); - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_connect_out) { - if (filtered == 0) { - filtered = 1; - sflt_use(so); - socket_unlock(so, 0); - } - error = filter->sfe_filter->sf_filter. 
- sf_connect_out(filter->sfe_cookie, so, nam); - } - } - if (filtered != 0) { - socket_lock(so, 0); - sflt_unuse(so); - } + error = sflt_connectout(so, nam); if (error) { if (error == EJUSTRETURN) error = 0; - if (dolock) - socket_unlock(so, 1); - return (error); + } else { + error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p); } - - error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p); } if (dolock) socket_unlock(so, 1); @@ -1377,6 +1373,8 @@ sosendcheck(struct socket *so, struct sockaddr *addr, int32_t resid, int32_t cle } else { error = sblock(&so->so_snd, SBLOCKWAIT(flags)); if (error) { + if (so->so_flags & SOF_DEFUNCT) + goto defunct; return (error); } *sblocked = 1; @@ -1384,12 +1382,17 @@ sosendcheck(struct socket *so, struct sockaddr *addr, int32_t resid, int32_t cle } /* - * If a send attempt is made on a previously-accepted socket - * that has been marked as inactive (disconnected), reject - * the request. + * If a send attempt is made on a socket that has been marked + * as inactive (disconnected), reject the request. 
*/ - if (so->so_flags & SOF_DEFUNCT) - return (ENOTCONN); + if (so->so_flags & SOF_DEFUNCT) { +defunct: + error = EPIPE; + SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n", __func__, + proc_selfpid(), so, INP_SOCKAF(so), INP_SOCKTYPE(so), + error)); + return (error); + } if (so->so_state & SS_CANTSENDMORE) return (EPIPE); @@ -1423,8 +1426,11 @@ sosendcheck(struct socket *so, struct sockaddr *addr, int32_t resid, int32_t cle return (EWOULDBLOCK); } sbunlock(&so->so_snd, 1); + *sblocked = 0; error = sbwait(&so->so_snd); if (error) { + if (so->so_flags & SOF_DEFUNCT) + goto defunct; return (error); } goto restart; @@ -1515,6 +1521,8 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat); socket_lock(so, 1); + so_update_last_owner_locked(so, p); + if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) { error = EOPNOTSUPP; socket_unlock(so, 1); @@ -1555,10 +1563,6 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, 1024 : 0); do { - struct socket_filter_entry *filter; - int filtered; - boolean_t recursive; - if (uio == NULL) { /* * Data is prepackaged in "top". @@ -1611,7 +1615,8 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, * haven't yet consumed. 
*/ if (freelist == NULL && - bytes_to_copy > NBPG && jumbocl) { + bytes_to_copy > MBIGCLBYTES && + jumbocl) { num_needed = bytes_to_copy / M16KCLBYTES; @@ -1634,10 +1639,10 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, if (freelist == NULL && bytes_to_copy > MCLBYTES) { num_needed = - bytes_to_copy / NBPG; + bytes_to_copy / MBIGCLBYTES; if ((bytes_to_copy - - (num_needed * NBPG)) >= + (num_needed * MBIGCLBYTES)) >= MINCLSIZE) num_needed++; @@ -1645,7 +1650,7 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, m_getpackets_internal( (unsigned int *)&num_needed, hdrs_needed, M_WAIT, 0, - NBPG); + MBIGCLBYTES); /* * Fall back to cluster size * if allocation failed @@ -1783,65 +1788,24 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, /* * Socket filter processing */ - recursive = (so->so_send_filt_thread != NULL); - filtered = 0; - error = 0; - for (filter = so->so_filt; filter && (error == 0); - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_data_out) { - int so_flags = 0; - if (filtered == 0) { - filtered = 1; - so->so_send_filt_thread = - current_thread(); - sflt_use(so); - socket_unlock(so, 0); - so_flags = - (sendflags & MSG_OOB) ? - sock_data_filt_flag_oob : 0; - } - error = filter->sfe_filter->sf_filter. - sf_data_out(filter->sfe_cookie, so, - addr, &top, &control, so_flags); + error = sflt_data_out(so, addr, &top, &control, + (sendflags & MSG_OOB) ? sock_data_filt_flag_oob : 0); + if (error) { + if (error == EJUSTRETURN) { + error = 0; + clen = 0; + control = 0; + top = 0; } - } - if (filtered) { - /* - * At this point, we've run at least one - * filter. The socket is unlocked as is - * the socket buffer. Clear the recorded - * filter thread only when we are outside - * of a filter's context. This allows for - * a filter to issue multiple inject calls - * from its sf_data_out callback routine. 
- */ - socket_lock(so, 0); - sflt_unuse(so); - if (!recursive) - so->so_send_filt_thread = 0; - if (error) { - if (error == EJUSTRETURN) { - error = 0; - clen = 0; - control = 0; - top = 0; - } - - goto release; - } + goto release; } /* * End Socket filter processing */ - if (error == EJUSTRETURN) { - /* A socket filter handled this data */ - error = 0; - } else { - error = (*so->so_proto->pr_usrreqs->pru_send) - (so, sendflags, top, addr, control, p); - } + error = (*so->so_proto->pr_usrreqs->pru_send) + (so, sendflags, top, addr, control, p); #ifdef __APPLE__ if (flags & MSG_SEND) so->so_temp = NULL; @@ -1935,6 +1899,7 @@ soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat); socket_lock(so, 1); + so_update_last_owner_locked(so, p); #ifdef MORE_LOCKING_DEBUG if (so->so_usecount == 1) @@ -1958,14 +1923,18 @@ soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, if (so->so_flags & SOF_DEFUNCT) { struct sockbuf *sb = &so->so_rcv; + error = ENOTCONN; + SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n", __func__, + proc_pid(p), so, INP_SOCKAF(so), INP_SOCKTYPE(so), error)); /* * This socket should have been disconnected and flushed - * prior to being returned from accept; there should be - * no data on its receive list, so panic otherwise. + * prior to being returned from sodefunct(); there should + * be no data on its receive list, so panic otherwise. */ - sb_empty_assert(sb, __func__); + if (so->so_state & SS_DEFUNCT) + sb_empty_assert(sb, __func__); socket_unlock(so, 1); - return (ENOTCONN); + return (error); } /* @@ -2197,6 +2166,14 @@ soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, goto restart; } socket_lock(so, 0); + /* + * If the socket has been defunct'd, drop it. 
+ */ + if (so->so_flags & SOF_DEFUNCT) { + m_freem(m); + error = ENOTCONN; + goto release; + } /* * Re-adjust the socket receive list and re-enqueue * the record in front of any packets which may have @@ -2253,6 +2230,7 @@ soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; struct sockbuf *sb_rcv = &so->so_rcv; + struct mbuf **msgpcm = NULL; /* * Externalizing the control messages would require us to @@ -2265,7 +2243,23 @@ soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, do { if (flags & MSG_PEEK) { if (controlp != NULL) { + if (*controlp == NULL) { + msgpcm = controlp; + } *controlp = m_copy(m, 0, m->m_len); + + /* If we failed to allocate an mbuf, + * release any previously allocated + * mbufs for control data. Return + * an error. Keep the mbufs in the + * socket as this is using + * MSG_PEEK flag. + */ + if (*controlp == NULL) { + m_freem(*msgpcm); + error = ENOBUFS; + goto release; + } controlp = &(*controlp)->m_next; } m = m->m_next; @@ -2499,8 +2493,25 @@ soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, if (flags & MSG_PEEK) { moff += len; } else { - if (mp) - *mp = m_copym(m, 0, len, M_WAIT); + if (mp != NULL) { + int copy_flag; + + if (flags & MSG_DONTWAIT) + copy_flag = M_DONTWAIT; + else + copy_flag = M_WAIT; + *mp = m_copym(m, 0, len, copy_flag); + if (*mp == NULL) { + /* + * Failed to allocate an mbuf. 
+ * Adjust uio_resid back, it was + * adjusted down by len bytes which + * we didn't copy over + */ + uio_setresid(uio, (uio_resid(uio) + len)); + break; + } + } m->m_data += len; m->m_len -= len; so->so_rcv.sb_cc -= len; @@ -2959,13 +2970,13 @@ sosetopt(struct socket *so, struct sockopt *sopt) int error, optval; struct linger l; struct timeval tv; - struct socket_filter_entry *filter; - int filtered = 0; #if CONFIG_MACF_SOCKET struct mac extmac; #endif /* MAC_SOCKET */ socket_lock(so, 1); + so_update_last_owner_locked(so, NULL); + if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) == (SS_CANTRCVMORE | SS_CANTSENDMORE) && (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) { @@ -2978,29 +2989,11 @@ sosetopt(struct socket *so, struct sockopt *sopt) sopt->sopt_dir = SOPT_SET; } - error = 0; - for (filter = so->so_filt; filter && (error == 0); - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_setoption) { - if (filtered == 0) { - filtered = 1; - sflt_use(so); - socket_unlock(so, 0); - } - error = filter->sfe_filter->sf_filter. 
- sf_setoption(filter->sfe_cookie, so, sopt); - } - } - - if (filtered != 0) { - socket_lock(so, 0); - sflt_unuse(so); - - if (error) { - if (error == EJUSTRETURN) - error = 0; - goto bad; - } + error = sflt_setsockopt(so, sopt); + if (error) { + if (error == EJUSTRETURN) + error = 0; + goto bad; } error = 0; @@ -3036,6 +3029,7 @@ sosetopt(struct socket *so, struct sockopt *sopt) case SO_REUSEPORT: case SO_OOBINLINE: case SO_TIMESTAMP: + case SO_TIMESTAMP_MONOTONIC: #ifdef __APPLE__ case SO_DONTTRUNC: case SO_WANTMORE: @@ -3126,8 +3120,7 @@ sosetopt(struct socket *so, struct sockopt *sopt) if (error) goto bad; - error = sflt_attach_private(so, NULL, - nke.nke_handle, 1); + error = sflt_attach_internal(so, nke.nke_handle); break; } @@ -3253,19 +3246,76 @@ sosetopt(struct socket *so, struct sockopt *sopt) break; } -#if PKT_PRIORITY case SO_TRAFFIC_CLASS: { error = sooptcopyin(sopt, &optval, sizeof (optval), sizeof (optval)); if (error) goto bad; - if (optval < SO_TC_BE || optval > SO_TC_VO) { - error = EINVAL; + error = so_set_traffic_class(so, optval); + if (error) goto bad; - } - so->so_traffic_class = optval; + break; } -#endif /* PKT_PRIORITY */ + + case SO_RECV_TRAFFIC_CLASS: { + error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval)); + if (error) + goto bad; + if (optval == 0) + so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS; + else + so->so_flags |= SOF_RECV_TRAFFIC_CLASS; + break; + } + + case SO_TRAFFIC_CLASS_DBG: { + struct so_tcdbg so_tcdbg; + + error = sooptcopyin(sopt, &so_tcdbg, sizeof (struct so_tcdbg), + sizeof (struct so_tcdbg)); + if (error) + goto bad; + error = so_set_tcdbg(so, &so_tcdbg); + if (error) + goto bad; + break; + } + + case SO_DEFUNCTOK: + error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval)); + if (error != 0 || (so->so_flags & SOF_DEFUNCT)) { + if (error == 0) + error = EBADF; + goto bad; + } + /* + * Any process can set SO_DEFUNCTOK (clear + * SOF_NODEFUNCT), but only root can clear + * SO_DEFUNCTOK (set 
SOF_NODEFUNCT). + */ + if (optval == 0 && + kauth_cred_issuser(kauth_cred_get()) == 0) { + error = EPERM; + goto bad; + } + if (optval) + so->so_flags &= ~SOF_NODEFUNCT; + else + so->so_flags |= SOF_NODEFUNCT; + + SODEFUNCTLOG(("%s[%d]: so %p [%d,%d] is now marked as " + "%seligible for defunct\n", __func__, + proc_selfpid(), so, INP_SOCKAF(so), + INP_SOCKTYPE(so), + (so->so_flags & SOF_NODEFUNCT) ? "not " : "")); + break; + + case SO_ISDEFUNCT: + /* This option is not settable */ + error = EINVAL; + break; default: error = ENOPROTOOPT; @@ -3355,8 +3405,6 @@ sogetopt(struct socket *so, struct sockopt *sopt) int error, optval; struct linger l; struct timeval tv; - struct socket_filter_entry *filter; - int filtered = 0; #if CONFIG_MACF_SOCKET struct mac extmac; #endif /* MAC_SOCKET */ @@ -3366,32 +3414,16 @@ sogetopt(struct socket *so, struct sockopt *sopt) } socket_lock(so, 1); + so_update_last_owner_locked(so, NULL); - error = 0; - for (filter = so->so_filt; filter && (error == 0); - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_getoption) { - if (filtered == 0) { - filtered = 1; - sflt_use(so); - socket_unlock(so, 0); - } - error = filter->sfe_filter->sf_filter. 
- sf_getoption(filter->sfe_cookie, so, sopt); - } - } - if (filtered != 0) { - socket_lock(so, 0); - sflt_unuse(so); - - if (error) { - if (error == EJUSTRETURN) - error = 0; - socket_unlock(so, 1); - return (error); - } + error = sflt_getsockopt(so, sopt); + if (error) { + if (error == EJUSTRETURN) + error = 0; + socket_unlock(so, 1); + return (error); } - + error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto && so->so_proto->pr_ctloutput) { @@ -3421,6 +3453,7 @@ sogetopt(struct socket *so, struct sockopt *sopt) case SO_BROADCAST: case SO_OOBINLINE: case SO_TIMESTAMP: + case SO_TIMESTAMP_MONOTONIC: #ifdef __APPLE__ case SO_DONTTRUNC: case SO_WANTMORE: @@ -3556,11 +3589,29 @@ sogetopt(struct socket *so, struct sockopt *sopt) error = sooptcopyout(sopt, &sonpx, sizeof(struct so_np_extensions)); break; } -#if PKT_PRIORITY + case SO_TRAFFIC_CLASS: optval = so->so_traffic_class; goto integer; -#endif /* PKT_PRIORITY */ + + case SO_RECV_TRAFFIC_CLASS: + optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS); + goto integer; + + case SO_TRAFFIC_CLASS_STATS: + error = sooptcopyout(sopt, &so->so_tc_stats, sizeof(so->so_tc_stats)); + + case SO_TRAFFIC_CLASS_DBG: + error = sogetopt_tcdbg(so, sopt); + break; + + case SO_DEFUNCTOK: + optval = !(so->so_flags & SOF_NODEFUNCT); + goto integer; + + case SO_ISDEFUNCT: + optval = (so->so_flags & SOF_DEFUNCT); + goto integer; default: error = ENOPROTOOPT; @@ -3570,8 +3621,10 @@ sogetopt(struct socket *so, struct sockopt *sopt) return (error); } } - -/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */ +/* The size limits on our soopt_getm is different from that on FreeBSD. + * We limit the size of options to MCLBYTES. This will have to change + * if we need to define options that need more space than MCLBYTES. 
+ */ int soopt_getm(struct sockopt *sopt, struct mbuf **mp) { @@ -3579,7 +3632,7 @@ soopt_getm(struct sockopt *sopt, struct mbuf **mp) int sopt_size = sopt->sopt_valsize; int how; - if (sopt_size > MAX_SOOPTGETM_SIZE) + if (sopt_size <= 0 || sopt_size > MCLBYTES) return (EMSGSIZE); how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT; @@ -3600,7 +3653,7 @@ soopt_getm(struct sockopt *sopt, struct mbuf **mp) *mp = m; m_prev = m; - while (sopt_size) { + while (sopt_size > 0) { MGET(m, how, MT_DATA); if (m == 0) { m_freem(*mp); @@ -3610,6 +3663,7 @@ soopt_getm(struct sockopt *sopt, struct mbuf **mp) MCLGET(m, how); if ((m->m_flags & M_EXT) == 0) { m_freem(*mp); + m_freem(m); return (ENOBUFS); } m->m_len = min(MCLBYTES, sopt_size); @@ -3623,7 +3677,7 @@ soopt_getm(struct sockopt *sopt, struct mbuf **mp) return (0); } -/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */ +/* copyin sopt data into mbuf chain */ int soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) { @@ -3654,7 +3708,7 @@ soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) return (0); } -/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */ +/* copyout mbuf chain data into soopt */ int soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) { @@ -3709,6 +3763,7 @@ sopoll(struct socket *so, int events, __unused kauth_cred_t cred, void * wql) int revents = 0; socket_lock(so, 1); + so_update_last_owner_locked(so, p); if (events & (POLLIN | POLLRDNORM)) if (soreadable(so)) @@ -3863,12 +3918,19 @@ filt_soread(struct knote *kn, long hint) return (1); } + int64_t lowwat = so->so_rcv.sb_lowat; + if (kn->kn_sfflags & NOTE_LOWAT) + { + if (kn->kn_sdata > so->so_rcv.sb_hiwat) + lowwat = so->so_rcv.sb_hiwat; + else if (kn->kn_sdata > lowwat) + lowwat = kn->kn_sdata; + } + if ((hint & SO_FILT_HINT_LOCKED) == 0) socket_unlock(so, 1); - - return ((kn->kn_flags & EV_OOBAND) || - kn->kn_data >= ((kn->kn_sfflags & NOTE_LOWAT) ? 
- kn->kn_sdata : so->so_rcv.sb_lowat)); + + return ((kn->kn_flags & EV_OOBAND) || kn->kn_data >= lowwat); } static void @@ -3911,14 +3973,20 @@ filt_sowrite(struct knote *kn, long hint) socket_unlock(so, 1); return (0); } + int64_t lowwat = so->so_snd.sb_lowat; + if (kn->kn_sfflags & NOTE_LOWAT) + { + if (kn->kn_sdata > so->so_snd.sb_hiwat) + lowwat = so->so_snd.sb_hiwat; + else if (kn->kn_sdata > lowwat) + lowwat = kn->kn_sdata; + } if ((hint & SO_FILT_HINT_LOCKED) == 0) socket_unlock(so, 1); - if (kn->kn_sfflags & NOTE_LOWAT) - return (kn->kn_data >= kn->kn_sdata); - return (kn->kn_data >= so->so_snd.sb_lowat); + return (kn->kn_data >= lowwat); } -#define SO_LOCK_HISTORY_STR_LEN (2 * SO_LCKDBG_MAX * (2 + sizeof(void *) + 1) + 1) +#define SO_LOCK_HISTORY_STR_LEN (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof(void *)) + 1) + 1) __private_extern__ const char * solockhistory_nr(struct socket *so) { @@ -3926,6 +3994,7 @@ __private_extern__ const char * solockhistory_nr(struct socket *so) int i; static char lock_history_str[SO_LOCK_HISTORY_STR_LEN]; + bzero(lock_history_str, sizeof(lock_history_str)); for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) { n += snprintf(lock_history_str + n, SO_LOCK_HISTORY_STR_LEN - n, "%lx:%lx ", (uintptr_t) so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX], @@ -4054,3 +4123,107 @@ so_isdstlocal(struct socket *so) { } return 0; } + +int +sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce) +{ + int err = 0, defunct; + + defunct = (so->so_flags & SOF_DEFUNCT); + if (defunct) { + if (!(so->so_snd.sb_flags & so->so_rcv.sb_flags & SB_DROP)) + panic("%s: SB_DROP not set", __func__); + goto done; + } + + if (so->so_flags & SOF_NODEFUNCT) { + if (noforce) { + err = EOPNOTSUPP; + SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p " + "[%d,%d] is not eligible for defunct (%d)\n", + __func__, proc_selfpid(), proc_pid(p), level, so, + INP_SOCKAF(so), INP_SOCKTYPE(so), err)); + return (err); + } + so->so_flags &= ~SOF_NODEFUNCT; + 
SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p [%d,%d] " + "defunct by force\n", __func__, proc_selfpid(), proc_pid(p), + level, so, INP_SOCKAF(so), INP_SOCKTYPE(so))); + } + + so->so_flags |= SOF_DEFUNCT; + /* Prevent further data from being appended to the socket buffers */ + so->so_snd.sb_flags |= SB_DROP; + so->so_rcv.sb_flags |= SB_DROP; + +done: + SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p [%d,%d] %s " + "defunct\n", __func__, proc_selfpid(), proc_pid(p), level, so, + INP_SOCKAF(so), INP_SOCKTYPE(so), + defunct ? "is already" : "marked as")); + + return (err); +} + +int +sodefunct(struct proc *p, struct socket *so, int level) +{ + struct sockbuf *rcv, *snd; + + if (!(so->so_flags & SOF_DEFUNCT)) + panic("%s improperly called", __func__); + + if (so->so_state & SS_DEFUNCT) + goto done; + + rcv = &so->so_rcv; + snd = &so->so_snd; + + SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p [%d,%d] is now " + "defunct [rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", + __func__, proc_selfpid(), proc_pid(p), level, so, + INP_SOCKAF(so), INP_SOCKTYPE(so), + (uint32_t)rcv->sb_sel.si_flags, (uint32_t)snd->sb_sel.si_flags, + (uint16_t)rcv->sb_flags, (uint16_t)snd->sb_flags)); + + /* + * Unwedge threads blocked on sbwait() and sb_lock(). + */ + sbwakeup(rcv); + sbwakeup(snd); + + if (rcv->sb_flags & SB_LOCK) + sbunlock(rcv, 1); + if (snd->sb_flags & SB_LOCK) + sbunlock(snd, 1); + + /* + * Flush the buffers and disconnect. We explicitly call shutdown + * on both data directions to ensure that SS_CANT{RCV,SEND}MORE + * states are set for the socket. This would also flush out data + * hanging off the receive list of this socket. + */ + (void) soshutdownlock(so, SHUT_RD); + (void) soshutdownlock(so, SHUT_WR); + (void) sodisconnectlocked(so); + + /* + * Explicitly handle connectionless-protocol disconnection + * and release any remaining data in the socket buffers. 
+ */ + if (!(so->so_flags & SS_ISDISCONNECTED)) + (void) soisdisconnected(so); + + if (so->so_error == 0) + so->so_error = EBADF; + + if (rcv->sb_cc != 0) + sbrelease(rcv); + if (snd->sb_cc != 0) + sbrelease(snd); + + so->so_state |= SS_DEFUNCT; + +done: + return (0); +} diff --git a/bsd/kern/uipc_socket2.c b/bsd/kern/uipc_socket2.c index a6b2af000..4b71dd80c 100644 --- a/bsd/kern/uipc_socket2.c +++ b/bsd/kern/uipc_socket2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2010 Apple Inc. All rights reserved. + * Copyright (c) 1998-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -222,6 +222,20 @@ soisdisconnected(struct socket *so) sorwakeup(so); } +/* This function will issue a wakeup like soisdisconnected but it will not + * notify the socket filters. This will avoid unlocking the socket + * in the midst of closing it. + */ +void +sodisconnectwakeup(struct socket *so) +{ + so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); + so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED); + wakeup((caddr_t)&so->so_timeo); + sowwakeup(so); + sorwakeup(so); +} + /* * When an attempt at a new connection is noted on a socket * which accepts connections, sonewconn is called. 
If the @@ -276,7 +290,6 @@ sonewconn_internal(struct socket *head, int connstatus) return ((struct socket *)0); } - so->so_head = head; so->so_type = head->so_type; so->so_options = head->so_options &~ SO_ACCEPTCONN; so->so_linger = head->so_linger; @@ -285,13 +298,15 @@ sonewconn_internal(struct socket *head, int connstatus) so->so_timeo = head->so_timeo; so->so_pgid = head->so_pgid; so->so_uid = head->so_uid; + so->so_gid = head->so_gid; /* inherit socket options stored in so_flags */ so->so_flags = head->so_flags & (SOF_NOSIGPIPE | SOF_NOADDRAVAIL | SOF_REUSESHAREUID | SOF_NOTIFYCONFLICT | SOF_BINDRANDOMPORT | - SOF_NPX_SETOPTSHUT); + SOF_NPX_SETOPTSHUT | + SOF_NODEFUNCT); so->so_usecount = 1; so->next_lock_lr = 0; so->next_unlock_lr = 0; @@ -307,15 +322,11 @@ sonewconn_internal(struct socket *head, int connstatus) #endif /* inherit traffic management properties of listener */ - so->so_traffic_mgt_flags = head->so_traffic_mgt_flags & - (TRAFFIC_MGT_SO_BACKGROUND | TRAFFIC_MGT_SO_BG_REGULATE); + so->so_traffic_mgt_flags = head->so_traffic_mgt_flags & (TRAFFIC_MGT_SO_BACKGROUND); so->so_background_thread = head->so_background_thread; -#if PKT_PRIORITY so->so_traffic_class = head->so_traffic_class; -#endif /* PKT_PRIORITY */ if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { - sflt_termsock(so); sodealloc(so); return ((struct socket *)0); } @@ -328,17 +339,36 @@ sonewconn_internal(struct socket *head, int connstatus) socket_unlock(head, 0); if (((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL) != 0) || error) { - sflt_termsock(so); sodealloc(so); if (head->so_proto->pr_unlock) socket_lock(head, 0); return ((struct socket *)0); } - if (head->so_proto->pr_unlock) + if (head->so_proto->pr_unlock) { socket_lock(head, 0); + /* Radar 7385998 Recheck that the head is still accepting + * to avoid race condition when head is getting closed. 
+ */ + if ((head->so_options & SO_ACCEPTCONN) == 0) { + so->so_state &= ~SS_NOFDREF; + soclose(so); + return ((struct socket *)0); + } + } + #ifdef __APPLE__ so->so_proto->pr_domain->dom_refs++; #endif + /* Insert in head appropriate lists */ + so->so_head = head; + + /* Since this socket is going to be inserted into the incomp + * queue, it can be picked up by another thread in + * tcp_dropdropablreq to get dropped before it is setup.. + * To prevent this race, set in-progress flag which can be + * cleared later + */ + so->so_flags |= SOF_INCOMP_INPROGRESS; if (connstatus) { TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); @@ -367,27 +397,7 @@ sonewconn_internal(struct socket *head, int connstatus) struct socket * sonewconn(struct socket *head, int connstatus, const struct sockaddr *from) { - int error = 0; - struct socket_filter_entry *filter; - int filtered = 0; - - for (filter = head->so_filt; filter && (error == 0); - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_connect_in) { - if (filtered == 0) { - filtered = 1; - sflt_use(head); - socket_unlock(head, 0); - } - error = filter->sfe_filter->sf_filter. 
- sf_connect_in(filter->sfe_cookie, head, from); - } - } - if (filtered != 0) { - socket_lock(head, 0); - sflt_unuse(head); - } - + int error = sflt_connectin(head, from); if (error) { return (NULL); } @@ -443,6 +453,7 @@ sbwait(struct sockbuf *sb) mutex_held = (*so->so_proto->pr_getlock)(so, 0); else mutex_held = so->so_proto->pr_domain->dom_mtx; + lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED); sb->sb_flags |= SB_WAIT; @@ -458,8 +469,13 @@ sbwait(struct sockbuf *sb) if (so->so_usecount < 1) panic("sbwait: so=%p refcount=%d\n", so, so->so_usecount); - if ((so->so_state & SS_DRAINING)) { + if ((so->so_state & SS_DRAINING) || (so->so_flags & SOF_DEFUNCT)) { error = EBADF; + if (so->so_flags & SOF_DEFUNCT) { + SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n", + __func__, proc_selfpid(), so, INP_SOCKAF(so), + INP_SOCKTYPE(so), error)); + } } return (error); @@ -484,10 +500,13 @@ sb_lock(struct sockbuf *sb) while (sb->sb_flags & SB_LOCK) { sb->sb_flags |= SB_WANT; + if (so->so_proto->pr_getlock != NULL) mutex_held = (*so->so_proto->pr_getlock)(so, 0); else mutex_held = so->so_proto->pr_domain->dom_mtx; + lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED); + if (so->so_usecount < 1) panic("sb_lock: so=%p refcount=%d\n", so, so->so_usecount); @@ -498,6 +517,14 @@ sb_lock(struct sockbuf *sb) if (so->so_usecount < 1) panic("sb_lock: 2 so=%p refcount=%d\n", so, so->so_usecount); + + if (error == 0 && (so->so_flags & SOF_DEFUNCT)) { + error = EBADF; + SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n", + __func__, proc_selfpid(), so, INP_SOCKAF(so), + INP_SOCKTYPE(so), error)); + } + if (error) return (error); } @@ -505,6 +532,15 @@ sb_lock(struct sockbuf *sb) return (0); } +void +sbwakeup(struct sockbuf *sb) +{ + if (sb->sb_flags & SB_WAIT) { + sb->sb_flags &= ~SB_WAIT; + wakeup((caddr_t)&sb->sb_cc); + } +} + /* * Wakeup processes waiting on a socket buffer. 
* Do asynchronous notification via SIGIO @@ -513,12 +549,17 @@ sb_lock(struct sockbuf *sb) void sowakeup(struct socket *so, struct sockbuf *sb) { + if (so->so_flags & SOF_DEFUNCT) { + SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] si 0x%x, " + "fl 0x%x [%s]\n", __func__, proc_selfpid(), so, + INP_SOCKAF(so), INP_SOCKTYPE(so), + (uint32_t)sb->sb_sel.si_flags, (uint16_t)sb->sb_flags, + (sb->sb_flags & SB_RECV) ? "rcv" : "snd")); + } + sb->sb_flags &= ~SB_SEL; selwakeup(&sb->sb_sel); - if (sb->sb_flags & SB_WAIT) { - sb->sb_flags &= ~SB_WAIT; - wakeup((caddr_t)&sb->sb_cc); - } + sbwakeup(sb); if (so->so_state & SS_ASYNC) { if (so->so_pgid < 0) gsignal(-so->so_pgid, SIGIO); @@ -685,7 +726,7 @@ sbappend(struct sockbuf *sb, struct mbuf *m) return (sbappendrecord(sb, m)); if (sb->sb_flags & SB_RECV) { - int error = sflt_data_in(so, NULL, &m, NULL, 0, NULL); + int error = sflt_data_in(so, NULL, &m, NULL, 0); SBLASTRECORDCHK(sb, "sbappend 2"); if (error != 0) { if (error != EJUSTRETURN) @@ -724,7 +765,7 @@ sbappendstream(struct sockbuf *sb, struct mbuf *m) } if (sb->sb_flags & SB_RECV) { - int error = sflt_data_in(so, NULL, &m, NULL, 0, NULL); + int error = sflt_data_in(so, NULL, &m, NULL, 0); SBLASTRECORDCHK(sb, "sbappendstream 1"); if (error != 0) { if (error != EJUSTRETURN) @@ -844,7 +885,7 @@ sbappendrecord(struct sockbuf *sb, struct mbuf *m0) if (sb->sb_flags & SB_RECV) { int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL, - sock_data_filt_flag_record, NULL); + sock_data_filt_flag_record); if (error != 0) { SBLASTRECORDCHK(sb, "sbappendrecord 1"); if (error != EJUSTRETURN) @@ -895,7 +936,7 @@ sbinsertoob(struct sockbuf *sb, struct mbuf *m0) if ((sb->sb_flags & SB_RECV) != 0) { int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL, - sock_data_filt_flag_oob, NULL); + sock_data_filt_flag_oob); SBLASTRECORDCHK(sb, "sbinsertoob 2"); if (error) { @@ -1040,7 +1081,7 @@ sbappendaddr(struct sockbuf *sb, struct sockaddr *asa, struct mbuf *m0, /* Call socket data in filters */ if 
((sb->sb_flags & SB_RECV) != 0) { int error; - error = sflt_data_in(sb->sb_so, asa, &m0, &control, 0, NULL); + error = sflt_data_in(sb->sb_so, asa, &m0, &control, 0); SBLASTRECORDCHK(sb, __func__); if (error) { if (error != EJUSTRETURN) { @@ -1135,7 +1176,7 @@ sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control, if (sb->sb_flags & SB_RECV) { int error; - error = sflt_data_in(sb->sb_so, NULL, &m0, &control, 0, NULL); + error = sflt_data_in(sb->sb_so, NULL, &m0, &control, 0); SBLASTRECORDCHK(sb, __func__); if (error) { if (error != EJUSTRETURN) { @@ -1413,6 +1454,38 @@ sbcreatecontrol(caddr_t p, int size, int type, int level) return (m); } +struct mbuf** +sbcreatecontrol_mbuf(caddr_t p, int size, int type, int level, struct mbuf** mp) +{ + struct mbuf* m; + struct cmsghdr *cp; + + if (*mp == NULL){ + *mp = sbcreatecontrol(p, size, type, level); + return mp; + } + + if (CMSG_SPACE((u_int)size) + (*mp)->m_len > MLEN){ + mp = &(*mp)->m_next; + *mp = sbcreatecontrol(p, size, type, level); + return mp; + } + + m = *mp; + + cp = (struct cmsghdr *) (mtod(m, char *) + m->m_len); + m->m_len += CMSG_SPACE(size); + + /* XXX check size? */ + (void) memcpy(CMSG_DATA(cp), p, size); + cp->cmsg_len = CMSG_LEN(size); + cp->cmsg_level = level; + cp->cmsg_type = type; + + return mp; +} + + /* * Some routines that return EOPNOTSUPP for entry points that are not * supported by a protocol. Fill in as needed. 
@@ -1858,72 +1931,12 @@ soisbackground(struct socket *so) return (so->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BACKGROUND); } -#if PKT_PRIORITY -#define _MIN_NXT_CMSGHDR_PTR(cmsg) \ - ((char *)(cmsg) + \ - __DARWIN_ALIGN32((__uint32_t)(cmsg)->cmsg_len) + \ - __DARWIN_ALIGN32(sizeof(struct cmsghdr))) - -#define M_FIRST_CMSGHDR(m) \ - ((char *)(m) != (char *)0L && (size_t)(m)->m_len >= sizeof(struct cmsghdr) && \ - (socklen_t)(m)->m_len >= __DARWIN_ALIGN32(((struct cmsghdr *)(m)->m_data)->cmsg_len) ?\ - (struct cmsghdr *)(m)->m_data : \ - (struct cmsghdr *)0L) - -#define M_NXT_CMSGHDR(m, cmsg) \ - ((char *)(cmsg) == (char *)0L ? M_FIRST_CMSGHDR(m) : \ - _MIN_NXT_CMSGHDR_PTR(cmsg) > ((char *)(m)->m_data) + (m)->m_len || \ - _MIN_NXT_CMSGHDR_PTR(cmsg) < (char *)(m)->m_data ? \ - (struct cmsghdr *)0L /* NULL */ : \ - (struct cmsghdr *)((unsigned char *)(cmsg) + \ - __DARWIN_ALIGN32((__uint32_t)(cmsg)->cmsg_len))) -#endif /* PKT_PRIORITY */ - -__private_extern__ int -mbuf_traffic_class_from_control(struct mbuf *control) -{ -#if !PKT_PRIORITY -#pragma unused(control) - return MBUF_TC_NONE; -#else /* PKT_PRIORITY */ - struct cmsghdr *cm; - - for (cm = M_FIRST_CMSGHDR(control); cm; cm = M_NXT_CMSGHDR(control, cm)) { - int tc; - - if (cm->cmsg_len < sizeof(struct cmsghdr)) - break; - - if (cm->cmsg_level != SOL_SOCKET || cm->cmsg_type != SO_TRAFFIC_CLASS) - continue; - if (cm->cmsg_len != CMSG_LEN(sizeof(int))) - continue; - - tc = *(int *)CMSG_DATA(cm); - - switch (tc) { - case SO_TC_BE: - return MBUF_TC_BE; - case SO_TC_BK: - return MBUF_TC_BK; - case SO_TC_VI: - return MBUF_TC_VI; - case SO_TC_VO: - return MBUF_TC_VO; - default: - break; - } - } - - return MBUF_TC_NONE; -#endif /* PKT_PRIORITY */ -} /* * Here is the definition of some of the basic objects in the kern.ipc * branch of the MIB. 
*/ -SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "IPC"); +SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW|CTLFLAG_LOCKED|CTLFLAG_ANYBODY, 0, "IPC"); /* Check that the maximum socket buffer size is within a range */ @@ -1946,20 +1959,20 @@ sysctl_sb_max(__unused struct sysctl_oid *oidp, __unused void *arg1, return error; } -SYSCTL_PROC(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLTYPE_INT | CTLFLAG_RW, +SYSCTL_PROC(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &sb_max, 0, &sysctl_sb_max, "IU", "Maximum socket buffer size"); -SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RD, +SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RD | CTLFLAG_LOCKED, &maxsockets, 0, "Maximum number of sockets avaliable"); -SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW, +SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW | CTLFLAG_LOCKED, &sb_efficiency, 0, ""); -SYSCTL_INT(_kern_ipc, OID_AUTO, sbspace_factor, CTLFLAG_RW, +SYSCTL_INT(_kern_ipc, OID_AUTO, sbspace_factor, CTLFLAG_RW | CTLFLAG_LOCKED, &sbspace_factor, 0, "Ratio of mbuf/cluster use for socket layers"); -SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD, +SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD | CTLFLAG_LOCKED, &nmbclusters, 0, ""); -SYSCTL_INT(_kern_ipc, OID_AUTO, njcl, CTLFLAG_RD, &njcl, 0, ""); -SYSCTL_INT(_kern_ipc, OID_AUTO, njclbytes, CTLFLAG_RD, &njclbytes, 0, ""); -SYSCTL_INT(_kern_ipc, KIPC_SOQLIMITCOMPAT, soqlimitcompat, CTLFLAG_RW, +SYSCTL_INT(_kern_ipc, OID_AUTO, njcl, CTLFLAG_RD | CTLFLAG_LOCKED, &njcl, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, njclbytes, CTLFLAG_RD | CTLFLAG_LOCKED, &njclbytes, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_SOQLIMITCOMPAT, soqlimitcompat, CTLFLAG_RW | CTLFLAG_LOCKED, &soqlimitcompat, 1, "Enable socket queue limit compatibility"); -SYSCTL_INT(_kern_ipc, OID_AUTO, soqlencomp, CTLFLAG_RW, +SYSCTL_INT(_kern_ipc, OID_AUTO, soqlencomp, CTLFLAG_RW | 
CTLFLAG_LOCKED, &soqlencomp, 0, "Listen backlog represents only complete queue"); diff --git a/bsd/kern/uipc_syscalls.c b/bsd/kern/uipc_syscalls.c index 3c0aec400..521de769e 100644 --- a/bsd/kern/uipc_syscalls.c +++ b/bsd/kern/uipc_syscalls.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -86,6 +86,7 @@ #include #include #include +#include #include @@ -191,6 +192,22 @@ socket(struct proc *p, struct socket_args *uap, int32_t *retval) if (error) { fp_free(p, fd, fp); } else { + thread_t thread; + struct uthread *ut; + + thread = current_thread(); + ut = get_bsdthread_info(thread); + + /* if this is a backgrounded thread then throttle all new sockets */ +#if !CONFIG_EMBEDDED + if (proc_get_selfthread_isbackground() != 0) +#else /* !CONFIG_EMBEDDED */ + if ( (ut->uu_flag & UT_BACKGROUND) != 0 ) +#endif /* !CONFIG_EMBEDDED */ + { + so->so_traffic_mgt_flags |= TRAFFIC_MGT_SO_BACKGROUND; + so->so_background_thread = thread; + } fp->f_data = (caddr_t)so; proc_fdlock(p); @@ -510,16 +527,12 @@ accept_nocancel(struct proc *p, struct accept_nocancel_args *uap, releasefd: /* - * If the socket has been marked as inactive by soacceptfilter(), - * disallow further operations on it. We explicitly call shutdown - * on both data directions to ensure that SS_CANT{RCV,SEND}MORE - * states are set for the socket. This would also flush out data - * hanging off the receive list of this socket. + * If the socket has been marked as inactive by sosetdefunct(), + * disallow further operations on it. 
*/ if (so->so_flags & SOF_DEFUNCT) { - (void) soshutdownlock(so, SHUT_RD); - (void) soshutdownlock(so, SHUT_WR); - (void) sodisconnectlocked(so); + sodefunct(current_proc(), so, + SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL); } if (dosocklock) @@ -735,6 +748,9 @@ socketpair(struct proc *p, struct socketpair_args *uap, } } + if ((error = copyout(sv, uap->rsv, 2 * sizeof (int))) != 0) + goto free4; + proc_fdlock(p); procfdtbl_releasefd(p, sv[0], NULL); procfdtbl_releasefd(p, sv[1], NULL); @@ -742,8 +758,7 @@ socketpair(struct proc *p, struct socketpair_args *uap, fp_drop(p, sv[1], fp2, 1); proc_fdunlock(p); - error = copyout((caddr_t)sv, uap->rsv, 2 * sizeof (int)); - return (error); + return (0); free4: fp_free(p, sv[1], fp2); free3: @@ -1194,63 +1209,79 @@ recvit(struct proc *p, int s, struct user_msghdr *mp, uio_t uiop, while (m && len > 0) { unsigned int tocopy; struct cmsghdr *cp = mtod(m, struct cmsghdr *); - - /* - * SCM_TIMESTAMP hack because struct timeval has a - * different size for 32 bits and 64 bits processes - */ - if (cp->cmsg_level == SOL_SOCKET && cp->cmsg_type == SCM_TIMESTAMP) { - unsigned char tmp_buffer[CMSG_SPACE(sizeof(struct user64_timeval))]; - struct cmsghdr *tmp_cp = (struct cmsghdr *)tmp_buffer; - int tmp_space; - struct timeval *tv = (struct timeval *)CMSG_DATA(cp); - - tmp_cp->cmsg_level = SOL_SOCKET; - tmp_cp->cmsg_type = SCM_TIMESTAMP; + int cp_size = CMSG_ALIGN(cp->cmsg_len); + int buflen = m->m_len; + + while (buflen > 0 && len > 0) { - if (proc_is64bit(p)) { - struct user64_timeval *tv64 = (struct user64_timeval *)CMSG_DATA(tmp_cp); - - tv64->tv_sec = tv->tv_sec; - tv64->tv_usec = tv->tv_usec; - - tmp_cp->cmsg_len = CMSG_LEN(sizeof(struct user64_timeval)); - tmp_space = CMSG_SPACE(sizeof(struct user64_timeval)); - } else { - struct user32_timeval *tv32 = (struct user32_timeval *)CMSG_DATA(tmp_cp); + /* + SCM_TIMESTAMP hack because struct timeval has a + * different size for 32 bits and 64 bits processes + */ + if (cp->cmsg_level == 
SOL_SOCKET && cp->cmsg_type == SCM_TIMESTAMP) { + unsigned char tmp_buffer[CMSG_SPACE(sizeof(struct user64_timeval))]; + struct cmsghdr *tmp_cp = (struct cmsghdr *)tmp_buffer; + int tmp_space; + struct timeval *tv = (struct timeval *)CMSG_DATA(cp); + + tmp_cp->cmsg_level = SOL_SOCKET; + tmp_cp->cmsg_type = SCM_TIMESTAMP; + + if (proc_is64bit(p)) { + struct user64_timeval *tv64 = (struct user64_timeval *)CMSG_DATA(tmp_cp); + + tv64->tv_sec = tv->tv_sec; + tv64->tv_usec = tv->tv_usec; + + tmp_cp->cmsg_len = CMSG_LEN(sizeof(struct user64_timeval)); + tmp_space = CMSG_SPACE(sizeof(struct user64_timeval)); + } else { + struct user32_timeval *tv32 = (struct user32_timeval *)CMSG_DATA(tmp_cp); + + tv32->tv_sec = tv->tv_sec; + tv32->tv_usec = tv->tv_usec; + + tmp_cp->cmsg_len = CMSG_LEN(sizeof(struct user32_timeval)); + tmp_space = CMSG_SPACE(sizeof(struct user32_timeval)); + } + if (len >= tmp_space) { + tocopy = tmp_space; + } else { + mp->msg_flags |= MSG_CTRUNC; + tocopy = len; + } + error = copyout(tmp_buffer, ctlbuf, tocopy); + if (error) + goto out; - tv32->tv_sec = tv->tv_sec; - tv32->tv_usec = tv->tv_usec; - - tmp_cp->cmsg_len = CMSG_LEN(sizeof(struct user32_timeval)); - tmp_space = CMSG_SPACE(sizeof(struct user32_timeval)); - } - if (len >= tmp_space) { - tocopy = tmp_space; - } else { - mp->msg_flags |= MSG_CTRUNC; - tocopy = len; - } - error = copyout(tmp_buffer, ctlbuf, tocopy); - if (error) - goto out; - - } else { - if (len >= m->m_len) { - tocopy = m->m_len; } else { - mp->msg_flags |= MSG_CTRUNC; - tocopy = len; + + if (cp_size > buflen) { + panic("cp_size > buflen, something wrong with alignment!"); + } + + if (len >= cp_size) { + tocopy = cp_size; + } else { + mp->msg_flags |= MSG_CTRUNC; + tocopy = len; + } + + error = copyout((caddr_t) cp, ctlbuf, + tocopy); + if (error) + goto out; } - - error = copyout((caddr_t)mtod(m, caddr_t), ctlbuf, - tocopy); - if (error) - goto out; + + + ctlbuf += tocopy; + len -= tocopy; + + buflen -= cp_size; + cp = (struct 
cmsghdr *) ((unsigned char *) cp + cp_size); + cp_size = CMSG_ALIGN(cp->cmsg_len); } - - ctlbuf += tocopy; - len -= tocopy; + m = m->m_next; } mp->msg_controllen = ctlbuf - mp->msg_control; @@ -1266,7 +1297,6 @@ recvit(struct proc *p, int s, struct user_msghdr *mp, uio_t uiop, return (error); } - /* * Returns: 0 Success * ENOMEM @@ -1698,28 +1728,9 @@ getsockname(__unused struct proc *p, struct getsockname_args *uap, socket_lock(so, 1); error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa); if (error == 0) { - struct socket_filter_entry *filter; - int filtered = 0; - for (filter = so->so_filt; filter && error == 0; - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_getsockname) { - if (!filtered) { - filtered = 1; - sflt_use(so); - socket_unlock(so, 0); - } - error = filter->sfe_filter->sf_filter. - sf_getsockname(filter->sfe_cookie, so, &sa); - } - } - + error = sflt_getsockname(so, &sa); if (error == EJUSTRETURN) error = 0; - - if (filtered) { - socket_lock(so, 0); - sflt_unuse(so); - } } socket_unlock(so, 1); if (error) @@ -1802,28 +1813,9 @@ getpeername(__unused struct proc *p, struct getpeername_args *uap, sa = 0; error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa); if (error == 0) { - struct socket_filter_entry *filter; - int filtered = 0; - for (filter = so->so_filt; filter && error == 0; - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_getpeername) { - if (!filtered) { - filtered = 1; - sflt_use(so); - socket_unlock(so, 0); - } - error = filter->sfe_filter->sf_filter. 
- sf_getpeername(filter->sfe_cookie, so, &sa); - } - } - + error = sflt_getpeername(so, &sa); if (error == EJUSTRETURN) error = 0; - - if (filtered) { - socket_lock(so, 0); - sflt_unuse(so); - } } socket_unlock(so, 1); if (error) @@ -1983,7 +1975,7 @@ SYSCTL_DECL(_kern_ipc); #define SFUIOBUFS 64 static int sendfileuiobufs = SFUIOBUFS; -SYSCTL_INT(_kern_ipc, OID_AUTO, sendfileuiobufs, CTLFLAG_RW, &sendfileuiobufs, +SYSCTL_INT(_kern_ipc, OID_AUTO, sendfileuiobufs, CTLFLAG_RW | CTLFLAG_LOCKED, &sendfileuiobufs, 0, ""); /* Macros to compute the number of mbufs needed depending on cluster size */ @@ -2026,13 +2018,13 @@ alloc_sendpkt(int how, size_t pktlen, unsigned int *maxchunks, * use mbuf_allocpacket(). The logic below is similar to sosend(). */ *m = NULL; - if (pktlen > NBPG && jumbocl) { + if (pktlen > MBIGCLBYTES && jumbocl) { needed = MIN(SENDFILE_MAX_16K, HOWMANY_16K(pktlen)); *m = m_getpackets_internal(&needed, 1, how, 0, M16KCLBYTES); } if (*m == NULL) { needed = MIN(SENDFILE_MAX_4K, HOWMANY_4K(pktlen)); - *m = m_getpackets_internal(&needed, 1, how, 0, NBPG); + *m = m_getpackets_internal(&needed, 1, how, 0, MBIGCLBYTES); } /* @@ -2043,7 +2035,7 @@ alloc_sendpkt(int how, size_t pktlen, unsigned int *maxchunks, */ if (*m == NULL) { needed = 1; - *m = m_getpackets_internal(&needed, 1, M_WAIT, 1, NBPG); + *m = m_getpackets_internal(&needed, 1, M_WAIT, 1, MBIGCLBYTES); } if (*m == NULL) panic("%s: blocking allocation returned NULL\n", __func__); @@ -2295,7 +2287,7 @@ sendfile(struct proc *p, struct sendfile_args *uap, __unused int *retval) if (xfsize != uio_resid(auio)) printf("sendfile: xfsize: %lld != uio_resid(auio): " - "%lld\n", xfsize, uio_resid(auio)); + "%lld\n", xfsize, (long long)uio_resid(auio)); KERNEL_DEBUG_CONSTANT((DBG_FNC_SENDFILE_READ | DBG_FUNC_START), uap->s, (unsigned int)((xfsize >> 32) & 0x0ffffffff), @@ -2385,53 +2377,20 @@ sendfile(struct proc *p, struct sendfile_args *uap, __unused int *retval) } goto retry_space; } + + struct mbuf 
*control = NULL; { /* * Socket filter processing */ - struct socket_filter_entry *filter; - int filtered = 0; - struct mbuf *control = NULL; - boolean_t recursive = (so->so_send_filt_thread != NULL); - error = 0; - for (filter = so->so_filt; filter && (error == 0); - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_data_out) { - if (filtered == 0) { - filtered = 1; - so->so_send_filt_thread = - current_thread(); - sflt_use(so); - socket_unlock(so, 0); - } - error = filter->sfe_filter->sf_filter. - sf_data_out(filter->sfe_cookie, so, - NULL, &m0, &control, 0); - } - } - - if (filtered) { - /* - * At this point, we've run at least one filter. - * The socket is unlocked as is the socket - * buffer. Clear the recorded filter thread - * only when we are outside of a filter's - * context. This allows for a filter to issue - * multiple inject calls from its sf_data_out - * callback routine. - */ - socket_lock(so, 0); - sflt_unuse(so); - if (!recursive) - so->so_send_filt_thread = 0; - if (error) { - if (error == EJUSTRETURN) { - error = 0; - continue; - } - goto done3; + error = sflt_data_out(so, NULL, &m0, &control, 0); + if (error) { + if (error == EJUSTRETURN) { + error = 0; + continue; } + goto done3; } /* * End Socket filter processing @@ -2440,7 +2399,7 @@ sendfile(struct proc *p, struct sendfile_args *uap, __unused int *retval) KERNEL_DEBUG_CONSTANT((DBG_FNC_SENDFILE_SEND | DBG_FUNC_START), uap->s, 0, 0, 0, 0); error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m0, - 0, 0, p); + 0, control, p); KERNEL_DEBUG_CONSTANT((DBG_FNC_SENDFILE_SEND | DBG_FUNC_START), uap->s, 0, 0, 0, 0); if (error) { diff --git a/bsd/kern/uipc_usrreq.c b/bsd/kern/uipc_usrreq.c index 202f2d858..c64053a2c 100644 --- a/bsd/kern/uipc_usrreq.c +++ b/bsd/kern/uipc_usrreq.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -117,6 +117,32 @@ static u_int disconnect_in_progress; extern lck_mtx_t *uipc_lock; static struct unp_head unp_shead, unp_dhead; +/* + * mDNSResponder tracing. When enabled, endpoints connected to + * /var/run/mDNSResponder will be traced; during each send on + * the traced socket, we log the PID and process name of the + * sending process. We also print out a bit of info related + * to the data itself; this assumes ipc_msg_hdr in dnssd_ipc.h + * of mDNSResponder stays the same. + */ +#define MDNSRESPONDER_PATH "/var/run/mDNSResponder" + +static int unpst_tracemdns; /* enable tracing */ + +#define MDNS_IPC_MSG_HDR_VERSION_1 1 + +struct mdns_ipc_msg_hdr { + uint32_t version; + uint32_t datalen; + uint32_t ipc_flags; + uint32_t op; + union { + void *context; + uint32_t u32[2]; + } __attribute__((packed)); + uint32_t reg_index; +} __attribute__((packed)); + /* * Unix communications domain. * @@ -271,7 +297,7 @@ uipc_detach(struct socket *so) if (unp == 0) return (EINVAL); - lck_mtx_assert(unp->unp_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&unp->unp_mtx, LCK_MTX_ASSERT_OWNED); unp_detach(unp); return (0); } @@ -428,7 +454,8 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, } so2 = unp->unp_conn->unp_socket; - unp_get_locks_in_order(so, so2); + if (so != so2) + unp_get_locks_in_order(so, so2); if (unp->unp_addr) from = (struct sockaddr *)unp->unp_addr; @@ -450,7 +477,8 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, control = NULL; } - socket_unlock(so2, 1); + if (so != so2) + socket_unlock(so2, 1); m = NULL; if (nam) @@ -498,6 +526,16 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, break; } + if (unp->unp_flags & UNP_TRACE_MDNS) { + struct mdns_ipc_msg_hdr hdr; + + if (mbuf_copydata(m, 0, sizeof (hdr), &hdr) == 0 && + hdr.version == ntohl(MDNS_IPC_MSG_HDR_VERSION_1)) { + printf("%s[mDNSResponder] pid=%d (%s): op=0x%x\n", 
+ __func__, p->p_pid, p->p_comm, ntohl(hdr.op)); + } + } + /* * Send to paired receive port, and then reduce send buffer * hiwater marks to maintain backpressure. Wake up readers. @@ -694,17 +732,19 @@ static int unp_rights; /* file descriptors in flight */ static int unp_disposed; /* discarded file descriptors */ SYSCTL_DECL(_net_local_stream); -SYSCTL_INT(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW, +SYSCTL_INT(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW | CTLFLAG_LOCKED, &unpst_sendspace, 0, ""); -SYSCTL_INT(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW, +SYSCTL_INT(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW | CTLFLAG_LOCKED, &unpst_recvspace, 0, ""); +SYSCTL_INT(_net_local_stream, OID_AUTO, tracemdns, CTLFLAG_RW | CTLFLAG_LOCKED, + &unpst_tracemdns, 0, ""); SYSCTL_DECL(_net_local_dgram); -SYSCTL_INT(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW, +SYSCTL_INT(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW | CTLFLAG_LOCKED, &unpdg_sendspace, 0, ""); -SYSCTL_INT(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW, +SYSCTL_INT(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW | CTLFLAG_LOCKED, &unpdg_recvspace, 0, ""); SYSCTL_DECL(_net_local); -SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, ""); +SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD | CTLFLAG_LOCKED, &unp_rights, 0, ""); /* * Returns: 0 Success @@ -739,11 +779,8 @@ unp_attach(struct socket *so) return (ENOBUFS); bzero(unp, sizeof (*unp)); - unp->unp_mtx = lck_mtx_alloc_init(unp_mtx_grp, unp_mtx_attr); - if (unp->unp_mtx == NULL) { - zfree(unp_zone, unp); - return(ENOBUFS); - } + lck_mtx_init(&unp->unp_mtx, + unp_mtx_grp, unp_mtx_attr); lck_rw_lock_exclusive(unp_list_mtx); LIST_INIT(&unp->unp_refs); @@ -892,7 +929,7 @@ unp_bind( socket_unlock(so, 0); strlcpy(buf, soun->sun_path, namelen+1); - NDINIT(&nd, CREATE, FOLLOW | LOCKPARENT, UIO_SYSSPACE, + NDINIT(&nd, CREATE, OP_MKFIFO, FOLLOW | LOCKPARENT, UIO_SYSSPACE, CAST_USER_ADDR_T(buf), ctx); /* 
SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ error = namei(&nd); @@ -938,7 +975,7 @@ unp_bind( if (!error) { /* create the socket */ - error = vn_create(dvp, &vp, &nd.ni_cnd, &va, 0, ctx); + error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx); } nameidone(&nd); @@ -1001,7 +1038,7 @@ unp_connect(struct socket *so, struct sockaddr *nam, __unused proc_t p) strlcpy(buf, soun->sun_path, len+1); socket_unlock(so, 0); - NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, + NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, CAST_USER_ADDR_T(buf), ctx); error = namei(&nd); if (error) { @@ -1046,8 +1083,13 @@ unp_connect(struct socket *so, struct sockaddr *nam, __unused proc_t p) if (so2->so_pcb == NULL) { error = ECONNREFUSED; - socket_unlock(so2, 1); - socket_lock(so, 0); + if (so != so2) { + socket_unlock(so2, 1); + socket_lock(so, 0); + } else { + /* Release the reference held for the listen socket */ + so2->so_usecount--; + } goto out; } @@ -1055,7 +1097,7 @@ unp_connect(struct socket *so, struct sockaddr *nam, __unused proc_t p) socket_unlock(so2, 0); socket_lock(so, 0); socket_lock(so2, 0); - } else { + } else if (so > so2) { socket_lock(so, 0); } /* @@ -1064,15 +1106,13 @@ unp_connect(struct socket *so, struct sockaddr *nam, __unused proc_t p) * XXX - probably shouldn't return an error for SOCK_DGRAM */ if ((so->so_state & SS_ISCONNECTED) != 0) { - socket_unlock(so2, 1); error = EISCONN; - goto out; + goto decref_out; } if (so->so_type != so2->so_type) { - socket_unlock(so2, 1); error = EPROTOTYPE; - goto out; + goto decref_out; } if (so->so_proto->pr_flags & PR_CONNREQUIRED) { @@ -1149,19 +1189,41 @@ unp_connect(struct socket *so, struct sockaddr *nam, __unused proc_t p) socket_lock(so3, 1); so2 = so3; + /* + * Enable tracing for mDNSResponder endpoints. (The use + * of sizeof instead of strlen below takes the null + * terminating character into account.) 
+ */ + if (unpst_tracemdns && + !strncmp(soun->sun_path, MDNSRESPONDER_PATH, + sizeof (MDNSRESPONDER_PATH))) { + unp->unp_flags |= UNP_TRACE_MDNS; + unp2->unp_flags |= UNP_TRACE_MDNS; + } } error = unp_connect2(so, so2); + +decref_out: if (so2 != NULL) { - socket_unlock(so2, 1); + if (so != so2) { + socket_unlock(so2, 1); + } else { + /* Release the extra reference held for the listen socket. + * This is possible only for SOCK_DGRAM sockets. We refuse + * connecting to the same socket for SOCK_STREAM sockets. + */ + so2->so_usecount--; + } } if (list_so != NULL) { socket_lock(list_so, 0); socket_unlock(list_so, 1); } + out: - lck_mtx_assert(unp->unp_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&unp->unp_mtx, LCK_MTX_ASSERT_OWNED); vnode_put(vp); return (error); } @@ -1182,8 +1244,8 @@ unp_connect2(struct socket *so, struct socket *so2) unp2 = sotounpcb(so2); - lck_mtx_assert(unp->unp_mtx, LCK_MTX_ASSERT_OWNED); - lck_mtx_assert(unp2->unp_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&unp->unp_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&unp2->unp_mtx, LCK_MTX_ASSERT_OWNED); /* Verify both sockets are still opened */ if (unp == 0 || unp2 == 0) @@ -1197,15 +1259,18 @@ unp_connect2(struct socket *so, struct socket *so2) case SOCK_DGRAM: LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink); - - /* Avoid lock order reversals due to drop/acquire in soisconnected. */ - /* Keep an extra reference on so2 that will be dropped - * soon after getting the locks in order - */ - socket_unlock(so2, 0); - soisconnected(so); - unp_get_locks_in_order(so, so2); - so2->so_usecount--; + if (so != so2) { + /* Avoid lock order reversals due to drop/acquire in soisconnected. 
*/ + /* Keep an extra reference on so2 that will be dropped + * soon after getting the locks in order + */ + socket_unlock(so2, 0); + soisconnected(so); + unp_get_locks_in_order(so, so2); + so2->so_usecount--; + } else { + soisconnected(so); + } break; @@ -1242,8 +1307,8 @@ unp_connect2(struct socket *so, struct socket *so2) default: panic("unknown socket type %d in unp_connect2", so->so_type); } - lck_mtx_assert(unp->unp_mtx, LCK_MTX_ASSERT_OWNED); - lck_mtx_assert(unp2->unp_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&unp->unp_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&unp2->unp_mtx, LCK_MTX_ASSERT_OWNED); return (0); } @@ -1284,7 +1349,12 @@ unp_disconnect(struct unpcb *unp) so2 = unp2->unp_socket; try_again: - if (so < so2) { + if (so == so2) { + if (so_locked == 0) { + socket_lock(so, 0); + } + waitso = so; + } else if (so < so2) { if (so_locked == 0) { socket_lock(so, 0); } @@ -1298,19 +1368,22 @@ unp_disconnect(struct unpcb *unp) socket_lock(so, 0); waitso = so; } + so_locked = 1; - lck_mtx_assert(unp->unp_mtx, LCK_MTX_ASSERT_OWNED); - lck_mtx_assert(unp2->unp_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&unp->unp_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&unp2->unp_mtx, LCK_MTX_ASSERT_OWNED); /* Check for the UNP_DONTDISCONNECT flag, if it * is set, release both sockets and go to sleep */ if ((((struct unpcb *)waitso->so_pcb)->unp_flags & UNP_DONTDISCONNECT) != 0) { - socket_unlock(so2, 1); + if (so != so2) { + socket_unlock(so2, 1); + } so_locked = 0; - (void)msleep(waitso->so_pcb, unp->unp_mtx, + (void)msleep(waitso->so_pcb, &unp->unp_mtx, PSOCK | PDROP, "unpdisconnect", NULL); goto try_again; } @@ -1322,12 +1395,16 @@ unp_disconnect(struct unpcb *unp) unp->unp_conn = NULL; so2->so_usecount--; + if (unp->unp_flags & UNP_TRACE_MDNS) + unp->unp_flags &= ~UNP_TRACE_MDNS; + switch (unp->unp_socket->so_type) { case SOCK_DGRAM: LIST_REMOVE(unp, unp_reflink); unp->unp_socket->so_state &= ~SS_ISCONNECTED; - socket_unlock(so2, 1); + if (so != so2) + 
socket_unlock(so2, 1); break; case SOCK_STREAM: @@ -1343,6 +1420,10 @@ unp_disconnect(struct unpcb *unp) unp2->unp_socket->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); unp->unp_socket->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED); + + if (unp2->unp_flags & UNP_TRACE_MDNS) + unp2->unp_flags &= ~UNP_TRACE_MDNS; + strdisconn = 1; break; default: @@ -1362,7 +1443,7 @@ unp_disconnect(struct unpcb *unp) socket_lock(so,0); soisdisconnected(so); } - lck_mtx_assert(unp->unp_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&unp->unp_mtx, LCK_MTX_ASSERT_OWNED); return; } @@ -1519,10 +1600,10 @@ unp_pcblist SYSCTL_HANDLER_ARGS return (error); } -SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD, +SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED, (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb", "List of active local datagram sockets"); -SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD, +SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED, (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb", "List of active local stream sockets"); @@ -1662,10 +1743,10 @@ unp_pcblist64 SYSCTL_HANDLER_ARGS return (error); } -SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist64, CTLFLAG_RD, +SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist64, CTLFLAG_RD | CTLFLAG_LOCKED, (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist64, "S,xunpcb64", "List of active local datagram sockets 64 bit"); -SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist64, CTLFLAG_RD, +SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist64, CTLFLAG_RD | CTLFLAG_LOCKED, (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist64, "S,xunpcb64", "List of active local stream sockets 64 bit"); @@ -2195,7 +2276,7 @@ unp_lock(struct socket *so, int refcount, void * lr) else lr_saved = lr; if (so->so_pcb) { - lck_mtx_lock(((struct unpcb *)so->so_pcb)->unp_mtx); + lck_mtx_lock(&((struct unpcb *)so->so_pcb)->unp_mtx); } else { panic("unp_lock: so=%p NO PCB! 
lr=%p ref=0x%x\n", so, lr_saved, so->so_usecount); @@ -2232,7 +2313,7 @@ unp_unlock(struct socket *so, int refcount, void * lr) if (so->so_pcb == NULL) { panic("unp_unlock: so=%p NO PCB usecount=%x\n", so, so->so_usecount); } else { - mutex_held = ((struct unpcb *)so->so_pcb)->unp_mtx; + mutex_held = &((struct unpcb *)so->so_pcb)->unp_mtx; } lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED); so->unlock_lr[so->next_unlock_lr] = lr_saved; @@ -2245,8 +2326,6 @@ unp_unlock(struct socket *so, int refcount, void * lr) FREE(unp->unp_addr, M_SONAME); lck_mtx_unlock(mutex_held); - if (unp->unp_mtx) - lck_mtx_free(unp->unp_mtx, unp_mtx_grp); unp->unp_gencnt = ++unp_gencnt; zfree(unp_zone, unp); @@ -2269,7 +2348,7 @@ unp_getlock(struct socket *so, __unused int locktype) if (so->so_pcb) { if (so->so_usecount < 0) panic("unp_getlock: so=%p usecount=%x\n", so, so->so_usecount); - return(unp->unp_mtx); + return(&unp->unp_mtx); } else { panic("unp_getlock: so=%p NULL so_pcb\n", so); return (so->so_proto->pr_domain->dom_mtx); diff --git a/bsd/kern/vm_pressure.c b/bsd/kern/vm_pressure.c new file mode 100644 index 000000000..b5fc2f072 --- /dev/null +++ b/bsd/kern/vm_pressure.c @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2009-2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. 
+ * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void vm_pressure_klist_lock(void); +void vm_pressure_klist_unlock(void); + +void vm_dispatch_memory_pressure(void); +int vm_try_terminate_candidates(void); +int vm_try_pressure_candidates(void); +void vm_recharge_active_list(void); + +struct klist vm_pressure_klist; +struct klist vm_pressure_klist_dormant; + +void vm_pressure_klist_lock(void) { + lck_mtx_lock(&vm_pressure_klist_mutex); +} + +void vm_pressure_klist_unlock(void) { + lck_mtx_unlock(&vm_pressure_klist_mutex); +} + +int vm_knote_register(struct knote *kn) { + int rv = 0; + + vm_pressure_klist_lock(); + + if ((kn->kn_sfflags & (NOTE_VM_PRESSURE))) { +#if DEBUG + printf("[vm_pressure] process %d registering pressure notification\n", kn->kn_kq->kq_p->p_pid); +#endif + KNOTE_ATTACH(&vm_pressure_klist, kn); + } else + rv = ENOTSUP; + + vm_pressure_klist_unlock(); + + return rv; +} + +void vm_knote_unregister(struct knote *kn) { + struct knote *kn_temp; + + vm_pressure_klist_lock(); + +#if DEBUG + printf("[vm_pressure] process %d cancelling pressure notification\n", kn->kn_kq->kq_p->p_pid); +#endif + + SLIST_FOREACH(kn_temp, &vm_pressure_klist, kn_selnext) { + if (kn_temp == kn) { + KNOTE_DETACH(&vm_pressure_klist, kn); + 
vm_pressure_klist_unlock(); + return; + } + } + KNOTE_DETACH(&vm_pressure_klist_dormant, kn); + + vm_pressure_klist_unlock(); +} + +/* Interface for event dispatch from vm_pageout_garbage_collect thread */ +void consider_pressure_events(void) { + vm_dispatch_memory_pressure(); +} + +void vm_dispatch_memory_pressure(void) { + vm_pressure_klist_lock(); + + if (!SLIST_EMPTY(&vm_pressure_klist)) { + +#if DEBUG + printf("[vm_pressure] vm_dispatch_memory_pressure\n"); +#endif + + if (vm_try_pressure_candidates()) { + vm_pressure_klist_unlock(); + return; + } + + } + + /* Else... */ + +#if DEBUG + printf("[vm_pressure] could not find suitable event candidate\n"); +#endif + + vm_recharge_active_list(); + + vm_pressure_klist_unlock(); +} + +/* + * Try standard pressure event candidates. Called with klist lock held. + */ +int vm_try_pressure_candidates(void) { + /* + * This value is the threshold that a process must meet to be considered for scavenging. + * If a process has sufficiently little resident memory, there is probably no use scavenging it. + * At best, we'll scavenge very little memory. At worst, we'll page in code pages or malloc metadata. 
+ */ + +#define VM_PRESSURE_MINIMUM_RSIZE (10 * 1024 * 1024) + + struct proc *p_max = NULL; + unsigned int resident_max = 0; + struct knote *kn_max = NULL; + struct knote *kn; + + SLIST_FOREACH(kn, &vm_pressure_klist, kn_selnext) { + if ( (kn != NULL ) && ( kn->kn_kq != NULL ) && ( kn->kn_kq->kq_p != NULL ) ) { + if (kn->kn_sfflags & NOTE_VM_PRESSURE) { + struct proc *p = kn->kn_kq->kq_p; + if (!(kn->kn_status & KN_DISABLED)) { + kern_return_t kr = KERN_SUCCESS; + struct task *t = (struct task *)(p->task); + struct task_basic_info basic_info; + mach_msg_type_number_t size = TASK_BASIC_INFO_COUNT; + if( ( kr = task_info(t, TASK_BASIC_INFO, (task_info_t)(&basic_info), &size)) == KERN_SUCCESS ) { + unsigned int resident_size = basic_info.resident_size; + /* + * We don't want a small process to block large processes from + * being notified again. + */ + if (resident_size >= VM_PRESSURE_MINIMUM_RSIZE) { + if (resident_size > resident_max) { + p_max = p; + resident_max = resident_size; + kn_max = kn; + } + } else { +#if DEBUG + /* There was no candidate with enough resident memory to scavenge */ + /* This debug print makes too much noise now */ + //printf("[vm_pressure] threshold failed for pid %d with %u resident, skipping...\n", p->p_pid, resident_size); +#endif + } + } else { +#if DEBUG + printf("[vm_pressure] task_info for pid %d failed with %d\n", p->p_pid, kr); +#endif + } + } else { +#if DEBUG + printf("[vm_pressure] pid %d currently disabled, skipping...\n", p->p_pid); +#endif + } + } + } else { +#if DEBUG + if (kn == NULL) { + printf("[vm_pressure] kn is NULL\n"); + } else if (kn->kn_kq == NULL) { + printf("[vm_pressure] kn->kn_kq is NULL\n"); + } else if (kn->kn_kq->kq_p == NULL) { + printf("[vm_pressure] kn->kn_kq->kq_p is NULL\n"); + } +#endif + } + } + + if (kn_max == NULL) return 0; + +#if DEBUG + printf("[vm_pressure] sending event to pid %d with %u resident\n", kn_max->kn_kq->kq_p->p_pid, resident_max); +#endif + + KNOTE_DETACH(&vm_pressure_klist, 
kn_max); + struct klist dispatch_klist = { NULL }; + KNOTE_ATTACH(&dispatch_klist, kn_max); + KNOTE(&dispatch_klist, NOTE_VM_PRESSURE); + KNOTE_ATTACH(&vm_pressure_klist_dormant, kn_max); + + return 1; +} + + +/* + * Remove all elements from the dormant list and place them on the active list. + * Called with klist lock held. + */ +void vm_recharge_active_list(void) { + /* Re-charge the main list from the dormant list if possible */ + if (!SLIST_EMPTY(&vm_pressure_klist_dormant)) { +#if DEBUG + printf("[vm_pressure] recharging main list from dormant list\n"); +#endif + struct knote *kn; + while (!SLIST_EMPTY(&vm_pressure_klist_dormant)) { + kn = SLIST_FIRST(&vm_pressure_klist_dormant); + SLIST_REMOVE_HEAD(&vm_pressure_klist_dormant, kn_selnext); + SLIST_INSERT_HEAD(&vm_pressure_klist, kn, kn_selnext); + } + } +} diff --git a/bsd/ppc/reg.h b/bsd/kern/vm_pressure.h similarity index 80% rename from bsd/ppc/reg.h rename to bsd/kern/vm_pressure.h index 0449be6df..8063c820a 100644 --- a/bsd/ppc/reg.h +++ b/bsd/kern/vm_pressure.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,21 +25,17 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - * Copyright 1993, NeXT Computer, Inc. 
- */ - -#ifndef _BSD_PPC_REG_H_ -#define _BSD_PPC_REG_H_ +#ifndef VM_PRESSURE_H +#define VM_PRESSURE_H -#ifdef BSD_KERNEL_PRIVATE +#include -/* Index into the thread_state */ -#define SP 3 -#define PC 0 +static lck_mtx_t vm_pressure_klist_mutex; -#endif /* KERNEL_PRIVATE */ +int vm_knote_register(struct knote *); +void vm_knote_unregister(struct knote *); -#endif /* _BSD_PPC_REG_H_ */ +void consider_pressure_events(void); +#endif /* VM_PRESSURE_H */ diff --git a/bsd/libkern/libkern.h b/bsd/libkern/libkern.h index 6fd1f7a86..0d9cff919 100644 --- a/bsd/libkern/libkern.h +++ b/bsd/libkern/libkern.h @@ -203,16 +203,7 @@ extern void flush_dcache64(addr64_t, unsigned, int); static __inline__ unsigned int clz(unsigned int num) { -#if __ppc__ - unsigned int result; - __asm__ volatile( - "cntlzw %0, %1" - : "=r" (result) - : "r" (num) - ); - return result; - -#elif __i386__ +#if __i386__ unsigned int result; __asm__ volatile( "bsrl %1, %0\n\t" diff --git a/bsd/machine/_limits.h b/bsd/machine/_limits.h index dd32b6197..c1d8abd07 100644 --- a/bsd/machine/_limits.h +++ b/bsd/machine/_limits.h @@ -28,9 +28,7 @@ #ifndef _BSD_MACHINE__LIMITS_H_ #define _BSD_MACHINE__LIMITS_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/_limits.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/_limits.h" #else #error architecture not supported diff --git a/bsd/machine/_param.h b/bsd/machine/_param.h index 844370744..beb2cb939 100644 --- a/bsd/machine/_param.h +++ b/bsd/machine/_param.h @@ -25,9 +25,7 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/_param.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "i386/_param.h" #else #error architecture not supported diff --git a/bsd/machine/_structs.h b/bsd/machine/_structs.h index a0e15996e..509d5f618 100644 --- a/bsd/machine/_structs.h +++ 
b/bsd/machine/_structs.h @@ -25,9 +25,7 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/_structs.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "i386/_structs.h" #else #error architecture not supported diff --git a/bsd/machine/_types.h b/bsd/machine/_types.h index ceac56ea0..92c65bf6c 100644 --- a/bsd/machine/_types.h +++ b/bsd/machine/_types.h @@ -28,9 +28,7 @@ #ifndef _BSD_MACHINE__TYPES_H_ #define _BSD_MACHINE__TYPES_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/_types.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/_types.h" #else #error architecture not supported diff --git a/bsd/machine/dis_tables.h b/bsd/machine/dis_tables.h index 6eaff8106..7ac37dd7e 100644 --- a/bsd/machine/dis_tables.h +++ b/bsd/machine/dis_tables.h @@ -28,9 +28,7 @@ #ifndef _BSD_MACHINE_DIS_TABLES_H_ #define _BSD_MACHINE_DIS_TABLES_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/dis_tables.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/dis_tables.h" #else #error architecture not supported diff --git a/bsd/machine/disklabel.h b/bsd/machine/disklabel.h index 93fa986ed..490bbda8a 100644 --- a/bsd/machine/disklabel.h +++ b/bsd/machine/disklabel.h @@ -28,9 +28,7 @@ #ifndef _BSD_MACHINE_CPU_H_ #define _BSD_MACHINE_CPU_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/disklabel.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/disklabel.h" #else #error architecture not supported diff --git a/bsd/machine/endian.h b/bsd/machine/endian.h index 879cf17bd..871af6483 100644 --- a/bsd/machine/endian.h +++ b/bsd/machine/endian.h @@ -31,9 +31,7 @@ #ifndef _BSD_MACHINE_ENDIAN_H_ #define _BSD_MACHINE_ENDIAN_H_ -#if defined (__ppc__) || 
defined(__ppc64__) -#include "ppc/endian.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/endian.h" #else #error architecture not supported diff --git a/bsd/machine/exec.h b/bsd/machine/exec.h index fc8a27279..1a6417179 100644 --- a/bsd/machine/exec.h +++ b/bsd/machine/exec.h @@ -44,16 +44,14 @@ struct exec_info { struct exec_archhandler { char path[MAXPATHLEN]; uint32_t fsid; - long fileid; + uint64_t fileid; }; extern struct exec_archhandler exec_archhandler_ppc; int set_archhandler(struct proc *, int); int grade_binary(cpu_type_t, cpu_subtype_t); -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/exec.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/exec.h" #else #error architecture not supported diff --git a/bsd/machine/fasttrap_isa.h b/bsd/machine/fasttrap_isa.h index d57bac1ba..cfe9e297a 100644 --- a/bsd/machine/fasttrap_isa.h +++ b/bsd/machine/fasttrap_isa.h @@ -28,9 +28,7 @@ #ifndef _BSD_MACHINE_FASTTRAP_ISA_H_ #define _BSD_MACHINE_FASTTRAP_ISA_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/fasttrap_isa.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/fasttrap_isa.h" #else #error architecture not supported diff --git a/bsd/machine/limits.h b/bsd/machine/limits.h index 0f40842f9..e96709f89 100644 --- a/bsd/machine/limits.h +++ b/bsd/machine/limits.h @@ -2,9 +2,7 @@ compiler. GCC provides its own limits.h which can be found in /usr/lib/gcc, although it is not very informative. This file is public domain. 
*/ -#if defined (__ppc__) || defined (__ppc64__) -#include -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include #else #error architecture not supported diff --git a/bsd/machine/param.h b/bsd/machine/param.h index 6253a5fb4..2724da7e1 100644 --- a/bsd/machine/param.h +++ b/bsd/machine/param.h @@ -31,9 +31,7 @@ #ifndef _BSD_MACHINE_PARAM_H_ #define _BSD_MACHINE_PARAM_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/param.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/param.h" #else #error architecture not supported diff --git a/bsd/machine/profile.h b/bsd/machine/profile.h index ea28264c6..cc8a5eac0 100644 --- a/bsd/machine/profile.h +++ b/bsd/machine/profile.h @@ -33,9 +33,7 @@ #ifndef _BSD_MACHINE_PROFILE_H_ #define _BSD_MACHINE_PROFILE_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/profile.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/profile.h" #else #error architecture not supported diff --git a/bsd/machine/psl.h b/bsd/machine/psl.h index 711639e4f..01c6e0a25 100644 --- a/bsd/machine/psl.h +++ b/bsd/machine/psl.h @@ -28,9 +28,7 @@ #ifndef _BSD_MACHINE_PSL_H_ #define _BSD_MACHINE_PSL_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/psl.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/psl.h" #else #error architecture not supported diff --git a/bsd/machine/ptrace.h b/bsd/machine/ptrace.h index 031327fe4..3320c2226 100644 --- a/bsd/machine/ptrace.h +++ b/bsd/machine/ptrace.h @@ -31,9 +31,7 @@ #ifndef _BSD_MACHINE_PTRACE_H_ #define _BSD_MACHINE_PTRACE_H_ -#if defined (__ppc__) || defined(__ppc64__) -#include "ppc/ptrace.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/ptrace.h" #else #error architecture not 
supported diff --git a/bsd/machine/reboot.h b/bsd/machine/reboot.h index cf91c27da..864f1970c 100644 --- a/bsd/machine/reboot.h +++ b/bsd/machine/reboot.h @@ -28,9 +28,7 @@ #ifndef _BSD_MACHINE_REBOOT_H_ #define _BSD_MACHINE_REBOOT_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/reboot.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/reboot.h" #else #error architecture not supported diff --git a/bsd/machine/reg.h b/bsd/machine/reg.h index 95ec0f7d1..30e5dc524 100644 --- a/bsd/machine/reg.h +++ b/bsd/machine/reg.h @@ -28,9 +28,7 @@ #ifndef _BSD_MACHINE_REG_H_ #define _BSD_MACHINE_REG_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/reg.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/reg.h" #else #error architecture not supported diff --git a/bsd/machine/setjmp.h b/bsd/machine/setjmp.h index 4f37be1d3..262acfbc8 100644 --- a/bsd/machine/setjmp.h +++ b/bsd/machine/setjmp.h @@ -31,9 +31,7 @@ #ifndef _MACHINE_SETJMP_H_ #define _MACHINE_SETJMP_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/setjmp.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/setjmp.h" #else #error architecture not supported diff --git a/bsd/machine/signal.h b/bsd/machine/signal.h index 227d4182e..4b7f69c19 100644 --- a/bsd/machine/signal.h +++ b/bsd/machine/signal.h @@ -28,9 +28,7 @@ #ifndef _BSD_MACHINE_SIGNAL_H_ #define _BSD_MACHINE_SIGNAL_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/signal.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/signal.h" #else #error architecture not supported diff --git a/bsd/machine/types.h b/bsd/machine/types.h index ed113ddde..5d6d4db44 100644 --- a/bsd/machine/types.h +++ b/bsd/machine/types.h @@ -31,9 +31,7 @@ #ifndef _BSD_MACHINE_TYPES_H_ 
#define _BSD_MACHINE_TYPES_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/types.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/types.h" #else #error architecture not supported diff --git a/bsd/machine/ucontext.h b/bsd/machine/ucontext.h index a0e91489a..60e157643 100644 --- a/bsd/machine/ucontext.h +++ b/bsd/machine/ucontext.h @@ -28,9 +28,7 @@ #ifndef _MACHINE_UCONTEXT_H_ #define _MACHINE_UCONTEXT_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/ucontext.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/ucontext.h" #else #error architecture not supported diff --git a/bsd/machine/vmparam.h b/bsd/machine/vmparam.h index 8911ea054..54b212382 100644 --- a/bsd/machine/vmparam.h +++ b/bsd/machine/vmparam.h @@ -28,9 +28,7 @@ #ifndef _BSD_MACHINE_VMPARAM_H_ #define _BSD_MACHINE_VMPARAM_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/vmparam.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/vmparam.h" #else #error architecture not supported diff --git a/bsd/man/man2/Makefile b/bsd/man/man2/Makefile index 12cc26329..93d247a32 100644 --- a/bsd/man/man2/Makefile +++ b/bsd/man/man2/Makefile @@ -67,6 +67,7 @@ DATAFILES = \ getauid.2 \ getdirentries.2 \ getdirentriesattr.2 \ + getdtablesize.2 \ getegid.2 \ geteuid.2 \ getfh.2 \ @@ -145,6 +146,11 @@ DATAFILES = \ rmdir.2 \ searchfs.2 \ select.2 \ + sem_close.2 \ + sem_open.2 \ + sem_post.2 \ + sem_unlink.2 \ + sem_wait.2 \ semctl.2 \ semget.2 \ semop.2 \ @@ -166,12 +172,16 @@ DATAFILES = \ setpgid.2 \ setpgrp.2 \ setpriority.2 \ + setregid.2 \ + setreuid.2 \ setrlimit.2 \ setsid.2 \ setsockopt.2 \ settimeofday.2 \ setuid.2 \ setxattr.2 \ + shm_open.2 \ + shm_unlink.2 \ shmat.2 \ shmctl.2 \ shmdt.2 \ @@ -194,6 +204,7 @@ DATAFILES = \ syscall.2 \ truncate.2 \ umask.2 \ + undelete.2 \ unlink.2 
\ unmount.2 \ utimes.2 \ @@ -207,7 +218,8 @@ DATAFILES = \ posix_spawn.2 INSTALL_MAN_LINKS = \ - posix_spawn.2 posix_spawnp.2 + posix_spawn.2 posix_spawnp.2 \ + sem_wait.2 sem_trywait.2 INSTALL_MAN_LIST = ${DATAFILES} diff --git a/bsd/man/man2/auditon.2 b/bsd/man/man2/auditon.2 index bf37e6ab4..4d551ba7a 100644 --- a/bsd/man/man2/auditon.2 +++ b/bsd/man/man2/auditon.2 @@ -243,6 +243,15 @@ structure with the field set to the maximum audit log file size. A value of 0 indicates no limit to the size. +.It Dv A_SETSFLAGS +Set the audit sessions flags for the current session. +The +.Fa data +argument must point to an +.Vt au_asflgs_t +value containing the new audit session flags. +Audit session flags may be updated only according to local +access control policy. .It Dv A_GETCLASS Return the event to class mapping for the designated audit event. The @@ -376,6 +385,13 @@ The .Va af_currsz field will be set to the current audit log file size. +.It Dv A_GETSFLAGS +Returns the audit session flags for the current session. +The +.Fa data +argument must point to an +.Vt au_asflgs_t +value which will be set with the current session flags. .It Dv A_GETCWD .\" [COMMENTED OUT]: Valid description, not yet implemented. .\" Return the current working directory as stored in the audit subsystem. diff --git a/bsd/man/man2/dup.2 b/bsd/man/man2/dup.2 index c13ca0bb5..897966a52 100644 --- a/bsd/man/man2/dup.2 +++ b/bsd/man/man2/dup.2 @@ -33,7 +33,7 @@ .\" .\" @(#)dup.2 8.1 (Berkeley) 6/4/93 .\" -.Dd June 4, 1993 +.Dd December 1, 2010 .Dt DUP 2 .Os BSD 4 .Sh NAME @@ -100,8 +100,18 @@ In the value of the new descriptor .Fa fildes2 is specified. -If this descriptor is already in use, -the descriptor is first deallocated as if a +If +.Fa fildes +and +.Fa fildes2 +are equal, then +.Fn dup2 +just returns +.Fa fildes2 ; +no other changes are made to the existing descriptor. 
+Otherwise, if descriptor +.Fa fildes2 +is already in use, it is first deallocated as if a .Xr close 2 call had been done first. .Sh RETURN VALUES diff --git a/bsd/man/man2/exchangedata.2 b/bsd/man/man2/exchangedata.2 index cc2111ea4..83dc23c1b 100644 --- a/bsd/man/man2/exchangedata.2 +++ b/bsd/man/man2/exchangedata.2 @@ -24,8 +24,9 @@ .Nd atomically exchange data between two files .Sh SYNOPSIS .Fd #include +.Fd #include .Ft int -.Fn exchangedata "const char * path1" "const char * path2" "unsigned long options" +.Fn exchangedata "const char * path1" "const char * path2" "unsigned int options" . .Sh DESCRIPTION The diff --git a/bsd/man/man2/fcntl.2 b/bsd/man/man2/fcntl.2 index d6d1ce8cd..b53a38be1 100644 --- a/bsd/man/man2/fcntl.2 +++ b/bsd/man/man2/fcntl.2 @@ -1,5 +1,5 @@ .\" -.\" Copyright (c) 2008 Apple Inc. All rights reserved. +.\" Copyright (c) 2011 Apple Inc. All rights reserved. .\" .\" @APPLE_LICENSE_HEADER_START@ .\" @@ -56,7 +56,7 @@ .\" .\" @(#)fcntl.2 8.2 (Berkeley) 1/12/94 .\" -.Dd October 2, 2008 +.Dd February 17, 2011 .Dt FCNTL 2 .Os BSD 4.2 .Sh NAME @@ -98,26 +98,24 @@ Same file status flags (i.e., both file descriptors share the same file status flags). .It The close-on-exec flag associated with the new file descriptor -is set to remain open across +is cleared so that the descriptor remains open across an .Xr execv 2 -system calls. +system call. .El +.It Dv F_DUPFD_CLOEXEC +Like +.Dv F_DUPFD , +except that the close-on-exec flag associated with the new file descriptor +is set. .It Dv F_GETFD -Get the close-on-exec flag associated with the file descriptor -.Fa fildes . -If the low-order bit of the returned value is 0, -the file will remain open across -.Fn exec , -otherwise the file will be closed upon execution of -.Fn exec +Get the flags associated with the file descriptor +.Fa fildes , +as described below .Fa ( arg is ignored). 
.It Dv F_SETFD -Set the close-on-exec flag associated with -.Fa fildes -to the low order bit of -.Fa arg -(0 or 1 as above). +Set the file descriptor flags to +.Fa arg . .It Dv F_GETFL Get descriptor status flags, as described below .Fa ( arg @@ -187,6 +185,9 @@ Get disk device information. Currently this only includes the disk device address that corresponds to the current file offset. +.It Dv F_LOG2PHYS_EXT +Variant of F_LOG2PHYS that uses the passed in +file offset and length. .It Dv F_FULLFSYNC Does the same thing as .Xr fsync 2 @@ -200,6 +201,43 @@ and Universal Disk Format (UDF) file systems. The operation may take quite a while to complete. Certain FireWire drives have also been known to ignore the request to flush their buffered data. +.It Dv F_SETNOSIGPIPE +Determines whether a +.Dv SIGPIPE +signal will be generated when a write fails on a pipe or socket for +which there is no reader. If +.Fa arg +is non-zero, +.Dv SIGPIPE +generation is disabled for descriptor +.Fa fildes , +while an +.Fa arg +of zero enables it (the default). +.It Dv F_GETNOSIGPIPE +Returns whether a +.Dv SIGPIPE +signal will be generated when a write fails on a pipe or socket +for which there is no reader. The semantics of the return value +match those of the +.Fa arg +of +.Dv F_SETNOSIGPIPE . +.El +.Pp +The flags for the +.Dv F_GETFD +and +.Dv F_SETFD +commands are as follows: +.Bl -tag -width FD_CLOEXECX -offset indent +.It Dv FD_CLOEXEC +Close-on-exec; the given file descriptor will be automatically +closed in the successor process image when one of the +.Xr execv 2 +or +.Xr posix_spawn 2 +family of system calls is invoked. .El .Pp The flags for the @@ -476,15 +514,43 @@ commands operate on the following structure. .Pp The .Dv F_LOG2PHYS -command operates on the following structure. 
+command operates on the following structure: .ne 7v .Bd -literal struct log2phys { - u_int32_t l2p_flags; /* unused so far */ - off_t l2p_contigbytes; /* unused so far */ - off_t l2p_devoffset; /* bytes into device */ + u_int32_t l2p_flags; /* unused so far */ + off_t l2p_contigbytes; /* unused so far */ + off_t l2p_devoffset; /* bytes into device */ }; .Ed +.Pp +The +.Dv F_LOG2PHYS_EXT +command operates on the same structure as F_LOG2PHYS but treats it as an in/out: +.ne 7v +.Bd -literal + struct log2phys { + u_int32_t l2p_flags; /* unused so far */ + off_t l2p_contigbytes; /* IN: number of bytes to be queried; + OUT: number of contiguous bytes allocated at this position */ + off_t l2p_devoffset; /* IN: bytes into file; + OUT: bytes into device */ + }; +.Ed +.Pp +If +.Fa fildes +is a socket, then the +.Dv F_SETNOSIGPIPE +and +.Dv F_GETNOSIGPIPE +commands are directly analogous, and fully interoperate with the +.Dv SO_NOSIGPIPE +option of +.Xr setsockopt 2 +and +.Xr getsockopt 2 +respectively. .Sh RETURN VALUES Upon successful completion, the value returned depends on .Fa cmd @@ -579,6 +645,8 @@ The argument .Fa cmd is .Dv F_LOG2PHYS +or +.Dv F_LOG2PHYS_EXT and .Fa fildes is not a valid file descriptor open for reading. @@ -696,6 +764,9 @@ the process ID given as argument is not in use. .Xr flock 2 , .Xr getdtablesize 2 , .Xr open 2 , +.Xr pipe 2 , +.Xr socket 2 , +.Xr setsockopt 2 , .Xr sigaction 3 .Sh HISTORY The diff --git a/bsd/man/man2/getattrlist.2 b/bsd/man/man2/getattrlist.2 index 856f1a110..e2af8fc60 100644 --- a/bsd/man/man2/getattrlist.2 +++ b/bsd/man/man2/getattrlist.2 @@ -354,8 +354,11 @@ An structure containing the name of the file system object as UTF-8 encoded, null terminated C string. The attribute data length will not be greater than -.Dv NAME_MAX + -1. +.Dv NAME_MAX ++ 1 characters, which is +.Dv NAME_MAX +* 3 + 1 bytes (as one UTF-8-encoded character may +take up to three bytes). .Pp . 
.It ATTR_CMN_DEVID @@ -570,6 +573,11 @@ field of the .Vt stat structure returned by .Xr stat 2 . +Only the permission bits of +.Fa st_mode +are valid; other bits should be ignored, +e.g., by masking with +.Dv ~S_IFMT . . .It ATTR_CMN_NAMEDATTRCOUNT A @@ -665,6 +673,13 @@ The attribute data length will not be greater than Inconsistent behavior may be observed when this attribute is requested on hard-linked items, particularly when the file system does not support ATTR_CMN_PARENTID natively. Callers should be aware of this when requesting the full path of a hard-linked item. +. +.It ATTR_CMN_ADDEDTIME +A +.Vt timespec +that contains the time that the file system object was created or renamed into +its containing directory. Note that inconsistent behavior may be observed +when this attribute is requested on hard-linked items. .Pp . .El @@ -1288,6 +1303,13 @@ that did not support them. .Pp Introduced with Darwin 10.0 (Mac OS X version 10.6). . +.It VOL_CAP_FMT_64BIT_OBJECT_IDS +If this bit is set, the volume format uses object IDs that are 64-bit. +This means that ATTR_CMN_FILEID and ATTR_CMN_PARENTID are the only +legitimate attributes for obtaining object IDs from this volume and the +32-bit fid_objno fields of the fsobj_id_t returned by ATTR_CMN_OBJID, +ATTR_CMN_OBJPERMANENTID, and ATTR_CMN_PAROBJID are undefined. +. .El .Pp . @@ -1602,6 +1624,10 @@ structure is 64-bits (two 32-bit elements) in 32-bit code, and 128-bits (two 64-bit elements) in 64-bit code; however, it is aligned on a 4-byte (32-bit) boundary, even in 64-bit code. .Pp +If you use a structure +for the attribute data, it must be correctly packed and aligned (see +examples). +.Pp .
Inconsistent behavior may be observed when the ATTR_CMN_FULLPATH attribute is requested on hard-linked items, particularly when the file system does not support ATTR_CMN_PARENTID @@ -1633,7 +1659,7 @@ struct FInfoAttrBuf { u_int32_t length; fsobj_type_t objType; char finderInfo[32]; -}; +} __attribute__((aligned(4), packed)); typedef struct FInfoAttrBuf FInfoAttrBuf; .Pp . @@ -1700,14 +1726,14 @@ typedef struct attrlist attrlist_t; struct FInfo2CommonAttrBuf { fsobj_type_t objType; char finderInfo[32]; -}; +} __attribute__((aligned(4), packed)); typedef struct FInfo2CommonAttrBuf FInfo2CommonAttrBuf; .Pp . struct FInfo2AttrBuf { u_int32_t length; FInfo2CommonAttrBuf common; -}; +} __attribute__((aligned(4), packed)); typedef struct FInfo2AttrBuf FInfo2AttrBuf; .Pp . @@ -1790,7 +1816,7 @@ struct VolAttrBuf { attrreference_t volNameRef; char mountPointSpace[MAXPATHLEN]; char volNameSpace[MAXPATHLEN]; -}; +} __attribute__((aligned(4), packed)); typedef struct VolAttrBuf VolAttrBuf; .Pp . @@ -1843,6 +1869,53 @@ static int VolDemo(const char *path) } .Ed .Pp +The following sample demonstrates the need to use packing and alignment +controls; without the attribute, in 64-bit code, the fields of the structure are not +placed at the locations that the kernel expects. +. 
+.Bd -literal +#include +#include +#include +#include +#include +#include +#include +.Pp +/* The alignment and packing attribute is necessary in 64-bit code */ +struct AttrListTimes { + u_int32_t length; + struct timespec st_crtime; + struct timespec st_modtime; +} __attribute__((aligned(4), packed)); +.Pp +main(int argc, char **argv) +{ + int rv; + int i; +.Pp + for (i = 1; i < argc; i++) { + struct attrlist attrList; + struct AttrListTimes myStat = {0}; + char *path = argv[i]; +.Pp + memset(&attrList, 0, sizeof(attrList)); + attrList.bitmapcount = ATTR_BIT_MAP_COUNT; + attrList.commonattr = ATTR_CMN_CRTIME | + ATTR_CMN_MODTIME; +.Pp + rv = getattrlist(path, &attrList, &myStat, sizeof(myStat), 0); +.Pp + if (rv == -1) { + warn("getattrlist(%s)", path); + continue; + } + printf("%s: Modification time = %s", argv[i], ctime(&myStat.st_modtime.tv_sec)); + } + return 0; +} +.Ed +.Pp . .Sh SEE ALSO . diff --git a/bsd/man/man2/getaudit.2 b/bsd/man/man2/getaudit.2 index 10f84aaf6..d2895cd33 100644 --- a/bsd/man/man2/getaudit.2 +++ b/bsd/man/man2/getaudit.2 @@ -59,7 +59,6 @@ The data structure is defined as follows: .nf .in +4n - struct auditinfo { au_id_t ai_auid; /* Audit user ID */ au_mask_t ai_mask; /* Audit masks */ @@ -74,15 +73,13 @@ The .Fa ai_auid variable contains the audit identifier which is recorded in the audit log for each event the process caused. -.PP - +.Pp The .Fa au_mask_t data structure defines the bit mask for auditing successful and failed events out of the predefined list of event classes. It is defined as follows: .nf .in +4n - struct au_mask { unsigned int am_success; /* success bits */ unsigned int am_failure; /* failure bits */ @@ -90,15 +87,13 @@ struct au_mask { typedef struct au_mask au_mask_t; .in .fi -.PP - +.Pp The .Fa au_termid_t data structure defines the Terminal ID recorded with every event caused by the process. 
It is defined as follows: .nf .in +4n - struct au_tid { dev_t port; u_int32_t machine; @@ -106,8 +101,7 @@ struct au_tid { typedef struct au_tid au_tid_t; .in .fi -.PP - +.Pp The .Fa ai_asid variable contains the audit session ID which is recorded with every event @@ -122,7 +116,6 @@ data structure supports Terminal IDs with larger addresses such as those used in IP version 6. It is defined as follows: .nf .in +4n - struct auditinfo_addr { au_id_t ai_auid; /* Audit user ID. */ au_mask_t ai_mask; /* Audit masks. */ @@ -134,14 +127,12 @@ typedef struct auditinfo_addr auditinfo_addr_t; .in .fi .Pp - The .Fa au_tid_addr_t data structure which includes a larger address storage field and an additional field with the type of address stored: .nf .in +4n - struct au_tid_addr { dev_t at_port; u_int32_t at_type; diff --git a/bsd/man/man2/getauid.2 b/bsd/man/man2/getauid.2 index a89d98aae..373deb2a0 100644 --- a/bsd/man/man2/getauid.2 +++ b/bsd/man/man2/getauid.2 @@ -25,7 +25,7 @@ .Os .Sh NAME .Nm getauid -.Nd "retrieve audit session ID" +.Nd "retrieve audit user ID" .Sh SYNOPSIS .In bsm/audit.h .Ft int @@ -34,7 +34,7 @@ The .Fn getauid system call -retrieves the active audit session ID for the current process via the +retrieves the active audit user ID for the current process via the .Vt au_id_t pointed to by .Fa auid . diff --git a/bsd/man/man2/getdirentries.2 b/bsd/man/man2/getdirentries.2 index a77a5d8b6..a513ea8e8 100644 --- a/bsd/man/man2/getdirentries.2 +++ b/bsd/man/man2/getdirentries.2 @@ -67,14 +67,8 @@ with buffers smaller than this size. 
.Pp The data in the buffer is a series of .Em dirent -structures each containing the following entries: -.Bd -literal -offset indent -u_int32_t d_fileno; /* file number of entry */ -u_int16_t d_reclen; /* length of this record */ -u_int8_t d_type; /* file type, see below */ -u_int8_t d_namlen; /* length of string in d_name */ -char d_name[MAXNAMELEN + 1]; /* see below */ -.Ed +structures (see +.Xr dir 5 ) .Pp The .Fa d_fileno @@ -166,7 +160,11 @@ will not work with 64-bit inodes; in order to use .Fn getdirentries , .Dv _DARWIN_NO_64_BIT_INODE -must be defined. +must be defined. See +.Xr stat 2 +for more information on +.Dv _DARWIN_NO_64_BIT_INODE +and its other effects. .Sh RETURN VALUES If successful, the number of bytes actually transferred is returned. Otherwise, -1 is returned and the global variable @@ -193,8 +191,10 @@ error occurred while reading from or writing to the file system. .Sh SEE ALSO .Xr lseek 2 , .Xr open 2 , +.Xr stat 2 , .Xr opendir 3 , -.Xr readdir 3 +.Xr readdir 3 , +.Xr dir 5 .Sh HISTORY The .Fn getdirentries diff --git a/bsd/man/man2/getdirentriesattr.2 b/bsd/man/man2/getdirentriesattr.2 index a2cc333ff..78a839766 100644 --- a/bsd/man/man2/getdirentriesattr.2 +++ b/bsd/man/man2/getdirentriesattr.2 @@ -163,7 +163,7 @@ However, since the variable is too small to hold an .Vt off_t , you should use .Xr lseek 2 -to get the directoy's current position instead of using this parameter. +to get the directory's current position instead of using this parameter. The initial value of the variable is ignored. .Pp . @@ -245,6 +245,16 @@ If you're implementing a volume format that supports .Fn getdirentriesattr , you should be careful to support the behaviour specified by this document. . +.Pp +If the directory contains a mount point, then +.Dv DIR_MNTSTATUS_MNTPOINT +will be set in the +.Dv ATTR_DIR_MOUNTSTATUS +for that entry; all other attributes for that entry, however, +will be for the underlying file system (as opposed to the mounted +file system). 
+.Xr getattrlist 2 +should be used to get the attributes for the mount point. .Sh ERRORS .Fn getdirentriesattr will fail if: @@ -315,7 +325,8 @@ struct FInfoAttrBuf { attrreference_t name; fsobj_type_t objType; char finderInfo[32]; -}; + u_int32_t dirStatus; +} __attribute__((aligned(4), packed)); typedef struct FInfoAttrBuf FInfoAttrBuf; .Pp . @@ -358,6 +369,7 @@ static int FInfoDemo(const char *dirPath) attrList.commonattr = ATTR_CMN_NAME | ATTR_CMN_OBJTYPE | ATTR_CMN_FNDRINFO; + attrList.dirattr = ATTR_DIR_MOUNTSTATUS; .Pp err = 0; @@ -411,7 +423,10 @@ static int FInfoDemo(const char *dirPath) ); break; case VDIR: - printf("directory "); + if (thisEntry->dirStatus & DIR_MNTSTATUS_MNTPOINT) + printf("mount-point "); + else + printf("directory "); break; default: printf( @@ -428,7 +443,7 @@ static int FInfoDemo(const char *dirPath) .Pp // Advance to the next entry. .Pp - ((char *) thisEntry) += thisEntry->length; + thisEntry = (FInfoAttrBuf*)((char*)thisEntry + thisEntry->length); } } } while ( err == 0 && ! done ); diff --git a/bsd/man/man2/getdtablesize.2 b/bsd/man/man2/getdtablesize.2 new file mode 100644 index 000000000..7465f9aeb --- /dev/null +++ b/bsd/man/man2/getdtablesize.2 @@ -0,0 +1,63 @@ +.\" Copyright (c) 1983, 1991, 1993 +.\" The Regents of the University of California. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 3. 
All advertising materials mentioning features or use of this software +.\" must display the following acknowledgement: +.\" This product includes software developed by the University of +.\" California, Berkeley and its contributors. +.\" 4. Neither the name of the University nor the names of its contributors +.\" may be used to endorse or promote products derived from this software +.\" without specific prior written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" @(#)getdtablesize.2 8.1 (Berkeley) 6/4/93 +.\" $FreeBSD: src/lib/libc/sys/getdtablesize.2,v 1.4.2.3 2001/12/14 18:34:00 ru Exp $ +.\" +.Dd June 4, 1993 +.Dt GETDTABLESIZE 2 +.Os +.Sh NAME +.Nm getdtablesize +.Nd get descriptor table size +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In unistd.h +.Ft int +.Fn getdtablesize void +.Sh DESCRIPTION +Each process has a fixed size descriptor table, +which is guaranteed to have at least 20 slots. The entries in +the descriptor table are numbered with small integers starting at 0. +The call +.Fn getdtablesize +returns the size of this table. +.Sh SEE ALSO +.Xr close 2 , +.Xr dup 2 , +.Xr open 2 , +.Xr select 2 +.Sh HISTORY +The +.Fn getdtablesize +function call appeared in +.Bx 4.2 . 
diff --git a/bsd/man/man2/getfsstat.2 b/bsd/man/man2/getfsstat.2 index 47e792b60..99e2abaf6 100644 --- a/bsd/man/man2/getfsstat.2 +++ b/bsd/man/man2/getfsstat.2 @@ -56,111 +56,15 @@ function returns information about all mounted file systems. The .Fa buf argument is a pointer to an array of -.Xr statfs -structures. -.Pp -As of Mac OS X 10.6, the default size of the -.Ft ino_t -type is 64 bits (the macro -.Dv _DARWIN_FEATURE_64_BIT_INODE -will be defined). -While there is no -.Ft ino_t -type used in the -.Xr statfs -structure, the changes to -.Fn getfsstat -are grouped together with the 64-bit inode changes. -The string fields in the -.Xr statfs -structure are larger and the variant symbol -.Li _getfsstat$INODE64 -will be automatically used. -The -.Xr statfs -structure is defined as: -.Bd -literal -typedef struct { int32_t val[2]; } fsid_t; - -#define MFSTYPENAMELEN 16 /* length of fs type name including null */ -#define MAXPATHLEN 1024 -#define MNAMELEN MAXPATHLEN - -struct statfs { /* when _DARWIN_FEATURE_64_BIT_INODE is defined */ - uint32_t f_bsize; /* fundamental file system block size */ - int32_t f_iosize; /* optimal transfer block size */ - uint64_t f_blocks; /* total data blocks in file system */ - uint64_t f_bfree; /* free blocks in fs */ - uint64_t f_bavail; /* free blocks avail to non-superuser */ - uint64_t f_files; /* total file nodes in file system */ - uint64_t f_ffree; /* free file nodes in fs */ - fsid_t f_fsid; /* file system id */ - uid_t f_owner; /* user that mounted the filesystem */ - uint32_t f_type; /* type of filesystem */ - uint32_t f_flags; /* copy of mount exported flags */ - uint32_t f_fssubtype; /* fs sub-type (flavor) */ - char f_fstypename[MFSTYPENAMELEN]; /* fs type name */ - char f_mntonname[MAXPATHLEN]; /* directory on which mounted */ - char f_mntfromname[MAXPATHLEN]; /* mounted filesystem */ - uint32_t f_reserved[8]; /* For future use */ -}; -.Ed -.Pp -(In 10.5, 64-bit -.Ft ino_t , -larger -.Xr statfs -structure and variant 
symbol were available if the macro -.Dv _DARWIN_USE_64_BIT_INODE -is defined before any header files are included; this macro is optional in -10.6.) -.Pp -If the macro -.Dv _DARWIN_NO_64_BIT_INODE -is defined before any header files are included, or if the deployment target -is less than 10.6, the legacy -.Xr statfs -structure will be in effect. -The -.Ft ino_t -type will be 32 bits (the -.Dv _DARWIN_FEATURE_64_BIT_INODE -macro will not be defined), the strings in the -.Xr statfs -structure will be their smaller legacy size (and long mount paths may no longer -fit) and the undecorated symbol -.Li _getfsstat -will be used. -This legacy .Fa statfs -structure is defined as: -.Bd -literal -#define MFSNAMELEN 15 /* length of fs type name, not inc. nul */ -#define MNAMELEN 90 /* length of buffer for returned name */ - -struct statfs { /* when _DARWIN_FEATURE_64_BIT_INODE is NOT defined */ - short f_otype; /* type of file system (reserved: zero) */ - short f_oflags; /* copy of mount flags (reserved: zero) */ - long f_bsize; /* fundamental file system block size */ - long f_iosize; /* optimal transfer block size */ - long f_blocks; /* total data blocks in file system */ - long f_bfree; /* free blocks in fs */ - long f_bavail; /* free blocks avail to non-superuser */ - long f_files; /* total file nodes in file system */ - long f_ffree; /* free file nodes in fs */ - fsid_t f_fsid; /* file system id */ - uid_t f_owner; /* user that mounted the file system */ - short f_reserved1; /* reserved for future use */ - short f_type; /* type of file system (reserved) */ - long f_flags; /* copy of mount flags (reserved) */ - long f_reserved2[2]; /* reserved for future use */ - char f_fstypename[MFSNAMELEN]; /* fs type name */ - char f_mntonname[MNAMELEN]; /* directory on which mounted */ - char f_mntfromname[MNAMELEN]; /* mounted file system */ - char f_reserved3; /* reserved for future use */ - long f_reserved4[4]; /* reserved for future use */ -}; -.Ed +structures (see +.Xr statfs 2 ) 
. +As +.Xr statfs 2 +indicates, the structure is defined differently depending on +whether the macro _DARWIN_FEATURE_64_BIT_INODE is defined (see +.Xr stat 2 +for more information on this macro). .Pp Fields that are undefined for a particular file system are set to -1. The buffer is filled with an array of @@ -223,6 +127,7 @@ routine is equivalent to the default is defined), so there is no longer any reason to use it (it will be removed in the future). .Sh SEE ALSO +.Xr stat 2 , .Xr statfs 2 , .Xr fstab 5 , .Xr mount 8 diff --git a/bsd/man/man2/getgroups.2 b/bsd/man/man2/getgroups.2 index f2a9e995d..a941bc389 100644 --- a/bsd/man/man2/getgroups.2 +++ b/bsd/man/man2/getgroups.2 @@ -1,5 +1,5 @@ .\" -.\" Copyright (c) 2008 Apple Inc. All rights reserved. +.\" Copyright (c) 2008, 2010 Apple Inc. All rights reserved. .\" .\" @APPLE_LICENSE_HEADER_START@ .\" @@ -56,7 +56,7 @@ .\" .\" @(#)getgroups.2 8.2 (Berkeley) 4/16/94 .\" -.Dd October 2, 2008 +.Dd September 17, 2010 .Dt GETGROUPS 2 .Os BSD 4.2 .Sh NAME @@ -90,6 +90,28 @@ is 0, returns the number of groups without modifying the .Fa grouplist[] array. +.Pp +To provide compatibility with applications that use +.Fn getgroups +in environments where users may be in more than +.Dv {NGROUPS_MAX} +groups, a variant of +.Fn getgroups , +obtained when compiling with either the macros +.Dv _DARWIN_UNLIMITED_GETGROUPS +or +.Dv _DARWIN_C_SOURCE +defined, can be used that is not limited to +.Dv {NGROUPS_MAX} +groups. +However, this variant only returns the user's default group access list and +not the group list modified by a call to +.Xr setgroups 2 +(either in the current process or an ancestor process). +Use of +.Xr setgroups 2 +is highly discouraged, and there is no foolproof way to determine if it has +been previously called. .Sh RETURN VALUES A successful call returns the number of groups in the group set. 
Otherwise, a value of -1 is returned and the global integer variable @@ -112,12 +134,6 @@ The argument although non-zero, is smaller than the number of groups in the group set. .El -.Sh LEGACY DESCRIPTION -If _DARWIN_C_SOURCE is defined, -.Fn getgroups -can return more than -.Dv {NGROUPS_MAX} -groups. .Sh LEGACY SYNOPSIS .Fd #include .Fd #include diff --git a/bsd/man/man2/gettimeofday.2 b/bsd/man/man2/gettimeofday.2 index 96659f100..a9b300555 100644 --- a/bsd/man/man2/gettimeofday.2 +++ b/bsd/man/man2/gettimeofday.2 @@ -53,7 +53,6 @@ .Fa "const struct timezone *tzp" .Fc .Sh DESCRIPTION -.Ef .Pp The system's notion of the current Greenwich time and the current time zone is obtained with the diff --git a/bsd/man/man2/kqueue.2 b/bsd/man/man2/kqueue.2 index f7a12d523..6ab998c5a 100644 --- a/bsd/man/man2/kqueue.2 +++ b/bsd/man/man2/kqueue.2 @@ -74,7 +74,7 @@ and The .Fn kqueue system call -provides a generic method of notifying the user when an kernel +provides a generic method of notifying the user when a kernel event (kevent) happens or a condition holds, based on the results of small pieces of kernel code termed filters. A kevent is identified by an (ident, filter) pair and specifies @@ -267,7 +267,7 @@ the descriptor. .It EV_RECEIPT This flag is useful for making bulk changes to a kqueue without draining any pending events. When passed as input, it forces EV_ERROR to always be returned. -When a filter is successfully added. The +When a filter is successfully added, the .Va data field will be zero. .It EV_ONESHOT @@ -433,6 +433,8 @@ The events to monitor are: .Bl -tag -width NOTE_SIGNAL .It NOTE_EXIT The process has exited. +.It NOTE_EXITSTATUS +The process has exited and its exit status is in filter specific data. Valid only on child processes and to be used along with NOTE_EXIT. .It NOTE_FORK The process created a child process via .Xr fork 2 @@ -507,42 +509,6 @@ contains the number of times the timeout has expired since the last call to or .Fn kevent64 . 
This filter automatically sets the EV_CLEAR flag internally. -.It EVFILT_SESSION -Takes the audit session ID to monitor as the identifier and the events to watch for in -.Va fflags , -and returns when one or more of the requested session events occurs. -To monitor for events for any audit session the value AS_ANY_ASID -should be used as the identifier. With AS_ANY_ASID, as new audit -sessions are created they are included as if the were added -individually. The events to monitor are: -.Bl -tag -width NOTE_AS_UPDATE -.It NOTE_AS_START -A new audit session has started. -.It NOTE_AS_END -All the processes in the audit session have exited. -.It NOTE_AS_CLOSE -This audit session is no longer valid in the kernel. In other words, it -is now safe to dispose of any cached information about this session or -reuse its session ID for a new audit session. -.It NOTE_AS_UPDATE -The audit session information was updated. The audit session information is -considered immutable once initially set. If this becomes enforced in -the kernel then this event may no longer be needed and may become -obsolete. -.It NOTE_AS_ERR -This flag is returned if the system was unable to attach an event to a -new session when the audit session ID of AS_ANY_ASID -is used. This is usually due to resource limitations. -.El -.Pp -On return, -.Va fflags -contains the events which triggered the filter, -.Va ident -contains the audit session ID, and -.Va data -contains the audit user ID. -This filter automatically sets the EV_CLEAR flag internally. .El .Pp ---- diff --git a/bsd/man/man2/madvise.2 b/bsd/man/man2/madvise.2 index a4b4d415d..9f89c32ba 100644 --- a/bsd/man/man2/madvise.2 +++ b/bsd/man/man2/madvise.2 @@ -126,7 +126,7 @@ This is used with system call. 
.It Dv MADV_ZERO_WIRED_PAGES Indicates that the application would like the wired pages in this address -range to be zeroed out if the address range is dellocated without first +range to be zeroed out if the address range is deallocated without first unwiring the pages (i.e. a munmap(2) without a preceding munlock(2) or the application quits). This is used with diff --git a/bsd/man/man2/mmap.2 b/bsd/man/man2/mmap.2 index af9de8c04..b55d054e1 100644 --- a/bsd/man/man2/mmap.2 +++ b/bsd/man/man2/mmap.2 @@ -148,6 +148,15 @@ VM_MAKE_TAG(tag) to associate an 8-bit tag with the region <mach/vm_statistics.h> defines some preset tags (with a VM_MEMORY_ prefix). Users are encouraged to use tags between 240 and 255. Tags are used by tools such as vmmap(1) to help identify specific memory regions. +.Pp +VM_FLAGS_SUPERPAGE_SIZE_* to use superpages for the allocation. +See <mach/vm_statistics.h> for supported architectures and sizes (or use +VM_FLAGS_SUPERPAGE_SIZE_ANY to have the kernel choose a size). +The specified size must be divisible by the superpage size (except for +VM_FLAGS_SUPERPAGE_SIZE_ANY), and if you use MAP_FIXED, the specified address +must be properly aligned. If the system cannot satisfy the request with superpages, +the call will fail. Note that currently, superpages are always wired and not +inherited by children of the process. .It Dv MAP_FILE Mapped from a regular file. (This is the default mapping type, and need not be specified.) diff --git a/bsd/man/man2/open.2 b/bsd/man/man2/open.2 index 2d121402d..80c293626 100644 --- a/bsd/man/man2/open.2 +++ b/bsd/man/man2/open.2 @@ -1,5 +1,5 @@ .\" -.\" Copyright (c) 2008 Apple Inc. All rights reserved. +.\" Copyright (c) 2010 Apple Inc. All rights reserved. 
.\" .\" @APPLE_LICENSE_HEADER_START@ .\" @@ -56,7 +56,7 @@ .\" .\" @(#)open.2 8.2 (Berkeley) 11/16/93 .\" -.Dd October 7, 2008 +.Dd November 10, 2010 .Dt OPEN 2 .Os BSD 4 .Sh NAME @@ -114,6 +114,7 @@ O_EXLOCK atomically obtain an exclusive lock O_NOFOLLOW do not follow symlinks O_SYMLINK allow open of symlinks O_EVTONLY descriptor requested for event notifications only +O_CLOEXEC mark as close-on-exec .Ed .Pp Opening a file with @@ -133,7 +134,9 @@ returns an error. This may be used to implement a simple exclusive-access locking mechanism. If .Dv O_EXCL -is set and the last component of the pathname is a symbolic link, +is set with +.Dv O_CREAT +and the last component of the pathname is a symbolic link, .Fn open will fail even if the symbolic link points to a non-existent name. .Pp @@ -184,6 +187,15 @@ flag is only intended for monitoring a file for changes (e.g. kqueue). Note: whe this flag is used, the opened file will not prevent an unmount of the volume that contains the file. .Pp +The +.Dv O_CLOEXEC +flag causes the file descriptor to be marked as close-on-exec, +setting the +.Dv FD_CLOEXEC +flag. The state of the file descriptor flags can be inspected +using the F_GETFD fcntl. See +.Xr fcntl 2 . +.Pp If successful, .Fn open returns a non-negative integer, termed a file descriptor. diff --git a/bsd/man/man2/pathconf.2 b/bsd/man/man2/pathconf.2 index afe640327..9743384ce 100644 --- a/bsd/man/man2/pathconf.2 +++ b/bsd/man/man2/pathconf.2 @@ -103,6 +103,16 @@ system call, otherwise 0. Return 1 if file names longer than KERN_NAME_MAX are truncated. .It Li _PC_VDISABLE Returns the terminal character disabling value. +.It Li _PC_XATTR_SIZE_BITS +Returns the number of bits used to store maximum extended +attribute size in bytes. For example, if the maximum +attribute size supported by a file system is 128K, the +value returned will be 18. However a value 18 can mean +that the maximum attribute size can be anywhere from +(256KB - 1) to 128KB. 
As a special case, the resource +fork can have much larger size, and some file system +specific extended attributes can have smaller and preset +size; for example, Finder Info is always 32 bytes. .El .Sh RETURN VALUES If the call to diff --git a/bsd/man/man2/pipe.2 b/bsd/man/man2/pipe.2 index 03f12c196..df5b9d85c 100644 --- a/bsd/man/man2/pipe.2 +++ b/bsd/man/man2/pipe.2 @@ -33,7 +33,7 @@ .\" .\" @(#)pipe.2 8.1 (Berkeley) 6/4/93 .\" -.Dd June 4, 1993 +.Dd February 17, 2011 .Dt PIPE 2 .Os BSD 4 .Sh NAME @@ -82,6 +82,12 @@ signal. Widowing a pipe is the only way to deliver end-of-file to a reader: after the reader consumes any buffered data, reading a widowed pipe returns a zero count. +.Pp +The generation of the +.Dv SIGPIPE +signal can be suppressed using the +.Dv F_SETNOSIGPIPE +fcntl command. .Sh RETURN VALUES On successful creation of the pipe, zero is returned. Otherwise, a value of -1 is returned and the variable @@ -111,6 +117,7 @@ The system file table is full. .Xr fork 2 , .Xr read 2 , .Xr socketpair 2 , +.Xr fcntl 2 , .Xr write 2 .Sh HISTORY A diff --git a/bsd/man/man2/posix_spawn.2 b/bsd/man/man2/posix_spawn.2 index 76bfa055b..6a940d5c1 100644 --- a/bsd/man/man2/posix_spawn.2 +++ b/bsd/man/man2/posix_spawn.2 @@ -1,5 +1,5 @@ .\" -.\" Copyright (c) 2000-2007 Apple Inc. All rights reserved. +.\" Copyright (c) 2000-2010 Apple Inc. All rights reserved. .\" .\" @APPLE_OSREFERENCE_LICENSE_HEADER_START@ .\" @@ -27,7 +27,7 @@ .\" .\" @(#)posix_spawn.2 . -.Dd August 9, 2007 +.Dd November 2, 2010 .Dt POSIX_SPAWN 2 .Os "Mac OS X" .Sh NAME @@ -95,7 +95,7 @@ spawned process. The value is undefined in the case of a failure. .Pp The argument .Fa file_actions -is either NULL, or it is a a pointer to a file actions object that was +is either NULL, or it is a pointer to a file actions object that was initialized by a call to .Xr posix_spawn_file_actions_init 3 and represents zero or more file actions. @@ -108,9 +108,12 @@ and .Xr fcntl 2 ) . 
Descriptors that remain open are unaffected by .Fn posix_spawn -unless their behaviour is modified by a file action; see +unless their behaviour is modified by particular spawn flags +or a file action; see +.Xr posix_spawnattr_setflags 3 +and .Xr posix_spawn_file_actions_init 3 -for more information. +for additional information. .Pp The argument .Fa attrp diff --git a/bsd/man/man2/quotactl.2 b/bsd/man/man2/quotactl.2 index c60519313..4a4760dc6 100644 --- a/bsd/man/man2/quotactl.2 +++ b/bsd/man/man2/quotactl.2 @@ -64,7 +64,6 @@ The address of an optional command specific data structure, may be given; its interpretation is discussed below with each command. .Pp -Currently quotas are supported only for the "ffs" and "hfs" filesystems. A command is composed of a primary command (see below) and a command type used to interpret the .Fa id . diff --git a/bsd/man/man2/sem_close.2 b/bsd/man/man2/sem_close.2 new file mode 100644 index 000000000..cdff87c7c --- /dev/null +++ b/bsd/man/man2/sem_close.2 @@ -0,0 +1,60 @@ +.\" $Darwin$ +.\" +.\" Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. +.\" +.\" @APPLE_LICENSE_HEADER_START@ +.\" +.\" The contents of this file constitute Original Code as defined in and +.\" are subject to the Apple Public Source License Version 1.1 (the +.\" "License"). You may not use this file except in compliance with the +.\" License. Please obtain a copy of the License at +.\" http://www.apple.com/publicsource and read it before using this file. +.\" +.\" This Original Code and all software distributed under the License are +.\" distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER +.\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +.\" INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +.\" FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the +.\" License for the specific language governing rights and limitations +.\" under the License. 
+.\" +.\" @APPLE_LICENSE_HEADER_END@ +.\" +.Dd June 8, 2000 +.Dt SEM_CLOSE 2 +.Os Darwin +.Sh NAME +.Nm sem_close +.Nd close a named semaphore +.Sh SYNOPSIS +.Fd #include +.Ft int +.Fn sem_close "sem_t *sem" +.Sh DESCRIPTION +The system resources associated with the named semaphore referenced by +.Fa sem +are deallocated and the descriptor is invalidated. +.Pp +If successful, +.Fn sem_close +will return 0. Otherwise, -1 is returned and +.Va errno +is set. +.Sh ERRORS +.Fn sem_close +succeeds unless: +.Bl -tag -width Er +.It Bq Er EINVAL +.Fa sem +is not a valid semaphore descriptor. +.El +.Sh SEE ALSO +.Xr sem_init 2 , +.Xr sem_open 2 , +.Xr sem_unlink 2 , +.Xr semctl 2 , +.Xr semget 2 , +.Xr semop 2 +.Sh HISTORY +.Fn sem_close +is specified in the POSIX Realtime Extension (1003.1b-1993/1003.1i-1995). diff --git a/bsd/man/man2/sem_open.2 b/bsd/man/man2/sem_open.2 new file mode 100644 index 000000000..423e98ae4 --- /dev/null +++ b/bsd/man/man2/sem_open.2 @@ -0,0 +1,169 @@ +.\" $Darwin$ +.\" +.\" Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. +.\" +.\" @APPLE_LICENSE_HEADER_START@ +.\" +.\" The contents of this file constitute Original Code as defined in and +.\" are subject to the Apple Public Source License Version 1.1 (the +.\" "License"). You may not use this file except in compliance with the +.\" License. Please obtain a copy of the License at +.\" http://www.apple.com/publicsource and read it before using this file. +.\" +.\" This Original Code and all software distributed under the License are +.\" distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER +.\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +.\" INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +.\" FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the +.\" License for the specific language governing rights and limitations +.\" under the License. 
+.\" +.\" @APPLE_LICENSE_HEADER_END@ +.\" +.Dd June 8, 2000 +.Dt SEM_OPEN 2 +.Os Darwin +.Sh NAME +.Nm sem_open +.Nd initialize and open a named semaphore +.Sh SYNOPSIS +.Fd #include +.Ft sem_t * +.Fo sem_open +.Fa "const char *name" +.Fa "int oflag" +.Fa "..." +.Fc +.Pp +The parameters "mode_t mode" and "unsigned int value" +are optional. +.Sh DESCRIPTION +The named semaphore named +.Fa name +is initialized and opened as specified by the argument +.Fa oflag +and a semaphore descriptor is returned to the calling process. +.Pp +The value of +.Fa oflag +is formed by +.Em or Ns 'ing +the following values: +.Pp +.Bd -literal -offset indent -compact +O_CREAT create the semaphore if it does not exist +O_EXCL error if create and semaphore exists +.Ed +.Pp +If +.Dv O_CREAT +is specified, +.Fn sem_open +requires an additional two arguments. +.Fa mode +specifies the permissions for the semaphore as described in +.Xr chmod 2 +and modified by the process' umask value (see +.Xr umask 2 ) . +The semaphore is created with an initial +.Fa value , +which must be less than or equal to +.Dv SEM_VALUE_MAX . +.Pp +If +.Dv O_EXCL +is specified and the semaphore exists, +.Fn sem_open +fails. The check for the existence of the semaphore and the creation +of the semaphore are atomic with respect to all processes calling +.Fn sem_open +with +.Dv O_CREAT +and +.Dv O_EXCL +set. +.Pp +When a new semaphore is created, it is given the user ID and group ID +which correspond to the effective user and group IDs of the calling +process. There is no visible entry in the file system for the created +object in this implementation. +.Pp +The returned semaphore descriptor is available to the calling process +until it is closed with +.Fn sem_close , +or until the caller exits or execs. 
+.Pp +If a process makes repeated calls to +.Fn sem_open , +with the same +.Fa name +argument, the same descriptor is returned for each successful call, +unless +.Fn sem_unlink +has been called on the semaphore in the interim. +.Pp +If +.Fn sem_open +fails for any reason, it will return a value of +.Dv SEM_FAILED +and set +.Va errno . +On success, it returns a semaphore descriptor. +.Sh ERRORS +The named semaphore is opened unless: +.Bl -tag -width Er +.It Bq Er EACCES +The required permissions (for reading and/or writing) +are denied for the given flags; or +.Dv O_CREAT +is specified, the object does not exist, and permission to +create the semaphore is denied. +.It Bq Er EEXIST +.Dv O_CREAT +and +.Dv O_EXCL +were specified and the semaphore exists. +.It Bq Er EINTR +The +.Fn sem_open +operation was interrupted by a signal. +.It Bq Er EINVAL +The +.Fn sem_open +operation is not supported; or +.Dv O_CREAT +is specified and +.Fa value +exceeds +.Dv SEM_VALUE_MAX . +.It Bq Er EMFILE +The process has already reached its limit for semaphores or file +descriptors in use. +.It Bq Er ENAMETOOLONG +.Fa name +exceeded +.Dv SEM_NAME_LEN +characters. +.It Bq Er ENFILE +Too many semaphores or file descriptors are open on the system. +.It Bq Er ENOENT +.Dv O_CREAT +is not set and the named semaphore does not exist. +.It Bq Er ENOSPC +.Dv O_CREAT +is specified, the semaphore does not exist, and there is insufficient +space available to create the semaphore. +.El +.Sh SEE ALSO +.Xr sem_close 2 , +.Xr sem_post 2 , +.Xr sem_trywait 2 , +.Xr sem_unlink 2 , +.Xr sem_wait 2 , +.Xr semctl 2 , +.Xr semget 2 , +.Xr semop 2 , +.Xr umask 2 +.Sh HISTORY +.Fn sem_open +is specified in the POSIX Realtime Extension (1003.1b-1993/1003.1i-1995). diff --git a/bsd/man/man2/sem_post.2 b/bsd/man/man2/sem_post.2 new file mode 100644 index 000000000..36d06fde8 --- /dev/null +++ b/bsd/man/man2/sem_post.2 @@ -0,0 +1,65 @@ +.\" $Darwin$ +.\" +.\" Copyright (c) 2000-2002 Apple Computer, Inc. 
All rights reserved. +.\" +.\" @APPLE_LICENSE_HEADER_START@ +.\" +.\" The contents of this file constitute Original Code as defined in and +.\" are subject to the Apple Public Source License Version 1.1 (the +.\" "License"). You may not use this file except in compliance with the +.\" License. Please obtain a copy of the License at +.\" http://www.apple.com/publicsource and read it before using this file. +.\" +.\" This Original Code and all software distributed under the License are +.\" distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER +.\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +.\" INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +.\" FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the +.\" License for the specific language governing rights and limitations +.\" under the License. +.\" +.\" @APPLE_LICENSE_HEADER_END@ +.\" +.Dd June 8, 2000 +.Dt SEM_POST 2 +.Os Darwin +.Sh NAME +.Nm sem_post +.Nd unlock a semaphore +.Sh SYNOPSIS +.Fd #include <semaphore.h> +.Ft int +.Fn sem_post "sem_t *sem" +.Sh DESCRIPTION +The semaphore referenced by +.Fa sem +is unlocked, the value of the semaphore is incremented, and all +threads which are waiting on the semaphore are awakened. +.Pp +.Fn sem_post +is reentrant with respect to signals and may be called from within a +signal handler. +.Pp +If successful, +.Fn sem_post +will return 0. Otherwise, -1 is returned and +.Va errno +is set. +.Sh ERRORS +.Fn sem_post +succeeds unless: +.Bl -tag -width Er +.It Bq Er EINVAL +.Fa sem +is not a valid semaphore descriptor. +.El +.Sh SEE ALSO +.Xr sem_open 2 , +.Xr sem_trywait 2 , +.Xr sem_wait 2 , +.Xr semctl 2 , +.Xr semget 2 , +.Xr semop 2 +.Sh HISTORY +.Fn sem_post +is specified in the POSIX Realtime Extension (1003.1b-1993/1003.1i-1995).
diff --git a/bsd/man/man2/sem_unlink.2 b/bsd/man/man2/sem_unlink.2 new file mode 100644 index 000000000..7fc7e9c4d --- /dev/null +++ b/bsd/man/man2/sem_unlink.2 @@ -0,0 +1,74 @@ +.\" $Darwin$ +.\" +.\" Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. +.\" +.\" @APPLE_LICENSE_HEADER_START@ +.\" +.\" The contents of this file constitute Original Code as defined in and +.\" are subject to the Apple Public Source License Version 1.1 (the +.\" "License"). You may not use this file except in compliance with the +.\" License. Please obtain a copy of the License at +.\" http://www.apple.com/publicsource and read it before using this file. +.\" +.\" This Original Code and all software distributed under the License are +.\" distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER +.\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +.\" INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +.\" FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the +.\" License for the specific language governing rights and limitations +.\" under the License. +.\" +.\" @APPLE_LICENSE_HEADER_END@ +.\" +.Dd June 8, 2000 +.Dt SEM_UNLINK 2 +.Os Darwin +.Sh NAME +.Nm sem_unlink +.Nd remove a named semaphore +.Sh SYNOPSIS +.Fd #include +.Ft int +.Fn sem_unlink "const char *name" +.Sh DESCRIPTION +The named semaphore named +.Fa name +is removed. If the semaphore is in use by other processes, then +.Fa name +is immediately disassociated with the semaphore, but the semaphore +itself will not be removed until all references to it have been +closed. Subsequent calls to +.Fn sem_open +using +.Fa name +will refer to or create a new semaphore named +.Fa name . +.Pp +If successful, +.Fn sem_unlink +will return 0. Otherwise, -1 is returned and +.Va errno +is set, and the state of the semaphore is unchanged. 
+.Sh ERRORS +.Fn sem_unlink +succeeds unless: +.Bl -tag -width Er +.It Bq Er EACCES +Permission is denied to remove the semaphore. +.It Bq Er ENAMETOOLONG +.Fa name +exceeded +.Dv SEM_NAME_LEN +characters. +.It Bq Er ENOENT +The named semaphore does not exist. +.El +.Sh SEE ALSO +.Xr sem_close 2 , +.Xr sem_open 2 , +.Xr semctl 2 , +.Xr semget 2 , +.Xr semop 2 +.Sh HISTORY +.Fn sem_unlink +is specified in the POSIX Realtime Extension (1003.1b-1993/1003.1i-1995). diff --git a/bsd/man/man2/sem_wait.2 b/bsd/man/man2/sem_wait.2 new file mode 100644 index 000000000..02f8d8586 --- /dev/null +++ b/bsd/man/man2/sem_wait.2 @@ -0,0 +1,88 @@ +.\" $Darwin$ +.\" +.\" Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. +.\" +.\" @APPLE_LICENSE_HEADER_START@ +.\" +.\" The contents of this file constitute Original Code as defined in and +.\" are subject to the Apple Public Source License Version 1.1 (the +.\" "License"). You may not use this file except in compliance with the +.\" License. Please obtain a copy of the License at +.\" http://www.apple.com/publicsource and read it before using this file. +.\" +.\" This Original Code and all software distributed under the License are +.\" distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER +.\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +.\" INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +.\" FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the +.\" License for the specific language governing rights and limitations +.\" under the License. +.\" +.\" @APPLE_LICENSE_HEADER_END@ +.\" +.Dd June 8, 2000 +.Dt SEM_WAIT 2 +.Os Darwin +.Sh NAME +.Nm sem_trywait, sem_wait +.Nd lock a semaphore +.Sh SYNOPSIS +.Fd #include <semaphore.h> +.Ft int +.Fn sem_trywait "sem_t *sem" +.Ft int +.Fn sem_wait "sem_t *sem" +.Sh DESCRIPTION +The semaphore referenced by +.Fa sem +is locked.
When calling +.Fn sem_wait , +if the semaphore's value is zero, the calling thread will block until +the lock is acquired or until the call is interrupted by a +signal. Alternatively, the +.Fn sem_trywait +function will fail if the semaphore is already locked, rather than +blocking on the semaphore. +.Pp +If successful (the lock was acquired), +.Fn sem_wait +and +.Fn sem_trywait +will return 0. Otherwise, -1 is returned and +.Va errno +is set, and the state of the semaphore is unchanged. +.Sh ERRORS +.Fn sem_wait +and +.Fn sem_trywait +succeed unless: +.Bl -tag -width Er +.It Bq Er EAGAIN +The semaphore is already locked. +.It Bq Er EDEADLK +A deadlock was detected. +.It Bq Er EINTR +The call was interrupted by a signal. +.It Bq Er EINVAL +.Fa sem +is not a valid semaphore descriptor. +.El +.Sh NOTES +Applications may encounter a priority inversion while using +semaphores. When a thread is waiting on a semaphore which is about to +be posted by a lower-priority thread and the lower-priority thread is +preempted by another thread (of medium priority), a priority inversion +has occurred, and the higher-priority thread will be blocked for an +unlimited time period. Programmers using the realtime functionality +of the system should take care to avoid priority inversions. +.Sh SEE ALSO +.Xr sem_open 2 , +.Xr sem_post 2 , +.Xr semctl 2 , +.Xr semget 2 , +.Xr semop 2 +.Sh HISTORY +.Fn sem_wait +and +.Fn sem_trywait +are specified in the POSIX Realtime Extension (1003.1b-1993/1003.1i-1995). diff --git a/bsd/man/man2/sendfile.2 b/bsd/man/man2/sendfile.2 index d2919e3d2..1e5e537f3 100644 --- a/bsd/man/man2/sendfile.2 +++ b/bsd/man/man2/sendfile.2 @@ -104,14 +104,16 @@ arrays is specified by and .Fa trl_cnt . .Pp -When a header or trailer is specified the value of +When a header or trailer is specified, the value of .Fa len -returned will include the size of header or trailer sent.
The user should -provide sufficiently large value of +argument indicates the maximum number of bytes in the header and/or file to be sent. +It does not control the trailer; if a trailer exists, all of it will be sent. +If the value of .Fa len -as argument including the size of header or trailer, -otherwise only part of file data will be sent -following the header. +argument is 0, all of the header and/or file will be sent before the entire trailer is sent. +On return, the +.Fa len +argument specifies the total number of bytes sent. .Pp The .Fa flags diff --git a/bsd/man/man2/setaudit.2 b/bsd/man/man2/setaudit.2 index 6b1979f5d..b626e0cf8 100644 --- a/bsd/man/man2/setaudit.2 +++ b/bsd/man/man2/setaudit.2 @@ -54,7 +54,6 @@ The data structure is defined as follows: .nf .in +4n - struct auditinfo { au_id_t ai_auid; /* Audit user ID */ au_mask_t ai_mask; /* Audit masks */ @@ -77,15 +76,13 @@ Until is set to something other than AU_DEFAUDITID any audit events generated by the system with be filtered by the non-attributed audit mask. -.PP - +.Pp The .Fa au_mask_t data structure defines the bit mask for auditing successful and failed events out of the predefined list of event classes. It is defined as follows: .nf .in +4n - struct au_mask { unsigned int am_success; /* success bits */ unsigned int am_failure; /* failure bits */ @@ -93,24 +90,21 @@ struct au_mask { typedef struct au_mask au_mask_t; .in .fi -.PP - +.Pp The .Fa au_termid_t data structure defines the Terminal ID recorded with every event caused by the process. It is defined as follows: .nf .in +4n - struct au_tid { dev_t port; u_int32_t machine; }; typedef struct au_tid au_tid_t; - .in .fi -.PP +.Pp The .Fa ai_asid variable contains the audit session ID which is recorded with every event @@ -118,7 +112,7 @@ caused by the process. It can be any value in the range 1 to PID_MAX (99999). If the value of AU_ASSIGN_ASID is used for .Fa ai_asid a unique session ID will be generated by the kernel. 
-The audit session ID will be returned in +The audit session ID will be returned in the .Fa ai_asid field on success. .Pp @@ -127,11 +121,10 @@ The system call uses the expanded .Fa auditinfo_addr_t -data structure supports Terminal IDs with larger addresses such as those used -in IP version 6. It is defined as follows: +data structure which supports Terminal IDs with larger addresses +such as those used in IP version 6. It is defined as follows: .nf .in +4n - struct auditinfo_addr { au_id_t ai_auid; /* Audit user ID. */ au_mask_t ai_mask; /* Audit masks. */ @@ -145,11 +138,10 @@ typedef struct auditinfo_addr auditinfo_addr_t; .Pp The .Fa au_tid_addr_t -data structure which includes a larger address storage field and an additional +data structure includes a larger address storage field and an additional field with the type of address stored: .nf .in +4n - struct au_tid_addr { dev_t at_port; u_int32_t at_type; @@ -183,18 +175,20 @@ field in is set to AU_IPv4 and the other .Fa ai_tid_addr fields are all set to zero. -The -.Fa ai_flags -field can only be set when a new session is initially created. Creating a new session is done by setting the .Fa ai_asid field to an unique session value or AU_ASSIGN_ASID. These system calls will fail when attempting to change the -.Fa ai_auid , -.Fa ai_termid , -or -.Fa ai_flags +.Fa ai_auid +or +.Fa ai_termid fields once set to something other than the default values. +The +.Fa ai_flags +field may be updated only according to local access control +policy but this is usually accomplished with +.Xr auditon 2 +using the A_SETSFLAGS command. 
The audit preselection masks may be changed at any time but are usually updated with .Xr auditon 2 diff --git a/bsd/man/man2/setgroups.2 b/bsd/man/man2/setgroups.2 index 1547f7027..0ec3c3086 100644 --- a/bsd/man/man2/setgroups.2 +++ b/bsd/man/man2/setgroups.2 @@ -33,7 +33,7 @@ .\" .\" @(#)setgroups.2 8.2 (Berkeley) 4/16/94 .\" -.Dd April 16, 1994 +.Dd September 15, 2010 .Dt SETGROUPS 2 .Os BSD 4.2 .Sh NAME @@ -56,6 +56,10 @@ more than .Dv {NGROUPS_MAX} . .Pp Only the super-user may set new groups. +.Pp +Use of +.Fn setgroups +is highly discouraged. .Sh RETURN VALUES A 0 value is returned on success, -1 on error, with an error code stored in diff --git a/bsd/man/man2/setregid.2 b/bsd/man/man2/setregid.2 new file mode 100644 index 000000000..47d791647 --- /dev/null +++ b/bsd/man/man2/setregid.2 @@ -0,0 +1,92 @@ +.\" Copyright (c) 1980, 1991, 1993, 1994 +.\" The Regents of the University of California. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 3. All advertising materials mentioning features or use of this software +.\" must display the following acknowledgement: +.\" This product includes software developed by the University of +.\" California, Berkeley and its contributors. +.\" 4. Neither the name of the University nor the names of its contributors +.\" may be used to endorse or promote products derived from this software +.\" without specific prior written permission. 
+.\" +.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" @(#)setregid.2 8.2 (Berkeley) 4/16/94 +.\" $FreeBSD: src/lib/libc/sys/setregid.2,v 1.6.2.4 2001/12/14 18:34:01 ru Exp $ +.\" +.Dd April 16, 1994 +.Dt SETREGID 2 +.Os +.Sh NAME +.Nm setregid +.Nd set real and effective group ID +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In unistd.h +.Ft int +.Fn setregid "gid_t rgid" "gid_t egid" +.Sh DESCRIPTION +The real and effective group ID's of the current process +are set to the arguments. +Unprivileged users may change the real group +ID to the effective group ID and vice-versa; only the super-user may +make other changes. +.Pp +Supplying a value of -1 for either the real or effective +group ID forces the system to substitute the current +ID in place of the -1 parameter. +.Pp +The +.Fn setregid +function was intended to allow swapping +the real and effective group IDs +in set-group-ID programs to temporarily relinquish the set-group-ID value. +This function did not work correctly; +its purpose is now better served by the use of the +.Fn setegid +function (see +.Xr setuid 2 ) . +.Pp +When setting the real and effective group IDs to the same value, +the standard +.Fn setgid +function is preferred. 
+.Sh RETURN VALUES +.Rv -std setregid +.Sh ERRORS +.Bl -tag -width Er +.It Bq Er EPERM +The current process is not the super-user and a change +other than changing the effective group-id to the real group-id +was specified. +.El +.Sh SEE ALSO +.Xr getgid 2 , +.Xr issetugid 2 , +.Xr setegid 2 , +.Xr setgid 2 , +.Xr setuid 2 +.Sh HISTORY +The +.Fn setregid +system call appeared in +.Bx 4.2 . diff --git a/bsd/man/man2/setreuid.2 b/bsd/man/man2/setreuid.2 new file mode 100644 index 000000000..13cfeadf4 --- /dev/null +++ b/bsd/man/man2/setreuid.2 @@ -0,0 +1,90 @@ +.\" Copyright (c) 1980, 1991, 1993, 1994 +.\" The Regents of the University of California. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 3. All advertising materials mentioning features or use of this software +.\" must display the following acknowledgement: +.\" This product includes software developed by the University of +.\" California, Berkeley and its contributors. +.\" 4. Neither the name of the University nor the names of its contributors +.\" may be used to endorse or promote products derived from this software +.\" without specific prior written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" @(#)setreuid.2 8.2 (Berkeley) 4/16/94 +.\" $FreeBSD: src/lib/libc/sys/setreuid.2,v 1.6.2.6 2001/12/14 18:34:01 ru Exp $ +.\" +.Dd February 8, 2001 +.Dt SETREUID 2 +.Os +.Sh NAME +.Nm setreuid +.Nd set real and effective user IDs +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In unistd.h +.Ft int +.Fn setreuid "uid_t ruid" "uid_t euid" +.Sh DESCRIPTION +The real and effective user IDs of the +current process are set according to the arguments. +If +.Fa ruid +or +.Fa euid +is -1, the current uid is filled in by the system. +Unprivileged users may change the real user +ID to the effective user ID and vice-versa; only the super-user may +make other changes. +.Pp +The +.Fn setreuid +function has been used to swap the real and effective user IDs +in set-user-ID programs to temporarily relinquish the set-user-ID value. +This purpose is now better served by the use of the +.Fn seteuid +function (see +.Xr setuid 2 ) . +.Pp +When setting the real and effective user IDs to the same value, +the standard +.Fn setuid +function is preferred. +.Sh RETURN VALUES +.Rv -std setreuid +.Sh ERRORS +.Bl -tag -width Er +.It Bq Er EPERM +The current process is not the super-user and a change +other than changing the effective user-id to the real user-id +was specified. +.El +.Sh SEE ALSO +.Xr getuid 2 , +.Xr issetugid 2 , +.Xr seteuid 2 , +.Xr setuid 2 +.Sh HISTORY +The +.Fn setreuid +system call appeared in +.Bx 4.2 . 
diff --git a/bsd/man/man2/setxattr.2 b/bsd/man/man2/setxattr.2 index 6fe4f86b8..957c5bd77 100644 --- a/bsd/man/man2/setxattr.2 +++ b/bsd/man/man2/setxattr.2 @@ -91,6 +91,13 @@ is identical to except that it sets an extended attribute on an open file referenced by file descriptor .Fa fd . +.Sh NOTE +On some filesystems, such as +.Dv HFS+ , +setting the extended attribute +.Dv com.apple.ResourceFork +will update the modification time (``mtime'') of +the file. .Sh RETURN VALUES On success, 0 is returned. On failure, -1 is returned and the global variable diff --git a/bsd/man/man2/shm_open.2 b/bsd/man/man2/shm_open.2 new file mode 100644 index 000000000..1b4bfc685 --- /dev/null +++ b/bsd/man/man2/shm_open.2 @@ -0,0 +1,179 @@ +.\" $Darwin$ +.\" +.\" Copyright (c) 1999-2002 Apple Computer, Inc. All rights reserved. +.\" +.\" @APPLE_LICENSE_HEADER_START@ +.\" +.\" The contents of this file constitute Original Code as defined in and +.\" are subject to the Apple Public Source License Version 1.1 (the +.\" "License"). You may not use this file except in compliance with the +.\" License. Please obtain a copy of the License at +.\" http://www.apple.com/publicsource and read it before using this file. +.\" +.\" This Original Code and all software distributed under the License are +.\" distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER +.\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +.\" INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +.\" FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the +.\" License for the specific language governing rights and limitations +.\" under the License. +.\" +.\" @APPLE_LICENSE_HEADER_END@ +.\" +.Dd August 29, 2008 +.Dt SHM_OPEN 2 +.Os Darwin +.Sh NAME +.Nm shm_open +.Nd open a shared memory object +.Sh SYNOPSIS +.Fd #include +.Ft int +.Fo shm_open +.Fa "const char *name" +.Fa "int oflag" +.Fa "..." +.Fc +.Pp +The parameter "mode_t mode" is optional. 
+.Sh DESCRIPTION +The shared memory object referenced by +.Fa name +is opened for reading and/or writing as specified by the argument +.Fa oflag +and the file descriptor returned to the calling process. +The returned file descriptor will be the lowest non-open file +descriptor for the calling process, and is not shared with any +other processes, as it is a new file descriptor. The new file +descriptor will have the +.Dv FD_CLOEXEC +flag set. +Repeated calls +to +.Nm shm_open +with the same string value for +.Fa name +will return a file descriptor referring to the same shared memory +object, provided that the object has not been unlinked by a call to +.Fn shm_unlink . +The +.Fa oflag +argument may indicate the file is to be +created if it does not exist (by specifying the +.Dv O_CREAT +flag), in which case the file is created with mode +.Fa mode +as described in +.Xr chmod 2 +and modified by the process' umask value (see +.Xr umask 2 ) . +.Pp +The value of +.Fa oflag +is formed by +.Em or Ns 'ing +the following values: +.Pp +.Bd -literal -offset indent -compact +O_RDONLY open for reading only +O_RDWR open for reading and writing +O_CREAT create object if it does not exist +O_EXCL error if create and object exists +O_TRUNC truncate size to 0 +.Ed +.Pp +Exactly one of +.Dv O_RDONLY +or +.Dv O_RDWR +must be specified. +.Pp +If +.Dv O_TRUNC +is specified and the +file exists, the file is truncated to zero length. +If +.Dv O_EXCL +is set with +.Dv O_CREAT +and the file already +exists, +.Fn shm_open +returns an error. This may be used to +implement a simple exclusive access locking mechanism. +.Pp +If successful, +.Fn shm_open +returns a non-negative integer, termed a file descriptor. +It returns -1 and sets +.Va errno +on failure. +The file pointer used to mark the current position within the +memory object is set to the beginning of the object.
+.Pp +When a new shared memory object is created it is given the +owner and group corresponding to the effective user and +group of the calling process. There is no visible entry in the +file system for the created object in this implementation. +.Pp +When a shared memory object is created, it persists until it +is unlinked and all other references are gone. Objects do +not persist across a system reboot. +.Pp +The system imposes a limit on the number of file descriptors +open simultaneously by one process. +.Xr Getdtablesize 2 +returns the current system limit. +.Sh ERRORS +The named object is opened unless: +.Bl -tag -width Er +.It Bq Er EACCES +The required permissions (for reading and/or writing) +are denied for the given flags. +.It Bq Er EACCES +.Dv O_CREAT +is specified, the object does not exist, and permission to +create the object is denied. +.It Bq Er EEXIST +.Dv O_CREAT +and +.Dv O_EXCL +were specified and the object exists. +.It Bq Er EINTR +The +.Fn shm_open +operation was interrupted by a signal. +.It Bq Er EINVAL +The +.Fn shm_open +operation is not supported. +.It Bq Er EMFILE +The process has already reached its limit for open file descriptors. +.It Bq Er ENAMETOOLONG +.Fa name +exceeded the name size limit. +This is currently +.Dv PSHMNAMLEN +characters (defined in +.In sys/posix_shm.h ) , +but this may change in the future. +.It Bq Er ENFILE +The system file table is full. +.It Bq Er ENOENT +.Dv O_CREAT +is not set and the named object does not exist. +.It Bq Er ENOSPC +.Dv O_CREAT +is specified, the file does not exist, and there is insufficient +space available to create the object. +.El +.Sh SEE ALSO +.Xr chmod 2 , +.Xr close 2 , +.Xr getdtablesize 2 , +.Xr mmap 2 , +.Xr shm_unlink 2 , +.Xr umask 2 +.Sh HISTORY +.Fn shm_open +is specified in the POSIX Realtime Extension (1003.1b-1993/1003.1i-1995).
diff --git a/bsd/man/man2/shm_unlink.2 b/bsd/man/man2/shm_unlink.2 new file mode 100644 index 000000000..7ecc66fd4 --- /dev/null +++ b/bsd/man/man2/shm_unlink.2 @@ -0,0 +1,87 @@ +.\" $Darwin$ +.\" +.\" Copyright (c) 1999-2002 Apple Computer, Inc. All rights reserved. +.\" +.\" @APPLE_LICENSE_HEADER_START@ +.\" +.\" The contents of this file constitute Original Code as defined in and +.\" are subject to the Apple Public Source License Version 1.1 (the +.\" "License"). You may not use this file except in compliance with the +.\" License. Please obtain a copy of the License at +.\" http://www.apple.com/publicsource and read it before using this file. +.\" +.\" This Original Code and all software distributed under the License are +.\" distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER +.\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +.\" INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +.\" FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the +.\" License for the specific language governing rights and limitations +.\" under the License. +.\" +.\" @APPLE_LICENSE_HEADER_END@ +.\" +.Dd August 31, 2006 +.Dt SHM_UNLINK 2 +.Os Darwin +.Sh NAME +.Nm shm_unlink +.Nd remove shared memory object +.Sh SYNOPSIS +.Fd #include +.Ft int +.Fn shm_unlink "const char *name" +.Sh DESCRIPTION +The +.Fn shm_unlink +function disassociates the shared memory object specified by +.Fa name +from that name. +The resources associated with the shared memory object remain intact +until the last file descriptor reference is removed, e.g., by +.Xr close 2 +or +.Xr munmap 2 , +at which point the resources are reclaimed +(if no references exist at the time of the call to +.Fn shm_unlink , +the resources are reclaimed immediately). +The name can only be reused +when it is bound to a new shared memory object with a call to +.Xr shm_open 2 +with the +.Dv O_CREAT +flag. 
+.Sh RETURN VALUES +Upon successful completion, a value of 0 is returned. +Otherwise, a value of -1 is returned and +.Va errno +is set to indicate the error, +and the named shared memory object will remain unchanged. +.Sh ERRORS +The +.Fn shm_unlink +function succeeds unless: +.Bl -tag -width Er +.It Bq Er EACCES +Permission is denied to remove the object. +.It Bq Er ENAMETOOLONG +.Fa name +exceeded the name size limit. +This is currently +.Dv PSHMNAMLEN +characters (defined in +.In sys/posix_shm.h ) , +but this may change in the future. +.It Bq Er ENOENT +The named object does not exist. +.El +.Sh SEE ALSO +.Xr close 2 , +.Xr mmap 2 , +.Xr munmap 2 , +.Xr shm_open 2 , +.Xr shmat 2 , +.Xr shmctl 2 +.Sh HISTORY +.Fn shm_unlink +is specified in the POSIX Realtime Extension (1003.1b-1993/1003.1i-1995). diff --git a/bsd/man/man2/stat.2 b/bsd/man/man2/stat.2 index 76cca02f1..02de79c72 100644 --- a/bsd/man/man2/stat.2 +++ b/bsd/man/man2/stat.2 @@ -127,9 +127,7 @@ as defined by and into which information is placed concerning the file.
When the macro .Dv _DARWIN_FEATURE_64_BIT_INODE -is not defined (the -.Ft ino_t -type is 32-bits), the +is not defined (see below for more information about this macro), the .Fa stat structure is defined as: .Bd -literal @@ -137,7 +135,7 @@ struct stat { /* when _DARWIN_FEATURE_64_BIT_INODE is NOT defined */ dev_t st_dev; /* device inode resides on */ ino_t st_ino; /* inode's number */ mode_t st_mode; /* inode protection mode */ - nlink_t st_nlink; /* number or hard links to the file */ + nlink_t st_nlink; /* number of hard links to the file */ uid_t st_uid; /* user-id of owner */ gid_t st_gid; /* group-id of owner */ dev_t st_rdev; /* device type, for special file inode */ @@ -155,16 +153,6 @@ struct stat { /* when _DARWIN_FEATURE_64_BIT_INODE is NOT defined */ However, when the macro .Dv _DARWIN_FEATURE_64_BIT_INODE is defined, the -.Ft ino_t -type will be 64-bits (force 64-bit inode mode by defining the -.Dv _DARWIN_USE_64_BIT_INODE -macro before including header files). -This will cause symbol variants of the -.Fa stat -family, with the -.Fa $INODE64 -suffixes, to be automatically linked in. -In addition, the .Fa stat structure will now be defined as: .Bd -literal @@ -274,6 +262,141 @@ field, see .Aq Pa sys/stat.h and .Xr chflags 2 . +.Sh _DARWIN_FEATURE_64_BIT_INODE +In order to accommodate advanced capabilities of newer file systems, the +.Fa struct stat , +.Fa struct statfs , +and +.Fa struct dirent +data structures were updated in Mac OSX 10.5. +.Pp +The most obvious change is the increased size of +.Fa ino_t +from 32 bits to 64 bits. As a consequence, storing an ino_t in an int is +no longer safe, and file formats storing ino_t as 32-bit values may need to +be updated. There are other changes as well, such as the widening of +.Fa f_fstypename , +.Fa f_mntonname , +and +.Fa f_mntfromname +in +.Fa struct statfs . +Please refer to +.Xr stat 2 +and +.Xr dir 5 +for more detail on the specific changes to the other affected data structures. 
+.Pp +On platforms that existed before these updates were available, ABI +compatibility is achieved by providing two implementations for related +functions: one using the legacy data structures and one using the updated +data structures. Variants which make use of the newer structures have their +symbols suffixed with $INODE64. These $INODE64 suffixes are automatically +appended by the compiler tool-chain and should not be used directly. +.Pp +Platforms that were released after these updates only have the newer variants +available to them. These platforms have the macro +.Dv _DARWIN_FEATURE_ONLY_64_BIT_INODE +defined. +.Pp +The +.Dv _DARWIN_FEATURE_64_BIT_INODE +macro should not be set directly. Instead, developers should make use of the +.Dv _DARWIN_NO_64_BIT_INODE +or +.Dv _DARWIN_USE_64_BIT_INODE +macros when the default variant is not desired. The following table details +the effects of defining these macros for different deployment targets. +.Pp +.TS +center; +c s s s +l | c s s +c | c c c +c | c c c +l | c c c. +T{ +.Dv _DARWIN_FEATURE_ONLY_64_BIT_INODE Sy not defined +T} += + Deployment Target +user defines: < 10.5 10.5 > 10.5 +_ +T{ +.Em (none) +T} 32-bit 32-bit 64-bit +T{ +.Dv _DARWIN_NO_64_BIT_INODE +T} 32-bit 32-bit 32-bit +T{ +.Dv _DARWIN_USE_64_BIT_INODE +T} 32-bit 64-bit 64-bit +_ +.T& +c s s s +c s s s +c | l s s +c | c c c +l | c c c. + +T{ +.Dv _DARWIN_FEATURE_ONLY_64_BIT_INODE Sy defined +T} += +user defines: Any Deployment Target +_ +T{ +.Em (none) +T} 64-bit-only +T{ +.Dv _DARWIN_NO_64_BIT_INODE +T} T{ +.Em (error) +T} +T{ +.Dv _DARWIN_USE_64_BIT_INODE +T} 64-bit-only +_ +.TE +.Pp +.Bl -tag -width 64-bit-only -offset indent +.It 32-bit +32-bit inode values are enabled, and the legacy structures involving the +.Vt ino_t +type are in use. +The macro +.Dv _DARWIN_FEATURE_64_BIT_INODE +is not defined. +.It 64-bit +64-bit inode values are enabled, and the expanded structures involving the +.Vt ino_t +type are in use. 
+The macro +.Dv _DARWIN_FEATURE_64_BIT_INODE +is defined, and loader symbols will contain the +.Li $INODE64 +suffix. +.It 64-bit-only +Like 64-bit, except loader symbols do not have the +.Li $INODE64 +suffix. +.It Em (error) +A compile time error is generated. +.El +.Pp +Due to the increased benefits of the larger structure, it is highly +recommended that developers not define +.Dv _DARWIN_NO_64_BIT_INODE +and make use of +.Dv _DARWIN_USE_64_BIT_INODE +when targeting Mac OSX 10.5. +.Pp +In addition to the $INODE64 suffixed symbols, variants suffixed with 64 are +also available for related functions. These functions were provided as a way +for developers to use the updated structures in code that also made use of +the legacy structures. The enlarged stat structures were also prefixed with +64 to distinguish them from their legacy variants. These functions have been +deprecated and should be avoided. .Sh RETURN VALUES Upon successful completion a value of 0 is returned. Otherwise, a value of -1 is returned and @@ -399,6 +522,7 @@ structure when 64-bit inodes are in effect (see above). .Xr chown 2 , .Xr utimes 2 , .Xr compat 5 , +.Xr statfs 2 , .Xr symlink 7 .Sh BUGS Applying diff --git a/bsd/man/man2/statfs.2 b/bsd/man/man2/statfs.2 index 16e80f5d4..85dce6a80 100644 --- a/bsd/man/man2/statfs.2 +++ b/bsd/man/man2/statfs.2 @@ -71,9 +71,11 @@ argument is a pointer to a structure. 
When the macro .Dv _DARWIN_FEATURE_64_BIT_INODE -is not defined (the -.Ft ino_t -type is 32-bits), that structure is defined as: +is not defined (see +.Xr stat 2 +for more information on this macro), the +.Fa statfs +structure is defined as: .Bd -literal typedef struct { int32_t val[2]; } fsid_t; @@ -107,18 +109,8 @@ struct statfs { /* when _DARWIN_FEATURE_64_BIT_INODE is NOT defined */ However, when the macro .Dv _DARWIN_FEATURE_64_BIT_INODE is defined, the -.Ft ino_t -type will be 64-bits (force 64-bit inode mode by defining the -.Dv _DARWIN_USE_64_BIT_INODE -macro before including header files). -This will cause symbol variants of the -.Fa statfs -family, with the -.Fa $INODE64 -suffixes, to be automatically linked in. -In addition, the .Fa statfs -structure will now be defined as: +structure is defined as: .Bd -literal #define MFSTYPENAMELEN 16 /* length of fs type name including null */ #define MAXPATHLEN 1024 @@ -144,6 +136,13 @@ struct statfs { /* when _DARWIN_FEATURE_64_BIT_INODE is defined */ }; .Ed .Pp +Note that the +.Fa f_fstypename , +.Fa f_mntonname , +and +.Fa f_mntfromname +fields are also wider in this variant. +.Pp Fields that are undefined for a particular file system are set to -1. The .Fn fstatfs diff --git a/bsd/man/man2/undelete.2 b/bsd/man/man2/undelete.2 new file mode 100644 index 000000000..b85ecdcae --- /dev/null +++ b/bsd/man/man2/undelete.2 @@ -0,0 +1,108 @@ +.\" Copyright (c) 1994 +.\" Jan-Simon Pendry +.\" The Regents of the University of California. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. 
Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 3. All advertising materials mentioning features or use of this software +.\" must display the following acknowledgement: +.\" This product includes software developed by the University of +.\" California, Berkeley and its contributors. +.\" 4. Neither the name of the University nor the names of its contributors +.\" may be used to endorse or promote products derived from this software +.\" without specific prior written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" @(#)undelete.2 8.4 (Berkeley) 10/18/94 +.\" $FreeBSD: src/lib/libc/sys/undelete.2,v 1.17 2006/01/22 19:49:37 truckman Exp $ +.\" +.Dd January 22, 2006 +.Dt UNDELETE 2 +.Os +.Sh NAME +.Nm undelete +.Nd attempt to recover a deleted file +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In unistd.h +.Ft int +.Fn undelete "const char *path" +.Sh DESCRIPTION +The +.Fn undelete +system call attempts to recover the deleted file named by +.Fa path . +Currently, this works only when the named object +is a whiteout in a union file system. 
+The system call removes the whiteout causing +any objects in a lower layer of the +union stack to become visible once more. +.Pp +Eventually, the +.Fn undelete +functionality may be expanded to other file systems able to recover +deleted files such as the log-structured file system. +.Sh RETURN VALUES +.Rv -std undelete +.Sh ERRORS +The +.Fn undelete +succeeds unless: +.Bl -tag -width Er +.It Bq Er ENOTDIR +A component of the path prefix is not a directory. +.It Bq Er ENAMETOOLONG +A component of a pathname exceeded 255 characters, +or an entire path name exceeded 1023 characters. +.It Bq Er EEXIST +The path does not reference a whiteout. +.It Bq Er ENOENT +The named whiteout does not exist. +.It Bq Er EACCES +Search permission is denied for a component of the path prefix. +.It Bq Er EACCES +Write permission is denied on the directory containing the name +to be undeleted. +.It Bq Er ELOOP +Too many symbolic links were encountered in translating the pathname. +.It Bq Er EPERM +The directory containing the name is marked sticky, +and the containing directory is not owned by the effective user ID. +.It Bq Er EINVAL +The last component of the path is +.Ql .. . +.It Bq Er EIO +An I/O error occurred while updating the directory entry. +.It Bq Er EROFS +The name resides on a read-only file system. +.It Bq Er EFAULT +The +.Fa path +argument +points outside the process's allocated address space. +.El +.Sh SEE ALSO +.Xr unlink 2 +.Sh HISTORY +The +.Fn undelete +system call first appeared in +.Bx 4.4 Lite . diff --git a/bsd/man/man3/posix_spawn_file_actions_addclose.3 b/bsd/man/man3/posix_spawn_file_actions_addclose.3 index 6cd2033c3..36c64715d 100644 --- a/bsd/man/man3/posix_spawn_file_actions_addclose.3 +++ b/bsd/man/man3/posix_spawn_file_actions_addclose.3 @@ -1,5 +1,5 @@ .\" -.\" Copyright (c) 2000-2007 Apple Inc. All rights reserved. +.\" Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
.\" .\" @APPLE_OSREFERENCE_LICENSE_HEADER_START@ .\" @@ -27,7 +27,7 @@ .\" .\" @(#)posix_spawn_file_actions_addclose.3 . -.Dd August 22, 2007 +.Dd November 2, 2010 .Dt POSIX_SPAWN_FILE_ACTIONS_ADDCLOSE 3 .Os "Mac OS X" .Sh NAME @@ -56,6 +56,11 @@ .Fa "int filedes" .Fa "int newfiledes" .Fc +.Ft int +.Fo posix_spawn_file_actions_addinherit_np +.Fa "posix_spawn_file_actions_t *file_actions" +.Fa "int filedes" +.Fc .Sh DESCRIPTION The .Fn posix_spawn_file_actions_addclose @@ -115,6 +120,42 @@ is created as if had been called on .Em filedes prior to the new child process starting execution. +.Pp +The +.Fn posix_spawn_file_actions_addinherit_np +function adds an abstract inheritance operation to the +list of operations associated with the object referenced by +.Em file_actions , +for subsequent use in a call to +.Xr posix_spawn 2 +or +.Xr posix_spawnp 2 . +The pre-existing descriptor referred to by +.Em filedes +is marked for inheritance into the new process image, and the +.Em FD_CLOEXEC +flag is cleared from the file descriptor in the new process image. +.Pp +Normally, for +.Xr posix_spawn 2 +and +.Xr posix_spawnp 2 , +all file descriptors are inherited from the parent process +into the spawned process, except for those explicitly +marked as close-on-exec. However if the flag +.Em POSIX_SPAWN_CLOEXEC_DEFAULT +is set, then during the spawn operation, all pre-existing +file descriptors in the parent process are treated as if they +had been marked close-on-exec i.e. none of them are automatically +inherited. See +.Xr posix_spawnattr_setflags 3 . +Only file descriptors explicitly manipulated via +.Em file_actions +are made available in the spawned process. In that case, +.Fn posix_spawn_file_actions_addinherit_np +can be used to make specific pre-existing file +descriptors from the parent process be +available in the spawned process. 
.Sh RETURN VALUES On success, these functions return 0; on failure they return an error number from @@ -127,7 +168,7 @@ These functions may fail if: The value specified by .Fa filedes is negative or would cause the process to exceed the maximum number of -open files it is allowed.. +open files it is allowed. .\" ========== .It Bq Er EINVAL The value of @@ -135,7 +176,7 @@ The value of is invalid. .\" ========== .It Bq Er ENOMEM -Insufficient memory was available eo add to the +Insufficient memory was available to add the new action to .Fa file_actions . .El .Sh SEE ALSO @@ -143,6 +184,7 @@ Insufficient memory was available eo add to the .Xr posix_spawnp 2 , .Xr posix_spawn_file_actions_init 3 , .Xr posix_spawn_file_actions_destroy 3 , +.Xr posix_spawnattr_setflags 3 .Sh STANDARDS .St -susv3 [SPN] .Sh HISTORY diff --git a/bsd/man/man3/posix_spawnattr_setflags.3 b/bsd/man/man3/posix_spawnattr_setflags.3 index 8828a83df..3359497ec 100644 --- a/bsd/man/man3/posix_spawnattr_setflags.3 +++ b/bsd/man/man3/posix_spawnattr_setflags.3 @@ -1,5 +1,5 @@ .\" -.\" Copyright (c) 2000-2007 Apple Inc. All rights reserved. +.\" Copyright (c) 2000-2010 Apple Inc. All rights reserved. .\" .\" @APPLE_OSREFERENCE_LICENSE_HEADER_START@ .\" @@ -27,7 +27,7 @@ .\" .\" @(#)posix_spawnattr_setflags.3 . -.Dd August 22, 2007 +.Dd October 28, 2010 .Dt POSIX_SPAWNATTR_SETFLAGS 3 .Os "Mac OS X" .Sh NAME @@ -119,6 +119,13 @@ manipulate the process before it begins execution in user space. This permits, for example, obtaining exact instruction counts, or debugging very early in .Xr dyld 1 . +.It Dv POSIX_SPAWN_CLOEXEC_DEFAULT +.Em Apple Extension : +If this bit is set, then only file descriptors explicitly described by the +.Fa file_actions +argument are available in the spawned process; all +of the other file descriptors are +automatically closed in the spawned process. .El .Sh RETURN VALUES On success, these functions return 0; on failure they return an error @@ -154,6 +161,7 @@ is invalid.
.Xr posix_spawnattr_setpgroup 3 , .Xr posix_spawnattr_setsigdefault 3 , .Xr posix_spawnattr_setsigmask 3 , +.Xr posix_spawn_file_actions_init 3 , .Xr setpgid 2 , .Xr execve 2 , .Xr dyld 1 diff --git a/bsd/man/man4/auditpipe.4 b/bsd/man/man4/auditpipe.4 index 7e0d7cc3e..e3a7a9427 100644 --- a/bsd/man/man4/auditpipe.4 +++ b/bsd/man/man4/auditpipe.4 @@ -24,7 +24,7 @@ .\" .\" $FreeBSD: src/share/man/man4/auditpipe.4,v 1.6 2008/05/02 17:36:22 rwatson Exp $ .\" -.Dd May 5, 2006 +.Dd October 18, 2010 .Os .Dt AUDITPIPE 4 .Sh NAME @@ -156,7 +156,7 @@ These flags correspond to the field in .Xr audit_control 5 . The ioctl argument should be of type -.Vt u_int . +.Vt au_mask_t . .It Dv AUDITPIPE_SET_PRESELECT_FLAGS Set the current default preselection flags for attributable events on the pipe. @@ -165,7 +165,7 @@ These flags correspond to the field in .Xr audit_control 5 . The ioctl argument should be of type -.Vt u_int . +.Vt au_mask_t . .It Dv AUDITPIPE_GET_PRESELECT_NAFLAGS Retrieve the current default preselection flags for non-attributable events on the pipe. @@ -174,7 +174,7 @@ These flags correspond to the field in .Xr audit_control 5 . The ioctl argument should be of type -.Vt u_int . +.Vt au_mask_t . .It Dv AUDITPIPE_SET_PRESELECT_NAFLAGS Set the current default preselection flags for non-attributable events on the pipe. @@ -183,7 +183,7 @@ These flags correspond to the field in .Xr audit_control 5 . The ioctl argument should be of type -.Vt u_int . +.Vt au_mask_t . .It Dv AUDITPIPE_GET_PRESELECT_AUID Query the current preselection masks for a specific auid on the pipe. The ioctl argument should be of type @@ -252,7 +252,5 @@ It might be desirable to provided a more flexible selection model. The per-pipe audit event queue is fifo, with drops occurring if either the user thread provides in sufficient for the record on the queue head, or on enqueue if there is insufficient room.
-It might be desirable to support partial reads of records, which would be -more compatible with buffered I/O as implemented in system libraries, and to -allow applications to select which records are dropped, possibly in the style -of preselection. +It might be desirable to allow applications to select which records are +dropped, possibly in the style of preselection. diff --git a/bsd/man/man4/gif.4 b/bsd/man/man4/gif.4 index fe42f42cd..00e63f8eb 100644 --- a/bsd/man/man4/gif.4 +++ b/bsd/man/man4/gif.4 @@ -39,7 +39,7 @@ .Sh DESCRIPTION The .Nm -interface is a generic tunnelling pseudo device for IPv4 and IPv6. +interface is a generic tunneling pseudo device for IPv4 and IPv6. It can tunnel IPv[46] traffic over IPv[46]. Therefore, there can be four possible configurations. The behavior of @@ -195,7 +195,7 @@ The device first appeared in WIDE hydrangea IPv6 kit. .\" .Sh BUGS -There are many tunnelling protocol specifications, +There are many tunneling protocol specifications, defined differently from each other. .Nm may not interoperate with peers which are based on different specifications, diff --git a/bsd/man/man4/icmp6.4 b/bsd/man/man4/icmp6.4 index 40a30a31f..f41f7216d 100644 --- a/bsd/man/man4/icmp6.4 +++ b/bsd/man/man4/icmp6.4 @@ -235,7 +235,7 @@ sockets can be opened with the .Dv SOCK_DGRAM socket type without requiring root privileges. The synopsis is the following: .Pp -.Fn socket AF_INET6 SOCK_DGRAM IPPROTO_ICMP6 +.Fn socket AF_INET6 SOCK_DGRAM IPPROTO_ICMPV6 .Pp This can only be used to send .Tn ICMPv6 diff --git a/bsd/man/man4/netintro.4 b/bsd/man/man4/netintro.4 index 725797146..ab2b1b277 100644 --- a/bsd/man/man4/netintro.4 +++ b/bsd/man/man4/netintro.4 @@ -145,7 +145,8 @@ are known to the system (and additional formats are defined for possible future implementation): .Bd -literal #define AF_UNIX 1 /* local to host (pipes) */ -#define AF_INET 2 /* internetwork: UDP, TCP, etc. */ +#define AF_INET 2 /* IPv4: UDP, TCP, etc. 
*/ +#define AF_INET6 30 /* IPv6: UDP, TCP, etc. */ #define AF_NS 6 /* Xerox NS protocols */ #define AF_CCITT 10 /* CCITT protocols, X.25 etc */ #define AF_HYLINK 15 /* NSC Hyperchannel */ diff --git a/bsd/man/man4/random.4 b/bsd/man/man4/random.4 index bc0dbc76c..ed72fa315 100644 --- a/bsd/man/man4/random.4 +++ b/bsd/man/man4/random.4 @@ -72,4 +72,4 @@ directly before obtaining important random numbers. .Sh HISTORY A .Nm -device appeared in Linux operating system. +device appeared in the Linux operating system. diff --git a/bsd/man/man5/Makefile b/bsd/man/man5/Makefile index 0780eadfa..bf6093b3f 100644 --- a/bsd/man/man5/Makefile +++ b/bsd/man/man5/Makefile @@ -11,8 +11,6 @@ DATAFILES = \ core.5 \ dir.5 \ dirent.5 \ - fs.5 \ - inode.5 \ types.5 INSTALL_MAN_LIST = ${DATAFILES} diff --git a/bsd/man/man5/dir.5 b/bsd/man/man5/dir.5 index 6f2eacb60..c9e37b3b5 100644 --- a/bsd/man/man5/dir.5 +++ b/bsd/man/man5/dir.5 @@ -87,9 +87,9 @@ and further in the file .Aq dirent.h . When the macro .Dv _DARWIN_FEATURE_64_BIT_INODE -is not defined (the -.Ft ino_t -type is 32-bits), the +is not defined (see +.Xr stat 2 +for more information on this macro), the .Fa dirent structure is defined as: .Bd -literal @@ -116,16 +116,8 @@ struct dirent { /* when _DARWIN_FEATURE_64_BIT_INODE is NOT defined */ However, when the macro .Dv _DARWIN_FEATURE_64_BIT_INODE is defined, the -.Ft ino_t -type will be 64-bits (force 64-bit inode mode by defining the -.Dv _DARWIN_USE_64_BIT_INODE -macro before including header files). -This will cause symbol variants of the directory routines, with the -.Fa $INODE64 -suffixes, to be automatically linked in. -In addition, the .Fa dirent -structure will now be defined as: +structure is defined as: .Bd -literal /* * The dirent structure defines the format of directory entries. 
diff --git a/bsd/man/man5/fs.5 b/bsd/man/man5/fs.5 deleted file mode 100644 index 18da833e6..000000000 --- a/bsd/man/man5/fs.5 +++ /dev/null @@ -1,343 +0,0 @@ -.\" $NetBSD: fs.5,v 1.3 1994/11/30 19:31:17 jtc Exp $ -.\" -.\" Copyright (c) 1983, 1991, 1993 -.\" The Regents of the University of California. All rights reserved. -.\" -.\" Redistribution and use in source and binary forms, with or without -.\" modification, are permitted provided that the following conditions -.\" are met: -.\" 1. Redistributions of source code must retain the above copyright -.\" notice, this list of conditions and the following disclaimer. -.\" 2. Redistributions in binary form must reproduce the above copyright -.\" notice, this list of conditions and the following disclaimer in the -.\" documentation and/or other materials provided with the distribution. -.\" 3. All advertising materials mentioning features or use of this software -.\" must display the following acknowledgement: -.\" This product includes software developed by the University of -.\" California, Berkeley and its contributors. -.\" 4. Neither the name of the University nor the names of its contributors -.\" may be used to endorse or promote products derived from this software -.\" without specific prior written permission. -.\" -.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND -.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -.\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -.\" SUCH DAMAGE. -.\" -.\" @(#)fs.5 8.2 (Berkeley) 4/19/94 -.\" -.Dd April 19, 1994 -.Dt FS 5 -.Os BSD 4.2 -.Sh NAME -.Nm fs , -.Nm inode -.Nd format of file system volume -.Sh SYNOPSIS -.Fd #include -.Fd #include -.Fd #include -.Sh DESCRIPTION -The files -.Aq Pa fs.h -and -.Aq Pa inode.h -declare several structures, defined variables and macros -which are used to create and manage the underlying format of -file system objects on random access devices (disks). -.Pp -The block size and number of blocks which -comprise a file system are parameters of the file system. -Sectors beginning at -.Dv BBLOCK -and continuing for -.Dv BBSIZE -are used -for a disklabel and for some hardware primary -and secondary bootstrapping programs. -.Pp -The actual file system begins at sector -.Dv SBLOCK -with the -.Em super-block -that is of size -.Dv SBSIZE . 
-The following structure described the super-block and is -from the file -.Aq Pa ufs/fs.h : -.Bd -literal -#define FS_MAGIC 0x011954 -struct fs { - struct fs *fs_link; /* linked list of file systems */ - struct fs *fs_rlink; /* used for incore super blocks */ - daddr_t fs_sblkno; /* addr of super-block in filesys */ - daddr_t fs_cblkno; /* offset of cyl-block in filesys */ - daddr_t fs_iblkno; /* offset of inode-blocks in filesys */ - daddr_t fs_dblkno; /* offset of first data after cg */ - long fs_cgoffset; /* cylinder group offset in cylinder */ - long fs_cgmask; /* used to calc mod fs_ntrak */ - time_t fs_time; /* last time written */ - long fs_size; /* number of blocks in fs */ - long fs_dsize; /* number of data blocks in fs */ - long fs_ncg; /* number of cylinder groups */ - long fs_bsize; /* size of basic blocks in fs */ - long fs_fsize; /* size of frag blocks in fs */ - long fs_frag; /* number of frags in a block in fs */ -/* these are configuration parameters */ - long fs_minfree; /* minimum percentage of free blocks */ - long fs_rotdelay; /* num of ms for optimal next block */ - long fs_rps; /* disk revolutions per second */ -/* these fields can be computed from the others */ - long fs_bmask; /* ``blkoff'' calc of blk offsets */ - long fs_fmask; /* ``fragoff'' calc of frag offsets */ - long fs_bshift; /* ``lblkno'' calc of logical blkno */ - long fs_fshift; /* ``numfrags'' calc number of frags */ -/* these are configuration parameters */ - long fs_maxcontig; /* max number of contiguous blks */ - long fs_maxbpg; /* max number of blks per cyl group */ -/* these fields can be computed from the others */ - long fs_fragshift; /* block to frag shift */ - long fs_fsbtodb; /* fsbtodb and dbtofsb shift constant */ - long fs_sbsize; /* actual size of super block */ - long fs_csmask; /* csum block offset */ - long fs_csshift; /* csum block number */ - long fs_nindir; /* value of NINDIR */ - long fs_inopb; /* value of INOPB */ - long fs_nspf; /* value of NSPF */ -/* 
yet another configuration parameter */ - long fs_optim; /* optimization preference, see below */ -/* these fields are derived from the hardware */ - long fs_npsect; /* # sectors/track including spares */ - long fs_interleave; /* hardware sector interleave */ - long fs_trackskew; /* sector 0 skew, per track */ - long fs_headswitch; /* head switch time, usec */ - long fs_trkseek; /* track-to-track seek, usec */ -/* sizes determined by number of cylinder groups and their sizes */ - daddr_t fs_csaddr; /* blk addr of cyl grp summary area */ - long fs_cssize; /* size of cyl grp summary area */ - long fs_cgsize; /* cylinder group size */ -/* these fields are derived from the hardware */ - long fs_ntrak; /* tracks per cylinder */ - long fs_nsect; /* sectors per track */ - long fs_spc; /* sectors per cylinder */ -/* this comes from the disk driver partitioning */ - long fs_ncyl; /* cylinders in file system */ -/* these fields can be computed from the others */ - long fs_cpg; /* cylinders per group */ - long fs_ipg; /* inodes per group */ - long fs_fpg; /* blocks per group * fs_frag */ -/* this data must be re-computed after crashes */ - struct csum fs_cstotal; /* cylinder summary information */ -/* these fields are cleared at mount time */ - char fs_fmod; /* super block modified flag */ - char fs_clean; /* file system is clean flag */ - char fs_ronly; /* mounted read-only flag */ - char fs_flags; /* currently unused flag */ - char fs_fsmnt[MAXMNTLEN]; /* name mounted on */ -/* these fields retain the current block allocation info */ - long fs_cgrotor; /* last cg searched */ - struct csum *fs_csp[MAXCSBUFS]; /* list of fs_cs info buffers */ - long fs_cpc; /* cyl per cycle in postbl */ - short fs_opostbl[16][8]; /* old rotation block list head */ - long fs_sparecon[56]; /* reserved for future constants */ - quad fs_qbmask; /* ~fs_bmask - for use with quad size */ - quad fs_qfmask; /* ~fs_fmask - for use with quad size */ - long fs_postblformat; /* format of positional layout 
tables */ - long fs_nrpos; /* number of rotational positions */ - long fs_postbloff; /* (short) rotation block list head */ - long fs_rotbloff; /* (u_char) blocks for each rotation */ - long fs_magic; /* magic number */ - u_char fs_space[1]; /* list of blocks for each rotation */ -/* actually longer */ -}; -.Ed -.Pp -Each disk drive contains some number of file systems. -A file system consists of a number of cylinder groups. -Each cylinder group has inodes and data. -.Pp -A file system is described by its super-block, which in turn -describes the cylinder groups. The super-block is critical -data and is replicated in each cylinder group to protect against -catastrophic loss. This is done at file system creation -time and the critical -super-block data does not change, so the copies need not be -referenced further unless disaster strikes. -.Pp -Addresses stored in inodes are capable of addressing fragments -of `blocks'. File system blocks of at most size -.Dv MAXBSIZE -can -be optionally broken into 2, 4, or 8 pieces, each of which is -addressable; these pieces may be -.Dv DEV_BSIZE , -or some multiple of -a -.Dv DEV_BSIZE -unit. -.Pp -Large files consist of exclusively large data blocks. To avoid -undue wasted disk space, the last data block of a small file is -allocated as only as many fragments of a large block as are -necessary. The file system format retains only a single pointer -to such a fragment, which is a piece of a single large block that -has been divided. The size of such a fragment is determinable from -information in the inode, using the -.Fn blksize fs ip lbn -macro. -.Pp -The file system records space availability at the fragment level; -to determine block availability, aligned fragments are examined. -.Pp -The root inode is the root of the file system. 
-Inode 0 can't be used for normal purposes and -historically bad blocks were linked to inode 1, -thus the root inode is 2 (inode 1 is no longer used for -this purpose, however numerous dump tapes make this -assumption, so we are stuck with it). -.Pp -The -.Fa fs_minfree -element gives the minimum acceptable percentage of file system -blocks that may be free. If the freelist drops below this level -only the super-user may continue to allocate blocks. -The -.Fa fs_minfree -element -may be set to 0 if no reserve of free blocks is deemed necessary, -however severe performance degradations will be observed if the -file system is run at greater than 90% full; thus the default -value of -.Fa fs_minfree -is 10%. -.Pp -Empirically the best trade-off between block fragmentation and -overall disk utilization at a loading of 90% comes with a -fragmentation of 8, thus the default fragment size is an eighth -of the block size. -.Pp -The element -.Fa fs_optim -specifies whether the file system should try to minimize the time spent -allocating blocks, or if it should attempt to minimize the space -fragmentation on the disk. -If the value of fs_minfree (see above) is less than 10%, -then the file system defaults to optimizing for space to avoid -running out of full sized blocks. -If the value of minfree is greater than or equal to 10%, -fragmentation is unlikely to be problematical, and -the file system defaults to optimizing for time. -.Pp -.Em Cylinder group related limits : -Each cylinder keeps track of the availability of blocks at different -rotational positions, so that sequential blocks can be laid out -with minimum rotational latency. With the default of 8 distinguished -rotational positions, the resolution of the -summary information is 2ms for a typical 3600 rpm drive. -.Pp -The element -.Fa fs_rotdelay -gives the minimum number of milliseconds to initiate -another disk transfer on the same cylinder. 
-It is used in determining the rotationally optimal -layout for disk blocks within a file; -the default value for -.Fa fs_rotdelay -is 2ms. -.Pp -Each file system has a statically allocated number of inodes. -An inode is allocated for each -.Dv NBPI -bytes of disk space. -The inode allocation strategy is extremely conservative. -.Pp -.Dv MINBSIZE -is the smallest allowable block size. -With a -.Dv MINBSIZE -of 4096 -it is possible to create files of size -2^32 with only two levels of indirection. -.Dv MINBSIZE -must be big enough to hold a cylinder group block, -thus changes to -.Pq Fa struct cg -must keep its size within -.Dv MINBSIZE . -Note that super-blocks are never more than size -.Dv SBSIZE . -.Pp -The path name on which the file system is mounted is maintained in -.Fa fs_fsmnt . -.Dv MAXMNTLEN -defines the amount of space allocated in -the super-block for this name. -The limit on the amount of summary information per file system -is defined by -.Dv MAXCSBUFS. -For a 4096 byte block size, it is currently parameterized for a -maximum of two million cylinders. -.Pp -Per cylinder group information is summarized in blocks allocated -from the first cylinder group's data blocks. -These blocks are read in from -.Fa fs_csaddr -(size -.Fa fs_cssize ) -in addition to the super-block. -.Pp -.Sy N.B.: -.Xr sizeof Pq Fa struct csum -must be a power of two in order for -the -.Fn fs_cs -macro to work. -.Pp -The -.Em "Super-block for a file system" : -The size of the rotational layout tables -is limited by the fact that the super-block is of size -.Dv SBSIZE . -The size of these tables is -.Em inversely -proportional to the block -size of the file system. The size of the tables is -increased when sector sizes are not powers of two, -as this increases the number of cylinders -included before the rotational pattern repeats -.Pq Fa fs_cpc . -The size of the rotational layout -tables is derived from the number of bytes remaining in -.Pq Fa struct fs . 
-.Pp -The number of blocks of data per cylinder group -is limited because cylinder groups are at most one block. -The inode and free block tables -must fit into a single block after deducting space for -the cylinder group structure -.Pq Fa struct cg . -.Pp -The -.Em Inode : -The inode is the focus of all file activity in the -file system. -There is a unique inode allocated -for each active file, -each current directory, each mounted-on file, -text file, and the root. -An inode is `named' by its device/i-number pair. -For further information, see the include file -.Aq Pa sys/inode.h . -.Sh HISTORY -A super-block structure named filsys appeared in -.At v6 . -The file system described in this manual appeared -in -.Bx 4.2 . diff --git a/bsd/man/man5/inode.5 b/bsd/man/man5/inode.5 deleted file mode 100644 index 1b47f6228..000000000 --- a/bsd/man/man5/inode.5 +++ /dev/null @@ -1 +0,0 @@ -.so man5/fs.5 diff --git a/bsd/miscfs/Makefile b/bsd/miscfs/Makefile index 009da4c3f..ece064108 100644 --- a/bsd/miscfs/Makefile +++ b/bsd/miscfs/Makefile @@ -13,8 +13,6 @@ INSTINC_SUBDIRS = \ specfs \ union -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ @@ -22,8 +20,6 @@ EXPINC_SUBDIRS = \ fifofs \ specfs -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ SETUP_SUBDIRS = \ diff --git a/bsd/miscfs/devfs/Makefile b/bsd/miscfs/devfs/Makefile index bb2e43304..9d29f42e1 100644 --- a/bsd/miscfs/devfs/Makefile +++ b/bsd/miscfs/devfs/Makefile @@ -9,14 +9,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ DATAFILES = \ diff --git a/bsd/miscfs/devfs/devfs_tree.c b/bsd/miscfs/devfs/devfs_tree.c index 58aea8eb9..daf8c8ace 100644 --- a/bsd/miscfs/devfs/devfs_tree.c +++ b/bsd/miscfs/devfs/devfs_tree.c @@ -148,6 +148,7 @@ lck_grp_t * devfs_lck_grp; lck_grp_attr_t * devfs_lck_grp_attr; lck_attr_t * devfs_lck_attr; lck_mtx_t devfs_mutex; +lck_mtx_t 
devfs_attr_mutex; devdirent_t * dev_root = NULL; /* root of backing tree */ struct devfs_stats devfs_stats; /* hold stats */ @@ -185,6 +186,7 @@ devfs_sinit(void) devfs_lck_attr = lck_attr_alloc_init(); lck_mtx_init(&devfs_mutex, devfs_lck_grp, devfs_lck_attr); + lck_mtx_init(&devfs_attr_mutex, devfs_lck_grp, devfs_lck_attr); DEVFS_LOCK(); error = dev_add_entry("root", NULL, DEV_DIR, NULL, NULL, NULL, &dev_root); diff --git a/bsd/miscfs/devfs/devfs_vfsops.c b/bsd/miscfs/devfs/devfs_vfsops.c index c5875bd55..f34edc2c5 100644 --- a/bsd/miscfs/devfs/devfs_vfsops.c +++ b/bsd/miscfs/devfs/devfs_vfsops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -93,9 +93,10 @@ static int devfs_statfs( struct mount *mp, struct vfsstatfs *sbp, vfs_context_t ctx); static int devfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t ctx); +#if !defined(SECURE_KERNEL) extern int setup_kmem; __private_extern__ void devfs_setup_kmem(void); - +#endif /*- * Called from the generic VFS startups. 
@@ -114,9 +115,11 @@ devfs_init(__unused struct vfsconf *vfsp) UID_ROOT, GID_WHEEL, 0622, "console"); devfs_make_node(makedev(2, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666, "tty"); +#if !defined(SECURE_KERNEL) if (setup_kmem) { devfs_setup_kmem(); } +#endif devfs_make_node(makedev(3, 2), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666, "null"); devfs_make_node(makedev(3, 3), DEVFS_CHAR, @@ -131,6 +134,7 @@ devfs_init(__unused struct vfsconf *vfsp) return 0; } +#if !defined(SECURE_KERNEL) __private_extern__ void devfs_setup_kmem(void) { @@ -139,6 +143,7 @@ devfs_setup_kmem(void) devfs_make_node(makedev(3, 1), DEVFS_CHAR, UID_ROOT, GID_KMEM, 0640, "kmem"); } +#endif /*- @@ -495,7 +500,7 @@ devfs_kernel_mount(char * mntname) /* * Get vnode to be covered */ - NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, + NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | LOCKLEAF, UIO_SYSSPACE, CAST_USER_ADDR_T(mntname), ctx); if ((error = namei(&nd))) { printf("devfs_kernel_mount: failed to find directory '%s', %d", diff --git a/bsd/miscfs/devfs/devfs_vnops.c b/bsd/miscfs/devfs/devfs_vnops.c index 58746bb73..207a50c01 100644 --- a/bsd/miscfs/devfs/devfs_vnops.c +++ b/bsd/miscfs/devfs/devfs_vnops.c @@ -110,9 +110,79 @@ #include "fdesc.h" #endif /* FDESC */ -static int devfs_update(struct vnode *vp, struct timeval *access, - struct timeval *modify); -void devfs_rele_node(devnode_t *); +static int devfs_update(struct vnode *vp, struct timeval *access, + struct timeval *modify); +void devfs_rele_node(devnode_t *); +static void devfs_consider_time_update(devnode_t *dnp, uint32_t just_changed_flags); +static boolean_t devfs_update_needed(long now_s, long last_s); +void dn_times_locked(devnode_t * dnp, struct timeval *t1, struct timeval *t2, struct timeval *t3, uint32_t just_changed_flags); +void dn_times_now(devnode_t *dnp, uint32_t just_changed_flags); +void dn_mark_for_delayed_times_update(devnode_t *dnp, uint32_t just_changed_flags); + +void +dn_times_locked(devnode_t * dnp, struct timeval *t1, struct 
timeval *t2, struct timeval *t3, uint32_t just_changed_flags) +{ + + lck_mtx_assert(&devfs_attr_mutex, LCK_MTX_ASSERT_OWNED); + + if (just_changed_flags & DEVFS_UPDATE_ACCESS) { + dnp->dn_atime.tv_sec = t1->tv_sec; + dnp->dn_atime.tv_nsec = t1->tv_usec * 1000; + dnp->dn_access = 0; + } else if (dnp->dn_access) { + dnp->dn_atime.tv_sec = MIN(t1->tv_sec, dnp->dn_atime.tv_sec + DEVFS_LAZY_UPDATE_SECONDS); + dnp->dn_atime.tv_nsec = t1->tv_usec * 1000; + dnp->dn_access = 0; + } + + if (just_changed_flags & DEVFS_UPDATE_MOD) { + dnp->dn_mtime.tv_sec = t2->tv_sec; + dnp->dn_mtime.tv_nsec = t2->tv_usec * 1000; + dnp->dn_update = 0; + } else if (dnp->dn_update) { + dnp->dn_mtime.tv_sec = MIN(t2->tv_sec, dnp->dn_mtime.tv_sec + DEVFS_LAZY_UPDATE_SECONDS); + dnp->dn_mtime.tv_nsec = t2->tv_usec * 1000; + dnp->dn_update = 0; + } + + if (just_changed_flags & DEVFS_UPDATE_CHANGE) { + dnp->dn_ctime.tv_sec = t3->tv_sec; + dnp->dn_ctime.tv_nsec = t3->tv_usec * 1000; + dnp->dn_change = 0; + } else if (dnp->dn_change) { + dnp->dn_ctime.tv_sec = MIN(t3->tv_sec, dnp->dn_ctime.tv_sec + DEVFS_LAZY_UPDATE_SECONDS); + dnp->dn_ctime.tv_nsec = t3->tv_usec * 1000; + dnp->dn_change = 0; + } +} + +void +dn_mark_for_delayed_times_update(devnode_t *dnp, uint32_t just_changed_flags) +{ + if (just_changed_flags & DEVFS_UPDATE_CHANGE) { + dnp->dn_change = 1; + } + if (just_changed_flags & DEVFS_UPDATE_ACCESS) { + dnp->dn_access = 1; + } + if (just_changed_flags & DEVFS_UPDATE_MOD) { + dnp->dn_update = 1; + } +} + +/* + * Update times based on pending updates and optionally a set of new changes. 
+ */ +void +dn_times_now(devnode_t * dnp, uint32_t just_changed_flags) +{ + struct timeval now; + + DEVFS_ATTR_LOCK_SPIN(); + microtime(&now); + dn_times_locked(dnp, &now, &now, &now, just_changed_flags); + DEVFS_ATTR_UNLOCK(); +} /* @@ -353,9 +423,6 @@ devfs_getattr(struct vnop_getattr_args *ap) DEVFS_LOCK(); file_node = VTODN(vp); - microtime(&now); - dn_times(file_node, &now, &now, &now); - VATTR_RETURN(vap, va_mode, file_node->dn_mode); /* @@ -402,6 +469,13 @@ devfs_getattr(struct vnop_getattr_args *ap) VATTR_RETURN(vap, va_iosize, MAXPHYSIO); else VATTR_RETURN(vap, va_iosize, vp->v_mount->mnt_vfsstat.f_iosize); + + + DEVFS_ATTR_LOCK_SPIN(); + + microtime(&now); + dn_times_locked(file_node, &now, &now, &now, 0); + /* if the time is bogus, set it to the boot time */ if (file_node->dn_ctime.tv_sec == 0) { file_node->dn_ctime.tv_sec = boottime_sec(); @@ -414,6 +488,9 @@ devfs_getattr(struct vnop_getattr_args *ap) VATTR_RETURN(vap, va_change_time, file_node->dn_ctime); VATTR_RETURN(vap, va_modify_time, file_node->dn_mtime); VATTR_RETURN(vap, va_access_time, file_node->dn_atime); + + DEVFS_ATTR_UNLOCK(); + VATTR_RETURN(vap, va_gen, 0); VATTR_RETURN(vap, va_filerev, 0); VATTR_RETURN(vap, va_acl, NULL); @@ -557,13 +634,11 @@ devfs_close(struct vnop_close_args *ap) { struct vnode * vp = ap->a_vp; register devnode_t * dnp; - struct timeval now; if (vnode_isinuse(vp, 1)) { DEVFS_LOCK(); dnp = VTODN(vp); - microtime(&now); - dn_times(dnp, &now, &now, &now); + dn_times_now(dnp, 0); DEVFS_UNLOCK(); } return (0); @@ -579,19 +654,68 @@ devfsspec_close(struct vnop_close_args *ap) { struct vnode * vp = ap->a_vp; register devnode_t * dnp; - struct timeval now; if (vnode_isinuse(vp, 0)) { DEVFS_LOCK(); - microtime(&now); dnp = VTODN(vp); - dn_times(dnp, &now, &now, &now); + dn_times_now(dnp, 0); DEVFS_UNLOCK(); } return (VOCALL (spec_vnodeop_p, VOFFSET(vnop_close), ap)); } +static boolean_t +devfs_update_needed(long now_s, long last_s) +{ + if (now_s > last_s) { + if (now_s - 
last_s >= DEVFS_LAZY_UPDATE_SECONDS) { + return TRUE; + } + } + + return FALSE; +} + +/* + * Given a set of time updates required [to happen at some point], check + * either make those changes (and resolve other pending updates) or mark + * the devnode for a subsequent update. + */ +static void +devfs_consider_time_update(devnode_t *dnp, uint32_t just_changed_flags) +{ + struct timeval now; + long now_s; + + microtime(&now); + now_s = now.tv_sec; + + if (dnp->dn_change || (just_changed_flags & DEVFS_UPDATE_CHANGE)) { + if (devfs_update_needed(now_s, dnp->dn_ctime.tv_sec)) { + dn_times_now(dnp, just_changed_flags); + return; + } + } + if (dnp->dn_access || (just_changed_flags & DEVFS_UPDATE_ACCESS)) { + if (devfs_update_needed(now_s, dnp->dn_atime.tv_sec)) { + dn_times_now(dnp, just_changed_flags); + return; + } + } + if (dnp->dn_update || (just_changed_flags & DEVFS_UPDATE_MOD)) { + if (devfs_update_needed(now_s, dnp->dn_mtime.tv_sec)) { + dn_times_now(dnp, just_changed_flags); + return; + } + } + + /* Not going to do anything now--mark for later update */ + dn_mark_for_delayed_times_update(dnp, just_changed_flags); + + return; +} + static int devfsspec_read(struct vnop_read_args *ap) /* struct vnop_read_args { @@ -603,7 +727,7 @@ devfsspec_read(struct vnop_read_args *ap) { register devnode_t * dnp = VTODN(ap->a_vp); - dnp->dn_access = 1; + devfs_consider_time_update(dnp, DEVFS_UPDATE_ACCESS); return (VOCALL (spec_vnodeop_p, VOFFSET(vnop_read), ap)); } @@ -619,8 +743,7 @@ devfsspec_write(struct vnop_write_args *ap) { register devnode_t * dnp = VTODN(ap->a_vp); - dnp->dn_change = 1; - dnp->dn_update = 1; + devfs_consider_time_update(dnp, DEVFS_UPDATE_CHANGE | DEVFS_UPDATE_MOD); return (VOCALL (spec_vnodeop_p, VOFFSET(vnop_write), ap)); } @@ -704,8 +827,7 @@ devfs_vnop_remove(struct vnop_remove_args *ap) /*********************************** * Start actually doing things.... 
* ***********************************/ - tdp->dn_change = 1; - tdp->dn_update = 1; + devfs_consider_time_update(tdp, DEVFS_UPDATE_CHANGE | DEVFS_UPDATE_MOD); /* * Target must be empty if a directory and have no links @@ -741,7 +863,6 @@ devfs_link(struct vnop_link_args *ap) devnode_t * tdp; devdirent_t * tnp; int error = 0; - struct timeval now; /* * First catch an arbitrary restriction for this FS @@ -770,10 +891,7 @@ devfs_link(struct vnop_link_args *ap) /*********************************** * Start actually doing things.... * ***********************************/ - fp->dn_change = 1; - - microtime(&now); - error = devfs_update(vp, &now, &now); + dn_times_now(fp, DEVFS_UPDATE_CHANGE); if (!error) { error = dev_add_name(cnp->cn_nameptr, tdp, NULL, fp, &tnp); @@ -833,7 +951,6 @@ devfs_rename(struct vnop_rename_args *ap) devdirent_t *fnp,*tnp; int doingdirectory = 0; int error = 0; - struct timeval now; DEVFS_LOCK(); /* @@ -914,12 +1031,8 @@ devfs_rename(struct vnop_rename_args *ap) /*********************************** * Start actually doing things.... * ***********************************/ - fp->dn_change = 1; - microtime(&now); + dn_times_now(fp, DEVFS_UPDATE_CHANGE); - if ( (error = devfs_update(fvp, &now, &now)) ) { - goto out; - } /* * Check if just deleting a link name. 
*/ @@ -1192,8 +1305,6 @@ devfs_readdir(struct vnop_readdir_args *ap) name_node = dir_node->dn_typeinfo.Dir.dirlist; nodenumber = 0; - dir_node->dn_access = 1; - while ((name_node || (nodenumber < 2)) && (uio_resid(uio) > 0)) { switch(nodenumber) @@ -1256,6 +1367,8 @@ devfs_readdir(struct vnop_readdir_args *ap) DEVFS_UNLOCK(); uio->uio_offset = pos; + devfs_consider_time_update(dir_node, DEVFS_UPDATE_ACCESS); + return (error); } @@ -1405,8 +1518,11 @@ devfs_update(struct vnode *vp, struct timeval *access, struct timeval *modify) return (0); } + + DEVFS_ATTR_LOCK_SPIN(); microtime(&now); - dn_times(ip, access, modify, &now); + dn_times_locked(ip, access, modify, &now, DEVFS_UPDATE_ACCESS | DEVFS_UPDATE_MOD); + DEVFS_ATTR_UNLOCK(); return (0); } diff --git a/bsd/miscfs/devfs/devfsdefs.h b/bsd/miscfs/devfs/devfsdefs.h index ce85cf853..e8b12000a 100644 --- a/bsd/miscfs/devfs/devfsdefs.h +++ b/bsd/miscfs/devfs/devfsdefs.h @@ -178,6 +178,7 @@ struct devdirent extern devdirent_t * dev_root; extern struct devfs_stats devfs_stats; extern lck_mtx_t devfs_mutex; +extern lck_mtx_t devfs_attr_mutex; /* * Rules for front nodes: @@ -214,9 +215,10 @@ struct devfsmount #define VTODN(vp) ((devnode_t *)(vp)->v_data) #define DEVFS_LOCK() lck_mtx_lock(&devfs_mutex) - #define DEVFS_UNLOCK() lck_mtx_unlock(&devfs_mutex) +#define DEVFS_ATTR_LOCK_SPIN() lck_mtx_lock_spin(&devfs_attr_mutex); +#define DEVFS_ATTR_UNLOCK() lck_mtx_unlock(&devfs_attr_mutex); /* * XXX all the (SInt32 *) casts below assume sizeof(int) == sizeof(long) @@ -269,34 +271,32 @@ DEVFS_DECR_STRINGSPACE(int space) OSAddAtomic(-space, &devfs_stats.stringspace); } -static __inline__ void -dn_times(devnode_t * dnp, struct timeval *t1, struct timeval *t2, struct timeval *t3) -{ - if (dnp->dn_access) { - dnp->dn_atime.tv_sec = t1->tv_sec; - dnp->dn_atime.tv_nsec = t1->tv_usec * 1000; - dnp->dn_access = 0; - } - if (dnp->dn_update) { - dnp->dn_mtime.tv_sec = t2->tv_sec; - dnp->dn_mtime.tv_nsec = t2->tv_usec * 1000; - 
dnp->dn_update = 0; - } - if (dnp->dn_change) { - dnp->dn_ctime.tv_sec = t3->tv_sec; - dnp->dn_ctime.tv_nsec = t3->tv_usec * 1000; - dnp->dn_change = 0; - } - - return; -} +/* + * Access, change, and modify times are protected by a separate lock, + * which allows tty times to be updated (no more than once per second) + * in the I/O path without too much fear of contention. + * + * For getattr, update times to current time if the last update was recent; + * preserve legacy behavior that frequent stats can yield sub-second resolutions. + * If the last time is old, however, we know that the event that triggered + * the need for an update was no more than 1s after the last update. In that case, + * use (last update + 1s) as the time, avoiding the illusion that last update happened + * much later than it really did. + */ +#define DEVFS_LAZY_UPDATE_SECONDS 1 + +#define DEVFS_UPDATE_CHANGE 0x1 +#define DEVFS_UPDATE_MOD 0x2 +#define DEVFS_UPDATE_ACCESS 0x4 static __inline__ void dn_copy_times(devnode_t * target, devnode_t * source) { + DEVFS_ATTR_LOCK_SPIN(); target->dn_atime = source->dn_atime; target->dn_mtime = source->dn_mtime; target->dn_ctime = source->dn_ctime; + DEVFS_ATTR_UNLOCK(); return; } diff --git a/bsd/miscfs/fifofs/Makefile b/bsd/miscfs/fifofs/Makefile index ff18c9388..d70a3ab16 100644 --- a/bsd/miscfs/fifofs/Makefile +++ b/bsd/miscfs/fifofs/Makefile @@ -9,14 +9,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ DATAFILES = \ diff --git a/bsd/miscfs/nullfs/null.h b/bsd/miscfs/nullfs/null.h deleted file mode 100644 index 3209be3d9..000000000 --- a/bsd/miscfs/nullfs/null.h +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ -/* - * Copyright (c) 1992, 1993 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software donated to Berkeley by - * Jan-Simon Pendry. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)null.h 8.3 (Berkeley) 8/20/94 - * - * null.h 8.2 (Berkeley) 1/21/94 - */ -#ifdef __NULLFS_NULL_H__ -#define __NULLFS_NULL_H__ - -#include - -#ifdef __APPLE_API_PRIVATE -struct null_args { - char *target; /* Target of loopback */ -}; - -struct null_mount { - struct mount *nullm_vfs; - struct vnode *nullm_rootvp; /* Reference to root null_node */ -}; - -#ifdef KERNEL -/* LP64 version of null_args. all pointers - * grow when we're dealing with a 64-bit process. 
- * WARNING - keep in sync with null_args - */ -struct user_null_args { - user_addr_t target; /* Target of loopback */ -}; - -/* - * A cache of vnode references - */ -struct null_node { - LIST_ENTRY(null_node) null_hash; /* Hash list */ - struct vnode *null_lowervp; /* VREFed once */ - struct vnode *null_vnode; /* Back pointer */ -}; - -extern int null_node_create(struct mount *mp, struct vnode *target, struct vnode **vpp); - -#define MOUNTTONULLMOUNT(mp) ((struct null_mount *)((mp)->mnt_data)) -#define VTONULL(vp) ((struct null_node *)(vp)->v_data) -#define NULLTOV(xp) ((xp)->null_vnode) -#ifdef NULLFS_DIAGNOSTIC -extern struct vnode *null_checkvp(struct vnode *vp, char *fil, int lno); -#define NULLVPTOLOWERVP(vp) null_checkvp((vp), __FILE__, __LINE__) -#else -#define NULLVPTOLOWERVP(vp) (VTONULL(vp)->null_lowervp) -#endif - -extern int (**null_vnodeop_p)(void *); -extern struct vfsops null_vfsops; -#endif /* KERNEL */ - -#endif /* __APPLE_API_PRIVATE */ -#endif /* __NULLFS_NULL_H__ */ diff --git a/bsd/miscfs/nullfs/null_subr.c b/bsd/miscfs/nullfs/null_subr.c deleted file mode 100644 index d061bb77f..000000000 --- a/bsd/miscfs/nullfs/null_subr.c +++ /dev/null @@ -1,304 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. 
- * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ -/* - * Copyright (c) 1992, 1993 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software donated to Berkeley by - * Jan-Simon Pendry. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)null_subr.c 8.7 (Berkeley) 5/14/95 - * - * null_subr.c 8.4 (Berkeley) 1/21/94 - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define LOG2_SIZEVNODE 7 /* log2(sizeof struct vnode) */ -#define NNULLNODECACHE 16 - -/* - * Null layer cache: - * Each cache entry holds a reference to the lower vnode - * along with a pointer to the alias vnode. When an - * entry is added the lower vnode is vnode_get'd. When the - * alias is removed the lower vnode is vnode_put'd. - */ - -#define NULL_NHASH(vp) \ - (&null_node_hashtbl[(((uintptr_t)vp)>>LOG2_SIZEVNODE) & null_node_hash]) -LIST_HEAD(null_node_hashhead, null_node) *null_node_hashtbl; -u_long null_node_hash; - -/* - * Initialise cache headers - */ -nullfs_init() -{ - -#ifdef NULLFS_DIAGNOSTIC - printf("nullfs_init\n"); /* printed during system boot */ -#endif - null_node_hashtbl = hashinit(NNULLNODECACHE, M_CACHE, &null_node_hash); -} - -/* - * Return a vnode_get'ed alias for lower vnode if already exists, else 0. 
- */ -static struct vnode * -null_node_find(mp, lowervp) - struct mount *mp; - struct vnode *lowervp; -{ - struct proc *p = curproc; /* XXX */ - struct null_node_hashhead *hd; - struct null_node *a; - struct vnode *vp; - - /* - * Find hash base, and then search the (two-way) linked - * list looking for a null_node structure which is referencing - * the lower vnode. If found, the increment the null_node - * reference count (but NOT the lower vnode's vnode_get counter). - */ - hd = NULL_NHASH(lowervp); -loop: - for (a = hd->lh_first; a != 0; a = a->null_hash.le_next) { - if (a->null_lowervp == lowervp && NULLTOV(a)->v_mount == mp) { - vp = NULLTOV(a); - - if (vnode_get(vp)) { - printf ("null_node_find: vget failed.\n"); - goto loop; - }; - return (vp); - } - } - - return NULL; -} - - -/* - * Make a new null_node node. - * Vp is the alias vnode, lofsvp is the lower vnode. - * Maintain a reference to (lowervp). - */ -static int -null_node_alloc(mp, lowervp, vpp) - struct mount *mp; - struct vnode *lowervp; - struct vnode **vpp; -{ - struct null_node_hashhead *hd; - struct null_node *xp; - struct vnode *othervp, *vp; - int error; - - MALLOC(xp, struct null_node *, sizeof(struct null_node), M_TEMP, M_WAITOK); - if (error = getnewvnode(VT_NULL, mp, null_vnodeop_p, vpp)) { - FREE(xp, M_TEMP); - return (error); - } - vp = *vpp; - - vp->v_type = lowervp->v_type; - xp->null_vnode = vp; - vp->v_data = xp; - xp->null_lowervp = lowervp; - /* - * Before we insert our new node onto the hash chains, - * check to see if someone else has beaten us to it. 
- */ - if (othervp = null_node_find(lowervp)) { - FREE(xp, M_TEMP); - vp->v_type = VBAD; /* node is discarded */ - vp->v_usecount = 0; /* XXX */ - vp->v_data = 0; /* prevent access to freed data */ - *vpp = othervp; - return 0; - }; - if (vp->v_type == VREG) - ubc_info_init(vp); - vnode_get(lowervp); /* Extra vnode_get will be vnode_put'd in null_node_create */ - hd = NULL_NHASH(lowervp); - LIST_INSERT_HEAD(hd, xp, null_hash); - return 0; -} - - -/* - * Try to find an existing null_node vnode refering - * to it, otherwise make a new null_node vnode which - * contains a reference to the lower vnode. - */ -int -null_node_create(mp, lowervp, newvpp) - struct mount *mp; - struct vnode *lowervp; - struct vnode **newvpp; -{ - struct vnode *aliasvp; - - if (aliasvp = null_node_find(mp, lowervp)) { - /* - * null_node_find has taken another reference - * to the alias vnode. - */ -#ifdef NULLFS_DIAGNOSTIC - vprint("null_node_create: exists", NULLTOV(ap)); -#endif - /* vnode_get(aliasvp); --- done in null_node_find */ - } else { - int error; - - /* - * Get new vnode. - */ -#ifdef NULLFS_DIAGNOSTIC - printf("null_node_create: create new alias vnode\n"); -#endif - - /* - * Make new vnode reference the null_node. - */ - if (error = null_node_alloc(mp, lowervp, &aliasvp)) - return error; - - /* - * aliasvp is already vnode_get'd by getnewvnode() - */ - } - - vnode_put(lowervp); - -#if DIAGNOSTIC - if (lowervp->v_usecount < 1) { - /* Should never happen... 
*/ - vprint ("null_node_create: alias ", aliasvp); - vprint ("null_node_create: lower ", lowervp); - panic ("null_node_create: lower has 0 usecount."); - }; -#endif - -#ifdef NULLFS_DIAGNOSTIC - vprint("null_node_create: alias", aliasvp); - vprint("null_node_create: lower", lowervp); -#endif - - *newvpp = aliasvp; - return (0); -} -#ifdef NULLFS_DIAGNOSTIC -struct vnode * -null_checkvp(vp, fil, lno) - struct vnode *vp; - char *fil; - int lno; -{ - struct null_node *a = VTONULL(vp); -#ifdef notyet - /* - * Can't do this check because vnop_reclaim runs - * with a funny vop vector. - */ - if (vp->v_op != null_vnodeop_p) { - printf ("null_checkvp: on non-null-node\n"); - while (null_checkvp_barrier) /*WAIT*/ ; - panic("null_checkvp"); - }; -#endif - if (a->null_lowervp == NULL) { - /* Should never happen */ - int i; uint32_t *p; - printf("vp = %x, ZERO ptr\n", vp); - for (p = (uint32_t *) a, i = 0; i < 8; i++) - printf(" %x", p[i]); - printf("\n"); - /* wait for debugger */ - while (null_checkvp_barrier) /*WAIT*/ ; - panic("null_checkvp"); - } - if (a->null_lowervp->v_usecount < 1) { - int i; uint32_t *p; - printf("vp = %x, unref'ed lowervp\n", vp); - for (p = (uint32_t *) a, i = 0; i < 8; i++) - printf(" %x", p[i]); - printf("\n"); - /* wait for debugger */ - while (null_checkvp_barrier) /*WAIT*/ ; - panic ("null with unref'ed lowervp"); - }; -#ifdef notyet - printf("null %x/%d -> %x/%d [%s, %d]\n", - NULLTOV(a), NULLTOV(a)->v_usecount, - a->null_lowervp, a->null_lowervp->v_usecount, - fil, lno); -#endif - return a->null_lowervp; -} -#endif diff --git a/bsd/miscfs/nullfs/null_vfsops.c b/bsd/miscfs/nullfs/null_vfsops.c deleted file mode 100644 index e81c64059..000000000 --- a/bsd/miscfs/nullfs/null_vfsops.c +++ /dev/null @@ -1,382 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ -/* - * Copyright (c) 1992, 1993 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software donated to Berkeley by - * Jan-Simon Pendry. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)null_vfsops.c 8.7 (Berkeley) 5/14/95 - * - * @(#)lofs_vfsops.c 1.2 (Berkeley) 6/18/92 - */ - -/* - * Null Layer - * (See null_vnops.c for a description of what this does.) 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Mount null layer - */ -static int -nullfs_mount(mp, devvp, data, context) - struct mount *mp; - vnode_t devvp; - user_addr_t data; - vfs_context_t context; -{ - int error = 0; - struct user_null_args args; - struct vnode *lowerrootvp, *vp; - struct vnode *nullm_rootvp; - struct null_mount *xmp; - u_int size; - -#ifdef NULLFS_DIAGNOSTIC - printf("nullfs_mount(mp = %x)\n", mp); -#endif - - /* - * Update is a no-op - */ - if (mp->mnt_flag & MNT_UPDATE) { - return (ENOTSUP); - /* return VFS_MOUNT(MOUNTTONULLMOUNT(mp)->nullm_vfs, devvp, data, p);*/ - } - - /* - * Get argument - */ - if (vfs_context_is64bit(context)) { - error = copyin(data, (caddr_t)&args, sizeof (args)); - } - else { - struct null_args temp; - error = copyin(data, (caddr_t)&temp, sizeof (temp)); - args.target = CAST_USER_ADDR_T(temp.target); - } - if (error) - return (error); - - /* - * Find lower node - */ - NDINIT(ndp, LOOKUP, FOLLOW|WANTPARENT|LOCKLEAF, - UIO_USERSPACE, args.target, context); - if (error = namei(ndp)) - return (error); - nameidone(ndp); - /* - * Sanity check on lower vnode - */ - lowerrootvp = ndp->ni_vp; - - vnode_put(ndp->ni_dvp); - ndp->ni_dvp = NULL; - - xmp = (struct null_mount *) _MALLOC(sizeof(struct null_mount), - M_UFSMNT, M_WAITOK); /* XXX */ - - /* - * Save reference to underlying FS - */ - xmp->nullm_vfs = lowerrootvp->v_mount; - - /* - * Save reference. Each mount also holds - * a reference on the root vnode. - */ - error = null_node_create(mp, lowerrootvp, &vp); - /* - * Make sure the node alias worked - */ - if (error) { - vnode_put(lowerrootvp); - FREE(xmp, M_UFSMNT); /* XXX */ - return (error); - } - - /* - * Keep a held reference to the root vnode. - * It is vnode_put'd in nullfs_unmount. 
- */ - nullm_rootvp = vp; - nullm_rootvp->v_flag |= VROOT; - xmp->nullm_rootvp = nullm_rootvp; - if (NULLVPTOLOWERVP(nullm_rootvp)->v_mount->mnt_flag & MNT_LOCAL) - mp->mnt_flag |= MNT_LOCAL; - mp->mnt_data = (qaddr_t) xmp; - vfs_getnewfsid(mp); - - (void) copyinstr(args.target, mp->mnt_vfsstat.f_mntfromname, MAXPATHLEN - 1, - &size); - bzero(mp->mnt_vfsstat.f_mntfromname + size, MNAMELEN - size); -#ifdef NULLFS_DIAGNOSTIC - printf("nullfs_mount: lower %s, alias at %s\n", - mp->mnt_vfsstat.f_mntfromname, mp->mnt_vfsstat.f_mntonname); -#endif - return (0); -} - -/* - * VFS start. Nothing needed here - the start routine - * on the underlying filesystem will have been called - * when that filesystem was mounted. - */ -static int -nullfs_start(mp, flags, context) - struct mount *mp; - int flags; - vfs_context_t context; -{ - return (0); - /* return VFS_START(MOUNTTONULLMOUNT(mp)->nullm_vfs, flags, context); */ -} - -/* - * Free reference to null layer - */ -static int -nullfs_unmount(mp, mntflags, context) - struct mount *mp; - int mntflags; - vfs_context_t context; -{ - struct vnode *nullm_rootvp = MOUNTTONULLMOUNT(mp)->nullm_rootvp; - int error; - int flags = 0; - int force = 0; - -#ifdef NULLFS_DIAGNOSTIC - printf("nullfs_unmount(mp = %x)\n", mp); -#endif - - if (mntflags & MNT_FORCE) { - flags |= FORCECLOSE; - force = 1; - } - - if ( (nullm_rootvp->v_usecount > 1) && !force ) - return (EBUSY); - if ( (error = vflush(mp, nullm_rootvp, flags)) && !force ) - return (error); - -#ifdef NULLFS_DIAGNOSTIC - vprint("alias root of lower", nullm_rootvp); -#endif - /* - * Release reference on underlying root vnode - */ - vnode_put(nullm_rootvp); - /* - * And blow it away for future re-use - */ - vnode_reclaim(nullm_rootvp); - /* - * Finally, throw away the null_mount structure - */ - FREE(mp->mnt_data, M_UFSMNT); /* XXX */ - mp->mnt_data = 0; - return 0; -} - -static int -nullfs_root(mp, vpp, context) - struct mount *mp; - struct vnode **vpp; - vfs_context_t context; -{ - 
struct proc *p = curproc; /* XXX */ - struct vnode *vp; - -#ifdef NULLFS_DIAGNOSTIC - printf("nullfs_root(mp = %x, vp = %x->%x)\n", mp, - MOUNTTONULLMOUNT(mp)->nullm_rootvp, - NULLVPTOLOWERVP(MOUNTTONULLMOUNT(mp)->nullm_rootvp) - ); -#endif - - /* - * Return locked reference to root. - */ - vp = MOUNTTONULLMOUNT(mp)->nullm_rootvp; - vnode_get(vp); - *vpp = vp; - return 0; -} - -static int -nullfs_quotactl(mp, cmd, uid, datap, context) - struct mount *mp; - int cmd; - uid_t uid; - caddr_t datap; - vfs_context_t context; -{ - return VFS_QUOTACTL(MOUNTTONULLMOUNT(mp)->nullm_vfs, cmd, uid, datap, context); -} - -static int -nullfs_statfs(mp, sbp, context) - struct mount *mp; - struct vfsstatfs *sbp; - vfs_context_t context; -{ - int error; - struct vfsstatfs mstat; - -#ifdef NULLFS_DIAGNOSTIC - printf("nullfs_statfs(mp = %x, vp = %x->%x)\n", mp, - MOUNTTONULLMOUNT(mp)->nullm_rootvp, - NULLVPTOLOWERVP(MOUNTTONULLMOUNT(mp)->nullm_rootvp) - ); -#endif - - bzero(&mstat, sizeof(mstat)); - - error = VFS_STATFS(MOUNTTONULLMOUNT(mp)->nullm_vfs, &mstat, context); - if (error) - return (error); - - /* now copy across the "interesting" information and fake the rest */ - //sbp->f_type = mstat.f_type; - sbp->f_flags = mstat.f_flags; - sbp->f_bsize = mstat.f_bsize; - sbp->f_iosize = mstat.f_iosize; - sbp->f_blocks = mstat.f_blocks; - sbp->f_bfree = mstat.f_bfree; - sbp->f_bavail = mstat.f_bavail; - sbp->f_files = mstat.f_files; - sbp->f_ffree = mstat.f_ffree; - return (0); -} - -static int -nullfs_sync(__unused struct mount *mp, __unused int waitfor, - __unused kauth_cred_t cred, __unused vfs_context_t context) -{ - /* - * XXX - Assumes no data cached at null layer. 
- */ - return (0); -} - -static int -nullfs_vget(mp, ino, vpp, context) - struct mount *mp; - ino64_t ino; - struct vnode **vpp; - vfs_context_t context; -{ - - return VFS_VGET(MOUNTTONULLMOUNT(mp)->nullm_vfs, ino, vpp, context); -} - -static int -nullfs_fhtovp(mp, fhlen, fhp, vpp, context) - struct mount *mp; - int fhlen; - unsigned char *fhp; - struct vnode **vpp; - vfs_context_t context; -{ - - return VFS_FHTOVP(MOUNTTONULLMOUNT(mp)->nullm_vfs, fhlen, fhp, vpp, context); -} - -static int -nullfs_vptofh(vp, fhlenp, fhp, context) - struct vnode *vp; - int *fhlenp; - unsigned char *fhp; - vfs_context_t context; -{ - return VFS_VPTOFH(NULLVPTOLOWERVP(vp), fhlenp, fhp, context); -} - -int nullfs_init (struct vfsconf *); - -#define nullfs_sysctl (int (*) (int *, u_int, user_addr_t, size_t *, user_addr_t, size_t, proc_t))eopnotsupp - -struct vfsops null_vfsops = { - nullfs_mount, - nullfs_start, - nullfs_unmount, - nullfs_root, - nullfs_quotactl, - nullfs_statfs, - nullfs_sync, - nullfs_vget, - nullfs_fhtovp, - nullfs_vptofh, - nullfs_init, - nullfs_sysctl -}; diff --git a/bsd/miscfs/nullfs/null_vnops.c b/bsd/miscfs/nullfs/null_vnops.c deleted file mode 100644 index 4b2fb2bbf..000000000 --- a/bsd/miscfs/nullfs/null_vnops.c +++ /dev/null @@ -1,570 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. 
- * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ -/* - * Copyright (c) 1992, 1993 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * John Heidemann of the UCLA Ficus project. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)null_vnops.c 8.6 (Berkeley) 5/27/95 - * - * Ancestors: - * @(#)lofs_vnops.c 1.2 (Berkeley) 6/18/92 - * ...and... - * @(#)null_vnodeops.c 1.20 92/07/07 UCLA Ficus project - */ - -/* - * Null Layer - * - * (See mount_null(8) for more information.) - * - * The null layer duplicates a portion of the file system - * name space under a new name. In this respect, it is - * similar to the loopback file system. It differs from - * the loopback fs in two respects: it is implemented using - * a stackable layers techniques, and it's "null-node"s stack above - * all lower-layer vnodes, not just over directory vnodes. - * - * The null layer has two purposes. First, it serves as a demonstration - * of layering by proving a layer which does nothing. (It actually - * does everything the loopback file system does, which is slightly - * more than nothing.) Second, the null layer can serve as a prototype - * layer. Since it provides all necessary layer framework, - * new file system layers can be created very easily be starting - * with a null layer. - * - * The remainder of this man page examines the null layer as a basis - * for constructing new layers. 
- * - * - * INSTANTIATING NEW NULL LAYERS - * - * New null layers are created with mount_null(8). - * Mount_null(8) takes two arguments, the pathname - * of the lower vfs (target-pn) and the pathname where the null - * layer will appear in the namespace (alias-pn). After - * the null layer is put into place, the contents - * of target-pn subtree will be aliased under alias-pn. - * - * - * OPERATION OF A NULL LAYER - * - * The null layer is the minimum file system layer, - * simply bypassing all possible operations to the lower layer - * for processing there. The majority of its activity centers - * on the bypass routine, though which nearly all vnode operations - * pass. - * - * The bypass routine accepts arbitrary vnode operations for - * handling by the lower layer. It begins by examing vnode - * operation arguments and replacing any null-nodes by their - * lower-layer equivlants. It then invokes the operation - * on the lower layer. Finally, it replaces the null-nodes - * in the arguments and, if a vnode is return by the operation, - * stacks a null-node on top of the returned vnode. - * - * Although bypass handles most operations, vnop_getattr, vnop_lock, - * vnop_unlock, vnop_inactive, vnop_reclaim, and vnop_print are not - * bypassed. Vop_getattr must change the fsid being returned. - * Vop_lock and vnop_unlock must handle any locking for the - * current vnode as well as pass the lock request down. - * Vop_inactive and vnop_reclaim are not bypassed so that - * they can handle freeing null-layer specific data. Vop_print - * is not bypassed to avoid excessive debugging information. - * Also, certain vnode operations change the locking state within - * the operation (create, mknod, remove, link, rename, mkdir, rmdir, - * and symlink). Ideally these operations should not change the - * lock state, but should be changed to let the caller of the - * function unlock them. 
Otherwise all intermediate vnode layers - * (such as union, umapfs, etc) must catch these functions to do - * the necessary locking at their layer. - * - * - * INSTANTIATING VNODE STACKS - * - * Mounting associates the null layer with a lower layer, - * effect stacking two VFSes. Vnode stacks are instead - * created on demand as files are accessed. - * - * The initial mount creates a single vnode stack for the - * root of the new null layer. All other vnode stacks - * are created as a result of vnode operations on - * this or other null vnode stacks. - * - * New vnode stacks come into existance as a result of - * an operation which returns a vnode. - * The bypass routine stacks a null-node above the new - * vnode before returning it to the caller. - * - * For example, imagine mounting a null layer with - * "mount_null /usr/include /dev/layer/null". - * Changing directory to /dev/layer/null will assign - * the root null-node (which was created when the null layer was mounted). - * Now consider opening "sys". A vnop_lookup would be - * done on the root null-node. This operation would bypass through - * to the lower layer which would return a vnode representing - * the UFS "sys". Null_bypass then builds a null-node - * aliasing the UFS "sys" and returns this to the caller. - * Later operations on the null-node "sys" will repeat this - * process when constructing other vnode stacks. - * - * - * CREATING OTHER FILE SYSTEM LAYERS - * - * One of the easiest ways to construct new file system layers is to make - * a copy of the null layer, rename all files and variables, and - * then begin modifing the copy. Sed can be used to easily rename - * all variables. - * - * The umap layer is an example of a layer descended from the - * null layer. - * - * - * INVOKING OPERATIONS ON LOWER LAYERS - * - * There are two techniques to invoke operations on a lower layer - * when the operation cannot be completely bypassed. Each method - * is appropriate in different situations. 
In both cases, - * it is the responsibility of the aliasing layer to make - * the operation arguments "correct" for the lower layer - * by mapping an vnode arguments to the lower layer. - * - * The first approach is to call the aliasing layer's bypass routine. - * This method is most suitable when you wish to invoke the operation - * currently being hanldled on the lower layer. It has the advantage - * that the bypass routine already must do argument mapping. - * An example of this is null_getattrs in the null layer. - * - * A second approach is to directly invoked vnode operations on - * the lower layer with the VOP_OPERATIONNAME interface. - * The advantage of this method is that it is easy to invoke - * arbitrary operations on the lower layer. The disadvantage - * is that vnodes arguments must be manualy mapped. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -int null_bug_bypass = 0; /* for debugging: enables bypass printf'ing */ - -/* - * This is the 10-Apr-92 bypass routine. - * This version has been optimized for speed, throwing away some - * safety checks. It should still always work, but it's not as - * robust to programmer errors. - * Define SAFETY to include some error checking code. - * - * In general, we map all vnodes going down and unmap them on the way back. - * As an exception to this, vnodes can be marked "unmapped" by setting - * the Nth bit in operation's vdesc_flags. - * - * Also, some BSD vnode operations have the side effect of node_put'ing - * their arguments. With stacking, the reference counts are held - * by the upper node, not the lower one, so we must handle these - * side-effects here. This is not of concern in Sun-derived systems - * since there are no such side-effects. 
- * - * This makes the following assumptions: - * - only one returned vpp - * - no INOUT vpp's (Sun's vnop_open has one of these) - * - the vnode operation vector of the first vnode should be used - * to determine what implementation of the op should be invoked - * - all mapped vnodes are of our vnode-type (NEEDSWORK: - * problems on rmdir'ing mount points and renaming?) - */ -int -null_bypass(ap) - struct vnop_generic_args /* { - struct vnodeop_desc *a_desc; - - } */ *ap; -{ - extern int (**null_vnodeop_p)(void *); /* not extern, really "forward" */ - register struct vnode **this_vp_p; - int error; - struct vnode *old_vps[VDESC_MAX_VPS]; - struct vnode **vps_p[VDESC_MAX_VPS]; - struct vnode ***vppp; - struct vnodeop_desc *descp = ap->a_desc; - int reles, i; - - if (null_bug_bypass) - printf ("null_bypass: %s\n", descp->vdesc_name); - -#ifdef SAFETY - /* - * We require at least one vp. - */ - if (descp->vdesc_vp_offsets == NULL || - descp->vdesc_vp_offsets[0] == VDESC_NO_OFFSET) - panic ("null_bypass: no vp's in map.\n"); -#endif - - /* - * Map the vnodes going in. - * Later, we'll invoke the operation based on - * the first mapped vnode's operation vector. - */ - reles = descp->vdesc_flags; - for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) { - if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET) - break; /* bail out at end of list */ - vps_p[i] = this_vp_p = - VOPARG_OFFSETTO(struct vnode**,descp->vdesc_vp_offsets[i],ap); - /* - * We're not guaranteed that any but the first vnode - * are of our type. Check for and don't map any - * that aren't. (We must always map first vp or vclean fails.) - */ - if (i && (*this_vp_p == NULL || - (*this_vp_p)->v_op != null_vnodeop_p)) { - old_vps[i] = NULL; - } else { - old_vps[i] = *this_vp_p; - *(vps_p[i]) = NULLVPTOLOWERVP(*this_vp_p); - /* - * XXX - Several operations have the side effect - * of vnode_put'ing their vp's. We must account for - * that. (This should go away in the future.) 
- */ - if (reles & 1) - vnode_get(*this_vp_p); - } - - } - - /* - * Call the operation on the lower layer - * with the modified argument structure. - */ - error = VCALL(*(vps_p[0]), descp->vdesc_offset, ap); - - /* - * Maintain the illusion of call-by-value - * by restoring vnodes in the argument structure - * to their original value. - */ - reles = descp->vdesc_flags; - for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) { - if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET) - break; /* bail out at end of list */ - if (old_vps[i]) { - *(vps_p[i]) = old_vps[i]; - if (reles & 1) - vnode_put(*(vps_p[i])); - } - } - - /* - * Map the possible out-going vpp - * (Assumes that the lower layer always returns - * a vnode_get'ed vpp unless it gets an error.) - */ - if (descp->vdesc_vpp_offset != VDESC_NO_OFFSET && - !(descp->vdesc_flags & VDESC_NOMAP_VPP) && - !error) { - /* - * XXX - even though some ops have vpp returned vp's, - * several ops actually vnode_put this before returning. - * We must avoid these ops. - * (This should go away when these ops are regularized.) - */ - if (descp->vdesc_flags & VDESC_VPP_WILLRELE) - goto out; - vppp = VOPARG_OFFSETTO(struct vnode***, - descp->vdesc_vpp_offset,ap); - error = null_node_create(old_vps[0]->v_mount, **vppp, *vppp); - } - - out: - return (error); -} - -/* - * We have to carry on the locking protocol on the null layer vnodes - * as we progress through the tree. We also have to enforce read-only - * if this layer is mounted read-only. - */ -null_lookup(ap) - struct vnop_lookup_args /* { - struct vnode * a_dvp; - struct vnode ** a_vpp; - struct componentname * a_cnp; - vfs_context_t a_context; - } */ *ap; -{ - struct componentname *cnp = ap->a_cnp; - struct proc *p = cnp->cn_proc; - int flags = cnp->cn_flags; - struct vnode *dvp, *vp; - int error; - - error = null_bypass(ap); - - /* - * We must do the same locking and unlocking at this layer as - * is done in the layers below us. 
We could figure this out - * based on the error return and the LASTCN, LOCKPARENT, and - * LOCKLEAF flags. However, it is more expidient to just find - * out the state of the lower level vnodes and set ours to the - * same state. - */ - dvp = ap->a_dvp; - vp = *ap->a_vpp; - if (dvp == vp) - return (error); - return (error); -} - -/* - * Setattr call. - */ -int -null_setattr( - struct vnop_setattr_args /* { - struct vnodeop_desc *a_desc; - struct vnode *a_vp; - struct vnode_attr *a_vap; - kauth_cred_t a_cred; - struct proc *a_p; - } */ *ap) -{ - struct vnode *vp = ap->a_vp; - struct vnode_attr *vap = ap->a_vap; - - if (VATTR_IS_ACTIVE(vap, va_data_size)) { - switch (vp->v_type) { - case VDIR: - return (EISDIR); - case VCHR: - case VBLK: - case VSOCK: - case VFIFO: - return (0); - case VREG: - case VLNK: - default: - } - } - return (null_bypass(ap)); -} - -/* - * We handle getattr only to change the fsid. - */ -int -null_getattr(ap) - struct vnop_getattr_args /* { - struct vnode *a_vp; - struct vnode_attr *a_vap; - vfs_context_t a_context; - } */ *ap; -{ - int error; - - if (error = null_bypass(ap)) - return (error); - /* Requires that arguments be restored. */ - VATTR_RETURN(ap->a_vap, va_fsid, ap->a_vp->v_mount->mnt_vfsstat.f_fsid.val[0]); - return (0); -} - -int -null_access(ap) - struct vnop_access_args /* { - struct vnode *a_vp; - int a_action; - vfs_context_t a_context; - } */ *ap; -{ - return (null_bypass(ap)); -} - -int -null_inactive(ap) - struct vnop_inactive_args /* { - struct vnode *a_vp; - vfs_context_t a_context; - } */ *ap; -{ - /* - * Do nothing (and _don't_ bypass). - * Wait to vnode_put lowervp until reclaim, - * so that until then our null_node is in the - * cache and reusable. - * - * NEEDSWORK: Someday, consider inactive'ing - * the lowervp and then trying to reactivate it - * with capabilities (v_id) - * like they do in the name lookup cache code. - * That's too much work for now. 
- */ - return (0); -} - -int -null_reclaim(ap) - struct vnop_reclaim_args /* { - struct vnode *a_vp; - vfs_context_t a_context; - } */ *ap; -{ - struct vnode *vp = ap->a_vp; - struct null_node *xp = VTONULL(vp); - struct vnode *lowervp = xp->null_lowervp; - - /* - * Note: in vnop_reclaim, vp->v_op == dead_vnodeop_p, - * so we can't call VOPs on ourself. - */ - /* After this assignment, this node will not be re-used. */ - xp->null_lowervp = NULL; - LIST_REMOVE(xp, null_hash); - FREE(vp->v_data, M_TEMP); - vp->v_data = NULL; - vnode_put (lowervp); - return (0); -} - -/* - * XXX - vnop_strategy must be hand coded because it has no - * vnode in its arguments. - * This goes away with a merged VM/buffer cache. - */ -int -null_strategy(ap) - struct vnop_strategy_args /* { - struct buf *a_bp; - } */ *ap; -{ - struct buf *bp = ap->a_bp; - int error; - struct vnode *savedvp; - - savedvp = vnode(bp); - buf_setvnode(bp, NULLVPTOLOWERVP(savedvp)); - - error = VNOP_STRATEGY(bp); - - buf_setvnode(bp, savedvp); - - return (error); -} - -/* - * XXX - like vnop_strategy, vnop_bwrite must be hand coded because it has no - * vnode in its arguments. - * This goes away with a merged VM/buffer cache. 
- */ -int -null_bwrite(ap) - struct vnop_bwrite_args /* { - struct buf *a_bp; - } */ *ap; -{ - struct buf *bp = ap->a_bp; - int error; - struct vnode *savedvp; - - savedvp = buf_vnode(bp); - buf_setvnode(bp, NULLVPTOLOWERVP(savedvp)); - - error = VNOP_BWRITE(bp); - - buf_setvnode(bp, savedvp); - - return (error); -} - -/* - * Global vfs data structures - */ - -#define VOPFUNC int (*)(void *) - -int (**null_vnodeop_p)(void *); -struct vnodeopv_entry_desc null_vnodeop_entries[] = { - { &vnop_default_desc, (VOPFUNC)null_bypass }, - - { &vnop_lookup_desc, (VOPFUNC)null_lookup }, - { &vnop_setattr_desc, (VOPFUNC)null_setattr }, - { &vnop_getattr_desc, (VOPFUNC)null_getattr }, - { &vnop_access_desc, (VOPFUNC)null_access }, - { &vnop_inactive_desc, (VOPFUNC)null_inactive }, - { &vnop_reclaim_desc, (VOPFUNC)null_reclaim }, - - { &vnop_strategy_desc, (VOPFUNC)null_strategy }, - { &vnop_bwrite_desc, (VOPFUNC)null_bwrite }, - - { (struct vnodeop_desc*)NULL, (int(*)())NULL } -}; -struct vnodeopv_desc null_vnodeop_opv_desc = - { &null_vnodeop_p, null_vnodeop_entries }; diff --git a/bsd/miscfs/specfs/Makefile b/bsd/miscfs/specfs/Makefile index 52832cc71..7c6f583e4 100644 --- a/bsd/miscfs/specfs/Makefile +++ b/bsd/miscfs/specfs/Makefile @@ -9,14 +9,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ DATAFILES = \ diff --git a/bsd/miscfs/specfs/spec_vnops.c b/bsd/miscfs/specfs/spec_vnops.c index cbd0de6d9..8050679f8 100644 --- a/bsd/miscfs/specfs/spec_vnops.c +++ b/bsd/miscfs/specfs/spec_vnops.c @@ -69,8 +69,9 @@ #include #include #include -#include #include +#include +#include #include #include #include @@ -82,6 +83,8 @@ #include #include #include +#include +#include #include @@ -247,7 +250,15 @@ spec_open(struct vnop_open_args *ap) vp->v_flag |= VISTTY; vnode_unlock(vp); } + + devsw_lock(dev, S_IFCHR); error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p); + 
+ if (error == 0) { + vp->v_specinfo->si_opencount++; + } + + devsw_unlock(dev, S_IFCHR); return (error); case VBLK: @@ -266,7 +277,14 @@ spec_open(struct vnop_open_args *ap) */ if ( (error = vfs_mountedon(vp)) ) return (error); + + devsw_lock(dev, S_IFBLK); error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p); + if (!error) { + vp->v_specinfo->si_opencount++; + } + devsw_unlock(dev, S_IFBLK); + if (!error) { u_int64_t blkcnt; u_int32_t blksize; @@ -382,7 +400,7 @@ spec_read(struct vnop_read_args *ap) } n = min((unsigned)(n - on), uio_resid(uio)); - error = uiomove((char *)0 + buf_dataptr(bp) + on, n, uio); + error = uiomove((char *)buf_dataptr(bp) + on, n, uio); if (n + on == bsize) buf_markaged(bp); buf_brelse(bp); @@ -484,7 +502,7 @@ spec_write(struct vnop_write_args *ap) } n = min(n, bsize - buf_resid(bp)); - error = uiomove((char *)0 + buf_dataptr(bp) + on, n, uio); + error = uiomove((char *)buf_dataptr(bp) + on, n, uio); if (error) { buf_brelse(bp); return (error); @@ -562,6 +580,8 @@ spec_select(struct vnop_select_args *ap) } } +static int filt_specattach(struct knote *kn); + int spec_kqfilter(vnode_t vp, struct knote *kn) { @@ -575,8 +595,8 @@ spec_kqfilter(vnode_t vp, struct knote *kn) dev = vnode_specrdev(vp); if (vnode_istty(vp)) { - /* We can hook into the slave side of a tty */ - err = ptsd_kqfilter(dev, kn); + /* We can hook into TTYs... 
*/ + err = filt_specattach(kn); } else { /* Try a bpf device, as defined in bsd/net/bpf.c */ err = bpfkqfilter(dev, kn); @@ -618,8 +638,12 @@ void IOSleep(int); #define LOWPRI_WINDOW_MSECS_INC 50 #define LOWPRI_MAX_WINDOW_MSECS 200 #define LOWPRI_MAX_WAITING_MSECS 200 -#define LOWPRI_SLEEP_INTERVAL 5 +#if CONFIG_EMBEDDED +#define LOWPRI_SLEEP_INTERVAL 5 +#else +#define LOWPRI_SLEEP_INTERVAL 2 +#endif struct _throttle_io_info_t { struct timeval last_normal_IO_timestamp; @@ -627,7 +651,6 @@ struct _throttle_io_info_t { SInt32 numthreads_throttling; SInt32 refcnt; SInt32 alloc; - }; struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV]; @@ -647,10 +670,31 @@ int lowpri_max_waiting_msecs = LOWPRI_MAX_WAITING_MSECS; #define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...) #endif -SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_initial_window_msecs, CTLFLAG_RW, &lowpri_IO_initial_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, ""); -SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_window_inc, CTLFLAG_RW, &lowpri_IO_window_msecs_inc, LOWPRI_INITIAL_WINDOW_MSECS, ""); -SYSCTL_INT(_debug, OID_AUTO, lowpri_max_window_msecs, CTLFLAG_RW, &lowpri_max_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, ""); -SYSCTL_INT(_debug, OID_AUTO, lowpri_max_waiting_msecs, CTLFLAG_RW, &lowpri_max_waiting_msecs, LOWPRI_INITIAL_WINDOW_MSECS, ""); +SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_initial_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_IO_initial_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, ""); +SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_window_inc, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_IO_window_msecs_inc, LOWPRI_INITIAL_WINDOW_MSECS, ""); +SYSCTL_INT(_debug, OID_AUTO, lowpri_max_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_max_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, ""); +SYSCTL_INT(_debug, OID_AUTO, lowpri_max_waiting_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_max_waiting_msecs, LOWPRI_INITIAL_WINDOW_MSECS, ""); + +/* + * throttled I/O helper function + * convert the index of the lowest set 
bit to a device index + */ +int +num_trailing_0(uint64_t n) +{ + /* + * since in most cases the number of trailing 0s is very small, + * we simply counting sequentially from the lowest bit + */ + if (n == 0) + return sizeof(n) * 8; + int count = 0; + while (!ISSET(n, 1)) { + n >>= 1; + ++count; + } + return count; +} /* * Release the reference and if the item was allocated and this is the last @@ -760,6 +804,41 @@ throttle_info_mount_ref(mount_t mp, void *throttle_info) mp->mnt_throttle_info = throttle_info; } +/* + * Private KPI routine + * + * return a handle for accessing throttle_info given a throttle_mask. The + * handle must be released by throttle_info_rel_by_mask + */ +int +throttle_info_ref_by_mask(uint64_t throttle_mask, + throttle_info_handle_t *throttle_info_handle) +{ + int dev_index; + struct _throttle_io_info_t *info; + + if (throttle_info_handle == NULL) + return EINVAL; + + dev_index = num_trailing_0(throttle_mask); + info = &_throttle_io_info[dev_index]; + throttle_info_ref(info); + *(struct _throttle_io_info_t**)throttle_info_handle = info; + return 0; +} + +/* + * Private KPI routine + * + * release the handle obtained by throttle_info_ref_by_mask + */ +void +throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle) +{ + /* for now the handle is just a pointer to _throttle_io_info_t */ + throttle_info_rel((struct _throttle_io_info_t*)throttle_info_handle); +} + /* * KPI routine * @@ -804,12 +883,51 @@ update_last_io_time(mount_t mp) microuptime(&info->last_IO_timestamp); } + +#if CONFIG_EMBEDDED + +int throttle_get_io_policy(struct uthread **ut) +{ + int policy = IOPOL_DEFAULT; + proc_t p = current_proc(); + + *ut = get_bsdthread_info(current_thread()); + + if (p != NULL) + policy = p->p_iopol_disk; + + if (*ut != NULL) { + // the I/O policy of the thread overrides that of the process + // unless the I/O policy of the thread is default + if ((*ut)->uu_iopol_disk != IOPOL_DEFAULT) + policy = (*ut)->uu_iopol_disk; + } + return policy; 
+} +#else + +int throttle_get_io_policy(__unused struct uthread **ut) +{ + *ut = get_bsdthread_info(current_thread()); + + return (proc_get_task_selfdiskacc()); +} +#endif + + static int throttle_io_will_be_throttled_internal(int lowpri_window_msecs, void * throttle_info) { struct _throttle_io_info_t *info = throttle_info; struct timeval elapsed; int elapsed_msecs; + int policy; + struct uthread *ut; + + policy = throttle_get_io_policy(&ut); + + if (ut->uu_throttle_bc == FALSE && policy != IOPOL_THROTTLE) + return (0); microuptime(&elapsed); timevalsub(&elapsed, &info->last_normal_IO_timestamp); @@ -841,12 +959,15 @@ throttle_io_will_be_throttled(int lowpri_window_msecs, mount_t mp) return throttle_io_will_be_throttled_internal(lowpri_window_msecs, info); } -void throttle_lowpri_io(boolean_t ok_to_sleep) +uint32_t +throttle_lowpri_io(int sleep_amount) { - int i; + int sleep_cnt = 0; + int numthreads_throttling; int max_try_num; struct uthread *ut; struct _throttle_io_info_t *info; + int max_waiting_msecs; ut = get_bsdthread_info(current_thread()); @@ -854,23 +975,39 @@ void throttle_lowpri_io(boolean_t ok_to_sleep) goto done; info = ut->uu_throttle_info; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, - ut->uu_lowpri_window, ok_to_sleep, 0, 0, 0); - if (ok_to_sleep == TRUE) { - max_try_num = lowpri_max_waiting_msecs / LOWPRI_SLEEP_INTERVAL * MAX(1, info->numthreads_throttling); + if (sleep_amount != 0) { +#if CONFIG_EMBEDDED + max_waiting_msecs = lowpri_max_waiting_msecs; +#else + if (ut->uu_throttle_isssd == TRUE) + max_waiting_msecs = lowpri_max_waiting_msecs / 100; + else + max_waiting_msecs = lowpri_max_waiting_msecs; +#endif + if (max_waiting_msecs < LOWPRI_SLEEP_INTERVAL) + max_waiting_msecs = LOWPRI_SLEEP_INTERVAL; - for (i=0; inumthreads_throttling + MIN(10, MAX(1, sleep_amount)) - 1; + max_try_num = max_waiting_msecs / LOWPRI_SLEEP_INTERVAL * MAX(1, numthreads_throttling); + + for (sleep_cnt = 0; sleep_cnt < max_try_num; sleep_cnt++) { if 
(throttle_io_will_be_throttled_internal(ut->uu_lowpri_window, info)) { + if (sleep_cnt == 0) { + KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, + ut->uu_lowpri_window, max_try_num, numthreads_throttling, 0, 0); + } IOSleep(LOWPRI_SLEEP_INTERVAL); DEBUG_ALLOC_THROTTLE_INFO("sleeping because of info = %p\n", info, info ); } else { break; } } + if (sleep_cnt) { + KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, + ut->uu_lowpri_window, sleep_cnt, 0, 0, 0); + } } - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, - ut->uu_lowpri_window, i*5, 0, 0, 0); SInt32 oldValue; oldValue = OSDecrementAtomic(&info->numthreads_throttling); @@ -882,35 +1019,72 @@ void throttle_lowpri_io(boolean_t ok_to_sleep) if (ut->uu_throttle_info) throttle_info_rel(ut->uu_throttle_info); ut->uu_throttle_info = NULL; + ut->uu_throttle_bc = FALSE; + + return (sleep_cnt * LOWPRI_SLEEP_INTERVAL); } -int throttle_get_io_policy(struct uthread **ut) +/* + * KPI routine + * + * set a kernel thread's IO policy. 
policy can be: + * IOPOL_NORMAL, IOPOL_THROTTLE, IOPOL_PASSIVE + * + * explanations about these policies are in the man page of setiopolicy_np + */ +void throttle_set_thread_io_policy(int policy) { - int policy = IOPOL_DEFAULT; - proc_t p = current_proc(); +#if !CONFIG_EMBEDDED + proc_apply_thread_selfdiskacc(policy); +#else /* !CONFIG_EMBEDDED */ + struct uthread *ut; + ut = get_bsdthread_info(current_thread()); + ut->uu_iopol_disk = policy; +#endif /* !CONFIG_EMBEDDED */ +} - *ut = get_bsdthread_info(current_thread()); - - if (p != NULL) - policy = p->p_iopol_disk; - if (*ut != NULL) { - // the I/O policy of the thread overrides that of the process - // unless the I/O policy of the thread is default - if ((*ut)->uu_iopol_disk != IOPOL_DEFAULT) - policy = (*ut)->uu_iopol_disk; +static +void throttle_info_reset_window(struct uthread *ut) +{ + struct _throttle_io_info_t *info; + + info = ut->uu_throttle_info; + + OSDecrementAtomic(&info->numthreads_throttling); + throttle_info_rel(info); + ut->uu_throttle_info = NULL; + ut->uu_lowpri_window = 0; +} + +static +void throttle_info_set_initial_window(struct uthread *ut, struct _throttle_io_info_t *info, boolean_t isssd, boolean_t BC_throttle) +{ + SInt32 oldValue; + + ut->uu_throttle_info = info; + throttle_info_ref(info); + DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info ); + + oldValue = OSIncrementAtomic(&info->numthreads_throttling); + if (oldValue < 0) { + panic("%s: numthreads negative", __func__); } - return policy; + ut->uu_lowpri_window = lowpri_IO_initial_window_msecs; + ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * oldValue; + ut->uu_throttle_isssd = isssd; + ut->uu_throttle_bc = BC_throttle; } -void throttle_info_update(void *throttle_info, int flags) + +static +void throttle_info_update_internal(void *throttle_info, int flags, boolean_t isssd) { struct _throttle_io_info_t *info = throttle_info; struct uthread *ut; int policy; int is_throttleable_io = 0; int is_passive_io = 0; - SInt32 
oldValue; if (!lowpri_IO_initial_window_msecs || (info == NULL)) return; @@ -949,28 +1123,19 @@ void throttle_info_update(void *throttle_info, int flags) * do the delay just before we return from the system * call that triggered this I/O or from vnode_pagein */ - if (ut->uu_lowpri_window == 0) { - ut->uu_throttle_info = info; - throttle_info_ref(ut->uu_throttle_info); - DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info ); - - oldValue = OSIncrementAtomic(&info->numthreads_throttling); - if (oldValue < 0) { - panic("%s: numthreads negative", __func__); - } - ut->uu_lowpri_window = lowpri_IO_initial_window_msecs; - ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * oldValue; - } else { + if (ut->uu_lowpri_window == 0) + throttle_info_set_initial_window(ut, info, isssd, FALSE); + else { /* The thread sends I/Os to different devices within the same system call */ if (ut->uu_throttle_info != info) { - struct _throttle_io_info_t *old_info = ut->uu_throttle_info; + struct _throttle_io_info_t *old_info = ut->uu_throttle_info; // keep track of the numthreads in the right device OSDecrementAtomic(&old_info->numthreads_throttling); OSIncrementAtomic(&info->numthreads_throttling); - DEBUG_ALLOC_THROTTLE_INFO("switching from info = %p\n", old_info, old_info ); - DEBUG_ALLOC_THROTTLE_INFO("switching to info = %p\n", info, info ); + DEBUG_ALLOC_THROTTLE_INFO("switching from info = %p\n", old_info, old_info ); + DEBUG_ALLOC_THROTTLE_INFO("switching to info = %p\n", info, info ); /* This thread no longer needs a reference on that throttle info */ throttle_info_rel(ut->uu_throttle_info); ut->uu_throttle_info = info; @@ -981,26 +1146,76 @@ void throttle_info_update(void *throttle_info, int flags) ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * numthreads; if (ut->uu_lowpri_window > lowpri_max_window_msecs * numthreads) ut->uu_lowpri_window = lowpri_max_window_msecs * numthreads; + + if (isssd == FALSE) { + /* + * we're here because we've actually issued I/Os to 
different devices... + * if at least one of them was a non SSD, then thottle the thread + * using the policy for non SSDs + */ + ut->uu_throttle_isssd = FALSE; + } } } } +/* + * KPI routine + * + * this is usually called before every I/O, used for throttled I/O + * book keeping. This routine has low overhead and does not sleep + */ +void throttle_info_update(void *throttle_info, int flags) +{ + throttle_info_update_internal(throttle_info, flags, FALSE); +} + +/* + * KPI routine + * + * this is usually called before every I/O, used for throttled I/O + * book keeping. This routine has low overhead and does not sleep + */ +void throttle_info_update_by_mask(void *throttle_info_handle, int flags) +{ + void *throttle_info = throttle_info_handle; + /* for now we only use the lowest bit of the throttle mask, so the + * handle is the same as the throttle_info. Later if we store a + * set of throttle infos in the handle, we will want to loop through + * them and call throttle_info_update in a loop + */ + throttle_info_update(throttle_info, flags); +} + +extern int ignore_is_ssd; + int spec_strategy(struct vnop_strategy_args *ap) { buf_t bp; int bflags; - int policy; + int policy; dev_t bdev; uthread_t ut; - size_t devbsdunit; mount_t mp; + int strategy_ret; + struct _throttle_io_info_t *throttle_info; + boolean_t isssd = FALSE; bp = ap->a_bp; bdev = buf_device(bp); - bflags = buf_flags(bp); mp = buf_vnode(bp)->v_mount; + policy = throttle_get_io_policy(&ut); + + if (policy == IOPOL_THROTTLE) { + bp->b_flags |= B_THROTTLED_IO; + bp->b_flags &= ~B_PASSIVE; + } else if (policy == IOPOL_PASSIVE) + bp->b_flags |= B_PASSIVE; + + bflags = bp->b_flags; + if (kdebug_enable) { int code = 0; @@ -1014,6 +1229,11 @@ spec_strategy(struct vnop_strategy_args *ap) else if (bflags & B_PAGEIO) code |= DKIO_PAGING; + if (bflags & B_THROTTLED_IO) + code |= DKIO_THROTTLE; + else if (bflags & B_PASSIVE) + code |= DKIO_PASSIVE; + KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE, bp, 
bdev, (int)buf_blkno(bp), buf_count(bp), 0); } @@ -1021,29 +1241,63 @@ spec_strategy(struct vnop_strategy_args *ap) mp && (mp->mnt_kern_flag & MNTK_ROOTDEV)) hard_throttle_on_root = 1; + if (mp != NULL) { + if ((mp->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd) + isssd = TRUE; + throttle_info = &_throttle_io_info[mp->mnt_devbsdunit]; + } else + throttle_info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1]; - if (mp != NULL) - devbsdunit = mp->mnt_devbsdunit; - else - devbsdunit = LOWPRI_MAX_NUM_DEV - 1; - - throttle_info_update(&_throttle_io_info[devbsdunit], bflags); - if ((policy = throttle_get_io_policy(&ut)) == IOPOL_THROTTLE) { - bp->b_flags |= B_THROTTLED_IO; - } - + throttle_info_update_internal(throttle_info, bflags, isssd); if ((bflags & B_READ) == 0) { - microuptime(&_throttle_io_info[devbsdunit].last_IO_timestamp); + microuptime(&throttle_info->last_IO_timestamp); if (mp) { INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_write_size); } } else if (mp) { INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_read_size); } - - (*bdevsw[major(bdev)].d_strategy)(bp); + /* + * The BootCache may give us special information about + * the IO, so it returns special values that we check + * for here. + * + * IO_SATISFIED_BY_CACHE + * The read has been satisfied by the boot cache. Don't + * throttle the thread unnecessarily. + * + * IO_SHOULD_BE_THROTTLED + * The boot cache is playing back a playlist and this IO + * cut through. Throttle it so we're not cutting through + * the boot cache too often. + * + * Note that typical strategy routines are defined with + * a void return so we'll get garbage here. In the + * unlikely case the garbage matches our special return + * value, it's not a big deal since we're only adjusting + * the throttling delay. 
+ */ +#define IO_SATISFIED_BY_CACHE ((int)0xcafefeed) +#define IO_SHOULD_BE_THROTTLED ((int)0xcafebeef) + typedef int strategy_fcn_ret_t(struct buf *bp); + strategy_ret = (*(strategy_fcn_ret_t*)bdevsw[major(bdev)].d_strategy)(bp); + + if ((IO_SATISFIED_BY_CACHE == strategy_ret) && (ut->uu_lowpri_window != 0) && (ut->uu_throttle_info != NULL)) { + /* + * If this was a throttled IO satisfied by the boot cache, + * don't delay the thread. + */ + throttle_info_reset_window(ut); + + } else if ((IO_SHOULD_BE_THROTTLED == strategy_ret) && (ut->uu_lowpri_window == 0) && (ut->uu_throttle_info == NULL)) { + /* + * If the boot cache indicates this IO should be throttled, + * delay the thread. + */ + throttle_info_set_initial_window(ut, throttle_info, isssd, TRUE); + } return (0); } @@ -1066,11 +1320,11 @@ spec_close(struct vnop_close_args *ap) { struct vnode *vp = ap->a_vp; dev_t dev = vp->v_rdev; - int (*devclose)(dev_t, int, int, struct proc *); - int mode, error; + int error = 0; int flags = ap->a_fflag; struct proc *p = vfs_context_proc(ap->a_context); struct session *sessp; + int do_rele = 0; switch (vp->v_type) { @@ -1088,38 +1342,56 @@ spec_close(struct vnop_close_args *ap) if (sessp != SESSION_NULL) { if ((vcount(vp) == 1) && (vp == sessp->s_ttyvp)) { + session_lock(sessp); - sessp->s_ttyvp = NULL; - sessp->s_ttyvid = 0; - sessp->s_ttyp = TTY_NULL; - sessp->s_ttypgrpid = NO_PID; + if (vp == sessp->s_ttyvp) { + sessp->s_ttyvp = NULL; + sessp->s_ttyvid = 0; + sessp->s_ttyp = TTY_NULL; + sessp->s_ttypgrpid = NO_PID; + do_rele = 1; + } session_unlock(sessp); - vnode_rele(vp); + + if (do_rele) { + vnode_rele(vp); + } } session_rele(sessp); } - devclose = cdevsw[major(dev)].d_close; - mode = S_IFCHR; + devsw_lock(dev, S_IFCHR); + + vp->v_specinfo->si_opencount--; + + if (vp->v_specinfo->si_opencount < 0) { + panic("Negative open count?"); + } /* * close on last reference or on vnode revoke call */ - if ((flags & IO_REVOKE) != 0) - break; - if (vcount(vp) > 0) + if 
((vcount(vp) > 0) && ((flags & IO_REVOKE) == 0)) { + devsw_unlock(dev, S_IFCHR); return (0); + } + + error = cdevsw[major(dev)].d_close(dev, flags, S_IFCHR, p); + + devsw_unlock(dev, S_IFCHR); break; case VBLK: /* - * Since every use (buffer, vnode, swap, blockmap) - * holds a reference to the vnode, and because we mark - * any other vnodes that alias this device, when the - * sum of the reference counts on all the aliased - * vnodes descends to zero, we are on last close. + * If there is more than one outstanding open, don't + * send the close to the device. */ - if (vcount(vp) > 0) + devsw_lock(dev, S_IFBLK); + if (vcount(vp) > 1) { + vp->v_specinfo->si_opencount--; + devsw_unlock(dev, S_IFBLK); return (0); + } + devsw_unlock(dev, S_IFBLK); /* * On last close of a block device (that isn't mounted) @@ -1133,8 +1405,22 @@ spec_close(struct vnop_close_args *ap) if (error) return (error); - devclose = bdevsw[major(dev)].d_close; - mode = S_IFBLK; + devsw_lock(dev, S_IFBLK); + + vp->v_specinfo->si_opencount--; + + if (vp->v_specinfo->si_opencount < 0) { + panic("Negative open count?"); + } + + if (vcount(vp) > 0) { + devsw_unlock(dev, S_IFBLK); + return (0); + } + + error = bdevsw[major(dev)].d_close(dev, flags, S_IFBLK, p); + + devsw_unlock(dev, S_IFBLK); break; default: @@ -1142,7 +1428,7 @@ spec_close(struct vnop_close_args *ap) return(EBADF); } - return ((*devclose)(dev, flags, mode, p)); + return error; } /* @@ -1234,3 +1520,171 @@ spec_offtoblk(struct vnop_offtoblk_args *ap) return (0); } + +static void filt_specdetach(struct knote *kn); +static int filt_spec(struct knote *kn, long hint); +static unsigned filt_specpeek(struct knote *kn); + +struct filterops spec_filtops = { + .f_isfd = 1, + .f_attach = filt_specattach, + .f_detach = filt_specdetach, + .f_event = filt_spec, + .f_peek = filt_specpeek +}; + +static int +filter_to_seltype(int16_t filter) +{ + switch (filter) { + case EVFILT_READ: + return FREAD; + case EVFILT_WRITE: + return FWRITE; + break; + 
default: + panic("filt_to_seltype(): invalid filter %d\n", filter); + return 0; + } +} + +static int +filt_specattach(struct knote *kn) +{ + vnode_t vp; + dev_t dev; + + vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; /* Already have iocount, and vnode is alive */ + + assert(vnode_ischr(vp)); + + dev = vnode_specrdev(vp); + + if (major(dev) > nchrdev) { + return ENXIO; + } + + if ((cdevsw_flags[major(dev)] & CDEVSW_SELECT_KQUEUE) == 0) { + return EINVAL; + } + + /* Resulting wql is safe to unlink even if it has never been linked */ + kn->kn_hook = wait_queue_link_allocate(); + if (kn->kn_hook == NULL) { + return EAGAIN; + } + + kn->kn_fop = &spec_filtops; + kn->kn_hookid = vnode_vid(vp); + + knote_markstayqueued(kn); + + return 0; +} + +static void +filt_specdetach(struct knote *kn) +{ + kern_return_t ret; + + /* + * Given wait queue link and wait queue set, unlink. This is subtle. + * If the device has been revoked from under us, selclearthread() will + * have removed our link from the kqueue's wait queue set, which + * wait_queue_set_unlink_one() will detect and handle. 
+ */ + ret = wait_queue_set_unlink_one(kn->kn_kq->kq_wqs, kn->kn_hook); + if (ret != KERN_SUCCESS) { + panic("filt_specdetach(): failed to unlink wait queue link."); + } + + (void)wait_queue_link_free(kn->kn_hook); + kn->kn_hook = NULL; + kn->kn_status &= ~KN_STAYQUEUED; +} + +static int +filt_spec(struct knote *kn, long hint) +{ + vnode_t vp; + uthread_t uth; + wait_queue_set_t old_wqs; + vfs_context_t ctx; + int selres; + int error; + int use_offset; + dev_t dev; + uint64_t flags; + + assert(kn->kn_hook != NULL); + + if (hint != 0) { + panic("filt_spec(): nonzero hint?"); + } + + uth = get_bsdthread_info(current_thread()); + ctx = vfs_context_current(); + vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; + + error = vnode_getwithvid(vp, kn->kn_hookid); + if (error != 0) { + kn->kn_flags |= (EV_EOF | EV_ONESHOT); + return 1; + } + + dev = vnode_specrdev(vp); + flags = cdevsw_flags[major(dev)]; + use_offset = ((flags & CDEVSW_USE_OFFSET) != 0); + assert((flags & CDEVSW_SELECT_KQUEUE) != 0); + + /* Trick selrecord() into hooking kqueue's wait queue set into device wait queue */ + old_wqs = uth->uu_wqset; + uth->uu_wqset = kn->kn_kq->kq_wqs; + selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx); + uth->uu_wqset = old_wqs; + + if (use_offset) { + if (kn->kn_fp->f_fglob->fg_offset >= (uint32_t)selres) { + kn->kn_data = 0; + } else { + kn->kn_data = ((uint32_t)selres) - kn->kn_fp->f_fglob->fg_offset; + } + } else { + kn->kn_data = selres; + } + + vnode_put(vp); + + return (kn->kn_data != 0); +} + +static unsigned +filt_specpeek(struct knote *kn) +{ + vnode_t vp; + uthread_t uth; + wait_queue_set_t old_wqs; + vfs_context_t ctx; + int error, selres; + + uth = get_bsdthread_info(current_thread()); + ctx = vfs_context_current(); + vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; + + error = vnode_getwithvid(vp, kn->kn_hookid); + if (error != 0) { + return 1; /* Just like VNOP_SELECT() on recycled vnode */ + } + + /* + * Why pass the link here? 
Because we may not have registered in the past... + */ + old_wqs = uth->uu_wqset; + uth->uu_wqset = kn->kn_kq->kq_wqs; + selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx); + uth->uu_wqset = old_wqs; + + vnode_put(vp); + return selres; +} + diff --git a/bsd/miscfs/specfs/specdev.h b/bsd/miscfs/specfs/specdev.h index dfe9c9945..3394fedbf 100644 --- a/bsd/miscfs/specfs/specdev.h +++ b/bsd/miscfs/specfs/specdev.h @@ -79,6 +79,7 @@ struct specinfo { struct vnode *si_specnext; long si_flags; dev_t si_rdev; + int32_t si_opencount; daddr_t si_size; /* device block size in bytes */ daddr64_t si_lastr; /* last read blkno (read-ahead) */ u_int64_t si_devsize; /* actual device size in bytes */ diff --git a/bsd/miscfs/union/Makefile b/bsd/miscfs/union/Makefile index 72ccd7707..513e6bbb9 100644 --- a/bsd/miscfs/union/Makefile +++ b/bsd/miscfs/union/Makefile @@ -9,14 +9,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ DATAFILES = \ diff --git a/bsd/miscfs/union/union.h b/bsd/miscfs/union/union.h index 5411f538d..eee3e5a87 100644 --- a/bsd/miscfs/union/union.h +++ b/bsd/miscfs/union/union.h @@ -67,156 +67,5 @@ #ifndef __UNION_UNION_H__ #define __UNION_UNION_H__ -#include -#include - -#ifdef __APPLE_API_PRIVATE -struct union_args { - char *target; /* Target of loopback */ - int mntflags; /* Options on the mount */ -}; - -#define UNMNT_ABOVE 0x0001 /* Target appears above mount point */ -#define UNMNT_BELOW 0x0002 /* Target appears below mount point */ -#define UNMNT_REPLACE 0x0003 /* Target replaces mount point */ -#ifdef FAULTFS -#define UNMNT_FAULTIN 0x0004 /* get the files to TOT on lookup */ -#define UNMNT_OPMASK 0x0007 -#else -#define UNMNT_OPMASK 0x0003 -#endif - -#ifdef BSD_KERNEL_PRIVATE - -struct union_mount { - struct vnode *um_uppervp; /* */ - int um_uppervid; /* vid of upper vnode */ - struct vnode *um_lowervp; /* Left 
unlocked */ - int um_lowervid; /* vid of lower vnode */ - kauth_cred_t um_cred; /* Credentials of user calling mount */ - int um_cmode; /* cmask from mount process */ - int um_op; /* Operation mode */ - dev_t um_upperdev; /* Upper root node fsid[0]*/ -}; - - -#define UNION_ABOVE(x) (x->um_op == UNMNT_ABOVE) -#define UNION_LOWER(x) (x->um_op == UNMNT_BELOW) -#define UNION_REPLACE(x) (x->um_op == UNMNT_REPLACE) -#ifdef FAULTFS -#define UNION_FAULTIN(x) (x->um_op == UNMNT_FAULTIN) -#else -#define UNION_FAULTIN(x) (0) - -#endif - -/* LP64 version of union_args. all pointers - * grow when we're dealing with a 64-bit process. - * WARNING - keep in sync with union_args - */ - -struct user_union_args { - user_addr_t target; /* Target of loopback */ - int mntflags; /* Options on the mount */ - char _pad[4]; -}; - -/* - * DEFDIRMODE is the mode bits used to create a shadow directory. - */ -#define VRWXMODE (VREAD|VWRITE|VEXEC) -#define VRWMODE (VREAD|VWRITE) -#define UN_DIRMODE ((VRWXMODE)|(VRWXMODE>>3)|(VRWXMODE>>6)) -#define UN_FILEMODE ((VRWMODE)|(VRWMODE>>3)|(VRWMODE>>6)) - -/* - * A cache of vnode references - */ -struct union_node { - LIST_ENTRY(union_node) un_cache; /* Hash chain */ - struct vnode *un_vnode; /* Back pointer */ - - struct vnode *un_uppervp; /* overlaying object */ - int un_uppervid; /* vid of upper vnode */ - off_t un_uppersz; /* size of upper object */ - - struct vnode *un_lowervp; /* underlying object */ - int un_lowervid; /* vid of upper vnode */ - off_t un_lowersz; /* size of lower object */ - - struct vnode *un_dirvp; /* Parent dir of uppervp */ - struct vnode *un_pvp; /* Parent vnode */ - - char *un_path; /* saved component name */ - int un_hash; /* saved un_path hash value */ - int un_openl; /* # of opens on lowervp */ - int un_exclcnt; /* exclusive count */ - unsigned int un_flags; - mount_t un_mount; - struct vnode **un_dircache; /* cached union stack */ -}; - -#define UN_WANT 0x01 /* union node is needed */ -#define UN_LOCKED 0x02 /* union 
node is locked */ -#define UN_CACHED 0x04 /* In union cache */ -#define UN_TRANSIT 0x08 /* The union node is in creation */ -#define UN_DELETED 0x10 /* The union node is deleted */ -#ifdef FAULTFS -#define UN_FAULTFS 0x80 /* The union node is for faultfs */ -#endif -#define UN_DIRENVN 0x100 /* The union node is created for dir enumeration */ - - -#ifdef FAULTFS -#define UNNODE_FAULTIN(x) ((x->un_flags & UN_FAULTFS)== UN_FAULTFS) -#else -#define UNNODE_FAULTIN(x) (0) -#endif -/* - * Hash table locking flags - */ - -#define UNVP_WANT 0x01 -#define UNVP_LOCKED 0x02 - -#define MOUNTTOUNIONMOUNT(mp) ((struct union_mount *)((mp)->mnt_data)) -#define VTOUNION(vp) ((struct union_node *)(vp)->v_data) -#define UNIONTOV(un) ((un)->un_vnode) -#define LOWERVP(vp) (VTOUNION(vp)->un_lowervp) -#define UPPERVP(vp) (VTOUNION(vp)->un_uppervp) -#define OTHERVP(vp) (UPPERVP(vp) ? UPPERVP(vp) : LOWERVP(vp)) - - -extern int union_allocvp(struct vnode **, struct mount *, - struct vnode *, struct vnode *, - struct componentname *, struct vnode *, - struct vnode *, int); -extern int union_freevp(struct vnode *); -extern struct vnode * union_dircache(struct vnode *, vfs_context_t); -extern int union_copyfile(struct vnode *, struct vnode *,vfs_context_t ); -extern int union_copyup(struct union_node *, int, vfs_context_t ); -extern int union_dowhiteout(struct union_node *, vfs_context_t); -extern int union_mkshadow(struct union_mount *, struct vnode *, - struct componentname *, struct vnode **); -extern int union_mkwhiteout(struct union_mount *, struct vnode *, - struct componentname *, char *); -extern int union_vn_create(struct vnode **, struct union_node *, mode_t mode, vfs_context_t context); -extern int union_cn_close(struct vnode *, int, vfs_context_t context); -extern void union_removed_upper(struct union_node *un); -extern struct vnode *union_lowervp(struct vnode *); -extern void union_newsize(struct vnode *, off_t, off_t); -extern int union_init(struct vfsconf *); -extern void 
union_updatevp(struct union_node *, struct vnode *, struct vnode *); -extern void union_dircache_free(struct union_node *); -extern int (*union_dircheckp)(struct vnode **, struct fileproc *, vfs_context_t); -extern int union_faultin_copyup(struct vnode ** uvpp, vnode_t udvp, vnode_t lvp, struct componentname * cnp, vfs_context_t context); -extern int (**union_vnodeop_p)(void *); -extern struct vfsops union_vfsops; -void union_lock(void); -void union_unlock(void); - -#endif /* BSD_KERNEL_PRIVATE */ - -#endif /* __APPLE_API_PRIVATE */ #endif /* __UNION_UNION_H__ */ diff --git a/bsd/miscfs/union/union_subr.c b/bsd/miscfs/union/union_subr.c deleted file mode 100644 index 34dbe14f3..000000000 --- a/bsd/miscfs/union/union_subr.c +++ /dev/null @@ -1,1604 +0,0 @@ -/* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ -/* - * Copyright (c) 1994 Jan-Simon Pendry - * Copyright (c) 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Jan-Simon Pendry. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)union_subr.c 8.20 (Berkeley) 5/20/95 - */ -/* - * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce - * support for mandatory and extensible security protections. This notice - * is included in support of clause 2.2 (b) of the Apple Public License, - * Version 2.0. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if CONFIG_MACF -#include -#endif - - -static int union_vn_close(struct vnode *vp, int fmode, vfs_context_t ctx); - -/* must be power of two, otherwise change UNION_HASH() */ -#define NHASH 32 - -/* unsigned int ... 
*/ -#define UNION_HASH(u, l) \ - (((((uintptr_t) (u)) + ((uintptr_t) l)) >> 8) & (NHASH-1)) - -static LIST_HEAD(unhead, union_node) unhead[NHASH]; -static int unvplock[NHASH]; - -static lck_grp_t * union_lck_grp; -static lck_grp_attr_t * union_lck_grp_attr; -static lck_attr_t * union_lck_attr; -static lck_mtx_t * union_mtxp; - -static int union_dircheck(struct vnode **, struct fileproc *, vfs_context_t ctx); -static void union_newlower(struct union_node *, struct vnode *); -static void union_newupper(struct union_node *, struct vnode *); - - -int -union_init(__unused struct vfsconf *vfsp) -{ - int i; - - union_lck_grp_attr= lck_grp_attr_alloc_init(); -#if DIAGNOSTIC - lck_grp_attr_setstat(union_lck_grp_attr); -#endif - union_lck_grp = lck_grp_alloc_init("union", union_lck_grp_attr); - union_lck_attr = lck_attr_alloc_init(); -#if DIAGNOSTIC - lck_attr_setdebug(union_lck_attr); -#endif - union_mtxp = lck_mtx_alloc_init(union_lck_grp, union_lck_attr); - - for (i = 0; i < NHASH; i++) - LIST_INIT(&unhead[i]); - bzero((caddr_t) unvplock, sizeof(unvplock)); - /* add the hook for getdirentries */ - union_dircheckp = union_dircheck; - - return (0); -} - -void -union_lock() -{ - lck_mtx_lock(union_mtxp); -} - -void -union_unlock() -{ - lck_mtx_unlock(union_mtxp); -} - - -static int -union_list_lock(int ix) -{ - - if (unvplock[ix] & UNVP_LOCKED) { - unvplock[ix] |= UNVP_WANT; - msleep((caddr_t) &unvplock[ix], union_mtxp, PINOD, "union_list_lock", NULL); - return (1); - } - - unvplock[ix] |= UNVP_LOCKED; - - return (0); -} - -static void -union_list_unlock(int ix) -{ - - unvplock[ix] &= ~UNVP_LOCKED; - - if (unvplock[ix] & UNVP_WANT) { - unvplock[ix] &= ~UNVP_WANT; - wakeup((caddr_t) &unvplock[ix]); - } -} - -/* - * union_updatevp: - * - * The uppervp, if not NULL, must be referenced and not locked by us - * The lowervp, if not NULL, must be referenced. - * - * If uppervp and lowervp match pointers already installed, then - * nothing happens. 
The passed vp's (when matching) are not adjusted. - * - * This routine may only be called by union_newupper() and - * union_newlower(). - */ - -/* always called with union lock held */ -void -union_updatevp(struct union_node *un, struct vnode *uppervp, - struct vnode *lowervp) -{ - int ohash = UNION_HASH(un->un_uppervp, un->un_lowervp); - int nhash = UNION_HASH(uppervp, lowervp); - int docache = (lowervp != NULLVP || uppervp != NULLVP); - int lhash, uhash; - vnode_t freevp; - vnode_t freedirvp; - caddr_t freepath; - - /* - * Ensure locking is ordered from lower to higher - * to avoid deadlocks. - */ - if (nhash < ohash) { - lhash = nhash; - uhash = ohash; - } else { - lhash = ohash; - uhash = nhash; - } - - if (lhash != uhash) { - while (union_list_lock(lhash)) - continue; - } - - while (union_list_lock(uhash)) - continue; - - if (ohash != nhash || !docache) { - if (un->un_flags & UN_CACHED) { - un->un_flags &= ~UN_CACHED; - LIST_REMOVE(un, un_cache); - } - } - - if (ohash != nhash) - union_list_unlock(ohash); - - if (un->un_lowervp != lowervp) { - freevp = freedirvp = NULLVP; - freepath = (caddr_t)0; - if (un->un_lowervp) { - freevp = un->un_lowervp; - un->un_lowervp = lowervp; - if (un->un_path) { - freepath = un->un_path; - un->un_path = 0; - } - if (un->un_dirvp) { - freedirvp = un->un_dirvp; - un->un_dirvp = NULLVP; - } - union_unlock(); - if (freevp) - vnode_put(freevp); - if (freedirvp) - vnode_put(freedirvp); - if (freepath) - _FREE(un->un_path, M_TEMP); - union_lock(); - } else - un->un_lowervp = lowervp; - if (lowervp != NULLVP) - un->un_lowervid = vnode_vid(lowervp); - un->un_lowersz = VNOVAL; - } - - if (un->un_uppervp != uppervp) { - freevp = NULLVP; - if (un->un_uppervp) { - freevp = un->un_uppervp; - } - un->un_uppervp = uppervp; - if (uppervp != NULLVP) - un->un_uppervid = vnode_vid(uppervp); - un->un_uppersz = VNOVAL; - union_unlock(); - if (freevp) - vnode_put(freevp); - union_lock(); - } - - if (docache && (ohash != nhash)) { - 
LIST_INSERT_HEAD(&unhead[nhash], un, un_cache); - un->un_flags |= UN_CACHED; - } - - union_list_unlock(nhash); -} - -/* - * Set a new lowervp. The passed lowervp must be referenced and will be - * stored in the vp in a referenced state. - */ -/* always called with union lock held */ - -static void -union_newlower(un, lowervp) - struct union_node *un; - struct vnode *lowervp; -{ - union_updatevp(un, un->un_uppervp, lowervp); -} - -/* - * Set a new uppervp. The passed uppervp must be locked and will be - * stored in the vp in a locked state. The caller should not unlock - * uppervp. - */ - -/* always called with union lock held */ -static void -union_newupper(un, uppervp) - struct union_node *un; - struct vnode *uppervp; -{ - union_updatevp(un, uppervp, un->un_lowervp); -} - -/* - * Keep track of size changes in the underlying vnodes. - * If the size changes, then callback to the vm layer - * giving priority to the upper layer size. - */ -/* always called with union lock held */ -void -union_newsize(vp, uppersz, lowersz) - struct vnode *vp; - off_t uppersz, lowersz; -{ - struct union_node *un; - off_t sz; - - /* only interested in regular files */ - if (vp->v_type != VREG) - return; - - un = VTOUNION(vp); - sz = VNOVAL; - - if ((uppersz != VNOVAL) && (un->un_uppersz != uppersz)) { - un->un_uppersz = uppersz; - if (sz == VNOVAL) - sz = un->un_uppersz; - } - - if ((lowersz != VNOVAL) && (un->un_lowersz != lowersz)) { - un->un_lowersz = lowersz; - if (sz == VNOVAL) - sz = un->un_lowersz; - } - - if (sz != VNOVAL) { -#ifdef UNION_DIAGNOSTIC - printf("union: %s size now %ld\n", - uppersz != VNOVAL ? "upper" : "lower", (long) sz); -#endif - union_unlock(); - ubc_setsize(vp, sz); - union_lock(); - } -} - -/* - * union_allocvp: allocate a union_node and associate it with a - * parent union_node and one or two vnodes. - * - * vpp Holds the returned vnode locked and referenced if no - * error occurs. - * - * mp Holds the mount point. mp may or may not be busied. 
- * allocvp() makes no changes to mp. - * - * dvp Holds the parent union_node to the one we wish to create. - * XXX may only be used to traverse an uncopied lowervp-based - * tree? XXX - * - * dvp may or may not be locked. allocvp() makes no changes - * to dvp. - * - * upperdvp Holds the parent vnode to uppervp, generally used along - * with path component information to create a shadow of - * lowervp when uppervp does not exist. - * - * upperdvp is referenced but unlocked on entry, and will be - * dereferenced on return. - * - * uppervp Holds the new uppervp vnode to be stored in the - * union_node we are allocating. uppervp is referenced but - * not locked, and will be dereferenced on return. - * - * lowervp Holds the new lowervp vnode to be stored in the - * union_node we are allocating. lowervp is referenced but - * not locked, and will be dereferenced on return. - * - * cnp Holds path component information to be coupled with - * lowervp and upperdvp to allow unionfs to create an uppervp - * later on. Only used if lowervp is valid. The contents - * of cnp is only valid for the duration of the call. - * - * docache Determine whether this node should be entered in the - * cache or whether it should be destroyed as soon as possible. - * - * All union_nodes are maintained on a singly-linked - * list. New nodes are only allocated when they cannot - * be found on this list. Entries on the list are - * removed when the vfs reclaim entry is called. - * - * A single lock is kept for the entire list. This is - * needed because the getnewvnode() function can block - * waiting for a vnode to become free, in which case there - * may be more than one process trying to get the same - * vnode. This lock is only taken if we are going to - * call getnewvnode(), since the kernel itself is single-threaded. - * - * If an entry is found on the list, then call vget() to - * take a reference. 
This is done because there may be - * zero references to it and so it needs to removed from - * the vnode free list. - */ - -/* always called with union lock held */ - -int -union_allocvp(struct vnode **vpp, - struct mount *mp, - struct vnode *undvp, - struct vnode *dvp, - struct componentname *cnp, - struct vnode *uppervp, - struct vnode *lowervp, - int docache) -{ - int error; - struct union_node *un = NULL; - struct union_node *unp; - struct vnode *xlowervp = NULLVP; - struct union_mount *um = MOUNTTOUNIONMOUNT(mp); - int hash = 0; /* protected by docache */ - int markroot; - int try; - struct vnode_fsparam vfsp; - enum vtype vtype; - - if (uppervp == NULLVP && lowervp == NULLVP) - panic("union: unidentifiable allocation"); - - /* - * if both upper and lower vp are provided and are off different type - * consider lowervp as NULL - */ - if (uppervp && lowervp && (uppervp->v_type != lowervp->v_type)) { - xlowervp = lowervp; - lowervp = NULLVP; - } - - /* detect the root vnode (and aliases) */ - markroot = 0; - if ((uppervp == um->um_uppervp) && - ((lowervp == NULLVP) || lowervp == um->um_lowervp)) { - if (lowervp == NULLVP) { - lowervp = um->um_lowervp; - if (lowervp != NULLVP) { - union_unlock(); - vnode_get(lowervp); - union_lock(); - } - } - markroot = VROOT; - } - -loop: - if (!docache) { - un = NULL; - } else for (try = 0; try < 3; try++) { - switch (try) { - case 0: - if (lowervp == NULLVP) - continue; - hash = UNION_HASH(uppervp, lowervp); - break; - - case 1: - if (uppervp == NULLVP) - continue; - hash = UNION_HASH(uppervp, NULLVP); - break; - - case 2: - if (lowervp == NULLVP) - continue; - /* Not sure how this path gets exercised ? 
*/ - hash = UNION_HASH(NULLVP, lowervp); - break; - } - - while (union_list_lock(hash)) - continue; - - for (un = unhead[hash].lh_first; un != 0; - un = un->un_cache.le_next) { - if ((un->un_lowervp == lowervp || - un->un_lowervp == NULLVP) && - (un->un_uppervp == uppervp || - un->un_uppervp == NULLVP) && - (un->un_mount == mp)) { - break; - } - } - - union_list_unlock(hash); - - if (un) - break; - } - - if (un) { - /* - * Obtain a lock on the union_node. - * uppervp is locked, though un->un_uppervp - * may not be. this doesn't break the locking - * hierarchy since in the case that un->un_uppervp - * is not yet locked it will be vnode_put'd and replaced - * with uppervp. - */ - - if (un->un_flags & UN_LOCKED) { - un->un_flags |= UN_WANT; - msleep((caddr_t) &un->un_flags, union_mtxp, PINOD, "union node locked", 0); - goto loop; - } - un->un_flags |= UN_LOCKED; - - union_unlock(); - if (UNIONTOV(un) == NULLVP) - panic("null vnode in union node\n"); - if (vnode_get(UNIONTOV(un))) { - union_lock(); - un->un_flags &= ~UN_LOCKED; - if ((un->un_flags & UN_WANT) == UN_WANT) { - un->un_flags &= ~UN_LOCKED; - wakeup(&un->un_flags); - } - goto loop; - } - union_lock(); - - /* - * At this point, the union_node is locked, - * un->un_uppervp may not be locked, and uppervp - * is locked or nil. - */ - - /* - * Save information about the upper layer. - */ - if (uppervp != un->un_uppervp) { - union_newupper(un, uppervp); - } else if (uppervp) { - union_unlock(); - vnode_put(uppervp); - union_lock(); - } - - /* - * Save information about the lower layer. - * This needs to keep track of pathname - * and directory information which union_vn_create - * might need. 
- */ - if (lowervp != un->un_lowervp) { - union_newlower(un, lowervp); - if (cnp && (lowervp != NULLVP)) { - un->un_hash = cnp->cn_hash; - union_unlock(); - MALLOC(un->un_path, caddr_t, cnp->cn_namelen+1, - M_TEMP, M_WAITOK); - bcopy(cnp->cn_nameptr, un->un_path, - cnp->cn_namelen); - vnode_get(dvp); - union_lock(); - un->un_path[cnp->cn_namelen] = '\0'; - un->un_dirvp = dvp; - } - } else if (lowervp) { - union_unlock(); - vnode_put(lowervp); - union_lock(); - } - *vpp = UNIONTOV(un); - un->un_flags &= ~UN_LOCKED; - if ((un->un_flags & UN_WANT) == UN_WANT) { - un->un_flags &= ~UN_WANT; - wakeup(&un->un_flags); - } - return (0); - } - - if (docache) { - /* - * otherwise lock the vp list while we call getnewvnode - * since that can block. - */ - hash = UNION_HASH(uppervp, lowervp); - - if (union_list_lock(hash)) - goto loop; - } - - union_unlock(); - MALLOC(unp, void *, sizeof(struct union_node), M_TEMP, M_WAITOK); - union_lock(); - - bzero(unp, sizeof(struct union_node)); - un = unp; - un->un_uppervp = uppervp; - if (uppervp != NULLVP) - un->un_uppervid = vnode_vid(uppervp); - un->un_uppersz = VNOVAL; - un->un_lowervp = lowervp; - if (lowervp != NULLVP) - un->un_lowervid = vnode_vid(lowervp); - un->un_lowersz = VNOVAL; - un->un_pvp = undvp; - if (undvp != NULLVP) - vnode_get(undvp); - un->un_dircache = 0; - un->un_openl = 0; - un->un_mount = mp; - un->un_flags = UN_LOCKED; -#ifdef FAULTFS - if (UNION_FAULTIN(um)) - un->un_flags |= UN_FAULTFS; -#endif - - if (docache) { - /* Insert with lock held */ - LIST_INSERT_HEAD(&unhead[hash], un, un_cache); - un->un_flags |= UN_CACHED; - union_list_unlock(hash); - } - - union_unlock(); - - if (uppervp) - vtype = uppervp->v_type; - else - vtype = lowervp->v_type; - - bzero(&vfsp, sizeof(struct vnode_fsparam)); - vfsp.vnfs_mp = mp; - vfsp.vnfs_vtype = vtype; - vfsp.vnfs_str = "unionfs"; - vfsp.vnfs_dvp = undvp; - vfsp.vnfs_fsnode = unp; - vfsp.vnfs_cnp = cnp; - vfsp.vnfs_vops = union_vnodeop_p; - vfsp.vnfs_rdev = 0; - 
vfsp.vnfs_filesize = 0; - vfsp.vnfs_flags = VNFS_NOCACHE | VNFS_CANTCACHE; - vfsp.vnfs_marksystem = 0; - vfsp.vnfs_markroot = markroot; - - error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, vpp); - if (error) { - /* XXXXX Is this right ???? XXXXXXX */ - if (uppervp) { - vnode_put(uppervp); - } - if (lowervp) - vnode_put(lowervp); - - union_lock(); - if (un->un_flags & UN_CACHED) { - un->un_flags &= ~UN_CACHED; - LIST_REMOVE(un, un_cache); - } - if (docache) - union_list_unlock(hash); - - FREE(unp, M_TEMP); - - return (error); - } - - if (cnp && (lowervp != NULLVP)) { - un->un_hash = cnp->cn_hash; - un->un_path = _MALLOC(cnp->cn_namelen+1, M_TEMP, M_WAITOK); - bcopy(cnp->cn_nameptr, un->un_path, cnp->cn_namelen); - un->un_path[cnp->cn_namelen] = '\0'; - vnode_get(dvp); - un->un_dirvp = dvp; - } else { - un->un_hash = 0; - un->un_path = 0; - un->un_dirvp = 0; - } - - if (xlowervp) - vnode_put(xlowervp); - - union_lock(); - - vnode_settag(*vpp, VT_UNION); - un->un_vnode = *vpp; - if (un->un_vnode->v_type == VDIR) { - if (un->un_uppervp == NULLVP) { - panic("faulting fs and no upper vp for dir?"); - } - - } - - - un->un_flags &= ~UN_LOCKED; - if ((un->un_flags & UN_WANT) == UN_WANT) { - un->un_flags &= ~UN_WANT; - wakeup(&un->un_flags); - } - - return(error); - -} - -/* always called with union lock held */ -int -union_freevp(struct vnode *vp) -{ - struct union_node *un = VTOUNION(vp); - - if (un->un_flags & UN_CACHED) { - un->un_flags &= ~UN_CACHED; - LIST_REMOVE(un, un_cache); - } - - union_unlock(); - if (un->un_pvp != NULLVP) - vnode_put(un->un_pvp); - if (un->un_uppervp != NULLVP) - vnode_put(un->un_uppervp); - if (un->un_lowervp != NULLVP) - vnode_put(un->un_lowervp); - if (un->un_dirvp != NULLVP) - vnode_put(un->un_dirvp); - if (un->un_path) - _FREE(un->un_path, M_TEMP); - - FREE(vp->v_data, M_TEMP); - vp->v_data = 0; - union_lock(); - - return (0); -} - -/* - * copyfile. copy the vnode (fvp) to the vnode (tvp) - * using a sequence of reads and writes. 
both (fvp) - * and (tvp) are locked on entry and exit. - */ -/* called with no union lock held */ -int -union_copyfile(struct vnode *fvp, struct vnode *tvp, vfs_context_t context) -{ - char *bufp; - struct uio *auio; - char uio_buf [ UIO_SIZEOF(1) ]; - int error = 0; - - /* - * strategy: - * allocate a buffer of size MAXPHYSIO. - * loop doing reads and writes, keeping track - * of the current uio offset. - * give up at the first sign of trouble. - */ - - auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, - UIO_READ /* will change */, &uio_buf, sizeof(uio_buf)); - - bufp = _MALLOC(MAXPHYSIO, M_TEMP, M_WAITOK); - if (bufp == NULL) { - return ENOMEM; - } - - /* ugly loop follows... */ - do { - off_t offset = uio_offset(auio); - - uio_reset(auio, offset, UIO_SYSSPACE, UIO_READ); - uio_addiov(auio, (uintptr_t)bufp, MAXPHYSIO); - error = VNOP_READ(fvp, auio, 0, context); - - if (error == 0) { - user_ssize_t resid = uio_resid(auio); - - uio_reset(auio, offset, UIO_SYSSPACE, UIO_WRITE); - uio_addiov(auio, (uintptr_t)bufp, MAXPHYSIO - resid); - - if (uio_resid(auio) == 0) - break; - - do { - error = VNOP_WRITE(tvp, auio, 0, context); - } while ((uio_resid(auio) > 0) && (error == 0)); - } - - } while (error == 0); - - _FREE(bufp, M_TEMP); - return (error); -} - -/* - * (un) is assumed to be locked on entry and remains - * locked on exit. 
- */ -/* always called with union lock held */ -int -union_copyup(struct union_node *un, int docopy, vfs_context_t context) -{ - int error; - struct vnode *lvp, *uvp; - struct vnode_attr vattr; - mode_t cmode = 0; - - - lvp = un->un_lowervp; - - union_unlock(); - - if (UNNODE_FAULTIN(un)) { - /* Need to inherit exec mode in faulting fs */ - VATTR_INIT(&vattr); - VATTR_WANTED(&vattr, va_flags); - if (vnode_getattr(lvp, &vattr, context) == 0 ) - cmode = vattr.va_mode; - - } - error = union_vn_create(&uvp, un, cmode, context); - if (error) { - union_lock(); - if (error == EEXIST) { - if (uvp != NULLVP) { - union_newupper(un, uvp); - error = 0; - } - } - return (error); - } - - union_lock(); - /* at this point, uppervp is locked */ - union_newupper(un, uvp); - union_unlock(); - - - if (docopy) { - /* - * XX - should not ignore errors - * from vnop_close - */ - error = VNOP_OPEN(lvp, FREAD, context); - if (error == 0) { - error = union_copyfile(lvp, uvp, context); - (void) VNOP_CLOSE(lvp, FREAD, context); - } -#ifdef UNION_DIAGNOSTIC - if (error == 0) - uprintf("union: copied up %s\n", un->un_path); -#endif - - } - union_vn_close(uvp, FWRITE, context); - - /* - * Subsequent IOs will go to the top layer, so - * call close on the lower vnode and open on the - * upper vnode to ensure that the filesystem keeps - * its references counts right. This doesn't do - * the right thing with (cred) and (FREAD) though. - * Ignoring error returns is not right, either. 
- */ - - /* No need to hold the lock as the union node should be locked for this(it is in faultin mode) */ - if (error == 0) { - int i; - - for (i = 0; i < un->un_openl; i++) { - (void) VNOP_CLOSE(lvp, FREAD, context); - (void) VNOP_OPEN(uvp, FREAD, context); - } - un->un_openl = 0; - } - - union_lock(); - - return (error); - -} - - -int -union_faultin_copyup(struct vnode **vpp, vnode_t udvp, vnode_t lvp, struct componentname * cnp, vfs_context_t context) -{ - int error; - struct vnode *uvp; - struct vnode_attr vattr; - struct vnode_attr *vap; - mode_t cmode = 0; - int fmode = FFLAGS(O_WRONLY|O_CREAT|O_TRUNC|O_EXCL); - struct proc * p = vfs_context_proc(context); - struct componentname cn; - - - vap = &vattr; - VATTR_INIT(vap); - VATTR_WANTED(vap, va_flags); - if (vnode_getattr(lvp, vap, context) == 0 ) - cmode = vattr.va_mode; - - *vpp = NULLVP; - - - if (cmode == (mode_t)0) - cmode = UN_FILEMODE & ~p->p_fd->fd_cmask; - else - cmode = cmode & ~p->p_fd->fd_cmask; - - - /* - * Build a new componentname structure (for the same - * reasons outlines in union_mkshadow()). - * The difference here is that the file is owned by - * the current user, rather than by the person who - * did the mount, since the current user needs to be - * able to write the file (that's why it is being - * copied in the first place). - */ - bzero(&cn, sizeof(struct componentname)); - - cn.cn_namelen = cnp->cn_namelen; - cn.cn_pnbuf = (caddr_t) _MALLOC_ZONE(cn.cn_namelen+1, - M_NAMEI, M_WAITOK); - cn.cn_pnlen = cn.cn_namelen+1; - bcopy(cnp->cn_nameptr, cn.cn_pnbuf, cn.cn_namelen+1); - cn.cn_nameiop = CREATE; - cn.cn_flags = (HASBUF|SAVENAME|SAVESTART|ISLASTCN|UNIONCREATED); - cn.cn_context = context; - cn.cn_nameptr = cn.cn_pnbuf; - cn.cn_hash = 0; - cn.cn_consume = 0; - - /* - * Pass dvp unlocked and referenced on call to relookup(). - * - * If an error occurs, dvp will be returned unlocked and dereferenced. 
- */ - if ((error = relookup(udvp, &uvp, &cn)) != 0) { - goto out; - } - - /* - * If no error occurs, dvp will be returned locked with the reference - * left as before, and vpp will be returned referenced and locked. - */ - if (uvp) { - *vpp = uvp; - error = EEXIST; - goto out; - } - - /* - * Good - there was no race to create the file - * so go ahead and create it. The permissions - * on the file will be 0666 modified by the - * current user's umask. Access to the file, while - * it is unioned, will require access to the top *and* - * bottom files. Access when not unioned will simply - * require access to the top-level file. - * - * TODO: confirm choice of access permissions. - * decide on authorisation behaviour - */ - - VATTR_INIT(vap); - VATTR_SET(vap, va_type, VREG); - VATTR_SET(vap, va_mode, cmode); - - cn.cn_flags |= (UNIONCREATED); - if ((error = vn_create(udvp, &uvp, &cn, vap, 0, context)) != 0) { - goto out; - } - - - if ((error = VNOP_OPEN(uvp, fmode, context)) != 0) { - vn_clearunionwait(uvp, 0); - vnode_recycle(uvp); - vnode_put(uvp); - goto out; - } - - error = vnode_ref_ext(uvp, fmode); - if (error ) { - vn_clearunionwait(uvp, 0); - VNOP_CLOSE(uvp, fmode, context); - vnode_recycle(uvp); - vnode_put(uvp); - goto out; - } - - - /* - * XX - should not ignore errors - * from vnop_close - */ - error = VNOP_OPEN(lvp, FREAD, context); - if (error == 0) { - error = union_copyfile(lvp, uvp, context); - (void) VNOP_CLOSE(lvp, FREAD, context); - } - - VNOP_CLOSE(uvp, fmode, context); - vnode_rele_ext(uvp, fmode, 0); - vn_clearunionwait(uvp, 0); - - *vpp = uvp; -out: - if ((cn.cn_flags & HASBUF) == HASBUF) { - FREE_ZONE(cn.cn_pnbuf, cn.cn_pnlen, M_NAMEI); - cn.cn_flags &= ~HASBUF; - } - return (error); -} - - -/* - * union_relookup: - * - * dvp should be locked on entry and will be locked on return. No - * net change in the ref count will occur. - * - * If an error is returned, *vpp will be invalid, otherwise it - * will hold a locked, referenced vnode. 
If *vpp == dvp then - * remember that only one exclusive lock is held. - */ - -/* No union lock held for this call */ -static int -union_relookup( -#ifdef XXX_HELP_ME - struct union_mount *um, -#else /* !XXX_HELP_ME */ - __unused struct union_mount *um, -#endif /* !XXX_HELP_ME */ - struct vnode *dvp, - struct vnode **vpp, - struct componentname *cnp, - struct componentname *cn, - char *path, - int pathlen) -{ - int error; - - /* - * A new componentname structure must be faked up because - * there is no way to know where the upper level cnp came - * from or what it is being used for. This must duplicate - * some of the work done by NDINIT, some of the work done - * by namei, some of the work done by lookup and some of - * the work done by vnop_lookup when given a CREATE flag. - * Conclusion: Horrible. - */ - cn->cn_namelen = pathlen; - cn->cn_pnbuf = _MALLOC_ZONE(cn->cn_namelen+1, M_NAMEI, M_WAITOK); - cn->cn_pnlen = cn->cn_namelen+1; - bcopy(path, cn->cn_pnbuf, cn->cn_namelen); - cn->cn_pnbuf[cn->cn_namelen] = '\0'; - - cn->cn_nameiop = CREATE; - cn->cn_flags = (HASBUF|SAVENAME|SAVESTART|ISLASTCN ); -#ifdef XXX_HELP_ME - cn->cn_proc = cnp->cn_proc; - if (um->um_op == UNMNT_ABOVE) - cn->cn_cred = cnp->cn_cred; - else - cn->cn_cred = um->um_cred; -#endif - cn->cn_context = cnp->cn_context; /* XXX !UNMNT_ABOVE case ??? */ - cn->cn_nameptr = cn->cn_pnbuf; - cn->cn_hash = 0; - cn->cn_consume = cnp->cn_consume; - - vnode_get(dvp); - error = relookup(dvp, vpp, cn); - vnode_put(dvp); - - return (error); -} - -/* - * Create a shadow directory in the upper layer. - * The new vnode is returned locked. - * - * (um) points to the union mount structure for access to the - * the mounting process's credentials. - * (dvp) is the directory in which to create the shadow directory, - * It is locked (but not ref'd) on entry and return. - * (cnp) is the component name to be created. 
- * (vpp) is the returned newly created shadow directory, which - * is returned locked and ref'd - */ -/* No union lock held for this call */ -int -union_mkshadow(um, dvp, cnp, vpp) - struct union_mount *um; - struct vnode *dvp; - struct componentname *cnp; - struct vnode **vpp; -{ - int error; - struct vnode_attr va; - struct componentname cn; - - bzero(&cn, sizeof(struct componentname)); - - - error = union_relookup(um, dvp, vpp, cnp, &cn, - cnp->cn_nameptr, cnp->cn_namelen); - if (error) - goto out; - - if (*vpp) { - error = EEXIST; - goto out; - } - - /* - * Policy: when creating the shadow directory in the - * upper layer, create it owned by the user who did - * the mount, group from parent directory, and mode - * 777 modified by umask (ie mostly identical to the - * mkdir syscall). (jsp, kb) - */ - - VATTR_INIT(&va); - VATTR_SET(&va, va_type, VDIR); - VATTR_SET(&va, va_mode, um->um_cmode); - - error = vn_create(dvp, vpp, &cn, &va, 0, cnp->cn_context); -out: - if ((cn.cn_flags & HASBUF) == HASBUF) { - FREE_ZONE(cn.cn_pnbuf, cn.cn_pnlen, M_NAMEI); - cn.cn_flags &= ~HASBUF; - } - return (error); -} - -/* - * Create a whiteout entry in the upper layer. - * - * (um) points to the union mount structure for access to the - * the mounting process's credentials. - * (dvp) is the directory in which to create the whiteout. - * it is locked on entry and exit. - * (cnp) is the componentname to be created. 
- */ -/* No union lock held for this call */ -int -union_mkwhiteout(um, dvp, cnp, path) - struct union_mount *um; - struct vnode *dvp; - struct componentname *cnp; - char *path; -{ - int error; - struct vnode *wvp; - struct componentname cn; - - bzero(&cn, sizeof(struct componentname)); - - error = union_relookup(um, dvp, &wvp, cnp, &cn, path, strlen(path)); - if (error) { - goto out; - } - if (wvp) { - error = EEXIST; - goto out; - } - - error = VNOP_WHITEOUT(dvp, &cn, CREATE, cnp->cn_context); - -out: - if ((cn.cn_flags & HASBUF) == HASBUF) { - FREE_ZONE(cn.cn_pnbuf, cn.cn_pnlen, M_NAMEI); - cn.cn_flags &= ~HASBUF; - } - return (error); -} - - -/* - * union_vn_create: creates and opens a new shadow file - * on the upper union layer. This function is similar - * in spirit to calling vn_open() but it avoids calling namei(). - * The problem with calling namei() is that a) it locks too many - * things, and b) it doesn't start at the "right" directory, - * whereas relookup() is told where to start. - * - * On entry, the vnode associated with un is locked. It remains locked - * on return. - * - * If no error occurs, *vpp contains a locked referenced vnode for your - * use. If an error occurs *vpp iis undefined. - */ -/* called with no union lock held */ -int -union_vn_create(struct vnode **vpp, struct union_node *un, mode_t cmode, vfs_context_t context) -{ - struct vnode *vp; - struct vnode_attr vat; - struct vnode_attr *vap = &vat; - int fmode = FFLAGS(O_WRONLY|O_CREAT|O_TRUNC|O_EXCL); - int error; - struct proc * p = vfs_context_proc(context); - struct componentname cn; - - bzero(&cn, sizeof(struct componentname)); - *vpp = NULLVP; - - if (cmode == (mode_t)0) - cmode = UN_FILEMODE & ~p->p_fd->fd_cmask; - else - cmode = cmode & ~p->p_fd->fd_cmask; - - - /* - * Build a new componentname structure (for the same - * reasons outlines in union_mkshadow()). 
- * The difference here is that the file is owned by - * the current user, rather than by the person who - * did the mount, since the current user needs to be - * able to write the file (that's why it is being - * copied in the first place). - */ - cn.cn_namelen = strlen(un->un_path); - cn.cn_pnbuf = (caddr_t) _MALLOC_ZONE(cn.cn_namelen+1, - M_NAMEI, M_WAITOK); - cn.cn_pnlen = cn.cn_namelen+1; - bcopy(un->un_path, cn.cn_pnbuf, cn.cn_namelen+1); - cn.cn_nameiop = CREATE; - if (UNNODE_FAULTIN(un)) - cn.cn_flags = (HASBUF|SAVENAME|SAVESTART|ISLASTCN|UNIONCREATED); - else - cn.cn_flags = (HASBUF|SAVENAME|SAVESTART|ISLASTCN); - cn.cn_context = context; - cn.cn_nameptr = cn.cn_pnbuf; - cn.cn_hash = un->un_hash; - cn.cn_consume = 0; - - /* - * Pass dvp unlocked and referenced on call to relookup(). - * - * If an error occurs, dvp will be returned unlocked and dereferenced. - */ - vnode_get(un->un_dirvp); - if ((error = relookup(un->un_dirvp, &vp, &cn)) != 0) { - vnode_put(un->un_dirvp); - goto out; - } - vnode_put(un->un_dirvp); - - /* - * If no error occurs, dvp will be returned locked with the reference - * left as before, and vpp will be returned referenced and locked. - */ - if (vp) { - *vpp = vp; - error = EEXIST; - goto out; - } - - /* - * Good - there was no race to create the file - * so go ahead and create it. The permissions - * on the file will be 0666 modified by the - * current user's umask. Access to the file, while - * it is unioned, will require access to the top *and* - * bottom files. Access when not unioned will simply - * require access to the top-level file. - * - * TODO: confirm choice of access permissions. 
- * decide on authorisation behaviour - */ - - VATTR_INIT(vap); - VATTR_SET(vap, va_type, VREG); - VATTR_SET(vap, va_mode, cmode); - - if ((error = vn_create(un->un_dirvp, &vp, &cn, vap, 0, context)) != 0) { - goto out; - } - - if ((error = VNOP_OPEN(vp, fmode, context)) != 0) { - vnode_put(vp); - goto out; - } - - vnode_lock(vp); - if (++vp->v_writecount <= 0) - panic("union: v_writecount"); - vnode_unlock(vp); - *vpp = vp; - error = 0; - -out: - if ((cn.cn_flags & HASBUF) == HASBUF) { - FREE_ZONE(cn.cn_pnbuf, cn.cn_pnlen, M_NAMEI); - cn.cn_flags &= ~HASBUF; - } - return(error); -} - -/* called with no union lock held */ -static int -union_vn_close(struct vnode *vp, int fmode, vfs_context_t context) -{ - - if (fmode & FWRITE) { - vnode_lock(vp); - --vp->v_writecount; - vnode_unlock(vp); - } - return (VNOP_CLOSE(vp, fmode, context)); -} - -/* - * union_removed_upper: - * - * An upper-only file/directory has been removed; un-cache it so - * that unionfs vnode gets reclaimed and the last uppervp reference - * disappears. - * - * Called with union_node unlocked. - */ -/* always called with union lock held */ -void -union_removed_upper(un) - struct union_node *un; -{ - union_newupper(un, NULLVP); - if (un->un_flags & UN_CACHED) { - un->un_flags &= ~UN_CACHED; - LIST_REMOVE(un, un_cache); - } - -} - -#if 0 -struct vnode * -union_lowervp(vp) - struct vnode *vp; -{ - struct union_node *un = VTOUNION(vp); - - if ((un->un_lowervp != NULLVP) && - (vp->v_type == un->un_lowervp->v_type)) { - if (vnode_get(un->un_lowervp) == 0) - return (un->un_lowervp); - } - - return (NULLVP); -} -#endif - -/* - * Determine whether a whiteout is needed - * during a remove/rmdir operation. 
- */ -/* called with no union lock held */ -int -union_dowhiteout(struct union_node *un, vfs_context_t ctx) -{ - struct vnode_attr va; - - if (UNNODE_FAULTIN(un)) - return(0); - - if ((un->un_lowervp != NULLVP) ) - return (1); - - VATTR_INIT(&va); - VATTR_WANTED(&va, va_flags); - if (vnode_getattr(un->un_uppervp, &va, ctx) == 0 && - (va.va_flags & OPAQUE)) - return (1); - - return (0); -} - -/* called with no union lock held */ -static void -union_dircache_r(struct vnode *vp, struct vnode ***vppp, int *cntp) -{ - struct union_node *un; - - if (vp->v_op != union_vnodeop_p) { - if (vppp) { - vnode_get(vp); - *(*vppp)++ = vp; - if (--(*cntp) == 0) - panic("union: dircache table too small"); - } else { - (*cntp)++; - } - - return; - } - - un = VTOUNION(vp); - if (un->un_uppervp != NULLVP) - union_dircache_r(un->un_uppervp, vppp, cntp); - if (un->un_lowervp != NULLVP) - union_dircache_r(un->un_lowervp, vppp, cntp); -} - -/* called with no union lock held */ -struct vnode * -union_dircache(struct vnode *vp, __unused vfs_context_t context) -{ - int count; - struct vnode *nvp, *lvp; - struct vnode **vpp; - struct vnode **dircache, **newdircache; - struct union_node *un; - int error; - int alloced = 0; - - union_lock(); - newdircache = NULL; - - nvp = NULLVP; - un = VTOUNION(vp); - - dircache = un->un_dircache; - if (dircache == 0) { - union_unlock(); - count = 0; - union_dircache_r(vp, 0, &count); - count++; -#if 0 - /* too bad; we need Union now! 
*/ -#if MAC_XXX - panic("MAC Framework doesn't support unionfs (yet)\n"); -#endif /* MAC */ -#endif - - dircache = (struct vnode **) - _MALLOC(count * sizeof(struct vnode *), - M_TEMP, M_WAITOK); - if (dircache == NULL) { - goto out; - } - newdircache = dircache; - alloced = 1; - vpp = dircache; - union_dircache_r(vp, &vpp, &count); - *vpp = NULLVP; - vpp = dircache + 1; - union_lock(); - } else { - vpp = dircache; - do { - if (*vpp++ == un->un_uppervp) - break; - } while (*vpp != NULLVP); - } - - lvp = *vpp; - union_unlock(); - if (lvp == NULLVP) { - goto out; - } - - vnode_get(lvp); - union_lock(); - - error = union_allocvp(&nvp, vp->v_mount, NULLVP, NULLVP, 0, lvp, NULLVP, 0); - if (error) { - union_unlock(); - vnode_put(lvp); - goto out; - } - - un->un_dircache = 0; - un = VTOUNION(nvp); -#if 0 - if ((alloced != 0) && (un->un_dircache != 0)) { - union_unlock(); - for (vpp = newdircache; *vpp != NULLVP; vpp++) - vnode_put(*vpp); - _FREE(newdircache, M_TEMP); - newdircache = NULL; - union_lock(); - if (nvp != NULLVP) - union_freevp(nvp); - goto loop; - } -#endif - un->un_dircache = dircache; - un->un_flags |= UN_DIRENVN; - - newdircache = NULL; - union_unlock(); - return (nvp); - -out: - /* - * If we allocated a new dircache and couldn't attach - * it to a new vp, free the resources we allocated. 
- */ - if (newdircache) { - for (vpp = newdircache; *vpp != NULLVP; vpp++) - vnode_put(*vpp); - _FREE(newdircache, M_TEMP); - } - return (NULLVP); -} - -/* - * Module glue to remove #ifdef UNION from vfs_syscalls.c - */ -/* Called with no union lock, the union_dircache takes locks when necessary */ -static int -union_dircheck(struct vnode **vpp, struct fileproc *fp, vfs_context_t ctx) -{ - int error = 0; - vnode_t vp = *vpp; - - if (vp->v_op == union_vnodeop_p) { - struct vnode *lvp; - - lvp = union_dircache(vp, ctx); - if (lvp != NULLVP) { - struct vnode_attr va; - /* - * If the directory is opaque, - * then don't show lower entries - */ - VATTR_INIT(&va); - VATTR_WANTED(&va, va_flags); - error = vnode_getattr(vp, &va, ctx); - if (va.va_flags & OPAQUE) { - vnode_put(lvp); - lvp = NULL; - } - } - - if (lvp != NULLVP) { -#if CONFIG_MACF - error = mac_vnode_check_open(ctx, lvp, FREAD); - if (error) { - vnode_put(lvp); - return(error); - } -#endif /* MAC */ - error = VNOP_OPEN(lvp, FREAD, ctx); - if (error) { - vnode_put(lvp); - return(error); - } - vnode_ref(lvp); - fp->f_fglob->fg_data = (caddr_t) lvp; - fp->f_fglob->fg_offset = 0; - - error = VNOP_CLOSE(vp, FREAD, ctx); - vnode_rele(vp); - vnode_put(vp); - if (error) - return(error); - - *vpp = lvp; - return -1; /* goto unionread */ - } - } - return error; -} - -/* called from inactive with union lock held */ -void -union_dircache_free(struct union_node *un) -{ - struct vnode **vpp; - - vpp = un->un_dircache; - un->un_dircache = NULL; - union_unlock(); - - for (; *vpp != NULLVP; vpp++) - vnode_put(*vpp); - _FREE(un->un_dircache, M_TEMP); - union_lock(); -} - diff --git a/bsd/miscfs/union/union_vfsops.c b/bsd/miscfs/union/union_vfsops.c deleted file mode 100644 index 6924e2f67..000000000 --- a/bsd/miscfs/union/union_vfsops.c +++ /dev/null @@ -1,563 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ -/* - * Copyright (c) 1994, 1995 The Regents of the University of California. - * Copyright (c) 1994, 1995 Jan-Simon Pendry. - * All rights reserved. - * - * This code is derived from software donated to Berkeley by - * Jan-Simon Pendry. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * @(#)union_vfsops.c 8.20 (Berkeley) 5/20/95 - */ - -/* - * Union Layer - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static int union_itercallback(vnode_t, void *); -static int union_root(mount_t, vnode_t *, vfs_context_t); - -/* - * Mount union filesystem - */ -static int -union_mount(mount_t mp, __unused vnode_t devvp, user_addr_t data, vfs_context_t context) -{ - proc_t p = vfs_context_proc(context); - int error = 0; - struct user_union_args args; - struct vnode *lowerrootvp = NULLVP; - struct vnode *upperrootvp = NULLVP; - struct union_mount *um = NULL; - kauth_cred_t cred = NOCRED; - const char *cp = NULL; - char *vcp; - int len; - u_int size; - struct nameidata nd; - -#ifdef UNION_DIAGNOSTIC - printf("union_mount(mp = %x)\n", mp); -#endif - - /* - * Update is a no-op - */ - if (mp->mnt_flag & MNT_UPDATE) { - /* - * Need to provide. - * 1. a way to convert between rdonly and rdwr mounts. - * 2. support for nfs exports. - */ - error = ENOTSUP; - goto bad; - } - - /* - * Get argument - */ - if (vfs_context_is64bit(context)) { - error = copyin(data, (caddr_t)&args, sizeof(args)); - } - else { - struct union_args temp; - error = copyin(data, (caddr_t)&temp, sizeof (temp)); - args.target = CAST_USER_ADDR_T(temp.target); - args.mntflags = temp.mntflags; - } - if (error) - goto bad; - - lowerrootvp = mp->mnt_vnodecovered; - vnode_get(lowerrootvp); - - /* - * Find upper node. - */ - NDINIT(&nd, LOOKUP, FOLLOW|WANTPARENT, - (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32), - args.target, context); - - if ((error = namei(&nd))) - goto bad; - - nameidone(&nd); - upperrootvp = nd.ni_vp; - vnode_put(nd.ni_dvp); - nd.ni_dvp = NULL; - - if (upperrootvp->v_type != VDIR) { - error = EINVAL; - goto bad; - } - - MALLOC(um, struct union_mount *, sizeof(struct union_mount), - M_UFSMNT, M_WAITOK); - - /* - * Keep a held reference to the target vnodes. 
- * They are vnode_put'd in union_unmount. - * - * Depending on the _BELOW flag, the filesystems are - * viewed in a different order. In effect, this is the - * same as providing a mount under option to the mount syscall. - */ - - um->um_op = args.mntflags & UNMNT_OPMASK; - switch (um->um_op) { - case UNMNT_ABOVE: - um->um_lowervp = lowerrootvp; - um->um_uppervp = upperrootvp; - break; - - case UNMNT_BELOW: - um->um_lowervp = upperrootvp; - um->um_uppervp = lowerrootvp; - break; - - case UNMNT_REPLACE: - vnode_put(lowerrootvp); - lowerrootvp = NULLVP; - um->um_uppervp = upperrootvp; - um->um_lowervp = lowerrootvp; - break; - -#ifdef FAULTFS - case UNMNT_FAULTIN: - um->um_lowervp = upperrootvp; - um->um_uppervp = lowerrootvp; - break; -#endif - - default: - error = EINVAL; - goto bad; - } - - if (um->um_lowervp != NULLVP) - um->um_lowervid = vnode_vid(um->um_lowervp); - if (um->um_uppervp != NULLVP) - um->um_uppervid = vnode_vid(um->um_uppervp); - /* - * Unless the mount is readonly, ensure that the top layer - * supports whiteout operations - */ -#ifdef FAULTFS - if ((um->um_op != UNMNT_FAULTIN) && (mp->mnt_flag & MNT_RDONLY) == 0) -#else - if ((mp->mnt_flag & MNT_RDONLY) == 0) -#endif - { - error = VNOP_WHITEOUT(um->um_uppervp, (struct componentname *) 0, - LOOKUP, context); - if (error) - goto bad; - } - - um->um_cred = kauth_cred_get_with_ref(); - um->um_cmode = UN_DIRMODE &~ p->p_fd->fd_cmask; - - /* - * Depending on what you think the MNT_LOCAL flag might mean, - * you may want the && to be || on the conditional below. - * At the moment it has been defined that the filesystem is - * only local if it is all local, ie the MNT_LOCAL flag implies - * that the entire namespace is local. If you think the MNT_LOCAL - * flag implies that some of the files might be stored locally - * then you will want to change the conditional. 
- */ - if (um->um_op == UNMNT_ABOVE) { - if (((um->um_lowervp == NULLVP) || - (um->um_lowervp->v_mount->mnt_flag & MNT_LOCAL)) && - (um->um_uppervp->v_mount->mnt_flag & MNT_LOCAL)) - mp->mnt_flag |= MNT_LOCAL; - } - - /* - * Copy in the upper layer's RDONLY flag. This is for the benefit - * of lookup() which explicitly checks the flag, rather than asking - * the filesystem for it's own opinion. This means, that an update - * mount of the underlying filesystem to go from rdonly to rdwr - * will leave the unioned view as read-only. - */ - mp->mnt_flag |= (um->um_uppervp->v_mount->mnt_flag & MNT_RDONLY); - - mp->mnt_data = (qaddr_t) um; - vfs_getnewfsid(mp); - - - switch (um->um_op) { - case UNMNT_ABOVE: - cp = ":"; - break; - case UNMNT_BELOW: - cp = ":"; - break; - case UNMNT_REPLACE: - cp = ""; - break; -#ifdef FAULTFS - case UNMNT_FAULTIN: - cp = "/FaultingFS/"; - break; -#endif - } - len = strlen(cp); - bcopy(cp, mp->mnt_vfsstat.f_mntfromname, len); - - vcp = mp->mnt_vfsstat.f_mntfromname + len; - len = MNAMELEN - len; - - (void) copyinstr(args.target, vcp, len - 1, (size_t *)&size); - bzero(vcp + size, len - size); - -#ifdef UNION_DIAGNOSTIC - printf("union_mount: from %s, on %s\n", - mp->mnt_vfsstat.f_mntfromname, mp->mnt_vfsstat.f_mntonname); -#endif - return (0); - -bad: - if (um) - _FREE(um, M_UFSMNT); - if (IS_VALID_CRED(cred)) - kauth_cred_unref(&cred); - if (upperrootvp) - vnode_put(upperrootvp); - if (lowerrootvp) - vnode_put(lowerrootvp); - return (error); -} - -/* - * VFS start. Nothing needed here - the start routine - * on the underlying filesystem(s) will have been called - * when that filesystem was mounted. 
- */ -static int -union_start(__unused struct mount *mp, __unused int flags, __unused vfs_context_t context) -{ - - return (0); -} - -static int -union_itercallback(__unused vnode_t vp, void *args) -{ - int num = *(int *)args; - - *(int *)args = num + 1; - return(VNODE_RETURNED); -} - - - -/* - * Free reference to union layer - */ -static int -union_unmount(mount_t mp, int mntflags, vfs_context_t context) -{ - struct union_mount *um = MOUNTTOUNIONMOUNT(mp); - struct vnode *um_rootvp; - int error; - int freeing; - int flags = 0; - -#ifdef UNION_DIAGNOSTIC - printf("union_unmount(mp = %x)\n", mp); -#endif - - if (mntflags & MNT_FORCE) - flags |= FORCECLOSE; - - if ((error = union_root(mp, &um_rootvp, context))) - return (error); - - /* - * Keep flushing vnodes from the mount list. - * This is needed because of the un_pvp held - * reference to the parent vnode. - * If more vnodes have been freed on a given pass, - * the try again. The loop will iterate at most - * (d) times, where (d) is the maximum tree depth - * in the filesystem. - */ - for (freeing = 0; vflush(mp, um_rootvp, flags) != 0;) { - int n = 0; - - vnode_iterate(mp, VNODE_NOLOCK_INTERNAL, union_itercallback, &n); - - /* if this is unchanged then stop */ - if (n == freeing) - break; - - /* otherwise try once more time */ - freeing = n; - } - - /* At this point the root vnode should have a single reference */ - if (vnode_isinuse(um_rootvp, 0)) { - vnode_put(um_rootvp); - return (EBUSY); - } - -#ifdef UNION_DIAGNOSTIC - vprint("union root", um_rootvp); -#endif - /* - * Discard references to upper and lower target vnodes. 
- */ - if (um->um_lowervp) - vnode_put(um->um_lowervp); - vnode_put(um->um_uppervp); - if (IS_VALID_CRED(um->um_cred)) { - kauth_cred_unref(&um->um_cred); - } - /* - * Release reference on underlying root vnode - */ - vnode_put(um_rootvp); - /* - * And blow it away for future re-use - */ - vnode_reclaim(um_rootvp); - /* - * Finally, throw away the union_mount structure - */ - _FREE(mp->mnt_data, M_UFSMNT); /* XXX */ - mp->mnt_data = NULL; - return (0); -} - -static int -union_root(mount_t mp, vnode_t *vpp, __unused vfs_context_t context) -{ - struct union_mount *um = MOUNTTOUNIONMOUNT(mp); - int error; - - /* - * Return locked reference to root. - */ - vnode_get(um->um_uppervp); - if (um->um_lowervp) - vnode_get(um->um_lowervp); - - union_lock(); - error = union_allocvp(vpp, mp, - (struct vnode *) 0, - (struct vnode *) 0, - (struct componentname *) 0, - um->um_uppervp, - um->um_lowervp, - 1); - union_unlock(); - - if (error) { - vnode_put(um->um_uppervp); - if (um->um_lowervp) - vnode_put(um->um_lowervp); - } - - return (error); -} - -static int -union_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t context) -{ - int error; - struct union_mount *um = MOUNTTOUNIONMOUNT(mp); - struct vfs_attr attr; - uint32_t lbsize = 0; - -#ifdef UNION_DIAGNOSTIC - printf("union_vfs_getattr(mp = %x, lvp = %x, uvp = %x)\n", mp, - um->um_lowervp, - um->um_uppervp); -#endif - - /* Get values from lower file system (if any) */ - if (um->um_lowervp) { - VFSATTR_INIT(&attr); - VFSATTR_WANTED(&attr, f_bsize); - VFSATTR_WANTED(&attr, f_blocks); - VFSATTR_WANTED(&attr, f_bused); - VFSATTR_WANTED(&attr, f_files); - error = vfs_getattr(um->um_lowervp->v_mount, &attr, context); - if (error) - return (error); - - /* now copy across the "interesting" information and fake the rest */ - if (VFSATTR_IS_SUPPORTED(&attr, f_bsize)) - lbsize = attr.f_bsize; - else - lbsize = um->um_lowervp->v_mount->mnt_devblocksize; - fsap->f_blocks = VFSATTR_IS_SUPPORTED(&attr, f_blocks) ? 
attr.f_blocks : 0; - fsap->f_bused = VFSATTR_IS_SUPPORTED(&attr, f_bused) ? attr.f_bused : 0; - fsap->f_files = VFSATTR_IS_SUPPORTED(&attr, f_files) ? attr.f_files : 0; - } else { - fsap->f_blocks = 0; - fsap->f_bused = 0; - fsap->f_files = 0; - } - - VFSATTR_INIT(&attr); - VFSATTR_WANTED(&attr, f_bsize); - VFSATTR_WANTED(&attr, f_blocks); - VFSATTR_WANTED(&attr, f_bfree); - VFSATTR_WANTED(&attr, f_bavail); - VFSATTR_WANTED(&attr, f_files); - VFSATTR_WANTED(&attr, f_ffree); - error = vfs_getattr(um->um_uppervp->v_mount, &attr, context); - if (error) - return (error); - - if (VFSATTR_IS_SUPPORTED(&attr, f_bsize)) { - fsap->f_bsize = attr.f_bsize; - VFSATTR_SET_SUPPORTED(fsap, f_bsize); - } - if (VFSATTR_IS_SUPPORTED(&attr, f_iosize)) { - fsap->f_iosize = attr.f_iosize; - VFSATTR_SET_SUPPORTED(fsap, f_iosize); - } - - /* - * if the lower and upper blocksizes differ, then frig the - * block counts so that the sizes reported by df make some - * kind of sense. none of this makes sense though. - */ - if (VFSATTR_IS_SUPPORTED(&attr, f_bsize)) - fsap->f_bsize = attr.f_bsize; - else - fsap->f_bsize = um->um_uppervp->v_mount->mnt_devblocksize; - VFSATTR_RETURN(fsap, f_bsize, attr.f_bsize); - if (fsap->f_bsize != lbsize) - fsap->f_blocks = fsap->f_blocks * lbsize / attr.f_bsize; - - /* - * The "total" fields count total resources in all layers, - * the "free" fields count only those resources which are - * free in the upper layer (since only the upper layer - * is writeable). 
- */ - if (VFSATTR_IS_SUPPORTED(&attr, f_blocks)) - fsap->f_blocks += attr.f_blocks; - if (VFSATTR_IS_SUPPORTED(&attr, f_bfree)) - fsap->f_bfree = attr.f_bfree; - if (VFSATTR_IS_SUPPORTED(&attr, f_bavail)) - fsap->f_bavail = attr.f_bavail; - if (VFSATTR_IS_SUPPORTED(&attr, f_bused)) - fsap->f_bused += attr.f_bused; - if (VFSATTR_IS_SUPPORTED(&attr, f_files)) - fsap->f_files += attr.f_files; - if (VFSATTR_IS_SUPPORTED(&attr, f_ffree)) - fsap->f_ffree = attr.f_ffree; - - VFSATTR_SET_SUPPORTED(fsap, f_bsize); - VFSATTR_SET_SUPPORTED(fsap, f_blocks); - VFSATTR_SET_SUPPORTED(fsap, f_bfree); - VFSATTR_SET_SUPPORTED(fsap, f_bavail); - VFSATTR_SET_SUPPORTED(fsap, f_bused); - VFSATTR_SET_SUPPORTED(fsap, f_files); - VFSATTR_SET_SUPPORTED(fsap, f_ffree); - - return (0); -} - -/* - * XXX - Assumes no data cached at union layer. - */ -#define union_sync (int (*) (mount_t, int, vfs_context_t))nullop - -#define union_fhtovp (int (*) (mount_t, int, unsigned char *, vnode_t *, vfs_context_t))eopnotsupp -#define union_sysctl (int (*) (int *, u_int, user_addr_t, size_t *, user_addr_t, size_t, vfs_context_t))eopnotsupp -#define union_vget (int (*) (mount_t, ino64_t, vnode_t *, vfs_context_t))eopnotsupp -#define union_vptofh (int (*) (vnode_t, int *, unsigned char *, vfs_context_t))eopnotsupp - -struct vfsops union_vfsops = { - union_mount, - union_start, - union_unmount, - union_root, - NULL, /* quotactl */ - union_vfs_getattr, - union_sync, - union_vget, - union_fhtovp, - union_vptofh, - union_init, - union_sysctl, - NULL, - {NULL} -}; - - diff --git a/bsd/miscfs/union/union_vnops.c b/bsd/miscfs/union/union_vnops.c deleted file mode 100644 index ddc374dea..000000000 --- a/bsd/miscfs/union/union_vnops.c +++ /dev/null @@ -1,1726 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ -/* - * Copyright (c) 1992, 1993, 1994, 1995 Jan-Simon Pendry. - * Copyright (c) 1992, 1993, 1994, 1995 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Jan-Simon Pendry. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * @(#)union_vnops.c 8.32 (Berkeley) 6/23/95 - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* called with no union lock held */ -static int -union_lookup1(struct vnode *udvp, struct vnode **dvpp, struct vnode **vpp, - struct componentname *cnp) -{ - int error; - vfs_context_t ctx = cnp->cn_context; - struct vnode *tdvp; - struct vnode *dvp; - struct mount *mp; - - dvp = *dvpp; - - /* - * If stepping up the directory tree, check for going - * back across the mount point, in which case do what - * lookup would do by stepping back down the mount - * hierarchy. - */ - if (cnp->cn_flags & ISDOTDOT) { - while ((dvp != udvp) && (dvp->v_flag & VROOT)) { - /* - * Don't do the NOCROSSMOUNT check - * at this level. By definition, - * union fs deals with namespaces, not - * filesystems. - */ - tdvp = dvp; - *dvpp = dvp = dvp->v_mount->mnt_vnodecovered; - vnode_put(tdvp); - vnode_get(dvp); - } - } - - error = VNOP_LOOKUP(dvp, &tdvp, cnp, ctx); - if (error) - return (error); - - dvp = tdvp; - /* - * Lastly check if the current node is a mount point in - * which case walk up the mount hierarchy making sure not to - * bump into the root of the mount tree (ie. dvp != udvp). 
- */ - while (dvp != udvp && (dvp->v_type == VDIR) && - (mp = dvp->v_mountedhere)) { - if (vfs_busy(mp, LK_NOWAIT)) { - vnode_put(dvp); - return(ENOENT); - } - error = VFS_ROOT(mp, &tdvp, ctx); - vfs_unbusy(mp); - if (error) { - vnode_put(dvp); - return (error); - } - - vnode_put(dvp); - dvp = tdvp; - } - - *vpp = dvp; - return (0); -} - -static int -union_lookup(struct vnop_lookup_args *ap) -/* - struct vnop_lookup_args { - struct vnodeop_desc *a_desc; - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - vfs_context_t a_context; - } *ap) -*/ -{ - int error; - int uerror = 0; - int lerror = 0; - struct vnode *uppervp, *lowervp; - struct vnode *upperdvp, *lowerdvp; - struct vnode *dvp = ap->a_dvp; - struct union_node *dun; - struct componentname *cnp = ap->a_cnp; - vfs_context_t ctx = cnp->cn_context; - int lockparent = cnp->cn_flags & LOCKPARENT; - struct union_mount *um; - kauth_cred_t saved_cred; - int iswhiteout; - struct vnode_attr va; - int isfaultfs = 0; - int upperlookup = 0; - int retry_count = 0; - -#ifdef notyet - if (cnp->cn_namelen == 3 && - cnp->cn_nameptr[2] == '.' && - cnp->cn_nameptr[1] == '.' && - cnp->cn_nameptr[0] == '.') { - dvp = *ap->a_vpp = LOWERVP(ap->a_dvp); - if (dvp == NULLVP) - return (ENOENT); - vnode_get(dvp); - - return (0); - } -#endif - - - -retry: - union_lock(); - um = MOUNTTOUNIONMOUNT(dvp->v_mount); - dun = VTOUNION(dvp); - upperdvp = dun->un_uppervp; - lowerdvp = dun->un_lowervp; - uppervp = NULLVP; - lowervp = NULLVP; - iswhiteout = 0; - - union_unlock(); - - if(UNION_FAULTIN(um)) - isfaultfs = 1; - - if (isfaultfs == 0) - cnp->cn_flags |= LOCKPARENT; - - /* - * do the lookup in the upper level. - * if that level comsumes additional pathnames, - * then assume that something special is going - * on and just return that vnode. 
- */ - if (upperdvp != NULLVP) { - if (lockparent != 0) - cnp->cn_flags &= ~LOCKPARENT; - uerror = union_lookup1(um->um_uppervp, &upperdvp, - &uppervp, cnp); - upperlookup = 1; - - if (cnp->cn_consume != 0) { - *ap->a_vpp = uppervp; - if (!lockparent) - cnp->cn_flags &= ~LOCKPARENT; - else - cnp->cn_flags |= LOCKPARENT; - return (uerror); - } - if (uerror == ENOENT || uerror == EJUSTRETURN) { - if (cnp->cn_flags & ISWHITEOUT) { - iswhiteout = 1; - } else if (lowerdvp != NULLVP) { - VATTR_INIT(&va); - VATTR_WANTED(&va, va_flags); - lerror = vnode_getattr(upperdvp, &va, ap->a_context); - if (lerror == 0 && (va.va_flags & OPAQUE)) - iswhiteout = 1; - } - } - } else { - uerror = ENOENT; - } - - /* - * faultingfs: If upper layer lookup is succesful - * we will return that vp if it is regular file. - * So so skip lower level lookup - */ - - if ((isfaultfs == 1) && (upperlookup == 1) && (uerror == 0) && ((vnode_isreg(uppervp) != 0))) - goto donelowerlookup; - - /* - * in a similar way to the upper layer, do the lookup - * in the lower layer. this time, if there is some - * component magic going on, then vnode_put whatever we got - * back from the upper layer and return the lower vnode - * instead. - */ - if (lowerdvp != NULLVP && !iswhiteout) { - int nameiop; - - /* - * Only do a LOOKUP on the bottom node, since - * we won't be making changes to it anyway. 
- */ - nameiop = cnp->cn_nameiop; - cnp->cn_nameiop = LOOKUP; - if (um->um_op == UNMNT_BELOW) { - /* XXX BOGUS */ - saved_cred = cnp->cn_context->vc_ucred; - cnp->cn_context->vc_ucred = um->um_cred; - if (lockparent != 0) - cnp->cn_flags &= ~LOCKPARENT; - lerror = union_lookup1(um->um_lowervp, &lowerdvp, - &lowervp, cnp); - cnp->cn_context->vc_ucred = saved_cred; - } else { - if (lockparent != 0) - cnp->cn_flags &= ~LOCKPARENT; - lerror = union_lookup1(um->um_lowervp, &lowerdvp, - &lowervp, cnp); - } - cnp->cn_nameiop = nameiop; - - if (cnp->cn_consume != 0) { - if (uppervp != NULLVP) { - vnode_put(uppervp); - uppervp = NULLVP; - } - *ap->a_vpp = lowervp; - if (!lockparent) - cnp->cn_flags &= ~LOCKPARENT; - else - cnp->cn_flags |= LOCKPARENT; - return (lerror); - } - } else { - lerror = ENOENT; - if ((cnp->cn_flags & ISDOTDOT) && dun->un_pvp != NULLVP) { - lowervp = LOWERVP(dun->un_pvp); - if (lowervp != NULLVP) { - lerror = 0; - } - } - } - -donelowerlookup: - - if (!lockparent) - cnp->cn_flags &= ~LOCKPARENT; - - /* - * at this point, we have uerror and lerror indicating - * possible errors with the lookups in the upper and lower - * layers. additionally, uppervp and lowervp are (locked) - * references to existing vnodes in the upper and lower layers. - * - * there are now three cases to consider. - * 1. if both layers returned an error, then return whatever - * error the upper layer generated. - * - * 2. if the top layer failed and the bottom layer succeeded - * then two subcases occur. - * a. the bottom vnode is not a directory, in which - * case just return a new union vnode referencing - * an empty top layer and the existing bottom layer. - * b. the bottom vnode is a directory, in which case - * create a new directory in the top-level and - * continue as in case 3. - * - * 3. if the top layer succeeded then return a new union - * vnode referencing whatever the new top layer and - * whatever the bottom layer returned. 
- */ - - *ap->a_vpp = NULLVP; - - /* case 1. */ - if ((uerror != 0) && (lerror != 0)) { - if (!lockparent) - cnp->cn_flags &= ~LOCKPARENT; - else - cnp->cn_flags |= LOCKPARENT; - return (uerror); - } - - /* case 2. */ - if (uerror != 0 /* && (lerror == 0) */ ) { - if (lowervp->v_type == VDIR) { /* case 2b. */ - /* No need to lock the union here */ - /* if the vnode exists it returns it even if it marks error */ - - uppervp = NULLVP; - - uerror = union_mkshadow(um, upperdvp, cnp, &uppervp); - - if ((uerror == EEXIST)){ - if (uppervp == NULLVP) { - retry_count++; - if (retry_count <= 2) { - if (lowervp != NULLVP) - vnode_put(lowervp); - goto retry; - } - } - uerror = 0; - } - - if (uerror) { - if (uppervp != NULLVP) { - vnode_put(uppervp); - } - if (lowervp != NULLVP) { - vnode_put(lowervp); - } - if (!lockparent) - cnp->cn_flags &= ~LOCKPARENT; - else - cnp->cn_flags |= LOCKPARENT; - return (uerror); - } - } else if ((lowervp->v_type == VREG) && (isfaultfs == 1)) { - error = union_faultin_copyup(&uppervp, upperdvp, lowervp, cnp, ctx); - uerror = 0; - } - } - - - /* if this is faulting filesystem and upper vp exisits skip allocation of union node */ - if ((isfaultfs == 1) && (uerror == 0) && (uppervp != NULLVP) && ((vnode_isreg(uppervp) != 0)|| (vnode_islnk(uppervp) != 0))) { - vn_checkunionwait(uppervp); - *ap->a_vpp = uppervp; - if (lowervp != NULLVP) - vnode_put(lowervp); - if (!lockparent) - cnp->cn_flags &= ~LOCKPARENT; - else - cnp->cn_flags |= LOCKPARENT; - return(0); - } - - union_lock(); - error = union_allocvp(ap->a_vpp, dvp->v_mount, dvp, upperdvp, cnp, - uppervp, lowervp, 1); - union_unlock(); - - if (error) { - if (uppervp != NULLVP) - vnode_put(uppervp); - if (lowervp != NULLVP) - vnode_put(lowervp); - } - - if (!lockparent) - cnp->cn_flags &= ~LOCKPARENT; - else - cnp->cn_flags |= LOCKPARENT; - return (error); -} - -static int -union_create(struct vnop_create_args *ap) -/* - struct vnop_create_args { - struct vnode *a_dvp; - struct vnode **a_vpp; - 
struct componentname *a_cnp; - struct vnode_attr *a_vap; - vfs_context_t a_context; - } *ap; -*/ -{ - struct union_node *un = VTOUNION(ap->a_dvp); - struct vnode *dvp = un->un_uppervp; - struct componentname *cnp = ap->a_cnp; - - if (dvp != NULLVP) { - int error; - struct vnode *vp; - struct mount *mp; - - - mp = ap->a_dvp->v_mount; - - /* note that this is a direct passthrough to the filesystem */ - error = VNOP_CREATE(dvp, &vp, cnp, ap->a_vap, ap->a_context); - if (error) - return (error); - - /* if this is faulting filesystem and is a reg file, skip allocation of union node */ - if (UNNODE_FAULTIN(un) && (vp != NULLVP) && ((vnode_isreg(vp) != 0)|| (vnode_islnk(vp) != 0))) { - *ap->a_vpp = vp; - return(0); - } - - - union_lock(); - error = union_allocvp(ap->a_vpp, mp, NULLVP, NULLVP, cnp, vp, - NULLVP, 1); - union_unlock(); - if (error) - vnode_put(vp); - return (error); - } - - return (EROFS); -} - -static int -union_whiteout(struct vnop_whiteout_args *ap) -/* - struct vnop_whiteout_args { - struct vnode *a_dvp; - struct componentname *a_cnp; - int a_flags; - vfs_context_t a_context; - } *ap; -*/ -{ - struct union_node *un = VTOUNION(ap->a_dvp); - struct componentname *cnp = ap->a_cnp; - int error; - - if (un->un_uppervp == NULLVP) { - return (ENOTSUP); - } - - error = (VNOP_WHITEOUT(un->un_uppervp, cnp, ap->a_flags, ap->a_context)); - return(error); -} - -/* mknod can do fifos, chr, blk or whiteout entries */ -static int -union_mknod(struct vnop_mknod_args *ap) -/* - struct vnop_mknod_args { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - struct vnode_attr *a_vap; - vfs_context_t a_context; - } *ap; -*/ -{ - struct union_node *un = VTOUNION(ap->a_dvp); - struct vnode *dvp = un->un_uppervp; - struct componentname *cnp = ap->a_cnp; - - if (dvp != NULLVP) { - int error; - struct vnode *vp; - struct mount *mp; - - - mp = ap->a_dvp->v_mount; - - /* note that this is a direct passthrough to the filesystem */ - error = VNOP_MKNOD(dvp, 
&vp, cnp, ap->a_vap, ap->a_context); - if (error) - return (error); - - if (vp != NULLVP) { - union_lock(); - error = union_allocvp(ap->a_vpp, mp, NULLVP, NULLVP, - cnp, vp, NULLVP, 1); - union_unlock(); - if (error) - vnode_put(vp); - } - return (error); - } - return (EROFS); -} - -static int -union_open(struct vnop_open_args *ap) -/* - struct vnop_open_args { - struct vnodeop_desc *a_desc; - struct vnode *a_vp; - int a_mode; - vfs_context_t a_context; - } *ap; -*/ -{ - struct union_node *un = VTOUNION(ap->a_vp); - struct vnode *tvp; - int mode = ap->a_mode; - int error; - - /* - * If there is an existing upper vp then simply open that. - */ - - tvp = un->un_uppervp; - if (tvp == NULLVP) { - - /* - * If the lower vnode is being opened for writing, then - * copy the file contents to the upper vnode and open that, - * otherwise can simply open the lower vnode. - */ - tvp = un->un_lowervp; - if ((ap->a_mode & FWRITE) && (tvp->v_type == VREG)) { - /* For above below mounts we need draining.. 
*/ - /* This path is not taken for faultin mode */ - /* LOCK the union node as well **/ - union_lock(); - un->un_flags |= UN_LOCKED; - - error = union_copyup(un, (mode&O_TRUNC) == 0, ap->a_context); - un->un_flags &= ~UN_LOCKED; - if ((un->un_flags & UN_WANT) == UN_WANT) { - un->un_flags &= ~UN_WANT; - wakeup(&un->un_flags); - } - union_unlock(); - if (error == 0) - error = VNOP_OPEN(un->un_uppervp, mode, ap->a_context); - return (error); - } - - /* - * Just open the lower vnode - */ - un->un_openl++; - - error = VNOP_OPEN(tvp, mode, ap->a_context); - - return (error); - } - - error = VNOP_OPEN(tvp, mode, ap->a_context); - - return (error); -} - -static int -union_close(struct vnop_close_args *ap) -/* - struct vnop_close_args { - struct vnode *a_vp; - int a_fflag; - vfs_context_t a_context; - } *ap; -*/ -{ - struct union_node *un = VTOUNION(ap->a_vp); - struct vnode *vp; - int error = 0; - - if ((vp = un->un_uppervp) == NULLVP) { -#ifdef UNION_DIAGNOSTIC - if (un->un_openl <= 0) - panic("union: un_openl cnt"); -#endif - --un->un_openl; - vp = un->un_lowervp; - } - - ap->a_vp = vp; - error = (VCALL(vp, VOFFSET(vnop_close), ap)); - return(error); -} - -/* - * Check access permission on the union vnode. - * The access check being enforced is to check - * against both the underlying vnode, and any - * copied vnode. This ensures that no additional - * file permissions are given away simply because - * the user caused an implicit file copy. 
- */ -static int -union_access(struct vnop_access_args *ap) -/* - struct vnop_access_args { - struct vnodeop_desc *a_desc; - struct vnode *a_vp; - int a_action; - vfs_context_t a_context; - } *ap; -*/ -{ - struct union_node *un = VTOUNION(ap->a_vp); - int error = EACCES; - struct vnode *vp; - - if ((vp = un->un_uppervp) != NULLVP) { - ap->a_vp = vp; - return (VCALL(vp, VOFFSET(vnop_access), ap)); - } - - if ((vp = un->un_lowervp) != NULLVP) { - ap->a_vp = vp; - error = VCALL(vp, VOFFSET(vnop_access), ap); - if (error == 0) { - struct union_mount *um = MOUNTTOUNIONMOUNT(vp->v_mount); - - if (um->um_op == UNMNT_BELOW) { - error = VCALL(vp, VOFFSET(vnop_access), ap); - } - } - if (error) - return (error); - } - - return (error); -} - -/* - * We handle getattr only to change the fsid and - * track object sizes - */ -static int -union_getattr(struct vnop_getattr_args *ap) -/* - struct vnop_getattr_args { - struct vnode *a_vp; - struct vnode_attr *a_vap; - vfs_context_t a_context; - } *ap; -*/ -{ - int error=0; - struct union_node *un = VTOUNION(ap->a_vp); - struct vnode *vp = un->un_uppervp; - struct vnode_attr *vap; - struct vnode_attr va; - - - /* - * Some programs walk the filesystem hierarchy by counting - * links to directories to avoid stat'ing all the time. - * This means the link count on directories needs to be "correct". - * The only way to do that is to call getattr on both layers - * and fix up the link count. The link count will not necessarily - * be accurate but will be large enough to defeat the tree walkers. - */ - - vap = ap->a_vap; - - vp = un->un_uppervp; - if (vp != NULLVP) { - /* - * It's not clear whether vnop_getattr is to be - * called with the vnode locked or not. stat() calls - * it with (vp) locked, and fstat calls it with - * (vp) unlocked. - * In the mean time, compensate here by checking - * the union_node's lock flag. 
- */ - - error = vnode_getattr(vp, vap, ap->a_context); - if (error) { - return (error); - } - union_lock(); - union_newsize(ap->a_vp, vap->va_data_size, VNOVAL); - union_unlock(); - } - - if (vp == NULLVP) { - vp = un->un_lowervp; - } else if (vp->v_type == VDIR) { - vp = un->un_lowervp; - VATTR_INIT(&va); - /* all we want from the lower node is the link count */ - VATTR_WANTED(&va, va_nlink); - vap = &va; - } else { - vp = NULLVP; - } - - if (vp != NULLVP) { - error = vnode_getattr(vp, vap, ap->a_context); - if (error) { - return (error); - } - union_lock(); - union_newsize(ap->a_vp, VNOVAL, vap->va_data_size); - union_unlock(); - } - - if ((vap != ap->a_vap) && (vap->va_type == VDIR)) - ap->a_vap->va_nlink += vap->va_nlink; - - VATTR_RETURN(ap->a_vap, va_fsid, ap->a_vp->v_mount->mnt_vfsstat.f_fsid.val[0]); - return (0); -} - -static int -union_setattr(struct vnop_setattr_args *ap) -/* - struct vnop_setattr_args { - struct vnode *a_vp; - struct vnode_attr *a_vap; - vfs_context_t a_context; - } *ap; -*/ -{ - struct union_node *un = VTOUNION(ap->a_vp); - int error; - - /* - * Handle case of truncating lower object to zero size, - * by creating a zero length upper object. This is to - * handle the case of open with O_TRUNC and O_CREAT. - */ - if (VATTR_IS_ACTIVE(ap->a_vap, va_data_size) && - (un->un_uppervp == NULLVP) && - /* assert(un->un_lowervp != NULLVP) */ - (un->un_lowervp->v_type == VREG)) { - union_lock(); - error = union_copyup(un, (ap->a_vap->va_data_size != 0), ap->a_context); - union_unlock(); - if (error) { - return (error); - } - } - - /* - * Try to set attributes in upper layer, - * otherwise return read-only filesystem error. 
- */ - if (un->un_uppervp != NULLVP) { - error = vnode_setattr(un->un_uppervp, ap->a_vap, ap->a_context); - if ((error == 0) && VATTR_IS_ACTIVE(ap->a_vap, va_data_size)) { - union_lock(); - union_newsize(ap->a_vp, ap->a_vap->va_data_size, VNOVAL); - union_unlock(); - } - } else { - error = EROFS; - } - - return (error); -} - -static int -union_read(struct vnop_read_args *ap) -/* - struct vnop_read_args { - struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - vfs_context_t a_context; - } *ap; -*/ -{ - int error; - struct vnode *vp = OTHERVP(ap->a_vp); - - error = VNOP_READ(vp, ap->a_uio, ap->a_ioflag, ap->a_context); - - /* - * XXX - * perhaps the size of the underlying object has changed under - * our feet. take advantage of the offset information present - * in the uio structure. - */ - if (error == 0) { - struct union_node *un = VTOUNION(ap->a_vp); - off_t cur = ap->a_uio->uio_offset; - - if (vp == un->un_uppervp) { - if (cur > un->un_uppersz) { - union_lock(); - union_newsize(ap->a_vp, cur, VNOVAL); - union_unlock(); - } - } else { - if (cur > un->un_lowersz) { - union_lock(); - union_newsize(ap->a_vp, VNOVAL, cur); - union_unlock(); - } - } - } - - return (error); -} - -static int -union_write(struct vnop_write_args *ap) -/* - struct vnop_write_args { - struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - vfs_context_t a_context; - } *ap; -*/ -{ - int error; - struct vnode *vp; - struct union_node *un = VTOUNION(ap->a_vp); - - vp = UPPERVP(ap->a_vp); - if (vp == NULLVP) - panic("union: missing upper layer in write"); - - error = VNOP_WRITE(vp, ap->a_uio, ap->a_ioflag, ap->a_context); - - /* - * the size of the underlying object may be changed by the - * write. 
- */ - if (error == 0) { - off_t cur = ap->a_uio->uio_offset; - - if (cur > un->un_uppersz) { - union_lock(); - union_newsize(ap->a_vp, cur, VNOVAL); - union_unlock(); - } - } - - return (error); -} - - -static int -union_ioctl(struct vnop_ioctl_args *ap) -/* - struct vnop_ioctl_args { - struct vnode *a_vp; - int a_command; - caddr_t a_data; - int a_fflag; - vfs_context_t a_context; - } *ap; -*/ -{ - register struct vnode *ovp = OTHERVP(ap->a_vp); - - ap->a_vp = ovp; - return (VCALL(ovp, VOFFSET(vnop_ioctl), ap)); -} - -static int -union_select(struct vnop_select_args *ap) -/* - struct vnop_select_args { - struct vnode *a_vp; - int a_which; - int a_fflags; - void * a_wql; - vfs_context_t a_context; - } *ap; -*/ -{ - register struct vnode *ovp = OTHERVP(ap->a_vp); - - ap->a_vp = ovp; - return (VCALL(ovp, VOFFSET(vnop_select), ap)); -} - -static int -union_revoke(struct vnop_revoke_args *ap) -/* - struct vnop_revoke_args { - struct vnode *a_vp; - int a_flags; - vfs_context_t a_context; - } *ap; -*/ -{ - struct vnode *vp = ap->a_vp; - - if (UPPERVP(vp)) - VNOP_REVOKE(UPPERVP(vp), ap->a_flags, ap->a_context); - if (LOWERVP(vp)) - VNOP_REVOKE(LOWERVP(vp), ap->a_flags, ap->a_context); - vnode_reclaim(vp); - - return (0); -} - -static int -union_mmap(struct vnop_mmap_args *ap) -/* - struct vnop_mmap_args { - struct vnode *a_vp; - int a_fflags; - kauth_cred_t a_cred; - struct proc *a_p; - } *ap; -*/ -{ - register struct vnode *ovp = OTHERVP(ap->a_vp); - - ap->a_vp = ovp; - return (VCALL(ovp, VOFFSET(vnop_mmap), ap)); -} - -static int -union_mnomap(struct vnop_mnomap_args *ap) -/* - struct vnop_mnomap_args { - struct vnode *a_vp; - int a_fflags; - kauth_cred_t a_cred; - struct proc *a_p; - } *ap; -*/ -{ - register struct vnode *ovp = OTHERVP(ap->a_vp); - - ap->a_vp = ovp; - return (VCALL(ovp, VOFFSET(vnop_mnomap), ap)); -} - -static int -union_fsync(struct vnop_fsync_args *ap) -/* - struct vnop_fsync_args { - struct vnode *a_vp; - int a_waitfor; - vfs_context_t a_context; - 
} *ap; -*/ -{ - int error = 0; - struct vnode *targetvp = OTHERVP(ap->a_vp); - - if (targetvp != NULLVP) { - - error = VNOP_FSYNC(targetvp, ap->a_waitfor, ap->a_context); - } - - return (error); -} - -static int -union_remove(struct vnop_remove_args *ap) -/* - struct vnop_remove_args { - struct vnode *a_dvp; - struct vnode *a_vp; - struct componentname *a_cnp; - vfs_context_t a_context; - } *ap; -*/ -{ - int error, flags; - struct union_node *dun = VTOUNION(ap->a_dvp); - struct union_node *un = VTOUNION(ap->a_vp); - struct componentname *cnp = ap->a_cnp; - int busydel = 0; - - if (dun->un_uppervp == NULLVP) - panic("union remove: null upper vnode"); - - if (UNNODE_FAULTIN(dun) && ((ap->a_vp != NULLVP) && - ((vnode_isreg(ap->a_vp) != 0) || (vnode_islnk(ap->a_vp) != 0)))) { - return(VNOP_REMOVE(dun->un_uppervp, ap->a_vp, ap->a_cnp, ap->a_flags, ap->a_context)); - } - - if (un->un_uppervp != NULLVP) { - struct vnode *dvp = dun->un_uppervp; - struct vnode *vp = un->un_uppervp; - - flags = ap->a_flags; - if (vnode_isinuse(ap->a_vp, 0)) - busydel = 1; - if ((flags & VNODE_REMOVE_NODELETEBUSY) && (busydel != 0)) { - return(EBUSY); - } - if (union_dowhiteout(un, cnp->cn_context)) - cnp->cn_flags |= DOWHITEOUT; - - if (busydel != 0) { - union_lock(); - un->un_flags |= UN_DELETED; - if (un->un_flags & UN_CACHED) { - un->un_flags &= ~UN_CACHED; - LIST_REMOVE(un, un_cache); - } - union_unlock(); - vnode_ref(vp); - } - error = VNOP_REMOVE(dvp, vp, cnp, 0, ap->a_context); - if (!error) { - union_lock(); - if (busydel == 0) - union_removed_upper(un); - union_unlock(); - } - } else { - if (UNNODE_FAULTIN(un)) - panic("faultfs: No uppervp"); - error = union_mkwhiteout( - MOUNTTOUNIONMOUNT(UNIONTOV(dun)->v_mount), - dun->un_uppervp, ap->a_cnp, un->un_path); - } - - return (error); -} - -static int -union_link(struct vnop_link_args *ap) -/* - struct vnop_link_args { - struct vnode *a_vp; - struct vnode *a_tdvp; - struct componentname *a_cnp; - vfs_context_t a_context; - } *ap; -*/ -{ 
- int error = 0; - struct componentname *cnp = ap->a_cnp; - struct union_node *un; - struct vnode *vp; - struct vnode *tdvp; - - un = VTOUNION(ap->a_tdvp); - - if (ap->a_tdvp->v_op != ap->a_vp->v_op) { - vp = ap->a_vp; - } else { - struct union_node *tun = VTOUNION(ap->a_vp); - if (tun->un_uppervp == NULLVP) { - if (UNNODE_FAULTIN(tun)) - panic("faultfs: No uppervp"); - if (un->un_uppervp == tun->un_dirvp) { - } - union_lock(); - /* Would need to drain for above,below mount and faulin does not enter this path */ - un->un_flags |= UN_LOCKED; - error = union_copyup(tun, 1, ap->a_context); - un->un_flags &= ~UN_LOCKED; - if ((un->un_flags & UN_WANT) == UN_WANT) { - un->un_flags &= ~UN_WANT; - wakeup(&un->un_flags); - } - union_unlock(); - } - vp = tun->un_uppervp; - } - tdvp = un->un_uppervp; - if (tdvp == NULLVP) - error = EROFS; - - if (error) { - return (error); - } - - - error = (VNOP_LINK(vp, tdvp, cnp, ap->a_context)); - return(error); -} - -static int -union_rename(struct vnop_rename_args *ap) -/* - struct vnop_rename_args { - struct vnode *a_fdvp; - struct vnode *a_fvp; - struct componentname *a_fcnp; - struct vnode *a_tdvp; - struct vnode *a_tvp; - struct componentname *a_tcnp; - vfs_context_t a_context; - } *ap; -*/ -{ - int error; - - struct vnode *fdvp = ap->a_fdvp; - struct vnode *fvp = ap->a_fvp; - struct vnode *tdvp = ap->a_tdvp; - struct vnode *tvp = ap->a_tvp; - - - if (fdvp->v_op == union_vnodeop_p) { /* always true */ - struct union_node *un = VTOUNION(fdvp); - if (un->un_uppervp == NULLVP) { - if (UNNODE_FAULTIN(un)) - panic("faultfs rename: No uppervp"); - /* - * this should never happen in normal - * operation but might if there was - * a problem creating the top-level shadow - * directory. 
- */ - error = EXDEV; - goto bad; - } - - fdvp = un->un_uppervp; - } - - if (fvp->v_op == union_vnodeop_p) { /* always true */ - struct union_node *un = VTOUNION(fvp); - if (un->un_uppervp == NULLVP) { - if (UNNODE_FAULTIN(un)) - panic("faultfs rename: No uppervp"); - /* XXX: should do a copyup */ - error = EXDEV; - goto bad; - } - - if (un->un_lowervp != NULLVP) - ap->a_fcnp->cn_flags |= DOWHITEOUT; - - fvp = un->un_uppervp; - } - - if (tdvp->v_op == union_vnodeop_p) { - struct union_node *un = VTOUNION(tdvp); - if (un->un_uppervp == NULLVP) { - /* - * this should never happen in normal - * operation but might if there was - * a problem creating the top-level shadow - * directory. - */ - if (UNNODE_FAULTIN(un)) - panic("faultfs rename: No uppervp"); - error = EXDEV; - goto bad; - } - - tdvp = un->un_uppervp; - } - - if (tvp != NULLVP && tvp->v_op == union_vnodeop_p) { - struct union_node *un = VTOUNION(tvp); - - tvp = un->un_uppervp; - } - - return (VNOP_RENAME(fdvp, fvp, ap->a_fcnp, tdvp, tvp, ap->a_tcnp, ap->a_context)); - -bad: - return (error); -} - -static int -union_mkdir(struct vnop_mkdir_args *ap) -/* - struct vnop_mkdir_args { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - struct vnode_attr *a_vap; - vfs_context_t a_context; - } *ap; -*/ -{ - struct union_node *un = VTOUNION(ap->a_dvp); - struct vnode *dvp = un->un_uppervp; - struct componentname *cnp = ap->a_cnp; - - if (dvp != NULLVP) { - int error; - struct vnode *vp; - - - /* note that this is a direct fallthrough to the filesystem */ - error = VNOP_MKDIR(dvp, &vp, cnp, ap->a_vap, ap->a_context); - if (error) - return (error); - - union_lock(); - error = union_allocvp(ap->a_vpp, ap->a_dvp->v_mount, ap->a_dvp, - NULLVP, cnp, vp, NULLVP, 1); - union_unlock(); - if (error) - vnode_put(vp); - return (error); - } - return (EROFS); -} - -static int -union_rmdir(struct vnop_rmdir_args *ap) -/* - struct vnop_rmdir_args { - struct vnode *a_dvp; - struct vnode *a_vp; - struct 
componentname *a_cnp; - vfs_context_t a_context; - } *ap; -*/ -{ - int error; - struct union_node *dun = VTOUNION(ap->a_dvp); - struct union_node *un = VTOUNION(ap->a_vp); - struct componentname *cnp = ap->a_cnp; - int busydel = 0; - - /******* NODE HAS TO BE LOCKED ******/ - if (dun->un_uppervp == NULLVP) - panic("union rmdir: null upper vnode"); - - if (un->un_uppervp != NULLVP) { - struct vnode *dvp = dun->un_uppervp; - struct vnode *vp = un->un_uppervp; - - if (vnode_isinuse(ap->a_vp, 0)) { - busydel = 1; - union_lock(); - un->un_flags |= UN_DELETED; - if (un->un_flags & UN_CACHED) { - un->un_flags &= ~UN_CACHED; - LIST_REMOVE(un, un_cache); - } - union_unlock(); - vnode_ref(vp); - } - - - if (union_dowhiteout(un, cnp->cn_context)) - cnp->cn_flags |= DOWHITEOUT; - error = VNOP_RMDIR(dvp, vp, ap->a_cnp, ap->a_context); - if (!error) { - union_lock(); - if (busydel == 0) - union_removed_upper(un); - union_unlock(); - } - } else { - if (UNNODE_FAULTIN(un)) - panic("faultfs: No uppervp"); - error = union_mkwhiteout( - MOUNTTOUNIONMOUNT(UNIONTOV(dun)->v_mount), - dun->un_uppervp, ap->a_cnp, un->un_path); - } - return (error); -} - -static int -union_symlink(struct vnop_symlink_args *ap) -/* - struct vnop_symlink_args { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - struct vnode_attr *a_vap; - char *a_target; - vfs_context_t a_context; - } *ap; -*/ -{ - struct union_node *un = VTOUNION(ap->a_dvp); - struct vnode *dvp = un->un_uppervp; - struct componentname *cnp = ap->a_cnp; - - if (dvp != NULLVP) { - int error; - struct vnode *vp; - - error = VNOP_SYMLINK(dvp, &vp, cnp, ap->a_vap, ap->a_target, ap->a_context); - *ap->a_vpp = vp; - return (error); - } - return (EROFS); -} - -/* - * union_readdir works in concert with getdirentries and - * readdir(3) to provide a list of entries in the unioned - * directories. getdirentries is responsible for walking - * down the union stack. 
readdir(3) is responsible for - * eliminating duplicate names from the returned data stream. - */ -static int -union_readdir(struct vnop_readdir_args *ap) -/* - struct vnop_readdir_args { - struct vnodeop_desc *a_desc; - struct vnode *a_vp; - struct uio *a_uio; - int a_flags; - int *a_eofflag; - int *a_numdirent; - vfs_context_t a_context; - } *ap; -*/ -{ - struct union_node *un = VTOUNION(ap->a_vp); - struct vnode *uvp = un->un_uppervp; - - if (ap->a_flags & (VNODE_READDIR_EXTENDED | VNODE_READDIR_REQSEEKOFF)) - return (EINVAL); - - if (uvp == NULLVP) - return (0); - - ap->a_vp = uvp; - return (VCALL(uvp, VOFFSET(vnop_readdir), ap)); -} - -static int -union_readlink(struct vnop_readlink_args *ap) -/* - struct vnop_readlink_args { - struct vnode *a_vp; - struct uio *a_uio; - vfs_context_t a_context; - } *ap; -*/ -{ - int error; - struct vnode *vp = OTHERVP(ap->a_vp); - - ap->a_vp = vp; - error = VCALL(vp, VOFFSET(vnop_readlink), ap); - - return (error); -} - -static int -union_inactive(struct vnop_inactive_args *ap) -/* - struct vnop_inactive_args { - struct vnode *a_vp; - vfs_context_t a_context; - } *ap; -*/ -{ - struct vnode *vp = ap->a_vp; - struct union_node *un = VTOUNION(vp); - - /* - * Do nothing (and _don't_ bypass). - * Wait to vnode_put lowervp until reclaim, - * so that until then our union_node is in the - * cache and reusable. - * - * NEEDSWORK: Someday, consider inactive'ing - * the lowervp and then trying to reactivate it - * with capabilities (v_id) - * like they do in the name lookup cache code. - * That's too much work for now. 
- */ - - union_lock(); - if (un->un_flags & UN_DELETED) { - if(un->un_uppervp != NULLVP) { - vnode_rele(un->un_uppervp); - } - union_removed_upper(un); - } - - if (un->un_dircache != 0) { - union_dircache_free(un); - } - if (un->un_flags & UN_DIRENVN) { - vnode_recycle(vp); - } - - union_unlock(); - - return (0); -} - -static int -union_reclaim(struct vnop_reclaim_args *ap) -/* - struct vnop_reclaim_args { - struct vnode *a_vp; - vfs_context_t a_context; - } *ap; -*/ -{ - - union_lock(); - union_freevp(ap->a_vp); - union_unlock(); - - return (0); -} - -static int -union_blockmap(struct vnop_blockmap_args *ap) -/* - struct vnop_blockmap_args { - struct vnode *a_vp; - off_t a_offset; - size_t a_size; - daddr64_t *a_bpn; - size_t *a_run; - void *a_poff; - int a_flags; - } *ap; -*/ -{ - int error; - struct vnode *vp = OTHERVP(ap->a_vp); - - ap->a_vp = vp; - error = VCALL(vp, VOFFSET(vnop_blockmap), ap); - - return (error); -} - -static int -union_pathconf(struct vnop_pathconf_args *ap) -/* - struct vnop_pathconf_args { - struct vnode *a_vp; - int a_name; - int *a_retval; - vfs_context_t a_context; - } *ap; -*/ -{ - int error; - struct vnode *vp = OTHERVP(ap->a_vp); - - ap->a_vp = vp; - error = VCALL(vp, VOFFSET(vnop_pathconf), ap); - - return (error); -} - -static int -union_advlock(struct vnop_advlock_args *ap) -/* - struct vnop_advlock_args { - struct vnode *a_vp; - caddr_t a_id; - int a_op; - struct flock *a_fl; - int a_flags; - vfs_context_t a_context; - } *ap; -*/ -{ - register struct vnode *ovp = OTHERVP(ap->a_vp); - - ap->a_vp = ovp; - return (VCALL(ovp, VOFFSET(vnop_advlock), ap)); -} - - -/* - * XXX - vnop_strategy must be hand coded because it has no - * vnode in its arguments. - * This goes away with a merged VM/buffer cache. 
- */ -static int -union_strategy(struct vnop_strategy_args *ap) -/* - struct vnop_strategy_args { - struct buf *a_bp; - } *ap; -*/ -{ - struct buf *bp = ap->a_bp; - int error; - struct vnode *savedvp; - - savedvp = buf_vnode(bp); - buf_setvnode(bp, OTHERVP(savedvp)); - -#if DIAGNOSTIC - if (buf_vnode(bp) == NULLVP) - panic("union_strategy: nil vp"); - if (((buf_flags(bp) & B_READ) == 0) && - (buf_vnode(bp) == LOWERVP(savedvp))) - panic("union_strategy: writing to lowervp"); -#endif - - error = VNOP_STRATEGY(bp); - buf_setvnode(bp, savedvp); - - return (error); -} - -/* Pagein */ -static int -union_pagein(struct vnop_pagein_args *ap) -/* - struct vnop_pagein_args { - struct vnode *a_vp, - upl_t a_pl, - upl_offset_t a_pl_offset, - off_t a_f_offset, - size_t a_size, - int a_flags - vfs_context_t a_context; - } *ap; -*/ -{ - int error; - struct vnode *vp = OTHERVP(ap->a_vp); - - error = VNOP_PAGEIN(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset, - ap->a_size, ap->a_flags, ap->a_context); - - /* - * XXX - * perhaps the size of the underlying object has changed under - * our feet. take advantage of the offset information present - * in the uio structure. 
- */ - if (error == 0) { - struct union_node *un = VTOUNION(ap->a_vp); - off_t cur = ap->a_f_offset + (off_t)ap->a_pl_offset; - - if (vp == un->un_uppervp) { - if (cur > un->un_uppersz) { - union_lock(); - union_newsize(ap->a_vp, cur, VNOVAL); - union_unlock(); - } - } else { - if (cur > un->un_lowersz) { - union_lock(); - union_newsize(ap->a_vp, VNOVAL, cur); - union_unlock(); - } - } - } - - return (error); -} - -/* Pageout */ -static int -union_pageout(struct vnop_pageout_args *ap) -/* - struct vnop_pageout_args { - struct vnode *a_vp, - upl_t a_pl, - vm_offset_t a_pl_offset, - off_t a_f_offset, - size_t a_size, - int a_flags - vfs_context_t a_context; - } *ap; -*/ -{ - int error; - struct vnode *vp; - struct union_node *un = VTOUNION(ap->a_vp); - - vp = UPPERVP(ap->a_vp); - if (vp == NULLVP) - panic("union: missing upper layer in pageout"); - - error = VNOP_PAGEOUT(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset, - ap->a_size, ap->a_flags, ap->a_context); - - /* - * the size of the underlying object may be changed by the - * write. 
- */ - if (error == 0) { - off_t cur = ap->a_f_offset + (off_t)ap->a_pl_offset; - - if (cur > un->un_uppersz) { - union_lock(); - union_newsize(ap->a_vp, cur, VNOVAL); - union_unlock(); - } - } - - return (error); -} - -/* Blktooff derives file offset for the given logical block number */ -static int -union_blktooff(struct vnop_blktooff_args *ap) -/* - struct vnop_blktooff_args { - struct vnode *a_vp; - daddr64_t a_lblkno; - off_t *a_offset; - } *ap; -*/ -{ - int error; - struct vnode *vp = OTHERVP(ap->a_vp); - - error = VNOP_BLKTOOFF(vp, ap->a_lblkno, ap->a_offset); - - return(error); -} - -/* offtoblk derives file offset for the given logical block number */ -static int -union_offtoblk(struct vnop_offtoblk_args *ap) -/* - struct vnop_offtoblk_args { - struct vnode *a_vp; - off_t a_offset; - daddr64_t *a_lblkno; - } *ap; -*/ -{ - int error; - struct vnode *vp = OTHERVP(ap->a_vp); - - error = VNOP_OFFTOBLK(vp, ap->a_offset, ap->a_lblkno); - - return(error); -} - -#define VOPFUNC int (*)(void *) - -/* - * Global vfs data structures - */ -int (**union_vnodeop_p)(void *); -struct vnodeopv_entry_desc union_vnodeop_entries[] = { - { &vnop_default_desc, (VOPFUNC)vn_default_error }, - { &vnop_lookup_desc, (VOPFUNC)union_lookup }, /* lookup */ - { &vnop_create_desc, (VOPFUNC)union_create }, /* create */ - { &vnop_whiteout_desc, (VOPFUNC)union_whiteout }, /* whiteout */ - { &vnop_mknod_desc, (VOPFUNC)union_mknod }, /* mknod */ - { &vnop_open_desc, (VOPFUNC)union_open }, /* open */ - { &vnop_close_desc, (VOPFUNC)union_close }, /* close */ - { &vnop_access_desc, (VOPFUNC)union_access }, /* access */ - { &vnop_getattr_desc, (VOPFUNC)union_getattr }, /* getattr */ - { &vnop_setattr_desc, (VOPFUNC)union_setattr }, /* setattr */ - { &vnop_read_desc, (VOPFUNC)union_read }, /* read */ - { &vnop_write_desc, (VOPFUNC)union_write }, /* write */ - { &vnop_ioctl_desc, (VOPFUNC)union_ioctl }, /* ioctl */ - { &vnop_select_desc, (VOPFUNC)union_select }, /* select */ - { &vnop_revoke_desc, 
(VOPFUNC)union_revoke }, /* revoke */ - { &vnop_mmap_desc, (VOPFUNC)union_mmap }, /* mmap */ - { &vnop_mnomap_desc, (VOPFUNC)union_mnomap }, /* mnomap */ - { &vnop_fsync_desc, (VOPFUNC)union_fsync }, /* fsync */ - { &vnop_remove_desc, (VOPFUNC)union_remove }, /* remove */ - { &vnop_link_desc, (VOPFUNC)union_link }, /* link */ - { &vnop_rename_desc, (VOPFUNC)union_rename }, /* rename */ - { &vnop_mkdir_desc, (VOPFUNC)union_mkdir }, /* mkdir */ - { &vnop_rmdir_desc, (VOPFUNC)union_rmdir }, /* rmdir */ - { &vnop_symlink_desc, (VOPFUNC)union_symlink }, /* symlink */ - { &vnop_readdir_desc, (VOPFUNC)union_readdir }, /* readdir */ - { &vnop_readlink_desc, (VOPFUNC)union_readlink }, /* readlink */ - { &vnop_inactive_desc, (VOPFUNC)union_inactive }, /* inactive */ - { &vnop_reclaim_desc, (VOPFUNC)union_reclaim }, /* reclaim */ - { &vnop_strategy_desc, (VOPFUNC)union_strategy }, /* strategy */ - { &vnop_pathconf_desc, (VOPFUNC)union_pathconf }, /* pathconf */ - { &vnop_advlock_desc, (VOPFUNC)union_advlock }, /* advlock */ -#ifdef notdef - { &vnop_bwrite_desc, (VOPFUNC)union_bwrite }, /* bwrite */ -#endif - { &vnop_pagein_desc, (VOPFUNC)union_pagein }, /* Pagein */ - { &vnop_pageout_desc, (VOPFUNC)union_pageout }, /* Pageout */ - { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */ - { &vnop_blktooff_desc, (VOPFUNC)union_blktooff }, /* blktooff */ - { &vnop_offtoblk_desc, (VOPFUNC)union_offtoblk }, /* offtoblk */ - { &vnop_blockmap_desc, (VOPFUNC)union_blockmap }, /* blockmap */ - { (struct vnodeop_desc*)NULL, (int(*)())NULL } -}; -struct vnodeopv_desc union_vnodeop_opv_desc = - { &union_vnodeop_p, union_vnodeop_entries }; diff --git a/bsd/net/Makefile b/bsd/net/Makefile index 2f77f154b..79c622bf8 100644 --- a/bsd/net/Makefile +++ b/bsd/net/Makefile @@ -9,14 +9,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ DATAFILES= \ @@ -32,14 +28,15 @@ 
KERNELFILES= \ if_ether.h init.h radix.h PRIVATE_DATAFILES = \ - if_atm.h if_vlan_var.h if_ppp.h firewire.h \ + if_vlan_var.h if_ppp.h firewire.h \ ppp_defs.h radix.h if_bond_var.h lacp.h ndrv_var.h \ - raw_cb.h etherdefs.h iso88025.h if_pflog.h pfvar.h \ - if_bridgevar.h + netsrc.h raw_cb.h etherdefs.h iso88025.h if_pflog.h pfvar.h \ + if_bridgevar.h ntstat.h if_llreach.h PRIVATE_KERNELFILES = ${KERNELFILES} \ bpfdesc.h dlil_pvt.h ppp_comp.h \ - zlib.h bpf_compat.h net_osdep.h + zlib.h bpf_compat.h net_osdep.h \ + ntstat.h if_llreach.h INSTALL_MI_LIST = ${DATAFILES} diff --git a/bsd/net/bpf.c b/bsd/net/bpf.c index d9ec5b137..e370dfc5e 100644 --- a/bsd/net/bpf.c +++ b/bsd/net/bpf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -119,6 +119,7 @@ #include #include +#include #if CONFIG_MACF_NET #include @@ -147,13 +148,13 @@ static caddr_t bpf_alloc(); * The default read buffer size is patchable. 
*/ static unsigned int bpf_bufsize = BPF_BUFSIZE; -SYSCTL_INT(_debug, OID_AUTO, bpf_bufsize, CTLFLAG_RW, +SYSCTL_INT(_debug, OID_AUTO, bpf_bufsize, CTLFLAG_RW | CTLFLAG_LOCKED, &bpf_bufsize, 0, ""); -static unsigned int bpf_maxbufsize = BPF_MAXBUFSIZE; -SYSCTL_INT(_debug, OID_AUTO, bpf_maxbufsize, CTLFLAG_RW, +__private_extern__ unsigned int bpf_maxbufsize = BPF_MAXBUFSIZE; +SYSCTL_INT(_debug, OID_AUTO, bpf_maxbufsize, CTLFLAG_RW | CTLFLAG_LOCKED, &bpf_maxbufsize, 0, ""); static unsigned int bpf_maxdevices = 256; -SYSCTL_UINT(_debug, OID_AUTO, bpf_maxdevices, CTLFLAG_RW, +SYSCTL_UINT(_debug, OID_AUTO, bpf_maxdevices, CTLFLAG_RW | CTLFLAG_LOCKED, &bpf_maxdevices, 0, ""); /* @@ -196,6 +197,7 @@ static void bpf_mcopy(const void *, void *, size_t); static int bpf_movein(struct uio *, int, struct mbuf **, struct sockaddr *, int *); static int bpf_setif(struct bpf_d *, ifnet_t ifp, u_int32_t dlt); +static void bpf_timed_out(void *, void *); static void bpf_wakeup(struct bpf_d *); static void catchpacket(struct bpf_d *, u_char *, u_int, u_int, void (*)(const void *, void *, size_t)); @@ -216,26 +218,26 @@ static int bpf_tap_callback(struct ifnet *ifp, struct mbuf *m); * Darwin differs from BSD here, the following are static * on BSD and not static on Darwin. 
*/ - d_open_t bpfopen; - d_close_t bpfclose; - d_read_t bpfread; - d_write_t bpfwrite; - ioctl_fcn_t bpfioctl; - select_fcn_t bpfpoll; + d_open_t bpfopen; + d_close_t bpfclose; + d_read_t bpfread; + d_write_t bpfwrite; + ioctl_fcn_t bpfioctl; + select_fcn_t bpfselect; /* Darwin's cdevsw struct differs slightly from BSDs */ #define CDEV_MAJOR 23 static struct cdevsw bpf_cdevsw = { - /* open */ bpfopen, - /* close */ bpfclose, - /* read */ bpfread, - /* write */ bpfwrite, - /* ioctl */ bpfioctl, + /* open */ bpfopen, + /* close */ bpfclose, + /* read */ bpfread, + /* write */ bpfwrite, + /* ioctl */ bpfioctl, /* stop */ eno_stop, /* reset */ eno_reset, /* tty */ NULL, - /* select */ bpfpoll, + /* select */ bpfselect, /* mmap */ eno_mmap, /* strategy*/ eno_strat, /* getc */ eno_getc, @@ -314,6 +316,11 @@ bpf_movein(struct uio *uio, int linktype, struct mbuf **mp, struct sockaddr *soc sa_family = AF_IEEE80211; hlen = 0; break; + + case DLT_IEEE802_11_RADIO: + sa_family = AF_IEEE80211; + hlen = 0; + break; default: return (EIO); @@ -367,6 +374,7 @@ bpf_movein(struct uio *uio, int linktype, struct mbuf **mp, struct sockaddr *soc m->m_pkthdr.len = m->m_len = len; m->m_pkthdr.rcvif = NULL; *mp = m; + /* * Make room for link header. */ @@ -383,8 +391,25 @@ bpf_movein(struct uio *uio, int linktype, struct mbuf **mp, struct sockaddr *soc goto bad; } error = UIOMOVE(mtod(m, caddr_t), len - hlen, UIO_WRITE, uio); - if (!error) - return (0); + if (error) + goto bad; + + /* Check for multicast destination */ + switch (linktype) { + case DLT_EN10MB: { + struct ether_header *eh = mtod(m, struct ether_header *); + + if (ETHER_IS_MULTICAST(eh->ether_dhost)) { + if (_ether_cmp(etherbroadcastaddr, eh->ether_dhost) == 0) + m->m_flags |= M_BCAST; + else + m->m_flags |= M_MCAST; + } + break; + } + } + + return 0; bad: m_freem(m); return (error); @@ -550,6 +575,59 @@ bpf_detachd(struct bpf_d *d) } +/* + * Start asynchronous timer, if necessary. + * Must be called with bpf_mlock held. 
+ */ +static void +bpf_start_timer(struct bpf_d *d) +{ + uint64_t deadline; + struct timeval tv; + + if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) { + tv.tv_sec = d->bd_rtout / hz; + tv.tv_usec = (d->bd_rtout % hz) * tick; + + clock_interval_to_deadline((uint64_t)tv.tv_sec * USEC_PER_SEC + tv.tv_usec, + NSEC_PER_USEC, + &deadline); + /* + * The state is BPF_IDLE, so the timer hasn't + * been started yet, and hasn't gone off yet; + * there is no thread call scheduled, so this + * won't change the schedule. + * + * XXX - what if, by the time it gets entered, + * the deadline has already passed? + */ + thread_call_enter_delayed(d->bd_thread_call, deadline); + d->bd_state = BPF_WAITING; + } +} + +/* + * Cancel asynchronous timer. + * Must be called with bpf_mlock held. + */ +static boolean_t +bpf_stop_timer(struct bpf_d *d) +{ + /* + * If the timer has already gone off, this does nothing. + * Our caller is expected to set d->bd_state to BPF_IDLE, + * with the bpf_mlock, after we are called. bpf_timed_out() + * also grabs bpf_mlock, so, if the timer has gone off and + * bpf_timed_out() hasn't finished, it's waiting for the + * lock; when this thread releases the lock, it will + * find the state is BPF_IDLE, and just release the + * lock and return. + */ + return (thread_call_cancel(d->bd_thread_call)); +} + + + /* * Open ethernet device. Returns ENXIO for illegal minor device number, * EBUSY if file is open by another process. 
@@ -612,6 +690,16 @@ bpfopen(dev_t dev, int flags, __unused int fmt, d->bd_sig = SIGIO; d->bd_seesent = 1; d->bd_oflags = flags; + d->bd_state = BPF_IDLE; + d->bd_thread_call = thread_call_allocate(bpf_timed_out, d); + + if (d->bd_thread_call == NULL) { + printf("bpfopen: malloc thread call failed\n"); + bpf_dtab[minor(dev)] = NULL; + lck_mtx_unlock(bpf_mlock); + _FREE(d, M_DEVBUF); + return ENOMEM; + } #if CONFIG_MACF_NET mac_bpfdesc_label_init(d); mac_bpfdesc_label_associate(kauth_cred_get(), d); @@ -643,12 +731,67 @@ bpfclose(dev_t dev, __unused int flags, __unused int fmt, } bpf_dtab[minor(dev)] = (void *)1; /* Mark closing */ + /* + * Deal with any in-progress timeouts. + */ + switch (d->bd_state) { + case BPF_IDLE: + /* + * Not waiting for a timeout, and no timeout happened. + */ + break; + + case BPF_WAITING: + /* + * Waiting for a timeout. + * Cancel any timer that has yet to go off, + * and mark the state as "closing". + * Then drop the lock to allow any timers that + * *have* gone off to run to completion, and wait + * for them to finish. + */ + if (!bpf_stop_timer(d)) { + /* + * There was no pending call, so the call must + * have been in progress. Wait for the call to + * complete; we have to drop the lock while + * waiting. to let the in-progrss call complete + */ + d->bd_state = BPF_DRAINING; + while (d->bd_state == BPF_DRAINING) + msleep((caddr_t)d, bpf_mlock, PRINET, + "bpfdraining", NULL); + } + d->bd_state = BPF_IDLE; + break; + + case BPF_TIMED_OUT: + /* + * Timer went off, and the timeout routine finished. + */ + d->bd_state = BPF_IDLE; + break; + + case BPF_DRAINING: + /* + * Another thread is blocked on a close waiting for + * a timeout to finish. + * This "shouldn't happen", as the first thread to enter + * bpfclose() will set bpf_dtab[minor(dev)] to 1, and + * all subsequent threads should see that and fail with + * ENXIO. 
+ */ + panic("Two threads blocked in a BPF close"); + break; + } + if (d->bd_bif) bpf_detachd(d); selthreadclear(&d->bd_sel); #if CONFIG_MACF_NET mac_bpfdesc_label_destroy(d); #endif + thread_call_free(d->bd_thread_call); bpf_freed(d); /* Mark free in same context as bpfopen comes to check */ @@ -666,15 +809,12 @@ bpfclose(dev_t dev, __unused int flags, __unused int fmt, static int bpf_sleep(struct bpf_d *d, int pri, const char *wmesg, int timo) { - int st; + u_int64_t abstime = 0; - lck_mtx_unlock(bpf_mlock); - - st = tsleep((caddr_t)d, pri, wmesg, timo); - - lck_mtx_lock(bpf_mlock); + if(timo) + clock_interval_to_deadline(timo, NSEC_PER_SEC / hz, &abstime); - return st; + return msleep1((caddr_t)d, bpf_mlock, pri, wmesg, abstime); } /* @@ -695,6 +835,7 @@ int bpfread(dev_t dev, struct uio *uio, int ioflag) { struct bpf_d *d; + int timed_out; int error; lck_mtx_lock(bpf_mlock); @@ -705,7 +846,6 @@ bpfread(dev_t dev, struct uio *uio, int ioflag) return (ENXIO); } - /* * Restrict application to use a buffer the same size as * as kernel buffers. @@ -714,6 +854,12 @@ bpfread(dev_t dev, struct uio *uio, int ioflag) lck_mtx_unlock(bpf_mlock); return (EINVAL); } + + if (d->bd_state == BPF_WAITING) + bpf_stop_timer(d); + + timed_out = (d->bd_state == BPF_TIMED_OUT); + d->bd_state = BPF_IDLE; /* * If the hold buffer is empty, then do a timed sleep, which @@ -721,9 +867,14 @@ bpfread(dev_t dev, struct uio *uio, int ioflag) * have arrived to fill the store buffer. */ while (d->bd_hbuf == 0) { - if (d->bd_immediate && d->bd_slen != 0) { + if ((d->bd_immediate || timed_out || (ioflag & IO_NDELAY)) + && d->bd_slen != 0) { /* - * A packet(s) either arrived since the previous + * We're in immediate mode, or are reading + * in non-blocking mode, or a timer was + * started before the read (e.g., by select() + * or poll()) and has expired and a packet(s) + * either arrived since the previous * read or arrived while we were asleep. * Rotate the buffers and return what's here. 
*/ @@ -806,6 +957,10 @@ bpfread(dev_t dev, struct uio *uio, int ioflag) static void bpf_wakeup(struct bpf_d *d) { + if (d->bd_state == BPF_WAITING) { + bpf_stop_timer(d); + d->bd_state = BPF_IDLE; + } wakeup((caddr_t)d); if (d->bd_async && d->bd_sig && d->bd_sigio) pgsigio(d->bd_sigio, d->bd_sig); @@ -826,6 +981,36 @@ bpf_wakeup(struct bpf_d *d) #endif } + +static void +bpf_timed_out(void *arg, __unused void *dummy) +{ + struct bpf_d *d = (struct bpf_d *)arg; + + lck_mtx_lock(bpf_mlock); + if (d->bd_state == BPF_WAITING) { + /* + * There's a select or kqueue waiting for this; if there's + * now stuff to read, wake it up. + */ + d->bd_state = BPF_TIMED_OUT; + if (d->bd_slen != 0) + bpf_wakeup(d); + } else if (d->bd_state == BPF_DRAINING) { + /* + * A close is waiting for this to finish. + * Mark it as finished, and wake the close up. + */ + d->bd_state = BPF_IDLE; + bpf_wakeup(d); + } + lck_mtx_unlock(bpf_mlock); +} + + + + + /* keep in sync with bpf_movein above: */ #define MAX_DATALINK_HDR_LEN (sizeof(struct firewire_header)) @@ -838,6 +1023,8 @@ bpfwrite(dev_t dev, struct uio *uio, __unused int ioflag) int error; char dst_buf[SOCKADDR_HDR_LEN + MAX_DATALINK_HDR_LEN]; int datlen = 0; + int bif_dlt; + int bd_hdrcmplt; lck_mtx_lock(bpf_mlock); @@ -853,31 +1040,56 @@ bpfwrite(dev_t dev, struct uio *uio, __unused int ioflag) ifp = d->bd_bif->bif_ifp; + if ((ifp->if_flags & IFF_UP) == 0) { + lck_mtx_unlock(bpf_mlock); + return (ENETDOWN); + } if (uio_resid(uio) == 0) { lck_mtx_unlock(bpf_mlock); return (0); } ((struct sockaddr *)dst_buf)->sa_len = sizeof(dst_buf); - error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, &m, - d->bd_hdrcmplt ? 
NULL : (struct sockaddr *)dst_buf, - &datlen); - if (error) { - lck_mtx_unlock(bpf_mlock); + + /* + * fix for PR-6849527 + * geting variables onto stack before dropping lock for bpf_movein() + */ + bif_dlt = (int)d->bd_bif->bif_dlt; + bd_hdrcmplt = d->bd_hdrcmplt; + + /* bpf_movein allocating mbufs; drop lock */ + lck_mtx_unlock(bpf_mlock); + + error = bpf_movein(uio, bif_dlt, &m, + bd_hdrcmplt ? NULL : (struct sockaddr *)dst_buf, + &datlen); + + if (error) { return (error); } - if ((unsigned)datlen > ifp->if_mtu) { + /* taking the lock again and verifying whether device is open */ + lck_mtx_lock(bpf_mlock); + d = bpf_dtab[minor(dev)]; + if (d == 0 || d == (void *)1) { lck_mtx_unlock(bpf_mlock); m_freem(m); - return (EMSGSIZE); + return (ENXIO); } - - if ((error = ifp_use(ifp, kIfNetUseCount_MustNotBeZero)) != 0) { + + if (d->bd_bif == NULL) { + lck_mtx_unlock(bpf_mlock); + m_free(m); + return (ENXIO); + } + + if ((unsigned)datlen > ifp->if_mtu) { lck_mtx_unlock(bpf_mlock); m_freem(m); - return (error); + return (EMSGSIZE); } + #if CONFIG_MACF_NET mac_mbuf_label_associate_bpfdesc(d, m); #endif @@ -892,10 +1104,7 @@ bpfwrite(dev_t dev, struct uio *uio, __unused int ioflag) else { error = dlil_output(ifp, PF_INET, m, NULL, (struct sockaddr *)dst_buf, 0); } - - if (ifp_unuse(ifp) != 0) - ifp_use_reached_zero(ifp); - + /* * The driver frees the mbuf. */ @@ -956,6 +1165,10 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags, return (ENXIO); } + if (d->bd_state == BPF_WAITING) + bpf_stop_timer(d); + d->bd_state = BPF_IDLE; + switch (cmd) { default: @@ -1124,34 +1337,60 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags, /* * Set read timeout. 
*/ - case BIOCSRTIMEOUT: - { - struct BPF_TIMEVAL *_tv = (struct BPF_TIMEVAL *)addr; + case BIOCSRTIMEOUT32: + { + struct user32_timeval *_tv = (struct user32_timeval *)addr; struct timeval tv; tv.tv_sec = _tv->tv_sec; tv.tv_usec = _tv->tv_usec; - /* + /* * Subtract 1 tick from tvtohz() since this isn't * a one-shot timer. */ if ((error = itimerfix(&tv)) == 0) d->bd_rtout = tvtohz(&tv) - 1; break; - } + } - /* + case BIOCSRTIMEOUT64: + { + struct user64_timeval *_tv = (struct user64_timeval *)addr; + struct timeval tv; + + tv.tv_sec = _tv->tv_sec; + tv.tv_usec = _tv->tv_usec; + + /* + * Subtract 1 tick from tvtohz() since this isn't + * a one-shot timer. + */ + if ((error = itimerfix(&tv)) == 0) + d->bd_rtout = tvtohz(&tv) - 1; + break; + } + + /* * Get read timeout. */ - case BIOCGRTIMEOUT: + case BIOCGRTIMEOUT32: { - struct BPF_TIMEVAL *tv = (struct BPF_TIMEVAL *)addr; + struct user32_timeval *tv = (struct user32_timeval *)addr; tv->tv_sec = d->bd_rtout / hz; tv->tv_usec = (d->bd_rtout % hz) * tick; break; - } + } + + case BIOCGRTIMEOUT64: + { + struct user64_timeval *tv = (struct user64_timeval *)addr; + + tv->tv_sec = d->bd_rtout / hz; + tv->tv_usec = (d->bd_rtout % hz) * tick; + break; + } /* * Get packet stats. @@ -1320,14 +1559,10 @@ bpf_setif(struct bpf_d *d, ifnet_t theywant, u_int32_t dlt) continue; /* * We found the requested interface. - * If it's not up, return an error. * Allocate the packet buffers if we need to. * If we're already attached to requested interface, * just flush the buffer. */ - if ((ifp->if_flags & IFF_UP) == 0) - return (ENETDOWN); - if (d->bd_sbuf == 0) { error = bpf_allocbufs(d); if (error != 0) @@ -1441,10 +1676,10 @@ bpf_setdlt(struct bpf_d *d, uint32_t dlt) * Otherwise, return false but make a note that a selwakeup() must be done. 
*/ int -bpfpoll(dev_t dev, int events, void * wql, struct proc *p) +bpfselect(dev_t dev, int which, void * wql, struct proc *p) { struct bpf_d *d; - int revents = 0; + int ret = 0; lck_mtx_lock(bpf_mlock); @@ -1454,25 +1689,38 @@ bpfpoll(dev_t dev, int events, void * wql, struct proc *p) return (ENXIO); } - /* - * An imitation of the FIONREAD ioctl code. - */ if (d->bd_bif == NULL) { lck_mtx_unlock(bpf_mlock); return (ENXIO); } - if (events & (POLLIN | POLLRDNORM)) { - if (d->bd_hlen != 0 || (d->bd_immediate && d->bd_slen != 0)) - revents |= events & (POLLIN | POLLRDNORM); - else - selrecord(p, &d->bd_sel, wql); + switch (which) { + case FREAD: + if (d->bd_hlen != 0 || + ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) && + d->bd_slen != 0)) + ret = 1; /* read has data to return */ + else { + /* + * Read has no data to return. + * Make the select wait, and start a timer if + * necessary. + */ + selrecord(p, &d->bd_sel, wql); + bpf_start_timer(d); + } + break; + + case FWRITE: + ret = 1; /* can't determine whether a write would block */ + break; } lck_mtx_unlock(bpf_mlock); - return (revents); + return (ret); } + /* * Support for kevent() system call. Register EVFILT_READ filters and * reject all others. @@ -1511,9 +1759,6 @@ bpfkqfilter(dev_t dev, struct knote *kn) return (ENXIO); } - /* - * An imitation of the FIONREAD ioctl code. - */ if (d->bd_bif == NULL) { lck_mtx_unlock(bpf_mlock); return (ENXIO); @@ -1546,13 +1791,52 @@ filt_bpfread(struct knote *kn, long hint) lck_mtx_lock(bpf_mlock); if (d->bd_immediate) { + /* + * If there's data in the hold buffer, it's the + * amount of data a read will return. + * + * If there's no data in the hold buffer, but + * there's data in the store buffer, a read will + * immediately rotate the store buffer to the + * hold buffer, the amount of data in the store + * buffer is the amount of data a read will + * return. + * + * If there's no data in either buffer, we're not + * ready to read. 
+ */ kn->kn_data = (d->bd_hlen == 0 ? d->bd_slen : d->bd_hlen); - ready = (kn->kn_data >= ((kn->kn_sfflags & NOTE_LOWAT) ? - kn->kn_sdata : 1)); + int64_t lowwat = 1; + if (kn->kn_sfflags & NOTE_LOWAT) + { + if (kn->kn_sdata > d->bd_bufsize) + lowwat = d->bd_bufsize; + else if (kn->kn_sdata > lowwat) + lowwat = kn->kn_sdata; + } + ready = (kn->kn_data >= lowwat); } else { - kn->kn_data = d->bd_hlen; + /* + * If there's data in the hold buffer, it's the + * amount of data a read will return. + * + * If there's no data in the hold buffer, but + * there's data in the store buffer, if the + * timer has expired a read will immediately + * rotate the store buffer to the hold buffer, + * so the amount of data in the store buffer is + * the amount of data a read will return. + * + * If there's no data in either buffer, or there's + * no data in the hold buffer and the timer hasn't + * expired, we're not ready to read. + */ + kn->kn_data = (d->bd_hlen == 0 && d->bd_state == BPF_TIMED_OUT ? + d->bd_slen : d->bd_hlen); ready = (kn->kn_data > 0); } + if (!ready) + bpf_start_timer(d); if (hint == 0) lck_mtx_unlock(bpf_mlock); @@ -1721,6 +2005,7 @@ catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen, struct bpf_hdr *hp; int totlen, curlen; int hdrlen = d->bd_bif->bif_hdrlen; + int do_wakeup = 0; /* * Figure out how many bytes to move. If the packet is * greater or equal to the snapshot length, transfer that @@ -1741,7 +2026,7 @@ catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen, * Rotate the buffers if we can, then wakeup any * pending reads. */ - if (d->bd_fbuf == 0) { + if (d->bd_fbuf == NULL) { /* * We haven't completed the previous read yet, * so drop the packet. 
@@ -1750,15 +2035,16 @@ catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen, return; } ROTATE_BUFFERS(d); - bpf_wakeup(d); + do_wakeup = 1; curlen = 0; } - else if (d->bd_immediate) + else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) /* - * Immediate mode is set. A packet arrived so any - * reads should be woken up. + * Immediate mode is set, or the read timeout has + * already expired during a select call. A packet + * arrived, so the reader should be woken up. */ - bpf_wakeup(d); + do_wakeup = 1; /* * Append the bpf header. @@ -1775,6 +2061,9 @@ catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen, */ (*cpfn)(pkt, (u_char *)hp + hdrlen, (hp->bh_caplen = totlen - hdrlen)); d->bd_slen = curlen + totlen; + + if (do_wakeup) + bpf_wakeup(d); } /* diff --git a/bsd/net/bpf.h b/bsd/net/bpf.h index a07cfe28e..92a5f31a0 100644 --- a/bsd/net/bpf.h +++ b/bsd/net/bpf.h @@ -172,8 +172,16 @@ struct bpf_version { #define BIOCGDLT _IOR('B',106, u_int) #define BIOCGETIF _IOR('B',107, struct ifreq) #define BIOCSETIF _IOW('B',108, struct ifreq) -#define BIOCSRTIMEOUT _IOW('B',109, struct BPF_TIMEVAL) -#define BIOCGRTIMEOUT _IOR('B',110, struct BPF_TIMEVAL) +#define BIOCSRTIMEOUT _IOW('B',109, struct timeval) +#ifdef KERNEL_PRIVATE +#define BIOCSRTIMEOUT64 _IOW('B',109, struct user64_timeval) +#define BIOCSRTIMEOUT32 _IOW('B',109, struct user32_timeval) +#endif /* KERNEL_PRIVATE */ +#define BIOCGRTIMEOUT _IOR('B',110, struct timeval) +#ifdef KERNEL_PRIVATE +#define BIOCGRTIMEOUT64 _IOR('B',110, struct user64_timeval) +#define BIOCGRTIMEOUT32 _IOR('B',110, struct user32_timeval) +#endif /* KERNEL_PRIVATE */ #define BIOCGSTATS _IOR('B',111, struct bpf_stat) #define BIOCIMMEDIATE _IOW('B',112, u_int) #define BIOCVERSION _IOR('B',113, struct bpf_version) diff --git a/bsd/net/bpf_filter.c b/bsd/net/bpf_filter.c index 31ce77023..69d35371f 100644 --- a/bsd/net/bpf_filter.c +++ b/bsd/net/bpf_filter.c @@ -110,6 +110,8 @@ } \ } +extern unsigned int 
bpf_maxbufsize; + static u_int16_t m_xhalf(struct mbuf *m, bpf_u_int32 k, int *err); static u_int32_t m_xword(struct mbuf *m, bpf_u_int32 k, int *err); @@ -528,9 +530,10 @@ bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) /* * Return true if the 'fcode' is a valid filter program. * The constraints are that each jump be forward and to a valid - * code. The code must terminate with either an accept or reject. - * 'valid' is an array for use by the routine (it must be at least - * 'len' bytes long). + * code, that memory accesses are within valid ranges (to the + * extent that this can be checked statically; loads of packet data + * have to be, and are, also checked at run time), and that + * the code terminates with either an accept or reject. * * The kernel needs to be able to verify an application's filter code. * Otherwise, a bogus program could easily crash the system. @@ -538,40 +541,112 @@ bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) int bpf_validate(const struct bpf_insn *f, int len) { - register int i; + u_int i, from; const struct bpf_insn *p; - for (i = 0; i < len; ++i) { - /* - * Check that that jumps are forward, and within - * the code block. - */ + if (len < 1 || len > BPF_MAXINSNS) + return 0; + + for (i = 0; i < ((u_int)len); ++i) { p = &f[i]; - if (BPF_CLASS(p->code) == BPF_JMP) { - register int from = i + 1; - - if (BPF_OP(p->code) == BPF_JA) { - if (from >= len || p->k >= (bpf_u_int32)(len - from)) + switch (BPF_CLASS(p->code)) { + /* + * Check that memory operations use valid addresses + */ + case BPF_LD: + case BPF_LDX: + switch (BPF_MODE(p->code)) { + case BPF_IMM: + break; + case BPF_ABS: + case BPF_IND: + case BPF_MSH: + /* + * More strict check with actual packet length + * is done runtime. 
+ */ + if (p->k >= bpf_maxbufsize) + return 0; + break; + case BPF_MEM: + if (p->k >= BPF_MEMWORDS) + return 0; + break; + case BPF_LEN: + break; + default: + return 0; + } + break; + case BPF_ST: + case BPF_STX: + if (p->k >= BPF_MEMWORDS) return 0; - } - else if (from >= len || p->jt >= len - from || - p->jf >= len - from) + break; + case BPF_ALU: + switch (BPF_OP(p->code)) { + case BPF_ADD: + case BPF_SUB: + case BPF_MUL: + case BPF_OR: + case BPF_AND: + case BPF_LSH: + case BPF_RSH: + case BPF_NEG: + break; + case BPF_DIV: + /* + * Check for constant division by 0 + */ + if(BPF_SRC(p->code) == BPF_K && p->k == 0) + return 0; + break; + default: + return 0; + } + break; + case BPF_JMP: + /* + * Check that jumps are within the code block, + * and that unconditional branches don't go + * backwards as a result of an overflow. + * Unconditional branches have a 32-bit offset, + * so they could overflow; we check to make + * sure they don't. Conditional branches have + * an 8-bit offset, and the from address is + * less than equal to BPF_MAXINSNS, and we assume that + * BPF_MAXINSNS is sufficiently small that adding 255 + * to it won't overlflow + * + * We know that len is <= BPF_MAXINSNS, and we + * assume that BPF_MAXINSNS is less than the maximum + * size of a u_int, so that i+1 doesn't overflow + */ + from = i+1; + switch (BPF_OP(p->code)) { + case BPF_JA: + if (from + p->k < from || from + p->k >= ((u_int)len)) + return 0; + break; + case BPF_JEQ: + case BPF_JGT: + case BPF_JGE: + case BPF_JSET: + if (from + p->jt >= ((u_int)len) || from + p->jf >= ((u_int)len)) + return 0; + break; + default: + return 0; + } + break; + case BPF_RET: + break; + case BPF_MISC: + break; + default: return 0; } - /* - * Check that memory operations use valid addresses. - */ - if ((BPF_CLASS(p->code) == BPF_ST || - (BPF_CLASS(p->code) == BPF_LD && - (p->code & 0xe0) == BPF_MEM)) && - p->k >= BPF_MEMWORDS) - return 0; - /* - * Check for constant division by 0. 
- */ - if (p->code == (BPF_ALU|BPF_DIV|BPF_K) && p->k == 0) - return 0; } - return BPF_CLASS(f[len - 1].code) == BPF_RET; + return BPF_CLASS(f[len - 1].code) == BPF_RET; } #endif diff --git a/bsd/net/bpfdesc.h b/bsd/net/bpfdesc.h index 2a5cd1aaf..e0507f935 100644 --- a/bsd/net/bpfdesc.h +++ b/bsd/net/bpfdesc.h @@ -76,6 +76,7 @@ */ #include +#include /* * Descriptor associated with each open bpf file. @@ -99,7 +100,7 @@ struct bpf_d { int bd_bufsize; /* absolute length of buffers */ - struct bpf_if * bd_bif; /* interface descriptor */ + struct bpf_if *bd_bif; /* interface descriptor */ u_int32_t bd_rtout; /* Read timeout in 'ticks' */ struct bpf_insn *bd_filter; /* filter code */ u_int32_t bd_rcount; /* number of packets received */ @@ -127,11 +128,24 @@ struct bpf_d { int bd_hdrcmplt; /* false to fill in src lladdr automatically */ int bd_seesent; /* true if bpf should see sent packets */ int bd_oflags; /* device open flags */ + thread_call_t bd_thread_call; /* for BPF timeouts with select */ #if CONFIG_MACF_NET struct label * bd_label; /* MAC label for descriptor */ #endif }; +/* Values for bd_state */ +#define BPF_IDLE 0 /* no select in progress or kqueue pending */ +#define BPF_WAITING 1 /* waiting for read timeout in select/kqueue */ +#define BPF_TIMED_OUT 2 /* read timeout has expired in select/kqueue */ +#define BPF_DRAINING 3 /* waiting for timeout routine to finish during close */ + +/* Test whether a BPF is ready for read(). */ +#define bpf_ready(bd) ((bd)->bd_hlen != 0 || \ + (((bd)->bd_immediate || (bd)->bd_state == BPF_TIMED_OUT) && \ + (bd)->bd_slen != 0)) + + /* * Descriptor associated with each attached hardware interface. */ diff --git a/bsd/net/bridgestp.c b/bsd/net/bridgestp.c new file mode 100644 index 000000000..1d6922f28 --- /dev/null +++ b/bsd/net/bridgestp.c @@ -0,0 +1,2425 @@ +/* $NetBSD: bridgestp.c,v 1.5 2003/11/28 08:56:48 keihan Exp $ */ + +/* + * Copyright (c) 2009-2010 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Copyright (c) 2000 Jason L. Wright (jason@thought.net) + * Copyright (c) 2006 Andrew Thompson (thompsa@FreeBSD.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * OpenBSD: bridgestp.c,v 1.5 2001/03/22 03:48:29 jason Exp + */ + +/* + * Implementation of the spanning tree protocol as defined in + * ISO/IEC 802.1D-2004, June 9, 2004. + */ + +#include +//__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +//#include +//#include +#include +#include +//#include +//#include + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +#include + +static lck_mtx_t *bstp_task_mtx = NULL; +static lck_grp_t *bstp_task_grp = NULL; +static lck_attr_t *bstp_task_attr = NULL; +static thread_t bstp_task_thread; +static TAILQ_HEAD(bstp_task_queue, bstp_task) + bstp_task_queue = TAILQ_HEAD_INITIALIZER(bstp_task_queue); +static struct bstp_task *bstp_task_queue_running = NULL; + +static void bstp_create_task_thread(void); +static void bstp_task_thread_func(void); + +static void bstp_task_enqueue(struct bstp_task *); +static void bstp_task_drain(struct bstp_task *); + +#define BSTP_TASK_INIT(bt, func, context) do { \ + (bt)->bt_count = 0; \ + (bt)->bt_func = func; \ + (bt)->bt_context = context; \ +} while(0) + + + +#define BSTP_LOCK_INIT(_bs) (_bs)->bs_mtx = lck_mtx_alloc_init(bstp_lock_grp, bstp_lock_attr) 
+#define BSTP_LOCK_DESTROY(_bs) lck_mtx_free((_bs)->bs_mtx, bstp_lock_grp) +#define BSTP_LOCK(_bs) lck_mtx_lock((_bs)->bs_mtx) +#define BSTP_UNLOCK(_bs) lck_mtx_unlock((_bs)->bs_mtx) +#define BSTP_LOCK_ASSERT(_bs) lck_mtx_assert((_bs)->bs_mtx, LCK_MTX_ASSERT_OWNED) + + +#ifdef BRIDGESTP_DEBUG +#define DPRINTF(fmt, arg...) printf("bstp: " fmt, ##arg) +#else +#define DPRINTF(fmt, arg...) +#endif + +#define PV2ADDR(pv, eaddr) do { \ + eaddr[0] = pv >> 40; \ + eaddr[1] = pv >> 32; \ + eaddr[2] = pv >> 24; \ + eaddr[3] = pv >> 16; \ + eaddr[4] = pv >> 8; \ + eaddr[5] = pv >> 0; \ +} while (0) + +#define INFO_BETTER 1 +#define INFO_SAME 0 +#define INFO_WORSE -1 + +const uint8_t bstp_etheraddr[] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }; + +LIST_HEAD(, bstp_state) bstp_list; +static lck_mtx_t *bstp_list_mtx; +static lck_grp_t *bstp_lock_grp = NULL; +static lck_attr_t *bstp_lock_attr = NULL; + +static void bstp_transmit(struct bstp_state *, struct bstp_port *); +static void bstp_transmit_bpdu(struct bstp_state *, struct bstp_port *); +static void bstp_transmit_tcn(struct bstp_state *, struct bstp_port *); +static void bstp_decode_bpdu(struct bstp_port *, struct bstp_cbpdu *, + struct bstp_config_unit *); +static void bstp_send_bpdu(struct bstp_state *, struct bstp_port *, + struct bstp_cbpdu *); +static void bstp_enqueue(struct ifnet *, struct mbuf *); +static int bstp_pdu_flags(struct bstp_port *); +static void bstp_received_stp(struct bstp_state *, struct bstp_port *, + struct mbuf **, struct bstp_tbpdu *); +static void bstp_received_rstp(struct bstp_state *, struct bstp_port *, + struct mbuf **, struct bstp_tbpdu *); +static void bstp_received_tcn(struct bstp_state *, struct bstp_port *, + struct bstp_tcn_unit *); +static void bstp_received_bpdu(struct bstp_state *, struct bstp_port *, + struct bstp_config_unit *); +static int bstp_pdu_rcvtype(struct bstp_port *, struct bstp_config_unit *); +static int bstp_pdu_bettersame(struct bstp_port *, int); +static int 
bstp_info_cmp(struct bstp_pri_vector *, + struct bstp_pri_vector *); +static int bstp_info_superior(struct bstp_pri_vector *, + struct bstp_pri_vector *); +static void bstp_assign_roles(struct bstp_state *); +static void bstp_update_roles(struct bstp_state *, struct bstp_port *); +static void bstp_update_state(struct bstp_state *, struct bstp_port *); +static void bstp_update_tc(struct bstp_port *); +static void bstp_update_info(struct bstp_port *); +static void bstp_set_other_tcprop(struct bstp_port *); +static void bstp_set_all_reroot(struct bstp_state *); +static void bstp_set_all_sync(struct bstp_state *); +static void bstp_set_port_state(struct bstp_port *, int); +static void bstp_set_port_role(struct bstp_port *, int); +static void bstp_set_port_proto(struct bstp_port *, int); +static void bstp_set_port_tc(struct bstp_port *, int); +static void bstp_set_timer_tc(struct bstp_port *); +static void bstp_set_timer_msgage(struct bstp_port *); +static int bstp_rerooted(struct bstp_state *, struct bstp_port *); +static uint32_t bstp_calc_path_cost(struct bstp_port *); +static void bstp_notify_state(void *, int); +static void bstp_notify_rtage(void *, int); +static void bstp_ifupdstatus(struct bstp_state *, struct bstp_port *); +static void bstp_enable_port(struct bstp_state *, struct bstp_port *); +static void bstp_disable_port(struct bstp_state *, struct bstp_port *); +static void bstp_tick(void *); +static void bstp_timer_start(struct bstp_timer *, uint16_t); +static void bstp_timer_stop(struct bstp_timer *); +static void bstp_timer_latch(struct bstp_timer *); +static int bstp_timer_expired(struct bstp_timer *); +static void bstp_hello_timer_expiry(struct bstp_state *, + struct bstp_port *); +static void bstp_message_age_expiry(struct bstp_state *, + struct bstp_port *); +static void bstp_migrate_delay_expiry(struct bstp_state *, + struct bstp_port *); +static void bstp_edge_delay_expiry(struct bstp_state *, + struct bstp_port *); +static int bstp_addr_cmp(const 
uint8_t *, const uint8_t *); +static int bstp_same_bridgeid(uint64_t, uint64_t); +static void bstp_reinit(struct bstp_state *); + +static void +bstp_transmit(struct bstp_state *bs, struct bstp_port *bp) +{ + if (bs->bs_running == 0) + return; + + /* + * a PDU can only be sent if we have tx quota left and the + * hello timer is running. + */ + if (bp->bp_hello_timer.active == 0) { + /* Test if it needs to be reset */ + bstp_hello_timer_expiry(bs, bp); + return; + } + if (bp->bp_txcount > bs->bs_txholdcount) + /* Ran out of karma */ + return; + + if (bp->bp_protover == BSTP_PROTO_RSTP) { + bstp_transmit_bpdu(bs, bp); + bp->bp_tc_ack = 0; + } else { /* STP */ + switch (bp->bp_role) { + case BSTP_ROLE_DESIGNATED: + bstp_transmit_bpdu(bs, bp); + bp->bp_tc_ack = 0; + break; + + case BSTP_ROLE_ROOT: + bstp_transmit_tcn(bs, bp); + break; + } + } + bstp_timer_start(&bp->bp_hello_timer, bp->bp_desg_htime); + bp->bp_flags &= ~BSTP_PORT_NEWINFO; +} + +static void +bstp_transmit_bpdu(struct bstp_state *bs, struct bstp_port *bp) +{ + struct bstp_cbpdu bpdu; + + BSTP_LOCK_ASSERT(bs); + + bpdu.cbu_rootpri = htons(bp->bp_desg_pv.pv_root_id >> 48); + PV2ADDR(bp->bp_desg_pv.pv_root_id, bpdu.cbu_rootaddr); + + bpdu.cbu_rootpathcost = htonl(bp->bp_desg_pv.pv_cost); + + bpdu.cbu_bridgepri = htons(bp->bp_desg_pv.pv_dbridge_id >> 48); + PV2ADDR(bp->bp_desg_pv.pv_dbridge_id, bpdu.cbu_bridgeaddr); + + bpdu.cbu_portid = htons(bp->bp_port_id); + bpdu.cbu_messageage = htons(bp->bp_desg_msg_age); + bpdu.cbu_maxage = htons(bp->bp_desg_max_age); + bpdu.cbu_hellotime = htons(bp->bp_desg_htime); + bpdu.cbu_forwarddelay = htons(bp->bp_desg_fdelay); + + bpdu.cbu_flags = bstp_pdu_flags(bp); + + switch (bp->bp_protover) { + case BSTP_PROTO_STP: + bpdu.cbu_bpdutype = BSTP_MSGTYPE_CFG; + break; + + case BSTP_PROTO_RSTP: + bpdu.cbu_bpdutype = BSTP_MSGTYPE_RSTP; + break; + } + + bstp_send_bpdu(bs, bp, &bpdu); +} + +static void +bstp_transmit_tcn(struct bstp_state *bs, struct bstp_port *bp) +{ + struct 
bstp_tbpdu bpdu; + struct ifnet *ifp = bp->bp_ifp; + struct ether_header *eh; + struct mbuf *m; + int touched = bs ? 1 : 0; + + touched++; + + KASSERT(bp == bs->bs_root_port, ("%s: bad root port\n", __func__)); + + if ((ifp->if_flags & IFF_RUNNING) == 0) + return; + + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == NULL) + return; + + m->m_pkthdr.rcvif = ifp; + m->m_pkthdr.len = sizeof(*eh) + sizeof(bpdu); + m->m_len = m->m_pkthdr.len; + + eh = mtod(m, struct ether_header *); + + memcpy(eh->ether_shost, ifnet_lladdr(ifp), ETHER_ADDR_LEN); + memcpy(eh->ether_dhost, bstp_etheraddr, ETHER_ADDR_LEN); + eh->ether_type = htons(sizeof(bpdu)); + + bpdu.tbu_ssap = bpdu.tbu_dsap = LLC_8021D_LSAP; + bpdu.tbu_ctl = LLC_UI; + bpdu.tbu_protoid = 0; + bpdu.tbu_protover = 0; + bpdu.tbu_bpdutype = BSTP_MSGTYPE_TCN; + + memcpy(mtod(m, caddr_t) + sizeof(*eh), &bpdu, sizeof(bpdu)); + + bp->bp_txcount++; + bstp_enqueue(ifp, m); +} + +static void +bstp_decode_bpdu(struct bstp_port *bp, struct bstp_cbpdu *cpdu, + struct bstp_config_unit *cu) +{ + int flags; + + cu->cu_pv.pv_root_id = + (((uint64_t)ntohs(cpdu->cbu_rootpri)) << 48) | + (((uint64_t)cpdu->cbu_rootaddr[0]) << 40) | + (((uint64_t)cpdu->cbu_rootaddr[1]) << 32) | + (((uint64_t)cpdu->cbu_rootaddr[2]) << 24) | + (((uint64_t)cpdu->cbu_rootaddr[3]) << 16) | + (((uint64_t)cpdu->cbu_rootaddr[4]) << 8) | + (((uint64_t)cpdu->cbu_rootaddr[5]) << 0); + + cu->cu_pv.pv_dbridge_id = + (((uint64_t)ntohs(cpdu->cbu_bridgepri)) << 48) | + (((uint64_t)cpdu->cbu_bridgeaddr[0]) << 40) | + (((uint64_t)cpdu->cbu_bridgeaddr[1]) << 32) | + (((uint64_t)cpdu->cbu_bridgeaddr[2]) << 24) | + (((uint64_t)cpdu->cbu_bridgeaddr[3]) << 16) | + (((uint64_t)cpdu->cbu_bridgeaddr[4]) << 8) | + (((uint64_t)cpdu->cbu_bridgeaddr[5]) << 0); + + cu->cu_pv.pv_cost = ntohl(cpdu->cbu_rootpathcost); + cu->cu_message_age = ntohs(cpdu->cbu_messageage); + cu->cu_max_age = ntohs(cpdu->cbu_maxage); + cu->cu_hello_time = ntohs(cpdu->cbu_hellotime); + cu->cu_forward_delay = 
ntohs(cpdu->cbu_forwarddelay); + cu->cu_pv.pv_dport_id = ntohs(cpdu->cbu_portid); + cu->cu_pv.pv_port_id = bp->bp_port_id; + cu->cu_message_type = cpdu->cbu_bpdutype; + + /* Strip off unused flags in STP mode */ + flags = cpdu->cbu_flags; + switch (cpdu->cbu_protover) { + case BSTP_PROTO_STP: + flags &= BSTP_PDU_STPMASK; + /* A STP BPDU explicitly conveys a Designated Port */ + cu->cu_role = BSTP_ROLE_DESIGNATED; + break; + + case BSTP_PROTO_RSTP: + flags &= BSTP_PDU_RSTPMASK; + break; + } + + cu->cu_topology_change_ack = + (flags & BSTP_PDU_F_TCA) ? 1 : 0; + cu->cu_proposal = + (flags & BSTP_PDU_F_P) ? 1 : 0; + cu->cu_agree = + (flags & BSTP_PDU_F_A) ? 1 : 0; + cu->cu_learning = + (flags & BSTP_PDU_F_L) ? 1 : 0; + cu->cu_forwarding = + (flags & BSTP_PDU_F_F) ? 1 : 0; + cu->cu_topology_change = + (flags & BSTP_PDU_F_TC) ? 1 : 0; + + switch ((flags & BSTP_PDU_PRMASK) >> BSTP_PDU_PRSHIFT) { + case BSTP_PDU_F_ROOT: + cu->cu_role = BSTP_ROLE_ROOT; + break; + case BSTP_PDU_F_ALT: + cu->cu_role = BSTP_ROLE_ALTERNATE; + break; + case BSTP_PDU_F_DESG: + cu->cu_role = BSTP_ROLE_DESIGNATED; + break; + } +} + +static void +bstp_send_bpdu(struct bstp_state *bs, struct bstp_port *bp, + struct bstp_cbpdu *bpdu) +{ + struct ifnet *ifp; + struct mbuf *m; + struct ether_header *eh; + + BSTP_LOCK_ASSERT(bs); + + ifp = bp->bp_ifp; + + if ((ifp->if_flags & IFF_RUNNING) == 0) + return; + + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == NULL) + return; + + eh = mtod(m, struct ether_header *); + + bpdu->cbu_ssap = bpdu->cbu_dsap = LLC_8021D_LSAP; + bpdu->cbu_ctl = LLC_UI; + bpdu->cbu_protoid = htons(BSTP_PROTO_ID); + + memcpy(eh->ether_shost, ifnet_lladdr(ifp), ETHER_ADDR_LEN); + memcpy(eh->ether_dhost, bstp_etheraddr, ETHER_ADDR_LEN); + + switch (bpdu->cbu_bpdutype) { + case BSTP_MSGTYPE_CFG: + bpdu->cbu_protover = BSTP_PROTO_STP; + m->m_pkthdr.len = sizeof(*eh) + BSTP_BPDU_STP_LEN; + eh->ether_type = htons(BSTP_BPDU_STP_LEN); + memcpy(mtod(m, caddr_t) + sizeof(*eh), bpdu, + 
BSTP_BPDU_STP_LEN); + break; + + case BSTP_MSGTYPE_RSTP: + bpdu->cbu_protover = BSTP_PROTO_RSTP; + bpdu->cbu_versionlen = htons(0); + m->m_pkthdr.len = sizeof(*eh) + BSTP_BPDU_RSTP_LEN; + eh->ether_type = htons(BSTP_BPDU_RSTP_LEN); + memcpy(mtod(m, caddr_t) + sizeof(*eh), bpdu, + BSTP_BPDU_RSTP_LEN); + break; + + default: + panic("not implemented"); + } + m->m_pkthdr.rcvif = ifp; + m->m_len = m->m_pkthdr.len; + + bp->bp_txcount++; + bstp_enqueue(ifp, m); +} + +static void +bstp_enqueue(struct ifnet *dst_ifp, struct mbuf *m) +{ + errno_t error = 0; + u_int32_t len = m->m_pkthdr.len; + + m->m_flags |= M_PROTO1; //set to avoid loops + + error = ifnet_output_raw(dst_ifp, 0, m); + if (error == 0) { + (void) ifnet_stat_increment_out(dst_ifp, 1, len, 0); + } else { + (void) ifnet_stat_increment_out(dst_ifp, 0, 0, 1); + } +} + +static int +bstp_pdu_flags(struct bstp_port *bp) +{ + int flags = 0; + + if (bp->bp_proposing && bp->bp_state != BSTP_IFSTATE_FORWARDING) + flags |= BSTP_PDU_F_P; + + if (bp->bp_agree) + flags |= BSTP_PDU_F_A; + + if (bp->bp_tc_timer.active) + flags |= BSTP_PDU_F_TC; + + if (bp->bp_tc_ack) + flags |= BSTP_PDU_F_TCA; + + switch (bp->bp_state) { + case BSTP_IFSTATE_LEARNING: + flags |= BSTP_PDU_F_L; + break; + + case BSTP_IFSTATE_FORWARDING: + flags |= (BSTP_PDU_F_L | BSTP_PDU_F_F); + break; + } + + switch (bp->bp_role) { + case BSTP_ROLE_ROOT: + flags |= + (BSTP_PDU_F_ROOT << BSTP_PDU_PRSHIFT); + break; + + case BSTP_ROLE_ALTERNATE: + case BSTP_ROLE_BACKUP: /* fall through */ + flags |= + (BSTP_PDU_F_ALT << BSTP_PDU_PRSHIFT); + break; + + case BSTP_ROLE_DESIGNATED: + flags |= + (BSTP_PDU_F_DESG << BSTP_PDU_PRSHIFT); + break; + } + + /* Strip off unused flags in either mode */ + switch (bp->bp_protover) { + case BSTP_PROTO_STP: + flags &= BSTP_PDU_STPMASK; + break; + case BSTP_PROTO_RSTP: + flags &= BSTP_PDU_RSTPMASK; + break; + } + return (flags); +} + +struct mbuf * +bstp_input(struct bstp_port *bp, __unused struct ifnet *ifp, struct mbuf *m) +{ + 
struct bstp_state *bs = bp->bp_bs; + struct ether_header *eh; + struct bstp_tbpdu tpdu; + uint16_t len; + + if (bp->bp_active == 0) { + m_freem(m); + return (NULL); + } + + BSTP_LOCK(bs); + + eh = mtod(m, struct ether_header *); + + len = ntohs(eh->ether_type); + if (len < sizeof(tpdu)) + goto out; + + m_adj(m, ETHER_HDR_LEN); + + if (m->m_pkthdr.len > len) + m_adj(m, len - m->m_pkthdr.len); + if ((unsigned int)m->m_len < sizeof(tpdu) && + (m = m_pullup(m, sizeof(tpdu))) == NULL) + goto out; + + memcpy(&tpdu, mtod(m, caddr_t), sizeof(tpdu)); + + /* basic packet checks */ + if (tpdu.tbu_dsap != LLC_8021D_LSAP || + tpdu.tbu_ssap != LLC_8021D_LSAP || + tpdu.tbu_ctl != LLC_UI) + goto out; + if (tpdu.tbu_protoid != BSTP_PROTO_ID) + goto out; + + /* + * We can treat later versions of the PDU as the same as the maximum + * version we implement. All additional parameters/flags are ignored. + */ + if (tpdu.tbu_protover > BSTP_PROTO_MAX) + tpdu.tbu_protover = BSTP_PROTO_MAX; + + if (tpdu.tbu_protover != bp->bp_protover) { + /* + * Wait for the migration delay timer to expire before changing + * protocol version to avoid flip-flops. 
+ */ + if (bp->bp_flags & BSTP_PORT_CANMIGRATE) + bstp_set_port_proto(bp, tpdu.tbu_protover); + else + goto out; + } + + /* Clear operedge upon receiving a PDU on the port */ + bp->bp_operedge = 0; + bstp_timer_start(&bp->bp_edge_delay_timer, + BSTP_DEFAULT_MIGRATE_DELAY); + + switch (tpdu.tbu_protover) { + case BSTP_PROTO_STP: + bstp_received_stp(bs, bp, &m, &tpdu); + break; + + case BSTP_PROTO_RSTP: + bstp_received_rstp(bs, bp, &m, &tpdu); + break; + } +out: + BSTP_UNLOCK(bs); + if (m) + m_freem(m); + return (NULL); +} + +static void +bstp_received_stp(struct bstp_state *bs, struct bstp_port *bp, + struct mbuf **mp, struct bstp_tbpdu *tpdu) +{ + struct bstp_cbpdu cpdu; + struct bstp_config_unit *cu = &bp->bp_msg_cu; + struct bstp_tcn_unit tu; + + switch (tpdu->tbu_bpdutype) { + case BSTP_MSGTYPE_TCN: + tu.tu_message_type = tpdu->tbu_bpdutype; + bstp_received_tcn(bs, bp, &tu); + break; + case BSTP_MSGTYPE_CFG: + if ((*mp)->m_len < BSTP_BPDU_STP_LEN && + (*mp = m_pullup(*mp, BSTP_BPDU_STP_LEN)) == NULL) + return; + memcpy(&cpdu, mtod(*mp, caddr_t), BSTP_BPDU_STP_LEN); + + bstp_decode_bpdu(bp, &cpdu, cu); + bstp_received_bpdu(bs, bp, cu); + break; + } +} + +static void +bstp_received_rstp(struct bstp_state *bs, struct bstp_port *bp, + struct mbuf **mp, struct bstp_tbpdu *tpdu) +{ + struct bstp_cbpdu cpdu; + struct bstp_config_unit *cu = &bp->bp_msg_cu; + + if (tpdu->tbu_bpdutype != BSTP_MSGTYPE_RSTP) + return; + + if ((*mp)->m_len < BSTP_BPDU_RSTP_LEN && + (*mp = m_pullup(*mp, BSTP_BPDU_RSTP_LEN)) == NULL) + return; + memcpy(&cpdu, mtod(*mp, caddr_t), BSTP_BPDU_RSTP_LEN); + + bstp_decode_bpdu(bp, &cpdu, cu); + bstp_received_bpdu(bs, bp, cu); +} + +static void +bstp_received_tcn(__unused struct bstp_state *bs, struct bstp_port *bp, + __unused struct bstp_tcn_unit *tcn) +{ + bp->bp_rcvdtcn = 1; + bstp_update_tc(bp); +} + +static void +bstp_received_bpdu(struct bstp_state *bs, struct bstp_port *bp, + struct bstp_config_unit *cu) +{ + int type; + + BSTP_LOCK_ASSERT(bs); 
+ + /* We need to have transitioned to INFO_MINE before proceeding */ + switch (bp->bp_infois) { + case BSTP_INFO_DISABLED: + case BSTP_INFO_AGED: + return; + } + + type = bstp_pdu_rcvtype(bp, cu); + + switch (type) { + case BSTP_PDU_SUPERIOR: + bs->bs_allsynced = 0; + bp->bp_agreed = 0; + bp->bp_proposing = 0; + + if (cu->cu_proposal && cu->cu_forwarding == 0) + bp->bp_proposed = 1; + if (cu->cu_topology_change) + bp->bp_rcvdtc = 1; + if (cu->cu_topology_change_ack) + bp->bp_rcvdtca = 1; + + if (bp->bp_agree && + !bstp_pdu_bettersame(bp, BSTP_INFO_RECEIVED)) + bp->bp_agree = 0; + + /* copy the received priority and timers to the port */ + bp->bp_port_pv = cu->cu_pv; + bp->bp_port_msg_age = cu->cu_message_age; + bp->bp_port_max_age = cu->cu_max_age; + bp->bp_port_fdelay = cu->cu_forward_delay; + bp->bp_port_htime = + (cu->cu_hello_time > BSTP_MIN_HELLO_TIME ? + cu->cu_hello_time : BSTP_MIN_HELLO_TIME); + + /* set expiry for the new info */ + bstp_set_timer_msgage(bp); + + bp->bp_infois = BSTP_INFO_RECEIVED; + bstp_assign_roles(bs); + break; + + case BSTP_PDU_REPEATED: + if (cu->cu_proposal && cu->cu_forwarding == 0) + bp->bp_proposed = 1; + if (cu->cu_topology_change) + bp->bp_rcvdtc = 1; + if (cu->cu_topology_change_ack) + bp->bp_rcvdtca = 1; + + /* rearm the age timer */ + bstp_set_timer_msgage(bp); + break; + + case BSTP_PDU_INFERIOR: + if (cu->cu_learning) { + bp->bp_agreed = 1; + bp->bp_proposing = 0; + } + break; + + case BSTP_PDU_INFERIORALT: + /* + * only point to point links are allowed fast + * transitions to forwarding. 
+ */ + if (cu->cu_agree && bp->bp_ptp_link) { + bp->bp_agreed = 1; + bp->bp_proposing = 0; + } else + bp->bp_agreed = 0; + + if (cu->cu_topology_change) + bp->bp_rcvdtc = 1; + if (cu->cu_topology_change_ack) + bp->bp_rcvdtca = 1; + break; + + case BSTP_PDU_OTHER: + return; /* do nothing */ + } + /* update the state machines with the new data */ + bstp_update_state(bs, bp); +} + +static int +bstp_pdu_rcvtype(struct bstp_port *bp, struct bstp_config_unit *cu) +{ + int type; + + /* default return type */ + type = BSTP_PDU_OTHER; + + switch (cu->cu_role) { + case BSTP_ROLE_DESIGNATED: + if (bstp_info_superior(&bp->bp_port_pv, &cu->cu_pv)) + /* bpdu priority is superior */ + type = BSTP_PDU_SUPERIOR; + else if (bstp_info_cmp(&bp->bp_port_pv, &cu->cu_pv) == + INFO_SAME) { + if (bp->bp_port_msg_age != cu->cu_message_age || + bp->bp_port_max_age != cu->cu_max_age || + bp->bp_port_fdelay != cu->cu_forward_delay || + bp->bp_port_htime != cu->cu_hello_time) + /* bpdu priority is equal and timers differ */ + type = BSTP_PDU_SUPERIOR; + else + /* bpdu is equal */ + type = BSTP_PDU_REPEATED; + } else + /* bpdu priority is worse */ + type = BSTP_PDU_INFERIOR; + + break; + + case BSTP_ROLE_ROOT: + case BSTP_ROLE_ALTERNATE: + case BSTP_ROLE_BACKUP: + if (bstp_info_cmp(&bp->bp_port_pv, &cu->cu_pv) <= INFO_SAME) + /* + * not a designated port and priority is the same or + * worse + */ + type = BSTP_PDU_INFERIORALT; + break; + } + + return (type); +} + +static int +bstp_pdu_bettersame(struct bstp_port *bp, int newinfo) +{ + if (newinfo == BSTP_INFO_RECEIVED && + bp->bp_infois == BSTP_INFO_RECEIVED && + bstp_info_cmp(&bp->bp_port_pv, &bp->bp_msg_cu.cu_pv) >= INFO_SAME) + return (1); + + if (newinfo == BSTP_INFO_MINE && + bp->bp_infois == BSTP_INFO_MINE && + bstp_info_cmp(&bp->bp_port_pv, &bp->bp_desg_pv) >= INFO_SAME) + return (1); + + return (0); +} + +static int +bstp_info_cmp(struct bstp_pri_vector *pv, + struct bstp_pri_vector *cpv) +{ + if (cpv->pv_root_id < pv->pv_root_id) + 
return (INFO_BETTER); + if (cpv->pv_root_id > pv->pv_root_id) + return (INFO_WORSE); + + if (cpv->pv_cost < pv->pv_cost) + return (INFO_BETTER); + if (cpv->pv_cost > pv->pv_cost) + return (INFO_WORSE); + + if (cpv->pv_dbridge_id < pv->pv_dbridge_id) + return (INFO_BETTER); + if (cpv->pv_dbridge_id > pv->pv_dbridge_id) + return (INFO_WORSE); + + if (cpv->pv_dport_id < pv->pv_dport_id) + return (INFO_BETTER); + if (cpv->pv_dport_id > pv->pv_dport_id) + return (INFO_WORSE); + + return (INFO_SAME); +} + +/* + * This message priority vector is superior to the port priority vector and + * will replace it if, and only if, the message priority vector is better than + * the port priority vector, or the message has been transmitted from the same + * designated bridge and designated port as the port priority vector. + */ +static int +bstp_info_superior(struct bstp_pri_vector *pv, + struct bstp_pri_vector *cpv) +{ + if (bstp_info_cmp(pv, cpv) == INFO_BETTER || + (bstp_same_bridgeid(pv->pv_dbridge_id, cpv->pv_dbridge_id) && + (cpv->pv_dport_id & 0xfff) == (pv->pv_dport_id & 0xfff))) + return (1); + return (0); +} + +static void +bstp_assign_roles(struct bstp_state *bs) +{ + struct bstp_port *bp, *rbp = NULL; + struct bstp_pri_vector pv; + + /* default to our priority vector */ + bs->bs_root_pv = bs->bs_bridge_pv; + bs->bs_root_msg_age = 0; + bs->bs_root_max_age = bs->bs_bridge_max_age; + bs->bs_root_fdelay = bs->bs_bridge_fdelay; + bs->bs_root_htime = bs->bs_bridge_htime; + bs->bs_root_port = NULL; + + /* check if any recieved info supersedes us */ + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { + if (bp->bp_infois != BSTP_INFO_RECEIVED) + continue; + + pv = bp->bp_port_pv; + pv.pv_cost += bp->bp_path_cost; + + /* + * The root priority vector is the best of the set comprising + * the bridge priority vector plus all root path priority + * vectors whose bridge address is not equal to us. 
+ */ + if (bstp_same_bridgeid(pv.pv_dbridge_id, + bs->bs_bridge_pv.pv_dbridge_id) == 0 && + bstp_info_cmp(&bs->bs_root_pv, &pv) == INFO_BETTER) { + /* the port vector replaces the root */ + bs->bs_root_pv = pv; + bs->bs_root_msg_age = bp->bp_port_msg_age + + BSTP_MESSAGE_AGE_INCR; + bs->bs_root_max_age = bp->bp_port_max_age; + bs->bs_root_fdelay = bp->bp_port_fdelay; + bs->bs_root_htime = bp->bp_port_htime; + rbp = bp; + } + } + + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { + /* calculate the port designated vector */ + bp->bp_desg_pv.pv_root_id = bs->bs_root_pv.pv_root_id; + bp->bp_desg_pv.pv_cost = bs->bs_root_pv.pv_cost; + bp->bp_desg_pv.pv_dbridge_id = bs->bs_bridge_pv.pv_dbridge_id; + bp->bp_desg_pv.pv_dport_id = bp->bp_port_id; + bp->bp_desg_pv.pv_port_id = bp->bp_port_id; + + /* calculate designated times */ + bp->bp_desg_msg_age = bs->bs_root_msg_age; + bp->bp_desg_max_age = bs->bs_root_max_age; + bp->bp_desg_fdelay = bs->bs_root_fdelay; + bp->bp_desg_htime = bs->bs_bridge_htime; + + + switch (bp->bp_infois) { + case BSTP_INFO_DISABLED: + bstp_set_port_role(bp, BSTP_ROLE_DISABLED); + break; + + case BSTP_INFO_AGED: + bstp_set_port_role(bp, BSTP_ROLE_DESIGNATED); + bstp_update_info(bp); + break; + + case BSTP_INFO_MINE: + bstp_set_port_role(bp, BSTP_ROLE_DESIGNATED); + /* update the port info if stale */ + if (bstp_info_cmp(&bp->bp_port_pv, + &bp->bp_desg_pv) != INFO_SAME || + (rbp != NULL && + (bp->bp_port_msg_age != rbp->bp_port_msg_age || + bp->bp_port_max_age != rbp->bp_port_max_age || + bp->bp_port_fdelay != rbp->bp_port_fdelay || + bp->bp_port_htime != rbp->bp_port_htime))) + bstp_update_info(bp); + break; + + case BSTP_INFO_RECEIVED: + if (bp == rbp) { + /* + * root priority is derived from this + * port, make it the root port. + */ + bstp_set_port_role(bp, BSTP_ROLE_ROOT); + bs->bs_root_port = bp; + } else if (bstp_info_cmp(&bp->bp_port_pv, + &bp->bp_desg_pv) == INFO_BETTER) { + /* + * the port priority is lower than the root + * port. 
+ */ + bstp_set_port_role(bp, BSTP_ROLE_DESIGNATED); + bstp_update_info(bp); + } else { + if (bstp_same_bridgeid( + bp->bp_port_pv.pv_dbridge_id, + bs->bs_bridge_pv.pv_dbridge_id)) { + /* + * the designated bridge refers to + * another port on this bridge. + */ + bstp_set_port_role(bp, + BSTP_ROLE_BACKUP); + } else { + /* + * the port is an inferior path to the + * root bridge. + */ + bstp_set_port_role(bp, + BSTP_ROLE_ALTERNATE); + } + } + break; + } + } +} + +static void +bstp_update_state(struct bstp_state *bs, struct bstp_port *bp) +{ + struct bstp_port *bp2; + int synced; + + BSTP_LOCK_ASSERT(bs); + + /* check if all the ports have syncronised again */ + if (!bs->bs_allsynced) { + synced = 1; + LIST_FOREACH(bp2, &bs->bs_bplist, bp_next) { + if (!(bp2->bp_synced || + bp2->bp_role == BSTP_ROLE_ROOT)) { + synced = 0; + break; + } + } + bs->bs_allsynced = synced; + } + + bstp_update_roles(bs, bp); + bstp_update_tc(bp); +} + +static void +bstp_update_roles(struct bstp_state *bs, struct bstp_port *bp) +{ + switch (bp->bp_role) { + case BSTP_ROLE_DISABLED: + /* Clear any flags if set */ + if (bp->bp_sync || !bp->bp_synced || bp->bp_reroot) { + bp->bp_sync = 0; + bp->bp_synced = 1; + bp->bp_reroot = 0; + } + break; + + case BSTP_ROLE_ALTERNATE: + case BSTP_ROLE_BACKUP: + if ((bs->bs_allsynced && !bp->bp_agree) || + (bp->bp_proposed && bp->bp_agree)) { + bp->bp_proposed = 0; + bp->bp_agree = 1; + bp->bp_flags |= BSTP_PORT_NEWINFO; + DPRINTF("%s -> ALTERNATE_AGREED\n", + bp->bp_ifp->if_xname); + } + + if (bp->bp_proposed && !bp->bp_agree) { + bstp_set_all_sync(bs); + bp->bp_proposed = 0; + DPRINTF("%s -> ALTERNATE_PROPOSED\n", + bp->bp_ifp->if_xname); + } + + /* Clear any flags if set */ + if (bp->bp_sync || !bp->bp_synced || bp->bp_reroot) { + bp->bp_sync = 0; + bp->bp_synced = 1; + bp->bp_reroot = 0; + DPRINTF("%s -> ALTERNATE_PORT\n", bp->bp_ifp->if_xname); + } + break; + + case BSTP_ROLE_ROOT: + if (bp->bp_state != BSTP_IFSTATE_FORWARDING && !bp->bp_reroot) { + 
bstp_set_all_reroot(bs); + DPRINTF("%s -> ROOT_REROOT\n", bp->bp_ifp->if_xname); + } + + if ((bs->bs_allsynced && !bp->bp_agree) || + (bp->bp_proposed && bp->bp_agree)) { + bp->bp_proposed = 0; + bp->bp_sync = 0; + bp->bp_agree = 1; + bp->bp_flags |= BSTP_PORT_NEWINFO; + DPRINTF("%s -> ROOT_AGREED\n", bp->bp_ifp->if_xname); + } + + if (bp->bp_proposed && !bp->bp_agree) { + bstp_set_all_sync(bs); + bp->bp_proposed = 0; + DPRINTF("%s -> ROOT_PROPOSED\n", bp->bp_ifp->if_xname); + } + + if (bp->bp_state != BSTP_IFSTATE_FORWARDING && + (bp->bp_forward_delay_timer.active == 0 || + (bstp_rerooted(bs, bp) && + bp->bp_recent_backup_timer.active == 0 && + bp->bp_protover == BSTP_PROTO_RSTP))) { + switch (bp->bp_state) { + case BSTP_IFSTATE_DISCARDING: + bstp_set_port_state(bp, BSTP_IFSTATE_LEARNING); + break; + case BSTP_IFSTATE_LEARNING: + bstp_set_port_state(bp, + BSTP_IFSTATE_FORWARDING); + break; + } + } + + if (bp->bp_state == BSTP_IFSTATE_FORWARDING && bp->bp_reroot) { + bp->bp_reroot = 0; + DPRINTF("%s -> ROOT_REROOTED\n", bp->bp_ifp->if_xname); + } + break; + + case BSTP_ROLE_DESIGNATED: + if (bp->bp_recent_root_timer.active == 0 && bp->bp_reroot) { + bp->bp_reroot = 0; + DPRINTF("%s -> DESIGNATED_RETIRED\n", + bp->bp_ifp->if_xname); + } + + if ((bp->bp_state == BSTP_IFSTATE_DISCARDING && + !bp->bp_synced) || (bp->bp_agreed && !bp->bp_synced) || + (bp->bp_operedge && !bp->bp_synced) || + (bp->bp_sync && bp->bp_synced)) { + bstp_timer_stop(&bp->bp_recent_root_timer); + bp->bp_synced = 1; + bp->bp_sync = 0; + DPRINTF("%s -> DESIGNATED_SYNCED\n", + bp->bp_ifp->if_xname); + } + + if (bp->bp_state != BSTP_IFSTATE_FORWARDING && + !bp->bp_agreed && !bp->bp_proposing && + !bp->bp_operedge) { + bp->bp_proposing = 1; + bp->bp_flags |= BSTP_PORT_NEWINFO; + bstp_timer_start(&bp->bp_edge_delay_timer, + (bp->bp_ptp_link ? 
BSTP_DEFAULT_MIGRATE_DELAY : + bp->bp_desg_max_age)); + DPRINTF("%s -> DESIGNATED_PROPOSE\n", + bp->bp_ifp->if_xname); + } + + if (bp->bp_state != BSTP_IFSTATE_FORWARDING && + (bp->bp_forward_delay_timer.active == 0 || bp->bp_agreed || + bp->bp_operedge) && + (bp->bp_recent_root_timer.active == 0 || !bp->bp_reroot) && + !bp->bp_sync) { +#ifdef BRIDGESTP_DEBUG + if (bp->bp_agreed) + DPRINTF("%s -> AGREED\n", bp->bp_ifp->if_xname); +#endif /* BRIDGESTP_DEBUG */ + /* + * If agreed|operedge then go straight to forwarding, + * otherwise follow discard -> learn -> forward. + */ + if (bp->bp_agreed || bp->bp_operedge || + bp->bp_state == BSTP_IFSTATE_LEARNING) { + bstp_set_port_state(bp, + BSTP_IFSTATE_FORWARDING); + bp->bp_agreed = bp->bp_protover; + } else if (bp->bp_state == BSTP_IFSTATE_DISCARDING) + bstp_set_port_state(bp, BSTP_IFSTATE_LEARNING); + } + + if (((bp->bp_sync && !bp->bp_synced) || + (bp->bp_reroot && bp->bp_recent_root_timer.active) || + (bp->bp_flags & BSTP_PORT_DISPUTED)) && !bp->bp_operedge && + bp->bp_state != BSTP_IFSTATE_DISCARDING) { + bstp_set_port_state(bp, BSTP_IFSTATE_DISCARDING); + bp->bp_flags &= ~BSTP_PORT_DISPUTED; + bstp_timer_start(&bp->bp_forward_delay_timer, + bp->bp_protover == BSTP_PROTO_RSTP ? 
+ bp->bp_desg_htime : bp->bp_desg_fdelay); + DPRINTF("%s -> DESIGNATED_DISCARD\n", + bp->bp_ifp->if_xname); + } + break; + } + + if (bp->bp_flags & BSTP_PORT_NEWINFO) + bstp_transmit(bs, bp); +} + +static void +bstp_update_tc(struct bstp_port *bp) +{ + switch (bp->bp_tcstate) { + case BSTP_TCSTATE_ACTIVE: + if ((bp->bp_role != BSTP_ROLE_DESIGNATED && + bp->bp_role != BSTP_ROLE_ROOT) || bp->bp_operedge) + bstp_set_port_tc(bp, BSTP_TCSTATE_LEARNING); + + if (bp->bp_rcvdtcn) + bstp_set_port_tc(bp, BSTP_TCSTATE_TCN); + if (bp->bp_rcvdtc) + bstp_set_port_tc(bp, BSTP_TCSTATE_TC); + + if (bp->bp_tc_prop && !bp->bp_operedge) + bstp_set_port_tc(bp, BSTP_TCSTATE_PROPAG); + + if (bp->bp_rcvdtca) + bstp_set_port_tc(bp, BSTP_TCSTATE_ACK); + break; + + case BSTP_TCSTATE_INACTIVE: + if ((bp->bp_state == BSTP_IFSTATE_LEARNING || + bp->bp_state == BSTP_IFSTATE_FORWARDING) && + bp->bp_fdbflush == 0) + bstp_set_port_tc(bp, BSTP_TCSTATE_LEARNING); + break; + + case BSTP_TCSTATE_LEARNING: + if (bp->bp_rcvdtc || bp->bp_rcvdtcn || bp->bp_rcvdtca || + bp->bp_tc_prop) + bstp_set_port_tc(bp, BSTP_TCSTATE_LEARNING); + else if (bp->bp_role != BSTP_ROLE_DESIGNATED && + bp->bp_role != BSTP_ROLE_ROOT && + bp->bp_state == BSTP_IFSTATE_DISCARDING) + bstp_set_port_tc(bp, BSTP_TCSTATE_INACTIVE); + + if ((bp->bp_role == BSTP_ROLE_DESIGNATED || + bp->bp_role == BSTP_ROLE_ROOT) && + bp->bp_state == BSTP_IFSTATE_FORWARDING && + !bp->bp_operedge) + bstp_set_port_tc(bp, BSTP_TCSTATE_DETECTED); + break; + + /* these are transient states and go straight back to ACTIVE */ + case BSTP_TCSTATE_DETECTED: + case BSTP_TCSTATE_TCN: + case BSTP_TCSTATE_TC: + case BSTP_TCSTATE_PROPAG: + case BSTP_TCSTATE_ACK: + DPRINTF("Invalid TC state for %s\n", + bp->bp_ifp->if_xname); + break; + } + +} + +static void +bstp_update_info(struct bstp_port *bp) +{ + struct bstp_state *bs = bp->bp_bs; + + bp->bp_proposing = 0; + bp->bp_proposed = 0; + + if (bp->bp_agreed && !bstp_pdu_bettersame(bp, BSTP_INFO_MINE)) + bp->bp_agreed = 
0; + + if (bp->bp_synced && !bp->bp_agreed) { + bp->bp_synced = 0; + bs->bs_allsynced = 0; + } + + /* copy the designated pv to the port */ + bp->bp_port_pv = bp->bp_desg_pv; + bp->bp_port_msg_age = bp->bp_desg_msg_age; + bp->bp_port_max_age = bp->bp_desg_max_age; + bp->bp_port_fdelay = bp->bp_desg_fdelay; + bp->bp_port_htime = bp->bp_desg_htime; + bp->bp_infois = BSTP_INFO_MINE; + + /* Set transmit flag but do not immediately send */ + bp->bp_flags |= BSTP_PORT_NEWINFO; +} + +/* set tcprop on every port other than the caller */ +static void +bstp_set_other_tcprop(struct bstp_port *bp) +{ + struct bstp_state *bs = bp->bp_bs; + struct bstp_port *bp2; + + BSTP_LOCK_ASSERT(bs); + + LIST_FOREACH(bp2, &bs->bs_bplist, bp_next) { + if (bp2 == bp) + continue; + bp2->bp_tc_prop = 1; + } +} + +static void +bstp_set_all_reroot(struct bstp_state *bs) +{ + struct bstp_port *bp; + + BSTP_LOCK_ASSERT(bs); + + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) + bp->bp_reroot = 1; +} + +static void +bstp_set_all_sync(struct bstp_state *bs) +{ + struct bstp_port *bp; + + BSTP_LOCK_ASSERT(bs); + + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { + bp->bp_sync = 1; + bp->bp_synced = 0; /* Not explicit in spec */ + } + + bs->bs_allsynced = 0; +} + +static void +bstp_set_port_state(struct bstp_port *bp, int state) +{ + if (bp->bp_state == state) + return; + + bp->bp_state = state; + + switch (bp->bp_state) { + case BSTP_IFSTATE_DISCARDING: + DPRINTF("state changed to DISCARDING on %s\n", + bp->bp_ifp->if_xname); + break; + + case BSTP_IFSTATE_LEARNING: + DPRINTF("state changed to LEARNING on %s\n", + bp->bp_ifp->if_xname); + + bstp_timer_start(&bp->bp_forward_delay_timer, + bp->bp_protover == BSTP_PROTO_RSTP ? 
+ bp->bp_desg_htime : bp->bp_desg_fdelay); + break; + + case BSTP_IFSTATE_FORWARDING: + DPRINTF("state changed to FORWARDING on %s\n", + bp->bp_ifp->if_xname); + + bstp_timer_stop(&bp->bp_forward_delay_timer); + /* Record that we enabled forwarding */ + bp->bp_forward_transitions++; + break; + } + + /* notify the parent bridge */ + bstp_task_enqueue(&bp->bp_statetask); +} + +static void +bstp_set_port_role(struct bstp_port *bp, int role) +{ + struct bstp_state *bs = bp->bp_bs; + + if (bp->bp_role == role) + return; + + /* perform pre-change tasks */ + switch (bp->bp_role) { + case BSTP_ROLE_DISABLED: + bstp_timer_start(&bp->bp_forward_delay_timer, + bp->bp_desg_max_age); + break; + + case BSTP_ROLE_BACKUP: + bstp_timer_start(&bp->bp_recent_backup_timer, + bp->bp_desg_htime * 2); + /* fall through */ + case BSTP_ROLE_ALTERNATE: + bstp_timer_start(&bp->bp_forward_delay_timer, + bp->bp_desg_fdelay); + bp->bp_sync = 0; + bp->bp_synced = 1; + bp->bp_reroot = 0; + break; + + case BSTP_ROLE_ROOT: + bstp_timer_start(&bp->bp_recent_root_timer, + BSTP_DEFAULT_FORWARD_DELAY); + break; + } + + bp->bp_role = role; + /* clear values not carried between roles */ + bp->bp_proposing = 0; + bs->bs_allsynced = 0; + + /* initialise the new role */ + switch (bp->bp_role) { + case BSTP_ROLE_DISABLED: + case BSTP_ROLE_ALTERNATE: + case BSTP_ROLE_BACKUP: + DPRINTF("%s role -> ALT/BACK/DISABLED\n", + bp->bp_ifp->if_xname); + bstp_set_port_state(bp, BSTP_IFSTATE_DISCARDING); + bstp_timer_stop(&bp->bp_recent_root_timer); + bstp_timer_latch(&bp->bp_forward_delay_timer); + bp->bp_sync = 0; + bp->bp_synced = 1; + bp->bp_reroot = 0; + break; + + case BSTP_ROLE_ROOT: + DPRINTF("%s role -> ROOT\n", + bp->bp_ifp->if_xname); + bstp_set_port_state(bp, BSTP_IFSTATE_DISCARDING); + bstp_timer_latch(&bp->bp_recent_root_timer); + bp->bp_proposing = 0; + break; + + case BSTP_ROLE_DESIGNATED: + DPRINTF("%s role -> DESIGNATED\n", + bp->bp_ifp->if_xname); + bstp_timer_start(&bp->bp_hello_timer, + 
bp->bp_desg_htime); + bp->bp_agree = 0; + break; + } + + /* let the TC state know that the role changed */ + bstp_update_tc(bp); +} + +static void +bstp_set_port_proto(struct bstp_port *bp, int proto) +{ + struct bstp_state *bs = bp->bp_bs; + + /* supported protocol versions */ + switch (proto) { + case BSTP_PROTO_STP: + /* we can downgrade protocols only */ + bstp_timer_stop(&bp->bp_migrate_delay_timer); + /* clear unsupported features */ + bp->bp_operedge = 0; + /* STP compat mode only uses 16 bits of the 32 */ + if (bp->bp_path_cost > 65535) + bp->bp_path_cost = 65535; + break; + + case BSTP_PROTO_RSTP: + bstp_timer_start(&bp->bp_migrate_delay_timer, + bs->bs_migration_delay); + break; + + default: + DPRINTF("Unsupported STP version %d\n", proto); + return; + } + + bp->bp_protover = proto; + bp->bp_flags &= ~BSTP_PORT_CANMIGRATE; +} + +static void +bstp_set_port_tc(struct bstp_port *bp, int state) +{ + struct bstp_state *bs = bp->bp_bs; + + bp->bp_tcstate = state; + + /* initialise the new state */ + switch (bp->bp_tcstate) { + case BSTP_TCSTATE_ACTIVE: + DPRINTF("%s -> TC_ACTIVE\n", bp->bp_ifp->if_xname); + /* nothing to do */ + break; + + case BSTP_TCSTATE_INACTIVE: + bstp_timer_stop(&bp->bp_tc_timer); + /* flush routes on the parent bridge */ + bp->bp_fdbflush = 1; + bstp_task_enqueue(&bp->bp_rtagetask); + bp->bp_tc_ack = 0; + DPRINTF("%s -> TC_INACTIVE\n", bp->bp_ifp->if_xname); + break; + + case BSTP_TCSTATE_LEARNING: + bp->bp_rcvdtc = 0; + bp->bp_rcvdtcn = 0; + bp->bp_rcvdtca = 0; + bp->bp_tc_prop = 0; + DPRINTF("%s -> TC_LEARNING\n", bp->bp_ifp->if_xname); + break; + + case BSTP_TCSTATE_DETECTED: + bstp_set_timer_tc(bp); + bstp_set_other_tcprop(bp); + /* send out notification */ + bp->bp_flags |= BSTP_PORT_NEWINFO; + bstp_transmit(bs, bp); + /* reviewed for getmicrotime usage */ + getmicrotime(&bs->bs_last_tc_time); + DPRINTF("%s -> TC_DETECTED\n", bp->bp_ifp->if_xname); + bp->bp_tcstate = BSTP_TCSTATE_ACTIVE; /* UCT */ + break; + + case BSTP_TCSTATE_TCN: 
+ bstp_set_timer_tc(bp); + DPRINTF("%s -> TC_TCN\n", bp->bp_ifp->if_xname); + /* fall through */ + case BSTP_TCSTATE_TC: + bp->bp_rcvdtc = 0; + bp->bp_rcvdtcn = 0; + if (bp->bp_role == BSTP_ROLE_DESIGNATED) + bp->bp_tc_ack = 1; + + bstp_set_other_tcprop(bp); + DPRINTF("%s -> TC_TC\n", bp->bp_ifp->if_xname); + bp->bp_tcstate = BSTP_TCSTATE_ACTIVE; /* UCT */ + break; + + case BSTP_TCSTATE_PROPAG: + /* flush routes on the parent bridge */ + bp->bp_fdbflush = 1; + bstp_task_enqueue(&bp->bp_rtagetask); + bp->bp_tc_prop = 0; + bstp_set_timer_tc(bp); + DPRINTF("%s -> TC_PROPAG\n", bp->bp_ifp->if_xname); + bp->bp_tcstate = BSTP_TCSTATE_ACTIVE; /* UCT */ + break; + + case BSTP_TCSTATE_ACK: + bstp_timer_stop(&bp->bp_tc_timer); + bp->bp_rcvdtca = 0; + DPRINTF("%s -> TC_ACK\n", bp->bp_ifp->if_xname); + bp->bp_tcstate = BSTP_TCSTATE_ACTIVE; /* UCT */ + break; + } +} + +static void +bstp_set_timer_tc(struct bstp_port *bp) +{ + struct bstp_state *bs = bp->bp_bs; + + if (bp->bp_tc_timer.active) + return; + + switch (bp->bp_protover) { + case BSTP_PROTO_RSTP: + bstp_timer_start(&bp->bp_tc_timer, + bp->bp_desg_htime + BSTP_TICK_VAL); + bp->bp_flags |= BSTP_PORT_NEWINFO; + break; + + case BSTP_PROTO_STP: + bstp_timer_start(&bp->bp_tc_timer, + bs->bs_root_max_age + bs->bs_root_fdelay); + break; + } +} + +static void +bstp_set_timer_msgage(struct bstp_port *bp) +{ + if (bp->bp_port_msg_age + BSTP_MESSAGE_AGE_INCR <= + bp->bp_port_max_age) { + bstp_timer_start(&bp->bp_message_age_timer, + bp->bp_port_htime * 3); + } else + /* expires immediately */ + bstp_timer_start(&bp->bp_message_age_timer, 0); +} + +static int +bstp_rerooted(struct bstp_state *bs, struct bstp_port *bp) +{ + struct bstp_port *bp2; + int rr_set = 0; + + LIST_FOREACH(bp2, &bs->bs_bplist, bp_next) { + if (bp2 == bp) + continue; + if (bp2->bp_recent_root_timer.active) { + rr_set = 1; + break; + } + } + return (!rr_set); +} + +int +bstp_set_htime(struct bstp_state *bs, int t) +{ + /* convert seconds to ticks */ + t *= 
BSTP_TICK_VAL; + + /* value can only be changed in leagacy stp mode */ + if (bs->bs_protover != BSTP_PROTO_STP) + return (EPERM); + + if (t < BSTP_MIN_HELLO_TIME || t > BSTP_MAX_HELLO_TIME) + return (EINVAL); + + BSTP_LOCK(bs); + bs->bs_bridge_htime = t; + bstp_reinit(bs); + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_fdelay(struct bstp_state *bs, int t) +{ + /* convert seconds to ticks */ + t *= BSTP_TICK_VAL; + + if (t < BSTP_MIN_FORWARD_DELAY || t > BSTP_MAX_FORWARD_DELAY) + return (EINVAL); + + BSTP_LOCK(bs); + bs->bs_bridge_fdelay = t; + bstp_reinit(bs); + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_maxage(struct bstp_state *bs, int t) +{ + /* convert seconds to ticks */ + t *= BSTP_TICK_VAL; + + if (t < BSTP_MIN_MAX_AGE || t > BSTP_MAX_MAX_AGE) + return (EINVAL); + + BSTP_LOCK(bs); + bs->bs_bridge_max_age = t; + bstp_reinit(bs); + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_holdcount(struct bstp_state *bs, int count) +{ + struct bstp_port *bp; + + if (count < BSTP_MIN_HOLD_COUNT || + count > BSTP_MAX_HOLD_COUNT) + return (EINVAL); + + BSTP_LOCK(bs); + bs->bs_txholdcount = count; + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) + bp->bp_txcount = 0; + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_protocol(struct bstp_state *bs, int proto) +{ + struct bstp_port *bp; + + switch (proto) { + /* Supported protocol versions */ + case BSTP_PROTO_STP: + case BSTP_PROTO_RSTP: + break; + + default: + return (EINVAL); + } + + BSTP_LOCK(bs); + bs->bs_protover = proto; + bs->bs_bridge_htime = BSTP_DEFAULT_HELLO_TIME; + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { + /* reinit state */ + bp->bp_infois = BSTP_INFO_DISABLED; + bp->bp_txcount = 0; + bstp_set_port_proto(bp, bs->bs_protover); + bstp_set_port_role(bp, BSTP_ROLE_DISABLED); + bstp_set_port_tc(bp, BSTP_TCSTATE_INACTIVE); + bstp_timer_stop(&bp->bp_recent_backup_timer); + } + bstp_reinit(bs); + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_priority(struct bstp_state *bs, int pri) +{ + if (pri < 
0 || pri > BSTP_MAX_PRIORITY) + return (EINVAL); + + /* Limit to steps of 4096 */ + pri -= pri % 4096; + + BSTP_LOCK(bs); + bs->bs_bridge_priority = pri; + bstp_reinit(bs); + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_port_priority(struct bstp_port *bp, int pri) +{ + struct bstp_state *bs = bp->bp_bs; + + if (pri < 0 || pri > BSTP_MAX_PORT_PRIORITY) + return (EINVAL); + + /* Limit to steps of 16 */ + pri -= pri % 16; + + BSTP_LOCK(bs); + bp->bp_priority = pri; + bstp_reinit(bs); + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_path_cost(struct bstp_port *bp, uint32_t path_cost) +{ + struct bstp_state *bs = bp->bp_bs; + + if (path_cost > BSTP_MAX_PATH_COST) + return (EINVAL); + + /* STP compat mode only uses 16 bits of the 32 */ + if (bp->bp_protover == BSTP_PROTO_STP && path_cost > 65535) + path_cost = 65535; + + BSTP_LOCK(bs); + + if (path_cost == 0) { /* use auto */ + bp->bp_flags &= ~BSTP_PORT_ADMCOST; + bp->bp_path_cost = bstp_calc_path_cost(bp); + } else { + bp->bp_path_cost = path_cost; + bp->bp_flags |= BSTP_PORT_ADMCOST; + } + bstp_reinit(bs); + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_edge(struct bstp_port *bp, int set) +{ + struct bstp_state *bs = bp->bp_bs; + + BSTP_LOCK(bs); + if ((bp->bp_operedge = set) == 0) + bp->bp_flags &= ~BSTP_PORT_ADMEDGE; + else + bp->bp_flags |= BSTP_PORT_ADMEDGE; + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_autoedge(struct bstp_port *bp, int set) +{ + struct bstp_state *bs = bp->bp_bs; + + BSTP_LOCK(bs); + if (set) { + bp->bp_flags |= BSTP_PORT_AUTOEDGE; + /* we may be able to transition straight to edge */ + if (bp->bp_edge_delay_timer.active == 0) + bstp_edge_delay_expiry(bs, bp); + } else + bp->bp_flags &= ~BSTP_PORT_AUTOEDGE; + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_ptp(struct bstp_port *bp, int set) +{ + struct bstp_state *bs = bp->bp_bs; + + BSTP_LOCK(bs); + bp->bp_ptp_link = set; + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_autoptp(struct bstp_port *bp, int set) +{ + 
struct bstp_state *bs = bp->bp_bs; + + BSTP_LOCK(bs); + if (set) { + bp->bp_flags |= BSTP_PORT_AUTOPTP; + if (bp->bp_role != BSTP_ROLE_DISABLED) + bstp_ifupdstatus(bs, bp); + } else + bp->bp_flags &= ~BSTP_PORT_AUTOPTP; + BSTP_UNLOCK(bs); + return (0); +} + +/* + * Calculate the path cost according to the link speed. + * + * Returns the stored cost unchanged when BSTP_PORT_ADMCOST is set, and + * BSTP_DEFAULT_PATH_COST when the link is down (flagging the port with + * BSTP_PORT_PNDCOST) or the interface baudrate is below 1000. Otherwise + * the cost is derived from the baudrate, capped at BSTP_MAX_PATH_COST and, + * for ports running BSTP_PROTO_STP, at 65535. + */ +static uint32_t +bstp_calc_path_cost(struct bstp_port *bp) +{ + struct ifnet *ifp = bp->bp_ifp; + uint64_t path_cost; + + /* If the path cost has been manually set then retain the value */ + if (bp->bp_flags & BSTP_PORT_ADMCOST) + return bp->bp_path_cost; + + if (bp->bp_if_link_state == LINK_STATE_DOWN) { + /* Recalc when the link comes up again */ + bp->bp_flags |= BSTP_PORT_PNDCOST; + return (BSTP_DEFAULT_PATH_COST); + } + + if (ifp->if_baudrate < 1000) + return (BSTP_DEFAULT_PATH_COST); + + /* + * formula from section 17.14, IEEE Std 802.1D-2004; the quotient is + * kept in 64 bits so the clamp below sees the untruncated value + */ + path_cost = 20000000000ULL / (ifp->if_baudrate / 1000); + + if (path_cost > BSTP_MAX_PATH_COST) + path_cost = BSTP_MAX_PATH_COST; + + /* STP compat mode only uses 16 bits of the 32 */ + if (bp->bp_protover == BSTP_PROTO_STP && path_cost > 65535) + path_cost = 65535; + + return ((uint32_t)path_cost); +} + +/* + * Notify the bridge that a port state has changed, we need to do this from a + * taskqueue to avoid a LOR. + */ +static void +bstp_notify_state(void *arg, __unused int pending) +{ + struct bstp_port *bp = (struct bstp_port *)arg; + struct bstp_state *bs = bp->bp_bs; + + if (bp->bp_active == 1 && bs->bs_state_cb != NULL) + (*bs->bs_state_cb)(bp->bp_ifp, bp->bp_state); +} + +/* + * Flush the routes on the bridge port, we need to do this from a + * taskqueue to avoid a LOR. 
+ */ +static void +bstp_notify_rtage(void *arg, __unused int pending) +{ + struct bstp_port *bp = (struct bstp_port *)arg; + struct bstp_state *bs = bp->bp_bs; + int age = 0; + + BSTP_LOCK(bs); + switch (bp->bp_protover) { + case BSTP_PROTO_STP: + /* convert to seconds */ + age = bp->bp_desg_fdelay / BSTP_TICK_VAL; + break; + + case BSTP_PROTO_RSTP: + age = 0; + break; + } + BSTP_UNLOCK(bs); + + if (bp->bp_active == 1 && bs->bs_rtage_cb != NULL) + (*bs->bs_rtage_cb)(bp->bp_ifp, age); + + /* flush is complete */ + BSTP_LOCK(bs); + bp->bp_fdbflush = 0; + BSTP_UNLOCK(bs); +} + +void +bstp_linkstate(struct ifnet *ifp, __unused int state) +{ + struct bstp_state *bs; + struct bstp_port *bp; + + /* search for the stp port */ + lck_mtx_lock(bstp_list_mtx); + LIST_FOREACH(bs, &bstp_list, bs_list) { + BSTP_LOCK(bs); + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { + if (bp->bp_ifp == ifp) { + bstp_ifupdstatus(bs, bp); + bstp_update_state(bs, bp); + /* it only exists once so return */ + BSTP_UNLOCK(bs); + lck_mtx_unlock(bstp_list_mtx); + return; + } + } + BSTP_UNLOCK(bs); + } + lck_mtx_unlock(bstp_list_mtx); +} + +static void +bstp_ifupdstatus(struct bstp_state *bs, struct bstp_port *bp) +{ + struct ifnet *ifp = bp->bp_ifp; + struct ifmediareq ifmr; + int error = 0; + + BSTP_LOCK_ASSERT(bs); + + bzero((char *)&ifmr, sizeof(ifmr)); + error = (*ifp->if_ioctl)(ifp, SIOCGIFMEDIA, (caddr_t)&ifmr); + + if ((error == 0) && (ifp->if_flags & IFF_UP)) { + if (ifmr.ifm_status & IFM_ACTIVE) { + /* A full-duplex link is assumed to be point to point */ + if (bp->bp_flags & BSTP_PORT_AUTOPTP) { + bp->bp_ptp_link = + ifmr.ifm_active & IFM_FDX ? 
1 : 0; + } + + /* Calc the cost if the link was down previously */ + if (bp->bp_flags & BSTP_PORT_PNDCOST) { + bp->bp_path_cost = bstp_calc_path_cost(bp); + bp->bp_flags &= ~BSTP_PORT_PNDCOST; + } + + if (bp->bp_role == BSTP_ROLE_DISABLED) + bstp_enable_port(bs, bp); + } else { + if (bp->bp_role != BSTP_ROLE_DISABLED) { + bstp_disable_port(bs, bp); + if ((bp->bp_flags & BSTP_PORT_ADMEDGE) && + bp->bp_protover == BSTP_PROTO_RSTP) + bp->bp_operedge = 1; + } + } + return; + } + + if (bp->bp_infois != BSTP_INFO_DISABLED) + bstp_disable_port(bs, bp); +} + +/* Mark the port's info as aged and recompute the roles of all ports. */ +static void +bstp_enable_port(struct bstp_state *bs, struct bstp_port *bp) +{ + bp->bp_infois = BSTP_INFO_AGED; + bstp_assign_roles(bs); +} + +/* Mark the port's info as disabled and recompute the roles of all ports. */ +static void +bstp_disable_port(struct bstp_state *bs, struct bstp_port *bp) +{ + bp->bp_infois = BSTP_INFO_DISABLED; + bstp_assign_roles(bs); +} + +/* + * Timer callback for one bridge (arg is its struct bstp_state). Under the + * bridge lock it refreshes the link status of every port when the link + * timer expires, runs the per-port timers and state machines and decays + * the per-port transmit count; it then drops the lock and re-arms itself. + * If the bridge is not running it returns without re-arming. + */ +static void +bstp_tick(void *arg) +{ + struct bstp_state *bs = arg; + struct bstp_port *bp; + struct timespec ts; + + BSTP_LOCK(bs); + + if (bs->bs_running == 0) { + /* release the lock taken above before bailing out */ + BSTP_UNLOCK(bs); + return; + } + + /* slow timer to catch missed link events */ + if (bstp_timer_expired(&bs->bs_link_timer)) { + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) + bstp_ifupdstatus(bs, bp); + bstp_timer_start(&bs->bs_link_timer, BSTP_LINK_TIMER); + } + + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { + /* no events need to happen for these */ + bstp_timer_expired(&bp->bp_tc_timer); + bstp_timer_expired(&bp->bp_recent_root_timer); + bstp_timer_expired(&bp->bp_forward_delay_timer); + bstp_timer_expired(&bp->bp_recent_backup_timer); + + if (bstp_timer_expired(&bp->bp_hello_timer)) + bstp_hello_timer_expiry(bs, bp); + + if (bstp_timer_expired(&bp->bp_message_age_timer)) + bstp_message_age_expiry(bs, bp); + + if (bstp_timer_expired(&bp->bp_migrate_delay_timer)) + bstp_migrate_delay_expiry(bs, bp); + + if (bstp_timer_expired(&bp->bp_edge_delay_timer)) + bstp_edge_delay_expiry(bs, bp); + + /* update the various state machines for the port */ + bstp_update_state(bs, bp); + + if (bp->bp_txcount > 
0) + bp->bp_txcount--; + } + + BSTP_UNLOCK(bs); + + ts.tv_sec = 1; + ts.tv_nsec = 0; + bsd_timeout(bstp_tick, bs, &ts); +} + +static void +bstp_timer_start(struct bstp_timer *t, uint16_t v) +{ + t->value = v; + t->active = 1; + t->latched = 0; +} + +static void +bstp_timer_stop(struct bstp_timer *t) +{ + t->value = 0; + t->active = 0; + t->latched = 0; +} + +static void +bstp_timer_latch(struct bstp_timer *t) +{ + t->latched = 1; + t->active = 1; +} + +static int +bstp_timer_expired(struct bstp_timer *t) +{ + if (t->active == 0 || t->latched) + return (0); + t->value -= BSTP_TICK_VAL; + if (t->value <= 0) { + bstp_timer_stop(t); + return (1); + } + return (0); +} + +static void +bstp_hello_timer_expiry(struct bstp_state *bs, struct bstp_port *bp) +{ + if ((bp->bp_flags & BSTP_PORT_NEWINFO) || + bp->bp_role == BSTP_ROLE_DESIGNATED || + (bp->bp_role == BSTP_ROLE_ROOT && + bp->bp_tc_timer.active == 1)) { + bstp_timer_start(&bp->bp_hello_timer, bp->bp_desg_htime); + bp->bp_flags |= BSTP_PORT_NEWINFO; + bstp_transmit(bs, bp); + } +} + +static void +bstp_message_age_expiry(struct bstp_state *bs, struct bstp_port *bp) +{ + if (bp->bp_infois == BSTP_INFO_RECEIVED) { + bp->bp_infois = BSTP_INFO_AGED; + bstp_assign_roles(bs); + DPRINTF("aged info on %s\n", bp->bp_ifp->if_xname); + } +} + +static void +bstp_migrate_delay_expiry(__unused struct bstp_state *bs, struct bstp_port *bp) +{ + bp->bp_flags |= BSTP_PORT_CANMIGRATE; +} + +static void +bstp_edge_delay_expiry(__unused struct bstp_state *bs, struct bstp_port *bp) +{ + if ((bp->bp_flags & BSTP_PORT_AUTOEDGE) && + bp->bp_protover == BSTP_PROTO_RSTP && bp->bp_proposing && + bp->bp_role == BSTP_ROLE_DESIGNATED) { + bp->bp_operedge = 1; + DPRINTF("%s -> edge port\n", bp->bp_ifp->if_xname); + } +} + +static int +bstp_addr_cmp(const uint8_t *a, const uint8_t *b) +{ + int i, d; + + for (i = 0, d = 0; i < ETHER_ADDR_LEN && d == 0; i++) { + d = ((int)a[i]) - ((int)b[i]); + } + + return (d); +} + +/* + * compare the bridge address 
component of the bridgeid + */ +static int +bstp_same_bridgeid(uint64_t id1, uint64_t id2) +{ + u_char addr1[ETHER_ADDR_LEN]; + u_char addr2[ETHER_ADDR_LEN]; + + PV2ADDR(id1, addr1); + PV2ADDR(id2, addr2); + + if (bstp_addr_cmp(addr1, addr2) == 0) + return (1); + + return (0); +} + +void +bstp_reinit(struct bstp_state *bs) +{ + struct bstp_port *bp; + struct ifnet *ifp, *mif; + u_char *e_addr; + static const u_char llzero[ETHER_ADDR_LEN]; /* 00:00:00:00:00:00 */ + + BSTP_LOCK_ASSERT(bs); + + mif = NULL; + /* + * Search through the Ethernet adapters and find the one with the + * lowest value. The adapter which we take the MAC address from does + * not need to be part of the bridge, it just needs to be a unique + * value. + */ + ifnet_head_lock_shared(); + TAILQ_FOREACH(ifp, &ifnet_head, if_link) { + if (ifp->if_type != IFT_ETHER) + continue; + + if (bstp_addr_cmp(ifnet_lladdr(ifp), llzero) == 0) + continue; + + if (mif == NULL) { + mif = ifp; + continue; + } + if (bstp_addr_cmp(ifnet_lladdr(ifp), ifnet_lladdr(mif)) < 0) { + mif = ifp; + continue; + } + } + ifnet_head_done(); + + if (LIST_EMPTY(&bs->bs_bplist) || mif == NULL) { + /* Set the bridge and root id (lower bits) to zero */ + bs->bs_bridge_pv.pv_dbridge_id = + ((uint64_t)bs->bs_bridge_priority) << 48; + bs->bs_bridge_pv.pv_root_id = bs->bs_bridge_pv.pv_dbridge_id; + bs->bs_root_pv = bs->bs_bridge_pv; + /* Disable any remaining ports, they will have no MAC address */ + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { + bp->bp_infois = BSTP_INFO_DISABLED; + bstp_set_port_role(bp, BSTP_ROLE_DISABLED); + } + bsd_untimeout(bstp_tick, bs); + return; + } + + e_addr = ifnet_lladdr(mif); + bs->bs_bridge_pv.pv_dbridge_id = + (((uint64_t)bs->bs_bridge_priority) << 48) | + (((uint64_t)e_addr[0]) << 40) | + (((uint64_t)e_addr[1]) << 32) | + (((uint64_t)e_addr[2]) << 24) | + (((uint64_t)e_addr[3]) << 16) | + (((uint64_t)e_addr[4]) << 8) | + (((uint64_t)e_addr[5])); + + bs->bs_bridge_pv.pv_root_id = 
bs->bs_bridge_pv.pv_dbridge_id; + bs->bs_bridge_pv.pv_cost = 0; + bs->bs_bridge_pv.pv_dport_id = 0; + bs->bs_bridge_pv.pv_port_id = 0; + + if (bs->bs_running) + bsd_untimeout(bstp_tick, bs); + + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { + bp->bp_port_id = (bp->bp_priority << 8) | + (bp->bp_ifp->if_index & 0xfff); + bstp_ifupdstatus(bs, bp); + } + + bstp_assign_roles(bs); + bstp_timer_start(&bs->bs_link_timer, BSTP_LINK_TIMER); +} + +void +bstp_attach(struct bstp_state *bs, struct bstp_cb_ops *cb) +{ + BSTP_LOCK_INIT(bs); + LIST_INIT(&bs->bs_bplist); + + bs->bs_bridge_max_age = BSTP_DEFAULT_MAX_AGE; + bs->bs_bridge_htime = BSTP_DEFAULT_HELLO_TIME; + bs->bs_bridge_fdelay = BSTP_DEFAULT_FORWARD_DELAY; + bs->bs_bridge_priority = BSTP_DEFAULT_BRIDGE_PRIORITY; + bs->bs_hold_time = BSTP_DEFAULT_HOLD_TIME; + bs->bs_migration_delay = BSTP_DEFAULT_MIGRATE_DELAY; + bs->bs_txholdcount = BSTP_DEFAULT_HOLD_COUNT; + bs->bs_protover = BSTP_PROTO_RSTP; + bs->bs_state_cb = cb->bcb_state; + bs->bs_rtage_cb = cb->bcb_rtage; + + /* reviewed for getmicrotime usage */ + getmicrotime(&bs->bs_last_tc_time); + + lck_mtx_lock(bstp_list_mtx); + LIST_INSERT_HEAD(&bstp_list, bs, bs_list); + lck_mtx_unlock(bstp_list_mtx); +} + +void +bstp_detach(struct bstp_state *bs) +{ + KASSERT(LIST_EMPTY(&bs->bs_bplist), ("bstp still active")); + + lck_mtx_lock(bstp_list_mtx); + LIST_REMOVE(bs, bs_list); + lck_mtx_unlock(bstp_list_mtx); + bsd_untimeout(bstp_tick, bs); + BSTP_LOCK_DESTROY(bs); +} + +void +bstp_init(struct bstp_state *bs) +{ + struct timespec ts; + + ts.tv_sec = 1; + ts.tv_nsec = 0; + + BSTP_LOCK(bs); + bsd_timeout(bstp_tick, bs, &ts); + bs->bs_running = 1; + bstp_reinit(bs); + BSTP_UNLOCK(bs); +} + +void +bstp_stop(struct bstp_state *bs) +{ + struct bstp_port *bp; + + BSTP_LOCK(bs); + + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) + bstp_set_port_state(bp, BSTP_IFSTATE_DISCARDING); + + bs->bs_running = 0; + bsd_untimeout(bstp_tick, bs); + BSTP_UNLOCK(bs); +} + +int +bstp_create(struct 
bstp_state *bs, struct bstp_port *bp, struct ifnet *ifp) +{ + bzero(bp, sizeof(struct bstp_port)); + + BSTP_LOCK(bs); + bp->bp_ifp = ifp; + bp->bp_bs = bs; + bp->bp_priority = BSTP_DEFAULT_PORT_PRIORITY; + BSTP_TASK_INIT(&bp->bp_statetask, bstp_notify_state, bp); /* task that calls bstp_notify_state */ + BSTP_TASK_INIT(&bp->bp_rtagetask, bstp_notify_rtage, bp); /* task that calls bstp_notify_rtage */ + + /* Init state */ + bp->bp_infois = BSTP_INFO_DISABLED; + bp->bp_flags = BSTP_PORT_AUTOEDGE|BSTP_PORT_AUTOPTP; + bstp_set_port_state(bp, BSTP_IFSTATE_DISCARDING); + bstp_set_port_proto(bp, bs->bs_protover); + bstp_set_port_role(bp, BSTP_ROLE_DISABLED); + bstp_set_port_tc(bp, BSTP_TCSTATE_INACTIVE); + bp->bp_path_cost = bstp_calc_path_cost(bp); + BSTP_UNLOCK(bs); + return (0); /* this function has no failure path */ +} +/* Add a port to its bridge's port list and recompute the tree; returns EINVAL unless the interface type is IFT_ETHER. */ +int +bstp_enable(struct bstp_port *bp) +{ + struct bstp_state *bs = bp->bp_bs; + struct ifnet *ifp = bp->bp_ifp; + + KASSERT(bp->bp_active == 0, ("already a bstp member")); + + switch (ifp->if_type) { + case IFT_ETHER: /* These can do spanning tree. */ + break; + default: + /* Nothing else can. */ + return (EINVAL); + } + + BSTP_LOCK(bs); + LIST_INSERT_HEAD(&bs->bs_bplist, bp, bp_next); + bp->bp_active = 1; + bp->bp_flags |= BSTP_PORT_NEWINFO; + bstp_reinit(bs); + bstp_update_roles(bs, bp); + BSTP_UNLOCK(bs); + return (0); +} +/* Disable the port, remove it from its bridge's port list and recompute the tree. */ +void +bstp_disable(struct bstp_port *bp) +{ + struct bstp_state *bs = bp->bp_bs; + + KASSERT(bp->bp_active == 1, ("not a bstp member")); + + BSTP_LOCK(bs); + bstp_disable_port(bs, bp); + LIST_REMOVE(bp, bp_next); + bp->bp_active = 0; + bstp_reinit(bs); + BSTP_UNLOCK(bs); +} + +/* + * The bstp_port structure is about to be freed by the parent bridge. 
+ */ +void +bstp_destroy(struct bstp_port *bp) +{ + KASSERT(bp->bp_active == 0, ("port is still attached")); + bstp_task_drain(&bp->bp_statetask); /* wait out any queued or running notification */ + bstp_task_drain(&bp->bp_rtagetask); +} + +/* One-time setup: create the bstp lock group/attribute and the bstp_list mutex, then start the task thread. */ +__private_extern__ void +bstp_sys_init(void) +{ + lck_grp_attr_t *lck_grp_attr = NULL; + + lck_grp_attr = lck_grp_attr_alloc_init(); + bstp_lock_grp = lck_grp_alloc_init("bstp", lck_grp_attr); + bstp_lock_attr = lck_attr_alloc_init(); +#if BRIDGE_DEBUG + lck_attr_setdebug(bstp_lock_attr); +#endif + bstp_list_mtx = lck_mtx_alloc_init(bstp_lock_grp, bstp_lock_attr); + lck_grp_attr_free(lck_grp_attr); + + LIST_INIT(&bstp_list); + + bstp_create_task_thread(); +} + + +/* Create the task-queue lock group, attribute and mutex, then start the kernel thread running bstp_task_thread_func. */ +static void +bstp_create_task_thread(void) +{ + kern_return_t error; + + lck_grp_attr_t *lck_grp_attr = NULL; + + lck_grp_attr = lck_grp_attr_alloc_init(); + bstp_task_grp = lck_grp_alloc_init("bstp_task", lck_grp_attr); + bstp_task_attr = lck_attr_alloc_init(); +#if BRIDGE_DEBUG + lck_attr_setdebug(bstp_task_attr); +#endif + bstp_task_mtx = lck_mtx_alloc_init(bstp_task_grp, bstp_task_attr); /* use the bstp_task group/attr created above */ + lck_grp_attr_free(lck_grp_attr); + /* NOTE(review): the kernel_thread_start() result is stored in 'error' but never checked. */ + error = kernel_thread_start((thread_continue_t)bstp_task_thread_func, NULL, &bstp_task_thread); +} + +/* Task thread body: sleeps while bstp_task_queue is empty, otherwise runs each queued task's callback; never returns. */ +static void +bstp_task_thread_func(void) +{ + struct bstp_task *bt, *tvar; + + lck_mtx_lock(bstp_task_mtx); + + do { + while(TAILQ_EMPTY(&bstp_task_queue)) { + wakeup(&bstp_task_queue_running); /* queue is empty: wake any bstp_task_drain() sleepers */ + msleep(&bstp_task_queue, bstp_task_mtx, PZERO, "bstp_task_queue", NULL); + } + + TAILQ_FOREACH_SAFE(bt, &bstp_task_queue, bt_next, tvar) { + int count = bt->bt_count; + + bt->bt_count = 0; + + bstp_task_queue_running = bt; + lck_mtx_unlock(bstp_task_mtx); + /* Run the callback without holding bstp_task_mtx. */ + (*bt->bt_func)(bt->bt_context, count); + + lck_mtx_lock(bstp_task_mtx); + bstp_task_queue_running = NULL; + /* Keep the task queued if it was enqueued again while its callback ran. */ + if (bt->bt_count == 0) + TAILQ_REMOVE(&bstp_task_queue, bt, bt_next); + } + } while (1); + + /* UNREACHED */ +} + +static void +bstp_task_enqueue(struct bstp_task *bt) +{ + lck_mtx_lock(bstp_task_mtx); + + if (bt->bt_count) { + 
bt->bt_count++; + lck_mtx_unlock(bstp_task_mtx); + wakeup(&bstp_task_queue); + return; + } + + bt->bt_count = 1; + if (bstp_task_queue_running != bt) /* a task whose callback is running is still linked on the queue; do not insert it twice */ + TAILQ_INSERT_TAIL(&bstp_task_queue, bt, bt_next); + lck_mtx_unlock(bstp_task_mtx); + + wakeup(&bstp_task_queue); +} +/* Block until bt has no pending count and is not the task currently being run by the task thread. */ +static void +bstp_task_drain(struct bstp_task *bt) +{ + lck_mtx_lock(bstp_task_mtx); + + while (bt->bt_count != 0 || bstp_task_queue_running == bt) { + wakeup(&bstp_task_queue); + msleep(&bstp_task_queue_running, bstp_task_mtx, PZERO, "bstp_task_queue", NULL); + } + lck_mtx_unlock(bstp_task_mtx); +} + + diff --git a/bsd/net/bridgestp.h b/bsd/net/bridgestp.h new file mode 100644 index 000000000..a70f7aaba --- /dev/null +++ b/bsd/net/bridgestp.h @@ -0,0 +1,441 @@ +/* $NetBSD: if_bridgevar.h,v 1.4 2003/07/08 07:13:50 itojun Exp $ */ + +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Copyright 2001 Wasabi Systems, Inc. + * All rights reserved. + * + * Written by Jason R. Thorpe for Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed for the NetBSD Project by + * Wasabi Systems, Inc. + * 4. The name of Wasabi Systems, Inc. may not be used to endorse + * or promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* + * Copyright (c) 1999, 2000 Jason L. Wright (jason@thought.net) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Jason L. Wright + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * OpenBSD: if_bridge.h,v 1.14 2001/03/22 03:48:29 jason Exp + * + * $FreeBSD$ + */ + +#ifndef __BRIDGESTP_H__ +#define __BRIDGESTP_H__ + +/* + * Data structure and control definitions for STP interfaces. 
+ */ + +#include +#include +/* STP port states */ +#define BSTP_IFSTATE_DISABLED 0 +#define BSTP_IFSTATE_LISTENING 1 +#define BSTP_IFSTATE_LEARNING 2 +#define BSTP_IFSTATE_FORWARDING 3 +#define BSTP_IFSTATE_BLOCKING 4 +#define BSTP_IFSTATE_DISCARDING 5 + +#define BSTP_TCSTATE_ACTIVE 1 +#define BSTP_TCSTATE_DETECTED 2 +#define BSTP_TCSTATE_INACTIVE 3 +#define BSTP_TCSTATE_LEARNING 4 +#define BSTP_TCSTATE_PROPAG 5 +#define BSTP_TCSTATE_ACK 6 +#define BSTP_TCSTATE_TC 7 +#define BSTP_TCSTATE_TCN 8 + +#define BSTP_ROLE_DISABLED 0 +#define BSTP_ROLE_ROOT 1 +#define BSTP_ROLE_DESIGNATED 2 +#define BSTP_ROLE_ALTERNATE 3 +#define BSTP_ROLE_BACKUP 4 + +#ifdef XNU_KERNEL_PRIVATE + +/* STP port flags */ +#define BSTP_PORT_CANMIGRATE 0x0001 +#define BSTP_PORT_NEWINFO 0x0002 +#define BSTP_PORT_DISPUTED 0x0004 +#define BSTP_PORT_ADMCOST 0x0008 +#define BSTP_PORT_AUTOEDGE 0x0010 +#define BSTP_PORT_AUTOPTP 0x0020 +#define BSTP_PORT_ADMEDGE 0x0040 +#define BSTP_PORT_PNDCOST 0x0080 + +/* BPDU priority */ +#define BSTP_PDU_SUPERIOR 1 +#define BSTP_PDU_REPEATED 2 +#define BSTP_PDU_INFERIOR 3 +#define BSTP_PDU_INFERIORALT 4 +#define BSTP_PDU_OTHER 5 + +/* BPDU flags */ +#define BSTP_PDU_PRMASK 0x0c /* Port Role */ +#define BSTP_PDU_PRSHIFT 2 /* Port Role offset */ +#define BSTP_PDU_F_UNKN 0x00 /* Unknown port (00) */ +#define BSTP_PDU_F_ALT 0x01 /* Alt/Backup port (01) */ +#define BSTP_PDU_F_ROOT 0x02 /* Root port (10) */ +#define BSTP_PDU_F_DESG 0x03 /* Designated port (11) */ + +#define BSTP_PDU_STPMASK 0x81 /* strip unused STP flags */ +#define BSTP_PDU_RSTPMASK 0x7f /* strip unused RSTP flags */ +#define BSTP_PDU_F_TC 0x01 /* Topology change */ +#define BSTP_PDU_F_P 0x02 /* Proposal flag */ +#define BSTP_PDU_F_L 0x10 /* Learning flag */ +#define BSTP_PDU_F_F 0x20 /* Forwarding flag */ +#define BSTP_PDU_F_A 0x40 /* Agreement flag */ +#define BSTP_PDU_F_TCA 0x80 /* Topology change ack */ + +/* + * Spanning tree defaults. 
+ */ +#define BSTP_DEFAULT_MAX_AGE (20 * 256) +#define BSTP_DEFAULT_HELLO_TIME (2 * 256) +#define BSTP_DEFAULT_FORWARD_DELAY (15 * 256) +#define BSTP_DEFAULT_HOLD_TIME (1 * 256) +#define BSTP_DEFAULT_MIGRATE_DELAY (3 * 256) +#define BSTP_DEFAULT_HOLD_COUNT 6 +#define BSTP_DEFAULT_BRIDGE_PRIORITY 0x8000 +#define BSTP_DEFAULT_PORT_PRIORITY 0x80 +#define BSTP_DEFAULT_PATH_COST 55 +#define BSTP_MIN_HELLO_TIME (1 * 256) +#define BSTP_MIN_MAX_AGE (6 * 256) +#define BSTP_MIN_FORWARD_DELAY (4 * 256) +#define BSTP_MIN_HOLD_COUNT 1 +#define BSTP_MAX_HELLO_TIME (2 * 256) +#define BSTP_MAX_MAX_AGE (40 * 256) +#define BSTP_MAX_FORWARD_DELAY (30 * 256) +#define BSTP_MAX_HOLD_COUNT 10 +#define BSTP_MAX_PRIORITY 61440 +#define BSTP_MAX_PORT_PRIORITY 240 +#define BSTP_MAX_PATH_COST 200000000 + +/* BPDU message types */ +#define BSTP_MSGTYPE_CFG 0x00 /* Configuration */ +#define BSTP_MSGTYPE_RSTP 0x02 /* Rapid STP */ +#define BSTP_MSGTYPE_TCN 0x80 /* Topology chg notification */ + +/* Protocol versions */ +#define BSTP_PROTO_ID 0x00 +#define BSTP_PROTO_STP 0x00 +#define BSTP_PROTO_RSTP 0x02 +#define BSTP_PROTO_MAX BSTP_PROTO_RSTP + +#define BSTP_INFO_RECIEVED 1 /* compat */ +#define BSTP_INFO_RECEIVED 1 +#define BSTP_INFO_MINE 2 +#define BSTP_INFO_AGED 3 +#define BSTP_INFO_DISABLED 4 + + +#define BSTP_MESSAGE_AGE_INCR (1 * 256) /* in 256ths of a second */ +#define BSTP_TICK_VAL (1 * 256) /* in 256ths of a second */ +#define BSTP_LINK_TIMER (BSTP_TICK_VAL * 15) + +/* + * Driver callbacks for STP state changes + */ +typedef void (*bstp_state_cb_t)(struct ifnet *, int); +typedef void (*bstp_rtage_cb_t)(struct ifnet *, int); +struct bstp_cb_ops { + bstp_state_cb_t bcb_state; + bstp_rtage_cb_t bcb_rtage; +}; + +/* + * Because BPDU's do not make nicely aligned structures, two different + * declarations are used: bstp_?bpdu (wire representation, packed) and + * bstp_*_unit (internal, nicely aligned version). 
+ */ + +/* configuration bridge protocol data unit */ +struct bstp_cbpdu { + uint8_t cbu_dsap; /* LLC: destination sap */ + uint8_t cbu_ssap; /* LLC: source sap */ + uint8_t cbu_ctl; /* LLC: control */ + uint16_t cbu_protoid; /* protocol id */ + uint8_t cbu_protover; /* protocol version */ + uint8_t cbu_bpdutype; /* message type */ + uint8_t cbu_flags; /* flags (BSTP_PDU_F_*) */ + + /* root id */ + uint16_t cbu_rootpri; /* root priority */ + uint8_t cbu_rootaddr[6]; /* root address */ + + uint32_t cbu_rootpathcost; /* root path cost */ + + /* bridge id */ + uint16_t cbu_bridgepri; /* bridge priority */ + uint8_t cbu_bridgeaddr[6]; /* bridge address */ + + uint16_t cbu_portid; /* port id */ + uint16_t cbu_messageage; /* current message age */ + uint16_t cbu_maxage; /* maximum age */ + uint16_t cbu_hellotime; /* hello time */ + uint16_t cbu_forwarddelay; /* forwarding delay */ + uint8_t cbu_versionlen; /* version 1 length */ +} __attribute__((__packed__)); +#define BSTP_BPDU_STP_LEN (3 + 35) /* LLC + STP pdu */ +#define BSTP_BPDU_RSTP_LEN (3 + 36) /* LLC + RSTP pdu */ + +/* topology change notification bridge protocol data unit */ +struct bstp_tbpdu { + uint8_t tbu_dsap; /* LLC: destination sap */ + uint8_t tbu_ssap; /* LLC: source sap */ + uint8_t tbu_ctl; /* LLC: control */ + uint16_t tbu_protoid; /* protocol id */ + uint8_t tbu_protover; /* protocol version */ + uint8_t tbu_bpdutype; /* message type */ +} __attribute__((__packed__)); + +/* + * Deferred-task and timekeeping structures used in spanning tree code. 
+ */ + +typedef void bstp_task_func_t(void *context, int count); + +struct bstp_task { + TAILQ_ENTRY(bstp_task) bt_next; + int bt_count; + bstp_task_func_t *bt_func; + void *bt_context; +}; + +struct bstp_timer { + int active; + int latched; + int value; +}; + +struct bstp_pri_vector { + uint64_t pv_root_id; + uint32_t pv_cost; + uint64_t pv_dbridge_id; + uint16_t pv_dport_id; + uint16_t pv_port_id; +}; + +struct bstp_config_unit { + struct bstp_pri_vector cu_pv; + uint16_t cu_message_age; + uint16_t cu_max_age; + uint16_t cu_forward_delay; + uint16_t cu_hello_time; + uint8_t cu_message_type; + uint8_t cu_topology_change_ack; + uint8_t cu_topology_change; + uint8_t cu_proposal; + uint8_t cu_agree; + uint8_t cu_learning; + uint8_t cu_forwarding; + uint8_t cu_role; +}; + +struct bstp_tcn_unit { + uint8_t tu_message_type; +}; + +struct bstp_port { + LIST_ENTRY(bstp_port) bp_next; + struct ifnet *bp_ifp; /* parent if */ + struct bstp_state *bp_bs; + uint8_t bp_active; + uint8_t bp_protover; + uint32_t bp_flags; + uint32_t bp_path_cost; + uint16_t bp_port_msg_age; + uint16_t bp_port_max_age; + uint16_t bp_port_fdelay; + uint16_t bp_port_htime; + uint16_t bp_desg_msg_age; + uint16_t bp_desg_max_age; + uint16_t bp_desg_fdelay; + uint16_t bp_desg_htime; + struct bstp_timer bp_edge_delay_timer; + struct bstp_timer bp_forward_delay_timer; + struct bstp_timer bp_hello_timer; + struct bstp_timer bp_message_age_timer; + struct bstp_timer bp_migrate_delay_timer; + struct bstp_timer bp_recent_backup_timer; + struct bstp_timer bp_recent_root_timer; + struct bstp_timer bp_tc_timer; + struct bstp_config_unit bp_msg_cu; + struct bstp_pri_vector bp_desg_pv; + struct bstp_pri_vector bp_port_pv; + uint16_t bp_port_id; + uint8_t bp_state; + uint8_t bp_tcstate; + uint8_t bp_role; + uint8_t bp_infois; + uint8_t bp_tc_ack; + uint8_t bp_tc_prop; + uint8_t bp_fdbflush; + uint8_t bp_priority; + uint8_t bp_ptp_link; + uint8_t bp_agree; + uint8_t bp_agreed; + uint8_t bp_sync; + uint8_t 
bp_synced; + uint8_t bp_proposing; + uint8_t bp_proposed; + uint8_t bp_operedge; + uint8_t bp_reroot; + uint8_t bp_rcvdtc; + uint8_t bp_rcvdtca; + uint8_t bp_rcvdtcn; + uint32_t bp_forward_transitions; + uint8_t bp_txcount; + struct bstp_task bp_statetask; + struct bstp_task bp_rtagetask; + uint32_t bp_if_link_state; /* cache of the parent if link state */ +}; + +/* + * Values for bp_if_link_state. + */ +#define LINK_STATE_UNKNOWN 0 /* link invalid/unknown */ +#define LINK_STATE_DOWN 1 /* link is down */ +#define LINK_STATE_UP 2 /* link is up */ + +/* + * Software state for each bridge STP. + */ +struct bstp_state { + LIST_ENTRY(bstp_state) bs_list; + uint8_t bs_running; + lck_mtx_t *bs_mtx; + struct bstp_pri_vector bs_bridge_pv; + struct bstp_pri_vector bs_root_pv; + struct bstp_port *bs_root_port; + uint8_t bs_protover; + uint16_t bs_migration_delay; + uint16_t bs_edge_delay; + uint16_t bs_bridge_max_age; + uint16_t bs_bridge_fdelay; + uint16_t bs_bridge_htime; + uint16_t bs_root_msg_age; + uint16_t bs_root_max_age; + uint16_t bs_root_fdelay; + uint16_t bs_root_htime; + uint16_t bs_hold_time; + uint16_t bs_bridge_priority; + uint8_t bs_txholdcount; + uint8_t bs_allsynced; + struct bstp_timer bs_link_timer; + struct timeval bs_last_tc_time; + LIST_HEAD(, bstp_port) bs_bplist; + bstp_state_cb_t bs_state_cb; + bstp_rtage_cb_t bs_rtage_cb; +}; + +extern const uint8_t bstp_etheraddr[]; + +void bstp_attach(struct bstp_state *, struct bstp_cb_ops *); +void bstp_detach(struct bstp_state *); +void bstp_init(struct bstp_state *); +void bstp_stop(struct bstp_state *); +int bstp_create(struct bstp_state *, struct bstp_port *, struct ifnet *); +int bstp_enable(struct bstp_port *); +void bstp_disable(struct bstp_port *); +void bstp_destroy(struct bstp_port *); +void bstp_linkstate(struct ifnet *, int); +int bstp_set_htime(struct bstp_state *, int); +int bstp_set_fdelay(struct bstp_state *, int); +int bstp_set_maxage(struct bstp_state *, int); +int bstp_set_holdcount(struct 
bstp_state *, int); +int bstp_set_protocol(struct bstp_state *, int); +int bstp_set_priority(struct bstp_state *, int); +int bstp_set_port_priority(struct bstp_port *, int); +int bstp_set_path_cost(struct bstp_port *, uint32_t); +int bstp_set_edge(struct bstp_port *, int); +int bstp_set_autoedge(struct bstp_port *, int); +int bstp_set_ptp(struct bstp_port *, int); +int bstp_set_autoptp(struct bstp_port *, int); +struct mbuf *bstp_input(struct bstp_port *, struct ifnet *, struct mbuf *); + +void bstp_sys_init(void); + +#endif /* XNU_KERNEL_PRIVATE */ + +#endif /* __BRIDGESTP_H__ */ + diff --git a/bsd/net/dlil.c b/bsd/net/dlil.c index 848b3b3f1..272388f02 100644 --- a/bsd/net/dlil.c +++ b/bsd/net/dlil.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2010 Apple Inc. All rights reserved. + * Copyright (c) 1999-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,10 +25,6 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - * Data Link Inteface Layer - * Author: Ted Walker - */ /* * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce * support for mandatory and extensible security protections. 
This notice @@ -53,22 +49,41 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include +#include #include +#if INET +#include +#include +#endif /* INET */ + +#if INET6 +#include +#include +#include +#endif /* INET6 */ + +#if NETAT +#include +#endif /* NETAT */ + #include #include #include +#include #if CONFIG_MACF_NET #include @@ -78,8 +93,8 @@ #include #endif /* PF */ -#define DBG_LAYER_BEG DLILDBG_CODE(DBG_DLIL_STATIC, 0) -#define DBG_LAYER_END DLILDBG_CODE(DBG_DLIL_STATIC, 2) +#define DBG_LAYER_BEG DLILDBG_CODE(DBG_DLIL_STATIC, 0) +#define DBG_LAYER_END DLILDBG_CODE(DBG_DLIL_STATIC, 2) #define DBG_FNC_DLIL_INPUT DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8)) #define DBG_FNC_DLIL_OUTPUT DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8)) #define DBG_FNC_DLIL_IFOUT DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8)) @@ -95,48 +110,51 @@ #define DLIL_PRINTF kprintf #endif -#define atomic_add_32(a, n) \ - ((void) OSAddAtomic(n, (volatile SInt32 *)a)) - -#if PKT_PRIORITY #define _CASSERT(x) \ switch (0) { case 0: case (x): ; } -#define IF_DATA_REQUIRE_ALIGNED_32(f) \ - _CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int32_t))) +#define IF_DATA_REQUIRE_ALIGNED_64(f) \ + _CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t))) -#define IFNET_IF_DATA_REQUIRE_ALIGNED_32(f) \ - _CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int32_t))) -#endif /* PKT_PRIORITY */ +#define IFNET_IF_DATA_REQUIRE_ALIGNED_64(f) \ + _CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t))) + +#define IFNET_IF_TC_REQUIRE_ALIGNED_64(f) \ + _CASSERT(!(offsetof(struct ifnet, if_tc.f) % sizeof (u_int64_t))) enum { kProtoKPI_v1 = 1, kProtoKPI_v2 = 2 }; +/* + * List of if_proto structures in if_proto_hash[] is protected by + * the ifnet lock. The rest of the fields are initialized at protocol + * attach time and never change, thus no lock required as long as + * a reference to it is valid, via if_proto_ref(). 
+ */ struct if_proto { - SLIST_ENTRY(if_proto) next_hash; - int refcount; - int detaching; - struct ifnet *ifp; - struct domain *dl_domain; + SLIST_ENTRY(if_proto) next_hash; + u_int32_t refcount; + u_int32_t detached; + struct ifnet *ifp; protocol_family_t protocol_family; - int proto_kpi; + int proto_kpi; union { struct { - proto_media_input input; - proto_media_preout pre_output; - proto_media_event event; - proto_media_ioctl ioctl; + proto_media_input input; + proto_media_preout pre_output; + proto_media_event event; + proto_media_ioctl ioctl; proto_media_detached detached; proto_media_resolve_multi resolve_multi; proto_media_send_arp send_arp; } v1; struct { proto_media_input_v2 input; - proto_media_preout pre_output; - proto_media_event event; - proto_media_ioctl ioctl; + proto_media_preout pre_output; + proto_media_event event; + proto_media_ioctl ioctl; proto_media_detached detached; proto_media_resolve_multi resolve_multi; proto_media_send_arp send_arp; @@ -146,51 +164,118 @@ struct if_proto { SLIST_HEAD(proto_hash_entry, if_proto); +#define DLIL_SDLMAXLEN 64 +#define DLIL_SDLDATALEN \ + (DLIL_SDLMAXLEN - offsetof(struct sockaddr_dl, sdl_data[0])) struct dlil_ifnet { - /* ifnet and drvr_ext are used by the stack and drivers - drvr_ext extends the public ifnet and must follow dl_if */ - struct ifnet dl_if; /* public ifnet */ - - /* dlil private fields */ - TAILQ_ENTRY(dlil_ifnet) dl_if_link; /* dlil_ifnet are link together */ - /* it is not the ifnet list */ - void *if_uniqueid; /* unique id identifying the interface */ - size_t if_uniqueid_len;/* length of the unique id */ - char if_namestorage[IFNAMSIZ]; /* interface name storage */ + struct ifnet dl_if; /* public ifnet */ + /* + * dlil private fields, protected by dl_if_lock + */ + decl_lck_mtx_data(, dl_if_lock); + TAILQ_ENTRY(dlil_ifnet) dl_if_link; /* dlil_ifnet link */ + u_int32_t dl_if_flags; /* flags (below) */ + u_int32_t dl_if_refcnt; /* refcnt */ + void (*dl_if_trace)(struct dlil_ifnet *, int); 
/* ref trace callback */ + void *dl_if_uniqueid; /* unique interface id */ + size_t dl_if_uniqueid_len; /* length of the unique id */ + char dl_if_namestorage[IFNAMSIZ]; /* interface name storage */ + struct { + struct ifaddr ifa; /* lladdr ifa */ + u_int8_t asdl[DLIL_SDLMAXLEN]; /* addr storage */ + u_int8_t msdl[DLIL_SDLMAXLEN]; /* mask storage */ + } dl_if_lladdr; + ctrace_t dl_if_attach; /* attach PC stacktrace */ + ctrace_t dl_if_detach; /* detach PC stacktrace */ +}; + +/* Values for dl_if_flags (private to DLIL) */ +#define DLIF_INUSE 0x1 /* DLIL ifnet recycler, ifnet in use */ +#define DLIF_REUSE 0x2 /* DLIL ifnet recycles, ifnet is not new */ +#define DLIF_DEBUG 0x4 /* has debugging info */ + +#define IF_REF_TRACE_HIST_SIZE 8 /* size of ref trace history */ + +/* For gdb */ +__private_extern__ unsigned int if_ref_trace_hist_size = IF_REF_TRACE_HIST_SIZE; + +struct dlil_ifnet_dbg { + struct dlil_ifnet dldbg_dlif; /* dlil_ifnet */ + u_int16_t dldbg_if_refhold_cnt; /* # ifnet references */ + u_int16_t dldbg_if_refrele_cnt; /* # ifnet releases */ + /* + * Circular lists of ifnet_{reference,release} callers. 
+ */ + ctrace_t dldbg_if_refhold[IF_REF_TRACE_HIST_SIZE]; + ctrace_t dldbg_if_refrele[IF_REF_TRACE_HIST_SIZE]; }; +#define DLIL_TO_IFP(s) (&s->dl_if) +#define IFP_TO_DLIL(s) ((struct dlil_ifnet *)s) + struct ifnet_filter { TAILQ_ENTRY(ifnet_filter) filt_next; - ifnet_t filt_ifp; - int filt_detaching; - - const char *filt_name; - void *filt_cookie; - protocol_family_t filt_protocol; - iff_input_func filt_input; - iff_output_func filt_output; - iff_event_func filt_event; - iff_ioctl_func filt_ioctl; - iff_detached_func filt_detached; + u_int32_t filt_skip; + ifnet_t filt_ifp; + const char *filt_name; + void *filt_cookie; + protocol_family_t filt_protocol; + iff_input_func filt_input; + iff_output_func filt_output; + iff_event_func filt_event; + iff_ioctl_func filt_ioctl; + iff_detached_func filt_detached; }; struct proto_input_entry; static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head; static lck_grp_t *dlil_lock_group; -static lck_grp_t *ifnet_lock_group; +lck_grp_t *ifnet_lock_group; static lck_grp_t *ifnet_head_lock_group; -static lck_attr_t *ifnet_lock_attr; -static lck_rw_t *ifnet_head_mutex; -static lck_mtx_t *dlil_ifnet_mutex; -static lck_mtx_t *dlil_mutex; -static u_int32_t dlil_read_count = 0; -static u_int32_t dlil_detach_waiting = 0; +lck_attr_t *ifnet_lock_attr; +decl_lck_rw_data(, ifnet_head_lock); +decl_lck_mtx_data(, dlil_ifnet_lock); u_int32_t dlil_filter_count = 0; extern u_int32_t ipv4_ll_arp_aware; -#if IFNET_ROUTE_REFCNT +#if DEBUG +static unsigned int ifnet_debug = 1; /* debugging (enabled) */ +#else +static unsigned int ifnet_debug; /* debugging (disabled) */ +#endif /* !DEBUG */ +static unsigned int dlif_size; /* size of dlil_ifnet to allocate */ +static unsigned int dlif_bufsize; /* size of dlif_size + headroom */ +static struct zone *dlif_zone; /* zone for dlil_ifnet */ + +#define DLIF_ZONE_MAX 64 /* maximum elements in zone */ +#define DLIF_ZONE_NAME "ifnet" /* zone name */ + +static unsigned int dlif_filt_size; /* size of ifnet_filter */ 
+static struct zone *dlif_filt_zone; /* zone for ifnet_filter */ + +#define DLIF_FILT_ZONE_MAX 8 /* maximum elements in zone */ +#define DLIF_FILT_ZONE_NAME "ifnet_filter" /* zone name */ + +static unsigned int dlif_inp_size; /* size of dlil_threading_info */ +static struct zone *dlif_inp_zone; /* zone for dlil_threading_info */ + +#define DLIF_INP_ZONE_MAX DLIF_ZONE_MAX /* maximum elements in zone */ +#define DLIF_INP_ZONE_NAME "ifnet_thread" /* zone name */ + +static unsigned int dlif_phash_size; /* size of ifnet proto hash table */ +static struct zone *dlif_phash_zone; /* zone for ifnet proto hash table */ + +#define DLIF_PHASH_ZONE_MAX DLIF_ZONE_MAX /* maximum elements in zone */ +#define DLIF_PHASH_ZONE_NAME "ifnet_proto_hash" /* zone name */ + +static unsigned int dlif_proto_size; /* size of if_proto */ +static struct zone *dlif_proto_zone; /* zone for if_proto */ + +#define DLIF_PROTO_ZONE_MAX (DLIF_ZONE_MAX*2) /* maximum elements in zone */ +#define DLIF_PROTO_ZONE_NAME "ifnet_proto" /* zone name */ + /* * Updating this variable should be done by first acquiring the global * radix node head (rnh_lock), in tandem with settting/clearing the @@ -198,7 +283,6 @@ extern u_int32_t ipv4_ll_arp_aware; */ u_int32_t ifnet_aggressive_drainers; static u_int32_t net_rtref; -#endif /* IFNET_ROUTE_REFCNT */ static struct dlil_threading_info dlil_lo_thread; __private_extern__ struct dlil_threading_info *dlil_lo_thread_ptr = &dlil_lo_thread; @@ -206,135 +290,117 @@ __private_extern__ struct dlil_threading_info *dlil_lo_thread_ptr = &dlil_lo_th static struct mbuf *dlil_lo_input_mbuf_head = NULL; static struct mbuf *dlil_lo_input_mbuf_tail = NULL; -#if IFNET_INPUT_SANITY_CHK -static int dlil_lo_input_mbuf_count = 0; -int dlil_input_sanity_check = 0; /* sanity checking of input packet lists received */ -#endif -int dlil_multithreaded_input = 1; -static int cur_dlil_input_threads = 0; - static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg); static int 
dlil_detach_filter_internal(interface_filter_t filter, int detached); -static void dlil_call_delayed_detach_thread(void); +static void dlil_if_trace(struct dlil_ifnet *, int); +static void if_proto_ref(struct if_proto *); +static void if_proto_free(struct if_proto *); +static struct if_proto *find_attached_proto(struct ifnet *, u_int32_t); +static int dlil_ifp_proto_count(struct ifnet *); +static void if_flt_monitor_busy(struct ifnet *); +static void if_flt_monitor_unbusy(struct ifnet *); +static void if_flt_monitor_enter(struct ifnet *); +static void if_flt_monitor_leave(struct ifnet *); +static int dlil_interface_filters_input(struct ifnet *, struct mbuf **, + char **, protocol_family_t); +static int dlil_interface_filters_output(struct ifnet *, struct mbuf **, + protocol_family_t); +static struct ifaddr *dlil_alloc_lladdr(struct ifnet *, + const struct sockaddr_dl *); +static int ifnet_lookup(struct ifnet *); +static void if_purgeaddrs(struct ifnet *); + +static errno_t ifproto_media_input_v1(struct ifnet *, protocol_family_t, + struct mbuf *, char *); +static errno_t ifproto_media_input_v2(struct ifnet *, protocol_family_t, + struct mbuf *); +static errno_t ifproto_media_preout(struct ifnet *, protocol_family_t, + mbuf_t *, const struct sockaddr *, void *, char *, char *); +static void ifproto_media_event(struct ifnet *, protocol_family_t, + const struct kev_msg *); +static errno_t ifproto_media_ioctl(struct ifnet *, protocol_family_t, + unsigned long, void *); +static errno_t ifproto_media_resolve_multi(ifnet_t, const struct sockaddr *, + struct sockaddr_dl *, size_t); +static errno_t ifproto_media_send_arp(struct ifnet *, u_short, + const struct sockaddr_dl *, const struct sockaddr *, + const struct sockaddr_dl *, const struct sockaddr *); + +static errno_t ifp_if_output(struct ifnet *, struct mbuf *); +static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *, + protocol_family_t *); +static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t, 
+ const struct ifnet_demux_desc *, u_int32_t); +static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t); +static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *); +static errno_t ifp_if_framer(struct ifnet *, struct mbuf **, + const struct sockaddr *, const char *, const char *); +static errno_t ifp_if_ioctl(struct ifnet *, unsigned long, void *); +static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func); +static void ifp_if_free(struct ifnet *); +static void ifp_if_event(struct ifnet *, const struct kev_msg *); + +static void dlil_input_thread_func(struct dlil_threading_info *inpthread); +static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *); + +static void ifnet_delayed_thread_func(void); +static void ifnet_detach_final(struct ifnet *); +static void ifnet_detaching_enqueue(struct ifnet *); +static struct ifnet *ifnet_detaching_dequeue(void); + +static void ifp_src_route_copyout(struct ifnet *, struct route *); +static void ifp_src_route_copyin(struct ifnet *, struct route *); +#if INET6 +static void ifp_src_route6_copyout(struct ifnet *, struct route_in6 *); +static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *); +#endif /* INET6 */ + +/* The following are protected by dlil_ifnet_lock */ +static TAILQ_HEAD(, ifnet) ifnet_detaching_head; +static u_int32_t ifnet_detaching_cnt; +static void *ifnet_delayed_run; /* wait channel for detaching thread */ + +extern void bpfdetach(struct ifnet*); +extern void proto_input_run(void); -static void dlil_read_begin(void); -static __inline__ void dlil_read_end(void); -static int dlil_write_begin(void); -static void dlil_write_end(void); +__private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *); #if DEBUG -__private_extern__ int dlil_verbose = 1; +static int dlil_verbose = 1; #else -__private_extern__ int dlil_verbose = 0; +static int dlil_verbose = 0; #endif /* DEBUG */ +static int dlil_multithreaded_input = 1; 
+static int cur_dlil_input_threads = 0; +#if IFNET_INPUT_SANITY_CHK +static int dlil_lo_input_mbuf_count = 0; +/* sanity checking of input packet lists received */ +static int dlil_input_sanity_check = 0; +#endif -unsigned int net_affinity = 1; -static kern_return_t dlil_affinity_set(struct thread *, u_int32_t); +SYSCTL_DECL(_net_link_generic_system); -extern void bpfdetach(struct ifnet*); -extern void proto_input_run(void); // new run_netisr +SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_verbose, CTLFLAG_RW, + &dlil_verbose, 0, "Log DLIL error messages"); -void dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m); -static void dlil_input_thread_func(struct dlil_threading_info *inpthread); -__private_extern__ int dlil_create_input_thread( - ifnet_t, struct dlil_threading_info *); -__private_extern__ void dlil_terminate_input_thread( - struct dlil_threading_info *); +SYSCTL_INT(_net_link_generic_system, OID_AUTO, multi_threaded_input, CTLFLAG_RW, + &dlil_multithreaded_input , 0, "Uses multiple input thread for DLIL input"); -__private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *); +#if IFNET_INPUT_SANITY_CHK +SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check, + CTLFLAG_RW, &dlil_input_sanity_check , 0, + "Turn on sanity checking in DLIL input"); +#endif -int dlil_expand_mcl; +unsigned int net_affinity = 1; +static kern_return_t dlil_affinity_set(struct thread *, u_int32_t); extern u_int32_t inject_buckets; -static const u_int32_t dlil_writer_waiting = 0x80000000; static lck_grp_attr_t *dlil_grp_attributes = NULL; static lck_attr_t *dlil_lck_attributes = NULL; static lck_grp_t *dlil_input_lock_grp = NULL; -static inline void* -_cast_non_const(const void * ptr) { - union { - const void* cval; - void* val; - } ret; - - ret.cval = ptr; - return (ret.val); -} - -/* Should these be inline? 
*/ -static void -dlil_read_begin(void) -{ - u_int32_t new_value; - u_int32_t old_value; - struct uthread *uth = get_bsdthread_info(current_thread()); - - if (uth->dlil_incremented_read == dlil_writer_waiting) - panic("dlil_read_begin - thread is already a writer"); - - do { -again: - old_value = dlil_read_count; - - if ((old_value & dlil_writer_waiting) != 0 && uth->dlil_incremented_read == 0) - { - tsleep(&dlil_read_count, PRIBIO, "dlil_read_count", 1); - goto again; - } - - new_value = old_value + 1; - } while (!OSCompareAndSwap((UInt32)old_value, (UInt32)new_value, (UInt32*)&dlil_read_count)); - - uth->dlil_incremented_read++; -} - -static void -dlil_read_end(void) -{ - struct uthread *uth = get_bsdthread_info(current_thread()); - - OSDecrementAtomic(&dlil_read_count); - uth->dlil_incremented_read--; - if (dlil_read_count == dlil_writer_waiting) - wakeup(_cast_non_const(&dlil_writer_waiting)); -} - -static int -dlil_write_begin(void) -{ - struct uthread *uth = get_bsdthread_info(current_thread()); - - if (uth->dlil_incremented_read != 0) { - return EDEADLK; - } - lck_mtx_lock(dlil_mutex); - OSBitOrAtomic((UInt32)dlil_writer_waiting, &dlil_read_count); -again: - if (dlil_read_count == dlil_writer_waiting) { - uth->dlil_incremented_read = dlil_writer_waiting; - return 0; - } - else { - tsleep(_cast_non_const(&dlil_writer_waiting), PRIBIO, "dlil_writer_waiting", 1); - goto again; - } -} - -static void -dlil_write_end(void) -{ - struct uthread *uth = get_bsdthread_info(current_thread()); - - if (uth->dlil_incremented_read != dlil_writer_waiting) - panic("dlil_write_end - thread is not a writer"); - OSBitAndAtomic((UInt32)~dlil_writer_waiting, &dlil_read_count); - lck_mtx_unlock(dlil_mutex); - uth->dlil_incremented_read = 0; - wakeup(&dlil_read_count); -} - #define PROTO_HASH_SLOTS 0x5 /* @@ -351,192 +417,248 @@ proto_hash_value(u_int32_t protocol_family) */ switch(protocol_family) { case PF_INET: - return 0; + return (0); case PF_INET6: - return 1; + return (1); 
case PF_APPLETALK: - return 2; + return (2); case PF_VLAN: - return 3; + return (3); + case PF_UNSPEC: default: - return 4; + return (4); } } -static struct if_proto* +/* + * Caller must already be holding ifnet lock. + */ +static struct if_proto * find_attached_proto(struct ifnet *ifp, u_int32_t protocol_family) { struct if_proto *proto = NULL; u_int32_t i = proto_hash_value(protocol_family); - if (ifp->if_proto_hash) { + + ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED); + + if (ifp->if_proto_hash != NULL) proto = SLIST_FIRST(&ifp->if_proto_hash[i]); - } - - while(proto && proto->protocol_family != protocol_family) { + + while (proto != NULL && proto->protocol_family != protocol_family) proto = SLIST_NEXT(proto, next_hash); - } - - return proto; + + if (proto != NULL) + if_proto_ref(proto); + + return (proto); } static void if_proto_ref(struct if_proto *proto) { - OSAddAtomic(1, &proto->refcount); + atomic_add_32(&proto->refcount, 1); } +extern void if_rtproto_del(struct ifnet *ifp, int protocol); + static void if_proto_free(struct if_proto *proto) { - int oldval = OSAddAtomic(-1, &proto->refcount); - - if (oldval == 1) { /* This was the last reference */ - FREE(proto, M_IFADDR); + u_int32_t oldval; + struct ifnet *ifp = proto->ifp; + u_int32_t proto_family = proto->protocol_family; + struct kev_dl_proto_data ev_pr_data; + + oldval = atomic_add_32_ov(&proto->refcount, -1); + if (oldval > 1) + return; + + /* No more reference on this, protocol must have been detached */ + VERIFY(proto->detached); + + if (proto->proto_kpi == kProtoKPI_v1) { + if (proto->kpi.v1.detached) + proto->kpi.v1.detached(ifp, proto->protocol_family); + } + if (proto->proto_kpi == kProtoKPI_v2) { + if (proto->kpi.v2.detached) + proto->kpi.v2.detached(ifp, proto->protocol_family); } + + /* + * Cleanup routes that may still be in the routing table for that + * interface/protocol pair. 
+ */ + if_rtproto_del(ifp, proto_family); + + /* + * The reserved field carries the number of protocol still attached + * (subject to change) + */ + ifnet_lock_shared(ifp); + ev_pr_data.proto_family = proto_family; + ev_pr_data.proto_remaining_count = dlil_ifp_proto_count(ifp); + ifnet_lock_done(ifp); + + dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED, + (struct net_event_data *)&ev_pr_data, + sizeof(struct kev_dl_proto_data)); + + zfree(dlif_proto_zone, proto); } __private_extern__ void -ifnet_lock_assert( - __unused struct ifnet *ifp, - __unused int what) +ifnet_lock_assert(struct ifnet *ifp, ifnet_lock_assert_t what) { -#if IFNET_RW_LOCK - /* - * Not implemented for rw locks. - * - * Function exists so when/if we use mutex we can - * enable this check. - */ -#else - lck_mtx_assert(ifp->if_lock, what); -#endif + unsigned int type = 0; + int ass = 1; + + switch (what) { + case IFNET_LCK_ASSERT_EXCLUSIVE: + type = LCK_RW_ASSERT_EXCLUSIVE; + break; + + case IFNET_LCK_ASSERT_SHARED: + type = LCK_RW_ASSERT_SHARED; + break; + + case IFNET_LCK_ASSERT_OWNED: + type = LCK_RW_ASSERT_HELD; + break; + + case IFNET_LCK_ASSERT_NOTOWNED: + /* nothing to do here for RW lock; bypass assert */ + ass = 0; + break; + + default: + panic("bad ifnet assert type: %d", what); + /* NOTREACHED */ + } + if (ass) + lck_rw_assert(&ifp->if_lock, type); } __private_extern__ void -ifnet_lock_shared( - struct ifnet *ifp) +ifnet_lock_shared(struct ifnet *ifp) { -#if IFNET_RW_LOCK - lck_rw_lock_shared(ifp->if_lock); -#else - lck_mtx_assert(ifp->if_lock, LCK_MTX_ASSERT_NOTOWNED); - lck_mtx_lock(ifp->if_lock); -#endif + lck_rw_lock_shared(&ifp->if_lock); } __private_extern__ void -ifnet_lock_exclusive( - struct ifnet *ifp) +ifnet_lock_exclusive(struct ifnet *ifp) { -#if IFNET_RW_LOCK - lck_rw_lock_exclusive(ifp->if_lock); -#else - lck_mtx_assert(ifp->if_lock, LCK_MTX_ASSERT_NOTOWNED); - lck_mtx_lock(ifp->if_lock); -#endif + lck_rw_lock_exclusive(&ifp->if_lock); } __private_extern__ void 
-ifnet_lock_done( - struct ifnet *ifp) +ifnet_lock_done(struct ifnet *ifp) { -#if IFNET_RW_LOCK - lck_rw_done(ifp->if_lock); -#else - lck_mtx_assert(ifp->if_lock, LCK_MTX_ASSERT_OWNED); - lck_mtx_unlock(ifp->if_lock); -#endif + lck_rw_done(&ifp->if_lock); } __private_extern__ void ifnet_head_lock_shared(void) { - lck_rw_lock_shared(ifnet_head_mutex); + lck_rw_lock_shared(&ifnet_head_lock); } __private_extern__ void ifnet_head_lock_exclusive(void) { - lck_rw_lock_exclusive(ifnet_head_mutex); + lck_rw_lock_exclusive(&ifnet_head_lock); } __private_extern__ void ifnet_head_done(void) { - lck_rw_done(ifnet_head_mutex); + lck_rw_done(&ifnet_head_lock); } -static int dlil_ifp_proto_count(struct ifnet * ifp) +/* + * Caller must already be holding ifnet lock. + */ +static int +dlil_ifp_proto_count(struct ifnet * ifp) { - int count = 0; - int i; - - if (ifp->if_proto_hash != NULL) { - for (i = 0; i < PROTO_HASH_SLOTS; i++) { - struct if_proto *proto; - SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) { - count++; - } + int i, count = 0; + + ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED); + + if (ifp->if_proto_hash == NULL) + goto done; + + for (i = 0; i < PROTO_HASH_SLOTS; i++) { + struct if_proto *proto; + SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) { + count++; } } - - return count; +done: + return (count); } __private_extern__ void -dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass, u_int32_t event_code, - struct net_event_data *event_data, u_int32_t event_data_len) +dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass, + u_int32_t event_code, struct net_event_data *event_data, + u_int32_t event_data_len) { - struct net_event_data ev_data; - struct kev_msg ev_msg; - - /* + struct net_event_data ev_data; + struct kev_msg ev_msg; + + bzero(&ev_msg, sizeof (ev_msg)); + bzero(&ev_data, sizeof (ev_data)); + /* * a net event always starts with a net_event_data structure * but the caller can generate a simple net event or * provide a longer event 
structure to post */ - - ev_msg.vendor_code = KEV_VENDOR_APPLE; - ev_msg.kev_class = KEV_NETWORK_CLASS; - ev_msg.kev_subclass = event_subclass; - ev_msg.event_code = event_code; - - if (event_data == 0) { + ev_msg.vendor_code = KEV_VENDOR_APPLE; + ev_msg.kev_class = KEV_NETWORK_CLASS; + ev_msg.kev_subclass = event_subclass; + ev_msg.event_code = event_code; + + if (event_data == NULL) { event_data = &ev_data; event_data_len = sizeof(struct net_event_data); } - + strncpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ); event_data->if_family = ifp->if_family; event_data->if_unit = (u_int32_t) ifp->if_unit; - + ev_msg.dv[0].data_length = event_data_len; - ev_msg.dv[0].data_ptr = event_data; + ev_msg.dv[0].data_ptr = event_data; ev_msg.dv[1].data_length = 0; - + dlil_event_internal(ifp, &ev_msg); } -__private_extern__ int -dlil_create_input_thread( - ifnet_t ifp, struct dlil_threading_info *inputthread) +static int +dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inputthread) { int error; bzero(inputthread, sizeof(*inputthread)); - // loopback ifp may not be configured at dlil_init time. - if (ifp == lo_ifp) - strlcat(inputthread->input_name, "dlil_input_main_thread_mtx", 32); - else - snprintf(inputthread->input_name, 32, "dlil_input_%s%d_mtx", ifp->if_name, ifp->if_unit); + /* loopback ifp may not be configured at dlil_init time. 
*/ + if (ifp == lo_ifp) { + (void) strlcat(inputthread->input_name, + "dlil_input_main_thread_mtx", DLIL_THREADNAME_LEN); + } else { + (void) snprintf(inputthread->input_name, DLIL_THREADNAME_LEN, + "dlil_input_%s%d_mtx", ifp->if_name, ifp->if_unit); + } - inputthread->lck_grp = lck_grp_alloc_init(inputthread->input_name, dlil_grp_attributes); - inputthread->input_lck = lck_mtx_alloc_init(inputthread->lck_grp, dlil_lck_attributes); + inputthread->lck_grp = lck_grp_alloc_init(inputthread->input_name, + dlil_grp_attributes); + lck_mtx_init(&inputthread->input_lck, inputthread->lck_grp, + dlil_lck_attributes); - error= kernel_thread_start((thread_continue_t)dlil_input_thread_func, inputthread, &inputthread->input_thread); + error= kernel_thread_start((thread_continue_t)dlil_input_thread_func, + inputthread, &inputthread->input_thread); if (error == 0) { - ml_thread_policy(inputthread->input_thread, MACHINE_GROUP, - (MACHINE_NETWORK_GROUP|MACHINE_NETWORK_NETISR)); + ml_thread_policy(inputthread->input_thread, MACHINE_GROUP, + (MACHINE_NETWORK_GROUP|MACHINE_NETWORK_NETISR)); /* * Except for the loopback dlil input thread, we create * an affinity set so that the matching workloop thread @@ -557,31 +679,16 @@ dlil_create_input_thread( } } } else { - panic("dlil_create_input_thread: couldn't create thread\n"); + panic("%s: couldn't create thread", __func__); + /* NOTREACHED */ } OSAddAtomic(1, &cur_dlil_input_threads); #if DLIL_DEBUG - printf("dlil_create_input_thread: threadinfo: %p input_thread=%p threads: cur=%d max=%d\n", - inputthread, inputthread->input_thread, dlil_multithreaded_input, cur_dlil_input_threads); + printf("%s: threadinfo: %p input_thread=%p threads: cur=%d max=%d\n", + __func__, inputthread, inputthread->input_thread, + dlil_multithreaded_input, cur_dlil_input_threads); #endif - return error; -} -__private_extern__ void -dlil_terminate_input_thread( - struct dlil_threading_info *inputthread) -{ - OSAddAtomic(-1, &cur_dlil_input_threads); - - 
lck_mtx_unlock(inputthread->input_lck); - lck_mtx_free(inputthread->input_lck, inputthread->lck_grp); - lck_grp_free(inputthread->lck_grp); - - FREE(inputthread, M_NKE); - - /* For the extra reference count from kernel_thread_start() */ - thread_deallocate(current_thread()); - - thread_terminate(current_thread()); + return (error); } static kern_return_t @@ -598,65 +705,246 @@ dlil_affinity_set(struct thread *tp, u_int32_t tag) void dlil_init(void) { - thread_t thread = THREAD_NULL; + thread_t thread = THREAD_NULL; + + /* + * The following fields must be 64-bit aligned for atomic operations. + */ + IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets); + IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors) + IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets); + IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors); + IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions); + IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes); + IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes); + IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts); + IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts); + IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops); + IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto); + + IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets); + IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors) + IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets); + IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors); + IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions); + IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes); + IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes); + IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts); + IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts); + IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops); + IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto); + + IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ibkpackets); + IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ibkbytes); + IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_obkpackets); + IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_obkbytes); + IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ivipackets); + IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ivibytes); + IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ovipackets); + 
IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ovibytes); + IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ivopackets); + IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ivobytes); + IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ovopackets); + IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ovobytes); + + /* + * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts. + */ + _CASSERT(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP); + _CASSERT(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP); + _CASSERT(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP); + _CASSERT(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT); + _CASSERT(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT); + _CASSERT(IF_HWASSIST_CSUM_TCP_SUM16 == IFNET_CSUM_SUM16); + _CASSERT(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING); + _CASSERT(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU); + _CASSERT(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4); + _CASSERT(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6); + + /* + * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info. + */ + _CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN); + + PE_parse_boot_argn("net_affinity", &net_affinity, + sizeof (net_affinity)); - PE_parse_boot_argn("net_affinity", &net_affinity, sizeof (net_affinity)); -#if IFNET_ROUTE_REFCNT PE_parse_boot_argn("net_rtref", &net_rtref, sizeof (net_rtref)); -#endif /* IFNET_ROUTE_REFCNT */ + + PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof (ifnet_debug)); + + dlif_size = (ifnet_debug == 0) ? 
sizeof (struct dlil_ifnet) : + sizeof (struct dlil_ifnet_dbg); + /* Enforce 64-bit alignment for dlil_ifnet structure */ + dlif_bufsize = dlif_size + sizeof (void *) + sizeof (u_int64_t); + dlif_bufsize = P2ROUNDUP(dlif_bufsize, sizeof (u_int64_t)); + dlif_zone = zinit(dlif_bufsize, DLIF_ZONE_MAX * dlif_bufsize, + 0, DLIF_ZONE_NAME); + if (dlif_zone == NULL) { + panic("%s: failed allocating %s", __func__, DLIF_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(dlif_zone, Z_EXPAND, TRUE); + zone_change(dlif_zone, Z_CALLERACCT, FALSE); + + dlif_filt_size = sizeof (struct ifnet_filter); + dlif_filt_zone = zinit(dlif_filt_size, + DLIF_FILT_ZONE_MAX * dlif_filt_size, 0, DLIF_FILT_ZONE_NAME); + if (dlif_filt_zone == NULL) { + panic("%s: failed allocating %s", __func__, + DLIF_FILT_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(dlif_filt_zone, Z_EXPAND, TRUE); + zone_change(dlif_filt_zone, Z_CALLERACCT, FALSE); + + dlif_inp_size = sizeof (struct dlil_threading_info); + dlif_inp_zone = zinit(dlif_inp_size, + DLIF_INP_ZONE_MAX * dlif_inp_size, 0, DLIF_INP_ZONE_NAME); + if (dlif_inp_zone == NULL) { + panic("%s: failed allocating %s", __func__, DLIF_INP_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(dlif_inp_zone, Z_EXPAND, TRUE); + zone_change(dlif_inp_zone, Z_CALLERACCT, FALSE); + + dlif_phash_size = sizeof (struct proto_hash_entry) * PROTO_HASH_SLOTS; + dlif_phash_zone = zinit(dlif_phash_size, + DLIF_PHASH_ZONE_MAX * dlif_phash_size, 0, DLIF_PHASH_ZONE_NAME); + if (dlif_phash_zone == NULL) { + panic("%s: failed allocating %s", __func__, + DLIF_PHASH_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(dlif_phash_zone, Z_EXPAND, TRUE); + zone_change(dlif_phash_zone, Z_CALLERACCT, FALSE); + + dlif_proto_size = sizeof (struct if_proto); + dlif_proto_zone = zinit(dlif_proto_size, + DLIF_PROTO_ZONE_MAX * dlif_proto_size, 0, DLIF_PROTO_ZONE_NAME); + if (dlif_proto_zone == NULL) { + panic("%s: failed allocating %s", __func__, + DLIF_PROTO_ZONE_NAME); + /* NOTREACHED */ + } + 
zone_change(dlif_proto_zone, Z_EXPAND, TRUE); + zone_change(dlif_proto_zone, Z_CALLERACCT, FALSE); + + ifnet_llreach_init(); TAILQ_INIT(&dlil_ifnet_head); TAILQ_INIT(&ifnet_head); - + TAILQ_INIT(&ifnet_detaching_head); + /* Setup the lock groups we will use */ dlil_grp_attributes = lck_grp_attr_alloc_init(); - dlil_lock_group = lck_grp_alloc_init("dlil internal locks", dlil_grp_attributes); - ifnet_lock_group = lck_grp_alloc_init("ifnet locks", dlil_grp_attributes); - ifnet_head_lock_group = lck_grp_alloc_init("ifnet head lock", dlil_grp_attributes); - dlil_input_lock_grp = lck_grp_alloc_init("dlil input lock", dlil_grp_attributes); - + dlil_lock_group = lck_grp_alloc_init("dlil internal locks", + dlil_grp_attributes); + ifnet_lock_group = lck_grp_alloc_init("ifnet locks", + dlil_grp_attributes); + ifnet_head_lock_group = lck_grp_alloc_init("ifnet head lock", + dlil_grp_attributes); + dlil_input_lock_grp = lck_grp_alloc_init("dlil input lock", + dlil_grp_attributes); + /* Setup the lock attributes we will use */ dlil_lck_attributes = lck_attr_alloc_init(); - + ifnet_lock_attr = lck_attr_alloc_init(); - - - ifnet_head_mutex = lck_rw_alloc_init(ifnet_head_lock_group, dlil_lck_attributes); - dlil_ifnet_mutex = lck_mtx_alloc_init(dlil_lock_group, dlil_lck_attributes); - dlil_mutex = lck_mtx_alloc_init(dlil_lock_group, dlil_lck_attributes); - + + lck_rw_init(&ifnet_head_lock, ifnet_head_lock_group, + dlil_lck_attributes); + lck_mtx_init(&dlil_ifnet_lock, dlil_lock_group, dlil_lck_attributes); + lck_attr_free(dlil_lck_attributes); dlil_lck_attributes = NULL; - + + ifa_init(); + /* - * Create and start up the first dlil input thread once everything is initialized + * Create and start up the first dlil input thread once everything + * is initialized. 
*/ - dlil_create_input_thread(0, dlil_lo_thread_ptr); + dlil_create_input_thread(lo_ifp, dlil_lo_thread_ptr); - (void) kernel_thread_start((thread_continue_t)dlil_call_delayed_detach_thread, NULL, &thread); + if (kernel_thread_start((thread_continue_t)ifnet_delayed_thread_func, + NULL, &thread) != 0) { + panic("%s: couldn't create detach thread", __func__); + /* NOTREACHED */ + } thread_deallocate(thread); + #if PF /* Initialize the packet filter */ pfinit(); #endif /* PF */ } +static void +if_flt_monitor_busy(struct ifnet *ifp) +{ + lck_mtx_assert(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED); + + ++ifp->if_flt_busy; + VERIFY(ifp->if_flt_busy != 0); +} + +static void +if_flt_monitor_unbusy(struct ifnet *ifp) +{ + if_flt_monitor_leave(ifp); +} + +static void +if_flt_monitor_enter(struct ifnet *ifp) +{ + lck_mtx_assert(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED); + + while (ifp->if_flt_busy) { + ++ifp->if_flt_waiters; + (void) msleep(&ifp->if_flt_head, &ifp->if_flt_lock, + (PZERO - 1), "if_flt_monitor", NULL); + } + if_flt_monitor_busy(ifp); +} + +static void +if_flt_monitor_leave(struct ifnet *ifp) +{ + lck_mtx_assert(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED); + + VERIFY(ifp->if_flt_busy != 0); + --ifp->if_flt_busy; + + if (ifp->if_flt_busy == 0 && ifp->if_flt_waiters > 0) { + ifp->if_flt_waiters = 0; + wakeup(&ifp->if_flt_head); + } +} + __private_extern__ int -dlil_attach_filter( - struct ifnet *ifp, - const struct iff_filter *if_filter, - interface_filter_t *filter_ref) -{ - int retval = 0; - struct ifnet_filter *filter; - - MALLOC(filter, struct ifnet_filter *, sizeof(*filter), M_NKE, M_WAITOK); - if (filter == NULL) - return ENOMEM; - bzero(filter, sizeof(*filter)); +dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter, + interface_filter_t *filter_ref) +{ + int retval = 0; + struct ifnet_filter *filter = NULL; - + ifnet_head_lock_shared(); + /* Check that the interface is in the global list */ + if (!ifnet_lookup(ifp)) { + retval = ENXIO; + goto 
done; + } + + filter = zalloc(dlif_filt_zone); + if (filter == NULL) { + retval = ENOMEM; + goto done; + } + bzero(filter, dlif_filt_size); + + /* refcnt held above during lookup */ filter->filt_ifp = ifp; filter->filt_cookie = if_filter->iff_cookie; filter->filt_name = if_filter->iff_name; @@ -666,14 +954,16 @@ dlil_attach_filter( filter->filt_event = if_filter->iff_event; filter->filt_ioctl = if_filter->iff_ioctl; filter->filt_detached = if_filter->iff_detached; - - if ((retval = dlil_write_begin()) != 0) { - /* Failed to acquire the write lock */ - FREE(filter, M_NKE); - return retval; - } + + lck_mtx_lock(&ifp->if_flt_lock); + if_flt_monitor_enter(ifp); + + lck_mtx_assert(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED); TAILQ_INSERT_TAIL(&ifp->if_flt_head, filter, filt_next); - dlil_write_end(); + + if_flt_monitor_leave(ifp); + lck_mtx_unlock(&ifp->if_flt_lock); + *filter_ref = filter; /* @@ -684,71 +974,88 @@ dlil_attach_filter( if (use_routegenid) routegenid_update(); - return retval; + if (dlil_verbose) { + printf("%s%d: %s filter attached\n", ifp->if_name, + ifp->if_unit, if_filter->iff_name); + } +done: + ifnet_head_done(); + if (retval != 0 && ifp != NULL) { + DLIL_PRINTF("%s%d: failed to attach %s (err=%d)\n", + ifp->if_name, ifp->if_unit, if_filter->iff_name, retval); + } + if (retval != 0 && filter != NULL) + zfree(dlif_filt_zone, filter); + + return (retval); } static int -dlil_detach_filter_internal( - interface_filter_t filter, - int detached) +dlil_detach_filter_internal(interface_filter_t filter, int detached) { int retval = 0; - + if (detached == 0) { - ifnet_t ifp = NULL; - interface_filter_t entry = NULL; - - /* Take the write lock */ - retval = dlil_write_begin(); - if (retval != 0 && retval != EDEADLK) - return retval; - - /* - * At this point either we have the write lock (retval == 0) - * or we couldn't get it (retval == EDEADLK) because someone - * else up the stack is holding the read lock. 
It is safe to - * read, either the read or write is held. Verify the filter - * parameter before proceeding. - */ + ifnet_t ifp = NULL; + ifnet_head_lock_shared(); TAILQ_FOREACH(ifp, &ifnet_head, if_link) { + interface_filter_t entry = NULL; + + lck_mtx_lock(&ifp->if_flt_lock); TAILQ_FOREACH(entry, &ifp->if_flt_head, filt_next) { - if (entry == filter) - break; + if (entry != filter || entry->filt_skip) + continue; + /* + * We've found a match; since it's possible + * that the thread gets blocked in the monitor, + * we do the lock dance. Interface should + * not be detached since we still have a use + * count held during filter attach. + */ + entry->filt_skip = 1; /* skip input/output */ + lck_mtx_unlock(&ifp->if_flt_lock); + ifnet_head_done(); + + lck_mtx_lock(&ifp->if_flt_lock); + if_flt_monitor_enter(ifp); + lck_mtx_assert(&ifp->if_flt_lock, + LCK_MTX_ASSERT_OWNED); + + /* Remove the filter from the list */ + TAILQ_REMOVE(&ifp->if_flt_head, filter, + filt_next); + + if_flt_monitor_leave(ifp); + lck_mtx_unlock(&ifp->if_flt_lock); + if (dlil_verbose) { + printf("%s%d: %s filter detached\n", + ifp->if_name, ifp->if_unit, + filter->filt_name); + } + goto destroy; } - if (entry == filter) - break; + lck_mtx_unlock(&ifp->if_flt_lock); } ifnet_head_done(); - - if (entry != filter) { - /* filter parameter is not a valid filter ref */ - if (retval == 0) { - dlil_write_end(); - } - return EINVAL; - } - - if (retval == EDEADLK) { - /* Perform a delayed detach */ - filter->filt_detaching = 1; - dlil_detach_waiting = 1; - wakeup(&dlil_detach_waiting); - return 0; - } - - /* Remove the filter from the list */ - TAILQ_REMOVE(&ifp->if_flt_head, filter, filt_next); - dlil_write_end(); + + /* filter parameter is not a valid filter ref */ + retval = EINVAL; + goto done; } - - /* Call the detached funciton if there is one */ + + if (dlil_verbose) + printf("%s filter detached\n", filter->filt_name); + +destroy: + + /* Call the detached function if there is one */ if 
(filter->filt_detached) filter->filt_detached(filter->filt_cookie, filter->filt_ifp); /* Free the filter */ - FREE(filter, M_NKE); - + zfree(dlif_filt_zone, filter); + /* * Decrease filter count and route_generation ID to let TCP * know it should reevalute doing TSO or not @@ -757,7 +1064,12 @@ dlil_detach_filter_internal( if (use_routegenid) routegenid_update(); - return retval; +done: + if (retval != 0) { + DLIL_PRINTF("failed to detach %s filter (err=%d)\n", + filter->filt_name, retval); + } + return (retval); } __private_extern__ void @@ -769,8 +1081,7 @@ dlil_detach_filter(interface_filter_t filter) } static void -dlil_input_thread_func( - struct dlil_threading_info *inputthread) +dlil_input_thread_func(struct dlil_threading_info *inputthread) { while (1) { struct mbuf *m = NULL, *m_loop = NULL; @@ -779,28 +1090,44 @@ dlil_input_thread_func( int count; struct mbuf *m1; #endif /* IFNET_INPUT_SANITY_CHK */ - - lck_mtx_lock(inputthread->input_lck); - + + lck_mtx_lock_spin(&inputthread->input_lck); + /* Wait until there is work to be done */ - while ((inputthread->input_waiting & ~DLIL_INPUT_RUNNING) == 0) { + while (!(inputthread->input_waiting & ~DLIL_INPUT_RUNNING)) { inputthread->input_waiting &= ~DLIL_INPUT_RUNNING; - msleep(&inputthread->input_waiting, inputthread->input_lck, 0, inputthread->input_name, 0); + msleep(&inputthread->input_waiting, + &inputthread->input_lck, 0, + inputthread->input_name, 0); } - - lck_mtx_assert(inputthread->input_lck, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&inputthread->input_lck, LCK_MTX_ASSERT_OWNED); m = inputthread->mbuf_head; inputthread->mbuf_head = NULL; inputthread->mbuf_tail = NULL; if (inputthread->input_waiting & DLIL_INPUT_TERMINATE) { - if (m) - mbuf_freem_list(m); - /* this is the end */ - dlil_terminate_input_thread(inputthread); - return; + lck_mtx_unlock(&inputthread->input_lck); + + if (m != NULL) + mbuf_freem_list(m); + + OSAddAtomic(-1, &cur_dlil_input_threads); + + lck_mtx_destroy(&inputthread->input_lck, 
+ inputthread->lck_grp); + lck_grp_free(inputthread->lck_grp); + + zfree(dlif_inp_zone, inputthread); + + /* for the extra refcnt from kernel_thread_start() */ + thread_deallocate(current_thread()); + + /* this is the end */ + thread_terminate(current_thread()); + /* NOTREACHED */ + return; } inputthread->input_waiting |= DLIL_INPUT_RUNNING; @@ -820,69 +1147,76 @@ dlil_input_thread_func( loop_cnt = dlil_lo_input_mbuf_count; dlil_lo_input_mbuf_count = 0; } - - lck_mtx_unlock(inputthread->input_lck); - + + lck_mtx_unlock(&inputthread->input_lck); + for (m1 = m, count = 0; m1; m1 = mbuf_nextpkt(m1)) { count++; } if (count != mbuf_cnt) { - panic("dlil_input_func - thread=%p reg. loop queue has %d packets, should have %d\n", - inputthread, count, mbuf_cnt); + panic("%s - thread=%p reg. loop queue " + "has %d packets, should have %d\n", + __func__, inputthread, count, mbuf_cnt); + /* NOTREACHED */ } - + if (inputthread == dlil_lo_thread_ptr) { - for (m1 = m_loop, count = 0; m1; m1 = mbuf_nextpkt(m1)) { + for (m1 = m_loop, count = 0; m1; + m1 = mbuf_nextpkt(m1)) { count++; } if (count != loop_cnt) { - panic("dlil_input_func - thread=%p loop queue has %d packets, should have %d\n", - inputthread, count, loop_cnt); + panic("%s - thread=%p loop queue " + "has %d packets, should have %d\n", + __func__, inputthread, count, + loop_cnt); + /* NOTREACHED */ } } - } else + } else #endif /* IFNET_INPUT_SANITY_CHK */ { - lck_mtx_unlock(inputthread->input_lck); + lck_mtx_unlock(&inputthread->input_lck); } /* * NOTE warning %%% attention !!!! - * We should think about putting some thread starvation safeguards if - * we deal with long chains of packets. + * We should think about putting some thread starvation + * safeguards if we deal with long chains of packets. 
*/ if (m_loop) { - if (inputthread == dlil_lo_thread_ptr) + if (inputthread == dlil_lo_thread_ptr) { dlil_input_packet_list(lo_ifp, m_loop); + } #if IFNET_INPUT_SANITY_CHK - else - panic("dlil_input_func - thread=%p loop queue has %d packets, should have none!\n", - inputthread, loop_cnt); + else { + panic("%s - thread=%p loop queue has %d " + "packets, should have none!\n", __func__, + inputthread, loop_cnt); + /* NOTREACHED */ + } #endif /* IFNET_INPUT_SANITY_CHK */ } - - if (m) + if (m != NULL) dlil_input_packet_list(0, m); + lck_mtx_lock_spin(&inputthread->input_lck); - lck_mtx_lock(inputthread->input_lck); - - if ((inputthread->input_waiting & (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)) != 0) { - lck_mtx_unlock(inputthread->input_lck); + if (inputthread->input_waiting & + (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)) { + lck_mtx_unlock(&inputthread->input_lck); proto_input_run(); - } - else - lck_mtx_unlock(inputthread->input_lck); + } else { + lck_mtx_unlock(&inputthread->input_lck); + } } } errno_t -ifnet_input( - ifnet_t ifp, - mbuf_t m_head, - const struct ifnet_stat_increment_param *stats) +ifnet_input(ifnet_t ifp, mbuf_t m_head, + const struct ifnet_stat_increment_param *stats) { struct thread *tp = current_thread(); mbuf_t m_tail; @@ -892,9 +1226,9 @@ ifnet_input( #endif /* IFNET_INPUT_SANITY_CHK */ if (ifp == NULL || m_head == NULL) { - if (m_head) + if (m_head != NULL) mbuf_freem_list(m_head); - return EINVAL; + return (EINVAL); } m_tail = m_head; @@ -902,14 +1236,16 @@ ifnet_input( #if IFNET_INPUT_SANITY_CHK if (dlil_input_sanity_check != 0) { ifnet_t rcvif; - + rcvif = mbuf_pkthdr_rcvif(m_tail); pkt_count++; - + if (rcvif == NULL || - (ifp->if_type != IFT_LOOP && rcvif != ifp) || - (mbuf_flags(m_head) & MBUF_PKTHDR) == 0) { - panic("ifnet_input - invalid mbuf %p\n", m_tail); + (ifp->if_type != IFT_LOOP && rcvif != ifp) || + !(mbuf_flags(m_head) & MBUF_PKTHDR)) { + panic("%s - invalid mbuf %p\n", + __func__, m_tail); + /* NOTREACHED */ } } #endif /* 
IFNET_INPUT_SANITY_CHK */ @@ -920,7 +1256,7 @@ ifnet_input( inp = ifp->if_input_thread; - if (dlil_multithreaded_input == 0 || inp == NULL) + if (dlil_multithreaded_input == 0 || inp == NULL) inp = dlil_lo_thread_ptr; /* @@ -928,11 +1264,11 @@ ifnet_input( * affinity set, associate this workloop thread with the same set. * We will only do this once. */ - lck_mtx_lock(inp->input_lck); + lck_mtx_lock_spin(&inp->input_lck); if (inp->net_affinity && inp->workloop_thread == NULL) { u_int32_t tag = inp->tag; inp->workloop_thread = tp; - lck_mtx_unlock(inp->input_lck); + lck_mtx_unlock(&inp->input_lck); /* Associated the current thread with the new affinity tag */ (void) dlil_affinity_set(tp, tag); @@ -943,7 +1279,7 @@ ifnet_input( * its affinity. */ thread_reference(tp); - lck_mtx_lock(inp->input_lck); + lck_mtx_lock_spin(&inp->input_lck); } /* WARNING @@ -964,11 +1300,10 @@ ifnet_input( inp->input_mbuf_cnt += pkt_count; inp->input_wake_cnt++; - lck_mtx_assert(inp->input_lck, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&inp->input_lck, LCK_MTX_ASSERT_OWNED); } #endif - } - else { + } else { if (inp->mbuf_head == NULL) inp->mbuf_head = m_head; else if (inp->mbuf_tail != NULL) @@ -980,58 +1315,71 @@ ifnet_input( inp->input_mbuf_cnt += pkt_count; inp->input_wake_cnt++; - lck_mtx_assert(inp->input_lck, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&inp->input_lck, LCK_MTX_ASSERT_OWNED); } #endif } - inp->input_waiting |= DLIL_INPUT_WAITING; if ((inp->input_waiting & DLIL_INPUT_RUNNING) == 0) { wakeup((caddr_t)&inp->input_waiting); } + lck_mtx_unlock(&inp->input_lck); + if (stats) { - ifp->if_data.ifi_ipackets += stats->packets_in; - ifp->if_data.ifi_ibytes += stats->bytes_in; - ifp->if_data.ifi_ierrors += stats->errors_in; - - ifp->if_data.ifi_opackets += stats->packets_out; - ifp->if_data.ifi_obytes += stats->bytes_out; - ifp->if_data.ifi_oerrors += stats->errors_out; - - ifp->if_data.ifi_collisions += stats->collisions; - ifp->if_data.ifi_iqdrops += stats->dropped; + 
atomic_add_64(&ifp->if_data.ifi_ipackets, stats->packets_in); + atomic_add_64(&ifp->if_data.ifi_ibytes, stats->bytes_in); + atomic_add_64(&ifp->if_data.ifi_ierrors, stats->errors_in); + + atomic_add_64(&ifp->if_data.ifi_opackets, stats->packets_out); + atomic_add_64(&ifp->if_data.ifi_obytes, stats->bytes_out); + atomic_add_64(&ifp->if_data.ifi_oerrors, stats->errors_out); + + atomic_add_64(&ifp->if_data.ifi_collisions, stats->collisions); + atomic_add_64(&ifp->if_data.ifi_iqdrops, stats->dropped); } - lck_mtx_unlock(inp->input_lck); - - return 0; + return (0); } static int -dlil_interface_filters_input(struct ifnet * ifp, struct mbuf * * m_p, - char * * frame_header_p, - protocol_family_t protocol_family) +dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p, + char **frame_header_p, protocol_family_t protocol_family) { - struct ifnet_filter * filter; + struct ifnet_filter *filter; + /* + * Pass the inbound packet to the interface filters + */ + lck_mtx_lock_spin(&ifp->if_flt_lock); + /* prevent filter list from changing in case we drop the lock */ + if_flt_monitor_busy(ifp); TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) { int result; - if (filter->filt_input - && (filter->filt_protocol == 0 - || filter->filt_protocol == protocol_family)) { + if (!filter->filt_skip && filter->filt_input != NULL && + (filter->filt_protocol == 0 || + filter->filt_protocol == protocol_family)) { + lck_mtx_unlock(&ifp->if_flt_lock); + result = (*filter->filt_input)(filter->filt_cookie, - ifp, protocol_family, - m_p, frame_header_p); + ifp, protocol_family, m_p, frame_header_p); + + lck_mtx_lock_spin(&ifp->if_flt_lock); if (result != 0) { + /* we're done with the filter list */ + if_flt_monitor_unbusy(ifp); + lck_mtx_unlock(&ifp->if_flt_lock); return (result); } } } + /* we're done with the filter list */ + if_flt_monitor_unbusy(ifp); + lck_mtx_unlock(&ifp->if_flt_lock); /* - * Strip away M_PROTO1 bit prior to sending packet up the stack as + * Strip away M_PROTO1 bit 
prior to sending packet up the stack as * it is meant to be local to a subsystem -- if_bridge for M_PROTO1 */ if (*m_p != NULL) @@ -1040,6 +1388,45 @@ dlil_interface_filters_input(struct ifnet * ifp, struct mbuf * * m_p, return (0); } +static int +dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p, + protocol_family_t protocol_family) +{ + struct ifnet_filter *filter; + + /* + * Pass the outbound packet to the interface filters + */ + lck_mtx_lock_spin(&ifp->if_flt_lock); + /* prevent filter list from changing in case we drop the lock */ + if_flt_monitor_busy(ifp); + TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) { + int result; + + if (!filter->filt_skip && filter->filt_output != NULL && + (filter->filt_protocol == 0 || + filter->filt_protocol == protocol_family)) { + lck_mtx_unlock(&ifp->if_flt_lock); + + result = filter->filt_output(filter->filt_cookie, ifp, + protocol_family, m_p); + + lck_mtx_lock_spin(&ifp->if_flt_lock); + if (result != 0) { + /* we're done with the filter list */ + if_flt_monitor_unbusy(ifp); + lck_mtx_unlock(&ifp->if_flt_lock); + return (result); + } + } + } + /* we're done with the filter list */ + if_flt_monitor_unbusy(ifp); + lck_mtx_unlock(&ifp->if_flt_lock); + + return (0); +} + static void dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m) { @@ -1050,24 +1437,21 @@ dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m) while (m != NULL) { char * frame_header; mbuf_t next_packet; - + next_packet = m->m_nextpkt; m->m_nextpkt = NULL; frame_header = m->m_pkthdr.header; m->m_pkthdr.header = NULL; - error = (*ifproto->kpi.v1.input)(ifproto->ifp, - ifproto->protocol_family, - m, frame_header); + error = (*ifproto->kpi.v1.input)(ifproto->ifp, + ifproto->protocol_family, m, frame_header); if (error != 0 && error != EJUSTRETURN) m_freem(m); m = next_packet; } - } - else if (ifproto->proto_kpi == kProtoKPI_v2) { + } else if (ifproto->proto_kpi == kProtoKPI_v2) { /* Version 2 protocols support packet lists */ error = 
(*ifproto->kpi.v2.input)(ifproto->ifp, - ifproto->protocol_family, - m); + ifproto->protocol_family, m); if (error != 0 && error != EJUSTRETURN) m_freem_list(m); } @@ -1078,7 +1462,6 @@ __private_extern__ void dlil_input_packet_list(struct ifnet * ifp_param, struct mbuf *m) { int error = 0; - int locked = 0; protocol_family_t protocol_family; mbuf_t next_packet; ifnet_t ifp = ifp_param; @@ -1089,66 +1472,71 @@ dlil_input_packet_list(struct ifnet * ifp_param, struct mbuf *m) KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START,0,0,0,0,0); + while (m != NULL) { - struct if_proto * ifproto = NULL; + struct if_proto *ifproto = NULL; + int iorefcnt = 0; - next_packet = m->m_nextpkt; - m->m_nextpkt = NULL; if (ifp_param == NULL) ifp = m->m_pkthdr.rcvif; + + /* Check if this mbuf looks valid */ + MBUF_INPUT_CHECK(m, ifp); + + next_packet = m->m_nextpkt; + m->m_nextpkt = NULL; frame_header = m->m_pkthdr.header; m->m_pkthdr.header = NULL; - if (locked == 0) { - /* dlil lock protects the demux and interface filters */ - locked = 1; - dlil_read_begin(); + /* Get an IO reference count if the interface is not + * loopback and it is attached. 
+ */ + if (ifp != lo_ifp) { + if (!ifnet_is_attached(ifp, 1)) { + m_freem(m); + goto next; + } + iorefcnt = 1; } -#if PKT_PRIORITY switch (m->m_pkthdr.prio) { case MBUF_TC_BK: - ifp->if_tc.ifi_ibkpackets++; - ifp->if_tc.ifi_ibkbytes += m->m_pkthdr.len; + atomic_add_64(&ifp->if_tc.ifi_ibkpackets, 1); + atomic_add_64(&ifp->if_tc.ifi_ibkbytes, m->m_pkthdr.len); break; case MBUF_TC_VI: - ifp->if_tc.ifi_ivipackets++; - ifp->if_tc.ifi_ivibytes += m->m_pkthdr.len; + atomic_add_64(&ifp->if_tc.ifi_ivipackets, 1); + atomic_add_64(&ifp->if_tc.ifi_ivibytes, m->m_pkthdr.len); break; case MBUF_TC_VO: - ifp->if_tc.ifi_ivopackets++; - ifp->if_tc.ifi_ivobytes += m->m_pkthdr.len; + atomic_add_64(&ifp->if_tc.ifi_ivopackets, 1); + atomic_add_64(&ifp->if_tc.ifi_ivobytes, m->m_pkthdr.len); break; default: break; } -#endif PKT_PRIORITY /* find which protocol family this packet is for */ + ifnet_lock_shared(ifp); error = (*ifp->if_demux)(ifp, m, frame_header, - &protocol_family); + &protocol_family); + ifnet_lock_done(ifp); if (error != 0) { - if (error == EJUSTRETURN) { + if (error == EJUSTRETURN) goto next; - } protocol_family = 0; } - - /* DANGER!!! 
*/ + if (m->m_flags & (M_BCAST|M_MCAST)) - ifp->if_imcasts++; + atomic_add_64(&ifp->if_imcasts, 1); /* run interface filters, exclude VLAN packets PR-3586856 */ if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) == 0) { - int filter_result; - - filter_result = dlil_interface_filters_input(ifp, &m, - &frame_header, - protocol_family); - if (filter_result != 0) { - if (filter_result != EJUSTRETURN) { + error = dlil_interface_filters_input(ifp, &m, + &frame_header, protocol_family); + if (error != 0) { + if (error != EJUSTRETURN) m_freem(m); - } goto next; } } @@ -1156,19 +1544,21 @@ dlil_input_packet_list(struct ifnet * ifp_param, struct mbuf *m) m_freem(m); goto next; } - + /* Lookup the protocol attachment to this interface */ if (protocol_family == 0) { ifproto = NULL; - } - else if (last_ifproto != NULL - && last_ifproto->ifp == ifp - && (last_ifproto->protocol_family - == protocol_family)) { + } else if (last_ifproto != NULL && last_ifproto->ifp == ifp && + (last_ifproto->protocol_family == protocol_family)) { + VERIFY(ifproto == NULL); ifproto = last_ifproto; - } - else { + if_proto_ref(last_ifproto); + } else { + VERIFY(ifproto == NULL); + ifnet_lock_shared(ifp); + /* callee holds a proto refcnt upon success */ ifproto = find_attached_proto(ifp, protocol_family); + ifnet_lock_done(ifp); } if (ifproto == NULL) { /* no protocol for this packet, discard */ @@ -1176,18 +1566,14 @@ dlil_input_packet_list(struct ifnet * ifp_param, struct mbuf *m) goto next; } if (ifproto != last_ifproto) { - /* make sure ifproto can't go away during input */ - if_proto_ref(ifproto); if (last_ifproto != NULL) { /* pass up the list for the previous protocol */ - dlil_read_end(); - dlil_ifproto_input(last_ifproto, pkt_first); pkt_first = NULL; if_proto_free(last_ifproto); - dlil_read_begin(); } last_ifproto = ifproto; + if_proto_ref(ifproto); } /* extend the list */ m->m_pkthdr.header = frame_header; @@ -1198,78 +1584,127 @@ dlil_input_packet_list(struct ifnet * ifp_param, struct mbuf 
*m) } pkt_next = &m->m_nextpkt; - next: +next: if (next_packet == NULL && last_ifproto != NULL) { /* pass up the last list of packets */ - dlil_read_end(); - dlil_ifproto_input(last_ifproto, pkt_first); if_proto_free(last_ifproto); - locked = 0; + last_ifproto = NULL; + } + if (ifproto != NULL) { + if_proto_free(ifproto); + ifproto = NULL; } + m = next_packet; + /* update the driver's multicast filter, if needed */ + if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) + ifp->if_updatemcasts = 0; + if (iorefcnt == 1) + ifnet_decr_iorefcnt(ifp); } - if (locked != 0) { - dlil_read_end(); - } + KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END,0,0,0,0,0); return; } +errno_t +if_mcasts_update(struct ifnet *ifp) +{ + errno_t err; + + err = ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL); + if (err == EAFNOSUPPORT) + err = 0; + printf("%s%d: %s %d suspended link-layer multicast membership(s) " + "(err=%d)\n", ifp->if_name, ifp->if_unit, + (err == 0 ? "successfully restored" : "failed to restore"), + ifp->if_updatemcasts, err); + + /* just return success */ + return (0); +} + static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *event) { struct ifnet_filter *filter; - - if (ifp_use(ifp, kIfNetUseCount_MustNotBeZero) == 0) { - dlil_read_begin(); - - /* Pass the event to the interface filters */ - TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) { - if (filter->filt_event) - filter->filt_event(filter->filt_cookie, ifp, filter->filt_protocol, event); + + /* Get an io ref count if the interface is attached */ + if (!ifnet_is_attached(ifp, 1)) + goto done; + + /* + * Pass the event to the interface filters + */ + lck_mtx_lock_spin(&ifp->if_flt_lock); + /* prevent filter list from changing in case we drop the lock */ + if_flt_monitor_busy(ifp); + TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) { + if (filter->filt_event != NULL) { + lck_mtx_unlock(&ifp->if_flt_lock); + + filter->filt_event(filter->filt_cookie, ifp, + filter->filt_protocol, event); + + 
lck_mtx_lock_spin(&ifp->if_flt_lock); } - - if (ifp->if_proto_hash) { - int i; - - for (i = 0; i < PROTO_HASH_SLOTS; i++) { - struct if_proto *proto; - - SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) { - proto_media_event eventp = proto->proto_kpi == kProtoKPI_v1 - ? proto->kpi.v1.event : proto->kpi.v2.event; - - if (eventp) - eventp(ifp, proto->protocol_family, event); + } + /* we're done with the filter list */ + if_flt_monitor_unbusy(ifp); + lck_mtx_unlock(&ifp->if_flt_lock); + + ifnet_lock_shared(ifp); + if (ifp->if_proto_hash != NULL) { + int i; + + for (i = 0; i < PROTO_HASH_SLOTS; i++) { + struct if_proto *proto; + + SLIST_FOREACH(proto, &ifp->if_proto_hash[i], + next_hash) { + proto_media_event eventp = + (proto->proto_kpi == kProtoKPI_v1 ? + proto->kpi.v1.event : + proto->kpi.v2.event); + + if (eventp != NULL) { + if_proto_ref(proto); + ifnet_lock_done(ifp); + + eventp(ifp, proto->protocol_family, + event); + + ifnet_lock_shared(ifp); + if_proto_free(proto); } } } - - dlil_read_end(); - - /* Pass the event to the interface */ - if (ifp->if_event) - ifp->if_event(ifp, event); - - if (ifp_unuse(ifp)) - ifp_use_reached_zero(ifp); } - - return kev_post_msg(event); + ifnet_lock_done(ifp); + + /* Pass the event to the interface */ + if (ifp->if_event != NULL) + ifp->if_event(ifp, event); + + /* Release the io ref count */ + ifnet_decr_iorefcnt(ifp); + +done: + return (kev_post_msg(event)); } errno_t -ifnet_event( - ifnet_t ifp, - struct kern_event_msg *event) +ifnet_event(ifnet_t ifp, struct kern_event_msg *event) { struct kev_msg kev_msg; int result = 0; - if (ifp == NULL || event == NULL) return EINVAL; + if (ifp == NULL || event == NULL) + return (EINVAL); + bzero(&kev_msg, sizeof (kev_msg)); kev_msg.vendor_code = event->vendor_code; kev_msg.kev_class = event->kev_class; kev_msg.kev_subclass = event->kev_subclass; @@ -1277,16 +1712,17 @@ ifnet_event( kev_msg.dv[0].data_ptr = &event->event_data[0]; kev_msg.dv[0].data_length = event->total_size - 
KEV_MSG_HEADER_SIZE; kev_msg.dv[1].data_length = 0; - + result = dlil_event_internal(ifp, &kev_msg); - return result; + return (result); } #if CONFIG_MACF_NET #include #include -static int dlil_get_socket_type(struct mbuf **mp, int family, int raw) +static int +dlil_get_socket_type(struct mbuf **mp, int family, int raw) { struct mbuf *m; struct ip *ip; @@ -1327,184 +1763,26 @@ static int dlil_get_socket_type(struct mbuf **mp, int family, int raw) static void if_inc_traffic_class_out(ifnet_t ifp, mbuf_t m) { -#if !PKT_PRIORITY -#pragma unused(ifp) -#pragma unused(m) - return; -#else if (!(m->m_flags & M_PKTHDR)) return; switch (m->m_pkthdr.prio) { case MBUF_TC_BK: - ifp->if_tc.ifi_obkpackets++; - ifp->if_tc.ifi_obkbytes += m->m_pkthdr.len; + atomic_add_64(&ifp->if_tc.ifi_obkpackets, 1); + atomic_add_64(&ifp->if_tc.ifi_obkbytes, m->m_pkthdr.len); break; case MBUF_TC_VI: - ifp->if_tc.ifi_ovipackets++; - ifp->if_tc.ifi_ovibytes += m->m_pkthdr.len; + atomic_add_64(&ifp->if_tc.ifi_ovipackets, 1); + atomic_add_64(&ifp->if_tc.ifi_ovibytes, m->m_pkthdr.len); break; case MBUF_TC_VO: - ifp->if_tc.ifi_ovopackets++; - ifp->if_tc.ifi_ovobytes += m->m_pkthdr.len; + atomic_add_64(&ifp->if_tc.ifi_ovopackets, 1); + atomic_add_64(&ifp->if_tc.ifi_ovobytes, m->m_pkthdr.len); break; default: break; } -#endif PKT_PRIORITY -} - -#if 0 -int -dlil_output_list( - struct ifnet* ifp, - u_long proto_family, - struct mbuf *packetlist, - caddr_t route, - const struct sockaddr *dest, - int raw) -{ - char *frame_type = NULL; - char *dst_linkaddr = NULL; - int retval = 0; - char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4]; - char dst_linkaddr_buffer[MAX_LINKADDR * 4]; - struct ifnet_filter *filter; - struct if_proto *proto = 0; - mbuf_t m; - mbuf_t send_head = NULL; - mbuf_t *send_tail = &send_head; - - KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START,0,0,0,0,0); - - dlil_read_begin(); - - frame_type = frame_type_buffer; - dst_linkaddr = dst_linkaddr_buffer; - - if (raw == 0) { - proto = 
find_attached_proto(ifp, proto_family); - if (proto == NULL) { - retval = ENXIO; - goto cleanup; - } - } - -preout_again: - if (packetlist == NULL) - goto cleanup; - m = packetlist; - packetlist = packetlist->m_nextpkt; - m->m_nextpkt = NULL; - - if (raw == 0) { - proto_media_preout preoutp = proto->proto_kpi == kProtoKPI_v1 - ? proto->kpi.v1.pre_output : proto->kpi.v2.pre_output; - retval = 0; - if (preoutp) - retval = preoutp(ifp, proto_family, &m, dest, route, frame_type, dst_linkaddr); - - if (retval) { - if (retval == EJUSTRETURN) { - goto preout_again; - } - - m_freem(m); - goto cleanup; - } - } - - do { -#if CONFIG_MACF_NET - retval = mac_ifnet_check_transmit(ifp, m, proto_family, - dlil_get_socket_type(&m, proto_family, raw)); - if (retval) { - m_freem(m); - goto cleanup; - } -#endif - - if (raw == 0 && ifp->if_framer) { - retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr, frame_type); - if (retval) { - if (retval != EJUSTRETURN) { - m_freem(m); - } - goto next; - } - } - - /* - * Let interface filters (if any) do their thing ... - */ - /* Do not pass VLAN tagged packets to filters PR-3586856 */ - if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) == 0) { - TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) { - if ((filter->filt_protocol == 0 || (filter->filt_protocol == proto_family)) && - filter->filt_output) { - retval = filter->filt_output(filter->filt_cookie, ifp, proto_family, &m); - if (retval) { - if (retval != EJUSTRETURN) - m_freem(m); - goto next; - } - } - } - } - /* - * Strip away M_PROTO1 bit prior to sending packet to the driver - * as this field may be used by the driver - */ - m->m_flags &= ~M_PROTO1; - - /* - * Finally, call the driver. 
- */ - - if ((ifp->if_eflags & IFEF_SENDLIST) != 0) { - *send_tail = m; - send_tail = &m->m_nextpkt; - } - else { - KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START, 0,0,0,0,0); - retval = ifp->if_output(ifp, m); - if (retval && dlil_verbose) { - printf("dlil_output: output error on %s%d retval = %d\n", - ifp->if_name, ifp->if_unit, retval); - } - KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0,0,0,0,0); - } - KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0,0,0,0,0); - -next: - m = packetlist; - if (m) { - packetlist = packetlist->m_nextpkt; - m->m_nextpkt = NULL; - } - } while (m); - - if (send_head) { - KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START, 0,0,0,0,0); - retval = ifp->if_output(ifp, send_head); - if (retval && dlil_verbose) { - printf("dlil_output: output error on %s%d retval = %d\n", - ifp->if_name, ifp->if_unit, retval); - } - KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0,0,0,0,0); - } - - KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END,0,0,0,0,0); - -cleanup: - dlil_read_end(); - if (packetlist) /* if any packet left, clean up */ - mbuf_freem_list(packetlist); - if (retval == EJUSTRETURN) - retval = 0; - return retval; } -#endif /* * dlil_output @@ -1519,62 +1797,72 @@ dlil_output_list( * because a protocol is likely to interact with an ifp while it * is under the protocol lock. 
*/ -__private_extern__ errno_t -dlil_output( - ifnet_t ifp, - protocol_family_t proto_family, - mbuf_t packetlist, - void *route, - const struct sockaddr *dest, - int raw) -{ - char *frame_type = NULL; - char *dst_linkaddr = NULL; - int retval = 0; - char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4]; - char dst_linkaddr_buffer[MAX_LINKADDR * 4]; - struct ifnet_filter *filter; - struct if_proto *proto = 0; +errno_t +dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist, + void *route, const struct sockaddr *dest, int raw) +{ + char *frame_type = NULL; + char *dst_linkaddr = NULL; + int retval = 0; + char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4]; + char dst_linkaddr_buffer[MAX_LINKADDR * 4]; + struct if_proto *proto = NULL; mbuf_t m; mbuf_t send_head = NULL; mbuf_t *send_tail = &send_head; - + int iorefcnt = 0; + KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START,0,0,0,0,0); - - dlil_read_begin(); - - frame_type = frame_type_buffer; - dst_linkaddr = dst_linkaddr_buffer; - + + /* Get an io refcnt if the interface is attached to prevent ifnet_detach + * from happening while this operation is in progress */ + if (!ifnet_is_attached(ifp, 1)) { + retval = ENXIO; + goto cleanup; + } + iorefcnt = 1; + + /* update the driver's multicast filter, if needed */ + if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) + ifp->if_updatemcasts = 0; + + frame_type = frame_type_buffer; + dst_linkaddr = dst_linkaddr_buffer; + if (raw == 0) { + ifnet_lock_shared(ifp); + /* callee holds a proto refcnt upon success */ proto = find_attached_proto(ifp, proto_family); if (proto == NULL) { + ifnet_lock_done(ifp); retval = ENXIO; goto cleanup; } + ifnet_lock_done(ifp); } - + preout_again: if (packetlist == NULL) goto cleanup; + m = packetlist; packetlist = packetlist->m_nextpkt; m->m_nextpkt = NULL; - + if (raw == 0) { - proto_media_preout preoutp = proto->proto_kpi == kProtoKPI_v1 - ? 
proto->kpi.v1.pre_output : proto->kpi.v2.pre_output; + proto_media_preout preoutp = (proto->proto_kpi == kProtoKPI_v1 ? + proto->kpi.v1.pre_output : proto->kpi.v2.pre_output); retval = 0; - if (preoutp) - retval = preoutp(ifp, proto_family, &m, dest, route, frame_type, dst_linkaddr); - - if (retval) { - if (retval == EJUSTRETURN) { - goto preout_again; + if (preoutp != NULL) { + retval = preoutp(ifp, proto_family, &m, dest, route, + frame_type, dst_linkaddr); + + if (retval != 0) { + if (retval == EJUSTRETURN) + goto preout_again; + m_freem(m); + goto cleanup; } - - m_freem(m); - goto cleanup; } } @@ -1588,6 +1876,21 @@ dlil_output( #endif do { +#if CONFIG_DTRACE + if (proto_family == PF_INET) { + struct ip *ip = mtod(m, struct ip*); + DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL, + struct ip *, ip, struct ifnet *, ifp, + struct ip *, ip, struct ip6_hdr *, NULL); + + } else if (proto_family == PF_INET6) { + struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr*); + DTRACE_IP6(send, struct mbuf*, m, struct inpcb *, NULL, + struct ip6_hdr *, ip6, struct ifnet*, ifp, + struct ip*, NULL, struct ip6_hdr *, ip6); + } +#endif /* CONFIG_DTRACE */ + if (raw == 0 && ifp->if_framer) { int rcvif_set = 0; @@ -1605,11 +1908,11 @@ dlil_output( rcvif_set = 1; } - retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr, frame_type); + retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr, + frame_type); if (retval) { - if (retval != EJUSTRETURN) { + if (retval != EJUSTRETURN) m_freem(m); - } goto next; } @@ -1625,25 +1928,20 @@ dlil_output( if (rcvif_set && m->m_pkthdr.rcvif == ifp) m->m_pkthdr.rcvif = NULL; } - - /* + + /* * Let interface filters (if any) do their thing ... 
*/ /* Do not pass VLAN tagged packets to filters PR-3586856 */ if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) == 0) { - TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) { - if ((filter->filt_protocol == 0 || (filter->filt_protocol == proto_family)) && - filter->filt_output) { - retval = filter->filt_output(filter->filt_cookie, ifp, proto_family, &m); - if (retval) { - if (retval != EJUSTRETURN) - m_freem(m); - goto next; - } - } + retval = dlil_interface_filters_output(ifp, + &m, proto_family); + if (retval != 0) { + if (retval != EJUSTRETURN) + m_freem(m); + goto next; } } - /* * Strip away M_PROTO1 bit prior to sending packet to the driver * as this field may be used by the driver @@ -1663,40 +1961,43 @@ dlil_output( goto next; } - /* - * If this is a TSO packet, make sure the interface still advertise TSO capability + /* + * If this is a TSO packet, make sure the interface still + * advertise TSO capability. */ - if ((m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) && !(ifp->if_hwassist & IFNET_TSO_IPV4)) { - retval = EMSGSIZE; - m_freem(m); - goto cleanup; + if ((m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) && + !(ifp->if_hwassist & IFNET_TSO_IPV4)) { + retval = EMSGSIZE; + m_freem(m); + goto cleanup; } - if ((m->m_pkthdr.csum_flags & CSUM_TSO_IPV6) && !(ifp->if_hwassist & IFNET_TSO_IPV6)) { - retval = EMSGSIZE; - m_freem(m); - goto cleanup; + if ((m->m_pkthdr.csum_flags & CSUM_TSO_IPV6) && + !(ifp->if_hwassist & IFNET_TSO_IPV6)) { + retval = EMSGSIZE; + m_freem(m); + goto cleanup; } + /* * Finally, call the driver. 
*/ - if ((ifp->if_eflags & IFEF_SENDLIST) != 0) { *send_tail = m; send_tail = &m->m_nextpkt; - } - else { - KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START, 0,0,0,0,0); - + } else { if_inc_traffic_class_out(ifp, m); - + KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START, + 0,0,0,0,0); retval = ifp->if_output(ifp, m); if (retval && dlil_verbose) { - printf("dlil_output: output error on %s%d retval = %d\n", - ifp->if_name, ifp->if_unit, retval); + printf("%s: output error on %s%d retval = %d\n", + __func__, ifp->if_name, ifp->if_unit, + retval); } - KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0,0,0,0,0); + KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, + 0,0,0,0,0); } KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0,0,0,0,0); @@ -1709,115 +2010,121 @@ dlil_output( } while (m); if (send_head) { - KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START, 0,0,0,0,0); - if_inc_traffic_class_out(ifp, send_head); + KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START, 0,0,0,0,0); retval = ifp->if_output(ifp, send_head); if (retval && dlil_verbose) { - printf("dlil_output: output error on %s%d retval = %d\n", - ifp->if_name, ifp->if_unit, retval); + printf("%s: output error on %s%d retval = %d\n", + __func__, ifp->if_name, ifp->if_unit, retval); } KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0,0,0,0,0); } - + KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END,0,0,0,0,0); cleanup: - dlil_read_end(); - if (packetlist) /* if any packet left, clean up */ + if (proto != NULL) + if_proto_free(proto); + if (packetlist) /* if any packets are left, clean up */ mbuf_freem_list(packetlist); if (retval == EJUSTRETURN) retval = 0; - return retval; + if (iorefcnt == 1) + ifnet_decr_iorefcnt(ifp); + + return (retval); } errno_t -ifnet_ioctl( - ifnet_t ifp, - protocol_family_t proto_fam, - u_long ioctl_code, - void *ioctl_arg) -{ - struct ifnet_filter *filter; - int retval = EOPNOTSUPP; - int result = 0; - int holding_read = 0; - +ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long 
ioctl_code, + void *ioctl_arg) +{ + struct ifnet_filter *filter; + int retval = EOPNOTSUPP; + int result = 0; + if (ifp == NULL || ioctl_code == 0) - return EINVAL; - - /* Attempt to increment the use count. If it's zero, bail out, the ifp is invalid */ - result = ifp_use(ifp, kIfNetUseCount_MustNotBeZero); - if (result != 0) - return EOPNOTSUPP; - - dlil_read_begin(); - holding_read = 1; - + return (EINVAL); + + /* Get an io ref count if the interface is attached */ + if (!ifnet_is_attached(ifp, 1)) + return (EOPNOTSUPP); + /* Run the interface filters first. * We want to run all filters before calling the protocol, * interface family, or interface. */ + lck_mtx_lock_spin(&ifp->if_flt_lock); + /* prevent filter list from changing in case we drop the lock */ + if_flt_monitor_busy(ifp); TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) { - if ((filter->filt_protocol == 0 || (filter->filt_protocol == proto_fam)) && - filter->filt_ioctl != NULL) { - result = filter->filt_ioctl(filter->filt_cookie, ifp, proto_fam, ioctl_code, ioctl_arg); + if (filter->filt_ioctl != NULL && (filter->filt_protocol == 0 || + filter->filt_protocol == proto_fam)) { + lck_mtx_unlock(&ifp->if_flt_lock); + + result = filter->filt_ioctl(filter->filt_cookie, ifp, + proto_fam, ioctl_code, ioctl_arg); + + lck_mtx_lock_spin(&ifp->if_flt_lock); + /* Only update retval if no one has handled the ioctl */ if (retval == EOPNOTSUPP || result == EJUSTRETURN) { if (result == ENOTSUP) result = EOPNOTSUPP; retval = result; - if (retval && retval != EOPNOTSUPP) { + if (retval != 0 && retval != EOPNOTSUPP) { + /* we're done with the filter list */ + if_flt_monitor_unbusy(ifp); + lck_mtx_unlock(&ifp->if_flt_lock); goto cleanup; } } } } - + /* we're done with the filter list */ + if_flt_monitor_unbusy(ifp); + lck_mtx_unlock(&ifp->if_flt_lock); + /* Allow the protocol to handle the ioctl */ - if (proto_fam) { - struct if_proto *proto = find_attached_proto(ifp, proto_fam); - - if (proto != 0) { - 
proto_media_ioctl ioctlp = proto->proto_kpi == kProtoKPI_v1 - ? proto->kpi.v1.ioctl : proto->kpi.v2.ioctl; + if (proto_fam != 0) { + struct if_proto *proto; + + /* callee holds a proto refcnt upon success */ + ifnet_lock_shared(ifp); + proto = find_attached_proto(ifp, proto_fam); + ifnet_lock_done(ifp); + if (proto != NULL) { + proto_media_ioctl ioctlp = + (proto->proto_kpi == kProtoKPI_v1 ? + proto->kpi.v1.ioctl : proto->kpi.v2.ioctl); result = EOPNOTSUPP; - if (ioctlp) - result = ioctlp(ifp, proto_fam, ioctl_code, ioctl_arg); - + if (ioctlp != NULL) + result = ioctlp(ifp, proto_fam, ioctl_code, + ioctl_arg); + if_proto_free(proto); + /* Only update retval if no one has handled the ioctl */ if (retval == EOPNOTSUPP || result == EJUSTRETURN) { if (result == ENOTSUP) result = EOPNOTSUPP; retval = result; - if (retval && retval != EOPNOTSUPP) { + if (retval && retval != EOPNOTSUPP) goto cleanup; - } } } } - - /* - * Since we have incremented the use count on the ifp, we are guaranteed - * that the ifp will not go away (the function pointers may not be changed). - * We release the dlil read lock so the interface ioctl may trigger a - * protocol attach. This happens with vlan and may occur with other virtual - * interfaces. - */ - dlil_read_end(); - holding_read = 0; - + /* retval is either 0 or EOPNOTSUPP */ - + /* * Let the interface handle this ioctl. * If it returns EOPNOTSUPP, ignore that, we may have * already handled this in the protocol or family. 
*/ - if (ifp->if_ioctl) + if (ifp->if_ioctl) result = (*ifp->if_ioctl)(ifp, ioctl_code, ioctl_arg); - + /* Only update retval if no one has handled the ioctl */ if (retval == EOPNOTSUPP || result == EJUSTRETURN) { if (result == ENOTSUP) @@ -1827,60 +2134,59 @@ ifnet_ioctl( goto cleanup; } } - -cleanup: - if (holding_read) - dlil_read_end(); - if (ifp_unuse(ifp)) - ifp_use_reached_zero(ifp); +cleanup: if (retval == EJUSTRETURN) retval = 0; - return retval; + + ifnet_decr_iorefcnt(ifp); + + return (retval); } __private_extern__ errno_t -dlil_set_bpf_tap( - ifnet_t ifp, - bpf_tap_mode mode, - bpf_packet_func callback) +dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func callback) { errno_t error = 0; - - dlil_read_begin(); - if (ifp->if_set_bpf_tap) + + + if (ifp->if_set_bpf_tap) { + /* Get an io reference on the interface if it is attached */ + if (!ifnet_is_attached(ifp, 1)) + return ENXIO; error = ifp->if_set_bpf_tap(ifp, mode, callback); - dlil_read_end(); - - return error; + ifnet_decr_iorefcnt(ifp); + } + return (error); } errno_t -dlil_resolve_multi( - struct ifnet *ifp, - const struct sockaddr *proto_addr, - struct sockaddr *ll_addr, - size_t ll_len) +dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr, + struct sockaddr *ll_addr, size_t ll_len) { errno_t result = EOPNOTSUPP; struct if_proto *proto; const struct sockaddr *verify; proto_media_resolve_multi resolvep; - - dlil_read_begin(); - + + if (!ifnet_is_attached(ifp, 1)) + return result; + bzero(ll_addr, ll_len); - - /* Call the protocol first */ + + /* Call the protocol first; callee holds a proto refcnt upon success */ + ifnet_lock_shared(ifp); proto = find_attached_proto(ifp, proto_addr->sa_family); + ifnet_lock_done(ifp); if (proto != NULL) { - resolvep = proto->proto_kpi == kProtoKPI_v1 - ? proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi; + resolvep = (proto->proto_kpi == kProtoKPI_v1 ? 
+ proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi); if (resolvep != NULL) - result = resolvep(ifp, proto_addr,(struct sockaddr_dl*)ll_addr, - ll_len); + result = resolvep(ifp, proto_addr, + (struct sockaddr_dl*)ll_addr, ll_len); + if_proto_free(proto); } - + /* Let the interface verify the multicast address */ if ((result == EOPNOTSUPP || result == 0) && ifp->if_check_multi) { if (result == 0) @@ -1889,73 +2195,63 @@ dlil_resolve_multi( verify = proto_addr; result = ifp->if_check_multi(ifp, verify); } - - dlil_read_end(); - - return result; + + ifnet_decr_iorefcnt(ifp); + return (result); } __private_extern__ errno_t -dlil_send_arp_internal( - ifnet_t ifp, - u_short arpop, - const struct sockaddr_dl* sender_hw, - const struct sockaddr* sender_proto, - const struct sockaddr_dl* target_hw, - const struct sockaddr* target_proto) +dlil_send_arp_internal(ifnet_t ifp, u_short arpop, + const struct sockaddr_dl* sender_hw, const struct sockaddr* sender_proto, + const struct sockaddr_dl* target_hw, const struct sockaddr* target_proto) { struct if_proto *proto; errno_t result = 0; - - dlil_read_begin(); - + + /* callee holds a proto refcnt upon success */ + ifnet_lock_shared(ifp); proto = find_attached_proto(ifp, target_proto->sa_family); + ifnet_lock_done(ifp); if (proto == NULL) { result = ENOTSUP; - } - else { + } else { proto_media_send_arp arpp; - arpp = proto->proto_kpi == kProtoKPI_v1 - ? proto->kpi.v1.send_arp : proto->kpi.v2.send_arp; + arpp = (proto->proto_kpi == kProtoKPI_v1 ? 
+ proto->kpi.v1.send_arp : proto->kpi.v2.send_arp); if (arpp == NULL) result = ENOTSUP; else - result = arpp(ifp, arpop, sender_hw, sender_proto, target_hw, - target_proto); + result = arpp(ifp, arpop, sender_hw, sender_proto, + target_hw, target_proto); + if_proto_free(proto); } - - dlil_read_end(); - - return result; + + return (result); } static __inline__ int _is_announcement(const struct sockaddr_in * sender_sin, - const struct sockaddr_in * target_sin) + const struct sockaddr_in * target_sin) { if (sender_sin == NULL) { - return FALSE; + return (FALSE); } return (sender_sin->sin_addr.s_addr == target_sin->sin_addr.s_addr); } __private_extern__ errno_t -dlil_send_arp( - ifnet_t ifp, - u_short arpop, - const struct sockaddr_dl* sender_hw, - const struct sockaddr* sender_proto, - const struct sockaddr_dl* target_hw, - const struct sockaddr* target_proto) +dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl* sender_hw, + const struct sockaddr* sender_proto, const struct sockaddr_dl* target_hw, + const struct sockaddr* target_proto) { errno_t result = 0; const struct sockaddr_in * sender_sin; const struct sockaddr_in * target_sin; - - if (target_proto == NULL || (sender_proto && - sender_proto->sa_family != target_proto->sa_family)) - return EINVAL; - + + if (target_proto == NULL || (sender_proto != NULL && + sender_proto->sa_family != target_proto->sa_family)) + return (EINVAL); + /* * If this is an ARP request and the target IP is IPv4LL, * send the request on all interfaces. 
The exception is @@ -1964,281 +2260,293 @@ dlil_send_arp( */ sender_sin = (const struct sockaddr_in *)sender_proto; target_sin = (const struct sockaddr_in *)target_proto; - if (target_proto->sa_family == AF_INET - && IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) - && ipv4_ll_arp_aware != 0 - && arpop == ARPOP_REQUEST - && !_is_announcement(target_sin, sender_sin)) { + if (target_proto->sa_family == AF_INET && + IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) && + ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST && + !_is_announcement(target_sin, sender_sin)) { ifnet_t *ifp_list; u_int32_t count; u_int32_t ifp_on; - + result = ENOTSUP; if (ifnet_list_get(IFNET_FAMILY_ANY, &ifp_list, &count) == 0) { for (ifp_on = 0; ifp_on < count; ifp_on++) { - errno_t new_result; - ifaddr_t source_hw = NULL; - ifaddr_t source_ip = NULL; - struct sockaddr_in source_ip_copy; - + errno_t new_result; + ifaddr_t source_hw = NULL; + ifaddr_t source_ip = NULL; + struct sockaddr_in source_ip_copy; + struct ifnet *cur_ifp = ifp_list[ifp_on]; + /* - * Only arp on interfaces marked for IPv4LL ARPing. This may - * mean that we don't ARP on the interface the subnet route - * points to. + * Only arp on interfaces marked for IPv4LL + * ARPing. This may mean that we don't ARP on + * the interface the subnet route points to. 
*/ - if ((ifp_list[ifp_on]->if_eflags & IFEF_ARPLL) == 0) { + if (!(cur_ifp->if_eflags & IFEF_ARPLL)) continue; - } /* Find the source IP address */ - ifnet_lock_shared(ifp_list[ifp_on]); - source_hw = TAILQ_FIRST(&ifp_list[ifp_on]->if_addrhead); - TAILQ_FOREACH(source_ip, &ifp_list[ifp_on]->if_addrhead, - ifa_link) { - if (source_ip->ifa_addr && - source_ip->ifa_addr->sa_family == AF_INET) { + ifnet_lock_shared(cur_ifp); + source_hw = cur_ifp->if_lladdr; + TAILQ_FOREACH(source_ip, &cur_ifp->if_addrhead, + ifa_link) { + IFA_LOCK(source_ip); + if (source_ip->ifa_addr != NULL && + source_ip->ifa_addr->sa_family == + AF_INET) { + /* Copy the source IP address */ + source_ip_copy = + *(struct sockaddr_in *) + source_ip->ifa_addr; + IFA_UNLOCK(source_ip); break; } + IFA_UNLOCK(source_ip); } - + /* No IP Source, don't arp */ if (source_ip == NULL) { - ifnet_lock_done(ifp_list[ifp_on]); + ifnet_lock_done(cur_ifp); continue; } - - /* Copy the source IP address */ - source_ip_copy = *(struct sockaddr_in*)source_ip->ifa_addr; - ifaref(source_hw); - ifnet_lock_done(ifp_list[ifp_on]); - + + IFA_ADDREF(source_hw); + ifnet_lock_done(cur_ifp); + /* Send the ARP */ - new_result = dlil_send_arp_internal(ifp_list[ifp_on], arpop, - (struct sockaddr_dl*)source_hw->ifa_addr, - (struct sockaddr*)&source_ip_copy, NULL, - target_proto); + new_result = dlil_send_arp_internal(cur_ifp, + arpop, + (struct sockaddr_dl *)source_hw->ifa_addr, + (struct sockaddr *)&source_ip_copy, NULL, + target_proto); - ifafree(source_hw); + IFA_REMREF(source_hw); if (result == ENOTSUP) { result = new_result; } } + ifnet_list_free(ifp_list); } - - ifnet_list_free(ifp_list); - } - else { - result = dlil_send_arp_internal(ifp, arpop, sender_hw, sender_proto, - target_hw, target_proto); + } else { + result = dlil_send_arp_internal(ifp, arpop, sender_hw, + sender_proto, target_hw, target_proto); } - - return result; + + return (result); } -__private_extern__ int -ifp_use( - struct ifnet *ifp, - int handle_zero) +/* 
+ * Caller must hold ifnet head lock. + */ +static int +ifnet_lookup(struct ifnet *ifp) { - int old_value; - int retval = 0; - - do { - old_value = ifp->if_usecnt; - if (old_value == 0 && handle_zero == kIfNetUseCount_MustNotBeZero) { - retval = ENXIO; // ifp is invalid + struct ifnet *_ifp; + + lck_rw_assert(&ifnet_head_lock, LCK_RW_ASSERT_HELD); + TAILQ_FOREACH(_ifp, &ifnet_head, if_link) { + if (_ifp == ifp) break; - } - } while (!OSCompareAndSwap((UInt32)old_value, (UInt32)old_value + 1, (UInt32*)&ifp->if_usecnt)); - - return retval; + } + return (_ifp != NULL); } - -/* ifp_unuse is broken into two pieces. - * - * ifp_use and ifp_unuse must be called between when the caller calls - * dlil_write_begin and dlil_write_end. ifp_unuse needs to perform some - * operations after dlil_write_end has been called. For this reason, - * anyone calling ifp_unuse must call ifp_use_reached_zero if ifp_unuse - * returns a non-zero value. The caller must call ifp_use_reached_zero - * after the caller has called dlil_write_end. +/* + * Caller has to pass a non-zero refio argument to get a + * IO reference count. This will prevent ifnet_detach from + * being called when there are outstanding io reference counts. */ -__private_extern__ void -ifp_use_reached_zero( - struct ifnet *ifp) -{ - ifnet_detached_func free_func; - - dlil_read_begin(); - - if (ifp->if_usecnt != 0) - panic("ifp_use_reached_zero: ifp->if_usecnt != 0"); - - ifnet_head_lock_exclusive(); - ifnet_lock_exclusive(ifp); - - /* Remove ourselves from the list */ - TAILQ_REMOVE(&ifnet_head, ifp, if_link); - ifnet_addrs[ifp->if_index - 1] = NULL; - - /* ifp should be removed from the interface list */ - while (ifp->if_multiaddrs.lh_first) { - struct ifmultiaddr *ifma = ifp->if_multiaddrs.lh_first; - - /* - * When the interface is gone, we will no longer - * be listening on these multicasts. Various bits - * of the stack may be referencing these multicasts, - * release only our reference. 
+int +ifnet_is_attached(struct ifnet *ifp, int refio) +{ + int ret; + + lck_mtx_lock_spin(&ifp->if_ref_lock); + if ((ret = ((ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING)) == + IFRF_ATTACHED))) { + if (refio > 0) + ifp->if_refio++; + } + lck_mtx_unlock(&ifp->if_ref_lock); + + return (ret); +} + +void +ifnet_decr_iorefcnt(struct ifnet *ifp) +{ + lck_mtx_lock_spin(&ifp->if_ref_lock); + VERIFY(ifp->if_refio > 0); + VERIFY((ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING)) != 0); + ifp->if_refio--; + + /* if there are no more outstanding io references, wakeup the + * ifnet_detach thread if detaching flag is set. + */ + if (ifp->if_refio == 0 && + (ifp->if_refflags & IFRF_DETACHING) != 0) { + /* Convert the spinlock to a regular mutex if we have + * to wait for any reason while doing a wakeup. */ - LIST_REMOVE(ifma, ifma_link); - ifma->ifma_ifp = NULL; - ifma_release(ifma); + lck_mtx_convert_spin(&ifp->if_ref_lock); + wakeup(&(ifp->if_refio)); } + lck_mtx_unlock(&ifp->if_ref_lock); +} - ifp->if_eflags &= ~IFEF_DETACHING; // clear the detaching flag - ifnet_lock_done(ifp); - ifnet_head_done(); +static void +dlil_if_trace(struct dlil_ifnet *dl_if, int refhold) +{ + struct dlil_ifnet_dbg *dl_if_dbg = (struct dlil_ifnet_dbg *)dl_if; + ctrace_t *tr; + u_int32_t idx; + u_int16_t *cnt; - free_func = ifp->if_free; - dlil_read_end(); - dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, 0); - - if (free_func) - free_func(ifp); + if (!(dl_if->dl_if_flags & DLIF_DEBUG)) { + panic("%s: dl_if %p has no debug structure", __func__, dl_if); + /* NOTREACHED */ + } + + if (refhold) { + cnt = &dl_if_dbg->dldbg_if_refhold_cnt; + tr = dl_if_dbg->dldbg_if_refhold; + } else { + cnt = &dl_if_dbg->dldbg_if_refrele_cnt; + tr = dl_if_dbg->dldbg_if_refrele; + } + + idx = atomic_add_16_ov(cnt, 1) % IF_REF_TRACE_HIST_SIZE; + ctrace_record(&tr[idx]); } -__private_extern__ int -ifp_unuse( - struct ifnet *ifp) -{ - int oldval; - oldval = OSDecrementAtomic(&ifp->if_usecnt); - if 
(oldval == 0) - panic("ifp_unuse: ifp(%s%d)->if_usecnt was zero\n", ifp->if_name, ifp->if_unit); - - if (oldval > 1) - return 0; - - if ((ifp->if_eflags & IFEF_DETACHING) == 0) - panic("ifp_unuse: use count reached zero but detching flag is not set!"); - - return 1; /* caller must call ifp_use_reached_zero */ +errno_t +dlil_if_ref(struct ifnet *ifp) +{ + struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp; + + if (dl_if == NULL) + return (EINVAL); + + lck_mtx_lock_spin(&dl_if->dl_if_lock); + ++dl_if->dl_if_refcnt; + if (dl_if->dl_if_refcnt == 0) { + panic("%s: wraparound refcnt for ifp=%p", __func__, ifp); + /* NOTREACHED */ + } + if (dl_if->dl_if_trace != NULL) + (*dl_if->dl_if_trace)(dl_if, TRUE); + lck_mtx_unlock(&dl_if->dl_if_lock); + + return (0); } -extern lck_mtx_t *domain_proto_mtx; +errno_t +dlil_if_free(struct ifnet *ifp) +{ + struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp; + + if (dl_if == NULL) + return (EINVAL); + + lck_mtx_lock_spin(&dl_if->dl_if_lock); + if (dl_if->dl_if_refcnt == 0) { + panic("%s: negative refcnt for ifp=%p", __func__, ifp); + /* NOTREACHED */ + } + --dl_if->dl_if_refcnt; + if (dl_if->dl_if_trace != NULL) + (*dl_if->dl_if_trace)(dl_if, FALSE); + lck_mtx_unlock(&dl_if->dl_if_lock); + + return (0); +} static errno_t -dlil_attach_protocol_internal( - struct if_proto *proto, - const struct ifnet_demux_desc *demux_list, - u_int32_t demux_count) +dlil_attach_protocol_internal(struct if_proto *proto, + const struct ifnet_demux_desc *demux_list, u_int32_t demux_count) { - struct kev_dl_proto_data ev_pr_data; + struct kev_dl_proto_data ev_pr_data; struct ifnet *ifp = proto->ifp; int retval = 0; u_int32_t hash_value = proto_hash_value(proto->protocol_family); - - /* setup some of the common values */ - { - struct domain *dp; - lck_mtx_lock(domain_proto_mtx); - dp = domains; - while (dp && (protocol_family_t)dp->dom_family != proto->protocol_family) - dp = dp->dom_next; - proto->dl_domain = dp; - lck_mtx_unlock(domain_proto_mtx); - } - - 
/* - * Take the write lock to protect readers and exclude other writers. - */ - if ((retval = dlil_write_begin()) != 0) { - printf("dlil_attach_protocol_internal - dlil_write_begin returned %d\n", retval); - return retval; - } - - /* Check that the interface isn't currently detaching */ - ifnet_lock_shared(ifp); - if ((ifp->if_eflags & IFEF_DETACHING) != 0) { + struct if_proto *prev_proto; + struct if_proto *_proto; + + /* callee holds a proto refcnt upon success */ + ifnet_lock_exclusive(ifp); + _proto = find_attached_proto(ifp, proto->protocol_family); + if (_proto != NULL) { ifnet_lock_done(ifp); - dlil_write_end(); - return ENXIO; + if_proto_free(_proto); + return (EEXIST); } - ifnet_lock_done(ifp); - - if (find_attached_proto(ifp, proto->protocol_family) != NULL) { - dlil_write_end(); - return EEXIST; - } - + /* * Call family module add_proto routine so it can refine the * demux descriptors as it wishes. */ - retval = ifp->if_add_proto(ifp, proto->protocol_family, demux_list, demux_count); + retval = ifp->if_add_proto(ifp, proto->protocol_family, demux_list, + demux_count); if (retval) { - dlil_write_end(); - return retval; + ifnet_lock_done(ifp); + return (retval); } - - /* - * We can't fail from this point on. - * Increment the number of uses (protocol attachments + interface attached). 
- */ - ifp_use(ifp, kIfNetUseCount_MustNotBeZero); - + /* * Insert the protocol in the hash */ - { - struct if_proto* prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]); - while (prev_proto && SLIST_NEXT(prev_proto, next_hash) != NULL) - prev_proto = SLIST_NEXT(prev_proto, next_hash); - if (prev_proto) - SLIST_INSERT_AFTER(prev_proto, proto, next_hash); - else - SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value], proto, next_hash); - } + prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]); + while (prev_proto != NULL && SLIST_NEXT(prev_proto, next_hash) != NULL) + prev_proto = SLIST_NEXT(prev_proto, next_hash); + if (prev_proto) + SLIST_INSERT_AFTER(prev_proto, proto, next_hash); + else + SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value], + proto, next_hash); + + /* hold a proto refcnt for attach */ + if_proto_ref(proto); /* - * Add to if_proto list for this interface + * The reserved field carries the number of protocol still attached + * (subject to change) */ - if_proto_ref(proto); - dlil_write_end(); - - /* the reserved field carries the number of protocol still attached (subject to change) */ ev_pr_data.proto_family = proto->protocol_family; ev_pr_data.proto_remaining_count = dlil_ifp_proto_count(ifp); - dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED, - (struct net_event_data *)&ev_pr_data, - sizeof(struct kev_dl_proto_data)); -#if 0 - DLIL_PRINTF("dlil. 
Attached protocol %d to %s%d - %d\n", proto->protocol_family, - ifp->if_name, ifp->if_unit, retval); -#endif - return retval; + ifnet_lock_done(ifp); + + dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED, + (struct net_event_data *)&ev_pr_data, + sizeof (struct kev_dl_proto_data)); + return (retval); } errno_t ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol, - const struct ifnet_attach_proto_param *proto_details) + const struct ifnet_attach_proto_param *proto_details) { int retval = 0; struct if_proto *ifproto = NULL; - - if (ifp == NULL || protocol == 0 || proto_details == NULL) - return EINVAL; - - ifproto = _MALLOC(sizeof(struct if_proto), M_IFADDR, M_WAITOK); - if (ifproto == 0) { - DLIL_PRINTF("ERROR - dlil failed if_proto allocation\n"); + + ifnet_head_lock_shared(); + if (ifp == NULL || protocol == 0 || proto_details == NULL) { + retval = EINVAL; + goto end; + } + /* Check that the interface is in the global list */ + if (!ifnet_lookup(ifp)) { + retval = ENXIO; + goto end; + } + + ifproto = zalloc(dlif_proto_zone); + if (ifproto == NULL) { retval = ENOMEM; goto end; } - bzero(ifproto, sizeof(*ifproto)); - + bzero(ifproto, dlif_proto_size); + + /* refcnt held above during lookup */ ifproto->ifp = ifp; ifproto->protocol_family = protocol; ifproto->proto_kpi = kProtoKPI_v1; @@ -2249,34 +2557,52 @@ ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol, ifproto->kpi.v1.detached = proto_details->detached; ifproto->kpi.v1.resolve_multi = proto_details->resolve; ifproto->kpi.v1.send_arp = proto_details->send_arp; - + retval = dlil_attach_protocol_internal(ifproto, - proto_details->demux_list, proto_details->demux_count); - + proto_details->demux_list, proto_details->demux_count); + + if (dlil_verbose) { + printf("%s%d: attached v1 protocol %d\n", ifp->if_name, + ifp->if_unit, protocol); + } + end: - if (retval && ifproto) - FREE(ifproto, M_IFADDR); - return retval; + if (retval != 0 && retval != EEXIST && ifp != NULL) { + 
DLIL_PRINTF("%s%d: failed to attach v1 protocol %d (err=%d)\n", + ifp->if_name, ifp->if_unit, protocol, retval); + } + ifnet_head_done(); + if (retval != 0 && ifproto != NULL) + zfree(dlif_proto_zone, ifproto); + return (retval); } errno_t ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol, - const struct ifnet_attach_proto_param_v2 *proto_details) + const struct ifnet_attach_proto_param_v2 *proto_details) { int retval = 0; struct if_proto *ifproto = NULL; - - if (ifp == NULL || protocol == 0 || proto_details == NULL) - return EINVAL; - - ifproto = _MALLOC(sizeof(struct if_proto), M_IFADDR, M_WAITOK); - if (ifproto == 0) { - DLIL_PRINTF("ERROR - dlil failed if_proto allocation\n"); + + ifnet_head_lock_shared(); + if (ifp == NULL || protocol == 0 || proto_details == NULL) { + retval = EINVAL; + goto end; + } + /* Check that the interface is in the global list */ + if (!ifnet_lookup(ifp)) { + retval = ENXIO; + goto end; + } + + ifproto = zalloc(dlif_proto_zone); + if (ifproto == NULL) { retval = ENOMEM; goto end; } bzero(ifproto, sizeof(*ifproto)); - + + /* refcnt held above during lookup */ ifproto->ifp = ifp; ifproto->protocol_family = protocol; ifproto->proto_kpi = kProtoKPI_v2; @@ -2287,49 +2613,24 @@ ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol, ifproto->kpi.v2.detached = proto_details->detached; ifproto->kpi.v2.resolve_multi = proto_details->resolve; ifproto->kpi.v2.send_arp = proto_details->send_arp; - - retval = dlil_attach_protocol_internal(ifproto, - proto_details->demux_list, proto_details->demux_count); - -end: - if (retval && ifproto) - FREE(ifproto, M_IFADDR); - return retval; -} -extern void if_rtproto_del(struct ifnet *ifp, int protocol); + retval = dlil_attach_protocol_internal(ifproto, + proto_details->demux_list, proto_details->demux_count); -static int -dlil_detach_protocol_internal( - struct if_proto *proto) -{ - struct ifnet *ifp = proto->ifp; - u_int32_t proto_family = proto->protocol_family; - struct 
kev_dl_proto_data ev_pr_data; - - if (proto->proto_kpi == kProtoKPI_v1) { - if (proto->kpi.v1.detached) - proto->kpi.v1.detached(ifp, proto->protocol_family); + if (dlil_verbose) { + printf("%s%d: attached v2 protocol %d\n", ifp->if_name, + ifp->if_unit, protocol); } - if (proto->proto_kpi == kProtoKPI_v2) { - if (proto->kpi.v2.detached) - proto->kpi.v2.detached(ifp, proto->protocol_family); + +end: + if (retval != 0 && retval != EEXIST && ifp != NULL) { + DLIL_PRINTF("%s%d: failed to attach v2 protocol %d (err=%d)\n", + ifp->if_name, ifp->if_unit, protocol, retval); } - if_proto_free(proto); - - /* - * Cleanup routes that may still be in the routing table for that interface/protocol pair. - */ - - if_rtproto_del(ifp, proto_family); - - /* the reserved field carries the number of protocol still attached (subject to change) */ - ev_pr_data.proto_family = proto_family; - ev_pr_data.proto_remaining_count = dlil_ifp_proto_count(ifp); - dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED, - (struct net_event_data *)&ev_pr_data, - sizeof(struct kev_dl_proto_data)); - return 0; + ifnet_head_done(); + if (retval != 0 && ifproto != NULL) + zfree(dlif_proto_zone, ifproto); + return (retval); } errno_t @@ -2337,260 +2638,169 @@ ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family) { struct if_proto *proto = NULL; int retval = 0; - int use_reached_zero = 0; - - if (ifp == NULL || proto_family == 0) return EINVAL; - - if ((retval = dlil_write_begin()) != 0) { - if (retval == EDEADLK) { - retval = 0; - dlil_read_begin(); - proto = find_attached_proto(ifp, proto_family); - if (proto == 0) { - retval = ENXIO; - } - else { - proto->detaching = 1; - dlil_detach_waiting = 1; - wakeup(&dlil_detach_waiting); - } - dlil_read_end(); - } + + if (ifp == NULL || proto_family == 0) { + retval = EINVAL; goto end; } - + + ifnet_lock_exclusive(ifp); + /* callee holds a proto refcnt upon success */ proto = find_attached_proto(ifp, proto_family); - if (proto == NULL) { retval = 
ENXIO; - dlil_write_end(); + ifnet_lock_done(ifp); goto end; } - - /* - * Call family module del_proto - */ - + + /* call family module del_proto */ if (ifp->if_del_proto) ifp->if_del_proto(ifp, proto->protocol_family); - SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)], proto, if_proto, next_hash); - - /* - * We can do the rest of the work outside of the write lock. - */ - use_reached_zero = ifp_unuse(ifp); - dlil_write_end(); - - dlil_detach_protocol_internal(proto); + SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)], + proto, if_proto, next_hash); + + if (proto->proto_kpi == kProtoKPI_v1) { + proto->kpi.v1.input = ifproto_media_input_v1; + proto->kpi.v1.pre_output= ifproto_media_preout; + proto->kpi.v1.event = ifproto_media_event; + proto->kpi.v1.ioctl = ifproto_media_ioctl; + proto->kpi.v1.resolve_multi = ifproto_media_resolve_multi; + proto->kpi.v1.send_arp = ifproto_media_send_arp; + } else { + proto->kpi.v2.input = ifproto_media_input_v2; + proto->kpi.v2.pre_output = ifproto_media_preout; + proto->kpi.v2.event = ifproto_media_event; + proto->kpi.v2.ioctl = ifproto_media_ioctl; + proto->kpi.v2.resolve_multi = ifproto_media_resolve_multi; + proto->kpi.v2.send_arp = ifproto_media_send_arp; + } + proto->detached = 1; + ifnet_lock_done(ifp); + + if (dlil_verbose) { + printf("%s%d: detached %s protocol %d\n", ifp->if_name, + ifp->if_unit, (proto->proto_kpi == kProtoKPI_v1) ? + "v1" : "v2", proto_family); + } + + /* release proto refcnt held during protocol attach */ + if_proto_free(proto); /* - * Only handle the case where the interface will go away after - * we've sent the message. This way post message can send the - * message to the interface safely. + * Release proto refcnt held during lookup; the rest of + * protocol detach steps will happen when the last proto + * reference is released. 
*/ - - if (use_reached_zero) - ifp_use_reached_zero(ifp); - + if_proto_free(proto); + end: - return retval; + return (retval); } -/* - * dlil_delayed_detach_thread is responsible for detaching - * protocols, protocol filters, and interface filters after - * an attempt was made to detach one of those items while - * it was not safe to do so (i.e. called dlil_read_begin). - * - * This function will take the dlil write lock and walk - * through each of the interfaces looking for items with - * the detaching flag set. When an item is found, it is - * detached from the interface and placed on a local list. - * After all of the items have been collected, we drop the - * write lock and performed the post detach. This is done - * so we only have to take the write lock once. - * - * When detaching a protocol filter, if we find that we - * have detached the very last protocol and we need to call - * ifp_use_reached_zero, we have to break out of our work - * to drop the write lock so we can call ifp_use_reached_zero. 
- */ - -static void -dlil_delayed_detach_thread(__unused void* foo, __unused wait_result_t wait) + +static errno_t +ifproto_media_input_v1(struct ifnet *ifp, protocol_family_t protocol, + struct mbuf *packet, char *header) { - thread_t self = current_thread(); - int asserted = 0; - - ml_thread_policy(self, MACHINE_GROUP, - (MACHINE_NETWORK_GROUP|MACHINE_NETWORK_NETISR)); +#pragma unused(ifp, protocol, packet, header) + return (ENXIO); +} + +static errno_t +ifproto_media_input_v2(struct ifnet *ifp, protocol_family_t protocol, + struct mbuf *packet) +{ +#pragma unused(ifp, protocol, packet) + return (ENXIO); + +} + +static errno_t +ifproto_media_preout(struct ifnet *ifp, protocol_family_t protocol, + mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type, + char *link_layer_dest) +{ +#pragma unused(ifp, protocol, packet, dest, route, frame_type, link_layer_dest) + return (ENXIO); - - while (1) { - if (dlil_detach_waiting != 0 && dlil_write_begin() == 0) { - struct ifnet *ifp; - struct proto_hash_entry detached_protos; - struct ifnet_filter_head detached_filters; - struct if_proto *proto; - struct if_proto *next_proto; - struct ifnet_filter *filt; - struct ifnet_filter *next_filt; - int reached_zero; - - reached_zero = 0; - - /* Clear the detach waiting flag */ - dlil_detach_waiting = 0; - TAILQ_INIT(&detached_filters); - SLIST_INIT(&detached_protos); - - ifnet_head_lock_shared(); - TAILQ_FOREACH(ifp, &ifnet_head, if_link) { - int i; - - // Look for protocols and protocol filters - for (i = 0; i < PROTO_HASH_SLOTS && !reached_zero; i++) { - struct if_proto **prev_nextptr = &SLIST_FIRST(&ifp->if_proto_hash[i]); - for (proto = *prev_nextptr; proto; proto = *prev_nextptr) { - - // Detach this protocol - if (proto->detaching) { - if (ifp->if_del_proto) - ifp->if_del_proto(ifp, proto->protocol_family); - *prev_nextptr = SLIST_NEXT(proto, next_hash); - SLIST_INSERT_HEAD(&detached_protos, proto, next_hash); - reached_zero = ifp_unuse(ifp); - if 
(reached_zero) { - break; - } - } - else { - // Update prev_nextptr to point to our next ptr - prev_nextptr = &SLIST_NEXT(proto, next_hash); - } - } - } - - // look for interface filters that need to be detached - for (filt = TAILQ_FIRST(&ifp->if_flt_head); filt; filt = next_filt) { - next_filt = TAILQ_NEXT(filt, filt_next); - if (filt->filt_detaching != 0) { - // take this interface filter off the interface filter list - TAILQ_REMOVE(&ifp->if_flt_head, filt, filt_next); - - // put this interface filter on the detached filters list - TAILQ_INSERT_TAIL(&detached_filters, filt, filt_next); - } - } - - if (ifp->if_delayed_detach) { - ifp->if_delayed_detach = 0; - reached_zero = ifp_unuse(ifp); - } - - if (reached_zero) - break; - } - ifnet_head_done(); - dlil_write_end(); - - for (filt = TAILQ_FIRST(&detached_filters); filt; filt = next_filt) { - next_filt = TAILQ_NEXT(filt, filt_next); - /* - * dlil_detach_filter_internal won't remove an item from - * the list if it is already detached (second parameter). - * The item will be freed though. 
- */ - dlil_detach_filter_internal(filt, 1); - } - - for (proto = SLIST_FIRST(&detached_protos); proto; proto = next_proto) { - next_proto = SLIST_NEXT(proto, next_hash); - dlil_detach_protocol_internal(proto); - } - - if (reached_zero) { - ifp_use_reached_zero(ifp); - dlil_detach_waiting = 1; // we may have missed something - } - } - - if (!asserted && dlil_detach_waiting == 0) { - asserted = 1; - assert_wait(&dlil_detach_waiting, THREAD_UNINT); - } - - if (dlil_detach_waiting == 0) { - asserted = 0; - thread_block(dlil_delayed_detach_thread); - } - } } static void -dlil_call_delayed_detach_thread(void) { - dlil_delayed_detach_thread(NULL, THREAD_RESTART); +ifproto_media_event(struct ifnet *ifp, protocol_family_t protocol, + const struct kev_msg *event) +{ +#pragma unused(ifp, protocol, event) +} + +static errno_t +ifproto_media_ioctl(struct ifnet *ifp, protocol_family_t protocol, + unsigned long command, void *argument) +{ +#pragma unused(ifp, protocol, command, argument) + return (ENXIO); +} + +static errno_t +ifproto_media_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr, + struct sockaddr_dl *out_ll, size_t ll_len) +{ +#pragma unused(ifp, proto_addr, out_ll, ll_len) + return (ENXIO); +} + +static errno_t +ifproto_media_send_arp(struct ifnet *ifp, u_short arpop, + const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto, + const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto) +{ +#pragma unused(ifp, arpop, sender_hw, sender_proto, target_hw, target_proto) + return (ENXIO); } extern int if_next_index(void); errno_t -ifnet_attach( - ifnet_t ifp, - const struct sockaddr_dl *ll_addr) +ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr) { - u_int32_t interface_family; struct ifnet *tmp_if; - struct proto_hash_entry *new_proto_list = NULL; - int locked = 0; - - if (ifp == NULL) return EINVAL; - if (ll_addr && ifp->if_addrlen == 0) { - ifp->if_addrlen = ll_addr->sdl_alen; - } - else if (ll_addr && 
ll_addr->sdl_alen != ifp->if_addrlen) { - return EINVAL; - } - - interface_family = ifp->if_family; - - ifnet_head_lock_shared(); + struct ifaddr *ifa; + struct if_data_internal if_data_saved; + struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp; + if (ifp == NULL) + return (EINVAL); + + ifnet_head_lock_exclusive(); /* Verify we aren't already on the list */ TAILQ_FOREACH(tmp_if, &ifnet_head, if_link) { if (tmp_if == ifp) { ifnet_head_done(); - return EEXIST; + return (EEXIST); } } - - ifnet_head_done(); - - if ((ifp->if_eflags & IFEF_REUSE) == 0 || ifp->if_lock == 0) -#if IFNET_RW_LOCK - ifp->if_lock = lck_rw_alloc_init(ifnet_lock_group, ifnet_lock_attr); -#else - ifp->if_lock = lck_mtx_alloc_init(ifnet_lock_group, ifnet_lock_attr); -#endif - if (ifp->if_lock == 0) { - return ENOMEM; + lck_mtx_lock_spin(&ifp->if_ref_lock); + if (ifp->if_refflags & IFRF_ATTACHED) { + panic("%s: flags mismatch (attached set) ifp=%p", + __func__, ifp); + /* NOTREACHED */ } + lck_mtx_unlock(&ifp->if_ref_lock); - if (!(ifp->if_eflags & IFEF_REUSE) || ifp->if_fwd_route_lock == NULL) { - if (ifp->if_fwd_route_lock == NULL) - ifp->if_fwd_route_lock = lck_mtx_alloc_init( - ifnet_lock_group, ifnet_lock_attr); + ifnet_lock_exclusive(ifp); - if (ifp->if_fwd_route_lock == NULL) { -#if IFNET_RW_LOCK - lck_rw_free(ifp->if_lock, ifnet_lock_group); -#else - lck_mtx_free(ifp->if_lock, ifnet_lock_group); -#endif - ifp->if_lock = NULL; - return (ENOMEM); + /* Sanity check */ + VERIFY(ifp->if_detaching_link.tqe_next == NULL); + VERIFY(ifp->if_detaching_link.tqe_prev == NULL); + + if (ll_addr != NULL) { + if (ifp->if_addrlen == 0) { + ifp->if_addrlen = ll_addr->sdl_alen; + } else if (ll_addr->sdl_alen != ifp->if_addrlen) { + ifnet_lock_done(ifp); + ifnet_head_done(); + return (EINVAL); } } @@ -2598,251 +2808,606 @@ ifnet_attach( * Allow interfaces without protocol families to attach * only if they have the necessary fields filled out. 
*/ - - if (ifp->if_add_proto == 0 || ifp->if_del_proto == 0) { - DLIL_PRINTF("dlil Attempt to attach interface without family module - %d\n", - interface_family); - return ENODEV; - } - - if ((ifp->if_eflags & IFEF_REUSE) == 0 || ifp->if_proto_hash == NULL) { - MALLOC(new_proto_list, struct proto_hash_entry*, sizeof(struct proto_hash_entry) * PROTO_HASH_SLOTS, - M_NKE, M_WAITOK); - - if (new_proto_list == 0) { - return ENOBUFS; - } + if (ifp->if_add_proto == NULL || ifp->if_del_proto == NULL) { + DLIL_PRINTF("%s: Attempt to attach interface without " + "family module - %d\n", __func__, ifp->if_family); + ifnet_lock_done(ifp); + ifnet_head_done(); + return (ENODEV); } - dlil_write_begin(); - locked = 1; + /* Allocate protocol hash table */ + VERIFY(ifp->if_proto_hash == NULL); + ifp->if_proto_hash = zalloc(dlif_phash_zone); + if (ifp->if_proto_hash == NULL) { + ifnet_lock_done(ifp); + ifnet_head_done(); + return (ENOBUFS); + } + bzero(ifp->if_proto_hash, dlif_phash_size); + lck_mtx_lock_spin(&ifp->if_flt_lock); + VERIFY(TAILQ_EMPTY(&ifp->if_flt_head)); TAILQ_INIT(&ifp->if_flt_head); - - - if (new_proto_list) { - bzero(new_proto_list, (PROTO_HASH_SLOTS * sizeof(struct proto_hash_entry))); - ifp->if_proto_hash = new_proto_list; - new_proto_list = NULL; - } - - /* old_if_attach */ - { - char workbuf[64]; - int namelen, masklen, socksize, ifasize; - struct ifaddr *ifa = NULL; - - if (ifp->if_snd.ifq_maxlen == 0) - ifp->if_snd.ifq_maxlen = ifqmaxlen; - TAILQ_INIT(&ifp->if_prefixhead); + VERIFY(ifp->if_flt_busy == 0); + VERIFY(ifp->if_flt_waiters == 0); + lck_mtx_unlock(&ifp->if_flt_lock); + + VERIFY(TAILQ_EMPTY(&ifp->if_prefixhead)); + TAILQ_INIT(&ifp->if_prefixhead); + + if (!(dl_if->dl_if_flags & DLIF_REUSE)) { + VERIFY(LIST_EMPTY(&ifp->if_multiaddrs)); LIST_INIT(&ifp->if_multiaddrs); - ifnet_touch_lastchange(ifp); - - /* usecount to track attachment to the ifnet list */ - ifp_use(ifp, kIfNetUseCount_MayBeZero); - - /* Lock the list of interfaces */ - 
ifnet_head_lock_exclusive(); - ifnet_lock_exclusive(ifp); - - if ((ifp->if_eflags & IFEF_REUSE) == 0 || ifp->if_index == 0) { - int idx = if_next_index(); - - if (idx == -1) { - ifnet_lock_done(ifp); - ifnet_head_done(); - ifp_unuse(ifp); - dlil_write_end(); - - return ENOBUFS; - } - ifp->if_index = idx; - } else { - ifa = TAILQ_FIRST(&ifp->if_addrhead); - } - namelen = snprintf(workbuf, sizeof(workbuf), "%s%d", ifp->if_name, ifp->if_unit); -#define _offsetof(t, m) ((uintptr_t)((caddr_t)&((t *)0)->m)) - masklen = _offsetof(struct sockaddr_dl, sdl_data[0]) + namelen; - socksize = masklen + ifp->if_addrlen; -#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof(u_int32_t) - 1))) - if ((u_int32_t)socksize < sizeof(struct sockaddr_dl)) - socksize = sizeof(struct sockaddr_dl); - socksize = ROUNDUP(socksize); - ifasize = sizeof(struct ifaddr) + 2 * socksize; - - /* - * Allocate a new ifa if we don't have one - * or the old one is too small. - */ - if (ifa == NULL || socksize > ifa->ifa_addr->sa_len) { - if (ifa) - if_detach_ifa(ifp, ifa); - ifa = (struct ifaddr*)_MALLOC(ifasize, M_IFADDR, M_WAITOK); - } - - if (ifa) { - struct sockaddr_dl *sdl = (struct sockaddr_dl *)(ifa + 1); - ifnet_addrs[ifp->if_index - 1] = ifa; - bzero(ifa, ifasize); - ifa->ifa_debug |= IFD_ALLOC; - sdl->sdl_len = socksize; - sdl->sdl_family = AF_LINK; - bcopy(workbuf, sdl->sdl_data, namelen); - sdl->sdl_nlen = namelen; - sdl->sdl_index = ifp->if_index; - sdl->sdl_type = ifp->if_type; - if (ll_addr) { - sdl->sdl_alen = ll_addr->sdl_alen; - if (ll_addr->sdl_alen != ifp->if_addrlen) - panic("ifnet_attach - ll_addr->sdl_alen != ifp->if_addrlen"); - bcopy(CONST_LLADDR(ll_addr), LLADDR(sdl), sdl->sdl_alen); - } - ifa->ifa_ifp = ifp; - ifa->ifa_rtrequest = link_rtrequest; - ifa->ifa_addr = (struct sockaddr*)sdl; - sdl = (struct sockaddr_dl*)(socksize + (caddr_t)sdl); - ifa->ifa_netmask = (struct sockaddr*)sdl; - sdl->sdl_len = masklen; - while (namelen != 0) - sdl->sdl_data[--namelen] = 0xff; - } + } - 
TAILQ_INIT(&ifp->if_addrhead); - ifa = ifnet_addrs[ifp->if_index - 1]; - - if (ifa) { - /* - * We don't use if_attach_ifa because we want - * this address to be first on the list. - */ - ifaref(ifa); - ifa->ifa_debug |= IFD_ATTACHED; - TAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link); + VERIFY(ifp->if_allhostsinm == NULL); + VERIFY(TAILQ_EMPTY(&ifp->if_addrhead)); + TAILQ_INIT(&ifp->if_addrhead); + + if (ifp->if_snd.ifq_maxlen == 0) + ifp->if_snd.ifq_maxlen = ifqmaxlen; + + if (ifp->if_index == 0) { + int idx = if_next_index(); + + if (idx == -1) { + ifp->if_index = 0; + ifnet_lock_done(ifp); + ifnet_head_done(); + return (ENOBUFS); } + ifp->if_index = idx; + } + /* There should not be anything occupying this slot */ + VERIFY(ifindex2ifnet[ifp->if_index] == NULL); + + /* allocate (if needed) and initialize a link address */ + VERIFY(!(dl_if->dl_if_flags & DLIF_REUSE) || ifp->if_lladdr != NULL); + ifa = dlil_alloc_lladdr(ifp, ll_addr); + if (ifa == NULL) { + ifnet_lock_done(ifp); + ifnet_head_done(); + return (ENOBUFS); + } + + VERIFY(ifnet_addrs[ifp->if_index - 1] == NULL); + ifnet_addrs[ifp->if_index - 1] = ifa; + + /* make this address the first on the list */ + IFA_LOCK(ifa); + /* hold a reference for ifnet_addrs[] */ + IFA_ADDREF_LOCKED(ifa); + /* if_attach_link_ifa() holds a reference for ifa_link */ + if_attach_link_ifa(ifp, ifa); + IFA_UNLOCK(ifa); + #if CONFIG_MACF_NET - mac_ifnet_label_associate(ifp); + mac_ifnet_label_associate(ifp); #endif - - TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link); - ifindex2ifnet[ifp->if_index] = ifp; - } - /* - * A specific dlil input thread is created per Ethernet/PDP interface. - * pseudo interfaces or other types of interfaces use the main ("loopback") thread. - * If the sysctl "net.link.generic.system.multi_threaded_input" is set to zero, all packets will - * be handled by the main loopback thread, reverting to 10.4.x behaviour. 
- * - */ + TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link); + ifindex2ifnet[ifp->if_index] = ifp; - if (ifp->if_type == IFT_ETHER || ifp->if_type == IFT_PDP) { + /* Hold a reference to the underlying dlil_ifnet */ + ifnet_reference(ifp); + + /* + * A specific dlil input thread is created per Ethernet/cellular + * interface. pseudo interfaces or other types of interfaces use + * the main ("loopback") thread. + * + * If the sysctl "net.link.generic.system.multi_threaded_input" is set + * to zero, all packets will be handled by the main loopback thread, + * reverting to 10.4.x behaviour. + */ + if (dlil_multithreaded_input && + (ifp->if_type == IFT_ETHER || ifp->if_type == IFT_CELLULAR)) { int err; - if (dlil_multithreaded_input > 0) { - ifp->if_input_thread = _MALLOC(sizeof(struct dlil_threading_info), M_NKE, M_WAITOK); - if (ifp->if_input_thread == NULL) - panic("ifnet_attach ifp=%p couldn't alloc threading\n", ifp); - if ((err = dlil_create_input_thread(ifp, ifp->if_input_thread)) != 0) - panic("ifnet_attach ifp=%p couldn't get a thread. err=%d\n", ifp, err); + ifp->if_input_thread = zalloc(dlif_inp_zone); + if (ifp->if_input_thread == NULL) { + panic("%s: ifp=%p couldn't alloc threading", + __func__, ifp); + /* NOTREACHED */ + } + bzero(ifp->if_input_thread, dlif_inp_size); + err = dlil_create_input_thread(ifp, ifp->if_input_thread); + if (err != 0) { + panic("%s: ifp=%p couldn't get a thread. 
" + "err=%d", __func__, ifp, err); + /* NOTREACHED */ + } #ifdef DLIL_DEBUG - printf("ifnet_attach: dlil thread for ifp=%p if_index=%d\n", ifp, ifp->if_index); + printf("%s: dlil thread for ifp=%p if_index=%d\n", + __func__, ifp, ifp->if_index); #endif - } } + + /* Clear stats (save and restore other fields that we care) */ + if_data_saved = ifp->if_data; + bzero(&ifp->if_data, sizeof (ifp->if_data)); + ifp->if_data.ifi_type = if_data_saved.ifi_type; + ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen; + ifp->if_data.ifi_physical = if_data_saved.ifi_physical; + ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen; + ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen; + ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu; + ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate; + ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist; + ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu; + ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu; + ifnet_touch_lastchange(ifp); + + /* Record attach PC stacktrace */ + ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_attach); + + ifp->if_updatemcasts = 0; + if (!LIST_EMPTY(&ifp->if_multiaddrs)) { + struct ifmultiaddr *ifma; + LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + IFMA_LOCK(ifma); + if (ifma->ifma_addr->sa_family == AF_LINK || + ifma->ifma_addr->sa_family == AF_UNSPEC) + ifp->if_updatemcasts++; + IFMA_UNLOCK(ifma); + } + + printf("%s%d: attached with %d suspended link-layer multicast " + "membership(s)\n", ifp->if_name, ifp->if_unit, + ifp->if_updatemcasts); + } + ifnet_lock_done(ifp); ifnet_head_done(); -#if PF + + lck_mtx_lock(&ifp->if_cached_route_lock); + /* Enable forwarding cached route */ + ifp->if_fwd_cacheok = 1; + /* Clean up any existing cached routes */ + if (ifp->if_fwd_route.ro_rt != NULL) + rtfree(ifp->if_fwd_route.ro_rt); + bzero(&ifp->if_fwd_route, sizeof (ifp->if_fwd_route)); + if (ifp->if_src_route.ro_rt != NULL) + rtfree(ifp->if_src_route.ro_rt); + bzero(&ifp->if_src_route, 
sizeof (ifp->if_src_route)); + if (ifp->if_src_route6.ro_rt != NULL) + rtfree(ifp->if_src_route6.ro_rt); + bzero(&ifp->if_src_route6, sizeof (ifp->if_src_route6)); + lck_mtx_unlock(&ifp->if_cached_route_lock); + + ifnet_llreach_ifattach(ifp, (dl_if->dl_if_flags & DLIF_REUSE)); + /* - * Attach packet filter to this interface, if enaled. + * Allocate and attach IGMPv3/MLDv2 interface specific variables + * and trees; do this before the ifnet is marked as attached. + * The ifnet keeps the reference to the info structures even after + * the ifnet is detached, since the network-layer records still + * refer to the info structures even after that. This also + * makes it possible for them to still function after the ifnet + * is recycled or reattached. */ - pf_ifnet_hook(ifp, 1); -#endif /* PF */ - dlil_write_end(); +#if INET + if (IGMP_IFINFO(ifp) == NULL) { + IGMP_IFINFO(ifp) = igmp_domifattach(ifp, M_WAITOK); + VERIFY(IGMP_IFINFO(ifp) != NULL); + } else { + VERIFY(IGMP_IFINFO(ifp)->igi_ifp == ifp); + igmp_domifreattach(IGMP_IFINFO(ifp)); + } +#endif /* INET */ +#if INET6 + if (MLD_IFINFO(ifp) == NULL) { + MLD_IFINFO(ifp) = mld_domifattach(ifp, M_WAITOK); + VERIFY(MLD_IFINFO(ifp) != NULL); + } else { + VERIFY(MLD_IFINFO(ifp)->mli_ifp == ifp); + mld_domifreattach(MLD_IFINFO(ifp)); + } +#endif /* INET6 */ -#if IFNET_ROUTE_REFCNT + /* + * Finally, mark this ifnet as attached. 
+ */ + lck_mtx_lock(rnh_lock); + ifnet_lock_exclusive(ifp); + lck_mtx_lock_spin(&ifp->if_ref_lock); + ifp->if_refflags = IFRF_ATTACHED; + lck_mtx_unlock(&ifp->if_ref_lock); if (net_rtref) { - (void) ifnet_set_idle_flags(ifp, IFRF_IDLE_NOTIFY, + /* boot-args override; enable idle notification */ + (void) ifnet_set_idle_flags_locked(ifp, IFRF_IDLE_NOTIFY, IFRF_IDLE_NOTIFY); + } else { + /* apply previous request(s) to set the idle flags, if any */ + (void) ifnet_set_idle_flags_locked(ifp, ifp->if_idle_new_flags, + ifp->if_idle_new_flags_mask); + } -#endif /* IFNET_ROUTE_REFCNT */ + ifnet_lock_done(ifp); + lck_mtx_unlock(rnh_lock); + +#if PF + /* + * Attach packet filter to this interface, if enabled. + */ + pf_ifnet_hook(ifp, 1); +#endif /* PF */ dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, 0); - return 0; + if (dlil_verbose) { + printf("%s%d: attached%s\n", ifp->if_name, ifp->if_unit, + (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : ""); + } + + return (0); +} + +/* + * Prepare the storage for the first/permanent link address, which must + * have the same lifetime as the ifnet itself. Although the link + * address gets removed from if_addrhead and ifnet_addrs[] at detach time, + * its location in memory must never change as it may still be referred + * to by some parts of the system afterwards (unfortunate implementation + * artifacts inherited from BSD.) + * + * Caller must hold ifnet lock as writer.
+ */ +static struct ifaddr * +dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr) +{ + struct ifaddr *ifa, *oifa; + struct sockaddr_dl *asdl, *msdl; + char workbuf[IFNAMSIZ*2]; + int namelen, masklen, socksize; + struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp; + + ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE); + VERIFY(ll_addr == NULL || ll_addr->sdl_alen == ifp->if_addrlen); + + namelen = snprintf(workbuf, sizeof (workbuf), "%s%d", + ifp->if_name, ifp->if_unit); + masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + namelen; + socksize = masklen + ifp->if_addrlen; +#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof (u_int32_t) - 1))) + if ((u_int32_t)socksize < sizeof (struct sockaddr_dl)) + socksize = sizeof(struct sockaddr_dl); + socksize = ROUNDUP(socksize); +#undef ROUNDUP + + ifa = ifp->if_lladdr; + if (socksize > DLIL_SDLMAXLEN || + (ifa != NULL && ifa != &dl_if->dl_if_lladdr.ifa)) { + /* + * Rare, but in the event that the link address requires + * more storage space than DLIL_SDLMAXLEN, allocate the + * largest possible storages for address and mask, such + * that we can reuse the same space when if_addrlen grows. + * This same space will be used when if_addrlen shrinks. + */ + if (ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa) { + int ifasize = sizeof (*ifa) + 2 * SOCK_MAXADDRLEN; + ifa = _MALLOC(ifasize, M_IFADDR, M_WAITOK | M_ZERO); + if (ifa == NULL) + return (NULL); + ifa_lock_init(ifa); + /* Don't set IFD_ALLOC, as this is permanent */ + ifa->ifa_debug = IFD_LINK; + } + IFA_LOCK(ifa); + /* address and mask sockaddr_dl locations */ + asdl = (struct sockaddr_dl *)(ifa + 1); + bzero(asdl, SOCK_MAXADDRLEN); + msdl = (struct sockaddr_dl *)((char *)asdl + SOCK_MAXADDRLEN); + bzero(msdl, SOCK_MAXADDRLEN); + } else { + VERIFY(ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa); + /* + * Use the storage areas for address and mask within the + * dlil_ifnet structure. This is the most common case. 
+ */ + if (ifa == NULL) { + ifa = &dl_if->dl_if_lladdr.ifa; + ifa_lock_init(ifa); + /* Don't set IFD_ALLOC, as this is permanent */ + ifa->ifa_debug = IFD_LINK; + } + IFA_LOCK(ifa); + /* address and mask sockaddr_dl locations */ + asdl = (struct sockaddr_dl *)&dl_if->dl_if_lladdr.asdl; + bzero(asdl, sizeof (dl_if->dl_if_lladdr.asdl)); + msdl = (struct sockaddr_dl *)&dl_if->dl_if_lladdr.msdl; + bzero(msdl, sizeof (dl_if->dl_if_lladdr.msdl)); + } + + /* hold a permanent reference for the ifnet itself */ + IFA_ADDREF_LOCKED(ifa); + oifa = ifp->if_lladdr; + ifp->if_lladdr = ifa; + + VERIFY(ifa->ifa_debug == IFD_LINK); + ifa->ifa_ifp = ifp; + ifa->ifa_rtrequest = link_rtrequest; + ifa->ifa_addr = (struct sockaddr *)asdl; + asdl->sdl_len = socksize; + asdl->sdl_family = AF_LINK; + bcopy(workbuf, asdl->sdl_data, namelen); + asdl->sdl_nlen = namelen; + asdl->sdl_index = ifp->if_index; + asdl->sdl_type = ifp->if_type; + if (ll_addr != NULL) { + asdl->sdl_alen = ll_addr->sdl_alen; + bcopy(CONST_LLADDR(ll_addr), LLADDR(asdl), asdl->sdl_alen); + } else { + asdl->sdl_alen = 0; + } + ifa->ifa_netmask = (struct sockaddr*)msdl; + msdl->sdl_len = masklen; + while (namelen != 0) + msdl->sdl_data[--namelen] = 0xff; + IFA_UNLOCK(ifa); + + if (oifa != NULL) + IFA_REMREF(oifa); + + return (ifa); +} + +static void +if_purgeaddrs(struct ifnet *ifp) +{ +#if INET + in_purgeaddrs(ifp); +#endif /* INET */ +#if INET6 + in6_purgeaddrs(ifp); +#endif /* INET6 */ +#if NETAT + at_purgeaddrs(ifp); +#endif } errno_t -ifnet_detach( - ifnet_t ifp) +ifnet_detach(ifnet_t ifp) { - struct ifnet_filter *filter; - struct ifnet_filter *filter_next; - int zeroed = 0; - int retval = 0; - struct ifnet_filter_head fhead; - struct dlil_threading_info *inputthread; - - if (ifp == NULL) return EINVAL; - + if (ifp == NULL) + return (EINVAL); + + ifnet_head_lock_exclusive(); + lck_mtx_lock(rnh_lock); ifnet_lock_exclusive(ifp); - - if ((ifp->if_eflags & IFEF_DETACHING) != 0) { + + /* + * Check to see if this interface 
has previously triggered + * aggressive protocol draining; if so, decrement the global + * refcnt and clear PR_AGGDRAIN on the route domain if + * there are no more of such an interface around. + */ + (void) ifnet_set_idle_flags_locked(ifp, 0, ~0); + + lck_mtx_lock_spin(&ifp->if_ref_lock); + if (!(ifp->if_refflags & IFRF_ATTACHED)) { + lck_mtx_unlock(&ifp->if_ref_lock); + ifnet_lock_done(ifp); + lck_mtx_unlock(rnh_lock); + ifnet_head_done(); + return (EINVAL); + } else if (ifp->if_refflags & IFRF_DETACHING) { /* Interface has already been detached */ + lck_mtx_unlock(&ifp->if_ref_lock); ifnet_lock_done(ifp); - return ENXIO; + lck_mtx_unlock(rnh_lock); + ifnet_head_done(); + return (ENXIO); } - + /* Indicate this interface is being detached */ + ifp->if_refflags &= ~IFRF_ATTACHED; + ifp->if_refflags |= IFRF_DETACHING; + lck_mtx_unlock(&ifp->if_ref_lock); + + if (dlil_verbose) + printf("%s%d: detaching\n", ifp->if_name, ifp->if_unit); + /* - * Indicate this interface is being detached. - * - * This should prevent protocols from attaching - * from this point on. Interface will remain on - * the list until all of the protocols are detached. + * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will + * no longer be visible during lookups from this point. 
*/ - ifp->if_eflags |= IFEF_DETACHING; + VERIFY(ifindex2ifnet[ifp->if_index] == ifp); + TAILQ_REMOVE(&ifnet_head, ifp, if_link); + ifp->if_link.tqe_next = NULL; + ifp->if_link.tqe_prev = NULL; + ifindex2ifnet[ifp->if_index] = NULL; + + /* Record detach PC stacktrace */ + ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_detach); + ifnet_lock_done(ifp); - - dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, NULL, 0); - + lck_mtx_unlock(rnh_lock); + ifnet_head_done(); + /* Let BPF know we're detaching */ bpfdetach(ifp); - -#if IFNET_ROUTE_REFCNT + + /* Mark the interface as DOWN */ + if_down(ifp); + + /* Disable forwarding cached route */ + lck_mtx_lock(&ifp->if_cached_route_lock); + ifp->if_fwd_cacheok = 0; + lck_mtx_unlock(&ifp->if_cached_route_lock); + /* - * Check to see if this interface has previously triggered - * aggressive protocol draining; if so, decrement the global - * refcnt and clear PR_AGGDRAIN on the route domain if - * there are no more of such an interface around. + * Drain any deferred IGMPv3/MLDv2 query responses, but keep the + * references to the info structures and leave them attached to + * this ifnet. 
*/ - if (ifp->if_want_aggressive_drain != 0) - (void) ifnet_set_idle_flags(ifp, 0, ~0); -#endif /* IFNET_ROUTE_REFCNT */ - - if ((retval = dlil_write_begin()) != 0) { - if (retval == EDEADLK) { - retval = 0; - - /* We need to perform a delayed detach */ - ifp->if_delayed_detach = 1; - dlil_detach_waiting = 1; - wakeup(&dlil_detach_waiting); +#if INET + igmp_domifdetach(ifp); +#endif /* INET */ +#if INET6 + mld_domifdetach(ifp); +#endif /* INET6 */ + + dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, NULL, 0); + + /* Let worker thread take care of the rest, to avoid reentrancy */ + lck_mtx_lock(&dlil_ifnet_lock); + ifnet_detaching_enqueue(ifp); + lck_mtx_unlock(&dlil_ifnet_lock); + + return (0); +} + +static void +ifnet_detaching_enqueue(struct ifnet *ifp) +{ + lck_mtx_assert(&dlil_ifnet_lock, LCK_MTX_ASSERT_OWNED); + + ++ifnet_detaching_cnt; + VERIFY(ifnet_detaching_cnt != 0); + TAILQ_INSERT_TAIL(&ifnet_detaching_head, ifp, if_detaching_link); + wakeup((caddr_t)&ifnet_delayed_run); +} + +static struct ifnet * +ifnet_detaching_dequeue(void) +{ + struct ifnet *ifp; + + lck_mtx_assert(&dlil_ifnet_lock, LCK_MTX_ASSERT_OWNED); + + ifp = TAILQ_FIRST(&ifnet_detaching_head); + VERIFY(ifnet_detaching_cnt != 0 || ifp == NULL); + if (ifp != NULL) { + VERIFY(ifnet_detaching_cnt != 0); + --ifnet_detaching_cnt; + TAILQ_REMOVE(&ifnet_detaching_head, ifp, if_detaching_link); + ifp->if_detaching_link.tqe_next = NULL; + ifp->if_detaching_link.tqe_prev = NULL; + } + return (ifp); +} + +static void +ifnet_delayed_thread_func(void) +{ + struct ifnet *ifp; + + for (;;) { + lck_mtx_lock(&dlil_ifnet_lock); + while (ifnet_detaching_cnt == 0) { + (void) msleep(&ifnet_delayed_run, &dlil_ifnet_lock, + (PZERO - 1), "ifnet_delayed_thread", NULL); + } + + VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL); + + /* Take care of detaching ifnet */ + ifp = ifnet_detaching_dequeue(); + if (ifp != NULL) { + lck_mtx_unlock(&dlil_ifnet_lock); + ifnet_detach_final(ifp); + } else { + 
lck_mtx_unlock(&dlil_ifnet_lock); } - return retval; } +} -#if PF - /* - * Detach this interface from packet filter, if enabled. +static void +ifnet_detach_final(struct ifnet *ifp) +{ + struct ifnet_filter *filter, *filter_next; + struct ifnet_filter_head fhead; + struct dlil_threading_info *inputthread; + struct ifaddr *ifa; + ifnet_detached_func if_free; + int i; + + lck_mtx_lock(&ifp->if_ref_lock); + if (!(ifp->if_refflags & IFRF_DETACHING)) { + panic("%s: flags mismatch (detaching not set) ifp=%p", + __func__, ifp); + /* NOTREACHED */ + } + + /* Wait until the existing IO references get released + * before we proceed with ifnet_detach */ - pf_ifnet_hook(ifp, 0); -#endif /* PF */ + while (ifp->if_refio > 0) { + printf("%s: Waiting for IO references on %s%d interface " + "to be released\n", __func__, ifp->if_name, ifp->if_unit); + (void) msleep(&(ifp->if_refio), &ifp->if_ref_lock, + (PZERO - 1), "ifnet_ioref_wait", NULL); + } + lck_mtx_unlock(&ifp->if_ref_lock); + + /* Detach interface filters */ + lck_mtx_lock(&ifp->if_flt_lock); + if_flt_monitor_enter(ifp); - /* Steal the list of interface filters */ + lck_mtx_assert(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED); fhead = ifp->if_flt_head; TAILQ_INIT(&ifp->if_flt_head); - /* unuse the interface */ - zeroed = ifp_unuse(ifp); + for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) { + filter_next = TAILQ_NEXT(filter, filt_next); + lck_mtx_unlock(&ifp->if_flt_lock); + + dlil_detach_filter_internal(filter, 1); + lck_mtx_lock(&ifp->if_flt_lock); + } + if_flt_monitor_leave(ifp); + lck_mtx_unlock(&ifp->if_flt_lock); + + /* Tell upper layers to drop their network addresses */ + if_purgeaddrs(ifp); + + ifnet_lock_exclusive(ifp); + + /* Unplumb all protocols */ + for (i = 0; i < PROTO_HASH_SLOTS; i++) { + struct if_proto *proto; + + proto = SLIST_FIRST(&ifp->if_proto_hash[i]); + while (proto != NULL) { + protocol_family_t family = proto->protocol_family; + ifnet_lock_done(ifp); + proto_unplumb(family, ifp); +
ifnet_lock_exclusive(ifp); + proto = SLIST_FIRST(&ifp->if_proto_hash[i]); + } + /* There should not be any protocols left */ + VERIFY(SLIST_EMPTY(&ifp->if_proto_hash[i])); + } + zfree(dlif_phash_zone, ifp->if_proto_hash); + ifp->if_proto_hash = NULL; + + /* Detach (permanent) link address from if_addrhead */ + ifa = TAILQ_FIRST(&ifp->if_addrhead); + VERIFY(ifnet_addrs[ifp->if_index - 1] == ifa); + IFA_LOCK(ifa); + if_detach_link_ifa(ifp, ifa); + IFA_UNLOCK(ifa); + + /* Remove (permanent) link address from ifnet_addrs[] */ + IFA_REMREF(ifa); + ifnet_addrs[ifp->if_index - 1] = NULL; + + /* This interface should not be on {ifnet_head,detaching} */ + VERIFY(ifp->if_link.tqe_next == NULL); + VERIFY(ifp->if_link.tqe_prev == NULL); + VERIFY(ifp->if_detaching_link.tqe_next == NULL); + VERIFY(ifp->if_detaching_link.tqe_prev == NULL); + + /* Prefix list should be empty by now */ + VERIFY(TAILQ_EMPTY(&ifp->if_prefixhead)); + + /* The slot should have been emptied */ + VERIFY(ifindex2ifnet[ifp->if_index] == NULL); + + /* There should not be any addresses left */ + VERIFY(TAILQ_EMPTY(&ifp->if_addrhead)); /* * If thread affinity was set for the workloop thread, we will need @@ -2853,16 +3418,19 @@ ifnet_detach( if (inputthread->net_affinity) { struct thread *tp; - if (inputthread == dlil_lo_thread_ptr) - panic("Thread affinity should not be enabled " - "on the loopback dlil input thread\n"); + if (inputthread == dlil_lo_thread_ptr) { + panic("%s: Thread affinity should not be " + "enabled on the loopback dlil input " + "thread", __func__); + /* NOTREACHED */ + } - lck_mtx_lock(inputthread->input_lck); + lck_mtx_lock_spin(&inputthread->input_lck); tp = inputthread->workloop_thread; inputthread->workloop_thread = NULL; inputthread->tag = 0; inputthread->net_affinity = FALSE; - lck_mtx_unlock(inputthread->input_lck); + lck_mtx_unlock(&inputthread->input_lck); /* Tear down workloop thread affinity */ if (tp != NULL) { @@ -2882,183 +3450,290 @@ ifnet_detach( if (inputthread != 
dlil_lo_thread_ptr) { #ifdef DLIL_DEBUG - printf("ifnet_detach: wakeup thread threadinfo: %p " + printf("%s: wakeup thread threadinfo: %p " "input_thread=%p threads: cur=%d max=%d\n", - inputthread, inputthread->input_thread, + __func__, inputthread, inputthread->input_thread, dlil_multithreaded_input, cur_dlil_input_threads); #endif - lck_mtx_lock(inputthread->input_lck); + lck_mtx_lock_spin(&inputthread->input_lck); inputthread->input_waiting |= DLIL_INPUT_TERMINATE; - if ((inputthread->input_waiting & DLIL_INPUT_RUNNING) == 0) { + if (!(inputthread->input_waiting & DLIL_INPUT_RUNNING)) wakeup((caddr_t)&inputthread->input_waiting); - } - lck_mtx_unlock(inputthread->input_lck); + + lck_mtx_unlock(&inputthread->input_lck); } } - /* last chance to clean up IPv4 forwarding cached route */ - lck_mtx_lock(ifp->if_fwd_route_lock); - if (ifp->if_fwd_route.ro_rt != NULL) { + + /* The driver might unload, so point these to ourselves */ + if_free = ifp->if_free; + ifp->if_output = ifp_if_output; + ifp->if_ioctl = ifp_if_ioctl; + ifp->if_set_bpf_tap = ifp_if_set_bpf_tap; + ifp->if_free = ifp_if_free; + ifp->if_demux = ifp_if_demux; + ifp->if_event = ifp_if_event; + ifp->if_framer = ifp_if_framer; + ifp->if_add_proto = ifp_if_add_proto; + ifp->if_del_proto = ifp_if_del_proto; + ifp->if_check_multi = ifp_if_check_multi; + + ifnet_lock_done(ifp); + +#if PF + /* + * Detach this interface from packet filter, if enabled. 
+ */ + pf_ifnet_hook(ifp, 0); +#endif /* PF */ + + /* Filter list should be empty */ + lck_mtx_lock_spin(&ifp->if_flt_lock); + VERIFY(TAILQ_EMPTY(&ifp->if_flt_head)); + VERIFY(ifp->if_flt_busy == 0); + VERIFY(ifp->if_flt_waiters == 0); + lck_mtx_unlock(&ifp->if_flt_lock); + + /* Last chance to cleanup any cached route */ + lck_mtx_lock(&ifp->if_cached_route_lock); + VERIFY(!ifp->if_fwd_cacheok); + if (ifp->if_fwd_route.ro_rt != NULL) rtfree(ifp->if_fwd_route.ro_rt); - ifp->if_fwd_route.ro_rt = NULL; - } - lck_mtx_unlock(ifp->if_fwd_route_lock); - dlil_write_end(); - - for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) { - filter_next = TAILQ_NEXT(filter, filt_next); - dlil_detach_filter_internal(filter, 1); - } - - if (zeroed != 0) { - ifp_use_reached_zero(ifp); + bzero(&ifp->if_fwd_route, sizeof (ifp->if_fwd_route)); + if (ifp->if_src_route.ro_rt != NULL) + rtfree(ifp->if_src_route.ro_rt); + bzero(&ifp->if_src_route, sizeof (ifp->if_src_route)); + if (ifp->if_src_route6.ro_rt != NULL) + rtfree(ifp->if_src_route6.ro_rt); + bzero(&ifp->if_src_route6, sizeof (ifp->if_src_route6)); + lck_mtx_unlock(&ifp->if_cached_route_lock); + + ifnet_llreach_ifdetach(ifp); + + dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, 0); + + if (if_free != NULL) + if_free(ifp); + + /* + * Finally, mark this ifnet as detached. 
+ */ + lck_mtx_lock_spin(&ifp->if_ref_lock); + if (!(ifp->if_refflags & IFRF_DETACHING)) { + panic("%s: flags mismatch (detaching not set) ifp=%p", + __func__, ifp); + /* NOTREACHED */ } - - return retval; + ifp->if_refflags &= ~IFRF_DETACHING; + lck_mtx_unlock(&ifp->if_ref_lock); + + if (dlil_verbose) + printf("%s%d: detached\n", ifp->if_name, ifp->if_unit); + + /* Release reference held during ifnet attach */ + ifnet_release(ifp); } static errno_t -dlil_recycle_ioctl( - __unused ifnet_t ifnet_ptr, - __unused u_long ioctl_code, - __unused void *ioctl_arg) +ifp_if_output(struct ifnet *ifp, struct mbuf *m) { - return EOPNOTSUPP; +#pragma unused(ifp) + m_freem(m); + return (0); } -static int -dlil_recycle_output( - __unused struct ifnet *ifnet_ptr, - struct mbuf *m) +static errno_t +ifp_if_demux(struct ifnet *ifp, struct mbuf *m, char *fh, protocol_family_t *pf) { - m_freem(m); - return 0; +#pragma unused(ifp, fh, pf) + m_freem(m); + return (EJUSTRETURN); } -static void -dlil_recycle_free( - __unused ifnet_t ifnet_ptr) +static errno_t +ifp_if_add_proto(struct ifnet *ifp, protocol_family_t pf, + const struct ifnet_demux_desc *da, u_int32_t dc) { +#pragma unused(ifp, pf, da, dc) + return (EINVAL); } static errno_t -dlil_recycle_set_bpf_tap( - __unused ifnet_t ifp, - __unused bpf_tap_mode mode, - __unused bpf_packet_func callback) +ifp_if_del_proto(struct ifnet *ifp, protocol_family_t pf) { - /* XXX not sure what to do here */ - return 0; +#pragma unused(ifp, pf) + return (EINVAL); +} + +static errno_t +ifp_if_check_multi(struct ifnet *ifp, const struct sockaddr *sa) +{ +#pragma unused(ifp, sa) + return (EOPNOTSUPP); +} + +static errno_t +ifp_if_framer(struct ifnet *ifp, struct mbuf **m, + const struct sockaddr *sa, const char *ll, const char *t) +{ +#pragma unused(ifp, m, sa, ll, t) + m_freem(*m); + *m = NULL; + return (EJUSTRETURN); +} + +static errno_t +ifp_if_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg) +{ +#pragma unused(ifp, cmd, arg) + return 
(EOPNOTSUPP); +} + +static errno_t +ifp_if_set_bpf_tap(struct ifnet *ifp, bpf_tap_mode tm, bpf_packet_func f) +{ +#pragma unused(ifp, tm, f) + /* XXX not sure what to do here */ + return (0); +} + +static void +ifp_if_free(struct ifnet *ifp) +{ +#pragma unused(ifp) +} + +static void +ifp_if_event(struct ifnet *ifp, const struct kev_msg *e) +{ +#pragma unused(ifp, e) } __private_extern__ -int dlil_if_acquire( - u_int32_t family, - const void *uniqueid, - size_t uniqueid_len, - struct ifnet **ifp) -{ - struct ifnet *ifp1 = NULL; - struct dlil_ifnet *dlifp1 = NULL; - int ret = 0; - - lck_mtx_lock(dlil_ifnet_mutex); - TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) { - - ifp1 = (struct ifnet *)dlifp1; - - if (ifp1->if_family == family) { - - /* same uniqueid and same len or no unique id specified */ - if ((uniqueid_len == dlifp1->if_uniqueid_len) - && !bcmp(uniqueid, dlifp1->if_uniqueid, uniqueid_len)) { - - /* check for matching interface in use */ - if (ifp1->if_eflags & IFEF_INUSE) { - if (uniqueid_len) { - ret = EBUSY; - goto end; - } - } - else { - if (!ifp1->if_lock) - panic("ifp's lock is gone\n"); - ifnet_lock_exclusive(ifp1); - ifp1->if_eflags |= (IFEF_INUSE | IFEF_REUSE); - ifnet_lock_done(ifp1); - *ifp = ifp1; +int dlil_if_acquire(u_int32_t family, const void *uniqueid, + size_t uniqueid_len, struct ifnet **ifp) +{ + struct ifnet *ifp1 = NULL; + struct dlil_ifnet *dlifp1 = NULL; + void *buf, *base, **pbuf; + int ret = 0; + + lck_mtx_lock(&dlil_ifnet_lock); + TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) { + ifp1 = (struct ifnet *)dlifp1; + + if (ifp1->if_family != family) + continue; + + lck_mtx_lock(&dlifp1->dl_if_lock); + /* same uniqueid and same len or no unique id specified */ + if ((uniqueid_len == dlifp1->dl_if_uniqueid_len) && + !bcmp(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len)) { + /* check for matching interface in use */ + if (dlifp1->dl_if_flags & DLIF_INUSE) { + if (uniqueid_len) { + ret = EBUSY; + 
lck_mtx_unlock(&dlifp1->dl_if_lock); goto end; - } - } - } - } - - /* no interface found, allocate a new one */ - MALLOC(dlifp1, struct dlil_ifnet *, sizeof(*dlifp1), M_NKE, M_WAITOK); - if (dlifp1 == 0) { - ret = ENOMEM; - goto end; - } - - bzero(dlifp1, sizeof(*dlifp1)); - - if (uniqueid_len) { - MALLOC(dlifp1->if_uniqueid, void *, uniqueid_len, M_NKE, M_WAITOK); - if (dlifp1->if_uniqueid == 0) { - FREE(dlifp1, M_NKE); - ret = ENOMEM; - goto end; - } - bcopy(uniqueid, dlifp1->if_uniqueid, uniqueid_len); - dlifp1->if_uniqueid_len = uniqueid_len; - } - - ifp1 = (struct ifnet *)dlifp1; - ifp1->if_eflags |= IFEF_INUSE; - ifp1->if_name = dlifp1->if_namestorage; + } + } else { + dlifp1->dl_if_flags |= (DLIF_INUSE|DLIF_REUSE); + lck_mtx_unlock(&dlifp1->dl_if_lock); + *ifp = ifp1; + goto end; + } + } + lck_mtx_unlock(&dlifp1->dl_if_lock); + } + + /* no interface found, allocate a new one */ + buf = zalloc(dlif_zone); + if (buf == NULL) { + ret = ENOMEM; + goto end; + } + bzero(buf, dlif_bufsize); + + /* Get the 64-bit aligned base address for this object */ + base = (void *)P2ROUNDUP((intptr_t)buf + sizeof (u_int64_t), + sizeof (u_int64_t)); + VERIFY(((intptr_t)base + dlif_size) <= ((intptr_t)buf + dlif_bufsize)); + + /* + * Wind back a pointer size from the aligned base and + * save the original address so we can free it later. 
+ */ + pbuf = (void **)((intptr_t)base - sizeof (void *)); + *pbuf = buf; + dlifp1 = base; + + if (uniqueid_len) { + MALLOC(dlifp1->dl_if_uniqueid, void *, uniqueid_len, + M_NKE, M_WAITOK); + if (dlifp1->dl_if_uniqueid == NULL) { + /* NOTE(review): dlifp1 is the aligned base, not the address returned by zalloc (buf, saved in *pbuf); freeing buf looks like the intent here -- confirm */ + zfree(dlif_zone, dlifp1); + ret = ENOMEM; + goto end; + } + bcopy(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len); + dlifp1->dl_if_uniqueid_len = uniqueid_len; + } + + ifp1 = (struct ifnet *)dlifp1; + dlifp1->dl_if_flags = DLIF_INUSE; + if (ifnet_debug) { + dlifp1->dl_if_flags |= DLIF_DEBUG; + dlifp1->dl_if_trace = dlil_if_trace; + } + ifp1->if_name = dlifp1->dl_if_namestorage; #if CONFIG_MACF_NET - mac_ifnet_label_init(ifp1); + mac_ifnet_label_init(ifp1); #endif - TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link); - - *ifp = ifp1; + lck_mtx_init(&dlifp1->dl_if_lock, ifnet_lock_group, ifnet_lock_attr); + lck_rw_init(&ifp1->if_lock, ifnet_lock_group, ifnet_lock_attr); + lck_mtx_init(&ifp1->if_ref_lock, ifnet_lock_group, ifnet_lock_attr); + lck_mtx_init(&ifp1->if_flt_lock, ifnet_lock_group, ifnet_lock_attr); + lck_mtx_init(&ifp1->if_cached_route_lock, ifnet_lock_group, + ifnet_lock_attr); + lck_mtx_init(&ifp1->if_addrconfig_lock, ifnet_lock_group, + ifnet_lock_attr); + lck_rw_init(&ifp1->if_llreach_lock, ifnet_lock_group, ifnet_lock_attr); + + TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link); + + *ifp = ifp1; end: - lck_mtx_unlock(dlil_ifnet_mutex); + lck_mtx_unlock(&dlil_ifnet_lock); - return ret; + VERIFY(dlifp1 == NULL || (IS_P2ALIGNED(dlifp1, sizeof (u_int64_t)) && + IS_P2ALIGNED(&ifp1->if_data, sizeof (u_int64_t)))); + + return (ret); } __private_extern__ void -dlil_if_release( - ifnet_t ifp) -{ - struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp; - - /* Interface does not have a lock until it is attached - radar 3713951 */ - if (ifp->if_lock) - ifnet_lock_exclusive(ifp); - ifp->if_eflags &= ~IFEF_INUSE; - ifp->if_ioctl = dlil_recycle_ioctl; - ifp->if_output = dlil_recycle_output; - ifp->if_free = dlil_recycle_free; -
ifp->if_set_bpf_tap = dlil_recycle_set_bpf_tap; - - strncpy(dlifp->if_namestorage, ifp->if_name, IFNAMSIZ); - ifp->if_name = dlifp->if_namestorage; +dlil_if_release(ifnet_t ifp) +{ + struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp; + + ifnet_lock_exclusive(ifp); + lck_mtx_lock(&dlifp->dl_if_lock); + dlifp->dl_if_flags &= ~DLIF_INUSE; + strncpy(dlifp->dl_if_namestorage, ifp->if_name, IFNAMSIZ); + ifp->if_name = dlifp->dl_if_namestorage; + lck_mtx_unlock(&dlifp->dl_if_lock); #if CONFIG_MACF_NET - /* - * We can either recycle the MAC label here or in dlil_if_acquire(). - * It seems logical to do it here but this means that anything that - * still has a handle on ifp will now see it as unlabeled. - * Since the interface is "dead" that may be OK. Revisit later. - */ - mac_ifnet_label_recycle(ifp); + /* + * We can either recycle the MAC label here or in dlil_if_acquire(). + * It seems logical to do it here but this means that anything that + * still has a handle on ifp will now see it as unlabeled. + * Since the interface is "dead" that may be OK. Revisit later. 
+ */ + mac_ifnet_label_recycle(ifp); #endif - if (ifp->if_lock) - ifnet_lock_done(ifp); - + ifnet_lock_done(ifp); } __private_extern__ void @@ -3081,3 +3756,138 @@ dlil_proto_unplumb_all(struct ifnet *ifp) (void) proto_unplumb(PF_APPLETALK, ifp); #endif /* NETAT */ } + +static void +ifp_src_route_copyout(struct ifnet *ifp, struct route *dst) +{ + lck_mtx_lock_spin(&ifp->if_cached_route_lock); + lck_mtx_convert_spin(&ifp->if_cached_route_lock); + + route_copyout(dst, &ifp->if_src_route, sizeof (*dst)); + + lck_mtx_unlock(&ifp->if_cached_route_lock); +} + +static void +ifp_src_route_copyin(struct ifnet *ifp, struct route *src) +{ + lck_mtx_lock_spin(&ifp->if_cached_route_lock); + lck_mtx_convert_spin(&ifp->if_cached_route_lock); + + if (ifp->if_fwd_cacheok) { + route_copyin(src, &ifp->if_src_route, sizeof (*src)); + } else { + rtfree(src->ro_rt); + src->ro_rt = NULL; + } + lck_mtx_unlock(&ifp->if_cached_route_lock); +} + +#if INET6 +static void +ifp_src_route6_copyout(struct ifnet *ifp, struct route_in6 *dst) +{ + lck_mtx_lock_spin(&ifp->if_cached_route_lock); + lck_mtx_convert_spin(&ifp->if_cached_route_lock); + + route_copyout((struct route *)dst, (struct route *)&ifp->if_src_route6, + sizeof (*dst)); + + lck_mtx_unlock(&ifp->if_cached_route_lock); +} + +static void +ifp_src_route6_copyin(struct ifnet *ifp, struct route_in6 *src) +{ + lck_mtx_lock_spin(&ifp->if_cached_route_lock); + lck_mtx_convert_spin(&ifp->if_cached_route_lock); + + if (ifp->if_fwd_cacheok) { + route_copyin((struct route *)src, + (struct route *)&ifp->if_src_route6, sizeof (*src)); + } else { + rtfree(src->ro_rt); + src->ro_rt = NULL; + } + lck_mtx_unlock(&ifp->if_cached_route_lock); +} +#endif /* INET6 */ + +struct rtentry * +ifnet_cached_rtlookup_inet(struct ifnet *ifp, struct in_addr src_ip) +{ + struct route src_rt; + struct sockaddr_in *dst = (struct sockaddr_in *)(&src_rt.ro_dst); + + ifp_src_route_copyout(ifp, &src_rt); + + if (src_rt.ro_rt == NULL || !(src_rt.ro_rt->rt_flags & RTF_UP) || 
+ src_ip.s_addr != dst->sin_addr.s_addr || + src_rt.ro_rt->generation_id != route_generation) { + if (src_rt.ro_rt != NULL) { + rtfree(src_rt.ro_rt); + src_rt.ro_rt = NULL; + } else if (dst->sin_family != AF_INET) { + bzero(&src_rt.ro_dst, sizeof (src_rt.ro_dst)); + dst->sin_len = sizeof (src_rt.ro_dst); + dst->sin_family = AF_INET; + } + dst->sin_addr = src_ip; + + if (src_rt.ro_rt == NULL) { + src_rt.ro_rt = rtalloc1_scoped((struct sockaddr *)dst, + 0, 0, ifp->if_index); + + if (src_rt.ro_rt != NULL) { + /* retain a ref, copyin consumes one */ + struct rtentry *rte = src_rt.ro_rt; + RT_ADDREF(rte); + ifp_src_route_copyin(ifp, &src_rt); + src_rt.ro_rt = rte; + } + } + } + + return (src_rt.ro_rt); +} + +#if INET6 +struct rtentry* +ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6) +{ + struct route_in6 src_rt; + + ifp_src_route6_copyout(ifp, &src_rt); + + if (src_rt.ro_rt == NULL || !(src_rt.ro_rt->rt_flags & RTF_UP) || + !IN6_ARE_ADDR_EQUAL(src_ip6, &src_rt.ro_dst.sin6_addr) || + src_rt.ro_rt->generation_id != route_generation) { + if (src_rt.ro_rt != NULL) { + rtfree(src_rt.ro_rt); + src_rt.ro_rt = NULL; + } else if (src_rt.ro_dst.sin6_family != AF_INET6) { + bzero(&src_rt.ro_dst, sizeof (src_rt.ro_dst)); + src_rt.ro_dst.sin6_len = sizeof (src_rt.ro_dst); + src_rt.ro_dst.sin6_family = AF_INET6; + } + src_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(ifp, src_ip6); + src_rt.ro_dst.sin6_addr = *src_ip6; + + if (src_rt.ro_rt == NULL) { + src_rt.ro_rt = rtalloc1_scoped( + (struct sockaddr *)&src_rt.ro_dst, 0, 0, + ifp->if_index); + + if (src_rt.ro_rt != NULL) { + /* retain a ref, copyin consumes one */ + struct rtentry *rte = src_rt.ro_rt; + RT_ADDREF(rte); + ifp_src_route6_copyin(ifp, &src_rt); + src_rt.ro_rt = rte; + } + } + } + + return (src_rt.ro_rt); +} +#endif /* INET6 */ diff --git a/bsd/net/dlil.h b/bsd/net/dlil.h index e91dc7a01..db1060db8 100644 --- a/bsd/net/dlil.h +++ b/bsd/net/dlil.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009 Apple Inc. 
All rights reserved. + * Copyright (c) 1999-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,12 +25,6 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - * Copyright (c) 1999 Apple Computer, Inc. - * - * Data Link Inteface Layer - * Author: Ted Walker - */ #ifndef DLIL_H #define DLIL_H #ifdef KERNEL @@ -82,22 +76,22 @@ struct sockaddr_dl; #endif -#ifdef BSD_KERNEL_PRIVATE -struct ifnet_stat_increment_param; struct iff_filter; +#define DLIL_THREADNAME_LEN 32 + struct dlil_threading_info { - mbuf_t mbuf_head; /* start of mbuf list from if */ - mbuf_t mbuf_tail; - u_int32_t mbuf_count; + decl_lck_mtx_data(, input_lck); + lck_grp_t *lck_grp; /* lock group (for lock stats) */ + mbuf_t mbuf_head; /* start of mbuf list from if */ + mbuf_t mbuf_tail; + u_int32_t mbuf_count; boolean_t net_affinity; /* affinity set is available */ - u_int32_t input_waiting; /* DLIL condition of thread */ + u_int32_t input_waiting; /* DLIL condition of thread */ struct thread *input_thread; /* thread data for this input */ struct thread *workloop_thread; /* current workloop thread */ u_int32_t tag; /* current affinity tag */ - lck_mtx_t *input_lck; - lck_grp_t *lck_grp; /* lock group (for lock stats) */ - char input_name[32]; + char input_name[DLIL_THREADNAME_LEN]; #if IFNET_INPUT_SANITY_CHK u_int32_t input_wake_cnt; /* number of times the thread was awaken with packets to process */ u_long input_mbuf_cnt; /* total number of mbuf packets processed by this thread */ @@ -105,8 +99,8 @@ struct dlil_threading_info { }; /* - The following are shared with kpi_protocol.c so that it may wakeup - the input thread to run through packets queued for protocol input. + * The following are shared with kpi_protocol.c so that it may wakeup + * the input thread to run through packets queued for protocol input. 
*/ #define DLIL_INPUT_RUNNING 0x80000000 #define DLIL_INPUT_WAITING 0x40000000 @@ -114,79 +108,52 @@ struct dlil_threading_info { #define DLIL_PROTO_WAITING 0x10000000 #define DLIL_INPUT_TERMINATE 0x08000000 -void dlil_init(void); +extern void dlil_init(void); -errno_t dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, - bpf_packet_func callback); +extern errno_t dlil_set_bpf_tap(ifnet_t, bpf_tap_mode, bpf_packet_func); /* - * Send arp internal bypasses the check for - * IPv4LL. + * Send arp internal bypasses the check for IPv4LL. */ -errno_t -dlil_send_arp_internal( - ifnet_t ifp, - u_int16_t arpop, - const struct sockaddr_dl* sender_hw, - const struct sockaddr* sender_proto, - const struct sockaddr_dl* target_hw, - const struct sockaddr* target_proto); - -int -dlil_output( - ifnet_t ifp, - protocol_family_t proto_family, - mbuf_t packetlist, - void *route, - const struct sockaddr *dest, - int raw); - -errno_t -dlil_resolve_multi( - struct ifnet *ifp, - const struct sockaddr *proto_addr, - struct sockaddr *ll_addr, - size_t ll_len); - -errno_t -dlil_send_arp( - ifnet_t ifp, - u_int16_t arpop, - const struct sockaddr_dl* sender_hw, - const struct sockaddr* sender_proto, - const struct sockaddr_dl* target_hw, - const struct sockaddr* target_proto); - -int dlil_attach_filter(ifnet_t ifp, const struct iff_filter *if_filter, - interface_filter_t *filter_ref); -void dlil_detach_filter(interface_filter_t filter); -int dlil_detach_protocol(ifnet_t ifp, u_int32_t protocol); -extern void dlil_proto_unplumb_all(ifnet_t); +extern errno_t dlil_send_arp_internal(ifnet_t, u_int16_t, + const struct sockaddr_dl *, const struct sockaddr *, + const struct sockaddr_dl *, const struct sockaddr *); -#endif /* BSD_KERNEL_PRIVATE */ +extern int dlil_output(ifnet_t, protocol_family_t, mbuf_t, void *, + const struct sockaddr *, int); -void -dlil_post_msg(struct ifnet *ifp,u_int32_t event_subclass, u_int32_t event_code, - struct net_event_data *event_data, u_int32_t event_data_len); +extern 
void dlil_input_packet_list(struct ifnet *, struct mbuf *); -/* - * dlil_if_acquire is obsolete. Use ifnet_allocate. - */ +extern errno_t dlil_resolve_multi(struct ifnet *, + const struct sockaddr *, struct sockaddr *, size_t); + +extern errno_t dlil_send_arp(ifnet_t, u_int16_t, const struct sockaddr_dl *, + const struct sockaddr *, const struct sockaddr_dl *, + const struct sockaddr *); -int dlil_if_acquire(u_int32_t family, const void *uniqueid, size_t uniqueid_len, - struct ifnet **ifp); - +extern int dlil_attach_filter(ifnet_t, const struct iff_filter *, + interface_filter_t *); +extern void dlil_detach_filter(interface_filter_t); -/* +extern void dlil_proto_unplumb_all(ifnet_t); + +extern void dlil_post_msg(struct ifnet *, u_int32_t, u_int32_t, + struct net_event_data *, u_int32_t); + +/* + * dlil_if_acquire is obsolete. Use ifnet_allocate. + */ +extern int dlil_if_acquire(u_int32_t, const void *, size_t, struct ifnet **); +/* * dlil_if_release is obsolete. The equivalent is called automatically when * an interface is detached. */ +extern void dlil_if_release(struct ifnet *ifp); -void dlil_if_release(struct ifnet *ifp); - -#if IFNET_ROUTE_REFCNT extern u_int32_t ifnet_aggressive_drainers; -#endif /* IFNET_ROUTE_REFCNT */ + +extern errno_t dlil_if_ref(struct ifnet *); +extern errno_t dlil_if_free(struct ifnet *); #endif /* KERNEL_PRIVATE */ #endif /* KERNEL */ diff --git a/bsd/net/ether_if_module.c b/bsd/net/ether_if_module.c index fc1d9e4cf..a1cbfb3d1 100644 --- a/bsd/net/ether_if_module.c +++ b/bsd/net/ether_if_module.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -98,6 +98,9 @@ #include #include #include +#if IF_BRIDGE +#include +#endif /* IF_BRIDGE */ #include @@ -133,7 +136,7 @@ struct en_desc { #endif /* - * Header for the demux list, hangs off of IFP at family_cookie + * Header for the demux list, hangs off of IFP at if_family_cookie */ struct ether_desc_blk_str { @@ -147,19 +150,6 @@ struct ether_desc_blk_str { __private_extern__ u_char etherbroadcastaddr[ETHER_ADDR_LEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; -static __inline__ int -_ether_cmp(const void * a, const void * b) -{ - const u_int16_t * a_s = (const u_int16_t *)a; - const u_int16_t * b_s = (const u_int16_t *)b; - - if (a_s[0] != b_s[0] - || a_s[1] != b_s[1] - || a_s[2] != b_s[2]) { - return (1); - } - return (0); -} /* * Release all descriptor entries owned by this protocol (there may be several). @@ -171,7 +161,7 @@ ether_del_proto( ifnet_t ifp, protocol_family_t protocol_family) { - struct ether_desc_blk_str *desc_blk = (struct ether_desc_blk_str *)ifp->family_cookie; + struct ether_desc_blk_str *desc_blk = (struct ether_desc_blk_str *)ifp->if_family_cookie; u_int32_t current = 0; int found = 0; @@ -187,8 +177,8 @@ ether_del_proto( } if (desc_blk->n_used == 0) { - FREE(ifp->family_cookie, M_IFADDR); - ifp->family_cookie = 0; + FREE(ifp->if_family_cookie, M_IFADDR); + ifp->if_family_cookie = 0; } else { /* Decrement n_max_used */ @@ -207,7 +197,7 @@ ether_add_proto_internal( const struct ifnet_demux_desc *demux) { struct en_desc *ed; - struct ether_desc_blk_str *desc_blk = (struct ether_desc_blk_str *)ifp->family_cookie; + struct ether_desc_blk_str *desc_blk = (struct ether_desc_blk_str *)ifp->if_family_cookie; u_int32_t i; switch (demux->type) { @@ -291,7 +281,7 @@ ether_add_proto_internal( FREE(desc_blk, M_IFADDR); } desc_blk = tmp; - ifp->family_cookie = (uintptr_t)desc_blk; + ifp->if_family_cookie = (uintptr_t)desc_blk; desc_blk->n_count = new_count; } else { @@ -372,7 +362,7 @@ ether_demux( u_int16_t 
type; u_int8_t *data; u_int32_t i = 0; - struct ether_desc_blk_str *desc_blk = (struct ether_desc_blk_str *)ifp->family_cookie; + struct ether_desc_blk_str *desc_blk = (struct ether_desc_blk_str *)ifp->if_family_cookie; u_int32_t maxd = desc_blk ? desc_blk->n_max_used : 0; struct en_desc *ed = desc_blk ? desc_blk->block_ptr : NULL; u_int32_t extProto1 = 0; @@ -386,6 +376,16 @@ ether_demux( m->m_flags |= M_MCAST; } + if (m->m_flags & M_HASFCS) { + /* + * If the M_HASFCS is set by the driver we want to make sure + * that we strip off the trailing FCS data before handing it + * up the stack. + */ + m_adj(m, -ETHER_CRC_LEN); + m->m_flags &= ~M_HASFCS; + } + if (ifp->if_eflags & IFEF_BOND) { /* if we're bonded, bond "protocol" gets all the packets */ *protocol_family = PF_BOND; @@ -632,6 +632,9 @@ __private_extern__ int ether_family_init(void) #if BOND bond_family_init(); #endif /* BOND */ +#if IF_BRIDGE + bridgeattach(0); +#endif /* IF_BRIDGE */ done: diff --git a/bsd/net/ether_inet6_pr_module.c b/bsd/net/ether_inet6_pr_module.c index 371cccfd6..e8411dec6 100644 --- a/bsd/net/ether_inet6_pr_module.c +++ b/bsd/net/ether_inet6_pr_module.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -59,8 +59,6 @@ * */ - - #include #include #include @@ -69,6 +67,7 @@ #include #include #include +#include #include #include @@ -78,6 +77,7 @@ #include #include #include +#include #include #include @@ -90,13 +90,6 @@ #include #endif - - -#include - -#include - - #if LLC && CCITT extern struct ifqueue pkintrq; #endif @@ -114,70 +107,83 @@ extern struct ifqueue pkintrq; * the ether header, which is provided separately. 
*/ static errno_t -ether_inet6_input( - __unused ifnet_t ifp, - protocol_family_t protocol, - mbuf_t packet, - __unused char *header) +ether_inet6_input(ifnet_t ifp, protocol_family_t protocol, + mbuf_t packet, char *header) { - errno_t error; +#pragma unused(ifp, protocol) + struct ether_header *eh = (struct ether_header *)header; + + if (eh->ether_type == htons(ETHERTYPE_IPV6)) { + struct ifnet *mifp; + /* + * Trust the ifp in the mbuf, rather than ifproto's + * since the packet could have been injected via + * a dlil_input_packet_list() using an ifp that is + * different than the one where the packet really + * came from. + */ + mifp = mbuf_pkthdr_rcvif(packet); + + /* Update L2 reachability record, if present (and not bcast) */ + if (bcmp(eh->ether_shost, etherbroadcastaddr, + ETHER_ADDR_LEN) != 0) { + nd6_llreach_set_reachable(mifp, eh->ether_shost, + ETHER_ADDR_LEN); + } - if ((error = proto_input(protocol, packet))) + if (proto_input(protocol, packet) != 0) + m_freem(packet); + } else { m_freem(packet); - return error; + } + + return (EJUSTRETURN); } static errno_t -ether_inet6_pre_output( - ifnet_t ifp, - __unused protocol_family_t protocol_family, - mbuf_t *m0, - const struct sockaddr *dst_netaddr, - void *route, - char *type, - char *edst) +ether_inet6_pre_output(ifnet_t ifp, protocol_family_t protocol_family, + mbuf_t *m0, const struct sockaddr *dst_netaddr, void *route, + char *type, char *edst) { +#pragma unused(protocol_family) errno_t result; - struct sockaddr_dl sdl; - register struct mbuf *m = *m0; + struct sockaddr_dl sdl; + struct mbuf *m = *m0; /* * Tell ether_frameout it's ok to loop packet if necessary */ m->m_flags |= M_LOOP; - - result = nd6_lookup_ipv6(ifp, (const struct sockaddr_in6*)dst_netaddr, - &sdl, sizeof(sdl), route, *m0); - + + result = nd6_lookup_ipv6(ifp, (const struct sockaddr_in6 *)dst_netaddr, + &sdl, sizeof (sdl), route, *m0); + if (result == 0) { - *(u_int16_t*)type = htons(ETHERTYPE_IPV6); + *(u_int16_t *)type = 
htons(ETHERTYPE_IPV6); bcopy(LLADDR(&sdl), edst, sdl.sdl_alen); } - - - return result; + return (result); } static int -ether_inet6_resolve_multi( - ifnet_t ifp, - const struct sockaddr *proto_addr, - struct sockaddr_dl *out_ll, - size_t ll_len) +ether_inet6_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr, + struct sockaddr_dl *out_ll, size_t ll_len) { - static const size_t minsize = offsetof(struct sockaddr_dl, sdl_data[0]) + ETHER_ADDR_LEN; - const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6*)proto_addr; - + static const size_t minsize = + offsetof(struct sockaddr_dl, sdl_data[0]) + ETHER_ADDR_LEN; + const struct sockaddr_in6 *sin6 = + (const struct sockaddr_in6 *)proto_addr; + if (proto_addr->sa_family != AF_INET6) - return EAFNOSUPPORT; - - if (proto_addr->sa_len < sizeof(struct sockaddr_in6)) - return EINVAL; - + return (EAFNOSUPPORT); + + if (proto_addr->sa_len < sizeof (struct sockaddr_in6)) + return (EINVAL); + if (ll_len < minsize) - return EMSGSIZE; - + return (EMSGSIZE); + bzero(out_ll, minsize); out_ll->sdl_len = minsize; out_ll->sdl_family = AF_LINK; @@ -187,20 +193,17 @@ ether_inet6_resolve_multi( out_ll->sdl_alen = ETHER_ADDR_LEN; out_ll->sdl_slen = 0; ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, LLADDR(out_ll)); - - return 0; -} + return (0); +} static errno_t -ether_inet6_prmod_ioctl( - ifnet_t ifp, - __unused protocol_family_t protocol_family, - u_long command, - void *data) +ether_inet6_prmod_ioctl(ifnet_t ifp, protocol_family_t protocol_family, + u_long command, void *data) { - struct ifreq *ifr = (struct ifreq *) data; - int error = 0; +#pragma unused(protocol_family) + struct ifreq *ifr = (struct ifreq *)data; + int error = 0; switch (command) { case SIOCSIFADDR: @@ -211,30 +214,30 @@ ether_inet6_prmod_ioctl( break; case SIOCGIFADDR: - ifnet_lladdr_copy_bytes(ifp, ifr->ifr_addr.sa_data, ETHER_ADDR_LEN); - break; + (void) ifnet_lladdr_copy_bytes(ifp, ifr->ifr_addr.sa_data, + ETHER_ADDR_LEN); + break; - default: - error = 
EOPNOTSUPP; - break; - } - return (error); + default: + error = EOPNOTSUPP; + break; + } + return (error); } errno_t -ether_attach_inet6( - struct ifnet *ifp, - __unused protocol_family_t protocol_family) +ether_attach_inet6(struct ifnet *ifp, protocol_family_t protocol_family) { +#pragma unused(protocol_family) struct ifnet_attach_proto_param proto; struct ifnet_demux_desc demux[1]; - u_short en_6native=htons(ETHERTYPE_IPV6); + u_short en_6native = htons(ETHERTYPE_IPV6); errno_t error; - - bzero(&proto, sizeof(proto)); + + bzero(&proto, sizeof (proto)); demux[0].type = DLIL_DESC_ETYPE2; demux[0].data = &en_6native; - demux[0].datalen = sizeof(en_6native); + demux[0].datalen = sizeof (en_6native); proto.demux_list = demux; proto.demux_count = 1; proto.input = ether_inet6_input; @@ -243,24 +246,15 @@ ether_attach_inet6( proto.resolve = ether_inet6_resolve_multi; error = ifnet_attach_protocol(ifp, protocol_family, &proto); if (error && error != EEXIST) { - printf("WARNING: ether_attach_inet6 can't attach ipv6 to %s%d\n", - ifp->if_name, ifp->if_unit); + printf("WARNING: %s can't attach ipv6 to %s%d\n", __func__, + ifp->if_name, ifp->if_unit); } - - return error; + + return (error); } void -ether_detach_inet6( - struct ifnet *ifp, - protocol_family_t protocol_family) +ether_detach_inet6(struct ifnet *ifp, protocol_family_t protocol_family) { - errno_t error; - - error = ifnet_detach_protocol(ifp, protocol_family); - if (error && error != ENOENT) { - printf("WARNING: ether_detach_inet6 can't detach ipv6 from %s%d\n", - ifp->if_name, ifp->if_unit); - } + (void) ifnet_detach_protocol(ifp, protocol_family); } - diff --git a/bsd/net/ether_inet_pr_module.c b/bsd/net/ether_inet_pr_module.c index 422866e73..12a8ead3c 100644 --- a/bsd/net/ether_inet_pr_module.c +++ b/bsd/net/ether_inet_pr_module.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -103,64 +103,62 @@ #include #endif -/* Local function declerations */ +/* Local function declarations */ extern void *kdp_get_interface(void); extern void kdp_set_ip_and_mac_addresses(struct in_addr *ipaddr, - struct ether_addr *macaddr); + struct ether_addr *macaddr); -static __inline__ void -_ip_copy(struct in_addr * dst, const struct in_addr * src) -{ - *dst = *src; - return; -} +#define _ip_copy(dst, src) \ + (*(dst) = *(src)) static void -ether_inet_arp_input( - struct mbuf *m) +ether_inet_arp_input(struct ifnet *ifp, struct mbuf *m) { struct ether_arp *ea; struct sockaddr_dl sender_hw; struct sockaddr_in sender_ip; struct sockaddr_in target_ip; - - if (mbuf_len(m) < sizeof(*ea) && - mbuf_pullup(&m, sizeof(*ea)) != 0) + + if (mbuf_len(m) < sizeof (*ea) && mbuf_pullup(&m, sizeof (*ea)) != 0) return; - + ea = mbuf_data(m); - + /* Verify this is an ethernet/ip arp and address lengths are correct */ if (ntohs(ea->arp_hrd) != ARPHRD_ETHER || - ntohs(ea->arp_pro) != ETHERTYPE_IP || - ea->arp_pln != sizeof(struct in_addr) || - ea->arp_hln != ETHER_ADDR_LEN) { - mbuf_free(m); + ntohs(ea->arp_pro) != ETHERTYPE_IP || + ea->arp_pln != sizeof (struct in_addr) || + ea->arp_hln != ETHER_ADDR_LEN) { + mbuf_freem(m); return; } - + /* Verify the sender is not broadcast */ if (bcmp(ea->arp_sha, etherbroadcastaddr, ETHER_ADDR_LEN) == 0) { - mbuf_free(m); + mbuf_freem(m); return; } - - bzero(&sender_ip, sizeof(sender_ip)); - sender_ip.sin_len = sizeof(sender_ip); + + bzero(&sender_ip, sizeof (sender_ip)); + sender_ip.sin_len = sizeof (sender_ip); sender_ip.sin_family = AF_INET; _ip_copy(&sender_ip.sin_addr, (const struct in_addr *)ea->arp_spa); target_ip = sender_ip; _ip_copy(&target_ip.sin_addr, (const struct in_addr *)ea->arp_tpa); - - bzero(&sender_hw, sizeof(sender_hw)); - sender_hw.sdl_len = sizeof(sender_hw); + + bzero(&sender_hw, sizeof (sender_hw)); + sender_hw.sdl_len = sizeof (sender_hw); sender_hw.sdl_family = AF_LINK; 
sender_hw.sdl_type = IFT_ETHER; sender_hw.sdl_alen = ETHER_ADDR_LEN; bcopy(ea->arp_sha, LLADDR(&sender_hw), ETHER_ADDR_LEN); - - arp_ip_handle_input(mbuf_pkthdr_rcvif(m), ntohs(ea->arp_op), &sender_hw, &sender_ip, &target_ip); - mbuf_free(m); + + /* update L2 reachability record, if present */ + arp_llreach_set_reachable(ifp, LLADDR(&sender_hw), ETHER_ADDR_LEN); + + arp_ip_handle_input(ifp, ntohs(ea->arp_op), &sender_hw, &sender_ip, + &target_ip); + mbuf_freem(m); } /* @@ -169,120 +167,131 @@ ether_inet_arp_input( * the ether header, which is provided separately. */ static errno_t -ether_inet_input( - __unused ifnet_t ifp, - __unused protocol_family_t protocol_family, - mbuf_t m_list) +ether_inet_input(ifnet_t ifp, protocol_family_t protocol_family, + mbuf_t m_list) { +#pragma unused(ifp, protocol_family) mbuf_t m; mbuf_t *tailptr = &m_list; mbuf_t nextpkt; - + /* Strip ARP and non-IP packets out of the list */ for (m = m_list; m; m = nextpkt) { - struct ether_header *eh = mbuf_pkthdr_header(m); - - nextpkt = m->m_nextpkt; - - if (eh->ether_type == htons(ETHERTYPE_IP)) { - /* put this packet in the list */ - *tailptr = m; - tailptr = &m->m_nextpkt; - } - else { - /* Pass ARP packets to arp input */ + struct ether_header *eh = mbuf_pkthdr_header(m); + struct ifnet *mifp; + + /* + * Trust the ifp in the mbuf, rather than ifproto's + * since the packet could have been injected via + * a dlil_input_packet_list() using an ifp that is + * different than the one where the packet really + * came from. + */ + mifp = mbuf_pkthdr_rcvif(m); + + nextpkt = m->m_nextpkt; + + if (eh->ether_type == htons(ETHERTYPE_IP)) { + /* + * Update L2 reachability record, if present + * (and if not a broadcast sender). 
+ */ + if (bcmp(eh->ether_shost, etherbroadcastaddr, + ETHER_ADDR_LEN) != 0) { + arp_llreach_set_reachable(mifp, eh->ether_shost, + ETHER_ADDR_LEN); + } + /* put this packet in the list */ + *tailptr = m; + tailptr = &m->m_nextpkt; + } else { + /* Pass ARP packets to arp input */ m->m_nextpkt = NULL; - if (eh->ether_type == htons(ETHERTYPE_ARP)) - ether_inet_arp_input(m); - else - mbuf_freem(m); - } + if (eh->ether_type == htons(ETHERTYPE_ARP)) + ether_inet_arp_input(mifp, m); + else + mbuf_freem(m); + } } - + *tailptr = NULL; - + /* Pass IP list to ip input */ - if (m_list != NULL && proto_input(PF_INET, m_list) != 0) - { + if (m_list != NULL && proto_input(PF_INET, m_list) != 0) { mbuf_freem_list(m_list); } - - return 0; + + return (EJUSTRETURN); } static errno_t -ether_inet_pre_output( - ifnet_t ifp, - __unused protocol_family_t protocol_family, - mbuf_t *m0, - const struct sockaddr *dst_netaddr, - void* route, - char *type, - char *edst) +ether_inet_pre_output(ifnet_t ifp, protocol_family_t protocol_family, + mbuf_t *m0, const struct sockaddr *dst_netaddr, + void *route, char *type, char *edst) { - register struct mbuf *m = *m0; - const struct ether_header *eh; - errno_t result = 0; - - - if ((ifp->if_flags & (IFF_UP|IFF_RUNNING)) != (IFF_UP|IFF_RUNNING)) - return ENETDOWN; - - /* - * Tell ether_frameout it's ok to loop packet unless negated below. 
- */ - m->m_flags |= M_LOOP; - - switch (dst_netaddr->sa_family) { - - case AF_INET: { - struct sockaddr_dl ll_dest; - result = arp_lookup_ip(ifp, (const struct sockaddr_in*)dst_netaddr, - &ll_dest, sizeof(ll_dest), (route_t)route, *m0); - if (result == 0) { - bcopy(LLADDR(&ll_dest), edst, ETHER_ADDR_LEN); - *(u_int16_t*)type = htons(ETHERTYPE_IP); - } - } - break; +#pragma unused(protocol_family) + struct mbuf *m = *m0; + const struct ether_header *eh; + errno_t result = 0; - case pseudo_AF_HDRCMPLT: - case AF_UNSPEC: - m->m_flags &= ~M_LOOP; - eh = (const struct ether_header *)dst_netaddr->sa_data; - (void)memcpy(edst, eh->ether_dhost, 6); - *(u_short *)type = eh->ether_type; - break; - - default: - printf("%s%d: can't handle af%d\n", ifp->if_name, ifp->if_unit, - dst_netaddr->sa_family); - - result = EAFNOSUPPORT; - } - - return result; + if ((ifp->if_flags & (IFF_UP|IFF_RUNNING)) != (IFF_UP|IFF_RUNNING)) + return (ENETDOWN); + + /* + * Tell ether_frameout it's ok to loop packet unless negated below. 
+ */ + m->m_flags |= M_LOOP; + + switch (dst_netaddr->sa_family) { + case AF_INET: { + struct sockaddr_dl ll_dest; + + result = arp_lookup_ip(ifp, + (const struct sockaddr_in *)dst_netaddr, &ll_dest, + sizeof (ll_dest), (route_t)route, *m0); + if (result == 0) { + bcopy(LLADDR(&ll_dest), edst, ETHER_ADDR_LEN); + *(u_int16_t *)type = htons(ETHERTYPE_IP); + } + break; + } + + case pseudo_AF_HDRCMPLT: + case AF_UNSPEC: + m->m_flags &= ~M_LOOP; + eh = (const struct ether_header *)dst_netaddr->sa_data; + (void) memcpy(edst, eh->ether_dhost, 6); + *(u_short *)type = eh->ether_type; + break; + + default: + printf("%s%d: can't handle af%d\n", ifp->if_name, ifp->if_unit, + dst_netaddr->sa_family); + + result = EAFNOSUPPORT; + break; + } + + return (result); } static errno_t -ether_inet_resolve_multi( - ifnet_t ifp, - const struct sockaddr *proto_addr, - struct sockaddr_dl *out_ll, - size_t ll_len) +ether_inet_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr, + struct sockaddr_dl *out_ll, size_t ll_len) { - static const size_t minsize = offsetof(struct sockaddr_dl, sdl_data[0]) + ETHER_ADDR_LEN; - const struct sockaddr_in *sin = (const struct sockaddr_in*)proto_addr; - + static const size_t minsize = + offsetof(struct sockaddr_dl, sdl_data[0]) + ETHER_ADDR_LEN; + const struct sockaddr_in *sin = (const struct sockaddr_in *)proto_addr; + if (proto_addr->sa_family != AF_INET) - return EAFNOSUPPORT; - - if (proto_addr->sa_len < sizeof(struct sockaddr_in)) - return EINVAL; + return (EAFNOSUPPORT); + + if (proto_addr->sa_len < sizeof (struct sockaddr_in)) + return (EINVAL); if (ll_len < minsize) - return EMSGSIZE; - + return (EMSGSIZE); + bzero(out_ll, minsize); out_ll->sdl_len = minsize; out_ll->sdl_family = AF_LINK; @@ -292,141 +301,128 @@ ether_inet_resolve_multi( out_ll->sdl_alen = ETHER_ADDR_LEN; out_ll->sdl_slen = 0; ETHER_MAP_IP_MULTICAST(&sin->sin_addr, LLADDR(out_ll)); - - return 0; + + return (0); } static errno_t -ether_inet_prmod_ioctl( - ifnet_t ifp, - 
__unused protocol_family_t protocol_family, - u_long command, - void *data) +ether_inet_prmod_ioctl(ifnet_t ifp, protocol_family_t protocol_family, + u_long command, void *data) { - ifaddr_t ifa = data; - struct ifreq *ifr = data; - int error = 0; - +#pragma unused(protocol_family) + ifaddr_t ifa = data; + struct ifreq *ifr = data; + int error = 0; + + switch (command) { + case SIOCSIFADDR: + case SIOCAIFADDR: + if (!(ifnet_flags(ifp) & IFF_RUNNING)) { + ifnet_set_flags(ifp, IFF_UP, IFF_UP); + ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL); + } - switch (command) { - case SIOCSIFADDR: - case SIOCAIFADDR: - if ((ifnet_flags(ifp) & IFF_RUNNING) == 0) { - ifnet_set_flags(ifp, IFF_UP, IFF_UP); - ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL); - } + if (ifaddr_address_family(ifa) != AF_INET) + break; - switch (ifaddr_address_family(ifa)) { - - case AF_INET: - - inet_arp_init_ifaddr(ifp, ifa); - /* - * Register new IP and MAC addresses with the kernel - * debugger if the interface is the same as was registered - * by IOKernelDebugger. If no interface was registered, - * fall back and just match against en0 interface. - * Do this only for the first address of the interface - * and not for aliases. - */ - if (command == SIOCSIFADDR && - ((kdp_get_interface() != 0 && - kdp_get_interface() == ifp->if_softc) || - (kdp_get_interface() == 0 && ifp->if_unit == 0))) + inet_arp_init_ifaddr(ifp, ifa); + /* + * Register new IP and MAC addresses with the kernel + * debugger if the interface is the same as was registered + * by IOKernelDebugger. If no interface was registered, + * fall back and just match against en0 interface. + * Do this only for the first address of the interface + * and not for aliases. 
+ */ + if (command == SIOCSIFADDR && + ((kdp_get_interface() != 0 && + kdp_get_interface() == ifp->if_softc) || + (kdp_get_interface() == 0 && ifp->if_unit == 0))) kdp_set_ip_and_mac_addresses(&(IA_SIN(ifa)->sin_addr), ifnet_lladdr(ifp)); + break; - break; - - default: - break; - } - - break; - - case SIOCGIFADDR: - ifnet_lladdr_copy_bytes(ifp, ifr->ifr_addr.sa_data, ETHER_ADDR_LEN); + case SIOCGIFADDR: + ifnet_lladdr_copy_bytes(ifp, ifr->ifr_addr.sa_data, + ETHER_ADDR_LEN); break; - default: + default: error = EOPNOTSUPP; break; - } + } - return (error); + return (error); } static void -ether_inet_event( - ifnet_t ifp, - __unused protocol_family_t protocol, - const struct kev_msg *event) +ether_inet_event(ifnet_t ifp, protocol_family_t protocol, + const struct kev_msg *event) { - ifaddr_t *addresses; - +#pragma unused(protocol) + ifaddr_t *addresses; + if (event->vendor_code != KEV_VENDOR_APPLE || - event->kev_class != KEV_NETWORK_CLASS || - event->kev_subclass != KEV_DL_SUBCLASS || - event->event_code != KEV_DL_LINK_ADDRESS_CHANGED) { + event->kev_class != KEV_NETWORK_CLASS || + event->kev_subclass != KEV_DL_SUBCLASS || + event->event_code != KEV_DL_LINK_ADDRESS_CHANGED) { return; } - + if (ifnet_get_address_list_family(ifp, &addresses, AF_INET) == 0) { int i; - + for (i = 0; addresses[i] != NULL; i++) { inet_arp_init_ifaddr(ifp, addresses[i]); } - + ifnet_free_address_list(addresses); } } static errno_t -ether_inet_arp( - ifnet_t ifp, - u_short arpop, - const struct sockaddr_dl* sender_hw, - const struct sockaddr* sender_proto, - const struct sockaddr_dl* target_hw, - const struct sockaddr* target_proto) +ether_inet_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw, + const struct sockaddr *sender_proto, const struct sockaddr_dl *target_hw, + const struct sockaddr *target_proto) { mbuf_t m; errno_t result; struct ether_header *eh; struct ether_arp *ea; - const struct sockaddr_in* sender_ip = (const struct sockaddr_in*)sender_proto; - const 
struct sockaddr_in* target_ip = (const struct sockaddr_in*)target_proto; + const struct sockaddr_in *sender_ip = + (const struct sockaddr_in *)sender_proto; + const struct sockaddr_in *target_ip = + (const struct sockaddr_in *)target_proto; char *datap; - + if (target_ip == NULL) - return EINVAL; - + return (EINVAL); + if ((sender_ip && sender_ip->sin_family != AF_INET) || target_ip->sin_family != AF_INET) - return EAFNOSUPPORT; - + return (EAFNOSUPPORT); + result = mbuf_gethdr(MBUF_DONTWAIT, MBUF_TYPE_DATA, &m); if (result != 0) - return result; - - mbuf_setlen(m, sizeof(*ea)); - mbuf_pkthdr_setlen(m, sizeof(*ea)); - + return (result); + + mbuf_setlen(m, sizeof (*ea)); + mbuf_pkthdr_setlen(m, sizeof (*ea)); + /* Move the data pointer in the mbuf to the end, aligned to 4 bytes */ datap = mbuf_datastart(m); datap += mbuf_trailingspace(m); datap -= (((uintptr_t)datap) & 0x3); - mbuf_setdata(m, datap, sizeof(*ea)); + mbuf_setdata(m, datap, sizeof (*ea)); ea = mbuf_data(m); - + /* * Prepend the ethernet header, we will send the raw frame; * callee frees the original mbuf when allocation fails. 
*/ - result = mbuf_prepend(&m, sizeof(*eh), MBUF_DONTWAIT); + result = mbuf_prepend(&m, sizeof (*eh), MBUF_DONTWAIT); if (result != 0) - return result; + return (result); eh = mbuf_data(m); eh->ether_type = htons(ETHERTYPE_ARP); @@ -434,108 +430,108 @@ ether_inet_arp( #if CONFIG_MACF_NET mac_mbuf_label_associate_linklayer(ifp, m); #endif - + /* Fill out the arp header */ ea->arp_pro = htons(ETHERTYPE_IP); - ea->arp_hln = sizeof(ea->arp_sha); - ea->arp_pln = sizeof(ea->arp_spa); + ea->arp_hln = sizeof (ea->arp_sha); + ea->arp_pln = sizeof (ea->arp_spa); ea->arp_hrd = htons(ARPHRD_ETHER); ea->arp_op = htons(arpop); - + /* Sender Hardware */ if (sender_hw != NULL) { - bcopy(CONST_LLADDR(sender_hw), ea->arp_sha, sizeof(ea->arp_sha)); - } - else { + bcopy(CONST_LLADDR(sender_hw), ea->arp_sha, + sizeof (ea->arp_sha)); + } else { ifnet_lladdr_copy_bytes(ifp, ea->arp_sha, ETHER_ADDR_LEN); } - ifnet_lladdr_copy_bytes(ifp, eh->ether_shost, sizeof(eh->ether_shost)); - + ifnet_lladdr_copy_bytes(ifp, eh->ether_shost, sizeof (eh->ether_shost)); + /* Sender IP */ if (sender_ip != NULL) { - bcopy(&sender_ip->sin_addr, ea->arp_spa, sizeof(ea->arp_spa)); - } - else { + bcopy(&sender_ip->sin_addr, ea->arp_spa, sizeof (ea->arp_spa)); + } else { struct ifaddr *ifa; - + /* Look for an IP address to use as our source */ ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET) + IFA_LOCK(ifa); + if (ifa->ifa_addr != NULL && + ifa->ifa_addr->sa_family == AF_INET) { + bcopy(&((struct sockaddr_in *)ifa->ifa_addr)-> + sin_addr, ea->arp_spa, sizeof(ea->arp_spa)); + IFA_UNLOCK(ifa); break; - } - if (ifa) { - bcopy(&((struct sockaddr_in*)ifa->ifa_addr)->sin_addr, ea->arp_spa, - sizeof(ea->arp_spa)); + } + IFA_UNLOCK(ifa); } ifnet_lock_done(ifp); - + if (ifa == NULL) { - mbuf_free(m); - return ENXIO; + mbuf_freem(m); + return (ENXIO); } } - + /* Target Hardware */ - if (target_hw == 0) { - bzero(ea->arp_tha, 
sizeof(ea->arp_tha)); - bcopy(etherbroadcastaddr, eh->ether_dhost, sizeof(eh->ether_dhost)); - } - else { - bcopy(CONST_LLADDR(target_hw), ea->arp_tha, sizeof(ea->arp_tha)); - bcopy(CONST_LLADDR(target_hw), eh->ether_dhost, sizeof(eh->ether_dhost)); + if (target_hw == NULL) { + bzero(ea->arp_tha, sizeof (ea->arp_tha)); + bcopy(etherbroadcastaddr, eh->ether_dhost, + sizeof (eh->ether_dhost)); + } else { + bcopy(CONST_LLADDR(target_hw), ea->arp_tha, + sizeof (ea->arp_tha)); + bcopy(CONST_LLADDR(target_hw), eh->ether_dhost, + sizeof (eh->ether_dhost)); } - + /* Target IP */ - bcopy(&target_ip->sin_addr, ea->arp_tpa, sizeof(ea->arp_tpa)); - + bcopy(&target_ip->sin_addr, ea->arp_tpa, sizeof (ea->arp_tpa)); + ifnet_output_raw(ifp, PF_INET, m); - - return 0; + + return (0); } errno_t -ether_attach_inet( - struct ifnet *ifp, - __unused protocol_family_t proto_family) +ether_attach_inet(struct ifnet *ifp, protocol_family_t proto_family) { +#pragma unused(proto_family) struct ifnet_attach_proto_param_v2 proto; struct ifnet_demux_desc demux[2]; - u_short en_native=htons(ETHERTYPE_IP); - u_short arp_native=htons(ETHERTYPE_ARP); + u_short en_native = htons(ETHERTYPE_IP); + u_short arp_native = htons(ETHERTYPE_ARP); errno_t error; - - bzero(&demux[0], sizeof(demux)); + + bzero(&demux[0], sizeof (demux)); demux[0].type = DLIL_DESC_ETYPE2; demux[0].data = &en_native; - demux[0].datalen = sizeof(en_native); + demux[0].datalen = sizeof (en_native); demux[1].type = DLIL_DESC_ETYPE2; demux[1].data = &arp_native; - demux[1].datalen = sizeof(arp_native); + demux[1].datalen = sizeof (arp_native); - bzero(&proto, sizeof(proto)); + bzero(&proto, sizeof (proto)); proto.demux_list = demux; - proto.demux_count = sizeof(demux) / sizeof(demux[0]); + proto.demux_count = sizeof (demux) / sizeof (demux[0]); proto.input = ether_inet_input; proto.pre_output = ether_inet_pre_output; proto.ioctl = ether_inet_prmod_ioctl; proto.event = ether_inet_event; proto.resolve = ether_inet_resolve_multi; 
proto.send_arp = ether_inet_arp; - + error = ifnet_attach_protocol_v2(ifp, proto_family, &proto); if (error && error != EEXIST) { - printf("WARNING: ether_attach_inet can't attach ip to %s%d\n", - ifp->if_name, ifp->if_unit); + printf("WARNING: %s can't attach ip to %s%d\n", __func__, + ifp->if_name, ifp->if_unit); } - return error; + return (error); } void -ether_detach_inet( - struct ifnet *ifp, - protocol_family_t proto_family) +ether_detach_inet(struct ifnet *ifp, protocol_family_t proto_family) { - (void)ifnet_detach_protocol(ifp, proto_family); + (void) ifnet_detach_protocol(ifp, proto_family); } - diff --git a/bsd/net/ethernet.h b/bsd/net/ethernet.h index 00b7fa5fb..aea52bc20 100644 --- a/bsd/net/ethernet.h +++ b/bsd/net/ethernet.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000,2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -65,6 +65,14 @@ */ #define ETHER_MAX_LEN 1518 +/* + * Mbuf adjust factor to force 32-bit alignment of IP header. + * Drivers should do m_adj(m, ETHER_ALIGN) when setting up a + * receive so the upper layers get the IP header properly aligned + * past the 14-byte Ethernet header. + */ +#define ETHER_ALIGN 2 /* driver adjust for IP hdr alignment */ + /* * A macro to validate a length with */ @@ -120,7 +128,23 @@ struct ether_addr *ether_aton(const char *); #ifdef BSD_KERNEL_PRIVATE extern u_char etherbroadcastaddr[ETHER_ADDR_LEN]; -#endif + + +static __inline__ int +_ether_cmp(const void * a, const void * b) +{ + const u_int16_t * a_s = (const u_int16_t *)a; + const u_int16_t * b_s = (const u_int16_t *)b; + + if (a_s[0] != b_s[0] + || a_s[1] != b_s[1] + || a_s[2] != b_s[2]) { + return (1); + } + return (0); +} + +#endif /* BSD_KERNEL_PRIVATE */ #define ETHER_IS_MULTICAST(addr) (*(addr) & 0x01) /* is address mcast/bcast? 
*/ diff --git a/bsd/net/if.c b/bsd/net/if.c index 02b698007..26314b948 100644 --- a/bsd/net/if.c +++ b/bsd/net/if.c @@ -81,9 +81,13 @@ #include #include #include +#include +#include #include +#include + #include #include #include @@ -106,15 +110,14 @@ #include #include #include +#include #if INET6 #include #include +#include #endif #endif -extern int dlil_multithreaded_input; -extern struct dlil_threading_info *dlil_lo_thread_ptr; - #if CONFIG_MACF_NET #include #endif @@ -124,11 +127,21 @@ extern struct dlil_threading_info *dlil_lo_thread_ptr; * System initialization */ +/* Lock group and attribute for ifaddr lock */ +lck_attr_t *ifa_mtx_attr; +lck_grp_t *ifa_mtx_grp; +static lck_grp_attr_t *ifa_mtx_grp_attr; + static int ifconf(u_long cmd, user_addr_t ifrp, int * ret_space); static void if_qflush(struct ifqueue *); __private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *); void if_rtproto_del(struct ifnet *ifp, int protocol); +static int if_addmulti_common(struct ifnet *, const struct sockaddr *, + struct ifmultiaddr **, int); +static int if_delmulti_common(struct ifmultiaddr *, struct ifnet *, + const struct sockaddr *, int); + static int if_rtmtu(struct radix_node *, void *); static void if_rtmtu_update(struct ifnet *); @@ -137,7 +150,6 @@ static int if_clone_list(int count, int * total, user_addr_t dst); #endif /* IF_CLONE_LIST */ MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address"); -MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address"); int ifqmaxlen = IFQ_MAXLEN; struct ifnethead ifnet_head = TAILQ_HEAD_INITIALIZER(ifnet_head); @@ -147,6 +159,50 @@ LIST_HEAD(, if_clone) if_cloners = LIST_HEAD_INITIALIZER(if_cloners); static struct ifaddr *ifa_ifwithnet_common(const struct sockaddr *, unsigned int); +static void if_attach_ifa_common(struct ifnet *, struct ifaddr *, int); +static void if_detach_ifa_common(struct ifnet *, struct ifaddr *, int); + +static void if_attach_ifma(struct ifnet *, struct ifmultiaddr *, int); 
+static int if_detach_ifma(struct ifnet *, struct ifmultiaddr *, int); + +static struct ifmultiaddr *ifma_alloc(int); +static void ifma_free(struct ifmultiaddr *); +static void ifma_trace(struct ifmultiaddr *, int); + +#if DEBUG +static unsigned int ifma_debug = 1; /* debugging (enabled) */ +#else +static unsigned int ifma_debug; /* debugging (disabled) */ +#endif /* !DEBUG */ +static unsigned int ifma_size; /* size of zone element */ +static struct zone *ifma_zone; /* zone for ifmultiaddr */ + +#define IFMA_TRACE_HIST_SIZE 32 /* size of trace history */ + +/* For gdb */ +__private_extern__ unsigned int ifma_trace_hist_size = IFMA_TRACE_HIST_SIZE; + +struct ifmultiaddr_dbg { + struct ifmultiaddr ifma; /* ifmultiaddr */ + u_int16_t ifma_refhold_cnt; /* # of ref */ + u_int16_t ifma_refrele_cnt; /* # of rele */ + /* + * Circular lists of IFA_ADDREF and IFA_REMREF callers. + */ + ctrace_t ifma_refhold[IFMA_TRACE_HIST_SIZE]; + ctrace_t ifma_refrele[IFMA_TRACE_HIST_SIZE]; + /* + * Trash list linkage + */ + TAILQ_ENTRY(ifmultiaddr_dbg) ifma_trash_link; +}; + +/* List of trash ifmultiaddr entries protected by ifma_trash_lock */ +static TAILQ_HEAD(, ifmultiaddr_dbg) ifma_trash_head; +static decl_lck_mtx_data(, ifma_trash_lock); + +#define IFMA_ZONE_MAX 64 /* maximum elements in zone */ +#define IFMA_ZONE_NAME "ifmultiaddr" /* zone name */ #if INET6 /* @@ -154,9 +210,36 @@ static struct ifaddr *ifa_ifwithnet_common(const struct sockaddr *, * should be more generalized? */ extern void nd6_setmtu(struct ifnet *); +extern lck_mtx_t *nd6_mutex; #endif +void +ifa_init(void) +{ + /* Setup lock group and attribute for ifaddr */ + ifa_mtx_grp_attr = lck_grp_attr_alloc_init(); + ifa_mtx_grp = lck_grp_alloc_init("ifaddr", ifa_mtx_grp_attr); + ifa_mtx_attr = lck_attr_alloc_init(); + + PE_parse_boot_argn("ifa_debug", &ifma_debug, sizeof (ifma_debug)); + + ifma_size = (ifma_debug == 0) ? 
sizeof (struct ifmultiaddr) : + sizeof (struct ifmultiaddr_dbg); + + ifma_zone = zinit(ifma_size, IFMA_ZONE_MAX * ifma_size, 0, + IFMA_ZONE_NAME); + if (ifma_zone == NULL) { + panic("%s: failed allocating %s", __func__, IFMA_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(ifma_zone, Z_EXPAND, TRUE); + zone_change(ifma_zone, Z_CALLERACCT, FALSE); + + lck_mtx_init(&ifma_trash_lock, ifa_mtx_grp, ifa_mtx_attr); + TAILQ_INIT(&ifma_trash_head); +} + /* * Network interface utility routines. * @@ -169,45 +252,106 @@ struct ifaddr **ifnet_addrs; struct ifnet **ifindex2ifnet; __private_extern__ void -if_attach_ifa( - struct ifnet *ifp, - struct ifaddr *ifa) +if_attach_ifa(struct ifnet *ifp, struct ifaddr *ifa) +{ + if_attach_ifa_common(ifp, ifa, 0); +} + +__private_extern__ void +if_attach_link_ifa(struct ifnet *ifp, struct ifaddr *ifa) +{ + if_attach_ifa_common(ifp, ifa, 1); +} + +static void +if_attach_ifa_common(struct ifnet *ifp, struct ifaddr *ifa, int link) { - ifnet_lock_assert(ifp, LCK_MTX_ASSERT_OWNED); - if (ifa->ifa_debug & IFD_ATTACHED) { - panic("if_attach_ifa: Attempted to attach address that's already attached!\n"); + ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE); + IFA_LOCK_ASSERT_HELD(ifa); + + if (ifa->ifa_ifp != ifp) { + panic("%s: Mismatch ifa_ifp=%p != ifp=%p", __func__, + ifa->ifa_ifp, ifp); + /* NOTREACHED */ + } else if (ifa->ifa_debug & IFD_ATTACHED) { + panic("%s: Attempt to attach an already attached ifa=%p", + __func__, ifa); + /* NOTREACHED */ + } else if (link && !(ifa->ifa_debug & IFD_LINK)) { + panic("%s: Unexpected non-link address ifa=%p", __func__, ifa); + /* NOTREACHED */ + } else if (!link && (ifa->ifa_debug & IFD_LINK)) { + panic("%s: Unexpected link address ifa=%p", __func__, ifa); + /* NOTREACHED */ } - ifaref(ifa); + IFA_ADDREF_LOCKED(ifa); ifa->ifa_debug |= IFD_ATTACHED; - TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link); + if (link) + TAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link); + else + 
TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link); + + if (ifa->ifa_attached != NULL) + (*ifa->ifa_attached)(ifa); } __private_extern__ void -if_detach_ifa( - struct ifnet *ifp, - struct ifaddr *ifa) -{ - ifnet_lock_assert(ifp, LCK_MTX_ASSERT_OWNED); -#if 1 - /* Debugging code */ - if ((ifa->ifa_debug & IFD_ATTACHED) == 0) { - printf("if_detach_ifa: ifa is not attached to any interface! flags=%u\n", ifa->ifa_debug); - return; - } - else { +if_detach_ifa(struct ifnet *ifp, struct ifaddr *ifa) +{ + if_detach_ifa_common(ifp, ifa, 0); +} + +__private_extern__ void +if_detach_link_ifa(struct ifnet *ifp, struct ifaddr *ifa) +{ + if_detach_ifa_common(ifp, ifa, 1); +} + +static void +if_detach_ifa_common(struct ifnet *ifp, struct ifaddr *ifa, int link) +{ + ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE); + IFA_LOCK_ASSERT_HELD(ifa); + + if (link && !(ifa->ifa_debug & IFD_LINK)) { + panic("%s: Unexpected non-link address ifa=%p", __func__, ifa); + /* NOTREACHED */ + } else if (link && ifa != TAILQ_FIRST(&ifp->if_addrhead)) { + panic("%s: Link address ifa=%p not first", __func__, ifa); + /* NOTREACHED */ + } else if (!link && (ifa->ifa_debug & IFD_LINK)) { + panic("%s: Unexpected link address ifa=%p", __func__, ifa); + /* NOTREACHED */ + } else if (!(ifa->ifa_debug & IFD_ATTACHED)) { + panic("%s: Attempt to detach an unattached address ifa=%p", + __func__, ifa); + /* NOTREACHED */ + } else if (ifa->ifa_ifp != ifp) { + panic("%s: Mismatch ifa_ifp=%p, ifp=%p", __func__, + ifa->ifa_ifp, ifp); + /* NOTREACHED */ + } else if (ifa->ifa_debug & IFD_DEBUG) { struct ifaddr *ifa2; TAILQ_FOREACH(ifa2, &ifp->if_addrhead, ifa_link) { if (ifa2 == ifa) break; } if (ifa2 != ifa) { - printf("if_detach_ifa: Attempted to detach IFA that was not attached!\n"); - } + panic("%s: Attempt to detach a stray address ifa=%p", + __func__, ifa); + /* NOTREACHED */ + } } -#endif TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link); + /* This must not be the last reference to the ifaddr */ + if 
(IFA_REMREF_LOCKED(ifa) == NULL) { + panic("%s: unexpected (missing) refcnt ifa=%p", __func__, ifa); + /* NOTREACHED */ + } ifa->ifa_debug &= ~IFD_ATTACHED; - ifafree(ifa); + + if (ifa->ifa_detached != NULL) + (*ifa->ifa_detached)(ifa); } #define INITIAL_IF_INDEXLIM 8 @@ -346,7 +490,8 @@ if_clone_create(char *name, int len, void *params) * there's no straightforward way to recover if * it happens. */ - panic("if_clone_create(): interface name too long"); + panic("%s: interface name too long", __func__); + /* NOTREACHED */ } } @@ -548,36 +693,72 @@ ifa_foraddr_scoped(unsigned int addr, unsigned int scope) lck_rw_lock_shared(in_ifaddr_rwlock); TAILQ_FOREACH(ia, INADDR_HASH(addr), ia_hash) { + IFA_LOCK_SPIN(&ia->ia_ifa); if (ia->ia_addr.sin_addr.s_addr == addr && - (scope == IFSCOPE_NONE || ia->ia_ifp->if_index == scope)) + (scope == IFSCOPE_NONE || ia->ia_ifp->if_index == scope)) { + IFA_ADDREF_LOCKED(&ia->ia_ifa); /* for caller */ + IFA_UNLOCK(&ia->ia_ifa); break; + } + IFA_UNLOCK(&ia->ia_ifa); } - if (ia != NULL) - ifaref(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); return (ia); } +#if INET6 +/* + * Similar to ifa_foraddr, except that this for IPv6. + */ +__private_extern__ struct in6_ifaddr * +ifa_foraddr6(struct in6_addr *addr6) +{ + return (ifa_foraddr6_scoped(addr6, IFSCOPE_NONE)); +} + +__private_extern__ struct in6_ifaddr * +ifa_foraddr6_scoped(struct in6_addr *addr6, unsigned int scope) +{ + struct in6_ifaddr *ia = NULL; + + lck_rw_lock_shared(&in6_ifaddr_rwlock); + for (ia = in6_ifaddrs; ia; ia = ia->ia_next) { + IFA_LOCK(&ia->ia_ifa); + if (IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, addr6) && + (scope == IFSCOPE_NONE || ia->ia_ifp->if_index == scope)) { + IFA_ADDREF_LOCKED(&ia->ia_ifa); /* for caller */ + IFA_UNLOCK(&ia->ia_ifa); + break; + } + IFA_UNLOCK(&ia->ia_ifa); + } + lck_rw_done(&in6_ifaddr_rwlock); + + return (ia); +} +#endif /* INET6 */ + /* * Return the first (primary) address of a given family on an interface. 
*/ __private_extern__ struct ifaddr * ifa_ifpgetprimary(struct ifnet *ifp, int family) { - struct ifaddr *ifa0 = NULL, *ifa; + struct ifaddr *ifa; ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - if (ifa->ifa_addr->sa_family == family && ifa0 == NULL) { - ifa0 = ifa; + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_addr->sa_family == family) { + IFA_ADDREF_LOCKED(ifa); /* for caller */ + IFA_UNLOCK(ifa); break; } + IFA_UNLOCK(ifa); } - if (ifa0 != NULL) - ifaref(ifa0); ifnet_lock_done(ifp); - return (ifa0); + return (ifa); } /* @@ -585,75 +766,89 @@ ifa_ifpgetprimary(struct ifnet *ifp, int family) */ /*ARGSUSED*/ struct ifaddr * -ifa_ifwithaddr( - const struct sockaddr *addr) +ifa_ifwithaddr(const struct sockaddr *addr) { struct ifnet *ifp; struct ifaddr *ifa; struct ifaddr *result = NULL; -#define equal(a1, a2) \ - (bcmp((const void*)(a1), (const void*)(a2), ((const struct sockaddr *)(a1))->sa_len) == 0) - +#define equal(a1, a2) \ + (bcmp((const void*)(a1), (const void*)(a2), \ + ((const struct sockaddr *)(a1))->sa_len) == 0) + ifnet_head_lock_shared(); - for (ifp = ifnet_head.tqh_first; ifp && !result; ifp = ifp->if_link.tqe_next) { + for (ifp = ifnet_head.tqh_first; ifp && !result; + ifp = ifp->if_link.tqe_next) { ifnet_lock_shared(ifp); for (ifa = ifp->if_addrhead.tqh_first; ifa; - ifa = ifa->ifa_link.tqe_next) { - if (ifa->ifa_addr->sa_family != addr->sa_family) + ifa = ifa->ifa_link.tqe_next) { + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_addr->sa_family != addr->sa_family) { + IFA_UNLOCK(ifa); continue; + } if (equal(addr, ifa->ifa_addr)) { result = ifa; + IFA_ADDREF_LOCKED(ifa); /* for caller */ + IFA_UNLOCK(ifa); break; } - if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && - /* IP6 doesn't have broadcast */ - ifa->ifa_broadaddr->sa_len != 0 && - equal(ifa->ifa_broadaddr, addr)) { + if ((ifp->if_flags & IFF_BROADCAST) && + ifa->ifa_broadaddr != NULL && + /* IP6 doesn't have broadcast */ + ifa->ifa_broadaddr->sa_len != 0 && + 
equal(ifa->ifa_broadaddr, addr)) { result = ifa; + IFA_ADDREF_LOCKED(ifa); /* for caller */ + IFA_UNLOCK(ifa); break; } + IFA_UNLOCK(ifa); } - if (result) - ifaref(result); ifnet_lock_done(ifp); } ifnet_head_done(); - - return result; + + return (result); } /* * Locate the point to point interface with a given destination address. */ /*ARGSUSED*/ struct ifaddr * -ifa_ifwithdstaddr( - const struct sockaddr *addr) +ifa_ifwithdstaddr(const struct sockaddr *addr) { struct ifnet *ifp; struct ifaddr *ifa; struct ifaddr *result = NULL; ifnet_head_lock_shared(); - for (ifp = ifnet_head.tqh_first; ifp && !result; ifp = ifp->if_link.tqe_next) { - if (ifp->if_flags & IFF_POINTOPOINT) { + for (ifp = ifnet_head.tqh_first; ifp && !result; + ifp = ifp->if_link.tqe_next) { + if ((ifp->if_flags & IFF_POINTOPOINT)) { ifnet_lock_shared(ifp); for (ifa = ifp->if_addrhead.tqh_first; ifa; - ifa = ifa->ifa_link.tqe_next) { - if (ifa->ifa_addr->sa_family != addr->sa_family) + ifa = ifa->ifa_link.tqe_next) { + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_addr->sa_family != + addr->sa_family) { + IFA_UNLOCK(ifa); continue; - if (ifa->ifa_dstaddr && equal(addr, ifa->ifa_dstaddr)) { + } + if (ifa->ifa_dstaddr && + equal(addr, ifa->ifa_dstaddr)) { result = ifa; + IFA_ADDREF_LOCKED(ifa); /* for caller */ + IFA_UNLOCK(ifa); break; } + IFA_UNLOCK(ifa); } - if (result) - ifaref(result); ifnet_lock_done(ifp); } } ifnet_head_done(); - return result; + return (result); } /* @@ -686,10 +881,15 @@ ifa_ifwithaddr_scoped(const struct sockaddr *addr, unsigned int ifscope) ifnet_lock_shared(ifp); for (ifa = ifp->if_addrhead.tqh_first; ifa != NULL; ifa = ifa->ifa_link.tqe_next) { - if (ifa->ifa_addr->sa_family != addr->sa_family) + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_addr->sa_family != addr->sa_family) { + IFA_UNLOCK(ifa); continue; + } if (equal(addr, ifa->ifa_addr)) { result = ifa; + IFA_ADDREF_LOCKED(ifa); /* for caller */ + IFA_UNLOCK(ifa); break; } if ((ifp->if_flags & IFF_BROADCAST) && @@ -698,11 +898,12 @@ 
ifa_ifwithaddr_scoped(const struct sockaddr *addr, unsigned int ifscope) ifa->ifa_broadaddr->sa_len != 0 && equal(ifa->ifa_broadaddr, addr)) { result = ifa; + IFA_ADDREF_LOCKED(ifa); /* for caller */ + IFA_UNLOCK(ifa); break; } + IFA_UNLOCK(ifa); } - if (result != NULL) - ifaref(result); ifnet_lock_done(ifp); } ifnet_head_done(); @@ -731,11 +932,17 @@ ifa_ifwithnet_common(const struct sockaddr *addr, unsigned int ifscope) { struct ifnet *ifp; struct ifaddr *ifa = NULL; - struct ifaddr *ifa_maybe = (struct ifaddr *) 0; + struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; const char *addr_data = addr->sa_data, *cplim; - if (!ip_doscopedroute || addr->sa_family != AF_INET) +#if INET6 + if ((af != AF_INET && af != AF_INET6) || + (af == AF_INET && !ip_doscopedroute) || + (af == AF_INET6 && !ip6_doscopedroute)) +#else + if (af != AF_INET || !ip_doscopedroute) +#endif /* !INET6 */ ifscope = IFSCOPE_NONE; ifnet_head_lock_shared(); @@ -744,15 +951,14 @@ ifa_ifwithnet_common(const struct sockaddr *addr, unsigned int ifscope) * so do that if we can. 
*/ if (af == AF_LINK) { - const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)addr; - if (sdl->sdl_index && sdl->sdl_index <= if_index) { + const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)addr; + if (sdl->sdl_index && sdl->sdl_index <= if_index) { ifa = ifnet_addrs[sdl->sdl_index - 1]; - - if (ifa) - ifaref(ifa); - + if (ifa != NULL) + IFA_ADDREF(ifa); + ifnet_head_done(); - return ifa; + return (ifa); } } @@ -766,15 +972,19 @@ ifa_ifwithnet_common(const struct sockaddr *addr, unsigned int ifscope) ifa = ifa->ifa_link.tqe_next) { const char *cp, *cp2, *cp3; - if (ifa->ifa_addr->sa_family != af) -next: continue; + IFA_LOCK(ifa); + if (ifa->ifa_addr == NULL || + ifa->ifa_addr->sa_family != af) { +next: + IFA_UNLOCK(ifa); + continue; + } #ifndef __APPLE__ /* This breaks tunneling application trying to install a route with * a specific subnet and the local address as the destination * It's breaks binary compatibility with previous version of MacOS X */ if ( - #if INET6 /* XXX: for maching gif tunnel dst as routing entry gateway */ addr->sa_family != AF_INET6 && #endif @@ -787,10 +997,13 @@ next: continue; * The trouble is that we don't know the * netmask for the remote end. */ - if (ifa->ifa_dstaddr != 0 - && equal(addr, ifa->ifa_dstaddr)) { - break; - } + if (ifa->ifa_dstaddr != 0 && + equal(addr, ifa->ifa_dstaddr)) { + IFA_ADDREF_LOCKED(ifa); + IFA_UNLOCK(ifa); + break; + } + IFA_UNLOCK(ifa); } else #endif /* __APPLE__*/ { @@ -799,8 +1012,10 @@ next: continue; * find using a matching interface. */ if (ifscope != IFSCOPE_NONE && - ifp->if_index != ifscope) + ifp->if_index != ifscope) { + IFA_UNLOCK(ifa); continue; + } /* * Scan all the bits in the ifa's address. @@ -809,8 +1024,10 @@ next: continue; * to see if it really matters. 
* (A byte at a time) */ - if (ifa->ifa_netmask == 0) + if (ifa->ifa_netmask == 0) { + IFA_UNLOCK(ifa); continue; + } cp = addr_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; @@ -826,40 +1043,33 @@ next: continue; * before continuing to search * for an even better one. */ - if (ifa_maybe == 0 || + if (ifa_maybe == NULL || rn_refines((caddr_t)ifa->ifa_netmask, (caddr_t)ifa_maybe->ifa_netmask)) { - ifaref(ifa); - if (ifa_maybe) - ifafree(ifa_maybe); + IFA_ADDREF_LOCKED(ifa); /* ifa_maybe */ + IFA_UNLOCK(ifa); + if (ifa_maybe != NULL) + IFA_REMREF(ifa_maybe); ifa_maybe = ifa; + } else { + IFA_UNLOCK(ifa); } } + IFA_LOCK_ASSERT_NOTHELD(ifa); } - - if (ifa) { - ifaref(ifa); - } - - /* - * ifa is set if we found an exact match. - * take a reference to the ifa before - * releasing the ifp lock - */ ifnet_lock_done(ifp); - - if (ifa) { + + if (ifa != NULL) break; - } } ifnet_head_done(); - if (!ifa) + + if (ifa == NULL) ifa = ifa_maybe; - else if (ifa_maybe) { - ifafree(ifa_maybe); - ifa_maybe = NULL; - } - return ifa; + else if (ifa_maybe != NULL) + IFA_REMREF(ifa_maybe); + + return (ifa); } /* @@ -867,9 +1077,7 @@ next: continue; * a given address. 
*/ struct ifaddr * -ifaof_ifpforaddr( - const struct sockaddr *addr, - struct ifnet *ifp) +ifaof_ifpforaddr(const struct sockaddr *addr, struct ifnet *ifp) { struct ifaddr *ifa = NULL; const char *cp, *cp2, *cp3; @@ -880,55 +1088,80 @@ ifaof_ifpforaddr( if (af >= AF_MAX) return (NULL); - + ifnet_lock_shared(ifp); for (ifa = ifp->if_addrhead.tqh_first; ifa; ifa = ifa->ifa_link.tqe_next) { - if (ifa->ifa_addr->sa_family != af) + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != af) { + IFA_UNLOCK(ifa); continue; - if (ifa_maybe == 0) + } + if (ifa_maybe == NULL) { + IFA_ADDREF_LOCKED(ifa); /* for ifa_maybe */ ifa_maybe = ifa; + } if (ifa->ifa_netmask == 0) { - if (equal(addr, ifa->ifa_addr) || - (ifa->ifa_dstaddr && equal(addr, ifa->ifa_dstaddr))) - break; + if (equal(addr, ifa->ifa_addr) || (ifa->ifa_dstaddr && + equal(addr, ifa->ifa_dstaddr))) { + IFA_ADDREF_LOCKED(ifa); /* for caller */ + IFA_UNLOCK(ifa); + break; + } + IFA_UNLOCK(ifa); continue; } if (ifp->if_flags & IFF_POINTOPOINT) { - if (ifa->ifa_dstaddr && equal(addr, ifa->ifa_dstaddr)) + if (ifa->ifa_dstaddr && equal(addr, ifa->ifa_dstaddr)) { + IFA_ADDREF_LOCKED(ifa); /* for caller */ + IFA_UNLOCK(ifa); break; + } } else { - if (equal(addr, ifa->ifa_addr)) { + if (equal(addr, ifa->ifa_addr)) { /* exact match */ + IFA_ADDREF_LOCKED(ifa); /* for caller */ + IFA_UNLOCK(ifa); break; } cp = addr->sa_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; - cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; + cplim = ifa->ifa_netmask->sa_len + + (char *)ifa->ifa_netmask; for (; cp3 < cplim; cp3++) if ((*cp++ ^ *cp2++) & *cp3) break; if (cp3 == cplim) { /* subnet match */ if (better_ifa_maybe == NULL) { + /* for better_ifa_maybe */ + IFA_ADDREF_LOCKED(ifa); better_ifa_maybe = ifa; } } } + IFA_UNLOCK(ifa); } - + if (ifa == NULL) { if (better_ifa_maybe != NULL) { ifa = better_ifa_maybe; + better_ifa_maybe = NULL; } else { ifa = ifa_maybe; + ifa_maybe = NULL; } } - if (ifa) ifaref(ifa); - + 
ifnet_lock_done(ifp); - return ifa; + + if (better_ifa_maybe != NULL) + IFA_REMREF(better_ifa_maybe); + if (ifa_maybe != NULL) + IFA_REMREF(ifa_maybe); + + return (ifa); } #include @@ -944,6 +1177,7 @@ link_rtrequest(int cmd, struct rtentry *rt, struct sockaddr *sa) struct ifaddr *ifa; struct sockaddr *dst; struct ifnet *ifp; + void (*ifa_rtrequest)(int, struct rtentry *, struct sockaddr *); lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); RT_LOCK_ASSERT_HELD(rt); @@ -951,12 +1185,19 @@ link_rtrequest(int cmd, struct rtentry *rt, struct sockaddr *sa) if (cmd != RTM_ADD || ((ifa = rt->rt_ifa) == 0) || ((ifp = ifa->ifa_ifp) == 0) || ((dst = rt_key(rt)) == 0)) return; + + /* Become a regular mutex, just in case */ + RT_CONVERT_LOCK(rt); + ifa = ifaof_ifpforaddr(dst, ifp); if (ifa) { rtsetifa(rt, ifa); - if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest) - ifa->ifa_rtrequest(cmd, rt, sa); - ifafree(ifa); + IFA_LOCK_SPIN(ifa); + ifa_rtrequest = ifa->ifa_rtrequest; + IFA_UNLOCK(ifa); + if (ifa_rtrequest != NULL && ifa_rtrequest != link_rtrequest) + ifa_rtrequest(cmd, rt, sa); + IFA_REMREF(ifa); } } @@ -1088,19 +1329,19 @@ ifunit(const char *name) len = strlen(name); if (len < 2 || len > IFNAMSIZ) - return NULL; + return (NULL); cp = name + len - 1; c = *cp; if (c < '0' || c > '9') - return NULL; /* trailing garbage */ + return (NULL); /* trailing garbage */ unit = 0; m = 1; do { if (cp == name) - return NULL; /* no interface name */ + return (NULL); /* no interface name */ unit += (c - '0') * m; if (unit > 1000000) - return NULL; /* number is unreasonable */ + return (NULL); /* number is unreasonable */ m *= 10; c = *--cp; } while (c >= '0' && c <= '9'); @@ -1134,7 +1375,7 @@ if_withname(struct sockaddr *sa) if ( (sa->sa_family != AF_LINK) || (sdl->sdl_nlen == 0) || (sdl->sdl_nlen > IFNAMSIZ) ) - return NULL; + return (NULL); /* * ifunit wants a null-terminated name. 
It may not be null-terminated @@ -1145,7 +1386,7 @@ if_withname(struct sockaddr *sa) bcopy(sdl->sdl_data, ifname, sdl->sdl_nlen); ifname[sdl->sdl_nlen] = '\0'; - return ifunit(ifname); + return (ifunit(ifname)); } @@ -1163,6 +1404,8 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) struct kev_msg ev_msg; struct net_event_data ev_data; + bzero(&ev_data, sizeof(struct net_event_data)); + bzero(&ev_msg, sizeof(struct kev_msg)); switch (cmd) { case OSIOCGIFCONF32: case SIOCGIFCONF32: { @@ -1210,21 +1453,26 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) } ifp = ifunit(ifr->ifr_name); - if (ifp == 0) + if (ifp == NULL) return (ENXIO); - switch (cmd) { + switch (cmd) { case SIOCGIFFLAGS: ifnet_lock_shared(ifp); ifr->ifr_flags = ifp->if_flags; ifnet_lock_done(ifp); break; + case SIOCGIFCAP: + ifnet_lock_shared(ifp); + ifr->ifr_reqcap = ifp->if_capabilities; + ifr->ifr_curcap = ifp->if_capenable; + ifnet_lock_done(ifp); + break; + #if CONFIG_MACF_NET case SIOCGIFMAC: error = mac_ifnet_label_get(kauth_cred_get(), ifr, ifp); - if (error) - return (error); break; #endif case SIOCGIFMETRIC: @@ -1247,19 +1495,27 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) case SIOCSIFFLAGS: error = proc_suser(p); - if (error) - return (error); + if (error != 0) + break; - ifnet_set_flags(ifp, ifr->ifr_flags, (u_int16_t)~IFF_CANTCHANGE); + (void) ifnet_set_flags(ifp, ifr->ifr_flags, + (u_int16_t)~IFF_CANTCHANGE); - error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, - cmd, data); + /* + * Note that we intentionally ignore any error from below + * for the SIOCSIFFLAGS case. + */ + (void) ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, + cmd, data); - /* Send the event even upon error from the driver because we changed the flags */ + /* + * Send the event even upon error from the driver because + * we changed the flags. 
+ */ ev_msg.vendor_code = KEV_VENDOR_APPLE; ev_msg.kev_class = KEV_NETWORK_CLASS; ev_msg.kev_subclass = KEV_DL_SUBCLASS; - + ev_msg.event_code = KEV_DL_SIFFLAGS; strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ); ev_data.if_family = ifp->if_family; @@ -1272,24 +1528,37 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) ifnet_touch_lastchange(ifp); break; + case SIOCSIFCAP: + error = proc_suser(p); + if (error != 0) + break; + + if ((ifr->ifr_reqcap & ~ifp->if_capabilities)) { + error = EINVAL; + break; + } + error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, + cmd, data); + + ifnet_touch_lastchange(ifp); + break; + #if CONFIG_MACF_NET case SIOCSIFMAC: error = mac_ifnet_label_set(kauth_cred_get(), ifr, ifp); - if (error) - return (error); break; #endif case SIOCSIFMETRIC: error = proc_suser(p); - if (error) - return (error); - ifp->if_metric = ifr->ifr_metric; + if (error != 0) + break; + ifp->if_metric = ifr->ifr_metric; ev_msg.vendor_code = KEV_VENDOR_APPLE; ev_msg.kev_class = KEV_NETWORK_CLASS; ev_msg.kev_subclass = KEV_DL_SUBCLASS; - + ev_msg.event_code = KEV_DL_SIFMETRICS; strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ); ev_data.if_family = ifp->if_family; @@ -1305,115 +1574,135 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) case SIOCSIFPHYS: error = proc_suser(p); - if (error) - return error; + if (error != 0) + break; - error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, - cmd, data); + error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, + cmd, data); + if (error != 0) + break; - if (error == 0) { - ev_msg.vendor_code = KEV_VENDOR_APPLE; - ev_msg.kev_class = KEV_NETWORK_CLASS; - ev_msg.kev_subclass = KEV_DL_SUBCLASS; - - ev_msg.event_code = KEV_DL_SIFPHYS; - strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ); - ev_data.if_family = ifp->if_family; - ev_data.if_unit = (u_int32_t) ifp->if_unit; - ev_msg.dv[0].data_length = sizeof(struct net_event_data); - ev_msg.dv[0].data_ptr 
= &ev_data; - ev_msg.dv[1].data_length = 0; - kev_post_msg(&ev_msg); - - ifnet_touch_lastchange(ifp); - } - return(error); + ev_msg.vendor_code = KEV_VENDOR_APPLE; + ev_msg.kev_class = KEV_NETWORK_CLASS; + ev_msg.kev_subclass = KEV_DL_SUBCLASS; + + ev_msg.event_code = KEV_DL_SIFPHYS; + strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ); + ev_data.if_family = ifp->if_family; + ev_data.if_unit = (u_int32_t) ifp->if_unit; + ev_msg.dv[0].data_length = sizeof(struct net_event_data); + ev_msg.dv[0].data_ptr = &ev_data; + ev_msg.dv[1].data_length = 0; + kev_post_msg(&ev_msg); + + ifnet_touch_lastchange(ifp); + break; case SIOCSIFMTU: { u_int32_t oldmtu = ifp->if_mtu; error = proc_suser(p); - if (error) - return (error); - if (ifp->if_ioctl == NULL) - return (EOPNOTSUPP); - if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) - return (EINVAL); - - error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, - cmd, data); + if (error != 0) + break; - if (error == 0) { - ev_msg.vendor_code = KEV_VENDOR_APPLE; - ev_msg.kev_class = KEV_NETWORK_CLASS; - ev_msg.kev_subclass = KEV_DL_SUBCLASS; - - ev_msg.event_code = KEV_DL_SIFMTU; - strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ); - ev_data.if_family = ifp->if_family; - ev_data.if_unit = (u_int32_t) ifp->if_unit; - ev_msg.dv[0].data_length = sizeof(struct net_event_data); - ev_msg.dv[0].data_ptr = &ev_data; - ev_msg.dv[1].data_length = 0; - kev_post_msg(&ev_msg); - - ifnet_touch_lastchange(ifp); - rt_ifmsg(ifp); + if (ifp->if_ioctl == NULL) { + error = EOPNOTSUPP; + break; + } + if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) { + error = EINVAL; + break; } + error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, + cmd, data); + if (error != 0) + break; + + ev_msg.vendor_code = KEV_VENDOR_APPLE; + ev_msg.kev_class = KEV_NETWORK_CLASS; + ev_msg.kev_subclass = KEV_DL_SUBCLASS; + + ev_msg.event_code = KEV_DL_SIFMTU; + strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ); + ev_data.if_family = 
ifp->if_family; + ev_data.if_unit = (u_int32_t) ifp->if_unit; + ev_msg.dv[0].data_length = sizeof(struct net_event_data); + ev_msg.dv[0].data_ptr = &ev_data; + ev_msg.dv[1].data_length = 0; + kev_post_msg(&ev_msg); + + ifnet_touch_lastchange(ifp); + rt_ifmsg(ifp); + /* * If the link MTU changed, do network layer specific procedure * and update all route entries associated with the interface, * so that their MTU metric gets updated. */ - if (error == 0 && ifp->if_mtu != oldmtu) { + if (ifp->if_mtu != oldmtu) { if_rtmtu_update(ifp); #if INET6 nd6_setmtu(ifp); #endif } - return (error); + break; } case SIOCADDMULTI: case SIOCDELMULTI: error = proc_suser(p); - if (error) - return (error); + if (error != 0) + break; /* Don't allow group membership on non-multicast interfaces. */ - if ((ifp->if_flags & IFF_MULTICAST) == 0) - return EOPNOTSUPP; + if ((ifp->if_flags & IFF_MULTICAST) == 0) { + error = EOPNOTSUPP; + break; + } -#ifndef __APPLE__ /* Don't let users screw up protocols' entries. */ - if (ifr->ifr_addr.sa_family != AF_LINK) - return EINVAL; -#endif + if (ifr->ifr_addr.sa_family != AF_UNSPEC && + ifr->ifr_addr.sa_family != AF_LINK) { + error = EINVAL; + break; + } + /* + * User is permitted to anonymously join a particular link + * multicast group via SIOCADDMULTI. Subsequent join requested + * for the same record which has an outstanding refcnt from a + * past if_addmulti_anon() will not result in EADDRINUSE error + * (unlike other BSDs.) Anonymously leaving a group is also + * allowed only as long as there is an outstanding refcnt held + * by a previous anonymous request, or else ENOENT (even if the + * link-layer multicast membership exists for a network-layer + * membership.) 
+ */ if (cmd == SIOCADDMULTI) { - error = if_addmulti(ifp, &ifr->ifr_addr, NULL); + error = if_addmulti_anon(ifp, &ifr->ifr_addr, NULL); ev_msg.event_code = KEV_DL_ADDMULTI; } else { - error = if_delmulti(ifp, &ifr->ifr_addr); + error = if_delmulti_anon(ifp, &ifr->ifr_addr); ev_msg.event_code = KEV_DL_DELMULTI; } - if (error == 0) { - ev_msg.vendor_code = KEV_VENDOR_APPLE; - ev_msg.kev_class = KEV_NETWORK_CLASS; - ev_msg.kev_subclass = KEV_DL_SUBCLASS; - strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ); - - ev_data.if_family = ifp->if_family; - ev_data.if_unit = (u_int32_t) ifp->if_unit; - ev_msg.dv[0].data_length = sizeof(struct net_event_data); - ev_msg.dv[0].data_ptr = &ev_data; - ev_msg.dv[1].data_length = 0; - kev_post_msg(&ev_msg); - - ifnet_touch_lastchange(ifp); - } - return error; + if (error != 0) + break; + + ev_msg.vendor_code = KEV_VENDOR_APPLE; + ev_msg.kev_class = KEV_NETWORK_CLASS; + ev_msg.kev_subclass = KEV_DL_SUBCLASS; + strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ); + + ev_data.if_family = ifp->if_family; + ev_data.if_unit = (u_int32_t) ifp->if_unit; + ev_msg.dv[0].data_length = sizeof(struct net_event_data); + ev_msg.dv[0].data_ptr = &ev_data; + ev_msg.dv[1].data_length = 0; + kev_post_msg(&ev_msg); + + ifnet_touch_lastchange(ifp); + break; case SIOCSIFPHYADDR: case SIOCDIFPHYADDR: @@ -1429,20 +1718,21 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) case SIOCSIFVLAN: case SIOCSIFBOND: error = proc_suser(p); - if (error) - return (error); + if (error != 0) + break; - error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, - cmd, data); + error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, + cmd, data); + if (error != 0) + break; - if (error == 0) - ifnet_touch_lastchange(ifp); - return error; + ifnet_touch_lastchange(ifp); + break; case SIOCGIFSTATUS: ifs = (struct ifstat *)data; ifs->ascii[0] = '\0'; - + case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: case SIOCGLIFPHYADDR: @@ -1450,12 +1740,15 @@ 
ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) case SIOCGIFMEDIA64: case SIOCGIFGENERIC: case SIOCGIFDEVMTU: - return ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, - cmd, data); + error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, + cmd, data); + break; + case SIOCGIFVLAN: case SIOCGIFBOND: - return ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, - cmd, data); + error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, + cmd, data); + break; case SIOCGIFWAKEFLAGS: ifnet_lock_shared(ifp); @@ -1464,24 +1757,21 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) break; case SIOCGIFGETRTREFCNT: -#if IFNET_ROUTE_REFCNT ifnet_lock_shared(ifp); ifr->ifr_route_refcnt = ifp->if_route_refcnt; ifnet_lock_done(ifp); break; -#else - return (EOPNOTSUPP); -#endif /* IFNET_ROUTE_REFCNT */ default: oif_flags = ifp->if_flags; - if (so->so_proto == 0) - return (EOPNOTSUPP); + if (so->so_proto == NULL) { + error = EOPNOTSUPP; + break; + } { - int ocmd = cmd; + u_long ocmd = cmd; switch (cmd) { - case SIOCSIFDSTADDR: case SIOCSIFADDR: case SIOCSIFBRDADDR: @@ -1513,12 +1803,13 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) case OSIOCGIFNETMASK: cmd = SIOCGIFNETMASK; } + socket_lock(so, 1); - error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, - data, ifp, p)); + error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, + data, ifp, p)); socket_unlock(so, 1); - switch (ocmd) { + switch (ocmd) { case OSIOCGIFADDR: case OSIOCGIFDSTADDR: case OSIOCGIFBRDADDR: @@ -1534,12 +1825,12 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) } if (error == EOPNOTSUPP || error == ENOTSUP) - error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, - cmd, data); + error = ifnet_ioctl(ifp, + so->so_proto->pr_domain->dom_family, cmd, data); - return (error); + break; } - return (0); + return (error); } int @@ -1564,47 +1855,43 @@ ifnet_set_promiscuous( ifnet_t ifp, int pswitch) { 
- struct ifreq ifr; int error = 0; - int oldflags; - int locked = 0; - int changed = 0; + int oldflags = 0; + int newflags = 0; ifnet_lock_exclusive(ifp); - locked = 1; oldflags = ifp->if_flags; - if (pswitch) { - /* - * If the device is not configured up, we cannot put it in - * promiscuous mode. - */ - if ((ifp->if_flags & IFF_UP) == 0) { - error = ENETDOWN; - goto done; - } - if (ifp->if_pcount++ != 0) { - goto done; - } + ifp->if_pcount += pswitch ? 1 : -1; + + if (ifp->if_pcount > 0) ifp->if_flags |= IFF_PROMISC; - } else { - if (--ifp->if_pcount > 0) - goto done; + else ifp->if_flags &= ~IFF_PROMISC; - } - ifr.ifr_flags = ifp->if_flags; - locked = 0; + + newflags = ifp->if_flags; ifnet_lock_done(ifp); - error = ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, &ifr); - if (error == 0) - rt_ifmsg(ifp); - else - ifp->if_flags = oldflags; -done: - if (locked) ifnet_lock_done(ifp); - if (changed) { - log(LOG_INFO, "%s%d: promiscuous mode %s\n", + + if (newflags != oldflags && (newflags & IFF_UP) != 0) { + error = ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL); + if (error == 0) { + rt_ifmsg(ifp); + } else { + ifnet_lock_exclusive(ifp); + // revert the flags + ifp->if_pcount -= pswitch ? 1 : -1; + if (ifp->if_pcount > 0) + ifp->if_flags |= IFF_PROMISC; + else + ifp->if_flags &= ~IFF_PROMISC; + ifnet_lock_done(ifp); + } + } + + if (newflags != oldflags) { + log(LOG_INFO, "%s%d: promiscuous mode %s%s\n", ifp->if_name, ifp->if_unit, - pswitch != 0 ? "enabled" : "disabled"); + (newflags & IFF_PROMISC) != 0 ? "enable" : "disable", + error != 0 ? " failed" : " succeeded"); } return error; } @@ -1624,7 +1911,7 @@ ifconf(u_long cmd, user_addr_t ifrp, int * ret_space) struct ifreq ifr; int error = 0; size_t space; - + /* * Zero the ifr buffer to make sure we don't * disclose the contents of the stack. 
@@ -1633,7 +1920,8 @@ ifconf(u_long cmd, user_addr_t ifrp, int * ret_space) space = *ret_space; ifnet_head_lock_shared(); - for (ifp = ifnet_head.tqh_first; space > sizeof(ifr) && ifp; ifp = ifp->if_link.tqe_next) { + for (ifp = ifnet_head.tqh_first; space > sizeof(ifr) && + ifp; ifp = ifp->if_link.tqe_next) { char workbuf[64]; size_t ifnlen, addrs; @@ -1645,17 +1933,22 @@ ifconf(u_long cmd, user_addr_t ifrp, int * ret_space) } else { strlcpy(ifr.ifr_name, workbuf, IFNAMSIZ); } - + ifnet_lock_shared(ifp); addrs = 0; ifa = ifp->if_addrhead.tqh_first; for ( ; space > sizeof (ifr) && ifa; ifa = ifa->ifa_link.tqe_next) { - struct sockaddr *sa = ifa->ifa_addr; + struct sockaddr *sa; + + IFA_LOCK(ifa); + sa = ifa->ifa_addr; #ifndef __APPLE__ - if (curproc->p_prison && prison_if(curproc, sa)) + if (curproc->p_prison && prison_if(curproc, sa)) { + IFA_UNLOCK(ifa); continue; + } #endif addrs++; if (cmd == OSIOCGIFCONF32 || cmd == OSIOCGIFCONF64) { @@ -1663,30 +1956,38 @@ ifconf(u_long cmd, user_addr_t ifrp, int * ret_space) (struct osockaddr *)&ifr.ifr_addr; ifr.ifr_addr = *sa; osa->sa_family = sa->sa_family; - error = copyout((caddr_t)&ifr, ifrp, sizeof(ifr)); + error = copyout((caddr_t)&ifr, ifrp, + sizeof (ifr)); ifrp += sizeof(struct ifreq); } else if (sa->sa_len <= sizeof(*sa)) { ifr.ifr_addr = *sa; - error = copyout((caddr_t)&ifr, ifrp, sizeof(ifr)); + error = copyout((caddr_t)&ifr, ifrp, + sizeof (ifr)); ifrp += sizeof(struct ifreq); } else { - if (space < sizeof (ifr) + sa->sa_len - sizeof(*sa)) + if (space < + sizeof (ifr) + sa->sa_len - sizeof(*sa)) { + IFA_UNLOCK(ifa); break; + } space -= sa->sa_len - sizeof(*sa); - error = copyout((caddr_t)&ifr, ifrp, sizeof (ifr.ifr_name)); + error = copyout((caddr_t)&ifr, ifrp, + sizeof (ifr.ifr_name)); if (error == 0) { - error = copyout((caddr_t)sa, - (ifrp + offsetof(struct ifreq, ifr_addr)), - sa->sa_len); + error = copyout((caddr_t)sa, (ifrp + + offsetof(struct ifreq, ifr_addr)), + sa->sa_len); } - ifrp += (sa->sa_len + 
offsetof(struct ifreq, ifr_addr)); + ifrp += (sa->sa_len + offsetof(struct ifreq, + ifr_addr)); } + IFA_UNLOCK(ifa); if (error) break; space -= sizeof (ifr); } ifnet_lock_done(ifp); - + if (error) break; if (!addrs) { @@ -1738,64 +2039,300 @@ if_allmulti(struct ifnet *ifp, int onswitch) return error; } +static struct ifmultiaddr * +ifma_alloc(int how) +{ + struct ifmultiaddr *ifma; + + ifma = (how == M_WAITOK) ? zalloc(ifma_zone) : + zalloc_noblock(ifma_zone); + + if (ifma != NULL) { + bzero(ifma, ifma_size); + lck_mtx_init(&ifma->ifma_lock, ifa_mtx_grp, ifa_mtx_attr); + ifma->ifma_debug |= IFD_ALLOC; + if (ifma_debug != 0) { + ifma->ifma_debug |= IFD_DEBUG; + ifma->ifma_trace = ifma_trace; + } + } + return (ifma); +} + +static void +ifma_free(struct ifmultiaddr *ifma) +{ + IFMA_LOCK(ifma); + + if (ifma->ifma_protospec != NULL) { + panic("%s: Protospec not NULL for ifma=%p", __func__, ifma); + /* NOTREACHED */ + } else if ((ifma->ifma_flags & IFMAF_ANONYMOUS) || + ifma->ifma_anoncnt != 0) { + panic("%s: Freeing ifma=%p with outstanding anon req", + __func__, ifma); + /* NOTREACHED */ + } else if (ifma->ifma_debug & IFD_ATTACHED) { + panic("%s: ifma=%p attached to ifma_ifp=%p is being freed", + __func__, ifma, ifma->ifma_ifp); + /* NOTREACHED */ + } else if (!(ifma->ifma_debug & IFD_ALLOC)) { + panic("%s: ifma %p cannot be freed", __func__, ifma); + /* NOTREACHED */ + } else if (ifma->ifma_refcount != 0) { + panic("%s: non-zero refcount ifma=%p", __func__, ifma); + /* NOTREACHED */ + } else if (ifma->ifma_reqcnt != 0) { + panic("%s: non-zero reqcnt ifma=%p", __func__, ifma); + /* NOTREACHED */ + } else if (ifma->ifma_ifp != NULL) { + panic("%s: non-NULL ifma_ifp=%p for ifma=%p", __func__, + ifma->ifma_ifp, ifma); + /* NOTREACHED */ + } else if (ifma->ifma_ll != NULL) { + panic("%s: non-NULL ifma_ll=%p for ifma=%p", __func__, + ifma->ifma_ll, ifma); + /* NOTREACHED */ + } + ifma->ifma_debug &= ~IFD_ALLOC; + if ((ifma->ifma_debug & (IFD_DEBUG | IFD_TRASHED)) == + 
(IFD_DEBUG | IFD_TRASHED)) { + lck_mtx_lock(&ifma_trash_lock); + TAILQ_REMOVE(&ifma_trash_head, (struct ifmultiaddr_dbg *)ifma, + ifma_trash_link); + lck_mtx_unlock(&ifma_trash_lock); + ifma->ifma_debug &= ~IFD_TRASHED; + } + IFMA_UNLOCK(ifma); + + if (ifma->ifma_addr != NULL) { + FREE(ifma->ifma_addr, M_IFADDR); + ifma->ifma_addr = NULL; + } + lck_mtx_destroy(&ifma->ifma_lock, ifa_mtx_grp); + zfree(ifma_zone, ifma); +} + +static void +ifma_trace(struct ifmultiaddr *ifma, int refhold) +{ + struct ifmultiaddr_dbg *ifma_dbg = (struct ifmultiaddr_dbg *)ifma; + ctrace_t *tr; + u_int32_t idx; + u_int16_t *cnt; + + if (!(ifma->ifma_debug & IFD_DEBUG)) { + panic("%s: ifma %p has no debug structure", __func__, ifma); + /* NOTREACHED */ + } + if (refhold) { + cnt = &ifma_dbg->ifma_refhold_cnt; + tr = ifma_dbg->ifma_refhold; + } else { + cnt = &ifma_dbg->ifma_refrele_cnt; + tr = ifma_dbg->ifma_refrele; + } + + idx = atomic_add_16_ov(cnt, 1) % IFMA_TRACE_HIST_SIZE; + ctrace_record(&tr[idx]); +} + void -ifma_reference( - struct ifmultiaddr *ifma) +ifma_addref(struct ifmultiaddr *ifma, int locked) { - if (OSIncrementAtomic(&ifma->ifma_refcount) <= 0) - panic("ifma_reference: ifma already released or invalid\n"); + if (!locked) + IFMA_LOCK(ifma); + else + IFMA_LOCK_ASSERT_HELD(ifma); + + if (++ifma->ifma_refcount == 0) { + panic("%s: ifma=%p wraparound refcnt", __func__, ifma); + /* NOTREACHED */ + } else if (ifma->ifma_trace != NULL) { + (*ifma->ifma_trace)(ifma, TRUE); + } + if (!locked) + IFMA_UNLOCK(ifma); } void -ifma_release( - struct ifmultiaddr *ifma) -{ - while (ifma) { - struct ifmultiaddr *next; - int32_t prevValue = OSDecrementAtomic(&ifma->ifma_refcount); - if (prevValue < 1) - panic("ifma_release: ifma already released or invalid\n"); - if (prevValue != 1) - break; - - /* Allow the allocator of the protospec to free it */ - if (ifma->ifma_protospec && ifma->ifma_free) { - ifma->ifma_free(ifma->ifma_protospec); - } - - next = ifma->ifma_ll; - FREE(ifma->ifma_addr, 
M_IFMADDR); - FREE(ifma, M_IFMADDR); - ifma = next; +ifma_remref(struct ifmultiaddr *ifma) +{ + struct ifmultiaddr *ll; + + IFMA_LOCK(ifma); + + if (ifma->ifma_refcount == 0) { + panic("%s: ifma=%p negative refcnt", __func__, ifma); + /* NOTREACHED */ + } else if (ifma->ifma_trace != NULL) { + (*ifma->ifma_trace)(ifma, FALSE); + } + + --ifma->ifma_refcount; + if (ifma->ifma_refcount > 0) { + IFMA_UNLOCK(ifma); + return; } + + ll = ifma->ifma_ll; + ifma->ifma_ifp = NULL; + ifma->ifma_ll = NULL; + IFMA_UNLOCK(ifma); + ifma_free(ifma); /* deallocate it */ + + if (ll != NULL) + IFMA_REMREF(ll); +} + +static void +if_attach_ifma(struct ifnet *ifp, struct ifmultiaddr *ifma, int anon) +{ + ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE); + IFMA_LOCK_ASSERT_HELD(ifma); + + if (ifma->ifma_ifp != ifp) { + panic("%s: Mismatch ifma_ifp=%p != ifp=%p", __func__, + ifma->ifma_ifp, ifp); + /* NOTREACHED */ + } else if (ifma->ifma_debug & IFD_ATTACHED) { + panic("%s: Attempt to attach an already attached ifma=%p", + __func__, ifma); + /* NOTREACHED */ + } else if (anon && (ifma->ifma_flags & IFMAF_ANONYMOUS)) { + panic("%s: ifma=%p unexpected IFMAF_ANONYMOUS", __func__, ifma); + /* NOTREACHED */ + } else if (ifma->ifma_debug & IFD_TRASHED) { + panic("%s: Attempt to reattach a detached ifma=%p", + __func__, ifma); + /* NOTREACHED */ + } + + ifma->ifma_reqcnt++; + VERIFY(ifma->ifma_reqcnt == 1); + IFMA_ADDREF_LOCKED(ifma); + ifma->ifma_debug |= IFD_ATTACHED; + if (anon) { + ifma->ifma_anoncnt++; + VERIFY(ifma->ifma_anoncnt == 1); + ifma->ifma_flags |= IFMAF_ANONYMOUS; + } + + LIST_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link); } - /* - * Find an ifmultiaddr that matches a socket address on an interface. - * - * Caller is responsible for holding the ifnet_lock while calling - * this function. 
- */ static int -if_addmulti_doesexist( - struct ifnet *ifp, - const struct sockaddr *sa, - struct ifmultiaddr **retifma) +if_detach_ifma(struct ifnet *ifp, struct ifmultiaddr *ifma, int anon) +{ + ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE); + IFMA_LOCK_ASSERT_HELD(ifma); + + if (ifma->ifma_reqcnt == 0) { + panic("%s: ifma=%p negative reqcnt", __func__, ifma); + /* NOTREACHED */ + } else if (anon && !(ifma->ifma_flags & IFMAF_ANONYMOUS)) { + panic("%s: ifma=%p missing IFMAF_ANONYMOUS", __func__, ifma); + /* NOTREACHED */ + } else if (anon && ifma->ifma_anoncnt == 0) { + panic("%s: ifma=%p negative anonreqcnt", __func__, ifma); + /* NOTREACHED */ + } else if (ifma->ifma_ifp != ifp) { + panic("%s: Mismatch ifma_ifp=%p, ifp=%p", __func__, + ifma->ifma_ifp, ifp); + /* NOTREACHED */ + } + + if (anon) { + --ifma->ifma_anoncnt; + if (ifma->ifma_anoncnt > 0) + return (0); + ifma->ifma_flags &= ~IFMAF_ANONYMOUS; + } + + --ifma->ifma_reqcnt; + if (ifma->ifma_reqcnt > 0) + return (0); + + if (ifma->ifma_protospec != NULL) { + panic("%s: Protospec not NULL for ifma=%p", __func__, ifma); + /* NOTREACHED */ + } else if ((ifma->ifma_flags & IFMAF_ANONYMOUS) || + ifma->ifma_anoncnt != 0) { + panic("%s: Detaching ifma=%p with outstanding anon req", + __func__, ifma); + /* NOTREACHED */ + } else if (!(ifma->ifma_debug & IFD_ATTACHED)) { + panic("%s: Attempt to detach an unattached address ifma=%p", + __func__, ifma); + /* NOTREACHED */ + } else if (ifma->ifma_debug & IFD_TRASHED) { + panic("%s: ifma %p is already in trash list", __func__, ifma); + /* NOTREACHED */ + } + + /* + * NOTE: Caller calls IFMA_REMREF + */ + ifma->ifma_debug &= ~IFD_ATTACHED; + LIST_REMOVE(ifma, ifma_link); + if (LIST_EMPTY(&ifp->if_multiaddrs)) + ifp->if_updatemcasts = 0; + + if (ifma->ifma_debug & IFD_DEBUG) { + /* Become a regular mutex, just in case */ + IFMA_CONVERT_LOCK(ifma); + lck_mtx_lock(&ifma_trash_lock); + TAILQ_INSERT_TAIL(&ifma_trash_head, + (struct ifmultiaddr_dbg *)ifma, 
ifma_trash_link); + lck_mtx_unlock(&ifma_trash_lock); + ifma->ifma_debug |= IFD_TRASHED; + } + + return (1); +} + +/* + * Find an ifmultiaddr that matches a socket address on an interface. + * + * Caller is responsible for holding the ifnet_lock while calling + * this function. + */ +static int +if_addmulti_doesexist(struct ifnet *ifp, const struct sockaddr *sa, + struct ifmultiaddr **retifma, int anon) { struct ifmultiaddr *ifma; - for (ifma = ifp->if_multiaddrs.lh_first; ifma; - ifma = ifma->ifma_link.le_next) { - if (equal(sa, ifma->ifma_addr)) { - ifma->ifma_usecount++; - if (retifma) { - *retifma = ifma; - ifma_reference(*retifma); + + for (ifma = LIST_FIRST(&ifp->if_multiaddrs); ifma != NULL; + ifma = LIST_NEXT(ifma, ifma_link)) { + IFMA_LOCK_SPIN(ifma); + if (!equal(sa, ifma->ifma_addr)) { + IFMA_UNLOCK(ifma); + continue; + } + if (anon) { + VERIFY(!(ifma->ifma_flags & IFMAF_ANONYMOUS) || + ifma->ifma_anoncnt != 0); + VERIFY((ifma->ifma_flags & IFMAF_ANONYMOUS) || + ifma->ifma_anoncnt == 0); + ifma->ifma_anoncnt++; + if (!(ifma->ifma_flags & IFMAF_ANONYMOUS)) { + VERIFY(ifma->ifma_anoncnt == 1); + ifma->ifma_flags |= IFMAF_ANONYMOUS; } - return 0; } + if (!anon || ifma->ifma_anoncnt == 1) { + ifma->ifma_reqcnt++; + VERIFY(ifma->ifma_reqcnt > 1); + } + if (retifma != NULL) { + *retifma = ifma; + IFMA_ADDREF_LOCKED(ifma); + } + IFMA_UNLOCK(ifma); + return (0); } - - return ENOENT; + return (ENOENT); } /* @@ -1864,67 +2401,114 @@ copy_and_normalize( } /* - * Add a multicast listenership to the interface in question. - * The link layer provides a routine which converts + * Network-layer protocol domains which hold references to the underlying + * link-layer record must use this routine. 
*/ int -if_addmulti( - struct ifnet *ifp, /* interface to manipulate */ - const struct sockaddr *sa, /* address to add */ - struct ifmultiaddr **retifma) +if_addmulti(struct ifnet *ifp, const struct sockaddr *sa, + struct ifmultiaddr **retifma) +{ + return (if_addmulti_common(ifp, sa, retifma, 0)); +} + +/* + * Anything other than network-layer protocol domains which hold references + * to the underlying link-layer record must use this routine: SIOCADDMULTI + * ioctl, ifnet_add_multicast(), AppleTalk, if_bond. + */ +int +if_addmulti_anon(struct ifnet *ifp, const struct sockaddr *sa, + struct ifmultiaddr **retifma) +{ + return (if_addmulti_common(ifp, sa, retifma, 1)); +} + +/* + * Register an additional multicast address with a network interface. + * + * - If the address is already present, bump the reference count on the + * address and return. + * - If the address is not link-layer, look up a link layer address. + * - Allocate address structures for one or both addresses, and attach to the + * multicast address list on the interface. If automatically adding a link + * layer address, the protocol address will own a reference to the link + * layer address, to be freed when it is freed. + * - Notify the network device driver of an addition to the multicast address + * list. + * + * 'sa' points to caller-owned memory with the desired multicast address. + * + * 'retifma' will be used to return a pointer to the resulting multicast + * address reference, if desired. + * + * 'anon' indicates a link-layer address with no protocol address reference + * made to it. Anything other than network-layer protocol domain requests + * are considered as anonymous. 
+ */ +static int +if_addmulti_common(struct ifnet *ifp, const struct sockaddr *sa, + struct ifmultiaddr **retifma, int anon) { struct sockaddr_storage storage; struct sockaddr *llsa = NULL; struct sockaddr *dupsa = NULL; - int error = 0; + int error = 0, ll_firstref = 0, lladdr; struct ifmultiaddr *ifma = NULL; struct ifmultiaddr *llifma = NULL; - + + /* Only AF_UNSPEC/AF_LINK is allowed for an "anonymous" address */ + VERIFY(!anon || sa->sa_family == AF_UNSPEC || + sa->sa_family == AF_LINK); + /* If sa is a AF_LINK or AF_UNSPEC, duplicate and normalize it */ if (sa->sa_family == AF_LINK || sa->sa_family == AF_UNSPEC) { dupsa = copy_and_normalize(sa); if (dupsa == NULL) { - return ENOMEM; + error = ENOMEM; + goto cleanup; } sa = dupsa; } - + ifnet_lock_exclusive(ifp); - error = if_addmulti_doesexist(ifp, sa, retifma); - ifnet_lock_done(ifp); - - if (error == 0) { + if (!(ifp->if_flags & IFF_MULTICAST)) { + error = EADDRNOTAVAIL; + ifnet_lock_done(ifp); goto cleanup; } + /* If the address is already present, return a new reference to it */ + error = if_addmulti_doesexist(ifp, sa, retifma, anon); + ifnet_lock_done(ifp); + if (error == 0) + goto cleanup; + /* - * Give the link layer a chance to accept/reject it, and also - * find out which AF_LINK address this maps to, if it isn't one - * already. + * The address isn't already present; give the link layer a chance + * to accept/reject it, and also find out which AF_LINK address this + * maps to, if it isn't one already. 
*/ - error = dlil_resolve_multi(ifp, sa, (struct sockaddr*)&storage, - sizeof(storage)); + error = dlil_resolve_multi(ifp, sa, (struct sockaddr *)&storage, + sizeof (storage)); if (error == 0 && storage.ss_len != 0) { - llsa = copy_and_normalize((struct sockaddr*)&storage); + llsa = copy_and_normalize((struct sockaddr *)&storage); if (llsa == NULL) { error = ENOMEM; goto cleanup; } - - MALLOC(llifma, struct ifmultiaddr *, sizeof *llifma, M_IFMADDR, M_WAITOK); + + llifma = ifma_alloc(M_WAITOK); if (llifma == NULL) { error = ENOMEM; goto cleanup; } } - + /* to be similar to FreeBSD */ - if (error == EOPNOTSUPP) { + if (error == EOPNOTSUPP) error = 0; - } - else if (error) { + else if (error != 0) goto cleanup; - } /* Allocate while we aren't holding any locks */ if (dupsa == NULL) { @@ -1934,185 +2518,212 @@ if_addmulti( goto cleanup; } } - MALLOC(ifma, struct ifmultiaddr *, sizeof *ifma, M_IFMADDR, M_WAITOK); + ifma = ifma_alloc(M_WAITOK); if (ifma == NULL) { error = ENOMEM; goto cleanup; } - + ifnet_lock_exclusive(ifp); /* * Check again for the matching multicast. 
*/ - if ((error = if_addmulti_doesexist(ifp, sa, retifma)) == 0) { + error = if_addmulti_doesexist(ifp, sa, retifma, anon); + if (error == 0) { ifnet_lock_done(ifp); goto cleanup; } - bzero(ifma, sizeof(*ifma)); - ifma->ifma_addr = dupsa; - ifma->ifma_ifp = ifp; - ifma->ifma_usecount = 1; - ifma->ifma_refcount = 1; - - if (llifma != 0) { - if (if_addmulti_doesexist(ifp, llsa, &ifma->ifma_ll) == 0) { - FREE(llsa, M_IFMADDR); - FREE(llifma, M_IFMADDR); + if (llifma != NULL) { + VERIFY(!anon); /* must not get here if "anonymous" */ + if (if_addmulti_doesexist(ifp, llsa, &ifma->ifma_ll, 0) == 0) { + FREE(llsa, M_IFADDR); + llsa = NULL; + ifma_free(llifma); + llifma = NULL; + VERIFY(ifma->ifma_ll->ifma_ifp == ifp); } else { - bzero(llifma, sizeof(*llifma)); + ll_firstref = 1; llifma->ifma_addr = llsa; llifma->ifma_ifp = ifp; - llifma->ifma_usecount = 1; - llifma->ifma_refcount = 1; - LIST_INSERT_HEAD(&ifp->if_multiaddrs, llifma, ifma_link); - + IFMA_LOCK(llifma); + if_attach_ifma(ifp, llifma, 0); + /* add extra refcnt for ifma */ + IFMA_ADDREF_LOCKED(llifma); + IFMA_UNLOCK(llifma); ifma->ifma_ll = llifma; - ifma_reference(ifma->ifma_ll); } } - - LIST_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link); - - if (retifma) { + + /* "anonymous" request should not result in network address */ + VERIFY(!anon || ifma->ifma_ll == NULL); + + ifma->ifma_addr = dupsa; + ifma->ifma_ifp = ifp; + IFMA_LOCK(ifma); + if_attach_ifma(ifp, ifma, anon); + IFMA_ADDREF_LOCKED(ifma); /* for this routine */ + if (retifma != NULL) { *retifma = ifma; - ifma_reference(*retifma); + IFMA_ADDREF_LOCKED(*retifma); /* for caller */ } - + lladdr = (ifma->ifma_addr->sa_family == AF_UNSPEC || + ifma->ifma_addr->sa_family == AF_LINK); + IFMA_UNLOCK(ifma); ifnet_lock_done(ifp); - - if (llsa != 0) - rt_newmaddrmsg(RTM_NEWMADDR, ifma); + + rt_newmaddrmsg(RTM_NEWMADDR, ifma); + IFMA_REMREF(ifma); /* for this routine */ /* * We are certain we have added something, so call down to the - * interface to let them 
know about it. + * interface to let them know about it. Do this only for newly- + * added AF_LINK/AF_UNSPEC address in the if_multiaddrs set. */ - ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL); - - return 0; - + if (lladdr || ll_firstref) + (void) ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL); + + if (ifp->if_updatemcasts > 0) + ifp->if_updatemcasts = 0; + + return (0); + cleanup: - if (ifma) - FREE(ifma, M_IFADDR); - if (dupsa) + if (ifma != NULL) + ifma_free(ifma); + if (dupsa != NULL) FREE(dupsa, M_IFADDR); - if (llifma) - FREE(llifma, M_IFADDR); - if (llsa) + if (llifma != NULL) + ifma_free(llifma); + if (llsa != NULL) FREE(llsa, M_IFADDR); - - return error; + + return (error); } +/* + * Delete a multicast group membership by network-layer group address. + * This routine is deprecated. + */ int -if_delmultiaddr( - struct ifmultiaddr *ifma, - int locked) +if_delmulti(struct ifnet *ifp, const struct sockaddr *sa) { - struct ifnet *ifp; - int do_del_multi = 0; - - ifp = ifma->ifma_ifp; - - if (!locked && ifp) { - ifnet_lock_exclusive(ifp); - } - - while (ifma != NULL) { - struct ifmultiaddr *ll_ifma; - - if (ifma->ifma_usecount > 1) { - ifma->ifma_usecount--; - break; - } - - if (ifp) - LIST_REMOVE(ifma, ifma_link); - - ll_ifma = ifma->ifma_ll; - - if (ll_ifma) { /* send a routing msg for network addresses only */ - if (ifp) - ifnet_lock_done(ifp); - rt_newmaddrmsg(RTM_DELMADDR, ifma); - if (ifp) - ifnet_lock_exclusive(ifp); - } - - /* - * Make sure the interface driver is notified - * in the case of a link layer mcast group being left. 
- */ - if (ll_ifma == 0) { - if (ifp && ifma->ifma_addr->sa_family == AF_LINK) - do_del_multi = 1; - break; - } - - if (ifp) - ifma_release(ifma); - - ifma = ll_ifma; - } - - if (!locked && ifp) { - /* This wasn't initially locked, we should unlock it */ - ifnet_lock_done(ifp); - } - - if (do_del_multi) { - if (locked) - ifnet_lock_done(ifp); - ifnet_ioctl(ifp, 0, SIOCDELMULTI, NULL); - if (locked) - ifnet_lock_exclusive(ifp); - } - - return 0; + return (if_delmulti_common(NULL, ifp, sa, 0)); } /* - * Remove a reference to a multicast address on this interface. Yell - * if the request does not match an existing membership. + * Delete a multicast group membership by group membership pointer. + * Network-layer protocol domains must use this routine. */ int -if_delmulti( - struct ifnet *ifp, - const struct sockaddr *sa) +if_delmulti_ifma(struct ifmultiaddr *ifma) +{ + return (if_delmulti_common(ifma, NULL, NULL, 0)); +} + +/* + * Anything other than network-layer protocol domains which hold references + * to the underlying link-layer record must use this routine: SIOCDELMULTI + * ioctl, ifnet_remove_multicast(), AppleTalk, if_bond. + */ +int +if_delmulti_anon(struct ifnet *ifp, const struct sockaddr *sa) +{ + return (if_delmulti_common(NULL, ifp, sa, 1)); +} + +/* + * Delete a multicast group membership by network-layer group address. + * + * Returns ENOENT if the entry could not be found. 
+ */ +static int +if_delmulti_common(struct ifmultiaddr *ifma, struct ifnet *ifp, + const struct sockaddr *sa, int anon) { - struct ifmultiaddr *ifma; struct sockaddr *dupsa = NULL; - int retval = 0; + int lastref, ll_lastref = 0, lladdr; + struct ifmultiaddr *ll = NULL; - if (sa->sa_family == AF_LINK || sa->sa_family == AF_UNSPEC) { + /* sanity check for callers */ + VERIFY(ifma != NULL || (ifp != NULL && sa != NULL)); + + if (ifma != NULL) + ifp = ifma->ifma_ifp; + + if (sa != NULL && + (sa->sa_family == AF_LINK || sa->sa_family == AF_UNSPEC)) { dupsa = copy_and_normalize(sa); - if (dupsa == NULL) { - return ENOMEM; - } + if (dupsa == NULL) + return (ENOMEM); sa = dupsa; } - + ifnet_lock_exclusive(ifp); - for (ifma = ifp->if_multiaddrs.lh_first; ifma; - ifma = ifma->ifma_link.le_next) - if (equal(sa, ifma->ifma_addr)) + if (ifma == NULL) { + for (ifma = LIST_FIRST(&ifp->if_multiaddrs); ifma != NULL; + ifma = LIST_NEXT(ifma, ifma_link)) { + IFMA_LOCK(ifma); + if (!equal(sa, ifma->ifma_addr) || + (anon && !(ifma->ifma_flags & IFMAF_ANONYMOUS))) { + VERIFY(!(ifma->ifma_flags & IFMAF_ANONYMOUS) || + ifma->ifma_anoncnt != 0); + IFMA_UNLOCK(ifma); + continue; + } + /* found; keep it locked */ break; - if (ifma == 0) { - ifnet_lock_done(ifp); - if (dupsa) - FREE(dupsa, M_IFADDR); - return ENOENT; + } + if (ifma == NULL) { + if (dupsa != NULL) + FREE(dupsa, M_IFADDR); + ifnet_lock_done(ifp); + return (ENOENT); + } + } else { + IFMA_LOCK(ifma); + } + IFMA_LOCK_ASSERT_HELD(ifma); + IFMA_ADDREF_LOCKED(ifma); /* for this routine */ + lastref = if_detach_ifma(ifp, ifma, anon); + VERIFY(!lastref || (!(ifma->ifma_debug & IFD_ATTACHED) && + ifma->ifma_reqcnt == 0)); + VERIFY(!anon || ifma->ifma_ll == NULL); + ll = ifma->ifma_ll; + lladdr = (ifma->ifma_addr->sa_family == AF_UNSPEC || + ifma->ifma_addr->sa_family == AF_LINK); + IFMA_UNLOCK(ifma); + if (lastref && ll != NULL) { + IFMA_LOCK(ll); + ll_lastref = if_detach_ifma(ifp, ll, 0); + IFMA_UNLOCK(ll); } - - retval = 
if_delmultiaddr(ifma, 1); ifnet_lock_done(ifp); - if (dupsa) + + if (lastref) + rt_newmaddrmsg(RTM_DELMADDR, ifma); + + if ((ll == NULL && lastref && lladdr) || ll_lastref) { + /* + * Make sure the interface driver is notified in the + * case of a link layer mcast group being left. Do + * this only for a AF_LINK/AF_UNSPEC address that has + * been removed from the if_multiaddrs set. + */ + ifnet_ioctl(ifp, 0, SIOCDELMULTI, NULL); + } + + if (lastref) + IFMA_REMREF(ifma); /* for if_multiaddrs list */ + if (ll_lastref) + IFMA_REMREF(ll); /* for if_multiaddrs list */ + + IFMA_REMREF(ifma); /* for this routine */ + if (dupsa != NULL) FREE(dupsa, M_IFADDR); - - return retval; -} + return (0); +} /* * We don't use if_setlladdr, our interfaces are responsible for @@ -2126,21 +2737,6 @@ if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) } #endif -struct ifmultiaddr * -ifmaof_ifpforaddr(const struct sockaddr *sa, struct ifnet *ifp) -{ - struct ifmultiaddr *ifma; - - ifnet_lock_shared(ifp); - for (ifma = ifp->if_multiaddrs.lh_first; ifma; - ifma = ifma->ifma_link.le_next) - if (equal(ifma->ifma_addr, sa)) - break; - ifnet_lock_done(ifp); - - return ifma; -} - SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Link layers"); SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Generic link-management"); @@ -2279,17 +2875,18 @@ void if_rtmtu_update(struct ifnet *ifp) } __private_extern__ void -if_data_internal_to_if_data( - struct ifnet *ifp, - const struct if_data_internal *if_data_int, - struct if_data *if_data) +if_data_internal_to_if_data(struct ifnet *ifp, + const struct if_data_internal *if_data_int, struct if_data *if_data) { - struct dlil_threading_info *thread; - if ((thread = ifp->if_input_thread) == NULL || (dlil_multithreaded_input == 0)) - thread = dlil_lo_thread_ptr; - -#define COPYFIELD(fld) if_data->fld = if_data_int->fld +#pragma unused(ifp) +#define COPYFIELD(fld) if_data->fld = if_data_int->fld #define COPYFIELD32(fld) 
if_data->fld = (u_int32_t)(if_data_int->fld) +/* compiler will cast down to 32-bit */ +#define COPYFIELD32_ATOMIC(fld) do { \ + atomic_get_64(if_data->fld, \ + (u_int64_t *)(void *)(uintptr_t)&if_data_int->fld); \ +} while (0) + COPYFIELD(ifi_type); COPYFIELD(ifi_typelen); COPYFIELD(ifi_physical); @@ -2302,29 +2899,28 @@ if_data_internal_to_if_data( COPYFIELD(ifi_metric); if (if_data_int->ifi_baudrate & 0xFFFFFFFF00000000LL) { if_data->ifi_baudrate = 0xFFFFFFFF; - } - else { + } else { COPYFIELD32(ifi_baudrate); } + + COPYFIELD32_ATOMIC(ifi_ipackets); + COPYFIELD32_ATOMIC(ifi_ierrors); + COPYFIELD32_ATOMIC(ifi_opackets); + COPYFIELD32_ATOMIC(ifi_oerrors); + COPYFIELD32_ATOMIC(ifi_collisions); + COPYFIELD32_ATOMIC(ifi_ibytes); + COPYFIELD32_ATOMIC(ifi_obytes); + COPYFIELD32_ATOMIC(ifi_imcasts); + COPYFIELD32_ATOMIC(ifi_omcasts); + COPYFIELD32_ATOMIC(ifi_iqdrops); + COPYFIELD32_ATOMIC(ifi_noproto); + + COPYFIELD(ifi_recvtiming); + COPYFIELD(ifi_xmittiming); - lck_mtx_lock(thread->input_lck); - COPYFIELD32(ifi_ipackets); - COPYFIELD32(ifi_ierrors); - COPYFIELD32(ifi_opackets); - COPYFIELD32(ifi_oerrors); - COPYFIELD32(ifi_collisions); - COPYFIELD32(ifi_ibytes); - COPYFIELD32(ifi_obytes); - COPYFIELD32(ifi_imcasts); - COPYFIELD32(ifi_omcasts); - COPYFIELD32(ifi_iqdrops); - COPYFIELD32(ifi_noproto); - COPYFIELD32(ifi_recvtiming); - COPYFIELD32(ifi_xmittiming); if_data->ifi_lastchange.tv_sec = if_data_int->ifi_lastchange.tv_sec; if_data->ifi_lastchange.tv_usec = if_data_int->ifi_lastchange.tv_usec; - lck_mtx_unlock(thread->input_lck); - + #if IF_LASTCHANGEUPTIME if_data->ifi_lastchange.tv_sec += boottime_sec(); #endif @@ -2333,70 +2929,103 @@ if_data_internal_to_if_data( COPYFIELD(ifi_hwassist); if_data->ifi_reserved1 = 0; if_data->ifi_reserved2 = 0; +#undef COPYFIELD32_ATOMIC #undef COPYFIELD32 #undef COPYFIELD } __private_extern__ void -if_data_internal_to_if_data64( - struct ifnet *ifp, - const struct if_data_internal *if_data_int, - struct if_data64 *if_data64) 
+if_data_internal_to_if_data64(struct ifnet *ifp, + const struct if_data_internal *if_data_int, + struct if_data64 *if_data64) { - struct dlil_threading_info *thread; - if ((thread = ifp->if_input_thread) == NULL || (dlil_multithreaded_input == 0)) - thread = dlil_lo_thread_ptr; - -#define COPYFIELD(fld) if_data64->fld = if_data_int->fld - COPYFIELD(ifi_type); - COPYFIELD(ifi_typelen); - COPYFIELD(ifi_physical); - COPYFIELD(ifi_addrlen); - COPYFIELD(ifi_hdrlen); - COPYFIELD(ifi_recvquota); - COPYFIELD(ifi_xmitquota); +#pragma unused(ifp) +#define COPYFIELD64(fld) if_data64->fld = if_data_int->fld +#define COPYFIELD64_ATOMIC(fld) do { \ + atomic_get_64(if_data64->fld, \ + (u_int64_t *)(void *)(uintptr_t)&if_data_int->fld); \ +} while (0) + + COPYFIELD64(ifi_type); + COPYFIELD64(ifi_typelen); + COPYFIELD64(ifi_physical); + COPYFIELD64(ifi_addrlen); + COPYFIELD64(ifi_hdrlen); + COPYFIELD64(ifi_recvquota); + COPYFIELD64(ifi_xmitquota); if_data64->ifi_unused1 = 0; - COPYFIELD(ifi_mtu); - COPYFIELD(ifi_metric); - COPYFIELD(ifi_baudrate); - - lck_mtx_lock(thread->input_lck); - COPYFIELD(ifi_ipackets); - COPYFIELD(ifi_ierrors); - COPYFIELD(ifi_opackets); - COPYFIELD(ifi_oerrors); - COPYFIELD(ifi_collisions); - COPYFIELD(ifi_ibytes); - COPYFIELD(ifi_obytes); - COPYFIELD(ifi_imcasts); - COPYFIELD(ifi_omcasts); - COPYFIELD(ifi_iqdrops); - COPYFIELD(ifi_noproto); - COPYFIELD(ifi_recvtiming); - COPYFIELD(ifi_xmittiming); + COPYFIELD64(ifi_mtu); + COPYFIELD64(ifi_metric); + COPYFIELD64(ifi_baudrate); + + COPYFIELD64_ATOMIC(ifi_ipackets); + COPYFIELD64_ATOMIC(ifi_ierrors); + COPYFIELD64_ATOMIC(ifi_opackets); + COPYFIELD64_ATOMIC(ifi_oerrors); + COPYFIELD64_ATOMIC(ifi_collisions); + COPYFIELD64_ATOMIC(ifi_ibytes); + COPYFIELD64_ATOMIC(ifi_obytes); + COPYFIELD64_ATOMIC(ifi_imcasts); + COPYFIELD64_ATOMIC(ifi_omcasts); + COPYFIELD64_ATOMIC(ifi_iqdrops); + COPYFIELD64_ATOMIC(ifi_noproto); + + /* Note these two fields are actually 32 bit, so doing COPYFIELD64_ATOMIC will + * cause them 
to be misaligned + */ + COPYFIELD64(ifi_recvtiming); + COPYFIELD64(ifi_xmittiming); + if_data64->ifi_lastchange.tv_sec = if_data_int->ifi_lastchange.tv_sec; if_data64->ifi_lastchange.tv_usec = if_data_int->ifi_lastchange.tv_usec; - lck_mtx_unlock(thread->input_lck); - + #if IF_LASTCHANGEUPTIME if_data64->ifi_lastchange.tv_sec += boottime_sec(); #endif -#undef COPYFIELD +#undef COPYFIELD64 } -void -ifafree(struct ifaddr *ifa) +__private_extern__ void +if_copy_traffic_class(struct ifnet *ifp, + struct if_traffic_class *if_tc) { - int oldval; +#define COPY_IF_TC_FIELD64_ATOMIC(fld) do { \ + atomic_get_64(if_tc->fld, \ + (u_int64_t *)(void *)(uintptr_t)&ifp->if_tc.fld); \ +} while (0) + + COPY_IF_TC_FIELD64_ATOMIC(ifi_ibkpackets); + COPY_IF_TC_FIELD64_ATOMIC(ifi_ibkbytes); + COPY_IF_TC_FIELD64_ATOMIC(ifi_obkpackets); + COPY_IF_TC_FIELD64_ATOMIC(ifi_obkbytes); + COPY_IF_TC_FIELD64_ATOMIC(ifi_ivipackets); + COPY_IF_TC_FIELD64_ATOMIC(ifi_ivibytes); + COPY_IF_TC_FIELD64_ATOMIC(ifi_ovipackets); + COPY_IF_TC_FIELD64_ATOMIC(ifi_ovibytes); + COPY_IF_TC_FIELD64_ATOMIC(ifi_ivopackets); + COPY_IF_TC_FIELD64_ATOMIC(ifi_ivobytes); + COPY_IF_TC_FIELD64_ATOMIC(ifi_ovopackets); + COPY_IF_TC_FIELD64_ATOMIC(ifi_ovobytes); + +#undef COPY_IF_TC_FIELD64_ATOMIC +} - oldval = OSAddAtomic(-1, &ifa->ifa_refcnt); - if (oldval >= 1 && ifa->ifa_trace != NULL) - (*ifa->ifa_trace)(ifa, FALSE); - if (oldval == 0) { + +struct ifaddr * +ifa_remref(struct ifaddr *ifa, int locked) +{ + if (!locked) + IFA_LOCK_SPIN(ifa); + else + IFA_LOCK_ASSERT_HELD(ifa); + + if (ifa->ifa_refcnt == 0) panic("%s: ifa %p negative refcnt\n", __func__, ifa); - } else if (oldval == 1) { - if (ifa->ifa_debug & IFD_ATTACHED) + else if (ifa->ifa_trace != NULL) + (*ifa->ifa_trace)(ifa, FALSE); + if (--ifa->ifa_refcnt == 0) { + if (ifa->ifa_debug & IFD_ATTACHED) panic("ifa %p attached to ifp is being freed\n", ifa); /* * Some interface addresses are allocated either statically @@ -2406,22 +3035,54 @@ ifafree(struct ifaddr *ifa) * 
leave it alone. */ if (ifa->ifa_debug & IFD_ALLOC) { - if (ifa->ifa_free == NULL) + if (ifa->ifa_free == NULL) { + IFA_UNLOCK(ifa); FREE(ifa, M_IFADDR); - else + } else { + /* Become a regular mutex */ + IFA_CONVERT_LOCK(ifa); + /* callee will unlock */ (*ifa->ifa_free)(ifa); + } + } else { + IFA_UNLOCK(ifa); } + ifa = NULL; } + + if (!locked && ifa != NULL) + IFA_UNLOCK(ifa); + + return (ifa); } void -ifaref(struct ifaddr *ifa) +ifa_addref(struct ifaddr *ifa, int locked) { - int oldval; + if (!locked) + IFA_LOCK_SPIN(ifa); + else + IFA_LOCK_ASSERT_HELD(ifa); - oldval = OSAddAtomic(1, &ifa->ifa_refcnt); - if (oldval < 0) - panic("%s: ifa %p negative refcnt\n", __func__, ifa); - else if (ifa->ifa_trace != NULL) + if (++ifa->ifa_refcnt == 0) { + panic("%s: ifa %p wraparound refcnt\n", __func__, ifa); + /* NOTREACHED */ + } else if (ifa->ifa_trace != NULL) { (*ifa->ifa_trace)(ifa, TRUE); + } + if (!locked) + IFA_UNLOCK(ifa); +} + +void +ifa_lock_init(struct ifaddr *ifa) +{ + lck_mtx_init(&ifa->ifa_lock, ifa_mtx_grp, ifa_mtx_attr); +} + +void +ifa_lock_destroy(struct ifaddr *ifa) +{ + IFA_LOCK_ASSERT_NOTHELD(ifa); + lck_mtx_destroy(&ifa->ifa_lock, ifa_mtx_grp); } diff --git a/bsd/net/if.h b/bsd/net/if.h index 1e847ab04..a7974460c 100644 --- a/bsd/net/if.h +++ b/bsd/net/if.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -96,6 +96,7 @@ #define KEV_DL_LINK_ADDRESS_CHANGED 16 #define KEV_DL_WAKEFLAGS_CHANGED 17 #define KEV_DL_IF_IDLE_ROUTE_REFCNT 18 +#define KEV_DL_IFCAP_CHANGED 19 #include #include @@ -146,16 +147,17 @@ struct if_clonereq32 { #define IFEF_AUTOCONFIGURING 0x1 #define IFEF_DVR_REENTRY_OK 0x20 /* When set, driver may be reentered from its own thread */ #define IFEF_ACCEPT_RTADVD 0x40 /* set to accept IPv6 router advertisement on the interface */ -#define IFEF_DETACHING 0x80 /* Set when interface is detaching */ -#define IFEF_USEKPI 0x100 /* Set when interface is created through the KPIs */ +#define _IFEF_DETACHING 0x80 /* deprecated */ +#define IFEF_USEKPI 0x100 /* Set when interface is created through the KPIs */ #define IFEF_VLAN 0x200 /* interface has one or more vlans */ #define IFEF_BOND 0x400 /* interface is part of bond */ #define IFEF_ARPLL 0x800 /* ARP for IPv4LL addresses on this port */ #define IFEF_NOWINDOWSCALE 0x1000 /* Don't scale TCP window on iface */ #define IFEF_NOAUTOIPV6LL 0x2000 /* Interface IPv6 LinkLocal address not provided by kernel */ -#define IFEF_SENDLIST 0x10000000 /* Interface supports sending a list of packets */ -#define IFEF_REUSE 0x20000000 /* DLIL ifnet recycler, ifnet is not new */ -#define IFEF_INUSE 0x40000000 /* DLIL ifnet recycler, ifnet in use */ +#define IFEF_SERVICE_TRIGGERED 0x20000 /* interface is on-demand dynamically created/destroyed */ +#define IFEF_SENDLIST 0x10000000 /* Interface supports sending a list of packets */ +#define _IFEF_REUSE 0x20000000 /* deprecated */ +#define _IFEF_INUSE 0x40000000 /* deprecated */ #define IFEF_UPDOWNCHANGE 0x80000000 /* Interface's up/down state is changing */ /* @@ -177,6 +179,40 @@ struct if_clonereq32 { #endif /* KERNEL_PRIVATE */ +/* + * Capabilities that interfaces can advertise. 
+ * + * struct ifnet.if_capabilities + * contains the optional features & capabilities a particular interface + * supports (not only the driver but also the detected hw revision). + * Capabilities are defined by IFCAP_* below. + * struct ifnet.if_capenable + * contains the enabled (either by default or through ifconfig) optional + * features & capabilities on this interface. + * Capabilities are defined by IFCAP_* below. + * struct if_data.ifi_hwassist in IFNET_* form, defined in net/kpi_interface.h, + * contains the enabled optional features & capabilities that can be used + * individually per packet and are specified in the mbuf pkthdr.csum_flags + * field. IFCAP_* and IFNET_* do not match one to one and IFNET_* may be + * more detailed or differentiated than IFCAP_*. + * IFNET_* hwassist flags have corresponding CSUM_* in sys/mbuf.h + */ +#define IFCAP_RXCSUM 0x00001 /* can offload checksum on RX */ +#define IFCAP_TXCSUM 0x00002 /* can offload checksum on TX */ +#define IFCAP_VLAN_MTU 0x00004 /* VLAN-compatible MTU */ +#define IFCAP_VLAN_HWTAGGING 0x00008 /* hardware VLAN tag support */ +#define IFCAP_JUMBO_MTU 0x00010 /* 9000 byte MTU supported */ +#define IFCAP_TSO4 0x00020 /* can do TCP Segmentation Offload */ +#define IFCAP_TSO6 0x00040 /* can do TCP6 Segmentation Offload */ +#define IFCAP_LRO 0x00080 /* can do Large Receive Offload */ +#define IFCAP_AV 0x00100 /* can do 802.1 AV Bridging */ + +#define IFCAP_HWCSUM (IFCAP_RXCSUM | IFCAP_TXCSUM) +#define IFCAP_TSO (IFCAP_TSO4 | IFCAP_TSO6) + +#define IFCAP_VALID (IFCAP_HWCSUM | IFCAP_TSO | IFCAP_LRO | IFCAP_VLAN_MTU | \ + IFCAP_VLAN_HWTAGGING | IFCAP_JUMBO_MTU | IFCAP_AV) + #define IFQ_MAXLEN 50 #define IFNET_SLOWHZ 1 /* granularity is 1 second */ @@ -341,6 +377,7 @@ struct ifreq { struct ifkpi ifru_kpi; u_int32_t ifru_wake_flags; u_int32_t ifru_route_refcnt; + int ifru_cap[2]; } ifr_ifru; #define ifr_addr ifr_ifru.ifru_addr /* address */ #define ifr_dstaddr ifr_ifru.ifru_dstaddr /* other end of p-to-p link */ 
@@ -364,6 +401,8 @@ struct ifreq { #define ifr_kpi ifr_ifru.ifru_kpi #define ifr_wake_flags ifr_ifru.ifru_wake_flags /* wake capabilities of devive */ #define ifr_route_refcnt ifr_ifru.ifru_route_refcnt /* route references on interface */ +#define ifr_reqcap ifr_ifru.ifru_cap[0] /* requested capabilities */ +#define ifr_curcap ifr_ifru.ifru_cap[1] /* current capabilities */ }; #define _SIZEOF_ADDR_IFREQ(ifr) \ diff --git a/bsd/net/if_atm.h b/bsd/net/if_atm.h deleted file mode 100644 index dc4689b56..000000000 --- a/bsd/net/if_atm.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* $NetBSD: if_atm.h,v 1.7 1996/11/09 23:02:27 chuck Exp $ */ -/* $FreeBSD: src/sys/net/if_atm.h,v 1.4 1999/12/29 04:38:34 peter Exp $ */ - -/* - * - * Copyright (c) 1996 Charles D. Cranor and Washington University. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by Charles D. Cranor and - * Washington University. - * 4. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -/* - * net/if_atm.h - */ - -#ifdef KERNEL_PRIVATE -#if defined(__NetBSD__) || defined(__OpenBSD__) || defined(__bsdi__) -#define RTALLOC1(A,B) rtalloc1((A),(B)) -#elif defined(__FreeBSD__) || defined(__APPLE__) -#define RTALLOC1(A,B) rtalloc1((A),(B),0UL) -#endif -#endif /* KERNEL_PRIVATE */ - -/* - * pseudo header for packet transmission - */ -struct atm_pseudohdr { - u_int8_t atm_ph[4]; /* flags+VPI+VCI1(msb)+VCI2(lsb) */ -}; - -#define ATM_PH_FLAGS(X) ((X)->atm_ph[0]) -#define ATM_PH_VPI(X) ((X)->atm_ph[1]) -#define ATM_PH_VCI(X) ((((X)->atm_ph[2]) << 8) | ((X)->atm_ph[3])) -#define ATM_PH_SETVCI(X,V) { \ - (X)->atm_ph[2] = ((V) >> 8) & 0xff; \ - (X)->atm_ph[3] = ((V) & 0xff); \ -} - -#define ATM_PH_AAL5 0x01 /* use AAL5? (0 == aal0) */ -#define ATM_PH_LLCSNAP 0x02 /* use the LLC SNAP encoding (iff aal5) */ - -#define ATM_PH_DRIVER7 0x40 /* reserve for driver's use */ -#define ATM_PH_DRIVER8 0x80 /* reserve for driver's use */ - -#define ATMMTU 9180 /* ATM MTU size for IP */ - /* XXX: could be 9188 with LLC/SNAP according - to comer */ - -/* user's ioctl hook for raw atm mode */ -#define SIOCRAWATM _IOWR('a', 122, int) /* set driver's raw mode */ - -/* atm_pseudoioctl: turns on and off RX VCIs [for internal use only!] 
*/ -struct atm_pseudoioctl { - struct atm_pseudohdr aph; - void *rxhand; -}; -#define SIOCATMENA _IOWR('a', 123, struct atm_pseudoioctl) /* enable */ -#define SIOCATMDIS _IOWR('a', 124, struct atm_pseudoioctl) /* disable */ - - -/* - * XXX forget all the garbage in if_llc.h and do it the easy way - */ - -#define ATMLLC_HDR "\252\252\3\0\0\0" -struct atmllc { - u_int8_t llchdr[6]; /* aa.aa.03.00.00.00 */ - u_int8_t type[2]; /* "ethernet" type */ -}; - -/* ATM_LLC macros: note type code in host byte order */ -#define ATM_LLC_TYPE(X) (((X)->type[0] << 8) | ((X)->type[1])) -#define ATM_LLC_SETTYPE(X,V) { \ - (X)->type[1] = ((V) >> 8) & 0xff; \ - (X)->type[0] = ((V) & 0xff); \ -} - -#ifdef KERNEL_PRIVATE -void atm_ifattach(struct ifnet *); -void atm_input(struct ifnet *, struct atm_pseudohdr *, - struct mbuf *, void *); -int atm_output(struct ifnet *, struct mbuf *, struct sockaddr *, - struct rtentry *); -#endif /* KERNEL_PRIVATE */ - diff --git a/bsd/net/if_bond.c b/bsd/net/if_bond.c index fa07935a6..91790bd3a 100644 --- a/bsd/net/if_bond.c +++ b/bsd/net/if_bond.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2010 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2004-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -79,8 +79,6 @@ #include #include -extern void dlil_input_packet_list(struct ifnet *, struct mbuf *); - static struct ether_addr slow_proto_multicast = { IEEE8023AD_SLOW_PROTO_MULTICAST }; @@ -724,8 +722,9 @@ ifbond_release(ifbond_ref ifb) printf("ifbond_release(%s) removing multicast\n", ifb->ifb_name); } - (void)if_delmultiaddr(ifb->ifb_ifma_slow_proto, 0); - ifma_release(ifb->ifb_ifma_slow_proto); + (void) if_delmulti_anon(ifb->ifb_ifma_slow_proto->ifma_ifp, + ifb->ifb_ifma_slow_proto->ifma_addr); + IFMA_REMREF(ifb->ifb_ifma_slow_proto); } if (ifb->ifb_distributing_array != NULL) { FREE(ifb->ifb_distributing_array, M_BOND); @@ -885,10 +884,6 @@ if_siflladdr(struct ifnet * ifp, const struct ether_addr * ea_p) ifr.ifr_addr.sa_family = AF_UNSPEC; ifr.ifr_addr.sa_len = ETHER_ADDR_LEN; ether_addr_copy(ifr.ifr_addr.sa_data, ea_p); -#if 0 - snprintf(ifr.ifr_name, sizeof(ifr.ifr_name), "%s%d", ifnet_name(ifp), - ifnet_unit(ifp)); -#endif return (ifnet_ioctl(ifp, 0, SIOCSIFLLADDR, &ifr)); } @@ -909,9 +904,6 @@ bond_globals_create(lacp_system_priority sys_pri, TAILQ_INIT(&b->ifbond_list); b->system = *sys; b->system_priority = sys_pri; -#if 0 - b->verbose = 1; -#endif return (b); } @@ -936,7 +928,6 @@ bond_globals_init(void) for (i = 0; i < 4; i++) { char ifname[IFNAMSIZ+1]; snprintf(ifname, sizeof(ifname), "en%d", i); - /* XXX ifunit() needs to return a reference on the ifp */ ifp = ifunit(ifname); if (ifp != NULL) { break; @@ -1108,8 +1099,7 @@ ifbond_add_slow_proto_multicast(ifbond_ref ifb) sdl.sdl_nlen = 0; sdl.sdl_alen = sizeof(slow_proto_multicast); bcopy(&slow_proto_multicast, sdl.sdl_data, sizeof(slow_proto_multicast)); - error = if_addmulti(ifb->ifb_ifp, (struct sockaddr *)&sdl, - &ifma); + error = if_addmulti_anon(ifb->ifb_ifp, (struct sockaddr *)&sdl, &ifma); if (error == 0) { ifb->ifb_ifma_slow_proto = ifma; } @@ -1236,10 +1226,10 @@ bond_if_detach(struct ifnet * ifp) int error; error = ifnet_detach(ifp); - if 
(error) { - printf("bond_if_detach %s%d: ifnet_detach failed, %d\n", - ifnet_name(ifp), ifnet_unit(ifp), error); - } + if (error) { + printf("bond_if_detach %s%d: ifnet_detach failed, %d\n", + ifnet_name(ifp), ifnet_unit(ifp), error); + } return; } @@ -2571,24 +2561,10 @@ static int bond_set_promisc(__unused struct ifnet *ifp) { int error = 0; -#if 0 - ifbond_ref ifb = ifnet_softc(ifp); - - - if ((ifnet_flags(ifp) & IFF_PROMISC) != 0) { - if ((ifb->ifb_flags & IFBF_PROMISC) == 0) { - error = ifnet_set_promiscuous(ifb->ifb_p, 1); - if (error == 0) - ifb->ifb_flags |= IFBF_PROMISC; - } - } else { - if ((ifb->ifb_flags & IFBF_PROMISC) != 0) { - error = ifnet_set_promiscuous(ifb->ifb_p, 0); - if (error == 0) - ifb->ifb_flags &= ~IFBF_PROMISC; - } - } -#endif + /* + * The benefit of doing this currently does not warrant + * the added code complexity. Do nothing and return. + */ return (error); } @@ -2812,7 +2788,6 @@ bond_ioctl(struct ifnet *ifp, u_long cmd, void * data) switch (ibr.ibr_op) { case IF_BOND_OP_ADD_INTERFACE: case IF_BOND_OP_REMOVE_INTERFACE: - /* XXX ifunit() needs to return a reference on the ifp */ port_ifp = ifunit(ibr.ibr_ibru.ibru_if_name); if (port_ifp == NULL) { error = ENXIO; @@ -2947,23 +2922,16 @@ bond_if_free(struct ifnet * ifp) } static void -bond_event(struct ifnet * port_ifp, __unused protocol_family_t protocol, - const struct kev_msg * event) +bond_handle_event(struct ifnet * port_ifp, int event_code) { struct ifnet * bond_ifp = NULL; - int event_code = 0; ifbond_ref ifb; int old_distributing_count; bondport_ref p; struct media_info media_info = { 0, 0}; - if (event->vendor_code != KEV_VENDOR_APPLE - || event->kev_class != KEV_NETWORK_CLASS - || event->kev_subclass != KEV_DL_SUBCLASS) { - return; - } - switch (event->event_code) { - case KEV_DL_IF_DETACHING: + switch (event_code) { + case KEV_DL_IF_DETACHED: break; case KEV_DL_LINK_OFF: case KEV_DL_LINK_ON: @@ -2980,8 +2948,8 @@ bond_event(struct ifnet * port_ifp, __unused protocol_family_t 
protocol, } ifb = p->po_bond; old_distributing_count = ifb->ifb_distributing_count; - switch (event->event_code) { - case KEV_DL_IF_DETACHING: + switch (event_code) { + case KEV_DL_IF_DETACHED: bond_remove_interface(ifb, p->po_ifp); break; case KEV_DL_LINK_OFF: @@ -3042,6 +3010,37 @@ bond_event(struct ifnet * port_ifp, __unused protocol_family_t protocol, return; } +static void +bond_event(struct ifnet * port_ifp, __unused protocol_family_t protocol, + const struct kev_msg * event) +{ + int event_code; + + if (event->vendor_code != KEV_VENDOR_APPLE + || event->kev_class != KEV_NETWORK_CLASS + || event->kev_subclass != KEV_DL_SUBCLASS) { + return; + } + event_code = event->event_code; + switch (event_code) { + case KEV_DL_LINK_OFF: + case KEV_DL_LINK_ON: + /* we only care about link status changes */ + bond_handle_event(port_ifp, event_code); + break; + default: + break; + } + return; +} + +static errno_t +bond_detached(ifnet_t port_ifp, __unused protocol_family_t protocol) +{ + bond_handle_event(port_ifp, KEV_DL_IF_DETACHED); + return (0); +} + static void interface_link_event(struct ifnet * ifp, u_int32_t event_code) { @@ -3051,6 +3050,7 @@ interface_link_event(struct ifnet * ifp, u_int32_t event_code) char if_name[IFNAMSIZ]; } event; + bzero(&event, sizeof(event)); event.header.total_size = sizeof(event); event.header.vendor_code = KEV_VENDOR_APPLE; event.header.kev_class = KEV_NETWORK_CLASS; @@ -3082,6 +3082,7 @@ bond_attach_protocol(struct ifnet *ifp) bzero(&reg, sizeof(reg)); reg.input = bond_input; reg.event = bond_event; + reg.detached = bond_detached; error = ifnet_attach_protocol(ifp, PF_BOND, &reg); if (error) { diff --git a/bsd/net/if_bridge.c b/bsd/net/if_bridge.c new file mode 100644 index 000000000..fd546fa0e --- /dev/null +++ b/bsd/net/if_bridge.c @@ -0,0 +1,5138 @@ +/* $NetBSD: if_bridge.c,v 1.31 2005/06/01 19:45:34 jdc Exp $ */ +/* + * Copyright (c) 2004-2010 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Copyright 2001 Wasabi Systems, Inc. + * All rights reserved. + * + * Written by Jason R. Thorpe for Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed for the NetBSD Project by + * Wasabi Systems, Inc. + * 4. The name of Wasabi Systems, Inc. may not be used to endorse + * or promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 1999, 2000 Jason L. Wright (jason@thought.net) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * OpenBSD: if_bridge.c,v 1.60 2001/06/15 03:38:33 itojun Exp + */ + +/* + * Network interface bridge support. + * + * TODO: + * + * - Currently only supports Ethernet-like interfaces (Ethernet, + * 802.11, VLANs on Ethernet, etc.) Figure out a nice way + * to bridge other types of interfaces (FDDI-FDDI, and maybe + * consider heterogenous bridges). + */ + +#include +//__FBSDID("$FreeBSD$"); + +//#include "opt_inet.h" +//#include "opt_inet6.h" +//#include "opt_carp.h" + +#define BRIDGE_DEBUG 1 +#ifndef BRIDGE_DEBUG +#define BRIDGE_DEBUG 0 +#endif /* BRIDGE_DEBUG */ + +#include +#include +#include +#include +#include +#include +#include /* for net/if.h */ +#include +//#include /* string functions */ +#include +#include +#include +#include +//#include +//#include +//#include +#include +#include +//#include +#include + +#include + +#include + +#include + +#if NBPFILTER > 0 +#include +#endif +#include +//#include +#include +#include +#include +//#include + +#include /* for struct arpcom */ +#include +#include +#include +#include +#ifdef INET6 +#include +#include +#endif +#ifdef DEV_CARP +#include +#endif +//#include +#include /* for struct arpcom */ +#include +#include +#include +#include + +#include +#include +#include + +#include +#ifdef PFIL_HOOKS +#include +#include +#endif /* PFIL_HOOKS */ + +#if BRIDGE_DEBUG + +#define BR_LCKDBG_MAX 4 + +#define BRIDGE_LOCK(_sc) bridge_lock(_sc) +#define BRIDGE_UNLOCK(_sc) bridge_unlock(_sc) +#define 
BRIDGE_LOCK_ASSERT(_sc) lck_mtx_assert((_sc)->sc_mtx, LCK_MTX_ASSERT_OWNED) +#define BRIDGE_LOCK2REF(_sc, _err) _err = bridge_lock2ref(_sc) +#define BRIDGE_UNREF(_sc) bridge_unref(_sc) +#define BRIDGE_XLOCK(_sc) bridge_xlock(_sc) +#define BRIDGE_XDROP(_sc) bridge_xdrop(_sc) + +#else /* BRIDGE_DEBUG */ + +#define BRIDGE_LOCK(_sc) lck_mtx_lock((_sc)->sc_mtx) +#define BRIDGE_UNLOCK(_sc) lck_mtx_unlock((_sc)->sc_mtx) +#define BRIDGE_LOCK_ASSERT(_sc) lck_mtx_assert((_sc)->sc_mtx, LCK_MTX_ASSERT_OWNED) +#define BRIDGE_LOCK2REF(_sc, _err) do { \ + lck_mtx_assert((_sc)->sc_mtx, LCK_MTX_ASSERT_OWNED); \ + if ((_sc)->sc_iflist_xcnt > 0) \ + (_err) = EBUSY; \ + else \ + (_sc)->sc_iflist_ref++; \ + lck_mtx_unlock((_sc)->sc_mtx); \ +} while (0) +#define BRIDGE_UNREF(_sc) do { \ + lck_mtx_lock((_sc)->sc_mtx); \ + (_sc)->sc_iflist_ref--; \ + if (((_sc)->sc_iflist_xcnt > 0) && ((_sc)->sc_iflist_ref == 0)) { \ + lck_mtx_unlock((_sc)->sc_mtx); \ + wakeup(&(_sc)->sc_cv); \ + } else \ + lck_mtx_unlock((_sc)->sc_mtx); \ +} while (0) +#define BRIDGE_XLOCK(_sc) do { \ + lck_mtx_assert((_sc)->sc_mtx, LCK_MTX_ASSERT_OWNED); \ + (_sc)->sc_iflist_xcnt++; \ + while ((_sc)->sc_iflist_ref > 0) \ + msleep(&(_sc)->sc_cv, (_sc)->sc_mtx, PZERO, "BRIDGE_XLOCK", NULL); \ +} while (0) +#define BRIDGE_XDROP(_sc) do { \ + lck_mtx_assert((_sc)->sc_mtx, LCK_MTX_ASSERT_OWNED); \ + (_sc)->sc_iflist_xcnt--; \ +} while (0) + +#endif /* BRIDGE_DEBUG */ + +#if NBPFILTER > 0 +#define BRIDGE_BPF_MTAP_INPUT(sc, m) \ + if (sc->sc_bpf_input) \ + bridge_bpf_input(sc->sc_ifp, m) +#else /* NBPFILTER */ +#define BRIDGE_BPF_MTAP_INPUT(ifp, m) +#endif /* NBPFILTER */ + +/* + * Size of the route hash table. Must be a power of two. 
+ */ +/* APPLE MODIFICATION - per Wasabi performance improvement, change the hash table size */ +#if 0 +#ifndef BRIDGE_RTHASH_SIZE +#define BRIDGE_RTHASH_SIZE 1024 +#endif +#else +#ifndef BRIDGE_RTHASH_SIZE +#define BRIDGE_RTHASH_SIZE 256 +#endif +#endif + +/* APPLE MODIFICATION - support for HW checksums */ +#if APPLE_BRIDGE_HWCKSUM_SUPPORT +#include +#include +#endif + +#define BRIDGE_RTHASH_MASK (BRIDGE_RTHASH_SIZE - 1) + +/* + * Maximum number of addresses to cache. + */ +#ifndef BRIDGE_RTABLE_MAX +#define BRIDGE_RTABLE_MAX 100 +#endif + + +/* + * Timeout (in seconds) for entries learned dynamically. + */ +#ifndef BRIDGE_RTABLE_TIMEOUT +#define BRIDGE_RTABLE_TIMEOUT (20 * 60) /* same as ARP */ +#endif + +/* + * Number of seconds between walks of the route list. + */ +#ifndef BRIDGE_RTABLE_PRUNE_PERIOD +#define BRIDGE_RTABLE_PRUNE_PERIOD (5 * 60) +#endif + +/* + * List of capabilities to possibly mask on the member interface. + */ +#define BRIDGE_IFCAPS_MASK (IFCAP_TOE|IFCAP_TSO|IFCAP_TXCSUM) +/* + * List of capabilities to disable on the member interface. + */ +#define BRIDGE_IFCAPS_STRIP IFCAP_LRO + +/* + * Bridge interface list entry. + */ +struct bridge_iflist { + TAILQ_ENTRY(bridge_iflist) bif_next; + struct ifnet *bif_ifp; /* member if */ + struct bstp_port bif_stp; /* STP state */ + uint32_t bif_flags; /* member if flags */ + int bif_savedcaps; /* saved capabilities */ + uint32_t bif_addrmax; /* max # of addresses */ + uint32_t bif_addrcnt; /* cur. # of addresses */ + uint32_t bif_addrexceeded;/* # of address violations */ + + interface_filter_t bif_iff_ref; + struct bridge_softc *bif_sc; + char bif_promisc; /* promiscuous mode set */ + char bif_proto_attached; /* protocol attached */ + char bif_filter_attached; /* interface filter attached */ +}; + +/* + * Bridge route node. 
+ */ +struct bridge_rtnode { + LIST_ENTRY(bridge_rtnode) brt_hash; /* hash table linkage */ + LIST_ENTRY(bridge_rtnode) brt_list; /* list linkage */ + struct bridge_iflist *brt_dst; /* destination if */ + unsigned long brt_expire; /* expiration time */ + uint8_t brt_flags; /* address flags */ + uint8_t brt_addr[ETHER_ADDR_LEN]; + uint16_t brt_vlan; /* vlan id */ + +}; +#define brt_ifp brt_dst->bif_ifp + +/* + * Software state for each bridge. + */ +struct bridge_softc { + struct ifnet *sc_ifp; /* make this an interface */ + LIST_ENTRY(bridge_softc) sc_list; + lck_mtx_t *sc_mtx; + void *sc_cv; + uint32_t sc_brtmax; /* max # of addresses */ + uint32_t sc_brtcnt; /* cur. # of addresses */ + uint32_t sc_brttimeout; /* rt timeout in seconds */ + uint32_t sc_iflist_ref; /* refcount for sc_iflist */ + uint32_t sc_iflist_xcnt; /* refcount for sc_iflist */ + TAILQ_HEAD(, bridge_iflist) sc_iflist; /* member interface list */ + LIST_HEAD(, bridge_rtnode) *sc_rthash; /* our forwarding table */ + LIST_HEAD(, bridge_rtnode) sc_rtlist; /* list version of above */ + uint32_t sc_rthash_key; /* key for hash */ + TAILQ_HEAD(, bridge_iflist) sc_spanlist; /* span ports list */ + struct bstp_state sc_stp; /* STP state */ + uint32_t sc_brtexceeded; /* # of cache drops */ + uint32_t sc_filter_flags; /* ipf and flags */ + + char sc_if_xname[IFNAMSIZ]; + bpf_packet_func sc_bpf_input; + bpf_packet_func sc_bpf_output; + u_int32_t sc_flags; + +#if BRIDGE_DEBUG + void *lock_lr[BR_LCKDBG_MAX]; /* locking calling history */ + int next_lock_lr; + void *unlock_lr[BR_LCKDBG_MAX]; /* unlocking caller history */ + int next_unlock_lr; +#endif /* BRIDGE_DEBUG */ +}; + +#define SCF_DETACHING 0x1 + +static lck_mtx_t *bridge_list_mtx; +//eventhandler_tag bridge_detach_cookie = NULL; + +int bridge_rtable_prune_period = BRIDGE_RTABLE_PRUNE_PERIOD; + +static zone_t bridge_rtnode_pool = NULL; + +static int bridge_clone_create(struct if_clone *, uint32_t, void *); +static int bridge_clone_destroy(struct ifnet 
*); + +static errno_t bridge_ioctl(struct ifnet *, u_long, void *); +#if HAS_IF_CAP +static void bridge_mutecaps(struct bridge_softc *); +static void bridge_set_ifcap(struct bridge_softc *, struct bridge_iflist *, + int); +#endif +__private_extern__ void bridge_ifdetach(struct bridge_iflist *, struct ifnet *); +static int bridge_init(struct ifnet *); +#if HAS_BRIDGE_DUMMYNET +static void bridge_dummynet(struct mbuf *, struct ifnet *); +#endif +static void bridge_stop(struct ifnet *, int); +static errno_t bridge_start(struct ifnet *, struct mbuf *); +__private_extern__ errno_t bridge_input(struct ifnet *, struct mbuf *, void *); +#if BRIDGE_MEMBER_OUT_FILTER +static errno_t bridge_iff_output(void *, ifnet_t , protocol_family_t , mbuf_t *); +static int bridge_output(struct ifnet *, struct mbuf *, struct sockaddr *, + struct rtentry *); +#endif +static void bridge_enqueue(struct bridge_softc *, struct ifnet *, + struct mbuf *); +static void bridge_rtdelete(struct bridge_softc *, struct ifnet *ifp, int); + +static void bridge_forward(struct bridge_softc *, struct bridge_iflist *, + struct mbuf *m); + +static void bridge_timer(void *); + +static void bridge_broadcast(struct bridge_softc *, struct ifnet *, + struct mbuf *, int); +static void bridge_span(struct bridge_softc *, struct mbuf *); + +static int bridge_rtupdate(struct bridge_softc *, const uint8_t *, + uint16_t, struct bridge_iflist *, int, uint8_t); +static struct ifnet *bridge_rtlookup(struct bridge_softc *, const uint8_t *, + uint16_t); +static void bridge_rttrim(struct bridge_softc *); +static void bridge_rtage(struct bridge_softc *); +static void bridge_rtflush(struct bridge_softc *, int); +static int bridge_rtdaddr(struct bridge_softc *, const uint8_t *, + uint16_t); + +static int bridge_rtable_init(struct bridge_softc *); +static void bridge_rtable_fini(struct bridge_softc *); + +static int bridge_rtnode_addr_cmp(const uint8_t *, const uint8_t *); +static struct bridge_rtnode *bridge_rtnode_lookup(struct 
bridge_softc *, + const uint8_t *, uint16_t); +static int bridge_rtnode_insert(struct bridge_softc *, + struct bridge_rtnode *); +static void bridge_rtnode_destroy(struct bridge_softc *, + struct bridge_rtnode *); +static void bridge_rtable_expire(struct ifnet *, int); +static void bridge_state_change(struct ifnet *, int); + +static struct bridge_iflist *bridge_lookup_member(struct bridge_softc *, + const char *name); +static struct bridge_iflist *bridge_lookup_member_if(struct bridge_softc *, + struct ifnet *ifp); +static void bridge_delete_member(struct bridge_softc *, + struct bridge_iflist *, int); +static void bridge_delete_span(struct bridge_softc *, + struct bridge_iflist *); + +static int bridge_ioctl_add(struct bridge_softc *, void *); +static int bridge_ioctl_del(struct bridge_softc *, void *); +static int bridge_ioctl_gifflags(struct bridge_softc *, void *); +static int bridge_ioctl_sifflags(struct bridge_softc *, void *); +static int bridge_ioctl_scache(struct bridge_softc *, void *); +static int bridge_ioctl_gcache(struct bridge_softc *, void *); +static int bridge_ioctl_gifs32(struct bridge_softc *, void *); +static int bridge_ioctl_gifs64(struct bridge_softc *, void *); +static int bridge_ioctl_rts32(struct bridge_softc *, void *); +static int bridge_ioctl_rts64(struct bridge_softc *, void *); +static int bridge_ioctl_saddr32(struct bridge_softc *, void *); +static int bridge_ioctl_saddr64(struct bridge_softc *, void *); +static int bridge_ioctl_sto(struct bridge_softc *, void *); +static int bridge_ioctl_gto(struct bridge_softc *, void *); +static int bridge_ioctl_daddr32(struct bridge_softc *, void *); +static int bridge_ioctl_daddr64(struct bridge_softc *, void *); +static int bridge_ioctl_flush(struct bridge_softc *, void *); +static int bridge_ioctl_gpri(struct bridge_softc *, void *); +static int bridge_ioctl_spri(struct bridge_softc *, void *); +static int bridge_ioctl_ght(struct bridge_softc *, void *); +static int bridge_ioctl_sht(struct 
bridge_softc *, void *); +static int bridge_ioctl_gfd(struct bridge_softc *, void *); +static int bridge_ioctl_sfd(struct bridge_softc *, void *); +static int bridge_ioctl_gma(struct bridge_softc *, void *); +static int bridge_ioctl_sma(struct bridge_softc *, void *); +static int bridge_ioctl_sifprio(struct bridge_softc *, void *); +static int bridge_ioctl_sifcost(struct bridge_softc *, void *); +static int bridge_ioctl_sifmaxaddr(struct bridge_softc *, void *); +static int bridge_ioctl_addspan(struct bridge_softc *, void *); +static int bridge_ioctl_delspan(struct bridge_softc *, void *); +static int bridge_ioctl_gbparam32(struct bridge_softc *, void *); +static int bridge_ioctl_gbparam64(struct bridge_softc *, void *); +static int bridge_ioctl_grte(struct bridge_softc *, void *); +static int bridge_ioctl_gifsstp32(struct bridge_softc *, void *); +static int bridge_ioctl_gifsstp64(struct bridge_softc *, void *); +static int bridge_ioctl_sproto(struct bridge_softc *, void *); +static int bridge_ioctl_stxhc(struct bridge_softc *, void *); +static int bridge_ioctl_purge(struct bridge_softc *sc, void *arg); +static int bridge_ioctl_gfilt(struct bridge_softc *, void *); +static int bridge_ioctl_sfilt(struct bridge_softc *, void *); +#ifdef PFIL_HOOKS +static int bridge_pfil(struct mbuf **, struct ifnet *, struct ifnet *, + int); +static int bridge_ip_checkbasic(struct mbuf **mp); +#ifdef INET6 +static int bridge_ip6_checkbasic(struct mbuf **mp); +#endif /* INET6 */ +static int bridge_fragment(struct ifnet *, struct mbuf *, + struct ether_header *, int, struct llc *); +#endif /* PFIL_HOOKS */ + +static errno_t bridge_set_bpf_tap(ifnet_t ifn, bpf_tap_mode mode, bpf_packet_func bpf_callback); +__private_extern__ errno_t bridge_bpf_input(ifnet_t ifp, struct mbuf *m); +__private_extern__ errno_t bridge_bpf_output(ifnet_t ifp, struct mbuf *m); + +static void bridge_detach(ifnet_t ifp); + +#define m_copypacket(m, how) m_copym(m, 0, M_COPYALL, how) + +/* The default bridge 
vlan is 1 (IEEE 802.1Q-2003 Table 9-2) */ +#define VLANTAGOF(_m) 0 + +static struct bstp_cb_ops bridge_ops = { + .bcb_state = bridge_state_change, + .bcb_rtage = bridge_rtable_expire +}; + +SYSCTL_DECL(_net_link); +SYSCTL_NODE(_net_link, IFT_BRIDGE, bridge, CTLFLAG_RW, 0, "Bridge"); + +#if defined(PFIL_HOOKS) +static int pfil_onlyip = 1; /* only pass IP[46] packets when pfil is enabled */ +static int pfil_bridge = 1; /* run pfil hooks on the bridge interface */ +static int pfil_member = 1; /* run pfil hooks on the member interface */ +static int pfil_ipfw = 0; /* layer2 filter with ipfw */ +static int pfil_ipfw_arp = 0; /* layer2 filter with ipfw */ +static int pfil_local_phys = 0; /* run pfil hooks on the physical interface for + locally destined packets */ +SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_onlyip, CTLFLAG_RW, + &pfil_onlyip, 0, "Only pass IP packets when pfil is enabled"); +SYSCTL_INT(_net_link_bridge, OID_AUTO, ipfw_arp, CTLFLAG_RW, + &pfil_ipfw_arp, 0, "Filter ARP packets through IPFW layer2"); +SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_bridge, CTLFLAG_RW, + &pfil_bridge, 0, "Packet filter on the bridge interface"); +SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_member, CTLFLAG_RW, + &pfil_member, 0, "Packet filter on the member interface"); +SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_local_phys, CTLFLAG_RW, + &pfil_local_phys, 0, + "Packet filter on the physical interface for locally destined packets"); +#endif /* PFIL_HOOKS */ + +static int log_stp = 0; /* log STP state changes */ +SYSCTL_INT(_net_link_bridge, OID_AUTO, log_stp, CTLFLAG_RW, + &log_stp, 0, "Log STP state changes"); + +struct bridge_control { + int (*bc_func)(struct bridge_softc *, void *); + unsigned int bc_argsize; + unsigned int bc_flags; +}; + +#define BC_F_COPYIN 0x01 /* copy arguments in */ +#define BC_F_COPYOUT 0x02 /* copy arguments out */ +#define BC_F_SUSER 0x04 /* do super-user check */ + +static const struct bridge_control bridge_control_table32[] = { + { 
bridge_ioctl_add, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_del, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gifflags, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_COPYOUT }, + { bridge_ioctl_sifflags, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_scache, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_gcache, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + + { bridge_ioctl_gifs32, sizeof(struct ifbifconf32), + BC_F_COPYIN|BC_F_COPYOUT }, + { bridge_ioctl_rts32, sizeof(struct ifbaconf32), + BC_F_COPYIN|BC_F_COPYOUT }, + + { bridge_ioctl_saddr32, sizeof(struct ifbareq32), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_sto, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_gto, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + + { bridge_ioctl_daddr32, sizeof(struct ifbareq32), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_flush, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gpri, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_spri, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_ght, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_sht, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gfd, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_sfd, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gma, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_sma, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_sifprio, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_sifcost, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gfilt, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_sfilt, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_purge, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_addspan, 
sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_delspan, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gbparam32, sizeof(struct ifbropreq32), + BC_F_COPYOUT }, + + { bridge_ioctl_grte, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + + { bridge_ioctl_gifsstp32, sizeof(struct ifbpstpconf32), + BC_F_COPYIN|BC_F_COPYOUT }, + + { bridge_ioctl_sproto, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_stxhc, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_sifmaxaddr, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, +}; + +static const struct bridge_control bridge_control_table64[] = { + { bridge_ioctl_add, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_del, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gifflags, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_COPYOUT }, + { bridge_ioctl_sifflags, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_scache, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_gcache, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + + { bridge_ioctl_gifs64, sizeof(struct ifbifconf64), + BC_F_COPYIN|BC_F_COPYOUT }, + { bridge_ioctl_rts64, sizeof(struct ifbaconf64), + BC_F_COPYIN|BC_F_COPYOUT }, + + { bridge_ioctl_saddr64, sizeof(struct ifbareq64), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_sto, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_gto, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + + { bridge_ioctl_daddr64, sizeof(struct ifbareq64), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_flush, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gpri, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_spri, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_ght, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_sht, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gfd, 
sizeof(struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_sfd, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gma, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_sma, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_sifprio, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_sifcost, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gfilt, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_sfilt, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_purge, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_addspan, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_delspan, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gbparam64, sizeof(struct ifbropreq64), + BC_F_COPYOUT }, + + { bridge_ioctl_grte, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + + { bridge_ioctl_gifsstp64, sizeof(struct ifbpstpconf64), + BC_F_COPYIN|BC_F_COPYOUT }, + + { bridge_ioctl_sproto, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_stxhc, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_sifmaxaddr, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, +}; + +static const unsigned int bridge_control_table_size = +sizeof(bridge_control_table32) / sizeof(bridge_control_table32[0]); + +static LIST_HEAD(, bridge_softc) bridge_list = LIST_HEAD_INITIALIZER(bridge_list); + +static lck_grp_t *bridge_lock_grp = NULL; +static lck_attr_t *bridge_lock_attr = NULL; + +static if_clone_t bridge_cloner = NULL; + +__private_extern__ int _if_brige_debug = 0; + +SYSCTL_INT(_net_link_bridge, OID_AUTO, debug, CTLFLAG_RW, + &_if_brige_debug, 0, "Bridge debug"); + +#if BRIDGE_DEBUG + +static void printf_ether_header(struct ether_header *eh); +static void printf_mbuf_data(mbuf_t m, size_t offset, size_t len); +static void printf_mbuf_pkthdr(mbuf_t m, const char *prefix, const char 
*suffix); +static void printf_mbuf(mbuf_t m, const char *prefix, const char *suffix); +static void link_print(struct sockaddr_dl * dl_p); + +static void bridge_lock(struct bridge_softc *); +static void bridge_unlock(struct bridge_softc *); +static int bridge_lock2ref(struct bridge_softc *); +static void bridge_unref(struct bridge_softc *); +static void bridge_xlock(struct bridge_softc *); +static void bridge_xdrop(struct bridge_softc *); + +static void bridge_lock(struct bridge_softc *sc) +{ + void *lr_saved = __builtin_return_address(0); + + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_NOTOWNED); + + lck_mtx_lock(sc->sc_mtx); + + sc->lock_lr[sc->next_lock_lr] = lr_saved; + sc->next_lock_lr = (sc->next_lock_lr+1) % SO_LCKDBG_MAX; +} + +static void bridge_unlock(struct bridge_softc *sc) +{ + void *lr_saved = __builtin_return_address(0); + + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_OWNED); + + sc->unlock_lr[sc->next_unlock_lr] = lr_saved; + sc->next_unlock_lr = (sc->next_unlock_lr+1) % SO_LCKDBG_MAX; + + lck_mtx_unlock(sc->sc_mtx); +} + +static int bridge_lock2ref(struct bridge_softc *sc) +{ + int error = 0; + void *lr_saved = __builtin_return_address(0); + + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_OWNED); + + if (sc->sc_iflist_xcnt > 0) + error = EBUSY; + else + sc->sc_iflist_ref++; + + sc->unlock_lr[sc->next_unlock_lr] = lr_saved; + sc->next_unlock_lr = (sc->next_unlock_lr+1) % SO_LCKDBG_MAX; + lck_mtx_unlock(sc->sc_mtx); + + return error; +} + +static void bridge_unref(struct bridge_softc *sc) +{ + void *lr_saved = __builtin_return_address(0); + + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_NOTOWNED); + + lck_mtx_lock(sc->sc_mtx); + sc->lock_lr[sc->next_lock_lr] = lr_saved; + sc->next_lock_lr = (sc->next_lock_lr+1) % SO_LCKDBG_MAX; + + sc->sc_iflist_ref--; + + sc->unlock_lr[sc->next_unlock_lr] = lr_saved; + sc->next_unlock_lr = (sc->next_unlock_lr+1) % SO_LCKDBG_MAX; + if ((sc->sc_iflist_xcnt > 0) && (sc->sc_iflist_ref == 0)) { + lck_mtx_unlock(sc->sc_mtx); + 
wakeup(&sc->sc_cv); + } else + lck_mtx_unlock(sc->sc_mtx); +} + +static void bridge_xlock(struct bridge_softc *sc) +{ + void *lr_saved = __builtin_return_address(0); + + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_OWNED); + + sc->sc_iflist_xcnt++; + while (sc->sc_iflist_ref > 0) { + sc->unlock_lr[sc->next_unlock_lr] = lr_saved; + sc->next_unlock_lr = (sc->next_unlock_lr+1) % SO_LCKDBG_MAX; + + msleep(&sc->sc_cv, sc->sc_mtx, PZERO, "BRIDGE_XLOCK", NULL); + + sc->lock_lr[sc->next_lock_lr] = lr_saved; + sc->next_lock_lr = (sc->next_lock_lr+1) % SO_LCKDBG_MAX; + } +} + +static void bridge_xdrop(struct bridge_softc *sc) +{ + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_OWNED); + + sc->sc_iflist_xcnt--; +} + +void +printf_mbuf_pkthdr(mbuf_t m, const char *prefix, const char *suffix) +{ + if (m) + printf("%spktlen: %u rcvif: %p header: %p nextpkt: %p%s", + prefix ? prefix : "", + (unsigned int)mbuf_pkthdr_len(m), mbuf_pkthdr_rcvif(m), mbuf_pkthdr_header(m), mbuf_nextpkt(m), + suffix ? suffix : ""); + else + printf("%s%s\n", prefix, suffix); +} + +void +printf_mbuf(mbuf_t m, const char *prefix, const char *suffix) +{ + if (m) { + printf("%s%p type: %u flags: 0x%x len: %u data: %p maxlen: %u datastart: %p next: %p%s", + prefix ? prefix : "", + m, mbuf_type(m), mbuf_flags(m), (unsigned int)mbuf_len(m), mbuf_data(m), + (unsigned int)mbuf_maxlen(m), mbuf_datastart(m), mbuf_next(m), + !suffix || (mbuf_flags(m) & MBUF_PKTHDR) ? "" : suffix); + if ((mbuf_flags(m) & MBUF_PKTHDR)) + printf_mbuf_pkthdr(m, " ", suffix); + } else + printf("%s%s\n", prefix, suffix); +} + +void +printf_mbuf_data(mbuf_t m, size_t offset, size_t len) +{ + mbuf_t n; + size_t i, j; + size_t pktlen, mlen, maxlen; + unsigned char *ptr; + + pktlen = mbuf_pkthdr_len(m); + + if (offset > pktlen) + return; + + maxlen = (pktlen - offset > len) ? 
len : pktlen; + n = m; + mlen = mbuf_len(n); + ptr = mbuf_data(n); + for (i = 0, j = 0; i < maxlen; i++, j++) { + if (j >= mlen) { + n = mbuf_next(n); + if (n == 0) + break; + ptr = mbuf_data(n); + mlen = mbuf_len(n); + j = 0; + } + if (i >= offset) { + printf("%02x%s", ptr[j], i % 2 ? " " : ""); + } + } + return; +} + +static void +printf_ether_header(struct ether_header *eh) +{ + printf("%02x:%02x:%02x:%02x:%02x:%02x > %02x:%02x:%02x:%02x:%02x:%02x 0x%04x ", + eh->ether_shost[0], eh->ether_shost[1], eh->ether_shost[2], + eh->ether_shost[3], eh->ether_shost[4], eh->ether_shost[5], + eh->ether_dhost[0], eh->ether_dhost[1], eh->ether_dhost[2], + eh->ether_dhost[3], eh->ether_dhost[4], eh->ether_dhost[5], + eh->ether_type); +} + +static void +link_print(struct sockaddr_dl * dl_p) +{ + int i; + +#if 1 + printf("sdl len %d index %d family %d type 0x%x nlen %d alen %d" + " slen %d addr ", dl_p->sdl_len, + dl_p->sdl_index, dl_p->sdl_family, dl_p->sdl_type, + dl_p->sdl_nlen, dl_p->sdl_alen, dl_p->sdl_slen); +#endif + for (i = 0; i < dl_p->sdl_alen; i++) + printf("%s%x", i ? ":" : "", + (CONST_LLADDR(dl_p))[i]); + printf("\n"); + return; +} + +#endif /* BRIDGE_DEBUG */ + +/* + * bridgeattach: + * + * Pseudo-device attach routine. 
+ */ +__private_extern__ int +bridgeattach(__unused int n) +{ + int error; + lck_grp_attr_t *lck_grp_attr = NULL; + struct ifnet_clone_params ifnet_clone_params; + + bridge_rtnode_pool = zinit(sizeof(struct bridge_rtnode), 1024 * sizeof(struct bridge_rtnode), + 0, "bridge_rtnode"); + zone_change(bridge_rtnode_pool, Z_CALLERACCT, FALSE); + + lck_grp_attr = lck_grp_attr_alloc_init(); + + bridge_lock_grp = lck_grp_alloc_init("if_bridge", lck_grp_attr); + + bridge_lock_attr = lck_attr_alloc_init(); + +#if BRIDGE_DEBUG + lck_attr_setdebug(bridge_lock_attr); +#endif + + bridge_list_mtx = lck_mtx_alloc_init(bridge_lock_grp, bridge_lock_attr); + + // can free the attributes once we've allocated the group lock + lck_grp_attr_free(lck_grp_attr); + + LIST_INIT(&bridge_list); + + bstp_sys_init(); + + ifnet_clone_params.ifc_name = "bridge"; + ifnet_clone_params.ifc_create = bridge_clone_create; + ifnet_clone_params.ifc_destroy = bridge_clone_destroy; + + error = ifnet_clone_attach(&ifnet_clone_params, &bridge_cloner); + if (error != 0) + printf("bridgeattach: ifnet_clone_attach failed %d\n", error); + + return error; +} + +#if defined(PFIL_HOOKS) +/* + * handler for net.link.bridge.pfil_ipfw + */ +static int +sysctl_pfil_ipfw SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1,arg2) + int enable = pfil_ipfw; + int error; + + error = sysctl_handle_int(oidp, &enable, 0, req); + enable = (enable) ? 1 : 0; + + if (enable != pfil_ipfw) { + pfil_ipfw = enable; + + /* + * Disable pfil so that ipfw doesnt run twice, if the user + * really wants both then they can re-enable pfil_bridge and/or + * pfil_member. Also allow non-ip packets as ipfw can filter by + * layer2 type. 
+ */ + if (pfil_ipfw) { + pfil_onlyip = 0; + pfil_bridge = 0; + pfil_member = 0; + } + } + + return (error); +} +SYSCTL_PROC(_net_link_bridge, OID_AUTO, ipfw, CTLTYPE_INT|CTLFLAG_RW, + &pfil_ipfw, 0, &sysctl_pfil_ipfw, "I", "Layer2 filter with IPFW"); +#endif /* PFIL_HOOKS */ + +/* + * bridge_clone_create: + * + * Create a new bridge instance. + */ +static int +bridge_clone_create(struct if_clone *ifc, uint32_t unit, __unused void *params) +{ + struct ifnet *ifp = NULL; + struct bridge_softc *sc; + u_char eaddr[6]; + struct ifnet_init_params init_params; + errno_t error = 0; + uint32_t sdl_buffer[offsetof(struct sockaddr_dl, sdl_data) + IFNAMSIZ + ETHER_ADDR_LEN]; + struct sockaddr_dl *sdl = (struct sockaddr_dl *)sdl_buffer; + + sc = _MALLOC(sizeof(*sc), M_DEVBUF, M_WAITOK); + memset(sc, 0, sizeof(*sc)); + + sc->sc_mtx = lck_mtx_alloc_init(bridge_lock_grp, bridge_lock_attr); + sc->sc_brtmax = BRIDGE_RTABLE_MAX; + sc->sc_brttimeout = BRIDGE_RTABLE_TIMEOUT; + sc->sc_filter_flags = IFBF_FILT_DEFAULT; +#ifndef BRIDGE_IPF + /* + * For backwards compatibility with previous behaviour... + * Switch off filtering on the bridge itself if BRIDGE_IPF is + * not defined. + */ + sc->sc_filter_flags &= ~IFBF_FILT_USEIPF; +#endif + + /* Initialize our routing table. 
*/ + error = bridge_rtable_init(sc); + if (error != 0) { + printf("bridge_clone_create: bridge_rtable_init failed %d\n", error); + goto done; + } + + TAILQ_INIT(&sc->sc_iflist); + TAILQ_INIT(&sc->sc_spanlist); + + /* use the interface name as the unique id for ifp recycle */ + snprintf(sc->sc_if_xname, sizeof(sc->sc_if_xname), "%s%d", + ifc->ifc_name, unit); + memset(&init_params, 0, sizeof(struct ifnet_init_params)); + init_params.uniqueid = sc->sc_if_xname; + init_params.uniqueid_len = strlen(sc->sc_if_xname); + init_params.name = ifc->ifc_name; + init_params.unit = unit; + init_params.family = IFNET_FAMILY_ETHERNET; + init_params.type = IFT_BRIDGE; + init_params.output = bridge_start; + init_params.demux = ether_demux; + init_params.add_proto = ether_add_proto; + init_params.del_proto = ether_del_proto; + init_params.check_multi = ether_check_multi; + init_params.framer = ether_frameout; + init_params.softc = sc; + init_params.ioctl = bridge_ioctl; + init_params.set_bpf_tap = bridge_set_bpf_tap; + init_params.detach = bridge_detach; + init_params.broadcast_addr = etherbroadcastaddr; + init_params.broadcast_len = ETHER_ADDR_LEN; + error = ifnet_allocate(&init_params, &ifp); + if (error != 0) { + printf("bridge_clone_create: ifnet_allocate failed %d\n", error); + goto done; + } + sc->sc_ifp = ifp; + + error = ifnet_set_mtu(ifp, ETHERMTU); + if (error != 0) { + printf("bridge_clone_create: ifnet_set_mtu failed %d\n", error); + goto done; + } + error = ifnet_set_addrlen(ifp, ETHER_ADDR_LEN); + if (error != 0) { + printf("bridge_clone_create: ifnet_set_addrlen failed %d\n", error); + goto done; + } + error = ifnet_set_baudrate(ifp, 10000000) ; // XXX: this is what IONetworking does + if (error != 0) { + printf("bridge_clone_create: ifnet_set_baudrate failed %d\n", error); + goto done; + } + error = ifnet_set_hdrlen(ifp, ETHER_HDR_LEN); + if (error != 0) { + printf("bridge_clone_create: ifnet_set_hdrlen failed %d\n", error); + goto done; + } + error = 
ifnet_set_flags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_NOTRAILERS | IFF_MULTICAST, + 0xffff); + if (error != 0) { + printf("bridge_clone_create: ifnet_set_flags failed %d\n", error); + goto done; + } + +#if 0 + /* + * Generate a random ethernet address with a locally administered + * address. + * + * Since we are using random ethernet addresses for the bridge, it is + * possible that we might have address collisions, so make sure that + * this hardware address isn't already in use on another bridge. + */ + { + int retry; + + for (retry = 1; retry != 0;) { + struct ifnet *bifp; + struct bridge_softc *sc2; + + read_random(eaddr, ETHER_ADDR_LEN); + eaddr[0] &= ~1; /* clear multicast bit */ + eaddr[0] |= 2; /* set the LAA bit */ + retry = 0; + lck_mtx_lock(bridge_list_mtx); + LIST_FOREACH(sc2, &bridge_list, sc_list) { + bifp = sc2->sc_ifp; + if (memcmp(eaddr, ifnet_lladdr(bifp), ETHER_ADDR_LEN) == 0) + retry = 1; + } + lck_mtx_unlock(bridge_list_mtx); + } + } +#else + /* + * Generate a random ethernet address and use the private AC:DE:48 + * OUI code. 
+ */ + { + uint32_t r; + + read_random(&r, sizeof(r)); + eaddr[0] = 0xAC; + eaddr[1] = 0xDE; + eaddr[2] = 0x48; + eaddr[3] = (r >> 0) & 0xffu; + eaddr[4] = (r >> 8) & 0xffu; + eaddr[5] = (r >> 16) & 0xffu; + } +#endif + + memset(sdl, 0, sizeof(sdl_buffer)); + sdl->sdl_family = AF_LINK; + sdl->sdl_nlen = strlen(sc->sc_if_xname); + sdl->sdl_alen = ETHER_ADDR_LEN; + sdl->sdl_len = offsetof(struct sockaddr_dl, sdl_data); + memcpy(sdl->sdl_data, sc->sc_if_xname, sdl->sdl_nlen); + memcpy(LLADDR(sdl), eaddr, ETHER_ADDR_LEN); + +#if BRIDGE_DEBUG + link_print(sdl); +#endif + + error = ifnet_attach(ifp, NULL); + if (error != 0) { + printf("bridge_clone_create: ifnet_attach failed %d\n", error); + goto done; + } + + error = ifnet_set_lladdr_and_type(ifp, eaddr, ETHER_ADDR_LEN, IFT_ETHER); + if (error != 0) { + printf("bridge_clone_create: ifnet_set_lladdr_and_type failed %d\n", error); + goto done; + } + +#if APPLE_BRIDGE_HWCKSUM_SUPPORT + /* + * APPLE MODIFICATION - our bridge can support HW checksums + * (useful if underlying interfaces support them) on TX, + * RX is not that interesting, since the stack just looks to + * see if the packet has been checksummed already (I think) + * but we might as well indicate we support it + */ + ifp->if_capabilities = + IFCAP_CSUM_IPv4_Tx | IFCAP_CSUM_TCPv4_Tx | IFCAP_CSUM_UDPv4_Tx | + IFCAP_CSUM_IPv4_Rx | IFCAP_CSUM_TCPv4_Rx | IFCAP_CSUM_UDPv4_Rx ; +#endif + + bstp_attach(&sc->sc_stp, &bridge_ops); + + lck_mtx_lock(bridge_list_mtx); + LIST_INSERT_HEAD(&bridge_list, sc, sc_list); + lck_mtx_unlock(bridge_list_mtx); + + /* attach as ethernet */ + error = bpf_attach(ifp, DLT_EN10MB, sizeof(struct ether_header), NULL, NULL); + +done: + if (error != 0) { + printf("bridge_clone_create failed error %d\n", error); + /* Cleanup TBD */ + } + + return error; +} + +/* + * bridge_clone_destroy: + * + * Destroy a bridge instance. 
+ */ +static int +bridge_clone_destroy(struct ifnet *ifp) +{ + struct bridge_softc *sc = ifp->if_softc; + struct bridge_iflist *bif; + errno_t error; + + BRIDGE_LOCK(sc); + if ((sc->sc_flags & SCF_DETACHING)) { + BRIDGE_UNLOCK(sc); + return 0; + } + sc->sc_flags |= SCF_DETACHING; + + bridge_stop(ifp, 1); + + error = ifnet_set_flags(ifp, 0, IFF_UP); + if (error != 0) { + printf("bridge_clone_destroy: ifnet_set_flags failed %d\n", error); + } + + while ((bif = TAILQ_FIRST(&sc->sc_iflist)) != NULL) + bridge_delete_member(sc, bif, 0); + + while ((bif = TAILQ_FIRST(&sc->sc_spanlist)) != NULL) { + bridge_delete_span(sc, bif); + } + + BRIDGE_UNLOCK(sc); + + error = ifnet_detach(ifp); + if (error != 0) { + panic("bridge_clone_destroy: ifnet_detach(%p) failed %d\n", ifp, error); + if ((sc = (struct bridge_softc *)ifnet_softc(ifp)) != NULL) { + BRIDGE_LOCK(sc); + sc->sc_flags &= ~SCF_DETACHING; + BRIDGE_UNLOCK(sc); + } + return 0; + } + + return 0; +} + +#define DRVSPEC do { \ + if (ifd->ifd_cmd >= bridge_control_table_size) { \ + error = EINVAL; \ + break; \ + } \ + bc = &bridge_control_table[ifd->ifd_cmd]; \ + \ + if (cmd == SIOCGDRVSPEC && \ + (bc->bc_flags & BC_F_COPYOUT) == 0) { \ + error = EINVAL; \ + break; \ + } \ + else if (cmd == SIOCSDRVSPEC && \ + (bc->bc_flags & BC_F_COPYOUT) != 0) { \ + error = EINVAL; \ + break; \ + } \ + \ + if (bc->bc_flags & BC_F_SUSER) { \ + error = kauth_authorize_generic(kauth_cred_get(), KAUTH_GENERIC_ISSUSER); \ + if (error) \ + break; \ + } \ + \ + if (ifd->ifd_len != bc->bc_argsize || \ + ifd->ifd_len > sizeof(args)) { \ + error = EINVAL; \ + break; \ + } \ + \ + bzero(&args, sizeof(args)); \ + if (bc->bc_flags & BC_F_COPYIN) { \ + error = copyin(ifd->ifd_data, &args, ifd->ifd_len); \ + if (error) \ + break; \ + } \ + \ + BRIDGE_LOCK(sc); \ + error = (*bc->bc_func)(sc, &args); \ + BRIDGE_UNLOCK(sc); \ + if (error) \ + break; \ + \ + if (bc->bc_flags & BC_F_COPYOUT) \ + error = copyout(&args, ifd->ifd_data, ifd->ifd_len); \ +} while 
(0) + + +/* + * bridge_ioctl: + * + * Handle a control request from the operator. + */ +static errno_t +bridge_ioctl(struct ifnet *ifp, u_long cmd, void *data) +{ + struct bridge_softc *sc = ifp->if_softc; + struct ifreq *ifr = (struct ifreq *) data; + int error = 0; + + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_NOTOWNED); + +#if BRIDGE_DEBUG + if (_if_brige_debug) + printf("bridge_ioctl: ifp %p cmd 0x%08lx (%c%c [%lu] %c %lu)\n", + ifp, + cmd, + (cmd & IOC_IN) ? 'I' : ' ', + (cmd & IOC_OUT) ? 'O' : ' ', + IOCPARM_LEN(cmd), + (char)IOCGROUP(cmd), + cmd & 0xff); +#endif + + switch (cmd) { + + case SIOCSIFADDR: + case SIOCAIFADDR: + ifnet_set_flags(ifp, IFF_UP, IFF_UP); + break; + + case SIOCGIFMEDIA32: + case SIOCGIFMEDIA64: + error = EINVAL; + break; + + case SIOCADDMULTI: + case SIOCDELMULTI: + break; + + case SIOCSDRVSPEC32: + case SIOCGDRVSPEC32: { + union { + struct ifbreq ifbreq; + struct ifbifconf32 ifbifconf; + struct ifbareq32 ifbareq; + struct ifbaconf32 ifbaconf; + struct ifbrparam ifbrparam; + struct ifbropreq32 ifbropreq; + } args; + struct ifdrv32 *ifd = (struct ifdrv32 *) data; + const struct bridge_control *bridge_control_table = bridge_control_table32, *bc; + + DRVSPEC; + + break; + } + case SIOCSDRVSPEC64: + case SIOCGDRVSPEC64: { + union { + struct ifbreq ifbreq; + struct ifbifconf64 ifbifconf; + struct ifbareq64 ifbareq; + struct ifbaconf64 ifbaconf; + struct ifbrparam ifbrparam; + struct ifbropreq64 ifbropreq; + } args; + struct ifdrv64 *ifd = (struct ifdrv64 *) data; + const struct bridge_control *bridge_control_table = bridge_control_table64, *bc; + + DRVSPEC; + + break; + } + + case SIOCSIFFLAGS: + if (!(ifp->if_flags & IFF_UP) && + (ifp->if_flags & IFF_RUNNING)) { + /* + * If interface is marked down and it is running, + * then stop and disable it. 
+ */ + BRIDGE_LOCK(sc); + bridge_stop(ifp, 1); + BRIDGE_UNLOCK(sc); + } else if ((ifp->if_flags & IFF_UP) && + !(ifp->if_flags & IFF_RUNNING)) { + /* + * If interface is marked up and it is stopped, then + * start it. + */ + BRIDGE_LOCK(sc); + error = bridge_init(ifp); + BRIDGE_UNLOCK(sc); + } + break; + + case SIOCSIFLLADDR: + error = ifnet_set_lladdr(ifp, ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len); + if (error != 0) + printf("bridge_ioctl: ifnet_set_lladdr failed %d\n", error); + break; + + case SIOCSIFMTU: + /* Do not allow the MTU to be changed on the bridge */ + error = EINVAL; + break; + + default: + /* + * drop the lock as ether_ioctl() will call bridge_start() and + * cause the lock to be recursed. + */ + error = ether_ioctl(ifp, cmd, data); +#if BRIDGE_DEBUG + if (error != 0) + printf("bridge_ioctl: ether_ioctl ifp %p cmd 0x%08lx (%c%c [%lu] %c %lu) failed error: %d\n", + ifp, + cmd, + (cmd & IOC_IN) ? 'I' : ' ', + (cmd & IOC_OUT) ? 'O' : ' ', + IOCPARM_LEN(cmd), + (char) IOCGROUP(cmd), + cmd & 0xff, + error); +#endif /* BRIDGE_DEBUG */ + break; + } + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_NOTOWNED); + + return (error); +} + +#if HAS_IF_CAP +/* + * bridge_mutecaps: + * + * Clear or restore unwanted capabilities on the member interface + */ +static void +bridge_mutecaps(struct bridge_softc *sc) +{ + struct bridge_iflist *bif; + int enabled, mask; + + /* Initial bitmask of capabilities to test */ + mask = BRIDGE_IFCAPS_MASK; + + TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) { + /* Every member must support it or its disabled */ + mask &= bif->bif_savedcaps; + } + + TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) { + enabled = bif->bif_ifp->if_capenable; + enabled &= ~BRIDGE_IFCAPS_STRIP; + /* strip off mask bits and enable them again if allowed */ + enabled &= ~BRIDGE_IFCAPS_MASK; + enabled |= mask; + + bridge_set_ifcap(sc, bif, enabled); + } + +} + +static void +bridge_set_ifcap(struct bridge_softc *sc, struct bridge_iflist *bif, int set) +{ + struct ifnet 
*ifp = bif->bif_ifp; + struct ifreq ifr; + int error; + + bzero(&ifr, sizeof(ifr)); + ifr.ifr_reqcap = set; + + if (ifp->if_capenable != set) { + IFF_LOCKGIANT(ifp); + error = (*ifp->if_ioctl)(ifp, SIOCSIFCAP, (caddr_t)&ifr); + IFF_UNLOCKGIANT(ifp); + if (error) + printf("error setting interface capabilities on %s\n", + ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp), + ifp->if_xname); + } +} +#endif /* HAS_IF_CAP */ + +/* + * bridge_lookup_member: + * + * Lookup a bridge member interface. + */ +static struct bridge_iflist * +bridge_lookup_member(struct bridge_softc *sc, const char *name) +{ + struct bridge_iflist *bif; + struct ifnet *ifp; + char if_xname[IFNAMSIZ]; + + BRIDGE_LOCK_ASSERT(sc); + + TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) { + ifp = bif->bif_ifp; + snprintf(if_xname, sizeof(if_xname), "%s%d", + ifnet_name(ifp), ifnet_unit(ifp)); + if (strncmp(if_xname, name, sizeof(if_xname)) == 0) + return (bif); + } + + return (NULL); +} + +/* + * bridge_lookup_member_if: + * + * Lookup a bridge member interface by ifnet*. 
+ */ +static struct bridge_iflist * +bridge_lookup_member_if(struct bridge_softc *sc, struct ifnet *member_ifp) +{ + struct bridge_iflist *bif; + + BRIDGE_LOCK_ASSERT(sc); + + TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) { + if (bif->bif_ifp == member_ifp) + return (bif); + } + + return (NULL); +} + +static errno_t +bridge_iff_input(void* cookie, ifnet_t ifp, __unused protocol_family_t protocol, + mbuf_t *data, char **frame_ptr) +{ + errno_t error = 0; + struct bridge_iflist *bif = (struct bridge_iflist *)cookie; + struct bridge_softc *sc = bif->bif_sc; + int included = 0; + size_t frmlen = 0; + mbuf_t m = *data; + + if ((m->m_flags & M_PROTO1)) + goto out; + + if (*frame_ptr >= (char *)mbuf_datastart(m) && *frame_ptr <= (char *)mbuf_data(m)) { + included = 1; + frmlen = (char *)mbuf_data(m) - *frame_ptr; + } +#if BRIDGE_DEBUG + if (_if_brige_debug) { + printf("bridge_iff_input %s%d from %s%d m %p data %p frame %p %s frmlen %lu\n", + ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp), + ifnet_name(ifp), ifnet_unit(ifp), + m, mbuf_data(m), *frame_ptr, included ? 
"inside" : "outside", frmlen); + + if (_if_brige_debug > 1) { + printf_mbuf(m, "bridge_iff_input[", "\n"); + printf_ether_header((struct ether_header *)*frame_ptr); + printf_mbuf_data(m, 0, 20); + printf("\n"); + } + } +#endif /* BRIDGE_DEBUG */ + + /* Move data pointer to start of frame to the link layer header */ + if (included) { + (void) mbuf_setdata(m, (char *)mbuf_data(m) - frmlen, mbuf_len(m) + frmlen); + (void) mbuf_pkthdr_adjustlen(m, frmlen); + } else { + printf("bridge_iff_input: frame_ptr outside mbuf\n"); + goto out; + } + + error = bridge_input(ifp, m, *frame_ptr); + + /* Adjust packet back to original */ + if (error == 0) { + (void) mbuf_setdata(m, (char *)mbuf_data(m) + frmlen, mbuf_len(m) - frmlen); + (void) mbuf_pkthdr_adjustlen(m, -frmlen); + } +#if BRIDGE_DEBUG + if (_if_brige_debug > 1) { + printf("\n"); + printf_mbuf(m, "bridge_iff_input]", "\n"); + } +#endif /* BRIDGE_DEBUG */ + +out: + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_NOTOWNED); + + return error; +} + + +#if BRIDGE_MEMBER_OUT_FILTER +static errno_t +bridge_iff_output(void *cookie, ifnet_t ifp, __unused protocol_family_t protocol, mbuf_t *data) +{ + errno_t error = 0; + struct bridge_iflist *bif = (struct bridge_iflist *)cookie; + struct bridge_softc *sc = bif->bif_sc; + mbuf_t m = *data; + + if ((m->m_flags & M_PROTO1)) + goto out; + +#if BRIDGE_DEBUG + if (_if_brige_debug) { + printf("bridge_iff_output %s%d from %s%d m %p data %p\n", + ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp), + ifnet_name(ifp), ifnet_unit(ifp), + m, mbuf_data(m)); + } +#endif /* BRIDGE_DEBUG */ + + error = bridge_output(sc, ifp, m); + if (error != 0) { + printf("bridge_iff_output: bridge_output failed error %d\n", error); + } + +out: + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_NOTOWNED); + + return error; +} +#endif /* BRIDGE_MEMBER_OUT_FILTER */ + + +static void +bridge_iff_event(void* cookie, ifnet_t ifp, __unused protocol_family_t protocol, + const struct kev_msg *event_msg) +{ + struct bridge_iflist *bif 
= (struct bridge_iflist *)cookie; + + if (event_msg->vendor_code == KEV_VENDOR_APPLE && + event_msg->kev_class == KEV_NETWORK_CLASS && + event_msg->kev_subclass == KEV_DL_SUBCLASS) { + switch (event_msg->event_code) { + case KEV_DL_IF_DETACHING: + case KEV_DL_IF_DETACHED: + bridge_ifdetach(bif, ifp); + break; + + case KEV_DL_LINK_OFF: + case KEV_DL_LINK_ON: { + bstp_linkstate(ifp, event_msg->event_code); + break; + } + + case KEV_DL_SIFFLAGS: { + if (bif->bif_promisc == 0 && (ifp->if_flags & IFF_UP)) { + errno_t error = ifnet_set_promiscuous(ifp, 1); + if (error != 0) { + printf("bridge_iff_event: ifnet_set_promiscuous(%s%d) failed %d\n", + ifnet_name(ifp), ifnet_unit(ifp), error); + } else { + bif->bif_promisc = 1; + } + } + break; + } + + default: + break; + } + } +} + +/* + * bridge_iff_detached: + * + * Detach an interface from a bridge. Called when a member + * interface is detaching. + */ +static void +bridge_iff_detached(void* cookie, __unused ifnet_t ifp) +{ + struct bridge_iflist *bif = (struct bridge_iflist *)cookie; + +#if BRIDGE_DEBUG + printf("bridge_iff_detached: %s%d\n", + ifnet_name(ifp), ifnet_unit(ifp)); +#endif + + bridge_ifdetach(bif, ifp); + + _FREE(bif, M_DEVBUF); + + return; +} + +static errno_t +bridge_proto_input(ifnet_t ifp, __unused protocol_family_t protocol, + __unused mbuf_t packet, __unused char *header) +{ + printf("bridge_proto_input: unexpected packet from %s%d\n", + ifnet_name(ifp), ifnet_unit(ifp)); + return 0; +} + +static int +bridge_attach_protocol(struct ifnet *ifp) +{ + int error; + struct ifnet_attach_proto_param reg; + + printf("bridge_attach_protocol: %s%d\n", + ifnet_name(ifp), ifnet_unit(ifp)); + + bzero(®, sizeof(reg)); + reg.input = bridge_proto_input; + + error = ifnet_attach_protocol(ifp, PF_BRIDGE, ®); + if (error) + printf("bridge_attach_protocol: ifnet_attach_protocol(%s%d) failed, %d\n", + ifnet_name(ifp), ifnet_unit(ifp), error); + + return (error); +} + +static int +bridge_detach_protocol(struct ifnet *ifp) +{ 
+ int error; + + printf("bridge_detach_protocol: %s%d\n", + ifnet_name(ifp), ifnet_unit(ifp)); + + error = ifnet_detach_protocol(ifp, PF_BRIDGE); + if (error) + printf("bridge_attach_protocol: ifnet_detach_protocol(%s%d) failed, %d\n", + ifnet_name(ifp), ifnet_unit(ifp), error); + + return (error); +} + +/* + * bridge_delete_member: + * + * Delete the specified member interface. + */ +static void +bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif, + int gone) +{ + struct ifnet *ifs = bif->bif_ifp; + + BRIDGE_LOCK_ASSERT(sc); + + if (!gone) { + switch (ifs->if_type) { + case IFT_ETHER: + case IFT_L2VLAN: + /* + * Take the interface out of promiscuous mode. + */ + if (bif->bif_promisc) + (void) ifnet_set_promiscuous(ifs, 0); + break; + + case IFT_GIF: + break; + + default: +#ifdef DIAGNOSTIC + panic("bridge_delete_member: impossible"); +#endif + break; + } + +#if HAS_IF_CAP + /* reneable any interface capabilities */ + bridge_set_ifcap(sc, bif, bif->bif_savedcaps); +#endif + } + + if (bif->bif_proto_attached) { + /* Respect lock ordering with DLIL lock */ + BRIDGE_UNLOCK(sc); + (void) bridge_detach_protocol(ifs); + BRIDGE_LOCK(sc); + } + if (bif->bif_flags & IFBIF_STP) + bstp_disable(&bif->bif_stp); + + ifs->if_bridge = NULL; + BRIDGE_XLOCK(sc); + TAILQ_REMOVE(&sc->sc_iflist, bif, bif_next); + BRIDGE_XDROP(sc); + + ifnet_release(ifs); + +#if HAS_IF_CAP + bridge_mutecaps(sc); /* recalcuate now this interface is removed */ +#endif /* HAS_IF_CAP */ + bridge_rtdelete(sc, ifs, IFBF_FLUSHALL); + KASSERT(bif->bif_addrcnt == 0, + ("%s: %d bridge routes referenced", __func__, bif->bif_addrcnt)); + + BRIDGE_UNLOCK(sc); + bstp_destroy(&bif->bif_stp); /* prepare to free */ + BRIDGE_LOCK(sc); + + if (bif->bif_filter_attached) { + /* Respect lock ordering with DLIL lock */ + BRIDGE_UNLOCK(sc); + iflt_detach(bif->bif_iff_ref); + BRIDGE_LOCK(sc); + } else { + _FREE(bif, M_DEVBUF); + } +} + +/* + * bridge_delete_span: + * + * Delete the specified span interface. 
+ */ +static void +bridge_delete_span(struct bridge_softc *sc, struct bridge_iflist *bif) +{ + BRIDGE_LOCK_ASSERT(sc); + + KASSERT(bif->bif_ifp->if_bridge == NULL, + ("%s: not a span interface", __func__)); + + ifnet_release(bif->bif_ifp); + + TAILQ_REMOVE(&sc->sc_spanlist, bif, bif_next); + _FREE(bif, M_DEVBUF); +} + +static int +bridge_ioctl_add(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif = NULL; + struct ifnet *ifs; + int error = 0; + struct iff_filter iff; + + ifs = ifunit(req->ifbr_ifsname); + if (ifs == NULL) + return (ENOENT); + if (ifs->if_ioctl == NULL) /* must be supported */ + return (EINVAL); + + /* If it's in the span list, it can't be a member. */ + TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next) + if (ifs == bif->bif_ifp) + return (EBUSY); + + /* Allow the first Ethernet member to define the MTU */ + if (ifs->if_type != IFT_GIF) { + if (TAILQ_EMPTY(&sc->sc_iflist)) + sc->sc_ifp->if_mtu = ifs->if_mtu; + else if (sc->sc_ifp->if_mtu != ifs->if_mtu) { + printf("%s%d: invalid MTU for %s%d", + ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp), + ifnet_name(ifs), ifnet_unit(ifs)); + return (EINVAL); + } + } + + if (ifs->if_bridge == sc) + return (EEXIST); + + if (ifs->if_bridge != NULL) + return (EBUSY); + + bif = _MALLOC(sizeof(*bif), M_DEVBUF, M_NOWAIT|M_ZERO); + if (bif == NULL) + return (ENOMEM); + + bif->bif_ifp = ifs; + bif->bif_flags = IFBIF_LEARNING | IFBIF_DISCOVER; +#if HAS_IF_CAP + bif->bif_savedcaps = ifs->if_capenable; +#endif /* HAS_IF_CAP */ + bif->bif_sc = sc; + + ifnet_reference(ifs); + + ifs->if_bridge = sc; + bstp_create(&sc->sc_stp, &bif->bif_stp, bif->bif_ifp); + /* + * XXX: XLOCK HERE!?! 
+ */ + TAILQ_INSERT_TAIL(&sc->sc_iflist, bif, bif_next); + +#if HAS_IF_CAP + /* Set interface capabilities to the intersection set of all members */ + bridge_mutecaps(sc); +#endif /* HAS_IF_CAP */ + + + switch (ifs->if_type) { + case IFT_ETHER: + case IFT_L2VLAN: + /* + * Place the interface into promiscuous mode. + */ + error = ifnet_set_promiscuous(ifs, 1); + if (error) { + /* Ignore error when device is not up */ + if (error != ENETDOWN) + goto out; + error = 0; + } else { + bif->bif_promisc = 1; + } + break; + + case IFT_GIF: + break; + + default: + error = EINVAL; + goto out; + } + + /* + * Respect lock ordering with DLIL lock for the following operations + */ + BRIDGE_UNLOCK(sc); + + /* + * install an interface filter + */ + memset(&iff, 0, sizeof(struct iff_filter)); + iff.iff_cookie = bif; + iff.iff_name = "com.apple.kernel.bsd.net.if_bridge"; + iff.iff_input = bridge_iff_input; +#if BRIDGE_MEMBER_OUT_FILTER + iff.iff_output = bridge_iff_output; +#endif /* BRIDGE_MEMBER_OUT_FILTER */ + iff.iff_event = bridge_iff_event; + iff.iff_detached = bridge_iff_detached; + error = iflt_attach(ifs, &iff, &bif->bif_iff_ref); + if (error != 0) { + printf("bridge_ioctl_add: iflt_attach failed %d\n", error); + BRIDGE_LOCK(sc); + goto out; + } + bif->bif_filter_attached = 1; + + /* + * install an dummy "bridge" protocol + */ + if ((error = bridge_attach_protocol(ifs)) != 0) { + if (error != 0) { + printf("bridge_ioctl_add: bridge_attach_protocol failed %d\n", error); + BRIDGE_LOCK(sc); + goto out; + } + } + bif->bif_proto_attached = 1; + + BRIDGE_LOCK(sc); + +out: + if (error && bif != NULL) + bridge_delete_member(sc, bif, 1); + + return (error); +} + +static int +bridge_ioctl_del(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif; + + bif = bridge_lookup_member(sc, req->ifbr_ifsname); + if (bif == NULL) + return (ENOENT); + + bridge_delete_member(sc, bif, 0); + + return (0); +} + +static int +bridge_ioctl_purge(__unused struct 
bridge_softc *sc, __unused void *arg) +{ + return (0); +} + +static int +bridge_ioctl_gifflags(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif; + struct bstp_port *bp; + + bif = bridge_lookup_member(sc, req->ifbr_ifsname); + if (bif == NULL) + return (ENOENT); + + bp = &bif->bif_stp; + req->ifbr_ifsflags = bif->bif_flags; + req->ifbr_state = bp->bp_state; + req->ifbr_priority = bp->bp_priority; + req->ifbr_path_cost = bp->bp_path_cost; + req->ifbr_portno = bif->bif_ifp->if_index & 0xfff; + req->ifbr_proto = bp->bp_protover; + req->ifbr_role = bp->bp_role; + req->ifbr_stpflags = bp->bp_flags; + req->ifbr_addrcnt = bif->bif_addrcnt; + req->ifbr_addrmax = bif->bif_addrmax; + req->ifbr_addrexceeded = bif->bif_addrexceeded; + + /* Copy STP state options as flags */ + if (bp->bp_operedge) + req->ifbr_ifsflags |= IFBIF_BSTP_EDGE; + if (bp->bp_flags & BSTP_PORT_AUTOEDGE) + req->ifbr_ifsflags |= IFBIF_BSTP_AUTOEDGE; + if (bp->bp_ptp_link) + req->ifbr_ifsflags |= IFBIF_BSTP_PTP; + if (bp->bp_flags & BSTP_PORT_AUTOPTP) + req->ifbr_ifsflags |= IFBIF_BSTP_AUTOPTP; + if (bp->bp_flags & BSTP_PORT_ADMEDGE) + req->ifbr_ifsflags |= IFBIF_BSTP_ADMEDGE; + if (bp->bp_flags & BSTP_PORT_ADMCOST) + req->ifbr_ifsflags |= IFBIF_BSTP_ADMCOST; + return (0); +} + +static int +bridge_ioctl_sifflags(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif; + struct bstp_port *bp; + int error; + + bif = bridge_lookup_member(sc, req->ifbr_ifsname); + if (bif == NULL) + return (ENOENT); + bp = &bif->bif_stp; + + if (req->ifbr_ifsflags & IFBIF_SPAN) + /* SPAN is readonly */ + return (EINVAL); + + + if (req->ifbr_ifsflags & IFBIF_STP) { + if ((bif->bif_flags & IFBIF_STP) == 0) { + error = bstp_enable(&bif->bif_stp); + if (error) + return (error); + } + } else { + if ((bif->bif_flags & IFBIF_STP) != 0) + bstp_disable(&bif->bif_stp); + } + + /* Pass on STP flags */ + bstp_set_edge(bp, req->ifbr_ifsflags & 
IFBIF_BSTP_EDGE ? 1 : 0); + bstp_set_autoedge(bp, req->ifbr_ifsflags & IFBIF_BSTP_AUTOEDGE ? 1 : 0); + bstp_set_ptp(bp, req->ifbr_ifsflags & IFBIF_BSTP_PTP ? 1 : 0); + bstp_set_autoptp(bp, req->ifbr_ifsflags & IFBIF_BSTP_AUTOPTP ? 1 : 0); + + /* Save the bits relating to the bridge */ + bif->bif_flags = req->ifbr_ifsflags & IFBIFMASK; + + + return (0); +} + +static int +bridge_ioctl_scache(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + sc->sc_brtmax = param->ifbrp_csize; + bridge_rttrim(sc); + + return (0); +} + +static int +bridge_ioctl_gcache(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + param->ifbrp_csize = sc->sc_brtmax; + + return (0); +} + + +#define BRIDGE_IOCTL_GIFS do { \ + struct bridge_iflist *bif; \ + struct ifbreq breq; \ + char *buf, *outbuf; \ + unsigned int count, buflen, len; \ + \ + count = 0; \ + TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) \ + count++; \ + TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next) \ + count++; \ + \ + buflen = sizeof(breq) * count; \ + if (bifc->ifbic_len == 0) { \ + bifc->ifbic_len = buflen; \ + return (0); \ + } \ + BRIDGE_UNLOCK(sc); \ + outbuf = _MALLOC(buflen, M_TEMP, M_WAITOK | M_ZERO); \ + BRIDGE_LOCK(sc); \ + \ + count = 0; \ + buf = outbuf; \ + len = min(bifc->ifbic_len, buflen); \ + bzero(&breq, sizeof(breq)); \ + TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) { \ + if (len < sizeof(breq)) \ + break; \ + \ + snprintf(breq.ifbr_ifsname, sizeof(breq.ifbr_ifsname), "%s%d", \ + ifnet_name(bif->bif_ifp), ifnet_unit(bif->bif_ifp)); \ + /* Fill in the ifbreq structure */ \ + error = bridge_ioctl_gifflags(sc, &breq); \ + if (error) \ + break; \ + memcpy(buf, &breq, sizeof(breq)); \ + count++; \ + buf += sizeof(breq); \ + len -= sizeof(breq); \ + } \ + TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next) { \ + if (len < sizeof(breq)) \ + break; \ + \ + snprintf(breq.ifbr_ifsname, sizeof(breq.ifbr_ifsname), "%s%d", \ + ifnet_name(bif->bif_ifp), ifnet_unit(bif->bif_ifp)); \ 
+ breq.ifbr_ifsflags = bif->bif_flags; \ + breq.ifbr_portno = bif->bif_ifp->if_index & 0xfff; \ + memcpy(buf, &breq, sizeof(breq)); \ + count++; \ + buf += sizeof(breq); \ + len -= sizeof(breq); \ + } \ + \ + BRIDGE_UNLOCK(sc); \ + bifc->ifbic_len = sizeof(breq) * count; \ + error = copyout(outbuf, bifc->ifbic_req, bifc->ifbic_len); \ + BRIDGE_LOCK(sc); \ + _FREE(outbuf, M_TEMP); \ +} while (0) + +static int +bridge_ioctl_gifs64(struct bridge_softc *sc, void *arg) +{ + struct ifbifconf64 *bifc = arg; + int error = 0; + + BRIDGE_IOCTL_GIFS; + + return (error); +} + +static int +bridge_ioctl_gifs32(struct bridge_softc *sc, void *arg) +{ + struct ifbifconf32 *bifc = arg; + int error = 0; + + BRIDGE_IOCTL_GIFS; + + return (error); +} + + +#define BRIDGE_IOCTL_RTS do { \ + struct bridge_rtnode *brt; \ + char *buf, *outbuf; \ + unsigned int count, buflen, len; \ + struct timespec now; \ + \ + if (bac->ifbac_len == 0) \ + return (0); \ + \ + count = 0; \ + LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) \ + count++; \ + buflen = sizeof(bareq) * count; \ + \ + BRIDGE_UNLOCK(sc); \ + outbuf = _MALLOC(buflen, M_TEMP, M_WAITOK | M_ZERO); \ + BRIDGE_LOCK(sc); \ + \ + count = 0; \ + buf = outbuf; \ + len = min(bac->ifbac_len, buflen); \ + bzero(&bareq, sizeof(bareq)); \ + LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) { \ + if (len < sizeof(bareq)) \ + goto out; \ + snprintf(bareq.ifba_ifsname, sizeof(bareq.ifba_ifsname), "%s%d", \ + ifnet_name(brt->brt_ifp), ifnet_unit(brt->brt_ifp)); \ + memcpy(bareq.ifba_dst, brt->brt_addr, sizeof(brt->brt_addr)); \ + bareq.ifba_vlan = brt->brt_vlan; \ + if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) { \ + nanouptime(&now); \ + if ((unsigned long)now.tv_sec < brt->brt_expire) \ + bareq.ifba_expire = brt->brt_expire - now.tv_sec; \ + } else \ + bareq.ifba_expire = 0; \ + bareq.ifba_flags = brt->brt_flags; \ + \ + memcpy(buf, &bareq, sizeof(bareq)); \ + count++; \ + buf += sizeof(bareq); \ + len -= sizeof(bareq); \ + } \ +out: \ + 
BRIDGE_UNLOCK(sc); \ + bac->ifbac_len = sizeof(bareq) * count; \ + error = copyout(outbuf, bac->ifbac_req, bac->ifbac_len); \ + BRIDGE_LOCK(sc); \ + _FREE(outbuf, M_TEMP); \ + return (error); \ +} while (0) + +static int +bridge_ioctl_rts64(struct bridge_softc *sc, void *arg) +{ + struct ifbaconf64 *bac = arg; + struct ifbareq64 bareq; + int error = 0; + + BRIDGE_IOCTL_RTS; + + return (error); +} + +static int +bridge_ioctl_rts32(struct bridge_softc *sc, void *arg) +{ + struct ifbaconf32 *bac = arg; + struct ifbareq32 bareq; + int error = 0; + + BRIDGE_IOCTL_RTS; + + return (error); +} + +static int +bridge_ioctl_saddr32(struct bridge_softc *sc, void *arg) +{ + struct ifbareq32 *req = arg; + struct bridge_iflist *bif; + int error; + + bif = bridge_lookup_member(sc, req->ifba_ifsname); + if (bif == NULL) + return (ENOENT); + + error = bridge_rtupdate(sc, req->ifba_dst, req->ifba_vlan, bif, 1, + req->ifba_flags); + + return (error); +} + +static int +bridge_ioctl_saddr64(struct bridge_softc *sc, void *arg) +{ + struct ifbareq64 *req = arg; + struct bridge_iflist *bif; + int error; + + bif = bridge_lookup_member(sc, req->ifba_ifsname); + if (bif == NULL) + return (ENOENT); + + error = bridge_rtupdate(sc, req->ifba_dst, req->ifba_vlan, bif, 1, + req->ifba_flags); + + return (error); +} + +static int +bridge_ioctl_sto(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + sc->sc_brttimeout = param->ifbrp_ctime; + return (0); +} + +static int +bridge_ioctl_gto(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + param->ifbrp_ctime = sc->sc_brttimeout; + return (0); +} + +static int +bridge_ioctl_daddr32(struct bridge_softc *sc, void *arg) +{ + struct ifbareq32 *req = arg; + + return (bridge_rtdaddr(sc, req->ifba_dst, req->ifba_vlan)); +} + +static int +bridge_ioctl_daddr64(struct bridge_softc *sc, void *arg) +{ + struct ifbareq64 *req = arg; + + return (bridge_rtdaddr(sc, req->ifba_dst, req->ifba_vlan)); +} + +static int 
+bridge_ioctl_flush(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + + bridge_rtflush(sc, req->ifbr_ifsflags); + return (0); +} + +static int +bridge_ioctl_gpri(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + struct bstp_state *bs = &sc->sc_stp; + + param->ifbrp_prio = bs->bs_bridge_priority; + return (0); +} + +static int +bridge_ioctl_spri(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + return (bstp_set_priority(&sc->sc_stp, param->ifbrp_prio)); +} + +static int +bridge_ioctl_ght(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + struct bstp_state *bs = &sc->sc_stp; + + param->ifbrp_hellotime = bs->bs_bridge_htime >> 8; + return (0); +} + +static int +bridge_ioctl_sht(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + return (bstp_set_htime(&sc->sc_stp, param->ifbrp_hellotime)); +} + +static int +bridge_ioctl_gfd(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + struct bstp_state *bs = &sc->sc_stp; + + param->ifbrp_fwddelay = bs->bs_bridge_fdelay >> 8; + return (0); +} + +static int +bridge_ioctl_sfd(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + return (bstp_set_fdelay(&sc->sc_stp, param->ifbrp_fwddelay)); +} + +static int +bridge_ioctl_gma(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + struct bstp_state *bs = &sc->sc_stp; + + param->ifbrp_maxage = bs->bs_bridge_max_age >> 8; + return (0); +} + +static int +bridge_ioctl_sma(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + return (bstp_set_maxage(&sc->sc_stp, param->ifbrp_maxage)); +} + +static int +bridge_ioctl_sifprio(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif; + + bif = bridge_lookup_member(sc, req->ifbr_ifsname); + if (bif == NULL) + return (ENOENT); + + return (bstp_set_port_priority(&bif->bif_stp, req->ifbr_priority)); +} + +static 
int +bridge_ioctl_sifcost(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif; + + bif = bridge_lookup_member(sc, req->ifbr_ifsname); + if (bif == NULL) + return (ENOENT); + + return (bstp_set_path_cost(&bif->bif_stp, req->ifbr_path_cost)); +} + +static int +bridge_ioctl_gfilt(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + param->ifbrp_filter = sc->sc_filter_flags; + + return (0); +} + +static int +bridge_ioctl_sfilt(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + if (param->ifbrp_filter & ~IFBF_FILT_MASK) + return (EINVAL); + +#ifndef BRIDGE_IPF + if (param->ifbrp_filter & IFBF_FILT_USEIPF) + return (EINVAL); +#endif + + sc->sc_filter_flags = param->ifbrp_filter; + + return (0); +} + +static int +bridge_ioctl_sifmaxaddr(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif; + + bif = bridge_lookup_member(sc, req->ifbr_ifsname); + if (bif == NULL) + return (ENOENT); + + bif->bif_addrmax = req->ifbr_addrmax; + return (0); +} + +static int +bridge_ioctl_addspan(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif = NULL; + struct ifnet *ifs; + + ifs = ifunit(req->ifbr_ifsname); + if (ifs == NULL) + return (ENOENT); + + TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next) + if (ifs == bif->bif_ifp) + return (EBUSY); + + if (ifs->if_bridge != NULL) + return (EBUSY); + + switch (ifs->if_type) { + case IFT_ETHER: + case IFT_GIF: + case IFT_L2VLAN: + break; + default: + return (EINVAL); + } + + bif = _MALLOC(sizeof(*bif), M_DEVBUF, M_NOWAIT|M_ZERO); + if (bif == NULL) + return (ENOMEM); + + bif->bif_ifp = ifs; + bif->bif_flags = IFBIF_SPAN; + + ifnet_reference(bif->bif_ifp); + + TAILQ_INSERT_HEAD(&sc->sc_spanlist, bif, bif_next); + + return (0); +} + +static int +bridge_ioctl_delspan(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif; + struct ifnet *ifs; 
+ + ifs = ifunit(req->ifbr_ifsname); + if (ifs == NULL) + return (ENOENT); + + TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next) + if (ifs == bif->bif_ifp) + break; + + if (bif == NULL) + return (ENOENT); + + bridge_delete_span(sc, bif); + + return (0); +} + +#define BRIDGE_IOCTL_GBPARAM do { \ + struct bstp_state *bs = &sc->sc_stp; \ + struct bstp_port *root_port; \ + \ + req->ifbop_maxage = bs->bs_bridge_max_age >> 8; \ + req->ifbop_hellotime = bs->bs_bridge_htime >> 8; \ + req->ifbop_fwddelay = bs->bs_bridge_fdelay >> 8; \ + \ + root_port = bs->bs_root_port; \ + if (root_port == NULL) \ + req->ifbop_root_port = 0; \ + else \ + req->ifbop_root_port = root_port->bp_ifp->if_index; \ + \ + req->ifbop_holdcount = bs->bs_txholdcount; \ + req->ifbop_priority = bs->bs_bridge_priority; \ + req->ifbop_protocol = bs->bs_protover; \ + req->ifbop_root_path_cost = bs->bs_root_pv.pv_cost; \ + req->ifbop_bridgeid = bs->bs_bridge_pv.pv_dbridge_id; \ + req->ifbop_designated_root = bs->bs_root_pv.pv_root_id; \ + req->ifbop_designated_bridge = bs->bs_root_pv.pv_dbridge_id; \ + req->ifbop_last_tc_time.tv_sec = bs->bs_last_tc_time.tv_sec; \ + req->ifbop_last_tc_time.tv_usec = bs->bs_last_tc_time.tv_usec; \ +} while (0) + +static int +bridge_ioctl_gbparam32(struct bridge_softc *sc, void *arg) +{ + struct ifbropreq32 *req = arg; + + BRIDGE_IOCTL_GBPARAM; + + return (0); +} + +static int +bridge_ioctl_gbparam64(struct bridge_softc *sc, void *arg) +{ + struct ifbropreq64 *req = arg; + + BRIDGE_IOCTL_GBPARAM; + + return (0); +} + + +static int +bridge_ioctl_grte(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + param->ifbrp_cexceeded = sc->sc_brtexceeded; + return (0); +} + +#define BRIDGE_IOCTL_GIFSSTP do { \ + struct bridge_iflist *bif; \ + struct bstp_port *bp; \ + struct ifbpstpreq bpreq; \ + char *buf, *outbuf; \ + unsigned int count, buflen, len; \ + \ + count = 0; \ + TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) { \ + if ((bif->bif_flags & IFBIF_STP) != 0) \ 
+ count++; \ + } \ + \ + buflen = sizeof(bpreq) * count; \ + if (bifstp->ifbpstp_len == 0) { \ + bifstp->ifbpstp_len = buflen; \ + return (0); \ + } \ + \ + BRIDGE_UNLOCK(sc); \ + outbuf = _MALLOC(buflen, M_TEMP, M_WAITOK | M_ZERO); \ + BRIDGE_LOCK(sc); \ + \ + count = 0; \ + buf = outbuf; \ + len = min(bifstp->ifbpstp_len, buflen); \ + bzero(&bpreq, sizeof(bpreq)); \ + TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) { \ + if (len < sizeof(bpreq)) \ + break; \ + \ + if ((bif->bif_flags & IFBIF_STP) == 0) \ + continue; \ + \ + bp = &bif->bif_stp; \ + bpreq.ifbp_portno = bif->bif_ifp->if_index & 0xfff; \ + bpreq.ifbp_fwd_trans = bp->bp_forward_transitions; \ + bpreq.ifbp_design_cost = bp->bp_desg_pv.pv_cost; \ + bpreq.ifbp_design_port = bp->bp_desg_pv.pv_port_id; \ + bpreq.ifbp_design_bridge = bp->bp_desg_pv.pv_dbridge_id; \ + bpreq.ifbp_design_root = bp->bp_desg_pv.pv_root_id; \ + \ + memcpy(buf, &bpreq, sizeof(bpreq)); \ + count++; \ + buf += sizeof(bpreq); \ + len -= sizeof(bpreq); \ + } \ + \ + BRIDGE_UNLOCK(sc); \ + bifstp->ifbpstp_len = sizeof(bpreq) * count; \ + error = copyout(outbuf, bifstp->ifbpstp_req, bifstp->ifbpstp_len); \ + BRIDGE_LOCK(sc); \ + _FREE(outbuf, M_TEMP); \ + return (error); \ +} while (0) + +static int +bridge_ioctl_gifsstp32(struct bridge_softc *sc, void *arg) +{ + struct ifbpstpconf32 *bifstp = arg; + int error = 0; + + BRIDGE_IOCTL_GIFSSTP; + + return (error); +} + +static int +bridge_ioctl_gifsstp64(struct bridge_softc *sc, void *arg) +{ + struct ifbpstpconf64 *bifstp = arg; + int error = 0; + + BRIDGE_IOCTL_GIFSSTP; + + return (error); +} + +static int +bridge_ioctl_sproto(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + return (bstp_set_protocol(&sc->sc_stp, param->ifbrp_proto)); +} + +static int +bridge_ioctl_stxhc(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + return (bstp_set_holdcount(&sc->sc_stp, param->ifbrp_txhc)); +} + +/* + * bridge_ifdetach: + * + * Detach an interface 
from a bridge. Called when a member
+ * interface is detaching.
+ *
+ * The incoming bif argument is not consulted; it is only reused as the
+ * cursor for the member lookup / span-list walk below.
+ */
+__private_extern__ void
+bridge_ifdetach(struct bridge_iflist *bif, struct ifnet *ifp)
+{
+	struct bridge_softc *sc = ifp->if_bridge;
+
+#if BRIDGE_DEBUG
+	printf("bridge_ifdetach %s%d\n", ifnet_name(ifp), ifnet_unit(ifp));
+#endif
+
+	if (sc == NULL) {
+		/*
+		 * Not attached as a member: visit every bridge (under the
+		 * bridge list mutex) and remove the first span-list entry
+		 * that refers to ifp.
+		 */
+		lck_mtx_lock(bridge_list_mtx);
+		LIST_FOREACH(sc, &bridge_list, sc_list) {
+			BRIDGE_LOCK(sc);
+			TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next) {
+				if (bif->bif_ifp != ifp)
+					continue;
+				bridge_delete_span(sc, bif);
+				break;
+			}
+			BRIDGE_UNLOCK(sc);
+		}
+		lck_mtx_unlock(bridge_list_mtx);
+		return;
+	}
+
+	/* ifp points at a bridge: delete its member entry if one exists. */
+	BRIDGE_LOCK(sc);
+	bif = bridge_lookup_member_if(sc, ifp);
+	if (bif != NULL) {
+		bridge_delete_member(sc, bif, 1);
+	}
+	BRIDGE_UNLOCK(sc);
+}
+
+/*
+ * bridge_init:
+ *
+ * Bring the bridge interface up: arm bridge_timer with the routing
+ * table prune period, mark the interface IFF_RUNNING and, if that
+ * succeeded, initialize spanning tree.  Does nothing when the
+ * interface is already running.  The bridge lock is asserted.
+ *
+ * Returns 0, or the error from ifnet_set_flags().
+ */
+static int
+bridge_init(struct ifnet *ifp)
+{
+	struct bridge_softc *sc = (struct bridge_softc *)ifp->if_softc;
+	struct timespec prune_delay;
+	errno_t error;
+
+	BRIDGE_LOCK_ASSERT(sc);
+
+	if ((ifnet_flags(ifp) & IFF_RUNNING) != 0) {
+		return 0;
+	}
+
+	prune_delay.tv_sec = bridge_rtable_prune_period;
+	prune_delay.tv_nsec = 0;
+	bsd_timeout(bridge_timer, sc, &prune_delay);
+
+	error = ifnet_set_flags(ifp, IFF_RUNNING, IFF_RUNNING);
+	if (error != 0) {
+		return error;
+	}
+
+	/* Initialize Spanning Tree */
+	bstp_init(&sc->sc_stp);
+
+	return 0;
+}
+
+/*
+ * bridge_stop:
+ *
+ * Take a running bridge interface down: cancel bridge_timer, stop
+ * spanning tree, flush the dynamic routing entries and clear
+ * IFF_RUNNING.  Does nothing when the interface is not running.
+ * The bridge lock is asserted.
+ */
+static void
+bridge_stop(struct ifnet *ifp, __unused int disable)
+{
+	struct bridge_softc *sc = ifp->if_softc;
+
+	BRIDGE_LOCK_ASSERT(sc);
+
+	if ((ifnet_flags(ifp) & IFF_RUNNING) != 0) {
+		bsd_untimeout(bridge_timer, sc);
+		bstp_stop(&sc->sc_stp);
+
+		bridge_rtflush(sc, IFBF_FLUSHDYN);
+
+		(void) ifnet_set_flags(ifp, 0, IFF_RUNNING);
+	}
+}
+
+/*
+ * bridge_enqueue:
+ *
+ * Enqueue a packet on a bridge member interface.
+ *
+ * Every packet on the m_nextpkt chain is unlinked, tagged M_PROTO1 and
+ * handed to ifnet_output_raw() on dst_ifp.  Output statistics are kept
+ * on the bridge interface (sc->sc_ifp).
+ */
+static void
+bridge_enqueue(struct bridge_softc *sc, struct ifnet *dst_ifp, struct mbuf *m)
+{
+	int len, error = 0;
+	struct mbuf *m0;
+
+	/* We may be sending a fragment so traverse the mbuf */
+	for (; m; m = m0) {
+		m0 = m->m_nextpkt;
+		m->m_nextpkt = NULL;
+
+		/* Sample the length now; m is not ours after the output call */
+		len = m->m_pkthdr.len;
+		m->m_flags |= M_PROTO1; //set to avoid loops
+
+#if HAS_IF_CAP
+		/*
+		 * If underlying interface can not do VLAN tag insertion itself
+		 * then attach a packet tag that holds it.
+		 */
+		if ((m->m_flags & M_VLANTAG) &&
+		    (dst_ifp->if_capenable & IFCAP_VLAN_HWTAGGING) == 0) {
+			m = ether_vlanencap(m, m->m_pkthdr.ether_vtag);
+			if (m == NULL) {
+				printf("%s%d: unable to prepend VLAN header\n",
+				    ifnet_name(dst_ifp), ifnet_unit(dst_ifp));
+				(void) ifnet_stat_increment_out(dst_ifp, 0, 0, 1);
+				continue;
+			}
+			m->m_flags &= ~M_VLANTAG;
+		}
+#endif /* HAS_IF_CAP */
+
+		error = ifnet_output_raw(dst_ifp, 0, m);
+		if (error == 0) {
+			(void) ifnet_stat_increment_out(sc->sc_ifp, 1, len, 0);
+		} else {
+			(void) ifnet_stat_increment_out(sc->sc_ifp, 0, 0, 1);
+		}
+	}
+
+	return;
+}
+
+#if HAS_BRIDGE_DUMMYNET
+/*
+ * bridge_dummynet:
+ *
+ * Receive a queued packet from dummynet and pass it on to the output
+ * interface.
+ *
+ * The mbuf has the Ethernet header already attached.
+ */
+static void
+bridge_dummynet(struct mbuf *m, struct ifnet *ifp)
+{
+	struct bridge_softc *sc;
+
+	sc = ifp->if_bridge;
+
+	/*
+	 * The packet didn't originate from a member interface. This should
+	 * only ever happen if a member interface is removed while packets
+	 * are queued for it.
+	 */
+	if (sc == NULL) {
+		m_freem(m);
+		return;
+	}
+
+	if (PFIL_HOOKED(&inet_pfil_hook)
+#ifdef INET6
+	    || PFIL_HOOKED(&inet6_pfil_hook)
+#endif
+	    ) {
+		if (bridge_pfil(&m, sc->sc_ifp, ifp, PFIL_OUT) != 0)
+			return;
+		if (m == NULL)
+			return;
+	}
+
+	bridge_enqueue(sc, ifp, m);
+}
+#endif /* HAS_BRIDGE_DUMMYNET */
+
+#if BRIDGE_MEMBER_OUT_FILTER
+/*
+ * bridge_output:
+ *
+ * Send output from a bridge member interface. This
+ * performs the bridging function for locally originated
+ * packets.
+ *
+ * The mbuf has the Ethernet header already attached. We must
+ * enqueue or free the mbuf before returning.
+ *
+ * Always returns 0.
+ */
+static int
+bridge_output(struct ifnet *ifp, struct mbuf *m, __unused struct sockaddr *sa,
+    __unused struct rtentry *rt)
+{
+	struct ether_header *eh;
+	struct ifnet *dst_if;
+	struct bridge_softc *sc;
+	uint16_t vlan;
+
+#if BRIDGE_DEBUG
+	if (_if_brige_debug)
+		printf("bridge_output ifp %p %s%d\n", ifp, ifnet_name(ifp), ifnet_unit(ifp));
+#endif /* BRIDGE_DEBUG */
+
+	if (m->m_len < ETHER_HDR_LEN) {
+		m = m_pullup(m, ETHER_HDR_LEN);
+		if (m == NULL)
+			return (0);
+	}
+
+	eh = mtod(m, struct ether_header *);
+	sc = ifp->if_bridge;
+	vlan = VLANTAGOF(m);
+
+	BRIDGE_LOCK(sc);
+
+	/* APPLE MODIFICATION
+	 * If the packet is an 802.1X ethertype, then only send on the
+	 * original output interface.
+	 */
+	if (eh->ether_type == htons(ETHERTYPE_PAE)) {
+		dst_if = ifp;
+		goto sendunicast;
+	}
+
+	/*
+	 * If bridge is down, but the original output interface is up,
+	 * go ahead and send out that interface. Otherwise, the packet
+	 * is dropped below.
+	 */
+	if ((sc->sc_ifp->if_flags & IFF_RUNNING) == 0) {
+		dst_if = ifp;
+		goto sendunicast;
+	}
+
+	/*
+	 * If the packet is a multicast, or we don't know a better way to
+	 * get there, send to all interfaces.
+	 */
+	if (ETHER_IS_MULTICAST(eh->ether_dhost))
+		dst_if = NULL;
+	else
+		dst_if = bridge_rtlookup(sc, eh->ether_dhost, vlan);
+	if (dst_if == NULL) {
+		struct bridge_iflist *bif;
+		struct mbuf *mc;
+		int error = 0, used = 0;
+
+		bridge_span(sc, m);
+
+		BRIDGE_LOCK2REF(sc, error);
+		if (error) {
+			m_freem(m);
+			return (0);
+		}
+
+		TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) {
+			dst_if = bif->bif_ifp;
+
+			if (dst_if->if_type == IFT_GIF)
+				continue;
+			if ((dst_if->if_flags & IFF_RUNNING) == 0)
+				continue;
+
+			/*
+			 * If this is not the original output interface,
+			 * and the interface is participating in spanning
+			 * tree, make sure the port is in a state that
+			 * allows forwarding.
+			 */
+			if (dst_if != ifp && (bif->bif_flags & IFBIF_STP) &&
+			    bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING)
+				continue;
+
+			/*
+			 * sc_iflist is a TAILQ (see TAILQ_FOREACH above), so
+			 * the "last member" test must use TAILQ_NEXT; the
+			 * last member gets the original mbuf, not a copy.
+			 */
+			if (TAILQ_NEXT(bif, bif_next) == NULL) {
+				used = 1;
+				mc = m;
+			} else {
+				mc = m_copypacket(m, M_DONTWAIT);
+				if (mc == NULL) {
+					(void) ifnet_stat_increment_out(sc->sc_ifp, 0, 0, 1);
+					continue;
+				}
+			}
+
+			bridge_enqueue(sc, dst_if, mc);
+		}
+		if (used == 0)
+			m_freem(m);
+		BRIDGE_UNREF(sc);
+		return (0);
+	}
+
+sendunicast:
+	/*
+	 * XXX Spanning tree consideration here?
+	 */
+
+	bridge_span(sc, m);
+	if ((dst_if->if_flags & IFF_RUNNING) == 0) {
+		m_freem(m);
+		BRIDGE_UNLOCK(sc);
+		return (0);
+	}
+
+	BRIDGE_UNLOCK(sc);
+	bridge_enqueue(sc, dst_if, m);
+	return (0);
+}
+#endif /* BRIDGE_MEMBER_OUT_FILTER */
+
+#if APPLE_BRIDGE_HWCKSUM_SUPPORT
+/*
+ * bridge_fix_txcsum:
+ *
+ * Compute in software the IPv4 / TCPv4 / UDPv4 checksums that the
+ * packet header flags (m_pkthdr.csum_flags) mark as deferred, then
+ * clear those flags.
+ *
+ * Returns the (possibly re-linked) mbuf chain with the Ethernet header
+ * still at the front, or NULL on failure, in which case the packet has
+ * been freed.
+ */
+static struct mbuf* bridge_fix_txcsum( struct mbuf *m )
+{
+	// basic tests indicate that the vast majority of packets being processed
+	// here have an Ethernet header mbuf pre-pended to them (the first case below)
+	// the second highest are those where the Ethernet and IP/TCP/UDP headers are
+	// all in one mbuf (second case below)
+	// the third case has, in fact, never hit for me -- although if I comment out
+	// the first two cases, that code works for them, so I consider it a
+	// decent general solution
+
+	int amt = ETHER_HDR_LEN;
+	int hlen = M_CSUM_DATA_IPv4_IPHL( m->m_pkthdr.csum_data );
+	int off = M_CSUM_DATA_IPv4_OFFSET( m->m_pkthdr.csum_data );
+
+	/*
+	 * NOTE we should never get vlan-attached packets here;
+	 * support for those COULD be added, but we don't use them
+	 * and it really kinda slows things down to worry about them
+	 */
+
+#ifdef DIAGNOSTIC
+	if ( m_tag_find( m, PACKET_TAG_VLAN, NULL ) != NULL )
+	{
+		printf( "bridge: transmitting packet tagged with VLAN?\n" );
+		KASSERT( 0 );
+		m_freem( m );
+		return NULL;
+	}
+#endif
+
+	// amt = number of leading bytes that must sit in the first mbuf for
+	// the in-place (second) case below
+	if ( m->m_pkthdr.csum_flags & M_CSUM_IPv4 )
+	{
+		amt += hlen;
+	}
+	if ( m->m_pkthdr.csum_flags & M_CSUM_TCPv4 )
+	{
+		amt += off + sizeof( uint16_t );
+	}
+
+	if ( m->m_pkthdr.csum_flags & M_CSUM_UDPv4 )
+	{
+		amt += off + sizeof( uint16_t );
+	}
+
+	if ( m->m_len == ETHER_HDR_LEN )
+	{
+		// this is the case where there's an Ethernet header in an mbuf
+
+		// the first mbuf is the Ethernet header -- just strip it off and do the checksum
+		struct mbuf *m_ip = m->m_next;
+
+		// set up m_ip so the cksum operations work
+		/* APPLE MODIFICATION 22 Apr 2008
+		 * Clear the m_tag list before setting
+		 * M_PKTHDR.
+		 *
+		 * If this m_buf chain was extended via M_PREPEND(), then
+		 * m_ip->m_pkthdr is identical to m->m_pkthdr (see
+		 * M_MOVE_PKTHDR()). The only thing preventing access to this
+		 * invalid packet header data is the fact that the M_PKTHDR
+		 * flag is clear, i.e., m_ip->m_flag & M_PKTHDR == 0, but we're
+		 * about to set the M_PKTHDR flag, so to be safe we initialize,
+		 * more accurately, we clear, m_ip->m_pkthdr.tags via
+		 * m_tag_init().
+		 *
+		 * Suppose that we do not do this; if m_pullup(), below, fails,
+		 * then m_ip will be freed along with m_ip->m_pkthdr.tags, but
+		 * we will also free m soon after, via m_freem(), and
+		 * consequently attempt to free m->m_pkthdr.tags in the
+		 * process. The problem is that m->m_pkthdr.tags will have
+		 * already been freed by virtue of being equal to
+		 * m_ip->m_pkthdr.tags. Attempts to dereference
+		 * m->m_pkthdr.tags in m_tag_delete_chain() will result in a
+		 * panic.
+		 */
+		m_tag_init(m_ip);
+		/* END MODIFICATION */
+		m_ip->m_flags |= M_PKTHDR;
+		m_ip->m_pkthdr.csum_flags = m->m_pkthdr.csum_flags;
+		m_ip->m_pkthdr.csum_data = m->m_pkthdr.csum_data;
+		m_ip->m_pkthdr.len = m->m_pkthdr.len - ETHER_HDR_LEN;
+
+		// set up the header mbuf so we can prepend it back on again later
+		m->m_pkthdr.csum_flags = 0;
+		m->m_pkthdr.csum_data = 0;
+		m->m_pkthdr.len = ETHER_HDR_LEN;
+		m->m_next = NULL;
+
+
+		// now do the checksums we need -- first IP
+		if ( m_ip->m_pkthdr.csum_flags & M_CSUM_IPv4 )
+		{
+			// make sure the IP header (or at least the part with the cksum) is there
+			m_ip = m_pullup( m_ip, sizeof( struct ip ) );
+			if ( m_ip == NULL )
+			{
+				printf( "bridge: failed to flatten header\n ");
+				m_freem( m );
+				return NULL;
+			}
+
+			// now do the checksum
+			{
+				struct ip *ip = mtod( m_ip, struct ip* );
+				ip->ip_sum = in_cksum( m_ip, hlen );
+
+#ifdef VERY_VERY_VERY_DIAGNOSTIC
+				printf( "bridge: performed IPv4 checksum\n" );
+#endif
+			}
+		}
+
+		// now do a TCP or UDP delayed checksum
+		if ( m_ip->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4) )
+		{
+			in_delayed_cksum( m_ip );
+
+#ifdef VERY_VERY_VERY_DIAGNOSTIC
+			printf( "bridge: performed TCPv4/UDPv4 checksum\n" );
+#endif
+		}
+
+		// now attach the ethernet header back onto the IP packet
+		m->m_next = m_ip;
+		m->m_pkthdr.len += m_length( m_ip );
+
+		// clear the M_PKTHDR flags on the ip packet (again, we re-attach later)
+		m_ip->m_flags &= ~M_PKTHDR;
+
+		// and clear any csum flags
+		m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_IPv4);
+	}
+	else if ( m->m_len >= amt )
+	{
+		// everything fits in the first mbuf, so futz with m->m_data, m->m_len and m->m_pkthdr.len to
+		// make it work
+		m->m_len -= ETHER_HDR_LEN;
+		m->m_data += ETHER_HDR_LEN;
+		m->m_pkthdr.len -= ETHER_HDR_LEN;
+
+		// now do the checksums we need -- first IP
+		if ( m->m_pkthdr.csum_flags & M_CSUM_IPv4 )
+		{
+			struct ip *ip = mtod( m, struct ip* );
+			ip->ip_sum = in_cksum( m, hlen );
+
+#ifdef VERY_VERY_VERY_DIAGNOSTIC
+			printf( "bridge: performed IPv4 checksum\n" );
+#endif
+		}
+
+		// now do a TCP or UDP delayed checksum
+		if ( m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4) )
+		{
+			in_delayed_cksum( m );
+
+#ifdef VERY_VERY_VERY_DIAGNOSTIC
+			printf( "bridge: performed TCPv4/UDPv4 checksum\n" );
+#endif
+		}
+
+		// now stick the ethernet header back on
+		m->m_len += ETHER_HDR_LEN;
+		m->m_data -= ETHER_HDR_LEN;
+		m->m_pkthdr.len += ETHER_HDR_LEN;
+
+		// and clear any csum flags
+		m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_IPv4);
+	}
+	else
+	{
+		struct mbuf *m_ip;
+
+		// general case -- need to simply split it off and deal
+
+		// first, calculate how much needs to be made writable (we may have a read-only mbuf here)
+		hlen = M_CSUM_DATA_IPv4_IPHL( m->m_pkthdr.csum_data );
+#if PARANOID
+		off = M_CSUM_DATA_IPv4_OFFSET( m->m_pkthdr.csum_data );
+
+		if ( m->m_pkthdr.csum_flags & M_CSUM_IPv4 )
+		{
+			amt += hlen;
+		}
+
+		// NOTE(review): the two sizeof() operands below are pointer
+		// types, so they add the size of a pointer rather than of the
+		// TCP/UDP header -- confirm intent before enabling PARANOID
+		if ( m->m_pkthdr.csum_flags & M_CSUM_TCPv4 )
+		{
+			amt += sizeof( struct tcphdr * );
+			amt += off;
+		}
+
+		if ( m->m_pkthdr.csum_flags & M_CSUM_UDPv4 )
+		{
+			amt += sizeof( struct udphdr * );
+			amt += off;
+		}
+#endif
+
+		// now split the ethernet header off of the IP packet (we'll re-attach later)
+		m_ip = m_split( m, ETHER_HDR_LEN, M_NOWAIT );
+		if ( m_ip == NULL )
+		{
+			printf( "bridge_fix_txcsum: could not split ether header\n" );
+
+			m_freem( m );
+			return NULL;
+		}
+
+#if PARANOID
+		// make sure that the IP packet is writable for the portion we need
+		if ( m_makewritable( &m_ip, 0, amt, M_DONTWAIT ) != 0 )
+		{
+			printf( "bridge_fix_txcsum: could not make %d bytes writable\n", amt );
+
+			m_freem( m );
+			m_freem( m_ip );
+			return NULL;
+		}
+#endif
+
+		m_ip->m_pkthdr.csum_flags = m->m_pkthdr.csum_flags;
+		m_ip->m_pkthdr.csum_data = m->m_pkthdr.csum_data;
+
+		m->m_pkthdr.csum_flags = 0;
+		m->m_pkthdr.csum_data = 0;
+
+		// now do the checksums we need -- first IP
+		if ( m_ip->m_pkthdr.csum_flags & M_CSUM_IPv4 )
+		{
+			// make sure the IP header (or at least the part with the cksum) is there
+			m_ip = m_pullup( m_ip, sizeof( struct ip ) );
+			if ( m_ip == NULL )
+			{
+				printf( "bridge: failed to flatten header\n ");
+				m_freem( m );
+				return NULL;
+			}
+
+			// now do the checksum
+			{
+				struct ip *ip = mtod( m_ip, struct ip* );
+				ip->ip_sum = in_cksum( m_ip, hlen );
+
+#ifdef VERY_VERY_VERY_DIAGNOSTIC
+				printf( "bridge: performed IPv4 checksum\n" );
+#endif
+			}
+		}
+
+		// now do a TCP or UDP delayed checksum
+		if ( m_ip->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4) )
+		{
+			in_delayed_cksum( m_ip );
+
+#ifdef VERY_VERY_VERY_DIAGNOSTIC
+			printf( "bridge: performed TCPv4/UDPv4 checksum\n" );
+#endif
+		}
+
+		// now attach the ethernet header back onto the IP packet
+		m->m_next = m_ip;
+		m->m_pkthdr.len += m_length( m_ip );
+
+		// clear the M_PKTHDR flags on the ip packet (again, we re-attach later)
+		m_ip->m_flags &= ~M_PKTHDR;
+
+		// and clear any csum flags
+		m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_IPv4);
+	}
+
+	return m;
+}
+#endif
+
+/*
+ * 
bridge_start:
+ *
+ * Start output on a bridge.
+ *
+ * Must be entered without the bridge lock held (asserted).  The lock
+ * is taken here and is always released before returning: directly on
+ * the unicast and checksum-failure paths, and by bridge_broadcast()
+ * on the broadcast path.  Always returns 0.
+ *
+ */
+static errno_t
+bridge_start(struct ifnet *ifp, struct mbuf *m)
+{
+	struct bridge_softc *sc = ifnet_softc(ifp);
+	struct ether_header *eh;
+	struct ifnet *dst_if;
+
+	lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_NOTOWNED);
+
+	eh = mtod(m, struct ether_header *);
+
+	BRIDGE_LOCK(sc);
+
+	if ((m->m_flags & (M_BCAST|M_MCAST)) == 0 &&
+	    (dst_if = bridge_rtlookup(sc, eh->ether_dhost, 0)) != NULL) {
+#if APPLE_BRIDGE_HWCKSUM_SUPPORT
+		/*
+		 * APPLE MODIFICATION - if the packet needs a checksum (i.e.,
+		 * checksum has been deferred for HW support) AND the destination
+		 * interface doesn't support HW checksums, then we
+		 * need to fix-up the checksum here
+		 */
+		if (
+			( (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_IPv4) ) != 0 ) &&
+			( (dst_if->if_csum_flags_tx & m->m_pkthdr.csum_flags ) != m->m_pkthdr.csum_flags )
+			)
+		{
+			m = bridge_fix_txcsum( m );
+			if ( m == NULL )
+			{
+				/* packet is gone; do not leave with the lock held */
+				BRIDGE_UNLOCK(sc);
+				return 0;
+			}
+		}
+
+#else
+		if (eh->ether_type == htons(ETHERTYPE_IP))
+			mbuf_outbound_finalize(m, PF_INET, sizeof(struct ether_header));
+		else
+			m->m_pkthdr.csum_flags = 0;
+#endif
+	#if NBPFILTER > 0
+		if (sc->sc_bpf_output)
+			bridge_bpf_output(ifp, m);
+	#endif
+		BRIDGE_UNLOCK(sc);
+		bridge_enqueue(sc, dst_if, m);
+	} else
+	{
+#if APPLE_BRIDGE_HWCKSUM_SUPPORT
+
+		/*
+		 * APPLE MODIFICATION - if the MULTICAST packet needs a checksum (i.e.,
+		 * checksum has been deferred for HW support) AND at least one destination
+		 * interface doesn't support HW checksums, then we go ahead and fix it up
+		 * here, since it doesn't make sense to do it more than once
+		 */
+
+		if (
+			(m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_IPv4)) &&
+			/*
+			 * XXX FIX ME: keep track of whether or not we have any interfaces that
+			 * do not support checksums (for now, assume we do)
+			 */
+			( 1 )
+			)
+		{
+			m = bridge_fix_txcsum( m );
+			if ( m == NULL )
+			{
+				/* packet is gone; do not leave with the lock held */
+				BRIDGE_UNLOCK(sc);
+				return 0;
+			}
+		}
+#else
+		if (eh->ether_type == htons(ETHERTYPE_IP))
+			mbuf_outbound_finalize(m, PF_INET, sizeof(struct ether_header));
+		else
+			m->m_pkthdr.csum_flags = 0;
+#endif
+
+	#if NBPFILTER > 0
+		if (sc->sc_bpf_output)
+			bridge_bpf_output(ifp, m);
+	#endif
+		/* bridge_broadcast() drops the bridge lock for us */
+		bridge_broadcast(sc, ifp, m, 0);
+	}
+
+	return 0;
+}
+
+/*
+ * bridge_forward:
+ *
+ * The forwarding function of the bridge.
+ *
+ * sc is the bridge, sbif the member the packet arrived on, and m the
+ * packet, which is always consumed (forwarded, broadcast or freed).
+ *
+ * NOTE: Releases the lock on return.
+ */
+static void
+bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif,
+    struct mbuf *m)
+{
+	struct bridge_iflist *dbif;
+	struct ifnet *src_if, *dst_if, *ifp;
+	struct ether_header *eh;
+	uint16_t vlan;
+	uint8_t *dst;
+	int error;
+
+	lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_OWNED);
+
+#if BRIDGE_DEBUG
+	if (_if_brige_debug)
+		printf("bridge_forward %s%d m%p\n", ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp), m);
+#endif /* BRIDGE_DEBUG */
+
+	src_if = m->m_pkthdr.rcvif;
+	ifp = sc->sc_ifp;
+
+	(void) ifnet_stat_increment_in(ifp, 1, m->m_pkthdr.len, 0);
+	vlan = VLANTAGOF(m);
+
+
+	if ((sbif->bif_flags & IFBIF_STP) &&
+	    sbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING)
+		goto drop;
+
+	eh = mtod(m, struct ether_header *);
+	dst = eh->ether_dhost;
+
+	/* If the interface is learning, record the address. */
+	if (sbif->bif_flags & IFBIF_LEARNING) {
+		error = bridge_rtupdate(sc, eh->ether_shost, vlan,
+		    sbif, 0, IFBAF_DYNAMIC);
+		/*
+		 * If the interface has addresses limits then deny any source
+		 * that is not in the cache.
+		 */
+		if (error && sbif->bif_addrmax)
+			goto drop;
+	}
+
+	if ((sbif->bif_flags & IFBIF_STP) != 0 &&
+	    sbif->bif_stp.bp_state == BSTP_IFSTATE_LEARNING)
+		goto drop;
+
+	/*
+	 * At this point, the port either doesn't participate
+	 * in spanning tree or it is in the forwarding state.
+	 */
+
+	/*
+	 * If the packet is unicast, destined for someone on
+	 * "this" side of the bridge, drop it.
+	 */
+	if ((m->m_flags & (M_BCAST|M_MCAST)) == 0) {
+		dst_if = bridge_rtlookup(sc, dst, vlan);
+		if (src_if == dst_if)
+			goto drop;
+	} else {
+		/*
+		 * Check if its a reserved multicast address, any address
+		 * listed in 802.1D section 7.12.6 may not be forwarded by the
+		 * bridge.
+		 * This is currently 01-80-C2-00-00-00 to 01-80-C2-00-00-0F
+		 */
+		if (dst[0] == 0x01 && dst[1] == 0x80 &&
+		    dst[2] == 0xc2 && dst[3] == 0x00 &&
+		    dst[4] == 0x00 && dst[5] <= 0x0f)
+			goto drop;
+
+
+		/* ...forward it to all interfaces. */
+		atomic_add_64(&ifp->if_imcasts, 1);
+		dst_if = NULL;
+	}
+
+	/*
+	 * If we have a destination interface which is a member of our bridge,
+	 * OR this is a unicast packet, push it through the bpf(4) machinery.
+	 * For broadcast or multicast packets, don't bother because it will
+	 * be reinjected into ether_input. We do this before we pass the packets
+	 * through the pfil(9) framework, as it is possible that pfil(9) will
+	 * drop the packet, or possibly modify it, making it difficult to debug
+	 * firewall issues on the bridge.
+	 */
+#if NBPFILTER > 0
+	if (eh->ether_type == htons(ETHERTYPE_RSN_PREAUTH) ||
+	    dst_if != NULL || (m->m_flags & (M_BCAST | M_MCAST)) == 0) {
+		m->m_pkthdr.rcvif = ifp;
+		if (sc->sc_bpf_input)
+			bridge_bpf_input(ifp, m);
+	}
+#endif /* NBPFILTER */
+
+#if defined(PFIL_HOOKS)
+	/* run the packet filter */
+	if (PFIL_HOOKED(&inet_pfil_hook)
+#ifdef INET6
+	    || PFIL_HOOKED(&inet6_pfil_hook)
+#endif /* INET6 */
+	    ) {
+		/* the filter runs unlocked; a dropped packet returns unlocked */
+		BRIDGE_UNLOCK(sc);
+		if (bridge_pfil(&m, ifp, src_if, PFIL_IN) != 0)
+			return;
+		if (m == NULL)
+			return;
+		BRIDGE_LOCK(sc);
+	}
+#endif /* PFIL_HOOKS */
+
+	if (dst_if == NULL) {
+		/*
+		 * Clear any in-bound checksum flags for this packet.
+		 */
+		mbuf_inbound_modified(m);
+
+		/* bridge_broadcast() drops the bridge lock for us */
+		bridge_broadcast(sc, src_if, m, 1);
+
+		return;
+	}
+
+	/*
+	 * At this point, we're dealing with a unicast frame
+	 * going to a different interface.
+	 */
+	if ((dst_if->if_flags & IFF_RUNNING) == 0)
+		goto drop;
+
+	dbif = bridge_lookup_member_if(sc, dst_if);
+	if (dbif == NULL)
+		/* Not a member of the bridge (anymore?) */
+		goto drop;
+
+	/* Private segments can not talk to each other */
+	if (sbif->bif_flags & dbif->bif_flags & IFBIF_PRIVATE)
+		goto drop;
+
+	if ((dbif->bif_flags & IFBIF_STP) &&
+	    dbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING)
+		goto drop;
+
+#if HAS_DHCPRA_MASK
+	/* APPLE MODIFICATION */
+	if ((dst_if->if_extflags & IFEXTF_DHCPRA_MASK) != 0) {
+		m = ip_xdhcpra_output(dst_if, m);
+		if (!m) {
+			++sc->sc_sc.sc_ifp.if_xdhcpra;
+			/* honor the "releases the lock on return" contract */
+			BRIDGE_UNLOCK(sc);
+			return;
+		}
+	}
+#endif /* HAS_DHCPRA_MASK */
+
+	BRIDGE_UNLOCK(sc);
+
+#if defined(PFIL_HOOKS)
+	if (PFIL_HOOKED(&inet_pfil_hook)
+#ifdef INET6
+	    || PFIL_HOOKED(&inet6_pfil_hook)
+#endif
+	    ) {
+		if (bridge_pfil(&m, ifp, dst_if, PFIL_OUT) != 0)
+			return;
+		if (m == NULL)
+			return;
+	}
+#endif /* PFIL_HOOKS */
+
+	/*
+	 * Clear any in-bound checksum flags for this packet.
+	 */
+	mbuf_inbound_modified(m);
+
+	bridge_enqueue(sc, dst_if, m);
+	return;
+
+drop:
+	BRIDGE_UNLOCK(sc);
+	m_freem(m);
+}
+
+#if BRIDGE_DEBUG
+
+char * ether_ntop(char *, size_t , const u_char *);
+
+/*
+ * ether_ntop:
+ *
+ * Format the six bytes at ap as lower-case, colon-separated hex into
+ * buf (at most len bytes, via snprintf) and return buf.
+ */
+__private_extern__ char *
+ether_ntop(char *buf, size_t len, const u_char *ap)
+{
+	snprintf(buf, len, "%02x:%02x:%02x:%02x:%02x:%02x",
+	    ap[0], ap[1], ap[2], ap[3], ap[4], ap[5]);
+
+	return buf;
+}
+
+#endif /* BRIDGE_DEBUG */
+
+/*
+ * bridge_input:
+ *
+ * Filter input from a member interface. Queue the packet for
+ * bridging if it is not for us.
+ *
+ * Returns 0 when the caller should keep processing m itself, and
+ * EJUSTRETURN when m has been consumed here (forwarded, re-injected
+ * on the bridge interface, or freed).
+ */
+__private_extern__ errno_t
+bridge_input(struct ifnet *ifp, struct mbuf *m, __unused void *frame_header)
+{
+	struct bridge_softc *sc = ifp->if_bridge;
+	struct bridge_iflist *bif, *bif2;
+	struct ifnet *bifp;
+	struct ether_header *eh;
+	struct mbuf *mc, *mc2;
+	uint16_t vlan;
+	int error;
+
+#if BRIDGE_DEBUG
+	if (_if_brige_debug)
+		printf("bridge_input: %s%d from %s%d m %p data %p\n",
+		    ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp),
+		    ifnet_name(ifp), ifnet_unit(ifp),
+		    m, mbuf_data(m));
+#endif /* BRIDGE_DEBUG */
+
+	if ((sc->sc_ifp->if_flags & IFF_RUNNING) == 0) {
+#if BRIDGE_DEBUG
+		if (_if_brige_debug)
+			printf( "bridge_input: %s%d not running passing along\n",
+			    ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp));
+#endif /* BRIDGE_DEBUG */
+		return 0;
+	}
+
+	bifp = sc->sc_ifp;
+	vlan = VLANTAGOF(m);
+
+#ifdef IFF_MONITOR
+	/*
+	 * Implement support for bridge monitoring. If this flag has been
+	 * set on this interface, discard the packet once we push it through
+	 * the bpf(4) machinery, but before we do, increment the byte and
+	 * packet counters associated with this interface.
+	 */
+	if ((bifp->if_flags & IFF_MONITOR) != 0) {
+		m->m_pkthdr.rcvif = bifp;
+		BRIDGE_BPF_MTAP_INPUT(sc, m);
+		(void) ifnet_stat_increment_in(bifp, 1, m->m_pkthdr.len, 0);
+		m_freem(m);
+		return EJUSTRETURN;
+	}
+#endif /* IFF_MONITOR */
+
+	/*
+	 * Need to clear the promiscous flags otherwise it will be
+	 * dropped by DLIL after processing filters
+	 */
+	if ((mbuf_flags(m) & MBUF_PROMISC))
+		mbuf_setflags_mask(m, 0, MBUF_PROMISC);
+
+	BRIDGE_LOCK(sc);
+	bif = bridge_lookup_member_if(sc, ifp);
+	if (bif == NULL) {
+		BRIDGE_UNLOCK(sc);
+#if BRIDGE_DEBUG
+		if (_if_brige_debug)
+			printf( "bridge_input: %s%d bridge_lookup_member_if failed\n",
+			    ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp));
+#endif /* BRIDGE_DEBUG */
+		return 0;
+	}
+
+	eh = mtod(m, struct ether_header *);
+
+	bridge_span(sc, m);
+
+	if (m->m_flags & (M_BCAST|M_MCAST)) {
+
+#if BRIDGE_DEBUG
+		if (_if_brige_debug)
+			if ((m->m_flags & M_MCAST))
+				printf("mulicast: %02x:%02x:%02x:%02x:%02x:%02x\n",
+				    eh->ether_dhost[0], eh->ether_dhost[1], eh->ether_dhost[2],
+				    eh->ether_dhost[3], eh->ether_dhost[4], eh->ether_dhost[5]);
+
+#endif /* BRIDGE_DEBUG */
+
+		/* Tap off 802.1D packets; they do not get forwarded. */
+		if (memcmp(eh->ether_dhost, bstp_etheraddr,
+		    ETHER_ADDR_LEN) == 0) {
+			m = bstp_input(&bif->bif_stp, ifp, m);
+			if (m == NULL) {
+				BRIDGE_UNLOCK(sc);
+				return EJUSTRETURN;
+			}
+		}
+
+		if ((bif->bif_flags & IFBIF_STP) &&
+		    bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) {
+			BRIDGE_UNLOCK(sc);
+			return 0;
+		}
+
+		/*
+		 * Make a deep copy of the packet and enqueue the copy
+		 * for bridge processing; return the original packet for
+		 * local processing.
+		 */
+		mc = m_dup(m, M_DONTWAIT);
+		if (mc == NULL) {
+			BRIDGE_UNLOCK(sc);
+			return 0;
+		}
+
+		/*
+		 * Perform the bridge forwarding function with the copy.
+		 *
+		 * Note that bridge_forward calls BRIDGE_UNLOCK
+		 */
+		bridge_forward(sc, bif, mc);
+
+		/*
+		 * Reinject the mbuf as arriving on the bridge so we have a
+		 * chance at claiming multicast packets. We can not loop back
+		 * here from ether_input as a bridge is never a member of a
+		 * bridge.
+		 */
+		KASSERT(bifp->if_bridge == NULL,
+		    ("loop created in bridge_input"));
+		mc2 = m_dup(m, M_DONTWAIT);
+		if (mc2 != NULL) {
+			/* Keep the layer3 header aligned */
+			int i = min(mc2->m_pkthdr.len, max_protohdr);
+			mc2 = m_copyup(mc2, i, ETHER_ALIGN);
+		}
+		if (mc2 != NULL) {
+			// mark packet as arriving on the bridge
+			mc2->m_pkthdr.rcvif = bifp;
+			mc2->m_pkthdr.header = mbuf_data(mc2);
+
+#if NBPFILTER > 0
+			if (sc->sc_bpf_input)
+				bridge_bpf_input(bifp, mc2);
+#endif /* NBPFILTER */
+			/* step past the Ethernet header before handing to DLIL */
+			(void) mbuf_setdata(mc2, (char *)mbuf_data(mc2) + ETHER_HDR_LEN, mbuf_len(mc2) - ETHER_HDR_LEN);
+			(void) mbuf_pkthdr_adjustlen(mc2, - ETHER_HDR_LEN);
+
+			(void) ifnet_stat_increment_in(bifp, 1, mbuf_pkthdr_len(mc2), 0);
+
+#if BRIDGE_DEBUG
+			if (_if_brige_debug)
+				printf( "bridge_input: %s%d mcast for us\n",
+				    ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp));
+#endif /* BRIDGE_DEBUG */
+
+			dlil_input_packet_list(bifp, mc2);
+		}
+
+		/* Return the original packet for local processing. */
+		return 0;
+	}
+
+	if ((bif->bif_flags & IFBIF_STP) &&
+	    bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) {
+		BRIDGE_UNLOCK(sc);
+		return 0;
+	}
+
+#ifdef DEV_CARP
+#	define OR_CARP_CHECK_WE_ARE_DST(iface) \
+	|| ((iface)->if_carp \
+	    && carp_forus((iface)->if_carp, eh->ether_dhost))
+#	define OR_CARP_CHECK_WE_ARE_SRC(iface) \
+	|| ((iface)->if_carp \
+	    && carp_forus((iface)->if_carp, eh->ether_shost))
+#else
+#	define OR_CARP_CHECK_WE_ARE_DST(iface)
+#	define OR_CARP_CHECK_WE_ARE_SRC(iface)
+#endif
+
+#ifdef INET6
+#	define OR_PFIL_HOOKED_INET6 \
+	|| PFIL_HOOKED(&inet6_pfil_hook)
+#else
+#	define OR_PFIL_HOOKED_INET6
+#endif
+
+/*
+ * PFIL_PHYS: run the input packet filter on the physical interface.
+ * When the filter fails or takes the mbuf, the packet is gone, so the
+ * enclosing function must report it as consumed (EJUSTRETURN), never
+ * as "keep processing" (0).
+ */
+#if defined(PFIL_HOOKS)
+#define PFIL_PHYS(sc, ifp, m) do { \
+	if (pfil_local_phys && \
+	    (PFIL_HOOKED(&inet_pfil_hook) \
+	    OR_PFIL_HOOKED_INET6)) { \
+		if (bridge_pfil(&m, NULL, ifp, \
+		    PFIL_IN) != 0 || m == NULL) { \
+			BRIDGE_UNLOCK(sc); \
+			return (EJUSTRETURN); \
+		} \
+	} \
+	} while (0)
+#else /* PFIL_HOOKS */
+#define PFIL_PHYS(sc, ifp, m)
+#endif /* PFIL_HOOKS */
+
+/*
+ * GRAB_OUR_PACKETS: body of the member-list loop below.  Returns 0 when
+ * the frame is addressed to iface (local delivery).  Frees the frame and
+ * returns EJUSTRETURN when it is dropped: either the learning limit
+ * rejected its source, or it is one of our own frames coming back.
+ */
+#define GRAB_OUR_PACKETS(iface) \
+	if ((iface)->if_type == IFT_GIF) \
+		continue; \
+	/* It is destined for us. */ \
+	if (memcmp(ifnet_lladdr((iface)), eh->ether_dhost, ETHER_ADDR_LEN) == 0 \
+	    OR_CARP_CHECK_WE_ARE_DST((iface)) \
+	    ) { \
+		if ((iface)->if_type == IFT_BRIDGE) { \
+			BRIDGE_BPF_MTAP_INPUT(sc, m); \
+			/* Filter on the physical interface. */ \
+			PFIL_PHYS(sc, iface, m); \
+		} \
+		if (bif->bif_flags & IFBIF_LEARNING) { \
+			error = bridge_rtupdate(sc, eh->ether_shost, \
+			    vlan, bif, 0, IFBAF_DYNAMIC); \
+			if (error && bif->bif_addrmax) { \
+				BRIDGE_UNLOCK(sc); \
+				m_freem(m); \
+				return EJUSTRETURN; \
+			} \
+		} \
+		m->m_pkthdr.rcvif = iface; \
+		BRIDGE_UNLOCK(sc); \
+		return 0; \
+	} \
+	\
+	/* We just received a packet that we sent out. */ \
+	if (memcmp(ifnet_lladdr((iface)), eh->ether_shost, ETHER_ADDR_LEN) == 0 \
+	    OR_CARP_CHECK_WE_ARE_SRC((iface)) \
+	    ) { \
+		BRIDGE_UNLOCK(sc); \
+		m_freem(m); \
+		return EJUSTRETURN; \
+	}
+
+	/*
+	 * Unicast.
+	 */
+	/*
+	 * If the packet is for us, set the packets source as the
+	 * bridge, and return the packet back to ether_input for
+	 * local processing.
+	 */
+	if (memcmp(eh->ether_dhost, ifnet_lladdr(bifp),
+	    ETHER_ADDR_LEN) == 0
+	    OR_CARP_CHECK_WE_ARE_DST(bifp)) {
+
+		/* Mark the packet as arriving on the bridge interface */
+		(void) mbuf_pkthdr_setrcvif(m, bifp);
+		mbuf_pkthdr_setheader(m, frame_header);
+
+		/*
+		 * If the interface is learning, and the source
+		 * address is valid and not multicast, record
+		 * the address.
+		 */
+		if ((bif->bif_flags & IFBIF_LEARNING) != 0 &&
+		    ETHER_IS_MULTICAST(eh->ether_shost) == 0 &&
+		    (eh->ether_shost[0] | eh->ether_shost[1] |
+		    eh->ether_shost[2] | eh->ether_shost[3] |
+		    eh->ether_shost[4] | eh->ether_shost[5]) != 0) {
+			(void) bridge_rtupdate(sc, eh->ether_shost,
+			    vlan, bif, 0, IFBAF_DYNAMIC);
+		}
+
+		BRIDGE_BPF_MTAP_INPUT(sc, m);
+
+		/* step past the Ethernet header before handing to DLIL */
+		(void) mbuf_setdata(m, (char *)mbuf_data(m) + ETHER_HDR_LEN, mbuf_len(m) - ETHER_HDR_LEN);
+		(void) mbuf_pkthdr_adjustlen(m, - ETHER_HDR_LEN);
+
+		(void) ifnet_stat_increment_in(bifp, 1, mbuf_pkthdr_len(m), 0);
+
+		BRIDGE_UNLOCK(sc);
+
+#if BRIDGE_DEBUG
+		if (_if_brige_debug)
+			printf( "bridge_input: %s%d packet for bridge\n",
+			    ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp));
+#endif /* BRIDGE_DEBUG */
+
+		dlil_input_packet_list(bifp, m);
+
+		return EJUSTRETURN;
+	}
+
+	/*
+	 * if the destination of the packet is for the MAC address of
+	 * the member interface itself, then we don't need to forward
+	 * it -- just pass it back. Note that it'll likely just be
+	 * dropped by the stack, but if something else is bound to
+	 * the interface directly (for example, the wireless stats
+	 * protocol -- although that actually uses BPF right now),
+	 * then it will consume the packet
+	 *
+	 * ALSO, note that we do this check AFTER checking for the
+	 * bridge's own MAC address, because the bridge may be
+	 * using the SAME MAC address as one of its interfaces
+	 */
+	if (memcmp(eh->ether_dhost, ifnet_lladdr(ifp),
+	    ETHER_ADDR_LEN) == 0) {
+
+#ifdef VERY_VERY_VERY_DIAGNOSTIC
+		printf("bridge_input: not forwarding packet bound for member interface\n" );
+#endif
+		BRIDGE_UNLOCK(sc);
+		return 0;
+	}
+
+	/* Now check the all bridge members. */
+	TAILQ_FOREACH(bif2, &sc->sc_iflist, bif_next) {
+		GRAB_OUR_PACKETS(bif2->bif_ifp)
+	}
+
+#undef OR_CARP_CHECK_WE_ARE_DST
+#undef OR_CARP_CHECK_WE_ARE_SRC
+#undef OR_PFIL_HOOKED_INET6
+#undef GRAB_OUR_PACKETS
+
+	/*
+	 * Perform the bridge forwarding function.
+	 *
+	 * Note that bridge_forward calls BRIDGE_UNLOCK
+	 */
+	bridge_forward(sc, bif, m);
+
+	return EJUSTRETURN;
+}
+
+/*
+ * bridge_broadcast:
+ *
+ * Send a frame to all interfaces that are members of
+ * the bridge, except for the one on which the packet
+ * arrived.
+ *
+ * NOTE: Releases the lock on return.
+ */
+static void
+bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if,
+	struct mbuf *m, int runfilt)
+{
+#ifndef PFIL_HOOKS
+#pragma unused(runfilt)
+#endif
+	struct bridge_iflist *dbif, *sbif;
+	struct mbuf *mc;
+	struct ifnet *dst_if;
+	int error = 0, used = 0;
+
+	/* Member entry of the arrival interface; NULL-checked before use below. */
+	sbif = bridge_lookup_member_if(sc, src_if);
+
+	/*
+	 * NOTE(review): BRIDGE_LOCK2REF is defined outside this chunk.  From
+	 * its use here it reports failure through 'error', in which case the
+	 * packet is dropped; presumably it trades the bridge lock for a
+	 * reference that BRIDGE_UNREF drops at the end -- confirm.
+	 */
+	BRIDGE_LOCK2REF(sc, error);
+	if (error) {
+		m_freem(m);
+		return;
+	}
+
+#ifdef PFIL_HOOKS
+	/* Filter on the bridge interface before broadcasting */
+	if (runfilt && (PFIL_HOOKED(&inet_pfil_hook)
+#ifdef INET6
+	    || PFIL_HOOKED(&inet6_pfil_hook)
+#endif /* INET6 */
+	    )) {
+		/* On a nonzero return bridge_pfil has already disposed of m. */
+		if (bridge_pfil(&m, sc->sc_ifp, NULL, PFIL_OUT) != 0)
+			goto out;
+		if (m == NULL)
+			goto out;
+	}
+#endif /* PFIL_HOOKS */
+
+	TAILQ_FOREACH(dbif, &sc->sc_iflist, bif_next) {
+		dst_if = dbif->bif_ifp;
+		/* Never send the frame back out the interface it arrived on. */
+		if (dst_if == src_if)
+			continue;
+
+		/* Private segments can not talk to each other */
+		if (sbif && (sbif->bif_flags & dbif->bif_flags & IFBIF_PRIVATE))
+			continue;
+
+		/* Skip spanning-tree members whose port is discarding. */
+		if ((dbif->bif_flags & IFBIF_STP) &&
+		    dbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING)
+			continue;
+
+		/*
+		 * Unicast frames are only flooded to members that have
+		 * IFBIF_DISCOVER set; broadcast/multicast go to everyone.
+		 */
+		if ((dbif->bif_flags & IFBIF_DISCOVER) == 0 &&
+		    (m->m_flags & (M_BCAST|M_MCAST)) == 0)
+			continue;
+
+		if ((dst_if->if_flags & IFF_RUNNING) == 0)
+			continue;
+
+		/*
+		 * The last member on the list receives the original mbuf;
+		 * every earlier member gets its own copy.
+		 */
+		if (TAILQ_NEXT(dbif, bif_next) == NULL) {
+			mc = m;
+			used = 1;
+		} else {
+			mc = m_dup(m, M_DONTWAIT);
+			if (mc == NULL) {
+				/* Copy failed: bump a bridge output counter and move on. */
+				(void) ifnet_stat_increment_out(sc->sc_ifp, 0, 0, 1);
+				continue;
+			}
+		}
+
+#ifdef PFIL_HOOKS
+		/*
+		 * Filter on the output interface. Pass a NULL bridge interface
+		 * pointer so we do not redundantly filter on the bridge for
+		 * each interface we broadcast on.
+		 */
+		if (runfilt && (PFIL_HOOKED(&inet_pfil_hook)
+#ifdef INET6
+		    || PFIL_HOOKED(&inet6_pfil_hook)
+#endif
+		    )) {
+			if (used == 0) {
+				/* Keep the layer3 header aligned */
+				int i = min(mc->m_pkthdr.len, max_protohdr);
+				mc = m_copyup(mc, i, ETHER_ALIGN);
+				if (mc == NULL) {
+					(void) ifnet_stat_increment_out(sc->sc_ifp, 0, 0, 1);
+					continue;
+				}
+			}
+			if (bridge_pfil(&mc, NULL, dst_if, PFIL_OUT) != 0)
+				continue;
+			if (mc == NULL)
+				continue;
+		}
+#endif /* PFIL_HOOKS */
+
+		bridge_enqueue(sc, dst_if, mc);
+	}
+	/* The original was not handed to the last member, so release it here. */
+	if (used == 0)
+		m_freem(m);
+
+#ifdef PFIL_HOOKS
+out:
+#endif /* PFIL_HOOKS */
+
+	BRIDGE_UNREF(sc);
+}
+
+/*
+ * bridge_span:
+ *
+ *	Duplicate a packet out one or more interfaces that are in span mode,
+ *	the original mbuf is unmodified.
+ *
+ *	Each running interface on sc->sc_spanlist is handed its own
+ *	m_copypacket() copy; a failed copy only bumps a bridge output
+ *	counter.
+ */
+static void
+bridge_span(struct bridge_softc *sc, struct mbuf *m)
+{
+	struct bridge_iflist *bif;
+	struct ifnet *dst_if;
+	struct mbuf *mc;
+
+	if (TAILQ_EMPTY(&sc->sc_spanlist))
+		return;
+
+	TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next) {
+		dst_if = bif->bif_ifp;
+
+		if ((dst_if->if_flags & IFF_RUNNING) == 0)
+			continue;
+
+		mc = m_copypacket(m, M_DONTWAIT);
+		if (mc == NULL) {
+			(void) ifnet_stat_increment_out(sc->sc_ifp, 0, 0, 1);
+			continue;
+		}
+
+		bridge_enqueue(sc, dst_if, mc);
+	}
+}
+
+
+
+/*
+ * bridge_rtupdate:
+ *
+ *	Add a bridge routing entry.
+ *
+ *	Looks up (dst, vlan) in the forwarding table, creating the entry if
+ *	it does not exist, and points it at member 'bif'.  When 'flags'
+ *	describes a dynamic entry the expiry time is refreshed; when
+ *	'setflags' is nonzero the entry's flags are replaced with 'flags'.
+ *
+ *	Returns 0 on success, EINVAL for a multicast or all-zero address,
+ *	ENOSPC when the bridge-wide or per-member address limit is reached,
+ *	ENOMEM when no node can be allocated, or the error reported by
+ *	bridge_rtnode_insert().
+ */
+static int
+bridge_rtupdate(struct bridge_softc *sc, const uint8_t *dst, uint16_t vlan,
+	struct bridge_iflist *bif, int setflags, uint8_t flags)
+{
+	struct bridge_rtnode *brt;
+	int error;
+
+	BRIDGE_LOCK_ASSERT(sc);
+
+	/* Check the source address is valid and not multicast. */
+	if (ETHER_IS_MULTICAST(dst) ||
+	    (dst[0] == 0 && dst[1] == 0 && dst[2] == 0 &&
+	     dst[3] == 0 && dst[4] == 0 && dst[5] == 0) != 0)
+		return (EINVAL);
+
+
+	/* 802.1p frames map to vlan 1 */
+	if (vlan == 0)
+		vlan = 1;
+
+	/*
+	 * A route for this destination might already exist.  If so,
+	 * update it, otherwise create a new one.
+	 */
+	if ((brt = bridge_rtnode_lookup(sc, dst, vlan)) == NULL) {
+		if (sc->sc_brtcnt >= sc->sc_brtmax) {
+			sc->sc_brtexceeded++;
+			return (ENOSPC);
+		}
+		/* Check per interface address limits (if enabled) */
+		if (bif->bif_addrmax && bif->bif_addrcnt >= bif->bif_addrmax) {
+			bif->bif_addrexceeded++;
+			return (ENOSPC);
+		}
+
+		/*
+		 * Allocate a new bridge forwarding node, and
+		 * initialize the expiration time and Ethernet
+		 * address.
+		 *
+		 * NOTE(review): brt_expire is only assigned further down,
+		 * and only when 'flags' is IFBAF_DYNAMIC; whether
+		 * zalloc_noblock() returns zeroed memory is not visible
+		 * here -- confirm.
+		 */
+		brt = zalloc_noblock(bridge_rtnode_pool);
+		if (brt == NULL)
+			return (ENOMEM);
+
+		if (bif->bif_flags & IFBIF_STICKY)
+			brt->brt_flags = IFBAF_STICKY;
+		else
+			brt->brt_flags = IFBAF_DYNAMIC;
+
+		memcpy(brt->brt_addr, dst, ETHER_ADDR_LEN);
+		brt->brt_vlan = vlan;
+
+
+		if ((error = bridge_rtnode_insert(sc, brt)) != 0) {
+			zfree(bridge_rtnode_pool, brt);
+			return (error);
+		}
+		/* The node is now in the table: charge it to this member. */
+		brt->brt_dst = bif;
+		bif->bif_addrcnt++;
+	}
+
+	/* A dynamic entry seen on a different member moves to that member. */
+	if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC &&
+	    brt->brt_dst != bif) {
+		brt->brt_dst->bif_addrcnt--;
+		brt->brt_dst = bif;
+		brt->brt_dst->bif_addrcnt++;
+	}
+
+	/* Refresh the expiry: uptime seconds plus the bridge's timeout. */
+	if ((flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) {
+		struct timespec now;
+
+		nanouptime(&now);
+		brt->brt_expire = now.tv_sec + sc->sc_brttimeout;
+	}
+	if (setflags)
+		brt->brt_flags = flags;
+
+
+	return (0);
+}
+
+/*
+ * bridge_rtlookup:
+ *
+ *	Lookup the destination interface for an address.
+ *	Returns NULL when the table has no matching entry.
+ */
+static struct ifnet *
+bridge_rtlookup(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan)
+{
+	struct bridge_rtnode *brt;
+
+	BRIDGE_LOCK_ASSERT(sc);
+
+	if ((brt = bridge_rtnode_lookup(sc, addr, vlan)) == NULL)
+		return (NULL);
+
+	return (brt->brt_ifp);
+}
+
+/*
+ * bridge_rttrim:
+ *
+ *	Trim the routing table so that we have a number
+ *	of routing entries less than or equal to the
+ *	maximum number.
+ */
+static void
+bridge_rttrim(struct bridge_softc *sc)
+{
+	struct bridge_rtnode *brt, *nbrt;
+
+	BRIDGE_LOCK_ASSERT(sc);
+
+	/* Make sure we actually need to do this. */
+	if (sc->sc_brtcnt <= sc->sc_brtmax)
+		return;
+
+	/* Force an aging cycle; this might trim enough addresses. */
+	bridge_rtage(sc);
+	if (sc->sc_brtcnt <= sc->sc_brtmax)
+		return;
+
+	/* Still over the limit: drop dynamic entries until we fit. */
+	LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) {
+		if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) {
+			bridge_rtnode_destroy(sc, brt);
+			if (sc->sc_brtcnt <= sc->sc_brtmax)
+				return;
+		}
+	}
+}
+
+/*
+ * bridge_timer:
+ *
+ *	Aging timer for the bridge.
+ *
+ *	Runs one aging cycle under the bridge lock, then re-arms itself
+ *	through bsd_timeout() for bridge_rtable_prune_period seconds
+ *	later, as long as the bridge interface is IFF_RUNNING.
+ */
+static void
+bridge_timer(void *arg)
+{
+	struct bridge_softc *sc = arg;
+
+	BRIDGE_LOCK(sc);
+
+	bridge_rtage(sc);
+
+	BRIDGE_UNLOCK(sc);
+
+	if (sc->sc_ifp->if_flags & IFF_RUNNING) {
+		struct timespec ts;
+
+		ts.tv_sec = bridge_rtable_prune_period;
+		ts.tv_nsec = 0;
+		bsd_timeout(bridge_timer, sc, &ts);
+	}
+}
+
+/*
+ * bridge_rtage:
+ *
+ *	Perform an aging cycle.
+ *
+ *	Destroys every dynamic entry whose brt_expire is at or before
+ *	the current uptime (in seconds).
+ */
+static void
+bridge_rtage(struct bridge_softc *sc)
+{
+	struct bridge_rtnode *brt, *nbrt;
+
+	BRIDGE_LOCK_ASSERT(sc);
+
+	LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) {
+		if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) {
+			struct timespec now;
+
+			nanouptime(&now);
+			if ((unsigned long)now.tv_sec >= brt->brt_expire)
+				bridge_rtnode_destroy(sc, brt);
+		}
+	}
+}
+
+/*
+ * bridge_rtflush:
+ *
+ *	Remove all dynamic addresses from the bridge.
+ *	When 'full' is nonzero every entry is removed, not only the
+ *	dynamic ones.
+ */
+static void
+bridge_rtflush(struct bridge_softc *sc, int full)
+{
+	struct bridge_rtnode *brt, *nbrt;
+
+	BRIDGE_LOCK_ASSERT(sc);
+
+	LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) {
+		if (full || (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC)
+			bridge_rtnode_destroy(sc, brt);
+	}
+}
+
+/*
+ * bridge_rtdaddr:
+ *
+ *	Remove an address from the table.
+ *	Returns 0 if at least one entry was removed, ENOENT otherwise.
+ */
+static int
+bridge_rtdaddr(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan)
+{
+	struct bridge_rtnode *brt;
+	int found = 0;
+
+	BRIDGE_LOCK_ASSERT(sc);
+
+	/*
+	 * If vlan is zero then we want to delete for all vlans so the lookup
+	 * may return more than one.
+	 */
+	while ((brt = bridge_rtnode_lookup(sc, addr, vlan)) != NULL) {
+		bridge_rtnode_destroy(sc, brt);
+		found = 1;
+	}
+
+	return (found ? 0 : ENOENT);
+}
+
+/*
+ * bridge_rtdelete:
+ *
+ *	Delete routes to a specific member interface.
+ *	Only dynamic entries are removed unless 'full' is nonzero.
+ */
+static void
+bridge_rtdelete(struct bridge_softc *sc, struct ifnet *ifp, int full)
+{
+	struct bridge_rtnode *brt, *nbrt;
+
+	BRIDGE_LOCK_ASSERT(sc);
+
+	LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) {
+		if (brt->brt_ifp == ifp && (full ||
+		    (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC))
+			bridge_rtnode_destroy(sc, brt);
+	}
+}
+
+/*
+ * bridge_rtable_init:
+ *
+ *	Initialize the route table for this bridge.
+ *
+ *	Allocates BRIDGE_RTHASH_SIZE empty hash buckets, picks a random
+ *	hash key and initializes the flat route list.  Returns ENOMEM if
+ *	the bucket array cannot be allocated, 0 otherwise.
+ */
+static int
+bridge_rtable_init(struct bridge_softc *sc)
+{
+	int i;
+
+	sc->sc_rthash = _MALLOC(sizeof(*sc->sc_rthash) * BRIDGE_RTHASH_SIZE,
+	    M_DEVBUF, M_NOWAIT);
+	if (sc->sc_rthash == NULL)
+		return (ENOMEM);
+
+	for (i = 0; i < BRIDGE_RTHASH_SIZE; i++)
+		LIST_INIT(&sc->sc_rthash[i]);
+
+	sc->sc_rthash_key = random();
+
+	LIST_INIT(&sc->sc_rtlist);
+
+	return (0);
+}
+
+/*
+ * bridge_rtable_fini:
+ *
+ *	Deconstruct the route table for this bridge.
+ *	The table is asserted to be empty (sc_brtcnt == 0) before the
+ *	bucket array is freed.
+ */
+static void
+bridge_rtable_fini(struct bridge_softc *sc)
+{
+
+	KASSERT(sc->sc_brtcnt == 0,
+	    ("%s: %d bridge routes referenced", __func__, sc->sc_brtcnt));
+	_FREE(sc->sc_rthash, M_DEVBUF);
+}
+
+/*
+ * The following hash function is adapted from "Hash Functions" by Bob Jenkins
+ * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
+ */ +#define mix(a, b, c) \ +do { \ + a -= b; a -= c; a ^= (c >> 13); \ + b -= c; b -= a; b ^= (a << 8); \ + c -= a; c -= b; c ^= (b >> 13); \ + a -= b; a -= c; a ^= (c >> 12); \ + b -= c; b -= a; b ^= (a << 16); \ + c -= a; c -= b; c ^= (b >> 5); \ + a -= b; a -= c; a ^= (c >> 3); \ + b -= c; b -= a; b ^= (a << 10); \ + c -= a; c -= b; c ^= (b >> 15); \ +} while (/*CONSTCOND*/0) + +static __inline uint32_t +bridge_rthash(struct bridge_softc *sc, const uint8_t *addr) +{ + uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = sc->sc_rthash_key; + + b += addr[5] << 8; + b += addr[4]; + a += addr[3] << 24; + a += addr[2] << 16; + a += addr[1] << 8; + a += addr[0]; + + mix(a, b, c); + + return (c & BRIDGE_RTHASH_MASK); +} + +#undef mix + +static int +bridge_rtnode_addr_cmp(const uint8_t *a, const uint8_t *b) +{ + int i, d; + + for (i = 0, d = 0; i < ETHER_ADDR_LEN && d == 0; i++) { + d = ((int)a[i]) - ((int)b[i]); + } + + return (d); +} + +/* + * bridge_rtnode_lookup: + * + * Look up a bridge route node for the specified destination. Compare the + * vlan id or if zero then just return the first match. + */ +static struct bridge_rtnode * +bridge_rtnode_lookup(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan) +{ + struct bridge_rtnode *brt; + uint32_t hash; + int dir; + + BRIDGE_LOCK_ASSERT(sc); + + hash = bridge_rthash(sc, addr); + LIST_FOREACH(brt, &sc->sc_rthash[hash], brt_hash) { + dir = bridge_rtnode_addr_cmp(addr, brt->brt_addr); + if (dir == 0 && (brt->brt_vlan == vlan || vlan == 0)) + return (brt); + if (dir > 0) + return (NULL); + } + + return (NULL); +} + +/* + * bridge_rtnode_insert: + * + * Insert the specified bridge node into the route table. We + * assume the entry is not already in the table. 
+ */ +static int +bridge_rtnode_insert(struct bridge_softc *sc, struct bridge_rtnode *brt) +{ + struct bridge_rtnode *lbrt; + uint32_t hash; + int dir; + + BRIDGE_LOCK_ASSERT(sc); + + hash = bridge_rthash(sc, brt->brt_addr); + + lbrt = LIST_FIRST(&sc->sc_rthash[hash]); + if (lbrt == NULL) { + LIST_INSERT_HEAD(&sc->sc_rthash[hash], brt, brt_hash); + goto out; + } + + do { + dir = bridge_rtnode_addr_cmp(brt->brt_addr, lbrt->brt_addr); + if (dir == 0 && brt->brt_vlan == lbrt->brt_vlan) + return (EEXIST); + if (dir > 0) { + LIST_INSERT_BEFORE(lbrt, brt, brt_hash); + goto out; + } + if (LIST_NEXT(lbrt, brt_hash) == NULL) { + LIST_INSERT_AFTER(lbrt, brt, brt_hash); + goto out; + } + lbrt = LIST_NEXT(lbrt, brt_hash); + } while (lbrt != NULL); + +#ifdef DIAGNOSTIC + panic("bridge_rtnode_insert: impossible"); +#endif + +out: + LIST_INSERT_HEAD(&sc->sc_rtlist, brt, brt_list); + sc->sc_brtcnt++; + + return (0); +} + +/* + * bridge_rtnode_destroy: + * + * Destroy a bridge rtnode. + */ +static void +bridge_rtnode_destroy(struct bridge_softc *sc, struct bridge_rtnode *brt) +{ + BRIDGE_LOCK_ASSERT(sc); + + LIST_REMOVE(brt, brt_hash); + + LIST_REMOVE(brt, brt_list); + sc->sc_brtcnt--; + brt->brt_dst->bif_addrcnt--; + zfree(bridge_rtnode_pool, brt); +} + +/* + * bridge_rtable_expire: + * + * Set the expiry time for all routes on an interface. 
+ */ +static void +bridge_rtable_expire(struct ifnet *ifp, int age) +{ + struct bridge_softc *sc = ifp->if_bridge; + struct bridge_rtnode *brt; + + BRIDGE_LOCK(sc); + + /* + * If the age is zero then flush, otherwise set all the expiry times to + * age for the interface + */ + if (age == 0) + bridge_rtdelete(sc, ifp, IFBF_FLUSHDYN); + else { + LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) { + struct timespec now; + + nanouptime(&now); + /* Cap the expiry time to 'age' */ + if (brt->brt_ifp == ifp && + brt->brt_expire > (unsigned long)now.tv_sec + age && + (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) + brt->brt_expire = (unsigned long)now.tv_sec + age; + } + } + BRIDGE_UNLOCK(sc); +} + +/* + * bridge_state_change: + * + * Callback from the bridgestp code when a port changes states. + */ +static void +bridge_state_change(struct ifnet *ifp, int state) +{ + struct bridge_softc *sc = ifp->if_bridge; + static const char *stpstates[] = { + "disabled", + "listening", + "learning", + "forwarding", + "blocking", + "discarding" + }; + + if (log_stp) + log(LOG_NOTICE, "%s%d: state changed to %s on %s%d\n", + ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp), + stpstates[state], + ifnet_name(ifp), ifnet_unit(ifp)); +} + +#ifdef PFIL_HOOKS +/* + * Send bridge packets through pfil if they are one of the types pfil can deal + * with, or if they are ARP or REVARP. (pfil will pass ARP and REVARP without + * question.) If *bifp or *ifp are NULL then packet filtering is skipped for + * that interface. 
+ */
+/*
+ * Return convention, as implemented below:
+ *   - 0 with *mp non-NULL: the packet passed and its Ethernet (and
+ *     SNAP/LLC) header has been put back;
+ *   - any return with *mp == NULL: the packet was dropped or consumed;
+ *     note that some drop paths in the IPv4 case return 0, so callers
+ *     must check *mp as well as the return value;
+ *   - nonzero: failure; every "bad" exit frees the mbuf and clears *mp.
+ */
+static int
+bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir)
+{
+	int snap, error, i, hlen;
+	struct ether_header *eh1, eh2;
+	struct ip_fw_args args;
+	struct ip *ip;
+	struct llc llc1;
+	u_int16_t ether_type;
+
+	snap = 0;
+	error = -1;	/* Default error if not error == 0 */
+
+#if 0
+	/* we may return with the IP fields swapped, ensure it's not shared */
+	KASSERT(M_WRITABLE(*mp), ("%s: modifying a shared mbuf", __func__));
+#endif
+
+	if (pfil_bridge == 0 && pfil_member == 0 && pfil_ipfw == 0)
+		return (0); /* filtering is disabled */
+
+	/* Make the first min(packet length, max_protohdr) bytes contiguous. */
+	i = min((*mp)->m_pkthdr.len, max_protohdr);
+	if ((*mp)->m_len < i) {
+		*mp = m_pullup(*mp, i);
+		if (*mp == NULL) {
+			printf("%s: m_pullup failed\n", __func__);
+			return (-1);
+		}
+	}
+
+	eh1 = mtod(*mp, struct ether_header *);
+	ether_type = ntohs(eh1->ether_type);
+
+	/*
+	 * Check for SNAP/LLC.
+	 */
+	if (ether_type < ETHERMTU) {
+		struct llc *llc2 = (struct llc *)(eh1 + 1);
+
+		if ((*mp)->m_len >= ETHER_HDR_LEN + 8 &&
+		    llc2->llc_dsap == LLC_SNAP_LSAP &&
+		    llc2->llc_ssap == LLC_SNAP_LSAP &&
+		    llc2->llc_control == LLC_UI) {
+			/*
+			 * NOTE(review): this converts a wire value to host
+			 * order, so ntohs() would state the intent; htons()
+			 * presumably performs the same swap -- confirm.
+			 */
+			ether_type = htons(llc2->llc_un.type_snap.ether_type);
+			snap = 1;
+		}
+	}
+
+	/*
+	 * If we're trying to filter bridge traffic, don't look at anything
+	 * other than IP and ARP traffic.  If the filter doesn't understand
+	 * IPv6, don't allow IPv6 through the bridge either.  This is lame
+	 * since if we really wanted, say, an AppleTalk filter, we are hosed,
+	 * but of course we don't have an AppleTalk filter to begin with.
+	 * (Note that since pfil doesn't understand ARP it will pass *ALL*
+	 * ARP traffic.)
+	 */
+	switch (ether_type) {
+		case ETHERTYPE_ARP:
+		case ETHERTYPE_REVARP:
+			if (pfil_ipfw_arp == 0)
+				return (0); /* Automatically pass */
+			break;
+
+		case ETHERTYPE_IP:
+#ifdef INET6
+		case ETHERTYPE_IPV6:
+#endif /* INET6 */
+			break;
+		default:
+			/*
+			 * Check to see if the user wants to pass non-ip
+			 * packets, these will not be checked by pfil(9) and
+			 * passed unconditionally so the default is to drop.
+			 */
+			if (pfil_onlyip)
+				goto bad;
+	}
+
+	/* Strip off the Ethernet header and keep a copy. */
+	m_copydata(*mp, 0, ETHER_HDR_LEN, (caddr_t) &eh2);
+	m_adj(*mp, ETHER_HDR_LEN);
+
+	/* Strip off snap header, if present */
+	if (snap) {
+		m_copydata(*mp, 0, sizeof(struct llc), (caddr_t) &llc1);
+		m_adj(*mp, sizeof(struct llc));
+	}
+
+	/*
+	 * Check the IP header for alignment and errors
+	 */
+	if (dir == PFIL_IN) {
+		switch (ether_type) {
+			case ETHERTYPE_IP:
+				error = bridge_ip_checkbasic(mp);
+				break;
+#ifdef INET6
+			case ETHERTYPE_IPV6:
+				error = bridge_ip6_checkbasic(mp);
+				break;
+#endif /* INET6 */
+			default:
+				error = 0;
+		}
+		if (error)
+			goto bad;
+	}
+
+	/* ipfw is only consulted on output to a member interface. */
+	if (IPFW_LOADED && pfil_ipfw != 0 && dir == PFIL_OUT && ifp != NULL) {
+		error = -1;
+		args.rule = ip_dn_claim_rule(*mp);
+		if (args.rule != NULL && fw_one_pass)
+			goto ipfwpass; /* packet already partially processed */
+
+		args.m = *mp;
+		args.oif = ifp;
+		args.next_hop = NULL;
+		args.eh = &eh2;
+		args.inp = NULL;	/* used by ipfw uid/gid/jail rules */
+		i = ip_fw_chk_ptr(&args);
+		*mp = args.m;
+
+		/* The firewall took the packet; report the default error. */
+		if (*mp == NULL)
+			return (error);
+
+		if (DUMMYNET_LOADED && (i == IP_FW_DUMMYNET)) {
+
+			/* put the Ethernet header back on */
+			M_PREPEND(*mp, ETHER_HDR_LEN, M_DONTWAIT);
+			if (*mp == NULL)
+				return (error);
+			bcopy(&eh2, mtod(*mp, caddr_t), ETHER_HDR_LEN);
+
+			/*
+			 * Pass the pkt to dummynet, which consumes it. The
+			 * packet will return to us via bridge_dummynet().
+			 */
+			args.oif = ifp;
+			ip_dn_io_ptr(mp, DN_TO_IFB_FWD, &args);
+			return (error);
+		}
+
+		if (i != IP_FW_PASS) /* drop */
+			goto bad;
+	}
+
+ipfwpass:
+	error = 0;
+
+	/*
+	 * Run the packet through pfil
+	 */
+	switch (ether_type) {
+	case ETHERTYPE_IP:
+		/*
+		 * before calling the firewall, swap fields the same as
+		 * IP does. here we assume the header is contiguous
+		 */
+		ip = mtod(*mp, struct ip *);
+
+		ip->ip_len = ntohs(ip->ip_len);
+		ip->ip_off = ntohs(ip->ip_off);
+
+		/*
+		 * Run pfil on the member interface and the bridge, both can
+		 * be skipped by clearing pfil_member or pfil_bridge.
+		 *
+		 * Keep the order:
+		 *   in_if -> bridge_if -> out_if
+		 */
+		if (pfil_bridge && dir == PFIL_OUT && bifp != NULL)
+			error = pfil_run_hooks(&inet_pfil_hook, mp, bifp,
+					dir, NULL);
+
+		if (*mp == NULL || error != 0) /* filter may consume */
+			break;
+
+		if (pfil_member && ifp != NULL)
+			error = pfil_run_hooks(&inet_pfil_hook, mp, ifp,
+					dir, NULL);
+
+		if (*mp == NULL || error != 0) /* filter may consume */
+			break;
+
+		if (pfil_bridge && dir == PFIL_IN && bifp != NULL)
+			error = pfil_run_hooks(&inet_pfil_hook, mp, bifp,
+					dir, NULL);
+
+		if (*mp == NULL || error != 0) /* filter may consume */
+			break;
+
+		/* check if we need to fragment the packet */
+		if (pfil_member && ifp != NULL && dir == PFIL_OUT) {
+			i = (*mp)->m_pkthdr.len;
+			if (i > ifp->if_mtu) {
+				/*
+				 * bridge_fragment() puts the link-layer
+				 * headers back itself, so return directly.
+				 */
+				error = bridge_fragment(ifp, *mp, &eh2, snap,
+					    &llc1);
+				return (error);
+			}
+		}
+
+		/*
+		 * Recalculate the ip checksum and restore byte ordering.
+		 * error is 0 at this point, so the "goto bad" exits below
+		 * free the packet and return 0 with *mp == NULL.
+		 */
+		ip = mtod(*mp, struct ip *);
+		hlen = ip->ip_hl << 2;
+		if (hlen < sizeof(struct ip))
+			goto bad;
+		if (hlen > (*mp)->m_len) {
+			if ((*mp = m_pullup(*mp, hlen)) == 0)
+				goto bad;
+			ip = mtod(*mp, struct ip *);
+			if (ip == NULL)
+				goto bad;
+		}
+		ip->ip_len = htons(ip->ip_len);
+		ip->ip_off = htons(ip->ip_off);
+		ip->ip_sum = 0;
+		if (hlen == sizeof(struct ip))
+			ip->ip_sum = in_cksum_hdr(ip);
+		else
+			ip->ip_sum = in_cksum(*mp, hlen);
+
+		break;
+#ifdef INET6
+	case ETHERTYPE_IPV6:
+		/* Same hook ordering as IPv4: bridge-out, member, bridge-in. */
+		if (pfil_bridge && dir == PFIL_OUT && bifp != NULL)
+			error = pfil_run_hooks(&inet6_pfil_hook, mp, bifp,
+					dir, NULL);
+
+		if (*mp == NULL || error != 0) /* filter may consume */
+			break;
+
+		if (pfil_member && ifp != NULL)
+			error = pfil_run_hooks(&inet6_pfil_hook, mp, ifp,
+					dir, NULL);
+
+		if (*mp == NULL || error != 0) /* filter may consume */
+			break;
+
+		if (pfil_bridge && dir == PFIL_IN && bifp != NULL)
+			error = pfil_run_hooks(&inet6_pfil_hook, mp, bifp,
+					dir, NULL);
+		break;
+#endif
+	default:
+		error = 0;
+		break;
+	}
+
+	if (*mp == NULL)
+		return (error);
+	if (error != 0)
+		goto bad;
+
+	/* From here on a failed M_PREPEND reports the default error. */
+	error = -1;
+
+	/*
+	 * Finally, put everything back the way it was and return
+	 */
+	if (snap) {
+		M_PREPEND(*mp, sizeof(struct llc), M_DONTWAIT);
+		if (*mp == NULL)
+			return (error);
+		bcopy(&llc1, mtod(*mp, caddr_t), sizeof(struct llc));
+	}
+
+	M_PREPEND(*mp, ETHER_HDR_LEN, M_DONTWAIT);
+	if (*mp == NULL)
+		return (error);
+	bcopy(&eh2, mtod(*mp, caddr_t), ETHER_HDR_LEN);
+
+	return (0);
+
+bad:
+	/* Common drop path: free the packet and clear the caller's pointer. */
+	m_freem(*mp);
+	*mp = NULL;
+	return (error);
+}
+
+
+/*
+ * Perform basic checks on header size since
+ * pfil assumes ip_input has already processed
+ * it for it.  Cut-and-pasted from ip_input.c.
+ * Given how simple the IPv6 version is,
+ * does the IPv4 version really need to be
+ * this complicated?
+ *
+ * XXX Should we update ipstat here, or not?
+ * XXX Right now we update ipstat but not
+ * XXX csum_counter.
+ */ +static int +bridge_ip_checkbasic(struct mbuf **mp) +{ + struct mbuf *m = *mp; + struct ip *ip; + int len, hlen; + u_short sum; + + if (*mp == NULL) + return (-1); + + if (IP_HDR_ALIGNED_P(mtod(m, caddr_t)) == 0) { + if ((m = m_copyup(m, sizeof(struct ip), + (max_linkhdr + 3) & ~3)) == NULL) { + /* XXXJRT new stat, please */ + ipstat.ips_toosmall++; + goto bad; + } + } else if (__predict_false(m->m_len < sizeof (struct ip))) { + if ((m = m_pullup(m, sizeof (struct ip))) == NULL) { + ipstat.ips_toosmall++; + goto bad; + } + } + ip = mtod(m, struct ip *); + if (ip == NULL) goto bad; + + if (ip->ip_v != IPVERSION) { + ipstat.ips_badvers++; + goto bad; + } + hlen = ip->ip_hl << 2; + if (hlen < sizeof(struct ip)) { /* minimum header length */ + ipstat.ips_badhlen++; + goto bad; + } + if (hlen > m->m_len) { + if ((m = m_pullup(m, hlen)) == 0) { + ipstat.ips_badhlen++; + goto bad; + } + ip = mtod(m, struct ip *); + if (ip == NULL) goto bad; + } + + if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { + sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); + } else { + if (hlen == sizeof(struct ip)) { + sum = in_cksum_hdr(ip); + } else { + sum = in_cksum(m, hlen); + } + } + if (sum) { + ipstat.ips_badsum++; + goto bad; + } + + /* Retrieve the packet length. */ + len = ntohs(ip->ip_len); + + /* + * Check for additional length bogosity + */ + if (len < hlen) { + ipstat.ips_badlen++; + goto bad; + } + + /* + * Check that the amount of data in the buffers + * is as at least much as the IP header would have us expect. + * Drop packet if shorter than we expect. + */ + if (m->m_pkthdr.len < len) { + ipstat.ips_tooshort++; + goto bad; + } + + /* Checks out, proceed */ + *mp = m; + return (0); + +bad: + *mp = m; + return (-1); +} + +#ifdef INET6 +/* + * Same as above, but for IPv6. + * Cut-and-pasted from ip6_input.c. + * XXX Should we update ip6stat, or not? 
+ */
+static int
+bridge_ip6_checkbasic(struct mbuf **mp)
+{
+	struct mbuf *pkt = *mp;
+	struct ifnet *rcvif = NULL;
+	struct ip6_hdr *hdr;
+	int rc = -1;		/* stays -1 unless every check passes */
+	int lost = 0;		/* set when m_copyup/m_pullup returned NULL */
+
+	/*
+	 * An unaligned IPv6 header is copied into a fresh mbuf that leaves
+	 * room for link headers in case we forward the packet; an aligned
+	 * one only has to have the whole base header in the first mbuf.
+	 * The receive interface is read before the mbuf can be replaced so
+	 * the error can still be charged to it.
+	 */
+	if (IP6_HDR_ALIGNED_P(mtod(pkt, caddr_t)) == 0) {
+		rcvif = pkt->m_pkthdr.rcvif;
+		pkt = m_copyup(pkt, sizeof(struct ip6_hdr),
+		    (max_linkhdr + 3) & ~3);
+		lost = (pkt == NULL);
+	} else if (__predict_false(pkt->m_len < sizeof(struct ip6_hdr))) {
+		rcvif = pkt->m_pkthdr.rcvif;
+		pkt = m_pullup(pkt, sizeof(struct ip6_hdr));
+		lost = (pkt == NULL);
+	}
+
+	if (lost) {
+		/* XXXJRT new stat, please */
+		ip6stat.ip6s_toosmall++;
+		in6_ifstat_inc(rcvif, ifs6_in_hdrerr);
+		goto done;
+	}
+
+	hdr = mtod(pkt, struct ip6_hdr *);
+
+	if ((hdr->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
+		ip6stat.ip6s_badvers++;
+		in6_ifstat_inc(pkt->m_pkthdr.rcvif, ifs6_in_hdrerr);
+		goto done;
+	}
+
+	/* Checks out, proceed */
+	rc = 0;
+
+done:
+	*mp = pkt;
+	return (rc);
+}
+#endif /* INET6 */
+
+/*
+ * bridge_fragment:
+ *
+ *	Return a fragmented mbuf chain.
+ */ +static int +bridge_fragment(struct ifnet *ifp, struct mbuf *m, struct ether_header *eh, + int snap, struct llc *llc) +{ + struct mbuf *m0; + struct ip *ip; + int error = -1; + + if (m->m_len < sizeof(struct ip) && + (m = m_pullup(m, sizeof(struct ip))) == NULL) + goto out; + ip = mtod(m, struct ip *); + + error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist, + CSUM_DELAY_IP); + if (error) + goto out; + + /* walk the chain and re-add the Ethernet header */ + for (m0 = m; m0; m0 = m0->m_nextpkt) { + if (error == 0) { + if (snap) { + M_PREPEND(m0, sizeof(struct llc), M_DONTWAIT); + if (m0 == NULL) { + error = ENOBUFS; + continue; + } + bcopy(llc, mtod(m0, caddr_t), + sizeof(struct llc)); + } + M_PREPEND(m0, ETHER_HDR_LEN, M_DONTWAIT); + if (m0 == NULL) { + error = ENOBUFS; + continue; + } + bcopy(eh, mtod(m0, caddr_t), ETHER_HDR_LEN); + } else + m_freem(m); + } + + if (error == 0) + ipstat.ips_fragmented++; + + return (error); + +out: + if (m != NULL) + m_freem(m); + return (error); +} +#endif /* PFIL_HOOKS */ + +static errno_t +bridge_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func bpf_callback) +{ + struct bridge_softc *sc = (struct bridge_softc *)ifnet_softc(ifp); + + //printf("bridge_set_bpf_tap ifp %p mode %d\n", ifp, mode); + + /* TBD locking */ + if (sc == NULL || (sc->sc_flags & SCF_DETACHING)) { + return ENODEV; + } + + switch (mode) { + case BPF_TAP_DISABLE: + sc->sc_bpf_input = sc->sc_bpf_output = NULL; + break; + + case BPF_TAP_INPUT: + sc->sc_bpf_input = bpf_callback; + break; + + case BPF_TAP_OUTPUT: + sc->sc_bpf_output = bpf_callback; + break; + + case BPF_TAP_INPUT_OUTPUT: + sc->sc_bpf_input = sc->sc_bpf_output = bpf_callback; + break; + + default: + break; + } + + return 0; +} + +static void +bridge_detach(ifnet_t ifp) +{ + struct bridge_softc *sc = (struct bridge_softc *)ifnet_softc(ifp); + + bstp_detach(&sc->sc_stp); + + /* Tear down the routing table. 
*/ + bridge_rtable_fini(sc); + + lck_mtx_lock(bridge_list_mtx); + LIST_REMOVE(sc, sc_list); + lck_mtx_unlock(bridge_list_mtx); + + ifnet_release(ifp); + + lck_mtx_free(sc->sc_mtx, bridge_lock_grp); + + _FREE(sc, M_DEVBUF); + return; +} + +__private_extern__ errno_t bridge_bpf_input(ifnet_t ifp, struct mbuf *m) +{ + struct bridge_softc *sc = (struct bridge_softc *)ifnet_softc(ifp); + + if (sc->sc_bpf_input) { + if (mbuf_pkthdr_rcvif(m) != ifp) + printf("bridge_bpf_input rcvif: %p != ifp %p\n", mbuf_pkthdr_rcvif(m), ifp); + (*sc->sc_bpf_input)(ifp, m); + } + return 0; +} + +__private_extern__ errno_t bridge_bpf_output(ifnet_t ifp, struct mbuf *m) +{ + struct bridge_softc *sc = (struct bridge_softc *)ifnet_softc(ifp); + + if (sc->sc_bpf_output) { + (*sc->sc_bpf_output)(ifp, m); + } + return 0; +} diff --git a/bsd/net/if_bridgevar.h b/bsd/net/if_bridgevar.h new file mode 100644 index 000000000..3d1375aed --- /dev/null +++ b/bsd/net/if_bridgevar.h @@ -0,0 +1,499 @@ +/* $NetBSD: if_bridgevar.h,v 1.4 2003/07/08 07:13:50 itojun Exp $ */ +/* + * Copyright (c) 2004-2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Copyright 2001 Wasabi Systems, Inc. + * All rights reserved. + * + * Written by Jason R. Thorpe for Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed for the NetBSD Project by + * Wasabi Systems, Inc. + * 4. The name of Wasabi Systems, Inc. may not be used to endorse + * or promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL WASABI SYSTEMS, INC + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 1999, 2000 Jason L. Wright (jason@thought.net) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Jason L. Wright + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * OpenBSD: if_bridge.h,v 1.14 2001/03/22 03:48:29 jason Exp
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Data structure and control definitions for bridge interfaces.
+ */
+
+#ifndef _NET_IF_BRIDGEVAR_H_
+#define _NET_IF_BRIDGEVAR_H_
+
+#ifdef PRIVATE
+
+#include
+
+#include
+#include
+
+/*
+ * Commands used in the SIOCSDRVSPEC ioctl.  Note the lookup of the
+ * bridge interface itself is keyed off the ifdrv structure.
+ * The structure named in parentheses is the argument each command takes.
+ */
+#define	BRDGADD			0	/* add bridge member (ifbreq) */
+#define	BRDGDEL			1	/* delete bridge member (ifbreq) */
+#define	BRDGGIFFLGS		2	/* get member if flags (ifbreq) */
+#define	BRDGSIFFLGS		3	/* set member if flags (ifbreq) */
+#define	BRDGSCACHE		4	/* set cache size (ifbrparam) */
+#define	BRDGGCACHE		5	/* get cache size (ifbrparam) */
+#define	BRDGGIFS		6	/* get member list (ifbifconf) */
+#define	BRDGRTS			7	/* get address list (ifbaconf) */
+#define	BRDGSADDR		8	/* set static address (ifbareq) */
+#define	BRDGSTO			9	/* set cache timeout (ifbrparam) */
+#define	BRDGGTO			10	/* get cache timeout (ifbrparam) */
+#define	BRDGDADDR		11	/* delete address (ifbareq) */
+#define	BRDGFLUSH		12	/* flush address cache (ifbreq) */
+
+#define	BRDGGPRI		13	/* get priority (ifbrparam) */
+#define	BRDGSPRI		14	/* set priority (ifbrparam) */
+#define	BRDGGHT			15	/* get hello time (ifbrparam) */
+#define	BRDGSHT			16	/* set hello time (ifbrparam) */
+#define	BRDGGFD			17	/* get forward delay (ifbrparam) */
+#define	BRDGSFD			18	/* set forward delay (ifbrparam) */
+#define	BRDGGMA			19	/* get max age (ifbrparam) */
+#define	BRDGSMA			20	/* set max age (ifbrparam) */
+#define	BRDGSIFPRIO		21	/* set if priority (ifbreq) */
+#define	BRDGSIFCOST		22	/* set if path cost (ifbreq) */
+#define	BRDGGFILT		23	/* get filter flags (ifbrparam) */
+#define	BRDGSFILT		24	/* set filter flags (ifbrparam) */
+#define	BRDGPURGE		25	/* purge address cache for a particular interface (ifbreq) */
+#define	BRDGADDS		26	/* add bridge span member (ifbreq) */
+#define	BRDGDELS		27	/* delete bridge span member (ifbreq) */
+#define	BRDGPARAM		28	/* get bridge STP params (ifbropreq) */
+#define	BRDGGRTE		29	/* get cache drops (ifbrparam) */
+#define	BRDGGIFSSTP		30	/* get member STP params list (ifbpstpconf) */
+#define	BRDGSPROTO		31	/* set protocol (ifbrparam) */
+#define	BRDGSTXHC		32	/* set tx hold count (ifbrparam) */
+#define	BRDGSIFAMAX		33	/* set max interface addrs (ifbreq) */
+
+/*
+ * Generic bridge control request.
+ * Declared under #pragma pack(4), so members are packed to at most
+ * 4-byte alignment.
+ */
+#pragma pack(4)
+
+struct ifbreq {
+	char		ifbr_ifsname[IFNAMSIZ];	/* member if name */
+	uint32_t	ifbr_ifsflags;		/* member if flags */
+	uint32_t	ifbr_stpflags;		/* member if STP flags */
+	uint32_t	ifbr_path_cost;		/* member if STP cost */
+	uint8_t		ifbr_portno;		/* member if port number */
+	uint8_t		ifbr_priority;		/* member if STP priority */
+	uint8_t		ifbr_proto;		/* member if STP protocol */
+	uint8_t		ifbr_role;		/* member if STP role */
+	uint8_t		ifbr_state;		/* member if STP state */
+	uint32_t	ifbr_addrcnt;		/* member if addr number */
+	uint32_t	ifbr_addrmax;		/* member if addr max */
+	uint32_t	ifbr_addrexceeded;	/* member if addr violations */
+	uint8_t		pad[32];
+};
+
+#pragma pack()
+
+/* BRDGGIFFLGS, BRDGSIFFLGS */
+#define	IFBIF_LEARNING		0x0001	/* if can learn */
+#define	IFBIF_DISCOVER		0x0002	/* if sends packets w/ unknown dest. */
+#define	IFBIF_STP		0x0004	/* if participates in spanning tree */
+#define	IFBIF_SPAN		0x0008	/* if is a span port */
+#define	IFBIF_STICKY		0x0010	/* if learned addresses stick */
+#define	IFBIF_BSTP_EDGE		0x0020	/* member stp edge port */
+#define	IFBIF_BSTP_AUTOEDGE	0x0040	/* member stp autoedge enabled */
+#define	IFBIF_BSTP_PTP		0x0080	/* member stp point to point */
+#define	IFBIF_BSTP_AUTOPTP	0x0100	/* member stp autoptp enabled */
+#define	IFBIF_BSTP_ADMEDGE	0x0200	/* member stp admin edge enabled */
+#define	IFBIF_BSTP_ADMCOST	0x0400	/* member stp admin path cost */
+#define	IFBIF_PRIVATE		0x0800	/* if is a private segment */
+
+/*
+ * Flag-name table for the IFBIF_* bits.  NOTE(review): presumably in the
+ * printb-style "\020" + <bit number><name> format -- confirm.
+ */
+#define	IFBIFBITS	"\020\001LEARNING\002DISCOVER\003STP\004SPAN" \
+			"\005STICKY\014PRIVATE\006EDGE\007AUTOEDGE\010PTP" \
+			"\011AUTOPTP"
+/* Every IFBIF_* bit except the IFBIF_BSTP_* ones listed here. */
+#define	IFBIFMASK	~(IFBIF_BSTP_EDGE|IFBIF_BSTP_AUTOEDGE|IFBIF_BSTP_PTP| \
+			IFBIF_BSTP_AUTOPTP|IFBIF_BSTP_ADMEDGE| \
+			IFBIF_BSTP_ADMCOST)	/* not saved */
+
+/* BRDGFLUSH */
+#define	IFBF_FLUSHDYN		0x00	/* flush learned addresses only */
+#define	IFBF_FLUSHALL		0x01	/* flush all addresses */
+
+/* BRDGSFILT */
+#define	IFBF_FILT_USEIPF	0x00000001	/* run pfil hooks on the bridge interface */
+#define	IFBF_FILT_MEMBER	0x00000002	/* run pfil hooks on the member interfaces */
+#define	IFBF_FILT_ONLYIP	0x00000004	/* only pass IP[46] packets when pfil is enabled */
+#define	IFBF_FILT_MASK		0x00000007	/* mask of valid values */
+
+
+/* APPLE MODIFICATION : Default is to pass non-IP packets. */
+#define	IFBF_FILT_DEFAULT	( IFBF_FILT_USEIPF | IFBF_FILT_MEMBER )
+#if 0
+#define	IFBF_FILT_DEFAULT	(IFBF_FILT_USEIPF | \
+IFBF_FILT_MEMBER | \
+IFBF_FILT_ONLYIP)
+#endif
+
+/*
+ * Interface list structure.
+ *
+ * Outside XNU_KERNEL_PRIVATE there is a single ifbifconf that holds
+ * native pointers.  Under XNU_KERNEL_PRIVATE the same layout is
+ * declared twice, with user32_addr_t and user64_addr_t members
+ * (NOTE(review): presumably for 32-bit and 64-bit user processes --
+ * confirm).
+ */
+
+#pragma pack(4)
+
+#ifndef XNU_KERNEL_PRIVATE
+
+struct ifbifconf {
+	uint32_t	ifbic_len;	/* buffer size */
+	union {
+		caddr_t	ifbicu_buf;
+		struct ifbreq *ifbicu_req;
+#define	ifbic_buf	ifbic_ifbicu.ifbicu_buf
+#define	ifbic_req	ifbic_ifbicu.ifbicu_req
+	} ifbic_ifbicu;
+};
+
+#else /* XNU_KERNEL_PRIVATE */
+
+struct ifbifconf32 {
+	uint32_t	ifbic_len;	/* buffer size */
+	union {
+		user32_addr_t	ifbicu_buf;
+		user32_addr_t	ifbicu_req;
+#define	ifbic_buf	ifbic_ifbicu.ifbicu_buf
+#define	ifbic_req	ifbic_ifbicu.ifbicu_req
+	} ifbic_ifbicu;
+};
+
+struct ifbifconf64 {
+	uint32_t	ifbic_len;	/* buffer size */
+	union {
+		user64_addr_t	ifbicu_buf;
+		user64_addr_t	ifbicu_req;
+	} ifbic_ifbicu;
+};
+#endif /* XNU_KERNEL_PRIVATE */
+
+#pragma pack()
+
+/*
+ * Bridge address request.
+ *
+ * The three variants differ only in the width of ifba_expire:
+ * unsigned long outside XNU_KERNEL_PRIVATE, uint32_t in ifbareq32
+ * and uint64_t in ifbareq64.
+ */
+
+#pragma pack(4)
+
+#ifndef XNU_KERNEL_PRIVATE
+
+struct ifbareq {
+	char		ifba_ifsname[IFNAMSIZ];	/* member if name */
+	unsigned long	ifba_expire;		/* address expire time */
+	uint8_t		ifba_flags;		/* address flags */
+	uint8_t		ifba_dst[ETHER_ADDR_LEN];/* destination address */
+	uint16_t	ifba_vlan;		/* vlan id */
+};
+
+#else /* XNU_KERNEL_PRIVATE */
+
+struct ifbareq32 {
+	char		ifba_ifsname[IFNAMSIZ];	/* member if name */
+	uint32_t	ifba_expire;		/* address expire time */
+	uint8_t		ifba_flags;		/* address flags */
+	uint8_t		ifba_dst[ETHER_ADDR_LEN];/* destination address */
+	uint16_t	ifba_vlan;		/* vlan id */
+};
+
+struct ifbareq64 {
+	char		ifba_ifsname[IFNAMSIZ];	/* member if name */
+	uint64_t	ifba_expire;		/* address expire time */
+	uint8_t		ifba_flags;		/* address flags */
+	uint8_t		ifba_dst[ETHER_ADDR_LEN];/* destination address */
+	uint16_t	ifba_vlan;		/* vlan id */
+};
+#endif /* XNU_KERNEL_PRIVATE */
+
+#pragma pack()
+
+/* Values for the address flags field (ifba_flags / brt_flags). */
+#define	IFBAF_TYPEMASK	0x03	/* address type mask */
+#define	IFBAF_DYNAMIC	0x00	/* dynamically learned address */
+#define	IFBAF_STATIC	0x01	/* static address */
+#define	IFBAF_STICKY	0x02	/* sticky address */
+
+#define	IFBAFBITS	"\020\1STATIC\2STICKY"
+
+/*
+ * Address list structure. + */ + +#pragma pack(4) + +#ifndef XNU_KERNEL_PRIVATE + +struct ifbaconf { + uint32_t ifbac_len; /* buffer size */ + union { + caddr_t ifbacu_buf; + struct ifbareq *ifbacu_req; +#define ifbac_buf ifbac_ifbacu.ifbacu_buf +#define ifbac_req ifbac_ifbacu.ifbacu_req + } ifbac_ifbacu; +}; + +#else /* XNU_KERNEL_PRIVATE */ + +struct ifbaconf32 { + uint32_t ifbac_len; /* buffer size */ + union { + user32_addr_t ifbacu_buf; + user32_addr_t ifbacu_req; +#define ifbac_buf ifbac_ifbacu.ifbacu_buf +#define ifbac_req ifbac_ifbacu.ifbacu_req + } ifbac_ifbacu; +}; + +struct ifbaconf64 { + uint32_t ifbac_len; /* buffer size */ + union { + user64_addr_t ifbacu_buf; + user64_addr_t ifbacu_req; + } ifbac_ifbacu; +}; +#endif /* XNU_KERNEL_PRIVATE */ + +#pragma pack() + +/* + * Bridge parameter structure. + */ + +#pragma pack(4) + +struct ifbrparam { + union { + uint32_t ifbrpu_int32; + uint16_t ifbrpu_int16; + uint8_t ifbrpu_int8; + } ifbrp_ifbrpu; +}; + +#pragma pack() + +#define ifbrp_csize ifbrp_ifbrpu.ifbrpu_int32 /* cache size */ +#define ifbrp_ctime ifbrp_ifbrpu.ifbrpu_int32 /* cache time (sec) */ +#define ifbrp_prio ifbrp_ifbrpu.ifbrpu_int16 /* bridge priority */ +#define ifbrp_proto ifbrp_ifbrpu.ifbrpu_int8 /* bridge protocol */ +#define ifbrp_txhc ifbrp_ifbrpu.ifbrpu_int8 /* bpdu tx holdcount */ +#define ifbrp_hellotime ifbrp_ifbrpu.ifbrpu_int8 /* hello time (sec) */ +#define ifbrp_fwddelay ifbrp_ifbrpu.ifbrpu_int8 /* fwd time (sec) */ +#define ifbrp_maxage ifbrp_ifbrpu.ifbrpu_int8 /* max age (sec) */ +#define ifbrp_cexceeded ifbrp_ifbrpu.ifbrpu_int32 /* # of cache dropped + * addresses */ +#define ifbrp_filter ifbrp_ifbrpu.ifbrpu_int32 /* filtering flags */ + +/* + * Bridge current operational parameters structure.
+ */ + +#pragma pack(4) + +#ifndef XNU_KERNEL_PRIVATE + +struct ifbropreq { + uint8_t ifbop_holdcount; + uint8_t ifbop_maxage; + uint8_t ifbop_hellotime; + uint8_t ifbop_fwddelay; + uint8_t ifbop_protocol; + uint16_t ifbop_priority; + uint16_t ifbop_root_port; + uint32_t ifbop_root_path_cost; + uint64_t ifbop_bridgeid; + uint64_t ifbop_designated_root; + uint64_t ifbop_designated_bridge; + struct timeval ifbop_last_tc_time; +}; + +#else /* XNU_KERNEL_PRIVATE */ + +struct ifbropreq32 { + uint8_t ifbop_holdcount; + uint8_t ifbop_maxage; + uint8_t ifbop_hellotime; + uint8_t ifbop_fwddelay; + uint8_t ifbop_protocol; + uint16_t ifbop_priority; + uint16_t ifbop_root_port; + uint32_t ifbop_root_path_cost; + uint64_t ifbop_bridgeid; + uint64_t ifbop_designated_root; + uint64_t ifbop_designated_bridge; + struct timeval ifbop_last_tc_time; +}; + +struct ifbropreq64 { + uint8_t ifbop_holdcount; + uint8_t ifbop_maxage; + uint8_t ifbop_hellotime; + uint8_t ifbop_fwddelay; + uint8_t ifbop_protocol; + uint16_t ifbop_priority; + uint16_t ifbop_root_port; + uint32_t ifbop_root_path_cost; + uint64_t ifbop_bridgeid; + uint64_t ifbop_designated_root; + uint64_t ifbop_designated_bridge; + struct timeval ifbop_last_tc_time; +}; + +#endif + +#pragma pack() + +/* + * Bridge member operational STP params structure. + */ + +#pragma pack(4) + +struct ifbpstpreq { + uint8_t ifbp_portno; /* bp STP port number */ + uint32_t ifbp_fwd_trans; /* bp STP fwd transitions */ + uint32_t ifbp_design_cost; /* bp STP designated cost */ + uint32_t ifbp_design_port; /* bp STP designated port */ + uint64_t ifbp_design_bridge; /* bp STP designated bridge */ + uint64_t ifbp_design_root; /* bp STP designated root */ +}; + +#pragma pack() + +/* + * Bridge STP ports list structure. 
+ */ + +#pragma pack(4) + +#ifndef XNU_KERNEL_PRIVATE + +struct ifbpstpconf { + uint32_t ifbpstp_len; /* buffer size */ + union { + caddr_t ifbpstpu_buf; + struct ifbpstpreq *ifbpstpu_req; + } ifbpstp_ifbpstpu; +#define ifbpstp_buf ifbpstp_ifbpstpu.ifbpstpu_buf +#define ifbpstp_req ifbpstp_ifbpstpu.ifbpstpu_req +}; + +#else /* XNU_KERNEL_PRIVATE */ + +struct ifbpstpconf32 { + uint32_t ifbpstp_len; /* buffer size */ + union { + user32_addr_t ifbpstpu_buf; + user32_addr_t ifbpstpu_req; +#define ifbpstp_buf ifbpstp_ifbpstpu.ifbpstpu_buf +#define ifbpstp_req ifbpstp_ifbpstpu.ifbpstpu_req + } ifbpstp_ifbpstpu; +}; + +struct ifbpstpconf64 { + uint32_t ifbpstp_len; /* buffer size */ + union { + user64_addr_t ifbpstpu_buf; + user64_addr_t ifbpstpu_req; + } ifbpstp_ifbpstpu; +}; + +#endif /* XNU_KERNEL_PRIVATE */ + +#pragma pack() + + +#ifdef XNU_KERNEL_PRIVATE + +int bridgeattach(int); + +#endif /* XNU_KERNEL_PRIVATE */ +#endif /* PRIVATE */ +#endif /* !_NET_IF_BRIDGEVAR_H_ */ diff --git a/bsd/net/if_disc.c b/bsd/net/if_disc.c deleted file mode 100644 index 229e281f6..000000000 --- a/bsd/net/if_disc.c +++ /dev/null @@ -1,240 +0,0 @@ -/* - * Copyright (c) 2000-2008 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1982, 1986, 1993 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * From: @(#)if_loop.c 8.1 (Berkeley) 6/10/93 - * $FreeBSD: src/sys/net/if_disc.c,v 1.26.2.1 2001/03/06 00:32:09 obrien Exp $ - */ - -/* - * Discard interface driver for protocol testing and timing. - * (Based on the loopback.) - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#ifdef TINY_DSMTU -#define DSMTU (1024+512) -#else -#define DSMTU 65532 -#endif - -static void discattach(void); - -static struct ifnet discif; -static int discoutput(struct ifnet *, struct mbuf *, struct sockaddr *, - struct rtentry *); -static void discrtrequest(int cmd, struct rtentry *rt, struct sockaddr *sa); -static int discioctl(struct ifnet *, u_long, caddr_t); - -/* ARGSUSED */ -static void -discattach() -{ - register struct ifnet *ifp = &discif; - - ifp->if_name = "ds"; - ifp->if_family = APPLE_IF_FAM_DISC; - ifp->if_mtu = DSMTU; - ifp->if_flags = IFF_LOOPBACK | IFF_MULTICAST; - ifp->if_ioctl = discioctl; - ifp->if_output = discoutput; - ifp->if_type = IFT_LOOP; - ifp->if_hdrlen = 0; - ifp->if_addrlen = 0; - if_attach(ifp); - bpfattach(ifp, DLT_NULL, sizeof(u_int)); -} - -#ifndef __APPLE__ -static int -disc_modevent(module_t mod, int type, void *data) -{ - switch (type) { - case MOD_LOAD: - discattach(); - break; - case MOD_UNLOAD: - printf("if_disc module unload - not possible for this module type\n"); - return EINVAL; - } - return 0; -} - -static moduledata_t disc_mod = { - "if_disc", - 
disc_modevent, - NULL -}; - -DECLARE_MODULE(if_disc, disc_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); -#endif - -static int -discoutput(ifp, m, dst, rt) - struct ifnet *ifp; - register struct mbuf *m; - struct sockaddr *dst; - register struct rtentry *rt; -{ - if ((m->m_flags & M_PKTHDR) == 0) - panic("discoutput no HDR"); - /* BPF write needs to be handled specially */ - if (dst->sa_family == AF_UNSPEC) { - dst->sa_family = *(mtod(m, int *)); - m->m_len -= sizeof(int); - m->m_pkthdr.len -= sizeof(int); - m->m_data += sizeof(int); - } - - if (discif.if_bpf) { - /* We need to prepend the address family as a four byte field. */ - u_int af = dst->sa_family; - - bpf_tap_out(ifp, 0, m, &af, sizeof(af)); - } - m->m_pkthdr.rcvif = ifp; - - ifp->if_opackets++; - ifp->if_obytes += m->m_pkthdr.len; - - m_freem(m); - return 0; -} - -/* ARGSUSED */ -static void -discrtrequest(cmd, rt, sa) - int cmd; - struct rtentry *rt; - struct sockaddr *sa; -{ - if (rt != NULL) { - RT_LOCK_ASSERT_HELD(rt); - rt->rt_rmx.rmx_mtu = DSMTU; - } -} - -/* - * Process an ioctl request. - */ -/* ARGSUSED */ -static int -discioctl(ifp, cmd, data) - register struct ifnet *ifp; - u_long cmd; - caddr_t data; -{ - register struct ifaddr *ifa; - register struct ifreq *ifr = (struct ifreq *)data; - register int error = 0; - - switch (cmd) { - - case SIOCSIFADDR: - ifnet_set_flags(ifp, IFF_UP, IFF_UP); - ifa = (struct ifaddr *)data; - if (ifa != 0) - ifa->ifa_rtrequest = discrtrequest; - /* - * Everything else is done at a higher level. 
- */ - break; - - case SIOCADDMULTI: - case SIOCDELMULTI: - if (ifr == 0) { - error = EAFNOSUPPORT; /* XXX */ - break; - } - switch (ifr->ifr_addr.sa_family) { - -#if INET - case AF_INET: - break; -#endif -#if INET6 - case AF_INET6: - break; -#endif - - default: - error = EAFNOSUPPORT; - break; - } - break; - - case SIOCSIFMTU: - ifp->if_mtu = ifr->ifr_mtu; - break; - - default: - error = EINVAL; - } - return (error); -} diff --git a/bsd/net/if_dummy.c b/bsd/net/if_dummy.c deleted file mode 100644 index 68dac9c9d..000000000 --- a/bsd/net/if_dummy.c +++ /dev/null @@ -1,290 +0,0 @@ -/* - * Copyright (c) 2000-2008 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1982, 1986, 1993 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ -/* - * derived from - * @(#)if_loop.c 8.1 (Berkeley) 6/10/93 - * Id: if_loop.c,v 1.22 1996/06/19 16:24:10 wollman Exp - */ - -/* - * Loopback interface driver for protocol testing and timing. - */ -#if BSD310 -#include "opt_inet.h" -#endif -#include "dummy.h" -#if NDUMMY > 0 - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#if INET -#include -#include -#include -#include -#endif - -#if INET6 -#if !INET -#include -#endif -#include -#include -#endif - -#if NETATALK -#include -#include -#include -#endif /* NETATALK */ - -#include "bpfilter.h" - -static int dummyioctl(struct ifnet *, u_long, caddr_t); -int dummyoutput(struct ifnet *, register struct mbuf *, struct sockaddr *, - register struct rtentry *); -static void dummyrtrequest(int, struct rtentry *, struct sockaddr *); - -static void dummyattach(void *); -PSEUDO_SET(dummyattach, if_dummy); - -#if TINY_DUMMYMTU -#define DUMMYMTU (1024+512) -#else -#define DUMMYMTU 16384 -#endif -#define HAVE_OLD_BPF 1 - -static struct ifnet dummyif[NDUMMY]; - -/* ARGSUSED */ -static void -dummyattach(dummy) - void *dummy; -{ - register struct ifnet *ifp; - register int i = 0; - - for (i = 0; i < NDUMMY; i++) { - ifp = &dummyif[i]; -#if defined(__NetBSD__) || defined(__OpenBSD__) - sprintf(ifp->if_xname, "dummy%d", i); -#else - ifp->if_name = "dummy"; - ifp->if_unit = i; -#endif -#ifndef __bsdi__ - ifp->if_softc = NULL; -#endif - ifp->if_mtu = DUMMYMTU; - /* Change to BROADCAST experimentaly to announce its prefix. 
*/ - ifp->if_flags = /* IFF_LOOPBACK */ IFF_BROADCAST | IFF_MULTICAST; - ifp->if_ioctl = dummyioctl; - ifp->if_output = dummyoutput; - ifp->if_type = IFT_DUMMY; - ifp->if_hdrlen = 0; - ifp->if_addrlen = 0; - if_attach(ifp); -#if NBPFILTER > 0 -#ifdef HAVE_OLD_BPF - bpfattach(ifp, DLT_NULL, sizeof(u_int)); -#else - bpfattach(&ifp->if_bpf, ifp, DLT_NULL, sizeof(u_int)); -#endif -#endif - } -} - -int -dummyoutput(ifp, m, dst, rt) - struct ifnet *ifp; - register struct mbuf *m; - struct sockaddr *dst; - register struct rtentry *rt; -{ - if ((m->m_flags & M_PKTHDR) == 0) - panic("dummyoutput no HDR"); -#if NBPFILTER > 0 - /* BPF write needs to be handled specially */ - if (dst->sa_family == AF_UNSPEC) { - dst->sa_family = *(mtod(m, int *)); - m->m_len -= sizeof(int); - m->m_pkthdr.len -= sizeof(int); - m->m_data += sizeof(int); - } - - if (ifp->if_bpf) { - /* We need to prepend the address family as a four byte field. */ - u_int af = dst->sa_family; - - bpf_tap_out(ifp, 0, m, &af, sizeof(af)); - } -#endif - m->m_pkthdr.rcvif = ifp; - - if (rt != NULL) { - u_int32_t rt_flags = rt->rt_flags; - if (rt_flags & (RTF_REJECT | RTF_BLACKHOLE)) { - m_freem(m); - return ((rt_flags & RTF_BLACKHOLE) ? 0 : - (rt_flags & RTF_HOST) ? EHOSTUNREACH : ENETUNREACH); - } - } - ifp->if_opackets++; - ifp->if_obytes += m->m_pkthdr.len; - proto_inject(dst->sa_family, m); - ifp->if_ipackets++; - ifp->if_ibytes += m->m_pkthdr.len; - return (0); -} - -/* ARGSUSED */ -static void -dummyrtrequest(cmd, rt, sa) - int cmd; - struct rtentry *rt; - struct sockaddr *sa; -{ - if (rt != NULL) { - RT_LOCK_ASSERT_HELD(rt); - rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; /* for ISO */ - /* - * For optimal performance, the send and receive buffers - * should be at least twice the MTU plus a little more for - * overhead. - */ - rt->rt_rmx.rmx_recvpipe = - rt->rt_rmx.rmx_sendpipe = 3 * DUMMYMTU; - } -} - -/* - * Process an ioctl request. 
- */ -/* ARGSUSED */ -static int -dummyioctl(ifp, cmd, data) - register struct ifnet *ifp; - u_long cmd; - caddr_t data; -{ - register struct ifaddr *ifa; - register struct ifreq *ifr = (struct ifreq *)data; - register int error = 0; - - switch (cmd) { - - case SIOCSIFADDR: - ifnet_set_flags(ifp, IFF_UP | IFF_RUNNING, IFF_UP | IFF_RUNNING); - ifa = (struct ifaddr *)data; - ifa->ifa_rtrequest = dummyrtrequest; - /* - * Everything else is done at a higher level. - */ - break; - - case SIOCADDMULTI: - case SIOCDELMULTI: - if (ifr == 0) { - error = EAFNOSUPPORT; /* XXX */ - break; - } - switch (ifr->ifr_addr.sa_family) { - -#if INET - case AF_INET: - break; -#endif -#if INET6 - case AF_INET6: - break; -#endif - - default: - error = EAFNOSUPPORT; - break; - } - break; - - case SIOCSIFMTU: - ifp->if_mtu = ifr->ifr_mtu; - break; - - case SIOCSIFFLAGS: - break; - - default: - error = EINVAL; - } - return (error); -} -#endif /* NDUMMY > 0 */ diff --git a/bsd/net/if_ethersubr.c b/bsd/net/if_ethersubr.c deleted file mode 100644 index e407e009f..000000000 --- a/bsd/net/if_ethersubr.c +++ /dev/null @@ -1,229 +0,0 @@ -/* - * Copyright (c) 2000, 2009 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1982, 1989, 1993 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)if_ethersubr.c 8.1 (Berkeley) 6/10/93 - * $FreeBSD: src/sys/net/if_ethersubr.c,v 1.70.2.17 2001/08/01 00:47:49 fenner Exp $ - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#if INET || INET6 -#include -#include -#include -#include -#include -#endif - -#if IPX -#include -#include -#endif - -#include - -#if LLC && CCITT -extern struct ifqueue pkintrq; -#endif - -/* #include "vlan.h" */ -#if NVLAN > 0 -#include -#endif /* NVLAN > 0 */ - -extern u_char etherbroadcastaddr[]; -#define senderr(e) do { error = (e); goto bad;} while (0) - -/* - * Perform common duties while attaching to interface list - */ - -int -ether_resolvemulti( - struct ifnet *ifp, - struct sockaddr **llsa, - struct sockaddr *sa) -{ - struct sockaddr_dl *sdl; - struct sockaddr_in *sin; - u_char *e_addr; -#if INET6 - struct sockaddr_in6 *sin6; -#endif - - - switch(sa->sa_family) { - case AF_UNSPEC: - /* AppleTalk uses AF_UNSPEC for multicast registration. - * No mapping needed. Just check that it's a valid MC address. - */ - e_addr = &sa->sa_data[0]; - if ((e_addr[0] & 1) != 1) - return EADDRNOTAVAIL; - *llsa = 0; - return 0; - - case AF_LINK: - /* - * No mapping needed. Just check that it's a valid MC address. 
- */ - sdl = (struct sockaddr_dl *)sa; - e_addr = LLADDR(sdl); - if ((e_addr[0] & 1) != 1) - return EADDRNOTAVAIL; - *llsa = 0; - return 0; - -#if INET - case AF_INET: - sin = (struct sockaddr_in *)sa; - if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) - return EADDRNOTAVAIL; - MALLOC(sdl, struct sockaddr_dl *, sizeof *sdl, M_IFMADDR, - M_WAITOK); - if (sdl == NULL) - return ENOBUFS; - sdl->sdl_len = sizeof *sdl; - sdl->sdl_family = AF_LINK; - sdl->sdl_index = ifp->if_index; - sdl->sdl_type = IFT_ETHER; - sdl->sdl_nlen = 0; - sdl->sdl_alen = ETHER_ADDR_LEN; - sdl->sdl_slen = 0; - e_addr = LLADDR(sdl); - ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr); - *llsa = (struct sockaddr *)sdl; - return 0; -#endif -#if INET6 - case AF_INET6: - sin6 = (struct sockaddr_in6 *)sa; - if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { - /* - * An IP6 address of 0 means listen to all - * of the Ethernet multicast address used for IP6. - * (This is used for multicast routers.) - */ - ifp->if_flags |= IFF_ALLMULTI; - *llsa = 0; - return 0; - } - MALLOC(sdl, struct sockaddr_dl *, sizeof *sdl, M_IFMADDR, - M_WAITOK); - if (sdl == NULL) - return ENOBUFS; - sdl->sdl_len = sizeof *sdl; - sdl->sdl_family = AF_LINK; - sdl->sdl_index = ifp->if_index; - sdl->sdl_type = IFT_ETHER; - sdl->sdl_nlen = 0; - sdl->sdl_alen = ETHER_ADDR_LEN; - sdl->sdl_slen = 0; - e_addr = LLADDR(sdl); - ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr); -#if 0 - printf("ether_resolvemulti Adding %x:%x:%x:%x:%x:%x\n", - e_addr[0], e_addr[1], e_addr[2], e_addr[3], e_addr[4], e_addr[5]); -#endif - *llsa = (struct sockaddr *)sdl; - return 0; -#endif - - default: - /* - * Well, the text isn't quite right, but it's the name - * that counts... - */ - return EAFNOSUPPORT; - } -} - - -/* - * Convert Ethernet address to printable (loggable) representation. 
- */ -static u_char digits[] = "0123456789abcdef"; -char * -ether_sprintf(p, ap) - register u_char *p; - register u_char *ap; -{ register char *cp; - register i; - - for (cp = p, i = 0; i < 6; i++) { - *cp++ = digits[*ap >> 4]; - *cp++ = digits[*ap++ & 0xf]; - *cp++ = ':'; - } - *--cp = 0; - return (p); -} diff --git a/bsd/net/if_fddisubr.c b/bsd/net/if_fddisubr.c deleted file mode 100644 index 1de331796..000000000 --- a/bsd/net/if_fddisubr.c +++ /dev/null @@ -1,637 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1995, 1996 - * Matt Thomas . All rights reserved. - * Copyright (c) 1982, 1989, 1993 - * The Regents of the University of California. 
All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * from: if_ethersubr.c,v 1.5 1994/12/13 22:31:45 wollman Exp - */ - -#include "opt_atalk.h" -#include "opt_inet.h" -#include "opt_ipx.h" - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#if INET -#include -#include -#include -#endif -#if defined(__FreeBSD__) -#include -#else -#include -#endif - -#if IPX -#include -#include -#endif - -#if DECNET -#include -#endif - -#include "bpfilter.h" - -#define senderr(e) { error = (e); goto bad;} - -/* - * This really should be defined in if_llc.h but in case it isn't. - */ -#ifndef llc_snap -#define llc_snap llc_un.type_snap -#endif - -#if defined(__bsdi__) || defined(__NetBSD__) -#define RTALLOC1(a, b) rtalloc1(a, b) -#define ARPRESOLVE(a, b, c, d, e, f) arpresolve(a, b, c, d, e) -#elif defined(__FreeBSD__) -#define RTALLOC1(a, b) rtalloc1(a, b, 0UL) -#define ARPRESOLVE(a, b, c, d, e, f) arpresolve(a, b, c, d, e, f) -#endif -/* - * FDDI output routine. - * Encapsulate a packet of type family for the local net. - * Use trailer local net encapsulation if enough data in first - * packet leaves a multiple of 512 bytes of data in remainder. - * Assumes that ifp is actually pointer to arpcom structure. 
- */ -int -fddi_output(ifp, m0, dst, rt0) - register struct ifnet *ifp; - struct mbuf *m0; - struct sockaddr *dst; - struct rtentry *rt0; -{ - u_int16_t type; - int s, loop_copy = 0, error = 0; - u_char edst[6]; - register struct mbuf *m = m0; - register struct rtentry *rt; - register struct fddi_header *fh; - struct arpcom *ac = (struct arpcom *)ifp; - - if ((ifp->if_flags & (IFF_UP|IFF_RUNNING)) != (IFF_UP|IFF_RUNNING)) - senderr(ENETDOWN); - getmicrotime(&ifp->if_lastchange); -#if !defined(__bsdi__) || _BSDI_VERSION >= 199401 - if (rt = rt0) { - if ((rt->rt_flags & RTF_UP) == 0) { - if (rt0 = rt = RTALLOC1(dst, 1)) - rtunref(rt); - else - senderr(EHOSTUNREACH); - } - if (rt->rt_flags & RTF_GATEWAY) { - if (rt->rt_gwroute == 0) - goto lookup; - if (((rt = rt->rt_gwroute)->rt_flags & RTF_UP) == 0) { - rtfree(rt); rt = rt0; - lookup: rt->rt_gwroute = RTALLOC1(rt->rt_gateway, 1); - if ((rt = rt->rt_gwroute) == 0) - senderr(EHOSTUNREACH); - } - } - if (rt->rt_flags & RTF_REJECT) - if (rt->rt_rmx.rmx_expire == 0 || - time_second < rt->rt_rmx.rmx_expire) - senderr(rt == rt0 ? 
EHOSTDOWN : EHOSTUNREACH); - } -#endif - switch (dst->sa_family) { - -#if INET - case AF_INET: { -#if !defined(__bsdi__) || _BSDI_VERSION >= 199401 - if (!ARPRESOLVE(ac, rt, m, dst, edst, rt0)) - return (0); /* if not yet resolved */ -#else - int usetrailers; - if (!arpresolve(ac, m, &((struct sockaddr_in *)dst)->sin_addr, edst, &usetrailers)) - return (0); /* if not yet resolved */ -#endif - type = htons(ETHERTYPE_IP); - break; - } -#endif -#if IPX - case AF_IPX: - type = htons(ETHERTYPE_IPX); - bcopy((caddr_t)&(((struct sockaddr_ipx *)dst)->sipx_addr.x_host), - (caddr_t)edst, sizeof (edst)); - break; -#endif - -#if NS - case AF_NS: - type = htons(ETHERTYPE_NS); - bcopy((caddr_t)&(((struct sockaddr_ns *)dst)->sns_addr.x_host), - (caddr_t)edst, sizeof (edst)); - break; -#endif -#if ISO - case AF_ISO: { - int snpalen; - struct llc *l; - register struct sockaddr_dl *sdl; - - if (rt && (sdl = (struct sockaddr_dl *)rt->rt_gateway) && - sdl->sdl_family == AF_LINK && sdl->sdl_alen > 0) { - bcopy(LLADDR(sdl), (caddr_t)edst, sizeof(edst)); - } else if (error = - iso_snparesolve(ifp, (struct sockaddr_iso *)dst, - (char *)edst, &snpalen)) - goto bad; /* Not Resolved */ - /* If broadcasting on a simplex interface, loopback a copy */ - if (*edst & 1) - m->m_flags |= (M_BCAST|M_MCAST); - M_PREPEND(m, 3, M_DONTWAIT); - if (m == NULL) - return (0); - type = 0; - l = mtod(m, struct llc *); - l->llc_dsap = l->llc_ssap = LLC_ISO_LSAP; - l->llc_control = LLC_UI; - IFDEBUG(D_ETHER) - int i; - printf("unoutput: sending pkt to: "); - for (i=0; i<6; i++) - printf("%x ", edst[i] & 0xff); - printf("\n"); - ENDDEBUG - } break; -#endif /* ISO */ -#if LLC -/* case AF_NSAP: */ - case AF_CCITT: { - register struct sockaddr_dl *sdl = - (struct sockaddr_dl *) rt -> rt_gateway; - - if (sdl && sdl->sdl_family != AF_LINK && sdl->sdl_alen <= 0) - goto bad; /* Not a link interface ? Funny ... 
*/ - bcopy(LLADDR(sdl), (char *)edst, sizeof(edst)); - if (*edst & 1) - loop_copy = 1; - type = 0; -#if LLC_DEBUG - { - int i; - register struct llc *l = mtod(m, struct llc *); - - printf("fddi_output: sending LLC2 pkt to: "); - for (i=0; i<6; i++) - printf("%x ", edst[i] & 0xff); - printf(" len 0x%x dsap 0x%x ssap 0x%x control 0x%x\n", - type & 0xff, l->llc_dsap & 0xff, l->llc_ssap &0xff, - l->llc_control & 0xff); - - } -#endif /* LLC_DEBUG */ - } break; -#endif /* LLC */ - - case AF_UNSPEC: - { - struct ether_header *eh; - loop_copy = -1; - eh = (struct ether_header *)dst->sa_data; - (void)memcpy((caddr_t)edst, (caddr_t)eh->ether_dhost, sizeof (edst)); - if (*edst & 1) - m->m_flags |= (M_BCAST|M_MCAST); - type = eh->ether_type; - break; - } - -#if NBPFILTER > 0 - case AF_IMPLINK: - { - fh = mtod(m, struct fddi_header *); - error = EPROTONOSUPPORT; - switch (fh->fddi_fc & (FDDIFC_C|FDDIFC_L|FDDIFC_F)) { - case FDDIFC_LLC_ASYNC: { - /* legal priorities are 0 through 7 */ - if ((fh->fddi_fc & FDDIFC_Z) > 7) - goto bad; - break; - } - case FDDIFC_LLC_SYNC: { - /* FDDIFC_Z bits reserved, must be zero */ - if (fh->fddi_fc & FDDIFC_Z) - goto bad; - break; - } - case FDDIFC_SMT: { - /* FDDIFC_Z bits must be non zero */ - if ((fh->fddi_fc & FDDIFC_Z) == 0) - goto bad; - break; - } - default: { - /* anything else is too dangerous */ - goto bad; - } - } - error = 0; - if (fh->fddi_dhost[0] & 1) - m->m_flags |= (M_BCAST|M_MCAST); - goto queue_it; - } -#endif - default: - printf("%s%d: can't handle af%d\n", ifp->if_name, ifp->if_unit, - dst->sa_family); - senderr(EAFNOSUPPORT); - } - - if (type != 0) { - register struct llc *l; - M_PREPEND(m, sizeof (struct llc), M_DONTWAIT); - if (m == 0) - senderr(ENOBUFS); - l = mtod(m, struct llc *); - l->llc_control = LLC_UI; - l->llc_dsap = l->llc_ssap = LLC_SNAP_LSAP; - l->llc_snap.org_code[0] = l->llc_snap.org_code[1] = l->llc_snap.org_code[2] = 0; - (void)memcpy((caddr_t) &l->llc_snap.ether_type, (caddr_t) &type, - 
sizeof(u_int16_t)); - } - - /* - * Add local net header. If no space in first mbuf, - * allocate another. - */ - M_PREPEND(m, sizeof (struct fddi_header), M_DONTWAIT); - if (m == 0) - senderr(ENOBUFS); - fh = mtod(m, struct fddi_header *); - fh->fddi_fc = FDDIFC_LLC_ASYNC|FDDIFC_LLC_PRIO4; - (void)memcpy((caddr_t)fh->fddi_dhost, (caddr_t)edst, sizeof (edst)); - queue_it: - (void)memcpy((caddr_t)fh->fddi_shost, (caddr_t)ac->ac_enaddr, - sizeof(fh->fddi_shost)); - - /* - * If a simplex interface, and the packet is being sent to our - * Ethernet address or a broadcast address, loopback a copy. - * XXX To make a simplex device behave exactly like a duplex - * device, we should copy in the case of sending to our own - * ethernet address (thus letting the original actually appear - * on the wire). However, we don't do that here for security - * reasons and compatibility with the original behavior. - */ - if ((ifp->if_flags & IFF_SIMPLEX) && - (loop_copy != -1)) { - if ((m->m_flags & M_BCAST) || loop_copy) { - struct mbuf *n = m_copy(m, 0, (int)M_COPYALL); - - (void) if_simloop(ifp, - n, dst, sizeof(struct fddi_header)); - } else if (bcmp(fh->fddi_dhost, - fh->fddi_shost, sizeof(fh->fddi_shost)) == 0) { - (void) if_simloop(ifp, - m, dst, sizeof(struct fddi_header)); - return(0); /* XXX */ - } - } - - s = splimp(); - /* - * Queue message on interface, and start output if interface - * not yet active. - */ - if (IF_QFULL(&ifp->if_snd)) { - IF_DROP(&ifp->if_snd); - splx(s); - senderr(ENOBUFS); - } - ifp->if_obytes += m->m_pkthdr.len; - IF_ENQUEUE(&ifp->if_snd, m); - if ((ifp->if_flags & IFF_OACTIVE) == 0) - (*ifp->if_start)(ifp); - splx(s); - if (m->m_flags & M_MCAST) - ifp->if_omcasts++; - return (error); - -bad: - if (m) - m_freem(m); - return (error); -} - -/* - * Process a received FDDI packet; - * the packet is in the mbuf chain m without - * the fddi header, which is provided separately. 
- */ -void -fddi_input(ifp, fh, m) - struct ifnet *ifp; - register struct fddi_header *fh; - struct mbuf *m; -{ - register struct ifqueue *inq; - register struct llc *l; - int s; - - if ((ifp->if_flags & IFF_UP) == 0) { - m_freem(m); - return; - } - getmicrotime(&ifp->if_lastchange); - ifp->if_ibytes += m->m_pkthdr.len + sizeof (*fh); - if (fh->fddi_dhost[0] & 1) { - if (bcmp((caddr_t)fddibroadcastaddr, (caddr_t)fh->fddi_dhost, - sizeof(fddibroadcastaddr)) == 0) - m->m_flags |= M_BCAST; - else - m->m_flags |= M_MCAST; - ifp->if_imcasts++; - } else if ((ifp->if_flags & IFF_PROMISC) - && bcmp(((struct arpcom *)ifp)->ac_enaddr, (caddr_t)fh->fddi_dhost, - sizeof(fh->fddi_dhost)) != 0) { - m_freem(m); - return; - } - -#ifdef M_LINK0 - /* - * If this has a LLC priority of 0, then mark it so upper - * layers have a hint that it really came via a FDDI/Ethernet - * bridge. - */ - if ((fh->fddi_fc & FDDIFC_LLC_PRIO7) == FDDIFC_LLC_PRIO0) - m->m_flags |= M_LINK0; -#endif - - l = mtod(m, struct llc *); - switch (l->llc_dsap) { -#if defined(INET) || NS || IPX || defined(NETATALK) - case LLC_SNAP_LSAP: - { - u_int16_t type; - if (l->llc_control != LLC_UI || l->llc_ssap != LLC_SNAP_LSAP) - goto dropanyway; - - if (l->llc_snap.org_code[0] != 0 || l->llc_snap.org_code[1] != 0|| l->llc_snap.org_code[2] != 0) - goto dropanyway; - type = ntohs(l->llc_snap.ether_type); - m_adj(m, 8); - switch (type) { -#if INET - case ETHERTYPE_IP: - if (ipflow_fastforward(m)) - return; - schednetisr(NETISR_IP); - inq = &ipintrq; - break; - - case ETHERTYPE_ARP: -#if !defined(__bsdi__) || _BSDI_VERSION >= 199401 - schednetisr(NETISR_ARP); - inq = &arpintrq; - break; -#else - arpinput((struct arpcom *)ifp, m); - return; -#endif -#endif -#if IPX - case ETHERTYPE_IPX: - schednetisr(NETISR_IPX); - inq = &ipxintrq; - break; -#endif -#if NS - case ETHERTYPE_NS: - schednetisr(NETISR_NS); - inq = &nsintrq; - break; -#endif -#if DECNET - case ETHERTYPE_DECNET: - schednetisr(NETISR_DECNET); - inq = &decnetintrq; 
- break; -#endif - - default: - /* printf("fddi_input: unknown protocol 0x%x\n", type); */ - ifp->if_noproto++; - goto dropanyway; - } - break; - } -#endif /* INET || NS */ -#if ISO - case LLC_ISO_LSAP: - switch (l->llc_control) { - case LLC_UI: - /* LLC_UI_P forbidden in class 1 service */ - if ((l->llc_dsap == LLC_ISO_LSAP) && - (l->llc_ssap == LLC_ISO_LSAP)) { - /* LSAP for ISO */ - m->m_data += 3; /* XXX */ - m->m_len -= 3; /* XXX */ - m->m_pkthdr.len -= 3; /* XXX */ - M_PREPEND(m, sizeof *fh, M_DONTWAIT); - if (m == 0) - return; - *mtod(m, struct fddi_header *) = *fh; - IFDEBUG(D_ETHER) - printf("clnp packet"); - ENDDEBUG - schednetisr(NETISR_ISO); - inq = &clnlintrq; - break; - } - goto dropanyway; - - case LLC_XID: - case LLC_XID_P: - if(m->m_len < 6) - goto dropanyway; - l->llc_window = 0; - l->llc_fid = 9; - l->llc_class = 1; - l->llc_dsap = l->llc_ssap = 0; - /* Fall through to */ - case LLC_TEST: - case LLC_TEST_P: - { - struct sockaddr sa; - register struct ether_header *eh; - struct arpcom *ac = (struct arpcom *) ifp; - int i; - u_char c = l->llc_dsap; - - l->llc_dsap = l->llc_ssap; - l->llc_ssap = c; - if (m->m_flags & (M_BCAST | M_MCAST)) - bcopy((caddr_t)ac->ac_enaddr, - (caddr_t)eh->ether_dhost, 6); - sa.sa_family = AF_UNSPEC; - sa.sa_len = sizeof(sa); - eh = (struct ether_header *)sa.sa_data; - for (i = 0; i < 6; i++) { - eh->ether_shost[i] = fh->fddi_dhost[i]; - eh->ether_dhost[i] = fh->fddi_shost[i]; - } - eh->ether_type = 0; - ifp->if_output(ifp, m, &sa, NULL); - return; - } - default: - m_freem(m); - return; - } - break; -#endif /* ISO */ -#if LLC - case LLC_X25_LSAP: - { - M_PREPEND(m, sizeof(struct sdl_hdr) , M_DONTWAIT); - if (m == 0) - return; - if ( !sdl_sethdrif(ifp, fh->fddi_shost, LLC_X25_LSAP, - fh->fddi_dhost, LLC_X25_LSAP, 6, - mtod(m, struct sdl_hdr *))) - panic("ETHER cons addr failure"); - mtod(m, struct sdl_hdr *)->sdlhdr_len = m->m_pkthdr.len - sizeof(struct sdl_hdr); -#if LLC_DEBUG - printf("llc packet\n"); -#endif /* 
LLC_DEBUG */ - schednetisr(NETISR_CCITT); - inq = &llcintrq; - break; - } -#endif /* LLC */ - - default: - /* printf("fddi_input: unknown dsap 0x%x\n", l->llc_dsap); */ - ifp->if_noproto++; - dropanyway: - m_freem(m); - return; - } - - s = splimp(); - if (IF_QFULL(inq)) { - IF_DROP(inq); - m_freem(m); - } else - IF_ENQUEUE(inq, m); - splx(s); -} -/* - * Perform common duties while attaching to interface list - */ -#ifdef __NetBSD__ -#define ifa_next ifa_list.tqe_next -#endif - -void -fddi_ifattach(ifp) - register struct ifnet *ifp; -{ - register struct ifaddr *ifa; - register struct sockaddr_dl *sdl; - - ifp->if_type = IFT_FDDI; - ifp->if_addrlen = 6; - ifp->if_hdrlen = 21; - ifp->if_mtu = FDDIMTU; - ifp->if_baudrate = 100000000; -#if IFF_NOTRAILERS - ifp->if_flags |= IFF_NOTRAILERS; -#endif -#if defined(__FreeBSD__) - ifa = ifnet_addrs[ifp->if_index - 1]; - sdl = (struct sockaddr_dl *)ifa->ifa_addr; - sdl->sdl_type = IFT_FDDI; - sdl->sdl_alen = ifp->if_addrlen; - bcopy(((struct arpcom *)ifp)->ac_enaddr, LLADDR(sdl), ifp->if_addrlen); -#elif defined(__NetBSD__) - LIST_INIT(&((struct arpcom *)ifp)->ac_multiaddrs); - for (ifa = ifp->if_addrlist.tqh_first; ifa != NULL; ifa = ifa->ifa_list.tqe_next) -#else - for (ifa = ifp->if_addrlist; ifa != NULL; ifa = ifa->ifa_next) -#endif -#if !defined(__FreeBSD__) - if ((sdl = (struct sockaddr_dl *)ifa->ifa_addr) && - sdl->sdl_family == AF_LINK) { - sdl->sdl_type = IFT_FDDI; - sdl->sdl_alen = ifp->if_addrlen; - bcopy((caddr_t)((struct arpcom *)ifp)->ac_enaddr, - LLADDR(sdl), ifp->if_addrlen); - break; - } -#endif -} diff --git a/bsd/net/if_gif.c b/bsd/net/if_gif.c index 38e876d6d..b25ecb3a5 100644 --- a/bsd/net/if_gif.c +++ b/bsd/net/if_gif.c @@ -122,7 +122,6 @@ TAILQ_HEAD(gifhead, gif_softc) gifs = TAILQ_HEAD_INITIALIZER(gifs); #ifdef __APPLE__ void gifattach(void); -static void gif_create_dev(void); static int gif_encapcheck(const struct mbuf*, int, int, void*); static errno_t gif_output(ifnet_t ifp, mbuf_t m); static errno_t 
gif_input(ifnet_t ifp, protocol_family_t protocol_family, @@ -156,6 +155,11 @@ struct ip6protosw in6_gif_protosw = }; #endif +static if_clone_t gif_cloner = NULL; +static int gif_clone_create(struct if_clone *, uint32_t, void *); +static int gif_clone_destroy(struct ifnet *); +static void gif_delete_tunnel(struct gif_softc *); + #ifdef __APPLE__ /* * Theory of operation: initially, one gif interface is created. @@ -237,6 +241,8 @@ __private_extern__ void gifattach(void) { errno_t result; + struct ifnet_clone_params ifnet_clone_params; + struct if_clone *ifc = NULL; /* Init the list of interfaces */ TAILQ_INIT(&gifs); @@ -252,8 +258,17 @@ gifattach(void) if (result != 0) printf("proto_register_plumber failed for AF_INET6 error=%d\n", result); + ifnet_clone_params.ifc_name = "gif"; + ifnet_clone_params.ifc_create = gif_clone_create; + ifnet_clone_params.ifc_destroy = gif_clone_destroy; + + result = ifnet_clone_attach(&ifnet_clone_params, &gif_cloner); + if (result != 0) + printf("gifattach: ifnet_clone_attach failed %d\n", result); + /* Create first device */ - gif_create_dev(); + ifc = if_clone_lookup("gif", NULL); + gif_clone_create(ifc, 0, NULL); } static errno_t @@ -270,35 +285,34 @@ gif_set_bpf_tap( return 0; } -/* Creates another gif device if there are none free */ -static void -gif_create_dev(void) + +static int +gif_clone_create(struct if_clone *ifc, uint32_t unit, __unused void *params) { - struct gif_softc *sc; - struct ifnet_init_params gif_init; - errno_t result = 0; - - + struct gif_softc *sc = NULL; + struct ifnet_init_params gif_init; + errno_t result = 0; + /* Can't create more than GIF_MAXUNIT */ if (ngif >= GIF_MAXUNIT) - return; - - /* Check for unused gif interface */ - TAILQ_FOREACH(sc, &gifs, gif_link) { - /* If unused, return, no need to create a new interface */ - if ((ifnet_flags(sc->gif_if) & IFF_RUNNING) == 0) - return; - } + return (ENXIO); sc = _MALLOC(sizeof(struct gif_softc), M_DEVBUF, M_WAITOK); if (sc == NULL) { - log(LOG_ERR, 
"gifattach: failed to allocate gif%d\n", ngif); - return; + log(LOG_ERR, "gif_clone_create: failed to allocate gif%d\n", unit); + return ENOBUFS; } - + bzero(sc, sizeof(struct gif_softc)); + + /* use the interface name as the unique id for ifp recycle */ + snprintf(sc->gif_ifname, sizeof(sc->gif_ifname), "%s%d", + ifc->ifc_name, unit); + bzero(&gif_init, sizeof(gif_init)); + gif_init.uniqueid = sc->gif_ifname; + gif_init.uniqueid_len = strlen(sc->gif_ifname); gif_init.name = GIFNAME; - gif_init.unit = ngif; + gif_init.unit = unit; gif_init.type = IFT_GIF; gif_init.family = IFNET_FAMILY_GIF; gif_init.output = gif_output; @@ -309,22 +323,22 @@ gif_create_dev(void) gif_init.ioctl = gif_ioctl; gif_init.set_bpf_tap = gif_set_bpf_tap; - bzero(sc, sizeof(struct gif_softc)); result = ifnet_allocate(&gif_init, &sc->gif_if); if (result != 0) { - printf("gif_create_dev, ifnet_allocate failed - %d\n", result); + printf("gif_clone_create, ifnet_allocate failed - %d\n", result); _FREE(sc, M_DEVBUF); - return; + return ENOBUFS; } + sc->encap_cookie4 = sc->encap_cookie6 = NULL; #if INET sc->encap_cookie4 = encap_attach_func(AF_INET, -1, - gif_encapcheck, &in_gif_protosw, sc); + gif_encapcheck, &in_gif_protosw, sc); if (sc->encap_cookie4 == NULL) { printf("%s: unable to attach encap4\n", if_name(sc->gif_if)); ifnet_release(sc->gif_if); FREE(sc, M_DEVBUF); - return; + return ENOBUFS; } #endif #if INET6 @@ -338,7 +352,7 @@ gif_create_dev(void) printf("%s: unable to attach encap6\n", if_name(sc->gif_if)); ifnet_release(sc->gif_if); FREE(sc, M_DEVBUF); - return; + return ENOBUFS; } #endif sc->gif_called = 0; @@ -350,10 +364,18 @@ gif_create_dev(void) #endif result = ifnet_attach(sc->gif_if, NULL); if (result != 0) { - printf("gif_create_dev - ifnet_attach failed - %d\n", result); + printf("gif_clone_create - ifnet_attach failed - %d\n", result); ifnet_release(sc->gif_if); + if (sc->encap_cookie4) { + encap_detach(sc->encap_cookie4); + sc->encap_cookie4 = NULL; + } + if 
(sc->encap_cookie6) { + encap_detach(sc->encap_cookie6); + sc->encap_cookie6 = NULL; + } FREE(sc, M_DEVBUF); - return; + return result; } #if CONFIG_MACF_NET mac_ifnet_label_init(&sc->gif_if); @@ -361,6 +383,43 @@ gif_create_dev(void) bpfattach(sc->gif_if, DLT_NULL, sizeof(u_int)); TAILQ_INSERT_TAIL(&gifs, sc, gif_link); ngif++; + return 0; +} + +static int +gif_clone_destroy(struct ifnet *ifp) +{ +#if defined(INET) || defined(INET6) + int err = 0; +#endif + struct gif_softc *sc = ifp->if_softc; + + TAILQ_REMOVE(&gifs, sc, gif_link); + + gif_delete_tunnel(sc); +#ifdef INET6 + if (sc->encap_cookie6 != NULL) { + err = encap_detach(sc->encap_cookie6); + KASSERT(err == 0, ("gif_clone_destroy: Unexpected error detaching encap_cookie6")); + } +#endif +#ifdef INET + if (sc->encap_cookie4 != NULL) { + err = encap_detach(sc->encap_cookie4); + KASSERT(err == 0, ("gif_clone_destroy: Unexpected error detaching encap_cookie4")); + } +#endif + err = ifnet_set_flags(ifp, 0, IFF_UP); + if (err != 0) { + printf("gif_clone_destroy: ifnet_set_flags failed %d\n", err); + } + + err = ifnet_detach(ifp); + if (err != 0) + panic("gif_clone_destroy: ifnet_detach(%p) failed %d\n", ifp, err); + FREE(sc, M_DEVBUF); + ngif--; + return 0; } static int @@ -488,7 +547,6 @@ gif_input( mbuf_t m, __unused char *frame_header) { - errno_t error; struct gif_softc *sc = ifnet_softc(ifp); bpf_tap_in(ifp, 0, m, &sc->gif_proto, sizeof(sc->gif_proto)); @@ -505,8 +563,11 @@ gif_input( * it occurs more times than we thought, we may change the policy * again. 
*/ - error = proto_input(protocol_family, m); - ifnet_stat_increment_in(ifp, 1, m->m_pkthdr.len, 0); + if (proto_input(protocol_family, m) != 0) { + ifnet_stat_increment_in(ifp, 0, 0, 1); + m_freem(m); + } else + ifnet_stat_increment_in(ifp, 1, m->m_pkthdr.len, 0); return (0); } @@ -716,11 +777,6 @@ gif_ioctl( ifnet_set_flags(ifp, IFF_RUNNING | IFF_UP, IFF_RUNNING | IFF_UP); -#ifdef __APPLE__ - /* Make sure at least one unused device is still available */ - gif_create_dev(); -#endif - error = 0; break; @@ -839,7 +895,6 @@ gif_ioctl( return error; } -#ifndef __APPLE__ /* This function is not used in our stack */ void gif_delete_tunnel(sc) @@ -857,4 +912,3 @@ gif_delete_tunnel(sc) } /* change the IFF_UP flag as well? */ } -#endif diff --git a/bsd/net/if_gif.h b/bsd/net/if_gif.h index dc193f74c..dfba33645 100644 --- a/bsd/net/if_gif.h +++ b/bsd/net/if_gif.h @@ -90,6 +90,7 @@ struct gif_softc { TAILQ_ENTRY(gif_softc) gif_link; /* all gif's are linked */ bpf_tap_mode tap_mode; bpf_packet_func tap_callback; + char gif_ifname[IFNAMSIZ]; }; #define gif_ro gifsc_gifscr.gifscr_ro diff --git a/bsd/net/if_llreach.c b/bsd/net/if_llreach.c new file mode 100644 index 000000000..669beb0f4 --- /dev/null +++ b/bsd/net/if_llreach.c @@ -0,0 +1,565 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. 
+ * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Link-layer Reachability Record + * + * Each interface maintains a red-black tree which contains records related + * to the on-link nodes which we are interested in communicating with. Each + * record gets allocated and inserted into the tree in the following manner: + * upon processing an ARP announcement or reply from a known node (i.e. there + * exists an ARP route entry for the node), and if a link-layer reachability + * record for the node doesn't yet exist; and, upon processing an ND6 RS/RA/ + * NS/NA/redirect from a node, and if a link-layer reachability record for the + * node doesn't yet exist. + * + * Each newly created record is then referred to by the resolver route entry; + * if a record already exists, its reference count gets increased for the new + * resolver entry which now refers to it. A record gets removed from the tree + * and freed once its reference count drops to zero, i.e. when there is no + * more resolver entry referring to it. + * + * A record contains the link-layer protocol (e.g. Ethertype IP/IPv6), the + * HW address of the sender, the "last heard from" timestamp (lr_lastrcvd) and + * the number of references made to it (lr_reqcnt). 
Because the key for each + * record in the red-black tree includes the link-layer protocol, + * the namespace for the records is partitioned based on the type of link-layer + * protocol, i.e. an Ethertype IP link-layer record is only referred to by one + * or more ARP entries; an Ethertype IPv6 link-layer record is only referred to + * by one or more ND6 entries. Therefore, lr_reqcnt represents the number of + * resolver entry references to the record for the same protocol family. + * + * Upon receiving packets from the network, the protocol's input callback + * (e.g. ether_inet{6}_input) informs the corresponding resolver (ARP/ND6) + * about the (link-layer) origin of the packet. This results in searching + * for a matching record in the red-black tree for the interface on which the + * packet arrived. If there's no match, no further processing takes place. + * Otherwise, the lr_lastrcvd timestamp of the record is updated. + * + * When an IP/IPv6 packet is transmitted to the resolver (i.e. the destination + * is on-link), ARP/ND6 records the "last spoken to" timestamp in the route + * entry ({la,ln}_lastused). + * + * The reachability of the on-link node is determined by the following logic, + * upon sending a packet through the resolver: + * + * a) If the record is only used by exactly one resolver entry (lr_reqcnt + * is 1), i.e. the target host does not have IP/IPv6 aliases that we know + * of, check if lr_lastrcvd is "recent." If so, simply send the packet; + * otherwise, re-resolve the target node. + * + * b) If the record is shared by multiple resolver entries (lr_reqcnt is + * greater than 1), i.e. the target host has more than one IP/IPv6 alias + * on the same network interface, we can't rely on lr_lastrcvd alone, as + * one of the IP/IPv6 aliases could have been silently moved to another + * node for which we don't have a link-layer record. If lr_lastrcvd is + * not "recent", we re-resolve the target node. 
Otherwise, we perform + * an additional check against {la,ln}_lastused to see whether it is also + * "recent", relative to lr_lastrcvd. If so, simply send the packet; + * otherwise, re-resolve the target node. + * + * The value for "recent" is configurable by adjusting the basetime value for + * net.link.ether.inet.arp_llreach_base or net.inet6.icmp6.nd6_llreach_base. + * The default basetime value is 30 seconds, and the actual expiration time + * is calculated by multiplying the basetime value by a random factor, + * which results in a number between 15 and 45 seconds. Setting the basetime + * value to 0 effectively disables this feature for the corresponding resolver. + * + * Assumptions: + * + * The above logic is based upon the following assumptions: + * + * i) Network traffic is mostly bi-directional, i.e. the act of sending + * packets to an on-link node would most likely cause us to receive + * packets from that node. + * + * ii) If the on-link node's IP/IPv6 address silently moves to another + * on-link node of which we are not aware, non-unicast packets + * from the old node would trigger the record's lr_lastrcvd to be + * kept recent. + * + * We can mitigate the above by having the resolver check its {la,ln}_lastused + * timestamp at all times, i.e. not only when lr_reqcnt is greater than 1; but + * we currently optimize for the common cases. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#if INET6 +#include +#include +#endif /* INET6 */ + +static unsigned int iflr_size; /* size of if_llreach */ +static struct zone *iflr_zone; /* zone for if_llreach */ + +#define IFLR_ZONE_MAX 128 /* maximum elements in zone */ +#define IFLR_ZONE_NAME "if_llreach" /* zone name */ + +static struct if_llreach *iflr_alloc(int); +static void iflr_free(struct if_llreach *); +static __inline int iflr_cmp(const struct if_llreach *, + const struct if_llreach *); +static __inline int iflr_reachable(struct if_llreach *, int, u_int64_t); +static int sysctl_llreach_ifinfo SYSCTL_HANDLER_ARGS; + +/* The following is protected by if_llreach_lock */ +RB_GENERATE_PREV(ll_reach_tree, if_llreach, lr_link, iflr_cmp); + +SYSCTL_DECL(_net_link_generic_system); + +SYSCTL_NODE(_net_link_generic_system, OID_AUTO, llreach_info, + CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_llreach_ifinfo, + "Per-interface tree of source link-layer reachability records"); + +/* + * Link-layer reachability is based off node constants in RFC4861. 
+ */ +#if INET6 +#define LL_COMPUTE_RTIME(x) ND_COMPUTE_RTIME(x) +#else +#define LL_MIN_RANDOM_FACTOR 512 /* 1024 * 0.5 */ +#define LL_MAX_RANDOM_FACTOR 1536 /* 1024 * 1.5 */ +#define LL_COMPUTE_RTIME(x) \ + (((LL_MIN_RANDOM_FACTOR * (x >> 10)) + (random() & \ + ((LL_MAX_RANDOM_FACTOR - LL_MIN_RANDOM_FACTOR) * (x >> 10)))) / 1000) +#endif /* !INET6 */ + +void +ifnet_llreach_init(void) +{ + iflr_size = sizeof (struct if_llreach); + iflr_zone = zinit(iflr_size, + IFLR_ZONE_MAX * iflr_size, 0, IFLR_ZONE_NAME); + if (iflr_zone == NULL) { + panic("%s: failed allocating %s", __func__, IFLR_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(iflr_zone, Z_EXPAND, TRUE); + zone_change(iflr_zone, Z_CALLERACCT, FALSE); +} + +void +ifnet_llreach_ifattach(struct ifnet *ifp, boolean_t reuse) +{ + lck_rw_lock_exclusive(&ifp->if_llreach_lock); + /* Initialize link-layer source tree (if not already) */ + if (!reuse) + RB_INIT(&ifp->if_ll_srcs); + lck_rw_done(&ifp->if_llreach_lock); +} + +void +ifnet_llreach_ifdetach(struct ifnet *ifp) +{ +#pragma unused(ifp) + /* + * Nothing to do for now; the link-layer source tree might + * contain entries at this point, that are still referred + * to by route entries pointing to this ifp. + */ +} + +/* + * Link-layer source tree comparison function. + * + * An ordered predicate is necessary; bcmp() is not documented to return + * an indication of order, memcmp() is, and is an ISO C99 requirement. + */ +static __inline int +iflr_cmp(const struct if_llreach *a, const struct if_llreach *b) +{ + return (memcmp(&a->lr_key, &b->lr_key, sizeof (a->lr_key))); +} + +static __inline int +iflr_reachable(struct if_llreach *lr, int cmp_delta, u_int64_t tval) +{ + u_int64_t now; + u_int64_t expire; + + now = net_uptime(); /* current approx. uptime */ + /* + * No need for lr_lock; atomically read the last rcvd uptime. 
+ */ + expire = lr->lr_lastrcvd + lr->lr_reachable; + /* + * If we haven't heard back from the local host for over + * lr_reachable seconds, consider that the host is no + * longer reachable. + */ + if (!cmp_delta) + return (expire >= now); + /* + * If the caller supplied a reference time, consider the + * host is reachable if the record hasn't expired (see above) + * and if the reference time is within the past lr_reachable + * seconds. + */ + return ((expire >= now) && (now - tval) < lr->lr_reachable); +} + +int +ifnet_llreach_reachable(struct if_llreach *lr) +{ + /* + * Check whether the cache is too old to be trusted. + */ + return (iflr_reachable(lr, 0, 0)); +} + +int +ifnet_llreach_reachable_delta(struct if_llreach *lr, u_int64_t tval) +{ + /* + * Check whether the cache is too old to be trusted. + */ + return (iflr_reachable(lr, 1, tval)); +} + +void +ifnet_llreach_set_reachable(struct ifnet *ifp, u_int16_t llproto, void *addr, + unsigned int alen) +{ + struct if_llreach find, *lr; + + VERIFY(alen == IF_LLREACH_MAXLEN); /* for now */ + + find.lr_key.proto = llproto; + bcopy(addr, &find.lr_key.addr, IF_LLREACH_MAXLEN); + + lck_rw_lock_shared(&ifp->if_llreach_lock); + lr = RB_FIND(ll_reach_tree, &ifp->if_ll_srcs, &find); + if (lr == NULL) { + lck_rw_done(&ifp->if_llreach_lock); + return; + } + /* + * No need for lr_lock; atomically update the last rcvd uptime. 
+ */ + lr->lr_lastrcvd = net_uptime(); + lck_rw_done(&ifp->if_llreach_lock); +} + +struct if_llreach * +ifnet_llreach_alloc(struct ifnet *ifp, u_int16_t llproto, void *addr, + unsigned int alen, u_int64_t llreach_base) +{ + struct if_llreach find, *lr; + struct timeval now; + + if (llreach_base == 0) + return (NULL); + + VERIFY(alen == IF_LLREACH_MAXLEN); /* for now */ + + find.lr_key.proto = llproto; + bcopy(addr, &find.lr_key.addr, IF_LLREACH_MAXLEN); + + lck_rw_lock_shared(&ifp->if_llreach_lock); + lr = RB_FIND(ll_reach_tree, &ifp->if_ll_srcs, &find); + if (lr != NULL) { +found: + IFLR_LOCK(lr); + VERIFY(lr->lr_reqcnt >= 1); + lr->lr_reqcnt++; + VERIFY(lr->lr_reqcnt != 0); + IFLR_ADDREF_LOCKED(lr); /* for caller */ + lr->lr_lastrcvd = net_uptime(); /* current approx. uptime */ + IFLR_UNLOCK(lr); + lck_rw_done(&ifp->if_llreach_lock); + return (lr); + } + + if (!lck_rw_lock_shared_to_exclusive(&ifp->if_llreach_lock)) + lck_rw_lock_exclusive(&ifp->if_llreach_lock); + + lck_rw_assert(&ifp->if_llreach_lock, LCK_RW_ASSERT_EXCLUSIVE); + + /* in case things have changed while becoming writer */ + lr = RB_FIND(ll_reach_tree, &ifp->if_ll_srcs, &find); + if (lr != NULL) + goto found; + + lr = iflr_alloc(M_WAITOK); + if (lr == NULL) { + lck_rw_done(&ifp->if_llreach_lock); + return (NULL); + } + IFLR_LOCK(lr); + lr->lr_reqcnt++; + VERIFY(lr->lr_reqcnt == 1); + IFLR_ADDREF_LOCKED(lr); /* for RB tree */ + IFLR_ADDREF_LOCKED(lr); /* for caller */ + lr->lr_lastrcvd = net_uptime(); /* current approx. 
uptime */ + lr->lr_baseup = lr->lr_lastrcvd; /* base uptime */ + microtime(&now); + lr->lr_basecal = now.tv_sec; /* base calendar time */ + lr->lr_basereachable = llreach_base; + lr->lr_reachable = LL_COMPUTE_RTIME(lr->lr_basereachable * 1000); + lr->lr_debug |= IFD_ATTACHED; + lr->lr_ifp = ifp; + lr->lr_key.proto = llproto; + bcopy(addr, &lr->lr_key.addr, IF_LLREACH_MAXLEN); + RB_INSERT(ll_reach_tree, &ifp->if_ll_srcs, lr); + IFLR_UNLOCK(lr); + lck_rw_done(&ifp->if_llreach_lock); + + return (lr); +} + +void +ifnet_llreach_free(struct if_llreach *lr) +{ + struct ifnet *ifp; + + /* no need to lock here; lr_ifp never changes */ + ifp = lr->lr_ifp; + + lck_rw_lock_exclusive(&ifp->if_llreach_lock); + IFLR_LOCK(lr); + if (lr->lr_reqcnt == 0) { + panic("%s: lr=%p negative reqcnt", __func__, lr); + /* NOTREACHED */ + } + --lr->lr_reqcnt; + if (lr->lr_reqcnt > 0) { + IFLR_UNLOCK(lr); + lck_rw_done(&ifp->if_llreach_lock); + IFLR_REMREF(lr); /* for caller */ + return; + } + if (!(lr->lr_debug & IFD_ATTACHED)) { + panic("%s: Attempt to detach an unattached llreach lr=%p", + __func__, lr); + /* NOTREACHED */ + } + lr->lr_debug &= ~IFD_ATTACHED; + RB_REMOVE(ll_reach_tree, &ifp->if_ll_srcs, lr); + IFLR_UNLOCK(lr); + lck_rw_done(&ifp->if_llreach_lock); + + IFLR_REMREF(lr); /* for RB tree */ + IFLR_REMREF(lr); /* for caller */ +} + +u_int64_t +ifnet_llreach_up2cal(struct if_llreach *lr, u_int64_t uptime) +{ + u_int64_t calendar = 0; + + if (uptime != 0) { + struct timeval cnow; + u_int64_t unow; + + getmicrotime(&cnow); /* current calendar time */ + unow = net_uptime(); /* current approx. uptime */ + /* + * Take into account possible calendar time changes; + * adjust base calendar value if necessary, i.e. + * the calendar skew should equate to the uptime skew. 
+ */ + lr->lr_basecal += (cnow.tv_sec - lr->lr_basecal) - + (unow - lr->lr_baseup); + + calendar = lr->lr_basecal + lr->lr_reachable + + (uptime - lr->lr_baseup); + } + + return (calendar); +} + +static struct if_llreach * +iflr_alloc(int how) +{ + struct if_llreach *lr; + + lr = (how == M_WAITOK) ? zalloc(iflr_zone) : zalloc_noblock(iflr_zone); + if (lr != NULL) { + bzero(lr, iflr_size); + lck_mtx_init(&lr->lr_lock, ifnet_lock_group, ifnet_lock_attr); + lr->lr_debug |= IFD_ALLOC; + } + return (lr); +} + +static void +iflr_free(struct if_llreach *lr) +{ + IFLR_LOCK(lr); + if (lr->lr_debug & IFD_ATTACHED) { + panic("%s: attached lr=%p is being freed", __func__, lr); + /* NOTREACHED */ + } else if (!(lr->lr_debug & IFD_ALLOC)) { + panic("%s: lr %p cannot be freed", __func__, lr); + /* NOTREACHED */ + } else if (lr->lr_refcnt != 0) { + panic("%s: non-zero refcount lr=%p", __func__, lr); + /* NOTREACHED */ + } else if (lr->lr_reqcnt != 0) { + panic("%s: non-zero reqcnt lr=%p", __func__, lr); + /* NOTREACHED */ + } + lr->lr_debug &= ~IFD_ALLOC; + IFLR_UNLOCK(lr); + + lck_mtx_destroy(&lr->lr_lock, ifnet_lock_group); + zfree(iflr_zone, lr); +} + +void +iflr_addref(struct if_llreach *lr, int locked) +{ + if (!locked) + IFLR_LOCK(lr); + else + IFLR_LOCK_ASSERT_HELD(lr); + + if (++lr->lr_refcnt == 0) { + panic("%s: lr=%p wraparound refcnt", __func__, lr); + /* NOTREACHED */ + } + if (!locked) + IFLR_UNLOCK(lr); +} + +void +iflr_remref(struct if_llreach *lr) +{ + IFLR_LOCK(lr); + if (lr->lr_refcnt == 0) { + panic("%s: lr=%p negative refcnt", __func__, lr); + /* NOTREACHED */ + } + --lr->lr_refcnt; + if (lr->lr_refcnt > 0) { + IFLR_UNLOCK(lr); + return; + } + IFLR_UNLOCK(lr); + + iflr_free(lr); /* deallocate it */ +} + +void +ifnet_lr2ri(struct if_llreach *lr, struct rt_reach_info *ri) +{ + struct if_llreach_info lri; + + IFLR_LOCK_ASSERT_HELD(lr); + + bzero(ri, sizeof (*ri)); + ifnet_lr2lri(lr, &lri); + ri->ri_refcnt = lri.lri_refcnt; + ri->ri_probes = lri.lri_probes; + 
ri->ri_rcv_expire = lri.lri_expire; +} + +void +ifnet_lr2lri(struct if_llreach *lr, struct if_llreach_info *lri) +{ + IFLR_LOCK_ASSERT_HELD(lr); + + bzero(lri, sizeof (*lri)); + /* + * Note here we return request count, not actual memory refcnt. + */ + lri->lri_refcnt = lr->lr_reqcnt; + lri->lri_ifindex = lr->lr_ifp->if_index; + lri->lri_probes = lr->lr_probes; + lri->lri_expire = ifnet_llreach_up2cal(lr, lr->lr_lastrcvd); + lri->lri_proto = lr->lr_key.proto; + bcopy(&lr->lr_key.addr, &lri->lri_addr, IF_LLREACH_MAXLEN); +} + +static int +sysctl_llreach_ifinfo SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp) + int *name, retval = 0; + unsigned int namelen; + uint32_t ifindex; + struct if_llreach *lr; + struct if_llreach_info lri; + struct ifnet *ifp; + + name = (int *)arg1; + namelen = (unsigned int)arg2; + + if (req->newptr != USER_ADDR_NULL) + return (EPERM); + + if (namelen != 1) + return (EINVAL); + + ifindex = name[0]; + ifnet_head_lock_shared(); + if (ifindex <= 0 || ifindex > (u_int)if_index) { + printf("%s: ifindex %u out of range\n", __func__, ifindex); + ifnet_head_done(); + return (ENOENT); + } + + ifp = ifindex2ifnet[ifindex]; + ifnet_head_done(); + if (ifp == NULL) { + printf("%s: no ifp for ifindex %u\n", __func__, ifindex); + return (ENOENT); + } + + lck_rw_lock_shared(&ifp->if_llreach_lock); + RB_FOREACH(lr, ll_reach_tree, &ifp->if_ll_srcs) { + /* Export to if_llreach_info structure */ + IFLR_LOCK(lr); + ifnet_lr2lri(lr, &lri); + IFLR_UNLOCK(lr); + + if ((retval = SYSCTL_OUT(req, &lri, sizeof (lri))) != 0) + break; + } + lck_rw_done(&ifp->if_llreach_lock); + + return (retval); +} diff --git a/bsd/net/if_llreach.h b/bsd/net/if_llreach.h new file mode 100644 index 000000000..e922fb0e4 --- /dev/null +++ b/bsd/net/if_llreach.h @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _NET_IF_LLREACH_H_ +#define _NET_IF_LLREACH_H_ + +#ifdef PRIVATE +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/* + * Per-interface link-layer reachability information (private). 
+ */ +#define IF_LLREACHINFO_ADDRLEN 64 /* max ll addr len */ + +struct if_llreach_info { + u_int32_t lri_refcnt; /* reference count */ + u_int32_t lri_ifindex; /* interface index */ + u_int64_t lri_expire; /* expiration (calendar) time */ + u_int32_t lri_probes; /* total # of probes */ + u_int16_t lri_reserved; /* for future use */ + u_int16_t lri_proto; /* ll proto */ + u_int8_t lri_addr[IF_LLREACHINFO_ADDRLEN]; /* ll addr */ +}; + +#ifdef XNU_KERNEL_PRIVATE +#include +#include +#include +#include +#if INET6 +#include +#include +#endif /* INET6 */ + +/* + * Link-layer reachability is based off node constants in RFC4861. + */ +#if INET6 +#define LL_BASE_REACHABLE REACHABLE_TIME +#else +#define LL_BASE_REACHABLE 30000 /* msec */ +#endif /* !INET6 */ + +/* + * Per-interface link-layer reachability. (Currently only for ARP/Ethernet.) + */ +#define IF_LLREACH_MAXLEN ETHER_ADDR_LEN + +struct if_llreach { + decl_lck_mtx_data(, lr_lock); + RB_ENTRY(if_llreach) lr_link; /* RB tree links */ + struct ifnet *lr_ifp; /* back pointer to ifnet */ + u_int32_t lr_refcnt; /* reference count */ + u_int32_t lr_reqcnt; /* RB tree request count */ + u_int32_t lr_debug; /* see ifa_debug flags */ + u_int32_t lr_probes; /* number of probes so far */ + u_int64_t lr_basecal; /* base calendar time */ + u_int64_t lr_baseup; /* base uptime */ + u_int64_t lr_lastrcvd; /* last-heard-of timestamp */ + u_int32_t lr_basereachable; /* baseline time */ + u_int32_t lr_reachable; /* reachable time */ + struct lr_key_s { + u_int16_t proto; /* ll proto */ + u_int8_t addr[IF_LLREACH_MAXLEN]; /* ll addr */ + } lr_key; +}; + +RB_PROTOTYPE_SC_PREV(__private_extern__, ll_reach_tree, if_llreach, + ls_link, ifllr_cmp); + +#define IFLR_LOCK_ASSERT_HELD(_iflr) \ + lck_mtx_assert(&(_iflr)->lr_lock, LCK_MTX_ASSERT_OWNED) + +#define IFLR_LOCK_ASSERT_NOTHELD(_iflr) \ + lck_mtx_assert(&(_iflr)->lr_lock, LCK_MTX_ASSERT_NOTOWNED) + +#define IFLR_LOCK(_iflr) \ + lck_mtx_lock(&(_iflr)->lr_lock) + +#define 
IFLR_LOCK_SPIN(_iflr) \ + lck_mtx_lock_spin(&(_iflr)->lr_lock) + +#define IFLR_CONVERT_LOCK(_iflr) do { \ + IFLR_LOCK_ASSERT_HELD(_iflr); \ + lck_mtx_convert_spin(&(_iflr)->lr_lock); \ +} while (0) + +#define IFLR_UNLOCK(_iflr) \ + lck_mtx_unlock(&(_iflr)->lr_lock) + +#define IFLR_ADDREF(_iflr) \ + iflr_addref(_iflr, 0) + +#define IFLR_ADDREF_LOCKED(_iflr) \ + iflr_addref(_iflr, 1) + +#define IFLR_REMREF(_iflr) \ + iflr_remref(_iflr) + +extern void ifnet_llreach_init(void); +extern void ifnet_llreach_ifattach(struct ifnet *, boolean_t); +extern void ifnet_llreach_ifdetach(struct ifnet *); +extern struct if_llreach *ifnet_llreach_alloc(struct ifnet *, u_int16_t, void *, + unsigned int, u_int64_t); +extern void ifnet_llreach_free(struct if_llreach *); +extern int ifnet_llreach_reachable(struct if_llreach *); +extern int ifnet_llreach_reachable_delta(struct if_llreach *, u_int64_t); +extern void ifnet_llreach_set_reachable(struct ifnet *, u_int16_t, void *, + unsigned int); +extern u_int64_t ifnet_llreach_up2cal(struct if_llreach *, u_int64_t); +extern void ifnet_lr2ri(struct if_llreach *, struct rt_reach_info *); +extern void ifnet_lr2lri(struct if_llreach *, struct if_llreach_info *); +extern void iflr_addref(struct if_llreach *, int); +extern void iflr_remref(struct if_llreach *); +#endif /* XNU_KERNEL_PRIVATE */ + +#ifdef __cplusplus +} +#endif +#endif /* PRIVATE */ +#endif /* !_NET_IF_LLREACH_H_ */ diff --git a/bsd/net/if_loop.c b/bsd/net/if_loop.c index f62bdc362..5ba5b11a5 100644 --- a/bsd/net/if_loop.c +++ b/bsd/net/if_loop.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -79,6 +79,7 @@ #include #include #include +#include #include #include @@ -106,11 +107,6 @@ extern struct ifqueue atalkintrq; #endif -#include "bpfilter.h" -#if NBPFILTER > 0 -#include -#endif - #if CONFIG_MACF_NET #include #endif @@ -214,11 +210,11 @@ lo_output( if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = ifp; - ifp->if_ibytes += m->m_pkthdr.len; - ifp->if_obytes += m->m_pkthdr.len; + atomic_add_64(&ifp->if_ibytes, m->m_pkthdr.len); + atomic_add_64(&ifp->if_obytes, m->m_pkthdr.len); - ifp->if_opackets++; - ifp->if_ipackets++; + atomic_add_64(&ifp->if_opackets, 1); + atomic_add_64(&ifp->if_ipackets, 1); m->m_pkthdr.header = mtod(m, char *); if (apple_hwcksum_tx != 0) { @@ -339,7 +335,9 @@ loioctl( case SIOCSIFADDR: ifnet_set_flags(ifp, IFF_UP | IFF_RUNNING, IFF_UP | IFF_RUNNING); ifa = (struct ifaddr *)data; + IFA_LOCK_SPIN(ifa); ifa->ifa_rtrequest = lortrequest; + IFA_UNLOCK(ifa); /* * Everything else is done at a higher level. */ @@ -475,7 +473,9 @@ More than one loopback interface is not supported. 
ifnet_set_mtu(lo_ifp, LOMTU); ifnet_set_flags(lo_ifp, IFF_LOOPBACK | IFF_MULTICAST, IFF_LOOPBACK | IFF_MULTICAST); - ifnet_set_offload(lo_ifp, IFNET_CSUM_IP | IFNET_CSUM_TCP | IFNET_CSUM_UDP | IFNET_CSUM_FRAGMENT | IFNET_IP_FRAGMENT | IFNET_MULTIPAGES); + ifnet_set_offload(lo_ifp, IFNET_CSUM_IP | IFNET_CSUM_TCP | IFNET_CSUM_UDP | + IFNET_CSUM_TCPIPV6 | IFNET_CSUM_UDPIPV6 | IFNET_IPV6_FRAGMENT | + IFNET_CSUM_FRAGMENT | IFNET_IP_FRAGMENT | IFNET_MULTIPAGES); ifnet_set_hdrlen(lo_ifp, sizeof(struct loopback_header)); ifnet_set_eflags(lo_ifp, IFEF_SENDLIST, IFEF_SENDLIST); diff --git a/bsd/net/if_media.h b/bsd/net/if_media.h index 12cbc871b..32afe224d 100644 --- a/bsd/net/if_media.h +++ b/bsd/net/if_media.h @@ -221,7 +221,7 @@ int ifmedia_ioctl(struct ifnet *ifp, struct ifreq *ifr, #define IFM_FDX 0x00100000 /* Force full duplex */ #define IFM_HDX 0x00200000 /* Force half duplex */ #define IFM_FLOW 0x00400000 /* enable hardware flow control */ -#define IFM_EEE 0x00800000 /* Support energy efficient ethernet */ +#define IFM_EEE 0x00800000 /* Driver defined flag */ #define IFM_FLAG0 0x01000000 /* Driver defined flag */ #define IFM_FLAG1 0x02000000 /* Driver defined flag */ #define IFM_FLAG2 0x04000000 /* Driver defined flag */ diff --git a/bsd/net/if_mib.c b/bsd/net/if_mib.c index b21529b2a..9ab76f698 100644 --- a/bsd/net/if_mib.c +++ b/bsd/net/if_mib.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -67,8 +67,6 @@ #include #include -#if NETMIBS - /* * A sysctl(3) MIB for generic interface information. 
This information * is exported in the net.link.generic branch, which has the following @@ -97,31 +95,17 @@ SYSCTL_DECL(_net_link_generic); SYSCTL_NODE(_net_link_generic, IFMIB_SYSTEM, system, CTLFLAG_RD|CTLFLAG_LOCKED, 0, "Variables global to all interfaces"); -SYSCTL_INT(_net_link_generic_system, IFMIB_IFCOUNT, ifcount, CTLFLAG_RD, +SYSCTL_INT(_net_link_generic_system, IFMIB_IFCOUNT, ifcount, CTLFLAG_RD | CTLFLAG_LOCKED, &if_index, 0, "Number of configured interfaces"); static int sysctl_ifdata SYSCTL_HANDLER_ARGS; -SYSCTL_NODE(_net_link_generic, IFMIB_IFDATA, ifdata, CTLFLAG_RD, +SYSCTL_NODE(_net_link_generic, IFMIB_IFDATA, ifdata, CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_ifdata, "Interface table"); static int sysctl_ifalldata SYSCTL_HANDLER_ARGS; -SYSCTL_NODE(_net_link_generic, IFMIB_IFALLDATA, ifalldata, CTLFLAG_RD, +SYSCTL_NODE(_net_link_generic, IFMIB_IFALLDATA, ifalldata, CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_ifalldata, "Interface table"); -extern int dlil_multithreaded_input; -SYSCTL_INT(_net_link_generic_system, OID_AUTO, multi_threaded_input, CTLFLAG_RW, - &dlil_multithreaded_input , 0, "Uses multiple input thread for DLIL input"); -#ifdef IFNET_INPUT_SANITY_CHK -extern int dlil_input_sanity_check; -SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check, CTLFLAG_RW, - &dlil_input_sanity_check , 0, "Turn on sanity checking in DLIL input"); -#endif - -extern int dlil_verbose; -SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_verbose, CTLFLAG_RW, - &dlil_verbose, 0, "Log DLIL error messages"); - - static int make_ifmibdata(struct ifnet *, int *, struct sysctl_req *); int @@ -140,7 +124,7 @@ make_ifmibdata(struct ifnet *ifp, int *name, struct sysctl_req *req) /* * Make sure the interface is in use */ - if (ifp->if_refcnt > 0) { + if (ifnet_is_attached(ifp, 0)) { snprintf(ifmd.ifmd_name, sizeof(ifmd.ifmd_name), "%s%d", ifp->if_name, ifp->if_unit); @@ -191,11 +175,14 @@ make_ifmibdata(struct ifnet *ifp, int *name, struct sysctl_req *req) #endif /* 
IF_MIB_WR */ break; -#if PKT_PRIORITY - case IFDATA_SUPPLEMENTAL: - error = SYSCTL_OUT(req, &ifp->if_tc, sizeof(struct if_traffic_class)); + case IFDATA_SUPPLEMENTAL: { + struct if_traffic_class if_tc; + + if_copy_traffic_class(ifp, &if_tc); + + error = SYSCTL_OUT(req, &if_tc, sizeof(struct if_traffic_class)); break; -#endif /* PKT_PRIORITY */ + } } return error; @@ -211,23 +198,24 @@ sysctl_ifdata SYSCTL_HANDLER_ARGS /* XXX bad syntax! */ struct ifnet *ifp; if (namelen != 2) - return EINVAL; + return (EINVAL); + ifnet_head_lock_shared(); if (name[0] <= 0 || name[0] > if_index || - (ifp = ifindex2ifnet[name[0]]) == NULL || - ifp->if_refcnt == 0) { + (ifp = ifindex2ifnet[name[0]]) == NULL) { ifnet_head_done(); - return ENOENT; + return (ENOENT); } + ifnet_reference(ifp); ifnet_head_done(); ifnet_lock_shared(ifp); - error = make_ifmibdata(ifp, name, req); - ifnet_lock_done(ifp); - - return error; + + ifnet_release(ifp); + + return (error); } int @@ -240,20 +228,18 @@ sysctl_ifalldata SYSCTL_HANDLER_ARGS /* XXX bad syntax! */ struct ifnet *ifp; if (namelen != 2) - return EINVAL; + return (EINVAL); ifnet_head_lock_shared(); TAILQ_FOREACH(ifp, &ifnet_head, if_link) { ifnet_lock_shared(ifp); - + error = make_ifmibdata(ifp, name, req); - + ifnet_lock_done(ifp); - if (error) + if (error != 0) break; } ifnet_head_done(); return error; } - -#endif diff --git a/bsd/net/if_mib.h b/bsd/net/if_mib.h index 36d2667a4..5b773bddf 100644 --- a/bsd/net/if_mib.h +++ b/bsd/net/if_mib.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * diff --git a/bsd/net/if_pflog.c b/bsd/net/if_pflog.c index 8e7480911..ae2f9254c 100644 --- a/bsd/net/if_pflog.c +++ b/bsd/net/if_pflog.c @@ -68,6 +68,7 @@ #include #include #include +#include #include #include @@ -344,8 +345,8 @@ pflog_packet(struct pfi_kif *kif, struct mbuf *m, sa_family_t af, u_int8_t dir, } #endif /* INET */ - ifn->if_opackets++; - ifn->if_obytes += m->m_pkthdr.len; + atomic_add_64(&ifn->if_opackets, 1); + atomic_add_64(&ifn->if_obytes, m->m_pkthdr.len); switch (dir) { case PF_IN: diff --git a/bsd/net/if_stf.c b/bsd/net/if_stf.c index 96b9664b5..c9d24e249 100644 --- a/bsd/net/if_stf.c +++ b/bsd/net/if_stf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -120,6 +120,8 @@ #include +#include + #include #include #include @@ -148,9 +150,10 @@ #include #endif -#define IN6_IS_ADDR_6TO4(x) (ntohs((x)->s6_addr16[0]) == 0x2002) #define GET_V4(x) ((const struct in_addr *)(&(x)->s6_addr16[1])) +static lck_grp_t *stf_mtx_grp; + struct stf_softc { ifnet_t sc_if; /* common area */ u_int32_t sc_protocol_family; /* dlil protocol attached */ @@ -159,6 +162,7 @@ struct stf_softc { struct route_in6 __sc_ro6; /* just for safety */ } __sc_ro46; #define sc_ro __sc_ro46.__sc_ro4 + decl_lck_mtx_data(, sc_ro_mtx); const struct encaptab *encap_cookie; bpf_tap_mode tap_mode; bpf_packet_func tap_callback; @@ -167,14 +171,16 @@ struct stf_softc { void stfattach (void); static int ip_stf_ttl = 40; +static int stf_init_done; static void in_stf_input(struct mbuf *, int); +static void stfinit(void); extern struct domain inetdomain; struct protosw in_stf_protosw = { SOCK_RAW, &inetdomain, IPPROTO_IPV6, PR_ATOMIC|PR_ADDR, in_stf_input, NULL, NULL, rip_ctloutput, NULL, - NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, &rip_usrreqs, NULL, rip_unlock, NULL, {NULL, NULL}, NULL, {0} @@ -192,6 
+198,15 @@ static void stf_rtrequest(int, struct rtentry *, struct sockaddr *); static errno_t stf_ioctl(ifnet_t ifp, u_long cmd, void *data); static errno_t stf_output(ifnet_t ifp, mbuf_t m); +static void +stfinit(void) +{ + if (!stf_init_done) { + stf_mtx_grp = lck_grp_alloc_init("stf", LCK_GRP_ATTR_NULL); + stf_init_done = 1; + } +} + /* * gif_input is the input handler for IP and IPv6 attached to gif */ @@ -202,7 +217,8 @@ stf_media_input( mbuf_t m, __unused char *frame_header) { - proto_input(protocol_family, m); + if (proto_input(protocol_family, m) != 0) + m_freem(m); return (0); } @@ -297,6 +313,8 @@ stfattach(void) const struct encaptab *p; struct ifnet_init_params stf_init; + stfinit(); + error = proto_register_plumber(PF_INET6, APPLE_IF_FAM_STF, stf_attach_inet6, NULL); if (error != 0) @@ -318,6 +336,7 @@ stfattach(void) return; } sc->encap_cookie = p; + lck_mtx_init(&sc->sc_ro_mtx, stf_mtx_grp, LCK_ATTR_NULL); bzero(&stf_init, sizeof(stf_init)); stf_init.name = "stf"; @@ -336,6 +355,7 @@ stfattach(void) if (error != 0) { printf("stfattach, ifnet_allocate failed - %d\n", error); encap_detach(sc->encap_cookie); + lck_mtx_destroy(&sc->sc_ro_mtx, stf_mtx_grp); FREE(sc, M_DEVBUF); return; } @@ -355,6 +375,7 @@ stfattach(void) printf("stfattach: ifnet_attach returned error=%d\n", error); encap_detach(sc->encap_cookie); ifnet_release(sc->sc_if); + lck_mtx_destroy(&sc->sc_ro_mtx, stf_mtx_grp); FREE(sc, M_DEVBUF); return; } @@ -404,9 +425,11 @@ stf_encapcheck( * local 6to4 address. * success on: dst = 10.1.1.1, ia6->ia_addr = 2002:0a01:0101:... 
*/ + IFA_LOCK(&ia6->ia_ifa); if (bcmp(GET_V4(&ia6->ia_addr.sin6_addr), &ip.ip_dst, sizeof(ip.ip_dst)) != 0) { - ifafree(&ia6->ia_ifa); + IFA_UNLOCK(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); return 0; } /* @@ -421,11 +444,13 @@ stf_encapcheck( b = ip.ip_src; b.s_addr &= GET_V4(&ia6->ia_prefixmask.sin6_addr)->s_addr; if (a.s_addr != b.s_addr) { - ifafree(&ia6->ia_ifa); + IFA_UNLOCK(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); return 0; } /* stf interface makes single side match only */ - ifafree(&ia6->ia_ifa); + IFA_UNLOCK(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); return 32; } @@ -438,38 +463,46 @@ stf_getsrcifa6(struct ifnet *ifp) struct in_addr in; ifnet_lock_shared(ifp); - for (ia = ifp->if_addrlist.tqh_first; - ia; - ia = ia->ifa_list.tqe_next) - { - if (ia->ifa_addr == NULL) + for (ia = ifp->if_addrlist.tqh_first; ia; ia = ia->ifa_list.tqe_next) { + IFA_LOCK(ia); + if (ia->ifa_addr == NULL) { + IFA_UNLOCK(ia); continue; - if (ia->ifa_addr->sa_family != AF_INET6) + } + if (ia->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ia); continue; + } sin6 = (struct sockaddr_in6 *)ia->ifa_addr; - if (!IN6_IS_ADDR_6TO4(&sin6->sin6_addr)) + if (!IN6_IS_ADDR_6TO4(&sin6->sin6_addr)) { + IFA_UNLOCK(ia); continue; - + } bcopy(GET_V4(&sin6->sin6_addr), &in, sizeof(in)); + IFA_UNLOCK(ia); lck_rw_lock_shared(in_ifaddr_rwlock); for (ia4 = TAILQ_FIRST(&in_ifaddrhead); ia4; ia4 = TAILQ_NEXT(ia4, ia_link)) { - if (ia4->ia_addr.sin_addr.s_addr == in.s_addr) + IFA_LOCK(&ia4->ia_ifa); + if (ia4->ia_addr.sin_addr.s_addr == in.s_addr) { + IFA_UNLOCK(&ia4->ia_ifa); break; + } + IFA_UNLOCK(&ia4->ia_ifa); } lck_rw_done(in_ifaddr_rwlock); if (ia4 == NULL) continue; - ifaref(ia); + IFA_ADDREF(ia); /* for caller */ ifnet_lock_done(ifp); - return (struct in6_ifaddr *)ia; + return ((struct in6_ifaddr *)ia); } ifnet_lock_done(ifp); - return NULL; + return (NULL); } int @@ -491,6 +524,7 @@ stf_pre_output( struct ip6_hdr *ip6; struct in6_ifaddr *ia6; struct sockaddr_in *dst4; + struct ip_out_args 
ipoa = { IFSCOPE_NONE, 0 }; errno_t result = 0; sc = ifnet_softc(ifp); @@ -516,7 +550,7 @@ stf_pre_output( m = m_pullup(m, sizeof(*ip6)); if (!m) { *m0 = NULL; /* makes sure this won't be double freed */ - ifafree(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); return ENOBUFS; } } @@ -532,7 +566,7 @@ stf_pre_output( else if (IN6_IS_ADDR_6TO4(&dst6->sin6_addr)) in4 = GET_V4(&dst6->sin6_addr); else { - ifafree(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); return ENETUNREACH; } @@ -548,15 +582,17 @@ stf_pre_output( m = m_pullup(m, sizeof(struct ip)); if (m == NULL) { *m0 = NULL; - ifafree(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); return ENOBUFS; } ip = mtod(m, struct ip *); bzero(ip, sizeof(*ip)); + IFA_LOCK_SPIN(&ia6->ia_ifa); bcopy(GET_V4(&((struct sockaddr_in6 *)&ia6->ia_addr)->sin6_addr), &ip->ip_src, sizeof(ip->ip_src)); + IFA_UNLOCK(&ia6->ia_ifa); bcopy(in4, &ip->ip_dst, sizeof(ip->ip_dst)); ip->ip_p = IPPROTO_IPV6; ip->ip_ttl = ip_stf_ttl; @@ -566,11 +602,11 @@ stf_pre_output( else ip_ecn_ingress(ECN_NOCARE, &ip->ip_tos, &tos); + lck_mtx_lock(&sc->sc_ro_mtx); dst4 = (struct sockaddr_in *)&sc->sc_ro.ro_dst; if (dst4->sin_family != AF_INET || bcmp(&dst4->sin_addr, &ip->ip_dst, sizeof(ip->ip_dst)) != 0) { - /* cache route doesn't match */ - printf("stf_output: cached route doesn't match \n"); + /* cache route doesn't match: always the case during the first use */ dst4->sin_family = AF_INET; dst4->sin_len = sizeof(struct sockaddr_in); bcopy(&ip->ip_dst, &dst4->sin_addr, sizeof(dst4->sin_addr)); @@ -580,21 +616,15 @@ stf_pre_output( } } - if (sc->sc_ro.ro_rt == NULL) { - rtalloc(&sc->sc_ro); - if (sc->sc_ro.ro_rt == NULL) { - ifafree(&ia6->ia_ifa); - return ENETUNREACH; - } - } + result = ip_output_list(m, 0, NULL, &sc->sc_ro, IP_OUTARGS, NULL, &ipoa); + lck_mtx_unlock(&sc->sc_ro_mtx); - result = ip_output_list(m, 0, NULL, &sc->sc_ro, 0, NULL, NULL); /* Assumption: ip_output will free mbuf on errors */ /* All the output processing is done here, don't let stf_output be 
called */ if (result == 0) result = EJUSTRETURN; *m0 = NULL; - ifafree(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); return result; } static errno_t @@ -635,12 +665,17 @@ stf_checkaddr4( ia4; ia4 = TAILQ_NEXT(ia4, ia_link)) { - if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0) + IFA_LOCK(&ia4->ia_ifa); + if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0) { + IFA_UNLOCK(&ia4->ia_ifa); continue; + } if (in->s_addr == ia4->ia_broadaddr.sin_addr.s_addr) { + IFA_UNLOCK(&ia4->ia_ifa); lck_rw_done(in_ifaddr_rwlock); return -1; } + IFA_UNLOCK(&ia4->ia_ifa); } lck_rw_done(in_ifaddr_rwlock); @@ -820,7 +855,13 @@ stf_ioctl( switch (cmd) { case SIOCSIFADDR: ifa = (struct ifaddr *)data; - if (ifa == NULL || ifa->ifa_addr->sa_family != AF_INET6) { + if (ifa == NULL) { + error = EAFNOSUPPORT; + break; + } + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); error = EAFNOSUPPORT; break; } @@ -829,10 +870,16 @@ stf_ioctl( if ( !(ifnet_flags( ifp ) & IFF_UP) ) { /* do this only if the interface is not already up */ ifa->ifa_rtrequest = stf_rtrequest; + IFA_UNLOCK(ifa); ifnet_set_flags(ifp, IFF_UP, IFF_UP); + } else { + IFA_UNLOCK(ifa); } - } else + } else { + IFA_UNLOCK(ifa); error = EINVAL; + } + IFA_LOCK_ASSERT_NOTHELD(ifa); break; case SIOCADDMULTI: diff --git a/bsd/net/if_types.h b/bsd/net/if_types.h index 4eced169b..b3f8e5b65 100644 --- a/bsd/net/if_types.h +++ b/bsd/net/if_types.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -148,6 +148,7 @@ #define IFT_PFSYNC 0xf6 /* Packet filter state syncing */ #define IFT_CARP 0xf8 /* Common Address Redundancy Protocol */ -#define IFT_PDP 0xff /* GPRS Packet Data Protocol */ +#define IFT_CELLULAR 0xff /* Packet Data over Cellular */ +#define IFT_PDP IFT_CELLULAR /* deprecated; use IFT_CELLULAR */ #endif diff --git a/bsd/net/if_utun.c b/bsd/net/if_utun.c index 1b35e44b4..a8667845b 100644 --- a/bsd/net/if_utun.c +++ b/bsd/net/if_utun.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2009 Apple Inc. All rights reserved. + * Copyright (c) 2008-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -366,6 +366,9 @@ utun_cleanup_family( goto cleanup; } + /* always set SS_PRIV, we want to close and detach regardless */ + sock_setpriv(pf_socket, 1); + result = utun_detach_ip(interface, protocol, pf_socket); if (result == 0 || result == ENXIO) { /* We are done! We either detached or weren't attached. */ @@ -705,7 +708,6 @@ utun_ioctl( void *data) { errno_t result = 0; - struct ifaddr *ifa = (struct ifaddr *)data; switch(command) { case SIOCSIFMTU: @@ -716,13 +718,6 @@ utun_ioctl( /* ifioctl() takes care of it */ break; - case SIOCSIFADDR: - case SIOCAIFADDR: - /* This will be called for called for IPv6 Address additions */ - if (ifa->ifa_addr->sa_family == AF_INET6) - break; - /* Fall though for other families like IPv4 */ - default: result = EOPNOTSUPP; } @@ -754,7 +749,8 @@ utun_proto_input( // remove protocol family first mbuf_adj(m, sizeof(u_int32_t)); - proto_input(protocol, m); + if (proto_input(protocol, m) != 0) + m_freem(m); return 0; } @@ -770,8 +766,8 @@ utun_proto_pre_output( __unused char *link_layer_dest) { - *(protocol_family_t *)(void *)frame_type = protocol; - return 0; + *(protocol_family_t *)(void *)frame_type = protocol; + return 0; } static errno_t diff --git a/bsd/net/if_var.h b/bsd/net/if_var.h index 51b48d27c..a76aa7dbb 100644 --- a/bsd/net/if_var.h +++ 
b/bsd/net/if_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -223,8 +223,6 @@ struct if_data64 { struct IF_DATA_TIMEVAL ifi_lastchange; /* time of last administrative change */ }; -#pragma pack() - #ifdef PRIVATE struct if_traffic_class { u_int64_t ifi_ibkpackets; /* TC_BK packets received on interface */ @@ -240,11 +238,28 @@ struct if_traffic_class { u_int64_t ifi_ovopackets; /* TC_VO packets sent on interface */ u_int64_t ifi_ovobytes; /* TC_VO bytes sent on interface */ }; +#endif /* PRIVATE */ + +#pragma pack() +/* + * Structure defining a queue for a network interface. + */ +struct ifqueue { + void *ifq_head; + void *ifq_tail; + int ifq_len; + int ifq_maxlen; + int ifq_drops; +}; + +#ifdef XNU_KERNEL_PRIVATE /* * Internal storage of if_data. This is bound to change. Various places in the * stack will translate this data structure in to the externally visible - * if_data structure above. + * if_data structure above. Note that during interface attach time, the + * embedded if_data structure in ifnet is cleared, with the exception of + * some non-statistics related fields. 
*/ struct if_data_internal { /* generic interface information */ @@ -259,6 +274,7 @@ struct if_data_internal { u_int32_t ifi_mtu; /* maximum transmission unit */ u_int32_t ifi_metric; /* routing metric (external only) */ u_int32_t ifi_baudrate; /* linespeed */ + u_int32_t _pad; /* volatile statistics */ u_int64_t ifi_ipackets; /* packets received on interface */ u_int64_t ifi_ierrors; /* input errors on interface */ @@ -279,7 +295,9 @@ struct if_data_internal { u_int32_t ifi_tso_v4_mtu; /* TCP Segment Offload IPv4 maximum segment size */ u_int32_t ifi_tso_v6_mtu; /* TCP Segment Offload IPv6 maximum segment size */ }; +#endif /* XNU_KERNEL_PRIVATE */ +#ifdef PRIVATE #define if_mtu if_data.ifi_mtu #define if_type if_data.ifi_type #define if_typelen if_data.ifi_typelen @@ -303,47 +321,57 @@ struct if_data_internal { #define if_lastchange if_data.ifi_lastchange #define if_recvquota if_data.ifi_recvquota #define if_xmitquota if_data.ifi_xmitquota -#define if_iflags if_data.ifi_iflags +#endif /* PRIVATE */ +#ifdef XNU_KERNEL_PRIVATE #define if_tso_v4_mtu if_data.ifi_tso_v4_mtu #define if_tso_v6_mtu if_data.ifi_tso_v6_mtu +#endif /* XNU_KERNEL_PRIVATE */ -struct mbuf; -struct ifaddr; -TAILQ_HEAD(ifnethead, ifnet); /* we use TAILQs so that the order of */ -TAILQ_HEAD(ifaddrhead, ifaddr); /* instantiation is preserved in the list */ -TAILQ_HEAD(ifprefixhead, ifprefix); -LIST_HEAD(ifmultihead, ifmultiaddr); -struct tqdummy; -TAILQ_HEAD(tailq_head, tqdummy); - +#ifdef XNU_KERNEL_PRIVATE /* * Forward structure declarations for function prototypes [sic]. 
*/ -struct proc; -struct rtentry; -struct socket; -struct ether_header; -struct sockaddr_dl; +struct proc; +struct rtentry; +struct socket; struct ifnet_filter; +struct mbuf; +struct ifaddr; +struct tqdummy; +struct proto_hash_entry; +struct dlil_threading_info; +#if PF +struct pfi_kif; +#endif /* PF */ +/* we use TAILQs so that the order of instantiation is preserved in the list */ +TAILQ_HEAD(ifnethead, ifnet); +TAILQ_HEAD(ifaddrhead, ifaddr); +TAILQ_HEAD(ifprefixhead, ifprefix); +LIST_HEAD(ifmultihead, ifmultiaddr); +TAILQ_HEAD(tailq_head, tqdummy); TAILQ_HEAD(ifnet_filter_head, ifnet_filter); TAILQ_HEAD(ddesc_head_name, dlil_demux_desc); +#endif /* XNU_KERNEL_PRIVATE */ -/* All of the following IF_HWASSIST_* flags are defined - * in kpi_inteface.h as IFNET_* flags. These are redefined - * here as constants to avoid failures to build user level - * programs that can not include kpi_interface.h. It is - * important to keep this in sync with the definitions in - * kpi_interface.h. The corresponding constant for each - * definition is mentioned in the comment. +#ifdef PRIVATE +/* + * All of the following IF_HWASSIST_* flags are defined in kpi_interface.h as + * IFNET_* flags. These are redefined here as constants to avoid failures to + * build user level programs that can not include kpi_interface.h. It is + * important to keep this in sync with the definitions in kpi_interface.h. + * The corresponding constant for each definition is mentioned in the comment. 
* - * Bottom 16 bits reserved for hardware checksum + * Bottom 16 bits reserved for hardware checksum */ #define IF_HWASSIST_CSUM_IP 0x0001 /* will csum IP, IFNET_CSUM_IP */ #define IF_HWASSIST_CSUM_TCP 0x0002 /* will csum TCP, IFNET_CSUM_TCP */ #define IF_HWASSIST_CSUM_UDP 0x0004 /* will csum UDP, IFNET_CSUM_UDP */ #define IF_HWASSIST_CSUM_IP_FRAGS 0x0008 /* will csum IP fragments, IFNET_CSUM_FRAGMENT */ #define IF_HWASSIST_CSUM_FRAGMENT 0x0010 /* will do IP fragmentation, IFNET_IP_FRAGMENT */ +#define IF_HWASSIST_CSUM_TCPIPV6 0x0020 /* will csum TCPv6, IFNET_CSUM_TCPIPV6 */ +#define IF_HWASSIST_CSUM_UDPIPV6 0x0040 /* will csum UDPv6, IFNET_CSUM_UDPIPV6 */ +#define IF_HWASSIST_CSUM_FRAGMENT_IPV6 0x0080 /* will do IPv6 fragmentation, IFNET_IPV6_FRAGMENT */ #define IF_HWASSIST_CSUM_TCP_SUM16 0x1000 /* simple TCP Sum16 computation, IFNET_CSUM_SUM16 */ #define IF_HWASSIST_CSUM_MASK 0xffff #define IF_HWASSIST_CSUM_FLAGS(hwassist) ((hwassist) & IF_HWASSIST_CSUM_MASK) @@ -356,30 +384,13 @@ TAILQ_HEAD(ddesc_head_name, dlil_demux_desc); #define IF_HWASSIST_TSO_V4 0x00200000 /* will do TCP Segment offload for IPv4, IFNET_TSO_IPV4 */ #define IF_HWASSIST_TSO_V6 0x00400000 /* will do TCP Segment offload for IPv6, IFNET_TSO_IPV6 */ - -#define IFNET_RW_LOCK 1 - #endif /* PRIVATE */ -/* - * Structure defining a queue for a network interface. - */ -struct ifqueue { - void *ifq_head; - void *ifq_tail; - int ifq_len; - int ifq_maxlen; - int ifq_drops; -}; -#ifdef PRIVATE +#ifdef XNU_KERNEL_PRIVATE +#include +#include -struct ddesc_head_str; -struct proto_hash_entry; -struct kev_msg; -struct dlil_threading_info; -#if PF -struct pfi_kif; -#endif /* PF */ +RB_HEAD(ll_reach_tree, if_llreach); /* define struct ll_reach_tree */ /* * Structure defining a network interface. @@ -387,34 +398,42 @@ struct pfi_kif; * (Would like to call this struct ``if'', but C isn't PL/1.) */ struct ifnet { - void *if_softc; /* pointer to driver state */ - const char *if_name; /* name, e.g.
``en'' or ``lo'' */ - TAILQ_ENTRY(ifnet) if_link; /* all struct ifnets are chained */ - struct ifaddrhead if_addrhead; /* linked list of addresses per if */ - u_int32_t if_refcnt; -#ifdef __KPI_INTERFACE__ - ifnet_check_multi if_check_multi; -#else - void* if_check_multi; -#endif /* __KPI_INTERFACE__ */ - int if_pcount; /* number of promiscuous listeners */ - struct bpf_if *if_bpf; /* packet filter structure */ - u_short if_index; /* numeric abbreviation for this if */ - short if_unit; /* sub-unit for lower level driver */ - short if_timer; /* time 'til if_watchdog called */ - short if_flags; /* up/down, broadcast, etc. */ - int if_ipending; /* interrupts pending */ - void *if_linkmib; /* link-type-specific MIB data */ - size_t if_linkmiblen; /* length of above data */ - struct if_data_internal if_data; - -/* New with DLIL */ -#ifdef BSD_KERNEL_PRIVATE - int if_usecnt; -#else - int refcnt; -#endif -#ifdef __KPI_INTERFACE__ + /* + * Lock (RW or mutex) to protect this data structure (static storage.) + */ + decl_lck_rw_data(, if_lock); + void *if_softc; /* pointer to driver state */ + const char *if_name; /* name, e.g. ``en'' or ``lo'' */ + TAILQ_ENTRY(ifnet) if_link; /* all struct ifnets are chained */ + TAILQ_ENTRY(ifnet) if_detaching_link; /* list of detaching ifnets */ + + decl_lck_mtx_data(, if_ref_lock) + u_int32_t if_refflags; + u_int32_t if_refio; /* number of io ops to the underlying driver */ + +#define if_list if_link + struct ifaddrhead if_addrhead; /* linked list of addresses per if */ +#define if_addrlist if_addrhead + struct ifaddr *if_lladdr; /* link address (first/permanent) */ + + int if_pcount; /* number of promiscuous listeners */ + struct bpf_if *if_bpf; /* packet filter structure */ + u_short if_index; /* numeric abbreviation for this if */ + short if_unit; /* sub-unit for lower level driver */ + short if_timer; /* time 'til if_watchdog called */ + short if_flags; /* up/down, broadcast, etc. 
*/ + u_int32_t if_eflags; /* see */ + + int if_capabilities; /* interface features & capabilities */ + int if_capenable; /* enabled features & capabilities */ + + void *if_linkmib; /* link-type-specific MIB data */ + size_t if_linkmiblen; /* length of above data */ + + struct if_data_internal if_data __attribute__((aligned(8))); + + ifnet_family_t if_family; /* value assigned by Apple */ + uintptr_t if_family_cookie; ifnet_output_func if_output; ifnet_ioctl_func if_ioctl; ifnet_set_bpf_tap if_set_bpf_tap; @@ -422,64 +441,28 @@ struct ifnet { ifnet_demux_func if_demux; ifnet_event_func if_event; ifnet_framer_func if_framer; - ifnet_family_t if_family; /* value assigned by Apple */ -#else - void* if_output; - void* if_ioctl; - void* if_set_bpf_tap; - void* if_free; - void* if_demux; - void* if_event; - void* if_framer; - u_int32_t if_family; /* value assigned by Apple */ -#endif + ifnet_add_proto_func if_add_proto; + ifnet_del_proto_func if_del_proto; + ifnet_check_multi if_check_multi; + struct proto_hash_entry *if_proto_hash; + void *if_kpi_storage; + decl_lck_mtx_data(, if_flt_lock) + u_int32_t if_flt_busy; + u_int32_t if_flt_waiters; struct ifnet_filter_head if_flt_head; -/* End DLIL specific */ + struct ifmultihead if_multiaddrs; /* multicast addresses */ + u_int32_t if_updatemcasts; /* mcast addrs need updating */ + int if_amcount; /* # of all-multicast reqs */ + decl_lck_mtx_data(, if_addrconfig_lock); /* for serializing addr config */ + struct in_multi *if_allhostsinm; /* store all-hosts inm for this ifp */ - u_int32_t if_delayed_detach; /* need to perform delayed detach */ - void *if_private; /* private to interface */ - long if_eflags; /* autoaddr, autoaddr done, etc. 
*/ - - struct ifmultihead if_multiaddrs; /* multicast addresses configured */ - int if_amcount; /* number of all-multicast requests */ -/* procedure handles */ -#ifdef __KPI_INTERFACE__ - ifnet_add_proto_func if_add_proto; - ifnet_del_proto_func if_del_proto; -#else /* !__KPI_INTERFACE__ */ - void* if_add_proto; - void* if_del_proto; -#endif /* !__KPI_INTERFACE__ */ - struct proto_hash_entry *if_proto_hash; - void *if_kpi_storage; -#if 0 - void *unused_was_init; -#else struct dlil_threading_info *if_input_thread; -#endif - void *unused_was_resolvemulti; - - struct ifqueue if_snd; - u_int32_t unused_2[1]; -#ifdef __APPLE__ - uintptr_t family_cookie; - struct ifprefixhead if_prefixhead; /* list of prefixes per if */ -#ifdef _KERN_LOCKS_H_ -#if IFNET_RW_LOCK - lck_rw_t *if_lock; /* Lock to protect this interface */ -#else - lck_mtx_t *if_lock; /* Lock to protect this interface */ -#endif -#else - void *if_lock; -#endif + struct ifqueue if_snd; -#else - struct ifprefixhead if_prefixhead; /* list of prefixes per if */ -#endif /* __APPLE__ */ + struct ifprefixhead if_prefixhead; /* list of prefixes per if */ struct { u_int32_t length; union { @@ -488,133 +471,134 @@ struct ifnet { } u; } if_broadcast; #if CONFIG_MACF_NET - struct label *if_label; /* interface MAC label */ + struct label *if_label; /* interface MAC label */ #endif - u_int32_t if_wake_properties; + u_int32_t if_wake_properties; #if PF - struct thread *if_pf_curthread; - struct pfi_kif *if_pf_kif; + struct thread *if_pf_curthread; + struct pfi_kif *if_pf_kif; #endif /* PF */ -#ifdef _KERN_LOCKS_H_ - lck_mtx_t *if_fwd_route_lock; -#else - void *if_fwd_route_lock; -#endif - struct route if_fwd_route; /* cached IPv4 forwarding route */ - void *if_bridge; /* bridge glue */ -#if IFNET_ROUTE_REFCNT - u_int32_t if_want_aggressive_drain; - u_int32_t if_idle_flags; /* idle flags */ - u_int32_t if_route_refcnt; /* idle: route ref count */ -#endif /* IFNET_ROUTE_REFCNT */ -#if PKT_PRIORITY - struct if_traffic_class 
if_tc __attribute__((aligned(8))); -#endif /* PKT_PRIORITY */ -}; -#ifndef __APPLE__ -/* for compatibility with other BSDs */ -#define if_addrlist if_addrhead -#define if_list if_link -#endif /* !__APPLE__ */ + decl_lck_mtx_data(, if_cached_route_lock); + u_int32_t if_fwd_cacheok; + struct route if_fwd_route; /* cached forwarding route */ + struct route if_src_route; /* cached ipv4 source route */ + struct route_in6 if_src_route6; /* cached ipv6 source route */ + decl_lck_rw_data(, if_llreach_lock); + struct ll_reach_tree if_ll_srcs; /* source link-layer tree */ -#endif /* PRIVATE */ + void *if_bridge; /* bridge glue */ + + u_int32_t if_want_aggressive_drain; + u_int32_t if_idle_flags; /* idle flags */ + u_int32_t if_idle_new_flags; /* temporary idle flags */ + u_int32_t if_idle_new_flags_mask; /* temporary mask */ + u_int32_t if_route_refcnt; /* idle: route ref count */ + + struct if_traffic_class if_tc __attribute__((aligned(8))); +#if INET + struct igmp_ifinfo *if_igi; /* for IGMPv3 */ +#endif /* INET */ +#if INET6 + struct mld_ifinfo *if_mli; /* for MLDv2 */ +#endif /* INET6 */ +}; + +/* + * Valid values for if_refflags + */ +#define IFRF_ATTACHED 0x1 /* ifnet attach is completely done */ +#define IFRF_DETACHING 0x2 /* detach has been requested */ -#ifdef KERNEL_PRIVATE /* * Structure describing a `cloning' interface. */ struct if_clone { LIST_ENTRY(if_clone) ifc_list; /* on list of cloners */ - const char *ifc_name; /* name of device, e.g. `vlan' */ - size_t ifc_namelen; /* length of name */ - u_int32_t ifc_minifs; /* minimum number of interfaces */ - u_int32_t ifc_maxunit; /* maximum unit number */ - unsigned char *ifc_units; /* bitmap to handle units */ - u_int32_t ifc_bmlen; /* bitmap length */ - - int (*ifc_create)(struct if_clone *, u_int32_t, void *); - int (*ifc_destroy)(struct ifnet *); + const char *ifc_name; /* name of device, e.g.
`vlan' */ + size_t ifc_namelen; /* length of name */ + u_int32_t ifc_minifs; /* minimum number of interfaces */ + u_int32_t ifc_maxunit; /* maximum unit number */ + unsigned char *ifc_units; /* bitmap to handle units */ + u_int32_t ifc_bmlen; /* bitmap length */ + + int (*ifc_create)(struct if_clone *, u_int32_t, void *); + int (*ifc_destroy)(struct ifnet *); }; -#define IF_CLONE_INITIALIZER(name, create, destroy, minifs, maxunit) \ - { { NULL, NULL }, name, sizeof(name) - 1, minifs, maxunit, NULL, 0, create, destroy } +#define IF_CLONE_INITIALIZER(name, create, destroy, minifs, maxunit) { \ + { NULL, NULL }, name, (sizeof (name) - 1), minifs, maxunit, NULL, 0, \ + create, destroy \ +} #define M_CLONE M_IFADDR /* - * Bit values in if_ipending - */ -#define IFI_RECV 1 /* I want to receive */ -#define IFI_XMIT 2 /* I want to transmit */ - -/* - * Output queues (ifp->if_snd) and slow device input queues (*ifp->if_slowq) - * are queues of messages stored on ifqueue structures - * (defined above). Entries are added to and deleted from these structures - * by these macros, which should be called with ipl raised to splimp(). + * Macros to manipulate ifqueue. Users of these macros are responsible + * for serialization, by holding whatever lock is appropriate for the + * corresponding structure that is referring the ifqueue. 
*/ #define IF_QFULL(ifq) ((ifq)->ifq_len >= (ifq)->ifq_maxlen) #define IF_DROP(ifq) ((ifq)->ifq_drops++) -#define IF_ENQUEUE(ifq, m) { \ - (m)->m_nextpkt = 0; \ - if ((ifq)->ifq_tail == 0) \ - (ifq)->ifq_head = m; \ - else \ - ((struct mbuf*)(ifq)->ifq_tail)->m_nextpkt = m; \ - (ifq)->ifq_tail = m; \ - (ifq)->ifq_len++; \ +#define IF_ENQUEUE(ifq, m) { \ + (m)->m_nextpkt = NULL; \ + if ((ifq)->ifq_tail == NULL) \ + (ifq)->ifq_head = m; \ + else \ + ((struct mbuf*)(ifq)->ifq_tail)->m_nextpkt = m; \ + (ifq)->ifq_tail = m; \ + (ifq)->ifq_len++; \ } -#define IF_PREPEND(ifq, m) { \ - (m)->m_nextpkt = (ifq)->ifq_head; \ - if ((ifq)->ifq_tail == 0) \ - (ifq)->ifq_tail = (m); \ - (ifq)->ifq_head = (m); \ - (ifq)->ifq_len++; \ +#define IF_PREPEND(ifq, m) { \ + (m)->m_nextpkt = (ifq)->ifq_head; \ + if ((ifq)->ifq_tail == NULL) \ + (ifq)->ifq_tail = (m); \ + (ifq)->ifq_head = (m); \ + (ifq)->ifq_len++; \ } -#define IF_DEQUEUE(ifq, m) { \ - (m) = (ifq)->ifq_head; \ - if (m) { \ - if (((ifq)->ifq_head = (m)->m_nextpkt) == 0) \ - (ifq)->ifq_tail = 0; \ - (m)->m_nextpkt = 0; \ - (ifq)->ifq_len--; \ - } \ +#define IF_DEQUEUE(ifq, m) { \ + (m) = (ifq)->ifq_head; \ + if (m != NULL) { \ + if (((ifq)->ifq_head = (m)->m_nextpkt) == NULL) \ + (ifq)->ifq_tail = NULL; \ + (m)->m_nextpkt = NULL; \ + (ifq)->ifq_len--; \ + } \ } - -#define IF_ENQ_DROP(ifq, m) if_enq_drop(ifq, m) - -#if defined(__GNUC__) && defined(MT_HEADER) -static __inline int -if_queue_drop(struct ifqueue *ifq, __unused struct mbuf *m) -{ - IF_DROP(ifq); - return 0; +#define IF_REMQUEUE(ifq, m) { \ + struct mbuf *_p = (ifq)->ifq_head; \ + struct mbuf *_n = (m)->m_nextpkt; \ + if ((m) == _p) \ + _p = NULL; \ + while (_p != NULL) { \ + if (_p->m_nextpkt == (m)) \ + break; \ + _p = _p->m_nextpkt; \ + } \ + VERIFY(_p != NULL || ((m) == (ifq)->ifq_head)); \ + if ((m) == (ifq)->ifq_head) \ + (ifq)->ifq_head = _n; \ + if ((m) == (ifq)->ifq_tail) \ + (ifq)->ifq_tail = _p; \ + VERIFY((ifq)->ifq_tail != NULL || (ifq)->ifq_head == 
NULL); \ + VERIFY((ifq)->ifq_len != 0); \ + --(ifq)->ifq_len; \ + if (_p != NULL) \ + _p->m_nextpkt = _n; \ + (m)->m_nextpkt = NULL; \ } +#define IF_DRAIN(ifq) do { \ + struct mbuf *m; \ + for (;;) { \ + IF_DEQUEUE(ifq, m); \ + if (m == NULL) \ + break; \ + m_freem(m); \ + } \ +} while (0) -static __inline int -if_enq_drop(struct ifqueue *ifq, struct mbuf *m) -{ - if (IF_QFULL(ifq) && - !if_queue_drop(ifq, m)) - return 0; - IF_ENQUEUE(ifq, m); - return 1; -} -#else - -#ifdef MT_HEADER -int if_enq_drop(struct ifqueue *, struct mbuf *); -#endif /* MT_HEADER */ - -#endif /* defined(__GNUC__) && defined(MT_HEADER) */ - -#endif /* KERNEL_PRIVATE */ - - -#ifdef PRIVATE /* * The ifaddr structure contains information about one address * of an interface. They are maintained by the different address families, @@ -622,21 +606,24 @@ int if_enq_drop(struct ifqueue *, struct mbuf *); * together so all addresses for an interface can be located. */ struct ifaddr { - struct sockaddr *ifa_addr; /* address of interface */ - struct sockaddr *ifa_dstaddr; /* other end of p-to-p link */ + decl_lck_mtx_data(, ifa_lock); /* lock for ifaddr */ + uint32_t ifa_refcnt; /* ref count, use IFA_{ADD,REM}REF */ + uint32_t ifa_debug; /* debug flags */ + struct sockaddr *ifa_addr; /* address of interface */ + struct sockaddr *ifa_dstaddr; /* other end of p-to-p link */ #define ifa_broadaddr ifa_dstaddr /* broadcast address interface */ - struct sockaddr *ifa_netmask; /* used to determine subnet */ - struct ifnet *ifa_ifp; /* back-pointer to interface */ + struct sockaddr *ifa_netmask; /* used to determine subnet */ + struct ifnet *ifa_ifp; /* back-pointer to interface */ TAILQ_ENTRY(ifaddr) ifa_link; /* queue macro glue */ void (*ifa_rtrequest) /* check or clean routes (+ or -)'d */ (int, struct rtentry *, struct sockaddr *); - uint32_t ifa_flags; /* mostly rt_flags for cloning */ - int32_t ifa_refcnt; /* ref count, use ifaref, ifafree */ - int32_t ifa_metric; /* cost of going out this interface */ 
+ uint32_t ifa_flags; /* mostly rt_flags for cloning */ + int32_t ifa_metric; /* cost of going out this interface */ void (*ifa_free)(struct ifaddr *); /* callback fn for freeing */ void (*ifa_trace) /* callback fn for tracing refs */ (struct ifaddr *, int); - uint32_t ifa_debug; /* debug flags */ + void (*ifa_attached)(struct ifaddr *); /* callback fn for attaching */ + void (*ifa_detached)(struct ifaddr *); /* callback fn for detaching */ }; /* @@ -648,13 +635,47 @@ struct ifaddr { /* * Valid values for ifa_debug */ -#define IFD_ATTACHED 0x1 /* attached to an interface */ +#define IFD_ATTACHED 0x1 /* attached to list */ #define IFD_ALLOC 0x2 /* dynamically allocated */ #define IFD_DEBUG 0x4 /* has debugging info */ +#define IFD_LINK 0x8 /* link address */ +#define IFD_TRASHED 0x10 /* in trash list */ +#define IFD_SKIP 0x20 /* skip this entry */ +#define IFD_NOTREADY 0x40 /* embryonic; not yet ready */ -#endif /* PRIVATE */ +#define IFA_LOCK_ASSERT_HELD(_ifa) \ + lck_mtx_assert(&(_ifa)->ifa_lock, LCK_MTX_ASSERT_OWNED) + +#define IFA_LOCK_ASSERT_NOTHELD(_ifa) \ + lck_mtx_assert(&(_ifa)->ifa_lock, LCK_MTX_ASSERT_NOTOWNED) + +#define IFA_LOCK(_ifa) \ + lck_mtx_lock(&(_ifa)->ifa_lock) + +#define IFA_LOCK_SPIN(_ifa) \ + lck_mtx_lock_spin(&(_ifa)->ifa_lock) + +#define IFA_CONVERT_LOCK(_ifa) do { \ + IFA_LOCK_ASSERT_HELD(_ifa); \ + lck_mtx_convert_spin(&(_ifa)->ifa_lock); \ +} while (0) + +#define IFA_UNLOCK(_ifa) \ + lck_mtx_unlock(&(_ifa)->ifa_lock) + +#define IFA_ADDREF(_ifa) \ + ifa_addref(_ifa, 0) + +#define IFA_ADDREF_LOCKED(_ifa) \ + ifa_addref(_ifa, 1) + +#define IFA_REMREF(_ifa) do { \ + (void) ifa_remref(_ifa, 0); \ +} while (0) + +#define IFA_REMREF_LOCKED(_ifa) \ + ifa_remref(_ifa, 1) -#ifdef KERNEL_PRIVATE /* * The prefix structure contains information about one prefix * of an interface. 
They are maintained by the different address families, @@ -668,115 +689,169 @@ struct ifprefix { u_char ifpr_plen; /* prefix length in bits */ u_char ifpr_type; /* protocol dependent prefix type */ }; -#endif /* KERNEL_PRIVATE */ - -#ifdef PRIVATE -typedef void (*ifma_protospec_free_func)(void* ifma_protospec); /* * Multicast address structure. This is analogous to the ifaddr * structure except that it keeps track of multicast addresses. - * Also, the reference count here is a count of requests for this - * address, not a count of pointers to this structure. + * Also, the request count here is a count of requests for this + * address, not a count of pointers to this structure; anonymous + * membership(s) holds one outstanding request count. */ struct ifmultiaddr { + decl_lck_mtx_data(, ifma_lock); + u_int32_t ifma_refcount; /* reference count */ + u_int32_t ifma_anoncnt; /* # of anonymous requests */ + u_int32_t ifma_reqcnt; /* total requests for this address */ + u_int32_t ifma_debug; /* see ifa_debug flags */ + u_int32_t ifma_flags; /* see below */ LIST_ENTRY(ifmultiaddr) ifma_link; /* queue macro glue */ - struct sockaddr *ifma_addr; /* address this membership is for */ + struct sockaddr *ifma_addr; /* address this membership is for */ struct ifmultiaddr *ifma_ll; /* link-layer translation, if any */ - struct ifnet *ifma_ifp; /* back-pointer to interface */ - u_int ifma_usecount; /* use count, protected by ifp's lock */ - void *ifma_protospec; /* protocol-specific state, if any */ - int32_t ifma_refcount; /* reference count, atomically protected */ - ifma_protospec_free_func ifma_free; /* function called to free ifma_protospec */ + struct ifnet *ifma_ifp; /* back-pointer to interface */ + void *ifma_protospec; /* protocol-specific state, if any */ + void (*ifma_trace) /* callback fn for tracing refs */ + (struct ifmultiaddr *, int); }; -#endif /* PRIVATE */ - -#ifdef KERNEL_PRIVATE -#define IFAREF(ifa) ifaref(ifa) -#define IFAFREE(ifa) ifafree(ifa) /* - * To 
preserve kmem compatibility, we define - * ifnet_head to ifnet. This should be temp. + * Values for ifma_flags */ -#define ifnet_head ifnet -extern struct ifnethead ifnet_head; -extern struct ifnet **ifindex2ifnet; -extern int ifqmaxlen; -extern ifnet_t lo_ifp; -extern int if_index; -extern struct ifaddr **ifnet_addrs; - -int if_addmulti(struct ifnet *, const struct sockaddr *, struct ifmultiaddr **); -int if_allmulti(struct ifnet *, int); -void if_attach(struct ifnet *); -int if_delmultiaddr(struct ifmultiaddr *ifma, int locked); -int if_delmulti(struct ifnet *, const struct sockaddr *); -void if_down(struct ifnet *); -int if_down_all(void); -void if_route(struct ifnet *, int flag, int fam); -void if_unroute(struct ifnet *, int flag, int fam); -void if_up(struct ifnet *); -void if_updown(struct ifnet *ifp, int up); -/*void ifinit(void));*/ /* declared in systm.h for main( */ -int ifioctl(struct socket *, u_long, caddr_t, struct proc *); -int ifioctllocked(struct socket *, u_long, caddr_t, struct proc *); -struct ifnet *ifunit(const char *); -struct ifnet *if_withname(struct sockaddr *); - -int if_clone_attach(struct if_clone *); -void if_clone_detach(struct if_clone *); -struct if_clone * - if_clone_lookup(const char *, u_int32_t *); - -void ifnet_lock_assert(struct ifnet *ifp, int what); -void ifnet_lock_shared(struct ifnet *ifp); -void ifnet_lock_exclusive(struct ifnet *ifp); -void ifnet_lock_done(struct ifnet *ifp); - -void ifnet_head_lock_shared(void); -void ifnet_head_lock_exclusive(void); -void ifnet_head_done(void); - -void if_attach_ifa(struct ifnet * ifp, struct ifaddr *ifa); -void if_detach_ifa(struct ifnet * ifp, struct ifaddr *ifa); - -void ifma_reference(struct ifmultiaddr *ifma); -void ifma_release(struct ifmultiaddr *ifma); - -struct ifaddr *ifa_ifwithaddr(const struct sockaddr *); -struct ifaddr *ifa_ifwithaddr_scoped(const struct sockaddr *, unsigned int); -struct ifaddr *ifa_ifwithdstaddr(const struct sockaddr *); -struct ifaddr 
*ifa_ifwithnet(const struct sockaddr *); -struct ifaddr *ifa_ifwithnet_scoped(const struct sockaddr *, unsigned int); -struct ifaddr *ifa_ifwithroute(int, const struct sockaddr *, const struct sockaddr *); -struct ifaddr *ifa_ifwithroute_locked(int, const struct sockaddr *, const struct sockaddr *); -struct ifaddr *ifa_ifwithroute_scoped_locked(int, const struct sockaddr *, +#define IFMAF_ANONYMOUS 0x1 /* has anonymous request ref(s) held */ + +#define IFMA_LOCK_ASSERT_HELD(_ifma) \ + lck_mtx_assert(&(_ifma)->ifma_lock, LCK_MTX_ASSERT_OWNED) + +#define IFMA_LOCK_ASSERT_NOTHELD(_ifma) \ + lck_mtx_assert(&(_ifma)->ifma_lock, LCK_MTX_ASSERT_NOTOWNED) + +#define IFMA_LOCK(_ifma) \ + lck_mtx_lock(&(_ifma)->ifma_lock) + +#define IFMA_LOCK_SPIN(_ifma) \ + lck_mtx_lock_spin(&(_ifma)->ifma_lock) + +#define IFMA_CONVERT_LOCK(_ifma) do { \ + IFMA_LOCK_ASSERT_HELD(_ifma); \ + lck_mtx_convert_spin(&(_ifma)->ifma_lock); \ +} while (0) + +#define IFMA_UNLOCK(_ifma) \ + lck_mtx_unlock(&(_ifma)->ifma_lock) + +#define IFMA_ADDREF(_ifma) \ + ifma_addref(_ifma, 0) + +#define IFMA_ADDREF_LOCKED(_ifma) \ + ifma_addref(_ifma, 1) + +#define IFMA_REMREF(_ifma) \ + ifma_remref(_ifma) + +__private_extern__ struct ifnethead ifnet_head; +__private_extern__ struct ifnet **ifindex2ifnet; +__private_extern__ int ifqmaxlen; +__private_extern__ int if_index; +__private_extern__ struct ifaddr **ifnet_addrs; +__private_extern__ lck_attr_t *ifa_mtx_attr; +__private_extern__ lck_grp_t *ifa_mtx_grp; +__private_extern__ lck_grp_t *ifnet_lock_group; +__private_extern__ lck_attr_t *ifnet_lock_attr; +extern ifnet_t lo_ifp; + +extern int if_addmulti(struct ifnet *, const struct sockaddr *, + struct ifmultiaddr **); +extern int if_addmulti_anon(struct ifnet *, const struct sockaddr *, + struct ifmultiaddr **); +extern int if_allmulti(struct ifnet *, int); +extern int if_delmulti(struct ifnet *, const struct sockaddr *); +extern int if_delmulti_ifma(struct ifmultiaddr *); +extern int if_delmulti_anon(struct 
ifnet *, const struct sockaddr *); +extern void if_down(struct ifnet *); +extern int if_down_all(void); +extern void if_up(struct ifnet *); +__private_extern__ void if_updown(struct ifnet *ifp, int up); +extern int ifioctl(struct socket *, u_long, caddr_t, struct proc *); +extern int ifioctllocked(struct socket *, u_long, caddr_t, struct proc *); +extern struct ifnet *ifunit(const char *); +extern struct ifnet *if_withname(struct sockaddr *); + +extern struct if_clone *if_clone_lookup(const char *, u_int32_t *); +extern int if_clone_attach(struct if_clone *); +extern void if_clone_detach(struct if_clone *); + +extern errno_t if_mcasts_update(struct ifnet *); + +typedef enum { + IFNET_LCK_ASSERT_EXCLUSIVE, /* RW: held as writer */ + IFNET_LCK_ASSERT_SHARED, /* RW: held as reader */ + IFNET_LCK_ASSERT_OWNED, /* RW: writer/reader, MTX: held */ + IFNET_LCK_ASSERT_NOTOWNED /* not held */ +} ifnet_lock_assert_t; + +__private_extern__ void ifnet_lock_assert(struct ifnet *, ifnet_lock_assert_t); +__private_extern__ void ifnet_lock_shared(struct ifnet *ifp); +__private_extern__ void ifnet_lock_exclusive(struct ifnet *ifp); +__private_extern__ void ifnet_lock_done(struct ifnet *ifp); + +__private_extern__ void ifnet_head_lock_shared(void); +__private_extern__ void ifnet_head_lock_exclusive(void); +__private_extern__ void ifnet_head_done(void); + +__private_extern__ errno_t ifnet_set_idle_flags_locked(ifnet_t, u_int32_t, + u_int32_t); +__private_extern__ int ifnet_is_attached(struct ifnet *, int refio); +__private_extern__ void ifnet_decr_iorefcnt(struct ifnet *); + +__private_extern__ void if_attach_ifa(struct ifnet *, struct ifaddr *); +__private_extern__ void if_attach_link_ifa(struct ifnet *, struct ifaddr *); +__private_extern__ void if_detach_ifa(struct ifnet *, struct ifaddr *); +__private_extern__ void if_detach_link_ifa(struct ifnet *, struct ifaddr *); + +extern struct ifaddr *ifa_ifwithaddr(const struct sockaddr *); +extern struct ifaddr 
*ifa_ifwithaddr_scoped(const struct sockaddr *, unsigned int); +extern struct ifaddr *ifa_ifwithdstaddr(const struct sockaddr *); +extern struct ifaddr *ifa_ifwithnet(const struct sockaddr *); +extern struct ifaddr *ifa_ifwithnet_scoped(const struct sockaddr *, unsigned int); +extern struct ifaddr *ifa_ifwithroute(int, const struct sockaddr *, + const struct sockaddr *); +extern struct ifaddr *ifa_ifwithroute_locked(int, const struct sockaddr *, const struct sockaddr *); +extern struct ifaddr *ifa_ifwithroute_scoped_locked(int, const struct sockaddr *, const struct sockaddr *, unsigned int); -struct ifaddr *ifaof_ifpforaddr(const struct sockaddr *, struct ifnet *); -struct ifaddr *ifa_ifpgetprimary(struct ifnet *, int); -void ifafree(struct ifaddr *); -void ifaref(struct ifaddr *); - -struct ifmultiaddr *ifmaof_ifpforaddr(const struct sockaddr *, struct ifnet *); - -extern struct in_ifaddr *ifa_foraddr(unsigned int); -extern struct in_ifaddr *ifa_foraddr_scoped(unsigned int, unsigned int); - -#ifdef BSD_KERNEL_PRIVATE -enum { - kIfNetUseCount_MayBeZero = 0, - kIfNetUseCount_MustNotBeZero = 1 -}; - -int ifp_use(struct ifnet *ifp, int handle_zero); -int ifp_unuse(struct ifnet *ifp); -void ifp_use_reached_zero(struct ifnet *ifp); - -void if_data_internal_to_if_data(struct ifnet *ifp, const struct if_data_internal *if_data_int, - struct if_data *if_data); -void if_data_internal_to_if_data64(struct ifnet *ifp, const struct if_data_internal *if_data_int, - struct if_data64 *if_data64); -#endif /* BSD_KERNEL_PRIVATE */ -#endif /* KERNEL_PRIVATE */ +extern struct ifaddr *ifaof_ifpforaddr(const struct sockaddr *, struct ifnet *); +__private_extern__ struct ifaddr *ifa_ifpgetprimary(struct ifnet *, int); +extern void ifa_addref(struct ifaddr *, int); +extern struct ifaddr *ifa_remref(struct ifaddr *, int); +extern void ifa_lock_init(struct ifaddr *); +extern void ifa_lock_destroy(struct ifaddr *); +extern void ifma_addref(struct ifmultiaddr *, int); +extern void 
ifma_remref(struct ifmultiaddr *); + +extern void ifa_init(void); + +__private_extern__ struct in_ifaddr *ifa_foraddr(unsigned int); +__private_extern__ struct in_ifaddr *ifa_foraddr_scoped(unsigned int, + unsigned int); + +#if INET6 +struct in6_addr; +__private_extern__ struct in6_ifaddr *ifa_foraddr6(struct in6_addr *); +__private_extern__ struct in6_ifaddr *ifa_foraddr6_scoped(struct in6_addr *, + unsigned int); +#endif /* INET6 */ + +__private_extern__ void if_data_internal_to_if_data(struct ifnet *ifp, + const struct if_data_internal *if_data_int, struct if_data *if_data); +__private_extern__ void if_data_internal_to_if_data64(struct ifnet *ifp, + const struct if_data_internal *if_data_int, struct if_data64 *if_data64); +__private_extern__ void if_copy_traffic_class(struct ifnet *ifp, + struct if_traffic_class *if_tc); + +__private_extern__ struct rtentry *ifnet_cached_rtlookup_inet(struct ifnet *, + struct in_addr); +#if INET6 +__private_extern__ struct rtentry *ifnet_cached_rtlookup_inet6(struct ifnet *, + struct in6_addr *); +#endif /* INET6 */ + +#endif /* XNU_KERNEL_PRIVATE */ #endif /* !_NET_IF_VAR_H_ */ diff --git a/bsd/net/if_vlan.c b/bsd/net/if_vlan.c index 0179f102c..cf090602d 100644 --- a/bsd/net/if_vlan.c +++ b/bsd/net/if_vlan.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2010 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2003-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -79,6 +79,7 @@ #include #include #include +#include #include #include @@ -185,8 +186,14 @@ LIST_HEAD(vlan_parent_list, vlan_parent); struct ifvlan; LIST_HEAD(ifvlan_list, ifvlan); +typedef LIST_ENTRY(vlan_parent) +vlan_parent_entry; +typedef LIST_ENTRY(ifvlan) +ifvlan_entry; + +#define VLP_SIGNATURE 0xfaceface typedef struct vlan_parent { - LIST_ENTRY(vlan_parent) vlp_parent_list;/* list of parents */ + vlan_parent_entry vlp_parent_list;/* list of parents */ struct ifnet * vlp_ifp; /* interface */ struct ifvlan_list vlp_vlan_list; /* list of VLAN's */ #define VLPF_SUPPORTS_VLAN_MTU 0x1 @@ -195,10 +202,12 @@ typedef struct vlan_parent { u_int32_t vlp_flags; struct ifdevmtu vlp_devmtu; SInt32 vlp_retain_count; + UInt32 vlp_signature; /* VLP_SIGNATURE */ } vlan_parent, * vlan_parent_ref; +#define IFV_SIGNATURE 0xbeefbeef struct ifvlan { - LIST_ENTRY(ifvlan) ifv_vlan_list; + ifvlan_entry ifv_vlan_list; char ifv_name[IFNAMSIZ]; /* our unique id */ struct ifnet * ifv_ifp; /* our interface */ vlan_parent_ref ifv_vlp; /* parent information */ @@ -215,6 +224,8 @@ struct ifvlan { u_int32_t ifv_flags; bpf_packet_func ifv_bpf_input; bpf_packet_func ifv_bpf_output; + SInt32 ifv_retain_count; + UInt32 ifv_signature; /* IFV_SIGNATURE */ }; typedef struct ifvlan * ifvlan_ref; @@ -230,6 +241,11 @@ static vlan_globals_ref g_vlan; #define ifv_encaplen ifv_mib.ifvm_encaplen #define ifv_mtufudge ifv_mib.ifvm_mtufudge +static void +vlan_parent_retain(vlan_parent_ref vlp); + +static void +vlan_parent_release(vlan_parent_ref vlp); /** ** vlan_parent_ref vlp_flags in-lines @@ -363,12 +379,10 @@ static int vlan_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, static int vlan_attach_protocol(struct ifnet *ifp); static int vlan_detach_protocol(struct ifnet *ifp); static int vlan_setmulti(struct ifnet *ifp); -static int vlan_unconfig(struct ifnet *ifp); +static int vlan_unconfig(ifvlan_ref ifv, int need_to_wait); static int vlan_config(struct ifnet 
* ifp, struct ifnet * p, int tag); static void vlan_if_free(struct ifnet * ifp); -static void vlan_remove(ifvlan_ref ifv); -static void vlan_if_detach(struct ifnet * ifp); -static int vlan_new_mtu(struct ifnet * ifp, int mtu); +static int vlan_remove(ifvlan_ref ifv, int need_to_wait); static struct if_clone vlan_cloner = IF_CLONE_INITIALIZER(VLANNAME, vlan_clone_create, @@ -376,9 +390,118 @@ static struct if_clone vlan_cloner = IF_CLONE_INITIALIZER(VLANNAME, 0, IF_MAXUNIT); static void interface_link_event(struct ifnet * ifp, u_int32_t event_code); -static void vlan_parent_link_event(vlan_parent_ref vlp, +static void vlan_parent_link_event(struct ifnet * p, u_int32_t event_code); -extern void dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m); + +static int ifvlan_new_mtu(ifvlan_ref ifv, int mtu); + +/** + ** ifvlan_ref routines + **/ +static void +ifvlan_retain(ifvlan_ref ifv) +{ + if (ifv->ifv_signature != IFV_SIGNATURE) { + panic("ifvlan_retain: bad signature\n"); + } + if (ifv->ifv_retain_count == 0) { + panic("ifvlan_retain: retain count is 0\n"); + } + OSIncrementAtomic(&ifv->ifv_retain_count); +} + +static void +ifvlan_release(ifvlan_ref ifv) +{ + UInt32 old_retain_count; + + if (ifv->ifv_signature != IFV_SIGNATURE) { + panic("ifvlan_release: bad signature\n"); + } + old_retain_count = OSDecrementAtomic(&ifv->ifv_retain_count); + switch (old_retain_count) { + case 0: + panic("ifvlan_release: retain count is 0\n"); + break; + case 1: + if (g_vlan->verbose) { + printf("ifvlan_release(%s)\n", ifv->ifv_name); + } + ifv->ifv_signature = 0; + FREE(ifv, M_VLAN); + break; + default: + break; + } + return; +} + +static vlan_parent_ref +ifvlan_get_vlan_parent_retained(ifvlan_ref ifv) +{ + vlan_parent_ref vlp = ifv->ifv_vlp; + + if (vlan_parent_flags_detaching(vlp)) { + return (NULL); + } + vlan_parent_retain(vlp); + return (vlp); +} + +/** + ** ifnet_* routines + **/ + +static ifvlan_ref +ifnet_get_ifvlan(struct ifnet * ifp) +{ + ifvlan_ref ifv; + + ifv = 
(ifvlan_ref)ifnet_softc(ifp); + return (ifv); +} + +static ifvlan_ref +ifnet_get_ifvlan_retained(struct ifnet * ifp) +{ + ifvlan_ref ifv; + + ifv = ifnet_get_ifvlan(ifp); + if (ifv == NULL) { + return (NULL); + } + if (ifvlan_flags_detaching(ifv)) { + return (NULL); + } + ifvlan_retain(ifv); + return (ifv); +} + +static int +ifnet_ifvlan_vlan_parent_ok(struct ifnet * ifp, ifvlan_ref ifv, + vlan_parent_ref vlp) +{ + ifvlan_ref check_ifv; + + check_ifv = ifnet_get_ifvlan(ifp); + if (check_ifv != ifv || ifvlan_flags_detaching(ifv)) { + /* ifvlan_ref no longer valid */ + return (FALSE); + } + if (ifv->ifv_vlp != vlp) { + /* vlan_parent no longer valid */ + return (FALSE); + } + if (vlan_parent_flags_detaching(vlp)) { + /* parent is detaching */ + return (FALSE); + } + return (TRUE); +} + +/** + ** vlan, etc. routines + **/ static int vlan_globals_init(void) @@ -471,17 +594,26 @@ vlan_bpf_input(struct ifnet * ifp, struct mbuf * m, /** ** vlan_parent synchronization routines **/ -static __inline__ void +static void vlan_parent_retain(vlan_parent_ref vlp) { + if (vlp->vlp_signature != VLP_SIGNATURE) { + panic("vlan_parent_retain: signature is bad\n"); + } + if (vlp->vlp_retain_count == 0) { + panic("vlan_parent_retain: retain count is 0\n"); + } OSIncrementAtomic(&vlp->vlp_retain_count); } -static __inline__ void +static void vlan_parent_release(vlan_parent_ref vlp) { UInt32 old_retain_count; + if (vlp->vlp_signature != VLP_SIGNATURE) { + panic("vlan_parent_release: signature is bad\n"); + } old_retain_count = OSDecrementAtomic(&vlp->vlp_retain_count); switch (old_retain_count) { case 0: @@ -493,6 +625,7 @@ vlan_parent_release(vlan_parent_ref vlp) printf("vlan_parent_release(%s%d)\n", ifnet_name(ifp), ifnet_unit(ifp)); } + vlp->vlp_signature = 0; FREE(vlp, M_VLAN); break; default: @@ -561,7 +694,6 @@ vlan_parent_signal(vlan_parent_ref vlp, const char * msg) return; } - /* * Program our multicast filter. 
What we're actually doing is * programming the multicast filter of the parent. This has the @@ -576,35 +708,22 @@ vlan_setmulti(struct ifnet * ifp) int error = 0; ifvlan_ref ifv; struct ifnet * p; - vlan_parent_ref vlp; + vlan_parent_ref vlp = NULL; vlan_lock(); - ifv = (ifvlan_ref)ifnet_softc(ifp); - if (ifv == NULL || ifvlan_flags_detaching(ifv)) { + ifv = ifnet_get_ifvlan_retained(ifp); + if (ifv == NULL) { goto unlock_done; } - vlp = ifv->ifv_vlp; + vlp = ifvlan_get_vlan_parent_retained(ifv); if (vlp == NULL) { /* no parent, no need to program the multicast filter */ goto unlock_done; } - if (vlan_parent_flags_detaching(vlp)) { - goto unlock_done; - } - vlan_parent_retain(vlp); vlan_parent_wait(vlp, "vlan_setmulti"); /* check again, things could have changed */ - ifv = (ifvlan_ref)ifnet_softc(ifp); - if (ifv == NULL || ifvlan_flags_detaching(ifv)) { - goto signal_done; - } - if (ifv->ifv_vlp != vlp) { - /* vlan parent changed */ - goto signal_done; - } - if (vlp == NULL) { - /* no parent, no need to program the multicast filter */ + if (ifnet_ifvlan_vlan_parent_ok(ifp, ifv, vlp) == FALSE) { goto signal_done; } p = vlp->vlp_ifp; @@ -620,6 +739,12 @@ vlan_setmulti(struct ifnet * ifp) unlock_done: vlan_unlock(); + if (ifv != NULL) { + ifvlan_release(ifv); + } + if (vlp != NULL) { + vlan_parent_release(vlp); + } return (error); } @@ -711,7 +836,8 @@ vlan_parent_create(struct ifnet * p, vlan_parent_ref * ret_vlp) } LIST_INIT(&vlp->vlp_vlan_list); vlp->vlp_ifp = p; - vlan_parent_retain(vlp); + vlp->vlp_retain_count = 1; + vlp->vlp_signature = VLP_SIGNATURE; if (ifnet_offload(p) & (IF_HWASSIST_VLAN_MTU | IF_HWASSIST_VLAN_TAGGING)) { vlan_parent_flags_set_supports_vlan_mtu(vlp); @@ -721,28 +847,57 @@ vlan_parent_create(struct ifnet * p, vlan_parent_ref * ret_vlp) } static void -vlan_parent_remove_all_vlans(vlan_parent_ref vlp) +vlan_parent_remove_all_vlans(struct ifnet * p) { ifvlan_ref ifv; - struct ifnet * p; - - vlan_assert_lock_held(); + int need_vlp_release = 0; + 
ifvlan_ref next; + vlan_parent_ref vlp; - while ((ifv = LIST_FIRST(&vlp->vlp_vlan_list)) != NULL) { - vlan_remove(ifv); + vlan_lock(); + vlp = parent_list_lookup(p); + if (vlp == NULL || vlan_parent_flags_detaching(vlp)) { + /* no VLAN's */ vlan_unlock(); - vlan_if_detach(ifv->ifv_ifp); - vlan_lock(); + return; + } + vlan_parent_flags_set_detaching(vlp); + vlan_parent_retain(vlp); + vlan_parent_wait(vlp, "vlan_parent_remove_all_vlans"); + need_vlp_release++; + vlp = parent_list_lookup(p); + /* check again */ + if (vlp == NULL) { + goto signal_done; + } + + for (ifv = LIST_FIRST(&vlp->vlp_vlan_list); ifv != NULL; ifv = next) { + struct ifnet * ifp = ifv->ifv_ifp; + int removed; + + next = LIST_NEXT(ifv, ifv_vlan_list); + removed = vlan_remove(ifv, FALSE); + if (removed) { + vlan_unlock(); + ifnet_detach(ifp); + vlan_lock(); + } } /* the vlan parent has no more VLAN's */ - p = vlp->vlp_ifp; ifnet_set_eflags(p, 0, IFEF_VLAN); /* clear IFEF_VLAN */ + LIST_REMOVE(vlp, vlp_parent_list); + need_vlp_release++; /* one for being in the list */ + need_vlp_release++; /* final reference */ + + signal_done: + vlan_parent_signal(vlp, "vlan_parent_remove_all_vlans"); vlan_unlock(); - vlan_parent_release(vlp); - vlan_lock(); + while (need_vlp_release--) { + vlan_parent_release(vlp); + } return; } @@ -797,13 +952,16 @@ vlan_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params) if (ifv == NULL) return ENOBUFS; bzero(ifv, sizeof(struct ifvlan)); + ifv->ifv_retain_count = 1; + ifv->ifv_signature = IFV_SIGNATURE; multicast_list_init(&ifv->ifv_multicast); /* use the interface name as the unique id for ifp recycle */ - if ((unsigned int)snprintf(ifv->ifv_name, sizeof(ifv->ifv_name), "%s%d", - ifc->ifc_name, unit) >= sizeof(ifv->ifv_name)) { - FREE(ifv, M_VLAN); - return (EINVAL); + if ((unsigned int) + snprintf(ifv->ifv_name, sizeof(ifv->ifv_name), "%s%d", + ifc->ifc_name, unit) >= sizeof(ifv->ifv_name)) { + ifvlan_release(ifv); + return (EINVAL); } bzero(&vlan_init, 
sizeof(vlan_init)); @@ -828,16 +986,10 @@ vlan_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params) error = ifnet_allocate(&vlan_init, &ifp); if (error) { - FREE(ifv, M_VLAN); - return (error); + ifvlan_release(ifv); + return (error); } -#if 0 - /* NB: flags are not set here */ - ifnet_set_link_mib_data(ifp, &ifv->ifv_mib, sizeof ifv->ifv_mib); - /* NB: mtu is not set here */ -#endif - ifnet_set_offload(ifp, 0); ifnet_set_addrlen(ifp, ETHER_ADDR_LEN); /* XXX ethernet specific */ ifnet_set_baudrate(ifp, 0); @@ -845,9 +997,9 @@ vlan_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params) error = ifnet_attach(ifp, NULL); if (error) { - ifnet_release(ifp); - FREE(ifv, M_VLAN); - return (error); + ifnet_release(ifp); + ifvlan_release(ifv); + return (error); } ifv->ifv_ifp = ifp; @@ -856,21 +1008,18 @@ vlan_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params) return (0); } -static void -vlan_remove(ifvlan_ref ifv) +static int +vlan_remove(ifvlan_ref ifv, int need_to_wait) { vlan_assert_lock_held(); + if (ifvlan_flags_detaching(ifv)) { + return (0); + } ifvlan_flags_set_detaching(ifv); - vlan_unconfig(ifv->ifv_ifp); - return; + vlan_unconfig(ifv, need_to_wait); + return (1); } -static void -vlan_if_detach(struct ifnet * ifp) -{ - ifnet_detach(ifp); - return; -} static int vlan_clone_destroy(struct ifnet *ifp) @@ -878,18 +1027,19 @@ vlan_clone_destroy(struct ifnet *ifp) ifvlan_ref ifv; vlan_lock(); - ifv = ifnet_softc(ifp); - if (ifv == NULL || ifnet_type(ifp) != IFT_L2VLAN) { + ifv = ifnet_get_ifvlan_retained(ifp); + if (ifv == NULL) { vlan_unlock(); return 0; } - if (ifvlan_flags_detaching(ifv)) { + if (vlan_remove(ifv, TRUE) == 0) { vlan_unlock(); + ifvlan_release(ifv); return 0; } - vlan_remove(ifv); vlan_unlock(); - vlan_if_detach(ifp); + ifvlan_release(ifv); + ifnet_detach(ifp); return 0; } @@ -900,8 +1050,8 @@ vlan_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func func) ifvlan_ref ifv; 
vlan_lock(); - ifv = ifnet_softc(ifp); - if (ifv == NULL || ifvlan_flags_detaching(ifv)) { + ifv = ifnet_get_ifvlan_retained(ifp); + if (ifv == NULL) { vlan_unlock(); return (ENODEV); } @@ -925,6 +1075,7 @@ vlan_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func func) break; } vlan_unlock(); + ifvlan_release(ifv); return 0; } @@ -938,7 +1089,7 @@ vlan_output(struct ifnet * ifp, struct mbuf * m) struct ifnet * p; int soft_vlan; u_short tag; - vlan_parent_ref vlp; + vlan_parent_ref vlp = NULL; if (m == 0) { return (0); @@ -948,18 +1099,13 @@ vlan_output(struct ifnet * ifp, struct mbuf * m) return (0); } vlan_lock(); - ifv = (ifvlan_ref)ifnet_softc(ifp); - if (ifv == NULL || ifvlan_flags_detaching(ifv) - || ifvlan_flags_ready(ifv) == 0) { - vlan_unlock(); - m_freem_list(m); - return (0); + ifv = ifnet_get_ifvlan_retained(ifp); + if (ifv == NULL || ifvlan_flags_ready(ifv) == 0) { + goto unlock_done; } - vlp = ifv->ifv_vlp; + vlp = ifvlan_get_vlan_parent_retained(ifv); if (vlp == NULL) { - vlan_unlock(); - m_freem_list(m); - return (0); + goto unlock_done; } p = vlp->vlp_ifp; (void)ifnet_stat_increment_out(ifp, 1, m->m_pkthdr.len, 0); @@ -968,12 +1114,16 @@ vlan_output(struct ifnet * ifp, struct mbuf * m) tag = ifv->ifv_tag; encaplen = ifv->ifv_encaplen; vlan_unlock(); + + ifvlan_release(ifv); + vlan_parent_release(vlp); + vlan_bpf_output(ifp, m, bpf_func); /* do not run parent's if_output() if the parent is not up */ if ((ifnet_flags(p) & (IFF_UP | IFF_RUNNING)) != (IFF_UP | IFF_RUNNING)) { m_freem(m); - ifp->if_collisions++; + atomic_add_64(&ifp->if_collisions, 1); return (0); } /* @@ -992,7 +1142,7 @@ vlan_output(struct ifnet * ifp, struct mbuf * m) if (m == NULL) { printf("%s%d: unable to prepend VLAN header\n", ifnet_name(ifp), ifnet_unit(ifp)); - ifp->if_oerrors++; + atomic_add_64(&ifp->if_oerrors, 1); return (0); } /* M_PREPEND takes care of m_len, m_pkthdr.len for us */ @@ -1001,7 +1151,7 @@ vlan_output(struct ifnet * ifp, struct mbuf * m) if (m == 
NULL) { printf("%s%d: unable to pullup VLAN header\n", ifnet_name(ifp), ifnet_unit(ifp)); - ifp->if_oerrors++; + atomic_add_64(&ifp->if_oerrors, 1); return (0); } } @@ -1017,7 +1167,19 @@ vlan_output(struct ifnet * ifp, struct mbuf * m) evl->evl_encap_proto = htons(ETHERTYPE_VLAN); evl->evl_tag = htons(tag); } - return ifnet_output_raw(p, PF_VLAN, m); + return (ifnet_output_raw(p, PF_VLAN, m)); + + unlock_done: + vlan_unlock(); + if (ifv != NULL) { + ifvlan_release(ifv); + } + if (vlp != NULL) { + vlan_parent_release(vlp); + } + m_freem_list(m); + return (0); + } static int @@ -1120,20 +1282,17 @@ vlan_input(ifnet_t p, __unused protocol_family_t protocol, return 0; } -#define VLAN_CONFIG_PROGRESS_VLP_RETAINED 0x1 -#define VLAN_CONFIG_PROGRESS_IN_LIST 0x2 - static int vlan_config(struct ifnet * ifp, struct ifnet * p, int tag) { int error; - int first_vlan = 0; + int first_vlan = FALSE; ifvlan_ref ifv = NULL; - vlan_parent_ref new_vlp = NULL; + int ifv_added = FALSE; int need_vlp_release = 0; + vlan_parent_ref new_vlp = NULL; ifnet_offload_t offload; u_int16_t parent_flags; - u_int32_t progress = 0; vlan_parent_ref vlp = NULL; /* pre-allocate space for vlan_parent, in case we're first */ @@ -1143,14 +1302,19 @@ vlan_config(struct ifnet * ifp, struct ifnet * p, int tag) } vlan_lock(); - ifv = (ifvlan_ref)ifnet_softc(ifp); - if (ifv != NULL && ifv->ifv_vlp != NULL) { + ifv = ifnet_get_ifvlan_retained(ifp); + if (ifv == NULL || ifv->ifv_vlp != NULL) { vlan_unlock(); + if (ifv != NULL) { + ifvlan_release(ifv); + } vlan_parent_release(new_vlp); return (EBUSY); } vlp = parent_list_lookup(p); if (vlp != NULL) { + vlan_parent_retain(vlp); + need_vlp_release++; if (vlan_parent_lookup_tag(vlp, tag) != NULL) { /* already a VLAN with that tag on this interface */ error = EADDRINUSE; @@ -1158,28 +1322,38 @@ vlan_config(struct ifnet * ifp, struct ifnet * p, int tag) } } else { + /* one for being in the list */ + vlan_parent_retain(new_vlp); + /* we're the first VLAN on this 
interface */ LIST_INSERT_HEAD(&g_vlan->parent_list, new_vlp, vlp_parent_list); vlp = new_vlp; + + vlan_parent_retain(vlp); + need_vlp_release++; } /* need to wait to ensure no one else is trying to add/remove */ - vlan_parent_retain(vlp); - progress |= VLAN_CONFIG_PROGRESS_VLP_RETAINED; vlan_parent_wait(vlp, "vlan_config"); - ifv = (ifvlan_ref)ifnet_softc(ifp); - if (ifv == NULL) { - error = EOPNOTSUPP; + if (ifnet_get_ifvlan(ifp) != ifv) { + error = EINVAL; goto signal_done; } + + /* check again because someone might have gotten in */ + if (parent_list_lookup(p) != vlp) { + error = EBUSY; + goto signal_done; + } + if (vlan_parent_flags_detaching(vlp) || ifvlan_flags_detaching(ifv) || ifv->ifv_vlp != NULL) { error = EBUSY; goto signal_done; } - /* check again because someone might have gotten in */ + /* check again because someone might have gotten the tag */ if (vlan_parent_lookup_tag(vlp, tag) != NULL) { /* already a VLAN with that tag on this interface */ error = EADDRINUSE; @@ -1187,10 +1361,11 @@ vlan_config(struct ifnet * ifp, struct ifnet * p, int tag) } if (vlan_parent_no_vlans(vlp)) { - first_vlan = 1; + first_vlan = TRUE; } vlan_parent_add_vlan(vlp, ifv, tag); - progress |= VLAN_CONFIG_PROGRESS_IN_LIST; + ifvlan_retain(ifv); /* parent references ifv */ + ifv_added = TRUE; /* check whether bond interface is using parent interface */ ifnet_lock_exclusive(p); @@ -1271,34 +1446,44 @@ vlan_config(struct ifnet * ifp, struct ifnet * p, int tag) /* throw it away, it wasn't needed */ vlan_parent_release(new_vlp); } + if (ifv != NULL) { + ifvlan_release(ifv); + } return 0; signal_done: vlan_assert_lock_held(); - vlan_parent_signal(vlp, "vlan_config"); - unlock_done: - if ((progress & VLAN_CONFIG_PROGRESS_IN_LIST) != 0) { + if (ifv_added) { vlan_parent_remove_vlan(vlp, ifv); + if (!vlan_parent_flags_detaching(vlp) && vlan_parent_no_vlans(vlp)) { + /* the vlan parent has no more VLAN's */ + ifnet_set_eflags(p, 0, IFEF_VLAN); + LIST_REMOVE(vlp, vlp_parent_list); + /* 
release outside of the lock below */ + need_vlp_release++; + + /* one for being in the list */ + need_vlp_release++; + } } - if (!vlan_parent_flags_detaching(vlp) && vlan_parent_no_vlans(vlp)) { - /* the vlan parent has no more VLAN's */ - ifnet_set_eflags(p, 0, IFEF_VLAN); - LIST_REMOVE(vlp, vlp_parent_list); - /* release outside of the lock below */ - need_vlp_release = 1; - } + vlan_parent_signal(vlp, "vlan_config"); + + unlock_done: vlan_unlock(); - if ((progress & VLAN_CONFIG_PROGRESS_VLP_RETAINED) != 0) { - vlan_parent_release(vlp); - } - if (need_vlp_release) { + while (need_vlp_release--) { vlan_parent_release(vlp); } if (new_vlp != vlp) { vlan_parent_release(new_vlp); } + if (ifv != NULL) { + if (ifv_added) { + ifvlan_release(ifv); + } + ifvlan_release(ifv); + } return (error); } @@ -1311,7 +1496,7 @@ vlan_link_event(struct ifnet * ifp, struct ifnet * p) bzero(&ifmr, sizeof(ifmr)); snprintf(ifmr.ifm_name, sizeof(ifmr.ifm_name), "%s%d", ifnet_name(p), ifnet_unit(p)); - if (ifnet_ioctl(p, 0, SIOCGIFMEDIA, &ifmr) == 0 + if (ifnet_ioctl(p, 0, SIOCGIFMEDIA, &ifmr) == 0 && ifmr.ifm_count > 0 && ifmr.ifm_status & IFM_AVALID) { u_int32_t event; @@ -1323,36 +1508,36 @@ vlan_link_event(struct ifnet * ifp, struct ifnet * p) } static int -vlan_unconfig(struct ifnet * ifp) +vlan_unconfig(ifvlan_ref ifv, int need_to_wait) { - int error = 0; - ifvlan_ref ifv; - int last_vlan = 0; + struct ifnet * ifp = ifv->ifv_ifp; + int last_vlan = FALSE; + int need_ifv_release = 0; int need_vlp_release = 0; struct ifnet * p; vlan_parent_ref vlp; vlan_assert_lock_held(); - ifv = (ifvlan_ref)ifnet_softc(ifp); - if (ifv == NULL) { - return (0); - } vlp = ifv->ifv_vlp; if (vlp == NULL) { return (0); } - vlan_parent_retain(vlp); - vlan_parent_wait(vlp, "vlan_unconfig"); + if (need_to_wait) { + need_vlp_release++; + vlan_parent_retain(vlp); + vlan_parent_wait(vlp, "vlan_unconfig"); - /* check again because another thread could be in vlan_unconfig */ - ifv = (ifvlan_ref)ifnet_softc(ifp); - 
if (ifv == NULL) { - goto signal_done; - } - if (ifv->ifv_vlp != vlp) { - /* vlan parent changed */ - goto signal_done; + /* check again because another thread could be in vlan_unconfig */ + if (ifv != ifnet_get_ifvlan(ifp)) { + goto signal_done; + } + if (ifv->ifv_vlp != vlp) { + /* vlan parent changed */ + goto signal_done; + } } + + /* ifv has a reference on vlp, need to remove it */ need_vlp_release++; p = vlp->vlp_ifp; @@ -1362,56 +1547,67 @@ vlan_unconfig(struct ifnet * ifp) printf("vlan_unconfig: last vlan on %s%d\n", ifnet_name(p), ifnet_unit(p)); } - last_vlan = 1; + last_vlan = TRUE; } /* back-out any effect our mtu might have had on the parent */ - (void)vlan_new_mtu(ifp, ETHERMTU - ifv->ifv_mtufudge); + (void)ifvlan_new_mtu(ifv, ETHERMTU - ifv->ifv_mtufudge); vlan_unlock(); - /* detach VLAN "protocol" */ - if (last_vlan) { - (void)vlan_detach_protocol(p); - } - /* un-join multicast on parent interface */ (void)multicast_list_remove(&ifv->ifv_multicast); /* Clear our MAC address. */ ifnet_set_lladdr_and_type(ifp, NULL, 0, IFT_L2VLAN); - vlan_lock(); + /* detach VLAN "protocol" */ + if (last_vlan) { + (void)vlan_detach_protocol(p); + } - /* Disconnect from parent. */ - vlan_parent_remove_vlan(vlp, ifv); + vlan_lock(); /* return to the state we were in before SIFVLAN */ ifnet_set_mtu(ifp, 0); ifnet_set_flags(ifp, 0, IFF_BROADCAST | IFF_MULTICAST | IFF_SIMPLEX | IFF_RUNNING); ifnet_set_offload(ifp, 0); - ifv->ifv_flags = 0; ifv->ifv_mtufudge = 0; - if (!vlan_parent_flags_detaching(vlp) && vlan_parent_no_vlans(vlp)) { + /* Disconnect from parent. 
*/ + vlan_parent_remove_vlan(vlp, ifv); + ifv->ifv_flags = 0; + + /* vlan_parent has reference to ifv, remove it */ + need_ifv_release++; + + /* from this point on, no more referencing ifv */ + if (last_vlan && !vlan_parent_flags_detaching(vlp)) { /* the vlan parent has no more VLAN's */ ifnet_set_eflags(p, 0, IFEF_VLAN); LIST_REMOVE(vlp, vlp_parent_list); + + /* one for being in the list */ + need_vlp_release++; + /* release outside of the lock below */ need_vlp_release++; } signal_done: - vlan_parent_signal(vlp, "vlan_unconfig"); + if (need_to_wait) { + vlan_parent_signal(vlp, "vlan_unconfig"); + } vlan_unlock(); - vlan_parent_release(vlp); /* one because we waited */ - - while (need_vlp_release--) { + while (need_ifv_release--) { + ifvlan_release(ifv); + } + while (need_vlp_release--) { /* references to vlp */ vlan_parent_release(vlp); } vlan_lock(); - return (error); + return (0); } static int @@ -1422,9 +1618,9 @@ vlan_set_promisc(struct ifnet * ifp) vlan_parent_ref vlp; vlan_lock(); - ifv = (ifvlan_ref)ifnet_softc(ifp); - if (ifv == NULL || ifvlan_flags_detaching(ifv)) { - error = (ifv == NULL) ? 
EOPNOTSUPP : EBUSY; + ifv = ifnet_get_ifvlan_retained(ifp); + if (ifv == NULL) { + error = EBUSY; goto done; } @@ -1449,22 +1645,24 @@ vlan_set_promisc(struct ifnet * ifp) } done: vlan_unlock(); + if (ifv != NULL) { + ifvlan_release(ifv); + } return (error); } static int -vlan_new_mtu(struct ifnet * ifp, int mtu) +ifvlan_new_mtu(ifvlan_ref ifv, int mtu) { struct ifdevmtu * devmtu_p; int error = 0; - ifvlan_ref ifv; + struct ifnet * ifp = ifv->ifv_ifp; int max_mtu; int new_mtu = 0; int req_mtu; vlan_parent_ref vlp; vlan_assert_lock_held(); - ifv = (ifvlan_ref)ifnet_softc(ifp); vlp = ifv->ifv_vlp; devmtu_p = &vlp->vlp_devmtu; req_mtu = mtu + ifv->ifv_mtufudge; @@ -1504,44 +1702,45 @@ vlan_set_mtu(struct ifnet * ifp, int mtu) return (EINVAL); } vlan_lock(); - ifv = (ifvlan_ref)ifnet_softc(ifp); - if (ifv == NULL || ifvlan_flags_detaching(ifv)) { + ifv = ifnet_get_ifvlan_retained(ifp); + if (ifv == NULL) { vlan_unlock(); - return ((ifv == NULL) ? EOPNOTSUPP : EBUSY); + return (EBUSY); } - vlp = ifv->ifv_vlp; - if (vlp == NULL || vlan_parent_flags_detaching(vlp)) { + vlp = ifvlan_get_vlan_parent_retained(ifv); + if (vlp == NULL) { vlan_unlock(); + ifvlan_release(ifv); if (mtu != 0) { return (EINVAL); } return (0); } - vlan_parent_retain(vlp); vlan_parent_wait(vlp, "vlan_set_mtu"); /* check again, something might have changed */ - ifv = (ifvlan_ref)ifnet_softc(ifp); - if (ifv == NULL || ifvlan_flags_detaching(ifv)) { - error = (ifv == NULL) ? 
EOPNOTSUPP : EBUSY; + if (ifnet_get_ifvlan(ifp) != ifv + || ifvlan_flags_detaching(ifv)) { + error = EBUSY; goto signal_done; } if (ifv->ifv_vlp != vlp) { /* vlan parent changed */ goto signal_done; } - if (vlp == NULL || vlan_parent_flags_detaching(vlp)) { + if (vlan_parent_flags_detaching(vlp)) { if (mtu != 0) { error = EINVAL; } goto signal_done; } - error = vlan_new_mtu(ifp, mtu); + error = ifvlan_new_mtu(ifv, mtu); signal_done: vlan_parent_signal(vlp, "vlan_set_mtu"); vlan_unlock(); vlan_parent_release(vlp); + ifvlan_release(ifv); return (error); } @@ -1685,7 +1884,10 @@ vlan_ioctl(ifnet_t ifp, u_long cmd, void * data) /* generate a link event based on the state of the parent */ vlan_link_event(ifp, p); - } else { + } + else { + int need_link_event = FALSE; + vlan_lock(); ifv = (ifvlan_ref)ifnet_softc(ifp); if (ifv == NULL || ifvlan_flags_detaching(ifv)) { @@ -1693,9 +1895,9 @@ vlan_ioctl(ifnet_t ifp, u_long cmd, void * data) error = (ifv == NULL ? EOPNOTSUPP : EBUSY); break; } - error = vlan_unconfig(ifp); + need_link_event = vlan_remove(ifv, TRUE); vlan_unlock(); - if (error == 0) { + if (need_link_event) { interface_link_event(ifp, KEV_DL_LINK_OFF); } } @@ -1748,22 +1950,20 @@ vlan_if_free(struct ifnet * ifp) if (ifp == NULL) { return; } - vlan_lock(); ifv = (ifvlan_ref)ifnet_softc(ifp); if (ifv == NULL) { - vlan_unlock(); return; } - vlan_unlock(); + ifvlan_release(ifv); ifnet_release(ifp); - FREE(ifv, M_VLAN); + return; } static void vlan_event(struct ifnet * p, __unused protocol_family_t protocol, const struct kev_msg * event) { - vlan_parent_ref vlp; + int event_code; /* Check if the interface we are attached to is being detached */ if (event->vendor_code != KEV_VENDOR_APPLE @@ -1771,43 +1971,28 @@ vlan_event(struct ifnet * p, __unused protocol_family_t protocol, || event->kev_subclass != KEV_DL_SUBCLASS) { return; } - switch (event->event_code) { - case KEV_DL_IF_DETACHING: + event_code = event->event_code; + switch (event_code) { case KEV_DL_LINK_OFF: 
case KEV_DL_LINK_ON: + vlan_parent_link_event(p, event_code); break; default: return; } - vlan_lock(); - if ((ifnet_eflags(p) & IFEF_VLAN) == 0) { - vlan_unlock(); - /* no VLAN's */ - return; - } - vlp = parent_list_lookup(p); - if (vlp == NULL) { - /* no VLAN's */ - vlan_unlock(); - return; - } - switch (event->event_code) { - case KEV_DL_IF_DETACHING: - vlan_parent_flags_set_detaching(vlp); - vlan_parent_remove_all_vlans(vlp); - break; - - case KEV_DL_LINK_OFF: - case KEV_DL_LINK_ON: - vlan_parent_link_event(vlp, event->event_code); - break; - default: - break; - } - vlan_unlock(); return; } +static errno_t +vlan_detached(ifnet_t p, __unused protocol_family_t protocol) +{ + if (ifnet_is_attached(p, 0) == 0) { + /* if the parent isn't attached, remove all VLANs */ + vlan_parent_remove_all_vlans(p); + } + return (0); +} + static void interface_link_event(struct ifnet * ifp, u_int32_t event_code) { @@ -1817,6 +2002,7 @@ interface_link_event(struct ifnet * ifp, u_int32_t event_code) char if_name[IFNAMSIZ]; } event; + bzero(&event, sizeof(event)); event.header.total_size = sizeof(event); event.header.vendor_code = KEV_VENDOR_APPLE; event.header.kev_class = KEV_NETWORK_CLASS; @@ -1830,13 +2016,45 @@ interface_link_event(struct ifnet * ifp, u_int32_t event_code) } static void -vlan_parent_link_event(vlan_parent_ref vlp, u_int32_t event_code) +vlan_parent_link_event(struct ifnet * p, u_int32_t event_code) { - ifvlan_ref ifv; + ifvlan_ref ifv; + vlan_parent_ref vlp; + vlan_lock(); + if ((ifnet_eflags(p) & IFEF_VLAN) == 0) { + vlan_unlock(); + /* no VLAN's */ + return; + } + vlp = parent_list_lookup(p); + if (vlp == NULL) { + /* no VLAN's */ + vlan_unlock(); + return; + } + + vlan_parent_retain(vlp); + vlan_parent_wait(vlp, "vlan_parent_link_event"); + if (vlan_parent_flags_detaching(vlp)) { + goto signal_done; + } + + vlan_unlock(); + + /* vlan_parent_wait() gives us exclusive access to the list */ LIST_FOREACH(ifv, &vlp->vlp_vlan_list, ifv_vlan_list) { - 
interface_link_event(ifv->ifv_ifp, event_code); + struct ifnet * ifp = ifv->ifv_ifp; + + interface_link_event(ifp, event_code); } + + vlan_lock(); + + signal_done: + vlan_parent_signal(vlp, "vlan_parent_link_event"); + vlan_unlock(); + vlan_parent_release(vlp); return; } @@ -1860,6 +2078,7 @@ vlan_attach_protocol(struct ifnet *ifp) bzero(®, sizeof(reg)); reg.input = vlan_input; reg.event = vlan_event; + reg.detached = vlan_detached; error = ifnet_attach_protocol(ifp, PF_VLAN, ®); if (error) { printf("vlan_proto_attach(%s%d) ifnet_attach_protocol failed, %d\n", diff --git a/bsd/net/kext_net.h b/bsd/net/kext_net.h index 6215515a3..48ade0710 100644 --- a/bsd/net/kext_net.h +++ b/bsd/net/kext_net.h @@ -46,48 +46,29 @@ * Internal implementation bits */ -struct socket_filter; - -#define SFEF_DETACHUSEZERO 0x1 /* Detach when use reaches zero */ -#define SFEF_UNREGISTERING 0x2 /* Remove due to unregister */ -#define SFEF_DETACHXREF 0x4 /* Extra reference held for detach */ - -struct socket_filter_entry { - struct socket_filter_entry *sfe_next_onsocket; - struct socket_filter_entry *sfe_next_onfilter; - - struct socket_filter *sfe_filter; - struct socket *sfe_socket; - void *sfe_cookie; - - u_int32_t sfe_flags; -}; - -#define SFF_DETACHING 0x1 - -struct socket_filter { - TAILQ_ENTRY(socket_filter) sf_protosw_next; - TAILQ_ENTRY(socket_filter) sf_global_next; - struct socket_filter_entry *sf_entry_head; - - struct protosw *sf_proto; - struct sflt_filter sf_filter; - u_int32_t sf_flags; - u_int32_t sf_usecount; -}; - -TAILQ_HEAD(socket_filter_list, socket_filter); - /* Private, internal implementation functions */ -void sflt_init(void) __attribute__((section("__TEXT, initcode"))); -void sflt_initsock(struct socket *so); -void sflt_termsock(struct socket *so); -void sflt_use(struct socket *so); -void sflt_unuse(struct socket *so); -void sflt_notify(struct socket *so, sflt_event_t event, void *param); -int sflt_data_in(struct socket *so, const struct sockaddr *from, mbuf_t 
*data, - mbuf_t *control, sflt_data_flag_t flags, int *filtered); -int sflt_attach_private(struct socket *so, struct socket_filter *filter, sflt_handle handle, int locked); +extern void sflt_init(void) __attribute__((section("__TEXT, initcode"))); +extern void sflt_initsock(struct socket *so); +extern void sflt_termsock(struct socket *so); +extern errno_t sflt_attach_internal(struct socket *so, sflt_handle handle); +extern void sflt_notify(struct socket *so, sflt_event_t event, void *param); +extern int sflt_ioctl(struct socket *so, u_long cmd, caddr_t data); +extern int sflt_bind(struct socket *so, const struct sockaddr *nam); +extern int sflt_listen(struct socket *so); +extern int sflt_accept(struct socket *head, struct socket *so, + const struct sockaddr *local, + const struct sockaddr *remote); +extern int sflt_getsockname(struct socket *so, struct sockaddr **local); +extern int sflt_getpeername(struct socket *so, struct sockaddr **remote); +extern int sflt_connectin(struct socket *head, const struct sockaddr *remote); +extern int sflt_connectout(struct socket *so, const struct sockaddr *nam); +extern int sflt_setsockopt(struct socket *so, struct sockopt *sopt); +extern int sflt_getsockopt(struct socket *so, struct sockopt *sopt); +extern int sflt_data_out(struct socket *so, const struct sockaddr *to, + mbuf_t *data, mbuf_t *control, + sflt_data_flag_t flags); +extern int sflt_data_in(struct socket *so, const struct sockaddr *from, + mbuf_t *data, mbuf_t *control, sflt_data_flag_t flags); #endif /* BSD_KERNEL_PRIVATE */ diff --git a/bsd/net/kpi_interface.c b/bsd/net/kpi_interface.c index e56564c58..82ba11b03 100644 --- a/bsd/net/kpi_interface.c +++ b/bsd/net/kpi_interface.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -55,9 +56,6 @@ #define TOUCHLASTCHANGE(__if_lastchange) microtime(__if_lastchange) #endif -extern struct dlil_threading_info *dlil_lo_thread_ptr; -extern int dlil_multithreaded_input; - static errno_t 
ifnet_list_get_common(ifnet_family_t, boolean_t, ifnet_t **, u_int32_t *); @@ -184,31 +182,15 @@ ifnet_allocate( } errno_t -ifnet_reference( - ifnet_t ifp) +ifnet_reference(ifnet_t ifp) { - int oldval; - - if (ifp == NULL) return EINVAL; - - oldval = OSIncrementAtomic(&ifp->if_refcnt); - - return 0; + return (dlil_if_ref(ifp)); } errno_t -ifnet_release( - ifnet_t ifp) +ifnet_release(ifnet_t ifp) { - int oldval; - - if (ifp == NULL) return EINVAL; - - oldval = OSDecrementAtomic(&ifp->if_refcnt); - if (oldval == 0) - panic("ifnet_release - refcount decremented past zero!"); - - return 0; + return (dlil_if_free(ifp)); } errno_t @@ -256,27 +238,22 @@ ifnet_index( } errno_t -ifnet_set_flags( - ifnet_t interface, - u_int16_t new_flags, - u_int16_t mask) +ifnet_set_flags(ifnet_t interface, u_int16_t new_flags, u_int16_t mask) { - int lock; - - if (interface == NULL) return EINVAL; - lock = (interface->if_lock != 0); - - if (lock) ifnet_lock_exclusive(interface); - + if (interface == NULL) + return (EINVAL); + + ifnet_lock_exclusive(interface); + /* If we are modifying the up/down state, call if_updown */ - if (lock && (mask & IFF_UP) != 0) { + if ((mask & IFF_UP) != 0) { if_updown(interface, (new_flags & IFF_UP) == IFF_UP); } - + interface->if_flags = (new_flags & mask) | (interface->if_flags & ~mask); - if (lock) ifnet_lock_done(interface); - - return 0; + ifnet_lock_done(interface); + + return (0); } u_int16_t @@ -287,21 +264,16 @@ ifnet_flags( } errno_t -ifnet_set_eflags( - ifnet_t interface, - u_int32_t new_flags, - u_int32_t mask) +ifnet_set_eflags(ifnet_t interface, u_int32_t new_flags, u_int32_t mask) { - int lock; - - if (interface == NULL) return EINVAL; - lock = (interface->if_lock != 0); - - if (lock) ifnet_lock_exclusive(interface); + if (interface == NULL) + return (EINVAL); + + ifnet_lock_exclusive(interface); interface->if_eflags = (new_flags & mask) | (interface->if_eflags & ~mask); - if (lock) ifnet_lock_done(interface); - - return 0; + 
ifnet_lock_done(interface); + + return (0); } u_int32_t @@ -312,19 +284,28 @@ ifnet_eflags( } errno_t -ifnet_set_idle_flags(ifnet_t ifp, u_int32_t new_flags, u_int32_t mask) +ifnet_set_idle_flags_locked(ifnet_t ifp, u_int32_t new_flags, u_int32_t mask) { -#if IFNET_ROUTE_REFCNT - int lock, before, after; + int before, after; if (ifp == NULL) return (EINVAL); - lck_mtx_lock(rnh_lock); + lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); + ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE); - lock = (ifp->if_lock != NULL); - if (lock) - ifnet_lock_exclusive(ifp); + /* + * If this is called prior to ifnet attach, the actual work will + * be done at attach time. Otherwise, if it is called after + * ifnet detach, then it is a no-op. + */ + if (!ifnet_is_attached(ifp, 0)) { + ifp->if_idle_new_flags = new_flags; + ifp->if_idle_new_flags_mask = mask; + return (0); + } else { + ifp->if_idle_new_flags = ifp->if_idle_new_flags_mask = 0; + } before = ifp->if_idle_flags; ifp->if_idle_flags = (new_flags & mask) | (ifp->if_idle_flags & ~mask); @@ -345,49 +326,140 @@ ifnet_set_idle_flags(ifnet_t ifp, u_int32_t new_flags, u_int32_t mask) rt_aggdrain(1); } - if (lock) - ifnet_lock_done(ifp); + return (0); +} + +errno_t +ifnet_set_idle_flags(ifnet_t ifp, u_int32_t new_flags, u_int32_t mask) +{ + errno_t err; + lck_mtx_lock(rnh_lock); + ifnet_lock_exclusive(ifp); + err = ifnet_set_idle_flags_locked(ifp, new_flags, mask); + ifnet_lock_done(ifp); lck_mtx_unlock(rnh_lock); - return (0); -#else -#pragma unused(ifp, new_flags, mask) - return (ENOTSUP); -#endif /* IFNET_ROUTE_REFCNT */ + return (err); } u_int32_t ifnet_idle_flags(ifnet_t ifp) { -#if IFNET_ROUTE_REFCNT return ((ifp == NULL) ? 
0 : ifp->if_idle_flags); -#else -#pragma unused(ifp) - return (0); -#endif /* IFNET_ROUTE_REFCNT */ +} + +errno_t ifnet_set_capabilities_supported(ifnet_t ifp, u_int32_t new_caps, + u_int32_t mask) +{ + errno_t error = 0; + int tmp; + + if (ifp == NULL) + return EINVAL; + + ifnet_lock_exclusive(ifp); + tmp = (new_caps & mask) | (ifp->if_capabilities & ~mask); + if ((tmp & ~IFCAP_VALID)) + error = EINVAL; + else + ifp->if_capabilities = tmp; + ifnet_lock_done(ifp); + + return error; +} + +u_int32_t ifnet_capabilities_supported(ifnet_t ifp) +{ + return ((ifp == NULL) ? 0 : ifp->if_capabilities); +} + + +errno_t ifnet_set_capabilities_enabled(ifnet_t ifp, u_int32_t new_caps, + u_int32_t mask) +{ + errno_t error = 0; + int tmp; + struct kev_msg ev_msg; + struct net_event_data ev_data; + + if (ifp == NULL) + return EINVAL; + + ifnet_lock_exclusive(ifp); + tmp = (new_caps & mask) | (ifp->if_capenable & ~mask); + if ((tmp & ~IFCAP_VALID) || (tmp & ~ifp->if_capabilities)) + error = EINVAL; + else + ifp->if_capenable = tmp; + ifnet_lock_done(ifp); + + /* Notify application of the change */ + bzero(&ev_data, sizeof(struct net_event_data)); + bzero(&ev_msg, sizeof(struct kev_msg)); + ev_msg.vendor_code = KEV_VENDOR_APPLE; + ev_msg.kev_class = KEV_NETWORK_CLASS; + ev_msg.kev_subclass = KEV_DL_SUBCLASS; + + ev_msg.event_code = KEV_DL_IFCAP_CHANGED; + strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ); + ev_data.if_family = ifp->if_family; + ev_data.if_unit = (u_int32_t) ifp->if_unit; + ev_msg.dv[0].data_length = sizeof(struct net_event_data); + ev_msg.dv[0].data_ptr = &ev_data; + ev_msg.dv[1].data_length = 0; + kev_post_msg(&ev_msg); + + return error; +} + +u_int32_t ifnet_capabilities_enabled(ifnet_t ifp) +{ + return ((ifp == NULL) ? 
0 : ifp->if_capenable); + + return 0; } static const ifnet_offload_t offload_mask = IFNET_CSUM_IP | IFNET_CSUM_TCP | IFNET_CSUM_UDP | IFNET_CSUM_FRAGMENT | IFNET_IP_FRAGMENT | + IFNET_CSUM_TCPIPV6 | IFNET_CSUM_UDPIPV6 | IFNET_IPV6_FRAGMENT | IFNET_CSUM_SUM16 | IFNET_VLAN_TAGGING | IFNET_VLAN_MTU | IFNET_MULTIPAGES | IFNET_TSO_IPV4 | IFNET_TSO_IPV6; +static const ifnet_offload_t any_offload_csum = IFNET_CSUM_IP | IFNET_CSUM_TCP | + IFNET_CSUM_UDP | IFNET_CSUM_FRAGMENT | + IFNET_CSUM_TCPIPV6 | IFNET_CSUM_UDPIPV6 | + IFNET_CSUM_SUM16; + + errno_t -ifnet_set_offload( - ifnet_t interface, - ifnet_offload_t offload) +ifnet_set_offload(ifnet_t interface, ifnet_offload_t offload) { - int lock; - - if (interface == NULL) return EINVAL; - lock = (interface->if_lock != 0); + u_int32_t ifcaps = 0; - if (lock) ifnet_lock_exclusive(interface); - interface->if_hwassist = (offload & offload_mask); - if (lock) ifnet_lock_done(interface); - - return 0; + if (interface == NULL) + return (EINVAL); + + ifnet_lock_exclusive(interface); + interface->if_hwassist = (offload & offload_mask); + ifnet_lock_done(interface); + + if ((offload & any_offload_csum)) + ifcaps |= IFCAP_HWCSUM; + if ((offload & IFNET_TSO_IPV4)) + ifcaps |= IFCAP_TSO4; + if ((offload & IFNET_TSO_IPV6)) + ifcaps |= IFCAP_TSO6; + if ((offload & IFNET_VLAN_MTU)) + ifcaps |= IFCAP_VLAN_MTU; + if ((offload & IFNET_VLAN_TAGGING)) + ifcaps |= IFCAP_VLAN_HWTAGGING; + if (ifcaps != 0) { + (void) ifnet_set_capabilities_supported(interface, ifcaps, IFCAP_VALID); + (void) ifnet_set_capabilities_enabled(interface, ifcaps, IFCAP_VALID); + } + + return (0); } ifnet_offload_t @@ -466,13 +538,14 @@ ifnet_get_tso_mtu( return error; } -errno_t +errno_t ifnet_set_wake_flags(ifnet_t interface, u_int32_t properties, u_int32_t mask) { - int lock; - struct kev_msg ev_msg; - struct net_event_data ev_data; - + struct kev_msg ev_msg; + struct net_event_data ev_data; + + bzero(&ev_data, sizeof(struct net_event_data)); + bzero(&ev_msg, 
sizeof(struct kev_msg)); if (interface == NULL) return EINVAL; @@ -480,15 +553,11 @@ ifnet_set_wake_flags(ifnet_t interface, u_int32_t properties, u_int32_t mask) if ((properties & mask) & ~IF_WAKE_VALID_FLAGS) return EINVAL; - lock = (interface->if_lock != 0); - - if (lock) - ifnet_lock_exclusive(interface); + ifnet_lock_exclusive(interface); interface->if_wake_properties = (properties & mask) | (interface->if_wake_properties & ~mask); - if (lock) - ifnet_lock_done(interface); + ifnet_lock_done(interface); (void) ifnet_touch_lastchange(interface); @@ -505,7 +574,7 @@ ifnet_set_wake_flags(ifnet_t interface, u_int32_t properties, u_int32_t mask) ev_msg.dv[0].data_ptr = &ev_data; ev_msg.dv[1].data_length = 0; kev_post_msg(&ev_msg); - + return 0; } @@ -515,55 +584,43 @@ ifnet_get_wake_flags(ifnet_t interface) return interface == NULL ? 0 : interface->if_wake_properties; } - - - /* * Should MIB data store a copy? */ errno_t -ifnet_set_link_mib_data( - ifnet_t interface, - void* mibData, - u_int32_t mibLen) +ifnet_set_link_mib_data(ifnet_t interface, void *mibData, u_int32_t mibLen) { - int lock; - - if (interface == NULL) return EINVAL; - lock = (interface->if_lock != 0); - - if (lock) ifnet_lock_exclusive(interface); + if (interface == NULL) + return (EINVAL); + + ifnet_lock_exclusive(interface); interface->if_linkmib = (void*)mibData; interface->if_linkmiblen = mibLen; - if (lock) ifnet_lock_done(interface); - return 0; + ifnet_lock_done(interface); + return (0); } errno_t -ifnet_get_link_mib_data( - ifnet_t interface, - void *mibData, - u_int32_t *mibLen) +ifnet_get_link_mib_data(ifnet_t interface, void *mibData, u_int32_t *mibLen) { errno_t result = 0; - int lock; - - if (interface == NULL) return EINVAL; - lock = (interface->if_lock != NULL); - - if (lock) ifnet_lock_shared(interface); + + if (interface == NULL) + return (EINVAL); + + ifnet_lock_shared(interface); if (*mibLen < interface->if_linkmiblen) result = EMSGSIZE; if (result == 0 && interface->if_linkmib 
== NULL) result = ENOTSUP; - + if (result == 0) { *mibLen = interface->if_linkmiblen; bcopy(interface->if_linkmib, mibData, *mibLen); } - if (lock) ifnet_lock_done(interface); - - return result; + ifnet_lock_done(interface); + + return (result); } u_int32_t @@ -634,15 +691,12 @@ ifnet_type( #if 0 errno_t -ifnet_set_typelen( - ifnet_t interface, - u_char typelen) +ifnet_set_typelen(ifnet_t interface, u_char typelen) { - int lock = (interface->if_lock != 0); - if (lock) ifnet_lock_exclusive(interface); + ifnet_lock_exclusive(interface); interface->if_data.ifi_typelen = typelen; - if (lock) ifnet_lock_done(interface); - return 0; + ifnet_lock_done(interface); + return (0); } u_char @@ -733,310 +787,283 @@ ifnet_baudrate( } errno_t -ifnet_stat_increment( - ifnet_t interface, - const struct ifnet_stat_increment_param *counts) +ifnet_stat_increment(ifnet_t interface, + const struct ifnet_stat_increment_param *counts) { - struct dlil_threading_info *thread; - if (interface == NULL) return EINVAL; - - if ((thread = interface->if_input_thread) == NULL || (dlil_multithreaded_input == 0)) - thread = dlil_lo_thread_ptr; + if (interface == NULL) + return (EINVAL); - lck_mtx_lock(thread->input_lck); + atomic_add_64(&interface->if_data.ifi_ipackets, counts->packets_in); + atomic_add_64(&interface->if_data.ifi_ibytes, counts->bytes_in); + atomic_add_64(&interface->if_data.ifi_ierrors, counts->errors_in); - interface->if_data.ifi_ipackets += counts->packets_in; - interface->if_data.ifi_ibytes += counts->bytes_in; - interface->if_data.ifi_ierrors += counts->errors_in; + atomic_add_64(&interface->if_data.ifi_opackets, counts->packets_out); + atomic_add_64(&interface->if_data.ifi_obytes, counts->bytes_out); + atomic_add_64(&interface->if_data.ifi_oerrors, counts->errors_out); - interface->if_data.ifi_opackets += counts->packets_out; - interface->if_data.ifi_obytes += counts->bytes_out; - interface->if_data.ifi_oerrors += counts->errors_out; + 
atomic_add_64(&interface->if_data.ifi_collisions, counts->collisions); + atomic_add_64(&interface->if_data.ifi_iqdrops, counts->dropped); - interface->if_data.ifi_collisions += counts->collisions; - interface->if_data.ifi_iqdrops += counts->dropped; - /* Touch the last change time. */ TOUCHLASTCHANGE(&interface->if_lastchange); - lck_mtx_unlock(thread->input_lck); - - return 0; + return (0); } errno_t -ifnet_stat_increment_in( - ifnet_t interface, - u_int32_t packets_in, - u_int32_t bytes_in, - u_int32_t errors_in) +ifnet_stat_increment_in(ifnet_t interface, u_int32_t packets_in, + u_int32_t bytes_in, u_int32_t errors_in) { - struct dlil_threading_info *thread; - - if (interface == NULL) return EINVAL; - - if ((thread = interface->if_input_thread) == NULL || (dlil_multithreaded_input == 0)) - thread = dlil_lo_thread_ptr; - - lck_mtx_lock(thread->input_lck); + if (interface == NULL) + return (EINVAL); - interface->if_data.ifi_ipackets += packets_in; - interface->if_data.ifi_ibytes += bytes_in; - interface->if_data.ifi_ierrors += errors_in; + atomic_add_64(&interface->if_data.ifi_ipackets, packets_in); + atomic_add_64(&interface->if_data.ifi_ibytes, bytes_in); + atomic_add_64(&interface->if_data.ifi_ierrors, errors_in); TOUCHLASTCHANGE(&interface->if_lastchange); - lck_mtx_unlock(thread->input_lck); - - return 0; + return (0); } errno_t -ifnet_stat_increment_out( - ifnet_t interface, - u_int32_t packets_out, - u_int32_t bytes_out, - u_int32_t errors_out) +ifnet_stat_increment_out(ifnet_t interface, u_int32_t packets_out, + u_int32_t bytes_out, u_int32_t errors_out) { - struct dlil_threading_info *thread; - if (interface == NULL) return EINVAL; - - if ((thread = interface->if_input_thread) == NULL || (dlil_multithreaded_input == 0)) - thread = dlil_lo_thread_ptr; - - lck_mtx_lock(thread->input_lck); + if (interface == NULL) + return (EINVAL); - interface->if_data.ifi_opackets += packets_out; - interface->if_data.ifi_obytes += bytes_out; - 
interface->if_data.ifi_oerrors += errors_out; + atomic_add_64(&interface->if_data.ifi_opackets, packets_out); + atomic_add_64(&interface->if_data.ifi_obytes, bytes_out); + atomic_add_64(&interface->if_data.ifi_oerrors, errors_out); TOUCHLASTCHANGE(&interface->if_lastchange); - lck_mtx_unlock(thread->input_lck); - - return 0; + return (0); } errno_t -ifnet_set_stat( - ifnet_t interface, - const struct ifnet_stats_param *stats) +ifnet_set_stat(ifnet_t interface, const struct ifnet_stats_param *stats) { - struct dlil_threading_info *thread; + if (interface == NULL) + return (EINVAL); - if (interface == NULL) return EINVAL; - - if ((thread = interface->if_input_thread) == NULL || (dlil_multithreaded_input == 0)) - thread = dlil_lo_thread_ptr; + atomic_set_64(&interface->if_data.ifi_ipackets, stats->packets_in); + atomic_set_64(&interface->if_data.ifi_ibytes, stats->bytes_in); + atomic_set_64(&interface->if_data.ifi_imcasts, stats->multicasts_in); + atomic_set_64(&interface->if_data.ifi_ierrors, stats->errors_in); - lck_mtx_lock(thread->input_lck); + atomic_set_64(&interface->if_data.ifi_opackets, stats->packets_out); + atomic_set_64(&interface->if_data.ifi_obytes, stats->bytes_out); + atomic_set_64(&interface->if_data.ifi_omcasts, stats->multicasts_out); + atomic_set_64(&interface->if_data.ifi_oerrors, stats->errors_out); - interface->if_data.ifi_ipackets = stats->packets_in; - interface->if_data.ifi_ibytes = stats->bytes_in; - interface->if_data.ifi_imcasts = stats->multicasts_in; - interface->if_data.ifi_ierrors = stats->errors_in; - - interface->if_data.ifi_opackets = stats->packets_out; - interface->if_data.ifi_obytes = stats->bytes_out; - interface->if_data.ifi_omcasts = stats->multicasts_out; - interface->if_data.ifi_oerrors = stats->errors_out; - - interface->if_data.ifi_collisions = stats->collisions; - interface->if_data.ifi_iqdrops = stats->dropped; - interface->if_data.ifi_noproto = stats->no_protocol; + atomic_set_64(&interface->if_data.ifi_collisions, 
stats->collisions); + atomic_set_64(&interface->if_data.ifi_iqdrops, stats->dropped); + atomic_set_64(&interface->if_data.ifi_noproto, stats->no_protocol); /* Touch the last change time. */ TOUCHLASTCHANGE(&interface->if_lastchange); - lck_mtx_unlock(thread->input_lck); - return 0; } errno_t -ifnet_stat( - ifnet_t interface, - struct ifnet_stats_param *stats) +ifnet_stat(ifnet_t interface, struct ifnet_stats_param *stats) { - struct dlil_threading_info *thread; - - if (interface == NULL) return EINVAL; - - if ((thread = interface->if_input_thread) == NULL || (dlil_multithreaded_input == 0)) - thread = dlil_lo_thread_ptr; - - lck_mtx_lock(thread->input_lck); + if (interface == NULL) + return (EINVAL); - stats->packets_in = interface->if_data.ifi_ipackets; - stats->bytes_in = interface->if_data.ifi_ibytes; - stats->multicasts_in = interface->if_data.ifi_imcasts; - stats->errors_in = interface->if_data.ifi_ierrors; + atomic_get_64(stats->packets_in, &interface->if_data.ifi_ipackets); + atomic_get_64(stats->bytes_in, &interface->if_data.ifi_ibytes); + atomic_get_64(stats->multicasts_in, &interface->if_data.ifi_imcasts); + atomic_get_64(stats->errors_in, &interface->if_data.ifi_ierrors); - stats->packets_out = interface->if_data.ifi_opackets; - stats->bytes_out = interface->if_data.ifi_obytes; - stats->multicasts_out = interface->if_data.ifi_omcasts; - stats->errors_out = interface->if_data.ifi_oerrors; + atomic_get_64(stats->packets_out, &interface->if_data.ifi_opackets); + atomic_get_64(stats->bytes_out, &interface->if_data.ifi_obytes); + atomic_get_64(stats->multicasts_out, &interface->if_data.ifi_omcasts); + atomic_get_64(stats->errors_out, &interface->if_data.ifi_oerrors); - stats->collisions = interface->if_data.ifi_collisions; - stats->dropped = interface->if_data.ifi_iqdrops; - stats->no_protocol = interface->if_data.ifi_noproto; + atomic_get_64(stats->collisions, &interface->if_data.ifi_collisions); + atomic_get_64(stats->dropped, 
&interface->if_data.ifi_iqdrops); + atomic_get_64(stats->no_protocol, &interface->if_data.ifi_noproto); - lck_mtx_unlock(thread->input_lck); - - return 0; + return (0); } errno_t -ifnet_touch_lastchange( - ifnet_t interface) +ifnet_touch_lastchange(ifnet_t interface) { - struct dlil_threading_info *thread; - - if (interface == NULL) return EINVAL; - - if ((thread = interface->if_input_thread) == NULL || (dlil_multithreaded_input == 0)) - thread = dlil_lo_thread_ptr; - - lck_mtx_lock(thread->input_lck); + if (interface == NULL) + return (EINVAL); TOUCHLASTCHANGE(&interface->if_lastchange); - lck_mtx_unlock(thread->input_lck); - - return 0; + return (0); } errno_t -ifnet_lastchange( - ifnet_t interface, - struct timeval *last_change) +ifnet_lastchange(ifnet_t interface, struct timeval *last_change) { - struct dlil_threading_info *thread; - - if (interface == NULL) return EINVAL; - - if ((thread = interface->if_input_thread) == NULL || (dlil_multithreaded_input == 0)) - thread = dlil_lo_thread_ptr; - - lck_mtx_lock(thread->input_lck); + if (interface == NULL) + return (EINVAL); *last_change = interface->if_data.ifi_lastchange; - - lck_mtx_unlock(thread->input_lck); - #if IF_LASTCHANGEUPTIME /* Crude conversion from uptime to calendar time */ last_change->tv_sec += boottime_sec(); #endif - - return 0; + return (0); } errno_t -ifnet_get_address_list( - ifnet_t interface, - ifaddr_t **addresses) +ifnet_get_address_list(ifnet_t interface, ifaddr_t **addresses) { - if (addresses == NULL) return EINVAL; - return ifnet_get_address_list_family(interface, addresses, 0); + return (addresses == NULL ? 
EINVAL : + ifnet_get_address_list_family(interface, addresses, 0)); } +struct ifnet_addr_list { + SLIST_ENTRY(ifnet_addr_list) ifal_le; + struct ifaddr *ifal_ifa; +}; + errno_t -ifnet_get_address_list_family( - ifnet_t interface, - ifaddr_t **addresses, - sa_family_t family) +ifnet_get_address_list_family(ifnet_t interface, ifaddr_t **addresses, + sa_family_t family) +{ + return (ifnet_get_address_list_family_internal(interface, addresses, + family, 0, M_NOWAIT)); +} + +__private_extern__ errno_t +ifnet_get_address_list_family_internal(ifnet_t interface, ifaddr_t **addresses, + sa_family_t family, int detached, int how) { + SLIST_HEAD(, ifnet_addr_list) ifal_head; + struct ifnet_addr_list *ifal, *ifal_tmp; struct ifnet *ifp; int count = 0; - int cmax = 0; - - if (addresses == NULL) return EINVAL; + errno_t err = 0; + + SLIST_INIT(&ifal_head); + + if (addresses == NULL) { + err = EINVAL; + goto done; + } *addresses = NULL; - + + if (detached) { + /* + * Interface has been detached, so skip the lookup + * at ifnet_head and go directly to inner loop. 
+ */ + ifp = interface; + if (ifp == NULL) { + err = EINVAL; + goto done; + } + goto one; + } + ifnet_head_lock_shared(); - TAILQ_FOREACH(ifp, &ifnet, if_link) - { - if (interface && ifp != interface) continue; - + TAILQ_FOREACH(ifp, &ifnet_head, if_link) { + if (interface != NULL && ifp != interface) + continue; +one: ifnet_lock_shared(ifp); - if ((ifp->if_eflags & IFEF_DETACHING) == 0) { - if (interface == NULL || interface == ifp) - { - struct ifaddr *addr; - TAILQ_FOREACH(addr, &ifp->if_addrhead, ifa_link) - { - if (family == 0 || addr->ifa_addr->sa_family == family) - cmax++; + if (interface == NULL || interface == ifp) { + struct ifaddr *ifa; + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + IFA_LOCK(ifa); + if (family != 0 && + ifa->ifa_addr->sa_family != family) { + IFA_UNLOCK(ifa); + continue; } + MALLOC(ifal, struct ifnet_addr_list *, + sizeof (*ifal), M_TEMP, how); + if (ifal == NULL) { + IFA_UNLOCK(ifa); + ifnet_lock_done(ifp); + if (!detached) + ifnet_head_done(); + err = ENOMEM; + goto done; + } + ifal->ifal_ifa = ifa; + IFA_ADDREF_LOCKED(ifa); + SLIST_INSERT_HEAD(&ifal_head, ifal, ifal_le); + ++count; + IFA_UNLOCK(ifa); } } - else if (interface != NULL) { - ifnet_lock_done(ifp); - ifnet_head_done(); - return ENXIO; - } ifnet_lock_done(ifp); + if (detached) + break; } - - MALLOC(*addresses, ifaddr_t*, sizeof(ifaddr_t) * (cmax + 1), M_TEMP, M_NOWAIT); - if (*addresses == NULL) { + if (!detached) ifnet_head_done(); - return ENOMEM; + + if (count == 0) { + err = ENXIO; + goto done; } - - TAILQ_FOREACH(ifp, &ifnet, if_link) - { - if (interface && ifp != interface) continue; - - ifnet_lock_shared(ifp); - if ((ifp->if_eflags & IFEF_DETACHING) == 0) { - if (interface == NULL || (struct ifnet*)interface == ifp) - { - struct ifaddr *addr; - TAILQ_FOREACH(addr, &ifp->if_addrhead, ifa_link) - { - if (count + 1 > cmax) break; - if (family == 0 || addr->ifa_addr->sa_family == family) { - (*addresses)[count] = (ifaddr_t)addr; - 
ifaddr_reference((*addresses)[count]); - count++; - } - } - } - } - ifnet_lock_done(ifp); - if (interface || count == cmax) - break; + MALLOC(*addresses, ifaddr_t *, sizeof (ifaddr_t) * (count + 1), + M_TEMP, how); + if (*addresses == NULL) { + err = ENOMEM; + goto done; } - ifnet_head_done(); - (*addresses)[cmax] = 0; - - return 0; + bzero(*addresses, sizeof (ifaddr_t) * (count + 1)); + +done: + SLIST_FOREACH_SAFE(ifal, &ifal_head, ifal_le, ifal_tmp) { + SLIST_REMOVE(&ifal_head, ifal, ifnet_addr_list, ifal_le); + if (err == 0) + (*addresses)[--count] = ifal->ifal_ifa; + else + IFA_REMREF(ifal->ifal_ifa); + FREE(ifal, M_TEMP); + } + + return (err); } void -ifnet_free_address_list( - ifaddr_t *addresses) +ifnet_free_address_list(ifaddr_t *addresses) { int i; - - if (addresses == NULL) return; - + + if (addresses == NULL) + return; + for (i = 0; addresses[i] != NULL; i++) - { - ifaddr_release(addresses[i]); - } - + IFA_REMREF(addresses[i]); + FREE(addresses, M_TEMP); } -void* -ifnet_lladdr( - ifnet_t interface) +void * +ifnet_lladdr(ifnet_t interface) { - if (interface == NULL) return NULL; - return LLADDR(SDL(interface->if_addrhead.tqh_first->ifa_addr)); + struct ifaddr *ifa; + void *lladdr; + + if (interface == NULL) + return (NULL); + + /* + * if_lladdr points to the permanent link address of + * the interface; it never gets deallocated. 
+ */ + ifa = interface->if_lladdr; + IFA_LOCK_SPIN(ifa); + lladdr = LLADDR(SDL(ifa->ifa_addr)); + IFA_UNLOCK(ifa); + + return (lladdr); } errno_t @@ -1068,74 +1095,80 @@ ifnet_llbroadcast_copy_bytes( } errno_t -ifnet_lladdr_copy_bytes( - ifnet_t interface, - void* lladdr, - size_t lladdr_len) +ifnet_lladdr_copy_bytes(ifnet_t interface, void *lladdr, size_t lladdr_len) { struct sockaddr_dl *sdl; - if (interface == NULL || lladdr == NULL) return EINVAL; - - sdl = SDL(interface->if_addrhead.tqh_first->ifa_addr); - - while (1) { - if (lladdr_len != sdl->sdl_alen) { - bzero(lladdr, lladdr_len); - return EMSGSIZE; - } - bcopy(LLADDR(sdl), lladdr, lladdr_len); - if (bcmp(lladdr, LLADDR(sdl), lladdr_len) == 0 && - lladdr_len == sdl->sdl_alen) - break; + struct ifaddr *ifa; + + if (interface == NULL || lladdr == NULL) + return (EINVAL); + + /* + * if_lladdr points to the permanent link address of + * the interface; it never gets deallocated. + */ + ifa = interface->if_lladdr; + IFA_LOCK_SPIN(ifa); + sdl = SDL(ifa->ifa_addr); + if (lladdr_len != sdl->sdl_alen) { + bzero(lladdr, lladdr_len); + IFA_UNLOCK(ifa); + return (EMSGSIZE); } - return 0; + bcopy(LLADDR(sdl), lladdr, lladdr_len); + IFA_UNLOCK(ifa); + + return (0); } static errno_t -ifnet_set_lladdr_internal( - ifnet_t interface, - const void *lladdr, - size_t lladdr_len, - u_char new_type, - int apply_type) +ifnet_set_lladdr_internal(ifnet_t interface, const void *lladdr, + size_t lladdr_len, u_char new_type, int apply_type) { struct ifaddr *ifa; - struct sockaddr_dl *sdl; errno_t error = 0; - - if (interface == NULL) return EINVAL; - - if (lladdr_len != 0 && (lladdr_len != interface->if_addrlen || lladdr == 0)) - return EINVAL; - + + if (interface == NULL) + return (EINVAL); + ifnet_head_lock_shared(); + ifnet_lock_exclusive(interface); + if (lladdr_len != 0 && + (lladdr_len != interface->if_addrlen || lladdr == 0)) { + ifnet_lock_done(interface); + ifnet_head_done(); + return (EINVAL); + } ifa = 
ifnet_addrs[interface->if_index - 1]; if (ifa != NULL) { + struct sockaddr_dl *sdl; + + IFA_LOCK_SPIN(ifa); sdl = (struct sockaddr_dl*)ifa->ifa_addr; if (lladdr_len != 0) { bcopy(lladdr, LLADDR(sdl), lladdr_len); - } - else { + } else { bzero(LLADDR(sdl), interface->if_addrlen); } sdl->sdl_alen = lladdr_len; - + if (apply_type) { sdl->sdl_type = new_type; } - } - else { + IFA_UNLOCK(ifa); + } else { error = ENXIO; } + ifnet_lock_done(interface); ifnet_head_done(); - + /* Generate a kernel event */ if (error == 0) { dlil_post_msg(interface, KEV_DL_SUBCLASS, KEV_DL_LINK_ADDRESS_CHANGED, NULL, 0); } - - return error; + + return (error); } errno_t @@ -1158,64 +1191,68 @@ ifnet_set_lladdr_and_type( } errno_t -ifnet_add_multicast( - ifnet_t interface, - const struct sockaddr *maddr, - ifmultiaddr_t *address) +ifnet_add_multicast(ifnet_t interface, const struct sockaddr *maddr, + ifmultiaddr_t *ifmap) { - if (interface == NULL || maddr == NULL) return EINVAL; - return if_addmulti(interface, maddr, address); + if (interface == NULL || maddr == NULL) + return (EINVAL); + + /* Don't let users screw up protocols' entries. */ + if (maddr->sa_family != AF_UNSPEC && maddr->sa_family != AF_LINK) + return (EINVAL); + + return (if_addmulti_anon(interface, maddr, ifmap)); } errno_t -ifnet_remove_multicast( - ifmultiaddr_t address) +ifnet_remove_multicast(ifmultiaddr_t ifma) { - if (address == NULL) return EINVAL; - return if_delmultiaddr(address, 0); + struct sockaddr *maddr; + + if (ifma == NULL) + return (EINVAL); + + maddr = ifma->ifma_addr; + /* Don't let users screw up protocols' entries. 
*/ + if (maddr->sa_family != AF_UNSPEC && maddr->sa_family != AF_LINK) + return (EINVAL); + + return (if_delmulti_anon(ifma->ifma_ifp, maddr)); } -errno_t ifnet_get_multicast_list(ifnet_t interface, ifmultiaddr_t **addresses) +errno_t +ifnet_get_multicast_list(ifnet_t ifp, ifmultiaddr_t **addresses) { int count = 0; int cmax = 0; struct ifmultiaddr *addr; - int lock; - - if (interface == NULL || addresses == NULL) - return EINVAL; - - lock = (interface->if_lock != 0); - if (lock) ifnet_lock_shared(interface); - if ((interface->if_eflags & IFEF_DETACHING) == 0) { - LIST_FOREACH(addr, &interface->if_multiaddrs, ifma_link) - { - cmax++; - } - } - else { - if (lock) ifnet_lock_done(interface); - return ENXIO; + + if (ifp == NULL || addresses == NULL) + return (EINVAL); + + ifnet_lock_shared(ifp); + LIST_FOREACH(addr, &ifp->if_multiaddrs, ifma_link) { + cmax++; } - - MALLOC(*addresses, ifmultiaddr_t*, sizeof(ifmultiaddr_t) * (cmax + 1), M_TEMP, M_NOWAIT); + + MALLOC(*addresses, ifmultiaddr_t *, sizeof (ifmultiaddr_t) * (cmax + 1), + M_TEMP, M_NOWAIT); if (*addresses == NULL) { - if (lock) ifnet_lock_done(interface); - return ENOMEM; + ifnet_lock_done(ifp); + return (ENOMEM); } - - LIST_FOREACH(addr, &interface->if_multiaddrs, ifma_link) - { + + LIST_FOREACH(addr, &ifp->if_multiaddrs, ifma_link) { if (count + 1 > cmax) break; (*addresses)[count] = (ifmultiaddr_t)addr; ifmaddr_reference((*addresses)[count]); count++; } - (*addresses)[cmax] = 0; - if (lock) ifnet_lock_done(interface); - - return 0; + (*addresses)[cmax] = NULL; + ifnet_lock_done(ifp); + + return (0); } void @@ -1235,44 +1272,42 @@ ifnet_free_multicast_list( } errno_t -ifnet_find_by_name( - const char *ifname, - ifnet_t *interface) +ifnet_find_by_name(const char *ifname, ifnet_t *ifpp) { struct ifnet *ifp; int namelen; - - if (ifname == NULL) return EINVAL; - + + if (ifname == NULL) + return (EINVAL); + namelen = strlen(ifname); - - *interface = NULL; - + + *ifpp = NULL; + ifnet_head_lock_shared(); - 
TAILQ_FOREACH(ifp, &ifnet, if_link) - { - struct ifaddr *ifa = ifnet_addrs[ifp->if_index - 1]; + TAILQ_FOREACH(ifp, &ifnet_head, if_link) { + struct ifaddr *ifa; struct sockaddr_dl *ll_addr; - if (!ifa || !ifa->ifa_addr) + ifa = ifnet_addrs[ifp->if_index - 1]; + if (ifa == NULL) continue; + IFA_LOCK(ifa); ll_addr = (struct sockaddr_dl *)ifa->ifa_addr; - if ((ifp->if_eflags & IFEF_DETACHING) == 0 && - namelen == ll_addr->sdl_nlen && - (strncmp(ll_addr->sdl_data, ifname, ll_addr->sdl_nlen) == 0)) - { + if (namelen == ll_addr->sdl_nlen && + !strncmp(ll_addr->sdl_data, ifname, ll_addr->sdl_nlen)) { + IFA_UNLOCK(ifa); + *ifpp = ifp; + ifnet_reference(*ifpp); break; } - } - if (ifp) { - *interface = ifp; - ifnet_reference(*interface); + IFA_UNLOCK(ifa); } ifnet_head_done(); - - return (ifp == NULL) ? ENXIO : 0; + + return ((ifp == NULL) ? ENXIO : 0); } errno_t @@ -1287,54 +1322,74 @@ ifnet_list_get_all(ifnet_family_t family, ifnet_t **list, u_int32_t *count) return (ifnet_list_get_common(family, TRUE, list, count)); } +struct ifnet_list { + SLIST_ENTRY(ifnet_list) ifl_le; + struct ifnet *ifl_ifp; +}; + static errno_t ifnet_list_get_common(ifnet_family_t family, boolean_t get_all, ifnet_t **list, u_int32_t *count) { +#pragma unused(get_all) + SLIST_HEAD(, ifnet_list) ifl_head; + struct ifnet_list *ifl, *ifl_tmp; struct ifnet *ifp; - u_int32_t cmax = 0; - *count = 0; - errno_t result = 0; - - if (list == NULL || count == NULL) - return (EINVAL); - - ifnet_head_lock_shared(); - TAILQ_FOREACH(ifp, &ifnet, if_link) { - if ((ifp->if_eflags & IFEF_DETACHING) && !get_all) - continue; - if (family == IFNET_FAMILY_ANY || ifp->if_family == family) - cmax++; - } + int cnt = 0; + errno_t err = 0; - if (cmax == 0) - result = ENXIO; + SLIST_INIT(&ifl_head); - if (result == 0) { - MALLOC(*list, ifnet_t*, sizeof(ifnet_t) * (cmax + 1), - M_TEMP, M_NOWAIT); - if (*list == NULL) - result = ENOMEM; + if (list == NULL || count == NULL) { + err = EINVAL; + goto done; } + *count = 0; + *list = 
NULL; - if (result == 0) { - TAILQ_FOREACH(ifp, &ifnet, if_link) { - if ((ifp->if_eflags & IFEF_DETACHING) && !get_all) - continue; - if (*count + 1 > cmax) - break; - if (family == IFNET_FAMILY_ANY || - ((ifnet_family_t)ifp->if_family) == family) { - (*list)[*count] = (ifnet_t)ifp; - ifnet_reference((*list)[*count]); - (*count)++; + ifnet_head_lock_shared(); + TAILQ_FOREACH(ifp, &ifnet_head, if_link) { + if (family == IFNET_FAMILY_ANY || ifp->if_family == family) { + MALLOC(ifl, struct ifnet_list *, sizeof (*ifl), + M_TEMP, M_NOWAIT); + if (ifl == NULL) { + ifnet_head_done(); + err = ENOMEM; + goto done; } + ifl->ifl_ifp = ifp; + ifnet_reference(ifp); + SLIST_INSERT_HEAD(&ifl_head, ifl, ifl_le); + ++cnt; } - (*list)[*count] = NULL; } ifnet_head_done(); - return (result); + if (cnt == 0) { + err = ENXIO; + goto done; + } + + MALLOC(*list, ifnet_t *, sizeof (ifnet_t) * (cnt + 1), + M_TEMP, M_NOWAIT); + if (*list == NULL) { + err = ENOMEM; + goto done; + } + bzero(*list, sizeof (ifnet_t) * (cnt + 1)); + *count = cnt; + +done: + SLIST_FOREACH_SAFE(ifl, &ifl_head, ifl_le, ifl_tmp) { + SLIST_REMOVE(&ifl_head, ifl, ifnet_list, ifl_le); + if (err == 0) + (*list)[--cnt] = ifl->ifl_ifp; + else + ifnet_release(ifl->ifl_ifp); + FREE(ifl, M_TEMP); + } + + return (err); } void @@ -1345,9 +1400,8 @@ ifnet_list_free(ifnet_t *interfaces) if (interfaces == NULL) return; - for (i = 0; interfaces[i]; i++) { + for (i = 0; interfaces[i]; i++) ifnet_release(interfaces[i]); - } FREE(interfaces, M_TEMP); } @@ -1357,97 +1411,132 @@ ifnet_list_free(ifnet_t *interfaces) /****************************************************************************/ errno_t -ifaddr_reference( - ifaddr_t ifa) +ifaddr_reference(ifaddr_t ifa) { - if (ifa == NULL) return EINVAL; - ifaref(ifa); - return 0; + if (ifa == NULL) + return (EINVAL); + + IFA_ADDREF(ifa); + return (0); } errno_t -ifaddr_release( - ifaddr_t ifa) +ifaddr_release(ifaddr_t ifa) { - if (ifa == NULL) return EINVAL; - ifafree(ifa); - return 0; + 
if (ifa == NULL) + return (EINVAL); + + IFA_REMREF(ifa); + return (0); } sa_family_t -ifaddr_address_family( - ifaddr_t ifa) +ifaddr_address_family(ifaddr_t ifa) { - if (ifa && ifa->ifa_addr) - return ifa->ifa_addr->sa_family; - - return 0; + sa_family_t family = 0; + + if (ifa != NULL) { + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_addr != NULL) + family = ifa->ifa_addr->sa_family; + IFA_UNLOCK(ifa); + } + return (family); } errno_t -ifaddr_address( - ifaddr_t ifa, - struct sockaddr *out_addr, - u_int32_t addr_size) +ifaddr_address(ifaddr_t ifa, struct sockaddr *out_addr, u_int32_t addr_size) { u_int32_t copylen; - - if (ifa == NULL || out_addr == NULL) return EINVAL; - if (ifa->ifa_addr == NULL) return ENOTSUP; - - copylen = (addr_size >= ifa->ifa_addr->sa_len) ? ifa->ifa_addr->sa_len : addr_size; + + if (ifa == NULL || out_addr == NULL) + return (EINVAL); + + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_addr == NULL) { + IFA_UNLOCK(ifa); + return (ENOTSUP); + } + + copylen = (addr_size >= ifa->ifa_addr->sa_len) ? + ifa->ifa_addr->sa_len : addr_size; bcopy(ifa->ifa_addr, out_addr, copylen); - - if (ifa->ifa_addr->sa_len > addr_size) return EMSGSIZE; - - return 0; + + if (ifa->ifa_addr->sa_len > addr_size) { + IFA_UNLOCK(ifa); + return (EMSGSIZE); + } + + IFA_UNLOCK(ifa); + return (0); } errno_t -ifaddr_dstaddress( - ifaddr_t ifa, - struct sockaddr *out_addr, - u_int32_t addr_size) +ifaddr_dstaddress(ifaddr_t ifa, struct sockaddr *out_addr, u_int32_t addr_size) { u_int32_t copylen; - if (ifa == NULL || out_addr == NULL) return EINVAL; - if (ifa->ifa_dstaddr == NULL) return ENOTSUP; - - copylen = (addr_size >= ifa->ifa_dstaddr->sa_len) ? ifa->ifa_dstaddr->sa_len : addr_size; + + if (ifa == NULL || out_addr == NULL) + return (EINVAL); + + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_dstaddr == NULL) { + IFA_UNLOCK(ifa); + return (ENOTSUP); + } + + copylen = (addr_size >= ifa->ifa_dstaddr->sa_len) ? 
+ ifa->ifa_dstaddr->sa_len : addr_size; bcopy(ifa->ifa_dstaddr, out_addr, copylen); - if (ifa->ifa_dstaddr->sa_len > addr_size) return EMSGSIZE; - - return 0; + if (ifa->ifa_dstaddr->sa_len > addr_size) { + IFA_UNLOCK(ifa); + return (EMSGSIZE); + } + + IFA_UNLOCK(ifa); + return (0); } errno_t -ifaddr_netmask( - ifaddr_t ifa, - struct sockaddr *out_addr, - u_int32_t addr_size) +ifaddr_netmask(ifaddr_t ifa, struct sockaddr *out_addr, u_int32_t addr_size) { u_int32_t copylen; - if (ifa == NULL || out_addr == NULL) return EINVAL; - if (ifa->ifa_netmask == NULL) return ENOTSUP; - - copylen = addr_size >= ifa->ifa_netmask->sa_len ? ifa->ifa_netmask->sa_len : addr_size; + + if (ifa == NULL || out_addr == NULL) + return (EINVAL); + + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_netmask == NULL) { + IFA_UNLOCK(ifa); + return (ENOTSUP); + } + + copylen = addr_size >= ifa->ifa_netmask->sa_len ? + ifa->ifa_netmask->sa_len : addr_size; bcopy(ifa->ifa_netmask, out_addr, copylen); - - if (ifa->ifa_netmask->sa_len > addr_size) return EMSGSIZE; - - return 0; + + if (ifa->ifa_netmask->sa_len > addr_size) { + IFA_UNLOCK(ifa); + return (EMSGSIZE); + } + + IFA_UNLOCK(ifa); + return (0); } ifnet_t -ifaddr_ifnet( - ifaddr_t ifa) +ifaddr_ifnet(ifaddr_t ifa) { struct ifnet *ifp; - if (ifa == NULL) return NULL; + + if (ifa == NULL) + return (NULL); + + /* ifa_ifp is set once at creation time; it is never changed */ ifp = ifa->ifa_ifp; - - return (ifnet_t)ifp; + + return (ifp); } ifaddr_t @@ -1494,60 +1583,70 @@ ifaddr_findbestforaddr( } errno_t -ifmaddr_reference( - ifmultiaddr_t ifmaddr) +ifmaddr_reference(ifmultiaddr_t ifmaddr) { - if (ifmaddr == NULL) return EINVAL; - ifma_reference(ifmaddr); - return 0; + if (ifmaddr == NULL) + return (EINVAL); + + IFMA_ADDREF(ifmaddr); + return (0); } errno_t -ifmaddr_release( - ifmultiaddr_t ifmaddr) +ifmaddr_release(ifmultiaddr_t ifmaddr) { - if (ifmaddr == NULL) return EINVAL; - ifma_release(ifmaddr); - return 0; + if (ifmaddr == NULL) + return (EINVAL); + + 
IFMA_REMREF(ifmaddr); + return (0); } errno_t -ifmaddr_address( - ifmultiaddr_t ifmaddr, - struct sockaddr *out_addr, - u_int32_t addr_size) +ifmaddr_address(ifmultiaddr_t ifma, struct sockaddr *out_addr, + u_int32_t addr_size) { u_int32_t copylen; - - if (ifmaddr == NULL || out_addr == NULL) return EINVAL; - if (ifmaddr->ifma_addr == NULL) return ENOTSUP; - - copylen = addr_size >= ifmaddr->ifma_addr->sa_len ? ifmaddr->ifma_addr->sa_len : addr_size; - bcopy(ifmaddr->ifma_addr, out_addr, copylen); - - if (ifmaddr->ifma_addr->sa_len > addr_size) return EMSGSIZE; - - return 0; + + if (ifma == NULL || out_addr == NULL) + return (EINVAL); + + IFMA_LOCK(ifma); + if (ifma->ifma_addr == NULL) { + IFMA_UNLOCK(ifma); + return (ENOTSUP); + } + + copylen = (addr_size >= ifma->ifma_addr->sa_len ? + ifma->ifma_addr->sa_len : addr_size); + bcopy(ifma->ifma_addr, out_addr, copylen); + + if (ifma->ifma_addr->sa_len > addr_size) { + IFMA_UNLOCK(ifma); + return (EMSGSIZE); + } + IFMA_UNLOCK(ifma); + return (0); } errno_t -ifmaddr_lladdress( - ifmultiaddr_t ifmaddr, - struct sockaddr *out_addr, - u_int32_t addr_size) +ifmaddr_lladdress(ifmultiaddr_t ifma, struct sockaddr *out_addr, + u_int32_t addr_size) { - if (ifmaddr == NULL || out_addr == NULL) return EINVAL; - if (ifmaddr->ifma_ll == NULL) return ENOTSUP; - - return ifmaddr_address(ifmaddr->ifma_ll, out_addr, addr_size); + struct ifmultiaddr *ifma_ll; + + if (ifma == NULL || out_addr == NULL) + return (EINVAL); + if ((ifma_ll = ifma->ifma_ll) == NULL) + return (ENOTSUP); + + return (ifmaddr_address(ifma_ll, out_addr, addr_size)); } ifnet_t -ifmaddr_ifnet( - ifmultiaddr_t ifmaddr) +ifmaddr_ifnet(ifmultiaddr_t ifma) { - if (ifmaddr == NULL || ifmaddr->ifma_ifp == NULL) return NULL; - return ifmaddr->ifma_ifp; + return (ifma == NULL ? 
NULL : ifma->ifma_ifp); } /******************************************************************************/ diff --git a/bsd/net/kpi_interface.h b/bsd/net/kpi_interface.h index e22eba1c7..e2fd084b6 100644 --- a/bsd/net/kpi_interface.h +++ b/bsd/net/kpi_interface.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2010 Apple Inc. All rights reserved. + * Copyright (c) 2004-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -66,6 +66,7 @@ struct ifnet_demux_desc; @constant IFNET_FAMILY_STF A 6to4 interface. @constant IFNET_FAMILY_FIREWIRE An IEEE 1394 (firewire) interface. @constant IFNET_FAMILY_BOND A virtual bonded interface. + @constant IFNET_FAMILY_CELLULAR A cellular interface. */ enum { @@ -83,7 +84,8 @@ enum { IFNET_FAMILY_FAITH = 11, IFNET_FAMILY_STF = 12, IFNET_FAMILY_FIREWIRE = 13, - IFNET_FAMILY_BOND = 14 + IFNET_FAMILY_BOND = 14, + IFNET_FAMILY_CELLULAR = 15 }; /*! @typedef ifnet_family_t @@ -129,6 +131,9 @@ typedef u_int32_t protocol_family_t; @constant IFNET_CSUM_UDP Hardware will calculate UDP checksums. @constant IFNET_CSUM_FRAGMENT Hardware will checksum IP fragments. @constant IFNET_IP_FRAGMENT Hardware will fragment IP packets. + @constant IFNET_CSUM_TCPIPV6 Hardware will calculate TCP IPv6 checksums. + @constant IFNET_CSUM_UDPIPV6 Hardware will calculate UDP IPv6 checksums. + @constant IFNET_IPV6_FRAGMENT Hardware will fragment IPv6 packets. @constant IFNET_VLAN_TAGGING Hardware will generate VLAN headers. @constant IFNET_VLAN_MTU Hardware supports VLAN MTU. @constant IFNET_MULTIPAGES Driver is capable of handling packets @@ -147,8 +152,15 @@ typedef u_int32_t protocol_family_t; If the Interface driver sets this flag, TCP will send larger frames (up to 64KB) as one frame to the adapter which will perform the final packetization. The maximum TSO segment supported by the interface can be set with "ifnet_set_tso_mtu". 
To retreive the real MTU - for the TCP connection the function "mbuf_get_tso_requested" is used by the driver. + for the TCP connection the function "mbuf_get_tso_requested" is used by the driver. Note + that if TSO is active, all the packets will be flagged for TSO, not just large packets. @constant IFNET_TSO_IPV6 Hardware supports IPv6 TCP Segment Offloading. + If the Interface driver sets this flag, TCP IPv6 will send larger frames (up to 64KB) as one + frame to the adapter which will perform the final packetization. The maximum TSO segment + supported by the interface can be set with "ifnet_set_tso_mtu". To retrieve the real MTU + for the TCP IPv6 connection the function "mbuf_get_tso_requested" is used by the driver. + Note that if TSO is active, all the packets will be flagged for TSO, not just large packets. + */ enum { @@ -157,6 +169,9 @@ enum { IFNET_CSUM_UDP = 0x00000004, IFNET_CSUM_FRAGMENT = 0x00000008, IFNET_IP_FRAGMENT = 0x00000010, + IFNET_CSUM_TCPIPV6 = 0x00000020, + IFNET_CSUM_UDPIPV6 = 0x00000040, + IFNET_IPV6_FRAGMENT = 0x00000080, #ifdef KERNEL_PRIVATE IFNET_CSUM_SUM16 = 0x00001000, #endif /* KERNEL_PRIVATE */ @@ -839,7 +854,7 @@ extern const char *ifnet_name(ifnet_t interface); @function ifnet_family @discussion Returns the family of the interface. @param interface Interface to retrieve the unit number from. - @result Unit number. + @result Interface family type. */ extern ifnet_family_t ifnet_family(ifnet_t interface); @@ -942,16 +957,92 @@ extern u_int32_t ifnet_idle_flags(ifnet_t interface); #endif /* KERNEL_PRIVATE */ +/*! + @function ifnet_set_capabilities_supported + @discussion Specify the capabilities supported by the interface. + @discussion This function lets you specify which capabilities are supported + by the interface. Typically this function is called by the driver when + the interface gets attached to the system. + The mask controls which capabilities are set or unset. 
+ The kernel will effectively take the lock, then set the + interface's flags to (if_capabilities & ~mask) | (new_caps & mask). + + This function is intended to be called by the driver. A kext + must not call this function on an interface the kext does not + own. + @param interface Interface to set the capabilities on. + @param new_caps The value of the capabilities that should be set or unset. These + flags are defined in net/if.h + @param mask Which capabilities should be affected. These + flags are defined in net/if.h + @result 0 on success otherwise the errno error. + */ +extern errno_t ifnet_set_capabilities_supported(ifnet_t interface, u_int32_t new_caps, + u_int32_t mask); + +/*! + @function ifnet_capabilities_supported + @discussion Retrieve the interface capabilities supported by the interface. + @param interface Interface to retrieve the capabilities from. + @result Flags. Capabilities flags are defined in net/if.h + */ +extern u_int32_t ifnet_capabilities_supported(ifnet_t interface); + +/*! + @function ifnet_set_capabilities_enabled + @discussion Enable and/or disable the interface capabilities to match new_caps. + @discussion Sets the interface capabilities to new_caps. This function + lets you specify which capabilities you want to change using the mask. + The kernel will effectively take the lock, then set the + interface's flags to (if_capenable & ~mask) | (new_caps & mask). + + This function is intended to be called by the driver. A kext + must not call this function on an interface the kext does not + own. + + Typically this function is called by the driver when the interface is + created to specify which of the supported capabilities are enabled by + default. This function is also meant to be called when the driver handles + the interface ioctl SIOCSIFCAP. + + The driver should call ifnet_set_offload() to indicate the corresponding + hardware offload bits that will be used by the networking stack. 
+ + It is an error to enable a capability that is not marked as + supported by the interface. + @param interface Interface to set the capabilities on. + @param new_caps The value of the capabilities that should be set or unset. These + flags are defined in net/if.h + @param mask Which capabilities should be affected. These + flags are defined in net/if.h + @result 0 on success otherwise the errno error. + */ +extern errno_t ifnet_set_capabilities_enabled(ifnet_t interface, u_int32_t new_caps, + u_int32_t mask); + +/*! + @function ifnet_capabilities_enabled + @discussion Retrieve the interface capabilities enabled on the interface. + @param interface Interface to retrieve the capabilities from. + @result Flags. Capabilities flags are defined in net/if.h + */ +extern u_int32_t ifnet_capabilities_enabled(ifnet_t interface); + + /*! @function ifnet_set_offload @discussion Sets a bitfield to indicate special hardware offload support provided by the interface such as hardware checksums and VLAN. This replaces the if_hwassist flags field. Any flags unrecognized by the stack will not be set. + + Note the system will automatically set the interface capabilities + that correspond to the offload flags modified -- i.e. the driver + does not have to call ifnet_set_capabilities_enabled() and + ifnet_set_capabilities_supported(). @param interface The interface. @param offload The new set of flags indicating which offload options the device supports. - @param mask The mask of flags to be modified. @result 0 on success otherwise the errno error. 
*/ extern errno_t ifnet_set_offload(ifnet_t interface, ifnet_offload_t offload); @@ -1446,6 +1537,11 @@ extern errno_t ifnet_get_address_list(ifnet_t interface, ifaddr_t **addresses); extern errno_t ifnet_get_address_list_family(ifnet_t interface, ifaddr_t **addresses, sa_family_t family); +#ifdef KERNEL_PRIVATE +__private_extern__ errno_t ifnet_get_address_list_family_internal(ifnet_t, + ifaddr_t **, sa_family_t, int, int); +#endif /* KERNEL_PRIVATE */ + /*! @function ifnet_free_address_list @discussion Free a list of addresses returned from @@ -1543,9 +1639,9 @@ extern errno_t ifnet_resolve_multicast(ifnet_t ifp, ifnet_remove_multicast and making sure you no longer have any references to the multicast. @param interface The interface. - @param maddr The multicast address to join. Either a physical - address or logical address to be translated to a physical - address. + @param maddr The multicast address (AF_UNSPEC/AF_LINK) to join. Either + a physical address or logical address to be translated to a + physical address. @param multicast The resulting ifmultiaddr_t multicast address. @result 0 on success otherwise the errno error. */ diff --git a/bsd/net/kpi_protocol.c b/bsd/net/kpi_protocol.c index a48cd249a..6c3043c94 100644 --- a/bsd/net/kpi_protocol.c +++ b/bsd/net/kpi_protocol.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2004-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -154,14 +154,14 @@ proto_register_input( } - lck_mtx_lock(thread->input_lck); + lck_mtx_lock(&thread->input_lck); entry->next = proto_input_add_list; proto_input_add_list = entry; thread->input_waiting |= DLIL_PROTO_REGISTER; if ((thread->input_waiting & DLIL_INPUT_RUNNING) == 0) wakeup((caddr_t)&thread->input_waiting); - lck_mtx_unlock(thread->input_lck); + lck_mtx_unlock(&thread->input_lck); return 0; } @@ -219,14 +219,14 @@ proto_input_run(void) mbuf_t packet_list; int i, locked = 0; - lck_mtx_assert(thread->input_lck, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_assert(&thread->input_lck, LCK_MTX_ASSERT_NOTOWNED); if ((thread->input_waiting & DLIL_PROTO_REGISTER) != 0) { - lck_mtx_lock(thread->input_lck); + lck_mtx_lock_spin(&thread->input_lck); entry = proto_input_add_list; proto_input_add_list = NULL; thread->input_waiting &= ~DLIL_PROTO_REGISTER; - lck_mtx_unlock(thread->input_lck); + lck_mtx_unlock(&thread->input_lck); proto_delayed_attach(entry); } /* @@ -237,7 +237,7 @@ proto_input_run(void) for (entry = proto_hash[i]; entry && proto_total_waiting; entry = entry->next) { if (entry->inject_first) { - lck_mtx_lock(thread->input_lck); + lck_mtx_lock_spin(&thread->input_lck); thread->input_waiting &= ~DLIL_PROTO_WAITING; packet_list = entry->inject_first; @@ -246,7 +246,7 @@ proto_input_run(void) entry->inject_last = NULL; proto_total_waiting--; - lck_mtx_unlock(thread->input_lck); + lck_mtx_unlock(&thread->input_lck); if (entry->domain && (entry->domain->dom_flags & DOM_REENTRANT) == 0) { lck_mtx_lock(entry->domain->dom_mtx); @@ -333,7 +333,7 @@ proto_inject( } if (entry) { - lck_mtx_lock(thread->input_lck); + lck_mtx_lock(&thread->input_lck); if (entry->inject_first == NULL) { proto_total_waiting++; thread->input_waiting |= DLIL_PROTO_WAITING; @@ -346,7 +346,7 @@ proto_inject( if ((thread->input_waiting & DLIL_INPUT_RUNNING) == 0) { wakeup((caddr_t)&thread->input_waiting); } - lck_mtx_unlock(thread->input_lck); + 
lck_mtx_unlock(&thread->input_lck); } else { diff --git a/bsd/net/multicast_list.c b/bsd/net/multicast_list.c index 68fbf23b0..e91aeeb11 100644 --- a/bsd/net/multicast_list.c +++ b/bsd/net/multicast_list.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2004-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -151,5 +151,6 @@ multicast_list_program(struct multicast_list * mc_list, (void)multicast_list_remove(mc_list); *mc_list = new_mc_list; } + ifnet_free_multicast_list(source_multicast_list); return (error); } diff --git a/bsd/net/ndrv.c b/bsd/net/ndrv.c index 1797d16f4..51c218910 100644 --- a/bsd/net/ndrv.c +++ b/bsd/net/ndrv.c @@ -100,6 +100,8 @@ extern struct domain ndrvdomain; extern struct protosw ndrvsw; extern lck_mtx_t *domain_proto_mtx; +#define NDRV_PROTODEMUX_COUNT 10 + /* * Verify these values match. * To keep clients from including dlil.h, we define @@ -703,6 +705,8 @@ ndrv_setspec(struct ndrv_cb *np, struct sockopt *sopt) return ENOTSUP; // version is too new! 
else if (ndrvSpec.version < 1) return EINVAL; // version is not valid + else if (ndrvSpec.demux_count > NDRV_PROTODEMUX_COUNT || ndrvSpec.demux_count == 0) + return EINVAL; // demux_count is not valid bzero(&proto_param, sizeof(proto_param)); proto_param.demux_count = ndrvSpec.demux_count; diff --git a/bsd/net/ndrv.h b/bsd/net/ndrv.h index 6f61df9f5..7e9fc9700 100644 --- a/bsd/net/ndrv.h +++ b/bsd/net/ndrv.h @@ -109,7 +109,7 @@ struct ndrv_demux_desc * Field: * version : must be NDRV_PROTOCOL_DESC_VERS * protocol_family : unique identifier for this protocol - * demux_count : number of demux_list descriptors in demux_list + * demux_count : number of demux_list descriptors in demux_list; maximum of 10 * demux_list : pointer to array of demux descriptors */ struct ndrv_protocol_desc diff --git a/bsd/net/net_osdep.h b/bsd/net/net_osdep.h index 4208380cf..a17921f57 100644 --- a/bsd/net/net_osdep.h +++ b/bsd/net/net_osdep.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -244,9 +244,10 @@ * NetBSD 1.5: always use IFAREF whenever reference gets added. * always use IFAFREE whenever reference gets freed. * IFAFREE frees ifaddr when ifa_refcnt reaches 0. - * Darwin: always use ifaref whenever reference gets added. - * always use ifafree whenever reference gets freed. - * ifaref and ifafree are responsible for determining when to free. + * Darwin: always use IFA_ADDREF whenever reference gets added. + * always use IFA_REMREF whenever reference gets freed. + * IFA_ADDREF and IFA_REMREF are responsible for determining + * when to free. * others: do not increase refcnt for ifp->if_addrlist and in_ifaddr. * use IFAFREE once when ifaddr is disconnected from * ifp->if_addrlist and in_ifaddr. 
IFAFREE frees ifaddr when @@ -267,11 +268,6 @@ extern const char *if_name(struct ifnet *); #define if_addrlist if_addrhead #define if_list if_link -/* sys/net/if.h */ -#ifndef __APPLE__ -#define IFAREF(ifa) do { ++(ifa)->ifa_refcnt; } while (0) -#endif - #define WITH_CONVERT_AND_STRIP_IP_LEN #if 1 /* at this moment, all OSes do this */ diff --git a/bsd/net/net_str_id.c b/bsd/net/net_str_id.c index 7f4fcd52f..bc28f03c4 100644 --- a/bsd/net/net_str_id.c +++ b/bsd/net/net_str_id.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008,2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -52,8 +52,6 @@ static lck_mtx_t *net_str_id_lock = NULL; static u_int32_t nsi_kind_next[NSI_MAX_KIND] = { FIRST_NET_STR_ID, FIRST_NET_STR_ID, FIRST_NET_STR_ID }; static u_int32_t nsi_next_id = FIRST_NET_STR_ID; -#if NETMIBS - extern int sysctl_if_family_ids SYSCTL_HANDLER_ARGS; SYSCTL_DECL(_net_link_generic_system); @@ -61,9 +59,6 @@ SYSCTL_DECL(_net_link_generic_system); SYSCTL_PROC(_net_link_generic_system, OID_AUTO, if_family_ids, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_if_family_ids, "S, if_family_id", "Interface Family ID table"); -#endif /* NETMIBS */ - - __private_extern__ void net_str_id_init(void) { @@ -153,8 +148,6 @@ net_str_id_find_internal(const char *string, u_int32_t *out_id, } -#if NETMIBS - #define ROUNDUP32(a) \ ((a) > 0 ? (1 + (((a) - 1) | (sizeof(uint32_t) - 1))) : sizeof(uint32_t)) @@ -210,6 +203,3 @@ sysctl_if_family_ids SYSCTL_HANDLER_ARGS /* XXX bad syntax! */ _FREE(iffmid, M_TEMP); return error; } - -#endif /* NETMIBS */ - diff --git a/bsd/net/netsrc.c b/bsd/net/netsrc.c new file mode 100644 index 000000000..2c1037c26 --- /dev/null +++ b/bsd/net/netsrc.c @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +static errno_t netsrc_ctlsend(kern_ctl_ref, uint32_t, void *, mbuf_t, int); +static errno_t netsrc_ctlconnect(kern_ctl_ref, struct sockaddr_ctl *, void **); +static errno_t netsrc_ipv4(kern_ctl_ref, uint32_t, struct netsrc_req *); +static errno_t netsrc_ipv6(kern_ctl_ref, uint32_t, struct netsrc_req *); + +static kern_ctl_ref netsrc_ctlref = NULL; + +__private_extern__ void +netsrc_init(void) +{ + errno_t error; + struct kern_ctl_reg netsrc_ctl = { + .ctl_connect = netsrc_ctlconnect, + .ctl_send = netsrc_ctlsend, + }; + + strlcpy(netsrc_ctl.ctl_name, NETSRC_CTLNAME, sizeof(NETSRC_CTLNAME)); + + if ((error = ctl_register(&netsrc_ctl, &netsrc_ctlref))) + printf("%s: ctl_register failed %d\n", __func__, error); +} + +static errno_t +netsrc_ctlconnect(kern_ctl_ref kctl, struct sockaddr_ctl *sac, void **uinfo) +{ +#pragma unused(kctl, sac, uinfo) + + /* + * We don't need to do anything here. This callback is only necessary + * for ctl_register() to succeed. + */ + return (0); +} + +static errno_t +netsrc_ctlsend(kern_ctl_ref kctl, uint32_t unit, void *uinfo, mbuf_t m, + int flags) +{ +#pragma unused(uinfo, flags) + errno_t error; + struct netsrc_req *nrq, storage; + + if (mbuf_pkthdr_len(m) < sizeof(*nrq)) { + error = EINVAL; + goto out; + } + if (mbuf_len(m) >= sizeof(*nrq)) + nrq = mbuf_data(m); + else { + mbuf_copydata(m, 0, sizeof(storage), &storage); + nrq = &storage; + } + /* We only have one version right now. 
*/ + if (nrq->nrq_ver != NETSRC_VERSION1) { + error = EINVAL; + goto out; + } + switch (nrq->nrq_sin.sin_family) { + case AF_INET: + error = netsrc_ipv4(kctl, unit, nrq); + break; + case AF_INET6: + error = netsrc_ipv6(kctl, unit, nrq); + break; + default: + printf("%s: invalid family\n", __func__); + error = EINVAL; + } +out: + mbuf_freem(m); + + return (error); + +} + +static errno_t +netsrc_ipv4(kern_ctl_ref kctl, uint32_t unit, struct netsrc_req *nrq) +{ + errno_t error = EHOSTUNREACH; + struct sockaddr_in *dstsin; + struct rtentry *rt; + struct in_ifaddr *ia; + struct netsrc_rep nrp; + struct sockaddr_in6 v4entry = { + .sin6_family = AF_INET6, + .sin6_len = sizeof(struct sockaddr_in6), + .sin6_addr = IN6ADDR_V4MAPPED_INIT, + }; + struct in6_addrpolicy *policy; + + dstsin = &nrq->nrq_sin; + + if (dstsin->sin_len < sizeof (*dstsin) || + dstsin->sin_addr.s_addr == INADDR_ANY) + return (EINVAL); + + lck_mtx_lock(rnh_lock); + rt = rt_lookup(TRUE, (struct sockaddr *)dstsin, NULL, + rt_tables[AF_INET], nrq->nrq_ifscope); + lck_mtx_unlock(rnh_lock); + if (!rt) + return (EHOSTUNREACH); + lck_rw_lock_shared(in_ifaddr_rwlock); + TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { + IFA_LOCK_SPIN(&ia->ia_ifa); + if (ia->ia_ifp == rt->rt_ifp) { + memset(&nrp, 0, sizeof(nrp)); + memcpy(&nrp.nrp_sin, IA_SIN(ia), sizeof(nrp.nrp_sin)); + IFA_UNLOCK(&ia->ia_ifa); + v4entry.sin6_addr.s6_addr32[3] = + nrp.nrp_sin.sin_addr.s_addr; + policy = in6_addrsel_lookup_policy(&v4entry); + if (policy->label != -1) { + nrp.nrp_label = policy->label; + nrp.nrp_precedence = policy->preced; + /* XXX might not be true */ + nrp.nrp_dstlabel = policy->label; + nrp.nrp_dstprecedence = policy->preced; + } + error = ctl_enqueuedata(kctl, unit, &nrp, + sizeof(nrp), CTL_DATA_EOR); + break; + } + IFA_UNLOCK(&ia->ia_ifa); + } + lck_rw_done(in_ifaddr_rwlock); + if (rt) + rtfree(rt); + + return (error); +} + +static errno_t +netsrc_ipv6(kern_ctl_ref kctl, uint32_t unit, struct netsrc_req *nrq) +{ + struct 
sockaddr_in6 *dstsin6; + struct in6_addr *in6, storage; + struct in6_ifaddr *ia; + struct route_in6 ro; + int error = EHOSTUNREACH; + struct netsrc_rep nrp; + + dstsin6 = &nrq->nrq_sin6; + + if (dstsin6->sin6_len < sizeof (*dstsin6) || + IN6_IS_ADDR_UNSPECIFIED(&dstsin6->sin6_addr)) + return (EINVAL); + + memset(&ro, 0, sizeof(ro)); + lck_mtx_lock(rnh_lock); + ro.ro_rt = rt_lookup(TRUE, (struct sockaddr *)dstsin6, NULL, + rt_tables[AF_INET6], nrq->nrq_ifscope); + lck_mtx_unlock(rnh_lock); + if (!ro.ro_rt) + return (EHOSTUNREACH); + in6 = in6_selectsrc(dstsin6, NULL, NULL, &ro, NULL, &storage, + nrq->nrq_ifscope, &error); + if (ro.ro_rt) + rtfree(ro.ro_rt); + if (!in6 || error) + return (error); + memset(&nrp, 0, sizeof(nrp)); + nrp.nrp_sin6.sin6_family = AF_INET6; + nrp.nrp_sin6.sin6_len = sizeof(nrp.nrp_sin6); + memcpy(&nrp.nrp_sin6.sin6_addr, in6, sizeof(nrp.nrp_sin6.sin6_addr)); + lck_rw_lock_shared(&in6_ifaddr_rwlock); + for (ia = in6_ifaddrs; ia; ia = ia->ia_next) { + if (memcmp(&ia->ia_addr.sin6_addr, in6, sizeof(*in6)) == 0) { + struct sockaddr_in6 sin6; + struct in6_addrpolicy *policy; + + if (ia->ia6_flags & IN6_IFF_TEMPORARY) + nrp.nrp_flags |= NETSRC_IP6_FLAG_TEMPORARY; + if (ia->ia6_flags & IN6_IFF_TENTATIVE) + nrp.nrp_flags |= NETSRC_IP6_FLAG_TENTATIVE; + if (ia->ia6_flags & IN6_IFF_DEPRECATED) + nrp.nrp_flags |= NETSRC_IP6_FLAG_DEPRECATED; + sin6.sin6_family = AF_INET6; + sin6.sin6_len = sizeof(sin6); + memcpy(&sin6.sin6_addr, in6, sizeof(*in6)); + policy = in6_addrsel_lookup_policy(&sin6); + if (policy->label != -1) { + nrp.nrp_label = policy->label; + nrp.nrp_precedence = policy->preced; + } + memcpy(&sin6.sin6_addr, &dstsin6->sin6_addr, + sizeof(dstsin6->sin6_addr)); + policy = in6_addrsel_lookup_policy(&sin6); + if (policy->label != -1) { + nrp.nrp_dstlabel = policy->label; + nrp.nrp_dstprecedence = policy->preced; + } + break; + } + } + lck_rw_done(&in6_ifaddr_rwlock); + error = ctl_enqueuedata(kctl, unit, &nrp, sizeof(nrp), + CTL_DATA_EOR); + + 
return (error); +} diff --git a/EXTERNAL_HEADERS/architecture/ppc/cframe.h b/bsd/net/netsrc.h similarity index 58% rename from EXTERNAL_HEADERS/architecture/ppc/cframe.h rename to bsd/net/netsrc.h index 0db3fce7d..54ba8d8be 100644 --- a/EXTERNAL_HEADERS/architecture/ppc/cframe.h +++ b/bsd/net/netsrc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,26 +25,46 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* Copyright (c) 1991 NeXT Software, Inc. All rights reserved. - * - * File: architecture/ppc/cframe.h - * Author: Mike DeMoney, NeXT Software, Inc. - * - * This include file defines C calling sequence defines - * for ppc port. - */ -#ifndef _ARCH_PPC_CFRAME_H_ -#define _ARCH_PPC_CFRAME_H_ - -#if defined (__ppc64__) -#define C_ARGSAVE_LEN 64 /* at least 64 bytes of arg save */ -#define C_STACK_ALIGN 32 /* stack must be 32 byte aligned */ -#define C_RED_ZONE 320 /* 320 bytes to skip over saved registers */ -#else -#define C_ARGSAVE_LEN 32 /* at least 32 bytes of arg save */ -#define C_STACK_ALIGN 16 /* stack must be 16 byte aligned */ -#define C_RED_ZONE 224 /* 224 bytes to skip over saved registers */ +#ifndef __NET_NETSRC_H__ + +#define NETSRC_CTLNAME "com.apple.netsrc" + +#define NETSRC_VERSION1 1 +#define NETSRC_CURVERS NETSRC_VERSION1 + +struct netsrc_req { + unsigned int nrq_ver; + unsigned int nrq_ifscope; + union { + struct sockaddr_in _usin; + struct sockaddr_in6 _usin6; + } _usa; +}; + +#define nrq_sin _usa._usin +#define nrq_sin6 _usa._usin6 + +struct netsrc_rep { + union { + struct sockaddr_in _usin; + struct sockaddr_in6 _usin6; + } _usa; +#define NETSRC_IP6_FLAG_TENTATIVE 0x0001 +#define NETSRC_IP6_FLAG_TEMPORARY 0x0002 +#define NETSRC_IP6_FLAG_DEPRECATED 0x0004 + uint16_t nrp_flags; + uint16_t nrp_label; + uint16_t nrp_precedence; + uint16_t nrp_dstlabel; + uint16_t nrp_dstprecedence; +}; + +#define 
nrp_sin _usa._usin +#define nrp_sin6 _usa._usin6 + +#ifdef KERNEL_PRIVATE +__private_extern__ void netsrc_init(void); #endif -#endif /* _ARCH_PPC_CFRAME_H_ */ +#endif /* __NET_NETSRC_H__ */ diff --git a/bsd/net/ntstat.c b/bsd/net/ntstat.c new file mode 100644 index 000000000..4bb6e1c28 --- /dev/null +++ b/bsd/net/ntstat.c @@ -0,0 +1,1954 @@ +/* + * Copyright (c) 2010-2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +__private_extern__ int nstat_collect = 1; +SYSCTL_INT(_net, OID_AUTO, statistics, CTLFLAG_RW | CTLFLAG_LOCKED, + &nstat_collect, 0, "Collect detailed statistics"); + +typedef struct nstat_control_state +{ + struct nstat_control_state *next; + u_int32_t watching; + decl_lck_mtx_data(, mtx); + kern_ctl_ref kctl; + u_int32_t unit; + nstat_src_ref_t next_srcref; + struct nstat_src *srcs; + int cleanup; + int suser; +} nstat_control_state; + +static void nstat_control_register(void); + +static volatile OSMallocTag nstat_malloc_tag = NULL; +static nstat_control_state *nstat_controls = NULL; +static uint64_t nstat_idle_time = 0ULL; +static decl_lck_mtx_data(, nstat_mtx); + +static void +nstat_copy_sa_out( + const struct sockaddr *src, + struct sockaddr *dst, + int maxlen) +{ + if (src->sa_len > maxlen) return; + + bcopy(src, dst, src->sa_len); + if (src->sa_family == AF_INET6 && + src->sa_len >= sizeof(struct sockaddr_in6)) + { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6*)dst; + if (IN6_IS_SCOPE_EMBED(&sin6->sin6_addr)) + { + if (sin6->sin6_scope_id == 0) + sin6->sin6_scope_id = ntohs(sin6->sin6_addr.__u6_addr.__u6_addr16[1]); + sin6->sin6_addr.__u6_addr.__u6_addr16[1] = 0; + } + } +} + +static void +nstat_ip_to_sockaddr( + const struct in_addr *ip, + u_int16_t port, + struct sockaddr_in *sin, + u_int32_t maxlen) +{ + if (maxlen < sizeof(struct sockaddr_in)) + return; + + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_port = port; + sin->sin_addr = *ip; +} + +static void +nstat_ip6_to_sockaddr( + const struct in6_addr *ip6, + u_int16_t port, + struct sockaddr_in6 *sin6, + u_int32_t maxlen) +{ + if (maxlen < 
sizeof(struct sockaddr_in6)) + return; + + sin6->sin6_family = AF_INET6; + sin6->sin6_len = sizeof(*sin6); + sin6->sin6_port = port; + sin6->sin6_addr = *ip6; + if (IN6_IS_SCOPE_EMBED(&sin6->sin6_addr)) + { + sin6->sin6_scope_id = ntohs(sin6->sin6_addr.__u6_addr.__u6_addr16[1]); + sin6->sin6_addr.__u6_addr.__u6_addr16[1] = 0; + } +} + +#pragma mark -- Network Statistic Providers -- + +typedef struct nstat_provider +{ + struct nstat_provider *next; + nstat_provider_id_t nstat_provider_id; + size_t nstat_descriptor_length; + errno_t (*nstat_lookup)(const void *data, u_int32_t length, nstat_provider_cookie_t *out_cookie); + int (*nstat_gone)(nstat_provider_cookie_t cookie); + errno_t (*nstat_counts)(nstat_provider_cookie_t cookie, struct nstat_counts *out_counts, int *out_gone); + errno_t (*nstat_watcher_add)(nstat_control_state *state); + void (*nstat_watcher_remove)(nstat_control_state *state); + errno_t (*nstat_copy_descriptor)(nstat_provider_cookie_t cookie, void *data, u_int32_t len); + void (*nstat_release)(nstat_provider_cookie_t cookie); +} nstat_provider; + +static errno_t nstat_control_source_add(u_int64_t context, nstat_control_state *state, nstat_provider *provider, nstat_provider_cookie_t cookie); +struct nstat_provider *nstat_providers = NULL; + +static struct nstat_provider* +nstat_find_provider_by_id( + nstat_provider_id_t id) +{ + struct nstat_provider *provider; + + for (provider = nstat_providers; provider != NULL; provider = provider->next) + { + if (provider->nstat_provider_id == id) + break; + } + + return provider; +} + +static errno_t +nstat_lookup_entry( + nstat_provider_id_t id, + const void *data, + u_int32_t length, + nstat_provider **out_provider, + nstat_provider_cookie_t *out_cookie) +{ + *out_provider = nstat_find_provider_by_id(id); + if (*out_provider == NULL) + { + printf("%s:%d: provider %u not found\n", __FUNCTION__, __LINE__, id); + return ENOENT; + } + + return (*out_provider)->nstat_lookup(data, length, out_cookie); +} + +static 
void nstat_init_route_provider(void); +static void nstat_init_tcp_provider(void); +static void nstat_init_udp_provider(void); + +static void +nstat_init(void) +{ + if (nstat_malloc_tag != NULL) return; + + OSMallocTag tag = OSMalloc_Tagalloc(NET_STAT_CONTROL_NAME, OSMT_DEFAULT); + if (!OSCompareAndSwapPtr(NULL, tag, &nstat_malloc_tag)) + { + OSMalloc_Tagfree(tag); + tag = nstat_malloc_tag; + } + else + { + // we need to initialize other things, we do it here as this code path will only be hit once; + nstat_init_route_provider(); + nstat_init_tcp_provider(); + nstat_init_udp_provider(); + nstat_control_register(); + } +} + +#pragma mark -- Aligned Buffer Allocation -- + +struct align_header +{ + u_int32_t offset; + u_int32_t length; +}; + +static void* +nstat_malloc_aligned( + u_int32_t length, + u_int8_t alignment, + OSMallocTag tag) +{ + struct align_header *hdr = NULL; + u_int32_t size = length + sizeof(*hdr) + alignment - 1; + + u_int8_t *buffer = OSMalloc(size, tag); + if (buffer == NULL) return NULL; + + u_int8_t *aligned = buffer + sizeof(*hdr); + aligned = (u_int8_t*)P2ROUNDUP(aligned, alignment); + + hdr = (struct align_header*)(aligned - sizeof(*hdr)); + hdr->offset = aligned - buffer; + hdr->length = size; + + return aligned; +} + +static void +nstat_free_aligned( + void *buffer, + OSMallocTag tag) +{ + struct align_header *hdr = (struct align_header*)((u_int8_t*)buffer - sizeof(*hdr)); + OSFree(((char*)buffer) - hdr->offset, hdr->length, tag); +} + +#pragma mark -- Route Provider -- + +static nstat_provider nstat_route_provider; + +static errno_t +nstat_route_lookup( + const void *data, + u_int32_t length, + nstat_provider_cookie_t *out_cookie) +{ + // rt_lookup doesn't take const params but it doesn't modify the parameters for + // the lookup. So...we use a union to eliminate the warning. 
+ union + { + struct sockaddr *sa; + const struct sockaddr *const_sa; + } dst, mask; + + const nstat_route_add_param *param = (const nstat_route_add_param*)data; + *out_cookie = NULL; + + if (length < sizeof(*param)) + { + printf("%s:%d: expected %lu byte param, received %u\n", __FUNCTION__, __LINE__, sizeof(*param), length); + return EINVAL; + } + + if (param->dst.v4.sin_family == 0 || + param->dst.v4.sin_family > AF_MAX || + (param->mask.v4.sin_family != 0 && param->mask.v4.sin_family != param->dst.v4.sin_family)) + { + printf("%s:%d invalid family (dst=%d, mask=%d)\n", __FUNCTION__, __LINE__, + param->dst.v4.sin_family, param->mask.v4.sin_family); + return EINVAL; + } + + if (param->dst.v4.sin_len > sizeof(param->dst) || + (param->mask.v4.sin_family && param->mask.v4.sin_len > sizeof(param->mask.v4.sin_len))) + { + printf("%s:%d invalid length (dst=%d, mask=%d)\n", __FUNCTION__, __LINE__, + param->dst.v4.sin_len, param->mask.v4.sin_len); + } + + // TBD: Need to validate length of sockaddr for different families? + dst.const_sa = (const struct sockaddr*)¶m->dst; + mask.const_sa = param->mask.v4.sin_family ? (const struct sockaddr*)¶m->mask : NULL; + + struct radix_node_head *rnh = rt_tables[dst.sa->sa_family]; + if (rnh == NULL) return EAFNOSUPPORT; + + lck_mtx_lock(rnh_lock); + struct rtentry *rt = rt_lookup(TRUE, dst.sa, mask.sa, rnh, param->ifindex); + lck_mtx_unlock(rnh_lock); + + if (rt) *out_cookie = (nstat_provider_cookie_t)rt; + + return rt ? 0 : ENOENT; +} + +static int +nstat_route_gone( + nstat_provider_cookie_t cookie) +{ + struct rtentry *rt = (struct rtentry*)cookie; + return ((rt->rt_flags & RTF_UP) == 0) ? 
1 : 0; +} + +static errno_t +nstat_route_counts( + nstat_provider_cookie_t cookie, + struct nstat_counts *out_counts, + int *out_gone) +{ + struct rtentry *rt = (struct rtentry*)cookie; + struct nstat_counts *rt_stats = rt->rt_stats; + + *out_gone = 0; + + if ((rt->rt_flags & RTF_UP) == 0) *out_gone = 1; + + if (rt_stats) + { + atomic_get_64(out_counts->nstat_rxpackets, &rt_stats->nstat_rxpackets); + atomic_get_64(out_counts->nstat_rxbytes, &rt_stats->nstat_rxbytes); + atomic_get_64(out_counts->nstat_txpackets, &rt_stats->nstat_txpackets); + atomic_get_64(out_counts->nstat_txbytes, &rt_stats->nstat_txbytes); + out_counts->nstat_rxduplicatebytes = rt_stats->nstat_rxduplicatebytes; + out_counts->nstat_rxoutoforderbytes = rt_stats->nstat_rxoutoforderbytes; + out_counts->nstat_txretransmit = rt_stats->nstat_txretransmit; + out_counts->nstat_connectattempts = rt_stats->nstat_connectattempts; + out_counts->nstat_connectsuccesses = rt_stats->nstat_connectsuccesses; + out_counts->nstat_min_rtt = rt_stats->nstat_min_rtt; + out_counts->nstat_avg_rtt = rt_stats->nstat_avg_rtt; + out_counts->nstat_var_rtt = rt_stats->nstat_var_rtt; + } + else + bzero(out_counts, sizeof(*out_counts)); + + return 0; +} + +static void +nstat_route_release( + nstat_provider_cookie_t cookie) +{ + rtfree((struct rtentry*)cookie); +} + +static u_int32_t nstat_route_watchers = 0; + +static int +nstat_route_walktree_add( + struct radix_node *rn, + void *context) +{ + errno_t result = 0; + struct rtentry *rt = (struct rtentry *)rn; + nstat_control_state *state = (nstat_control_state*)context; + + lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); + + /* RTF_UP can't change while rnh_lock is held */ + if ((rt->rt_flags & RTF_UP) != 0) + { + /* Clear RTPRF_OURS if the route is still usable */ + RT_LOCK(rt); + if (rt_validate(rt)) { + RT_ADDREF_LOCKED(rt); + RT_UNLOCK(rt); + } else { + RT_UNLOCK(rt); + rt = NULL; + } + + /* Otherwise if RTF_CONDEMNED, treat it as if it were down */ + if (rt == NULL) + return 
(0); + + result = nstat_control_source_add(0, state, &nstat_route_provider, rt); + if (result != 0) + rtfree_locked(rt); + } + + return result; +} + +static errno_t +nstat_route_add_watcher( + nstat_control_state *state) +{ + int i; + errno_t result = 0; + OSIncrementAtomic(&nstat_route_watchers); + + lck_mtx_lock(rnh_lock); + for (i = 1; i < AF_MAX; i++) + { + struct radix_node_head *rnh; + rnh = rt_tables[i]; + if (!rnh) continue; + + result = rnh->rnh_walktree(rnh, nstat_route_walktree_add, state); + if (result != 0) + { + printf("%s:%d rnh_walktree failed: %d\n", __FUNCTION__, __LINE__, result); + break; + } + } + lck_mtx_unlock(rnh_lock); + + return result; +} + +__private_extern__ void +nstat_route_new_entry( + struct rtentry *rt) +{ + if (nstat_route_watchers == 0) + return; + + lck_mtx_lock(&nstat_mtx); + if ((rt->rt_flags & RTF_UP) != 0) + { + nstat_control_state *state; + for (state = nstat_controls; state; state = state->next) + { + if ((state->watching & (1 << NSTAT_PROVIDER_ROUTE)) != 0) + { + // this client is watching routes + // acquire a reference for the route + RT_ADDREF(rt); + + // add the source, if that fails, release the reference + if (nstat_control_source_add(0, state, &nstat_route_provider, rt) != 0) + RT_REMREF(rt); + } + } + } + lck_mtx_unlock(&nstat_mtx); +} + +static void +nstat_route_remove_watcher( + __unused nstat_control_state *state) +{ + OSDecrementAtomic(&nstat_route_watchers); +} + +static errno_t +nstat_route_copy_descriptor( + nstat_provider_cookie_t cookie, + void *data, + u_int32_t len) +{ + nstat_route_descriptor *desc = (nstat_route_descriptor*)data; + if (len < sizeof(*desc)) + { + printf("%s:%d invalid length, wanted %lu, got %d\n", __FUNCTION__, __LINE__, sizeof(*desc), len); + return EINVAL; + } + bzero(desc, sizeof(*desc)); + + struct rtentry *rt = (struct rtentry*)cookie; + desc->id = (uintptr_t)rt; + desc->parent_id = (uintptr_t)rt->rt_parent; + desc->gateway_id = (uintptr_t)rt->rt_gwroute; + + + // key/dest + 
struct sockaddr *sa; + if ((sa = rt_key(rt))) + nstat_copy_sa_out(sa, &desc->dst.sa, sizeof(desc->dst)); + + // mask + if ((sa = rt_mask(rt)) && sa->sa_len <= sizeof(desc->mask)) + memcpy(&desc->mask, sa, sa->sa_len); + + // gateway + if ((sa = rt->rt_gateway)) + nstat_copy_sa_out(sa, &desc->gateway.sa, sizeof(desc->gateway)); + + if (rt->rt_ifp) + desc->ifindex = rt->rt_ifp->if_index; + + desc->flags = rt->rt_flags; + + return 0; +} + +static void +nstat_init_route_provider(void) +{ + bzero(&nstat_route_provider, sizeof(nstat_route_provider)); + nstat_route_provider.nstat_descriptor_length = sizeof(nstat_route_descriptor); + nstat_route_provider.nstat_provider_id = NSTAT_PROVIDER_ROUTE; + nstat_route_provider.nstat_lookup = nstat_route_lookup; + nstat_route_provider.nstat_gone = nstat_route_gone; + nstat_route_provider.nstat_counts = nstat_route_counts; + nstat_route_provider.nstat_release = nstat_route_release; + nstat_route_provider.nstat_watcher_add = nstat_route_add_watcher; + nstat_route_provider.nstat_watcher_remove = nstat_route_remove_watcher; + nstat_route_provider.nstat_copy_descriptor = nstat_route_copy_descriptor; + nstat_route_provider.next = nstat_providers; + nstat_providers = &nstat_route_provider; +} + +#pragma mark -- Route Collection -- + +static struct nstat_counts* +nstat_route_attach( + struct rtentry *rte) +{ + struct nstat_counts *result = rte->rt_stats; + if (result) return result; + + if (nstat_malloc_tag == NULL) nstat_init(); + + result = nstat_malloc_aligned(sizeof(*result), sizeof(u_int64_t), nstat_malloc_tag); + if (!result) return result; + + bzero(result, sizeof(*result)); + + if (!OSCompareAndSwapPtr(NULL, result, &rte->rt_stats)) + { + nstat_free_aligned(result, nstat_malloc_tag); + result = rte->rt_stats; + } + + return result; +} + +__private_extern__ void +nstat_route_detach( + struct rtentry *rte) +{ + if (rte->rt_stats) + { + nstat_free_aligned(rte->rt_stats, nstat_malloc_tag); + rte->rt_stats = NULL; + } +} + 
+__private_extern__ void +nstat_route_connect_attempt( + struct rtentry *rte) +{ + while (rte) + { + struct nstat_counts* stats = nstat_route_attach(rte); + if (stats) + { + OSIncrementAtomic(&stats->nstat_connectattempts); + } + + rte = rte->rt_parent; + } +} + +__private_extern__ void +nstat_route_connect_success( + struct rtentry *rte) +{ + // This route + while (rte) + { + struct nstat_counts* stats = nstat_route_attach(rte); + if (stats) + { + OSIncrementAtomic(&stats->nstat_connectsuccesses); + } + + rte = rte->rt_parent; + } +} + +__private_extern__ void +nstat_route_tx( + struct rtentry *rte, + u_int32_t packets, + u_int32_t bytes, + u_int32_t flags) +{ + while (rte) + { + struct nstat_counts* stats = nstat_route_attach(rte); + if (stats) + { + if ((flags & NSTAT_TX_FLAG_RETRANSMIT) != 0) + { + OSAddAtomic(bytes, &stats->nstat_txretransmit); + } + else + { + OSAddAtomic64((SInt64)packets, (SInt64*)&stats->nstat_txpackets); + OSAddAtomic64((SInt64)bytes, (SInt64*)&stats->nstat_txbytes); + } + } + + rte = rte->rt_parent; + } +} + +__private_extern__ void +nstat_route_rx( + struct rtentry *rte, + u_int32_t packets, + u_int32_t bytes, + u_int32_t flags) +{ + while (rte) + { + struct nstat_counts* stats = nstat_route_attach(rte); + if (stats) + { + if (flags == 0) + { + OSAddAtomic64((SInt64)packets, (SInt64*)&stats->nstat_rxpackets); + OSAddAtomic64((SInt64)bytes, (SInt64*)&stats->nstat_rxbytes); + } + else + { + if (flags & NSTAT_RX_FLAG_OUT_OF_ORDER) + OSAddAtomic(bytes, &stats->nstat_rxoutoforderbytes); + if (flags & NSTAT_RX_FLAG_DUPLICATE) + OSAddAtomic(bytes, &stats->nstat_rxduplicatebytes); + } + } + + rte = rte->rt_parent; + } +} + +__private_extern__ void +nstat_route_rtt( + struct rtentry *rte, + u_int32_t rtt, + u_int32_t rtt_var) +{ + const int32_t factor = 8; + + while (rte) + { + struct nstat_counts* stats = nstat_route_attach(rte); + if (stats) + { + int32_t oldrtt; + int32_t newrtt; + + // average + do + { + oldrtt = stats->nstat_avg_rtt; + if 
(oldrtt == 0) + { + newrtt = rtt; + } + else + { + newrtt = oldrtt - (oldrtt - (int32_t)rtt) / factor; + } + if (oldrtt == newrtt) break; + } while (!OSCompareAndSwap(oldrtt, newrtt, &stats->nstat_avg_rtt)); + + // minimum + do + { + oldrtt = stats->nstat_min_rtt; + if (oldrtt != 0 && oldrtt < (int32_t)rtt) + { + break; + } + } while (!OSCompareAndSwap(oldrtt, rtt, &stats->nstat_min_rtt)); + + // variance + do + { + oldrtt = stats->nstat_var_rtt; + if (oldrtt == 0) + { + newrtt = rtt_var; + } + else + { + newrtt = oldrtt - (oldrtt - (int32_t)rtt_var) / factor; + } + if (oldrtt == newrtt) break; + } while (!OSCompareAndSwap(oldrtt, newrtt, &stats->nstat_var_rtt)); + } + + rte = rte->rt_parent; + } +} + +#pragma mark -- TCP Provider -- + +static nstat_provider nstat_tcp_provider; + +static errno_t +nstat_tcpudp_lookup( + struct inpcbinfo *inpinfo, + const void *data, + u_int32_t length, + nstat_provider_cookie_t *out_cookie) +{ + // parameter validation + const nstat_tcp_add_param *param = (const nstat_tcp_add_param*)data; + if (length < sizeof(*param)) + { + printf("%s:%d expected %lu byte param, received %u\n", __FUNCTION__, __LINE__, sizeof(*param), length); + return EINVAL; + } + + // src and dst must match + if (param->remote.v4.sin_family != 0 && + param->remote.v4.sin_family != param->local.v4.sin_family) + { + printf("%s:%d src family (%d) and dst family (%d) don't match\n", + __FUNCTION__, __LINE__, param->local.v4.sin_family, param->remote.v4.sin_family); + return EINVAL; + } + + struct inpcb *inp = NULL; + + switch (param->local.v4.sin_family) + { + case AF_INET: + { + if (param->local.v4.sin_len != sizeof(param->local.v4) || + (param->remote.v4.sin_family != 0 && + param->remote.v4.sin_len != sizeof(param->remote.v4))) + { + printf("%s:%d invalid length for v4 src (%d) or dst (%d), should be %lu\n", + __FUNCTION__, __LINE__, param->local.v4.sin_len, param->remote.v4.sin_len, + sizeof(param->remote.v4)); + return EINVAL; + } + + inp = 
in_pcblookup_hash(inpinfo, param->remote.v4.sin_addr, param->remote.v4.sin_port, + param->local.v4.sin_addr, param->local.v4.sin_port, 1, NULL); + } + break; + +#if INET6 + case AF_INET6: + { + union + { + const struct in6_addr *in6c; + struct in6_addr *in6; + } local, remote; + + if (param->local.v6.sin6_len != sizeof(param->local.v6) || + (param->remote.v6.sin6_family != 0 && + param->remote.v6.sin6_len != sizeof(param->remote.v6))) + { + printf("%s:%d invalid length for v6 src (%d) or dst (%d), should be %lu\n", + __FUNCTION__, __LINE__, param->local.v6.sin6_len, param->remote.v6.sin6_len, + sizeof(param->remote.v6)); + return EINVAL; + } + + local.in6c = ¶m->local.v6.sin6_addr; + remote.in6c = ¶m->remote.v6.sin6_addr; + + inp = in6_pcblookup_hash(inpinfo, remote.in6, param->remote.v6.sin6_port, + local.in6, param->local.v6.sin6_port, 1, NULL); + } + break; +#endif + + default: + printf("%s:%d unsupported address family %d\n", __FUNCTION__, __LINE__, param->local.v4.sin_family); + return EINVAL; + } + + if (inp == NULL) return ENOENT; + + // At this point we have a ref to the inpcb + *out_cookie = inp; + return 0; +} + +static errno_t +nstat_tcp_lookup( + const void *data, + u_int32_t length, + nstat_provider_cookie_t *out_cookie) +{ + return nstat_tcpudp_lookup(&tcbinfo, data, length, out_cookie); +} + +static int +nstat_tcp_gone( + nstat_provider_cookie_t cookie) +{ + struct inpcb *inp = (struct inpcb*)cookie; + struct tcpcb *tp = intotcpcb(inp); + return (inp->inp_state == INPCB_STATE_DEAD || tp->t_state == TCPS_TIME_WAIT) ? 
1 : 0; +} + +static errno_t +nstat_tcp_counts( + nstat_provider_cookie_t cookie, + struct nstat_counts *out_counts, + int *out_gone) +{ + struct inpcb *inp = (struct inpcb*)cookie; + struct tcpcb *tp = intotcpcb(inp); + + bzero(out_counts, sizeof(*out_counts)); + + *out_gone = 0; + + // if the pcb is in the dead state, we should stop using it + if (inp->inp_state == INPCB_STATE_DEAD || tp->t_state == TCPS_TIME_WAIT) + { + *out_gone = 1; + } + + if (tp->t_state > TCPS_LISTEN) + { + atomic_get_64(out_counts->nstat_rxpackets, &inp->inp_stat->rxpackets); + atomic_get_64(out_counts->nstat_rxbytes, &inp->inp_stat->rxbytes); + atomic_get_64(out_counts->nstat_txpackets, &inp->inp_stat->txpackets); + atomic_get_64(out_counts->nstat_txbytes, &inp->inp_stat->txbytes); + out_counts->nstat_rxduplicatebytes = tp->t_stat.rxduplicatebytes; + out_counts->nstat_rxoutoforderbytes = tp->t_stat.rxoutoforderbytes; + out_counts->nstat_txretransmit = tp->t_stat.txretransmitbytes; + out_counts->nstat_connectattempts = tp->t_state >= TCPS_SYN_SENT ? 1 : 0; + out_counts->nstat_connectsuccesses = tp->t_state >= TCPS_ESTABLISHED ? 1 : 0; + out_counts->nstat_avg_rtt = tp->t_srtt; + out_counts->nstat_min_rtt = tp->t_rttbest; + out_counts->nstat_var_rtt = tp->t_rttvar; + } + + return 0; +} + +static void +nstat_tcp_release( + nstat_provider_cookie_t cookie) +{ + struct inpcb *inp = (struct inpcb*)cookie; + in_pcb_checkstate(inp, WNT_RELEASE, 0); +} + +static u_int32_t nstat_tcp_watchers = 0; + +static errno_t +nstat_tcp_add_watcher( + nstat_control_state *state) +{ + OSIncrementAtomic(&nstat_tcp_watchers); + + lck_rw_lock_shared(tcbinfo.mtx); + + // Add all current tcp inpcbs. 
Ignore those in timewait + struct inpcb *inp; + for (inp = LIST_FIRST(tcbinfo.listhead); inp; inp = LIST_NEXT(inp, inp_list)) + { + if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) + continue; + + if (nstat_control_source_add(0, state, &nstat_tcp_provider, inp) != 0) + { + in_pcb_checkstate(inp, WNT_RELEASE, 0); + break; + } + } + + lck_rw_done(tcbinfo.mtx); + + return 0; +} + +static void +nstat_tcp_remove_watcher( + __unused nstat_control_state *state) +{ + OSDecrementAtomic(&nstat_tcp_watchers); +} + +__private_extern__ void +nstat_tcp_new_pcb( + struct inpcb *inp) +{ + if (nstat_tcp_watchers == 0) + return; + + lck_mtx_lock(&nstat_mtx); + nstat_control_state *state; + for (state = nstat_controls; state; state = state->next) + { + if ((state->watching & (1 << NSTAT_PROVIDER_TCP)) != 0) + { + // this client is watching tcp + // acquire a reference for it + if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) + break; + + // add the source, if that fails, release the reference + if (nstat_control_source_add(0, state, &nstat_tcp_provider, inp) != 0) + { + in_pcb_checkstate(inp, WNT_RELEASE, 0); + break; + } + } + } + lck_mtx_unlock(&nstat_mtx); +} + +static errno_t +nstat_tcp_copy_descriptor( + nstat_provider_cookie_t cookie, + void *data, + u_int32_t len) +{ + if (len < sizeof(nstat_tcp_descriptor)) + { + printf("%s:%d invalid length, wanted %lu, got %d\n", __FUNCTION__, __LINE__, sizeof(nstat_tcp_descriptor), len); + return EINVAL; + } + + nstat_tcp_descriptor *desc = (nstat_tcp_descriptor*)data; + struct inpcb *inp = (struct inpcb*)cookie; + struct tcpcb *tp = intotcpcb(inp); + + bzero(desc, sizeof(*desc)); + + if (inp->inp_vflag & INP_IPV6) + { + nstat_ip6_to_sockaddr(&inp->in6p_laddr, inp->inp_lport, + &desc->local.v6, sizeof(desc->local)); + nstat_ip6_to_sockaddr(&inp->in6p_faddr, inp->inp_fport, + &desc->remote.v6, sizeof(desc->remote)); + } + else if (inp->inp_vflag & INP_IPV4) + { + nstat_ip_to_sockaddr(&inp->inp_laddr, inp->inp_lport, 
+ &desc->local.v4, sizeof(desc->local)); + nstat_ip_to_sockaddr(&inp->inp_faddr, inp->inp_fport, + &desc->remote.v4, sizeof(desc->remote)); + } + + desc->state = intotcpcb(inp)->t_state; + if (inp->inp_route.ro_rt && inp->inp_route.ro_rt->rt_ifp) + desc->ifindex = inp->inp_route.ro_rt->rt_ifp->if_index; + + // danger - not locked, values could be bogus + desc->txunacked = tp->snd_max - tp->snd_una; + desc->txwindow = tp->snd_wnd; + desc->txcwindow = tp->snd_cwnd; + + struct socket *so = inp->inp_socket; + if (so) + { + // TBD - take the socket lock around these to make sure + // they're in sync? + desc->upid = so->last_upid; + desc->pid = so->last_pid; + + proc_name(desc->pid, desc->pname, sizeof(desc->pname)); + desc->pname[sizeof(desc->pname) - 1] = 0; + + desc->sndbufsize = so->so_snd.sb_hiwat; + desc->sndbufused = so->so_snd.sb_cc; + desc->rcvbufsize = so->so_rcv.sb_hiwat; + desc->rcvbufused = so->so_rcv.sb_cc; + } + + return 0; +} + +static void +nstat_init_tcp_provider(void) +{ + bzero(&nstat_tcp_provider, sizeof(nstat_tcp_provider)); + nstat_tcp_provider.nstat_descriptor_length = sizeof(nstat_tcp_descriptor); + nstat_tcp_provider.nstat_provider_id = NSTAT_PROVIDER_TCP; + nstat_tcp_provider.nstat_lookup = nstat_tcp_lookup; + nstat_tcp_provider.nstat_gone = nstat_tcp_gone; + nstat_tcp_provider.nstat_counts = nstat_tcp_counts; + nstat_tcp_provider.nstat_release = nstat_tcp_release; + nstat_tcp_provider.nstat_watcher_add = nstat_tcp_add_watcher; + nstat_tcp_provider.nstat_watcher_remove = nstat_tcp_remove_watcher; + nstat_tcp_provider.nstat_copy_descriptor = nstat_tcp_copy_descriptor; + nstat_tcp_provider.next = nstat_providers; + nstat_providers = &nstat_tcp_provider; +} + +#pragma mark -- UDP Provider -- + +static nstat_provider nstat_udp_provider; + +static errno_t +nstat_udp_lookup( + const void *data, + u_int32_t length, + nstat_provider_cookie_t *out_cookie) +{ + return nstat_tcpudp_lookup(&udbinfo, data, length, out_cookie); +} + +static int 
+nstat_udp_gone( + nstat_provider_cookie_t cookie) +{ + struct inpcb *inp = (struct inpcb*)cookie; + return (inp->inp_state == INPCB_STATE_DEAD) ? 1 : 0; +} + +static errno_t +nstat_udp_counts( + nstat_provider_cookie_t cookie, + struct nstat_counts *out_counts, + int *out_gone) +{ + struct inpcb *inp = (struct inpcb*)cookie; + + *out_gone = 0; + + // if the pcb is in the dead state, we should stop using it + if (inp->inp_state == INPCB_STATE_DEAD) + { + *out_gone = 1; + } + + atomic_get_64(out_counts->nstat_rxpackets, &inp->inp_stat->rxpackets); + atomic_get_64(out_counts->nstat_rxbytes, &inp->inp_stat->rxbytes); + atomic_get_64(out_counts->nstat_txpackets, &inp->inp_stat->txpackets); + atomic_get_64(out_counts->nstat_txbytes, &inp->inp_stat->txbytes); + + return 0; +} + +static void +nstat_udp_release( + nstat_provider_cookie_t cookie) +{ + struct inpcb *inp = (struct inpcb*)cookie; + in_pcb_checkstate(inp, WNT_RELEASE, 0); +} + +static u_int32_t nstat_udp_watchers = 0; + +static errno_t +nstat_udp_add_watcher( + nstat_control_state *state) +{ + OSIncrementAtomic(&nstat_udp_watchers); + + lck_rw_lock_shared(tcbinfo.mtx); + + // Add all current tcp inpcbs. 
Ignore those in timewait + struct inpcb *inp; + for (inp = LIST_FIRST(udbinfo.listhead); inp; inp = LIST_NEXT(inp, inp_list)) + { + if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) + continue; + + if (nstat_control_source_add(0, state, &nstat_udp_provider, inp) != 0) + { + in_pcb_checkstate(inp, WNT_RELEASE, 0); + break; + } + } + + lck_rw_done(tcbinfo.mtx); + + return 0; +} + +static void +nstat_udp_remove_watcher( + __unused nstat_control_state *state) +{ + OSDecrementAtomic(&nstat_udp_watchers); +} + +__private_extern__ void +nstat_udp_new_pcb( + struct inpcb *inp) +{ + if (nstat_udp_watchers == 0) + return; + + lck_mtx_lock(&nstat_mtx); + nstat_control_state *state; + for (state = nstat_controls; state; state = state->next) + { + if ((state->watching & (1 << NSTAT_PROVIDER_UDP)) != 0) + { + // this client is watching tcp + // acquire a reference for it + if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) + break; + + // add the source, if that fails, release the reference + if (nstat_control_source_add(0, state, &nstat_udp_provider, inp) != 0) + { + in_pcb_checkstate(inp, WNT_RELEASE, 0); + break; + } + } + } + lck_mtx_unlock(&nstat_mtx); +} + +static errno_t +nstat_udp_copy_descriptor( + nstat_provider_cookie_t cookie, + void *data, + u_int32_t len) +{ + if (len < sizeof(nstat_udp_descriptor)) + { + printf("%s:%d invalid length, wanted %lu, got %d\n", __FUNCTION__, __LINE__, sizeof(nstat_tcp_descriptor), len); + return EINVAL; + } + + nstat_udp_descriptor *desc = (nstat_udp_descriptor*)data; + struct inpcb *inp = (struct inpcb*)cookie; + + bzero(desc, sizeof(*desc)); + + if (inp->inp_vflag & INP_IPV6) + { + nstat_ip6_to_sockaddr(&inp->in6p_laddr, inp->inp_lport, + &desc->local.v6, sizeof(desc->local)); + nstat_ip6_to_sockaddr(&inp->in6p_faddr, inp->inp_fport, + &desc->remote.v6, sizeof(desc->remote)); + } + else if (inp->inp_vflag & INP_IPV4) + { + nstat_ip_to_sockaddr(&inp->inp_laddr, inp->inp_lport, + &desc->local.v4, 
sizeof(desc->local)); + nstat_ip_to_sockaddr(&inp->inp_faddr, inp->inp_fport, + &desc->remote.v4, sizeof(desc->remote)); + } + + if (inp->inp_route.ro_rt && inp->inp_route.ro_rt->rt_ifp) + desc->ifindex = inp->inp_route.ro_rt->rt_ifp->if_index; + + struct socket *so = inp->inp_socket; + if (so) + { + // TBD - take the socket lock around these to make sure + // they're in sync? + desc->upid = so->last_upid; + desc->pid = so->last_pid; + + desc->rcvbufsize = so->so_rcv.sb_hiwat; + desc->rcvbufused = so->so_rcv.sb_cc; + + proc_name(desc->pid, desc->pname, sizeof(desc->pname)); + desc->pname[sizeof(desc->pname) - 1] = 0; + } + + return 0; +} + +static void +nstat_init_udp_provider(void) +{ + bzero(&nstat_udp_provider, sizeof(nstat_udp_provider)); + nstat_udp_provider.nstat_provider_id = NSTAT_PROVIDER_UDP; + nstat_udp_provider.nstat_descriptor_length = sizeof(nstat_udp_descriptor); + nstat_udp_provider.nstat_lookup = nstat_udp_lookup; + nstat_udp_provider.nstat_gone = nstat_udp_gone; + nstat_udp_provider.nstat_counts = nstat_udp_counts; + nstat_udp_provider.nstat_watcher_add = nstat_udp_add_watcher; + nstat_udp_provider.nstat_watcher_remove = nstat_udp_remove_watcher; + nstat_udp_provider.nstat_copy_descriptor = nstat_udp_copy_descriptor; + nstat_udp_provider.nstat_release = nstat_udp_release; + nstat_udp_provider.next = nstat_providers; + nstat_providers = &nstat_udp_provider; +} + +#pragma mark -- Kernel Control Socket -- + +typedef struct nstat_src +{ + struct nstat_src *next; + nstat_src_ref_t srcref; + nstat_provider *provider; + nstat_provider_cookie_t cookie; +} nstat_src; + +static kern_ctl_ref nstat_ctlref = NULL; +static lck_grp_t *nstat_lck_grp = NULL; + +static errno_t nstat_control_connect(kern_ctl_ref kctl, struct sockaddr_ctl *sac, void **uinfo); +static errno_t nstat_control_disconnect(kern_ctl_ref kctl, u_int32_t unit, void *uinfo); +static errno_t nstat_control_send(kern_ctl_ref kctl, u_int32_t unit, void *uinfo, mbuf_t m, int flags); +static int 
nstat_control_send_description(nstat_control_state *state, nstat_src *src, u_int64_t context); +static void nstat_control_cleanup_source(nstat_control_state *state, struct nstat_src *src); + + +static void* +nstat_idle_check( + __unused thread_call_param_t p0, + __unused thread_call_param_t p1) +{ + lck_mtx_lock(&nstat_mtx); + + nstat_idle_time = 0ULL; + + nstat_control_state *control; + nstat_src *dead = NULL; + nstat_src *dead_list = NULL; + for (control = nstat_controls; control; control = control->next) + { + lck_mtx_lock(&control->mtx); + nstat_src **srcpp = &control->srcs; + + while(*srcpp != NULL) + { + if ((*srcpp)->provider->nstat_gone((*srcpp)->cookie)) + { + // Pull it off the list + dead = *srcpp; + *srcpp = (*srcpp)->next; + + // send a last description + nstat_control_send_description(control, dead, 0ULL); + + // send the source removed notification + nstat_msg_src_removed removed; + removed.hdr.type = NSTAT_MSG_TYPE_SRC_REMOVED; + removed.hdr.context = 0; + removed.srcref = dead->srcref; + errno_t result = ctl_enqueuedata(control->kctl, control->unit, &removed, sizeof(removed), CTL_DATA_EOR); + if (result != 0) printf("%s:%d ctl_enqueuedata failed: %d\n", __FUNCTION__, __LINE__, result); + + // Put this on the list to release later + dead->next = dead_list; + dead_list = dead; + } + else + { + srcpp = &(*srcpp)->next; + } + } + lck_mtx_unlock(&control->mtx); + } + + if (nstat_controls) + { + clock_interval_to_deadline(60, NSEC_PER_SEC, &nstat_idle_time); + thread_call_func_delayed((thread_call_func_t)nstat_idle_check, NULL, nstat_idle_time); + } + + lck_mtx_unlock(&nstat_mtx); + + // Release the sources now that we aren't holding lots of locks + while (dead_list) + { + dead = dead_list; + dead_list = dead->next; + + nstat_control_cleanup_source(NULL, dead); + } + + return NULL; +} + +static void +nstat_control_register(void) +{ + // Create our lock group first + lck_grp_attr_t *grp_attr = lck_grp_attr_alloc_init(); + 
lck_grp_attr_setdefault(grp_attr); + nstat_lck_grp = lck_grp_alloc_init("network statistics kctl", grp_attr); + lck_grp_attr_free(grp_attr); + + lck_mtx_init(&nstat_mtx, nstat_lck_grp, NULL); + + // Register the control + struct kern_ctl_reg nstat_control; + bzero(&nstat_control, sizeof(nstat_control)); + strlcpy(nstat_control.ctl_name, NET_STAT_CONTROL_NAME, sizeof(nstat_control.ctl_name)); + nstat_control.ctl_connect = nstat_control_connect; + nstat_control.ctl_disconnect = nstat_control_disconnect; + nstat_control.ctl_send = nstat_control_send; + + errno_t result = ctl_register(&nstat_control, &nstat_ctlref); + if (result != 0) + printf("%s:%d ctl_register failed: %d", __FUNCTION__, __LINE__, result); +} + +static void +nstat_control_cleanup_source( + nstat_control_state *state, + struct nstat_src *src) +{ + if (state) + { + nstat_msg_src_removed removed; + removed.hdr.type = NSTAT_MSG_TYPE_SRC_REMOVED; + removed.hdr.context = 0; + removed.srcref = src->srcref; + errno_t result = ctl_enqueuedata(state->kctl, state->unit, &removed, sizeof(removed), CTL_DATA_EOR); + if (result != 0) printf("%s:%d ctl_enqueuedata failed: %d\n", __FUNCTION__, __LINE__, result); + } + + // Cleanup the source if we found it. 
+ src->provider->nstat_release(src->cookie); + OSFree(src, sizeof(*src), nstat_malloc_tag); +} + +static errno_t +nstat_control_connect( + kern_ctl_ref kctl, + struct sockaddr_ctl *sac, + void **uinfo) +{ + nstat_control_state *state = OSMalloc(sizeof(*state), nstat_malloc_tag); + if (state == NULL) return ENOMEM; + + bzero(state, sizeof(*state)); + lck_mtx_init(&state->mtx, nstat_lck_grp, NULL); + state->kctl = kctl; + state->unit = sac->sc_unit; + *uinfo = state; + + // check if we're super user + proc_t pself = proc_self(); + state->suser = proc_suser(pself) == 0; + proc_rele(pself); + + lck_mtx_lock(&nstat_mtx); + state->next = nstat_controls; + nstat_controls = state; + + if (nstat_idle_time == 0ULL) + { + clock_interval_to_deadline(60, NSEC_PER_SEC, &nstat_idle_time); + thread_call_func_delayed((thread_call_func_t)nstat_idle_check, NULL, nstat_idle_time); + } + + lck_mtx_unlock(&nstat_mtx); + + return 0; +} + +static errno_t +nstat_control_disconnect( + __unused kern_ctl_ref kctl, + __unused u_int32_t unit, + __unused void *uinfo) +{ + u_int32_t watching; + nstat_control_state *state = (nstat_control_state*)uinfo; + + // pull it out of the global list of states + lck_mtx_lock(&nstat_mtx); + nstat_control_state **statepp; + for (statepp = &nstat_controls; *statepp; statepp = &(*statepp)->next) + { + if (*statepp == state) + { + *statepp = state->next; + break; + } + } + lck_mtx_unlock(&nstat_mtx); + + lck_mtx_lock(&state->mtx); + // Stop watching for sources + nstat_provider *provider; + watching = state->watching; + state->watching = 0; + for (provider = nstat_providers; provider && watching; provider = provider->next) + { + if ((watching & (1 << provider->nstat_provider_id)) != 0) + { + watching &= ~(1 << provider->nstat_provider_id); + provider->nstat_watcher_remove(state); + } + } + + // set cleanup flags + state->cleanup = TRUE; + + // Copy out the list of sources + nstat_src *srcs = state->srcs; + state->srcs = NULL; + lck_mtx_unlock(&state->mtx); + + 
while (srcs) + { + nstat_src *src; + + // pull it out of the list + src = srcs; + srcs = src->next; + + // clean it up + nstat_control_cleanup_source(NULL, src); + } + + OSFree(state, sizeof(*state), nstat_malloc_tag); + + return 0; +} + +static nstat_src_ref_t +nstat_control_next_src_ref( + nstat_control_state *state) +{ + int i = 0; + nstat_src_ref_t toReturn = NSTAT_SRC_REF_INVALID; + + for (i = 0; i < 1000 && toReturn == NSTAT_SRC_REF_INVALID; i++) + { + if (state->next_srcref == NSTAT_SRC_REF_INVALID || + state->next_srcref == NSTAT_SRC_REF_ALL) + { + state->next_srcref = 1; + } + + nstat_src *src; + for (src = state->srcs; src; src = src->next) + { + if (src->srcref == state->next_srcref) + break; + } + + if (src == NULL) toReturn = state->next_srcref; + state->next_srcref++; + } + + return toReturn; +} + +static int +nstat_control_send_description( + nstat_control_state *state, + nstat_src *src, + u_int64_t context) +{ + // Provider doesn't support getting the descriptor? Done. + if (src->provider->nstat_descriptor_length == 0 || + src->provider->nstat_copy_descriptor == NULL) + { + lck_mtx_unlock(&state->mtx); + printf("%s:%d - provider doesn't support descriptions\n", __FUNCTION__, __LINE__); + return EOPNOTSUPP; + } + + // Allocate storage for the descriptor message + mbuf_t msg; + unsigned int one = 1; + u_int32_t size = offsetof(nstat_msg_src_description, data) + src->provider->nstat_descriptor_length; + if (mbuf_allocpacket(MBUF_WAITOK, size, &one, &msg) != 0) + { + lck_mtx_unlock(&state->mtx); + printf("%s:%d - failed to allocate response\n", __FUNCTION__, __LINE__); + return ENOMEM; + } + + nstat_msg_src_description *desc = (nstat_msg_src_description*)mbuf_data(msg); + mbuf_setlen(msg, size); + mbuf_pkthdr_setlen(msg, mbuf_len(msg)); + + // Query the provider for the provider specific bits + errno_t result = src->provider->nstat_copy_descriptor(src->cookie, desc->data, src->provider->nstat_descriptor_length); + + if (result != 0) + { + 
mbuf_freem(msg); + printf("%s:%d - provider failed to copy descriptor %d\n", __FUNCTION__, __LINE__, result); + return result; + } + + desc->hdr.context = context; + desc->hdr.type = NSTAT_MSG_TYPE_SRC_DESC; + desc->srcref = src->srcref; + desc->provider = src->provider->nstat_provider_id; + + result = ctl_enqueuembuf(state->kctl, state->unit, msg, CTL_DATA_EOR); + if (result != 0) + { + printf("%s:%d ctl_enqueuembuf returned error %d\n", __FUNCTION__, __LINE__, result); + mbuf_freem(msg); + } + + return result; +} + +static errno_t +nstat_control_handle_add_request( + nstat_control_state *state, + mbuf_t m) +{ + errno_t result; + + // Verify the header fits in the first mbuf + if (mbuf_len(m) < offsetof(nstat_msg_add_src_req, param)) + { + printf("mbuf_len(m)=%lu, offsetof(nstat_msg_add_src_req*, param)=%lu\n", + mbuf_len(m), offsetof(nstat_msg_add_src_req, param)); + return EINVAL; + } + + // Calculate the length of the parameter field + int32_t paramlength = mbuf_pkthdr_len(m) - offsetof(nstat_msg_add_src_req, param); + if (paramlength < 0 || paramlength > 2 * 1024) + { + printf("invalid paramlength=%d\n", paramlength); + return EINVAL; + } + + nstat_provider *provider; + nstat_provider_cookie_t cookie; + nstat_msg_add_src_req *req = mbuf_data(m); + if (mbuf_pkthdr_len(m) > mbuf_len(m)) + { + // parameter is too large, we need to make a contiguous copy + void *data = OSMalloc(paramlength, nstat_malloc_tag); + + if (!data) return ENOMEM; + result = mbuf_copydata(m, offsetof(nstat_msg_add_src_req, param), paramlength, data); + if (result == 0) + result = nstat_lookup_entry(req->provider, data, paramlength, &provider, &cookie); + OSFree(data, paramlength, nstat_malloc_tag); + } + else + { + result = nstat_lookup_entry(req->provider, (void*)&req->param, paramlength, &provider, &cookie); + } + + if (result != 0) + { + printf("nstat_lookup_entry failed: %d\n", result); + return result; + } + + result = nstat_control_source_add(req->hdr.context, state, provider, 
cookie); + if (result != 0) + provider->nstat_release(cookie); + + return result; +} + +static int +nstat_perm_check( + __unused nstat_control_state *state) +{ + int allow = 0; +#if !REQUIRE_ROOT_FOR_STATS + allow = 1; +#else + // If the socket was created by a priv process, allow + if (state->suser) return 1; + + // If the current process is priv, allow + proc_t self = proc_self(); + allow = proc_suser(self) == 0; + proc_rele(self); + + // TBD: check for entitlement, root check is too coarse +#endif /* REQUIRE_ROOT_FOR_STATS */ + + return allow; +} + +static errno_t +nstat_control_handle_add_all( + nstat_control_state *state, + mbuf_t m) +{ + errno_t result = 0; + + if (!nstat_perm_check(state)) + { + return EPERM; + } + + // Verify the header fits in the first mbuf + if (mbuf_len(m) < sizeof(nstat_msg_add_all_srcs)) + { + printf("mbuf_len(m)=%lu, sizeof(nstat_msg_add_all_srcs)=%lu\n", + mbuf_len(m), sizeof(nstat_msg_add_all_srcs)); + return EINVAL; + } + + nstat_msg_add_all_srcs *req = mbuf_data(m); + nstat_provider *provider = nstat_find_provider_by_id(req->provider); + + if (!provider) return ENOENT; + if (provider->nstat_watcher_add == NULL) return ENOTSUP; + + // Make sure we don't add the provider twice + lck_mtx_lock(&state->mtx); + if ((state->watching & (1 << provider->nstat_provider_id)) != 0) + result = EALREADY; + state->watching |= (1 << provider->nstat_provider_id); + lck_mtx_unlock(&state->mtx); + if (result != 0) return result; + + result = provider->nstat_watcher_add(state); + if (result != 0) + { + lck_mtx_lock(&state->mtx); + state->watching &= ~(1 << provider->nstat_provider_id); + lck_mtx_unlock(&state->mtx); + } + + if (result == 0) + { + // Notify the client + nstat_msg_hdr success; + success.context = req->hdr.context; + success.type = NSTAT_MSG_TYPE_SUCCESS; + success.pad = 0; + if (ctl_enqueuedata(state->kctl, state->unit, &success, sizeof(success), CTL_DATA_EOR) != 0) + printf("%s:%d - failed to enqueue success message\n", __FUNCTION__, 
__LINE__); + } + + return result; +} + +static errno_t +nstat_control_source_add( + u_int64_t context, + nstat_control_state *state, + nstat_provider *provider, + nstat_provider_cookie_t cookie) +{ + // Fill out source added message + mbuf_t msg = NULL; + unsigned int one = 1; + + if (mbuf_allocpacket(MBUF_WAITOK, sizeof(nstat_msg_src_added), &one, &msg) != 0) + return ENOMEM; + + mbuf_setlen(msg, sizeof(nstat_msg_src_added)); + mbuf_pkthdr_setlen(msg, mbuf_len(msg)); + nstat_msg_src_added *add = mbuf_data(msg); + bzero(add, sizeof(*add)); + add->hdr.type = NSTAT_MSG_TYPE_SRC_ADDED; + add->hdr.context = context; + add->provider = provider->nstat_provider_id; + + // Allocate storage for the source + nstat_src *src = OSMalloc(sizeof(*src), nstat_malloc_tag); + if (src == NULL) + { + mbuf_freem(msg); + return ENOMEM; + } + + // Fill in the source, including picking an unused source ref + lck_mtx_lock(&state->mtx); + + add->srcref = src->srcref = nstat_control_next_src_ref(state); + if (state->cleanup || src->srcref == NSTAT_SRC_REF_INVALID) + { + lck_mtx_unlock(&state->mtx); + OSFree(src, sizeof(*src), nstat_malloc_tag); + mbuf_freem(msg); + return EINVAL; + } + src->provider = provider; + src->cookie = cookie; + + // send the source added message + errno_t result = ctl_enqueuembuf(state->kctl, state->unit, msg, CTL_DATA_EOR); + if (result != 0) + { + lck_mtx_unlock(&state->mtx); + printf("%s:%d ctl_enqueuembuf failed: %d\n", __FUNCTION__, __LINE__, result); + OSFree(src, sizeof(*src), nstat_malloc_tag); + mbuf_freem(msg); + return result; + } + + // Put the source in the list + src->next = state->srcs; + state->srcs = src; + + // send the description message + // not useful as the source is often not complete +// nstat_control_send_description(state, src, 0ULL); + + lck_mtx_unlock(&state->mtx); + + return 0; +} + +static errno_t +nstat_control_handle_remove_request( + nstat_control_state *state, + mbuf_t m) +{ + nstat_src_ref_t srcref = NSTAT_SRC_REF_INVALID; + + if 
(mbuf_copydata(m, offsetof(nstat_msg_rem_src_req, srcref), sizeof(srcref), &srcref) != 0) + { + printf("%s:%d - invalid length %u, expected %lu\n", __FUNCTION__, __LINE__, (u_int32_t)mbuf_pkthdr_len(m), sizeof(nstat_msg_rem_src_req)); + return EINVAL; + } + + lck_mtx_lock(&state->mtx); + + // Remove this source as we look for it + nstat_src **nextp; + nstat_src *src = NULL; + for (nextp = &state->srcs; *nextp; nextp = &(*nextp)->next) + { + if ((*nextp)->srcref == srcref) + { + src = *nextp; + *nextp = src->next; + break; + } + } + + lck_mtx_unlock(&state->mtx); + + if (src) nstat_control_cleanup_source(state, src); + + return src ? 0 : ENOENT; +} + +static errno_t +nstat_control_handle_query_request( + nstat_control_state *state, + mbuf_t m) +{ + // TBD: handle this from another thread so we can enqueue a lot of data + // As written, if a client requests query all, this function will be + // called from their send of the request message. We will attempt to write + // responses and succeed until the buffer fills up. Since the clients thread + // is blocked on send, it won't be reading unless the client has two threads + // using this socket, one for read and one for write. Two threads probably + // won't work with this code anyhow since we don't have proper locking in + // place yet. 
+ nstat_src *dead_srcs = NULL; + errno_t result = ENOENT; + nstat_msg_query_src_req req; + if (mbuf_copydata(m, 0, sizeof(req), &req) != 0) + { + printf("%s:%d - invalid length %u, expected %lu\n", __FUNCTION__, __LINE__, (u_int32_t)mbuf_pkthdr_len(m), sizeof(req)); + return EINVAL; + } + + lck_mtx_lock(&state->mtx); + nstat_src **srcpp = &state->srcs; + while (*srcpp != NULL) + { + int gone; + gone = 0; + + if (req.srcref == NSTAT_SRC_REF_ALL || + (*srcpp)->srcref == req.srcref) + { + nstat_msg_src_counts counts; + counts.hdr.type = NSTAT_MSG_TYPE_SRC_COUNTS; + counts.hdr.context = req.hdr.context; + counts.srcref = (*srcpp)->srcref; + bzero(&counts.counts, sizeof(counts.counts)); + result = (*srcpp)->provider->nstat_counts((*srcpp)->cookie, &counts.counts, &gone); + + if (result == 0) + { + result = ctl_enqueuedata(state->kctl, state->unit, &counts, sizeof(counts), CTL_DATA_EOR); + if (result != 0) + { + printf("%s:%d ctl_enqueuedata failed: %d\n", __FUNCTION__, __LINE__, result); + } + } + else + { + printf("%s:%d provider->nstat_counts failed: %d\n", __FUNCTION__, __LINE__, result); + } + + if (gone) + { + // send one last descriptor message so client may see last state + nstat_control_send_description(state, *srcpp, 0ULL); + + // pull src out of the list + nstat_src *src = *srcpp; + *srcpp = src->next; + + src->next = dead_srcs; + dead_srcs = src; + } + + if (req.srcref != NSTAT_SRC_REF_ALL) + break; + } + + if (!gone) + srcpp = &(*srcpp)->next; + } + lck_mtx_unlock(&state->mtx); + + while (dead_srcs) + { + nstat_src *src; + + src = dead_srcs; + dead_srcs = src->next; + + // release src and send notification + nstat_control_cleanup_source(state, src); + } + + if (req.srcref == NSTAT_SRC_REF_ALL) + { + nstat_msg_hdr success; + success.context = req.hdr.context; + success.type = NSTAT_MSG_TYPE_SUCCESS; + success.pad = 0; + if (ctl_enqueuedata(state->kctl, state->unit, &success, sizeof(success), CTL_DATA_EOR) != 0) + printf("%s:%d - failed to enqueue success 
message\n", __FUNCTION__, __LINE__); + result = 0; + } + + return result; +} + +static errno_t +nstat_control_handle_get_src_description( + nstat_control_state *state, + mbuf_t m) +{ + nstat_msg_get_src_description req; + if (mbuf_copydata(m, 0, sizeof(req), &req) != 0) + { + printf("%s:%d - invalid length %u, expected %lu\n", __FUNCTION__, __LINE__, (u_int32_t)mbuf_pkthdr_len(m), sizeof(req)); + return EINVAL; + } + + // Find the source + lck_mtx_lock(&state->mtx); + nstat_src *src; + for (src = state->srcs; src; src = src->next) + { + if (src->srcref == req.srcref) + break; + } + + // No source? Done. + if (!src) + { + lck_mtx_unlock(&state->mtx); + printf("%s:%d - no matching source\n", __FUNCTION__, __LINE__); + return ENOENT; + } + + errno_t result = nstat_control_send_description(state, src, req.hdr.context); + lck_mtx_unlock(&state->mtx); + + return result; +} + +static errno_t +nstat_control_send( + kern_ctl_ref kctl, + u_int32_t unit, + __unused void *uinfo, + mbuf_t m, + __unused int flags) +{ + nstat_control_state *state = (nstat_control_state*)uinfo; + struct nstat_msg_hdr *hdr; + struct nstat_msg_hdr storage; + errno_t result = 0; + + if (mbuf_pkthdr_len(m) < sizeof(hdr)) + { + // Is this the right thing to do? 
+ printf("%s:%d - message too short, was %ld expected %lu\n", __FUNCTION__, __LINE__, + mbuf_pkthdr_len(m), sizeof(*hdr)); + mbuf_freem(m); + return EINVAL; + } + + if (mbuf_len(m) >= sizeof(*hdr)) + { + hdr = mbuf_data(m); + } + else + { + mbuf_copydata(m, 0, sizeof(storage), &storage); + hdr = &storage; + } + + switch (hdr->type) + { + case NSTAT_MSG_TYPE_ADD_SRC: + result = nstat_control_handle_add_request(state, m); + break; + + case NSTAT_MSG_TYPE_ADD_ALL_SRCS: + result = nstat_control_handle_add_all(state, m); + break; + + case NSTAT_MSG_TYPE_REM_SRC: + result = nstat_control_handle_remove_request(state, m); + break; + + case NSTAT_MSG_TYPE_QUERY_SRC: + result = nstat_control_handle_query_request(state, m); + break; + + case NSTAT_MSG_TYPE_GET_SRC_DESC: + result = nstat_control_handle_get_src_description(state, m); + break; + + default: + printf("%s:%d - unknown message type %d\n", __FUNCTION__, __LINE__, hdr->type); + result = EINVAL; + break; + } + + if (result != 0) + { + struct nstat_msg_error err; + + err.hdr.type = NSTAT_MSG_TYPE_ERROR; + err.hdr.context = hdr->context; + err.error = result; + + result = ctl_enqueuedata(kctl, unit, &err, sizeof(err), CTL_DATA_EOR); + } + + mbuf_freem(m); + + return result; +} diff --git a/bsd/net/ntstat.h b/bsd/net/ntstat.h new file mode 100644 index 000000000..4bbb3dc1b --- /dev/null +++ b/bsd/net/ntstat.h @@ -0,0 +1,348 @@ +/* + * Copyright (c) 2010-2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef __NTSTAT_H__ +#define __NTSTAT_H__ +#include + +#ifdef PRIVATE +#pragma pack(push, 4) +#pragma mark -- Common Data Structures -- + +#define __NSTAT_REVISION__ 1 + +typedef u_int32_t nstat_provider_id_t; +typedef u_int32_t nstat_src_ref_t; + +typedef struct nstat_counts +{ + /* Counters */ + u_int64_t nstat_rxpackets __attribute__((aligned(8))); + u_int64_t nstat_rxbytes __attribute__((aligned(8))); + u_int64_t nstat_txpackets __attribute__((aligned(8))); + u_int64_t nstat_txbytes __attribute__((aligned(8))); + + u_int32_t nstat_rxduplicatebytes; + u_int32_t nstat_rxoutoforderbytes; + u_int32_t nstat_txretransmit; + + u_int32_t nstat_connectattempts; + u_int32_t nstat_connectsuccesses; + + u_int32_t nstat_min_rtt; + u_int32_t nstat_avg_rtt; + u_int32_t nstat_var_rtt; +} nstat_counts; + +#pragma mark -- Network Statistics Providers -- + +enum +{ + NSTAT_PROVIDER_ROUTE = 1 + ,NSTAT_PROVIDER_TCP = 2 + ,NSTAT_PROVIDER_UDP = 3 +}; + +typedef struct nstat_route_add_param +{ + union + { + struct 
sockaddr_in v4; + struct sockaddr_in6 v6; + } dst; + union + { + struct sockaddr_in v4; + struct sockaddr_in6 v6; + } mask; + u_int32_t ifindex; +} nstat_route_add_param; + +typedef struct nstat_tcp_add_param +{ + union + { + struct sockaddr_in v4; + struct sockaddr_in6 v6; + } local; + union + { + struct sockaddr_in v4; + struct sockaddr_in6 v6; + } remote; +} nstat_tcp_add_param; + +typedef struct nstat_tcp_descriptor +{ + union + { + struct sockaddr_in v4; + struct sockaddr_in6 v6; + } local; + + union + { + struct sockaddr_in v4; + struct sockaddr_in6 v6; + } remote; + + u_int32_t ifindex; + + u_int32_t state; + + u_int32_t sndbufsize; + u_int32_t sndbufused; + u_int32_t rcvbufsize; + u_int32_t rcvbufused; + u_int32_t txunacked; + u_int32_t txwindow; + u_int32_t txcwindow; + + u_int64_t upid; + u_int32_t pid; + char pname[64]; +} nstat_tcp_descriptor; + +typedef struct nstat_tcp_add_param nstat_udp_add_param; + +typedef struct nstat_udp_descriptor +{ + union + { + struct sockaddr_in v4; + struct sockaddr_in6 v6; + } local; + + union + { + struct sockaddr_in v4; + struct sockaddr_in6 v6; + } remote; + + u_int32_t ifindex; + + u_int32_t rcvbufsize; + u_int32_t rcvbufused; + + u_int64_t upid; + u_int32_t pid; + char pname[64]; +} nstat_udp_descriptor; + +typedef struct nstat_route_descriptor +{ + u_int64_t id; + u_int64_t parent_id; + u_int64_t gateway_id; + + union + { + struct sockaddr_in v4; + struct sockaddr_in6 v6; + struct sockaddr sa; + } dst; + + union + { + struct sockaddr_in v4; + struct sockaddr_in6 v6; + struct sockaddr sa; + } mask; + + union + { + struct sockaddr_in v4; + struct sockaddr_in6 v6; + struct sockaddr sa; + } gateway; + + u_int32_t ifindex; + u_int32_t flags; + +} nstat_route_descriptor; + +#pragma mark -- Network Statistics User Client -- + +#define NET_STAT_CONTROL_NAME "com.apple.network.statistics" + +enum +{ + // generic response messages + NSTAT_MSG_TYPE_SUCCESS = 0 + ,NSTAT_MSG_TYPE_ERROR = 1 + + // Requests + 
,NSTAT_MSG_TYPE_ADD_SRC = 1001 + ,NSTAT_MSG_TYPE_ADD_ALL_SRCS = 1002 + ,NSTAT_MSG_TYPE_REM_SRC = 1003 + ,NSTAT_MSG_TYPE_QUERY_SRC = 1004 + ,NSTAT_MSG_TYPE_GET_SRC_DESC = 1005 + + // Responses/Notifications + ,NSTAT_MSG_TYPE_SRC_ADDED = 10001 + ,NSTAT_MSG_TYPE_SRC_REMOVED = 10002 + ,NSTAT_MSG_TYPE_SRC_DESC = 10003 + ,NSTAT_MSG_TYPE_SRC_COUNTS = 10004 +}; + +enum +{ + NSTAT_SRC_REF_ALL = 0xffffffff + ,NSTAT_SRC_REF_INVALID = 0 +}; + +typedef struct nstat_msg_hdr +{ + u_int64_t context; + u_int32_t type; + u_int32_t pad; // unused for now +} nstat_msg_hdr; + +typedef struct nstat_msg_error +{ + nstat_msg_hdr hdr; + u_int32_t error; // errno error +} nstat_msg_error; + +typedef struct nstat_msg_add_src +{ + nstat_msg_hdr hdr; + nstat_provider_id_t provider; + u_int8_t param[]; +} nstat_msg_add_src_req; + +typedef struct nstat_msg_add_all_srcs +{ + nstat_msg_hdr hdr; + nstat_provider_id_t provider; +} nstat_msg_add_all_srcs; + +typedef struct nstat_msg_src_added +{ + nstat_msg_hdr hdr; + nstat_provider_id_t provider; + nstat_src_ref_t srcref; +} nstat_msg_src_added; + +typedef struct nstat_msg_rem_src +{ + nstat_msg_hdr hdr; + nstat_src_ref_t srcref; +} nstat_msg_rem_src_req; + +typedef struct nstat_msg_get_src_description +{ + nstat_msg_hdr hdr; + nstat_src_ref_t srcref; +} nstat_msg_get_src_description; + +typedef struct nstat_msg_src_description +{ + nstat_msg_hdr hdr; + nstat_src_ref_t srcref; + nstat_provider_id_t provider; + u_int8_t data[]; +} nstat_msg_src_description; + +typedef struct nstat_msg_query_src +{ + nstat_msg_hdr hdr; + nstat_src_ref_t srcref; +} nstat_msg_query_src_req; + +typedef struct nstat_msg_src_counts +{ + nstat_msg_hdr hdr; + nstat_src_ref_t srcref; + nstat_counts counts; +} nstat_msg_src_counts; + +typedef struct nstat_msg_src_removed +{ + nstat_msg_hdr hdr; + nstat_src_ref_t srcref; +} nstat_msg_src_removed; + +#pragma pack(pop) + +#endif /* PRIVATE */ + +#ifdef XNU_KERNEL_PRIVATE +#include + +#pragma mark -- Generic Network Statistics 
Provider -- + +typedef void * nstat_provider_cookie_t; + +#pragma mark -- Route Statistics Gathering Functions -- +struct rtentry; + +enum +{ + NSTAT_TX_FLAG_RETRANSMIT = 1 +}; + +enum +{ + NSTAT_RX_FLAG_DUPLICATE = 1, + NSTAT_RX_FLAG_OUT_OF_ORDER = 2 +}; + +// indicates whether or not collection of statistics is enabled +extern int nstat_collect; + +// Route collection routines +void nstat_route_connect_attempt(struct rtentry *rte); +void nstat_route_connect_success(struct rtentry *rte); +void nstat_route_tx(struct rtentry *rte, u_int32_t packets, u_int32_t bytes, u_int32_t flags); +void nstat_route_rx(struct rtentry *rte, u_int32_t packets, u_int32_t bytes, u_int32_t flags); +void nstat_route_rtt(struct rtentry *rte, u_int32_t rtt, u_int32_t rtt_var); +void nstat_route_detach(struct rtentry *rte); + +// watcher support +struct inpcb; +void nstat_tcp_new_pcb(struct inpcb *inp); +void nstat_udp_new_pcb(struct inpcb *inp); +void nstat_route_new_entry(struct rtentry *rt); + +// locked_add_64 uses atomic operations on 32bit so the 64bit +// value can be properly read. The values are only ever incremented +// while under the socket lock, so on 64bit we don't actually need +// atomic operations to increment. +#if defined(__LP64__) +#define locked_add_64(__addr, __count) do { \ + *(__addr) += (__count); \ +} while (0) +#else +#define locked_add_64(__addr, __count) do { \ + atomic_add_64((__addr), (__count)); \ +} while (0) +#endif + +#endif /* XNU_KERNEL_PRIVATE */ + +#endif /* __NTSTAT_H__ */ diff --git a/bsd/net/pf.c b/bsd/net/pf.c index 47134bda3..62b39f2ad 100644 --- a/bsd/net/pf.c +++ b/bsd/net/pf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2009 Apple Inc. All rights reserved. + * Copyright (c) 2007-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -379,6 +379,7 @@ pf_lazy_makewritable(struct pf_pdesc *pd, struct mbuf *m, int len) pd->lmw = len; if (len >= 0 && m != pd->mp) { pd->mp = m; + pd->pf_mtag = pf_find_mtag(m); switch (pd->af) { case AF_INET: { @@ -2356,18 +2357,42 @@ pf_change_ap(int dir, struct mbuf *m, struct pf_addr *a, u_int16_t *p, #endif /* INET */ #if INET6 case AF_INET6: - *pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( - pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( - pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc, - ao.addr16[0], an->addr16[0], u), - ao.addr16[1], an->addr16[1], u), - ao.addr16[2], an->addr16[2], u), - ao.addr16[3], an->addr16[3], u), - ao.addr16[4], an->addr16[4], u), - ao.addr16[5], an->addr16[5], u), - ao.addr16[6], an->addr16[6], u), - ao.addr16[7], an->addr16[7], u), - po, pn, u); + /* + * If the packet is originated from an ALG on the NAT gateway + * (source address is loopback or local), in which case the + * TCP/UDP checksum field contains the pseudo header checksum + * that's not yet complemented. 
+ */ + if (dir == PF_OUT && m != NULL && + (m->m_flags & M_PKTHDR) && + (m->m_pkthdr.csum_flags & (CSUM_TCPIPV6 | CSUM_UDPIPV6))) { + /* Pseudo-header checksum does not include ports */ + *pc = ~pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( + pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( + pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(~*pc, + ao.addr16[0], an->addr16[0], u), + ao.addr16[1], an->addr16[1], u), + ao.addr16[2], an->addr16[2], u), + ao.addr16[3], an->addr16[3], u), + ao.addr16[4], an->addr16[4], u), + ao.addr16[5], an->addr16[5], u), + ao.addr16[6], an->addr16[6], u), + ao.addr16[7], an->addr16[7], u), + po, pn, u); + } else { + *pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( + pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( + pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc, + ao.addr16[0], an->addr16[0], u), + ao.addr16[1], an->addr16[1], u), + ao.addr16[2], an->addr16[2], u), + ao.addr16[3], an->addr16[3], u), + ao.addr16[4], an->addr16[4], u), + ao.addr16[5], an->addr16[5], u), + ao.addr16[6], an->addr16[6], u), + ao.addr16[7], an->addr16[7], u), + po, pn, u); + } break; #endif /* INET6 */ } @@ -2721,7 +2746,7 @@ pf_send_tcp(const struct pf_rule *r, sa_family_t af, h6->ip6_hlim = IPV6_DEFHLIM; bzero(&ro6, sizeof (ro6)); - ip6_output(m, NULL, &ro6, 0, NULL, NULL, 0); + ip6_output(m, NULL, &ro6, 0, NULL, NULL, NULL); if (ro6.ro_rt != NULL) rtfree(ro6.ro_rt); break; @@ -3959,8 +3984,8 @@ pf_socket_lookup(int direction, struct pf_pdesc *pd) { struct pf_addr *saddr, *daddr; u_int16_t sport, dport; - struct inpcbinfo *pi; - struct inpcb *inp = NULL; + struct inpcbinfo *pi; + int inp = 0; if (pd == NULL) return (-1); @@ -4001,10 +4026,10 @@ pf_socket_lookup(int direction, struct pf_pdesc *pd) switch (pd->af) { #if INET case AF_INET: - inp = in_pcblookup_hash(pi, saddr->v4, sport, daddr->v4, dport, - 0, NULL); + inp = in_pcblookup_hash_exists(pi, saddr->v4, sport, daddr->v4, dport, + 0, &pd->lookup.uid, &pd->lookup.gid, NULL); #if INET6 - if (inp == NULL) 
{ + if (inp == 0) { struct in6_addr s6, d6; memset(&s6, 0, sizeof (s6)); @@ -4017,25 +4042,26 @@ pf_socket_lookup(int direction, struct pf_pdesc *pd) memcpy(&d6.s6_addr32[3], &daddr->v4, sizeof (daddr->v4)); - inp = in6_pcblookup_hash(pi, &s6, sport, - &d6, dport, 0, NULL); - if (inp == NULL) { - inp = in_pcblookup_hash(pi, saddr->v4, sport, - daddr->v4, dport, INPLOOKUP_WILDCARD, NULL); - if (inp == NULL) { - inp = in6_pcblookup_hash(pi, &s6, sport, + inp = in6_pcblookup_hash_exists(pi, &s6, sport, + &d6, dport, 0, &pd->lookup.uid, &pd->lookup.gid, NULL); + if (inp == 0) { + inp = in_pcblookup_hash_exists(pi, saddr->v4, sport, + daddr->v4, dport, INPLOOKUP_WILDCARD, &pd->lookup.uid, &pd->lookup.gid, NULL); + if (inp == 0) { + inp = in6_pcblookup_hash_exists(pi, &s6, sport, &d6, dport, INPLOOKUP_WILDCARD, - NULL); - if (inp == NULL) + &pd->lookup.uid, &pd->lookup.gid, NULL); + if (inp == 0) return (-1); } } } #else - if (inp == NULL) { - inp = in_pcblookup_hash(pi, saddr->v4, sport, - daddr->v4, dport, INPLOOKUP_WILDCARD, NULL); - if (inp == NULL) + if (inp == 0) { + inp = in_pcblookup_hash_exists(pi, saddr->v4, sport, + daddr->v4, dport, INPLOOKUP_WILDCARD, + &pd->lookup.uid, &pd->lookup.gid, NULL); + if (inp == 0) return (-1); } #endif /* !INET6 */ @@ -4043,24 +4069,22 @@ pf_socket_lookup(int direction, struct pf_pdesc *pd) #endif /* INET */ #if INET6 case AF_INET6: - inp = in6_pcblookup_hash(pi, &saddr->v6, sport, &daddr->v6, - dport, 0, NULL); - if (inp == NULL) { - inp = in6_pcblookup_hash(pi, &saddr->v6, sport, - &daddr->v6, dport, INPLOOKUP_WILDCARD, NULL); - if (inp == NULL) + inp = in6_pcblookup_hash_exists(pi, &saddr->v6, sport, &daddr->v6, + dport, 0, &pd->lookup.uid, &pd->lookup.gid, NULL); + if (inp == 0) { + inp = in6_pcblookup_hash_exists(pi, &saddr->v6, sport, + &daddr->v6, dport, INPLOOKUP_WILDCARD, + &pd->lookup.uid, &pd->lookup.gid, NULL); + if (inp == 0) return (-1); } break; #endif /* INET6 */ - + default: return (-1); } - if (inp != NULL) - 
in_pcb_checkstate(inp, WNT_RELEASE, 0); - return (1); } @@ -8162,10 +8186,12 @@ pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, } ifp = ro->ro_rt->rt_ifp; + RT_LOCK(ro->ro_rt); ro->ro_rt->rt_use++; if (ro->ro_rt->rt_flags & RTF_GATEWAY) dst = satosin(ro->ro_rt->rt_gateway); + RT_UNLOCK(ro->ro_rt); } else { if (TAILQ_EMPTY(&r->rpool.list)) { DPFPRINTF(PF_DEBUG_URGENT, @@ -8277,7 +8303,14 @@ pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, } m1 = m0; + + /* PR-8933605: send ip_len,ip_off to ip_fragment in host byte order */ +#if BYTE_ORDER != BIG_ENDIAN + NTOHS(ip->ip_off); + NTOHS(ip->ip_len); +#endif error = ip_fragment(m0, ifp, ifp->if_mtu, sw_csum); + if (error) { m0 = NULL; goto bad; @@ -8365,7 +8398,7 @@ pf_route6(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, if ((pf_mtag = pf_get_mtag(m0)) == NULL) goto bad; pf_mtag->flags |= PF_TAG_GENERATED; - ip6_output(m0, NULL, NULL, 0, NULL, NULL, 0); + ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL); return; } @@ -8410,7 +8443,7 @@ pf_route6(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, if (IN6_IS_SCOPE_EMBED(&dst->sin6_addr)) dst->sin6_addr.s6_addr16[1] = htons(ifp->if_index); if ((unsigned)m0->m_pkthdr.len <= ifp->if_mtu) { - error = nd6_output(ifp, ifp, m0, dst, NULL, 0); + error = nd6_output(ifp, ifp, m0, dst, NULL); } else { in6_ifstat_inc(ifp, ifs6_in_toobig); if (r->rt != PF_DUPTO) @@ -9536,6 +9569,7 @@ pool_init(struct pool *pp, size_t size, unsigned int align, unsigned int ioff, pp->pool_zone = zinit(size, 1024 * size, PAGE_SIZE, wchan); if (pp->pool_zone != NULL) { zone_change(pp->pool_zone, Z_EXPAND, TRUE); + zone_change(pp->pool_zone, Z_CALLERACCT, FALSE); pp->pool_hiwat = pp->pool_limit = (unsigned int)-1; pp->pool_name = wchan; } @@ -9622,8 +9656,8 @@ pf_get_mtag(struct mbuf *m) if ((mtag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_PF, NULL)) == NULL) { - mtag = m_tag_alloc(KERNEL_MODULE_TAG_ID, 
KERNEL_TAG_TYPE_PF, - sizeof (struct pf_mtag), M_NOWAIT); + mtag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_PF, + sizeof (struct pf_mtag), M_NOWAIT, m); if (mtag == NULL) return (NULL); bzero(mtag + 1, sizeof (struct pf_mtag)); diff --git a/bsd/net/pf_if.c b/bsd/net/pf_if.c index 06873fce6..4c05205ba 100644 --- a/bsd/net/pf_if.c +++ b/bsd/net/pf_if.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2009 Apple Inc. All rights reserved. + * Copyright (c) 2007-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -436,28 +436,45 @@ pfi_instance_add(struct ifnet *ifp, int net, int flags) return; ifnet_lock_shared(ifp); TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { - if (ia->ifa_addr == NULL) + IFA_LOCK(ia); + if (ia->ifa_addr == NULL) { + IFA_UNLOCK(ia); continue; + } af = ia->ifa_addr->sa_family; - if (af != AF_INET && af != AF_INET6) + if (af != AF_INET && af != AF_INET6) { + IFA_UNLOCK(ia); continue; - if ((flags & PFI_AFLAG_BROADCAST) && af == AF_INET6) + } + if ((flags & PFI_AFLAG_BROADCAST) && af == AF_INET6) { + IFA_UNLOCK(ia); continue; + } if ((flags & PFI_AFLAG_BROADCAST) && - !(ifp->if_flags & IFF_BROADCAST)) + !(ifp->if_flags & IFF_BROADCAST)) { + IFA_UNLOCK(ia); continue; + } if ((flags & PFI_AFLAG_PEER) && - !(ifp->if_flags & IFF_POINTOPOINT)) + !(ifp->if_flags & IFF_POINTOPOINT)) { + IFA_UNLOCK(ia); continue; + } if ((flags & PFI_AFLAG_NETWORK) && af == AF_INET6 && IN6_IS_ADDR_LINKLOCAL( - &((struct sockaddr_in6 *)ia->ifa_addr)->sin6_addr)) + &((struct sockaddr_in6 *)ia->ifa_addr)->sin6_addr)) { + IFA_UNLOCK(ia); continue; + } if (flags & PFI_AFLAG_NOALIAS) { - if (af == AF_INET && got4) + if (af == AF_INET && got4) { + IFA_UNLOCK(ia); continue; - if (af == AF_INET6 && got6) + } + if (af == AF_INET6 && got6) { + IFA_UNLOCK(ia); continue; + } } if (af == AF_INET) got4 = 1; @@ -480,6 +497,7 @@ pfi_instance_add(struct ifnet *ifp, int net, int flags) pfi_address_add(ia->ifa_dstaddr, af, net2); else 
pfi_address_add(ia->ifa_addr, af, net2); + IFA_UNLOCK(ia); } ifnet_lock_done(ifp); } diff --git a/bsd/net/pf_ioctl.c b/bsd/net/pf_ioctl.c index 9165abfd4..25763d8f5 100644 --- a/bsd/net/pf_ioctl.c +++ b/bsd/net/pf_ioctl.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2009 Apple Inc. All rights reserved. + * Copyright (c) 2007-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -79,6 +79,7 @@ #include #include #include +#include #include @@ -134,6 +135,8 @@ static int pf_rollback_altq(u_int32_t); static int pf_commit_altq(u_int32_t); static int pf_enable_altq(struct pf_altq *); static int pf_disable_altq(struct pf_altq *); +static void pf_altq_copyin(struct pf_altq *, struct pf_altq *); +static void pf_altq_copyout(struct pf_altq *, struct pf_altq *); #endif /* ALTQ */ static int pf_begin_rules(u_int32_t *, int, const char *); static int pf_rollback_rules(u_int32_t, int, char *); @@ -145,10 +148,14 @@ static void pf_hash_rule_addr(MD5_CTX *, struct pf_rule_addr *, u_int8_t); static void pf_hash_rule_addr(MD5_CTX *, struct pf_rule_addr *); #endif static int pf_commit_rules(u_int32_t, int, char *); +static void pf_rule_copyin(struct pf_rule *, struct pf_rule *, struct proc *); +static void pf_rule_copyout(struct pf_rule *, struct pf_rule *); static void pf_state_export(struct pfsync_state *, struct pf_state_key *, struct pf_state *); static void pf_state_import(struct pfsync_state *, struct pf_state_key *, struct pf_state *); +static void pf_pooladdr_copyin(struct pf_pooladdr *, struct pf_pooladdr *); +static void pf_pooladdr_copyout(struct pf_pooladdr *, struct pf_pooladdr *); #define PF_CDEV_MAJOR (-1) @@ -180,7 +187,16 @@ static void pf_detach_hooks(void); * and used in pf_af_hook() for performance optimization, such that packets * will enter pf_test() or pf_test6() only when PF is running. 
*/ -static int pf_is_enabled; +int pf_is_enabled = 0; + +/* + * These are the pf enabled reference counting variables + */ +static u_int64_t pf_enabled_ref_count; +static u_int32_t nr_tokens = 0; + +SLIST_HEAD(list_head, pfioc_kernel_token); +static struct list_head token_list_head; struct pf_rule pf_default_rule; #if ALTQ @@ -230,6 +246,78 @@ struct thread *pf_purge_thread; extern void pfi_kifaddr_update(void *); +/* pf enable ref-counting helper functions */ +static u_int64_t generate_token(void); +static int remove_token(struct pfioc_remove_token *); +static void invalidate_all_tokens(void); + +static u_int64_t +generate_token(void) +{ + u_int64_t token_value; + struct pfioc_kernel_token *new_token; + + new_token = _MALLOC(sizeof (struct pfioc_kernel_token), M_TEMP, M_WAITOK|M_ZERO); + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if (new_token == NULL) { + /* malloc failed! bail! */ + printf("%s: unable to allocate pf token structure!", __func__); + return 0; + } + + token_value = (u_int64_t)(uintptr_t)new_token; + + new_token->token.token_value = token_value; + new_token->token.pid = proc_pid(current_proc()); + proc_name(new_token->token.pid, new_token->token.proc_name, + sizeof (new_token->token.proc_name)); + new_token->token.timestamp = pf_calendar_time_second(); + + SLIST_INSERT_HEAD(&token_list_head, new_token, next); + nr_tokens++; + + return token_value; +} + +static int +remove_token(struct pfioc_remove_token *tok) +{ + struct pfioc_kernel_token *entry, *tmp; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + SLIST_FOREACH_SAFE(entry, &token_list_head, next, tmp) { + if (tok->token_value == entry->token.token_value) { + SLIST_REMOVE(&token_list_head, entry, pfioc_kernel_token, next); + _FREE(entry, M_TEMP); + nr_tokens--; + return 0; /* success */ + } + } + + printf("pf : remove failure\n"); + return ESRCH; /* failure */ +} + +static void +invalidate_all_tokens(void) +{ + struct pfioc_kernel_token *entry, *tmp; + + lck_mtx_assert(pf_lock, 
LCK_MTX_ASSERT_OWNED); + + SLIST_FOREACH_SAFE(entry, &token_list_head, next, tmp) { + SLIST_REMOVE(&token_list_head, entry, pfioc_kernel_token, next); + _FREE(entry, M_TEMP); + } + + nr_tokens = 0; + + return; +} + void pfinit(void) { @@ -859,6 +947,27 @@ pf_disable_altq(struct pf_altq *altq) return (error); } + +static void +pf_altq_copyin(struct pf_altq *src, struct pf_altq *dst) +{ + bcopy(src, dst, sizeof (struct pf_altq)); + + dst->ifname[sizeof (dst->ifname) - 1] = '\0'; + dst->qname[sizeof (dst->qname) - 1] = '\0'; + dst->parent[sizeof (dst->parent) - 1] = '\0'; + dst->altq_disc = NULL; + TAILQ_INIT(&dst->entries); +} + +static void +pf_altq_copyout(struct pf_altq *src, struct pf_altq *dst) +{ + bcopy(src, dst, sizeof (struct pf_altq)); + + dst->altq_disc = NULL; + TAILQ_INIT(&dst->entries); +} #endif /* ALTQ */ static int @@ -951,7 +1060,7 @@ pf_hash_rule_addr(MD5_CTX *ctx, struct pf_rule_addr *pfr) PF_MD5_UPD(pfr, xport.range.port[0]); PF_MD5_UPD(pfr, xport.range.port[1]); PF_MD5_UPD(pfr, xport.range.op); - break; + break; default: break; @@ -1067,6 +1176,53 @@ pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor) return (0); } +static void +pf_rule_copyin(struct pf_rule *src, struct pf_rule *dst, struct proc *p) +{ + bcopy(src, dst, sizeof (struct pf_rule)); + + dst->label[sizeof (dst->label) - 1] = '\0'; + dst->ifname[sizeof (dst->ifname) - 1] = '\0'; + dst->qname[sizeof (dst->qname) - 1] = '\0'; + dst->pqname[sizeof (dst->pqname) - 1] = '\0'; + dst->tagname[sizeof (dst->tagname) - 1] = '\0'; + dst->match_tagname[sizeof (dst->match_tagname) - 1] = '\0'; + dst->overload_tblname[sizeof (dst->overload_tblname) - 1] = '\0'; + + dst->cuid = kauth_cred_getuid(p->p_ucred); + dst->cpid = p->p_pid; + + dst->anchor = NULL; + dst->kif = NULL; + dst->overload_tbl = NULL; + + TAILQ_INIT(&dst->rpool.list); + dst->rpool.cur = NULL; + + /* initialize refcounting */ + dst->states = 0; + dst->src_nodes = 0; + + dst->entries.tqe_prev = NULL; + dst->entries.tqe_next = 
NULL; +} + +static void +pf_rule_copyout(struct pf_rule *src, struct pf_rule *dst) +{ + bcopy(src, dst, sizeof (struct pf_rule)); + + dst->anchor = NULL; + dst->kif = NULL; + dst->overload_tbl = NULL; + + TAILQ_INIT(&dst->rpool.list); + dst->rpool.cur = NULL; + + dst->entries.tqe_prev = NULL; + dst->entries.tqe_next = NULL; +} + static void pf_state_export(struct pfsync_state *sp, struct pf_state_key *sk, struct pf_state *s) @@ -1176,6 +1332,27 @@ pf_state_import(struct pfsync_state *sp, struct pf_state_key *sk, s->bytes[0] = s->bytes[1] = 0; } +static void +pf_pooladdr_copyin(struct pf_pooladdr *src, struct pf_pooladdr *dst) +{ + bcopy(src, dst, sizeof (struct pf_pooladdr)); + + dst->entries.tqe_prev = NULL; + dst->entries.tqe_next = NULL; + dst->ifname[sizeof (dst->ifname) - 1] = '\0'; + dst->kif = NULL; +} + +static void +pf_pooladdr_copyout(struct pf_pooladdr *src, struct pf_pooladdr *dst) +{ + bcopy(src, dst, sizeof (struct pf_pooladdr)); + + dst->entries.tqe_prev = NULL; + dst->entries.tqe_next = NULL; + dst->kif = NULL; +} + static int pf_setup_pfsync_matching(struct pf_ruleset *rs) { @@ -1216,6 +1393,38 @@ pf_setup_pfsync_matching(struct pf_ruleset *rs) return (0); } +static void +pf_start(void) +{ + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + VERIFY(pf_is_enabled == 0); + + pf_is_enabled = 1; + pf_status.running = 1; + pf_status.since = pf_calendar_time_second(); + if (pf_status.stateid == 0) { + pf_status.stateid = pf_time_second(); + pf_status.stateid = pf_status.stateid << 32; + } + wakeup(pf_purge_thread_fn); + DPFPRINTF(PF_DEBUG_MISC, ("pf: started\n")); +} + +static void +pf_stop(void) +{ + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + VERIFY(pf_is_enabled); + + pf_status.running = 0; + pf_is_enabled = 0; + pf_status.since = pf_calendar_time_second(); + wakeup(pf_purge_thread_fn); + DPFPRINTF(PF_DEBUG_MISC, ("pf: stopped\n")); +} + static int pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) { @@ -1282,7 +1491,10 @@ 
pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) if (!(flags & FWRITE)) switch (cmd) { case DIOCSTART: + case DIOCSTARTREF: case DIOCSTOP: + case DIOCSTOPREF: + case DIOCGETSTARTERS: case DIOCGETRULES: case DIOCGETADDRS: case DIOCGETADDR: @@ -1316,7 +1528,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) case DIOCRSETADDRS: case DIOCRSETTFLAGS: if (((struct pfioc_table *)addr)->pfrio_flags & - PFR_FLAG_DUMMY) { + PFR_FLAG_DUMMY) { flags |= FWRITE; /* need write lock for dummy */ break; /* dummy operation ok */ } @@ -1341,20 +1553,41 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) case DIOCSTART: if (pf_status.running) { + /* + * Increment the reference for a simple -e enable, so + * that even if other processes drop their references, + * pf will still be available to processes that turned + * it on without taking a reference + */ + if (nr_tokens == pf_enabled_ref_count) { + pf_enabled_ref_count++; + VERIFY(pf_enabled_ref_count != 0); + } error = EEXIST; } else if (pf_purge_thread == NULL) { error = ENOMEM; } else { - pf_is_enabled = 1; - pf_status.running = 1; - pf_status.since = pf_calendar_time_second(); - if (pf_status.stateid == 0) { - pf_status.stateid = pf_time_second(); - pf_status.stateid = pf_status.stateid << 32; + pf_start(); + pf_enabled_ref_count++; + VERIFY(pf_enabled_ref_count != 0); + } + break; + + case DIOCSTARTREF: /* returns a token */ + if (pf_purge_thread == NULL) { + error = ENOMEM; + } else { + if ((*(u_int64_t *)addr = generate_token()) != 0) { + if (pf_is_enabled == 0) { + pf_start(); + } + pf_enabled_ref_count++; + VERIFY(pf_enabled_ref_count != 0); + } else { + error = ENOMEM; + DPFPRINTF(PF_DEBUG_URGENT, + ("pf: unable to generate token\n")); } - mbuf_growth_aggressive(); - wakeup(pf_purge_thread_fn); - DPFPRINTF(PF_DEBUG_MISC, ("pf: started\n")); } break; @@ -1362,23 +1595,102 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) if 
(!pf_status.running) { error = ENOENT; } else { - mbuf_growth_normal(); - pf_status.running = 0; - pf_is_enabled = 0; - pf_status.since = pf_calendar_time_second(); - wakeup(pf_purge_thread_fn); - DPFPRINTF(PF_DEBUG_MISC, ("pf: stopped\n")); + pf_stop(); + pf_enabled_ref_count = 0; + invalidate_all_tokens(); } break; + case DIOCSTOPREF: + if (!pf_status.running) { + error = ENOENT; + } else { + if ((error = remove_token( + (struct pfioc_remove_token*)addr))==0) { + VERIFY(pf_enabled_ref_count != 0); + pf_enabled_ref_count--; + // return currently held references + ((struct pfioc_remove_token *)addr)->refcount + = pf_enabled_ref_count; + DPFPRINTF(PF_DEBUG_MISC, + ("pf: enabled refcount decremented\n")); + } else { + error = EINVAL; + DPFPRINTF(PF_DEBUG_URGENT, + ("pf: token mismatch\n")); + break; + } + + if (pf_enabled_ref_count == 0) + pf_stop(); + } + break; + + case DIOCGETSTARTERS: { + struct pfioc_tokens *g_token = (struct pfioc_tokens *)addr; + struct pfioc_token *tokens; + struct pfioc_kernel_token *entry, *tmp; + user_addr_t token_buf; + int g_token_size_copy; + char *ptr; + + if (nr_tokens == 0) { + error = ENOENT; + break; + } + + g_token_size_copy = g_token->size; + + if (g_token->size == 0) { + g_token->size = sizeof (struct pfioc_token) * nr_tokens; + break; + } + + token_buf = PF_USER_ADDR(addr, pfioc_tokens, pgt_buf); + tokens = _MALLOC(sizeof(struct pfioc_token) * nr_tokens, + M_TEMP, M_WAITOK); + + if (tokens == NULL) { + error = ENOMEM; + break; + } + + ptr = (void *)tokens; + SLIST_FOREACH_SAFE(entry, &token_list_head, next, tmp) { + if ((unsigned)g_token_size_copy + < sizeof(struct pfioc_token)) + break; /* no more buffer space left */ + + ((struct pfioc_token *)(ptr))->token_value = entry->token.token_value; + ((struct pfioc_token *)(ptr))->timestamp = entry->token.timestamp; + ((struct pfioc_token *)(ptr))->pid = entry->token.pid; + memcpy(((struct pfioc_token *)(ptr))->proc_name, entry->token.proc_name, + PFTOK_PROCNAME_LEN); + ptr += 
sizeof(struct pfioc_token); + + g_token_size_copy -= sizeof(struct pfioc_token); + } + + if (g_token_size_copy < g_token->size) { + error = copyout(tokens, token_buf, + g_token->size - g_token_size_copy); + } + + g_token->size -= g_token_size_copy; + _FREE(tokens, M_TEMP); + + break; + } + case DIOCADDRULE: { struct pfioc_rule *pr = (struct pfioc_rule *)addr; struct pf_ruleset *ruleset; struct pf_rule *rule, *tail; struct pf_pooladdr *apa; - int rs_num; + int rs_num; - pr->anchor[sizeof (pr->anchor) - 1] = 0; + pr->anchor[sizeof (pr->anchor) - 1] = '\0'; + pr->anchor_call[sizeof (pr->anchor_call) - 1] = '\0'; ruleset = pf_find_ruleset(pr->anchor); if (ruleset == NULL) { error = EINVAL; @@ -1406,16 +1718,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENOMEM; break; } - bcopy(&pr->rule, rule, sizeof (struct pf_rule)); - rule->cuid = kauth_cred_getuid(p->p_ucred); - rule->cpid = p->p_pid; - rule->anchor = NULL; - rule->kif = NULL; - TAILQ_INIT(&rule->rpool.list); - /* initialize refcounting */ - rule->states = 0; - rule->src_nodes = 0; - rule->entries.tqe_prev = NULL; + pf_rule_copyin(&pr->rule, rule, p); #if !INET if (rule->af == AF_INET) { pool_put(&pf_rule_pl, rule); @@ -1526,7 +1829,8 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) struct pf_rule *tail; int rs_num; - pr->anchor[sizeof (pr->anchor) - 1] = 0; + pr->anchor[sizeof (pr->anchor) - 1] = '\0'; + pr->anchor_call[sizeof (pr->anchor_call) - 1] = '\0'; ruleset = pf_find_ruleset(pr->anchor); if (ruleset == NULL) { error = EINVAL; @@ -1553,7 +1857,8 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) struct pf_rule *rule; int rs_num, i; - pr->anchor[sizeof (pr->anchor) - 1] = 0; + pr->anchor[sizeof (pr->anchor) - 1] = '\0'; + pr->anchor_call[sizeof (pr->anchor_call) - 1] = '\0'; ruleset = pf_find_ruleset(pr->anchor); if (ruleset == NULL) { error = EINVAL; @@ -1575,7 +1880,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int 
flags, struct proc *p) error = EBUSY; break; } - bcopy(rule, &pr->rule, sizeof (struct pf_rule)); + pf_rule_copyout(rule, &pr->rule); if (pf_anchor_copyout(ruleset, rule, pr)) { error = EBUSY; break; @@ -1620,6 +1925,8 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = EINVAL; break; } + pcr->anchor[sizeof (pcr->anchor) - 1] = '\0'; + pcr->anchor_call[sizeof (pcr->anchor_call) - 1] = '\0'; ruleset = pf_find_ruleset(pcr->anchor); if (ruleset == NULL) { error = EINVAL; @@ -1652,13 +1959,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENOMEM; break; } - bcopy(&pcr->rule, newrule, sizeof (struct pf_rule)); - newrule->cuid = kauth_cred_getuid(p->p_ucred); - newrule->cpid = p->p_pid; - TAILQ_INIT(&newrule->rpool.list); - /* initialize refcounting */ - newrule->states = 0; - newrule->entries.tqe_prev = NULL; + pf_rule_copyin(&pcr->rule, newrule, p); #if !INET if (newrule->af == AF_INET) { pool_put(&pf_rule_pl, newrule); @@ -1816,6 +2117,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) struct pfioc_state_kill *psk = (struct pfioc_state_kill *)addr; int killed = 0; + psk->psk_ifname[sizeof (psk->psk_ifname) - 1] = '\0'; for (s = RB_MIN(pf_state_tree_id, &tree_id); s; s = nexts) { nexts = RB_NEXT(pf_state_tree_id, &tree_id, s); @@ -2268,7 +2570,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENOMEM; break; } - bcopy(&pa->altq, altq, sizeof (struct pf_altq)); + pf_altq_copyin(&pa->altq, altq); /* * if this is for a queue, find the discipline and @@ -2297,7 +2599,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) } TAILQ_INSERT_TAIL(pf_altqs_inactive, altq, entries); - bcopy(altq, &pa->altq, sizeof (struct pf_altq)); + pf_altq_copyout(altq, &pa->altq); break; } @@ -2331,7 +2633,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = EBUSY; break; } - bcopy(altq, &pa->altq, sizeof (struct 
pf_altq)); + pf_altq_copyout(altq, &pa->altq); break; } @@ -2381,6 +2683,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) case DIOCADDADDR: { struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr; + pp->anchor[sizeof (pp->anchor) - 1] = '\0'; if (pp->ticket != ticket_pabuf) { error = EBUSY; break; @@ -2408,7 +2711,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENOMEM; break; } - bcopy(&pp->addr, pa, sizeof (struct pf_pooladdr)); + pf_pooladdr_copyin(&pp->addr, pa); if (pa->ifname[0]) { pa->kif = pfi_kif_get(pa->ifname); if (pa->kif == NULL) { @@ -2433,6 +2736,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr; pp->nr = 0; + pp->anchor[sizeof (pp->anchor) - 1] = '\0'; pool = pf_get_pool(pp->anchor, pp->ticket, pp->r_action, pp->r_num, 0, 1, 0); if (pool == NULL) { @@ -2448,6 +2752,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr; u_int32_t nr = 0; + pp->anchor[sizeof (pp->anchor) - 1] = '\0'; pool = pf_get_pool(pp->anchor, pp->ticket, pp->r_action, pp->r_num, 0, 1, 1); if (pool == NULL) { @@ -2463,7 +2768,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = EBUSY; break; } - bcopy(pa, &pp->addr, sizeof (struct pf_pooladdr)); + pf_pooladdr_copyout(pa, &pp->addr); pfi_dynaddr_copyout(&pp->addr.addr); pf_tbladdr_copyout(&pp->addr.addr); pf_rtlabel_copyout(&pp->addr.addr); @@ -2487,6 +2792,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) break; } + pca->anchor[sizeof (pca->anchor) - 1] = '\0'; ruleset = pf_find_ruleset(pca->anchor); if (ruleset == NULL) { error = EBUSY; @@ -2504,7 +2810,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENOMEM; break; } - bcopy(&pca->addr, newpa, sizeof (struct pf_pooladdr)); + pf_pooladdr_copyin(&pca->addr, 
newpa); #if !INET if (pca->af == AF_INET) { pool_put(&pf_pooladdr_pl, newpa); @@ -2585,7 +2891,8 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) struct pf_ruleset *ruleset; struct pf_anchor *anchor; - pr->path[sizeof (pr->path) - 1] = 0; + pr->path[sizeof (pr->path) - 1] = '\0'; + pr->name[sizeof (pr->name) - 1] = '\0'; if ((ruleset = pf_find_ruleset(pr->path)) == NULL) { error = EINVAL; break; @@ -2610,7 +2917,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) struct pf_anchor *anchor; u_int32_t nr = 0; - pr->path[sizeof (pr->path) - 1] = 0; + pr->path[sizeof (pr->path) - 1] = '\0'; if ((ruleset = pf_find_ruleset(pr->path)) == NULL) { error = EINVAL; break; @@ -2645,6 +2952,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENODEV; break; } + pfr_table_copyin_cleanup(&io->pfrio_table); error = pfr_clr_tables(&io->pfrio_table, &io->pfrio_ndel, io->pfrio_flags | PFR_FLAG_USERIOCTL); break; @@ -2684,6 +2992,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENODEV; break; } + pfr_table_copyin_cleanup(&io->pfrio_table); error = pfr_get_tables(&io->pfrio_table, buf, &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); break; @@ -2697,6 +3006,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENODEV; break; } + pfr_table_copyin_cleanup(&io->pfrio_table); error = pfr_get_tstats(&io->pfrio_table, buf, &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); break; @@ -2736,6 +3046,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENODEV; break; } + pfr_table_copyin_cleanup(&io->pfrio_table); error = pfr_clr_addrs(&io->pfrio_table, &io->pfrio_ndel, io->pfrio_flags | PFR_FLAG_USERIOCTL); break; @@ -2749,6 +3060,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENODEV; break; } + pfr_table_copyin_cleanup(&io->pfrio_table); error = 
pfr_add_addrs(&io->pfrio_table, buf, io->pfrio_size, &io->pfrio_nadd, io->pfrio_flags | PFR_FLAG_USERIOCTL); @@ -2763,6 +3075,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENODEV; break; } + pfr_table_copyin_cleanup(&io->pfrio_table); error = pfr_del_addrs(&io->pfrio_table, buf, io->pfrio_size, &io->pfrio_ndel, io->pfrio_flags | PFR_FLAG_USERIOCTL); @@ -2777,6 +3090,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENODEV; break; } + pfr_table_copyin_cleanup(&io->pfrio_table); error = pfr_set_addrs(&io->pfrio_table, buf, io->pfrio_size, &io->pfrio_size2, &io->pfrio_nadd, &io->pfrio_ndel, &io->pfrio_nchange, io->pfrio_flags | @@ -2792,6 +3106,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENODEV; break; } + pfr_table_copyin_cleanup(&io->pfrio_table); error = pfr_get_addrs(&io->pfrio_table, buf, &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); break; @@ -2805,6 +3120,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENODEV; break; } + pfr_table_copyin_cleanup(&io->pfrio_table); error = pfr_get_astats(&io->pfrio_table, buf, &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); break; @@ -2818,6 +3134,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENODEV; break; } + pfr_table_copyin_cleanup(&io->pfrio_table); error = pfr_clr_astats(&io->pfrio_table, buf, io->pfrio_size, &io->pfrio_nzero, io->pfrio_flags | PFR_FLAG_USERIOCTL); @@ -2832,6 +3149,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENODEV; break; } + pfr_table_copyin_cleanup(&io->pfrio_table); error = pfr_tst_addrs(&io->pfrio_table, buf, io->pfrio_size, &io->pfrio_nmatch, io->pfrio_flags | PFR_FLAG_USERIOCTL); @@ -2846,6 +3164,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENODEV; break; } + pfr_table_copyin_cleanup(&io->pfrio_table); error = 
pfr_ina_define(&io->pfrio_table, buf, io->pfrio_size, &io->pfrio_nadd, &io->pfrio_naddr, io->pfrio_ticket, io->pfrio_flags | PFR_FLAG_USERIOCTL); @@ -2885,6 +3204,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = EFAULT; goto fail; } + ioe->anchor[sizeof (ioe->anchor) - 1] = '\0'; switch (ioe->rs_num) { case PF_RULESET_ALTQ: #if ALTQ @@ -2954,6 +3274,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = EFAULT; goto fail; } + ioe->anchor[sizeof (ioe->anchor) - 1] = '\0'; switch (ioe->rs_num) { case PF_RULESET_ALTQ: #if ALTQ @@ -3019,6 +3340,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = EFAULT; goto fail; } + ioe->anchor[sizeof (ioe->anchor) - 1] = '\0'; switch (ioe->rs_num) { case PF_RULESET_ALTQ: #if ALTQ @@ -3077,6 +3399,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = EFAULT; goto fail; } + ioe->anchor[sizeof (ioe->anchor) - 1] = '\0'; switch (ioe->rs_num) { case PF_RULESET_ALTQ: #if ALTQ @@ -3155,6 +3478,10 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) n->conn_rate.count * diff / n->conn_rate.seconds; + _RB_PARENT(pstore, entry) = NULL; + RB_LEFT(pstore, entry) = RB_RIGHT(pstore, entry) = NULL; + pstore->kif = NULL; + error = copyout(pstore, buf, sizeof (*pstore)); if (error) { _FREE(pstore, M_TEMP); @@ -3387,9 +3714,11 @@ pf_inet_hook(struct ifnet *ifp, struct mbuf **mp, int input) } #if BYTE_ORDER != BIG_ENDIAN else { - ip = mtod(*mp, struct ip *); - NTOHS(ip->ip_len); - NTOHS(ip->ip_off); + if (*mp != NULL) { + ip = mtod(*mp, struct ip *); + NTOHS(ip->ip_len); + NTOHS(ip->ip_off); + } } #endif return (error); @@ -3402,10 +3731,6 @@ pf_inet6_hook(struct ifnet *ifp, struct mbuf **mp, int input) { int error = 0; -#if 0 - /* - * TODO: once we support IPv6 hardware checksum offload - */ /* * If the packet is outbound, is originated locally, is flagged for * delayed UDP/TCP checksum calculation, 
and is about to be processed @@ -3414,16 +3739,15 @@ pf_inet6_hook(struct ifnet *ifp, struct mbuf **mp, int input) * it properly. */ if (!input && (*mp)->m_pkthdr.rcvif == NULL) { - static const int mask = CSUM_DELAY_DATA; + static const int mask = CSUM_DELAY_IPV6_DATA; const int flags = (*mp)->m_pkthdr.csum_flags & ~IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist); if (flags & mask) { - in6_delayed_cksum(*mp); + in6_delayed_cksum(*mp, sizeof(struct ip6_hdr)); (*mp)->m_pkthdr.csum_flags &= ~mask; } } -#endif if (pf_test6(input ? PF_IN : PF_OUT, ifp, mp, NULL) != PF_PASS) { if (*mp != NULL) { @@ -3449,7 +3773,8 @@ pf_ifaddr_hook(struct ifnet *ifp, unsigned long cmd) case SIOCAIFADDR: case SIOCDIFADDR: #if INET6 - case SIOCAIFADDR_IN6: + case SIOCAIFADDR_IN6_32: + case SIOCAIFADDR_IN6_64: case SIOCDIFADDR_IN6: #endif /* INET6 */ if (ifp->if_pf_kif != NULL) diff --git a/bsd/net/pf_osfp.c b/bsd/net/pf_osfp.c index b7e579d5c..89d71e889 100644 --- a/bsd/net/pf_osfp.c +++ b/bsd/net/pf_osfp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2009 Apple Inc. All rights reserved. + * Copyright (c) 2007-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -312,7 +312,7 @@ int pf_osfp_add(struct pf_osfp_ioctl *fpioc) { struct pf_os_fingerprint *fp, fpadd; - struct pf_osfp_entry *entry; + struct pf_osfp_entry *entry, *uentry; memset(&fpadd, 0, sizeof (fpadd)); fpadd.fp_tcpopts = fpioc->fp_tcpopts; @@ -324,6 +324,12 @@ pf_osfp_add(struct pf_osfp_ioctl *fpioc) fpadd.fp_wscale = fpioc->fp_wscale; fpadd.fp_ttl = fpioc->fp_ttl; + uentry = &fpioc->fp_os; + uentry->fp_entry.sle_next = NULL; + uentry->fp_class_nm[sizeof (uentry->fp_class_nm) - 1] = '\0'; + uentry->fp_version_nm[sizeof (uentry->fp_version_nm) - 1] = '\0'; + uentry->fp_subtype_nm[sizeof (uentry->fp_subtype_nm) - 1] = '\0'; + DPFPRINTF("adding osfp %s %s %s = %s%d:%d:%d:%s%d:0x%llx %d " "(TS=%s,M=%s%d,W=%s%d) %x\n", fpioc->fp_os.fp_class_nm, fpioc->fp_os.fp_version_nm, @@ -527,6 +533,7 @@ pf_osfp_get(struct pf_osfp_ioctl *fpioc) fpioc->fp_getnum = num; memcpy(&fpioc->fp_os, entry, sizeof (fpioc->fp_os)); + fpioc->fp_os.fp_entry.sle_next = NULL; return (0); } } diff --git a/bsd/net/pf_table.c b/bsd/net/pf_table.c index 735c65b81..ea3b529f5 100644 --- a/bsd/net/pf_table.c +++ b/bsd/net/pf_table.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2009 Apple Inc. All rights reserved. + * Copyright (c) 2007-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1190,6 +1190,7 @@ pfr_add_tables(user_addr_t tbl, int size, int *nadd, int flags) for (i = 0; i < size; i++, tbl += sizeof (key.pfrkt_t)) { if (COPYIN(tbl, &key.pfrkt_t, sizeof (key.pfrkt_t), flags)) senderr(EFAULT); + pfr_table_copyin_cleanup(&key.pfrkt_t); if (pfr_validate_table(&key.pfrkt_t, PFR_TFLAG_USRMASK, flags & PFR_FLAG_USERIOCTL)) senderr(EINVAL); @@ -1266,6 +1267,7 @@ pfr_del_tables(user_addr_t tbl, int size, int *ndel, int flags) for (i = 0; i < size; i++, tbl += sizeof (key.pfrkt_t)) { if (COPYIN(tbl, &key.pfrkt_t, sizeof (key.pfrkt_t), flags)) return (EFAULT); + pfr_table_copyin_cleanup(&key.pfrkt_t); if (pfr_validate_table(&key.pfrkt_t, 0, flags & PFR_FLAG_USERIOCTL)) return (EINVAL); @@ -1385,6 +1387,7 @@ pfr_clr_tstats(user_addr_t tbl, int size, int *nzero, int flags) for (i = 0; i < size; i++, tbl += sizeof (key.pfrkt_t)) { if (COPYIN(tbl, &key.pfrkt_t, sizeof (key.pfrkt_t), flags)) return (EFAULT); + pfr_table_copyin_cleanup(&key.pfrkt_t); if (pfr_validate_table(&key.pfrkt_t, 0, 0)) return (EINVAL); p = RB_FIND(pfr_ktablehead, &pfr_ktables, &key); @@ -1420,6 +1423,7 @@ pfr_set_tflags(user_addr_t tbl, int size, int setflag, int clrflag, for (i = 0; i < size; i++, tbl += sizeof (key.pfrkt_t)) { if (COPYIN(tbl, &key.pfrkt_t, sizeof (key.pfrkt_t), flags)) return (EFAULT); + pfr_table_copyin_cleanup(&key.pfrkt_t); if (pfr_validate_table(&key.pfrkt_t, 0, flags & PFR_FLAG_USERIOCTL)) return (EINVAL); @@ -1730,6 +1734,13 @@ pfr_commit_ktable(struct pfr_ktable *kt, u_int64_t tzero) pfr_setflags_ktable(kt, nflags); } +void +pfr_table_copyin_cleanup(struct pfr_table *tbl) +{ + tbl->pfrt_anchor[sizeof (tbl->pfrt_anchor) - 1] = '\0'; + tbl->pfrt_name[sizeof (tbl->pfrt_name) - 1] = '\0'; +} + static int pfr_validate_table(struct pfr_table *tbl, int allowedflags, int no_reserved) { diff --git a/bsd/net/pfkeyv2.h b/bsd/net/pfkeyv2.h index fa89f14c7..e452e1d2e 100644 --- a/bsd/net/pfkeyv2.h +++ 
b/bsd/net/pfkeyv2.h @@ -412,6 +412,7 @@ struct sadb_sastat { #define SADB_X_EXT_NATT_KEEPALIVE 0x0004 /* Local node is behind NAT, send keepalives */ /* Should only be set for outbound SAs */ #define SADB_X_EXT_NATT_MULTIPLEUSERS 0x0008 /* For use on VPN server - support multiple users */ +#define SADB_X_EXT_NATT_DETECTED_PEER 0x0010 #endif /* PRIVATE */ diff --git a/bsd/net/pfvar.h b/bsd/net/pfvar.h index fc35db9e7..58c4b3969 100644 --- a/bsd/net/pfvar.h +++ b/bsd/net/pfvar.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2009 Apple Inc. All rights reserved. + * Copyright (c) 2007-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -77,7 +77,7 @@ extern "C" { #include #include #include -#include +#include #include #include @@ -1771,6 +1771,55 @@ struct pfioc_states_64 { }; #endif /* KERNEL */ +#define PFTOK_PROCNAME_LEN 64 +#pragma pack(1) +struct pfioc_token { + u_int64_t token_value; + u_int64_t timestamp; + pid_t pid; + char proc_name[PFTOK_PROCNAME_LEN]; +}; +#pragma pack() + +struct pfioc_kernel_token { + SLIST_ENTRY(pfioc_kernel_token) next; + struct pfioc_token token; +}; + +struct pfioc_remove_token { + u_int64_t token_value; + u_int64_t refcount; +}; + +struct pfioc_tokens { + int size; + union { + caddr_t pgtu_buf; + struct pfioc_token *pgtu_tokens; + } pgt_u __attribute__((aligned(8))); +#define pgt_buf pgt_u.pgtu_buf +#define pgt_tokens pgt_u.pgtu_tokens +}; + +#ifdef KERNEL +struct pfioc_tokens_32 { + int size; + union { + user32_addr_t pgtu_buf; + user32_addr_t pgtu_tokens; + } pgt_u __attribute__((aligned(8))); +}; + +struct pfioc_tokens_64 { + int size; + union { + user64_addr_t pgtu_buf; + user64_addr_t pgtu_tokens; + } pgt_u __attribute__((aligned(8))); +}; +#endif /* KERNEL */ + + struct pfioc_src_nodes { int psn_len; union { @@ -1860,6 +1909,7 @@ struct pfioc_trans_64 { }; #endif /* KERNEL */ + #define PFR_FLAG_ATOMIC 0x00000001 #define PFR_FLAG_DUMMY 0x00000002 #define PFR_FLAG_FEEDBACK 0x00000004 @@ -1955,12 
+2005,15 @@ struct pfioc_iface_64 { #define DIOCSTART _IO ('D', 1) #define DIOCSTOP _IO ('D', 2) #define DIOCADDRULE _IOWR('D', 4, struct pfioc_rule) +#define DIOCGETSTARTERS _IOWR('D', 5, struct pfioc_tokens) #define DIOCGETRULES _IOWR('D', 6, struct pfioc_rule) #define DIOCGETRULE _IOWR('D', 7, struct pfioc_rule) -/* XXX cut 8 - 17 */ +#define DIOCSTARTREF _IOR ('D', 8, u_int64_t) +#define DIOCSTOPREF _IOWR('D', 9, struct pfioc_remove_token) +/* XXX cut 10 - 17 */ #define DIOCCLRSTATES _IOWR('D', 18, struct pfioc_state_kill) #define DIOCGETSTATE _IOWR('D', 19, struct pfioc_state) -#define DIOCSETSTATUSIF _IOWR('D', 20, struct pfioc_if) +#define DIOCSETSTATUSIF _IOWR('D', 20, struct pfioc_if) #define DIOCGETSTATUS _IOWR('D', 21, struct pf_status) #define DIOCCLRSTATUS _IO ('D', 22) #define DIOCNATLOOK _IOWR('D', 23, struct pfioc_natlook) @@ -1995,23 +2048,23 @@ struct pfioc_iface_64 { #define DIOCRDELTABLES _IOWR('D', 62, struct pfioc_table) #define DIOCRGETTABLES _IOWR('D', 63, struct pfioc_table) #define DIOCRGETTSTATS _IOWR('D', 64, struct pfioc_table) -#define DIOCRCLRTSTATS _IOWR('D', 65, struct pfioc_table) +#define DIOCRCLRTSTATS _IOWR('D', 65, struct pfioc_table) #define DIOCRCLRADDRS _IOWR('D', 66, struct pfioc_table) #define DIOCRADDADDRS _IOWR('D', 67, struct pfioc_table) #define DIOCRDELADDRS _IOWR('D', 68, struct pfioc_table) #define DIOCRSETADDRS _IOWR('D', 69, struct pfioc_table) #define DIOCRGETADDRS _IOWR('D', 70, struct pfioc_table) #define DIOCRGETASTATS _IOWR('D', 71, struct pfioc_table) -#define DIOCRCLRASTATS _IOWR('D', 72, struct pfioc_table) +#define DIOCRCLRASTATS _IOWR('D', 72, struct pfioc_table) #define DIOCRTSTADDRS _IOWR('D', 73, struct pfioc_table) #define DIOCRSETTFLAGS _IOWR('D', 74, struct pfioc_table) #define DIOCRINADEFINE _IOWR('D', 77, struct pfioc_table) #define DIOCOSFPFLUSH _IO('D', 78) #define DIOCOSFPADD _IOWR('D', 79, struct pf_osfp_ioctl) #define DIOCOSFPGET _IOWR('D', 80, struct pf_osfp_ioctl) -#define DIOCXBEGIN 
_IOWR('D', 81, struct pfioc_trans) -#define DIOCXCOMMIT _IOWR('D', 82, struct pfioc_trans) -#define DIOCXROLLBACK _IOWR('D', 83, struct pfioc_trans) +#define DIOCXBEGIN _IOWR('D', 81, struct pfioc_trans) +#define DIOCXCOMMIT _IOWR('D', 82, struct pfioc_trans) +#define DIOCXROLLBACK _IOWR('D', 83, struct pfioc_trans) #define DIOCGETSRCNODES _IOWR('D', 84, struct pfioc_src_nodes) #define DIOCCLRSRCNODES _IO('D', 85) #define DIOCSETHOSTID _IOWR('D', 86, u_int32_t) @@ -2158,6 +2211,7 @@ __private_extern__ int pfr_pool_get(struct pfr_ktable *, int *, struct pf_addr *, struct pf_addr **, struct pf_addr **, sa_family_t); __private_extern__ void pfr_dynaddr_update(struct pfr_ktable *, struct pfi_dynaddr *); +__private_extern__ void pfr_table_copyin_cleanup(struct pfr_table *); __private_extern__ struct pfr_ktable *pfr_attach_table(struct pf_ruleset *, char *); __private_extern__ void pfr_detach_table(struct pfr_ktable *); @@ -2248,6 +2302,9 @@ __private_extern__ struct pf_anchor_global pf_anchors; __private_extern__ struct pf_anchor pf_main_anchor; #define pf_main_ruleset pf_main_anchor.ruleset +__private_extern__ int pf_is_enabled; +#define PF_IS_ENABLED (pf_is_enabled != 0) + /* these ruleset functions can be linked into userland programs (pfctl) */ __private_extern__ int pf_get_ruleset_number(u_int8_t); __private_extern__ void pf_init_ruleset(struct pf_ruleset *); diff --git a/bsd/net/ppp_deflate.c b/bsd/net/ppp_deflate.c index 5968578b0..4541def29 100644 --- a/bsd/net/ppp_deflate.c +++ b/bsd/net/ppp_deflate.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -316,7 +316,7 @@ z_compress(arg, mret, mp, orig_len, maxolen) } ++state->seqno; - rptr += (proto > 0xff)? 2: 3; /* skip 1st proto byte if 0 */ + rptr += (proto > 0xff)? 
2: 3; /* skip 1st proto byte if 0 */ state->strm.next_in = rptr; state->strm.avail_in = mtod(mp, u_char *) + mp->m_len - rptr; mp = mp->m_next; diff --git a/bsd/net/route.c b/bsd/net/route.c index 5f1580e8f..5ed681a0f 100644 --- a/bsd/net/route.c +++ b/bsd/net/route.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -70,16 +70,25 @@ #include #include #include +#include +#include #include #include #include #include +#include #include #include #include #include +#include + +#if INET6 +#include +#include +#endif /* INET6 */ #include @@ -187,7 +196,6 @@ */ #define equal(a1, a2) (bcmp((caddr_t)(a1), (caddr_t)(a2), (a1)->sa_len) == 0) -#define SA(p) ((struct sockaddr *)(p)) extern void kdp_set_gateway_mac (void *gatewaymac); @@ -261,11 +269,6 @@ struct rtentry_dbg { TAILQ_ENTRY(rtentry_dbg) rtd_trash_link; }; -#define atomic_add_16_ov(a, n) \ - ((uint16_t) OSAddAtomic16(n, (volatile SInt16 *)a)) -#define atomic_add_32_ov(a, n) \ - ((uint32_t) OSAddAtomic(n, a)) - /* List of trash route entries protected by rnh_lock */ static TAILQ_HEAD(, rtentry_dbg) rttrash_head; @@ -285,33 +288,34 @@ static struct rtentry *rtalloc1_common_locked(struct sockaddr *, int, uint32_t, static int rtrequest_common_locked(int, struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct rtentry **, unsigned int); +static struct rtentry *rtalloc1_locked(struct sockaddr *, int, uint32_t); static void rtalloc_ign_common_locked(struct route *, uint32_t, unsigned int); -static inline void sa_set_ifscope(struct sockaddr *, unsigned int); -static struct sockaddr *sin_copy(struct sockaddr_in *, struct sockaddr_in *, - unsigned int); -static struct sockaddr *mask_copy(struct sockaddr *, struct sockaddr_in *, - unsigned int); +static inline void sin6_set_ifscope(struct sockaddr *, unsigned int); +static inline void sin6_set_embedded_ifscope(struct sockaddr *, 
unsigned int); +static inline unsigned int sin6_get_embedded_ifscope(struct sockaddr *); +static struct sockaddr *sa_copy(struct sockaddr *, struct sockaddr_storage *, + unsigned int *); +static struct sockaddr *ma_copy(int, struct sockaddr *, + struct sockaddr_storage *, unsigned int); static struct sockaddr *sa_trim(struct sockaddr *, int); static struct radix_node *node_lookup(struct sockaddr *, struct sockaddr *, unsigned int); -static struct radix_node *node_lookup_default(void); +static struct radix_node *node_lookup_default(int); static int rn_match_ifscope(struct radix_node *, void *); static struct ifaddr *ifa_ifwithroute_common_locked(int, const struct sockaddr *, const struct sockaddr *, unsigned int); static struct rtentry *rte_alloc(void); static void rte_free(struct rtentry *); static void rtfree_common(struct rtentry *, boolean_t); -#if IFNET_ROUTE_REFCNT static void rte_if_ref(struct ifnet *, int); -#endif /* IFNET_ROUTE_REFCNT */ uint32_t route_generation = 0; /* - * sockaddr_in with embedded interface scope; this is used internally - * to keep track of scoped route entries in the routing table. The - * fact that such a scope is embedded in the structure is an artifact - * of the current implementation which could change in future. + * sockaddr_in with scope ID field; this is used internally to keep + * track of scoped route entries in the routing table. The fact that + * such a value is embedded in the structure is an artifact of the + * current implementation which could change in future. 
*/ struct sockaddr_inifscope { __uint8_t sin_len; @@ -330,11 +334,14 @@ struct sockaddr_inifscope { __uint32_t ifscope; } _in_index; } un; -#define sin_ifscope un._in_index.ifscope +#define sin_scope_id un._in_index.ifscope }; +#define SA(sa) ((struct sockaddr *)(size_t)(sa)) #define SIN(sa) ((struct sockaddr_in *)(size_t)(sa)) +#define SIN6(sa) ((struct sockaddr_in6 *)(size_t)(sa)) #define SINIFSCOPE(sa) ((struct sockaddr_inifscope *)(size_t)(sa)) +#define SIN6IFSCOPE(sa) SIN6(sa) #define ASSERT_SINIFSCOPE(sa) { \ if ((sa)->sa_family != AF_INET || \ @@ -342,6 +349,12 @@ struct sockaddr_inifscope { panic("%s: bad sockaddr_in %p\n", __func__, sa); \ } +#define ASSERT_SIN6IFSCOPE(sa) { \ + if ((sa)->sa_family != AF_INET6 || \ + (sa)->sa_len < sizeof (struct sockaddr_in6)) \ + panic("%s: bad sockaddr_in %p\n", __func__, sa); \ +} + /* * Argument to leaf-matching routine; at present it is scoped routing * specific but can be expanded in future to include other search filters. @@ -358,27 +371,36 @@ static struct sockaddr sin_def = { sizeof (struct sockaddr_in), AF_INET, { 0, } }; +static struct sockaddr_in6 sin6_def = { + sizeof (struct sockaddr_in6), AF_INET6, 0, 0, IN6ADDR_ANY_INIT, 0 +}; + /* * Interface index (scope) of the primary interface; determined at * the time when the default, non-scoped route gets added, changed * or deleted. Protected by rnh_lock. 
*/ static unsigned int primary_ifscope = IFSCOPE_NONE; +static unsigned int primary6_ifscope = IFSCOPE_NONE; + +#define INET_DEFAULT(sa) \ + ((sa)->sa_family == AF_INET && SIN(sa)->sin_addr.s_addr == 0) -#define INET_DEFAULT(dst) \ - ((dst)->sa_family == AF_INET && SIN(dst)->sin_addr.s_addr == 0) +#define INET6_DEFAULT(sa) \ + ((sa)->sa_family == AF_INET6 && \ + IN6_IS_ADDR_UNSPECIFIED(&SIN6(sa)->sin6_addr)) +#define SA_DEFAULT(sa) (INET_DEFAULT(sa) || INET6_DEFAULT(sa)) #define RT(r) ((struct rtentry *)r) +#define RN(r) ((struct radix_node *)r) #define RT_HOST(r) (RT(r)->rt_flags & RTF_HOST) -#if IFNET_ROUTE_REFCNT SYSCTL_DECL(_net_idle_route); static int rt_if_idle_expire_timeout = RT_IF_IDLE_EXPIRE_TIMEOUT; SYSCTL_INT(_net_idle_route, OID_AUTO, expire_timeout, CTLFLAG_RW, &rt_if_idle_expire_timeout, 0, "Default expiration time on routes for " "interface idle reference counting"); -#endif /* IFNET_ROUTE_REFCNT */ /* * Given a route, determine whether or not it is the non-scoped default @@ -386,88 +408,189 @@ SYSCTL_INT(_net_idle_route, OID_AUTO, expire_timeout, CTLFLAG_RW, * a separate place when rt is in the process of being created. */ boolean_t -rt_inet_default(struct rtentry *rt, struct sockaddr *dst) +rt_primary_default(struct rtentry *rt, struct sockaddr *dst) { - return (INET_DEFAULT(dst) && !(rt->rt_flags & RTF_IFSCOPE)); + return (SA_DEFAULT(dst) && !(rt->rt_flags & RTF_IFSCOPE)); } /* * Set the ifscope of the primary interface; caller holds rnh_lock. */ void -set_primary_ifscope(unsigned int ifscope) +set_primary_ifscope(int af, unsigned int ifscope) { - primary_ifscope = ifscope; + if (af == AF_INET) + primary_ifscope = ifscope; + else + primary6_ifscope = ifscope; } /* * Return the ifscope of the primary interface; caller holds rnh_lock. */ unsigned int -get_primary_ifscope(void) +get_primary_ifscope(int af) { - return (primary_ifscope); + return (af == AF_INET ? primary_ifscope : primary6_ifscope); } /* - * Embed ifscope into a given a sockaddr_in. 
+ * Set the scope ID of a given a sockaddr_in. */ -static inline void -sa_set_ifscope(struct sockaddr *sa, unsigned int ifscope) +void +sin_set_ifscope(struct sockaddr *sa, unsigned int ifscope) { /* Caller must pass in sockaddr_in */ ASSERT_SINIFSCOPE(sa); - SINIFSCOPE(sa)->sin_ifscope = ifscope; + SINIFSCOPE(sa)->sin_scope_id = ifscope; } /* - * Given a sockaddr_in, return the embedded ifscope to the caller. + * Set the scope ID of given a sockaddr_in6. + */ +static inline void +sin6_set_ifscope(struct sockaddr *sa, unsigned int ifscope) +{ + /* Caller must pass in sockaddr_in6 */ + ASSERT_SIN6IFSCOPE(sa); + + SIN6IFSCOPE(sa)->sin6_scope_id = ifscope; +} + +/* + * Given a sockaddr_in, return the scope ID to the caller. */ unsigned int -sa_get_ifscope(struct sockaddr *sa) +sin_get_ifscope(struct sockaddr *sa) { /* Caller must pass in sockaddr_in */ ASSERT_SINIFSCOPE(sa); - return (SINIFSCOPE(sa)->sin_ifscope); + return (SINIFSCOPE(sa)->sin_scope_id); } /* - * Copy a sockaddr_in src to dst and embed ifscope into dst. + * Given a sockaddr_in6, return the scope ID to the caller. + */ +unsigned int +sin6_get_ifscope(struct sockaddr *sa) +{ + /* Caller must pass in sockaddr_in6 */ + ASSERT_SIN6IFSCOPE(sa); + + return (SIN6IFSCOPE(sa)->sin6_scope_id); +} + +static inline void +sin6_set_embedded_ifscope(struct sockaddr *sa, unsigned int ifscope) +{ + /* Caller must pass in sockaddr_in6 */ + ASSERT_SIN6IFSCOPE(sa); + VERIFY(IN6_IS_SCOPE_EMBED(&(SIN6(sa)->sin6_addr))); + + SIN6(sa)->sin6_addr.s6_addr16[1] = htons(ifscope); +} + +static inline unsigned int +sin6_get_embedded_ifscope(struct sockaddr *sa) +{ + /* Caller must pass in sockaddr_in6 */ + ASSERT_SIN6IFSCOPE(sa); + + return (ntohs(SIN6(sa)->sin6_addr.s6_addr16[1])); +} + +/* + * Copy a sockaddr_{in,in6} src to a dst storage and set scope ID into dst. + * + * To clear the scope ID, pass is a NULL pifscope. To set the scope ID, pass + * in a non-NULL pifscope with non-zero ifscope. 
Otherwise if pifscope is + * non-NULL and ifscope is IFSCOPE_NONE, the existing scope ID is left intact. + * In any case, the effective scope ID value is returned to the caller via + * pifscope, if it is non-NULL. */ static struct sockaddr * -sin_copy(struct sockaddr_in *src, struct sockaddr_in *dst, unsigned int ifscope) +sa_copy(struct sockaddr *src, struct sockaddr_storage *dst, + unsigned int *pifscope) { - *dst = *src; - sa_set_ifscope(SA(dst), ifscope); + int af = src->sa_family; + unsigned int ifscope = (pifscope != NULL) ? *pifscope : IFSCOPE_NONE; + + VERIFY(af == AF_INET || af == AF_INET6); + + bzero(dst, sizeof (*dst)); + + if (af == AF_INET) { + bcopy(src, dst, sizeof (struct sockaddr_in)); + if (pifscope == NULL || ifscope != IFSCOPE_NONE) + sin_set_ifscope(SA(dst), ifscope); + } else { + bcopy(src, dst, sizeof (struct sockaddr_in6)); + if (pifscope != NULL && + IN6_IS_SCOPE_EMBED(&SIN6(dst)->sin6_addr)) { + unsigned int eifscope; + /* + * If the address contains the embedded scope ID, + * use that as the value for sin6_scope_id as long + * the caller doesn't insist on clearing it (by + * passing NULL) or setting it. + */ + eifscope = sin6_get_embedded_ifscope(SA(dst)); + if (eifscope != IFSCOPE_NONE && ifscope == IFSCOPE_NONE) + ifscope = eifscope; + sin6_set_ifscope(SA(dst), ifscope); + /* + * If sin6_scope_id is set but the address doesn't + * contain the equivalent embedded value, set it. + */ + if (ifscope != IFSCOPE_NONE && eifscope != ifscope) + sin6_set_embedded_ifscope(SA(dst), ifscope); + } else if (pifscope == NULL || ifscope != IFSCOPE_NONE) { + sin6_set_ifscope(SA(dst), ifscope); + } + } + + if (pifscope != NULL) { + *pifscope = (af == AF_INET) ? sin_get_ifscope(SA(dst)) : + sin6_get_ifscope(SA(dst)); + } return (SA(dst)); } /* - * Copy a mask from src to a sockaddr_in dst and embed ifscope into dst. + * Copy a mask from src to a dst storage and set scope ID into dst. 
*/ static struct sockaddr * -mask_copy(struct sockaddr *src, struct sockaddr_in *dst, unsigned int ifscope) +ma_copy(int af, struct sockaddr *src, struct sockaddr_storage *dst, + unsigned int ifscope) { - /* We know dst is at least the size of sockaddr{_in} */ + VERIFY(af == AF_INET || af == AF_INET6); + bzero(dst, sizeof (*dst)); rt_maskedcopy(src, SA(dst), src); /* * The length of the mask sockaddr would need to be adjusted - * to cover the additional sin_ifscope field; when ifscope is - * IFSCOPE_NONE, we'd end up clearing the embedded ifscope on + * to cover the additional {sin,sin6}_ifscope field; when ifscope + * is IFSCOPE_NONE, we'd end up clearing the scope ID field on * the destination mask in addition to extending the length * of the sockaddr, as a side effect. This is okay, as any * trailing zeroes would be skipped by rn_addmask prior to * inserting or looking up the mask in the mask tree. */ - SINIFSCOPE(dst)->sin_ifscope = ifscope; - SINIFSCOPE(dst)->sin_len = - offsetof(struct sockaddr_inifscope, sin_ifscope) + - sizeof (SINIFSCOPE(dst)->sin_ifscope); + if (af == AF_INET) { + SINIFSCOPE(dst)->sin_scope_id = ifscope; + SINIFSCOPE(dst)->sin_len = + offsetof(struct sockaddr_inifscope, sin_scope_id) + + sizeof (SINIFSCOPE(dst)->sin_scope_id); + } else { + SIN6IFSCOPE(dst)->sin6_scope_id = ifscope; + SIN6IFSCOPE(dst)->sin6_len = + offsetof(struct sockaddr_in6, sin6_scope_id) + + sizeof (SIN6IFSCOPE(dst)->sin6_scope_id); + } return (SA(dst)); } @@ -501,15 +624,15 @@ sa_trim(struct sockaddr *sa, int skip) } /* - * Called by rtm_msg{1,2} routines to "scrub" the embedded interface scope - * away from the socket address structure, so that clients of the routing - * socket will not be confused by the presence of the embedded scope, or the - * side effect of the increased length due to that. The source sockaddr is - * not modified; instead, the scrubbing happens on the destination sockaddr - * storage that is passed in by the caller. 
+ * Called by rtm_msg{1,2} routines to "scrub" the scope ID field away from + * the socket address structure, so that clients of the routing socket will + * not be confused by the presence of the information, or the side effect of + * the increased length due to that. The source sockaddr is not modified; + * instead, the scrubbing happens on the destination sockaddr storage that + * is passed in by the caller. */ struct sockaddr * -rtm_scrub_ifscope(int idx, struct sockaddr *hint, struct sockaddr *sa, +rtm_scrub_ifscope(int type, int idx, struct sockaddr *hint, struct sockaddr *sa, struct sockaddr_storage *ss) { struct sockaddr *ret = sa; @@ -517,39 +640,64 @@ rtm_scrub_ifscope(int idx, struct sockaddr *hint, struct sockaddr *sa, switch (idx) { case RTAX_DST: /* - * If this is for an AF_INET destination address, call - * sin_copy() with IFSCOPE_NONE as it does what we need. + * If this is for an AF_INET/AF_INET6 destination address, + * call sa_copy() to clear the scope ID field. */ if (sa->sa_family == AF_INET && - SINIFSCOPE(sa)->sin_ifscope != IFSCOPE_NONE) { - bzero(ss, sizeof (*ss)); - ret = sin_copy(SIN(sa), SIN(ss), IFSCOPE_NONE); + SINIFSCOPE(sa)->sin_scope_id != IFSCOPE_NONE) { + ret = sa_copy(sa, ss, NULL); + } else if (sa->sa_family == AF_INET6 && + SIN6IFSCOPE(sa)->sin6_scope_id != IFSCOPE_NONE) { + ret = sa_copy(sa, ss, NULL); } break; case RTAX_NETMASK: { + int skip, af; /* - * If this is for a mask, we can't tell whether or not - * there is an embedded interface scope, as the span of - * bytes between sa_len and the beginning of the mask - * (offset of sin_addr in the case of AF_INET) may be - * filled with all-ones by rn_addmask(), and hence we - * cannot rely on sa_family. Because of this, we use - * the sa_family of the hint sockaddr (RTAX_{DST,IFA}) - * as indicator as to whether or not the mask is to be - * treated as one for AF_INET. 
Clearing the embedded - * scope involves setting it to IFSCOPE_NONE followed - * by calling sa_trim() to trim trailing zeroes from - * the storage sockaddr, which reverses what was done - * earlier by mask_copy() on the source sockaddr. + * If this is for a mask, we can't tell whether or not there + * is an valid scope ID value, as the span of bytes between + * sa_len and the beginning of the mask (offset of sin_addr in + * the case of AF_INET, or sin6_addr for AF_INET6) may be + * filled with all-ones by rn_addmask(), and hence we cannot + * rely on sa_family. Because of this, we use the sa_family + * of the hint sockaddr (RTAX_{DST,IFA}) as indicator as to + * whether or not the mask is to be treated as one for AF_INET + * or AF_INET6. Clearing the scope ID field involves setting + * it to IFSCOPE_NONE followed by calling sa_trim() to trim + * trailing zeroes from the storage sockaddr, which reverses + * what was done earlier by ma_copy() on the source sockaddr. */ - int skip = offsetof(struct sockaddr_in, sin_addr); - if (sa->sa_len > skip && sa->sa_len <= sizeof (*ss) && - hint != NULL && hint->sa_family == AF_INET) { + if (hint == NULL || + ((af = hint->sa_family) != AF_INET && af != AF_INET6)) + break; /* nothing to do */ + + skip = (af == AF_INET) ? + offsetof(struct sockaddr_in, sin_addr) : + offsetof(struct sockaddr_in6, sin6_addr); + + if (sa->sa_len > skip && sa->sa_len <= sizeof (*ss)) { bzero(ss, sizeof (*ss)); bcopy(sa, ss, sa->sa_len); - SINIFSCOPE(ss)->sin_ifscope = IFSCOPE_NONE; + /* + * Don't use {sin,sin6}_set_ifscope() as sa_family + * and sa_len for the netmask might not be set to + * the corresponding expected values of the hint. 
+ */ + if (hint->sa_family == AF_INET) + SINIFSCOPE(ss)->sin_scope_id = IFSCOPE_NONE; + else + SIN6IFSCOPE(ss)->sin6_scope_id = IFSCOPE_NONE; ret = sa_trim(SA(ss), skip); + + /* + * For AF_INET6 mask, set sa_len appropriately unless + * this is requested via systl_dumpentry(), in which + * case we return the raw value. + */ + if (hint->sa_family == AF_INET6 && + type != RTM_GET && type != RTM_GET2) + SA(ret)->sa_len = sizeof (struct sockaddr_in6); } break; } @@ -569,11 +717,14 @@ rn_match_ifscope(struct radix_node *rn, void *arg) { struct rtentry *rt = (struct rtentry *)rn; struct matchleaf_arg *ma = arg; + int af = rt_key(rt)->sa_family; - if (!(rt->rt_flags & RTF_IFSCOPE) || rt_key(rt)->sa_family != AF_INET) + if (!(rt->rt_flags & RTF_IFSCOPE) || (af != AF_INET && af != AF_INET6)) return (0); - return (SINIFSCOPE(rt_key(rt))->sin_ifscope == ma->ifscope); + return (af == AF_INET ? + (SINIFSCOPE(rt_key(rt))->sin_scope_id == ma->ifscope) : + (SIN6IFSCOPE(rt_key(rt))->sin6_scope_id == ma->ifscope)); } static void @@ -624,6 +775,7 @@ route_init(void) panic("route_init: failed allocating rte_zone"); zone_change(rte_zone, Z_EXPAND, TRUE); + zone_change(rte_zone, Z_CALLERACCT, FALSE); zone_change(rte_zone, Z_NOENCRYPT, TRUE); TAILQ_INIT(&rttrash_head); @@ -648,16 +800,9 @@ rtalloc(struct route *ro) } void -rtalloc_ign_locked(struct route *ro, uint32_t ignore) +rtalloc_scoped(struct route *ro, unsigned int ifscope) { - return (rtalloc_ign_common_locked(ro, ignore, IFSCOPE_NONE)); -} - -void -rtalloc_scoped_ign_locked(struct route *ro, uint32_t ignore, - unsigned int ifscope) -{ - return (rtalloc_ign_common_locked(ro, ignore, ifscope)); + rtalloc_scoped_ign(ro, 0, ifscope); } static void @@ -689,7 +834,7 @@ rtalloc_ign(struct route *ro, uint32_t ignore) { lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_NOTOWNED); lck_mtx_lock(rnh_lock); - rtalloc_ign_locked(ro, ignore); + rtalloc_ign_common_locked(ro, ignore, IFSCOPE_NONE); lck_mtx_unlock(rnh_lock); } @@ -698,11 +843,11 @@ 
rtalloc_scoped_ign(struct route *ro, uint32_t ignore, unsigned int ifscope) { lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_NOTOWNED); lck_mtx_lock(rnh_lock); - rtalloc_scoped_ign_locked(ro, ignore, ifscope); + rtalloc_ign_common_locked(ro, ignore, ifscope); lck_mtx_unlock(rnh_lock); } -struct rtentry * +static struct rtentry * rtalloc1_locked(struct sockaddr *dst, int report, uint32_t ignflags) { return (rtalloc1_common_locked(dst, report, ignflags, IFSCOPE_NONE)); @@ -910,6 +1055,9 @@ rtfree_common(struct rtentry *rt, boolean_t locked) * resources associated with the route. */ if (!(rt->rt_flags & RTF_UP)) { + struct rtentry *rt_parent; + struct ifaddr *rt_ifa; + if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic("rt %p freed while in radix tree\n", rt); /* @@ -922,25 +1070,15 @@ rtfree_common(struct rtentry *rt, boolean_t locked) rtd_trash_link); } - /* - * Route is no longer in the tree and refcnt is 0; - * we have exclusive access, so destroy it. - */ - RT_UNLOCK(rt); - /* * release references on items we hold them on.. * e.g other routes and ifaddrs. */ - if (rt->rt_parent != NULL) { - rtfree_locked(rt->rt_parent); + if ((rt_parent = rt->rt_parent) != NULL) rt->rt_parent = NULL; - } - if (rt->rt_ifa != NULL) { - ifafree(rt->rt_ifa); + if ((rt_ifa = rt->rt_ifa) != NULL) rt->rt_ifa = NULL; - } /* * Now free any attached link-layer info. @@ -953,6 +1091,18 @@ rtfree_common(struct rtentry *rt, boolean_t locked) rt->rt_llinfo = NULL; } + /* + * Route is no longer in the tree and refcnt is 0; + * we have exclusive access, so destroy it. + */ + RT_UNLOCK(rt); + + if (rt_parent != NULL) + rtfree_locked(rt_parent); + + if (rt_ifa != NULL) + IFA_REMREF(rt_ifa); + /* * The key is separately alloc'd so free it (see rt_setgate()). 
* This also frees the gateway, as they are always malloc'd @@ -960,6 +1110,11 @@ rtfree_common(struct rtentry *rt, boolean_t locked) */ R_Free(rt_key(rt)); + /* + * Free any statistics that may have been allocated + */ + nstat_route_detach(rt); + /* * and the rtentry itself of course */ @@ -1057,16 +1212,19 @@ rtsetifa(struct rtentry *rt, struct ifaddr* ifa) if (rt->rt_ifa == ifa) return; + /* Become a regular mutex, just in case */ + RT_CONVERT_LOCK(rt); + /* Release the old ifa */ if (rt->rt_ifa) - ifafree(rt->rt_ifa); + IFA_REMREF(rt->rt_ifa); /* Set rt_ifa */ rt->rt_ifa = ifa; /* Take a reference to the ifa */ if (rt->rt_ifa) - ifaref(rt->rt_ifa); + IFA_ADDREF(rt->rt_ifa); } /* @@ -1086,11 +1244,23 @@ rtredirect(struct ifnet *ifp, struct sockaddr *dst, struct sockaddr *gateway, struct rt_addrinfo info; struct ifaddr *ifa = NULL; unsigned int ifscope = (ifp != NULL) ? ifp->if_index : IFSCOPE_NONE; - struct sockaddr_in sin; + struct sockaddr_storage ss; lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_NOTOWNED); lck_mtx_lock(rnh_lock); + /* + * Transform src into the internal routing table form for + * comparison against rt_gateway below. + */ +#if INET6 + if ((src->sa_family == AF_INET && ip_doscopedroute) || + (src->sa_family == AF_INET6 && ip6_doscopedroute)) +#else + if (src->sa_family == AF_INET && ip_doscopedroute) +#endif /* !INET6 */ + src = sa_copy(src, &ss, &ifscope); + /* * Verify the gateway is directly reachable; if scoped routing * is enabled, verify that it is reachable from the interface @@ -1106,31 +1276,29 @@ rtredirect(struct ifnet *ifp, struct sockaddr *dst, struct sockaddr *gateway, if (rt != NULL) RT_LOCK(rt); - /* Embed scope in src for comparison against rt_gateway below */ - if (ip_doscopedroute && src->sa_family == AF_INET) - src = sin_copy(SIN(src), &sin, ifscope); - /* * If the redirect isn't from our current router for this dst, * it's either old or wrong. 
If it redirects us to ourselves, * we have a routing loop, perhaps as a result of an interface - * going down recently. + * going down recently. Holding rnh_lock here prevents the + * possibility of rt_ifa/ifa's ifa_addr from changing (e.g. + * in_ifinit), so okay to access ifa_addr without locking. */ if (!(flags & RTF_DONE) && rt != NULL && (!equal(src, rt->rt_gateway) || !equal(rt->rt_ifa->ifa_addr, ifa->ifa_addr))) { error = EINVAL; } else { - ifafree(ifa); + IFA_REMREF(ifa); if ((ifa = ifa_ifwithaddr(gateway))) { - ifafree(ifa); + IFA_REMREF(ifa); ifa = NULL; error = EHOSTUNREACH; } } if (ifa) { - ifafree(ifa); + IFA_REMREF(ifa); ifa = NULL; } @@ -1265,25 +1433,36 @@ ifa_ifwithroute_scoped_locked(int flags, const struct sockaddr *dst, static struct ifaddr * ifa_ifwithroute_common_locked(int flags, const struct sockaddr *dst, - const struct sockaddr *gateway, unsigned int ifscope) + const struct sockaddr *gw, unsigned int ifscope) { struct ifaddr *ifa = NULL; struct rtentry *rt = NULL; - struct sockaddr_in dst_in, gw_in; + struct sockaddr_storage dst_ss, gw_ss; lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); - if (ip_doscopedroute) { - /* - * Just in case the sockaddr passed in by the caller - * contains embedded scope, make sure to clear it since - * IPv4 interface addresses aren't scoped. - */ - if (dst != NULL && dst->sa_family == AF_INET) - dst = sin_copy(SIN(dst), &dst_in, IFSCOPE_NONE); - if (gateway != NULL && gateway->sa_family == AF_INET) - gateway = sin_copy(SIN(gateway), &gw_in, IFSCOPE_NONE); - } + /* + * Just in case the sockaddr passed in by the caller + * contains a scope ID, make sure to clear it since + * interface addresses aren't scoped. 
+ */ +#if INET6 + if (dst != NULL && + ((dst->sa_family == AF_INET && ip_doscopedroute) || + (dst->sa_family == AF_INET6 && ip6_doscopedroute))) +#else + if (dst != NULL && dst->sa_family == AF_INET && ip_doscopedroute) +#endif /* !INET6 */ + dst = sa_copy(SA(dst), &dst_ss, NULL); + +#if INET6 + if (gw != NULL && + ((gw->sa_family == AF_INET && ip_doscopedroute) || + (gw->sa_family == AF_INET6 && ip6_doscopedroute))) +#else + if (gw != NULL && gw->sa_family == AF_INET && ip_doscopedroute) +#endif /* !INET6 */ + gw = sa_copy(SA(gw), &gw_ss, NULL); if (!(flags & RTF_GATEWAY)) { /* @@ -1297,17 +1476,17 @@ ifa_ifwithroute_common_locked(int flags, const struct sockaddr *dst, ifa = ifa_ifwithdstaddr(dst); } if (ifa == NULL) - ifa = ifa_ifwithaddr_scoped(gateway, ifscope); + ifa = ifa_ifwithaddr_scoped(gw, ifscope); } else { /* * If we are adding a route to a remote net * or host, the gateway may still be on the * other end of a pt to pt link. */ - ifa = ifa_ifwithdstaddr(gateway); + ifa = ifa_ifwithdstaddr(gw); } if (ifa == NULL) - ifa = ifa_ifwithnet_scoped(gateway, ifscope); + ifa = ifa_ifwithnet_scoped(gw, ifscope); if (ifa == NULL) { /* Workaround to avoid gcc warning regarding const variable */ rt = rtalloc1_scoped_locked((struct sockaddr *)(size_t)dst, @@ -1315,19 +1494,27 @@ ifa_ifwithroute_common_locked(int flags, const struct sockaddr *dst, if (rt != NULL) { RT_LOCK_SPIN(rt); ifa = rt->rt_ifa; - if (ifa != NULL) - ifaref(ifa); + if (ifa != NULL) { + /* Become a regular mutex */ + RT_CONVERT_LOCK(rt); + IFA_ADDREF(ifa); + } RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); rt = NULL; } } + /* + * Holding rnh_lock here prevents the possibility of ifa from + * changing (e.g. in_ifinit), so it is safe to access its + * ifa_addr (here and down below) without locking. 
+ */ if (ifa != NULL && ifa->ifa_addr->sa_family != dst->sa_family) { struct ifaddr *newifa; /* Callee adds reference to newifa upon success */ newifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp); if (newifa != NULL) { - ifafree(ifa); + IFA_REMREF(ifa); ifa = newifa; } } @@ -1337,18 +1524,21 @@ ifa_ifwithroute_common_locked(int flags, const struct sockaddr *dst, * that may not agree with info garnered from the interfaces. * The routing table should carry more precedence than the * interfaces in this matter. Must be careful not to stomp - * on new entries from rtinit, hence (ifa->ifa_addr != gateway). + * on new entries from rtinit, hence (ifa->ifa_addr != gw). */ if ((ifa == NULL || - !equal(ifa->ifa_addr, (struct sockaddr *)(size_t)gateway)) && - (rt = rtalloc1_scoped_locked((struct sockaddr *)(size_t)gateway, + !equal(ifa->ifa_addr, (struct sockaddr *)(size_t)gw)) && + (rt = rtalloc1_scoped_locked((struct sockaddr *)(size_t)gw, 0, 0, ifscope)) != NULL) { if (ifa != NULL) - ifafree(ifa); + IFA_REMREF(ifa); RT_LOCK_SPIN(rt); ifa = rt->rt_ifa; - if (ifa != NULL) - ifaref(ifa); + if (ifa != NULL) { + /* Become a regular mutex */ + RT_CONVERT_LOCK(rt); + IFA_ADDREF(ifa); + } RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); } @@ -1359,7 +1549,7 @@ ifa_ifwithroute_common_locked(int flags, const struct sockaddr *dst, */ if ((flags & RTF_IFSCOPE) && ifa != NULL && ifa->ifa_ifp->if_index != ifscope) { - ifafree(ifa); + IFA_REMREF(ifa); ifa = NULL; } @@ -1400,7 +1590,7 @@ rtrequest_scoped_locked(int req, struct sockaddr *dst, * Do appropriate manipulations of a routing tree given all the bits of * info needed. * - * Embedding the scope in the radix key is an internal job that should be + * Storing the scope ID in the radix key is an internal job that should be * left to routines in this module. Callers should specify the scope value * to the "scoped" variants of route routines instead of manipulating the * key itself. This is typically done when creating a scoped route, e.g. 
@@ -1422,59 +1612,79 @@ rtrequest_common_locked(int req, struct sockaddr *dst0, struct radix_node_head *rnh; struct ifaddr *ifa = NULL; struct sockaddr *ndst, *dst = dst0; - struct sockaddr_in sin, mask; + struct sockaddr_storage ss, mask; + struct timeval curr_calendartime; + int af = dst->sa_family; + void (*ifa_rtrequest)(int, struct rtentry *, struct sockaddr *); + #define senderr(x) { error = x ; goto bad; } lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); /* * Find the correct routing tree to use for this Address Family */ - if ((rnh = rt_tables[dst->sa_family]) == 0) + if ((rnh = rt_tables[af]) == NULL) senderr(ESRCH); /* * If we are adding a host route then we don't want to put * a netmask in the tree */ if (flags & RTF_HOST) - netmask = 0; + netmask = NULL; /* - * If RTF_IFSCOPE is specified, use a local copy of the destination - * address to embed the scope into. This logic is repeated below + * If Scoped Routing is enabled, use a local copy of the destination + * address to store the scope ID into. This logic is repeated below * in the RTM_RESOLVE handler since the caller does not normally - * specify such a flag during a resolve; instead it passes in the - * route used for cloning for which the scope info is derived from. - * Note also that in the case of RTM_DELETE, the address passed in - * by the caller might already contain the embedded scope info when - * it is the key itself, thus making RTF_IFSCOPE unnecessary; one - * instance where it is explicitly set is inside route_output() - * as part of handling a routing socket request. + * specify such a flag during a resolve, as well as for the handling + * of IPv4 link-local address; instead, it passes in the route used for + * cloning for which the scope info is derived from. 
Note also that + * in the case of RTM_DELETE, the address passed in by the caller + * might already contain the scope ID info when it is the key itself, + * thus making RTF_IFSCOPE unnecessary; one instance where it is + * explicitly set is inside route_output() as part of handling a + * routing socket request. */ - if (req != RTM_RESOLVE && (flags & RTF_IFSCOPE)) { - /* Scoped routing is for AF_INET only */ - if (dst->sa_family != AF_INET || - (req == RTM_ADD && !ip_doscopedroute)) - senderr(EINVAL); +#if INET6 + if (req != RTM_RESOLVE && + ((af == AF_INET && ip_doscopedroute) || + (af == AF_INET6 && ip6_doscopedroute))) { +#else + if (req != RTM_RESOLVE && af == AF_INET && ip_doscopedroute) { +#endif /* !INET6 */ + /* Transform dst into the internal routing table form */ + dst = sa_copy(dst, &ss, &ifscope); - if (ifscope == IFSCOPE_NONE) { - flags &= ~RTF_IFSCOPE; - } else { - /* Embed ifscope into the key (local copy) */ - dst = sin_copy(SIN(dst), &sin, ifscope); + /* Transform netmask into the internal routing table form */ + if (netmask != NULL) + netmask = ma_copy(af, netmask, &mask, ifscope); - /* Embed ifscope into netmask (local copy) */ - if (netmask != NULL) - netmask = mask_copy(netmask, &mask, ifscope); - } + if (ifscope != IFSCOPE_NONE) + flags |= RTF_IFSCOPE; + } else { + if ((flags & RTF_IFSCOPE) && (af != AF_INET && af != AF_INET6)) + senderr(EINVAL); + +#if INET6 + if ((af == AF_INET && !ip_doscopedroute) || + (af == AF_INET6 && !ip6_doscopedroute)) +#else + if (af == AF_INET && !ip_doscopedroute) +#endif /* !INET6 */ + ifscope = IFSCOPE_NONE; } + if (ifscope == IFSCOPE_NONE) + flags &= ~RTF_IFSCOPE; + switch (req) { - case RTM_DELETE: + case RTM_DELETE: { + struct rtentry *gwrt = NULL; /* * Remove the item from the tree and return it. * Complain if it is not there and do no more processing. 
*/ - if ((rn = rnh->rnh_deladdr(dst, netmask, rnh)) == 0) + if ((rn = rnh->rnh_deladdr(dst, netmask, rnh)) == NULL) senderr(ESRCH); if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic ("rtrequest delete"); @@ -1512,20 +1722,22 @@ rtrequest_common_locked(int req, struct sockaddr *dst0, /* * Remove any external references we may have. - * This might result in another rtentry being freed if - * we held its last reference. */ - if (rt->rt_gwroute != NULL) { - rtfree_locked(rt->rt_gwroute); + if ((gwrt = rt->rt_gwroute) != NULL) rt->rt_gwroute = NULL; - } /* * give the protocol a chance to keep things in sync. */ - if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) - ifa->ifa_rtrequest(RTM_DELETE, rt, SA(0)); - ifa = NULL; + if ((ifa = rt->rt_ifa) != NULL) { + IFA_LOCK_SPIN(ifa); + ifa_rtrequest = ifa->ifa_rtrequest; + IFA_UNLOCK(ifa); + if (ifa_rtrequest != NULL) + ifa_rtrequest(RTM_DELETE, rt, NULL); + /* keep reference on rt_ifa */ + ifa = NULL; + } /* * one more rtentry floating around that is not @@ -1541,18 +1753,23 @@ rtrequest_common_locked(int req, struct sockaddr *dst0, * If this is the (non-scoped) default route, clear * the interface index used for the primary ifscope. */ - if (rt_inet_default(rt, rt_key(rt))) - set_primary_ifscope(IFSCOPE_NONE); - -#if IFNET_ROUTE_REFCNT - if (rt->rt_if_ref_fn != NULL) { - rt->rt_if_ref_fn(rt->rt_ifp, -1); - rt->rt_flags &= ~RTF_IFREF; + if (rt_primary_default(rt, rt_key(rt))) { + set_primary_ifscope(rt_key(rt)->sa_family, + IFSCOPE_NONE); } -#endif /* IFNET_ROUTE_REFCNT */ + rt_clear_idleref(rt); RT_UNLOCK(rt); + /* + * This might result in another rtentry being freed if + * we held its last reference. Do this after the rtentry + * lock is dropped above, as it could lead to the same + * lock being acquired if gwrt is a clone of rt. 
+ */ + if (gwrt != NULL) + rtfree_locked(gwrt); + /* * If the caller wants it, then it can have it, * but it's up to it to free the rtentry as we won't be @@ -1566,9 +1783,9 @@ rtrequest_common_locked(int req, struct sockaddr *dst0, rtfree_locked(rt); } break; - + } case RTM_RESOLVE: - if (ret_nrt == 0 || (rt = *ret_nrt) == 0) + if (ret_nrt == NULL || (rt = *ret_nrt) == NULL) senderr(EINVAL); /* * If cloning, we have the parent route given by the caller @@ -1581,40 +1798,55 @@ rtrequest_common_locked(int req, struct sockaddr *dst0, * of rt_rmx. */ ifa = rt->rt_ifa; - ifaref(ifa); + IFA_ADDREF(ifa); flags = rt->rt_flags & ~(RTF_CLONING | RTF_PRCLONING | RTF_STATIC); flags |= RTF_WASCLONED; gateway = rt->rt_gateway; - if ((netmask = rt->rt_genmask) == 0) + if ((netmask = rt->rt_genmask) == NULL) flags |= RTF_HOST; - if (!ip_doscopedroute || dst->sa_family != AF_INET) +#if INET6 + if ((af != AF_INET && af != AF_INET6) || + (af == AF_INET && !ip_doscopedroute) || + (af == AF_INET6 && !ip6_doscopedroute)) +#else + if (af != AF_INET || !ip_doscopedroute) +#endif /* !INET6 */ goto makeroute; + /* * When scoped routing is enabled, cloned entries are * always scoped according to the interface portion of * the parent route. The exception to this are IPv4 * link local addresses. */ - if (!IN_LINKLOCAL(ntohl(SIN(dst)->sin_addr.s_addr))) { + if (af == AF_INET && + IN_LINKLOCAL(ntohl(SIN(dst)->sin_addr.s_addr))) { + ifscope = IFSCOPE_NONE; + flags &= ~RTF_IFSCOPE; + } else { if (flags & RTF_IFSCOPE) { - ifscope = sa_get_ifscope(rt_key(rt)); + ifscope = (af == AF_INET) ? 
+ sin_get_ifscope(rt_key(rt)) : + sin6_get_ifscope(rt_key(rt)); } else { ifscope = rt->rt_ifp->if_index; flags |= RTF_IFSCOPE; } - } else { - ifscope = IFSCOPE_NONE; - flags &= ~RTF_IFSCOPE; + VERIFY(ifscope != IFSCOPE_NONE); } - /* Embed or clear ifscope into/from the key (local copy) */ - dst = sin_copy(SIN(dst), &sin, ifscope); + /* + * Transform dst into the internal routing table form, + * clearing out the scope ID field if ifscope isn't set. + */ + dst = sa_copy(dst, &ss, (ifscope == IFSCOPE_NONE) ? + NULL : &ifscope); - /* Embed or clear ifscope into/from netmask (local copy) */ + /* Transform netmask into the internal routing table form */ if (netmask != NULL) - netmask = mask_copy(netmask, &mask, ifscope); + netmask = ma_copy(af, netmask, &mask, ifscope); goto makeroute; @@ -1631,10 +1863,13 @@ rtrequest_common_locked(int req, struct sockaddr *dst0, if (ifa == NULL) senderr(ENETUNREACH); makeroute: + getmicrotime(&curr_calendartime); if ((rt = rte_alloc()) == NULL) senderr(ENOBUFS); Bzero(rt, sizeof(*rt)); rte_lock_init(rt); + rt->base_calendartime = curr_calendartime.tv_sec; + rt->base_uptime = net_uptime(); RT_LOCK(rt); rt->rt_flags = RTF_UP | flags; @@ -1644,6 +1879,7 @@ rtrequest_common_locked(int req, struct sockaddr *dst0, */ if ((error = rt_setgate(rt, dst, gateway)) != 0) { RT_UNLOCK(rt); + nstat_route_detach(rt); rte_lock_destroy(rt); rte_free(rt); senderr(error); @@ -1712,23 +1948,24 @@ rtrequest_common_locked(int req, struct sockaddr *dst0, * If it still failed to go into the tree, * then un-make it (this should be a function) */ - if (rn == 0) { + if (rn == NULL) { if (rt->rt_gwroute) { rtfree_locked(rt->rt_gwroute); rt->rt_gwroute = NULL; } if (rt->rt_ifa) { - ifafree(rt->rt_ifa); + IFA_REMREF(rt->rt_ifa); rt->rt_ifa = NULL; } R_Free(rt_key(rt)); RT_UNLOCK(rt); + nstat_route_detach(rt); rte_lock_destroy(rt); rte_free(rt); senderr(EEXIST); } - rt->rt_parent = 0; + rt->rt_parent = NULL; /* * If we got here from RESOLVE, then we are cloning so 
clone @@ -1741,42 +1978,46 @@ rtrequest_common_locked(int req, struct sockaddr *dst0, */ if (req == RTM_RESOLVE) { RT_LOCK_SPIN(*ret_nrt); - rt->rt_rmx = (*ret_nrt)->rt_rmx; /* copy metrics */ + VERIFY((*ret_nrt)->rt_expire == 0 || (*ret_nrt)->rt_rmx.rmx_expire != 0); + VERIFY((*ret_nrt)->rt_expire != 0 || (*ret_nrt)->rt_rmx.rmx_expire == 0); + rt->rt_rmx = (*ret_nrt)->rt_rmx; + rt_setexpire(rt, (*ret_nrt)->rt_expire); if ((*ret_nrt)->rt_flags & (RTF_CLONING | RTF_PRCLONING)) { rt->rt_parent = (*ret_nrt); RT_ADDREF_LOCKED(*ret_nrt); } RT_UNLOCK(*ret_nrt); -#if IFNET_ROUTE_REFCNT /* * Enable interface reference counting for unicast * cloned routes and bump up the reference count. */ if (rt->rt_parent != NULL && !(rt->rt_flags & (RTF_BROADCAST | RTF_MULTICAST))) { - rt->rt_if_ref_fn = rte_if_ref; - rt->rt_if_ref_fn(rt->rt_ifp, 1); - rt->rt_flags |= RTF_IFREF; + rt_set_idleref(rt); } -#endif /* IFNET_ROUTE_REFCNT */ } /* * if this protocol has something to add to this then * allow it to do that as well. */ - if (ifa->ifa_rtrequest) - ifa->ifa_rtrequest(req, rt, SA(ret_nrt ? *ret_nrt : 0)); - ifafree(ifa); - ifa = 0; + IFA_LOCK_SPIN(ifa); + ifa_rtrequest = ifa->ifa_rtrequest; + IFA_UNLOCK(ifa); + if (ifa_rtrequest != NULL) + ifa_rtrequest(req, rt, SA(ret_nrt ? *ret_nrt : NULL)); + IFA_REMREF(ifa); + ifa = NULL; /* * If this is the (non-scoped) default route, record * the interface index used for the primary ifscope. */ - if (rt_inet_default(rt, rt_key(rt))) - set_primary_ifscope(rt->rt_ifp->if_index); + if (rt_primary_default(rt, rt_key(rt))) { + set_primary_ifscope(rt_key(rt)->sa_family, + rt->rt_ifp->if_index); + } /* * actually return a resultant rtentry and @@ -1793,7 +2034,7 @@ rtrequest_common_locked(int req, struct sockaddr *dst0, * hasn't been added to the tree yet. 
*/ if (req == RTM_ADD && - !(rt->rt_flags & RTF_HOST) && rt_mask(rt) != 0) { + !(rt->rt_flags & RTF_HOST) && rt_mask(rt) != NULL) { struct rtfc_arg arg; arg.rnh = rnh; arg.rt0 = rt; @@ -1803,22 +2044,19 @@ rtrequest_common_locked(int req, struct sockaddr *dst0, } else { RT_UNLOCK(rt); } + + nstat_route_new_entry(rt); break; } bad: if (ifa) - ifafree(ifa); + IFA_REMREF(ifa); return (error); } int -rtrequest( - int req, - struct sockaddr *dst, - struct sockaddr *gateway, - struct sockaddr *netmask, - int flags, - struct rtentry **ret_nrt) +rtrequest(int req, struct sockaddr *dst, struct sockaddr *gateway, + struct sockaddr *netmask, int flags, struct rtentry **ret_nrt) { int error; lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_NOTOWNED); @@ -1827,6 +2065,21 @@ rtrequest( lck_mtx_unlock(rnh_lock); return (error); } + +int +rtrequest_scoped(int req, struct sockaddr *dst, struct sockaddr *gateway, + struct sockaddr *netmask, int flags, struct rtentry **ret_nrt, + unsigned int ifscope) +{ + int error; + lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_lock(rnh_lock); + error = rtrequest_scoped_locked(req, dst, gateway, netmask, flags, + ret_nrt, ifscope); + lck_mtx_unlock(rnh_lock); + return (error); +} + /* * Called from rtrequest(RTM_DELETE, ...) to fix up the route's ``family'' * (i.e., the routes related to it by the operation of cloning). This @@ -2018,11 +2271,16 @@ rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) struct rtentry *gwrt; unsigned int ifscope; - ifscope = (dst->sa_family == AF_INET) ? 
- sa_get_ifscope(dst) : IFSCOPE_NONE; + if (dst->sa_family == AF_INET) + ifscope = sin_get_ifscope(dst); + else if (dst->sa_family == AF_INET6) + ifscope = sin6_get_ifscope(dst); + else + ifscope = IFSCOPE_NONE; RT_UNLOCK(rt); - gwrt = rtalloc1_scoped_locked(gate, 1, RTF_PRCLONING, ifscope); + gwrt = rtalloc1_scoped_locked(gate, 1, + RTF_CLONING | RTF_PRCLONING, ifscope); if (gwrt != NULL) RT_LOCK_ASSERT_NOTHELD(gwrt); RT_LOCK(rt); @@ -2082,8 +2340,10 @@ rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) * primary ifscope. Also done in rt_setif() to take care * of the non-redirect cases. */ - if (rt_inet_default(rt, dst) && rt->rt_ifp != NULL) - set_primary_ifscope(rt->rt_ifp->if_index); + if (rt_primary_default(rt, dst) && rt->rt_ifp != NULL) { + set_primary_ifscope(dst->sa_family, + rt->rt_ifp->if_index); + } /* * Tell the kernel debugger about the new default gateway @@ -2095,8 +2355,8 @@ rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) */ if ((dst->sa_family == AF_INET) && gwrt != NULL && gwrt->rt_gateway->sa_family == AF_LINK && - (gwrt->rt_ifp->if_index == get_primary_ifscope() || - get_primary_ifscope() == IFSCOPE_NONE)) + (gwrt->rt_ifp->if_index == get_primary_ifscope(AF_INET) || + get_primary_ifscope(AF_INET) == IFSCOPE_NONE)) kdp_set_gateway_mac(SDL(gwrt->rt_gateway)->sdl_data); } @@ -2142,11 +2402,16 @@ rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) * For consistency between rt_gateway and rt_key(gwrt). 
*/ if ((rt->rt_flags & RTF_GATEWAY) && rt->rt_gwroute != NULL && - (rt->rt_gwroute->rt_flags & RTF_IFSCOPE) && - rt->rt_gateway->sa_family == AF_INET && - rt_key(rt->rt_gwroute)->sa_family == AF_INET) { - sa_set_ifscope(rt->rt_gateway, - sa_get_ifscope(rt_key(rt->rt_gwroute))); + (rt->rt_gwroute->rt_flags & RTF_IFSCOPE)) { + if (rt->rt_gateway->sa_family == AF_INET && + rt_key(rt->rt_gwroute)->sa_family == AF_INET) { + sin_set_ifscope(rt->rt_gateway, + sin_get_ifscope(rt_key(rt->rt_gwroute))); + } else if (rt->rt_gateway->sa_family == AF_INET6 && + rt_key(rt->rt_gwroute)->sa_family == AF_INET6) { + sin6_set_ifscope(rt->rt_gateway, + sin6_get_ifscope(rt_key(rt->rt_gwroute))); + } } /* @@ -2192,32 +2457,35 @@ rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, } /* - * Lookup an AF_INET scoped or non-scoped route depending on the ifscope - * value passed in by the caller (IFSCOPE_NONE implies non-scoped). + * Lookup an AF_INET/AF_INET6 scoped or non-scoped route depending on the + * ifscope value passed in by the caller (IFSCOPE_NONE implies non-scoped). */ static struct radix_node * node_lookup(struct sockaddr *dst, struct sockaddr *netmask, unsigned int ifscope) { - struct radix_node_head *rnh = rt_tables[AF_INET]; + struct radix_node_head *rnh; struct radix_node *rn; - struct sockaddr_in sin, mask; + struct sockaddr_storage ss, mask; + int af = dst->sa_family; struct matchleaf_arg ma = { ifscope }; rn_matchf_t *f = rn_match_ifscope; void *w = &ma; - if (dst->sa_family != AF_INET) + if (af != AF_INET && af != AF_INET6) return (NULL); + rnh = rt_tables[af]; + /* - * Embed ifscope into the search key; for a non-scoped - * search this will clear out any embedded scope value. + * Transform dst into the internal routing table form, + * clearing out the scope ID field if ifscope isn't set. */ - dst = sin_copy(SIN(dst), &sin, ifscope); + dst = sa_copy(dst, &ss, (ifscope == IFSCOPE_NONE) ? 
NULL : &ifscope); - /* Embed (or clear) ifscope into netmask */ + /* Transform netmask into the internal routing table form */ if (netmask != NULL) - netmask = mask_copy(netmask, &mask, ifscope); + netmask = ma_copy(af, netmask, &mask, ifscope); if (ifscope == IFSCOPE_NONE) f = w = NULL; @@ -2230,13 +2498,18 @@ node_lookup(struct sockaddr *dst, struct sockaddr *netmask, } /* - * Lookup the AF_INET non-scoped default route. + * Lookup the AF_INET/AF_INET6 non-scoped default route. */ static struct radix_node * -node_lookup_default(void) +node_lookup_default(int af) { - struct radix_node_head *rnh = rt_tables[AF_INET]; - return (rnh->rnh_lookup(&sin_def, NULL, rnh)); + struct radix_node_head *rnh; + + VERIFY(af == AF_INET || af == AF_INET6); + rnh = rt_tables[af]; + + return (af == AF_INET ? rnh->rnh_lookup(&sin_def, NULL, rnh) : + rnh->rnh_lookup(&sin6_def, NULL, rnh)); } /* @@ -2250,10 +2523,10 @@ node_lookup_default(void) * per-interface route instance. This permits multiple route entries having * the same destination (but not necessarily the same gateway) to exist in * the routing table; each of these entries is specific to the corresponding - * interface. This is made possible by embedding the scope value into the + * interface. This is made possible by storing the scope ID value into the * radix key, thus making each route entry unique. These scoped entries * exist along with the regular, non-scoped entries in the same radix tree - * for a given address family (currently AF_INET only); the scope logically + * for a given address family (AF_INET/AF_INET6); the scope logically * partitions it into multiple per-interface sub-trees. 
* * When a scoped route lookup is performed, the routing table is searched for @@ -2267,7 +2540,9 @@ rt_lookup(boolean_t lookup_only, struct sockaddr *dst, struct sockaddr *netmask, struct radix_node_head *rnh, unsigned int ifscope) { struct radix_node *rn0, *rn; - boolean_t dontcare = (ifscope == IFSCOPE_NONE); + boolean_t dontcare; + int af = dst->sa_family; + struct sockaddr_storage dst_ss, mask_ss; lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); @@ -2277,11 +2552,14 @@ rt_lookup(boolean_t lookup_only, struct sockaddr *dst, struct sockaddr *netmask, /* * Non-scoped route lookup. */ - if (!ip_doscopedroute || dst->sa_family != AF_INET) { - if (lookup_only) - rn = rnh->rnh_lookup(dst, netmask, rnh); - else - rn = rnh->rnh_matchaddr(dst, rnh); +#if INET6 + if ((af != AF_INET && af != AF_INET6) || + (af == AF_INET && !ip_doscopedroute) || + (af == AF_INET6 && !ip6_doscopedroute)) { +#else + if (af != AF_INET || !ip_doscopedroute) { +#endif /* !INET6 */ + rn = rnh->rnh_matchaddr(dst, rnh); /* * Don't return a root node; also, rnh_matchaddr callback @@ -2303,6 +2581,12 @@ rt_lookup(boolean_t lookup_only, struct sockaddr *dst, struct sockaddr *netmask, return (RT(rn)); } + /* Transform dst/netmask into the internal routing table form */ + dst = sa_copy(dst, &dst_ss, &ifscope); + if (netmask != NULL) + netmask = ma_copy(af, netmask, &mask_ss, ifscope); + dontcare = (ifscope == IFSCOPE_NONE); + /* * Scoped route lookup: * @@ -2316,10 +2600,13 @@ rt_lookup(boolean_t lookup_only, struct sockaddr *dst, struct sockaddr *netmask, /* * If the caller did not specify a scope, use the primary scope * derived from the system's non-scoped default route. If, for - * any reason, there is no primary interface, return what we have. + * any reason, there is no primary interface, ifscope will be + * set to IFSCOPE_NONE; if the above lookup resulted in a route, + * we'll do a more-specific search below, scoped to the interface + * of that route. 
*/ - if (dontcare && (ifscope = get_primary_ifscope()) == IFSCOPE_NONE) - goto done; + if (dontcare) + ifscope = get_primary_ifscope(af); /* * Keep the original result if either of the following is true: @@ -2381,7 +2668,7 @@ rt_lookup(boolean_t lookup_only, struct sockaddr *dst, struct sockaddr *netmask, * as a more specific route. */ if (rn == NULL || (rn0 != NULL && - ((INET_DEFAULT(rt_key(RT(rn))) && !INET_DEFAULT(rt_key(RT(rn0)))) || + ((SA_DEFAULT(rt_key(RT(rn))) && !SA_DEFAULT(rt_key(RT(rn0)))) || (!RT_HOST(rn) && RT_HOST(rn0))))) rn = rn0; @@ -2389,23 +2676,20 @@ rt_lookup(boolean_t lookup_only, struct sockaddr *dst, struct sockaddr *netmask, * If we still don't have a route, use the non-scoped default * route as long as the interface portion satistifes the scope. */ - if (rn == NULL && (rn = node_lookup_default()) != NULL && + if (rn == NULL && (rn = node_lookup_default(af)) != NULL && RT(rn)->rt_ifp->if_index != ifscope) rn = NULL; -done: if (rn != NULL) { /* - * Manually clear RTPRF_OURS using in_validate() and + * Manually clear RTPRF_OURS using rt_validate() and * bump up the reference count after, and not before; - * we only get here for AF_INET. node_lookup() has - * done the check against RNF_ROOT, so we can be sure + * we only get here for AF_INET/AF_INET6. node_lookup() + * has done the check against RNF_ROOT, so we can be sure * that we're not returning a root node here. 
*/ RT_LOCK_SPIN(RT(rn)); - if (!(RT(rn)->rt_flags & RTF_CONDEMNED)) { - if (!lookup_only) - (void) in_validate(rn); + if (rt_validate(RT(rn))) { RT_ADDREF_LOCKED(RT(rn)); RT_UNLOCK(RT(rn)); } else { @@ -2417,6 +2701,25 @@ rt_lookup(boolean_t lookup_only, struct sockaddr *dst, struct sockaddr *netmask, return (RT(rn)); } +boolean_t +rt_validate(struct rtentry *rt) +{ + RT_LOCK_ASSERT_HELD(rt); + + if (!(rt->rt_flags & RTF_CONDEMNED)) { + int af = rt_key(rt)->sa_family; + + if (af == AF_INET) + (void) in_validate(RN(rt)); + else if (af == AF_INET6) + (void) in6_validate(RN(rt)); + } else { + rt = NULL; + } + + return (rt != NULL); +} + /* * Set up a routing table entry, normally * for an interface. @@ -2440,8 +2743,14 @@ rtinit_locked(struct ifaddr *ifa, int cmd, int flags) struct sockaddr *deldst; struct mbuf *m = 0; struct rtentry *nrt = 0; + u_int32_t ifa_flags; int error; + /* + * Holding rnh_lock here prevents the possibility of ifa from + * changing (e.g. in_ifinit), so it is safe to access its + * ifa_{dst}addr (here and down below) without locking. + */ dst = flags & RTF_HOST ? ifa->ifa_dstaddr : ifa->ifa_addr; /* * If it's a delete, check that if it exists, it's on the correct @@ -2513,8 +2822,11 @@ rtinit_locked(struct ifaddr *ifa, int cmd, int flags) /* * Do the actual request */ + IFA_LOCK_SPIN(ifa); + ifa_flags = ifa->ifa_flags; + IFA_UNLOCK(ifa); error = rtrequest_locked(cmd, dst, ifa->ifa_addr, ifa->ifa_netmask, - flags | ifa->ifa_flags, &nrt); + flags | ifa_flags, &nrt); if (m) (void) m_free(m); /* @@ -2544,6 +2856,9 @@ rtinit_locked(struct ifaddr *ifa, int cmd, int flags) * have already existed or something. 
(XXX) */ if (rt->rt_ifa != ifa) { + void (*ifa_rtrequest) + (int, struct rtentry *, struct sockaddr *); + if (!(rt->rt_ifa->ifa_ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK))) printf("rtinit: wrong ifa (%p) was (%p)\n", @@ -2553,22 +2868,31 @@ rtinit_locked(struct ifaddr *ifa, int cmd, int flags) * remove anything it has associated with * this route and ifaddr. */ - if (rt->rt_ifa->ifa_rtrequest) - rt->rt_ifa->ifa_rtrequest(RTM_DELETE, rt, SA(0)); + IFA_LOCK_SPIN(rt->rt_ifa); + ifa_rtrequest = rt->rt_ifa->ifa_rtrequest; + IFA_UNLOCK(rt->rt_ifa); + if (ifa_rtrequest != NULL) + ifa_rtrequest(RTM_DELETE, rt, SA(0)); /* * Set the route's ifa. */ rtsetifa(rt, ifa); -#if IFNET_ROUTE_REFCNT - /* - * Adjust route ref count for the interfaces. - */ - if (rt->rt_if_ref_fn != NULL && - rt->rt_ifp != ifa->ifa_ifp) { - rt->rt_if_ref_fn(ifa->ifa_ifp, 1); - rt->rt_if_ref_fn(rt->rt_ifp, -1); + + if (rt->rt_ifp != ifa->ifa_ifp) { + /* + * Purge any link-layer info caching. + */ + if (rt->rt_llinfo_purge != NULL) + rt->rt_llinfo_purge(rt); + /* + * Adjust route ref count for the interfaces. + */ + if (rt->rt_if_ref_fn != NULL) { + rt->rt_if_ref_fn(ifa->ifa_ifp, 1); + rt->rt_if_ref_fn(rt->rt_ifp, -1); + } } -#endif /* IFNET_ROUTE_REFCNT */ + /* * And substitute in references to the ifaddr * we are adding. @@ -2579,8 +2903,11 @@ rtinit_locked(struct ifaddr *ifa, int cmd, int flags) * Now ask the protocol to check if it needs * any special processing in its new form. 
*/ - if (ifa->ifa_rtrequest) - ifa->ifa_rtrequest(RTM_ADD, rt, SA(0)); + IFA_LOCK_SPIN(ifa); + ifa_rtrequest = ifa->ifa_rtrequest; + IFA_UNLOCK(ifa); + if (ifa_rtrequest != NULL) + ifa_rtrequest(RTM_ADD, rt, SA(0)); } /* * notify any listenning routing agents of the change @@ -2604,7 +2931,6 @@ rtinit_locked(struct ifaddr *ifa, int cmd, int flags) u_int64_t rt_expiry(struct rtentry *rt, u_int64_t base, u_int32_t delta) { -#if IFNET_ROUTE_REFCNT u_int64_t retval; /* @@ -2619,10 +2945,29 @@ rt_expiry(struct rtentry *rt, u_int64_t base, u_int32_t delta) retval = base + MIN(rt_if_idle_expire_timeout, delta); return (retval); -#else -#pragma unused(rt) - return (base + delta); -#endif /* IFNET_ROUTE_REFCNT */ +} + +void +rt_set_idleref(struct rtentry *rt) +{ + RT_LOCK_ASSERT_HELD(rt); + + rt_clear_idleref(rt); + rt->rt_if_ref_fn = rte_if_ref; + rt->rt_if_ref_fn(rt->rt_ifp, 1); + rt->rt_flags |= RTF_IFREF; +} + +void +rt_clear_idleref(struct rtentry *rt) +{ + RT_LOCK_ASSERT_HELD(rt); + + if (rt->rt_if_ref_fn != NULL) { + rt->rt_if_ref_fn(rt->rt_ifp, -1); + rt->rt_flags &= ~RTF_IFREF; + rt->rt_if_ref_fn = NULL; + } } static void @@ -2703,7 +3048,6 @@ rte_free(struct rtentry *p) zfree(rte_zone, p); } -#if IFNET_ROUTE_REFCNT static void rte_if_ref(struct ifnet *ifp, int cnt) { @@ -2749,7 +3093,6 @@ rte_if_ref(struct ifnet *ifp, int cnt) kev_post_msg(&ev_msg); } } -#endif /* IFNET_ROUTE_REFCNT */ static inline struct rtentry * rte_alloc_debug(void) @@ -2799,3 +3142,50 @@ ctrace_record(ctrace_t *tr) bzero(tr->pc, sizeof (tr->pc)); (void) OSBacktrace(tr->pc, CTRACE_STACK_SIZE); } + +__private_extern__ void +route_copyout( + struct route *dst, + const struct route *src, + size_t length) +{ + /* Copy everything (rt, dst, flags) from ifnet */ + bcopy(src, dst, length); + + /* Hold one reference for the local copy of struct route */ + if (dst->ro_rt != NULL) + RT_ADDREF(dst->ro_rt); +} + +__private_extern__ void +route_copyin( + struct route *src, + struct route *dst, + size_t 
length) +{ + /* No cached route in the ifnet? */ + if (dst->ro_rt == NULL) { + /* + * Copy everything (rt, dst, flags) from ip_forward(); + * the reference to the route was held at the time + * it was allocated and is kept intact. + */ + bcopy(src, dst, length); + } else if (src->ro_rt != NULL) { + /* + * If the same, update just the ro_flags and ditch the one + * in the local copy. Else ditch the one that is currently + * cached, and cache the new route. + */ + if (dst->ro_rt == src->ro_rt) { + dst->ro_flags = src->ro_flags; + rtfree(src->ro_rt); + } else { + rtfree(dst->ro_rt); + bcopy(src, dst, length); + } + } + + /* This function consumes the reference */ + src->ro_rt = NULL; +} diff --git a/bsd/net/route.h b/bsd/net/route.h index 71eb4f8f8..47aa3f902 100644 --- a/bsd/net/route.h +++ b/bsd/net/route.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -95,6 +95,15 @@ struct route { #define ROF_SRCIF_SELECTED 0x1 /* source interface was selected */ +/* + * Route reachability info (private) + */ +struct rt_reach_info { + u_int32_t ri_refcnt; /* reference count */ + u_int32_t ri_probes; /* total # of probes */ + u_int64_t ri_snd_expire; /* transmit expiration (calendar) time */ + u_int64_t ri_rcv_expire; /* receive expiration (calendar) time */ +}; #else struct route; #endif /* PRIVATE */ @@ -159,6 +168,9 @@ struct rtentry { struct ifaddr *rt_ifa; /* the answer: interface addr to use */ struct sockaddr *rt_genmask; /* for generation of cloned routes */ void *rt_llinfo; /* pointer to link level info cache */ + void (*rt_llinfo_get_ri) /* llinfo get reachability info fn */ + (struct rtentry *, struct rt_reach_info *); + void (*rt_llinfo_purge)(struct rtentry *); /* llinfo purge fn */ void (*rt_llinfo_free)(void *); /* link level info free function */ struct rt_metrics rt_rmx; /* metrics used by rx'ing protocols */ struct rtentry 
*rt_gwroute; /* implied entry for gatewayed routes */ @@ -168,10 +180,15 @@ struct rtentry { * See bsd/net/route.c for synchronization notes. */ decl_lck_mtx_data(, rt_lock); /* lock for routing entry */ -#if IFNET_ROUTE_REFCNT + struct nstat_counts *rt_stats; void (*rt_if_ref_fn)(struct ifnet *, int); /* interface ref func */ -#endif /* IFNET_ROUTE_REFCNT */ + + uint64_t rt_expire; /* expiration time in uptime seconds */ + uint64_t base_calendartime; /* calendar time upon entry creation */ + uint64_t base_uptime;/* uptime upon entry creation */ }; + +extern void rt_setexpire(struct rtentry *, uint64_t); #endif /* KERNEL_PRIVATE */ #ifdef KERNEL_PRIVATE @@ -251,6 +268,27 @@ struct rt_msghdr2 { struct rt_metrics rtm_rmx; /* metrics themselves */ }; +#ifdef PRIVATE +/* + * Extended routing message header (private). + */ +struct rt_msghdr_ext { + u_short rtm_msglen; /* to skip over non-understood messages */ + u_char rtm_version; /* future binary compatibility */ + u_char rtm_type; /* message type */ + u_int32_t rtm_index; /* index for associated ifp */ + u_int32_t rtm_flags; /* flags, incl. kern & message, e.g. DONE */ + u_int32_t rtm_reserved; /* for future use */ + u_int32_t rtm_addrs; /* bitmask identifying sockaddrs in msg */ + pid_t rtm_pid; /* identify sender */ + int rtm_seq; /* for sender to identify action */ + int rtm_errno; /* why failed */ + u_int32_t rtm_use; /* from rtentry */ + u_int32_t rtm_inits; /* which metrics we are initializing */ + struct rt_metrics rtm_rmx; /* metrics themselves */ + struct rt_reach_info rtm_ri; /* route reachability info */ +}; +#endif /* PRIVATE */ #define RTM_VERSION 5 /* Up the ante and ignore older versions */ @@ -279,6 +317,9 @@ struct rt_msghdr2 { #define RTM_IFINFO2 0x12 /* */ #define RTM_NEWMADDR2 0x13 /* */ #define RTM_GET2 0x14 /* */ +#ifdef PRIVATE +#define RTM_GET_EXT 0x15 +#endif /* PRIVATE */ /* * Bitmask values for rtm_inits and rmx_locks. 
@@ -445,18 +486,16 @@ extern void rt_missmsg(int, struct rt_addrinfo *, int, int); extern void rt_newaddrmsg(int, struct ifaddr *, int, struct rtentry *); extern void rt_newmaddrmsg(int, struct ifmultiaddr *); extern int rt_setgate(struct rtentry *, struct sockaddr *, struct sockaddr *); -extern void set_primary_ifscope(unsigned int); -extern unsigned int get_primary_ifscope(void); -extern boolean_t rt_inet_default(struct rtentry *, struct sockaddr *); +extern void set_primary_ifscope(int, unsigned int); +extern unsigned int get_primary_ifscope(int); +extern boolean_t rt_primary_default(struct rtentry *, struct sockaddr *); extern struct rtentry *rt_lookup(boolean_t, struct sockaddr *, struct sockaddr *, struct radix_node_head *, unsigned int); extern void rtalloc(struct route *); +extern void rtalloc_scoped(struct route *, unsigned int); extern void rtalloc_ign(struct route *, uint32_t); -extern void rtalloc_ign_locked(struct route *, uint32_t); extern void rtalloc_scoped_ign(struct route *, uint32_t, unsigned int); -extern void rtalloc_scoped_ign_locked(struct route *, uint32_t, unsigned int); extern struct rtentry *rtalloc1(struct sockaddr *, int, uint32_t); -extern struct rtentry *rtalloc1_locked(struct sockaddr *, int, uint32_t); extern struct rtentry *rtalloc1_scoped(struct sockaddr *, int, uint32_t, unsigned int); extern struct rtentry *rtalloc1_scoped_locked(struct sockaddr *, int, @@ -478,19 +517,30 @@ extern void rtredirect(struct ifnet *, struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct sockaddr *, struct rtentry **); extern int rtrequest(int, struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct rtentry **); +extern int rtrequest_scoped(int, struct sockaddr *, struct sockaddr *, + struct sockaddr *, int, struct rtentry **, unsigned int); extern int rtrequest_locked(int, struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct rtentry **); extern int rtrequest_scoped_locked(int, struct sockaddr *, struct 
sockaddr *, struct sockaddr *, int, struct rtentry **, unsigned int); -extern unsigned int sa_get_ifscope(struct sockaddr *); +extern void sin_set_ifscope(struct sockaddr *, unsigned int); +extern unsigned int sin_get_ifscope(struct sockaddr *); +extern unsigned int sin6_get_ifscope(struct sockaddr *); extern void rt_lock(struct rtentry *, boolean_t); extern void rt_unlock(struct rtentry *); -extern struct sockaddr *rtm_scrub_ifscope(int, struct sockaddr *, +extern struct sockaddr *rtm_scrub_ifscope(int, int, struct sockaddr *, struct sockaddr *, struct sockaddr_storage *); extern u_int64_t rt_expiry(struct rtentry *, u_int64_t, u_int32_t); -#if IFNET_ROUTE_REFCNT +extern void rt_set_idleref(struct rtentry *); +extern void rt_clear_idleref(struct rtentry *); extern void rt_aggdrain(int); -#endif /* IFNET_ROUTE_REFCNT */ +extern boolean_t rt_validate(struct rtentry *); + +#ifdef XNU_KERNEL_PRIVATE +extern void route_copyin(struct route *src, struct route *dst, size_t length); +extern void route_copyout(struct route *dst, const struct route *src, size_t length); +#endif /* XNU_KERNEL_PRIVATE */ + #endif /* KERNEL_PRIVATE */ #endif diff --git a/bsd/net/rtsock.c b/bsd/net/rtsock.c index 819f8349c..42b20064a 100644 --- a/bsd/net/rtsock.c +++ b/bsd/net/rtsock.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -73,6 +73,7 @@ #include #include #include +#include #include #include @@ -95,7 +96,6 @@ MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables"); static struct sockaddr route_dst = { 2, PF_ROUTE, { 0, } }; static struct sockaddr route_src = { 2, PF_ROUTE, { 0, } }; static struct sockaddr sa_zero = { sizeof(sa_zero), AF_INET, { 0, } }; -static struct sockproto route_proto = { PF_ROUTE, 0 }; struct walkarg { int w_tmemsize; @@ -108,30 +108,21 @@ static struct mbuf *rt_msg1(int, struct rt_addrinfo *); static int rt_msg2(int, struct rt_addrinfo *, caddr_t, struct walkarg *); static int rt_xaddrs(caddr_t, caddr_t, struct rt_addrinfo *); static int sysctl_dumpentry(struct radix_node *rn, void *vw); +static int sysctl_dumpentry_ext(struct radix_node *rn, void *vw); static int sysctl_iflist(int af, struct walkarg *w); static int sysctl_iflist2(int af, struct walkarg *w); -static int route_output(struct mbuf *, struct socket *); -static void rt_setmetrics(u_int32_t, struct rt_metrics *, struct rt_metrics *); +static int route_output(struct mbuf *, struct socket *); +static void rt_setmetrics(u_int32_t, struct rt_metrics *, struct rtentry *); +static void rt_getmetrics(struct rtentry *, struct rt_metrics *); static void rt_setif(struct rtentry *, struct sockaddr *, struct sockaddr *, struct sockaddr *, unsigned int); -#if IFNET_ROUTE_REFCNT static void rt_drainall(void); -#endif /* IFNET_ROUTE_REFCNT */ #define SIN(sa) ((struct sockaddr_in *)(size_t)(sa)) -/* Sleazy use of local variables throughout file, warning!!!! 
*/ -#define dst info.rti_info[RTAX_DST] -#define gate info.rti_info[RTAX_GATEWAY] -#define netmask info.rti_info[RTAX_NETMASK] -#define genmask info.rti_info[RTAX_GENMASK] -#define ifpaddr info.rti_info[RTAX_IFP] -#define ifaaddr info.rti_info[RTAX_IFA] -#define brdaddr info.rti_info[RTAX_BRD] SYSCTL_NODE(_net, OID_AUTO, idle, CTLFLAG_RW, 0, "idle network monitoring"); -#if IFNET_ROUTE_REFCNT static struct timeval last_ts; SYSCTL_NODE(_net_idle, OID_AUTO, route, CTLFLAG_RW, 0, "idle route monitoring"); @@ -140,7 +131,15 @@ static int rt_if_idle_drain_interval = RT_IF_IDLE_DRAIN_INTERVAL; SYSCTL_INT(_net_idle_route, OID_AUTO, drain_interval, CTLFLAG_RW, &rt_if_idle_drain_interval, 0, "Default interval for draining " "routes when doing interface idle reference counting."); -#endif /* IFNET_ROUTE_REFCNT */ + +/* + * This macro calculates skew in wall clock, just in case the user changes the + * system time. This skew adjustment is required because we now keep the route + * expiration times in uptime terms in the kernel, but the userland still + * expects expiration times in terms of calendar times. 
+ */ +#define CALCULATE_CLOCKSKEW(cc, ic, cu, iu)\ + ((cc.tv_sec - ic) - (cu - iu)) /* * It really doesn't make any sense at all for this code to share much @@ -322,6 +321,7 @@ route_output(struct mbuf *m, struct socket *so) struct radix_node_head *rnh; struct rt_addrinfo info; int len, error = 0; + sa_family_t dst_sa_family = 0; struct ifnet *ifp = NULL; #ifndef __APPLE__ struct proc *curproc = current_proc(); @@ -344,17 +344,17 @@ route_output(struct mbuf *m, struct socket *so) len = m->m_pkthdr.len; if (len < sizeof(*rtm) || len != mtod(m, struct rt_msghdr *)->rtm_msglen) { - dst = NULL; + info.rti_info[RTAX_DST] = NULL; senderr(EINVAL); } R_Malloc(rtm, struct rt_msghdr *, len); if (rtm == NULL) { - dst = NULL; + info.rti_info[RTAX_DST] = NULL; senderr(ENOBUFS); } m_copydata(m, 0, len, (caddr_t)rtm); if (rtm->rtm_version != RTM_VERSION) { - dst = NULL; + info.rti_info[RTAX_DST] = NULL; senderr(EPROTONOSUPPORT); } @@ -374,51 +374,52 @@ route_output(struct mbuf *m, struct socket *so) * may perform operations other than RTM_GET */ if (rtm->rtm_type != RTM_GET && (so->so_state & SS_PRIV) == 0) { - dst = NULL; + info.rti_info[RTAX_DST] = NULL; senderr(EPERM); } rtm->rtm_pid = proc_selfpid(); info.rti_addrs = rtm->rtm_addrs; if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, &info)) { - dst = NULL; + info.rti_info[RTAX_DST] = NULL; senderr(EINVAL); } - if (dst == NULL || (dst->sa_family >= AF_MAX) || - (gate != NULL && (gate->sa_family >= AF_MAX))) { + if (info.rti_info[RTAX_DST] == NULL || (info.rti_info[RTAX_DST]->sa_family >= AF_MAX) || + (info.rti_info[RTAX_GATEWAY] != NULL && (info.rti_info[RTAX_GATEWAY]->sa_family >= AF_MAX))) { senderr(EINVAL); } - if (dst->sa_family == AF_INET && dst->sa_len != sizeof (dst_in)) { + if (info.rti_info[RTAX_DST]->sa_family == AF_INET && info.rti_info[RTAX_DST]->sa_len != sizeof (dst_in)) { /* At minimum, we need up to sin_addr */ - if (dst->sa_len < offsetof(struct sockaddr_in, sin_zero)) + if (info.rti_info[RTAX_DST]->sa_len 
< offsetof(struct sockaddr_in, sin_zero)) senderr(EINVAL); bzero(&dst_in, sizeof (dst_in)); dst_in.sin_len = sizeof (dst_in); dst_in.sin_family = AF_INET; - dst_in.sin_port = SIN(dst)->sin_port; - dst_in.sin_addr = SIN(dst)->sin_addr; - dst = (struct sockaddr *)&dst_in; + dst_in.sin_port = SIN(info.rti_info[RTAX_DST])->sin_port; + dst_in.sin_addr = SIN(info.rti_info[RTAX_DST])->sin_addr; + info.rti_info[RTAX_DST] = (struct sockaddr *)&dst_in; + dst_sa_family = info.rti_info[RTAX_DST]->sa_family; } - if (gate != NULL && - gate->sa_family == AF_INET && gate->sa_len != sizeof (gate_in)) { + if (info.rti_info[RTAX_GATEWAY] != NULL && + info.rti_info[RTAX_GATEWAY]->sa_family == AF_INET && info.rti_info[RTAX_GATEWAY]->sa_len != sizeof (gate_in)) { /* At minimum, we need up to sin_addr */ - if (gate->sa_len < offsetof(struct sockaddr_in, sin_zero)) + if (info.rti_info[RTAX_GATEWAY]->sa_len < offsetof(struct sockaddr_in, sin_zero)) senderr(EINVAL); bzero(&gate_in, sizeof (gate_in)); gate_in.sin_len = sizeof (gate_in); gate_in.sin_family = AF_INET; - gate_in.sin_port = SIN(gate)->sin_port; - gate_in.sin_addr = SIN(gate)->sin_addr; - gate = (struct sockaddr *)&gate_in; + gate_in.sin_port = SIN(info.rti_info[RTAX_GATEWAY])->sin_port; + gate_in.sin_addr = SIN(info.rti_info[RTAX_GATEWAY])->sin_addr; + info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&gate_in; } - if (genmask) { + if (info.rti_info[RTAX_GENMASK]) { struct radix_node *t; - t = rn_addmask((caddr_t)genmask, 0, 1); - if (t && Bcmp(genmask, t->rn_key, *(u_char *)genmask) == 0) - genmask = (struct sockaddr *)(t->rn_key); + t = rn_addmask((caddr_t)info.rti_info[RTAX_GENMASK], 0, 1); + if (t && Bcmp(info.rti_info[RTAX_GENMASK], t->rn_key, *(u_char *)info.rti_info[RTAX_GENMASK]) == 0) + info.rti_info[RTAX_GENMASK] = (struct sockaddr *)(t->rn_key); else senderr(ENOBUFS); } @@ -427,16 +428,27 @@ route_output(struct mbuf *m, struct socket *so) * If RTF_IFSCOPE flag is set, then rtm_index specifies the scope. 
*/ if (rtm->rtm_flags & RTF_IFSCOPE) { - /* Scoped routing is for AF_INET only */ - if (dst->sa_family != AF_INET) + if (info.rti_info[RTAX_DST]->sa_family != AF_INET && info.rti_info[RTAX_DST]->sa_family != AF_INET6) senderr(EINVAL); ifscope = rtm->rtm_index; } + /* + * For AF_INET, always zero out the embedded scope ID. If this is + * a scoped request, it must be done explicitly by setting RTF_IFSCOPE + * flag and the corresponding rtm_index value. This is to prevent + * false interpretation of the scope ID because it's using the sin_zero + * field, which might not be properly cleared by the requestor. + */ + if (info.rti_info[RTAX_DST]->sa_family == AF_INET) + sin_set_ifscope(info.rti_info[RTAX_DST], IFSCOPE_NONE); + if (info.rti_info[RTAX_GATEWAY] != NULL && info.rti_info[RTAX_GATEWAY]->sa_family == AF_INET) + sin_set_ifscope(info.rti_info[RTAX_GATEWAY], IFSCOPE_NONE); + switch (rtm->rtm_type) { case RTM_ADD: - if (gate == NULL) + if (info.rti_info[RTAX_GATEWAY] == NULL) senderr(EINVAL); #ifdef __APPLE__ @@ -454,21 +466,21 @@ route_output(struct mbuf *m, struct socket *so) { #define satosinaddr(sa) (((struct sockaddr_in *)sa)->sin_addr.s_addr) - if (check_routeselfref && (dst && dst->sa_family == AF_INET) && - (netmask && satosinaddr(netmask) == INADDR_BROADCAST) && - (gate && satosinaddr(dst) == satosinaddr(gate))) { + if (check_routeselfref && (info.rti_info[RTAX_DST] && info.rti_info[RTAX_DST]->sa_family == AF_INET) && + (info.rti_info[RTAX_NETMASK] && satosinaddr(info.rti_info[RTAX_NETMASK]) == INADDR_BROADCAST) && + (info.rti_info[RTAX_GATEWAY] && satosinaddr(info.rti_info[RTAX_DST]) == satosinaddr(info.rti_info[RTAX_GATEWAY]))) { log(LOG_WARNING, "route_output: circular route %ld.%ld.%ld.%ld/32 ignored\n", - (ntohl(satosinaddr(gate)>>24))&0xff, - (ntohl(satosinaddr(gate)>>16))&0xff, - (ntohl(satosinaddr(gate)>>8))&0xff, - (ntohl(satosinaddr(gate)))&0xff); + (ntohl(satosinaddr(info.rti_info[RTAX_GATEWAY])>>24))&0xff, + 
(ntohl(satosinaddr(info.rti_info[RTAX_GATEWAY])>>16))&0xff, + (ntohl(satosinaddr(info.rti_info[RTAX_GATEWAY])>>8))&0xff, + (ntohl(satosinaddr(info.rti_info[RTAX_GATEWAY])))&0xff); senderr(EINVAL); } } #endif - error = rtrequest_scoped_locked(RTM_ADD, dst, gate, - netmask, rtm->rtm_flags, &saved_nrt, ifscope); + error = rtrequest_scoped_locked(RTM_ADD, info.rti_info[RTAX_DST], info.rti_info[RTAX_GATEWAY], + info.rti_info[RTAX_NETMASK], rtm->rtm_flags, &saved_nrt, ifscope); if (error == 0 && saved_nrt) { RT_LOCK(saved_nrt); #ifdef __APPLE__ @@ -499,24 +511,24 @@ route_output(struct mbuf *m, struct socket *so) * rarely encountered. * dwiggins@bbn.com */ - - rt_setif(saved_nrt, ifpaddr, ifaaddr, gate, + + rt_setif(saved_nrt, info.rti_info[RTAX_IFP], info.rti_info[RTAX_IFA], info.rti_info[RTAX_GATEWAY], ifscope); #endif rt_setmetrics(rtm->rtm_inits, - &rtm->rtm_rmx, &saved_nrt->rt_rmx); + &rtm->rtm_rmx, saved_nrt); saved_nrt->rt_rmx.rmx_locks &= ~(rtm->rtm_inits); saved_nrt->rt_rmx.rmx_locks |= - (rtm->rtm_inits & rtm->rtm_rmx.rmx_locks); - saved_nrt->rt_genmask = genmask; + (rtm->rtm_inits & rtm->rtm_rmx.rmx_locks); + saved_nrt->rt_genmask = info.rti_info[RTAX_GENMASK]; RT_REMREF_LOCKED(saved_nrt); RT_UNLOCK(saved_nrt); } break; case RTM_DELETE: - error = rtrequest_scoped_locked(RTM_DELETE, dst, - gate, netmask, rtm->rtm_flags, &saved_nrt, ifscope); + error = rtrequest_scoped_locked(RTM_DELETE, info.rti_info[RTAX_DST], + info.rti_info[RTAX_GATEWAY], info.rti_info[RTAX_NETMASK], rtm->rtm_flags, &saved_nrt, ifscope); if (error == 0) { rt = saved_nrt; RT_LOCK(rt); @@ -527,18 +539,23 @@ route_output(struct mbuf *m, struct socket *so) case RTM_GET: case RTM_CHANGE: case RTM_LOCK: - if ((rnh = rt_tables[dst->sa_family]) == NULL) + if ((rnh = rt_tables[info.rti_info[RTAX_DST]->sa_family]) == NULL) senderr(EAFNOSUPPORT); /* * Lookup the best match based on the key-mask pair; * callee adds a reference and checks for root node. 
*/ - rt = rt_lookup(TRUE, dst, netmask, rnh, ifscope); + rt = rt_lookup(TRUE, info.rti_info[RTAX_DST], info.rti_info[RTAX_NETMASK], rnh, ifscope); if (rt == NULL) senderr(ESRCH); RT_LOCK(rt); + /* + * Holding rnh_lock here prevents the possibility of + * ifa from changing (e.g. in_ifinit), so it is safe + * to access its ifa_addr (down below) without locking. + */ switch(rtm->rtm_type) { case RTM_GET: { @@ -546,52 +563,63 @@ route_output(struct mbuf *m, struct socket *so) report: ifa2 = NULL; RT_LOCK_ASSERT_HELD(rt); - dst = rt_key(rt); - gate = rt->rt_gateway; - netmask = rt_mask(rt); - genmask = rt->rt_genmask; + info.rti_info[RTAX_DST] = rt_key(rt); + dst_sa_family = info.rti_info[RTAX_DST]->sa_family; + info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; + info.rti_info[RTAX_NETMASK] = rt_mask(rt); + info.rti_info[RTAX_GENMASK] = rt->rt_genmask; if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) { ifp = rt->rt_ifp; if (ifp) { ifnet_lock_shared(ifp); - ifa2 = ifp->if_addrhead.tqh_first; - ifpaddr = ifa2->ifa_addr; - ifaref(ifa2); + ifa2 = ifp->if_lladdr; + info.rti_info[RTAX_IFP] = ifa2->ifa_addr; + IFA_ADDREF(ifa2); ifnet_lock_done(ifp); - ifaaddr = rt->rt_ifa->ifa_addr; + info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; rtm->rtm_index = ifp->if_index; } else { - ifpaddr = 0; - ifaaddr = 0; + info.rti_info[RTAX_IFP] = NULL; + info.rti_info[RTAX_IFA] = NULL; } + } else if ((ifp = rt->rt_ifp) != NULL) { + rtm->rtm_index = ifp->if_index; } + if (ifa2 != NULL) + IFA_LOCK(ifa2); len = rt_msg2(rtm->rtm_type, &info, (caddr_t)0, (struct walkarg *)0); + if (ifa2 != NULL) + IFA_UNLOCK(ifa2); if (len > rtm->rtm_msglen) { struct rt_msghdr *new_rtm; R_Malloc(new_rtm, struct rt_msghdr *, len); if (new_rtm == 0) { RT_UNLOCK(rt); if (ifa2 != NULL) - ifafree(ifa2); + IFA_REMREF(ifa2); senderr(ENOBUFS); } Bcopy(rtm, new_rtm, rtm->rtm_msglen); R_Free(rtm); rtm = new_rtm; } + if (ifa2 != NULL) + IFA_LOCK(ifa2); (void)rt_msg2(rtm->rtm_type, &info, (caddr_t)rtm, (struct walkarg *)0); + if (ifa2 != 
NULL) + IFA_UNLOCK(ifa2); rtm->rtm_flags = rt->rt_flags; - rtm->rtm_rmx = rt->rt_rmx; + rt_getmetrics(rt, &rtm->rtm_rmx); rtm->rtm_addrs = info.rti_addrs; if (ifa2 != NULL) - ifafree(ifa2); + IFA_REMREF(ifa2); } break; case RTM_CHANGE: - if (gate && (error = rt_setgate(rt, - rt_key(rt), gate))) { + if (info.rti_info[RTAX_GATEWAY] && (error = rt_setgate(rt, + rt_key(rt), info.rti_info[RTAX_GATEWAY]))) { RT_UNLOCK(rt); senderr(error); } @@ -602,8 +630,8 @@ route_output(struct mbuf *m, struct socket *so) * flags on the default route without changing the * default gateway. Changing flags still doesn't work. */ - if ((rt->rt_flags & RTF_GATEWAY) && !gate) - gate = rt->rt_gateway; + if ((rt->rt_flags & RTF_GATEWAY) && !info.rti_info[RTAX_GATEWAY]) + info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; #ifdef __APPLE__ /* @@ -611,19 +639,19 @@ route_output(struct mbuf *m, struct socket *so) * equivalent to the code found at this very spot * in BSD. */ - rt_setif(rt, ifpaddr, ifaaddr, gate, + rt_setif(rt, info.rti_info[RTAX_IFP], info.rti_info[RTAX_IFA], info.rti_info[RTAX_GATEWAY], ifscope); #endif rt_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx, - &rt->rt_rmx); + rt); #ifndef __APPLE__ /* rt_setif, called above does this for us on darwin */ if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest) - rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, gate); + rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, info.rti_info[RTAX_GATEWAY]); #endif - if (genmask) - rt->rt_genmask = genmask; + if (info.rti_info[RTAX_GENMASK]) + rt->rt_genmask = info.rti_info[RTAX_GENMASK]; /* * Fall into */ @@ -684,10 +712,11 @@ route_output(struct mbuf *m, struct socket *so) if (error) return error; } else { + struct sockproto route_proto = {PF_ROUTE, 0}; if (rp) rp->rcb_proto.sp_family = 0; /* Avoid us */ - if (dst) - route_proto.sp_protocol = dst->sa_family; + if (dst_sa_family != 0) + route_proto.sp_protocol = dst_sa_family; if (m) { socket_unlock(so, 0); raw_input(m, &route_proto, &route_src, &route_dst); @@ -700,10 +729,28 @@ 
route_output(struct mbuf *m, struct socket *so) return (error); } +void +rt_setexpire(struct rtentry *rt, uint64_t expiry) +{ + /* set both rt_expire and rmx_expire */ + rt->rt_expire = expiry; + if (expiry) { + rt->rt_rmx.rmx_expire = expiry + rt->base_calendartime - + rt->base_uptime; + } else + rt->rt_rmx.rmx_expire = 0; +} + static void -rt_setmetrics(u_int32_t which, struct rt_metrics *in, struct rt_metrics *out) +rt_setmetrics(u_int32_t which, struct rt_metrics *in, struct rtentry *out) { -#define metric(f, e) if (which & (f)) out->e = in->e; + struct timeval curr_calendar_time; + uint64_t curr_uptime; + + getmicrotime(&curr_calendar_time); + curr_uptime = net_uptime(); + +#define metric(f, e) if (which & (f)) out->rt_rmx.e = in->e; metric(RTV_RPIPE, rmx_recvpipe); metric(RTV_SPIPE, rmx_sendpipe); metric(RTV_SSTHRESH, rmx_ssthresh); @@ -713,17 +760,65 @@ rt_setmetrics(u_int32_t which, struct rt_metrics *in, struct rt_metrics *out) metric(RTV_MTU, rmx_mtu); metric(RTV_EXPIRE, rmx_expire); #undef metric + + if (out->rt_rmx.rmx_expire > 0) { + /* account for system time change */ + curr_uptime = net_uptime(); + getmicrotime(&curr_calendar_time); + out->base_calendartime += + CALCULATE_CLOCKSKEW(curr_calendar_time, + out->base_calendartime, + curr_uptime, out->base_uptime); + rt_setexpire(out, + out->rt_rmx.rmx_expire - + out->base_calendartime + + out->base_uptime); + } else { + rt_setexpire(out, 0); + } + + VERIFY(out->rt_expire == 0 || out->rt_rmx.rmx_expire != 0); + VERIFY(out->rt_expire != 0 || out->rt_rmx.rmx_expire == 0); +} + +static void +rt_getmetrics(struct rtentry *in, struct rt_metrics *out) +{ + struct timeval curr_calendar_time; + uint64_t curr_uptime; + + VERIFY(in->rt_expire == 0 || in->rt_rmx.rmx_expire != 0); + VERIFY(in->rt_expire != 0 || in->rt_rmx.rmx_expire == 0); + + *out = in->rt_rmx; + + if (in->rt_expire) { + /* account for system time change */ + getmicrotime(&curr_calendar_time); + curr_uptime = net_uptime(); + + in->base_calendartime 
+= + CALCULATE_CLOCKSKEW(curr_calendar_time, + in->base_calendartime, + curr_uptime, in->base_uptime); + + out->rmx_expire = in->base_calendartime + + in->rt_expire - in->base_uptime; + } else + out->rmx_expire = 0; } /* - * Set route's interface given ifpaddr, ifaaddr, and gateway. + * Set route's interface given info.rti_info[RTAX_IFP], info.rti_info[RTAX_IFA], and gateway. */ static void rt_setif(struct rtentry *rt, struct sockaddr *Ifpaddr, struct sockaddr *Ifaaddr, struct sockaddr *Gate, unsigned int ifscope) { - struct ifaddr *ifa = 0; - struct ifnet *ifp = 0; + struct ifaddr *ifa = NULL; + struct ifnet *ifp = NULL; + void (*ifa_rtrequest) + (int, struct rtentry *, struct sockaddr *); lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); @@ -740,6 +835,9 @@ rt_setif(struct rtentry *rt, struct sockaddr *Ifpaddr, struct sockaddr *Ifaaddr, /* Add an extra ref for ourselves */ RT_ADDREF_LOCKED(rt); + /* Become a regular mutex, just in case */ + RT_CONVERT_LOCK(rt); + /* * New gateway could require new ifaddr, ifp; flags may also * be different; ifp may be specified by ll sockaddr when @@ -747,11 +845,11 @@ rt_setif(struct rtentry *rt, struct sockaddr *Ifpaddr, struct sockaddr *Ifaaddr, */ if (Ifpaddr && (ifa = ifa_ifwithnet_scoped(Ifpaddr, ifscope)) && (ifp = ifa->ifa_ifp) && (Ifaaddr || Gate)) { - ifafree(ifa); + IFA_REMREF(ifa); ifa = ifaof_ifpforaddr(Ifaaddr ? 
Ifaaddr : Gate, ifp); } else { if (ifa) { - ifafree(ifa); + IFA_REMREF(ifa); ifa = 0; } if (Ifpaddr && (ifp = if_withname(Ifpaddr)) ) { @@ -761,7 +859,7 @@ rt_setif(struct rtentry *rt, struct sockaddr *Ifpaddr, struct sockaddr *Ifaaddr, ifnet_lock_shared(ifp); ifa = TAILQ_FIRST(&ifp->if_addrhead); if (ifa != NULL) - ifaref(ifa); + IFA_ADDREF(ifa); ifnet_lock_done(ifp); } } else if (Ifaaddr && @@ -783,7 +881,7 @@ rt_setif(struct rtentry *rt, struct sockaddr *Ifpaddr, struct sockaddr *Ifaaddr, /* Don't update a defunct route */ if (rt->rt_flags & RTF_CONDEMNED) { if (ifa != NULL) - ifafree(ifa); + IFA_REMREF(ifa); /* Release extra ref */ RT_REMREF_LOCKED(rt); return; @@ -793,39 +891,63 @@ rt_setif(struct rtentry *rt, struct sockaddr *Ifpaddr, struct sockaddr *Ifaaddr, if (ifa) { struct ifaddr *oifa = rt->rt_ifa; if (oifa != ifa) { - if (oifa && oifa->ifa_rtrequest) - oifa->ifa_rtrequest(RTM_DELETE, rt, Gate); + if (oifa != NULL) { + IFA_LOCK_SPIN(oifa); + ifa_rtrequest = oifa->ifa_rtrequest; + IFA_UNLOCK(oifa); + if (ifa_rtrequest != NULL) + ifa_rtrequest(RTM_DELETE, rt, Gate); + } rtsetifa(rt, ifa); -#if IFNET_ROUTE_REFCNT - /* - * Adjust route ref count for the interfaces. - */ - if (rt->rt_if_ref_fn != NULL && rt->rt_ifp != ifp) { - rt->rt_if_ref_fn(ifp, 1); - rt->rt_if_ref_fn(rt->rt_ifp, -1); + + if (rt->rt_ifp != ifp) { + /* + * Purge any link-layer info caching. + */ + if (rt->rt_llinfo_purge != NULL) + rt->rt_llinfo_purge(rt); + + /* + * Adjust route ref count for the interfaces. + */ + if (rt->rt_if_ref_fn != NULL) { + rt->rt_if_ref_fn(ifp, 1); + rt->rt_if_ref_fn(rt->rt_ifp, -1); + } } -#endif /* IFNET_ROUTE_REFCNT */ rt->rt_ifp = ifp; /* * If this is the (non-scoped) default route, record * the interface index used for the primary ifscope. 
*/ - if (rt_inet_default(rt, rt_key(rt))) - set_primary_ifscope(rt->rt_ifp->if_index); + if (rt_primary_default(rt, rt_key(rt))) { + set_primary_ifscope(rt_key(rt)->sa_family, + rt->rt_ifp->if_index); + } rt->rt_rmx.rmx_mtu = ifp->if_mtu; - if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest) - rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, Gate); - ifafree(ifa); + if (rt->rt_ifa != NULL) { + IFA_LOCK_SPIN(rt->rt_ifa); + ifa_rtrequest = rt->rt_ifa->ifa_rtrequest; + IFA_UNLOCK(rt->rt_ifa); + if (ifa_rtrequest != NULL) + ifa_rtrequest(RTM_ADD, rt, Gate); + } + IFA_REMREF(ifa); /* Release extra ref */ RT_REMREF_LOCKED(rt); return; } - ifafree(ifa); + IFA_REMREF(ifa); } /* XXX: to reset gateway to correct value, at RTM_CHANGE */ - if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest) - rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, Gate); + if (rt->rt_ifa != NULL) { + IFA_LOCK_SPIN(rt->rt_ifa); + ifa_rtrequest = rt->rt_ifa->ifa_rtrequest; + IFA_UNLOCK(rt->rt_ifa); + if (ifa_rtrequest != NULL) + ifa_rtrequest(RTM_ADD, rt, Gate); + } /* Release extra ref */ RT_REMREF_LOCKED(rt); @@ -935,7 +1057,7 @@ rt_msg1(int type, struct rt_addrinfo *rtinfo) hint = rtinfo->rti_info[RTAX_IFA]; /* Scrub away any trace of embedded interface scope */ - sa = rtm_scrub_ifscope(i, hint, sa, &ss); + sa = rtm_scrub_ifscope(type, i, hint, sa, &ss); break; default: @@ -990,6 +1112,10 @@ rt_msg2(int type, struct rt_addrinfo *rtinfo, caddr_t cp, struct walkarg *w) len = sizeof(struct ifma_msghdr2); break; + case RTM_GET_EXT: + len = sizeof (struct rt_msghdr_ext); + break; + case RTM_GET2: len = sizeof(struct rt_msghdr2); break; @@ -1014,7 +1140,7 @@ rt_msg2(int type, struct rt_addrinfo *rtinfo, caddr_t cp, struct walkarg *w) hint = rtinfo->rti_info[RTAX_IFA]; /* Scrub away any trace of embedded interface scope */ - sa = rtm_scrub_ifscope(i, hint, sa, &ss); + sa = rtm_scrub_ifscope(type, i, hint, sa, &ss); break; default: @@ -1070,6 +1196,7 @@ rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error) struct rt_msghdr 
*rtm; struct mbuf *m; struct sockaddr *sa = rtinfo->rti_info[RTAX_DST]; + struct sockproto route_proto = {PF_ROUTE, 0}; if (route_cb.any_count == 0) return; @@ -1080,7 +1207,7 @@ rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error) rtm->rtm_flags = RTF_DONE | flags; rtm->rtm_errno = error; rtm->rtm_addrs = rtinfo->rti_addrs; - route_proto.sp_protocol = sa ? sa->sa_family : 0; + route_proto.sp_family = sa ? sa->sa_family : 0; raw_input(m, &route_proto, &route_src, &route_dst); } @@ -1095,6 +1222,7 @@ rt_ifmsg( struct if_msghdr *ifm; struct mbuf *m; struct rt_addrinfo info; + struct sockproto route_proto = {PF_ROUTE, 0}; if (route_cb.any_count == 0) return; @@ -1107,7 +1235,6 @@ rt_ifmsg( ifm->ifm_flags = (u_short)ifp->if_flags; if_data_internal_to_if_data(ifp, &ifp->if_data, &ifm->ifm_data); ifm->ifm_addrs = 0; - route_proto.sp_protocol = 0; raw_input(m, &route_proto, &route_src, &route_dst); } @@ -1120,7 +1247,7 @@ rt_ifmsg( * copies of it. * * Since this is coming from the interface, it is expected that the - * interface will be locked. Caller must hold rt_lock. + * interface will be locked. Caller must hold rnh_lock and rt_lock. */ void rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt) @@ -1130,11 +1257,16 @@ rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt) int pass; struct mbuf *m = 0; struct ifnet *ifp = ifa->ifa_ifp; + struct sockproto route_proto = {PF_ROUTE, 0}; + lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); RT_LOCK_ASSERT_HELD(rt); if (route_cb.any_count == 0) return; + + /* Become a regular mutex, just in case */ + RT_CONVERT_LOCK(rt); for (pass = 1; pass < 3; pass++) { bzero((caddr_t)&info, sizeof(info)); if ((cmd == RTM_ADD && pass == 1) || @@ -1142,21 +1274,32 @@ rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt) struct ifa_msghdr *ifam; int ncmd = cmd == RTM_ADD ? 
RTM_NEWADDR : RTM_DELADDR; - /* Lock ifp for if_addrhead */ + /* Lock ifp for if_lladdr */ ifnet_lock_shared(ifp); - ifaaddr = sa = ifa->ifa_addr; - ifpaddr = ifp->if_addrhead.tqh_first->ifa_addr; - netmask = ifa->ifa_netmask; - brdaddr = ifa->ifa_dstaddr; + IFA_LOCK(ifa); + info.rti_info[RTAX_IFA] = sa = ifa->ifa_addr; + /* + * Holding ifnet lock here prevents the link address + * from changing contents, so no need to hold its + * lock. The link address is always present; it's + * never freed. + */ + info.rti_info[RTAX_IFP] = ifp->if_lladdr->ifa_addr; + info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask; + info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; if ((m = rt_msg1(ncmd, &info)) == NULL) { + IFA_UNLOCK(ifa); ifnet_lock_done(ifp); continue; } + IFA_UNLOCK(ifa); ifnet_lock_done(ifp); ifam = mtod(m, struct ifa_msghdr *); ifam->ifam_index = ifp->if_index; + IFA_LOCK_SPIN(ifa); ifam->ifam_metric = ifa->ifa_metric; ifam->ifam_flags = ifa->ifa_flags; + IFA_UNLOCK(ifa); ifam->ifam_addrs = info.rti_addrs; } if ((cmd == RTM_ADD && pass == 2) || @@ -1165,9 +1308,9 @@ rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt) if (rt == 0) continue; - netmask = rt_mask(rt); - dst = sa = rt_key(rt); - gate = rt->rt_gateway; + info.rti_info[RTAX_NETMASK] = rt_mask(rt); + info.rti_info[RTAX_DST] = sa = rt_key(rt); + info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; if ((m = rt_msg1(cmd, &info)) == NULL) continue; rtm = mtod(m, struct rt_msghdr *); @@ -1193,35 +1336,34 @@ rt_newmaddrmsg(int cmd, struct ifmultiaddr *ifma) struct mbuf *m = 0; struct ifnet *ifp = ifma->ifma_ifp; struct ifma_msghdr *ifmam; + struct sockproto route_proto = {PF_ROUTE, 0}; if (route_cb.any_count == 0) return; + /* Lock ifp for if_lladdr */ + ifnet_lock_shared(ifp); bzero((caddr_t)&info, sizeof(info)); - ifaaddr = ifma->ifma_addr; - /* Lock ifp for if_addrhead */ - if (ifp != NULL) - ifnet_lock_shared(ifp); - if (ifp && ifp->if_addrhead.tqh_first) - ifpaddr = ifp->if_addrhead.tqh_first->ifa_addr; 
- else - ifpaddr = NULL; + IFMA_LOCK(ifma); + info.rti_info[RTAX_IFA] = ifma->ifma_addr; + info.rti_info[RTAX_IFP] = ifp->if_lladdr->ifa_addr; /* lladdr doesn't need lock */ + /* * If a link-layer address is present, present it as a ``gateway'' * (similarly to how ARP entries, e.g., are presented). */ - gate = ifma->ifma_ll->ifma_addr; + info.rti_info[RTAX_GATEWAY] = (ifma->ifma_ll != NULL) ? ifma->ifma_ll->ifma_addr : NULL; if ((m = rt_msg1(cmd, &info)) == NULL) { - if (ifp != NULL) - ifnet_lock_done(ifp); + IFMA_UNLOCK(ifma); + ifnet_lock_done(ifp); return; } - if (ifp != NULL) - ifnet_lock_done(ifp); ifmam = mtod(m, struct ifma_msghdr *); - ifmam->ifmam_index = ifp ? ifp->if_index : 0; + ifmam->ifmam_index = ifp->if_index; ifmam->ifmam_addrs = info.rti_addrs; route_proto.sp_protocol = ifma->ifma_addr->sa_family; + IFMA_UNLOCK(ifma); + ifnet_lock_done(ifp); raw_input(m, &route_proto, &route_src, &route_dst); } @@ -1242,10 +1384,11 @@ sysctl_dumpentry(struct radix_node *rn, void *vw) return 0; } bzero((caddr_t)&info, sizeof(info)); - dst = rt_key(rt); - gate = rt->rt_gateway; - netmask = rt_mask(rt); - genmask = rt->rt_genmask; + info.rti_info[RTAX_DST] = rt_key(rt); + info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; + info.rti_info[RTAX_NETMASK] = rt_mask(rt); + info.rti_info[RTAX_GENMASK] = rt->rt_genmask; + if (w->w_op != NET_RT_DUMP2) { size = rt_msg2(RTM_GET, &info, 0, w); if (w->w_req && w->w_tmem) { @@ -1253,201 +1396,418 @@ sysctl_dumpentry(struct radix_node *rn, void *vw) rtm->rtm_flags = rt->rt_flags; rtm->rtm_use = rt->rt_use; - rtm->rtm_rmx = rt->rt_rmx; + rt_getmetrics(rt, &rtm->rtm_rmx); rtm->rtm_index = rt->rt_ifp->if_index; rtm->rtm_pid = 0; - rtm->rtm_seq = 0; - rtm->rtm_errno = 0; + rtm->rtm_seq = 0; + rtm->rtm_errno = 0; rtm->rtm_addrs = info.rti_addrs; error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size); RT_UNLOCK(rt); return (error); } } else { - size = rt_msg2(RTM_GET2, &info, 0, w); - if (w->w_req && w->w_tmem) { - struct rt_msghdr2 *rtm = (struct 
rt_msghdr2 *)w->w_tmem; - - rtm->rtm_flags = rt->rt_flags; - rtm->rtm_use = rt->rt_use; - rtm->rtm_rmx = rt->rt_rmx; - rtm->rtm_index = rt->rt_ifp->if_index; - rtm->rtm_refcnt = rt->rt_refcnt; + size = rt_msg2(RTM_GET2, &info, 0, w); + if (w->w_req && w->w_tmem) { + struct rt_msghdr2 *rtm = (struct rt_msghdr2 *)w->w_tmem; + + rtm->rtm_flags = rt->rt_flags; + rtm->rtm_use = rt->rt_use; + rt_getmetrics(rt, &rtm->rtm_rmx); + rtm->rtm_index = rt->rt_ifp->if_index; + rtm->rtm_refcnt = rt->rt_refcnt; if (rt->rt_parent) rtm->rtm_parentflags = rt->rt_parent->rt_flags; else rtm->rtm_parentflags = 0; - rtm->rtm_reserved = 0; - rtm->rtm_addrs = info.rti_addrs; - error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size); + rtm->rtm_reserved = 0; + rtm->rtm_addrs = info.rti_addrs; + error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size); RT_UNLOCK(rt); - return (error); - + return (error); } } RT_UNLOCK(rt); return (error); } +/* + * This is used for dumping extended information from route entries. + */ int -sysctl_iflist( - int af, - struct walkarg *w) +sysctl_dumpentry_ext(struct radix_node *rn, void *vw) +{ + struct walkarg *w = vw; + struct rtentry *rt = (struct rtentry *)rn; + int error = 0, size; + struct rt_addrinfo info; + + RT_LOCK(rt); + if (w->w_op == NET_RT_DUMPX_FLAGS && !(rt->rt_flags & w->w_arg)) { + RT_UNLOCK(rt); + return (0); + } + bzero(&info, sizeof (info)); + info.rti_info[RTAX_DST] = rt_key(rt); + info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; + info.rti_info[RTAX_NETMASK] = rt_mask(rt); + info.rti_info[RTAX_GENMASK] = rt->rt_genmask; + + size = rt_msg2(RTM_GET_EXT, &info, 0, w); + if (w->w_req && w->w_tmem) { + struct rt_msghdr_ext *ertm = (struct rt_msghdr_ext *)w->w_tmem; + + ertm->rtm_flags = rt->rt_flags; + ertm->rtm_use = rt->rt_use; + rt_getmetrics(rt, &ertm->rtm_rmx); + ertm->rtm_index = rt->rt_ifp->if_index; + ertm->rtm_pid = 0; + ertm->rtm_seq = 0; + ertm->rtm_errno = 0; + ertm->rtm_addrs = info.rti_addrs; + if (rt->rt_llinfo_get_ri == NULL) + 
bzero(&ertm->rtm_ri, sizeof (ertm->rtm_ri)); + else + rt->rt_llinfo_get_ri(rt, &ertm->rtm_ri); + + error = SYSCTL_OUT(w->w_req, (caddr_t)ertm, size); + RT_UNLOCK(rt); + return (error); + } + RT_UNLOCK(rt); + return (error); +} + +/* + * rdar://9307819 + * To avoid to call copyout() while holding locks and to cause problems + * in the paging path, sysctl_iflist() and sysctl_iflist2() contstruct + * the list in two passes. In the first pass we compute the total + * length of the data we are going to copyout, then we release + * all locks to allocate a temporary buffer that gets filled + * in the second pass. + * + * Note that we are verifying the assumption that _MALLOC returns a buffer + * that is at least 32 bits aligned and that the messages and addresses are + * 32 bits aligned. + */ + +int +sysctl_iflist(int af, struct walkarg *w) { struct ifnet *ifp; struct ifaddr *ifa; struct rt_addrinfo info; int len, error = 0; + int pass = 0; + int total_len = 0, current_len = 0; + char *total_buffer = NULL, *cp = NULL; bzero((caddr_t)&info, sizeof(info)); - ifnet_head_lock_shared(); - TAILQ_FOREACH(ifp, &ifnet_head, if_link) { - if (error) - break; - if (w->w_arg && w->w_arg != ifp->if_index) - continue; - ifnet_lock_shared(ifp); - ifa = ifp->if_addrhead.tqh_first; - ifpaddr = ifa->ifa_addr; - len = rt_msg2(RTM_IFINFO, &info, (caddr_t)0, w); - ifpaddr = 0; - if (w->w_req && w->w_tmem) { - struct if_msghdr *ifm; - - ifm = (struct if_msghdr *)w->w_tmem; - ifm->ifm_index = ifp->if_index; - ifm->ifm_flags = (u_short)ifp->if_flags; - if_data_internal_to_if_data(ifp, &ifp->if_data, &ifm->ifm_data); - ifm->ifm_addrs = info.rti_addrs; - error = SYSCTL_OUT(w->w_req,(caddr_t)ifm, len); - if (error) { - ifnet_lock_done(ifp); + + for (pass = 0; pass < 2; pass++) { + ifnet_head_lock_shared(); + + TAILQ_FOREACH(ifp, &ifnet_head, if_link) { + if (error) break; - } - } - while ((ifa = ifa->ifa_link.tqe_next) != 0) { - if (af && af != ifa->ifa_addr->sa_family) + if (w->w_arg && w->w_arg != 
ifp->if_index) continue; - ifaaddr = ifa->ifa_addr; - netmask = ifa->ifa_netmask; - brdaddr = ifa->ifa_dstaddr; - len = rt_msg2(RTM_NEWADDR, &info, 0, w); - if (w->w_req && w->w_tmem) { - struct ifa_msghdr *ifam; - - ifam = (struct ifa_msghdr *)w->w_tmem; - ifam->ifam_index = ifa->ifa_ifp->if_index; - ifam->ifam_flags = ifa->ifa_flags; - ifam->ifam_metric = ifa->ifa_metric; - ifam->ifam_addrs = info.rti_addrs; - error = SYSCTL_OUT(w->w_req, w->w_tmem, len); - if (error) + ifnet_lock_shared(ifp); + /* + * Holding ifnet lock here prevents the link address from + * changing contents, so no need to hold the ifa lock. + * The link address is always present; it's never freed. + */ + ifa = ifp->if_lladdr; + info.rti_info[RTAX_IFP] = ifa->ifa_addr; + len = rt_msg2(RTM_IFINFO, &info, (caddr_t)0, NULL); + if (pass == 0) { + total_len += len; + } else { + struct if_msghdr *ifm; + + if (current_len + len > total_len) { + ifnet_lock_done(ifp); + printf("sysctl_iflist: current_len (%d) + len (%d) > total_len (%d)\n", + current_len, len, total_len); + error = ENOBUFS; break; + } + info.rti_info[RTAX_IFP] = ifa->ifa_addr; + len = rt_msg2(RTM_IFINFO, &info, (caddr_t)cp, NULL); + info.rti_info[RTAX_IFP] = NULL; + + ifm = (struct if_msghdr *)cp; + ifm->ifm_index = ifp->if_index; + ifm->ifm_flags = (u_short)ifp->if_flags; + if_data_internal_to_if_data(ifp, &ifp->if_data, + &ifm->ifm_data); + ifm->ifm_addrs = info.rti_addrs; + + cp += len; + VERIFY(IS_P2ALIGNED(cp, sizeof(u_int32_t))); + current_len += len; } + while ((ifa = ifa->ifa_link.tqe_next) != 0) { + IFA_LOCK(ifa); + if (af && af != ifa->ifa_addr->sa_family) { + IFA_UNLOCK(ifa); + continue; + } + info.rti_info[RTAX_IFA] = ifa->ifa_addr; + info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask; + info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; + len = rt_msg2(RTM_NEWADDR, &info, 0, 0); + if (pass == 0) { + total_len += len; + } else { + struct ifa_msghdr *ifam; + + if (current_len + len > total_len) { + IFA_UNLOCK(ifa); + 
printf("sysctl_iflist: current_len (%d) + len (%d) > total_len (%d)\n", + current_len, len, total_len); + error = ENOBUFS; + break; + } + len = rt_msg2(RTM_NEWADDR, &info, (caddr_t)cp, NULL); + + ifam = (struct ifa_msghdr *)cp; + ifam->ifam_index = ifa->ifa_ifp->if_index; + ifam->ifam_flags = ifa->ifa_flags; + ifam->ifam_metric = ifa->ifa_metric; + ifam->ifam_addrs = info.rti_addrs; + + cp += len; + VERIFY(IS_P2ALIGNED(cp, sizeof(u_int32_t))); + current_len += len; + } + IFA_UNLOCK(ifa); + } + ifnet_lock_done(ifp); + info.rti_info[RTAX_IFA] = info.rti_info[RTAX_NETMASK] = + info.rti_info[RTAX_BRD] = NULL; + } + + ifnet_head_done(); + + if (error) + break; + + if (pass == 0) { + /* Better to return zero length buffer than ENOBUFS */ + if (total_len == 0) + total_len = 1; + total_len += total_len >> 3; + total_buffer = _MALLOC(total_len, M_RTABLE, M_ZERO | M_WAITOK); + if (total_buffer == NULL) { + printf("sysctl_iflist: _MALLOC(%d) failed\n", total_len); + error = ENOBUFS; + break; + } + cp = total_buffer; + VERIFY(IS_P2ALIGNED(cp, sizeof(u_int32_t))); + } else { + error = SYSCTL_OUT(w->w_req, total_buffer, current_len); + if (error) + break; } - ifnet_lock_done(ifp); - ifaaddr = netmask = brdaddr = 0; } - ifnet_head_done(); + + if (total_buffer != NULL) + _FREE(total_buffer, M_RTABLE); + return error; } int -sysctl_iflist2( - int af, - struct walkarg *w) +sysctl_iflist2(int af, struct walkarg *w) { struct ifnet *ifp; struct ifaddr *ifa; struct rt_addrinfo info; int len, error = 0; - + int pass = 0; + int total_len = 0, current_len = 0; + char *total_buffer = NULL, *cp = NULL; + bzero((caddr_t)&info, sizeof(info)); - ifnet_head_lock_shared(); - TAILQ_FOREACH(ifp, &ifnet_head, if_link) { - if (error) - break; - if (w->w_arg && w->w_arg != ifp->if_index) - continue; - ifnet_lock_shared(ifp); - ifa = ifp->if_addrhead.tqh_first; - ifpaddr = ifa->ifa_addr; - len = rt_msg2(RTM_IFINFO2, &info, (caddr_t)0, w); - ifpaddr = 0; - if (w->w_req && w->w_tmem) { - struct 
if_msghdr2 *ifm; - - ifm = (struct if_msghdr2 *)w->w_tmem; - ifm->ifm_addrs = info.rti_addrs; - ifm->ifm_flags = (u_short)ifp->if_flags; - ifm->ifm_index = ifp->if_index; - ifm->ifm_snd_len = ifp->if_snd.ifq_len; - ifm->ifm_snd_maxlen = ifp->if_snd.ifq_maxlen; - ifm->ifm_snd_drops = ifp->if_snd.ifq_drops; - ifm->ifm_timer = ifp->if_timer; - if_data_internal_to_if_data64(ifp, &ifp->if_data, &ifm->ifm_data); - error = SYSCTL_OUT(w->w_req, w->w_tmem, len); - if (error) { - ifnet_lock_done(ifp); + + for (pass = 0; pass < 2; pass++) { + ifnet_head_lock_shared(); + + TAILQ_FOREACH(ifp, &ifnet_head, if_link) { + if (error) break; - } - } - while ((ifa = ifa->ifa_link.tqe_next) != 0) { - if (af && af != ifa->ifa_addr->sa_family) + if (w->w_arg && w->w_arg != ifp->if_index) continue; - ifaaddr = ifa->ifa_addr; - netmask = ifa->ifa_netmask; - brdaddr = ifa->ifa_dstaddr; - len = rt_msg2(RTM_NEWADDR, &info, 0, w); - if (w->w_req && w->w_tmem) { - struct ifa_msghdr *ifam; - - ifam = (struct ifa_msghdr *)w->w_tmem; - ifam->ifam_index = ifa->ifa_ifp->if_index; - ifam->ifam_flags = ifa->ifa_flags; - ifam->ifam_metric = ifa->ifa_metric; - ifam->ifam_addrs = info.rti_addrs; - error = SYSCTL_OUT(w->w_req, w->w_tmem, len); - if (error) + ifnet_lock_shared(ifp); + /* + * Holding ifnet lock here prevents the link address from + * changing contents, so no need to hold the ifa lock. + * The link address is always present; it's never freed. 
+ */ + ifa = ifp->if_lladdr; + info.rti_info[RTAX_IFP] = ifa->ifa_addr; + len = rt_msg2(RTM_IFINFO2, &info, (caddr_t)0, NULL); + if (pass == 0) { + total_len += len; + } else { + struct if_msghdr2 *ifm; + + if (current_len + len > total_len) { + ifnet_lock_done(ifp); + printf("sysctl_iflist2: current_len (%d) + len (%d) > total_len (%d)\n", + current_len, len, total_len); + error = ENOBUFS; break; + } + info.rti_info[RTAX_IFP] = ifa->ifa_addr; + len = rt_msg2(RTM_IFINFO2, &info, (caddr_t)cp, NULL); + info.rti_info[RTAX_IFP] = NULL; + + ifm = (struct if_msghdr2 *)cp; + ifm->ifm_addrs = info.rti_addrs; + ifm->ifm_flags = (u_short)ifp->if_flags; + ifm->ifm_index = ifp->if_index; + ifm->ifm_snd_len = ifp->if_snd.ifq_len; + ifm->ifm_snd_maxlen = ifp->if_snd.ifq_maxlen; + ifm->ifm_snd_drops = ifp->if_snd.ifq_drops; + ifm->ifm_timer = ifp->if_timer; + if_data_internal_to_if_data64(ifp, &ifp->if_data, + &ifm->ifm_data); + + cp += len; + VERIFY(IS_P2ALIGNED(cp, sizeof(u_int32_t))); + current_len += len; } - } - if (error) { - ifnet_lock_done(ifp); - break; - } - { - struct ifmultiaddr *ifma; - - for (ifma = ifp->if_multiaddrs.lh_first; ifma; - ifma = ifma->ifma_link.le_next) { - if (af && af != ifma->ifma_addr->sa_family) + while ((ifa = ifa->ifa_link.tqe_next) != 0) { + IFA_LOCK(ifa); + if (af && af != ifa->ifa_addr->sa_family) { + IFA_UNLOCK(ifa); continue; - bzero((caddr_t)&info, sizeof(info)); - ifaaddr = ifma->ifma_addr; - if (ifp->if_addrhead.tqh_first) - ifpaddr = ifp->if_addrhead.tqh_first->ifa_addr; - if (ifma->ifma_ll) - gate = ifma->ifma_ll->ifma_addr; - len = rt_msg2(RTM_NEWMADDR2, &info, 0, w); - if (w->w_req && w->w_tmem) { - struct ifma_msghdr2 *ifmam; - - ifmam = (struct ifma_msghdr2 *)w->w_tmem; - ifmam->ifmam_addrs = info.rti_addrs; - ifmam->ifmam_flags = 0; - ifmam->ifmam_index = ifma->ifma_ifp->if_index; - ifmam->ifmam_refcount = ifma->ifma_refcount; - error = SYSCTL_OUT(w->w_req, w->w_tmem, len); - if (error) + } + info.rti_info[RTAX_IFA] = 
ifa->ifa_addr; + info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask; + info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; + len = rt_msg2(RTM_NEWADDR, &info, 0, 0); + if (pass == 0) { + total_len += len; + } else { + struct ifa_msghdr *ifam; + + if (current_len + len > total_len) { + IFA_UNLOCK(ifa); + printf("sysctl_iflist2: current_len (%d) + len (%d) > total_len (%d)\n", + current_len, len, total_len); + error = ENOBUFS; break; + } + len = rt_msg2(RTM_NEWADDR, &info, (caddr_t)cp, 0); + + ifam = (struct ifa_msghdr *)cp; + ifam->ifam_index = ifa->ifa_ifp->if_index; + ifam->ifam_flags = ifa->ifa_flags; + ifam->ifam_metric = ifa->ifa_metric; + ifam->ifam_addrs = info.rti_addrs; + + cp += len; + VERIFY(IS_P2ALIGNED(cp, sizeof(u_int32_t))); + current_len += len; + } + IFA_UNLOCK(ifa); + } + if (error) { + ifnet_lock_done(ifp); + break; + } + { + struct ifmultiaddr *ifma; + + for (ifma = LIST_FIRST(&ifp->if_multiaddrs); + ifma != NULL; ifma = LIST_NEXT(ifma, ifma_link)) { + struct ifaddr *ifa0; + + IFMA_LOCK(ifma); + if (af && af != ifma->ifma_addr->sa_family) { + IFMA_UNLOCK(ifma); + continue; + } + bzero((caddr_t)&info, sizeof(info)); + info.rti_info[RTAX_IFA] = ifma->ifma_addr; + /* + * Holding ifnet lock here prevents the link + * address from changing contents, so no need + * to hold the ifa0 lock. The link address is + * always present; it's never freed. 
+ */ + ifa0 = ifp->if_lladdr; + info.rti_info[RTAX_IFP] = ifa0->ifa_addr; + if (ifma->ifma_ll != NULL) + info.rti_info[RTAX_GATEWAY] = ifma->ifma_ll->ifma_addr; + len = rt_msg2(RTM_NEWMADDR2, &info, 0, 0); + if (pass == 0) { + total_len += len; + } else { + struct ifma_msghdr2 *ifmam; + + if (current_len + len > total_len) { + IFMA_UNLOCK(ifma); + printf("sysctl_iflist2: current_len (%d) + len (%d) > total_len (%d)\n", + current_len, len, total_len); + error = ENOBUFS; + break; + } + len = rt_msg2(RTM_NEWMADDR2, &info, (caddr_t)cp, 0); + + ifmam = (struct ifma_msghdr2 *)cp; + ifmam->ifmam_addrs = info.rti_addrs; + ifmam->ifmam_flags = 0; + ifmam->ifmam_index = + ifma->ifma_ifp->if_index; + ifmam->ifmam_refcount = + ifma->ifma_reqcnt; + + cp += len; + VERIFY(IS_P2ALIGNED(cp, sizeof(u_int32_t))); + current_len += len; + } + IFMA_UNLOCK(ifma); } } + ifnet_lock_done(ifp); + info.rti_info[RTAX_IFA] = info.rti_info[RTAX_NETMASK] = + info.rti_info[RTAX_BRD] = NULL; + } + ifnet_head_done(); + + if (error) + break; + + if (pass == 0) { + /* Better to return zero length buffer than ENOBUFS */ + if (total_len == 0) + total_len = 1; + total_len += total_len >> 3; + total_buffer = _MALLOC(total_len, M_RTABLE, M_ZERO | M_WAITOK); + if (total_buffer == NULL) { + printf("sysctl_iflist2: _MALLOC(%d) failed\n", total_len); + error = ENOBUFS; + break; + } + cp = total_buffer; + VERIFY(IS_P2ALIGNED(cp, sizeof(u_int32_t))); + } else { + error = SYSCTL_OUT(w->w_req, total_buffer, current_len); + if (error) + break; } - ifnet_lock_done(ifp); - ifaaddr = netmask = brdaddr = 0; } - ifnet_head_done(); + + if (total_buffer != NULL) + _FREE(total_buffer, M_RTABLE); + return error; } @@ -1467,16 +1827,15 @@ sysctl_rtstat(struct sysctl_req *req) static int sysctl_rttrash(struct sysctl_req *req) { - int error; - - error = SYSCTL_OUT(req, &rttrash, sizeof(rttrash)); - if (error) - return (error); + int error; + + error = SYSCTL_OUT(req, &rttrash, sizeof(rttrash)); + if (error) + return (error); - 
return 0; + return 0; } -#if IFNET_ROUTE_REFCNT /* * Called from pfslowtimo(), protected by domain_proto_mtx */ @@ -1503,9 +1862,11 @@ rt_drainall(void) timerclear(&last_ts); in_rtqdrain(); /* protocol cloned routes: INET */ - in6_rtqdrain(); /* protocol cloned routes: INET6 */ in_arpdrain(NULL); /* cloned routes: ARP */ +#if INET6 + in6_rtqdrain(); /* protocol cloned routes: INET6 */ nd6_drain(NULL); /* cloned routes: ND6 */ +#endif /* INET6 */ last_ts.tv_sec = current_ts.tv_sec; last_ts.tv_usec = current_ts.tv_usec; @@ -1522,7 +1883,6 @@ rt_aggdrain(int on) else routedomain.dom_protosw->pr_flags &= ~PR_AGGDRAIN; } -#endif /* IFNET_ROUTE_REFCNT */ static int sysctl_rtsock SYSCTL_HANDLER_ARGS @@ -1556,7 +1916,17 @@ sysctl_rtsock SYSCTL_HANDLER_ARGS for (i = 1; i <= AF_MAX; i++) if ((rnh = rt_tables[i]) && (af == 0 || af == i) && (error = rnh->rnh_walktree(rnh, - sysctl_dumpentry, &w))) + sysctl_dumpentry, &w))) + break; + lck_mtx_unlock(rnh_lock); + break; + case NET_RT_DUMPX: + case NET_RT_DUMPX_FLAGS: + lck_mtx_lock(rnh_lock); + for (i = 1; i <= AF_MAX; i++) + if ((rnh = rt_tables[i]) && (af == 0 || af == i) && + (error = rnh->rnh_walktree(rnh, + sysctl_dumpentry_ext, &w))) break; lck_mtx_unlock(rnh_lock); break; @@ -1578,7 +1948,7 @@ sysctl_rtsock SYSCTL_HANDLER_ARGS return (error); } -SYSCTL_NODE(_net, PF_ROUTE, routetable, CTLFLAG_RD, sysctl_rtsock, ""); +SYSCTL_NODE(_net, PF_ROUTE, routetable, CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_rtsock, ""); /* * Definitions of protocols supported in the ROUTE domain. 
@@ -1587,12 +1957,7 @@ static struct protosw routesw[] = { { SOCK_RAW, &routedomain, 0, PR_ATOMIC|PR_ADDR, 0, route_output, raw_ctlinput, 0, 0, - raw_init, 0, 0, -#if IFNET_ROUTE_REFCNT - rt_drainall, -#else - 0, -#endif /* IFNET_ROUTE_REFCNT */ + raw_init, 0, 0, rt_drainall, 0, &route_usrreqs, 0, 0, 0, diff --git a/bsd/net/rtsock_mip.c b/bsd/net/rtsock_mip.c deleted file mode 100644 index 2acd2585d..000000000 --- a/bsd/net/rtsock_mip.c +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* $KAME$ */ - -/* to be included from net/rtsock.c - ugly but necessary for portability */ -/* - * Mobile IPv6 addition. 
- * Send a routing message to all routing socket listeners. - */ -void -rt_mip6msg(cmd, ifp, rt) - int cmd; - struct ifnet *ifp; - register struct rtentry *rt; -{ - struct rt_addrinfo info; - struct sockaddr *sa = 0; - struct mbuf *m = 0; - register struct rt_msghdr *rtm; - -#ifdef MIP6_DEBUG - printf("route_cb.any_count = %d\n", route_cb.any_count); -#endif - bzero((caddr_t)&info, sizeof(info)); - - if (rt == 0 || ifp == 0) - return; - netmask = rt_mask(rt); - dst = sa = rt_key(rt); - gate = rt->rt_gateway; - genmask = rt->rt_genmask; - if ((m = rt_msg1(cmd, &info)) == NULL) { -#ifdef MIP6_DEBUG - printf("failure... \n"); -#endif - return; - } - rtm = mtod(m, struct rt_msghdr *); - rtm->rtm_index = ifp->if_index; - rtm->rtm_flags |= rt->rt_flags; - rtm->rtm_rmx = rt->rt_rmx; - rtm->rtm_addrs = info.rti_addrs; - rtm->rtm_flags |= RTF_DONE; - - route_proto.sp_protocol = sa ? sa->sa_family : 0; -#ifdef __bsdi__ - raw_input(m, NULL, &route_proto, &route_src, &route_dst); -#else - raw_input(m, &route_proto, &route_src, &route_dst); -#endif -} diff --git a/bsd/netat/Makefile b/bsd/netat/Makefile index 9282cd445..3f307255c 100644 --- a/bsd/netat/Makefile +++ b/bsd/netat/Makefile @@ -10,14 +10,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ SETUP_SUBDIRS = \ diff --git a/bsd/netat/asp_proto.c b/bsd/netat/asp_proto.c index bd471ff5a..8f1621b54 100644 --- a/bsd/netat/asp_proto.c +++ b/bsd/netat/asp_proto.c @@ -319,7 +319,7 @@ return ( static char mbuf_str[100]; char *mbuf_totals() { - sprintf(mbuf_str, + snprintf(mbuf_str, sizeof(mbuf_str), /* "dat = %d, prot = %d, ioc = %d, err = %d, hu = %d, ack = %d, nak = %d, ctl = %d", */ diff --git a/bsd/netat/at.c b/bsd/netat/at.c index 572b7f58b..ae6120798 100644 --- a/bsd/netat/at.c +++ b/bsd/netat/at.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. 
+ * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -128,6 +128,21 @@ static int set_zones(zone_usage_t *ifz) return(0); } /* set_zones */ +static int +at_domifattach(struct ifnet *ifp, at_ifaddr_t *ifID) +{ + int error; + + if ((error = proto_plumb(PF_APPLETALK, ifp))) { + if (error != EEXIST) + log(LOG_ERR, "%s: proto_plumb returned %d if=%s%d\n", + __func__, error, ifp->if_name, ifp->if_unit); + } else if (ifID) + ifID->at_was_attached = 1; + + return (error); +} + /* * Generic internet control operations (ioctl's). * ifp is 0 if not an interface-specific ioctl. @@ -580,10 +595,10 @@ at_control(so, cmd, data, ifp) ifID->aa_ifp = ifp; ifa = &ifID->aa_ifa; - error = proto_plumb(PF_APPLETALK, ifp); + error = at_domifattach(ifp, ifID); if (error == EEXIST) { - ifID->at_was_attached = 1; - error = 0; + ifID->at_was_attached = 1; + error = 0; } if (error != 0) { break; @@ -592,27 +607,36 @@ at_control(so, cmd, data, ifp) ifID->cable_multicast_addr = etalk_multicast_addr; xpatcnt++; ifnet_lock_exclusive(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) - if ((sdl = (struct sockaddr_dl *)ifa->ifa_addr) && - (sdl->sdl_family == AF_LINK)) { - bcopy(LLADDR(sdl), ifID->xaddr, sizeof(ifID->xaddr)); + /* + * Holding ifnet lock here prevents the link address + * from changing contents, so no need to hold the ifa + * lock. The link address is always present; it's + * never freed. 
+ */ + sdl = (struct sockaddr_dl *)ifp->if_lladdr->ifa_addr; + bcopy(LLADDR(sdl), ifID->xaddr, sizeof(ifID->xaddr)); #ifdef APPLETALK_DEBUG - kprintf("SIOCSIFADDR: local enet address is %x.%x.%x.%x.%x.%x\n", - ifID->xaddr[0], ifID->xaddr[1], - ifID->xaddr[2], ifID->xaddr[3], - ifID->xaddr[4], ifID->xaddr[5]); + kprintf("SIOCSIFADDR: local enet address is " + "%x.%x.%x.%x.%x.%x\n", + ifID->xaddr[0], ifID->xaddr[1], + ifID->xaddr[2], ifID->xaddr[3], + ifID->xaddr[4], ifID->xaddr[5]); #endif - break; - } /* attach the AppleTalk address to the ifnet structure */ ifa = &ifID->aa_ifa; + ifa_lock_init(ifa); + VERIFY(!(ifa->ifa_debug & IFD_ALLOC)); ifa->ifa_addr = (struct sockaddr *)&ifID->ifNodeAddress; ifID->ifNodeAddress.sat_len = sizeof(struct sockaddr_at); ifID->ifNodeAddress.sat_family = AF_APPLETALK; /* the address itself will be filled in when ifThisNode is set */ + IFA_LOCK(ifa); if_attach_ifa(ifp, ifa); + /* add a reference for at_interfaces[] */ + IFA_ADDREF_LOCKED(ifa); + IFA_UNLOCK(ifa); ifnet_lock_done(ifp); } break; @@ -678,11 +702,7 @@ at_control(so, cmd, data, ifp) error = EACCES; break; } - error = proto_plumb(PF_APPLETALK, ifp); - if (ifID != NULL - && (error == 0 || error == EEXIST)) { - ifID->at_was_attached = 1; - } + error = at_domifattach(ifp, ifID); break; case SIOCPROTODETACH: @@ -713,6 +733,7 @@ void atalk_post_msg(struct ifnet *ifp, u_long event_code, struct at_addr *addres struct kev_atalk_data at_event_data; struct kev_msg ev_msg; + bzero(&ev_msg, sizeof(struct kev_msg)); ev_msg.vendor_code = KEV_VENDOR_APPLE; ev_msg.kev_class = KEV_NETWORK_CLASS; ev_msg.kev_subclass = KEV_ATALK_SUBCLASS; @@ -739,3 +760,22 @@ void atalk_post_msg(struct ifnet *ifp, u_long event_code, struct at_addr *addres kev_post_msg(&ev_msg); } + + +/* + * This is untested; the code is here only for completeness. 
+ */ +void +at_purgeaddrs(struct ifnet *ifp) +{ + at_ifaddr_t *ifID = NULL; + int pat_id; + + /* Find address for this interface, if it exists */ + for (pat_id = 0; pat_id < xpatcnt; pat_id++) { + if (at_interfaces[pat_id].aa_ifp == ifp) { + ifID = &at_interfaces[pat_id]; + elap_offline(ifID); + } + } +} diff --git a/bsd/netat/at_var.h b/bsd/netat/at_var.h index 64025eb6f..1513f9a82 100644 --- a/bsd/netat/at_var.h +++ b/bsd/netat/at_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -351,7 +351,7 @@ void init_ddp_handler(void); int elap_wput(gref_t *gref, gbuf_t *m); int at_ioctl(struct atpcb *, u_long, caddr_t, int ); - +extern void at_purgeaddrs(struct ifnet *); #endif /* KERNEL_PRIVATE */ #endif /* __APPLE_API_OBSOLETE */ diff --git a/bsd/netat/ddp.c b/bsd/netat/ddp.c index 31467fa6b..0e2ebea4b 100644 --- a/bsd/netat/ddp.c +++ b/bsd/netat/ddp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -384,12 +384,17 @@ void ddp_rem_if(ifID) struct ifaddr *ifa = &ifID->aa_ifa; /* un-do processing done in SIOCSIFADDR */ - if (ifa->ifa_addr) { - ifnet_lock_exclusive(ifID->aa_ifp); + ifnet_lock_exclusive(ifID->aa_ifp); + IFA_LOCK(ifa); + if (ifa->ifa_debug & IFD_ATTACHED) { if_detach_ifa(ifID->aa_ifp, ifa); ifa->ifa_addr = NULL; - ifnet_lock_done(ifID->aa_ifp); } + IFA_UNLOCK(ifa); + /* release reference held for at_interfaces[] */ + IFA_REMREF(ifa); + ifnet_lock_done(ifID->aa_ifp); + if (ifID->at_was_attached == 0 && ifID->aa_ifp != NULL) { (void)proto_unplumb(PF_APPLETALK, ifID->aa_ifp); } diff --git a/bsd/netat/ddp_lap.c b/bsd/netat/ddp_lap.c index c7e075c5a..42f81cdad 100644 --- a/bsd/netat/ddp_lap.c +++ b/bsd/netat/ddp_lap.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1602,7 +1602,7 @@ int at_reg_mcast(ifID, data) *(unsigned*)data, (*(unsigned *)(data+2))&0x0000ffff, (unsigned)ifID)); - if (if_addmulti(nddp, (struct sockaddr *)&sdl, 0)) + if (if_addmulti_anon(nddp, (struct sockaddr *)&sdl, NULL)) return -1; } return 0; @@ -1641,7 +1641,7 @@ int at_unreg_mcast(ifID, data) (unsigned)ifID)); bzero(data, sizeof(struct etalk_addr)); - if (if_delmulti(nddp, (struct sockaddr *)&sdl)) + if (if_delmulti_anon(nddp, (struct sockaddr *)&sdl)) return -1; } return 0; @@ -1687,7 +1687,7 @@ int at_reg_mcast(ifID, data) *(unsigned*)data, (*(unsigned *)(data+2))&0x0000ffff, (unsigned)ifID)); - if (if_addmulti(nddp, (struct sockaddr *)&sdl, 0)) + if (if_addmulti_anon(nddp, (struct sockaddr *)&sdl, NULL)) return -1; } @@ -1724,7 +1724,7 @@ int at_unreg_mcast(ifID, data) (unsigned)ifID)); bzero(data, ETHERNET_ADDR_LEN); - if (if_delmulti(nddp, (struct sockaddr *)&sdl)) + if (if_delmulti_anon(nddp, (struct sockaddr *)&sdl)) return(-1); } diff --git a/bsd/netat/sys_glue.c 
b/bsd/netat/sys_glue.c index dd22563be..acb307fbc 100644 --- a/bsd/netat/sys_glue.c +++ b/bsd/netat/sys_glue.c @@ -97,10 +97,10 @@ int RouterMix = RT_MIX_DEFAULT; /* default for nbr of ppsec */ SYSCTL_INT(_net_appletalk, OID_AUTO, routermix, CTLFLAG_WR, &RouterMix, 0, "Appletalk RouterMix"); at_ddp_stats_t at_ddp_stats; /* DDP statistics */ -SYSCTL_STRUCT(_net_appletalk, OID_AUTO, ddpstats, CTLFLAG_RD, +SYSCTL_STRUCT(_net_appletalk, OID_AUTO, ddpstats, CTLFLAG_RD | CTLFLAG_LOCKED, &at_ddp_stats, at_ddp_stats, "AppleTalk DDP Stats"); extern int atp_resp_seqno2big; -SYSCTL_INT(_net_appletalk, OID_AUTO, atp_resp_seqno2big, CTLFLAG_RD, +SYSCTL_INT(_net_appletalk, OID_AUTO, atp_resp_seqno2big, CTLFLAG_RD | CTLFLAG_LOCKED, &atp_resp_seqno2big, 0, "Appletalk ATP seqno too big count"); static void ioccmd_t_32_to_64( ioccmd_t *from_p, user_ioccmd_t *to_p ); diff --git a/bsd/netinet/Makefile b/bsd/netinet/Makefile index de3d2890a..91973125c 100644 --- a/bsd/netinet/Makefile +++ b/bsd/netinet/Makefile @@ -9,14 +9,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ DATAFILES = \ @@ -32,12 +28,12 @@ KERNELFILES = \ kpi_ipfilter.h in_arp.h PRIVATE_DATAFILES = \ - if_fddi.h if_atm.h ip_dummynet.h \ + ip_dummynet.h \ tcp_debug.h \ in_gif.h ip_compat.h PRIVATE_KERNELFILES = ${KERNELFILES} \ - ip_ecn.h ip_encap.h ip_flow.h + ip_ecn.h ip_encap.h INSTALL_MI_LIST = ${DATAFILES} @@ -48,7 +44,6 @@ EXPORT_MI_LIST = ${DATAFILES} ${KERNELFILES} EXPORT_MI_DIR = ${INSTALL_MI_DIR} INSTALL_MI_LCL_LIST = ${DATAFILES} ${PRIVATE_DATAFILES} - INSTALL_KF_MI_LCL_LIST = ${INSTALL_MI_LCL_LIST} ${PRIVATE_KERNELFILES} include $(MakeInc_rule) diff --git a/bsd/netinet/icmp6.h b/bsd/netinet/icmp6.h index e3c559b11..aab7c4ffd 100644 --- a/bsd/netinet/icmp6.h +++ b/bsd/netinet/icmp6.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000,2008 Apple Inc. All rights reserved. 
+ * Copyright (c) 2000,2008-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -95,6 +95,10 @@ #define _NETINET_ICMP6_H_ #include +#ifdef XNU_KERNEL_PRIVATE +#include +#endif + #define ICMPV6_PLD_MAXLEN 1232 /* IPV6_MMTU - sizeof(struct ip6_hdr) - sizeof(struct icmp6_hdr) */ @@ -107,7 +111,7 @@ struct icmp6_hdr { u_int16_t icmp6_un_data16[2]; /* type-specific field */ u_int8_t icmp6_un_data8[4]; /* type-specific field */ } icmp6_dataun; -}; +} __attribute__((__packed__)); #define icmp6_data32 icmp6_dataun.icmp6_un_data32 #define icmp6_data16 icmp6_dataun.icmp6_un_data16 @@ -125,23 +129,27 @@ struct icmp6_hdr { #define ICMP6_ECHO_REQUEST 128 /* echo service */ #define ICMP6_ECHO_REPLY 129 /* echo reply */ +#define MLD_LISTENER_QUERY 130 /* multicast listener query */ +#define MLD_LISTENER_REPORT 131 /* multicast listener report */ +#define MLD_LISTENER_DONE 132 /* multicast listener done */ +#define MLD_LISTENER_REDUCTION MLD_LISTENER_DONE /* RFC3542 definition */ + +/* RFC2292 decls */ #define ICMP6_MEMBERSHIP_QUERY 130 /* group membership query */ -#define MLD6_LISTENER_QUERY 130 /* multicast listener query */ #define ICMP6_MEMBERSHIP_REPORT 131 /* group membership report */ -#define MLD6_LISTENER_REPORT 131 /* multicast listener report */ #define ICMP6_MEMBERSHIP_REDUCTION 132 /* group membership termination */ -#define MLD6_LISTENER_DONE 132 /* multicast listener done */ #ifndef KERNEL -#define MLD_LISTENER_QUERY MLD6_LISTENER_QUERY -#define MLD_LISTENER_REPORT MLD6_LISTENER_REPORT -#define MLD_LISTENER_DONE MLD6_LISTENER_DONE -#endif /* !KERNEL */ +/* the followings are for backward compatibility to old KAME apps. 
*/ +#define MLD6_LISTENER_QUERY MLD_LISTENER_QUERY +#define MLD6_LISTENER_REPORT MLD_LISTENER_REPORT +#define MLD6_LISTENER_DONE MLD_LISTENER_DONE +#endif #define ND_ROUTER_SOLICIT 133 /* router solicitation */ -#define ND_ROUTER_ADVERT 134 /* router advertisment */ +#define ND_ROUTER_ADVERT 134 /* router advertisement */ #define ND_NEIGHBOR_SOLICIT 135 /* neighbor solicitation */ -#define ND_NEIGHBOR_ADVERT 136 /* neighbor advertisment */ +#define ND_NEIGHBOR_ADVERT 136 /* neighbor advertisement */ #define ND_REDIRECT 137 /* redirect */ #define ICMP6_ROUTER_RENUMBERING 138 /* router renumbering */ @@ -152,20 +160,18 @@ struct icmp6_hdr { #define ICMP6_FQDN_REPLY 140 /* FQDN reply */ #define ICMP6_NI_QUERY 139 /* node information request */ #define ICMP6_NI_REPLY 140 /* node information reply */ +#define MLDV2_LISTENER_REPORT 143 /* RFC3810 listener report */ /* The definitions below are experimental. TBA */ -#define MLD6_MTRACE_RESP 200 /* mtrace response(to sender) */ -#define MLD6_MTRACE 201 /* mtrace messages */ +#define MLD_MTRACE_RESP 200 /* mtrace resp (to sender) */ +#define MLD_MTRACE 201 /* mtrace messages */ #ifndef KERNEL -#define MLD_MTRACE_RESP MLD6_MTRACE_RESP -#define MLD_MTRACE MLD6_MTRACE -#endif /* !KERNEL */ +#define MLD6_MTRACE_RESP MLD_MTRACE_RESP +#define MLD6_MTRACE MLD_MTRACE +#endif -#define ICMP6_HADISCOV_REQUEST 202 /* XXX To be defined */ -#define ICMP6_HADISCOV_REPLY 203 /* XXX To be defined */ - -#define ICMP6_MAXTYPE 203 +#define ICMP6_MAXTYPE 201 #define ICMP6_DST_UNREACH_NOROUTE 0 /* no route to destination */ #define ICMP6_DST_UNREACH_ADMIN 1 /* administratively prohibited */ @@ -202,16 +208,30 @@ struct icmp6_hdr { /* * Multicast Listener Discovery */ -struct mld6_hdr { - struct icmp6_hdr mld6_hdr; - struct in6_addr mld6_addr; /* multicast address */ -}; +struct mld_hdr { + struct icmp6_hdr mld_icmp6_hdr; + struct in6_addr mld_addr; /* multicast address */ +} __attribute__((__packed__)); + +/* definitions to provide backward 
compatibility to old KAME applications */ +#ifndef KERNEL +#define mld6_hdr mld_hdr +#define mld6_type mld_type +#define mld6_code mld_code +#define mld6_cksum mld_cksum +#define mld6_maxdelay mld_maxdelay +#define mld6_reserved mld_reserved +#define mld6_addr mld_addr +#endif -#define mld6_type mld6_hdr.icmp6_type -#define mld6_code mld6_hdr.icmp6_code -#define mld6_cksum mld6_hdr.icmp6_cksum -#define mld6_maxdelay mld6_hdr.icmp6_data16[0] -#define mld6_reserved mld6_hdr.icmp6_data16[1] +/* shortcut macro definitions */ +#define mld_type mld_icmp6_hdr.icmp6_type +#define mld_code mld_icmp6_hdr.icmp6_code +#define mld_cksum mld_icmp6_hdr.icmp6_cksum +#define mld_maxdelay mld_icmp6_hdr.icmp6_data16[0] +#define mld_reserved mld_icmp6_hdr.icmp6_data16[1] +#define mld_v2_reserved mld_icmp6_hdr.icmp6_data16[0] +#define mld_v2_numrecs mld_icmp6_hdr.icmp6_data16[1] /* * Neighbor Discovery @@ -220,7 +240,7 @@ struct mld6_hdr { struct nd_router_solicit { /* router solicitation */ struct icmp6_hdr nd_rs_hdr; /* could be followed by options */ -}; +}__attribute__((__packed__)); #define nd_rs_type nd_rs_hdr.icmp6_type #define nd_rs_code nd_rs_hdr.icmp6_code @@ -232,7 +252,7 @@ struct nd_router_advert { /* router advertisement */ u_int32_t nd_ra_reachable; /* reachable time */ u_int32_t nd_ra_retransmit; /* retransmit timer */ /* could be followed by options */ -}; +} __attribute__((__packed__)); #define nd_ra_type nd_ra_hdr.icmp6_type #define nd_ra_code nd_ra_hdr.icmp6_code @@ -260,7 +280,7 @@ struct nd_neighbor_solicit { /* neighbor solicitation */ struct icmp6_hdr nd_ns_hdr; struct in6_addr nd_ns_target; /*target address */ /* could be followed by options */ -}; +}__attribute__((__packed__)); #define nd_ns_type nd_ns_hdr.icmp6_type #define nd_ns_code nd_ns_hdr.icmp6_code @@ -271,7 +291,7 @@ struct nd_neighbor_advert { /* neighbor advertisement */ struct icmp6_hdr nd_na_hdr; struct in6_addr nd_na_target; /* target address */ /* could be followed by options */ -}; 
+}__attribute__((__packed__)); #define nd_na_type nd_na_hdr.icmp6_type #define nd_na_code nd_na_hdr.icmp6_code @@ -294,7 +314,7 @@ struct nd_redirect { /* redirect */ struct in6_addr nd_rd_target; /* target address */ struct in6_addr nd_rd_dst; /* destination address */ /* could be followed by options */ -}; +}__attribute__((__packed__)); #define nd_rd_type nd_rd_hdr.icmp6_type #define nd_rd_code nd_rd_hdr.icmp6_code @@ -305,13 +325,14 @@ struct nd_opt_hdr { /* Neighbor discovery option header */ u_int8_t nd_opt_type; u_int8_t nd_opt_len; /* followed by option specific data*/ -}; +}__attribute__((__packed__)); #define ND_OPT_SOURCE_LINKADDR 1 #define ND_OPT_TARGET_LINKADDR 2 #define ND_OPT_PREFIX_INFORMATION 3 #define ND_OPT_REDIRECTED_HEADER 4 #define ND_OPT_MTU 5 +#define ND_OPT_RDNSS 25 /* RFC 5006 */ #define ND_OPT_ROUTE_INFO 200 /* draft-ietf-ipngwg-router-preference, not officially assigned yet */ @@ -324,7 +345,7 @@ struct nd_opt_prefix_info { /* prefix information */ u_int32_t nd_opt_pi_preferred_time; u_int32_t nd_opt_pi_reserved2; struct in6_addr nd_opt_pi_prefix; -}; +}__attribute__((__packed__)); #define ND_OPT_PI_FLAG_ONLINK 0x80 #define ND_OPT_PI_FLAG_AUTO 0x40 @@ -335,14 +356,14 @@ struct nd_opt_rd_hdr { /* redirected header */ u_int16_t nd_opt_rh_reserved1; u_int32_t nd_opt_rh_reserved2; /* followed by IP header and data */ -}; +} __attribute__((__packed__)); struct nd_opt_mtu { /* MTU option */ u_int8_t nd_opt_mtu_type; u_int8_t nd_opt_mtu_len; u_int16_t nd_opt_mtu_reserved; u_int32_t nd_opt_mtu_mtu; -}; +}__attribute__((__packed__)); struct nd_opt_route_info { /* route info */ u_int8_t nd_opt_rti_type; @@ -350,8 +371,16 @@ struct nd_opt_route_info { /* route info */ u_int8_t nd_opt_rti_prefixlen; u_int8_t nd_opt_rti_flags; u_int32_t nd_opt_rti_lifetime; - /* followed by prefix */ -}; + /* prefix follows */ +}__attribute__((__packed__)); + +struct nd_opt_rdnss { /* recursive domain name system servers */ + u_int8_t nd_opt_rdnss_type; + u_int8_t 
nd_opt_rdnss_len; + u_int16_t nd_opt_rdnss_reserved; + u_int32_t nd_opt_rdnss_lifetime; + struct in6_addr nd_opt_rdnss_addr[1]; +} __attribute__((__packed__)); /* * icmp6 namelookup @@ -366,7 +395,7 @@ struct icmp6_namelookup { u_int8_t icmp6_nl_name[3]; #endif /* could be followed by options */ -}; +}__attribute__((__packed__)); /* * icmp6 node information @@ -375,7 +404,7 @@ struct icmp6_nodeinfo { struct icmp6_hdr icmp6_ni_hdr; u_int8_t icmp6_ni_nonce[8]; /* could be followed by reply data */ -}; +}__attribute__((__packed__)); #define ni_type icmp6_ni_hdr.icmp6_type #define ni_code icmp6_ni_hdr.icmp6_code @@ -438,7 +467,7 @@ struct ni_reply_fqdn { u_int32_t ni_fqdn_ttl; /* TTL */ u_int8_t ni_fqdn_namelen; /* length in octets of the FQDN */ u_int8_t ni_fqdn_name[3]; /* XXX: alignment */ -}; +}__attribute__((__packed__)); /* * Router Renumbering. as router-renum-08.txt @@ -449,7 +478,7 @@ struct icmp6_router_renum { /* router renumbering header */ u_int8_t rr_flags; u_int16_t rr_maxdelay; u_int32_t rr_reserved; -}; +} __attribute__((__packed__)); #define ICMP6_RR_FLAGS_TEST 0x80 #define ICMP6_RR_FLAGS_REQRESULT 0x40 @@ -471,7 +500,7 @@ struct rr_pco_match { /* match prefix part */ u_int8_t rpm_maxlen; u_int16_t rpm_reserved; struct in6_addr rpm_prefix; -}; +} __attribute__((__packed__)); #define RPM_PCO_ADD 1 #define RPM_PCO_CHANGE 2 @@ -487,7 +516,7 @@ struct rr_pco_use { /* use prefix part */ u_int32_t rpu_pltime; u_int32_t rpu_flags; struct in6_addr rpu_prefix; -}; +} __attribute__((__packed__)); #define ICMP6_RR_PCOUSE_RAFLAGS_ONLINK 0x80 #define ICMP6_RR_PCOUSE_RAFLAGS_AUTO 0x40 @@ -505,7 +534,7 @@ struct rr_result { /* router renumbering result message */ u_int8_t rrr_matchedlen; u_int32_t rrr_ifid; struct in6_addr rrr_prefix; -}; +} __attribute__((__packed__)); #if BYTE_ORDER == BIG_ENDIAN #define ICMP6_RR_RESULT_FLAGS_OOB 0x0002 #define ICMP6_RR_RESULT_FLAGS_FORBIDDEN 0x0001 @@ -613,24 +642,32 @@ struct icmp6stat { /* * Names for ICMP sysctl objects */ 
-#define ICMPV6CTL_STATS 1 -#define ICMPV6CTL_REDIRACCEPT 2 /* accept/process redirects */ -#define ICMPV6CTL_REDIRTIMEOUT 3 /* redirect cache time */ -#define ICMPV6CTL_ND6_PRUNE 6 -#define ICMPV6CTL_ND6_DELAY 8 -#define ICMPV6CTL_ND6_UMAXTRIES 9 +#define ICMPV6CTL_STATS 1 +#define ICMPV6CTL_REDIRACCEPT 2 /* accept/process redirects */ +#define ICMPV6CTL_REDIRTIMEOUT 3 /* redirect cache time */ +#if 0 /*obsoleted*/ +#define ICMPV6CTL_ERRRATELIMIT 5 /* ICMPv6 error rate limitation */ +#endif +#define ICMPV6CTL_ND6_PRUNE 6 +#define ICMPV6CTL_ND6_DELAY 8 +#define ICMPV6CTL_ND6_UMAXTRIES 9 #define ICMPV6CTL_ND6_MMAXTRIES 10 #define ICMPV6CTL_ND6_USELOOPBACK 11 /*#define ICMPV6CTL_ND6_PROXYALL 12 obsoleted, do not reuse here */ -#define ICMPV6CTL_NODEINFO 13 -#define ICMPV6CTL_ERRPPSLIMIT 14 /* ICMPv6 error pps limitation */ +#define ICMPV6CTL_NODEINFO 13 +#define ICMPV6CTL_ERRPPSLIMIT 14 /* ICMPv6 error pps limitation */ #define ICMPV6CTL_ND6_MAXNUDHINT 15 -#define ICMPV6CTL_MTUDISC_HIWAT 16 -#define ICMPV6CTL_MTUDISC_LOWAT 17 -#define ICMPV6CTL_ND6_DEBUG 18 -#define ICMPV6CTL_ND6_DRLIST 19 -#define ICMPV6CTL_ND6_PRLIST 20 -#define ICMPV6CTL_MAXID 21 +#define ICMPV6CTL_MTUDISC_HIWAT 16 +#define ICMPV6CTL_MTUDISC_LOWAT 17 +#define ICMPV6CTL_ND6_DEBUG 18 +#define ICMPV6CTL_ND6_DRLIST 19 +#define ICMPV6CTL_ND6_PRLIST 20 +#define ICMPV6CTL_MLD_MAXSRCFILTER 21 +#define ICMPV6CTL_MLD_SOMAXSRC 22 +#define ICMPV6CTL_MLD_VERSION 23 +#define ICMPV6CTL_ND6_MAXQLEN 24 +#define ICMPV6CTL_ND6_ACCEPT_6TO4 25 +#define ICMPV6CTL_MAXID 26 #ifdef KERNEL_PRIVATE #define ICMPV6CTL_NAMES { \ @@ -655,6 +692,11 @@ struct icmp6stat { { "nd6_debug", CTLTYPE_INT }, \ { 0, 0 }, \ { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { "nd6_accept_6to4", CTLTYPE_INT }, \ } #define RTF_PROBEMTU RTF_PROTO1 @@ -667,8 +709,8 @@ struct in6_multi; void icmp6_init(void); void icmp6_paramerror(struct mbuf *, int); void icmp6_error(struct mbuf *, int, int, int); -int icmp6_input(struct 
mbuf **, int *); -void icmp6_fasttimo(void); +void icmp6_error2(struct mbuf *, int, int, int, struct ifnet *); +int icmp6_input(struct mbuf **, int *, int); void icmp6_reflect(struct mbuf *, size_t); void icmp6_prepare(struct mbuf *); void icmp6_redirect_input(struct mbuf *, int); @@ -677,14 +719,17 @@ void icmp6_redirect_output(struct mbuf *, struct rtentry *); struct ip6ctlparam; void icmp6_mtudisc_update(struct ip6ctlparam *, int); +extern lck_rw_t icmp6_ifs_rwlock; /* XXX: is this the right place for these macros? */ #define icmp6_ifstat_inc(ifp, tag) \ -do { \ +do { \ + lck_rw_lock_shared(&icmp6_ifs_rwlock); \ if ((ifp) && (ifp)->if_index <= if_index \ - && (ifp)->if_index < icmp6_ifstatmax \ - && icmp6_ifstat && icmp6_ifstat[(ifp)->if_index]) { \ - icmp6_ifstat[(ifp)->if_index]->tag++; \ - } \ + && (ifp)->if_index < icmp6_ifstatmax \ + && icmp6_ifstat && icmp6_ifstat[(ifp)->if_index]) { \ + atomic_add_64(&icmp6_ifstat[(ifp)->if_index]->tag, 1); \ + } \ + lck_rw_done(&icmp6_ifs_rwlock); \ } while (0) #define icmp6_ifoutstat_inc(ifp, type, code) \ @@ -692,7 +737,7 @@ do { \ icmp6_ifstat_inc(ifp, ifs6_out_msg); \ if (type < ICMP6_INFOMSG_MASK) \ icmp6_ifstat_inc(ifp, ifs6_out_error); \ - switch(type) { \ + switch (type) { \ case ICMP6_DST_UNREACH: \ icmp6_ifstat_inc(ifp, ifs6_out_dstunreach); \ if (code == ICMP6_DST_UNREACH_ADMIN) \ @@ -713,13 +758,13 @@ do { \ case ICMP6_ECHO_REPLY: \ icmp6_ifstat_inc(ifp, ifs6_out_echoreply); \ break; \ - case MLD6_LISTENER_QUERY: \ + case MLD_LISTENER_QUERY: \ icmp6_ifstat_inc(ifp, ifs6_out_mldquery); \ break; \ - case MLD6_LISTENER_REPORT: \ + case MLD_LISTENER_REPORT: \ icmp6_ifstat_inc(ifp, ifs6_out_mldreport); \ break; \ - case MLD6_LISTENER_DONE: \ + case MLD_LISTENER_DONE: \ icmp6_ifstat_inc(ifp, ifs6_out_mlddone); \ break; \ case ND_ROUTER_SOLICIT: \ @@ -742,6 +787,12 @@ do { \ extern int icmp6_rediraccept; /* accept/process redirects */ extern int icmp6_redirtimeout; /* cache time for redirect routes */ + +#define 
ICMP6_NODEINFO_FQDNOK 0x1 +#define ICMP6_NODEINFO_NODEADDROK 0x2 +#define ICMP6_NODEINFO_TMPADDROK 0x4 +#define ICMP6_NODEINFO_GLOBALOK 0x8 + #endif /* KERNEL_PRIVATE */ #endif /* !_NETINET_ICMP6_H_ */ diff --git a/bsd/netinet/if_atm.c b/bsd/netinet/if_atm.c deleted file mode 100644 index 0fa54144f..000000000 --- a/bsd/netinet/if_atm.c +++ /dev/null @@ -1,303 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* $NetBSD: if_atm.c,v 1.6 1996/10/13 02:03:01 christos Exp $ */ - -/* - * - * Copyright (c) 1996 Charles D. Cranor and Washington University. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by Charles D. Cranor and - * Washington University. - * 4. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * $FreeBSD: src/sys/netinet/if_atm.c,v 1.8 1999/12/07 17:39:06 shin Exp $ - */ - -/* - * IP <=> ATM address resolution. 
- */ - -#if defined(INET) || defined(INET6) - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include - - -#if NATM -#include -#endif - - -#define SDL(s) ((struct sockaddr_dl *)s) - -/* - * atm_rtrequest: handle ATM rt request (in support of generic code) - * inputs: "req" = request code - * "rt" = route entry - * "sa" = sockaddr - */ - -void -atm_rtrequest(req, rt, sa) - int req; - register struct rtentry *rt; - struct sockaddr *sa; -{ - register struct sockaddr *gate = rt->rt_gateway; - struct atm_pseudoioctl api; -#if NATM - struct sockaddr_in *sin; - struct natmpcb *npcb = NULL; - struct atm_pseudohdr *aph; -#endif - static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; - - if (rt->rt_flags & RTF_GATEWAY) /* link level requests only */ - return; - - switch (req) { - - case RTM_RESOLVE: /* resolve: only happens when cloning */ - printf("atm_rtrequest: RTM_RESOLVE request detected?\n"); - break; - - case RTM_ADD: - - /* - * route added by a command (e.g. ifconfig, route, arp...). - * - * first check to see if this is not a host route, in which - * case we are being called via "ifconfig" to set the address. - */ - - if ((rt->rt_flags & RTF_HOST) == 0) { - rt_setgate(rt,rt_key(rt),(struct sockaddr *)&null_sdl); - gate = rt->rt_gateway; - SDL(gate)->sdl_type = rt->rt_ifp->if_type; - SDL(gate)->sdl_index = rt->rt_ifp->if_index; - break; - } - - if ((rt->rt_flags & RTF_CLONING) != 0) { - printf("atm_rtrequest: cloning route detected?\n"); - break; - } - if (gate->sa_family != AF_LINK || - gate->sa_len < sizeof(null_sdl)) { - log(LOG_DEBUG, "atm_rtrequest: bad gateway value"); - break; - } - -#if DIAGNOSTIC - if (rt->rt_ifp->if_ioctl == NULL) panic("atm null ioctl"); -#endif - -#if NATM - /* - * let native ATM know we are using this VCI/VPI - * (i.e. 
reserve it) - */ - sin = (struct sockaddr_in *) rt_key(rt); - if (sin->sin_family != AF_INET) - goto failed; - aph = (struct atm_pseudohdr *) LLADDR(SDL(gate)); - npcb = npcb_add(NULL, rt->rt_ifp, ATM_PH_VCI(aph), - ATM_PH_VPI(aph)); - if (npcb == NULL) - goto failed; - npcb->npcb_flags |= NPCB_IP; - npcb->ipaddr.s_addr = sin->sin_addr.s_addr; - /* XXX: move npcb to llinfo when ATM ARP is ready */ - rt->rt_llinfo = (caddr_t) npcb; - rt->rt_flags |= RTF_LLINFO; -#endif - /* - * let the lower level know this circuit is active - */ - bcopy(LLADDR(SDL(gate)), &api.aph, sizeof(api.aph)); - api.rxhand = NULL; - if (ifnet_ioctl(rt->rt_ifpm 0, SIOCATMENA, &api) != 0) { - printf("atm: couldn't add VC\n"); - goto failed; - } - - SDL(gate)->sdl_type = rt->rt_ifp->if_type; - SDL(gate)->sdl_index = rt->rt_ifp->if_index; - - break; - -failed: -#if NATM - if (npcb) { - npcb_free(npcb, NPCB_DESTROY); - rt->rt_llinfo = NULL; - rt->rt_flags &= ~RTF_LLINFO; - } -#endif - rtrequest(RTM_DELETE, rt_key(rt), (struct sockaddr *)0, - rt_mask(rt), 0, (struct rtentry **) 0); - break; - - case RTM_DELETE: - -#if NATM - /* - * tell native ATM we are done with this VC - */ - - if (rt->rt_flags & RTF_LLINFO) { - npcb_free((struct natmpcb *)rt->rt_llinfo, - NPCB_DESTROY); - rt->rt_llinfo = NULL; - rt->rt_flags &= ~RTF_LLINFO; - } -#endif - /* - * tell the lower layer to disable this circuit - */ - - bcopy(LLADDR(SDL(gate)), &api.aph, sizeof(api.aph)); - api.rxhand = NULL; - ifnet_ioctl(rt->rt_ifp, 0, SIOCATMDIS, &api); - - break; - } -} - -/* - * atmresolve: - * inputs: - * [1] "rt" = the link level route to use (or null if need to look one up) - * [2] "m" = mbuf containing the data to be sent - * [3] "dst" = sockaddr_in (IP) address of dest. 
- * output: - * [4] "desten" = ATM pseudo header which we will fill in VPI/VCI info - * return: - * 0 == resolve FAILED; note that "m" gets m_freem'd in this case - * 1 == resolve OK; desten contains result - * - * XXX: will need more work if we wish to support ATMARP in the kernel, - * but this is enough for PVCs entered via the "route" command. - */ - -int -atmresolve(rt, m, dst, desten) - -register struct rtentry *rt; -struct mbuf *m; -register struct sockaddr *dst; -register struct atm_pseudohdr *desten; /* OUT */ - -{ - struct sockaddr_dl *sdl; - - if (m->m_flags & (M_BCAST|M_MCAST)) { - log(LOG_INFO, "atmresolve: BCAST/MCAST packet detected/dumped"); - goto bad; - } - - if (rt == NULL) { - rt = RTALLOC1(dst, 0); - if (rt == NULL) goto bad; /* failed */ - rtunref(rt); /* don't keep LL references */ - if ((rt->rt_flags & RTF_GATEWAY) != 0 || - (rt->rt_flags & RTF_LLINFO) == 0 || - /* XXX: are we using LLINFO? */ - rt->rt_gateway->sa_family != AF_LINK) { - goto bad; - } - } - - /* - * note that rt_gateway is a sockaddr_dl which contains the - * atm_pseudohdr data structure for this route. we currently - * don't need any rt_llinfo info (but will if we want to support - * ATM ARP [c.f. if_ether.c]). - */ - - sdl = SDL(rt->rt_gateway); - - /* - * Check the address family and length is valid, the address - * is resolved; otherwise, try to resolve. - */ - - - if (sdl->sdl_family == AF_LINK && sdl->sdl_alen == sizeof(*desten)) { - bcopy(LLADDR(sdl), desten, sdl->sdl_alen); - return(1); /* ok, go for it! */ - } - - /* - * we got an entry, but it doesn't have valid link address - * info in it (it is prob. the interface route, which has - * sdl_alen == 0). dump packet. (fall through to "bad"). - */ - -bad: - m_freem(m); - return(0); -} -#endif /* INET */ diff --git a/bsd/netinet/if_atm.h b/bsd/netinet/if_atm.h deleted file mode 100644 index 989fa974d..000000000 --- a/bsd/netinet/if_atm.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. 
All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* $FreeBSD: src/sys/netinet/if_atm.h,v 1.2.6.1 2000/08/03 01:07:02 peter Exp $ */ -/* $NetBSD: if_atm.h,v 1.2 1996/07/03 17:17:17 chuck Exp $ */ - -/* - * - * Copyright (c) 1996 Charles D. Cranor and Washington University. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by Charles D. Cranor and - * Washington University. - * 4. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - * if_atm.h - */ -#include - -#ifdef KERNEL_PRIVATE -struct atm_pseudohdr; -struct mbuf; -struct rtentry; -struct sockaddr; - -void atm_rtrequest(int, struct rtentry *, struct sockaddr *); -int atmresolve(struct rtentry *, struct mbuf *, struct sockaddr *, - struct atm_pseudohdr *); -#endif /* KERNEL_PRIVATE */ diff --git a/bsd/netinet/if_fddi.h b/bsd/netinet/if_fddi.h deleted file mode 100644 index fb9f81f10..000000000 --- a/bsd/netinet/if_fddi.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1982, 1986, 1993 - * The Regents of the University of California. All rights reserved. - * Copyright (c) 1995 Matt Thomas (thomas@lkg.dec.com) - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)if_fddi.h 8.1 (Berkeley) 6/10/93 - * $FreeBSD: src/sys/netinet/if_fddi.h,v 1.8 1999/12/29 04:40:58 peter Exp $ - */ - -#ifndef _NETINET_IF_FDDI_H_ -#define _NETINET_IF_FDDI_H_ -#include - -/* - * Structure of an 100Mb/s FDDI header. 
- */ -struct fddi_header { - u_char fddi_fc; - u_char fddi_dhost[6]; - u_char fddi_shost[6]; -}; - -#define FDDIIPMTU 4352 -#define FDDIMTU 4470 -#define FDDIMIN 3 - -#define FDDIFC_C 0x80 /* 0b10000000 */ -#define FDDIFC_L 0x40 /* 0b01000000 */ -#define FDDIFC_F 0x30 /* 0b00110000 */ -#define FDDIFC_Z 0x0F /* 0b00001111 */ - -#define FDDIFC_LLC_ASYNC 0x50 -#define FDDIFC_LLC_PRIO0 0 -#define FDDIFC_LLC_PRIO1 1 -#define FDDIFC_LLC_PRIO2 2 -#define FDDIFC_LLC_PRIO3 3 -#define FDDIFC_LLC_PRIO4 4 -#define FDDIFC_LLC_PRIO5 5 -#define FDDIFC_LLC_PRIO6 6 -#define FDDIFC_LLC_PRIO7 7 -#define FDDIFC_LLC_SYNC 0xd0 -#define FDDIFC_SMT 0x40 - -#ifdef KERNEL_PRIVATE -#define fddibroadcastaddr etherbroadcastaddr -#define fddi_ipmulticast_min ether_ipmulticast_min -#define fddi_ipmulticast_max ether_ipmulticast_max -#define fddi_addmulti ether_addmulti -#define fddi_delmulti ether_delmulti -#define fddi_sprintf ether_sprintf - -void fddi_ifattach(struct ifnet *); -void fddi_input(struct ifnet *, struct fddi_header *, struct mbuf *); -int fddi_output(struct ifnet *, - struct mbuf *, struct sockaddr *, struct rtentry *); -#endif /* KERNEL_PRIVATE */ - -#endif /* _NETINET_IF_FDDI_H_ */ diff --git a/bsd/netinet/igmp.c b/bsd/netinet/igmp.c index bc2b80d68..d10484ddf 100644 --- a/bsd/netinet/igmp.c +++ b/bsd/netinet/igmp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,7 +25,8 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* +/*- + * Copyright (c) 2007-2009 Bruce Simpson. * Copyright (c) 1988 Stephen Deering. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. @@ -72,15 +73,19 @@ /* * Internet Group Management Protocol (IGMP) routines. + * [RFC1112, RFC2236, RFC3376] * * Written by Steve Deering, Stanford, May 1988. * Modified by Rosen Sharma, Stanford, Aug 1994. 
* Modified by Bill Fenner, Xerox PARC, Feb 1995. * Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995. + * Significantly rewritten for IGMPv3, VIMAGE, and SMP by Bruce Simpson. * * MULTICAST Revision: 3.5.1.4 */ +#include + #include #include #include @@ -89,6 +94,10 @@ #include #include #include +#include + +#include +#include #include #include @@ -100,449 +109,3659 @@ #include #include #include +#include + +#ifdef IGMP_DEBUG +__inline__ char * +inet_ntoa(struct in_addr ina) +{ + static char buf[4*sizeof "123"]; + unsigned char *ucp = (unsigned char *)&ina; -#if CONFIG_MACF_NET -#include + snprintf(buf, sizeof(buf), "%d.%d.%d.%d", + ucp[0] & 0xff, + ucp[1] & 0xff, + ucp[2] & 0xff, + ucp[3] & 0xff); + return buf; +} #endif -#ifndef __APPLE__ -static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state"); +static void igi_initvar(struct igmp_ifinfo *, struct ifnet *, int); +static struct igmp_ifinfo *igi_alloc(int); +static void igi_free(struct igmp_ifinfo *); +static void igi_delete(const struct ifnet *); +static void igmp_dispatch_queue(struct igmp_ifinfo *, struct ifqueue *, + int, const int, struct ifnet *); +static void igmp_final_leave(struct in_multi *, struct igmp_ifinfo *); +static int igmp_handle_state_change(struct in_multi *, + struct igmp_ifinfo *); +static int igmp_initial_join(struct in_multi *, struct igmp_ifinfo *); +static int igmp_input_v1_query(struct ifnet *, const struct ip *, + const struct igmp *); +static int igmp_input_v2_query(struct ifnet *, const struct ip *, + const struct igmp *); +static int igmp_input_v3_query(struct ifnet *, const struct ip *, + /*const*/ struct igmpv3 *); +static int igmp_input_v3_group_query(struct in_multi *, + int, /*const*/ struct igmpv3 *); +static int igmp_input_v1_report(struct ifnet *, /*const*/ struct ip *, + /*const*/ struct igmp *); +static int igmp_input_v2_report(struct ifnet *, /*const*/ struct ip *, + /*const*/ struct igmp *); +void igmp_sendpkt(struct mbuf *, struct ifnet *); +static __inline__ int 
igmp_isgroupreported(const struct in_addr); +static struct mbuf * + igmp_ra_alloc(void); +#ifdef IGMP_DEBUG +static const char * igmp_rec_type_to_str(const int); #endif +static void igmp_set_version(struct igmp_ifinfo *, const int); +static void igmp_flush_relq(struct igmp_ifinfo *); +static int igmp_v1v2_queue_report(struct in_multi *, const int); +static void igmp_v1v2_process_group_timer(struct in_multi *, const int); +static void igmp_v1v2_process_querier_timers(struct igmp_ifinfo *); +static void igmp_v2_update_group(struct in_multi *, const int); +static void igmp_v3_cancel_link_timers(struct igmp_ifinfo *); +static void igmp_v3_dispatch_general_query(struct igmp_ifinfo *); +static struct mbuf * + igmp_v3_encap_report(struct ifnet *, struct mbuf *); +static int igmp_v3_enqueue_group_record(struct ifqueue *, + struct in_multi *, const int, const int, const int); +static int igmp_v3_enqueue_filter_change(struct ifqueue *, + struct in_multi *); +static void igmp_v3_process_group_timers(struct igmp_ifinfo *, + struct ifqueue *, struct ifqueue *, struct in_multi *, + const int); +static int igmp_v3_merge_state_changes(struct in_multi *, + struct ifqueue *); +static void igmp_v3_suppress_group_record(struct in_multi *); +static int sysctl_igmp_ifinfo SYSCTL_HANDLER_ARGS; +static int sysctl_igmp_gsr SYSCTL_HANDLER_ARGS; +static int sysctl_igmp_default_version SYSCTL_HANDLER_ARGS; -static struct router_info * - find_rti(struct ifnet *ifp, int wait); +struct mbuf *m_raopt; /* Router Alert option */ -static struct igmpstat igmpstat; +static int interface_timers_running; /* IGMPv3 general + * query response */ +static int state_change_timers_running; /* IGMPv3 state-change + * retransmit */ +static int current_state_timers_running; /* IGMPv1/v2 host + * report; IGMPv3 g/sg + * query response */ -SYSCTL_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_RD, - &igmpstat, igmpstat, ""); +static LIST_HEAD(, igmp_ifinfo) igi_head; +static struct igmpstat_v3 igmpstat_v3 = { 
+ .igps_version = IGPS_VERSION_3, + .igps_len = sizeof(struct igmpstat_v3), +}; +static struct igmpstat igmpstat; /* old IGMPv2 stats structure */ +static struct timeval igmp_gsrdelay = {10, 0}; -static int igmp_timers_are_running; -static uint32_t igmp_all_hosts_group; -static uint32_t igmp_all_rtrs_group; -static struct mbuf *router_alert; -static struct router_info *Head; +static int igmp_recvifkludge = 1; +static int igmp_sendra = 1; +static int igmp_sendlocal = 1; +static int igmp_v1enable = 1; +static int igmp_v2enable = 1; +static int igmp_legacysupp = 0; +static int igmp_default_version = IGMP_VERSION_3; -static void igmp_sendpkt(struct in_multi *, int, uint32_t); +SYSCTL_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_RD | CTLFLAG_LOCKED, + &igmpstat, igmpstat, ""); +SYSCTL_STRUCT(_net_inet_igmp, OID_AUTO, v3stats, + CTLFLAG_RD | CTLFLAG_LOCKED, &igmpstat_v3, igmpstat_v3, ""); +SYSCTL_INT(_net_inet_igmp, OID_AUTO, recvifkludge, CTLFLAG_RW | CTLFLAG_LOCKED, + &igmp_recvifkludge, 0, + "Rewrite IGMPv1/v2 reports from 0.0.0.0 to contain subnet address"); +SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendra, CTLFLAG_RW | CTLFLAG_LOCKED, + &igmp_sendra, 0, + "Send IP Router Alert option in IGMPv2/v3 messages"); +SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendlocal, CTLFLAG_RW | CTLFLAG_LOCKED, + &igmp_sendlocal, 0, + "Send IGMP membership reports for 224.0.0.0/24 groups"); +SYSCTL_INT(_net_inet_igmp, OID_AUTO, v1enable, CTLFLAG_RW | CTLFLAG_LOCKED, + &igmp_v1enable, 0, + "Enable backwards compatibility with IGMPv1"); +SYSCTL_INT(_net_inet_igmp, OID_AUTO, v2enable, CTLFLAG_RW | CTLFLAG_LOCKED, + &igmp_v2enable, 0, + "Enable backwards compatibility with IGMPv2"); +SYSCTL_INT(_net_inet_igmp, OID_AUTO, legacysupp, CTLFLAG_RW | CTLFLAG_LOCKED, + &igmp_legacysupp, 0, + "Allow v1/v2 reports to suppress v3 group responses"); +SYSCTL_PROC(_net_inet_igmp, OID_AUTO, default_version, + CTLTYPE_INT | CTLFLAG_RW, + &igmp_default_version, 0, sysctl_igmp_default_version, "I", + "Default 
version of IGMP to run on each interface"); +SYSCTL_PROC(_net_inet_igmp, OID_AUTO, gsrdelay, + CTLTYPE_INT | CTLFLAG_RW, + &igmp_gsrdelay.tv_sec, 0, sysctl_igmp_gsr, "I", + "Rate limit for IGMPv3 Group-and-Source queries in seconds"); +#ifdef IGMP_DEBUG +int igmp_debug = 0; +SYSCTL_INT(_net_inet_igmp, OID_AUTO, + debug, CTLFLAG_RW | CTLFLAG_LOCKED, &igmp_debug, 0, ""); +#endif -void -igmp_init(void) -{ - struct ipoption *ra; +SYSCTL_NODE(_net_inet_igmp, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_LOCKED, + sysctl_igmp_ifinfo, "Per-interface IGMPv3 state"); - /* - * To avoid byte-swapping the same value over and over again. - */ - igmp_all_hosts_group = htonl(INADDR_ALLHOSTS_GROUP); - igmp_all_rtrs_group = htonl(INADDR_ALLRTRS_GROUP); +/* Lock group and attribute for igmp_mtx */ +static lck_attr_t *igmp_mtx_attr; +static lck_grp_t *igmp_mtx_grp; +static lck_grp_attr_t *igmp_mtx_grp_attr; - igmp_timers_are_running = 0; +/* + * Locking and reference counting: + * + * igmp_mtx mainly protects igi_head. In cases where both igmp_mtx and + * in_multihead_lock must be held, the former must be acquired first in order + * to maintain lock ordering. It is not a requirement that igmp_mtx be + * acquired first before in_multihead_lock, but in case both must be acquired + * in succession, the correct lock ordering must be followed. + * + * Instead of walking the if_multiaddrs list at the interface and returning + * the ifma_protospec value of a matching entry, we search the global list + * of in_multi records and find it that way; this is done with in_multihead + * lock held. Doing so avoids the race condition issues that many other BSDs + * suffer from (therefore in our implementation, ifma_protospec will never be + * NULL for as long as the in_multi is valid.) + * + * The above creates a requirement for the in_multi to stay in in_multihead + * list even after the final IGMP leave (in IGMPv3 mode) until no longer needs + * be retransmitted (this is not required for IGMPv1/v2.) 
In order to handle + * this, the request and reference counts of the in_multi are bumped up when + * the state changes to IGMP_LEAVING_MEMBER, and later dropped in the timeout + * handler. Each in_multi holds a reference to the underlying igmp_ifinfo. + * + * Thus, the permitted lock order is: + * + * igmp_mtx, in_multihead_lock, inm_lock, igi_lock + * + * Any may be taken independently, but if any are held at the same time, + * the above lock order must be followed. + */ +static decl_lck_mtx_data(, igmp_mtx); +static int igmp_timers_are_running; - /* - * Construct a Router Alert option to use in outgoing packets - */ - MGET(router_alert, M_WAIT, MT_DATA); - ra = mtod(router_alert, struct ipoption *); - ra->ipopt_dst.s_addr = 0; - ra->ipopt_list[0] = IPOPT_RA; /* Router Alert Option */ - ra->ipopt_list[1] = 0x04; /* 4 bytes long */ - ra->ipopt_list[2] = 0x00; - ra->ipopt_list[3] = 0x00; - router_alert->m_len = sizeof(ra->ipopt_dst) + ra->ipopt_list[1]; +#define IGI_ZONE_MAX 64 /* maximum elements in zone */ +#define IGI_ZONE_NAME "igmp_ifinfo" /* zone name */ - Head = (struct router_info *) 0; -} +static unsigned int igi_size; /* size of zone element */ +static struct zone *igi_zone; /* zone for igmp_ifinfo */ -static struct router_info * -find_rti( - struct ifnet *ifp, int wait) +#ifdef IGMP_DEBUG +static __inline char * +inet_ntoa_haddr(in_addr_t haddr) { - struct router_info *rti = Head; - - -#if IGMP_DEBUG - printf("[igmp.c, _find_rti] --> entering \n"); -#endif - while (rti) { - if (rti->rti_ifp == ifp) { -#if IGMP_DEBUG - printf("[igmp.c, _find_rti] --> found old entry \n"); -#endif - return rti; - } - rti = rti->rti_next; - } - - MALLOC(rti, struct router_info *, sizeof *rti, M_IGMP, wait); - if (rti != NULL) - { - rti->rti_ifp = ifp; - rti->rti_type = IGMP_V2_ROUTER; - rti->rti_time = 0; - rti->rti_next = Head; - Head = rti; - } -#if IGMP_DEBUG - if (rti) printf("[igmp.c, _find_rti] --> created an entry \n"); + struct in_addr ia; + + ia.s_addr =
htonl(haddr); + return (inet_ntoa(ia)); +} #endif - return rti; +/* + * Retrieve or set default IGMP version. + */ +static int +sysctl_igmp_default_version SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg2) + int error; + int new; + + lck_mtx_lock(&igmp_mtx); + + error = SYSCTL_OUT(req, arg1, sizeof(int)); + if (error || !req->newptr) + goto out_locked; + + new = igmp_default_version; + + error = SYSCTL_IN(req, &new, sizeof(int)); + if (error) + goto out_locked; + + if (new < IGMP_VERSION_1 || new > IGMP_VERSION_3) { + error = EINVAL; + goto out_locked; + } + + IGMP_PRINTF(("change igmp_default_version from %d to %d\n", + igmp_default_version, new)); + + igmp_default_version = new; + +out_locked: + lck_mtx_unlock(&igmp_mtx); + return (error); } -void -igmp_input( - struct mbuf *m, - int iphlen) +/* + * Retrieve or set threshold between group-source queries in seconds. + * + */ +static int +sysctl_igmp_gsr SYSCTL_HANDLER_ARGS { - struct igmp *igmp; - struct ip *ip; - int igmplen; - struct ifnet *ifp = m->m_pkthdr.rcvif; - int minlen; - struct in_multi *inm; - struct in_ifaddr *ia; - struct in_multistep step; - struct router_info *rti; - - int timer; /** timer value in the igmp query header **/ +#pragma unused(arg1, arg2) + int error; + int i; - ++igmpstat.igps_rcv_total; + lck_mtx_lock(&igmp_mtx); - ip = mtod(m, struct ip *); - igmplen = ip->ip_len; + i = igmp_gsrdelay.tv_sec; - /* - * Validate lengths - */ - if (igmplen < IGMP_MINLEN) { - ++igmpstat.igps_rcv_tooshort; - m_freem(m); - return; - } - minlen = iphlen + IGMP_MINLEN; - if ((m->m_flags & M_EXT || m->m_len < minlen) && - (m = m_pullup(m, minlen)) == 0) { - ++igmpstat.igps_rcv_tooshort; - return; - } + error = sysctl_handle_int(oidp, &i, 0, req); + if (error || !req->newptr) + goto out_locked; - /* - * Validate checksum - */ - m->m_data += iphlen; - m->m_len -= iphlen; - igmp = mtod(m, struct igmp *); - if (in_cksum(m, igmplen)) { - ++igmpstat.igps_rcv_badsum; - m_freem(m); - return; + if (i < -1 || i >= 
60) { + error = EINVAL; + goto out_locked; } - m->m_data -= iphlen; - m->m_len += iphlen; - ip = mtod(m, struct ip *); - timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE; - if (timer == 0) - timer = 1; - rti = find_rti(ifp, M_NOWAIT); - if (rti == NULL) { - m_freem(m); - return; - } + igmp_gsrdelay.tv_sec = i; - /* - * In the IGMPv2 specification, there are 3 states and a flag. - * - * In Non-Member state, we simply don't have a membership record. - * In Delaying Member state, our timer is running (inm->inm_timer) - * In Idle Member state, our timer is not running (inm->inm_timer==0) - * - * The flag is inm->inm_state, it is set to IGMP_OTHERMEMBER if - * we have heard a report from another member, or IGMP_IREPORTEDLAST - * if I sent the last report. - */ - switch (igmp->igmp_type) { +out_locked: + lck_mtx_unlock(&igmp_mtx); + return (error); +} - case IGMP_MEMBERSHIP_QUERY: - ++igmpstat.igps_rcv_queries; +/* + * Expose struct igmp_ifinfo to userland, keyed by ifindex. + * For use by ifmcstat(8). + * + */ +static int +sysctl_igmp_ifinfo SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp) + int *name; + int error; + u_int namelen; + struct ifnet *ifp; + struct igmp_ifinfo *igi; + struct igmp_ifinfo_u igi_u; - if (ifp->if_flags & IFF_LOOPBACK) - break; + name = (int *)arg1; + namelen = arg2; - if (igmp->igmp_code == 0) { - /* - * Old router. Remember that the querier on this - * interface is old, and set the timer to the - * value in RFC 1112. - */ + if (req->newptr != USER_ADDR_NULL) + return (EPERM); - rti->rti_type = IGMP_V1_ROUTER; - rti->rti_time = 0; + if (namelen != 1) + return (EINVAL); - timer = IGMP_MAX_HOST_REPORT_DELAY * PR_FASTHZ; + lck_mtx_lock(&igmp_mtx); - if (ip->ip_dst.s_addr != igmp_all_hosts_group || - igmp->igmp_group.s_addr != 0) { - ++igmpstat.igps_rcv_badqueries; - m_freem(m); - return; - } - } else { - /* - * New router. Simply do the new validity check. 
- */ - - if (igmp->igmp_group.s_addr != 0 && - !IN_MULTICAST(ntohl(igmp->igmp_group.s_addr))) { - ++igmpstat.igps_rcv_badqueries; - m_freem(m); - return; - } - } + if (name[0] <= 0 || name[0] > (u_int)if_index) { + error = ENOENT; + goto out_locked; + } - /* - * - Start the timers in all of our membership records - * that the query applies to for the interface on - * which the query arrived excl. those that belong - * to the "all-hosts" group (224.0.0.1). - * - Restart any timer that is already running but has - * a value longer than the requested timeout. - * - Use the value specified in the query message as - * the maximum timeout. - */ - lck_mtx_lock(rnh_lock); - IN_FIRST_MULTI(step, inm); - while (inm != NULL) { - if (inm->inm_ifp == ifp && - inm->inm_addr.s_addr != igmp_all_hosts_group && - (igmp->igmp_group.s_addr == 0 || - igmp->igmp_group.s_addr == inm->inm_addr.s_addr)) { - if (inm->inm_timer == 0 || - inm->inm_timer > timer) { - inm->inm_timer = - IGMP_RANDOM_DELAY(timer); - igmp_timers_are_running = 1; - } - } - IN_NEXT_MULTI(step, inm); + error = ENOENT; + + ifnet_head_lock_shared(); + ifp = ifindex2ifnet[name[0]]; + ifnet_head_done(); + if (ifp == NULL) + goto out_locked; + + bzero(&igi_u, sizeof (igi_u)); + + LIST_FOREACH(igi, &igi_head, igi_link) { + IGI_LOCK(igi); + if (ifp != igi->igi_ifp) { + IGI_UNLOCK(igi); + continue; } - lck_mtx_unlock(rnh_lock); + igi_u.igi_ifindex = igi->igi_ifp->if_index; + igi_u.igi_version = igi->igi_version; + igi_u.igi_v1_timer = igi->igi_v1_timer; + igi_u.igi_v2_timer = igi->igi_v2_timer; + igi_u.igi_v3_timer = igi->igi_v3_timer; + igi_u.igi_flags = igi->igi_flags; + igi_u.igi_rv = igi->igi_rv; + igi_u.igi_qi = igi->igi_qi; + igi_u.igi_qri = igi->igi_qri; + igi_u.igi_uri = igi->igi_uri; + IGI_UNLOCK(igi); + error = SYSCTL_OUT(req, &igi_u, sizeof (igi_u)); break; + } - case IGMP_V1_MEMBERSHIP_REPORT: - case IGMP_V2_MEMBERSHIP_REPORT: - /* - * For fast leave to work, we have to know that we are the - * last person to 
send a report for this group. Reports - * can potentially get looped back if we are a multicast - * router, so discard reports sourced by me. - */ - IFP_TO_IA(ifp, ia); - if (ia && ip->ip_src.s_addr == IA_SIN(ia)->sin_addr.s_addr) { - ifafree(&ia->ia_ifa); - break; - } +out_locked: + lck_mtx_unlock(&igmp_mtx); + return (error); +} - ++igmpstat.igps_rcv_reports; +/* + * Dispatch an entire queue of pending packet chains + * + * Must not be called with inm_lock held. + */ +static void +igmp_dispatch_queue(struct igmp_ifinfo *igi, struct ifqueue *ifq, int limit, + const int loop, struct ifnet *ifp) +{ + struct mbuf *m; + struct ip *ip; - if (ifp->if_flags & IFF_LOOPBACK) { - if (ia != NULL) - ifafree(&ia->ia_ifa); + if (igi != NULL) + IGI_LOCK_ASSERT_HELD(igi); + + for (;;) { + IF_DEQUEUE(ifq, m); + if (m == NULL) break; - } + IGMP_PRINTF(("%s: dispatch %p from %p\n", __func__, ifq, m)); + ip = mtod(m, struct ip *); + if (loop) + m->m_flags |= M_IGMP_LOOP; + if (igi != NULL) + IGI_UNLOCK(igi); + igmp_sendpkt(m, ifp); + if (igi != NULL) + IGI_LOCK(igi); + if (--limit == 0) + break; + } - if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr))) { - ++igmpstat.igps_rcv_badreports; - m_freem(m); - if (ia != NULL) - ifafree(&ia->ia_ifa); - return; - } + if (igi != NULL) + IGI_LOCK_ASSERT_HELD(igi); +} - /* - * KLUDGE: if the IP source address of the report has an - * unspecified (i.e., zero) subnet number, as is allowed for - * a booting host, replace it with the correct subnet number - * so that a process-level multicast routing demon can - * determine which subnet it arrived from. This is necessary - * to compensate for the lack of any way for a process to - * determine the arrival interface of an incoming packet. - */ - if ((ntohl(ip->ip_src.s_addr) & IN_CLASSA_NET) == 0) - if (ia) ip->ip_src.s_addr = htonl(ia->ia_subnet); +/* + * Filter outgoing IGMP report state by group. + * + * Reports are ALWAYS suppressed for ALL-HOSTS (224.0.0.1). 
+ * If the net.inet.igmp.sendlocal sysctl is 0, then IGMP reports are + * disabled for all groups in the 224.0.0.0/24 link-local scope. However, + * this may break certain IGMP snooping switches which rely on the old + * report behaviour. + * + * Return zero if the given group is one for which IGMP reports + * should be suppressed, or non-zero if reports should be issued. + */ - /* - * If we belong to the group being reported, stop - * our timer for that group. - */ - ifnet_lock_shared(ifp); - IN_LOOKUP_MULTI(igmp->igmp_group, ifp, inm); - ifnet_lock_done(ifp); +static __inline__ +int igmp_isgroupreported(const struct in_addr addr) +{ - if (inm != NULL) { - inm->inm_timer = 0; - ++igmpstat.igps_rcv_ourreports; + if (in_allhosts(addr) || + ((!igmp_sendlocal && IN_LOCAL_GROUP(ntohl(addr.s_addr))))) + return (0); - inm->inm_state = IGMP_OTHERMEMBER; - } + return (1); +} - if (ia != NULL) - ifafree(&ia->ia_ifa); - break; - } +/* + * Construct a Router Alert option to use in outgoing packets. + */ +static struct mbuf * +igmp_ra_alloc(void) +{ + struct mbuf *m; + struct ipoption *p; - /* - * Pass all valid IGMP packets up to any process(es) listening - * on a raw IGMP socket. - */ - rip_input(m, iphlen); + MGET(m, M_WAITOK, MT_DATA); + p = mtod(m, struct ipoption *); + p->ipopt_dst.s_addr = INADDR_ANY; + p->ipopt_list[0] = IPOPT_RA; /* Router Alert Option */ + p->ipopt_list[1] = 0x04; /* 4 bytes long */ + p->ipopt_list[2] = IPOPT_EOL; /* End of IP option list */ + p->ipopt_list[3] = 0x00; /* pad byte */ + m->m_len = sizeof(p->ipopt_dst) + p->ipopt_list[1]; + + return (m); } -int -igmp_joingroup(struct in_multi *inm) +/* + * Attach IGMP when PF_INET is attached to an interface. 
+ */ +struct igmp_ifinfo * +igmp_domifattach(struct ifnet *ifp, int how) { + struct igmp_ifinfo *igi; - if (inm->inm_addr.s_addr == igmp_all_hosts_group - || inm->inm_ifp->if_flags & IFF_LOOPBACK) { - inm->inm_timer = 0; - inm->inm_state = IGMP_OTHERMEMBER; - } else { - inm->inm_rti = find_rti(inm->inm_ifp, M_WAITOK); - if (inm->inm_rti == NULL) return ENOMEM; - igmp_sendpkt(inm, inm->inm_rti->rti_type, 0); - inm->inm_timer = IGMP_RANDOM_DELAY( - IGMP_MAX_HOST_REPORT_DELAY*PR_FASTHZ); - inm->inm_state = IGMP_IREPORTEDLAST; - igmp_timers_are_running = 1; - } - return 0; + IGMP_PRINTF(("%s: called for ifp %p(%s)\n", + __func__, ifp, ifp->if_name)); + + igi = igi_alloc(how); + if (igi == NULL) + return (NULL); + + lck_mtx_lock(&igmp_mtx); + + IGI_LOCK(igi); + igi_initvar(igi, ifp, 0); + igi->igi_debug |= IFD_ATTACHED; + IGI_ADDREF_LOCKED(igi); /* hold a reference for igi_head */ + IGI_ADDREF_LOCKED(igi); /* hold a reference for caller */ + IGI_UNLOCK(igi); + + LIST_INSERT_HEAD(&igi_head, igi, igi_link); + + lck_mtx_unlock(&igmp_mtx); + + IGMP_PRINTF(("allocate igmp_ifinfo for ifp %p(%s)\n", + ifp, ifp->if_name)); + + return (igi); } +/* + * Attach IGMP when PF_INET is reattached to an interface. Caller is + * expected to have an outstanding reference to the igi. 
+ */ void -igmp_leavegroup(struct in_multi *inm) +igmp_domifreattach(struct igmp_ifinfo *igi) { - if (inm->inm_state == IGMP_IREPORTEDLAST && - inm->inm_addr.s_addr != igmp_all_hosts_group && - !(inm->inm_ifp->if_flags & IFF_LOOPBACK) && - inm->inm_rti->rti_type != IGMP_V1_ROUTER) - igmp_sendpkt(inm, IGMP_V2_LEAVE_GROUP, igmp_all_rtrs_group); + struct ifnet *ifp; + + lck_mtx_lock(&igmp_mtx); + + IGI_LOCK(igi); + VERIFY(!(igi->igi_debug & IFD_ATTACHED)); + ifp = igi->igi_ifp; + VERIFY(ifp != NULL); + igi_initvar(igi, ifp, 1); + igi->igi_debug |= IFD_ATTACHED; + IGI_ADDREF_LOCKED(igi); /* hold a reference for igi_head */ + IGI_UNLOCK(igi); + + LIST_INSERT_HEAD(&igi_head, igi, igi_link); + + lck_mtx_unlock(&igmp_mtx); + + IGMP_PRINTF(("reattached igmp_ifinfo for ifp %p(%s)\n", + ifp, ifp->if_name)); } +/* + * Hook for domifdetach. + */ void -igmp_fasttimo(void) +igmp_domifdetach(struct ifnet *ifp) { - struct in_multi *inm; - struct in_multistep step; + IGMP_PRINTF(("%s: called for ifp %p(%s%d)\n", + __func__, ifp, ifp->if_name, ifp->if_unit)); + + lck_mtx_lock(&igmp_mtx); + igi_delete(ifp); + lck_mtx_unlock(&igmp_mtx); +} + +/* + * Called at interface detach time. Note that we only flush all deferred + * responses and record releases; all remaining inm records and their source + * entries related to this interface are left intact, in order to handle + * the reattach case. + */ +static void +igi_delete(const struct ifnet *ifp) +{ + struct igmp_ifinfo *igi, *tigi; + + lck_mtx_assert(&igmp_mtx, LCK_MTX_ASSERT_OWNED); + + LIST_FOREACH_SAFE(igi, &igi_head, igi_link, tigi) { + IGI_LOCK(igi); + if (igi->igi_ifp == ifp) { + /* + * Free deferred General Query responses. 
+ */ + IF_DRAIN(&igi->igi_gq); + IF_DRAIN(&igi->igi_v2q); + igmp_flush_relq(igi); + VERIFY(SLIST_EMPTY(&igi->igi_relinmhead)); + igi->igi_debug &= ~IFD_ATTACHED; + IGI_UNLOCK(igi); + + LIST_REMOVE(igi, igi_link); + IGI_REMREF(igi); /* release igi_head reference */ + return; + } + IGI_UNLOCK(igi); + } + panic("%s: igmp_ifinfo not found for ifp %p\n", __func__, ifp); +} + +static void +igi_initvar(struct igmp_ifinfo *igi, struct ifnet *ifp, int reattach) +{ + IGI_LOCK_ASSERT_HELD(igi); + + igi->igi_ifp = ifp; + igi->igi_version = igmp_default_version; + igi->igi_flags = 0; + igi->igi_rv = IGMP_RV_INIT; + igi->igi_qi = IGMP_QI_INIT; + igi->igi_qri = IGMP_QRI_INIT; + igi->igi_uri = IGMP_URI_INIT; + + /* ifnet is not yet attached; no need to hold ifnet lock */ + if (!(ifp->if_flags & IFF_MULTICAST)) + igi->igi_flags |= IGIF_SILENT; + + if (!reattach) + SLIST_INIT(&igi->igi_relinmhead); /* - * Quick check to see if any work needs to be done, in order - * to minimize the overhead of fasttimo processing. + * Responses to general queries are subject to bounds. */ + igi->igi_gq.ifq_maxlen = IGMP_MAX_RESPONSE_PACKETS; + igi->igi_v2q.ifq_maxlen = IGMP_MAX_RESPONSE_PACKETS; +} - if (!igmp_timers_are_running) - return; +static struct igmp_ifinfo * +igi_alloc(int how) +{ + struct igmp_ifinfo *igi; - igmp_timers_are_running = 0; - IN_FIRST_MULTI(step, inm); - while (inm != NULL) { - if (inm->inm_timer == 0) { - /* do nothing */ - } else if ((--inm->inm_timer == 0) && (inm->inm_rti != NULL)) { - igmp_sendpkt(inm, inm->inm_rti->rti_type, 0); - inm->inm_state = IGMP_IREPORTEDLAST; - } else { - igmp_timers_are_running = 1; - } - IN_NEXT_MULTI(step, inm); + igi = (how == M_WAITOK) ? 
zalloc(igi_zone) : zalloc_noblock(igi_zone); + if (igi != NULL) { + bzero(igi, igi_size); + lck_mtx_init(&igi->igi_lock, igmp_mtx_grp, igmp_mtx_attr); + igi->igi_debug |= IFD_ALLOC; + } + return (igi); +} + +static void +igi_free(struct igmp_ifinfo *igi) +{ + IGI_LOCK(igi); + if (igi->igi_debug & IFD_ATTACHED) { + panic("%s: attached igi=%p is being freed", __func__, igi); + /* NOTREACHED */ + } else if (igi->igi_ifp != NULL) { + panic("%s: ifp not NULL for igi=%p", __func__, igi); + /* NOTREACHED */ + } else if (!(igi->igi_debug & IFD_ALLOC)) { + panic("%s: igi %p cannot be freed", __func__, igi); + /* NOTREACHED */ + } else if (igi->igi_refcnt != 0) { + panic("%s: non-zero refcnt igi=%p", __func__, igi); + /* NOTREACHED */ } + igi->igi_debug &= ~IFD_ALLOC; + IGI_UNLOCK(igi); + + lck_mtx_destroy(&igi->igi_lock, igmp_mtx_grp); + zfree(igi_zone, igi); } void -igmp_slowtimo(void) +igi_addref(struct igmp_ifinfo *igi, int locked) { - struct router_info *rti = Head; + if (!locked) + IGI_LOCK_SPIN(igi); + else + IGI_LOCK_ASSERT_HELD(igi); -#if IGMP_DEBUG - printf("[igmp.c,_slowtimo] -- > entering \n"); -#endif - while (rti) { - if (rti->rti_type == IGMP_V1_ROUTER) { - rti->rti_time++; - if (rti->rti_time >= IGMP_AGE_THRESHOLD) { - rti->rti_type = IGMP_V2_ROUTER; - } - } - rti = rti->rti_next; + if (++igi->igi_refcnt == 0) { + panic("%s: igi=%p wraparound refcnt", __func__, igi); + /* NOTREACHED */ } -#if IGMP_DEBUG - printf("[igmp.c,_slowtimo] -- > exiting \n"); -#endif + if (!locked) + IGI_UNLOCK(igi); } -static void -igmp_sendpkt(struct in_multi *inm, int type, uint32_t addr) +void +igi_remref(struct igmp_ifinfo *igi) { - struct mbuf *m; - struct igmp *igmp; - struct ip *ip; - struct ip_moptions imo; - struct route ro; + struct ifnet *ifp; - MGETHDR(m, M_DONTWAIT, MT_HEADER); /* MAC-OK */ - if (m == NULL) - return; + IGI_LOCK_SPIN(igi); - m->m_pkthdr.rcvif = lo_ifp; -#if CONFIG_MACF_NET - mac_mbuf_label_associate_linklayer(inm->inm_ifp, m); -#endif - m->m_pkthdr.len = 
sizeof(struct ip) + IGMP_MINLEN; - MH_ALIGN(m, IGMP_MINLEN + sizeof(struct ip)); - m->m_data += sizeof(struct ip); - m->m_len = IGMP_MINLEN; - m->m_pkthdr.csum_flags = 0; - m->m_pkthdr.csum_data = 0; - igmp = mtod(m, struct igmp *); - igmp->igmp_type = type; - igmp->igmp_code = 0; - igmp->igmp_group = inm->inm_addr; - igmp->igmp_cksum = 0; - igmp->igmp_cksum = in_cksum(m, IGMP_MINLEN); - - m->m_data -= sizeof(struct ip); - m->m_len += sizeof(struct ip); - ip = mtod(m, struct ip *); - ip->ip_tos = 0; - ip->ip_len = sizeof(struct ip) + IGMP_MINLEN; - ip->ip_off = 0; - ip->ip_p = IPPROTO_IGMP; - ip->ip_src.s_addr = INADDR_ANY; - ip->ip_dst.s_addr = addr ? addr : igmp->igmp_group.s_addr; - - imo.imo_multicast_ifp = inm->inm_ifp; - imo.imo_multicast_ttl = 1; - imo.imo_multicast_vif = -1; -#if MROUTING - /* - * Request loopback of the report if we are acting as a multicast - * router, so that the process-level routing demon can hear it. - */ - imo.imo_multicast_loop = (ip_mrouter != NULL); -#else - imo.imo_multicast_loop = 0; -#endif + if (igi->igi_refcnt == 0) { + panic("%s: igi=%p negative refcnt", __func__, igi); + /* NOTREACHED */ + } + + --igi->igi_refcnt; + if (igi->igi_refcnt > 0) { + IGI_UNLOCK(igi); + return; + } + + ifp = igi->igi_ifp; + igi->igi_ifp = NULL; + IF_DRAIN(&igi->igi_gq); + IF_DRAIN(&igi->igi_v2q); + igmp_flush_relq(igi); + VERIFY(SLIST_EMPTY(&igi->igi_relinmhead)); + IGI_UNLOCK(igi); + + IGMP_PRINTF(("%s: freeing igmp_ifinfo for ifp %p(%s%d)\n", + __func__, ifp, ifp->if_name, ifp->if_unit)); + + igi_free(igi); +} + +/* + * Process a received IGMPv1 query. + * Return non-zero if the message should be dropped. + */ +static int +igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip, + const struct igmp *igmp) +{ + struct igmp_ifinfo *igi; + struct in_multi *inm; + struct in_multistep step; /* - * XXX - * Do we have to worry about reentrancy here? Don't think so. + * IGMPv1 Host Membership Queries SHOULD always be addressed to + * 224.0.0.1. 
They are always treated as General Queries. + * igmp_group is always ignored. Do not drop it as a userland + * daemon may wish to see it. */ - bzero(&ro, sizeof (ro)); - (void) ip_output(m, router_alert, &ro, 0, &imo, NULL); - if (ro.ro_rt != NULL) { - rtfree(ro.ro_rt); - ro.ro_rt = NULL; + if (!in_allhosts(ip->ip_dst) || !in_nullhost(igmp->igmp_group)) { + IGMPSTAT_INC(igps_rcv_badqueries); + OIGMPSTAT_INC(igps_rcv_badqueries); + return (0); } + IGMPSTAT_INC(igps_rcv_gen_queries); - ++igmpstat.igps_snd_reports; -} + igi = IGMP_IFINFO(ifp); + VERIFY(igi != NULL); + + IGI_LOCK(igi); + if (igi->igi_flags & IGIF_LOOPBACK) { + IGMP_PRINTF(("ignore v1 query on IGIF_LOOPBACK ifp %p(%s%d)\n", + ifp, ifp->if_name, ifp->if_unit)); + IGI_UNLOCK(igi); + return (0); + } + /* + * Switch to IGMPv1 host compatibility mode. + */ + igmp_set_version(igi, IGMP_VERSION_1); + IGI_UNLOCK(igi); + IGMP_PRINTF(("process v1 query on ifp %p(%s%d)\n", ifp, ifp->if_name, + ifp->if_unit)); + + /* + * Start the timers in all of our group records + * for the interface on which the query arrived, + * except those which are already running. + */ + in_multihead_lock_shared(); + IN_FIRST_MULTI(step, inm); + while (inm != NULL) { + INM_LOCK(inm); + if (inm->inm_ifp != ifp) + goto next; + if (inm->inm_timer != 0) + goto next; + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + break; + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + case IGMP_REPORTING_MEMBER: + case IGMP_IDLE_MEMBER: + case IGMP_LAZY_MEMBER: + case IGMP_SLEEPING_MEMBER: + case IGMP_AWAKENING_MEMBER: + inm->inm_state = IGMP_REPORTING_MEMBER; + inm->inm_timer = IGMP_RANDOM_DELAY( + IGMP_V1V2_MAX_RI * PR_SLOWHZ); + current_state_timers_running = 1; + break; + case IGMP_LEAVING_MEMBER: + break; + } +next: + INM_UNLOCK(inm); + IN_NEXT_MULTI(step, inm); + } + in_multihead_lock_done(); + + return (0); +} + +/* + * Process a received IGMPv2 general or group-specific query. 
+ */ +static int +igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip, + const struct igmp *igmp) +{ + struct igmp_ifinfo *igi; + struct in_multi *inm; + int is_general_query; + uint16_t timer; + + is_general_query = 0; + + /* + * Validate address fields upfront. + */ + if (in_nullhost(igmp->igmp_group)) { + /* + * IGMPv2 General Query. + * If this was not sent to the all-hosts group, ignore it. + */ + if (!in_allhosts(ip->ip_dst)) + return (0); + IGMPSTAT_INC(igps_rcv_gen_queries); + is_general_query = 1; + } else { + /* IGMPv2 Group-Specific Query. */ + IGMPSTAT_INC(igps_rcv_group_queries); + } + + igi = IGMP_IFINFO(ifp); + VERIFY(igi != NULL); + + IGI_LOCK(igi); + if (igi->igi_flags & IGIF_LOOPBACK) { + IGMP_PRINTF(("ignore v2 query on IGIF_LOOPBACK ifp %p(%s%d)\n", + ifp, ifp->if_name, ifp->if_unit)); + IGI_UNLOCK(igi); + return(0); + } + /* + * Ignore v2 query if in v1 Compatibility Mode. + */ + if (igi->igi_version == IGMP_VERSION_1) { + IGI_UNLOCK(igi); + return (0); + } + igmp_set_version(igi, IGMP_VERSION_2); + IGI_UNLOCK(igi); + + timer = igmp->igmp_code * PR_SLOWHZ / IGMP_TIMER_SCALE; + if (timer == 0) + timer = 1; + + if (is_general_query) { + struct in_multistep step; + + IGMP_PRINTF(("process v2 general query on ifp %p(%s%d)\n", + ifp, ifp->if_name, ifp->if_unit)); + /* + * For each reporting group joined on this + * interface, kick the report timer. + */ + in_multihead_lock_shared(); + IN_FIRST_MULTI(step, inm); + while (inm != NULL) { + INM_LOCK(inm); + if (inm->inm_ifp == ifp) + igmp_v2_update_group(inm, timer); + INM_UNLOCK(inm); + IN_NEXT_MULTI(step, inm); + } + in_multihead_lock_done(); + } else { + /* + * Group-specific IGMPv2 query, we need only + * look up the single group to process it. 
+ */ + in_multihead_lock_shared(); + IN_LOOKUP_MULTI(&igmp->igmp_group, ifp, inm); + in_multihead_lock_done(); + if (inm != NULL) { + INM_LOCK(inm); + IGMP_PRINTF(("process v2 query %s on ifp %p(%s%d)\n", + inet_ntoa(igmp->igmp_group), ifp, ifp->if_name, + ifp->if_unit)); + igmp_v2_update_group(inm, timer); + INM_UNLOCK(inm); + INM_REMREF(inm); /* from IN_LOOKUP_MULTI */ + } + } + + return (0); +} + +/* + * Update the report timer on a group in response to an IGMPv2 query. + * + * If we are becoming the reporting member for this group, start the timer. + * If we already are the reporting member for this group, and timer is + * below the threshold, reset it. + * + * We may be updating the group for the first time since we switched + * to IGMPv3. If we are, then we must clear any recorded source lists, + * and transition to REPORTING state; the group timer is overloaded + * for group and group-source query responses. + * + * Unlike IGMPv3, the delay per group should be jittered + * to avoid bursts of IGMPv2 reports. 
+ */ +static void +igmp_v2_update_group(struct in_multi *inm, const int timer) +{ + + IGMP_PRINTF(("%s: %s/%s%d timer=%d\n", __func__, + inet_ntoa(inm->inm_addr), inm->inm_ifp->if_name, + inm->inm_ifp->if_unit, timer)); + + INM_LOCK_ASSERT_HELD(inm); + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + break; + case IGMP_REPORTING_MEMBER: + if (inm->inm_timer != 0 && + inm->inm_timer <= timer) { + IGMP_PRINTF(("%s: REPORTING and timer running, " + "skipping.\n", __func__)); + break; + } + /* FALLTHROUGH */ + case IGMP_SG_QUERY_PENDING_MEMBER: + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_IDLE_MEMBER: + case IGMP_LAZY_MEMBER: + case IGMP_AWAKENING_MEMBER: + IGMP_PRINTF(("%s: ->REPORTING\n", __func__)); + inm->inm_state = IGMP_REPORTING_MEMBER; + inm->inm_timer = IGMP_RANDOM_DELAY(timer); + current_state_timers_running = 1; + break; + case IGMP_SLEEPING_MEMBER: + IGMP_PRINTF(("%s: ->AWAKENING\n", __func__)); + inm->inm_state = IGMP_AWAKENING_MEMBER; + break; + case IGMP_LEAVING_MEMBER: + break; + } +} + +/* + * Process a received IGMPv3 general, group-specific or + * group-and-source-specific query. + * Assumes m has already been pulled up to the full IGMP message length. + * Return 0 if successful, otherwise an appropriate error code is returned. + */ +static int +igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip, + /*const*/ struct igmpv3 *igmpv3) +{ + struct igmp_ifinfo *igi; + struct in_multi *inm; + int is_general_query; + uint32_t maxresp, nsrc, qqi; + uint16_t timer; + uint8_t qrv; + + is_general_query = 0; + + IGMP_PRINTF(("process v3 query on ifp %p(%s%d)\n", ifp, ifp->if_name, + ifp->if_unit)); + + maxresp = igmpv3->igmp_code; /* in 1/10ths of a second */ + if (maxresp >= 128) { + maxresp = IGMP_MANT(igmpv3->igmp_code) << + (IGMP_EXP(igmpv3->igmp_code) + 3); + } + + /* + * Robustness must never be less than 2 for on-wire IGMPv3. 
+ * FUTURE: Check if ifp has IGIF_LOOPBACK set, as we will make + * an exception for interfaces whose IGMPv3 state changes + * are redirected to loopback (e.g. MANET). + */ + qrv = IGMP_QRV(igmpv3->igmp_misc); + if (qrv < 2) { + IGMP_PRINTF(("%s: clamping qrv %d to %d\n", __func__, + qrv, IGMP_RV_INIT)); + qrv = IGMP_RV_INIT; + } + + qqi = igmpv3->igmp_qqi; + if (qqi >= 128) { + qqi = IGMP_MANT(igmpv3->igmp_qqi) << + (IGMP_EXP(igmpv3->igmp_qqi) + 3); + } + + timer = maxresp * PR_SLOWHZ / IGMP_TIMER_SCALE; + if (timer == 0) + timer = 1; + + nsrc = ntohs(igmpv3->igmp_numsrc); + + /* + * Validate address fields and versions upfront before + * accepting v3 query. + */ + if (in_nullhost(igmpv3->igmp_group)) { + /* + * IGMPv3 General Query. + * + * General Queries SHOULD be directed to 224.0.0.1. + * A general query with a source list has undefined + * behaviour; discard it. + */ + IGMPSTAT_INC(igps_rcv_gen_queries); + if (!in_allhosts(ip->ip_dst) || nsrc > 0) { + IGMPSTAT_INC(igps_rcv_badqueries); + OIGMPSTAT_INC(igps_rcv_badqueries); + return (0); + } + is_general_query = 1; + } else { + /* Group or group-source specific query. */ + if (nsrc == 0) + IGMPSTAT_INC(igps_rcv_group_queries); + else + IGMPSTAT_INC(igps_rcv_gsr_queries); + } + + igi = IGMP_IFINFO(ifp); + VERIFY(igi != NULL); + + IGI_LOCK(igi); + if (igi->igi_flags & IGIF_LOOPBACK) { + IGMP_PRINTF(("ignore v3 query on IGIF_LOOPBACK ifp %p(%s%d)\n", + ifp, ifp->if_name, ifp->if_unit)); + IGI_UNLOCK(igi); + return (0); + } + + /* + * Discard the v3 query if we're in Compatibility Mode. + * The RFC is not obviously worded that hosts need to stay in + * compatibility mode until the Old Version Querier Present + * timer expires. 
+ */ + if (igi->igi_version != IGMP_VERSION_3) { + IGMP_PRINTF(("ignore v3 query in v%d mode on ifp %p(%s%d)\n", + igi->igi_version, ifp, ifp->if_name, ifp->if_unit)); + IGI_UNLOCK(igi); + return (0); + } + + igmp_set_version(igi, IGMP_VERSION_3); + igi->igi_rv = qrv; + igi->igi_qi = qqi; + igi->igi_qri = maxresp; + + + IGMP_PRINTF(("%s: qrv %d qi %d qri %d\n", __func__, qrv, qqi, + maxresp)); + + if (is_general_query) { + /* + * Schedule a current-state report on this ifp for + * all groups, possibly containing source lists. + * If there is a pending General Query response + * scheduled earlier than the selected delay, do + * not schedule any other reports. + * Otherwise, reset the interface timer. + */ + IGMP_PRINTF(("process v3 general query on ifp %p(%s%d)\n", + ifp, ifp->if_name, ifp->if_unit)); + if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) { + igi->igi_v3_timer = IGMP_RANDOM_DELAY(timer); + interface_timers_running = 1; + } + IGI_UNLOCK(igi); + } else { + IGI_UNLOCK(igi); + /* + * Group-source-specific queries are throttled on + * a per-group basis to defeat denial-of-service attempts. + * Queries for groups we are not a member of on this + * link are simply ignored. + */ + in_multihead_lock_shared(); + IN_LOOKUP_MULTI(&igmpv3->igmp_group, ifp, inm); + in_multihead_lock_done(); + if (inm == NULL) + return (0); + + INM_LOCK(inm); +#ifndef __APPLE__ + /* TODO: need ratecheck equivalent */ + if (nsrc > 0) { + if (!ratecheck(&inm->inm_lastgsrtv, + &igmp_gsrdelay)) { + IGMP_PRINTF(("%s: GS query throttled.\n", + __func__)); + IGMPSTAT_INC(igps_drop_gsr_queries); + INM_UNLOCK(inm); + INM_REMREF(inm); /* from IN_LOOKUP_MULTI */ + return (0); + } + } +#endif + IGMP_PRINTF(("process v3 %s query on ifp %p(%s%d)\n", + inet_ntoa(igmpv3->igmp_group), ifp, ifp->if_name, + ifp->if_unit)); + /* + * If there is a pending General Query response + * scheduled sooner than the selected delay, no + * further report need be scheduled. 
+ * Otherwise, prepare to respond to the + * group-specific or group-and-source query. + */ + IGI_LOCK(igi); + if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) { + IGI_UNLOCK(igi); + igmp_input_v3_group_query(inm, timer, igmpv3); + } else { + IGI_UNLOCK(igi); + } + INM_UNLOCK(inm); + INM_REMREF(inm); /* from IN_LOOKUP_MULTI */ + } + + return (0); +} + +/* + * Process a received IGMPv3 group-specific or group-and-source-specific + * query. + * + * inm is the group record and must be locked by the caller. timer is the + * delay handed to IGMP_RANDOM_DELAY() when a response is scheduled. igmpv3 + * points at the query header; the source addresses, if any, are read from + * the memory that follows it. + * + * Return <0 if any error occurred; otherwise 0 or the last result of + * inm_record_source(). The caller in igmp_input_v3_query() currently + * ignores the value. + */ +static int +igmp_input_v3_group_query(struct in_multi *inm, + int timer, /*const*/ struct igmpv3 *igmpv3) +{ + int retval; + uint16_t nsrc; + + INM_LOCK_ASSERT_HELD(inm); + + retval = 0; + + /* Only the three states listed last go on to schedule a response. */ + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + case IGMP_SLEEPING_MEMBER: + case IGMP_LAZY_MEMBER: + case IGMP_AWAKENING_MEMBER: + case IGMP_IDLE_MEMBER: + case IGMP_LEAVING_MEMBER: + return (retval); + case IGMP_REPORTING_MEMBER: + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + break; + } + + nsrc = ntohs(igmpv3->igmp_numsrc); + + /* + * Deal with group-specific queries upfront. + * If any group query is already pending, purge any recorded + * source-list state if it exists, and schedule a query response + * for this group-specific query. + */ + if (nsrc == 0) { + if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER || + inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) { + inm_clear_recorded(inm); + timer = min(inm->inm_timer, timer); + } + inm->inm_state = IGMP_G_QUERY_PENDING_MEMBER; + inm->inm_timer = IGMP_RANDOM_DELAY(timer); + current_state_timers_running = 1; + return (retval); + } + + /* + * Deal with the case where a group-and-source-specific query has + * been received but a group-specific query is already pending. 
+ */ + if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER) { + timer = min(inm->inm_timer, timer); + inm->inm_timer = IGMP_RANDOM_DELAY(timer); + current_state_timers_running = 1; + return (retval); + } + + /* + * Finally, deal with the case where a group-and-source-specific + * query has been received, where a response to a previous g-s-r + * query exists, or none exists. + * In this case, we need to parse the source-list which the Querier + * has provided us with and check if we have any source list filter + * entries at T1 for these sources. If we do not, there is no need + * to schedule a report and the query may be dropped. + * If we do, we must record them and schedule a current-state + * report for those sources. + * FIXME: Handling source lists larger than 1 mbuf requires that + * we pass the mbuf chain pointer down to this function, and use + * m_getptr() to walk the chain. + */ + if (inm->inm_nsrc > 0) { + const struct in_addr *ap; + int i, nrecorded; + + /* The source list starts right behind the query header. */ + ap = (const struct in_addr *)(igmpv3 + 1); + nrecorded = 0; + for (i = 0; i < nsrc; i++, ap++) { + retval = inm_record_source(inm, ap->s_addr); + if (retval < 0) + break; + nrecorded += retval; + } + if (nrecorded > 0) { + IGMP_PRINTF(("%s: schedule response to SG query\n", + __func__)); + inm->inm_state = IGMP_SG_QUERY_PENDING_MEMBER; + inm->inm_timer = IGMP_RANDOM_DELAY(timer); + current_state_timers_running = 1; + } + } + + return (retval); +} + +/* + * Process a received IGMPv1 host membership report. + * + * NOTE: 0.0.0.0 workaround breaks const correctness. 
+ * + * Returns 0 when the report was consumed or ignored, or EINVAL when the + * group address is not multicast or differs from the IP destination. + */ +static int +igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip, + /*const*/ struct igmp *igmp) +{ + struct in_ifaddr *ia; + struct in_multi *inm; + + IGMPSTAT_INC(igps_rcv_reports); + OIGMPSTAT_INC(igps_rcv_reports); + + if (ifp->if_flags & IFF_LOOPBACK) + return (0); + + /* + * The group must be a multicast address and must equal the IP + * destination. The two tests are separate operands of ||, so that + * IN_MULTICAST() sees the group address itself rather than the + * truth value of a larger expression. + */ + if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) || + !in_hosteq(igmp->igmp_group, ip->ip_dst)) { + IGMPSTAT_INC(igps_rcv_badreports); + OIGMPSTAT_INC(igps_rcv_badreports); + return (EINVAL); + } + + /* + * RFC 3376, Section 4.2.13, 9.2, 9.3: + * Booting clients may use the source address 0.0.0.0. Some + * IGMP daemons may not know how to use IP_RECVIF to determine + * the interface upon which this message was received. + * Replace 0.0.0.0 with the subnet address if told to do so. + */ + if (igmp_recvifkludge && in_nullhost(ip->ip_src)) { + IFP_TO_IA(ifp, ia); + if (ia != NULL) { + IFA_LOCK(&ia->ia_ifa); + ip->ip_src.s_addr = htonl(ia->ia_subnet); + IFA_UNLOCK(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); + } + } + + IGMP_PRINTF(("process v1 report %s on ifp %p(%s%d)\n", + inet_ntoa(igmp->igmp_group), ifp, ifp->if_name, ifp->if_unit)); + + /* + * IGMPv1 report suppression. + * If we are a member of this group, and our membership should be + * reported, stop our group timer and transition to the 'lazy' state. + */ + in_multihead_lock_shared(); + IN_LOOKUP_MULTI(&igmp->igmp_group, ifp, inm); + in_multihead_lock_done(); + if (inm != NULL) { + struct igmp_ifinfo *igi; + + INM_LOCK(inm); + + igi = inm->inm_igi; + VERIFY(igi != NULL); + + IGMPSTAT_INC(igps_rcv_ourreports); + OIGMPSTAT_INC(igps_rcv_ourreports); + + /* + * If we are in IGMPv3 host mode, do not allow the + * other host's IGMPv1 report to suppress our reports + * unless explicitly configured to do so. 
+ */ + IGI_LOCK(igi); + if (igi->igi_version == IGMP_VERSION_3) { + if (igmp_legacysupp) + igmp_v3_suppress_group_record(inm); + IGI_UNLOCK(igi); + INM_UNLOCK(inm); + INM_REMREF(inm); /* from IN_LOOKUP_MULTI */ + return (0); + } + + INM_LOCK_ASSERT_HELD(inm); + inm->inm_timer = 0; + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + break; + case IGMP_IDLE_MEMBER: + case IGMP_LAZY_MEMBER: + case IGMP_AWAKENING_MEMBER: + IGMP_PRINTF(("report suppressed for %s on ifp %p(%s%d)\n", + inet_ntoa(igmp->igmp_group), ifp, ifp->if_name, + ifp->if_unit)); + /* FALLTHROUGH */ + case IGMP_SLEEPING_MEMBER: + inm->inm_state = IGMP_SLEEPING_MEMBER; + break; + case IGMP_REPORTING_MEMBER: + IGMP_PRINTF(("report suppressed for %s on ifp %p(%s%d)\n", + inet_ntoa(igmp->igmp_group), ifp, ifp->if_name, + ifp->if_unit)); + if (igi->igi_version == IGMP_VERSION_1) + inm->inm_state = IGMP_LAZY_MEMBER; + else if (igi->igi_version == IGMP_VERSION_2) + inm->inm_state = IGMP_SLEEPING_MEMBER; + break; + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + case IGMP_LEAVING_MEMBER: + break; + } + IGI_UNLOCK(igi); + INM_UNLOCK(inm); + INM_REMREF(inm); /* from IN_LOOKUP_MULTI */ + } + + return (0); +} + +/* + * Process a received IGMPv2 host membership report. + * + * NOTE: 0.0.0.0 workaround breaks const correctness. + */ +static int +igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip, + /*const*/ struct igmp *igmp) +{ + struct in_ifaddr *ia; + struct in_multi *inm; + + /* + * Make sure we don't hear our own membership report. Fast + * leave requires knowing that we are the only member of a + * group. 
+ */ + IFP_TO_IA(ifp, ia); + if (ia != NULL) { + IFA_LOCK(&ia->ia_ifa); + if (in_hosteq(ip->ip_src, IA_SIN(ia)->sin_addr)) { + IFA_UNLOCK(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); + return (0); + } + IFA_UNLOCK(&ia->ia_ifa); + } + + IGMPSTAT_INC(igps_rcv_reports); + OIGMPSTAT_INC(igps_rcv_reports); + + if (ifp->if_flags & IFF_LOOPBACK) { + if (ia != NULL) + IFA_REMREF(&ia->ia_ifa); + return (0); + } + + if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) || + !in_hosteq(igmp->igmp_group, ip->ip_dst)) { + if (ia != NULL) + IFA_REMREF(&ia->ia_ifa); + IGMPSTAT_INC(igps_rcv_badreports); + OIGMPSTAT_INC(igps_rcv_badreports); + return (EINVAL); + } + + /* + * RFC 3376, Section 4.2.13, 9.2, 9.3: + * Booting clients may use the source address 0.0.0.0. Some + * IGMP daemons may not know how to use IP_RECVIF to determine + * the interface upon which this message was received. + * Replace 0.0.0.0 with the subnet address if told to do so. + */ + if (igmp_recvifkludge && in_nullhost(ip->ip_src)) { + if (ia != NULL) { + IFA_LOCK(&ia->ia_ifa); + ip->ip_src.s_addr = htonl(ia->ia_subnet); + IFA_UNLOCK(&ia->ia_ifa); + } + } + if (ia != NULL) + IFA_REMREF(&ia->ia_ifa); + + IGMP_PRINTF(("process v2 report %s on ifp %p(%s%d)\n", + inet_ntoa(igmp->igmp_group), ifp, ifp->if_name, ifp->if_unit)); + + /* + * IGMPv2 report suppression. + * If we are a member of this group, and our membership should be + * reported, and our group timer is pending or about to be reset, + * stop our group timer by transitioning to the 'lazy' state. + */ + in_multihead_lock_shared(); + IN_LOOKUP_MULTI(&igmp->igmp_group, ifp, inm); + in_multihead_lock_done(); + if (inm != NULL) { + struct igmp_ifinfo *igi; + + INM_LOCK(inm); + igi = inm->inm_igi; + VERIFY(igi != NULL); + + IGMPSTAT_INC(igps_rcv_ourreports); + OIGMPSTAT_INC(igps_rcv_ourreports); + + /* + * If we are in IGMPv3 host mode, do not allow the + * other host's IGMPv1 report to suppress our reports + * unless explicitly configured to do so. 
+ */ + IGI_LOCK(igi); + if (igi->igi_version == IGMP_VERSION_3) { + if (igmp_legacysupp) + igmp_v3_suppress_group_record(inm); + IGI_UNLOCK(igi); + INM_UNLOCK(inm); + INM_REMREF(inm); + return (0); + } + + inm->inm_timer = 0; + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + case IGMP_SLEEPING_MEMBER: + break; + case IGMP_REPORTING_MEMBER: + case IGMP_IDLE_MEMBER: + case IGMP_AWAKENING_MEMBER: + IGMP_PRINTF(("report suppressed for %s on ifp %p(%s%d)\n", + inet_ntoa(igmp->igmp_group), ifp, ifp->if_name, + ifp->if_unit)); + case IGMP_LAZY_MEMBER: + inm->inm_state = IGMP_LAZY_MEMBER; + break; + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + case IGMP_LEAVING_MEMBER: + break; + } + IGI_UNLOCK(igi); + INM_UNLOCK(inm); + INM_REMREF(inm); + } + + return (0); +} + +void +igmp_input(struct mbuf *m, int off) +{ + int iphlen; + struct ifnet *ifp; + struct igmp *igmp; + struct ip *ip; + int igmplen; + int minlen; + int queryver; + + IGMP_PRINTF(("%s: called w/mbuf (%p,%d)\n", __func__, m, off)); + + ifp = m->m_pkthdr.rcvif; + + IGMPSTAT_INC(igps_rcv_total); + OIGMPSTAT_INC(igps_rcv_total); + + ip = mtod(m, struct ip *); + iphlen = off; + + /* By now, ip_len no longer contains the length of IP header */ + igmplen = ip->ip_len; + + /* + * Validate lengths. + */ + if (igmplen < IGMP_MINLEN) { + IGMPSTAT_INC(igps_rcv_tooshort); + OIGMPSTAT_INC(igps_rcv_tooshort); + m_freem(m); + return; + } + + /* + * Always pullup to the minimum size for v1/v2 or v3 + * to amortize calls to m_pulldown(). + */ + if (igmplen >= IGMP_V3_QUERY_MINLEN) + minlen = IGMP_V3_QUERY_MINLEN; + else + minlen = IGMP_MINLEN; + + M_STRUCT_GET(igmp, struct igmp *, m, off, minlen); + if (igmp == NULL) { + IGMPSTAT_INC(igps_rcv_tooshort); + OIGMPSTAT_INC(igps_rcv_tooshort); + return; + } + + /* + * Validate checksum. 
+ */ + m->m_data += iphlen; + m->m_len -= iphlen; + if (in_cksum(m, igmplen)) { + IGMPSTAT_INC(igps_rcv_badsum); + OIGMPSTAT_INC(igps_rcv_badsum); + m_freem(m); + return; + } + m->m_data -= iphlen; + m->m_len += iphlen; + + /* + * IGMP control traffic is link-scope, and must have a TTL of 1. + * DVMRP traffic (e.g. mrinfo, mtrace) is an exception; + * probe packets may come from beyond the LAN. + */ + if (igmp->igmp_type != IGMP_DVMRP && ip->ip_ttl != 1) { + IGMPSTAT_INC(igps_rcv_badttl); + m_freem(m); + return; + } + + switch (igmp->igmp_type) { + case IGMP_HOST_MEMBERSHIP_QUERY: + if (igmplen == IGMP_MINLEN) { + if (igmp->igmp_code == 0) + queryver = IGMP_VERSION_1; + else + queryver = IGMP_VERSION_2; + } else if (igmplen >= IGMP_V3_QUERY_MINLEN) { + queryver = IGMP_VERSION_3; + } else { + IGMPSTAT_INC(igps_rcv_tooshort); + OIGMPSTAT_INC(igps_rcv_tooshort); + m_freem(m); + return; + } + + OIGMPSTAT_INC(igps_rcv_queries); + + switch (queryver) { + case IGMP_VERSION_1: + IGMPSTAT_INC(igps_rcv_v1v2_queries); + if (!igmp_v1enable) + break; + if (igmp_input_v1_query(ifp, ip, igmp) != 0) { + m_freem(m); + return; + } + break; + + case IGMP_VERSION_2: + IGMPSTAT_INC(igps_rcv_v1v2_queries); + if (!igmp_v2enable) + break; + if (igmp_input_v2_query(ifp, ip, igmp) != 0) { + m_freem(m); + return; + } + break; + + case IGMP_VERSION_3: { + struct igmpv3 *igmpv3; + uint16_t igmpv3len; + uint16_t srclen; + int nsrc; + + IGMPSTAT_INC(igps_rcv_v3_queries); + igmpv3 = (struct igmpv3 *)igmp; + /* + * Validate length based on source count. 
+ */ + nsrc = ntohs(igmpv3->igmp_numsrc); + /* + * Compare in int arithmetic before narrowing: srclen is + * only 16 bits wide, so a large source count would wrap + * it and slip past this length check. + */ + if (igmplen < (IGMP_V3_QUERY_MINLEN + (int)sizeof(struct in_addr) * nsrc)) { + IGMPSTAT_INC(igps_rcv_tooshort); + OIGMPSTAT_INC(igps_rcv_tooshort); + m_freem(m); + return; + } + srclen = sizeof(struct in_addr) * nsrc; + igmpv3len = IGMP_V3_QUERY_MINLEN + srclen; + M_STRUCT_GET(igmpv3, struct igmpv3 *, m, + off, igmpv3len); + if (igmpv3 == NULL) { + IGMPSTAT_INC(igps_rcv_tooshort); + OIGMPSTAT_INC(igps_rcv_tooshort); + return; + } + if (igmp_input_v3_query(ifp, ip, igmpv3) != 0) { + m_freem(m); + return; + } + } + break; + } + break; + + case IGMP_v1_HOST_MEMBERSHIP_REPORT: + if (!igmp_v1enable) + break; + if (igmp_input_v1_report(ifp, ip, igmp) != 0) { + m_freem(m); + return; + } + break; + + case IGMP_v2_HOST_MEMBERSHIP_REPORT: + if (!igmp_v2enable) + break; +#ifndef __APPLE__ + if (!ip_checkrouteralert(m)) + IGMPSTAT_INC(igps_rcv_nora); +#endif + if (igmp_input_v2_report(ifp, ip, igmp) != 0) { + m_freem(m); + return; + } + break; + + case IGMP_v3_HOST_MEMBERSHIP_REPORT: + /* + * Hosts do not need to process IGMPv3 membership reports, + * as report suppression is no longer required. + */ +#ifndef __APPLE__ + if (!ip_checkrouteralert(m)) + IGMPSTAT_INC(igps_rcv_nora); +#endif + break; + + default: + break; + } + + lck_mtx_assert(&igmp_mtx, LCK_MTX_ASSERT_NOTOWNED); + /* + * Pass all valid IGMP packets up to any process(es) listening on a + * raw IGMP socket. + */ + rip_input(m, off); +} + + +/* + * IGMP slowtimo handler. + * Combines both the slow and fast timers into one. We lose some responsiveness, but this + * allows the system to avoid having a pr_fasttimo, thus allowing for power savings. 
+ * + */ +void +igmp_slowtimo(void) +{ + struct ifqueue scq; /* State-change packets */ + struct ifqueue qrq; /* Query response packets */ + struct ifnet *ifp; + struct igmp_ifinfo *igi; + struct in_multi *inm; + int loop = 0, uri_fasthz = 0; + + lck_mtx_lock(&igmp_mtx); + + LIST_FOREACH(igi, &igi_head, igi_link) { + IGI_LOCK(igi); + igmp_v1v2_process_querier_timers(igi); + IGI_UNLOCK(igi); + } + + /* + * NOTE: previously handled by fasttimo + * + * Quick check to see if any work needs to be done, in order to + * minimize the overhead of fasttimo processing. + */ + if (!current_state_timers_running && + !interface_timers_running && + !state_change_timers_running) { + lck_mtx_unlock(&igmp_mtx); + return; + } + + /* + * IGMPv3 General Query response timer processing. + */ + if (interface_timers_running) { + interface_timers_running = 0; + LIST_FOREACH(igi, &igi_head, igi_link) { + IGI_LOCK(igi); + if (igi->igi_v3_timer == 0) { + /* Do nothing. */ + } else if (--igi->igi_v3_timer == 0) { + igmp_v3_dispatch_general_query(igi); + } else { + interface_timers_running = 1; + } + IGI_UNLOCK(igi); + } + } + + if (!current_state_timers_running && + !state_change_timers_running) + goto out_locked; + + current_state_timers_running = 0; + state_change_timers_running = 0; + + memset(&qrq, 0, sizeof(struct ifqueue)); + qrq.ifq_maxlen = IGMP_MAX_G_GS_PACKETS; + + memset(&scq, 0, sizeof(struct ifqueue)); + scq.ifq_maxlen = IGMP_MAX_STATE_CHANGE_PACKETS; + + /* + * IGMPv1/v2/v3 host report and state-change timer processing. + * Note: Processing a v3 group timer may remove a node. + */ + LIST_FOREACH(igi, &igi_head, igi_link) { + struct in_multistep step; + + IGI_LOCK(igi); + ifp = igi->igi_ifp; + loop = (igi->igi_flags & IGIF_LOOPBACK) ? 
1 : 0; + uri_fasthz = IGMP_RANDOM_DELAY(igi->igi_uri * PR_SLOWHZ); + IGI_UNLOCK(igi); + + in_multihead_lock_shared(); + IN_FIRST_MULTI(step, inm); + while (inm != NULL) { + INM_LOCK(inm); + if (inm->inm_ifp != ifp) + goto next; + + IGI_LOCK(igi); + switch (igi->igi_version) { + case IGMP_VERSION_1: + case IGMP_VERSION_2: + igmp_v1v2_process_group_timer(inm, + igi->igi_version); + break; + case IGMP_VERSION_3: + igmp_v3_process_group_timers(igi, &qrq, + &scq, inm, uri_fasthz); + break; + } + IGI_UNLOCK(igi); +next: + INM_UNLOCK(inm); + IN_NEXT_MULTI(step, inm); + } + in_multihead_lock_done(); + + IGI_LOCK(igi); + if (igi->igi_version == IGMP_VERSION_1 || + igi->igi_version == IGMP_VERSION_2) { + igmp_dispatch_queue(igi, &igi->igi_v2q, 0, loop, ifp); + } else if (igi->igi_version == IGMP_VERSION_3) { + IGI_UNLOCK(igi); + igmp_dispatch_queue(NULL, &qrq, 0, loop, ifp); + igmp_dispatch_queue(NULL, &scq, 0, loop, ifp); + VERIFY(qrq.ifq_len == 0); + VERIFY(scq.ifq_len == 0); + IGI_LOCK(igi); + } + /* + * In case there are still any pending membership reports + * which didn't get drained at version change time. + */ + IF_DRAIN(&igi->igi_v2q); + /* + * Release all deferred inm records, and drain any locally + * enqueued packets; do it even if the current IGMP version + * for the link is no longer IGMPv3, in order to handle the + * version change case. + */ + igmp_flush_relq(igi); + VERIFY(SLIST_EMPTY(&igi->igi_relinmhead)); + IGI_UNLOCK(igi); + + IF_DRAIN(&qrq); + IF_DRAIN(&scq); + } + +out_locked: + lck_mtx_unlock(&igmp_mtx); +} + +/* + * Free the in_multi reference(s) for this IGMP lifecycle. + * + * Caller must be holding igi_lock. 
+ */ +static void +igmp_flush_relq(struct igmp_ifinfo *igi) +{ + struct in_multi *inm; + +again: + IGI_LOCK_ASSERT_HELD(igi); + inm = SLIST_FIRST(&igi->igi_relinmhead); + if (inm != NULL) { + int lastref; + + SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele); + IGI_UNLOCK(igi); + + in_multihead_lock_exclusive(); + INM_LOCK(inm); + VERIFY(inm->inm_nrelecnt != 0); + inm->inm_nrelecnt--; + lastref = in_multi_detach(inm); + VERIFY(!lastref || (!(inm->inm_debug & IFD_ATTACHED) && + inm->inm_reqcnt == 0)); + INM_UNLOCK(inm); + in_multihead_lock_done(); + /* from igi_relinmhead */ + INM_REMREF(inm); + /* from in_multihead list */ + if (lastref) + INM_REMREF(inm); + + IGI_LOCK(igi); + goto again; + } +} + +/* + * Update host report group timer for IGMPv1/v2. + * Will update the global pending timer flags. + */ +static void +igmp_v1v2_process_group_timer(struct in_multi *inm, const int igmp_version) +{ + int report_timer_expired; + + INM_LOCK_ASSERT_HELD(inm); + IGI_LOCK_ASSERT_HELD(inm->inm_igi); + + if (inm->inm_timer == 0) { + report_timer_expired = 0; + } else if (--inm->inm_timer == 0) { + report_timer_expired = 1; + } else { + current_state_timers_running = 1; + return; + } + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + case IGMP_IDLE_MEMBER: + case IGMP_LAZY_MEMBER: + case IGMP_SLEEPING_MEMBER: + case IGMP_AWAKENING_MEMBER: + break; + case IGMP_REPORTING_MEMBER: + if (report_timer_expired) { + inm->inm_state = IGMP_IDLE_MEMBER; + (void) igmp_v1v2_queue_report(inm, + (igmp_version == IGMP_VERSION_2) ? + IGMP_v2_HOST_MEMBERSHIP_REPORT : + IGMP_v1_HOST_MEMBERSHIP_REPORT); + INM_LOCK_ASSERT_HELD(inm); + IGI_LOCK_ASSERT_HELD(inm->inm_igi); + } + break; + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + case IGMP_LEAVING_MEMBER: + break; + } +} + +/* + * Update a group's timers for IGMPv3. + * Will update the global pending timer flags. + * Note: Unlocked read from igi. 
+ */ +static void +igmp_v3_process_group_timers(struct igmp_ifinfo *igi, + struct ifqueue *qrq, struct ifqueue *scq, + struct in_multi *inm, const int uri_fasthz) +{ + int query_response_timer_expired; + int state_change_retransmit_timer_expired; + + INM_LOCK_ASSERT_HELD(inm); + IGI_LOCK_ASSERT_HELD(igi); + VERIFY(igi == inm->inm_igi); + + query_response_timer_expired = 0; + state_change_retransmit_timer_expired = 0; + + /* + * During a transition from v1/v2 compatibility mode back to v3, + * a group record in REPORTING state may still have its group + * timer active. This is a no-op in this function; it is easier + * to deal with it here than to complicate the slow-timeout path. + */ + if (inm->inm_timer == 0) { + query_response_timer_expired = 0; + } else if (--inm->inm_timer == 0) { + query_response_timer_expired = 1; + } else { + current_state_timers_running = 1; + } + + if (inm->inm_sctimer == 0) { + state_change_retransmit_timer_expired = 0; + } else if (--inm->inm_sctimer == 0) { + state_change_retransmit_timer_expired = 1; + } else { + state_change_timers_running = 1; + } + + /* We are in fasttimo, so be quick about it. */ + if (!state_change_retransmit_timer_expired && + !query_response_timer_expired) + return; + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + case IGMP_SLEEPING_MEMBER: + case IGMP_LAZY_MEMBER: + case IGMP_AWAKENING_MEMBER: + case IGMP_IDLE_MEMBER: + break; + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + /* + * Respond to a previously pending Group-Specific + * or Group-and-Source-Specific query by enqueueing + * the appropriate Current-State report for + * immediate transmission. 
+ */ + if (query_response_timer_expired) { + int retval; + + retval = igmp_v3_enqueue_group_record(qrq, inm, 0, 1, + (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)); + IGMP_PRINTF(("%s: enqueue record = %d\n", + __func__, retval)); + inm->inm_state = IGMP_REPORTING_MEMBER; + /* XXX Clear recorded sources for next time. */ + inm_clear_recorded(inm); + } + /* FALLTHROUGH */ + case IGMP_REPORTING_MEMBER: + case IGMP_LEAVING_MEMBER: + if (state_change_retransmit_timer_expired) { + /* + * State-change retransmission timer fired. + * If there are any further pending retransmissions, + * set the global pending state-change flag, and + * reset the timer. + */ + if (--inm->inm_scrv > 0) { + inm->inm_sctimer = uri_fasthz; + state_change_timers_running = 1; + } + /* + * Retransmit the previously computed state-change + * report. If there are no further pending + * retransmissions, the mbuf queue will be consumed. + * Update T0 state to T1 as we have now sent + * a state-change. + */ + (void) igmp_v3_merge_state_changes(inm, scq); + + inm_commit(inm); + IGMP_PRINTF(("%s: T1 -> T0 for %s/%s%d\n", __func__, + inet_ntoa(inm->inm_addr), inm->inm_ifp->if_name, + inm->inm_ifp->if_unit)); + + /* + * If we are leaving the group for good, make sure + * we release IGMP's reference to it. + * This release must be deferred using a SLIST, + * as we are called from a loop which traverses + * the in_multihead list. + */ + if (inm->inm_state == IGMP_LEAVING_MEMBER && + inm->inm_scrv == 0) { + inm->inm_state = IGMP_NOT_MEMBER; + /* + * A reference has already been held in + * igmp_final_leave() for this inm, so + * no need to hold another one. We also + * bumped up its request count then, so + * that it stays in in_multihead. Both + * of them will be released when it is + * dequeued later on. 
+ */ + VERIFY(inm->inm_nrelecnt != 0); + SLIST_INSERT_HEAD(&igi->igi_relinmhead, + inm, inm_nrele); + } + } + break; + } +} + +/* + * Suppress a group's pending response to a group or source/group query. + * + * Do NOT suppress state changes. This leads to IGMPv3 inconsistency. + * Do NOT update ST1/ST0 as this operation merely suppresses + * the currently pending group record. + * Do NOT suppress the response to a general query. It is possible but + * it would require adding another state or flag. + */ +static void +igmp_v3_suppress_group_record(struct in_multi *inm) +{ + + INM_LOCK_ASSERT_HELD(inm); + IGI_LOCK_ASSERT_HELD(inm->inm_igi); + + VERIFY(inm->inm_igi->igi_version == IGMP_VERSION_3); + + /* + * Nothing to suppress unless a group or group-and-source + * query response is pending. + */ + if (inm->inm_state != IGMP_G_QUERY_PENDING_MEMBER && + inm->inm_state != IGMP_SG_QUERY_PENDING_MEMBER) + return; + + if (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) + inm_clear_recorded(inm); + + inm->inm_timer = 0; + inm->inm_state = IGMP_REPORTING_MEMBER; +} + +/* + * Switch to a different IGMP version on the given interface, + * as per Section 7.2.1. + */ +static void +igmp_set_version(struct igmp_ifinfo *igi, const int igmp_version) +{ + int old_version_timer; + + IGI_LOCK_ASSERT_HELD(igi); + + IGMP_PRINTF(("%s: switching to v%d on ifp %p(%s%d)\n", __func__, + igmp_version, igi->igi_ifp, igi->igi_ifp->if_name, + igi->igi_ifp->if_unit)); + + if (igmp_version == IGMP_VERSION_1 || igmp_version == IGMP_VERSION_2) { + /* + * Compute the "Older Version Querier Present" timer as per + * Section 8.12. 
+ */ + old_version_timer = igi->igi_rv * igi->igi_qi + igi->igi_qri; + old_version_timer *= PR_SLOWHZ; + + if (igmp_version == IGMP_VERSION_1) { + igi->igi_v1_timer = old_version_timer; + igi->igi_v2_timer = 0; + } else if (igmp_version == IGMP_VERSION_2) { + igi->igi_v1_timer = 0; + igi->igi_v2_timer = old_version_timer; + } + } + + if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) { + if (igi->igi_version != IGMP_VERSION_2) { + igi->igi_version = IGMP_VERSION_2; + igmp_v3_cancel_link_timers(igi); + } + } else if (igi->igi_v1_timer > 0) { + if (igi->igi_version != IGMP_VERSION_1) { + igi->igi_version = IGMP_VERSION_1; + igmp_v3_cancel_link_timers(igi); + } + } + + IGI_LOCK_ASSERT_HELD(igi); +} + +/* + * Cancel pending IGMPv3 timers for the given link and all groups + * joined on it; state-change, general-query, and group-query timers. + * + * Only ever called on a transition from v3 to Compatibility mode. Kill + * the timers stone dead (this may be expensive for large N groups), they + * will be restarted if Compatibility Mode deems that they must be due to + * query processing. + */ +static void +igmp_v3_cancel_link_timers(struct igmp_ifinfo *igi) +{ + struct ifnet *ifp; + struct in_multi *inm; + struct in_multistep step; + + IGI_LOCK_ASSERT_HELD(igi); + + IGMP_PRINTF(("%s: cancel v3 timers on ifp %p(%s%d)\n", __func__, + igi->igi_ifp, igi->igi_ifp->if_name, igi->igi_ifp->if_unit)); + + /* + * Stop the v3 General Query Response on this link stone dead. + * If fasttimo is woken up due to interface_timers_running, + * the flag will be cleared if there are no pending link timers. + */ + igi->igi_v3_timer = 0; + + /* + * Now clear the current-state and state-change report timers + * for all memberships scoped to this link. 
+ */ + ifp = igi->igi_ifp; + IGI_UNLOCK(igi); + + in_multihead_lock_shared(); + IN_FIRST_MULTI(step, inm); + while (inm != NULL) { + INM_LOCK(inm); + if (inm->inm_ifp != ifp) + goto next; + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + case IGMP_IDLE_MEMBER: + case IGMP_LAZY_MEMBER: + case IGMP_SLEEPING_MEMBER: + case IGMP_AWAKENING_MEMBER: + /* + * These states are either not relevant in v3 mode, + * or are unreported. Do nothing. + */ + break; + case IGMP_LEAVING_MEMBER: + /* + * If we are leaving the group and switching to + * compatibility mode, we need to release the final + * reference held for issuing the INCLUDE {}, and + * transition to REPORTING to ensure the host leave + * message is sent upstream to the old querier -- + * transition to NOT would lose the leave and race. + * During igmp_final_leave(), we bumped up both the + * request and reference counts. Since we cannot + * call in_multi_detach() here, defer this task to + * the timer routine. + */ + VERIFY(inm->inm_nrelecnt != 0); + IGI_LOCK(igi); + SLIST_INSERT_HEAD(&igi->igi_relinmhead, inm, inm_nrele); + IGI_UNLOCK(igi); + /* FALLTHROUGH */ + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + inm_clear_recorded(inm); + /* FALLTHROUGH */ + case IGMP_REPORTING_MEMBER: + inm->inm_state = IGMP_REPORTING_MEMBER; + break; + } + /* + * Always clear state-change and group report timers. + * Free any pending IGMPv3 state-change records. + */ + inm->inm_sctimer = 0; + inm->inm_timer = 0; + IF_DRAIN(&inm->inm_scq); +next: + INM_UNLOCK(inm); + IN_NEXT_MULTI(step, inm); + } + in_multihead_lock_done(); + + IGI_LOCK(igi); +} + +/* + * Update the Older Version Querier Present timers for a link. + * See Section 7.2.1 of RFC 3376. + */ +static void +igmp_v1v2_process_querier_timers(struct igmp_ifinfo *igi) +{ + IGI_LOCK_ASSERT_HELD(igi); + + if (igi->igi_v1_timer == 0 && igi->igi_v2_timer == 0) { + /* + * IGMPv1 and IGMPv2 Querier Present timers expired. 
+ * + * Revert to IGMPv3. + */ + if (igi->igi_version != IGMP_VERSION_3) { + IGMP_PRINTF(("%s: transition from v%d -> v%d on %p(%s%d)\n", + __func__, igi->igi_version, IGMP_VERSION_3, + igi->igi_ifp, igi->igi_ifp->if_name, + igi->igi_ifp->if_unit)); + igi->igi_version = IGMP_VERSION_3; + IF_DRAIN(&igi->igi_v2q); + } + } else if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) { + /* + * IGMPv1 Querier Present timer expired, + * IGMPv2 Querier Present timer running. + * If IGMPv2 was disabled since last timeout, + * revert to IGMPv3. + * If IGMPv2 is enabled, revert to IGMPv2. + */ + if (!igmp_v2enable) { + IGMP_PRINTF(("%s: transition from v%d -> v%d on %p(%s%d)\n", + __func__, igi->igi_version, IGMP_VERSION_3, + igi->igi_ifp, igi->igi_ifp->if_name, + igi->igi_ifp->if_unit)); + igi->igi_v2_timer = 0; + igi->igi_version = IGMP_VERSION_3; + IF_DRAIN(&igi->igi_v2q); + } else { + --igi->igi_v2_timer; + if (igi->igi_version != IGMP_VERSION_2) { + IGMP_PRINTF(("%s: transition from v%d -> v%d on %p(%s%d)\n", + __func__, igi->igi_version, IGMP_VERSION_2, + igi->igi_ifp, igi->igi_ifp->if_name, + igi->igi_ifp->if_unit)); + igi->igi_version = IGMP_VERSION_2; + IF_DRAIN(&igi->igi_gq); + } + } + } else if (igi->igi_v1_timer > 0) { + /* + * IGMPv1 Querier Present timer running. + * Stop IGMPv2 timer if running. + * + * If IGMPv1 was disabled since last timeout, + * revert to IGMPv3. + * If IGMPv1 is enabled, reset IGMPv2 timer if running. 
+ */ + if (!igmp_v1enable) { + IGMP_PRINTF(("%s: transition from v%d -> v%d on %p(%s%d)\n", + __func__, igi->igi_version, IGMP_VERSION_3, + igi->igi_ifp, igi->igi_ifp->if_name, + igi->igi_ifp->if_unit)); + igi->igi_v1_timer = 0; + igi->igi_version = IGMP_VERSION_3; + IF_DRAIN(&igi->igi_v2q); + } else { + --igi->igi_v1_timer; + } + if (igi->igi_v2_timer > 0) { + IGMP_PRINTF(("%s: cancel v2 timer on %p(%s%d)\n", + __func__, igi->igi_ifp, igi->igi_ifp->if_name, + igi->igi_ifp->if_unit)); + igi->igi_v2_timer = 0; + } + } +} + +/* + * Dispatch an IGMPv1/v2 host report or leave message. + * These are always small enough to fit inside a single mbuf. + */ +static int +igmp_v1v2_queue_report(struct in_multi *inm, const int type) +{ + struct ifnet *ifp; + struct igmp *igmp; + struct ip *ip; + struct mbuf *m; + int error = 0; + + INM_LOCK_ASSERT_HELD(inm); + IGI_LOCK_ASSERT_HELD(inm->inm_igi); + + ifp = inm->inm_ifp; + + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == NULL) + return (ENOMEM); + MH_ALIGN(m, sizeof(struct ip) + sizeof(struct igmp)); + + m->m_pkthdr.len = sizeof(struct ip) + sizeof(struct igmp); + + m->m_data += sizeof(struct ip); + m->m_len = sizeof(struct igmp); + + igmp = mtod(m, struct igmp *); + igmp->igmp_type = type; + igmp->igmp_code = 0; + igmp->igmp_group = inm->inm_addr; + igmp->igmp_cksum = 0; + igmp->igmp_cksum = in_cksum(m, sizeof(struct igmp)); + + m->m_data -= sizeof(struct ip); + m->m_len += sizeof(struct ip); + + ip = mtod(m, struct ip *); + ip->ip_tos = 0; + ip->ip_len = sizeof(struct ip) + sizeof(struct igmp); + ip->ip_off = 0; + ip->ip_p = IPPROTO_IGMP; + ip->ip_src.s_addr = INADDR_ANY; + + if (type == IGMP_HOST_LEAVE_MESSAGE) + ip->ip_dst.s_addr = htonl(INADDR_ALLRTRS_GROUP); + else + ip->ip_dst = inm->inm_addr; + + m->m_flags |= M_IGMPV2; + if (inm->inm_igi->igi_flags & IGIF_LOOPBACK) + m->m_flags |= M_IGMP_LOOP; + + /* + * Due to the fact that at this point we are possibly holding + * in_multihead_lock in shared or exclusive mode, we can't 
call + * igmp_sendpkt() here since that will eventually call ip_output(), + * which will try to lock in_multihead_lock and cause a deadlock. + * Instead we defer the work to the igmp_slowtimo() thread, thus + * avoiding unlocking in_multihead_lock here. + */ + if (IF_QFULL(&inm->inm_igi->igi_v2q)) { + IGMP_PRINTF(("%s: v1/v2 outbound queue full\n", __func__)); + error = ENOMEM; + m_freem(m); + } else + IF_ENQUEUE(&inm->inm_igi->igi_v2q, m); + + return (error); +} + +/* + * Process a state change from the upper layer for the given IPv4 group. + * + * Each socket holds a reference on the in_multi in its own ip_moptions. + * The socket layer will have made the necessary updates to the group + * state, it is now up to IGMP to issue a state change report if there + * has been any change between T0 (when the last state-change was issued) + * and T1 (now). + * + * We use the IGMPv3 state machine at group level. The IGMP module + * however makes the decision as to which IGMP protocol version to speak. + * A state change *from* INCLUDE {} always means an initial join. + * A state change *to* INCLUDE {} always means a final leave. + * + * FUTURE: If IGIF_V3LITE is enabled for this interface, then we can + * save ourselves a bunch of work; any exclusive mode groups need not + * compute source filter lists. + */ +int +igmp_change_state(struct in_multi *inm) +{ + struct igmp_ifinfo *igi; + struct ifnet *ifp; + int error = 0; + + INM_LOCK_ASSERT_HELD(inm); + VERIFY(inm->inm_igi != NULL); + IGI_LOCK_ASSERT_NOTHELD(inm->inm_igi); + + /* + * Try to detect if the upper layer just asked us to change state + * for an interface which has now gone away. + */ + VERIFY(inm->inm_ifma != NULL); + ifp = inm->inm_ifma->ifma_ifp; + /* + * Sanity check that netinet's notion of ifp is the same as net's. 
+ */ + VERIFY(inm->inm_ifp == ifp); + + igi = IGMP_IFINFO(ifp); + VERIFY(igi != NULL); + + /* + * If we detect a state transition to or from MCAST_UNDEFINED + * for this group, then we are starting or finishing an IGMP + * life cycle for this group. + */ + if (inm->inm_st[1].iss_fmode != inm->inm_st[0].iss_fmode) { + IGMP_PRINTF(("%s: inm transition %d -> %d\n", __func__, + inm->inm_st[0].iss_fmode, inm->inm_st[1].iss_fmode)); + if (inm->inm_st[0].iss_fmode == MCAST_UNDEFINED) { + IGMP_PRINTF(("%s: initial join\n", __func__)); + error = igmp_initial_join(inm, igi); + goto out; + } else if (inm->inm_st[1].iss_fmode == MCAST_UNDEFINED) { + IGMP_PRINTF(("%s: final leave\n", __func__)); + igmp_final_leave(inm, igi); + goto out; + } + } else { + IGMP_PRINTF(("%s: filter set change\n", __func__)); + } + + error = igmp_handle_state_change(inm, igi); +out: + return (error); +} + +/* + * Perform the initial join for an IGMP group. + * + * When joining a group: + * If the group should have its IGMP traffic suppressed, do nothing. + * IGMPv1 starts sending IGMPv1 host membership reports. + * IGMPv2 starts sending IGMPv2 host membership reports. + * IGMPv3 will schedule an IGMPv3 state-change report containing the + * initial state of the membership. + */ +static int +igmp_initial_join(struct in_multi *inm, struct igmp_ifinfo *igi) +{ + struct ifnet *ifp; + struct ifqueue *ifq; + int error, retval, syncstates; + + INM_LOCK_ASSERT_HELD(inm); + IGI_LOCK_ASSERT_NOTHELD(igi); + + IGMP_PRINTF(("%s: initial join %s on ifp %p(%s%d)\n", + __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp, + inm->inm_ifp->if_name, inm->inm_ifp->if_unit)); + + error = 0; + syncstates = 1; + + ifp = inm->inm_ifp; + + IGI_LOCK(igi); + VERIFY(igi->igi_ifp == ifp); + + /* + * Groups joined on loopback or marked as 'not reported', + * e.g. 224.0.0.1, enter the IGMP_SILENT_MEMBER state and + * are never reported in any IGMP protocol exchanges. 
+ * All other groups enter the appropriate IGMP state machine + * for the version in use on this link. + * A link marked as IGIF_SILENT causes IGMP to be completely + * disabled for the link. + */ + if ((ifp->if_flags & IFF_LOOPBACK) || + (igi->igi_flags & IGIF_SILENT) || + !igmp_isgroupreported(inm->inm_addr)) { + IGMP_PRINTF(("%s: not kicking state machine for silent group\n", + __func__)); + inm->inm_state = IGMP_SILENT_MEMBER; + inm->inm_timer = 0; + } else { + /* + * Deal with overlapping in_multi lifecycle. + * If this group was LEAVING, then make sure + * we drop the reference we picked up to keep the + * group around for the final INCLUDE {} enqueue. + * Since we cannot call in_multi_detach() here, + * defer this task to the timer routine. + */ + if (igi->igi_version == IGMP_VERSION_3 && + inm->inm_state == IGMP_LEAVING_MEMBER) { + VERIFY(inm->inm_nrelecnt != 0); + SLIST_INSERT_HEAD(&igi->igi_relinmhead, inm, inm_nrele); + } + + inm->inm_state = IGMP_REPORTING_MEMBER; + + switch (igi->igi_version) { + case IGMP_VERSION_1: + case IGMP_VERSION_2: + inm->inm_state = IGMP_IDLE_MEMBER; + error = igmp_v1v2_queue_report(inm, + (igi->igi_version == IGMP_VERSION_2) ? + IGMP_v2_HOST_MEMBERSHIP_REPORT : + IGMP_v1_HOST_MEMBERSHIP_REPORT); + + INM_LOCK_ASSERT_HELD(inm); + IGI_LOCK_ASSERT_HELD(igi); + + if (error == 0) { + inm->inm_timer = IGMP_RANDOM_DELAY( + IGMP_V1V2_MAX_RI * PR_SLOWHZ); + current_state_timers_running = 1; + } + break; + + case IGMP_VERSION_3: + /* + * Defer update of T0 to T1, until the first copy + * of the state change has been transmitted. + */ + syncstates = 0; + + /* + * Immediately enqueue a State-Change Report for + * this interface, freeing any previous reports. + * Don't kick the timers if there is nothing to do, + * or if an error occurred. 
+ */ + ifq = &inm->inm_scq; + IF_DRAIN(ifq); + retval = igmp_v3_enqueue_group_record(ifq, inm, 1, + 0, 0); + IGMP_PRINTF(("%s: enqueue record = %d\n", + __func__, retval)); + if (retval <= 0) { + error = retval * -1; + break; + } + + /* + * Schedule transmission of pending state-change + * report up to RV times for this link. The timer + * will fire at the next igmp_fasttimo (~200ms), + * giving us an opportunity to merge the reports. + */ + if (igi->igi_flags & IGIF_LOOPBACK) { + inm->inm_scrv = 1; + } else { + VERIFY(igi->igi_rv > 1); + inm->inm_scrv = igi->igi_rv; + } + inm->inm_sctimer = 1; + state_change_timers_running = 1; + + error = 0; + break; + } + } + IGI_UNLOCK(igi); + + /* + * Only update the T0 state if state change is atomic, + * i.e. we don't need to wait for a timer to fire before we + * can consider the state change to have been communicated. + */ + if (syncstates) { + inm_commit(inm); + IGMP_PRINTF(("%s: T1 -> T0 for %s/%s%d\n", __func__, + inet_ntoa(inm->inm_addr), inm->inm_ifp->if_name, + inm->inm_ifp->if_unit)); + } + + return (error); +} + +/* + * Issue an intermediate state change during the IGMP life-cycle. 
+ */ +static int +igmp_handle_state_change(struct in_multi *inm, struct igmp_ifinfo *igi) +{ + struct ifnet *ifp; + int retval; + + INM_LOCK_ASSERT_HELD(inm); + IGI_LOCK_ASSERT_NOTHELD(igi); + + IGMP_PRINTF(("%s: state change for %s on ifp %p(%s%d)\n", + __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp, + inm->inm_ifp->if_name, inm->inm_ifp->if_unit)); + + ifp = inm->inm_ifp; + + IGI_LOCK(igi); + VERIFY(igi->igi_ifp == ifp); + + /* + * No report is enqueued for loopback or silent links, unreported + * groups, or links not running IGMPv3; just commit T1 to T0. + */ + if ((ifp->if_flags & IFF_LOOPBACK) || + (igi->igi_flags & IGIF_SILENT) || + !igmp_isgroupreported(inm->inm_addr) || + (igi->igi_version != IGMP_VERSION_3)) { + IGI_UNLOCK(igi); + if (!igmp_isgroupreported(inm->inm_addr)) { + IGMP_PRINTF(("%s: not kicking state " + "machine for silent group\n", __func__)); + } + IGMP_PRINTF(("%s: nothing to do\n", __func__)); + inm_commit(inm); + IGMP_PRINTF(("%s: T1 -> T0 for %s/%s%d\n", __func__, + inet_ntoa(inm->inm_addr), inm->inm_ifp->if_name, + inm->inm_ifp->if_unit)); + return (0); + } + + IF_DRAIN(&inm->inm_scq); + + retval = igmp_v3_enqueue_group_record(&inm->inm_scq, inm, 1, 0, 0); + IGMP_PRINTF(("%s: enqueue record = %d\n", __func__, retval)); + if (retval <= 0) { + IGI_UNLOCK(igi); + return (-retval); + } + /* + * If record(s) were enqueued, start the state-change + * report timer for this group. + */ + inm->inm_scrv = ((igi->igi_flags & IGIF_LOOPBACK) ? 1 : igi->igi_rv); + inm->inm_sctimer = 1; + state_change_timers_running = 1; + IGI_UNLOCK(igi); + + return (0); +} + +/* + * Perform the final leave for an IGMP group. + * + * When leaving a group: + * IGMPv1 does nothing. + * IGMPv2 sends a host leave message, if and only if we are the reporter. + * IGMPv3 enqueues a state-change report containing a transition + * to INCLUDE {} for immediate transmission. 
+ */ +static void +igmp_final_leave(struct in_multi *inm, struct igmp_ifinfo *igi) +{ + int syncstates = 1; + + INM_LOCK_ASSERT_HELD(inm); + IGI_LOCK_ASSERT_NOTHELD(igi); + + IGMP_PRINTF(("%s: final leave %s on ifp %p(%s%d)\n", + __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp, + inm->inm_ifp->if_name, inm->inm_ifp->if_unit)); + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + case IGMP_LEAVING_MEMBER: + /* Already leaving or left; do nothing. */ + IGMP_PRINTF(("%s: not kicking state machine for silent group\n", + __func__)); + break; + case IGMP_REPORTING_MEMBER: + case IGMP_IDLE_MEMBER: + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + IGI_LOCK(igi); + if (igi->igi_version == IGMP_VERSION_2) { + if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER || + inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) { + panic("%s: IGMPv3 state reached, not IGMPv3 " + "mode\n", __func__); + /* NOTREACHED */ + } + igmp_v1v2_queue_report(inm, IGMP_HOST_LEAVE_MESSAGE); + + INM_LOCK_ASSERT_HELD(inm); + IGI_LOCK_ASSERT_HELD(igi); + + inm->inm_state = IGMP_NOT_MEMBER; + } else if (igi->igi_version == IGMP_VERSION_3) { + /* + * Stop group timer and all pending reports. + * Immediately enqueue a state-change report + * TO_IN {} to be sent on the next fast timeout, + * giving us an opportunity to merge reports. + */ + IF_DRAIN(&inm->inm_scq); + inm->inm_timer = 0; + if (igi->igi_flags & IGIF_LOOPBACK) { + inm->inm_scrv = 1; + } else { + inm->inm_scrv = igi->igi_rv; + } + IGMP_PRINTF(("%s: Leaving %s/%s%d with %d " + "pending retransmissions.\n", __func__, + inet_ntoa(inm->inm_addr), + inm->inm_ifp->if_name, inm->inm_ifp->if_unit, + inm->inm_scrv)); + if (inm->inm_scrv == 0) { + inm->inm_state = IGMP_NOT_MEMBER; + inm->inm_sctimer = 0; + } else { + int retval; + /* + * Stick around in the in_multihead list; + * the final detach will be issued by + * igmp_v3_process_group_timers() when + * the retransmit timer expires. 
+ */ + INM_ADDREF_LOCKED(inm); + VERIFY(inm->inm_debug & IFD_ATTACHED); + inm->inm_reqcnt++; + VERIFY(inm->inm_reqcnt >= 1); + inm->inm_nrelecnt++; + VERIFY(inm->inm_nrelecnt != 0); + + retval = igmp_v3_enqueue_group_record( + &inm->inm_scq, inm, 1, 0, 0); + KASSERT(retval != 0, + ("%s: enqueue record = %d\n", __func__, + retval)); + + inm->inm_state = IGMP_LEAVING_MEMBER; + inm->inm_sctimer = 1; + state_change_timers_running = 1; + syncstates = 0; + } + } + IGI_UNLOCK(igi); + break; + case IGMP_LAZY_MEMBER: + case IGMP_SLEEPING_MEMBER: + case IGMP_AWAKENING_MEMBER: + /* Our reports are suppressed; do nothing. */ + break; + } + + if (syncstates) { + inm_commit(inm); + IGMP_PRINTF(("%s: T1 -> T0 for %s/%s%d\n", __func__, + inet_ntoa(inm->inm_addr), inm->inm_ifp->if_name, + inm->inm_ifp->if_unit)); + inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; + IGMP_PRINTF(("%s: T1 now MCAST_UNDEFINED for %s/%s%d\n", + __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_name, + inm->inm_ifp->if_unit)); + } +} + +/* + * Enqueue an IGMPv3 group record to the given output queue. + * + * XXX This function could do with having the allocation code + * split out, and the multiple-tree-walks coalesced into a single + * routine as has been done in igmp_v3_enqueue_filter_change(). + * + * If is_state_change is zero, a current-state record is appended. + * If is_state_change is non-zero, a state-change report is appended. + * + * If is_group_query is non-zero, an mbuf packet chain is allocated. + * If is_group_query is zero, and if there is a packet with free space + * at the tail of the queue, it will be appended to providing there + * is enough free space. + * Otherwise a new mbuf packet chain is allocated. + * + * If is_source_query is non-zero, each source is checked to see if + * it was recorded for a Group-Source query, and will be omitted if + * it is not both in-mode and recorded. 
+ * + * The function will attempt to allocate leading space in the packet + * for the IP/IGMP header to be prepended without fragmenting the chain. + * + * If successful the size of all data appended to the queue is returned, + * otherwise an error code less than zero is returned, or zero if + * no record(s) were appended. + */ +static int +igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm, + const int is_state_change, const int is_group_query, + const int is_source_query) +{ + struct igmp_grouprec ig; + struct igmp_grouprec *pig; + struct ifnet *ifp; + struct ip_msource *ims, *nims; + struct mbuf *m0, *m, *md; + int error, is_filter_list_change; + int minrec0len, m0srcs, msrcs, nbytes, off; + int record_has_sources; + int now; + int type; + in_addr_t naddr; + uint8_t mode; + + INM_LOCK_ASSERT_HELD(inm); + IGI_LOCK_ASSERT_HELD(inm->inm_igi); + + error = 0; + ifp = inm->inm_ifp; + is_filter_list_change = 0; + m = NULL; + m0 = NULL; + m0srcs = 0; + msrcs = 0; + nbytes = 0; + nims = NULL; + record_has_sources = 1; + pig = NULL; + type = IGMP_DO_NOTHING; + mode = inm->inm_st[1].iss_fmode; + + /* + * If we did not transition out of ASM mode during t0->t1, + * and there are no source nodes to process, we can skip + * the generation of source records. + */ + if (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0 && + inm->inm_nsrc == 0) + record_has_sources = 0; + + if (is_state_change) { + /* + * Queue a state change record. + * If the mode did not change, and there are non-ASM + * listeners or source filters present, + * we potentially need to issue two records for the group. + * If we are transitioning to MCAST_UNDEFINED, we need + * not send any sources. + * If there are ASM listeners, and there was no filter + * mode transition of any kind, do nothing. 
+ */ + if (mode != inm->inm_st[0].iss_fmode) { + if (mode == MCAST_EXCLUDE) { + IGMP_PRINTF(("%s: change to EXCLUDE\n", + __func__)); + type = IGMP_CHANGE_TO_EXCLUDE_MODE; + } else { + IGMP_PRINTF(("%s: change to INCLUDE\n", + __func__)); + type = IGMP_CHANGE_TO_INCLUDE_MODE; + if (mode == MCAST_UNDEFINED) + record_has_sources = 0; + } + } else { + if (record_has_sources) { + is_filter_list_change = 1; + } else { + type = IGMP_DO_NOTHING; + } + } + } else { + /* + * Queue a current state record. + */ + if (mode == MCAST_EXCLUDE) { + type = IGMP_MODE_IS_EXCLUDE; + } else if (mode == MCAST_INCLUDE) { + type = IGMP_MODE_IS_INCLUDE; + VERIFY(inm->inm_st[1].iss_asm == 0); + } + } + + /* + * Generate the filter list changes using a separate function. + */ + if (is_filter_list_change) + return (igmp_v3_enqueue_filter_change(ifq, inm)); + + if (type == IGMP_DO_NOTHING) { + IGMP_PRINTF(("%s: nothing to do for %s/%s%d\n", + __func__, inet_ntoa(inm->inm_addr), + inm->inm_ifp->if_name, inm->inm_ifp->if_unit)); + return (0); + } + + /* + * If any sources are present, we must be able to fit at least + * one in the trailing space of the tail packet's mbuf, + * ideally more. + */ + minrec0len = sizeof(struct igmp_grouprec); + if (record_has_sources) + minrec0len += sizeof(in_addr_t); + + IGMP_PRINTF(("%s: queueing %s for %s/%s%d\n", __func__, + igmp_rec_type_to_str(type), inet_ntoa(inm->inm_addr), + inm->inm_ifp->if_name, inm->inm_ifp->if_unit)); + + /* + * Check if we have a packet in the tail of the queue for this + * group into which the first group record for this group will fit. + * Otherwise allocate a new packet. + * Always allocate leading space for IP+RA_OPT+IGMP+REPORT. + * Note: Group records for G/GSR query responses MUST be sent + * in their own packet. 
+ */ + m0 = ifq->ifq_tail; + if (!is_group_query && + m0 != NULL && + (m0->m_pkthdr.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) && + (m0->m_pkthdr.len + minrec0len) < + (ifp->if_mtu - IGMP_LEADINGSPACE)) { + m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - + sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); + m = m0; + IGMP_PRINTF(("%s: use existing packet\n", __func__)); + } else { + if (IF_QFULL(ifq)) { + IGMP_PRINTF(("%s: outbound queue full\n", __func__)); + return (-ENOMEM); + } + m = NULL; + m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - + sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); + if (!is_state_change && !is_group_query) { + m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR); + if (m) + m->m_data += IGMP_LEADINGSPACE; + } + if (m == NULL) { + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m) + MH_ALIGN(m, IGMP_LEADINGSPACE); + } + if (m == NULL) + return (-ENOMEM); + + IGMP_PRINTF(("%s: allocated first packet\n", __func__)); + } + + /* + * Append group record. + * If we have sources, we don't know how many yet. + */ + ig.ig_type = type; + ig.ig_datalen = 0; + ig.ig_numsrc = 0; + ig.ig_group = inm->inm_addr; + if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) { + if (m != m0) + m_freem(m); + IGMP_PRINTF(("%s: m_append() failed.\n", __func__)); + return (-ENOMEM); + } + nbytes += sizeof(struct igmp_grouprec); + + /* + * Append as many sources as will fit in the first packet. + * If we are appending to a new packet, the chain allocation + * may potentially use clusters; use m_getptr() in this case. + * If we are appending to an existing packet, we need to obtain + * a pointer to the group record after m_append(), in case a new + * mbuf was allocated. + * Only append sources which are in-mode at t1. If we are + * transitioning to MCAST_UNDEFINED state on the group, do not + * include source entries. + * Only report recorded sources in our filter set when responding + * to a group-source query. 
+ */ + if (record_has_sources) { + if (m == m0) { + md = m_last(m); + pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + + md->m_len - nbytes); + } else { + md = m_getptr(m, 0, &off); + pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + + off); + } + msrcs = 0; + RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, nims) { + IGMP_PRINTF(("%s: visit node %s\n", __func__, + inet_ntoa_haddr(ims->ims_haddr))); + now = ims_get_mode(inm, ims, 1); + IGMP_PRINTF(("%s: node is %d\n", __func__, now)); + if ((now != mode) || + (now == mode && mode == MCAST_UNDEFINED)) { + IGMP_PRINTF(("%s: skip node\n", __func__)); + continue; + } + if (is_source_query && ims->ims_stp == 0) { + IGMP_PRINTF(("%s: skip unrecorded node\n", + __func__)); + continue; + } + IGMP_PRINTF(("%s: append node\n", __func__)); + naddr = htonl(ims->ims_haddr); + if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { + if (m != m0) + m_freem(m); + IGMP_PRINTF(("%s: m_append() failed.\n", + __func__)); + return (-ENOMEM); + } + nbytes += sizeof(in_addr_t); + ++msrcs; + if (msrcs == m0srcs) + break; + } + IGMP_PRINTF(("%s: msrcs is %d this packet\n", __func__, + msrcs)); + pig->ig_numsrc = htons(msrcs); + nbytes += (msrcs * sizeof(in_addr_t)); + } + + if (is_source_query && msrcs == 0) { + IGMP_PRINTF(("%s: no recorded sources to report\n", __func__)); + if (m != m0) + m_freem(m); + return (0); + } + + /* + * We are good to go with first packet. + */ + if (m != m0) { + IGMP_PRINTF(("%s: enqueueing first packet\n", __func__)); + m->m_pkthdr.vt_nrecs = 1; + m->m_pkthdr.rcvif = ifp; + IF_ENQUEUE(ifq, m); + } else { + m->m_pkthdr.vt_nrecs++; + } + /* + * No further work needed if no source list in packet(s). + */ + if (!record_has_sources) + return (nbytes); + + /* + * Whilst sources remain to be announced, we need to allocate + * a new packet and fill out as many sources as will fit. + * Always try for a cluster first. 
+ */ + while (nims != NULL) { + if (IF_QFULL(ifq)) { + IGMP_PRINTF(("%s: outbound queue full\n", __func__)); + return (-ENOMEM); + } + m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR); + if (m) + m->m_data += IGMP_LEADINGSPACE; + if (m == NULL) { + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m) + MH_ALIGN(m, IGMP_LEADINGSPACE); + } + if (m == NULL) + return (-ENOMEM); + md = m_getptr(m, 0, &off); + pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); + IGMP_PRINTF(("%s: allocated next packet\n", __func__)); + + if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) { + if (m != m0) + m_freem(m); + IGMP_PRINTF(("%s: m_append() failed.\n", __func__)); + return (-ENOMEM); + } + m->m_pkthdr.vt_nrecs = 1; + nbytes += sizeof(struct igmp_grouprec); + + m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - + sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); + + msrcs = 0; + RB_FOREACH_FROM(ims, ip_msource_tree, nims) { + IGMP_PRINTF(("%s: visit node %s\n", __func__, + inet_ntoa_haddr(ims->ims_haddr))); + now = ims_get_mode(inm, ims, 1); + if ((now != mode) || + (now == mode && mode == MCAST_UNDEFINED)) { + IGMP_PRINTF(("%s: skip node\n", __func__)); + continue; + } + if (is_source_query && ims->ims_stp == 0) { + IGMP_PRINTF(("%s: skip unrecorded node\n", + __func__)); + continue; + } + IGMP_PRINTF(("%s: append node\n", __func__)); + naddr = htonl(ims->ims_haddr); + if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { + if (m != m0) + m_freem(m); + IGMP_PRINTF(("%s: m_append() failed.\n", + __func__)); + return (-ENOMEM); + } + ++msrcs; + if (msrcs == m0srcs) + break; + } + pig->ig_numsrc = htons(msrcs); + nbytes += (msrcs * sizeof(in_addr_t)); + + IGMP_PRINTF(("%s: enqueueing next packet\n", __func__)); + m->m_pkthdr.rcvif = ifp; + IF_ENQUEUE(ifq, m); + } + + return (nbytes); +} + +/* + * Type used to mark record pass completion. + * We exploit the fact we can cast to this easily from the + * current filter modes on each ip_msource node. 
+ */ +typedef enum { + REC_NONE = 0x00, /* MCAST_UNDEFINED */ + REC_ALLOW = 0x01, /* MCAST_INCLUDE */ + REC_BLOCK = 0x02, /* MCAST_EXCLUDE */ + REC_FULL = REC_ALLOW | REC_BLOCK +} rectype_t; + +/* + * Enqueue an IGMPv3 filter list change to the given output queue. + * + * Source list filter state is held in an RB-tree. When the filter list + * for a group is changed without changing its mode, we need to compute + * the deltas between T0 and T1 for each source in the filter set, + * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records. + * + * As we may potentially queue two record types, and the entire R-B tree + * needs to be walked at once, we break this out into its own function + * so we can generate a tightly packed queue of packets. + * + * XXX This could be written to only use one tree walk, although that makes + * serializing into the mbuf chains a bit harder. For now we do two walks + * which makes things easier on us, and it may or may not be harder on + * the L2 cache. + * + * If successful the size of all data appended to the queue is returned, + * otherwise an error code less than zero is returned, or zero if + * no record(s) were appended. 
+ */ +static int +igmp_v3_enqueue_filter_change(struct ifqueue *ifq, struct in_multi *inm) +{ + static const int MINRECLEN = + sizeof(struct igmp_grouprec) + sizeof(in_addr_t); + struct ifnet *ifp; + struct igmp_grouprec ig; + struct igmp_grouprec *pig; + struct ip_msource *ims, *nims; + struct mbuf *m, *m0, *md; + in_addr_t naddr; + int m0srcs, nbytes, npbytes, off, rsrcs, schanged; + int nallow, nblock; + uint8_t mode, now, then; + rectype_t crt, drt, nrt; + + INM_LOCK_ASSERT_HELD(inm); + + if (inm->inm_nsrc == 0 || + (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0)) + return (0); + + ifp = inm->inm_ifp; /* interface */ + mode = inm->inm_st[1].iss_fmode; /* filter mode at t1 */ + crt = REC_NONE; /* current group record type */ + drt = REC_NONE; /* mask of completed group record types */ + nrt = REC_NONE; /* record type for current node */ + m0srcs = 0; /* # source which will fit in current mbuf chain */ + nbytes = 0; /* # of bytes appended to group's state-change queue */ + npbytes = 0; /* # of bytes appended this packet */ + rsrcs = 0; /* # sources encoded in current record */ + schanged = 0; /* # nodes encoded in overall filter change */ + nallow = 0; /* # of source entries in ALLOW_NEW */ + nblock = 0; /* # of source entries in BLOCK_OLD */ + nims = NULL; /* next tree node pointer */ + + /* + * For each possible filter record mode. + * The first kind of source we encounter tells us which + * is the first kind of record we start appending. + * If a node transitioned to UNDEFINED at t1, its mode is treated + * as the inverse of the group's filter mode. 
+ */ + while (drt != REC_FULL) { + do { + m0 = ifq->ifq_tail; + if (m0 != NULL && + (m0->m_pkthdr.vt_nrecs + 1 <= + IGMP_V3_REPORT_MAXRECS) && + (m0->m_pkthdr.len + MINRECLEN) < + (ifp->if_mtu - IGMP_LEADINGSPACE)) { + m = m0; + m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - + sizeof(struct igmp_grouprec)) / + sizeof(in_addr_t); + IGMP_PRINTF(("%s: use previous packet\n", + __func__)); + } else { + m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR); + if (m) + m->m_data += IGMP_LEADINGSPACE; + if (m == NULL) { + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m) + MH_ALIGN(m, IGMP_LEADINGSPACE); + } + if (m == NULL) { + IGMP_PRINTF(("%s: m_get*() failed\n", + __func__)); + return (-ENOMEM); + } + m->m_pkthdr.vt_nrecs = 0; + m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - + sizeof(struct igmp_grouprec)) / + sizeof(in_addr_t); + npbytes = 0; + IGMP_PRINTF(("%s: allocated new packet\n", + __func__)); + } + /* + * Append the IGMP group record header to the + * current packet's data area. + * Recalculate pointer to free space for next + * group record, in case m_append() allocated + * a new mbuf or cluster. + */ + memset(&ig, 0, sizeof(ig)); + ig.ig_group = inm->inm_addr; + if (!m_append(m, sizeof(ig), (void *)&ig)) { + if (m != m0) + m_freem(m); + IGMP_PRINTF(("%s: m_append() failed\n", + __func__)); + return (-ENOMEM); + } + npbytes += sizeof(struct igmp_grouprec); + if (m != m0) { + /* new packet; offset in c hain */ + md = m_getptr(m, npbytes - + sizeof(struct igmp_grouprec), &off); + pig = (struct igmp_grouprec *)(mtod(md, + uint8_t *) + off); + } else { + /* current packet; offset from last append */ + md = m_last(m); + pig = (struct igmp_grouprec *)(mtod(md, + uint8_t *) + md->m_len - + sizeof(struct igmp_grouprec)); + } + /* + * Begin walking the tree for this record type + * pass, or continue from where we left off + * previously if we had to allocate a new packet. + * Only report deltas in-mode at t1. 
+ * We need not report included sources as allowed + * if we are in inclusive mode on the group, + * however the converse is not true. + */ + rsrcs = 0; + if (nims == NULL) + nims = RB_MIN(ip_msource_tree, &inm->inm_srcs); + RB_FOREACH_FROM(ims, ip_msource_tree, nims) { + IGMP_PRINTF(("%s: visit node %s\n", + __func__, inet_ntoa_haddr(ims->ims_haddr))); + now = ims_get_mode(inm, ims, 1); + then = ims_get_mode(inm, ims, 0); + IGMP_PRINTF(("%s: mode: t0 %d, t1 %d\n", + __func__, then, now)); + if (now == then) { + IGMP_PRINTF(("%s: skip unchanged\n", + __func__)); + continue; + } + if (mode == MCAST_EXCLUDE && + now == MCAST_INCLUDE) { + IGMP_PRINTF(("%s: skip IN src on EX " + "group\n", __func__)); + continue; + } + nrt = (rectype_t)now; + if (nrt == REC_NONE) + nrt = (rectype_t)(~mode & REC_FULL); + if (schanged++ == 0) { + crt = nrt; + } else if (crt != nrt) + continue; + naddr = htonl(ims->ims_haddr); + if (!m_append(m, sizeof(in_addr_t), + (void *)&naddr)) { + if (m != m0) + m_freem(m); + IGMP_PRINTF(("%s: m_append() failed\n", + __func__)); + return (-ENOMEM); + } + nallow += !!(crt == REC_ALLOW); + nblock += !!(crt == REC_BLOCK); + if (++rsrcs == m0srcs) + break; + } + /* + * If we did not append any tree nodes on this + * pass, back out of allocations. + */ + if (rsrcs == 0) { + npbytes -= sizeof(struct igmp_grouprec); + if (m != m0) { + IGMP_PRINTF(("%s: m_free(m)\n", + __func__)); + m_freem(m); + } else { + IGMP_PRINTF(("%s: m_adj(m, -ig)\n", + __func__)); + m_adj(m, -((int)sizeof( + struct igmp_grouprec))); + } + continue; + } + npbytes += (rsrcs * sizeof(in_addr_t)); + if (crt == REC_ALLOW) + pig->ig_type = IGMP_ALLOW_NEW_SOURCES; + else if (crt == REC_BLOCK) + pig->ig_type = IGMP_BLOCK_OLD_SOURCES; + pig->ig_numsrc = htons(rsrcs); + /* + * Count the new group record, and enqueue this + * packet if it wasn't already queued. 
+ */ + m->m_pkthdr.vt_nrecs++; + m->m_pkthdr.rcvif = ifp; + if (m != m0) + IF_ENQUEUE(ifq, m); + nbytes += npbytes; + } while (nims != NULL); + drt |= crt; + crt = (~crt & REC_FULL); + } + + IGMP_PRINTF(("%s: queued %d ALLOW_NEW, %d BLOCK_OLD\n", __func__, + nallow, nblock)); + + return (nbytes); +} + +static int +igmp_v3_merge_state_changes(struct in_multi *inm, struct ifqueue *ifscq) +{ + struct ifqueue *gq; + struct mbuf *m; /* pending state-change */ + struct mbuf *m0; /* copy of pending state-change */ + struct mbuf *mt; /* last state-change in packet */ + struct mbuf *n; + int docopy, domerge; + u_int recslen; + + INM_LOCK_ASSERT_HELD(inm); + + docopy = 0; + domerge = 0; + recslen = 0; + + /* + * If there are further pending retransmissions, make a writable + * copy of each queued state-change message before merging. + */ + if (inm->inm_scrv > 0) + docopy = 1; + + gq = &inm->inm_scq; +#ifdef IGMP_DEBUG + if (gq->ifq_head == NULL) { + IGMP_PRINTF(("%s: WARNING: queue for inm %p is empty\n", + __func__, inm)); + } +#endif + + /* + * Use IF_REMQUEUE() instead of IF_DEQUEUE() below, since the + * packet might not always be at the head of the ifqueue. + */ + m = gq->ifq_head; + while (m != NULL) { + /* + * Only merge the report into the current packet if + * there is sufficient space to do so; an IGMPv3 report + * packet may only contain 65,535 group records. + * Always use a simple mbuf chain concatentation to do this, + * as large state changes for single groups may have + * allocated clusters. 
+ */ + domerge = 0; + mt = ifscq->ifq_tail; + if (mt != NULL) { + recslen = m_length(m); + + if ((mt->m_pkthdr.vt_nrecs + + m->m_pkthdr.vt_nrecs <= + IGMP_V3_REPORT_MAXRECS) && + (mt->m_pkthdr.len + recslen <= + (inm->inm_ifp->if_mtu - IGMP_LEADINGSPACE))) + domerge = 1; + } + + if (!domerge && IF_QFULL(gq)) { + IGMP_PRINTF(("%s: outbound queue full, skipping whole " + "packet %p\n", __func__, m)); + n = m->m_nextpkt; + if (!docopy) { + IF_REMQUEUE(gq, m); + m_freem(m); + } + m = n; + continue; + } + + if (!docopy) { + IGMP_PRINTF(("%s: dequeueing %p\n", __func__, m)); + n = m->m_nextpkt; + IF_REMQUEUE(gq, m); + m0 = m; + m = n; + } else { + IGMP_PRINTF(("%s: copying %p\n", __func__, m)); + m0 = m_dup(m, M_NOWAIT); + if (m0 == NULL) + return (ENOMEM); + m0->m_nextpkt = NULL; + m = m->m_nextpkt; + } + + if (!domerge) { + IGMP_PRINTF(("%s: queueing %p to ifscq %p)\n", + __func__, m0, ifscq)); + m0->m_pkthdr.rcvif = inm->inm_ifp; + IF_ENQUEUE(ifscq, m0); + } else { + struct mbuf *mtl; /* last mbuf of packet mt */ + + IGMP_PRINTF(("%s: merging %p with ifscq tail %p)\n", + __func__, m0, mt)); + + mtl = m_last(mt); + m0->m_flags &= ~M_PKTHDR; + mt->m_pkthdr.len += recslen; + mt->m_pkthdr.vt_nrecs += + m0->m_pkthdr.vt_nrecs; + + mtl->m_next = m0; + } + } + + return (0); +} + +/* + * Respond to a pending IGMPv3 General Query. 
+ */ +static void +igmp_v3_dispatch_general_query(struct igmp_ifinfo *igi) +{ + struct ifnet *ifp; + struct in_multi *inm; + struct in_multistep step; + int retval, loop; + + IGI_LOCK_ASSERT_HELD(igi); + + VERIFY(igi->igi_version == IGMP_VERSION_3); + + ifp = igi->igi_ifp; + IGI_UNLOCK(igi); + + in_multihead_lock_shared(); + IN_FIRST_MULTI(step, inm); + while (inm != NULL) { + INM_LOCK(inm); + if (inm->inm_ifp != ifp) + goto next; + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + break; + case IGMP_REPORTING_MEMBER: + case IGMP_IDLE_MEMBER: + case IGMP_LAZY_MEMBER: + case IGMP_SLEEPING_MEMBER: + case IGMP_AWAKENING_MEMBER: + inm->inm_state = IGMP_REPORTING_MEMBER; + IGI_LOCK(igi); + retval = igmp_v3_enqueue_group_record(&igi->igi_gq, + inm, 0, 0, 0); + IGI_UNLOCK(igi); + IGMP_PRINTF(("%s: enqueue record = %d\n", + __func__, retval)); + break; + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + case IGMP_LEAVING_MEMBER: + break; + } +next: + INM_UNLOCK(inm); + IN_NEXT_MULTI(step, inm); + } + in_multihead_lock_done(); + + IGI_LOCK(igi); + loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0; + igmp_dispatch_queue(igi, &igi->igi_gq, IGMP_MAX_RESPONSE_BURST, + loop, ifp); + IGI_LOCK_ASSERT_HELD(igi); + /* + * Slew transmission of bursts over 500ms intervals. + */ + if (igi->igi_gq.ifq_head != NULL) { + igi->igi_v3_timer = 1 + IGMP_RANDOM_DELAY( + IGMP_RESPONSE_BURST_INTERVAL); + interface_timers_running = 1; + } +} + +/* + * Transmit the next pending IGMP message in the output queue. + * + * Must not be called with inm_lock or igi_lock held. + */ +void +igmp_sendpkt(struct mbuf *m, struct ifnet *ifp) +{ + struct ip_moptions *imo; + struct mbuf *ipopts, *m0; + int error; + struct route ro; + + IGMP_PRINTF(("%s: transmit %p\n", __func__, m)); + + /* + * Check if the ifnet is still attached. 
+ */ + if (ifp == NULL || !ifnet_is_attached(ifp, 0)) { + IGMP_PRINTF(("%s: dropped %p as ifp u went away.\n", + __func__, m)); + m_freem(m); + OSAddAtomic(1, &ipstat.ips_noroute); + return; + } + + ipopts = igmp_sendra ? m_raopt : NULL; + + imo = ip_allocmoptions(M_WAITOK); + if (imo == NULL) { + m_freem(m); + return; + } + + imo->imo_multicast_ttl = 1; + imo->imo_multicast_vif = -1; +#if MROUTING + imo->imo_multicast_loop = (ip_mrouter != NULL); +#else + imo->imo_multicast_loop = 0; +#endif + + /* + * If the user requested that IGMP traffic be explicitly + * redirected to the loopback interface (e.g. they are running a + * MANET interface and the routing protocol needs to see the + * updates), handle this now. + */ + if (m->m_flags & M_IGMP_LOOP) + imo->imo_multicast_ifp = lo_ifp; + else + imo->imo_multicast_ifp = ifp; + + if (m->m_flags & M_IGMPV2) { + m0 = m; + } else { + m0 = igmp_v3_encap_report(ifp, m); + if (m0 == NULL) { + /* + * If igmp_v3_encap_report() failed, then M_PREPEND() + * already freed the original mbuf chain. + * This means that we don't have to m_freem(m) here. + */ + IGMP_PRINTF(("%s: dropped %p\n", __func__, m)); + IMO_REMREF(imo); + atomic_add_32(&ipstat.ips_odropped, 1); + return; + } + } + + m->m_flags &= ~(M_PROTOFLAGS | M_IGMP_LOOP); + m0->m_pkthdr.rcvif = lo_ifp; +#ifdef MAC + mac_netinet_igmp_send(ifp, m0); +#endif + bzero(&ro, sizeof (ro)); + error = ip_output(m0, ipopts, &ro, 0, imo, NULL); + if (ro.ro_rt != NULL) { + rtfree(ro.ro_rt); + ro.ro_rt = NULL; + } + + IMO_REMREF(imo); + + if (error) { + IGMP_PRINTF(("%s: ip_output(%p) = %d\n", __func__, m0, error)); + return; + } + + IGMPSTAT_INC(igps_snd_reports); + OIGMPSTAT_INC(igps_snd_reports); +} +/* + * Encapsulate an IGMPv3 report. + * + * The internal mbuf flag M_IGMPV3_HDR is used to indicate that the mbuf + * chain has already had its IP/IGMPv3 header prepended. 
In this case + * the function will not attempt to prepend; the lengths and checksums + * will however be re-computed. + * + * Returns a pointer to the new mbuf chain head, or NULL if the + * allocation failed. + */ +static struct mbuf * +igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m) +{ + struct igmp_report *igmp; + struct ip *ip; + int hdrlen, igmpreclen; + + VERIFY((m->m_flags & M_PKTHDR)); + + igmpreclen = m_length(m); + hdrlen = sizeof(struct ip) + sizeof(struct igmp_report); + + if (m->m_flags & M_IGMPV3_HDR) { + igmpreclen -= hdrlen; + } else { + M_PREPEND(m, hdrlen, M_DONTWAIT); + if (m == NULL) + return (NULL); + m->m_flags |= M_IGMPV3_HDR; + } + + IGMP_PRINTF(("%s: igmpreclen is %d\n", __func__, igmpreclen)); + + m->m_data += sizeof(struct ip); + m->m_len -= sizeof(struct ip); + + igmp = mtod(m, struct igmp_report *); + igmp->ir_type = IGMP_v3_HOST_MEMBERSHIP_REPORT; + igmp->ir_rsv1 = 0; + igmp->ir_rsv2 = 0; + igmp->ir_numgrps = htons(m->m_pkthdr.vt_nrecs); + igmp->ir_cksum = 0; + igmp->ir_cksum = in_cksum(m, sizeof(struct igmp_report) + igmpreclen); + m->m_pkthdr.vt_nrecs = 0; + + m->m_data -= sizeof(struct ip); + m->m_len += sizeof(struct ip); + + ip = mtod(m, struct ip *); + ip->ip_tos = IPTOS_PREC_INTERNETCONTROL; + ip->ip_len = hdrlen + igmpreclen; + ip->ip_off = IP_DF; + ip->ip_p = IPPROTO_IGMP; + ip->ip_sum = 0; + + ip->ip_src.s_addr = INADDR_ANY; + + if (m->m_flags & M_IGMP_LOOP) { + struct in_ifaddr *ia; + + IFP_TO_IA(ifp, ia); + if (ia != NULL) { + IFA_LOCK(&ia->ia_ifa); + ip->ip_src = ia->ia_addr.sin_addr; + IFA_UNLOCK(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); + } + } + + ip->ip_dst.s_addr = htonl(INADDR_ALLRPTS_GROUP); + + return (m); +} + +#ifdef IGMP_DEBUG +static const char * +igmp_rec_type_to_str(const int type) +{ + switch (type) { + case IGMP_CHANGE_TO_EXCLUDE_MODE: + return "TO_EX"; + break; + case IGMP_CHANGE_TO_INCLUDE_MODE: + return "TO_IN"; + break; + case IGMP_MODE_IS_EXCLUDE: + return "MODE_EX"; + break; + case 
IGMP_MODE_IS_INCLUDE: + return "MODE_IN"; + break; + case IGMP_ALLOW_NEW_SOURCES: + return "ALLOW_NEW"; + break; + case IGMP_BLOCK_OLD_SOURCES: + return "BLOCK_OLD"; + break; + default: + break; + } + return "unknown"; +} +#endif + +void +igmp_init(void) +{ + + IGMP_PRINTF(("%s: initializing\n", __func__)); + + igmp_timers_are_running = 0; + + /* Setup lock group and attribute for igmp_mtx */ + igmp_mtx_grp_attr = lck_grp_attr_alloc_init(); + igmp_mtx_grp = lck_grp_alloc_init("igmp_mtx", igmp_mtx_grp_attr); + igmp_mtx_attr = lck_attr_alloc_init(); + lck_mtx_init(&igmp_mtx, igmp_mtx_grp, igmp_mtx_attr); + + LIST_INIT(&igi_head); + m_raopt = igmp_ra_alloc(); + + igi_size = sizeof (struct igmp_ifinfo); + igi_zone = zinit(igi_size, IGI_ZONE_MAX * igi_size, + 0, IGI_ZONE_NAME); + if (igi_zone == NULL) { + panic("%s: failed allocating %s", __func__, IGI_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(igi_zone, Z_EXPAND, TRUE); + zone_change(igi_zone, Z_CALLERACCT, FALSE); +} diff --git a/bsd/netinet/igmp.h b/bsd/netinet/igmp.h index 3774cd860..28352317d 100644 --- a/bsd/netinet/igmp.h +++ b/bsd/netinet/igmp.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2010 Apple, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -77,8 +77,11 @@ * MULTICAST Revision: 3.5.1.2 */ +/* Minimum length of any IGMP protocol message. */ +#define IGMP_MINLEN 8 + /* - * IGMP packet format. + * IGMPv1/v2 query and host report format. */ struct igmp { u_char igmp_type; /* version & type of IGMP message */ @@ -87,38 +90,91 @@ struct igmp { struct in_addr igmp_group; /* group address being reported */ }; /* (zero for queries) */ -#define IGMP_MINLEN 8 +/* + * IGMP v3 query format. 
+ */ +struct igmpv3 { + u_char igmp_type; /* version & type of IGMP message */ + u_char igmp_code; /* subtype for routing msgs */ + u_short igmp_cksum; /* IP-style checksum */ + struct in_addr igmp_group; /* group address being reported */ + /* (zero for queries) */ + u_char igmp_misc; /* reserved/suppress/robustness */ + u_char igmp_qqi; /* querier's query interval */ + u_short igmp_numsrc; /* number of sources */ + /*struct in_addr igmp_sources[1];*/ /* source addresses */ +}; +#define IGMP_V3_QUERY_MINLEN 12 +#define IGMP_EXP(x) (((x) >> 4) & 0x07) +#define IGMP_MANT(x) ((x) & 0x0f) +#define IGMP_QRESV(x) (((x) >> 4) & 0x0f) +#define IGMP_SFLAG(x) (((x) >> 3) & 0x01) +#define IGMP_QRV(x) ((x) & 0x07) + +struct igmp_grouprec { + u_char ig_type; /* record type */ + u_char ig_datalen; /* length of auxiliary data */ + u_short ig_numsrc; /* number of sources */ + struct in_addr ig_group; /* group address being reported */ + /*struct in_addr ig_sources[1];*/ /* source addresses */ +}; +#define IGMP_GRPREC_HDRLEN 8 /* - * Message types, including version number. + * IGMPv3 host membership report header. */ -#define IGMP_MEMBERSHIP_QUERY 0x11 /* membership query */ -#define IGMP_V1_MEMBERSHIP_REPORT 0x12 /* Ver. 1 membership report */ -#define IGMP_V2_MEMBERSHIP_REPORT 0x16 /* Ver. 2 membership report */ -#define IGMP_V2_LEAVE_GROUP 0x17 /* Leave-group message */ +struct igmp_report { + u_char ir_type; /* IGMP_v3_HOST_MEMBERSHIP_REPORT */ + u_char ir_rsv1; /* must be zero */ + u_short ir_cksum; /* checksum */ + u_short ir_rsv2; /* must be zero */ + u_short ir_numgrps; /* number of group records */ + /*struct igmp_grouprec ir_groups[1];*/ /* group records */ +}; +#define IGMP_V3_REPORT_MINLEN 8 +#define IGMP_V3_REPORT_MAXRECS 65535 +/* + * Message types, including version number. + */ +#define IGMP_HOST_MEMBERSHIP_QUERY 0x11 /* membership query */ +#define IGMP_v1_HOST_MEMBERSHIP_REPORT 0x12 /* Ver. 
1 membership report */ #define IGMP_DVMRP 0x13 /* DVMRP routing message */ -#define IGMP_PIM 0x14 /* PIM routing message */ - -#define IGMP_MTRACE_RESP 0x1e /* traceroute resp.(to sender)*/ -#define IGMP_MTRACE 0x1f /* mcast traceroute messages */ +#define IGMP_PIM 0x14 /* PIMv1 message (historic) */ +#define IGMP_v2_HOST_MEMBERSHIP_REPORT 0x16 /* Ver. 2 membership report */ +#define IGMP_HOST_LEAVE_MESSAGE 0x17 /* Leave-group message */ +#define IGMP_MTRACE_REPLY 0x1e /* mtrace(8) reply */ +#define IGMP_MTRACE_QUERY 0x1f /* mtrace(8) probe */ +#define IGMP_v3_HOST_MEMBERSHIP_REPORT 0x22 /* Ver. 3 membership report */ -#define IGMP_MAX_HOST_REPORT_DELAY 10 /* max delay for response to */ - /* query (in seconds) according */ - /* to RFC1112 */ +/* + * IGMPv3 report modes. + */ +#define IGMP_DO_NOTHING 0 /* don't send a record */ +#define IGMP_MODE_IS_INCLUDE 1 /* MODE_IN */ +#define IGMP_MODE_IS_EXCLUDE 2 /* MODE_EX */ +#define IGMP_CHANGE_TO_INCLUDE_MODE 3 /* TO_IN */ +#define IGMP_CHANGE_TO_EXCLUDE_MODE 4 /* TO_EX */ +#define IGMP_ALLOW_NEW_SOURCES 5 /* ALLOW_NEW */ +#define IGMP_BLOCK_OLD_SOURCES 6 /* BLOCK_OLD */ +/* + * IGMPv3 query types. + */ +#define IGMP_V3_GENERAL_QUERY 1 +#define IGMP_V3_GROUP_QUERY 2 +#define IGMP_V3_GROUP_SOURCE_QUERY 3 -#define IGMP_TIMER_SCALE 10 /* denotes that the igmp code field */ - /* specifies time in 10th of seconds*/ +/* + * Maximum report interval for IGMP v1/v2 host membership reports [RFC 1112] + */ +#define IGMP_V1V2_MAX_RI 10 +#define IGMP_MAX_HOST_REPORT_DELAY IGMP_V1V2_MAX_RI /* - * The following four defininitions are for backwards compatibility. - * They should be removed as soon as all applications are updated to - * use the new constant names. + * IGMP_TIMER_SCALE denotes that the igmp code field specifies + * time in tenths of a second. 
*/ -#define IGMP_HOST_MEMBERSHIP_QUERY IGMP_MEMBERSHIP_QUERY -#define IGMP_HOST_MEMBERSHIP_REPORT IGMP_V1_MEMBERSHIP_REPORT -#define IGMP_HOST_NEW_MEMBERSHIP_REPORT IGMP_V2_MEMBERSHIP_REPORT -#define IGMP_HOST_LEAVE_MESSAGE IGMP_V2_LEAVE_GROUP +#define IGMP_TIMER_SCALE 10 #endif /* _NETINET_IGMP_H_ */ diff --git a/bsd/netinet/igmp_var.h b/bsd/netinet/igmp_var.h index 5e9f7e983..8fdaab868 100644 --- a/bsd/netinet/igmp_var.h +++ b/bsd/netinet/igmp_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -68,7 +68,6 @@ #define _NETINET_IGMP_VAR_H_ #include - /* * Internet Group Management Protocol (IGMP), * implementation-specific definitions. @@ -78,6 +77,48 @@ * MULTICAST Revision: 3.5.1.3 */ +struct igmpstat_v3 { + /* + * Structure header (to insulate ABI changes). + */ + uint32_t igps_version; /* version of this structure */ + uint32_t igps_len; /* length of this structure */ + /* + * Message statistics. + */ + uint64_t igps_rcv_total; /* total IGMP messages received */ + uint64_t igps_rcv_tooshort; /* received with too few bytes */ + uint64_t igps_rcv_badttl; /* received with ttl other than 1 */ + uint64_t igps_rcv_badsum; /* received with bad checksum */ + /* + * Query statistics. + */ + uint64_t igps_rcv_v1v2_queries; /* received IGMPv1/IGMPv2 queries */ + uint64_t igps_rcv_v3_queries; /* received IGMPv3 queries */ + uint64_t igps_rcv_badqueries; /* received invalid queries */ + uint64_t igps_rcv_gen_queries; /* received general queries */ + uint64_t igps_rcv_group_queries;/* received group queries */ + uint64_t igps_rcv_gsr_queries; /* received group-source queries */ + uint64_t igps_drop_gsr_queries; /* dropped group-source queries */ + /* + * Report statistics. 
+ */ + uint64_t igps_rcv_reports; /* received membership reports */ + uint64_t igps_rcv_badreports; /* received invalid reports */ + uint64_t igps_rcv_ourreports; /* received reports for our groups */ + uint64_t igps_rcv_nora; /* received w/o Router Alert option */ + uint64_t igps_snd_reports; /* sent membership reports */ + /* + * Padding for future additions. + */ + uint64_t __igps_pad[4]; +} __attribute__((aligned(8))); + +/* + * Old IGMPv2 stat structure for backward compatibility + * + */ + struct igmpstat { u_int igps_rcv_total; /* total IGMP messages received */ u_int igps_rcv_tooshort; /* received with too few bytes */ @@ -90,41 +131,189 @@ struct igmpstat { u_int igps_snd_reports; /* sent membership reports */ }; -#ifdef KERNEL_PRIVATE -#ifdef KERNEL +#define IGPS_VERSION_3 3 +#define IGPS_VERSION3_LEN 168 + +#ifdef PRIVATE +/* + * Per-interface IGMP router version information. + */ +#ifndef XNU_KERNEL_PRIVATE +struct igmp_ifinfo { +#else +struct igmp_ifinfo_u { +#endif /* XNU_KERNEL_PRIVATE */ + uint32_t igi_ifindex; /* interface this instance belongs to */ + uint32_t igi_version; /* IGMPv3 Host Compatibility Mode */ + uint32_t igi_v1_timer; /* IGMPv1 Querier Present timer (s) */ + uint32_t igi_v2_timer; /* IGMPv2 Querier Present timer (s) */ + uint32_t igi_v3_timer; /* IGMPv3 General Query (interface) timer (s)*/ + uint32_t igi_flags; /* IGMP per-interface flags */ + uint32_t igi_rv; /* IGMPv3 Robustness Variable */ + uint32_t igi_qi; /* IGMPv3 Query Interval (s) */ + uint32_t igi_qri; /* IGMPv3 Query Response Interval (s) */ + uint32_t igi_uri; /* IGMPv3 Unsolicited Report Interval (s) */ +}; + +#define IGIF_SILENT 0x00000001 /* Do not use IGMP on this ifp */ +#define IGIF_LOOPBACK 0x00000002 /* Send IGMP reports to loopback */ + +/* + * IGMP version tag. 
+ */ +#define IGMP_VERSION_NONE 0 /* Invalid */ +#define IGMP_VERSION_1 1 +#define IGMP_VERSION_2 2 +#define IGMP_VERSION_3 3 /* Default */ +#endif /* PRIVATE */ + +#ifdef XNU_KERNEL_PRIVATE +#include +#define IGMP_DEBUG 1 +#ifdef IGMP_DEBUG +extern char * inet_ntoa(struct in_addr); +extern int igmp_debug; + +#define IGMP_PRINTF(x) do { if (igmp_debug) printf x; } while (0) +#else +#define IGMP_PRINTF(x) +#endif + +#define OIGMPSTAT_ADD(name, val) atomic_add_32(&igmpstat.name , (val)) +#define OIGMPSTAT_INC(name) OIGMPSTAT_ADD(name, 1) + +#define IGMPSTAT_ADD(name, val) atomic_add_64(&igmpstat_v3.name , (val)) +#define IGMPSTAT_INC(name) IGMPSTAT_ADD(name, 1) + #define IGMP_RANDOM_DELAY(X) (random() % (X) + 1) +#define IGMP_MAX_STATE_CHANGES 24 /* Max pending changes per group */ + /* - * States for IGMPv2's leave processing + * IGMP per-group states. */ -#define IGMP_OTHERMEMBER 0 -#define IGMP_IREPORTEDLAST 1 +#define IGMP_NOT_MEMBER 0 /* Can garbage collect in_multi */ +#define IGMP_SILENT_MEMBER 1 /* Do not perform IGMP for group */ +#define IGMP_REPORTING_MEMBER 2 /* IGMPv1/2/3 we are reporter */ +#define IGMP_IDLE_MEMBER 3 /* IGMPv1/2 we reported last */ +#define IGMP_LAZY_MEMBER 4 /* IGMPv1/2 other member reporting */ +#define IGMP_SLEEPING_MEMBER 5 /* IGMPv1/2 start query response */ +#define IGMP_AWAKENING_MEMBER 6 /* IGMPv1/2 group timer will start */ +#define IGMP_G_QUERY_PENDING_MEMBER 7 /* IGMPv3 group query pending */ +#define IGMP_SG_QUERY_PENDING_MEMBER 8 /* IGMPv3 source query pending */ +#define IGMP_LEAVING_MEMBER 9 /* IGMPv3 dying gasp (pending last */ + /* retransmission of INCLUDE {}) */ +/* + * IGMPv3 protocol control variables. 
+ */ +#define IGMP_RV_INIT 2 /* Robustness Variable */ +#define IGMP_RV_MIN 1 +#define IGMP_RV_MAX 7 + +#define IGMP_QI_INIT 125 /* Query Interval (s) */ +#define IGMP_QI_MIN 1 +#define IGMP_QI_MAX 255 + +#define IGMP_QRI_INIT 10 /* Query Response Interval (s) */ +#define IGMP_QRI_MIN 1 +#define IGMP_QRI_MAX 255 + +#define IGMP_URI_INIT 3 /* Unsolicited Report Interval (s) */ +#define IGMP_URI_MIN 0 +#define IGMP_URI_MAX 10 + +#define IGMP_MAX_G_GS_PACKETS 8 /* # of packets to answer G/GS */ +#define IGMP_MAX_STATE_CHANGE_PACKETS 8 /* # of packets per state change */ +#define IGMP_MAX_RESPONSE_PACKETS 16 /* # of packets for general query */ +#define IGMP_MAX_RESPONSE_BURST 4 /* # of responses to send at once */ +#define IGMP_RESPONSE_BURST_INTERVAL (PR_SLOWHZ) /* 500ms */ /* - * We must remember what version the subnet's querier is. - * We conveniently use the IGMP message type for the proper - * membership report to keep this state. + * IGMP-specific mbuf flags. */ -#define IGMP_V1_ROUTER IGMP_V1_MEMBERSHIP_REPORT -#define IGMP_V2_ROUTER IGMP_V2_MEMBERSHIP_REPORT +#define M_IGMPV2 M_PROTO1 /* Packet is IGMPv2 */ +#define M_IGMPV3_HDR M_PROTO2 /* Packet has IGMPv3 headers */ +#define M_GROUPREC M_PROTO3 /* mbuf chain is a group record */ +#define M_IGMP_LOOP M_LOOP /* transmit on loif, not real ifp */ + +/* + * Default amount of leading space for IGMPv3 to allocate at the + * beginning of its mbuf packet chains, to avoid fragmentation and + * unnecessary allocation of leading mbufs. 
+ */ +#define RAOPT_LEN 4 /* Length of IP Router Alert option */ +#define IGMP_LEADINGSPACE \ + (sizeof(struct ip) + RAOPT_LEN + sizeof(struct igmp_report)) + +struct igmp_ifinfo { + decl_lck_mtx_data(, igi_lock); + uint32_t igi_refcnt; /* reference count */ + uint32_t igi_debug; /* see ifa_debug flags */ + LIST_ENTRY(igmp_ifinfo) igi_link; + struct ifnet *igi_ifp; /* interface this instance belongs to */ + uint32_t igi_version; /* IGMPv3 Host Compatibility Mode */ + uint32_t igi_v1_timer; /* IGMPv1 Querier Present timer (s) */ + uint32_t igi_v2_timer; /* IGMPv2 Querier Present timer (s) */ + uint32_t igi_v3_timer; /* IGMPv3 General Query (interface) timer (s)*/ + uint32_t igi_flags; /* IGMP per-interface flags */ + uint32_t igi_rv; /* IGMPv3 Robustness Variable */ + uint32_t igi_qi; /* IGMPv3 Query Interval (s) */ + uint32_t igi_qri; /* IGMPv3 Query Response Interval (s) */ + uint32_t igi_uri; /* IGMPv3 Unsolicited Report Interval (s) */ + SLIST_HEAD(,in_multi) igi_relinmhead; /* released groups */ + struct ifqueue igi_gq; /* queue of general query responses */ + struct ifqueue igi_v2q; /* queue of v1/v2 packets */ +}; + +#define IGI_LOCK_ASSERT_HELD(_igi) \ + lck_mtx_assert(&(_igi)->igi_lock, LCK_MTX_ASSERT_OWNED) + +#define IGI_LOCK_ASSERT_NOTHELD(_igi) \ + lck_mtx_assert(&(_igi)->igi_lock, LCK_MTX_ASSERT_NOTOWNED) + +#define IGI_LOCK(_igi) \ + lck_mtx_lock(&(_igi)->igi_lock) + +#define IGI_LOCK_SPIN(_igi) \ + lck_mtx_lock_spin(&(_igi)->igi_lock) + +#define IGI_CONVERT_LOCK(_igi) do { \ + IGI_LOCK_ASSERT_HELD(_igi); \ + lck_mtx_convert_spin(&(_igi)->igi_lock); \ +} while (0) + +#define IGI_UNLOCK(_igi) \ + lck_mtx_unlock(&(_igi)->igi_lock) + +#define IGI_ADDREF(_igi) \ + igi_addref(_igi, 0) + +#define IGI_ADDREF_LOCKED(_igi) \ + igi_addref(_igi, 1) + +#define IGI_REMREF(_igi) \ + igi_remref(_igi) /* - * Revert to new router if we haven't heard from an old router in - * this amount of time. + * Per-link IGMP context. 
*/ -#define IGMP_AGE_THRESHOLD 540 +#define IGMP_IFINFO(ifp) ((ifp)->if_igi) -void igmp_init(void) __attribute__((section("__TEXT, initcode"))); -void igmp_input(struct mbuf *, int); -int igmp_joingroup(struct in_multi *); -void igmp_leavegroup(struct in_multi *); -void igmp_fasttimo(void); -void igmp_slowtimo(void); +extern void igmp_init(void) __attribute__((section("__TEXT, initcode"))); +extern int igmp_change_state(struct in_multi *); +extern struct igmp_ifinfo *igmp_domifattach(struct ifnet *, int); +extern void igmp_domifreattach(struct igmp_ifinfo *); +extern void igmp_domifdetach(struct ifnet *); +extern void igmp_input(struct mbuf *, int); +extern int igmp_joingroup(struct in_multi *); +extern void igmp_leavegroup(struct in_multi *); +extern void igmp_slowtimo(void); +extern void igi_addref(struct igmp_ifinfo *, int); +extern void igi_remref(struct igmp_ifinfo *); SYSCTL_DECL(_net_inet_igmp); -#endif /* KERNEL */ -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ /* * Names for IGMP sysctl objects @@ -132,11 +321,11 @@ SYSCTL_DECL(_net_inet_igmp); #define IGMPCTL_STATS 1 /* statistics (read-only) */ #define IGMPCTL_MAXID 2 -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #define IGMPCTL_NAMES { \ { 0, 0 }, \ { "stats", CTLTYPE_STRUCT }, \ } -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #endif diff --git a/bsd/netinet/in.c b/bsd/netinet/in.c index 32b8c64f6..85b9d38af 100644 --- a/bsd/netinet/in.c +++ b/bsd/netinet/in.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -72,6 +72,7 @@ #include #include #include +#include #include #include @@ -122,42 +123,66 @@ static void in_iahash_remove(struct in_ifaddr *); static void in_iahash_insert(struct in_ifaddr *); static void in_iahash_insert_ptp(struct in_ifaddr *); static struct in_ifaddr *in_ifaddr_alloc(int); +static void in_ifaddr_attached(struct ifaddr *); +static void in_ifaddr_detached(struct ifaddr *); static void in_ifaddr_free(struct ifaddr *); static void in_ifaddr_trace(struct ifaddr *, int); static int subnetsarelocal = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, subnets_are_local, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, subnets_are_local, CTLFLAG_RW | CTLFLAG_LOCKED, &subnetsarelocal, 0, ""); -struct in_multihead in_multihead; /* XXX BSS initialization */ - /* Track whether or not the SIOCARPIPLL ioctl has been called */ __private_extern__ u_int32_t ipv4_ll_arp_aware = 0; +#define INIFA_TRACE_HIST_SIZE 32 /* size of trace history */ + +/* For gdb */ +__private_extern__ unsigned int inifa_trace_hist_size = INIFA_TRACE_HIST_SIZE; + struct in_ifaddr_dbg { struct in_ifaddr inifa; /* in_ifaddr */ struct in_ifaddr inifa_old; /* saved in_ifaddr */ - u_int16_t inifa_refhold_cnt; /* # of ifaref */ - u_int16_t inifa_refrele_cnt; /* # of ifafree */ + u_int16_t inifa_refhold_cnt; /* # of IFA_ADDREF */ + u_int16_t inifa_refrele_cnt; /* # of IFA_REMREF */ /* * Alloc and free callers. */ ctrace_t inifa_alloc; ctrace_t inifa_free; /* - * Circular lists of ifaref and ifafree callers. + * Circular lists of IFA_ADDREF and IFA_REMREF callers. 
*/ - ctrace_t inifa_refhold[CTRACE_HIST_SIZE]; - ctrace_t inifa_refrele[CTRACE_HIST_SIZE]; + ctrace_t inifa_refhold[INIFA_TRACE_HIST_SIZE]; + ctrace_t inifa_refrele[INIFA_TRACE_HIST_SIZE]; + /* + * Trash list linkage + */ + TAILQ_ENTRY(in_ifaddr_dbg) inifa_trash_link; }; -static unsigned int inifa_debug; /* debug flags */ +/* List of trash in_ifaddr entries protected by inifa_trash_lock */ +static TAILQ_HEAD(, in_ifaddr_dbg) inifa_trash_head; +static decl_lck_mtx_data(, inifa_trash_lock); + +#if DEBUG +static unsigned int inifa_debug = 1; /* debugging (enabled) */ +#else +static unsigned int inifa_debug; /* debugging (disabled) */ +#endif /* !DEBUG */ static unsigned int inifa_size; /* size of zone element */ static struct zone *inifa_zone; /* zone for in_ifaddr */ #define INIFA_ZONE_MAX 64 /* maximum elements in zone */ #define INIFA_ZONE_NAME "in_ifaddr" /* zone name */ +/* + * Return 1 if the address is + * - loopback + * - unicast or multicast link local + * - routed via a link level gateway + * - belongs to a directly connected (sub)net + */ int inaddr_local(struct in_addr in) { @@ -165,20 +190,27 @@ inaddr_local(struct in_addr in) struct sockaddr_in sin; int local = 0; - sin.sin_family = AF_INET; - sin.sin_len = sizeof (sin); - sin.sin_addr = in; - rt = rtalloc1((struct sockaddr *)&sin, 0, 0); - - if (rt != NULL) { - RT_LOCK_SPIN(rt); - if (rt->rt_gateway->sa_family == AF_LINK || - (rt->rt_ifp->if_flags & IFF_LOOPBACK)) + if (ntohl(in.s_addr) == INADDR_LOOPBACK || IN_LINKLOCAL(ntohl(in.s_addr))) { + local = 1; + } else if (ntohl(in.s_addr) >= INADDR_UNSPEC_GROUP && + ntohl(in.s_addr) <= INADDR_MAX_LOCAL_GROUP) { local = 1; - RT_UNLOCK(rt); - rtfree(rt); } else { - local = in_localaddr(in); + sin.sin_family = AF_INET; + sin.sin_len = sizeof (sin); + sin.sin_addr = in; + rt = rtalloc1((struct sockaddr *)&sin, 0, 0); + + if (rt != NULL) { + RT_LOCK_SPIN(rt); + if (rt->rt_gateway->sa_family == AF_LINK || + (rt->rt_ifp->if_flags & IFF_LOOPBACK)) + local = 1; + 
RT_UNLOCK(rt); + rtfree(rt); + } else { + local = in_localaddr(in); + } } return (local); } @@ -198,20 +230,28 @@ in_localaddr(struct in_addr in) if (subnetsarelocal) { lck_rw_lock_shared(in_ifaddr_rwlock); for (ia = in_ifaddrhead.tqh_first; ia; - ia = ia->ia_link.tqe_next) + ia = ia->ia_link.tqe_next) { + IFA_LOCK(&ia->ia_ifa); if ((i & ia->ia_netmask) == ia->ia_net) { + IFA_UNLOCK(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); return (1); } + IFA_UNLOCK(&ia->ia_ifa); + } lck_rw_done(in_ifaddr_rwlock); } else { lck_rw_lock_shared(in_ifaddr_rwlock); for (ia = in_ifaddrhead.tqh_first; ia; - ia = ia->ia_link.tqe_next) + ia = ia->ia_link.tqe_next) { + IFA_LOCK(&ia->ia_ifa); if ((i & ia->ia_subnetmask) == ia->ia_subnet) { + IFA_UNLOCK(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); return (1); } + IFA_UNLOCK(&ia->ia_ifa); + } lck_rw_done(in_ifaddr_rwlock); } return (0); @@ -292,6 +332,18 @@ in_len2mask(struct in_addr *mask, int len) static int in_interfaces; /* number of external internet interfaces */ +static int +in_domifattach(struct ifnet *ifp) +{ + int error; + + if ((error = proto_plumb(PF_INET, ifp)) && error != EEXIST) + log(LOG_ERR, "%s: proto_plumb returned %d if=%s%d\n", + __func__, error, ifp->if_name, ifp->if_unit); + + return (error); +} + /* * Generic internet control operations (ioctl's). * Ifp is 0 if not an interface-specific ioctl. 
@@ -331,6 +383,8 @@ in_control( struct kev_msg ev_msg; struct kev_in_data in_event_data; + bzero(&in_event_data, sizeof(struct kev_in_data)); + bzero(&ev_msg, sizeof(struct kev_msg)); switch (cmd) { case SIOCALIFADDR: case SIOCDLIFADDR: @@ -354,19 +408,24 @@ in_control( for (iap = in_ifaddrhead.tqh_first; iap; iap = iap->ia_link.tqe_next) if (iap->ia_ifp == ifp) { + IFA_LOCK(&iap->ia_ifa); if (((struct sockaddr_in *)&ifr->ifr_addr)->sin_addr.s_addr == iap->ia_addr.sin_addr.s_addr) { ia = iap; + IFA_UNLOCK(&iap->ia_ifa); break; } else if (ia == NULL) { ia = iap; - if (ifr->ifr_addr.sa_family != AF_INET) + if (ifr->ifr_addr.sa_family != AF_INET) { + IFA_UNLOCK(&iap->ia_ifa); break; + } } + IFA_UNLOCK(&iap->ia_ifa); } /* take a reference on ia before releasing lock */ if (ia != NULL) { - ifaref(&ia->ia_ifa); + IFA_ADDREF(&ia->ia_ifa); } lck_rw_done(in_ifaddr_rwlock); } @@ -393,19 +452,19 @@ in_control( lck_rw_lock_shared(in_ifaddr_rwlock); for (oia = ia; ia; ia = ia->ia_link.tqe_next) { + IFA_LOCK(&ia->ia_ifa); if (ia->ia_ifp == ifp && ia->ia_addr.sin_addr.s_addr == - ifra->ifra_addr.sin_addr.s_addr) + ifra->ifra_addr.sin_addr.s_addr) { + IFA_ADDREF_LOCKED(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); break; - } - /* take a reference on ia before releasing lock */ - if (ia != NULL && ia != oia) { - ifaref(&ia->ia_ifa); + } + IFA_UNLOCK(&ia->ia_ifa); } lck_rw_done(in_ifaddr_rwlock); - if (oia != NULL && oia != ia) { - ifafree(&oia->ia_ifa); - } + if (oia != NULL) + IFA_REMREF(&oia->ia_ifa); if ((ifp->if_flags & IFF_POINTOPOINT) && (cmd == SIOCAIFADDR) && (ifra->ifra_dstaddr.sin_addr.s_addr @@ -426,7 +485,13 @@ in_control( case SIOCSIFADDR: case SIOCSIFNETMASK: case SIOCSIFDSTADDR: - if ((so->so_state & SS_PRIV) == 0) { + /* socket is NULL if called from in_purgeaddrs() */ + if (so != NULL && (so->so_state & SS_PRIV) == 0) { + error = EPERM; + goto done; + } + /* in case it's NULL, make sure it came from the kernel */ + if (so == NULL && p != kernproc) { error = EPERM; goto 
done; } @@ -439,21 +504,22 @@ in_control( error = EINVAL; goto done; } - if (ia == (struct in_ifaddr *)0) { + if (ia == NULL) { ia = in_ifaddr_alloc(M_WAITOK); - if (ia == (struct in_ifaddr *)NULL) { + if (ia == NULL) { error = ENOBUFS; goto done; } - IA_HASH_INIT(ia); + ifnet_lock_exclusive(ifp); ifa = &ia->ia_ifa; + IFA_LOCK(ifa); /* Hold a reference for this routine */ - ifaref(ifa); + IFA_ADDREF_LOCKED(ifa); + IA_HASH_INIT(ia); ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr; ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask; ia->ia_sockmask.sin_len = 8; - ifnet_lock_exclusive(ifp); if (ifp->if_flags & IFF_BROADCAST) { ia->ia_broadaddr.sin_len = sizeof(ia->ia_addr); ia->ia_broadaddr.sin_family = AF_INET; @@ -463,22 +529,25 @@ in_control( in_interfaces++; /* if_attach_ifa() holds a reference for ifa_link */ if_attach_ifa(ifp, ifa); + /* + * If we have to go through in_ifinit(), make sure + * to avoid installing route(s) based on this address + * via PFC_IFUP event, before the link resolver (ARP) + * initializes it. 
+ */ + if (cmd == SIOCAIFADDR || cmd == SIOCSIFADDR) + ifa->ifa_debug |= IFD_NOTREADY; + IFA_UNLOCK(ifa); ifnet_lock_done(ifp); lck_rw_lock_exclusive(in_ifaddr_rwlock); /* Hold a reference for ia_link */ - ifaref(ifa); + IFA_ADDREF(ifa); TAILQ_INSERT_TAIL(&in_ifaddrhead, ia, ia_link); lck_rw_done(in_ifaddr_rwlock); - - /* Generic protocol plumbing */ - - if ((error = proto_plumb(PF_INET, ifp))) { - if (error != EEXIST) { - kprintf("in.c: warning can't plumb proto if=%s%d type %d error=%d\n", - ifp->if_name, ifp->if_unit, ifp->if_type, error); - } - error = 0; /*discard error, can be cold with unsupported interfaces */ - } + error = in_domifattach(ifp); + /* discard error,can be cold with unsupported interfaces */ + if (error) + error = 0; } break; @@ -531,7 +600,9 @@ in_control( break; case SIOCGIFADDR: + IFA_LOCK(&ia->ia_ifa); *((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_addr; + IFA_UNLOCK(&ia->ia_ifa); break; case SIOCGIFBRDADDR: @@ -539,7 +610,9 @@ in_control( error = EINVAL; break; } + IFA_LOCK(&ia->ia_ifa); *((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_broadaddr; + IFA_UNLOCK(&ia->ia_ifa); break; case SIOCGIFDSTADDR: @@ -547,11 +620,15 @@ in_control( error = EINVAL; break; } + IFA_LOCK(&ia->ia_ifa); *((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_dstaddr; + IFA_UNLOCK(&ia->ia_ifa); break; case SIOCGIFNETMASK: + IFA_LOCK(&ia->ia_ifa); *((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_sockmask; + IFA_UNLOCK(&ia->ia_ifa); break; case SIOCSIFDSTADDR: @@ -559,23 +636,28 @@ in_control( error = EINVAL; break; } + IFA_LOCK(&ia->ia_ifa); oldaddr = ia->ia_dstaddr; ia->ia_dstaddr = *(struct sockaddr_in *)&ifr->ifr_dstaddr; if (ia->ia_dstaddr.sin_family == AF_INET) ia->ia_dstaddr.sin_len = sizeof (struct sockaddr_in); + IFA_UNLOCK(&ia->ia_ifa); error = ifnet_ioctl(ifp, PF_INET, SIOCSIFDSTADDR, ia); + IFA_LOCK(&ia->ia_ifa); if (error == EOPNOTSUPP) { error = 0; } if (error) { ia->ia_dstaddr = oldaddr; + IFA_UNLOCK(&ia->ia_ifa); break; } + 
IFA_LOCK_ASSERT_HELD(&ia->ia_ifa); ev_msg.vendor_code = KEV_VENDOR_APPLE; ev_msg.kev_class = KEV_NETWORK_CLASS; ev_msg.kev_subclass = KEV_INET_SUBCLASS; - + ev_msg.event_code = KEV_INET_SIFDSTADDR; if (ia->ia_ifa.ifa_dstaddr) @@ -590,6 +672,7 @@ in_control( in_event_data.ia_subnet = ia->ia_subnet; in_event_data.ia_subnetmask = ia->ia_subnetmask; in_event_data.ia_netbroadcast = ia->ia_netbroadcast; + IFA_UNLOCK(&ia->ia_ifa); strncpy(&in_event_data.link_data.if_name[0], ifp->if_name, IFNAMSIZ); in_event_data.link_data.if_family = ifp->if_family; in_event_data.link_data.if_unit = (u_int32_t) ifp->if_unit; @@ -600,14 +683,22 @@ in_control( kev_post_msg(&ev_msg); - + lck_mtx_lock(rnh_lock); + IFA_LOCK(&ia->ia_ifa); if (ia->ia_flags & IFA_ROUTE) { ia->ia_ifa.ifa_dstaddr = (struct sockaddr *)&oldaddr; - rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST); + IFA_UNLOCK(&ia->ia_ifa); + rtinit_locked(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST); + IFA_LOCK(&ia->ia_ifa); ia->ia_ifa.ifa_dstaddr = - (struct sockaddr *)&ia->ia_dstaddr; - rtinit(&(ia->ia_ifa), (int)RTM_ADD, RTF_HOST|RTF_UP); + (struct sockaddr *)&ia->ia_dstaddr; + IFA_UNLOCK(&ia->ia_ifa); + rtinit_locked(&(ia->ia_ifa), (int)RTM_ADD, + RTF_HOST|RTF_UP); + } else { + IFA_UNLOCK(&ia->ia_ifa); } + lck_mtx_unlock(rnh_lock); break; case SIOCSIFBRDADDR: @@ -615,12 +706,13 @@ in_control( error = EINVAL; break; } + IFA_LOCK(&ia->ia_ifa); ia->ia_broadaddr = *(struct sockaddr_in *)&ifr->ifr_broadaddr; ev_msg.vendor_code = KEV_VENDOR_APPLE; ev_msg.kev_class = KEV_NETWORK_CLASS; ev_msg.kev_subclass = KEV_INET_SUBCLASS; - + ev_msg.event_code = KEV_INET_SIFBRDADDR; if (ia->ia_ifa.ifa_dstaddr) @@ -635,6 +727,7 @@ in_control( in_event_data.ia_subnet = ia->ia_subnet; in_event_data.ia_subnetmask = ia->ia_subnetmask; in_event_data.ia_netbroadcast = ia->ia_netbroadcast; + IFA_UNLOCK(&ia->ia_ifa); strncpy(&in_event_data.link_data.if_name[0], ifp->if_name, IFNAMSIZ); in_event_data.link_data.if_family = ifp->if_family; 
in_event_data.link_data.if_unit = (u_int32_t) ifp->if_unit; @@ -661,34 +754,41 @@ in_control( break; case SIOCPROTOATTACH: - error = proto_plumb(PF_INET, ifp); + error = in_domifattach(ifp); break; - + case SIOCPROTODETACH: - // if an ip address is still present, refuse to detach + /* + * If an IPv4 address is still present, refuse to detach. + */ ifnet_lock_shared(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) - if (ifa->ifa_addr->sa_family == AF_INET) + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family == AF_INET) { + IFA_UNLOCK(ifa); break; + } + IFA_UNLOCK(ifa); + } ifnet_lock_done(ifp); - if (ifa != 0) { + if (ifa != NULL) { error = EBUSY; break; } error = proto_unplumb(PF_INET, ifp); break; - case SIOCSIFNETMASK: { u_long i; - + i = ifra->ifra_addr.sin_addr.s_addr; + IFA_LOCK(&ia->ia_ifa); ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr = i); ev_msg.vendor_code = KEV_VENDOR_APPLE; ev_msg.kev_class = KEV_NETWORK_CLASS; ev_msg.kev_subclass = KEV_INET_SUBCLASS; - + ev_msg.event_code = KEV_INET_SIFNETMASK; if (ia->ia_ifa.ifa_dstaddr) @@ -703,6 +803,7 @@ in_control( in_event_data.ia_subnet = ia->ia_subnet; in_event_data.ia_subnetmask = ia->ia_subnetmask; in_event_data.ia_netbroadcast = ia->ia_netbroadcast; + IFA_UNLOCK(&ia->ia_ifa); strncpy(&in_event_data.link_data.if_name[0], ifp->if_name, IFNAMSIZ); in_event_data.link_data.if_family = ifp->if_family; in_event_data.link_data.if_unit = (u_int32_t) ifp->if_unit; @@ -720,6 +821,7 @@ in_control( hostIsNew = 1; error = 0; + IFA_LOCK(&ia->ia_ifa); if (ia->ia_addr.sin_family == AF_INET) { if (ifra->ifra_addr.sin_len == 0) { ifra->ifra_addr = ia->ia_addr; @@ -729,7 +831,9 @@ in_control( hostIsNew = 0; } if (ifra->ifra_mask.sin_len) { + IFA_UNLOCK(&ia->ia_ifa); in_ifscrub(ifp, ia, 0); + IFA_LOCK(&ia->ia_ifa); ia->ia_sockmask = ifra->ifra_mask; ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr); @@ -737,19 +841,25 @@ in_control( } if ((ifp->if_flags 
& IFF_POINTOPOINT) && (ifra->ifra_dstaddr.sin_family == AF_INET)) { + IFA_UNLOCK(&ia->ia_ifa); in_ifscrub(ifp, ia, 0); + IFA_LOCK(&ia->ia_ifa); ia->ia_dstaddr = ifra->ifra_dstaddr; ia->ia_dstaddr.sin_len = sizeof (struct sockaddr_in); maskIsNew = 1; /* We lie; but the effect's the same */ } if (ifra->ifra_addr.sin_family == AF_INET && (hostIsNew || maskIsNew)) { + IFA_UNLOCK(&ia->ia_ifa); error = in_ifinit(ifp, ia, &ifra->ifra_addr, 0); + } else { + IFA_UNLOCK(&ia->ia_ifa); } #if PF if (!error) (void) pf_ifaddr_hook(ifp, cmd); #endif /* PF */ + IFA_LOCK(&ia->ia_ifa); if ((ifp->if_flags & IFF_BROADCAST) && (ifra->ifra_broadaddr.sin_family == AF_INET)) ia->ia_broadaddr = ifra->ifra_broadaddr; @@ -780,6 +890,7 @@ in_control( in_event_data.ia_subnet = ia->ia_subnet; in_event_data.ia_subnetmask = ia->ia_subnetmask; in_event_data.ia_netbroadcast = ia->ia_netbroadcast; + IFA_UNLOCK(&ia->ia_ifa); strncpy(&in_event_data.link_data.if_name[0], ifp->if_name, IFNAMSIZ); in_event_data.link_data.if_family = ifp->if_family; in_event_data.link_data.if_unit = (u_int32_t) ifp->if_unit; @@ -789,6 +900,8 @@ in_control( ev_msg.dv[1].data_length = 0; kev_post_msg(&ev_msg); + } else { + IFA_UNLOCK(&ia->ia_ifa); } break; @@ -804,9 +917,10 @@ in_control( ev_msg.vendor_code = KEV_VENDOR_APPLE; ev_msg.kev_class = KEV_NETWORK_CLASS; ev_msg.kev_subclass = KEV_INET_SUBCLASS; - + ev_msg.event_code = KEV_INET_ADDR_DELETED; + IFA_LOCK(&ia->ia_ifa); if (ia->ia_ifa.ifa_dstaddr) in_event_data.ia_dstaddr = ((struct sockaddr_in *)ia->ia_ifa.ifa_dstaddr)->sin_addr; @@ -819,6 +933,7 @@ in_control( in_event_data.ia_subnet = ia->ia_subnet; in_event_data.ia_subnetmask = ia->ia_subnetmask; in_event_data.ia_netbroadcast = ia->ia_netbroadcast; + IFA_UNLOCK(&ia->ia_ifa); strncpy(&in_event_data.link_data.if_name[0], ifp->if_name, IFNAMSIZ); in_event_data.link_data.if_family = ifp->if_family; in_event_data.link_data.if_unit = (u_int32_t) ifp->if_unit; @@ -830,10 +945,12 @@ in_control( ifa = &ia->ia_ifa; 
lck_rw_lock_exclusive(in_ifaddr_rwlock); /* Release ia_link reference */ - ifafree(ifa); + IFA_REMREF(ifa); TAILQ_REMOVE(&in_ifaddrhead, ia, ia_link); + IFA_LOCK(ifa); if (IA_IS_HASHED(ia)) in_iahash_remove(ia); + IFA_UNLOCK(ifa); lck_rw_done(in_ifaddr_rwlock); /* @@ -841,31 +958,42 @@ in_control( */ in_ifscrub(ifp, ia, 0); ifnet_lock_exclusive(ifp); + IFA_LOCK(ifa); /* if_detach_ifa() releases ifa_link reference */ if_detach_ifa(ifp, ifa); -#ifdef __APPLE__ + /* Our reference to this address is dropped at the bottom */ + IFA_UNLOCK(ifa); + /* * If the interface supports multicast, and no address is left, * remove the "all hosts" multicast group from that interface. */ - if (ifp->if_flags & IFF_MULTICAST) { - struct in_addr addr; - struct in_multi *inm = NULL; + if ((ifp->if_flags & IFF_MULTICAST) != 0 || + ifp->if_allhostsinm != NULL ) { - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) - if (ifa->ifa_addr->sa_family == AF_INET) + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family == AF_INET) { + IFA_UNLOCK(ifa); break; - - if (ifa == 0) { - addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP); - IN_LOOKUP_MULTI(addr, ifp, inm); + } + IFA_UNLOCK(ifa); } ifnet_lock_done(ifp); - if (inm) - in_delmulti(&inm); - } else + + lck_mtx_lock(&ifp->if_addrconfig_lock); + if (ifa == NULL && ifp->if_allhostsinm != NULL) { + struct in_multi *inm = ifp->if_allhostsinm; + ifp->if_allhostsinm = NULL; + + in_delmulti(inm); + /* release the reference for allhostsinm pointer */ + INM_REMREF(inm); + } + lck_mtx_unlock(&ifp->if_addrconfig_lock); + } else { ifnet_lock_done(ifp); -#endif + } /* Post the kernel event */ kev_post_msg(&ev_msg); @@ -881,7 +1009,7 @@ in_control( error = 0; /* Release reference from ifa_ifpgetprimary() */ - ifafree(ifa); + IFA_REMREF(ifa); } #if PF (void) pf_ifaddr_hook(ifp, cmd); @@ -933,7 +1061,6 @@ in_control( /* Multicast options */ if (cloned_inp->inp_moptions != NULL) { - int i; struct ip_moptions *cloned_imo = 
cloned_inp->inp_moptions; struct ip_moptions *imo = inp->inp_moptions; @@ -942,35 +1069,15 @@ in_control( * No multicast option buffer attached to the pcb; * allocate one. */ - imo = (struct ip_moptions*) - _MALLOC(sizeof(*imo), M_IPMOPTS, M_WAITOK); + imo = ip_allocmoptions(M_WAITOK); if (imo == NULL) { error2 = ENOBUFS; break; } inp->inp_moptions = imo; } - imo->imo_multicast_ifp = cloned_imo->imo_multicast_ifp; - imo->imo_multicast_vif = cloned_imo->imo_multicast_vif; - imo->imo_multicast_ttl = cloned_imo->imo_multicast_ttl; - imo->imo_multicast_loop = cloned_imo->imo_multicast_loop; - imo->imo_num_memberships = cloned_imo->imo_num_memberships; - for (i = 0; i < cloned_imo->imo_num_memberships; i++) { - imo->imo_membership[i] = - in_addmulti(&cloned_imo->imo_membership[i]->inm_addr, - cloned_imo->imo_membership[i]->inm_ifp); - if (imo->imo_membership[i] == NULL) { - error2 = ENOBUFS; - break; - } - } - if (i < cloned_imo->imo_num_memberships) { - /* Failed, perform cleanup */ - for (i--; i >= 0; i--) - in_delmulti(&imo->imo_membership[i]); - imo->imo_num_memberships = 0; - break; - } + + error2 = imo_clone(cloned_imo, imo); } } break; @@ -982,7 +1089,7 @@ in_control( } done: if (ia != NULL) { - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); } return (error); } @@ -1017,7 +1124,7 @@ in_lifaddr_ioctl( /* sanity checks */ if (!data || !ifp) { panic("invalid argument to in_lifaddr_ioctl"); - /*NOTRECHED*/ + /*NOTREACHED*/ } switch (cmd) { @@ -1112,21 +1219,30 @@ in_lifaddr_ioctl( ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; - if (!cmp) + } + if (!cmp) { + IFA_UNLOCK(ifa); break; + } candidate.s_addr = ((struct sockaddr_in *)&ifa->ifa_addr)->sin_addr.s_addr; candidate.s_addr &= mask.s_addr; + IFA_UNLOCK(ifa); if (candidate.s_addr == match.s_addr) break; } + if (ifa != NULL) + IFA_ADDREF(ifa); 
ifnet_lock_done(ifp); if (!ifa) return EADDRNOTAVAIL; ia = (struct in_ifaddr *)ifa; if (cmd == SIOCGLIFADDR) { + IFA_LOCK(ifa); /* fill in the if_laddrreq structure */ bcopy(&ia->ia_addr, &iflr->addr, ia->ia_addr.sin_len); @@ -1141,6 +1257,8 @@ in_lifaddr_ioctl( iflr->flags = 0; /*XXX*/ + IFA_UNLOCK(ifa); + IFA_REMREF(ifa); return 0; } else { struct in_aliasreq ifra; @@ -1150,6 +1268,7 @@ in_lifaddr_ioctl( bcopy(iflr->iflr_name, ifra.ifra_name, sizeof(ifra.ifra_name)); + IFA_LOCK(ifa); bcopy(&ia->ia_addr, &ifra.ifra_addr, ia->ia_addr.sin_len); if ((ifp->if_flags & IFF_POINTOPOINT) != 0) { @@ -1158,7 +1277,8 @@ in_lifaddr_ioctl( } bcopy(&ia->ia_sockmask, &ifra.ifra_dstaddr, ia->ia_sockmask.sin_len); - + IFA_UNLOCK(ifa); + IFA_REMREF(ifa); return in_control(so, SIOCDIFADDR, (caddr_t)&ifra, ifp, p); } @@ -1172,21 +1292,23 @@ in_lifaddr_ioctl( * Delete any existing route for an interface. */ void -in_ifscrub( - struct ifnet *ifp, - struct in_ifaddr *ia, - int locked) +in_ifscrub(struct ifnet *ifp, struct in_ifaddr *ia, int locked) { - - if ((ia->ia_flags & IFA_ROUTE) == 0) + IFA_LOCK(&ia->ia_ifa); + if ((ia->ia_flags & IFA_ROUTE) == 0) { + IFA_UNLOCK(&ia->ia_ifa); return; + } + IFA_UNLOCK(&ia->ia_ifa); if (!locked) lck_mtx_lock(rnh_lock); if (ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT)) rtinit_locked(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST); else rtinit_locked(&(ia->ia_ifa), (int)RTM_DELETE, 0); + IFA_LOCK(&ia->ia_ifa); ia->ia_flags &= ~IFA_ROUTE; + IFA_UNLOCK(&ia->ia_ifa); if (!locked) lck_mtx_unlock(rnh_lock); } @@ -1197,12 +1319,20 @@ in_ifscrub( static void in_iahash_remove(struct in_ifaddr *ia) { - if (!IA_IS_HASHED(ia)) - panic("attempt to remove wrong ia %p from hash table\n", ia); + lck_rw_assert(in_ifaddr_rwlock, LCK_RW_ASSERT_EXCLUSIVE); + IFA_LOCK_ASSERT_HELD(&ia->ia_ifa); + if (!IA_IS_HASHED(ia)) { + panic("attempt to remove wrong ia %p from hash table\n", ia); + /* NOTREACHED */ + } TAILQ_REMOVE(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia, ia_hash); 
IA_HASH_INIT(ia); - ifafree(&ia->ia_ifa); + if (IFA_REMREF_LOCKED(&ia->ia_ifa) == NULL) { + panic("%s: unexpected (missing) refcnt ifa=%p", __func__, + &ia->ia_ifa); + /* NOTREACHED */ + } } /* @@ -1211,13 +1341,18 @@ in_iahash_remove(struct in_ifaddr *ia) static void in_iahash_insert(struct in_ifaddr *ia) { - if (ia->ia_addr.sin_family != AF_INET) + lck_rw_assert(in_ifaddr_rwlock, LCK_RW_ASSERT_EXCLUSIVE); + IFA_LOCK_ASSERT_HELD(&ia->ia_ifa); + + if (ia->ia_addr.sin_family != AF_INET) { panic("attempt to insert wrong ia %p into hash table\n", ia); - else if (IA_IS_HASHED(ia)) + /* NOTREACHED */ + } else if (IA_IS_HASHED(ia)) { panic("attempt to double-insert ia %p into hash table\n", ia); - + /* NOTREACHED */ + } TAILQ_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia, ia_hash); - ifaref(&ia->ia_ifa); + IFA_ADDREF_LOCKED(&ia->ia_ifa); } /* @@ -1236,22 +1371,39 @@ in_iahash_insert_ptp(struct in_ifaddr *ia) struct in_ifaddr *tmp_ifa; struct ifnet *tmp_ifp; - if (ia->ia_addr.sin_family != AF_INET) + lck_rw_assert(in_ifaddr_rwlock, LCK_RW_ASSERT_EXCLUSIVE); + IFA_LOCK_ASSERT_HELD(&ia->ia_ifa); + + if (ia->ia_addr.sin_family != AF_INET) { panic("attempt to insert wrong ia %p into hash table\n", ia); - else if (IA_IS_HASHED(ia)) + /* NOTREACHED */ + } else if (IA_IS_HASHED(ia)) { panic("attempt to double-insert ia %p into hash table\n", ia); - - TAILQ_FOREACH(tmp_ifa, INADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia_hash) - if (IA_SIN(tmp_ifa)->sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr) + /* NOTREACHED */ + } + IFA_UNLOCK(&ia->ia_ifa); + TAILQ_FOREACH(tmp_ifa, INADDR_HASH(ia->ia_addr.sin_addr.s_addr), + ia_hash) { + IFA_LOCK(&tmp_ifa->ia_ifa); + /* ia->ia_addr won't change, so check without lock */ + if (IA_SIN(tmp_ifa)->sin_addr.s_addr == + ia->ia_addr.sin_addr.s_addr) { + IFA_UNLOCK(&tmp_ifa->ia_ifa); break; + } + IFA_UNLOCK(&tmp_ifa->ia_ifa); + } tmp_ifp = (tmp_ifa == NULL) ? 
NULL : tmp_ifa->ia_ifp; - if (tmp_ifp == NULL) - TAILQ_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia, ia_hash); - else - TAILQ_INSERT_TAIL(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia, ia_hash); - - ifaref(&ia->ia_ifa); + IFA_LOCK(&ia->ia_ifa); + if (tmp_ifp == NULL) { + TAILQ_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), + ia, ia_hash); + } else { + TAILQ_INSERT_TAIL(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), + ia, ia_hash); + } + IFA_ADDREF_LOCKED(&ia->ia_ifa); } /* @@ -1273,9 +1425,10 @@ in_ifinit( int oldremoved = 0; /* Take an extra reference for this routine */ - ifaref(&ia->ia_ifa); + IFA_ADDREF(&ia->ia_ifa); lck_rw_lock_exclusive(in_ifaddr_rwlock); + IFA_LOCK(&ia->ia_ifa); oldaddr = ia->ia_addr; if (IA_IS_HASHED(ia)) { oldremoved = 1; @@ -1285,8 +1438,9 @@ in_ifinit( ia->ia_addr.sin_len = sizeof (*sin); if ((ifp->if_flags & IFF_POINTOPOINT)) in_iahash_insert_ptp(ia); - else + else in_iahash_insert(ia); + IFA_UNLOCK(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); /* @@ -1315,10 +1469,11 @@ in_ifinit( } /* Release reference from ifa_ifpgetprimary() */ - ifafree(ifa0); + IFA_REMREF(ifa0); if (error) { lck_rw_lock_exclusive(in_ifaddr_rwlock); + IFA_LOCK(&ia->ia_ifa); if (IA_IS_HASHED(ia)) in_iahash_remove(ia); ia->ia_addr = oldaddr; @@ -1328,17 +1483,27 @@ in_ifinit( else in_iahash_insert(ia); } + IFA_UNLOCK(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); /* Release extra reference taken above */ - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); return (error); } lck_mtx_lock(rnh_lock); + IFA_LOCK(&ia->ia_ifa); + /* + * Address has been initialized by the link resolver (ARP) + * via ifnet_ioctl() above; it may now generate route(s). 
+ */ + ia->ia_ifa.ifa_debug &= ~IFD_NOTREADY; if (scrub) { ia->ia_ifa.ifa_addr = (struct sockaddr *)&oldaddr; + IFA_UNLOCK(&ia->ia_ifa); in_ifscrub(ifp, ia, 1); + IFA_LOCK(&ia->ia_ifa); ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr; } + IFA_LOCK_ASSERT_HELD(&ia->ia_ifa); if (IN_CLASSA(i)) ia->ia_netmask = IN_CLASSA_NET; else if (IN_CLASSB(i)) @@ -1372,16 +1537,21 @@ in_ifinit( flags |= RTF_HOST; } else if (ifp->if_flags & IFF_POINTOPOINT) { if (ia->ia_dstaddr.sin_family != AF_INET) { + IFA_UNLOCK(&ia->ia_ifa); lck_mtx_unlock(rnh_lock); /* Release extra reference taken above */ - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); return (0); } ia->ia_dstaddr.sin_len = sizeof (*sin); flags |= RTF_HOST; } - if ((error = rtinit_locked(&(ia->ia_ifa), (int)RTM_ADD, flags)) == 0) + IFA_UNLOCK(&ia->ia_ifa); + if ((error = rtinit_locked(&(ia->ia_ifa), (int)RTM_ADD, flags)) == 0) { + IFA_LOCK(&ia->ia_ifa); ia->ia_flags |= IFA_ROUTE; + IFA_UNLOCK(&ia->ia_ifa); + } lck_mtx_unlock(rnh_lock); /* XXX check if the subnet route points to the same interface */ @@ -1393,19 +1563,29 @@ in_ifinit( * multicast group on that interface. 
*/ if (ifp->if_flags & IFF_MULTICAST) { - struct in_multi *inm; struct in_addr addr; + lck_mtx_lock(&ifp->if_addrconfig_lock); addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP); - ifnet_lock_shared(ifp); - IN_LOOKUP_MULTI(addr, ifp, inm); - ifnet_lock_done(ifp); - if (inm == 0) - in_addmulti(&addr, ifp); + if (ifp->if_allhostsinm == NULL) { + struct in_multi *inm; + inm = in_addmulti(&addr, ifp); + + if (inm != NULL) { + /* keep the reference on inm added by + * in_addmulti above for storing the + * pointer in allhostsinm + */ + ifp->if_allhostsinm = inm; + } else { + printf("Failed to add membership to all-hosts multicast address on interface %s%d\n", ifp->if_name, ifp->if_unit); + } + } + lck_mtx_unlock(&ifp->if_addrconfig_lock); } /* Release extra reference taken above */ - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); return (error); } @@ -1414,18 +1594,15 @@ in_ifinit( * Return 1 if the address might be a local broadcast address. */ int -in_broadcast( - struct in_addr in, - struct ifnet *ifp) +in_broadcast(struct in_addr in, struct ifnet *ifp) { struct ifaddr *ifa; u_int32_t t; - if (in.s_addr == INADDR_BROADCAST || - in.s_addr == INADDR_ANY) - return 1; + if (in.s_addr == INADDR_BROADCAST || in.s_addr == INADDR_ANY) + return (1); if ((ifp->if_flags & IFF_BROADCAST) == 0) - return 0; + return (0); t = ntohl(in.s_addr); /* * Look through the list of addresses for a match @@ -1434,10 +1611,7 @@ in_broadcast( #define ia ((struct in_ifaddr *)ifa) ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - if (ifa->ifa_addr == NULL) { - ifnet_lock_done(ifp); - return (0); - } + IFA_LOCK(ifa); if (ifa->ifa_addr->sa_family == AF_INET && (in.s_addr == ia->ia_broadaddr.sin_addr.s_addr || in.s_addr == ia->ia_netbroadcast.s_addr || @@ -1451,140 +1625,76 @@ in_broadcast( * address. 
*/ ia->ia_subnetmask != (u_int32_t)0xffffffff) { + IFA_UNLOCK(ifa); ifnet_lock_done(ifp); - return 1; + return (1); } + IFA_UNLOCK(ifa); } ifnet_lock_done(ifp); return (0); #undef ia } -static void -in_free_inm( - void* ifma_protospec) -{ - struct in_multi *inm = ifma_protospec; - - /* - * No remaining claims to this record; let IGMP know that - * we are leaving the multicast group. - */ - igmp_leavegroup(inm); - lck_mtx_lock(rnh_lock); - LIST_REMOVE(inm, inm_link); - lck_mtx_unlock(rnh_lock); - FREE(inm, M_IPMADDR); -} - -/* - * Add an address to the list of IP multicast addresses for a given interface. - */ -struct in_multi * -in_addmulti( - struct in_addr *ap, - struct ifnet *ifp) +void +in_purgeaddrs(struct ifnet *ifp) { - struct in_multi *inm; - int error; - struct sockaddr_in sin; - struct ifmultiaddr *ifma; + struct ifaddr **ifap; + int err, i; /* - * Call generic routine to add membership or increment - * refcount. It wants addresses in the form of a sockaddr, - * so we build one here (being careful to zero the unused bytes). + * Be nice, and try the civilized way first. If we can't get + * rid of them this way, then do it the rough way. We must + * only get here during detach time, after the ifnet has been + * removed from the global list and arrays. */ - bzero(&sin, sizeof sin); - sin.sin_family = AF_INET; - sin.sin_len = sizeof sin; - sin.sin_addr = *ap; - error = if_addmulti(ifp, (struct sockaddr *)&sin, &ifma); - if (error) { - return 0; - } - - /* - * If ifma->ifma_protospec is null, then if_addmulti() created - * a new record. Otherwise, we are done. 
- */ - if (ifma->ifma_protospec != 0) { - return ifma->ifma_protospec; - } - - inm = (struct in_multi *) _MALLOC(sizeof(*inm), M_IPMADDR, M_WAITOK); - if (inm == NULL) { - return (NULL); - } - - bzero(inm, sizeof *inm); - inm->inm_addr = *ap; - inm->inm_ifp = ifp; - inm->inm_ifma = ifma; - lck_mtx_lock(rnh_lock); - if (ifma->ifma_protospec == NULL) { - ifma->ifma_protospec = inm; - ifma->ifma_free = in_free_inm; - LIST_INSERT_HEAD(&in_multihead, inm, inm_link); - } - lck_mtx_unlock(rnh_lock); - - if (ifma->ifma_protospec != inm) { - _FREE(inm, M_IPMADDR); - return ifma->ifma_protospec; - } - - /* - * Let IGMP know that we have joined a new IP multicast group. - */ - error = igmp_joingroup(inm); - if (error) { - char addrbuf[16]; - - /* - * We can't free the inm because someone else may already be - * using it. Once we put it in to ifma->ifma_protospec, it - * must exist as long as the ifma does. Might be nice to flag - * the error so we can try igmp_joingroup the next time through. - */ - log(LOG_ERR, "igmp_joingroup error %d joining multicast %s on %s%d\n", - error, inet_ntop(AF_INET, &sin.sin_addr, addrbuf, sizeof(addrbuf)), - ifp->if_name, ifp->if_unit); - } - - return (inm); -} - -/* - * Delete a multicast address record. 
- */ -void -in_delmulti( - struct in_multi **inm) -{ - struct in_multi *inm2; - - lck_mtx_lock(rnh_lock); - LIST_FOREACH(inm2, &in_multihead, inm_link) { - if (inm2 == *inm) - break; - } - if (inm2 != *inm) { - lck_mtx_unlock(rnh_lock); - printf("in_delmulti - ignoring invalid inm (%p)\n", *inm); - return; - } - lck_mtx_unlock(rnh_lock); - - /* We intentionally do this a bit differently than BSD */ - if ((*inm)->inm_ifma) { - if_delmultiaddr((*inm)->inm_ifma, 0); - ifma_release((*inm)->inm_ifma); + err = ifnet_get_address_list_family_internal(ifp, &ifap, AF_INET, 1, + M_WAITOK); + if (err == 0 && ifap != NULL) { + for (i = 0; ifap[i] != NULL; i++) { + struct ifaliasreq ifr; + struct ifaddr *ifa; + + ifa = ifap[i]; + bzero(&ifr, sizeof (ifr)); + IFA_LOCK(ifa); + ifr.ifra_addr = *ifa->ifa_addr; + if (ifa->ifa_dstaddr != NULL) + ifr.ifra_broadaddr = *ifa->ifa_dstaddr; + IFA_UNLOCK(ifa); + err = in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, + kernproc); + /* if we lost the race, ignore it */ + if (err == EADDRNOTAVAIL) + err = 0; + if (err != 0) { + char s_addr[MAX_IPv4_STR_LEN]; + char s_dstaddr[MAX_IPv4_STR_LEN]; + struct in_addr *s, *d; + + IFA_LOCK(ifa); + s = &((struct sockaddr_in *) + ifa->ifa_addr)->sin_addr; + d = &((struct sockaddr_in *) + ifa->ifa_dstaddr)->sin_addr; + (void) inet_ntop(AF_INET, &s->s_addr, s_addr, + sizeof (s_addr)); + (void) inet_ntop(AF_INET, &d->s_addr, s_dstaddr, + sizeof (s_dstaddr)); + IFA_UNLOCK(ifa); + + printf("%s: SIOCDIFADDR ifp=%p ifa_addr=%s " + "ifa_dstaddr=%s (err=%d)\n", __func__, ifp, + s_addr, s_dstaddr, err); + } + } + ifnet_free_address_list(ifap); + } else if (err != 0 && err != ENXIO) { + printf("%s: error retrieving list of AF_INET addresses for " + "ifp=%p (err=%d)\n", __func__, ifp, err); } - *inm = NULL; } -#if !NFSCLIENT int inet_aton(char *cp, struct in_addr *pin); int inet_aton(char * cp, struct in_addr * pin) @@ -1605,7 +1715,19 @@ inet_aton(char * cp, struct in_addr * pin) } return (TRUE); } -#endif + +int 
inet_ntoa2(struct in_addr * pin, char * cp, const int len); +int inet_ntoa2(struct in_addr * pin, char * cp, const int len) +{ + int ret; + + /* address is in network byte order */ + ret = snprintf(cp, len, "%u.%u.%u.%u", pin->s_addr & 0xFF, + (pin->s_addr >> 8) & 0xFF, (pin->s_addr >> 16) & 0xFF, + (pin->s_addr >> 24) & 0xFF); + + return ret < len ? TRUE : FALSE; +} /* * Called as part of ip_init @@ -1613,6 +1735,8 @@ inet_aton(char * cp, struct in_addr * pin) void in_ifaddr_init(void) { + in_multi_init(); + PE_parse_boot_argn("ifa_debug", &inifa_debug, sizeof (inifa_debug)); inifa_size = (inifa_debug == 0) ? sizeof (struct in_ifaddr) : @@ -1620,10 +1744,15 @@ in_ifaddr_init(void) inifa_zone = zinit(inifa_size, INIFA_ZONE_MAX * inifa_size, 0, INIFA_ZONE_NAME); - if (inifa_zone == NULL) + if (inifa_zone == NULL) { panic("%s: failed allocating %s", __func__, INIFA_ZONE_NAME); - + /* NOTREACHED */ + } zone_change(inifa_zone, Z_EXPAND, TRUE); + zone_change(inifa_zone, Z_CALLERACCT, FALSE); + + lck_mtx_init(&inifa_trash_lock, ifa_mtx_grp, ifa_mtx_attr); + TAILQ_INIT(&inifa_trash_head); } static struct in_ifaddr * @@ -1637,11 +1766,14 @@ in_ifaddr_alloc(int how) bzero(inifa, inifa_size); inifa->ia_ifa.ifa_free = in_ifaddr_free; inifa->ia_ifa.ifa_debug |= IFD_ALLOC; + ifa_lock_init(&inifa->ia_ifa); if (inifa_debug != 0) { struct in_ifaddr_dbg *inifa_dbg = (struct in_ifaddr_dbg *)inifa; inifa->ia_ifa.ifa_debug |= IFD_DEBUG; inifa->ia_ifa.ifa_trace = in_ifaddr_trace; + inifa->ia_ifa.ifa_attached = in_ifaddr_attached; + inifa->ia_ifa.ifa_detached = in_ifaddr_detached; ctrace_record(&inifa_dbg->inifa_alloc); } } @@ -1651,21 +1783,79 @@ in_ifaddr_alloc(int how) static void in_ifaddr_free(struct ifaddr *ifa) { - if (ifa->ifa_refcnt != 0) + IFA_LOCK_ASSERT_HELD(ifa); + + if (ifa->ifa_refcnt != 0) { panic("%s: ifa %p bad ref cnt", __func__, ifa); - if (!(ifa->ifa_debug & IFD_ALLOC)) + /* NOTREACHED */ + } if (!(ifa->ifa_debug & IFD_ALLOC)) { panic("%s: ifa %p cannot be freed", 
__func__, ifa); - + /* NOTREACHED */ + } if (ifa->ifa_debug & IFD_DEBUG) { struct in_ifaddr_dbg *inifa_dbg = (struct in_ifaddr_dbg *)ifa; ctrace_record(&inifa_dbg->inifa_free); bcopy(&inifa_dbg->inifa, &inifa_dbg->inifa_old, sizeof (struct in_ifaddr)); + if (ifa->ifa_debug & IFD_TRASHED) { + /* Become a regular mutex, just in case */ + IFA_CONVERT_LOCK(ifa); + lck_mtx_lock(&inifa_trash_lock); + TAILQ_REMOVE(&inifa_trash_head, inifa_dbg, + inifa_trash_link); + lck_mtx_unlock(&inifa_trash_lock); + ifa->ifa_debug &= ~IFD_TRASHED; + } } + IFA_UNLOCK(ifa); + ifa_lock_destroy(ifa); bzero(ifa, sizeof (struct in_ifaddr)); zfree(inifa_zone, ifa); } +static void +in_ifaddr_attached(struct ifaddr *ifa) +{ + struct in_ifaddr_dbg *inifa_dbg = (struct in_ifaddr_dbg *)ifa; + + IFA_LOCK_ASSERT_HELD(ifa); + + if (!(ifa->ifa_debug & IFD_DEBUG)) { + panic("%s: ifa %p has no debug structure", __func__, ifa); + /* NOTREACHED */ + } + if (ifa->ifa_debug & IFD_TRASHED) { + /* Become a regular mutex, just in case */ + IFA_CONVERT_LOCK(ifa); + lck_mtx_lock(&inifa_trash_lock); + TAILQ_REMOVE(&inifa_trash_head, inifa_dbg, inifa_trash_link); + lck_mtx_unlock(&inifa_trash_lock); + ifa->ifa_debug &= ~IFD_TRASHED; + } +} + +static void +in_ifaddr_detached(struct ifaddr *ifa) +{ + struct in_ifaddr_dbg *inifa_dbg = (struct in_ifaddr_dbg *)ifa; + + IFA_LOCK_ASSERT_HELD(ifa); + + if (!(ifa->ifa_debug & IFD_DEBUG)) { + panic("%s: ifa %p has no debug structure", __func__, ifa); + /* NOTREACHED */ + } else if (ifa->ifa_debug & IFD_TRASHED) { + panic("%s: ifa %p is already in trash list", __func__, ifa); + /* NOTREACHED */ + } + ifa->ifa_debug |= IFD_TRASHED; + /* Become a regular mutex, just in case */ + IFA_CONVERT_LOCK(ifa); + lck_mtx_lock(&inifa_trash_lock); + TAILQ_INSERT_TAIL(&inifa_trash_head, inifa_dbg, inifa_trash_link); + lck_mtx_unlock(&inifa_trash_lock); +} + static void in_ifaddr_trace(struct ifaddr *ifa, int refhold) { @@ -1674,9 +1864,10 @@ in_ifaddr_trace(struct ifaddr *ifa, int refhold) 
u_int32_t idx; u_int16_t *cnt; - if (!(ifa->ifa_debug & IFD_DEBUG)) + if (!(ifa->ifa_debug & IFD_DEBUG)) { panic("%s: ifa %p has no debug structure", __func__, ifa); - + /* NOTREACHED */ + } if (refhold) { cnt = &inifa_dbg->inifa_refhold_cnt; tr = inifa_dbg->inifa_refhold; @@ -1685,6 +1876,6 @@ in_ifaddr_trace(struct ifaddr *ifa, int refhold) tr = inifa_dbg->inifa_refrele; } - idx = OSAddAtomic16(1, (volatile SInt16 *)cnt) % CTRACE_HIST_SIZE; + idx = atomic_add_16_ov(cnt, 1) % INIFA_TRACE_HIST_SIZE; ctrace_record(&tr[idx]); } diff --git a/bsd/netinet/in.h b/bsd/netinet/in.h index fc38f8401..4e66c26c7 100644 --- a/bsd/netinet/in.h +++ b/bsd/netinet/in.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -67,6 +67,10 @@ #include #include /* uint(8|16|32)_t */ +#ifndef KERNEL +#include +#endif + #ifndef _IN_ADDR_T #define _IN_ADDR_T typedef __uint32_t in_addr_t; /* base type for internet address */ @@ -207,10 +211,11 @@ typedef __uint16_t in_port_t; #define IPPROTO_ENCAP 98 /* encapsulation header */ #define IPPROTO_APES 99 /* any private encr. 
scheme */ #define IPPROTO_GMTP 100 /* GMTP*/ -#define IPPROTO_IPCOMP 108 /* payload compression (IPComp) */ /* 101-254: Partly Unassigned */ #define IPPROTO_PIM 103 /* Protocol Independent Mcast */ +#define IPPROTO_IPCOMP 108 /* payload compression (IPComp) */ #define IPPROTO_PGM 113 /* PGM */ +#define IPPROTO_SCTP 132 /* SCTP */ /* 255: Reserved */ /* BSD Private, local use, namespace incursion */ #define IPPROTO_DIVERT 254 /* divert pseudo-protocol */ @@ -341,6 +346,7 @@ struct in_addr { #define IN_BADCLASS(i) (((u_int32_t)(i) & 0xf0000000) == 0xf0000000) #define INADDR_LOOPBACK (u_int32_t)0x7f000001 + #ifndef KERNEL #define INADDR_NONE 0xffffffff /* -1 return */ #endif @@ -348,11 +354,25 @@ struct in_addr { #define INADDR_UNSPEC_GROUP (u_int32_t)0xe0000000 /* 224.0.0.0 */ #define INADDR_ALLHOSTS_GROUP (u_int32_t)0xe0000001 /* 224.0.0.1 */ #define INADDR_ALLRTRS_GROUP (u_int32_t)0xe0000002 /* 224.0.0.2 */ +#define INADDR_ALLRPTS_GROUP (u_int32_t)0xe0000016 /* 224.0.0.22, IGMPv3 */ +#define INADDR_CARP_GROUP (u_int32_t)0xe0000012 /* 224.0.0.18 */ +#define INADDR_PFSYNC_GROUP (u_int32_t)0xe00000f0 /* 224.0.0.240 */ +#define INADDR_ALLMDNS_GROUP (u_int32_t)0xe00000fb /* 224.0.0.251 */ #define INADDR_MAX_LOCAL_GROUP (u_int32_t)0xe00000ff /* 224.0.0.255 */ #ifdef __APPLE__ #define IN_LINKLOCALNETNUM (u_int32_t)0xA9FE0000 /* 169.254.0.0 */ #define IN_LINKLOCAL(i) (((u_int32_t)(i) & IN_CLASSB_NET) == IN_LINKLOCALNETNUM) +#define IN_LOOPBACK(i) (((u_int32_t)(i) & 0xff000000) == 0x7f000000) +#define IN_ZERONET(i) (((u_int32_t)(i) & 0xff000000) == 0) + +#define IN_PRIVATE(i) ((((u_int32_t)(i) & 0xff000000) == 0x0a000000) || \ + (((u_int32_t)(i) & 0xfff00000) == 0xac100000) || \ + (((u_int32_t)(i) & 0xffff0000) == 0xc0a80000)) + +#define IN_LOCAL_GROUP(i) (((u_int32_t)(i) & 0xffffff00) == 0xe0000000) + +#define IN_ANY_LOCAL(i) (IN_LINKLOCAL(i) || IN_LOCAL_GROUP(i)) #endif #define IN_LOOPBACKNET 127 /* official! 
*/ @@ -415,7 +435,9 @@ struct ip_opts { #define IP_STRIPHDR 23 /* bool: drop receive of raw IP header */ #endif #define IP_RECVTTL 24 /* bool; receive reception TTL w/dgram */ -#define IP_BOUND_IF 25 /* set/get bound interface */ +#define IP_BOUND_IF 25 /* int; set/get bound interface */ +#define IP_PKTINFO 26 /* get pktinfo on recv socket, set src on sent dgram */ +#define IP_RECVPKTINFO IP_PKTINFO /* receive pktinfo w/dgram */ #define IP_FW_ADD 40 /* add a firewall rule to chain */ @@ -440,24 +462,53 @@ struct ip_opts { #define IP_DUMMYNET_GET 64 /* get entire dummynet pipes */ #define IP_TRAFFIC_MGT_BACKGROUND 65 /* int*; get background IO flags; set background IO */ +#define IP_MULTICAST_IFINDEX 66 /* int*; set/get IP multicast i/f index */ + +/* IPv4 Source Filter Multicast API [RFC3678] */ +#define IP_ADD_SOURCE_MEMBERSHIP 70 /* join a source-specific group */ +#define IP_DROP_SOURCE_MEMBERSHIP 71 /* drop a single source */ +#define IP_BLOCK_SOURCE 72 /* block a source */ +#define IP_UNBLOCK_SOURCE 73 /* unblock a source */ + +/* The following option is private; do not use it from user applications. 
*/ +#define IP_MSFILTER 74 /* set/get filter list */ + +/* Protocol Independent Multicast API [RFC3678] */ +#define MCAST_JOIN_GROUP 80 /* join an any-source group */ +#define MCAST_LEAVE_GROUP 81 /* leave all sources for group */ +#define MCAST_JOIN_SOURCE_GROUP 82 /* join a source-specific group */ +#define MCAST_LEAVE_SOURCE_GROUP 83 /* leave a single source */ +#define MCAST_BLOCK_SOURCE 84 /* block a source */ +#define MCAST_UNBLOCK_SOURCE 85 /* unblock a source */ #ifdef PRIVATE #define IP_FORCE_OUT_IFP 69 /* deprecated; use IP_BOUND_IF instead */ -#endif - -/* Background socket configuration flags */ -#ifdef __APPLE_API_UNSTABLE -#define TRAFFIC_MGT_SO_BACKGROUND 0x0001 /* background socket */ -#define TRAFFIC_MGT_SO_BG_SUPPRESSED 0x0002 /* currently throttled */ -#define TRAFFIC_MGT_SO_BG_REGULATE 0x0004 /* traffic is regulated */ -#endif /* __APPLE_API_UNSTABLE */ +#define IP_NO_IFT_CELLULAR 6969 /* for internal use only */ +#define IP_NO_IFT_PDP IP_NO_IFT_CELLULAR /* deprecated */ +#define IP_OUT_IF 9696 /* for internal use only */ +#endif /* PRIVATE */ /* * Defaults and limits for options */ #define IP_DEFAULT_MULTICAST_TTL 1 /* normally limit m'casts to 1 hop */ #define IP_DEFAULT_MULTICAST_LOOP 1 /* normally hear sends if a member */ -#define IP_MAX_MEMBERSHIPS 20 /* per socket */ + +/* + * The imo_membership vector for each socket is now dynamically allocated at + * run-time, bounded by USHRT_MAX, and is reallocated when needed, sized + * according to a power-of-two increment. + */ +#define IP_MIN_MEMBERSHIPS 31 +#define IP_MAX_MEMBERSHIPS 4095 + +/* + * Default resource limits for IPv4 multicast source filtering. + * These may be modified by sysctl. + */ +#define IP_MAX_GROUP_SRC_FILTER 512 /* sources per group */ +#define IP_MAX_SOCK_SRC_FILTER 128 /* sources per socket/group */ +#define IP_MAX_SOCK_MUTE_FILTER 128 /* XXX no longer used */ /* * Argument structure for IP_ADD_MEMBERSHIP and IP_DROP_MEMBERSHIP. 
@@ -467,6 +518,105 @@ struct ip_mreq { struct in_addr imr_interface; /* local IP address of interface */ }; +/* + * Modified argument structure for IP_MULTICAST_IF, obtained from Linux. + * This is used to specify an interface index for multicast sends, as + * the IPv4 legacy APIs do not support this (unless IP_SENDIF is available). + */ +struct ip_mreqn { + struct in_addr imr_multiaddr; /* IP multicast address of group */ + struct in_addr imr_address; /* local IP address of interface */ + int imr_ifindex; /* Interface index; cast to uint32_t */ +}; + +#pragma pack(4) +/* + * Argument structure for IPv4 Multicast Source Filter APIs. [RFC3678] + */ +struct ip_mreq_source { + struct in_addr imr_multiaddr; /* IP multicast address of group */ + struct in_addr imr_sourceaddr; /* IP address of source */ + struct in_addr imr_interface; /* local IP address of interface */ +}; + +/* + * Argument structures for Protocol-Independent Multicast Source + * Filter APIs. [RFC3678] + */ +struct group_req { + uint32_t gr_interface; /* interface index */ + struct sockaddr_storage gr_group; /* group address */ +}; + +struct group_source_req { + uint32_t gsr_interface; /* interface index */ + struct sockaddr_storage gsr_group; /* group address */ + struct sockaddr_storage gsr_source; /* source address */ +}; + +#ifndef __MSFILTERREQ_DEFINED +#define __MSFILTERREQ_DEFINED +/* + * The following structure is private; do not use it from user applications. + * It is used to communicate IP_MSFILTER/IPV6_MSFILTER information between + * the RFC 3678 libc functions and the kernel. 
+ */ +struct __msfilterreq { + uint32_t msfr_ifindex; /* interface index */ + uint32_t msfr_fmode; /* filter mode for group */ + uint32_t msfr_nsrcs; /* # of sources in msfr_srcs */ + uint32_t __msfr_align; + struct sockaddr_storage msfr_group; /* group address */ + struct sockaddr_storage *msfr_srcs; +}; + +#ifdef XNU_KERNEL_PRIVATE +struct __msfilterreq32 { + uint32_t msfr_ifindex; /* interface index */ + uint32_t msfr_fmode; /* filter mode for group */ + uint32_t msfr_nsrcs; /* # of sources in msfr_srcs */ + uint32_t __msfr_align; + struct sockaddr_storage msfr_group; /* group address */ + user32_addr_t msfr_srcs; +}; + +struct __msfilterreq64 { + uint32_t msfr_ifindex; /* interface index */ + uint32_t msfr_fmode; /* filter mode for group */ + uint32_t msfr_nsrcs; /* # of sources in msfr_srcs */ + uint32_t __msfr_align; + struct sockaddr_storage msfr_group; /* group address */ + user64_addr_t msfr_srcs; +}; +#endif /* XNU_KERNEL_PRIVATE */ +#endif /* __MSFILTERREQ_DEFINED */ + +#pragma pack() +struct sockaddr; + +#ifndef KERNEL +/* + * Advanced (Full-state) APIs [RFC3678] + * The RFC specifies uint_t for the 6th argument to [sg]etsourcefilter(). + * We use uint32_t here to be consistent. + */ +int setipv4sourcefilter(int, struct in_addr, struct in_addr, uint32_t, + uint32_t, struct in_addr *) __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_NA); +int getipv4sourcefilter(int, struct in_addr, struct in_addr, uint32_t *, + uint32_t *, struct in_addr *) __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_NA); +int setsourcefilter(int, uint32_t, struct sockaddr *, socklen_t, + uint32_t, uint32_t, struct sockaddr_storage *) __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_NA); +int getsourcefilter(int, uint32_t, struct sockaddr *, socklen_t, + uint32_t *, uint32_t *, struct sockaddr_storage *) __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_NA); +#endif + +/* + * Filter modes; also used to represent per-socket filter mode internally. 
+ */ +#define MCAST_UNDEFINED 0 /* fmode: not yet defined */ +#define MCAST_INCLUDE 1 /* fmode: include these source(s) */ +#define MCAST_EXCLUDE 2 /* fmode: exclude these source(s) */ + /* * Argument for IP_PORTRANGE: * - which range to search when port is unspecified at bind() or connect() @@ -476,6 +626,31 @@ struct ip_mreq { #define IP_PORTRANGE_LOW 2 /* "low" - vouchsafe security */ +/* + * IP_PKTINFO: Packet information (equivalent to RFC2292 sec 5 for IPv4) + * This structure is used for + * + * 1) Receiving ancilliary data about the datagram if IP_PKTINFO sockopt is + * set on the socket. In this case ipi_ifindex will contain the interface + * index the datagram was received on, ipi_addr is the IP address the + * datagram was received to. + * + * 2) Sending a datagram using a specific interface or IP source address. + * if ipi_ifindex is set to non-zero when in_pktinfo is passed as + * ancilliary data of type IP_PKTINFO, this will be used as the source + * interface to send the datagram from. If ipi_ifindex is null, ip_spec_dst + * will be used for the source address. + * + * Note: if IP_BOUND_IF is set on the socket, ipi_ifindex in the ancillary + * IP_PKTINFO option silently overrides the bound interface when it is + * specified during send time. + */ +struct in_pktinfo { + unsigned int ipi_ifindex; /* send/recv interface index */ + struct in_addr ipi_spec_dst; /* Local address */ + struct in_addr ipi_addr; /* IP Header dst address */ +}; + /* * Definitions for inet sysctl operations. 
* @@ -616,6 +791,11 @@ extern int in_localaddr(struct in_addr); extern u_int32_t in_netof(struct in_addr); extern int inaddr_local(struct in_addr); + +#define in_hosteq(s, t) ((s).s_addr == (t).s_addr) +#define in_nullhost(x) ((x).s_addr == INADDR_ANY) +#define in_allhosts(x) ((x).s_addr == htonl(INADDR_ALLHOSTS_GROUP)) + #endif /* KERNEL_PRIVATE */ #define MAX_IPv4_STR_LEN 16 #define MAX_IPv6_STR_LEN 64 diff --git a/bsd/netinet/in_arp.c b/bsd/netinet/in_arp.c index 886528306..8a4dfcd14 100644 --- a/bsd/netinet/in_arp.c +++ b/bsd/netinet/in_arp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2009 Apple Inc. All rights reserved. + * Copyright (c) 2004-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -70,11 +70,14 @@ #include #include #include +#include +#include #include #include #include #include #include +#include #include #include #include @@ -83,7 +86,6 @@ #define SA(p) ((struct sockaddr *)(p)) #define SIN(s) ((struct sockaddr_in *)s) #define CONST_LLADDR(s) ((const u_char*)((s)->sdl_data + (s)->sdl_nlen)) -#define rt_expire rt_rmx.rmx_expire #define equal(a1, a2) (bcmp((caddr_t)(a1), (caddr_t)(a2), (a1)->sa_len) == 0) static const size_t MAX_HW_LEN = 10; @@ -100,16 +102,26 @@ static int arpt_down = 20; /* once declared down, don't send for 20 sec */ int apple_hwcksum_tx = 1; int apple_hwcksum_rx = 1; -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, prune_intvl, CTLFLAG_RW, - &arpt_prune, 0, ""); -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_RW, - &arpt_keep, 0, ""); -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, host_down_time, CTLFLAG_RW, - &arpt_down, 0, ""); -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, apple_hwcksum_tx, CTLFLAG_RW, - &apple_hwcksum_tx, 0, ""); -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, apple_hwcksum_rx, CTLFLAG_RW, - &apple_hwcksum_rx, 0, ""); +static int arp_llreach_base = (LL_BASE_REACHABLE / 1000); /* seconds */ + +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, prune_intvl, + CTLFLAG_RW | 
CTLFLAG_LOCKED, &arpt_prune, 0, ""); + +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, + CTLFLAG_RW | CTLFLAG_LOCKED, &arpt_keep, 0, ""); + +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, host_down_time, + CTLFLAG_RW | CTLFLAG_LOCKED, &arpt_down, 0, ""); + +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, apple_hwcksum_tx, + CTLFLAG_RW | CTLFLAG_LOCKED, &apple_hwcksum_tx, 0, ""); + +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, apple_hwcksum_rx, + CTLFLAG_RW | CTLFLAG_LOCKED, &apple_hwcksum_rx, 0, ""); + +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, arp_llreach_base, + CTLFLAG_RW | CTLFLAG_LOCKED, &arp_llreach_base, LL_BASE_REACHABLE, + "default ARP link-layer reachability max lifetime (in seconds)"); struct llinfo_arp { /* @@ -121,7 +133,10 @@ struct llinfo_arp { * The following are protected by rt_lock */ struct mbuf *la_hold; /* last packet until resolved/timeout */ - int32_t la_asked; /* last time we QUERIED for this addr */ + struct if_llreach *la_llreach; /* link-layer reachability record */ + u_int64_t la_lastused; /* last used timestamp */ + u_int32_t la_asked; /* # of requests sent */ + u_int32_t la_persist; /* expirable, but stays around */ }; /* @@ -140,7 +155,7 @@ struct llinfo_arp { * * - Routing lock (rnh_lock) * - * la_hold, la_asked + * la_hold, la_asked, la_llreach, la_lastused * * - Routing entry lock (rt_lock) * @@ -153,33 +168,36 @@ static LIST_HEAD(, llinfo_arp) llinfo_arp; static int arp_inuse, arp_allocated; -static int arp_maxtries = 5; +static u_int32_t arp_maxtries = 5; static int useloopback = 1; /* use loopback interface for local traffic */ static int arp_proxyall = 0; static int arp_sendllconflict = 0; -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_RW, +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_RW | CTLFLAG_LOCKED, &arp_maxtries, 0, ""); -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, useloopback, CTLFLAG_RW, +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, useloopback, CTLFLAG_RW | CTLFLAG_LOCKED, &useloopback, 0, ""); 
-SYSCTL_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_RW, +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_RW | CTLFLAG_LOCKED, &arp_proxyall, 0, ""); -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, sendllconflict, CTLFLAG_RW, +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, sendllconflict, CTLFLAG_RW | CTLFLAG_LOCKED, &arp_sendllconflict, 0, ""); -static int log_arp_warnings = 0; +static int log_arp_warnings = 0; /* Thread safe: no accumulated state */ -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_warnings, CTLFLAG_RW, +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_warnings, + CTLFLAG_RW | CTLFLAG_LOCKED, &log_arp_warnings, 0, "log arp warning messages"); -static int keep_announcements = 1; -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, keep_announcements, CTLFLAG_RW, +static int keep_announcements = 1; /* Thread safe: no aging of state */ +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, keep_announcements, + CTLFLAG_RW | CTLFLAG_LOCKED, &keep_announcements, 0, "keep arp announcements"); -static int send_conflicting_probes = 1; -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, send_conflicting_probes, CTLFLAG_RW, +static int send_conflicting_probes = 1; /* Thread safe: no accumulated state */ +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, send_conflicting_probes, + CTLFLAG_RW | CTLFLAG_LOCKED, &send_conflicting_probes, 0, "send conflicting link-local arp probes"); @@ -188,6 +206,13 @@ static errno_t arp_lookup_route(const struct in_addr *, int, static void arptimer(void *); static struct llinfo_arp *arp_llinfo_alloc(void); static void arp_llinfo_free(void *); +static void arp_llinfo_purge(struct rtentry *); +static void arp_llinfo_get_ri(struct rtentry *, struct rt_reach_info *); + +static __inline void arp_llreach_use(struct llinfo_arp *); +static __inline int arp_llreach_reachable(struct llinfo_arp *); +static void arp_llreach_alloc(struct rtentry *, struct ifnet *, void *, + unsigned int, boolean_t); extern u_int32_t ipv4_ll_arp_aware; @@ -214,6 +239,7 @@ 
arp_init(void) panic("%s: failed allocating llinfo_arp_zone", __func__); zone_change(llinfo_arp_zone, Z_EXPAND, TRUE); + zone_change(llinfo_arp_zone, Z_CALLERACCT, FALSE); arpinit_done = 1; @@ -243,9 +269,194 @@ arp_llinfo_free(void *arg) la->la_hold = NULL; } + /* Purge any link-layer info caching */ + VERIFY(la->la_rt->rt_llinfo == la); + if (la->la_rt->rt_llinfo_purge != NULL) + la->la_rt->rt_llinfo_purge(la->la_rt); + zfree(llinfo_arp_zone, la); } +static void +arp_llinfo_purge(struct rtentry *rt) +{ + struct llinfo_arp *la = rt->rt_llinfo; + + RT_LOCK_ASSERT_HELD(rt); + VERIFY(rt->rt_llinfo_purge == arp_llinfo_purge && la != NULL); + + if (la->la_llreach != NULL) { + RT_CONVERT_LOCK(rt); + ifnet_llreach_free(la->la_llreach); + la->la_llreach = NULL; + } + la->la_lastused = 0; +} + +static void +arp_llinfo_get_ri(struct rtentry *rt, struct rt_reach_info *ri) +{ + struct llinfo_arp *la = rt->rt_llinfo; + struct if_llreach *lr = la->la_llreach; + + if (lr == NULL) { + bzero(ri, sizeof (*ri)); + } else { + IFLR_LOCK(lr); + /* Export to rt_reach_info structure */ + ifnet_lr2ri(lr, ri); + /* Export ARP send expiration time */ + ri->ri_snd_expire = ifnet_llreach_up2cal(lr, la->la_lastused); + IFLR_UNLOCK(lr); + } +} + +void +arp_llreach_set_reachable(struct ifnet *ifp, void *addr, unsigned int alen) +{ + /* Nothing more to do if it's disabled */ + if (arp_llreach_base == 0) + return; + + ifnet_llreach_set_reachable(ifp, ETHERTYPE_IP, addr, alen); +} + +static __inline void +arp_llreach_use(struct llinfo_arp *la) +{ + if (la->la_llreach != NULL) + la->la_lastused = net_uptime(); +} + +static __inline int +arp_llreach_reachable(struct llinfo_arp *la) +{ + struct if_llreach *lr; + const char *why = NULL; + + /* Nothing more to do if it's disabled; pretend it's reachable */ + if (arp_llreach_base == 0) + return (1); + + if ((lr = la->la_llreach) == NULL) { + /* + * Link-layer reachability record isn't present for this + * ARP entry; pretend it's reachable and use it as 
is. + */ + return (1); + } else if (ifnet_llreach_reachable(lr)) { + /* + * Record is present, it's not shared with other ARP + * entries and a packet has recently been received + * from the remote host; consider it reachable. + */ + if (lr->lr_reqcnt == 1) + return (1); + + /* Prime it up, if this is the first time */ + if (la->la_lastused == 0) { + VERIFY(la->la_llreach != NULL); + arp_llreach_use(la); + } + + /* + * Record is present and shared with one or more ARP + * entries, and a packet has recently been received + * from the remote host. Since it's shared by more + * than one IP addresses, we can't rely on the link- + * layer reachability alone; consider it reachable if + * this ARP entry has been used "recently." + */ + if (ifnet_llreach_reachable_delta(lr, la->la_lastused)) + return (1); + + why = "has alias(es) and hasn't been used in a while"; + } else { + why = "haven't heard from it in a while"; + } + + if (log_arp_warnings) { + char tmp[MAX_IPv4_STR_LEN]; + u_int64_t now = net_uptime(); + + log(LOG_DEBUG, "%s%d: ARP probe(s) needed for %s; " + "%s [lastused %lld, lastrcvd %lld] secs ago\n", + lr->lr_ifp->if_name, lr->lr_ifp->if_unit, inet_ntop(AF_INET, + &SIN(rt_key(la->la_rt))->sin_addr, tmp, sizeof (tmp)), why, + (la->la_lastused ? (int64_t)(now - la->la_lastused) : -1), + (lr->lr_lastrcvd ? (int64_t)(now - lr->lr_lastrcvd) : -1)); + + } + return (0); +} + +/* + * Obtain a link-layer source cache entry for the sender. + * + * NOTE: This is currently only for ARP/Ethernet. 
+ */ +static void +arp_llreach_alloc(struct rtentry *rt, struct ifnet *ifp, void *addr, + unsigned int alen, boolean_t solicited) +{ + VERIFY(rt->rt_expire == 0 || rt->rt_rmx.rmx_expire != 0); + VERIFY(rt->rt_expire != 0 || rt->rt_rmx.rmx_expire == 0); + if (arp_llreach_base != 0 && + rt->rt_expire != 0 && rt->rt_ifp != lo_ifp && + ifp->if_addrlen == IF_LLREACH_MAXLEN && /* Ethernet */ + alen == ifp->if_addrlen) { + struct llinfo_arp *la = rt->rt_llinfo; + struct if_llreach *lr; + const char *why = NULL, *type = ""; + + /* Become a regular mutex, just in case */ + RT_CONVERT_LOCK(rt); + + if ((lr = la->la_llreach) != NULL) { + type = (solicited ? "ARP reply" : "ARP announcement"); + /* + * If target has changed, create a new record; + * otherwise keep existing record. + */ + IFLR_LOCK(lr); + if (bcmp(addr, lr->lr_key.addr, alen) != 0) { + IFLR_UNLOCK(lr); + /* Purge any link-layer info caching */ + VERIFY(rt->rt_llinfo_purge != NULL); + rt->rt_llinfo_purge(rt); + lr = NULL; + why = " for different target HW address; " + "using new llreach record"; + } else { + lr->lr_probes = 0; /* reset probe count */ + IFLR_UNLOCK(lr); + if (solicited) { + why = " for same target HW address; " + "keeping existing llreach record"; + } + } + } + + if (lr == NULL) { + lr = la->la_llreach = ifnet_llreach_alloc(ifp, + ETHERTYPE_IP, addr, alen, arp_llreach_base); + if (lr != NULL) { + lr->lr_probes = 0; /* reset probe count */ + if (why == NULL) + why = "creating new llreach record"; + } + } + + if (log_arp_warnings && lr != NULL && why != NULL) { + char tmp[MAX_IPv4_STR_LEN]; + + log(LOG_DEBUG, "%s%d: %s%s for %s\n", ifp->if_name, + ifp->if_unit, type, why, inet_ntop(AF_INET, + &SIN(rt_key(rt))->sin_addr, tmp, sizeof (tmp))); + } + } +} + /* * Free an arp entry. 
*/ @@ -264,6 +475,16 @@ arptfree(struct llinfo_arp *la) la->la_asked = 0; rt->rt_flags &= ~RTF_REJECT; RT_UNLOCK(rt); + } else if (la->la_persist) { + /* + * Instead of issuing RTM_DELETE, stop this route entry + * from holding an interface idle reference count; if + * the route is later reused, arp_validate() will revert + * this action. + */ + if (rt->rt_refcnt == 0) + rt_clear_idleref(rt); + RT_UNLOCK(rt); } else { /* * Safe to drop rt_lock and use rt_key, since holding @@ -281,16 +502,18 @@ in_arpdrain(void *ignored_arg) { #pragma unused (ignored_arg) struct llinfo_arp *la, *ola; - struct timeval timenow; + uint64_t timenow; lck_mtx_lock(rnh_lock); la = llinfo_arp.lh_first; - getmicrotime(&timenow); + timenow = net_uptime(); while ((ola = la) != 0) { struct rtentry *rt = la->la_rt; la = la->la_le.le_next; RT_LOCK(rt); - if (rt->rt_expire && rt->rt_expire <= timenow.tv_sec) + VERIFY(rt->rt_expire == 0 || rt->rt_rmx.rmx_expire != 0); + VERIFY(rt->rt_expire != 0 || rt->rt_rmx.rmx_expire == 0); + if (rt->rt_expire && rt->rt_expire <= timenow) arptfree(ola); /* timer has expired, clear */ else RT_UNLOCK(rt); @@ -298,6 +521,20 @@ in_arpdrain(void *ignored_arg) lck_mtx_unlock(rnh_lock); } +void +arp_validate(struct rtentry *rt) +{ + struct llinfo_arp *la = rt->rt_llinfo; + + RT_LOCK_ASSERT_HELD(rt); + /* + * If this is a persistent ARP entry, make it count towards the + * interface idleness just like before arptfree() was called. + */ + if (la->la_persist) + rt_set_idleref(rt); +} + /* * Timeout routine. Age arp_tab entries periodically. 
*/ @@ -322,7 +559,7 @@ arp_rtrequest( struct sockaddr *gate = rt->rt_gateway; struct llinfo_arp *la = rt->rt_llinfo; static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK, 0, 0, 0, 0, 0, {0}}; - struct timeval timenow; + uint64_t timenow; if (!arpinit_done) { panic("%s: ARP has not been initialized", __func__); @@ -333,7 +570,7 @@ arp_rtrequest( if (rt->rt_flags & RTF_GATEWAY) return; - getmicrotime(&timenow); + timenow = net_uptime(); switch (req) { case RTM_ADD: @@ -358,12 +595,14 @@ arp_rtrequest( * In case we're called before 1.0 sec. * has elapsed. */ - rt->rt_expire = MAX(timenow.tv_sec, 1); + rt_setexpire(rt, MAX(timenow, 1)); } break; } /* Announce a new entry if requested. */ if (rt->rt_flags & RTF_ANNOUNCE) { + if (la != NULL) + arp_llreach_use(la); /* Mark use timestamp */ RT_UNLOCK(rt); dlil_send_arp(rt->rt_ifp, ARPOP_REQUEST, SDL(gate), rt_key(rt), NULL, rt_key(rt)); @@ -391,6 +630,8 @@ arp_rtrequest( log(LOG_DEBUG, "%s: malloc failed\n", __func__); break; } + rt->rt_llinfo_get_ri = arp_llinfo_get_ri; + rt->rt_llinfo_purge = arp_llinfo_purge; rt->rt_llinfo_free = arp_llinfo_free; arp_inuse++, arp_allocated++; @@ -402,14 +643,16 @@ arp_rtrequest( /* * This keeps the multicast addresses from showing up * in `arp -a' listings as unresolved. It's not actually - * functional. Then the same for broadcast. + * functional. Then the same for broadcast. For IPv4 + * link-local address, keep the entry around even after + * it has expired. */ if (IN_MULTICAST(ntohl(SIN(rt_key(rt))->sin_addr.s_addr))) { RT_UNLOCK(rt); dlil_resolve_multi(rt->rt_ifp, rt_key(rt), gate, sizeof(struct sockaddr_dl)); RT_LOCK(rt); - rt->rt_expire = 0; + rt_setexpire(rt, 0); } else if (in_broadcast(SIN(rt_key(rt))->sin_addr, rt->rt_ifp)) { struct sockaddr_dl *gate_ll = SDL(gate); @@ -421,35 +664,60 @@ arp_rtrequest( gate_ll->sdl_family = AF_LINK; gate_ll->sdl_len = sizeof(struct sockaddr_dl); /* In case we're called before 1.0 sec. 
has elapsed */ - rt->rt_expire = MAX(timenow.tv_sec, 1); + rt_setexpire(rt, MAX(timenow, 1)); + } else if (IN_LINKLOCAL(ntohl(SIN(rt_key(rt))->sin_addr.s_addr))) { + /* + * The persistent bit implies that once the ARP + * entry has reached it expiration time, the idle + * reference count to the interface will be released, + * but the ARP entry itself stays in the routing table + * until it is explicitly removed. + */ + la->la_persist = 1; + rt->rt_flags |= RTF_STATIC; } + /* Become a regular mutex, just in case */ + RT_CONVERT_LOCK(rt); + IFA_LOCK_SPIN(rt->rt_ifa); if (SIN(rt_key(rt))->sin_addr.s_addr == (IA_SIN(rt->rt_ifa))->sin_addr.s_addr) { - /* - * This test used to be - * if (loif.if_flags & IFF_UP) - * It allowed local traffic to be forced - * through the hardware by configuring the loopback down. - * However, it causes problems during network configuration - * for boards that can't receive packets they send. - * It is now necessary to clear "useloopback" and remove - * the route to force traffic out to the hardware. - */ - rt->rt_expire = 0; - ifnet_lladdr_copy_bytes(rt->rt_ifp, LLADDR(SDL(gate)), SDL(gate)->sdl_alen = 6); + IFA_UNLOCK(rt->rt_ifa); + /* + * This test used to be + * if (loif.if_flags & IFF_UP) + * It allowed local traffic to be forced through the + * hardware by configuring the loopback down. However, + * it causes problems during network configuration + * for boards that can't receive packets they send. + * It is now necessary to clear "useloopback" and + * remove the route to force traffic out to the + * hardware. + */ + rt_setexpire(rt, 0); + ifnet_lladdr_copy_bytes(rt->rt_ifp, LLADDR(SDL(gate)), + SDL(gate)->sdl_alen = rt->rt_ifp->if_addrlen); if (useloopback) { -#if IFNET_ROUTE_REFCNT - /* Adjust route ref count for the interfaces */ - if (rt->rt_if_ref_fn != NULL && - rt->rt_ifp != lo_ifp) { - rt->rt_if_ref_fn(lo_ifp, 1); - rt->rt_if_ref_fn(rt->rt_ifp, -1); + if (rt->rt_ifp != lo_ifp) { + /* + * Purge any link-layer info caching. 
+ */ + if (rt->rt_llinfo_purge != NULL) + rt->rt_llinfo_purge(rt); + + /* + * Adjust route ref count for the + * interfaces. + */ + if (rt->rt_if_ref_fn != NULL) { + rt->rt_if_ref_fn(lo_ifp, 1); + rt->rt_if_ref_fn(rt->rt_ifp, -1); + } } -#endif /* IFNET_ROUTE_REFCNT */ rt->rt_ifp = lo_ifp; } - + } else { + IFA_UNLOCK(rt->rt_ifa); } break; @@ -466,10 +734,18 @@ arp_rtrequest( LIST_REMOVE(la, la_le); la->la_le.le_next = NULL; la->la_le.le_prev = NULL; + + /* + * Purge any link-layer info caching. + */ + if (rt->rt_llinfo_purge != NULL) + rt->rt_llinfo_purge(rt); + rt->rt_flags &= ~RTF_LLINFO; - if (la->la_hold != NULL) + if (la->la_hold != NULL) { m_freem(la->la_hold); - la->la_hold = NULL; + la->la_hold = NULL; + } } } @@ -518,6 +794,13 @@ arp_lookup_route(const struct in_addr *addr, int create, int proxy, sin.sin_addr.s_addr = addr->s_addr; sin.sin_other = proxy ? SIN_PROXY : 0; + /* + * If the destination is a link-local address, don't + * constrain the lookup (don't scope it). + */ + if (IN_LINKLOCAL(ntohl(addr->s_addr))) + ifscope = IFSCOPE_NONE; + rt = rtalloc1_scoped((struct sockaddr*)&sin, create, 0, ifscope); if (rt == NULL) return (ENETUNREACH); @@ -592,7 +875,7 @@ __private_extern__ errno_t arp_route_to_gateway_route(const struct sockaddr *net_dest, route_t hint0, route_t *out_route) { - struct timeval timenow; + uint64_t timenow; route_t rt = hint0, hint = hint0; errno_t error = 0; @@ -728,9 +1011,11 @@ arp_route_to_gateway_route(const struct sockaddr *net_dest, route_t hint0, } if (rt->rt_flags & RTF_REJECT) { - getmicrotime(&timenow); - if (rt->rt_rmx.rmx_expire == 0 || - timenow.tv_sec < rt->rt_rmx.rmx_expire) { + VERIFY(rt->rt_expire == 0 || rt->rt_rmx.rmx_expire != 0); + VERIFY(rt->rt_expire != 0 || rt->rt_rmx.rmx_expire == 0); + timenow = net_uptime(); + if (rt->rt_expire == 0 || + timenow < rt->rt_expire) { RT_UNLOCK(rt); senderr(rt == hint ? 
EHOSTDOWN : EHOSTUNREACH); } @@ -774,8 +1059,9 @@ arp_lookup_ip(ifnet_t ifp, const struct sockaddr_in *net_dest, route_t route = NULL; /* output route */ errno_t result = 0; struct sockaddr_dl *gateway; - struct llinfo_arp *llinfo; - struct timeval timenow; + struct llinfo_arp *llinfo = NULL; + uint64_t timenow; + int unreachable = 0; if (net_dest->sin_family != AF_INET) return (EAFNOSUPPORT); @@ -849,7 +1135,7 @@ arp_lookup_ip(ifnet_t ifp, const struct sockaddr_in *net_dest, RT_LOCK_ASSERT_HELD(route); } - if (result || route == NULL || route->rt_llinfo == NULL) { + if (result || route == NULL || (llinfo = route->rt_llinfo) == NULL) { char tmp[MAX_IPv4_STR_LEN]; /* In case result is 0 but no route, return an error */ @@ -868,13 +1154,22 @@ arp_lookup_ip(ifnet_t ifp, const struct sockaddr_in *net_dest, * Now that we have the right route, is it filled in? */ gateway = SDL(route->rt_gateway); - getmicrotime(&timenow); - if ((route->rt_rmx.rmx_expire == 0 || - route->rt_rmx.rmx_expire > timenow.tv_sec) && gateway != NULL && - gateway->sdl_family == AF_LINK && gateway->sdl_alen != 0) { + timenow = net_uptime(); + VERIFY(route->rt_expire == 0 || route->rt_rmx.rmx_expire != 0); + VERIFY(route->rt_expire != 0 || route->rt_rmx.rmx_expire == 0); + if ((route->rt_expire == 0 || + route->rt_expire > timenow) && gateway != NULL && + gateway->sdl_family == AF_LINK && gateway->sdl_alen != 0 && + !(unreachable = !arp_llreach_reachable(llinfo))) { bcopy(gateway, ll_dest, MIN(gateway->sdl_len, ll_dest_len)); result = 0; + arp_llreach_use(llinfo); /* Mark use timestamp */ goto release; + } else if (unreachable) { + /* + * Discard existing answer in case we need to probe. + */ + gateway->sdl_alen = 0; } if (ifp->if_flags & IFF_NOARP) { @@ -885,34 +1180,51 @@ arp_lookup_ip(ifnet_t ifp, const struct sockaddr_in *net_dest, /* * Route wasn't complete/valid. We need to arp. 
*/ - llinfo = route->rt_llinfo; if (packet != NULL) { if (llinfo->la_hold != NULL) m_freem(llinfo->la_hold); llinfo->la_hold = packet; } - if (route->rt_rmx.rmx_expire) { + if (route->rt_expire) { route->rt_flags &= ~RTF_REJECT; if (llinfo->la_asked == 0 || - route->rt_rmx.rmx_expire != timenow.tv_sec) { - route->rt_rmx.rmx_expire = timenow.tv_sec; + route->rt_expire != timenow) { + rt_setexpire(route, timenow); if (llinfo->la_asked++ < arp_maxtries) { struct ifaddr *rt_ifa = route->rt_ifa; - ifaref(rt_ifa); + struct sockaddr *sa; + + /* Become a regular mutex, just in case */ + RT_CONVERT_LOCK(route); + /* Update probe count, if applicable */ + if (llinfo->la_llreach != NULL) { + IFLR_LOCK_SPIN(llinfo->la_llreach); + llinfo->la_llreach->lr_probes++; + IFLR_UNLOCK(llinfo->la_llreach); + } + IFA_LOCK_SPIN(rt_ifa); + IFA_ADDREF_LOCKED(rt_ifa); + sa = rt_ifa->ifa_addr; + IFA_UNLOCK(rt_ifa); + arp_llreach_use(llinfo); /* Mark use timestamp */ RT_UNLOCK(route); dlil_send_arp(ifp, ARPOP_REQUEST, NULL, - rt_ifa->ifa_addr, NULL, - (const struct sockaddr*)net_dest); - ifafree(rt_ifa); + sa, NULL, (const struct sockaddr*)net_dest); + IFA_REMREF(rt_ifa); RT_LOCK(route); result = EJUSTRETURN; goto release; } else { route->rt_flags |= RTF_REJECT; - route->rt_rmx.rmx_expire = rt_expiry(route, - route->rt_rmx.rmx_expire, arpt_down); + rt_setexpire(route, rt_expiry(route, + route->rt_expire, arpt_down)); llinfo->la_asked = 0; + /* + * Clear la_hold; don't free the packet since + * we're not returning EJUSTRETURN; the caller + * will handle the freeing. 
+ */ llinfo->la_hold = NULL; result = EHOSTUNREACH; goto release; @@ -950,52 +1262,61 @@ arp_ip_handle_input( struct ifaddr *ifa; struct in_ifaddr *ia; struct in_ifaddr *best_ia = NULL; + struct sockaddr_in best_ia_sin; route_t route = NULL; char buf[3 * MAX_HW_LEN]; // enough for MAX_HW_LEN byte hw address struct llinfo_arp *llinfo; errno_t error; int created_announcement = 0; int bridged = 0, is_bridge = 0; - + /* Do not respond to requests for 0.0.0.0 */ if (target_ip->sin_addr.s_addr == 0 && arpop == ARPOP_REQUEST) goto done; - - if (ifp->if_bridge) + + if (ifp->if_bridge) bridged = 1; if (ifp->if_type == IFT_BRIDGE) is_bridge = 1; /* * Determine if this ARP is for us - * For a bridge, we want to check the address irrespective + * For a bridge, we want to check the address irrespective * of the receive interface. */ lck_rw_lock_shared(in_ifaddr_rwlock); TAILQ_FOREACH(ia, INADDR_HASH(target_ip->sin_addr.s_addr), ia_hash) { + IFA_LOCK_SPIN(&ia->ia_ifa); if (((bridged && ia->ia_ifp->if_bridge != NULL) || - (ia->ia_ifp == ifp)) && + (ia->ia_ifp == ifp)) && ia->ia_addr.sin_addr.s_addr == target_ip->sin_addr.s_addr) { - best_ia = ia; - ifaref(&best_ia->ia_ifa); - lck_rw_done(in_ifaddr_rwlock); - goto match; + best_ia = ia; + best_ia_sin = best_ia->ia_addr; + IFA_ADDREF_LOCKED(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); + lck_rw_done(in_ifaddr_rwlock); + goto match; } + IFA_UNLOCK(&ia->ia_ifa); } TAILQ_FOREACH(ia, INADDR_HASH(sender_ip->sin_addr.s_addr), ia_hash) { + IFA_LOCK_SPIN(&ia->ia_ifa); if (((bridged && ia->ia_ifp->if_bridge != NULL) || - (ia->ia_ifp == ifp)) && + (ia->ia_ifp == ifp)) && ia->ia_addr.sin_addr.s_addr == sender_ip->sin_addr.s_addr) { - best_ia = ia; - ifaref(&best_ia->ia_ifa); - lck_rw_done(in_ifaddr_rwlock); - goto match; + best_ia = ia; + best_ia_sin = best_ia->ia_addr; + IFA_ADDREF_LOCKED(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); + lck_rw_done(in_ifaddr_rwlock); + goto match; } + IFA_UNLOCK(&ia->ia_ifa); } -#define BDG_MEMBER_MATCHES_ARP(addr, 
ifp, ia) \ - (ia->ia_ifp->if_bridge == ifp->if_softc && \ +#define BDG_MEMBER_MATCHES_ARP(addr, ifp, ia) \ + (ia->ia_ifp->if_bridge == ifp->if_softc && \ !bcmp(ifnet_lladdr(ia->ia_ifp), ifnet_lladdr(ifp), ifp->if_addrlen) && \ addr == ia->ia_addr.sin_addr.s_addr) /* @@ -1005,14 +1326,20 @@ arp_ip_handle_input( * meant to be destined to the bridge member. */ if (is_bridge) { - TAILQ_FOREACH(ia, INADDR_HASH(target_ip->sin_addr.s_addr), ia_hash) { - if (BDG_MEMBER_MATCHES_ARP(target_ip->sin_addr.s_addr, ifp, ia)) { + TAILQ_FOREACH(ia, INADDR_HASH(target_ip->sin_addr.s_addr), + ia_hash) { + IFA_LOCK_SPIN(&ia->ia_ifa); + if (BDG_MEMBER_MATCHES_ARP(target_ip->sin_addr.s_addr, + ifp, ia)) { ifp = ia->ia_ifp; best_ia = ia; - ifaref(&best_ia->ia_ifa); + best_ia_sin = best_ia->ia_addr; + IFA_ADDREF_LOCKED(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); goto match; } + IFA_UNLOCK(&ia->ia_ifa); } } lck_rw_done(in_ifaddr_rwlock); @@ -1024,10 +1351,15 @@ arp_ip_handle_input( */ ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - if (ifa->ifa_addr->sa_family != AF_INET) + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_addr->sa_family != AF_INET) { + IFA_UNLOCK(ifa); continue; + } best_ia = (struct in_ifaddr *)ifa; - ifaref(&best_ia->ia_ifa); + best_ia_sin = best_ia->ia_addr; + IFA_ADDREF_LOCKED(ifa); + IFA_UNLOCK(ifa); ifnet_lock_done(ifp); goto match; } @@ -1042,15 +1374,17 @@ arp_ip_handle_input( match: /* If the packet is from this interface, ignore the packet */ - if (!bcmp(CONST_LLADDR(sender_hw), ifnet_lladdr(ifp), sender_hw->sdl_len)) { + if (!bcmp(CONST_LLADDR(sender_hw), ifnet_lladdr(ifp), sender_hw->sdl_alen)) { goto done; } /* Check for a conflict */ - if (!bridged && sender_ip->sin_addr.s_addr == best_ia->ia_addr.sin_addr.s_addr) { + if (!bridged && sender_ip->sin_addr.s_addr == best_ia_sin.sin_addr.s_addr) { struct kev_msg ev_msg; struct kev_in_collision *in_collision; u_char storage[sizeof(struct kev_in_collision) + MAX_HW_LEN]; + 
bzero(&ev_msg, sizeof(struct kev_msg)); + bzero(storage, (sizeof(struct kev_in_collision) + MAX_HW_LEN)); in_collision = (struct kev_in_collision*)storage; log(LOG_ERR, "%s%d duplicate IP address %s sent from address %s\n", ifp->if_name, ifp->if_unit, @@ -1083,7 +1417,7 @@ arp_ip_handle_input( * entry locked, upon success. */ error = arp_lookup_route(&sender_ip->sin_addr, - (target_ip->sin_addr.s_addr == best_ia->ia_addr.sin_addr.s_addr && + (target_ip->sin_addr.s_addr == best_ia_sin.sin_addr.s_addr && sender_ip->sin_addr.s_addr != 0), 0, &route, ifp->if_index); if (error == 0) @@ -1142,6 +1476,9 @@ arp_ip_handle_input( sdl_addr_to_hex(sender_hw, buf, sizeof(buf)), ifp->if_name, ifp->if_unit); } + /* Mark use timestamp */ + if (route->rt_llinfo != NULL) + arp_llreach_use(route->rt_llinfo); /* We're done with the route */ RT_REMREF_LOCKED(route); RT_UNLOCK(route); @@ -1152,21 +1489,19 @@ arp_ip_handle_input( * This will not force the device to pick a new number if the device * has already assigned that number. * This will not imply to the device that we own that address. + * The link address is always present; it's never freed. */ ifnet_lock_shared(ifp); - ifa = TAILQ_FIRST(&ifp->if_addrhead); - if (ifa != NULL) - ifaref(ifa); + ifa = ifp->if_lladdr; + IFA_ADDREF(ifa); ifnet_lock_done(ifp); dlil_send_arp_internal(ifp, ARPOP_REQUEST, - ifa != NULL ? 
SDL(ifa->ifa_addr) : NULL, + SDL(ifa->ifa_addr), (const struct sockaddr*)sender_ip, sender_hw, (const struct sockaddr*)target_ip); - if (ifa != NULL) { - ifafree(ifa); - ifa = NULL; - } - } + IFA_REMREF(ifa); + ifa = NULL; + } } goto respond; } else if (keep_announcements != 0 @@ -1203,6 +1538,8 @@ arp_ip_handle_input( } RT_LOCK_ASSERT_HELD(route); + VERIFY(route->rt_expire == 0 || route->rt_rmx.rmx_expire != 0); + VERIFY(route->rt_expire != 0 || route->rt_rmx.rmx_expire == 0); gateway = SDL(route->rt_gateway); if (!bridged && route->rt_ifp != ifp) { if (!IN_LINKLOCAL(ntohl(sender_ip->sin_addr.s_addr)) || (ifp->if_eflags & IFEF_ARPLL) == 0) { @@ -1218,7 +1555,7 @@ arp_ip_handle_input( } else { /* Don't change a permanent address */ - if (route->rt_rmx.rmx_expire == 0) { + if (route->rt_expire == 0) { goto respond; } @@ -1249,14 +1586,19 @@ arp_ip_handle_input( lck_mtx_unlock(rnh_lock); goto respond; } -#if IFNET_ROUTE_REFCNT - /* Adjust route ref count for the interfaces */ - if (route->rt_if_ref_fn != NULL && - route->rt_ifp != ifp) { - route->rt_if_ref_fn(ifp, 1); - route->rt_if_ref_fn(route->rt_ifp, -1); + if (route->rt_ifp != ifp) { + /* + * Purge any link-layer info caching. 
+ */ + if (route->rt_llinfo_purge != NULL) + route->rt_llinfo_purge(route); + + /* Adjust route ref count for the interfaces */ + if (route->rt_if_ref_fn != NULL) { + route->rt_if_ref_fn(ifp, 1); + route->rt_if_ref_fn(route->rt_ifp, -1); + } } -#endif /* IFNET_ROUTE_REFCNT */ /* Change the interface when the existing route is on */ route->rt_ifp = ifp; rtsetifa(route, &best_ia->ia_ifa); @@ -1274,7 +1616,7 @@ arp_ip_handle_input( } if (gateway->sdl_alen && bcmp(LLADDR(gateway), CONST_LLADDR(sender_hw), gateway->sdl_alen)) { - if (route->rt_rmx.rmx_expire && log_arp_warnings) { + if (route->rt_expire && log_arp_warnings) { char buf2[3 * MAX_HW_LEN]; log(LOG_INFO, "arp: %s moved from %s to %s on %s%d\n", inet_ntop(AF_INET, &sender_ip->sin_addr, ipv4str, @@ -1283,7 +1625,7 @@ arp_ip_handle_input( sdl_addr_to_hex(sender_hw, buf2, sizeof(buf2)), ifp->if_name, ifp->if_unit); } - else if (route->rt_rmx.rmx_expire == 0) { + else if (route->rt_expire == 0) { if (log_arp_warnings) { log(LOG_ERR, "arp: %s attempts to modify " "permanent entry for %s on %s%d\n", @@ -1302,22 +1644,26 @@ arp_ip_handle_input( bcopy(CONST_LLADDR(sender_hw), LLADDR(gateway), gateway->sdl_alen); /* Update the expire time for the route and clear the reject flag */ - if (route->rt_rmx.rmx_expire) { - struct timeval timenow; - - getmicrotime(&timenow); - route->rt_rmx.rmx_expire = - rt_expiry(route, timenow.tv_sec, arpt_keep); + if (route->rt_expire) { + uint64_t timenow; + + timenow = net_uptime(); + rt_setexpire(route, + rt_expiry(route, timenow, arpt_keep)); } route->rt_flags &= ~RTF_REJECT; + /* cache the gateway (sender HW) address */ + arp_llreach_alloc(route, ifp, LLADDR(gateway), gateway->sdl_alen, + (arpop == ARPOP_REPLY)); + /* update the llinfo, send a queued packet if there is one */ llinfo = route->rt_llinfo; llinfo->la_asked = 0; if (llinfo->la_hold) { struct mbuf *m0; m0 = llinfo->la_hold; - llinfo->la_hold = 0; + llinfo->la_hold = NULL; RT_UNLOCK(route); dlil_output(ifp, PF_INET, m0, 
(caddr_t)route, rt_key(route), 0); @@ -1327,6 +1673,9 @@ arp_ip_handle_input( respond: if (route != NULL) { + /* Mark use timestamp if we're going to send a reply */ + if (arpop == ARPOP_REQUEST && route->rt_llinfo != NULL) + arp_llreach_use(route->rt_llinfo); RT_REMREF_LOCKED(route); RT_UNLOCK(route); route = NULL; @@ -1336,7 +1685,7 @@ arp_ip_handle_input( goto done; /* If we are not the target, check if we should proxy */ - if (target_ip->sin_addr.s_addr != best_ia->ia_addr.sin_addr.s_addr) { + if (target_ip->sin_addr.s_addr != best_ia_sin.sin_addr.s_addr) { /* * Find a proxy route; callee holds a reference on the * route and returns with the route entry locked, upon @@ -1390,6 +1739,9 @@ arp_ip_handle_input( goto done; } } + /* Mark use timestamp */ + if (route->rt_llinfo != NULL) + arp_llreach_use(route->rt_llinfo); RT_REMREF_LOCKED(route); RT_UNLOCK(route); } @@ -1400,16 +1752,19 @@ arp_ip_handle_input( done: if (best_ia != NULL) - ifafree(&best_ia->ia_ifa); + IFA_REMREF(&best_ia->ia_ifa); return 0; } void -arp_ifinit( - struct ifnet *ifp, - struct ifaddr *ifa) +arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa) { + struct sockaddr *sa; + + IFA_LOCK(ifa); ifa->ifa_rtrequest = arp_rtrequest; ifa->ifa_flags |= RTF_CLONING; - dlil_send_arp(ifp, ARPOP_REQUEST, NULL, ifa->ifa_addr, NULL, ifa->ifa_addr); + sa = ifa->ifa_addr; + IFA_UNLOCK(ifa); + dlil_send_arp(ifp, ARPOP_REQUEST, NULL, sa, NULL, sa); } diff --git a/bsd/netinet/in_arp.h b/bsd/netinet/in_arp.h index 9b1a740ac..99a106572 100644 --- a/bsd/netinet/in_arp.h +++ b/bsd/netinet/in_arp.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009 Apple Inc. All rights reserved. + * Copyright (c) 2009-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -67,8 +67,11 @@ extern errno_t inet_arp_lookup(ifnet_t interface, size_t ll_dest_len, route_t hint, mbuf_t packet); #endif /* BSD_KERNEL_PRIVATE */ #ifdef KERNEL_PRIVATE +struct in_addr; extern void arp_init(void); extern void in_arpdrain(void *); +extern void arp_validate(struct rtentry *); +extern void arp_llreach_set_reachable(struct ifnet *, void *, unsigned int); /* arp_lookup_ip is obsolete, use inet_arp_lookup */ extern errno_t arp_lookup_ip(ifnet_t interface, const struct sockaddr_in *ip_dest, struct sockaddr_dl *ll_dest, diff --git a/bsd/netinet/in_cksum.c b/bsd/netinet/in_cksum.c index cf3e3dbca..1fcafd583 100644 --- a/bsd/netinet/in_cksum.c +++ b/bsd/netinet/in_cksum.c @@ -93,7 +93,7 @@ union q_util { u_int64_t q; }; -#define ADDCARRY(x) (x > 65535 ? x -= 65535 : x) +#define ADDCARRY(x) do { if (x > 65535) { x -= 65535; } } while (0) #define REDUCE32 \ { \ @@ -118,7 +118,7 @@ inet_cksum_simple(struct mbuf *m, int len) return (inet_cksum(m, 0, 0, len)); } -inline u_short +u_short in_addword(u_short a, u_short b) { union l_util l_util; @@ -128,7 +128,7 @@ in_addword(u_short a, u_short b) return (sum); } -inline u_short +u_short in_pseudo(u_int a, u_int b, u_int c) { u_int64_t sum; @@ -141,77 +141,7 @@ in_pseudo(u_int a, u_int b, u_int c) } -#if defined(__ppc__) - -extern u_short xsum_assym(u_short *p, int len, u_short xsum, int odd); - -u_int16_t -inet_cksum(struct mbuf *m, unsigned int nxt, unsigned int skip, - unsigned int len) -{ - u_short *w; - u_int32_t sum = 0; - int mlen = 0; - int starting_on_odd = 0; - - KERNEL_DEBUG(DBG_FNC_IN_CKSUM | DBG_FUNC_START, len,0,0,0,0); - - /* sanity check */ - if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.len < skip + len) { - panic("inet_cksum: mbuf len (%d) < off+len (%d+%d)\n", - m->m_pkthdr.len, skip, len); - } - - /* include pseudo header checksum? 
*/ - if (nxt != 0) { - struct ip *iph; - - if (m->m_len < sizeof (struct ip)) - panic("inet_cksum: bad mbuf chain"); - - iph = mtod(m, struct ip *); - sum = in_pseudo(iph->ip_src.s_addr, iph->ip_dst.s_addr, - htonl(len + nxt)); - } - - if (skip != 0) { - for (; skip && m; m = m->m_next) { - if (m->m_len > skip) { - mlen = m->m_len - skip; - w = (u_short *)(m->m_data+skip); - goto skip_start; - } else { - skip -= m->m_len; - } - } - } - - for (;m && len; m = m->m_next) { - if (m->m_len == 0) - continue; - mlen = m->m_len; - w = mtod(m, u_short *); - -skip_start: - if (len < mlen) - mlen = len; - sum = xsum_assym(w, mlen, sum, starting_on_odd); - len -= mlen; - if (mlen & 0x1) - { - if (starting_on_odd) - starting_on_odd = 0; - else - starting_on_odd = 1; - } - } - - KERNEL_DEBUG(DBG_FNC_IN_CKSUM | DBG_FUNC_END, 0,0,0,0,0); - - return (~sum & 0xffff); -} - -#elif defined(__arm__) && __ARM_ARCH__ >= 6 +#if defined(__arm__) && __ARM_ARCH__ >= 6 extern int cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum); diff --git a/bsd/netinet/in_dhcp.c b/bsd/netinet/in_dhcp.c index c6fdffdd8..90fc06ae5 100644 --- a/bsd/netinet/in_dhcp.c +++ b/bsd/netinet/in_dhcp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1988-2007 Apple Inc. All rights reserved. + * Copyright (c) 1988-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -281,12 +281,6 @@ link_print(struct sockaddr_dl * dl_p) { int i; -#if 0 - printf("len %d index %d family %d type 0x%x nlen %d alen %d" - " slen %d addr ", dl_p->sdl_len, - dl_p->sdl_index, dl_p->sdl_family, dl_p->sdl_type, - dl_p->sdl_nlen, dl_p->sdl_alen, dl_p->sdl_slen); -#endif for (i = 0; i < dl_p->sdl_alen; i++) printf("%s%x", i ? 
":" : "", (link_address(dl_p))[i]); @@ -297,19 +291,7 @@ link_print(struct sockaddr_dl * dl_p) static struct sockaddr_dl * link_from_ifnet(struct ifnet * ifp) { - struct ifaddr * addr; - - ifnet_lock_shared(ifp); - TAILQ_FOREACH(addr, &ifp->if_addrhead, ifa_link) { - if (addr->ifa_addr->sa_family == AF_LINK) { - struct sockaddr_dl * dl_p = (struct sockaddr_dl *)(addr->ifa_addr); - - ifnet_lock_done(ifp); - return (dl_p); - } - } - ifnet_lock_done(ifp); - return (NULL); + return ((struct sockaddr_dl *)ifp->if_lladdr->ifa_addr); } /* diff --git a/bsd/netinet/in_gif.c b/bsd/netinet/in_gif.c index 482aef5e4..9a6cb3db6 100644 --- a/bsd/netinet/in_gif.c +++ b/bsd/netinet/in_gif.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -94,7 +94,7 @@ #include int ip_gif_ttl = GIF_TTL; -SYSCTL_INT(_net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_gif_ttl, 0, ""); int @@ -111,7 +111,7 @@ in_gif_output( struct ip iphdr; /* capsule IP header, host byte ordered */ int proto, error; u_int8_t tos; - struct ip_out_args ipoa = { IFSCOPE_NONE }; + struct ip_out_args ipoa = { IFSCOPE_NONE, 0 }; if (sin_src == NULL || sin_dst == NULL || sin_src->sin_family != AF_INET || @@ -371,10 +371,13 @@ gif_encapcheck4( { if ((ifnet_flags(ia4->ia_ifa.ifa_ifp) & IFF_BROADCAST) == 0) continue; + IFA_LOCK(&ia4->ia_ifa); if (ip.ip_src.s_addr == ia4->ia_broadaddr.sin_addr.s_addr) { + IFA_UNLOCK(&ia4->ia_ifa); lck_rw_done(in_ifaddr_rwlock); return 0; } + IFA_UNLOCK(&ia4->ia_ifa); } lck_rw_done(in_ifaddr_rwlock); @@ -393,11 +396,6 @@ gif_encapcheck4( if (rt != NULL) RT_LOCK(rt); if (rt == NULL || rt->rt_ifp != m->m_pkthdr.rcvif) { -#if 0 - log(LOG_WARNING, "%s: packet from 0x%x dropped " - "due to ingress filter\n", if_name(&sc->gif_if), - (u_int32_t)ntohl(sin.sin_addr.s_addr)); -#endif if 
(rt != NULL) { RT_UNLOCK(rt); rtfree(rt); diff --git a/bsd/netinet/in_mcast.c b/bsd/netinet/in_mcast.c new file mode 100644 index 000000000..1854fd26e --- /dev/null +++ b/bsd/netinet/in_mcast.c @@ -0,0 +1,3641 @@ +/* + * Copyright (c) 2010-2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/*- + * Copyright (c) 2007-2009 Bruce Simpson. + * Copyright (c) 2005 Robert N. M. Watson. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * IPv4 multicast socket, group, and socket option processing module. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifndef __SOCKUNION_DECLARED +union sockunion { + struct sockaddr_storage ss; + struct sockaddr sa; + struct sockaddr_dl sdl; + struct sockaddr_in sin; +}; +typedef union sockunion sockunion_t; +#define __SOCKUNION_DECLARED +#endif /* __SOCKUNION_DECLARED */ + +/* + * Functions with non-static linkage defined in this file should be + * declared in in_var.h: + * imo_multi_filter() + * in_addmulti() + * in_delmulti() + * in_joingroup() + * in_leavegroup() + * and ip_var.h: + * inp_freemoptions() + * inp_getmoptions() + * inp_setmoptions() + * + * XXX: Both carp and pf need to use the legacy (*,G) KPIs in_addmulti() + * and in_delmulti(). + */ +static void imf_commit(struct in_mfilter *); +static int imf_get_source(struct in_mfilter *imf, + const struct sockaddr_in *psin, + struct in_msource **); +static struct in_msource * + imf_graft(struct in_mfilter *, const uint8_t, + const struct sockaddr_in *); +static int imf_prune(struct in_mfilter *, const struct sockaddr_in *); +static void imf_rollback(struct in_mfilter *); +static void imf_reap(struct in_mfilter *); +static int imo_grow(struct ip_moptions *, size_t); +static size_t imo_match_group(const struct ip_moptions *, + const struct ifnet *, const struct sockaddr *); +static struct in_msource * + imo_match_source(const struct ip_moptions *, const size_t, + const struct sockaddr *); +static void ims_merge(struct ip_msource *ims, + const struct in_msource *lims, const int rollback); +static int in_getmulti(struct ifnet *, const struct in_addr *, + struct in_multi **); +static int in_joingroup(struct ifnet *, const struct in_addr *, + struct in_mfilter *, struct in_multi **); +static int inm_get_source(struct in_multi *inm, const in_addr_t haddr, + const int 
noalloc, struct ip_msource **pims); +static int inm_is_ifp_detached(const struct in_multi *); +static int inm_merge(struct in_multi *, /*const*/ struct in_mfilter *); +static void inm_reap(struct in_multi *); +static struct ip_moptions * + inp_findmoptions(struct inpcb *); +static int inp_get_source_filters(struct inpcb *, struct sockopt *); +static struct ifnet * + inp_lookup_mcast_ifp(const struct inpcb *, + const struct sockaddr_in *, const struct in_addr); +static int inp_block_unblock_source(struct inpcb *, struct sockopt *); +static int inp_set_multicast_if(struct inpcb *, struct sockopt *); +static int inp_set_source_filters(struct inpcb *, struct sockopt *); +static int sysctl_ip_mcast_filters SYSCTL_HANDLER_ARGS; +static struct ifnet * ip_multicast_if(struct in_addr *, unsigned int *); +static __inline__ int ip_msource_cmp(const struct ip_msource *, + const struct ip_msource *); + +SYSCTL_NODE(_net_inet_ip, OID_AUTO, mcast, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "IPv4 multicast"); + +static u_long in_mcast_maxgrpsrc = IP_MAX_GROUP_SRC_FILTER; +SYSCTL_LONG(_net_inet_ip_mcast, OID_AUTO, maxgrpsrc, + CTLFLAG_RW | CTLFLAG_LOCKED, &in_mcast_maxgrpsrc, "Max source filters per group"); + +static u_long in_mcast_maxsocksrc = IP_MAX_SOCK_SRC_FILTER; +SYSCTL_LONG(_net_inet_ip_mcast, OID_AUTO, maxsocksrc, + CTLFLAG_RW | CTLFLAG_LOCKED, &in_mcast_maxsocksrc, + "Max source filters per socket"); + +int in_mcast_loop = IP_DEFAULT_MULTICAST_LOOP; +SYSCTL_INT(_net_inet_ip_mcast, OID_AUTO, loop, CTLFLAG_RW | CTLFLAG_LOCKED, + &in_mcast_loop, 0, "Loopback multicast datagrams by default"); + +SYSCTL_NODE(_net_inet_ip_mcast, OID_AUTO, filters, + CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_ip_mcast_filters, + "Per-interface stack-wide source filters"); + +RB_GENERATE_PREV(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp); + +#define INM_TRACE_HIST_SIZE 32 /* size of trace history */ + +/* For gdb */ +__private_extern__ unsigned int inm_trace_hist_size = INM_TRACE_HIST_SIZE; + +struct 
in_multi_dbg { + struct in_multi inm; /* in_multi */ + u_int16_t inm_refhold_cnt; /* # of ref */ + u_int16_t inm_refrele_cnt; /* # of rele */ + /* + * Circular lists of inm_addref and inm_remref callers. + */ + ctrace_t inm_refhold[INM_TRACE_HIST_SIZE]; + ctrace_t inm_refrele[INM_TRACE_HIST_SIZE]; + /* + * Trash list linkage + */ + TAILQ_ENTRY(in_multi_dbg) inm_trash_link; +}; + +/* List of trash in_multi entries protected by inm_trash_lock */ +static TAILQ_HEAD(, in_multi_dbg) inm_trash_head; +static decl_lck_mtx_data(, inm_trash_lock); + +#define INM_ZONE_MAX 64 /* maximum elements in zone */ +#define INM_ZONE_NAME "in_multi" /* zone name */ + +#if DEBUG +static unsigned int inm_debug = 1; /* debugging (enabled) */ +#else +static unsigned int inm_debug; /* debugging (disabled) */ +#endif /* !DEBUG */ +static unsigned int inm_size; /* size of zone element */ +static struct zone *inm_zone; /* zone for in_multi */ + +#define IPMS_ZONE_MAX 64 /* maximum elements in zone */ +#define IPMS_ZONE_NAME "ip_msource" /* zone name */ + +static unsigned int ipms_size; /* size of zone element */ +static struct zone *ipms_zone; /* zone for ip_msource */ + +#define INMS_ZONE_MAX 64 /* maximum elements in zone */ +#define INMS_ZONE_NAME "in_msource" /* zone name */ + +static unsigned int inms_size; /* size of zone element */ +static struct zone *inms_zone; /* zone for in_msource */ + +/* Lock group and attribute for in_multihead_lock lock */ +static lck_attr_t *in_multihead_lock_attr; +static lck_grp_t *in_multihead_lock_grp; +static lck_grp_attr_t *in_multihead_lock_grp_attr; + +static decl_lck_rw_data(, in_multihead_lock); +struct in_multihead in_multihead; + +static struct in_multi *in_multi_alloc(int); +static void in_multi_free(struct in_multi *); +static void in_multi_attach(struct in_multi *); +static void inm_trace(struct in_multi *, int); + +static struct ip_msource *ipms_alloc(int); +static void ipms_free(struct ip_msource *); +static struct in_msource *inms_alloc(int); 
+static void inms_free(struct in_msource *); + +#define IMO_CAST_TO_NONCONST(x) ((struct ip_moptions *)(void *)(uintptr_t)x) +#define INM_CAST_TO_NONCONST(x) ((struct in_multi *)(void *)(uintptr_t)x) + +static __inline int +ip_msource_cmp(const struct ip_msource *a, const struct ip_msource *b) +{ + + if (a->ims_haddr < b->ims_haddr) + return (-1); + if (a->ims_haddr == b->ims_haddr) + return (0); + return (1); +} + +/* + * Inline function which wraps assertions for a valid ifp. + */ +static __inline__ int +inm_is_ifp_detached(const struct in_multi *inm) +{ + VERIFY(inm->inm_ifma != NULL); + VERIFY(inm->inm_ifp == inm->inm_ifma->ifma_ifp); + + return (!ifnet_is_attached(inm->inm_ifp, 0)); +} + +/* + * Initialize an in_mfilter structure to a known state at t0, t1 + * with an empty source filter list. + */ +static __inline__ void +imf_init(struct in_mfilter *imf, const int st0, const int st1) +{ + memset(imf, 0, sizeof(struct in_mfilter)); + RB_INIT(&imf->imf_sources); + imf->imf_st[0] = st0; + imf->imf_st[1] = st1; +} + +/* + * Resize the ip_moptions vector to the next power-of-two minus 1. 
+ */ +static int +imo_grow(struct ip_moptions *imo, size_t newmax) +{ + struct in_multi **nmships; + struct in_multi **omships; + struct in_mfilter *nmfilters; + struct in_mfilter *omfilters; + size_t idx; + size_t oldmax; + + IMO_LOCK_ASSERT_HELD(imo); + + nmships = NULL; + nmfilters = NULL; + omships = imo->imo_membership; + omfilters = imo->imo_mfilters; + oldmax = imo->imo_max_memberships; + if (newmax == 0) + newmax = ((oldmax + 1) * 2) - 1; + + if (newmax > IP_MAX_MEMBERSHIPS) + return (ETOOMANYREFS); + + if ((nmships = (struct in_multi **)_REALLOC(omships, + sizeof (struct in_multi *) * newmax, M_IPMOPTS, + M_WAITOK | M_ZERO)) == NULL) + return (ENOMEM); + + imo->imo_membership = nmships; + + if ((nmfilters = (struct in_mfilter *)_REALLOC(omfilters, + sizeof (struct in_mfilter) * newmax, M_INMFILTER, + M_WAITOK | M_ZERO)) == NULL) + return (ENOMEM); + + imo->imo_mfilters = nmfilters; + + /* Initialize newly allocated source filter heads. */ + for (idx = oldmax; idx < newmax; idx++) + imf_init(&nmfilters[idx], MCAST_UNDEFINED, MCAST_EXCLUDE); + + imo->imo_max_memberships = newmax; + + return (0); +} + +/* + * Find an IPv4 multicast group entry for this ip_moptions instance + * which matches the specified group, and optionally an interface. + * Return its index into the array, or -1 if not found. + */ +static size_t +imo_match_group(const struct ip_moptions *imo, const struct ifnet *ifp, + const struct sockaddr *group) +{ + const struct sockaddr_in *gsin; + struct in_multi *pinm; + int idx; + int nmships; + + IMO_LOCK_ASSERT_HELD(IMO_CAST_TO_NONCONST(imo)); + + gsin = (const struct sockaddr_in *)group; + + /* The imo_membership array may be lazy allocated. 
*/ + if (imo->imo_membership == NULL || imo->imo_num_memberships == 0) + return (-1); + + nmships = imo->imo_num_memberships; + for (idx = 0; idx < nmships; idx++) { + pinm = imo->imo_membership[idx]; + if (pinm == NULL) + continue; + INM_LOCK(pinm); + if ((ifp == NULL || (pinm->inm_ifp == ifp)) && + in_hosteq(pinm->inm_addr, gsin->sin_addr)) { + INM_UNLOCK(pinm); + break; + } + INM_UNLOCK(pinm); + } + if (idx >= nmships) + idx = -1; + + return (idx); +} + +/* + * Find an IPv4 multicast source entry for this imo which matches + * the given group index for this socket, and source address. + * + * NOTE: This does not check if the entry is in-mode, merely if + * it exists, which may not be the desired behaviour. + */ +static struct in_msource * +imo_match_source(const struct ip_moptions *imo, const size_t gidx, + const struct sockaddr *src) +{ + struct ip_msource find; + struct in_mfilter *imf; + struct ip_msource *ims; + const sockunion_t *psa; + + IMO_LOCK_ASSERT_HELD(IMO_CAST_TO_NONCONST(imo)); + + VERIFY(src->sa_family == AF_INET); + VERIFY(gidx != (size_t)-1 && gidx < imo->imo_num_memberships); + + /* The imo_mfilters array may be lazy allocated. */ + if (imo->imo_mfilters == NULL) + return (NULL); + imf = &imo->imo_mfilters[gidx]; + + /* Source trees are keyed in host byte order. */ + psa = (const sockunion_t *)src; + find.ims_haddr = ntohl(psa->sin.sin_addr.s_addr); + ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); + + return ((struct in_msource *)ims); +} + +/* + * Perform filtering for multicast datagrams on a socket by group and source. + * + * Returns 0 if a datagram should be allowed through, or various error codes + * if the socket was not a member of the group, or the source was muted, etc. 
+ */ +int +imo_multi_filter(const struct ip_moptions *imo, const struct ifnet *ifp, + const struct sockaddr *group, const struct sockaddr *src) +{ + size_t gidx; + struct in_msource *ims; + int mode; + + IMO_LOCK_ASSERT_HELD(IMO_CAST_TO_NONCONST(imo)); + VERIFY(ifp != NULL); + + gidx = imo_match_group(imo, ifp, group); + if (gidx == (size_t)-1) + return (MCAST_NOTGMEMBER); + + /* + * Check if the source was included in an (S,G) join. + * Allow reception on exclusive memberships by default, + * reject reception on inclusive memberships by default. + * Exclude source only if an in-mode exclude filter exists. + * Include source only if an in-mode include filter exists. + * NOTE: We are comparing group state here at IGMP t1 (now) + * with socket-layer t0 (since last downcall). + */ + mode = imo->imo_mfilters[gidx].imf_st[1]; + ims = imo_match_source(imo, gidx, src); + + if ((ims == NULL && mode == MCAST_INCLUDE) || + (ims != NULL && ims->imsl_st[0] != mode)) { + return (MCAST_NOTSMEMBER); + } + + return (MCAST_PASS); +} + +int +imo_clone(struct ip_moptions *from, struct ip_moptions *to) +{ + int i, err = 0; + + IMO_LOCK(from); + IMO_LOCK(to); + + to->imo_multicast_ifp = from->imo_multicast_ifp; + to->imo_multicast_vif = from->imo_multicast_vif; + to->imo_multicast_ttl = from->imo_multicast_ttl; + to->imo_multicast_loop = from->imo_multicast_loop; + + /* + * We're cloning, so drop any existing memberships and source + * filters on the destination ip_moptions. + */ + for (i = 0; i < to->imo_num_memberships; ++i) { + struct in_mfilter *imf; + + imf = to->imo_mfilters ? 
&to->imo_mfilters[i] : NULL; + if (imf != NULL) + imf_leave(imf); + + (void) in_leavegroup(to->imo_membership[i], imf); + + if (imf != NULL) + imf_purge(imf); + + INM_REMREF(to->imo_membership[i]); + to->imo_membership[i] = NULL; + } + to->imo_num_memberships = 0; + + VERIFY(to->imo_max_memberships != 0 && from->imo_max_memberships != 0); + if (to->imo_max_memberships < from->imo_max_memberships) { + /* + * Ensure source and destination ip_moptions memberships + * and source filters arrays are at least equal in size. + */ + err = imo_grow(to, from->imo_max_memberships); + if (err != 0) + goto done; + } + VERIFY(to->imo_max_memberships >= from->imo_max_memberships); + + /* + * Source filtering doesn't apply to OpenTransport socket, + * so simply hold additional reference count per membership. + */ + for (i = 0; i < from->imo_num_memberships; i++) { + to->imo_membership[i] = from->imo_membership[i]; + INM_ADDREF(from->imo_membership[i]); + to->imo_num_memberships++; + } + VERIFY(to->imo_num_memberships == from->imo_num_memberships); + +done: + IMO_UNLOCK(to); + IMO_UNLOCK(from); + + return (err); +} + +/* + * Find and return a reference to an in_multi record for (ifp, group), + * and bump its reference count. + * If one does not exist, try to allocate it, and update link-layer multicast + * filters on ifp to listen for group. + * Return 0 if successful, otherwise return an appropriate error code. + */ +static int +in_getmulti(struct ifnet *ifp, const struct in_addr *group, + struct in_multi **pinm) +{ + struct sockaddr_in gsin; + struct ifmultiaddr *ifma; + struct in_multi *inm; + int error; + + in_multihead_lock_shared(); + IN_LOOKUP_MULTI(group, ifp, inm); + if (inm != NULL) { + INM_LOCK(inm); + VERIFY(inm->inm_reqcnt >= 1); + inm->inm_reqcnt++; + VERIFY(inm->inm_reqcnt != 0); + *pinm = inm; + INM_UNLOCK(inm); + in_multihead_lock_done(); + /* + * We already joined this group; return the inm + * with a refcount held (via lookup) for caller. 
+ */ + return (0); + } + in_multihead_lock_done(); + + bzero(&gsin, sizeof(gsin)); + gsin.sin_family = AF_INET; + gsin.sin_len = sizeof(struct sockaddr_in); + gsin.sin_addr = *group; + + /* + * Check if a link-layer group is already associated + * with this network-layer group on the given ifnet. + */ + error = if_addmulti(ifp, (struct sockaddr *)&gsin, &ifma); + if (error != 0) + return (error); + + /* + * See comments in inm_remref() for access to ifma_protospec. + */ + in_multihead_lock_exclusive(); + IFMA_LOCK(ifma); + if ((inm = ifma->ifma_protospec) != NULL) { + VERIFY(ifma->ifma_addr != NULL); + VERIFY(ifma->ifma_addr->sa_family == AF_INET); + INM_ADDREF(inm); /* for caller */ + IFMA_UNLOCK(ifma); + INM_LOCK(inm); + VERIFY(inm->inm_ifma == ifma); + VERIFY(inm->inm_ifp == ifp); + VERIFY(in_hosteq(inm->inm_addr, *group)); + if (inm->inm_debug & IFD_ATTACHED) { + VERIFY(inm->inm_reqcnt >= 1); + inm->inm_reqcnt++; + VERIFY(inm->inm_reqcnt != 0); + *pinm = inm; + INM_UNLOCK(inm); + in_multihead_lock_done(); + IFMA_REMREF(ifma); + /* + * We lost the race with another thread doing + * in_getmulti(); since this group has already + * been joined; return the inm with a refcount + * held for caller. + */ + return (0); + } + /* + * We lost the race with another thread doing in_delmulti(); + * the inm referring to the ifma has been detached, thus we + * reattach it back to the in_multihead list and return the + * inm with a refcount held for the caller. + */ + in_multi_attach(inm); + VERIFY((inm->inm_debug & + (IFD_ATTACHED | IFD_TRASHED)) == IFD_ATTACHED); + *pinm = inm; + INM_UNLOCK(inm); + in_multihead_lock_done(); + IFMA_REMREF(ifma); + return (0); + } + IFMA_UNLOCK(ifma); + + /* + * A new in_multi record is needed; allocate and initialize it. + * We DO NOT perform an IGMP join as the in_ layer may need to + * push an initial source list down to IGMP to support SSM. + * + * The initial source filter state is INCLUDE, {} as per the RFC. 
+ */ + inm = in_multi_alloc(M_WAITOK); + if (inm == NULL) { + in_multihead_lock_done(); + IFMA_REMREF(ifma); + return (ENOMEM); + } + INM_LOCK(inm); + inm->inm_addr = *group; + inm->inm_ifp = ifp; + inm->inm_igi = IGMP_IFINFO(ifp); + VERIFY(inm->inm_igi != NULL); + IGI_ADDREF(inm->inm_igi); + inm->inm_ifma = ifma; /* keep refcount from if_addmulti() */ + inm->inm_state = IGMP_NOT_MEMBER; + /* + * Pending state-changes per group are subject to a bounds check. + */ + inm->inm_scq.ifq_maxlen = IGMP_MAX_STATE_CHANGES; + inm->inm_st[0].iss_fmode = MCAST_UNDEFINED; + inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; + RB_INIT(&inm->inm_srcs); + *pinm = inm; + in_multi_attach(inm); + VERIFY((inm->inm_debug & (IFD_ATTACHED | IFD_TRASHED)) == IFD_ATTACHED); + INM_ADDREF_LOCKED(inm); /* for caller */ + INM_UNLOCK(inm); + + IFMA_LOCK(ifma); + VERIFY(ifma->ifma_protospec == NULL); + ifma->ifma_protospec = inm; + IFMA_UNLOCK(ifma); + in_multihead_lock_done(); + + return (0); +} + +/* + * Clear recorded source entries for a group. + * Used by the IGMP code. + * FIXME: Should reap. + */ +void +inm_clear_recorded(struct in_multi *inm) +{ + struct ip_msource *ims; + + INM_LOCK_ASSERT_HELD(inm); + + RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { + if (ims->ims_stp) { + ims->ims_stp = 0; + --inm->inm_st[1].iss_rec; + } + } + VERIFY(inm->inm_st[1].iss_rec == 0); +} + +/* + * Record a source as pending for a Source-Group IGMPv3 query. + * This lives here as it modifies the shared tree. + * + * inm is the group descriptor. + * naddr is the address of the source to record in network-byte order. + * + * If the net.inet.igmp.sgalloc sysctl is non-zero, we will + * lazy-allocate a source node in response to an SG query. + * Otherwise, no allocation is performed. This saves some memory + * with the trade-off that the source will not be reported to the + * router if joined in the window between the query response and + * the group actually being joined on the local host. 
+ * + * Return 0 if the source didn't exist or was already marked as recorded. + * Return 1 if the source was marked as recorded by this function. + * Return <0 if any error occured (negated errno code). + */ +int +inm_record_source(struct in_multi *inm, const in_addr_t naddr) +{ + struct ip_msource find; + struct ip_msource *ims, *nims; + + INM_LOCK_ASSERT_HELD(inm); + + find.ims_haddr = ntohl(naddr); + ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find); + if (ims && ims->ims_stp) + return (0); + if (ims == NULL) { + if (inm->inm_nsrc == in_mcast_maxgrpsrc) + return (-ENOSPC); + nims = ipms_alloc(M_WAITOK); + if (nims == NULL) + return (-ENOMEM); + nims->ims_haddr = find.ims_haddr; + RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims); + ++inm->inm_nsrc; + ims = nims; + } + + /* + * Mark the source as recorded and update the recorded + * source count. + */ + ++ims->ims_stp; + ++inm->inm_st[1].iss_rec; + + return (1); +} + +/* + * Return a pointer to an in_msource owned by an in_mfilter, + * given its source address. + * Lazy-allocate if needed. If this is a new entry its filter state is + * undefined at t0. + * + * imf is the filter set being modified. + * haddr is the source address in *host* byte-order. + * + * Caller is expected to be holding imo_lock. 
+ */ +static int +imf_get_source(struct in_mfilter *imf, const struct sockaddr_in *psin, + struct in_msource **plims) +{ + struct ip_msource find; + struct ip_msource *ims; + struct in_msource *lims; + int error; + + error = 0; + ims = NULL; + lims = NULL; + + /* key is host byte order */ + find.ims_haddr = ntohl(psin->sin_addr.s_addr); + ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); + lims = (struct in_msource *)ims; + if (lims == NULL) { + if (imf->imf_nsrc == in_mcast_maxsocksrc) + return (ENOSPC); + lims = inms_alloc(M_WAITOK); + if (lims == NULL) + return (ENOMEM); + lims->ims_haddr = find.ims_haddr; + lims->imsl_st[0] = MCAST_UNDEFINED; + RB_INSERT(ip_msource_tree, &imf->imf_sources, + (struct ip_msource *)lims); + ++imf->imf_nsrc; + } + + *plims = lims; + + return (error); +} + +/* + * Graft a source entry into an existing socket-layer filter set, + * maintaining any required invariants and checking allocations. + * + * The source is marked as being in the new filter mode at t1. + * + * Return the pointer to the new node, otherwise return NULL. + * + * Caller is expected to be holding imo_lock. + */ +static struct in_msource * +imf_graft(struct in_mfilter *imf, const uint8_t st1, + const struct sockaddr_in *psin) +{ + struct in_msource *lims; + + lims = inms_alloc(M_WAITOK); + if (lims == NULL) + return (NULL); + lims->ims_haddr = ntohl(psin->sin_addr.s_addr); + lims->imsl_st[0] = MCAST_UNDEFINED; + lims->imsl_st[1] = st1; + RB_INSERT(ip_msource_tree, &imf->imf_sources, + (struct ip_msource *)lims); + ++imf->imf_nsrc; + + return (lims); +} + +/* + * Prune a source entry from an existing socket-layer filter set, + * maintaining any required invariants and checking allocations. + * + * The source is marked as being left at t1, it is not freed. + * + * Return 0 if no error occurred, otherwise return an errno value. + * + * Caller is expected to be holding imo_lock. 
+ */ +static int +imf_prune(struct in_mfilter *imf, const struct sockaddr_in *psin) +{ + struct ip_msource find; + struct ip_msource *ims; + struct in_msource *lims; + + /* key is host byte order */ + find.ims_haddr = ntohl(psin->sin_addr.s_addr); + ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); + if (ims == NULL) + return (ENOENT); + lims = (struct in_msource *)ims; + lims->imsl_st[1] = MCAST_UNDEFINED; + return (0); +} + +/* + * Revert socket-layer filter set deltas at t1 to t0 state. + * + * Caller is expected to be holding imo_lock. + */ +static void +imf_rollback(struct in_mfilter *imf) +{ + struct ip_msource *ims, *tims; + struct in_msource *lims; + + RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { + lims = (struct in_msource *)ims; + if (lims->imsl_st[0] == lims->imsl_st[1]) { + /* no change at t1 */ + continue; + } else if (lims->imsl_st[0] != MCAST_UNDEFINED) { + /* revert change to existing source at t1 */ + lims->imsl_st[1] = lims->imsl_st[0]; + } else { + /* revert source added t1 */ + IGMP_PRINTF(("%s: free inms %p\n", __func__, lims)); + RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); + inms_free(lims); + imf->imf_nsrc--; + } + } + imf->imf_st[1] = imf->imf_st[0]; +} + +/* + * Mark socket-layer filter set as INCLUDE {} at t1. + * + * Caller is expected to be holding imo_lock. + */ +void +imf_leave(struct in_mfilter *imf) +{ + struct ip_msource *ims; + struct in_msource *lims; + + RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { + lims = (struct in_msource *)ims; + lims->imsl_st[1] = MCAST_UNDEFINED; + } + imf->imf_st[1] = MCAST_INCLUDE; +} + +/* + * Mark socket-layer filter set deltas as committed. + * + * Caller is expected to be holding imo_lock. 
+ */ +static void +imf_commit(struct in_mfilter *imf) +{ + struct ip_msource *ims; + struct in_msource *lims; + + RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { + lims = (struct in_msource *)ims; + lims->imsl_st[0] = lims->imsl_st[1]; + } + imf->imf_st[0] = imf->imf_st[1]; +} + +/* + * Reap unreferenced sources from socket-layer filter set. + * + * Caller is expected to be holding imo_lock. + */ +static void +imf_reap(struct in_mfilter *imf) +{ + struct ip_msource *ims, *tims; + struct in_msource *lims; + + RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { + lims = (struct in_msource *)ims; + if ((lims->imsl_st[0] == MCAST_UNDEFINED) && + (lims->imsl_st[1] == MCAST_UNDEFINED)) { + IGMP_PRINTF(("%s: free inms %p\n", __func__, lims)); + RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); + inms_free(lims); + imf->imf_nsrc--; + } + } +} + +/* + * Purge socket-layer filter set. + * + * Caller is expected to be holding imo_lock. + */ +void +imf_purge(struct in_mfilter *imf) +{ + struct ip_msource *ims, *tims; + struct in_msource *lims; + + RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { + lims = (struct in_msource *)ims; + IGMP_PRINTF(("%s: free inms %p\n", __func__, lims)); + RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); + inms_free(lims); + imf->imf_nsrc--; + } + imf->imf_st[0] = imf->imf_st[1] = MCAST_UNDEFINED; + VERIFY(RB_EMPTY(&imf->imf_sources)); +} + +/* + * Look up a source filter entry for a multicast group. + * + * inm is the group descriptor to work with. + * haddr is the host-byte-order IPv4 address to look up. + * noalloc may be non-zero to suppress allocation of sources. + * *pims will be set to the address of the retrieved or allocated source. + * + * Return 0 if successful, otherwise return a non-zero error code. 
+ */ +static int +inm_get_source(struct in_multi *inm, const in_addr_t haddr, + const int noalloc, struct ip_msource **pims) +{ + struct ip_msource find; + struct ip_msource *ims, *nims; +#ifdef IGMP_DEBUG + struct in_addr ia; +#endif + INM_LOCK_ASSERT_HELD(inm); + + find.ims_haddr = haddr; + ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find); + if (ims == NULL && !noalloc) { + if (inm->inm_nsrc == in_mcast_maxgrpsrc) + return (ENOSPC); + nims = ipms_alloc(M_WAITOK); + if (nims == NULL) + return (ENOMEM); + nims->ims_haddr = haddr; + RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims); + ++inm->inm_nsrc; + ims = nims; +#ifdef IGMP_DEBUG + ia.s_addr = htonl(haddr); + IGMP_PRINTF(("%s: allocated %s as %p\n", __func__, + inet_ntoa(ia), ims)); +#endif + } + + *pims = ims; + return (0); +} + +/* + * Helper function to derive the filter mode on a source entry + * from its internal counters. Predicates are: + * A source is only excluded if all listeners exclude it. + * A source is only included if no listeners exclude it, + * and at least one listener includes it. + * May be used by ifmcstat(8). + */ +uint8_t +ims_get_mode(const struct in_multi *inm, const struct ip_msource *ims, + uint8_t t) +{ + INM_LOCK_ASSERT_HELD(INM_CAST_TO_NONCONST(inm)); + + t = !!t; + if (inm->inm_st[t].iss_ex > 0 && + inm->inm_st[t].iss_ex == ims->ims_st[t].ex) + return (MCAST_EXCLUDE); + else if (ims->ims_st[t].in > 0 && ims->ims_st[t].ex == 0) + return (MCAST_INCLUDE); + return (MCAST_UNDEFINED); +} + +/* + * Merge socket-layer source into IGMP-layer source. + * If rollback is non-zero, perform the inverse of the merge. + */ +static void +ims_merge(struct ip_msource *ims, const struct in_msource *lims, + const int rollback) +{ + int n = rollback ? 
-1 : 1; +#ifdef IGMP_DEBUG + struct in_addr ia; + + ia.s_addr = htonl(ims->ims_haddr); +#endif + + if (lims->imsl_st[0] == MCAST_EXCLUDE) { + IGMP_PRINTF(("%s: t1 ex -= %d on %s\n", + __func__, n, inet_ntoa(ia))); + ims->ims_st[1].ex -= n; + } else if (lims->imsl_st[0] == MCAST_INCLUDE) { + IGMP_PRINTF(("%s: t1 in -= %d on %s\n", + __func__, n, inet_ntoa(ia))); + ims->ims_st[1].in -= n; + } + + if (lims->imsl_st[1] == MCAST_EXCLUDE) { + IGMP_PRINTF(("%s: t1 ex += %d on %s\n", + __func__, n, inet_ntoa(ia))); + ims->ims_st[1].ex += n; + } else if (lims->imsl_st[1] == MCAST_INCLUDE) { + IGMP_PRINTF(("%s: t1 in += %d on %s\n", + __func__, n, inet_ntoa(ia))); + ims->ims_st[1].in += n; + } +} + +/* + * Atomically update the global in_multi state, when a membership's + * filter list is being updated in any way. + * + * imf is the per-inpcb-membership group filter pointer. + * A fake imf may be passed for in-kernel consumers. + * + * XXX This is a candidate for a set-symmetric-difference style loop + * which would eliminate the repeated lookup from root of ims nodes, + * as they share the same key space. + * + * If any error occurred this function will back out of refcounts + * and return a non-zero value. + */ +static int +inm_merge(struct in_multi *inm, /*const*/ struct in_mfilter *imf) +{ + struct ip_msource *ims, *nims; + struct in_msource *lims; + int schanged, error; + int nsrc0, nsrc1; + + INM_LOCK_ASSERT_HELD(inm); + + schanged = 0; + error = 0; + nsrc1 = nsrc0 = 0; + + /* + * Update the source filters first, as this may fail. + * Maintain count of in-mode filters at t0, t1. These are + * used to work out if we transition into ASM mode or not. + * Maintain a count of source filters whose state was + * actually modified by this operation. 
+ */ + RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { + lims = (struct in_msource *)ims; + if (lims->imsl_st[0] == imf->imf_st[0]) nsrc0++; + if (lims->imsl_st[1] == imf->imf_st[1]) nsrc1++; + if (lims->imsl_st[0] == lims->imsl_st[1]) continue; + error = inm_get_source(inm, lims->ims_haddr, 0, &nims); + ++schanged; + if (error) + break; + ims_merge(nims, lims, 0); + } + if (error) { + struct ip_msource *bims; + + RB_FOREACH_REVERSE_FROM(ims, ip_msource_tree, nims) { + lims = (struct in_msource *)ims; + if (lims->imsl_st[0] == lims->imsl_st[1]) + continue; + (void) inm_get_source(inm, lims->ims_haddr, 1, &bims); + if (bims == NULL) + continue; + ims_merge(bims, lims, 1); + } + goto out_reap; + } + + IGMP_PRINTF(("%s: imf filters in-mode: %d at t0, %d at t1\n", + __func__, nsrc0, nsrc1)); + + /* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */ + if (imf->imf_st[0] == imf->imf_st[1] && + imf->imf_st[1] == MCAST_INCLUDE) { + if (nsrc1 == 0) { + IGMP_PRINTF(("%s: --in on inm at t1\n", __func__)); + --inm->inm_st[1].iss_in; + } + } + + /* Handle filter mode transition on socket. */ + if (imf->imf_st[0] != imf->imf_st[1]) { + IGMP_PRINTF(("%s: imf transition %d to %d\n", + __func__, imf->imf_st[0], imf->imf_st[1])); + + if (imf->imf_st[0] == MCAST_EXCLUDE) { + IGMP_PRINTF(("%s: --ex on inm at t1\n", __func__)); + --inm->inm_st[1].iss_ex; + } else if (imf->imf_st[0] == MCAST_INCLUDE) { + IGMP_PRINTF(("%s: --in on inm at t1\n", __func__)); + --inm->inm_st[1].iss_in; + } + + if (imf->imf_st[1] == MCAST_EXCLUDE) { + IGMP_PRINTF(("%s: ex++ on inm at t1\n", __func__)); + inm->inm_st[1].iss_ex++; + } else if (imf->imf_st[1] == MCAST_INCLUDE && nsrc1 > 0) { + IGMP_PRINTF(("%s: in++ on inm at t1\n", __func__)); + inm->inm_st[1].iss_in++; + } + } + + /* + * Track inm filter state in terms of listener counts. + * If there are any exclusive listeners, stack-wide + * membership is exclusive. + * Otherwise, if only inclusive listeners, stack-wide is inclusive. 
+ * If no listeners remain, state is undefined at t1, + * and the IGMP lifecycle for this group should finish. + */ + if (inm->inm_st[1].iss_ex > 0) { + IGMP_PRINTF(("%s: transition to EX\n", __func__)); + inm->inm_st[1].iss_fmode = MCAST_EXCLUDE; + } else if (inm->inm_st[1].iss_in > 0) { + IGMP_PRINTF(("%s: transition to IN\n", __func__)); + inm->inm_st[1].iss_fmode = MCAST_INCLUDE; + } else { + IGMP_PRINTF(("%s: transition to UNDEF\n", __func__)); + inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; + } + + /* Decrement ASM listener count on transition out of ASM mode. */ + if (imf->imf_st[0] == MCAST_EXCLUDE && nsrc0 == 0) { + if ((imf->imf_st[1] != MCAST_EXCLUDE) || + (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) { + IGMP_PRINTF(("%s: --asm on inm at t1\n", __func__)); + --inm->inm_st[1].iss_asm; + } + } + + /* Increment ASM listener count on transition to ASM mode. */ + if (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 == 0) { + IGMP_PRINTF(("%s: asm++ on inm at t1\n", __func__)); + inm->inm_st[1].iss_asm++; + } + + IGMP_PRINTF(("%s: merged imf %p to inm %p\n", __func__, imf, inm)); + inm_print(inm); + +out_reap: + if (schanged > 0) { + IGMP_PRINTF(("%s: sources changed; reaping\n", __func__)); + inm_reap(inm); + } + return (error); +} + +/* + * Mark an in_multi's filter set deltas as committed. + * Called by IGMP after a state change has been enqueued. + */ +void +inm_commit(struct in_multi *inm) +{ + struct ip_msource *ims; + + INM_LOCK_ASSERT_HELD(inm); + + IGMP_PRINTF(("%s: commit inm %p\n", __func__, inm)); + IGMP_PRINTF(("%s: pre commit:\n", __func__)); + inm_print(inm); + + RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { + ims->ims_st[0] = ims->ims_st[1]; + } + inm->inm_st[0] = inm->inm_st[1]; +} + +/* + * Reap unreferenced nodes from an in_multi's filter set. 
+ *
+ * The caller must hold the inm lock; this is asserted on entry.
+ */
+static void
+inm_reap(struct in_multi *inm)
+{
+	struct ip_msource *ims, *tims;
+
+	INM_LOCK_ASSERT_HELD(inm);
+
+	RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) {
+		if (ims->ims_st[0].ex > 0 || ims->ims_st[0].in > 0 ||
+		    ims->ims_st[1].ex > 0 || ims->ims_st[1].in > 0 ||
+		    ims->ims_stp != 0)
+			continue;	/* a count or ims_stp is nonzero: keep node */
+		IGMP_PRINTF(("%s: free ims %p\n", __func__, ims));
+		RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims);
+		ipms_free(ims);
+		inm->inm_nsrc--;
+	}
+}
+
+/*
+ * Purge all source nodes from an in_multi's filter set.
+ *
+ * Unlike inm_reap(), every node in inm_srcs is removed and freed
+ * unconditionally, and inm_nsrc is decremented once per node.
+ * The caller must hold the inm lock; this is asserted on entry.
+ */
+void
+inm_purge(struct in_multi *inm)
+{
+	struct ip_msource *ims, *tims;
+
+	INM_LOCK_ASSERT_HELD(inm);
+
+	RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) {
+		IGMP_PRINTF(("%s: free ims %p\n", __func__, ims));
+		RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims);
+		ipms_free(ims);
+		inm->inm_nsrc--;
+	}
+}
+
+/*
+ * Join a multicast group; real entry point.
+ *
+ * Only preserves atomicity at inm level.
+ * NOTE: imf argument cannot be const due to sys/tree.h limitations.
+ *
+ * If the IGMP downcall fails, the group is not joined, and an error
+ * code is returned.
+ *
+ * ifp, gina:	interface and group address handed to in_getmulti().
+ * imf:		source filter to merge into the in_multi; may be NULL,
+ *		in which case a temporary filter is initialized locally.
+ * pinm:	cleared to NULL on entry; on success receives the in_multi,
+ *		which keeps the reference obtained from in_getmulti().
+ *
+ * Returns 0 on success.  If inm_merge() or igmp_change_state() fails,
+ * the reference from in_getmulti() is dropped with INM_REMREF() and
+ * *pinm stays NULL.
+ */
+static int
+in_joingroup(struct ifnet *ifp, const struct in_addr *gina,
+    /*const*/ struct in_mfilter *imf, struct in_multi **pinm)
+{
+	struct in_mfilter timf;
+	struct in_multi *inm = NULL;
+	int error = 0;
+
+	IGMP_PRINTF(("%s: join %s on %p(%s%d))\n", __func__,
+	    inet_ntoa(*gina), ifp, ifp->if_name, ifp->if_unit));
+
+	*pinm = NULL;
+
+	/*
+	 * If no imf was specified (i.e. kernel consumer),
+	 * fake one up and assume it is an ASM join.
+	 */
+	if (imf == NULL) {
+		imf_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE);
+		imf = &timf;
+	}
+
+	error = in_getmulti(ifp, gina, &inm);
+	if (error) {
+		IGMP_PRINTF(("%s: in_getmulti() failure\n", __func__));
+		return (error);
+	}
+
+	IGMP_PRINTF(("%s: merge inm state\n", __func__));
+
+	INM_LOCK(inm);
+	error = inm_merge(inm, imf);
+	if (error) {
+		IGMP_PRINTF(("%s: failed to merge inm state\n", __func__));
+		goto out_inm_release;
+	}
+
+	IGMP_PRINTF(("%s: doing igmp downcall\n", __func__));
+	error = igmp_change_state(inm);
+	if (error) {
+		IGMP_PRINTF(("%s: failed to update source\n", __func__));
+		goto out_inm_release;
+	}
+
+out_inm_release:	/* reached on both the success and the failure path */
+	if (error) {
+		IGMP_PRINTF(("%s: dropping ref on %p\n", __func__, inm));
+		INM_UNLOCK(inm);
+		INM_REMREF(inm);
+	} else {
+		INM_UNLOCK(inm);
+		*pinm = inm;	/* keep refcount from in_getmulti() */
+	}
+
+	return (error);
+}
+
+/*
+ * Leave a multicast group; real entry point.
+ * All source filters will be expunged.
+ *
+ * Only preserves atomicity at inm level.
+ *
+ * Note: This is not the same as inm_release(*) as this function also
+ * makes a state change downcall into IGMP.
+ *
+ * The caller must not hold the inm lock (asserted on entry).  This
+ * function takes the in_multihead lock exclusively, then the inm lock,
+ * and drops both before returning.  The value returned is the result
+ * of igmp_change_state().  If in_multi_detach() returns nonzero, one
+ * reference is released with INM_REMREF() after the locks are dropped.
+ */
+int
+in_leavegroup(struct in_multi *inm, /*const*/ struct in_mfilter *imf)
+{
+	struct in_mfilter timf;
+	int error, lastref;
+
+	error = 0;
+
+	INM_LOCK_ASSERT_NOTHELD(inm);
+
+	in_multihead_lock_exclusive();
+	INM_LOCK(inm);
+
+	IGMP_PRINTF(("%s: leave inm %p, %s/%s%d, imf %p\n", __func__,
+	    inm, inet_ntoa(inm->inm_addr),
+	    (inm_is_ifp_detached(inm) ? "null" : inm->inm_ifp->if_name),
+	    inm->inm_ifp->if_unit, imf));
+
+	/*
+	 * If no imf was specified (i.e. kernel consumer), fake one up.
+	 * Note the mode arguments are the reverse of those used by
+	 * in_joingroup(): MCAST_EXCLUDE first, MCAST_UNDEFINED second.
+	 */
+	if (imf == NULL) {
+		imf_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED);
+		imf = &timf;
+	}
+
+	/*
+	 * Begin state merge transaction at IGMP layer.
+	 *
+	 * As this particular invocation should not cause any memory
+	 * to be allocated, and there is no opportunity to roll back
+	 * the transaction, it MUST NOT fail.
+	 */
+	IGMP_PRINTF(("%s: merge inm state\n", __func__));
+
+	error = inm_merge(inm, imf);
+	KASSERT(error == 0, ("%s: failed to merge inm state\n", __func__));
+
+	IGMP_PRINTF(("%s: doing igmp downcall\n", __func__));
+	error = igmp_change_state(inm);
+#if IGMP_DEBUG
+	if (error)
+		IGMP_PRINTF(("%s: failed igmp downcall\n", __func__));
+#endif
+	lastref = in_multi_detach(inm);
+	VERIFY(!lastref || (!(inm->inm_debug & IFD_ATTACHED) &&
+	    inm->inm_reqcnt == 0));
+	INM_UNLOCK(inm);
+	in_multihead_lock_done();
+
+	if (lastref)
+		INM_REMREF(inm);	/* for in_multihead list */
+
+	return (error);
+}
+
+/*
+ * Join an IPv4 multicast group in (*,G) exclusive mode.
+ * The group must be a 224.0.0.0/24 link-scope group.
+ * This KPI is for legacy kernel consumers only.
+ *
+ * Returns the in_multi handed back by in_joingroup(), or NULL if
+ * in_joingroup() failed; the error code itself is not reported.
+ */
+struct in_multi *
+in_addmulti(struct in_addr *ap, struct ifnet *ifp)
+{
+	struct in_multi *pinm = NULL;
+	int error;
+
+	KASSERT(IN_LOCAL_GROUP(ntohl(ap->s_addr)),
+	    ("%s: %s not in 224.0.0.0/24\n", __func__, inet_ntoa(*ap)));
+
+	error = in_joingroup(ifp, ap, NULL, &pinm);
+	VERIFY(pinm != NULL || error != 0);
+
+	return (pinm);
+}
+
+/*
+ * Leave an IPv4 multicast group, assumed to be in exclusive (*,G) mode.
+ * This KPI is for legacy kernel consumers only.
+ *
+ * Any error returned by in_leavegroup() is discarded.
+ */
+void
+in_delmulti(struct in_multi *inm)
+{
+
+	(void) in_leavegroup(inm, NULL);
+}
+
+/*
+ * Block or unblock an ASM multicast source on an inpcb.
+ * This implements the delta-based API described in RFC 3678.
+ *
+ * The delta-based API applies only to exclusive-mode memberships.
+ * An IGMP downcall will be performed.
+ *
+ * Return 0 if successful, otherwise return an appropriate error code.
+ */ +static int +inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) +{ + struct group_source_req gsr; + sockunion_t *gsa, *ssa; + struct ifnet *ifp; + struct in_mfilter *imf; + struct ip_moptions *imo; + struct in_msource *ims; + struct in_multi *inm; + size_t idx; + uint16_t fmode; + int error, doblock; + unsigned int ifindex = 0; + + ifp = NULL; + error = 0; + doblock = 0; + + memset(&gsr, 0, sizeof(struct group_source_req)); + gsa = (sockunion_t *)&gsr.gsr_group; + ssa = (sockunion_t *)&gsr.gsr_source; + + switch (sopt->sopt_name) { + case IP_BLOCK_SOURCE: + case IP_UNBLOCK_SOURCE: { + struct ip_mreq_source mreqs; + + error = sooptcopyin(sopt, &mreqs, + sizeof(struct ip_mreq_source), + sizeof(struct ip_mreq_source)); + if (error) + return (error); + + gsa->sin.sin_family = AF_INET; + gsa->sin.sin_len = sizeof(struct sockaddr_in); + gsa->sin.sin_addr = mreqs.imr_multiaddr; + + ssa->sin.sin_family = AF_INET; + ssa->sin.sin_len = sizeof(struct sockaddr_in); + ssa->sin.sin_addr = mreqs.imr_sourceaddr; + + if (!in_nullhost(mreqs.imr_interface)) + ifp = ip_multicast_if(&mreqs.imr_interface, &ifindex); + + if (sopt->sopt_name == IP_BLOCK_SOURCE) + doblock = 1; + + IGMP_PRINTF(("%s: imr_interface = %s, ifp = %p\n", + __func__, inet_ntoa(mreqs.imr_interface), ifp)); + break; + } + + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + error = sooptcopyin(sopt, &gsr, + sizeof(struct group_source_req), + sizeof(struct group_source_req)); + if (error) + return (error); + + if (gsa->sin.sin_family != AF_INET || + gsa->sin.sin_len != sizeof(struct sockaddr_in)) + return (EINVAL); + + if (ssa->sin.sin_family != AF_INET || + ssa->sin.sin_len != sizeof(struct sockaddr_in)) + return (EINVAL); + + ifnet_head_lock_shared(); + if (gsr.gsr_interface == 0 || + (u_int)if_index < gsr.gsr_interface) { + ifnet_head_done(); + return (EADDRNOTAVAIL); + } + + ifp = ifindex2ifnet[gsr.gsr_interface]; + ifnet_head_done(); + + if (ifp == NULL) + return (EADDRNOTAVAIL); + + if 
(sopt->sopt_name == MCAST_BLOCK_SOURCE) + doblock = 1; + break; + + default: + IGMP_PRINTF(("%s: unknown sopt_name %d\n", + __func__, sopt->sopt_name)); + return (EOPNOTSUPP); + break; + } + + if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) + return (EINVAL); + + /* + * Check if we are actually a member of this group. + */ + imo = inp_findmoptions(inp); + if (imo == NULL) + return (ENOMEM); + + IMO_LOCK(imo); + idx = imo_match_group(imo, ifp, &gsa->sa); + if (idx == (size_t)-1 || imo->imo_mfilters == NULL) { + error = EADDRNOTAVAIL; + goto out_imo_locked; + } + + VERIFY(imo->imo_mfilters != NULL); + imf = &imo->imo_mfilters[idx]; + inm = imo->imo_membership[idx]; + + /* + * Attempting to use the delta-based API on an + * non exclusive-mode membership is an error. + */ + fmode = imf->imf_st[0]; + if (fmode != MCAST_EXCLUDE) { + error = EINVAL; + goto out_imo_locked; + } + + /* + * Deal with error cases up-front: + * Asked to block, but already blocked; or + * Asked to unblock, but nothing to unblock. + * If adding a new block entry, allocate it. + */ + ims = imo_match_source(imo, idx, &ssa->sa); + if ((ims != NULL && doblock) || (ims == NULL && !doblock)) { + IGMP_PRINTF(("%s: source %s %spresent\n", __func__, + inet_ntoa(ssa->sin.sin_addr), doblock ? "" : "not ")); + error = EADDRNOTAVAIL; + goto out_imo_locked; + } + + /* + * Begin state merge transaction at socket layer. + */ + if (doblock) { + IGMP_PRINTF(("%s: %s source\n", __func__, "block")); + ims = imf_graft(imf, fmode, &ssa->sin); + if (ims == NULL) + error = ENOMEM; + } else { + IGMP_PRINTF(("%s: %s source\n", __func__, "allow")); + error = imf_prune(imf, &ssa->sin); + } + + if (error) { + IGMP_PRINTF(("%s: merge imf state failed\n", __func__)); + goto out_imf_rollback; + } + + /* + * Begin state merge transaction at IGMP layer. 
+ */ + INM_LOCK(inm); + IGMP_PRINTF(("%s: merge inm state\n", __func__)); + error = inm_merge(inm, imf); + if (error) { + IGMP_PRINTF(("%s: failed to merge inm state\n", __func__)); + INM_UNLOCK(inm); + goto out_imf_rollback; + } + + IGMP_PRINTF(("%s: doing igmp downcall\n", __func__)); + error = igmp_change_state(inm); + INM_UNLOCK(inm); +#if IGMP_DEBUG + if (error) + IGMP_PRINTF(("%s: failed igmp downcall\n", __func__)); +#endif + +out_imf_rollback: + if (error) + imf_rollback(imf); + else + imf_commit(imf); + + imf_reap(imf); + +out_imo_locked: + IMO_UNLOCK(imo); + IMO_REMREF(imo); /* from inp_findmoptions() */ + return (error); +} + +/* + * Given an inpcb, return its multicast options structure pointer. + * + * Caller is responsible for locking the inpcb, and releasing the + * extra reference held on the imo, upon a successful return. + */ +static struct ip_moptions * +inp_findmoptions(struct inpcb *inp) +{ + struct ip_moptions *imo; + struct in_multi **immp; + struct in_mfilter *imfp; + size_t idx; + + if ((imo = inp->inp_moptions) != NULL) { + IMO_ADDREF(imo); /* for caller */ + return (imo); + } + + imo = ip_allocmoptions(M_WAITOK); + if (imo == NULL) + return (NULL); + + immp = _MALLOC(sizeof (*immp) * IP_MIN_MEMBERSHIPS, M_IPMOPTS, + M_WAITOK | M_ZERO); + if (immp == NULL) { + IMO_REMREF(imo); + return (NULL); + } + + imfp = _MALLOC(sizeof (struct in_mfilter) * IP_MIN_MEMBERSHIPS, + M_INMFILTER, M_WAITOK | M_ZERO); + if (imfp == NULL) { + _FREE(immp, M_IPMOPTS); + IMO_REMREF(imo); + return (NULL); + } + + imo->imo_multicast_ifp = NULL; + imo->imo_multicast_addr.s_addr = INADDR_ANY; + imo->imo_multicast_vif = -1; + imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + imo->imo_multicast_loop = in_mcast_loop; + imo->imo_num_memberships = 0; + imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; + imo->imo_membership = immp; + + /* Initialize per-group source filters. 
*/ + for (idx = 0; idx < IP_MIN_MEMBERSHIPS; idx++) + imf_init(&imfp[idx], MCAST_UNDEFINED, MCAST_EXCLUDE); + + imo->imo_mfilters = imfp; + inp->inp_moptions = imo; /* keep reference from ip_allocmoptions() */ + IMO_ADDREF(imo); /* for caller */ + + return (imo); +} +/* + * Atomically get source filters on a socket for an IPv4 multicast group. + */ +static int +inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt) +{ + struct __msfilterreq64 msfr, msfr64; + struct __msfilterreq32 msfr32; + sockunion_t *gsa; + struct ifnet *ifp; + struct ip_moptions *imo; + struct in_mfilter *imf; + struct ip_msource *ims; + struct in_msource *lims; + struct sockaddr_in *psin; + struct sockaddr_storage *ptss; + struct sockaddr_storage *tss; + int error; + size_t idx, nsrcs, ncsrcs; + user_addr_t tmp_ptr; + + imo = inp->inp_moptions; + VERIFY(imo != NULL); + + if (IS_64BIT_PROCESS(current_proc())) { + error = sooptcopyin(sopt, &msfr64, + sizeof(struct __msfilterreq64), + sizeof(struct __msfilterreq64)); + if (error) + return (error); + /* we never use msfr.msfr_srcs; */ + memcpy(&msfr, &msfr64, sizeof(msfr)); + } else { + error = sooptcopyin(sopt, &msfr32, + sizeof(struct __msfilterreq32), + sizeof(struct __msfilterreq32)); + if (error) + return (error); + /* we never use msfr.msfr_srcs; */ + memcpy(&msfr, &msfr32, sizeof(msfr)); + } + + ifnet_head_lock_shared(); + if (msfr.msfr_ifindex == 0 || (u_int)if_index < msfr.msfr_ifindex) { + ifnet_head_done(); + return (EADDRNOTAVAIL); + } + + ifp = ifindex2ifnet[msfr.msfr_ifindex]; + ifnet_head_done(); + + if (ifp == NULL) + return (EADDRNOTAVAIL); + + if (msfr.msfr_nsrcs > in_mcast_maxsocksrc) + msfr.msfr_nsrcs = in_mcast_maxsocksrc; + + IMO_LOCK(imo); + /* + * Lookup group on the socket. 
+ */ + gsa = (sockunion_t *)&msfr.msfr_group; + idx = imo_match_group(imo, ifp, &gsa->sa); + if (idx == (size_t)-1 || imo->imo_mfilters == NULL) { + IMO_UNLOCK(imo); + return (EADDRNOTAVAIL); + } + imf = &imo->imo_mfilters[idx]; + + /* + * Ignore memberships which are in limbo. + */ + if (imf->imf_st[1] == MCAST_UNDEFINED) { + IMO_UNLOCK(imo); + return (EAGAIN); + } + msfr.msfr_fmode = imf->imf_st[1]; + + /* + * If the user specified a buffer, copy out the source filter + * entries to userland gracefully. + * We only copy out the number of entries which userland + * has asked for, but we always tell userland how big the + * buffer really needs to be. + */ + + if (IS_64BIT_PROCESS(current_proc())) + tmp_ptr = msfr64.msfr_srcs; + else + tmp_ptr = CAST_USER_ADDR_T(msfr32.msfr_srcs); + + tss = NULL; + if (tmp_ptr != USER_ADDR_NULL && msfr.msfr_nsrcs > 0) { + tss = _MALLOC(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, + M_TEMP, M_WAITOK | M_ZERO); + if (tss == NULL) { + IMO_UNLOCK(imo); + return (ENOBUFS); + } + } + + /* + * Count number of sources in-mode at t0. + * If buffer space exists and remains, copy out source entries. 
+ */ + nsrcs = msfr.msfr_nsrcs; + ncsrcs = 0; + ptss = tss; + RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { + lims = (struct in_msource *)ims; + if (lims->imsl_st[0] == MCAST_UNDEFINED || + lims->imsl_st[0] != imf->imf_st[0]) + continue; + if (tss != NULL && nsrcs > 0) { + psin = (struct sockaddr_in *)ptss; + psin->sin_family = AF_INET; + psin->sin_len = sizeof(struct sockaddr_in); + psin->sin_addr.s_addr = htonl(lims->ims_haddr); + psin->sin_port = 0; + ++ptss; + --nsrcs; + ++ncsrcs; + } + } + + IMO_UNLOCK(imo); + + if (tss != NULL) { + error = copyout(tss, tmp_ptr, + sizeof(struct sockaddr_storage) * ncsrcs); + FREE(tss, M_TEMP); + if (error) + return (error); + } + + msfr.msfr_nsrcs = ncsrcs; + if (IS_64BIT_PROCESS(current_proc())) { + msfr64.msfr_ifindex = msfr.msfr_ifindex; + msfr64.msfr_fmode = msfr.msfr_fmode; + msfr64.msfr_nsrcs = msfr.msfr_nsrcs; + memcpy(&msfr64.msfr_group, &msfr.msfr_group, + sizeof(struct sockaddr_storage)); + error = sooptcopyout(sopt, &msfr64, + sizeof(struct __msfilterreq64)); + } else { + msfr32.msfr_ifindex = msfr.msfr_ifindex; + msfr32.msfr_fmode = msfr.msfr_fmode; + msfr32.msfr_nsrcs = msfr.msfr_nsrcs; + memcpy(&msfr64.msfr_group, &msfr.msfr_group, + sizeof(struct sockaddr_storage)); + error = sooptcopyout(sopt, &msfr32, + sizeof(struct __msfilterreq32)); + } + + return (error); +} + +/* + * Return the IP multicast options in response to user getsockopt(). + */ +int +inp_getmoptions(struct inpcb *inp, struct sockopt *sopt) +{ + struct ip_mreqn mreqn; + struct ip_moptions *imo; + struct ifnet *ifp; + struct in_ifaddr *ia; + int error, optval; + unsigned int ifindex; + u_char coptval; + + imo = inp->inp_moptions; + /* + * If socket is neither of type SOCK_RAW or SOCK_DGRAM, + * or is a divert socket, reject it. 
+ */ + if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || + (inp->inp_socket->so_proto->pr_type != SOCK_RAW && + inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) { + return (EOPNOTSUPP); + } + + error = 0; + switch (sopt->sopt_name) { +#ifdef MROUTING + case IP_MULTICAST_VIF: + if (imo != NULL) { + IMO_LOCK(imo); + optval = imo->imo_multicast_vif; + IMO_UNLOCK(imo); + } else + optval = -1; + error = sooptcopyout(sopt, &optval, sizeof(int)); + break; +#endif /* MROUTING */ + + case IP_MULTICAST_IF: + memset(&mreqn, 0, sizeof(struct ip_mreqn)); + if (imo != NULL) { + IMO_LOCK(imo); + ifp = imo->imo_multicast_ifp; + if (!in_nullhost(imo->imo_multicast_addr)) { + mreqn.imr_address = imo->imo_multicast_addr; + } else if (ifp != NULL) { + mreqn.imr_ifindex = ifp->if_index; + IFP_TO_IA(ifp, ia); + if (ia != NULL) { + IFA_LOCK_SPIN(&ia->ia_ifa); + mreqn.imr_address = + IA_SIN(ia)->sin_addr; + IFA_UNLOCK(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); + } + } + IMO_UNLOCK(imo); + } + if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) { + error = sooptcopyout(sopt, &mreqn, + sizeof(struct ip_mreqn)); + } else { + error = sooptcopyout(sopt, &mreqn.imr_address, + sizeof(struct in_addr)); + } + break; + + case IP_MULTICAST_IFINDEX: + if (imo != NULL) + IMO_LOCK(imo); + if (imo == NULL || imo->imo_multicast_ifp == NULL) { + ifindex = 0; + } else { + ifindex = imo->imo_multicast_ifp->if_index; + } + if (imo != NULL) + IMO_UNLOCK(imo); + error = sooptcopyout(sopt, &ifindex, sizeof (ifindex)); + break; + + case IP_MULTICAST_TTL: + if (imo == NULL) + optval = coptval = IP_DEFAULT_MULTICAST_TTL; + else { + IMO_LOCK(imo); + optval = coptval = imo->imo_multicast_ttl; + IMO_UNLOCK(imo); + } + if (sopt->sopt_valsize == sizeof(u_char)) + error = sooptcopyout(sopt, &coptval, sizeof(u_char)); + else + error = sooptcopyout(sopt, &optval, sizeof(int)); + break; + + case IP_MULTICAST_LOOP: + if (imo == 0) + optval = coptval = IP_DEFAULT_MULTICAST_LOOP; + else { + IMO_LOCK(imo); + 
optval = coptval = imo->imo_multicast_loop; + IMO_UNLOCK(imo); + } + if (sopt->sopt_valsize == sizeof(u_char)) + error = sooptcopyout(sopt, &coptval, sizeof(u_char)); + else + error = sooptcopyout(sopt, &optval, sizeof(int)); + break; + + case IP_MSFILTER: + if (imo == NULL) { + error = EADDRNOTAVAIL; + } else { + error = inp_get_source_filters(inp, sopt); + } + break; + + default: + error = ENOPROTOOPT; + break; + } + + return (error); +} + +/* + * Look up the ifnet to use for a multicast group membership, + * given the IPv4 address of an interface, and the IPv4 group address. + * + * This routine exists to support legacy multicast applications + * which do not understand that multicast memberships are scoped to + * specific physical links in the networking stack, or which need + * to join link-scope groups before IPv4 addresses are configured. + * + * If inp is non-NULL and is bound to an interface, use this socket's + * inp_boundif for any required routing table lookup. + * + * If the route lookup fails, attempt to use the first non-loopback + * interface with multicast capability in the system as a + * last resort. The legacy IPv4 ASM API requires that we do + * this in order to allow groups to be joined when the routing + * table has not yet been populated during boot. + * + * Returns NULL if no ifp could be found. 
+ *
+ */
+static struct ifnet *
+inp_lookup_mcast_ifp(const struct inpcb *inp,
+    const struct sockaddr_in *gsin, const struct in_addr ina)
+{
+	struct route mroute;
+	struct in_ifaddr *ifa_iter;
+	struct ifnet *ifp = NULL;
+	unsigned int scope = IFSCOPE_NONE;
+	unsigned int ifindex = 0;
+
+	VERIFY(gsin->sin_family == AF_INET);
+	VERIFY(IN_MULTICAST(ntohl(gsin->sin_addr.s_addr)));
+
+	/* An interface address was supplied: resolve it directly. */
+	if (!in_nullhost(ina)) {
+		struct in_addr ina_copy;
+
+		/* ina is const; ip_multicast_if() is given a writable copy. */
+		memcpy(&ina_copy, &ina, sizeof(struct in_addr));
+		return (ip_multicast_if(&ina_copy, &ifindex));
+	}
+
+	/* No address given: route towards the group, scoped if bound. */
+	if (inp != NULL && (inp->inp_flags & INP_BOUND_IF))
+		scope = inp->inp_boundif;
+
+	bzero(&mroute, sizeof (mroute));
+	memcpy(&mroute.ro_dst, gsin, sizeof(struct sockaddr_in));
+	rtalloc_scoped_ign(&mroute, 0, scope);
+	if (mroute.ro_rt != NULL) {
+		ifp = mroute.ro_rt->rt_ifp;
+		VERIFY(ifp != NULL);
+		rtfree(mroute.ro_rt);
+		return (ifp);
+	}
+
+	/*
+	 * No route: fall back to the first address in in_ifaddrhead whose
+	 * interface is multicast-capable and not a loopback.
+	 */
+	lck_rw_lock_shared(in_ifaddr_rwlock);
+	TAILQ_FOREACH(ifa_iter, &in_ifaddrhead, ia_link) {
+		struct ifnet *cand;
+
+		IFA_LOCK_SPIN(&ifa_iter->ia_ifa);
+		cand = ifa_iter->ia_ifp;
+		IFA_UNLOCK(&ifa_iter->ia_ifa);
+		if ((cand->if_flags & IFF_LOOPBACK) ||
+		    !(cand->if_flags & IFF_MULTICAST))
+			continue;
+		ifp = cand;
+		break;
+	}
+	lck_rw_done(in_ifaddr_rwlock);
+
+	return (ifp);
+}
+
+/*
+ * Join an IPv4 multicast group, possibly with a source.
+ *
+ * NB: sopt->sopt_val might point to the kernel address space. This means that
+ * we were called by the IPv6 stack due to the presence of an IPv6 v4 mapped
+ * address. In this scenario, sopt_p points to kernproc and sooptcopyin() will
+ * just issue an in-kernel memcpy.
+ */ +int +inp_join_group(struct inpcb *inp, struct sockopt *sopt) +{ + struct group_source_req gsr; + sockunion_t *gsa, *ssa; + struct ifnet *ifp; + struct in_mfilter *imf; + struct ip_moptions *imo; + struct in_multi *inm = NULL; + struct in_msource *lims; + size_t idx; + int error, is_new; + + ifp = NULL; + imf = NULL; + error = 0; + is_new = 0; + + memset(&gsr, 0, sizeof(struct group_source_req)); + gsa = (sockunion_t *)&gsr.gsr_group; + gsa->ss.ss_family = AF_UNSPEC; + ssa = (sockunion_t *)&gsr.gsr_source; + ssa->ss.ss_family = AF_UNSPEC; + + switch (sopt->sopt_name) { + case IP_ADD_MEMBERSHIP: + case IP_ADD_SOURCE_MEMBERSHIP: { + struct ip_mreq_source mreqs; + + if (sopt->sopt_name == IP_ADD_MEMBERSHIP) { + error = sooptcopyin(sopt, &mreqs, + sizeof(struct ip_mreq), + sizeof(struct ip_mreq)); + /* + * Do argument switcharoo from ip_mreq into + * ip_mreq_source to avoid using two instances. + */ + mreqs.imr_interface = mreqs.imr_sourceaddr; + mreqs.imr_sourceaddr.s_addr = INADDR_ANY; + } else if (sopt->sopt_name == IP_ADD_SOURCE_MEMBERSHIP) { + error = sooptcopyin(sopt, &mreqs, + sizeof(struct ip_mreq_source), + sizeof(struct ip_mreq_source)); + } + if (error) { + IGMP_PRINTF(("%s: error copyin IP_ADD_MEMBERSHIP/" + "IP_ADD_SOURCE_MEMBERSHIP %d err=%d\n", + __func__, sopt->sopt_name, error)); + return (error); + } + + gsa->sin.sin_family = AF_INET; + gsa->sin.sin_len = sizeof(struct sockaddr_in); + gsa->sin.sin_addr = mreqs.imr_multiaddr; + + if (sopt->sopt_name == IP_ADD_SOURCE_MEMBERSHIP) { + ssa->sin.sin_family = AF_INET; + ssa->sin.sin_len = sizeof(struct sockaddr_in); + ssa->sin.sin_addr = mreqs.imr_sourceaddr; + } + + if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) + return (EINVAL); + + ifp = inp_lookup_mcast_ifp(inp, &gsa->sin, + mreqs.imr_interface); + IGMP_PRINTF(("%s: imr_interface = %s, ifp = %p\n", + __func__, inet_ntoa(mreqs.imr_interface), ifp)); + break; + } + + case MCAST_JOIN_GROUP: + case MCAST_JOIN_SOURCE_GROUP: + if (sopt->sopt_name == 
MCAST_JOIN_GROUP) { + error = sooptcopyin(sopt, &gsr, + sizeof(struct group_req), + sizeof(struct group_req)); + } else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { + error = sooptcopyin(sopt, &gsr, + sizeof(struct group_source_req), + sizeof(struct group_source_req)); + } + if (error) + return (error); + + if (gsa->sin.sin_family != AF_INET || + gsa->sin.sin_len != sizeof(struct sockaddr_in)) + return (EINVAL); + + /* + * Overwrite the port field if present, as the sockaddr + * being copied in may be matched with a binary comparison. + */ + gsa->sin.sin_port = 0; + if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { + if (ssa->sin.sin_family != AF_INET || + ssa->sin.sin_len != sizeof(struct sockaddr_in)) + return (EINVAL); + ssa->sin.sin_port = 0; + } + + if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) + return (EINVAL); + + ifnet_head_lock_shared(); + if (gsr.gsr_interface == 0 || + (u_int)if_index < gsr.gsr_interface) { + ifnet_head_done(); + return (EADDRNOTAVAIL); + } + ifp = ifindex2ifnet[gsr.gsr_interface]; + ifnet_head_done(); + + break; + + default: + IGMP_PRINTF(("%s: unknown sopt_name %d\n", + __func__, sopt->sopt_name)); + return (EOPNOTSUPP); + break; + } + + if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) + return (EADDRNOTAVAIL); + + imo = inp_findmoptions(inp); + if (imo == NULL) + return (ENOMEM); + + IMO_LOCK(imo); + idx = imo_match_group(imo, ifp, &gsa->sa); + if (idx == (size_t)-1) { + is_new = 1; + } else { + inm = imo->imo_membership[idx]; + imf = &imo->imo_mfilters[idx]; + if (ssa->ss.ss_family != AF_UNSPEC) { + /* + * MCAST_JOIN_SOURCE_GROUP on an exclusive membership + * is an error. On an existing inclusive membership, + * it just adds the source to the filter list. + */ + if (imf->imf_st[1] != MCAST_INCLUDE) { + error = EINVAL; + goto out_imo_locked; + } + /* + * Throw out duplicates. 
+ * + * XXX FIXME: This makes a naive assumption that + * even if entries exist for *ssa in this imf, + * they will be rejected as dupes, even if they + * are not valid in the current mode (in-mode). + * + * in_msource is transactioned just as for anything + * else in SSM -- but note naive use of inm_graft() + * below for allocating new filter entries. + * + * This is only an issue if someone mixes the + * full-state SSM API with the delta-based API, + * which is discouraged in the relevant RFCs. + */ + lims = imo_match_source(imo, idx, &ssa->sa); + if (lims != NULL /*&& + lims->imsl_st[1] == MCAST_INCLUDE*/) { + error = EADDRNOTAVAIL; + goto out_imo_locked; + } + } else { + /* + * MCAST_JOIN_GROUP on an existing exclusive + * membership is an error; return EADDRINUSE + * to preserve 4.4BSD API idempotence, and + * avoid tedious detour to code below. + * NOTE: This is bending RFC 3678 a bit. + * + * On an existing inclusive membership, this is also + * an error; if you want to change filter mode, + * you must use the userland API setsourcefilter(). + * XXX We don't reject this for imf in UNDEFINED + * state at t1, because allocation of a filter + * is atomic with allocation of a membership. + */ + error = EINVAL; + /* See comments above for EADDRINUSE */ + if (imf->imf_st[1] == MCAST_EXCLUDE) + error = EADDRINUSE; + goto out_imo_locked; + } + } + + /* + * Begin state merge transaction at socket layer. + */ + + if (is_new) { + if (imo->imo_num_memberships == imo->imo_max_memberships) { + error = imo_grow(imo, 0); + if (error) + goto out_imo_locked; + } + /* + * Allocate the new slot upfront so we can deal with + * grafting the new source filter in same code path + * as for join-source on existing membership. 
+ */ + idx = imo->imo_num_memberships; + imo->imo_membership[idx] = NULL; + imo->imo_num_memberships++; + VERIFY(imo->imo_mfilters != NULL); + imf = &imo->imo_mfilters[idx]; + VERIFY(RB_EMPTY(&imf->imf_sources)); + } + + /* + * Graft new source into filter list for this inpcb's + * membership of the group. The in_multi may not have + * been allocated yet if this is a new membership, however, + * the in_mfilter slot will be allocated and must be initialized. + */ + if (ssa->ss.ss_family != AF_UNSPEC) { + /* Membership starts in IN mode */ + if (is_new) { + IGMP_PRINTF(("%s: new join w/source\n", __func__)); + imf_init(imf, MCAST_UNDEFINED, MCAST_INCLUDE); + } else { + IGMP_PRINTF(("%s: %s source\n", __func__, "allow")); + } + lims = imf_graft(imf, MCAST_INCLUDE, &ssa->sin); + if (lims == NULL) { + IGMP_PRINTF(("%s: merge imf state failed\n", + __func__)); + error = ENOMEM; + goto out_imo_free; + } + } else { + /* No address specified; Membership starts in EX mode */ + if (is_new) { + IGMP_PRINTF(("%s: new join w/o source\n", __func__)); + imf_init(imf, MCAST_UNDEFINED, MCAST_EXCLUDE); + } + } + + /* + * Begin state merge transaction at IGMP layer. 
+ */ + + if (is_new) { + VERIFY(inm == NULL); + error = in_joingroup(ifp, &gsa->sin.sin_addr, imf, &inm); + VERIFY(inm != NULL || error != 0); + if (error) + goto out_imo_free; + imo->imo_membership[idx] = inm; /* from in_joingroup() */ + } else { + IGMP_PRINTF(("%s: merge inm state\n", __func__)); + INM_LOCK(inm); + error = inm_merge(inm, imf); + if (error) { + IGMP_PRINTF(("%s: failed to merge inm state\n", + __func__)); + INM_UNLOCK(inm); + goto out_imf_rollback; + } + IGMP_PRINTF(("%s: doing igmp downcall\n", __func__)); + error = igmp_change_state(inm); + INM_UNLOCK(inm); + if (error) { + IGMP_PRINTF(("%s: failed igmp downcall\n", + __func__)); + goto out_imf_rollback; + } + } + +out_imf_rollback: + if (error) { + imf_rollback(imf); + if (is_new) + imf_purge(imf); + else + imf_reap(imf); + } else { + imf_commit(imf); + } + +out_imo_free: + if (error && is_new) { + VERIFY(inm == NULL); + imo->imo_membership[idx] = NULL; + --imo->imo_num_memberships; + } + +out_imo_locked: + IMO_UNLOCK(imo); + IMO_REMREF(imo); /* from inp_findmoptions() */ + return (error); +} + +/* + * Leave an IPv4 multicast group on an inpcb, possibly with a source. + * + * NB: sopt->sopt_val might point to the kernel address space. Refer to the + * block comment on top of inp_join_group() for more information. 
+ */ +int +inp_leave_group(struct inpcb *inp, struct sockopt *sopt) +{ + struct group_source_req gsr; + struct ip_mreq_source mreqs; + sockunion_t *gsa, *ssa; + struct ifnet *ifp; + struct in_mfilter *imf; + struct ip_moptions *imo; + struct in_msource *ims; + struct in_multi *inm = NULL; + size_t idx; + int error, is_final; + unsigned int ifindex = 0; + + ifp = NULL; + error = 0; + is_final = 1; + + memset(&gsr, 0, sizeof(struct group_source_req)); + gsa = (sockunion_t *)&gsr.gsr_group; + gsa->ss.ss_family = AF_UNSPEC; + ssa = (sockunion_t *)&gsr.gsr_source; + ssa->ss.ss_family = AF_UNSPEC; + + switch (sopt->sopt_name) { + case IP_DROP_MEMBERSHIP: + case IP_DROP_SOURCE_MEMBERSHIP: + if (sopt->sopt_name == IP_DROP_MEMBERSHIP) { + error = sooptcopyin(sopt, &mreqs, + sizeof(struct ip_mreq), + sizeof(struct ip_mreq)); + /* + * Swap interface and sourceaddr arguments, + * as ip_mreq and ip_mreq_source are laid + * out differently. + */ + mreqs.imr_interface = mreqs.imr_sourceaddr; + mreqs.imr_sourceaddr.s_addr = INADDR_ANY; + } else if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) { + error = sooptcopyin(sopt, &mreqs, + sizeof(struct ip_mreq_source), + sizeof(struct ip_mreq_source)); + } + if (error) + return (error); + + gsa->sin.sin_family = AF_INET; + gsa->sin.sin_len = sizeof(struct sockaddr_in); + gsa->sin.sin_addr = mreqs.imr_multiaddr; + + if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) { + ssa->sin.sin_family = AF_INET; + ssa->sin.sin_len = sizeof(struct sockaddr_in); + ssa->sin.sin_addr = mreqs.imr_sourceaddr; + } + /* + * Attempt to look up hinted ifp from interface address. + * Fallthrough with null ifp iff lookup fails, to + * preserve 4.4BSD mcast API idempotence. + * XXX NOTE WELL: The RFC 3678 API is preferred because + * using an IPv4 address as a key is racy. 
+ */ + if (!in_nullhost(mreqs.imr_interface)) + ifp = ip_multicast_if(&mreqs.imr_interface, &ifindex); + + IGMP_PRINTF(("%s: imr_interface = %s, ifp = %p\n", + __func__, inet_ntoa(mreqs.imr_interface), ifp)); + + break; + + case MCAST_LEAVE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + if (sopt->sopt_name == MCAST_LEAVE_GROUP) { + error = sooptcopyin(sopt, &gsr, + sizeof(struct group_req), + sizeof(struct group_req)); + } else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { + error = sooptcopyin(sopt, &gsr, + sizeof(struct group_source_req), + sizeof(struct group_source_req)); + } + if (error) + return (error); + + if (gsa->sin.sin_family != AF_INET || + gsa->sin.sin_len != sizeof(struct sockaddr_in)) + return (EINVAL); + + if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { + if (ssa->sin.sin_family != AF_INET || + ssa->sin.sin_len != sizeof(struct sockaddr_in)) + return (EINVAL); + } + + ifnet_head_lock_shared(); + if (gsr.gsr_interface == 0 || + (u_int)if_index < gsr.gsr_interface) { + ifnet_head_done(); + return (EADDRNOTAVAIL); + } + + ifp = ifindex2ifnet[gsr.gsr_interface]; + ifnet_head_done(); + break; + + default: + IGMP_PRINTF(("%s: unknown sopt_name %d\n", + __func__, sopt->sopt_name)); + return (EOPNOTSUPP); + break; + } + + if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) + return (EINVAL); + + /* + * Find the membership in the membership array. + */ + imo = inp_findmoptions(inp); + if (imo == NULL) + return (ENOMEM); + + IMO_LOCK(imo); + idx = imo_match_group(imo, ifp, &gsa->sa); + if (idx == (size_t)-1) { + error = EADDRNOTAVAIL; + goto out_locked; + } + inm = imo->imo_membership[idx]; + imf = &imo->imo_mfilters[idx]; + + if (ssa->ss.ss_family != AF_UNSPEC) { + IGMP_PRINTF(("%s: opt=%d is_final=0\n", __func__, + sopt->sopt_name)); + is_final = 0; + } + + /* + * Begin state merge transaction at socket layer. + */ + + /* + * If we were instructed only to leave a given source, do so. + * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships. 
+ */ + if (is_final) { + imf_leave(imf); + } else { + if (imf->imf_st[0] == MCAST_EXCLUDE) { + error = EADDRNOTAVAIL; + goto out_locked; + } + ims = imo_match_source(imo, idx, &ssa->sa); + if (ims == NULL) { + IGMP_PRINTF(("%s: source %s %spresent\n", __func__, + inet_ntoa(ssa->sin.sin_addr), "not ")); + error = EADDRNOTAVAIL; + goto out_locked; + } + IGMP_PRINTF(("%s: %s source\n", __func__, "block")); + error = imf_prune(imf, &ssa->sin); + if (error) { + IGMP_PRINTF(("%s: merge imf state failed\n", + __func__)); + goto out_locked; + } + } + + /* + * Begin state merge transaction at IGMP layer. + */ + + if (is_final) { + /* + * Give up the multicast address record to which + * the membership points. Reference held in imo + * will be released below. + */ + (void) in_leavegroup(inm, imf); + } else { + IGMP_PRINTF(("%s: merge inm state\n", __func__)); + INM_LOCK(inm); + error = inm_merge(inm, imf); + if (error) { + IGMP_PRINTF(("%s: failed to merge inm state\n", + __func__)); + INM_UNLOCK(inm); + goto out_imf_rollback; + } + + IGMP_PRINTF(("%s: doing igmp downcall\n", __func__)); + error = igmp_change_state(inm); + if (error) { + IGMP_PRINTF(("%s: failed igmp downcall\n", __func__)); + } + INM_UNLOCK(inm); + } + +out_imf_rollback: + if (error) + imf_rollback(imf); + else + imf_commit(imf); + + imf_reap(imf); + + if (is_final) { + /* Remove the gap in the membership and filter array. */ + VERIFY(inm == imo->imo_membership[idx]); + imo->imo_membership[idx] = NULL; + INM_REMREF(inm); + for (++idx; idx < imo->imo_num_memberships; ++idx) { + imo->imo_membership[idx-1] = imo->imo_membership[idx]; + imo->imo_mfilters[idx-1] = imo->imo_mfilters[idx]; + } + imo->imo_num_memberships--; + } + +out_locked: + IMO_UNLOCK(imo); + IMO_REMREF(imo); /* from inp_findmoptions() */ + return (error); +} + +/* + * Select the interface for transmitting IPv4 multicast datagrams. 
+ * + * Either an instance of struct in_addr or an instance of struct ip_mreqn + * may be passed to this socket option. An address of INADDR_ANY or an + * interface index of 0 is used to remove a previous selection. + * When no interface is selected, one is chosen for every send. + */ +static int +inp_set_multicast_if(struct inpcb *inp, struct sockopt *sopt) +{ + struct in_addr addr; + struct ip_mreqn mreqn; + struct ifnet *ifp; + struct ip_moptions *imo; + int error = 0 ; + unsigned int ifindex = 0; + + if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) { + /* + * An interface index was specified using the + * Linux-derived ip_mreqn structure. + */ + error = sooptcopyin(sopt, &mreqn, sizeof(struct ip_mreqn), + sizeof(struct ip_mreqn)); + if (error) + return (error); + + ifnet_head_lock_shared(); + if (mreqn.imr_ifindex < 0 || if_index < mreqn.imr_ifindex) { + ifnet_head_done(); + return (EINVAL); + } + + if (mreqn.imr_ifindex == 0) { + ifp = NULL; + } else { + ifp = ifindex2ifnet[mreqn.imr_ifindex]; + if (ifp == NULL) { + ifnet_head_done(); + return (EADDRNOTAVAIL); + } + } + ifnet_head_done(); + } else { + /* + * An interface was specified by IPv4 address. + * This is the traditional BSD usage. + */ + error = sooptcopyin(sopt, &addr, sizeof(struct in_addr), + sizeof(struct in_addr)); + if (error) + return (error); + if (in_nullhost(addr)) { + ifp = NULL; + } else { + ifp = ip_multicast_if(&addr, &ifindex); + if (ifp == NULL) { + IGMP_PRINTF(("%s: can't find ifp for addr=%s\n", + __func__, inet_ntoa(addr))); + return (EADDRNOTAVAIL); + } + } +#ifdef IGMP_DEBUG0 + IGMP_PRINTF(("%s: ifp = %p, addr = %s\n", __func__, ifp, + inet_ntoa(addr))); +#endif + } + + /* Reject interfaces which do not support multicast. 
*/ + if (ifp != NULL && (ifp->if_flags & IFF_MULTICAST) == 0) + return (EOPNOTSUPP); + + imo = inp_findmoptions(inp); + if (imo == NULL) + return (ENOMEM); + + IMO_LOCK(imo); + imo->imo_multicast_ifp = ifp; + if (ifindex) + imo->imo_multicast_addr = addr; + else + imo->imo_multicast_addr.s_addr = INADDR_ANY; + IMO_UNLOCK(imo); + IMO_REMREF(imo); /* from inp_findmoptions() */ + + return (0); +} + +/* + * Atomically set source filters on a socket for an IPv4 multicast group. + */ +static int +inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) +{ + struct __msfilterreq64 msfr, msfr64; + struct __msfilterreq32 msfr32; + sockunion_t *gsa; + struct ifnet *ifp; + struct in_mfilter *imf; + struct ip_moptions *imo; + struct in_multi *inm; + size_t idx; + int error; + user_addr_t tmp_ptr; + + if (IS_64BIT_PROCESS(current_proc())) { + error = sooptcopyin(sopt, &msfr64, + sizeof(struct __msfilterreq64), + sizeof(struct __msfilterreq64)); + if (error) + return (error); + /* we never use msfr.msfr_srcs; */ + memcpy(&msfr, &msfr64, sizeof(msfr)); + } else { + error = sooptcopyin(sopt, &msfr32, + sizeof(struct __msfilterreq32), + sizeof(struct __msfilterreq32)); + if (error) + return (error); + /* we never use msfr.msfr_srcs; */ + memcpy(&msfr, &msfr32, sizeof(msfr)); + } + + if (msfr.msfr_nsrcs > in_mcast_maxsocksrc) + return (ENOBUFS); + + if ((msfr.msfr_fmode != MCAST_EXCLUDE && + msfr.msfr_fmode != MCAST_INCLUDE)) + return (EINVAL); + + if (msfr.msfr_group.ss_family != AF_INET || + msfr.msfr_group.ss_len != sizeof(struct sockaddr_in)) + return (EINVAL); + + gsa = (sockunion_t *)&msfr.msfr_group; + if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) + return (EINVAL); + + gsa->sin.sin_port = 0; /* ignore port */ + + ifnet_head_lock_shared(); + if (msfr.msfr_ifindex == 0 || (u_int)if_index < msfr.msfr_ifindex) { + ifnet_head_done(); + return (EADDRNOTAVAIL); + } + + ifp = ifindex2ifnet[msfr.msfr_ifindex]; + ifnet_head_done(); + if (ifp == NULL) + return 
(EADDRNOTAVAIL); + + /* + * Check if this socket is a member of this group. + */ + imo = inp_findmoptions(inp); + if (imo == NULL) + return (ENOMEM); + + IMO_LOCK(imo); + idx = imo_match_group(imo, ifp, &gsa->sa); + if (idx == (size_t)-1 || imo->imo_mfilters == NULL) { + error = EADDRNOTAVAIL; + goto out_imo_locked; + } + inm = imo->imo_membership[idx]; + imf = &imo->imo_mfilters[idx]; + + /* + * Begin state merge transaction at socket layer. + */ + + imf->imf_st[1] = msfr.msfr_fmode; + + /* + * Apply any new source filters, if present. + * Make a copy of the user-space source vector so + * that we may copy them with a single copyin. This + * allows us to deal with page faults up-front. + */ + if (msfr.msfr_nsrcs > 0) { + struct in_msource *lims; + struct sockaddr_in *psin; + struct sockaddr_storage *kss, *pkss; + int i; + + if (IS_64BIT_PROCESS(current_proc())) + tmp_ptr = msfr64.msfr_srcs; + else + tmp_ptr = CAST_USER_ADDR_T(msfr32.msfr_srcs); + + IGMP_PRINTF(("%s: loading %lu source list entries\n", + __func__, (unsigned long)msfr.msfr_nsrcs)); + kss = _MALLOC(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, + M_TEMP, M_WAITOK); + if (kss == NULL) { + error = ENOMEM; + goto out_imo_locked; + } + error = copyin(tmp_ptr, kss, + sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); + if (error) { + FREE(kss, M_TEMP); + goto out_imo_locked; + } + + /* + * Mark all source filters as UNDEFINED at t1. + * Restore new group filter mode, as imf_leave() + * will set it to INCLUDE. + */ + imf_leave(imf); + imf->imf_st[1] = msfr.msfr_fmode; + + /* + * Update socket layer filters at t1, lazy-allocating + * new entries. This saves a bunch of memory at the + * cost of one RB_FIND() per source entry; duplicate + * entries in the msfr_nsrcs vector are ignored. + * If we encounter an error, rollback transaction. + * + * XXX This too could be replaced with a set-symmetric + * difference like loop to avoid walking from root + * every time, as the key space is common. 
+ */ + for (i = 0, pkss = kss; (u_int)i < msfr.msfr_nsrcs; + i++, pkss++) { + psin = (struct sockaddr_in *)pkss; + if (psin->sin_family != AF_INET) { + error = EAFNOSUPPORT; + break; + } + if (psin->sin_len != sizeof(struct sockaddr_in)) { + error = EINVAL; + break; + } + error = imf_get_source(imf, psin, &lims); + if (error) + break; + lims->imsl_st[1] = imf->imf_st[1]; + } + FREE(kss, M_TEMP); + } + + if (error) + goto out_imf_rollback; + + /* + * Begin state merge transaction at IGMP layer. + */ + INM_LOCK(inm); + IGMP_PRINTF(("%s: merge inm state\n", __func__)); + error = inm_merge(inm, imf); + if (error) { + IGMP_PRINTF(("%s: failed to merge inm state\n", __func__)); + INM_UNLOCK(inm); + goto out_imf_rollback; + } + + IGMP_PRINTF(("%s: doing igmp downcall\n", __func__)); + error = igmp_change_state(inm); + INM_UNLOCK(inm); +#ifdef IGMP_DEBUG + if (error) + IGMP_PRINTF(("%s: failed igmp downcall\n", __func__)); +#endif + +out_imf_rollback: + if (error) + imf_rollback(imf); + else + imf_commit(imf); + + imf_reap(imf); + +out_imo_locked: + IMO_UNLOCK(imo); + IMO_REMREF(imo); /* from inp_findmoptions() */ + + return (error); +} + +/* + * Set the IP multicast options in response to user setsockopt(). + * + * Many of the socket options handled in this function duplicate the + * functionality of socket options in the regular unicast API. However, + * it is not possible to merge the duplicate code, because the idempotence + * of the IPv4 multicast part of the BSD Sockets API must be preserved; + * the effects of these options must be treated as separate and distinct. + * + * FUTURE: The IP_MULTICAST_VIF option may be eliminated if MROUTING + * is refactored to no longer use vifs. + */ +int +inp_setmoptions(struct inpcb *inp, struct sockopt *sopt) +{ + struct ip_moptions *imo; + int error; + unsigned int ifindex; + struct ifnet *ifp; + + error = 0; + + /* + * If socket is neither of type SOCK_RAW or SOCK_DGRAM, + * or is a divert socket, reject it. 
+ */ + if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || + (inp->inp_socket->so_proto->pr_type != SOCK_RAW && + inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) + return (EOPNOTSUPP); + + switch (sopt->sopt_name) { +#if MROUTING + case IP_MULTICAST_VIF: { + int vifi; + /* + * Select a multicast VIF for transmission. + * Only useful if multicast forwarding is active. + */ + if (legal_vif_num == NULL) { + error = EOPNOTSUPP; + break; + } + error = sooptcopyin(sopt, &vifi, sizeof(int), sizeof(int)); + if (error) + break; + if (!legal_vif_num(vifi) && (vifi != -1)) { + error = EINVAL; + break; + } + imo = inp_findmoptions(inp); + if (imo == NULL) { + error = ENOMEM; + break; + } + IMO_LOCK(imo); + imo->imo_multicast_vif = vifi; + IMO_UNLOCK(imo); + IMO_REMREF(imo); /* from inp_findmoptions() */ + break; + } +#endif + case IP_MULTICAST_IF: + error = inp_set_multicast_if(inp, sopt); + break; + + case IP_MULTICAST_IFINDEX: + /* + * Select the interface for outgoing multicast packets. + */ + error = sooptcopyin(sopt, &ifindex, sizeof (ifindex), + sizeof (ifindex)); + if (error) + break; + + imo = inp_findmoptions(inp); + if (imo == NULL) { + error = ENOMEM; + break; + } + /* + * Index 0 is used to remove a previous selection. + * When no interface is selected, a default one is + * chosen every time a multicast packet is sent. 
+ */ + if (ifindex == 0) { + IMO_LOCK(imo); + imo->imo_multicast_ifp = NULL; + IMO_UNLOCK(imo); + IMO_REMREF(imo); /* from inp_findmoptions() */ + break; + } + + ifnet_head_lock_shared(); + /* Don't need to check is ifindex is < 0 since it's unsigned */ + if ((unsigned int)if_index < ifindex) { + ifnet_head_done(); + IMO_REMREF(imo); /* from inp_findmoptions() */ + error = ENXIO; /* per IPV6_MULTICAST_IF */ + break; + } + ifp = ifindex2ifnet[ifindex]; + ifnet_head_done(); + + /* If it's detached or isn't a multicast interface, bail out */ + if (ifp == NULL || !(ifp->if_flags & IFF_MULTICAST)) { + IMO_REMREF(imo); /* from inp_findmoptions() */ + error = EADDRNOTAVAIL; + break; + } + IMO_LOCK(imo); + imo->imo_multicast_ifp = ifp; + /* + * Clear out any remnants of past IP_MULTICAST_IF. The addr + * isn't really used anywhere in the kernel; we could have + * iterated thru the addresses of the interface and pick one + * here, but that is redundant since ip_getmoptions() already + * takes care of that for INADDR_ANY. + */ + imo->imo_multicast_addr.s_addr = INADDR_ANY; + IMO_UNLOCK(imo); + IMO_REMREF(imo); /* from inp_findmoptions() */ + break; + + case IP_MULTICAST_TTL: { + u_char ttl; + + /* + * Set the IP time-to-live for outgoing multicast packets. + * The original multicast API required a char argument, + * which is inconsistent with the rest of the socket API. + * We allow either a char or an int. 
+ */ + if (sopt->sopt_valsize == sizeof(u_char)) { + error = sooptcopyin(sopt, &ttl, sizeof(u_char), + sizeof(u_char)); + if (error) + break; + } else { + u_int ittl; + + error = sooptcopyin(sopt, &ittl, sizeof(u_int), + sizeof(u_int)); + if (error) + break; + if (ittl > 255) { + error = EINVAL; + break; + } + ttl = (u_char)ittl; + } + imo = inp_findmoptions(inp); + if (imo == NULL) { + error = ENOMEM; + break; + } + IMO_LOCK(imo); + imo->imo_multicast_ttl = ttl; + IMO_UNLOCK(imo); + IMO_REMREF(imo); /* from inp_findmoptions() */ + break; + } + + case IP_MULTICAST_LOOP: { + u_char loop; + + /* + * Set the loopback flag for outgoing multicast packets. + * Must be zero or one. The original multicast API required a + * char argument, which is inconsistent with the rest + * of the socket API. We allow either a char or an int. + */ + if (sopt->sopt_valsize == sizeof(u_char)) { + error = sooptcopyin(sopt, &loop, sizeof(u_char), + sizeof(u_char)); + if (error) + break; + } else { + u_int iloop; + + error = sooptcopyin(sopt, &iloop, sizeof(u_int), + sizeof(u_int)); + if (error) + break; + loop = (u_char)iloop; + } + imo = inp_findmoptions(inp); + if (imo == NULL) { + error = ENOMEM; + break; + } + IMO_LOCK(imo); + imo->imo_multicast_loop = !!loop; + IMO_UNLOCK(imo); + IMO_REMREF(imo); /* from inp_findmoptions() */ + break; + } + + case IP_ADD_MEMBERSHIP: + case IP_ADD_SOURCE_MEMBERSHIP: + case MCAST_JOIN_GROUP: + case MCAST_JOIN_SOURCE_GROUP: + error = inp_join_group(inp, sopt); + break; + + case IP_DROP_MEMBERSHIP: + case IP_DROP_SOURCE_MEMBERSHIP: + case MCAST_LEAVE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + error = inp_leave_group(inp, sopt); + break; + + case IP_BLOCK_SOURCE: + case IP_UNBLOCK_SOURCE: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + error = inp_block_unblock_source(inp, sopt); + break; + + case IP_MSFILTER: + error = inp_set_source_filters(inp, sopt); + break; + + default: + error = EOPNOTSUPP; + break; + } + + return (error); +} + +/* + * 
Expose IGMP's multicast filter mode and source list(s) to userland, + * keyed by (ifindex, group). + * The filter mode is written out as a uint32_t, followed by + * 0..n of struct in_addr. + * For use by ifmcstat(8). + */ +static int +sysctl_ip_mcast_filters SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp) + + struct in_addr src, group; + struct ifnet *ifp; + struct in_multi *inm; + struct in_multistep step; + struct ip_msource *ims; + int *name; + int retval = 0; + u_int namelen; + uint32_t fmode, ifindex; + + name = (int *)arg1; + namelen = (u_int)arg2; + + if (req->newptr != USER_ADDR_NULL) + return (EPERM); + + if (namelen != 2) + return (EINVAL); + + ifindex = name[0]; + ifnet_head_lock_shared(); + if (ifindex <= 0 || ifindex > (u_int)if_index) { + IGMP_PRINTF(("%s: ifindex %u out of range\n", + __func__, ifindex)); + ifnet_head_done(); + return (ENOENT); + } + + group.s_addr = name[1]; + if (!IN_MULTICAST(ntohl(group.s_addr))) { + IGMP_PRINTF(("%s: group %s is not multicast\n", + __func__, inet_ntoa(group))); + ifnet_head_done(); + return (EINVAL); + } + + ifp = ifindex2ifnet[ifindex]; + ifnet_head_done(); + if (ifp == NULL) { + IGMP_PRINTF(("%s: no ifp for ifindex %u\n", __func__, ifindex)); + return (ENOENT); + } + + in_multihead_lock_shared(); + IN_FIRST_MULTI(step, inm); + while (inm != NULL) { + INM_LOCK(inm); + if (inm->inm_ifp != ifp) + goto next; + + if (!in_hosteq(inm->inm_addr, group)) + goto next; + + fmode = inm->inm_st[1].iss_fmode; + retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t)); + if (retval != 0) { + INM_UNLOCK(inm); + break; /* abort */ + } + RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { +#ifdef IGMP_DEBUG + struct in_addr ina; + ina.s_addr = htonl(ims->ims_haddr); + IGMP_PRINTF(("%s: visit node %s\n", __func__, + inet_ntoa(ina))); +#endif + /* + * Only copy-out sources which are in-mode. 
+ */ + if (fmode != ims_get_mode(inm, ims, 1)) { + IGMP_PRINTF(("%s: skip non-in-mode\n", + __func__)); + continue; /* process next source */ + } + src.s_addr = htonl(ims->ims_haddr); + retval = SYSCTL_OUT(req, &src, sizeof(struct in_addr)); + if (retval != 0) + break; /* process next inm */ + } +next: + INM_UNLOCK(inm); + IN_NEXT_MULTI(step, inm); + } + in_multihead_lock_done(); + + return (retval); +} + +/* + * XXX + * The whole multicast option thing needs to be re-thought. + * Several of these options are equally applicable to non-multicast + * transmission, and one (IP_MULTICAST_TTL) totally duplicates a + * standard option (IP_TTL). + */ +/* + * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index. + */ +static struct ifnet * +ip_multicast_if(struct in_addr *a, unsigned int *ifindexp) +{ + unsigned int ifindex; + struct ifnet *ifp; + + if (ifindexp != NULL) + *ifindexp = 0; + if (ntohl(a->s_addr) >> 24 == 0) { + ifindex = ntohl(a->s_addr) & 0xffffff; + ifnet_head_lock_shared(); + /* Don't need to check is ifindex is < 0 since it's unsigned */ + if ((unsigned int)if_index < ifindex) { + ifnet_head_done(); + return (NULL); + } + ifp = ifindex2ifnet[ifindex]; + ifnet_head_done(); + if (ifp != NULL && ifindexp != NULL) + *ifindexp = ifindex; + } else { + INADDR_TO_IFP(*a, ifp); + } + return (ifp); +} + +void +in_multi_init(void) +{ + PE_parse_boot_argn("ifa_debug", &inm_debug, sizeof (inm_debug)); + + /* Setup lock group and attribute for in_multihead */ + in_multihead_lock_grp_attr = lck_grp_attr_alloc_init(); + in_multihead_lock_grp = lck_grp_alloc_init("in_multihead", + in_multihead_lock_grp_attr); + in_multihead_lock_attr = lck_attr_alloc_init(); + lck_rw_init(&in_multihead_lock, in_multihead_lock_grp, + in_multihead_lock_attr); + + lck_mtx_init(&inm_trash_lock, in_multihead_lock_grp, + in_multihead_lock_attr); + TAILQ_INIT(&inm_trash_head); + + inm_size = (inm_debug == 0) ? 
sizeof (struct in_multi) : + sizeof (struct in_multi_dbg); + inm_zone = zinit(inm_size, INM_ZONE_MAX * inm_size, + 0, INM_ZONE_NAME); + if (inm_zone == NULL) { + panic("%s: failed allocating %s", __func__, INM_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(inm_zone, Z_EXPAND, TRUE); + + ipms_size = sizeof (struct ip_msource); + ipms_zone = zinit(ipms_size, IPMS_ZONE_MAX * ipms_size, + 0, IPMS_ZONE_NAME); + if (ipms_zone == NULL) { + panic("%s: failed allocating %s", __func__, IPMS_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(ipms_zone, Z_EXPAND, TRUE); + + inms_size = sizeof (struct in_msource); + inms_zone = zinit(inms_size, INMS_ZONE_MAX * inms_size, + 0, INMS_ZONE_NAME); + if (inms_zone == NULL) { + panic("%s: failed allocating %s", __func__, INMS_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(inms_zone, Z_EXPAND, TRUE); +} + +static struct in_multi * +in_multi_alloc(int how) +{ + struct in_multi *inm; + + inm = (how == M_WAITOK) ? zalloc(inm_zone) : zalloc_noblock(inm_zone); + if (inm != NULL) { + bzero(inm, inm_size); + lck_mtx_init(&inm->inm_lock, in_multihead_lock_grp, + in_multihead_lock_attr); + inm->inm_debug |= IFD_ALLOC; + if (inm_debug != 0) { + inm->inm_debug |= IFD_DEBUG; + inm->inm_trace = inm_trace; + } + } + return (inm); +} + +static void +in_multi_free(struct in_multi *inm) +{ + INM_LOCK(inm); + if (inm->inm_debug & IFD_ATTACHED) { + panic("%s: attached inm=%p is being freed", __func__, inm); + /* NOTREACHED */ + } else if (inm->inm_ifma != NULL) { + panic("%s: ifma not NULL for inm=%p", __func__, inm); + /* NOTREACHED */ + } else if (!(inm->inm_debug & IFD_ALLOC)) { + panic("%s: inm %p cannot be freed", __func__, inm); + /* NOTREACHED */ + } else if (inm->inm_refcount != 0) { + panic("%s: non-zero refcount inm=%p", __func__, inm); + /* NOTREACHED */ + } else if (inm->inm_reqcnt != 0) { + panic("%s: non-zero reqcnt inm=%p", __func__, inm); + /* NOTREACHED */ + } + + /* Free any pending IGMPv3 state-change records */ + 
IF_DRAIN(&inm->inm_scq); + + inm->inm_debug &= ~IFD_ALLOC; + if ((inm->inm_debug & (IFD_DEBUG | IFD_TRASHED)) == + (IFD_DEBUG | IFD_TRASHED)) { + lck_mtx_lock(&inm_trash_lock); + TAILQ_REMOVE(&inm_trash_head, (struct in_multi_dbg *)inm, + inm_trash_link); + lck_mtx_unlock(&inm_trash_lock); + inm->inm_debug &= ~IFD_TRASHED; + } + INM_UNLOCK(inm); + + lck_mtx_destroy(&inm->inm_lock, in_multihead_lock_grp); + zfree(inm_zone, inm); +} + +static void +in_multi_attach(struct in_multi *inm) +{ + in_multihead_lock_assert(LCK_RW_ASSERT_EXCLUSIVE); + INM_LOCK_ASSERT_HELD(inm); + + if (inm->inm_debug & IFD_ATTACHED) { + panic("%s: Attempt to attach an already attached inm=%p", + __func__, inm); + /* NOTREACHED */ + } else if (inm->inm_debug & IFD_TRASHED) { + panic("%s: Attempt to reattach a detached inm=%p", + __func__, inm); + /* NOTREACHED */ + } + + inm->inm_reqcnt++; + VERIFY(inm->inm_reqcnt == 1); + INM_ADDREF_LOCKED(inm); + inm->inm_debug |= IFD_ATTACHED; + /* + * Reattach case: If debugging is enabled, take it + * out of the trash list and clear IFD_TRASHED. 
+ */ + if ((inm->inm_debug & (IFD_DEBUG | IFD_TRASHED)) == + (IFD_DEBUG | IFD_TRASHED)) { + /* Become a regular mutex, just in case */ + INM_CONVERT_LOCK(inm); + lck_mtx_lock(&inm_trash_lock); + TAILQ_REMOVE(&inm_trash_head, (struct in_multi_dbg *)inm, + inm_trash_link); + lck_mtx_unlock(&inm_trash_lock); + inm->inm_debug &= ~IFD_TRASHED; + } + + LIST_INSERT_HEAD(&in_multihead, inm, inm_link); +} + +int +in_multi_detach(struct in_multi *inm) +{ + in_multihead_lock_assert(LCK_RW_ASSERT_EXCLUSIVE); + INM_LOCK_ASSERT_HELD(inm); + + if (inm->inm_reqcnt == 0) { + panic("%s: inm=%p negative reqcnt", __func__, inm); + /* NOTREACHED */ + } + + --inm->inm_reqcnt; + if (inm->inm_reqcnt > 0) + return (0); + + if (!(inm->inm_debug & IFD_ATTACHED)) { + panic("%s: Attempt to detach an unattached record inm=%p", + __func__, inm); + /* NOTREACHED */ + } else if (inm->inm_debug & IFD_TRASHED) { + panic("%s: inm %p is already in trash list", __func__, inm); + /* NOTREACHED */ + } + + /* + * NOTE: Caller calls IFMA_REMREF + */ + inm->inm_debug &= ~IFD_ATTACHED; + LIST_REMOVE(inm, inm_link); + + if (inm->inm_debug & IFD_DEBUG) { + /* Become a regular mutex, just in case */ + INM_CONVERT_LOCK(inm); + lck_mtx_lock(&inm_trash_lock); + TAILQ_INSERT_TAIL(&inm_trash_head, + (struct in_multi_dbg *)inm, inm_trash_link); + lck_mtx_unlock(&inm_trash_lock); + inm->inm_debug |= IFD_TRASHED; + } + + return (1); +} + +void +inm_addref(struct in_multi *inm, int locked) +{ + if (!locked) + INM_LOCK_SPIN(inm); + else + INM_LOCK_ASSERT_HELD(inm); + + if (++inm->inm_refcount == 0) { + panic("%s: inm=%p wraparound refcnt", __func__, inm); + /* NOTREACHED */ + } else if (inm->inm_trace != NULL) { + (*inm->inm_trace)(inm, TRUE); + } + if (!locked) + INM_UNLOCK(inm); +} + +void +inm_remref(struct in_multi *inm, int locked) +{ + struct ifmultiaddr *ifma; + struct igmp_ifinfo *igi; + + if (!locked) + INM_LOCK_SPIN(inm); + else + INM_LOCK_ASSERT_HELD(inm); + + if (inm->inm_refcount == 0 || (inm->inm_refcount 
== 1 && locked)) { + panic("%s: inm=%p negative/missing refcnt", __func__, inm); + /* NOTREACHED */ + } else if (inm->inm_trace != NULL) { + (*inm->inm_trace)(inm, FALSE); + } + + --inm->inm_refcount; + if (inm->inm_refcount > 0) { + if (!locked) + INM_UNLOCK(inm); + return; + } + + /* + * Synchronization with in_getmulti(). In the event the inm has been + * detached, the underlying ifma would still be in the if_multiaddrs + * list, and thus can be looked up via if_addmulti(). At that point, + * the only way to find this inm is via ifma_protospec. To avoid + * race conditions between the last inm_remref() of that inm and its + * use via ifma_protospec, in_multihead lock is used for serialization. + * In order to avoid violating the lock order, we must drop inm_lock + * before acquiring in_multihead lock. To prevent the inm from being + * freed prematurely, we hold an extra reference. + */ + ++inm->inm_refcount; + INM_UNLOCK(inm); + in_multihead_lock_shared(); + INM_LOCK_SPIN(inm); + --inm->inm_refcount; + if (inm->inm_refcount > 0) { + /* We've lost the race, so abort since inm is still in use */ + INM_UNLOCK(inm); + in_multihead_lock_done(); + /* If it was locked, return it as such */ + if (locked) + INM_LOCK(inm); + return; + } + inm_purge(inm); + ifma = inm->inm_ifma; + inm->inm_ifma = NULL; + inm->inm_ifp = NULL; + igi = inm->inm_igi; + inm->inm_igi = NULL; + INM_UNLOCK(inm); + IFMA_LOCK_SPIN(ifma); + ifma->ifma_protospec = NULL; + IFMA_UNLOCK(ifma); + in_multihead_lock_done(); + + in_multi_free(inm); + if_delmulti_ifma(ifma); + /* Release reference held to the underlying ifmultiaddr */ + IFMA_REMREF(ifma); + + if (igi != NULL) + IGI_REMREF(igi); +} + +static void +inm_trace(struct in_multi *inm, int refhold) +{ + struct in_multi_dbg *inm_dbg = (struct in_multi_dbg *)inm; + ctrace_t *tr; + u_int32_t idx; + u_int16_t *cnt; + + if (!(inm->inm_debug & IFD_DEBUG)) { + panic("%s: inm %p has no debug structure", __func__, inm); + /* NOTREACHED */ + } + if (refhold) { 
+ cnt = &inm_dbg->inm_refhold_cnt; + tr = inm_dbg->inm_refhold; + } else { + cnt = &inm_dbg->inm_refrele_cnt; + tr = inm_dbg->inm_refrele; + } + + idx = atomic_add_16_ov(cnt, 1) % INM_TRACE_HIST_SIZE; + ctrace_record(&tr[idx]); +} + +void +in_multihead_lock_exclusive(void) +{ + lck_rw_lock_exclusive(&in_multihead_lock); +} + +void +in_multihead_lock_shared(void) +{ + lck_rw_lock_shared(&in_multihead_lock); +} + +void +in_multihead_lock_assert(int what) +{ + lck_rw_assert(&in_multihead_lock, what); +} + +void +in_multihead_lock_done(void) +{ + lck_rw_done(&in_multihead_lock); +} + +static struct ip_msource * +ipms_alloc(int how) +{ + struct ip_msource *ims; + + ims = (how == M_WAITOK) ? zalloc(ipms_zone) : zalloc_noblock(ipms_zone); + if (ims != NULL) + bzero(ims, ipms_size); + + return (ims); +} + +static void +ipms_free(struct ip_msource *ims) +{ + zfree(ipms_zone, ims); +} + +static struct in_msource * +inms_alloc(int how) +{ + struct in_msource *inms; + + inms = (how == M_WAITOK) ? zalloc(inms_zone) : + zalloc_noblock(inms_zone); + if (inms != NULL) + bzero(inms, inms_size); + + return (inms); +} + +static void +inms_free(struct in_msource *inms) +{ + zfree(inms_zone, inms); +} + +#ifdef IGMP_DEBUG + +static const char *inm_modestrs[] = { "un\n", "in", "ex" }; + +static const char * +inm_mode_str(const int mode) +{ + if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE) + return (inm_modestrs[mode]); + return ("??"); +} + +static const char *inm_statestrs[] = { + "not-member\n", + "silent\n", + "idle\n", + "lazy\n", + "sleeping\n", + "awakening\n", + "query-pending\n", + "sg-query-pending\n", + "leaving" +}; + +static const char * +inm_state_str(const int state) +{ + if (state >= IGMP_NOT_MEMBER && state <= IGMP_LEAVING_MEMBER) + return (inm_statestrs[state]); + return ("??"); +} + +/* + * Dump an in_multi structure to the console. 
+ */ +void +inm_print(const struct in_multi *inm) +{ + int t; + + INM_LOCK_ASSERT_HELD(INM_CAST_TO_NONCONST(inm)); + + if (igmp_debug == 0) + return; + + printf("%s: --- begin inm %p ---\n", __func__, inm); + printf("addr %s ifp %p(%s%d) ifma %p\n", + inet_ntoa(inm->inm_addr), + inm->inm_ifp, + inm->inm_ifp->if_name, + inm->inm_ifp->if_unit, + inm->inm_ifma); + printf("timer %u state %s refcount %u scq.len %u\n", + inm->inm_timer, + inm_state_str(inm->inm_state), + inm->inm_refcount, + inm->inm_scq.ifq_len); + printf("igi %p nsrc %lu sctimer %u scrv %u\n", + inm->inm_igi, + inm->inm_nsrc, + inm->inm_sctimer, + inm->inm_scrv); + for (t = 0; t < 2; t++) { + printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t, + inm_mode_str(inm->inm_st[t].iss_fmode), + inm->inm_st[t].iss_asm, + inm->inm_st[t].iss_ex, + inm->inm_st[t].iss_in, + inm->inm_st[t].iss_rec); + } + printf("%s: --- end inm %p ---\n", __func__, inm); +} + +#else + +void +inm_print(__unused const struct in_multi *inm) +{ + +} + +#endif diff --git a/bsd/netinet/in_pcb.c b/bsd/netinet/in_pcb.c index 696222176..51eeca0b3 100644 --- a/bsd/netinet/in_pcb.c +++ b/bsd/netinet/in_pcb.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -75,6 +75,9 @@ #endif #include #include +#include +#include +#include #include #include @@ -156,17 +159,17 @@ sysctl_net_ipport_check SYSCTL_HANDLER_ARGS SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "IP Ports"); -SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW, +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED, &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", ""); -SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW, +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED, &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", ""); -SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW, +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED, &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", ""); -SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW, +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED, &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", ""); -SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW, +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED, &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", ""); -SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW, +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED, &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", ""); extern int udp_use_randomport; @@ -233,14 +236,17 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo, __unused struct proc * } mac_inpcb_label_associate(so, inp); #endif + // make sure inp_stat is always 64bit aligned + inp->inp_stat = (struct inp_stat*)P2ROUNDUP(inp->inp_stat_store, 
sizeof(u_int64_t)); + if (((uintptr_t)inp->inp_stat - (uintptr_t)inp->inp_stat_store) + + sizeof(*inp->inp_stat) > sizeof(inp->inp_stat_store)) { + panic("insufficient space to align inp_stat"); + } + so->so_pcb = (caddr_t)inp; if (so->so_proto->pr_flags & PR_PCBLOCK) { - inp->inpcb_mtx = lck_mtx_alloc_init(pcbinfo->mtx_grp, pcbinfo->mtx_attr); - if (inp->inpcb_mtx == NULL) { - printf("in_pcballoc: can't alloc mutex! so=%p\n", so); - return(ENOMEM); - } + lck_mtx_init(&inp->inpcb_mtx, pcbinfo->mtx_grp, pcbinfo->mtx_attr); } #if IPSEC @@ -297,7 +303,7 @@ in_pcblookup_local_and_cleanup( if (inp && inp->inp_wantcnt == WNT_STOPUSING) { struct socket *so = inp->inp_socket; - lck_mtx_lock(inp->inpcb_mtx); + lck_mtx_lock(&inp->inpcb_mtx); if (so->so_usecount == 0) { if (inp->inp_state != INPCB_STATE_DEAD) @@ -306,7 +312,7 @@ in_pcblookup_local_and_cleanup( inp = NULL; } else { - lck_mtx_unlock(inp->inpcb_mtx); + lck_mtx_unlock(&inp->inpcb_mtx); } } @@ -324,6 +330,8 @@ in_pcb_conflict_post_msg(u_int16_t port) struct kev_msg ev_msg; struct kev_in_portinuse in_portinuse; + bzero(&in_portinuse, sizeof(struct kev_in_portinuse)); + bzero(&ev_msg, sizeof(struct kev_msg)); in_portinuse.port = ntohs(port); /* port in host order */ in_portinuse.req_pid = proc_selfpid(); ev_msg.vendor_code = KEV_VENDOR_APPLE; @@ -344,7 +352,7 @@ in_pcb_conflict_post_msg(u_int16_t port) * EACCES Permission denied * EADDRINUSE Address in use * EAGAIN Resource unavailable, try again - * proc_suser:EPERM Operation not permitted + * priv_check_cred:EPERM Operation not permitted */ int in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) @@ -356,6 +364,7 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) u_short lport = 0, rand_port = 0; int wild = 0, reuseport = (so->so_options & SO_REUSEPORT); int error, randomport, conflict = 0; + kauth_cred_t cred; if (TAILQ_EMPTY(&in_ifaddrhead)) /* XXX broken! 
*/ return (EADDRNOTAVAIL); @@ -366,6 +375,8 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) socket_unlock(so, 0); /* keep reference on socket */ lck_rw_lock_exclusive(pcbinfo->mtx); if (nam) { + unsigned int outif = 0; + sin = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof (*sin)) { lck_rw_done(pcbinfo->mtx); @@ -403,7 +414,10 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) return (EADDRNOTAVAIL); } else { - ifafree(ifa); + IFA_LOCK(ifa); + outif = ifa->ifa_ifp->if_index; + IFA_UNLOCK(ifa); + IFA_REMREF(ifa); } } if (lport) { @@ -411,10 +425,15 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) /* GROSS */ #if !CONFIG_EMBEDDED - if (ntohs(lport) < IPPORT_RESERVED && proc_suser(p)) { - lck_rw_done(pcbinfo->mtx); - socket_lock(so, 0); - return (EACCES); + if (ntohs(lport) < IPPORT_RESERVED) { + cred = kauth_cred_proc_ref(p); + error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0); + kauth_cred_unref(&cred); + if (error != 0) { + lck_rw_done(pcbinfo->mtx); + socket_lock(so, 0); + return (EACCES); + } } #endif if (so->so_uid && @@ -487,6 +506,7 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) } } inp->inp_laddr = sin->sin_addr; + inp->inp_last_outif = outif; } if (lport == 0) { u_short first, last; @@ -502,7 +522,10 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) last = ipport_hilastauto; lastport = &pcbinfo->lasthi; } else if (inp->inp_flags & INP_LOWPORT) { - if ((error = proc_suser(p)) != 0) { + cred = kauth_cred_proc_ref(p); + error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0); + kauth_cred_unref(&cred); + if (error != 0) { lck_rw_done(pcbinfo->mtx); socket_lock(so, 0); return error; @@ -541,6 +564,7 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) lck_rw_done(pcbinfo->mtx); socket_lock(so, 0); inp->inp_laddr.s_addr = INADDR_ANY; + inp->inp_last_outif = 0; return (EADDRNOTAVAIL); } --*lastport; @@ -564,6 +588,7 @@ 
in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) lck_rw_done(pcbinfo->mtx); socket_lock(so, 0); inp->inp_laddr.s_addr = INADDR_ANY; + inp->inp_last_outif = 0; return (EADDRNOTAVAIL); } ++*lastport; @@ -579,6 +604,7 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) if (in_pcbinshash(inp, 1) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; + inp->inp_last_outif = 0; lck_rw_done(pcbinfo->mtx); return (EAGAIN); } @@ -605,7 +631,7 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) */ int in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, - struct sockaddr_in **plocal_sin) + struct sockaddr_in *plocal_sin, unsigned int *out_ifscope) { struct in_ifaddr *ia; struct sockaddr_in *sin = (struct sockaddr_in *)nam; @@ -619,6 +645,7 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, lck_rw_lock_shared(in_ifaddr_rwlock); if (!TAILQ_EMPTY(&in_ifaddrhead)) { + ia = TAILQ_FIRST(&in_ifaddrhead); /* * If the destination address is INADDR_ANY, * use the primary local address. 
@@ -629,21 +656,34 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, #define satosin(sa) ((struct sockaddr_in *)(sa)) #define sintosa(sin) ((struct sockaddr *)(sin)) #define ifatoia(ifa) ((struct in_ifaddr *)(ifa)) + IFA_LOCK_SPIN(&ia->ia_ifa); if (sin->sin_addr.s_addr == INADDR_ANY) - sin->sin_addr = IA_SIN(TAILQ_FIRST(&in_ifaddrhead))->sin_addr; + sin->sin_addr = IA_SIN(ia)->sin_addr; else if (sin->sin_addr.s_addr == (u_int32_t)INADDR_BROADCAST && - (TAILQ_FIRST(&in_ifaddrhead)->ia_ifp->if_flags & IFF_BROADCAST)) - sin->sin_addr = satosin(&TAILQ_FIRST(&in_ifaddrhead)->ia_broadaddr)->sin_addr; + (ia->ia_ifp->if_flags & IFF_BROADCAST)) + sin->sin_addr = satosin(&ia->ia_broadaddr)->sin_addr; + IFA_UNLOCK(&ia->ia_ifa); + ia = NULL; } lck_rw_done(in_ifaddr_rwlock); if (inp->inp_laddr.s_addr == INADDR_ANY) { struct route *ro; - unsigned int ifscope; - + unsigned int ifscope = IFSCOPE_NONE; + unsigned int nocell; + /* + * If the socket is bound to a specifc interface, the + * optional scoped takes precedence over that if it + * is set by the caller. + */ ia = (struct in_ifaddr *)0; - ifscope = (inp->inp_flags & INP_BOUND_IF) ? - inp->inp_boundif : IFSCOPE_NONE; + + if (out_ifscope != NULL && *out_ifscope != IFSCOPE_NONE) + ifscope = *out_ifscope; + else if (inp->inp_flags & INP_BOUND_IF) + ifscope = inp->inp_boundif; + + nocell = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; /* * If route is known or can be allocated now, * our src addr is taken from the i/f, else punt. @@ -672,10 +712,23 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, ro->ro_dst.sa_len = sizeof(struct sockaddr_in); ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = sin->sin_addr; - rtalloc_scoped_ign(ro, 0, ifscope); + rtalloc_scoped(ro, ifscope); if (ro->ro_rt != NULL) RT_LOCK_SPIN(ro->ro_rt); } + /* + * If the route points to a cellular interface and the + * caller forbids our using interfaces of such type, + * pretend that there is no route. 
+ */ + if (nocell && ro->ro_rt != NULL) { + RT_LOCK_ASSERT_HELD(ro->ro_rt); + if (ro->ro_rt->rt_ifp->if_type == IFT_CELLULAR) { + RT_UNLOCK(ro->ro_rt); + rtfree(ro->ro_rt); + ro->ro_rt = NULL; + } + } /* * If we found a route, use the address * corresponding to the outgoing interface @@ -683,11 +736,13 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, * to our address on another net goes to loopback). */ if (ro->ro_rt != NULL) { - RT_LOCK_ASSERT_HELD(ro->ro_rt); + /* Become a regular mutex */ + RT_CONVERT_LOCK(ro->ro_rt); if (!(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK)) { ia = ifatoia(ro->ro_rt->rt_ifa); - if (ia) - ifaref(&ia->ia_ifa); + if (ia) { + IFA_ADDREF(&ia->ia_ifa); + } } RT_UNLOCK(ro->ro_rt); } @@ -705,9 +760,19 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, lck_rw_lock_shared(in_ifaddr_rwlock); ia = TAILQ_FIRST(&in_ifaddrhead); if (ia) - ifaref(&ia->ia_ifa); + IFA_ADDREF(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); } + /* + * If the source address belongs to a cellular interface + * and the socket forbids our using interfaces of such + * type, pretend that there is no source address. 
+ */ + if (nocell && ia != NULL && + ia->ia_ifa.ifa_ifp->if_type == IFT_CELLULAR) { + IFA_REMREF(&ia->ia_ifa); + ia = NULL; + } if (ia == 0) return (EADDRNOTAVAIL); } @@ -722,29 +787,37 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, struct ifnet *ifp; imo = inp->inp_moptions; + IMO_LOCK(imo); if (imo->imo_multicast_ifp != NULL && (ia == NULL || ia->ia_ifp != imo->imo_multicast_ifp)) { ifp = imo->imo_multicast_ifp; if (ia) - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); lck_rw_lock_shared(in_ifaddr_rwlock); TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { if (ia->ia_ifp == ifp) break; } if (ia) - ifaref(&ia->ia_ifa); + IFA_ADDREF(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); - if (ia == 0) + if (ia == 0) { + IMO_UNLOCK(imo); return (EADDRNOTAVAIL); + } } + IMO_UNLOCK(imo); } /* * Don't do pcblookup call here; return interface in plocal_sin * and exit to caller, that will do the lookup. */ - *plocal_sin = &ia->ia_addr; - ifafree(&ia->ia_ifa); + IFA_LOCK_SPIN(&ia->ia_ifa); + *plocal_sin = ia->ia_addr; + if (out_ifscope != NULL) + *out_ifscope = ia->ia_ifp->if_index; + IFA_UNLOCK(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); } return(0); } @@ -757,9 +830,9 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, * then pick one. */ int -in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct proc *p) +in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct proc *p, unsigned int *ifscope) { - struct sockaddr_in *ifaddr; + struct sockaddr_in ifaddr; struct sockaddr_in *sin = (struct sockaddr_in *)nam; struct inpcb *pcb; int error; @@ -767,14 +840,23 @@ in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct proc *p) /* * Call inner routine, to assign local interface address. */ - if ((error = in_pcbladdr(inp, nam, &ifaddr)) != 0) + if ((error = in_pcbladdr(inp, nam, &ifaddr, ifscope)) != 0) return(error); socket_unlock(inp->inp_socket, 0); pcb = in_pcblookup_hash(inp->inp_pcbinfo, sin->sin_addr, sin->sin_port, - inp->inp_laddr.s_addr ? 
inp->inp_laddr : ifaddr->sin_addr, + inp->inp_laddr.s_addr ? inp->inp_laddr : ifaddr.sin_addr, inp->inp_lport, 0, NULL); socket_lock(inp->inp_socket, 0); + + /* Check if the socket is still in a valid state. When we unlock this + * embryonic socket, it can get aborted if another thread is closing + * the listener (radar 7947600). + */ + if ((inp->inp_socket->so_flags & SOF_ABORTED) != 0) { + return ECONNREFUSED; + } + if (pcb != NULL) { in_pcb_checkstate(pcb, WNT_RELEASE, pcb == inp ? 1 : 0); return (EADDRINUSE); @@ -791,7 +873,8 @@ in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct proc *p) lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx); socket_lock(inp->inp_socket, 0); } - inp->inp_laddr = ifaddr->sin_addr; + inp->inp_laddr = ifaddr.sin_addr; + inp->inp_last_outif = ifscope ? *ifscope : IFSCOPE_NONE; inp->inp_flags |= INP_INADDR_ANY; } else { @@ -858,6 +941,7 @@ in_pcbdetach(struct inpcb *inp) #endif if ((so->so_flags & SOF_PCBCLEARING) == 0) { struct rtentry *rt; + struct ip_moptions *imo; inp->inp_vflag = 0; if (inp->inp_options) @@ -866,8 +950,10 @@ in_pcbdetach(struct inpcb *inp) inp->inp_route.ro_rt = NULL; rtfree(rt); } - ip_freemoptions(inp->inp_moptions); + imo = inp->inp_moptions; inp->inp_moptions = NULL; + if (imo != NULL) + IMO_REMREF(imo); sofreelastref(so, 0); inp->inp_state = INPCB_STATE_DEAD; so->so_flags |= SOF_PCBCLEARING; /* makes sure we're not called twice from so_close */ @@ -886,9 +972,10 @@ in_pcbdispose(struct inpcb *inp) printf("in_pcbdispose: not dead yet? so=%p\n", so); } #endif - if (so && so->so_usecount != 0) - panic("in_pcbdispose: use count=%x so=%p\n", so->so_usecount, so); + panic("%s: so %p so_usecount %d so_lockhistory %s\n", + __func__, so, so->so_usecount, + (so != NULL) ? 
solockhistory_nr(so) : "--"); lck_rw_assert(ipi->mtx, LCK_RW_ASSERT_EXCLUSIVE); @@ -909,8 +996,8 @@ in_pcbdispose(struct inpcb *inp) } if (so->so_head != NULL) panic("in_pcbdispose, so=%p head still exist\n", so); - lck_mtx_unlock(inp->inpcb_mtx); - lck_mtx_free(inp->inpcb_mtx, ipi->mtx_grp); + lck_mtx_unlock(&inp->inpcb_mtx); + lck_mtx_destroy(&inp->inpcb_mtx, ipi->mtx_grp); } so->so_flags |= SOF_PCBCLEARING; /* makes sure we're not called twice from so_close */ so->so_saved_pcb = (caddr_t) inp; @@ -1075,7 +1162,7 @@ in_losing(struct inpcb *inp) if ((ia = ifa_foraddr(inp->inp_laddr.s_addr)) != NULL) { inp->inp_route.ro_rt = NULL; rtfree(rt); - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); } /* * A new route can be allocated @@ -1099,7 +1186,7 @@ in_rtchange(struct inpcb *inp, __unused int errno) if ((ia = ifa_foraddr(inp->inp_laddr.s_addr)) == NULL) { return; /* we can't remove the route now. not sure if still ok to use src */ } - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); rtfree(rt); inp->inp_route.ro_rt = NULL; /* @@ -1200,6 +1287,131 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, } } +/* + * Check if PCB exists in hash list. + */ +int +in_pcblookup_hash_exists( + struct inpcbinfo *pcbinfo, + struct in_addr faddr, + u_int fport_arg, + struct in_addr laddr, + u_int lport_arg, + int wildcard, + uid_t *uid, + gid_t *gid, + __unused struct ifnet *ifp) +{ + struct inpcbhead *head; + struct inpcb *inp; + u_short fport = fport_arg, lport = lport_arg; + int found; + + *uid = UID_MAX; + *gid = GID_MAX; + + /* + * We may have found the pcb in the last lookup - check this first. + */ + + lck_rw_lock_shared(pcbinfo->mtx); + + /* + * First look for an exact match. 
+ */ + head = &pcbinfo->hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, + pcbinfo->hashmask)]; + LIST_FOREACH(inp, head, inp_hash) { +#if INET6 + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_faddr.s_addr == faddr.s_addr && + inp->inp_laddr.s_addr == laddr.s_addr && + inp->inp_fport == fport && + inp->inp_lport == lport) { + if ((found = (inp->inp_socket != NULL))) { + /* + * Found. + */ + *uid = inp->inp_socket->so_uid; + *gid = inp->inp_socket->so_gid; + } + lck_rw_done(pcbinfo->mtx); + return (found); + } + } + if (wildcard) { + struct inpcb *local_wild = NULL; +#if INET6 + struct inpcb *local_wild_mapped = NULL; +#endif + + head = &pcbinfo->hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, + pcbinfo->hashmask)]; + LIST_FOREACH(inp, head, inp_hash) { +#if INET6 + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_faddr.s_addr == INADDR_ANY && + inp->inp_lport == lport) { +#if defined(NFAITH) && NFAITH > 0 + if (ifp && ifp->if_type == IFT_FAITH && + (inp->inp_flags & INP_FAITH) == 0) + continue; +#endif + if (inp->inp_laddr.s_addr == laddr.s_addr) { + if ((found = (inp->inp_socket != NULL))) { + *uid = inp->inp_socket->so_uid; + *gid = inp->inp_socket->so_gid; + } + lck_rw_done(pcbinfo->mtx); + return (found); + } + else if (inp->inp_laddr.s_addr == INADDR_ANY) { +#if INET6 + if (inp->inp_socket && + INP_CHECK_SOCKAF(inp->inp_socket, + AF_INET6)) + local_wild_mapped = inp; + else +#endif /* INET6 */ + local_wild = inp; + } + } + } + if (local_wild == NULL) { +#if INET6 + if (local_wild_mapped != NULL) { + if ((found = (local_wild_mapped->inp_socket != NULL))) { + *uid = local_wild_mapped->inp_socket->so_uid; + *gid = local_wild_mapped->inp_socket->so_gid; + } + lck_rw_done(pcbinfo->mtx); + return (found); + } +#endif /* INET6 */ + lck_rw_done(pcbinfo->mtx); + return (0); + } + if (local_wild != NULL) { + if ((found = (local_wild->inp_socket != NULL))) { + *uid = local_wild->inp_socket->so_uid; + *gid = 
local_wild->inp_socket->so_gid; + } + lck_rw_done(pcbinfo->mtx); + return (found); + } + } + + /* + * Not found. + */ + lck_rw_done(pcbinfo->mtx); + return (0); +} + /* * Lookup PCB in hash list. */ @@ -1336,10 +1548,15 @@ in_pcbinshash(struct inpcb *inp, int locked) if (!locked) { if (!lck_rw_try_lock_exclusive(pcbinfo->mtx)) { - /*lock inversion issue, mostly with udp multicast packets */ + /*lock inversion issue, mostly with udp multicast packets */ socket_unlock(inp->inp_socket, 0); lck_rw_lock_exclusive(pcbinfo->mtx); socket_lock(inp->inp_socket, 0); + if (inp->inp_state == INPCB_STATE_DEAD) { + /* The socket got dropped when it was unlocked */ + lck_rw_done(pcbinfo->mtx); + return(ECONNABORTED); + } } } @@ -1458,6 +1675,7 @@ in_pcb_checkstate(struct inpcb *pcb, int mode, int locked) if (locked == 0) socket_lock(pcb->inp_socket, 1); pcb->inp_state = INPCB_STATE_DEAD; + stopusing: if (pcb->inp_socket->so_usecount < 0) panic("in_pcb_checkstate STOP pcb=%p so=%p usecount is negative\n", pcb, pcb->inp_socket); @@ -1569,25 +1787,26 @@ inpcb_to_xinpcb64( struct inpcb *inp, struct xinpcb64 *xinp) { - xinp->inp_fport = inp->inp_fport; - xinp->inp_lport = inp->inp_lport; - xinp->inp_gencnt = inp->inp_gencnt; - xinp->inp_flags = inp->inp_flags; - xinp->inp_flow = inp->inp_flow; - xinp->inp_vflag = inp->inp_vflag; - xinp->inp_ip_ttl = inp->inp_ip_ttl; - xinp->inp_ip_p = inp->inp_ip_p; - xinp->inp_dependfaddr.inp6_foreign = inp->inp_dependfaddr.inp6_foreign; - xinp->inp_dependladdr.inp6_local = inp->inp_dependladdr.inp6_local; - xinp->inp_depend4.inp4_ip_tos = inp->inp_depend4.inp4_ip_tos; - xinp->inp_depend6.inp6_hlim = inp->inp_depend6.inp6_hlim; - xinp->inp_depend6.inp6_cksum = inp->inp_depend6.inp6_cksum; + xinp->inp_fport = inp->inp_fport; + xinp->inp_lport = inp->inp_lport; + xinp->inp_gencnt = inp->inp_gencnt; + xinp->inp_flags = inp->inp_flags; + xinp->inp_flow = inp->inp_flow; + xinp->inp_vflag = inp->inp_vflag; + xinp->inp_ip_ttl = inp->inp_ip_ttl; + 
xinp->inp_ip_p = inp->inp_ip_p; + xinp->inp_dependfaddr.inp6_foreign = inp->inp_dependfaddr.inp6_foreign; + xinp->inp_dependladdr.inp6_local = inp->inp_dependladdr.inp6_local; + xinp->inp_depend4.inp4_ip_tos = inp->inp_depend4.inp4_ip_tos; + xinp->inp_depend6.inp6_hlim = inp->inp_depend6.inp6_hlim; + xinp->inp_depend6.inp6_cksum = inp->inp_depend6.inp6_cksum; xinp->inp_depend6.inp6_ifindex = inp->inp_depend6.inp6_ifindex; - xinp->inp_depend6.inp6_hops = inp->inp_depend6.inp6_hops; + xinp->inp_depend6.inp6_hops = inp->inp_depend6.inp6_hops; } #endif /* !CONFIG_EMBEDDED */ + /* * The following routines implement this scheme: * @@ -1619,7 +1838,7 @@ inp_route_copyout(struct inpcb *inp, struct route *dst) { struct route *src = &inp->inp_route; - lck_mtx_assert(inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED); /* * If the route in the PCB is not for IPv4, blow it away; @@ -1629,13 +1848,8 @@ inp_route_copyout(struct inpcb *inp, struct route *dst) rtfree(src->ro_rt); src->ro_rt = NULL; } - - /* Copy everything (rt, dst, flags) from PCB */ - bcopy(src, dst, sizeof (*dst)); - - /* Hold one reference for the local copy of struct route */ - if (dst->ro_rt != NULL) - RT_ADDREF(dst->ro_rt); + + route_copyout(dst, src, sizeof(*dst)); } void @@ -1643,33 +1857,61 @@ inp_route_copyin(struct inpcb *inp, struct route *src) { struct route *dst = &inp->inp_route; - lck_mtx_assert(inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED); /* Minor sanity check */ if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET) panic("%s: wrong or corrupted route: %p", __func__, src); - /* No cached route in the PCB? */ - if (dst->ro_rt == NULL) { - /* - * Copy everything (rt, dst, flags) from ip_output(); - * the reference to the route was held at the time - * it was allocated and is kept intact. 
- */ - bcopy(src, dst, sizeof (*dst)); - } else if (src->ro_rt != NULL) { - /* - * If the same, update just the ro_flags and ditch the one - * in the local copy. Else ditch the one that is currently - * cached, and cache what we got back from ip_output(). - */ - if (dst->ro_rt == src->ro_rt) { - dst->ro_flags = src->ro_flags; - rtfree(src->ro_rt); - src->ro_rt = NULL; - } else { - rtfree(dst->ro_rt); - bcopy(src, dst, sizeof (*dst)); - } + route_copyin(src, dst, sizeof(*src)); +} + +/* + * Handler for setting IP_FORCE_OUT_IFP/IP_BOUND_IF/IPV6_BOUND_IF socket option. + */ +void +inp_bindif(struct inpcb *inp, unsigned int ifscope) +{ + /* + * A zero interface scope value indicates an "unbind". + * Otherwise, take in whatever value the app desires; + * the app may already know the scope (or force itself + * to such a scope) ahead of time before the interface + * gets attached. It doesn't matter either way; any + * route lookup from this point on will require an + * exact match for the embedded interface scope. + */ + inp->inp_boundif = ifscope; + if (inp->inp_boundif == IFSCOPE_NONE) + inp->inp_flags &= ~INP_BOUND_IF; + else + inp->inp_flags |= INP_BOUND_IF; + + /* Blow away any cached route in the PCB */ + if (inp->inp_route.ro_rt != NULL) { + rtfree(inp->inp_route.ro_rt); + inp->inp_route.ro_rt = NULL; + } +} + +/* + * Handler for setting IP_NO_IFT_CELLULAR/IPV6_NO_IFT_CELLULAR socket option. 
+ */ +int +inp_nocellular(struct inpcb *inp, unsigned int val) +{ + if (val) { + inp->inp_flags |= INP_NO_IFT_CELLULAR; + } else if (inp->inp_flags & INP_NO_IFT_CELLULAR) { + /* once set, it cannot be unset */ + return (EINVAL); } + + /* Blow away any cached route in the PCB */ + if (inp->inp_route.ro_rt != NULL) { + rtfree(inp->inp_route.ro_rt); + inp->inp_route.ro_rt = NULL; + } + + return (0); } diff --git a/bsd/netinet/in_pcb.h b/bsd/netinet/in_pcb.h index a793f3a12..728b93e33 100644 --- a/bsd/netinet/in_pcb.h +++ b/bsd/netinet/in_pcb.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -100,8 +100,8 @@ typedef u_quad_t inp_gen_t; /* * PCB with AF_INET6 null bind'ed laddr can receive AF_INET input packet. - * So, AF_INET6 null laddr is also used as AF_INET null laddr, - * by utilize following structure. (At last, same as INRIA) + * So, AF_INET6 null laddr is also used as AF_INET null laddr, by utilizing + * the following structure. */ struct in_addr_4in6 { u_int32_t ia46_pad32[3]; @@ -120,6 +120,14 @@ struct icmp6_filter; struct label; #endif +struct inp_stat +{ + u_int64_t rxpackets; + u_int64_t rxbytes; + u_int64_t txpackets; + u_int64_t txbytes; +}; + struct inpcb { LIST_ENTRY(inpcb) inp_hash; /* hash list */ int inp_wantcnt; /* pcb wanted count. 
protected by pcb list lock */ @@ -127,7 +135,7 @@ struct inpcb { u_short inp_fport; /* foreign port */ u_short inp_lport; /* local port */ LIST_ENTRY(inpcb) inp_list; /* list for all PCBs of this proto */ - caddr_t inp_ppcb; /* pointer to per-protocol pcb */ + void *inp_ppcb; /* pointer to per-protocol pcb */ struct inpcbinfo *inp_pcbinfo; /* PCB list info */ struct socket *inp_socket; /* back pointer to socket */ u_char nat_owner; /* Used to NAT TCP/UDP traffic */ @@ -187,16 +195,15 @@ struct inpcb { int hash_element; /* Array index of pcb's hash list */ caddr_t inp_saved_ppcb; /* place to save pointer while cached */ struct inpcbpolicy *inp_sp; -#ifdef _KERN_LOCKS_H_ - lck_mtx_t *inpcb_mtx; /* inpcb per-socket mutex */ -#else - void *inpcb_mtx; -#endif + decl_lck_mtx_data( ,inpcb_mtx); /* inpcb per-socket mutex */ unsigned int inp_boundif; /* interface scope for INP_BOUND_IF */ - u_int32_t inp_reserved[3]; /* reserved for future use */ + unsigned int inp_last_outif; /* last known outgoing interface */ + u_int32_t inp_reserved[2]; /* reserved for future use */ #if CONFIG_MACF_NET struct label *inp_label; /* MAC label */ #endif + struct inp_stat *inp_stat; + u_int8_t inp_stat_store[sizeof(struct inp_stat) + sizeof(u_int64_t)]; }; #endif /* KERNEL_PRIVATE */ @@ -355,29 +362,70 @@ struct xinpcb64 { u_char inp_vflag; u_char inp_ip_ttl; /* time to live */ u_char inp_ip_p; /* protocol */ - union { /* foreign host table entry */ - struct in_addr_4in6 inp46_foreign; - struct in6_addr inp6_foreign; - } inp_dependfaddr; - union { /* local host table entry */ - struct in_addr_4in6 inp46_local; - struct in6_addr inp6_local; - } inp_dependladdr; - struct { - u_char inp4_ip_tos; /* type of service */ - } inp_depend4; - struct { - u_int8_t inp6_hlim; - int inp6_cksum; - u_short inp6_ifindex; - short inp6_hops; - } inp_depend6; - struct xsocket64 xi_socket; + union { /* foreign host table entry */ + struct in_addr_4in6 inp46_foreign; + struct in6_addr inp6_foreign; + } 
inp_dependfaddr; + union { /* local host table entry */ + struct in_addr_4in6 inp46_local; + struct in6_addr inp6_local; + } inp_dependladdr; + struct { + u_char inp4_ip_tos; /* type of service */ + } inp_depend4; + struct { + u_int8_t inp6_hlim; + int inp6_cksum; + u_short inp6_ifindex; + short inp6_hops; + } inp_depend6; + struct xsocket64 xi_socket; u_quad_t xi_alignment_hack; }; #endif /* !CONFIG_EMBEDDED */ +#ifdef PRIVATE + +struct xinpcb_list_entry { + u_int64_t le_next; + u_int64_t le_prev; +}; + +struct xinpcb_n { + u_int32_t xi_len; /* length of this structure */ + u_int32_t xi_kind; /* XSO_INPCB */ + u_int64_t xi_inpp; + u_short inp_fport; /* foreign port */ + u_short inp_lport; /* local port */ + u_int64_t inp_ppcb; /* pointer to per-protocol pcb */ + inp_gen_t inp_gencnt; /* generation count of this instance */ + int inp_flags; /* generic IP/datagram flags */ + u_int32_t inp_flow; + u_char inp_vflag; + u_char inp_ip_ttl; /* time to live */ + u_char inp_ip_p; /* protocol */ + union { /* foreign host table entry */ + struct in_addr_4in6 inp46_foreign; + struct in6_addr inp6_foreign; + } inp_dependfaddr; + union { /* local host table entry */ + struct in_addr_4in6 inp46_local; + struct in6_addr inp6_local; + } inp_dependladdr; + struct { + u_char inp4_ip_tos; /* type of service */ + } inp_depend4; + struct { + u_int8_t inp6_hlim; + int inp6_cksum; + u_short inp6_ifindex; + short inp6_hops; + } inp_depend6; +}; + +#endif /* PRIVATE */ + struct xinpgen { u_int32_t xig_len; /* length of this structure */ u_int xig_count; /* number of PCBs at this time */ @@ -419,6 +467,7 @@ struct xinpgen { #define in6p_ppcb inp_ppcb /* for KAME src sync over BSD*'s */ #define in6p_state inp_state #define in6p_wantcnt inp_wantcnt +#define in6p_last_outif inp_last_outif #ifdef KERNEL_PRIVATE struct inpcbport { @@ -477,31 +526,36 @@ struct inpcbinfo { /* XXX documentation, prefixes */ #ifdef __APPLE__ #define INP_STRIPHDR 0x200 /* Strip headers in raw_ip, for OT support */ 
#endif -#define INP_FAITH 0x400 /* accept FAITH'ed connections */ +#define INP_FAITH 0x400 /* accept FAITH'ed connections */ #define INP_INADDR_ANY 0x800 /* local address wasn't specified */ #define INP_RECVTTL 0x1000 #define INP_UDP_NOCKSUM 0x2000 /* Turn off outbound UDP checksum */ #define INP_BOUND_IF 0x4000 /* bind socket to an ifindex */ -#define IN6P_IPV6_V6ONLY 0x008000 /* restrict AF_INET6 socket for v6 */ - -#define IN6P_PKTINFO 0x010000 /* receive IP6 dst and I/F */ -#define IN6P_HOPLIMIT 0x020000 /* receive hoplimit */ -#define IN6P_HOPOPTS 0x040000 /* receive hop-by-hop options */ -#define IN6P_DSTOPTS 0x080000 /* receive dst options after rthdr */ -#define IN6P_RTHDR 0x100000 /* receive routing header */ +#define IN6P_IPV6_V6ONLY 0x8000 /* restrict AF_INET6 socket for v6 */ +#define IN6P_PKTINFO 0x10000 /* receive IP6 dst and I/F */ +#define IN6P_HOPLIMIT 0x20000 /* receive hoplimit */ +#define IN6P_HOPOPTS 0x40000 /* receive hop-by-hop options */ +#define IN6P_DSTOPTS 0x80000 /* receive dst options after rthdr */ +#define IN6P_RTHDR 0x100000 /* receive routing header */ #define IN6P_RTHDRDSTOPTS 0x200000 /* receive dstoptions before rthdr */ -#define IN6P_TCLASS 0x400000 /* receive traffic class value */ +#define IN6P_TCLASS 0x400000 /* receive traffic class value */ #define IN6P_AUTOFLOWLABEL 0x800000 /* attach flowlabel automatically */ -#define IN6P_BINDV6ONLY 0x10000000 /* do not grab IPv4 traffic */ +#define IN6P_BINDV6ONLY 0x1000000 /* do not grab IPv4 traffic */ +#define IN6P_RFC2292 0x2000000 /* used RFC2292 API on the socket */ +#define IN6P_MTU 0x4000000 /* receive path MTU */ +#define INP_PKTINFO 0x8000000 /* receive and send PKTINFO for IPv4 */ + +#define INP_NO_IFT_CELLULAR 0x20000000 /* do not use IFT_CELLULAR route */ #ifdef KERNEL_PRIVATE #define INP_CONTROLOPTS (INP_RECVOPTS|INP_RECVRETOPTS|INP_RECVDSTADDR|\ - INP_RECVIF|INP_RECVTTL|\ + INP_RECVIF|INP_RECVTTL|INP_PKTINFO|\ IN6P_PKTINFO|IN6P_HOPLIMIT|IN6P_HOPOPTS|\ 
IN6P_DSTOPTS|IN6P_RTHDR|IN6P_RTHDRDSTOPTS|\ - IN6P_TCLASS|IN6P_AUTOFLOWLABEL) + IN6P_TCLASS|IN6P_RFC2292|IN6P_MTU) + #define INP_UNMAPPABLEOPTS (IN6P_HOPOPTS|IN6P_DSTOPTS|IN6P_RTHDR|\ IN6P_TCLASS|IN6P_AUTOFLOWLABEL) @@ -513,6 +567,7 @@ struct inpcbinfo { /* XXX documentation, prefixes */ #define IN6P_MTUDISC INP_MTUDISC #define IN6P_FAITH INP_FAITH #define IN6P_CONTROLOPTS INP_CONTROLOPTS +#define IN6P_NO_IFT_CELLULAR INP_NO_IFT_CELLULAR /* * socket AF version is {newer than,or include} * actual datagram AF version @@ -530,6 +585,7 @@ struct inpcbinfo { /* XXX documentation, prefixes */ #define sotoin6pcb(so) sotoinpcb(so) /* for KAME src sync over BSD*'s */ #define INP_SOCKAF(so) so->so_proto->pr_domain->dom_family +#define INP_SOCKTYPE(so) so->so_proto->pr_type #define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == af) @@ -541,6 +597,8 @@ extern int ipport_lastauto; extern int ipport_hifirstauto; extern int ipport_hilastauto; +struct sysctl_req; + #define INPCB_STATE_INUSE 0x1 /* freshly allocated PCB, it's in use */ #define INPCB_STATE_CACHED 0x2 /* this pcb is sitting in a a cache */ #define INPCB_STATE_DEAD 0x3 /* should treat as gone, will be garbage collected and freed */ @@ -553,19 +611,21 @@ extern void in_losing(struct inpcb *); extern void in_rtchange(struct inpcb *, int); extern int in_pcballoc(struct socket *, struct inpcbinfo *, struct proc *); extern int in_pcbbind(struct inpcb *, struct sockaddr *, struct proc *); -extern int in_pcbconnect(struct inpcb *, struct sockaddr *, struct proc *); +extern int in_pcbconnect(struct inpcb *, struct sockaddr *, struct proc *, unsigned int *); extern void in_pcbdetach(struct inpcb *); extern void in_pcbdispose (struct inpcb *); extern void in_pcbdisconnect(struct inpcb *); extern int in_pcbinshash(struct inpcb *, int); extern int in_pcbladdr(struct inpcb *, struct sockaddr *, - struct sockaddr_in **); + struct sockaddr_in *, unsigned int *); extern struct inpcb *in_pcblookup_local(struct inpcbinfo *, struct 
in_addr, u_int, int); extern struct inpcb *in_pcblookup_local_and_cleanup(struct inpcbinfo *, struct in_addr, u_int, int); extern struct inpcb *in_pcblookup_hash(struct inpcbinfo *, struct in_addr, u_int, struct in_addr, u_int, int, struct ifnet *); +extern int in_pcblookup_hash_exists(struct inpcbinfo *, struct in_addr, + u_int, struct in_addr, u_int, int, uid_t *, gid_t *, struct ifnet *); extern void in_pcbnotifyall(struct inpcbinfo *, struct in_addr, int, void (*)(struct inpcb *, int)); extern void in_pcbrehash(struct inpcb *); @@ -580,8 +640,11 @@ extern void inpcb_to_compat(struct inpcb *inp, extern void inpcb_to_xinpcb64(struct inpcb *inp, struct xinpcb64 *xinp); #endif +extern int get_pcblist_n(short , struct sysctl_req *, struct inpcbinfo *); extern void inp_route_copyout(struct inpcb *, struct route *); extern void inp_route_copyin(struct inpcb *, struct route *); +extern void inp_bindif(struct inpcb *, unsigned int); +extern int inp_nocellular(struct inpcb *, unsigned int); #endif /* KERNEL */ #endif /* KERNEL_PRIVATE */ diff --git a/bsd/netinet/in_pcblist.c b/bsd/netinet/in_pcblist.c new file mode 100644 index 000000000..9ff8839b5 --- /dev/null +++ b/bsd/netinet/in_pcblist.c @@ -0,0 +1,383 @@ +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. 
+ * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * Copyright (c) 1982, 1986, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#ifndef ROUNDUP64 +#define ROUNDUP64(x) P2ROUNDUP((x), sizeof(u_int64_t)) +#endif + +#ifndef ADVANCE64 +#define ADVANCE64(p, n) (void*)((char *)(p) + ROUNDUP64(n)) +#endif + + +void sotoxsocket_n(struct socket *, struct xsocket_n *); +void sbtoxsockbuf_n(struct sockbuf *, struct xsockbuf_n *); +void sbtoxsockstat_n(struct socket *, struct xsockstat_n *); +void inpcb_to_xinpcb_n(struct inpcb *, struct xinpcb_n *); +void tcpcb_to_xtcpcb_n(struct tcpcb *, struct xtcpcb_n *); + +__private_extern__ void +sotoxsocket_n(struct socket *so, struct xsocket_n *xso) +{ + xso->xso_len = sizeof(struct xsocket_n); + xso->xso_kind = XSO_SOCKET; + + if (so != NULL) { + xso->xso_so = (u_int64_t)(uintptr_t)so; + xso->so_type = so->so_type; + xso->so_options = so->so_options; + xso->so_linger = so->so_linger; + xso->so_state = so->so_state; + xso->so_pcb = (u_int64_t)(uintptr_t)so->so_pcb; + if (so->so_proto) { + xso->xso_protocol = so->so_proto->pr_protocol; + xso->xso_family = so->so_proto->pr_domain->dom_family; + } else { + xso->xso_protocol = xso->xso_family = 0; + } + xso->so_qlen = so->so_qlen; + xso->so_incqlen = so->so_incqlen; + xso->so_qlimit = so->so_qlimit; + xso->so_timeo = so->so_timeo; + xso->so_error = so->so_error; + xso->so_pgid 
= so->so_pgid; + xso->so_oobmark = so->so_oobmark; + xso->so_uid = so->so_uid; + } +} + +__private_extern__ void +sbtoxsockbuf_n(struct sockbuf *sb, struct xsockbuf_n *xsb) +{ + xsb->xsb_len = sizeof(struct xsockbuf_n); + xsb->xsb_kind = (sb->sb_flags & SB_RECV) ? XSO_RCVBUF : XSO_SNDBUF; + + if (sb != NULL) { + xsb->sb_cc = sb->sb_cc; + xsb->sb_hiwat = sb->sb_hiwat; + xsb->sb_mbcnt = sb->sb_mbcnt; + xsb->sb_mbmax = sb->sb_mbmax; + xsb->sb_lowat = sb->sb_lowat; + xsb->sb_flags = sb->sb_flags; + xsb->sb_timeo = (short) + (sb->sb_timeo.tv_sec * hz) + sb->sb_timeo.tv_usec / tick; + if (xsb->sb_timeo == 0 && sb->sb_timeo.tv_usec != 0) + xsb->sb_timeo = 1; + } +} + +__private_extern__ void +sbtoxsockstat_n(struct socket *so, struct xsockstat_n *xst) +{ + int i; + + xst->xst_len = sizeof(struct xsockstat_n); + xst->xst_kind = XSO_STATS; + + for (i = 0; i < SO_TC_STATS_MAX; i++) { + xst->xst_tc_stats[i].rxpackets = so->so_tc_stats[i].rxpackets; + xst->xst_tc_stats[i].rxbytes = so->so_tc_stats[i].rxbytes; + xst->xst_tc_stats[i].txpackets = so->so_tc_stats[i].txpackets; + xst->xst_tc_stats[i].txbytes = so->so_tc_stats[i].txbytes; + } +} + +__private_extern__ void +inpcb_to_xinpcb_n(struct inpcb *inp, struct xinpcb_n *xinp) +{ + xinp->xi_len = sizeof(struct xinpcb_n); + xinp->xi_kind = XSO_INPCB; + xinp->xi_inpp = (u_int64_t)(uintptr_t)inp; + xinp->inp_fport = inp->inp_fport; + xinp->inp_lport = inp->inp_lport; + xinp->inp_ppcb = (u_int64_t)(uintptr_t)inp->inp_ppcb; + xinp->inp_gencnt = inp->inp_gencnt; + xinp->inp_flags = inp->inp_flags; + xinp->inp_flow = inp->inp_flow; + xinp->inp_vflag = inp->inp_vflag; + xinp->inp_ip_ttl = inp->inp_ip_ttl; + xinp->inp_ip_p = inp->inp_ip_p; + xinp->inp_dependfaddr.inp6_foreign = inp->inp_dependfaddr.inp6_foreign; + xinp->inp_dependladdr.inp6_local = inp->inp_dependladdr.inp6_local; + xinp->inp_depend4.inp4_ip_tos = inp->inp_depend4.inp4_ip_tos; + xinp->inp_depend6.inp6_hlim = inp->inp_depend6.inp6_hlim; + xinp->inp_depend6.inp6_cksum = 
inp->inp_depend6.inp6_cksum; + xinp->inp_depend6.inp6_ifindex = inp->inp_depend6.inp6_ifindex; + xinp->inp_depend6.inp6_hops = inp->inp_depend6.inp6_hops; +} + +__private_extern__ void +tcpcb_to_xtcpcb_n(struct tcpcb *tp, struct xtcpcb_n *xt) +{ + int i; + + xt->xt_len = sizeof(struct xtcpcb_n); + xt->xt_kind = XSO_TCPCB; + + xt->t_segq = (u_int32_t)(uintptr_t)tp->t_segq.lh_first; + xt->t_dupacks = tp->t_dupacks; + for (i = 0; i < TCPT_NTIMERS_EXT; i++) + xt->t_timer[i] = tp->t_timer[i]; + xt->t_state = tp->t_state; + xt->t_flags = tp->t_flags; + xt->t_force = tp->t_force; + xt->snd_una = tp->snd_una; + xt->snd_max = tp->snd_max; + xt->snd_nxt = tp->snd_nxt; + xt->snd_up = tp->snd_up; + xt->snd_wl1 = tp->snd_wl1; + xt->snd_wl2 = tp->snd_wl2; + xt->iss = tp->iss; + xt->irs = tp->irs; + xt->rcv_nxt = tp->rcv_nxt; + xt->rcv_adv = tp->rcv_adv; + xt->rcv_wnd = tp->rcv_wnd; + xt->rcv_up = tp->rcv_up; + xt->snd_wnd = tp->snd_wnd; + xt->snd_cwnd = tp->snd_cwnd; + xt->snd_ssthresh = tp->snd_ssthresh; + xt->t_maxopd = tp->t_maxopd; + xt->t_rcvtime = tp->t_rcvtime; + xt->t_starttime = tp->t_starttime; + xt->t_rtttime = tp->t_rtttime; + xt->t_rtseq = tp->t_rtseq; + xt->t_rxtcur = tp->t_rxtcur; + xt->t_maxseg = tp->t_maxseg; + xt->t_srtt = tp->t_srtt; + xt->t_rttvar = tp->t_rttvar; + xt->t_rxtshift = tp->t_rxtshift; + xt->t_rttmin = tp->t_rttmin; + xt->t_rttupdated = tp->t_rttupdated; + xt->max_sndwnd = tp->max_sndwnd; + xt->t_softerror = tp->t_softerror; + xt->t_oobflags = tp->t_oobflags; + xt->t_iobc = tp->t_iobc; + xt->snd_scale = tp->snd_scale; + xt->rcv_scale = tp->rcv_scale; + xt->request_r_scale = tp->request_r_scale; + xt->requested_s_scale = tp->requested_s_scale; + xt->ts_recent = tp->ts_recent; + xt->ts_recent_age = tp->ts_recent_age; + xt->last_ack_sent = tp->last_ack_sent; + xt->cc_send = tp->cc_send; + xt->cc_recv = tp->cc_recv; + xt->snd_recover = tp->snd_recover; + xt->snd_cwnd_prev = tp->snd_cwnd_prev; + xt->snd_ssthresh_prev = tp->snd_ssthresh_prev; + 
xt->t_badrxtwin = tp->t_badrxtwin; +} + +__private_extern__ int +get_pcblist_n(short proto, struct sysctl_req *req, struct inpcbinfo *pcbinfo) +{ + int error = 0; + int i, n; + struct inpcb *inp, **inp_list = NULL; + inp_gen_t gencnt; + struct xinpgen xig; + void *buf = NULL; + size_t item_size = ROUNDUP64(sizeof(struct xinpcb_n)) + + ROUNDUP64(sizeof(struct xsocket_n)) + + 2 * ROUNDUP64(sizeof(struct xsockbuf_n)) + + ROUNDUP64(sizeof(struct xsockstat_n)); + + if (proto == IPPROTO_TCP) + item_size += ROUNDUP64(sizeof(struct xtcpcb_n)); + + /* + * The process of preparing the PCB list is too time-consuming and + * resource-intensive to repeat twice on every request. + */ + lck_rw_lock_exclusive(pcbinfo->mtx); + if (req->oldptr == USER_ADDR_NULL) { + n = pcbinfo->ipi_count; + req->oldidx = 2 * (sizeof xig) + + (n + n/8) * item_size; + goto done; + } + + if (req->newptr != USER_ADDR_NULL) { + error = EPERM; + goto done; + } + + /* + * OK, now we're committed to doing something. + */ + gencnt = pcbinfo->ipi_gencnt; + n = pcbinfo->ipi_count; + + bzero(&xig, sizeof(xig)); + xig.xig_len = sizeof xig; + xig.xig_count = n; + xig.xig_gen = gencnt; + xig.xig_sogen = so_gencnt; + error = SYSCTL_OUT(req, &xig, sizeof xig); + if (error) { + goto done; + } + /* + * We are done if there is no pcb + */ + if (n == 0) { + goto done; + } + + buf = _MALLOC(item_size, M_TEMP, M_WAITOK); + if (buf == 0) { + error = ENOMEM; + goto done; + } + + inp_list = _MALLOC(n * sizeof *inp_list, M_TEMP, M_WAITOK); + if (inp_list == 0) { + error = ENOMEM; + goto done; + } + + for (inp = pcbinfo->listhead->lh_first, i = 0; inp && i < n; + inp = inp->inp_list.le_next) { + if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) + inp_list[i++] = inp; + } + n = i; + + error = 0; + for (i = 0; i < n; i++) { + inp = inp_list[i]; + if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) { + struct xinpcb_n *xi = (struct xinpcb_n *)buf; + struct xsocket_n *xso = (struct xsocket_n 
*)ADVANCE64(xi, sizeof(*xi)); + struct xsockbuf_n *xsbrcv = (struct xsockbuf_n *)ADVANCE64(xso, sizeof(*xso)); + struct xsockbuf_n *xsbsnd = (struct xsockbuf_n *)ADVANCE64(xsbrcv, sizeof(*xsbrcv)); + struct xsockstat_n *xsostats = (struct xsockstat_n *)ADVANCE64(xsbsnd, sizeof(*xsbsnd)); + + bzero(buf, item_size); + + inpcb_to_xinpcb_n(inp, xi); + sotoxsocket_n(inp->inp_socket, xso); + sbtoxsockbuf_n(inp->inp_socket ? &inp->inp_socket->so_rcv : NULL, xsbrcv); + sbtoxsockbuf_n(inp->inp_socket ? &inp->inp_socket->so_snd : NULL, xsbsnd); + sbtoxsockstat_n(inp->inp_socket, xsostats); + if (proto == IPPROTO_TCP) { + struct xtcpcb_n *xt = (struct xtcpcb_n *)ADVANCE64(xsostats, sizeof(*xsostats)); + + /* + * inp->inp_ppcb, can only be NULL on + * an initialization race window. + * No need to lock. + */ + if (inp->inp_ppcb == NULL) + continue; + + tcpcb_to_xtcpcb_n((struct tcpcb *)inp->inp_ppcb, xt); + } + error = SYSCTL_OUT(req, buf, item_size); + } + } + if (!error) { + /* + * Give the user an updated idea of our state. + * If the generation differs from what we told + * her before, she knows that something happened + * while we were processing this request, and it + * might be necessary to retry. 
+ */ + bzero(&xig, sizeof(xig)); + xig.xig_len = sizeof xig; + xig.xig_gen = pcbinfo->ipi_gencnt; + xig.xig_sogen = so_gencnt; + xig.xig_count = pcbinfo->ipi_count; + error = SYSCTL_OUT(req, &xig, sizeof xig); + } +done: + lck_rw_done(pcbinfo->mtx); + if (inp_list) + FREE(inp_list, M_TEMP); + if (buf) + FREE(buf, M_TEMP); + return error; +} + diff --git a/bsd/netinet/in_proto.c b/bsd/netinet/in_proto.c index f08af184f..7c979683a 100644 --- a/bsd/netinet/in_proto.c +++ b/bsd/netinet/in_proto.c @@ -164,7 +164,7 @@ struct protosw inetsw[] = { { SOCK_RAW, &inetdomain, IPPROTO_IGMP, PR_ATOMIC|PR_ADDR|PR_LASTHDR, igmp_input, 0, 0, rip_ctloutput, 0, - igmp_init, igmp_fasttimo, igmp_slowtimo, 0, + igmp_init, 0, igmp_slowtimo, 0, 0, &rip_usrreqs, 0, rip_unlock, 0, { 0, 0 }, 0, { 0 } diff --git a/bsd/netinet/in_rmx.c b/bsd/netinet/in_rmx.c index 37ca3d250..2d0c2735d 100644 --- a/bsd/netinet/in_rmx.c +++ b/bsd/netinet/in_rmx.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -75,13 +75,16 @@ #include #include #include +#include #include +#include #include #include #include #include #include +#include extern int tvtohz(struct timeval *); extern int in_inithead(void **head, int off); @@ -139,11 +142,15 @@ in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, if (in_broadcast(sin->sin_addr, rt->rt_ifp)) { rt->rt_flags |= RTF_BROADCAST; } else { + /* Become a regular mutex */ + RT_CONVERT_LOCK(rt); + IFA_LOCK_SPIN(rt->rt_ifa); #define satosin(sa) ((struct sockaddr_in *)sa) if (satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr == sin->sin_addr.s_addr) rt->rt_flags |= RTF_LOCAL; #undef satosin + IFA_UNLOCK(rt->rt_ifa); } } @@ -160,7 +167,7 @@ in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, * ARP entry and delete it if so. 
*/ rt2 = rtalloc1_scoped_locked(rt_key(rt), 0, - RTF_CLONING | RTF_PRCLONING, sa_get_ifscope(rt_key(rt))); + RTF_CLONING | RTF_PRCLONING, sin_get_ifscope(rt_key(rt))); if (rt2) { RT_LOCK(rt2); if ((rt2->rt_flags & RTF_LLINFO) && @@ -199,9 +206,17 @@ in_validate(struct radix_node *rn) RT_LOCK_ASSERT_HELD(rt); /* This is first reference? */ - if (rt->rt_refcnt == 0 && (rt->rt_flags & RTPRF_OURS)) { - rt->rt_flags &= ~RTPRF_OURS; - rt->rt_rmx.rmx_expire = 0; + if (rt->rt_refcnt == 0) { + if (rt->rt_flags & RTPRF_OURS) { + /* It's one of ours; unexpire it */ + rt->rt_flags &= ~RTPRF_OURS; + rt_setexpire(rt, 0); + } else if ((rt->rt_flags & RTF_LLINFO) && + (rt->rt_flags & RTF_HOST) && rt->rt_gateway != NULL && + rt->rt_gateway->sa_family == AF_LINK) { + /* It's ARP; let it be handled there */ + arp_validate(rt); + } } return (rn); } @@ -236,19 +251,19 @@ in_matroute_args(void *v_arg, struct radix_node_head *head, static int rtq_reallyold = 60*60; /* one hour is ``really old'' */ -SYSCTL_INT(_net_inet_ip, IPCTL_RTEXPIRE, rtexpire, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_RTEXPIRE, rtexpire, CTLFLAG_RW | CTLFLAG_LOCKED, &rtq_reallyold , 0, "Default expiration time on dynamically learned routes"); static int rtq_minreallyold = 10; /* never automatically crank down to less */ -SYSCTL_INT(_net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire, CTLFLAG_RW | CTLFLAG_LOCKED, &rtq_minreallyold , 0, "Minimum time to attempt to hold onto dynamically learned routes"); static int rtq_toomany = 128; /* 128 cached routes is ``too many'' */ -SYSCTL_INT(_net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW | CTLFLAG_LOCKED, &rtq_toomany , 0, "Upper limit on dynamically learned routes"); #ifdef __APPLE__ @@ -265,12 +280,12 @@ SYSCTL_INT(_net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW, * If for some reason a circular route is needed, turn this sysctl 
(net.inet.ip.check_route_selfref) to zero. */ int check_routeselfref = 1; -SYSCTL_INT(_net_inet_ip, OID_AUTO, check_route_selfref, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, check_route_selfref, CTLFLAG_RW | CTLFLAG_LOCKED, &check_routeselfref , 0, ""); #endif int use_routegenid = 1; -SYSCTL_INT(_net_inet_ip, OID_AUTO, use_route_genid, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, use_route_genid, CTLFLAG_RW | CTLFLAG_LOCKED, &use_routegenid , 0, ""); /* @@ -319,12 +334,12 @@ in_clsroute(struct radix_node *rn, __unused struct radix_node_head *head) RT_LOCK(rt); } } else { - struct timeval timenow; + uint64_t timenow; - getmicrotime(&timenow); + timenow = net_uptime(); rt->rt_flags |= RTPRF_OURS; - rt->rt_rmx.rmx_expire = - rt_expiry(rt, timenow.tv_sec, rtq_reallyold); + rt_setexpire(rt, + rt_expiry(rt, timenow, rtq_reallyold)); } } @@ -334,7 +349,7 @@ struct rtqk_arg { int killed; int found; int updating; - time_t nextstop; + uint64_t nextstop; }; /* @@ -348,16 +363,18 @@ in_rtqkill(struct radix_node *rn, void *rock) struct rtqk_arg *ap = rock; struct rtentry *rt = (struct rtentry *)rn; int err; - struct timeval timenow; + uint64_t timenow; - getmicrotime(&timenow); + timenow = net_uptime(); lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); RT_LOCK(rt); if (rt->rt_flags & RTPRF_OURS) { ap->found++; - if (ap->draining || rt->rt_rmx.rmx_expire <= timenow.tv_sec) { + VERIFY(rt->rt_expire == 0 || rt->rt_rmx.rmx_expire != 0); + VERIFY(rt->rt_expire != 0 || rt->rt_rmx.rmx_expire == 0); + if (ap->draining || rt->rt_expire <= timenow) { if (rt->rt_refcnt > 0) panic("rtqkill route really not free"); @@ -380,13 +397,13 @@ in_rtqkill(struct radix_node *rn, void *rock) } } else { if (ap->updating && - (unsigned)(rt->rt_rmx.rmx_expire - timenow.tv_sec) > + (rt->rt_expire - timenow) > rt_expiry(rt, 0, rtq_reallyold)) { - rt->rt_rmx.rmx_expire = rt_expiry(rt, - timenow.tv_sec, rtq_reallyold); + rt_setexpire(rt, rt_expiry(rt, + timenow, rtq_reallyold)); } ap->nextstop = 
lmin(ap->nextstop, - rt->rt_rmx.rmx_expire); + rt->rt_expire); RT_UNLOCK(rt); } } else { @@ -411,16 +428,16 @@ in_rtqtimo(void *rock) struct radix_node_head *rnh = rock; struct rtqk_arg arg; struct timeval atv; - static time_t last_adjusted_timeout = 0; - struct timeval timenow; + static uint64_t last_adjusted_timeout = 0; + uint64_t timenow; lck_mtx_lock(rnh_lock); /* Get the timestamp after we acquire the lock for better accuracy */ - getmicrotime(&timenow); + timenow = net_uptime(); arg.found = arg.killed = 0; arg.rnh = rnh; - arg.nextstop = timenow.tv_sec + rtq_timeout; + arg.nextstop = timenow + rtq_timeout; arg.draining = arg.updating = 0; rnh->rnh_walktree(rnh, in_rtqkill, &arg); @@ -433,14 +450,14 @@ in_rtqtimo(void *rock) * hard. */ if((arg.found - arg.killed > rtq_toomany) - && (timenow.tv_sec - last_adjusted_timeout >= rtq_timeout) + && ((timenow - last_adjusted_timeout) >= (uint64_t)rtq_timeout) && rtq_reallyold > rtq_minreallyold) { rtq_reallyold = 2*rtq_reallyold / 3; if(rtq_reallyold < rtq_minreallyold) { rtq_reallyold = rtq_minreallyold; } - last_adjusted_timeout = timenow.tv_sec; + last_adjusted_timeout = timenow; #if DIAGNOSTIC log(LOG_DEBUG, "in_rtqtimo: adjusted rtq_reallyold to %d\n", rtq_reallyold); @@ -451,7 +468,7 @@ in_rtqtimo(void *rock) } atv.tv_usec = 0; - atv.tv_sec = arg.nextstop - timenow.tv_sec; + atv.tv_sec = arg.nextstop - timenow; lck_mtx_unlock(rnh_lock); timeout(in_rtqtimo_funnel, rock, tvtohz(&atv)); } @@ -557,8 +574,13 @@ in_ifadown(struct ifaddr *ifa, int delete) lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); + /* + * Holding rnh_lock here prevents the possibility of + * ifa from changing (e.g. in_ifinit), so it is safe + * to access its ifa_addr without locking. 
+ */ if (ifa->ifa_addr->sa_family != AF_INET) - return 1; + return (1); /* trigger route cache reevaluation */ if (use_routegenid) @@ -568,6 +590,8 @@ in_ifadown(struct ifaddr *ifa, int delete) arg.ifa = ifa; arg.del = delete; rnh->rnh_walktree(rnh, in_ifadownkill, &arg); + IFA_LOCK_SPIN(ifa); ifa->ifa_flags &= ~IFA_ROUTE; - return 0; + IFA_UNLOCK(ifa); + return (0); } diff --git a/bsd/netinet/in_tclass.c b/bsd/netinet/in_tclass.c new file mode 100644 index 000000000..54b5fcc1d --- /dev/null +++ b/bsd/netinet/in_tclass.c @@ -0,0 +1,850 @@ +/* + * Copyright (c) 2009-2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern char *proc_name_address(void *p); + +static int tfp_count = 0; + +static TAILQ_HEAD(, tclass_for_proc) tfp_head = TAILQ_HEAD_INITIALIZER(tfp_head); + +struct tclass_for_proc { + TAILQ_ENTRY(tclass_for_proc) tfp_link; + int tfp_class; + pid_t tfp_pid; + char tfp_pname[MAXCOMLEN + 1]; +}; + +extern void tcp_set_background_cc(struct socket *); +extern void tcp_set_foreground_cc(struct socket *); + +int dscp_code_from_mbuf_tclass(int ); + +static int get_pid_tclass(pid_t , int *); +static int get_pname_tclass(const char * , int *); +static int set_pid_tclass(pid_t , int ); +static int set_pname_tclass(const char * , int ); +static int purge_tclass_for_proc(void); +static int flush_tclass_for_proc(void); + + +static lck_grp_attr_t *tclass_lck_grp_attr = NULL; /* mutex group attributes */ +static lck_grp_t *tclass_lck_grp = NULL; /* mutex group definition */ +static lck_attr_t *tclass_lck_attr = NULL; /* mutex attributes */ +static lck_mtx_t *tclass_lock = NULL; + +/* + * Must be called with tclass_lock held + */ +static struct tclass_for_proc * +find_tfp_by_pid(pid_t pid) +{ + struct tclass_for_proc *tfp; + + TAILQ_FOREACH(tfp, &tfp_head, tfp_link) { + if (tfp->tfp_pid == pid) + break; + } + return tfp; +} + +/* + * Must be called with tclass_lock held + */ +static struct tclass_for_proc * +find_tfp_by_pname(const char *pname) +{ + struct tclass_for_proc *tfp; + + TAILQ_FOREACH(tfp, &tfp_head, tfp_link) { + if (strncmp(pname, tfp->tfp_pname, sizeof(tfp->tfp_pname)) == 0) + break; + } + return tfp; +} + +static int +get_tclass_for_curr_proc(void) +{ + struct tclass_for_proc *tfp; + int sotc = SO_TC_BE; + proc_t p = current_proc(); /* Not 
ref counted */ + pid_t pid = proc_pid(p); + char *pname = proc_name_address(p); + + lck_mtx_lock(tclass_lock); + + TAILQ_FOREACH(tfp, &tfp_head, tfp_link) { + if ((tfp->tfp_pid == pid) || + (tfp->tfp_pid == -1 && strncmp(pname, tfp->tfp_pname, sizeof(tfp->tfp_pname)) == 0)) { + sotc = tfp->tfp_class; + break; + } + } + + lck_mtx_unlock(tclass_lock); + + return sotc; +} + +/* + * Purge entries with PIDs of exited processes + */ +int +purge_tclass_for_proc(void) +{ + int error = 0; + struct tclass_for_proc *tfp, *tvar; + + lck_mtx_lock(tclass_lock); + + TAILQ_FOREACH_SAFE(tfp, &tfp_head, tfp_link, tvar) { + proc_t p; + + if (tfp->tfp_pid == -1) + continue; + if ((p = proc_find(tfp->tfp_pid)) == NULL) { + tfp_count--; + TAILQ_REMOVE(&tfp_head, tfp, tfp_link); + + _FREE(tfp, M_TEMP); + } else { + proc_rele(p); + } + } + + lck_mtx_unlock(tclass_lock); + + return error; +} + +/* + * Remove one entry + * Must be called with tclass_lock held + */ +static void +free_tclass_for_proc(struct tclass_for_proc *tfp) +{ + if (tfp == NULL) + return; + tfp_count--; + TAILQ_REMOVE(&tfp_head, tfp, tfp_link); + _FREE(tfp, M_TEMP); +} + +/* + * Remove all entries + */ +int +flush_tclass_for_proc(void) +{ + int error = 0; + struct tclass_for_proc *tfp, *tvar; + + lck_mtx_lock(tclass_lock); + + TAILQ_FOREACH_SAFE(tfp, &tfp_head, tfp_link, tvar) { + free_tclass_for_proc(tfp); + } + + lck_mtx_unlock(tclass_lock); + + return error; + +} + +/* + * Must be called with tclass_lock held + */ +static struct tclass_for_proc * +alloc_tclass_for_proc(pid_t pid, const char *pname, int tclass) +{ + struct tclass_for_proc *tfp; + + if (pid == -1 && pname == NULL) + return NULL; + + tfp = _MALLOC(sizeof(struct tclass_for_proc), M_TEMP, M_NOWAIT | M_ZERO); + if (tfp == NULL) + return NULL; + + tfp->tfp_pid = pid; + tfp->tfp_class = tclass; + /* + * Add per pid entries before per proc name so we can find + * a specific instance of a process before the general name base entry. 
+ */ + if (pid != -1) { + TAILQ_INSERT_HEAD(&tfp_head, tfp, tfp_link); + } else { + strlcpy(tfp->tfp_pname, pname, sizeof(tfp->tfp_pname)); + TAILQ_INSERT_TAIL(&tfp_head, tfp, tfp_link); + } + + tfp_count++; + + return tfp; +} + +/* + * -1 for tclass means to remove the entry + */ +int +set_pid_tclass(pid_t pid, int tclass) +{ + int error = EINVAL; + proc_t p = NULL; + struct filedesc *fdp; + struct fileproc *fp; + struct tclass_for_proc *tfp; + int i; + + p = proc_find(pid); + if (p == NULL) { + printf("set_pid_tclass proc_find(%d) \n", pid); + goto done; + } + + /* Need a tfp */ + lck_mtx_lock(tclass_lock); + + tfp = find_tfp_by_pid(pid); + if (tclass == -1) { + if (tfp != NULL) { + free_tclass_for_proc(tfp); + error = 0; + } + lck_mtx_unlock(tclass_lock); + goto done; + } else { + if (tfp == NULL) { + tfp = alloc_tclass_for_proc(pid, NULL, tclass); + if (tfp == NULL) { + lck_mtx_unlock(tclass_lock); + error = ENOBUFS; + goto done; + } + } else { + tfp->tfp_class = tclass; + } + } + lck_mtx_unlock(tclass_lock); + + if (tfp != NULL) { + proc_fdlock(p); + + fdp = p->p_fd; + for (i = 0; i < fdp->fd_nfiles; i++) { + struct socket *so; + + fp = fdp->fd_ofiles[i]; + if (fp == NULL || (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 || + fp->f_fglob->fg_type != DTYPE_SOCKET) + continue; + + so = (struct socket *)fp->f_fglob->fg_data; + if (so->so_proto->pr_domain->dom_family != AF_INET && + so->so_proto->pr_domain->dom_family != AF_INET6) + continue; + socket_lock(so, 1); + error = so_set_traffic_class(so, tclass != -1 ? 
tclass : SO_TC_BE); + socket_unlock(so, 1); + if (error != 0) { + printf("set_pid_tclass so_set_traffic_class(%p, %d) failed %d\n", so, tclass, error); + error = 0; + } + } + + proc_fdunlock(p); + } + + error = 0; +done: + if (p != NULL) + proc_rele(p); + + return error; +} + +int +set_pname_tclass(const char *pname, int tclass) +{ + int error = EINVAL; + struct tclass_for_proc *tfp; + + lck_mtx_lock(tclass_lock); + + tfp = find_tfp_by_pname(pname); + if (tclass == -1) { + if (tfp != NULL) + free_tclass_for_proc(tfp); + } else { + if (tfp == NULL) { + tfp = alloc_tclass_for_proc(-1, pname, tclass); + if (tfp == NULL) { + lck_mtx_unlock(tclass_lock); + error = ENOBUFS; + goto done; + } + } else { + tfp->tfp_class = tclass; + } + } + lck_mtx_unlock(tclass_lock); + + error = 0; +done: + + return error; +} + +int +get_pid_tclass(pid_t pid, int *tclass) +{ + int error = EINVAL; + proc_t p = NULL; + struct tclass_for_proc *tfp; + + *tclass = -1; /* Means not set */ + + p = proc_find(pid); + if (p == NULL) { + printf("get_pid_tclass proc_find(%d) \n", pid); + goto done; + } + + /* Need a tfp */ + lck_mtx_lock(tclass_lock); + + tfp = find_tfp_by_pid(pid); + if (tfp != NULL) { + *tclass = tfp->tfp_class ; + error = 0; + } + lck_mtx_unlock(tclass_lock); +done: + if (p != NULL) + proc_rele(p); + + return error; +} + +int +get_pname_tclass(const char *pname, int *tclass) +{ + int error = EINVAL; + struct tclass_for_proc *tfp; + + *tclass = -1; /* Means not set */ + + /* Need a tfp */ + lck_mtx_lock(tclass_lock); + + tfp = find_tfp_by_pname(pname); + if (tfp != NULL) { + *tclass = tfp->tfp_class ; + error = 0; + } + lck_mtx_unlock(tclass_lock); + + return error; +} + + + +/* + * Setting options requires privileges + */ +__private_extern__ int +so_set_tcdbg(struct socket *so, struct so_tcdbg *so_tcdbg) +{ + int error = 0; + + if ((so->so_state & SS_PRIV) == 0) + return EPERM; + + socket_unlock(so, 0); + + switch (so_tcdbg->so_tcdbg_cmd) { + case SO_TCDBG_PID: + error = 
set_pid_tclass(so_tcdbg->so_tcdbg_pid, so_tcdbg->so_tcdbg_tclass); + break; + + case SO_TCDBG_PNAME: + error = set_pname_tclass(so_tcdbg->so_tcdbg_pname, so_tcdbg->so_tcdbg_tclass); + break; + + case SO_TCDBG_PURGE: + error = purge_tclass_for_proc(); + break; + + case SO_TCDBG_FLUSH: + error = flush_tclass_for_proc(); + break; + + default: + error = EINVAL; + break; + + } + + socket_lock(so, 0); + + return error; +} + +/* + * Not required to be privileged to get + */ +__private_extern__ int +sogetopt_tcdbg(struct socket *so, struct sockopt *sopt) +{ + int error = 0; + struct so_tcdbg so_tcdbg; + void *buf = NULL; + size_t len = sopt->sopt_valsize; + + error = sooptcopyin(sopt, &so_tcdbg, sizeof(struct so_tcdbg), sizeof(struct so_tcdbg)); + if (error != 0) + return error; + + sopt->sopt_valsize = len; + + socket_unlock(so, 0); + + switch (so_tcdbg.so_tcdbg_cmd) { + case SO_TCDBG_PID: + error = get_pid_tclass(so_tcdbg.so_tcdbg_pid, &so_tcdbg.so_tcdbg_tclass); + break; + + case SO_TCDBG_PNAME: + error = get_pname_tclass(so_tcdbg.so_tcdbg_pname, &so_tcdbg.so_tcdbg_tclass); + break; + + case SO_TCDBG_COUNT: + lck_mtx_lock(tclass_lock); + so_tcdbg.so_tcdbg_count = tfp_count; + lck_mtx_unlock(tclass_lock); + break; + + case SO_TCDBG_LIST: { + struct tclass_for_proc *tfp; + int n, alloc_count; + struct so_tcdbg *ptr; + + lck_mtx_lock(tclass_lock); + if ((alloc_count = tfp_count) == 0) { + lck_mtx_unlock(tclass_lock); + error = EINVAL; + break; + } + len = alloc_count * sizeof(struct so_tcdbg); + lck_mtx_unlock(tclass_lock); + + buf = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO); + if (buf == NULL) { + error = ENOBUFS; + break; + } + + lck_mtx_lock(tclass_lock); + n = 0; + ptr = (struct so_tcdbg *)buf; + TAILQ_FOREACH(tfp, &tfp_head, tfp_link) { + if (++n > alloc_count) + break; + if (tfp->tfp_pid != -1) { + ptr->so_tcdbg_cmd = SO_TCDBG_PID; + ptr->so_tcdbg_pid = tfp->tfp_pid; + } else { + ptr->so_tcdbg_cmd = SO_TCDBG_PNAME; + ptr->so_tcdbg_pid = -1; + 
strlcpy(ptr->so_tcdbg_pname, tfp->tfp_pname, sizeof(ptr->so_tcdbg_pname)); + } + ptr->so_tcdbg_tclass = tfp->tfp_class; + ptr++; + } + + lck_mtx_unlock(tclass_lock); + } + break; + + default: + error = EINVAL; + break; + + } + + socket_lock(so, 0); + + if (error == 0) { + if (buf == NULL) { + error = sooptcopyout(sopt, &so_tcdbg, sizeof(struct so_tcdbg)); + } else { + error = sooptcopyout(sopt, buf, len); + _FREE(buf, M_TEMP); + } + } + return error; +} + + +__private_extern__ int +so_set_traffic_class(struct socket *so, int optval) +{ + int error = 0; + + if (optval < SO_TC_BE || optval > SO_TC_VO) { + error = EINVAL; + } else { + so->so_traffic_class = optval; + + if ((INP_SOCKAF(so) == AF_INET || INP_SOCKAF(so) == AF_INET6) && + INP_SOCKTYPE(so) == SOCK_STREAM) { + set_tcp_stream_priority(so); + } + } + return error; +} + +__private_extern__ void +so_set_default_traffic_class(struct socket *so) +{ + int sotc = SO_TC_BE; + + if (tfp_count > 0 && (INP_SOCKAF(so) == AF_INET || INP_SOCKAF(so) == AF_INET6)) { + sotc = get_tclass_for_curr_proc(); + } + + so->so_traffic_class = sotc; + + return; +} + + +__private_extern__ int +mbuf_traffic_class_from_control(struct mbuf *control) +{ + struct cmsghdr *cm; + + for (cm = M_FIRST_CMSGHDR(control); + cm != NULL; + cm = M_NXT_CMSGHDR(control, cm)) { + int tc; + + if (cm->cmsg_len < sizeof(struct cmsghdr)) + break; + + if (cm->cmsg_level != SOL_SOCKET || + cm->cmsg_type != SO_TRAFFIC_CLASS) + continue; + if (cm->cmsg_len != CMSG_LEN(sizeof(int))) + continue; + + tc = *(int *)CMSG_DATA(cm); + + switch (tc) { + case SO_TC_BE: + return MBUF_TC_BE; + case SO_TC_BK: + return MBUF_TC_BK; + case SO_TC_VI: + return MBUF_TC_VI; + case SO_TC_VO: + return MBUF_TC_VO; + default: + break; + } + } + + return MBUF_TC_UNSPEC; +} + +__private_extern__ int +dscp_code_from_mbuf_tclass(int mtc) +{ + int dscp_code; + + switch (mtc) { + default: + case MBUF_TC_BE: + dscp_code = 0; + break; + case MBUF_TC_BK: + dscp_code = 0x08; + break; + case 
MBUF_TC_VI: + dscp_code = 0x20; + break; + case MBUF_TC_VO: + dscp_code = 0x30; + break; + } + + return dscp_code; +} + +__private_extern__ void +so_recv_data_stat(struct socket *so, struct mbuf *m, size_t off) +{ + uint32_t sotc = m->m_pkthdr.prio; + + if (sotc >= SO_TC_STATS_MAX) + sotc = SO_TC_BE; + + so->so_tc_stats[sotc].rxpackets += 1; + so->so_tc_stats[sotc].rxbytes += ((m->m_flags & M_PKTHDR) ? m->m_pkthdr.len : 0) + off; + + return; +} + +__private_extern__ void +set_tcp_stream_priority(struct socket *so) +{ + struct tcpcb *tp = intotcpcb(sotoinpcb(so)); + + /* If the socket was marked as a background socket or if the + * traffic class is set to background with traffic class socket + * option then make both send and recv side of the stream to be + * background. The variable sotcdb which can be set with sysctl + * is used to disable these settings for testing. + */ + if (soisbackground(so) || so->so_traffic_class == SO_TC_BK) { + if ((sotcdb & SOTCDB_NO_SENDTCPBG) != 0) { + if (tp->tcp_cc_index == TCP_CC_ALGO_BACKGROUND_INDEX) + tcp_set_foreground_cc(so); + } else { + if (tp->tcp_cc_index != TCP_CC_ALGO_BACKGROUND_INDEX) + tcp_set_background_cc(so); + } + + /* Set receive side background flags */ + if ((sotcdb & SOTCDB_NO_RECVTCPBG) != 0) { + so->so_traffic_mgt_flags &= ~(TRAFFIC_MGT_TCP_RECVBG); + } else { + so->so_traffic_mgt_flags |= TRAFFIC_MGT_TCP_RECVBG; + } + } else { + so->so_traffic_mgt_flags &= ~(TRAFFIC_MGT_TCP_RECVBG); + if (tp->tcp_cc_index == TCP_CC_ALGO_BACKGROUND_INDEX) + tcp_set_foreground_cc(so); + } + return; +} + +/* + * Set traffic class to an IPv4 or IPv6 packet + * - mark the mbuf + * - set the DSCP code following the WMM mapping + */ +__private_extern__ void +set_packet_tclass(struct mbuf *m, struct socket *so, int in_mtc, int isipv6) +{ + int mtc = MBUF_TC_BE; /* Best effort by default */ + struct inpcb *inp = sotoinpcb(so); /* in6pcb and inpcb are the same */ + struct ip *ip = mtod(m, struct ip *); +#if INET6 + struct ip6_hdr *ip6 
= mtod(m, struct ip6_hdr *); +#endif /* INET6 */ + + if (!(m->m_flags & M_PKTHDR)) + return; + + /* + * Here is the precedence: + * 1) TRAFFIC_MGT_SO_BACKGROUND trumps all + * 2) Traffic class passed via ancillary data to sendmsdg(2) + * 3) Traffic class socket option last + */ + if (soisbackground(so)) { + mtc = MBUF_TC_BK; + } else if (in_mtc != MBUF_TC_UNSPEC) { + if (in_mtc >= MBUF_TC_BE && in_mtc <= MBUF_TC_VO) + mtc = in_mtc; + } else { + switch (so->so_traffic_class) { + case SO_TC_BE: + mtc = MBUF_TC_BE; + break; + case SO_TC_BK: + mtc = MBUF_TC_BK; + break; + case SO_TC_VI: + mtc = MBUF_TC_VI; + break; + case SO_TC_VO: + mtc = MBUF_TC_VO; + break; + default: + break; + } + } + + /* + * Set the traffic class in the mbuf packet header prio field + */ + if ((sotcdb & SOTCDB_NO_MTC)) + goto no_mbtc; + m->m_pkthdr.prio = mtc; + +no_mbtc: + /* + * Quick exit when best effort + */ + if (mtc == MBUF_TC_BE) + goto no_dscp; + /* + * Now let set the DSCP code in IPv4 or IPv6 header + * By default do this only for local traffic if a code is not already set + */ + if ((sotcdb & SOTCDB_NO_DSCP)) + goto no_dscp; + + /* + * Test if a IP TOS or IPV6 TCLASS has already been set on the socket or the raw packet + */ + if ((sotcdb & SOTCDB_NO_DSCPTST) == 0) { +#if INET6 + if (isipv6) + { + if ((so->so_type == SOCK_RAW && (ip6->ip6_flow & htonl(0xff << 20)) != 0) || + (inp->in6p_outputopts && inp->in6p_outputopts->ip6po_tclass != -1)) + goto no_dscp; + } + else +#endif /* INET6 */ + { + if ((so->so_type == SOCK_RAW && (inp->inp_flags & INP_HDRINCL)) || + inp->inp_ip_tos != 0) + goto no_dscp; + } + } + + /* + * Test if destination is local + */ + if ((sotcdb & SOTCDB_NO_LCLTST) == 0) { + int islocal = 0; + struct route *ro = &inp->inp_route; + + if (so->so_type == SOCK_STREAM) { + struct tcpcb *tp = intotcpcb(inp); + + if ((tp->t_flags & TF_LOCAL)) + islocal = 1; + } + else +#if INET6 + if (isipv6) + { + if ((ro != NULL && ro->ro_rt != NULL && + (ro->ro_rt->rt_gateway->sa_family 
== AF_LINK || + (ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK))) || + in6addr_local(&ip6->ip6_dst)) + islocal = 1; + } + else +#endif /* INET6 */ + { + if ((ro != NULL && ro->ro_rt != NULL && + (ro->ro_rt->rt_gateway->sa_family == AF_LINK || + (ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK))) || + inaddr_local(ip->ip_dst)) + islocal = 1; + } + if (islocal == 0) + goto no_dscp; + } + +#if INET6 + if (isipv6) + ip6->ip6_flow |= + htonl(dscp_code_from_mbuf_tclass(m->m_pkthdr.prio) << 20); + else +#endif /* INET6 */ + ip->ip_tos |= dscp_code_from_mbuf_tclass(m->m_pkthdr.prio) << 2; + +no_dscp: + /* + * For TCP with background traffic class switch CC algo based on sysctl + */ + if (so->so_type == SOCK_STREAM) { + set_tcp_stream_priority(so); + } + + /* + * Assume socket and mbuf traffic class values are the same + * Also assume the socket lock is held + */ + so->so_tc_stats[mtc].txpackets += 1; + so->so_tc_stats[mtc].txbytes += m->m_pkthdr.len; + + return; +} + +__private_extern__ void +socket_tclass_init(void) +{ + tclass_lck_grp_attr = lck_grp_attr_alloc_init(); + tclass_lck_grp = lck_grp_alloc_init("tclass", tclass_lck_grp_attr); + tclass_lck_attr = lck_attr_alloc_init(); + if ((tclass_lock = lck_mtx_alloc_init(tclass_lck_grp, tclass_lck_attr)) == NULL) { + panic("failed to allocate memory for tclass\n"); + } +} + + diff --git a/bsd/netinet/in_var.h b/bsd/netinet/in_var.h index df7a968af..0b5d373de 100644 --- a/bsd/netinet/in_var.h +++ b/bsd/netinet/in_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -70,7 +70,7 @@ #include #endif -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #include /* @@ -96,7 +96,7 @@ struct in_ifaddr { struct sockaddr_in ia_sockmask; /* reserve space for general netmask */ TAILQ_ENTRY(in_ifaddr) ia_hash; /* hash bucket entry */ }; -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ struct in_aliasreq { char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */ @@ -155,9 +155,10 @@ struct kev_in_portinuse { #define KEV_INET_PORTINUSE 8 /* use ken_in_portinuse */ #endif -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #include #include +#include /* * Given a pointer to an in_ifaddr (ifaddr), * return a pointer to the addr as a sockaddr_in. @@ -195,9 +196,14 @@ extern int apple_hwcksum_rx; struct in_ifaddr *ia; \ \ lck_rw_lock_shared(in_ifaddr_rwlock); \ - TAILQ_FOREACH(ia, INADDR_HASH((addr).s_addr), ia_hash) \ - if (IA_SIN(ia)->sin_addr.s_addr == (addr).s_addr) \ + TAILQ_FOREACH(ia, INADDR_HASH((addr).s_addr), ia_hash) { \ + IFA_LOCK_SPIN(&ia->ia_ifa); \ + if (IA_SIN(ia)->sin_addr.s_addr == (addr).s_addr) { \ + IFA_UNLOCK(&ia->ia_ifa); \ break; \ + } \ + IFA_UNLOCK(&ia->ia_ifa); \ + } \ (ifp) = (ia == NULL) ? NULL : ia->ia_ifp; \ lck_rw_done(in_ifaddr_rwlock); \ } @@ -217,7 +223,7 @@ extern int apple_hwcksum_rx; (ia) = TAILQ_NEXT((ia), ia_link)) \ continue; \ if ((ia) != NULL) \ - ifaref(&(ia)->ia_ifa); \ + IFA_ADDREF(&(ia)->ia_ifa); \ lck_rw_done(in_ifaddr_rwlock); \ } @@ -226,31 +232,153 @@ extern int apple_hwcksum_rx; * to change that - as it might break a number of things */ +/* + * Legacy IPv4 IGMP per-link structure. + */ struct router_info { struct ifnet *rti_ifp; int rti_type; /* type of router which is querier on this interface */ int rti_time; /* # of slow timeouts since last old query */ - struct router_info *rti_next; + SLIST_ENTRY(router_info) rti_list; +}; + +/* + * IPv4 multicast IGMP-layer source entry. 
+ */ +struct ip_msource { + RB_ENTRY(ip_msource) ims_link; /* RB tree links */ + in_addr_t ims_haddr; /* host byte order */ + struct ims_st { + uint16_t ex; /* # of exclusive members */ + uint16_t in; /* # of inclusive members */ + } ims_st[2]; /* state at t0, t1 */ + uint8_t ims_stp; /* pending query */ +}; + +/* + * IPv4 multicast PCB-layer source entry. + */ +struct in_msource { + RB_ENTRY(ip_msource) ims_link; /* RB tree links */ + in_addr_t ims_haddr; /* host byte order */ + uint8_t imsl_st[2]; /* state before/at commit */ }; +RB_HEAD(ip_msource_tree, ip_msource); /* define struct ip_msource_tree */ + +RB_PROTOTYPE_SC_PREV(__private_extern__, ip_msource_tree, ip_msource, + ims_link, ip_msource_cmp); + +/* + * IPv4 multicast PCB-layer group filter descriptor. + */ +struct in_mfilter { + struct ip_msource_tree imf_sources; /* source list for (S,G) */ + u_long imf_nsrc; /* # of source entries */ + uint8_t imf_st[2]; /* state before/at commit */ +}; + +struct igmp_ifinfo; + /* - * Internet multicast address structure. There is one of these for each IP - * multicast group to which this host belongs on a given network interface. - * For every entry on the interface's if_multiaddrs list which represents - * an IP multicast group, there is one of these structures. They are also - * kept on a system-wide list to make it easier to keep our legacy IGMP code - * compatible with the rest of the world (see IN_FIRST_MULTI et al, below). + * IPv4 group descriptor. + * + * For every entry on an ifnet's if_multiaddrs list which represents + * an IP multicast group, there is one of these structures. + * + * If any source filters are present, then a node will exist in the RB-tree + * to permit fast lookup by source whenever an operation takes place. + * This permits pre-order traversal when we issue reports. + * Source filter trees are kept separately from the socket layer to + * greatly simplify locking. 
+ * + * When IGMPv3 is active, inm_timer is the response to group query timer. + * The state-change timer inm_sctimer is separate; whenever state changes + * for the group the state change record is generated and transmitted, + * and kept if retransmissions are necessary. + * + * FUTURE: inm_link is now only used when groups are being purged + * on a detaching ifnet. It could be demoted to a SLIST_ENTRY, but + * because it is at the very start of the struct, we can't do this + * w/o breaking the ABI for ifmcstat. */ struct in_multi { + decl_lck_mtx_data(, inm_lock); + u_int32_t inm_refcount; /* reference count */ + u_int32_t inm_reqcnt; /* request count for this address */ + u_int32_t inm_debug; /* see ifa_debug flags */ LIST_ENTRY(in_multi) inm_link; /* queue macro glue */ struct in_addr inm_addr; /* IP multicast address, convenience */ struct ifnet *inm_ifp; /* back pointer to ifnet */ struct ifmultiaddr *inm_ifma; /* back pointer to ifmultiaddr */ - u_int inm_timer; /* IGMP membership report timer */ + u_int inm_timer; /* IGMPv1/v2 group / v3 query timer */ u_int inm_state; /* state of the membership */ - struct router_info *inm_rti; /* router info*/ + void *inm_rti; /* unused, legacy field */ + + /* New fields for IGMPv3 follow. */ + struct igmp_ifinfo *inm_igi; /* IGMP info */ + SLIST_ENTRY(in_multi) inm_nrele; /* to-be-released by IGMP */ + u_int32_t inm_nrelecnt; /* deferred release count */ + struct ip_msource_tree inm_srcs; /* tree of sources */ + u_long inm_nsrc; /* # of tree entries */ + + struct ifqueue inm_scq; /* queue of pending + * state-change packets */ + struct timeval inm_lastgsrtv; /* Time of last G-S-R query */ + uint16_t inm_sctimer; /* state-change timer */ + uint16_t inm_scrv; /* state-change rexmit count */ + + /* + * SSM state counters which track state at T0 (the time the last + * state-change report's RV timer went to zero) and T1 + * (time of pending report, i.e. now). + * Used for computing IGMPv3 state-change reports. 
Several refcounts + * are maintained here to optimize for common use-cases. + */ + struct inm_st { + uint16_t iss_fmode; /* IGMP filter mode */ + uint16_t iss_asm; /* # of ASM listeners */ + uint16_t iss_ex; /* # of exclusive members */ + uint16_t iss_in; /* # of inclusive members */ + uint16_t iss_rec; /* # of recorded sources */ + } inm_st[2]; /* state at t0, t1 */ + + void (*inm_trace) /* callback fn for tracing refs */ + (struct in_multi *, int); }; +#define INM_LOCK_ASSERT_HELD(_inm) \ + lck_mtx_assert(&(_inm)->inm_lock, LCK_MTX_ASSERT_OWNED) + +#define INM_LOCK_ASSERT_NOTHELD(_inm) \ + lck_mtx_assert(&(_inm)->inm_lock, LCK_MTX_ASSERT_NOTOWNED) + +#define INM_LOCK(_inm) \ + lck_mtx_lock(&(_inm)->inm_lock) + +#define INM_LOCK_SPIN(_inm) \ + lck_mtx_lock_spin(&(_inm)->inm_lock) + +#define INM_CONVERT_LOCK(_inm) do { \ + INM_LOCK_ASSERT_HELD(_inm); \ + lck_mtx_convert_spin(&(_inm)->inm_lock); \ +} while (0) + +#define INM_UNLOCK(_inm) \ + lck_mtx_unlock(&(_inm)->inm_lock) + +#define INM_ADDREF(_inm) \ + inm_addref(_inm, 0) + +#define INM_ADDREF_LOCKED(_inm) \ + inm_addref(_inm, 1) + +#define INM_REMREF(_inm) \ + inm_remref(_inm, 0) + +#define INM_REMREF_LOCKED(_inm) \ + inm_remref(_inm, 1) + #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet_ip); SYSCTL_DECL(_net_inet_raw); @@ -269,22 +397,36 @@ struct in_multistep { /* * Macro for looking up the in_multi record for a given IP multicast address * on a given interface. If no matching record is found, "inm" is set null. + * + * We do this differently compared other BSD implementations; instead of + * walking the if_multiaddrs list at the interface and returning the + * ifma_protospec value of a matching entry, we search the global list + * of in_multi records and find it that way. Otherwise either the two + * structures (in_multi, ifmultiaddr) need to be ref counted both ways, + * which will make things too complicated, or they need to reside in the + * same protected domain, which they aren't. 
+ * + * Must be called with in_multihead_lock held. */ -#define IN_LOOKUP_MULTI(addr, ifp, inm) \ - /* struct in_addr addr; */ \ - /* struct ifnet *ifp; */ \ - /* struct in_multi *inm; */ \ -do { \ - struct ifmultiaddr *ifma; \ -\ - LIST_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) { \ - if (ifma->ifma_addr->sa_family == AF_INET \ - && ((struct sockaddr_in *)ifma->ifma_addr)->sin_addr.s_addr == \ - (addr).s_addr) \ - break; \ - } \ - (inm) = ifma ? ifma->ifma_protospec : NULL; \ -} while(0) +#define IN_LOOKUP_MULTI(addr, ifp, inm) \ + /* struct in_addr *addr; */ \ + /* struct ifnet *ifp; */ \ + /* struct in_multi *inm; */ \ +do { \ + struct in_multistep _step; \ + IN_FIRST_MULTI(_step, inm); \ + while ((inm) != NULL) { \ + INM_LOCK_SPIN(inm); \ + if ((inm)->inm_ifp == (ifp) && \ + (inm)->inm_addr.s_addr == (addr)->s_addr) { \ + INM_ADDREF_LOCKED(inm); \ + INM_UNLOCK(inm); \ + break; \ + } \ + INM_UNLOCK(inm); \ + IN_NEXT_MULTI(_step, inm); \ + } \ +} while (0) /* * Macro to step through all of the in_multi records, one at a time. @@ -292,28 +434,57 @@ do { \ * provide. IN_FIRST_MULTI(), below, must be called to initialize "step" * and get the first record. Both macros return a NULL "inm" when there * are no remaining records. + * + * Must be called with in_multihead_lock held. 
*/ -#define IN_NEXT_MULTI(step, inm) \ - /* struct in_multistep step; */ \ - /* struct in_multi *inm; */ \ -do { \ - if (((inm) = (step).i_inm) != NULL) \ - (step).i_inm = LIST_NEXT((step).i_inm, inm_link); \ -} while(0) - -#define IN_FIRST_MULTI(step, inm) \ - /* struct in_multistep step; */ \ - /* struct in_multi *inm; */ \ -do { \ - (step).i_inm = LIST_FIRST(&in_multihead); \ - IN_NEXT_MULTI((step), (inm)); \ -} while(0) +#define IN_NEXT_MULTI(step, inm) \ + /* struct in_multistep step; */ \ + /* struct in_multi *inm; */ \ +do { \ + in_multihead_lock_assert(LCK_RW_ASSERT_HELD); \ + if (((inm) = (step).i_inm) != NULL) \ + (step).i_inm = LIST_NEXT((step).i_inm, inm_link); \ +} while (0) + +#define IN_FIRST_MULTI(step, inm) \ + /* struct in_multistep step; */ \ + /* struct in_multi *inm; */ \ +do { \ + in_multihead_lock_assert(LCK_RW_ASSERT_HELD); \ + (step).i_inm = LIST_FIRST(&in_multihead); \ + IN_NEXT_MULTI((step), (inm)); \ +} while (0) struct route; +struct ip_moptions; + +/* + * Return values for imo_multi_filter(). 
+ */ +#define MCAST_PASS 0 /* Pass */ +#define MCAST_NOTGMEMBER 1 /* This host not a member of group */ +#define MCAST_NOTSMEMBER 2 /* This host excluded source */ +#define MCAST_MUTED 3 /* [deprecated] */ extern void in_ifaddr_init(void); +extern int imo_multi_filter(const struct ip_moptions *, const struct ifnet *, + const struct sockaddr *, const struct sockaddr *); +extern int imo_clone(struct ip_moptions *, struct ip_moptions *); +extern void inm_commit(struct in_multi *); +extern void inm_clear_recorded(struct in_multi *); +extern void inm_print(const struct in_multi *); +extern int inm_record_source(struct in_multi *inm, const in_addr_t); +extern void inm_release(struct in_multi *); +extern void in_multi_init(void); extern struct in_multi *in_addmulti(struct in_addr *, struct ifnet *); -extern void in_delmulti(struct in_multi **); +extern void in_delmulti(struct in_multi *); +extern int in_leavegroup(struct in_multi *, /*const*/ struct in_mfilter *); +extern int in_multi_detach(struct in_multi *); +extern void inm_addref(struct in_multi *, int); +extern void inm_remref(struct in_multi *, int); +extern void inm_purge(struct in_multi *); +extern uint8_t ims_get_mode(const struct in_multi *, + const struct ip_msource *, uint8_t); extern int in_control(struct socket *, u_long, caddr_t, struct ifnet *, struct proc *); extern void in_rtqdrain(void); @@ -321,14 +492,20 @@ extern struct radix_node *in_validate(struct radix_node *); extern void ip_input(struct mbuf *); extern int in_ifadown(struct ifaddr *ifa, int); extern void in_ifscrub(struct ifnet *, struct in_ifaddr *, int); -extern int ipflow_fastforward(struct mbuf *); -#if IPFLOW -extern void ipflow_create(const struct route *, struct mbuf *); -extern void ipflow_slowtimo(void); -#endif /* IPFLOW */ extern u_int32_t inaddr_hashval(u_int32_t); - -#endif /* KERNEL_PRIVATE */ +extern void in_purgeaddrs(struct ifnet *); +extern void imf_leave(struct in_mfilter *); +extern void imf_purge(struct in_mfilter *); + 
+struct inpcb; + +__private_extern__ int inp_join_group(struct inpcb *, struct sockopt *); +__private_extern__ int inp_leave_group(struct inpcb *, struct sockopt *); +__private_extern__ void in_multihead_lock_exclusive(void); +__private_extern__ void in_multihead_lock_shared(void); +__private_extern__ void in_multihead_lock_assert(int); +__private_extern__ void in_multihead_lock_done(void); +#endif /* XNU_KERNEL_PRIVATE */ /* INET6 stuff */ #include diff --git a/bsd/netinet/ip6.h b/bsd/netinet/ip6.h index 203e86a64..a740ddc49 100644 --- a/bsd/netinet/ip6.h +++ b/bsd/netinet/ip6.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -170,7 +170,10 @@ struct ip6_dest { #define IP6OPT_JUMBO 0xC2 /* 11 0 00010 = 194 */ #define IP6OPT_NSAP_ADDR 0xC3 /* 11 0 00011 */ #define IP6OPT_TUNNEL_LIMIT 0x04 /* 00 0 00100 */ +#ifndef KERNEL_PRIVATE #define IP6OPT_RTALERT 0x05 /* 00 0 00101 (KAME definition) */ +#endif +#define IP6OPT_ROUTER_ALERT 0x05 /* 00 0 00101 (RFC3542, recommended) */ #define IP6OPT_RTALERT_LEN 4 #define IP6OPT_RTALERT_MLD 0 /* Datagram contains an MLD message */ @@ -178,10 +181,6 @@ struct ip6_dest { #define IP6OPT_RTALERT_ACTNET 2 /* contains an Active Networks msg */ #define IP6OPT_MINLEN 2 -#define IP6OPT_BINDING_UPDATE 0xc6 /* 11 0 00110 */ -#define IP6OPT_BINDING_ACK 0x07 /* 00 0 00111 */ -#define IP6OPT_BINDING_REQ 0x08 /* 00 0 01000 */ -#define IP6OPT_HOME_ADDRESS 0xc9 /* 11 0 01001 */ #define IP6OPT_EID 0x8a /* 10 0 01010 */ #define IP6OPT_TYPE(o) ((o) & 0xC0) @@ -192,8 +191,56 @@ struct ip6_dest { #define IP6OPT_MUTABLE 0x20 +/* IPv6 options: common part */ +struct ip6_opt { + u_int8_t ip6o_type; + u_int8_t ip6o_len; +} __attribute__((__packed__)); + +/* Jumbo Payload Option */ +struct ip6_opt_jumbo { + u_int8_t ip6oj_type; + u_int8_t ip6oj_len; + u_int8_t ip6oj_jumbo_len[4]; +} __attribute__((__packed__)); 
#define IP6OPT_JUMBO_LEN 6 +/* NSAP Address Option */ +struct ip6_opt_nsap { + u_int8_t ip6on_type; + u_int8_t ip6on_len; + u_int8_t ip6on_src_nsap_len; + u_int8_t ip6on_dst_nsap_len; + /* followed by source NSAP */ + /* followed by destination NSAP */ +}__attribute__((__packed__)); + +/* Tunnel Limit Option */ +struct ip6_opt_tunnel { + u_int8_t ip6ot_type; + u_int8_t ip6ot_len; + u_int8_t ip6ot_encap_limit; +}__attribute__((__packed__)); + +/* Router Alert Option */ +struct ip6_opt_router { + u_int8_t ip6or_type; + u_int8_t ip6or_len; + u_int8_t ip6or_value[2]; +}__attribute__((__packed__)); +/* Router alert values (in network byte order) */ +#if BYTE_ORDER == BIG_ENDIAN +#define IP6_ALERT_MLD 0x0000 +#define IP6_ALERT_RSVP 0x0001 +#define IP6_ALERT_AN 0x0002 +#else +#if BYTE_ORDER == LITTLE_ENDIAN +#define IP6_ALERT_MLD 0x0000 +#define IP6_ALERT_RSVP 0x0100 +#define IP6_ALERT_AN 0x0200 +#endif /* LITTLE_ENDIAN */ +#endif + /* Routing header */ struct ip6_rthdr { u_int8_t ip6r_nxt; /* next header */ @@ -235,13 +282,14 @@ struct ip6_frag { /* * Internet implementation parameters. */ -#define IPV6_MAXHLIM 255 /* maximun hoplimit */ +#define IPV6_MAXHLIM 255 /* maximum hoplimit */ #define IPV6_DEFHLIM 64 /* default hlim */ #define IPV6_FRAGTTL 120 /* ttl for fragment packets, in slowtimo tick */ -#define IPV6_HLIMDEC 1 /* subtracted when forwaeding */ +#define IPV6_HLIMDEC 1 /* subtracted when forwarding */ #define IPV6_MMTU 1280 /* minimal MTU and reassembly. 1024 + 256 */ #define IPV6_MAXPACKET 65535 /* ip6 max packet size without Jumbo payload*/ +#define IPV6_MAXOPTHDR 2048 /* max option header size, 256 64-bit words */ #ifdef KERNEL_PRIVATE /* @@ -291,45 +339,12 @@ do { \ * with type "typ". * IP6_EXTHDR_GET0 does the same, except that it aligns the structure at the * very top of mbuf. GET0 is likely to make memory copy than GET. 
- * - * XXX we're now testing this, needs m_pulldown() */ -#define IP6_EXTHDR_GET(val, typ, m, off, len) \ -do { \ - struct mbuf *t; \ - int tmp; \ - if ((m)->m_len >= (off) + (len)) \ - (val) = (typ)(mtod((m), caddr_t) + (off)); \ - else { \ - t = m_pulldown((m), (off), (len), &tmp); \ - if (t) { \ - if (t->m_len < tmp + (len)) \ - panic("m_pulldown malfunction"); \ - (val) = (typ)(mtod(t, caddr_t) + tmp); \ - } else { \ - (val) = (typ)NULL; \ - (m) = NULL; \ - } \ - } \ -} while (0) +#define IP6_EXTHDR_GET(val, typ, m, off, len) \ + M_STRUCT_GET(val, typ, m, off, len) -#define IP6_EXTHDR_GET0(val, typ, m, off, len) \ -do { \ - struct mbuf *t; \ - if ((off) == 0) \ - (val) = (typ)mtod(m, caddr_t); \ - else { \ - t = m_pulldown((m), (off), (len), NULL); \ - if (t) { \ - if (t->m_len < (len)) \ - panic("m_pulldown malfunction"); \ - (val) = (typ)mtod(t, caddr_t); \ - } else { \ - (val) = (typ)NULL; \ - (m) = NULL; \ - } \ - } \ -} while (0) +#define IP6_EXTHDR_GET0(val, typ, m, off, len) \ + M_STRUCT_GET0(val, typ, m, off, len) #endif /* KERNEL_PRIVATE */ #endif /* !_NETINET_IP6_H_ */ diff --git a/bsd/netinet/ip_divert.c b/bsd/netinet/ip_divert.c index e3c771e6d..600a796d1 100644 --- a/bsd/netinet/ip_divert.c +++ b/bsd/netinet/ip_divert.c @@ -236,12 +236,14 @@ divert_packet(struct mbuf *m, int incoming, int port, int rule) /* Find IP address for receive interface */ ifnet_lock_shared(m->m_pkthdr.rcvif); TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) { - if (ifa->ifa_addr == NULL) - continue; - if (ifa->ifa_addr->sa_family != AF_INET) + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET) { + IFA_UNLOCK(ifa); continue; + } divsrc.sin_addr = ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr; + IFA_UNLOCK(ifa); break; } ifnet_lock_done(m->m_pkthdr.rcvif); @@ -314,14 +316,10 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr *addr, struct ip *const ip = mtod(m, struct ip *); struct sockaddr_in *sin = (struct sockaddr_in *)addr; int error 
= 0; -#if PKT_PRIORITY - mbuf_traffic_class_t mtc = MBUF_TC_NONE; -#endif /* PKT_PRIORITY */ + mbuf_traffic_class_t mtc = MBUF_TC_UNSPEC; if (control != NULL) { -#if PKT_PRIORITY mtc = mbuf_traffic_class_from_control(control); -#endif /* PKT_PRIORITY */ m_freem(control); /* XXX */ } @@ -332,8 +330,8 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr *addr, int len = 0; char *c = sin->sin_zero; - mtag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DIVERT, - sizeof(struct divert_tag), M_NOWAIT); + mtag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DIVERT, + sizeof(struct divert_tag), M_NOWAIT, m); if (mtag == NULL) { error = ENOBUFS; goto cantsend; @@ -359,8 +357,9 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr *addr, /* Reinject packet into the system as incoming or outgoing */ if (!sin || sin->sin_addr.s_addr == 0) { - struct ip_out_args ipoa = { IFSCOPE_NONE }; + struct ip_out_args ipoa = { IFSCOPE_NONE, 0 }; struct route ro; + struct ip_moptions *imo; /* * Don't allow both user specified and setsockopt options, @@ -382,10 +381,11 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr *addr, /* Copy the cached route and take an extra reference */ inp_route_copyout(inp, &ro); -#if PKT_PRIORITY - set_traffic_class(m, so, mtc); -#endif /* PKT_PRIORITY */ + set_packet_tclass(m, so, mtc, 0); + imo = inp->inp_moptions; + if (imo != NULL) + IMO_ADDREF(imo); socket_unlock(so, 0); #if CONFIG_MACF_NET mac_mbuf_label_associate_inpcb(inp, m); @@ -394,9 +394,11 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr *addr, error = ip_output(m, inp->inp_options, &ro, (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST | IP_RAWOUTPUT | IP_OUTARGS, - inp->inp_moptions, &ipoa); + imo, &ipoa); socket_lock(so, 0); + if (imo != NULL) + IMO_REMREF(imo); /* Synchronize cached PCB route */ inp_route_copyin(inp, &ro); } else { @@ -417,7 +419,7 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr *addr, goto 
cantsend; } m->m_pkthdr.rcvif = ifa->ifa_ifp; - ifafree(ifa); + IFA_REMREF(ifa); } #if CONFIG_MACF_NET mac_mbuf_label_associate_socket(so, m); @@ -462,7 +464,7 @@ div_attach(struct socket *so, int proto, struct proc *p) #ifdef MORE_DICVLOCK_DEBUG printf("div_attach: so=%p sopcb=%p lock=%x ref=%x\n", - so, so->so_pcb, ((struct inpcb *)so->so_pcb)->inpcb_mtx, so->so_usecount); + so, so->so_pcb, &(((struct inpcb *)so->so_pcb)->inpcb_mtx), so->so_usecount); #endif return 0; } @@ -474,7 +476,7 @@ div_detach(struct socket *so) #ifdef MORE_DICVLOCK_DEBUG printf("div_detach: so=%p sopcb=%p lock=%x ref=%x\n", - so, so->so_pcb, ((struct inpcb *)so->so_pcb)->inpcb_mtx, so->so_usecount); + so, so->so_pcb, &(((struct inpcb *)so->so_pcb)->inpcb_mtx), so->so_usecount); #endif inp = sotoinpcb(so); if (inp == 0) @@ -656,11 +658,11 @@ div_lock(struct socket *so, int refcount, void *lr) #ifdef MORE_DICVLOCK_DEBUG printf("div_lock: so=%p sopcb=%p lock=%p ref=%x lr=%p\n", so, so->so_pcb, so->so_pcb ? - ((struct inpcb *)so->so_pcb)->inpcb_mtx : NULL, + &(((struct inpcb *)so->so_pcb)->inpcb_mtx) : NULL, so->so_usecount, lr_saved); #endif if (so->so_pcb) { - lck_mtx_lock(((struct inpcb *)so->so_pcb)->inpcb_mtx); + lck_mtx_lock(&((struct inpcb *)so->so_pcb)->inpcb_mtx); } else { panic("div_lock: so=%p NO PCB! lr=%p lrh= lrh= %s\n", so, lr_saved, solockhistory_nr(so)); @@ -697,7 +699,7 @@ div_unlock(struct socket *so, int refcount, void *lr) #ifdef MORE_DICVLOCK_DEBUG printf("div_unlock: so=%p sopcb=%p lock=%p ref=%x lr=%p\n", so, so->so_pcb, so->so_pcb ? 
- ((struct inpcb *)so->so_pcb)->inpcb_mtx : NULL, + &(((struct inpcb *)so->so_pcb)->inpcb_mtx) : NULL, so->so_usecount, lr_saved); #endif if (refcount) @@ -713,7 +715,7 @@ div_unlock(struct socket *so, int refcount, void *lr) so, so->so_usecount, lr_saved, solockhistory_nr(so)); /* NOTREACHED */ } - mutex_held = ((struct inpcb *)so->so_pcb)->inpcb_mtx; + mutex_held = &((struct inpcb *)so->so_pcb)->inpcb_mtx; if (so->so_usecount == 0 && (inp->inp_wantcnt == WNT_STOPUSING)) { lck_rw_lock_exclusive(divcbinfo.mtx); @@ -739,7 +741,7 @@ div_getlock(struct socket *so, __unused int locktype) if (so->so_usecount < 0) panic("div_getlock: so=%p usecount=%x lrh= %s\n", so, so->so_usecount, solockhistory_nr(so)); - return(inpcb->inpcb_mtx); + return(&inpcb->inpcb_mtx); } else { panic("div_getlock: so=%p NULL NO PCB lrh= %s\n", so, solockhistory_nr(so)); diff --git a/bsd/netinet/ip_dummynet.c b/bsd/netinet/ip_dummynet.c index 54fceaef4..048cff004 100644 --- a/bsd/netinet/ip_dummynet.c +++ b/bsd/netinet/ip_dummynet.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -89,6 +89,7 @@ #include #include #include +//#include #include #include #include @@ -121,6 +122,8 @@ static int red_lookup_depth = 256; /* RED - default lookup table depth */ static int red_avg_pkt_size = 512; /* RED - default medium packet size */ static int red_max_pkt_size = 1500; /* RED - default max packet size */ +static int serialize = 0; + /* * Three heaps contain queues and pipes that the scheduler handles: * @@ -152,9 +155,6 @@ static void ready_event_wfq(struct dn_pipe *p, struct mbuf **head, */ static void dummynet_send(struct mbuf *m); -/* Flag to signify the existance of a dequeued packet chain */ -static int serialize = 0; - #define HASHSIZE 16 #define HASH(num) ((((num) >> 8) ^ ((num) >> 4) ^ (num)) & 0x0f) static struct dn_pipe_head pipehash[HASHSIZE]; /* all pipes */ @@ -163,36 +163,36 @@ static struct dn_flow_set_head flowsethash[HASHSIZE]; /* all flowsets */ #ifdef SYSCTL_NODE SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, - CTLFLAG_RW, 0, "Dummynet"); + CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Dummynet"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size, - CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size"); + CTLFLAG_RW | CTLFLAG_LOCKED, &dn_hash_size, 0, "Default hash table size"); SYSCTL_QUAD(_net_inet_ip_dummynet, OID_AUTO, curr_time, - CTLFLAG_RD, &curr_time, "Current tick"); + CTLFLAG_RD | CTLFLAG_LOCKED, &curr_time, "Current tick"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap, - CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap"); + CTLFLAG_RD | CTLFLAG_LOCKED, &ready_heap.size, 0, "Size of ready heap"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap, - CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap"); + CTLFLAG_RD | CTLFLAG_LOCKED, &extract_heap.size, 0, "Size of extract heap"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, searches, - CTLFLAG_RD, &searches, 0, "Number of queue searches"); + CTLFLAG_RD | CTLFLAG_LOCKED, &searches, 0, "Number of queue 
searches"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, search_steps, - CTLFLAG_RD, &search_steps, 0, "Number of queue search steps"); + CTLFLAG_RD | CTLFLAG_LOCKED, &search_steps, 0, "Number of queue search steps"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire, - CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty"); + CTLFLAG_RW | CTLFLAG_LOCKED, &pipe_expire, 0, "Expire queue if empty"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len, - CTLFLAG_RW, &dn_max_ratio, 0, + CTLFLAG_RW | CTLFLAG_LOCKED, &dn_max_ratio, 0, "Max ratio between dynamic queues and buckets"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth, - CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table"); + CTLFLAG_RD | CTLFLAG_LOCKED, &red_lookup_depth, 0, "Depth of RED lookup table"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size, - CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size"); + CTLFLAG_RD | CTLFLAG_LOCKED, &red_avg_pkt_size, 0, "RED Medium packet size"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size, - CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size"); + CTLFLAG_RD | CTLFLAG_LOCKED, &red_max_pkt_size, 0, "RED Max packet size"); #endif #ifdef DUMMYNET_DEBUG int dummynet_debug = 0; #ifdef SYSCTL_NODE -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, &dummynet_debug, +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED, &dummynet_debug, 0, "control debugging printfs"); #endif #define DPRINTF(X) if (dummynet_debug) printf X @@ -457,6 +457,7 @@ char *cp_pipe_to_32_user(struct dn_pipe *p, struct dn_pipe_32 *pipe_bp) pipe_bp->pipe_nr = p->pipe_nr; pipe_bp->bandwidth = p->bandwidth; + pipe_bp->delay = p->delay; bcopy( &(p->scheduler_heap), &(pipe_bp->scheduler_heap), sizeof(struct dn_heap_32)); pipe_bp->scheduler_heap.p = CAST_DOWN_EXPLICIT(user32_addr_t, pipe_bp->scheduler_heap.p); bcopy( &(p->not_eligible_heap), &(pipe_bp->not_eligible_heap), sizeof(struct dn_heap_32)); @@ 
-497,6 +498,7 @@ char *cp_pipe_to_64_user(struct dn_pipe *p, struct dn_pipe_64 *pipe_bp) pipe_bp->pipe_nr = p->pipe_nr; pipe_bp->bandwidth = p->bandwidth; + pipe_bp->delay = p->delay; bcopy( &(p->scheduler_heap), &(pipe_bp->scheduler_heap), sizeof(struct dn_heap_64)); pipe_bp->scheduler_heap.p = CAST_DOWN(user64_addr_t, pipe_bp->scheduler_heap.p); bcopy( &(p->not_eligible_heap), &(pipe_bp->not_eligible_heap), sizeof(struct dn_heap_64)); @@ -648,47 +650,6 @@ heap_extract(struct dn_heap *h, void *obj) } } -#if 0 -/* - * change object position and update references - * XXX this one is never used! - */ -static void -heap_move(struct dn_heap *h, dn_key new_key, void *object) -{ - int temp; - int i ; - int maxelt = h->elements-1 ; - struct dn_heap_entry buf ; - - if (h->offset <= 0) - panic("cannot move items on this heap"); - - i = *((int *)((char *)object + h->offset)); - if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */ - h->p[i].key = new_key ; - for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ; - i = temp ) { /* bubble up */ - HEAP_SWAP(h->p[i], h->p[temp], buf) ; - SET_OFFSET(h, i); - } - } else { /* must move down */ - h->p[i].key = new_key ; - while ( (temp = HEAP_LEFT(i)) <= maxelt ) { /* found left child */ - if ((temp != maxelt) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key)) - temp++ ; /* select child with min key */ - if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */ - HEAP_SWAP(h->p[i], h->p[temp], buf) ; - SET_OFFSET(h, i); - } else - break ; - i = temp ; - } - } - SET_OFFSET(h, i); -} -#endif /* heap_move, unused */ - /* * heapify() will reorganize data inside an array to maintain the * heap property. It is needed when we delete a bunch of entries. 
@@ -757,10 +718,10 @@ transmit_event(struct dn_pipe *pipe, struct mbuf **head, struct mbuf **tail) { struct mbuf *m ; struct dn_pkt_tag *pkt ; + u_int64_t schedule_time; lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED); - - /* Extract packets only if no pending chain is being currently processed */ + ASSERT(serialize >= 0); if (serialize == 0) { while ((m = pipe->head) != NULL) { pkt = dn_tag_get(m); @@ -774,9 +735,13 @@ transmit_event(struct dn_pipe *pipe, struct mbuf **head, struct mbuf **tail) *head = m; *tail = m; } + if (*tail != NULL) (*tail)->m_nextpkt = NULL; - } + } + + schedule_time = DN_KEY_LEQ(pkt->output_time, curr_time) ? + curr_time+1 : pkt->output_time; /* if there are leftover packets, put the pipe into the heap for next ready event */ if ((m = pipe->head) != NULL) { @@ -784,7 +749,7 @@ transmit_event(struct dn_pipe *pipe, struct mbuf **head, struct mbuf **tail) /* XXX should check errors on heap_insert, by draining the * whole pipe p and hoping in the future we are more successful */ - heap_insert(&extract_heap, pkt->output_time, pipe); + heap_insert(&extract_heap, schedule_time, pipe); } } @@ -1105,21 +1070,17 @@ dummynet(__unused void * unused) break; } } - - /* - * If a packet chain has been dequeued, set serialize=1 so that new - * packets don't get dispatched out of turn - */ + if (head != NULL) - serialize = 1; - - lck_mtx_unlock(dn_mutex); + serialize++; + + lck_mtx_unlock(dn_mutex); /* Send out the de-queued list of ready-to-send packets */ if (head != NULL) { dummynet_send(head); lck_mtx_lock(dn_mutex); - serialize = 0; + serialize--; lck_mtx_unlock(dn_mutex); } } @@ -1193,13 +1154,19 @@ if_tx_rdy(struct ifnet *ifp) p->numbytes = 0 ; /* mark ready for I/O */ ready_event_wfq(p, &head, &tail); } + + if (head != NULL) { + serialize++; + } + lck_mtx_unlock(dn_mutex); /* Send out the de-queued list of ready-to-send packets */ - if (head != NULL) + if (head != NULL) { dummynet_send(head); - + serialize--; + } return 0; } @@ -1214,6 +1181,7 @@ 
expire_queues(struct dn_flow_set *fs) int i, initial_elements = fs->rq_elements ; struct timeval timenow; + /* reviewed for getmicrotime usage */ getmicrotime(&timenow); if (fs->last_expired == timenow.tv_sec) @@ -1564,8 +1532,8 @@ dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa) goto dropit ; /* XXX expensive to zero, see if we can remove it*/ - mtag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET, - sizeof(struct dn_pkt_tag), M_NOWAIT); + mtag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET, + sizeof(struct dn_pkt_tag), M_NOWAIT, m); if ( mtag == NULL ) goto dropit ; /* cannot allocate packet header */ m_tag_prepend(m, mtag); /* attach to mbuf chain */ @@ -1591,7 +1559,7 @@ dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa) if (fwa->dst == (struct sockaddr_in *)&fwa->ro->ro_dst) /* dst points into ro */ fwa->dst = (struct sockaddr_in *)&(pkt->ro.ro_dst) ; - pkt->dn_dst = fwa->dst; + bcopy (fwa->dst, &pkt->dn_dst, sizeof(pkt->dn_dst)); pkt->flags = fwa->flags; if (fwa->ipoa != NULL) pkt->ipoa = *(fwa->ipoa); @@ -1619,7 +1587,7 @@ dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa) if (pipe->bandwidth) t = SET_TICKS(m, q, pipe); q->sched_time = curr_time ; - if (t == 0) /* must process it now */ + if (t == 0) /* must process it now */ ready_event( q , &head, &tail ); else heap_insert(&ready_heap, curr_time + t , q ); @@ -1682,9 +1650,10 @@ dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa) ts.tv_nsec = 1 * 1000000; // 1ms timer_enabled = 1; bsd_timeout(dummynet, NULL, &ts); - } + } lck_mtx_unlock(dn_mutex); + if (head != NULL) dummynet_send(head); @@ -1964,9 +1933,9 @@ set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src) x->qsize = 1024*1024 ; } else { if (x->qsize == 0) - x->qsize = 50 ; + x->qsize = 50; if (x->qsize > 100) - x->qsize = 50 ; + x->qsize = 50; } /* configuring RED */ if ( x->flags_fs & DN_IS_RED ) diff --git 
a/bsd/netinet/ip_dummynet.h b/bsd/netinet/ip_dummynet.h index 83f38d24e..e5dd1f337 100644 --- a/bsd/netinet/ip_dummynet.h +++ b/bsd/netinet/ip_dummynet.h @@ -157,7 +157,7 @@ struct dn_pkt_tag { dn_key output_time; /* when the pkt is due for delivery */ struct ifnet *ifp; /* interface, for ip_output */ - struct sockaddr_in *dn_dst ; + struct sockaddr_in dn_dst ; struct route ro; /* route, for ip_output. MUST COPY */ int flags ; /* flags, for ip_output (IPv6 ?) */ struct ip_out_args ipoa; /* output args, for ip_output. MUST COPY */ diff --git a/bsd/netinet/ip_encap.c b/bsd/netinet/ip_encap.c index 6dd02fe9e..0d487326a 100644 --- a/bsd/netinet/ip_encap.c +++ b/bsd/netinet/ip_encap.c @@ -259,9 +259,7 @@ encap4_input(m, off) #if INET6 int -encap6_input(mp, offp) - struct mbuf **mp; - int *offp; +encap6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct ip6_hdr *ip6; @@ -269,10 +267,8 @@ encap6_input(mp, offp) const struct ip6protosw *psw; struct encaptab *ep, *match; int prio, matchprio; - int proto; ip6 = mtod(m, struct ip6_hdr *); - proto = ip6->ip6_nxt; bzero(&s, sizeof(s)); s.sin6_family = AF_INET6; @@ -315,7 +311,7 @@ encap6_input(mp, offp) psw = (const struct ip6protosw *)match->psw; if (psw && psw->pr_input) { encap_fillarg(m, match); - return (*psw->pr_input)(mp, offp); + return (*psw->pr_input)(mp, offp, proto); } else { m_freem(m); return IPPROTO_DONE; @@ -323,7 +319,7 @@ encap6_input(mp, offp) } /* last resort: inject to raw socket */ - return rip6_input(mp, offp); + return rip6_input(mp, offp, proto); } #endif @@ -532,8 +528,8 @@ encap_fillarg( struct m_tag *tag; struct encaptabtag *et; - tag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_ENCAP, - sizeof(struct encaptabtag), M_WAITOK); + tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_ENCAP, + sizeof(struct encaptabtag), M_WAITOK, m); if (tag != NULL) { et = (struct encaptabtag*)(tag + 1); diff --git a/bsd/netinet/ip_encap.h b/bsd/netinet/ip_encap.h index 
66dfb2588..1c65ea956 100644 --- a/bsd/netinet/ip_encap.h +++ b/bsd/netinet/ip_encap.h @@ -77,7 +77,7 @@ struct encaptab { void encap_init(void) __attribute__((section("__TEXT, initcode"))); void encap4_input(struct mbuf *, int); -int encap6_input(struct mbuf **, int *); +int encap6_input(struct mbuf **, int *, int); const struct encaptab *encap_attach(int, int, const struct sockaddr *, const struct sockaddr *, const struct sockaddr *, const struct sockaddr *, const struct protosw *, void *); diff --git a/bsd/netinet/ip_flow.c b/bsd/netinet/ip_flow.c deleted file mode 100644 index be5aa9495..000000000 --- a/bsd/netinet/ip_flow.c +++ /dev/null @@ -1,380 +0,0 @@ -/* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/*- - * Copyright (c) 1998 The NetBSD Foundation, Inc. - * All rights reserved. - * - * This code is derived from software contributed to The NetBSD Foundation - * by the 3am Software Foundry ("3am"). It was developed by Matt Thomas. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the NetBSD - * Foundation, Inc. and its contributors. - * 4. Neither the name of The NetBSD Foundation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * - * $FreeBSD: src/sys/netinet/ip_flow.c,v 1.9.2.1 2001/08/08 08:20:35 ru Exp $ - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#if IPFLOW - -#define IPFLOW_TIMER (5 * PR_SLOWHZ) -#define IPFLOW_HASHBITS 6 /* should not be a multiple of 8 */ -#define IPFLOW_HASHSIZE (1 << IPFLOW_HASHBITS) -static LIST_HEAD(ipflowhead, ipflow) ipflows[IPFLOW_HASHSIZE]; -static int ipflow_inuse; -#define IPFLOW_MAX 256 - -#ifdef __APPLE__ -#define M_IPFLOW M_TEMP -#endif - -static int ipflow_active = 0; -SYSCTL_INT(_net_inet_ip, IPCTL_FASTFORWARDING, fastforwarding, CTLFLAG_RW, - &ipflow_active, 0, "Enable flow-based IP forwarding"); - -#ifndef __APPLE__ -static MALLOC_DEFINE(M_IPFLOW, "ip_flow", "IP flow"); -#endif - -static unsigned -ipflow_hash( - struct in_addr dst, - struct in_addr src, - unsigned tos) -{ - unsigned hash = tos; - int idx; - for (idx = 0; idx < 32; idx += IPFLOW_HASHBITS) - hash += (dst.s_addr >> (32 - idx)) + (src.s_addr >> idx); - return hash & (IPFLOW_HASHSIZE-1); -} - -static struct ipflow * -ipflow_lookup( - const struct ip *ip) -{ - unsigned hash; - struct ipflow *ipf; - - hash = ipflow_hash(ip->ip_dst, ip->ip_src, ip->ip_tos); - - ipf = LIST_FIRST(&ipflows[hash]); - while (ipf != NULL) { - if (ip->ip_dst.s_addr == ipf->ipf_dst.s_addr - && ip->ip_src.s_addr == ipf->ipf_src.s_addr - 
&& ip->ip_tos == ipf->ipf_tos) - break; - ipf = LIST_NEXT(ipf, ipf_next); - } - return ipf; -} - -int -ipflow_fastforward( - struct mbuf *m) -{ - struct ip *ip; - struct ipflow *ipf; - struct rtentry *rt; - struct sockaddr *dst; - int error; - - /* - * Are we forwarding packets? Big enough for an IP packet? - */ - if (!ipforwarding || !ipflow_active || m->m_len < sizeof(struct ip)) - return 0; - /* - * IP header with no option and valid version and length - */ - ip = mtod(m, struct ip *); - if (ip->ip_v != IPVERSION || ip->ip_hl != (sizeof(struct ip) >> 2) - || ntohs(ip->ip_len) > m->m_pkthdr.len) - return 0; - /* - * Find a flow. - */ - if ((ipf = ipflow_lookup(ip)) == NULL) - return 0; - - /* - * Route and interface still up? - */ - rt = ipf->ipf_ro.ro_rt; - if ((rt->rt_flags & RTF_UP) == 0 || (rt->rt_ifp->if_flags & IFF_UP) == 0) - return 0; - - /* - * Packet size OK? TTL? - */ - if (m->m_pkthdr.len > rt->rt_ifp->if_mtu || ip->ip_ttl <= IPTTLDEC) - return 0; - - /* - * Everything checks out and so we can forward this packet. - * Modify the TTL and incrementally change the checksum. - */ - ip->ip_ttl -= IPTTLDEC; - if (ip->ip_sum >= htons(0xffff - (IPTTLDEC << 8))) { - ip->ip_sum += htons(IPTTLDEC << 8) + 1; - } else { - ip->ip_sum += htons(IPTTLDEC << 8); - } - - /* - * Send the packet on its way. All we can get back is ENOBUFS - */ - ipf->ipf_uses++; - ipf->ipf_timer = IPFLOW_TIMER; - - if (rt->rt_flags & RTF_GATEWAY) - dst = rt->rt_gateway; - else - dst = &ipf->ipf_ro.ro_dst; -#ifdef __APPLE__ - /* Not sure the rt_dlt is valid here !! 
XXX */ - if ((error = dlil_output(rt->rt_ifp, PF_INET, m, (caddr_t) rt, dst, 0)) != 0) { - -#else - if ((error = (*rt->rt_ifp->if_output)(rt->rt_ifp, m, dst, rt)) != 0) { -#endif - if (error == ENOBUFS) - ipf->ipf_dropped++; - else - ipf->ipf_errors++; - } - return 1; -} - -static void -ipflow_addstats( - struct ipflow *ipf) -{ - ipf->ipf_ro.ro_rt->rt_use += ipf->ipf_uses; - OSAddAtomic(ipf->ipf_errors + ipf->ipf_dropped, &ipstat.ips_cantforward); - OSAddAtomic(ipf->ipf_uses, &ipstat.ips_forward); - OSAddAtomic(ipf->ipf_uses, &ipstat.ips_fastforward); -} - -static void -ipflow_free( - struct ipflow *ipf) -{ - /* - * Remove the flow from the hash table (at elevated IPL). - * Once it's off the list, we can deal with it at normal - * network IPL. - */ - LIST_REMOVE(ipf, ipf_next); - ipflow_addstats(ipf); - rtfree(ipf->ipf_ro.ro_rt); - ipflow_inuse--; - FREE(ipf, M_IPFLOW); -} - -static struct ipflow * -ipflow_reap( - void) -{ - struct ipflow *ipf, *maybe_ipf = NULL; - int idx; - - for (idx = 0; idx < IPFLOW_HASHSIZE; idx++) { - ipf = LIST_FIRST(&ipflows[idx]); - while (ipf != NULL) { - /* - * If this no longer points to a valid route - * reclaim it. - */ - if ((ipf->ipf_ro.ro_rt->rt_flags & RTF_UP) == 0) - goto done; - /* - * choose the one that's been least recently used - * or has had the least uses in the last 1.5 - * intervals. - */ - if (maybe_ipf == NULL - || ipf->ipf_timer < maybe_ipf->ipf_timer - || (ipf->ipf_timer == maybe_ipf->ipf_timer - && ipf->ipf_last_uses + ipf->ipf_uses < - maybe_ipf->ipf_last_uses + - maybe_ipf->ipf_uses)) - maybe_ipf = ipf; - ipf = LIST_NEXT(ipf, ipf_next); - } - } - ipf = maybe_ipf; - done: - /* - * Remove the entry from the flow table. 
- */ - LIST_REMOVE(ipf, ipf_next); - ipflow_addstats(ipf); - rtfree(ipf->ipf_ro.ro_rt); - return ipf; -} -/* note: called under the ip_mutex lock */ -void -ipflow_slowtimo( - void) -{ - struct ipflow *ipf; - int idx; - - for (idx = 0; idx < IPFLOW_HASHSIZE; idx++) { - ipf = LIST_FIRST(&ipflows[idx]); - while (ipf != NULL) { - struct ipflow *next_ipf = LIST_NEXT(ipf, ipf_next); - if (--ipf->ipf_timer == 0) { - ipflow_free(ipf); - } else { - ipf->ipf_last_uses = ipf->ipf_uses; - ipf->ipf_ro.ro_rt->rt_use += ipf->ipf_uses; - OSAddAtomic(ipf->ipf_uses, &ipstat.ips_forward); - OSAddAtomic(ipf->ipf_uses, &ipstat.ips_fastforward); - ipstat.ips_forward += ipf->ipf_uses; - ipstat.ips_fastforward += ipf->ipf_uses; - ipf->ipf_uses = 0; - } - ipf = next_ipf; - } - } -} - -void -ipflow_create( - const struct route *ro, - struct mbuf *m) -{ - const struct ip *const ip = mtod(m, struct ip *); - struct ipflow *ipf; - unsigned hash; - - /* - * Don't create cache entries for ICMP messages. - */ - if (!ipflow_active || ip->ip_p == IPPROTO_ICMP) - return; - /* - * See if an existing flow struct exists. If so remove it from it's - * list and free the old route. If not, try to malloc a new one - * (if we aren't at our limit). - */ - ipf = ipflow_lookup(ip); - if (ipf == NULL) { - if (ipflow_inuse == IPFLOW_MAX) { - ipf = ipflow_reap(); - } else { - ipf = (struct ipflow *) _MALLOC(sizeof(*ipf), M_IPFLOW, - M_NOWAIT); - if (ipf == NULL) - return; - ipflow_inuse++; - } - bzero((caddr_t) ipf, sizeof(*ipf)); - } else { - LIST_REMOVE(ipf, ipf_next); - ipflow_addstats(ipf); - rtfree(ipf->ipf_ro.ro_rt); - ipf->ipf_uses = ipf->ipf_last_uses = 0; - ipf->ipf_errors = ipf->ipf_dropped = 0; - } - - /* - * Fill in the updated information. - */ - ipf->ipf_ro = *ro; - RT_ADDREF(ro->ro_rt); - ipf->ipf_dst = ip->ip_dst; - ipf->ipf_src = ip->ip_src; - ipf->ipf_tos = ip->ip_tos; - ipf->ipf_timer = IPFLOW_TIMER; - /* - * Insert into the approriate bucket of the flow table. 
- */ - hash = ipflow_hash(ip->ip_dst, ip->ip_src, ip->ip_tos); - LIST_INSERT_HEAD(&ipflows[hash], ipf, ipf_next); -} -#else /* !IPFLOW */ -int -ipflow_fastforward(struct mbuf *m) -{ -#pragma unused(m) - /* - * Since this symbol is exported (albeit unsupported), just return - * false to keep things (e.g. PPP) happy, in case ipflow is not - * compiled in. - */ - return (0); -} -#endif /* !IPFLOW */ diff --git a/bsd/netinet/ip_flow.h b/bsd/netinet/ip_flow.h deleted file mode 100644 index 972d96351..000000000 --- a/bsd/netinet/ip_flow.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/*- - * Copyright (c) 1998 The NetBSD Foundation, Inc. 
- * All rights reserved. - * - * This code is derived from software contributed to The NetBSD Foundation - * by the 3am Software Foundry ("3am"). It was developed by Matt Thomas. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the NetBSD - * Foundation, Inc. and its contributors. - * 4. Neither the name of The NetBSD Foundation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- * - * $FreeBSD: src/sys/netinet/ip_flow.h,v 1.2 1999/08/28 00:49:22 peter Exp $ - */ - -#ifndef _NETINET_IP_FLOW_H -#define _NETINET_IP_FLOW_H -#include - -#ifdef KERNEL_PRIVATE -struct ipflow { - LIST_ENTRY(ipflow) ipf_next; /* next ipflow in bucket */ - struct in_addr ipf_dst; /* destination address */ - struct in_addr ipf_src; /* source address */ - - u_int8_t ipf_tos; /* type-of-service */ - struct route ipf_ro; /* associated route entry */ - u_int32_t ipf_uses; /* number of uses in this period */ - - int ipf_timer; /* remaining lifetime of this entry */ - u_int32_t ipf_dropped; /* ENOBUFS returned by if_output */ - u_int32_t ipf_errors; /* other errors returned by if_output */ - u_int32_t ipf_last_uses; /* number of uses in last period */ -}; -#endif /* KERNEL_PRIVATE */ - -#endif diff --git a/bsd/netinet/ip_fw.h b/bsd/netinet/ip_fw.h index 6755fab56..53ead3fa0 100644 --- a/bsd/netinet/ip_fw.h +++ b/bsd/netinet/ip_fw.h @@ -42,6 +42,7 @@ #ifndef _IP_FW_H #define _IP_FW_H +#ifdef __APPLE_API_OBSOLETE #include @@ -324,4 +325,5 @@ extern struct ipfw_flow_id last_pkt ; #endif /* KERNEL_PRIVATE */ #endif /* !IPFW2 */ +#endif /* __APPLE_API_OBSOLETE */ #endif /* _IP_FW_H */ diff --git a/bsd/netinet/ip_fw2.c b/bsd/netinet/ip_fw2.c index 9be482912..bb66be5d7 100644 --- a/bsd/netinet/ip_fw2.c +++ b/bsd/netinet/ip_fw2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2004-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -165,21 +165,21 @@ static int ipfw_sysctl SYSCTL_HANDLER_ARGS; SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Firewall"); SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, enable, - CTLTYPE_INT | CTLFLAG_RW, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &fw_enable, 0, ipfw_sysctl, "I", "Enable ipfw"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLFLAG_RW | CTLFLAG_LOCKED, &autoinc_step, 0, "Rule number autincrement step"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, - CTLFLAG_RW, + CTLFLAG_RW | CTLFLAG_LOCKED, &fw_one_pass, 0, "Only do a single pass through ipfw when using dummynet(4)"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, - CTLFLAG_RW, + CTLFLAG_RW | CTLFLAG_LOCKED, &fw_debug, 0, "Enable printing of debug ip_fw statements"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, - CTLFLAG_RW, + CTLFLAG_RW | CTLFLAG_LOCKED, &fw_verbose, 0, "Log matches to ipfw rules"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &verbose_limit, 0, "Set upper limit of matches of ipfw rules logged"); /* @@ -251,29 +251,29 @@ static u_int32_t static_len_64; /* size in bytes of static rules for 64 bit clie static u_int32_t dyn_count; /* # of dynamic rules */ static u_int32_t dyn_max = 4096; /* max # of dynamic rules */ -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, CTLFLAG_RW | CTLFLAG_LOCKED, &dyn_buckets, 0, "Number of dyn. buckets"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, CTLFLAG_RD, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, CTLFLAG_RD | CTLFLAG_LOCKED, &curr_dyn_buckets, 0, "Current Number of dyn. 
buckets"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_count, CTLFLAG_RD, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_count, CTLFLAG_RD | CTLFLAG_LOCKED, &dyn_count, 0, "Number of dyn. rules"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_max, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_max, CTLFLAG_RW | CTLFLAG_LOCKED, &dyn_max, 0, "Max number of dyn. rules"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD | CTLFLAG_LOCKED, &static_count, 0, "Number of static rules"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW | CTLFLAG_LOCKED, &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW | CTLFLAG_LOCKED, &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW | CTLFLAG_LOCKED, &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW | CTLFLAG_LOCKED, &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW | CTLFLAG_LOCKED, &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW | CTLFLAG_LOCKED, &dyn_short_lifetime, 0, "Lifetime of dyn. 
rules for other situations"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW | CTLFLAG_LOCKED, &dyn_keepalive, 0, "Enable keepalives for dyn. rules"); @@ -338,6 +338,8 @@ void ipfwsyslog( int level, const char *format,...) unsigned char pri; int loglen; + bzero(msgBuf, msgsize); + bzero(&ev_msg, sizeof(struct kev_msg)); va_start( ap, format ); loglen = vsnprintf(msgBuf, msgsize, format, ap); va_end( ap ); @@ -965,15 +967,18 @@ iface_match(struct ifnet *ifp, ipfw_insn_if *cmd) ifnet_lock_shared(ifp); TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { - if (ia->ifa_addr == NULL) - continue; - if (ia->ifa_addr->sa_family != AF_INET) + IFA_LOCK(ia); + if (ia->ifa_addr->sa_family != AF_INET) { + IFA_UNLOCK(ia); continue; + } if (cmd->p.ip.s_addr == ((struct sockaddr_in *) (ia->ifa_addr))->sin_addr.s_addr) { + IFA_UNLOCK(ia); ifnet_lock_done(ifp); return(1); /* match */ } + IFA_UNLOCK(ia); } ifnet_lock_done(ifp); } diff --git a/bsd/netinet/ip_fw2.h b/bsd/netinet/ip_fw2.h index 0485bcbc2..5e093b170 100644 --- a/bsd/netinet/ip_fw2.h +++ b/bsd/netinet/ip_fw2.h @@ -55,6 +55,7 @@ #ifndef _IPFW2_H #define _IPFW2_H +#ifdef __APPLE_API_OBSOLETE /* * Define IP Firewall event subclass, and associated events. 
@@ -634,4 +635,5 @@ extern int fw_enable; #endif /* IPFIREWALL */ #endif /* KERNEL */ +#endif /* __APPLE_API_OBSOLETE */ #endif /* _IPFW2_H */ diff --git a/bsd/netinet/ip_fw2_compat.c b/bsd/netinet/ip_fw2_compat.c index 766fa8fc8..712f49241 100644 --- a/bsd/netinet/ip_fw2_compat.c +++ b/bsd/netinet/ip_fw2_compat.c @@ -1592,7 +1592,7 @@ ipfw_version_latest_to_zero(struct ip_fw *curr_rule, struct ip_old_fw *rule_vers rule_vers0->pipe_ptr = CAST_DOWN_EXPLICIT(void*, rule_vers1.pipe_ptr); rule_vers0->next_rule_ptr = CAST_DOWN_EXPLICIT(void*, rule_vers1.next_rule_ptr); - if (rule_vers1.fw_ipflg && IP_FW_IF_TCPEST_COMPAT) rule_vers0->fw_tcpf |= IP_OLD_FW_TCPF_ESTAB; + if (rule_vers1.fw_ipflg & IP_FW_IF_TCPEST_COMPAT) rule_vers0->fw_tcpf |= IP_OLD_FW_TCPF_ESTAB; } else { struct ip_fw_compat_32 rule_vers1; @@ -1620,7 +1620,7 @@ ipfw_version_latest_to_zero(struct ip_fw *curr_rule, struct ip_old_fw *rule_vers rule_vers0->pipe_ptr = CAST_DOWN_EXPLICIT(void*, rule_vers1.pipe_ptr); rule_vers0->next_rule_ptr = CAST_DOWN_EXPLICIT(void*, rule_vers1.next_rule_ptr); - if (rule_vers1.fw_ipflg && IP_FW_IF_TCPEST_COMPAT) rule_vers0->fw_tcpf |= IP_OLD_FW_TCPF_ESTAB; + if (rule_vers1.fw_ipflg & IP_FW_IF_TCPEST_COMPAT) rule_vers0->fw_tcpf |= IP_OLD_FW_TCPF_ESTAB; } } diff --git a/bsd/netinet/ip_icmp.c b/bsd/netinet/ip_icmp.c index 7f55b2a5f..48ea2f0f5 100644 --- a/bsd/netinet/ip_icmp.c +++ b/bsd/netinet/ip_icmp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -124,23 +124,23 @@ */ struct icmpstat icmpstat; -SYSCTL_STRUCT(_net_inet_icmp, ICMPCTL_STATS, stats, CTLFLAG_RD, +SYSCTL_STRUCT(_net_inet_icmp, ICMPCTL_STATS, stats, CTLFLAG_RD | CTLFLAG_LOCKED, &icmpstat, icmpstat, ""); static int icmpmaskrepl = 0; -SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_RW, +SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_RW | CTLFLAG_LOCKED, &icmpmaskrepl, 0, ""); static int icmptimestamp = 0; -SYSCTL_INT(_net_inet_icmp, ICMPCTL_TIMESTAMP, timestamp, CTLFLAG_RW, +SYSCTL_INT(_net_inet_icmp, ICMPCTL_TIMESTAMP, timestamp, CTLFLAG_RW | CTLFLAG_LOCKED, &icmptimestamp, 0, ""); static int drop_redirect = 0; -SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect, CTLFLAG_RW, +SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect, CTLFLAG_RW | CTLFLAG_LOCKED, &drop_redirect, 0, ""); static int log_redirect = 0; -SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_RW, +SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_RW | CTLFLAG_LOCKED, &log_redirect, 0, ""); #if ICMP_BANDLIM @@ -151,12 +151,12 @@ SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_RW, */ static int icmplim = 250; -SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RW, +SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RW | CTLFLAG_LOCKED, &icmplim, 0, ""); #else static int icmplim = -1; -SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RD, +SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RD | CTLFLAG_LOCKED, &icmplim, 0, ""); #endif @@ -166,7 +166,7 @@ SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RD, */ static int icmpbmcastecho = 1; -SYSCTL_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_RW, +SYSCTL_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_RW | CTLFLAG_LOCKED, &icmpbmcastecho, 0, ""); @@ -537,8 +537,10 @@ icmp_input(struct mbuf *m, int hlen) (struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif); if (ia == 0) break; 
+ IFA_LOCK(&ia->ia_ifa); if (ia->ia_ifp == 0) { - ifafree(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); ia = NULL; break; } @@ -550,7 +552,8 @@ icmp_input(struct mbuf *m, int hlen) else if (ia->ia_ifp->if_flags & IFF_POINTOPOINT) ip->ip_src = satosin(&ia->ia_dstaddr)->sin_addr; } - ifafree(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); reflect: ip->ip_len += hlen; /* since ip_input deducts this */ icmpstat.icps_reflect++; @@ -662,8 +665,13 @@ icmp_reflect(struct mbuf *m) */ lck_rw_lock_shared(in_ifaddr_rwlock); TAILQ_FOREACH(ia, INADDR_HASH(t.s_addr), ia_hash) { - if (t.s_addr == IA_SIN(ia)->sin_addr.s_addr) + IFA_LOCK(&ia->ia_ifa); + if (t.s_addr == IA_SIN(ia)->sin_addr.s_addr) { + IFA_ADDREF_LOCKED(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); goto match; + } + IFA_UNLOCK(&ia->ia_ifa); } /* * Slow path; check for broadcast addresses. Find a source @@ -671,13 +679,16 @@ icmp_reflect(struct mbuf *m) * let IP handle the source interface selection work. 
*/ for (ia = in_ifaddrhead.tqh_first; ia; ia = ia->ia_link.tqe_next) { + IFA_LOCK(&ia->ia_ifa); if (ia->ia_ifp && (ia->ia_ifp->if_flags & IFF_BROADCAST) && - t.s_addr == satosin(&ia->ia_broadaddr)->sin_addr.s_addr) + t.s_addr == satosin(&ia->ia_broadaddr)->sin_addr.s_addr) { + IFA_ADDREF_LOCKED(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); break; + } + IFA_UNLOCK(&ia->ia_ifa); } match: - if (ia) - ifaref(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); icmpdst.sin_addr = t; if ((ia == (struct in_ifaddr *)0) && m->m_pkthdr.rcvif) @@ -695,16 +706,18 @@ icmp_reflect(struct mbuf *m) m_freem(m); goto done; } - ifaref(&ia->ia_ifa); + IFA_ADDREF(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); } #if CONFIG_MACF_NET mac_netinet_icmp_reply(m); #endif + IFA_LOCK_SPIN(&ia->ia_ifa); t = IA_SIN(ia)->sin_addr; + IFA_UNLOCK(&ia->ia_ifa); ip->ip_src = t; ip->ip_ttl = ip_defttl; - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); ia = NULL; if (optlen > 0) { @@ -797,10 +810,10 @@ icmp_send(struct mbuf *m, struct mbuf *opts) int hlen; struct icmp *icp; struct route ro; - struct ip_out_args ipoa = { IFSCOPE_NONE }; + struct ip_out_args ipoa = { IFSCOPE_NONE, 0 }; if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif != NULL) - ipoa.ipoa_ifscope = m->m_pkthdr.rcvif->if_index; + ipoa.ipoa_boundif = m->m_pkthdr.rcvif->if_index; hlen = IP_VHL_HL(ip->ip_vhl) << 2; m->m_data += hlen; @@ -1037,6 +1050,7 @@ icmp_dgram_ctloutput(struct socket *so, struct sockopt *sopt) case IP_RECVDSTADDR: case IP_RETOPTS: case IP_MULTICAST_IF: + case IP_MULTICAST_IFINDEX: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: @@ -1054,6 +1068,7 @@ icmp_dgram_ctloutput(struct socket *so, struct sockopt *sopt) #if CONFIG_FORCE_OUT_IFP case IP_FORCE_OUT_IFP: #endif + case IP_NO_IFT_CELLULAR: error = rip_ctloutput(so, sopt); break; @@ -1109,12 +1124,15 @@ icmp_dgram_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *n } TAILQ_FOREACH(ia, INADDR_HASH(ip->ip_src.s_addr), ia_hash) { + 
IFA_LOCK(&ia->ia_ifa); if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_src.s_addr) { + IFA_UNLOCK(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); socket_lock(so, 0); goto ours; } + IFA_UNLOCK(&ia->ia_ifa); } lck_rw_done(in_ifaddr_rwlock); socket_lock(so, 0); diff --git a/bsd/netinet/ip_id.c b/bsd/netinet/ip_id.c index 7a6fef876..46c7fecd9 100644 --- a/bsd/netinet/ip_id.c +++ b/bsd/netinet/ip_id.c @@ -137,6 +137,7 @@ ip_initid(void) struct timeval timenow; getmicrotime(&timenow); + read_random((void *) &tmp, sizeof(tmp)); ru_x = (tmp & 0xFFFF) % RU_M; /* 15 bits of random seed */ diff --git a/bsd/netinet/ip_input.c b/bsd/netinet/ip_input.c index 10156a869..761b4b40c 100644 --- a/bsd/netinet/ip_input.c +++ b/bsd/netinet/ip_input.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -80,6 +80,8 @@ #include #include #include +#include +#include #include @@ -93,6 +95,7 @@ #include #include #include +#include #include #include @@ -113,6 +116,7 @@ #include #include #include +#include #if CONFIG_MACF_NET #include @@ -167,46 +171,46 @@ SYSCTL_PROC(_net_inet_ip, IPCTL_FORWARDING, forwarding, sysctl_ipforwarding, "I", "Enable IP forwarding between interfaces"); static int ipsendredirects = 1; /* XXX */ -SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW | CTLFLAG_LOCKED, &ipsendredirects, 0, "Enable sending IP redirects"); int ip_defttl = IPDEFTTL; -SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_defttl, 0, "Maximum TTL on IP packets"); static int ip_dosourceroute = 0; -SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_dosourceroute, 0, "Enable forwarding source routed IP 
packets"); static int ip_acceptsourceroute = 0; SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute, - CTLFLAG_RW, &ip_acceptsourceroute, 0, + CTLFLAG_RW | CTLFLAG_LOCKED, &ip_acceptsourceroute, 0, "Enable accepting source routed IP packets"); static int ip_keepfaith = 0; -SYSCTL_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_keepfaith, 0, "Enable packet capture for FAITH IPv4->IPv6 translater daemon"); static int nipq = 0; /* total # of reass queues */ static int maxnipq; -SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragpackets, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragpackets, CTLFLAG_RW | CTLFLAG_LOCKED, &maxnipq, 0, "Maximum number of IPv4 fragment reassembly queue entries"); static int maxfragsperpacket; -SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_RW | CTLFLAG_LOCKED, &maxfragsperpacket, 0, "Maximum number of IPv4 fragments allowed per packet"); static int maxfrags; -SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW | CTLFLAG_LOCKED, &maxfrags, 0, "Maximum number of IPv4 fragments allowed"); static int currentfrags = 0; int ip_doscopedroute = 1; -SYSCTL_INT(_net_inet_ip, OID_AUTO, scopedroute, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, scopedroute, CTLFLAG_RD | CTLFLAG_LOCKED, &ip_doscopedroute, 0, "Enable IPv4 scoped routing"); /* @@ -223,7 +227,7 @@ SYSCTL_INT(_net_inet_ip, OID_AUTO, scopedroute, CTLFLAG_RW, * packets for those addresses are received. 
*/ static int ip_checkinterface = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_checkinterface, 0, "Verify packet arrives on correct interface"); @@ -251,13 +255,13 @@ static u_int32_t inaddr_nhash; /* hash table size */ static u_int32_t inaddr_hashp; /* next largest prime */ struct ifqueue ipintrq; -SYSCTL_INT(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLFLAG_RW | CTLFLAG_LOCKED, &ipintrq.ifq_maxlen, 0, "Maximum size of the IP input queue"); -SYSCTL_INT(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLFLAG_RD, +SYSCTL_INT(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLFLAG_RD | CTLFLAG_LOCKED, &ipintrq.ifq_drops, 0, "Number of packets dropped from the IP input queue"); struct ipstat ipstat; -SYSCTL_STRUCT(_net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RD, +SYSCTL_STRUCT(_net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RD | CTLFLAG_LOCKED, &ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)"); /* Packet reassembly stuff */ @@ -279,13 +283,13 @@ lck_mtx_t *inet_domain_mutex; extern lck_mtx_t *domain_proto_mtx; #if IPCTL_DEFMTU -SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_mtu, 0, "Default MTU"); #endif #if IPSTEALTH static int ipstealth = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW | CTLFLAG_LOCKED, &ipstealth, 0, ""); #endif @@ -304,17 +308,17 @@ ip_dn_io_t *ip_dn_io_ptr; int (*fr_checkp)(struct ip *, int, struct ifnet *, int, struct mbuf **) = NULL; #endif /* IPFIREWALL */ -SYSCTL_NODE(_net_inet_ip, OID_AUTO, linklocal, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "link local"); +SYSCTL_NODE(_net_inet_ip, OID_AUTO, linklocal, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "link local"); struct ip_linklocal_stat ip_linklocal_stat; 
-SYSCTL_STRUCT(_net_inet_ip_linklocal, OID_AUTO, stat, CTLFLAG_RD, +SYSCTL_STRUCT(_net_inet_ip_linklocal, OID_AUTO, stat, CTLFLAG_RD | CTLFLAG_LOCKED, &ip_linklocal_stat, ip_linklocal_stat, "Number of link local packets with TTL less than 255"); -SYSCTL_NODE(_net_inet_ip_linklocal, OID_AUTO, in, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "link local input"); +SYSCTL_NODE(_net_inet_ip_linklocal, OID_AUTO, in, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "link local input"); int ip_linklocal_in_allowbadttl = 1; -SYSCTL_INT(_net_inet_ip_linklocal_in, OID_AUTO, allowbadttl, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_linklocal_in, OID_AUTO, allowbadttl, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_linklocal_in_allowbadttl, 0, "Allow incoming link local packets with TTL less than 255"); @@ -359,7 +363,7 @@ void in_dinit(void); extern u_short ip_id; int ip_use_randomid = 1; -SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_use_randomid, 0, "Randomize IP packets IDs"); #endif @@ -379,6 +383,9 @@ ip_init(void) if (!ip_initialized) { + PE_parse_boot_argn("net.inet.ip.scopedroute", + &ip_doscopedroute, sizeof (ip_doscopedroute)); + in_ifaddr_init(); in_ifaddr_rwlock_grp_attr = lck_grp_attr_alloc_init(); @@ -391,6 +398,8 @@ ip_init(void) TAILQ_INIT(&in_ifaddrhead); in_ifaddrhashtbl_init(); + ip_moptions_init(); + pr = pffindproto_locked(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == 0) panic("ip_init"); @@ -531,9 +540,7 @@ in_dinit(void) if (!inetdomain_initted) { -#if 0 - kprintf("Initing %d protosw entries\n", in_proto_count); -#endif + /* kprintf("Initing %d protosw entries\n", in_proto_count); */ dp = &inetdomain; dp->dom_flags = DOM_REENTRANT; @@ -637,6 +644,9 @@ ip_input(struct mbuf *m) #endif ipfilter_t inject_filter_ref = 0; + /* Check if the mbuf is still valid after interface filter processing */ + MBUF_INPUT_CHECK(m, m->m_pkthdr.rcvif); + #if IPFIREWALL args.eh = NULL; args.oif = NULL; @@ -707,6 +717,11 @@ ip_input(struct 
mbuf *m) if (inject_filter_ref != 0) { ip = mtod(m, struct ip *); hlen = IP_VHL_HL(ip->ip_vhl) << 2; + + DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL, + struct ip *, ip, struct ifnet *, m->m_pkthdr.rcvif, + struct ip *, ip, struct ip6_hdr *, NULL); + ip->ip_len = ntohs(ip->ip_len) - hlen; ip->ip_off = ntohs(ip->ip_off); ip_proto_dispatch_in(m, hlen, ip->ip_p, inject_filter_ref); @@ -801,6 +816,30 @@ ip_input(struct mbuf *m) goto bad; } + DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL, + struct ip *, ip, struct ifnet *, m->m_pkthdr.rcvif, + struct ip *, ip, struct ip6_hdr *, NULL); + + /* + * Naively assume we can attribute inbound data to the route we would + * use to send to this destination. Asymetric routing breaks this + * assumption, but it still allows us to account for traffic from + * a remote node in the routing table. + * this has a very significant performance impact so we bypass + * if nstat_collect is disabled. We may also bypass if the + * protocol is tcp in the future because tcp will have a route that + * we can use to attribute the data to. That does mean we would not + * account for forwarded tcp traffic. + */ + if (nstat_collect) { + struct rtentry *rt = + ifnet_cached_rtlookup_inet(m->m_pkthdr.rcvif, ip->ip_src); + if (rt != NULL) { + nstat_route_rx(rt, 1, m->m_pkthdr.len, 0); + rtfree(rt); + } + } + /* * Convert fields to host representation. */ @@ -839,36 +878,29 @@ ip_input(struct mbuf *m) m_adj(m, ip->ip_len - m->m_pkthdr.len); } -#if IPSEC - if (ipsec_bypass == 0 && ipsec_gethist(m, NULL)) - goto pass; -#endif - - /* - * IpHack's section. - * Right now when no processing on packet has done - * and it is still fresh out of network we do our black - * deals with it. - * - Firewall: deny/allow/divert - * - Xlate: translate packet's addr/port (NAT). - * - Pipe: pass pkt through dummynet. - * - Wrap: fake packet's addr/port - * - Encapsulate: put it in another IP and send out. 
- */ #if PF /* Invoke inbound packet filter */ - if (pf_af_hook(m->m_pkthdr.rcvif, NULL, &m, AF_INET, TRUE) != 0) { - if (m != NULL) { - panic("%s: unexpected packet %p\n", __func__, m); - /* NOTREACHED */ - } - /* Already freed by callee */ - return; + if (PF_IS_ENABLED) { + int error; + error = pf_af_hook(m->m_pkthdr.rcvif, NULL, &m, AF_INET, TRUE); + if (error != 0) { + if (m != NULL) { + panic("%s: unexpected packet %p\n", __func__, m); + /* NOTREACHED */ + } + /* Already freed by callee */ + return; + } + ip = mtod(m, struct ip *); + hlen = IP_VHL_HL(ip->ip_vhl) << 2; } - ip = mtod(m, struct ip *); - hlen = IP_VHL_HL(ip->ip_vhl) << 2; #endif /* PF */ +#if IPSEC + if (ipsec_bypass == 0 && ipsec_gethist(m, NULL)) + goto pass; +#endif + #if IPFIREWALL #if DUMMYNET iphack: @@ -1015,11 +1047,14 @@ ip_input(struct mbuf *m) * arrived via the correct interface if checking is * enabled. */ + IFA_LOCK_SPIN(&ia->ia_ifa); if (IA_SIN(ia)->sin_addr.s_addr == pkt_dst.s_addr && (!checkif || ia->ia_ifp == m->m_pkthdr.rcvif)) { + IFA_UNLOCK(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); goto ours; } + IFA_UNLOCK(&ia->ia_ifa); } lck_rw_done(in_ifaddr_rwlock); @@ -1037,15 +1072,20 @@ ip_input(struct mbuf *m) ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - if (ifa->ifa_addr->sa_family != AF_INET) + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_addr->sa_family != AF_INET) { + IFA_UNLOCK(ifa); continue; + } ia = ifatoia(ifa); if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == pkt_dst.s_addr || ia->ia_netbroadcast.s_addr == pkt_dst.s_addr) { + IFA_UNLOCK(ifa); ifnet_lock_done(ifp); goto ours; } + IFA_UNLOCK(ifa); } ifnet_lock_done(ifp); } @@ -1085,14 +1125,15 @@ ip_input(struct mbuf *m) * See if we belong to the destination multicast group on the * arrival interface. 
*/ - ifnet_lock_shared(ifp); - IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm); - ifnet_lock_done(ifp); + in_multihead_lock_shared(); + IN_LOOKUP_MULTI(&ip->ip_dst, ifp, inm); + in_multihead_lock_done(); if (inm == NULL) { OSAddAtomic(1, &ipstat.ips_notmember); m_freem(m); return; } + INM_REMREF(inm); goto ours; } if (ip->ip_dst.s_addr == (u_int32_t)INADDR_BROADCAST) @@ -1350,9 +1391,9 @@ ip_input(struct mbuf *m) struct m_tag *fwd_tag; struct ip_fwd_tag *ipfwd_tag; - fwd_tag = m_tag_alloc(KERNEL_MODULE_TAG_ID, + fwd_tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, sizeof (*ipfwd_tag), - M_NOWAIT); + M_NOWAIT, m); if (fwd_tag == NULL) { goto bad; } @@ -1731,9 +1772,6 @@ ip_slowtimo(void) } } } -#if IPFLOW - ipflow_slowtimo(); -#endif lck_mtx_unlock(ip_mutex); } @@ -1843,7 +1881,7 @@ ip_dooptions(struct mbuf *m, __unused int pass, struct sockaddr_in *next_hop) break; } else { - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); ia = NULL; } off--; /* 0 origin */ @@ -1903,9 +1941,11 @@ ip_dooptions(struct mbuf *m, __unused int pass, struct sockaddr_in *next_hop) goto bad; } ip->ip_dst = ipaddr.sin_addr; + IFA_LOCK(&ia->ia_ifa); (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr), sizeof(struct in_addr)); - ifafree(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); ia = NULL; cp[IPOPT_OFFSET] += sizeof(struct in_addr); /* @@ -1942,9 +1982,11 @@ ip_dooptions(struct mbuf *m, __unused int pass, struct sockaddr_in *next_hop) goto bad; } } + IFA_LOCK(&ia->ia_ifa); (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr), sizeof(struct in_addr)); - ifafree(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); ia = NULL; cp[IPOPT_OFFSET] += sizeof(struct in_addr); break; @@ -1987,10 +2029,12 @@ ip_dooptions(struct mbuf *m, __unused int pass, struct sockaddr_in *next_hop) m->m_pkthdr.rcvif); if (ia == 0) continue; + IFA_LOCK(&ia->ia_ifa); (void)memcpy(sin, &IA_SIN(ia)->sin_addr, sizeof(struct in_addr)); + IFA_UNLOCK(&ia->ia_ifa); ipt->ipt_ptr += 
sizeof(struct in_addr); - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); ia = NULL; break; @@ -2005,7 +2049,7 @@ ip_dooptions(struct mbuf *m, __unused int pass, struct sockaddr_in *next_hop) sizeof(struct in_addr)); if ((ia = (struct in_ifaddr*)ifa_ifwithaddr((SA)&ipaddr)) == 0) continue; - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); ia = NULL; ipt->ipt_ptr += sizeof(struct in_addr); break; @@ -2057,7 +2101,7 @@ ip_rtaddr(struct in_addr dst) RT_LOCK(ro.ro_rt); if ((rt_ifa = ro.ro_rt->rt_ifa) != NULL) - ifaref(rt_ifa); + IFA_ADDREF(rt_ifa); RT_UNLOCK(ro.ro_rt); rtfree(ro.ro_rt); @@ -2204,12 +2248,12 @@ sysctl_ipforwarding SYSCTL_HANDLER_ARGS for (i = 0; i <= if_index; i++) { struct ifnet *ifp = ifindex2ifnet[i]; if (ifp != NULL) { - lck_mtx_lock(ifp->if_fwd_route_lock); - if (ifp->if_fwd_route.ro_rt != NULL) { + lck_mtx_lock(&ifp->if_cached_route_lock); + if (ifp->if_fwd_route.ro_rt != NULL) rtfree(ifp->if_fwd_route.ro_rt); - ifp->if_fwd_route.ro_rt = NULL; - } - lck_mtx_unlock(ifp->if_fwd_route_lock); + bzero(&ifp->if_fwd_route, + sizeof (ifp->if_fwd_route)); + lck_mtx_unlock(&ifp->if_cached_route_lock); } } ifnet_head_done(); @@ -2228,20 +2272,16 @@ ip_fwd_route_copyout(struct ifnet *ifp, struct route *dst) { struct route *src = &ifp->if_fwd_route; - lck_mtx_lock(ifp->if_fwd_route_lock); + lck_mtx_lock_spin(&ifp->if_cached_route_lock); + lck_mtx_convert_spin(&ifp->if_cached_route_lock); /* Minor sanity check */ if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET) panic("%s: wrong or corrupted route: %p", __func__, src); - /* Copy everything (rt, dst, flags) from ifnet */ - bcopy(src, dst, sizeof (*dst)); - - /* Hold one reference for the local copy of struct route */ - if (dst->ro_rt != NULL) - RT_ADDREF(dst->ro_rt); + route_copyout(dst, src, sizeof(*dst)); - lck_mtx_unlock(ifp->if_fwd_route_lock); + lck_mtx_unlock(&ifp->if_cached_route_lock); } static void @@ -2249,37 +2289,17 @@ ip_fwd_route_copyin(struct ifnet *ifp, struct route *src) { 
struct route *dst = &ifp->if_fwd_route; - lck_mtx_lock(ifp->if_fwd_route_lock); + lck_mtx_lock_spin(&ifp->if_cached_route_lock); + lck_mtx_convert_spin(&ifp->if_cached_route_lock); /* Minor sanity check */ if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET) panic("%s: wrong or corrupted route: %p", __func__, src); - /* No cached route in the ifnet? */ - if (dst->ro_rt == NULL) { - /* - * Copy everything (rt, dst, flags) from ip_forward(); - * the reference to the route was held at the time - * it was allocated and is kept intact. - */ - bcopy(src, dst, sizeof (*dst)); - } else if (src->ro_rt != NULL) { - /* - * If the same, update just the ro_flags and ditch the one - * in the local copy. Else ditch the one that is currently - * cached, and cache what we got back from ip_output(). - */ - if (dst->ro_rt == src->ro_rt) { - dst->ro_flags = src->ro_flags; - rtfree(src->ro_rt); - src->ro_rt = NULL; - } else { - rtfree(dst->ro_rt); - bcopy(src, dst, sizeof (*dst)); - } - } + if (ifp->if_fwd_cacheok) + route_copyin(src, dst, sizeof(*src)); - lck_mtx_unlock(ifp->if_fwd_route_lock); + lck_mtx_unlock(&ifp->if_cached_route_lock); } /* @@ -2311,7 +2331,7 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop) n_long dest; struct in_addr pkt_dst; u_int32_t nextmtu = 0; - struct ip_out_args ipoa = { IFSCOPE_NONE }; + struct ip_out_args ipoa = { IFSCOPE_NONE, 0 }; struct ifnet *ifp = m->m_pkthdr.rcvif; #if PF struct pf_mtag *pf_mtag; @@ -2355,7 +2375,7 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop) #if PF pf_mtag = pf_find_mtag(m); if (pf_mtag != NULL && pf_mtag->rtableid != IFSCOPE_NONE) - ipoa.ipoa_ifscope = pf_mtag->rtableid; + ipoa.ipoa_boundif = pf_mtag->rtableid; #endif /* PF */ ip_fwd_route_copyout(ifp, &fwd_rt); @@ -2372,7 +2392,7 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop) sin->sin_len = sizeof (*sin); sin->sin_addr = pkt_dst; - rtalloc_scoped_ign(&fwd_rt, RTF_PRCLONING, ipoa.ipoa_ifscope); 
+ rtalloc_scoped_ign(&fwd_rt, RTF_PRCLONING, ipoa.ipoa_boundif); if (fwd_rt.ro_rt == NULL) { icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0); goto done; @@ -2417,24 +2437,27 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop) if (rt->rt_ifp == m->m_pkthdr.rcvif && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 && satosin(rt_key(rt))->sin_addr.s_addr != 0 && - ipsendredirects && !srcrt) { -#define RTA(rt) ((struct in_ifaddr *)(rt->rt_ifa)) + ipsendredirects && !srcrt && rt->rt_ifa != NULL) { + struct in_ifaddr *ia = (struct in_ifaddr *)rt->rt_ifa; u_int32_t src = ntohl(ip->ip_src.s_addr); - if (RTA(rt) && - (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) { - if (rt->rt_flags & RTF_GATEWAY) - dest = satosin(rt->rt_gateway)->sin_addr.s_addr; - else - dest = pkt_dst.s_addr; - /* Router requirements says to only send host redirects */ - type = ICMP_REDIRECT; - code = ICMP_REDIRECT_HOST; + /* Become a regular mutex */ + RT_CONVERT_LOCK(rt); + IFA_LOCK_SPIN(&ia->ia_ifa); + if ((src & ia->ia_subnetmask) == ia->ia_subnet) { + if (rt->rt_flags & RTF_GATEWAY) + dest = satosin(rt->rt_gateway)->sin_addr.s_addr; + else + dest = pkt_dst.s_addr; + /* Router requirements says to only send host redirects */ + type = ICMP_REDIRECT; + code = ICMP_REDIRECT_HOST; #if DIAGNOSTIC - if (ipprintfs) - printf("redirect (%d) to %lx\n", code, (u_int32_t)dest); + if (ipprintfs) + printf("redirect (%d) to %lx\n", code, (u_int32_t)dest); #endif } + IFA_UNLOCK(&ia->ia_ifa); } RT_UNLOCK(rt); @@ -2444,9 +2467,9 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop) struct m_tag *tag; struct ip_fwd_tag *ipfwd_tag; - tag = m_tag_alloc(KERNEL_MODULE_TAG_ID, + tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, - sizeof (*ipfwd_tag), M_NOWAIT); + sizeof (*ipfwd_tag), M_NOWAIT, m); if (tag == NULL) { error = ENOBUFS; m_freem(m); @@ -2473,9 +2496,6 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop) OSAddAtomic(1, 
&ipstat.ips_redirectsent); else { if (mcopy) { -#if IPFLOW - ipflow_create(&fwd_rt, mcopy); -#endif /* * If we didn't have to go thru ipflow and * the packet was successfully consumed by @@ -2580,6 +2600,7 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop) } sav = key_allocsa_policy(&saidx); if (sav != NULL) { + lck_mtx_lock(sadb_mutex); if (sav->sah != NULL) { ro = &sav->sah->sa_route; if (ro->ro_rt != NULL) { @@ -2591,7 +2612,8 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop) RT_UNLOCK(ro->ro_rt); } } - key_freesav(sav, KEY_SADB_UNLOCKED); + key_freesav(sav, KEY_SADB_LOCKED); + lck_mtx_unlock(sadb_mutex); } } } @@ -2617,27 +2639,41 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop) ip_fwd_route_copyin(ifp, &fwd_rt); } -void +int ip_savecontrol( struct inpcb *inp, struct mbuf **mp, struct ip *ip, struct mbuf *m) { + *mp = NULL; if (inp->inp_socket->so_options & SO_TIMESTAMP) { struct timeval tv; microtime(&tv); - *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv), - SCM_TIMESTAMP, SOL_SOCKET); - if (*mp) - mp = &(*mp)->m_next; + mp = sbcreatecontrol_mbuf((caddr_t) &tv, sizeof(tv), + SCM_TIMESTAMP, SOL_SOCKET, mp); + if (*mp == NULL) { + goto no_mbufs; + } } + if ((inp->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + uint64_t time; + + time = mach_absolute_time(); + mp = sbcreatecontrol_mbuf((caddr_t) &time, sizeof(time), + SCM_TIMESTAMP_MONOTONIC, SOL_SOCKET, mp); + + if (*mp == NULL) { + goto no_mbufs; + } + } if (inp->inp_flags & INP_RECVDSTADDR) { - *mp = sbcreatecontrol((caddr_t) &ip->ip_dst, - sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); - if (*mp) - mp = &(*mp)->m_next; + mp = sbcreatecontrol_mbuf((caddr_t) &ip->ip_dst, + sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP, mp); + if (*mp == NULL) { + goto no_mbufs; + } } #ifdef notyet /* XXX @@ -2646,17 +2682,19 @@ ip_savecontrol( */ /* options were tossed already */ if (inp->inp_flags & INP_RECVOPTS) { - *mp = 
sbcreatecontrol((caddr_t) opts_deleted_above, - sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP); - if (*mp) - mp = &(*mp)->m_next; + mp = sbcreatecontrol_mbuf((caddr_t) opts_deleted_above, + sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP, mp); + if (*mp == NULL) { + goto no_mbufs; + } } /* ip_srcroute doesn't do what we want here, need to fix */ if (inp->inp_flags & INP_RECVRETOPTS) { - *mp = sbcreatecontrol((caddr_t) ip_srcroute(), - sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP); - if (*mp) - mp = &(*mp)->m_next; + mp = sbcreatecontrol_mbuf((caddr_t) ip_srcroute(), + sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP, mp); + if (*mp == NULL) { + goto no_mbufs; + } } #endif if (inp->inp_flags & INP_RECVIF) { @@ -2669,24 +2707,27 @@ ip_savecontrol( struct sockaddr_dl *sdl2 = &sdlbuf.sdl; ifnet_head_lock_shared(); - if (((ifp = m->m_pkthdr.rcvif)) - && ( ifp->if_index && (ifp->if_index <= if_index))) { + if ((ifp = m->m_pkthdr.rcvif) != NULL && + ifp->if_index && (ifp->if_index <= if_index)) { struct ifaddr *ifa = ifnet_addrs[ifp->if_index - 1]; if (!ifa || !ifa->ifa_addr) goto makedummy; + IFA_LOCK_SPIN(ifa); sdp = (struct sockaddr_dl *)ifa->ifa_addr; /* * Change our mind and don't try copy. 
*/ - if ((sdp->sdl_family != AF_LINK) - || (sdp->sdl_len > sizeof(sdlbuf))) { + if ((sdp->sdl_family != AF_LINK) || + (sdp->sdl_len > sizeof(sdlbuf))) { + IFA_UNLOCK(ifa); goto makedummy; } bcopy(sdp, sdl2, sdp->sdl_len); + IFA_UNLOCK(ifa); } else { -makedummy: +makedummy: sdl2->sdl_len = offsetof(struct sockaddr_dl, sdl_data[0]); sdl2->sdl_family = AF_LINK; @@ -2694,15 +2735,46 @@ ip_savecontrol( sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0; } ifnet_head_done(); - *mp = sbcreatecontrol((caddr_t) sdl2, sdl2->sdl_len, - IP_RECVIF, IPPROTO_IP); - if (*mp) - mp = &(*mp)->m_next; + mp = sbcreatecontrol_mbuf((caddr_t) sdl2, sdl2->sdl_len, + IP_RECVIF, IPPROTO_IP, mp); + if (*mp == NULL) { + goto no_mbufs; + } } if (inp->inp_flags & INP_RECVTTL) { - *mp = sbcreatecontrol((caddr_t)&ip->ip_ttl, sizeof(ip->ip_ttl), IP_RECVTTL, IPPROTO_IP); - if (*mp) mp = &(*mp)->m_next; + mp = sbcreatecontrol_mbuf((caddr_t)&ip->ip_ttl, sizeof(ip->ip_ttl), + IP_RECVTTL, IPPROTO_IP, mp); + if (*mp == NULL) { + goto no_mbufs; + } + } + if ((inp->inp_socket->so_flags & SOF_RECV_TRAFFIC_CLASS) != 0) { + int tc = m->m_pkthdr.prio; + + mp = sbcreatecontrol_mbuf((caddr_t) &tc, sizeof(tc), + SO_TRAFFIC_CLASS, SOL_SOCKET, mp); + if (*mp == NULL) { + goto no_mbufs; + } + } + if (inp->inp_flags & INP_PKTINFO) { + struct in_pktinfo pi; + + bzero(&pi, sizeof(struct in_pktinfo)); + bcopy(&ip->ip_dst, &pi.ipi_addr, sizeof(struct in_addr)); + pi.ipi_ifindex = (m && m->m_pkthdr.rcvif) ? m->m_pkthdr.rcvif->if_index : 0; + + mp = sbcreatecontrol_mbuf((caddr_t)&pi, sizeof(struct in_pktinfo), + IP_RECVPKTINFO, IPPROTO_IP, mp); + if (*mp == NULL) { + goto no_mbufs; + } } + return 0; + +no_mbufs: + ipstat.ips_pktdropcntrl++; + return ENOBUFS; } int diff --git a/bsd/netinet/ip_mroute.c b/bsd/netinet/ip_mroute.c index e61d2ed64..f33537ef8 100644 --- a/bsd/netinet/ip_mroute.c +++ b/bsd/netinet/ip_mroute.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. 
+ * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -131,12 +131,12 @@ int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *) = _ip_mforward; int -_mrt_ioctl(__unused int req, __unused caddr_t data, __unused struct proc *p) +_mrt_ioctl(__unused u_long req, __unused caddr_t data, __unused struct proc *p) { return EOPNOTSUPP; } -int (*mrt_ioctl)(int, caddr_t, struct proc *) = _mrt_ioctl; +int (*mrt_ioctl)(u_long, caddr_t, struct proc *) = _mrt_ioctl; void rsvp_input(struct mbuf *m, int iphlen) /* XXX must fixup manually */ @@ -293,7 +293,7 @@ static int X_ip_mrouter_done(void); static int X_ip_mrouter_get(struct socket *so, struct sockopt *m); static int X_ip_mrouter_set(struct socket *so, struct sockopt *m); static int X_legal_vif_num(int vif); -static int X_mrt_ioctl(int cmd, caddr_t data); +static int X_mrt_ioctl(u_long cmd, caddr_t data); static int get_sg_cnt(struct sioc_sg_req *); static int get_vif_cnt(struct sioc_vif_req *); @@ -493,7 +493,7 @@ int (*ip_mrouter_get)(struct socket *, struct sockopt *) = X_ip_mrouter_get; * Handle ioctl commands to obtain information from the cache */ static int -X_mrt_ioctl(int cmd, caddr_t data) +X_mrt_ioctl(u_long cmd, caddr_t data) { int error = 0; @@ -512,7 +512,7 @@ X_mrt_ioctl(int cmd, caddr_t data) } #if !defined(MROUTE_LKM) || !MROUTE_LKM -int (*mrt_ioctl)(int, caddr_t) = X_mrt_ioctl; +int (*mrt_ioctl)(u_long, caddr_t) = X_mrt_ioctl; #endif /* @@ -695,7 +695,7 @@ add_vif(struct vifctl *vifcp) ifa = ifa_ifwithaddr((struct sockaddr *)&sin); if (ifa == 0) return EADDRNOTAVAIL; ifp = ifa->ifa_ifp; - ifafree(ifa); + IFA_REMREF(ifa); ifa = NULL; if (vifcp->vifc_flags & VIFF_TUNNEL) { @@ -1099,7 +1099,10 @@ X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, return 1; } + if (imo != NULL) + IMO_LOCK(imo); if ((imo) && ((vifi = imo->imo_multicast_vif) < numvifs)) { + IMO_UNLOCK(imo); if (ip->ip_ttl < 255) ip->ip_ttl++; /* 
compensate for -1 in *_send routines */ if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) { @@ -1110,6 +1113,8 @@ X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, vifp->v_ifp->if_name, vifp->v_ifp->if_unit); } return (ip_mdq(m, ifp, NULL, vifi)); + } else if (imo != NULL) { + IMO_UNLOCK(imo); } if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) { printf("Warning: IPPROTO_RSVP from %x to %x without vif option\n", @@ -1807,7 +1812,6 @@ tbf_dq_sel(struct vif *vifp, struct ip *ip) static void tbf_send_packet(struct vif *vifp, struct mbuf *m) { - struct ip_moptions imo; int error; static struct route ro; @@ -1816,10 +1820,18 @@ tbf_send_packet(struct vif *vifp, struct mbuf *m) ip_output(m, (struct mbuf *)0, &vifp->v_route, IP_FORWARDING, (struct ip_moptions *)0, NULL); } else { - imo.imo_multicast_ifp = vifp->v_ifp; - imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1; - imo.imo_multicast_loop = 1; - imo.imo_multicast_vif = -1; + struct ip_moptions *imo; + + imo = ip_allocmoptions(M_DONTWAIT); + if (imo == NULL) { + error = ENOMEM; + goto done; + } + + imo->imo_multicast_ifp = vifp->v_ifp; + imo->imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1; + imo->imo_multicast_loop = 1; + imo->imo_multicast_vif = -1; /* * Re-entrancy should not be a problem here, because @@ -1828,8 +1840,10 @@ tbf_send_packet(struct vif *vifp, struct mbuf *m) * the loopback interface, thus preventing looping. 
*/ error = ip_output(m, (struct mbuf *)0, &ro, - IP_FORWARDING, &imo, NULL); + IP_FORWARDING, imo, NULL); + IMO_REMREF(imo); +done: if (mrtdebug & DEBUG_XMIT) log(LOG_DEBUG, "phyint_send on vif %d err %d\n", vifp - viftable, error); diff --git a/bsd/netinet/ip_mroute.h b/bsd/netinet/ip_mroute.h index 71c39440a..f234e20ab 100644 --- a/bsd/netinet/ip_mroute.h +++ b/bsd/netinet/ip_mroute.h @@ -298,9 +298,9 @@ extern int (*ip_mrouter_set)(struct socket *, struct sockopt *); extern int (*ip_mrouter_get)(struct socket *, struct sockopt *); extern int (*ip_mrouter_done)(void); #if MROUTING -extern int (*mrt_ioctl)(int, caddr_t); +extern int (*mrt_ioctl)(u_long, caddr_t); #else -extern int (*mrt_ioctl)(int, caddr_t, struct proc *); +extern int (*mrt_ioctl)(u_long, caddr_t, struct proc *); #endif #endif /* KERNEL_PRIVATE */ diff --git a/bsd/netinet/ip_output.c b/bsd/netinet/ip_output.c index 07d74f97f..57f522919 100644 --- a/bsd/netinet/ip_output.c +++ b/bsd/netinet/ip_output.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -79,12 +79,17 @@ #include #include #include +#include #include +#include #include #include +#include #include +#include +#include #include #include @@ -124,6 +129,7 @@ #include #include +#include #if DUMMYNET #include @@ -144,20 +150,14 @@ u_short ip_id; static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *); -static struct ifnet *ip_multicast_if(struct in_addr *, int *); static void ip_mloopback(struct ifnet *, struct mbuf *, struct sockaddr_in *, int); -static int ip_getmoptions(struct sockopt *, struct ip_moptions *); static int ip_pcbopts(int, struct mbuf **, struct mbuf *); -static int ip_setmoptions(struct sockopt *, struct ip_moptions **); +static void imo_trace(struct ip_moptions *, int); static void ip_out_cksum_stats(int, u_int32_t); static struct ifaddr *in_selectsrcif(struct ip *, struct route *, unsigned int); -static void ip_bindif(struct inpcb *, unsigned int); -int ip_createmoptions(struct ip_moptions **imop); -int ip_addmembership(struct ip_moptions *imo, struct ip_mreq *mreq); -int ip_dropmembership(struct ip_moptions *imo, struct ip_mreq *mreq); int ip_optcopy(struct ip *, struct ip *); void in_delayed_cksum_offset(struct mbuf *, int ); void in_cksum_offset(struct mbuf* , size_t ); @@ -175,18 +175,50 @@ extern int ipsec_bypass; #endif static int ip_maxchainsent = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, maxchainsent, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, maxchainsent, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_maxchainsent, 0, "use dlil_output_list"); #if DEBUG static int forge_ce = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, forge_ce, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, forge_ce, CTLFLAG_RW | CTLFLAG_LOCKED, &forge_ce, 0, "Forge ECN CE"); #endif /* DEBUG */ static int ip_select_srcif_debug = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, select_srcif_debug, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, select_srcif_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_select_srcif_debug, 0, 
"log source interface selection debug info"); +#define IMO_TRACE_HIST_SIZE 32 /* size of trace history */ + +/* For gdb */ +__private_extern__ unsigned int imo_trace_hist_size = IMO_TRACE_HIST_SIZE; + +struct ip_moptions_dbg { + struct ip_moptions imo; /* ip_moptions */ + u_int16_t imo_refhold_cnt; /* # of IMO_ADDREF */ + u_int16_t imo_refrele_cnt; /* # of IMO_REMREF */ + /* + * Alloc and free callers. + */ + ctrace_t imo_alloc; + ctrace_t imo_free; + /* + * Circular lists of IMO_ADDREF and IMO_REMREF callers. + */ + ctrace_t imo_refhold[IMO_TRACE_HIST_SIZE]; + ctrace_t imo_refrele[IMO_TRACE_HIST_SIZE]; +}; + +#if DEBUG +static unsigned int imo_debug = 1; /* debugging (enabled) */ +#else +static unsigned int imo_debug; /* debugging (disabled) */ +#endif /* !DEBUG */ +static unsigned int imo_size; /* size of zone element */ +static struct zone *imo_zone; /* zone for ip_moptions */ + +#define IMO_ZONE_MAX 64 /* maximum elements in zone */ +#define IMO_ZONE_NAME "ip_moptions" /* zone name */ + /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). 
@@ -244,11 +276,12 @@ ip_output_list( struct ifnet *ifp = NULL; struct mbuf *m = m0, **mppn = NULL; int hlen = sizeof (struct ip); - int len = 0, off, error = 0; + int len = 0, error = 0; struct sockaddr_in *dst = NULL; struct in_ifaddr *ia = NULL, *src_ia = NULL; int isbroadcast, sw_csum; struct in_addr pkt_dst; + struct ipf_pktopts *ippo = NULL, ipf_pktopts; #if IPSEC struct route iproute; struct socket *so = NULL; @@ -258,18 +291,24 @@ ip_output_list( int fwd_rewrite_src = 0; #endif #if IPFIREWALL + int off; struct ip_fw_args args; + struct m_tag *tag; + struct sockaddr_in *next_hop_from_ipfwd_tag = NULL; #endif int didfilter = 0; ipfilter_t inject_filter_ref = 0; - struct m_tag *tag; +#if DUMMYNET struct route saved_route; struct ip_out_args saved_ipoa; + struct sockaddr_in dst_buf; +#endif /* DUMMYNET */ struct mbuf * packetlist; int pktcnt = 0, tso = 0; + u_int32_t bytecnt = 0; unsigned int ifscope; + unsigned int nocell; boolean_t select_srcif; - KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); packetlist = m0; @@ -296,7 +335,8 @@ ip_output_list( ro = &saved_route; imo = NULL; - dst = dn_tag->dn_dst; + bcopy(&dn_tag->dn_dst, &dst_buf, sizeof(dst_buf)); + dst = &dst_buf; ifp = dn_tag->ifp; flags = dn_tag->flags; saved_ipoa = dn_tag->ipoa; @@ -323,8 +363,8 @@ ip_output_list( struct ip_fwd_tag *ipfwd_tag; ipfwd_tag = (struct ip_fwd_tag *)(tag+1); - args.next_hop = ipfwd_tag->next_hop; - + next_hop_from_ipfwd_tag = ipfwd_tag->next_hop; + m_tag_delete(m0, tag); } ipfw_tags_done: @@ -340,6 +380,9 @@ ip_output_list( mtod(m, struct ip *)->ip_p); #endif + bzero(&ipf_pktopts, sizeof(struct ipf_pktopts)); + ippo = &ipf_pktopts; + /* * At present the IP_OUTARGS flag implies a request for IP to * perform source interface selection. 
In the forwarding case, @@ -348,12 +391,22 @@ ip_output_list( */ if (ip_doscopedroute && (flags & IP_OUTARGS)) { select_srcif = !(flags & IP_FORWARDING); - ifscope = ipoa->ipoa_ifscope; + ifscope = ipoa->ipoa_boundif; + ipf_pktopts.ippo_flags = IPPOF_BOUND_IF; + ipf_pktopts.ippo_flags |= (ifscope << IPPOF_SHIFT_IFSCOPE); } else { select_srcif = FALSE; ifscope = IFSCOPE_NONE; } + if (flags & IP_OUTARGS) { + nocell = ipoa->ipoa_nocell; + if (nocell) + ipf_pktopts.ippo_flags |= IPPOF_NO_IFT_CELLULAR; + } else { + nocell = 0; + } + #if IPFIREWALL if (args.rule != NULL) { /* dummynet already saw us */ ip = mtod(m, struct ip *); @@ -361,8 +414,11 @@ ip_output_list( if (ro->ro_rt != NULL) { RT_LOCK_SPIN(ro->ro_rt); ia = (struct in_ifaddr *)ro->ro_rt->rt_ifa; - if (ia) - ifaref(&ia->ia_ifa); + if (ia) { + /* Become a regular mutex */ + RT_CONVERT_LOCK(ro->ro_rt); + IFA_ADDREF(&ia->ia_ifa); + } RT_UNLOCK(ro->ro_rt); } #if IPSEC @@ -397,11 +453,29 @@ ip_output_list( } ip = mtod(m, struct ip *); #if IPFIREWALL + /* + * rdar://8542331 + * + * When dealing with a packet chain, we need to reset "next_hop" because + * "dst" may have been changed to the gateway address below for the previous + * packet of the chain. This could cause the route to be inadvertently changed + * to the route to the gateway address (instead of the route to the destination). + */ + args.next_hop = next_hop_from_ipfwd_tag; pkt_dst = args.next_hop ? args.next_hop->sin_addr : ip->ip_dst; #else pkt_dst = ip->ip_dst; #endif + /* + * We must not send if the packet is destined to network zero. + * RFC1122 3.2.1.3 (a) and (b). + */ + if (IN_ZERONET(ntohl(pkt_dst.s_addr))) { + error = EHOSTUNREACH; + goto bad; + } + /* * Fill in IP header.
*/ @@ -450,7 +524,7 @@ ip_output_list( error = EADDRNOTAVAIL; goto bad; } - ifafree(&src_ia->ia_ifa); + IFA_REMREF(&src_ia->ia_ifa); } /* * Test rt_flags without holding rt_lock for performance @@ -487,7 +561,7 @@ ip_output_list( #define sintosa(sin) ((struct sockaddr *)(sin)) if (flags & IP_ROUTETOIF) { if (ia) - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0) { if ((ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) { OSAddAtomic(1, &ipstat.ips_noroute); @@ -499,15 +573,14 @@ ip_output_list( ip->ip_ttl = 1; isbroadcast = in_broadcast(dst->sin_addr, ifp); } else if (IN_MULTICAST(ntohl(pkt_dst.s_addr)) && - imo != NULL && imo->imo_multicast_ifp != NULL) { + imo != NULL && (ifp = imo->imo_multicast_ifp) != NULL) { /* * Bypass the normal routing lookup for multicast * packets if the interface is specified. */ - ifp = imo->imo_multicast_ifp; isbroadcast = 0; if (ia != NULL) - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); /* Macro takes reference on ia */ IFP_TO_IA(ifp, ia); @@ -530,6 +603,18 @@ ip_output_list( /* Find the source interface */ ifa = in_selectsrcif(ip, ro, ifscope); + /* + * If the source address belongs to a cellular interface + * and the caller forbids our using interfaces of such + * type, pretend that there is no source address. 
+ */ + if (nocell && ifa != NULL && + ifa->ifa_ifp->if_type == IFT_CELLULAR) { + IFA_REMREF(ifa); + error = EADDRNOTAVAIL; + goto bad; + } + /* * If the source address is spoofed (in the case * of IP_RAWOUTPUT), or if this is destined for @@ -560,7 +645,7 @@ ip_output_list( if (ifa != NULL) { if (ifscope == IFSCOPE_NONE) ifscope = ifa->ifa_ifp->if_index; - ifafree(ifa); + IFA_REMREF(ifa); cloneok = (!(flags & IP_RAWOUTPUT) && !(IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)))); } @@ -611,6 +696,23 @@ ip_output_list( rtalloc_ign(ro, ign); else rtalloc_scoped_ign(ro, ign, ifscope); + + /* + * If the route points to a cellular interface and the + * caller forbids our using interfaces of such type, + * pretend that there is no route. + */ + if (nocell && ro->ro_rt != NULL) { + RT_LOCK_SPIN(ro->ro_rt); + if (ro->ro_rt->rt_ifp->if_type == + IFT_CELLULAR) { + RT_UNLOCK(ro->ro_rt); + rtfree(ro->ro_rt); + ro->ro_rt = NULL; + } else { + RT_UNLOCK(ro->ro_rt); + } + } } if (ro->ro_rt == NULL) { @@ -620,11 +722,14 @@ ip_output_list( } if (ia) - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); RT_LOCK_SPIN(ro->ro_rt); ia = ifatoia(ro->ro_rt->rt_ifa); - if (ia) - ifaref(&ia->ia_ifa); + if (ia) { + /* Become a regular mutex */ + RT_CONVERT_LOCK(ro->ro_rt); + IFA_ADDREF(&ia->ia_ifa); + } ifp = ro->ro_rt->rt_ifp; ro->ro_rt->rt_use++; if (ro->ro_rt->rt_flags & RTF_GATEWAY) @@ -641,6 +746,9 @@ ip_output_list( if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) { struct in_multi *inm; + u_int32_t vif; + u_int8_t ttl = IP_DEFAULT_MULTICAST_TTL; + u_int8_t loop = IP_DEFAULT_MULTICAST_LOOP; m->m_flags |= M_MCAST; /* @@ -653,22 +761,28 @@ ip_output_list( * See if the caller provided any multicast options */ if (imo != NULL) { - if ((flags & IP_RAWOUTPUT) == 0) ip->ip_ttl = imo->imo_multicast_ttl; - if (imo->imo_multicast_ifp != NULL) { + IMO_LOCK(imo); + vif = imo->imo_multicast_vif; + ttl = imo->imo_multicast_ttl; + loop = imo->imo_multicast_loop; + if ((flags & IP_RAWOUTPUT) == 0) + ip->ip_ttl = ttl; + if 
(imo->imo_multicast_ifp != NULL) ifp = imo->imo_multicast_ifp; - } + IMO_UNLOCK(imo); #if MROUTING - if (imo->imo_multicast_vif != -1 && - ((flags & IP_RAWOUTPUT) == 0 || ip->ip_src.s_addr == INADDR_ANY)) - ip->ip_src.s_addr = - ip_mcast_src(imo->imo_multicast_vif); + if (vif != -1 && ((flags & IP_RAWOUTPUT) == 0 || + ip->ip_src.s_addr == INADDR_ANY)) + ip->ip_src.s_addr = ip_mcast_src(vif); #endif /* MROUTING */ - } else - if ((flags & IP_RAWOUTPUT) == 0) ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; + } else if ((flags & IP_RAWOUTPUT) == 0) { + vif = -1; + ip->ip_ttl = ttl; + } /* * Confirm that the outgoing interface supports multicast. */ - if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { + if (imo == NULL || vif == -1) { if ((ifp->if_flags & IFF_MULTICAST) == 0) { OSAddAtomic(1, &ipstat.ips_noroute); error = ENETUNREACH; @@ -682,11 +796,15 @@ ip_output_list( if (ip->ip_src.s_addr == INADDR_ANY) { struct in_ifaddr *ia1; lck_rw_lock_shared(in_ifaddr_rwlock); - TAILQ_FOREACH(ia1, &in_ifaddrhead, ia_link) + TAILQ_FOREACH(ia1, &in_ifaddrhead, ia_link) { + IFA_LOCK_SPIN(&ia1->ia_ifa); if (ia1->ia_ifp == ifp) { ip->ip_src = IA_SIN(ia1)->sin_addr; + IFA_UNLOCK(&ia1->ia_ifa); break; } + IFA_UNLOCK(&ia1->ia_ifa); + } lck_rw_done(in_ifaddr_rwlock); if (ip->ip_src.s_addr == INADDR_ANY) { error = ENETUNREACH; @@ -694,11 +812,10 @@ ip_output_list( } } - ifnet_lock_shared(ifp); - IN_LOOKUP_MULTI(pkt_dst, ifp, inm); - ifnet_lock_done(ifp); - if (inm != NULL && - (imo == NULL || imo->imo_multicast_loop)) { + in_multihead_lock_shared(); + IN_LOOKUP_MULTI(&pkt_dst, ifp, inm); + in_multihead_lock_done(); + if (inm != NULL && (imo == NULL || loop)) { /* * If we belong to the destination multicast group * on the outgoing interface, and the caller did not @@ -707,17 +824,16 @@ ip_output_list( if (!TAILQ_EMPTY(&ipv4_filters)) { struct ipfilter *filter; int seen = (inject_filter_ref == 0); - struct ipf_pktopts *ippo = 0, ipf_pktopts; - if (imo) { - ippo = &ipf_pktopts; - 
ipf_pktopts.ippo_mcast_ifnet = imo->imo_multicast_ifp; - ipf_pktopts.ippo_mcast_ttl = imo->imo_multicast_ttl; - ipf_pktopts.ippo_mcast_loop = imo->imo_multicast_loop; + if (imo != NULL) { + ipf_pktopts.ippo_flags |= IPPOF_MCAST_OPTS; + ipf_pktopts.ippo_mcast_ifnet = ifp; + ipf_pktopts.ippo_mcast_ttl = ttl; + ipf_pktopts.ippo_mcast_loop = loop; } - + ipf_ref(); - + /* 4135317 - always pass network byte order to filter */ #if BYTE_ORDER != BIG_ENDIAN @@ -734,15 +850,17 @@ ip_output_list( result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo); if (result == EJUSTRETURN) { ipf_unref(); + INM_REMREF(inm); goto done; } if (result != 0) { ipf_unref(); + INM_REMREF(inm); goto bad; } } } - + /* set back to host byte order */ ip = mtod(m, struct ip *); @@ -778,15 +896,18 @@ ip_output_list( * as prescribed by rsvpd. */ if (!rsvp_on) - imo = NULL; + imo = NULL; if (ip_mforward(ip, ifp, m, imo) != 0) { m_freem(m); + if (inm != NULL) + INM_REMREF(inm); goto done; } } } #endif /* MROUTING */ - + if (inm != NULL) + INM_REMREF(inm); /* * Multicasts with a time-to-live of zero may be looped- * back, above, but must not be transmitted on a network. @@ -808,7 +929,9 @@ ip_output_list( * of outgoing interface. 
*/ if (ip->ip_src.s_addr == INADDR_ANY) { + IFA_LOCK_SPIN(&ia->ia_ifa); ip->ip_src = IA_SIN(ia)->sin_addr; + IFA_UNLOCK(&ia->ia_ifa); #if IPFIREWALL_FORWARD /* Keep note that we did this - if the firewall changes * the next-hop, our interface may change, changing the @@ -847,26 +970,30 @@ ip_output_list( sendit: #if PF /* Invoke outbound packet filter */ - if (pf_af_hook(ifp, mppn, &m, AF_INET, FALSE) != 0) { - if (packetlist == m0) { - packetlist = m; - mppn = NULL; - } - if (m != NULL) { - m0 = m; - /* Next packet in the chain */ - goto loopit; - } else if (packetlist != NULL) { - /* No more packet; send down the chain */ - goto sendchain; + if ( PF_IS_ENABLED) { + int rc; + rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE); + if (rc != 0) { + if (packetlist == m0) { + packetlist = m; + mppn = NULL; + } + if (m != NULL) { + m0 = m; + /* Next packet in the chain */ + goto loopit; + } else if (packetlist != NULL) { + /* No more packet; send down the chain */ + goto sendchain; + } + /* Nothing left; we're done */ + goto done; } - /* Nothing left; we're done */ - goto done; + m0 = m; + ip = mtod(m, struct ip *); + pkt_dst = ip->ip_dst; + hlen = IP_VHL_HL(ip->ip_vhl) << 2; } - m0 = m; - ip = mtod(m, struct ip *); - pkt_dst = ip->ip_dst; - hlen = IP_VHL_HL(ip->ip_vhl) << 2; #endif /* PF */ /* * Force IP TTL to 255 following draft-ietf-zeroconf-ipv4-linklocal.txt @@ -882,7 +1009,8 @@ ip_output_list( if (!didfilter && !TAILQ_EMPTY(&ipv4_filters)) { struct ipfilter *filter; int seen = (inject_filter_ref == 0); - + ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS; + /* Check that a TSO frame isn't passed to a filter. * This could happen if a filter is inserted while * TCP is sending the TSO packet. 
@@ -907,7 +1035,7 @@ ip_output_list( seen = 1; } else if (filter->ipf_filter.ipf_output) { errno_t result; - result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, 0); + result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo); if (result == EJUSTRETURN) { ipf_unref(); goto done; @@ -1011,6 +1139,10 @@ ip_output_list( HTONS(ip->ip_off); #endif + DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL, + struct ip *, ip, struct ifnet *, ifp, + struct ip *, ip, struct ip6_hdr *, NULL); + error = ipsec4_output(&state, sp, flags); m0 = m = state.m; @@ -1071,7 +1203,7 @@ ip_output_list( rtfree(ro->ro_rt); ro->ro_rt = NULL; if (src_ia != NULL) - ifafree(&src_ia->ia_ifa); + IFA_REMREF(&src_ia->ia_ifa); } if (ro->ro_rt == NULL) { @@ -1085,11 +1217,14 @@ ip_output_list( } } else { if (ia) - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); RT_LOCK_SPIN(ro->ro_rt); ia = ifatoia(ro->ro_rt->rt_ifa); - if (ia) - ifaref(&ia->ia_ifa); + if (ia) { + /* Become a regular mutex */ + RT_CONVERT_LOCK(ro->ro_rt); + IFA_ADDREF(&ia->ia_ifa); + } ifp = ro->ro_rt->rt_ifp; RT_UNLOCK(ro->ro_rt); } @@ -1107,6 +1242,8 @@ ip_output_list( if (!TAILQ_EMPTY(&ipv4_filters)) { struct ipfilter *filter; + ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS; + /* Check that a TSO frame isn't passed to a filter. * This could happen if a filter is inserted while * TCP is sending the TSO packet. @@ -1128,7 +1265,7 @@ ip_output_list( TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { if (filter->ipf_filter.ipf_output) { errno_t result; - result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, 0); + result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo); if (result == EJUSTRETURN) { ipf_unref(); goto done; @@ -1310,31 +1447,35 @@ ip_output_list( * of ours, we pretend to * be the destination for this packet. 
*/ + IFA_LOCK_SPIN(&ia_fw->ia_ifa); if (IA_SIN(ia_fw)->sin_addr.s_addr == - dst->sin_addr.s_addr) + dst->sin_addr.s_addr) { + IFA_UNLOCK(&ia_fw->ia_ifa); break; + } + IFA_UNLOCK(&ia_fw->ia_ifa); } lck_rw_done(in_ifaddr_rwlock); if (ia_fw) { /* tell ip_input "dont filter" */ struct m_tag *fwd_tag; struct ip_fwd_tag *ipfwd_tag; - - fwd_tag = m_tag_alloc(KERNEL_MODULE_TAG_ID, + + fwd_tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, - sizeof (*ipfwd_tag), M_NOWAIT); + sizeof (*ipfwd_tag), M_NOWAIT, m); if (fwd_tag == NULL) { error = ENOBUFS; goto bad; } - + ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1); ipfwd_tag->next_hop = args.next_hop; m_tag_prepend(m, fwd_tag); if (m->m_pkthdr.rcvif == NULL) - m->m_pkthdr.rcvif = ifunit("lo0"); + m->m_pkthdr.rcvif = lo_ifp; if ((~IF_HWASSIST_CSUM_FLAGS(m->m_pkthdr.rcvif->if_hwassist) & m->m_pkthdr.csum_flags) == 0) { if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { @@ -1387,8 +1528,11 @@ ip_output_list( RT_LOCK_SPIN(ro_fwd->ro_rt); ia_fw = ifatoia(ro_fwd->ro_rt->rt_ifa); - if (ia_fw != NULL) - ifaref(&ia_fw->ia_ifa); + if (ia_fw != NULL) { + /* Become a regular mutex */ + RT_CONVERT_LOCK(ro_fwd->ro_rt); + IFA_ADDREF(&ia_fw->ia_ifa); + } ifp = ro_fwd->ro_rt->rt_ifp; ro_fwd->ro_rt->rt_use++; if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY) @@ -1412,9 +1556,12 @@ ip_output_list( * interface, do it again, from the new one. 
*/ if (ia_fw != NULL) { - if (fwd_rewrite_src) + if (fwd_rewrite_src) { + IFA_LOCK_SPIN(&ia_fw->ia_ifa); ip->ip_src = IA_SIN(ia_fw)->sin_addr; - ifafree(&ia_fw->ia_ifa); + IFA_UNLOCK(&ia_fw->ia_ifa); + } + IFA_REMREF(&ia_fw->ia_ifa); } goto pass ; } @@ -1427,9 +1574,9 @@ ip_output_list( error = EACCES; /* not sure this is the right error msg */ goto done; } -#endif /* IPFIREWALL */ pass: +#endif /* IPFIREWALL */ #if __APPLE__ /* Do not allow loopback address to wind up on a wire */ if ((ifp->if_flags & IFF_LOOPBACK) == 0 && @@ -1526,11 +1673,14 @@ ip_output_list( ipsec_delaux(m); #endif if (packetchain == 0) { + if (ro->ro_rt && nstat_collect) + nstat_route_tx(ro->ro_rt, 1, m->m_pkthdr.len, 0); error = ifnet_output(ifp, PF_INET, m, ro->ro_rt, (struct sockaddr *)dst); goto done; } else { /* packet chaining allows us to reuse the route for all packets */ + bytecnt += m->m_pkthdr.len; mppn = &m->m_nextpkt; m = m->m_nextpkt; if (m == NULL) { @@ -1539,10 +1689,13 @@ ip_output_list( #endif /* PF */ if (pktcnt > ip_maxchainsent) ip_maxchainsent = pktcnt; + if (ro->ro_rt && nstat_collect) + nstat_route_tx(ro->ro_rt, pktcnt, bytecnt, 0); //send error = ifnet_output(ifp, PF_INET, packetlist, ro->ro_rt, (struct sockaddr *)dst); pktcnt = 0; + bytecnt = 0; goto done; } @@ -1556,23 +1709,28 @@ ip_output_list( * Must be able to put at least 8 bytes per fragment. */ - if (ip->ip_off & IP_DF || (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4)) { + if (ip->ip_off & IP_DF || (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) || + pktcnt > 0) { error = EMSGSIZE; /* * This case can happen if the user changed the MTU - * * of an interface after enabling IP on it. Because * most netifs don't keep track of routes pointing to * them, there is no way for one to update all its * routes when the MTU is changed. 
*/ - RT_LOCK_SPIN(ro->ro_rt); - if (ro->ro_rt && (ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) - && !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) - && (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) { - ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; + if (ro->ro_rt) { + RT_LOCK_SPIN(ro->ro_rt); + if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) + && !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) + && (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) { + ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; + } + RT_UNLOCK(ro->ro_rt); + } + if (pktcnt > 0) { + m0 = packetlist; } - RT_UNLOCK(ro->ro_rt); OSAddAtomic(1, &ipstat.ips_cantfrag); goto bad; } @@ -1604,6 +1762,8 @@ ip_output_list( #endif if ((packetchain != 0) && (pktcnt > 0)) panic("ip_output: mix of packet in packetlist is wrong=%p", packetlist); + if (ro->ro_rt && nstat_collect) + nstat_route_tx(ro->ro_rt, 1, m->m_pkthdr.len, 0); error = ifnet_output(ifp, PF_INET, m, ro->ro_rt, (struct sockaddr *)dst); } else @@ -1615,7 +1775,7 @@ ip_output_list( done: if (ia) { - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); ia = NULL; } #if IPSEC @@ -1781,8 +1941,11 @@ in_delayed_cksum_offset(struct mbuf *m0, int ip_offset) struct ip *ip; unsigned char buf[sizeof(struct ip)]; u_short csum, offset, ip_len; - struct mbuf *m = m0; - + + /* Save copy of first mbuf pointer and the ip_offset before modifying */ + struct mbuf *m = m0; + int ip_offset_copy = ip_offset; + while (ip_offset >= m->m_len) { ip_offset -= m->m_len; m = m->m_next; @@ -1823,12 +1986,12 @@ in_delayed_cksum_offset(struct mbuf *m0, int ip_offset) * is bogus and we give up. 
*/ ip_len = ip->ip_len; - if (ip_len != (m0->m_pkthdr.len - ip_offset)) { + if (ip_len != (m0->m_pkthdr.len - ip_offset_copy)) { ip_len = SWAP16(ip_len); - if (ip_len != (m0->m_pkthdr.len - ip_offset)) { + if (ip_len != (m0->m_pkthdr.len - ip_offset_copy)) { printf("in_delayed_cksum_offset: ip_len %d (%d) " "doesn't match actual length %d\n", ip->ip_len, - ip_len, (m0->m_pkthdr.len - ip_offset)); + ip_len, (m0->m_pkthdr.len - ip_offset_copy)); return; } } @@ -1880,6 +2043,10 @@ in_cksum_offset(struct mbuf* m, size_t ip_offset) int hlen = 0; unsigned char buf[sizeof(struct ip)]; int swapped = 0; + + /* Save copy of first mbuf pointer and the ip_offset before modifying */ + struct mbuf* m0 = m; + size_t ip_offset_copy = ip_offset; while (ip_offset >= m->m_len) { ip_offset -= m->m_len; @@ -1927,15 +2094,15 @@ in_cksum_offset(struct mbuf* m, size_t ip_offset) * the length and check again. If it still fails, then the packet * is bogus and we give up. */ - if (ntohs(ip->ip_len) != (m->m_pkthdr.len - ip_offset)) { + if (ntohs(ip->ip_len) != (m0->m_pkthdr.len - ip_offset_copy)) { ip->ip_len = SWAP16(ip->ip_len); swapped = 1; - if (ntohs(ip->ip_len) != (m->m_pkthdr.len - ip_offset)) { + if (ntohs(ip->ip_len) != (m0->m_pkthdr.len - ip_offset_copy)) { ip->ip_len = SWAP16(ip->ip_len); printf("in_cksum_offset: ip_len %d (%d) " "doesn't match actual length %lu\n", ip->ip_len, SWAP16(ip->ip_len), - (m->m_pkthdr.len - ip_offset)); + (m0->m_pkthdr.len - ip_offset_copy)); return; } } @@ -2120,6 +2287,7 @@ ip_ctloutput(so, sopt) #if defined(NFAITH) && NFAITH > 0 case IP_FAITH: #endif + case IP_RECVPKTINFO: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) @@ -2164,6 +2332,9 @@ ip_ctloutput(so, sopt) OPTSET(INP_FAITH); break; #endif + case IP_RECVPKTINFO: + OPTSET(INP_PKTINFO); + break; } break; #undef OPTSET @@ -2200,14 +2371,14 @@ ip_ctloutput(so, sopt) break; } - if (sopt->sopt_valsize == 0 || ifname[0] == NULL) { + if (sopt->sopt_valsize == 0 || 
ifname[0] == '\0') { /* Unbind this socket from any interface */ ifscope = IFSCOPE_NONE; } else { ifnet_t ifp; /* Verify name is NULL terminated */ - if (ifname[sopt->sopt_valsize - 1] != NULL) { + if (ifname[sopt->sopt_valsize - 1] != '\0') { error = EINVAL; break; } @@ -2227,17 +2398,33 @@ ip_ctloutput(so, sopt) */ ifnet_release(ifp); } - ip_bindif(inp, ifscope); + inp_bindif(inp, ifscope); } break; #endif + /* + * Multicast socket options are processed by the in_mcast + * module. + */ case IP_MULTICAST_IF: + case IP_MULTICAST_IFINDEX: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: - error = ip_setmoptions(sopt, &inp->inp_moptions); + case IP_ADD_SOURCE_MEMBERSHIP: + case IP_DROP_SOURCE_MEMBERSHIP: + case IP_BLOCK_SOURCE: + case IP_UNBLOCK_SOURCE: + case IP_MSFILTER: + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + error = inp_setmoptions(inp, sopt); break; case IP_PORTRANGE: @@ -2277,10 +2464,6 @@ ip_ctloutput(so, sopt) struct mbuf *m; int optname; - if (sopt->sopt_valsize > MCLBYTES) { - error = EMSGSIZE; - break; - } if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ @@ -2306,13 +2489,11 @@ ip_ctloutput(so, sopt) break; if (background) { - socket_set_traffic_mgt_flags(so, - TRAFFIC_MGT_SO_BACKGROUND | - TRAFFIC_MGT_SO_BG_REGULATE); + socket_set_traffic_mgt_flags_locked(so, + TRAFFIC_MGT_SO_BACKGROUND); } else { - socket_clear_traffic_mgt_flags(so, - TRAFFIC_MGT_SO_BACKGROUND | - TRAFFIC_MGT_SO_BG_REGULATE); + socket_clear_traffic_mgt_flags_locked(so, + TRAFFIC_MGT_SO_BACKGROUND); } break; @@ -2331,11 +2512,11 @@ ip_ctloutput(so, sopt) * on the destination address type (e.g. unicast, multicast, * or broadcast if applicable) or whether or not the host is * directly reachable. 
Note that in the multicast transmit - * case, IP_MULTICAST_IF takes precedence over IP_BOUND_IF, - * since the former practically bypasses the routing table; - * in this case, IP_BOUND_IF sets the default interface used - * for sending multicast packets in the absence of an explicit - * transmit interface set via IP_MULTICAST_IF. + * case, IP_MULTICAST_{IF,IFINDEX} takes precedence over + * IP_BOUND_IF, since the former practically bypasses the + * routing table; in this case, IP_BOUND_IF sets the default + * interface used for sending multicast packets in the absence + * of an explicit multicast transmit interface. */ case IP_BOUND_IF: /* This option is settable only for IPv4 */ @@ -2350,7 +2531,28 @@ ip_ctloutput(so, sopt) if (error) break; - ip_bindif(inp, optval); + inp_bindif(inp, optval); + break; + + case IP_NO_IFT_CELLULAR: + /* This option is settable only for IPv4 */ + if (!(inp->inp_vflag & INP_IPV4)) { + error = EINVAL; + break; + } + + error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval)); + + if (error) + break; + + error = inp_nocellular(inp, optval); + break; + + case IP_OUT_IF: + /* This option is not settable */ + error = EINVAL; break; default: @@ -2383,6 +2585,7 @@ ip_ctloutput(so, sopt) #if defined(NFAITH) && NFAITH > 0 case IP_FAITH: #endif + case IP_RECVPKTINFO: switch (sopt->sopt_name) { case IP_TOS: @@ -2429,17 +2632,20 @@ ip_ctloutput(so, sopt) optval = OPTBIT(INP_FAITH); break; #endif + case IP_RECVPKTINFO: + optval = OPTBIT(INP_PKTINFO); + break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_MULTICAST_IF: + case IP_MULTICAST_IFINDEX: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: - case IP_ADD_MEMBERSHIP: - case IP_DROP_MEMBERSHIP: - error = ip_getmoptions(sopt, inp->inp_moptions); + case IP_MSFILTER: + error = inp_getmoptions(inp, sopt); break; #if IPSEC @@ -2465,7 +2671,7 @@ ip_ctloutput(so, sopt) #if TRAFFIC_MGT case IP_TRAFFIC_MGT_BACKGROUND: { - unsigned background = 
so->so_traffic_mgt_flags; + unsigned background = (so->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BACKGROUND); return (sooptcopyout(sopt, &background, sizeof(background))); break; } @@ -2477,6 +2683,16 @@ ip_ctloutput(so, sopt) error = sooptcopyout(sopt, &optval, sizeof (optval)); break; + case IP_NO_IFT_CELLULAR: + optval = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; + error = sooptcopyout(sopt, &optval, sizeof (optval)); + break; + + case IP_OUT_IF: + optval = inp->inp_last_outif; + error = sooptcopyout(sopt, &optval, sizeof (optval)); + break; + default: error = ENOPROTOOPT; break; @@ -2591,471 +2807,138 @@ ip_pcbopts( return (EINVAL); } -/* - * XXX - * The whole multicast option thing needs to be re-thought. - * Several of these options are equally applicable to non-multicast - * transmission, and one (IP_MULTICAST_TTL) totally duplicates a - * standard option (IP_TTL). - */ - -/* - * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index. - */ -static struct ifnet * -ip_multicast_if(a, ifindexp) - struct in_addr *a; - int *ifindexp; +void +ip_moptions_init(void) { - int ifindex; - struct ifnet *ifp; + PE_parse_boot_argn("ifa_debug", &imo_debug, sizeof (imo_debug)); - if (ifindexp) - *ifindexp = 0; - if (ntohl(a->s_addr) >> 24 == 0) { - ifindex = ntohl(a->s_addr) & 0xffffff; - ifnet_head_lock_shared(); - if (ifindex < 0 || if_index < ifindex) { - ifnet_head_done(); - return NULL; - } - ifp = ifindex2ifnet[ifindex]; - ifnet_head_done(); - if (ifindexp) - *ifindexp = ifindex; - } else { - INADDR_TO_IFP(*a, ifp); + imo_size = (imo_debug == 0) ? sizeof (struct ip_moptions) : + sizeof (struct ip_moptions_dbg); + + imo_zone = zinit(imo_size, IMO_ZONE_MAX * imo_size, 0, + IMO_ZONE_NAME); + if (imo_zone == NULL) { + panic("%s: failed allocating %s", __func__, IMO_ZONE_NAME); + /* NOTREACHED */ } - return ifp; + zone_change(imo_zone, Z_EXPAND, TRUE); } -/* - * Set the IP multicast options in response to user setsockopt(). 
- */ -static int -ip_setmoptions(sopt, imop) - struct sockopt *sopt; - struct ip_moptions **imop; +void +imo_addref(struct ip_moptions *imo, int locked) { - int error = 0; - struct in_addr addr; - struct ip_mreq mreq; - struct ifnet *ifp = NULL; - struct ip_moptions *imo = *imop; - int ifindex; - - if (imo == NULL) { - /* - * No multicast option buffer attached to the pcb; - * allocate one and initialize to default values. - */ - error = ip_createmoptions(imop); - if (error != 0) - return error; - imo = *imop; - } - - switch (sopt->sopt_name) { - /* store an index number for the vif you wanna use in the send */ -#if MROUTING - case IP_MULTICAST_VIF: - { - int i; - if (legal_vif_num == 0) { - error = EOPNOTSUPP; - break; - } - error = sooptcopyin(sopt, &i, sizeof i, sizeof i); - if (error) - break; - if (!legal_vif_num(i) && (i != -1)) { - error = EINVAL; - break; - } - imo->imo_multicast_vif = i; - break; - } -#endif /* MROUTING */ - - case IP_MULTICAST_IF: - /* - * Select the interface for outgoing multicast packets. - */ - error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr); - if (error) - break; - /* - * INADDR_ANY is used to remove a previous selection. - * When no interface is selected, a default one is - * chosen every time a multicast packet is sent. - */ - if (addr.s_addr == INADDR_ANY) { - imo->imo_multicast_ifp = NULL; - break; - } - /* - * The selected interface is identified by its local - * IP address. Find the interface and confirm that - * it supports multicasting. - */ - ifp = ip_multicast_if(&addr, &ifindex); - if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { - error = EADDRNOTAVAIL; - break; - } - imo->imo_multicast_ifp = ifp; - if (ifindex) - imo->imo_multicast_addr = addr; - else - imo->imo_multicast_addr.s_addr = INADDR_ANY; - break; - - case IP_MULTICAST_TTL: - /* - * Set the IP time-to-live for outgoing multicast packets. 
- * The original multicast API required a char argument, - * which is inconsistent with the rest of the socket API. - * We allow either a char or an int. - */ - if (sopt->sopt_valsize == 1) { - u_char ttl; - error = sooptcopyin(sopt, &ttl, 1, 1); - if (error) - break; - imo->imo_multicast_ttl = ttl; - } else { - u_int ttl; - error = sooptcopyin(sopt, &ttl, sizeof ttl, - sizeof ttl); - if (error) - break; - if (ttl > 255) - error = EINVAL; - else - imo->imo_multicast_ttl = ttl; - } - break; - - case IP_MULTICAST_LOOP: - /* - * Set the loopback flag for outgoing multicast packets. - * Must be zero or one. The original multicast API required a - * char argument, which is inconsistent with the rest - * of the socket API. We allow either a char or an int. - */ - if (sopt->sopt_valsize == 1) { - u_char loop; - error = sooptcopyin(sopt, &loop, 1, 1); - if (error) - break; - imo->imo_multicast_loop = !!loop; - } else { - u_int loop; - error = sooptcopyin(sopt, &loop, sizeof loop, - sizeof loop); - if (error) - break; - imo->imo_multicast_loop = !!loop; - } - break; - - case IP_ADD_MEMBERSHIP: - /* - * Add a multicast group membership. - * Group must be a valid IP multicast address. - */ - error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); - if (error) - break; - - error = ip_addmembership(imo, &mreq); - break; - - case IP_DROP_MEMBERSHIP: - /* - * Drop a multicast group membership. - * Group must be a valid IP multicast address. - */ - error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); - if (error) - break; - - error = ip_dropmembership(imo, &mreq); - break; - - default: - error = EOPNOTSUPP; - break; - } + if (!locked) + IMO_LOCK(imo); + else + IMO_LOCK_ASSERT_HELD(imo); - /* - * If all options have default values, no need to keep the mbuf. 
- */ - if (imo->imo_multicast_ifp == NULL && - imo->imo_multicast_vif == (u_int32_t)-1 && - imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL && - imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP && - imo->imo_num_memberships == 0) { - FREE(*imop, M_IPMOPTS); - *imop = NULL; + if (++imo->imo_refcnt == 0) { + panic("%s: imo %p wraparound refcnt\n", __func__, imo); + /* NOTREACHED */ + } else if (imo->imo_trace != NULL) { + (*imo->imo_trace)(imo, TRUE); } - return (error); + if (!locked) + IMO_UNLOCK(imo); } -/* - * Set the IP multicast options in response to user setsockopt(). - */ -__private_extern__ int -ip_createmoptions( - struct ip_moptions **imop) -{ - struct ip_moptions *imo; - imo = (struct ip_moptions*) _MALLOC(sizeof(*imo), M_IPMOPTS, - M_WAITOK); - - if (imo == NULL) - return (ENOBUFS); - *imop = imo; - imo->imo_multicast_ifp = NULL; - imo->imo_multicast_addr.s_addr = INADDR_ANY; - imo->imo_multicast_vif = -1; - imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; - imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; - imo->imo_num_memberships = 0; - - return 0; -} - -/* - * Add membership to an IPv4 multicast. - */ -__private_extern__ int -ip_addmembership( - struct ip_moptions *imo, - struct ip_mreq *mreq) +void +imo_remref(struct ip_moptions *imo) { - struct route ro; - struct sockaddr_in *dst; - struct ifnet *ifp = NULL; - int error = 0; int i; - bzero((caddr_t)&ro, sizeof(ro)); - - if (!IN_MULTICAST(ntohl(mreq->imr_multiaddr.s_addr))) { - error = EINVAL; - goto done; - } - /* - * If no interface address was provided, use the interface of - * the route to the given multicast address. 
- */ - if (mreq->imr_interface.s_addr == INADDR_ANY) { - dst = (struct sockaddr_in *)&ro.ro_dst; - dst->sin_len = sizeof(*dst); - dst->sin_family = AF_INET; - dst->sin_addr = mreq->imr_multiaddr; - rtalloc_ign(&ro, 0); - if (ro.ro_rt != NULL) { - ifp = ro.ro_rt->rt_ifp; - } else { - /* If there's no default route, try using loopback */ - mreq->imr_interface.s_addr = htonl(INADDR_LOOPBACK); - } + IMO_LOCK(imo); + if (imo->imo_refcnt == 0) { + panic("%s: imo %p negative refcnt", __func__, imo); + /* NOTREACHED */ + } else if (imo->imo_trace != NULL) { + (*imo->imo_trace)(imo, FALSE); } - if (ifp == NULL) { - ifp = ip_multicast_if(&mreq->imr_interface, NULL); + --imo->imo_refcnt; + if (imo->imo_refcnt > 0) { + IMO_UNLOCK(imo); + return; } - /* - * See if we found an interface, and confirm that it - * supports multicast. - */ - if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { - error = EADDRNOTAVAIL; - goto done; - } - /* - * See if the membership already exists or if all the - * membership slots are full. - */ for (i = 0; i < imo->imo_num_memberships; ++i) { - if (imo->imo_membership[i]->inm_ifp == ifp && - imo->imo_membership[i]->inm_addr.s_addr - == mreq->imr_multiaddr.s_addr) - break; - } - if (i < imo->imo_num_memberships) { - error = EADDRINUSE; - goto done; - } - if (i == IP_MAX_MEMBERSHIPS) { - error = ETOOMANYREFS; - goto done; - } - /* - * Everything looks good; add a new record to the multicast - * address list for the given interface. - */ - if ((imo->imo_membership[i] = - in_addmulti(&mreq->imr_multiaddr, ifp)) == NULL) { - error = ENOBUFS; - goto done; - } - ++imo->imo_num_memberships; + struct in_mfilter *imf; -done: - if (ro.ro_rt != NULL) - rtfree(ro.ro_rt); + imf = imo->imo_mfilters ? &imo->imo_mfilters[i] : NULL; + if (imf != NULL) + imf_leave(imf); - return error; -} + (void) in_leavegroup(imo->imo_membership[i], imf); -/* - * Drop membership of an IPv4 multicast. 
- */ -__private_extern__ int -ip_dropmembership( - struct ip_moptions *imo, - struct ip_mreq *mreq) -{ - int error = 0; - struct ifnet* ifp = NULL; - int i; - - if (!IN_MULTICAST(ntohl(mreq->imr_multiaddr.s_addr))) { - error = EINVAL; - return error; - } + if (imf != NULL) + imf_purge(imf); - /* - * If an interface address was specified, get a pointer - * to its ifnet structure. - */ - if (mreq->imr_interface.s_addr == INADDR_ANY) - ifp = NULL; - else { - ifp = ip_multicast_if(&mreq->imr_interface, NULL); - if (ifp == NULL) { - error = EADDRNOTAVAIL; - return error; - } + INM_REMREF(imo->imo_membership[i]); + imo->imo_membership[i] = NULL; } - /* - * Find the membership in the membership array. - */ - for (i = 0; i < imo->imo_num_memberships; ++i) { - if ((ifp == NULL || - imo->imo_membership[i]->inm_ifp == ifp) && - imo->imo_membership[i]->inm_addr.s_addr == - mreq->imr_multiaddr.s_addr) - break; + imo->imo_num_memberships = 0; + if (imo->imo_mfilters != NULL) { + FREE(imo->imo_mfilters, M_INMFILTER); + imo->imo_mfilters = NULL; } - if (i == imo->imo_num_memberships) { - error = EADDRNOTAVAIL; - return error; + if (imo->imo_membership != NULL) { + FREE(imo->imo_membership, M_IPMOPTS); + imo->imo_membership = NULL; } - /* - * Give up the multicast address record to which the - * membership points. - */ - in_delmulti(&imo->imo_membership[i]); - /* - * Remove the gap in the membership array. - */ - for (++i; i < imo->imo_num_memberships; ++i) - imo->imo_membership[i-1] = imo->imo_membership[i]; - --imo->imo_num_memberships; - - return error; -} - -/* - * Return the IP multicast options in response to user getsockopt(). 
- */ -static int -ip_getmoptions(sopt, imo) - struct sockopt *sopt; - register struct ip_moptions *imo; -{ - struct in_addr addr; - struct in_ifaddr *ia; - int error, optval; - u_char coptval; - - error = 0; - switch (sopt->sopt_name) { -#if MROUTING - case IP_MULTICAST_VIF: - if (imo != NULL) - optval = imo->imo_multicast_vif; - else - optval = -1; - error = sooptcopyout(sopt, &optval, sizeof optval); - break; -#endif /* MROUTING */ + IMO_UNLOCK(imo); - case IP_MULTICAST_IF: - if (imo == NULL || imo->imo_multicast_ifp == NULL) - addr.s_addr = INADDR_ANY; - else if (imo->imo_multicast_addr.s_addr) { - /* return the value user has set */ - addr = imo->imo_multicast_addr; - } else { - IFP_TO_IA(imo->imo_multicast_ifp, ia); - addr.s_addr = (ia == NULL) ? INADDR_ANY - : IA_SIN(ia)->sin_addr.s_addr; - if (ia != NULL) - ifafree(&ia->ia_ifa); - } - error = sooptcopyout(sopt, &addr, sizeof addr); - break; - - case IP_MULTICAST_TTL: - if (imo == 0) - optval = coptval = IP_DEFAULT_MULTICAST_TTL; - else - optval = coptval = imo->imo_multicast_ttl; - if (sopt->sopt_valsize == 1) - error = sooptcopyout(sopt, &coptval, 1); - else - error = sooptcopyout(sopt, &optval, sizeof optval); - break; + lck_mtx_destroy(&imo->imo_lock, ifa_mtx_grp); - case IP_MULTICAST_LOOP: - if (imo == 0) - optval = coptval = IP_DEFAULT_MULTICAST_LOOP; - else - optval = coptval = imo->imo_multicast_loop; - if (sopt->sopt_valsize == 1) - error = sooptcopyout(sopt, &coptval, 1); - else - error = sooptcopyout(sopt, &optval, sizeof optval); - break; + if (!(imo->imo_debug & IFD_ALLOC)) { + panic("%s: imo %p cannot be freed", __func__, imo); + /* NOTREACHED */ + } + zfree(imo_zone, imo); +} - default: - error = ENOPROTOOPT; - break; +static void +imo_trace(struct ip_moptions *imo, int refhold) +{ + struct ip_moptions_dbg *imo_dbg = (struct ip_moptions_dbg *)imo; + ctrace_t *tr; + u_int32_t idx; + u_int16_t *cnt; + + if (!(imo->imo_debug & IFD_DEBUG)) { + panic("%s: imo %p has no debug structure", __func__, 
imo); + /* NOTREACHED */ + } + if (refhold) { + cnt = &imo_dbg->imo_refhold_cnt; + tr = imo_dbg->imo_refhold; + } else { + cnt = &imo_dbg->imo_refrele_cnt; + tr = imo_dbg->imo_refrele; } - return (error); + + idx = atomic_add_16_ov(cnt, 1) % IMO_TRACE_HIST_SIZE; + ctrace_record(&tr[idx]); } -/* - * Discard the IP multicast options. - */ -void -ip_freemoptions(imo) - register struct ip_moptions *imo; +struct ip_moptions * +ip_allocmoptions(int how) { - register int i; + struct ip_moptions *imo; + imo = (how == M_WAITOK) ? zalloc(imo_zone) : zalloc_noblock(imo_zone); if (imo != NULL) { - for (i = 0; i < imo->imo_num_memberships; ++i) - in_delmulti(&imo->imo_membership[i]); - FREE(imo, M_IPMOPTS); + bzero(imo, imo_size); + lck_mtx_init(&imo->imo_lock, ifa_mtx_grp, ifa_mtx_attr); + imo->imo_debug |= IFD_ALLOC; + if (imo_debug != 0) { + imo->imo_debug |= IFD_DEBUG; + imo->imo_trace = imo_trace; + } + IMO_ADDREF(imo); } + + return (imo); } /* @@ -3174,6 +3057,8 @@ ip_mloopback(ifp, m, dst, hlen) * without any locks based on the assumption that ip_output() is single- * threaded per-pcb, i.e. for any given pcb there can only be one thread * performing output at the IP layer. + * + * This routine is analogous to in6_selectroute() for IPv6. 
*/ static struct ifaddr * in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) @@ -3215,9 +3100,9 @@ in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) */ if (scope == IFSCOPE_NONE) { scope = rt_ifp->if_index; - if (scope != get_primary_ifscope() && + if (scope != get_primary_ifscope(AF_INET) && ro->ro_rt->generation_id != route_generation) - scope = get_primary_ifscope(); + scope = get_primary_ifscope(AF_INET); } ifa = (struct ifaddr *)ifa_foraddr_scoped(src.s_addr, scope); @@ -3232,7 +3117,7 @@ in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) */ ifa = (struct ifaddr *)ifa_foraddr(src.s_addr); if (ifa != NULL) { - ifafree(ifa); + IFA_REMREF(ifa); ifa = NULL; ifscope = IFSCOPE_NONE; } @@ -3240,16 +3125,14 @@ in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) if (ip_select_srcif_debug && ifa != NULL) { if (ro->ro_rt != NULL) { - printf("%s->%s ifscope %d->%d ifa_if %s%d " - "ro_if %s%d\n", s_src, s_dst, ifscope, - scope, ifa->ifa_ifp->if_name, - ifa->ifa_ifp->if_unit, rt_ifp->if_name, - rt_ifp->if_unit); + printf("%s->%s ifscope %d->%d ifa_if %s " + "ro_if %s\n", s_src, s_dst, ifscope, + scope, if_name(ifa->ifa_ifp), + if_name(rt_ifp)); } else { - printf("%s->%s ifscope %d->%d ifa_if %s%d\n", + printf("%s->%s ifscope %d->%d ifa_if %s\n", s_src, s_dst, ifscope, scope, - ifa->ifa_ifp->if_name, - ifa->ifa_ifp->if_unit); + if_name(ifa->ifa_ifp)); } } } @@ -3296,7 +3179,7 @@ in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) if (ifa->ifa_ifp != rt->rt_ifp) { oifa = ifa; ifa = rt->rt_ifa; - ifaref(ifa); + IFA_ADDREF(ifa); RT_UNLOCK(rt); } else { RT_UNLOCK(rt); @@ -3322,8 +3205,8 @@ in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) * as well as the route interface * address, and use this instead. 
*/ - ifafree(oifa); - ifafree(ifa); + IFA_REMREF(oifa); + IFA_REMREF(ifa); ifa = iifa; } else if (!ipforwarding || (rt->rt_flags & RTF_GATEWAY)) { @@ -3334,7 +3217,7 @@ in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) * original one, and let the caller * do a scoped route lookup. */ - ifafree(ifa); + IFA_REMREF(ifa); ifa = oifa; } else { /* @@ -3347,7 +3230,7 @@ in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) * the original one and use the route * interface address instead. */ - ifafree(oifa); + IFA_REMREF(oifa); } } } else if (ifa != NULL && ro->ro_rt != NULL && @@ -3359,15 +3242,14 @@ in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) * as the interface used by the known route; drop the * original one and use the route interface address. */ - ifafree(ifa); + IFA_REMREF(ifa); ifa = ro->ro_rt->rt_ifa; - ifaref(ifa); + IFA_ADDREF(ifa); } if (ip_select_srcif_debug && ifa != NULL) { - printf("%s->%s ifscope %d ifa_if %s%d\n", - s_src, s_dst, ifscope, ifa->ifa_ifp->if_name, - ifa->ifa_ifp->if_unit); + printf("%s->%s ifscope %d ifa_if %s\n", + s_src, s_dst, ifscope, if_name(ifa->ifa_ifp)); } } @@ -3384,16 +3266,14 @@ in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) !(ro->ro_rt->rt_flags & RTF_UP))) { if (ip_select_srcif_debug) { if (ifa != NULL) { - printf("%s->%s ifscope %d ro_if %s%d != " - "ifa_if %s%d (cached route cleared)\n", - s_src, s_dst, ifscope, rt_ifp->if_name, - rt_ifp->if_unit, ifa->ifa_ifp->if_name, - ifa->ifa_ifp->if_unit); + printf("%s->%s ifscope %d ro_if %s != " + "ifa_if %s (cached route cleared)\n", + s_src, s_dst, ifscope, if_name(rt_ifp), + if_name(ifa->ifa_ifp)); } else { - printf("%s->%s ifscope %d ro_if %s%d " + printf("%s->%s ifscope %d ro_if %s " "(no ifa_if found)\n", - s_src, s_dst, ifscope, rt_ifp->if_name, - rt_ifp->if_unit); + s_src, s_dst, ifscope, if_name(rt_ifp)); } } @@ -3414,7 +3294,7 @@ in_selectsrcif(struct ip *ip, struct route *ro, unsigned int 
ifscope) */ if (IN_LINKLOCAL(ntohl(dst.s_addr)) && !IN_LINKLOCAL(ntohl(src.s_addr)) && ifa != NULL) { - ifafree(ifa); + IFA_REMREF(ifa); ifa = NULL; } } @@ -3444,31 +3324,3 @@ in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) return (ifa); } - -/* - * Handler for setting IP_FORCE_OUT_IFP or IP_BOUND_IF socket option. - */ -static void -ip_bindif(struct inpcb *inp, unsigned int ifscope) -{ - /* - * A zero interface scope value indicates an "unbind". - * Otherwise, take in whatever value the app desires; - * the app may already know the scope (or force itself - * to such a scope) ahead of time before the interface - * gets attached. It doesn't matter either way; any - * route lookup from this point on will require an - * exact match for the embedded interface scope. - */ - inp->inp_boundif = ifscope; - if (inp->inp_boundif == IFSCOPE_NONE) - inp->inp_flags &= ~INP_BOUND_IF; - else - inp->inp_flags |= INP_BOUND_IF; - - /* Blow away any cached route in the PCB */ - if (inp->inp_route.ro_rt != NULL) { - rtfree(inp->inp_route.ro_rt); - inp->inp_route.ro_rt = NULL; - } -} diff --git a/bsd/netinet/ip_var.h b/bsd/netinet/ip_var.h index 9d4044d4c..971a88126 100644 --- a/bsd/netinet/ip_var.h +++ b/bsd/netinet/ip_var.h @@ -121,7 +121,7 @@ struct ipq { */ #endif /* KERNEL_PRIVATE */ #define MAX_IPOPTLEN 40 -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE struct ipoption { struct in_addr ipopt_dst; /* first-hop dst if source routed */ @@ -133,21 +133,57 @@ struct ipoption { * passed to ip_output when IP multicast options are in use. */ struct ip_moptions { + decl_lck_mtx_data(, imo_lock); + uint32_t imo_refcnt; /* ref count */ + uint32_t imo_debug; /* see ifa_debug flags */ struct ifnet *imo_multicast_ifp; /* ifp for outgoing multicasts */ u_char imo_multicast_ttl; /* TTL for outgoing multicasts */ u_char imo_multicast_loop; /* 1 => hear sends if a member */ u_short imo_num_memberships; /* no. 
memberships this socket */ - struct in_multi *imo_membership[IP_MAX_MEMBERSHIPS]; - u_int32_t imo_multicast_vif; /* vif num outgoing multicasts */ + u_short imo_max_memberships; /* max memberships this socket */ + struct in_multi **imo_membership; /* group memberships */ + struct in_mfilter *imo_mfilters; /* source filters */ + u_int32_t imo_multicast_vif; /* vif num outgoing multicasts */ struct in_addr imo_multicast_addr; /* ifindex/addr on MULTICAST_IF */ + void (*imo_trace) /* callback fn for tracing refs */ + (struct ip_moptions *, int); }; +#define IMO_LOCK_ASSERT_HELD(_imo) \ + lck_mtx_assert(&(_imo)->imo_lock, LCK_MTX_ASSERT_OWNED) + +#define IMO_LOCK_ASSERT_NOTHELD(_imo) \ + lck_mtx_assert(&(_imo)->imo_lock, LCK_MTX_ASSERT_NOTOWNED) + +#define IMO_LOCK(_imo) \ + lck_mtx_lock(&(_imo)->imo_lock) + +#define IMO_LOCK_SPIN(_imo) \ + lck_mtx_lock_spin(&(_imo)->imo_lock) + +#define IMO_CONVERT_LOCK(_imo) do { \ + IMO_LOCK_ASSERT_HELD(_imo); \ + lck_mtx_convert_spin(&(_imo)->imo_lock); \ +} while (0) + +#define IMO_UNLOCK(_imo) \ + lck_mtx_unlock(&(_imo)->imo_lock) + +#define IMO_ADDREF(_imo) \ + imo_addref(_imo, 0) + +#define IMO_ADDREF_LOCKED(_imo) \ + imo_addref(_imo, 1) + +#define IMO_REMREF(_imo) \ + imo_remref(_imo) + /* mbuf tag for ip_forwarding info */ struct ip_fwd_tag { struct sockaddr_in *next_hop; /* next_hop */ }; -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ struct ipstat { u_int32_t ips_total; /* total packets received */ @@ -179,6 +215,9 @@ struct ipstat { u_int32_t ips_notmember; /* multicasts for unregistered grps */ u_int32_t ips_nogif; /* no match gif found */ u_int32_t ips_badaddr; /* invalid address on header */ +#ifdef PRIVATE + u_int32_t ips_pktdropcntrl; /* pkt dropped, no mbufs for control data */ +#endif /* PRIVATE */ }; struct ip_linklocal_stat { @@ -206,7 +245,8 @@ struct sockopt; * Extra information passed to ip_output when IP_OUTARGS is set. 
*/ struct ip_out_args { - unsigned int ipoa_ifscope; /* interface scope */ + unsigned int ipoa_boundif; /* bound outgoing interface */ + unsigned int ipoa_nocell; /* don't use IFT_CELLULAR */ }; extern struct ipstat ipstat; @@ -224,9 +264,15 @@ extern int rsvp_on; extern struct pr_usrreqs rip_usrreqs; extern int ip_doscopedroute; +extern void ip_moptions_init(void); +extern struct ip_moptions *ip_allocmoptions(int); +extern int inp_getmoptions(struct inpcb *, struct sockopt *); +extern int inp_setmoptions(struct inpcb *, struct sockopt *); +extern void imo_addref(struct ip_moptions *, int); +extern void imo_remref(struct ip_moptions *); + int ip_ctloutput(struct socket *, struct sockopt *sopt); void ip_drain(void); -void ip_freemoptions(struct ip_moptions *); void ip_init(void) __attribute__((section("__TEXT, initcode"))); extern int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *); @@ -235,7 +281,7 @@ extern int ip_output(struct mbuf *, struct mbuf *, struct route *, int, extern int ip_output_list(struct mbuf *, int, struct mbuf *, struct route *, int, struct ip_moptions *, struct ip_out_args *); struct in_ifaddr *ip_rtaddr(struct in_addr); -void ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *, +int ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *, struct mbuf *); void ip_slowtimo(void); struct mbuf * diff --git a/bsd/netinet/kpi_ipfilter.c b/bsd/netinet/kpi_ipfilter.c index 6aea8ccf2..b03f56cd1 100644 --- a/bsd/netinet/kpi_ipfilter.c +++ b/bsd/netinet/kpi_ipfilter.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2004-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -235,7 +235,7 @@ ipf_inject_input( } if (filter_ref == 0 && m->m_pkthdr.rcvif == 0) { - m->m_pkthdr.rcvif = ifunit("lo0"); + m->m_pkthdr.rcvif = lo_ifp; m->m_pkthdr.csum_data = 0; m->m_pkthdr.csum_flags = 0; if (vers == 4) { @@ -245,8 +245,8 @@ ipf_inject_input( } } if (filter_ref != 0) { - mtag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFILT, - sizeof (ipfilter_t), M_NOWAIT); + mtag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFILT, + sizeof (ipfilter_t), M_NOWAIT, m); if (mtag == NULL) { error = ENOMEM; goto done; @@ -262,58 +262,54 @@ ipf_inject_input( } static errno_t -ipf_injectv4_out( - mbuf_t data, - ipfilter_t filter_ref, - ipf_pktopts_t options) +ipf_injectv4_out(mbuf_t data, ipfilter_t filter_ref, ipf_pktopts_t options) { struct route ro; - struct sockaddr_in *sin = (struct sockaddr_in*)&ro.ro_dst; struct ip *ip; struct mbuf *m = (struct mbuf*)data; errno_t error = 0; - struct m_tag *mtag = 0; - struct ip_moptions *imo = 0, ip_moptions; - + struct m_tag *mtag = NULL; + struct ip_moptions *imo = NULL; + struct ip_out_args ipoa = { IFSCOPE_NONE, 0 }; + /* Make the IP header contiguous in the mbuf */ - if ((size_t)m->m_len < sizeof(struct ip)) { - m = m_pullup(m, sizeof(struct ip)); - if (m == NULL) return ENOMEM; + if ((size_t)m->m_len < sizeof (struct ip)) { + m = m_pullup(m, sizeof (struct ip)); + if (m == NULL) + return (ENOMEM); } - ip = (struct ip*)m_mtod(m); - + ip = (struct ip *)m_mtod(m); + if (filter_ref != 0) { - mtag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFILT, - sizeof (ipfilter_t), M_NOWAIT); + mtag = m_tag_create(KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_IPFILT, sizeof (ipfilter_t), M_NOWAIT, m); if (mtag == NULL) { m_freem(m); - return ENOMEM; + return (ENOMEM); } - *(ipfilter_t*)(mtag+1) = filter_ref; + *(ipfilter_t *)(mtag + 1) = filter_ref; m_tag_prepend(m, mtag); } - - if (options && (options->ippo_flags & IPPOF_MCAST_OPTS)) { - imo = &ip_moptions; - - 
bzero(imo, sizeof(struct ip6_moptions)); + + if (options != NULL && (options->ippo_flags & IPPOF_MCAST_OPTS) && + (imo = ip_allocmoptions(M_DONTWAIT)) != NULL) { imo->imo_multicast_ifp = options->ippo_mcast_ifnet; imo->imo_multicast_ttl = options->ippo_mcast_ttl; imo->imo_multicast_loop = options->ippo_mcast_loop; } - - /* Fill out a route structure and get a route */ - bzero(&ro, sizeof(struct route)); - sin->sin_len = sizeof(struct sockaddr_in); - sin->sin_family = AF_INET; - sin->sin_port = 0; - sin->sin_addr = ip->ip_dst; - rtalloc(&ro); - if (ro.ro_rt == NULL) { - m_freem(m); - return ENETUNREACH; + + if (options != NULL && + (options->ippo_flags & (IPPOF_BOUND_IF | IPPOF_NO_IFT_CELLULAR))) { + if (options->ippo_flags & IPPOF_BOUND_IF) { + ipoa.ipoa_boundif = options->ippo_flags >> + IPPOF_SHIFT_IFSCOPE; + } + if (options->ippo_flags & IPPOF_NO_IFT_CELLULAR) + ipoa.ipoa_nocell = 1; } - + + bzero(&ro, sizeof(struct route)); + /* Put ip_len and ip_off in host byte order, ip_output expects that */ #if BYTE_ORDER != BIG_ENDIAN @@ -321,88 +317,85 @@ ipf_injectv4_out( NTOHS(ip->ip_off); #endif - /* Send */ - error = ip_output(m, NULL, &ro, IP_ALLOWBROADCAST | IP_RAWOUTPUT, imo, NULL); - + /* Send; enforce source interface selection via IP_OUTARGS flag */ + error = ip_output(m, NULL, &ro, + IP_ALLOWBROADCAST | IP_RAWOUTPUT | IP_OUTARGS, imo, &ipoa); + /* Release the route */ if (ro.ro_rt) rtfree(ro.ro_rt); - - return error; + + if (imo != NULL) + IMO_REMREF(imo); + + return (error); } #if INET6 static errno_t -ipf_injectv6_out( - mbuf_t data, - ipfilter_t filter_ref, - ipf_pktopts_t options) +ipf_injectv6_out(mbuf_t data, ipfilter_t filter_ref, ipf_pktopts_t options) { struct route_in6 ro; - struct sockaddr_in6 *sin6 = &ro.ro_dst; struct ip6_hdr *ip6; struct mbuf *m = (struct mbuf*)data; errno_t error = 0; - struct m_tag *mtag = 0; - struct ip6_moptions *im6o = 0, ip6_moptions; - + struct m_tag *mtag = NULL; + struct ip6_moptions *im6o = NULL; + struct ip6_out_args 
ip6oa = { IFSCOPE_NONE, 0 }; + /* Make the IP header contiguous in the mbuf */ if ((size_t)m->m_len < sizeof(struct ip6_hdr)) { m = m_pullup(m, sizeof(struct ip6_hdr)); - if (m == NULL) return ENOMEM; + if (m == NULL) + return (ENOMEM); } ip6 = (struct ip6_hdr*)m_mtod(m); if (filter_ref != 0) { - mtag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFILT, - sizeof (ipfilter_t), M_NOWAIT); + mtag = m_tag_create(KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_IPFILT, sizeof (ipfilter_t), M_NOWAIT, m); if (mtag == NULL) { m_freem(m); - return ENOMEM; + return (ENOMEM); } - *(ipfilter_t*)(mtag+1) = filter_ref; + *(ipfilter_t *)(mtag + 1) = filter_ref; m_tag_prepend(m, mtag); } - - if (options && (options->ippo_flags & IPPOF_MCAST_OPTS)) { - im6o = &ip6_moptions; - - bzero(im6o, sizeof(struct ip6_moptions)); + + if (options != NULL && (options->ippo_flags & IPPOF_MCAST_OPTS) && + (im6o = ip6_allocmoptions(M_DONTWAIT)) != NULL) { im6o->im6o_multicast_ifp = options->ippo_mcast_ifnet; im6o->im6o_multicast_hlim = options->ippo_mcast_ttl; im6o->im6o_multicast_loop = options->ippo_mcast_loop; } - - - /* Fill out a route structure and get a route */ - bzero(&ro, sizeof(struct route_in6)); - sin6->sin6_len = sizeof(struct sockaddr_in6); - sin6->sin6_family = AF_INET6; - sin6->sin6_addr = ip6->ip6_dst; -#if 0 - /* This is breaks loopback multicast! 
*/ - /* The scope ID should already at s6_addr16[1] */ - if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst)) { - /* Hack, pull the scope_id out of the dest addr */ - sin6->sin6_scope_id = ntohs(ip6->ip6_dst.s6_addr16[1]); - ip6->ip6_dst.s6_addr16[1] = 0; - } else - sin6->sin6_scope_id = 0; -#endif - rtalloc((struct route*)&ro); - if (ro.ro_rt == NULL) { - m_freem(m); - return ENETUNREACH; + + if (options != NULL && + (options->ippo_flags & (IPPOF_BOUND_IF | IPPOF_NO_IFT_CELLULAR))) { + if (options->ippo_flags & IPPOF_BOUND_IF) { + ip6oa.ip6oa_boundif = options->ippo_flags >> + IPPOF_SHIFT_IFSCOPE; + } + if (options->ippo_flags & IPPOF_NO_IFT_CELLULAR) + ip6oa.ip6oa_nocell = 1; } - - /* Send */ - error = ip6_output(m, NULL, &ro, 0, im6o, NULL, 0); - + + bzero(&ro, sizeof(struct route_in6)); + + /* + * Send mbuf and ifscope information. Check for correctness + * of ifscope information is done while searching for a route in + * ip6_output. + */ + error = ip6_output(m, NULL, &ro, IPV6_OUTARGS, im6o, NULL, &ip6oa); + /* Release the route */ if (ro.ro_rt) rtfree(ro.ro_rt); - - return error; + + if (im6o != NULL) + IM6O_REMREF(im6o); + + return (error); } #endif /* INET6 */ diff --git a/bsd/netinet/kpi_ipfilter.h b/bsd/netinet/kpi_ipfilter.h index 3d2aaaac9..1f7fae6f0 100644 --- a/bsd/netinet/kpi_ipfilter.h +++ b/bsd/netinet/kpi_ipfilter.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2008-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -50,7 +50,12 @@ struct ipf_pktopts { int ippo_mcast_loop; u_int8_t ippo_mcast_ttl; }; -#define IPPOF_MCAST_OPTS 0x1 +#define IPPOF_MCAST_OPTS 0x1 +#ifdef PRIVATE +#define IPPOF_BOUND_IF 0x2 +#define IPPOF_NO_IFT_CELLULAR 0x4 +#define IPPOF_SHIFT_IFSCOPE 16 +#endif /* PRIVATE */ typedef struct ipf_pktopts *ipf_pktopts_t; diff --git a/bsd/netinet/raw_ip.c b/bsd/netinet/raw_ip.c index c03fde7d0..0b63a3c0d 100644 --- a/bsd/netinet/raw_ip.c +++ b/bsd/netinet/raw_ip.c @@ -200,7 +200,7 @@ rip_input(m, iphlen) register struct inpcb *inp; struct inpcb *last = 0; struct mbuf *opts = 0; - int skipit; + int skipit = 0, ret = 0; ripsrc.sin_addr = ip->ip_src; lck_rw_lock_shared(ripcbinfo.mtx); @@ -220,9 +220,9 @@ rip_input(m, iphlen) if (last) { struct mbuf *n = m_copy(m, 0, (int)M_COPYALL); + skipit = 0; #if IPSEC /* check AH/ESP integrity. */ - skipit = 0; if (ipsec_bypass == 0 && n) { if (ipsec4_in_reject_so(n, last->inp_socket)) { m_freem(n); @@ -235,27 +235,36 @@ rip_input(m, iphlen) #if CONFIG_MACF_NET if (n && skipit == 0) { if (mac_inpcb_check_deliver(last, n, AF_INET, - SOCK_RAW) != 0) + SOCK_RAW) != 0) { + m_freem(n); skipit = 1; + } } #endif if (n && skipit == 0) { int error = 0; - if (last->inp_flags & INP_CONTROLOPTS || - last->inp_socket->so_options & SO_TIMESTAMP) - ip_savecontrol(last, &opts, ip, n); + if ((last->inp_flags & INP_CONTROLOPTS) != 0 || + (last->inp_socket->so_options & SO_TIMESTAMP) != 0 || + (last->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + ret = ip_savecontrol(last, &opts, ip, n); + if (ret != 0) { + m_freem(n); + m_freem(opts); + last = inp; + continue; + } + } if (last->inp_flags & INP_STRIPHDR) { n->m_len -= iphlen; n->m_pkthdr.len -= iphlen; n->m_data += iphlen; } -// ###LOCK need to lock that socket? 
+ so_recv_data_stat(last->inp_socket, m, 0); if (sbappendaddr(&last->inp_socket->so_rcv, (struct sockaddr *)&ripsrc, n, opts, &error) != 0) { sorwakeup(last->inp_socket); - } - else { + } else { if (error) { /* should notify about lost packet */ kprintf("rip_input can't append to socket\n"); @@ -266,10 +275,10 @@ rip_input(m, iphlen) } last = inp; } - lck_rw_done(ripcbinfo.mtx); + + skipit = 0; #if IPSEC /* check AH/ESP integrity. */ - skipit = 0; if (ipsec_bypass == 0 && last) { if (ipsec4_in_reject_so(m, last->inp_socket)) { m_freem(m); @@ -282,20 +291,30 @@ rip_input(m, iphlen) #endif /*IPSEC*/ #if CONFIG_MACF_NET if (last && skipit == 0) { - if (mac_inpcb_check_deliver(last, m, AF_INET, SOCK_RAW) != 0) + if (mac_inpcb_check_deliver(last, m, AF_INET, SOCK_RAW) != 0) { skipit = 1; + m_freem(m); + } } #endif if (skipit == 0) { if (last) { - if (last->inp_flags & INP_CONTROLOPTS || - last->inp_socket->so_options & SO_TIMESTAMP) - ip_savecontrol(last, &opts, ip, m); + if ((last->inp_flags & INP_CONTROLOPTS) != 0 || + (last->inp_socket->so_options & SO_TIMESTAMP) != 0 || + (last->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + ret = ip_savecontrol(last, &opts, ip, m); + if (ret != 0) { + m_freem(m); + m_freem(opts); + goto unlock; + } + } if (last->inp_flags & INP_STRIPHDR) { m->m_len -= iphlen; m->m_pkthdr.len -= iphlen; m->m_data += iphlen; } + so_recv_data_stat(last->inp_socket, m, 0); if (sbappendaddr(&last->inp_socket->so_rcv, (struct sockaddr *)&ripsrc, m, opts, NULL) != 0) { sorwakeup(last->inp_socket); @@ -308,6 +327,12 @@ rip_input(m, iphlen) OSAddAtomic(-1, &ipstat.ips_delivered); } } +unlock: + /* + * Keep the list locked because socket filter may force the socket lock + * to be released when calling sbappendaddr() -- see rdar://7627704 + */ + lck_rw_done(ripcbinfo.mtx); } /* @@ -325,21 +350,19 @@ rip_output( register struct inpcb *inp = sotoinpcb(so); int flags = (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST; struct ip_out_args ipoa; + 
struct ip_moptions *imo; int error = 0; -#if PKT_PRIORITY - mbuf_traffic_class_t mtc = MBUF_TC_NONE; -#endif /* PKT_PRIORITY */ + mbuf_traffic_class_t mtc = MBUF_TC_UNSPEC; if (control != NULL) { -#if PKT_PRIORITY mtc = mbuf_traffic_class_from_control(control); -#endif /* PKT_PRIORITY */ m_freem(control); } /* If socket was bound to an ifindex, tell ip_output about it */ - ipoa.ipoa_ifscope = (inp->inp_flags & INP_BOUND_IF) ? + ipoa.ipoa_boundif = (inp->inp_flags & INP_BOUND_IF) ? inp->inp_boundif : IFSCOPE_NONE; + ipoa.ipoa_nocell = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; flags |= IP_OUTARGS; /* @@ -401,35 +424,52 @@ rip_output( inp->inp_route.ro_rt = NULL; } -#if PKT_PRIORITY - set_traffic_class(m, so, mtc); -#endif /* PKT_PRIORITY */ + set_packet_tclass(m, so, mtc, 0); #if CONFIG_MACF_NET mac_mbuf_label_associate_inpcb(inp, m); #endif + imo = inp->inp_moptions; + if (imo != NULL) + IMO_ADDREF(imo); /* * The domain lock is held across ip_output, so it is okay * to pass the PCB cached route pointer directly to IP and * the modules beneath it. */ error = ip_output(m, inp->inp_options, &inp->inp_route, flags, - inp->inp_moptions, &ipoa); + imo, &ipoa); -#if IFNET_ROUTE_REFCNT - /* - * Always discard the cached route for unconnected socket - * or if it is a non-unicast route. - */ - if (inp->inp_route.ro_rt != NULL && - ((inp->inp_route.ro_rt->rt_flags & (RTF_MULTICAST|RTF_BROADCAST)) || - inp->inp_socket == NULL || - inp->inp_socket->so_state != SS_ISCONNECTED)) { - rtfree(inp->inp_route.ro_rt); - inp->inp_route.ro_rt = NULL; + if (imo != NULL) + IMO_REMREF(imo); + + if (inp->inp_route.ro_rt != NULL) { + struct rtentry *rt = inp->inp_route.ro_rt; + unsigned int outif; + + if ((rt->rt_flags & (RTF_MULTICAST|RTF_BROADCAST)) || + inp->inp_socket == NULL || + !(inp->inp_socket->so_state & SS_ISCONNECTED)) { + rt = NULL; /* unusable */ + } + /* + * Always discard the cached route for unconnected + * socket or if it is a multicast route. 
+ */ + if (rt == NULL) { + rtfree(inp->inp_route.ro_rt); + inp->inp_route.ro_rt = NULL; + } + /* + * If this is a connected socket and the destination + * route is unicast, update outif with that of the route + * interface index used by IP. + */ + if (rt != NULL && + (outif = rt->rt_ifp->if_index) != inp->inp_last_outif) + inp->inp_last_outif = outif; } -#endif /* IFNET_ROUTE_REFCNT */ return (error); } @@ -642,10 +682,12 @@ rip_ctlinput( lck_rw_lock_shared(in_ifaddr_rwlock); for (ia = in_ifaddrhead.tqh_first; ia; ia = ia->ia_link.tqe_next) { - if (ia->ia_ifa.ifa_addr == sa - && (ia->ia_flags & IFA_ROUTE)) { + IFA_LOCK(&ia->ia_ifa); + if (ia->ia_ifa.ifa_addr == sa && + (ia->ia_flags & IFA_ROUTE)) { done = 1; - ifaref(&ia->ia_ifa); + IFA_ADDREF_LOCKED(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); lck_mtx_lock(rnh_lock); /* @@ -660,9 +702,10 @@ rip_ctlinput( */ in_ifadown(&ia->ia_ifa, 1); lck_mtx_unlock(rnh_lock); - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); break; } + IFA_UNLOCK(&ia->ia_ifa); } if (!done) lck_rw_done(in_ifaddr_rwlock); @@ -672,14 +715,22 @@ rip_ctlinput( lck_rw_lock_shared(in_ifaddr_rwlock); for (ia = in_ifaddrhead.tqh_first; ia; ia = ia->ia_link.tqe_next) { - if (ia->ia_ifa.ifa_addr == sa) + IFA_LOCK(&ia->ia_ifa); + if (ia->ia_ifa.ifa_addr == sa) { + /* keep it locked */ break; + } + IFA_UNLOCK(&ia->ia_ifa); } - if (ia == 0 || (ia->ia_flags & IFA_ROUTE)) { + if (ia == NULL || (ia->ia_flags & IFA_ROUTE) || + (ia->ia_ifa.ifa_debug & IFD_NOTREADY)) { + if (ia != NULL) + IFA_UNLOCK(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); return; } - ifaref(&ia->ia_ifa); + IFA_ADDREF_LOCKED(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); flags = RTF_UP; @@ -690,9 +741,12 @@ rip_ctlinput( flags |= RTF_HOST; err = rtinit(&ia->ia_ifa, RTM_ADD, flags); - if (err == 0) + if (err == 0) { + IFA_LOCK_SPIN(&ia->ia_ifa); ia->ia_flags |= IFA_ROUTE; - ifafree(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); + } + 
IFA_REMREF(&ia->ia_ifa); break; } } @@ -700,9 +754,9 @@ rip_ctlinput( u_int32_t rip_sendspace = RIPSNDQ; u_int32_t rip_recvspace = RIPRCVQ; -SYSCTL_INT(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW, +SYSCTL_INT(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW | CTLFLAG_LOCKED, &rip_sendspace, 0, "Maximum outgoing raw IP datagram size"); -SYSCTL_INT(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW, +SYSCTL_INT(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW | CTLFLAG_LOCKED, &rip_recvspace, 0, "Maximum incoming raw IP datagram size"); static int @@ -770,6 +824,7 @@ rip_bind(struct socket *so, struct sockaddr *nam, __unused struct proc *p) struct inpcb *inp = sotoinpcb(so); struct sockaddr_in *addr = (struct sockaddr_in *)nam; struct ifaddr *ifa = NULL; + unsigned int outif = 0; if (nam->sa_len != sizeof(*addr)) return EINVAL; @@ -781,10 +836,13 @@ rip_bind(struct socket *so, struct sockaddr *nam, __unused struct proc *p) return EADDRNOTAVAIL; } else if (ifa) { - ifafree(ifa); - ifa = NULL; + IFA_LOCK(ifa); + outif = ifa->ifa_ifp->if_index; + IFA_UNLOCK(ifa); + IFA_REMREF(ifa); } inp->inp_laddr = addr->sin_addr; + inp->inp_last_outif = outif; return 0; } @@ -815,7 +873,7 @@ rip_shutdown(struct socket *so) __private_extern__ int rip_send(struct socket *so, __unused int flags, struct mbuf *m, struct sockaddr *nam, - __unused struct mbuf *control, __unused struct proc *p) + struct mbuf *control, __unused struct proc *p) { struct inpcb *inp = sotoinpcb(so); register u_int32_t dst; @@ -979,7 +1037,7 @@ rip_pcblist SYSCTL_HANDLER_ARGS return error; } -SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD, 0, 0, +SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, rip_pcblist, "S,xinpcb", "List of active raw IP sockets"); #if !CONFIG_EMBEDDED @@ -1082,11 +1140,26 @@ rip_pcblist64 SYSCTL_HANDLER_ARGS return error; } -SYSCTL_PROC(_net_inet_raw, OID_AUTO, pcblist64, CTLFLAG_RD, 0, 0, +SYSCTL_PROC(_net_inet_raw, OID_AUTO, pcblist64, 
CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, rip_pcblist64, "S,xinpcb64", "List of active raw IP sockets"); #endif /* !CONFIG_EMBEDDED */ + +static int +rip_pcblist_n SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + int error = 0; + + error = get_pcblist_n(IPPROTO_IP, req, &ripcbinfo); + + return error; +} + +SYSCTL_PROC(_net_inet_raw, OID_AUTO, pcblist_n, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, + rip_pcblist_n, "S,xinpcb_n", "List of active raw IP sockets"); + struct pr_usrreqs rip_usrreqs = { rip_abort, pru_accept_notsupp, rip_attach, rip_bind, rip_connect, pru_connect2_notsupp, in_control, rip_detach, rip_disconnect, diff --git a/bsd/netinet/tcp.h b/bsd/netinet/tcp.h index 3b4d8f92f..a3a183bfe 100644 --- a/bsd/netinet/tcp.h +++ b/bsd/netinet/tcp.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -185,8 +185,6 @@ struct tcphdr { #define TCP_MAX_WINSHIFT 14 /* maximum window shift */ -#define TCP_MAXBURST 4 /* maximum segments in a burst */ - #define TCP_MAXHLEN (0xf<<2) /* max length of header in bytes */ #define TCP_MAXOLEN (TCP_MAXHLEN - sizeof(struct tcphdr)) /* max space left for options */ @@ -202,6 +200,90 @@ struct tcphdr { #define TCP_NOOPT 0x08 /* don't use TCP options */ #define TCP_KEEPALIVE 0x10 /* idle time used when SO_KEEPALIVE is enabled */ #define TCP_CONNECTIONTIMEOUT 0x20 /* connection timeout */ +#define PERSIST_TIMEOUT 0x40 /* time after which a connection in + * persist timeout will terminate. + * see draft-ananth-tcpm-persist-02.txt + */ +#define TCP_RXT_CONNDROPTIME 0x80 /* time after which tcp retransmissions will be + * stopped and the connection will be dropped + */ +#define TCP_RXT_FINDROP 0x100 /* when this option is set, drop a connection + * after retransmitting the FIN 3 times. It will + * prevent holding too many mbufs in socket + * buffer queues. 
+ */ +#ifdef PRIVATE +#define TCP_INFO 0x200 /* retrieve tcp_info structure */ + +/* + * The TCP_INFO socket option comes from the Linux 2.6 TCP API, and permits + * the caller to query certain information about the state of a TCP + * connection. We provide an overlapping set of fields with the Linux + * implementation, but since this is a fixed size structure, room has been + * left for growth. In order to maximize potential future compatibility with + * the Linux API, the same variable names and order have been adopted, and + * padding left to make room for omitted fields in case they are added later. + * + * XXX: This is currently an unstable ABI/API, in that it is expected to + * change. + */ +#pragma pack(4) + +#define TCPI_OPT_TIMESTAMPS 0x01 +#define TCPI_OPT_SACK 0x02 +#define TCPI_OPT_WSCALE 0x04 +#define TCPI_OPT_ECN 0x08 + +struct tcp_info { + u_int8_t tcpi_state; /* TCP FSM state. */ + u_int8_t tcpi_options; /* Options enabled on conn. */ + u_int8_t tcpi_snd_wscale; /* RFC1323 send shift value. */ + u_int8_t tcpi_rcv_wscale; /* RFC1323 recv shift value. */ + + u_int32_t tcpi_snd_mss; /* Max segment size for send. */ + u_int32_t tcpi_rcv_mss; /* Max segment size for receive. */ + + u_int32_t tcpi_snd_ssthresh; /* Slow start threshold. */ + u_int32_t tcpi_snd_cwnd; /* Send congestion window. */ + + u_int32_t tcpi_rcv_space; /* Advertised recv window. */ + + u_int32_t tcpi_snd_wnd; /* Advertised send window. */ + u_int32_t tcpi_snd_bwnd; /* Bandwidth send window. 
*/ + u_int32_t tcpi_snd_nxt; /* Next egress seqno */ + u_int32_t tcpi_rcv_nxt; /* Next ingress seqno */ + + int32_t tcpi_last_outif; /* if_index of interface used to send last */ +}; + +/* + * Note that IPv6 link local addresses should have the appropriate scope ID + */ + +struct info_tuple { + u_int8_t itpl_proto; + union { + struct sockaddr _itpl_sa; + struct sockaddr_in _itpl_sin; + struct sockaddr_in6 _itpl_sin6; + } itpl_localaddr; + union { + struct sockaddr _itpl_sa; + struct sockaddr_in _itpl_sin; + struct sockaddr_in6 _itpl_sin6; + } itpl_remoteaddr; +}; + +#define itpl_local_sa itpl_localaddr._itpl_sa +#define itpl_local_sin itpl_localaddr._itpl_sin +#define itpl_local_sin6 itpl_localaddr._itpl_sin6 +#define itpl_remote_sa itpl_remoteaddr._itpl_sa +#define itpl_remote_sin itpl_remoteaddr._itpl_sin +#define itpl_remote_sin6 itpl_remoteaddr._itpl_sin6 + +#pragma pack() + +#endif /* PRIVATE */ #endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ #endif diff --git a/bsd/netinet/tcp_cc.h b/bsd/netinet/tcp_cc.h new file mode 100644 index 000000000..c78ba3531 --- /dev/null +++ b/bsd/netinet/tcp_cc.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/*- + * Copyright (c) 2008 Swinburne University of Technology, Melbourne, Australia + * All rights reserved. + * + * This software was developed at the Centre for Advanced Internet + * Architectures, Swinburne University, by Lawrence Stewart and James Healy, + * made possible in part by a grant from the Cisco University Research Program + * Fund at Community Foundation Silicon Valley. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NETINET_CC_H_ +#define _NETINET_CC_H_ + +#ifdef KERNEL + +#include + +#define TCP_CC_ALGO_NEWRENO_INDEX 0 /* default congestion control algorithm */ +#define TCP_CC_ALGO_BACKGROUND_INDEX 1 /* congestion control for background transport */ +#define TCP_CC_ALGO_COUNT 2 /* Count of CC algorithms defined */ + +#define TCP_CA_NAME_MAX 16 /* Maximum characters in the name of a CC algorithm */ + +/* + * Structure to hold the definitions of various actions defined by a congestion control + * algorithm for TCP. This can be used to change the congestion control on a + * connection based on the user settings of priority of a connection.
+ */ +struct tcp_cc_algo { + char name[TCP_CA_NAME_MAX]; + uint32_t num_sockets; + uint32_t flags; + + /* init the congestion algorithm for the specified control block */ + int (*init) (struct tcpcb *tp); + + /* cleanup any state that is stored in the connection related to the algorithm */ + int (*cleanup) (struct tcpcb *tp); + + /* initialize cwnd at the start of a connection */ + void (*cwnd_init) (struct tcpcb *tp); + + /* called on the receipt of in-sequence ack during congestion avoidance phase */ + void (*inseq_ack_rcvd) (struct tcpcb *tp, struct tcphdr *th); + + /* called on the receipt of a valid ack */ + void (*ack_rcvd) (struct tcpcb *tp, struct tcphdr *th); + + /* called before entering FR */ + void (*pre_fr) (struct tcpcb *tp, struct tcphdr *th); + + /* after exiting FR */ + void (*post_fr) (struct tcpcb *tp, struct tcphdr *th); + + /* perform tasks when data transfer resumes after an idle period */ + void (*after_idle) (struct tcpcb *tp); + + /* perform tasks when the connection's retransmit timer expires */ + void (*after_timeout) (struct tcpcb *tp); + + /* Whether or not to delay the ack */ + int (*delay_ack)(struct tcpcb *tp, struct tcphdr *th); + + /* Switch a connection to this CC algorithm after sending some packets */ + void (*switch_to)(struct tcpcb *tp, uint16_t old_cc_index); + +} __attribute__((aligned(4))); + +extern struct tcp_cc_algo* tcp_cc_algo_list[TCP_CC_ALGO_COUNT]; + +#define CC_ALGO(tp) (tcp_cc_algo_list[tp->tcp_cc_index]) + +#endif /* KERNEL */ +#endif /* _NETINET_CC_H_ */ diff --git a/bsd/netinet/tcp_debug.c b/bsd/netinet/tcp_debug.c index 58b1141b2..8ba9eb6af 100644 --- a/bsd/netinet/tcp_debug.c +++ b/bsd/netinet/tcp_debug.c @@ -96,7 +96,7 @@ #if TCPDEBUG __private_extern__ int tcpconsdebug = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcpconsdebug, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcpconsdebug, CTLFLAG_RW | CTLFLAG_LOCKED, &tcpconsdebug, 0, "Turn tcp debugging on or off"); #endif diff --git a/bsd/netinet/tcp_input.c 
b/bsd/netinet/tcp_input.c index b65e9d5c6..6f06b2b14 100644 --- a/bsd/netinet/tcp_input.c +++ b/bsd/netinet/tcp_input.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -86,6 +86,7 @@ #include #include #include +#include #include #include @@ -95,6 +96,7 @@ #include /* for ICMP_BANDLIM */ #include #include +#include #if INET6 #include #include @@ -107,6 +109,8 @@ #include #include #include +#include +#include #if INET6 #include #endif @@ -131,10 +135,6 @@ struct tcphdr tcp_savetcp; #include -#ifndef __APPLE__ -MALLOC_DEFINE(M_TSEGQ, "tseg_qent", "TCP segment queue entry"); -#endif - #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETTCP, 0) #define DBG_LAYER_END NETDBG_CODE(DBG_NETTCP, 2) #define DBG_FNC_TCP_INPUT NETDBG_CODE(DBG_NETTCP, (3 << 8)) @@ -150,26 +150,31 @@ extern int ipsec_bypass; struct tcpstat tcpstat; static int log_in_vain = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW | CTLFLAG_LOCKED, &log_in_vain, 0, "Log all incoming TCP connections"); static int blackhole = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW | CTLFLAG_LOCKED, &blackhole, 0, "Do not send RST when dropping refused connections"); int tcp_delack_enabled = 3; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_delack_enabled, 0, "Delay ACK to try and piggyback it onto a data packet"); int tcp_lq_overflow = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_lq_overflow, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_lq_overflow, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_lq_overflow, 0, "Listen Queue Overflow"); +int tcp_recv_bg = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbg, CTLFLAG_RW | CTLFLAG_LOCKED, + &tcp_recv_bg, 0, 
+ "Receive background"); + #if TCP_DROP_SYNFIN static int drop_synfin = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW | CTLFLAG_LOCKED, &drop_synfin, 0, "Drop TCP packets with SYN+FIN set"); #endif @@ -177,59 +182,85 @@ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "TCP Segment Reassembly Queue"); __private_extern__ int tcp_reass_maxseg = 0; -SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_reass_maxseg, 0, "Global maximum number of TCP Segments in Reassembly Queue"); __private_extern__ int tcp_reass_qsize = 0; -SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_RD, +SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_reass_qsize, 0, "Global number of TCP Segments currently in Reassembly Queue"); static int tcp_reass_overflows = 0; -SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD, +SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_reass_overflows, 0, "Global number of TCP Segment Reassembly Queue Overflows"); __private_extern__ int slowlink_wsize = 8192; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowlink_wsize, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowlink_wsize, CTLFLAG_RW | CTLFLAG_LOCKED, &slowlink_wsize, 0, "Maximum advertised window size for slowlink"); -static int maxseg_unacked = 8; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, maxseg_unacked, CTLFLAG_RW, +int maxseg_unacked = 8; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, maxseg_unacked, CTLFLAG_RW | CTLFLAG_LOCKED, &maxseg_unacked, 0, "Maximum number of outstanding segments left unacked"); -static int tcp_do_rfc3465 = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW, +int tcp_do_rfc3465 = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_rfc3465, 0, ""); 
-static int tcp_do_rfc3465_lim2 = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465_lim2, CTLFLAG_RW, +int tcp_do_rfc3465_lim2 = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465_lim2, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_rfc3465_lim2, 0, "Appropriate bytes counting w/ L=2*SMSS"); +int rtt_samples_per_slot = 20; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, rtt_samples_per_slot, CTLFLAG_RW | CTLFLAG_LOCKED, + &rtt_samples_per_slot, 0, "Number of RTT samples stored for rtt history"); + +int tcp_allowed_iaj = ALLOWED_IAJ; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, recv_allowed_iaj, CTLFLAG_RW | CTLFLAG_LOCKED, + &tcp_allowed_iaj, 0, "Allowed inter-packet arrival jiter"); + +int tcp_acc_iaj_high_thresh = ACC_IAJ_HIGH_THRESH; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, acc_iaj_high_thresh, CTLFLAG_RW | CTLFLAG_LOCKED, + &tcp_acc_iaj_high_thresh, 0, "Used in calculating maximum accumulated IAJ"); + #if CONFIG_IFEF_NOWINDOWSCALE int tcp_obey_ifef_nowindowscale = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, obey_ifef_nowindowscale, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, obey_ifef_nowindowscale, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_obey_ifef_nowindowscale, 0, ""); #endif extern int tcp_TCPTV_MIN; +extern int tcp_acc_iaj_high; +extern int tcp_acc_iaj_react_limit; +extern struct zone *tcp_reass_zone; + u_int32_t tcp_now; +struct timeval tcp_uptime; /* uptime when tcp_now was last updated */ +lck_spin_t *tcp_uptime_lock; /* Used to synchronize updates to tcp_now */ struct inpcbhead tcb; #define tcb6 tcb /* for KAME src sync over BSD*'s */ struct inpcbinfo tcbinfo; -static void tcp_dooptions(struct tcpcb *, - u_char *, int, struct tcphdr *, struct tcpopt *, unsigned int); +static void tcp_dooptions(struct tcpcb *, u_char *, int, struct tcphdr *, + struct tcpopt *, unsigned int); static void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *); static void tcp_xmit_timer(struct tcpcb *, int); static inline
unsigned int tcp_maxmtu(struct rtentry *); +static inline int tcp_stretch_ack_enable(struct tcpcb *tp); + +#if TRAFFIC_MGT +static inline void update_iaj_state(struct tcpcb *tp, uint32_t tlen, int reset_size); +void compute_iaj(struct tcpcb *tp); +static inline void clear_iaj_state(struct tcpcb *tp); +#endif /* TRAFFIC_MGT */ + #if INET6 static inline unsigned int tcp_maxmtu6(struct rtentry *); #endif @@ -247,9 +278,7 @@ do { \ #define ND6_HINT(tp) #endif -extern u_int32_t *delack_bitmask; - -extern void add_to_time_wait(struct tcpcb *); +extern void add_to_time_wait(struct tcpcb *, uint32_t delay); extern void postevent(struct socket *, struct sockbuf *, int); extern void ipfwsyslog( int level, const char *format,...); @@ -269,37 +298,140 @@ __private_extern__ int tcp_win_scale; #define log_in_vain_log( a ) { log a; } #endif +int tcp_rcvunackwin = TCPTV_UNACKWIN; +int tcp_maxrcvidle = TCPTV_MAXRCVIDLE; +int tcp_rcvsspktcnt = TCP_RCV_SS_PKTCOUNT; -/* - * Indicate whether this ack should be delayed. - * We can delay the ack if: - * - delayed acks are enabled (set to 1) and - * - our last ack wasn't a 0-sized window. We never want to delay - * the ack that opens up a 0-sized window. - * - delayed acks are enabled (set to 2, "more compatible") and - * - our last ack wasn't a 0-sized window. - * - if the peer hasn't sent us a TH_PUSH data packet (this solves 3649245) - * - the peer hasn't sent us a TH_PUSH data packet, if he did, take this as a clue that we - * need to ACK with no delay. This helps higher level protocols who won't send - * us more data even if the window is open because their last "segment" hasn't been ACKed - * - delayed acks are enabled (set to 3, "streaming detection") and - * - if we receive more than "maxseg_unacked" full packets per second on this socket - * - if we don't have more than "maxseg_unacked" delayed so far - * - if those criteria aren't met, acts like "2". Allowing faster acking while browsing for example. 
- * - */ -#define DELAY_ACK(tp) \ - (((tcp_delack_enabled == 1) && ((tp->t_flags & TF_RXWIN0SENT) == 0)) || \ - (((tcp_delack_enabled == 2) && (tp->t_flags & TF_RXWIN0SENT) == 0) && \ - ((thflags & TH_PUSH) == 0) && ((tp->t_flags & TF_DELACK) == 0)) || \ - (((tcp_delack_enabled == 3) && (tp->t_flags & TF_RXWIN0SENT) == 0) && \ - (tp->t_rcvtime == 0) && ((thflags & TH_PUSH) == 0) && \ - (((tp->t_unacksegs == 0)) || \ - ((tp->rcv_byps > (maxseg_unacked * tp->t_maxseg)) && (tp->t_unacksegs < maxseg_unacked))))) +#define DELAY_ACK(tp, th) (CC_ALGO(tp)->delay_ack != NULL && CC_ALGO(tp)->delay_ack(tp, th)) static int tcp_dropdropablreq(struct socket *head); static void tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th); +static void update_base_rtt(struct tcpcb *tp, uint32_t rtt); +uint32_t get_base_rtt(struct tcpcb *tp); +void tcp_set_background_cc(struct socket *so); +void tcp_set_foreground_cc(struct socket *so); +static void tcp_set_new_cc(struct socket *so, uint16_t cc_index); + +#if TRAFFIC_MGT +void +reset_acc_iaj(struct tcpcb *tp) +{ + tp->acc_iaj = 0; + tp->iaj_rwintop = 0; + clear_iaj_state(tp); +} + +static inline void +update_iaj_state(struct tcpcb *tp, uint32_t size, int rst_size) +{ + if (rst_size > 0) + tp->iaj_size = 0; + if (tp->iaj_size == 0 || size >= tp->iaj_size) { + tp->iaj_size = size; + tp->iaj_rcv_ts = tcp_now; + tp->iaj_small_pkt = 0; + } +} + +static inline void +clear_iaj_state(struct tcpcb *tp) +{ + tp->iaj_rcv_ts = 0; +} + +/* For every 32 bit unsigned integer(v), this function will find the + * largest integer n such that (n*n <= v). This takes at most 16 iterations + * irrespective of the value of v and does not involve multiplications. 
+ */ +static inline int +isqrt(unsigned int val) { + unsigned int sqrt_cache[11] = {0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100}; + unsigned int temp, g=0, b=0x8000, bshft=15; + if ( val <= 100) { + for (g = 0; g <= 10; ++g) { + if (sqrt_cache[g] > val) { + g--; + break; + } else if (sqrt_cache[g] == val) { + break; + } + } + } else { + do { + temp = (((g << 1) + b) << (bshft--)); + if (val >= temp) { + g += b; + val -= temp; + } + b >>= 1; + } while ( b > 0 && val > 0); + } + return(g); +} + +void +compute_iaj(struct tcpcb *tp) +{ + /* When accumulated IAJ reaches MAX_ACC_IAJ in milliseconds, throttle the + * receive window to a minimum of MIN_IAJ_WIN packets + */ +#define MAX_ACC_IAJ (tcp_acc_iaj_high_thresh + tcp_acc_iaj_react_limit) + + uint32_t allowed_iaj, acc_iaj = 0; + uint32_t cur_iaj = tcp_now - tp->iaj_rcv_ts; + + uint32_t mean, temp; + int32_t cur_iaj_dev; + cur_iaj_dev = (cur_iaj - tp->avg_iaj); + + /* Allow a jitter of "allowed_iaj" milliseconds. Some connections may have a + * constant jitter more than that. We detect this by using + * standard deviation. + */ + allowed_iaj = tp->avg_iaj + tp->std_dev_iaj; + if (allowed_iaj < tcp_allowed_iaj) + allowed_iaj = tcp_allowed_iaj; + + /* Initially when the connection starts, the senders congestion window + * is small. During this period we avoid throttling a connection because + * we do not have a good starting point for allowed_iaj. IAJ_IGNORE_PKTCNT + * is used to quietly gloss over the first few packets. + */ + if (tp->iaj_pktcnt > IAJ_IGNORE_PKTCNT) { + if ( cur_iaj <= allowed_iaj ) { + if (tp->acc_iaj >= 2) + acc_iaj = tp->acc_iaj - 2; + else + acc_iaj = 0; + } else { + acc_iaj = tp->acc_iaj + (cur_iaj - allowed_iaj); + } + + if (acc_iaj > MAX_ACC_IAJ) + acc_iaj = MAX_ACC_IAJ; + tp->acc_iaj = acc_iaj; + } + + /* Compute weighted average where the history has a weight of + * 15 out of 16 and the current value has a weight of 1 out of 16. + * This will make the short-term measurements have more weight. 
+ */ + tp->avg_iaj = (((tp->avg_iaj << 4) - tp->avg_iaj) + cur_iaj) >> 4; + + /* Compute Root-mean-square of deviation where mean is a weighted + * average as described above + */ + temp = tp->std_dev_iaj * tp->std_dev_iaj; + mean = (((temp << 4) - temp) + (cur_iaj_dev * cur_iaj_dev)) >> 4; + + tp->std_dev_iaj = isqrt(mean); + + DTRACE_TCP3(iaj, struct tcpcb *, tp, uint32_t, cur_iaj, uint32_t, allowed_iaj); + + return; +} +#endif /* TRAFFIC_MGT */ static int tcp_reass(tp, th, tlenp, m) @@ -322,6 +454,27 @@ tcp_reass(tp, th, tlenp, m) */ if (th == NULL) goto present; + + /* If the reassembly queue already has entries or if we are going to add + * a new one, then the connection has reached a loss state. + * Reset the stretch-ack algorithm at this point. + */ + if ((tp->t_flags & TF_STRETCHACK) != 0) + tcp_reset_stretch_ack(tp); + + /* When the connection reaches a loss state, we need to send more acks + * for a period of time so that the sender's congestion window will + * open. Wait until we see some packets on the connection before + * stretching acks again. + */ + tp->t_flagsext |= TF_RCVUNACK_WAITSS; + tp->rcv_waitforss = 0; + + +#if TRAFFIC_MGT + if (tp->acc_iaj > 0) + reset_acc_iaj(tp); +#endif /* TRAFFIC_MGT */ /* * Limit the number of segments in the reassembly queue to prevent @@ -340,8 +493,7 @@ tcp_reass(tp, th, tlenp, m) } /* Allocate a new queue entry. If we can't, just drop the pkt. 
XXX */ - MALLOC(te, struct tseg_qent *, sizeof (struct tseg_qent), M_TSEGQ, - M_NOWAIT); + te = (struct tseg_qent *) zalloc_noblock(tcp_reass_zone); if (te == NULL) { tcpstat.tcps_rcvmemdrop++; m_freem(m); @@ -371,8 +523,14 @@ tcp_reass(tp, th, tlenp, m) if (i >= *tlenp) { tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += *tlenp; + if (nstat_collect) { + nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1, *tlenp, NSTAT_RX_FLAG_DUPLICATE); + locked_add_64(&tp->t_inpcb->inp_stat->rxpackets, 1); + locked_add_64(&tp->t_inpcb->inp_stat->rxbytes, *tlenp); + tp->t_stat.rxduplicatebytes += *tlenp; + } m_freem(m); - FREE(te, M_TSEGQ); + zfree(tcp_reass_zone, te); tcp_reass_qsize--; /* * Try to present any queued data @@ -389,6 +547,12 @@ tcp_reass(tp, th, tlenp, m) } tcpstat.tcps_rcvoopack++; tcpstat.tcps_rcvoobyte += *tlenp; + if (nstat_collect) { + nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1, *tlenp, NSTAT_RX_FLAG_OUT_OF_ORDER); + locked_add_64(&tp->t_inpcb->inp_stat->rxpackets, 1); + locked_add_64(&tp->t_inpcb->inp_stat->rxbytes, *tlenp); + tp->t_stat.rxoutoforderbytes += *tlenp; + } /* * While we overlap succeeding segments trim them or, @@ -408,7 +572,7 @@ tcp_reass(tp, th, tlenp, m) nq = LIST_NEXT(q, tqe_q); LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); - FREE(q, M_TSEGQ); + zfree(tcp_reass_zone, q); tcp_reass_qsize--; q = nq; } @@ -442,10 +606,11 @@ tcp_reass(tp, th, tlenp, m) if (so->so_state & SS_CANTRCVMORE) m_freem(q->tqe_m); else { + so_recv_data_stat(so, q->tqe_m, 0); /* XXXX */ if (sbappendstream(&so->so_rcv, q->tqe_m)) dowakeup = 1; } - FREE(q, M_TSEGQ); + zfree(tcp_reass_zone, q); tcp_reass_qsize--; q = nq; } while (q && q->tqe_th->th_seq == tp->rcv_nxt); @@ -480,15 +645,15 @@ tcp_reass(tp, th, tlenp, m) */ static void tcp_reduce_congestion_window( - struct tcpcb *tp) + struct tcpcb *tp, struct tcphdr *th) { - u_int win; - - win = min(tp->snd_wnd, tp->snd_cwnd) / - 2 / tp->t_maxseg; - if (win < 2) - win = 2; - tp->snd_ssthresh = win * tp->t_maxseg; + /* + * 
If the current tcp cc module has + * defined a hook for tasks to run + * before entering FR, call it + */ + if (CC_ALGO(tp)->pre_fr != NULL) + CC_ALGO(tp)->pre_fr(tp, th); ENTER_FASTRECOVERY(tp); tp->snd_recover = tp->snd_max; tp->t_timer[TCPT_REXMT] = 0; @@ -505,10 +670,9 @@ tcp_reduce_congestion_window( */ #if INET6 int -tcp6_input(mp, offp) - struct mbuf **mp; - int *offp; +tcp6_input(struct mbuf **mp, int *offp, int proto) { +#pragma unused(proto) register struct mbuf *m = *mp; struct in6_ifaddr *ia6; @@ -519,20 +683,71 @@ tcp6_input(mp, offp) * better place to put this in? */ ia6 = ip6_getdstifaddr(m); - if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { - struct ip6_hdr *ip6; + if (ia6 != NULL) { + IFA_LOCK_SPIN(&ia6->ia_ifa); + if (ia6->ia6_flags & IN6_IFF_ANYCAST) { + struct ip6_hdr *ip6; - ip6 = mtod(m, struct ip6_hdr *); - icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, + IFA_UNLOCK(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); + ip6 = mtod(m, struct ip6_hdr *); + icmp6_error(m, ICMP6_DST_UNREACH, + ICMP6_DST_UNREACH_ADDR, (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); - return IPPROTO_DONE; + return (IPPROTO_DONE); + } + IFA_UNLOCK(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); } tcp_input(m, *offp); - return IPPROTO_DONE; + return (IPPROTO_DONE); } #endif +/* A receiver will evaluate the flow of packets on a connection + * to see if it can reduce ack traffic. The receiver will start + * stretching acks if all of the following conditions are met: + * 1. tcp_delack_enabled is set to 3 + * 2. If the bytes received in the last 100ms is greater than a threshold + * defined by maxseg_unacked + * 3. If the connection has not been idle for tcp_maxrcvidle period. + * 4. If the connection has seen enough packets to let the slow-start + * finish after connection establishment or after some packet loss. + * + * The receiver will stop stretching acks if there is congestion/reordering + * as indicated by packets on reassembly queue or an ECN. 
If the delayed-ack + * timer fires while stretching acks, it means that the packet flow has gone + * below the threshold defined by maxseg_unacked and the receiver will stop + * stretching acks. The receiver gets no indication when slow-start is completed + * or when the connection reaches an idle state. That is why we use + * tcp_rcvsspktcnt to cover slow-start and tcp_maxrcvidle to identify idle + * state. + */ + static inline int + tcp_stretch_ack_enable(struct tcpcb *tp) { + if (tp->rcv_by_unackwin >= (maxseg_unacked * tp->t_maxseg) && + TSTMP_GT(tp->rcv_unackwin + tcp_maxrcvidle, tcp_now) && + (((tp->t_flagsext & TF_RCVUNACK_WAITSS) == 0) || + (tp->rcv_waitforss >= tcp_rcvsspktcnt))) { + return(1); + } + return(0); +} + +/* Reset the state related to stretch-ack algorithm. This will make + * the receiver generate an ack every other packet. The receiver + * will start re-evaluating the rate at which packets come to decide + * if it can benefit by lowering the ack traffic. + */ +void +tcp_reset_stretch_ack(struct tcpcb *tp) +{ + tp->t_flags &= ~(TF_STRETCHACK); + tp->rcv_by_unackwin = 0; + tp->rcv_unackwin = tcp_now + tcp_rcvunackwin; +} + void tcp_input(m, off0) struct mbuf *m; @@ -565,7 +780,8 @@ tcp_input(m, off0) #endif struct m_tag *fwd_tag; u_char ip_ecn = IPTOS_ECN_NOTECT; - unsigned int ifscope; + unsigned int ifscope, nocell = 0; + uint8_t isconnected, isdisconnected; /* * Record the interface where this segment arrived on; this does not @@ -579,6 +795,11 @@ tcp_input(m, off0) else ifscope = IFSCOPE_NONE; + /* Since this is an entry point for input processing of tcp packets, we + * can update the tcp clock here. + */ + calculate_tcp_clock(); + /* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. 
*/ if (!SLIST_EMPTY(&m->m_pkthdr.tags)) { fwd_tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, @@ -616,12 +837,29 @@ tcp_input(m, off0) /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */ ip6 = mtod(m, struct ip6_hdr *); tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; - if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { - tcpstat.tcps_rcvbadsum++; - goto dropnosock; - } th = (struct tcphdr *)((caddr_t)ip6 + off0); + if ((apple_hwcksum_rx != 0) && (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) { + if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) + th->th_sum = m->m_pkthdr.csum_data; + else + th->th_sum = in6_cksum_phdr(&ip6->ip6_src, + &ip6->ip6_dst, htonl(sizeof(struct tcphdr)), + htonl(IPPROTO_TCP)); + + th->th_sum ^= 0xffff; + if (th->th_sum) { + tcpstat.tcps_rcvbadsum++; + goto dropnosock; + } + } + else { + if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { + tcpstat.tcps_rcvbadsum++; + goto dropnosock; + } + } + KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport), (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])), th->th_seq, th->th_ack, th->th_win); @@ -637,6 +875,11 @@ tcp_input(m, off0) /* XXX stat */ goto dropnosock; } + DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL, + struct ip6_hdr *, ip6, struct tcpcb *, NULL, + struct tcphdr *, th); + + ip_ecn = (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK; } else #endif /* INET6 */ { @@ -662,6 +905,9 @@ tcp_input(m, off0) th = (struct tcphdr *)((caddr_t)ip + off0); tlen = ip->ip_len; + DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL, + struct ip *, ip, struct tcpcb *, NULL, struct tcphdr *, th); + KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport), (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)), th->th_seq, th->th_ack, th->th_win); @@ -821,6 +1067,10 @@ tcp_input(m, off0) * Locate pcb for segment.
*/ findpcb: + + isconnected = FALSE; + isdisconnected = FALSE; + #if IPFIREWALL_FORWARD if (next_hop != NULL #if INET6 @@ -871,6 +1121,13 @@ tcp_input(m, off0) */ if (inp != NULL && (inp->inp_flags & INP_BOUND_IF)) ifscope = inp->inp_boundif; + /* + * If the PCB is present and the socket isn't allowed to use + * the cellular interface, indicate it as such for tcp_respond. + */ + if (inp != NULL && (inp->inp_flags & INP_NO_IFT_CELLULAR)) + nocell = 1; + #if IPSEC if (ipsec_bypass == 0) { #if INET6 @@ -981,7 +1238,7 @@ tcp_input(m, off0) goto dropnosock; } - tcp_lock(so, 1, (void *)2); + tcp_lock(so, 1, 0); if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { tcp_unlock(so, 1, (void *)2); inp = NULL; // pretend we didn't find it @@ -1034,10 +1291,13 @@ tcp_input(m, off0) struct inpcb *oinp = sotoinpcb(so); #endif /* INET6 */ unsigned int head_ifscope; + unsigned int head_nocell; /* Get listener's bound-to-interface, if any */ head_ifscope = (inp->inp_flags & INP_BOUND_IF) ? inp->inp_boundif : IFSCOPE_NONE; + /* Get listener's no-cellular information, if any */ + head_nocell = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; /* * If the state is LISTEN then ignore segment if it contains an RST. @@ -1130,11 +1390,18 @@ tcp_input(m, off0) if (isipv6 && !ip6_use_deprecated) { struct in6_ifaddr *ia6; - if ((ia6 = ip6_getdstifaddr(m)) && - (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { - tp = NULL; - rstreason = BANDLIM_RST_OPENPORT; - goto dropwithreset; + ia6 = ip6_getdstifaddr(m); + if (ia6 != NULL) { + IFA_LOCK_SPIN(&ia6->ia_ifa); + if (ia6->ia6_flags & IN6_IFF_DEPRECATED) { + IFA_UNLOCK(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); + tp = NULL; + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } + IFA_UNLOCK(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); } } #endif @@ -1203,14 +1470,16 @@ tcp_input(m, off0) * can only be set to a non-zero value earlier if * the listener has such a flag set. 
*/ -#if INET6 - if (head_ifscope != IFSCOPE_NONE && !isipv6) { -#else if (head_ifscope != IFSCOPE_NONE) { -#endif /* INET6 */ inp->inp_flags |= INP_BOUND_IF; inp->inp_boundif = head_ifscope; } + /* + * Inherit INP_NO_IFT_CELLULAR from listener. + */ + if (head_nocell) { + inp->inp_flags |= INP_NO_IFT_CELLULAR; + } #if INET6 if (isipv6) inp->in6p_laddr = ip6->ip6_dst; @@ -1277,8 +1546,11 @@ tcp_input(m, off0) } #endif /* inherit states from the listener */ + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_LISTEN); tp->t_state = TCPS_LISTEN; tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT|TF_NODELAY); + tp->t_flagsext |= (tp0->t_flagsext & TF_RXTFINDROP); tp->t_keepinit = tp0->t_keepinit; tp->t_inpcb->inp_ip_ttl = tp0->t_inpcb->inp_ip_ttl; @@ -1300,9 +1572,8 @@ tcp_input(m, off0) KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_END,0,0,0,0,0); } } -#if 1 - lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); -#endif + lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); + /* * Radar 3529618 * This is the second part of the MSS DoS prevention code (after @@ -1329,13 +1600,9 @@ tcp_input(m, off0) * this check. * * Account for packet if payload packet, skip over ACK, etc. - * - * The packet per second count is done all the time and is also used - * by "DELAY_ACK" to detect streaming situations. - * */ if (tp->t_state == TCPS_ESTABLISHED && tlen > 0) { - if (tp->rcv_reset > tcp_now) { + if (TSTMP_GT(tp->rcv_reset, tcp_now)) { tp->rcv_pps++; tp->rcv_byps += tlen + off; if (tp->rcv_byps > tp->rcv_maxbyps) @@ -1369,25 +1636,30 @@ tcp_input(m, off0) tp->rcv_pps = 1; tp->rcv_byps = tlen + off; } + + /* Evaluate the rate of arrival of packets to see if the + * receiver can reduce the ack traffic. The algorithm to + * stretch acks will be enabled if the connection meets + * certain criteria defined in tcp_stretch_ack_enable function. 
+ */ + if ((tp->t_flagsext & TF_RCVUNACK_WAITSS) != 0) { + tp->rcv_waitforss++; + } + if (tcp_stretch_ack_enable(tp)) { + tp->t_flags |= TF_STRETCHACK; + tp->t_flagsext &= ~(TF_RCVUNACK_WAITSS); + tp->rcv_waitforss = 0; + } else { + tp->t_flags &= ~(TF_STRETCHACK); + } + if (TSTMP_GT(tp->rcv_unackwin, tcp_now)) { + tp->rcv_by_unackwin += (tlen + off); + } else { + tp->rcv_unackwin = tcp_now + tcp_rcvunackwin; + tp->rcv_by_unackwin = tlen + off; + } } -#if TRAFFIC_MGT - if (so->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BG_REGULATE) { - tcpstat.tcps_bg_rcvtotal++; - - /* Take snapshots of pkts recv; - * tcpcb should have been initialized to 0 when allocated, - * so if 0 then this is the first time we're doing this - */ - if (!tp->tot_recv_snapshot) { - tp->tot_recv_snapshot = tcpstat.tcps_rcvtotal; - } - if (!tp->bg_recv_snapshot) { - tp->bg_recv_snapshot = tcpstat.tcps_bg_rcvtotal; - } - } -#endif /* TRAFFIC_MGT */ - /* Explicit Congestion Notification - Flag that we need to send ECT if + The IP Congestion experienced flag was set. @@ -1413,14 +1685,22 @@ tcp_input(m, off0) if ((thflags & TH_CWR) == TH_CWR) { tp->ecn_flags &= ~TE_SENDECE; } + + /* If we received an explicit notification of congestion in + * ip tos ecn bits or by the CWR bit in TCP header flags, reset + * the ack-stretching state. + */ + if (tp->t_state == TCPS_ESTABLISHED && (tp->t_flags & TF_STRETCHACK) != 0 && + ((ip_ecn == IPTOS_ECN_CE) || ((thflags & TH_CWR) == TH_CWR))) + tcp_reset_stretch_ack(tp); /* * Segment received on connection. * Reset idle time and keep-alive timer. */ - tp->t_rcvtime = 0; + tp->t_rcvtime = tcp_now; if (TCPS_HAVEESTABLISHED(tp->t_state)) - tp->t_timer[TCPT_KEEP] = TCP_KEEPIDLE(tp); + tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, TCP_KEEPIDLE(tp)); /* * Process options if not in LISTEN state, @@ -1451,6 +1731,64 @@ tcp_input(m, off0) } } +#if TRAFFIC_MGT + /* Compute inter-packet arrival jitter.
According to RFC 3550, inter-packet + * arrival jitter is defined as the difference in packet spacing at the + * receiver compared to the sender for a pair of packets. When two packets + * of maximum segment size come one after the other with consecutive + * sequence numbers, we consider them as packets sent together at the + * sender and use them as a pair to compute inter-packet arrival jitter. + * This metric indicates the delay induced by the network components due + * to queuing in edge/access routers. + */ + if (tp->t_state == TCPS_ESTABLISHED && + (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK|TH_ECE|TH_PUSH)) == TH_ACK && + ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && + ((to.to_flags & TOF_TS) == 0 || + TSTMP_GEQ(to.to_tsval, tp->ts_recent)) && + th->th_seq == tp->rcv_nxt && + LIST_EMPTY(&tp->t_segq)) { + if (tp->iaj_pktcnt <= IAJ_IGNORE_PKTCNT) { + tp->iaj_pktcnt++; + } + + if ( tp->iaj_size == 0 || tlen > tp->iaj_size || + (tlen == tp->iaj_size && tp->iaj_rcv_ts == 0)) { + /* State related to inter-arrival jitter is uninitialized + * or we are trying to find a good first packet to start + * computing the metric + */ + update_iaj_state(tp, tlen, 0); + } else { + if (tlen == tp->iaj_size) { + /* Compute inter-arrival jitter taking this packet + * as the second packet + */ + compute_iaj(tp); + } + if (tlen < tp->iaj_size) { + /* There is a smaller packet in the stream. + * Some times the maximum size supported on a path can + * change if there is a new link with smaller MTU. + * The receiver will not know about this change. + * If there are too many packets smaller than iaj_size, + * we try to learn the iaj_size again. 
+ */ + tp->iaj_small_pkt++; + if (tp->iaj_small_pkt > RESET_IAJ_SIZE_THRESH) { + update_iaj_state(tp, tlen, 1); + } else { + clear_iaj_state(tp); + } + } else { + update_iaj_state(tp, tlen, 0); + } + } + } else { + clear_iaj_state(tp); + } +#endif /* TRAFFIC_MGT */ + /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has @@ -1498,11 +1836,10 @@ tcp_input(m, off0) if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_ssthresh && - ((!tcp_do_newreno && !tp->sack_enable && - tp->t_dupacks < tcprexmtthresh) || - ((tcp_do_newreno || tp->sack_enable) && - !IN_FASTRECOVERY(tp) && to.to_nsacks == 0 && - TAILQ_EMPTY(&tp->snd_holes)))) { + (!IN_FASTRECOVERY(tp) && + ((!tp->sack_enable && tp->t_dupacks < tcprexmtthresh) || + (tp->sack_enable && to.to_nsacks == 0 && + TAILQ_EMPTY(&tp->snd_holes))))) { /* * this is a pure ack for outstanding data. */ @@ -1511,7 +1848,7 @@ tcp_input(m, off0) * "bad retransmit" recovery */ if (tp->t_rxtshift == 1 && - tcp_now < tp->t_badrxtwin) { + TSTMP_LT(tcp_now, tp->t_badrxtwin)) { ++tcpstat.tcps_sndrexmitbad; tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = @@ -1521,6 +1858,11 @@ tcp_input(m, off0) ENTER_FASTRECOVERY(tp); tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; + tp->t_rxtshift = 0; + tp->rxt_start = 0; + DTRACE_TCP5(cc, void, NULL, struct inpcb *, tp->t_inpcb, + struct tcpcb *, tp, struct tcphdr *, th, + int32_t, TCP_CC_BAD_REXMT_RECOVERY); } /* * Recalculate the transmit timer / rtt. @@ -1530,33 +1872,29 @@ tcp_input(m, off0) * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. 
*/ - if (((to.to_flags & TOF_TS) != 0) && (to.to_tsecr != 0)) { /* Makes sure we already have a TS */ - if (!tp->t_rttlow || - tp->t_rttlow > tcp_now - to.to_tsecr) - tp->t_rttlow = tcp_now - to.to_tsecr; + if (((to.to_flags & TOF_TS) != 0) && (to.to_tsecr != 0) && + TSTMP_GEQ(tcp_now, to.to_tsecr)) { tcp_xmit_timer(tp, tcp_now - to.to_tsecr); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { - if (!tp->t_rttlow || - tp->t_rttlow > tcp_now - tp->t_rtttime) - tp->t_rttlow = tcp_now - tp->t_rtttime; - tcp_xmit_timer(tp, tp->t_rtttime); + tcp_xmit_timer(tp, tcp_now - tp->t_rtttime); } acked = th->th_ack - tp->snd_una; tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; - /* - * Grow the congestion window, if the - * connection is cwnd bound. + + /* Handle an ack that is in sequence during congestion + * avoidance phase. The calculations in this function + * assume that snd_una is not updated yet. */ - if (tp->snd_cwnd < tp->snd_wnd) { - tp->t_bytes_acked += acked; - if (tp->t_bytes_acked > tp->snd_cwnd) { - tp->t_bytes_acked -= tp->snd_cwnd; - tp->snd_cwnd += tp->t_maxseg; - } - } + if (CC_ALGO(tp)->inseq_ack_rcvd != NULL) + CC_ALGO(tp)->inseq_ack_rcvd(tp, th); + + DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, struct tcphdr *, th, + int32_t, TCP_CC_INSEQ_ACK_RCVD); + sbdrop(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) @@ -1583,13 +1921,14 @@ tcp_input(m, off0) if (tp->snd_una == tp->snd_max) tp->t_timer[TCPT_REXMT] = 0; else if (tp->t_timer[TCPT_PERSIST] == 0) - tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); sowwakeup(so); /* has to be done with socket lock held */ if ((so->so_snd.sb_cc) || (tp->t_flags & TF_ACKNOW)) { - tp->t_unacksegs = 0; (void) tcp_output(tp); } + + tcp_check_timer_state(tp); tcp_unlock(so, 1, 0); KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0); return; @@ -1619,10 +1958,15 @@ 
tcp_input(m, off0) tp->rcv_up = tp->rcv_nxt; tcpstat.tcps_rcvpack++; tcpstat.tcps_rcvbyte += tlen; + if (nstat_collect) { + locked_add_64(&inp->inp_stat->rxpackets, 1); + locked_add_64(&inp->inp_stat->rxbytes, tlen); + } ND6_HINT(tp); /* some progress has been done */ /* * Add data to socket buffer. */ + so_recv_data_stat(so, m, 0); m_adj(m, drop_hdrlen); /* delayed header drop */ if (sbappendstream(&so->so_rcv, m)) sorwakeup(so); @@ -1639,14 +1983,17 @@ tcp_input(m, off0) (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)), th->th_seq, th->th_ack, th->th_win); } - if (DELAY_ACK(tp)) { - tp->t_flags |= TF_DELACK; + if (DELAY_ACK(tp, th)) { + if ((tp->t_flags & TF_DELACK) == 0) { + tp->t_flags |= TF_DELACK; + tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack); + } tp->t_unacksegs++; } else { - tp->t_unacksegs = 0; tp->t_flags |= TF_ACKNOW; tcp_output(tp); } + tcp_check_timer_state(tp); tcp_unlock(so, 1, 0); KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0); return; @@ -1659,9 +2006,8 @@ tcp_input(m, off0) * Receive window is amount of space in rcv queue, * but not less than advertised window. 
*/ -#if 1 - lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); -#endif + lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); + { int win; win = tcp_sbspace(tp); @@ -1692,9 +2038,7 @@ tcp_input(m, off0) register struct sockaddr_in6 *sin6; #endif -#if 1 - lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); -#endif + lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); #if INET6 if (isipv6) { MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6, @@ -1719,9 +2063,7 @@ tcp_input(m, off0) } else #endif { -#if 0 - lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); -#endif + lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, M_NOWAIT); if (sin == NULL) @@ -1734,7 +2076,7 @@ tcp_input(m, off0) laddr = inp->inp_laddr; if (inp->inp_laddr.s_addr == INADDR_ANY) inp->inp_laddr = ip->ip_dst; - if (in_pcbconnect(inp, (struct sockaddr *)sin, proc0)) { + if (in_pcbconnect(inp, (struct sockaddr *)sin, proc0, NULL)) { inp->inp_laddr = laddr; FREE(sin, M_SONAME); goto drop; @@ -1768,9 +2110,15 @@ tcp_input(m, off0) tp->snd_wnd = tiwin; /* initial send-window */ tp->t_flags |= TF_ACKNOW; tp->t_unacksegs = 0; + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED); tp->t_state = TCPS_SYN_RECEIVED; - tp->t_timer[TCPT_KEEP] = tp->t_keepinit ? tp->t_keepinit : tcp_keepinit; + tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, + tp->t_keepinit ? 
tp->t_keepinit : tcp_keepinit); dropsocket = 0; /* committed to socket */ + + /* reset the incomp processing flag */ + so->so_flags &= ~(SOF_INCOMP_INPROGRESS); tcpstat.tcps_accepts++; if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE | TH_CWR)) { /* ECN-setup SYN */ @@ -1860,13 +2208,15 @@ tcp_input(m, off0) * If there's data, delay ACK; if there's also a FIN * ACKNOW will be turned on later. */ - if (DELAY_ACK(tp) && tlen != 0) { - tp->t_flags |= TF_DELACK; + if (DELAY_ACK(tp, th) && tlen != 0) { + if ((tp->t_flags & TF_DELACK) == 0) { + tp->t_flags |= TF_DELACK; + tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack); + } tp->t_unacksegs++; } else { tp->t_flags |= TF_ACKNOW; - tp->t_unacksegs = 0; } /* * Received in SYN_SENT[*] state. @@ -1874,30 +2224,35 @@ tcp_input(m, off0) * SYN_SENT --> ESTABLISHED * SYN_SENT* --> FIN_WAIT_1 */ - tp->t_starttime = 0; + tp->t_starttime = tcp_now; if (tp->t_flags & TF_NEEDFIN) { + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_1); tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_ESTABLISHED); tp->t_state = TCPS_ESTABLISHED; - tp->t_timer[TCPT_KEEP] = TCP_KEEPIDLE(tp); + tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, TCP_KEEPIDLE(tp)); + if (nstat_collect) + nstat_route_connect_success(tp->t_inpcb->inp_route.ro_rt); } - /* soisconnected may lead to socket_unlock in case of upcalls, - * make sure this is done when everything is setup. - */ - soisconnected(so); + isconnected = TRUE; } else { - /* - * Received initial SYN in SYN-SENT[*] state => simul- - * taneous open. If segment contains CC option and there is - * a cached CC, apply TAO test; if it succeeds, connection is - * half-synchronized. 
Otherwise, do 3-way handshake: - * SYN-SENT -> SYN-RECEIVED - * SYN-SENT* -> SYN-RECEIVED* - */ + /* + * Received initial SYN in SYN-SENT[*] state => simul- + * taneous open. If segment contains CC option and there is + * a cached CC, apply TAO test; if it succeeds, connection is + * half-synchronized. Otherwise, do 3-way handshake: + * SYN-SENT -> SYN-RECEIVED + * SYN-SENT* -> SYN-RECEIVED* + */ tp->t_flags |= TF_ACKNOW; tp->t_timer[TCPT_REXMT] = 0; + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED); tp->t_state = TCPS_SYN_RECEIVED; } @@ -2044,6 +2399,8 @@ tcp_input(m, off0) so->so_error = ECONNRESET; close: postevent(so, 0, EV_RESET); + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_CLOSED); tp->t_state = TCPS_CLOSED; tcpstat.tcps_drops++; tp = tcp_close(tp); @@ -2061,10 +2418,6 @@ tcp_input(m, off0) goto drop; } -#if 0 - lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); -#endif - /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. @@ -2090,6 +2443,12 @@ tcp_input(m, off0) tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += tlen; tcpstat.tcps_pawsdrop++; + if (nstat_collect) { + nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1, tlen, NSTAT_RX_FLAG_DUPLICATE); + locked_add_64(&inp->inp_stat->rxpackets, 1); + locked_add_64(&inp->inp_stat->rxbytes, tlen); + tp->t_stat.rxduplicatebytes += tlen; + } if (tlen) goto dropafterack; goto drop; @@ -2136,7 +2495,6 @@ tcp_input(m, off0) * But keep on processing for RST or ACK. 
*/ tp->t_flags |= TF_ACKNOW; - tp->t_unacksegs = 0; todrop = tlen; tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += todrop; @@ -2144,6 +2502,12 @@ tcp_input(m, off0) tcpstat.tcps_rcvpartduppack++; tcpstat.tcps_rcvpartdupbyte += todrop; } + if (nstat_collect) { + nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1, todrop, NSTAT_RX_FLAG_DUPLICATE); + locked_add_64(&inp->inp_stat->rxpackets, 1); + locked_add_64(&inp->inp_stat->rxbytes, todrop); + tp->t_stat.rxduplicatebytes += todrop; + } drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; @@ -2199,7 +2563,6 @@ tcp_input(m, off0) */ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; - tp->t_unacksegs = 0; tcpstat.tcps_rcvwinprobe++; } else goto dropafterack; @@ -2281,19 +2644,27 @@ tcp_input(m, off0) (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->snd_scale = tp->requested_s_scale; tp->rcv_scale = tp->request_r_scale; + tp->snd_wnd = th->th_win << tp->snd_scale; + tiwin = tp->snd_wnd; } /* * Make transitions: * SYN-RECEIVED -> ESTABLISHED * SYN-RECEIVED* -> FIN-WAIT-1 */ - tp->t_starttime = 0; + tp->t_starttime = tcp_now; if (tp->t_flags & TF_NEEDFIN) { + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_1); tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; } else { + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_ESTABLISHED); tp->t_state = TCPS_ESTABLISHED; - tp->t_timer[TCPT_KEEP] = TCP_KEEPIDLE(tp); + tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, TCP_KEEPIDLE(tp)); + if (nstat_collect) + nstat_route_connect_success(tp->t_inpcb->inp_route.ro_rt); } /* * If segment contains data or ACK, will call tcp_reass() @@ -2306,10 +2677,7 @@ tcp_input(m, off0) /* FALLTHROUGH */ - /* soisconnected may lead to socket_unlock in case of upcalls, - * make sure this is done when everything is setup. 
- */ - soisconnected(so); + isconnected = TRUE; /* * In ESTABLISHED state: drop duplicate ACKs; ACK out of range @@ -2364,8 +2732,7 @@ tcp_input(m, off0) th->th_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || - ((tcp_do_newreno || tp->sack_enable) && - IN_FASTRECOVERY(tp))) { + IN_FASTRECOVERY(tp)) { if (tp->sack_enable && IN_FASTRECOVERY(tp)) { int awnd; @@ -2384,12 +2751,15 @@ tcp_input(m, off0) } } else tp->snd_cwnd += tp->t_maxseg; - tp->t_unacksegs = 0; + + DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, struct tcphdr *, th, + int32_t, TCP_CC_IN_FASTRECOVERY); + (void) tcp_output(tp); goto drop; } else if (tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; - u_int win; /* * If we're doing sack, check to @@ -2403,18 +2773,21 @@ tcp_input(m, off0) tp->t_dupacks = 0; break; } - } else if (tcp_do_newreno) { + } else { if (SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; break; } } - win = min(tp->snd_wnd, tp->snd_cwnd) / - 2 / tp->t_maxseg; - if (win < 2) - win = 2; - tp->snd_ssthresh = win * tp->t_maxseg; + + /* + * If the current tcp cc module has + * defined a hook for tasks to run + * before entering FR, call it + */ + if (CC_ALGO(tp)->pre_fr != NULL) + CC_ALGO(tp)->pre_fr(tp, th); ENTER_FASTRECOVERY(tp); tp->snd_recover = tp->snd_max; tp->t_timer[TCPT_REXMT] = 0; @@ -2424,18 +2797,24 @@ tcp_input(m, off0) tcpstat.tcps_sack_recovery_episode++; tp->sack_newdata = tp->snd_nxt; tp->snd_cwnd = tp->t_maxseg; - tp->t_unacksegs = 0; + + DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, struct tcphdr *, th, + int32_t, TCP_CC_ENTER_FASTRECOVERY); + (void) tcp_output(tp); goto drop; } tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; - tp->t_unacksegs = 0; (void) tcp_output(tp); tp->snd_cwnd = tp->snd_ssthresh + tp->t_maxseg * tp->t_dupacks; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; + DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, 
struct tcphdr *, th, + int32_t, TCP_CC_ENTER_FASTRECOVERY); goto drop; } } else @@ -2446,69 +2825,33 @@ tcp_input(m, off0) * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ - if (!IN_FASTRECOVERY(tp)) { + if (IN_FASTRECOVERY(tp)) { + if (SEQ_LT(th->th_ack, tp->snd_recover)) { + if (tp->sack_enable) + tcp_sack_partialack(tp, th); + else + tcp_newreno_partial_ack(tp, th); + + DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, struct tcphdr *, th, + int32_t, TCP_CC_PARTIAL_ACK); + } else { + EXIT_FASTRECOVERY(tp); + if (CC_ALGO(tp)->post_fr != NULL) + CC_ALGO(tp)->post_fr(tp, th); + tp->t_dupacks = 0; + + DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, struct tcphdr *, th, + int32_t, TCP_CC_EXIT_FASTRECOVERY); + } + } else { /* - * We were not in fast recovery. Reset the duplicate ack + * We were not in fast recovery. Reset the duplicate ack * counter. */ tp->t_dupacks = 0; } - /* - * If the congestion window was inflated to account - * for the other side's cached packets, retract it. - */ - else { - if (tcp_do_newreno || tp->sack_enable) { - if (SEQ_LT(th->th_ack, tp->snd_recover)) { - if (tp->sack_enable) - tcp_sack_partialack(tp, th); - else - tcp_newreno_partial_ack(tp, th); - } - else { - if (tcp_do_newreno) { - int32_t ss = tp->snd_max - th->th_ack; - - /* - * Complete ack. Inflate the congestion window to - * ssthresh and exit fast recovery. - * - * Window inflation should have left us with approx. - * snd_ssthresh outstanding data. But in case we - * would be inclined to send a burst, better to do - * it via the slow start mechanism. - */ - if (ss < tp->snd_ssthresh) - tp->snd_cwnd = ss + tp->t_maxseg; - else - tp->snd_cwnd = tp->snd_ssthresh; - } - else { - /* - * Clamp the congestion window to the crossover point - * and exit fast recovery. 
- */ - if (tp->snd_cwnd > tp->snd_ssthresh) - tp->snd_cwnd = tp->snd_ssthresh; - } - - EXIT_FASTRECOVERY(tp); - tp->t_dupacks = 0; - tp->t_bytes_acked = 0; - } - } - else { - /* - * Clamp the congestion window to the crossover point - * and exit fast recovery in non-newreno and non-SACK case. - */ - if (tp->snd_cwnd > tp->snd_ssthresh) - tp->snd_cwnd = tp->snd_ssthresh; - EXIT_FASTRECOVERY(tp); - tp->t_dupacks = 0; - tp->t_bytes_acked = 0; - } - } /* @@ -2545,7 +2888,8 @@ tcp_input(m, off0) * original cwnd and ssthresh, and proceed to transmit where * we left off. */ - if (tp->t_rxtshift == 1 && tcp_now < tp->t_badrxtwin) { + if (tp->t_rxtshift == 1 && + TSTMP_LT(tcp_now, tp->t_badrxtwin)) { ++tcpstat.tcps_sndrexmitbad; tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; @@ -2554,6 +2898,12 @@ tcp_input(m, off0) ENTER_FASTRECOVERY(tp); tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; /* XXX probably not required */ + tp->t_rxtshift = 0; + tp->rxt_start = 0; + + DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, struct tcphdr *, th, + int32_t, TCP_CC_BAD_REXMT_RECOVERY); } /* @@ -2571,14 +2921,11 @@ tcp_input(m, off0) * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. 
*/ - if (((to.to_flags & TOF_TS) != 0) && (to.to_tsecr != 0)) { - if (!tp->t_rttlow || tp->t_rttlow > tcp_now - to.to_tsecr) - tp->t_rttlow = tcp_now - to.to_tsecr; + if (((to.to_flags & TOF_TS) != 0) && (to.to_tsecr != 0) && + TSTMP_GEQ(tcp_now, to.to_tsecr)) { tcp_xmit_timer(tp, tcp_now - to.to_tsecr); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { - if (!tp->t_rttlow || tp->t_rttlow > tcp_now - tp->t_rtttime) - tp->t_rttlow = tcp_now - tp->t_rtttime; - tcp_xmit_timer(tp, tp->t_rtttime); + tcp_xmit_timer(tp, tcp_now - tp->t_rtttime); } /* @@ -2591,7 +2938,7 @@ tcp_input(m, off0) tp->t_timer[TCPT_REXMT] = 0; needoutput = 1; } else if (tp->t_timer[TCPT_PERSIST] == 0) - tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); /* * If no data (only SYN) was ACK'd, @@ -2600,78 +2947,35 @@ tcp_input(m, off0) if (acked == 0) goto step6; - /* - * When new data is acked, open the congestion window. - */ if ((thflags & TH_ECE) != 0 && (tp->ecn_flags & TE_SETUPSENT) != 0) { /* * Reduce the congestion window if we haven't done so. */ - if (!(tp->sack_enable && IN_FASTRECOVERY(tp)) && - !(tcp_do_newreno && SEQ_LEQ(th->th_ack, tp->snd_recover))) { - tcp_reduce_congestion_window(tp); + if (!tp->sack_enable && !IN_FASTRECOVERY(tp) && + SEQ_GEQ(th->th_ack, tp->snd_recover)) { + tcp_reduce_congestion_window(tp, th); + DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, struct tcphdr *, th, + int32_t, TCP_CC_ECN_RCVD); } - } else if ((!tcp_do_newreno && !tp->sack_enable) || - !IN_FASTRECOVERY(tp)) { - /* - * RFC 3465 - Appropriate Byte Counting. - * - * If the window is currently less than ssthresh, - * open the window by the number of bytes ACKed by - * the last ACK, however clamp the window increase - * to an upper limit "L". - * - * In congestion avoidance phase, open the window by - * one segment each time "bytes_acked" grows to be - * greater than or equal to the congestion window. 
- */ - - register u_int cw = tp->snd_cwnd; - register u_int incr = tp->t_maxseg; - - if (tcp_do_rfc3465) { - - if (cw >= tp->snd_ssthresh) { - tp->t_bytes_acked += acked; - if (tp->t_bytes_acked >= cw) { - /* Time to increase the window. */ - tp->t_bytes_acked -= cw; - } else { - /* No need to increase yet. */ - incr = 0; - } - } else { - /* - * If the user explicitly enables RFC3465 - * use 2*SMSS for the "L" param. Otherwise - * use the more conservative 1*SMSS. - * - * (See RFC 3465 2.3 Choosing the Limit) - */ - u_int abc_lim; - - abc_lim = (tcp_do_rfc3465_lim2 && - tp->snd_nxt == tp->snd_max) ? incr * 2 : incr; - - incr = lmin(acked, abc_lim); - } - } - else { - /* - * If the window gives us less than ssthresh packets - * in flight, open exponentially (segsz per packet). - * Otherwise open linearly: segsz per window - * (segsz^2 / cwnd per packet). - */ - - if (cw >= tp->snd_ssthresh) { - incr = max((incr * incr / cw), 1); - } - } - + } - tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale); + /* + * When new data is acked, open the congestion window. + * The specifics of how this is achieved are up to the + * congestion control algorithm in use for this connection. + * + * The calculations in this function assume that snd_una is + * not updated yet.
+ */ + if (!IN_FASTRECOVERY(tp)) { + if (CC_ALGO(tp)->ack_rcvd != NULL) + CC_ALGO(tp)->ack_rcvd(tp, th); + + DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, struct tcphdr *, th, + int32_t, TCP_CC_ACK_RCVD); } if (acked > so->so_snd.sb_cc) { tp->snd_wnd -= so->so_snd.sb_cc; @@ -2683,15 +2987,15 @@ tcp_input(m, off0) ourfinisacked = 0; } /* detect una wraparound */ - if ((tcp_do_newreno || tp->sack_enable) && - !IN_FASTRECOVERY(tp) && + if ( !IN_FASTRECOVERY(tp) && SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; - if ((tcp_do_newreno || tp->sack_enable) && - IN_FASTRECOVERY(tp) && + + if (IN_FASTRECOVERY(tp) && SEQ_GEQ(th->th_ack, tp->snd_recover)) EXIT_FASTRECOVERY(tp); + tp->snd_una = th->th_ack; if (tp->sack_enable) { if (SEQ_GT(tp->snd_una, tp->snd_recover)) @@ -2723,10 +3027,12 @@ tcp_input(m, off0) * we'll hang forever. */ if (so->so_state & SS_CANTRCVMORE) { - tp->t_timer[TCPT_2MSL] = tcp_maxidle; - add_to_time_wait(tp); - soisdisconnected(so); + add_to_time_wait(tp, tcp_maxidle); + isconnected = FALSE; + isdisconnected = TRUE; } + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_2); tp->t_state = TCPS_FIN_WAIT_2; /* fall through and make sure we also recognize data ACKed with the FIN */ } @@ -2741,17 +3047,18 @@ tcp_input(m, off0) */ case TCPS_CLOSING: if (ourfinisacked) { + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_TIME_WAIT); tp->t_state = TCPS_TIME_WAIT; tcp_canceltimers(tp); /* Shorten TIME_WAIT [RFC-1644, p.28] */ if (tp->cc_recv != 0 && - tp->t_starttime < (u_int32_t)tcp_msl) - tp->t_timer[TCPT_2MSL] = - tp->t_rxtcur * TCPTV_TWTRUNC; + ((int)(tcp_now - tp->t_starttime)) < tcp_msl) + add_to_time_wait(tp, tp->t_rxtcur * TCPTV_TWTRUNC); else - tp->t_timer[TCPT_2MSL] = 2 * tcp_msl; - add_to_time_wait(tp); - soisdisconnected(so); + add_to_time_wait(tp, 2 * 
tcp_msl); + isconnected = FALSE; + isdisconnected = TRUE; } tp->t_flags |= TF_ACKNOW; break; @@ -2775,8 +3082,7 @@ tcp_input(m, off0) * it and restart the finack timer. */ case TCPS_TIME_WAIT: - tp->t_timer[TCPT_2MSL] = 2 * tcp_msl; - add_to_time_wait(tp); + add_to_time_wait(tp, 2 * tcp_msl); goto dropafterack; } } @@ -2856,7 +3162,7 @@ tcp_input(m, off0) ) tcp_pulloutofband(so, th, m, drop_hdrlen); /* hdr drop is delayed */ - } else + } else { /* * If no out of band data is expected, * pull receive urgent pointer along @@ -2864,8 +3170,27 @@ tcp_input(m, off0) */ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; -dodata: /* XXX */ + } +dodata: + /* Set socket's connect or disconnect state correctly before doing data. + * The following might unlock the socket if there is an upcall or a socket + * filter. + */ + if (isconnected) { + soisconnected(so); + } else if (isdisconnected) { + soisdisconnected(so); + } + + /* Let's check the state of pcb just to make sure that it did not get closed + * when we unlocked above + */ + if (inp->inp_state == INPCB_STATE_DEAD) { + /* Just drop the packet that we are processing and return */ + goto drop; + } + /* * Process the segment text, merging it into the TCP sequencing queue, * and arranging for acknowledgment of receipt if necessary.
@@ -2894,25 +3219,31 @@ tcp_input(m, off0) if (th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq) && TCPS_HAVEESTABLISHED(tp->t_state)) { - if (DELAY_ACK(tp) && ((tp->t_flags & TF_ACKNOW) == 0)) { - tp->t_flags |= TF_DELACK; + if (DELAY_ACK(tp, th) && ((tp->t_flags & TF_ACKNOW) == 0)) { + if ((tp->t_flags & TF_DELACK) == 0) { + tp->t_flags |= TF_DELACK; + tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack); + } tp->t_unacksegs++; } else { - tp->t_unacksegs = 0; tp->t_flags |= TF_ACKNOW; } tp->rcv_nxt += tlen; thflags = th->th_flags & TH_FIN; tcpstat.tcps_rcvpack++; tcpstat.tcps_rcvbyte += tlen; + if (nstat_collect) { + locked_add_64(&inp->inp_stat->rxpackets, 1); + locked_add_64(&inp->inp_stat->rxbytes, tlen); + } ND6_HINT(tp); + so_recv_data_stat(so, m, drop_hdrlen); if (sbappendstream(&so->so_rcv, m)) sorwakeup(so); } else { thflags = tcp_reass(tp, th, &tlen, m); tp->t_flags |= TF_ACKNOW; - tp->t_unacksegs = 0; } if (tlen > 0 && tp->sack_enable) @@ -2965,13 +3296,15 @@ tcp_input(m, off0) * Otherwise, since we received a FIN then no * more input can be expected, send ACK now. */ - if (DELAY_ACK(tp) && (tp->t_flags & TF_NEEDSYN)) { - tp->t_flags |= TF_DELACK; + if (DELAY_ACK(tp, th) && (tp->t_flags & TF_NEEDSYN)) { + if ((tp->t_flags & TF_DELACK) == 0) { + tp->t_flags |= TF_DELACK; + tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack); + } tp->t_unacksegs++; } else { tp->t_flags |= TF_ACKNOW; - tp->t_unacksegs = 0; } tp->rcv_nxt++; } @@ -2982,8 +3315,10 @@ tcp_input(m, off0) * enter the CLOSE_WAIT state. */ case TCPS_SYN_RECEIVED: - tp->t_starttime = 0; + tp->t_starttime = tcp_now; case TCPS_ESTABLISHED: + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_CLOSE_WAIT); tp->t_state = TCPS_CLOSE_WAIT; break; @@ -2992,6 +3327,8 @@ tcp_input(m, off0) * enter the CLOSING state. 
*/ case TCPS_FIN_WAIT_1: + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_CLOSING); tp->t_state = TCPS_CLOSING; break; @@ -3001,21 +3338,20 @@ tcp_input(m, off0) * standard timers. */ case TCPS_FIN_WAIT_2: + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_TIME_WAIT); tp->t_state = TCPS_TIME_WAIT; tcp_canceltimers(tp); /* Shorten TIME_WAIT [RFC-1644, p.28] */ if (tp->cc_recv != 0 && - tp->t_starttime < (u_int32_t)tcp_msl) { - tp->t_timer[TCPT_2MSL] = - tp->t_rxtcur * TCPTV_TWTRUNC; + ((int)(tcp_now - tp->t_starttime)) < tcp_msl) { + add_to_time_wait(tp, tp->t_rxtcur * TCPTV_TWTRUNC); /* For transaction client, force ACK now. */ tp->t_flags |= TF_ACKNOW; tp->t_unacksegs = 0; } else - tp->t_timer[TCPT_2MSL] = 2 * tcp_msl; - - add_to_time_wait(tp); + add_to_time_wait(tp, 2 * tcp_msl); soisdisconnected(so); break; @@ -3023,8 +3359,7 @@ tcp_input(m, off0) * In TIME_WAIT state restart the 2 MSL time_wait timer. */ case TCPS_TIME_WAIT: - tp->t_timer[TCPT_2MSL] = 2 * tcp_msl; - add_to_time_wait(tp); + add_to_time_wait(tp, 2 * tcp_msl); break; } } @@ -3038,9 +3373,12 @@ tcp_input(m, off0) * Return any desired output. 
*/ if (needoutput || (tp->t_flags & TF_ACKNOW)) { - tp->t_unacksegs = 0; (void) tcp_output(tp); } + + tcp_check_timer_state(tp); + + tcp_unlock(so, 1, 0); KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0); return; @@ -3074,8 +3412,9 @@ tcp_input(m, off0) #endif m_freem(m); tp->t_flags |= TF_ACKNOW; - tp->t_unacksegs = 0; (void) tcp_output(tp); + + /* Don't need to check timer state as we should have done it during tcp_output */ tcp_unlock(so, 1, 0); KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0); return; @@ -3119,22 +3458,22 @@ tcp_input(m, off0) if (thflags & TH_ACK) /* mtod() below is safe as long as hdr dropping is delayed */ tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, - TH_RST, ifscope); + TH_RST, ifscope, nocell); else { if (thflags & TH_SYN) tlen++; /* mtod() below is safe as long as hdr dropping is delayed */ tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, - (tcp_seq)0, TH_RST|TH_ACK, ifscope); + (tcp_seq)0, TH_RST|TH_ACK, ifscope, nocell); } /* destroy temporarily created socket */ if (dropsocket) { (void) soabort(so); tcp_unlock(so, 1, 0); } - else - if ((inp != NULL) && (nosock == 0)) - tcp_unlock(so, 1, 0); + else if ((inp != NULL) && (nosock == 0)) { + tcp_unlock(so, 1, 0); + } KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0); return; dropnosock: @@ -3154,9 +3493,9 @@ tcp_input(m, off0) (void) soabort(so); tcp_unlock(so, 1, 0); } - else - if (nosock == 0) - tcp_unlock(so, 1, 0); + else if (nosock == 0) { + tcp_unlock(so, 1, 0); + } KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0); return; } @@ -3301,6 +3640,36 @@ tcp_pulloutofband(so, th, m, off) panic("tcp_pulloutofband"); } +uint32_t +get_base_rtt(struct tcpcb *tp) +{ + uint32_t base_rtt = 0, i; + for (i = 0; i < N_RTT_BASE; ++i) { + if (tp->rtt_hist[i] != 0 && + (base_rtt == 0 || tp->rtt_hist[i] < base_rtt)) + base_rtt = tp->rtt_hist[i]; + } + return base_rtt; +} + +/* Each value of RTT base represents the minimum RTT seen in a minute. 
+ * We keep up to N_RTT_BASE minutes worth of history. + */ +void +update_base_rtt(struct tcpcb *tp, uint32_t rtt) +{ + if (++tp->rtt_count >= rtt_samples_per_slot) { + int i=0; + for (i = (N_RTT_BASE-1); i > 0; --i) { + tp->rtt_hist[i] = tp->rtt_hist[i-1]; + } + tp->rtt_hist[0] = rtt; + tp->rtt_count = 0; + } else { + tp->rtt_hist[0] = min(tp->rtt_hist[0], rtt); + } +} + /* * Collect new round-trip time estimate * and update averages and current timeout. @@ -3314,15 +3683,26 @@ tcp_xmit_timer(tp, rtt) tcpstat.tcps_rttupdated++; tp->t_rttupdated++; + + if (rtt > 0) { + tp->t_rttcur = rtt; + update_base_rtt(tp, rtt); + } + if (tp->t_srtt != 0) { /* * srtt is stored as fixed point with 5 bits after the - * binary point (i.e., scaled by 8). The following magic + * binary point (i.e., scaled by 32). The following magic * is equivalent to the smoothing algorithm in rfc793 with * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed - * point). Adjust rtt to origin 0. + * point). + * + * FreeBSD adjusts rtt to origin 0 by subtracting 1 from the provided + * rtt value. This was required because of the way t_rtttime was + * initialised to 1 before. Since we changed t_rtttime to be based on + * tcp_now, this extra adjustment is not needed. */ - delta = ((rtt - 1) << TCP_DELTA_SHIFT) + delta = (rtt << TCP_DELTA_SHIFT) - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); if ((tp->t_srtt += delta) <= 0) @@ -3355,8 +3735,10 @@ tcp_xmit_timer(tp, rtt) tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } + nstat_route_rtt(tp->t_inpcb->inp_route.ro_rt, tp->t_srtt, tp->t_rttvar); tp->t_rtttime = 0; tp->t_rxtshift = 0; + tp->rxt_start = 0; /* * the retransmit should happen at rtt + 4 * rttvar. @@ -3370,7 +3752,8 @@ tcp_xmit_timer(tp, rtt) * the minimum feasible timer (which is 2 ticks).
*/ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), - max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); + max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX, + TCP_ADD_REXMTSLOP(tp)); /* * We received an ack for a packet that wasn't retransmitted; @@ -3471,12 +3854,14 @@ tcp_mss(tp, offer, input_ifscope) #if INET6 if (isipv6) { - rt = tcp_rtlookup6(inp); + rt = tcp_rtlookup6(inp, input_ifscope); if (rt != NULL && (IN6_IS_ADDR_LOOPBACK(&inp->in6p_faddr) || IN6_IS_ADDR_LINKLOCAL(&inp->in6p_faddr) || - rt->rt_gateway->sa_family == AF_LINK)) - isnetlocal = TRUE; + rt->rt_gateway->sa_family == AF_LINK || + in6_localaddr(&inp->in6p_faddr))) { + tp->t_flags |= TF_LOCAL; + } } else #endif /* INET6 */ @@ -3484,9 +3869,13 @@ tcp_mss(tp, offer, input_ifscope) rt = tcp_rtlookup(inp, input_ifscope); if (rt != NULL && (rt->rt_gateway->sa_family == AF_LINK || - rt->rt_ifp->if_flags & IFF_LOOPBACK)) - isnetlocal = TRUE; + rt->rt_ifp->if_flags & IFF_LOOPBACK || + in_localaddr(inp->inp_faddr))) { + tp->t_flags |= TF_LOCAL; + } } + isnetlocal = (tp->t_flags & TF_LOCAL); + if (rt == NULL) { tp->t_maxopd = tp->t_maxseg = #if INET6 @@ -3554,7 +3943,7 @@ tcp_mss(tp, offer, input_ifscope) if (rt->rt_rmx.rmx_locks & RTV_RTT) tp->t_rttmin = rtt / (RTM_RTTUNIT / TCP_RETRANSHZ); else - tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCP_RETRANSHZ; + tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCPTV_REXMTMIN; tp->t_srtt = rtt / (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE)); tcpstat.tcps_usedrtt++; if (rt->rt_rmx.rmx_rttvar) { @@ -3568,10 +3957,11 @@ tcp_mss(tp, offer, input_ifscope) } TCPT_RANGESET(tp->t_rxtcur, ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, - tp->t_rttmin, TCPTV_REXMTMAX); + tp->t_rttmin, TCPTV_REXMTMAX, + TCP_ADD_REXMTSLOP(tp)); } else - tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCP_RETRANSHZ; + tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCPTV_REXMTMIN; #if INET6 mss = (isipv6 ? 
tcp_maxmtu6(rt) : tcp_maxmtu(rt)); @@ -3651,14 +4041,7 @@ tcp_mss(tp, offer, input_ifscope) (void)sbreserve(&so->so_rcv, bufsize); } - /* - * Set the slow-start flight size depending on whether this - * is a local network or not. - */ - if (isnetlocal) - tp->snd_cwnd = mss * ss_fltsz_local; - else - tp->snd_cwnd = mss * ss_fltsz; + set_tcp_stream_priority(so); if (rt->rt_rmx.rmx_ssthresh) { /* @@ -3673,6 +4056,17 @@ tcp_mss(tp, offer, input_ifscope) tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; } + + /* + * Set the slow-start flight size depending on whether this + * is a local network or not. + */ + if (CC_ALGO(tp)->cwnd_init != NULL) + CC_ALGO(tp)->cwnd_init(tp); + + DTRACE_TCP5(cc, void, NULL, struct inpcb *, tp->t_inpcb, struct tcpcb *, tp, + struct tcphdr *, NULL, int32_t, TCP_CC_CWND_INIT); + /* Route locked during lookup above */ RT_UNLOCK(rt); } @@ -3701,7 +4095,7 @@ tcp_mssopt(tp) #if INET6 if (isipv6) - rt = tcp_rtlookup6(tp->t_inpcb); + rt = tcp_rtlookup6(tp->t_inpcb, IFSCOPE_NONE); else #endif /* INET6 */ rt = tcp_rtlookup(tp->t_inpcb, IFSCOPE_NONE); @@ -3736,7 +4130,7 @@ tcp_mssopt(tp) /* * On a partial ack arrives, force the retransmission of the * next unacknowledged segment. Do not clear tp->t_dupacks. - * By setting snd_nxt to ti_ack, this forces retransmission timer to + * By setting snd_nxt to th_ack, this forces retransmission timer to * be started again. */ static void @@ -3756,7 +4150,6 @@ tcp_newreno_partial_ack(tp, th) */ tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); tp->t_flags |= TF_ACKNOW; - tp->t_unacksegs = 0; (void) tcp_output(tp); tp->snd_cwnd = ocwnd; if (SEQ_GT(onxt, tp->snd_nxt)) @@ -3828,11 +4221,13 @@ tcp_dropdropablreq(struct socket *head) * and being dropped by another input thread. * If we can't get a hold on this mutex, then grab the next socket in line. 
*/ - if (lck_mtx_try_lock(inp->inpcb_mtx)) { + if (lck_mtx_try_lock(&inp->inpcb_mtx)) { so->so_usecount++; - if ((so->so_usecount == 2) && so->so_state & SS_INCOMP) + if ((so->so_usecount == 2) && + (so->so_state & SS_INCOMP) != 0 && + (so->so_flags & SOF_INCOMP_INPROGRESS) == 0) break; - else {/* don't use if beeing accepted or used in any other way */ + else {/* don't use if being accepted or used in any other way */ in_pcb_checkstate(inp, WNT_RELEASE, 1); tcp_unlock(so, 1, 0); } @@ -3851,44 +4246,120 @@ tcp_dropdropablreq(struct socket *head) if (!so) return 0; - TAILQ_REMOVE(&head->so_incomp, so, so_list); - tcp_unlock(head, 0, 0); - /* Makes sure socket is still in the right state to be discarded */ if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { tcp_unlock(so, 1, 0); - tcp_lock(head, 0, 0); return 0; } if (so->so_usecount != 2 || !(so->so_state & SS_INCOMP)) { - /* do not discard: that socket is beeing accepted */ + /* do not discard: that socket is being accepted */ tcp_unlock(so, 1, 0); - tcp_lock(head, 0, 0); return 0; } - so->so_head = NULL; + TAILQ_REMOVE(&head->so_incomp, so, so_list); + tcp_unlock(head, 0, 0); - /* - * We do not want to lose track of the PCB right away in case we receive - * more segments from the peer - */ + lck_mtx_assert(&inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED); tp = sototcpcb(so); so->so_flags |= SOF_OVERFLOW; - tp->t_state = TCPS_TIME_WAIT; - (void) tcp_close(tp); + so->so_head = NULL; + + tcp_close(tp); tp->t_unacksegs = 0; + + if (inp->inp_wantcnt > 0 && inp->inp_wantcnt != WNT_STOPUSING) { + /* Some one has a wantcnt on this pcb. Since WNT_ACQUIRE + * doesn't require a lock, it could have happened while + * we are holding the lock. This pcb will have to + * be garbage collected later. + * Release the reference held for so_incomp queue + */ + so->so_usecount--; + + tcp_unlock(so, 1, 0); + } else { + /* Unlock this socket and leave the reference on. 
We need to + * acquire the pcbinfo lock in order to fully dispose it off + */ + tcp_unlock(so, 0, 0); + + lck_rw_lock_exclusive(tcbinfo.mtx); + + tcp_lock(so, 0, 0); + + /* Release the reference held for so_incomp queue */ + so->so_usecount--; + + if (so->so_usecount != 1 || + (inp->inp_wantcnt > 0 && inp->inp_wantcnt != WNT_STOPUSING)) { + /* There is an extra wantcount or usecount that must + * have been added when the socket was unlocked. This + * socket will have to be garbage collected later + */ + tcp_unlock(so, 1, 0); + } else { + + /* Drop the reference held for this function */ + so->so_usecount--; + + in_pcbdispose(inp); + } + lck_rw_done(tcbinfo.mtx); + } tcpstat.tcps_drops++; - tcp_canceltimers(tp); - add_to_time_wait(tp); - - tcp_unlock(so, 1, 0); + tcp_lock(head, 0, 0); head->so_incqlen--; head->so_qlen--; - return 1; + return(1); +} + +/* Set background congestion control on a socket */ +void +tcp_set_background_cc(struct socket *so) +{ + tcp_set_new_cc(so, TCP_CC_ALGO_BACKGROUND_INDEX); +} + +/* Set foreground congestion control on a socket */ +void +tcp_set_foreground_cc(struct socket *so) +{ + tcp_set_new_cc(so, TCP_CC_ALGO_NEWRENO_INDEX); +} + +static void +tcp_set_new_cc(struct socket *so, uint16_t cc_index) +{ + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp = intotcpcb(inp); + uint16_t old_cc_index = 0; + if (tp->tcp_cc_index != cc_index) { + + old_cc_index = tp->tcp_cc_index; + + if (CC_ALGO(tp)->cleanup != NULL) + CC_ALGO(tp)->cleanup(tp); + tp->tcp_cc_index = cc_index; + + /* Decide if the connection is just starting or if + * we have sent some packets on it. 
+ */ + if (tp->snd_nxt > tp->iss) { + /* Already sent some packets */ + if (CC_ALGO(tp)->switch_to != NULL) + CC_ALGO(tp)->switch_to(tp, old_cc_index); + } else { + if (CC_ALGO(tp)->init != NULL) + CC_ALGO(tp)->init(tp); + } + DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, struct tcphdr *, NULL, + int32_t, TCP_CC_CHANGE_ALGO); + } } static int @@ -3908,7 +4379,7 @@ tcp_getstat SYSCTL_HANDLER_ARGS } -SYSCTL_PROC(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RD, 0, 0, +SYSCTL_PROC(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, tcp_getstat, "S,tcpstat", "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); static int @@ -3936,5 +4407,5 @@ sysctl_rexmtthresh SYSCTL_HANDLER_ARGS return (0); } -SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmt_thresh, CTLTYPE_INT|CTLFLAG_RW, +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmt_thresh, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED, &tcprexmtthresh, 0, &sysctl_rexmtthresh, "I", "Duplicate ACK Threshold for Fast Retransmit"); diff --git a/bsd/netinet/tcp_ledbat.c b/bsd/netinet/tcp_ledbat.c new file mode 100644 index 000000000..5baf28bea --- /dev/null +++ b/bsd/netinet/tcp_ledbat.c @@ -0,0 +1,434 @@ +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. 
+ * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/protosw.h> +#include <sys/mcache.h> +#include <sys/sysctl.h> + +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> + +#if INET6 +#include <netinet/ip6.h> +#endif +#include <netinet/ip_var.h> +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/tcpip.h> +#include <netinet/tcp_cc.h> + +#include <libkern/OSAtomic.h> + +/* This file implements an alternate TCP congestion control algorithm + * for background transport developed by LEDBAT working group at IETF and + * described in draft: draft-ietf-ledbat-congestion-02 + */ + +int tcp_ledbat_init(struct tcpcb *tp); +int tcp_ledbat_cleanup(struct tcpcb *tp); +void tcp_ledbat_cwnd_init(struct tcpcb *tp); +void tcp_ledbat_inseq_ack_rcvd(struct tcpcb *tp, struct tcphdr *th); +void tcp_ledbat_ack_rcvd(struct tcpcb *tp, struct tcphdr *th); +void tcp_ledbat_pre_fr(struct tcpcb *tp, struct tcphdr *th); +void tcp_ledbat_post_fr(struct tcpcb *tp, struct tcphdr *th); +void tcp_ledbat_after_idle(struct tcpcb *tp); +void tcp_ledbat_after_timeout(struct tcpcb *tp); +int tcp_ledbat_delay_ack(struct tcpcb *tp, struct tcphdr *th); +void tcp_ledbat_switch_cc(struct tcpcb *tp, uint16_t old_cc_index); + +struct tcp_cc_algo tcp_cc_ledbat = { + .name = "ledbat", + .init = tcp_ledbat_init, + .cleanup = tcp_ledbat_cleanup, + .cwnd_init = tcp_ledbat_cwnd_init, + .inseq_ack_rcvd = tcp_ledbat_inseq_ack_rcvd, + .ack_rcvd = tcp_ledbat_ack_rcvd, + .pre_fr = tcp_ledbat_pre_fr, + .post_fr = tcp_ledbat_post_fr,
+ .after_idle = tcp_ledbat_after_idle, + .after_timeout = tcp_ledbat_after_timeout, + .delay_ack = tcp_ledbat_delay_ack, + .switch_to = tcp_ledbat_switch_cc +}; + +extern int tcp_do_rfc3465; +extern int tcp_do_rfc3465_lim2; +extern uint32_t get_base_rtt(struct tcpcb *tp); + +/* Target queuing delay in milliseconds. This includes the processing + * and scheduling delay on both of the end-hosts. A LEDBAT sender tries + * to keep queuing delay below this limit. When the queuing delay + * goes above this limit, a LEDBAT sender will start reducing the + * congestion window. + * + * The LEDBAT draft says that target queue delay MUST be 100 ms for + * inter-operability. + */ +int target_qdelay = 100; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, bg_target_qdelay, CTLFLAG_RW | CTLFLAG_LOCKED, + &target_qdelay , 100, "Target queuing delay"); + +/* Allowed increase and tether are used to place an upper bound on + * congestion window based on the amount of data that is outstanding. + * This will limit the congestion window when the amount of data in + * flight is little because the application is writing to the socket + * intermittently and is preventing the connection from becoming idle . + * + * max_allowed_cwnd = allowed_increase + (tether * flight_size) + * cwnd = min(cwnd, max_allowed_cwnd) + * + * 'Allowed_increase' parameter is set to 2. If the flight size is zero, then + * we want the congestion window to be at least 2 packets to reduce the + * delay induced by delayed ack. This helps when the receiver is acking every + * other packet. + * + * 'Tether' is also set to 2. We do not want this to limit the growth of cwnd + * during slow-start. 
+ */ +int allowed_increase = 2; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, bg_allowed_increase, CTLFLAG_RW | CTLFLAG_LOCKED, + &allowed_increase, 1, "Additive constant used to calculate max allowed congestion window"); + +/* Left shift for cwnd to get tether value of 2 */ +int tether_shift = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, bg_tether_shift, CTLFLAG_RW | CTLFLAG_LOCKED, + &tether_shift, 1, "Tether shift for max allowed congestion window"); + +/* Start with an initial window of 2. This will help to get more accurate + * minimum RTT measurement in the beginning. It will help to probe + * the path slowly and will not add to the existing delay if the path is + * already congested. Using 2 packets will reduce the delay induced by delayed-ack. + */ +uint32_t bg_ss_fltsz = 2; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, bg_ss_fltsz, CTLFLAG_RW | CTLFLAG_LOCKED, + &bg_ss_fltsz, 2, "Initial congestion window for background transport"); + +extern int rtt_samples_per_slot; + +static void update_cwnd(struct tcpcb *tp, uint32_t incr) { + uint32_t max_allowed_cwnd = 0, flight_size = 0; + uint32_t qdelay, base_rtt; + int32_t off_target; + + base_rtt = get_base_rtt(tp); + + /* If we do not have a good RTT measurement yet, increment + * congestion window by the default value. + */ + if (base_rtt == 0 || tp->t_rttcur == 0) { + tp->snd_cwnd += incr; + goto check_max; + } + + qdelay = tp->t_rttcur - base_rtt; + off_target = (int32_t)(target_qdelay - qdelay); + + if (off_target >= 0) { + /* Delay decreased or remained the same, we can increase + * the congestion window according to RFC 3465. + * + * Move background slow-start threshold to current + * congestion window so that the next time (after some idle + * period), we can attempt to do slow-start till here if there + * is no increase in rtt + */ + if (tp->bg_ssthresh < tp->snd_cwnd) + tp->bg_ssthresh = tp->snd_cwnd; + tp->snd_cwnd += incr; + + } else { + /* In response to an increase in rtt, reduce the congestion + * window by one-eighth. 
This will help to yield immediately + * to a competing stream. + */ + uint32_t redwin; + + redwin = tp->snd_cwnd >> 3; + tp->snd_cwnd -= redwin; + if (tp->snd_cwnd < bg_ss_fltsz * tp->t_maxseg) + tp->snd_cwnd = bg_ss_fltsz * tp->t_maxseg; + + /* Lower background slow-start threshold so that the connection + * will go into congestion avoidance phase + */ + if (tp->bg_ssthresh > tp->snd_cwnd) + tp->bg_ssthresh = tp->snd_cwnd; + } +check_max: + /* Calculate the outstanding flight size and restrict the + * congestion window to a factor of flight size. + */ + flight_size = tp->snd_max - tp->snd_una; + + max_allowed_cwnd = (allowed_increase * tp->t_maxseg) + + (flight_size << tether_shift); + tp->snd_cwnd = min(tp->snd_cwnd, max_allowed_cwnd); + return; +} + +int tcp_ledbat_init(struct tcpcb *tp) { +#pragma unused(tp) + OSIncrementAtomic((volatile SInt32 *)&tcp_cc_ledbat.num_sockets); + return 0; +} + +int tcp_ledbat_cleanup(struct tcpcb *tp) { +#pragma unused(tp) + OSDecrementAtomic((volatile SInt32 *)&tcp_cc_ledbat.num_sockets); + return 0; +} + +/* Initialize the congestion window for a connection + * + */ + +void +tcp_ledbat_cwnd_init(struct tcpcb *tp) { + tp->snd_cwnd = tp->t_maxseg * bg_ss_fltsz; + tp->bg_ssthresh = tp->snd_ssthresh; +} + +/* Function to handle an in-sequence ack which is fast-path processing + * of an in sequence ack in tcp_input function (called as header prediction). + * This gets called only during congestion avoidance phase. + */ +void +tcp_ledbat_inseq_ack_rcvd(struct tcpcb *tp, struct tcphdr *th) { + int acked = 0; + u_int32_t incr = 0; + + acked = th->th_ack - tp->snd_una; + tp->t_bytes_acked += acked; + if (tp->t_bytes_acked > tp->snd_cwnd) { + tp->t_bytes_acked -= tp->snd_cwnd; + incr = tp->t_maxseg; + } + + if (tp->snd_cwnd < tp->snd_wnd && incr > 0) { + update_cwnd(tp, incr); + } +} +/* Function to process an ack. + */ +void +tcp_ledbat_ack_rcvd(struct tcpcb *tp, struct tcphdr *th) { + /* + * RFC 3465 - Appropriate Byte Counting. 
+ * + * If the window is currently less than ssthresh, + * open the window by the number of bytes ACKed by + * the last ACK, however clamp the window increase + * to an upper limit "L". + * + * In congestion avoidance phase, open the window by + * one segment each time "bytes_acked" grows to be + * greater than or equal to the congestion window. + */ + + register u_int cw = tp->snd_cwnd; + register u_int incr = tp->t_maxseg; + int acked = 0; + + acked = th->th_ack - tp->snd_una; + tp->t_bytes_acked += acked; + if (cw >= tp->bg_ssthresh) { + /* congestion-avoidance */ + if (tp->t_bytes_acked < cw) { + /* No need to increase yet. */ + incr = 0; + } + } else { + /* + * If the user explicitly enables RFC3465 + * use 2*SMSS for the "L" param. Otherwise + * use the more conservative 1*SMSS. + * + * (See RFC 3465 2.3 Choosing the Limit) + */ + u_int abc_lim; + + abc_lim = (tcp_do_rfc3465_lim2 && + tp->snd_nxt == tp->snd_max) ? incr * 2 : incr; + + incr = lmin(acked, abc_lim); + } + if (tp->t_bytes_acked >= cw) + tp->t_bytes_acked -= cw; + if (incr > 0) + update_cwnd(tp, incr); +} + +void +tcp_ledbat_pre_fr(struct tcpcb *tp, struct tcphdr *th) { +#pragma unused(th) + + uint32_t win; + + win = min(tp->snd_wnd, tp->snd_cwnd) / + 2 / tp->t_maxseg; + if ( win < 2 ) + win = 2; + tp->snd_ssthresh = win * tp->t_maxseg; + if (tp->bg_ssthresh > tp->snd_ssthresh) + tp->bg_ssthresh = tp->snd_ssthresh; +} + +void +tcp_ledbat_post_fr(struct tcpcb *tp, struct tcphdr *th) { + int32_t ss; + + ss = tp->snd_max - th->th_ack; + + /* + * Complete ack. Inflate the congestion window to + * ssthresh and exit fast recovery. + * + * Window inflation should have left us with approx. + * snd_ssthresh outstanding data. But in case we + * would be inclined to send a burst, better to do + * it via the slow start mechanism. 
+ */ + if (ss < (int32_t)tp->snd_ssthresh) + tp->snd_cwnd = ss + tp->t_maxseg; + else + tp->snd_cwnd = tp->snd_ssthresh; + tp->t_bytes_acked = 0; +} + +/* + * Function to handle connections that have been idle for + * some time. Slow start to get ack "clock" running again. + * Age the base RTT history by the number of slots spent idle. + */ +void +tcp_ledbat_after_idle(struct tcpcb *tp) { + int32_t n = N_RTT_BASE, i = (N_RTT_BASE - 1); + + /* Decide how many base history entries have to be cleared + * based on how long the connection has been idle (all of them when there is no current RTT sample). + */ + + if (tp->t_rttcur > 0) { + int32_t nrtt, idle_time; + + idle_time = tcp_now - tp->t_rcvtime; + nrtt = idle_time / tp->t_rttcur; + n = nrtt / rtt_samples_per_slot; + if (n > N_RTT_BASE) + n = N_RTT_BASE; + } + for (i = (N_RTT_BASE - 1); n > 0; --i, --n) { /* zero the n highest-index slots */ + tp->rtt_hist[i] = 0; + } + for (n = (N_RTT_BASE - 1); i >= 0 && i < n; --i, --n) { /* shift survivors up; when i == n nothing was dropped, so the history must stay intact */ + tp->rtt_hist[n] = tp->rtt_hist[i]; + tp->rtt_hist[i] = 0; + } + + /* Reset the congestion window */ + tp->snd_cwnd = tp->t_maxseg * bg_ss_fltsz; +} + +/* Function to change the congestion window when the retransmit + * timer fires. The behavior is the same as that for best-effort + * TCP, reduce congestion window to one segment and start probing + * the link using "slow start". The slow start threshold is set + * to half of the current window. Lower the background slow start + * threshold also. + */ +void +tcp_ledbat_after_timeout(struct tcpcb *tp) { + if (tp->t_state >= TCPS_ESTABLISHED) { + u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; + if (win < 2) + win = 2; + tp->snd_cwnd = tp->t_maxseg; + tp->snd_ssthresh = win * tp->t_maxseg; + tp->t_bytes_acked = 0; + tp->t_dupacks = 0; + + if (tp->bg_ssthresh > tp->snd_ssthresh) + tp->bg_ssthresh = tp->snd_ssthresh; + } +} + +/* + * Indicate whether this ack should be delayed. + * We can delay the ack if: + * - our last ack wasn't a 0-sized window.
+ * - the peer hasn't sent us a TH_PUSH data packet: if he did, take this + * as a clue that we need to ACK without any delay. This helps higher + * level protocols who won't send us more data even if the window is + * open because their last "segment" hasn't been ACKed + * Otherwise the receiver will ack every other full-sized segment or when the + * delayed ack timer fires. This will help to generate better rtt estimates for + * the other end if it is a ledbat sender. + * + */ + +int +tcp_ledbat_delay_ack(struct tcpcb *tp, struct tcphdr *th) { + if ((tp->t_flags & TF_RXWIN0SENT) == 0 && + (th->th_flags & TH_PUSH) == 0 && + (tp->t_flags & TF_DELACK) == 0) + return(1); + return(0); +} + +/* Change a connection to use ledbat. First, lower bg_ssthresh value + * if it needs to be. + */ +void +tcp_ledbat_switch_cc(struct tcpcb *tp, uint16_t old_cc_index) { +#pragma unused(old_cc_index) + uint32_t cwnd; + + if (tp->bg_ssthresh == 0 || tp->bg_ssthresh > tp->snd_ssthresh) + tp->bg_ssthresh = tp->snd_ssthresh; + + cwnd = min(tp->snd_wnd, tp->snd_cwnd); + + if (tp->snd_cwnd > tp->bg_ssthresh) + cwnd = cwnd / tp->t_maxseg; + else + cwnd = cwnd / 2 / tp->t_maxseg; + + if (cwnd < bg_ss_fltsz) + cwnd = bg_ss_fltsz; + + tp->snd_cwnd = cwnd * tp->t_maxseg; + tp->t_bytes_acked = 0; + + OSIncrementAtomic((volatile SInt32 *)&tcp_cc_ledbat.num_sockets); +} diff --git a/bsd/netinet/tcp_newreno.c b/bsd/netinet/tcp_newreno.c new file mode 100644 index 000000000..5c9db2de9 --- /dev/null +++ b/bsd/netinet/tcp_newreno.c @@ -0,0 +1,344 @@ +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/protosw.h> + +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> + +#if INET6 +#include <netinet/ip6.h> +#endif +#include <netinet/ip_var.h> +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/tcpip.h> +#include <netinet/tcp_cc.h> +#include <libkern/OSAtomic.h> + +int tcp_newreno_init(struct tcpcb *tp); +int tcp_newreno_cleanup(struct tcpcb *tp); +void tcp_newreno_cwnd_init_or_reset(struct tcpcb *tp); +void tcp_newreno_inseq_ack_rcvd(struct tcpcb *tp, struct tcphdr *th); +void tcp_newreno_ack_rcvd(struct tcpcb *tp, struct tcphdr *th); +void tcp_newreno_pre_fr(struct tcpcb *tp, struct tcphdr *th); +void tcp_newreno_post_fr(struct tcpcb *tp, struct tcphdr *th); +void tcp_newreno_after_idle(struct tcpcb *tp); +void tcp_newreno_after_timeout(struct tcpcb *tp); +int tcp_newreno_delay_ack(struct tcpcb *tp, struct tcphdr *th); +void tcp_newreno_switch_cc(struct tcpcb *tp, uint16_t old_index); + +struct tcp_cc_algo tcp_cc_newreno = { + .name = "newreno", + .init = tcp_newreno_init, + .cleanup = tcp_newreno_cleanup, + .cwnd_init = tcp_newreno_cwnd_init_or_reset, +
.inseq_ack_rcvd = tcp_newreno_inseq_ack_rcvd, + .ack_rcvd = tcp_newreno_ack_rcvd, + .pre_fr = tcp_newreno_pre_fr, + .post_fr = tcp_newreno_post_fr, + .after_idle = tcp_newreno_cwnd_init_or_reset, + .after_timeout = tcp_newreno_after_timeout, + .delay_ack = tcp_newreno_delay_ack, + .switch_to = tcp_newreno_switch_cc +}; + +extern int tcp_do_rfc3465; +extern int tcp_do_rfc3465_lim2; +extern int maxseg_unacked; + +int tcp_newreno_init(struct tcpcb *tp) { +#pragma unused(tp) + OSIncrementAtomic((volatile SInt32 *)&tcp_cc_newreno.num_sockets); + return 0; +} + +int tcp_newreno_cleanup(struct tcpcb *tp) { +#pragma unused(tp) + OSDecrementAtomic((volatile SInt32 *)&tcp_cc_newreno.num_sockets); + return 0; +} + +/* Initialize the congestion window for a connection or + * handle connections that have been idle for + * some time. In this state, no acks are + * expected to clock out any data we send -- + * slow start to get ack "clock" running again. + * + * Set the slow-start flight size depending on whether + * this is a local network or not. + */ +void +tcp_newreno_cwnd_init_or_reset(struct tcpcb *tp) { + if ( tp->t_flags & TF_LOCAL ) + tp->snd_cwnd = tp->t_maxseg * ss_fltsz_local; + else { + /* Calculate initial cwnd according to RFC3390, + * - On a standard link, this will result in a higher cwnd + * and improve initial transfer rate. + * - Keep the old ss_fltsz sysctl for ABI compatibility issues. + * but it will be overridden if tcp_do_rfc3390 sysctl is set. + */ + + if (tcp_do_rfc3390) + tp->snd_cwnd = min(4 * tp->t_maxseg, max(2 * tp->t_maxseg, 4380)); + + else + tp->snd_cwnd = tp->t_maxseg * ss_fltsz; + } +} + + +/* Function to handle an in-sequence ack during congestion avoidance phase. + * This will get called from header prediction code. + */ +void +tcp_newreno_inseq_ack_rcvd(struct tcpcb *tp, struct tcphdr *th) { + int acked = 0; + acked = th->th_ack - tp->snd_una; + /* + * Grow the congestion window, if the + * connection is cwnd bound.
+ */ + if (tp->snd_cwnd < tp->snd_wnd) { + tp->t_bytes_acked += acked; + if (tp->t_bytes_acked > tp->snd_cwnd) { + tp->t_bytes_acked -= tp->snd_cwnd; + tp->snd_cwnd += tp->t_maxseg; + } + } +} +/* Function to process an ack. + */ +void +tcp_newreno_ack_rcvd(struct tcpcb *tp, struct tcphdr *th) { + /* + * RFC 3465 - Appropriate Byte Counting. + * + * If the window is currently less than ssthresh, + * open the window by the number of bytes ACKed by + * the last ACK, however clamp the window increase + * to an upper limit "L". + * + * In congestion avoidance phase, open the window by + * one segment each time "bytes_acked" grows to be + * greater than or equal to the congestion window. + */ + + register u_int cw = tp->snd_cwnd; + register u_int incr = tp->t_maxseg; + int acked = 0; + + acked = th->th_ack - tp->snd_una; + if (tcp_do_rfc3465) { + + if (cw >= tp->snd_ssthresh) { + tp->t_bytes_acked += acked; + if (tp->t_bytes_acked >= cw) { + /* Time to increase the window. */ + tp->t_bytes_acked -= cw; + } else { + /* No need to increase yet. */ + incr = 0; + } + } else { + /* + * If the user explicitly enables RFC3465 + * use 2*SMSS for the "L" param. Otherwise + * use the more conservative 1*SMSS. + * + * (See RFC 3465 2.3 Choosing the Limit) + */ + u_int abc_lim; + + abc_lim = (tcp_do_rfc3465_lim2 && + tp->snd_nxt == tp->snd_max) ? incr * 2 : incr; + + incr = lmin(acked, abc_lim); + } + } else { + /* + * If the window gives us less than ssthresh packets + * in flight, open exponentially (segsz per packet). + * Otherwise open linearly: segsz per window + * (segsz^2 / cwnd per packet). 
+ */ + + if (cw >= tp->snd_ssthresh) + incr = max((incr * incr / cw), 1); + } + tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale); +} + +void +tcp_newreno_pre_fr(struct tcpcb *tp, struct tcphdr *th) { +#pragma unused(th) + + uint32_t win; + + win = min(tp->snd_wnd, tp->snd_cwnd) / + 2 / tp->t_maxseg; + if ( win < 2 ) + win = 2; + tp->snd_ssthresh = win * tp->t_maxseg; +} + +void +tcp_newreno_post_fr(struct tcpcb *tp, struct tcphdr *th) { + int32_t ss; + + ss = tp->snd_max - th->th_ack; + + /* + * Complete ack. Inflate the congestion window to + * ssthresh and exit fast recovery. + * + * Window inflation should have left us with approx. + * snd_ssthresh outstanding data. But in case we + * would be inclined to send a burst, better to do + * it via the slow start mechanism. + */ + if (ss < (int32_t)tp->snd_ssthresh) + tp->snd_cwnd = ss + tp->t_maxseg; + else + tp->snd_cwnd = tp->snd_ssthresh; + tp->t_bytes_acked = 0; +} + +/* Function to change the congestion window when the retransmit + * timer fires. + */ +void +tcp_newreno_after_timeout(struct tcpcb *tp) { + /* + * Close the congestion window down to one segment + * (we'll open it by one segment for each ack we get). + * Since we probably have a window's worth of unacked + * data accumulated, this "slow start" keeps us from + * dumping all that data as back-to-back packets (which + * might overwhelm an intermediate gateway). + * + * There are two phases to the opening: Initially we + * open by one mss on each ack. This makes the window + * size increase exponentially with time. If the + * window is larger than the path can handle, this + * exponential growth results in dropped packet(s) + * almost immediately. To get more time between + * drops but still "push" the network to take advantage + * of improving conditions, we switch from exponential + * to linear window opening at some threshhold size. + * For a threshhold, we use half the current window + * size, truncated to a multiple of the mss.
+ * + * (the minimum cwnd that will give us exponential + * growth is 2 mss. We don't allow the threshhold + * to go below this.) + */ + if (tp->t_state >= TCPS_ESTABLISHED) { + u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; + if (win < 2) + win = 2; + tp->snd_cwnd = tp->t_maxseg; + tp->snd_ssthresh = win * tp->t_maxseg; + tp->t_bytes_acked = 0; + tp->t_dupacks = 0; + } +} + +/* + * Indicate whether this ack should be delayed. + * We can delay the ack if: + * - delayed acks are enabled and set to 1, same as when value is set to 2. + * We kept this for binary compatibility. + * - delayed acks are enabled and set to 2, will "ack every other packet" + * - if our last ack wasn't a 0-sized window. + * - if the peer hasn't sent us a TH_PUSH data packet (this solves 3649245). + * If TH_PUSH is set, take this as a clue that we need to ACK + * with no delay. This helps higher level protocols who won't send + * us more data even if the window is open because their + * last "segment" hasn't been ACKed + * - delayed acks are enabled and set to 3, will do "streaming detection" + * (see the comment in tcp_input.c) and + * - if we receive more than "maxseg_unacked" full packets in the last 100ms + * - if the connection is not in slow-start or idle or loss/recovery states + * - if those criteria aren't met, it will ack every other packet. + */ + +int +tcp_newreno_delay_ack(struct tcpcb *tp, struct tcphdr *th) { + switch (tcp_delack_enabled) { + case 1: + case 2: + if ((tp->t_flags & TF_RXWIN0SENT) == 0 && + (th->th_flags & TH_PUSH) == 0 && + (tp->t_flags & TF_DELACK) == 0) + return(1); + break; + case 3: + if ((tp->t_flags & TF_RXWIN0SENT) == 0 && + (th->th_flags & TH_PUSH) == 0 && + ((tp->t_unacksegs == 0) || + ((tp->t_flags & TF_STRETCHACK) != 0 && + tp->t_unacksegs < (maxseg_unacked - 1)))) + return(1); + break; + } + return(0); +} + +/* Switch to newreno from a different CC. 
If the connection is in + * congestion avoidance state, it can continue to use the current + * congestion window because it is going to be conservative. But + * if the connection is in slow-start, we will halve the congestion + * window and let newreno work from there. + */ +void +tcp_newreno_switch_cc(struct tcpcb *tp, uint16_t old_index) { +#pragma unused(old_index) + + uint32_t cwnd = min(tp->snd_wnd, tp->snd_cwnd); + if (tp->snd_cwnd >= tp->snd_ssthresh) { + cwnd = cwnd / tp->t_maxseg; + } else { + cwnd = cwnd / 2 / tp->t_maxseg; + } + if (cwnd < 1) + cwnd = 1; + tp->snd_cwnd = cwnd * tp->t_maxseg; + + /* Start counting bytes for RFC 3465 again */ + tp->t_bytes_acked = 0; + + OSIncrementAtomic((volatile SInt32 *)&tcp_cc_newreno.num_sockets); +} diff --git a/bsd/netinet/tcp_output.c b/bsd/netinet/tcp_output.c index 69a2c2aed..5c310770d 100644 --- a/bsd/netinet/tcp_output.c +++ b/bsd/netinet/tcp_output.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -81,6 +81,7 @@ #include #include +#include #include #include @@ -89,6 +90,7 @@ #include #include #include +#include #if INET6 #include #include @@ -101,10 +103,12 @@ #include #include #include +#include #if TCPDEBUG #include #endif #include +#include #if IPSEC #include @@ -118,48 +122,55 @@ #define DBG_LAYER_END NETDBG_CODE(DBG_NETTCP, 3) #define DBG_FNC_TCP_OUTPUT NETDBG_CODE(DBG_NETTCP, (4 << 8) | 1) - #ifdef notyet extern struct mbuf *m_copypack(); #endif int path_mtu_discovery = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW | CTLFLAG_LOCKED, &path_mtu_discovery, 1, "Enable Path MTU Discovery"); int ss_fltsz = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW | CTLFLAG_LOCKED, &ss_fltsz, 1, "Slow start flight size"); int ss_fltsz_local = 8; /* starts with eight segments max */ -SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW | CTLFLAG_LOCKED, &ss_fltsz_local, 1, "Slow start flight size for local networks"); -int tcp_do_newreno = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, &tcp_do_newreno, - 0, "Enable NewReno Algorithms"); - int tcp_do_tso = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_tso, 0, "Enable TCP Segmentation Offload"); int tcp_ecn_outbound = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, ecn_initiate_out, CTLFLAG_RW, &tcp_ecn_outbound, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, ecn_initiate_out, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_ecn_outbound, 0, "Initiate ECN for outbound connections"); int tcp_ecn_inbound = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, ecn_negotiate_in, CTLFLAG_RW, &tcp_ecn_inbound, +SYSCTL_INT(_net_inet_tcp, 
OID_AUTO, ecn_negotiate_in, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_ecn_inbound, 0, "Allow ECN negotiation for inbound connections"); int tcp_packet_chaining = 50; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, packetchain, CTLFLAG_RW, &tcp_packet_chaining, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, packetchain, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_packet_chaining, 0, "Enable TCP output packet chaining"); int tcp_output_unlocked = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, socket_unlocked_on_output, CTLFLAG_RW, &tcp_output_unlocked, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, socket_unlocked_on_output, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_output_unlocked, 0, "Unlock TCP when sending packets down to IP"); +int tcp_do_rfc3390 = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW | CTLFLAG_LOCKED, + &tcp_do_rfc3390, 1, "Calculate intial slowstart cwnd depending on MSS"); + +int tcp_min_iaj_win = MIN_IAJ_WIN; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, min_iaj_win, CTLFLAG_RW | CTLFLAG_LOCKED, + &tcp_min_iaj_win, 1, "Minimum recv win based on inter-packet arrival jitter"); + +int tcp_acc_iaj_react_limit = ACC_IAJ_REACT_LIMIT; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, acc_iaj_react_limit, CTLFLAG_RW | CTLFLAG_LOCKED, + &tcp_acc_iaj_react_limit, 1, "Accumulated IAJ when receiver starts to react"); + static int32_t packchain_newlist = 0; static int32_t packchain_looped = 0; static int32_t packchain_sent = 0; @@ -181,10 +192,13 @@ extern int ip_use_randomid; #endif /* RANDOM_IP_ID */ extern u_int32_t dlil_filter_count; extern u_int32_t kipf_count; +extern int tcp_recv_bg; static int tcp_ip_output(struct socket *, struct tcpcb *, struct mbuf *, int, struct mbuf *, int, int, int32_t); +static inline int is_tcp_recv_bg(struct socket *so); + static __inline__ u_int16_t get_socket_id(struct socket * s) { @@ -200,6 +214,12 @@ get_socket_id(struct socket * s) return (val); } +static inline int +is_tcp_recv_bg(struct socket *so) +{ + return (so->so_traffic_mgt_flags & TRAFFIC_MGT_TCP_RECVBG); +} + /* * Tcp output routine: 
figure out what should be sent and send it. * @@ -242,10 +262,10 @@ tcp_output(struct tcpcb *tp) #ifdef IPSEC unsigned ipsec_optlen = 0; #endif - int maxburst = TCP_MAXBURST; int last_off = 0; int m_off; - struct mbuf *m_last = NULL; + int idle_time = 0; + struct mbuf *m_lastm = NULL; struct mbuf *m_head = NULL; struct mbuf *packetlist = NULL; struct mbuf *tp_inp_options = tp->t_inpcb->inp_depend4.inp4_options; @@ -265,28 +285,17 @@ tcp_output(struct tcpcb *tp) * to send, then transmit; otherwise, investigate further. */ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); - if (idle && tp->t_rcvtime >= tp->t_rxtcur) { - /* - * We have been idle for "a while" and no acks are - * expected to clock out any data we send -- - * slow start to get ack "clock" running again. - * - * Set the slow-start flight size depending on whether - * this is a local network or not. - */ - if ( -#if INET6 - (isipv6 && in6_localaddr(&tp->t_inpcb->in6p_faddr)) || - (!isipv6 && -#endif - in_localaddr(tp->t_inpcb->inp_faddr) -#if INET6 - ) -#endif - ) - tp->snd_cwnd = tp->t_maxseg * ss_fltsz_local; - else - tp->snd_cwnd = tp->t_maxseg * ss_fltsz; + + /* Since idle_time is signed integer, the following integer subtraction + * will take care of wrap around of tcp_now + */ + idle_time = tcp_now - tp->t_rcvtime; + if (idle && idle_time >= tp->t_rxtcur) { + if (CC_ALGO(tp)->after_idle != NULL) + CC_ALGO(tp)->after_idle(tp); + DTRACE_TCP5(cc, void, NULL, struct inpcb *, tp->t_inpcb, + struct tcpcb *, tp, struct tcphdr *, NULL, + int32_t, TCP_CC_IDLE_TIMEOUT); } tp->t_flags &= ~TF_LASTIDLE; if (idle) { @@ -344,14 +353,16 @@ tcp_output(struct tcpcb *tp) /* set Retransmit timer if it wasn't set * reset Persist timer and shift register as the - * adversed peer window may not be valid anymore + * advertised peer window may not be valid anymore */ if (!tp->t_timer[TCPT_REXMT]) { - tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); if 
(tp->t_timer[TCPT_PERSIST]) { tp->t_timer[TCPT_PERSIST] = 0; tp->t_rxtshift = 0; + tp->t_persist_stop = 0; + tp->rxt_start = 0; } } @@ -364,10 +375,12 @@ tcp_output(struct tcpcb *tp) tcp_drop(tp, EADDRNOTAVAIL); return(EADDRNOTAVAIL); } - else + else { + tcp_check_timer_state(tp); return(0); /* silently ignore, keep data in socket: address may be back */ + } } - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); /* * Address is still valid; check for multipages capability @@ -463,6 +476,12 @@ tcp_output(struct tcpcb *tp) tcpstat.tcps_sack_rexmits++; tcpstat.tcps_sack_rexmit_bytes += min(len, tp->t_maxseg); + if (nstat_collect) { + nstat_route_tx(tp->t_inpcb->inp_route.ro_rt, 1, min(len, tp->t_maxseg), NSTAT_TX_FLAG_RETRANSMIT); + locked_add_64(&tp->t_inpcb->inp_stat->txpackets, 1); + locked_add_64(&tp->t_inpcb->inp_stat->txbytes, min(len, tp->t_maxseg)); + tp->t_stat.txretransmitbytes += min(len, tp->t_maxseg); + } } else len = 0; @@ -507,6 +526,8 @@ tcp_output(struct tcpcb *tp) } else { tp->t_timer[TCPT_PERSIST] = 0; tp->t_rxtshift = 0; + tp->rxt_start = 0; + tp->t_persist_stop = 0; } } @@ -587,6 +608,8 @@ tcp_output(struct tcpcb *tp) (TF_CLOSING|TF_SENDINPROG)) == TF_CLOSING) { tp->t_flags &= ~TF_CLOSING; (void) tcp_close(tp); + } else { + tcp_check_timer_state(tp); } KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); @@ -604,7 +627,12 @@ tcp_output(struct tcpcb *tp) flags &= ~TH_FIN; } - if (len < 0) { + /* The check here used to be (len < 0). Some times len is zero when + * the congestion window is closed and we need to check if persist timer + * has to be set in that case. But don't set persist until connection + * is established. 
+ */ + if (len <= 0 && !(flags & TH_SYN)) { /* * If FIN has been sent but not acked, * but we haven't been called to retransmit, @@ -619,6 +647,7 @@ tcp_output(struct tcpcb *tp) if (sendwin == 0) { tp->t_timer[TCPT_REXMT] = 0; tp->t_rxtshift = 0; + tp->rxt_start = 0; tp->snd_nxt = tp->snd_una; if (tp->t_timer[TCPT_PERSIST] == 0) tcp_setpersist(tp); @@ -782,7 +811,7 @@ tcp_output(struct tcpcb *tp) if (tp->sack_enable && (tp->t_state >= TCPS_ESTABLISHED) && SEQ_GT(tp->snd_max, tp->snd_una) && tp->t_timer[TCPT_REXMT] == 0 && tp->t_timer[TCPT_PERSIST] == 0) { - tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); goto just_return; } /* @@ -810,6 +839,7 @@ tcp_output(struct tcpcb *tp) if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 && tp->t_timer[TCPT_PERSIST] == 0) { tp->t_rxtshift = 0; + tp->rxt_start = 0; tcp_setpersist(tp); } just_return: @@ -833,6 +863,8 @@ tcp_output(struct tcpcb *tp) if ((tp->t_flags & (TF_CLOSING|TF_SENDINPROG)) == TF_CLOSING) { tp->t_flags &= ~TF_CLOSING; (void) tcp_close(tp); + } else { + tcp_check_timer_state(tp); } KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); return (0); @@ -1136,9 +1168,19 @@ tcp_output(struct tcpcb *tp) else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { tcpstat.tcps_sndrexmitpack++; tcpstat.tcps_sndrexmitbyte += len; + if (nstat_collect) { + nstat_route_tx(tp->t_inpcb->inp_route.ro_rt, 1, len, NSTAT_TX_FLAG_RETRANSMIT); + locked_add_64(&tp->t_inpcb->inp_stat->txpackets, 1); + locked_add_64(&tp->t_inpcb->inp_stat->txbytes, len); + tp->t_stat.txretransmitbytes += len; + } } else { tcpstat.tcps_sndpack++; tcpstat.tcps_sndbyte += len; + if (nstat_collect) { + locked_add_64(&tp->t_inpcb->inp_stat->txpackets, 1); + locked_add_64(&tp->t_inpcb->inp_stat->txbytes, len); + } } #ifdef notyet if ((m = m_copypack(so->so_snd.sb_mb, off, @@ -1221,7 +1263,7 @@ tcp_output(struct tcpcb *tp) * setting the mbuf pointer to NULL is sufficient to disable the hint 
mechanism. */ if (m_head != so->so_snd.sb_mb || sack_rxmit || last_off != off) - m_last = NULL; + m_lastm = NULL; last_off = off + len; m_head = so->so_snd.sb_mb; @@ -1235,7 +1277,7 @@ tcp_output(struct tcpcb *tp) * m_copym_with_hdrs will always return the last mbuf pointer and the offset into it that * it acted on to fullfill the current request, whether a valid 'hint' was passed in or not */ - if ((m = m_copym_with_hdrs(so->so_snd.sb_mb, off, len, M_DONTWAIT, &m_last, &m_off)) == NULL) { + if ((m = m_copym_with_hdrs(so->so_snd.sb_mb, off, len, M_DONTWAIT, &m_lastm, &m_off)) == NULL) { error = ENOBUFS; goto out; } @@ -1285,6 +1327,10 @@ tcp_output(struct tcpcb *tp) ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); tcp_fillheaders(tp, ip6, th); + if ((tp->ecn_flags & TE_SENDIPECT) != 0 && len && + !SEQ_LT(tp->snd_nxt, tp->snd_max)) { + ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); + } } else #endif /* INET6 */ { @@ -1349,13 +1395,25 @@ tcp_output(struct tcpcb *tp) if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0) { if (recwin > (int32_t)slowlink_wsize) recwin = slowlink_wsize; - th->th_win = htons((u_short) (recwin>>tp->rcv_scale)); } - else { - if (recwin > (int32_t)(TCP_MAXWIN << tp->rcv_scale)) - recwin = (int32_t)(TCP_MAXWIN << tp->rcv_scale); - th->th_win = htons((u_short) (recwin>>tp->rcv_scale)); + +#if TRAFFIC_MGT + if (tcp_recv_bg == 1 || is_tcp_recv_bg(so)) { + if (tp->acc_iaj > tcp_acc_iaj_react_limit) { + uint32_t min_iaj_win = tcp_min_iaj_win * tp->t_maxseg; + if (tp->iaj_rwintop == 0 || + SEQ_LT(tp->iaj_rwintop, tp->rcv_adv)) + tp->iaj_rwintop = tp->rcv_adv; + if (SEQ_LT(tp->iaj_rwintop, tp->rcv_nxt + min_iaj_win)) + tp->iaj_rwintop = tp->rcv_nxt + min_iaj_win; + recwin = min(tp->iaj_rwintop - tp->rcv_nxt, recwin); + } } +#endif /* TRAFFIC_MGT */ + + if (recwin > (int32_t)(TCP_MAXWIN << tp->rcv_scale)) + recwin = (int32_t)(TCP_MAXWIN << tp->rcv_scale); + th->th_win = htons((u_short) (recwin>>tp->rcv_scale)); /* * Adjust the 
RXWIN0SENT flag - indicate that we have advertised @@ -1365,7 +1423,7 @@ tcp_output(struct tcpcb *tp) * to read more data then can be buffered prior to transmitting on * the connection. */ - if (recwin == 0) + if (th->th_win == 0) tp->t_flags |= TF_RXWIN0SENT; else tp->t_flags &= ~TF_RXWIN0SENT; @@ -1387,13 +1445,17 @@ tcp_output(struct tcpcb *tp) */ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ #if INET6 - if (isipv6) + if (isipv6) { /* * ip6_plen is not need to be filled now, and will be filled * in ip6_output. */ - th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), - sizeof(struct tcphdr) + optlen + len); + m->m_pkthdr.csum_flags = CSUM_TCPIPV6; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + if (len + optlen) + th->th_sum = in_addword(th->th_sum, + htons((u_short)(optlen + len))); + } else #endif /* INET6 */ { @@ -1407,7 +1469,6 @@ tcp_output(struct tcpcb *tp) /* * Enable TSO and specify the size of the segments. * The TCP pseudo header checksum is always provided. - * XXX: Fixme: This is currently not the case for IPv6. */ if (tso) { #if INET6 @@ -1450,7 +1511,7 @@ tcp_output(struct tcpcb *tp) * not currently timing anything. 
*/ if (tp->t_rtttime == 0) { - tp->t_rtttime = 1; + tp->t_rtttime = tcp_now; tp->t_rtseq = startseq; tcpstat.tcps_segstimed++; } @@ -1471,8 +1532,10 @@ tcp_output(struct tcpcb *tp) if (tp->t_timer[TCPT_PERSIST]) { tp->t_timer[TCPT_PERSIST] = 0; tp->t_rxtshift = 0; + tp->rxt_start = 0; + tp->t_persist_stop = 0; } - tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); } } else { /* @@ -1510,6 +1573,15 @@ tcp_output(struct tcpcb *tp) */ #if INET6 if (isipv6) { + struct rtentry *rt6; + struct ip6_out_args ip6oa = { IFSCOPE_NONE, 0 }; + unsigned int outif; + + KERNEL_DEBUG(DBG_LAYER_BEG, + ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport), + (((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) | + (tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)), + 0,0,0); /* * we separately set hoplimit for every segment, since the * user might want to change the value via setsockopt. @@ -1531,46 +1603,40 @@ tcp_output(struct tcpcb *tp) #endif /*IPSEC*/ m->m_pkthdr.socket_id = socket_id; -#if PKT_PRIORITY - set_traffic_class(m, so, MBUF_TC_NONE); -#endif /* PKT_PRIORITY */ - error = ip6_output(m, - inp6_pktopts, - &tp->t_inpcb->in6p_route, - (so_options & SO_DONTROUTE), NULL, NULL, 0); + rt6 = tp->t_inpcb->in6p_route.ro_rt; + if (rt6 != NULL && rt6->rt_ifp != NULL + && rt6->rt_ifp != lo_ifp) + set_packet_tclass(m, so, MBUF_TC_UNSPEC, 1); + + DTRACE_TCP5(send, struct mbuf *, m, struct inpcb *, tp->t_inpcb, struct ip6_hdr *, ip6, + struct tcpcb *, tp, struct tcphdr *, th); + + if (tp->t_inpcb->inp_flags & INP_BOUND_IF) + ip6oa.ip6oa_boundif = tp->t_inpcb->inp_boundif; + + ip6oa.ip6oa_nocell = (tp->t_inpcb->inp_flags & INP_NO_IFT_CELLULAR) ? 
1 : 0; + + error = ip6_output(m, inp6_pktopts, &tp->t_inpcb->in6p_route, + (so_options & SO_DONTROUTE) | IPV6_OUTARGS, NULL, NULL, + &ip6oa); + + /* Refresh rt6 as we may have lost the route while in ip6_output() */ + if ((rt6 = tp->t_inpcb->in6p_route.ro_rt) != NULL && + (outif = rt6->rt_ifp->if_index) != tp->t_inpcb->in6p_last_outif) + tp->t_inpcb->in6p_last_outif = outif; } else #endif /* INET6 */ { ip->ip_len = m->m_pkthdr.len; -#if INET6 - if (isipv6) - ip->ip_ttl = in6_selecthlim(tp->t_inpcb, - tp->t_inpcb->in6p_route.ro_rt ? - tp->t_inpcb->in6p_route.ro_rt->rt_ifp - : NULL); - else -#endif /* INET6 */ ip->ip_ttl = tp->t_inpcb->inp_ip_ttl; /* XXX */ ip->ip_tos |= (tp->t_inpcb->inp_ip_tos & ~IPTOS_ECN_MASK); /* XXX */ -#if INET6 - if (isipv6) { - KERNEL_DEBUG(DBG_LAYER_BEG, - ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport), - (((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) | - (tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)), - 0,0,0); - } - else -#endif - { - KERNEL_DEBUG(DBG_LAYER_BEG, - ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport), - (((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) | - (tp->t_inpcb->inp_faddr.s_addr & 0xffff)), - 0,0,0); - } + KERNEL_DEBUG(DBG_LAYER_BEG, + ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport), + (((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) | + (tp->t_inpcb->inp_faddr.s_addr & 0xffff)), + 0,0,0); /* * See if we should do MTU discovery. 
@@ -1596,11 +1662,18 @@ tcp_output(struct tcpcb *tp) lost = 0; m->m_pkthdr.socket_id = socket_id; m->m_nextpkt = NULL; -#if PKT_PRIORITY - set_traffic_class(m, so, MBUF_TC_NONE); -#endif /* PKT_PRIORITY */ + + if (tp->t_inpcb->inp_route.ro_rt != NULL && + tp->t_inpcb->inp_route.ro_rt->rt_ifp != NULL && + tp->t_inpcb->inp_route.ro_rt->rt_ifp != lo_ifp) + set_packet_tclass(m, so, MBUF_TC_UNSPEC, 0); + tp->t_pktlist_sentlen += len; tp->t_lastchain++; + + DTRACE_TCP5(send, struct mbuf *, m, struct inpcb *, tp->t_inpcb, + struct ip *, ip, struct tcpcb *, tp, struct tcphdr *, th); + if (tp->t_pktlist_head != NULL) { tp->t_pktlist_tail->m_nextpkt = m; tp->t_pktlist_tail = m; @@ -1685,12 +1758,17 @@ tcp_output(struct tcpcb *tp) if (error == ENOBUFS) { if (!tp->t_timer[TCPT_REXMT] && !tp->t_timer[TCPT_PERSIST]) - tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); tp->snd_cwnd = tp->t_maxseg; tp->t_bytes_acked = 0; + tcp_check_timer_state(tp); KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); + + DTRACE_TCP5(cc, void, NULL, struct inpcb *, tp->t_inpcb, + struct tcpcb *, tp, struct tcphdr *, NULL, + int32_t, TCP_CC_OUTPUT_ERROR); return (0); } if (error == EMSGSIZE) { @@ -1710,21 +1788,26 @@ tcp_output(struct tcpcb *tp) tp->t_flags &= ~TF_TSO; tcp_mtudisc(tp->t_inpcb, 0); + tcp_check_timer_state(tp); + KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); return 0; } if ((error == EHOSTUNREACH || error == ENETDOWN) && TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_softerror = error; + tcp_check_timer_state(tp); KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); return (0); } + tcp_check_timer_state(tp); KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); return (error); } tcpstat.tcps_sndtotal++; +#if INET6 /* * Data sent (as far as we can tell). * If this advertises a larger window than any other segment, @@ -1733,18 +1816,21 @@ tcp_output(struct tcpcb *tp) * we unlock the socket. 
* NOTE: for now, this is done in tcp_ip_output for IPv4 */ -#if INET6 if (isipv6) { if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) tp->rcv_adv = tp->rcv_nxt + recwin; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); + tp->t_timer[TCPT_DELACK] = 0; + tp->t_unacksegs = 0; } #endif KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END,0,0,0,0,0); - if (sendalot && (!tcp_do_newreno || --maxburst)) + if (sendalot) goto again; + + tcp_check_timer_state(tp); return (0); } @@ -1758,13 +1844,12 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, struct inpcb *inp = tp->t_inpcb; struct ip_out_args ipoa; struct route ro; -#if CONFIG_OUT_IF unsigned int outif; -#endif /* CONFIG_OUT_IF */ /* If socket was bound to an ifindex, tell ip_output about it */ - ipoa.ipoa_ifscope = (inp->inp_flags & INP_BOUND_IF) ? + ipoa.ipoa_boundif = (inp->inp_flags & INP_BOUND_IF) ? inp->inp_boundif : IFSCOPE_NONE; + ipoa.ipoa_nocell = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; flags |= IP_OUTARGS; /* Copy the cached route and take an extra reference */ @@ -1781,14 +1866,21 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, tp->rcv_adv = tp->rcv_nxt + recwin; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); + tp->t_timer[TCPT_DELACK] = 0; + tp->t_unacksegs = 0; /* * If allowed, unlock TCP socket while in IP * but only if the connection is established and - * if we're not sending from an upcall. + * in a normal mode where reentrancy on the tcpcb won't be + * an issue: + * - there is no SACK episode + * - we're not in Fast Recovery mode + * - if we're not sending from an upcall. 
*/ if (tcp_output_unlocked && ((so->so_flags & SOF_UPCALLINUSE) == 0) && - (tp->t_state == TCPS_ESTABLISHED) && (sack_in_progress == 0)) { + (tp->t_state == TCPS_ESTABLISHED) && (sack_in_progress == 0) && + ((tp->t_flags & TF_FASTRECOVERY) == 0)) { unlocked = TRUE; socket_unlock(so, 0); } @@ -1828,7 +1920,6 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, */ cnt = 0; } - error = ip_output_list(pkt, cnt, opt, &ro, flags, 0, &ipoa); if (chain || error) { /* @@ -1846,6 +1937,10 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, if (unlocked) socket_lock(so, 0); + if (ro.ro_rt != NULL && + (outif = ro.ro_rt->rt_ifp->if_index) != inp->inp_last_outif) + inp->inp_last_outif = outif; + /* Synchronize cached PCB route */ inp_route_copyin(inp, &ro); @@ -1858,14 +1953,27 @@ tcp_setpersist(tp) { int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; - if (tp->t_timer[TCPT_REXMT]) - panic("tcp_setpersist: retransmit pending"); + /* If a PERSIST_TIMER option was set we will limit the + * time the persist timer will be active for that connection + * in order to avoid DOS by using zero window probes. + * see rdar://5805356 + */ + + if ((tp->t_persist_timeout != 0) && + (tp->t_timer[TCPT_PERSIST] == 0) && + (tp->t_persist_stop == 0)) { + tp->t_persist_stop = tcp_now + tp->t_persist_timeout; + } + /* * Start/restart persistance timer. 
*/ TCPT_RANGESET(tp->t_timer[TCPT_PERSIST], t * tcp_backoff[tp->t_rxtshift], - TCPTV_PERSMIN, TCPTV_PERSMAX); + TCPTV_PERSMIN, TCPTV_PERSMAX, + TCP_ADD_REXMTSLOP(tp)); + tp->t_timer[TCPT_PERSIST] = OFFSET_FROM_START(tp, tp->t_timer[TCPT_PERSIST]); + if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; } diff --git a/bsd/netinet/tcp_sack.c b/bsd/netinet/tcp_sack.c index 5842ad2b8..69fb8a7d0 100644 --- a/bsd/netinet/tcp_sack.c +++ b/bsd/netinet/tcp_sack.c @@ -103,20 +103,20 @@ #endif /*IPSEC*/ int tcp_do_sack = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack, CTLFLAG_RW, &tcp_do_sack, 0, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_sack, 0, "Enable/Disable TCP SACK support"); static int tcp_sack_maxholes = 128; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack_maxholes, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack_maxholes, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_sack_maxholes, 0, "Maximum number of TCP SACK holes allowed per connection"); static int tcp_sack_globalmaxholes = 65536; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack_globalmaxholes, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack_globalmaxholes, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_sack_globalmaxholes, 0, "Global maximum number of TCP SACK holes"); static int tcp_sack_globalholes = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack_globalholes, CTLFLAG_RD, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack_globalholes, CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_sack_globalholes, 0, "Global number of TCP SACK holes currently allocated"); @@ -203,6 +203,18 @@ tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end) /* Save the number of SACK blocks. */ tp->rcv_numsacks = num_head + num_saved; + + /* If we are requesting SACK recovery, reset the stretch-ack state + * so that connection will generate more acks after recovery and + * sender's cwnd will open. 
+ */ + if ((tp->t_flags & TF_STRETCHACK) != 0 && tp->rcv_numsacks > 0) + tcp_reset_stretch_ack(tp); + +#if TRAFFIC_MGT + if (tp->acc_iaj > 0 && tp->rcv_numsacks > 0) + reset_acc_iaj(tp); +#endif /* TRAFFIC_MGT */ } /* diff --git a/bsd/netinet/tcp_seq.h b/bsd/netinet/tcp_seq.h index 89a16ef79..df7bfa4e9 100644 --- a/bsd/netinet/tcp_seq.h +++ b/bsd/netinet/tcp_seq.h @@ -79,6 +79,8 @@ /* for modulo comparisons of timestamps */ #define TSTMP_LT(a,b) ((int)((a)-(b)) < 0) +#define TSTMP_GT(a,b) ((int)((a)-(b)) > 0) +#define TSTMP_LEQ(a,b) ((int)((a)-(b)) <= 0) #define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0) /* diff --git a/bsd/netinet/tcp_subr.c b/bsd/netinet/tcp_subr.c index f1b220bc2..8cf658482 100644 --- a/bsd/netinet/tcp_subr.c +++ b/bsd/netinet/tcp_subr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -82,6 +82,7 @@ #include #include #include +#include #include #include @@ -112,6 +113,9 @@ #include #include #include +#include +#include + #if INET6 #include #endif @@ -136,6 +140,7 @@ #include #include +#include #define DBG_FNC_TCP_CLOSE NETDBG_CODE(DBG_NETTCP, ((5 << 8) | 2)) @@ -147,13 +152,13 @@ extern int ipsec_bypass; #endif int tcp_mssdflt = TCP_MSS; -SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_mssdflt , 0, "Default TCP Maximum Segment Size"); #if INET6 int tcp_v6mssdflt = TCP6_MSS; SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, - CTLFLAG_RW, &tcp_v6mssdflt , 0, + CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_v6mssdflt , 0, "Default TCP Maximum Segment Size for IPv6"); #endif @@ -166,7 +171,7 @@ SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, * checking. This setting prevents us from sending too small packets. 
*/ int tcp_minmss = TCP_MINMSS; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_minmss , 0, "Minmum TCP Maximum Segment Size"); /* @@ -182,51 +187,70 @@ __private_extern__ int tcp_minmssoverload = TCP_MINMSSOVERLOAD; #else __private_extern__ int tcp_minmssoverload = 0; #endif -SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmssoverload, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmssoverload, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_minmssoverload , 0, "Number of TCP Segments per Second allowed to" "be under the MINMSS Size"); static int tcp_do_rfc1323 = 1; -SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions"); +// Not used static int tcp_do_rfc1644 = 0; -SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions"); static int do_tcpdrain = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW | CTLFLAG_LOCKED, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); -SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED, &tcbinfo.ipi_count, 0, "Number of active PCBs"); static int icmp_may_rst = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW | CTLFLAG_LOCKED, &icmp_may_rst, 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); static int tcp_strict_rfc1948 = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, strict_rfc1948, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, 
strict_rfc1948, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_strict_rfc1948, 0, "Determines if RFC1948 is followed exactly"); static int tcp_isn_reseed_interval = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); static int tcp_background_io_enabled = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_io_enabled, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_io_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_background_io_enabled, 0, "Background IO Enabled"); -int tcp_TCPTV_MIN = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, rtt_min, CTLFLAG_RW, +int tcp_TCPTV_MIN = 100; /* 100ms minimum RTT */ +SYSCTL_INT(_net_inet_tcp, OID_AUTO, rtt_min, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_TCPTV_MIN, 0, "min rtt value allowed"); +int tcp_rexmt_slop = TCPTV_REXMTSLOP; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmt_slop, CTLFLAG_RW, + &tcp_rexmt_slop, 0, "Slop added to retransmit timeout"); + __private_extern__ int tcp_use_randomport = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, randomize_ports, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, randomize_ports, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_use_randomport, 0, "Randomize TCP port numbers"); +extern struct tcp_cc_algo tcp_cc_newreno; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno_sockets, CTLFLAG_RD | CTLFLAG_LOCKED, + &tcp_cc_newreno.num_sockets, 0, "Number of sockets using newreno"); + +extern struct tcp_cc_algo tcp_cc_ledbat; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_sockets, CTLFLAG_RD | CTLFLAG_LOCKED, + &tcp_cc_ledbat.num_sockets, 0, "Number of sockets using background transport"); + static void tcp_cleartaocache(void); static void tcp_notify(struct inpcb *, int); +static void tcp_cc_init(void); + struct zone *sack_hole_zone; +struct zone *tcp_reass_zone; + +/* The array containing pointers to currently implemented TCP CC algorithms */ +struct tcp_cc_algo* 
tcp_cc_algo_list[TCP_CC_ALGO_COUNT]; extern unsigned int total_mb_cnt; extern unsigned int total_cl_cnt; @@ -247,7 +271,7 @@ extern int path_mtu_discovery; #endif __private_extern__ int tcp_tcbhashsize = TCBHASHSIZE; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); /* @@ -259,25 +283,25 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD, * to be changed, eventually, for greater efficiency). */ #define ALIGNMENT 32 -#define ALIGNM1 (ALIGNMENT - 1) struct inp_tp { - union { - struct inpcb inp; - char align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1]; - } inp_tp_u; - struct tcpcb tcb; + struct inpcb inp; + struct tcpcb tcb __attribute__((aligned(ALIGNMENT))); }; #undef ALIGNMENT -#undef ALIGNM1 extern struct inpcbhead time_wait_slots[]; -extern u_int32_t *delack_bitmask; +extern struct tcptimerlist tcp_timer_list; int get_inpcb_str_size(void); int get_tcp_str_size(void); static void tcpcb_to_otcpcb(struct tcpcb *, struct otcpcb *); +static lck_attr_t *tcp_uptime_mtx_attr = NULL; /* mutex attributes */ +static lck_grp_t *tcp_uptime_mtx_grp = NULL; /* mutex group definition */ +static lck_grp_attr_t *tcp_uptime_mtx_grp_attr = NULL; /* mutex group attributes */ + + int get_inpcb_str_size(void) { return sizeof(struct inpcb); @@ -291,6 +315,17 @@ int get_tcp_str_size(void) int tcp_freeq(struct tcpcb *tp); +/* + * Initialize TCP congestion control algorithms. 
+ */ + +void +tcp_cc_init(void) +{ + bzero(&tcp_cc_algo_list, sizeof(tcp_cc_algo_list)); + tcp_cc_algo_list[TCP_CC_ALGO_NEWRENO_INDEX] = &tcp_cc_newreno; + tcp_cc_algo_list[TCP_CC_ALGO_BACKGROUND_INDEX] = &tcp_cc_ledbat; +} /* * Tcp initialization @@ -310,9 +345,10 @@ tcp_init() tcp_keepintvl = TCPTV_KEEPINTVL; tcp_maxpersistidle = TCPTV_KEEP_IDLE; tcp_msl = TCPTV_MSL; - read_random(&tcp_now, sizeof(tcp_now)); - tcp_now = tcp_now & 0x3fffffff; /* Starts tcp internal 100ms clock at a random value */ + microuptime(&tcp_uptime); + read_random(&tcp_now, sizeof(tcp_now)); + tcp_now = tcp_now & 0x3fffffff; /* Starts tcp internal clock at a random value */ LIST_INIT(&tcb); tcbinfo.listhead = &tcb; @@ -325,10 +361,26 @@ tcp_init() tcbinfo.hashbase = hashinit(tcp_tcbhashsize, M_PCB, &tcbinfo.hashmask); tcbinfo.porthashbase = hashinit(tcp_tcbhashsize, M_PCB, &tcbinfo.porthashmask); - str_size = (vm_size_t) sizeof(struct inp_tp); + str_size = P2ROUNDUP(sizeof(struct inp_tp), sizeof(u_int64_t)); tcbinfo.ipi_zone = (void *) zinit(str_size, 120000*str_size, 8192, "tcpcb"); + zone_change(tcbinfo.ipi_zone, Z_CALLERACCT, FALSE); + zone_change(tcbinfo.ipi_zone, Z_EXPAND, TRUE); + + str_size = P2ROUNDUP(sizeof(struct sackhole), sizeof(u_int64_t)); sack_hole_zone = zinit(str_size, 120000*str_size, 8192, "sack_hole zone"); + zone_change(sack_hole_zone, Z_CALLERACCT, FALSE); + zone_change(sack_hole_zone, Z_EXPAND, TRUE); + tcp_reass_maxseg = nmbclusters / 16; + str_size = P2ROUNDUP(sizeof(struct tseg_qent), sizeof(u_int64_t)); + tcp_reass_zone = zinit(str_size, (tcp_reass_maxseg + 1) * str_size, + 0, "tcp_reass_zone"); + if (tcp_reass_zone == NULL) { + panic("%s: failed allocating tcp_reass_zone", __func__); + /* NOTREACHED */ + } + zone_change(tcp_reass_zone, Z_CALLERACCT, FALSE); + zone_change(tcp_reass_zone, Z_EXPAND, TRUE); #if INET6 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) @@ -341,10 +393,10 @@ tcp_init() panic("tcp_init"); #undef TCP_MINPROTOHDR - 
/* + /* * allocate lock group attribute and group for tcp pcb mutexes */ - pcbinfo->mtx_grp_attr = lck_grp_attr_alloc_init(); + pcbinfo->mtx_grp_attr = lck_grp_attr_alloc_init(); pcbinfo->mtx_grp = lck_grp_alloc_init("tcppcb", pcbinfo->mtx_grp_attr); /* @@ -357,18 +409,37 @@ tcp_init() return; /* pretty much dead if this fails... */ } - delack_bitmask = _MALLOC((4 * tcp_tcbhashsize)/32, M_PCB, M_WAITOK); - if (delack_bitmask == 0) - panic("Delack Memory"); - - for (i=0; i < (tcbinfo.hashsize / 32); i++) - delack_bitmask[i] = 0; - for (i=0; i < N_TIME_WAIT_SLOTS; i++) { LIST_INIT(&time_wait_slots[i]); } - timeout(tcp_fasttimo, NULL, hz/TCP_RETRANSHZ); + bzero(&tcp_timer_list, sizeof(tcp_timer_list)); + LIST_INIT(&tcp_timer_list.lhead); + /* + * allocate lock group attribute, group and attribute for the tcp timer list + */ + tcp_timer_list.mtx_grp_attr = lck_grp_attr_alloc_init(); + tcp_timer_list.mtx_grp = lck_grp_alloc_init("tcptimerlist", tcp_timer_list.mtx_grp_attr); + tcp_timer_list.mtx_attr = lck_attr_alloc_init(); + if ((tcp_timer_list.mtx = lck_mtx_alloc_init(tcp_timer_list.mtx_grp, tcp_timer_list.mtx_attr)) == NULL) { + panic("failed to allocate memory for tcp_timer_list.mtx\n"); + }; + tcp_timer_list.fast_quantum = TCP_FASTTIMER_QUANTUM; + tcp_timer_list.slow_quantum = TCP_SLOWTIMER_QUANTUM; + if ((tcp_timer_list.call = thread_call_allocate(tcp_run_timerlist, NULL)) == NULL) { + panic("failed to allocate call entry 1 in tcp_init\n"); + } + + /* + * allocate lock group attribute, group and attribute for tcp_uptime_lock + */ + tcp_uptime_mtx_grp_attr = lck_grp_attr_alloc_init(); + tcp_uptime_mtx_grp = lck_grp_alloc_init("tcpuptime", tcp_uptime_mtx_grp_attr); + tcp_uptime_mtx_attr = lck_attr_alloc_init(); + tcp_uptime_lock = lck_spin_alloc_init(tcp_uptime_mtx_grp, tcp_uptime_mtx_attr); + + /* Initialize TCP congestion control algorithms list */ + tcp_cc_init(); } /* @@ -398,7 +469,9 @@ tcp_fillheaders(tp, ip_ptr, tcp_ptr) ip6->ip6_plen = sizeof(struct tcphdr); 
ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; - tcp_hdr->th_sum = 0; + tcp_hdr->th_sum = in6_cksum_phdr(&inp->in6p_laddr, + &inp->in6p_faddr, htonl(sizeof(struct tcphdr)), + htonl(IPPROTO_TCP)); } else #endif { @@ -474,7 +547,8 @@ tcp_respond( tcp_seq ack, tcp_seq seq, int flags, - unsigned int ifscope + unsigned int ifscope, + unsigned int nocell ) { register int tlen; @@ -489,6 +563,7 @@ tcp_respond( struct ip6_hdr *ip6; int isipv6; #endif /* INET6 */ + unsigned int outif; #if INET6 isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6; @@ -614,9 +689,11 @@ tcp_respond( #if INET6 if (isipv6) { nth->th_sum = 0; - nth->th_sum = in6_cksum(m, IPPROTO_TCP, - sizeof(struct ip6_hdr), - tlen - sizeof(struct ip6_hdr)); + nth->th_sum = in6_cksum_phdr(&ip6->ip6_src, + &ip6->ip6_dst, htons((u_short)(tlen - sizeof(struct ip6_hdr))), + htonl(IPPROTO_TCP)); + m->m_pkthdr.csum_flags = CSUM_TCPIPV6; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, ro6 && ro6->ro_rt ? 
ro6->ro_rt->rt_ifp : @@ -639,21 +716,29 @@ tcp_respond( return; } #endif -#if PKT_PRIORITY - if (tp != NULL) - set_traffic_class(m, tp->t_inpcb->inp_socket, MBUF_TC_NONE); -#endif /* PKT_PRIORITY */ + + if (tp != NULL) + set_packet_tclass(m, tp->t_inpcb->inp_socket, MBUF_TC_UNSPEC, isipv6); + #if INET6 if (isipv6) { - (void)ip6_output(m, NULL, ro6, 0, NULL, NULL, 0); - if (ro6 == &sro6 && ro6->ro_rt) { - rtfree(ro6->ro_rt); - ro6->ro_rt = NULL; + struct ip6_out_args ip6oa = { ifscope, nocell }; + + (void) ip6_output(m, NULL, ro6, IPV6_OUTARGS, NULL, + NULL, &ip6oa); + if (ro6->ro_rt != NULL) { + if (ro6 == &sro6) { + rtfree(ro6->ro_rt); + ro6->ro_rt = NULL; + } else if ((outif = ro6->ro_rt->rt_ifp->if_index) != + tp->t_inpcb->in6p_last_outif) { + tp->t_inpcb->in6p_last_outif = outif; + } } } else #endif /* INET6 */ { - struct ip_out_args ipoa = { ifscope }; + struct ip_out_args ipoa = { ifscope, nocell }; if (ro != &sro) { /* Copy the cached route and take an extra reference */ @@ -665,6 +750,10 @@ tcp_respond( (void) ip_output(m, NULL, &sro, IP_OUTARGS, NULL, &ipoa); if (ro != &sro) { + if (sro.ro_rt != NULL && + (outif = sro.ro_rt->rt_ifp->if_index) != + tp->t_inpcb->inp_last_outif) + tp->t_inpcb->inp_last_outif = outif; /* Synchronize cached PCB route */ inp_route_copyin(tp->t_inpcb, &sro); } else if (sro.ro_rt != NULL) { @@ -690,13 +779,15 @@ tcp_newtcpcb(inp) int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ + calculate_tcp_clock(); + if (so->cached_in_sock_layer == 0) { it = (struct inp_tp *)inp; tp = &it->tcb; } else tp = (struct tcpcb *) inp->inp_saved_ppcb; - + bzero((char *) tp, sizeof(struct tcpcb)); LIST_INIT(&tp->t_segq); tp->t_maxseg = tp->t_maxopd = @@ -719,12 +810,25 @@ tcp_newtcpcb(inp) tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; tp->t_rttmin = tcp_TCPTV_MIN; tp->t_rxtcur = TCPTV_RTOBASE; + + /* Initialize congestion control algorithm for this connection + * to newreno by default + */ + 
tp->tcp_cc_index = TCP_CC_ALGO_NEWRENO_INDEX; + if (CC_ALGO(tp)->init != NULL) { + CC_ALGO(tp)->init(tp); + } + tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh_prev = TCP_MAXWIN << TCP_MAX_WINSHIFT; - tp->t_rcvtime = 0; + tp->t_rcvtime = tcp_now; tp->t_bw_rtttime = 0; + tp->tentry.timer_start = tcp_now; + tp->t_persist_timeout = tcp_max_persist_timeout; + tp->t_persist_stop = 0; + tp->t_flagsext |= TF_RCVUNACK_WAITSS; /* * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, @@ -746,8 +850,13 @@ tcp_drop(tp, errno) int errno; { struct socket *so = tp->t_inpcb->inp_socket; - +#if CONFIG_DTRACE + struct inpcb *inp = tp->t_inpcb; +#endif /* CONFIG_DTRACE */ + if (TCPS_HAVERCVDSYN(tp->t_state)) { + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_CLOSED); tp->t_state = TCPS_CLOSED; (void) tcp_output(tp); tcpstat.tcps_drops++; @@ -778,26 +887,10 @@ tcp_close(tp) int dosavessthresh; if ( inp->inp_ppcb == NULL) /* tcp_close was called previously, bail */ - return NULL; - - /* Clear the timers before we delete the PCB. */ - { - int i; - for (i = 0; i < TCPT_NTIMERS; i++) { - tp->t_timer[i] = 0; - } - } + return(NULL); + tcp_canceltimers(tp); KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_START, tp,0,0,0,0); - switch (tp->t_state) - { - case TCPS_ESTABLISHED: - case TCPS_FIN_WAIT_1: - case TCPS_CLOSING: - case TCPS_CLOSE_WAIT: - case TCPS_LAST_ACK: - break; - } /* * If another thread for this tcp is currently in ip (indicated by @@ -816,6 +909,10 @@ tcp_close(tp) return (NULL); } + if (CC_ALGO(tp)->cleanup != NULL) { + CC_ALGO(tp)->cleanup(tp); + } + #if INET6 rt = isipv6 ? 
inp->in6p_route.ro_rt : inp->inp_route.ro_rt; #else @@ -853,8 +950,11 @@ tcp_close(tp) if (rt == NULL || !(rt->rt_flags & RTF_UP) || ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr == INADDR_ANY || rt->generation_id != route_generation) { - if (tp->t_state >= TCPS_CLOSE_WAIT) + if (tp->t_state >= TCPS_CLOSE_WAIT) { + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_CLOSING); tp->t_state = TCPS_CLOSING; + } goto no_valid_rt; } @@ -961,17 +1061,23 @@ tcp_close(tp) if (so->cached_in_sock_layer) inp->inp_saved_ppcb = (caddr_t) tp; #endif + /* Issue a wakeup before detach so that we don't miss + * a wakeup + */ + sodisconnectwakeup(so); - soisdisconnected(so); #if INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) in6_pcbdetach(inp); else #endif /* INET6 */ in_pcbdetach(inp); + + /* Call soisdisconnected after detach because it might unlock the socket */ + soisdisconnected(so); tcpstat.tcps_closed++; KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_END, tcpstat.tcps_closed,0,0,0,0); - return ((struct tcpcb *)0); + return(NULL); } int @@ -985,7 +1091,7 @@ tcp_freeq(tp) while((q = LIST_FIRST(&tp->t_segq)) != NULL) { LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); - FREE(q, M_TSEGQ); + zfree(tcp_reass_zone, q); tcp_reass_qsize--; rv = 1; } @@ -1019,7 +1125,7 @@ tcp_drain() != NULL) { LIST_REMOVE(te, tqe_q); m_freem(te->tqe_m); - FREE(te, M_TSEGQ); + zfree(tcp_reass_zone, te); tcp_reass_qsize--; } } @@ -1083,7 +1189,7 @@ tcpcb_to_otcpcb(struct tcpcb *tp, struct otcpcb *otp) otp->t_segq = (u_int32_t)(uintptr_t)tp->t_segq.lh_first; otp->t_dupacks = tp->t_dupacks; - for (i = 0; i < TCPT_NTIMERS; i++) + for (i = 0; i < TCPT_NTIMERS_EXT; i++) otp->t_timer[i] = tp->t_timer[i]; otp->t_inpcb = (_TCPCB_PTR(struct inpcb *))(uintptr_t)tp->t_inpcb; otp->t_state = tp->t_state; @@ -1258,7 +1364,7 @@ tcp_pcblist SYSCTL_HANDLER_ARGS return error; } -SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, +SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, 
pcblist, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); #if !CONFIG_EMBEDDED @@ -1270,7 +1376,7 @@ tcpcb_to_xtcpcb64(struct tcpcb *tp, struct xtcpcb64 *otp) otp->t_segq = (u_int32_t)(uintptr_t)tp->t_segq.lh_first; otp->t_dupacks = tp->t_dupacks; - for (i = 0; i < TCPT_NTIMERS; i++) + for (i = 0; i < TCPT_NTIMERS_EXT; i++) otp->t_timer[i] = tp->t_timer[i]; otp->t_state = tp->t_state; otp->t_flags = tp->t_flags; @@ -1406,44 +1512,60 @@ tcp_pcblist64 SYSCTL_HANDLER_ARGS for (i = 0; i < n; i++) { inp = inp_list[i]; if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) { - struct xtcpcb64 xt; - - bzero(&xt, sizeof(xt)); - xt.xt_len = sizeof xt; - inpcb_to_xinpcb64(inp, &xt.xt_inpcb); - xt.xt_inpcb.inp_ppcb = (u_int64_t)(uintptr_t)inp->inp_ppcb; - if (inp->inp_ppcb != NULL) - tcpcb_to_xtcpcb64((struct tcpcb *)inp->inp_ppcb, &xt); - if (inp->inp_socket) - sotoxsocket64(inp->inp_socket, &xt.xt_inpcb.xi_socket); - error = SYSCTL_OUT(req, &xt, sizeof xt); + struct xtcpcb64 xt; + + bzero(&xt, sizeof(xt)); + xt.xt_len = sizeof xt; + inpcb_to_xinpcb64(inp, &xt.xt_inpcb); + xt.xt_inpcb.inp_ppcb = (u_int64_t)(uintptr_t)inp->inp_ppcb; + if (inp->inp_ppcb != NULL) + tcpcb_to_xtcpcb64((struct tcpcb *)inp->inp_ppcb, &xt); + if (inp->inp_socket) + sotoxsocket64(inp->inp_socket, &xt.xt_inpcb.xi_socket); + error = SYSCTL_OUT(req, &xt, sizeof xt); } } if (!error) { - /* - * Give the user an updated idea of our state. - * If the generation differs from what we told - * her before, she knows that something happened - * while we were processing this request, and it - * might be necessary to retry. - */ - bzero(&xig, sizeof(xig)); - xig.xig_len = sizeof xig; - xig.xig_gen = tcbinfo.ipi_gencnt; - xig.xig_sogen = so_gencnt; - xig.xig_count = tcbinfo.ipi_count; - error = SYSCTL_OUT(req, &xig, sizeof xig); + /* + * Give the user an updated idea of our state. 
+ * If the generation differs from what we told + * her before, she knows that something happened + * while we were processing this request, and it + * might be necessary to retry. + */ + bzero(&xig, sizeof(xig)); + xig.xig_len = sizeof xig; + xig.xig_gen = tcbinfo.ipi_gencnt; + xig.xig_sogen = so_gencnt; + xig.xig_count = tcbinfo.ipi_count; + error = SYSCTL_OUT(req, &xig, sizeof xig); } FREE(inp_list, M_TEMP); lck_rw_done(tcbinfo.mtx); return error; } -SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist64, CTLFLAG_RD, 0, 0, +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist64, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, tcp_pcblist64, "S,xtcpcb64", "List of active TCP connections"); #endif /* !CONFIG_EMBEDDED */ +static int +tcp_pcblist_n SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + int error = 0; + + error = get_pcblist_n(IPPROTO_TCP, req, &tcbinfo); + + return error; +} + + +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist_n, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, + tcp_pcblist_n, "S,xtcpcb_n", "List of active TCP connections"); + + void tcp_ctlinput(cmd, sa, vip) int cmd; @@ -1618,10 +1740,10 @@ tcp6_ctlinput(cmd, sa, d) in6_pcbnotify(&tcbinfo, sa, th.th_dport, (struct sockaddr *)ip6cp->ip6c_src, - th.th_sport, cmd, notify); + th.th_sport, cmd, NULL, notify); } else { in6_pcbnotify(&tcbinfo, sa, 0, - (struct sockaddr *)(size_t)sa6_src, 0, cmd, notify); + (struct sockaddr *)(size_t)sa6_src, 0, cmd, NULL, notify); } } #endif /* INET6 */ @@ -1773,7 +1895,7 @@ tcp_mtudisc( if (tp) { #if INET6 if (isipv6) - rt = tcp_rtlookup6(inp); + rt = tcp_rtlookup6(inp, IFSCOPE_NONE); else #endif /* INET6 */ rt = tcp_rtlookup(inp, IFSCOPE_NONE); @@ -1837,6 +1959,11 @@ tcp_mtudisc( tp->t_maxseg = mss; + /* + * Reset the slow-start flight size as it may depend on the new MSS + */ + if (CC_ALGO(tp)->cwnd_init != NULL) + CC_ALGO(tp)->cwnd_init(tp); tcpstat.tcps_mturesent++; tp->t_rtttime = 0; tp->snd_nxt = tp->snd_una; @@ -1889,7 +2016,7 @@ tcp_rtlookup(inp, input_ifscope) if (rt != NULL)
RT_UNLOCK(rt); - rtalloc_scoped_ign(ro, 0, ifscope); + rtalloc_scoped(ro, ifscope); if ((rt = ro->ro_rt) != NULL) RT_LOCK(rt); } @@ -1934,8 +2061,9 @@ tcp_rtlookup(inp, input_ifscope) #if INET6 struct rtentry * -tcp_rtlookup6(inp) +tcp_rtlookup6(inp, input_ifscope) struct inpcb *inp; + unsigned int input_ifscope; { struct route_in6 *ro6; struct rtentry *rt; @@ -1952,14 +2080,26 @@ tcp_rtlookup6(inp) /* No route yet, so try to acquire one */ if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { struct sockaddr_in6 *dst6; + unsigned int ifscope; dst6 = (struct sockaddr_in6 *)&ro6->ro_dst; dst6->sin6_family = AF_INET6; dst6->sin6_len = sizeof(*dst6); dst6->sin6_addr = inp->in6p_faddr; + + /* + * If the socket was bound to an interface, then + * the bound-to-interface takes precedence over + * the inbound interface passed in by the caller + * (if we get here as part of the output path then + * input_ifscope is IFSCOPE_NONE). + */ + ifscope = (inp->inp_flags & INP_BOUND_IF) ? + inp->inp_boundif : input_ifscope; + if (rt != NULL) RT_UNLOCK(rt); - rtalloc_ign((struct route *)ro6, 0); + rtalloc_scoped((struct route *)ro6, ifscope); if ((rt = ro6->ro_rt) != NULL) RT_LOCK(rt); } @@ -2068,7 +2208,7 @@ tcp_gettaocache(inp) #if INET6 if ((inp->inp_vflag & INP_IPV6) != 0) - rt = tcp_rtlookup6(inp); + rt = tcp_rtlookup6(inp, IFSCOPE_NONE); else #endif /* INET6 */ rt = tcp_rtlookup(inp, IFSCOPE_NONE); @@ -2112,7 +2252,7 @@ tcp_lock(struct socket *so, int refcount, void *lr) lr_saved = lr; if (so->so_pcb != NULL) { - lck_mtx_lock(((struct inpcb *)so->so_pcb)->inpcb_mtx); + lck_mtx_lock(&((struct inpcb *)so->so_pcb)->inpcb_mtx); } else { panic("tcp_lock: so=%p NO PCB! 
lr=%p lrh= %s\n", so, lr_saved, solockhistory_nr(so)); @@ -2143,7 +2283,7 @@ tcp_unlock(struct socket *so, int refcount, void *lr) #ifdef MORE_TCPLOCK_DEBUG printf("tcp_unlock: so=%p sopcb=%p lock=%p ref=%x lr=%p\n", - so, so->so_pcb, ((struct inpcb *)so->so_pcb)->inpcb_mtx, + so, so->so_pcb, &((struct inpcb *)so->so_pcb)->inpcb_mtx, so->so_usecount, lr_saved); #endif if (refcount) @@ -2159,11 +2299,11 @@ tcp_unlock(struct socket *so, int refcount, void *lr) so, so->so_usecount, lr_saved, solockhistory_nr(so)); /* NOTREACHED */ } else { - lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, + lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); so->unlock_lr[so->next_unlock_lr] = lr_saved; so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX; - lck_mtx_unlock(((struct inpcb *)so->so_pcb)->inpcb_mtx); + lck_mtx_unlock(&((struct inpcb *)so->so_pcb)->inpcb_mtx); } return (0); } @@ -2179,7 +2319,7 @@ tcp_getlock( if (so->so_usecount < 0) panic("tcp_getlock: so=%p usecount=%x lrh= %s\n", so, so->so_usecount, solockhistory_nr(so)); - return(inp->inpcb_mtx); + return(&inp->inpcb_mtx); } else { panic("tcp_getlock: so=%p NULL so_pcb %s\n", @@ -2199,17 +2339,7 @@ tcp_sbspace(struct tcpcb *tp) if (space < 0) space = 0; -#if TRAFFIC_MGT - if (tp->t_inpcb->inp_socket->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BG_REGULATE) { - if (tcp_background_io_enabled && - tp->t_inpcb->inp_socket->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BG_SUPPRESSED) { - tp->t_flags |= TF_RXWIN0SENT; - return 0; /* Triggers TCP window closing by responding there is no space */ - } - } -#endif /* TRAFFIC_MGT */ - - /* Avoid inscreasing window size if the current window + /* Avoid increasing window size if the current window * is already very low, we could be in "persist" mode and * we could break some apps (see rdar://5409343) */ @@ -2252,11 +2382,6 @@ tcp_set_tso(tp, ifp) int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; if (isipv6) { - /* - * Radar 6921834: Disable TSO IPv6 
because there is no support - * for TSO & HW checksum in ip6_output yet - */ -#if 0 if (ifp && ifp->if_hwassist & IFNET_TSO_IPV6) { tp->t_flags |= TF_TSO; if (ifp->if_tso_v6_mtu != 0) @@ -2266,7 +2391,6 @@ tcp_set_tso(tp, ifp) } else tp->t_flags &= ~TF_TSO; -#endif } else #endif /* INET6 */ @@ -2281,4 +2405,50 @@ tcp_set_tso(tp, ifp) tp->t_flags &= ~TF_TSO; } } + +#define TIMEVAL_TO_TCPHZ(_tv_) ((_tv_).tv_sec * TCP_RETRANSHZ + (_tv_).tv_usec / TCP_RETRANSHZ_TO_USEC) + +/* Function to calculate the tcp clock. The tcp clock will get updated + * at the boundaries of the tcp layer. This is done at 3 places: + * 1. Right before processing an input tcp packet + * 2. Whenever a connection wants to access the network using tcp_usrreqs + * 3. When a tcp timer fires or before tcp slow timeout + * + */ + +void +calculate_tcp_clock() +{ + struct timeval tv = tcp_uptime; + struct timeval interval = {0, TCP_RETRANSHZ_TO_USEC}; + struct timeval now, hold_now; + uint32_t incr = 0; + + timevaladd(&tv, &interval); + microuptime(&now); + if (timevalcmp(&now, &tv, >)) { + /* time to update the clock */ + lck_spin_lock(tcp_uptime_lock); + if (timevalcmp(&tcp_uptime, &now, >=)) { + /* clock got updated while we were waiting for the lock */ + lck_spin_unlock(tcp_uptime_lock); + return; + } + + microuptime(&now); + hold_now = now; + tv = tcp_uptime; + timevalsub(&now, &tv); + + incr = TIMEVAL_TO_TCPHZ(now); + if (incr > 0) { + tcp_uptime = hold_now; + tcp_now += incr; + } + + lck_spin_unlock(tcp_uptime_lock); + } + return; +} + /* DSEP Review Done pl-20051213-v02 @3253,@3391,@3400 */ diff --git a/bsd/netinet/tcp_timer.c b/bsd/netinet/tcp_timer.c index fd66419d0..706ec823c 100644 --- a/bsd/netinet/tcp_timer.c +++ b/bsd/netinet/tcp_timer.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -71,6 +71,8 @@ #include #include #include +#include +#include #include #include /* before tcp_seq.h, for tcp_random18() */ @@ -89,6 +91,7 @@ #include #include #include +#include #if INET6 #include #endif @@ -97,23 +100,31 @@ #include #endif #include +#include extern void postevent(struct socket *, struct sockbuf *, int); #define DBG_FNC_TCP_FAST NETDBG_CODE(DBG_NETTCP, (5 << 8)) #define DBG_FNC_TCP_SLOW NETDBG_CODE(DBG_NETTCP, (5 << 8) | 1) +#define TIMERENTRY_TO_TP(te) ((struct tcpcb *)((uintptr_t)te - offsetof(struct tcpcb, tentry.le.le_next))) + +#define VERIFY_NEXT_LINK(elm,field) do { \ + if (LIST_NEXT((elm),field) != NULL && \ + LIST_NEXT((elm),field)->field.le_prev != \ + &((elm)->field.le_next)) \ + panic("Bad link elm %p next->prev != elm", (elm)); \ +} while(0) + +#define VERIFY_PREV_LINK(elm,field) do { \ + if (*(elm)->field.le_prev != (elm)) \ + panic("Bad link elm %p prev->next != elm", (elm)); \ +} while(0) + static int background_io_trigger = 5; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_io_trigger, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_io_trigger, CTLFLAG_RW | CTLFLAG_LOCKED, &background_io_trigger, 0, "Background IO Trigger Setting"); -/* - * NOTE - WARNING - * - * - * - * - */ static int sysctl_msec_to_ticks SYSCTL_HANDLER_ARGS { @@ -136,25 +147,42 @@ sysctl_msec_to_ticks SYSCTL_HANDLER_ARGS } int tcp_keepinit; -SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, +SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", ""); int tcp_keepidle; -SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW, +SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", ""); int tcp_keepintvl; -SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW, 
+SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", ""); int tcp_msl; -SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); +/* + * Avoid DoS via TCP Robustness in Persist Condition (see http://www.ietf.org/id/draft-ananth-tcpm-persist-02.txt) + * by allowing a system wide maximum persistence timeout value when in Zero Window Probe mode. + * Expressed in milliseconds to be consistent with other timeout related values, the TCP socket option is in seconds. + */ +u_int32_t tcp_max_persist_timeout = 0; +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, max_persist_timeout, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + &tcp_max_persist_timeout, 0, sysctl_msec_to_ticks, "I", "Maximum persistence timout for ZWP"); + static int always_keepalive = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW | CTLFLAG_LOCKED, &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); +/* This parameter determines how long the timer list will stay in fast mode even + * though all connections are idle. In fast mode, the timer will fire more frequently + * anticipating new data. + */ +int timer_fastmode_idlemax = TCP_FASTMODE_IDLEGEN_MAX; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, timer_fastmode_idlemax, CTLFLAG_RW | CTLFLAG_LOCKED, + &timer_fastmode_idlemax, 0, "Maximum idle generations in fast mode"); + /* * See tcp_syn_backoff[] for interval values between SYN retransmits; * the value set below defines the number of retransmits, before we @@ -163,16 +191,25 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, * two options.
*/ static int tcp_broken_peer_syn_rxmit_thres = 7; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, broken_peer_syn_rxmit_thres, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, broken_peer_syn_rxmit_thres, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_broken_peer_syn_rxmit_thres, 0, "Number of retransmitted SYNs before " "TCP disables rfc1323 and rfc1644 during the rest of attempts"); +static int tcp_timer_advanced = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_timer_advanced, CTLFLAG_RD | CTLFLAG_LOCKED, + &tcp_timer_advanced, 0, "Number of times one of the timers was advanced"); + +static int tcp_resched_timerlist = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_resched_timerlist, CTLFLAG_RD | CTLFLAG_LOCKED, + &tcp_resched_timerlist, 0, + "Number of times timer list was rescheduled as part of processing a packet"); + int tcp_pmtud_black_hole_detect = 1 ; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_pmtud_black_hole_detect, 0, "Path MTU Discovery Black Hole Detection"); int tcp_pmtud_black_hole_mss = 1200 ; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_pmtud_black_hole_mss, 0, "Path MTU Discovery Black Hole Detection lowered MSS"); static int tcp_keepcnt = TCPTV_KEEPCNT; @@ -182,28 +219,68 @@ int tcp_maxpersistidle; /* max idle time in persist */ int tcp_maxidle; +/* TCP delack timer is set to 100 ms. Since the processing of timer list in fast + * mode will happen no faster than 100 ms, the delayed ack timer will fire some where + * between 100 and 200 ms. 
+ */ +int tcp_delack = TCP_RETRANSHZ / 10; + struct inpcbhead time_wait_slots[N_TIME_WAIT_SLOTS]; int cur_tw_slot = 0; -u_int32_t *delack_bitmask; +/* tcp timer list */ +struct tcptimerlist tcp_timer_list; -void add_to_time_wait_locked(struct tcpcb *tp); -void add_to_time_wait(struct tcpcb *tp) ; +/* The frequency of running through the TCP timer list in + * fast and slow mode can be configured. + */ +SYSCTL_UINT(_net_inet_tcp, OID_AUTO, timer_fastquantum, CTLFLAG_RW | CTLFLAG_LOCKED, + &tcp_timer_list.fast_quantum, TCP_FASTTIMER_QUANTUM, + "Frequency of running timer list in fast mode"); + +SYSCTL_UINT(_net_inet_tcp, OID_AUTO, timer_slowquantum, CTLFLAG_RW | CTLFLAG_LOCKED, + &tcp_timer_list.slow_quantum, TCP_SLOWTIMER_QUANTUM, + "Frequency of running timer list in slow mode"); + +static void tcp_remove_timer(struct tcpcb *tp); +static void tcp_sched_timerlist(uint32_t offset); +static uint32_t tcp_run_conn_timer(struct tcpcb *tp, uint16_t *next_index); +static void tcp_sched_timers(struct tcpcb *tp); +static inline void tcp_set_lotimer_index(struct tcpcb *); + +/* Macro to compare two timers. If there is a reset of the sign bit, it is + * safe to assume that the timer has wrapped around. By doing signed comparison, + * we take care of wrap around such that the value with the sign bit reset is + * actually ahead of the other.
+ */ + +static inline int32_t +timer_diff(uint32_t t1, uint32_t toff1, uint32_t t2, uint32_t toff2) { + return (int32_t)((t1 + toff1) - (t2 + toff2)); +}; + +/* Returns true if the timer is on the timer list */ +#define TIMER_IS_ON_LIST(tp) ((tp)->t_flags & TF_TIMER_ONLIST) + + +void add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay); +void add_to_time_wait(struct tcpcb *tp, uint32_t delay) ; static void tcp_garbage_collect(struct inpcb *, int); -void add_to_time_wait_locked(struct tcpcb *tp) +void add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay) { int tw_slot; - struct inpcbinfo *pcbinfo = &tcbinfo; + struct inpcbinfo *pcbinfo = &tcbinfo; + uint32_t timer; /* pcb list should be locked when we get here */ lck_rw_assert(pcbinfo->mtx, LCK_RW_ASSERT_EXCLUSIVE); LIST_REMOVE(tp->t_inpcb, inp_list); - if (tp->t_timer[TCPT_2MSL] <= 0) - tp->t_timer[TCPT_2MSL] = 1; + /* if (tp->t_timer[TCPT_2MSL] <= 0) + tp->t_timer[TCPT_2MSL] = 1; */ /* * Because we're pulling this pcb out of the main TCP pcb list, @@ -211,19 +288,19 @@ void add_to_time_wait_locked(struct tcpcb *tp) * higher timer granularity. 
*/ - tp->t_timer[TCPT_2MSL] = (tp->t_timer[TCPT_2MSL] / TCP_RETRANSHZ) * PR_SLOWHZ; + timer = (delay / TCP_RETRANSHZ) * PR_SLOWHZ; tp->t_rcvtime = (tp->t_rcvtime / TCP_RETRANSHZ) * PR_SLOWHZ; - tp->t_rcvtime += tp->t_timer[TCPT_2MSL] & (N_TIME_WAIT_SLOTS - 1); + tp->t_rcvtime += timer & (N_TIME_WAIT_SLOTS - 1); - tw_slot = (tp->t_timer[TCPT_2MSL] & (N_TIME_WAIT_SLOTS - 1)) + cur_tw_slot; + tw_slot = (timer & (N_TIME_WAIT_SLOTS - 1)) + cur_tw_slot; if (tw_slot >= N_TIME_WAIT_SLOTS) tw_slot -= N_TIME_WAIT_SLOTS; LIST_INSERT_HEAD(&time_wait_slots[tw_slot], tp->t_inpcb, inp_list); } -void add_to_time_wait(struct tcpcb *tp) +void add_to_time_wait(struct tcpcb *tp, uint32_t delay) { struct inpcbinfo *pcbinfo = &tcbinfo; @@ -232,97 +309,10 @@ void add_to_time_wait(struct tcpcb *tp) lck_rw_lock_exclusive(pcbinfo->mtx); tcp_lock(tp->t_inpcb->inp_socket, 0, 0); } - add_to_time_wait_locked(tp); + add_to_time_wait_locked(tp, delay); lck_rw_done(pcbinfo->mtx); } - - - -/* - * Fast timeout routine for processing delayed acks - */ -void -tcp_fasttimo(void *arg) -{ -#pragma unused(arg) - struct inpcb *inp; - register struct tcpcb *tp; - struct socket *so; -#if TCPDEBUG - int ostate; -#endif - - - struct inpcbinfo *pcbinfo = &tcbinfo; - - int delack_done = 0; - - KERNEL_DEBUG(DBG_FNC_TCP_FAST | DBG_FUNC_START, 0,0,0,0,0); - - - lck_rw_lock_shared(pcbinfo->mtx); - - /* Walk the list of valid tcpcbs and send ACKS on the ones with DELACK bit set */ - - LIST_FOREACH(inp, &tcb, inp_list) { - - so = inp->inp_socket; - - if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) - continue; - - tcp_lock(so, 1, 0); - - if ((in_pcb_checkstate(inp, WNT_RELEASE,1) == WNT_STOPUSING) && so->so_usecount == 1) { - tcp_unlock(so, 1, 0); - continue; - } - - tp = intotcpcb(inp); - - if (tp == 0 || tp->t_state == TCPS_LISTEN) { - tcp_unlock(so, 1, 0); - continue; - } - - - /* Only run the retransmit timer in that case */ - if (tp->t_timer[0] && --tp->t_timer[0] == 0) { - tp = tcp_timers(tp, 0); - 
if (tp == NULL) - goto tpgone; - } - - /* TCP pcb timers following the tcp_now clock rate */ - - tp->t_rcvtime++; - tp->t_starttime++; - if (tp->t_rtttime) - tp->t_rtttime++; - - /* - * Process delayed acks (if enabled) according to PR_FASTHZ, not the retrans timer - */ - - if (tcp_delack_enabled && (tcp_now % (TCP_RETRANSHZ/PR_FASTHZ)) && tp->t_flags & TF_DELACK) { - delack_done++; - tp->t_flags &= ~TF_DELACK; - tp->t_flags |= TF_ACKNOW; - tcpstat.tcps_delack++; - tp->t_unacksegs = 0; - (void) tcp_output(tp); - } -tpgone: - tcp_unlock(so, 1, 0); - } - KERNEL_DEBUG(DBG_FNC_TCP_FAST | DBG_FUNC_END, delack_done, 0, tcpstat.tcps_delack,0,0); - lck_rw_done(pcbinfo->mtx); - - tcp_now++; - timeout(tcp_fasttimo, 0, hz/TCP_RETRANSHZ); -} - static void tcp_garbage_collect(struct inpcb *inp, int istimewait) { @@ -339,12 +329,12 @@ tcp_garbage_collect(struct inpcb *inp, int istimewait) * overflow sockets that are eligible for garbage collection have * their usecounts set to 1. */ - if (so->so_usecount > 1 || !lck_mtx_try_lock_spin(inp->inpcb_mtx)) + if (so->so_usecount > 1 || !lck_mtx_try_lock_spin(&inp->inpcb_mtx)) return; /* Check again under the lock */ if (so->so_usecount > 1) { - lck_mtx_unlock(inp->inpcb_mtx); + lck_mtx_unlock(&inp->inpcb_mtx); return; } @@ -365,7 +355,7 @@ tcp_garbage_collect(struct inpcb *inp, int istimewait) if (inp->inp_state != INPCB_STATE_DEAD) { /* Become a regular mutex */ - lck_mtx_convert_spin(inp->inpcb_mtx); + lck_mtx_convert_spin(&inp->inpcb_mtx); #if INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) in6_pcbdetach(inp); @@ -374,10 +364,10 @@ tcp_garbage_collect(struct inpcb *inp, int istimewait) in_pcbdetach(inp); } so->so_usecount--; - lck_mtx_unlock(inp->inpcb_mtx); + lck_mtx_unlock(&inp->inpcb_mtx); return; } else if (inp->inp_wantcnt != WNT_STOPUSING) { - lck_mtx_unlock(inp->inpcb_mtx); + lck_mtx_unlock(&inp->inpcb_mtx); return; } @@ -392,8 +382,10 @@ tcp_garbage_collect(struct inpcb *inp, int istimewait) * socket is dropped at the end of 
tcp_input(). */ if (so->so_usecount == 0) { + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_CLOSED); /* Become a regular mutex */ - lck_mtx_convert_spin(inp->inpcb_mtx); + lck_mtx_convert_spin(&inp->inpcb_mtx); if (inp->inp_state != INPCB_STATE_DEAD) { #if INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) @@ -404,20 +396,15 @@ tcp_garbage_collect(struct inpcb *inp, int istimewait) } in_pcbdispose(inp); } else { - lck_mtx_unlock(inp->inpcb_mtx); + lck_mtx_unlock(&inp->inpcb_mtx); } } -static int bg_cnt = 0; -#define BG_COUNTER_MAX 3 - void tcp_slowtimo(void) { struct inpcb *inp, *nxt; struct tcpcb *tp; - struct socket *so; - int i; #if TCPDEBUG int ostate; #endif @@ -432,114 +419,14 @@ tcp_slowtimo(void) tcp_maxidle = tcp_keepcnt * tcp_keepintvl; - lck_rw_lock_shared(pcbinfo->mtx); - - bg_cnt++; - - LIST_FOREACH(inp, &tcb, inp_list) { - - so = inp->inp_socket; + /* Update tcp_now here as it may get used while processing the slow timer */ + calculate_tcp_clock(); - if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) - continue; - - tcp_lock(so, 1, 0); - - if ((in_pcb_checkstate(inp, WNT_RELEASE,1) == WNT_STOPUSING) && so->so_usecount == 1) { - tcp_unlock(so, 1, 0); - continue; - } - tp = intotcpcb(inp); - if (tp == 0 || tp->t_state == TCPS_LISTEN) { - tcp_unlock(so, 1, 0); - continue; - } - - tp = intotcpcb(inp); - - if (tp == 0 || tp->t_state == TCPS_LISTEN) - goto tpgone; - -#if TRAFFIC_MGT - if (so->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BG_REGULATE && - bg_cnt > BG_COUNTER_MAX) { - u_int32_t curr_recvtotal = tcpstat.tcps_rcvtotal; - u_int32_t curr_bg_recvtotal = tcpstat.tcps_bg_rcvtotal; - u_int32_t bg_recvdiff = curr_bg_recvtotal - tp->bg_recv_snapshot; - u_int32_t tot_recvdiff = curr_recvtotal - tp->tot_recv_snapshot; - u_int32_t fg_recv_change = tot_recvdiff - bg_recvdiff; - u_int32_t recv_change; - - if (!(so->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BG_SUPPRESSED)) { - if (tot_recvdiff) - recv_change = 
(fg_recv_change * 100) / tot_recvdiff; - else - recv_change = 0; - - if (recv_change > background_io_trigger) { - socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BG_SUPPRESSED); - } - - tp->tot_recv_snapshot = curr_recvtotal; - tp->bg_recv_snapshot = curr_bg_recvtotal; - } - else { // SUPPRESSED - // this allows for bg traffic to subside before we start measuring total traffic change - if (tot_recvdiff) - recv_change = (bg_recvdiff * 100) / tot_recvdiff; - else - recv_change = 0; - - if (recv_change < background_io_trigger) { - // Draconian for now: if there is any change at all, keep suppressed - if (!tot_recvdiff) { - socket_clear_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BG_SUPPRESSED); - tp->t_unacksegs = 0; - (void) tcp_output(tp); // open window - } - } - - tp->tot_recv_snapshot = curr_recvtotal; - tp->bg_recv_snapshot = curr_bg_recvtotal; - } - } -#endif /* TRAFFIC_MGT */ - - for (i = 1; i < TCPT_NTIMERS; i++) { - if (tp->t_timer[i] != 0) { - tp->t_timer[i] -= TCP_RETRANSHZ/PR_SLOWHZ; - if (tp->t_timer[i] <= 0) { -#if TCPDEBUG - ostate = tp->t_state; -#endif - - tp->t_timer[i] = 0; /* account for granularity change between tcp_now and slowtimo */ - tp = tcp_timers(tp, i); - if (tp == NULL) - goto tpgone; -#if TCPDEBUG - if (tp->t_inpcb->inp_socket->so_options - & SO_DEBUG) - tcp_trace(TA_USER, ostate, tp, - (void *)0, - (struct tcphdr *)0, - PRU_SLOWTIMO); -#endif - } - } - } -tpgone: - tcp_unlock(so, 1, 0); - } - - if (bg_cnt > 3) - bg_cnt = 0; - - /* Second part of tcp_slowtimo: garbage collect socket/tcpcb - * We need to acquire the list lock exclusively to do this + /* Garbage collect socket/tcpcb: We need to acquire the list lock + * exclusively to do this */ - if (lck_rw_lock_shared_to_exclusive(pcbinfo->mtx) == FALSE) { + if (lck_rw_try_lock_exclusive(pcbinfo->mtx) == FALSE) { if (tcp_gc_done == TRUE) { /* don't sweat it this time. 
cleanup was done last time */ tcp_gc_done = FALSE; KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked, cur_tw_slot,0,0,0); @@ -617,8 +504,11 @@ tcp_canceltimers(tp) { register int i; + tcp_remove_timer(tp); for (i = 0; i < TCPT_NTIMERS; i++) tp->t_timer[i] = 0; + tp->tentry.timer_start = tcp_now; + tp->tentry.index = TCPT_NONE; } int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = @@ -641,6 +531,7 @@ tcp_timers(tp, timer) struct socket *so_tmp; struct tcptemp *t_template; int optlen = 0; + int idle_time = 0; #if TCPDEBUG int ostate; @@ -651,6 +542,7 @@ tcp_timers(tp, timer) #endif /* INET6 */ so_tmp = tp->t_inpcb->inp_socket; + idle_time = tcp_now - tp->t_rcvtime; switch (timer) { @@ -666,8 +558,8 @@ tcp_timers(tp, timer) tcp_free_sackholes(tp); if (tp->t_state != TCPS_TIME_WAIT && tp->t_state != TCPS_FIN_WAIT_2 && - tp->t_rcvtime < tcp_maxidle) { - tp->t_timer[TCPT_2MSL] = (u_int32_t)tcp_keepintvl; + ((idle_time > 0) && (idle_time < tcp_maxidle))) { + tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp, (u_int32_t)tcp_keepintvl); } else { tp = tcp_close(tp); @@ -682,9 +574,26 @@ tcp_timers(tp, timer) */ case TCPT_REXMT: tcp_free_sackholes(tp); - if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { + /* Drop a connection in the retransmit timer + * 1. If we have retransmitted more than TCP_MAXRXTSHIFT times + * 2. If the time spent in this retransmission episode is more than + * the time limit set with TCP_RXT_CONNDROPTIME socket option + * 3. 
If TCP_RXT_FINDROP socket option was set and we have already + * retransmitted the FIN 3 times without receiving an ack + */ + if (++tp->t_rxtshift > TCP_MAXRXTSHIFT || + (tp->rxt_conndroptime > 0 && tp->rxt_start > 0 && + (tcp_now - tp->rxt_start) >= tp->rxt_conndroptime) || + ((tp->t_flagsext & TF_RXTFINDROP) != 0 && + (tp->t_flags & TF_SENTFIN) != 0 && + tp->t_rxtshift >= 4)) { + + if ((tp->t_flagsext & TF_RXTFINDROP) != 0) { + tcpstat.tcps_rxtfindrop++; + } else { + tcpstat.tcps_timeoutdrop++; + } tp->t_rxtshift = TCP_MAXRXTSHIFT; - tcpstat.tcps_timeoutdrop++; tp = tcp_drop(tp, tp->t_softerror ? tp->t_softerror : ETIMEDOUT); postevent(so_tmp, 0, EV_TIMEOUT); @@ -709,6 +618,11 @@ tcp_timers(tp, timer) else tp->t_flags &= ~TF_WASFRECOVERY; tp->t_badrxtwin = tcp_now + (tp->t_srtt >> (TCP_RTT_SHIFT)); + + /* Set the time at which retransmission on this + * connection started + */ + tp->rxt_start = tcp_now; } tcpstat.tcps_rexmttimeo++; if (tp->t_state == TCPS_SYN_SENT) @@ -716,8 +630,9 @@ tcp_timers(tp, timer) else rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; TCPT_RANGESET(tp->t_rxtcur, rexmt, - tp->t_rttmin, TCPTV_REXMTMAX); - tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + tp->t_rttmin, TCPTV_REXMTMAX, + TCP_ADD_REXMTSLOP(tp)); + tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); /* * Check for potential Path MTU Discovery Black Hole @@ -745,6 +660,12 @@ tcp_timers(tp, timer) tcp_mssdflt; } tp->t_maxseg = tp->t_maxopd - optlen; + + /* + * Reset the slow-start flight size as it may depends on the new MSS + */ + if (CC_ALGO(tp)->cwnd_init != NULL) + CC_ALGO(tp)->cwnd_init(tp); } /* * If further retransmissions are still unsuccessful with a lowered MTU, @@ -759,6 +680,11 @@ tcp_timers(tp, timer) optlen = tp->t_maxopd - tp->t_maxseg; tp->t_maxopd = tp->t_pmtud_saved_maxopd; tp->t_maxseg = tp->t_maxopd - optlen; + /* + * Reset the slow-start flight size as it may depends on the new MSS + */ + if (CC_ALGO(tp)->cwnd_init != NULL) + 
CC_ALGO(tp)->cwnd_init(tp); } } } @@ -806,41 +732,17 @@ tcp_timers(tp, timer) * If timing a segment in this window, stop the timer. */ tp->t_rtttime = 0; - /* - * Close the congestion window down to one segment - * (we'll open it by one segment for each ack we get). - * Since we probably have a window's worth of unacked - * data accumulated, this "slow start" keeps us from - * dumping all that data as back-to-back packets (which - * might overwhelm an intermediate gateway). - * - * There are two phases to the opening: Initially we - * open by one mss on each ack. This makes the window - * size increase exponentially with time. If the - * window is larger than the path can handle, this - * exponential growth results in dropped packet(s) - * almost immediately. To get more time between - * drops but still "push" the network to take advantage - * of improving conditions, we switch from exponential - * to linear window opening at some threshhold size. - * For a threshhold, we use half the current window - * size, truncated to a multiple of the mss. - * - * (the minimum cwnd that will give us exponential - * growth is 2 mss. We don't allow the threshhold - * to go below this.) - */ - if (tp->t_state >= TCPS_ESTABLISHED) { - u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; - if (win < 2) - win = 2; - tp->snd_cwnd = tp->t_maxseg; - tp->snd_ssthresh = win * tp->t_maxseg; - tp->t_bytes_acked = 0; - tp->t_dupacks = 0; - tp->t_unacksegs = 0; - } + + if (CC_ALGO(tp)->after_timeout != NULL) + CC_ALGO(tp)->after_timeout(tp); + + tp->t_dupacks = 0; EXIT_FASTRECOVERY(tp); + + DTRACE_TCP5(cc, void, NULL, struct inpcb *, tp->t_inpcb, + struct tcpcb *, tp, struct tcphdr *, NULL, + int32_t, TCP_CC_REXMT_TIMEOUT); + (void) tcp_output(tp); break; @@ -856,10 +758,15 @@ tcp_timers(tp, timer) * backoff, drop the connection if the idle time * (no responses to probes) reaches the maximum * backoff that we would use if retransmitting. 
+ * + * Drop the connection if we reached the maximum allowed time for + * Zero Window Probes without a non-zero update from the peer. + * See rdar://5805356 */ - if (tp->t_rxtshift == TCP_MAXRXTSHIFT && - (tp->t_rcvtime >= tcp_maxpersistidle || - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { + if ((tp->t_rxtshift == TCP_MAXRXTSHIFT && + (idle_time >= tcp_maxpersistidle || + idle_time >= TCP_REXMTVAL(tp) * tcp_totbackoff)) || + ((tp->t_persist_stop != 0) && (tp->t_persist_stop <= tcp_now))) { tcpstat.tcps_persistdrop++; so_tmp = tp->t_inpcb->inp_socket; tp = tcp_drop(tp, ETIMEDOUT); @@ -868,7 +775,6 @@ tcp_timers(tp, timer) } tcp_setpersist(tp); tp->t_force = 1; - tp->t_unacksegs = 0; (void) tcp_output(tp); tp->t_force = 0; break; @@ -884,7 +790,7 @@ tcp_timers(tp, timer) if ((always_keepalive || tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) && (tp->t_state <= TCPS_CLOSING || tp->t_state == TCPS_FIN_WAIT_2)) { - if (tp->t_rcvtime >= TCP_KEEPIDLE(tp) + (u_int32_t)tcp_maxidle) + if (idle_time >= TCP_KEEPIDLE(tp) + (u_int32_t)tcp_maxidle) goto dropit; /* * Send a packet designed to force a response @@ -901,21 +807,45 @@ tcp_timers(tp, timer) tcpstat.tcps_keepprobe++; t_template = tcp_maketemplate(tp); if (t_template) { - unsigned int ifscope; + unsigned int ifscope, nocell = 0; if (tp->t_inpcb->inp_flags & INP_BOUND_IF) ifscope = tp->t_inpcb->inp_boundif; else ifscope = IFSCOPE_NONE; + /* + * If the socket isn't allowed to use the + * cellular interface, indicate it as such. 
+ */ + if (tp->t_inpcb->inp_flags & INP_NO_IFT_CELLULAR) + nocell = 1; + tcp_respond(tp, t_template->tt_ipgen, &t_template->tt_t, (struct mbuf *)NULL, - tp->rcv_nxt, tp->snd_una - 1, 0, ifscope); + tp->rcv_nxt, tp->snd_una - 1, 0, ifscope, + nocell); (void) m_free(dtom(t_template)); } - tp->t_timer[TCPT_KEEP] = tcp_keepintvl; + tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, tcp_keepintvl); } else - tp->t_timer[TCPT_KEEP] = TCP_KEEPIDLE(tp); + tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, TCP_KEEPIDLE(tp)); + break; + case TCPT_DELACK: + if (tcp_delack_enabled && (tp->t_flags & TF_DELACK)) { + tp->t_flags &= ~TF_DELACK; + tp->t_timer[TCPT_DELACK] = 0; + tp->t_flags |= TF_ACKNOW; + + /* If delayed ack timer fired while we are stretching acks, + * go back to acking every other packet + */ + if ((tp->t_flags & TF_STRETCHACK) != 0) + tcp_reset_stretch_ack(tp); + + tcpstat.tcps_delack++; + (void) tcp_output(tp); + } break; #if TCPDEBUG @@ -931,3 +861,462 @@ tcp_timers(tp, timer) } return (tp); } + +/* Remove a timer entry from timer list */ +void +tcp_remove_timer(struct tcpcb *tp) +{ + struct tcptimerlist *listp = &tcp_timer_list; + + lck_mtx_assert(&tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED); + if (!(TIMER_IS_ON_LIST(tp))) { + return; + } + lck_mtx_lock(listp->mtx); + + /* Check if pcb is on timer list again after acquiring the lock */ + if (!(TIMER_IS_ON_LIST(tp))) { + lck_mtx_unlock(listp->mtx); + return; + } + + if (listp->next_te != NULL && listp->next_te == &tp->tentry) + listp->next_te = LIST_NEXT(&tp->tentry, le); + + LIST_REMOVE(&tp->tentry, le); + tp->t_flags &= ~(TF_TIMER_ONLIST); + + listp->entries--; + lck_mtx_unlock(listp->mtx); + + tp->tentry.le.le_next = NULL; + tp->tentry.le.le_prev = NULL; +} + +/* Function to check if the timerlist needs to be rescheduled to run + * the timer entry correctly. Basically, this is to check if we can avoid + * taking the list lock. 
+ */ + +static boolean_t +need_to_resched_timerlist(uint32_t runtime, uint16_t index) { + struct tcptimerlist *listp = &tcp_timer_list; + int32_t diff; + boolean_t is_fast; + + if (runtime == 0 || index == TCPT_NONE) + return FALSE; + is_fast = !(IS_TIMER_SLOW(index)); + + /* If the list is being processed then the state of the list is in flux. + * In this case always acquire the lock and set the state correctly. + */ + if (listp->running) { + return TRUE; + } + + diff = timer_diff(listp->runtime, 0, runtime, 0); + if (diff <= 0) { + /* The list is going to run before this timer */ + return FALSE; + } else { + if (is_fast) { + if (diff <= listp->fast_quantum) + return FALSE; + } else { + if (diff <= listp->slow_quantum) + return FALSE; + } + } + return TRUE; +} + +void +tcp_sched_timerlist(uint32_t offset) +{ + + uint64_t deadline = 0; + struct tcptimerlist *listp = &tcp_timer_list; + + lck_mtx_assert(listp->mtx, LCK_MTX_ASSERT_OWNED); + + listp->runtime = tcp_now + offset; + + clock_interval_to_deadline(offset, NSEC_PER_SEC / TCP_RETRANSHZ, + &deadline); + + thread_call_enter_delayed(listp->call, deadline); +} + +/* Function to run the timers for a connection. + * + * Returns the offset of next timer to be run for this connection which + * can be used to reschedule the timerlist. + */ +uint32_t +tcp_run_conn_timer(struct tcpcb *tp, uint16_t *next_index) { + + struct socket *so; + uint16_t i = 0, index = TCPT_NONE, lo_index = TCPT_NONE; + uint32_t timer_val, offset = 0, lo_timer = 0; + int32_t diff; + boolean_t needtorun[TCPT_NTIMERS]; + int count = 0; + + VERIFY(tp != NULL); + bzero(needtorun, sizeof(needtorun)); + + tcp_lock(tp->t_inpcb->inp_socket, 1, 0); + + so = tp->t_inpcb->inp_socket; + /* Release the want count on inp */ + if (in_pcb_checkstate(tp->t_inpcb, WNT_RELEASE, 1) == WNT_STOPUSING) { + if (TIMER_IS_ON_LIST(tp)) { + tcp_remove_timer(tp); + } + + /* Looks like the TCP connection got closed while we + * were waiting for the lock.. 
Done + */ + goto done; + } + + /* Since the timer thread needs to wait for tcp lock, it may race + * with another thread that can cancel or reschedule the timer that is + * about to run. Check if we need to run anything. + */ + index = tp->tentry.index; + timer_val = tp->t_timer[index]; + + if (index == TCPT_NONE || tp->tentry.runtime == 0) + goto done; + + diff = timer_diff(tp->tentry.runtime, 0, tcp_now, 0); + if (diff > 0) { + if (tp->tentry.index != TCPT_NONE) { + offset = diff; + *(next_index) = tp->tentry.index; + } + goto done; + } + + tp->t_timer[index] = 0; + if (timer_val > 0) { + tp = tcp_timers(tp, index); + if (tp == NULL) + goto done; + } + + /* Check if there are any other timers that need to be run. While doing it, + * adjust the timer values wrt tcp_now. + */ + for (i = 0; i < TCPT_NTIMERS; ++i) { + if (tp->t_timer[i] != 0) { + diff = timer_diff(tp->tentry.timer_start, tp->t_timer[i], tcp_now, 0); + if (diff <= 0) { + tp->t_timer[i] = 0; + needtorun[i] = TRUE; + count++; + } else { + tp->t_timer[i] = diff; + needtorun[i] = FALSE; + if (lo_timer == 0 || diff < lo_timer) { + lo_timer = diff; + lo_index = i; + } + } + } + } + + tp->tentry.timer_start = tcp_now; + tp->tentry.index = lo_index; + if (lo_index != TCPT_NONE) { + tp->tentry.runtime = tp->tentry.timer_start + tp->t_timer[lo_index]; + } else { + tp->tentry.runtime = 0; + } + + if (count > 0) { + /* run any other timers that are also outstanding at this time. 
*/ + for (i = 0; i < TCPT_NTIMERS; ++i) { + if (needtorun[i]) { + tp->t_timer[i] = 0; + tp = tcp_timers(tp, i); + if (tp == NULL) + goto done; + } + } + tcp_set_lotimer_index(tp); + } + + if (tp->tentry.index < TCPT_NONE) { + offset = tp->t_timer[tp->tentry.index]; + *(next_index) = tp->tentry.index; + } + +done: + if (tp != NULL && tp->tentry.index == TCPT_NONE) { + tcp_remove_timer(tp); + } + tcp_unlock(so, 1, 0); + return offset; +} + +void +tcp_run_timerlist(void * arg1, void * arg2) { + +#pragma unused(arg1, arg2) + + struct tcptimerentry *te, *next_te; + struct tcptimerlist *listp = &tcp_timer_list; + struct tcpcb *tp; + uint32_t next_timer = 0; + uint16_t index = TCPT_NONE; + boolean_t need_fast = FALSE; + uint32_t active_count = 0; + uint32_t mode = TCP_TIMERLIST_FASTMODE; + + calculate_tcp_clock(); + + lck_mtx_lock(listp->mtx); + + listp->running = TRUE; + + LIST_FOREACH_SAFE(te, &listp->lhead, le, next_te) { + uint32_t offset = 0; + uint32_t runtime = te->runtime; + if (TSTMP_GT(runtime, tcp_now)) { + offset = timer_diff(runtime, 0, tcp_now, 0); + if (next_timer == 0 || offset < next_timer) { + next_timer = offset; + } + continue; + } + active_count++; + + tp = TIMERENTRY_TO_TP(te); + + /* Acquire an inp wantcnt on the inpcb so that the socket won't get + * detached even if tcp_close is called + */ + if (in_pcb_checkstate(tp->t_inpcb, WNT_ACQUIRE, 0) == WNT_STOPUSING) { + /* Some how this pcb went into dead state while on the timer list, + * just take it off the list. Since the timer list entry pointers + * are protected by the timer list lock, we can do it here + */ + if (TIMER_IS_ON_LIST(tp)) { + tp->t_flags &= ~(TF_TIMER_ONLIST); + LIST_REMOVE(&tp->tentry, le); + listp->entries--; + + tp->tentry.le.le_next = NULL; + tp->tentry.le.le_prev = NULL; + } + continue; + } + + /* Store the next timerentry pointer before releasing the list lock. 
+ * If that entry has to be removed when we release the lock, this + * pointer will be updated to the element after that. + */ + listp->next_te = next_te; + + VERIFY_NEXT_LINK(&tp->tentry, le); + VERIFY_PREV_LINK(&tp->tentry, le); + + lck_mtx_unlock(listp->mtx); + + index = TCPT_NONE; + offset = tcp_run_conn_timer(tp, &index); + + lck_mtx_lock(listp->mtx); + + next_te = listp->next_te; + listp->next_te = NULL; + + if (offset > 0) { + if (index < TCPT_NONE) { + /* Check if this is a fast_timer. */ + if (!need_fast && !(IS_TIMER_SLOW(index))) { + need_fast = TRUE; + } + + if (next_timer == 0 || offset < next_timer) { + next_timer = offset; + } + } + } + } + + if (!LIST_EMPTY(&listp->lhead)) { + if (listp->mode == TCP_TIMERLIST_FASTMODE) { + if (need_fast || active_count > 0 || + listp->pref_mode == TCP_TIMERLIST_FASTMODE) { + listp->idlegen = 0; + } else { + listp->idlegen++; + if (listp->idlegen > timer_fastmode_idlemax) { + mode = TCP_TIMERLIST_SLOWMODE; + listp->idlegen = 0; + } + } + } else { + if (!need_fast) { + mode = TCP_TIMERLIST_SLOWMODE; + } + } + + if (mode == TCP_TIMERLIST_FASTMODE || + listp->pref_mode == TCP_TIMERLIST_FASTMODE) { + next_timer = listp->fast_quantum; + } else { + if (listp->pref_offset != 0 && + listp->pref_offset < next_timer) + next_timer = listp->pref_offset; + if (next_timer < listp->slow_quantum) + next_timer = listp->slow_quantum; + } + + listp->mode = mode; + + tcp_sched_timerlist(next_timer); + } else { + /* No need to reschedule this timer */ + listp->runtime = 0; + } + + listp->running = FALSE; + listp->pref_mode = 0; + listp->pref_offset = 0; + + lck_mtx_unlock(listp->mtx); +} + +/* Function to verify if a change in timer state is required for a connection */ +void +tcp_sched_timers(struct tcpcb *tp) +{ + struct tcptimerentry *te = &tp->tentry; + uint16_t index = te->index; + struct tcptimerlist *listp = &tcp_timer_list; + uint32_t offset = 0; + boolean_t is_fast; + int list_locked = 0; + + if (tp->t_inpcb->inp_state == 
INPCB_STATE_DEAD) { + /* Just return without adding the dead pcb to the list */ + if (TIMER_IS_ON_LIST(tp)) { + tcp_remove_timer(tp); + } + return; + } + + if (index == TCPT_NONE) { + tcp_remove_timer(tp); + return; + } + + is_fast = !(IS_TIMER_SLOW(index)); + offset = te->runtime - tcp_now; + if (offset == 0) { + offset = 1; + tcp_timer_advanced++; + } + if (is_fast) + offset = listp->fast_quantum; + + if (!TIMER_IS_ON_LIST(tp)) { + if (!list_locked) { + lck_mtx_lock(listp->mtx); + list_locked = 1; + } + + LIST_INSERT_HEAD(&listp->lhead, te, le); + tp->t_flags |= TF_TIMER_ONLIST; + + listp->entries++; + if (listp->entries > listp->maxentries) + listp->maxentries = listp->entries; + + /* if the list is not scheduled, just schedule it */ + if (listp->runtime == 0) + goto schedule; + + } + + + /* timer entry is currently on the list */ + if (need_to_resched_timerlist(te->runtime, index)) { + tcp_resched_timerlist++; + + if (!list_locked) { + lck_mtx_lock(listp->mtx); + list_locked = 1; + } + + VERIFY_NEXT_LINK(te, le); + VERIFY_PREV_LINK(te, le); + + if (listp->running) { + if (is_fast) { + listp->pref_mode = TCP_TIMERLIST_FASTMODE; + } else if (listp->pref_offset == 0 || + ((int)offset) < listp->pref_offset) { + listp->pref_offset = offset; + } + } else { + int32_t diff; + diff = timer_diff(listp->runtime, 0, tcp_now, offset); + if (diff <= 0) { + /* The list is going to run before this timer */ + goto done; + } else { + goto schedule; + } + } + } + goto done; + +schedule: + if (is_fast) { + listp->mode = TCP_TIMERLIST_FASTMODE; + listp->idlegen = 0; + } + tcp_sched_timerlist(offset); + +done: + if (list_locked) + lck_mtx_unlock(listp->mtx); + + return; +} + +void +tcp_set_lotimer_index(struct tcpcb *tp) { + uint16_t i, lo_index = TCPT_NONE; + uint32_t lo_timer = 0; + for (i = 0; i < TCPT_NTIMERS; ++i) { + if (tp->t_timer[i] != 0 && + (lo_timer == 0 || tp->t_timer[i] < lo_timer)) { + lo_timer = tp->t_timer[i]; + lo_index = i; + } + } + tp->tentry.index = lo_index; + 
if (lo_index != TCPT_NONE) { + tp->tentry.runtime = tp->tentry.timer_start + tp->t_timer[lo_index]; + } else { + tp->tentry.runtime = 0; + } +} + +void +tcp_check_timer_state(struct tcpcb *tp) { + + lck_mtx_assert(&tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED); + + tcp_set_lotimer_index(tp); + + tcp_sched_timers(tp); + return; +} diff --git a/bsd/netinet/tcp_timer.h b/bsd/netinet/tcp_timer.h index c4ea59c6b..df1162053 100644 --- a/bsd/netinet/tcp_timer.h +++ b/bsd/netinet/tcp_timer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -65,16 +65,25 @@ #define _NETINET_TCP_TIMER_H_ #include +#ifdef KERNEL +#include +#endif /* KERNEL */ + /* - * Definitions of the TCP timers. These timers are counted - * down PR_SLOWHZ times a second. + * Definitions of the TCP timers. */ -#define TCPT_NTIMERS 4 +#define TCPT_NTIMERS 5 + +/* Keep the external definition the same for binary compatibility */ +#define TCPT_NTIMERS_EXT 4 #define TCPT_REXMT 0 /* retransmit */ #define TCPT_PERSIST 1 /* retransmit persistence */ #define TCPT_KEEP 2 /* keep alive */ #define TCPT_2MSL 3 /* 2*msl quiet time timer */ +#define TCPT_DELACK 4 /* delayed ack timer */ +#define TCPT_MAX 4 +#define TCPT_NONE (TCPT_MAX + 1) /* * The TCPT_REXMT timer is used to force retransmissions. @@ -119,7 +128,7 @@ */ #define TCPTV_MSL ( 15*TCP_RETRANSHZ) /* max seg lifetime (hah!) 
*/ #define TCPTV_SRTTBASE 0 /* base roundtrip time; - if 0, no idea yet */ + if 0, no idea yet */ #define TCPTV_RTOBASE ( 1*TCP_RETRANSHZ) /* assumed RTO if no info */ #define TCPTV_SRTTDFLT ( 1*TCP_RETRANSHZ) /* assumed RTT if no info */ @@ -131,9 +140,18 @@ #define TCPTV_KEEPINTVL ( 75*TCP_RETRANSHZ) /* default probe interval */ #define TCPTV_KEEPCNT 8 /* max probes before drop */ -//#define TCPTV_MIN ( 3*TCP_RETRANSHZ) /* minimum allowable value */ -#define TCPTV_MIN (1) /* minimum allowable value */ -#define TCPTV_REXMTMAX ( 64*TCP_RETRANSHZ) /* max allowable REXMT value */ +#define TCPTV_REXMTMAX ( 64*TCP_RETRANSHZ ) /* max allowable REXMT value */ +#define TCPTV_REXMTMIN ( TCP_RETRANSHZ/33 ) /* min REXMT for non-local connections */ +#define TCPTV_UNACKWIN ( TCP_RETRANSHZ/10 ) /* Window for counting rcv bytes to see if + ack-stretching can start (default 100 ms) */ +#define TCPTV_MAXRCVIDLE (TCP_RETRANSHZ/5 ) /* Receiver idle time, avoid ack-stretching after that*/ + +/* No ack stretching during slow-start, until we see some packets. + * By the time the receiver gets 512 packets, the senders cwnd + * should open by a few hundred packets considering the progression + * during slow-start. + */ +#define TCP_RCV_SS_PKTCOUNT 512 #define TCPTV_TWTRUNC 8 /* RTO factor to truncate TW */ @@ -143,15 +161,81 @@ #ifdef TCPTIMERS static char *tcptimers[] = - { "REXMT", "PERSIST", "KEEP", "2MSL" }; + { "REXMT", "PERSIST", "KEEP", "2MSL" , "DELACK"}; #endif #ifdef KERNEL + +/* We consider persist, keep and 2msl as slow timers which can be coalesced + * at a higher granularity (500 ms). Rexmt and delayed ack are considered fast + * timers which fire in the order of 100ms. + * + * The following conditional is to check if a timer is one of the slow timers. This + * is fast and works well for now. If we add more slow timers for any reason, + * we may need to change this. 
+ */ +#define IS_TIMER_SLOW(ind) ((ind & 0x3) != 0) + +struct tcptimerlist; + +struct tcptimerentry { + LIST_ENTRY(tcptimerentry) le; /* links for timer list */ + uint32_t timer_start; /* tcp clock when the timer was started */ + uint16_t index; /* index of lowest timer that needs to run first */ + uint32_t runtime; /* deadline at which the first timer has to fire */ +}; + +LIST_HEAD(timerlisthead, tcptimerentry); + +struct tcptimerlist { + struct timerlisthead lhead; /* head of the list of timer entries */ + lck_mtx_t *mtx; /* lock to protect the list */ + lck_attr_t *mtx_attr; /* mutex attributes */ + lck_grp_t *mtx_grp; /* mutex group definition */ + lck_grp_attr_t *mtx_grp_attr; /* mutex group attributes */ + uint32_t fast_quantum; /* minimum time quantum to coalesce fast timers */ + uint32_t slow_quantum; /* minimum time quantum to coalesce slow timers */ + thread_call_t call; /* call entry */ + uint32_t runtime; /* time at which this list is going to run */ + uint32_t entries; /* Number of entries on the list */ + uint32_t maxentries; /* Max number of entries at any time */ + + /* Set desired mode when timer list running */ + boolean_t running; /* Set when timer list is being processed */ +#define TCP_TIMERLIST_FASTMODE 0x1 +#define TCP_TIMERLIST_SLOWMODE 0x2 + uint32_t mode; /* Current mode, fast or slow */ + uint32_t pref_mode; /* Preferred mode set by a connection, fast or slow */ + uint32_t pref_offset; /* Preferred offset set by a connection */ + uint32_t idlegen; /* Number of times the list has been idle in fast mode */ + struct tcptimerentry *next_te; /* Store the next timer entry pointer to process */ + +}; + +#define TCP_FASTMODE_IDLEGEN_MAX 20 /* Approximately 2 seconds */ + /* - * Force a time value to be in a certain range. + * Minimum retransmit timeout is set to 30ms. We add a slop of + * 200 ms to the retransmit value to account for processing + * variance and delayed ack. 
This extra 200ms will help to avoid + * spurious retransmits by taking into consideration the receivers + * that wait for delayed ack timer instead of generating an ack + * for every two packets. + * + * On a local link, the minimum retransmit timeout is 100ms and + * variance is set to 0. This will make the sender a little bit more + * aggressive on local link. When the connection is not established yet, + * there is no need to add an extra 200ms to retransmit timeout because + * the initial value is high (1s) and delayed ack is not a problem in + * that case. */ -#define TCPT_RANGESET(tv, value, tvmin, tvmax) do { \ - (tv) = (value); \ +#define TCPTV_REXMTSLOP ( TCP_RETRANSHZ/5 ) /* rexmt slop allowed (200 ms) */ + +/* macro to decide when retransmit slop (described above) should be added */ +#define TCP_ADD_REXMTSLOP(tp) ((tp->t_flags & TF_LOCAL) != 0 || tp->t_state >= TCPS_ESTABLISHED) + +#define TCPT_RANGESET(tv, value, tvmin, tvmax, addslop) do { \ + (tv) = ((addslop) ? tcp_rexmt_slop : 0) + (value); \ if ((uint32_t)(tv) < (uint32_t)(tvmin)) \ (tv) = (tvmin); \ else if ((uint32_t)(tv) > (uint32_t)(tvmax)) \ @@ -166,16 +250,15 @@ extern int tcp_keepinit; /* time to establish connection */ extern int tcp_keepidle; /* time before keepalive probes begin */ extern int tcp_keepintvl; /* time between keepalive probes */ extern int tcp_maxidle; /* time to drop after starting probes */ +extern int tcp_delack; /* delayed ack timer */ extern int tcp_maxpersistidle; extern int tcp_msl; extern int tcp_ttl; /* time to live for TCP segs */ extern int tcp_backoff[]; +extern int tcp_rexmt_slop; +extern u_int32_t tcp_max_persist_timeout; /* Maximum persistence for Zero Window Probes */ -void tcp_timer_2msl(void *xtp); -void tcp_timer_keep(void *xtp); -void tcp_timer_persist(void *xtp); -void tcp_timer_rexmt(void *xtp); -void tcp_timer_delack(void *xtp); +#define OFFSET_FROM_START(tp, off) ((tcp_now + (off)) - (tp)->tentry.timer_start) #endif /* KERNEL */ #endif /* PRIVATE */ 
diff --git a/bsd/netinet/tcp_usrreq.c b/bsd/netinet/tcp_usrreq.c index d477b60d5..d4fddb517 100644 --- a/bsd/netinet/tcp_usrreq.c +++ b/bsd/netinet/tcp_usrreq.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -76,6 +76,7 @@ #include #include +#include #include #include @@ -105,6 +106,11 @@ #include #endif /*IPSEC*/ +void tcp_fill_info(struct tcpcb *, struct tcp_info *); +errno_t tcp_fill_info_for_info_tuple(struct info_tuple *, struct tcp_info *); + +int tcp_sysctl_info(struct sysctl_oid *, void *, int , struct sysctl_req *); + /* * TCP protocol interface to socket abstraction. */ @@ -121,26 +127,26 @@ static struct tcpcb * tcp_usrclosed(struct tcpcb *); __private_extern__ int tcp_win_scale = 3; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, win_scale_factor, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, win_scale_factor, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_win_scale, 0, "Window scaling factor"); static u_int32_t tcps_in_sw_cksum; -SYSCTL_UINT(_net_inet_tcp, OID_AUTO, in_sw_cksum, CTLFLAG_RD, +SYSCTL_UINT(_net_inet_tcp, OID_AUTO, in_sw_cksum, CTLFLAG_RD | CTLFLAG_LOCKED, &tcps_in_sw_cksum, 0, "Number of received packets checksummed in software"); static u_int64_t tcps_in_sw_cksum_bytes; -SYSCTL_QUAD(_net_inet_tcp, OID_AUTO, in_sw_cksum_bytes, CTLFLAG_RD, +SYSCTL_QUAD(_net_inet_tcp, OID_AUTO, in_sw_cksum_bytes, CTLFLAG_RD | CTLFLAG_LOCKED, &tcps_in_sw_cksum_bytes, "Amount of received data checksummed in software"); static u_int32_t tcps_out_sw_cksum; -SYSCTL_UINT(_net_inet_tcp, OID_AUTO, out_sw_cksum, CTLFLAG_RD, +SYSCTL_UINT(_net_inet_tcp, OID_AUTO, out_sw_cksum, CTLFLAG_RD | CTLFLAG_LOCKED, &tcps_out_sw_cksum, 0, "Number of transmitted packets checksummed in software"); static u_int64_t tcps_out_sw_cksum_bytes; -SYSCTL_QUAD(_net_inet_tcp, OID_AUTO, out_sw_cksum_bytes, CTLFLAG_RD, +SYSCTL_QUAD(_net_inet_tcp, OID_AUTO, 
out_sw_cksum_bytes, CTLFLAG_RD | CTLFLAG_LOCKED, &tcps_out_sw_cksum_bytes, "Amount of transmitted data checksummed in software"); @@ -160,9 +166,13 @@ __private_extern__ unsigned int tcp_sockthreshold = 64; #else __private_extern__ unsigned int tcp_sockthreshold = 0; #endif -SYSCTL_INT(_net_inet_tcp, OID_AUTO, sockthreshold, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, sockthreshold, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_sockthreshold , 0, "TCP Socket size increased if less than threshold"); + +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, info, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, + 0 , 0, tcp_sysctl_info, "S", "TCP info per tuple"); + /* * TCP attaches to socket via pru_attach(), reserving space, * and an internet control block. @@ -186,7 +196,7 @@ tcp_usr_attach(struct socket *so, __unused int proto, struct proc *p) error = EISCONN; goto out; } - + error = tcp_attach(so, p); if (error) goto out; @@ -217,14 +227,15 @@ tcp_usr_detach(struct socket *so) if (inp == 0 || (inp->inp_state == INPCB_STATE_DEAD)) { return EINVAL; /* XXX */ } -#if 1 - lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); -#endif + lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); tp = intotcpcb(inp); /* In case we got disconnected from the peer */ if (tp == 0) goto out; TCPDEBUG1(); + + calculate_tcp_clock(); + tp = tcp_disconnect(tp); out: TCPDEBUG2(PRU_DETACH); @@ -238,6 +249,7 @@ tcp_usr_detach(struct socket *so) } \ tp = intotcpcb(inp); \ TCPDEBUG1(); \ + calculate_tcp_clock(); \ } while(0) #define COMMON_END(req) out: TCPDEBUG2(req); return error; goto out @@ -415,6 +427,8 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p) tp = intotcpcb(inp); TCPDEBUG1(); + calculate_tcp_clock(); + if (nam->sa_family != 0 && nam->sa_family != AF_INET) { error = EAFNOSUPPORT; goto out; @@ -505,9 +519,7 @@ tcp_usr_disconnect(struct socket *so) struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp; -#if 1 - 
lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); -#endif + lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); COMMON_START(); /* In case we got disconnected from the peer */ if (tp == 0) @@ -529,6 +541,8 @@ tcp_usr_accept(struct socket *so, struct sockaddr **nam) struct tcpcb *tp = NULL; TCPDEBUG0; + in_setpeeraddr(so, nam); + if (so->so_state & SS_ISDISCONNECTED) { error = ECONNABORTED; goto out; @@ -538,7 +552,9 @@ tcp_usr_accept(struct socket *so, struct sockaddr **nam) } tp = intotcpcb(inp); TCPDEBUG1(); - in_setpeeraddr(so, nam); + + calculate_tcp_clock(); + COMMON_END(PRU_ACCEPT); } @@ -560,6 +576,9 @@ tcp6_usr_accept(struct socket *so, struct sockaddr **nam) } tp = intotcpcb(inp); TCPDEBUG1(); + + calculate_tcp_clock(); + in6_mapped_peeraddr(so, nam); COMMON_END(PRU_ACCEPT); } @@ -681,6 +700,9 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, #endif /* INET6 */ tp = intotcpcb(inp); TCPDEBUG1(); + + calculate_tcp_clock(); + if (control) { /* TCP doesn't do control messages (rights, creds, etc) */ if (control->m_len) { @@ -878,10 +900,11 @@ tcp_connect(tp, nam, p) struct socket *so = inp->inp_socket; struct tcpcb *otp; struct sockaddr_in *sin = (struct sockaddr_in *)nam; - struct sockaddr_in *ifaddr; + struct sockaddr_in ifaddr; struct rmxp_tao *taop; struct rmxp_tao tao_noncached; int error; + unsigned int outif = 0; if (inp->inp_lport == 0) { error = in_pcbbind(inp, (struct sockaddr *)0, p); @@ -894,7 +917,7 @@ tcp_connect(tp, nam, p) * earlier incarnation of this same connection still in * TIME_WAIT state, creating an ADDRINUSE error. */ - error = in_pcbladdr(inp, nam, &ifaddr); + error = in_pcbladdr(inp, nam, &ifaddr, &outif); if (error) return error; @@ -902,7 +925,7 @@ tcp_connect(tp, nam, p) oinp = in_pcblookup_hash(inp->inp_pcbinfo, sin->sin_addr, sin->sin_port, inp->inp_laddr.s_addr != INADDR_ANY ? 
inp->inp_laddr - : ifaddr->sin_addr, + : ifaddr.sin_addr, inp->inp_lport, 0, NULL); tcp_lock(inp->inp_socket, 0, 0); @@ -917,7 +940,7 @@ tcp_connect(tp, nam, p) if (oinp != inp && (otp = intotcpcb(oinp)) != NULL && otp->t_state == TCPS_TIME_WAIT && - otp->t_starttime < (u_int32_t)tcp_msl && + ((int)(tcp_now - otp->t_starttime)) < tcp_msl && (otp->t_flags & TF_RCVD_CC)) otp = tcp_close(otp); else { @@ -930,7 +953,7 @@ tcp_connect(tp, nam, p) tcp_unlock(oinp->inp_socket, 1, 0); } skip_oinp: - if ((inp->inp_laddr.s_addr == INADDR_ANY ? ifaddr->sin_addr.s_addr : + if ((inp->inp_laddr.s_addr == INADDR_ANY ? ifaddr.sin_addr.s_addr : inp->inp_laddr.s_addr) == sin->sin_addr.s_addr && inp->inp_lport == sin->sin_port) return EINVAL; @@ -940,8 +963,10 @@ tcp_connect(tp, nam, p) lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx); socket_lock(inp->inp_socket, 0); } - if (inp->inp_laddr.s_addr == INADDR_ANY) - inp->inp_laddr = ifaddr->sin_addr; + if (inp->inp_laddr.s_addr == INADDR_ANY) { + inp->inp_laddr = ifaddr.sin_addr; + inp->inp_last_outif = outif; + } inp->inp_faddr = sin->sin_addr; inp->inp_fport = sin->sin_port; in_pcbrehash(inp); @@ -968,9 +993,12 @@ tcp_connect(tp, nam, p) soisconnecting(so); tcpstat.tcps_connattempt++; tp->t_state = TCPS_SYN_SENT; - tp->t_timer[TCPT_KEEP] = tp->t_keepinit ? tp->t_keepinit : tcp_keepinit; + tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, + tp->t_keepinit ? tp->t_keepinit : tcp_keepinit); tp->iss = tcp_new_isn(tp); tcp_sendseqinit(tp); + if (nstat_collect) + nstat_route_connect_attempt(inp->inp_route.ro_rt); /* * Generate a CC value for this connection and @@ -1008,6 +1036,7 @@ tcp6_connect(tp, nam, p) struct rmxp_tao *taop; struct rmxp_tao tao_noncached; int error; + unsigned int outif = 0; if (inp->inp_lport == 0) { error = in6_pcbbind(inp, (struct sockaddr *)0, p); @@ -1020,7 +1049,7 @@ tcp6_connect(tp, nam, p) * earlier incarnation of this same connection still in * TIME_WAIT state, creating an ADDRINUSE error. 
*/ - error = in6_pcbladdr(inp, nam, &addr6); + error = in6_pcbladdr(inp, nam, &addr6, &outif); if (error) return error; tcp_unlock(inp->inp_socket, 0, 0); @@ -1034,7 +1063,7 @@ tcp6_connect(tp, nam, p) if (oinp) { if (oinp != inp && (otp = intotcpcb(oinp)) != NULL && otp->t_state == TCPS_TIME_WAIT && - otp->t_starttime < (u_int32_t)tcp_msl && + ((int)(tcp_now - otp->t_starttime)) < tcp_msl && (otp->t_flags & TF_RCVD_CC)) otp = tcp_close(otp); else @@ -1046,8 +1075,10 @@ tcp6_connect(tp, nam, p) lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx); socket_lock(inp->inp_socket, 0); } - if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) + if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { inp->in6p_laddr = addr6; + inp->in6p_last_outif = outif; + } inp->in6p_faddr = sin6->sin6_addr; inp->inp_fport = sin6->sin6_port; if ((sin6->sin6_flowinfo & IPV6_FLOWINFO_MASK) != 0) @@ -1063,9 +1094,12 @@ tcp6_connect(tp, nam, p) soisconnecting(so); tcpstat.tcps_connattempt++; tp->t_state = TCPS_SYN_SENT; - tp->t_timer[TCPT_KEEP] = tp->t_keepinit ? tp->t_keepinit : tcp_keepinit; + tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, + tp->t_keepinit ? 
tp->t_keepinit : tcp_keepinit); tp->iss = tcp_new_isn(tp); tcp_sendseqinit(tp); + if (nstat_collect) + nstat_route_connect_attempt(inp->inp_route.ro_rt); /* * Generate a CC value for this connection and @@ -1089,6 +1123,132 @@ tcp6_connect(tp, nam, p) } #endif /* INET6 */ +/* + * Export TCP internal state information via a struct tcp_info + */ +__private_extern__ void +tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) +{ + bzero(ti, sizeof(*ti)); + + ti->tcpi_state = tp->t_state; + + if (tp->t_state > TCPS_LISTEN) { + if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP)) + ti->tcpi_options |= TCPI_OPT_TIMESTAMPS; + if (tp->t_flags & TF_SACK_PERMIT) + ti->tcpi_options |= TCPI_OPT_SACK; + if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) { + ti->tcpi_options |= TCPI_OPT_WSCALE; + ti->tcpi_snd_wscale = tp->snd_scale; + ti->tcpi_rcv_wscale = tp->rcv_scale; + } + + ti->tcpi_snd_mss = tp->t_maxseg; + ti->tcpi_rcv_mss = tp->t_maxseg; + + ti->tcpi_snd_ssthresh = tp->snd_ssthresh; + ti->tcpi_snd_cwnd = tp->snd_cwnd; + + ti->tcpi_rcv_space = tp->rcv_wnd; + + ti->tcpi_snd_wnd = tp->snd_wnd; + ti->tcpi_snd_bwnd = tp->snd_bwnd; + ti->tcpi_snd_nxt = tp->snd_nxt; + ti->tcpi_rcv_nxt = tp->rcv_nxt; + + ti->tcpi_last_outif = tp->t_inpcb->inp_last_outif; + } +} + +__private_extern__ errno_t +tcp_fill_info_for_info_tuple(struct info_tuple *itpl, struct tcp_info *ti) +{ + struct inpcbinfo *pcbinfo = NULL; + struct inpcb *inp = NULL; + struct socket *so; + struct tcpcb *tp; + + if (itpl->itpl_proto == IPPROTO_TCP) + pcbinfo = &tcbinfo; + else + return EINVAL; + + if (itpl->itpl_local_sa.sa_family == AF_INET && + itpl->itpl_remote_sa.sa_family == AF_INET) { + inp = in_pcblookup_hash(pcbinfo, + itpl->itpl_remote_sin.sin_addr, + itpl->itpl_remote_sin.sin_port, + itpl->itpl_local_sin.sin_addr, + itpl->itpl_local_sin.sin_port, + 0, NULL); + } else if (itpl->itpl_local_sa.sa_family == AF_INET6 && + itpl->itpl_remote_sa.sa_family == AF_INET6) { + struct 
in6_addr ina6_local; + struct in6_addr ina6_remote; + + ina6_local = itpl->itpl_local_sin6.sin6_addr; + if (IN6_IS_SCOPE_LINKLOCAL(&ina6_local) && itpl->itpl_local_sin6.sin6_scope_id) + ina6_local.s6_addr16[1] = htons(itpl->itpl_local_sin6.sin6_scope_id); + + ina6_remote = itpl->itpl_remote_sin6.sin6_addr; + if (IN6_IS_SCOPE_LINKLOCAL(&ina6_remote) && itpl->itpl_remote_sin6.sin6_scope_id) + ina6_remote.s6_addr16[1] = htons(itpl->itpl_remote_sin6.sin6_scope_id); + + inp = in6_pcblookup_hash(pcbinfo, + &ina6_remote, + itpl->itpl_remote_sin6.sin6_port, + &ina6_local, + itpl->itpl_local_sin6.sin6_port, + 0, NULL); + } else + return EINVAL; + if (inp == NULL || (so = inp->inp_socket) == NULL) + return ENOENT; + + socket_lock(so, 0); + if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { + socket_unlock(so, 0); + return ENOENT; + } + tp = intotcpcb(inp); + + tcp_fill_info(tp, ti); + socket_unlock(so, 0); + + return 0; +} + + +__private_extern__ int +tcp_sysctl_info(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + int error; + struct tcp_info ti; + struct info_tuple itpl; + + if (req->newptr == USER_ADDR_NULL) { + return EINVAL; + } + if (req->newlen < sizeof(struct info_tuple)) { + return EINVAL; + } + error = SYSCTL_IN(req, &itpl, sizeof(struct info_tuple)); + if (error != 0) { + return error; + } + error = tcp_fill_info_for_info_tuple(&itpl, &ti); + if (error != 0) { + return error; + } + error = SYSCTL_OUT(req, &ti, sizeof(struct tcp_info)); + if (error != 0) { + return error; + } + + return 0; +} + /* * The new sockopt interface makes it possible for us to block in the * copyin/out step (if we take a page fault). 
Taking a page fault at @@ -1124,6 +1284,8 @@ tcp_ctloutput(so, sopt) return (ECONNRESET); } + calculate_tcp_clock(); + switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { @@ -1155,7 +1317,17 @@ tcp_ctloutput(so, sopt) else tp->t_flags &= ~opt; break; - + case TCP_RXT_FINDROP: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + opt = TF_RXTFINDROP; + if (optval) + tp->t_flagsext |= opt; + else + tp->t_flagsext &= ~opt; + break; case TCP_MAXSEG: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); @@ -1178,7 +1350,9 @@ tcp_ctloutput(so, sopt) error = EINVAL; else { tp->t_keepidle = optval * TCP_RETRANSHZ; - tp->t_timer[TCPT_KEEP] = TCP_KEEPIDLE(tp); /* reset the timer to new value */ + tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, + TCP_KEEPIDLE(tp)); /* reset the timer to new value */ + tcp_check_timer_state(tp); } break; @@ -1193,6 +1367,26 @@ tcp_ctloutput(so, sopt) tp->t_keepinit = optval * TCP_RETRANSHZ; break; + case PERSIST_TIMEOUT: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + if (optval < 0) + error = EINVAL; + else + tp->t_persist_timeout = optval * TCP_RETRANSHZ; + break; + case TCP_RXT_CONNDROPTIME: + error = sooptcopyin(sopt, &optval, sizeof(optval), + sizeof(optval)); + if (error) + break; + if (optval < 0) + error = EINVAL; + else + tp->rxt_conndroptime = optval * TCP_RETRANSHZ; + break; default: error = ENOPROTOOPT; break; @@ -1219,6 +1413,22 @@ tcp_ctloutput(so, sopt) case TCP_CONNECTIONTIMEOUT: optval = tp->t_keepinit / TCP_RETRANSHZ; break; + case PERSIST_TIMEOUT: + optval = tp->t_persist_timeout / TCP_RETRANSHZ; + break; + case TCP_RXT_CONNDROPTIME: + optval = tp->rxt_conndroptime / TCP_RETRANSHZ; + break; + case TCP_RXT_FINDROP: + optval = tp->t_flagsext & TF_RXTFINDROP; + break; + case TCP_INFO: { + struct tcp_info ti; + + tcp_fill_info(tp, &ti); + error = sooptcopyout(sopt, &ti, sizeof(struct tcp_info)); + goto done; + } 
default: error = ENOPROTOOPT; break; @@ -1227,6 +1437,7 @@ tcp_ctloutput(so, sopt) error = sooptcopyout(sopt, &optval, sizeof optval); break; } +done: return (error); } @@ -1272,9 +1483,9 @@ sysctl_tcp_sospace(struct sysctl_oid *oidp, __unused void *arg1, return error; } -SYSCTL_PROC(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLTYPE_INT | CTLFLAG_RW, +SYSCTL_PROC(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_sendspace , 0, &sysctl_tcp_sospace, "IU", "Maximum outgoing TCP datagram size"); -SYSCTL_PROC(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLTYPE_INT | CTLFLAG_RW, +SYSCTL_PROC(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_recvspace , 0, &sysctl_tcp_sospace, "IU", "Maximum incoming TCP datagram size"); @@ -1353,6 +1564,9 @@ tcp_attach(so, p) so->so_state |= nofd; return (ENOBUFS); } + if (nstat_collect) { + nstat_tcp_new_pcb(inp); + } tp->t_state = TCPS_CLOSED; return (0); } @@ -1425,7 +1639,7 @@ tcp_usrclosed(tp) soisdisconnected(tp->t_inpcb->inp_socket); /* To prevent the connection hanging in FIN_WAIT_2 forever. */ if (tp->t_state == TCPS_FIN_WAIT_2) - tp->t_timer[TCPT_2MSL] = tcp_maxidle; + tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp, tcp_maxidle); } return (tp); } diff --git a/bsd/netinet/tcp_var.h b/bsd/netinet/tcp_var.h index 0fa518d78..4066829cb 100644 --- a/bsd/netinet/tcp_var.h +++ b/bsd/netinet/tcp_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -79,11 +79,57 @@ struct name { \ #define _TCPCB_LIST_HEAD(name, type) LIST_HEAD(name, type) #endif -#define TCP_RETRANSHZ 10 /* tcp retrans timer (100ms) per hz */ +#define TCP_RETRANSHZ 1000 /* granularity of TCP timestamps, 1ms */ +#define TCP_TIMERHZ 100 /* frequency of TCP fast timer, 100 ms */ + +/* Minimum time quantum within which the timers are coalesced */ +#define TCP_FASTTIMER_QUANTUM TCP_TIMERHZ /* fast mode, once every 100ms */ +#define TCP_SLOWTIMER_QUANTUM TCP_RETRANSHZ / PR_SLOWHZ /* slow mode, once every 500ms */ + +#define TCP_RETRANSHZ_TO_USEC 1000 #ifdef KERNEL_PRIVATE #define N_TIME_WAIT_SLOTS 128 /* must be power of 2 */ +/* Base RTT is stored for N_RTT_BASE slots. This is used to + * estimate expected minimum RTT for delay based congestion control + * algorithms. + */ +#define N_RTT_BASE 5 + +/* Always allow at least 4 packets worth of recv window when adjusting + * recv window using inter-packet arrival jitter. + */ +#define MIN_IAJ_WIN 4 + +/* A variation in delay of this many milliseconds is tolerable. This limit has to + * be low but greater than zero. We also use standard deviation on jitter to adjust + * this limit for different link and connection types. + */ +#define ALLOWED_IAJ 5 + +/* Ignore the first few packets on a connection until the ACK clock gets going + */ +#define IAJ_IGNORE_PKTCNT 40 + +/* Let the accumulated IAJ value increase by this threshold at most. This limit + * will control how many ALLOWED_IAJ measurements a receiver will have to see + * before opening the receive window + */ +#define ACC_IAJ_HIGH_THRESH 100 + +/* When accumulated IAJ reaches this value, the receiver starts to react by + * closing the window + */ +#define ACC_IAJ_REACT_LIMIT 200 + +/* If the number of small packets (smaller than IAJ packet size) seen on a + * connection is more than this threshold, reset the size and learn it again.
+ * This is needed because the sender might send smaller segments after PMTU + * discovery and the receiver has to learn the new size. + */ +#define RESET_IAJ_SIZE_THRESH 20 + /* * Kernel variables for tcp. */ @@ -133,9 +179,8 @@ struct tcptemp { struct tcpcb { struct tsegqe_head t_segq; int t_dupacks; /* consecutive dup acks recd */ - struct tcptemp *unused; /* unused now: was t_template */ - - int t_timer[TCPT_NTIMERS]; /* tcp timers */ + uint32_t t_timer[TCPT_NTIMERS]; /* tcp timers */ + struct tcptimerentry tentry; /* entry in timer list */ struct inpcb *t_inpcb; /* back pointer to internet pcb */ int t_state; /* state of this connection */ @@ -157,11 +202,9 @@ struct tcpcb { #define TF_RCVD_CC 0x04000 /* a CC was received in SYN */ #define TF_SENDCCNEW 0x08000 /* send CCnew instead of CC in SYN */ #define TF_MORETOCOME 0x10000 /* More data to be appended to sock */ -#define TF_LQ_OVERFLOW 0x20000 /* UNUSED listen queue overflow */ +#define TF_LOCAL 0x20000 /* connection to a host on local link */ #define TF_RXWIN0SENT 0x40000 /* sent a receiver win 0 in response */ #define TF_SLOWLINK 0x80000 /* route is a on a modem speed link */ - - #define TF_LASTIDLE 0x100000 /* connection was previously idle */ #define TF_FASTRECOVERY 0x200000 /* in NewReno Fast Recovery */ #define TF_WASFRECOVERY 0x400000 /* was in NewReno Fast Recovery */ @@ -172,6 +215,8 @@ struct tcpcb { #define TF_CLOSING 0x8000000 /* pending tcp close */ #define TF_TSO 0x10000000 /* TCP Segment Offloading is enable on this connection */ #define TF_BLACKHOLE 0x20000000 /* Path MTU Discovery Black Hole detection */ +#define TF_TIMER_ONLIST 0x40000000 /* pcb is on tcp_timer_list */ +#define TF_STRETCHACK 0x80000000 /* receiver is going to delay acks */ int t_force; /* 1 if forcing out a byte */ @@ -199,14 +244,14 @@ struct tcpcb { * for slow start exponential to * linear switch */ - u_int32_t snd_bandwidth; /* calculated bandwidth or 0 */ + u_int32_t snd_bandwidth; /* calculated bandwidth or 0 */ 
tcp_seq snd_recover; /* for use in NewReno Fast Recovery */ u_int t_maxopd; /* mss plus options */ - u_int32_t t_rcvtime; /* inactivity time */ - u_int32_t t_starttime; /* time connection was established */ - int t_rtttime; /* round trip time */ + u_int32_t t_rcvtime; /* time at which a packet was received */ + u_int32_t t_starttime; /* time connection was established */ + int t_rtttime; /* tcp clock when rtt calculation was started */ tcp_seq t_rtseq; /* sequence number being timed */ int t_bw_rtttime; /* used for bandwidth calculation */ @@ -220,7 +265,10 @@ struct tcpcb { int t_rxtshift; /* log(2) of rexmt exp. backoff */ u_int t_rttmin; /* minimum rtt allowed */ u_int t_rttbest; /* best rtt we've seen */ + u_int t_rttcur; /* most recent value of rtt */ u_int32_t t_rttupdated; /* number of times rtt sampled */ + u_int32_t rxt_conndroptime; /* retxmt conn gets dropped after this time, when set */ + u_int32_t rxt_start; /* time at a connection starts retransmitting */ u_int32_t max_sndwnd; /* largest window peer has offered */ int t_softerror; /* possible error not yet reported */ @@ -234,6 +282,7 @@ struct tcpcb { u_char rcv_scale; /* window scaling for recv window */ u_char request_r_scale; /* pending window scaling */ u_char requested_s_scale; + u_int16_t tcp_cc_index; /* index of congestion control algorithm */ u_int32_t ts_recent; /* timestamp echo data */ u_int32_t ts_recent_age; /* when last updated */ @@ -251,24 +300,36 @@ struct tcpcb { int t_keepidle; /* keepalive idle timer (override global if > 0) */ int t_lastchain; /* amount of packets chained last time around */ int t_unacksegs; /* received but unacked segments: used for delaying acks */ + u_int32_t t_persist_timeout; /* ZWP persistence limit as set by PERSIST_TIMEOUT */ + u_int32_t t_persist_stop; /* persistence limit deadline if triggered by ZWP */ /* 3529618 MSS overload prevention */ u_int32_t rcv_reset; u_int32_t rcv_pps; u_int32_t rcv_byps; - u_int32_t rcv_maxbyps; + u_int32_t rcv_maxbyps; + 
+/* Receiver state for stretch-ack algorithm */ + u_int32_t rcv_unackwin; /* to measure win for stretching acks */ + u_int32_t rcv_by_unackwin; /* bytes seen during the last ack-stretching win */ + u_int16_t rcv_waitforss; /* wait for packets during slow-start */ + u_int16_t ecn_flags; +#define TE_SETUPSENT 0x01 /* Indicate we have sent ECN-SETUP SYN or SYN-ACK */ +#define TE_SETUPRECEIVED 0x02 /* Indicate we have received ECN-SETUP SYN or SYN-ACK */ +#define TE_SENDIPECT 0x04 /* Indicate we haven't sent or received non-ECN-setup SYN or SYN-ACK */ +#define TE_SENDCWR 0x08 /* Indicate that the next non-retransmit should have the TCP CWR flag set */ +#define TE_SENDECE 0x10 /* Indicate that the next packet should have the TCP ECE flag set */ tcp_seq snd_high; /* for use in NewReno Fast Recovery */ tcp_seq snd_high_prev; /* snd_high prior to retransmit */ - tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */ u_char snd_limited; /* segments limited transmitted */ /* anti DoS counters */ u_int32_t rcv_second; /* start of interval second */ + /* SACK related state */ int sack_enable; /* enable SACK for this connection */ int snd_numholes; /* number of holes seen by sender */ - TAILQ_HEAD(sackhole_head, sackhole) snd_holes; /* SACK scoreboard (sorted) */ tcp_seq snd_fack; /* last seq number(+1) sack'd by rcv'r*/ @@ -277,18 +338,7 @@ struct tcpcb { tcp_seq sack_newdata; /* New data xmitted in this recovery episode starts at this seq number */ struct sackhint sackhint; /* SACK scoreboard hint */ - int t_rttlow; /* smallest observerved RTT */ - u_long ecn_flags; -#define TE_SETUPSENT 0x01 /* Indicate we have sent ECN-SETUP SYN or SYN-ACK */ -#define TE_SETUPRECEIVED 0x02 /* Indicate we have received ECN-SETUP SYN or SYN-ACK */ -#define TE_SENDIPECT 0x04 /* Indicate we haven't sent or received non-ECN-setup SYN or SYN-ACK */ -#define TE_SENDCWR 0x08 /* Indicate that the next non-retransmit should have the TCP CWR flag set */ -#define TE_SENDECE 0x10 /* Indicate 
that the next packet should have the TCP ECE flag set */ -#if TRAFFIC_MGT - u_int32_t tot_recv_snapshot; /* snapshot of global total pkts received */ - u_int32_t bg_recv_snapshot; /* snapshot of global background pkts received */ -#endif /* TRAFFIC_MGT */ u_int32_t t_pktlist_sentlen; /* total bytes in transmit chain */ struct mbuf *t_pktlist_head; /* First packet in transmit chain */ struct mbuf *t_pktlist_tail; /* Last packet in transmit chain */ @@ -296,12 +346,57 @@ struct tcpcb { int t_keepinit; /* connection timeout, i.e. idle time in SYN_SENT or SYN_RECV state */ u_int32_t tso_max_segment_size; /* TCP Segment Offloading maximum segment unit for NIC */ u_int t_pmtud_saved_maxopd; /* MSS saved before performing PMTU-D BlackHole detection */ + + struct + { + u_int32_t rxduplicatebytes; + u_int32_t rxoutoforderbytes; + u_int32_t txretransmitbytes; + u_int32_t unused_pad_to_8; + } t_stat; + + /* Background congestion related state */ + uint32_t rtt_hist[N_RTT_BASE]; /* history of minimum RTT */ + uint32_t rtt_count; /* Number of RTT samples in recent base history */ + uint32_t bg_ssthresh; /* Slow start threshold until delay increases */ + uint32_t t_flagsext; /* Another field to accommodate more flags */ +#define TF_RXTFINDROP 0x1 /* Drop conn after retransmitting FIN 3 times */ +#define TF_RCVUNACK_WAITSS 0x2 /* set when the receiver should not stretch acks */ + +#if TRAFFIC_MGT + /* Inter-arrival jitter related state */ + uint32_t iaj_rcv_ts; /* tcp clock when the first packet was received */ + uint16_t iaj_size; /* Size of packet for iaj measurement */ + uint16_t iaj_small_pkt; /* Count of packets smaller than iaj_size */ + uint16_t iaj_pktcnt; /* packet count, to avoid throttling initially */ + uint16_t acc_iaj; /* Accumulated iaj */ + tcp_seq iaj_rwintop; /* recent max advertised window */ + uint32_t avg_iaj; /* Mean */ + uint32_t std_dev_iaj; /* Standard deviation */ +#endif /* TRAFFIC_MGT */ }; #define IN_FASTRECOVERY(tp) (tp->t_flags & TF_FASTRECOVERY) 
#define ENTER_FASTRECOVERY(tp) tp->t_flags |= TF_FASTRECOVERY #define EXIT_FASTRECOVERY(tp) tp->t_flags &= ~TF_FASTRECOVERY +#if CONFIG_DTRACE +enum tcp_cc_event { + TCP_CC_CWND_INIT, + TCP_CC_INSEQ_ACK_RCVD, + TCP_CC_ACK_RCVD, + TCP_CC_ENTER_FASTRECOVERY, + TCP_CC_IN_FASTRECOVERY, + TCP_CC_EXIT_FASTRECOVERY, + TCP_CC_PARTIAL_ACK, + TCP_CC_IDLE_TIMEOUT, + TCP_CC_REXMT_TIMEOUT, + TCP_CC_ECN_RCVD, + TCP_CC_BAD_REXMT_RECOVERY, + TCP_CC_OUTPUT_ERROR, + TCP_CC_CHANGE_ALGO +}; +#endif /* CONFIG_DTRACE */ /* * Structure to hold TCP options that are only used during segment @@ -346,18 +441,19 @@ struct rmxp_tao { #define sototcpcb(so) (intotcpcb(sotoinpcb(so))) /* - * The smoothed round-trip time and estimated variance + * The rtt measured is in milliseconds as the timestamp granularity is + * a millisecond. The smoothed round-trip time and estimated variance * are stored as fixed point numbers scaled by the values below. * For convenience, these scales are also used in smoothing the average * (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed). - * With these scales, srtt has 3 bits to the right of the binary point, - * and thus an "ALPHA" of 0.875. rttvar has 2 bits to the right of the + * With these scales, srtt has 5 bits to the right of the binary point, + * and thus an "ALPHA" of 0.875. rttvar has 4 bits to the right of the * binary point, and is smoothed with an ALPHA of 0.75. */ #define TCP_RTT_SCALE 32 /* multiplier for srtt; 3 bits frac. */ -#define TCP_RTT_SHIFT 5 /* shift for srtt; 3 bits frac. */ -#define TCP_RTTVAR_SCALE 16 /* multiplier for rttvar; 2 bits */ -#define TCP_RTTVAR_SHIFT 4 /* shift for rttvar; 2 bits */ +#define TCP_RTT_SHIFT 5 /* shift for srtt; 5 bits frac. 
*/ +#define TCP_RTTVAR_SCALE 16 /* multiplier for rttvar; 4 bits */ +#define TCP_RTTVAR_SHIFT 4 /* shift for rttvar; 4 bits */ #define TCP_DELTA_SHIFT 2 /* see tcp_input.c */ /* @@ -399,7 +495,7 @@ struct tcpcb { int t_dupacks; /* consecutive dup acks recd */ u_int32_t unused; /* unused now: was t_template */ - int t_timer[TCPT_NTIMERS]; /* tcp timers */ + int t_timer[TCPT_NTIMERS_EXT]; /* tcp timers */ _TCPCB_PTR(struct inpcb *) t_inpcb; /* back pointer to internet pcb */ int t_state; /* state of this connection */ @@ -452,7 +548,7 @@ struct tcpcb { */ u_int t_maxopd; /* mss plus options */ - u_int32_t t_rcvtime; /* inactivity time */ + u_int32_t t_rcvtime; /* time at which a packet was received */ u_int32_t t_starttime; /* time connection was established */ int t_rtttime; /* round trip time */ tcp_seq t_rtseq; /* sequence number being timed */ @@ -595,9 +691,8 @@ struct tcpstat { u_int32_t tcps_sack_send_blocks; /* SACK blocks (options) sent */ u_int32_t tcps_sack_sboverflow; /* SACK sendblock overflow */ -#if TRAFFIC_MGT u_int32_t tcps_bg_rcvtotal; /* total background packets received */ -#endif /* TRAFFIC_MGT */ + u_int32_t tcps_rxtfindrop; /* drop conn after retransmitting FIN */ }; #pragma pack(4) @@ -633,7 +728,7 @@ struct xtcpcb64 { u_int64_t t_segq; int t_dupacks; /* consecutive dup acks recd */ - int t_timer[TCPT_NTIMERS]; /* tcp timers */ + int t_timer[TCPT_NTIMERS_EXT]; /* tcp timers */ int t_state; /* state of this connection */ u_int t_flags; @@ -665,7 +760,7 @@ struct xtcpcb64 { */ u_int t_maxopd; /* mss plus options */ - u_int32_t t_rcvtime; /* inactivity time */ + u_int32_t t_rcvtime; /* time at which a packet was received */ u_int32_t t_starttime; /* time connection was established */ int t_rtttime; /* round trip time */ tcp_seq t_rtseq; /* sequence number being timed */ @@ -707,6 +802,87 @@ struct xtcpcb64 { #endif /* !CONFIG_EMBEDDED */ +#ifdef PRIVATE + +struct xtcpcb_n { + u_int32_t xt_len; + u_int32_t xt_kind; /* XSO_TCPCB */ + + u_int64_t 
t_segq; + int t_dupacks; /* consecutive dup acks recd */ + + int t_timer[TCPT_NTIMERS_EXT]; /* tcp timers */ + + int t_state; /* state of this connection */ + u_int t_flags; + + int t_force; /* 1 if forcing out a byte */ + + tcp_seq snd_una; /* send unacknowledged */ + tcp_seq snd_max; /* highest sequence number sent; + * used to recognize retransmits + */ + tcp_seq snd_nxt; /* send next */ + tcp_seq snd_up; /* send urgent pointer */ + + tcp_seq snd_wl1; /* window update seg seq number */ + tcp_seq snd_wl2; /* window update seg ack number */ + tcp_seq iss; /* initial send sequence number */ + tcp_seq irs; /* initial receive sequence number */ + + tcp_seq rcv_nxt; /* receive next */ + tcp_seq rcv_adv; /* advertised window */ + u_int32_t rcv_wnd; /* receive window */ + tcp_seq rcv_up; /* receive urgent pointer */ + + u_int32_t snd_wnd; /* send window */ + u_int32_t snd_cwnd; /* congestion-controlled window */ + u_int32_t snd_ssthresh; /* snd_cwnd size threshold for + * slow start exponential to + * linear switch + */ + u_int t_maxopd; /* mss plus options */ + + u_int32_t t_rcvtime; /* time at which a packet was received */ + u_int32_t t_starttime; /* time connection was established */ + int t_rtttime; /* round trip time */ + tcp_seq t_rtseq; /* sequence number being timed */ + + int t_rxtcur; /* current retransmit value (ticks) */ + u_int t_maxseg; /* maximum segment size */ + int t_srtt; /* smoothed round-trip time */ + int t_rttvar; /* variance in round-trip time */ + + int t_rxtshift; /* log(2) of rexmt exp.
backoff */ + u_int t_rttmin; /* minimum rtt allowed */ + u_int32_t t_rttupdated; /* number of times rtt sampled */ + u_int32_t max_sndwnd; /* largest window peer has offered */ + + int t_softerror; /* possible error not yet reported */ + /* out-of-band data */ + char t_oobflags; /* have some */ + char t_iobc; /* input character */ + /* RFC 1323 variables */ + u_char snd_scale; /* window scaling for send window */ + u_char rcv_scale; /* window scaling for recv window */ + u_char request_r_scale; /* pending window scaling */ + u_char requested_s_scale; + u_int32_t ts_recent; /* timestamp echo data */ + + u_int32_t ts_recent_age; /* when last updated */ + tcp_seq last_ack_sent; + /* RFC 1644 variables */ + tcp_cc cc_send; /* send connection count */ + tcp_cc cc_recv; /* receive connection count */ + tcp_seq snd_recover; /* for use in fast recovery */ + /* experimental */ + u_int32_t snd_cwnd_prev; /* cwnd prior to retransmit */ + u_int32_t snd_ssthresh_prev; /* ssthresh prior to retransmit */ + u_int32_t t_badrxtwin; /* window for retransmit recovery */ +}; + +#endif /* PRIVATE */ + #pragma pack() /* @@ -760,11 +936,14 @@ extern struct tcpstat tcpstat; /* tcp statistics */ extern int tcp_mssdflt; /* XXX */ extern int tcp_minmss; extern int tcp_minmssoverload; -extern int tcp_do_newreno; extern int ss_fltsz; extern int ss_fltsz_local; +extern int tcp_do_rfc3390; /* Calculate ss_fltsz according to RFC 3390 */ #ifdef __APPLE__ extern u_int32_t tcp_now; /* for RFC 1323 timestamps */ +extern struct timeval tcp_uptime; +extern lck_spin_t *tcp_uptime_lock; + extern int tcp_delack_enabled; #endif /* __APPLE__ */ @@ -782,7 +961,6 @@ int tcp_ctloutput(struct socket *, struct sockopt *); struct tcpcb * tcp_drop(struct tcpcb *, int); void tcp_drain(void); -void tcp_fasttimo(void *); struct rmxp_tao * tcp_gettaocache(struct inpcb *); void tcp_init(void) __attribute__((section("__TEXT, initcode"))); @@ -796,10 +974,13 @@ struct tcpcb * int tcp_output(struct tcpcb *); void 
tcp_respond(struct tcpcb *, void *, struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int, - unsigned int); + unsigned int, unsigned int); struct rtentry *tcp_rtlookup(struct inpcb *, unsigned int); void tcp_setpersist(struct tcpcb *); void tcp_slowtimo(void); +void tcp_check_timer_state(struct tcpcb *tp); +void tcp_run_timerlist(void *arg1, void *arg2); + struct tcptemp * tcp_maketemplate(struct tcpcb *); void tcp_fillheaders(struct tcpcb *, void *, void *); @@ -816,10 +997,16 @@ void tcp_sack_partialack(struct tcpcb *, struct tcphdr *); void tcp_free_sackholes(struct tcpcb *tp); int32_t tcp_sbspace(struct tcpcb *tp); void tcp_set_tso(struct tcpcb *tp, struct ifnet *ifp); +void tcp_reset_stretch_ack(struct tcpcb *tp); +#if TRAFFIC_MGT +void reset_acc_iaj(struct tcpcb *tp); +#endif /* TRAFFIC_MGT */ int tcp_lock (struct socket *, int, void *); int tcp_unlock (struct socket *, int, void *); +void calculate_tcp_clock(void); + #ifdef _KERN_LOCKS_H_ lck_mtx_t * tcp_getlock (struct socket *, int); #else diff --git a/bsd/netinet/udp_usrreq.c b/bsd/netinet/udp_usrreq.c index 500cdcc90..37cc4153c 100644 --- a/bsd/netinet/udp_usrreq.c +++ b/bsd/netinet/udp_usrreq.c @@ -72,6 +72,8 @@ #include #include #include +#include +#include #include @@ -121,35 +123,35 @@ static int udpcksum = 1; #else static int udpcksum = 0; /* XXX */ #endif -SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW, +SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW | CTLFLAG_LOCKED, &udpcksum, 0, ""); static u_int32_t udps_in_sw_cksum; -SYSCTL_UINT(_net_inet_udp, OID_AUTO, in_sw_cksum, CTLFLAG_RD, +SYSCTL_UINT(_net_inet_udp, OID_AUTO, in_sw_cksum, CTLFLAG_RD | CTLFLAG_LOCKED, &udps_in_sw_cksum, 0, "Number of received packets checksummed in software"); static u_int64_t udps_in_sw_cksum_bytes; -SYSCTL_QUAD(_net_inet_udp, OID_AUTO, in_sw_cksum_bytes, CTLFLAG_RD, +SYSCTL_QUAD(_net_inet_udp, OID_AUTO, in_sw_cksum_bytes, CTLFLAG_RD | CTLFLAG_LOCKED, &udps_in_sw_cksum_bytes, "Amount of 
received data checksummed in software"); static u_int32_t udps_out_sw_cksum; -SYSCTL_UINT(_net_inet_udp, OID_AUTO, out_sw_cksum, CTLFLAG_RD, +SYSCTL_UINT(_net_inet_udp, OID_AUTO, out_sw_cksum, CTLFLAG_RD | CTLFLAG_LOCKED, &udps_out_sw_cksum, 0, "Number of transmitted packets checksummed in software"); static u_int64_t udps_out_sw_cksum_bytes; -SYSCTL_QUAD(_net_inet_udp, OID_AUTO, out_sw_cksum_bytes, CTLFLAG_RD, +SYSCTL_QUAD(_net_inet_udp, OID_AUTO, out_sw_cksum_bytes, CTLFLAG_RD | CTLFLAG_LOCKED, &udps_out_sw_cksum_bytes, "Amount of transmitted data checksummed in software"); int log_in_vain = 0; -SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW, +SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW | CTLFLAG_LOCKED, &log_in_vain, 0, "Log all incoming UDP packets"); static int blackhole = 0; -SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW, +SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW | CTLFLAG_LOCKED, &blackhole, 0, "Do not send port unreachables for refused connects"); struct inpcbhead udb; /* from udp_var.h */ @@ -179,13 +181,13 @@ static int udp_gc_done = FALSE; /* Garbage collection performed last slowtimo */ #endif struct udpstat udpstat; /* from udp_var.h */ -SYSCTL_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RD, +SYSCTL_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RD | CTLFLAG_LOCKED, &udpstat, udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)"); -SYSCTL_INT(_net_inet_udp, OID_AUTO, pcbcount, CTLFLAG_RD, +SYSCTL_INT(_net_inet_udp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED, &udbinfo.ipi_count, 0, "Number of active PCBs"); __private_extern__ int udp_use_randomport = 1; -SYSCTL_INT(_net_inet_udp, OID_AUTO, randomize_ports, CTLFLAG_RW, +SYSCTL_INT(_net_inet_udp, OID_AUTO, randomize_ports, CTLFLAG_RW | CTLFLAG_LOCKED, &udp_use_randomport, 0, "Randomize UDP port numbers"); #if INET6 @@ -254,13 +256,15 @@ udp_input(m, iphlen) register struct udphdr *uh; register struct inpcb *inp; struct mbuf 
*opts = 0; - int len; + int len, isbroadcast; struct ip save_ip; struct sockaddr *append_sa; struct inpcbinfo *pcbinfo = &udbinfo; struct sockaddr_in udp_in = { sizeof (udp_in), AF_INET, 0, { 0 }, { 0, 0, 0, 0, 0, 0, 0, 0 } }; + struct ip_moptions *imo = NULL; + int foundmembership = 0, ret = 0; #if INET6 struct udp_in6 udp_in6 = { { sizeof (udp_in6.uin6_sin), AF_INET6, 0, 0, @@ -365,8 +369,9 @@ udp_input(m, iphlen) udpstat.udps_nosum++; #endif - if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || - in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { + isbroadcast = in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif); + + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || isbroadcast) { int reuse_sock = 0, mcast_delivered = 0; @@ -409,6 +414,11 @@ udp_input(m, iphlen) if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif + if ((inp->inp_moptions == NULL) && + (ntohl(ip->ip_dst.s_addr) != INADDR_ALLHOSTS_GROUP) && + (isbroadcast == 0) ) + continue; + if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) { continue; @@ -441,6 +451,35 @@ udp_input(m, iphlen) } } + if (isbroadcast == 0 && (ntohl(ip->ip_dst.s_addr) != INADDR_ALLHOSTS_GROUP)) { + if((imo = inp->inp_moptions) == NULL) { + udp_unlock(inp->inp_socket, 1, 0); + continue; + } else { + struct sockaddr_in group; + int blocked; + + IMO_LOCK(imo); + + bzero(&group, sizeof(struct sockaddr_in)); + group.sin_len = sizeof(struct sockaddr_in); + group.sin_family = AF_INET; + group.sin_addr = ip->ip_dst; + + blocked = imo_multi_filter(imo, m->m_pkthdr.rcvif, + (struct sockaddr *)&group, + (struct sockaddr *)&udp_in); + if (blocked == MCAST_PASS) + foundmembership = 1; + + IMO_UNLOCK(imo); + if (!foundmembership) { + udp_unlock(inp->inp_socket, 1, 0); + continue; + } + foundmembership = 0; + } + } reuse_sock = inp->inp_socket->so_options& (SO_REUSEPORT|SO_REUSEADDR); { #if IPSEC @@ -537,21 +576,9 @@ udp_input(m, iphlen) } else if (payload_len == 4 && *(u_int32_t*)((caddr_t)uh + sizeof(struct udphdr)) != 0) { /* UDP encapsulated IPSec 
packet to pass through NAT */ - size_t stripsiz; - - stripsiz = sizeof(struct udphdr); - - ip = mtod(m, struct ip *); - ovbcopy((caddr_t)ip, (caddr_t)(((u_char *)ip) + stripsiz), iphlen); - m->m_data += stripsiz; - m->m_len -= stripsiz; - m->m_pkthdr.len -= stripsiz; - ip = mtod(m, struct ip *); - ip->ip_len = ip->ip_len - stripsiz; - ip->ip_p = IPPROTO_ESP; - KERNEL_DEBUG(DBG_FNC_UDP_INPUT | DBG_FUNC_END, 0,0,0,0,0); - esp4_input(m, iphlen); + /* preserve the udp header */ + esp4_input(m, iphlen + sizeof(struct udphdr)); return; } } @@ -624,8 +651,9 @@ udp_input(m, iphlen) */ udp_in.sin_port = uh->uh_sport; udp_in.sin_addr = ip->ip_src; - if (inp->inp_flags & INP_CONTROLOPTS - || inp->inp_socket->so_options & SO_TIMESTAMP) { + if ((inp->inp_flags & INP_CONTROLOPTS) != 0 + || (inp->inp_socket->so_options & SO_TIMESTAMP) != 0 + || (inp->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { #if INET6 if (inp->inp_vflag & INP_IPV6) { int savedflags; @@ -633,11 +661,17 @@ udp_input(m, iphlen) ip_2_ip6_hdr(&udp_ip6.uip6_ip6, ip); savedflags = inp->inp_flags; inp->inp_flags &= ~INP_UNMAPPABLEOPTS; - ip6_savecontrol(inp, &opts, &udp_ip6.uip6_ip6, m); + ret = ip6_savecontrol(inp, m, &opts); inp->inp_flags = savedflags; } else #endif - ip_savecontrol(inp, &opts, ip, m); + { + ret = ip_savecontrol(inp, &opts, ip, m); + } + if (ret != 0) { + udp_unlock(inp->inp_socket, 1, 0); + goto bad; + } } m_adj(m, iphlen + sizeof(struct udphdr)); @@ -651,10 +685,14 @@ udp_input(m, iphlen) } else #endif append_sa = (struct sockaddr *)&udp_in; + if (nstat_collect) { + locked_add_64(&inp->inp_stat->rxpackets, 1); + locked_add_64(&inp->inp_stat->rxbytes, m->m_pkthdr.len); + } + so_recv_data_stat(inp->inp_socket, m, 0); if (sbappendaddr(&inp->inp_socket->so_rcv, append_sa, m, opts, NULL) == 0) { udpstat.udps_fullsock++; - } - else { + } else { sorwakeup(inp->inp_socket); } udp_unlock(inp->inp_socket, 1, 0); @@ -702,6 +740,7 @@ udp_append(struct inpcb *last, struct ip *ip, struct mbuf *n, 
int off, { struct sockaddr *append_sa; struct mbuf *opts = 0; + int ret = 0; #if CONFIG_MACF_NET if (mac_inpcb_check_deliver(last, n, AF_INET, SOCK_DGRAM) != 0) { @@ -709,8 +748,9 @@ udp_append(struct inpcb *last, struct ip *ip, struct mbuf *n, int off, return; } #endif - if (last->inp_flags & INP_CONTROLOPTS || - last->inp_socket->so_options & SO_TIMESTAMP) { + if ((last->inp_flags & INP_CONTROLOPTS) != 0 || + (last->inp_socket->so_options & SO_TIMESTAMP) != 0 || + (last->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { #if INET6 if (last->inp_vflag & INP_IPV6) { int savedflags; @@ -721,11 +761,20 @@ udp_append(struct inpcb *last, struct ip *ip, struct mbuf *n, int off, } savedflags = last->inp_flags; last->inp_flags &= ~INP_UNMAPPABLEOPTS; - ip6_savecontrol(last, &opts, &pudp_ip6->uip6_ip6, n); + ret = ip6_savecontrol(last, n, &opts); + if (ret != 0) { + last->inp_flags = savedflags; + goto error; + } last->inp_flags = savedflags; } else #endif - ip_savecontrol(last, &opts, ip, n); + { + ret = ip_savecontrol(last, &opts, ip, n); + if (ret != 0) { + goto error; + } + } } #if INET6 if (last->inp_vflag & INP_IPV6) { @@ -737,11 +786,22 @@ udp_append(struct inpcb *last, struct ip *ip, struct mbuf *n, int off, } else #endif append_sa = (struct sockaddr *)pudp_in; + if (nstat_collect) { + locked_add_64(&last->inp_stat->rxpackets, 1); + locked_add_64(&last->inp_stat->rxbytes, n->m_pkthdr.len); + } + so_recv_data_stat(last->inp_socket, n, 0); m_adj(n, off); if (sbappendaddr(&last->inp_socket->so_rcv, append_sa, n, opts, NULL) == 0) { udpstat.udps_fullsock++; - } else + } else { sorwakeup(last->inp_socket); + } + return; +error: + m_freem(n); + m_freem(opts); + return; } /* @@ -952,7 +1012,7 @@ udp_pcblist SYSCTL_HANDLER_ARGS return error; } -SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, +SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, udp_pcblist, "S,xinpcb", "List of active UDP sockets"); #if 
!CONFIG_EMBEDDED @@ -1055,11 +1115,27 @@ udp_pcblist64 SYSCTL_HANDLER_ARGS return error; } -SYSCTL_PROC(_net_inet_udp, OID_AUTO, pcblist64, CTLFLAG_RD, 0, 0, +SYSCTL_PROC(_net_inet_udp, OID_AUTO, pcblist64, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, udp_pcblist64, "S,xinpcb64", "List of active UDP sockets"); #endif /* !CONFIG_EMBEDDED */ +static int +udp_pcblist_n SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + int error = 0; + + error = get_pcblist_n(IPPROTO_UDP, req, &udbinfo); + + return error; +} + + +SYSCTL_PROC(_net_inet_udp, OID_AUTO, pcblist_n, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, + udp_pcblist_n, "S,xinpcb_n", "List of active UDP sockets"); + + static __inline__ u_int16_t get_socket_id(struct socket * s) { @@ -1075,6 +1151,69 @@ get_socket_id(struct socket * s) return (val); } +static int +udp_check_pktinfo(struct mbuf *control, unsigned int *ifindex, struct in_addr *laddr) +{ + struct cmsghdr *cm = 0; + struct in_pktinfo *pktinfo; + struct ifnet *ifp; + + /* + * XXX: Currently, we assume all the optional information is stored + * in a single mbuf. 
+ */ + if (control->m_next) + return (EINVAL); + + if (control->m_len < CMSG_LEN(0)) + return (EINVAL); + + for (cm = M_FIRST_CMSGHDR(control); cm; cm = M_NXT_CMSGHDR(control, cm)) { + if (cm->cmsg_len < sizeof(struct cmsghdr) || cm->cmsg_len > control->m_len) + return (EINVAL); + + if (cm->cmsg_level != IPPROTO_IP || cm->cmsg_type != IP_PKTINFO) + continue; + + if (cm->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo))) + return (EINVAL); + + pktinfo = (struct in_pktinfo *)CMSG_DATA(cm); + + /* Check for a valid ifindex in pktinfo */ + ifnet_head_lock_shared(); + + if (pktinfo->ipi_ifindex > if_index) { + ifnet_head_done(); + return (ENXIO); + } + + /* If ipi_ifindex is specified it takes precedence over ipi_spec_dst */ + + if (pktinfo->ipi_ifindex) { + ifp = ifindex2ifnet[pktinfo->ipi_ifindex]; + if (ifp == NULL) { + ifnet_head_done(); + return (ENXIO); + } + + ifnet_head_done(); + + *ifindex = pktinfo->ipi_ifindex; + laddr->s_addr = INADDR_ANY; + break; + } + + ifnet_head_done(); + + /* Use the provided ipi_spec_dst address for temp source address */ + *ifindex = 0; + *laddr = pktinfo->ipi_spec_dst; + break; + } + return (0); +} + static int udp_output(inp, m, addr, control, p) register struct inpcb *inp; @@ -1086,28 +1225,34 @@ udp_output(inp, m, addr, control, p) register struct udpiphdr *ui; register int len = m->m_pkthdr.len; struct sockaddr_in *sin; - struct in_addr origladdr, laddr, faddr; + struct in_addr origladdr, laddr, faddr, pi_laddr; u_short lport, fport; - struct sockaddr_in *ifaddr; - int error = 0, udp_dodisconnect = 0; + struct sockaddr_in ifaddr; + int error = 0, udp_dodisconnect = 0, pktinfo = 0; struct socket *so = inp->inp_socket; int soopts = 0; struct mbuf *inpopts; struct ip_moptions *mopts; struct route ro; - struct ip_out_args ipoa; -#if PKT_PRIORITY - mbuf_traffic_class_t mtc = MBUF_TC_NONE; -#endif /* PKT_PRIORITY */ + struct ip_out_args ipoa = { IFSCOPE_NONE, 0 }; + mbuf_traffic_class_t mtc = MBUF_TC_UNSPEC; + unsigned int origoutif; + 
+ pi_laddr.s_addr = INADDR_ANY; KERNEL_DEBUG(DBG_FNC_UDP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); if (control != NULL) { -#if PKT_PRIORITY mtc = mbuf_traffic_class_from_control(control); -#endif /* PKT_PRIORITY */ + + error = udp_check_pktinfo(control, &ipoa.ipoa_boundif, &pi_laddr); + m_freem(control); + if (error) + goto release; + pktinfo++; } + KERNEL_DEBUG(DBG_LAYER_OUT_BEG, inp->inp_fport, inp->inp_lport, inp->inp_laddr.s_addr, inp->inp_faddr.s_addr, (htons((u_short)len + sizeof (struct udphdr)))); @@ -1117,11 +1262,16 @@ udp_output(inp, m, addr, control, p) goto release; } - lck_mtx_assert(inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED); - /* If socket was bound to an ifindex, tell ip_output about it */ - ipoa.ipoa_ifscope = (inp->inp_flags & INP_BOUND_IF) ? - inp->inp_boundif : IFSCOPE_NONE; + /* + * If socket was bound to an ifindex, tell ip_output about it. + * If the ancillary IP_PKTINFO option contains an interface index, + * it takes precedence over the one specified by IP_BOUND_IF. + */ + if (ipoa.ipoa_boundif == IFSCOPE_NONE && (inp->inp_flags & INP_BOUND_IF)) + ipoa.ipoa_boundif = inp->inp_boundif; + ipoa.ipoa_nocell = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; soopts |= IP_OUTARGS; /* If there was a routing change, discard cached route and check @@ -1134,22 +1284,45 @@ udp_output(inp, m, addr, control, p) /* src address is gone? 
*/ if ((ia = ifa_foraddr(inp->inp_laddr.s_addr)) == NULL) { - if (inp->inp_flags & INP_INADDR_ANY) { - /* new src will be set later */ - inp->inp_laddr.s_addr = INADDR_ANY; - } else { + if (((inp->inp_flags & INP_INADDR_ANY) == 0) || (so->so_state & SS_ISCONNECTED)) { + /* Rdar://5448998 + * If the source address is gone, return an error if: + * - the source was specified + * - the socket was already connected + */ error = EADDRNOTAVAIL; goto release; + } else { + /* new src will be set later */ + inp->inp_laddr.s_addr = INADDR_ANY; + inp->inp_last_outif = 0; } } if (ia != NULL) - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); if (inp->inp_route.ro_rt != NULL) rtfree(inp->inp_route.ro_rt); inp->inp_route.ro_rt = NULL; } - origladdr= laddr = inp->inp_laddr; + origoutif = inp->inp_last_outif; + + /* IP_PKTINFO option check. + * If a temporary scope or src address is provided, use it for this packet only + * and make sure we forget it after sending this datagram. + */ + + if (pi_laddr.s_addr != INADDR_ANY || + (ipoa.ipoa_boundif != IFSCOPE_NONE && pktinfo)) { + laddr = pi_laddr; /* temp src address for this datagram only */ + origladdr.s_addr = INADDR_ANY; + udp_dodisconnect = 1; /* we don't want to keep the laddr or route */ + inp->inp_flags |= INP_INADDR_ANY; /* remember we don't care about src addr.*/ + } else { + origladdr = laddr = inp->inp_laddr; + } + + origoutif = inp->inp_last_outif; faddr = inp->inp_faddr; lport = inp->inp_lport; fport = inp->inp_fport; @@ -1165,8 +1338,11 @@ udp_output(inp, m, addr, control, p) * In case we don't have a local port set, go through the full connect. * We don't have a local port yet (ie, we can't be looked up), * so it's not an issue if the input runs at the same time we do this. 
- */ - error = in_pcbconnect(inp, addr, p); + */ + + if (pi_laddr.s_addr != INADDR_ANY) /* if we have a source address specified, use that */ + inp->inp_laddr = pi_laddr; + error = in_pcbconnect(inp, addr, p, &ipoa.ipoa_boundif); /* if a scope is specified, use it */ if (error) { goto release; } @@ -1176,19 +1352,21 @@ udp_output(inp, m, addr, control, p) fport = inp->inp_fport; udp_dodisconnect = 1; } - else { + else { /* Fast path case * we have a full address and a local port. * use those info to build the packet without changing the pcb * and interfering with the input path. See 3851370 + * Note: if we may have a scope from IP_PKTINFO but the + * priority is always given to the scope provided by INP_BOUND_IF. */ if (laddr.s_addr == INADDR_ANY) { - if ((error = in_pcbladdr(inp, addr, &ifaddr)) != 0) + if ((error = in_pcbladdr(inp, addr, &ifaddr, &ipoa.ipoa_boundif)) != 0) goto release; - laddr = ifaddr->sin_addr; + laddr = ifaddr.sin_addr; inp->inp_flags |= INP_INADDR_ANY; /* from pcbconnect: remember we don't care about src addr.*/ } - + faddr = sin->sin_addr; fport = sin->sin_port; } @@ -1256,68 +1434,63 @@ udp_output(inp, m, addr, control, p) inpopts = inp->inp_options; soopts |= (inp->inp_socket->so_options & (SO_DONTROUTE | SO_BROADCAST)); mopts = inp->inp_moptions; + if (mopts != NULL) + IMO_ADDREF(mopts); /* Copy the cached route and take an extra reference */ inp_route_copyout(inp, &ro); -#if PKT_PRIORITY - set_traffic_class(m, so, mtc); -#endif /* PKT_PRIORITY */ + set_packet_tclass(m, so, mtc, 0); socket_unlock(so, 0); - /* XXX jgraessley please look at XXX */ error = ip_output_list(m, 0, inpopts, &ro, soopts, mopts, &ipoa); + m = NULL; socket_lock(so, 0); + if (mopts != NULL) + IMO_REMREF(mopts); + if (error == 0 && nstat_collect) { + locked_add_64(&inp->inp_stat->txpackets, 1); + locked_add_64(&inp->inp_stat->txbytes, len); + } /* Synchronize PCB cached route */ inp_route_copyin(inp, &ro); +abort: if (udp_dodisconnect) { -#if IFNET_ROUTE_REFCNT /* 
Always discard the cached route for unconnected socket */ if (inp->inp_route.ro_rt != NULL) { rtfree(inp->inp_route.ro_rt); inp->inp_route.ro_rt = NULL; } -#endif /* IFNET_ROUTE_REFCNT */ in_pcbdisconnect(inp); inp->inp_laddr = origladdr; /* XXX rehash? */ - } -#if IFNET_ROUTE_REFCNT - else if (inp->inp_route.ro_rt != NULL && - (inp->inp_route.ro_rt->rt_flags & (RTF_MULTICAST|RTF_BROADCAST))) { - /* Always discard non-unicast cached route */ - rtfree(inp->inp_route.ro_rt); - inp->inp_route.ro_rt = NULL; - } -#endif /* IFNET_ROUTE_REFCNT */ + inp->inp_last_outif = origoutif; + } else if (inp->inp_route.ro_rt != NULL) { + struct rtentry *rt = inp->inp_route.ro_rt; + unsigned int outif; - KERNEL_DEBUG(DBG_FNC_UDP_OUTPUT | DBG_FUNC_END, error, 0,0,0,0); - return (error); - -abort: - if (udp_dodisconnect) { -#if IFNET_ROUTE_REFCNT - /* Always discard the cached route for unconnected socket */ - if (inp->inp_route.ro_rt != NULL) { + if (rt->rt_flags & (RTF_MULTICAST|RTF_BROADCAST)) + rt = NULL; /* unusable */ + /* + * Always discard if it is a multicast or broadcast route. + */ + if (rt == NULL) { rtfree(inp->inp_route.ro_rt); inp->inp_route.ro_rt = NULL; } -#endif /* IFNET_ROUTE_REFCNT */ - in_pcbdisconnect(inp); - inp->inp_laddr = origladdr; /* XXX rehash? */ - } -#if IFNET_ROUTE_REFCNT - else if (inp->inp_route.ro_rt != NULL && - (inp->inp_route.ro_rt->rt_flags & (RTF_MULTICAST|RTF_BROADCAST))) { - /* Always discard non-unicast cached route */ - rtfree(inp->inp_route.ro_rt); - inp->inp_route.ro_rt = NULL; + /* + * If the destination route is unicast, update outif with + * that of the route interface index used by IP. 
+ */ + if (rt != NULL && + (outif = rt->rt_ifp->if_index) != inp->inp_last_outif) + inp->inp_last_outif = outif; } -#endif /* IFNET_ROUTE_REFCNT */ release: - m_freem(m); + if (m != NULL) + m_freem(m); KERNEL_DEBUG(DBG_FNC_UDP_OUTPUT | DBG_FUNC_END, error, 0,0,0,0); return (error); } @@ -1362,10 +1535,10 @@ sysctl_udp_sospace(struct sysctl_oid *oidp, __unused void *arg1, return error; } -SYSCTL_PROC(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLTYPE_INT | CTLFLAG_RW, +SYSCTL_PROC(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &udp_recvspace, 0, &sysctl_udp_sospace, "IU", "Maximum incoming UDP datagram size"); -SYSCTL_PROC(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLTYPE_INT | CTLFLAG_RW, +SYSCTL_PROC(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &udp_sendspace, 0, &sysctl_udp_sospace, "IU", "Maximum outgoing UDP datagram size"); static int @@ -1400,6 +1573,7 @@ udp_attach(struct socket *so, __unused int proto, struct proc *p) inp = (struct inpcb *)so->so_pcb; inp->inp_vflag |= INP_IPV4; inp->inp_ip_ttl = ip_defttl; + nstat_udp_new_pcb(inp); return 0; } @@ -1431,7 +1605,7 @@ udp_connect(struct socket *so, struct sockaddr *nam, struct proc *p) return EINVAL; if (inp->inp_faddr.s_addr != INADDR_ANY) return EISCONN; - error = in_pcbconnect(inp, nam, p); + error = in_pcbconnect(inp, nam, p, NULL); if (error == 0) soisconnected(so); return error; @@ -1464,6 +1638,7 @@ udp_disconnect(struct socket *so) in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; so->so_state &= ~SS_ISCONNECTED; /* XXX */ + inp->inp_last_outif = 0; return 0; } @@ -1514,9 +1689,9 @@ udp_lock(struct socket *so, int refcount, void *debug) lr_saved = debug; if (so->so_pcb) { - lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, + lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_NOTOWNED); - lck_mtx_lock(((struct inpcb *)so->so_pcb)->inpcb_mtx); + lck_mtx_lock(&((struct inpcb 
*)so->so_pcb)->inpcb_mtx); } else { panic("udp_lock: so=%p NO PCB! lr=%p lrh= %s\n", so, lr_saved, solockhistory_nr(so)); @@ -1548,11 +1723,11 @@ udp_unlock(struct socket *so, int refcount, void *debug) so, lr_saved, solockhistory_nr(so)); /* NOTREACHED */ } else { - lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, + lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); so->unlock_lr[so->next_unlock_lr] = lr_saved; so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX; - lck_mtx_unlock(((struct inpcb *)so->so_pcb)->inpcb_mtx); + lck_mtx_unlock(&((struct inpcb *)so->so_pcb)->inpcb_mtx); } @@ -1566,7 +1741,7 @@ udp_getlock(struct socket *so, __unused int locktype) if (so->so_pcb) - return(inp->inpcb_mtx); + return(&inp->inpcb_mtx); else { panic("udp_getlock: so=%p NULL so_pcb lrh= %s\n", so, solockhistory_nr(so)); @@ -1598,7 +1773,7 @@ udp_slowtimo() continue; so = inp->inp_socket; - if (!lck_mtx_try_lock(inp->inpcb_mtx)) /* skip if busy, no hurry for cleanup... */ + if (!lck_mtx_try_lock(&inp->inpcb_mtx)) /* skip if busy, no hurry for cleanup... 
*/ continue; if (so->so_usecount == 0) { @@ -1612,7 +1787,7 @@ udp_slowtimo() } in_pcbdispose(inp); } else { - lck_mtx_unlock(inp->inpcb_mtx); + lck_mtx_unlock(&inp->inpcb_mtx); } } lck_rw_done(pcbinfo->mtx); diff --git a/bsd/netinet6/Makefile b/bsd/netinet6/Makefile index fc12c8bef..f765bace4 100644 --- a/bsd/netinet6/Makefile +++ b/bsd/netinet6/Makefile @@ -9,14 +9,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ DATAFILES = \ @@ -25,12 +21,12 @@ DATAFILES = \ in6_var.h ip6_mroute.h nd6.h ip6_fw.h PRIVATE_DATAFILES = \ - in6_pcb.h ip6_var.h pim6_var.h + in6_pcb.h ip6_var.h pim6_var.h mld6_var.h PRIVATE_KERNELFILES = \ ah6.h esp6.h esp_rijndael.h in6_gif.h in6_ifattach.h \ in6_prefix.h ip6_ecn.h ip6_fw.h \ - ip6protosw.h ipcomp6.h ipsec6.h mld6_var.h \ + ip6protosw.h ipcomp6.h ipsec6.h \ raw_ip6.h scope6_var.h tcp6_var.h udp6_var.h INSTALL_MI_LIST = ${DATAFILES} diff --git a/bsd/netinet6/ah.h b/bsd/netinet6/ah.h index f77826a0d..cf9ddad2a 100644 --- a/bsd/netinet6/ah.h +++ b/bsd/netinet6/ah.h @@ -71,7 +71,7 @@ struct ah_algorithm { const char *name; int (*init)(struct ah_algorithm_state *, struct secasvar *); void (*update)(struct ah_algorithm_state *, caddr_t, size_t); - void (*result)(struct ah_algorithm_state *, caddr_t); + void (*result)(struct ah_algorithm_state *, caddr_t, size_t); }; #define AH_MAXSUMSIZE 64 // sha2-512's output size diff --git a/bsd/netinet6/ah6.h b/bsd/netinet6/ah6.h index 22cda6f12..688f946d5 100644 --- a/bsd/netinet6/ah6.h +++ b/bsd/netinet6/ah6.h @@ -41,7 +41,7 @@ #ifdef KERNEL_PRIVATE struct secasvar; -extern int ah6_input(struct mbuf **, int *); +extern int ah6_input(struct mbuf **, int *, int); extern int ah6_output(struct mbuf *, u_char *, struct mbuf *, struct secasvar *); extern int ah6_calccksum(struct mbuf *, caddr_t, size_t, diff --git a/bsd/netinet6/ah_core.c b/bsd/netinet6/ah_core.c index 
042550b78..27098a76f 100644 --- a/bsd/netinet6/ah_core.c +++ b/bsd/netinet6/ah_core.c @@ -119,42 +119,42 @@ static int ah_sumsiz_zero(struct secasvar *); static int ah_none_mature(struct secasvar *); static int ah_none_init(struct ah_algorithm_state *, struct secasvar *); static void ah_none_loop(struct ah_algorithm_state *, caddr_t, size_t); -static void ah_none_result(struct ah_algorithm_state *, caddr_t); +static void ah_none_result(struct ah_algorithm_state *, caddr_t, size_t); static int ah_keyed_md5_mature(struct secasvar *); static int ah_keyed_md5_init(struct ah_algorithm_state *, struct secasvar *); static void ah_keyed_md5_loop(struct ah_algorithm_state *, caddr_t, size_t); -static void ah_keyed_md5_result(struct ah_algorithm_state *, caddr_t); +static void ah_keyed_md5_result(struct ah_algorithm_state *, caddr_t, size_t); static int ah_keyed_sha1_mature(struct secasvar *); static int ah_keyed_sha1_init(struct ah_algorithm_state *, struct secasvar *); static void ah_keyed_sha1_loop(struct ah_algorithm_state *, caddr_t, size_t); -static void ah_keyed_sha1_result(struct ah_algorithm_state *, caddr_t); +static void ah_keyed_sha1_result(struct ah_algorithm_state *, caddr_t, size_t); static int ah_hmac_md5_mature(struct secasvar *); static int ah_hmac_md5_init(struct ah_algorithm_state *, struct secasvar *); static void ah_hmac_md5_loop(struct ah_algorithm_state *, caddr_t, size_t); -static void ah_hmac_md5_result(struct ah_algorithm_state *, caddr_t); +static void ah_hmac_md5_result(struct ah_algorithm_state *, caddr_t, size_t); static int ah_hmac_sha1_mature(struct secasvar *); static int ah_hmac_sha1_init(struct ah_algorithm_state *, struct secasvar *); static void ah_hmac_sha1_loop(struct ah_algorithm_state *, caddr_t, size_t); -static void ah_hmac_sha1_result(struct ah_algorithm_state *, caddr_t); +static void ah_hmac_sha1_result(struct ah_algorithm_state *, caddr_t, size_t); #if ALLCRYPTO static int ah_sumsiz_sha2_256(struct secasvar *); static int 
ah_hmac_sha2_256_mature(struct secasvar *); static int ah_hmac_sha2_256_init(struct ah_algorithm_state *, struct secasvar *); static void ah_hmac_sha2_256_loop(struct ah_algorithm_state *, caddr_t, size_t); -static void ah_hmac_sha2_256_result(struct ah_algorithm_state *, caddr_t); +static void ah_hmac_sha2_256_result(struct ah_algorithm_state *, caddr_t, size_t); static int ah_sumsiz_sha2_384(struct secasvar *); static int ah_hmac_sha2_384_mature(struct secasvar *); static int ah_hmac_sha2_384_init(struct ah_algorithm_state *, struct secasvar *); static void ah_hmac_sha2_384_loop(struct ah_algorithm_state *, caddr_t, size_t); -static void ah_hmac_sha2_384_result(struct ah_algorithm_state *, caddr_t); +static void ah_hmac_sha2_384_result(struct ah_algorithm_state *, caddr_t, size_t); static int ah_sumsiz_sha2_512(struct secasvar *); static int ah_hmac_sha2_512_mature(struct secasvar *); static int ah_hmac_sha2_512_init(struct ah_algorithm_state *, struct secasvar *); static void ah_hmac_sha2_512_loop(struct ah_algorithm_state *, caddr_t, size_t); -static void ah_hmac_sha2_512_result(struct ah_algorithm_state *, caddr_t); +static void ah_hmac_sha2_512_result(struct ah_algorithm_state *, caddr_t, size_t); #endif /* ALLCRYPTO */ static void ah_update_mbuf(struct mbuf *, int, int, @@ -280,7 +280,8 @@ ah_none_loop( static void ah_none_result( __unused struct ah_algorithm_state *state, - __unused caddr_t addr) + __unused caddr_t addr, + __unused size_t l) { } @@ -363,9 +364,10 @@ ah_keyed_md5_loop(state, addr, len) } static void -ah_keyed_md5_result(state, addr) +ah_keyed_md5_result(state, addr, l) struct ah_algorithm_state *state; caddr_t addr; + size_t l; { u_char digest[16]; @@ -379,7 +381,7 @@ ah_keyed_md5_result(state, addr) } MD5Final(&digest[0], (MD5_CTX *)state->foo); FREE(state->foo, M_TEMP); - bcopy(&digest[0], (void *)addr, sizeof(digest)); + bcopy(&digest[0], (void *)addr, sizeof(digest) > l ? 
l : sizeof(digest)); } static int @@ -484,9 +486,10 @@ ah_keyed_sha1_loop(state, addr, len) } static void -ah_keyed_sha1_result(state, addr) +ah_keyed_sha1_result(state, addr, l) struct ah_algorithm_state *state; caddr_t addr; + size_t l; { u_char digest[SHA1_RESULTLEN]; /* SHA-1 generates 160 bits */ SHA1_CTX *ctxt; @@ -500,7 +503,7 @@ ah_keyed_sha1_result(state, addr) (u_int)_KEYLEN(state->sav->key_auth)); } SHA1Final((caddr_t)&digest[0], ctxt); - bcopy(&digest[0], (void *)addr, HMACSIZE); + bcopy(&digest[0], (void *)addr, sizeof(digest) > l ? l : sizeof(digest)); FREE(state->foo, M_TEMP); } @@ -601,9 +604,10 @@ ah_hmac_md5_loop(state, addr, len) } static void -ah_hmac_md5_result(state, addr) +ah_hmac_md5_result(state, addr, l) struct ah_algorithm_state *state; caddr_t addr; + size_t l; { u_char digest[16]; u_char *ipad; @@ -624,7 +628,7 @@ ah_hmac_md5_result(state, addr) MD5Update(ctxt, &digest[0], sizeof(digest)); MD5Final(&digest[0], ctxt); - bcopy(&digest[0], (void *)addr, HMACSIZE); + bcopy(&digest[0], (void *)addr, sizeof(digest) > l ? l : sizeof(digest)); FREE(state->foo, M_TEMP); } @@ -727,9 +731,10 @@ ah_hmac_sha1_loop(state, addr, len) } static void -ah_hmac_sha1_result(state, addr) +ah_hmac_sha1_result(state, addr, l) struct ah_algorithm_state *state; caddr_t addr; + size_t l; { u_char digest[SHA1_RESULTLEN]; /* SHA-1 generates 160 bits */ u_char *ipad; @@ -750,7 +755,7 @@ ah_hmac_sha1_result(state, addr) SHA1Update(ctxt, (caddr_t)&digest[0], sizeof(digest)); SHA1Final((caddr_t)&digest[0], ctxt); - bcopy(&digest[0], (void *)addr, HMACSIZE); + bcopy(&digest[0], (void *)addr, sizeof(digest) > l ? 
l : sizeof(digest)); FREE(state->foo, M_TEMP); } @@ -869,10 +874,12 @@ ah_hmac_sha2_256_loop(state, addr, len) } static void -ah_hmac_sha2_256_result(state, addr) +ah_hmac_sha2_256_result(state, addr, l) struct ah_algorithm_state *state; caddr_t addr; + size_t l; { + u_char digest[SHA256_DIGEST_LENGTH]; u_char *ipad; u_char *opad; SHA256_CTX *ctxt; @@ -884,13 +891,14 @@ ah_hmac_sha2_256_result(state, addr) opad = (u_char *)(ipad + 64); ctxt = (SHA256_CTX *)(opad + 64); - SHA256_Final((u_int8_t *)addr, ctxt); + SHA256_Final((u_int8_t *)digest, ctxt); - bzero(ctxt, sizeof(*ctxt)); SHA256_Init(ctxt); SHA256_Update(ctxt, opad, 64); - SHA256_Update(ctxt, (const u_int8_t *)addr, SHA256_DIGEST_LENGTH); - SHA256_Final((u_int8_t *)addr, ctxt); + SHA256_Update(ctxt, (const u_int8_t *)digest, sizeof(digest)); + SHA256_Final((u_int8_t *)digest, ctxt); + + bcopy(&digest[0], (void *)addr, sizeof(digest) > l ? l : sizeof(digest)); FREE(state->foo, M_TEMP); } @@ -1009,10 +1017,12 @@ ah_hmac_sha2_384_loop(state, addr, len) } static void -ah_hmac_sha2_384_result(state, addr) +ah_hmac_sha2_384_result(state, addr, l) struct ah_algorithm_state *state; caddr_t addr; + size_t l; { + u_char digest[SHA384_DIGEST_LENGTH]; u_char *ipad; u_char *opad; SHA384_CTX *ctxt; @@ -1024,13 +1034,14 @@ ah_hmac_sha2_384_result(state, addr) opad = (u_char *)(ipad + 128); ctxt = (SHA384_CTX *)(opad + 128); - SHA384_Final((u_int8_t *)addr, ctxt); + SHA384_Final((u_int8_t *)digest, ctxt); - bzero(ctxt, sizeof(*ctxt)); SHA384_Init(ctxt); SHA384_Update(ctxt, opad, 128); - SHA384_Update(ctxt, (const u_int8_t *)addr, SHA384_DIGEST_LENGTH); - SHA384_Final((u_int8_t *)addr, ctxt); + SHA384_Update(ctxt, (const u_int8_t *)digest, sizeof(digest)); + SHA384_Final((u_int8_t *)digest, ctxt); + + bcopy(&digest[0], (void *)addr, sizeof(digest) > l ? 
l : sizeof(digest)); FREE(state->foo, M_TEMP); } @@ -1149,10 +1160,12 @@ ah_hmac_sha2_512_loop(state, addr, len) } static void -ah_hmac_sha2_512_result(state, addr) +ah_hmac_sha2_512_result(state, addr, l) struct ah_algorithm_state *state; caddr_t addr; + size_t l; { + u_char digest[SHA512_DIGEST_LENGTH]; u_char *ipad; u_char *opad; SHA512_CTX *ctxt; @@ -1164,13 +1177,14 @@ ah_hmac_sha2_512_result(state, addr) opad = (u_char *)(ipad + 128); ctxt = (SHA512_CTX *)(opad + 128); - SHA512_Final((u_int8_t *)addr, ctxt); + SHA512_Final((u_int8_t *)digest, ctxt); - bzero(ctxt, sizeof(*ctxt)); SHA512_Init(ctxt); SHA512_Update(ctxt, opad, 128); - SHA512_Update(ctxt, (const u_int8_t *)addr, SHA512_DIGEST_LENGTH); - SHA512_Final((u_int8_t *)addr, ctxt); + SHA512_Update(ctxt, (const u_int8_t *)digest, sizeof(digest)); + SHA512_Final((u_int8_t *)digest, ctxt); + + bcopy(&digest[0], (void *)addr, sizeof(digest) > l ? l : sizeof(digest)); FREE(state->foo, M_TEMP); } @@ -1453,7 +1467,7 @@ ah4_calccksum(m, ahdat, len, algo, sav) goto fail; } - (algo->result)(&algos, (caddr_t) &sumbuf[0]); + (algo->result)(&algos, (caddr_t) &sumbuf[0], sizeof(sumbuf)); bcopy(&sumbuf[0], ahdat, (*algo->sumsiz)(sav)); if (n) @@ -1680,7 +1694,7 @@ ah6_calccksum(m, ahdat, len, algo, sav) goto fail; } - (algo->result)(&algos, (caddr_t) &sumbuf[0]); + (algo->result)(&algos, (caddr_t) &sumbuf[0], sizeof(sumbuf)); bcopy(&sumbuf[0], ahdat, (*algo->sumsiz)(sav)); /* just in case */ diff --git a/bsd/netinet6/ah_input.c b/bsd/netinet6/ah_input.c index fcffc1ded..a448295b7 100644 --- a/bsd/netinet6/ah_input.c +++ b/bsd/netinet6/ah_input.c @@ -116,6 +116,7 @@ #include #include +#include #include @@ -416,6 +417,9 @@ ah4_input(struct mbuf *m, int off) stripsiz = sizeof(struct newah) + siz1; } if (ipsec4_tunnel_validate(m, off + stripsiz, nxt, sav, &ifamily)) { + ifaddr_t ifa; + struct sockaddr_storage addr; + /* * strip off all the headers that precedes AH. 
* IP xx AH IP' payload -> IP' payload @@ -481,7 +485,25 @@ ah4_input(struct mbuf *m, int off) IPSEC_STAT_INCREMENT(ipsecstat.in_nomem); goto fail; } - proto_input(PF_INET, m); + + if (ip_doscopedroute) { + struct sockaddr_in *ipaddr; + + bzero(&addr, sizeof(addr)); + ipaddr = (__typeof__(ipaddr))&addr; + ipaddr->sin_family = AF_INET; + ipaddr->sin_len = sizeof(*ipaddr); + ipaddr->sin_addr = ip->ip_dst; + + // update the receiving interface address based on the inner address + ifa = ifa_ifwithaddr((struct sockaddr *)&addr); + if (ifa) { + m->m_pkthdr.rcvif = ifa->ifa_ifp; + IFA_REMREF(ifa); + } + } + if (proto_input(PF_INET, m) != 0) + goto fail; nxt = IPPROTO_DONE; } else { /* @@ -549,6 +571,10 @@ ah4_input(struct mbuf *m, int off) goto fail; } + DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL, + struct ip *, ip, struct ifnet *, m->m_pkthdr.rcvif, + struct ip *, ip, struct ip6_hdr *, NULL); + if (nxt != IPPROTO_DONE) { if ((ip_protox[nxt]->pr_flags & PR_LASTHDR) != 0 && ipsec4_in_reject(m, NULL)) { @@ -583,10 +609,9 @@ ah4_input(struct mbuf *m, int off) #if INET6 int -ah6_input(mp, offp) - struct mbuf **mp; - int *offp; +ah6_input(struct mbuf **mp, int *offp, int proto) { +#pragma unused(proto) struct mbuf *m = *mp; int off = *offp; struct ip6_hdr *ip6; @@ -825,6 +850,9 @@ ah6_input(mp, offp) stripsiz = sizeof(struct newah) + siz1; } if (ipsec6_tunnel_validate(m, off + stripsiz, nxt, sav)) { + ifaddr_t ifa; + struct sockaddr_storage addr; + /* * strip off all the headers that precedes AH. 
* IP6 xx AH IP6' payload -> IP6' payload @@ -875,7 +903,26 @@ ah6_input(mp, offp) IPSEC_STAT_INCREMENT(ipsec6stat.in_nomem); goto fail; } - proto_input(PF_INET6, m); + + if (ip6_doscopedroute) { + struct sockaddr_in6 *ip6addr; + + bzero(&addr, sizeof(addr)); + ip6addr = (__typeof__(ip6addr))&addr; + ip6addr->sin6_family = AF_INET6; + ip6addr->sin6_len = sizeof(*ip6addr); + ip6addr->sin6_addr = ip6->ip6_dst; + + // update the receiving interface address based on the inner address + ifa = ifa_ifwithaddr((struct sockaddr *)&addr); + if (ifa) { + m->m_pkthdr.rcvif = ifa->ifa_ifp; + IFA_REMREF(ifa); + } + } + + if (proto_input(PF_INET6, m) != 0) + goto fail; nxt = IPPROTO_DONE; } else { /* diff --git a/bsd/netinet6/dest6.c b/bsd/netinet6/dest6.c index ae7a18b8a..993ee1a91 100644 --- a/bsd/netinet6/dest6.c +++ b/bsd/netinet6/dest6.c @@ -54,17 +54,13 @@ * Destination options header processing. */ int -dest6_input(mp, offp) - struct mbuf **mp; - int *offp; +dest6_input(struct mbuf **mp, int *offp, int proto) { +#pragma unused(proto) struct mbuf *m = *mp; int off = *offp, dstoptlen, optlen; struct ip6_dest *dstopts; u_int8_t *opt; - struct ip6_hdr *ip6; - - ip6 = mtod(m, struct ip6_hdr *); /* validation of the length of the header */ #ifndef PULLDOWN_TEST @@ -107,7 +103,7 @@ dest6_input(mp, offp) default: /* unknown option */ optlen = ip6_unknown_opt(opt, m, - opt - mtod(m, u_int8_t *), 0); + opt - mtod(m, u_int8_t *)); if (optlen == -1) return (IPPROTO_DONE); optlen += 2; diff --git a/bsd/netinet6/esp6.h b/bsd/netinet6/esp6.h index e0c40b37f..7b054cd50 100644 --- a/bsd/netinet6/esp6.h +++ b/bsd/netinet6/esp6.h @@ -69,7 +69,7 @@ #ifdef KERNEL_PRIVATE extern int esp6_output(struct mbuf *, u_char *, struct mbuf *, struct secasvar *); -extern int esp6_input(struct mbuf **, int *); +extern int esp6_input(struct mbuf **, int *, int); extern void esp6_ctlinput(int, struct sockaddr *, void *); #endif /* KERNEL_PRIVATE */ diff --git a/bsd/netinet6/esp_core.c 
b/bsd/netinet6/esp_core.c index 3bae5bd18..905de9ba2 100644 --- a/bsd/netinet6/esp_core.c +++ b/bsd/netinet6/esp_core.c @@ -1203,7 +1203,7 @@ esp_auth(m0, skip, length, sav, sum) break; } } - (*algo->result)(&s, (caddr_t) sumbuf); + (*algo->result)(&s, (caddr_t) sumbuf, sizeof(sumbuf)); bcopy(sumbuf, sum, siz); /*XXX*/ KERNEL_DEBUG(DBG_FNC_ESPAUTH | DBG_FUNC_END, 6,0,0,0,0); return 0; diff --git a/bsd/netinet6/esp_input.c b/bsd/netinet6/esp_input.c index b228fb035..c64150319 100644 --- a/bsd/netinet6/esp_input.c +++ b/bsd/netinet6/esp_input.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -119,6 +119,7 @@ #include #include +#include #include #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETIPSEC, 1) @@ -136,16 +137,37 @@ extern struct protosw inetsw[]; (sizeof(struct esp) < sizeof(struct newesp) \ ? sizeof(struct newesp) : sizeof(struct esp)) +static struct ip * +esp4_input_strip_UDP_encap (struct mbuf *m, int iphlen) +{ + // strip the udp header that's encapsulating ESP + struct ip *ip; + size_t stripsiz = sizeof(struct udphdr); + + ip = mtod(m, __typeof__(ip)); + ovbcopy((caddr_t)ip, (caddr_t)(((u_char *)ip) + stripsiz), iphlen); + m->m_data += stripsiz; + m->m_len -= stripsiz; + m->m_pkthdr.len -= stripsiz; + ip = mtod(m, __typeof__(ip)); + ip->ip_len = ip->ip_len - stripsiz; + ip->ip_p = IPPROTO_ESP; + return ip; +} + void esp4_input(m, off) struct mbuf *m; int off; { struct ip *ip; +#if INET6 struct ip6_hdr *ip6; +#endif /* INET6 */ struct esp *esp; struct esptail esptail; u_int32_t spi; + u_int32_t seq; struct secasvar *sav = NULL; size_t taillen; u_int16_t nxt; @@ -175,6 +197,14 @@ esp4_input(m, off) } ip = mtod(m, struct ip *); + // expect udp-encap and esp packets only + if (ip->ip_p != IPPROTO_ESP && + !(ip->ip_p == IPPROTO_UDP && off >= sizeof(struct udphdr))) { + ipseclog((LOG_DEBUG, + "IPv4 ESP input: invalid protocol 
type\n")); + IPSEC_STAT_INCREMENT(ipsecstat.in_inval); + goto bad; + } esp = (struct esp *)(((u_int8_t *)ip) + off); #ifdef _IP_VHL hlen = IP_VHL_HL(ip->ip_vhl) << 2; @@ -222,6 +252,7 @@ esp4_input(m, off) goto bad; } + seq = ntohl(((struct newesp *)esp)->esp_seq); if (!((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay && (sav->alg_auth && sav->key_auth))) goto noreplaycheck; @@ -233,7 +264,7 @@ esp4_input(m, off) /* * check for sequence number. */ - if (ipsec_chkreplay(ntohl(((struct newesp *)esp)->esp_seq), sav)) + if (ipsec_chkreplay(seq, sav)) ; /*okey*/ else { IPSEC_STAT_INCREMENT(ipsecstat.in_espreplay); @@ -298,7 +329,7 @@ esp4_input(m, off) * update sequence number. */ if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay) { - if (ipsec_updatereplay(ntohl(((struct newesp *)esp)->esp_seq), sav)) { + if (ipsec_updatereplay(seq, sav)) { IPSEC_STAT_INCREMENT(ipsecstat.in_espreplay); goto bad; } @@ -388,9 +419,40 @@ esp4_input(m, off) #else ip->ip_len = htons(ntohs(ip->ip_len) - taillen); #endif + if (ip->ip_p == IPPROTO_UDP) { + // offset includes the outer ip and udp header lengths. 
+ if (m->m_len < off) { + m = m_pullup(m, off); + if (!m) { + ipseclog((LOG_DEBUG, + "IPv4 ESP input: invalid udp encapsulated ESP packet length \n")); + IPSEC_STAT_INCREMENT(ipsecstat.in_inval); + goto bad; + } + } + + // check the UDP encap header to detect changes in the source port, and then strip the header + off -= sizeof(struct udphdr); // off no longer includes the udphdr's size + // if peer is behind nat and this is the latest esp packet + if ((sav->flags & SADB_X_EXT_NATT_DETECTED_PEER) != 0 && + (sav->flags & SADB_X_EXT_OLD) == 0 && + seq && sav->replay && + seq >= sav->replay->lastseq) { + struct udphdr *encap_uh = (__typeof__(encap_uh))((caddr_t)ip + off); + if (encap_uh->uh_sport && + encap_uh->uh_sport != sav->remote_ike_port) { + sav->remote_ike_port = encap_uh->uh_sport; + } + } + ip = esp4_input_strip_UDP_encap(m, off); + esp = (struct esp *)(((u_int8_t *)ip) + off); + } /* was it transmitted over the IPsec tunnel SA? */ if (ipsec4_tunnel_validate(m, off + esplen + ivlen, nxt, sav, &ifamily)) { + ifaddr_t ifa; + struct sockaddr_storage addr; + /* * strip off all the headers that precedes ESP header. 
* IP4 xx ESP IP4' payload -> IP4' payload @@ -403,6 +465,8 @@ esp4_input(m, off) tos = ip->ip_tos; m_adj(m, off + esplen + ivlen); if (ifamily == AF_INET) { + struct sockaddr_in *ipaddr; + if (m->m_len < sizeof(*ip)) { m = m_pullup(m, sizeof(*ip)); if (!m) { @@ -421,8 +485,18 @@ esp4_input(m, off) IPSEC_STAT_INCREMENT(ipsecstat.in_inval); goto bad; } + + if (ip_doscopedroute) { + bzero(&addr, sizeof(addr)); + ipaddr = (__typeof__(ipaddr))&addr; + ipaddr->sin_family = AF_INET; + ipaddr->sin_len = sizeof(*ipaddr); + ipaddr->sin_addr = ip->ip_dst; + } #if INET6 } else if (ifamily == AF_INET6) { + struct sockaddr_in6 *ip6addr; + #ifndef PULLDOWN_TEST /* * m_pullup is prohibited in KAME IPv6 input processing @@ -452,7 +526,15 @@ esp4_input(m, off) ipsec6_logpacketstr(ip6, spi), ipsec_logsastr(sav))); IPSEC_STAT_INCREMENT(ipsecstat.in_inval); goto bad; - } + } + + if (ip6_doscopedroute) { + bzero(&addr, sizeof(addr)); + ip6addr = (__typeof__(ip6addr))&addr; + ip6addr->sin6_family = AF_INET6; + ip6addr->sin6_len = sizeof(*ip6addr); + ip6addr->sin6_addr = ip6->ip6_dst; + } #endif /* INET6 */ } else { ipseclog((LOG_ERR, "ipsec tunnel unsupported address family " @@ -466,10 +548,21 @@ esp4_input(m, off) IPSEC_STAT_INCREMENT(ipsecstat.in_nomem); goto bad; } - + + if (ip_doscopedroute || ip6_doscopedroute) { + // update the receiving interface address based on the inner address + ifa = ifa_ifwithaddr((struct sockaddr *)&addr); + if (ifa) { + m->m_pkthdr.rcvif = ifa->ifa_ifp; + IFA_REMREF(ifa); + } + } + /* Clear the csum flags, they can't be valid for the inner headers */ m->m_pkthdr.csum_flags = 0; - proto_input(ifamily == AF_INET ? PF_INET : PF_INET6, m); + if (proto_input(ifamily == AF_INET ? 
PF_INET : PF_INET6, m) != 0) + goto bad; + nxt = IPPROTO_DONE; KERNEL_DEBUG(DBG_FNC_ESPIN | DBG_FUNC_END, 2,0,0,0,0); } else { @@ -554,6 +647,11 @@ esp4_input(m, off) udp->uh_sport = htons(sav->remote_ike_port); udp->uh_sum = 0; } + + DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL, + struct ip *, ip, struct ifnet *, m->m_pkthdr.rcvif, + struct ip *, ip, struct ip6_hdr *, NULL); + ip_proto_dispatch_in(m, off, nxt, 0); } else m_freem(m); @@ -583,16 +681,16 @@ esp4_input(m, off) #if INET6 int -esp6_input(mp, offp) - struct mbuf **mp; - int *offp; +esp6_input(struct mbuf **mp, int *offp, int proto) { +#pragma unused(proto) struct mbuf *m = *mp; int off = *offp; struct ip6_hdr *ip6; struct esp *esp; struct esptail esptail; u_int32_t spi; + u_int32_t seq; struct secasvar *sav = NULL; size_t taillen; u_int16_t nxt; @@ -667,6 +765,8 @@ esp6_input(mp, offp) goto bad; } + seq = ntohl(((struct newesp *)esp)->esp_seq); + if (!((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay && (sav->alg_auth && sav->key_auth))) goto noreplaycheck; @@ -678,7 +778,7 @@ esp6_input(mp, offp) /* * check for sequence number. */ - if (ipsec_chkreplay(ntohl(((struct newesp *)esp)->esp_seq), sav)) + if (ipsec_chkreplay(seq, sav)) ; /*okey*/ else { IPSEC_STAT_INCREMENT(ipsec6stat.in_espreplay); @@ -740,7 +840,7 @@ esp6_input(mp, offp) * update sequence number. */ if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay) { - if (ipsec_updatereplay(ntohl(((struct newesp *)esp)->esp_seq), sav)) { + if (ipsec_updatereplay(seq, sav)) { IPSEC_STAT_INCREMENT(ipsec6stat.in_espreplay); goto bad; } @@ -828,6 +928,9 @@ esp6_input(mp, offp) /* was it transmitted over the IPsec tunnel SA? */ if (ipsec6_tunnel_validate(m, off + esplen + ivlen, nxt, sav)) { + ifaddr_t ifa; + struct sockaddr_storage addr; + /* * strip off all the headers that precedes ESP header. 
* IP6 xx ESP IP6' payload -> IP6' payload @@ -872,7 +975,26 @@ esp6_input(mp, offp) IPSEC_STAT_INCREMENT(ipsec6stat.in_nomem); goto bad; } - proto_input(PF_INET6, m); + + if (ip6_doscopedroute) { + struct sockaddr_in6 *ip6addr; + + bzero(&addr, sizeof(addr)); + ip6addr = (__typeof__(ip6addr))&addr; + ip6addr->sin6_family = AF_INET6; + ip6addr->sin6_len = sizeof(*ip6addr); + ip6addr->sin6_addr = ip6->ip6_dst; + + // update the receiving interface address based on the inner address + ifa = ifa_ifwithaddr((struct sockaddr *)&addr); + if (ifa) { + m->m_pkthdr.rcvif = ifa->ifa_ifp; + IFA_REMREF(ifa); + } + } + + if (proto_input(PF_INET6, m) != 0) + goto bad; nxt = IPPROTO_DONE; } else { /* diff --git a/bsd/netinet6/frag6.c b/bsd/netinet6/frag6.c index ea75e5acb..b6b68b920 100644 --- a/bsd/netinet6/frag6.c +++ b/bsd/netinet6/frag6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -77,6 +77,7 @@ #include #include +#include #include #include #include @@ -102,9 +103,6 @@ u_int frag6_nfragpackets; static u_int frag6_nfrags; struct ip6q ip6q; /* ip6 reassemble queue */ -#ifndef __APPLE__ -MALLOC_DEFINE(M_FTABLE, "fragment", "fragment reassembly header"); -#endif extern lck_mtx_t *inet6_domain_mutex; /* @@ -162,10 +160,9 @@ frag6_init() * inet6_domain_mutex is protecting he frag6 queue manipulation. 
*/ int -frag6_input(mp, offp) - struct mbuf **mp; - int *offp; +frag6_input(struct mbuf **mp, int *offp, int proto) { +#pragma unused(proto) struct mbuf *m = *mp, *t; struct ip6_hdr *ip6; struct ip6_frag *ip6f; @@ -176,6 +173,8 @@ frag6_input(mp, offp) int fragoff, frgpartlen; /* must be larger than u_int16_t */ struct ifnet *dstifp; struct ifaddr *ifa = NULL; + u_int8_t ecn, ecn0; + #ifdef IN6_IFSTAT_STRICT struct route_in6 ro; struct sockaddr_in6 *dst; @@ -204,7 +203,7 @@ frag6_input(mp, offp) if (ro.ro_rt != NULL) { RT_LOCK(ro.ro_rt); if ((ifa = ro.ro_rt->rt_ifa) != NULL) { - ifaref(ifa); + IFA_ADDREF(ifa); dstifp = ((struct in6_ifaddr *)ro.ro_rt->rt_ifa)->ia_ifp; } RT_UNLOCK(ro.ro_rt); @@ -222,7 +221,7 @@ frag6_input(mp, offp) icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, offset); in6_ifstat_inc(dstifp, ifs6_reass_fail); if (ifa != NULL) - ifafree(ifa); + IFA_REMREF(ifa); return IPPROTO_DONE; } @@ -239,7 +238,7 @@ frag6_input(mp, offp) offsetof(struct ip6_hdr, ip6_plen)); in6_ifstat_inc(dstifp, ifs6_reass_fail); if (ifa != NULL) - ifafree(ifa); + IFA_REMREF(ifa); return IPPROTO_DONE; } @@ -298,10 +297,11 @@ frag6_input(mp, offp) q6->ip6q_nxtp = (u_char *)nxtp; #endif q6->ip6q_ident = ip6f->ip6f_ident; - q6->ip6q_arrive = 0; /* Is it used anywhere? */ q6->ip6q_ttl = IPV6_FRAGTTL; q6->ip6q_src = ip6->ip6_src; q6->ip6q_dst = ip6->ip6_dst; + q6->ip6q_ecn = + (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK; q6->ip6q_unfrglen = -1; /* The 1st fragment has not arrived. 
*/ q6->ip6q_nfrag = 0; @@ -332,7 +332,7 @@ frag6_input(mp, offp) offsetof(struct ip6_frag, ip6f_offlg)); frag6_doing_reass = 0; if (ifa != NULL) - ifafree(ifa); + IFA_REMREF(ifa); return(IPPROTO_DONE); } } @@ -342,7 +342,7 @@ frag6_input(mp, offp) offsetof(struct ip6_frag, ip6f_offlg)); frag6_doing_reass = 0; if (ifa != NULL) - ifafree(ifa); + IFA_REMREF(ifa); return(IPPROTO_DONE); } /* @@ -387,10 +387,6 @@ frag6_input(mp, offp) if (ip6af == NULL) goto dropfrag; bzero(ip6af, sizeof(*ip6af)); - ip6af->ip6af_head = ip6->ip6_flow; - ip6af->ip6af_len = ip6->ip6_plen; - ip6af->ip6af_nxt = ip6->ip6_nxt; - ip6af->ip6af_hlim = ip6->ip6_hlim; ip6af->ip6af_mff = ip6f->ip6f_offlg & IP6F_MORE_FRAG; ip6af->ip6af_off = fragoff; ip6af->ip6af_frglen = frgpartlen; @@ -402,6 +398,26 @@ frag6_input(mp, offp) goto insert; } + /* + * Handle ECN by comparing this segment with the first one; + * if CE is set, do not lose CE. + * drop if CE and not-ECT are mixed for the same packet. + */ + ecn = (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK; + ecn0 = q6->ip6q_ecn; + if (ecn == IPTOS_ECN_CE) { + if (ecn0 == IPTOS_ECN_NOTECT) { + FREE(ip6af, M_FTABLE); + goto dropfrag; + } + if (ecn0 != IPTOS_ECN_CE) + q6->ip6q_ecn = IPTOS_ECN_CE; + } + if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) { + FREE(ip6af, M_FTABLE); + goto dropfrag; + } + /* * Find a segment which begins after this one does. */ @@ -450,6 +466,11 @@ frag6_input(mp, offp) * If the incoming framgent overlaps some existing fragments in * the reassembly queue, drop it, since it is dangerous to override * existing fragments from a security point of view. + * We don't know which fragment is the bad guy - here we trust + * fragment that came in earlier, with no real reason. + * + * Note: due to changes after disabling this part, mbuf passed to + * m_adj() below now does not meet the requirement. 
*/ if (af6->ip6af_up != (struct ip6asfrag *)q6) { i = af6->ip6af_up->ip6af_off + af6->ip6af_up->ip6af_frglen @@ -501,7 +522,7 @@ frag6_input(mp, offp) if (af6->ip6af_off != next) { frag6_doing_reass = 0; if (ifa != NULL) - ifafree(ifa); + IFA_REMREF(ifa); return IPPROTO_DONE; } next += af6->ip6af_frglen; @@ -509,7 +530,7 @@ frag6_input(mp, offp) if (af6->ip6af_up->ip6af_mff) { frag6_doing_reass = 0; if (ifa != NULL) - ifafree(ifa); + IFA_REMREF(ifa); return IPPROTO_DONE; } @@ -538,15 +559,17 @@ frag6_input(mp, offp) ip6->ip6_plen = htons((u_short)next + offset - sizeof(struct ip6_hdr)); ip6->ip6_src = q6->ip6q_src; ip6->ip6_dst = q6->ip6q_dst; + if (q6->ip6q_ecn == IPTOS_ECN_CE) + ip6->ip6_flow |= htonl(IPTOS_ECN_CE << 20); + nxt = q6->ip6q_nxt; #if notyet *q6->ip6q_nxtp = (u_char)(nxt & 0xff); #endif - /* - * Delete frag6 header with as a few cost as possible. - */ - if (offset < m->m_len) { + /* Delete frag6 header */ + if (m->m_len >= offset + sizeof(struct ip6_frag)) { + /* This is the only possible case with !PULLDOWN_TEST */ ovbcopy((caddr_t)ip6, (caddr_t)ip6 + sizeof(struct ip6_frag), offset); m->m_data += sizeof(struct ip6_frag); @@ -596,7 +619,7 @@ frag6_input(mp, offp) frag6_doing_reass = 0; if (ifa != NULL) - ifafree(ifa); + IFA_REMREF(ifa); return nxt; dropfrag: @@ -605,7 +628,7 @@ frag6_input(mp, offp) m_freem(m); frag6_doing_reass = 0; if (ifa != NULL) - ifafree(ifa); + IFA_REMREF(ifa); return IPPROTO_DONE; } @@ -636,7 +659,7 @@ frag6_freef(q6) /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); - /* restoure source and destination addresses */ + /* restore source and destination addresses */ ip6->ip6_src = q6->ip6q_src; ip6->ip6_dst = q6->ip6q_dst; icmp6_error(m, ICMP6_TIME_EXCEEDED, diff --git a/bsd/netinet6/icmp6.c b/bsd/netinet6/icmp6.c index 02d19734f..43a61a6d2 100644 --- a/bsd/netinet6/icmp6.c +++ b/bsd/netinet6/icmp6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. 
All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -123,6 +123,7 @@ #include #include #include +#include #if IPSEC #include @@ -148,8 +149,6 @@ static int icmp6errpps_count = 0; static struct timeval icmp6errppslim_last; extern int icmp6_nodeinfo; extern struct inpcbinfo ripcbinfo; -extern lck_mtx_t *ip6_mutex; -extern lck_mtx_t *nd6_mutex; extern lck_mtx_t *inet6_domain_mutex; static void icmp6_errcount(struct icmp6errstat *, int, int); @@ -169,19 +168,12 @@ static int ni6_store_addrs(struct icmp6_nodeinfo *, struct icmp6_nodeinfo *, struct ifnet *, int); static int icmp6_notify_error(struct mbuf *, int, int, int); -#ifdef COMPAT_RFC1885 -/* - * XXX: Compiled out for now, but if enabled we must use a lock for accesses, - * or easier, define it locally inside icmp6_reflect() and don't cache. - */ -static struct route_in6 icmp6_reflect_rt; -#endif void icmp6_init() { - mld6_init(); + mld_init(); } static void @@ -242,13 +234,44 @@ icmp6_errcount(stat, type, code) stat->icp6errs_unknown++; } +/* + * A wrapper function for icmp6_error() necessary when the erroneous packet + * may not contain enough scope zone information. + */ +void +icmp6_error2(struct mbuf *m, int type, int code, int param, + struct ifnet *ifp) +{ + struct ip6_hdr *ip6; + + if (ifp == NULL) + return; + +#ifndef PULLDOWN_TEST + IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr),return ); +#else + if (m->m_len < sizeof(struct ip6_hdr)) { + m = m_pullup(m, sizeof(struct ip6_hdr)); + if (m == NULL) + return; + } +#endif + + ip6 = mtod(m, struct ip6_hdr *); + + if (in6_setscope(&ip6->ip6_src, ifp, NULL) != 0) + return; + if (in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0) + return; + + icmp6_error(m, type, code, param); +} + /* * Generate an error packet of type error in response to bad IP6 packet. 
*/ void -icmp6_error(m, type, code, param) - struct mbuf *m; - int type, code, param; +icmp6_error(struct mbuf *m, int type, int code, int param) { struct ip6_hdr *oip6, *nip6; struct icmp6_hdr *icmp6; @@ -258,7 +281,6 @@ icmp6_error(m, type, code, param) icmp6stat.icp6s_error++; - lck_mtx_assert(ip6_mutex, LCK_MTX_ASSERT_NOTOWNED); /* count per-type-code statistics */ icmp6_errcount(&icmp6stat.icp6s_outerrhist, type, code); @@ -281,9 +303,15 @@ icmp6_error(m, type, code, param) oip6 = mtod(m, struct ip6_hdr *); /* - * Multicast destination check. For unrecognized option errors, - * this check has already done in ip6_unknown_opt(), so we can - * check only for other errors. + * If the destination address of the erroneous packet is a multicast + * address, or the packet was sent using link-layer multicast, + * we should basically suppress sending an error (RFC 2463, Section + * 2.4). + * We have two exceptions (the item e.2 in that section): + * - the Pakcet Too Big message can be sent for path MTU discovery. + * - the Parameter Problem Message that can be allowed an icmp6 error + * in the option type field. This check has been done in + * ip6_unknown_opt(), so we can just check the type and code. */ if ((m->m_flags & (M_BCAST|M_MCAST) || IN6_IS_ADDR_MULTICAST(&oip6->ip6_dst)) && @@ -292,7 +320,10 @@ icmp6_error(m, type, code, param) code != ICMP6_PARAMPROB_OPTION))) goto freeit; - /* Source address check. XXX: the case of anycast source? */ + /* + * RFC 2463, 2.4 (e.5): source address check. + * XXX: the case of anycast source? 
+ */ if (IN6_IS_ADDR_UNSPECIFIED(&oip6->ip6_src) || IN6_IS_ADDR_MULTICAST(&oip6->ip6_src)) goto freeit; @@ -361,10 +392,8 @@ icmp6_error(m, type, code, param) nip6->ip6_src = oip6->ip6_src; nip6->ip6_dst = oip6->ip6_dst; - if (IN6_IS_SCOPE_LINKLOCAL(&oip6->ip6_src)) - oip6->ip6_src.s6_addr16[1] = 0; - if (IN6_IS_SCOPE_LINKLOCAL(&oip6->ip6_dst)) - oip6->ip6_dst.s6_addr16[1] = 0; + in6_clearscope(&oip6->ip6_src); + in6_clearscope(&oip6->ip6_dst); icmp6 = (struct icmp6_hdr *)(nip6 + 1); icmp6->icmp6_type = type; @@ -373,7 +402,7 @@ icmp6_error(m, type, code, param) /* * icmp6_reflect() is designed to be in the input path. - * icmp6_error() can be called from both input and outut path, + * icmp6_error() can be called from both input and output path, * and if we are in output path rcvif could contain bogus value. * clear m->m_pkthdr.rcvif for safety, we should have enough scope * information in ip header (nip6). @@ -387,7 +416,7 @@ icmp6_error(m, type, code, param) freeit: /* - * If we can't tell wheter or not we can generate ICMP6, free it. + * If we can't tell whether or not we can generate ICMP6, free it. */ m_freem(m); } @@ -396,17 +425,19 @@ icmp6_error(m, type, code, param) * Process a received ICMP6 message. */ int -icmp6_input(mp, offp) - struct mbuf **mp; - int *offp; +icmp6_input(struct mbuf **mp, int *offp, int proto) { +#pragma unused(proto) struct mbuf *m = *mp, *n; + struct ifnet *ifp; struct ip6_hdr *ip6, *nip6; struct icmp6_hdr *icmp6, *nicmp6; int off = *offp; int icmp6len = m->m_pkthdr.len - *offp; int code, sum, noff; + ifp = m->m_pkthdr.rcvif; + #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_hdr), return IPPROTO_DONE); /* m might change if M_LOOP. So, call mtod after this */ @@ -423,6 +454,26 @@ icmp6_input(mp, offp) goto freeit; } + /* + * Check multicast group membership. + * Note: SSM filters are not applied for ICMPv6 traffic. 
+ */ + if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { + struct in6_multi *inm; + + in6_multihead_lock_shared(); + IN6_LOOKUP_MULTI(&ip6->ip6_dst, ifp, inm); + in6_multihead_lock_done(); + + if (inm == NULL) { + ip6stat.ip6s_notmember++; + in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); + goto freeit; + } else { + IN6M_REMREF(inm); + } + } + /* * calculate the checksum */ @@ -449,7 +500,7 @@ icmp6_input(mp, offp) if (faithprefix(&ip6->ip6_dst)) { /* * Deliver very specific ICMP6 type only. - * This is important to deilver TOOBIG. Otherwise PMTUD + * This is important to deliver TOOBIG. Otherwise PMTUD * will not work. */ switch (icmp6->icmp6_type) { @@ -468,7 +519,6 @@ icmp6_input(mp, offp) if (icmp6->icmp6_type < ICMP6_INFOMSG_MASK) icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_error); - switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_dstunreach); @@ -483,30 +533,21 @@ icmp6_input(mp, offp) case ICMP6_DST_UNREACH_ADDR: code = PRC_HOSTDEAD; break; -#ifdef COMPAT_RFC1885 - case ICMP6_DST_UNREACH_NOTNEIGHBOR: - code = PRC_UNREACH_SRCFAIL; - break; -#else case ICMP6_DST_UNREACH_BEYONDSCOPE: /* I mean "source address was incorrect." 
*/ code = PRC_PARAMPROB; break; -#endif case ICMP6_DST_UNREACH_NOPORT: code = PRC_UNREACH_PORT; break; default: goto badcode; } - goto deliver; break; case ICMP6_PACKET_TOO_BIG: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_pkttoobig); - if (code != 0) - goto badcode; code = PRC_MSGSIZE; @@ -521,8 +562,10 @@ icmp6_input(mp, offp) icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_timeexceed); switch (code) { case ICMP6_TIME_EXCEED_TRANSIT: + code = PRC_TIMXCEED_INTRANS; + break; case ICMP6_TIME_EXCEED_REASSEMBLY: - code += PRC_TIMXCEED_INTRANS; + code = PRC_TIMXCEED_REASS; break; default: goto badcode; @@ -631,11 +674,12 @@ icmp6_input(mp, offp) goto badcode; break; - case MLD6_LISTENER_QUERY: - case MLD6_LISTENER_REPORT: - if (icmp6len < sizeof(struct mld6_hdr)) + case MLD_LISTENER_QUERY: + case MLD_LISTENER_REPORT: + + if (icmp6len < sizeof(struct mld_hdr)) goto badlen; - if (icmp6->icmp6_type == MLD6_LISTENER_QUERY) /* XXX: ugly... */ + if (icmp6->icmp6_type == MLD_LISTENER_QUERY) /* XXX: ugly... */ icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mldquery); else icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mldreport); @@ -647,31 +691,32 @@ icmp6_input(mp, offp) if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ - mld6_input(m, off); - m = NULL; + if (mld_input(m, off, icmp6len) == IPPROTO_DONE) + m = NULL; goto freeit; } - mld6_input(n, off); + if (mld_input(n, off, icmp6len) != IPPROTO_DONE) + m_freem(n); /* m stays. */ goto rate_limit_checked; break; - case MLD6_LISTENER_DONE: + case MLD_LISTENER_DONE: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mlddone); - if (icmp6len < sizeof(struct mld6_hdr)) /* necessary? */ + if (icmp6len < sizeof(struct mld_hdr)) /* necessary? */ goto badlen; break; /* nothing to be done in kernel */ - case MLD6_MTRACE_RESP: - case MLD6_MTRACE: - /* XXX: these two are experimental. not officially defind. */ + case MLD_MTRACE_RESP: + case MLD_MTRACE: + /* XXX: these two are experimental. not officially defined. 
*/ /* XXX: per-interface statistics? */ break; /* just pass it to applications */ case ICMP6_NI_QUERY: if (!icmp6_nodeinfo) break; - +//### LD 10/20 Check fbsd differences here. Not sure we're more advanced or not. /* By RFC 4620 refuse to answer queries from global scope addresses */ if ((icmp6_nodeinfo & 8) != 8 && in6_addrscope(&ip6->ip6_src) == IPV6_ADDR_SCOPE_GLOBAL) break; @@ -948,7 +993,7 @@ icmp6_notify_error(m, off, icmp6len, code) return(-1); } #endif - + if (nxt == IPPROTO_AH) eoff += (eh->ip6e_len + 2) << 2; else @@ -1023,7 +1068,7 @@ icmp6_notify_error(m, off, icmp6len, code) eoff, sizeof(*fh)); if (fh == NULL) { icmp6stat.icp6s_tooshort++; - return(-1); + return (-1); } #endif /* @@ -1055,14 +1100,23 @@ icmp6_notify_error(m, off, icmp6len, code) icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, - sizeof(*icmp6) + sizeof(struct ip6_hdr)); + sizeof(*icmp6) + sizeof(struct ip6_hdr)); if (icmp6 == NULL) { icmp6stat.icp6s_tooshort++; - return(-1); + return (-1); } #endif + /* + * retrieve parameters from the inner IPv6 header, and convert + * them into sockaddr structures. + * XXX: there is no guarantee that the source or destination + * addresses of the inner packet are in the same scope as + * the addresses of the icmp packet. But there is no other + * way to determine the zone. 
+ */ eip6 = (struct ip6_hdr *)(icmp6 + 1); + bzero(&icmp6dst, sizeof(icmp6dst)); icmp6dst.sin6_len = sizeof(struct sockaddr_in6); icmp6dst.sin6_family = AF_INET6; @@ -1070,39 +1124,16 @@ icmp6_notify_error(m, off, icmp6len, code) icmp6dst.sin6_addr = eip6->ip6_dst; else icmp6dst.sin6_addr = *finaldst; - icmp6dst.sin6_scope_id = in6_addr2scopeid(m->m_pkthdr.rcvif, - &icmp6dst.sin6_addr); -#ifndef SCOPEDROUTING - if (in6_embedscope(&icmp6dst.sin6_addr, &icmp6dst, - NULL, NULL)) { - /* should be impossbile */ - nd6log((LOG_DEBUG, - "icmp6_notify_error: in6_embedscope failed\n")); + if (in6_setscope(&icmp6dst.sin6_addr, m->m_pkthdr.rcvif, NULL)) goto freeit; - } -#endif - - /* - * retrieve parameters from the inner IPv6 header, and convert - * them into sockaddr structures. - */ bzero(&icmp6src, sizeof(icmp6src)); icmp6src.sin6_len = sizeof(struct sockaddr_in6); icmp6src.sin6_family = AF_INET6; icmp6src.sin6_addr = eip6->ip6_src; - icmp6src.sin6_scope_id = in6_addr2scopeid(m->m_pkthdr.rcvif, - &icmp6src.sin6_addr); -#ifndef SCOPEDROUTING - if (in6_embedscope(&icmp6src.sin6_addr, &icmp6src, - NULL, NULL)) { - /* should be impossbile */ - nd6log((LOG_DEBUG, - "icmp6_notify_error: in6_embedscope failed\n")); + if (in6_setscope(&icmp6src.sin6_addr, m->m_pkthdr.rcvif, NULL)) goto freeit; - } -#endif icmp6src.sin6_flowinfo = - (eip6->ip6_flow & IPV6_FLOWLABEL_MASK); + (eip6->ip6_flow & IPV6_FLOWLABEL_MASK); if (finaldst == NULL) finaldst = &eip6->ip6_dst; @@ -1145,9 +1176,16 @@ icmp6_mtudisc_update(ip6cp, validated) u_int mtu = ntohl(icmp6->icmp6_mtu); struct rtentry *rt = NULL; struct sockaddr_in6 sin6; + /* + * we reject ICMPv6 too big with abnormally small value. + * XXX what is the good definition of "abnormally small"? 
+ */ + if (mtu < sizeof(struct ip6_hdr) + sizeof(struct ip6_frag) + 8) + return; if (!validated) return; + /* * In case the suggested mtu is less than IPV6_MMTU, we * only need to remember that it was for above mentioned @@ -1167,19 +1205,16 @@ icmp6_mtudisc_update(ip6cp, validated) htons(m->m_pkthdr.rcvif->if_index); } /* sin6.sin6_scope_id = XXX: should be set if DST is a scoped addr */ - rt = rtalloc1((struct sockaddr *)&sin6, 0, RTF_CLONING | RTF_PRCLONING); + rt = rtalloc1_scoped((struct sockaddr *)&sin6, 0, + RTF_CLONING | RTF_PRCLONING, m->m_pkthdr.rcvif->if_index); if (rt != NULL) { RT_LOCK(rt); if ((rt->rt_flags & RTF_HOST) && - !(rt->rt_rmx.rmx_locks & RTV_MTU)) { - if (mtu < IPV6_MMTU) { - /* xxx */ - rt->rt_rmx.rmx_locks |= RTV_MTU; - } else if (mtu < rt->rt_ifp->if_mtu && - rt->rt_rmx.rmx_mtu > mtu) { - icmp6stat.icp6s_pmtuchg++; - rt->rt_rmx.rmx_mtu = mtu; - } + !(rt->rt_rmx.rmx_locks & RTV_MTU) && + mtu < IN6_LINKMTU(rt->rt_ifp) && + rt->rt_rmx.rmx_mtu > mtu) { + icmp6stat.icp6s_pmtuchg++; + rt->rt_rmx.rmx_mtu = mtu; } RT_UNLOCK(rt); rtfree(rt); @@ -1189,7 +1224,7 @@ icmp6_mtudisc_update(ip6cp, validated) /* * Process a Node Information Query packet, based on * draft-ietf-ipngwg-icmp-name-lookups-07. - * + * * Spec incompatibilities: * - IPv6 Subject address handling * - IPv4 Subject address handling support missing @@ -1216,7 +1251,6 @@ ni6_input(m, off) struct ip6_hdr *ip6; int oldfqdn = 0; /* if 1, return pascal string (03 draft) */ char *subj = NULL; - struct in6_ifaddr *ia6 = NULL; ip6 = mtod(m, struct ip6_hdr *); #ifndef PULLDOWN_TEST @@ -1225,40 +1259,59 @@ ni6_input(m, off) IP6_EXTHDR_GET(ni6, struct icmp6_nodeinfo *, m, off, sizeof(*ni6)); if (ni6 == NULL) { /* m is already reclaimed */ - return NULL; + return (NULL); } #endif + /* + * Validate IPv6 source address. + * The default configuration MUST be to refuse answering queries from + * global-scope addresses according to RFC4602. 
+ * Notes: + * - it's not very clear what "refuse" means; this implementation + * simply drops it. + * - it's not very easy to identify global-scope (unicast) addresses + * since there are many prefixes for them. It should be safer + * and in practice sufficient to check "all" but loopback and + * link-local (note that site-local unicast was deprecated and + * ULA is defined as global scope-wise) + */ + if ((icmp6_nodeinfo & ICMP6_NODEINFO_GLOBALOK) == 0 && + !IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src) && + !IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src)) + goto bad; + /* * Validate IPv6 destination address. * * The Responder must discard the Query without further processing * unless it is one of the Responder's unicast or anycast addresses, or * a link-local scope multicast address which the Responder has joined. - * [icmp-name-lookups-07, Section 4.] + * [RFC4602, Section 5.] */ - bzero(&sin6, sizeof(sin6)); - sin6.sin6_family = AF_INET6; - sin6.sin6_len = sizeof(struct sockaddr_in6); - bcopy(&ip6->ip6_dst, &sin6.sin6_addr, sizeof(sin6.sin6_addr)); - /* XXX scopeid */ - if ((ia6 = (struct in6_ifaddr *)ifa_ifwithaddr((struct sockaddr *)&sin6)) != NULL) { - /* unicast/anycast, fine */ - if ((ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && - (icmp6_nodeinfo & 4) == 0) { - ifafree(&ia6->ia_ifa); - ia6 = NULL; + if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { + if (!IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst)) + goto bad; + /* else it's a link-local multicast, fine */ + } else { /* unicast or anycast */ + struct in6_ifaddr *ia6; + + if ((ia6 = ip6_getdstifaddr(m)) == NULL) + goto bad; /* XXX impossible */ + + IFA_LOCK(&ia6->ia_ifa); + if ((ia6->ia6_flags & IN6_IFF_TEMPORARY) && + !(icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK)) { nd6log((LOG_DEBUG, "ni6_input: ignore node info to " "a temporary address in %s:%d", __FILE__, __LINE__)); + IFA_UNLOCK(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); goto bad; } - ifafree(&ia6->ia_ifa); - ia6 = NULL; - } else if (IN6_IS_ADDR_MC_LINKLOCAL(&sin6.sin6_addr)) - ; 
/* link-local multicast, fine */ - else - goto bad; + IFA_UNLOCK(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); + } /* validate query Subject field. */ qtype = ntohs(ni6->ni_qtype); @@ -1272,6 +1325,7 @@ ni6_input(m, off) /* FALLTHROUGH */ case NI_QTYPE_FQDN: case NI_QTYPE_NODEADDR: + case NI_QTYPE_IPV4ADDR: switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: #if ICMP6_NI_SUBJ_IPV6 != 0 @@ -1291,7 +1345,7 @@ ni6_input(m, off) goto bad; #endif - if (subjlen != sizeof(sin6.sin6_addr)) + if (subjlen != sizeof(struct in6_addr)) goto bad; /* @@ -1313,18 +1367,16 @@ ni6_input(m, off) subjlen, (caddr_t)&sin6.sin6_addr); sin6.sin6_scope_id = in6_addr2scopeid(m->m_pkthdr.rcvif, &sin6.sin6_addr); -#ifndef SCOPEDROUTING - in6_embedscope(&sin6.sin6_addr, &sin6, NULL, NULL); -#endif + in6_embedscope(&sin6.sin6_addr, &sin6, NULL, NULL, + NULL); bzero(&sin6_d, sizeof(sin6_d)); sin6_d.sin6_family = AF_INET6; /* not used, actually */ sin6_d.sin6_len = sizeof(sin6_d); /* ditto */ sin6_d.sin6_addr = ip6->ip6_dst; sin6_d.sin6_scope_id = in6_addr2scopeid(m->m_pkthdr.rcvif, &ip6->ip6_dst); -#ifndef SCOPEDROUTING - in6_embedscope(&sin6_d.sin6_addr, &sin6_d, NULL, NULL); -#endif + in6_embedscope(&sin6_d.sin6_addr, &sin6_d, NULL, NULL, + NULL); subj = (char *)&sin6; if (SA6_ARE_ADDR_EQUAL(&sin6, &sin6_d)) break; @@ -1333,7 +1385,8 @@ ni6_input(m, off) * XXX if we are to allow other cases, we should really * be careful about scope here. * basically, we should disallow queries toward IPv6 - * destination X with subject Y, if scope(X) > scope(Y). + * destination X with subject Y, + * if scope(X) > scope(Y). * if we allow scope(X) > scope(Y), it will result in * information leakage across scope boundary. */ @@ -1376,11 +1429,12 @@ ni6_input(m, off) /* refuse based on configuration. XXX ICMP6_NI_REFUSED? 
*/ switch (qtype) { case NI_QTYPE_FQDN: - if ((icmp6_nodeinfo & 1) == 0) + if ((icmp6_nodeinfo & ICMP6_NODEINFO_FQDNOK) == 0) goto bad; break; case NI_QTYPE_NODEADDR: - if ((icmp6_nodeinfo & 2) == 0) + case NI_QTYPE_IPV4ADDR: + if ((icmp6_nodeinfo & ICMP6_NODEINFO_NODEADDROK) == 0) goto bad; break; } @@ -1399,13 +1453,16 @@ ni6_input(m, off) case NI_QTYPE_NODEADDR: addrs = ni6_addrs(ni6, &ifp, subj); if ((replylen += addrs * (sizeof(struct in6_addr) + - sizeof(u_int32_t))) > MCLBYTES) + sizeof(u_int32_t))) > MCLBYTES) replylen = MCLBYTES; /* XXX: will truncate pkt later */ break; + case NI_QTYPE_IPV4ADDR: + /* unsupported - should respond with unknown Qtype? */ + break; default: /* * XXX: We must return a reply with the ICMP6 code - * `unknown Qtype' in this case. However we regard the case + * `unknown Qtype' in this case. However we regard the case * as an FQDN query for backward compatibility. * Older versions set a random value to this field, * so it rarely varies in the defined qtypes. @@ -1423,7 +1480,9 @@ ni6_input(m, off) MGETHDR(n, M_DONTWAIT, m->m_type); /* MAC-OK */ if (n == NULL) { m_freem(m); - return(NULL); + if (ifp != NULL) + ifnet_release(ifp); + return (NULL); } M_COPY_PKTHDR(n, m); /* just for recvif */ if (replylen > MHLEN) { @@ -1500,13 +1559,17 @@ ni6_input(m, off) nni6->ni_type = ICMP6_NI_REPLY; m_freem(m); - return(n); + if (ifp != NULL) + ifnet_release(ifp); + return (n); - bad: +bad: m_freem(m); if (n) m_freem(n); - return(NULL); + if (ifp != NULL) + ifnet_release(ifp); + return (NULL); } #undef hostnamelen @@ -1693,6 +1756,9 @@ ni6_addrs(ni6, ifpp, subj) int addrs = 0, addrsofif, iffound = 0; int niflags = ni6->ni_flags; + if (ifpp != NULL) + *ifpp = NULL; + if ((niflags & NI_NODEADDR_FLAG_ALL) == 0) { switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: @@ -1705,7 +1771,7 @@ ni6_addrs(ni6, ifpp, subj) * XXX: we only support IPv6 subject address for * this Qtype. 
*/ - return(0); + return (0); } } @@ -1715,8 +1781,11 @@ ni6_addrs(ni6, ifpp, subj) ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; + } ifa6 = (struct in6_ifaddr *)ifa; if ((niflags & NI_NODEADDR_FLAG_ALL) == 0 && @@ -1737,18 +1806,25 @@ ni6_addrs(ni6, ifpp, subj) /* What do we have to do about ::1? */ switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) { case IPV6_ADDR_SCOPE_LINKLOCAL: - if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) + if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) { + IFA_UNLOCK(ifa); continue; + } break; case IPV6_ADDR_SCOPE_SITELOCAL: - if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) + if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) { + IFA_UNLOCK(ifa); continue; + } break; case IPV6_ADDR_SCOPE_GLOBAL: - if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) + if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) { + IFA_UNLOCK(ifa); continue; + } break; default: + IFA_UNLOCK(ifa); continue; } @@ -1757,17 +1833,24 @@ ni6_addrs(ni6, ifpp, subj) * XXX: just experimental. not in the spec. 
*/ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 && - (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) + (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) { + IFA_UNLOCK(ifa); continue; /* we need only unicast addresses */ + } if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && - (icmp6_nodeinfo & 4) == 0) { + (icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) { + IFA_UNLOCK(ifa); continue; } addrsofif++; /* count the address */ + IFA_UNLOCK(ifa); } ifnet_lock_done(ifp); if (iffound) { - *ifpp = ifp; + if (ifpp != NULL) { + *ifpp = ifp; + ifnet_reference(ifp); + } ifnet_head_done(); return(addrsofif); } @@ -1776,7 +1859,7 @@ ni6_addrs(ni6, ifpp, subj) } ifnet_head_done(); - return(addrs); + return (addrs); } static int @@ -1798,20 +1881,23 @@ ni6_store_addrs(ni6, nni6, ifp0, resid) getmicrotime(&timenow); if (ifp0 == NULL && !(niflags & NI_NODEADDR_FLAG_ALL)) - return(0); /* needless to copy */ + return (0); /* needless to copy */ again: ifnet_head_lock_shared(); - if (ifp == NULL) ifp = TAILQ_FIRST(&ifnet_head); - + if (ifp == NULL) + ifp = TAILQ_FIRST(&ifnet_head); + for (; ifp; ifp = TAILQ_NEXT(ifp, if_list)) { ifnet_lock_shared(ifp); for (ifa = ifp->if_addrlist.tqh_first; ifa; - ifa = ifa->ifa_list.tqe_next) - { - if (ifa->ifa_addr->sa_family != AF_INET6) + ifa = ifa->ifa_list.tqe_next) { + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; + } ifa6 = (struct in6_ifaddr *)ifa; if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) != 0 && @@ -1825,45 +1911,57 @@ ni6_store_addrs(ni6, nni6, ifp0, resid) if (ifp_dep == NULL) ifp_dep = ifp; + IFA_UNLOCK(ifa); continue; - } - else if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) == 0 && - allow_deprecated != 0) + } else if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) == 0 && + allow_deprecated != 0) { + IFA_UNLOCK(ifa); continue; /* we now collect deprecated addrs */ - + } /* What do we have to do about ::1? 
*/ switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) { case IPV6_ADDR_SCOPE_LINKLOCAL: - if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) + if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) { + IFA_UNLOCK(ifa); continue; + } break; case IPV6_ADDR_SCOPE_SITELOCAL: - if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) + if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) { + IFA_UNLOCK(ifa); continue; + } break; case IPV6_ADDR_SCOPE_GLOBAL: - if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) + if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) { + IFA_UNLOCK(ifa); continue; + } break; default: + IFA_UNLOCK(ifa); continue; } /* * check if anycast is okay. - * XXX: just experimental. not in the spec. + * XXX: just experimental. not in the spec. */ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 && - (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) + (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) { + IFA_UNLOCK(ifa); continue; + } if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && - (icmp6_nodeinfo & 4) == 0) { + (icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) { + IFA_UNLOCK(ifa); continue; } /* now we can copy the address */ if (resid < sizeof(struct in6_addr) + sizeof(u_int32_t)) { + IFA_UNLOCK(ifa); /* * We give up much more copy. * Set the truncate flag and return. @@ -1890,7 +1988,8 @@ ni6_store_addrs(ni6, nni6, ifp0, resid) * address configuration by DHCPv6, so the former * case can't happen. 
*/ - if (ifa6->ia6_lifetime.ia6t_expire == 0) + if (ifa6->ia6_lifetime.ia6t_expire == 0 && + (ifa6->ia6_flags & IN6_IFF_TEMPORARY) == 0) ltime = ND6_INFINITE_LIFETIME; else { if (ifa6->ia6_lifetime.ia6t_expire > @@ -1899,7 +1998,7 @@ ni6_store_addrs(ni6, nni6, ifp0, resid) else ltime = 0; } - + bcopy(&ltime, cp, sizeof(u_int32_t)); cp += sizeof(u_int32_t); @@ -1910,10 +2009,11 @@ ni6_store_addrs(ni6, nni6, ifp0, resid) if (IN6_IS_ADDR_LINKLOCAL(&ifa6->ia_addr.sin6_addr)) ((struct in6_addr *)cp)->s6_addr16[1] = 0; cp += sizeof(struct in6_addr); - + resid -= (sizeof(struct in6_addr) + sizeof(u_int32_t)); copied += (sizeof(struct in6_addr) + sizeof(u_int32_t)); + IFA_UNLOCK(ifa); } ifnet_lock_done(ifp); if (ifp0) /* we need search only on the specified IF */ @@ -1946,6 +2046,7 @@ icmp6_rip6_input(mp, off) struct sockaddr_in6 rip6src; struct icmp6_hdr *icmp6; struct mbuf *opts = NULL; + int ret = 0; #ifndef PULLDOWN_TEST /* this is assumed to be safe. */ @@ -1958,21 +2059,22 @@ icmp6_rip6_input(mp, off) } #endif + /* + * XXX: the address may have embedded scope zone ID, which should be + * hidden from applications. 
+ */ bzero(&rip6src, sizeof(rip6src)); - rip6src.sin6_len = sizeof(struct sockaddr_in6); rip6src.sin6_family = AF_INET6; - /* KAME hack: recover scopeid */ - (void)in6_recoverscope(&rip6src, &ip6->ip6_src, m->m_pkthdr.rcvif); - + rip6src.sin6_len = sizeof(struct sockaddr_in6); + rip6src.sin6_addr = ip6->ip6_src; + if (sa6_recoverscope(&rip6src)) + return (IPPROTO_DONE); + lck_rw_lock_shared(ripcbinfo.mtx); LIST_FOREACH(in6p, &ripcb, inp_list) { if ((in6p->inp_vflag & INP_IPV6) == 0) continue; -#if HAVE_NRL_INPCB - if (!(in6p->in6p_flags & INP_IPV6)) - continue; -#endif if (in6p->in6p_ip6_nxt != IPPROTO_ICMPV6) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr) && @@ -1988,10 +2090,20 @@ icmp6_rip6_input(mp, off) if (last) { struct mbuf *n; if ((n = m_copy(m, 0, (int)M_COPYALL)) != NULL) { - if (last->in6p_flags & IN6P_CONTROLOPTS) - ip6_savecontrol(last, &opts, ip6, n); + if ((last->in6p_flags & IN6P_CONTROLOPTS) != 0 || + (last->in6p_socket->so_options & SO_TIMESTAMP) != 0 || + (last->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + ret = ip6_savecontrol(last, n, &opts); + if (ret != 0) { + m_freem(n); + m_freem(opts); + last = in6p; + continue; + } + } /* strip intermediate headers */ m_adj(n, off); + so_recv_data_stat(last->in6p_socket, m, 0); if (sbappendaddr(&last->in6p_socket->so_rcv, (struct sockaddr *)&rip6src, n, opts, NULL) != 0) { @@ -2002,21 +2114,35 @@ icmp6_rip6_input(mp, off) } last = in6p; } - lck_rw_done(ripcbinfo.mtx); if (last) { - if (last->in6p_flags & IN6P_CONTROLOPTS) - ip6_savecontrol(last, &opts, ip6, m); + if ((last->in6p_flags & INP_CONTROLOPTS) != 0 || + (last->in6p_socket->so_options & SO_TIMESTAMP) != 0 || + (last->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + ret = ip6_savecontrol(last, m, &opts); + if (ret != 0) { + goto error; + } + } /* strip intermediate headers */ m_adj(m, off); + so_recv_data_stat(last->in6p_socket, m, 0); if (sbappendaddr(&last->in6p_socket->so_rcv, (struct sockaddr *)&rip6src, 
m, opts, NULL) != 0) { sorwakeup(last->in6p_socket); } } else { - m_freem(m); - ip6stat.ip6s_delivered--; + goto error; } + lck_rw_done(ripcbinfo.mtx); return IPPROTO_DONE; + +error: + lck_rw_done(ripcbinfo.mtx); + m_freem(m); + m_freem(opts); + ip6stat.ip6s_delivered--; + return IPPROTO_DONE; + } /* @@ -2036,11 +2162,11 @@ icmp6_reflect(m, off) int type, code; struct ifnet *outif = NULL; struct sockaddr_in6 sa6_src, sa6_dst; -#ifdef COMPAT_RFC1885 - int mtu = IPV6_MMTU; - struct sockaddr_in6 *sin6 = &icmp6_reflect_rt.ro_dst; -#endif u_int32_t oflow; + struct ip6_out_args ip6oa = { IFSCOPE_NONE, 0 }; + + if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif != NULL) + ip6oa.ip6oa_boundif = m->m_pkthdr.rcvif->if_index; /* too short to reflect */ if (off < sizeof(struct ip6_hdr)) { @@ -2098,74 +2224,42 @@ icmp6_reflect(m, off) * XXX: make sure to embed scope zone information, using * already embedded IDs or the received interface (if any). * Note that rcvif may be NULL. - * TODO: scoped routing case (XXX). */ bzero(&sa6_src, sizeof(sa6_src)); sa6_src.sin6_family = AF_INET6; sa6_src.sin6_len = sizeof(sa6_src); sa6_src.sin6_addr = ip6->ip6_dst; in6_recoverscope(&sa6_src, &ip6->ip6_dst, m->m_pkthdr.rcvif); - in6_embedscope(&ip6->ip6_dst, &sa6_src, NULL, NULL); + in6_embedscope(&ip6->ip6_dst, &sa6_src, NULL, NULL, NULL); bzero(&sa6_dst, sizeof(sa6_dst)); sa6_dst.sin6_family = AF_INET6; sa6_dst.sin6_len = sizeof(sa6_dst); sa6_dst.sin6_addr = t; in6_recoverscope(&sa6_dst, &t, m->m_pkthdr.rcvif); - in6_embedscope(&t, &sa6_dst, NULL, NULL); + in6_embedscope(&t, &sa6_dst, NULL, NULL, NULL); -#ifdef COMPAT_RFC1885 - /* - * xxx guess MTU - * RFC 1885 requires that echo reply should be truncated if it - * does not fit in with (return) path MTU, but the description was - * removed in the new spec. - */ - if (icmp6_reflect_rt.ro_rt == NULL || - !(icmp6_reflect_rt.ro_rt->rt_flags & RTF_UP) || - icmp6_reflect_rt.ro_rt->generation_id != route_generation || - ! 
(IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, &ip6->ip6_dst))) { - if (icmp6_reflect_rt.ro_rt) { - rtfree(icmp6_reflect_rt.ro_rt); - icmp6_reflect_rt.ro_rt = 0; - } - bzero(sin6, sizeof(*sin6)); - sin6->sin6_family = PF_INET6; - sin6->sin6_len = sizeof(struct sockaddr_in6); - sin6->sin6_addr = ip6->ip6_dst; - - rtalloc_ign((struct route *)&icmp6_reflect_rt.ro_rt, - RTF_PRCLONING); - } - - if (icmp6_reflect_rt.ro_rt == 0) - goto bad; - - RT_LOCK(icmp6_reflect_rt.ro_rt); - if ((icmp6_reflect_rt.ro_rt->rt_flags & RTF_HOST) - && mtu < icmp6_reflect_rt.ro_rt->rt_ifp->if_mtu) - mtu = icmp6_reflect_rt.ro_rt->rt_rmx.rmx_mtu; - RT_UNLOCK(icmp6_reflect_rt.ro_rt); - - if (mtu < m->m_pkthdr.len) { - plen -= (m->m_pkthdr.len - mtu); - m_adj(m, mtu - m->m_pkthdr.len); - } -#endif /* * If the incoming packet was addressed directly to us(i.e. unicast), * use dst as the src for the reply. - * The IN6_IFF_NOTREADY case would be VERY rare, but is possible + * The IN6_IFF_NOTREADY case should be VERY rare, but is possible * (for example) when we encounter an error while forwarding procedure * destined to a duplicated address of ours. + * Note that ip6_getdstifaddr() may fail if we are in an error handling + * procedure of an outgoing packet of our own, in which case we need + * to search in the ifaddr list. 
*/ - lck_mtx_lock(nd6_mutex); - for (ia = in6_ifaddrs; ia; ia = ia->ia_next) + lck_rw_lock_shared(&in6_ifaddr_rwlock); + for (ia = in6_ifaddrs; ia; ia = ia->ia_next) { + IFA_LOCK(&ia->ia_ifa); if (IN6_ARE_ADDR_EQUAL(&t, &ia->ia_addr.sin6_addr) && (ia->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY)) == 0) { + IFA_UNLOCK(&ia->ia_ifa); src = &t; break; } - lck_mtx_unlock(nd6_mutex); + IFA_UNLOCK(&ia->ia_ifa); + } + lck_rw_done(&in6_ifaddr_rwlock); if (ia == NULL && IN6_IS_ADDR_LINKLOCAL(&t) && (m->m_flags & M_LOOP)) { /* * This is the case if the dst is our link-local address @@ -2174,8 +2268,9 @@ icmp6_reflect(m, off) src = &t; } - if (src == 0) { + if (src == NULL) { int e; + struct sockaddr_in6 sin6; struct route_in6 ro; /* @@ -2183,8 +2278,14 @@ icmp6_reflect(m, off) * that we do not own. Select a source address based on the * source address of the erroneous packet. */ + bzero(&sin6, sizeof(sin6)); + sin6.sin6_family = AF_INET6; + sin6.sin6_len = sizeof(sin6); + sin6.sin6_addr = ip6->ip6_dst; /* zone ID should be embedded */ + bzero(&ro, sizeof(ro)); - src = in6_selectsrc(&sa6_src, NULL, NULL, &ro, NULL, &src_storage, &e); + src = in6_selectsrc(&sin6, NULL, NULL, &ro, &outif, + &src_storage, ip6oa.ip6oa_boundif, &e); if (ro.ro_rt) rtfree(ro.ro_rt); /* XXX: we could use this */ if (src == NULL) { @@ -2195,10 +2296,8 @@ icmp6_reflect(m, off) goto bad; } } - - ip6->ip6_src = *src; - oflow = ip6->ip6_flow; /* Save for later */ + ip6->ip6_src = *src; ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; @@ -2206,14 +2305,16 @@ icmp6_reflect(m, off) ip6->ip6_flow |= (oflow & htonl(0x0ff00000)); } ip6->ip6_nxt = IPPROTO_ICMPV6; + lck_rw_lock_shared(nd_if_rwlock); + if (outif) + ip6->ip6_hlim = ND_IFINFO(outif)->chlim; if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_index < nd_ifinfo_indexlim) { /* XXX: This may not be the outgoing interface */ - lck_rw_lock_shared(nd_if_rwlock); ip6->ip6_hlim = nd_ifinfo[m->m_pkthdr.rcvif->if_index].chlim; - 
lck_rw_done(nd_if_rwlock); } else { ip6->ip6_hlim = ip6_defhlim; } + lck_rw_done(nd_if_rwlock); /* Use the same traffic class as in the request to match IPv4 */ icmp6->icmp6_cksum = 0; icmp6->icmp6_cksum = in6_cksum(m, IPPROTO_ICMPV6, @@ -2230,28 +2331,24 @@ icmp6_reflect(m, off) (void)ipsec_setsocket(m, NULL); #endif /*IPSEC*/ -#ifdef COMPAT_RFC1885 - ip6_output(m, NULL, &icmp6_reflect_rt, 0, NULL, &outif, 0); -#else - ip6_output(m, NULL, NULL, 0, NULL, &outif, 0); -#endif - if (outif) + if (outif != NULL) { + ifnet_release(outif); + outif = NULL; + } + ip6_output(m, NULL, NULL, IPV6_OUTARGS, NULL, &outif, &ip6oa); + if (outif != NULL) { icmp6_ifoutstat_inc(outif, type, code); - + ifnet_release(outif); + } return; - bad: +bad: m_freem(m); + if (outif != NULL) + ifnet_release(outif); return; } -void -icmp6_fasttimo() -{ - - mld6_fasttimeo(); -} - static const char * icmp6_redirect_diag(src6, dst6, tgt6) struct in6_addr *src6; @@ -2307,10 +2404,10 @@ icmp6_redirect_input(m, off) redtgt6 = nd_rd->nd_rd_target; reddst6 = nd_rd->nd_rd_dst; - if (IN6_IS_ADDR_LINKLOCAL(&redtgt6)) - redtgt6.s6_addr16[1] = htons(ifp->if_index); - if (IN6_IS_ADDR_LINKLOCAL(&reddst6)) - reddst6.s6_addr16[1] = htons(ifp->if_index); + if (in6_setscope(&redtgt6, m->m_pkthdr.rcvif, NULL) || + in6_setscope(&reddst6, m->m_pkthdr.rcvif, NULL)) { + goto freeit; + } /* validation */ if (!IN6_IS_ADDR_LINKLOCAL(&src6)) { @@ -2335,7 +2432,7 @@ icmp6_redirect_input(m, off) sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); bcopy(&reddst6, &sin6.sin6_addr, sizeof(reddst6)); - rt = rtalloc1((struct sockaddr *)&sin6, 0, 0); + rt = rtalloc1_scoped((struct sockaddr *)&sin6, 0, 0, ifp->if_index); if (rt) { RT_LOCK(rt); if (rt->rt_gateway == NULL || @@ -2494,6 +2591,7 @@ icmp6_redirect_output(m0, rt) u_char *p; struct ifnet *outif = NULL; struct sockaddr_in6 src_sa; + struct ip6_out_args ip6oa = { IFSCOPE_NONE, 0 }; icmp6_errcount(&icmp6stat.icp6s_outerrhist, ND_REDIRECT, 0); @@ -2565,8 
+2663,10 @@ icmp6_redirect_output(m0, rt) IN6_IFF_NOTREADY| IN6_IFF_ANYCAST)) == NULL) goto fail; + IFA_LOCK(&ia->ia_ifa); ifp_ll6 = ia->ia_addr.sin6_addr; - ifafree(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); } /* get ip6 linklocal address for the router. */ @@ -2622,42 +2722,44 @@ icmp6_redirect_output(m0, rt) if (!router_ll6) goto nolladdropt; - { - /* target lladdr option */ - struct rtentry *rt_router = NULL; - int len; - struct sockaddr_dl *sdl; - struct nd_opt_hdr *nd_opt; - char *lladdr; - - /* Callee returns a locked route upon success */ - rt_router = nd6_lookup(router_ll6, 0, ifp, 0); - if (!rt_router) - goto nolladdropt; - RT_LOCK_ASSERT_HELD(rt_router); - len = sizeof(*nd_opt) + ifp->if_addrlen; - len = (len + 7) & ~7; /* round by 8 */ - /* safety check */ - if (len + (p - (u_char *)ip6) > maxlen) { + { + /* target lladdr option */ + struct rtentry *rt_router = NULL; + int len; + struct sockaddr_dl *sdl; + struct nd_opt_hdr *nd_opt; + char *lladdr; + + /* Callee returns a locked route upon success */ + rt_router = nd6_lookup(router_ll6, 0, ifp, 0); + if (!rt_router) + goto nolladdropt; + RT_LOCK_ASSERT_HELD(rt_router); + len = sizeof(*nd_opt) + ifp->if_addrlen; + len = (len + 7) & ~7; /* round by 8 */ + /* safety check */ + if (len + (p - (u_char *)ip6) > maxlen) { + RT_REMREF_LOCKED(rt_router); + RT_UNLOCK(rt_router); + goto nolladdropt; + } + + if (!(rt_router->rt_flags & RTF_GATEWAY) && + (rt_router->rt_flags & RTF_LLINFO) && + (rt_router->rt_gateway->sa_family == AF_LINK) && + (sdl = (struct sockaddr_dl *)rt_router->rt_gateway) && + sdl->sdl_alen) { + nd_opt = (struct nd_opt_hdr *)p; + nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; + nd_opt->nd_opt_len = len >> 3; + lladdr = (char *)(nd_opt + 1); + bcopy(LLADDR(sdl), lladdr, ifp->if_addrlen); + p += len; + } RT_REMREF_LOCKED(rt_router); RT_UNLOCK(rt_router); - goto nolladdropt; - } - if (!(rt_router->rt_flags & RTF_GATEWAY) && - (rt_router->rt_flags & RTF_LLINFO) && - 
(rt_router->rt_gateway->sa_family == AF_LINK) && - (sdl = (struct sockaddr_dl *)rt_router->rt_gateway) && - sdl->sdl_alen) { - nd_opt = (struct nd_opt_hdr *)p; - nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; - nd_opt->nd_opt_len = len >> 3; - lladdr = (char *)(nd_opt + 1); - bcopy(LLADDR(sdl), lladdr, ifp->if_addrlen); - p += len; - } - RT_REMREF_LOCKED(rt_router); - RT_UNLOCK(rt_router); - } + } + nolladdropt:; m->m_pkthdr.len = m->m_len = p - (u_char *)ip6; @@ -2741,20 +2843,11 @@ nolladdropt:; } noredhdropt:; - if (IN6_IS_ADDR_LINKLOCAL(&sip6->ip6_src)) - sip6->ip6_src.s6_addr16[1] = 0; - if (IN6_IS_ADDR_LINKLOCAL(&sip6->ip6_dst)) - sip6->ip6_dst.s6_addr16[1] = 0; -#if 0 - if (IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src)) - ip6->ip6_src.s6_addr16[1] = 0; - if (IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_dst)) - ip6->ip6_dst.s6_addr16[1] = 0; -#endif - if (IN6_IS_ADDR_LINKLOCAL(&nd_rd->nd_rd_target)) - nd_rd->nd_rd_target.s6_addr16[1] = 0; - if (IN6_IS_ADDR_LINKLOCAL(&nd_rd->nd_rd_dst)) - nd_rd->nd_rd_dst.s6_addr16[1] = 0; + /* XXX: clear embedded link IDs in the inner header */ + in6_clearscope(&sip6->ip6_src); + in6_clearscope(&sip6->ip6_dst); + in6_clearscope(&nd_rd->nd_rd_target); + in6_clearscope(&nd_rd->nd_rd_dst); ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr)); @@ -2768,10 +2861,14 @@ noredhdropt:; if (ipsec_bypass == 0) (void)ipsec_setsocket(m, NULL); #endif /*IPSEC*/ - ip6_output(m, NULL, NULL, 0, NULL, &outif, 0); + + ip6oa.ip6oa_boundif = ifp->if_index; + + ip6_output(m, NULL, NULL, IPV6_OUTARGS, NULL, &outif, &ip6oa); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); icmp6_ifstat_inc(outif, ifs6_out_redirect); + ifnet_release(outif); } icmp6stat.icp6s_outhist[ND_REDIRECT]++; @@ -2786,11 +2883,6 @@ noredhdropt:; m_freem(m0); } -#if HAVE_NRL_INPCB -#define sotoin6pcb sotoinpcb -#define in6pcb inpcb -#define in6p_icmp6filt inp_icmp6filt -#endif /* * ICMPv6 socket option processing. 
*/ @@ -2823,7 +2915,7 @@ icmp6_ctloutput(so, sopt) { struct icmp6_filter *p; - if (optlen != sizeof(*p)) { + if (optlen != 0 && optlen != sizeof(*p)) { error = EMSGSIZE; break; } @@ -2831,8 +2923,17 @@ icmp6_ctloutput(so, sopt) error = EINVAL; break; } - error = sooptcopyin(sopt, inp->in6p_icmp6filt, optlen, - optlen); + + if (optlen == 0) { + /* According to RFC 3542, an installed filter can be + * cleared by issuing a setsockopt for ICMP6_FILTER + * with a zero length. + */ + ICMP6_FILTER_SETPASSALL(inp->in6p_icmp6filt); + } else { + error = sooptcopyin(sopt, inp->in6p_icmp6filt, optlen, + optlen); + } break; } @@ -2851,7 +2952,7 @@ icmp6_ctloutput(so, sopt) break; } error = sooptcopyout(sopt, inp->in6p_icmp6filt, - sizeof(struct icmp6_filter)); + min(sizeof(struct icmp6_filter), optlen)); break; } @@ -2864,11 +2965,6 @@ icmp6_ctloutput(so, sopt) return(error); } -#if HAVE_NRL_INPCB -#undef sotoin6pcb -#undef in6pcb -#undef in6p_icmp6filt -#endif /* * ICMPv6 socket datagram option processing. 
@@ -2892,16 +2988,19 @@ icmp6_dgram_ctloutput(struct socket *so, struct sockopt *sopt) return EINVAL; switch (sopt->sopt_name) { - case IPV6_PKTOPTIONS: case IPV6_UNICAST_HOPS: case IPV6_CHECKSUM: case IPV6_FAITH: case IPV6_V6ONLY: + case IPV6_USE_MIN_MTU: + case IPV6_RECVRTHDR: + case IPV6_RECVPKTINFO: + case IPV6_RECVHOPLIMIT: + case IPV6_PATHMTU: case IPV6_PKTINFO: case IPV6_HOPLIMIT: case IPV6_HOPOPTS: case IPV6_DSTOPTS: - case IPV6_RTHDR: case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: @@ -2911,6 +3010,15 @@ icmp6_dgram_ctloutput(struct socket *so, struct sockopt *sopt) case IPV6_IPSEC_POLICY: case IPV6_RECVTCLASS: case IPV6_TCLASS: + case IPV6_2292PKTOPTIONS: + case IPV6_2292PKTINFO: + case IPV6_2292HOPLIMIT: + case IPV6_2292HOPOPTS: + case IPV6_2292DSTOPTS: + case IPV6_2292RTHDR: + case IPV6_BOUND_IF: + case IPV6_NO_IFT_CELLULAR: + return ip6_ctloutput(so, sopt); default: @@ -2921,23 +3029,24 @@ icmp6_dgram_ctloutput(struct socket *so, struct sockopt *sopt) } __private_extern__ int -icmp6_dgram_send(struct socket *so, __unused int flags, struct mbuf *m, struct sockaddr *nam, - struct mbuf *control, __unused struct proc *p) +icmp6_dgram_send(struct socket *so, int flags, struct mbuf *m, + struct sockaddr *nam, struct mbuf *control, struct proc *p) { +#pragma unused(flags, p) int error = 0; struct inpcb *inp = sotoinpcb(so); struct sockaddr_in6 tmp; - struct sockaddr_in6 *dst; + struct sockaddr_in6 *dst = (struct sockaddr_in6 *)nam; struct icmp6_hdr *icmp6; if (so->so_uid == 0) - return rip6_output(m, so, (struct sockaddr_in6 *) nam, control); + return rip6_output(m, so, (struct sockaddr_in6 *) nam, control, 0); /* always copy sockaddr to avoid overwrites */ if (so->so_state & SS_ISCONNECTED) { if (nam) { - m_freem(m); - return EISCONN; + m_freem(m); + return EISCONN; } /* XXX */ bzero(&tmp, sizeof(tmp)); @@ -2948,8 +3057,8 @@ icmp6_dgram_send(struct socket *so, __unused int flags, struct mbuf *m, struct s dst = &tmp; } else { if 
(nam == NULL) { - m_freem(m); - return ENOTCONN; + m_freem(m); + return ENOTCONN; } tmp = *(struct sockaddr_in6 *)nam; dst = &tmp; @@ -2988,7 +3097,7 @@ icmp6_dgram_send(struct socket *so, __unused int flags, struct mbuf *m, struct s } #endif - return rip6_output(m, so, (struct sockaddr_in6 *) nam, control); + return rip6_output(m, so, (struct sockaddr_in6 *) nam, control, 0); bad: m_freem(m); return error; @@ -3124,4 +3233,3 @@ icmp6_ratelimit( return ret; } - diff --git a/bsd/netinet6/in6.c b/bsd/netinet6/in6.c index a9fd82b98..f11a99041 100644 --- a/bsd/netinet6/in6.c +++ b/bsd/netinet6/in6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2009 Apple Inc. All rights reserved. + * Copyright (c) 2003-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -106,6 +106,8 @@ #include #include #include +#include +#include #include #include @@ -122,11 +124,10 @@ #include #include #include -#ifndef SCOPEDROUTING #include #include #include -#endif +#include #include #include @@ -135,9 +136,8 @@ #include #include #include -#ifndef SCOPEDROUTING +#include #include -#endif #include @@ -145,9 +145,6 @@ #include #endif /* PF */ -#ifndef __APPLE__ -MALLOC_DEFINE(M_IPMADDR, "in6_multi", "internet multicast address"); -#endif /* * Definitions of some costant IP6 addresses. 
*/ @@ -159,8 +156,12 @@ const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT; const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; +const struct in6_addr in6addr_linklocal_allv2routers = + IN6ADDR_LINKLOCAL_ALLV2ROUTERS_INIT; const struct in6_addr in6mask0 = IN6MASK0; +const struct in6_addr in6mask7 = IN6MASK7; +const struct in6_addr in6mask16 = IN6MASK16; const struct in6_addr in6mask32 = IN6MASK32; const struct in6_addr in6mask64 = IN6MASK64; const struct in6_addr in6mask96 = IN6MASK96; @@ -173,36 +174,53 @@ static int in6_lifaddr_ioctl(struct socket *, u_long, caddr_t, struct ifnet *, struct proc *); static int in6_ifinit(struct ifnet *, struct in6_ifaddr *, struct sockaddr_in6 *, int); -static void in6_unlink_ifa(struct in6_ifaddr *, struct ifnet *, int); +static void in6_unlink_ifa(struct in6_ifaddr *, struct ifnet *); static struct in6_ifaddr *in6_ifaddr_alloc(int); +static void in6_ifaddr_attached(struct ifaddr *); +static void in6_ifaddr_detached(struct ifaddr *); static void in6_ifaddr_free(struct ifaddr *); static void in6_ifaddr_trace(struct ifaddr *, int); static struct in6_aliasreq *in6_aliasreq_to_native(void *, int, struct in6_aliasreq *); -struct in6_multihead in6_multihead; /* XXX BSS initialization */ extern lck_mtx_t *nd6_mutex; -extern lck_mtx_t *ip6_mutex; extern int in6_init2done; +#define IN6IFA_TRACE_HIST_SIZE 32 /* size of trace history */ + +/* For gdb */ +__private_extern__ unsigned int in6ifa_trace_hist_size = IN6IFA_TRACE_HIST_SIZE; + struct in6_ifaddr_dbg { struct in6_ifaddr in6ifa; /* in6_ifaddr */ struct in6_ifaddr in6ifa_old; /* saved in6_ifaddr */ - u_int16_t in6ifa_refhold_cnt; /* # of ifaref */ - u_int16_t in6ifa_refrele_cnt; /* # of ifafree */ + u_int16_t in6ifa_refhold_cnt; /* # of IFA_ADDREF */ + u_int16_t in6ifa_refrele_cnt; /* # of IFA_REMREF */ /* * Alloc and free callers. 
*/ ctrace_t in6ifa_alloc; ctrace_t in6ifa_free; /* - * Circular lists of ifaref and ifafree callers. + * Circular lists of IFA_ADDREF and IFA_REMREF callers. + */ + ctrace_t in6ifa_refhold[IN6IFA_TRACE_HIST_SIZE]; + ctrace_t in6ifa_refrele[IN6IFA_TRACE_HIST_SIZE]; + /* + * Trash list linkage */ - ctrace_t in6ifa_refhold[CTRACE_HIST_SIZE]; - ctrace_t in6ifa_refrele[CTRACE_HIST_SIZE]; + TAILQ_ENTRY(in6_ifaddr_dbg) in6ifa_trash_link; }; -static unsigned int in6ifa_debug; /* debug flags */ +/* List of trash in6_ifaddr entries protected by in6ifa_trash_lock */ +static TAILQ_HEAD(, in6_ifaddr_dbg) in6ifa_trash_head; +static decl_lck_mtx_data(, in6ifa_trash_lock); + +#if DEBUG +static unsigned int in6ifa_debug = 1; /* debugging (enabled) */ +#else +static unsigned int in6ifa_debug; /* debugging (disabled) */ +#endif /* !DEBUG */ static unsigned int in6ifa_size; /* size of zone element */ static struct zone *in6ifa_zone; /* zone for in6_ifaddr */ @@ -232,6 +250,7 @@ in6_ifloop_request(int cmd, struct ifaddr *ifa) * would be happy. Note that we assume the caller of the function * (probably implicitly) set nd6_rtrequest() to ifa->ifa_rtrequest, * which changes the outgoing interface to the loopback interface. + * ifa_addr for INET6 is set once during init; no need to hold lock. */ lck_mtx_lock(rnh_lock); e = rtrequest_locked(cmd, ifa->ifa_addr, ifa->ifa_addr, @@ -290,7 +309,10 @@ in6_ifaddloop(struct ifaddr *ifa) { struct rtentry *rt; - /* If there is no loopback entry, allocate one. */ + /* + * If there is no loopback entry, allocate one. ifa_addr for + * INET6 is set once during init; no need to hold lock. + */ rt = rtalloc1(ifa->ifa_addr, 0, 0); if (rt != NULL) RT_LOCK(rt); @@ -312,7 +334,7 @@ in6_ifaddloop(struct ifaddr *ifa) * if it exists. 
*/ static void -in6_ifremloop(struct ifaddr *ifa, int locked) +in6_ifremloop(struct ifaddr *ifa) { struct in6_ifaddr *ia; struct rtentry *rt; @@ -334,26 +356,29 @@ in6_ifremloop(struct ifaddr *ifa, int locked) * (probably p2p) interfaces. * XXX: we should avoid such a configuration in IPv6... */ - if (!locked) - lck_mtx_lock(nd6_mutex); + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); for (ia = in6_ifaddrs; ia; ia = ia->ia_next) { + IFA_LOCK(&ia->ia_ifa); if (IN6_ARE_ADDR_EQUAL(IFA_IN6(ifa), &ia->ia_addr.sin6_addr)) { ia_count++; - if (ia_count > 1) + if (ia_count > 1) { + IFA_UNLOCK(&ia->ia_ifa); break; + } } + IFA_UNLOCK(&ia->ia_ifa); } - if (!locked) - lck_mtx_unlock(nd6_mutex); + lck_rw_done(&in6_ifaddr_rwlock); if (ia_count == 1) { /* * Before deleting, check if a corresponding loopbacked host * route surely exists. With this check, we can avoid to * delete an interface direct route whose destination is same - * as the address being removed. This can happen when remofing + * as the address being removed. This can happen when removing * a subnet-router anycast address on an interface attahced - * to a shared medium. + * to a shared medium. ifa_addr for INET6 is set once during + * init; no need to hold lock. 
*/ rt = rtalloc1(ifa->ifa_addr, 0, 0); if (rt != NULL) { @@ -370,43 +395,6 @@ in6_ifremloop(struct ifaddr *ifa, int locked) } } -#if 0 -/* Not used */ -int -in6_ifindex2scopeid(idx) - int idx; -{ - struct ifnet *ifp; - struct ifaddr *ifa; - struct sockaddr_in6 *sin6; - - ifnet_head_lock_shared(); - if (idx <= 0 || if_index < idx) { - ifnet_head_done(); - return -1; - } - - ifp = ifindex2ifnet[idx]; - ifnet_head_done(); - - ifnet_lock_shared(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) - { - if (ifa->ifa_addr->sa_family != AF_INET6) - continue; - sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; - if (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr)) { - int scopeid = sin6->sin6_scope_id & 0xffff; - ifnet_lock_done(ifp); - return scopeid; - } - } - ifnet_lock_done(ifp); - - return -1; -} -#endif - int in6_mask2len(mask, lim0) @@ -416,8 +404,8 @@ in6_mask2len(mask, lim0) int x = 0, y; u_char *lim = lim0, *p; - if (lim0 == NULL || - lim0 - (u_char *)mask > sizeof(*mask)) /* ignore the scope_id part */ + /* ignore the scope_id part */ + if (lim0 == NULL || lim0 - (u_char *)mask > sizeof(*mask)) lim = (u_char *)mask + sizeof(*mask); for (p = (u_char *)mask; p < lim; x++, p++) { if (*p != 0xff) @@ -437,12 +425,12 @@ in6_mask2len(mask, lim0) */ if (p < lim) { if (y != 0 && (*p & (0x00ff >> y)) != 0) - return(-1); + return (-1); for (p = p + 1; p < lim; p++) if (*p != 0) - return(-1); + return (-1); } - + return x * 8 + y; } @@ -536,6 +524,25 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, return (mrt6_ioctl(cmd, data)); } #endif + + switch(cmd) { + case SIOCAADDRCTL_POLICY: + case SIOCDADDRCTL_POLICY: + if (!privileged) + return (EPERM); + return (in6_src_ioctl(cmd, data)); + } + + switch (cmd) { + case SIOCDRADD_IN6_32: + case SIOCDRADD_IN6_64: + case SIOCDRDEL_IN6_32: + case SIOCDRDEL_IN6_64: + if (!privileged) + return (EPERM); + return (defrtrlist_ioctl(cmd, data)); + } + if (ifp == NULL) return (EOPNOTSUPP); @@ -612,7 +619,16 @@ 
in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, } /* - * Point ifra and sa6 to the right places depending on the command. + * Find address for this interface, if it exists. + * + * In netinet code, we have checked ifra_addr in SIOCSIF*ADDR operation + * only, and used the first interface address as the target of other + * operations (without checking ifra_addr). This was because netinet + * code/API assumed at most 1 interface address per interface. + * Since IPv6 allows a node to assign multiple addresses + * on a single interface, we almost always look and check the + * presence of ifra_addr, and reject invalid ones here. + * It also decreases duplicated code among SIOC*_IN6 operations. */ switch (cmd) { case SIOCLL_START_32: @@ -643,6 +659,9 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: case SIOCGIFAFLAG_IN6: + case SIOCSNDFLUSH_IN6: + case SIOCSPFXFLUSH_IN6: + case SIOCSRTRFLUSH_IN6: case SIOCGIFALIFETIME_IN6: case SIOCSIFALIFETIME_IN6: case SIOCGIFSTAT_IN6: @@ -665,25 +684,39 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, /* NOTREACHED */ case SIOCAUTOCONF_STOP: { - struct in6_ifaddr *nia = NULL; - ifnet_lock_exclusive(ifp); ifp->if_eflags &= ~IFEF_ACCEPT_RTADVD; ifnet_lock_done(ifp); - /* nuke prefix list. 
this may try to remove some ifaddrs as well */ - in6_purgeprefix(ifp); - - /* removed autoconfigured address from interface */ - lck_mtx_lock(nd6_mutex); - for (ia = in6_ifaddrs; ia != NULL; ia = nia) { - nia = ia->ia_next; - if (ia->ia_ifa.ifa_ifp != ifp) + /* Remove autoconfigured address from interface */ + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); + ia = in6_ifaddrs; + while (ia != NULL) { + if (ia->ia_ifa.ifa_ifp != ifp) { + ia = ia->ia_next; continue; - if (ia->ia6_flags & IN6_IFF_AUTOCONF) - in6_purgeaddr(&ia->ia_ifa, 1); + } + IFA_LOCK(&ia->ia_ifa); + if (ia->ia6_flags & IN6_IFF_AUTOCONF) { + IFA_ADDREF_LOCKED(&ia->ia_ifa); /* for us */ + IFA_UNLOCK(&ia->ia_ifa); + lck_rw_done(&in6_ifaddr_rwlock); + in6_purgeaddr(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); /* for us */ + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); + /* + * Purging the address caused in6_ifaddr_rwlock + * to be dropped and reacquired; + * therefore search again from the beginning + * of in6_ifaddrs list. + */ + ia = in6_ifaddrs; + continue; + } + IFA_UNLOCK(&ia->ia_ifa); + ia = ia->ia_next; } - lck_mtx_unlock(nd6_mutex); + lck_rw_done(&in6_ifaddr_rwlock); return (0); } @@ -694,7 +727,7 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, * be done here. They are currently done in in6_ifattach() * for the interfaces that need it. 
*/ - if (((ifp->if_type == IFT_PPP) || ((ifp->if_eflags & IFEF_NOAUTOIPV6LL) != 0)) && + if ((ifp->if_eflags & IFEF_NOAUTOIPV6LL) != 0 && ifra->ifra_addr.sin6_family == AF_INET6 && ifra->ifra_dstaddr.sin6_family == AF_INET6) { /* some interfaces may provide LinkLocal addresses */ @@ -706,28 +739,41 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, /* NOTREACHED */ case SIOCLL_STOP: { - struct in6_ifaddr *nia = NULL; - - /* removed link local addresses from interface */ - - lck_mtx_lock(nd6_mutex); - for (ia = in6_ifaddrs; ia != NULL; ia = nia) { - nia = ia->ia_next; - if (ia->ia_ifa.ifa_ifp != ifp) + /* Remove link local addresses from interface */ + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); + ia = in6_ifaddrs; + while (ia != NULL) { + if (ia->ia_ifa.ifa_ifp != ifp) { + ia = ia->ia_next; + continue; + } + IFA_LOCK(&ia->ia_ifa); + if (IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr)) { + IFA_ADDREF_LOCKED(&ia->ia_ifa); /* for us */ + IFA_UNLOCK(&ia->ia_ifa); + lck_rw_done(&in6_ifaddr_rwlock); + in6_purgeaddr(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); /* for us */ + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); + /* + * Purging the address caused in6_ifaddr_rwlock + * to be dropped and reacquired; + * therefore search again from the beginning + * of in6_ifaddrs list. 
+ */ + ia = in6_ifaddrs; continue; - if (IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr)) - in6_purgeaddr(&ia->ia_ifa, 1); + } + IFA_UNLOCK(&ia->ia_ifa); + ia = ia->ia_next; } - lck_mtx_unlock(nd6_mutex); + lck_rw_done(&in6_ifaddr_rwlock); return (0); } case SIOCPROTOATTACH_IN6_32: case SIOCPROTOATTACH_IN6_64: - if ((error = proto_plumb(PF_INET6, ifp))) - printf("SIOCPROTOATTACH_IN6: %s " - "error=%d\n", if_name(ifp), error); - return (error); + return (in6_domifattach(ifp)); /* NOTREACHED */ case SIOCPROTODETACH_IN6: @@ -772,8 +818,7 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, case SIOCSIFNETMASK_IN6: /* * Since IPv6 allows a node to assign multiple addresses - * on a single interface, SIOCSIFxxx ioctls are not suitable - * and should be unused. + * on a single interface, SIOCSIFxxx ioctls are deprecated. */ /* we decided to obsolete this command (20000704) */ error = EINVAL; @@ -782,10 +827,10 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, case SIOCDIFADDR_IN6: /* * for IPv4, we look for existing in_ifaddr here to allow - * "ifconfig if0 delete" to remove first IPv4 address on the - * interface. For IPv6, as the spec allow multiple interface - * address from the day one, we consider "remove the first one" - * semantics to be not preferable. + * "ifconfig if0 delete" to remove the first IPv4 address on + * the interface. For IPv6, as the spec allows multiple + * interface address from the day one, we consider "remove the + * first one" semantics to be not preferable. 
*/ if (ia == NULL) { error = EADDRNOTAVAIL; @@ -840,13 +885,17 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, lt = (struct in6_addrlifetime_64 *) &ifr->ifr_ifru.ifru_lifetime; - if (lt->ia6t_vltime != ND6_INFINITE_LIFETIME - && lt->ia6t_vltime + timenow.tv_sec < timenow.tv_sec) { + if (((ia->ia6_flags & IN6_IFF_TEMPORARY) != 0 + || lt->ia6t_vltime != ND6_INFINITE_LIFETIME) + && lt->ia6t_vltime + timenow.tv_sec < + timenow.tv_sec) { error = EINVAL; goto ioctl_cleanup; } - if (lt->ia6t_pltime != ND6_INFINITE_LIFETIME - && lt->ia6t_pltime + timenow.tv_sec < timenow.tv_sec) { + if (((ia->ia6_flags & IN6_IFF_TEMPORARY) != 0 + || lt->ia6t_pltime != ND6_INFINITE_LIFETIME) + && lt->ia6t_pltime + timenow.tv_sec < + timenow.tv_sec) { error = EINVAL; goto ioctl_cleanup; } @@ -855,13 +904,17 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, lt = (struct in6_addrlifetime_32 *) &ifr->ifr_ifru.ifru_lifetime; - if (lt->ia6t_vltime != ND6_INFINITE_LIFETIME - && lt->ia6t_vltime + timenow.tv_sec < timenow.tv_sec) { + if (((ia->ia6_flags & IN6_IFF_TEMPORARY) != 0 + || lt->ia6t_vltime != ND6_INFINITE_LIFETIME) + && lt->ia6t_vltime + timenow.tv_sec < + timenow.tv_sec) { error = EINVAL; goto ioctl_cleanup; } - if (lt->ia6t_pltime != ND6_INFINITE_LIFETIME - && lt->ia6t_pltime + timenow.tv_sec < timenow.tv_sec) { + if (((ia->ia6_flags & IN6_IFF_TEMPORARY) != 0 + || lt->ia6t_pltime != ND6_INFINITE_LIFETIME) + && lt->ia6t_pltime + timenow.tv_sec < + timenow.tv_sec) { error = EINVAL; goto ioctl_cleanup; } @@ -870,8 +923,15 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, } switch (cmd) { + case SIOCGIFADDR_IN6: + IFA_LOCK(&ia->ia_ifa); ifr->ifr_addr = ia->ia_addr; + IFA_UNLOCK(&ia->ia_ifa); + if ((error = sa6_recoverscope(&ifr->ifr_addr)) != 0) { + IFA_REMREF(&ia->ia_ifa); + return (error); + } break; case SIOCGIFDSTADDR_IN6: @@ -883,15 +943,25 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, 
struct ifnet *ifp, * XXX: should we check if ifa_dstaddr is NULL and return * an error? */ + IFA_LOCK(&ia->ia_ifa); ifr->ifr_dstaddr = ia->ia_dstaddr; + IFA_UNLOCK(&ia->ia_ifa); + if ((error = sa6_recoverscope(&ifr->ifr_dstaddr)) != 0) { + IFA_REMREF(&ia->ia_ifa); + return (error); + } break; case SIOCGIFNETMASK_IN6: + IFA_LOCK(&ia->ia_ifa); ifr->ifr_addr = ia->ia_prefixmask; + IFA_UNLOCK(&ia->ia_ifa); break; case SIOCGIFAFLAG_IN6: + IFA_LOCK(&ia->ia_ifa); ifr->ifr_ifru.ifru_flags6 = ia->ia6_flags; + IFA_UNLOCK(&ia->ia_ifa); break; case SIOCGIFSTAT_IN6: @@ -900,7 +970,7 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, goto ioctl_cleanup; } index = ifp->if_index; - lck_mtx_lock(ip6_mutex); + lck_rw_lock_shared(&in6_ifs_rwlock); if (in6_ifstat == NULL || index >= in6_ifstatmax || in6_ifstat[index] == NULL) { /* return EAFNOSUPPORT? */ @@ -909,7 +979,7 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, } else { ifr->ifr_ifru.ifru_stat = *in6_ifstat[index]; } - lck_mtx_unlock(ip6_mutex); + lck_rw_done(&in6_ifs_rwlock); break; case SIOCGIFSTAT_ICMP6: @@ -918,7 +988,7 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, goto ioctl_cleanup; } index = ifp->if_index; - lck_mtx_lock(ip6_mutex); + lck_rw_lock_shared(&icmp6_ifs_rwlock); if (icmp6_ifstat == NULL || index >= icmp6_ifstatmax || icmp6_ifstat[index] == NULL) { /* return EAFNOSUPPORT? 
*/ @@ -927,10 +997,11 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, } else { ifr->ifr_ifru.ifru_icmp6stat = *icmp6_ifstat[index]; } - lck_mtx_unlock(ip6_mutex); + lck_rw_done(&icmp6_ifs_rwlock); break; case SIOCGIFALIFETIME_IN6: + IFA_LOCK(&ia->ia_ifa); if (p64) { struct in6_addrlifetime_64 *lt; @@ -954,9 +1025,11 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, lt->ia6t_pltime = (uint32_t)ia->ia6_lifetime.ia6t_pltime; } + IFA_UNLOCK(&ia->ia_ifa); break; case SIOCSIFALIFETIME_IN6: + IFA_LOCK(&ia->ia_ifa); if (p64) { struct in6_addrlifetime_64 *lt; @@ -979,16 +1052,19 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, ia->ia6_lifetime.ia6t_pltime = lt->ia6t_pltime; } /* for sanity */ - if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { + if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME || + (ia->ia6_flags & IN6_IFF_TEMPORARY) != 0) { ia->ia6_lifetime.ia6t_expire = timenow.tv_sec + ia->ia6_lifetime.ia6t_vltime; } else ia->ia6_lifetime.ia6t_expire = 0; - if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { + if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME || + (ia->ia6_flags & IN6_IFF_TEMPORARY) != 0) { ia->ia6_lifetime.ia6t_preferred = timenow.tv_sec + ia->ia6_lifetime.ia6t_pltime; } else ia->ia6_lifetime.ia6t_preferred = 0; + IFA_UNLOCK(&ia->ia_ifa); break; case SIOCAIFADDR_IN6_32: @@ -996,17 +1072,13 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, int i; struct nd_prefix pr0, *pr; - /* Attempt to attache the protocol, in case it isn't attached */ - error = proto_plumb(PF_INET6, ifp); + /* Attempt to attach the protocol, in case it isn't attached */ + error = in6_domifattach(ifp); if (error) { - if (error != EEXIST) { - printf("SIOCAIFADDR_IN6: %s can't plumb " - "protocol error=%d\n", if_name(ifp), error); + if (error == EEXIST) + error = 0; + else goto ioctl_cleanup; - } - - /* Ignore, EEXIST */ - error = 0; 
} else { /* PF_INET6 wasn't previously attached */ if ((error = in6_if_up(ifp, NULL)) != 0) @@ -1017,7 +1089,7 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, * first, make or update the interface address structure, * and link it to the list. */ - if ((error = in6_update_ifa(ifp, ifra, ia, M_WAITOK)) != 0) + if ((error = in6_update_ifa(ifp, ifra, ia, 0, M_WAITOK)) != 0) goto ioctl_cleanup; /* @@ -1056,6 +1128,7 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, ((ifra->ifra_flags & IN6_IFF_AUTOCONF) != 0); pr0.ndpr_vltime = ifra->ifra_lifetime.ia6t_vltime; pr0.ndpr_pltime = ifra->ifra_lifetime.ia6t_pltime; + pr0.ndpr_stateflags |= NDPRF_STATIC; /* add the prefix if there's one. */ if ((pr = nd6_prefix_lookup(&pr0)) == NULL) { @@ -1063,7 +1136,7 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, * nd6_prelist_add will install the corresponding * interface route. */ - if ((error = nd6_prelist_add(&pr0, NULL, &pr)) != 0) + if ((error = nd6_prelist_add(&pr0, NULL, &pr, FALSE)) != 0) goto ioctl_cleanup; if (pr == NULL) { log(LOG_ERR, "nd6_prelist_add succedded but " @@ -1073,19 +1146,21 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, } } if (ia != NULL) - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); if ((ia = in6ifa_ifpwithaddr(ifp, &ifra->ifra_addr.sin6_addr)) == NULL) { /* XXX: this should not happen! 
*/ log(LOG_ERR, "in6_control: addition succeeded, but" " no ifaddr\n"); } else { + IFA_LOCK(&ia->ia_ifa); if ((ia->ia6_flags & IN6_IFF_AUTOCONF) != 0 && ia->ia6_ndpr == NULL) { /* new autoconfed addr */ - lck_mtx_lock(nd6_mutex); - pr->ndpr_refcnt++; - lck_mtx_unlock(nd6_mutex); + NDPR_LOCK(pr); + pr->ndpr_addrcnt++; + VERIFY(pr->ndpr_addrcnt != 0); ia->ia6_ndpr = pr; + NDPR_ADDREF_LOCKED(pr); /* for addr reference */ /* * If this is the first autoconf address from @@ -1093,8 +1168,12 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, * as well (when specified). */ if (ip6_use_tempaddr && - pr->ndpr_refcnt == 1) { + pr->ndpr_addrcnt == 1) { int e; + + NDPR_UNLOCK(pr); + IFA_UNLOCK(&ia->ia_ifa); + if ((e = in6_tmpifadd(ia, 1, M_WAITOK)) != 0) { log(LOG_NOTICE, "in6_control: " @@ -1103,19 +1182,25 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, "errno=%d\n", e); } + } else { + NDPR_UNLOCK(pr); + IFA_UNLOCK(&ia->ia_ifa); } + } else { + IFA_UNLOCK(&ia->ia_ifa); } - /* * this might affect the status of autoconfigured * addresses, that is, this address might make * other addresses detached. */ - pfxlist_onlink_check(0); + lck_mtx_lock(nd6_mutex); + pfxlist_onlink_check(); + lck_mtx_unlock(nd6_mutex); } /* Drop use count held above during lookup/add */ - ndpr_rele(pr, FALSE); + NDPR_REMREF(pr); #if PF pf_ifaddr_hook(ifp, cmd); #endif /* PF */ @@ -1129,24 +1214,29 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, /* * If the address being deleted is the only one that owns * the corresponding prefix, expire the prefix as well. - * XXX: theoretically, we don't have to warry about such + * XXX: theoretically, we don't have to worry about such * relationship, since we separate the address management * and the prefix management. We do this, however, to provide * as much backward compatibility as possible in terms of * the ioctl operation. 
+ * Note that in6_purgeaddr() will decrement ndpr_addrcnt. */ + IFA_LOCK(&ia->ia_ifa); bzero(&pr0, sizeof(pr0)); pr0.ndpr_ifp = ifp; pr0.ndpr_plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); - if (pr0.ndpr_plen == 128) + if (pr0.ndpr_plen == 128) { + IFA_UNLOCK(&ia->ia_ifa); goto purgeaddr; + } pr0.ndpr_prefix = ia->ia_addr; pr0.ndpr_mask = ia->ia_prefixmask.sin6_addr; for (i = 0; i < 4; i++) { pr0.ndpr_prefix.sin6_addr.s6_addr32[i] &= ia->ia_prefixmask.sin6_addr.s6_addr32[i]; } + IFA_UNLOCK(&ia->ia_ifa); /* * The logic of the following condition is a bit complicated. * We expire the prefix when @@ -1155,20 +1245,24 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, * 2. the address does not obey autoconf and there is no * other owner of the prefix. */ - if ((pr = nd6_prefix_lookup(&pr0)) != NULL && - (((ia->ia6_flags & IN6_IFF_AUTOCONF) != 0 && - pr->ndpr_refcnt == 1) || - ((ia->ia6_flags & IN6_IFF_AUTOCONF) == 0 && - pr->ndpr_refcnt == 0))) { - pr->ndpr_expire = 1; /* XXX: just for expiration */ - } + if ((pr = nd6_prefix_lookup(&pr0)) != NULL) { + IFA_LOCK(&ia->ia_ifa); + NDPR_LOCK(pr); + if (((ia->ia6_flags & IN6_IFF_AUTOCONF) != 0 && + pr->ndpr_addrcnt == 1) || + ((ia->ia6_flags & IN6_IFF_AUTOCONF) == 0 && + pr->ndpr_addrcnt == 0)) { + pr->ndpr_expire = 1; /* XXX: just for expiration */ + } + NDPR_UNLOCK(pr); + IFA_UNLOCK(&ia->ia_ifa); - /* Drop use count held above during lookup */ - if (pr != NULL) - ndpr_rele(pr, FALSE); + /* Drop use count held above during lookup */ + NDPR_REMREF(pr); + } purgeaddr: - in6_purgeaddr(&ia->ia_ifa, 0); + in6_purgeaddr(&ia->ia_ifa); #if PF pf_ifaddr_hook(ifp, cmd); #endif /* PF */ @@ -1181,7 +1275,7 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, } ioctl_cleanup: if (ia != NULL) - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); return (error); } @@ -1189,23 +1283,23 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, * Update 
parameters of an IPv6 interface address. * If necessary, a new entry is created and linked into address chains. * This function is separated from in6_control(). - * XXX: should this be performed under splnet()? */ int -in6_update_ifa(ifp, ifra, ia, how) - struct ifnet *ifp; - struct in6_aliasreq *ifra; - struct in6_ifaddr *ia; - int how; +in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, + struct in6_ifaddr *ia, int flags, int how) { int error = 0, hostIsNew = 0, plen = -1; struct in6_ifaddr *oia; struct sockaddr_in6 dst6; struct in6_addrlifetime *lt; + struct in6_multi *in6m_sol = NULL; + struct in6_multi_mship *imm; struct timeval timenow; + struct rtentry *rt; + struct ifaddr *ifa = NULL; + int delay; - lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_NOTOWNED); /* Validate parameters */ if (ifp == NULL || ifra == NULL) /* this maybe redundant */ return(EINVAL); @@ -1245,14 +1339,15 @@ in6_update_ifa(ifp, ifra, ia, how) (u_char *)&ifra->ifra_prefixmask + ifra->ifra_prefixmask.sin6_len); if (plen <= 0) - return(EINVAL); - } - else { + return (EINVAL); + } else { /* * In this case, ia must not be NULL. We just use its prefix * length. */ + IFA_LOCK(&ia->ia_ifa); plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); + IFA_UNLOCK(&ia->ia_ifa); } /* * If the destination address on a p2p interface is specified, @@ -1260,27 +1355,25 @@ in6_update_ifa(ifp, ifra, ia, how) * zone identifier. */ dst6 = ifra->ifra_dstaddr; - if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) && + if (((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) != 0 ) && (dst6.sin6_family == AF_INET6)) { int scopeid; -#ifndef SCOPEDROUTING if ((error = in6_recoverscope(&dst6, &ifra->ifra_dstaddr.sin6_addr, ifp)) != 0) return(error); -#endif + scopeid = in6_addr2scopeid(ifp, &dst6.sin6_addr); if (dst6.sin6_scope_id == 0) /* user omit to specify the ID. */ dst6.sin6_scope_id = scopeid; else if (dst6.sin6_scope_id != scopeid) return(EINVAL); /* scope ID mismatch. 
*/ -#ifndef SCOPEDROUTING - if ((error = in6_embedscope(&dst6.sin6_addr, &dst6, NULL, NULL)) - != 0) + + if ((error = in6_embedscope(&dst6.sin6_addr, &dst6, NULL, NULL, + NULL)) != 0) return(error); dst6.sin6_scope_id = 0; /* XXX */ -#endif } /* * The destination address can be specified only for a p2p or a @@ -1308,7 +1401,8 @@ in6_update_ifa(ifp, ifra, ia, how) getmicrotime(&timenow); lt = &ifra->ifra_lifetime; - if (lt->ia6t_vltime != ND6_INFINITE_LIFETIME + if ((lt->ia6t_vltime != ND6_INFINITE_LIFETIME + || (ifra->ifra_flags & IN6_IFF_TEMPORARY) != 0) && lt->ia6t_vltime + timenow.tv_sec < timenow.tv_sec) { return EINVAL; } @@ -1321,7 +1415,8 @@ in6_update_ifa(ifp, ifra, ia, how) "in6_update_ifa: valid lifetime is 0 for %s\n", ip6_sprintf(&ifra->ifra_addr.sin6_addr)); } - if (lt->ia6t_pltime != ND6_INFINITE_LIFETIME + if ((lt->ia6t_pltime != ND6_INFINITE_LIFETIME + || (ifra->ifra_flags & IN6_IFF_TEMPORARY) != 0) && lt->ia6t_pltime + timenow.tv_sec < timenow.tv_sec) { return EINVAL; } @@ -1340,11 +1435,15 @@ in6_update_ifa(ifp, ifra, ia, how) */ ia = in6_ifaddr_alloc(how); if (ia == NULL) - return ENOBUFS; - /* Initialize the address and masks */ + return (ENOBUFS); + ifnet_lock_exclusive(ifp); + IFA_LOCK(&ia->ia_ifa); + LIST_INIT(&ia->ia6_memberships); + /* Initialize the address and masks, and put time stamp */ ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr; ia->ia_addr.sin6_family = AF_INET6; ia->ia_addr.sin6_len = sizeof(ia->ia_addr); + ia->ia6_createtime = timenow.tv_sec; if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) != 0) { /* * XXX: some functions expect that ifa_dstaddr is not @@ -1359,21 +1458,34 @@ in6_update_ifa(ifp, ifra, ia, how) = (struct sockaddr *)&ia->ia_prefixmask; ia->ia_ifp = ifp; - ifaref(&ia->ia_ifa); - lck_mtx_lock(nd6_mutex); + /* if_attach_ifa() holds a reference for ifa_link */ + if_attach_ifa(ifp, &ia->ia_ifa); + /* hold a reference for this routine */ + IFA_ADDREF_LOCKED(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); + 
ifnet_lock_done(ifp); + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); + /* Hold a reference for in6_ifaddrs link */ + IFA_ADDREF(&ia->ia_ifa); if ((oia = in6_ifaddrs) != NULL) { for ( ; oia->ia_next; oia = oia->ia_next) continue; oia->ia_next = ia; - } else + } else { in6_ifaddrs = ia; - lck_mtx_unlock(nd6_mutex); - - ifnet_lock_exclusive(ifp); - if_attach_ifa(ifp, &ia->ia_ifa); - ifnet_lock_done(ifp); + } + lck_rw_done(&in6_ifaddr_rwlock); + } else { + /* hold a reference for this routine */ + IFA_ADDREF(&ia->ia_ifa); } + ifa = &ia->ia_ifa; + IFA_LOCK(ifa); + + /* update timestamp */ + ia->ia6_updatetime = timenow.tv_sec; + /* set prefix mask */ if (ifra->ifra_prefixmask.sin6_len) { /* @@ -1388,6 +1500,7 @@ in6_update_ifa(ifp, ifra, ia, how) " existing (%s) address should not be changed\n", ip6_sprintf(&ia->ia_addr.sin6_addr)); error = EINVAL; + IFA_UNLOCK(ifa); goto unlink; } ia->ia_prefixmask = ifra->ifra_prefixmask; @@ -1396,82 +1509,145 @@ in6_update_ifa(ifp, ifra, ia, how) /* * If a new destination address is specified, scrub the old one and * install the new destination. Note that the interface must be - * p2p or loopback (see the check above.) + * p2p or loopback (see the check above.) */ if (dst6.sin6_family == AF_INET6 && - !IN6_ARE_ADDR_EQUAL(&dst6.sin6_addr, - &ia->ia_dstaddr.sin6_addr)) { - int e; - - if ((ia->ia_flags & IFA_ROUTE) != 0 && - (e = rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST)) - != 0) { - log(LOG_ERR, "in6_update_ifa: failed to remove " - "a route to the old destination: %s\n", - ip6_sprintf(&ia->ia_addr.sin6_addr)); - /* proceed anyway... */ - } - else + !IN6_ARE_ADDR_EQUAL(&dst6.sin6_addr, &ia->ia_dstaddr.sin6_addr)) { + if ((ia->ia_flags & IFA_ROUTE)) { + int e; + + IFA_UNLOCK(ifa); + if ((e = rtinit(&(ia->ia_ifa), (int)RTM_DELETE, + RTF_HOST)) != 0) { + log(LOG_ERR, "in6_update_ifa: failed to remove " + "a route to the old destination: %s\n", + ip6_sprintf(&ia->ia_addr.sin6_addr)); + /* proceed anyway... 
*/ + } + IFA_LOCK(ifa); + } else { ia->ia_flags &= ~IFA_ROUTE; + } + IFA_LOCK_ASSERT_HELD(ifa); ia->ia_dstaddr = dst6; } + /* + * Set lifetimes. We do not refer to ia6t_expire and ia6t_preferred + * to see if the address is deprecated or invalidated, but initialize + * these members for applications. + */ + ia->ia6_lifetime = ifra->ifra_lifetime; + if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME + || (ifra->ifra_flags & IN6_IFF_TEMPORARY) != 0) { + ia->ia6_lifetime.ia6t_expire = + timenow.tv_sec + ia->ia6_lifetime.ia6t_vltime; + } else + ia->ia6_lifetime.ia6t_expire = 0; + if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME + || (ifra->ifra_flags & IN6_IFF_TEMPORARY) != 0) { + ia->ia6_lifetime.ia6t_preferred = + timenow.tv_sec + ia->ia6_lifetime.ia6t_pltime; + } else + ia->ia6_lifetime.ia6t_preferred = 0; + + IFA_UNLOCK(ifa); /* reset the interface and routing table appropriately. */ if ((error = in6_ifinit(ifp, ia, &ifra->ifra_addr, hostIsNew)) != 0) goto unlink; + IFA_LOCK(ifa); /* - * Beyond this point, we should call in6_purgeaddr upon an error, - * not just go to unlink. + * configure address flags. */ - -#if 0 /* disable this mechanism for now */ - /* update prefix list */ - if (hostIsNew && - (ifra->ifra_flags & IN6_IFF_NOPFX) == 0) { /* XXX */ - int iilen; - - iilen = (sizeof(ia->ia_prefixmask.sin6_addr) << 3) - plen; - if ((error = in6_prefix_add_ifid(iilen, ia)) != 0) { - in6_purgeaddr((struct ifaddr *)ia, 0); - return(error); - } + ia->ia6_flags = ifra->ifra_flags; + /* + * backward compatibility - if IN6_IFF_DEPRECATED is set from the + * userland, make it deprecated. + */ + if ((ifra->ifra_flags & IN6_IFF_DEPRECATED) != 0) { + ia->ia6_lifetime.ia6t_pltime = 0; + ia->ia6_lifetime.ia6t_preferred = timenow.tv_sec; } -#endif + /* + * Make the address tentative before joining multicast addresses, + * so that corresponding MLD responses would not have a tentative + * source address. 
+ */ + ia->ia6_flags &= ~IN6_IFF_DUPLICATED; /* safety */ + if (hostIsNew && in6if_do_dad(ifp)) + ia->ia6_flags |= IN6_IFF_TENTATIVE; + /* + * We are done if we have simply modified an existing address. + */ + if (!hostIsNew) { + IFA_UNLOCK(ifa); + /* release reference held for this routine */ + IFA_REMREF(ifa); + return (error); + } + /* + * Beyond this point, we should call in6_purgeaddr upon an error, + * not just go to unlink. + */ + IFA_LOCK_ASSERT_HELD(ifa); + /* Join necessary multicast groups */ if ((ifp->if_flags & IFF_MULTICAST) != 0) { struct sockaddr_in6 mltaddr, mltmask; - struct in6_multi *in6m; + struct in6_addr llsol; - if (hostIsNew) { + IFA_UNLOCK(ifa); + /* join solicited multicast addr for new host id */ + bzero(&llsol, sizeof(struct in6_addr)); + llsol.s6_addr32[0] = IPV6_ADDR_INT32_MLL; + llsol.s6_addr32[1] = 0; + llsol.s6_addr32[2] = htonl(1); + llsol.s6_addr32[3] = ifra->ifra_addr.sin6_addr.s6_addr32[3]; + llsol.s6_addr8[12] = 0xff; + if ((error = in6_setscope(&llsol, ifp, NULL)) != 0) { + /* XXX: should not happen */ + log(LOG_ERR, "in6_update_ifa: " + "in6_setscope failed\n"); + goto cleanup; + } + delay = 0; + if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* - * join solicited multicast addr for new host id + * We need a random delay for DAD on the address + * being configured. It also means delaying + * transmission of the corresponding MLD report to + * avoid report collision. 
+ * [draft-ietf-ipv6-rfc2462bis-02.txt] */ - struct in6_addr llsol; - bzero(&llsol, sizeof(struct in6_addr)); - llsol.s6_addr16[0] = htons(0xff02); - llsol.s6_addr16[1] = htons(ifp->if_index); - llsol.s6_addr32[1] = 0; - llsol.s6_addr32[2] = htonl(1); - llsol.s6_addr32[3] = - ifra->ifra_addr.sin6_addr.s6_addr32[3]; - llsol.s6_addr8[12] = 0xff; - (void)in6_addmulti(&llsol, ifp, &error, 0); - if (error != 0) { - log(LOG_WARNING, - "in6_update_ifa: addmulti failed for " - "%s on %s (errno=%d)\n", - ip6_sprintf(&llsol), if_name(ifp), - error); - in6_purgeaddr((struct ifaddr *)ia, 0); - return(error); - } + delay = random() % + (MAX_RTR_SOLICITATION_DELAY * PR_SLOWHZ); + } + imm = in6_joingroup(ifp, &llsol, &error, delay); + if (imm == NULL) { + nd6log((LOG_WARNING, + "in6_update_ifa: addmulti failed for " + "%s on %s (errno=%d)\n", + ip6_sprintf(&llsol), if_name(ifp), + error)); + in6_purgeaddr((struct ifaddr *)ia); + /* release reference held for this routine */ + IFA_REMREF(ifa); + return (error); } + in6m_sol = imm->i6mm_maddr; + /* take a refcount for this routine */ + IN6M_ADDREF(in6m_sol); + + IFA_LOCK_SPIN(ifa); + LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); + IFA_UNLOCK(ifa); bzero(&mltmask, sizeof(mltmask)); mltmask.sin6_len = sizeof(struct sockaddr_in6); mltmask.sin6_family = AF_INET6; mltmask.sin6_addr = in6mask32; +#define MLTMASK_LEN 4 /* mltmask's masklen (=32bit=4octet) */ /* * join link-local all-nodes address @@ -1480,111 +1656,134 @@ in6_update_ifa(ifp, ifra, ia, how) mltaddr.sin6_len = sizeof(struct sockaddr_in6); mltaddr.sin6_family = AF_INET6; mltaddr.sin6_addr = in6addr_linklocal_allnodes; - mltaddr.sin6_addr.s6_addr16[1] = htons(ifp->if_index); + if ((error = in6_setscope(&mltaddr.sin6_addr, ifp, NULL)) != + 0) + goto cleanup; /* XXX: should not fail */ - ifnet_lock_shared(ifp); - IN6_LOOKUP_MULTI(mltaddr.sin6_addr, ifp, in6m); - ifnet_lock_done(ifp); - if (in6m == NULL) { - rtrequest(RTM_ADD, - (struct sockaddr *)&mltaddr, - (struct 
sockaddr *)&ia->ia_addr, - (struct sockaddr *)&mltmask, - RTF_UP|RTF_CLONING, /* xxx */ - (struct rtentry **)0); - (void)in6_addmulti(&mltaddr.sin6_addr, ifp, &error, 0); - if (error != 0) { - log(LOG_WARNING, - "in6_update_ifa: addmulti failed for " - "%s on %s (errno=%d)\n", - ip6_sprintf(&mltaddr.sin6_addr), - if_name(ifp), error); + /* + * XXX: do we really need this automatic routes? + * We should probably reconsider this stuff. Most applications + * actually do not need the routes, since they usually specify + * the outgoing interface. + */ + rt = rtalloc1_scoped((struct sockaddr *)&mltaddr, 0, 0UL, + ia->ia_ifp->if_index); + if (rt) { + if (memcmp(&mltaddr.sin6_addr, + &((struct sockaddr_in6 *)rt_key(rt))->sin6_addr, + MLTMASK_LEN)) { + rtfree(rt); + rt = NULL; } } + if (!rt) { + error = rtrequest_scoped(RTM_ADD, + (struct sockaddr *)&mltaddr, + (struct sockaddr *)&ia->ia_addr, + (struct sockaddr *)&mltmask, RTF_UP | RTF_CLONING, + NULL, ia->ia_ifp->if_index); + if (error) + goto cleanup; + } else { + rtfree(rt); + } + + imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error, 0); + if (!imm) { + nd6log((LOG_WARNING, + "in6_update_ifa: addmulti failed for " + "%s on %s (errno=%d)\n", + ip6_sprintf(&mltaddr.sin6_addr), + if_name(ifp), error)); + goto cleanup; + } + IFA_LOCK_SPIN(ifa); + LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); + IFA_UNLOCK(ifa); /* * join node information group address */ #define hostnamelen strlen(hostname) + delay = 0; + if ((flags & IN6_IFAUPDATE_DADDELAY)) { + /* + * The spec doesn't say anything about delay for this + * group, but the same logic should apply. 
+ */ + delay = random() % + (MAX_RTR_SOLICITATION_DELAY * PR_SLOWHZ); + } if (in6_nigroup(ifp, hostname, hostnamelen, &mltaddr.sin6_addr) == 0) { - ifnet_lock_shared(ifp); - IN6_LOOKUP_MULTI(mltaddr.sin6_addr, ifp, in6m); - ifnet_lock_done(ifp); - if (in6m == NULL && ia != NULL) { - (void)in6_addmulti(&mltaddr.sin6_addr, - ifp, &error, 0); - if (error != 0) { - log(LOG_WARNING, "in6_update_ifa: " - "addmulti failed for " - "%s on %s (errno=%d)\n", - ip6_sprintf(&mltaddr.sin6_addr), - if_name(ifp), error); - } + imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error, + delay); /* XXX jinmei */ + if (!imm) { + nd6log((LOG_WARNING, "in6_update_ifa: " + "addmulti failed for %s on %s " + "(errno=%d)\n", + ip6_sprintf(&mltaddr.sin6_addr), + if_name(ifp), error)); + /* XXX not very fatal, go on... */ + } else { + IFA_LOCK_SPIN(ifa); + LIST_INSERT_HEAD(&ia->ia6_memberships, + imm, i6mm_chain); + IFA_UNLOCK(ifa); } } #undef hostnamelen /* - * join node-local all-nodes address, on loopback. - * XXX: since "node-local" is obsoleted by interface-local, - * we have to join the group on every interface with - * some interface-boundary restriction. + * join interface-local all-nodes address. 
+ * (ff01::1%ifN, and ff01::%ifN/32) */ - if (ifp->if_flags & IFF_LOOPBACK) { - struct in6_ifaddr *ia_loop; - - struct in6_addr loop6 = in6addr_loopback; - ia_loop = in6ifa_ifpwithaddr(ifp, &loop6); - - mltaddr.sin6_addr = in6addr_nodelocal_allnodes; - - ifnet_lock_shared(ifp); - IN6_LOOKUP_MULTI(mltaddr.sin6_addr, ifp, in6m); - ifnet_lock_done(ifp); - if (in6m == NULL && ia_loop != NULL) { - rtrequest(RTM_ADD, - (struct sockaddr *)&mltaddr, - (struct sockaddr *)&ia_loop->ia_addr, - (struct sockaddr *)&mltmask, - RTF_UP, - (struct rtentry **)0); - (void)in6_addmulti(&mltaddr.sin6_addr, ifp, - &error, 0); - if (error != 0) { - log(LOG_WARNING, "in6_update_ifa: " - "addmulti failed for %s on %s " - "(errno=%d)\n", - ip6_sprintf(&mltaddr.sin6_addr), - if_name(ifp), error); - } + mltaddr.sin6_addr = in6addr_nodelocal_allnodes; + if ((error = in6_setscope(&mltaddr.sin6_addr, ifp, NULL)) + != 0) + goto cleanup; /* XXX: should not fail */ + /* XXX: again, do we really need the route? */ + rt = rtalloc1_scoped((struct sockaddr *)&mltaddr, 0, 0UL, + ia->ia_ifp->if_index); + if (rt) { + if (memcmp(&mltaddr.sin6_addr, + &((struct sockaddr_in6 *)rt_key(rt))->sin6_addr, + MLTMASK_LEN)) { + rtfree(rt); + rt = NULL; } - if (ia_loop != NULL) - ifafree(&ia_loop->ia_ifa); } + if (!rt) { + error = rtrequest_scoped(RTM_ADD, + (struct sockaddr *)&mltaddr, + (struct sockaddr *)&ia->ia_addr, + (struct sockaddr *)&mltmask, RTF_UP | RTF_CLONING, + NULL, ia->ia_ifp->if_index); + if (error) + goto cleanup; + } else + rtfree(rt); + + imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error, 0); + if (!imm) { + nd6log((LOG_WARNING, "in6_update_ifa: " + "addmulti failed for %s on %s " + "(errno=%d)\n", + ip6_sprintf(&mltaddr.sin6_addr), + if_name(ifp), error)); + goto cleanup; + } + IFA_LOCK(ifa); + LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); + /* keep it locked */ +#undef MLTMASK_LEN } - - ia->ia6_flags = ifra->ifra_flags; - ia->ia6_flags &= ~IN6_IFF_DUPLICATED; /*safety*/ - 
ia->ia6_flags &= ~IN6_IFF_NODAD; /* Mobile IPv6 */ - - ia->ia6_lifetime = ifra->ifra_lifetime; - /* for sanity */ - if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { - ia->ia6_lifetime.ia6t_expire = - timenow.tv_sec + ia->ia6_lifetime.ia6t_vltime; - } else - ia->ia6_lifetime.ia6t_expire = 0; - if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { - ia->ia6_lifetime.ia6t_preferred = - timenow.tv_sec + ia->ia6_lifetime.ia6t_pltime; - } else - ia->ia6_lifetime.ia6t_preferred = 0; - + IFA_LOCK_ASSERT_HELD(ifa); /* - * make sure to initialize ND6 information. this is to workaround + * Make sure to initialize ND6 information. this is to workaround * issues with interfaces with IPv6 addresses, which have never brought * up. We are assuming that it is safe to nd6_ifattach multiple times. + * NOTE: this is how stf0 gets initialized */ if ((error = nd6_ifattach(ifp)) != 0) return error; @@ -1594,29 +1793,74 @@ in6_update_ifa(ifp, ifra, ia, how) * XXX It may be of use, if we can administratively * disable DAD. */ - if (in6if_do_dad(ifp) && (ifra->ifra_flags & IN6_IFF_NODAD) == 0) { - ia->ia6_flags |= IN6_IFF_TENTATIVE; - nd6_dad_start((struct ifaddr *)ia, NULL); - } + if (hostIsNew && in6if_do_dad(ifp) && + ((ifra->ifra_flags & IN6_IFF_NODAD) == 0) && + (ia->ia6_flags & IN6_IFF_TENTATIVE)) + { + int mindelay, maxdelay; - return(error); + IFA_UNLOCK(ifa); + delay = 0; + if ((flags & IN6_IFAUPDATE_DADDELAY)) { + /* + * We need to impose a delay before sending an NS + * for DAD. Check if we also needed a delay for the + * corresponding MLD message. If we did, the delay + * should be larger than the MLD delay (this could be + * relaxed a bit, but this simple logic is at least + * safe). 
+ */ + mindelay = 0; + if (in6m_sol != NULL) { + IN6M_LOCK(in6m_sol); + if (in6m_sol->in6m_state == MLD_REPORTING_MEMBER) + mindelay = in6m_sol->in6m_timer; + IN6M_UNLOCK(in6m_sol); + } + maxdelay = MAX_RTR_SOLICITATION_DELAY * hz; + if (maxdelay - mindelay == 0) + delay = 0; + else { + delay = + (random() % (maxdelay - mindelay)) + + mindelay; + } + } + nd6_dad_start((struct ifaddr *)ia, &delay); + } else { + IFA_UNLOCK(ifa); + } +done: + /* release reference held for this routine */ + if (ifa != NULL) + IFA_REMREF(ifa); + if (in6m_sol != NULL) + IN6M_REMREF(in6m_sol); + return (error); - unlink: +unlink: /* * XXX: if a change of an existing address failed, keep the entry * anyway. */ - if (hostIsNew) - in6_unlink_ifa(ia, ifp, 0); - return(error); + if (hostIsNew) { + in6_unlink_ifa(ia, ifp); + } + goto done; + +cleanup: + in6_purgeaddr(&ia->ia_ifa); + goto done; } void -in6_purgeaddr( - struct ifaddr *ifa, int nd6_locked) +in6_purgeaddr(struct ifaddr *ifa) { struct ifnet *ifp = ifa->ifa_ifp; struct in6_ifaddr *ia = (struct in6_ifaddr *) ifa; + struct in6_multi_mship *imm; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_NOTOWNED); /* stop DAD processing */ nd6_dad_stop(ifa); @@ -1625,9 +1869,11 @@ in6_purgeaddr( * delete route to the destination of the address being purged. * The interface must be p2p or loopback in this case. */ + IFA_LOCK(ifa); if ((ia->ia_flags & IFA_ROUTE) != 0 && ia->ia_dstaddr.sin6_len != 0) { int e; + IFA_UNLOCK(ifa); if ((e = rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST)) != 0) { log(LOG_ERR, "in6_purgeaddr: failed to remove " @@ -1636,73 +1882,71 @@ in6_purgeaddr( ip6_sprintf(&ia->ia_addr.sin6_addr), if_name(ifp), e); /* proceed anyway... */ - } - else + } else { + IFA_LOCK_SPIN(ifa); ia->ia_flags &= ~IFA_ROUTE; + IFA_UNLOCK(ifa); + } + } else { + IFA_UNLOCK(ifa); } + IFA_LOCK_ASSERT_NOTHELD(ifa); /* Remove ownaddr's loopback rtentry, if it exists. 
*/ - in6_ifremloop(&(ia->ia_ifa), nd6_locked); - - if (ifp->if_flags & IFF_MULTICAST) { - /* - * delete solicited multicast addr for deleting host id - */ - struct in6_multi *in6m; - struct in6_addr llsol; - bzero(&llsol, sizeof(struct in6_addr)); - llsol.s6_addr16[0] = htons(0xff02); - llsol.s6_addr16[1] = htons(ifp->if_index); - llsol.s6_addr32[1] = 0; - llsol.s6_addr32[2] = htonl(1); - llsol.s6_addr32[3] = - ia->ia_addr.sin6_addr.s6_addr32[3]; - llsol.s6_addr8[12] = 0xff; + in6_ifremloop(&(ia->ia_ifa)); - ifnet_lock_shared(ifp); - IN6_LOOKUP_MULTI(llsol, ifp, in6m); - ifnet_lock_done(ifp); - if (in6m) - in6_delmulti(in6m, nd6_locked); + /* + * leave from multicast groups we have joined for the interface + */ + IFA_LOCK(ifa); + while ((imm = ia->ia6_memberships.lh_first) != NULL) { + LIST_REMOVE(imm, i6mm_chain); + IFA_UNLOCK(ifa); + in6_leavegroup(imm); + IFA_LOCK(ifa); } + IFA_UNLOCK(ifa); - in6_unlink_ifa(ia, ifp, nd6_locked); + /* in6_unlink_ifa() will need exclusive access */ + in6_unlink_ifa(ia, ifp); in6_post_msg(ifp, KEV_INET6_ADDR_DELETED, ia); } static void -in6_unlink_ifa(ia, ifp, nd6_locked) - struct in6_ifaddr *ia; - struct ifnet *ifp; - int nd6_locked; +in6_unlink_ifa(struct in6_ifaddr *ia, struct ifnet *ifp) { - int plen, iilen; struct in6_ifaddr *oia; + struct ifaddr *ifa; + int unlinked; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_NOTOWNED); + + ifa = &ia->ia_ifa; + IFA_ADDREF(ifa); ifnet_lock_exclusive(ifp); - if_detach_ifa(ifp, &ia->ia_ifa); + IFA_LOCK(ifa); + if (ifa->ifa_debug & IFD_ATTACHED) + if_detach_ifa(ifp, ifa); + IFA_UNLOCK(ifa); ifnet_lock_done(ifp); - if (!nd6_locked) - lck_mtx_lock(nd6_mutex); + unlinked = 1; + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); oia = ia; - if (oia == (ia = in6_ifaddrs)) + if (oia == (ia = in6_ifaddrs)) { in6_ifaddrs = ia->ia_next; - else { + } else { while (ia->ia_next && (ia->ia_next != oia)) ia = ia->ia_next; - if (ia->ia_next) + if (ia->ia_next) { ia->ia_next = oia->ia_next; - else { + } else { /* 
search failed */ printf("Couldn't unlink in6_ifaddr from in6_ifaddr\n"); + unlinked = 0; } } - if (oia->ia6_ifpr) { /* check for safety */ - plen = in6_mask2len(&oia->ia_prefixmask.sin6_addr, NULL); - iilen = (sizeof(oia->ia_prefixmask.sin6_addr) << 3) - plen; - in6_prefix_remove_ifid(iilen, oia); - } /* * When an autoconfigured address is being removed, release the @@ -1710,48 +1954,76 @@ in6_unlink_ifa(ia, ifp, nd6_locked) * affect the status of other (detached) addresses, call * pfxlist_onlink_check(). */ + ifa = &oia->ia_ifa; + IFA_LOCK(ifa); if ((oia->ia6_flags & IN6_IFF_AUTOCONF) != 0) { if (oia->ia6_ndpr == NULL) { log(LOG_NOTICE, "in6_unlink_ifa: autoconf'ed address " "%p has no prefix\n", oia); } else { - oia->ia6_ndpr->ndpr_refcnt--; + struct nd_prefix *pr = oia->ia6_ndpr; + oia->ia6_flags &= ~IN6_IFF_AUTOCONF; oia->ia6_ndpr = NULL; + NDPR_LOCK(pr); + VERIFY(pr->ndpr_addrcnt != 0); + pr->ndpr_addrcnt--; + NDPR_UNLOCK(pr); + NDPR_REMREF(pr); /* release addr reference */ } - - pfxlist_onlink_check(1); - } - if (!nd6_locked) + IFA_UNLOCK(ifa); + lck_rw_done(&in6_ifaddr_rwlock); + lck_mtx_lock(nd6_mutex); + pfxlist_onlink_check(); lck_mtx_unlock(nd6_mutex); - + } else { + IFA_UNLOCK(ifa); + lck_rw_done(&in6_ifaddr_rwlock); + } /* * release another refcnt for the link from in6_ifaddrs. - * Note that we should decrement the refcnt at least once for all *BSD. + * Do this only if it's not already unlinked in the event that we lost + * the race, since in6_ifaddr_rwlock was momentarily dropped above. 
*/ - ifafree(&oia->ia_ifa); + if (unlinked) + IFA_REMREF(ifa); + /* release reference held for this routine */ + IFA_REMREF(ifa); } void -in6_purgeif(ifp) - struct ifnet *ifp; +in6_purgeif(struct ifnet *ifp) { - struct in6_ifaddr *ia, *nia = NULL; + struct in6_ifaddr *ia; - if (ifp == NULL || &ifp->if_addrlist == NULL) + if (ifp == NULL) return; - - lck_mtx_lock(nd6_mutex); - for (ia = in6_ifaddrs; ia != NULL; ia = nia) - { - nia = ia->ia_next; - if (ia->ia_ifa.ifa_ifp != ifp) + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_NOTOWNED); + + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); + ia = in6_ifaddrs; + while (ia != NULL) { + if (ia->ia_ifa.ifa_ifp != ifp) { + ia = ia->ia_next; continue; - in6_purgeaddr(&ia->ia_ifa, 1); + } + IFA_ADDREF(&ia->ia_ifa); /* for us */ + lck_rw_done(&in6_ifaddr_rwlock); + in6_purgeaddr(&ia->ia_ifa); + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); + IFA_REMREF(&ia->ia_ifa); /* for us */ + /* + * Purging the address would have caused + * in6_ifaddr_rwlock to be dropped and reacquired; + * therefore search again from the beginning + * of in6_ifaddrs list. + */ + ia = in6_ifaddrs; } - lck_mtx_unlock(nd6_mutex); + lck_rw_done(&in6_ifaddr_rwlock); in6_ifdetach(ifp); } @@ -1791,7 +2063,7 @@ in6_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data, /* sanity checks */ if (!data || !ifp) { panic("invalid argument to in6_lifaddr_ioctl"); - /*NOTRECHED*/ + /*NOTREACHED*/ } switch (cmd) { @@ -1845,9 +2117,11 @@ in6_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data, ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, 0); if (!ifa) return EADDRNOTAVAIL; + IFA_LOCK_SPIN(ifa); hostaddr = *IFA_IN6(ifa); + IFA_UNLOCK(ifa); hostid_found = 1; - ifafree(ifa); + IFA_REMREF(ifa); ifa = NULL; /* prefixlen must be <= 64. */ @@ -1855,10 +2129,10 @@ in6_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data, return EINVAL; prefixlen = iflr->prefixlen; - /* hostaddr part must be zero. */ + /* hostid part must be zero. 
*/ sin6 = (struct sockaddr_in6 *)&iflr->addr; - if (sin6->sin6_addr.s6_addr32[2] != 0 - || sin6->sin6_addr.s6_addr32[3] != 0) { + if (sin6->sin6_addr.s6_addr32[2] != 0 || + sin6->sin6_addr.s6_addr32[3] != 0) { return EINVAL; } } else @@ -1890,7 +2164,7 @@ in6_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data, } ifra.ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6); - in6_len2mask(&ifra.ifra_prefixmask.sin6_addr, prefixlen); + in6_prefixlen2mask(&ifra.ifra_prefixmask.sin6_addr, prefixlen); ifra.ifra_flags = iflr->flags & ~IFLR_PREFIX; if (!p64) { @@ -1935,7 +2209,7 @@ in6_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data, bzero(&mask, sizeof(mask)); if (iflr->flags & IFLR_PREFIX) { /* lookup a prefix rather than address. */ - in6_len2mask(&mask, iflr->prefixlen); + in6_prefixlen2mask(&mask, iflr->prefixlen); sin6 = (struct sockaddr_in6 *)&iflr->addr; bcopy(&sin6->sin6_addr, &match, sizeof(match)); @@ -1955,7 +2229,7 @@ in6_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data, cmp = 0; /* XXX */ } else { /* on deleting an address, do exact match */ - in6_len2mask(&mask, 128); + in6_prefixlen2mask(&mask, 128); sin6 = (struct sockaddr_in6 *)&iflr->addr; bcopy(&sin6->sin6_addr, &match, sizeof(match)); @@ -1966,13 +2240,18 @@ in6_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data, ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; - if (!cmp) + } + if (!cmp) { + IFA_UNLOCK(ifa); break; + } bcopy(IFA_IN6(ifa), &candidate, sizeof(candidate)); -#ifndef SCOPEDROUTING + IFA_UNLOCK(ifa); /* * XXX: this is adhoc, but is necessary to allow * a user to specify fe80::/64 (not /10) for a @@ -1980,7 +2259,6 @@ in6_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data, */ if (IN6_IS_ADDR_LINKLOCAL(&candidate)) candidate.s6_addr16[1] = 0; -#endif candidate.s6_addr32[0] &= mask.s6_addr32[0]; 
candidate.s6_addr32[1] &= mask.s6_addr32[1]; candidate.s6_addr32[2] &= mask.s6_addr32[2]; @@ -1988,30 +2266,28 @@ in6_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data, if (IN6_ARE_ADDR_EQUAL(&candidate, &match)) break; } + if (ifa != NULL) + IFA_ADDREF(ifa); ifnet_lock_done(ifp); if (!ifa) return EADDRNOTAVAIL; ia = ifa2ia6(ifa); if (cmd == SIOCGLIFADDR) { -#ifndef SCOPEDROUTING struct sockaddr_in6 *s6; -#endif + IFA_LOCK(ifa); /* fill in the if_laddrreq structure */ bcopy(&ia->ia_addr, &iflr->addr, ia->ia_addr.sin6_len); -#ifndef SCOPEDROUTING /* XXX see above */ s6 = (struct sockaddr_in6 *)&iflr->addr; if (IN6_IS_ADDR_LINKLOCAL(&s6->sin6_addr)) { s6->sin6_addr.s6_addr16[1] = 0; s6->sin6_scope_id = in6_addr2scopeid(ifp, &s6->sin6_addr); } -#endif if ((ifp->if_flags & IFF_POINTOPOINT) != 0) { bcopy(&ia->ia_dstaddr, &iflr->dstaddr, ia->ia_dstaddr.sin6_len); -#ifndef SCOPEDROUTING /* XXX see above */ s6 = (struct sockaddr_in6 *)&iflr->dstaddr; if (IN6_IS_ADDR_LINKLOCAL(&s6->sin6_addr)) { s6->sin6_addr.s6_addr16[1] = 0; @@ -2019,7 +2295,6 @@ in6_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data, in6_addr2scopeid(ifp, &s6->sin6_addr); } -#endif } else bzero(&iflr->dstaddr, sizeof(iflr->dstaddr)); @@ -2028,7 +2303,8 @@ in6_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data, NULL); iflr->flags = ia->ia6_flags; /* XXX */ - + IFA_UNLOCK(ifa); + IFA_REMREF(ifa); return 0; } else { struct in6_aliasreq ifra; @@ -2038,6 +2314,7 @@ in6_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data, bcopy(iflr->iflr_name, ifra.ifra_name, sizeof(ifra.ifra_name)); + IFA_LOCK(ifa); bcopy(&ia->ia_addr, &ifra.ifra_addr, ia->ia_addr.sin6_len); if ((ifp->if_flags & IFF_POINTOPOINT) != 0) { @@ -2051,6 +2328,8 @@ in6_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data, ia->ia_prefixmask.sin6_len); ifra.ifra_flags = ia->ia6_flags; + IFA_UNLOCK(ifa); + IFA_REMREF(ifa); if (!p64) { #if defined(__LP64__) struct in6_aliasreq_32 ifra_32; @@ -2120,24 +2399,30 @@ 
in6_ifinit(ifp, ia, sin6, newhost) ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { - if (ifa->ifa_addr == NULL) - continue; /* just for safety */ - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; + } ifacount++; + IFA_UNLOCK(ifa); } ifnet_lock_done(ifp); + ifa = &ia->ia_ifa; + IFA_LOCK_SPIN(ifa); ia->ia_addr = *sin6; - + IFA_UNLOCK(ifa); if (ifacount <= 1 && (error = ifnet_ioctl(ifp, PF_INET6, SIOCSIFADDR, ia))) { - if (error) { + if (error == EOPNOTSUPP) + error = 0; + else if (error) return(error); - } } + IFA_LOCK(ifa); ia->ia_ifa.ifa_metric = ifp->if_metric; /* we could do in(6)_socktrim here, but just omit it at this moment. */ @@ -2150,11 +2435,14 @@ in6_ifinit(ifp, ia, sin6, newhost) */ plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); /* XXX */ if (plen == 128 && ia->ia_dstaddr.sin6_family == AF_INET6) { + IFA_UNLOCK(ifa); if ((error = rtinit(&(ia->ia_ifa), (int)RTM_ADD, - RTF_UP | RTF_HOST)) != 0) + RTF_UP | RTF_HOST)) != 0) return(error); + IFA_LOCK(ifa); ia->ia_flags |= IFA_ROUTE; } + IFA_LOCK_ASSERT_HELD(ifa); if (plen < 128) { /* * The RTF_CLONING flag is necessary for in6_is_ifloop_auto(). @@ -2166,104 +2454,19 @@ in6_ifinit(ifp, ia, sin6, newhost) if (newhost) { /* set the rtrequest function to create llinfo */ ia->ia_ifa.ifa_rtrequest = nd6_rtrequest; + IFA_UNLOCK(ifa); in6_ifaddloop(&(ia->ia_ifa)); + } else { + IFA_UNLOCK(ifa); } return(error); } -/* - * Add an address to the list of IP6 multicast addresses for a - * given interface. - */ -struct in6_multi * -in6_addmulti(maddr6, ifp, errorp, nd6_locked) - struct in6_addr *maddr6; - struct ifnet *ifp; - int *errorp; - int nd6_locked; -{ - struct in6_multi *in6m; - struct sockaddr_in6 sin6; - struct ifmultiaddr *ifma; - - *errorp = 0; - - /* - * Call generic routine to add membership or increment - * refcount. 
It wants addresses in the form of a sockaddr, - * so we build one here (being careful to zero the unused bytes). - */ - bzero(&sin6, sizeof sin6); - sin6.sin6_family = AF_INET6; - sin6.sin6_len = sizeof sin6; - sin6.sin6_addr = *maddr6; - *errorp = if_addmulti(ifp, (struct sockaddr *)&sin6, &ifma); - if (*errorp) { - return 0; - } - - /* - * If ifma->ifma_protospec is null, then if_addmulti() created - * a new record. Otherwise, we are done. - */ - if (ifma->ifma_protospec != 0) - return ifma->ifma_protospec; - - /* XXX - if_addmulti uses M_WAITOK. Can this really be called - at interrupt time? If so, need to fix if_addmulti. XXX */ - in6m = (struct in6_multi *)_MALLOC(sizeof(*in6m), M_IPMADDR, M_NOWAIT); - if (in6m == NULL) { - return (NULL); - } - - bzero(in6m, sizeof *in6m); - in6m->in6m_addr = *maddr6; - in6m->in6m_ifp = ifp; - in6m->in6m_ifma = ifma; - ifma->ifma_protospec = in6m; - if (nd6_locked == 0) - lck_mtx_lock(nd6_mutex); - LIST_INSERT_HEAD(&in6_multihead, in6m, in6m_entry); - if (nd6_locked == 0) - lck_mtx_unlock(nd6_mutex); - - /* - * Let MLD6 know that we have joined a new IP6 multicast - * group. - */ - mld6_start_listening(in6m); - return(in6m); -} - -/* - * Delete a multicast address record. - */ void -in6_delmulti( - struct in6_multi *in6m, int nd6locked) +in6_purgeaddrs(struct ifnet *ifp) { - struct ifmultiaddr *ifma = in6m->in6m_ifma; - - if (ifma && ifma->ifma_usecount == 1) { - /* - * No remaining claims to this record; let MLD6 know - * that we are leaving the multicast group. - */ - mld6_stop_listening(in6m); - ifma->ifma_protospec = 0; - if (nd6locked == 0) - lck_mtx_lock(nd6_mutex); - LIST_REMOVE(in6m, in6m_entry); - if (nd6locked == 0) - lck_mtx_unlock(nd6_mutex); - FREE(in6m, M_IPMADDR); - } - /* XXX - should be separate API for when we have an ifma? 
*/ - if (ifma) { - if_delmultiaddr(ifma, 0); - ifma_release(ifma); - } + in6_purgeif(ifp); } /* @@ -2279,19 +2482,23 @@ in6ifa_ifpforlinklocal(ifp, ignoreflags) ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { - if (ifa->ifa_addr == NULL) - continue; /* just for safety */ - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; + } if (IN6_IS_ADDR_LINKLOCAL(IFA_IN6(ifa))) { if ((((struct in6_ifaddr *)ifa)->ia6_flags & - ignoreflags) != 0) + ignoreflags) != 0) { + IFA_UNLOCK(ifa); continue; + } + IFA_ADDREF_LOCKED(ifa); /* for caller */ + IFA_UNLOCK(ifa); break; } + IFA_UNLOCK(ifa); } - if (ifa != NULL) - ifaref(ifa); ifnet_lock_done(ifp); return((struct in6_ifaddr *)ifa); @@ -2310,15 +2517,18 @@ in6ifa_ifpwithaddr(ifp, addr) ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { - if (ifa->ifa_addr == NULL) - continue; /* just for safety */ - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; - if (IN6_ARE_ADDR_EQUAL(addr, IFA_IN6(ifa))) + } + if (IN6_ARE_ADDR_EQUAL(addr, IFA_IN6(ifa))) { + IFA_ADDREF_LOCKED(ifa); /* for caller */ + IFA_UNLOCK(ifa); break; + } + IFA_UNLOCK(ifa); } - if (ifa != NULL) - ifaref(ifa); ifnet_lock_done(ifp); return((struct in6_ifaddr *)ifa); @@ -2385,7 +2595,7 @@ in6addr_local(struct in6_addr *in6) struct sockaddr_in6 sin6; int local = 0; - if (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_ADDR_LINKLOCAL(in6)) + if (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_SCOPE_LINKLOCAL(in6)) return (1); sin6.sin6_family = AF_INET6; @@ -2406,48 +2616,48 @@ in6addr_local(struct in6_addr *in6) } int -in6_localaddr(in6) - struct in6_addr *in6; +in6_localaddr(struct in6_addr *in6) { struct in6_ifaddr *ia; if (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_ADDR_LINKLOCAL(in6)) - return 1; + return (1); - lck_mtx_lock(nd6_mutex); - for (ia = in6_ifaddrs; ia; ia = 
ia->ia_next) + lck_rw_lock_shared(&in6_ifaddr_rwlock); + for (ia = in6_ifaddrs; ia; ia = ia->ia_next) { + IFA_LOCK_SPIN(&ia->ia_ifa); if (IN6_ARE_MASKED_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr, - &ia->ia_prefixmask.sin6_addr)) { - lck_mtx_unlock(nd6_mutex); - return 1; + &ia->ia_prefixmask.sin6_addr)) { + IFA_UNLOCK(&ia->ia_ifa); + lck_rw_done(&in6_ifaddr_rwlock); + return (1); } - - lck_mtx_unlock(nd6_mutex); + IFA_UNLOCK(&ia->ia_ifa); + } + lck_rw_done(&in6_ifaddr_rwlock); return (0); } int -in6_is_addr_deprecated(sa6) - struct sockaddr_in6 *sa6; +in6_is_addr_deprecated(struct sockaddr_in6 *sa6) { struct in6_ifaddr *ia; - lck_mtx_lock(nd6_mutex); + lck_rw_lock_shared(&in6_ifaddr_rwlock); for (ia = in6_ifaddrs; ia; ia = ia->ia_next) { + IFA_LOCK_SPIN(&ia->ia_ifa); if (IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, - &sa6->sin6_addr) && -#if SCOPEDROUTING - ia->ia_addr.sin6_scope_id == sa6->sin6_scope_id && -#endif + &sa6->sin6_addr) && (ia->ia6_flags & IN6_IFF_DEPRECATED) != 0) { - lck_mtx_unlock(nd6_mutex); + IFA_UNLOCK(&ia->ia_ifa); + lck_rw_done(&in6_ifaddr_rwlock); return(1); /* true */ } - /* XXX: do we still have to go thru the rest of the list? */ + IFA_UNLOCK(&ia->ia_ifa); } - lck_mtx_unlock(nd6_mutex); + lck_rw_done(&in6_ifaddr_rwlock); return(0); /* false */ } @@ -2542,9 +2752,7 @@ in6_ifawithscope( struct in6_ifaddr *ifa_best = NULL; if (oifp == NULL) { -#if 0 - printf("in6_ifawithscope: output interface is not specified\n"); -#endif + /* output interface is not specified */ return(NULL); } @@ -2567,9 +2775,11 @@ in6_ifawithscope( { int tlen = -1, dscopecmp, bscopecmp, matchcmp; - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; - + } src_scope = in6_addrscope(IFA_IN6(ifa)); /* @@ -2577,18 +2787,21 @@ in6_ifawithscope( * nor a duplicated address. 
*/ if (((struct in6_ifaddr *)ifa)->ia6_flags & - IN6_IFF_NOTREADY) + IN6_IFF_NOTREADY) { + IFA_UNLOCK(ifa); continue; - + } /* XXX: is there any case to allow anycasts? */ if (((struct in6_ifaddr *)ifa)->ia6_flags & - IN6_IFF_ANYCAST) + IN6_IFF_ANYCAST) { + IFA_UNLOCK(ifa); continue; - + } if (((struct in6_ifaddr *)ifa)->ia6_flags & - IN6_IFF_DETACHED) + IN6_IFF_DETACHED) { + IFA_UNLOCK(ifa); continue; - + } /* * If this is the first address we find, * keep it anyway. @@ -2620,9 +2833,10 @@ in6_ifawithscope( IN6_ARE_SCOPE_CMP(src_scope, dst_scope) >= 0) goto replace; /* (A) */ if (IN6_ARE_SCOPE_CMP(src_scope, dst_scope) < 0 && - IN6_ARE_SCOPE_CMP(best_scope, dst_scope) >= 0) + IN6_ARE_SCOPE_CMP(best_scope, dst_scope) >= 0) { + IFA_UNLOCK(ifa); continue; /* (B) */ - + } /* * A deprecated address SHOULD NOT be used in new * communications if an alternate (non-deprecated) @@ -2635,16 +2849,19 @@ in6_ifawithscope( * Ignore any deprecated addresses if * specified by configuration. */ - if (!ip6_use_deprecated) + if (!ip6_use_deprecated) { + IFA_UNLOCK(ifa); continue; - + } /* * If we have already found a non-deprecated * candidate, just ignore deprecated addresses. */ if ((ifa_best->ia6_flags & IN6_IFF_DEPRECATED) - == 0) + == 0) { + IFA_UNLOCK(ifa); continue; + } } /* @@ -2660,7 +2877,7 @@ in6_ifawithscope( /* * When we use temporary addresses described in - * RFC 3041, we prefer temporary addresses to + * RFC 4941, we prefer temporary addresses to * public autoconf addresses. Again, note the * invariants from (A) and (B). 
Also note that we * don't have any preference between static addresses @@ -2685,6 +2902,7 @@ in6_ifawithscope( (ifat->ia6_flags & (IN6_IFF_AUTOCONF|IN6_IFF_TEMPORARY)) == IN6_IFF_AUTOCONF) { + IFA_UNLOCK(ifa); continue; } } @@ -2745,8 +2963,10 @@ in6_ifawithscope( if (bscopecmp == 0) { struct ifnet *bifp = ifa_best->ia_ifp; - if (bifp == oifp && ifp != oifp) /* (1) */ + if (bifp == oifp && ifp != oifp) { /* (1) */ + IFA_UNLOCK(ifa); continue; + } if (bifp != oifp && ifp == oifp) /* (2) */ goto replace; @@ -2761,16 +2981,20 @@ in6_ifawithscope( matchcmp = tlen - blen; if (matchcmp > 0) /* (3) */ goto replace; + IFA_UNLOCK(ifa); continue; /* (4) */ } if (dscopecmp > 0) { - if (bscopecmp > 0) /* (5) */ + if (bscopecmp > 0) { /* (5) */ + IFA_UNLOCK(ifa); continue; + } goto replace; /* (6) */ } if (dscopecmp < 0) { if (bscopecmp > 0) /* (7) */ goto replace; + IFA_UNLOCK(ifa); continue; /* (8) */ } @@ -2778,14 +3002,15 @@ in6_ifawithscope( if (bscopecmp < 0) goto replace; /* (9) */ - replace: - ifaref(ifa); - if (ifa_best) - ifafree(&ifa_best->ia_ifa); - ifa_best = (struct in6_ifaddr *)ifa; +replace: + IFA_ADDREF_LOCKED(ifa); /* for ifa_best */ blen = tlen >= 0 ? 
tlen : in6_matchlen(IFA_IN6(ifa), dst); - best_scope = in6_addrscope(&ifa_best->ia_addr.sin6_addr); + best_scope = in6_addrscope(&ifa2ia6(ifa)->ia_addr.sin6_addr); + IFA_UNLOCK(ifa); + if (ifa_best) + IFA_REMREF(&ifa_best->ia_ifa); + ifa_best = (struct in6_ifaddr *)ifa; } ifnet_lock_done(ifp); } @@ -2795,6 +3020,7 @@ in6_ifawithscope( if (ifa_best == NULL) ip6stat.ip6s_sources_none++; else { + IFA_LOCK_SPIN(&ifa_best->ia_ifa); if (oifp == ifa_best->ia_ifp) ip6stat.ip6s_sources_sameif[best_scope]++; else @@ -2807,6 +3033,7 @@ in6_ifawithscope( if ((ifa_best->ia6_flags & IN6_IFF_DEPRECATED) != 0) ip6stat.ip6s_sources_deprecated[best_scope]++; + IFA_UNLOCK(&ifa_best->ia_ifa); } return(ifa_best); @@ -2823,7 +3050,7 @@ in6_ifawithifp( { int dst_scope = in6_addrscope(dst), blen = -1, tlen; struct ifaddr *ifa; - struct in6_ifaddr *besta = 0; + struct in6_ifaddr *besta = NULL; struct in6_ifaddr *dep[2]; /* last-resort: deprecated */ dep[0] = dep[1] = NULL; @@ -2837,20 +3064,32 @@ in6_ifawithifp( ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; - if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST) + } + if (ifa2ia6(ifa)->ia6_flags & IN6_IFF_ANYCAST) { + IFA_UNLOCK(ifa); continue; /* XXX: is there any case to allow anycast? 
*/ - if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY) + } + if (ifa2ia6(ifa)->ia6_flags & IN6_IFF_NOTREADY) { + IFA_UNLOCK(ifa); continue; /* don't use this interface */ - if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED) + } + if (ifa2ia6(ifa)->ia6_flags & IN6_IFF_DETACHED) { + IFA_UNLOCK(ifa); continue; - if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) { + } + if (ifa2ia6(ifa)->ia6_flags & IN6_IFF_DEPRECATED) { if (ip6_use_deprecated) { + IFA_ADDREF_LOCKED(ifa); /* for dep[0] */ + IFA_UNLOCK(ifa); if (dep[0] != NULL) - ifafree(&dep[0]->ia_ifa); + IFA_REMREF(&dep[0]->ia_ifa); dep[0] = (struct in6_ifaddr *)ifa; - ifaref(ifa); + } else { + IFA_UNLOCK(ifa); } continue; } @@ -2860,51 +3099,77 @@ in6_ifawithifp( * call in6_matchlen() as few as possible */ if (besta) { - if (blen == -1) + if (blen == -1) { + IFA_UNLOCK(ifa); + IFA_LOCK(&besta->ia_ifa); blen = in6_matchlen(&besta->ia_addr.sin6_addr, dst); + IFA_UNLOCK(&besta->ia_ifa); + IFA_LOCK(ifa); + } tlen = in6_matchlen(IFA_IN6(ifa), dst); if (tlen > blen) { blen = tlen; + IFA_ADDREF_LOCKED(ifa); /* for besta */ + IFA_UNLOCK(ifa); + IFA_REMREF(&besta->ia_ifa); besta = (struct in6_ifaddr *)ifa; + } else { + IFA_UNLOCK(ifa); } - } else + } else { besta = (struct in6_ifaddr *)ifa; + IFA_ADDREF_LOCKED(ifa); /* for besta */ + IFA_UNLOCK(ifa); + } + } else { + IFA_UNLOCK(ifa); } } if (besta) { - ifaref(&besta->ia_ifa); ifnet_lock_done(ifp); if (dep[0] != NULL) - ifafree(&dep[0]->ia_ifa); + IFA_REMREF(&dep[0]->ia_ifa); return(besta); } TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; - if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST) + } + if (ifa2ia6(ifa)->ia6_flags & IN6_IFF_ANYCAST) { + IFA_UNLOCK(ifa); continue; /* XXX: is there any case to allow anycast? 
*/ - if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY) + } + if (ifa2ia6(ifa)->ia6_flags & IN6_IFF_NOTREADY) { + IFA_UNLOCK(ifa); continue; /* don't use this interface */ - if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED) + } + if (ifa2ia6(ifa)->ia6_flags & IN6_IFF_DETACHED) { + IFA_UNLOCK(ifa); continue; - if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) { + } + if (ifa2ia6(ifa)->ia6_flags & IN6_IFF_DEPRECATED) { if (ip6_use_deprecated) { + IFA_ADDREF_LOCKED(ifa); /* for dep[1] */ + IFA_UNLOCK(ifa); if (dep[1] != NULL) - ifafree(&dep[1]->ia_ifa); + IFA_REMREF(&dep[1]->ia_ifa); dep[1] = (struct in6_ifaddr *)ifa; - ifaref(ifa); + } else { + IFA_UNLOCK(ifa); } continue; } - if (ifa != NULL) - ifaref(ifa); + IFA_ADDREF_LOCKED(ifa); /* for caller */ + IFA_UNLOCK(ifa); ifnet_lock_done(ifp); if (dep[0] != NULL) - ifafree(&dep[0]->ia_ifa); + IFA_REMREF(&dep[0]->ia_ifa); if (dep[1] != NULL) - ifafree(&dep[1]->ia_ifa); + IFA_REMREF(&dep[1]->ia_ifa); return (struct in6_ifaddr *)ifa; } ifnet_lock_done(ifp); @@ -2912,7 +3177,7 @@ in6_ifawithifp( /* use the last-resort values, that are, deprecated addresses */ if (dep[0]) { if (dep[1] != NULL) - ifafree(&dep[1]->ia_ifa); + IFA_REMREF(&dep[1]->ia_ifa); return dep[0]; } if (dep[1]) @@ -2945,14 +3210,22 @@ in6_if_up( return error; dad_delay = 0; + ifnet_lock_exclusive(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; + } ia = (struct in6_ifaddr *)ifa; - if (ia->ia6_flags & IN6_IFF_TENTATIVE) + if (ia->ia6_flags & IN6_IFF_TENTATIVE) { + IFA_UNLOCK(ifa); nd6_dad_start(ifa, &dad_delay); + } else { + IFA_UNLOCK(ifa); + } } ifnet_lock_done(ifp); @@ -2966,6 +3239,15 @@ in6if_do_dad( if ((ifp->if_flags & IFF_LOOPBACK) != 0) return(0); + /* + * Skip DAD on service triggered interfaces, for now, + * until we have support for Opportunistic Duplicate + * 
Address Detection [RFC 4429] and we can then back + * this out. + */ + if (ifp->if_eflags & IFEF_SERVICE_TRIGGERED) + return (0); + switch (ifp->if_type) { #if IFT_DUMMY case IFT_DUMMY: @@ -3018,7 +3300,69 @@ in6_setmaxmtu() if (maxmtu) /* update only when maxmtu is positive */ in6_maxmtu = maxmtu; } - +/* + * Provide the length of interface identifiers to be used for the link attached + * to the given interface. The length should be defined in "IPv6 over + * xxx-link" document. Note that address architecture might also define + * the length for a particular set of address prefixes, regardless of the + * link type. As clarified in rfc2462bis, those two definitions should be + * consistent, and those really are as of August 2004. + */ +int +in6_if2idlen(struct ifnet *ifp) +{ + switch (ifp->if_type) { + case IFT_ETHER: /* RFC2464 */ + case IFT_IEEE8023ADLAG: /* IEEE802.3ad Link Aggregate */ +#ifdef IFT_PROPVIRTUAL + case IFT_PROPVIRTUAL: /* XXX: no RFC. treat it as ether */ +#endif +#ifdef IFT_L2VLAN + case IFT_L2VLAN: /* ditto */ +#endif +#ifdef IFT_IEEE80211 + case IFT_IEEE80211: /* ditto */ +#endif +#ifdef IFT_MIP + case IFT_MIP: /* ditto */ +#endif + return (64); + case IFT_FDDI: /* RFC2467 */ + return (64); + case IFT_ISO88025: /* RFC2470 (IPv6 over Token Ring) */ + return (64); + case IFT_PPP: /* RFC2472 */ + return (64); + case IFT_ARCNET: /* RFC2497 */ + return (64); + case IFT_FRELAY: /* RFC2590 */ + return (64); + case IFT_IEEE1394: /* RFC3146 */ + return (64); + case IFT_GIF: + return (64); /* draft-ietf-v6ops-mech-v2-07 */ + case IFT_LOOP: + return (64); /* XXX: is this really correct? */ + case IFT_OTHER: + return (64); /* for utun interfaces */ + case IFT_CELLULAR: + return (64); /* Packet Data over Cellular */ + default: + /* + * Unknown link type: + * It might be controversial to use the today's common constant + * of 64 for these cases unconditionally. For full compliance, + * we should return an error in this case. 
On the other hand, + * if we simply miss the standard for the link type or a new + * standard is defined for a new link type, the IFID length + * is very likely to be the common constant. As a compromise, + * we always use the constant, but make an explicit notice + * indicating the "unknown" case. + */ + printf("in6_if2idlen: unknown link type (%d)\n", ifp->if_type); + return (64); + } +} /* * Convert sockaddr_in6 to sockaddr_in. Original sockaddr_in6 must be * v4 mapped addr or v4 compat addr @@ -3030,7 +3374,7 @@ in6_sin6_2_sin(struct sockaddr_in *sin, struct sockaddr_in6 *sin6) sin->sin_len = sizeof(struct sockaddr_in); sin->sin_family = AF_INET; sin->sin_port = sin6->sin6_port; - sin->sin_addr.s_addr = sin6->sin6_addr.s6_addr32[3]; + sin->sin_addr.s_addr = sin6->sin6_addr.s6_addr32[3]; } /* Convert sockaddr_in to sockaddr_in6 in v4 mapped addr format. */ @@ -3096,11 +3440,14 @@ in6_post_msg(struct ifnet *ifp, u_int32_t event_code, struct in6_ifaddr *ifa) struct kev_msg ev_msg; struct kev_in6_data in6_event_data; + bzero(&in6_event_data, sizeof(struct kev_in6_data)); + bzero(&ev_msg, sizeof(struct kev_msg)); ev_msg.vendor_code = KEV_VENDOR_APPLE; ev_msg.kev_class = KEV_NETWORK_CLASS; ev_msg.kev_subclass = KEV_INET6_SUBCLASS; ev_msg.event_code = event_code; + IFA_LOCK(&ifa->ia_ifa); in6_event_data.ia_addr = ifa->ia_addr; in6_event_data.ia_net = ifa->ia_net; in6_event_data.ia_dstaddr = ifa->ia_dstaddr; @@ -3116,6 +3463,7 @@ in6_post_msg(struct ifnet *ifp, u_int32_t event_code, struct in6_ifaddr *ifa) ifa->ia6_lifetime.ia6t_vltime; in6_event_data.ia_lifetime.ia6t_pltime = ifa->ia6_lifetime.ia6t_pltime; + IFA_UNLOCK(&ifa->ia_ifa); if (ifp != NULL) { strncpy(&in6_event_data.link_data.if_name[0], @@ -3137,6 +3485,8 @@ in6_post_msg(struct ifnet *ifp, u_int32_t event_code, struct in6_ifaddr *ifa) void in6_ifaddr_init(void) { + in6_multi_init(); + PE_parse_boot_argn("ifa_debug", &in6ifa_debug, sizeof (in6ifa_debug)); in6ifa_size = (in6ifa_debug == 0) ? 
sizeof (struct in6_ifaddr) : @@ -3144,10 +3494,15 @@ in6_ifaddr_init(void) in6ifa_zone = zinit(in6ifa_size, IN6IFA_ZONE_MAX * in6ifa_size, 0, IN6IFA_ZONE_NAME); - if (in6ifa_zone == NULL) + if (in6ifa_zone == NULL) { panic("%s: failed allocating %s", __func__, IN6IFA_ZONE_NAME); - + /* NOTREACHED */ + } zone_change(in6ifa_zone, Z_EXPAND, TRUE); + zone_change(in6ifa_zone, Z_CALLERACCT, FALSE); + + lck_mtx_init(&in6ifa_trash_lock, ifa_mtx_grp, ifa_mtx_attr); + TAILQ_INIT(&in6ifa_trash_head); } static struct in6_ifaddr * @@ -3161,11 +3516,14 @@ in6_ifaddr_alloc(int how) bzero(in6ifa, in6ifa_size); in6ifa->ia_ifa.ifa_free = in6_ifaddr_free; in6ifa->ia_ifa.ifa_debug |= IFD_ALLOC; + ifa_lock_init(&in6ifa->ia_ifa); if (in6ifa_debug != 0) { struct in6_ifaddr_dbg *in6ifa_dbg = (struct in6_ifaddr_dbg *)in6ifa; in6ifa->ia_ifa.ifa_debug |= IFD_DEBUG; in6ifa->ia_ifa.ifa_trace = in6_ifaddr_trace; + in6ifa->ia_ifa.ifa_attached = in6_ifaddr_attached; + in6ifa->ia_ifa.ifa_detached = in6_ifaddr_detached; ctrace_record(&in6ifa_dbg->in6ifa_alloc); } } @@ -3175,22 +3533,80 @@ in6_ifaddr_alloc(int how) static void in6_ifaddr_free(struct ifaddr *ifa) { - if (ifa->ifa_refcnt != 0) + IFA_LOCK_ASSERT_HELD(ifa); + + if (ifa->ifa_refcnt != 0) { panic("%s: ifa %p bad ref cnt", __func__, ifa); - if (!(ifa->ifa_debug & IFD_ALLOC)) + /* NOTREACHED */ + } else if (!(ifa->ifa_debug & IFD_ALLOC)) { panic("%s: ifa %p cannot be freed", __func__, ifa); - + /* NOTREACHED */ + } if (ifa->ifa_debug & IFD_DEBUG) { struct in6_ifaddr_dbg *in6ifa_dbg = (struct in6_ifaddr_dbg *)ifa; ctrace_record(&in6ifa_dbg->in6ifa_free); bcopy(&in6ifa_dbg->in6ifa, &in6ifa_dbg->in6ifa_old, sizeof (struct in6_ifaddr)); + if (ifa->ifa_debug & IFD_TRASHED) { + /* Become a regular mutex, just in case */ + IFA_CONVERT_LOCK(ifa); + lck_mtx_lock(&in6ifa_trash_lock); + TAILQ_REMOVE(&in6ifa_trash_head, in6ifa_dbg, + in6ifa_trash_link); + lck_mtx_unlock(&in6ifa_trash_lock); + ifa->ifa_debug &= ~IFD_TRASHED; + } } + IFA_UNLOCK(ifa); + 
ifa_lock_destroy(ifa); bzero(ifa, sizeof (struct in6_ifaddr)); zfree(in6ifa_zone, ifa); } +static void +in6_ifaddr_attached(struct ifaddr *ifa) +{ + struct in6_ifaddr_dbg *in6ifa_dbg = (struct in6_ifaddr_dbg *)ifa; + + IFA_LOCK_ASSERT_HELD(ifa); + + if (!(ifa->ifa_debug & IFD_DEBUG)) { + panic("%s: ifa %p has no debug structure", __func__, ifa); + /* NOTREACHED */ + } + if (ifa->ifa_debug & IFD_TRASHED) { + /* Become a regular mutex, just in case */ + IFA_CONVERT_LOCK(ifa); + lck_mtx_lock(&in6ifa_trash_lock); + TAILQ_REMOVE(&in6ifa_trash_head, in6ifa_dbg, in6ifa_trash_link); + lck_mtx_unlock(&in6ifa_trash_lock); + ifa->ifa_debug &= ~IFD_TRASHED; + } +} + +static void +in6_ifaddr_detached(struct ifaddr *ifa) +{ + struct in6_ifaddr_dbg *in6ifa_dbg = (struct in6_ifaddr_dbg *)ifa; + + IFA_LOCK_ASSERT_HELD(ifa); + + if (!(ifa->ifa_debug & IFD_DEBUG)) { + panic("%s: ifa %p has no debug structure", __func__, ifa); + /* NOTREACHED */ + } else if (ifa->ifa_debug & IFD_TRASHED) { + panic("%s: ifa %p is already in trash list", __func__, ifa); + /* NOTREACHED */ + } + ifa->ifa_debug |= IFD_TRASHED; + /* Become a regular mutex, just in case */ + IFA_CONVERT_LOCK(ifa); + lck_mtx_lock(&in6ifa_trash_lock); + TAILQ_INSERT_TAIL(&in6ifa_trash_head, in6ifa_dbg, in6ifa_trash_link); + lck_mtx_unlock(&in6ifa_trash_lock); +} + static void in6_ifaddr_trace(struct ifaddr *ifa, int refhold) { @@ -3199,9 +3615,10 @@ in6_ifaddr_trace(struct ifaddr *ifa, int refhold) u_int32_t idx; u_int16_t *cnt; - if (!(ifa->ifa_debug & IFD_DEBUG)) + if (!(ifa->ifa_debug & IFD_DEBUG)) { panic("%s: ifa %p has no debug structure", __func__, ifa); - + /* NOTREACHED */ + } if (refhold) { cnt = &in6ifa_dbg->in6ifa_refhold_cnt; tr = in6ifa_dbg->in6ifa_refhold; @@ -3210,6 +3627,6 @@ in6_ifaddr_trace(struct ifaddr *ifa, int refhold) tr = in6ifa_dbg->in6ifa_refrele; } - idx = OSAddAtomic16(1, (volatile SInt16 *)cnt) % CTRACE_HIST_SIZE; + idx = atomic_add_16_ov(cnt, 1) % IN6IFA_TRACE_HIST_SIZE; ctrace_record(&tr[idx]); 
} diff --git a/bsd/netinet6/in6.h b/bsd/netinet6/in6.h index fb0479fde..c0838ec43 100644 --- a/bsd/netinet6/in6.h +++ b/bsd/netinet6/in6.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -114,7 +114,7 @@ typedef __uint8_t sa_family_t; * has the table of implementation/integration differences. */ #define __KAME__ -#define __KAME_VERSION "20010528/apple-darwin" +#define __KAME_VERSION "2009/apple-darwin" #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) /* @@ -192,6 +192,10 @@ struct sockaddr_in6 { * Local definition for masks */ #define IN6MASK0 {{{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }}} +#define IN6MASK7 {{{ 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }}} +#define IN6MASK16 {{{ 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }}} #define IN6MASK32 {{{ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, \ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }}} #define IN6MASK64 {{{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, \ @@ -206,6 +210,8 @@ struct sockaddr_in6 { extern const struct sockaddr_in6 sa6_any; extern const struct in6_addr in6mask0; +extern const struct in6_addr in6mask7; +extern const struct in6_addr in6mask16; extern const struct in6_addr in6mask32; extern const struct in6_addr in6mask64; extern const struct in6_addr in6mask96; @@ -250,12 +256,21 @@ extern const struct in6_addr in6mask128; #define IN6ADDR_NODELOCAL_ALLNODES_INIT \ {{{ 0xff, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 }}} +#define IN6ADDR_INTFACELOCAL_ALLNODES_INIT \ + {{{ 0xff, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 }}} #define IN6ADDR_LINKLOCAL_ALLNODES_INIT \ {{{ 0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x01 }}} #define IN6ADDR_LINKLOCAL_ALLROUTERS_INIT \ {{{ 0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02 }}} +#define IN6ADDR_LINKLOCAL_ALLV2ROUTERS_INIT \ + {{{ 0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x16 }}} +#define IN6ADDR_V4MAPPED_INIT \ + {{{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ + 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 }}} #endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ extern const struct in6_addr in6addr_any; @@ -264,6 +279,7 @@ extern const struct in6_addr in6addr_loopback; extern const struct in6_addr in6addr_nodelocal_allnodes; extern const struct in6_addr in6addr_linklocal_allnodes; extern const struct in6_addr in6addr_linklocal_allrouters; +extern const struct in6_addr in6addr_linklocal_allv2routers; #endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ /* @@ -326,6 +342,11 @@ extern const struct in6_addr in6addr_linklocal_allrouters; (*(const __uint32_t *)(const void *)(&(a)->s6_addr[4]) == 0) && \ (*(const __uint32_t *)(const void *)(&(a)->s6_addr[8]) == ntohl(0x0000ffff))) +/* + * 6to4 + */ +#define IN6_IS_ADDR_6TO4(x) (ntohs((x)->s6_addr16[0]) == 0x2002) + /* * KAME Scope Values */ @@ -339,6 +360,7 @@ extern const struct in6_addr in6addr_linklocal_allrouters; #define IPV6_ADDR_SCOPE_GLOBAL 0x0e #else #define __IPV6_ADDR_SCOPE_NODELOCAL 0x01 +#define __IPV6_ADDR_SCOPE_INTFACELOCAL 0x01 #define __IPV6_ADDR_SCOPE_LINKLOCAL 0x02 #define __IPV6_ADDR_SCOPE_SITELOCAL 0x05 #define __IPV6_ADDR_SCOPE_ORGLOCAL 0x08 /* just used in this file */ @@ -359,6 +381,11 @@ extern const struct in6_addr in6addr_linklocal_allrouters; */ #define IN6_IS_ADDR_MULTICAST(a) ((a)->s6_addr[0] == 0xff) +/* + * Unique Local IPv6 Unicast Addresses (per RFC 4193) + */ +#define IN6_IS_ADDR_UNIQUE_LOCAL(a) (((a)->s6_addr[0] == 0xfc) || ((a)->s6_addr[0] == 0xfd)) + #ifdef KERNEL /*XXX nonstandard*/ #define IPV6_ADDR_MC_SCOPE(a) 
((a)->s6_addr[1] & 0x0f) #else @@ -450,6 +477,35 @@ struct route_in6 { */ /* no hdrincl */ #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +/* + * RFC 3542 defines the following socket options in a manner incompatible + * with RFC 2292: + * IPV6_PKTINFO + * IPV6_HOPLIMIT + * IPV6_NEXTHOP + * IPV6_HOPOPTS + * IPV6_DSTOPTS + * IPV6_RTHDR + * + * To use the new IPv6 Sockets options introduced by RFC 3542 + * the constant __APPLE_USE_RFC_3542 must be defined before + * including <netinet/in.h> + * + * To use the old IPv6 Sockets options from RFC 2292 + * the constant __APPLE_USE_RFC_2292 must be defined before + * including <netinet/in.h> + * + * Note that eventually RFC 3542 is going to be the + * default and RFC 2292 will be obsolete. + */ +#ifdef XNU_KERNEL_PRIVATE +#define __APPLE_USE_RFC_3542 1 +#endif /* XNU_KERNEL_PRIVATE */ + +#if defined(__APPLE_USE_RFC_3542) && defined(__APPLE_USE_RFC_2292) +#error "__APPLE_USE_RFC_3542 and __APPLE_USE_RFC_2292 cannot be both defined" +#endif + #if 0 /* the followings are relic in IPv4 and hence are disabled */ #define IPV6_OPTIONS 1 /* buf/ip6_opts; set/get IP6 options */ #define IPV6_RECVOPTS 5 /* bool; receive all IP6 opts w/dgram */ @@ -469,14 +525,24 @@ struct route_in6 { #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) #define IPV6_PORTRANGE 14 /* int; range to choose for unspec port */ #define ICMP6_FILTER 18 /* icmp6_filter; icmp6 filter */ -/* RFC2292 options */ -#define IPV6_PKTINFO 19 /* bool; send/recv if, src/dst addr */ -#define IPV6_HOPLIMIT 20 /* bool; hop limit */ -#define IPV6_NEXTHOP 21 /* bool; next hop addr */ -#define IPV6_HOPOPTS 22 /* bool; hop-by-hop option */ -#define IPV6_DSTOPTS 23 /* bool; destination option */ -#define IPV6_RTHDR 24 /* bool; routing header */ -#define IPV6_PKTOPTIONS 25 /* buf/cmsghdr; set/get IPv6 options */ +#define IPV6_2292PKTINFO 19 /* bool; send/recv if, src/dst addr */ +#define IPV6_2292HOPLIMIT 20 /* bool; hop limit */ +#define IPV6_2292NEXTHOP 21 /* bool; next hop addr */ +#define
IPV6_2292HOPOPTS 22 /* bool; hop-by-hop option */ +#define IPV6_2292DSTOPTS 23 /* bool; destination option */ +#define IPV6_2292RTHDR 24 /* ip6_rthdr: routing header */ +#define IPV6_2292PKTOPTIONS 25 /* buf/cmsghdr; set/get IPv6 options */ + /* obsoleted by RFC3542 */ + +#ifdef __APPLE_USE_RFC_2292 +#define IPV6_PKTINFO IPV6_2292PKTINFO +#define IPV6_HOPLIMIT IPV6_2292HOPLIMIT +#define IPV6_NEXTHOP IPV6_2292NEXTHOP +#define IPV6_HOPOPTS IPV6_2292HOPOPTS +#define IPV6_DSTOPTS IPV6_2292DSTOPTS +#define IPV6_RTHDR IPV6_2292RTHDR +#define IPV6_PKTOPTIONS IPV6_2292PKTOPTIONS +#endif /* __APPLE_USE_RFC_2292 */ #define IPV6_CHECKSUM 26 /* int; checksum offset for raw socket */ #endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ @@ -500,8 +566,80 @@ struct route_in6 { #define IPV6_FW_GET 34 /* get entire firewall rule chain */ #endif /* 1 */ -#define IPV6_RECVTCLASS 35 /* bool; recv traffic class values */ -#define IPV6_TCLASS 36 /* int; send traffic class value */ +/* APPLE: NOTE the value of those 2 options is kept unchanged from + * previous version of darwin/OS X for binary compatibility reasons + * and differs from FreeBSD (values 57 and 61). See below.
+ */ +#define IPV6_RECVTCLASS 35 /* bool; recv traffic class values */ +#define IPV6_TCLASS 36 /* int; send traffic class value */ + +#ifdef __APPLE_USE_RFC_3542 +/* new socket options introduced in RFC3542 */ +#define IPV6_RTHDRDSTOPTS 57 /* ip6_dest; send dst option before rthdr + * APPLE: Value purposely different than FreeBSD (35) to avoid + * collision with definition of IPV6_RECVTCLASS in previous + * darwin implementations */ + +#define IPV6_RECVPKTINFO 61 /* bool; recv if, dst addr + * APPLE: Value purposely different than FreeBSD (36) to avoid + * collision with definition of IPV6_TCLASS in previous + * darwin implementations */ + +#define IPV6_RECVHOPLIMIT 37 /* bool; recv hop limit */ +#define IPV6_RECVRTHDR 38 /* bool; recv routing header */ +#define IPV6_RECVHOPOPTS 39 /* bool; recv hop-by-hop option */ +#define IPV6_RECVDSTOPTS 40 /* bool; recv dst option after rthdr */ +#ifdef KERNEL +#define IPV6_RECVRTHDRDSTOPTS 41 /* bool; recv dst option before rthdr */ +#endif + +#define IPV6_USE_MIN_MTU 42 /* bool; send packets at the minimum MTU */ +#define IPV6_RECVPATHMTU 43 /* bool; notify an according MTU */ + +#define IPV6_PATHMTU 44 /* mtuinfo; get the current path MTU (sopt), + 4 bytes int; MTU notification (cmsg) */ +#if 0 /*obsoleted during 2292bis -> 3542*/ +#define IPV6_REACHCONF 45 /* no data; ND reachability confirm + (cmsg only/not in of RFC3542) */ +#endif +/* more new socket options introduced in RFC3542 */ +#define IPV6_3542PKTINFO 46 /* in6_pktinfo; send if, src addr */ +#define IPV6_3542HOPLIMIT 47 /* int; send hop limit */ +#define IPV6_3542NEXTHOP 48 /* sockaddr; next hop addr */ +#define IPV6_3542HOPOPTS 49 /* ip6_hbh; send hop-by-hop option */ +#define IPV6_3542DSTOPTS 50 /* ip6_dest; send dst option before rthdr */ +#define IPV6_3542RTHDR 51 /* ip6_rthdr; send routing header */ + +#define IPV6_PKTINFO IPV6_3542PKTINFO +#define IPV6_HOPLIMIT IPV6_3542HOPLIMIT +#define IPV6_NEXTHOP IPV6_3542NEXTHOP +#define IPV6_HOPOPTS IPV6_3542HOPOPTS
+#define IPV6_DSTOPTS IPV6_3542DSTOPTS +#define IPV6_RTHDR IPV6_3542RTHDR + +#define IPV6_AUTOFLOWLABEL 59 /* bool; attach flowlabel automagically */ + +#define IPV6_DONTFRAG 62 /* bool; disable IPv6 fragmentation */ + +#define IPV6_PREFER_TEMPADDR 63 /* int; prefer temporary addresses as + * the source address. + */ + +/* + * The following option is private; do not use it from user applications. + * It is deliberately defined to the same value as IP_MSFILTER. + */ +#define IPV6_MSFILTER 74 /* struct __msfilterreq; + * set/get multicast source filter list. + */ +#endif /* __APPLE_USE_RFC_3542 */ + +#define IPV6_BOUND_IF 125 /* int; set/get bound interface */ + +#ifdef PRIVATE +#define IPV6_NO_IFT_CELLULAR 6969 /* for internal use only */ +#define IPV6_OUT_IF 9696 /* for internal use only */ +#endif /* PRIVATE */ /* to define items, should talk with KAME guys first, for *BSD compatibility */ @@ -515,6 +653,21 @@ struct route_in6 { #define IPV6_DEFAULT_MULTICAST_HOPS 1 /* normally limit m'casts to 1 hop */ #define IPV6_DEFAULT_MULTICAST_LOOP 1 /* normally hear sends if a member */ +/* + * The im6o_membership vector for each socket is now dynamically allocated at + * run-time, bounded by USHRT_MAX, and is reallocated when needed, sized + * according to a power-of-two increment. + */ +#define IPV6_MIN_MEMBERSHIPS 31 +#define IPV6_MAX_MEMBERSHIPS 4095 + +/* + * Default resource limits for IPv6 multicast source filtering. + * These may be modified by sysctl. + */ +#define IPV6_MAX_GROUP_SRC_FILTER 512 /* sources per group */ +#define IPV6_MAX_SOCK_SRC_FILTER 128 /* sources per socket/group */ + /* * Argument structure for IPV6_JOIN_GROUP and IPV6_LEAVE_GROUP. 
*/ @@ -524,13 +677,21 @@ struct ipv6_mreq { }; /* - * IPV6_PKTINFO: Packet information(RFC2292 sec 5) + * IPV6_2292PKTINFO: Packet information(RFC2292 sec 5) */ struct in6_pktinfo { struct in6_addr ipi6_addr; /* src/dst IPv6 address */ unsigned int ipi6_ifindex; /* send/recv interface index */ }; +/* + * Control structure for IPV6_RECVPATHMTU socket option. + */ +struct ip6_mtuinfo { + struct sockaddr_in6 ip6m_addr; /* or sockaddr_storage? */ + uint32_t ip6m_mtu; +}; + /* * Argument for IPV6_PORTRANGE: * - which range to search when port is unspecified at bind() or connect() @@ -582,22 +743,27 @@ struct in6_pktinfo { #define IPV6CTL_RTMINEXPIRE 26 /* min value for expiration time */ #define IPV6CTL_RTMAXCACHE 27 /* trigger level for dynamic expire */ -#define IPV6CTL_USETEMPADDR 32 /* use temporary addresses (RFC3041) */ +#define IPV6CTL_USETEMPADDR 32 /* use temporary addresses [RFC 4941] */ #define IPV6CTL_TEMPPLTIME 33 /* preferred lifetime for tmpaddrs */ #define IPV6CTL_TEMPVLTIME 34 /* valid lifetime for tmpaddrs */ #define IPV6CTL_AUTO_LINKLOCAL 35 /* automatic link-local addr assign */ #define IPV6CTL_RIP6STATS 36 /* raw_ip6 stats */ +#define IPV6CTL_PREFER_TEMPADDR 37 /* prefer temporary addr as src */ +#define IPV6CTL_ADDRCTLPOLICY 38 /* get/set address selection policy */ +#define IPV6CTL_USE_DEFAULTZONE 39 /* use default scope zone */ -#define IPV6CTL_MAXFRAGS 41 /* max fragments */ +#define IPV6CTL_MAXFRAGS 41 /* max fragments */ +#define IPV6CTL_MCAST_PMTU 44 /* enable pMTU discovery for multicast? */ #define IPV6CTL_NEIGHBORGCTHRESH 46 #define IPV6CTL_MAXIFPREFIXES 47 #define IPV6CTL_MAXIFDEFROUTERS 48 #define IPV6CTL_MAXDYNROUTES 49 +#define ICMPV6CTL_ND6_ONLINKNSRFC4861 50 /* New entries should be added here from current IPV6CTL_MAXID value. 
*/ /* to define items, should talk with KAME guys first, for *BSD compatibility */ -#define IPV6CTL_MAXID 50 +#define IPV6CTL_MAXID 51 #ifdef KERNEL_PRIVATE #define CTL_IPV6PROTO_NAMES { \ @@ -651,7 +817,6 @@ struct in6_pktinfo { */ #define M_AUTHIPHDR M_PROTO2 #define M_DECRYPTED M_PROTO3 -#define M_LOOP M_PROTO4 #define M_AUTHIPDGM M_PROTO5 struct cmsghdr; @@ -676,12 +841,65 @@ extern void in6_sin_2_v4mapsin6(struct sockaddr_in *sin, struct sockaddr_in6 *sin6); extern void in6_sin6_2_sin_in_sock(struct sockaddr *nam); extern int in6_sin_2_v4mapsin6_in_sock(struct sockaddr **nam); +extern void in6_delayed_cksum(struct mbuf *, u_int16_t); #define satosin6(sa) ((struct sockaddr_in6 *)(sa)) #define sin6tosa(sin6) ((struct sockaddr *)(sin6)) #define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa)) extern int in6addr_local(struct in6_addr *); + +#define DEBUG_HWCKSUM 1 /* IPv6 Hardware checksum on/off */ +/* + * in6_cksum_phdr: + * + * Compute significant parts of the IPv6 checksum pseudo-header + * for use in a delayed TCP/UDP checksum calculation. + * + * Args: + * + * src Source IPv6 address + * dst Destination IPv6 address + * len htonl(proto-hdr-len) + * nxt htonl(next-proto-number) + * + * NOTE: We expect the src and dst addresses to be 16-bit + * aligned! 
+ */ +static __inline u_int16_t __unused +in6_cksum_phdr(const struct in6_addr *src, const struct in6_addr *dst, + u_int32_t len, u_int32_t nxt) +{ + u_int32_t sum = 0; + const u_int16_t *w; + + /*LINTED*/ + w = (const u_int16_t *) src; + sum += w[0]; + if (!IN6_IS_SCOPE_LINKLOCAL(src)) + sum += w[1]; + sum += w[2]; sum += w[3]; sum += w[4]; sum += w[5]; + sum += w[6]; sum += w[7]; + + /*LINTED*/ + w = (const u_int16_t *) dst; + sum += w[0]; + if (!IN6_IS_SCOPE_LINKLOCAL(dst)) + sum += w[1]; + sum += w[2]; sum += w[3]; sum += w[4]; sum += w[5]; + sum += w[6]; sum += w[7]; + + sum += (u_int16_t)(len >> 16) + (u_int16_t)(len /*& 0xffff*/); + + sum += (u_int16_t)(nxt >> 16) + (u_int16_t)(nxt /*& 0xffff*/); + + sum = (u_int16_t)(sum >> 16) + (u_int16_t)(sum /*& 0xffff*/); + + if (sum > 0xffff) + sum -= 0xffff; + + return (sum); +} #endif /* KERNEL_PRIVATE */ #ifndef KERNEL @@ -708,23 +926,24 @@ extern int inet6_rthdr_segments(const struct cmsghdr *); extern struct in6_addr *inet6_rthdr_getaddr(struct cmsghdr *, int); extern int inet6_rthdr_getflags(const struct cmsghdr *, int); -extern int inet6_opt_init(void *, size_t); -extern int inet6_opt_append(void *, size_t, int, __uint8_t, - size_t, __uint8_t, void **); -extern int inet6_opt_finish(void *, size_t, int); -extern int inet6_opt_set_val(void *, size_t, void *, int); - -extern int inet6_opt_next(void *, size_t, int, __uint8_t *, - size_t *, void **); -extern int inet6_opt_find(void *, size_t, int, __uint8_t, - size_t *, void **); -extern int inet6_opt_get_val(void *, size_t, void *, int); -extern size_t inet6_rth_space(int, int); -extern void *inet6_rth_init(void *, int, int, int); +extern int inet6_opt_init(void *, socklen_t); +extern int inet6_opt_append(void *, socklen_t, int, __uint8_t, + socklen_t, __uint8_t, void **); +extern int inet6_opt_finish(void *, socklen_t, int); +extern int inet6_opt_set_val(void *, int, void *, socklen_t); + +extern int inet6_opt_next(void *, socklen_t, int, __uint8_t *, + socklen_t 
*, void **); +extern int inet6_opt_find(void *, socklen_t, int, __uint8_t, + socklen_t *, void **); +extern int inet6_opt_get_val(void *, int, void *, socklen_t); +extern socklen_t inet6_rth_space(int, int); +extern void *inet6_rth_init(void *, socklen_t, int, int); extern int inet6_rth_add(void *, const struct in6_addr *); extern int inet6_rth_reverse(const void *, void *); extern int inet6_rth_segments(const void *); extern struct in6_addr *inet6_rth_getaddr(const void *, int); +extern void addrsel_policy_init(void); __END_DECLS #endif /* !KERNEL */ #endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ diff --git a/bsd/netinet6/in6_cksum.c b/bsd/netinet6/in6_cksum.c index d964493dc..f0352eb72 100644 --- a/bsd/netinet6/in6_cksum.c +++ b/bsd/netinet6/in6_cksum.c @@ -224,7 +224,7 @@ inet6_cksum(struct mbuf *m, unsigned int nxt, unsigned int off, * code and should be modified for each CPU to be as fast as possible. */ -#define ADDCARRY(x) (x > 65535 ? x -= 65535 : x) +#define ADDCARRY(x) do { if (x > 65535) { x -= 65535; } } while (0) #define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; ADDCARRY(sum);} /* diff --git a/bsd/netinet6/in6_gif.c b/bsd/netinet6/in6_gif.c index 332271e88..d620db95e 100644 --- a/bsd/netinet6/in6_gif.c +++ b/bsd/netinet6/in6_gif.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2009-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -192,11 +192,9 @@ in6_gif_output( m_freem(m); return ENETUNREACH; } - if (ifp->if_flags & IFF_LINK1) - ip_ecn_ingress(ECN_ALLOWED, &otos, &itos); - else - ip_ecn_ingress(ECN_NOCARE, &otos, &itos); - ip6->ip6_flow &= ~ntohl(0xff00000); + ip_ecn_ingress((ifp->if_flags & IFF_LINK1) ? 
ECN_ALLOWED : ECN_NOCARE, + &otos, &itos); + ip6->ip6_flow &= ~htonl(0xff << 20); ip6->ip6_flow |= htonl((u_int32_t)otos << 20); if (dst->sin6_family != sin6_dst->sin6_family || @@ -244,22 +242,19 @@ in6_gif_output( * it is too painful to ask for resend of inner packet, to achieve * path MTU discovery for encapsulated packets. */ - return(ip6_output(m, 0, &sc->gif_ro6, IPV6_MINMTU, 0, NULL, 0)); + return(ip6_output(m, 0, &sc->gif_ro6, IPV6_MINMTU, 0, NULL, NULL)); #else - return(ip6_output(m, 0, &sc->gif_ro6, 0, 0, NULL, 0)); + return(ip6_output(m, 0, &sc->gif_ro6, 0, 0, NULL, NULL)); #endif } -int in6_gif_input(mp, offp) - struct mbuf **mp; - int *offp; +int in6_gif_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct ifnet *gifp = NULL; struct ip6_hdr *ip6; int af = 0; u_int32_t otos; - u_int8_t proto; ip6 = mtod(m, struct ip6_hdr *); @@ -271,7 +266,6 @@ int in6_gif_input(mp, offp) return IPPROTO_DONE; } - proto = ip6->ip6_nxt; otos = ip6->ip6_flow; m_adj(m, *offp); @@ -360,9 +354,6 @@ gif_validate6( sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_addr = ip6->ip6_src; -#ifndef SCOPEDROUTING - sin6.sin6_scope_id = 0; /* XXX */ -#endif rt = rtalloc1((struct sockaddr *)&sin6, 0, 0); if (rt != NULL) diff --git a/bsd/netinet6/in6_gif.h b/bsd/netinet6/in6_gif.h index 8383c6b4e..8baafdd43 100644 --- a/bsd/netinet6/in6_gif.h +++ b/bsd/netinet6/in6_gif.h @@ -37,7 +37,7 @@ #ifdef KERNEL_PRIVATE #define GIF_HLIM 30 -int in6_gif_input(struct mbuf **, int *); +int in6_gif_input(struct mbuf **, int *, int); int in6_gif_output(struct ifnet *, int, struct mbuf *, struct rtentry *); int gif_encapcheck6(const struct mbuf *, int, int, void *); #endif /* KERNEL_PRIVATE */ diff --git a/bsd/netinet6/in6_ifattach.c b/bsd/netinet6/in6_ifattach.c index 5995b212d..10bf295f4 100644 --- a/bsd/netinet6/in6_ifattach.c +++ b/bsd/netinet6/in6_ifattach.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2008 Apple Inc. All rights reserved. 
+ * Copyright (c) 2003-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -79,6 +79,7 @@ #include #include #include +#include #include #include @@ -97,7 +98,6 @@ size_t in6_ifstatmax = 0; size_t icmp6_ifstatmax = 0; u_int32_t in6_maxmtu = 0; extern lck_mtx_t *nd6_mutex; -extern lck_mtx_t *inet6_domain_mutex; #if IP6_AUTO_LINKLOCAL int ip6_auto_linklocal = IP6_AUTO_LINKLOCAL; @@ -105,13 +105,14 @@ int ip6_auto_linklocal = IP6_AUTO_LINKLOCAL; int ip6_auto_linklocal = 1; /* enable by default */ #endif +int loopattach6_done = 0; + extern struct inpcbinfo udbinfo; extern struct inpcbinfo ripcbinfo; -extern lck_mtx_t *ip6_mutex; static int get_rand_ifid(struct ifnet *, struct in6_addr *); static int generate_tmp_ifid(u_int8_t *, const u_int8_t *, u_int8_t *); -static int get_hw_ifid(struct ifnet *, struct in6_addr *); +int in6_get_hw_ifid(struct ifnet *, struct in6_addr *); static int get_ifid(struct ifnet *, struct ifnet *, struct in6_addr *); static int in6_ifattach_linklocal(struct ifnet *, struct ifnet *, struct in6_aliasreq *); static int in6_ifattach_loopback(struct ifnet *); @@ -133,6 +134,8 @@ static int in6_ifattach_loopback(struct ifnet *); * The goal here is to get an interface identifier that is * (1) random enough and (2) does not change across reboot. * We currently use MD5(hostname) for it. + * + * in6 - upper 64bits are preserved */ static int get_rand_ifid( @@ -141,7 +144,7 @@ get_rand_ifid( { MD5_CTX ctxt; u_int8_t digest[16]; - int len = strlen(hostname); + int hostnlen = strlen(hostname); #if 0 /* we need at least several letters as seed for ifid */ @@ -152,7 +155,7 @@ get_rand_ifid( /* generate 8 bytes of pseudo-random value. 
*/ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); - MD5Update(&ctxt, hostname, len); + MD5Update(&ctxt, hostname, hostnlen); MD5Final(digest, &ctxt); /* assumes sizeof(digest) > sizeof(ifid) */ @@ -179,7 +182,7 @@ generate_tmp_ifid( u_int32_t val32; struct timeval tv; - /* If there's no hisotry, start with a random seed. */ + /* If there's no history, start with a random seed. */ bzero(nullbuf, sizeof(nullbuf)); if (bcmp(nullbuf, seed0, sizeof(nullbuf)) == 0) { int i; @@ -213,7 +216,7 @@ generate_tmp_ifid( MD5Final(digest, &ctxt); /* - * RFC 3041 3.2.1. (3) + * RFC 4941 3.2.1. (3) * Take the left-most 64-bits of the MD5 digest and set bit 6 (the * left-most bit is numbered 0) to zero. */ @@ -226,8 +229,8 @@ generate_tmp_ifid( * use a random non-zero value as the last resort. */ if (bcmp(nullbuf, ret, sizeof(nullbuf)) == 0) { - log(LOG_INFO, - "generate_tmp_ifid: computed MD5 value is zero.\n"); + nd6log((LOG_INFO, + "generate_tmp_ifid: computed MD5 value is zero.\n")); microtime(&tv); val32 = random() ^ tv.tv_usec; @@ -235,10 +238,10 @@ generate_tmp_ifid( } /* - * RFC 3041 3.2.1. (4) + * RFC 4941 3.2.1. (4) * Take the rightmost 64-bits of the MD5 digest and save them in * stable storage as the history value to be used in the next - * iteration of the algorithm. + * iteration of the algorithm. */ bcopy(&digest[8], seed0, 8); @@ -257,42 +260,35 @@ generate_tmp_ifid( /* * Get interface identifier for the specified interface. * XXX assumes single sockaddr_dl (AF_LINK address) per an interface + * + * in6 - upper 64bits are preserved */ -static int -get_hw_ifid( +int +in6_get_hw_ifid( struct ifnet *ifp, struct in6_addr *in6) /* upper 64bits are preserved */ { - struct ifaddr *ifa; + struct ifaddr *ifa = NULL; struct sockaddr_dl *sdl; u_int8_t *addr; size_t addrlen; static u_int8_t allzero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; static u_int8_t allone[8] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + int err = -1; /* Why doesn't this code use ifnet_addrs? 
*/ ifnet_lock_shared(ifp); - for (ifa = ifp->if_addrlist.tqh_first; - ifa; - ifa = ifa->ifa_list.tqe_next) - { - if (ifa->ifa_addr->sa_family != AF_LINK) - continue; - sdl = (struct sockaddr_dl *)ifa->ifa_addr; - if (sdl == NULL) - continue; - if (sdl->sdl_alen == 0) - continue; - - goto found; + ifa = ifp->if_lladdr; + sdl = (struct sockaddr_dl *)ifa->ifa_addr; + if (sdl->sdl_alen == 0) { + ifnet_lock_done(ifp); + return (-1); } + IFA_ADDREF(ifa); /* for this routine */ ifnet_lock_done(ifp); - return -1; - -found: - ifnet_lock_done(ifp); + IFA_LOCK(ifa); addr = (u_int8_t *) LLADDR(sdl); addrlen = sdl->sdl_alen; @@ -300,6 +296,7 @@ get_hw_ifid( switch (ifp->if_type) { case IFT_ETHER: case IFT_FDDI: + case IFT_ISO88025: case IFT_ATM: case IFT_IEEE1394: case IFT_L2VLAN: @@ -315,7 +312,7 @@ get_hw_ifid( /* look at IEEE802/EUI64 only */ if (addrlen != 8 && addrlen != 6) - return -1; + goto done; /* * check for invalid MAC address - on bsdi, we see it a lot @@ -323,9 +320,9 @@ get_hw_ifid( * card insertion. */ if (bcmp(addr, allzero, addrlen) == 0) - return -1; + goto done; if (bcmp(addr, allone, addrlen) == 0) - return -1; + goto done; /* make EUI64 address */ if (addrlen == 8) @@ -344,9 +341,9 @@ get_hw_ifid( case IFT_ARCNET: if (addrlen != 1) - return -1; + goto done; if (!addr[0]) - return -1; + goto done; bzero(&in6->s6_addr[8], 8); in6->s6_addr[15] = addr[0]; @@ -368,15 +365,18 @@ get_hw_ifid( * identifier source (can be renumbered). * we don't do this. 
*/ - return -1; + goto done; + + case IFT_CELLULAR: + goto done; default: - return -1; + goto done; } /* sanity check: g bit must not indicate "group" */ if (EUI64_GROUP(in6)) - return -1; + goto done; /* convert EUI64 into IPv6 interface identifier */ EUI64_TO_IFID(in6); @@ -387,16 +387,27 @@ get_hw_ifid( */ if ((in6->s6_addr[8] & ~(EUI64_GBIT | EUI64_UBIT)) == 0x00 && bcmp(&in6->s6_addr[9], allzero, 7) == 0) { - return -1; + goto done; } - return 0; + err = 0; /* found */ + +done: + /* This must not be the last reference to the lladdr */ + if (IFA_REMREF_LOCKED(ifa) == NULL) { + panic("%s: unexpected (missing) refcnt ifa=%p", __func__, ifa); + /* NOTREACHED */ + } + IFA_UNLOCK(ifa); + return (err); } /* * Get interface identifier for the specified interface. If it is not * available on ifp0, borrow interface identifier from other information * sources. + * + * altifp - secondary EUI64 source */ static int get_ifid( @@ -407,14 +418,14 @@ get_ifid( struct ifnet *ifp; /* first, try to get it from the interface itself */ - if (get_hw_ifid(ifp0, in6) == 0) { + if (in6_get_hw_ifid(ifp0, in6) == 0) { nd6log((LOG_DEBUG, "%s: got interface identifier from itself\n", if_name(ifp0))); goto success; } /* try secondary EUI64 source. 
this basically is for ATM PVC */ - if (altifp && get_hw_ifid(altifp, in6) == 0) { + if (altifp && in6_get_hw_ifid(altifp, in6) == 0) { nd6log((LOG_DEBUG, "%s: got interface identifier from %s\n", if_name(ifp0), if_name(altifp))); goto success; @@ -425,7 +436,7 @@ get_ifid( TAILQ_FOREACH(ifp, &ifnet_head, if_list) { if (ifp == ifp0) continue; - if (get_hw_ifid(ifp, in6) != 0) + if (in6_get_hw_ifid(ifp, in6) != 0) continue; /* @@ -488,18 +499,14 @@ in6_ifattach_linklocal( */ strncpy(ifra.ifra_name, if_name(ifp), sizeof(ifra.ifra_name)); - if (((ifp->if_type == IFT_PPP) || ((ifp->if_eflags & IFEF_NOAUTOIPV6LL) != 0)) && - ifra_passed != NULL) /* PPP provided both addresses for us */ + if ((ifp->if_eflags & IFEF_NOAUTOIPV6LL) != 0 && + ifra_passed != NULL) /* interface provided both addresses for us */ bcopy(&ifra_passed->ifra_addr, &(ifra.ifra_addr), sizeof(struct sockaddr_in6)); else { ifra.ifra_addr.sin6_family = AF_INET6; ifra.ifra_addr.sin6_len = sizeof(struct sockaddr_in6); ifra.ifra_addr.sin6_addr.s6_addr16[0] = htons(0xfe80); -#if SCOPEDROUTING - ifra.ifra_addr.sin6_addr.s6_addr16[1] = 0 -#else - ifra.ifra_addr.sin6_addr.s6_addr16[1] = htons(ifp->if_index); /* XXX */ -#endif + ifra.ifra_addr.sin6_addr.s6_addr16[1] = htons(ifp->if_index); ifra.ifra_addr.sin6_addr.s6_addr32[1] = 0; if ((ifp->if_flags & IFF_LOOPBACK) != 0) { ifra.ifra_addr.sin6_addr.s6_addr32[2] = 0; @@ -508,58 +515,42 @@ in6_ifattach_linklocal( if (get_ifid(ifp, altifp, &ifra.ifra_addr.sin6_addr) != 0) { nd6log((LOG_ERR, " %s: no ifid available\n", if_name(ifp))); - return -1; + return EADDRNOTAVAIL; } } -#if SCOPEDROUTING - ifra.ifra_addr.sin6_scope_id = - in6_addr2scopeid(ifp, &ifra.ifra_addr.sin6_addr); -#endif } + if (in6_setscope(&ifra.ifra_addr.sin6_addr, ifp, NULL)) + return (EADDRNOTAVAIL); + ifra.ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6); ifra.ifra_prefixmask.sin6_family = AF_INET6; ifra.ifra_prefixmask.sin6_addr = in6mask64; -#if SCOPEDROUTING - /* take into accound the 
sin6_scope_id field for routing */ - ifra.ifra_prefixmask.sin6_scope_id = 0xffffffff; -#endif /* link-local addresses should NEVER expire. */ ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME; ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME; - /* - * Do not let in6_update_ifa() do DAD, since we need a random delay - * before sending an NS at the first time the interface becomes up. - * Instead, in6_if_up() will start DAD with a proper random delay. - */ - ifra.ifra_flags |= IN6_IFF_NODAD; - /* * Now call in6_update_ifa() to do a bunch of procedures to configure - * a link-local address. We can set NULL to the 3rd argument, because + * a link-local address. We can set the 3rd argument to NULL, because * we know there's no other link-local address on the interface * and therefore we are adding one (instead of updating one). */ - if ((error = in6_update_ifa(ifp, &ifra, NULL, M_WAITOK)) != 0) { + if ((error = in6_update_ifa(ifp, &ifra, NULL, + IN6_IFAUPDATE_DADDELAY, M_WAITOK)) != 0) { /* * XXX: When the interface does not support IPv6, this call * would fail in the SIOCSIFADDR ioctl. I believe the * notification is rather confusing in this case, so just - * supress it. (jinmei@kame.net 20010130) + * suppress it. (jinmei@kame.net 20010130) */ if (error != EAFNOSUPPORT) - log(LOG_NOTICE, "in6_ifattach_linklocal: failed to " + nd6log((LOG_NOTICE, "in6_ifattach_linklocal: failed to " "configure a link-local address on %s " "(errno=%d)\n", - if_name(ifp), error); - return(-1); + if_name(ifp), error)); + return (EADDRNOTAVAIL); } - /* - * Adjust ia6_flags so that in6_if_up will perform DAD. - * XXX: Some P2P interfaces seem not to send packets just after - * becoming up, so we skip p2p interfaces for safety. 
- */ ia = in6ifa_ifpforlinklocal(ifp, 0); /* ia must not be NULL */ #if DIAGNOSTIC if (!ia) { @@ -567,19 +558,15 @@ in6_ifattach_linklocal( /*NOTREACHED*/ } #endif - if (in6if_do_dad(ifp) && (ifp->if_flags & IFF_POINTOPOINT) == 0) { - ia->ia6_flags &= ~IN6_IFF_NODAD; - ia->ia6_flags |= IN6_IFF_TENTATIVE; - } - /* - * Make the link-local prefix (fe80::/64%link) as on-link. + * Make the link-local prefix (fe80::%link/64) as on-link. * Since we'd like to manage prefixes separately from addresses, * we make an ND6 prefix structure for the link-local prefix, * and add it to the prefix list as a never-expire prefix. * XXX: this change might affect some existing code base... */ bzero(&pr0, sizeof(pr0)); + lck_mtx_init(&pr0.ndpr_lock, ifa_mtx_grp, ifa_mtx_attr); pr0.ndpr_ifp = ifp; /* this should be 64 at this moment. */ pr0.ndpr_plen = in6_mask2len(&ifra.ifra_prefixmask.sin6_addr, NULL); @@ -598,6 +585,7 @@ in6_ifattach_linklocal( pr0.ndpr_raf_auto = 1; /* probably meaningless */ pr0.ndpr_vltime = ND6_INFINITE_LIFETIME; pr0.ndpr_pltime = ND6_INFINITE_LIFETIME; + pr0.ndpr_stateflags |= NDPRF_STATIC; /* * Since there is no other link-local addresses, nd6_prefix_lookup() * probably returns NULL. However, we cannot always expect the result. @@ -606,21 +594,23 @@ in6_ifattach_linklocal( * valid with referring to the old link-local address. 
*/ if ((pr = nd6_prefix_lookup(&pr0)) == NULL) { - if ((error = nd6_prelist_add(&pr0, NULL, &pr)) != 0) { - printf("in6_ifattach_linklocal: nd6_prelist_add failed %d\n", error); - ifafree(&ia->ia_ifa); + if ((error = nd6_prelist_add(&pr0, NULL, &pr, TRUE)) != 0) { + IFA_REMREF(&ia->ia_ifa); + lck_mtx_destroy(&pr0.ndpr_lock, ifa_mtx_grp); return(error); } } if (ia != NULL) { in6_post_msg(ifp, KEV_INET6_NEW_LL_ADDR, ia); - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); } /* Drop use count held above during lookup/add */ if (pr != NULL) - ndpr_rele(pr, FALSE); + NDPR_REMREF(pr); + + lck_mtx_destroy(&pr0.ndpr_lock, ifa_mtx_grp); return 0; } @@ -670,11 +660,11 @@ in6_ifattach_loopback( * We are sure that this is a newly assigned address, so we can set * NULL to the 3rd arg. */ - if ((error = in6_update_ifa(ifp, &ifra, NULL, M_WAITOK)) != 0) { - log(LOG_ERR, "in6_ifattach_loopback: failed to configure " + if ((error = in6_update_ifa(ifp, &ifra, NULL, 0, M_WAITOK)) != 0) { + nd6log((LOG_ERR, "in6_ifattach_loopback: failed to configure " "the loopback address on %s (errno=%d)\n", - if_name(ifp), error); - return(-1); + if_name(ifp), error)); + return (EADDRNOTAVAIL); } return 0; @@ -724,76 +714,33 @@ in6_nigroup( MD5Final(digest, &ctxt); bzero(in6, sizeof(*in6)); - in6->s6_addr16[0] = htons(0xff02); - if (ifp) - in6->s6_addr16[1] = htons(ifp->if_index); + in6->s6_addr16[0] = IPV6_ADDR_INT16_MLL; in6->s6_addr8[11] = 2; bcopy(digest, &in6->s6_addr32[3], sizeof(in6->s6_addr32[3])); + if (in6_setscope(in6, ifp, NULL)) + return (-1); /* XXX: should not fail */ return 0; } -void -in6_nigroup_attach( - const char *name, - int namelen) +int +in6_domifattach(struct ifnet *ifp) { - struct ifnet *ifp; - struct sockaddr_in6 mltaddr; - struct in6_multi *in6m; - int error; - - bzero(&mltaddr, sizeof(mltaddr)); - mltaddr.sin6_family = AF_INET6; - mltaddr.sin6_len = sizeof(struct sockaddr_in6); - if (in6_nigroup(NULL, name, namelen, &mltaddr.sin6_addr) != 0) - return; + int error = 0; - 
ifnet_head_lock_shared(); - TAILQ_FOREACH(ifp, &ifnet_head, if_list) { - mltaddr.sin6_addr.s6_addr16[1] = htons(ifp->if_index); - ifnet_lock_shared(ifp); - IN6_LOOKUP_MULTI(mltaddr.sin6_addr, ifp, in6m); - ifnet_lock_done(ifp); - if (!in6m) { - if (!in6_addmulti(&mltaddr.sin6_addr, ifp, &error, 0)) { - nd6log((LOG_ERR, "%s: failed to join %s " - "(errno=%d)\n", if_name(ifp), - ip6_sprintf(&mltaddr.sin6_addr), - error)); - } - } + if ((error = proto_plumb(PF_INET6, ifp))) { + if (error != EEXIST) + log(LOG_ERR, "%s: proto_plumb returned %d if=%s%d\n", + __func__, error, ifp->if_name, ifp->if_unit); + } else { + nd6_ifattach(ifp); + scope6_ifattach(ifp); } - ifnet_head_done(); -} - -void -in6_nigroup_detach( - const char *name, - int namelen) -{ - struct ifnet *ifp; - struct sockaddr_in6 mltaddr; - struct in6_multi *in6m; - - bzero(&mltaddr, sizeof(mltaddr)); - mltaddr.sin6_family = AF_INET6; - mltaddr.sin6_len = sizeof(struct sockaddr_in6); - if (in6_nigroup(NULL, name, namelen, &mltaddr.sin6_addr) != 0) - return; - ifnet_head_lock_shared(); - TAILQ_FOREACH(ifp, &ifnet_head, if_list) { - mltaddr.sin6_addr.s6_addr16[1] = htons(ifp->if_index); - ifnet_lock_shared(ifp); - IN6_LOOKUP_MULTI(mltaddr.sin6_addr, ifp, in6m); - ifnet_lock_done(ifp); - if (in6m) - in6_delmulti(in6m, 0); - } - ifnet_head_done(); + return (error); } + /* * XXX multiple loopback interface needs more care. for instance, * nodelocal address needs to be configured onto only one of them. @@ -807,8 +754,10 @@ in6_ifattach( { static size_t if_indexlim = 8; struct in6_ifaddr *ia; + struct in6_addr in6; int error; + lck_rw_lock_exclusive(&in6_ifs_rwlock); /* * We have some arrays that should be indexed by if_index. * since if_index will grow dynamically, they should grow too. 
@@ -821,7 +770,6 @@ in6_ifattach( if_indexlim <<= 1; } - lck_mtx_lock(ip6_mutex); /* grow in6_ifstat */ if (in6_ifstatmax < if_indexlim) { size_t n; @@ -830,7 +778,7 @@ in6_ifattach( n = if_indexlim * sizeof(struct in6_ifstat *); q = (caddr_t)_MALLOC(n, M_IFADDR, M_WAITOK); if (q == NULL) { - lck_mtx_unlock(ip6_mutex); + lck_rw_done(&in6_ifs_rwlock); return ENOBUFS; } bzero(q, n); @@ -847,17 +795,14 @@ in6_ifattach( in6_ifstat[ifp->if_index] = (struct in6_ifstat *) _MALLOC(sizeof(struct in6_ifstat), M_IFADDR, M_WAITOK); if (in6_ifstat[ifp->if_index] == NULL) { - lck_mtx_unlock(ip6_mutex); + lck_rw_done(&in6_ifs_rwlock); return ENOBUFS; } bzero(in6_ifstat[ifp->if_index], sizeof(struct in6_ifstat)); } - lck_mtx_unlock(ip6_mutex); + lck_rw_done(&in6_ifs_rwlock); - /* grow icmp6_ifstat, use inet6_domain_mutex as that is used in - * icmp6 routines - */ - lck_mtx_lock(inet6_domain_mutex); + lck_rw_lock_exclusive(&icmp6_ifs_rwlock); if (icmp6_ifstatmax < if_indexlim) { size_t n; caddr_t q; @@ -865,7 +810,7 @@ in6_ifattach( n = if_indexlim * sizeof(struct icmp6_ifstat *); q = (caddr_t)_MALLOC(n, M_IFADDR, M_WAITOK); if (q == NULL) { - lck_mtx_unlock(inet6_domain_mutex); + lck_rw_done(&icmp6_ifs_rwlock); return ENOBUFS; } bzero(q, n); @@ -882,12 +827,12 @@ in6_ifattach( icmp6_ifstat[ifp->if_index] = (struct icmp6_ifstat *) _MALLOC(sizeof(struct icmp6_ifstat), M_IFADDR, M_WAITOK); if (icmp6_ifstat[ifp->if_index] == NULL) { - lck_mtx_unlock(inet6_domain_mutex); + lck_rw_done(&icmp6_ifs_rwlock); return ENOBUFS; } bzero(icmp6_ifstat[ifp->if_index], sizeof(struct icmp6_ifstat)); } - lck_mtx_unlock(inet6_domain_mutex); + lck_rw_done(&icmp6_ifs_rwlock); /* initialize NDP variables */ if ((error = nd6_ifattach(ifp)) != 0) @@ -919,9 +864,9 @@ in6_ifattach( * usually, we require multicast capability to the interface */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { - log(LOG_INFO, "in6_ifattach: " - "%s is not multicast capable, IPv6 not enabled\n", - if_name(ifp)); + nd6log((LOG_INFO, 
"in6_ifattach: ", + "%s is not multicast capable, IPv6 not enabled\n", + if_name(ifp))); return EINVAL; } @@ -930,12 +875,23 @@ in6_ifattach( * XXX multiple loopback interface case. */ if ((ifp->if_flags & IFF_LOOPBACK) != 0) { - if (in6_ifattach_loopback(ifp) != 0) - printf("in6_ifattach: in6_ifattach_loopback failed\n"); + struct in6_ifaddr *ia6 = NULL; + if (!OSCompareAndSwap(0, 1, (UInt32 *)&loopattach6_done)) { + in6 = in6addr_loopback; + if ((ia6 = in6ifa_ifpwithaddr(ifp, &in6)) == NULL) { + if (in6_ifattach_loopback(ifp) != 0) { + OSCompareAndSwap(1, 0, (UInt32 *)&loopattach6_done); + return EINVAL; + } + } + else { + IFA_REMREF(&ia6->ia_ifa); + } + } } /* - * assign a link-local address, if there's none. + * assign a link-local address, if there's none. */ if (ip6_auto_linklocal) { ia = in6ifa_ifpforlinklocal(ifp, 0); @@ -943,13 +899,13 @@ in6_ifattach( if (in6_ifattach_linklocal(ifp, altifp, ifra) == 0) { /* linklocal address assigned */ } else { - log(LOG_INFO, "in6_ifattach: %s failed to " + nd6log((LOG_INFO, "in6_ifattach: %s failed to " "attach a linklocal address.\n", - if_name(ifp)); + if_name(ifp))); /* failed to assign linklocal address. bark? */ } } else { - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); } } @@ -966,83 +922,147 @@ in6_ifattach( /* * NOTE: in6_ifdetach() does not support loopback if at this moment. - * We don't need this function in bsdi, because interfaces are never removed - * from the ifnet list in bsdi. */ void -in6_ifdetach( - struct ifnet *ifp) +in6_ifdetach(struct ifnet *ifp) { - struct in6_ifaddr *ia, *oia, *nia; - struct ifaddr *ifa, *next; + struct in6_ifaddr *ia, *oia; + struct ifaddr *ifa; struct rtentry *rt; struct sockaddr_in6 sin6; + struct in6_multi_mship *imm; + int unlinked; - /* nuke prefix list. 
this may try to remove some of ifaddrs as well */ - in6_purgeprefix(ifp); + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_NOTOWNED); /* remove neighbor management table */ nd6_purge(ifp); /* nuke any of IPv6 addresses we have */ - - lck_mtx_lock(nd6_mutex); - for (ia = in6_ifaddrs; ia != NULL; ia = nia) { - nia = ia->ia_next; - if (ia->ia_ifa.ifa_ifp != ifp) + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); + ia = in6_ifaddrs; + while (ia != NULL) { + if (ia->ia_ifa.ifa_ifp != ifp) { + ia = ia->ia_next; continue; - in6_purgeaddr(&ia->ia_ifa, 1); + } + IFA_ADDREF(&ia->ia_ifa); /* for us */ + lck_rw_done(&in6_ifaddr_rwlock); + in6_purgeaddr(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); /* for us */ + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); + /* + * Purging the address caused in6_ifaddr_rwlock + * to be dropped and reacquired; + * therefore search again from the beginning + * of in6_ifaddrs list. + */ + ia = in6_ifaddrs; } - lck_mtx_unlock(nd6_mutex); + lck_rw_done(&in6_ifaddr_rwlock); ifnet_lock_exclusive(ifp); /* undo everything done by in6_ifattach(), just in case */ - for (ifa = ifp->if_addrlist.tqh_first; ifa; ifa = next) - { - next = ifa->ifa_list.tqe_next; - - - if (ifa->ifa_addr->sa_family != AF_INET6 - || !IN6_IS_ADDR_LINKLOCAL(&satosin6(&ifa->ifa_addr)->sin6_addr)) { + ifa = TAILQ_FIRST(&ifp->if_addrlist); + while (ifa != NULL) { + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6 || + !IN6_IS_ADDR_LINKLOCAL(&satosin6(&ifa->ifa_addr)-> + sin6_addr)) { + IFA_UNLOCK(ifa); + ifa = TAILQ_NEXT(ifa, ifa_list); continue; } ia = (struct in6_ifaddr *)ifa; - /* remove from the routing table */ - if ((ia->ia_flags & IFA_ROUTE) && - (rt = rtalloc1((struct sockaddr *)&ia->ia_addr, 0, 0))) { - (void) rtrequest(RTM_DELETE, - (struct sockaddr *)&ia->ia_addr, - (struct sockaddr *)&ia->ia_addr, - (struct sockaddr *)&ia->ia_prefixmask, - rt->rt_flags, (struct rtentry **)0); - rtfree(rt); + /* hold a reference for this routine */ + IFA_ADDREF_LOCKED(ifa); + /* remove from the 
linked list */ + if_detach_ifa(ifp, ifa); + IFA_UNLOCK(ifa); + + /* + * Leaving the multicast group(s) may involve freeing the + * link address multicast structure(s) for the interface, + * which is protected by ifnet lock. To avoid violating + * lock ordering, we must drop ifnet lock before doing so. + * The ifa won't go away since we held a refcnt above. + */ + ifnet_lock_done(ifp); + + /* + * We have to do this work manually here instead of calling + * in6_purgeaddr() since in6_purgeaddr() uses the RTM_HOST flag. + */ + + /* + * leave from multicast groups we have joined for the interface + */ + IFA_LOCK(ifa); + while ((imm = ia->ia6_memberships.lh_first) != NULL) { + LIST_REMOVE(imm, i6mm_chain); + IFA_UNLOCK(ifa); + in6_leavegroup(imm); + IFA_LOCK(ifa); } - /* remove from the linked list */ - if_detach_ifa(ifp, &ia->ia_ifa); + /* remove from the routing table */ + if (ia->ia_flags & IFA_ROUTE) { + IFA_UNLOCK(ifa); + rt = rtalloc1((struct sockaddr *)&ia->ia_addr, 0, 0); + if (rt != NULL) { + (void) rtrequest(RTM_DELETE, + (struct sockaddr *)&ia->ia_addr, + (struct sockaddr *)&ia->ia_addr, + (struct sockaddr *)&ia->ia_prefixmask, + rt->rt_flags, (struct rtentry **)0); + rtfree(rt); + } + } else { + IFA_UNLOCK(ifa); + } /* also remove from the IPv6 address chain(itojun&jinmei) */ + unlinked = 1; oia = ia; - lck_mtx_lock(nd6_mutex); - if (oia == (ia = in6_ifaddrs)) + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); + if (oia == (ia = in6_ifaddrs)) { in6_ifaddrs = ia->ia_next; - else { + } else { while (ia->ia_next && (ia->ia_next != oia)) ia = ia->ia_next; - if (ia->ia_next) + if (ia->ia_next) { ia->ia_next = oia->ia_next; - else { - nd6log((LOG_ERR, + } else { + nd6log((LOG_ERR, "%s: didn't unlink in6ifaddr from " "list\n", if_name(ifp))); + unlinked = 0; } } - lck_mtx_unlock(nd6_mutex); + lck_rw_done(&in6_ifaddr_rwlock); + + ifa = &oia->ia_ifa; + /* + * release another refcnt for the link from in6_ifaddrs. 
+ * Do this only if it's not already unlinked in the event + * that we lost the race, since in6_ifaddr_rwlock was momentarily + * dropped above. + */ + if (unlinked) + IFA_REMREF(ifa); + /* release reference held for this routine */ + IFA_REMREF(ifa); - ifafree(&oia->ia_ifa); + /* + * This is suboptimal, but since we dropped ifnet lock above + * the list might have changed. Repeat the search from the + * beginning until we find the first eligible IPv6 address. + */ + ifnet_lock_exclusive(ifp); + ifa = TAILQ_FIRST(&ifp->if_addrlist); } ifnet_lock_done(ifp); @@ -1128,7 +1148,7 @@ in6_tmpaddrtimer( bzero(nullbuf, sizeof(nullbuf)); for (i = 1; i < nd_ifinfo_indexlim + 1; i++) { ndi = &nd_ifinfo[i]; - if (ndi->flags != ND6_IFF_PERFORMNUD) + if ((ndi->flags | ND6_IFF_PERFORMNUD) != ND6_IFF_PERFORMNUD) continue; if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) != 0) { /* diff --git a/bsd/netinet6/in6_ifattach.h b/bsd/netinet6/in6_ifattach.h index 7fa627f2d..40ffa0379 100644 --- a/bsd/netinet6/in6_ifattach.h +++ b/bsd/netinet6/in6_ifattach.h @@ -1,3 +1,30 @@ +/* + * Copyright (c) 2003-2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ /* $KAME: in6_ifattach.h,v 1.4 2000/02/22 14:04:18 itojun Exp $ */ /* @@ -34,13 +61,12 @@ #include #ifdef KERNEL_PRIVATE -void in6_nigroup_attach(const char *, int); -void in6_nigroup_detach(const char *, int); -int in6_ifattach(struct ifnet *, struct ifnet *, struct in6_aliasreq *); -void in6_ifdetach(struct ifnet *); -void in6_get_tmpifid(struct ifnet *, u_int8_t *, const u_int8_t *, int); -void in6_tmpaddrtimer(void *); -int in6_nigroup(struct ifnet *, const char *, int, struct in6_addr *); +extern int in6_domifattach(struct ifnet *); +extern int in6_ifattach(struct ifnet *, struct ifnet *, struct in6_aliasreq *); +extern void in6_ifdetach(struct ifnet *); +extern void in6_get_tmpifid(struct ifnet *, u_int8_t *, const u_int8_t *, int); +extern void in6_tmpaddrtimer(void *); +extern int in6_nigroup(struct ifnet *, const char *, int, struct in6_addr *); #endif /* KERNEL_PRIVATE */ #endif /* _NETINET6_IN6_IFATTACH_H_ */ diff --git a/bsd/netinet6/in6_mcast.c b/bsd/netinet6/in6_mcast.c new file mode 100644 index 000000000..05670d211 --- /dev/null +++ b/bsd/netinet6/in6_mcast.c @@ -0,0 +1,3490 @@ +/* + * Copyright (c) 2010-2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). 
You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * Copyright (c) 2009 Bruce Simpson. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * IPv6 multicast socket, group, and socket option processing module. + * Normative references: RFC 2292, RFC 3493, RFC 3542, RFC 3678, RFC 3810. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef __SOCKUNION_DECLARED +union sockunion { + struct sockaddr_storage ss; + struct sockaddr sa; + struct sockaddr_dl sdl; + struct sockaddr_in6 sin6; +}; +typedef union sockunion sockunion_t; +#define __SOCKUNION_DECLARED +#endif /* __SOCKUNION_DECLARED */ + +static void im6f_commit(struct in6_mfilter *); +static int im6f_get_source(struct in6_mfilter *imf, + const struct sockaddr_in6 *psin, + struct in6_msource **); +static struct in6_msource * + im6f_graft(struct in6_mfilter *, const uint8_t, + const struct sockaddr_in6 *); +static int im6f_prune(struct in6_mfilter *, const struct sockaddr_in6 *); +static void im6f_rollback(struct in6_mfilter *); +static void im6f_reap(struct in6_mfilter *); +static int 
im6o_grow(struct ip6_moptions *, size_t); +static size_t im6o_match_group(const struct ip6_moptions *, + const struct ifnet *, const struct sockaddr *); +static struct in6_msource * + im6o_match_source(const struct ip6_moptions *, const size_t, + const struct sockaddr *); +static void im6s_merge(struct ip6_msource *ims, + const struct in6_msource *lims, const int rollback); +static int in6_mc_get(struct ifnet *, const struct in6_addr *, + struct in6_multi **); +static int in6m_get_source(struct in6_multi *inm, + const struct in6_addr *addr, const int noalloc, + struct ip6_msource **pims); +static int in6m_is_ifp_detached(const struct in6_multi *); +static int in6m_merge(struct in6_multi *, /*const*/ struct in6_mfilter *); +static void in6m_reap(struct in6_multi *); +static struct ip6_moptions * + in6p_findmoptions(struct inpcb *); +static int in6p_get_source_filters(struct inpcb *, struct sockopt *); +static int in6p_lookup_v4addr(struct ipv6_mreq *, struct ip_mreq *); +static int in6p_join_group(struct inpcb *, struct sockopt *); +static int in6p_leave_group(struct inpcb *, struct sockopt *); +static struct ifnet * + in6p_lookup_mcast_ifp(const struct inpcb *, + const struct sockaddr_in6 *); +static int in6p_block_unblock_source(struct inpcb *, struct sockopt *); +static int in6p_set_multicast_if(struct inpcb *, struct sockopt *); +static int in6p_set_source_filters(struct inpcb *, struct sockopt *); +static int sysctl_ip6_mcast_filters SYSCTL_HANDLER_ARGS; +static __inline__ int ip6_msource_cmp(const struct ip6_msource *, + const struct ip6_msource *); + +SYSCTL_DECL(_net_inet6_ip6); /* XXX Not in any common header. 
*/ + +SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, mcast, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "IPv6 multicast"); + +static unsigned long in6_mcast_maxgrpsrc = IPV6_MAX_GROUP_SRC_FILTER; +SYSCTL_LONG(_net_inet6_ip6_mcast, OID_AUTO, maxgrpsrc, + CTLFLAG_RW | CTLFLAG_LOCKED, &in6_mcast_maxgrpsrc, + "Max source filters per group"); + +static unsigned long in6_mcast_maxsocksrc = IPV6_MAX_SOCK_SRC_FILTER; +SYSCTL_LONG(_net_inet6_ip6_mcast, OID_AUTO, maxsocksrc, + CTLFLAG_RW | CTLFLAG_LOCKED, &in6_mcast_maxsocksrc, + "Max source filters per socket"); + +int in6_mcast_loop = IPV6_DEFAULT_MULTICAST_LOOP; +SYSCTL_INT(_net_inet6_ip6_mcast, OID_AUTO, loop, CTLFLAG_RW | CTLFLAG_LOCKED, + &in6_mcast_loop, 0, "Loopback multicast datagrams by default"); + +SYSCTL_NODE(_net_inet6_ip6_mcast, OID_AUTO, filters, + CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_ip6_mcast_filters, + "Per-interface stack-wide source filters"); + +RB_GENERATE_PREV(ip6_msource_tree, ip6_msource, im6s_link, ip6_msource_cmp); + +#define IN6M_TRACE_HIST_SIZE 32 /* size of trace history */ + +/* For gdb */ +__private_extern__ unsigned int in6m_trace_hist_size = IN6M_TRACE_HIST_SIZE; + +struct in6_multi_dbg { + struct in6_multi in6m; /* in6_multi */ + u_int16_t in6m_refhold_cnt; /* # of ref */ + u_int16_t in6m_refrele_cnt; /* # of rele */ + /* + * Circular lists of in6m_addref and in6m_remref callers. 
+ */ + ctrace_t in6m_refhold[IN6M_TRACE_HIST_SIZE]; + ctrace_t in6m_refrele[IN6M_TRACE_HIST_SIZE]; + /* + * Trash list linkage + */ + TAILQ_ENTRY(in6_multi_dbg) in6m_trash_link; +}; + +/* List of trash in6_multi entries protected by in6m_trash_lock */ +static TAILQ_HEAD(, in6_multi_dbg) in6m_trash_head; +static decl_lck_mtx_data(, in6m_trash_lock); + +#if DEBUG +static unsigned int in6m_debug = 1; /* debugging (enabled) */ +#else +static unsigned int in6m_debug; /* debugging (disabled) */ +#endif /* !DEBUG */ +static unsigned int in6m_size; /* size of zone element */ +static struct zone *in6m_zone; /* zone for in6_multi */ + +#define IN6M_ZONE_MAX 64 /* maximum elements in zone */ +#define IN6M_ZONE_NAME "in6_multi" /* zone name */ + +static unsigned int imm_size; /* size of zone element */ +static struct zone *imm_zone; /* zone for in6_multi_mship */ + +#define IMM_ZONE_MAX 64 /* maximum elements in zone */ +#define IMM_ZONE_NAME "in6_multi_mship" /* zone name */ + +#define IP6MS_ZONE_MAX 64 /* maximum elements in zone */ +#define IP6MS_ZONE_NAME "ip6_msource" /* zone name */ + +static unsigned int ip6ms_size; /* size of zone element */ +static struct zone *ip6ms_zone; /* zone for ip6_msource */ + +#define IN6MS_ZONE_MAX 64 /* maximum elements in zone */ +#define IN6MS_ZONE_NAME "in6_msource" /* zone name */ + +static unsigned int in6ms_size; /* size of zone element */ +static struct zone *in6ms_zone; /* zone for in6_msource */ + +/* Lock group and attribute for in6_multihead_lock lock */ +static lck_attr_t *in6_multihead_lock_attr; +static lck_grp_t *in6_multihead_lock_grp; +static lck_grp_attr_t *in6_multihead_lock_grp_attr; + +static decl_lck_rw_data(, in6_multihead_lock); +struct in6_multihead in6_multihead; + +static struct in6_multi *in6_multi_alloc(int); +static void in6_multi_free(struct in6_multi *); +static void in6_multi_attach(struct in6_multi *); +static struct in6_multi_mship *in6_multi_mship_alloc(int); +static void in6_multi_mship_free(struct 
in6_multi_mship *); +static void in6m_trace(struct in6_multi *, int); + +static struct ip6_msource *ip6ms_alloc(int); +static void ip6ms_free(struct ip6_msource *); +static struct in6_msource *in6ms_alloc(int); +static void in6ms_free(struct in6_msource *); + +#define IM6O_CAST_TO_NONCONST(x) ((struct ip6_moptions *)(void *)(uintptr_t)x) +#define IN6M_CAST_TO_NONCONST(x) ((struct in6_multi *)(void *)(uintptr_t)x) + +/* + * IPv6 source tree comparison function. + * + * An ordered predicate is necessary; bcmp() is not documented to return + * an indication of order, memcmp() is, and is an ISO C99 requirement. + */ +static __inline int +ip6_msource_cmp(const struct ip6_msource *a, const struct ip6_msource *b) +{ + return (memcmp(&a->im6s_addr, &b->im6s_addr, sizeof(struct in6_addr))); +} + +/* + * Inline function which wraps assertions for a valid ifp. + */ +static __inline__ int +in6m_is_ifp_detached(const struct in6_multi *inm) +{ + VERIFY(inm->in6m_ifma != NULL); + VERIFY(inm->in6m_ifp == inm->in6m_ifma->ifma_ifp); + + return (!ifnet_is_attached(inm->in6m_ifp, 0)); +} + +/* + * Initialize an in6_mfilter structure to a known state at t0, t1 + * with an empty source filter list. + */ +static __inline__ void +im6f_init(struct in6_mfilter *imf, const int st0, const int st1) +{ + memset(imf, 0, sizeof(struct in6_mfilter)); + RB_INIT(&imf->im6f_sources); + imf->im6f_st[0] = st0; + imf->im6f_st[1] = st1; +} + +/* + * Resize the ip6_moptions vector to the next power-of-two minus 1. 
+ */ +static int +im6o_grow(struct ip6_moptions *imo, size_t newmax) +{ + struct in6_multi **nmships; + struct in6_multi **omships; + struct in6_mfilter *nmfilters; + struct in6_mfilter *omfilters; + size_t idx; + size_t oldmax; + + IM6O_LOCK_ASSERT_HELD(imo); + + nmships = NULL; + nmfilters = NULL; + omships = imo->im6o_membership; + omfilters = imo->im6o_mfilters; + oldmax = imo->im6o_max_memberships; + if (newmax == 0) + newmax = ((oldmax + 1) * 2) - 1; + + if (newmax > IPV6_MAX_MEMBERSHIPS) + return (ETOOMANYREFS); + + if ((nmships = (struct in6_multi **)_REALLOC(omships, + sizeof (struct in6_multi *) * newmax, M_IP6MOPTS, + M_WAITOK | M_ZERO)) == NULL) + return (ENOMEM); + + imo->im6o_membership = nmships; + + if ((nmfilters = (struct in6_mfilter *)_REALLOC(omfilters, + sizeof (struct in6_mfilter) * newmax, M_IN6MFILTER, + M_WAITOK | M_ZERO)) == NULL) + return (ENOMEM); + + imo->im6o_mfilters = nmfilters; + + /* Initialize newly allocated source filter heads. */ + for (idx = oldmax; idx < newmax; idx++) + im6f_init(&nmfilters[idx], MCAST_UNDEFINED, MCAST_EXCLUDE); + + imo->im6o_max_memberships = newmax; + + return (0); +} + +/* + * Find an IPv6 multicast group entry for this ip6_moptions instance + * which matches the specified group, and optionally an interface. + * Return its index into the array, or -1 if not found. + */ +static size_t +im6o_match_group(const struct ip6_moptions *imo, const struct ifnet *ifp, + const struct sockaddr *group) +{ + const struct sockaddr_in6 *gsin6; + struct in6_multi *pinm; + int idx; + int nmships; + + IM6O_LOCK_ASSERT_HELD(IM6O_CAST_TO_NONCONST(imo)); + + gsin6 = (const struct sockaddr_in6 *)group; + + /* The im6o_membership array may be lazy allocated. 
*/ + if (imo->im6o_membership == NULL || imo->im6o_num_memberships == 0) + return (-1); + + nmships = imo->im6o_num_memberships; + for (idx = 0; idx < nmships; idx++) { + pinm = imo->im6o_membership[idx]; + if (pinm == NULL) + continue; + IN6M_LOCK(pinm); + if ((ifp == NULL || (pinm->in6m_ifp == ifp)) && + IN6_ARE_ADDR_EQUAL(&pinm->in6m_addr, + &gsin6->sin6_addr)) { + IN6M_UNLOCK(pinm); + break; + } + IN6M_UNLOCK(pinm); + } + if (idx >= nmships) + idx = -1; + + return (idx); +} + +/* + * Find an IPv6 multicast source entry for this imo which matches + * the given group index for this socket, and source address. + * + * XXX TODO: The scope ID, if present in src, is stripped before + * any comparison. We SHOULD enforce scope/zone checks where the source + * filter entry has a link scope. + * + * NOTE: This does not check if the entry is in-mode, merely if + * it exists, which may not be the desired behaviour. + */ +static struct in6_msource * +im6o_match_source(const struct ip6_moptions *imo, const size_t gidx, + const struct sockaddr *src) +{ + struct ip6_msource find; + struct in6_mfilter *imf; + struct ip6_msource *ims; + const sockunion_t *psa; + + IM6O_LOCK_ASSERT_HELD(IM6O_CAST_TO_NONCONST(imo)); + + VERIFY(src->sa_family == AF_INET6); + VERIFY(gidx != (size_t)-1 && gidx < imo->im6o_num_memberships); + + /* The im6o_mfilters array may be lazy allocated. */ + if (imo->im6o_mfilters == NULL) + return (NULL); + imf = &imo->im6o_mfilters[gidx]; + + psa = (const sockunion_t *)src; + find.im6s_addr = psa->sin6.sin6_addr; + in6_clearscope(&find.im6s_addr); /* XXX */ + ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find); + + return ((struct in6_msource *)ims); +} + +/* + * Perform filtering for multicast datagrams on a socket by group and source. + * + * Returns 0 if a datagram should be allowed through, or various error codes + * if the socket was not a member of the group, or the source was muted, etc. 
+ */ +int +im6o_mc_filter(const struct ip6_moptions *imo, const struct ifnet *ifp, + const struct sockaddr *group, const struct sockaddr *src) +{ + size_t gidx; + struct in6_msource *ims; + int mode; + + IM6O_LOCK_ASSERT_HELD(IM6O_CAST_TO_NONCONST(imo)); + VERIFY(ifp != NULL); + + gidx = im6o_match_group(imo, ifp, group); + if (gidx == (size_t)-1) + return (MCAST_NOTGMEMBER); + + /* + * Check if the source was included in an (S,G) join. + * Allow reception on exclusive memberships by default, + * reject reception on inclusive memberships by default. + * Exclude source only if an in-mode exclude filter exists. + * Include source only if an in-mode include filter exists. + * NOTE: We are comparing group state here at MLD t1 (now) + * with socket-layer t0 (since last downcall). + */ + mode = imo->im6o_mfilters[gidx].im6f_st[1]; + ims = im6o_match_source(imo, gidx, src); + + if ((ims == NULL && mode == MCAST_INCLUDE) || + (ims != NULL && ims->im6sl_st[0] != mode)) + return (MCAST_NOTSMEMBER); + + return (MCAST_PASS); +} + +/* + * Find and return a reference to an in6_multi record for (ifp, group), + * and bump its reference count. + * If one does not exist, try to allocate it, and update link-layer multicast + * filters on ifp to listen for group. + * Assumes the IN6_MULTI lock is held across the call. + * Return 0 if successful, otherwise return an appropriate error code. + */ +static int +in6_mc_get(struct ifnet *ifp, const struct in6_addr *group, + struct in6_multi **pinm) +{ + struct sockaddr_in6 gsin6; + struct ifmultiaddr *ifma; + struct in6_multi *inm; + int error; + + *pinm = NULL; + + in6_multihead_lock_shared(); + IN6_LOOKUP_MULTI(group, ifp, inm); + if (inm != NULL) { + IN6M_LOCK(inm); + VERIFY(inm->in6m_reqcnt >= 1); + inm->in6m_reqcnt++; + VERIFY(inm->in6m_reqcnt != 0); + *pinm = inm; + IN6M_UNLOCK(inm); + in6_multihead_lock_done(); + /* + * We already joined this group; return the in6m + * with a refcount held (via lookup) for caller. 
+ */ + return (0); + } + in6_multihead_lock_done(); + + memset(&gsin6, 0, sizeof(gsin6)); + gsin6.sin6_family = AF_INET6; + gsin6.sin6_len = sizeof(struct sockaddr_in6); + gsin6.sin6_addr = *group; + + /* + * Check if a link-layer group is already associated + * with this network-layer group on the given ifnet. + */ + error = if_addmulti(ifp, (struct sockaddr *)&gsin6, &ifma); + if (error != 0) + return (error); + + /* + * See comments in in6m_remref() for access to ifma_protospec. + */ + in6_multihead_lock_exclusive(); + IFMA_LOCK(ifma); + if ((inm = ifma->ifma_protospec) != NULL) { + VERIFY(ifma->ifma_addr != NULL); + VERIFY(ifma->ifma_addr->sa_family == AF_INET6); + IN6M_ADDREF(inm); /* for caller */ + IFMA_UNLOCK(ifma); + IN6M_LOCK(inm); + VERIFY(inm->in6m_ifma == ifma); + VERIFY(inm->in6m_ifp == ifp); + VERIFY(IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, group)); + if (inm->in6m_debug & IFD_ATTACHED) { + VERIFY(inm->in6m_reqcnt >= 1); + inm->in6m_reqcnt++; + VERIFY(inm->in6m_reqcnt != 0); + *pinm = inm; + IN6M_UNLOCK(inm); + in6_multihead_lock_done(); + IFMA_REMREF(ifma); + /* + * We lost the race with another thread doing + * in6_mc_get(); since this group has already + * been joined; return the inm with a refcount + * held for caller. + */ + return (0); + } + /* + * We lost the race with another thread doing in6_delmulti(); + * the inm referring to the ifma has been detached, thus we + * reattach it back to the in6_multihead list, and return the + * inm with a refcount held for the caller. + */ + in6_multi_attach(inm); + VERIFY((inm->in6m_debug & + (IFD_ATTACHED | IFD_TRASHED)) == IFD_ATTACHED); + *pinm = inm; + IN6M_UNLOCK(inm); + in6_multihead_lock_done(); + IFMA_REMREF(ifma); + return (0); + } + IFMA_UNLOCK(ifma); + + /* + * A new in6_multi record is needed; allocate and initialize it. + * We DO NOT perform an MLD join as the in6_ layer may need to + * push an initial source list down to MLD to support SSM. 
+ * + * The initial source filter state is INCLUDE, {} as per the RFC. + * Pending state-changes per group are subject to a bounds check. + */ + inm = in6_multi_alloc(M_WAITOK); + if (inm == NULL) { + in6_multihead_lock_done(); + IFMA_REMREF(ifma); + return (ENOMEM); + } + IN6M_LOCK(inm); + inm->in6m_addr = *group; + inm->in6m_ifp = ifp; + inm->in6m_mli = MLD_IFINFO(ifp); + VERIFY(inm->in6m_mli != NULL); + MLI_ADDREF(inm->in6m_mli); + inm->in6m_ifma = ifma; /* keep refcount from if_addmulti() */ + inm->in6m_state = MLD_NOT_MEMBER; + /* + * Pending state-changes per group are subject to a bounds check. + */ + inm->in6m_scq.ifq_maxlen = MLD_MAX_STATE_CHANGES; + inm->in6m_st[0].iss_fmode = MCAST_UNDEFINED; + inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED; + RB_INIT(&inm->in6m_srcs); + *pinm = inm; + in6_multi_attach(inm); + VERIFY((inm->in6m_debug & + (IFD_ATTACHED | IFD_TRASHED)) == IFD_ATTACHED); + IN6M_ADDREF_LOCKED(inm); /* for caller */ + IN6M_UNLOCK(inm); + + IFMA_LOCK(ifma); + VERIFY(ifma->ifma_protospec == NULL); + ifma->ifma_protospec = inm; + IFMA_UNLOCK(ifma); + in6_multihead_lock_done(); + + return (0); +} + +/* + * Clear recorded source entries for a group. + * Used by the MLD code. Caller must hold the IN6_MULTI lock. + * FIXME: Should reap. + */ +void +in6m_clear_recorded(struct in6_multi *inm) +{ + struct ip6_msource *ims; + + IN6M_LOCK_ASSERT_HELD(inm); + + RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) { + if (ims->im6s_stp) { + ims->im6s_stp = 0; + --inm->in6m_st[1].iss_rec; + } + } + VERIFY(inm->in6m_st[1].iss_rec == 0); +} + +/* + * Record a source as pending for a Source-Group MLDv2 query. + * This lives here as it modifies the shared tree. + * + * inm is the group descriptor. + * naddr is the address of the source to record in network-byte order. + * + * If the net.inet6.mld.sgalloc sysctl is non-zero, we will + * lazy-allocate a source node in response to an SG query. + * Otherwise, no allocation is performed. 
This saves some memory + * with the trade-off that the source will not be reported to the + * router if joined in the window between the query response and + * the group actually being joined on the local host. + * + * VIMAGE: XXX: Currently the mld_sgalloc feature has been removed. + * This turns off the allocation of a recorded source entry if + * the group has not been joined. + * + * Return 0 if the source didn't exist or was already marked as recorded. + * Return 1 if the source was marked as recorded by this function. + * Return <0 if any error occured (negated errno code). + */ +int +in6m_record_source(struct in6_multi *inm, const struct in6_addr *addr) +{ + struct ip6_msource find; + struct ip6_msource *ims, *nims; + + IN6M_LOCK_ASSERT_HELD(inm); + + find.im6s_addr = *addr; + ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find); + if (ims && ims->im6s_stp) + return (0); + if (ims == NULL) { + if (inm->in6m_nsrc == in6_mcast_maxgrpsrc) + return (-ENOSPC); + nims = ip6ms_alloc(M_WAITOK); + if (nims == NULL) + return (-ENOMEM); + nims->im6s_addr = find.im6s_addr; + RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims); + ++inm->in6m_nsrc; + ims = nims; + } + + /* + * Mark the source as recorded and update the recorded + * source count. + */ + ++ims->im6s_stp; + ++inm->in6m_st[1].iss_rec; + + return (1); +} + +/* + * Return a pointer to an in6_msource owned by an in6_mfilter, + * given its source address. + * Lazy-allocate if needed. If this is a new entry its filter state is + * undefined at t0. + * + * imf is the filter set being modified. + * addr is the source address. + * + * Caller is expected to be holding im6o_lock. 
+ */ +static int +im6f_get_source(struct in6_mfilter *imf, const struct sockaddr_in6 *psin, + struct in6_msource **plims) +{ + struct ip6_msource find; + struct ip6_msource *ims; + struct in6_msource *lims; + int error; + + error = 0; + ims = NULL; + lims = NULL; + + find.im6s_addr = psin->sin6_addr; + ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find); + lims = (struct in6_msource *)ims; + if (lims == NULL) { + if (imf->im6f_nsrc == in6_mcast_maxsocksrc) + return (ENOSPC); + lims = in6ms_alloc(M_WAITOK); + if (lims == NULL) + return (ENOMEM); + lims->im6s_addr = find.im6s_addr; + lims->im6sl_st[0] = MCAST_UNDEFINED; + RB_INSERT(ip6_msource_tree, &imf->im6f_sources, + (struct ip6_msource *)lims); + ++imf->im6f_nsrc; + } + + *plims = lims; + + return (error); +} + +/* + * Graft a source entry into an existing socket-layer filter set, + * maintaining any required invariants and checking allocations. + * + * The source is marked as being in the new filter mode at t1. + * + * Return the pointer to the new node, otherwise return NULL. + * + * Caller is expected to be holding im6o_lock. + */ +static struct in6_msource * +im6f_graft(struct in6_mfilter *imf, const uint8_t st1, + const struct sockaddr_in6 *psin) +{ + struct in6_msource *lims; + + lims = in6ms_alloc(M_WAITOK); + if (lims == NULL) + return (NULL); + lims->im6s_addr = psin->sin6_addr; + lims->im6sl_st[0] = MCAST_UNDEFINED; + lims->im6sl_st[1] = st1; + RB_INSERT(ip6_msource_tree, &imf->im6f_sources, + (struct ip6_msource *)lims); + ++imf->im6f_nsrc; + + return (lims); +} + +/* + * Prune a source entry from an existing socket-layer filter set, + * maintaining any required invariants and checking allocations. + * + * The source is marked as being left at t1, it is not freed. + * + * Return 0 if no error occurred, otherwise return an errno value. + * + * Caller is expected to be holding im6o_lock. 
+ */ +static int +im6f_prune(struct in6_mfilter *imf, const struct sockaddr_in6 *psin) +{ + struct ip6_msource find; + struct ip6_msource *ims; + struct in6_msource *lims; + + find.im6s_addr = psin->sin6_addr; + ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find); + if (ims == NULL) + return (ENOENT); + lims = (struct in6_msource *)ims; + lims->im6sl_st[1] = MCAST_UNDEFINED; + return (0); +} + +/* + * Revert socket-layer filter set deltas at t1 to t0 state. + * + * Caller is expected to be holding im6o_lock. + */ +static void +im6f_rollback(struct in6_mfilter *imf) +{ + struct ip6_msource *ims, *tims; + struct in6_msource *lims; + + RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) { + lims = (struct in6_msource *)ims; + if (lims->im6sl_st[0] == lims->im6sl_st[1]) { + /* no change at t1 */ + continue; + } else if (lims->im6sl_st[0] != MCAST_UNDEFINED) { + /* revert change to existing source at t1 */ + lims->im6sl_st[1] = lims->im6sl_st[0]; + } else { + /* revert source added t1 */ + MLD_PRINTF(("%s: free in6ms %p\n", __func__, lims)); + RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims); + in6ms_free(lims); + imf->im6f_nsrc--; + } + } + imf->im6f_st[1] = imf->im6f_st[0]; +} + +/* + * Mark socket-layer filter set as INCLUDE {} at t1. + * + * Caller is expected to be holding im6o_lock. + */ +void +im6f_leave(struct in6_mfilter *imf) +{ + struct ip6_msource *ims; + struct in6_msource *lims; + + RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) { + lims = (struct in6_msource *)ims; + lims->im6sl_st[1] = MCAST_UNDEFINED; + } + imf->im6f_st[1] = MCAST_INCLUDE; +} + +/* + * Mark socket-layer filter set deltas as committed. + * + * Caller is expected to be holding im6o_lock. 
+ */ +static void +im6f_commit(struct in6_mfilter *imf) +{ + struct ip6_msource *ims; + struct in6_msource *lims; + + RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) { + lims = (struct in6_msource *)ims; + lims->im6sl_st[0] = lims->im6sl_st[1]; + } + imf->im6f_st[0] = imf->im6f_st[1]; +} + +/* + * Reap unreferenced sources from socket-layer filter set. + * + * Caller is expected to be holding im6o_lock. + */ +static void +im6f_reap(struct in6_mfilter *imf) +{ + struct ip6_msource *ims, *tims; + struct in6_msource *lims; + + RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) { + lims = (struct in6_msource *)ims; + if ((lims->im6sl_st[0] == MCAST_UNDEFINED) && + (lims->im6sl_st[1] == MCAST_UNDEFINED)) { + MLD_PRINTF(("%s: free in6ms %p\n", __func__, lims)); + RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims); + in6ms_free(lims); + imf->im6f_nsrc--; + } + } +} + +/* + * Purge socket-layer filter set. + * + * Caller is expected to be holding im6o_lock. + */ +void +im6f_purge(struct in6_mfilter *imf) +{ + struct ip6_msource *ims, *tims; + struct in6_msource *lims; + + RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) { + lims = (struct in6_msource *)ims; + MLD_PRINTF(("%s: free in6ms %p\n", __func__, lims)); + RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims); + in6ms_free(lims); + imf->im6f_nsrc--; + } + imf->im6f_st[0] = imf->im6f_st[1] = MCAST_UNDEFINED; + VERIFY(RB_EMPTY(&imf->im6f_sources)); +} + +/* + * Look up a source filter entry for a multicast group. + * + * inm is the group descriptor to work with. + * addr is the IPv6 address to look up. + * noalloc may be non-zero to suppress allocation of sources. + * *pims will be set to the address of the retrieved or allocated source. + * + * Return 0 if successful, otherwise return a non-zero error code. 
+ */ +static int +in6m_get_source(struct in6_multi *inm, const struct in6_addr *addr, + const int noalloc, struct ip6_msource **pims) +{ + struct ip6_msource find; + struct ip6_msource *ims, *nims; + + IN6M_LOCK_ASSERT_HELD(inm); + + find.im6s_addr = *addr; + ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find); + if (ims == NULL && !noalloc) { + if (inm->in6m_nsrc == in6_mcast_maxgrpsrc) + return (ENOSPC); + nims = ip6ms_alloc(M_WAITOK); + if (nims == NULL) + return (ENOMEM); + nims->im6s_addr = *addr; + RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims); + ++inm->in6m_nsrc; + ims = nims; + MLD_PRINTF(("%s: allocated %s as %p\n", __func__, + ip6_sprintf(addr), ims)); + } + + *pims = ims; + return (0); +} + +/* + * Helper function to derive the filter mode on a source entry + * from its internal counters. Predicates are: + * A source is only excluded if all listeners exclude it. + * A source is only included if no listeners exclude it, + * and at least one listener includes it. + * May be used by ifmcstat(8). + */ +uint8_t +im6s_get_mode(const struct in6_multi *inm, const struct ip6_msource *ims, + uint8_t t) +{ + IN6M_LOCK_ASSERT_HELD(IN6M_CAST_TO_NONCONST(inm)); + + t = !!t; + if (inm->in6m_st[t].iss_ex > 0 && + inm->in6m_st[t].iss_ex == ims->im6s_st[t].ex) + return (MCAST_EXCLUDE); + else if (ims->im6s_st[t].in > 0 && ims->im6s_st[t].ex == 0) + return (MCAST_INCLUDE); + return (MCAST_UNDEFINED); +} + +/* + * Merge socket-layer source into MLD-layer source. + * If rollback is non-zero, perform the inverse of the merge. + */ +static void +im6s_merge(struct ip6_msource *ims, const struct in6_msource *lims, + const int rollback) +{ + int n = rollback ? 
-1 : 1; + + if (lims->im6sl_st[0] == MCAST_EXCLUDE) { + MLD_PRINTF(("%s: t1 ex -= %d on %s\n", __func__, n, + ip6_sprintf(&lims->im6s_addr))); + ims->im6s_st[1].ex -= n; + } else if (lims->im6sl_st[0] == MCAST_INCLUDE) { + MLD_PRINTF(("%s: t1 in -= %d on %s\n", __func__, n, + ip6_sprintf(&lims->im6s_addr))); + ims->im6s_st[1].in -= n; + } + + if (lims->im6sl_st[1] == MCAST_EXCLUDE) { + MLD_PRINTF(("%s: t1 ex += %d on %s\n", __func__, n, + ip6_sprintf(&lims->im6s_addr))); + ims->im6s_st[1].ex += n; + } else if (lims->im6sl_st[1] == MCAST_INCLUDE) { + MLD_PRINTF(("%s: t1 in += %d on %s\n", __func__, n, + ip6_sprintf(&lims->im6s_addr))); + ims->im6s_st[1].in += n; + } +} + +/* + * Atomically update the global in6_multi state, when a membership's + * filter list is being updated in any way. + * + * imf is the per-inpcb-membership group filter pointer. + * A fake imf may be passed for in-kernel consumers. + * + * XXX This is a candidate for a set-symmetric-difference style loop + * which would eliminate the repeated lookup from root of ims nodes, + * as they share the same key space. + * + * If any error occurred this function will back out of refcounts + * and return a non-zero value. + */ +static int +in6m_merge(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf) +{ + struct ip6_msource *ims, *nims; + struct in6_msource *lims; + int schanged, error; + int nsrc0, nsrc1; + + IN6M_LOCK_ASSERT_HELD(inm); + + schanged = 0; + error = 0; + nsrc1 = nsrc0 = 0; + + /* + * Update the source filters first, as this may fail. + * Maintain count of in-mode filters at t0, t1. These are + * used to work out if we transition into ASM mode or not. + * Maintain a count of source filters whose state was + * actually modified by this operation. 
+ */ + RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) { + lims = (struct in6_msource *)ims; + if (lims->im6sl_st[0] == imf->im6f_st[0]) nsrc0++; + if (lims->im6sl_st[1] == imf->im6f_st[1]) nsrc1++; + if (lims->im6sl_st[0] == lims->im6sl_st[1]) continue; + error = in6m_get_source(inm, &lims->im6s_addr, 0, &nims); + ++schanged; + if (error) + break; + im6s_merge(nims, lims, 0); + } + if (error) { + struct ip6_msource *bims; + + RB_FOREACH_REVERSE_FROM(ims, ip6_msource_tree, nims) { + lims = (struct in6_msource *)ims; + if (lims->im6sl_st[0] == lims->im6sl_st[1]) + continue; + (void) in6m_get_source(inm, &lims->im6s_addr, 1, &bims); + if (bims == NULL) + continue; + im6s_merge(bims, lims, 1); + } + goto out_reap; + } + + MLD_PRINTF(("%s: imf filters in-mode: %d at t0, %d at t1\n", + __func__, nsrc0, nsrc1)); + + /* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */ + if (imf->im6f_st[0] == imf->im6f_st[1] && + imf->im6f_st[1] == MCAST_INCLUDE) { + if (nsrc1 == 0) { + MLD_PRINTF(("%s: --in on inm at t1\n", __func__)); + --inm->in6m_st[1].iss_in; + } + } + + /* Handle filter mode transition on socket. */ + if (imf->im6f_st[0] != imf->im6f_st[1]) { + MLD_PRINTF(("%s: imf transition %d to %d\n", + __func__, imf->im6f_st[0], imf->im6f_st[1])); + + if (imf->im6f_st[0] == MCAST_EXCLUDE) { + MLD_PRINTF(("%s: --ex on inm at t1\n", __func__)); + --inm->in6m_st[1].iss_ex; + } else if (imf->im6f_st[0] == MCAST_INCLUDE) { + MLD_PRINTF(("%s: --in on inm at t1\n", __func__)); + --inm->in6m_st[1].iss_in; + } + + if (imf->im6f_st[1] == MCAST_EXCLUDE) { + MLD_PRINTF(("%s: ex++ on inm at t1\n", __func__)); + inm->in6m_st[1].iss_ex++; + } else if (imf->im6f_st[1] == MCAST_INCLUDE && nsrc1 > 0) { + MLD_PRINTF(("%s: in++ on inm at t1\n", __func__)); + inm->in6m_st[1].iss_in++; + } + } + + /* + * Track inm filter state in terms of listener counts. + * If there are any exclusive listeners, stack-wide + * membership is exclusive. 
+ * Otherwise, if only inclusive listeners, stack-wide is inclusive. + * If no listeners remain, state is undefined at t1, + * and the MLD lifecycle for this group should finish. + */ + if (inm->in6m_st[1].iss_ex > 0) { + MLD_PRINTF(("%s: transition to EX\n", __func__)); + inm->in6m_st[1].iss_fmode = MCAST_EXCLUDE; + } else if (inm->in6m_st[1].iss_in > 0) { + MLD_PRINTF(("%s: transition to IN\n", __func__)); + inm->in6m_st[1].iss_fmode = MCAST_INCLUDE; + } else { + MLD_PRINTF(("%s: transition to UNDEF\n", __func__)); + inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED; + } + + /* Decrement ASM listener count on transition out of ASM mode. */ + if (imf->im6f_st[0] == MCAST_EXCLUDE && nsrc0 == 0) { + if ((imf->im6f_st[1] != MCAST_EXCLUDE) || + (imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) { + MLD_PRINTF(("%s: --asm on inm at t1\n", __func__)); + --inm->in6m_st[1].iss_asm; + } + } + + /* Increment ASM listener count on transition to ASM mode. */ + if (imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 == 0) { + MLD_PRINTF(("%s: asm++ on inm at t1\n", __func__)); + inm->in6m_st[1].iss_asm++; + } + + MLD_PRINTF(("%s: merged imf %p to inm %p\n", __func__, imf, inm)); + in6m_print(inm); + +out_reap: + if (schanged > 0) { + MLD_PRINTF(("%s: sources changed; reaping\n", __func__)); + in6m_reap(inm); + } + return (error); +} + +/* + * Mark an in6_multi's filter set deltas as committed. + * Called by MLD after a state change has been enqueued. + */ +void +in6m_commit(struct in6_multi *inm) +{ + struct ip6_msource *ims; + + IN6M_LOCK_ASSERT_HELD(inm); + + MLD_PRINTF(("%s: commit inm %p\n", __func__, inm)); + MLD_PRINTF(("%s: pre commit:\n", __func__)); + in6m_print(inm); + + RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) { + ims->im6s_st[0] = ims->im6s_st[1]; + } + inm->in6m_st[0] = inm->in6m_st[1]; +} + +/* + * Reap unreferenced nodes from an in6_multi's filter set. 
+ */ +static void +in6m_reap(struct in6_multi *inm) +{ + struct ip6_msource *ims, *tims; + + IN6M_LOCK_ASSERT_HELD(inm); + + RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, tims) { + if (ims->im6s_st[0].ex > 0 || ims->im6s_st[0].in > 0 || + ims->im6s_st[1].ex > 0 || ims->im6s_st[1].in > 0 || + ims->im6s_stp != 0) + continue; + MLD_PRINTF(("%s: free ims %p\n", __func__, ims)); + RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, ims); + ip6ms_free(ims); + inm->in6m_nsrc--; + } +} + +/* + * Purge all source nodes from an in6_multi's filter set. + */ +void +in6m_purge(struct in6_multi *inm) +{ + struct ip6_msource *ims, *tims; + + IN6M_LOCK_ASSERT_HELD(inm); + + RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, tims) { + MLD_PRINTF(("%s: free ims %p\n", __func__, ims)); + RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, ims); + ip6ms_free(ims); + inm->in6m_nsrc--; + } +} + +/* + * Join a multicast address w/o sources. + * KAME compatibility entry point. + * + */ +struct in6_multi_mship * +in6_joingroup(struct ifnet *ifp, struct in6_addr *mcaddr, + int *errorp, int delay) +{ + struct in6_multi_mship *imm; + int error; + + *errorp = 0; + + imm = in6_multi_mship_alloc(M_WAITOK); + if (imm == NULL) { + *errorp = ENOBUFS; + return (NULL); + } + + delay = (delay * PR_SLOWHZ) / hz; + + error = in6_mc_join(ifp, mcaddr, NULL, &imm->i6mm_maddr, delay); + if (error) { + *errorp = error; + in6_multi_mship_free(imm); + return (NULL); + } + + return (imm); +} + +/* + * Leave a multicast address w/o sources. + * KAME compatibility entry point. + */ +int +in6_leavegroup(struct in6_multi_mship *imm) +{ + if (imm->i6mm_maddr != NULL) { + in6_mc_leave(imm->i6mm_maddr, NULL); + IN6M_REMREF(imm->i6mm_maddr); + imm->i6mm_maddr = NULL; + } + in6_multi_mship_free(imm); + return 0; +} + +/* + * Join a multicast group; real entry point. + * + * Only preserves atomicity at inm level. + * NOTE: imf argument cannot be const due to sys/tree.h limitations. 
+ * + * If the MLD downcall fails, the group is not joined, and an error + * code is returned. + */ +int +in6_mc_join(struct ifnet *ifp, const struct in6_addr *mcaddr, + /*const*/ struct in6_mfilter *imf, struct in6_multi **pinm, + const int delay) +{ + struct in6_mfilter timf; + struct in6_multi *inm = NULL; + int error = 0; + + /* + * Sanity: Check scope zone ID was set for ifp, if and + * only if group is scoped to an interface. + */ + VERIFY(IN6_IS_ADDR_MULTICAST(mcaddr)); + if (IN6_IS_ADDR_MC_LINKLOCAL(mcaddr) || + IN6_IS_ADDR_MC_INTFACELOCAL(mcaddr)) { + VERIFY(mcaddr->s6_addr16[1] != 0); + } + + MLD_PRINTF(("%s: join %s on %p(%s%d))\n", __func__, + ip6_sprintf(mcaddr), ifp, ifp->if_name, ifp->if_unit)); + + *pinm = NULL; + + /* + * If no imf was specified (i.e. kernel consumer), + * fake one up and assume it is an ASM join. + */ + if (imf == NULL) { + im6f_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE); + imf = &timf; + } + + error = in6_mc_get(ifp, mcaddr, &inm); + if (error) { + MLD_PRINTF(("%s: in6_mc_get() failure\n", __func__)); + return (error); + } + + MLD_PRINTF(("%s: merge inm state\n", __func__)); + + IN6M_LOCK(inm); + error = in6m_merge(inm, imf); + if (error) { + MLD_PRINTF(("%s: failed to merge inm state\n", __func__)); + goto out_in6m_release; + } + + MLD_PRINTF(("%s: doing mld downcall\n", __func__)); + error = mld_change_state(inm, delay); + if (error) { + MLD_PRINTF(("%s: failed to update source\n", __func__)); + goto out_in6m_release; + } + +out_in6m_release: + if (error) { + MLD_PRINTF(("%s: dropping ref on %p\n", __func__, inm)); + IN6M_UNLOCK(inm); + IN6M_REMREF(inm); + } else { + IN6M_UNLOCK(inm); + *pinm = inm; /* keep refcount from in6_mc_get() */ + } + + return (error); +} + +/* + * Leave a multicast group; real entry point. + * All source filters will be expunged. + * + * Only preserves atomicity at inm level. + * + * Holding the write lock for the INP which contains imf + * is highly advisable. 
We can't assert for it as imf does not + * contain a back-pointer to the owning inp. + * + * Note: This is not the same as in6m_release(*) as this function also + * makes a state change downcall into MLD. + */ +int +in6_mc_leave(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf) +{ + struct in6_mfilter timf; + int error, lastref; + + error = 0; + + IN6M_LOCK_ASSERT_NOTHELD(inm); + + in6_multihead_lock_exclusive(); + IN6M_LOCK(inm); + + MLD_PRINTF(("%s: leave inm %p, %s/%s%d, imf %p\n", __func__, + inm, ip6_sprintf(&inm->in6m_addr), + (in6m_is_ifp_detached(inm) ? "null" : inm->in6m_ifp->if_name), + inm->in6m_ifp->if_unit, imf)); + + /* + * If no imf was specified (i.e. kernel consumer), + * fake one up and assume it is an ASM join. + */ + if (imf == NULL) { + im6f_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED); + imf = &timf; + } + + /* + * Begin state merge transaction at MLD layer. + * + * As this particular invocation should not cause any memory + * to be allocated, and there is no opportunity to roll back + * the transaction, it MUST NOT fail. + */ + MLD_PRINTF(("%s: merge inm state\n", __func__)); + + error = in6m_merge(inm, imf); + KASSERT(error == 0, ("%s: failed to merge inm state\n", __func__)); + + MLD_PRINTF(("%s: doing mld downcall\n", __func__)); + error = mld_change_state(inm, 0); +#if MLD_DEBUG + if (error) + MLD_PRINTF(("%s: failed mld downcall\n", __func__)); +#endif + lastref = in6_multi_detach(inm); + VERIFY(!lastref || (!(inm->in6m_debug & IFD_ATTACHED) && + inm->in6m_reqcnt == 0)); + IN6M_UNLOCK(inm); + in6_multihead_lock_done(); + + if (lastref) + IN6M_REMREF(inm); /* for in6_multihead list */ + + return (error); +} + +/* + * Block or unblock an ASM multicast source on an inpcb. + * This implements the delta-based API described in RFC 3678. + * + * The delta-based API applies only to exclusive-mode memberships. + * An MLD downcall will be performed. + * + * Return 0 if successful, otherwise return an appropriate error code. 
+ */ +static int +in6p_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) +{ + struct group_source_req gsr; + sockunion_t *gsa, *ssa; + struct ifnet *ifp; + struct in6_mfilter *imf; + struct ip6_moptions *imo; + struct in6_msource *ims; + struct in6_multi *inm; + size_t idx; + uint16_t fmode; + int error, doblock; + + ifp = NULL; + error = 0; + doblock = 0; + + memset(&gsr, 0, sizeof(struct group_source_req)); + gsa = (sockunion_t *)&gsr.gsr_group; + ssa = (sockunion_t *)&gsr.gsr_source; + + switch (sopt->sopt_name) { + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + error = sooptcopyin(sopt, &gsr, + sizeof(struct group_source_req), + sizeof(struct group_source_req)); + if (error) + return (error); + + if (gsa->sin6.sin6_family != AF_INET6 || + gsa->sin6.sin6_len != sizeof(struct sockaddr_in6)) + return (EINVAL); + + if (ssa->sin6.sin6_family != AF_INET6 || + ssa->sin6.sin6_len != sizeof(struct sockaddr_in6)) + return (EINVAL); + + ifnet_head_lock_shared(); + if (gsr.gsr_interface == 0 || + (u_int)if_index < gsr.gsr_interface) { + ifnet_head_done(); + return (EADDRNOTAVAIL); + } + + ifp = ifindex2ifnet[gsr.gsr_interface]; + ifnet_head_done(); + + if (ifp == NULL) + return (EADDRNOTAVAIL); + + if (sopt->sopt_name == MCAST_BLOCK_SOURCE) + doblock = 1; + break; + + default: + MLD_PRINTF(("%s: unknown sopt_name %d\n", + __func__, sopt->sopt_name)); + return (EOPNOTSUPP); + break; + } + + if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) + return (EINVAL); + + (void) in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); + + /* + * Check if we are actually a member of this group. 
+ */ + imo = in6p_findmoptions(inp); + if (imo == NULL) + return (ENOMEM); + + IM6O_LOCK(imo); + idx = im6o_match_group(imo, ifp, &gsa->sa); + if (idx == (size_t)-1 || imo->im6o_mfilters == NULL) { + error = EADDRNOTAVAIL; + goto out_imo_locked; + } + + VERIFY(imo->im6o_mfilters != NULL); + imf = &imo->im6o_mfilters[idx]; + inm = imo->im6o_membership[idx]; + + /* + * Attempting to use the delta-based API on an + * non exclusive-mode membership is an error. + */ + fmode = imf->im6f_st[0]; + if (fmode != MCAST_EXCLUDE) { + error = EINVAL; + goto out_imo_locked; + } + + /* + * Deal with error cases up-front: + * Asked to block, but already blocked; or + * Asked to unblock, but nothing to unblock. + * If adding a new block entry, allocate it. + */ + ims = im6o_match_source(imo, idx, &ssa->sa); + if ((ims != NULL && doblock) || (ims == NULL && !doblock)) { + MLD_PRINTF(("%s: source %s %spresent\n", __func__, + ip6_sprintf(&ssa->sin6.sin6_addr), + doblock ? "" : "not ")); + error = EADDRNOTAVAIL; + goto out_imo_locked; + } + + /* + * Begin state merge transaction at socket layer. + */ + if (doblock) { + MLD_PRINTF(("%s: %s source\n", __func__, "block")); + ims = im6f_graft(imf, fmode, &ssa->sin6); + if (ims == NULL) + error = ENOMEM; + } else { + MLD_PRINTF(("%s: %s source\n", __func__, "allow")); + error = im6f_prune(imf, &ssa->sin6); + } + + if (error) { + MLD_PRINTF(("%s: merge imf state failed\n", __func__)); + goto out_im6f_rollback; + } + + /* + * Begin state merge transaction at MLD layer. 
+ */ + IN6M_LOCK(inm); + MLD_PRINTF(("%s: merge inm state\n", __func__)); + error = in6m_merge(inm, imf); + if (error) { + MLD_PRINTF(("%s: failed to merge inm state\n", __func__)); + IN6M_UNLOCK(inm); + goto out_im6f_rollback; + } + + MLD_PRINTF(("%s: doing mld downcall\n", __func__)); + error = mld_change_state(inm, 0); + IN6M_UNLOCK(inm); +#if MLD_DEBUG + if (error) + MLD_PRINTF(("%s: failed mld downcall\n", __func__)); +#endif + +out_im6f_rollback: + if (error) + im6f_rollback(imf); + else + im6f_commit(imf); + + im6f_reap(imf); + +out_imo_locked: + IM6O_UNLOCK(imo); + IM6O_REMREF(imo); /* from in6p_findmoptions() */ + return (error); +} + +/* + * Given an inpcb, return its multicast options structure pointer. Accepts + * an unlocked inpcb pointer, but will return it locked. May sleep. + * + */ +static struct ip6_moptions * +in6p_findmoptions(struct inpcb *inp) +{ + struct ip6_moptions *imo; + struct in6_multi **immp; + struct in6_mfilter *imfp; + size_t idx; + + if ((imo = inp->in6p_moptions) != NULL) { + IM6O_ADDREF(imo); /* for caller */ + return (imo); + } + + imo = ip6_allocmoptions(M_WAITOK); + if (imo == NULL) + return (NULL); + + immp = _MALLOC(sizeof (*immp) * IPV6_MIN_MEMBERSHIPS, M_IP6MOPTS, + M_WAITOK | M_ZERO); + if (immp == NULL) { + IM6O_REMREF(imo); + return (NULL); + } + + imfp = _MALLOC(sizeof (struct in6_mfilter) * IPV6_MIN_MEMBERSHIPS, + M_IN6MFILTER, M_WAITOK | M_ZERO); + if (imfp == NULL) { + _FREE(immp, M_IP6MOPTS); + IM6O_REMREF(imo); + return (NULL); + } + + imo->im6o_multicast_ifp = NULL; + imo->im6o_multicast_hlim = ip6_defmcasthlim; + imo->im6o_multicast_loop = in6_mcast_loop; + imo->im6o_num_memberships = 0; + imo->im6o_max_memberships = IPV6_MIN_MEMBERSHIPS; + imo->im6o_membership = immp; + + /* Initialize per-group source filters. 
*/
+	for (idx = 0; idx < IPV6_MIN_MEMBERSHIPS; idx++)
+		im6f_init(&imfp[idx], MCAST_UNDEFINED, MCAST_EXCLUDE);
+
+	imo->im6o_mfilters = imfp;
+	inp->in6p_moptions = imo; /* keep reference from ip6_allocmoptions() */
+	IM6O_ADDREF(imo);	/* for caller */
+
+	return (imo);
+}
+
+/*
+ * Atomically get source filters on a socket for an IPv6 multicast group.
+ *
+ * The request is copied in as a struct __msfilterreq64 or __msfilterreq32
+ * depending on the calling process, validated, and answered with the
+ * filter mode of the matching membership plus up to msfr_nsrcs source
+ * addresses (clamped to in6_mcast_maxsocksrc) written to the user buffer
+ * named by msfr_srcs.  On return msfr_nsrcs holds the number of entries
+ * that were filled in.
+ *
+ * The caller must already have checked that inp->in6p_moptions is
+ * non-NULL (enforced with VERIFY).  The options are locked with IM6O_LOCK
+ * only while the membership is examined; the lock is dropped before any
+ * data is copied out to userland.
+ *
+ * Returns 0 on success, EINVAL for a malformed group address,
+ * EADDRNOTAVAIL for an unknown interface or membership, EAGAIN if the
+ * membership has no defined filter mode yet, ENOBUFS if the temporary
+ * buffer cannot be allocated, or the error from the copy routines.
+ *
+ * NOTE(review): the original header said "Called with INP lock held;
+ * returns with lock released"; no INP lock operation is visible in this
+ * function -- confirm against the caller.
+ */
+static int
+in6p_get_source_filters(struct inpcb *inp, struct sockopt *sopt)
+{
+	struct __msfilterreq64 msfr, msfr64;
+	struct __msfilterreq32 msfr32;
+	sockunion_t *gsa;
+	struct ifnet *ifp;
+	struct ip6_moptions *imo;
+	struct in6_mfilter *imf;
+	struct ip6_msource *ims;
+	struct in6_msource *lims;
+	struct sockaddr_in6 *psin;
+	struct sockaddr_storage *ptss;
+	struct sockaddr_storage *tss;
+	int error;
+	size_t idx, nsrcs, ncsrcs;
+	user_addr_t tmp_ptr;
+
+	imo = inp->in6p_moptions;
+	VERIFY(imo != NULL);
+
+	if (IS_64BIT_PROCESS(current_proc())) {
+		error = sooptcopyin(sopt, &msfr64,
+		    sizeof(struct __msfilterreq64),
+		    sizeof(struct __msfilterreq64));
+		if (error)
+			return (error);
+		/* we never use msfr.msfr_srcs; */
+		memcpy(&msfr, &msfr64, sizeof(msfr64));
+	} else {
+		error = sooptcopyin(sopt, &msfr32,
+		    sizeof(struct __msfilterreq32),
+		    sizeof(struct __msfilterreq32));
+		if (error)
+			return (error);
+		/*
+		 * we never use msfr.msfr_srcs;
+		 * Copy only as many bytes as the 32-bit request holds:
+		 * msfr is the 64-bit layout, so sizeof(msfr) would read
+		 * past the end of msfr32.
+		 */
+		memcpy(&msfr, &msfr32, sizeof(msfr32));
+	}
+
+	if (msfr.msfr_group.ss_family != AF_INET6 ||
+	    msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6))
+		return (EINVAL);
+
+	gsa = (sockunion_t *)&msfr.msfr_group;
+	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
+		return (EINVAL);
+
+	ifnet_head_lock_shared();
+	if (msfr.msfr_ifindex == 0 || (u_int)if_index < msfr.msfr_ifindex) {
+		ifnet_head_done();
+		return (EADDRNOTAVAIL);
+	}
+	ifp = ifindex2ifnet[msfr.msfr_ifindex];
+	ifnet_head_done();
+
+	if (ifp == NULL)
+		return (EADDRNOTAVAIL);
+
+	if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc)
+		msfr.msfr_nsrcs = in6_mcast_maxsocksrc;
+
+	(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
+
+	IM6O_LOCK(imo);
+	/*
+	 * Lookup group on the socket.
+	 */
+	idx = im6o_match_group(imo, ifp, &gsa->sa);
+	if (idx == (size_t)-1 || imo->im6o_mfilters == NULL) {
+		IM6O_UNLOCK(imo);
+		return (EADDRNOTAVAIL);
+	}
+	imf = &imo->im6o_mfilters[idx];
+
+	/*
+	 * Ignore memberships which are in limbo.
+	 */
+	if (imf->im6f_st[1] == MCAST_UNDEFINED) {
+		IM6O_UNLOCK(imo);
+		return (EAGAIN);
+	}
+	msfr.msfr_fmode = imf->im6f_st[1];
+
+	/*
+	 * If the user specified a buffer, copy out the source filter
+	 * entries to userland gracefully.
+	 * We only copy out the number of entries which userland
+	 * has asked for, but we always tell userland how big the
+	 * buffer really needs to be.
+	 */
+	tss = NULL;
+
+	if (IS_64BIT_PROCESS(current_proc()))
+		tmp_ptr = msfr64.msfr_srcs;
+	else
+		tmp_ptr = CAST_USER_ADDR_T(msfr32.msfr_srcs);
+
+	if (tmp_ptr != USER_ADDR_NULL && msfr.msfr_nsrcs > 0) {
+		/* Zero-filled so unused trailing slots copy out as zeroes. */
+		tss = _MALLOC(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
+		    M_TEMP, M_WAITOK | M_ZERO);
+		if (tss == NULL) {
+			IM6O_UNLOCK(imo);
+			return (ENOBUFS);
+		}
+	}
+
+	/*
+	 * Count number of sources in-mode at t0.
+	 * If buffer space exists and remains, copy out source entries.
+	 */
+	nsrcs = msfr.msfr_nsrcs;
+	ncsrcs = 0;
+	ptss = tss;
+	RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
+		lims = (struct in6_msource *)ims;
+		if (lims->im6sl_st[0] == MCAST_UNDEFINED ||
+		    lims->im6sl_st[0] != imf->im6f_st[0])
+			continue;
+		if (tss != NULL && nsrcs > 0) {
+			psin = (struct sockaddr_in6 *)ptss;
+			psin->sin6_family = AF_INET6;
+			psin->sin6_len = sizeof(struct sockaddr_in6);
+			psin->sin6_addr = lims->im6s_addr;
+			psin->sin6_port = 0;
+			--nsrcs;
+			++ptss;
+			++ncsrcs;
+		}
+	}
+
+	IM6O_UNLOCK(imo);
+
+	if (tss != NULL) {
+		error = copyout(tss, tmp_ptr,
+		    sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
+		FREE(tss, M_TEMP);
+		if (error)
+			return (error);
+	}
+
+	msfr.msfr_nsrcs = ncsrcs;
+	if (IS_64BIT_PROCESS(current_proc())) {
+		msfr64.msfr_ifindex = msfr.msfr_ifindex;
+		msfr64.msfr_fmode = msfr.msfr_fmode;
+		msfr64.msfr_nsrcs = msfr.msfr_nsrcs;
+		memcpy(&msfr64.msfr_group, &msfr.msfr_group,
+		    sizeof(struct sockaddr_storage));
+		error = sooptcopyout(sopt, &msfr64,
+		    sizeof(struct __msfilterreq64));
+	} else {
+		msfr32.msfr_ifindex = msfr.msfr_ifindex;
+		msfr32.msfr_fmode = msfr.msfr_fmode;
+		msfr32.msfr_nsrcs = msfr.msfr_nsrcs;
+		/* Fill the structure that is copied out below (msfr32). */
+		memcpy(&msfr32.msfr_group, &msfr.msfr_group,
+		    sizeof(struct sockaddr_storage));
+		error = sooptcopyout(sopt, &msfr32,
+		    sizeof(struct __msfilterreq32));
+	}
+
+	return (error);
+}
+
+/*
+ * Return the IP multicast options in response to user getsockopt().
+ */
+int
+ip6_getmoptions(struct inpcb *inp, struct sockopt *sopt)
+{
+	struct ip6_moptions *im6o;
+	int error;
+	u_int optval;
+
+	im6o = inp->in6p_moptions;
+	/*
+	 * If socket is neither of type SOCK_RAW or SOCK_DGRAM,
+	 * or is a divert socket, reject it.
+	 */
+	if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
+	    (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
+	    inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) {
+		return (EOPNOTSUPP);
+	}
+
+	error = 0;
+	switch (sopt->sopt_name) {
+	case IPV6_MULTICAST_IF:
+		if (im6o != NULL)
+			IM6O_LOCK(im6o);
+		if (im6o == NULL || im6o->im6o_multicast_ifp == NULL) {
+			optval = 0;	/* no interface selected */
+		} else {
+			optval = im6o->im6o_multicast_ifp->if_index;
+		}
+		if (im6o != NULL)
+			IM6O_UNLOCK(im6o);
+		error = sooptcopyout(sopt, &optval, sizeof(u_int));
+		break;
+
+	case IPV6_MULTICAST_HOPS:
+		if (im6o == NULL) {
+			optval = ip6_defmcasthlim;	/* no options yet: default */
+		} else {
+			IM6O_LOCK(im6o);
+			optval = im6o->im6o_multicast_hlim;
+			IM6O_UNLOCK(im6o);
+		}
+		error = sooptcopyout(sopt, &optval, sizeof(u_int));
+		break;
+
+	case IPV6_MULTICAST_LOOP:
+		if (im6o == NULL) {
+			optval = in6_mcast_loop; /* XXX VIMAGE; no options yet: default */
+		} else {
+			IM6O_LOCK(im6o);
+			optval = im6o->im6o_multicast_loop;
+			IM6O_UNLOCK(im6o);
+		}
+		error = sooptcopyout(sopt, &optval, sizeof(u_int));
+		break;
+
+	case IPV6_MSFILTER:
+		if (im6o == NULL) {
+			error = EADDRNOTAVAIL;	/* no memberships on this socket */
+		} else {
+			error = in6p_get_source_filters(inp, sopt);
+		}
+		break;
+
+	default:
+		error = ENOPROTOOPT;
+		break;
+	}
+
+	return (error);
+}
+
+/*
+ * Look up the ifnet to use for a multicast group membership,
+ * given the address of an IPv6 group.
+ *
+ * This routine exists to support legacy IPv6 multicast applications.
+ *
+ * If inp is non-NULL and is bound to an interface, use this socket's
+ * inp_boundif for any required routing table lookup.
+ *
+ * If the route lookup fails, return NULL.
+ *
+ * FUTURE: Support multiple forwarding tables for IPv6.
+ *
+ * The route found by the lookup is released again before returning, so
+ * only the interface pointer taken from it is handed back.
+ *
+ * Returns NULL if the group address is not multicast or
+ * if no ifp could be found.
+ */
+static struct ifnet *
+in6p_lookup_mcast_ifp(const struct inpcb *in6p,
+    const struct sockaddr_in6 *gsin6)
+{
+	struct route_in6 mroute;
+	struct ifnet *outifp;
+	unsigned int scope;
+
+	VERIFY(in6p == NULL || (in6p->inp_vflag & INP_IPV6));
+	VERIFY(gsin6->sin6_family == AF_INET6);
+	if (!IN6_IS_ADDR_MULTICAST(&gsin6->sin6_addr))
+		return (NULL);
+
+	/* Scope the lookup to the bound interface, when there is one. */
+	scope = IFSCOPE_NONE;
+	if (in6p != NULL && (in6p->inp_flags & INP_BOUND_IF))
+		scope = in6p->inp_boundif;
+
+	memset(&mroute, 0, sizeof (mroute));
+	memcpy(&mroute.ro_dst, gsin6, sizeof (*gsin6));
+	rtalloc_scoped_ign((struct route *)&mroute, 0, scope);
+	if (mroute.ro_rt == NULL)
+		return (NULL);
+
+	/* Remember the interface, then give the route back. */
+	outifp = mroute.ro_rt->rt_ifp;
+	VERIFY(outifp != NULL);
+	rtfree(mroute.ro_rt);
+
+	return (outifp);
+}
+
+/*
+ * Fill in the interface address of an ip_mreq from the interface index
+ * carried in an ipv6_mreq.
+ *
+ * An ipv6_mreq names the interface by index whereas an ip_mreq names it
+ * by AF_INET address, so the primary AF_INET address of the indexed
+ * interface is looked up and stored in v4mreq->imr_interface.  This is
+ * needed when userland issues a multicast setsockopt() on an AF_INET6
+ * socket using an IPv4-mapped IPv6 group address.
+ *
+ * Returns 0 on success, or EADDRNOTAVAIL if the index is out of range,
+ * no interface exists at that index, or the interface has no primary
+ * AF_INET address.
+ */
+static int
+in6p_lookup_v4addr(struct ipv6_mreq *mreq, struct ip_mreq *v4mreq)
+{
+	struct ifnet *netif = NULL;
+	struct ifaddr *primary;
+	unsigned int index = mreq->ipv6mr_interface;
+
+	ifnet_head_lock_shared();
+	if (index <= (unsigned int)if_index)
+		netif = ifindex2ifnet[index];
+	ifnet_head_done();
+	if (netif == NULL)
+		return (EADDRNOTAVAIL);
+
+	primary = ifa_ifpgetprimary(netif, AF_INET);
+	if (primary == NULL)
+		return (EADDRNOTAVAIL);
+
+	v4mreq->imr_interface.s_addr =
+	    ((struct sockaddr_in *)primary->ifa_addr)->sin_addr.s_addr;
+	IFA_REMREF(primary);
+
+	return (0);
+}
+
+/*
+ * Join an IPv6 multicast group, possibly with a source.
+ *
+ * FIXME: The KAME use of the unspecified address (::)
+ * to join *all* multicast groups is currently unsupported.
+ *
+ * Handles IPV6_JOIN_GROUP, MCAST_JOIN_GROUP and MCAST_JOIN_SOURCE_GROUP.
+ * An IPV6_JOIN_GROUP request for an IPv4-mapped group is translated into
+ * an IP_ADD_MEMBERSHIP request and handed to inp_join_group().
+ *
+ * A join without a source creates a membership whose filter starts in
+ * MCAST_EXCLUDE mode; a join with a source creates one starting in
+ * MCAST_INCLUDE mode, or adds the source to an existing inclusive
+ * membership.
+ *
+ * Returns 0 on success or an errno value; on failure the socket-layer
+ * filter changes are rolled back and a newly reserved membership slot
+ * is released again.
+ */
+static int
+in6p_join_group(struct inpcb *inp, struct sockopt *sopt)
+{
+	struct group_source_req gsr;
+	sockunion_t *gsa, *ssa;
+	struct ifnet *ifp;
+	struct in6_mfilter *imf;
+	struct ip6_moptions *imo;
+	struct in6_multi *inm = NULL;
+	struct in6_msource *lims = NULL;
+	size_t idx;
+	int error, is_new;
+	uint32_t scopeid = 0;
+
+	ifp = NULL;
+	imf = NULL;
+	error = 0;
+	is_new = 0;
+
+	memset(&gsr, 0, sizeof(struct group_source_req));
+	gsa = (sockunion_t *)&gsr.gsr_group;
+	gsa->ss.ss_family = AF_UNSPEC;
+	ssa = (sockunion_t *)&gsr.gsr_source;
+	ssa->ss.ss_family = AF_UNSPEC;
+
+	/*
+	 * Chew everything into struct group_source_req.
+	 * Overwrite the port field if present, as the sockaddr
+	 * being copied in may be matched with a binary comparison.
+	 * Ignore passed-in scope ID.
+	 */
+	switch (sopt->sopt_name) {
+	case IPV6_JOIN_GROUP: {
+		struct ipv6_mreq mreq;
+		struct sockaddr_in6 *gsin6;
+
+		error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq),
+		    sizeof(struct ipv6_mreq));
+		if (error)
+			return (error);
+		if (IN6_IS_ADDR_V4MAPPED(&mreq.ipv6mr_multiaddr)) {
+			struct ip_mreq v4mreq;
+			struct sockopt v4sopt;
+
+			v4mreq.imr_multiaddr.s_addr =
+			    mreq.ipv6mr_multiaddr.s6_addr32[3];
+			if (mreq.ipv6mr_interface == 0)
+				v4mreq.imr_interface.s_addr = INADDR_ANY;
+			else
+				error = in6p_lookup_v4addr(&mreq, &v4mreq);
+			if (error)
+				return (error);
+			v4sopt.sopt_dir = SOPT_SET;
+			v4sopt.sopt_level = sopt->sopt_level;
+			v4sopt.sopt_name = IP_ADD_MEMBERSHIP;
+			v4sopt.sopt_val = CAST_USER_ADDR_T(&v4mreq);
+			v4sopt.sopt_valsize = sizeof(v4mreq);
+			v4sopt.sopt_p = kernproc;
+
+			return (inp_join_group(inp, &v4sopt));
+		}
+		gsa->sin6.sin6_family = AF_INET6;
+		gsa->sin6.sin6_len = sizeof(struct sockaddr_in6);
+		gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr;
+
+		gsin6 = &gsa->sin6;
+
+		/* Only allow IPv6 multicast addresses */
+		if (IN6_IS_ADDR_MULTICAST(&gsin6->sin6_addr) == 0) {
+			return (EINVAL);
+		}
+
+		if (mreq.ipv6mr_interface == 0) {
+			ifp = in6p_lookup_mcast_ifp(inp, gsin6);
+		} else {
+			ifnet_head_lock_shared();
+			if ((u_int)if_index < mreq.ipv6mr_interface) {
+				ifnet_head_done();
+				return (EADDRNOTAVAIL);
+			}
+			ifp = ifindex2ifnet[mreq.ipv6mr_interface];
+			ifnet_head_done();
+		}
+		MLD_PRINTF(("%s: ipv6mr_interface = %d, ifp = %p\n",
+		    __func__, mreq.ipv6mr_interface, ifp));
+		break;
+	}
+
+	case MCAST_JOIN_GROUP:
+	case MCAST_JOIN_SOURCE_GROUP:
+		if (sopt->sopt_name == MCAST_JOIN_GROUP) {
+			error = sooptcopyin(sopt, &gsr,
+			    sizeof(struct group_req),
+			    sizeof(struct group_req));
+		} else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
+			error = sooptcopyin(sopt, &gsr,
+			    sizeof(struct group_source_req),
+			    sizeof(struct group_source_req));
+		}
+		if (error)
+			return (error);
+
+		if (gsa->sin6.sin6_family != AF_INET6 ||
+		    gsa->sin6.sin6_len != sizeof(struct sockaddr_in6))
+			return (EINVAL);
+
+		if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
+			if (ssa->sin6.sin6_family != AF_INET6 ||
+			    ssa->sin6.sin6_len != sizeof(struct sockaddr_in6))
+				return (EINVAL);
+			if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr))
+				return (EINVAL);
+			/*
+			 * TODO: Validate embedded scope ID in source
+			 * list entry against passed-in ifp, if and only
+			 * if source list filter entry is iface or node local.
+			 */
+			in6_clearscope(&ssa->sin6.sin6_addr);
+			ssa->sin6.sin6_port = 0;
+			ssa->sin6.sin6_scope_id = 0;
+		}
+
+		ifnet_head_lock_shared();
+		if (gsr.gsr_interface == 0 ||
+		    (u_int)if_index < gsr.gsr_interface) {
+			ifnet_head_done();
+			return (EADDRNOTAVAIL);
+		}
+		ifp = ifindex2ifnet[gsr.gsr_interface];
+		ifnet_head_done();
+		break;
+
+	default:
+		MLD_PRINTF(("%s: unknown sopt_name %d\n",
+		    __func__, sopt->sopt_name));
+		return (EOPNOTSUPP);
+		break;
+	}
+
+	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
+		return (EINVAL);
+
+	if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0)
+		return (EADDRNOTAVAIL);
+
+	gsa->sin6.sin6_port = 0;
+	gsa->sin6.sin6_scope_id = 0;
+
+	/*
+	 * Always set the scope zone ID on memberships created from userland.
+	 * Use the passed-in ifp to do this.
+	 */
+	(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, &scopeid);
+	/*
+	 * Some addresses are not valid without an embedded scopeid.
+	 * This check must be present because otherwise we will later hit
+	 * a VERIFY() in in6_mc_join().
+	 */
+	if ((IN6_IS_ADDR_MC_LINKLOCAL(&gsa->sin6.sin6_addr) ||
+	    IN6_IS_ADDR_MC_INTFACELOCAL(&gsa->sin6.sin6_addr)) && scopeid == 0)
+		return (EINVAL);
+
+	imo = in6p_findmoptions(inp);
+	if (imo == NULL)
+		return (ENOMEM);
+
+	IM6O_LOCK(imo);
+	idx = im6o_match_group(imo, ifp, &gsa->sa);
+	if (idx == (size_t)-1) {
+		is_new = 1;
+	} else {
+		inm = imo->im6o_membership[idx];
+		imf = &imo->im6o_mfilters[idx];
+		if (ssa->ss.ss_family != AF_UNSPEC) {
+			/*
+			 * MCAST_JOIN_SOURCE_GROUP on an exclusive membership
+			 * is an error. On an existing inclusive membership,
+			 * it just adds the source to the filter list.
+			 */
+			if (imf->im6f_st[1] != MCAST_INCLUDE) {
+				error = EINVAL;
+				goto out_imo_locked;
+			}
+			/*
+			 * Throw out duplicates.
+			 *
+			 * XXX FIXME: This makes a naive assumption that
+			 * even if entries exist for *ssa in this imf,
+			 * they will be rejected as dupes, even if they
+			 * are not valid in the current mode (in-mode).
+			 *
+			 * in6_msource is transactioned just as for anything
+			 * else in SSM -- but note naive use of in6m_graft()
+			 * below for allocating new filter entries.
+			 *
+			 * This is only an issue if someone mixes the
+			 * full-state SSM API with the delta-based API,
+			 * which is discouraged in the relevant RFCs.
+			 */
+			lims = im6o_match_source(imo, idx, &ssa->sa);
+			if (lims != NULL /*&&
+			    lims->im6sl_st[1] == MCAST_INCLUDE*/) {
+				error = EADDRNOTAVAIL;
+				goto out_imo_locked;
+			}
+		} else {
+			/*
+			 * MCAST_JOIN_GROUP on an existing exclusive
+			 * membership is an error; return EADDRINUSE
+			 * to preserve 4.4BSD API idempotence, and
+			 * avoid tedious detour to code below.
+			 * NOTE: This is bending RFC 3678 a bit.
+			 *
+			 * On an existing inclusive membership, this is also
+			 * an error; if you want to change filter mode,
+			 * you must use the userland API setsourcefilter().
+			 * XXX We don't reject this for imf in UNDEFINED
+			 * state at t1, because allocation of a filter
+			 * is atomic with allocation of a membership.
+			 */
+			error = EINVAL;
+			/* See comments above for EADDRINUSE */
+			if (imf->im6f_st[1] == MCAST_EXCLUDE)
+				error = EADDRINUSE;
+			goto out_imo_locked;
+		}
+	}
+
+	/*
+	 * Begin state merge transaction at socket layer.
+	 */
+
+	if (is_new) {
+		if (imo->im6o_num_memberships == imo->im6o_max_memberships) {
+			error = im6o_grow(imo, 0);
+			if (error)
+				goto out_imo_locked;
+		}
+		/*
+		 * Allocate the new slot upfront so we can deal with
+		 * grafting the new source filter in same code path
+		 * as for join-source on existing membership.
+		 */
+		idx = imo->im6o_num_memberships;
+		imo->im6o_membership[idx] = NULL;
+		imo->im6o_num_memberships++;
+		VERIFY(imo->im6o_mfilters != NULL);
+		imf = &imo->im6o_mfilters[idx];
+		VERIFY(RB_EMPTY(&imf->im6f_sources));
+	}
+
+	/*
+	 * Graft new source into filter list for this inpcb's
+	 * membership of the group. The in6_multi may not have
+	 * been allocated yet if this is a new membership, however,
+	 * the in_mfilter slot will be allocated and must be initialized.
+	 *
+	 * Note: Grafting of exclusive mode filters doesn't happen
+	 * in this path.
+	 * XXX: Should check for non-NULL lims (node exists but may
+	 * not be in-mode) for interop with full-state API.
+	 */
+	if (ssa->ss.ss_family != AF_UNSPEC) {
+		/* Membership starts in IN mode */
+		if (is_new) {
+			/*
+			 * The filter initialisation must be a statement of
+			 * its own: it used to sit inside the MLD_PRINTF()
+			 * argument, so it only took effect when the debug
+			 * macro expanded to code.
+			 */
+			MLD_PRINTF(("%s: new join w/source\n", __func__));
+			im6f_init(imf, MCAST_UNDEFINED, MCAST_INCLUDE);
+		} else {
+			MLD_PRINTF(("%s: %s source\n", __func__, "allow"));
+		}
+		lims = im6f_graft(imf, MCAST_INCLUDE, &ssa->sin6);
+		if (lims == NULL) {
+			MLD_PRINTF(("%s: merge imf state failed\n",
+			    __func__));
+			error = ENOMEM;
+			goto out_im6o_free;
+		}
+	} else {
+		/* No address specified; Membership starts in EX mode */
+		if (is_new) {
+			MLD_PRINTF(("%s: new join w/o source", __func__));
+			im6f_init(imf, MCAST_UNDEFINED, MCAST_EXCLUDE);
+		}
+	}
+
+	/*
+	 * Begin state merge transaction at MLD layer.
+	 */
+
+	if (is_new) {
+		VERIFY(inm == NULL);
+		error = in6_mc_join(ifp, &gsa->sin6.sin6_addr, imf, &inm, 0);
+		VERIFY(inm != NULL || error != 0);
+		if (error)
+			goto out_im6o_free;
+		imo->im6o_membership[idx] = inm; /* from in6_mc_join() */
+	} else {
+		MLD_PRINTF(("%s: merge inm state\n", __func__));
+		IN6M_LOCK(inm);
+		error = in6m_merge(inm, imf);
+		if (error) {
+			MLD_PRINTF(("%s: failed to merge inm state\n",
+			    __func__));
+			IN6M_UNLOCK(inm);
+			goto out_im6f_rollback;
+		}
+		MLD_PRINTF(("%s: doing mld downcall\n", __func__));
+		error = mld_change_state(inm, 0);
+		IN6M_UNLOCK(inm);
+		if (error) {
+			MLD_PRINTF(("%s: failed mld downcall\n",
+			    __func__));
+			goto out_im6f_rollback;
+		}
+	}
+
+out_im6f_rollback:
+	if (error) {
+		im6f_rollback(imf);
+		if (is_new)
+			im6f_purge(imf);
+		else
+			im6f_reap(imf);
+	} else {
+		im6f_commit(imf);
+	}
+
+out_im6o_free:
+	/* Give back the slot reserved above if the new join failed. */
+	if (error && is_new) {
+		VERIFY(inm == NULL);
+		imo->im6o_membership[idx] = NULL;
+		--imo->im6o_num_memberships;
+	}
+
+out_imo_locked:
+	IM6O_UNLOCK(imo);
+	IM6O_REMREF(imo);	/* from in6p_findmoptions() */
+	return (error);
+}
+
+/*
+ * Leave an IPv6 multicast group on an inpcb, possibly with a source.
+ */
+static int
+in6p_leave_group(struct inpcb *inp, struct sockopt *sopt)
+{
+	struct ipv6_mreq mreq;
+	struct group_source_req gsr;
+	sockunion_t *gsa, *ssa;
+	struct ifnet *ifp;
+	struct in6_mfilter *imf;
+	struct ip6_moptions *imo;
+	struct in6_msource *ims;
+	struct in6_multi *inm = NULL;
+	uint32_t ifindex = 0;
+	size_t idx;
+	int error, is_final;
+
+	ifp = NULL;
+	error = 0;
+	is_final = 1;
+
+	memset(&gsr, 0, sizeof(struct group_source_req));
+	gsa = (sockunion_t *)&gsr.gsr_group;
+	gsa->ss.ss_family = AF_UNSPEC;
+	ssa = (sockunion_t *)&gsr.gsr_source;
+	ssa->ss.ss_family = AF_UNSPEC;
+
+	/*
+	 * Chew everything passed in up into a struct group_source_req
+	 * as that is easier to process.
+	 * Note: Any embedded scope ID in the multicast group passed
+	 * in by userland is ignored, the interface index is the recommended
+	 * mechanism to specify an interface; see below.
+	 */
+	switch (sopt->sopt_name) {
+	case IPV6_LEAVE_GROUP: {
+		struct sockaddr_in6 *gsin6;
+
+		error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq),
+		    sizeof(struct ipv6_mreq));
+		if (error)
+			return (error);
+		if (IN6_IS_ADDR_V4MAPPED(&mreq.ipv6mr_multiaddr)) {
+			struct ip_mreq v4mreq;
+			struct sockopt v4sopt;
+
+			/* IPv4-mapped group: hand over to the IPv4 code. */
+			v4mreq.imr_multiaddr.s_addr =
+			    mreq.ipv6mr_multiaddr.s6_addr32[3];
+			if (mreq.ipv6mr_interface == 0)
+				v4mreq.imr_interface.s_addr = INADDR_ANY;
+			else
+				error = in6p_lookup_v4addr(&mreq, &v4mreq);
+			if (error)
+				return (error);
+			v4sopt.sopt_dir = SOPT_SET;
+			v4sopt.sopt_level = sopt->sopt_level;
+			v4sopt.sopt_name = IP_DROP_MEMBERSHIP;
+			v4sopt.sopt_val = CAST_USER_ADDR_T(&v4mreq);
+			v4sopt.sopt_valsize = sizeof(v4mreq);
+			v4sopt.sopt_p = kernproc;
+
+			return (inp_leave_group(inp, &v4sopt));
+		}
+		gsa->sin6.sin6_family = AF_INET6;
+		gsa->sin6.sin6_len = sizeof(struct sockaddr_in6);
+		gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr;
+		gsa->sin6.sin6_port = 0;
+		gsa->sin6.sin6_scope_id = 0;
+		ifindex = mreq.ipv6mr_interface;
+		gsin6 = &gsa->sin6;
+		/* Only allow IPv6 multicast addresses */
+		if (IN6_IS_ADDR_MULTICAST(&gsin6->sin6_addr) == 0) {
+			return (EINVAL);
+		}
+		break;
+	}
+
+	case MCAST_LEAVE_GROUP:
+	case MCAST_LEAVE_SOURCE_GROUP:
+		if (sopt->sopt_name == MCAST_LEAVE_GROUP) {
+			error = sooptcopyin(sopt, &gsr,
+			    sizeof(struct group_req),
+			    sizeof(struct group_req));
+		} else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
+			error = sooptcopyin(sopt, &gsr,
+			    sizeof(struct group_source_req),
+			    sizeof(struct group_source_req));
+		}
+		if (error)
+			return (error);
+
+		if (gsa->sin6.sin6_family != AF_INET6 ||
+		    gsa->sin6.sin6_len != sizeof(struct sockaddr_in6))
+			return (EINVAL);
+		if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
+			if (ssa->sin6.sin6_family != AF_INET6 ||
+			    ssa->sin6.sin6_len != sizeof(struct sockaddr_in6))
+				return (EINVAL);
+			if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr))
+				return (EINVAL);
+			/*
+			 * TODO: Validate embedded scope ID in source
+			 * list entry against passed-in ifp, if and only
+			 * if source list filter entry is iface or node local.
+			 */
+			in6_clearscope(&ssa->sin6.sin6_addr);
+		}
+		gsa->sin6.sin6_port = 0;
+		gsa->sin6.sin6_scope_id = 0;
+		ifindex = gsr.gsr_interface;
+		break;
+
+	default:
+		MLD_PRINTF(("%s: unknown sopt_name %d\n",
+		    __func__, sopt->sopt_name));
+		return (EOPNOTSUPP);
+		break;
+	}
+
+	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
+		return (EINVAL);
+
+	/*
+	 * Validate interface index if provided. If no interface index
+	 * was provided separately, attempt to look the membership up
+	 * from the default scope as a last resort to disambiguate
+	 * the membership we are being asked to leave.
+	 * XXX SCOPE6 lock potentially taken here.
+	 */
+	if (ifindex != 0) {
+		ifnet_head_lock_shared();
+		if ((u_int)if_index < ifindex) {
+			ifnet_head_done();
+			return (EADDRNOTAVAIL);
+		}
+		ifp = ifindex2ifnet[ifindex];
+		ifnet_head_done();
+		if (ifp == NULL)
+			return (EADDRNOTAVAIL);
+		(void) in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
+	} else {
+		error = sa6_embedscope(&gsa->sin6, ip6_use_defzone);
+		if (error)
+			return (EADDRNOTAVAIL);
+		/*
+		 * Some badly behaved applications don't pass an ifindex
+		 * or a scope ID, which is an API violation. In this case,
+		 * perform a lookup as per a v6 join.
+		 *
+		 * XXX For now, stomp on zone ID for the corner case.
+		 * This is not the 'KAME way', but we need to see the ifp
+		 * directly until such time as this implementation is
+		 * refactored, assuming the scope IDs are the way to go.
+		 */
+		ifindex = ntohs(gsa->sin6.sin6_addr.s6_addr16[1]);
+		if (ifindex == 0) {
+			MLD_PRINTF(("%s: warning: no ifindex, looking up "
+			    "ifp for group %s.\n", __func__,
+			    ip6_sprintf(&gsa->sin6.sin6_addr)));
+			ifp = in6p_lookup_mcast_ifp(inp, &gsa->sin6);
+		} else {
+			ifnet_head_lock_shared();
+			/*
+			 * This index is read out of the group address, whose
+			 * bytes were copied in from userland, so it has to be
+			 * range-checked before it is used to index
+			 * ifindex2ifnet[], just like the explicit-index path
+			 * above.
+			 */
+			if ((u_int)if_index < ifindex) {
+				ifnet_head_done();
+				return (EADDRNOTAVAIL);
+			}
+			ifp = ifindex2ifnet[ifindex];
+			ifnet_head_done();
+		}
+		if (ifp == NULL)
+			return (EADDRNOTAVAIL);
+	}
+
+	VERIFY(ifp != NULL);
+	MLD_PRINTF(("%s: ifp = %p\n", __func__, ifp));
+
+	/*
+	 * Find the membership in the membership array.
+	 */
+	imo = in6p_findmoptions(inp);
+	if (imo == NULL)
+		return (ENOMEM);
+
+	IM6O_LOCK(imo);
+	idx = im6o_match_group(imo, ifp, &gsa->sa);
+	if (idx == (size_t)-1) {
+		error = EADDRNOTAVAIL;
+		goto out_locked;
+	}
+	inm = imo->im6o_membership[idx];
+	imf = &imo->im6o_mfilters[idx];
+
+	/* A source address means only that source is being dropped. */
+	if (ssa->ss.ss_family != AF_UNSPEC)
+		is_final = 0;
+
+	/*
+	 * Begin state merge transaction at socket layer.
+	 */
+
+	/*
+	 * If we were instructed only to leave a given source, do so.
+	 * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships.
+	 */
+	if (is_final) {
+		im6f_leave(imf);
+	} else {
+		if (imf->im6f_st[0] == MCAST_EXCLUDE) {
+			error = EADDRNOTAVAIL;
+			goto out_locked;
+		}
+		ims = im6o_match_source(imo, idx, &ssa->sa);
+		if (ims == NULL) {
+			/* ip6_sprintf() yields a string, hence %s. */
+			MLD_PRINTF(("%s: source %s %spresent\n", __func__,
+			    ip6_sprintf(&ssa->sin6.sin6_addr),
+			    "not "));
+			error = EADDRNOTAVAIL;
+			goto out_locked;
+		}
+		MLD_PRINTF(("%s: %s source\n", __func__, "block"));
+		error = im6f_prune(imf, &ssa->sin6);
+		if (error) {
+			MLD_PRINTF(("%s: merge imf state failed\n",
+			    __func__));
+			goto out_locked;
+		}
+	}
+
+	/*
+	 * Begin state merge transaction at MLD layer.
+	 */
+
+	if (is_final) {
+		/*
+		 * Give up the multicast address record to which
+		 * the membership points.  Reference held in im6o
+		 * will be released below.
+		 */
+		(void) in6_mc_leave(inm, imf);
+	} else {
+		MLD_PRINTF(("%s: merge inm state\n", __func__));
+		IN6M_LOCK(inm);
+		error = in6m_merge(inm, imf);
+		if (error) {
+			MLD_PRINTF(("%s: failed to merge inm state\n",
+			    __func__));
+			IN6M_UNLOCK(inm);
+			goto out_im6f_rollback;
+		}
+
+		MLD_PRINTF(("%s: doing mld downcall\n", __func__));
+		error = mld_change_state(inm, 0);
+		if (error) {
+			MLD_PRINTF(("%s: failed mld downcall\n", __func__));
+		}
+		IN6M_UNLOCK(inm);
+	}
+
+out_im6f_rollback:
+	if (error)
+		im6f_rollback(imf);
+	else
+		im6f_commit(imf);
+
+	im6f_reap(imf);
+
+	if (is_final) {
+		/* Remove the gap in the membership array. */
+		VERIFY(inm == imo->im6o_membership[idx]);
+		imo->im6o_membership[idx] = NULL;
+		IN6M_REMREF(inm);
+		for (++idx; idx < imo->im6o_num_memberships; ++idx) {
+			imo->im6o_membership[idx-1] = imo->im6o_membership[idx];
+			imo->im6o_mfilters[idx-1] = imo->im6o_mfilters[idx];
+		}
+		imo->im6o_num_memberships--;
+	}
+
+out_locked:
+	IM6O_UNLOCK(imo);
+	IM6O_REMREF(imo);	/* from in6p_findmoptions() */
+	return (error);
+}
+
+/*
+ * Select the interface for transmitting IPv6 multicast datagrams.
+ *
+ * Either an instance of struct in6_addr or an instance of struct ipv6_mreqn
+ * may be passed to this socket option. An address of in6addr_any or an
+ * interface index of 0 is used to remove a previous selection.
+ * When no interface is selected, one is chosen for every send.
+ *
+ * NOTE(review): as written, the handler only accepts a u_int interface
+ * index (any other sopt_valsize yields EINVAL), and an index whose
+ * ifindex2ifnet[] entry is NULL -- presumably including index 0 -- yields
+ * EADDRNOTAVAIL instead of clearing the selection; confirm which
+ * behaviour is intended.
+ *
+ * Returns 0 on success, EINVAL for a bad option size or an index beyond
+ * if_index, EADDRNOTAVAIL if the interface does not exist or is not
+ * multicast capable, ENOMEM if the options cannot be allocated, or the
+ * error from sooptcopyin().
+ */
+static int
+in6p_set_multicast_if(struct inpcb *inp, struct sockopt *sopt)
+{
+	struct ifnet *ifp;
+	struct ip6_moptions *imo;
+	u_int ifindex;
+	int error;
+
+	if (sopt->sopt_valsize != sizeof(u_int))
+		return (EINVAL);
+
+	error = sooptcopyin(sopt, &ifindex, sizeof(u_int), sizeof(u_int));
+	if (error)
+		return (error);
+
+	ifnet_head_lock_shared();
+	if ((u_int)if_index < ifindex) {
+		ifnet_head_done();
+		return (EINVAL);
+	}
+
+	ifp = ifindex2ifnet[ifindex];
+	ifnet_head_done();
+	if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0)
+		return (EADDRNOTAVAIL);
+
+	imo = in6p_findmoptions(inp);
+	if (imo == NULL)
+		return (ENOMEM);
+
+	IM6O_LOCK(imo);
+	imo->im6o_multicast_ifp = ifp;
+	IM6O_UNLOCK(imo);
+	IM6O_REMREF(imo);	/* from in6p_findmoptions() */
+
+	return (0);
+}
+
+/*
+ * Atomically set source filters on a socket for an IPv6 multicast group.
+ *
+ * The request is copied in as a struct __msfilterreq64 or __msfilterreq32
+ * depending on the calling process.  The socket must already be a member
+ * of the group on the given interface.  The filter mode is replaced by
+ * msfr_fmode and, if msfr_nsrcs is non-zero, the source list is replaced
+ * by the entries read from msfr_srcs; the result is then merged into the
+ * MLD layer.  If the merge fails the socket-layer changes are rolled back.
+ *
+ * Returns 0 on success, ENOBUFS if msfr_nsrcs exceeds
+ * in6_mcast_maxsocksrc, EINVAL or EAFNOSUPPORT for malformed input,
+ * EADDRNOTAVAIL for an unknown interface or membership, ENOMEM on
+ * allocation failure, or the error from the copy/merge routines.
+ */
+static int
+in6p_set_source_filters(struct inpcb *inp, struct sockopt *sopt)
+{
+	struct __msfilterreq64 msfr, msfr64;
+	struct __msfilterreq32 msfr32;
+	sockunion_t *gsa;
+	struct ifnet *ifp;
+	struct in6_mfilter *imf;
+	struct ip6_moptions *imo;
+	struct in6_multi *inm;
+	size_t idx;
+	int error;
+	user_addr_t tmp_ptr;
+
+	if (IS_64BIT_PROCESS(current_proc())) {
+		error = sooptcopyin(sopt, &msfr64,
+		    sizeof(struct __msfilterreq64),
+		    sizeof(struct __msfilterreq64));
+		if (error)
+			return (error);
+		/* we never use msfr.msfr_srcs; */
+		memcpy(&msfr, &msfr64, sizeof(msfr64));
+	} else {
+		error = sooptcopyin(sopt, &msfr32,
+		    sizeof(struct __msfilterreq32),
+		    sizeof(struct __msfilterreq32));
+		if (error)
+			return (error);
+		/*
+		 * we never use msfr.msfr_srcs;
+		 * Copy only as many bytes as the 32-bit request holds:
+		 * msfr is the 64-bit layout, so sizeof(msfr) would read
+		 * past the end of msfr32.
+		 */
+		memcpy(&msfr, &msfr32, sizeof(msfr32));
+	}
+
+	if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc)
+		return (ENOBUFS);
+
+	if (msfr.msfr_fmode != MCAST_EXCLUDE &&
+	    msfr.msfr_fmode != MCAST_INCLUDE)
+		return (EINVAL);
+
+	if (msfr.msfr_group.ss_family != AF_INET6 ||
+	    msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6))
+		return (EINVAL);
+
+	gsa = (sockunion_t *)&msfr.msfr_group;
+	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
+		return (EINVAL);
+
+	gsa->sin6.sin6_port = 0;	/* ignore port */
+
+	ifnet_head_lock_shared();
+	if (msfr.msfr_ifindex == 0 || (u_int)if_index < msfr.msfr_ifindex) {
+		ifnet_head_done();
+		return (EADDRNOTAVAIL);
+	}
+	ifp = ifindex2ifnet[msfr.msfr_ifindex];
+	ifnet_head_done();
+	if (ifp == NULL)
+		return (EADDRNOTAVAIL);
+
+	(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
+
+	/*
+	 * Take the INP write lock.
+	 * Check if this socket is a member of this group.
+	 */
+	imo = in6p_findmoptions(inp);
+	if (imo == NULL)
+		return (ENOMEM);
+
+	IM6O_LOCK(imo);
+	idx = im6o_match_group(imo, ifp, &gsa->sa);
+	if (idx == (size_t)-1 || imo->im6o_mfilters == NULL) {
+		error = EADDRNOTAVAIL;
+		goto out_imo_locked;
+	}
+	inm = imo->im6o_membership[idx];
+	imf = &imo->im6o_mfilters[idx];
+
+	/*
+	 * Begin state merge transaction at socket layer.
+	 */
+
+	imf->im6f_st[1] = msfr.msfr_fmode;
+
+	/*
+	 * Apply any new source filters, if present.
+	 * Make a copy of the user-space source vector so
+	 * that we may copy them with a single copyin. This
+	 * allows us to deal with page faults up-front.
+	 */
+	if (msfr.msfr_nsrcs > 0) {
+		struct in6_msource *lims;
+		struct sockaddr_in6 *psin;
+		struct sockaddr_storage *kss, *pkss;
+		unsigned int i;
+
+		if (IS_64BIT_PROCESS(current_proc()))
+			tmp_ptr = msfr64.msfr_srcs;
+		else
+			tmp_ptr = CAST_USER_ADDR_T(msfr32.msfr_srcs);
+
+		MLD_PRINTF(("%s: loading %lu source list entries\n",
+		    __func__, (unsigned long)msfr.msfr_nsrcs));
+		kss = _MALLOC(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
+		    M_TEMP, M_WAITOK);
+		if (kss == NULL) {
+			error = ENOMEM;
+			goto out_imo_locked;
+		}
+
+		error = copyin(tmp_ptr, kss,
+		    sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
+		if (error) {
+			FREE(kss, M_TEMP);
+			goto out_imo_locked;
+		}
+
+		/*
+		 * Mark all source filters as UNDEFINED at t1.
+		 * Restore new group filter mode, as im6f_leave()
+		 * will set it to INCLUDE.
+		 */
+		im6f_leave(imf);
+		imf->im6f_st[1] = msfr.msfr_fmode;
+
+		/*
+		 * Update socket layer filters at t1, lazy-allocating
+		 * new entries. This saves a bunch of memory at the
+		 * cost of one RB_FIND() per source entry; duplicate
+		 * entries in the msfr_nsrcs vector are ignored.
+		 * If we encounter an error, rollback transaction.
+		 *
+		 * XXX This too could be replaced with a set-symmetric
+		 * difference like loop to avoid walking from root
+		 * every time, as the key space is common.
+		 */
+		for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) {
+			psin = (struct sockaddr_in6 *)pkss;
+			if (psin->sin6_family != AF_INET6) {
+				error = EAFNOSUPPORT;
+				break;
+			}
+			if (psin->sin6_len != sizeof(struct sockaddr_in6)) {
+				error = EINVAL;
+				break;
+			}
+			if (IN6_IS_ADDR_MULTICAST(&psin->sin6_addr)) {
+				error = EINVAL;
+				break;
+			}
+			/*
+			 * TODO: Validate embedded scope ID in source
+			 * list entry against passed-in ifp, if and only
+			 * if source list filter entry is iface or node local.
+			 */
+			in6_clearscope(&psin->sin6_addr);
+			error = im6f_get_source(imf, psin, &lims);
+			if (error)
+				break;
+			lims->im6sl_st[1] = imf->im6f_st[1];
+		}
+		FREE(kss, M_TEMP);
+	}
+
+	if (error)
+		goto out_im6f_rollback;
+
+	/*
+	 * Begin state merge transaction at MLD layer.
+	 */
+	IN6M_LOCK(inm);
+	MLD_PRINTF(("%s: merge inm state\n", __func__));
+	error = in6m_merge(inm, imf);
+	if (error) {
+		MLD_PRINTF(("%s: failed to merge inm state\n", __func__));
+		IN6M_UNLOCK(inm);
+		goto out_im6f_rollback;
+	}
+
+	MLD_PRINTF(("%s: doing mld downcall\n", __func__));
+	error = mld_change_state(inm, 0);
+	IN6M_UNLOCK(inm);
+#if MLD_DEBUG
+	if (error)
+		MLD_PRINTF(("%s: failed mld downcall\n", __func__));
+#endif
+
+out_im6f_rollback:
+	if (error)
+		im6f_rollback(imf);
+	else
+		im6f_commit(imf);
+
+	im6f_reap(imf);
+
+out_imo_locked:
+	IM6O_UNLOCK(imo);
+	IM6O_REMREF(imo);	/* from in6p_findmoptions() */
+
+	return (error);
+}
+
+/*
+ * Set the IP multicast options in response to user setsockopt().
+ *
+ * Several of the options handled here overlap with options of the
+ * regular unicast API.  The code is deliberately not shared with those:
+ * the idempotence of the IPv6 multicast part of the BSD Sockets API has
+ * to be preserved, so the effects of these options are kept separate
+ * and distinct.
+ *
+ * Returns 0 on success or an errno value; EOPNOTSUPP is returned both
+ * for sockets of an unsupported kind and for unknown option names.
+ */
+int
+ip6_setmoptions(struct inpcb *inp, struct sockopt *sopt)
+{
+	struct ip6_moptions *mopts;
+	int status;
+
+	/*
+	 * Only raw and datagram sockets are served, and divert sockets
+	 * never are.
+	 */
+	if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
+	    !(inp->inp_socket->so_proto->pr_type == SOCK_RAW ||
+	    inp->inp_socket->so_proto->pr_type == SOCK_DGRAM))
+		return (EOPNOTSUPP);
+
+	switch (sopt->sopt_name) {
+	case IPV6_MULTICAST_IF:
+		return (in6p_set_multicast_if(inp, sopt));
+
+	case IPV6_MULTICAST_HOPS: {
+		int hops;
+
+		if (sopt->sopt_valsize != sizeof(int))
+			return (EINVAL);
+		status = sooptcopyin(sopt, &hops, sizeof(hops), sizeof(int));
+		if (status)
+			return (status);
+		if (hops < -1 || hops > 255)
+			return (EINVAL);
+		/* -1 selects the system default hop limit. */
+		if (hops == -1)
+			hops = ip6_defmcasthlim;
+		mopts = in6p_findmoptions(inp);
+		if (mopts == NULL)
+			return (ENOMEM);
+		IM6O_LOCK(mopts);
+		mopts->im6o_multicast_hlim = hops;
+		IM6O_UNLOCK(mopts);
+		IM6O_REMREF(mopts);	/* from in6p_findmoptions() */
+		return (0);
+	}
+
+	case IPV6_MULTICAST_LOOP: {
+		u_int loopback;
+
+		/*
+		 * Loopback of outgoing multicast packets is a flag;
+		 * only the values zero and one are accepted.
+		 */
+		if (sopt->sopt_valsize != sizeof(u_int))
+			return (EINVAL);
+		status = sooptcopyin(sopt, &loopback, sizeof(u_int),
+		    sizeof(u_int));
+		if (status)
+			return (status);
+		if (loopback > 1)
+			return (EINVAL);
+		mopts = in6p_findmoptions(inp);
+		if (mopts == NULL)
+			return (ENOMEM);
+		IM6O_LOCK(mopts);
+		mopts->im6o_multicast_loop = loopback;
+		IM6O_UNLOCK(mopts);
+		IM6O_REMREF(mopts);	/* from in6p_findmoptions() */
+		return (0);
+	}
+
+	case IPV6_JOIN_GROUP:
+	case MCAST_JOIN_GROUP:
+	case MCAST_JOIN_SOURCE_GROUP:
+		return (in6p_join_group(inp, sopt));
+
+	case IPV6_LEAVE_GROUP:
+	case MCAST_LEAVE_GROUP:
+	case MCAST_LEAVE_SOURCE_GROUP:
+		return (in6p_leave_group(inp, sopt));
+
+	case MCAST_BLOCK_SOURCE:
+	case MCAST_UNBLOCK_SOURCE:
+		return (in6p_block_unblock_source(inp, sopt));
+
+	case IPV6_MSFILTER:
+		return (in6p_set_source_filters(inp, sopt));
+
+	default:
+		return (EOPNOTSUPP);
+	}
+}
+/*
+ * Expose MLD's multicast filter mode and source list(s) to userland,
+ * keyed by (ifindex, group).
+ * The filter mode is written out as a uint32_t, followed by
+ * 0..n of struct in6_addr.
+ * For use by ifmcstat(8).
+ */ +static int +sysctl_ip6_mcast_filters SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp) + + struct in6_addr mcaddr; + struct in6_addr src; + struct ifnet *ifp; + struct in6_multi *inm; + struct in6_multistep step; + struct ip6_msource *ims; + int *name; + int retval = 0; + u_int namelen; + uint32_t fmode, ifindex; + + name = (int *)arg1; + namelen = arg2; + + if (req->newptr != USER_ADDR_NULL) + return (EPERM); + + /* int: ifindex + 4 * 32 bits of IPv6 address */ + if (namelen != 5) + return (EINVAL); + + ifindex = name[0]; + ifnet_head_lock_shared(); + if (ifindex <= 0 || ifindex > (u_int)if_index) { + MLD_PRINTF(("%s: ifindex %u out of range\n", + __func__, ifindex)); + ifnet_head_done(); + return (ENOENT); + } + + memcpy(&mcaddr, &name[1], sizeof(struct in6_addr)); + if (!IN6_IS_ADDR_MULTICAST(&mcaddr)) { + MLD_PRINTF(("%s: group %s is not multicast\n", + __func__, ip6_sprintf(&mcaddr))); + ifnet_head_done(); + return (EINVAL); + } + + ifp = ifindex2ifnet[ifindex]; + ifnet_head_done(); + if (ifp == NULL) { + MLD_PRINTF(("%s: no ifp for ifindex %u\n", __func__, ifindex)); + return (ENOENT); + } + /* + * Internal MLD lookups require that scope/zone ID is set. + */ + (void)in6_setscope(&mcaddr, ifp, NULL); + + in6_multihead_lock_shared(); + IN6_FIRST_MULTI(step, inm); + while (inm != NULL) { + IN6M_LOCK(inm); + if (inm->in6m_ifp != ifp) + goto next; + + if (!IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, &mcaddr)) + goto next; + + fmode = inm->in6m_st[1].iss_fmode; + retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t)); + if (retval != 0) { + IN6M_UNLOCK(inm); + break; /* abort */ + } + RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) { + MLD_PRINTF(("%s: visit node %p\n", __func__, ims)); + /* + * Only copy-out sources which are in-mode. 
+ */ + if (fmode != im6s_get_mode(inm, ims, 1)) { + MLD_PRINTF(("%s: skip non-in-mode\n", + __func__)); + continue; /* process next source */ + } + src = ims->im6s_addr; + retval = SYSCTL_OUT(req, &src, sizeof(struct in6_addr)); + if (retval != 0) + break; /* process next inm */ + } +next: + IN6M_UNLOCK(inm); + IN6_NEXT_MULTI(step, inm); + } + in6_multihead_lock_done(); + + return (retval); +} + +void +in6_multi_init(void) +{ + PE_parse_boot_argn("ifa_debug", &in6m_debug, sizeof (in6m_debug)); + + /* Setup lock group and attribute for in6_multihead */ + in6_multihead_lock_grp_attr = lck_grp_attr_alloc_init(); + in6_multihead_lock_grp = lck_grp_alloc_init("in6_multihead", + in6_multihead_lock_grp_attr); + in6_multihead_lock_attr = lck_attr_alloc_init(); + lck_rw_init(&in6_multihead_lock, in6_multihead_lock_grp, + in6_multihead_lock_attr); + + lck_mtx_init(&in6m_trash_lock, in6_multihead_lock_grp, + in6_multihead_lock_attr); + TAILQ_INIT(&in6m_trash_head); + + in6m_size = (in6m_debug == 0) ? 
sizeof (struct in6_multi) : + sizeof (struct in6_multi_dbg); + in6m_zone = zinit(in6m_size, IN6M_ZONE_MAX * in6m_size, + 0, IN6M_ZONE_NAME); + if (in6m_zone == NULL) { + panic("%s: failed allocating %s", __func__, IN6M_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(in6m_zone, Z_EXPAND, TRUE); + + imm_size = sizeof (struct in6_multi_mship); + imm_zone = zinit(imm_size, IMM_ZONE_MAX * imm_size, 0, IMM_ZONE_NAME); + if (imm_zone == NULL) { + panic("%s: failed allocating %s", __func__, IMM_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(imm_zone, Z_EXPAND, TRUE); + + ip6ms_size = sizeof (struct ip6_msource); + ip6ms_zone = zinit(ip6ms_size, IP6MS_ZONE_MAX * ip6ms_size, + 0, IP6MS_ZONE_NAME); + if (ip6ms_zone == NULL) { + panic("%s: failed allocating %s", __func__, IP6MS_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(ip6ms_zone, Z_EXPAND, TRUE); + + in6ms_size = sizeof (struct in6_msource); + in6ms_zone = zinit(in6ms_size, IN6MS_ZONE_MAX * in6ms_size, + 0, IN6MS_ZONE_NAME); + if (in6ms_zone == NULL) { + panic("%s: failed allocating %s", __func__, IN6MS_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(in6ms_zone, Z_EXPAND, TRUE); +} + +static struct in6_multi * +in6_multi_alloc(int how) +{ + struct in6_multi *in6m; + + in6m = (how == M_WAITOK) ? 
zalloc(in6m_zone) : + zalloc_noblock(in6m_zone); + if (in6m != NULL) { + bzero(in6m, in6m_size); + lck_mtx_init(&in6m->in6m_lock, in6_multihead_lock_grp, + in6_multihead_lock_attr); + in6m->in6m_debug |= IFD_ALLOC; + if (in6m_debug != 0) { + in6m->in6m_debug |= IFD_DEBUG; + in6m->in6m_trace = in6m_trace; + } + } + return (in6m); +} + +static void +in6_multi_free(struct in6_multi *in6m) +{ + IN6M_LOCK(in6m); + if (in6m->in6m_debug & IFD_ATTACHED) { + panic("%s: attached in6m=%p is being freed", __func__, in6m); + /* NOTREACHED */ + } else if (in6m->in6m_ifma != NULL) { + panic("%s: ifma not NULL for in6m=%p", __func__, in6m); + /* NOTREACHED */ + } else if (!(in6m->in6m_debug & IFD_ALLOC)) { + panic("%s: in6m %p cannot be freed", __func__, in6m); + /* NOTREACHED */ + } else if (in6m->in6m_refcount != 0) { + panic("%s: non-zero refcount in6m=%p", __func__, in6m); + /* NOTREACHED */ + } else if (in6m->in6m_reqcnt != 0) { + panic("%s: non-zero reqcnt in6m=%p", __func__, in6m); + /* NOTREACHED */ + } + + /* Free any pending MLDv2 state-change records */ + IF_DRAIN(&in6m->in6m_scq); + + in6m->in6m_debug &= ~IFD_ALLOC; + if ((in6m->in6m_debug & (IFD_DEBUG | IFD_TRASHED)) == + (IFD_DEBUG | IFD_TRASHED)) { + lck_mtx_lock(&in6m_trash_lock); + TAILQ_REMOVE(&in6m_trash_head, (struct in6_multi_dbg *)in6m, + in6m_trash_link); + lck_mtx_unlock(&in6m_trash_lock); + in6m->in6m_debug &= ~IFD_TRASHED; + } + IN6M_UNLOCK(in6m); + + lck_mtx_destroy(&in6m->in6m_lock, in6_multihead_lock_grp); + zfree(in6m_zone, in6m); +} + +static void +in6_multi_attach(struct in6_multi *in6m) +{ + in6_multihead_lock_assert(LCK_RW_ASSERT_EXCLUSIVE); + IN6M_LOCK_ASSERT_HELD(in6m); + + if (in6m->in6m_debug & IFD_ATTACHED) { + panic("%s: Attempt to attach an already attached in6m=%p", + __func__, in6m); + /* NOTREACHED */ + } else if (in6m->in6m_debug & IFD_TRASHED) { + panic("%s: Attempt to reattach a detached in6m=%p", + __func__, in6m); + /* NOTREACHED */ + } + + in6m->in6m_reqcnt++; + 
VERIFY(in6m->in6m_reqcnt == 1); + IN6M_ADDREF_LOCKED(in6m); + in6m->in6m_debug |= IFD_ATTACHED; + /* + * Reattach case: If debugging is enabled, take it + * out of the trash list and clear IFD_TRASHED. + */ + if ((in6m->in6m_debug & (IFD_DEBUG | IFD_TRASHED)) == + (IFD_DEBUG | IFD_TRASHED)) { + /* Become a regular mutex, just in case */ + IN6M_CONVERT_LOCK(in6m); + lck_mtx_lock(&in6m_trash_lock); + TAILQ_REMOVE(&in6m_trash_head, (struct in6_multi_dbg *)in6m, + in6m_trash_link); + lck_mtx_unlock(&in6m_trash_lock); + in6m->in6m_debug &= ~IFD_TRASHED; + } + + LIST_INSERT_HEAD(&in6_multihead, in6m, in6m_entry); +} + +int +in6_multi_detach(struct in6_multi *in6m) +{ + in6_multihead_lock_assert(LCK_RW_ASSERT_EXCLUSIVE); + IN6M_LOCK_ASSERT_HELD(in6m); + + if (in6m->in6m_reqcnt == 0) { + panic("%s: in6m=%p negative reqcnt", __func__, in6m); + /* NOTREACHED */ + } + + --in6m->in6m_reqcnt; + if (in6m->in6m_reqcnt > 0) + return (0); + + if (!(in6m->in6m_debug & IFD_ATTACHED)) { + panic("%s: Attempt to detach an unattached record in6m=%p", + __func__, in6m); + /* NOTREACHED */ + } else if (in6m->in6m_debug & IFD_TRASHED) { + panic("%s: in6m %p is already in trash list", __func__, in6m); + /* NOTREACHED */ + } + + /* + * NOTE: Caller calls IFMA_REMREF + */ + in6m->in6m_debug &= ~IFD_ATTACHED; + LIST_REMOVE(in6m, in6m_entry); + + if (in6m->in6m_debug & IFD_DEBUG) { + /* Become a regular mutex, just in case */ + IN6M_CONVERT_LOCK(in6m); + lck_mtx_lock(&in6m_trash_lock); + TAILQ_INSERT_TAIL(&in6m_trash_head, + (struct in6_multi_dbg *)in6m, in6m_trash_link); + lck_mtx_unlock(&in6m_trash_lock); + in6m->in6m_debug |= IFD_TRASHED; + } + + return (1); +} + +void +in6m_addref(struct in6_multi *in6m, int locked) +{ + if (!locked) + IN6M_LOCK_SPIN(in6m); + else + IN6M_LOCK_ASSERT_HELD(in6m); + + if (++in6m->in6m_refcount == 0) { + panic("%s: in6m=%p wraparound refcnt", __func__, in6m); + /* NOTREACHED */ + } else if (in6m->in6m_trace != NULL) { + (*in6m->in6m_trace)(in6m, TRUE); + } + if 
(!locked) + IN6M_UNLOCK(in6m); +} + +void +in6m_remref(struct in6_multi *in6m, int locked) +{ + struct ifmultiaddr *ifma; + struct mld_ifinfo *mli; + + if (!locked) + IN6M_LOCK_SPIN(in6m); + else + IN6M_LOCK_ASSERT_HELD(in6m); + + if (in6m->in6m_refcount == 0 || (in6m->in6m_refcount == 1 && locked)) { + panic("%s: in6m=%p negative refcnt", __func__, in6m); + /* NOTREACHED */ + } else if (in6m->in6m_trace != NULL) { + (*in6m->in6m_trace)(in6m, FALSE); + } + + --in6m->in6m_refcount; + if (in6m->in6m_refcount > 0) { + if (!locked) + IN6M_UNLOCK(in6m); + return; + } + + /* + * Synchronization with in6_mc_get(). In the event the in6m has been + * detached, the underlying ifma would still be in the if_multiaddrs + * list, and thus can be looked up via if_addmulti(). At that point, + * the only way to find this in6m is via ifma_protospec. To avoid + * race conditions between the last in6m_remref() of that in6m and its + * use via ifma_protospec, in6_multihead lock is used for serialization. + * In order to avoid violating the lock order, we must drop in6m_lock + * before acquiring in6_multihead lock. To prevent the in6m from being + * freed prematurely, we hold an extra reference. 
+ */ + ++in6m->in6m_refcount; + IN6M_UNLOCK(in6m); + in6_multihead_lock_shared(); + IN6M_LOCK_SPIN(in6m); + --in6m->in6m_refcount; + if (in6m->in6m_refcount > 0) { + /* We've lost the race, so abort since in6m is still in use */ + IN6M_UNLOCK(in6m); + in6_multihead_lock_done(); + /* If it was locked, return it as such */ + if (locked) + IN6M_LOCK(in6m); + return; + } + in6m_purge(in6m); + ifma = in6m->in6m_ifma; + in6m->in6m_ifma = NULL; + in6m->in6m_ifp = NULL; + mli = in6m->in6m_mli; + in6m->in6m_mli = NULL; + IN6M_UNLOCK(in6m); + IFMA_LOCK_SPIN(ifma); + ifma->ifma_protospec = NULL; + IFMA_UNLOCK(ifma); + in6_multihead_lock_done(); + + in6_multi_free(in6m); + if_delmulti_ifma(ifma); + /* Release reference held to the underlying ifmultiaddr */ + IFMA_REMREF(ifma); + + if (mli != NULL) + MLI_REMREF(mli); +} + +static void +in6m_trace(struct in6_multi *in6m, int refhold) +{ + struct in6_multi_dbg *in6m_dbg = (struct in6_multi_dbg *)in6m; + ctrace_t *tr; + u_int32_t idx; + u_int16_t *cnt; + + if (!(in6m->in6m_debug & IFD_DEBUG)) { + panic("%s: in6m %p has no debug structure", __func__, in6m); + /* NOTREACHED */ + } + if (refhold) { + cnt = &in6m_dbg->in6m_refhold_cnt; + tr = in6m_dbg->in6m_refhold; + } else { + cnt = &in6m_dbg->in6m_refrele_cnt; + tr = in6m_dbg->in6m_refrele; + } + + idx = atomic_add_16_ov(cnt, 1) % IN6M_TRACE_HIST_SIZE; + ctrace_record(&tr[idx]); +} + +static struct in6_multi_mship * +in6_multi_mship_alloc(int how) +{ + struct in6_multi_mship *imm; + + imm = (how == M_WAITOK) ? 
zalloc(imm_zone) : zalloc_noblock(imm_zone); + if (imm != NULL) + bzero(imm, imm_size); + + return (imm); +} + +static void +in6_multi_mship_free(struct in6_multi_mship *imm) +{ + if (imm->i6mm_maddr != NULL) { + panic("%s: i6mm_maddr not NULL for imm=%p", __func__, imm); + /* NOTREACHED */ + } + zfree(imm_zone, imm); +} + +void +in6_multihead_lock_exclusive(void) +{ + lck_rw_lock_exclusive(&in6_multihead_lock); +} + +void +in6_multihead_lock_shared(void) +{ + lck_rw_lock_shared(&in6_multihead_lock); +} + +void +in6_multihead_lock_assert(int what) +{ + lck_rw_assert(&in6_multihead_lock, what); +} + +void +in6_multihead_lock_done(void) +{ + lck_rw_done(&in6_multihead_lock); +} + +static struct ip6_msource * +ip6ms_alloc(int how) +{ + struct ip6_msource *i6ms; + + i6ms = (how == M_WAITOK) ? zalloc(ip6ms_zone) : + zalloc_noblock(ip6ms_zone); + if (i6ms != NULL) + bzero(i6ms, ip6ms_size); + + return (i6ms); +} + +static void +ip6ms_free(struct ip6_msource *i6ms) +{ + zfree(ip6ms_zone, i6ms); +} + +static struct in6_msource * +in6ms_alloc(int how) +{ + struct in6_msource *in6ms; + + in6ms = (how == M_WAITOK) ? 
zalloc(in6ms_zone) : + zalloc_noblock(in6ms_zone); + if (in6ms != NULL) + bzero(in6ms, in6ms_size); + + return (in6ms); +} + +static void +in6ms_free(struct in6_msource *in6ms) +{ + zfree(in6ms_zone, in6ms); +} + +#ifdef MLD_DEBUG + +static const char *in6m_modestrs[] = { "un\n", "in", "ex" }; + +static const char * +in6m_mode_str(const int mode) +{ + if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE) + return (in6m_modestrs[mode]); + return ("??"); +} + +static const char *in6m_statestrs[] = { + "not-member\n", + "silent\n", + "idle\n", + "lazy\n", + "sleeping\n", + "awakening\n", + "query-pending\n", + "sg-query-pending\n", + "leaving" +}; + +static const char * +in6m_state_str(const int state) +{ + if (state >= MLD_NOT_MEMBER && state <= MLD_LEAVING_MEMBER) + return (in6m_statestrs[state]); + return ("??"); +} + +/* + * Dump an in6_multi structure to the console. + */ +void +in6m_print(const struct in6_multi *inm) +{ + int t; + + IN6M_LOCK_ASSERT_HELD(IN6M_CAST_TO_NONCONST(inm)); + + if (mld_debug == 0) + return; + + printf("%s: --- begin in6m %p ---\n", __func__, inm); + printf("addr %s ifp %p(%s%d) ifma %p\n", + ip6_sprintf(&inm->in6m_addr), + inm->in6m_ifp, + inm->in6m_ifp->if_name, + inm->in6m_ifp->if_unit, + inm->in6m_ifma); + printf("timer %u state %s refcount %u scq.len %u\n", + inm->in6m_timer, + in6m_state_str(inm->in6m_state), + inm->in6m_refcount, + inm->in6m_scq.ifq_len); + printf("mli %p nsrc %lu sctimer %u scrv %u\n", + inm->in6m_mli, + inm->in6m_nsrc, + inm->in6m_sctimer, + inm->in6m_scrv); + for (t = 0; t < 2; t++) { + printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t, + in6m_mode_str(inm->in6m_st[t].iss_fmode), + inm->in6m_st[t].iss_asm, + inm->in6m_st[t].iss_ex, + inm->in6m_st[t].iss_in, + inm->in6m_st[t].iss_rec); + } + printf("%s: --- end in6m %p ---\n", __func__, inm); +} + +#else + +void +in6m_print(__unused const struct in6_multi *inm) +{ + +} + +#endif diff --git a/bsd/netinet6/in6_pcb.c b/bsd/netinet6/in6_pcb.c index 
20f39a34d..2ea4d7a5d 100644 --- a/bsd/netinet6/in6_pcb.c +++ b/bsd/netinet6/in6_pcb.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2008 Apple Inc. All rights reserved. + * Copyright (c) 2003-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -102,6 +102,8 @@ #include #include #include +#include +#include #include #include @@ -160,7 +162,7 @@ in6_pcblookup_local_and_cleanup( if (inp && inp->inp_wantcnt == WNT_STOPUSING) { struct socket *so = inp->inp_socket; - lck_mtx_lock(inp->inpcb_mtx); + lck_mtx_lock(&inp->inpcb_mtx); if (so->so_usecount == 0) { if (inp->inp_state != INPCB_STATE_DEAD) @@ -169,23 +171,23 @@ in6_pcblookup_local_and_cleanup( inp = NULL; } else { - lck_mtx_unlock(inp->inpcb_mtx); + lck_mtx_unlock(&inp->inpcb_mtx); } } return inp; } + int -in6_pcbbind( - struct inpcb *inp, - struct sockaddr *nam, - struct proc *p) +in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) { struct socket *so = inp->inp_socket; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)NULL; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; u_short lport = 0; int wild = 0, reuseport = (so->so_options & SO_REUSEPORT); + int error; + kauth_cred_t cred; if (!in6_ifaddrs) /* XXX broken! 
*/ return (EADDRNOTAVAIL); @@ -196,6 +198,8 @@ in6_pcbbind( socket_unlock(so, 0); /* keep reference */ lck_rw_lock_exclusive(pcbinfo->mtx); if (nam) { + unsigned int outif = 0; + sin6 = (struct sockaddr_in6 *)nam; if (nam->sa_len != sizeof(*sin6)) { lck_rw_done(pcbinfo->mtx); @@ -212,7 +216,8 @@ in6_pcbbind( } /* KAME hack: embed scopeid */ - if (in6_embedscope(&sin6->sin6_addr, sin6, inp, NULL) != 0) { + if (in6_embedscope(&sin6->sin6_addr, sin6, inp, NULL, + NULL) != 0) { lck_rw_done(pcbinfo->mtx); socket_lock(so, 0); return EINVAL; @@ -232,10 +237,10 @@ in6_pcbbind( if (so->so_options & SO_REUSEADDR) reuseport = SO_REUSEADDR|SO_REUSEPORT; } else if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { - struct ifaddr *ia = NULL; + struct ifaddr *ifa; sin6->sin6_port = 0; /* yech... */ - if ((ia = ifa_ifwithaddr((struct sockaddr *)sin6)) == 0) { + if ((ifa = ifa_ifwithaddr((struct sockaddr *)sin6)) == 0) { lck_rw_done(pcbinfo->mtx); socket_lock(so, 0); return(EADDRNOTAVAIL); @@ -247,26 +252,34 @@ in6_pcbbind( * We should allow to bind to a deprecated address, since * the application dare to use it. 
*/ - if (ia && - ((struct in6_ifaddr *)ia)->ia6_flags & - (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY|IN6_IFF_DETACHED)) { - ifafree(ia); - lck_rw_done(pcbinfo->mtx); - socket_lock(so, 0); - return(EADDRNOTAVAIL); + if (ifa != NULL) { + IFA_LOCK_SPIN(ifa); + if (((struct in6_ifaddr *)ifa)->ia6_flags & + (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY|IN6_IFF_DETACHED)) { + IFA_UNLOCK(ifa); + IFA_REMREF(ifa); + lck_rw_done(pcbinfo->mtx); + socket_lock(so, 0); + return(EADDRNOTAVAIL); + } + outif = ifa->ifa_ifp->if_index; + IFA_UNLOCK(ifa); + IFA_REMREF(ifa); } - ifafree(ia); - ia = NULL; } if (lport) { struct inpcb *t; /* GROSS */ - if (ntohs(lport) < IPV6PORT_RESERVED && - ((so->so_state & SS_PRIV) == 0)) { - lck_rw_done(pcbinfo->mtx); - socket_lock(so, 0); - return(EACCES); + if (ntohs(lport) < IPV6PORT_RESERVED) { + cred = kauth_cred_proc_ref(p); + error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0); + kauth_cred_unref(&cred); + if (error != 0) { + lck_rw_done(pcbinfo->mtx); + socket_lock(so, 0); + return(EACCES); + } } if (so->so_uid && @@ -335,6 +348,7 @@ in6_pcbbind( } } inp->in6p_laddr = sin6->sin6_addr; + inp->in6p_last_outif = outif; } socket_lock(so, 0); if (lport == 0) { @@ -349,10 +363,11 @@ in6_pcbbind( if (in_pcbinshash(inp, 1) != 0) { inp->in6p_laddr = in6addr_any; inp->inp_lport = 0; + inp->in6p_last_outif = 0; lck_rw_done(pcbinfo->mtx); return (EAGAIN); } - } + } lck_rw_done(pcbinfo->mtx); sflt_notify(so, sock_evt_bound, NULL); return(0); @@ -371,17 +386,14 @@ in6_pcbbind( */ int -in6_pcbladdr( - struct inpcb *inp, - struct sockaddr *nam, - struct in6_addr *plocal_addr6) +in6_pcbladdr(struct inpcb *inp, struct sockaddr *nam, + struct in6_addr *plocal_addr6, unsigned int *poutif) { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; struct in6_addr *addr6 = NULL; struct in6_addr src_storage; - - struct ifnet *ifp = NULL; int error = 0; + unsigned int ifscope; if (nam->sa_len != sizeof (*sin6)) return (EINVAL); @@ -391,7 +403,7 @@ in6_pcbladdr( return 
(EADDRNOTAVAIL); /* KAME hack: embed scopeid */ - if (in6_embedscope(&sin6->sin6_addr, sin6, inp, &ifp) != 0) + if (in6_embedscope(&sin6->sin6_addr, sin6, inp, NULL, NULL) != 0) return EINVAL; if (in6_ifaddrs) { @@ -402,33 +414,37 @@ in6_pcbladdr( if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) sin6->sin6_addr = in6addr_loopback; } - { - /* - * XXX: in6_selectsrc might replace the bound local address - * with the address specified by setsockopt(IPV6_PKTINFO). - * Is it the intended behavior? - */ - addr6 = in6_selectsrc(sin6, inp->in6p_outputopts, - inp->in6p_moptions, - &inp->in6p_route, - &inp->in6p_laddr, &src_storage, &error); - if (addr6 == 0) { - if (error == 0) - error = EADDRNOTAVAIL; - return(error); - } - *plocal_addr6 = *addr6; - /* - * Don't do pcblookup call here; return interface in - * plocal_addr6 - * and exit to caller, that will do the lookup. - */ + + ifscope = (inp->inp_flags & INP_BOUND_IF) ? + inp->inp_boundif : IFSCOPE_NONE; + + /* + * XXX: in6_selectsrc might replace the bound local address + * with the address specified by setsockopt(IPV6_PKTINFO). + * Is it the intended behavior? + */ + addr6 = in6_selectsrc(sin6, inp->in6p_outputopts, inp, + &inp->in6p_route, NULL, &src_storage, ifscope, &error); + if (addr6 == 0) { + if (error == 0) + error = EADDRNOTAVAIL; + return(error); } - /* XXX: what is the point in doing this? */ - if (inp->in6p_route.ro_rt) - ifp = inp->in6p_route.ro_rt->rt_ifp; + if (poutif != NULL) { + struct rtentry *rt; + if ((rt = inp->in6p_route.ro_rt) != NULL) + *poutif = rt->rt_ifp->if_index; + else + *poutif = 0; + } + *plocal_addr6 = *addr6; + /* + * Don't do pcblookup call here; return interface in + * plocal_addr6 + * and exit to caller, that will do the lookup. + */ return(0); } @@ -440,21 +456,22 @@ in6_pcbladdr( * then pick one. 
*/ int -in6_pcbconnect(inp, nam, p) - struct inpcb *inp; - struct sockaddr *nam; - struct proc *p; +in6_pcbconnect( + struct inpcb *inp, + struct sockaddr *nam, + struct proc *p) { struct in6_addr addr6; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; struct inpcb *pcb; int error; + unsigned int outif = 0; /* * Call inner routine, to assign local interface address. * in6_pcbladdr() may automatically fill in sin6_scope_id. */ - if ((error = in6_pcbladdr(inp, nam, &addr6)) != 0) + if ((error = in6_pcbladdr(inp, nam, &addr6, &outif)) != 0) return(error); socket_unlock(inp->inp_socket, 0); pcb = in6_pcblookup_hash(inp->inp_pcbinfo, &sin6->sin6_addr, @@ -474,6 +491,7 @@ in6_pcbconnect(inp, nam, p) return (error); } inp->in6p_laddr = addr6; + inp->in6p_last_outif = outif; } if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) { /*lock inversion issue, mostly with udp multicast packets */ @@ -495,8 +513,8 @@ in6_pcbconnect(inp, nam, p) } void -in6_pcbdisconnect(inp) - struct inpcb *inp; +in6_pcbdisconnect( + struct inpcb *inp) { if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) { /*lock inversion issue, mostly with udp multicast packets */ @@ -515,8 +533,8 @@ in6_pcbdisconnect(inp) } void -in6_pcbdetach(inp) - struct inpcb *inp; +in6_pcbdetach( + struct inpcb *inp) { struct socket *so = inp->inp_socket; struct inpcbinfo *ipi = inp->inp_pcbinfo; @@ -533,13 +551,15 @@ in6_pcbdetach(inp) inp->inp_state = INPCB_STATE_DEAD; if ((so->so_flags & SOF_PCBCLEARING) == 0) { + struct ip_moptions *imo; + struct ip6_moptions *im6o; + inp->inp_vflag = 0; so->so_flags |= SOF_PCBCLEARING; inp->inp_gencnt = ++ipi->ipi_gencnt; if (inp->in6p_options) m_freem(inp->in6p_options); ip6_freepcbopts(inp->in6p_outputopts); - ip6_freemoptions(inp->in6p_moptions); if (inp->in6p_route.ro_rt) { rtfree(inp->in6p_route.ro_rt); inp->in6p_route.ro_rt = NULL; @@ -547,16 +567,23 @@ in6_pcbdetach(inp) /* Check and free IPv4 related resources in case of mapped addr */ if (inp->inp_options) 
(void)m_free(inp->inp_options); - ip_freemoptions(inp->inp_moptions); + + im6o = inp->in6p_moptions; + inp->in6p_moptions = NULL; + if (im6o != NULL) + IM6O_REMREF(im6o); + + imo = inp->inp_moptions; inp->inp_moptions = NULL; - + if (imo != NULL) + IMO_REMREF(imo); } } struct sockaddr * -in6_sockaddr(port, addr_p) - in_port_t port; - struct in6_addr *addr_p; +in6_sockaddr( + in_port_t port, + struct in6_addr *addr_p) { struct sockaddr_in6 *sin6; @@ -579,9 +606,9 @@ in6_sockaddr(port, addr_p) } struct sockaddr * -in6_v4mapsin6_sockaddr(port, addr_p) - in_port_t port; - struct in_addr *addr_p; +in6_v4mapsin6_sockaddr( + in_port_t port, + struct in_addr *addr_p) { struct sockaddr_in sin; struct sockaddr_in6 *sin6_p; @@ -612,9 +639,9 @@ in6_v4mapsin6_sockaddr(port, addr_p) * because there actually /is/ a programming error somewhere... XXX) */ int -in6_setsockaddr(so, nam) - struct socket *so; - struct sockaddr **nam; +in6_setsockaddr( + struct socket *so, + struct sockaddr **nam) { struct inpcb *inp; struct in6_addr addr; @@ -634,9 +661,9 @@ in6_setsockaddr(so, nam) } int -in6_setpeeraddr(so, nam) - struct socket *so; - struct sockaddr **nam; +in6_setpeeraddr( + struct socket *so, + struct sockaddr **nam) { struct inpcb *inp; struct in6_addr addr; @@ -701,17 +728,15 @@ in6_mapped_peeraddr(struct socket *so, struct sockaddr **nam) * cmds that are uninteresting (e.g., no error in the map). * Call the protocol specific routine (if any) to report * any errors for each matching socket. - * - * Must be called at splnet. 
*/ void -in6_pcbnotify(pcbinfo, dst, fport_arg, src, lport_arg, cmd, notify) +in6_pcbnotify(pcbinfo, dst, fport_arg, src, lport_arg, cmd, cmdarg, notify) struct inpcbinfo *pcbinfo; struct sockaddr *dst; const struct sockaddr *src; u_int fport_arg, lport_arg; int cmd; -// struct inpcb *(*notify)(struct inpcb *, int); + void *cmdarg; void (*notify)(struct inpcb *, int); { struct inpcb *inp, *ninp; @@ -758,6 +783,22 @@ in6_pcbnotify(pcbinfo, dst, fport_arg, src, lport_arg, cmd, notify) if ((inp->inp_vflag & INP_IPV6) == 0) continue; + /* + * If the error designates a new path MTU for a destination + * and the application (associated with this socket) wanted to + * know the value, notify. Note that we notify for all + * disconnected sockets if the corresponding application + * wanted. This is because some UDP applications keep sending + * sockets disconnected. + * XXX: should we avoid to notify the value to TCP sockets? + */ + if (cmd == PRC_MSGSIZE && (inp->inp_flags & IN6P_MTU) != 0 && + (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) || + IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &sa6_dst->sin6_addr))) { + ip6_notify_pmtu(inp, (struct sockaddr_in6 *)dst, + (u_int32_t *)cmdarg); + } + /* * Detect if we should notify the error. If no source and * destination ports are specifed, but non-zero flowinfo and @@ -799,11 +840,11 @@ in6_pcbnotify(pcbinfo, dst, fport_arg, src, lport_arg, cmd, notify) * Lookup a PCB based on the local address and port. 
*/ struct inpcb * -in6_pcblookup_local(pcbinfo, laddr, lport_arg, wild_okay) - struct inpcbinfo *pcbinfo; - struct in6_addr *laddr; - u_int lport_arg; - int wild_okay; +in6_pcblookup_local( + struct inpcbinfo *pcbinfo, + struct in6_addr *laddr, + u_int lport_arg, + int wild_okay) { struct inpcb *inp; int matchwild = 3, wildcard; @@ -883,47 +924,6 @@ in6_pcblookup_local(pcbinfo, laddr, lport_arg, wild_okay) return (match); } } -#ifndef APPLE -/* this is not used in Darwin */ -void -in6_pcbpurgeif0( - struct in6pcb *head, - struct ifnet *ifp) -{ - struct in6pcb *in6p; - struct ip6_moptions *im6o; - struct in6_multi_mship *imm, *nimm; - - for (in6p = head; in6p != NULL; in6p = LIST_NEXT(in6p, inp_list)) { - im6o = in6p->in6p_moptions; - if ((in6p->inp_vflag & INP_IPV6) && - im6o) { - /* - * Unselect the outgoing interface if it is being - * detached. - */ - if (im6o->im6o_multicast_ifp == ifp) - im6o->im6o_multicast_ifp = NULL; - - /* - * Drop multicast group membership if we joined - * through the interface being detached. - * XXX controversial - is it really legal for kernel - * to force this? - */ - for (imm = im6o->im6o_memberships.lh_first; - imm != NULL; imm = nimm) { - nimm = imm->i6mm_chain.le_next; - if (imm->i6mm_maddr->in6m_ifp == ifp) { - LIST_REMOVE(imm, i6mm_chain); - in6_delmulti(imm->i6mm_maddr); - FREE(imm, M_IPMADDR); - } - } - } - } -} -#endif /* * Check for alternatives when higher level complains @@ -932,8 +932,8 @@ in6_pcbpurgeif0( * (by a redirect), time to try a default gateway again. */ void -in6_losing(in6p) - struct inpcb *in6p; +in6_losing( + struct inpcb *in6p) { struct rtentry *rt; struct rt_addrinfo info; @@ -987,6 +987,104 @@ in6_rtchange( } } +/* + * Check if PCB exists hash list. 
Also returns uid and gid of socket + */ +int +in6_pcblookup_hash_exists( + struct inpcbinfo *pcbinfo, + struct in6_addr *faddr, + u_int fport_arg, + struct in6_addr *laddr, + u_int lport_arg, + int wildcard, + uid_t *uid, + gid_t *gid, + __unused struct ifnet *ifp) +{ + struct inpcbhead *head; + struct inpcb *inp; + u_short fport = fport_arg, lport = lport_arg; + int faith; + int found; + +#if defined(NFAITH) && NFAITH > 0 + faith = faithprefix(laddr); +#else + faith = 0; +#endif + + *uid = UID_MAX; + *gid = GID_MAX; + + lck_rw_lock_shared(pcbinfo->mtx); + + /* + * First look for an exact match. + */ + head = &pcbinfo->hashbase[INP_PCBHASH(faddr->s6_addr32[3] /* XXX */, + lport, fport, + pcbinfo->hashmask)]; + LIST_FOREACH(inp, head, inp_hash) { + if ((inp->inp_vflag & INP_IPV6) == 0) + continue; + if (IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, faddr) && + IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) && + inp->inp_fport == fport && + inp->inp_lport == lport) { + if ((found = (inp->inp_socket != NULL))) { + /* + * Found. 
Check if pcb is still valid + */ + *uid = inp->inp_socket->so_uid; + *gid = inp->inp_socket->so_gid; + } + lck_rw_done(pcbinfo->mtx); + return (found); + } + } + if (wildcard) { + struct inpcb *local_wild = NULL; + + head = &pcbinfo->hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, + pcbinfo->hashmask)]; + LIST_FOREACH(inp, head, inp_hash) { + if ((inp->inp_vflag & INP_IPV6) == 0) + continue; + if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && + inp->inp_lport == lport) { + if (faith && (inp->inp_flags & INP_FAITH) == 0) + continue; + if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, + laddr)) { + if ((found = (inp->inp_socket != NULL))) { + *uid = inp->inp_socket->so_uid; + *gid = inp->inp_socket->so_gid; + } + lck_rw_done(pcbinfo->mtx); + return (found); + } + else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) + local_wild = inp; + } + } + if (local_wild) { + if ((found = (local_wild->inp_socket != NULL))) { + *uid = local_wild->inp_socket->so_uid; + *gid = local_wild->inp_socket->so_gid; + } + lck_rw_done(pcbinfo->mtx); + return (found); + } + } + + /* + * Not found. + */ + lck_rw_done(pcbinfo->mtx); + return (0); +} + /* * Lookup PCB in hash list. */ diff --git a/bsd/netinet6/in6_pcb.h b/bsd/netinet6/in6_pcb.h index 58476cc5e..d83836bbc 100644 --- a/bsd/netinet6/in6_pcb.h +++ b/bsd/netinet6/in6_pcb.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -104,13 +104,16 @@ extern int in6_pcbbind(struct inpcb *, struct sockaddr *, struct proc *); extern int in6_pcbconnect(struct inpcb *, struct sockaddr *, struct proc *); extern void in6_pcbdetach(struct inpcb *); extern void in6_pcbdisconnect(struct inpcb *); -extern int in6_pcbladdr(struct inpcb *, struct sockaddr *, struct in6_addr *); +extern int in6_pcbladdr(struct inpcb *, struct sockaddr *, + struct in6_addr *, unsigned int *); extern struct inpcb *in6_pcblookup_local(struct inpcbinfo *, struct in6_addr *, u_int, int); extern struct inpcb *in6_pcblookup_hash(struct inpcbinfo *, struct in6_addr *, u_int, struct in6_addr *, u_int, int, struct ifnet *); +extern int in6_pcblookup_hash_exists(struct inpcbinfo *, struct in6_addr *, + u_int, struct in6_addr *, u_int, int, uid_t *, gid_t *, struct ifnet *); extern void in6_pcbnotify(struct inpcbinfo *, struct sockaddr *, u_int, - const struct sockaddr *, u_int, int, void (*)(struct inpcb *, int)); + const struct sockaddr *, u_int, int, void *, void (*)(struct inpcb *, int)); extern void in6_rtchange(struct inpcb *, int); extern struct sockaddr *in6_sockaddr(in_port_t port, struct in6_addr *addr_p); extern struct sockaddr *in6_v4mapsin6_sockaddr(in_port_t port, @@ -119,9 +122,6 @@ extern int in6_setpeeraddr(struct socket *so, struct sockaddr **nam); extern int in6_setsockaddr(struct socket *so, struct sockaddr **nam); extern int in6_mapped_sockaddr(struct socket *so, struct sockaddr **nam); extern int in6_mapped_peeraddr(struct socket *so, struct sockaddr **nam); -extern struct in6_addr *in6_selectsrc(struct sockaddr_in6 *, - struct ip6_pktopts *, struct ip6_moptions *, struct route_in6 *, - struct in6_addr *, struct in6_addr *, int *); extern int in6_selecthlim(struct in6pcb *, struct ifnet *); extern int in6_pcbsetport(struct in6_addr *, struct inpcb *, struct proc *, int); diff --git a/bsd/netinet6/in6_prefix.c b/bsd/netinet6/in6_prefix.c index 891917965..da85f486d 
100644 --- a/bsd/netinet6/in6_prefix.c +++ b/bsd/netinet6/in6_prefix.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -227,18 +227,25 @@ search_matched_prefix(struct ifnet *ifp, struct in6_prefixreq *ipr) ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; + } if (ipr->ipr_plen <= - in6_matchlen(&ipr->ipr_prefix.sin6_addr, IFA_IN6(ifa))) + in6_matchlen(&ipr->ipr_prefix.sin6_addr, IFA_IN6(ifa))) { + /* keep it locked */ break; + } + IFA_UNLOCK(ifa); } if (ifa == NULL) { ifnet_lock_done(ifp); return NULL; } - + IFA_LOCK_ASSERT_HELD(ifa); rpp = ifpr2rp(((struct in6_ifaddr *)ifa)->ia6_ifpr); + IFA_UNLOCK(ifa); if (rpp != 0) { ifnet_lock_done(ifp); return rpp; @@ -302,24 +309,31 @@ mark_matched_prefixes(u_int32_t cmd, struct ifnet *ifp, struct in6_rrenumreq *ir { struct rr_prefix *rpp; - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; + } matchlen = in6_matchlen(&irr->irr_matchprefix.sin6_addr, IFA_IN6(ifa)); if (irr->irr_m_minlen > matchlen || - irr->irr_m_maxlen < matchlen || irr->irr_m_len > matchlen) - continue; + irr->irr_m_maxlen < matchlen || irr->irr_m_len > matchlen) { + IFA_UNLOCK(ifa); + continue; + } rpp = ifpr2rp(((struct in6_ifaddr *)ifa)->ia6_ifpr); if (rpp != 0) { matched = 1; rpp->rp_statef_addmark = 1; if (cmd == SIOCCIFPREFIX_IN6) rpp->rp_statef_delmark = 1; - } else + } else { log(LOG_WARNING, "in6_prefix.c: mark_matched_prefixes:" "no back pointer to ifprefix for %s. 
" "ND autoconfigured addr?\n", ip6_sprintf(IFA_IN6(ifa))); + } + IFA_UNLOCK(ifa); } ifnet_lock_done(ifp); return matched; @@ -447,15 +461,17 @@ assign_ra_entry(struct rr_prefix *rpp, int iilen, struct in6_ifaddr *ia) return error; /* copy interface id part */ + IFA_LOCK(&ia->ia_ifa); bit_copy((caddr_t)&rap->ra_ifid, sizeof(rap->ra_ifid) << 3, - (caddr_t)IA6_IN6(ia), - sizeof(*IA6_IN6(ia)) << 3, rpp->rp_plen, iilen); + (caddr_t)IA6_IN6(ia), sizeof(*IA6_IN6(ia)) << 3, + rpp->rp_plen, iilen); /* link to ia, and put into list */ rap->ra_addr = ia; - ifaref(&rap->ra_addr->ia_ifa); + IFA_ADDREF_LOCKED(&rap->ra_addr->ia_ifa); #if 0 /* Can't do this now, because rpp may be on th stack. should fix it? */ ia->ia6_ifpr = rp2ifpr(rpp); #endif + IFA_UNLOCK(&ia->ia_ifa); lck_mtx_lock(prefix6_mutex); LIST_INSERT_HEAD(&rpp->rp_addrhead, rap, ra_entry); lck_mtx_unlock(prefix6_mutex); @@ -478,9 +494,11 @@ in6_prefix_add_llifid(__unused int iilen, struct in6_ifaddr *ia) if ((error = create_ra_entry(&rap)) != 0) return(error); /* copy interface id part */ + IFA_LOCK(&ia->ia_ifa); bit_copy((caddr_t)&rap->ra_ifid, sizeof(rap->ra_ifid) << 3, (caddr_t)IA6_IN6(ia), sizeof(*IA6_IN6(ia)) << 3, 64, (sizeof(rap->ra_ifid) << 3) - 64); + IFA_UNLOCK(&ia->ia_ifa); /* XXX: init dummy so */ bzero(&so, sizeof(so)); /* insert into list */ @@ -500,6 +518,7 @@ in6_prefix_add_llifid(__unused int iilen, struct in6_ifaddr *ia) return 0; } +#if 0 /* * add an address to an interface. if the interface id portion is new, * we will add new interface address (prefix database + new interface id). 
@@ -507,17 +526,24 @@ in6_prefix_add_llifid(__unused int iilen, struct in6_ifaddr *ia) int in6_prefix_add_ifid(int iilen, struct in6_ifaddr *ia) { - int plen = (sizeof(*IA6_IN6(ia)) << 3) - iilen; + struct in6_addr addr; + int plen; struct ifprefix *ifpr; struct rp_addr *rap; int error = 0; - if (IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) + IFA_LOCK_SPIN(&ia->ia_ifa); + addr = *IA6_IN6(ia); + plen = (sizeof(*IA6_IN6(ia)) << 3) - iilen; + IFA_UNLOCK(&ia->ia_ifa); + + if (IN6_IS_ADDR_LINKLOCAL(&addr)) return(in6_prefix_add_llifid(iilen, ia)); - ifpr = in6_prefixwithifp(ia->ia_ifp, plen, IA6_IN6(ia)); + ifpr = in6_prefixwithifp(ia->ia_ifp, plen, &addr); if (ifpr == NULL) { struct rr_prefix rp; struct socket so; + struct ifnet *ifp; int pplen = (plen == 128) ? 64 : plen; /* XXX hardcoded 64 is bad */ /* allocate a prefix for ia, with default properties */ @@ -525,14 +551,12 @@ in6_prefix_add_ifid(int iilen, struct in6_ifaddr *ia) /* init rp */ bzero(&rp, sizeof(rp)); rp.rp_type = IN6_PREFIX_RR; - rp.rp_ifp = ia->ia_ifp; + rp.rp_ifp = ifp = ia->ia_ifp; rp.rp_plen = pplen; rp.rp_prefix.sin6_len = sizeof(rp.rp_prefix); rp.rp_prefix.sin6_family = AF_INET6; bit_copy((char *)RP_IN6(&rp), sizeof(*RP_IN6(&rp)) << 3, - (char *)&ia->ia_addr.sin6_addr, - sizeof(ia->ia_addr.sin6_addr) << 3, - 0, pplen); + (char *)&addr, sizeof (addr) << 3, 0, pplen); rp.rp_vltime = rp.rp_pltime = RR_INFINITE_LIFETIME; rp.rp_raf_onlink = 1; rp.rp_raf_auto = 1; @@ -541,7 +565,9 @@ in6_prefix_add_ifid(int iilen, struct in6_ifaddr *ia) rp.rp_origin = PR_ORIG_RR; /* can be renumbered */ /* create ra_entry */ + ifnet_lock_shared(ifp); error = link_stray_ia6s(&rp); + ifnet_lock_done(ifp); if (error != 0) { free_rp_entries(&rp); return error; @@ -559,53 +585,69 @@ in6_prefix_add_ifid(int iilen, struct in6_ifaddr *ia) return error; /* search again */ - ifpr = in6_prefixwithifp(ia->ia_ifp, pplen, IA6_IN6(ia)); + ifpr = in6_prefixwithifp(ia->ia_ifp, pplen, &addr); if (ifpr == NULL) return 0; } - rap = 
search_ifidwithprefix(ifpr2rp(ifpr), IA6_IN6(ia)); + rap = search_ifidwithprefix(ifpr2rp(ifpr), &addr); if (rap != NULL) { if (rap->ra_addr == NULL) { rap->ra_addr = ia; - ifaref(&rap->ra_addr->ia_ifa); + IFA_ADDREF(&rap->ra_addr->ia_ifa); } else if (rap->ra_addr != ia) { /* There may be some inconsistencies between addrs. */ log(LOG_ERR, "ip6_prefix.c: addr %s/%d matched prefix" " already has another ia %p(%s) on its ifid list\n", - ip6_sprintf(IA6_IN6(ia)), plen, - rap->ra_addr, + ip6_sprintf(&addr), plen, rap->ra_addr, ip6_sprintf(IA6_IN6(rap->ra_addr))); return EADDRINUSE /* XXX */; } + IFA_LOCK_SPIN(&ia->ia_ifa); ia->ia6_ifpr = ifpr; + IFA_UNLOCK(&ia->ia_ifa); return 0; } error = assign_ra_entry(ifpr2rp(ifpr), iilen, ia); - if (error == 0) + if (error == 0) { + IFA_LOCK_SPIN(&ia->ia_ifa); ia->ia6_ifpr = ifpr; + IFA_UNLOCK(&ia->ia_ifa); + } return (error); } +#endif +#if 0 void in6_prefix_remove_ifid(__unused int iilen, struct in6_ifaddr *ia) { struct rp_addr *rap; + struct in6_addr addr; + struct ifprefix *ifpr; - if (ia->ia6_ifpr == NULL) + IFA_LOCK_SPIN(&ia->ia_ifa); + if ((ifpr = ia->ia6_ifpr) == NULL) { + IFA_UNLOCK(&ia->ia_ifa); return; - rap = search_ifidwithprefix(ifpr2rp(ia->ia6_ifpr), IA6_IN6(ia)); + } + addr = *IA6_IN6(ia); + IFA_UNLOCK(&ia->ia_ifa); + rap = search_ifidwithprefix(ifpr2rp(ifpr), &addr); if (rap != NULL) { lck_mtx_lock(prefix6_mutex); LIST_REMOVE(rap, ra_entry); lck_mtx_unlock(prefix6_mutex); - if (rap->ra_addr) - ifafree(&rap->ra_addr->ia_ifa); + if (rap->ra_addr) { + IFA_REMREF(&rap->ra_addr->ia_ifa); + rap->ra_addr = NULL; + } FREE(rap, M_RR_ADDR); } - if (LIST_EMPTY(&ifpr2rp(ia->ia6_ifpr)->rp_addrhead)) - rp_remove(ifpr2rp(ia->ia6_ifpr)); + if (LIST_EMPTY(&ifpr2rp(ifpr)->rp_addrhead)) + rp_remove(ifpr2rp(ifpr)); } +#endif void in6_purgeprefix( @@ -665,20 +707,29 @@ add_each_addr(struct socket *so, struct rr_prefix *rpp, struct rp_addr *rap) ia6 = in6ifa_ifpwithaddr(rpp->rp_ifp, &ifra.ifra_addr.sin6_addr); if (ia6 != NULL) { + 
struct in6_ifaddr *ria6 = NULL; + + IFA_LOCK(&ia6->ia_ifa); if (ia6->ia6_ifpr == NULL) { /* link this addr and the prefix each other */ - if (rap->ra_addr) - ifafree(&rap->ra_addr->ia_ifa); + if (rap->ra_addr != NULL) + ria6 = rap->ra_addr; /* Reference held in in6ifa_ifpwithaddr() */ rap->ra_addr = ia6; ia6->ia6_ifpr = rp2ifpr(rpp); + IFA_UNLOCK(&ia6->ia_ifa); + if (ria6 != NULL) + IFA_REMREF(&ria6->ia_ifa); return; } if (ia6->ia6_ifpr == rp2ifpr(rpp)) { - if (rap->ra_addr) - ifafree(&rap->ra_addr->ia_ifa); + if (rap->ra_addr != NULL) + ria6 = rap->ra_addr; /* Reference held in in6ifa_ifpwithaddr() */ rap->ra_addr = ia6; + IFA_UNLOCK(&ia6->ia_ifa); + if (ria6 != NULL) + IFA_REMREF(&ria6->ia_ifa); return; } /* @@ -697,7 +748,8 @@ add_each_addr(struct socket *so, struct rr_prefix *rpp, struct rp_addr *rap) ip6_sprintf(&ifra.ifra_addr.sin6_addr), rpp->rp_plen, ip6_sprintf(IA6_IN6(ia6)), in6_mask2len(&ia6->ia_prefixmask.sin6_addr, NULL)); - ifafree(&ia6->ia_ifa); + IFA_UNLOCK(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); return; } /* propagate ANYCAST flag if it is set for ancestor addr */ @@ -803,8 +855,10 @@ rrpr_update(struct socket *so, struct rr_prefix *new) LIST_REMOVE(rap, ra_entry); if (search_ifidwithprefix(rpp, &rap->ra_ifid) != NULL) { - if (rap->ra_addr) - ifafree(&rap->ra_addr->ia_ifa); + if (rap->ra_addr) { + IFA_REMREF(&rap->ra_addr->ia_ifa); + rap->ra_addr = NULL; + } FREE(rap, M_RR_ADDR); continue; } @@ -870,11 +924,14 @@ rrpr_update(struct socket *so, struct rr_prefix *new) * init the prefix pointer. 
*/ lck_mtx_lock(prefix6_mutex); - LIST_FOREACH(rap, &rpp->rp_addrhead, ra_entry) - { - if (rap->ra_addr != NULL) { - if (rap->ra_addr->ia6_ifpr == NULL) - rap->ra_addr->ia6_ifpr = rp2ifpr(rpp); + LIST_FOREACH(rap, &rpp->rp_addrhead, ra_entry) { + struct in6_ifaddr *ia6; + + if ((ia6 = rap->ra_addr) != NULL) { + IFA_LOCK(&ia6->ia_ifa); + if (ia6->ia6_ifpr == NULL) + ia6->ia6_ifpr = rp2ifpr(rpp); + IFA_UNLOCK(&ia6->ia_ifa); continue; } add_each_addr(so, rpp, rap); @@ -967,13 +1024,20 @@ init_newprefix(struct in6_rrenumreq *irr, struct ifprefix *ifpr, { struct rp_addr *rap; int error = 0; + struct in6_ifaddr *ia6; if ((error = create_ra_entry(&rap)) != 0) return error; rap->ra_ifid = orap->ra_ifid; - rap->ra_flags.anycast = (orap->ra_addr != NULL && - (orap->ra_addr->ia6_flags & - IN6_IFF_ANYCAST) != 0) ? 1 : 0; + ia6 = orap->ra_addr->ia_ifa; + if (ia6 != NULL) { + IFA_LOCK(&ia6->ia_ifa); + rap->ra_flags.anycast = + ((ia6->ia6_flags & IN6_IFF_ANYCAST) != 0) ? 1 : 0; + IFA_UNLOCK(&ia6->ia_ifa); + } else { + rap->ra_flags.anycast = 0; + } LIST_INSERT_HEAD(&rpp->rp_addrhead, rap, ra_entry); } rpp->rp_vltime = irr->irr_vltime; @@ -1005,8 +1069,10 @@ free_rp_entries(struct rr_prefix *rpp) rap = LIST_FIRST(&rpp->rp_addrhead); LIST_REMOVE(rap, ra_entry); - if (rap->ra_addr) - ifafree(&rap->ra_addr->ia_ifa); + if (rap->ra_addr) { + IFA_REMREF(&rap->ra_addr->ia_ifa); + rap->ra_addr = NULL; + } FREE(rap, M_RR_ADDR); } lck_mtx_unlock(prefix6_mutex); @@ -1054,10 +1120,14 @@ unprefer_prefix(struct rr_prefix *rpp) lck_mtx_lock(prefix6_mutex); for (rap = rpp->rp_addrhead.lh_first; rap != NULL; rap = rap->ra_entry.le_next) { - if (rap->ra_addr == NULL) + struct in6_ifaddr *ia6; + + if ((ia6 = rap->ra_addr) == NULL) continue; - rap->ra_addr->ia6_lifetime.ia6t_preferred = timenow.tv_sec; - rap->ra_addr->ia6_lifetime.ia6t_pltime = 0; + IFA_LOCK(&ia6->ia_ifa); + ia6->ia6_lifetime.ia6t_preferred = timenow.tv_sec; + ia6->ia6_lifetime.ia6t_pltime = 0; + IFA_UNLOCK(&ia6->ia_ifa); } 
lck_mtx_unlock(prefix6_mutex); @@ -1074,20 +1144,24 @@ delete_each_prefix(struct rr_prefix *rpp, u_char origin) lck_mtx_lock(prefix6_mutex); while (rpp->rp_addrhead.lh_first != NULL) { struct rp_addr *rap; + struct in6_ifaddr *ia6; rap = LIST_FIRST(&rpp->rp_addrhead); if (rap == NULL) { break; } LIST_REMOVE(rap, ra_entry); - if (rap->ra_addr == NULL) { + if ((ia6 = rap->ra_addr) == NULL) { FREE(rap, M_RR_ADDR); continue; } - rap->ra_addr->ia6_ifpr = NULL; + rap->ra_addr = NULL; + IFA_LOCK(&ia6->ia_ifa); + ia6->ia6_ifpr = NULL; + IFA_UNLOCK(&ia6->ia_ifa); - in6_purgeaddr(&rap->ra_addr->ia_ifa, 0); - ifafree(&rap->ra_addr->ia_ifa); + in6_purgeaddr(&ia6->ia_ifa, 0); + IFA_REMREF(&ia6->ia_ifa); FREE(rap, M_RR_ADDR); } rp_remove(rpp); @@ -1122,6 +1196,8 @@ link_stray_ia6s(struct rr_prefix *rpp) { struct ifaddr *ifa; + ifnet_lock_assert(rpp->rp_ifp, IFNET_LCK_ASSERT_OWNED); + for (ifa = rpp->rp_ifp->if_addrlist.tqh_first; ifa; ifa = ifa->ifa_list.tqe_next) { @@ -1129,11 +1205,15 @@ link_stray_ia6s(struct rr_prefix *rpp) struct rr_prefix *orpp; int error = 0; - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; - if (rpp->rp_plen > in6_matchlen(RP_IN6(rpp), IFA_IN6(ifa))) + } + if (rpp->rp_plen > in6_matchlen(RP_IN6(rpp), IFA_IN6(ifa))) { + IFA_UNLOCK(ifa); continue; - + } orpp = ifpr2rp(((struct in6_ifaddr *)ifa)->ia6_ifpr); if (orpp != NULL) { if (!in6_are_prefix_equal(RP_IN6(orpp), RP_IN6(rpp), @@ -1144,8 +1224,10 @@ link_stray_ia6s(struct rr_prefix *rpp) ip6_sprintf(IFA_IN6(ifa)), orpp->rp_plen, ip6_sprintf(RP_IN6(rpp)), rpp->rp_plen); + IFA_UNLOCK(ifa); continue; } + IFA_UNLOCK(ifa); if ((error = assign_ra_entry(rpp, (sizeof(rap->ra_ifid) << 3) - rpp->rp_plen, @@ -1237,23 +1319,28 @@ in6_prefix_ioctl(struct socket *so, u_long cmd, caddr_t data, rp_tmp.rp_origin = ipr->ipr_origin; /* create rp_addr entries, usually at least for lladdr */ + ifnet_lock_shared(ifp); if ((error = 
link_stray_ia6s(&rp_tmp)) != 0) { + ifnet_lock_done(ifp); free_rp_entries(&rp_tmp); break; } - ifnet_lock_exclusive(ifp); for (ifa = ifp->if_addrlist.tqh_first; ifa; ifa = ifa->ifa_list.tqe_next) { - if (ifa->ifa_addr == NULL) - continue; /* just for safety */ - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; - if (IN6_IS_ADDR_LINKLOCAL(IFA_IN6(ifa)) == 0) + } + if (IN6_IS_ADDR_LINKLOCAL(IFA_IN6(ifa)) == 0) { + IFA_UNLOCK(ifa); continue; - + } if ((error = create_ra_entry(&rap)) != 0) { + IFA_UNLOCK(ifa); + ifnet_lock_done(ifp); free_rp_entries(&rp_tmp); goto bad; } @@ -1264,6 +1351,7 @@ in6_prefix_ioctl(struct socket *so, u_long cmd, caddr_t data, sizeof(*IFA_IN6(ifa)) << 3, rp_tmp.rp_plen, (sizeof(rap->ra_ifid) << 3) - rp_tmp.rp_plen); + IFA_UNLOCK(ifa); /* insert into list */ lck_mtx_lock(prefix6_mutex); LIST_INSERT_HEAD(&rp_tmp.rp_addrhead, rap, ra_entry); @@ -1292,30 +1380,3 @@ in6_prefix_ioctl(struct socket *so, u_long cmd, caddr_t data, } #endif -void -in6_rr_timer(__unused void *ignored_arg) -{ - struct rr_prefix *rpp; - struct timeval timenow; - - getmicrotime(&timenow); - - /* expire */ - lck_mtx_lock(prefix6_mutex); - rpp = LIST_FIRST(&rr_prefix); - while (rpp) { - if (rpp->rp_expire && rpp->rp_expire < timenow.tv_sec) { - struct rr_prefix *next_rpp; - - next_rpp = LIST_NEXT(rpp, rp_entry); - delete_each_prefix(rpp, PR_ORIG_KERNEL); - rpp = next_rpp; - continue; - } - if (rpp->rp_preferred && rpp->rp_preferred < timenow.tv_sec) - unprefer_prefix(rpp); - rpp = LIST_NEXT(rpp, rp_entry); - } - lck_mtx_unlock(prefix6_mutex); - timeout(in6_rr_timer, (caddr_t)0, ip6_rr_prune * hz); -} diff --git a/bsd/netinet6/in6_prefix.h b/bsd/netinet6/in6_prefix.h index f69562ae4..fa3567676 100644 --- a/bsd/netinet6/in6_prefix.h +++ b/bsd/netinet6/in6_prefix.h @@ -85,7 +85,6 @@ LIST_HEAD(rr_prhead, rr_prefix); extern struct rr_prhead rr_prefix; -void in6_rr_timer(void *); int 
delete_each_prefix (struct rr_prefix *rpp, u_char origin); #endif /* KERNEL_PRIVATE */ diff --git a/bsd/netinet6/in6_proto.c b/bsd/netinet6/in6_proto.c index b7dbae799..c0228feeb 100644 --- a/bsd/netinet6/in6_proto.c +++ b/bsd/netinet6/in6_proto.c @@ -129,6 +129,7 @@ #include #include #include +#include #include @@ -217,7 +218,7 @@ struct ip6protosw inet6sw[] = { { SOCK_RAW, &inet6domain, IPPROTO_ICMPV6, PR_ATOMIC|PR_ADDR|PR_LASTHDR, icmp6_input, rip6_pr_output, rip6_ctlinput, rip6_ctloutput, 0, - icmp6_init, icmp6_fasttimo, 0, 0, + icmp6_init, 0, mld_slowtimo, 0, 0, &rip6_usrreqs, 0, rip_unlock, 0, @@ -226,7 +227,7 @@ struct ip6protosw inet6sw[] = { { SOCK_DGRAM, &inet6domain, IPPROTO_ICMPV6, PR_ATOMIC|PR_ADDR|PR_LASTHDR, icmp6_input, rip6_pr_output, rip6_ctlinput, icmp6_dgram_ctloutput, 0, - icmp6_init, icmp6_fasttimo, 0, 0, + icmp6_init, 0, mld_slowtimo, 0, 0, &icmp6_dgram_usrreqs, 0, rip_unlock, 0, @@ -398,7 +399,7 @@ int ip6_accept_rtadv = 0; /* "IPV6FORWARDING ? 0 : 1" is dangerous */ int ip6_maxfragpackets; /* initialized in frag6.c:frag6_init() */ int ip6_maxfrags; int ip6_log_interval = 5; -int ip6_hdrnestlimit = 50; /* appropriate? */ +int ip6_hdrnestlimit = 15; /* How many header options will we process? */ int ip6_dad_count = 1; /* DupAddrDetectionTransmits */ u_int32_t ip6_flow_seq; int ip6_auto_flowlabel = 1; @@ -406,16 +407,19 @@ int ip6_gif_hlim = 0; int ip6_use_deprecated = 1; /* allow deprecated addr (RFC2462 5.5.4) */ int ip6_rr_prune = 5; /* router renumbering prefix * walk list every 5 sec. */ -int ip6_v6only = 0; /* Mapped addresses on by default - Radar 3347718 */ +int ip6_mcast_pmtu = 0; /* enable pMTU discovery for multicast? 
*/ +int ip6_v6only = 0; /* Mapped addresses off by default - Radar 3347718 -- REVISITING FOR 10.7 -- TESTING WITH MAPPED@ OFF */ int ip6_neighborgcthresh = 1024; /* Threshold # of NDP entries for GC */ int ip6_maxifprefixes = 16; /* Max acceptable prefixes via RA per IF */ int ip6_maxifdefrouters = 16; /* Max acceptable def routers via RA */ int ip6_maxdynroutes = 1024; /* Max # of routes created via redirect */ +int ip6_only_allow_rfc4193_prefix = 0; /* Only allow RFC4193 style Unique Local IPv6 Unicast prefixes */ u_int32_t ip6_id = 0UL; int ip6_keepfaith = 0; time_t ip6_log_time = (time_t)0L; +int nd6_onlink_ns_rfc4861 = 0; /* allow 'on-link' nd6 NS (as in RFC 4861) */ /* icmp6 */ /* @@ -450,7 +454,7 @@ int udp6_recvspace = 40 * (1024 + sizeof(struct sockaddr_in6)); /* * sysctl related items. */ -SYSCTL_NODE(_net, PF_INET6, inet6, CTLFLAG_RW, 0, +SYSCTL_NODE(_net, PF_INET6, inet6, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Internet6 Family"); /* net.inet6 */ @@ -475,7 +479,8 @@ sysctl_ip6_temppltime SYSCTL_HANDLER_ARGS return (error); old = ip6_temp_preferred_lifetime; error = SYSCTL_IN(req, arg1, sizeof(int)); - if (ip6_temp_preferred_lifetime < + if (ip6_temp_preferred_lifetime > ND6_MAX_LIFETIME || + ip6_temp_preferred_lifetime < ip6_desync_factor + ip6_temp_regen_advance) { ip6_temp_preferred_lifetime = old; return(EINVAL); @@ -495,7 +500,8 @@ sysctl_ip6_tempvltime SYSCTL_HANDLER_ARGS return (error); old = ip6_temp_valid_lifetime; error = SYSCTL_IN(req, arg1, sizeof(int)); - if (ip6_temp_valid_lifetime < ip6_temp_preferred_lifetime) { + if (ip6_temp_valid_lifetime > ND6_MAX_LIFETIME || + ip6_temp_valid_lifetime < ip6_temp_preferred_lifetime) { ip6_temp_preferred_lifetime = old; return(EINVAL); } @@ -503,90 +509,103 @@ sysctl_ip6_tempvltime SYSCTL_HANDLER_ARGS } SYSCTL_INT(_net_inet6_ip6, IPV6CTL_FORWARDING, - forwarding, CTLFLAG_RW, &ip6_forwarding, 0, ""); + forwarding, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_forwarding, 0, ""); SYSCTL_INT(_net_inet6_ip6, 
IPV6CTL_SENDREDIRECTS, - redirect, CTLFLAG_RW, &ip6_sendredirects, 0, ""); + redirect, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_sendredirects, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DEFHLIM, - hlim, CTLFLAG_RW, &ip6_defhlim, 0, ""); -SYSCTL_STRUCT(_net_inet6_ip6, IPV6CTL_STATS, stats, CTLFLAG_RD, + hlim, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_defhlim, 0, ""); +SYSCTL_STRUCT(_net_inet6_ip6, IPV6CTL_STATS, stats, CTLFLAG_RD | CTLFLAG_LOCKED, &ip6stat, ip6stat, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, - maxfragpackets, CTLFLAG_RW, &ip6_maxfragpackets, 0, ""); + maxfragpackets, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_maxfragpackets, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGS, - maxfrags, CTLFLAG_RW, &ip6_maxfrags, 0, ""); + maxfrags, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_maxfrags, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_ACCEPT_RTADV, - accept_rtadv, CTLFLAG_RW, &ip6_accept_rtadv, 0, ""); + accept_rtadv, CTLFLAG_RW | CTLFLAG_LOCKED, + &ip6_accept_rtadv, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_KEEPFAITH, - keepfaith, CTLFLAG_RW, &ip6_keepfaith, 0, ""); + keepfaith, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_keepfaith, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_LOG_INTERVAL, - log_interval, CTLFLAG_RW, &ip6_log_interval, 0, ""); + log_interval, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_log_interval, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_HDRNESTLIMIT, - hdrnestlimit, CTLFLAG_RW, &ip6_hdrnestlimit, 0, ""); + hdrnestlimit, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_hdrnestlimit, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DAD_COUNT, - dad_count, CTLFLAG_RW, &ip6_dad_count, 0, ""); + dad_count, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_dad_count, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_AUTO_FLOWLABEL, - auto_flowlabel, CTLFLAG_RW, &ip6_auto_flowlabel, 0, ""); + auto_flowlabel, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_auto_flowlabel, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DEFMCASTHLIM, - defmcasthlim, CTLFLAG_RW, &ip6_defmcasthlim, 0, ""); + defmcasthlim, CTLFLAG_RW | CTLFLAG_LOCKED, 
&ip6_defmcasthlim, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_GIF_HLIM, - gifhlim, CTLFLAG_RW, &ip6_gif_hlim, 0, ""); + gifhlim, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_gif_hlim, 0, ""); SYSCTL_STRING(_net_inet6_ip6, IPV6CTL_KAME_VERSION, - kame_version, CTLFLAG_RD, (void *)((uintptr_t)(__KAME_VERSION)), 0, ""); + kame_version, CTLFLAG_RD | CTLFLAG_LOCKED, (void *)((uintptr_t)(__KAME_VERSION)), 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USE_DEPRECATED, - use_deprecated, CTLFLAG_RW, &ip6_use_deprecated, 0, ""); + use_deprecated, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_use_deprecated, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RR_PRUNE, - rr_prune, CTLFLAG_RW, &ip6_rr_prune, 0, ""); + rr_prune, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_rr_prune, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USETEMPADDR, - use_tempaddr, CTLFLAG_RW, &ip6_use_tempaddr, 0, ""); + use_tempaddr, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_use_tempaddr, 0, ""); SYSCTL_OID(_net_inet6_ip6, IPV6CTL_TEMPPLTIME, temppltime, - CTLTYPE_INT|CTLFLAG_RW, &ip6_temp_preferred_lifetime, 0, + CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_temp_preferred_lifetime, 0, sysctl_ip6_temppltime, "I", ""); SYSCTL_OID(_net_inet6_ip6, IPV6CTL_TEMPVLTIME, tempvltime, - CTLTYPE_INT|CTLFLAG_RW, &ip6_temp_valid_lifetime, 0, + CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_temp_valid_lifetime, 0, sysctl_ip6_tempvltime, "I", ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_V6ONLY, - v6only, CTLFLAG_RW, &ip6_v6only, 0, ""); + v6only, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_v6only, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_AUTO_LINKLOCAL, - auto_linklocal, CTLFLAG_RW, &ip6_auto_linklocal, 0, ""); -SYSCTL_STRUCT(_net_inet6_ip6, IPV6CTL_RIP6STATS, rip6stats, CTLFLAG_RD, + auto_linklocal, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_auto_linklocal, 0, ""); +SYSCTL_STRUCT(_net_inet6_ip6, IPV6CTL_RIP6STATS, rip6stats, CTLFLAG_RD | CTLFLAG_LOCKED, &rip6stat, rip6stat, ""); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_PREFER_TEMPADDR, + prefer_tempaddr, CTLFLAG_RW | CTLFLAG_LOCKED, 
&ip6_prefer_tempaddr, 0, ""); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USE_DEFAULTZONE, + use_defaultzone, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_use_defzone, 0,""); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MCAST_PMTU, + mcast_pmtu, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_mcast_pmtu, 0, ""); #if MROUTING -SYSCTL_STRUCT(_net_inet6_ip6, OID_AUTO, mrt6stat, CTLFLAG_RD, +SYSCTL_STRUCT(_net_inet6_ip6, OID_AUTO, mrt6stat, CTLFLAG_RD | CTLFLAG_LOCKED, &mrt6stat, mrt6stat, ""); #endif SYSCTL_INT(_net_inet6_ip6, IPV6CTL_NEIGHBORGCTHRESH, - neighborgcthresh, CTLFLAG_RW, &ip6_neighborgcthresh, 0, ""); + neighborgcthresh, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_neighborgcthresh, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXIFPREFIXES, - maxifprefixes, CTLFLAG_RW, &ip6_maxifprefixes, 0, ""); + maxifprefixes, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_maxifprefixes, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXIFDEFROUTERS, - maxifdefrouters, CTLFLAG_RW, &ip6_maxifdefrouters, 0, ""); + maxifdefrouters, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_maxifdefrouters, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXDYNROUTES, - maxdynroutes, CTLFLAG_RW, &ip6_maxdynroutes, 0, ""); - + maxdynroutes, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_maxdynroutes, 0, ""); +SYSCTL_INT(_net_inet6_ip6, OID_AUTO, + only_allow_rfc4193_prefixes, CTLFLAG_RW | CTLFLAG_LOCKED, + &ip6_only_allow_rfc4193_prefix, 0, ""); /* net.inet6.icmp6 */ SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRACCEPT, - rediraccept, CTLFLAG_RW, &icmp6_rediraccept, 0, ""); + rediraccept, CTLFLAG_RW | CTLFLAG_LOCKED, &icmp6_rediraccept, 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRTIMEOUT, - redirtimeout, CTLFLAG_RW, &icmp6_redirtimeout, 0, ""); -SYSCTL_STRUCT(_net_inet6_icmp6, ICMPV6CTL_STATS, stats, CTLFLAG_RD, + redirtimeout, CTLFLAG_RW | CTLFLAG_LOCKED, &icmp6_redirtimeout, 0, ""); +SYSCTL_STRUCT(_net_inet6_icmp6, ICMPV6CTL_STATS, stats, CTLFLAG_RD | CTLFLAG_LOCKED, &icmp6stat, icmp6stat, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_PRUNE, - nd6_prune, CTLFLAG_RW, 
&nd6_prune, 0, ""); + nd6_prune, CTLFLAG_RW | CTLFLAG_LOCKED, &nd6_prune, 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DELAY, - nd6_delay, CTLFLAG_RW, &nd6_delay, 0, ""); + nd6_delay, CTLFLAG_RW | CTLFLAG_LOCKED, &nd6_delay, 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_UMAXTRIES, - nd6_umaxtries, CTLFLAG_RW, &nd6_umaxtries, 0, ""); + nd6_umaxtries, CTLFLAG_RW | CTLFLAG_LOCKED, &nd6_umaxtries, 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MMAXTRIES, - nd6_mmaxtries, CTLFLAG_RW, &nd6_mmaxtries, 0, ""); + nd6_mmaxtries, CTLFLAG_RW | CTLFLAG_LOCKED, &nd6_mmaxtries, 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_USELOOPBACK, - nd6_useloopback, CTLFLAG_RW, &nd6_useloopback, 0, ""); + nd6_useloopback, CTLFLAG_RW | CTLFLAG_LOCKED, &nd6_useloopback, 0, ""); +SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_ACCEPT_6TO4, + nd6_accept_6to4, CTLFLAG_RW | CTLFLAG_LOCKED, &nd6_accept_6to4, 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_NODEINFO, - nodeinfo, CTLFLAG_RW, &icmp6_nodeinfo, 0, ""); + nodeinfo, CTLFLAG_RW | CTLFLAG_LOCKED, &icmp6_nodeinfo, 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ERRPPSLIMIT, - errppslimit, CTLFLAG_RW, &icmp6errppslim, 0, ""); + errppslimit, CTLFLAG_RW | CTLFLAG_LOCKED, &icmp6errppslim, 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXNUDHINT, - nd6_maxnudhint, CTLFLAG_RW, &nd6_maxnudhint, 0, ""); + nd6_maxnudhint, CTLFLAG_RW | CTLFLAG_LOCKED, &nd6_maxnudhint, 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DEBUG, - nd6_debug, CTLFLAG_RW, &nd6_debug, 0, ""); - + nd6_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &nd6_debug, 0, ""); +SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_ONLINKNSRFC4861, + nd6_onlink_ns_rfc4861, CTLFLAG_RW | CTLFLAG_LOCKED, &nd6_onlink_ns_rfc4861, 0, + "Accept 'on-link' nd6 NS in compliance with RFC 4861."); diff --git a/bsd/netinet6/in6_rmx.c b/bsd/netinet6/in6_rmx.c index d0ad6f2b1..63a66121d 100644 --- a/bsd/netinet6/in6_rmx.c +++ b/bsd/netinet6/in6_rmx.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2009 Apple 
Inc. All rights reserved. + * Copyright (c) 2003-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -108,8 +108,10 @@ #include #include #include +#include #include #include +#include #include #include @@ -130,8 +132,8 @@ extern int in6_inithead(void **head, int off); static void in6_rtqtimo(void *rock); -static void in6_mtutimo(void *rock); -extern int tvtohz(struct timeval *); +static void in6_mtutimo(void *rock); +extern int tvtohz(struct timeval *); static struct radix_node *in6_matroute_args(void *, struct radix_node_head *, rn_matchf_t *, void *); @@ -195,11 +197,13 @@ in6_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, * should elaborate the code. */ if (rt->rt_flags & RTF_HOST) { + IFA_LOCK_SPIN(rt->rt_ifa); if (IN6_ARE_ADDR_EQUAL(&satosin6(rt->rt_ifa->ifa_addr) ->sin6_addr, &sin6->sin6_addr)) { rt->rt_flags |= RTF_LOCAL; } + IFA_UNLOCK(rt->rt_ifa); } if (!rt->rt_rmx.rmx_mtu && !(rt->rt_rmx.rmx_locks & RTV_MTU) @@ -214,8 +218,8 @@ in6_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, * Find out if it is because of an * ARP entry and delete it if so. */ - rt2 = rtalloc1_locked((struct sockaddr *)sin6, 0, - RTF_CLONING | RTF_PRCLONING); + rt2 = rtalloc1_scoped_locked((struct sockaddr *)sin6, 0, + RTF_CLONING | RTF_PRCLONING, sin6_get_ifscope(rt_key(rt))); if (rt2) { RT_LOCK(rt2); if ((rt2->rt_flags & RTF_LLINFO) && @@ -253,8 +257,8 @@ in6_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, * net route entry, 3ffe:0501:: -> if0. * This case should not raise an error. 
*/ - rt2 = rtalloc1_locked((struct sockaddr *)sin6, 0, - RTF_CLONING | RTF_PRCLONING); + rt2 = rtalloc1_scoped_locked((struct sockaddr *)sin6, 0, + RTF_CLONING | RTF_PRCLONING, sin6_get_ifscope(rt_key(rt))); if (rt2) { RT_LOCK(rt2); if ((rt2->rt_flags & (RTF_CLONING|RTF_HOST|RTF_GATEWAY)) @@ -294,6 +298,24 @@ in6_deleteroute(void * v_arg, void *netmask_arg, struct radix_node_head *head) return (rn); } +/* + * Validate (unexpire) an expiring AF_INET6 route. + */ +struct radix_node * +in6_validate(struct radix_node *rn) +{ + struct rtentry *rt = (struct rtentry *)rn; + + RT_LOCK_ASSERT_HELD(rt); + + /* This is first reference? */ + if (rt->rt_refcnt == 0 && (rt->rt_flags & RTPRF_OURS)) { + rt->rt_flags &= ~RTPRF_OURS; + rt_setexpire(rt, 0); + } + return (rn); +} + /* * Similar to in6_matroute_args except without the leaf-matching parameters. */ @@ -313,16 +335,11 @@ in6_matroute_args(void *v_arg, struct radix_node_head *head, rn_matchf_t *f, void *w) { struct radix_node *rn = rn_match_args(v_arg, head, f, w); - struct rtentry *rt = (struct rtentry *)rn; - /* This is first reference? 
*/ - if (rt != NULL) { - RT_LOCK_SPIN(rt); - if (rt->rt_refcnt == 0 && (rt->rt_flags & RTPRF_OURS)) { - rt->rt_flags &= ~RTPRF_OURS; - rt->rt_rmx.rmx_expire = 0; - } - RT_UNLOCK(rt); + if (rn != NULL) { + RT_LOCK_SPIN((struct rtentry *)rn); + in6_validate(rn); + RT_UNLOCK((struct rtentry *)rn); } return (rn); } @@ -332,17 +349,17 @@ SYSCTL_DECL(_net_inet6_ip6); static int rtq_reallyold = 60*60; /* one hour is ``really old'' */ SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RTEXPIRE, rtexpire, - CTLFLAG_RW, &rtq_reallyold , 0, ""); + CTLFLAG_RW | CTLFLAG_LOCKED, &rtq_reallyold , 0, ""); static int rtq_minreallyold = 10; /* never automatically crank down to less */ SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RTMINEXPIRE, rtminexpire, - CTLFLAG_RW, &rtq_minreallyold , 0, ""); + CTLFLAG_RW | CTLFLAG_LOCKED, &rtq_minreallyold , 0, ""); static int rtq_toomany = 128; /* 128 cached routes is ``too many'' */ SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RTMAXCACHE, rtmaxcache, - CTLFLAG_RW, &rtq_toomany , 0, ""); + CTLFLAG_RW | CTLFLAG_LOCKED, &rtq_toomany , 0, ""); /* @@ -394,12 +411,12 @@ in6_clsroute(struct radix_node *rn, __unused struct radix_node_head *head) RT_LOCK(rt); } } else { - struct timeval timenow; + uint64_t timenow; - getmicrotime(&timenow); + timenow = net_uptime(); rt->rt_flags |= RTPRF_OURS; - rt->rt_rmx.rmx_expire = - rt_expiry(rt, timenow.tv_sec, rtq_reallyold); + rt_setexpire(rt, + rt_expiry(rt, timenow, rtq_reallyold)); } } @@ -410,7 +427,7 @@ struct rtqk_arg { int draining; int killed; int found; - time_t nextstop; + uint64_t nextstop; }; /* @@ -426,16 +443,17 @@ in6_rtqkill(struct radix_node *rn, void *rock) struct rtqk_arg *ap = rock; struct rtentry *rt = (struct rtentry *)rn; int err; - struct timeval timenow; + uint64_t timenow; - getmicrotime(&timenow); + timenow = net_uptime(); lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); RT_LOCK(rt); if (rt->rt_flags & RTPRF_OURS) { ap->found++; - - if (ap->draining || rt->rt_rmx.rmx_expire <= timenow.tv_sec || + VERIFY(rt->rt_expire == 
0 || rt->rt_rmx.rmx_expire != 0); + VERIFY(rt->rt_expire != 0 || rt->rt_rmx.rmx_expire == 0); + if (ap->draining || rt->rt_expire <= timenow || ((rt->rt_flags & RTF_DYNAMIC) != 0 && ip6_maxdynroutes >= 0 && in6dynroutes > ip6_maxdynroutes / 2)) { @@ -461,13 +479,13 @@ in6_rtqkill(struct radix_node *rn, void *rock) } } else { if (ap->updating && - (unsigned)(rt->rt_rmx.rmx_expire - timenow.tv_sec) > + (rt->rt_expire - timenow) > rt_expiry(rt, 0, rtq_reallyold)) { - rt->rt_rmx.rmx_expire = rt_expiry(rt, - timenow.tv_sec, rtq_reallyold); + rt_setexpire(rt, rt_expiry(rt, + timenow, rtq_reallyold)); } ap->nextstop = lmin(ap->nextstop, - rt->rt_rmx.rmx_expire); + rt->rt_expire); RT_UNLOCK(rt); } } else { @@ -486,16 +504,16 @@ in6_rtqtimo(void *rock) struct radix_node_head *rnh = rock; struct rtqk_arg arg; struct timeval atv; - static time_t last_adjusted_timeout = 0; - struct timeval timenow; + static uint64_t last_adjusted_timeout = 0; + uint64_t timenow; lck_mtx_lock(rnh_lock); /* Get the timestamp after we acquire the lock for better accuracy */ - getmicrotime(&timenow); + timenow = net_uptime(); arg.found = arg.killed = 0; arg.rnh = rnh; - arg.nextstop = timenow.tv_sec + rtq_timeout; + arg.nextstop = timenow + rtq_timeout; arg.draining = arg.updating = 0; rnh->rnh_walktree(rnh, in6_rtqkill, &arg); @@ -508,14 +526,14 @@ in6_rtqtimo(void *rock) * hard. 
*/ if ((arg.found - arg.killed > rtq_toomany) - && (timenow.tv_sec - last_adjusted_timeout >= rtq_timeout) + && ((timenow - last_adjusted_timeout) >= (uint64_t)rtq_timeout) && rtq_reallyold > rtq_minreallyold) { rtq_reallyold = 2*rtq_reallyold / 3; if (rtq_reallyold < rtq_minreallyold) { rtq_reallyold = rtq_minreallyold; } - last_adjusted_timeout = timenow.tv_sec; + last_adjusted_timeout = timenow; #if DIAGNOSTIC log(LOG_DEBUG, "in6_rtqtimo: adjusted rtq_reallyold to %d", rtq_reallyold); @@ -526,7 +544,7 @@ in6_rtqtimo(void *rock) } atv.tv_usec = 0; - atv.tv_sec = arg.nextstop - timenow.tv_sec; + atv.tv_sec = arg.nextstop - timenow; lck_mtx_unlock(rnh_lock); timeout(in6_rtqtimo, rock, tvtohz(&atv)); } @@ -536,7 +554,7 @@ in6_rtqtimo(void *rock) */ struct mtuex_arg { struct radix_node_head *rnh; - time_t nextstop; + uint64_t nextstop; }; static int @@ -544,21 +562,23 @@ in6_mtuexpire(struct radix_node *rn, void *rock) { struct rtentry *rt = (struct rtentry *)rn; struct mtuex_arg *ap = rock; - struct timeval timenow; + uint64_t timenow; - getmicrotime(&timenow); + timenow = net_uptime(); /* sanity */ if (!rt) panic("rt == NULL in in6_mtuexpire"); RT_LOCK(rt); - if (rt->rt_rmx.rmx_expire && !(rt->rt_flags & RTF_PROBEMTU)) { - if (rt->rt_rmx.rmx_expire <= timenow.tv_sec) { + VERIFY(rt->rt_expire == 0 || rt->rt_rmx.rmx_expire != 0); + VERIFY(rt->rt_expire != 0 || rt->rt_rmx.rmx_expire == 0); + if (rt->rt_expire && !(rt->rt_flags & RTF_PROBEMTU)) { + if (rt->rt_expire <= timenow) { rt->rt_flags |= RTF_PROBEMTU; } else { ap->nextstop = lmin(ap->nextstop, - rt->rt_rmx.rmx_expire); + rt->rt_expire); } } RT_UNLOCK(rt); @@ -574,24 +594,24 @@ in6_mtutimo(void *rock) struct radix_node_head *rnh = rock; struct mtuex_arg arg; struct timeval atv; - struct timeval timenow; + uint64_t timenow, timo; - getmicrotime(&timenow); + timenow = net_uptime(); arg.rnh = rnh; - arg.nextstop = timenow.tv_sec + MTUTIMO_DEFAULT; + arg.nextstop = timenow + MTUTIMO_DEFAULT; lck_mtx_lock(rnh_lock); 
rnh->rnh_walktree(rnh, in6_mtuexpire, &arg); atv.tv_usec = 0; - atv.tv_sec = arg.nextstop; - if (atv.tv_sec < timenow.tv_sec) { + timo = arg.nextstop; + if (timo < timenow) { #if DIAGNOSTIC log(LOG_DEBUG, "IPv6: invalid mtu expiration time on routing table\n"); #endif - arg.nextstop = timenow.tv_sec + 30; /*last resort*/ + arg.nextstop = timenow + 30; /*last resort*/ } - atv.tv_sec -= timenow.tv_sec; + atv.tv_sec = timo - timenow; lck_mtx_unlock(rnh_lock); timeout(in6_mtutimo, rock, tvtohz(&atv)); } diff --git a/bsd/netinet6/in6_src.c b/bsd/netinet6/in6_src.c index 71441847b..1eb5cd60f 100644 --- a/bsd/netinet6/in6_src.c +++ b/bsd/netinet6/in6_src.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -104,9 +104,13 @@ #include #include #include +#include +#include +#include #include #include +#include #include #include @@ -118,273 +122,950 @@ #include #include #include +#include #include -#if ENABLE_DEFAULT_SCOPE -#include -#endif #include #include "loop.h" +SYSCTL_DECL(_net_inet6_ip6); + +static int ip6_select_srcif_debug = 0; +SYSCTL_INT(_net_inet6_ip6, OID_AUTO, select_srcif_debug, + CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_select_srcif_debug, 0, + "log source interface selection debug info"); + +#define ADDR_LABEL_NOTAPP (-1) +struct in6_addrpolicy defaultaddrpolicy; + +int ip6_prefer_tempaddr = 1; +#ifdef ENABLE_ADDRSEL +extern lck_mtx_t *addrsel_mutex; +#define ADDRSEL_LOCK() lck_mtx_lock(addrsel_mutex) +#define ADDRSEL_UNLOCK() lck_mtx_unlock(addrsel_mutex) +#else +#define ADDRSEL_LOCK() +#define ADDRSEL_UNLOCK() +#endif + +static int selectroute(struct sockaddr_in6 *, struct sockaddr_in6 *, + struct ip6_pktopts *, struct ip6_moptions *, struct route_in6 *, + struct ifnet **, struct rtentry **, int, int, unsigned int, + unsigned int); +static int in6_selectif(struct sockaddr_in6 *, struct ip6_pktopts *, + struct ip6_moptions *, struct 
route_in6 *ro, unsigned int, + unsigned int, struct ifnet **); +static void init_policy_queue(void); +static int add_addrsel_policyent(const struct in6_addrpolicy *); +#ifdef ENABLE_ADDRSEL +static int delete_addrsel_policyent(const struct in6_addrpolicy *); +#endif +static int walk_addrsel_policy(int (*)(const struct in6_addrpolicy *, void *), + void *); +static int dump_addrsel_policyent(const struct in6_addrpolicy *, void *); +static struct in6_addrpolicy *match_addrsel_policy(struct sockaddr_in6 *); +void addrsel_policy_init(void); + /* * Return an IPv6 address, which is the most appropriate for a given * destination and user specified options. * If necessary, this function lookups the routing table and returns * an entry to the caller for later use. */ +#define REPLACE(r) do {\ + if ((r) < sizeof(ip6stat.ip6s_sources_rule) / \ + sizeof(ip6stat.ip6s_sources_rule[0])) /* check for safety */ \ + ip6stat.ip6s_sources_rule[(r)]++; \ + goto replace; \ +} while(0) +#define NEXTSRC(r) do {\ + if ((r) < sizeof(ip6stat.ip6s_sources_rule) / \ + sizeof(ip6stat.ip6s_sources_rule[0])) /* check for safety */ \ + ip6stat.ip6s_sources_rule[(r)]++; \ + goto next; /* XXX: we can't use 'continue' here */ \ +} while(0) +#define BREAK(r) do { \ + if ((r) < sizeof(ip6stat.ip6s_sources_rule) / \ + sizeof(ip6stat.ip6s_sources_rule[0])) /* check for safety */ \ + ip6stat.ip6s_sources_rule[(r)]++; \ + goto out; /* XXX: we can't use 'break' here */ \ +} while(0) + struct in6_addr * -in6_selectsrc( - struct sockaddr_in6 *dstsock, - struct ip6_pktopts *opts, - struct ip6_moptions *mopts, - struct route_in6 *ro, - struct in6_addr *laddr, - struct in6_addr *src_storage, - int *errorp) +in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, + struct inpcb *inp, struct route_in6 *ro, + struct ifnet **ifpp, struct in6_addr *src_storage, unsigned int ifscope, + int *errorp) { - struct in6_addr *dst; - struct in6_ifaddr *ia6 = 0; + struct in6_addr dst; + struct ifnet *ifp = NULL; + 
struct in6_ifaddr *ia = NULL, *ia_best = NULL; struct in6_pktinfo *pi = NULL; + int dst_scope = -1, best_scope = -1, best_matchlen = -1; + struct in6_addrpolicy *dst_policy = NULL, *best_policy = NULL; + u_int32_t odstzone; + int prefer_tempaddr; + struct ip6_moptions *mopts; + struct timeval timenow; + unsigned int nocell; + boolean_t islocal = FALSE; - dst = &dstsock->sin6_addr; + getmicrotime(&timenow); + + dst = dstsock->sin6_addr; /* make a copy for local operation */ *errorp = 0; + if (ifpp != NULL) + *ifpp = NULL; + + if (inp != NULL) { + mopts = inp->in6p_moptions; + nocell = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; + } else { + mopts = NULL; + nocell = 0; + } /* * If the source address is explicitly specified by the caller, - * use it. + * check if the requested source address is indeed a unicast address + * assigned to the node, and can be used as the packet's source + * address. If everything is okay, use the address as source. */ if (opts && (pi = opts->ip6po_pktinfo) && - !IN6_IS_ADDR_UNSPECIFIED(&pi->ipi6_addr)) - return(&pi->ipi6_addr); + !IN6_IS_ADDR_UNSPECIFIED(&pi->ipi6_addr)) { + struct sockaddr_in6 srcsock; + struct in6_ifaddr *ia6; + + /* get the outgoing interface */ + if ((*errorp = in6_selectif(dstsock, opts, mopts, ro, ifscope, + nocell, &ifp)) != 0) { + return (NULL); + } + + /* + * determine the appropriate zone id of the source based on + * the zone of the destination and the outgoing interface. + * If the specified address is ambiguous wrt the scope zone, + * the interface must be specified; otherwise, ifa_ifwithaddr() + * will fail matching the address. 
+ */ + bzero(&srcsock, sizeof(srcsock)); + srcsock.sin6_family = AF_INET6; + srcsock.sin6_len = sizeof(srcsock); + srcsock.sin6_addr = pi->ipi6_addr; + if (ifp) { + *errorp = in6_setscope(&srcsock.sin6_addr, ifp, NULL); + if (*errorp != 0) { + ifnet_release(ifp); + return (NULL); + } + } + ia6 = (struct in6_ifaddr *)ifa_ifwithaddr((struct sockaddr *)(&srcsock)); + if (ia6 == NULL) { + *errorp = EADDRNOTAVAIL; + if (ifp != NULL) + ifnet_release(ifp); + return (NULL); + } + IFA_LOCK_SPIN(&ia6->ia_ifa); + if ((ia6->ia6_flags & (IN6_IFF_ANYCAST | IN6_IFF_NOTREADY)) || + (nocell && (ia6->ia_ifa.ifa_ifp->if_type == IFT_CELLULAR))) { + IFA_UNLOCK(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); + *errorp = EADDRNOTAVAIL; + if (ifp != NULL) + ifnet_release(ifp); + return (NULL); + } + + *src_storage = satosin6(&ia6->ia_addr)->sin6_addr; + IFA_UNLOCK(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); + if (ifpp != NULL) { + /* if ifp is non-NULL, refcnt held in in6_selectif() */ + *ifpp = ifp; + } else if (ifp != NULL) { + ifnet_release(ifp); + } + return (src_storage); + } /* - * If the source address is not specified but the socket(if any) - * is already bound, use the bound address. + * Otherwise, if the socket has already bound the source, just use it. */ - if (laddr && !IN6_IS_ADDR_UNSPECIFIED(laddr)) - return(laddr); + if (inp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) + return (&inp->in6p_laddr); /* - * If the caller doesn't specify the source address but - * the outgoing interface, use an address associated with - * the interface. + * If the address is not specified, choose the best one based on + * the outgoing interface and the destination address. 
*/ - if (pi && pi->ipi6_ifindex) { - ifnet_t out_ifp = NULL; - ifnet_head_lock_shared(); - if (pi->ipi6_ifindex > if_index) { - ifnet_head_done(); - *errorp = EADDRNOTAVAIL; - return(0); - } else { - out_ifp = ifindex2ifnet[pi->ipi6_ifindex]; + + /* get the outgoing interface */ + if ((*errorp = in6_selectif(dstsock, opts, mopts, ro, ifscope, nocell, + &ifp)) != 0) + return (NULL); + +#ifdef DIAGNOSTIC + if (ifp == NULL) /* this should not happen */ + panic("in6_selectsrc: NULL ifp"); +#endif + *errorp = in6_setscope(&dst, ifp, &odstzone); + if (*errorp != 0) { + if (ifp != NULL) + ifnet_release(ifp); + return (NULL); + } + lck_rw_lock_shared(&in6_ifaddr_rwlock); + + for (ia = in6_ifaddrs; ia; ia = ia->ia_next) { + int new_scope = -1, new_matchlen = -1; + struct in6_addrpolicy *new_policy = NULL; + u_int32_t srczone, osrczone, dstzone; + struct in6_addr src; + struct ifnet *ifp1 = ia->ia_ifp; + + IFA_LOCK(&ia->ia_ifa); + /* + * We'll never take an address that breaks the scope zone + * of the destination. We also skip an address if its zone + * does not contain the outgoing interface. + * XXX: we should probably use sin6_scope_id here. 
+ */ + if (in6_setscope(&dst, ifp1, &dstzone) || + odstzone != dstzone) + goto next; + + src = ia->ia_addr.sin6_addr; + if (in6_setscope(&src, ifp, &osrczone) || + in6_setscope(&src, ifp1, &srczone) || + osrczone != srczone) + goto next; + + /* avoid unusable addresses */ + if ((ia->ia6_flags & + (IN6_IFF_NOTREADY | IN6_IFF_ANYCAST | IN6_IFF_DETACHED))) + goto next; + + if (!ip6_use_deprecated && IFA6_IS_DEPRECATED(ia)) + goto next; + + /* Rule 1: Prefer same address */ + if (IN6_ARE_ADDR_EQUAL(&dst, &ia->ia_addr.sin6_addr)) + BREAK(1); /* there should be no better candidate */ + + if (ia_best == NULL) + REPLACE(0); + + /* Rule 2: Prefer appropriate scope */ + if (dst_scope < 0) + dst_scope = in6_addrscope(&dst); + new_scope = in6_addrscope(&ia->ia_addr.sin6_addr); + if (IN6_ARE_SCOPE_CMP(best_scope, new_scope) < 0) { + if (IN6_ARE_SCOPE_CMP(best_scope, dst_scope) < 0) + REPLACE(2); + NEXTSRC(2); + } else if (IN6_ARE_SCOPE_CMP(new_scope, best_scope) < 0) { + if (IN6_ARE_SCOPE_CMP(new_scope, dst_scope) < 0) + NEXTSRC(2); + REPLACE(2); } - ifnet_head_done(); - - /* XXX boundary check is assumed to be already done. */ - ia6 = in6_ifawithscope(out_ifp, dst); - if (ia6 == 0) { - *errorp = EADDRNOTAVAIL; - return(0); + + /* + * Rule 3: Avoid deprecated addresses. Note that the case of + * !ip6_use_deprecated is already rejected above. + */ + if (!IFA6_IS_DEPRECATED(ia_best) && IFA6_IS_DEPRECATED(ia)) + NEXTSRC(3); + if (IFA6_IS_DEPRECATED(ia_best) && !IFA6_IS_DEPRECATED(ia)) + REPLACE(3); + + /* Rule 4: Prefer home addresses */ + /* + * XXX: This is a TODO. We should probably merge the MIP6 + * case above. + */ + + /* Rule 5: Prefer outgoing interface */ + if (ia_best->ia_ifp == ifp && ia->ia_ifp != ifp) + NEXTSRC(5); + if (ia_best->ia_ifp != ifp && ia->ia_ifp == ifp) + REPLACE(5); + + /* + * Rule 6: Prefer matching label + * Note that best_policy should be non-NULL here. 
+ */ + if (dst_policy == NULL) + dst_policy = in6_addrsel_lookup_policy(dstsock); + if (dst_policy->label != ADDR_LABEL_NOTAPP) { + new_policy = in6_addrsel_lookup_policy(&ia->ia_addr); + if (dst_policy->label == best_policy->label && + dst_policy->label != new_policy->label) + NEXTSRC(6); + if (dst_policy->label != best_policy->label && + dst_policy->label == new_policy->label) + REPLACE(6); } - *src_storage = satosin6(&ia6->ia_addr)->sin6_addr; - ifafree(&ia6->ia_ifa); - return src_storage; + + /* + * Rule 7: Prefer public addresses. + * We allow users to reverse the logic by configuring + * a sysctl variable, so that privacy conscious users can + * always prefer temporary addresses. + * Don't use temporary addresses for local destinations or + * for multicast addresses unless we were passed in an option. + */ + if (IN6_IS_ADDR_MULTICAST(&dst) || + in6_matchlen(&ia_best->ia_addr.sin6_addr, &dst) >= + in6_mask2len(&ia_best->ia_prefixmask.sin6_addr, NULL)) + islocal = TRUE; + if (opts == NULL || + opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_SYSTEM) { + prefer_tempaddr = islocal ? 0 : ip6_prefer_tempaddr; + } else if (opts->ip6po_prefer_tempaddr == + IP6PO_TEMPADDR_NOTPREFER) { + prefer_tempaddr = 0; + } else + prefer_tempaddr = 1; + if (!(ia_best->ia6_flags & IN6_IFF_TEMPORARY) && + (ia->ia6_flags & IN6_IFF_TEMPORARY)) { + if (prefer_tempaddr) + REPLACE(7); + else + NEXTSRC(7); + } + if ((ia_best->ia6_flags & IN6_IFF_TEMPORARY) && + !(ia->ia6_flags & IN6_IFF_TEMPORARY)) { + if (prefer_tempaddr) + NEXTSRC(7); + else + REPLACE(7); + } + + /* + * Rule 8: prefer addresses on alive interfaces. + * This is a KAME specific rule. + */ + if ((ia_best->ia_ifp->if_flags & IFF_UP) && + !(ia->ia_ifp->if_flags & IFF_UP)) + NEXTSRC(8); + if (!(ia_best->ia_ifp->if_flags & IFF_UP) && + (ia->ia_ifp->if_flags & IFF_UP)) + REPLACE(8); + + /* + * Rule 14: Use longest matching prefix. + * Note: in the address selection draft, this rule is + * documented as "Rule 8". 
However, since it is also + * documented that this rule can be overridden, we assign + * a large number so that it is easy to assign smaller numbers + * to more preferred rules. + */ + new_matchlen = in6_matchlen(&ia->ia_addr.sin6_addr, &dst); + if (best_matchlen < new_matchlen) + REPLACE(14); + if (new_matchlen < best_matchlen) + NEXTSRC(14); + + /* Rule 15 is reserved. */ + + /* + * Last resort: just keep the current candidate. + * Or, do we need more rules? + */ + IFA_UNLOCK(&ia->ia_ifa); + continue; + +replace: + best_scope = (new_scope >= 0 ? new_scope : + in6_addrscope(&ia->ia_addr.sin6_addr)); + best_policy = (new_policy ? new_policy : + in6_addrsel_lookup_policy(&ia->ia_addr)); + best_matchlen = (new_matchlen >= 0 ? new_matchlen : + in6_matchlen(&ia->ia_addr.sin6_addr, &dst)); + IFA_ADDREF_LOCKED(&ia->ia_ifa); /* for ia_best */ + IFA_UNLOCK(&ia->ia_ifa); + if (ia_best != NULL) + IFA_REMREF(&ia_best->ia_ifa); + ia_best = ia; + continue; + +next: + IFA_UNLOCK(&ia->ia_ifa); + continue; + +out: + IFA_ADDREF_LOCKED(&ia->ia_ifa); /* for ia_best */ + IFA_UNLOCK(&ia->ia_ifa); + if (ia_best != NULL) + IFA_REMREF(&ia_best->ia_ifa); + ia_best = ia; + break; + } + + lck_rw_done(&in6_ifaddr_rwlock); + + if (nocell && ia_best != NULL && + (ia_best->ia_ifa.ifa_ifp->if_type == IFT_CELLULAR)) { + IFA_REMREF(&ia_best->ia_ifa); + ia_best = NULL; + } + + if ( (ia = ia_best) == NULL) { + *errorp = EADDRNOTAVAIL; + if (ifp != NULL) + ifnet_release(ifp); + return (NULL); } + IFA_LOCK_SPIN(&ia->ia_ifa); + *src_storage = satosin6(&ia->ia_addr)->sin6_addr; + IFA_UNLOCK(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); + if (ifpp != NULL) { + /* if ifp is non-NULL, refcnt held in in6_selectif() */ + *ifpp = ifp; + } else if (ifp != NULL) { + ifnet_release(ifp); + } + return (src_storage); +} + +/* + * Given a source IPv6 address (and route, if available), determine the best + * interface to send the packet from. 
Checking for (and updating) the + * ROF_SRCIF_SELECTED flag in the pcb-supplied route placeholder is done + * without any locks, based on the assumption that in the event this is + * called from ip6_output(), the output operation is single-threaded per-pcb, + * i.e. for any given pcb there can only be one thread performing output at + * the IPv6 layer. + * + * This routine is analogous to in_selectsrcif() for IPv4. + * + * clone - meaningful only for bsdi and freebsd + */ +static int +selectroute(struct sockaddr_in6 *srcsock, struct sockaddr_in6 *dstsock, + struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct route_in6 *ro, + struct ifnet **retifp, struct rtentry **retrt, int clone, + int norouteok, unsigned int ifscope, unsigned int nocell) +{ + int error = 0; + struct ifnet *ifp = NULL; + struct route_in6 *route = NULL; + struct sockaddr_in6 *sin6_next; + struct in6_pktinfo *pi = NULL; + struct in6_addr *dst = &dstsock->sin6_addr; + struct ifaddr *ifa = NULL; + char s_src[MAX_IPv6_STR_LEN], s_dst[MAX_IPv6_STR_LEN]; + boolean_t select_srcif; + +#if 0 + char ip6buf[INET6_ADDRSTRLEN]; + + if (dstsock->sin6_addr.s6_addr32[0] == 0 && + dstsock->sin6_addr.s6_addr32[1] == 0 && + !IN6_IS_ADDR_LOOPBACK(&dstsock->sin6_addr)) { + printf("in6_selectroute: strange destination %s\n", + ip6_sprintf(ip6buf, &dstsock->sin6_addr)); + } else { + printf("in6_selectroute: destination = %s%%%d\n", + ip6_sprintf(ip6buf, &dstsock->sin6_addr), + dstsock->sin6_scope_id); /* for debug */ + } +#endif + + if (retifp != NULL) + *retifp = NULL; + + if (retrt != NULL) + *retrt = NULL; + + if (ip6_select_srcif_debug) { + struct in6_addr src; + src = (srcsock != NULL) ? srcsock->sin6_addr : in6addr_any; + (void) inet_ntop(AF_INET6, &src, s_src, sizeof (s_src)); + (void) inet_ntop(AF_INET6, dst, s_dst, sizeof (s_dst)); + } + + /* + * If the destination address is UNSPECIFIED addr, bail out. 
+ */ + if (IN6_IS_ADDR_UNSPECIFIED(dst)) { + error = EHOSTUNREACH; + goto done; + } + + /* + * Perform source interface selection only if Scoped Routing + * is enabled and the source address isn't unspecified. + */ + select_srcif = (ip6_doscopedroute && srcsock != NULL && + !IN6_IS_ADDR_UNSPECIFIED(&srcsock->sin6_addr)); + /* - * If the destination address is a link-local unicast address or - * a multicast address, and if the outgoing interface is specified - * by the sin6_scope_id filed, use an address associated with the - * interface. - * XXX: We're now trying to define more specific semantics of - * sin6_scope_id field, so this part will be rewritten in - * the near future. + * If Scoped Routing is disabled, ignore the given ifscope. + * Otherwise even if source selection won't be performed, + * we still obey IPV6_BOUND_IF. */ - if ((IN6_IS_ADDR_LINKLOCAL(dst) || IN6_IS_ADDR_MULTICAST(dst)) && - dstsock->sin6_scope_id) { + if (!ip6_doscopedroute && ifscope != IFSCOPE_NONE) + ifscope = IFSCOPE_NONE; + + /* If the caller specified the outgoing interface explicitly, use it */ + if (opts != NULL && (pi = opts->ip6po_pktinfo) != NULL && + pi->ipi6_ifindex != 0) { /* - * I'm not sure if boundary check for scope_id is done - * somewhere... - * - * Since sin6_scope_id is unsigned, we only need to check against if_index. + * IPV6_PKTINFO takes precedence over IPV6_BOUND_IF. */ - ifnet_t out_ifp = NULL; + ifscope = pi->ipi6_ifindex; ifnet_head_lock_shared(); - if (if_index < dstsock->sin6_scope_id) { - *errorp = ENXIO; /* XXX: better error? */ - ifnet_head_done(); - return(0); + /* ifp may be NULL if detached or out of range */ + ifp = (ifscope <= if_index) ? ifindex2ifnet[ifscope] : NULL; + ifnet_head_done(); + if (norouteok || retrt == NULL || IN6_IS_ADDR_MULTICAST(dst)) { + /* + * We do not have to check or get the route for + * multicast. If the caller didn't ask/care for + * the route and we have no interface to use, + * it's an error. 
+ */ + if (ifp == NULL) + error = EHOSTUNREACH; + goto done; } else { - out_ifp = ifindex2ifnet[dstsock->sin6_scope_id]; + goto getsrcif; } - ifnet_head_done(); + } - ia6 = in6_ifawithscope(out_ifp, dst); - if (ia6 == 0) { - *errorp = EADDRNOTAVAIL; - return(0); + /* + * If the destination address is a multicast address and the outgoing + * interface for the address is specified by the caller, use it. + */ + if (IN6_IS_ADDR_MULTICAST(dst) && mopts != NULL) { + IM6O_LOCK(mopts); + if ((ifp = mopts->im6o_multicast_ifp) != NULL) { + IM6O_UNLOCK(mopts); + goto done; /* we do not need a route for multicast. */ } - *src_storage = satosin6(&ia6->ia_addr)->sin6_addr; - ifafree(&ia6->ia_ifa); - return src_storage; + IM6O_UNLOCK(mopts); + } + +getsrcif: + /* + * If the outgoing interface was not set via IPV6_BOUND_IF or + * IPV6_PKTINFO, use the scope ID in the destination address. + */ + if (ip6_doscopedroute && ifscope == IFSCOPE_NONE) + ifscope = dstsock->sin6_scope_id; + + /* + * Perform source interface selection; the source IPv6 address + * must belong to one of the addresses of the interface used + * by the route. For performance reasons, do this only if + * there is no route, or if the routing table has changed, + * or if we haven't done source interface selection on this + * route (for this PCB instance) before. + */ + if (!select_srcif || (ro != NULL && ro->ro_rt != NULL && + (ro->ro_rt->rt_flags & RTF_UP) && + ro->ro_rt->generation_id == route_generation && + (ro->ro_flags & ROF_SRCIF_SELECTED))) { + if (ro != NULL && ro->ro_rt != NULL) { + ifa = ro->ro_rt->rt_ifa; + IFA_ADDREF(ifa); + } + goto getroute; } /* - * If the destination address is a multicast address and - * the outgoing interface for the address is specified - * by the caller, use an address associated with the interface. - * There is a sanity check here; if the destination has node-local - * scope, the outgoing interfacde should be a loopback address. 
- * Even if the outgoing interface is not specified, we also - * choose a loopback interface as the outgoing interface. + * Given the source IPv6 address, find a suitable source interface + * to use for transmission; if a scope ID has been specified, + * optimize the search by looking at the addresses only for that + * interface. This is still suboptimal, however, as we need to + * traverse the per-interface list. */ - if (IN6_IS_ADDR_MULTICAST(dst)) { - struct ifnet *ifp = mopts ? mopts->im6o_multicast_ifp : NULL; + if (ifscope != IFSCOPE_NONE || (ro != NULL && ro->ro_rt != NULL)) { + unsigned int scope = ifscope; + struct ifnet *rt_ifp; + + rt_ifp = (ro->ro_rt != NULL) ? ro->ro_rt->rt_ifp : NULL; - if (ifp == NULL && IN6_IS_ADDR_MC_NODELOCAL(dst)) { - ifp = lo_ifp; + /* + * If no scope is specified and the route is stale (pointing + * to a defunct interface) use the current primary interface; + * this happens when switching between interfaces configured + * with the same IPv6 address. Otherwise pick up the scope + * information from the route; the ULP may have looked up a + * correct route and we just need to verify it here and mark + * it with the ROF_SRCIF_SELECTED flag below. 
+ */ + if (scope == IFSCOPE_NONE) { + scope = rt_ifp->if_index; + if (scope != get_primary_ifscope(AF_INET6) && + ro->ro_rt->generation_id != route_generation) + scope = get_primary_ifscope(AF_INET6); } - if (ifp) { - ia6 = in6_ifawithscope(ifp, dst); - if (ia6 == 0) { - *errorp = EADDRNOTAVAIL; - return(0); + ifa = (struct ifaddr *) + ifa_foraddr6_scoped(&srcsock->sin6_addr, scope); + + if (ip6_select_srcif_debug && ifa != NULL) { + if (ro->ro_rt != NULL) { + printf("%s->%s ifscope %d->%d ifa_if %s " + "ro_if %s\n", s_src, s_dst, ifscope, + scope, if_name(ifa->ifa_ifp), + if_name(rt_ifp)); + } else { + printf("%s->%s ifscope %d->%d ifa_if %s\n", + s_src, s_dst, ifscope, scope, + if_name(ifa->ifa_ifp)); } - *src_storage = satosin6(&ia6->ia_addr)->sin6_addr; - ifafree(&ia6->ia_ifa); - return src_storage; } } /* - * If the next hop address for the packet is specified - * by caller, use an address associated with the route - * to the next hop. + * Slow path; search for an interface having the corresponding source + * IPv6 address if the scope was not specified by the caller, and: + * + * 1) There currently isn't any route, or, + * 2) The interface used by the route does not own that source + * IPv6 address; in this case, the route will get blown away + * and we'll do a more specific scoped search using the newly + * found interface. 
*/ - { - struct sockaddr_in6 *sin6_next; - struct rtentry *rt; - - if (opts && opts->ip6po_nexthop) { - sin6_next = satosin6(opts->ip6po_nexthop); - rt = nd6_lookup(&sin6_next->sin6_addr, 1, NULL, 0); - if (rt != NULL) { - RT_LOCK_ASSERT_HELD(rt); - ia6 = in6_ifawithscope(rt->rt_ifp, dst); - if (ia6 == 0) { - ia6 = ifatoia6(rt->rt_ifa); - if (ia6 != NULL) - ifaref(&ia6->ia_ifa); - } + if (ifa == NULL && ifscope == IFSCOPE_NONE) { + ifa = (struct ifaddr *)ifa_foraddr6(&srcsock->sin6_addr); + + if (ip6_select_srcif_debug && ifa != NULL) { + printf("%s->%s ifscope %d ifa_if %s\n", + s_src, s_dst, ifscope, if_name(ifa->ifa_ifp)); + } + + } + +getroute: + if (ifa != NULL) + ifscope = ifa->ifa_ifp->if_index; + + /* + * If the next hop address for the packet is specified by the caller, + * use it as the gateway. + */ + if (opts != NULL && opts->ip6po_nexthop != NULL) { + struct route_in6 *ron; + + sin6_next = satosin6(opts->ip6po_nexthop); + + /* at this moment, we only support AF_INET6 next hops */ + if (sin6_next->sin6_family != AF_INET6) { + error = EAFNOSUPPORT; /* or should we proceed? */ + goto done; + } + + /* + * If the next hop is an IPv6 address, then the node identified + * by that address must be a neighbor of the sending host. 
+ */ + ron = &opts->ip6po_nextroute; + if (ron->ro_rt != NULL) + RT_LOCK(ron->ro_rt); + if ((ron->ro_rt != NULL && + ((ron->ro_rt->rt_flags & (RTF_UP | RTF_LLINFO)) != + (RTF_UP | RTF_LLINFO) || + ron->ro_rt->generation_id != route_generation || + (select_srcif && (ifa == NULL || + ifa->ifa_ifp != ron->ro_rt->rt_ifp)))) || + !IN6_ARE_ADDR_EQUAL(&satosin6(&ron->ro_dst)->sin6_addr, + &sin6_next->sin6_addr)) { + if (ron->ro_rt != NULL) { + RT_UNLOCK(ron->ro_rt); + rtfree(ron->ro_rt); + ron->ro_rt = NULL; } - if (ia6 == 0) { - *errorp = EADDRNOTAVAIL; - if (rt != NULL) { - RT_REMREF_LOCKED(rt); - RT_UNLOCK(rt); + *satosin6(&ron->ro_dst) = *sin6_next; + } + if (ron->ro_rt == NULL) { + rtalloc_scoped((struct route *)ron, ifscope); + if (ron->ro_rt != NULL) + RT_LOCK(ron->ro_rt); + if (ron->ro_rt == NULL || + !(ron->ro_rt->rt_flags & RTF_LLINFO) || + !IN6_ARE_ADDR_EQUAL(&satosin6(rt_key(ron->ro_rt))-> + sin6_addr, &sin6_next->sin6_addr)) { + if (ron->ro_rt != NULL) { + RT_UNLOCK(ron->ro_rt); + rtfree(ron->ro_rt); + ron->ro_rt = NULL; } - return(0); + error = EHOSTUNREACH; + goto done; + } + } + route = ron; + ifp = ron->ro_rt->rt_ifp; + + /* + * When cloning is required, try to allocate a route to the + * destination so that the caller can store path MTU + * information. + */ + if (!clone) { + if (select_srcif) { + /* Keep the route locked */ + goto validateroute; } - *src_storage = satosin6(&ia6->ia_addr)->sin6_addr; - ifafree(&ia6->ia_ifa); - RT_REMREF_LOCKED(rt); - RT_UNLOCK(rt); - return src_storage; + RT_UNLOCK(ron->ro_rt); + goto done; } + RT_UNLOCK(ron->ro_rt); } /* - * If route is known or can be allocated now, - * our src addr is taken from the i/f, else punt. + * Use a cached route if it exists and is valid, else try to allocate + * a new one. Note that we should check the address family of the + * cached destination, in case of sharing the cache with IPv4. 
*/ - if (ro) { + if (ro == NULL) + goto done; + if (ro->ro_rt != NULL) + RT_LOCK(ro->ro_rt); + if (ro->ro_rt != NULL && (!(ro->ro_rt->rt_flags & RTF_UP) || + satosin6(&ro->ro_dst)->sin6_family != AF_INET6 || + ro->ro_rt->generation_id != route_generation || + !IN6_ARE_ADDR_EQUAL(&satosin6(&ro->ro_dst)->sin6_addr, dst) || + (select_srcif && (ifa == NULL || + ifa->ifa_ifp != ro->ro_rt->rt_ifp)))) { + RT_UNLOCK(ro->ro_rt); + rtfree(ro->ro_rt); + ro->ro_rt = NULL; + } + if (ro->ro_rt == NULL) { + struct sockaddr_in6 *sa6; + if (ro->ro_rt != NULL) - RT_LOCK(ro->ro_rt); - if (ro->ro_rt != NULL && - (!(ro->ro_rt->rt_flags & RTF_UP) || - satosin6(&ro->ro_dst)->sin6_family != AF_INET6 || - ro->ro_rt->generation_id != route_generation || - !IN6_ARE_ADDR_EQUAL(&satosin6(&ro->ro_dst)->sin6_addr, - dst))) { RT_UNLOCK(ro->ro_rt); - rtfree(ro->ro_rt); - ro->ro_rt = NULL; - } - if (ro->ro_rt == NULL || ro->ro_rt->rt_ifp == NULL) { - struct sockaddr_in6 *sa6; - - if (ro->ro_rt != NULL) - RT_UNLOCK(ro->ro_rt); - /* No route yet, so try to acquire one */ - bzero(&ro->ro_dst, sizeof(struct sockaddr_in6)); - sa6 = (struct sockaddr_in6 *)&ro->ro_dst; - sa6->sin6_family = AF_INET6; - sa6->sin6_len = sizeof(struct sockaddr_in6); - sa6->sin6_addr = *dst; -#if SCOPEDROUTING - sa6->sin6_scope_id = dstsock->sin6_scope_id; -#endif - if (IN6_IS_ADDR_MULTICAST(dst)) { - ro->ro_rt = rtalloc1( - &((struct route *)ro)->ro_dst, 0, 0); - } else { - rtalloc_ign((struct route *)ro, 0); - } - if (ro->ro_rt != NULL) - RT_LOCK(ro->ro_rt); + /* No route yet, so try to acquire one */ + bzero(&ro->ro_dst, sizeof(struct sockaddr_in6)); + sa6 = (struct sockaddr_in6 *)&ro->ro_dst; + sa6->sin6_family = AF_INET6; + sa6->sin6_len = sizeof(struct sockaddr_in6); + sa6->sin6_addr = *dst; + if (IN6_IS_ADDR_MULTICAST(dst)) { + ro->ro_rt = rtalloc1_scoped( + &((struct route *)ro)->ro_dst, 0, 0, ifscope); + } else { + rtalloc_scoped((struct route *)ro, ifscope); } + if (ro->ro_rt != NULL) + RT_LOCK(ro->ro_rt); + } + /* + 
* Do not care about the result if we have the nexthop + * explicitly specified (in case we're asked to clone.) + */ + if (opts != NULL && opts->ip6po_nexthop != NULL) { + if (ro->ro_rt != NULL) + RT_UNLOCK(ro->ro_rt); + goto done; + } + + if (ro->ro_rt != NULL) { + RT_LOCK_ASSERT_HELD(ro->ro_rt); + ifp = ro->ro_rt->rt_ifp; + } else { + error = EHOSTUNREACH; + } + route = ro; + +validateroute: + if (select_srcif) { + boolean_t has_route = (route != NULL && route->ro_rt != NULL); + + if (has_route) + RT_LOCK_ASSERT_HELD(route->ro_rt); /* - * in_pcbconnect() checks out IFF_LOOPBACK to skip using - * the address. But we don't know why it does so. - * It is necessary to ensure the scope even for lo0 - * so doesn't check out IFF_LOOPBACK. + * If there is a non-loopback route with the wrong interface, + * or if there is no interface configured with such an address, + * blow it away. Except for local/loopback, we look for one + * with a matching interface scope/index. */ - if (ro->ro_rt != NULL) { - RT_LOCK_ASSERT_HELD(ro->ro_rt); - ia6 = in6_ifawithscope(ro->ro_rt->rt_ifa->ifa_ifp, dst); - if (ia6 == 0) { - ia6 = ifatoia6(ro->ro_rt->rt_ifa); - if (ia6) - ifaref(&ia6->ia_ifa); + if (has_route && (ifa == NULL || + (ifa->ifa_ifp != ifp && ifp != lo_ifp) || + !(route->ro_rt->rt_flags & RTF_UP))) { + if (ip6_select_srcif_debug) { + if (ifa != NULL) { + printf("%s->%s ifscope %d ro_if %s " + "!= ifa_if %s (cached route " + "cleared)\n", s_src, s_dst, + ifscope, if_name(ifp), + if_name(ifa->ifa_ifp)); + } else { + printf("%s->%s ifscope %d ro_if %s " + "(no ifa_if found)\n", s_src, + s_dst, ifscope, if_name(ifp)); + } } + RT_UNLOCK(route->ro_rt); + rtfree(route->ro_rt); + route->ro_rt = NULL; + route->ro_flags &= ~ROF_SRCIF_SELECTED; + error = EHOSTUNREACH; + /* Undo the settings done above */ + route = NULL; + ifp = NULL; + } else if (has_route) { + route->ro_flags |= ROF_SRCIF_SELECTED; + route->ro_rt->generation_id = route_generation; + RT_UNLOCK(route->ro_rt); + } + } else { 
+ if (ro->ro_rt != NULL) RT_UNLOCK(ro->ro_rt); + if (ifp != NULL && opts != NULL && + opts->ip6po_pktinfo != NULL && + opts->ip6po_pktinfo->ipi6_ifindex != 0) { + /* + * Check if the outgoing interface conflicts with the + * interface specified by ipi6_ifindex (if specified). + * Note that the loopback interface is always okay. + * (this may happen when we are sending a packet to + * one of our own addresses.) + */ + if (!(ifp->if_flags & IFF_LOOPBACK) && ifp->if_index != + opts->ip6po_pktinfo->ipi6_ifindex) { + error = EHOSTUNREACH; + goto done; + } } -#if 0 + } + +done: + if (nocell && error == 0) { + if ((ifp != NULL && ifp->if_type == IFT_CELLULAR) || + (route != NULL && route->ro_rt != NULL && + route->ro_rt->rt_ifp->if_type == IFT_CELLULAR)) { + if (route != NULL && route->ro_rt != NULL) { + rtfree(route->ro_rt); + route->ro_rt = NULL; + route->ro_flags &= ~ROF_SRCIF_SELECTED; + route = NULL; + } + ifp = NULL; + error = EHOSTUNREACH; + } + } + + if (ifp == NULL && (route == NULL || route->ro_rt == NULL)) { /* - * xxx The followings are necessary? (kazu) - * I don't think so. - * It's for SO_DONTROUTE option in IPv4.(jinmei) + * This can happen if the caller did not pass a cached route + * nor any other hints. We treat this case as an error. 
*/ - if (ia6 == 0) { - struct sockaddr_in6 sin6 = {sizeof(sin6), AF_INET6, 0}; - - sin6->sin6_addr = *dst; + error = EHOSTUNREACH; + } + if (error == EHOSTUNREACH) + ip6stat.ip6s_noroute++; - ia6 = ifatoia6(ifa_ifwithdstaddr(sin6tosa(&sin6))); - if (ia6 == 0) - ia6 = ifatoia6(ifa_ifwithnet(sin6tosa(&sin6))); - if (ia6 == 0) - return(0); - *src_storage = satosin6(&ia6->ia_addr)->sin6_addr; - ifafree(&ia6->ia_ifa); - return src_storage; + if (error == 0) { + if (retifp != NULL) { + if (ifp != NULL) + ifnet_reference(ifp); /* for caller */ + *retifp = ifp; } -#endif /* 0 */ - if (ia6 == 0) { - *errorp = EHOSTUNREACH; /* no route */ - return(0); - } - *src_storage = satosin6(&ia6->ia_addr)->sin6_addr; - ifafree(&ia6->ia_ifa); - return src_storage; + if (retrt != NULL && route != NULL) + *retrt = route->ro_rt; /* ro_rt may be NULL */ + } else if (select_srcif && ip6_select_srcif_debug) { + printf("%s->%s ifscope %d ifa_if %s ro_if %s (error=%d)\n", + s_src, s_dst, ifscope, + (ifa != NULL) ? if_name(ifa->ifa_ifp) : "NONE", + (ifp != NULL) ? if_name(ifp) : "NONE", error); } - *errorp = EADDRNOTAVAIL; - return(0); + if (ifa != NULL) + IFA_REMREF(ifa); + + return (error); +} + +static int +in6_selectif(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, + struct ip6_moptions *mopts, struct route_in6 *ro, unsigned int ifscope, + unsigned int nocell, struct ifnet **retifp) +{ + int error; + struct route_in6 sro; + struct rtentry *rt = NULL; + + if (ro == NULL) { + bzero(&sro, sizeof(sro)); + ro = &sro; + } + + if ((error = selectroute(NULL, dstsock, opts, mopts, ro, retifp, + &rt, 0, 1, ifscope, nocell)) != 0) { + if (ro == &sro && rt && rt == sro.ro_rt) + rtfree(rt); + return (error); + } + + /* + * do not use a rejected or black hole route. + * XXX: this check should be done in the L2 output routine. 
+ * However, if we skipped this check here, we'd see the following + * scenario: + * - install a rejected route for a scoped address prefix + * (like fe80::/10) + * - send a packet to a destination that matches the scoped prefix, + * with ambiguity about the scope zone. + * - pick the outgoing interface from the route, and disambiguate the + * scope zone with the interface. + * - ip6_output() would try to get another route with the "new" + * destination, which may be valid. + * - we'd see no error on output. + * Although this may not be very harmful, it should still be confusing. + * We thus reject the case here. + */ + if (rt && (rt->rt_flags & (RTF_REJECT | RTF_BLACKHOLE))) { + int flags = (rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); + + if (ro == &sro && rt && rt == sro.ro_rt) + rtfree(rt); + return (flags); + } + + /* + * Adjust the "outgoing" interface. If we're going to loop the packet + * back to ourselves, the ifp would be the loopback interface. + * However, we'd rather know the interface associated to the + * destination address (which should probably be one of our own + * addresses.) 
+ */ + if (rt && rt->rt_ifa && rt->rt_ifa->ifa_ifp) { + if (*retifp != NULL) + ifnet_release(*retifp); + *retifp = rt->rt_ifa->ifa_ifp; + ifnet_reference(*retifp); + } + + if (ro == &sro && rt && rt == sro.ro_rt) + rtfree(rt); + return (0); +} + +/* + * clone - meaningful only for bsdi and freebsd + */ +int +in6_selectroute(struct sockaddr_in6 *srcsock, struct sockaddr_in6 *dstsock, + struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct route_in6 *ro, + struct ifnet **retifp, struct rtentry **retrt, int clone, + unsigned int ifscope, unsigned int nocell) +{ + + return (selectroute(srcsock, dstsock, opts, mopts, ro, retifp, + retrt, clone, 0, ifscope, nocell)); } /* @@ -429,6 +1110,7 @@ in6_pcbsetport( u_int16_t lport = 0, first, last, *lastport; int count, error = 0, wild = 0; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; + kauth_cred_t cred; if (!locked) { /* Make sure we don't run into a deadlock: 4052373 */ if (!lck_rw_try_lock_exclusive(pcbinfo->mtx)) { socket_unlock(inp->inp_socket, 0); @@ -448,7 +1130,10 @@ in6_pcbsetport( last = ipport_hilastauto; lastport = &pcbinfo->lasthi; } else if (inp->inp_flags & INP_LOWPORT) { - if ((error = proc_suser(p)) != 0) { + cred = kauth_cred_proc_ref(p); + error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0); + kauth_cred_unref(&cred); + if (error != 0) { if (!locked) lck_rw_done(pcbinfo->mtx); return error; @@ -481,6 +1166,7 @@ in6_pcbsetport( * occurred above. */ inp->in6p_laddr = in6addr_any; + inp->in6p_last_outif = 0; if (!locked) lck_rw_done(pcbinfo->mtx); return (EAGAIN); @@ -504,6 +1190,7 @@ in6_pcbsetport( * occurred above. 
*/ inp->in6p_laddr = in6addr_any; + inp->in6p_last_outif = 0; if (!locked) lck_rw_done(pcbinfo->mtx); return (EAGAIN); @@ -520,6 +1207,7 @@ in6_pcbsetport( if (in_pcbinshash(inp, 1) != 0) { inp->in6p_laddr = in6addr_any; inp->inp_lport = 0; + inp->in6p_last_outif = 0; if (!locked) lck_rw_done(pcbinfo->mtx); return (EAGAIN); @@ -530,6 +1218,350 @@ in6_pcbsetport( return(0); } +/* + * * The following is an implementation of the policy table using a + * * simple tail queue. + * * XXX such details should be hidden. + * * XXX an implementation using a binary tree should be more efficient. + * */ +struct addrsel_policyent { + TAILQ_ENTRY(addrsel_policyent) ape_entry; + struct in6_addrpolicy ape_policy; +}; + +TAILQ_HEAD(addrsel_policyhead, addrsel_policyent); + +struct addrsel_policyhead addrsel_policytab; + +static void +init_policy_queue(void) +{ + + TAILQ_INIT(&addrsel_policytab); +} + +void +addrsel_policy_init(void) +{ + /* + * Default address selection policy based on RFC 3484 and + * draft-arifumi-6man-rfc3484-revise-03. 
+ */ + static const struct in6_addrpolicy defaddrsel[] = { + /* localhost */ + { .addr = { .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_LOOPBACK_INIT, + .sin6_len = sizeof(struct sockaddr_in6) }, + .addrmask = { .sin6_family = AF_INET6, + .sin6_addr = IN6MASK128, + .sin6_len = sizeof(struct sockaddr_in6) }, + .preced = 60, + .label = 0 }, + /* ULA */ + { .addr = { .sin6_family = AF_INET6, + .sin6_addr = {{{ 0xfc }}}, + .sin6_len = sizeof(struct sockaddr_in6) }, + .addrmask = { .sin6_family = AF_INET6, + .sin6_addr = IN6MASK7, + .sin6_len = sizeof(struct sockaddr_in6) }, + .preced = 50, + .label = 1 }, + /* any IPv6 src */ + { .addr = { .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_ANY_INIT, + .sin6_len = sizeof(struct sockaddr_in6) }, + .addrmask = { .sin6_family = AF_INET6, + .sin6_addr = IN6MASK0, + .sin6_len = sizeof(struct sockaddr_in6) }, + .preced = 40, + .label = 2 }, + /* any IPv4 src */ + { .addr = { .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_V4MAPPED_INIT, + .sin6_len = sizeof(struct sockaddr_in6) }, + .addrmask = { .sin6_family = AF_INET6, + .sin6_addr = IN6MASK96, + .sin6_len = sizeof(struct sockaddr_in6) }, + .preced = 30, + .label = 3 }, + /* 6to4 */ + { .addr = { .sin6_family = AF_INET6, + .sin6_addr = {{{ 0x20, 0x02 }}}, + .sin6_len = sizeof(struct sockaddr_in6) }, + .addrmask = { .sin6_family = AF_INET6, + .sin6_addr = IN6MASK16, + .sin6_len = sizeof(struct sockaddr_in6) }, + .preced = 20, + .label = 4 }, + /* Teredo */ + { .addr = { .sin6_family = AF_INET6, + .sin6_addr = {{{ 0x20, 0x01 }}}, + .sin6_len = sizeof(struct sockaddr_in6) }, + .addrmask = { .sin6_family = AF_INET6, + .sin6_addr = IN6MASK32, + .sin6_len = sizeof(struct sockaddr_in6) }, + .preced = 10, + .label = 5 }, + /* v4 compat addresses */ + { .addr = { .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_ANY_INIT, + .sin6_len = sizeof(struct sockaddr_in6) }, + .addrmask = { .sin6_family = AF_INET6, + .sin6_addr = IN6MASK96, + .sin6_len = sizeof(struct sockaddr_in6) }, + 
.preced = 1, + .label = 10 }, + /* site-local (deprecated) */ + { .addr = { .sin6_family = AF_INET6, + .sin6_addr = {{{ 0xfe, 0xc0 }}}, + .sin6_len = sizeof(struct sockaddr_in6) }, + .addrmask = { .sin6_family = AF_INET6, + .sin6_addr = IN6MASK16, + .sin6_len = sizeof(struct sockaddr_in6) }, + .preced = 1, + .label = 11 }, + /* 6bone (deprecated) */ + { .addr = { .sin6_family = AF_INET6, + .sin6_addr = {{{ 0x3f, 0xfe }}}, + .sin6_len = sizeof(struct sockaddr_in6) }, + .addrmask = { .sin6_family = AF_INET6, + .sin6_addr = IN6MASK16, + .sin6_len = sizeof(struct sockaddr_in6) }, + .preced = 1, + .label = 12 }, + }; + int i; + + init_policy_queue(); + + /* initialize the "last resort" policy */ + bzero(&defaultaddrpolicy, sizeof(defaultaddrpolicy)); + defaultaddrpolicy.label = ADDR_LABEL_NOTAPP; + + for (i = 0; i < sizeof(defaddrsel) / sizeof(defaddrsel[0]); i++) + add_addrsel_policyent(&defaddrsel[i]); + +} + +struct in6_addrpolicy * +in6_addrsel_lookup_policy(struct sockaddr_in6 *key) +{ + struct in6_addrpolicy *match = NULL; + + ADDRSEL_LOCK(); + match = match_addrsel_policy(key); + + if (match == NULL) + match = &defaultaddrpolicy; + else + match->use++; + ADDRSEL_UNLOCK(); + + return (match); +} + +static struct in6_addrpolicy * +match_addrsel_policy(struct sockaddr_in6 *key) +{ + struct addrsel_policyent *pent; + struct in6_addrpolicy *bestpol = NULL, *pol; + int matchlen, bestmatchlen = -1; + u_char *mp, *ep, *k, *p, m; + + TAILQ_FOREACH(pent, &addrsel_policytab, ape_entry) { + matchlen = 0; + + pol = &pent->ape_policy; + mp = (u_char *)&pol->addrmask.sin6_addr; + ep = mp + 16; /* XXX: scope field? */ + k = (u_char *)&key->sin6_addr; + p = (u_char *)&pol->addr.sin6_addr; + for (; mp < ep && *mp; mp++, k++, p++) { + m = *mp; + if ((*k & m) != *p) + goto next; /* not match */ + if (m == 0xff) /* short cut for a typical case */ + matchlen += 8; + else { + while (m >= 0x80) { + matchlen++; + m <<= 1; + } + } + } + + /* matched. 
check if this is better than the current best. */ + if (bestpol == NULL || + matchlen > bestmatchlen) { + bestpol = pol; + bestmatchlen = matchlen; + } + + next: + continue; + } + + return (bestpol); +} + +static int +add_addrsel_policyent(const struct in6_addrpolicy *newpolicy) +{ + struct addrsel_policyent *new, *pol; + + MALLOC(new, struct addrsel_policyent *, sizeof(*new), M_IFADDR, + M_WAITOK); + + ADDRSEL_LOCK(); + + /* duplication check */ + TAILQ_FOREACH(pol, &addrsel_policytab, ape_entry) { + if (IN6_ARE_ADDR_EQUAL(&newpolicy->addr.sin6_addr, + &pol->ape_policy.addr.sin6_addr) && + IN6_ARE_ADDR_EQUAL(&newpolicy->addrmask.sin6_addr, + &pol->ape_policy.addrmask.sin6_addr)) { + ADDRSEL_UNLOCK(); + FREE(new, M_IFADDR); + return (EEXIST); /* or override it? */ + } + } + + bzero(new, sizeof(*new)); + + /* XXX: should validate entry */ + new->ape_policy = *newpolicy; + + TAILQ_INSERT_TAIL(&addrsel_policytab, new, ape_entry); + ADDRSEL_UNLOCK(); + + return (0); +} +#ifdef ENABLE_ADDRSEL +static int +delete_addrsel_policyent(const struct in6_addrpolicy *key) +{ + struct addrsel_policyent *pol; + + + ADDRSEL_LOCK(); + + /* search for the entry in the table */ + TAILQ_FOREACH(pol, &addrsel_policytab, ape_entry) { + if (IN6_ARE_ADDR_EQUAL(&key->addr.sin6_addr, + &pol->ape_policy.addr.sin6_addr) && + IN6_ARE_ADDR_EQUAL(&key->addrmask.sin6_addr, + &pol->ape_policy.addrmask.sin6_addr)) { + break; + } + } + if (pol == NULL) { + ADDRSEL_UNLOCK(); + return (ESRCH); + } + + TAILQ_REMOVE(&addrsel_policytab, pol, ape_entry); + FREE(pol, M_IFADDR); + pol = NULL; + ADDRSEL_UNLOCK(); + + return (0); +} +#endif /* ENABLE_ADDRSEL */ + +int +walk_addrsel_policy(int (*callback)(const struct in6_addrpolicy *, void *), + void *w) +{ + struct addrsel_policyent *pol; + int error = 0; + + ADDRSEL_LOCK(); + TAILQ_FOREACH(pol, &addrsel_policytab, ape_entry) { + if ((error = (*callback)(&pol->ape_policy, w)) != 0) { + ADDRSEL_UNLOCK(); + return (error); + } + } + ADDRSEL_UNLOCK(); + return 
(error); +} +/* + * Subroutines to manage the address selection policy table via sysctl. + */ +struct walkarg { + struct sysctl_req *w_req; +}; + + +static int +dump_addrsel_policyent(const struct in6_addrpolicy *pol, void *arg) +{ + int error = 0; + struct walkarg *w = arg; + + error = SYSCTL_OUT(w->w_req, pol, sizeof(*pol)); + + return (error); +} + +static int +in6_src_sysctl SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) +struct walkarg w; + + if (req->newptr) + return EPERM; + bzero(&w, sizeof(w)); + w.w_req = req; + + return (walk_addrsel_policy(dump_addrsel_policyent, &w)); +} + + +SYSCTL_NODE(_net_inet6_ip6, IPV6CTL_ADDRCTLPOLICY, addrctlpolicy, + CTLFLAG_RD | CTLFLAG_LOCKED, in6_src_sysctl, ""); +int +in6_src_ioctl(u_long cmd, caddr_t data) +{ + int i; + struct in6_addrpolicy ent0; + + if (cmd != SIOCAADDRCTL_POLICY && cmd != SIOCDADDRCTL_POLICY) + return (EOPNOTSUPP); /* check for safety */ + + ent0 = *(struct in6_addrpolicy *)data; + + if (ent0.label == ADDR_LABEL_NOTAPP) + return (EINVAL); + /* check if the prefix mask is consecutive. */ + if (in6_mask2len(&ent0.addrmask.sin6_addr, NULL) < 0) + return (EINVAL); + /* clear trailing garbages (if any) of the prefix address. */ + for (i = 0; i < 4; i++) { + ent0.addr.sin6_addr.s6_addr32[i] &= + ent0.addrmask.sin6_addr.s6_addr32[i]; + } + ent0.use = 0; + + switch (cmd) { + case SIOCAADDRCTL_POLICY: +#ifdef ENABLE_ADDRSEL + return (add_addrsel_policyent(&ent0)); +#else + return (ENOTSUP); +#endif + case SIOCDADDRCTL_POLICY: +#ifdef ENABLE_ADDRSEL + return (delete_addrsel_policyent(&ent0)); +#else + return (ENOTSUP); +#endif + } + + return (0); /* XXX: compromise compilers */ +} + /* * generate kernel-internal form (scopeid embedded into s6_addr16[1]). 
* If the address scope of is link-local, embed the interface index in the @@ -549,21 +1581,17 @@ int in6_embedscope( struct in6_addr *in6, const struct sockaddr_in6 *sin6, -#ifdef HAVE_NRL_INPCB - struct inpcb *in6p, -#define in6p_outputopts inp_outputopts6 -#define in6p_moptions inp_moptions6 -#else struct in6pcb *in6p, -#endif - struct ifnet **ifpp) + struct ifnet **ifpp, + struct ip6_pktopts *opt) { struct ifnet *ifp = NULL; u_int32_t scopeid; + struct ip6_pktopts *optp = NULL; *in6 = sin6->sin6_addr; scopeid = sin6->sin6_scope_id; - if (ifpp) + if (ifpp != NULL) *ifpp = NULL; /* @@ -578,21 +1606,31 @@ in6_embedscope( if (IN6_IS_SCOPE_LINKLOCAL(in6)) { struct in6_pktinfo *pi; + struct ifnet *im6o_multicast_ifp = NULL; + + if (in6p != NULL && IN6_IS_ADDR_MULTICAST(in6) && + in6p->in6p_moptions != NULL) { + IM6O_LOCK(in6p->in6p_moptions); + im6o_multicast_ifp = + in6p->in6p_moptions->im6o_multicast_ifp; + IM6O_UNLOCK(in6p->in6p_moptions); + } + if (opt) + optp = opt; + else if (in6p) + optp = in6p->in6p_outputopts; /* * KAME assumption: link id == interface id */ - ifnet_head_lock_shared(); - if (in6p && in6p->in6p_outputopts && - (pi = in6p->in6p_outputopts->ip6po_pktinfo) && + if (in6p && optp && (pi = optp->ip6po_pktinfo) && pi->ipi6_ifindex) { ifp = ifindex2ifnet[pi->ipi6_ifindex]; in6->s6_addr16[1] = htons(pi->ipi6_ifindex); } else if (in6p && IN6_IS_ADDR_MULTICAST(in6) && - in6p->in6p_moptions && - in6p->in6p_moptions->im6o_multicast_ifp) { - ifp = in6p->in6p_moptions->im6o_multicast_ifp; + in6p->in6p_moptions != NULL && im6o_multicast_ifp != NULL) { + ifp = im6o_multicast_ifp; in6->s6_addr16[1] = htons(ifp->if_index); } else if (scopeid) { /* @@ -610,16 +1648,15 @@ in6_embedscope( } ifnet_head_done(); - if (ifpp) + if (ifpp != NULL) { + if (ifp != NULL) + ifnet_reference(ifp); /* for caller */ *ifpp = ifp; + } } return 0; } -#if HAVE_NRL_INPCB -#undef in6p_outputopts -#undef in6p_moptions -#endif /* * generate standard sockaddr_in6 from embedded form. 
@@ -667,15 +1704,3 @@ in6_recoverscope( return 0; } - -/* - * just clear the embedded scope identifer. - * XXX: currently used for bsdi4 only as a supplement function. - */ -void -in6_clearscope(addr) - struct in6_addr *addr; -{ - if (IN6_IS_SCOPE_LINKLOCAL(addr)) - addr->s6_addr16[1] = 0; -} diff --git a/bsd/netinet6/in6_var.h b/bsd/netinet6/in6_var.h index e423627db..67f8fa429 100644 --- a/bsd/netinet6/in6_var.h +++ b/bsd/netinet6/in6_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -97,6 +97,11 @@ #define _NETINET6_IN6_VAR_H_ #include +#ifdef XNU_KERNEL_PRIVATE +#include +#include +#endif + #ifdef __APPLE__ #include #endif @@ -116,7 +121,7 @@ struct in6_addrlifetime { u_int32_t ia6t_pltime; /* prefix lifetime */ }; -#if defined(KERNEL_PRIVATE) +#ifdef XNU_KERNEL_PRIVATE struct in6_addrlifetime_32 { u_int32_t ia6t_expire; u_int32_t ia6t_preferred; @@ -125,9 +130,9 @@ struct in6_addrlifetime_32 { }; struct in6_addrlifetime_64 { - time_t ia6t_expire; - time_t ia6t_preferred __attribute__((aligned(8))); - u_int32_t ia6t_vltime __attribute__((aligned(8))); + u_int64_t ia6t_expire; + u_int64_t ia6t_preferred; + u_int32_t ia6t_vltime; u_int32_t ia6t_pltime; }; @@ -150,13 +155,29 @@ struct in6_ifaddr { int ia6_flags; struct in6_addrlifetime ia6_lifetime; + time_t ia6_createtime; /* the creation time of this address, which is + * currently used for temporary addresses only. 
+ */ + time_t ia6_updatetime; + struct ifprefix *ia6_ifpr; /* back pointer to ifprefix */ - struct nd_prefix *ia6_ndpr; /* back pointer to the ND prefix - * (for autoconfigured addresses only) - */ + /* back pointer to the ND prefix (for autoconfigured addresses only) */ + struct nd_prefix *ia6_ndpr; + + /* multicast addresses joined from the kernel */ + LIST_HEAD(, in6_multi_mship) ia6_memberships; +}; +#endif /* XNU_KERNEL_PRIVATE */ + +/* control structure to manage address selection policy */ +struct in6_addrpolicy { + struct sockaddr_in6 addr; /* prefix address */ + struct sockaddr_in6 addrmask; /* prefix mask */ + int preced; /* precedence */ + int label; /* matching label */ + u_quad_t use; /* statistics */ }; -#endif /* KERNEL_PRIVATE */ /* * IPv6 interface statistics, as defined in RFC2465 Ipv6IfStatsEntry (p12). @@ -282,7 +303,7 @@ struct in6_ifreq { union { struct sockaddr_in6 ifru_addr; struct sockaddr_in6 ifru_dstaddr; - short ifru_flags; + int ifru_flags; int ifru_flags6; int ifru_metric; caddr_t ifru_data; @@ -302,7 +323,7 @@ struct in6_aliasreq { struct in6_addrlifetime ifra_lifetime; }; -#if defined(KERNEL_PRIVATE) +#ifdef XNU_KERNEL_PRIVATE struct in6_aliasreq_32 { char ifra_name[IFNAMSIZ]; struct sockaddr_in6 ifra_addr; @@ -320,7 +341,7 @@ struct in6_aliasreq_64 { int ifra_flags; struct in6_addrlifetime_64 ifra_lifetime; }; -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ /* prefix type macro */ #define IN6_PREFIX_ND 1 @@ -404,7 +425,7 @@ struct in6_rrenumreq { #define irr_rrf_decrvalid irr_flags.prf_rr.decrvalid #define irr_rrf_decrprefd irr_flags.prf_rr.decrprefd -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE /* * Given a pointer to an in6_ifaddr (ifaddr), * return a pointer to the addr as a sockaddr_in6 @@ -418,7 +439,7 @@ struct in6_rrenumreq { #define IFA_DSTIN6(x) (&((struct sockaddr_in6 *)((x)->ifa_dstaddr))->sin6_addr) #define IFPR_IN6(x) (&((struct sockaddr_in6 *)((x)->ifpr_prefix))->sin6_addr) -#endif /* KERNEL_PRIVATE 
*/ +#endif /* XNU_KERNEL_PRIVATE */ /* * Event data, internet6 style. @@ -455,10 +476,10 @@ struct kev_in6_data { #define KEV_INET6_NEW_RTADV_ADDR 5 /* Autoconf router advertised address has appeared */ #define KEV_INET6_DEFROUTER 6 /* Default router dectected by kernel */ -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE /* Utility function used inside netinet6 kernel code for generating events */ void in6_post_msg(struct ifnet *, u_int32_t, struct in6_ifaddr *); -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #define IN6_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \ (((d)->s6_addr32[0] ^ (a)->s6_addr32[0]) & (m)->s6_addr32[0]) == 0 && \ @@ -481,37 +502,37 @@ void in6_post_msg(struct ifnet *, u_int32_t, struct in6_ifaddr *); #define SIOCDIFADDR_IN6 _IOW('i', 25, struct in6_ifreq) #define SIOCAIFADDR_IN6 _IOW('i', 26, struct in6_aliasreq) -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #define SIOCAIFADDR_IN6_32 _IOW('i', 26, struct in6_aliasreq_32) #define SIOCAIFADDR_IN6_64 _IOW('i', 26, struct in6_aliasreq_64) -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #define SIOCSIFPHYADDR_IN6 _IOW('i', 62, struct in6_aliasreq) -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #define SIOCSIFPHYADDR_IN6_32 _IOW('i', 62, struct in6_aliasreq_32) #define SIOCSIFPHYADDR_IN6_64 _IOW('i', 62, struct in6_aliasreq_64) -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #define SIOCGIFPSRCADDR_IN6 _IOWR('i', 63, struct in6_ifreq) #define SIOCGIFPDSTADDR_IN6 _IOWR('i', 64, struct in6_ifreq) #define SIOCGIFAFLAG_IN6 _IOWR('i', 73, struct in6_ifreq) #define SIOCGDRLST_IN6 _IOWR('i', 74, struct in6_drlist) -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #define SIOCGDRLST_IN6_32 _IOWR('i', 74, struct in6_drlist_32) #define SIOCGDRLST_IN6_64 _IOWR('i', 74, struct in6_drlist_64) -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #define SIOCGPRLST_IN6 _IOWR('i', 75, struct in6_prlist) -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #define 
SIOCGPRLST_IN6_32 _IOWR('i', 75, struct in6_prlist_32) #define SIOCGPRLST_IN6_64 _IOWR('i', 75, struct in6_prlist_64) -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #define OSIOCGIFINFO_IN6 _IOWR('i', 108, struct in6_ondireq) #define SIOCGIFINFO_IN6 _IOWR('i', 76, struct in6_ondireq) #define SIOCSNDFLUSH_IN6 _IOWR('i', 77, struct in6_ifreq) #define SIOCGNBRINFO_IN6 _IOWR('i', 78, struct in6_nbrinfo) -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #define SIOCGNBRINFO_IN6_32 _IOWR('i', 78, struct in6_nbrinfo_32) #define SIOCGNBRINFO_IN6_64 _IOWR('i', 78, struct in6_nbrinfo_64) -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #define SIOCSPFXFLUSH_IN6 _IOWR('i', 79, struct in6_ifreq) #define SIOCSRTRFLUSH_IN6 _IOWR('i', 80, struct in6_ifreq) @@ -522,12 +543,12 @@ void in6_post_msg(struct ifnet *, u_int32_t, struct in6_ifaddr *); #define SIOCSDEFIFACE_IN6 _IOWR('i', 85, struct in6_ndifreq) #define SIOCGDEFIFACE_IN6 _IOWR('i', 86, struct in6_ndifreq) -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #define SIOCSDEFIFACE_IN6_32 _IOWR('i', 85, struct in6_ndifreq_32) #define SIOCSDEFIFACE_IN6_64 _IOWR('i', 85, struct in6_ndifreq_64) #define SIOCGDEFIFACE_IN6_32 _IOWR('i', 86, struct in6_ndifreq_32) #define SIOCGDEFIFACE_IN6_64 _IOWR('i', 86, struct in6_ndifreq_64) -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #define SIOCSIFINFO_FLAGS _IOWR('i', 87, struct in6_ndireq) /* XXX */ @@ -548,30 +569,44 @@ void in6_post_msg(struct ifnet *, u_int32_t, struct in6_ifaddr *); struct sioc_sg_req6) /* get s,g pkt cnt */ #define SIOCGETMIFCNT_IN6 _IOWR('u', 107, \ struct sioc_mif_req6) /* get pkt cnt per if */ -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #define SIOCGETMIFCNT_IN6_32 _IOWR('u', 107, struct sioc_mif_req6_32) #define SIOCGETMIFCNT_IN6_64 _IOWR('u', 107, struct sioc_mif_req6_64) -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ + +#define SIOCAADDRCTL_POLICY _IOW('u', 108, struct in6_addrpolicy) +#define 
SIOCDADDRCTL_POLICY _IOW('u', 109, struct in6_addrpolicy) #ifdef PRIVATE /* * temporary control calls to attach/detach IP to/from an ethernet interface */ #define SIOCPROTOATTACH_IN6 _IOWR('i', 110, struct in6_aliasreq) /* attach proto to interface */ -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #define SIOCPROTOATTACH_IN6_32 _IOWR('i', 110, struct in6_aliasreq_32) #define SIOCPROTOATTACH_IN6_64 _IOWR('i', 110, struct in6_aliasreq_64) -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #define SIOCPROTODETACH_IN6 _IOWR('i', 111, struct in6_ifreq) /* detach proto from interface */ #define SIOCLL_START _IOWR('i', 130, struct in6_aliasreq) /* start aquiring linklocal on interface */ -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #define SIOCLL_START_32 _IOWR('i', 130, struct in6_aliasreq_32) #define SIOCLL_START_64 _IOWR('i', 130, struct in6_aliasreq_64) -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #define SIOCLL_STOP _IOWR('i', 131, struct in6_ifreq) /* deconfigure linklocal from interface */ #define SIOCAUTOCONF_START _IOWR('i', 132, struct in6_ifreq) /* accept rtadvd on this interface */ #define SIOCAUTOCONF_STOP _IOWR('i', 133, struct in6_ifreq) /* stop accepting rtadv for this interface */ + +#define SIOCDRADD_IN6 _IOWR('u', 134, struct in6_defrouter) +#ifdef XNU_KERNEL_PRIVATE +#define SIOCDRADD_IN6_32 _IOWR('u', 134, struct in6_defrouter_32) +#define SIOCDRADD_IN6_64 _IOWR('u', 134, struct in6_defrouter_64) +#endif /* XNU_KERNEL_PRIVATE */ +#define SIOCDRDEL_IN6 _IOWR('u', 135, struct in6_defrouter) +#ifdef XNU_KERNEL_PRIVATE +#define SIOCDRDEL_IN6_32 _IOWR('u', 135, struct in6_defrouter_32) +#define SIOCDRDEL_IN6_64 _IOWR('u', 135, struct in6_defrouter_64) +#endif /* XNU_KERNEL_PRIVATE */ #endif /* PRIVATE */ #define IN6_IFF_ANYCAST 0x01 /* anycast address */ @@ -596,7 +631,7 @@ void in6_post_msg(struct ifnet *, u_int32_t, struct in6_ifaddr *); #define IN6_ARE_SCOPE_EQUAL(a,b) ((a)==(b)) #endif /* KERNEL */ -#ifdef 
KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE extern struct in6_ifaddr *in6_ifaddrs; extern struct in6_ifstat **in6_ifstat; @@ -604,23 +639,25 @@ extern size_t in6_ifstatmax; extern struct icmp6stat icmp6stat; extern struct icmp6_ifstat **icmp6_ifstat; extern size_t icmp6_ifstatmax; +extern lck_rw_t in6_ifs_rwlock; #define in6_ifstat_inc(ifp, tag) \ -do { \ - int _z_index = ifp ? ifp->if_index : 0; \ - if ((_z_index) && _z_index <= if_index \ - && _z_index < (signed)in6_ifstatmax \ - && in6_ifstat && in6_ifstat[_z_index]) { \ - in6_ifstat[_z_index]->tag++; \ - } \ +do { \ + lck_rw_lock_shared(&in6_ifs_rwlock); \ + int _z_index = ifp ? ifp->if_index : 0; \ + if ((_z_index) && _z_index <= if_index \ + && _z_index < (signed)in6_ifstatmax \ + && in6_ifstat && in6_ifstat[_z_index]) { \ + atomic_add_64(&in6_ifstat[_z_index]->tag, 1); \ + } \ + lck_rw_done(&in6_ifs_rwlock); \ } while (0) +__private_extern__ lck_rw_t in6_ifaddr_rwlock; + extern struct ifqueue ip6intrq; /* IP6 packet input queue */ extern struct in6_addr zeroin6_addr; extern u_char inet6ctlerrmap[]; extern u_int32_t in6_maxmtu; -#ifdef MALLOC_DECLARE -MALLOC_DECLARE(M_IPMADDR); -#endif /* MALLOC_DECLARE */ /* * Macro for finding the internet address structure (in6_ifaddr) corresponding @@ -631,35 +668,156 @@ MALLOC_DECLARE(M_IPMADDR); /* struct ifnet *ifp; */ \ /* struct in6_ifaddr *ia; */ \ do { \ - struct ifaddr *ifa; \ - for (ifa = (ifp)->if_addrlist.tqh_first; ifa; ifa = ifa->ifa_list.tqe_next) { \ - if (!ifa->ifa_addr) \ - continue; \ - if (ifa->ifa_addr->sa_family == AF_INET6) \ + struct ifaddr *_ifa; \ + ifnet_lock_assert(ifp, LCK_RW_ASSERT_HELD); \ + for (_ifa = (ifp)->if_addrlist.tqh_first; _ifa != NULL; \ + _ifa = _ifa->ifa_list.tqe_next) { \ + IFA_LOCK(_ifa); \ + if (_ifa->ifa_addr->sa_family == AF_INET6) { \ + IFA_ADDREF_LOCKED(_ifa); \ + IFA_UNLOCK(_ifa); \ break; \ + } \ + IFA_UNLOCK(_ifa); \ } \ - (ia) = (struct in6_ifaddr *)ifa; \ + (ia) = (struct in6_ifaddr *)_ifa; \ } while (0) /* - * 
Multi-cast membership entry. One for each group/ifp that a PCB - * belongs to. + * IPv6 multicast MLD-layer source entry. + */ +struct ip6_msource { + RB_ENTRY(ip6_msource) im6s_link; /* RB tree links */ + struct in6_addr im6s_addr; + struct im6s_st { + uint16_t ex; /* # of exclusive members */ + uint16_t in; /* # of inclusive members */ + } im6s_st[2]; /* state at t0, t1 */ + uint8_t im6s_stp; /* pending query */ +}; + +RB_HEAD(ip6_msource_tree, ip6_msource); + +RB_PROTOTYPE_SC_PREV(__private_extern__, ip6_msource_tree, ip6_msource, + im6s_link, ip6_msource_cmp); + +/* + * IPv6 multicast PCB-layer source entry. + * + * NOTE: overlapping use of struct ip6_msource fields at start. + */ +struct in6_msource { + RB_ENTRY(ip6_msource) im6s_link; /* Common field */ + struct in6_addr im6s_addr; /* Common field */ + uint8_t im6sl_st[2]; /* state before/at commit */ +}; + +/* + * IPv6 multicast PCB-layer group filter descriptor. + */ +struct in6_mfilter { + struct ip6_msource_tree im6f_sources; /* source list for (S,G) */ + u_long im6f_nsrc; /* # of source entries */ + uint8_t im6f_st[2]; /* state before/at commit */ +}; + +/* + * Legacy KAME IPv6 multicast membership descriptor. */ struct in6_multi_mship { struct in6_multi *i6mm_maddr; /* Multicast address pointer */ LIST_ENTRY(in6_multi_mship) i6mm_chain; /* multicast options chain */ }; +struct mld_ifinfo; + +/* + * The request count here is a count of requests for this address, not a + * count of pointers to this structure. 
+ */ struct in6_multi { + decl_lck_mtx_data(, in6m_lock); + u_int32_t in6m_refcount; /* reference count */ + u_int32_t in6m_reqcnt; /* request count for this address */ + u_int32_t in6m_debug; /* see ifa_debug flags */ LIST_ENTRY(in6_multi) in6m_entry; /* list glue */ struct in6_addr in6m_addr; /* IP6 multicast address */ struct ifnet *in6m_ifp; /* back pointer to ifnet */ struct ifmultiaddr *in6m_ifma; /* back pointer to ifmultiaddr */ - u_int in6m_refcount; /* # membership claims by sockets */ u_int in6m_state; /* state of the membership */ u_int in6m_timer; /* MLD6 listener report timer */ + /* New fields for MLDv2 follow. */ + struct mld_ifinfo *in6m_mli; /* MLD info */ + SLIST_ENTRY(in6_multi) in6m_nrele; /* to-be-released by MLD */ + u_int32_t in6m_nrelecnt; /* deferred release count */ + struct ip6_msource_tree in6m_srcs; /* tree of sources */ + u_long in6m_nsrc; /* # of tree entries */ + + struct ifqueue in6m_scq; /* queue of pending + * state-change packets */ + struct timeval in6m_lastgsrtv; /* last G-S-R query */ + uint16_t in6m_sctimer; /* state-change timer */ + uint16_t in6m_scrv; /* state-change rexmit count */ + /* + * SSM state counters which track state at T0 (the time the last + * state-change report's RV timer went to zero) and T1 + * (time of pending report, i.e. now). + * Used for computing MLDv2 state-change reports. Several refcounts + * are maintained here to optimize for common use-cases. 
+ */ + struct in6m_st { + uint16_t iss_fmode; /* MLD filter mode */ + uint16_t iss_asm; /* # of ASM listeners */ + uint16_t iss_ex; /* # of exclusive members */ + uint16_t iss_in; /* # of inclusive members */ + uint16_t iss_rec; /* # of recorded sources */ + } in6m_st[2]; /* state at t0, t1 */ + + void (*in6m_trace) /* callback fn for tracing refs */ + (struct in6_multi *, int); }; +#define IN6M_LOCK_ASSERT_HELD(_in6m) \ + lck_mtx_assert(&(_in6m)->in6m_lock, LCK_MTX_ASSERT_OWNED) + +#define IN6M_LOCK_ASSERT_NOTHELD(_in6m) \ + lck_mtx_assert(&(_in6m)->in6m_lock, LCK_MTX_ASSERT_NOTOWNED) + +#define IN6M_LOCK(_in6m) \ + lck_mtx_lock(&(_in6m)->in6m_lock) + +#define IN6M_LOCK_SPIN(_in6m) \ + lck_mtx_lock_spin(&(_in6m)->in6m_lock) + +#define IN6M_CONVERT_LOCK(_in6m) do { \ + IN6M_LOCK_ASSERT_HELD(_in6m); \ + lck_mtx_convert_spin(&(_in6m)->in6m_lock); \ +} while (0) + +#define IN6M_UNLOCK(_in6m) \ + lck_mtx_unlock(&(_in6m)->in6m_lock) + +#define IN6M_ADDREF(_in6m) \ + in6m_addref(_in6m, 0) + +#define IN6M_ADDREF_LOCKED(_in6m) \ + in6m_addref(_in6m, 1) + +#define IN6M_REMREF(_in6m) \ + in6m_remref(_in6m, 0) + +#define IN6M_REMREF_LOCKED(_in6m) \ + in6m_remref(_in6m, 1) + +#define IN6M_TIMER_UNDEF -1 + +/* flags to in6_update_ifa */ +#define IN6_IFAUPDATE_DADDELAY 0x1 /* first time to configure an address */ + +struct ip6_moptions; +struct sockopt; +struct inpcb; + extern LIST_HEAD(in6_multihead, in6_multi) in6_multihead; /* @@ -674,23 +832,36 @@ struct in6_multistep { /* * Macros for looking up the in6_multi record for a given IP6 multicast * address on a given interface. If no matching record is found, "in6m" - * returns NLL. + * returns NULL. + * + * We do this differently compared other BSD implementations; instead of + * walking the if_multiaddrs list at the interface and returning the + * ifma_protospec value of a matching entry, we search the global list + * of in6_multi records and find it that way. 
Otherwise either the two + * structures (in6_multi, ifmultiaddr) need to be ref counted both ways, + * which will make things too complicated, or they need to reside in the + * same protected domain, which they aren't. + * + * Must be called with in6_multihead_lock held. */ - -#define IN6_LOOKUP_MULTI(addr, ifp, in6m) \ -/* struct in6_addr addr; */ \ -/* struct ifnet *ifp; */ \ -/* struct in6_multi *in6m; */ \ -do { \ - struct ifmultiaddr *_ifma; \ - for (_ifma = (ifp)->if_multiaddrs.lh_first; _ifma; \ - _ifma = _ifma->ifma_link.le_next) { \ - if (_ifma->ifma_addr->sa_family == AF_INET6 \ - && IN6_ARE_ADDR_EQUAL(&((struct sockaddr_in6 *)_ifma->ifma_addr)->sin6_addr, \ - &(addr))) \ - break; \ - } \ - (in6m) = (struct in6_multi *)(_ifma ? _ifma->ifma_protospec : 0); \ +#define IN6_LOOKUP_MULTI(addr, ifp, in6m) \ + /* struct in6_addr *addr; */ \ + /* struct ifnet *ifp; */ \ + /* struct in6_multi *in6m; */ \ +do { \ + struct in6_multistep _step; \ + IN6_FIRST_MULTI(_step, in6m); \ + while ((in6m) != NULL) { \ + IN6M_LOCK_SPIN(in6m); \ + if ((in6m)->in6m_ifp == (ifp) && \ + IN6_ARE_ADDR_EQUAL(&(in6m)->in6m_addr, (addr))) { \ + IN6M_ADDREF_LOCKED(in6m); \ + IN6M_UNLOCK(in6m); \ + break; \ + } \ + IN6M_UNLOCK(in6m); \ + IN6_NEXT_MULTI(_step, in6m); \ + } \ } while(0) /* @@ -699,34 +870,58 @@ do { \ * provide. IN6_FIRST_MULTI(), below, must be called to initialize "step" * and get the first record. Both macros return a NULL "in6m" when there * are no remaining records. + * + * Must be called with in6_multihead_lock held. 
*/ #define IN6_NEXT_MULTI(step, in6m) \ -/* struct in6_multistep step; */ \ -/* struct in6_multi *in6m; */ \ -do { \ - if (((in6m) = (step).i_in6m) != NULL) \ - (step).i_in6m = (step).i_in6m->in6m_entry.le_next; \ -} while(0) + /* struct in6_multistep step; */ \ + /* struct in6_multi *in6m; */ \ +do { \ + in6_multihead_lock_assert(LCK_RW_ASSERT_HELD); \ + if (((in6m) = (step).i_in6m) != NULL) \ + (step).i_in6m = (step).i_in6m->in6m_entry.le_next; \ +} while (0) -#define IN6_FIRST_MULTI(step, in6m) \ -/* struct in6_multistep step; */ \ -/* struct in6_multi *in6m */ \ -do { \ - (step).i_in6m = in6_multihead.lh_first; \ - IN6_NEXT_MULTI((step), (in6m)); \ -} while(0) +#define IN6_FIRST_MULTI(step, in6m) \ + /* struct in6_multistep step; */ \ + /* struct in6_multi *in6m */ \ +do { \ + in6_multihead_lock_assert(LCK_RW_ASSERT_HELD); \ + (step).i_in6m = in6_multihead.lh_first; \ + IN6_NEXT_MULTI((step), (in6m)); \ +} while (0) -extern struct in6_multi *in6_addmulti(struct in6_addr *, struct ifnet *, - int *, int); -extern void in6_delmulti(struct in6_multi *, int); +/* Multicast private KPIs. */ +extern int im6o_mc_filter(const struct ip6_moptions *, const struct ifnet *, + const struct sockaddr *, const struct sockaddr *); +extern int in6_mc_join(struct ifnet *, const struct in6_addr *, + struct in6_mfilter *, struct in6_multi **, int); +extern int in6_mc_leave(struct in6_multi *, struct in6_mfilter *); +extern void in6m_clear_recorded(struct in6_multi *); +extern void in6m_commit(struct in6_multi *); +extern void in6m_purge(struct in6_multi *); +extern void in6m_print(const struct in6_multi *); +extern int in6m_record_source(struct in6_multi *, const struct in6_addr *); +extern int ip6_getmoptions(struct inpcb *, struct sockopt *); +extern int ip6_setmoptions(struct inpcb *, struct sockopt *); + +/* Legacy KAME multicast private KPIs. 
*/ +extern struct in6_multi_mship *in6_joingroup(struct ifnet *, + struct in6_addr *, int *, int); +extern int in6_leavegroup(struct in6_multi_mship *); + +extern void in6_multi_init(void); +extern void in6m_addref(struct in6_multi *, int); +extern void in6m_remref(struct in6_multi *, int); +extern int in6_multi_detach(struct in6_multi *); extern int in6_ifindex2scopeid(int); extern int in6_mask2len(struct in6_addr *, u_char *); extern void in6_len2mask(struct in6_addr *, int); extern int in6_control(struct socket *, u_long, caddr_t, struct ifnet *, struct proc *); extern int in6_update_ifa(struct ifnet *, struct in6_aliasreq *, - struct in6_ifaddr *, int); -extern void in6_purgeaddr(struct ifaddr *, int); + struct in6_ifaddr *, int, int); +extern void in6_purgeaddr(struct ifaddr *); extern int in6if_do_dad(struct ifnet *); extern void in6_purgeif(struct ifnet *); extern void in6_savemkludge(struct in6_ifaddr *); @@ -744,21 +939,35 @@ extern void in6_prefixlen2mask(struct in6_addr *maskp, int len); extern int in6_prefix_add_ifid(int iilen, struct in6_ifaddr *ia); extern void in6_prefix_remove_ifid(int iilen, struct in6_ifaddr *ia); extern void in6_purgeprefix(struct ifnet *); +extern void in6_purgeaddrs(struct ifnet *); extern int in6_is_addr_deprecated(struct sockaddr_in6 *); +extern uint8_t im6s_get_mode(const struct in6_multi *, + const struct ip6_msource *, uint8_t); + +extern void im6f_leave(struct in6_mfilter *); +extern void im6f_purge(struct in6_mfilter *); struct inpcb; +struct ip6_pktopts; extern int in6_embedscope(struct in6_addr *, const struct sockaddr_in6 *, - struct inpcb *, struct ifnet **); + struct inpcb *, struct ifnet **, struct ip6_pktopts *); extern int in6_recoverscope(struct sockaddr_in6 *, const struct in6_addr *, struct ifnet *); -extern void in6_clearscope(struct in6_addr *); extern void in6_aliasreq_64_to_32(struct in6_aliasreq_64 *, struct in6_aliasreq_32 *); extern void in6_aliasreq_32_to_64(struct in6_aliasreq_32 *, struct 
in6_aliasreq_64 *); extern void in6_ifaddr_init(void); extern void in6_rtqdrain(void); -#endif /* KERNEL_PRIVATE */ +extern struct radix_node *in6_validate(struct radix_node *); +extern int in6_if2idlen(struct ifnet *); +extern int in6_src_ioctl (u_long, caddr_t); + +__private_extern__ void in6_multihead_lock_exclusive(void); +__private_extern__ void in6_multihead_lock_shared(void); +__private_extern__ void in6_multihead_lock_assert(int); +__private_extern__ void in6_multihead_lock_done(void); +#endif /* XNU_KERNEL_PRIVATE */ #endif /* _NETINET6_IN6_VAR_H_ */ diff --git a/bsd/netinet6/ip6_forward.c b/bsd/netinet6/ip6_forward.c index 202d8ccb1..f2d6e3bd6 100644 --- a/bsd/netinet6/ip6_forward.c +++ b/bsd/netinet6/ip6_forward.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2009-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -84,6 +84,7 @@ #include #include #include +#include #include @@ -95,7 +96,6 @@ #include extern int ipsec_bypass; #endif /* IPSEC */ -extern lck_mtx_t *ip6_mutex; #include @@ -120,7 +120,7 @@ extern lck_mtx_t *ip6_mutex; void ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, - int srcrt, int locked) + int srcrt) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct sockaddr_in6 *dst; @@ -128,14 +128,24 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, int error, type = 0, code = 0; struct mbuf *mcopy = NULL; struct ifnet *ifp, *origifp; /* maybe unnecessary */ + u_int32_t inzone, outzone; + struct in6_addr src_in6, dst_in6; #if IPSEC struct secpolicy *sp = NULL; #endif struct timeval timenow; int tunneledv4 = 0; + unsigned int ifscope = IFSCOPE_NONE; +#if PF + struct pf_mtag *pf_mtag; +#endif /* PF */ getmicrotime(&timenow); - +#if PF + pf_mtag = pf_find_mtag(m); + if (pf_mtag != NULL && pf_mtag->rtableid != IFSCOPE_NONE) + ifscope = pf_mtag->rtableid; +#endif /* PF */ #if IPSEC /* @@ -181,12 +191,8 @@ ip6_forward(struct mbuf *m, struct 
route_in6 *ip6forward_rt, if (ip6->ip6_hlim <= IPV6_HLIMDEC) { /* XXX in6_ifstat_inc(rt->rt_ifp, ifs6_in_discard) */ - if (locked) - lck_mtx_unlock(ip6_mutex); icmp6_error(m, ICMP6_TIME_EXCEEDED, ICMP6_TIME_EXCEED_TRANSIT, 0); - if (locked) - lck_mtx_lock(ip6_mutex); return; } ip6->ip6_hlim -= IPV6_HLIMDEC; @@ -293,11 +299,7 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, state.ro = NULL; /* update at ipsec6_output_tunnel() */ state.dst = NULL; /* update at ipsec6_output_tunnel() */ - if (locked) - lck_mtx_unlock(ip6_mutex); error = ipsec6_output_tunnel(&state, sp, 0, &tunneledv4); - if (locked) - lck_mtx_lock(ip6_mutex); key_freesp(sp, KEY_SADB_UNLOCKED); if (tunneledv4) return; /* packet is gone - sent over IPv4 */ @@ -334,15 +336,6 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, skip_ipsec: #endif /* IPSEC */ - /* - * If "locked", ip6forward_rt points to the globally defined - * struct route cache which requires ip6_mutex, e.g. when this - * is called from ip6_input(). Else the caller is responsible - * for the struct route and its serialization (if needed), e.g. - * when this is called from ip6_rthdr0(). 
- */ - if (locked) - lck_mtx_assert(ip6_mutex, LCK_MTX_ASSERT_OWNED); dst = (struct sockaddr_in6 *)&ip6forward_rt->ro_dst; if ((rt = ip6forward_rt->ro_rt) != NULL) { RT_LOCK(rt); @@ -364,8 +357,8 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, ip6forward_rt->ro_rt = NULL; } /* this probably fails but give it a try again */ - rtalloc_ign((struct route *)ip6forward_rt, - RTF_PRCLONING); + rtalloc_scoped_ign((struct route *)ip6forward_rt, + RTF_PRCLONING, ifscope); if ((rt = ip6forward_rt->ro_rt) != NULL) { RT_LOCK(rt); /* Take an extra ref for ourselves */ @@ -376,14 +369,9 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, if (rt == NULL) { ip6stat.ip6s_noroute++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_noroute); - if (mcopy) { - if (locked) - lck_mtx_unlock(ip6_mutex); + if (mcopy) icmp6_error(mcopy, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOROUTE, 0); - if (locked) - lck_mtx_lock(ip6_mutex); - } m_freem(m); return; } @@ -403,18 +391,14 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, dst->sin6_family = AF_INET6; dst->sin6_addr = ip6->ip6_dst; - rtalloc_ign((struct route *)ip6forward_rt, RTF_PRCLONING); + rtalloc_scoped_ign((struct route *)ip6forward_rt, + RTF_PRCLONING, ifscope); if ((rt = ip6forward_rt->ro_rt) == NULL) { ip6stat.ip6s_noroute++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_noroute); - if (mcopy) { - if (locked) - lck_mtx_unlock(ip6_mutex); + if (mcopy) icmp6_error(mcopy, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOROUTE, 0); - if (locked) - lck_mtx_lock(ip6_mutex); - } m_freem(m); return; } @@ -424,14 +408,29 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, } /* - * Scope check: if a packet can't be delivered to its destination - * for the reason that the destination is beyond the scope of the - * source address, discard the packet and return an icmp6 destination - * unreachable error with Code 2 (beyond scope of source address). 
- * [draft-ietf-ipngwg-icmp-v3-02.txt, Section 3.1] + * Source scope check: if a packet can't be delivered to its + * destination for the reason that the destination is beyond the scope + * of the source address, discard the packet and return an icmp6 + * destination unreachable error with Code 2 (beyond scope of source + * address). We use a local copy of ip6_src, since in6_setscope() + * will possibly modify its first argument. + * [draft-ietf-ipngwg-icmp-v3-04.txt, Section 3.1] */ - if (in6_addr2scopeid(m->m_pkthdr.rcvif, &ip6->ip6_src) != - in6_addr2scopeid(rt->rt_ifp, &ip6->ip6_src)) { + src_in6 = ip6->ip6_src; + if (in6_setscope(&src_in6, rt->rt_ifp, &outzone)) { + /* XXX: this should not happen */ + ip6stat.ip6s_cantforward++; + ip6stat.ip6s_badscope++; + m_freem(m); + return; + } + if (in6_setscope(&src_in6, m->m_pkthdr.rcvif, &inzone)) { + ip6stat.ip6s_cantforward++; + ip6stat.ip6s_badscope++; + m_freem(m); + return; + } + if (inzone != outzone) { ip6stat.ip6s_cantforward++; ip6stat.ip6s_badscope++; in6_ifstat_inc(rt->rt_ifp, ifs6_in_discard); @@ -450,17 +449,30 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); if (mcopy) { - if (locked) - lck_mtx_unlock(ip6_mutex); icmp6_error(mcopy, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_BEYONDSCOPE, 0); - if (locked) - lck_mtx_lock(ip6_mutex); } m_freem(m); return; } + /* + * Destination scope check: if a packet is going to break the scope + * zone of packet's destination address, discard it. This case should + * usually be prevented by appropriately-configured routing table, but + * we need an explicit check because we may mistakenly forward the + * packet to a different zone by (e.g.) a default route. 
+ */ + dst_in6 = ip6->ip6_dst; + if (in6_setscope(&dst_in6, m->m_pkthdr.rcvif, &inzone) != 0 || + in6_setscope(&dst_in6, rt->rt_ifp, &outzone) != 0 || + inzone != outzone) { + ip6stat.ip6s_cantforward++; + ip6stat.ip6s_badscope++; + m_freem(m); + return; + } + if (m->m_pkthdr.len > rt->rt_ifp->if_mtu) { in6_ifstat_inc(rt->rt_ifp, ifs6_in_toobig); if (mcopy) { @@ -475,7 +487,7 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, #if IPSEC /* * When we do IPsec tunnel ingress, we need to play - * with if_mtu value (decrement IPsec header size + * with the link value (decrement IPsec header size * from mtu value). The code is much simpler than v4 * case, as we have the outgoing interface for * encapsulated packet as "rt->rt_ifp". @@ -499,11 +511,7 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, /* Release extra ref */ RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); - if (locked) - lck_mtx_unlock(ip6_mutex); icmp6_error(mcopy, ICMP6_PACKET_TOO_BIG, 0, mtu); - if (locked) - lck_mtx_lock(ip6_mutex); } else { /* Release extra ref */ RT_REMREF_LOCKED(rt); @@ -525,7 +533,7 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, * Also, don't send redirect if forwarding using a route * modified by a redirect. */ - if (rt->rt_ifp == m->m_pkthdr.rcvif && !srcrt && + if (ip6_sendredirects && rt->rt_ifp == m->m_pkthdr.rcvif && !srcrt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0) { if ((rt->rt_ifp->if_flags & IFF_POINTOPOINT) != 0) { /* @@ -540,12 +548,8 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, */ RT_REMREF_LOCKED(rt); /* Release extra ref */ RT_UNLOCK(rt); - if (locked) - lck_mtx_unlock(ip6_mutex); icmp6_error(mcopy, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 0); - if (locked) - lck_mtx_lock(ip6_mutex); m_freem(m); return; } @@ -611,29 +615,21 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, } else origifp = rt->rt_ifp; -#ifndef SCOPEDROUTING /* * clear embedded scope identifiers if necessary. 
* in6_clearscope will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); -#endif ifp = rt->rt_ifp; /* Drop the lock but retain the extra ref */ RT_UNLOCK(rt); #if PF - if (locked) - lck_mtx_unlock(ip6_mutex); - /* Invoke outbound packet filter */ error = pf_af_hook(ifp, NULL, &m, AF_INET6, FALSE); - if (locked) - lck_mtx_lock(ip6_mutex); - if (error) { if (m != NULL) { panic("%s: unexpected packet %p\n", __func__, m); @@ -645,7 +641,7 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, ip6 = mtod(m, struct ip6_hdr *); #endif /* PF */ - error = nd6_output(ifp, origifp, m, dst, rt, locked); + error = nd6_output(ifp, origifp, m, dst, rt); if (error) { in6_ifstat_inc(ifp, ifs6_out_discard); ip6stat.ip6s_cantforward++; @@ -697,11 +693,7 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, code = ICMP6_DST_UNREACH_ADDR; break; } - if (locked) - lck_mtx_unlock(ip6_mutex); icmp6_error(mcopy, type, code, 0); - if (locked) - lck_mtx_lock(ip6_mutex); /* Release extra ref */ RT_REMREF(rt); return; diff --git a/bsd/netinet6/ip6_fw.c b/bsd/netinet6/ip6_fw.c index f1b9f0508..ae221caad 100644 --- a/bsd/netinet6/ip6_fw.c +++ b/bsd/netinet6/ip6_fw.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2008 Apple Inc. All rights reserved. + * Copyright (c) 2003-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -150,11 +150,11 @@ static int ip6fw_sysctl SYSCTL_HANDLER_ARGS; SYSCTL_DECL(_net_inet6_ip6); SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Firewall"); SYSCTL_PROC(_net_inet6_ip6_fw, OID_AUTO, enable, - CTLTYPE_INT | CTLFLAG_RW, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_fw_enable, 0, ip6fw_sysctl, "I", "Enable ip6fw"); -SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, debug, CTLFLAG_RW, &fw6_debug, 0, ""); -SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, verbose, CTLFLAG_RW, &fw6_verbose, 0, ""); -SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, verbose_limit, CTLFLAG_RW, &fw6_verbose_limit, 0, ""); +SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED, &fw6_debug, 0, ""); +SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, verbose, CTLFLAG_RW | CTLFLAG_LOCKED, &fw6_verbose, 0, ""); +SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, verbose_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &fw6_verbose_limit, 0, ""); static int ip6fw_sysctl SYSCTL_HANDLER_ARGS @@ -202,7 +202,6 @@ static void cp_to_user_32( struct ip6_fw_32 *userrule_32, struct ip6_fw *rule); static void cp_from_user_32( struct ip6_fw_32 *userrule_32, struct ip6_fw *rule); static char err_prefix[] = "ip6_fw_ctl:"; -extern lck_mtx_t *ip6_mutex; /* * Returns 1 if the port is matched by the vector, 0 otherwise @@ -390,17 +389,21 @@ iface_match(struct ifnet *ifp, union ip6_fw_if *ifu, int byname) struct ifaddr *ia; ifnet_lock_shared(ifp); - for (ia = ifp->if_addrlist.tqh_first; ia; ia = ia->ifa_list.tqe_next) + for (ia = ifp->if_addrlist.tqh_first; ia; + ia = ia->ifa_list.tqe_next) { - - if (ia->ifa_addr == NULL) - continue; - if (ia->ifa_addr->sa_family != AF_INET6) + IFA_LOCK_SPIN(ia); + if (ia->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ia); continue; + } if (!IN6_ARE_ADDR_EQUAL(&ifu->fu_via_ip6, &(((struct sockaddr_in6 *) - (ia->ifa_addr))->sin6_addr))) + (ia->ifa_addr))->sin6_addr))) { + IFA_UNLOCK(ia); continue; + } + IFA_UNLOCK(ia); 
ifnet_lock_done(ifp); return(1); } @@ -558,7 +561,7 @@ ip6_fw_chk(struct ip6_hdr **pip6, struct ip6_fw_chain *chain; struct ip6_fw *rule = NULL; struct ip6_hdr *ip6 = *pip6; - struct ifnet *const rif = ((*m)->m_flags & M_LOOP) ? ifunit("lo0") : (*m)->m_pkthdr.rcvif; + struct ifnet *const rif = ((*m)->m_flags & M_LOOP) ? lo_ifp : (*m)->m_pkthdr.rcvif; u_short offset = 0; int off = sizeof(struct ip6_hdr), nxt = ip6->ip6_nxt; u_short src_port, dst_port; @@ -870,18 +873,15 @@ ip6_fw_chk(struct ip6_hdr **pip6, } bcopy(&ti, ip6, sizeof(ti)); tcp_respond(NULL, ip6, (struct tcphdr *)(ip6 + 1), - *m, ack, seq, flags, IFSCOPE_NONE); + *m, ack, seq, flags, IFSCOPE_NONE, 0); *m = NULL; break; } default: /* Send an ICMP unreachable using code */ if (oif) (*m)->m_pkthdr.rcvif = oif; - lck_mtx_assert(ip6_mutex, LCK_MTX_ASSERT_OWNED); - lck_mtx_unlock(ip6_mutex); icmp6_error(*m, ICMP6_DST_UNREACH, rule->fw_reject_code, 0); - lck_mtx_lock(ip6_mutex); *m = NULL; break; } @@ -962,6 +962,7 @@ add_entry6(struct ip6_fw_head *chainptr, struct ip6_fw *frwl) } } + bcopy(ftmp, frwl, sizeof(struct ip6_fw)); splx(s); return (0); } @@ -1400,6 +1401,17 @@ ip6_fw_ctl(struct sockopt *sopt) ip6fw_kev_post_msg(KEV_IP6FW_ADD); } else error = EINVAL; + + if (is64user){ + struct ip6_fw_64 userrule_64; + cp_to_user_64( &userrule_64, &rule); + error = sooptcopyout(sopt, &userrule_64, userrulesize); + } + else { + struct ip6_fw_32 userrule_32; + cp_to_user_32( &userrule_32, &rule); + error = sooptcopyout(sopt, &userrule_32, userrulesize); + } break; case IPV6_FW_DEL: diff --git a/bsd/netinet6/ip6_fw.h b/bsd/netinet6/ip6_fw.h index 32f4f280b..92f913f29 100644 --- a/bsd/netinet6/ip6_fw.h +++ b/bsd/netinet6/ip6_fw.h @@ -42,6 +42,7 @@ #ifndef _IP6_FW_H #define _IP6_FW_H +#ifdef __APPLE_API_OBSOLETE #include @@ -343,4 +344,5 @@ extern int ip6_fw_enable; #endif /* KERNEL_PRIVATE */ +#endif /* __APPLE_API_OBSOLETE */ #endif /* _IP6_FW_H */ diff --git a/bsd/netinet6/ip6_id.c b/bsd/netinet6/ip6_id.c new file mode 
100644 index 000000000..26fffd286 --- /dev/null +++ b/bsd/netinet6/ip6_id.c @@ -0,0 +1,304 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/*- + * Copyright (C) 2003 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $KAME: ip6_id.c,v 1.13 2003/09/16 09:11:19 itojun Exp $ + */ + +/*- + * Copyright 1998 Niels Provos + * All rights reserved. + * + * Theo de Raadt came up with the idea of using + * such a mathematical system to generate more random (yet non-repeating) + * ids to solve the resolver/named problem. But Niels designed the + * actual system based on the constraints. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Niels Provos. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $OpenBSD: ip_id.c,v 1.6 2002/03/15 18:19:52 millert Exp $ + */ + +#include + +/* + * seed = random (bits - 1) bit + * n = prime, g0 = generator to n, + * j = random so that gcd(j,n-1) == 1 + * g = g0^j mod n will be a generator again. + * + * X[0] = random seed. + * X[n] = a*X[n-1]+b mod m is a Linear Congruential Generator + * with a = 7^(even random) mod m, + * b = random with gcd(b,m) == 1 + * m = constant and a maximal period of m-1. + * + * The transaction id is determined by: + * id[n] = seed xor (g^X[n] mod n) + * + * Effectively the id is restricted to the lower (bits - 1) bits, thus + * yielding two different cycles by toggling the msb on and off.
+ * This avoids reuse issues caused by reseeding. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifndef INT32_MAX +#define INT32_MAX 0x7fffffffU +#endif + +struct randomtab { + const int ru_bits; /* resulting bits */ + const long ru_out; /* Time after which it will be reseeded */ + const u_int32_t ru_max; /* Unique cycle, avoid blackjack prediction */ + const u_int32_t ru_gen; /* Starting generator */ + const u_int32_t ru_n; /* ru_n: prime, ru_n - 1: product of pfacts[] */ + const u_int32_t ru_agen; /* determine ru_a as ru_agen^(2*rand) */ + const u_int32_t ru_m; /* ru_m = 2^x*3^y */ + const u_int32_t pfacts[4]; /* prime factors of ru_n - 1 */ + + u_int32_t ru_counter; + u_int32_t ru_msb; + + u_int32_t ru_x; + u_int32_t ru_seed, ru_seed2; + u_int32_t ru_a, ru_b; + u_int32_t ru_g; + long ru_reseed; +}; + +static struct randomtab randomtab_32 = { + 32, /* resulting bits */ + 180, /* Time after which it will be reseeded */ + 1000000000, /* Unique cycle, avoid blackjack prediction */ + 2, /* Starting generator */ + 2147483629, /* RU_N-1 = 2^2*3^2*59652323 */ + 7, /* determine ru_a as RU_AGEN^(2*rand) */ + 1836660096, /* RU_M = 2^7*3^15 - don't change */ + { 2, 3, 59652323, 0 }, /* prime factors of ru_n - 1 */ + 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +static struct randomtab randomtab_20 = { + 20, /* resulting bits */ + 180, /* Time after which it will be reseeded */ + 200000, /* Unique cycle, avoid blackjack prediction */ + 2, /* Starting generator */ + 524269, /* RU_N-1 = 2^2*3^2*14563 */ + 7, /* determine ru_a as RU_AGEN^(2*rand) */ + 279936, /* RU_M = 2^7*3^7 - don't change */ + { 2, 3, 14563, 0 }, /* prime factors of ru_n - 1 */ + 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +static u_int32_t pmod(u_int32_t, u_int32_t, u_int32_t); +static void initid(struct randomtab *); +static u_int32_t randomid(struct randomtab *); + +/* + * Do a fast modular exponentiation, returned value will be in the range + * of 0 - (mod-1) + */ +static u_int32_t
+pmod(u_int32_t gen, u_int32_t expo, u_int32_t mod) +{ + u_int64_t s, t, u; + + s = 1; + t = gen; + u = expo; + + while (u) { + if (u & 1) + s = (s * t) % mod; + u >>= 1; + t = (t * t) % mod; + } + return (s); +} + +/* + * Initializes the seed and chooses a suitable generator. Also toggles + * the msb flag. The msb flag is used to generate two distinct + * cycles of random numbers and thus avoiding reuse of ids. + * + * This function is called from randomid() when needed; an + * application does not have to worry about it. + */ +static void +initid(struct randomtab *p) +{ + u_int32_t j, i; + int noprime = 1; + struct timeval timenow; + + getmicrotime(&timenow); + + p->ru_x = random() % p->ru_m; + + /* (bits - 1) bits of random seed */ + p->ru_seed = random() & (~0U >> (32 - p->ru_bits + 1)); + p->ru_seed2 = random() & (~0U >> (32 - p->ru_bits + 1)); + + /* Determine the LCG we use */ + p->ru_b = (random() & (~0U >> (32 - p->ru_bits))) | 1; + p->ru_a = pmod(p->ru_agen, + (random() & (~0U >> (32 - p->ru_bits))) & (~1U), p->ru_m); + while (p->ru_b % 3 == 0) + p->ru_b += 2; + + j = random() % p->ru_n; + + /* + * Do a fast gcd(j, RU_N - 1), so we can find a j with + * gcd(j, RU_N - 1) == 1, giving a new generator for + * RU_GEN^j mod RU_N + */ + while (noprime) { + for (i = 0; p->pfacts[i] > 0; i++) + if (j % p->pfacts[i] == 0) + break; + + if (p->pfacts[i] == 0) + noprime = 0; + else + j = (j + 1) % p->ru_n; + } + + p->ru_g = pmod(p->ru_gen, j, p->ru_n); + p->ru_counter = 0; + + p->ru_reseed = timenow.tv_sec + p->ru_out; + p->ru_msb = p->ru_msb ? 
0 : (1U << (p->ru_bits - 1)); +} + +static u_int32_t +randomid(struct randomtab *p) +{ + int i, n; + u_int32_t tmp; + struct timeval timenow; + + getmicrotime(&timenow); + + if (p->ru_counter >= p->ru_max || timenow.tv_sec > p->ru_reseed) + initid(p); + + tmp = random(); + + /* Skip a random number of ids */ + n = tmp & 0x3; tmp = tmp >> 2; + if (p->ru_counter + n >= p->ru_max) + initid(p); + + for (i = 0; i <= n; i++) { + /* Linear Congruential Generator */ + p->ru_x = (u_int32_t)((u_int64_t)p->ru_a * p->ru_x + p->ru_b) % p->ru_m; + } + + p->ru_counter += i; + + return (p->ru_seed ^ pmod(p->ru_g, p->ru_seed2 ^ p->ru_x, p->ru_n)) | + p->ru_msb; +} + +u_int32_t +ip6_randomid(void) +{ + + return randomid(&randomtab_32); +} + +u_int32_t +ip6_randomflowlabel(void) +{ + + return randomid(&randomtab_20) & 0xfffff; +} diff --git a/bsd/netinet6/ip6_input.c b/bsd/netinet6/ip6_input.c index 0a8320298..ae8fecd68 100644 --- a/bsd/netinet6/ip6_input.c +++ b/bsd/netinet6/ip6_input.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2008 Apple Inc. All rights reserved. + * Copyright (c) 2003-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -105,8 +105,13 @@ #include #include #include +#include #include #include +#include +#include + +#include #include #include @@ -114,6 +119,7 @@ #include #include #include +#include #include #include @@ -128,7 +134,8 @@ #include #include #include -#include +#include +#include #if IPSEC #include @@ -159,7 +166,14 @@ extern struct ip6protosw inet6sw[]; struct ip6protosw * ip6_protox[IPPROTO_MAX]; static int ip6qmaxlen = IFQ_MAXLEN; -struct in6_ifaddr *in6_ifaddrs; + +static lck_grp_attr_t *in6_ifaddr_rwlock_grp_attr; +static lck_grp_t *in6_ifaddr_rwlock_grp; +static lck_attr_t *in6_ifaddr_rwlock_attr; +decl_lck_rw_data(, in6_ifaddr_rwlock); + +/* Protected by in6_ifaddr_rwlock */ +struct in6_ifaddr *in6_ifaddrs = NULL; int ip6_forward_srcrt; /* XXX */ int ip6_sourcecheck; /* XXX */ @@ -168,7 +182,14 @@ const int int6intrq_present = 1; int ip6_ours_check_algorithm; int in6_init2done = 0; +int in6_init_done = 0; +#define _CASSERT(x) \ + switch (0) { case 0: case (x): ; } +#define IN6_IFSTAT_REQUIRE_ALIGNED_64(f) \ + _CASSERT(!(offsetof(struct in6_ifstat, f) % sizeof (uint64_t))) +#define ICMP6_IFSTAT_REQUIRE_ALIGNED_64(f) \ + _CASSERT(!(offsetof(struct icmp6_ifstat, f) % sizeof (uint64_t))) #if IPFW2 /* firewall hooks */ @@ -181,17 +202,23 @@ struct ip6stat ip6stat; #ifdef __APPLE__ struct ifqueue ip6intrq; -lck_mtx_t *ip6_mutex; +decl_lck_mtx_data(, ip6_init_mutex); lck_mtx_t *dad6_mutex; lck_mtx_t *nd6_mutex; lck_mtx_t *prefix6_mutex; lck_mtx_t *scope6_mutex; +#ifdef ENABLE_ADDRSEL +lck_mtx_t *addrsel_mutex; +#endif +decl_lck_rw_data(, in6_ifs_rwlock); +decl_lck_rw_data(, icmp6_ifs_rwlock); lck_attr_t *ip6_mutex_attr; lck_grp_t *ip6_mutex_grp; lck_grp_attr_t *ip6_mutex_grp_attr; extern lck_mtx_t *inet6_domain_mutex; #endif extern int loopattach_done; +extern void addrsel_policy_init(void); static void ip6_init2(void *); static struct ip6aux *ip6_setdstifaddr(struct mbuf *, struct in6_ifaddr *); @@ -209,6 +236,11 @@ void 
stfattach(void); extern lck_mtx_t *domain_proto_mtx; +SYSCTL_DECL(_net_inet6_ip6); + +int ip6_doscopedroute = 1; +SYSCTL_INT(_net_inet6_ip6, OID_AUTO, scopedroute, CTLFLAG_RD | CTLFLAG_LOCKED, + &ip6_doscopedroute, 0, "Enable IPv6 scoped routing"); static void ip6_proto_input( @@ -229,6 +261,9 @@ ip6_init() int i; struct timeval tv; + PE_parse_boot_argn("net.inet6.ip6.scopedroute", &ip6_doscopedroute, + sizeof (ip6_doscopedroute)); + #if DIAGNOSTIC if (sizeof(struct protosw) != sizeof(struct ip6protosw)) panic("sizeof(protosw) != sizeof(ip6protosw)"); @@ -251,9 +286,6 @@ ip6_init() ip6_mutex_grp = lck_grp_alloc_init("ip6", ip6_mutex_grp_attr); ip6_mutex_attr = lck_attr_alloc_init(); - if ((ip6_mutex = lck_mtx_alloc_init(ip6_mutex_grp, ip6_mutex_attr)) == NULL) { - panic("ip6_init: can't alloc ip6_mutex\n"); - } if ((dad6_mutex = lck_mtx_alloc_init(ip6_mutex_grp, ip6_mutex_attr)) == NULL) { panic("ip6_init: can't alloc dad6_mutex\n"); } @@ -269,14 +301,90 @@ ip6_init() panic("ip6_init: can't alloc scope6_mutex\n"); } +#ifdef ENABLE_ADDRSEL + if ((addrsel_mutex = lck_mtx_alloc_init(ip6_mutex_grp, ip6_mutex_attr)) == NULL) { + panic("ip6_init: can't alloc addrsel_mutex\n"); + } +#endif + + lck_rw_init(&in6_ifs_rwlock, ip6_mutex_grp, ip6_mutex_attr); + lck_rw_init(&icmp6_ifs_rwlock, ip6_mutex_grp, ip6_mutex_attr); + lck_mtx_init(&ip6_init_mutex, ip6_mutex_grp, ip6_mutex_attr); inet6domain.dom_flags = DOM_REENTRANT; ip6intrq.ifq_maxlen = ip6qmaxlen; + + in6_ifaddr_rwlock_grp_attr = lck_grp_attr_alloc_init(); + in6_ifaddr_rwlock_grp = lck_grp_alloc_init("in6_ifaddr_rwlock", + in6_ifaddr_rwlock_grp_attr); + in6_ifaddr_rwlock_attr = lck_attr_alloc_init(); + lck_rw_init(&in6_ifaddr_rwlock, in6_ifaddr_rwlock_grp, + in6_ifaddr_rwlock_attr); + + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_receive); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_hdrerr); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_toobig); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_noroute); + 
IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_addrerr); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_protounknown); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_truncated); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_discard); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_deliver); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_forward); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_request); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_discard); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_fragok); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_fragfail); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_fragcreat); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_reass_reqd); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_reass_ok); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_reass_fail); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_mcast); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_mcast); + + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_msg); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_error); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_dstunreach); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_adminprohib); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_timeexceed); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_paramprob); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_pkttoobig); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_echo); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_echoreply); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_routersolicit); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_routeradvert); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_neighborsolicit); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_neighboradvert); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_redirect); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_mldquery); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_mldreport); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_mlddone); + + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_msg); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_error); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_dstunreach); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_adminprohib); + 
ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_timeexceed); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_paramprob); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_pkttoobig); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_echo); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_echoreply); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_routersolicit); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_routeradvert); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_neighborsolicit); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_neighboradvert); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_redirect); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_mldquery); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_mldreport); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_mlddone); + in6_ifaddr_init(); + ip6_moptions_init(); nd6_init(); frag6_init(); icmp6_init(); + addrsel_policy_init(); /* * in many cases, random() here does NOT return random number * as initialization during bootstrap time occur in fixed order. @@ -310,9 +418,6 @@ ip6_init2( /* nd6_timer_init */ timeout(nd6_timer, (caddr_t)0, hz); - /* router renumbering prefix list maintenance */ - timeout(in6_rr_timer, (caddr_t)0, hz); - /* timer for regeneranation of temporary addresses randomize ID */ timeout(in6_tmpaddrtimer, (caddr_t)0, (ip6_temp_preferred_lifetime - ip6_desync_factor - @@ -327,42 +432,27 @@ ip6_init2( #if NSTF stfattach(); #endif -#else - /* nd6_timer_init */ - - callout_init(&nd6_timer_ch); - callout_reset(&nd6_timer_ch, hz, nd6_timer, NULL); - - /* router renumbering prefix list maintenance */ - callout_init(&in6_rr_timer_ch); - callout_reset(&in6_rr_timer_ch, hz, in6_rr_timer, NULL); - - /* timer for regeneranation of temporary addresses randomize ID */ - callout_reset(&in6_tmpaddrtimer_ch, - (ip6_temp_preferred_lifetime - ip6_desync_factor - - ip6_temp_regen_advance) * hz, - in6_tmpaddrtimer, NULL); #endif - in6_init2done = 1; -} -#if __FreeBSD__ -/* cheat */ -/* This must be after route_init(), which is now SI_ORDER_THIRD */ -SYSINIT(netinet6init2, 
SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ip6_init2, NULL); -#endif + lck_mtx_lock(&ip6_init_mutex); + in6_init_done = 1; + wakeup(&in6_init_done); + lck_mtx_unlock(&ip6_init_mutex); +} -/* - * ip6_forward_rt contains the route entry that was recently used during - * the forwarding of an IPv6 packet and thus acts as a route cache. Access - * to this variable is protected by the global lock ip6_mutex. - */ -static struct route_in6 ip6_forward_rt; +void +ip6_fin() +{ + lck_mtx_lock(&ip6_init_mutex); + while (in6_init_done == 0) { + (void) msleep(&in6_init_done, &ip6_init_mutex, 0, "ip6_fin()", NULL); + } + lck_mtx_unlock(&ip6_init_mutex); +} void -ip6_input(m) - struct mbuf *m; +ip6_input(struct mbuf *m) { struct ip6_hdr *ip6; int off = sizeof(struct ip6_hdr), nest; @@ -372,6 +462,16 @@ ip6_input(m) struct ifnet *deliverifp = NULL; ipfilter_t inject_ipfref = 0; int seen; + struct in6_ifaddr *ia6 = NULL; + struct route_in6 ip6_forward_rt; + struct sockaddr_in6 *dst6; + + bzero(&ip6_forward_rt, sizeof(ip6_forward_rt)); + + /* Check if the packet we received is valid after interface filter + * processing + */ + MBUF_INPUT_CHECK(m, m->m_pkthdr.rcvif); /* * No need to proccess packet twice if we've @@ -402,7 +502,6 @@ ip6_input(m) */ ip6_delaux(m); - lck_mtx_lock(ip6_mutex); /* * mbuf statistics */ @@ -425,6 +524,15 @@ ip6_input(m) #undef M2MMAX } + /* drop the packet if IPv6 operation is disabled on the IF */ + lck_rw_lock_shared(nd_if_rwlock); + if (m->m_pkthdr.rcvif->if_index < nd_ifinfo_indexlim && + (nd_ifinfo[m->m_pkthdr.rcvif->if_index].flags & ND6_IFF_IFDISABLED)) { + lck_rw_done(nd_if_rwlock); + goto bad; + } + lck_rw_done(nd_if_rwlock); + in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_receive); ip6stat.ip6s_total++; @@ -447,11 +555,8 @@ ip6_input(m) n = NULL; } } - if (n == NULL) { - m_freem(m); - lck_mtx_unlock(ip6_mutex); - return; /*ENOBUFS*/ - } + if (n == NULL) + goto bad; m_copydata(m, 0, m->m_pkthdr.len, mtod(n, caddr_t)); n->m_len = m->m_pkthdr.len; @@ -459,7 
+564,7 @@ ip6_input(m) m = n; } IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), - {lck_mtx_unlock(ip6_mutex); return;}); + {goto done;}); #endif if (m->m_len < sizeof(struct ip6_hdr)) { @@ -468,8 +573,7 @@ ip6_input(m) if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == 0) { ip6stat.ip6s_toosmall++; in6_ifstat_inc(inifp, ifs6_in_hdrerr); - lck_mtx_unlock(ip6_mutex); - return; + goto done; } } @@ -495,10 +599,8 @@ ip6_input(m) m_freem(m); m = NULL; } - if (!m) { - lck_mtx_unlock(ip6_mutex); - return; - } + if (!m) + goto done; } #endif @@ -514,9 +616,14 @@ ip6_input(m) in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } - if ((IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src) || - IN6_IS_ADDR_LOOPBACK(&ip6->ip6_dst)) && - (m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) { + if (IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst) && + !(m->m_flags & M_LOOP)) { + /* + * In this case, the packet should come from the loopback + * interface. However, we cannot just check the if_flags, + * because ip6_mloopback() passes the "actual" interface + * as the outgoing/incoming interface. + */ ip6stat.ip6s_badscope++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; @@ -526,13 +633,13 @@ ip6_input(m) * The following check is not documented in specs. A malicious * party may be able to use IPv4 mapped addr to confuse tcp/udp stack * and bypass security checks (act as if it was from 127.0.0.1 by using - * IPv6 src ::ffff:127.0.0.1). Be cautious. + * IPv6 src ::ffff:127.0.0.1). Be cautious. * * This check chokes if we are in an SIIT cloud. As none of BSDs * support IPv4-less kernel compilation, we cannot support SIIT * environment at all. So, it makes more sense for us to reject any * malicious packets for non-SIIT environment, than try to do a - * partical support for SIIT environment. + * partial support for SIIT environment. 
*/ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { @@ -556,19 +663,42 @@ ip6_input(m) } #endif + /* + * Naively assume we can attribute inbound data to the route we would + * use to send to this destination. Asymmetric routing breaks this + * assumption, but it still allows us to account for traffic from + * a remote node in the routing table. + * this has a very significant performance impact so we bypass + * if nstat_collect is disabled. We may also bypass if the + * protocol is tcp in the future because tcp will have a route that + * we can use to attribute the data to. That does mean we would not + * account for forwarded tcp traffic. + */ + if (nstat_collect) { + struct rtentry *rte = + ifnet_cached_rtlookup_inet6(m->m_pkthdr.rcvif, + &ip6->ip6_src); + if (rte != NULL) { + nstat_route_rx(rte, 1, m->m_pkthdr.len, 0); + rtfree(rte); + } + } + #if PF /* Invoke inbound packet filter */ - lck_mtx_unlock(ip6_mutex); - if (pf_af_hook(m->m_pkthdr.rcvif, NULL, &m, AF_INET6, TRUE) != 0) { - if (m != NULL) { - panic("%s: unexpected packet %p\n", __func__, m); - /* NOTREACHED */ + if (PF_IS_ENABLED) { + int error; + error = pf_af_hook(m->m_pkthdr.rcvif, NULL, &m, AF_INET6, TRUE); + if (error != 0) { + if (m != NULL) { + panic("%s: unexpected packet %p\n", __func__, m); + /* NOTREACHED */ + } + /* Already freed by callee */ + goto done; } - /* Already freed by callee */ - return; + ip6 = mtod(m, struct ip6_hdr *); } - ip6 = mtod(m, struct ip6_hdr *); - lck_mtx_lock(ip6_mutex); #endif /* PF */ /* drop packets if interface ID portion is already filled */ @@ -592,39 +722,11 @@ ip6_input(m) ip6->ip6_dst.s6_addr16[1] = htons(m->m_pkthdr.rcvif->if_index); -#if 0 /* this case seems to be unnecessary. (jinmei, 20010401) */ - /* - * We use rt->rt_ifp to determine if the address is ours or not. - * If rt_ifp is lo0, the address is ours. 
- * The problem here is, rt->rt_ifp for fe80::%lo0/64 is set to lo0, - * so any address under fe80::%lo0/64 will be mistakenly considered - * local. The special case is supplied to handle the case properly - * by actually looking at interface addresses - * (using in6ifa_ifpwithaddr). - */ - if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) != 0 && - IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_dst)) { - struct in6_ifaddr *ia6; - if (!(ia6 = in6ifa_ifpwithaddr(m->m_pkthdr.rcvif, &ip6->ip6_dst))) { - lck_mtx_unlock(ip6_mutex); - icmp6_error(m, ICMP6_DST_UNREACH, - ICMP6_DST_UNREACH_ADDR, 0); - /* m is already freed */ - return; - } - ifafree(&ia6->ia_ifa); - - ours = 1; - deliverifp = m->m_pkthdr.rcvif; - goto hbhcheck; - } -#endif - /* * Multicast check */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { - struct in6_multi *in6m = 0; + struct in6_multi *in6m = NULL; struct ifnet *ifp = m->m_pkthdr.rcvif; in6_ifstat_inc(ifp, ifs6_in_mcast); @@ -632,16 +734,18 @@ ip6_input(m) * See if we belong to the destination multicast group on the * arrival interface. 
*/ - ifnet_lock_shared(ifp); - IN6_LOOKUP_MULTI(ip6->ip6_dst, ifp, in6m); - ifnet_lock_done(ifp); - if (in6m) + in6_multihead_lock_shared(); + IN6_LOOKUP_MULTI(&ip6->ip6_dst, ifp, in6m); + in6_multihead_lock_done(); + if (in6m != NULL) { + IN6M_REMREF(in6m); ours = 1; + } + else #if MROUTING - else if (!ip6_mrouter) { -#else - else { + if (!ip6_mrouter) #endif + { ip6stat.ip6s_notmember++; ip6stat.ip6s_cantforward++; in6_ifstat_inc(ifp, ifs6_in_discard); @@ -651,42 +755,18 @@ ip6_input(m) goto hbhcheck; } - if (ip6_forward_rt.ro_rt != NULL) - RT_LOCK(ip6_forward_rt.ro_rt); /* * Unicast check */ - if (ip6_forward_rt.ro_rt != NULL && - (ip6_forward_rt.ro_rt->rt_flags & RTF_UP) && - IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, - &((struct sockaddr_in6 *)(&ip6_forward_rt.ro_dst))->sin6_addr) && - ip6_forward_rt.ro_rt->generation_id == route_generation) { - ip6stat.ip6s_forward_cachehit++; - } else { - struct sockaddr_in6 *dst6; - - if (ip6_forward_rt.ro_rt != NULL) { - /* route is down/stale or destination is different */ - ip6stat.ip6s_forward_cachemiss++; - RT_UNLOCK(ip6_forward_rt.ro_rt); - rtfree(ip6_forward_rt.ro_rt); - ip6_forward_rt.ro_rt = NULL; - } - - bzero(&ip6_forward_rt.ro_dst, sizeof(struct sockaddr_in6)); - dst6 = (struct sockaddr_in6 *)&ip6_forward_rt.ro_dst; - dst6->sin6_len = sizeof(struct sockaddr_in6); - dst6->sin6_family = AF_INET6; - dst6->sin6_addr = ip6->ip6_dst; -#if SCOPEDROUTING - ip6_forward_rt.ro_dst.sin6_scope_id = - in6_addr2scopeid(m->m_pkthdr.rcvif, &ip6->ip6_dst); -#endif + dst6 = (struct sockaddr_in6 *)&ip6_forward_rt.ro_dst; + dst6->sin6_len = sizeof(struct sockaddr_in6); + dst6->sin6_family = AF_INET6; + dst6->sin6_addr = ip6->ip6_dst; - rtalloc_ign((struct route *)&ip6_forward_rt, RTF_PRCLONING); - if (ip6_forward_rt.ro_rt != NULL) - RT_LOCK(ip6_forward_rt.ro_rt); - } + rtalloc_scoped_ign((struct route *)&ip6_forward_rt, + RTF_PRCLONING, IFSCOPE_NONE); + if (ip6_forward_rt.ro_rt != NULL) + RT_LOCK(ip6_forward_rt.ro_rt); #define rt6_key(r) 
((struct sockaddr_in6 *)((r)->rt_nodes->rn_key)) @@ -726,8 +806,7 @@ ip6_input(m) &rt6_key(ip6_forward_rt.ro_rt)->sin6_addr) #endif ip6_forward_rt.ro_rt->rt_ifp->if_type == IFT_LOOP) { - struct in6_ifaddr *ia6 = - (struct in6_ifaddr *)ip6_forward_rt.ro_rt->rt_ifa; + ia6 = (struct in6_ifaddr *)ip6_forward_rt.ro_rt->rt_ifa; /* * record address information into m_aux. @@ -738,31 +817,32 @@ ip6_input(m) * packets to a tentative, duplicated, or somehow invalid * address must not be accepted. */ + RT_CONVERT_LOCK(ip6_forward_rt.ro_rt); /* just in case */ + IFA_LOCK_SPIN(&ia6->ia_ifa); if (!(ia6->ia6_flags & IN6_IFF_NOTREADY)) { + IFA_UNLOCK(&ia6->ia_ifa); /* this address is ready */ ours = 1; deliverifp = ia6->ia_ifp; /* correct? */ /* Count the packet in the ip address stats */ -#ifndef __APPLE__ - ia6->ia_ifa.if_ipackets++; - ia6->ia_ifa.if_ibytes += m->m_pkthdr.len; -#endif RT_UNLOCK(ip6_forward_rt.ro_rt); + ia6 = NULL; goto hbhcheck; - } else { - RT_UNLOCK(ip6_forward_rt.ro_rt); - /* address is not ready, so discard the packet. */ - nd6log((LOG_INFO, - "ip6_input: packet to an unready address %s->%s\n", - ip6_sprintf(&ip6->ip6_src), - ip6_sprintf(&ip6->ip6_dst))); - goto bad; } + IFA_UNLOCK(&ia6->ia_ifa); + RT_UNLOCK(ip6_forward_rt.ro_rt); + /* address is not ready, so discard the packet. */ + nd6log((LOG_INFO, + "ip6_input: packet to an unready address %s->%s\n", + ip6_sprintf(&ip6->ip6_src), + ip6_sprintf(&ip6->ip6_dst))); + ia6 = NULL; + goto bad; } /* - * FAITH(Firewall Aided Internet Translator) + * FAITH (Firewall Aided Internet Translator) */ #if defined(NFAITH) && 0 < NFAITH if (ip6_keepfaith) { @@ -796,9 +876,7 @@ ip6_input(m) * as our interface address (e.g. multicast addresses, addresses * within FAITH prefixes and such). 
*/ - if (deliverifp && !ip6_getdstifaddr(m)) { - struct in6_ifaddr *ia6; - + if (deliverifp && (ia6 = ip6_getdstifaddr(m)) == NULL) { ia6 = in6_ifawithifp(deliverifp, &ip6->ip6_dst); if (ia6) { if (!ip6_setdstifaddr(m, ia6)) { @@ -808,10 +886,16 @@ ip6_input(m) * to the upper layers. */ } - ifafree(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); + ia6 = NULL; } } + if (ia6 != NULL) { + IFA_REMREF(&ia6->ia_ifa); + ia6 = NULL; + } + /* * Process Hop-by-Hop options header if it's contained. * m may be modified in ip6_hopopts_input(). @@ -825,8 +909,7 @@ ip6_input(m) #if 0 /*touches NULL pointer*/ in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); #endif - lck_mtx_unlock(ip6_mutex); - return; /* m have already been freed */ + goto done; /* m have already been freed */ } /* adjust pointer */ @@ -840,17 +923,16 @@ ip6_input(m) if (ip6->ip6_plen == 0 && plen == 0) { /* * Note that if a valid jumbo payload option is - * contained, ip6_hoptops_input() must set a valid - * (non-zero) payload length to the variable plen. + * contained, ip6_hopopts_input() must set a valid + * (non-zero) payload length to the variable plen. */ ip6stat.ip6s_badoptions++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr); - lck_mtx_unlock(ip6_mutex); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, (caddr_t)&ip6->ip6_plen - (caddr_t)ip6); - return; + goto done; } #ifndef PULLDOWN_TEST /* ip6_hopopts_input() ensures that mbuf is contiguous */ @@ -860,18 +942,31 @@ ip6_input(m) sizeof(struct ip6_hbh)); if (hbh == NULL) { ip6stat.ip6s_tooshort++; - lck_mtx_unlock(ip6_mutex); - return; + goto done; } #endif nxt = hbh->ip6h_nxt; /* - * accept the packet if a router alert option is included - * and we act as an IPv6 router. + * If we are acting as a router and the packet contains a + * router alert option, see if we know the option value. 
+ * Currently, we only support the option value for MLD, in which + * case we should pass the packet to the multicast routing + * daemon. */ - if (rtalert != ~0 && ip6_forwarding) - ours = 1; + if (rtalert != ~0 && ip6_forwarding) { + switch (rtalert) { + case IP6OPT_RTALERT_MLD: + ours = 1; + break; + default: + /* + * RFC2711 requires unrecognized values must be + * silently ignored. + */ + break; + } + } } else nxt = ip6->ip6_nxt; @@ -909,20 +1004,14 @@ ip6_input(m) #if MROUTING if (ip6_mrouter && ip6_mforward(ip6, m->m_pkthdr.rcvif, m)) { ip6stat.ip6s_cantforward++; - m_freem(m); - lck_mtx_unlock(ip6_mutex); - return; + goto bad; } #endif - if (!ours) { - m_freem(m); - lck_mtx_unlock(ip6_mutex); - return; - } + if (!ours) + goto bad; } else if (!ours) { - ip6_forward(m, &ip6_forward_rt, 0, 1); - lck_mtx_unlock(ip6_mutex); - return; + ip6_forward(m, &ip6_forward_rt, 0); + goto done; } ip6 = mtod(m, struct ip6_hdr *); @@ -949,17 +1038,16 @@ ip6_input(m) ip6stat.ip6s_delivered++; in6_ifstat_inc(deliverifp, ifs6_in_deliver); - lck_mtx_unlock(ip6_mutex); injectit: nest = 0; while (nxt != IPPROTO_DONE) { struct ipfilter *filter; - int (*pr_input)(struct mbuf **, int *); + int (*pr_input)(struct mbuf **, int *, int); if (ip6_hdrnestlimit && (++nest > ip6_hdrnestlimit)) { ip6stat.ip6s_toomanyhdr++; - goto badunlocked; + goto bad; } /* @@ -969,24 +1057,9 @@ ip6_input(m) if (m->m_pkthdr.len < off) { ip6stat.ip6s_tooshort++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); - goto badunlocked; - } - -#if 0 - /* - * do we need to do it for every header? yeah, other - * functions can play with it (like re-allocate and copy). 
- */ - mhist = ip6_addaux(m); - if (mhist && M_TRAILINGSPACE(mhist) >= sizeof(nxt)) { - hist = mtod(mhist, caddr_t) + mhist->m_len; - bcopy(&nxt, hist, sizeof(nxt)); - mhist->m_len += sizeof(nxt); - } else { - ip6stat.ip6s_toomanyhdr++; goto bad; } -#endif + #if IPSEC /* @@ -997,7 +1070,7 @@ ip6_input(m) if ((ipsec_bypass == 0) && (ip6_protox[nxt]->pr_flags & PR_LASTHDR) != 0) { if (ipsec6_in_reject(m, NULL)) { IPSEC_STAT_INCREMENT(ipsec6stat.in_polvio); - goto badunlocked; + goto bad; } } #endif @@ -1018,36 +1091,40 @@ ip6_input(m) filter->ipf_filter.cookie, (mbuf_t*)&m, off, nxt); if (result == EJUSTRETURN) { ipf_unref(); - return; + goto done; } if (result != 0) { ipf_unref(); - m_freem(m); - return; + goto bad; } } } ipf_unref(); } + DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL, + struct ip6_hdr *, ip6, struct ifnet *, m->m_pkthdr.rcvif, + struct ip *, NULL, struct ip6_hdr *, ip6); + if ((pr_input = ip6_protox[nxt]->pr_input) == NULL) { m_freem(m); m = NULL; nxt = IPPROTO_DONE; } else if (!(ip6_protox[nxt]->pr_flags & PR_PROTOLOCK)) { lck_mtx_lock(inet6_domain_mutex); - nxt = pr_input(&m, &off); + nxt = pr_input(&m, &off, nxt); lck_mtx_unlock(inet6_domain_mutex); } else { - nxt = pr_input(&m, &off); + nxt = pr_input(&m, &off, nxt); } } +done: + if (ip6_forward_rt.ro_rt != NULL) + rtfree(ip6_forward_rt.ro_rt); return; bad: - lck_mtx_unlock(ip6_mutex); - badunlocked: m_freem(m); - return; + goto done; } /* @@ -1060,8 +1137,13 @@ ip6_setdstifaddr(struct mbuf *m, struct in6_ifaddr *ia6) struct ip6aux *n; n = ip6_addaux(m); - if (n) + if (n != NULL) { + if (ia6 != NULL) + IFA_ADDREF(&ia6->ia_ifa); + if (n->ip6a_dstia6 != NULL) + IFA_REMREF(&n->ip6a_dstia6->ia_ifa); n->ip6a_dstia6 = ia6; + } return (struct ip6aux *)n; /* NULL if failed to set */ } @@ -1072,10 +1154,12 @@ ip6_getdstifaddr(m) struct ip6aux *n; n = ip6_findaux(m); - if (n) - return n->ip6a_dstia6; - else - return NULL; + if (n != NULL) { + if (n->ip6a_dstia6 != NULL) + 
IFA_ADDREF(&n->ip6a_dstia6->ia_ifa); + return (n->ip6a_dstia6); + } + return (NULL); } /* @@ -1083,11 +1167,8 @@ ip6_getdstifaddr(m) * included, the real payload length will be stored in plenp. */ static int -ip6_hopopts_input(plenp, rtalertp, mp, offp) - u_int32_t *plenp; - u_int32_t *rtalertp; /* XXX: should be stored more smart way */ - struct mbuf **mp; - int *offp; +ip6_hopopts_input(uint32_t *plenp, uint32_t *rtalertp, struct mbuf **mp, + int *offp) { struct mbuf *m = *mp; int off = *offp, hbhlen; @@ -1123,11 +1204,11 @@ ip6_hopopts_input(plenp, rtalertp, mp, offp) if (ip6_process_hopopts(m, (u_int8_t *)hbh + sizeof(struct ip6_hbh), hbhlen, rtalertp, plenp) < 0) - return(-1); + return (-1); *offp = off; *mp = m; - return(0); + return (0); } /* @@ -1167,7 +1248,7 @@ ip6_process_hopopts(m, opthead, hbhlen, rtalertp, plenp) } optlen = *(opt + 1) + 2; break; - case IP6OPT_RTALERT: + case IP6OPT_ROUTER_ALERT: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_RTALERT_LEN) { ip6stat.ip6s_toosmall++; @@ -1175,11 +1256,9 @@ ip6_process_hopopts(m, opthead, hbhlen, rtalertp, plenp) } if (*(opt + 1) != IP6OPT_RTALERT_LEN - 2) { /* XXX stat */ - lck_mtx_unlock(ip6_mutex); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); - lck_mtx_lock(ip6_mutex); return(-1); } optlen = IP6OPT_RTALERT_LEN; @@ -1194,11 +1273,9 @@ ip6_process_hopopts(m, opthead, hbhlen, rtalertp, plenp) } if (*(opt + 1) != IP6OPT_JUMBO_LEN - 2) { /* XXX stat */ - lck_mtx_unlock(ip6_mutex); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); - lck_mtx_lock(ip6_mutex); return(-1); } optlen = IP6OPT_JUMBO_LEN; @@ -1210,11 +1287,9 @@ ip6_process_hopopts(m, opthead, hbhlen, rtalertp, plenp) ip6 = mtod(m, struct ip6_hdr *); if (ip6->ip6_plen) { ip6stat.ip6s_badoptions++; - lck_mtx_unlock(ip6_mutex); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt - opthead); - lck_mtx_lock(ip6_mutex); return(-1); } @@ -1236,11 
+1311,9 @@ ip6_process_hopopts(m, opthead, hbhlen, rtalertp, plenp) */ if (*plenp != 0) { ip6stat.ip6s_badoptions++; - lck_mtx_unlock(ip6_mutex); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); - lck_mtx_lock(ip6_mutex); return(-1); } #endif @@ -1250,11 +1323,9 @@ ip6_process_hopopts(m, opthead, hbhlen, rtalertp, plenp) */ if (jumboplen <= IPV6_MAXPACKET) { ip6stat.ip6s_badoptions++; - lck_mtx_unlock(ip6_mutex); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); - lck_mtx_lock(ip6_mutex); return(-1); } *plenp = jumboplen; @@ -1266,9 +1337,8 @@ ip6_process_hopopts(m, opthead, hbhlen, rtalertp, plenp) goto bad; } optlen = ip6_unknown_opt(opt, m, - erroff + opt - opthead, 1); + erroff + opt - opthead); if (optlen == -1) { - /* ip6_unknown opt unlocked ip6_mutex */ return(-1); } optlen += 2; @@ -1290,11 +1360,7 @@ ip6_process_hopopts(m, opthead, hbhlen, rtalertp, plenp) * is not continuous in order to return an ICMPv6 error. 
*/ int -ip6_unknown_opt(optp, m, off, locked) - u_int8_t *optp; - struct mbuf *m; - int off; - int locked; +ip6_unknown_opt(uint8_t *optp, struct mbuf *m, int off) { struct ip6_hdr *ip6; @@ -1306,11 +1372,7 @@ ip6_unknown_opt(optp, m, off, locked) return(-1); case IP6OPT_TYPE_FORCEICMP: /* send ICMP even if multicasted */ ip6stat.ip6s_badoptions++; - if (locked) - lck_mtx_unlock(ip6_mutex); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); - if (locked) - lck_mtx_lock(ip6_mutex); return(-1); case IP6OPT_TYPE_ICMP: /* send ICMP if not multicasted */ ip6stat.ip6s_badoptions++; @@ -1318,109 +1380,147 @@ ip6_unknown_opt(optp, m, off, locked) if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || (m->m_flags & (M_BCAST|M_MCAST))) m_freem(m); - else { - if (locked) - lck_mtx_unlock(ip6_mutex); + else icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); - if (locked) - lck_mtx_lock(ip6_mutex); - } return(-1); } m_freem(m); /* XXX: NOTREACHED */ - return(-1); + return (-1); } /* * Create the "control" list for this pcb. - * The function will not modify mbuf chain at all. + * These functions will not modify mbuf chain at all. * - * with KAME mbuf chain restriction: + * With KAME mbuf chain restriction: * The routine will be called from upper layer handlers like tcp6_input(). * Thus the routine assumes that the caller (tcp6_input) have already * called IP6_EXTHDR_CHECK() and all the extension headers are located in the * very first mbuf on the mbuf chain. + * + * ip6_savecontrol_v4 will handle those options that are possible to be + * set on a v4-mapped socket. + * ip6_savecontrol will directly call ip6_savecontrol_v4 to handle those + * options and handle the v6-only ones itself. 
*/ -void -ip6_savecontrol(in6p, mp, ip6, m) - struct inpcb *in6p; - struct mbuf **mp; - struct ip6_hdr *ip6; - struct mbuf *m; +struct mbuf ** +ip6_savecontrol_v4(struct inpcb *inp, struct mbuf *m, struct mbuf **mp, + int *v4only) { - int rthdr_exist = 0; + struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); -#if SO_TIMESTAMP - if ((in6p->in6p_socket->so_options & SO_TIMESTAMP) != 0) { + if ((inp->inp_socket->so_options & SO_TIMESTAMP) != 0) { struct timeval tv; microtime(&tv); - *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv), - SCM_TIMESTAMP, SOL_SOCKET); - if (*mp) { - mp = &(*mp)->m_next; - } + mp = sbcreatecontrol_mbuf((caddr_t) &tv, sizeof(tv), + SCM_TIMESTAMP, SOL_SOCKET, mp); + if (*mp == NULL) + return NULL; } -#endif + if ((inp->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + uint64_t time; - /* some OSes call this logic with IPv4 packet, for SO_TIMESTAMP */ - if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) - return; + time = mach_absolute_time(); + mp = sbcreatecontrol_mbuf((caddr_t) &time, sizeof(time), + SCM_TIMESTAMP_MONOTONIC, SOL_SOCKET, mp); + if (*mp == NULL) + return NULL; + } + if ((inp->inp_socket->so_flags & SOF_RECV_TRAFFIC_CLASS) != 0) { + int tc = m->m_pkthdr.prio; + + mp = sbcreatecontrol_mbuf((caddr_t) &tc, sizeof(tc), + SO_TRAFFIC_CLASS, SOL_SOCKET, mp); + if (*mp == NULL) + return NULL; + } + + if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { + if (v4only != NULL) + *v4only = 1; + return (mp); + } + +#define IS2292(inp, x, y) (((inp)->inp_flags & IN6P_RFC2292) ? (x) : (y)) /* RFC 2292 sec. 5 */ - if ((in6p->in6p_flags & IN6P_PKTINFO) != 0) { + if ((inp->inp_flags & IN6P_PKTINFO) != 0) { struct in6_pktinfo pi6; + bcopy(&ip6->ip6_dst, &pi6.ipi6_addr, sizeof(struct in6_addr)); - if (IN6_IS_SCOPE_LINKLOCAL(&pi6.ipi6_addr)) - pi6.ipi6_addr.s6_addr16[1] = 0; - pi6.ipi6_ifindex = (m && m->m_pkthdr.rcvif) - ? 
m->m_pkthdr.rcvif->if_index - : 0; - *mp = sbcreatecontrol((caddr_t) &pi6, - sizeof(struct in6_pktinfo), IPV6_PKTINFO, - IPPROTO_IPV6); - if (*mp) - mp = &(*mp)->m_next; + in6_clearscope(&pi6.ipi6_addr); /* XXX */ + pi6.ipi6_ifindex = + (m && m->m_pkthdr.rcvif) ? m->m_pkthdr.rcvif->if_index : 0; + + mp = sbcreatecontrol_mbuf((caddr_t) &pi6, + sizeof(struct in6_pktinfo), + IS2292(inp, IPV6_2292PKTINFO, IPV6_PKTINFO), IPPROTO_IPV6, mp); + if (*mp == NULL) + return NULL; } - if ((in6p->in6p_flags & IN6P_HOPLIMIT) != 0) { + if ((inp->inp_flags & IN6P_HOPLIMIT) != 0) { int hlim = ip6->ip6_hlim & 0xff; - *mp = sbcreatecontrol((caddr_t) &hlim, - sizeof(int), IPV6_HOPLIMIT, IPPROTO_IPV6); - if (*mp) - mp = &(*mp)->m_next; + + mp = sbcreatecontrol_mbuf((caddr_t) &hlim, sizeof(int), + IS2292(inp, IPV6_2292HOPLIMIT, IPV6_HOPLIMIT), + IPPROTO_IPV6, mp); + if (*mp == NULL) + return NULL; } - if ((in6p->in6p_flags & IN6P_TCLASS) != 0) { - u_int32_t flowinfo; - int tclass; + if (v4only != NULL) + *v4only = 0; + return (mp); +} + +int +ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp) +{ + struct mbuf **np; + struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); + int v4only = 0; + + *mp = NULL; + np = ip6_savecontrol_v4(in6p, m, mp, &v4only); + if (np == NULL) + goto no_mbufs; - flowinfo = (u_int32_t)ntohl(ip6->ip6_flow & IPV6_FLOWINFO_MASK); - flowinfo >>= 20; + mp = np; + if (v4only) + return(0); - tclass = flowinfo & 0xff; - *mp = sbcreatecontrol((caddr_t) &tclass, sizeof(tclass), - IPV6_TCLASS, IPPROTO_IPV6); - if (*mp) - mp = &(*mp)->m_next; - } + if ((in6p->inp_flags & IN6P_TCLASS) != 0) { + u_int32_t flowinfo; + int tclass; + + flowinfo = (u_int32_t)ntohl(ip6->ip6_flow & IPV6_FLOWINFO_MASK); + flowinfo >>= 20; + + tclass = flowinfo & 0xff; + mp = sbcreatecontrol_mbuf((caddr_t) &tclass, sizeof(tclass), + IPV6_TCLASS, IPPROTO_IPV6, mp); + if (*mp == NULL) + goto no_mbufs; + } /* * IPV6_HOPOPTS socket option. 
Recall that we required super-user * privilege for the option (see ip6_ctloutput), but it might be too * strict, since there might be some hop-by-hop options which can be * returned to normal user. - * See RFC 2292 section 6. + * See also RFC 2292 section 6 (or RFC 3542 section 8). */ - if ((in6p->in6p_flags & IN6P_HOPOPTS) != 0) { + if ((in6p->inp_flags & IN6P_HOPOPTS) != 0) { /* * Check if a hop-by-hop options header is contatined in the * received packet, and if so, store the options as ancillary * data. Note that a hop-by-hop options header must be - * just after the IPv6 header, which fact is assured through - * the IPv6 input processing. + * just after the IPv6 header, which is assured through the + * IPv6 input processing. */ ip6 = mtod(m, struct ip6_hdr *); if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { @@ -1438,67 +1538,38 @@ ip6_savecontrol(in6p, mp, ip6, m) ip6->ip6_nxt); if (ext == NULL) { ip6stat.ip6s_tooshort++; - return; + return(0); } hbh = mtod(ext, struct ip6_hbh *); hbhlen = (hbh->ip6h_len + 1) << 3; if (hbhlen != ext->m_len) { m_freem(ext); ip6stat.ip6s_tooshort++; - return; + return(0); } #endif /* - * XXX: We copy whole the header even if a jumbo - * payload option is included, which option is to - * be removed before returning in the RFC 2292. - * Note: this constraint is removed in 2292bis. + * XXX: We copy the whole header even if a + * jumbo payload option is included, the option which + * is to be removed before returning according to + * RFC2292. 
+ * Note: this constraint is removed in RFC3542 */ - *mp = sbcreatecontrol((caddr_t)hbh, hbhlen, - IPV6_HOPOPTS, IPPROTO_IPV6); - if (*mp) - mp = &(*mp)->m_next; + mp = sbcreatecontrol_mbuf((caddr_t)hbh, hbhlen, + IS2292(in6p, IPV6_2292HOPOPTS, IPV6_HOPOPTS), + IPPROTO_IPV6, mp); + #if PULLDOWN_TEST m_freem(ext); #endif - } - } - - /* IPV6_DSTOPTS and IPV6_RTHDR socket options */ - if ((in6p->in6p_flags & (IN6P_DSTOPTS | IN6P_RTHDRDSTOPTS)) != 0) { - int proto, off, nxt; - - /* - * go through the header chain to see if a routing header is - * contained in the packet. We need this information to store - * destination options headers (if any) properly. - * XXX: performance issue. We should record this info when - * processing extension headers in incoming routine. - * (todo) use m_aux? - */ - proto = IPPROTO_IPV6; - off = 0; - nxt = -1; - while (1) { - int newoff; - - newoff = ip6_nexthdr(m, off, proto, &nxt); - if (newoff < 0) - break; - if (newoff < off) /* invalid, check for safety */ - break; - if ((proto = nxt) == IPPROTO_ROUTING) { - rthdr_exist = 1; - break; + if (*mp == NULL) { + goto no_mbufs; } - off = newoff; } } - if ((in6p->in6p_flags & - (IN6P_RTHDR | IN6P_DSTOPTS | IN6P_RTHDRDSTOPTS)) != 0) { - ip6 = mtod(m, struct ip6_hdr *); + if ((in6p->inp_flags & (IN6P_RTHDR | IN6P_DSTOPTS)) != 0) { int nxt = ip6->ip6_nxt, off = sizeof(struct ip6_hdr); /* @@ -1543,7 +1614,7 @@ ip6_savecontrol(in6p, mp, ip6, m) ext = ip6_pullexthdr(m, off, nxt); if (ext == NULL) { ip6stat.ip6s_tooshort++; - return; + return(0); } ip6e = mtod(ext, struct ip6_ext *); if (nxt == IPPROTO_AH) @@ -1553,30 +1624,39 @@ ip6_savecontrol(in6p, mp, ip6, m) if (elen != ext->m_len) { m_freem(ext); ip6stat.ip6s_tooshort++; - return; + return(0); } #endif switch (nxt) { case IPPROTO_DSTOPTS: - if ((in6p->in6p_flags & IN6P_DSTOPTS) == 0) + if (!(in6p->inp_flags & IN6P_DSTOPTS)) break; - *mp = sbcreatecontrol((caddr_t)ip6e, elen, - IPV6_DSTOPTS, - IPPROTO_IPV6); - if (*mp) - mp = &(*mp)->m_next; + 
mp = sbcreatecontrol_mbuf((caddr_t)ip6e, elen, + IS2292(in6p, + IPV6_2292DSTOPTS, IPV6_DSTOPTS), + IPPROTO_IPV6, mp); + if (*mp == NULL) { +#if PULLDOWN_TEST + m_freem(ext); +#endif + goto no_mbufs; + } break; case IPPROTO_ROUTING: - if (!in6p->in6p_flags & IN6P_RTHDR) + if (!(in6p->inp_flags & IN6P_RTHDR)) break; - *mp = sbcreatecontrol((caddr_t)ip6e, elen, - IPV6_RTHDR, - IPPROTO_IPV6); - if (*mp) - mp = &(*mp)->m_next; + mp = sbcreatecontrol_mbuf((caddr_t)ip6e, elen, + IS2292(in6p, IPV6_2292RTHDR, IPV6_RTHDR), + IPPROTO_IPV6, mp); + if (*mp == NULL) { +#if PULLDOWN_TEST + m_freem(ext); +#endif + goto no_mbufs; + } break; case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? */ @@ -1584,7 +1664,7 @@ ip6_savecontrol(in6p, mp, ip6, m) default: /* - * other cases have been filtered in the above. + * other cases have been filtered in the above. * none will visit this case. here we supply * the code just in case (nxt overwritten or * other cases). @@ -1608,7 +1688,49 @@ ip6_savecontrol(in6p, mp, ip6, m) loopend: ; } + return(0); +no_mbufs: + ip6stat.ip6s_pktdropcntrl++; + /* XXX increment a stat to show the failure */ + return(ENOBUFS); +} +#undef IS2292 + +void +ip6_notify_pmtu(struct inpcb *in6p, struct sockaddr_in6 *dst, u_int32_t *mtu) +{ + struct socket *so; + struct mbuf *m_mtu; + struct ip6_mtuinfo mtuctl; + + so = in6p->inp_socket; + if (mtu == NULL) + return; + +#ifdef DIAGNOSTIC + if (so == NULL) /* I believe this is impossible */ + panic("ip6_notify_pmtu: socket is NULL"); +#endif + + bzero(&mtuctl, sizeof(mtuctl)); /* zero-clear for safety */ + mtuctl.ip6m_mtu = *mtu; + mtuctl.ip6m_addr = *dst; + if (sa6_recoverscope(&mtuctl.ip6m_addr)) + return; + + if ((m_mtu = sbcreatecontrol((caddr_t)&mtuctl, sizeof(mtuctl), + IPV6_PATHMTU, IPPROTO_IPV6)) == NULL) + return; + + if (sbappendaddr(&so->so_rcv, (struct sockaddr *)dst, NULL, m_mtu, NULL) + == 0) { + m_freem(m_mtu); + /* XXX: should count statistics */ + } else + sorwakeup(so); + + return; } #if 
PULLDOWN_TEST @@ -1837,8 +1959,8 @@ ip6_addaux( tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_INET6, NULL); if (tag == NULL) { /* Allocate a tag */ - tag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_INET6, - sizeof (struct ip6aux), M_DONTWAIT); + tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_INET6, + sizeof (struct ip6aux), M_DONTWAIT, m); /* Attach it to the mbuf */ if (tag) { @@ -1855,7 +1977,7 @@ ip6_findaux( { struct m_tag *tag; - tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_ENCAP, NULL); + tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_INET6, NULL); return tag ? (struct ip6aux*)(tag + 1) : NULL; } @@ -1866,12 +1988,35 @@ ip6_delaux( { struct m_tag *tag; - tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_ENCAP, NULL); + tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_INET6, NULL); if (tag) { m_tag_delete(m, tag); } } +/* + * Called by m_tag_free(). + */ +void +ip6_destroyaux(struct ip6aux *n) +{ + if (n->ip6a_dstia6 != NULL) { + IFA_REMREF(&n->ip6a_dstia6->ia_ifa); + n->ip6a_dstia6 = NULL; + } +} + +/* + * Called by m_tag_copy() + */ +void +ip6_copyaux(struct ip6aux *src, struct ip6aux *dst) +{ + bcopy(src, dst, sizeof (*dst)); + if (dst->ip6a_dstia6 != NULL) + IFA_ADDREF(&dst->ip6a_dstia6->ia_ifa); +} + /* * System control for IP6 */ diff --git a/bsd/netinet6/ip6_mroute.c b/bsd/netinet6/ip6_mroute.c index da8c4fc96..39f146284 100644 --- a/bsd/netinet6/ip6_mroute.c +++ b/bsd/netinet6/ip6_mroute.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2008 Apple Inc. All rights reserved. + * Copyright (c) 2003-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -104,7 +104,9 @@ #include #include +#include #include +#include #include #include @@ -127,7 +129,6 @@ static int socket_send(struct socket *, struct mbuf *, static int register_send(struct ip6_hdr *, struct mif6 *, struct mbuf *); -extern lck_mtx_t *ip6_mutex; /* * Globals. 
All but ip6_mrouter, ip6_mrtproto and mrt6stat could be static, * except for netstat or debugging purposes. @@ -258,9 +259,6 @@ static int del_m6if(mifi_t *); static int add_m6fc(struct mf6cctl *); static int del_m6fc(struct mf6cctl *); -#ifndef __APPLE__ -static struct callout expire_upcalls_ch; -#endif /* * Handle MRT setsockopt commands to modify the multicast routing tables. */ @@ -478,12 +476,7 @@ ip6_mrouter_init(so, v, cmd) pim6 = 0;/* used for stubbing out/in pim stuff */ -#ifndef __APPLE__ - callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, - expire_upcalls, NULL); -#else timeout(expire_upcalls, (caddr_t)NULL, EXPIRE_TIMEOUT); -#endif #if MRT6DEBUG if (mrt6debug) @@ -540,20 +533,12 @@ ip6_mrouter_done() } } } -#if notyet - bzero((caddr_t)qtable, sizeof(qtable)); - bzero((caddr_t)tbftable, sizeof(tbftable)); -#endif bzero((caddr_t)mif6table, sizeof(mif6table)); nummifs = 0; pim6 = 0; /* used to stub out/in pim specific code */ -#ifndef __APPLE__ - callout_stop(&expire_upcalls_ch); -#else untimeout(expire_upcalls, (caddr_t)NULL); -#endif /* * Free all multicast forwarding cache entries. @@ -617,7 +602,9 @@ add_m6if(mifcp) return EINVAL; mifp = mif6table + mifcp->mif6c_mifi; if (mifp->m6_ifp) - return EADDRINUSE; /* XXX: is it appropriate? */ + return (EADDRINUSE); /* XXX: is it appropriate? 
*/ + if (mifcp->mif6c_pifi == 0 || mifcp->mif6c_pifi > if_index) + return (ENXIO); ifnet_head_lock_shared(); if (mifcp->mif6c_pifi == 0 || mifcp->mif6c_pifi > if_index) { @@ -653,10 +640,7 @@ add_m6if(mifcp) mifp->m6_flags = mifcp->mif6c_flags; mifp->m6_ifp = ifp; -#if notyet - /* scaling up here allows division by 1024 in critical code */ - mifp->m6_rate_limit = mifcp->mif6c_rate_limit * 1024 / 1000; -#endif + /* initialize per mif pkt counters */ mifp->m6_pkt_in = 0; mifp->m6_pkt_out = 0; @@ -705,10 +689,6 @@ del_m6if(mifip) if_allmulti(ifp, 0); } -#if notyet - bzero((caddr_t)qtable[*mifip], sizeof(qtable[*mifip])); - bzero((caddr_t)mifp->m6_tbf, sizeof(*(mifp->m6_tbf))); -#endif bzero((caddr_t)mifp, sizeof(*mifp)); /* Adjust nummifs down */ @@ -1285,12 +1265,7 @@ expire_upcalls( } } -#ifndef __APPLE__ - callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, - expire_upcalls, NULL); -#else timeout(expire_upcalls, (caddr_t)NULL, EXPIRE_TIMEOUT); -#endif } /* @@ -1306,6 +1281,9 @@ ip6_mdq(m, ifp, rt) mifi_t mifi, iif; struct mif6 *mifp; int plen = m->m_pkthdr.len; + struct in6_addr src0, dst0; /* copies for local work */ + u_int32_t iszone, idzone, oszone, odzone; + int error = 0; /* * Macro to send packet on mif. Since RSVP packets don't get counted on @@ -1437,7 +1415,14 @@ ip6_mdq(m, ifp, rt) * For each mif, forward a copy of the packet if there are group * members downstream on the interface. */ - for (mifp = mif6table, mifi = 0; mifi < nummifs; mifp++, mifi++) + src0 = ip6->ip6_src; + dst0 = ip6->ip6_dst; + if ((error = in6_setscope(&src0, ifp, &iszone)) != 0 || + (error = in6_setscope(&dst0, ifp, &idzone)) != 0) { + ip6stat.ip6s_badscope++; + return (error); + } + for (mifp = mif6table, mifi = 0; mifi < nummifs; mifp++, mifi++) { if (IF_ISSET(mifi, &rt->mf6c_ifset)) { /* * check if the outgoing packet is going to break @@ -1445,23 +1430,25 @@ ip6_mdq(m, ifp, rt) * XXX For packets through PIM register tunnel * interface, we believe a routing daemon. 
*/ - if ((mif6table[rt->mf6c_parent].m6_flags & - MIFF_REGISTER) == 0 && - (mif6table[mifi].m6_flags & MIFF_REGISTER) == 0 && - (in6_addr2scopeid(ifp, &ip6->ip6_dst) != - in6_addr2scopeid(mif6table[mifi].m6_ifp, - &ip6->ip6_dst) || - in6_addr2scopeid(ifp, &ip6->ip6_src) != - in6_addr2scopeid(mif6table[mifi].m6_ifp, - &ip6->ip6_src))) { - ip6stat.ip6s_badscope++; - continue; + if (!(mif6table[rt->mf6c_parent].m6_flags & + MIFF_REGISTER) && + !(mif6table[mifi].m6_flags & MIFF_REGISTER)) { + if (in6_setscope(&src0, mif6table[mifi].m6_ifp, + &oszone) || + in6_setscope(&dst0, mif6table[mifi].m6_ifp, + &odzone) || + iszone != oszone || + idzone != odzone) { + ip6stat.ip6s_badscope++; + continue; + } } mifp->m6_pkt_out++; mifp->m6_bytes_out += plen; MC6_SEND(ip6, mifp, m); } + } return 0; } @@ -1501,16 +1488,22 @@ phyint_send(ip6, mifp, m) * sending queue. */ if (m->m_pkthdr.rcvif == NULL) { - struct ip6_moptions im6o; + struct ip6_moptions *im6o; - im6o.im6o_multicast_ifp = ifp; - /* XXX: ip6_output will override ip6->ip6_hlim */ - im6o.im6o_multicast_hlim = ip6->ip6_hlim; - im6o.im6o_multicast_loop = 1; - error = ip6_output(mb_copy, NULL, &ro, - IPV6_FORWARDING, &im6o, NULL, 0); + im6o = ip6_allocmoptions(M_DONTWAIT); + if (im6o == NULL) { + m_freem(mb_copy); + return; + } + im6o->im6o_multicast_ifp = ifp; + /* XXX: ip6_output will override ip6->ip6_hlim */ + im6o->im6o_multicast_hlim = ip6->ip6_hlim; + im6o->im6o_multicast_loop = 1; + error = ip6_output(mb_copy, NULL, &ro, IPV6_FORWARDING, + im6o, NULL, NULL); + IM6O_REMREF(im6o); #if MRT6DEBUG if (mrt6debug & DEBUG_XMIT) log(LOG_DEBUG, "phyint_send on mif %d err %d\n", @@ -1524,10 +1517,11 @@ phyint_send(ip6, mifp, m) * on the outgoing interface, loop back a copy. 
*/ dst6 = (struct sockaddr_in6 *)&ro.ro_dst; - ifnet_lock_shared(ifp); + in6_multihead_lock_shared(); IN6_LOOKUP_MULTI(ip6->ip6_dst, ifp, in6m); - ifnet_lock_done(ifp); + in6_multihead_lock_done(); if (in6m != NULL) { + IN6M_REMREF(in6m); dst6->sin6_len = sizeof(struct sockaddr_in6); dst6->sin6_family = AF_INET6; dst6->sin6_addr = ip6->ip6_dst; @@ -1552,10 +1546,8 @@ phyint_send(ip6, mifp, m) mb_copy->m_pkthdr.csum_data = 0; mb_copy->m_pkthdr.csum_flags = 0; - lck_mtx_unlock(ip6_mutex); error = dlil_output(ifp, PF_INET6, mb_copy, NULL, (struct sockaddr *)&ro.ro_dst, 0); - lck_mtx_lock(ip6_mutex); #else error = (*ifp->if_output)(ifp, mb_copy, (struct sockaddr *)&ro.ro_dst, @@ -1567,21 +1559,28 @@ phyint_send(ip6, mifp, m) mifp - mif6table, error); #endif } else { -#if MULTICAST_PMTUD - icmp6_error(mb_copy, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu); -#else + /* + * pMTU discovery is intentionally disabled by default, since + * various router may notify pMTU in multicast, which can be + * a DDoS to a router + */ + if (ip6_mcast_pmtu) + icmp6_error(mb_copy, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu); #if MRT6DEBUG - if (mrt6debug & DEBUG_XMIT) - log(LOG_DEBUG, - "phyint_send: packet too big on %s o %s g %s" - " size %d(discarded)\n", - if_name(ifp), - ip6_sprintf(&ip6->ip6_src), - ip6_sprintf(&ip6->ip6_dst), - mb_copy->m_pkthdr.len); + else { + if (mrt6debug & DEBUG_XMIT) { + log(LOG_DEBUG, + "phyint_send: packet too big on %s o %s " + "g %s size %d(discarded)\n", + if_name(ifp), + ip6_sprintf(&ip6->ip6_src), + ip6_sprintf(&ip6->ip6_dst), + mb_copy->m_pkthdr.len); + } + } #endif /* MRT6DEBUG */ m_freem(mb_copy); /* simply discard the packet */ -#endif + } } @@ -1666,9 +1665,7 @@ register_send(ip6, mif, m) * is stripped off, and the inner packet is passed to register_mforward. 
*/ int -pim6_input(mp, offp) - struct mbuf **mp; - int *offp; +pim6_input(struct mbuf **mp, int *offp, int proto) { struct pim *pim; /* pointer to a pim struct */ struct ip6_hdr *ip6; @@ -1676,13 +1673,11 @@ pim6_input(mp, offp) struct mbuf *m = *mp; int minlen; int off = *offp; - int proto; ++pim6stat.pim6s_rcv_total; ip6 = mtod(m, struct ip6_hdr *); pimlen = m->m_pkthdr.len - *offp; - proto = ip6->ip6_nxt; /* * Validate lengths @@ -1881,9 +1876,7 @@ pim6_input(mp, offp) #ifdef __APPLE__ if (lo_ifp) { - lck_mtx_unlock(ip6_mutex); dlil_output(lo_ifp, PF_INET6, m, 0, (struct sockaddr *)&dst, 0); - lck_mtx_lock(ip6_mutex); } else { printf("Warning: pim6_input call to dlil_find_dltag failed!\n"); diff --git a/bsd/netinet6/ip6_mroute.h b/bsd/netinet6/ip6_mroute.h index 5eef448db..193efea2a 100644 --- a/bsd/netinet6/ip6_mroute.h +++ b/bsd/netinet6/ip6_mroute.h @@ -79,9 +79,9 @@ /* * Multicast Routing set/getsockopt commands. */ -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #define MRT6_OINIT 100 /* initialize forwarder (omrt6msg) */ -#endif +#endif /* XNU_KERNEL_PRIVATE */ #define MRT6_DONE 101 /* shut down forwarder */ #define MRT6_ADD_MIF 102 /* add multicast interface */ #define MRT6_DEL_MIF 103 /* delete multicast interface */ @@ -164,7 +164,7 @@ struct mrt6stat { u_quad_t mrt6s_upq_sockfull; /* upcalls dropped - socket full */ }; -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #if MRT6_OINIT /* * Struct used to communicate from kernel to multicast router @@ -185,7 +185,7 @@ struct omrt6msg { struct in6_addr im6_src, im6_dst; }; #endif -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ /* * Structure used to communicate from kernel to multicast router. 
@@ -229,7 +229,7 @@ struct sioc_mif_req6 { u_quad_t obytes; /* Output byte count on mif */ }; -#if defined(KERNEL_PRIVATE) +#if defined(XNU_KERNEL_PRIVATE) struct sioc_mif_req6_32 { mifi_t mifi; u_quad_t icount; @@ -245,7 +245,7 @@ struct sioc_mif_req6_64 { u_quad_t ibytes; u_quad_t obytes; } __attribute__((aligned(8))); -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #ifdef PRIVATE /* @@ -314,14 +314,14 @@ struct rtdetq { /* XXX: rtdetq is also defined in ip_mroute.h */ #endif /* _NETINET_IP_MROUTE_H_ */ #if MROUTING -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE extern struct mrt6stat mrt6stat; extern int ip6_mrouter_set(struct socket *, struct sockopt *); extern int ip6_mrouter_get(struct socket *, struct sockopt *); extern int ip6_mrouter_done(void); extern int mrt6_ioctl(u_long, caddr_t); -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #endif /* PRIVATE */ #endif diff --git a/bsd/netinet6/ip6_output.c b/bsd/netinet6/ip6_output.c index 309686f7f..7abf54ca7 100644 --- a/bsd/netinet6/ip6_output.c +++ b/bsd/netinet6/ip6_output.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -110,6 +110,11 @@ #include #include #include +#include +#include +#include + +#include #include #include @@ -120,10 +125,13 @@ #include #include #include +#include #include #include #include #include +#include +#include #if IPSEC #include @@ -133,7 +141,6 @@ #include extern int ipsec_bypass; #endif /* IPSEC */ -extern lck_mtx_t *nd6_mutex; #if CONFIG_MACF_NET #include @@ -161,23 +168,54 @@ struct ip6_exthdrs { struct mbuf *ip6e_dest2; }; +int ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt); static int ip6_pcbopts(struct ip6_pktopts **, struct mbuf *, struct socket *, struct sockopt *sopt); -static int ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt); +static int ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt, int uproto); static int ip6_getpcbopt(struct ip6_pktopts *pktopt, int optname, struct sockopt *sopt); -static int ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt); -static int ip6_setmoptions(int, struct inpcb *, struct mbuf *); -static int ip6_getmoptions(int, struct ip6_moptions *, struct mbuf **); +static int ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt, int sticky, int cmsg, int uproto); +static void im6o_trace(struct ip6_moptions *, int); static int ip6_copyexthdr(struct mbuf **, caddr_t, int); static int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int, struct ip6_frag **); static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t); static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *); +static int ip6_getpmtu (struct route_in6 *, struct route_in6 *, + struct ifnet *, struct in6_addr *, u_int32_t *, int *); + +#define IM6O_TRACE_HIST_SIZE 32 /* size of trace history */ + +/* For gdb */ +__private_extern__ unsigned int im6o_trace_hist_size = IM6O_TRACE_HIST_SIZE; + +struct ip6_moptions_dbg { + struct ip6_moptions im6o; /* ip6_moptions */ + u_int16_t im6o_refhold_cnt; /* # of 
IM6O_ADDREF */ + u_int16_t im6o_refrele_cnt; /* # of IM6O_REMREF */ + /* + * Alloc and free callers. + */ + ctrace_t im6o_alloc; + ctrace_t im6o_free; + /* + * Circular lists of IM6O_ADDREF and IM6O_REMREF callers. + */ + ctrace_t im6o_refhold[IM6O_TRACE_HIST_SIZE]; + ctrace_t im6o_refrele[IM6O_TRACE_HIST_SIZE]; +}; + +#if DEBUG +static unsigned int im6o_debug = 1; /* debugging (enabled) */ +#else +static unsigned int im6o_debug; /* debugging (disabled) */ +#endif /* !DEBUG */ + +static unsigned int im6o_size; /* size of zone element */ +static struct zone *im6o_zone; /* zone for ip6_moptions */ + +#define IM6O_ZONE_MAX 64 /* maximum elements in zone */ +#define IM6O_ZONE_NAME "ip6_moptions" /* zone name */ -extern int ip_createmoptions(struct ip_moptions **imop); -extern int ip_addmembership(struct ip_moptions *imo, struct ip_mreq *mreq); -extern int ip_dropmembership(struct ip_moptions *imo, struct ip_mreq *mreq); -extern lck_mtx_t *ip6_mutex; /* * IP6 output. The packet in mbuf chain m contains a skeletal IP6 @@ -198,32 +236,39 @@ ip6_output( int flags, struct ip6_moptions *im6o, struct ifnet **ifpp, /* XXX: just for statistics */ - int locked) + struct ip6_out_args *ip6oa) { struct ip6_hdr *ip6, *mhip6; - struct ifnet *ifp, *origifp; + struct ifnet *ifp = NULL, *origifp = NULL; struct mbuf *m = m0; int hlen, tlen, len, off; struct route_in6 ip6route; - struct sockaddr_in6 *dst; + struct rtentry *rt = NULL; + struct sockaddr_in6 *dst, src_sa, dst_sa; int error = 0; struct in6_ifaddr *ia = NULL; u_int32_t mtu; + int alwaysfrag = 0, dontfrag = 0; u_int32_t optlen = 0, plen = 0, unfragpartlen = 0; struct ip6_exthdrs exthdrs; - struct in6_addr finaldst; + struct in6_addr finaldst, src0, dst0; + u_int32_t zone; struct route_in6 *ro_pmtu = NULL; int hdrsplit = 0; int needipsec = 0; ipfilter_t inject_filter_ref; - + int tso; + unsigned int ifscope; + unsigned int nocell; + boolean_t select_srcif; + struct ipf_pktopts *ippo = NULL, ipf_pktopts; + u_int32_t ifmtu; + #if 
IPSEC int needipsectun = 0; struct socket *so = NULL; struct secpolicy *sp = NULL; - if (!locked) - lck_mtx_lock(ip6_mutex); /* for AH processing. stupid to have "socket" variable in IP layer... */ if (ipsec_bypass == 0) { @@ -232,10 +277,32 @@ ip6_output( } #endif /* IPSEC */ + bzero(&ipf_pktopts, sizeof(struct ipf_pktopts)); + ippo = &ipf_pktopts; + ip6 = mtod(m, struct ip6_hdr *); inject_filter_ref = ipf_get_inject_filter(m); + finaldst = ip6->ip6_dst; + if (ip6_doscopedroute && (flags & IPV6_OUTARGS)) { + select_srcif = !(flags & (IPV6_FORWARDING | IPV6_UNSPECSRC | IPV6_FLAG_NOSRCIFSEL)); + ifscope = ip6oa->ip6oa_boundif; + ipf_pktopts.ippo_flags = IPPOF_BOUND_IF; + ipf_pktopts.ippo_flags |= (ifscope << IPPOF_SHIFT_IFSCOPE); + } else { + select_srcif = FALSE; + ifscope = IFSCOPE_NONE; + } + + if (flags & IPV6_OUTARGS) { + nocell = ip6oa->ip6oa_nocell; + if (nocell) + ipf_pktopts.ippo_flags |= IPPOF_NO_IFT_CELLULAR; + } else { + nocell = 0; + } + #define MAKE_EXTHDR(hp, mp) \ do { \ if (hp) { \ @@ -253,7 +320,19 @@ ip6_output( /* Hop-by-Hop options header */ MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh); /* Destination options header(1st part) */ - MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1); + if (opt->ip6po_rthdr) { + /* + * Destination options header(1st part) + * This only makes sense with a routing header. + * See Section 9.2 of RFC 3542. + * Disabling this part just for MIP6 convenience is + * a bad idea. We need to think carefully about a + * way to make the advanced API coexist with MIP6 + * options, which might automatically be inserted in + * the kernel. + */ + MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1); + } /* Routing header */ MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr); /* Destination options header(2nd part) */ @@ -314,12 +393,24 @@ ip6_output( * Keep the length of the unfragmentable part for fragmentation. 
*/ optlen = 0; - if (exthdrs.ip6e_hbh) optlen += exthdrs.ip6e_hbh->m_len; - if (exthdrs.ip6e_dest1) optlen += exthdrs.ip6e_dest1->m_len; - if (exthdrs.ip6e_rthdr) optlen += exthdrs.ip6e_rthdr->m_len; + if (exthdrs.ip6e_hbh) + optlen += exthdrs.ip6e_hbh->m_len; + if (exthdrs.ip6e_dest1) + optlen += exthdrs.ip6e_dest1->m_len; + if (exthdrs.ip6e_rthdr) + optlen += exthdrs.ip6e_rthdr->m_len; unfragpartlen = optlen + sizeof(struct ip6_hdr); + /* NOTE: we don't add AH/ESP length here. do that later. */ - if (exthdrs.ip6e_dest2) optlen += exthdrs.ip6e_dest2->m_len; + if (exthdrs.ip6e_dest2) + optlen += exthdrs.ip6e_dest2->m_len; + + + if (needipsec && + (m->m_pkthdr.csum_flags & CSUM_DELAY_IPV6_DATA) != 0) { + in6_delayed_cksum(m, sizeof(struct ip6_hdr) + optlen); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IPV6_DATA; + } /* * If we need IPsec, or there is at least one extension header, @@ -419,14 +510,14 @@ ip6_output( struct ipfilter *filter; int seen = (inject_filter_ref == 0); int fixscope = 0; - struct ipf_pktopts *ippo = 0, ipf_pktopts; - + if (im6o != NULL && IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { - ippo = &ipf_pktopts; - ippo->ippo_flags = IPPOF_MCAST_OPTS; + ippo->ippo_flags |= IPPOF_MCAST_OPTS; + IM6O_LOCK(im6o); ippo->ippo_mcast_ifnet = im6o->im6o_multicast_ifp; ippo->ippo_mcast_ttl = im6o->im6o_multicast_hlim; ippo->ippo_mcast_loop = im6o->im6o_multicast_loop; + IM6O_UNLOCK(im6o); } /* Hack: embed the scope_id in the destination */ @@ -436,7 +527,6 @@ ip6_output( ip6->ip6_dst.s6_addr16[1] = htons(ro->ro_dst.sin6_scope_id); } { - lck_mtx_unlock(ip6_mutex); ipf_ref(); TAILQ_FOREACH(filter, &ipv6_filters, ipf_link) { /* @@ -452,18 +542,15 @@ ip6_output( result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo); if (result == EJUSTRETURN) { ipf_unref(); - locked = 1; /* Don't want to take lock to unlock it right away */ goto done; } if (result != 0) { ipf_unref(); - locked = 1; /* Don't want to take lock to unlock it right away */ goto bad; 
} } } ipf_unref(); - lck_mtx_lock(ip6_mutex); } ip6 = mtod(m, struct ip6_hdr *); /* Hack: cleanup embedded scope_id if we put it there */ @@ -495,10 +582,8 @@ ip6_output( bzero(&state, sizeof(state)); state.m = m; - lck_mtx_unlock(ip6_mutex); error = ipsec6_output_trans(&state, nexthdrp, mprev, sp, flags, &needipsectun); - lck_mtx_lock(ip6_mutex); m = state.m; if (error) { /* mbuf is already reclaimed in ipsec6_output_trans. */ @@ -524,13 +609,13 @@ ip6_output( /* ah6_output doesn't modify mbuf chain */ rh->ip6r_segleft = segleft_org; } - } -skip_ipsec2:; -#endif + } } +skip_ipsec2: +#endif /* - * If there is a routing header, replace destination address field + * If there is a routing header, replace the destination address field * with the first hop of the routing header. */ if (exthdrs.ip6e_rthdr) { @@ -538,17 +623,38 @@ skip_ipsec2:; (struct ip6_rthdr *)(mtod(exthdrs.ip6e_rthdr, struct ip6_rthdr *)); struct ip6_rthdr0 *rh0; + struct in6_addr *addr; + struct sockaddr_in6 sa; - finaldst = ip6->ip6_dst; switch (rh->ip6r_type) { case IPV6_RTHDR_TYPE_0: rh0 = (struct ip6_rthdr0 *)rh; - ip6->ip6_dst = rh0->ip6r0_addr[0]; - bcopy((caddr_t)&rh0->ip6r0_addr[1], - (caddr_t)&rh0->ip6r0_addr[0], - sizeof(struct in6_addr)*(rh0->ip6r0_segleft - 1) - ); - rh0->ip6r0_addr[rh0->ip6r0_segleft - 1] = finaldst; + addr = (struct in6_addr *)(rh0 + 1); + + /* + * construct a sockaddr_in6 form of + * the first hop. + * + * XXX: we may not have enough + * information about its scope zone; + * there is no standard API to pass + * the information from the + * application. 
+ */ + bzero(&sa, sizeof(sa)); + sa.sin6_family = AF_INET6; + sa.sin6_len = sizeof(sa); + sa.sin6_addr = addr[0]; + if ((error = sa6_embedscope(&sa, + ip6_use_defzone)) != 0) { + goto bad; + } + ip6->ip6_dst = sa.sin6_addr; + bcopy(&addr[1], &addr[0], sizeof(struct in6_addr) + * (rh0->ip6r0_segleft - 1)); + addr[rh0->ip6r0_segleft - 1] = finaldst; + /* XXX */ + in6_clearscope(addr + rh0->ip6r0_segleft - 1); break; default: /* is it possible? */ error = EINVAL; @@ -558,7 +664,7 @@ skip_ipsec2:; /* Source address validation */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) && - (flags & IPV6_DADOUTPUT) == 0) { + (flags & IPV6_UNSPECSRC) == 0) { error = EOPNOTSUPP; ip6stat.ip6s_badscope++; goto bad; @@ -582,6 +688,38 @@ skip_ipsec2:; if (opt && opt->ip6po_rthdr) ro = &opt->ip6po_route; dst = (struct sockaddr_in6 *)&ro->ro_dst; + + if (ro && ro->ro_rt) + RT_LOCK_ASSERT_NOTHELD(ro->ro_rt); + /* + * if specified, try to fill in the traffic class field. + * do not override if a non-zero value is already set. + * we check the diffserv field and the ecn field separately. + */ + if (opt && opt->ip6po_tclass >= 0) { + int mask = 0; + + if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0) + mask |= 0xfc; + if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0) + mask |= 0x03; + if (mask != 0) + ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20); + } + + /* fill in or override the hop limit field, if necessary. */ + if (opt && opt->ip6po_hlim != -1) + ip6->ip6_hlim = opt->ip6po_hlim & 0xff; + else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { + if (im6o != NULL) { + IM6O_LOCK(im6o); + ip6->ip6_hlim = im6o->im6o_multicast_hlim; + IM6O_UNLOCK(im6o); + } else { + ip6->ip6_hlim = ip6_defmcasthlim; + } + } + /* * If there is a cached route, check that it is to the same * destination and is still up. If not, free it and try again. 
@@ -602,17 +740,14 @@ skip_ipsec2:; dst->sin6_family = AF_INET6; dst->sin6_len = sizeof(struct sockaddr_in6); dst->sin6_addr = ip6->ip6_dst; -#if SCOPEDROUTING - /* XXX: sin6_scope_id should already be fixed at this point */ - if (IN6_IS_SCOPE_LINKLOCAL(&dst->sin6_addr)) - dst->sin6_scope_id = ntohs(dst->sin6_addr.s6_addr16[1]); -#endif } #if IPSEC if (needipsec && needipsectun) { struct ipsec_output_state state; int tunneledv4 = 0; - +#if CONFIG_DTRACE + struct ifnet *trace_ifp = (ifpp != NULL) ? (*ifpp) : NULL; +#endif /* CONFIG_DTRACE */ /* * All the extension headers will become inaccessible * (since they can be encrypted). @@ -628,9 +763,13 @@ skip_ipsec2:; state.m = m; state.ro = (struct route *)ro; state.dst = (struct sockaddr *)dst; - lck_mtx_unlock(ip6_mutex); + + /* Added a trace here so that we can see packets inside a tunnel */ + DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL, + struct ip6_hdr *, ip6, struct ifnet *, trace_ifp, + struct ip *, NULL, struct ip6_hdr *, ip6); + error = ipsec6_output_tunnel(&state, sp, flags, &tunneledv4); - lck_mtx_lock(ip6_mutex); if (tunneledv4) /* tunneled in IPv4 - packet is gone */ goto done; m = state.m; @@ -657,182 +796,147 @@ skip_ipsec2:; } goto bad; } - + /* + * The packet has been encapsulated so the ifscope is no longer valid + * since it does not apply to the outer address: ignore the ifscope. + */ + ifscope = IFSCOPE_NONE; + if (opt != NULL && opt->ip6po_pktinfo != NULL) { + if (opt->ip6po_pktinfo->ipi6_ifindex != IFSCOPE_NONE) + opt->ip6po_pktinfo->ipi6_ifindex = IFSCOPE_NONE; + } exthdrs.ip6e_ip6 = m; } #endif /* IPSEC */ - if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { - /* Unicast */ - -#define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa)) -#define sin6tosa(sin6) ((struct sockaddr *)(sin6)) - /* xxx - * interface selection comes here - * if an interface is specified from an upper layer, - * ifp must point it. 
- */ - if (ro->ro_rt == NULL) { - /* - * non-bsdi always clone routes, if parent is - * PRF_CLONING. - */ - rtalloc_ign((struct route *)ro, 0); - } - if (ro->ro_rt == NULL) { - ip6stat.ip6s_noroute++; - error = EHOSTUNREACH; - /* XXX in6_ifstat_inc(ifp, ifs6_out_discard); */ - goto bad; - } - RT_LOCK_SPIN(ro->ro_rt); - ia = ifatoia6(ro->ro_rt->rt_ifa); - if (ia != NULL) - ifaref(&ia->ia_ifa); - ifp = ro->ro_rt->rt_ifp; - ro->ro_rt->rt_use++; - if (ro->ro_rt->rt_flags & RTF_GATEWAY) - dst = (struct sockaddr_in6 *)ro->ro_rt->rt_gateway; - RT_UNLOCK(ro->ro_rt); - m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */ + /* for safety */ + if (ifp != NULL) { + ifnet_release(ifp); + ifp = NULL; + } - in6_ifstat_inc(ifp, ifs6_out_request); + /* adjust pointer */ + ip6 = mtod(m, struct ip6_hdr *); - /* - * Check if the outgoing interface conflicts with - * the interface specified by ifi6_ifindex (if specified). - * Note that loopback interface is always okay. - * (this may happen when we are sending a packet to one of - * our own addresses.) - */ - if (opt && opt->ip6po_pktinfo - && opt->ip6po_pktinfo->ipi6_ifindex) { - if (!(ifp->if_flags & IFF_LOOPBACK) - && ifp->if_index != opt->ip6po_pktinfo->ipi6_ifindex) { - ip6stat.ip6s_noroute++; - in6_ifstat_inc(ifp, ifs6_out_discard); - error = EHOSTUNREACH; - goto bad; - } + if (select_srcif) { + bzero(&src_sa, sizeof(src_sa)); + src_sa.sin6_family = AF_INET6; + src_sa.sin6_len = sizeof(src_sa); + src_sa.sin6_addr = ip6->ip6_src; + } + bzero(&dst_sa, sizeof(dst_sa)); + dst_sa.sin6_family = AF_INET6; + dst_sa.sin6_len = sizeof(dst_sa); + dst_sa.sin6_addr = ip6->ip6_dst; + + if ((error = in6_selectroute(select_srcif ? &src_sa : NULL, + &dst_sa, opt, im6o, ro, &ifp, &rt, 0, ifscope, nocell)) != 0) { + switch (error) { + case EHOSTUNREACH: + ip6stat.ip6s_noroute++; + break; + case EADDRNOTAVAIL: + default: + break; /* XXX statistics? 
*/ } - + if (ifp != NULL) + in6_ifstat_inc(ifp, ifs6_out_discard); + goto bad; + } + if (rt == NULL) { /* - * if specified, try to fill in the traffic class field. - * do not override if a non-zero value is already set. - * we check the diffserv field and the ecn field separately. + * If in6_selectroute() does not return a route entry, + * dst may not have been updated. */ - if (opt && opt->ip6po_tclass >= 0) { - int mask = 0; - - if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0) - mask |= 0xfc; - if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0) - mask |= 0x03; - if (mask != 0) - ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20); - } + *dst = dst_sa; /* XXX */ + } - if (opt && opt->ip6po_hlim != -1) - ip6->ip6_hlim = opt->ip6po_hlim & 0xff; - } else { - /* Multicast */ - struct in6_multi *in6m; + /* + * then rt (for unicast) and ifp must be non-NULL valid values. + */ + if ((flags & IPV6_FORWARDING) == 0) { + /* XXX: the FORWARDING flag can be set for mrouting. */ + in6_ifstat_inc(ifp, ifs6_out_request); + } + if (rt != NULL) { + RT_LOCK(rt); + ia = (struct in6_ifaddr *)(rt->rt_ifa); + if (ia != NULL) + IFA_ADDREF(&ia->ia_ifa); + rt->rt_use++; + RT_UNLOCK(rt); + } - m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST; + /* + * The outgoing interface must be in the zone of source and + * destination addresses. We should use ia_ifp to support the + * case of sending packets to an address of our own. 
+ */ + if (ia != NULL && ia->ia_ifp) { + ifnet_reference(ia->ia_ifp); + if (origifp != NULL) + ifnet_release(origifp); + origifp = ia->ia_ifp; + } else { + if (ifp != NULL) + ifnet_reference(ifp); + if (origifp != NULL) + ifnet_release(origifp); + origifp = ifp; + } + src0 = ip6->ip6_src; + if (in6_setscope(&src0, origifp, &zone)) + goto badscope; + bzero(&src_sa, sizeof(src_sa)); + src_sa.sin6_family = AF_INET6; + src_sa.sin6_len = sizeof(src_sa); + src_sa.sin6_addr = ip6->ip6_src; + if (sa6_recoverscope(&src_sa) || zone != src_sa.sin6_scope_id) + goto badscope; + + dst0 = ip6->ip6_dst; + if (in6_setscope(&dst0, origifp, &zone)) + goto badscope; + /* re-initialize to be sure */ + bzero(&dst_sa, sizeof(dst_sa)); + dst_sa.sin6_family = AF_INET6; + dst_sa.sin6_len = sizeof(dst_sa); + dst_sa.sin6_addr = ip6->ip6_dst; + if (sa6_recoverscope(&dst_sa) || zone != dst_sa.sin6_scope_id) { + goto badscope; + } - /* - * See if the caller provided any multicast options - */ - ifp = NULL; - if (im6o != NULL) { - ip6->ip6_hlim = im6o->im6o_multicast_hlim; - if (im6o->im6o_multicast_ifp != NULL) - ifp = im6o->im6o_multicast_ifp; - } else - ip6->ip6_hlim = ip6_defmcasthlim; + /* scope check is done. */ + goto routefound; - /* - * See if the caller provided the outgoing interface - * as an ancillary data. - * Boundary check for ifindex is assumed to be already done. - */ - if (opt && opt->ip6po_pktinfo && opt->ip6po_pktinfo->ipi6_ifindex) { - unsigned int index = opt->ip6po_pktinfo->ipi6_ifindex; - ifnet_head_lock_shared(); - if (index > 0 && index <= if_index) { - ifp = ifindex2ifnet[index]; - } - ifnet_head_done(); - } + badscope: + ip6stat.ip6s_badscope++; + in6_ifstat_inc(origifp, ifs6_out_discard); + if (error == 0) + error = EHOSTUNREACH; /* XXX */ + goto bad; - /* - * If the destination is a node-local scope multicast, - * the packet should be loop-backed only. 
- */ - if (IN6_IS_ADDR_MC_NODELOCAL(&ip6->ip6_dst)) { + routefound: + if (rt && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { + if (opt && opt->ip6po_nextroute.ro_rt) { /* - * If the outgoing interface is already specified, - * it should be a loopback interface. + * The nexthop is explicitly specified by the + * application. We assume the next hop is an IPv6 + * address. */ - if (ifp && (ifp->if_flags & IFF_LOOPBACK) == 0) { - ip6stat.ip6s_badscope++; - error = ENETUNREACH; /* XXX: better error? */ - /* XXX correct ifp? */ - in6_ifstat_inc(ifp, ifs6_out_discard); - goto bad; - } else { - ifp = lo_ifp; - } - } - - /* - * if specified, try to fill in the traffic class field. - * do not override if a non-zero value is already set. - * we check the diffserv field and the ecn field separately. - */ - if (opt && opt->ip6po_tclass >= 0) { - int mask = 0; - - if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0) - mask |= 0xfc; - if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0) - mask |= 0x03; - if (mask != 0) - ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20); + dst = (struct sockaddr_in6 *)opt->ip6po_nexthop; } + else if ((rt->rt_flags & RTF_GATEWAY)) + dst = (struct sockaddr_in6 *)rt->rt_gateway; + } - if (opt && opt->ip6po_hlim != -1) - ip6->ip6_hlim = opt->ip6po_hlim & 0xff; + if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { + m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */ + } else { + struct in6_multi *in6m; - /* - * If caller did not provide an interface lookup a - * default in the routing table. This is either a - * default for the speicfied group (i.e. a host - * route), or a multicast default (a route for the - * ``net'' ff00::/8). 
- */ - if (ifp == NULL) { - if (ro->ro_rt == NULL) { - ro->ro_rt = rtalloc1( - (struct sockaddr *)&ro->ro_dst, 0, 0); - } - if (ro->ro_rt == NULL) { - ip6stat.ip6s_noroute++; - error = EHOSTUNREACH; - /* XXX in6_ifstat_inc(ifp, ifs6_out_discard) */ - goto bad; - } - RT_LOCK_SPIN(ro->ro_rt); - ia = ifatoia6(ro->ro_rt->rt_ifa); - if (ia != NULL) - ifaref(&ia->ia_ifa); - ifp = ro->ro_rt->rt_ifp; - ro->ro_rt->rt_use++; - RT_UNLOCK(ro->ro_rt); - } + m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST; - if ((flags & IPV6_FORWARDING) == 0) - in6_ifstat_inc(ifp, ifs6_out_request); in6_ifstat_inc(ifp, ifs6_out_mcast); /* @@ -844,11 +948,15 @@ skip_ipsec2:; error = ENETUNREACH; goto bad; } - ifnet_lock_shared(ifp); - IN6_LOOKUP_MULTI(ip6->ip6_dst, ifp, in6m); - ifnet_lock_done(ifp); + in6_multihead_lock_shared(); + IN6_LOOKUP_MULTI(&ip6->ip6_dst, ifp, in6m); + in6_multihead_lock_done(); + if (im6o != NULL) + IM6O_LOCK(im6o); if (in6m != NULL && (im6o == NULL || im6o->im6o_multicast_loop)) { + if (im6o != NULL) + IM6O_UNLOCK(im6o); /* * If we belong to the destination multicast group * on the outgoing interface, and the caller did not @@ -856,6 +964,8 @@ skip_ipsec2:; */ ip6_mloopback(ifp, m, dst); } else { + if (im6o != NULL) + IM6O_UNLOCK(im6o); /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just @@ -870,13 +980,25 @@ skip_ipsec2:; */ #if MROUTING if (ip6_mrouter && (flags & IPV6_FORWARDING) == 0) { + /* + * XXX: ip6_mforward expects that rcvif is NULL + * when it is called from the originating path. + * However, it is not always the case, since + * some versions of MGETHDR() does not + * initialize the field. + */ + m->m_pkthdr.rcvif = NULL; if (ip6_mforward(ip6, ifp, m) != 0) { m_freem(m); + if (in6m != NULL) + IN6M_REMREF(in6m); goto done; } } #endif } + if (in6m != NULL) + IN6M_REMREF(in6m); /* * Multicasts with a hoplimit of zero may be looped back, * above, but must not be transmitted on a network. 
@@ -885,7 +1007,8 @@ skip_ipsec2:; * loop back a copy if this host actually belongs to the * destination group on the loopback interface. */ - if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK)) { + if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) || + IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) { m_freem(m); goto done; } @@ -895,122 +1018,48 @@ skip_ipsec2:; * Fill the outgoing inteface to tell the upper layer * to increment per-interface statistics. */ - if (ifpp) + if (ifpp != NULL) { + ifnet_reference(ifp); /* for caller */ + if (*ifpp != NULL) + ifnet_release(*ifpp); *ifpp = ifp; - - /* - * Determine path MTU. - */ - if (ro_pmtu != ro) { - /* The first hop and the final destination may differ. */ - struct sockaddr_in6 *sin6_fin = - (struct sockaddr_in6 *)&ro_pmtu->ro_dst; - if (ro_pmtu->ro_rt != NULL && - (!(ro_pmtu->ro_rt->rt_flags & RTF_UP) || - ro_pmtu->ro_rt->generation_id != route_generation || - !IN6_ARE_ADDR_EQUAL(&sin6_fin->sin6_addr, &finaldst))) { - rtfree(ro_pmtu->ro_rt); - ro_pmtu->ro_rt = NULL; - } - if (ro_pmtu->ro_rt == NULL) { - bzero(sin6_fin, sizeof(*sin6_fin)); - sin6_fin->sin6_family = AF_INET6; - sin6_fin->sin6_len = sizeof(struct sockaddr_in6); - sin6_fin->sin6_addr = finaldst; - - rtalloc((struct route *)ro_pmtu); - } } - if (ro_pmtu->ro_rt != NULL) { - u_int32_t ifmtu; - - lck_rw_lock_shared(nd_if_rwlock); - ifmtu = IN6_LINKMTU(ifp); - lck_rw_done(nd_if_rwlock); - RT_LOCK_SPIN(ro_pmtu->ro_rt); - mtu = ro_pmtu->ro_rt->rt_rmx.rmx_mtu; - if (mtu > ifmtu || mtu == 0) { - /* - * The MTU on the route is larger than the MTU on - * the interface! This shouldn't happen, unless the - * MTU of the interface has been changed after the - * interface was brought up. Change the MTU in the - * route to match the interface MTU (as long as the - * field isn't locked). - * - * if MTU on the route is 0, we need to fix the MTU. - * this case happens with path MTU discovery timeouts. 
- */ - mtu = ifmtu; - if ((ro_pmtu->ro_rt->rt_rmx.rmx_locks & RTV_MTU) == 0) - ro_pmtu->ro_rt->rt_rmx.rmx_mtu = mtu; /* XXX */ - } - RT_UNLOCK(ro_pmtu->ro_rt); - } else { - lck_rw_lock_shared(nd_if_rwlock); - mtu = IN6_LINKMTU(ifp); - lck_rw_done(nd_if_rwlock); - } + /* Determine path MTU. */ + if ((error = ip6_getpmtu(ro_pmtu, ro, ifp, &finaldst, &mtu, + &alwaysfrag)) != 0) + goto bad; /* - * advanced API (IPV6_USE_MIN_MTU) overrides mtu setting + * The caller of this function may specify to use the minimum MTU + * in some cases. + * An advanced API option (IPV6_USE_MIN_MTU) can also override MTU + * setting. The logic is a bit complicated; by default, unicast + * packets will follow path MTU while multicast packets will be sent at + * the minimum MTU. If IP6PO_MINMTU_ALL is specified, all packets + * including unicast ones will be sent at the minimum MTU. Multicast + * packets will always be sent at the minimum MTU unless + * IP6PO_MINMTU_DISABLE is explicitly specified. + * See RFC 3542 for more details. */ - if ((flags & IPV6_MINMTU) != 0 && mtu > IPV6_MMTU) - mtu = IPV6_MMTU; - - /* Fake scoped addresses */ - if ((ifp->if_flags & IFF_LOOPBACK) != 0) { - /* - * If source or destination address is a scoped address, and - * the packet is going to be sent to a loopback interface, - * we should keep the original interface. - */ - - /* - * XXX: this is a very experimental and temporary solution. - * We eventually have sockaddr_in6 and use the sin6_scope_id - * field of the structure here. - * We rely on the consistency between two scope zone ids - * of source and destination, which should already be assured. - * Larger scopes than link will be supported in the future. 
- */ - u_short index = 0; - origifp = NULL; - if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) - index = ntohs(ip6->ip6_src.s6_addr16[1]); - else if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst)) - index = ntohs(ip6->ip6_dst.s6_addr16[1]); - ifnet_head_lock_shared(); - if (index > 0 && index <= if_index) { - origifp = ifindex2ifnet[index]; + if (mtu > IPV6_MMTU) { + if ((flags & IPV6_MINMTU)) + mtu = IPV6_MMTU; + else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL) + mtu = IPV6_MMTU; + else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && + (opt == NULL || + opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) { + mtu = IPV6_MMTU; } - ifnet_head_done(); - /* - * XXX: origifp can be NULL even in those two cases above. - * For example, if we remove the (only) link-local address - * from the loopback interface, and try to send a link-local - * address without link-id information. Then the source - * address is ::1, and the destination address is the - * link-local address with its s6_addr16[1] being zero. - * What is worse, if the packet goes to the loopback interface - * by a default rejected route, the null pointer would be - * passed to looutput, and the kernel would hang. - * The following last resort would prevent such disaster. - */ - if (origifp == NULL) - origifp = ifp; } - else - origifp = ifp; -#ifndef SCOPEDROUTING + /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. 
*/ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); -#endif #if IPFW2 /* @@ -1038,8 +1087,7 @@ skip_ipsec2:; */ if (exthdrs.ip6e_hbh) { struct ip6_hbh *hbh = mtod(exthdrs.ip6e_hbh, struct ip6_hbh *); - u_int32_t dummy1; /* XXX unused */ - u_int32_t dummy2; /* XXX unused */ + u_int32_t dummy; /* XXX unused */ #if DIAGNOSTIC if ((hbh->ip6h_len + 1) << 3 > exthdrs.ip6e_hbh->m_len) @@ -1053,11 +1101,9 @@ skip_ipsec2:; */ m->m_flags |= M_LOOP; m->m_pkthdr.rcvif = ifp; - if (ip6_process_hopopts(m, - (u_int8_t *)(hbh + 1), - ((hbh->ip6h_len + 1) << 3) - - sizeof(struct ip6_hbh), - &dummy1, &dummy2) < 0) { + if (ip6_process_hopopts(m, (u_int8_t *)(hbh + 1), + ((hbh->ip6h_len + 1) << 3) - sizeof(struct ip6_hbh), + &dummy, &plen) < 0) { /* m was already freed at this point */ error = EINVAL;/* better error? */ goto done; @@ -1067,75 +1113,127 @@ skip_ipsec2:; } #if PF - lck_mtx_unlock(ip6_mutex); - - /* Invoke outbound packet filter */ - error = pf_af_hook(ifp, NULL, &m, AF_INET6, FALSE); - - lck_mtx_lock(ip6_mutex); + if (PF_IS_ENABLED) { + /* Invoke outbound packet filter */ + error = pf_af_hook(ifp, NULL, &m, AF_INET6, FALSE); - if (error) { - if (m != NULL) { - panic("%s: unexpected packet %p\n", __func__, m); - /* NOTREACHED */ + if (error) { + if (m != NULL) { + panic("%s: unexpected packet %p\n", __func__, m); + /* NOTREACHED */ + } + /* Already freed by callee */ + goto done; } - /* Already freed by callee */ - goto done; + ip6 = mtod(m, struct ip6_hdr *); } - ip6 = mtod(m, struct ip6_hdr *); #endif /* PF */ /* * Send the packet to the outgoing interface. * If necessary, do IPv6 fragmentation before sending. 
+ * + * the logic here is rather complex: + * 1: normal case (dontfrag == 0, alwaysfrag == 0) + * 1-a: send as is if tlen <= path mtu + * 1-b: fragment if tlen > path mtu + * + * 2: if user asks us not to fragment (dontfrag == 1) + * 2-a: send as is if tlen <= interface mtu + * 2-b: error if tlen > interface mtu + * + * 3: if we always need to attach fragment header (alwaysfrag == 1) + * always fragment + * + * 4: if dontfrag == 1 && alwaysfrag == 1 + * error, as we cannot handle this conflicting request */ tlen = m->m_pkthdr.len; - if (tlen <= mtu -#if notyet - /* - * On any link that cannot convey a 1280-octet packet in one piece, - * link-specific fragmentation and reassembly must be provided at - * a layer below IPv6. [RFC 2460, sec.5] - * Thus if the interface has ability of link-level fragmentation, - * we can just send the packet even if the packet size is - * larger than the link's MTU. - * XXX: IFF_FRAGMENTABLE (or such) flag has not been defined yet... - */ - - || ifp->if_flags & IFF_FRAGMENTABLE -#endif - ) - { - /* Record statistics for this interface address. */ - if (ia && !(flags & IPV6_FORWARDING)) { -#ifndef __APPLE__ - ia->ia_ifa.if_opackets++; - ia->ia_ifa.if_obytes += m->m_pkthdr.len; -#endif - } + + if (opt && (opt->ip6po_flags & IP6PO_DONTFRAG)) + dontfrag = 1; + else + dontfrag = 0; + if (dontfrag && alwaysfrag) { /* case 4 */ + /* conflicting request - can't transmit */ + error = EMSGSIZE; + goto bad; + } + + lck_rw_lock_shared(nd_if_rwlock); + ifmtu = IN6_LINKMTU(ifp); + lck_rw_done(nd_if_rwlock); + + if (dontfrag && tlen > ifmtu) { /* case 2-b */ + /* + * Even if the DONTFRAG option is specified, we cannot send the + * packet when the data length is larger than the MTU of the + * outgoing interface. + * Notify the error by sending IPV6_PATHMTU ancillary data as + * well as returning an error code (the latter is not described + * in the API spec.) 
+ */ + u_int32_t mtu32; + struct ip6ctlparam ip6cp; + + mtu32 = (u_int32_t)mtu; + bzero(&ip6cp, sizeof(ip6cp)); + ip6cp.ip6c_cmdarg = (void *)&mtu32; + pfctlinput2(PRC_MSGSIZE, (struct sockaddr *)&ro_pmtu->ro_dst, + (void *)&ip6cp); + + error = EMSGSIZE; + goto bad; + } + + /* + * transmit packet without fragmentation + */ + tso = (ifp->if_hwassist & IFNET_TSO_IPV6) && + (m->m_pkthdr.csum_flags & CSUM_TSO_IPV6); + if (dontfrag || (!alwaysfrag && /* case 1-a and 2-a */ + (tlen <= mtu || tso || (ifp->if_hwassist & CSUM_FRAGMENT_IPV6)))) { + int sw_csum; + + ip6 = mtod(m, struct ip6_hdr *); #ifdef IPSEC /* clean ipsec history once it goes out of the node */ ipsec_delaux(m); #endif - error = nd6_output(ifp, origifp, m, dst, ro->ro_rt, 1); + if (apple_hwcksum_tx == 0) /* Do not let HW handle cksum */ + sw_csum = m->m_pkthdr.csum_flags; + else + sw_csum = m->m_pkthdr.csum_flags & + ~IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist); + + if ((sw_csum & CSUM_DELAY_IPV6_DATA) != 0) { + in6_delayed_cksum(m, sizeof(struct ip6_hdr) + optlen); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IPV6_DATA; + } + if (ro->ro_rt) + RT_LOCK_ASSERT_NOTHELD(ro->ro_rt); + error = nd6_output(ifp, origifp, m, dst, ro->ro_rt); goto done; - } else if (mtu < IPV6_MMTU) { - /* - * note that path MTU is never less than IPV6_MMTU - * (see icmp6_input). - */ + } + + /* + * try to fragment the packet. 
case 1-b and 3 + */ + if (mtu < IPV6_MMTU) { + /* path MTU cannot be less than IPV6_MMTU */ error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; - } else if (ip6->ip6_plen == 0) { /* jumbo payload cannot be fragmented */ + } else if (ip6->ip6_plen == 0) { + /* jumbo payload cannot be fragmented */ error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } else { struct mbuf **mnext, *m_frgpart; - struct ip6_frag *ip6f = NULL; - u_int32_t id = htonl(ip6_id++); + struct ip6_frag *ip6f; + u_int32_t id = htonl(ip6_randomid()); u_char nextproto; /* @@ -1174,6 +1272,11 @@ skip_ipsec2:; ip6->ip6_nxt = IPPROTO_FRAGMENT; } + if ((m->m_pkthdr.csum_flags & CSUM_DELAY_IPV6_DATA) != 0) { + in6_delayed_cksum(m, sizeof(struct ip6_hdr) + optlen); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IPV6_DATA; + } + /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto @@ -1254,7 +1357,7 @@ skip_ipsec2:; /* clean ipsec history once it goes out of the node */ ipsec_delaux(m); #endif - error = nd6_output(ifp, origifp, m, dst, ro->ro_rt, 1); + error = nd6_output(ifp, origifp, m, dst, ro->ro_rt); } else m_freem(m); @@ -1264,8 +1367,6 @@ skip_ipsec2:; ip6stat.ip6s_fragmented++; done: - if (!locked) - lck_mtx_unlock(ip6_mutex); if (ro == &ip6route && ro->ro_rt) { /* brace necessary for rtfree */ rtfree(ro->ro_rt); } else if (ro_pmtu == &ip6route && ro_pmtu->ro_rt) { @@ -1278,8 +1379,12 @@ skip_ipsec2:; #endif /* IPSEC */ if (ia != NULL) - ifafree(&ia->ia_ifa); - return(error); + IFA_REMREF(&ia->ia_ifa); + if (ifp != NULL) + ifnet_release(ifp); + if (origifp != NULL) + ifnet_release(origifp); + return (error); freehdrs: m_freem(exthdrs.ip6e_hbh); /* m_freem will check if mbuf is 0 */ @@ -1311,7 +1416,7 @@ ip6_copyexthdr(mp, hdr, hlen) MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); - return(ENOBUFS); + return (ENOBUFS); } } m->m_len = hlen; @@ -1319,9 +1424,29 @@ ip6_copyexthdr(mp, hdr, hlen) 
bcopy(hdr, mtod(m, caddr_t), hlen); *mp = m; - return(0); + return (0); } +/* + * Process a delayed payload checksum calculation. + */ +void +in6_delayed_cksum(struct mbuf *m, uint16_t offset) +{ + uint16_t csum; + + csum = in6_cksum(m, 0, offset, m->m_pkthdr.len - offset); + if (csum == 0 && (m->m_pkthdr.csum_flags & CSUM_UDPIPV6) != 0) { + csum = 0xffff; + } + + offset += (m->m_pkthdr.csum_data & 0xffff); + if ((offset + sizeof(csum)) > m->m_len) { + m_copyback(m, offset, sizeof(csum), &csum); + } else { + *(uint16_t *)(mtod(m, char *) + offset) = csum; + } +} /* * Insert jumbo payload option. */ @@ -1345,7 +1470,7 @@ ip6_insert_jumboopt(exthdrs, plen) if (exthdrs->ip6e_hbh == 0) { MGET(mopt, M_DONTWAIT, MT_DATA); if (mopt == 0) - return(ENOBUFS); + return (ENOBUFS); mopt->m_len = JUMBOOPTLEN; optbuf = mtod(mopt, u_char *); optbuf[1] = 0; /* = ((JUMBOOPTLEN) >> 3) - 1 */ @@ -1361,7 +1486,7 @@ ip6_insert_jumboopt(exthdrs, plen) * other than exthdrs. * - exthdrs->ip6e_hbh is not an mbuf chain. */ - int oldoptlen = mopt->m_len; + u_int32_t oldoptlen = mopt->m_len; struct mbuf *n; /* @@ -1369,7 +1494,7 @@ ip6_insert_jumboopt(exthdrs, plen) * not fit even in an mbuf cluster. 
*/ if (oldoptlen + JUMBOOPTLEN > MCLBYTES) - return(ENOBUFS); + return (ENOBUFS); /* * As a consequence, we must always prepare a cluster @@ -1384,11 +1509,11 @@ ip6_insert_jumboopt(exthdrs, plen) } } if (!n) - return(ENOBUFS); + return (ENOBUFS); n->m_len = oldoptlen + JUMBOOPTLEN; bcopy(mtod(mopt, caddr_t), mtod(n, caddr_t), - oldoptlen); - optbuf = (u_char *) (mtod(n, caddr_t) + oldoptlen); + oldoptlen); + optbuf = mtod(n, u_char *) + oldoptlen; m_freem(mopt); mopt = exthdrs->ip6e_hbh = n; } else { @@ -1415,7 +1540,7 @@ ip6_insert_jumboopt(exthdrs, plen) /* finally, adjust the packet header length */ exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN; - return(0); + return (0); #undef JUMBOOPTLEN } @@ -1432,9 +1557,9 @@ ip6_insertfraghdr(m0, m, hlen, frghdrp) if (hlen > sizeof(struct ip6_hdr)) { n = m_copym(m0, sizeof(struct ip6_hdr), - hlen - sizeof(struct ip6_hdr), M_DONTWAIT); + hlen - sizeof(struct ip6_hdr), M_DONTWAIT); if (n == 0) - return(ENOBUFS); + return (ENOBUFS); m->m_next = n; } else n = m; @@ -1446,8 +1571,8 @@ ip6_insertfraghdr(m0, m, hlen, frghdrp) if ((mlast->m_flags & M_EXT) == 0 && M_TRAILINGSPACE(mlast) >= sizeof(struct ip6_frag)) { /* use the trailing space of the last mbuf for the fragment hdr */ - *frghdrp = - (struct ip6_frag *)(mtod(mlast, caddr_t) + mlast->m_len); + *frghdrp = (struct ip6_frag *)(mtod(mlast, caddr_t) + + mlast->m_len); mlast->m_len += sizeof(struct ip6_frag); m->m_pkthdr.len += sizeof(struct ip6_frag); } else { @@ -1456,18 +1581,102 @@ ip6_insertfraghdr(m0, m, hlen, frghdrp) MGET(mfrg, M_DONTWAIT, MT_DATA); if (mfrg == 0) - return(ENOBUFS); + return (ENOBUFS); mfrg->m_len = sizeof(struct ip6_frag); *frghdrp = mtod(mfrg, struct ip6_frag *); mlast->m_next = mfrg; } - return(0); + return (0); } extern int load_ipfw(void); +static int +ip6_getpmtu(struct route_in6 *ro_pmtu, struct route_in6 *ro, + struct ifnet *ifp, struct in6_addr *dst, u_int32_t *mtup, + int *alwaysfragp) +{ + u_int32_t mtu = 0; + int alwaysfrag = 0; + int 
error = 0; -/* + if (ro_pmtu != ro) { + /* The first hop and the final destination may differ. */ + struct sockaddr_in6 *sa6_dst = + (struct sockaddr_in6 *)&ro_pmtu->ro_dst; + if (ro_pmtu->ro_rt && + ((ro_pmtu->ro_rt->rt_flags & RTF_UP) == 0 || + ro_pmtu->ro_rt->generation_id != route_generation || + !IN6_ARE_ADDR_EQUAL(&sa6_dst->sin6_addr, dst))) { + rtfree(ro_pmtu->ro_rt); + ro_pmtu->ro_rt = (struct rtentry *)NULL; + } + if (ro_pmtu->ro_rt == NULL) { + bzero(sa6_dst, sizeof(*sa6_dst)); + sa6_dst->sin6_family = AF_INET6; + sa6_dst->sin6_len = sizeof(struct sockaddr_in6); + sa6_dst->sin6_addr = *dst; + + rtalloc_scoped((struct route *)ro_pmtu, + ifp != NULL ? ifp->if_index : IFSCOPE_NONE); + } + } + + + if (ro_pmtu->ro_rt != NULL) { + u_int32_t ifmtu; + + lck_rw_lock_shared(nd_if_rwlock); + ifmtu = IN6_LINKMTU(ifp); + lck_rw_done(nd_if_rwlock); + + RT_LOCK_SPIN(ro_pmtu->ro_rt); + mtu = ro_pmtu->ro_rt->rt_rmx.rmx_mtu; + if (mtu > ifmtu || mtu == 0) { + /* + * The MTU on the route is larger than the MTU on + * the interface! This shouldn't happen, unless the + * MTU of the interface has been changed after the + * interface was brought up. Change the MTU in the + * route to match the interface MTU (as long as the + * field isn't locked). + * + * if MTU on the route is 0, we need to fix the MTU. + * this case happens with path MTU discovery timeouts. + */ + mtu = ifmtu; + if ((ro_pmtu->ro_rt->rt_rmx.rmx_locks & RTV_MTU) == 0) + ro_pmtu->ro_rt->rt_rmx.rmx_mtu = mtu; /* XXX */ + } + else if (mtu < IPV6_MMTU) { + /* + * RFC2460 section 5, last paragraph: + * if we record ICMPv6 too big message with + * mtu < IPV6_MMTU, transmit packets sized IPV6_MMTU + * or smaller, with framgent header attached. 
+ * (fragment header is needed regardless from the + * packet size, for translators to identify packets) + */ + alwaysfrag = 1; + mtu = IPV6_MMTU; + } + RT_UNLOCK(ro_pmtu->ro_rt); + } else { + if (ifp) { + lck_rw_lock_shared(nd_if_rwlock); + mtu = IN6_LINKMTU(ifp); + lck_rw_done(nd_if_rwlock); + } else + error = EHOSTUNREACH; /* XXX */ + } + + *mtup = mtu; + if (alwaysfragp) + *alwaysfragp = alwaysfrag; + return (error); +} + +/* * IP6 socket option processing. */ int @@ -1475,6 +1684,8 @@ ip6_ctloutput(so, sopt) struct socket *so; struct sockopt *sopt; { + int optdatalen, uproto; + void *optdata; int privileged; struct inpcb *in6p = sotoinpcb(so); int error = 0, optval = 0; @@ -1491,6 +1702,7 @@ ip6_ctloutput(so, sopt) optname = sopt->sopt_name; optlen = sopt->sopt_valsize; p = sopt->sopt_p; + uproto = (int)so->so_proto->pr_protocol; privileged = (proc_suser(p) == 0); @@ -1499,14 +1711,10 @@ ip6_ctloutput(so, sopt) case SOPT_SET: switch (optname) { - case IPV6_PKTOPTIONS: + case IPV6_2292PKTOPTIONS: { struct mbuf *m; - if (sopt->sopt_valsize > MCLBYTES) { - error = EMSGSIZE; - break; - } error = soopt_getm(sopt, &m); /* XXX */ if (error != 0) break; @@ -1532,12 +1740,23 @@ ip6_ctloutput(so, sopt) * receiving ANY hbh/dst options in order to avoid * overhead of parsing options in the kernel. 
*/ + case IPV6_RECVHOPOPTS: + case IPV6_RECVDSTOPTS: + case IPV6_RECVRTHDRDSTOPTS: + if (!privileged) + break; + /* FALLTHROUGH */ case IPV6_UNICAST_HOPS: - case IPV6_CHECKSUM: + case IPV6_HOPLIMIT: case IPV6_FAITH: + case IPV6_RECVPKTINFO: + case IPV6_RECVHOPLIMIT: + case IPV6_RECVRTHDR: + case IPV6_RECVPATHMTU: case IPV6_RECVTCLASS: case IPV6_V6ONLY: + case IPV6_AUTOFLOWLABEL: if (optlen != sizeof(int)) { error = EINVAL; break; @@ -1554,8 +1773,7 @@ ip6_ctloutput(so, sopt) else { /* -1 = kernel default */ in6p->in6p_hops = optval; - - if ((in6p->in6p_vflag & + if ((in6p->inp_vflag & INP_IPV4) != 0) in6p->inp_ip_ttl = optval; } @@ -1563,18 +1781,103 @@ ip6_ctloutput(so, sopt) #define OPTSET(bit) \ do { \ if (optval) \ - in6p->in6p_flags |= (bit); \ + in6p->inp_flags |= (bit); \ else \ - in6p->in6p_flags &= ~(bit); \ -} while (0) -#define OPTBIT(bit) (in6p->in6p_flags & (bit) ? 1 : 0) + in6p->inp_flags &= ~(bit); \ +} while (/*CONSTCOND*/ 0) +#define OPTSET2292(bit) \ +do { \ + in6p->inp_flags |= IN6P_RFC2292; \ + if (optval) \ + in6p->inp_flags |= (bit); \ + else \ + in6p->inp_flags &= ~(bit); \ +} while (/*CONSTCOND*/ 0) +#define OPTBIT(bit) (in6p->inp_flags & (bit) ? 
1 : 0) + + case IPV6_RECVPKTINFO: + /* cannot mix with RFC2292 */ + if (OPTBIT(IN6P_RFC2292)) { + error = EINVAL; + break; + } + OPTSET(IN6P_PKTINFO); + break; + + case IPV6_HOPLIMIT: + { + struct ip6_pktopts **optp; + + /* cannot mix with RFC2292 */ + if (OPTBIT(IN6P_RFC2292)) { + error = EINVAL; + break; + } + optp = &in6p->in6p_outputopts; + error = ip6_pcbopt(IPV6_HOPLIMIT, + (u_char *)&optval, sizeof(optval), + optp, uproto); + break; + } + + case IPV6_RECVHOPLIMIT: + /* cannot mix with RFC2292 */ + if (OPTBIT(IN6P_RFC2292)) { + error = EINVAL; + break; + } + OPTSET(IN6P_HOPLIMIT); + break; + + case IPV6_RECVHOPOPTS: + /* cannot mix with RFC2292 */ + if (OPTBIT(IN6P_RFC2292)) { + error = EINVAL; + break; + } + OPTSET(IN6P_HOPOPTS); + break; + + case IPV6_RECVDSTOPTS: + /* cannot mix with RFC2292 */ + if (OPTBIT(IN6P_RFC2292)) { + error = EINVAL; + break; + } + OPTSET(IN6P_DSTOPTS); + break; - case IPV6_CHECKSUM: - in6p->in6p_cksum = optval; + case IPV6_RECVRTHDRDSTOPTS: + /* cannot mix with RFC2292 */ + if (OPTBIT(IN6P_RFC2292)) { + error = EINVAL; + break; + } + OPTSET(IN6P_RTHDRDSTOPTS); + break; + + case IPV6_RECVRTHDR: + /* cannot mix with RFC2292 */ + if (OPTBIT(IN6P_RFC2292)) { + error = EINVAL; + break; + } + OPTSET(IN6P_RTHDR); break; case IPV6_FAITH: - OPTSET(IN6P_FAITH); + OPTSET(INP_FAITH); + break; + + case IPV6_RECVPATHMTU: + /* + * We ignore this option for TCP + * sockets. + * (RFC3542 leaves this case + * unspecified.) + */ + if (uproto != IPPROTO_TCP) + OPTSET(IN6P_MTU); break; case IPV6_V6ONLY: @@ -1583,30 +1886,54 @@ do { \ * available only prior to bind(2). * see ipng mailing list, Jun 22 2001. 
*/ - if (in6p->in6p_lport || - !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr)) - { + if (in6p->inp_lport || + !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr)) { error = EINVAL; break; } OPTSET(IN6P_IPV6_V6ONLY); if (optval) - in6p->in6p_vflag &= ~INP_IPV4; + in6p->inp_vflag &= ~INP_IPV4; else - in6p->in6p_vflag |= INP_IPV4; + in6p->inp_vflag |= INP_IPV4; break; case IPV6_RECVTCLASS: - /* cannot mix with RFC2292 XXX */ + /* we can mix with RFC2292 */ OPTSET(IN6P_TCLASS); break; + case IPV6_AUTOFLOWLABEL: + OPTSET(IN6P_AUTOFLOWLABEL); + break; + } break; - case IPV6_PKTINFO: - case IPV6_HOPLIMIT: - case IPV6_HOPOPTS: - case IPV6_DSTOPTS: - case IPV6_RTHDR: + case IPV6_TCLASS: + case IPV6_DONTFRAG: + case IPV6_USE_MIN_MTU: + case IPV6_PREFER_TEMPADDR: + if (optlen != sizeof(optval)) { + error = EINVAL; + break; + } + error = sooptcopyin(sopt, &optval, + sizeof optval, sizeof optval); + if (error) + break; + { + struct ip6_pktopts **optp; + optp = &in6p->in6p_outputopts; + error = ip6_pcbopt(optname, + (u_char *)&optval, sizeof(optval), + optp, uproto); + break; + } + + case IPV6_2292PKTINFO: + case IPV6_2292HOPLIMIT: + case IPV6_2292HOPOPTS: + case IPV6_2292DSTOPTS: + case IPV6_2292RTHDR: /* RFC 2292 */ if (optlen != sizeof(int)) { error = EINVAL; @@ -1617,68 +1944,74 @@ do { \ if (error) break; switch (optname) { - case IPV6_PKTINFO: - OPTSET(IN6P_PKTINFO); + case IPV6_2292PKTINFO: + OPTSET2292(IN6P_PKTINFO); break; - case IPV6_HOPLIMIT: - OPTSET(IN6P_HOPLIMIT); + case IPV6_2292HOPLIMIT: + OPTSET2292(IN6P_HOPLIMIT); break; - case IPV6_HOPOPTS: + case IPV6_2292HOPOPTS: /* * Check super-user privilege. * See comments for IPV6_RECVHOPOPTS. 
*/ if (!privileged) return(EPERM); - OPTSET(IN6P_HOPOPTS); + OPTSET2292(IN6P_HOPOPTS); break; - case IPV6_DSTOPTS: + case IPV6_2292DSTOPTS: if (!privileged) return(EPERM); - OPTSET(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */ + OPTSET2292(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */ break; - case IPV6_RTHDR: - OPTSET(IN6P_RTHDR); + case IPV6_2292RTHDR: + OPTSET2292(IN6P_RTHDR); break; } break; -#undef OPTSET + case IPV6_3542PKTINFO: + case IPV6_3542HOPOPTS: + case IPV6_3542RTHDR: + case IPV6_3542DSTOPTS: + case IPV6_RTHDRDSTOPTS: + case IPV6_3542NEXTHOP: + { + /* new advanced API (RFC3542) */ + struct mbuf *m; - case IPV6_TCLASS: - if (optlen != sizeof(optval)) { + /* cannot mix with RFC2292 */ + if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } - error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); - if (error) + error = soopt_getm(sopt, &m); + if (error != 0) + break; + error = soopt_mcopyin(sopt, m); + if (error) { + m_freem(m); break; - error = ip6_pcbopt(optname, (u_char *)&optval, sizeof(optval), &in6p->in6p_outputopts); + } + error = ip6_pcbopt(optname, mtod(m, u_char *), + m->m_len, &in6p->in6p_outputopts, uproto); + m_freem(m); break; + } +#undef OPTSET case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_JOIN_GROUP: case IPV6_LEAVE_GROUP: - { - struct mbuf *m; - if (sopt->sopt_valsize > MLEN) { - error = EMSGSIZE; - break; - } - /* XXX */ - MGET(m, sopt->sopt_p != kernproc ? 
- M_WAIT : M_DONTWAIT, MT_HEADER); - if (m == 0) { - error = ENOBUFS; - break; - } - m->m_len = sopt->sopt_valsize; - error = sooptcopyin(sopt, mtod(m, char *), - m->m_len, m->m_len); - error = ip6_setmoptions(sopt->sopt_name, in6p, m); - (void)m_free(m); - } + case IPV6_MSFILTER: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + error = ip6_setmoptions(in6p, sopt); break; case IPV6_PORTRANGE: @@ -1689,18 +2022,18 @@ do { \ switch (optval) { case IPV6_PORTRANGE_DEFAULT: - in6p->in6p_flags &= ~(IN6P_LOWPORT); - in6p->in6p_flags &= ~(IN6P_HIGHPORT); + in6p->inp_flags &= ~(INP_LOWPORT); + in6p->inp_flags &= ~(INP_HIGHPORT); break; case IPV6_PORTRANGE_HIGH: - in6p->in6p_flags &= ~(IN6P_LOWPORT); - in6p->in6p_flags |= IN6P_HIGHPORT; + in6p->inp_flags &= ~(INP_LOWPORT); + in6p->inp_flags |= INP_HIGHPORT; break; case IPV6_PORTRANGE_LOW: - in6p->in6p_flags &= ~(IN6P_HIGHPORT); - in6p->in6p_flags |= IN6P_LOWPORT; + in6p->inp_flags &= ~(INP_HIGHPORT); + in6p->inp_flags |= INP_LOWPORT; break; default: @@ -1716,10 +2049,6 @@ do { \ size_t len = 0; struct mbuf *m; - if (sopt->sopt_valsize > MCLBYTES) { - error = EMSGSIZE; - break; - } if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ @@ -1751,6 +2080,47 @@ do { \ break; #endif /* IPFIREWALL */ + /* + * IPv6 variant of IP_BOUND_IF; for details see + * comments on IP_BOUND_IF in ip_ctloutput(). 
+ */ + case IPV6_BOUND_IF: + /* This option is settable only on IPv6 */ + if (!(in6p->inp_vflag & INP_IPV6)) { + error = EINVAL; + break; + } + + error = sooptcopyin(sopt, &optval, + sizeof (optval), sizeof (optval)); + + if (error) + break; + + inp_bindif(in6p, optval); + break; + + case IPV6_NO_IFT_CELLULAR: + /* This option is settable only for IPv6 */ + if (!(in6p->inp_vflag & INP_IPV6)) { + error = EINVAL; + break; + } + + error = sooptcopyin(sopt, &optval, + sizeof (optval), sizeof (optval)); + + if (error) + break; + + error = inp_nocellular(in6p, optval); + break; + + case IPV6_OUT_IF: + /* This option is not settable */ + error = EINVAL; + break; + default: error = ENOPROTOOPT; break; @@ -1760,41 +2130,69 @@ do { \ case SOPT_GET: switch (optname) { - case IPV6_PKTOPTIONS: - if (in6p->in6p_options) { - struct mbuf *m; - m = m_copym(in6p->in6p_options, - 0, M_COPYALL, M_WAIT); - if (m == NULL) { - error = ENOBUFS; - break; - } - error = soopt_mcopyout(sopt, m); - if (error == 0) - m_freem(m); - } else - sopt->sopt_valsize = 0; + case IPV6_2292PKTOPTIONS: + /* + * RFC3542 (effectively) deprecated the + * semantics of the 2292-style pktoptions. + * Since it was not reliable in nature (i.e., + * applications had to expect the lack of some + * information after all), it would make sense + * to simplify this part by always returning + * empty data. 
+ */ + sopt->sopt_valsize = 0; break; + case IPV6_RECVHOPOPTS: + case IPV6_RECVDSTOPTS: + case IPV6_RECVRTHDRDSTOPTS: case IPV6_UNICAST_HOPS: - case IPV6_CHECKSUM: + case IPV6_RECVPKTINFO: + case IPV6_RECVHOPLIMIT: + case IPV6_RECVRTHDR: + case IPV6_RECVPATHMTU: case IPV6_FAITH: case IPV6_V6ONLY: case IPV6_PORTRANGE: case IPV6_RECVTCLASS: + case IPV6_AUTOFLOWLABEL: switch (optname) { + case IPV6_RECVHOPOPTS: + optval = OPTBIT(IN6P_HOPOPTS); + break; + + case IPV6_RECVDSTOPTS: + optval = OPTBIT(IN6P_DSTOPTS); + break; + + case IPV6_RECVRTHDRDSTOPTS: + optval = OPTBIT(IN6P_RTHDRDSTOPTS); + break; + case IPV6_UNICAST_HOPS: optval = in6p->in6p_hops; break; - case IPV6_CHECKSUM: - optval = in6p->in6p_cksum; + case IPV6_RECVPKTINFO: + optval = OPTBIT(IN6P_PKTINFO); + break; + + case IPV6_RECVHOPLIMIT: + optval = OPTBIT(IN6P_HOPLIMIT); + break; + + case IPV6_RECVRTHDR: + optval = OPTBIT(IN6P_RTHDR); + break; + + case IPV6_RECVPATHMTU: + optval = OPTBIT(IN6P_MTU); break; case IPV6_FAITH: - optval = OPTBIT(IN6P_FAITH); + optval = OPTBIT(INP_FAITH); break; case IPV6_V6ONLY: @@ -1804,10 +2202,10 @@ do { \ case IPV6_PORTRANGE: { int flags; - flags = in6p->in6p_flags; - if (flags & IN6P_HIGHPORT) + flags = in6p->inp_flags; + if (flags & INP_HIGHPORT) optval = IPV6_PORTRANGE_HIGH; - else if (flags & IN6P_LOWPORT) + else if (flags & INP_LOWPORT) optval = IPV6_PORTRANGE_LOW; else optval = 0; @@ -1817,64 +2215,93 @@ do { \ optval = OPTBIT(IN6P_TCLASS); break; + case IPV6_AUTOFLOWLABEL: + optval = OPTBIT(IN6P_AUTOFLOWLABEL); + break; } + if (error) + break; error = sooptcopyout(sopt, &optval, sizeof optval); break; - case IPV6_PKTINFO: - case IPV6_HOPLIMIT: - case IPV6_HOPOPTS: - case IPV6_RTHDR: - case IPV6_DSTOPTS: - if ((optname == IPV6_HOPOPTS || - optname == IPV6_DSTOPTS) && - !privileged) - return(EPERM); + case IPV6_PATHMTU: + { + u_int32_t pmtu = 0; + struct ip6_mtuinfo mtuinfo; + struct route_in6 sro; + + bzero(&sro, sizeof(sro)); + + if (!(so->so_state & SS_ISCONNECTED)) + 
return (ENOTCONN); + /* + * XXX: we dot not consider the case of source + * routing, or optional information to specify + * the outgoing interface. + */ + error = ip6_getpmtu(&sro, NULL, NULL, + &in6p->in6p_faddr, &pmtu, NULL); + if (sro.ro_rt) + rtfree(sro.ro_rt); + if (error) + break; + if (pmtu > IPV6_MAXPACKET) + pmtu = IPV6_MAXPACKET; + + bzero(&mtuinfo, sizeof(mtuinfo)); + mtuinfo.ip6m_mtu = (u_int32_t)pmtu; + optdata = (void *)&mtuinfo; + optdatalen = sizeof(mtuinfo); + error = sooptcopyout(sopt, optdata, + optdatalen); + break; + } + + case IPV6_2292PKTINFO: + case IPV6_2292HOPLIMIT: + case IPV6_2292HOPOPTS: + case IPV6_2292RTHDR: + case IPV6_2292DSTOPTS: switch (optname) { - case IPV6_PKTINFO: + case IPV6_2292PKTINFO: optval = OPTBIT(IN6P_PKTINFO); break; - case IPV6_HOPLIMIT: + case IPV6_2292HOPLIMIT: optval = OPTBIT(IN6P_HOPLIMIT); break; - case IPV6_HOPOPTS: - if (!privileged) - return(EPERM); + case IPV6_2292HOPOPTS: optval = OPTBIT(IN6P_HOPOPTS); break; - case IPV6_RTHDR: + case IPV6_2292RTHDR: optval = OPTBIT(IN6P_RTHDR); break; - case IPV6_DSTOPTS: - if (!privileged) - return(EPERM); + case IPV6_2292DSTOPTS: optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); break; } error = sooptcopyout(sopt, &optval, - sizeof optval); + sizeof optval); break; - + case IPV6_PKTINFO: + case IPV6_HOPOPTS: + case IPV6_RTHDR: + case IPV6_DSTOPTS: + case IPV6_RTHDRDSTOPTS: + case IPV6_NEXTHOP: case IPV6_TCLASS: - error = ip6_getpcbopt(in6p->in6p_outputopts, optname, sopt); + case IPV6_DONTFRAG: + case IPV6_USE_MIN_MTU: + case IPV6_PREFER_TEMPADDR: + error = ip6_getpcbopt(in6p->in6p_outputopts, + optname, sopt); break; case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: - case IPV6_JOIN_GROUP: - case IPV6_LEAVE_GROUP: - { - struct mbuf *m; - error = ip6_getmoptions(sopt->sopt_name, - in6p->in6p_moptions, &m); - if (error == 0) - error = sooptcopyout(sopt, - mtod(m, char *), m->m_len); - if (m != NULL) - m_freem(m); - } + case IPV6_MSFILTER: + error 
= ip6_getmoptions(in6p, sopt); break; #if IPSEC @@ -1885,10 +2312,6 @@ do { \ struct mbuf *m = NULL; struct mbuf **mp = &m; - if (sopt->sopt_valsize > MCLBYTES) { - error = EMSGSIZE; - break; - } error = soopt_getm(sopt, &m); /* XXX */ if (error != 0) break; @@ -1921,6 +2344,26 @@ do { \ break; #endif /* IPFIREWALL */ + case IPV6_BOUND_IF: + if (in6p->inp_flags & INP_BOUND_IF) + optval = in6p->inp_boundif; + error = sooptcopyout(sopt, &optval, + sizeof (optval)); + break; + + case IPV6_NO_IFT_CELLULAR: + optval = (in6p->inp_flags & INP_NO_IFT_CELLULAR) + ? 1 : 0; + error = sooptcopyout(sopt, &optval, + sizeof (optval)); + break; + + case IPV6_OUT_IF: + optval = in6p->in6p_last_outif; + error = sooptcopyout(sopt, &optval, + sizeof (optval)); + break; + default: error = ENOPROTOOPT; break; @@ -1933,35 +2376,105 @@ do { \ return(error); } -/* - * Set up IP6 options in pcb for insertion in output packets or - * specifying behavior of outgoing packets. - */ -static int -ip6_pcbopts( - struct ip6_pktopts **pktopt, - struct mbuf *m, - __unused struct socket *so, - struct sockopt *sopt) +int +ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt) { - struct ip6_pktopts *opt = *pktopt; - int error = 0, priv; - struct proc *p = sopt->sopt_p; + int error = 0, optval, optlen; + const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum); + struct inpcb *in6p = sotoinpcb(so); + int level, op, optname; - /* turn off any old options. 
*/ - if (opt) { -#if DIAGNOSTIC - if (opt->ip6po_pktinfo || opt->ip6po_nexthop || - opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 || - opt->ip6po_rhinfo.ip6po_rhi_rthdr) - printf("ip6_pcbopts: all specified options are cleared.\n"); -#endif - ip6_clearpktopts(opt, 1, -1); - } else { - opt = _MALLOC(sizeof(*opt), M_IP6OPT, M_WAITOK); - if (opt == NULL) - return ENOBUFS; - } + level = sopt->sopt_level; + op = sopt->sopt_dir; + optname = sopt->sopt_name; + optlen = sopt->sopt_valsize; + + if (level != IPPROTO_IPV6) { + return (EINVAL); + } + + switch (optname) { + case IPV6_CHECKSUM: + /* + * For ICMPv6 sockets, no modification allowed for checksum + * offset, permit "no change" values to help existing apps. + * + * RFC3542 says: "An attempt to set IPV6_CHECKSUM + * for an ICMPv6 socket will fail." + * The current behavior does not meet RFC3542. + */ + switch (op) { + case SOPT_SET: + if (optlen != sizeof(int)) { + error = EINVAL; + break; + } + error = sooptcopyin(sopt, &optval, sizeof(optval), + sizeof(optval)); + if (error) + break; + if ((optval % 2) != 0) { + /* the API assumes even offset values */ + error = EINVAL; + } else if (so->so_proto->pr_protocol == + IPPROTO_ICMPV6) { + if (optval != icmp6off) + error = EINVAL; + } else + in6p->in6p_cksum = optval; + break; + + case SOPT_GET: + if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) + optval = icmp6off; + else + optval = in6p->in6p_cksum; + + error = sooptcopyout(sopt, &optval, sizeof(optval)); + break; + + default: + error = EINVAL; + break; + } + break; + + default: + error = ENOPROTOOPT; + break; + } + + return (error); +} + +/* + * Set up IP6 options in pcb for insertion in output packets or + * specifying behavior of outgoing packets. + */ +static int +ip6_pcbopts( + struct ip6_pktopts **pktopt, + struct mbuf *m, + __unused struct socket *so, + __unused struct sockopt *sopt) +{ + struct ip6_pktopts *opt = *pktopt; + int error = 0; + + /* turn off any old options. 
*/ + if (opt) { +#if DIAGNOSTIC + if (opt->ip6po_pktinfo || opt->ip6po_nexthop || + opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 || + opt->ip6po_rhinfo.ip6po_rhi_rthdr) + printf("ip6_pcbopts: all specified options are cleared.\n"); +#endif + ip6_clearpktopts(opt, -1); + } else { + opt = _MALLOC(sizeof(*opt), M_IP6OPT, M_WAITOK); + if (opt == NULL) + return ENOBUFS; + } *pktopt = NULL; if (!m || m->m_len == 0) { @@ -1974,11 +2487,9 @@ ip6_pcbopts( return(0); } - priv = (proc_suser(p) == 0); - /* set options specified by user. */ - if ((error = ip6_setpktoptions(m, opt, priv, 1)) != 0) { - ip6_clearpktopts(opt, 1, -1); /* XXX: discard all options */ + if ((error = ip6_setpktopts(m, opt, NULL, so->so_proto->pr_protocol)) != 0) { + ip6_clearpktopts(opt, -1); /* XXX: discard all options */ FREE(opt, M_IP6OPT); return(error); } @@ -1986,19 +2497,36 @@ ip6_pcbopts( return(0); } +/* + * initialize ip6_pktopts. beware that there are non-zero default values in + * the struct. + */ +void +ip6_initpktopts(struct ip6_pktopts *opt) +{ + + bzero(opt, sizeof(*opt)); + opt->ip6po_hlim = -1; /* -1 means default hop limit */ + opt->ip6po_tclass = -1; /* -1 means default traffic class */ + opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY; + opt->ip6po_prefer_tempaddr = IP6PO_TEMPADDR_SYSTEM; +} + static int -ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt) +ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt, int uproto) { struct ip6_pktopts *opt; opt = *pktopt; if (opt == NULL) { opt = _MALLOC(sizeof(*opt), M_IP6OPT, M_WAITOK); + if (opt == NULL) + return(ENOBUFS); ip6_initpktopts(opt); *pktopt = opt; } - return (ip6_setpktopt(optname, buf, len, opt)); + return (ip6_setpktopt(optname, buf, len, opt, 1, 0, uproto)); } static int @@ -2006,15 +2534,85 @@ ip6_getpcbopt(struct ip6_pktopts *pktopt, int optname, struct sockopt *sopt) { void *optdata = NULL; int optdatalen = 0; - int deftclass = 0; + struct ip6_ext *ip6e; int error = 0; + 
struct in6_pktinfo null_pktinfo; + int deftclass = 0, on; + int defminmtu = IP6PO_MINMTU_MCASTONLY; + int defpreftemp = IP6PO_TEMPADDR_SYSTEM; switch (optname) { + case IPV6_PKTINFO: + if (pktopt && pktopt->ip6po_pktinfo) + optdata = (void *)pktopt->ip6po_pktinfo; + else { + /* XXX: we don't have to do this every time... */ + bzero(&null_pktinfo, sizeof(null_pktinfo)); + optdata = (void *)&null_pktinfo; + } + optdatalen = sizeof(struct in6_pktinfo); + break; case IPV6_TCLASS: if (pktopt && pktopt->ip6po_tclass >= 0) - optdata = &pktopt->ip6po_tclass; + optdata = (void *)&pktopt->ip6po_tclass; + else + optdata = (void *)&deftclass; + optdatalen = sizeof(int); + break; + case IPV6_HOPOPTS: + if (pktopt && pktopt->ip6po_hbh) { + optdata = (void *)pktopt->ip6po_hbh; + ip6e = (struct ip6_ext *)pktopt->ip6po_hbh; + optdatalen = (ip6e->ip6e_len + 1) << 3; + } + break; + case IPV6_RTHDR: + if (pktopt && pktopt->ip6po_rthdr) { + optdata = (void *)pktopt->ip6po_rthdr; + ip6e = (struct ip6_ext *)pktopt->ip6po_rthdr; + optdatalen = (ip6e->ip6e_len + 1) << 3; + } + break; + case IPV6_RTHDRDSTOPTS: + if (pktopt && pktopt->ip6po_dest1) { + optdata = (void *)pktopt->ip6po_dest1; + ip6e = (struct ip6_ext *)pktopt->ip6po_dest1; + optdatalen = (ip6e->ip6e_len + 1) << 3; + } + break; + case IPV6_DSTOPTS: + if (pktopt && pktopt->ip6po_dest2) { + optdata = (void *)pktopt->ip6po_dest2; + ip6e = (struct ip6_ext *)pktopt->ip6po_dest2; + optdatalen = (ip6e->ip6e_len + 1) << 3; + } + break; + case IPV6_NEXTHOP: + if (pktopt && pktopt->ip6po_nexthop) { + optdata = (void *)pktopt->ip6po_nexthop; + optdatalen = pktopt->ip6po_nexthop->sa_len; + } + break; + case IPV6_USE_MIN_MTU: + if (pktopt) + optdata = (void *)&pktopt->ip6po_minmtu; else - optdata = &deftclass; + optdata = (void *)&defminmtu; + optdatalen = sizeof(int); + break; + case IPV6_DONTFRAG: + if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG)) + on = 1; + else + on = 0; + optdata = (void *)&on; + optdatalen = sizeof(on); + 
break; + case IPV6_PREFER_TEMPADDR: + if (pktopt) + optdata = (void *)&pktopt->ip6po_prefer_tempaddr; + else + optdata = (void *)&defpreftemp; optdatalen = sizeof(int); break; default: /* should not happen */ @@ -2025,81 +2623,48 @@ ip6_getpcbopt(struct ip6_pktopts *pktopt, int optname, struct sockopt *sopt) } error = sooptcopyout(sopt, optdata, optdatalen); - return (error); -} - -static int -ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt) -{ - switch (optname) { - case IPV6_TCLASS: - { - int tclass; - if (len != sizeof(int)) - return (EINVAL); - tclass = *(int *)buf; - if (tclass < -1 || tclass > 255) - return (EINVAL); - - opt->ip6po_tclass = tclass; - break; - } - - default: - return (ENOPROTOOPT); - } /* end of switch */ - - return (0); -} - -/* - * initialize ip6_pktopts. beware that there are non-zero default values in - * the struct. - */ -void -ip6_initpktopts(opt) - struct ip6_pktopts *opt; -{ - bzero(opt, sizeof(*opt)); - opt->ip6po_hlim = -1; /* -1 means default hop limit */ - opt->ip6po_tclass = -1; /* -1 means default traffic class */ + return (error); } void -ip6_clearpktopts(pktopt, needfree, optname) +ip6_clearpktopts(pktopt, optname) struct ip6_pktopts *pktopt; - int needfree, optname; + int optname; { if (pktopt == NULL) return; - if (optname == -1) { - if (needfree && pktopt->ip6po_pktinfo) + if (optname == -1 || optname == IPV6_PKTINFO) { + if (pktopt->ip6po_pktinfo) FREE(pktopt->ip6po_pktinfo, M_IP6OPT); pktopt->ip6po_pktinfo = NULL; } - if (optname == -1) + if (optname == -1 || optname == IPV6_HOPLIMIT) pktopt->ip6po_hlim = -1; - if (optname == -1) + if (optname == -1 || optname == IPV6_TCLASS) pktopt->ip6po_tclass = -1; - if (optname == -1) { - if (needfree && pktopt->ip6po_nexthop) + if (optname == -1 || optname == IPV6_NEXTHOP) { + if (pktopt->ip6po_nextroute.ro_rt) { + rtfree(pktopt->ip6po_nextroute.ro_rt); + pktopt->ip6po_nextroute.ro_rt = NULL; + } + if (pktopt->ip6po_nexthop) FREE(pktopt->ip6po_nexthop, 
M_IP6OPT); pktopt->ip6po_nexthop = NULL; } - if (optname == -1) { - if (needfree && pktopt->ip6po_hbh) + if (optname == -1 || optname == IPV6_HOPOPTS) { + if (pktopt->ip6po_hbh) FREE(pktopt->ip6po_hbh, M_IP6OPT); pktopt->ip6po_hbh = NULL; } - if (optname == -1) { - if (needfree && pktopt->ip6po_dest1) + if (optname == -1 || optname == IPV6_RTHDRDSTOPTS) { + if (pktopt->ip6po_dest1) FREE(pktopt->ip6po_dest1, M_IP6OPT); pktopt->ip6po_dest1 = NULL; } - if (optname == -1) { - if (needfree && pktopt->ip6po_rhinfo.ip6po_rhi_rthdr) + if (optname == -1 || optname == IPV6_RTHDR) { + if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr) FREE(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT); pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL; if (pktopt->ip6po_route.ro_rt) { @@ -2107,8 +2672,8 @@ ip6_clearpktopts(pktopt, needfree, optname) pktopt->ip6po_route.ro_rt = NULL; } } - if (optname == -1) { - if (needfree && pktopt->ip6po_dest2) + if (optname == -1 || optname == IPV6_DSTOPTS) { + if (pktopt->ip6po_dest2) FREE(pktopt->ip6po_dest2, M_IP6OPT); pktopt->ip6po_dest2 = NULL; } @@ -2126,25 +2691,17 @@ do {\ }\ } while (0) -struct ip6_pktopts * -ip6_copypktopts(src, canwait) - struct ip6_pktopts *src; - int canwait; +static int +copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src, int canwait) { - struct ip6_pktopts *dst; - - if (src == NULL) { + if (dst == NULL || src == NULL) { printf("ip6_clearpktopts: invalid argument\n"); - return(NULL); + return (EINVAL); } - dst = _MALLOC(sizeof(*dst), M_IP6OPT, canwait); - if (dst == NULL && canwait == M_NOWAIT) - return (NULL); - bzero(dst, sizeof(*dst)); - dst->ip6po_hlim = src->ip6po_hlim; dst->ip6po_tclass = src->ip6po_tclass; + dst->ip6po_flags = src->ip6po_flags; if (src->ip6po_pktinfo) { dst->ip6po_pktinfo = _MALLOC(sizeof(*dst->ip6po_pktinfo), M_IP6OPT, canwait); @@ -2164,20 +2721,33 @@ ip6_copypktopts(src, canwait) PKTOPT_EXTHDRCPY(ip6po_dest1); PKTOPT_EXTHDRCPY(ip6po_dest2); PKTOPT_EXTHDRCPY(ip6po_rthdr); /* not copy the cached route 
*/ - return(dst); + return (0); bad: - if (dst->ip6po_pktinfo) FREE(dst->ip6po_pktinfo, M_IP6OPT); - if (dst->ip6po_nexthop) FREE(dst->ip6po_nexthop, M_IP6OPT); - if (dst->ip6po_hbh) FREE(dst->ip6po_hbh, M_IP6OPT); - if (dst->ip6po_dest1) FREE(dst->ip6po_dest1, M_IP6OPT); - if (dst->ip6po_dest2) FREE(dst->ip6po_dest2, M_IP6OPT); - if (dst->ip6po_rthdr) FREE(dst->ip6po_rthdr, M_IP6OPT); - FREE(dst, M_IP6OPT); - return(NULL); + ip6_clearpktopts(dst, -1); + return (ENOBUFS); } #undef PKTOPT_EXTHDRCPY +struct ip6_pktopts * +ip6_copypktopts(struct ip6_pktopts *src, int canwait) +{ + int error; + struct ip6_pktopts *dst; + + dst = _MALLOC(sizeof(*dst), M_IP6OPT, canwait); + if (dst == NULL) + return (NULL); + ip6_initpktopts(dst); + + if ((error = copypktopts(dst, src, canwait)) != 0) { + FREE(dst, M_IP6OPT); + return (NULL); + } + + return (dst); +} + void ip6_freepcbopts(pktopt) struct ip6_pktopts *pktopt; @@ -2185,707 +2755,596 @@ ip6_freepcbopts(pktopt) if (pktopt == NULL) return; - ip6_clearpktopts(pktopt, 1, -1); + ip6_clearpktopts(pktopt, -1); FREE(pktopt, M_IP6OPT); } -/* - * Set the IP6 multicast options in response to user setsockopt(). - */ -static int -ip6_setmoptions( - int optname, - struct inpcb* in6p, - struct mbuf *m) +void +ip6_moptions_init(void) { - int error = 0; - u_int loop, ifindex; - struct ipv6_mreq *mreq; - struct ifnet *ifp; - struct ip6_moptions **im6op = &in6p->in6p_moptions; - struct ip6_moptions *im6o = *im6op; - struct ip_moptions *imo; - struct route_in6 ro; - struct sockaddr_in6 *dst; - struct in6_multi_mship *imm; - - if (im6o == NULL) { - /* - * No multicast option buffer attached to the pcb; - * allocate one and initialize to default values. 
- */ - im6o = (struct ip6_moptions *) - _MALLOC(sizeof(*im6o), M_IPMOPTS, M_WAITOK); - - if (im6o == NULL) - return(ENOBUFS); - *im6op = im6o; - im6o->im6o_multicast_ifp = NULL; - im6o->im6o_multicast_hlim = ip6_defmcasthlim; - im6o->im6o_multicast_loop = IPV6_DEFAULT_MULTICAST_LOOP; - LIST_INIT(&im6o->im6o_memberships); - } - - if (in6p->inp_moptions == NULL) { - /* - * No IPv4 multicast option buffer attached to the pcb; - * call ip_createmoptions to allocate one and initialize - * to default values. - */ - error = ip_createmoptions(&in6p->inp_moptions); - if (error != 0) - return error; - } - imo = in6p->inp_moptions; - - switch (optname) { - - case IPV6_MULTICAST_IF: - /* - * Select the interface for outgoing multicast packets. - */ - if (m == NULL || m->m_len != sizeof(u_int)) { - error = EINVAL; - break; - } - bcopy(mtod(m, u_int *), &ifindex, sizeof(ifindex)); - - ifnet_head_lock_shared(); - /* Don't need to check is ifindex is < 0 since it's unsigned */ - if (if_index < ifindex) { - error = ENXIO; /* XXX EINVAL? */ - ifnet_head_done(); - break; - } - ifp = ifindex2ifnet[ifindex]; - ifnet_head_done(); - if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { - error = EADDRNOTAVAIL; - break; - } - im6o->im6o_multicast_ifp = ifp; - imo->imo_multicast_ifp = ifp; - break; - - case IPV6_MULTICAST_HOPS: - { - /* - * Set the IP6 hoplimit for outgoing multicast packets. - */ - int optval; - if (m == NULL || m->m_len != sizeof(int)) { - error = EINVAL; - break; - } - bcopy(mtod(m, u_int *), &optval, sizeof(optval)); - if (optval < -1 || optval >= 256) - error = EINVAL; - else if (optval == -1) { - im6o->im6o_multicast_hlim = ip6_defmcasthlim; - imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; - } else { - im6o->im6o_multicast_hlim = optval; - imo->imo_multicast_ttl = optval; - } - break; - } - - case IPV6_MULTICAST_LOOP: - /* - * Set the loopback flag for outgoing multicast packets. - * Must be zero or one. 
- */ - if (m == NULL || m->m_len != sizeof(u_int)) { - error = EINVAL; - break; - } - bcopy(mtod(m, u_int *), &loop, sizeof(loop)); - if (loop > 1) { - error = EINVAL; - break; - } - im6o->im6o_multicast_loop = loop; - imo->imo_multicast_loop = loop; - break; - - case IPV6_JOIN_GROUP: - /* - * Add a multicast group membership. - * Group must be a valid IP6 multicast address. - */ - if (m == NULL || m->m_len != sizeof(struct ipv6_mreq)) { - error = EINVAL; - break; - } - mreq = mtod(m, struct ipv6_mreq *); - /* - * If the interface is specified, validate it. - * - * Don't need to check if it's < 0, since it's unsigned - */ - ifnet_head_lock_shared(); - if (if_index < mreq->ipv6mr_interface) { - ifnet_head_done(); - error = ENXIO; /* XXX EINVAL? */ - break; - } - ifp = ifindex2ifnet[mreq->ipv6mr_interface]; - ifnet_head_done(); - - if (IN6_IS_ADDR_UNSPECIFIED(&mreq->ipv6mr_multiaddr)) { - /* - * We use the unspecified address to specify to accept - * all multicast addresses. Only super user is allowed - * to do this. - */ - if (suser(kauth_cred_get(), 0)) - { - error = EACCES; - break; - } - } else if (IN6_IS_ADDR_V4MAPPED(&mreq->ipv6mr_multiaddr)) { - struct ip_mreq v4req; - - v4req.imr_multiaddr.s_addr = mreq->ipv6mr_multiaddr.s6_addr32[3]; - v4req.imr_interface.s_addr = INADDR_ANY; - - /* Find an IPv4 address on the specified interface. */ - if (mreq->ipv6mr_interface != 0) { - struct in_ifaddr *ifa; - - lck_rw_lock_shared(in_ifaddr_rwlock); - TAILQ_FOREACH(ifa, &in_ifaddrhead, ia_link) { - if (ifa->ia_ifp == ifp) { - v4req.imr_interface = IA_SIN(ifa)->sin_addr; - break; - } - } - lck_rw_done(in_ifaddr_rwlock); - - if (v4req.imr_multiaddr.s_addr == 0) { - /* Interface has no IPv4 address. 
*/ - error = EINVAL; - break; - } - } - - error = ip_addmembership(imo, &v4req); - break; - } else if (!IN6_IS_ADDR_MULTICAST(&mreq->ipv6mr_multiaddr)) { - error = EINVAL; - break; - } - /* - * If no interface was explicitly specified, choose an - * appropriate one according to the given multicast address. - */ - if (mreq->ipv6mr_interface == 0) { - /* - * If the multicast address is in node-local scope, - * the interface should be a loopback interface. - * Otherwise, look up the routing table for the - * address, and choose the outgoing interface. - * XXX: is it a good approach? - */ - if (IN6_IS_ADDR_MC_NODELOCAL(&mreq->ipv6mr_multiaddr)) { - ifp = lo_ifp; - } else { - ro.ro_rt = NULL; - dst = (struct sockaddr_in6 *)&ro.ro_dst; - bzero(dst, sizeof(*dst)); - dst->sin6_len = sizeof(struct sockaddr_in6); - dst->sin6_family = AF_INET6; - dst->sin6_addr = mreq->ipv6mr_multiaddr; - rtalloc((struct route *)&ro); - if (ro.ro_rt == NULL) { - error = EADDRNOTAVAIL; - break; - } - ifp = ro.ro_rt->rt_ifp; - rtfree(ro.ro_rt); - ro.ro_rt = NULL; - } - } + PE_parse_boot_argn("ifa_debug", &im6o_debug, sizeof (im6o_debug)); - /* - * See if we found an interface, and confirm that it - * supports multicast - */ - if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { - error = EADDRNOTAVAIL; - break; - } - /* - * Put interface index into the multicast address, - * if the address has link-local scope. - */ - if (IN6_IS_ADDR_MC_LINKLOCAL(&mreq->ipv6mr_multiaddr)) { - mreq->ipv6mr_multiaddr.s6_addr16[1] - = htons(mreq->ipv6mr_interface); - } - /* - * See if the membership already exists. 
- */ - lck_mtx_lock(nd6_mutex); - for (imm = im6o->im6o_memberships.lh_first; - imm != NULL; imm = imm->i6mm_chain.le_next) - if (imm->i6mm_maddr->in6m_ifp == ifp && - IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr, - &mreq->ipv6mr_multiaddr)) - break; - if (imm != NULL) { - error = EADDRINUSE; - lck_mtx_unlock(nd6_mutex); - break; - } - /* - * Everything looks good; add a new record to the multicast - * address list for the given interface. - */ - imm = _MALLOC(sizeof(*imm), M_IPMADDR, M_WAITOK); - if (imm == NULL) { - error = ENOBUFS; - lck_mtx_unlock(nd6_mutex); - break; - } - if ((imm->i6mm_maddr = - in6_addmulti(&mreq->ipv6mr_multiaddr, ifp, &error, 1)) == NULL) { - FREE(imm, M_IPMADDR); - lck_mtx_unlock(nd6_mutex); - break; - } - LIST_INSERT_HEAD(&im6o->im6o_memberships, imm, i6mm_chain); - lck_mtx_unlock(nd6_mutex); - break; - - case IPV6_LEAVE_GROUP: - /* - * Drop a multicast group membership. - * Group must be a valid IP6 multicast address. - */ - if (m == NULL || m->m_len != sizeof(struct ipv6_mreq)) { - error = EINVAL; - break; - } - mreq = mtod(m, struct ipv6_mreq *); - /* - * If an interface address was specified, get a pointer - * to its ifnet structure. - * - * Don't need to check if it's < 0, since it's unsigned. - */ - ifnet_head_lock_shared(); - if (if_index < mreq->ipv6mr_interface) { - ifnet_head_done(); - error = ENXIO; /* XXX EINVAL? 
*/ - break; - } - ifp = ifindex2ifnet[mreq->ipv6mr_interface]; - ifnet_head_done(); - - if (IN6_IS_ADDR_UNSPECIFIED(&mreq->ipv6mr_multiaddr)) { - if (suser(kauth_cred_get(), 0)) { - error = EACCES; - break; - } - } else if (IN6_IS_ADDR_V4MAPPED(&mreq->ipv6mr_multiaddr)) { - struct ip_mreq v4req; - - v4req.imr_multiaddr.s_addr = mreq->ipv6mr_multiaddr.s6_addr32[3]; - v4req.imr_interface.s_addr = INADDR_ANY; - - if (ifp != NULL) { - struct in_ifaddr *ifa; - - lck_rw_lock_shared(in_ifaddr_rwlock); - TAILQ_FOREACH(ifa, &in_ifaddrhead, ia_link) { - if (ifa->ia_ifp == ifp) { - v4req.imr_interface = IA_SIN(ifa)->sin_addr; - break; - } - } - lck_rw_done(in_ifaddr_rwlock); - } - - error = ip_dropmembership(imo, &v4req); - break; - } else if (!IN6_IS_ADDR_MULTICAST(&mreq->ipv6mr_multiaddr)) { - error = EINVAL; - break; - } - /* - * Put interface index into the multicast address, - * if the address has link-local scope. - */ - if (IN6_IS_ADDR_MC_LINKLOCAL(&mreq->ipv6mr_multiaddr)) { - mreq->ipv6mr_multiaddr.s6_addr16[1] - = htons(mreq->ipv6mr_interface); - } - /* - * Find the membership in the membership list. - */ - lck_mtx_lock(nd6_mutex); - for (imm = im6o->im6o_memberships.lh_first; - imm != NULL; imm = imm->i6mm_chain.le_next) { - if ((ifp == NULL || - imm->i6mm_maddr->in6m_ifp == ifp) && - IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr, - &mreq->ipv6mr_multiaddr)) - break; - } - if (imm == NULL) { - /* Unable to resolve interface */ - error = EADDRNOTAVAIL; - lck_mtx_unlock(nd6_mutex); - break; - } - /* - * Give up the multicast address record to which the - * membership points. - */ - LIST_REMOVE(imm, i6mm_chain); - in6_delmulti(imm->i6mm_maddr, 1); - lck_mtx_unlock(nd6_mutex); - FREE(imm, M_IPMADDR); - break; - - default: - error = EOPNOTSUPP; - break; + im6o_size = (im6o_debug == 0) ? 
sizeof (struct ip6_moptions) : + sizeof (struct ip6_moptions_dbg); + + im6o_zone = zinit(im6o_size, IM6O_ZONE_MAX * im6o_size, 0, + IM6O_ZONE_NAME); + if (im6o_zone == NULL) { + panic("%s: failed allocating %s", __func__, IM6O_ZONE_NAME); + /* NOTREACHED */ } + zone_change(im6o_zone, Z_EXPAND, TRUE); +} - /* - * If all options have default values, no need to keep the mbuf. - */ - lck_mtx_lock(nd6_mutex); - if (im6o->im6o_multicast_ifp == NULL && - im6o->im6o_multicast_hlim == ip6_defmcasthlim && - im6o->im6o_multicast_loop == IPV6_DEFAULT_MULTICAST_LOOP && - im6o->im6o_memberships.lh_first == NULL) { - FREE(*im6op, M_IPMOPTS); - *im6op = NULL; - } - if (imo->imo_multicast_ifp == NULL && - imo->imo_multicast_vif == -1 && - imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL && - imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP && - imo->imo_num_memberships == 0) { - ip_freemoptions(imo); - in6p->inp_moptions = 0; - } - lck_mtx_unlock(nd6_mutex); +void +im6o_addref(struct ip6_moptions *im6o, int locked) +{ + if (!locked) + IM6O_LOCK(im6o); + else + IM6O_LOCK_ASSERT_HELD(im6o); - return(error); + if (++im6o->im6o_refcnt == 0) { + panic("%s: im6o %p wraparound refcnt\n", __func__, im6o); + /* NOTREACHED */ + } else if (im6o->im6o_trace != NULL) { + (*im6o->im6o_trace)(im6o, TRUE); + } + + if (!locked) + IM6O_UNLOCK(im6o); } -/* - * Return the IP6 multicast options in response to user getsockopt(). 
- */ -static int -ip6_getmoptions(optname, im6o, mp) - int optname; - struct ip6_moptions *im6o; - struct mbuf **mp; +void +im6o_remref(struct ip6_moptions *im6o) { - u_int *hlim, *loop, *ifindex; + int i; - *mp = m_get(M_WAIT, MT_HEADER); /*XXX*/ - if (*mp == NULL) - return ENOBUFS; + IM6O_LOCK(im6o); + if (im6o->im6o_refcnt == 0) { + panic("%s: im6o %p negative refcnt", __func__, im6o); + /* NOTREACHED */ + } else if (im6o->im6o_trace != NULL) { + (*im6o->im6o_trace)(im6o, FALSE); + } - switch (optname) { + --im6o->im6o_refcnt; + if (im6o->im6o_refcnt > 0) { + IM6O_UNLOCK(im6o); + return; + } - case IPV6_MULTICAST_IF: - ifindex = mtod(*mp, u_int *); - (*mp)->m_len = sizeof(u_int); - if (im6o == NULL || im6o->im6o_multicast_ifp == NULL) - *ifindex = 0; - else - *ifindex = im6o->im6o_multicast_ifp->if_index; - return(0); + for (i = 0; i < im6o->im6o_num_memberships; ++i) { + struct in6_mfilter *imf; - case IPV6_MULTICAST_HOPS: - hlim = mtod(*mp, u_int *); - (*mp)->m_len = sizeof(u_int); - if (im6o == NULL) - *hlim = ip6_defmcasthlim; - else - *hlim = im6o->im6o_multicast_hlim; - return(0); + imf = im6o->im6o_mfilters ? 
&im6o->im6o_mfilters[i] : NULL; + if (imf != NULL) + im6f_leave(imf); - case IPV6_MULTICAST_LOOP: - loop = mtod(*mp, u_int *); - (*mp)->m_len = sizeof(u_int); - if (im6o == NULL) - *loop = ip6_defmcasthlim; - else - *loop = im6o->im6o_multicast_loop; - return(0); + (void) in6_mc_leave(im6o->im6o_membership[i], imf); - default: - return(EOPNOTSUPP); + if (imf != NULL) + im6f_purge(imf); + + IN6M_REMREF(im6o->im6o_membership[i]); + im6o->im6o_membership[i] = NULL; + } + im6o->im6o_num_memberships = 0; + if (im6o->im6o_mfilters != NULL) { + FREE(im6o->im6o_mfilters, M_IN6MFILTER); + im6o->im6o_mfilters = NULL; + } + if (im6o->im6o_membership != NULL) { + FREE(im6o->im6o_membership, M_IP6MOPTS); + im6o->im6o_membership = NULL; + } + IM6O_UNLOCK(im6o); + + lck_mtx_destroy(&im6o->im6o_lock, ifa_mtx_grp); + + if (!(im6o->im6o_debug & IFD_ALLOC)) { + panic("%s: im6o %p cannot be freed", __func__, im6o); + /* NOTREACHED */ } + zfree(im6o_zone, im6o); } -/* - * Discard the IP6 multicast options. - */ -void -ip6_freemoptions(im6o) - struct ip6_moptions *im6o; +static void +im6o_trace(struct ip6_moptions *im6o, int refhold) { - struct in6_multi_mship *imm; + struct ip6_moptions_dbg *im6o_dbg = (struct ip6_moptions_dbg *)im6o; + ctrace_t *tr; + u_int32_t idx; + u_int16_t *cnt; - if (im6o == NULL) - return; - - lck_mtx_lock(nd6_mutex); - while ((imm = im6o->im6o_memberships.lh_first) != NULL) { - LIST_REMOVE(imm, i6mm_chain); - if (imm->i6mm_maddr) - in6_delmulti(imm->i6mm_maddr, 1); - FREE(imm, M_IPMADDR); - } - lck_mtx_unlock(nd6_mutex); - FREE(im6o, M_IPMOPTS); + if (!(im6o->im6o_debug & IFD_DEBUG)) { + panic("%s: im6o %p has no debug structure", __func__, im6o); + /* NOTREACHED */ + } + if (refhold) { + cnt = &im6o_dbg->im6o_refhold_cnt; + tr = im6o_dbg->im6o_refhold; + } else { + cnt = &im6o_dbg->im6o_refrele_cnt; + tr = im6o_dbg->im6o_refrele; + } + + idx = atomic_add_16_ov(cnt, 1) % IM6O_TRACE_HIST_SIZE; + ctrace_record(&tr[idx]); +} + +struct ip6_moptions * 
+ip6_allocmoptions(int how) +{ + struct ip6_moptions *im6o; + + im6o = (how == M_WAITOK) ? + zalloc(im6o_zone) : zalloc_noblock(im6o_zone); + if (im6o != NULL) { + bzero(im6o, im6o_size); + lck_mtx_init(&im6o->im6o_lock, ifa_mtx_grp, ifa_mtx_attr); + im6o->im6o_debug |= IFD_ALLOC; + if (im6o_debug != 0) { + im6o->im6o_debug |= IFD_DEBUG; + im6o->im6o_trace = im6o_trace; + } + IM6O_ADDREF(im6o); + } + + return (im6o); } /* * Set IPv6 outgoing packet options based on advanced API. */ int -ip6_setpktoptions(control, opt, priv, needcopy) - struct mbuf *control; - struct ip6_pktopts *opt; - int priv, needcopy; +ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt, + struct ip6_pktopts *stickyopt, int uproto) { struct cmsghdr *cm = 0; - if (control == 0 || opt == 0) - return(EINVAL); + if (control == NULL || opt == NULL) + return (EINVAL); ip6_initpktopts(opt); + if (stickyopt) { + int error; + + /* + * If stickyopt is provided, make a local copy of the options + * for this particular packet, then override them by ancillary + * objects. + * XXX: copypktopts() does not copy the cached route to a next + * hop (if any). This is not very good in terms of efficiency, + * but we can allow this since this option should be rarely + * used. + */ + if ((error = copypktopts(opt, stickyopt, M_NOWAIT)) != 0) + return (error); + } /* * XXX: Currently, we assume all the optional information is stored * in a single mbuf. 
*/ if (control->m_next) - return(EINVAL); + return (EINVAL); - for (; control->m_len; control->m_data += CMSG_ALIGN(cm->cmsg_len), - control->m_len -= CMSG_ALIGN(cm->cmsg_len)) { - cm = mtod(control, struct cmsghdr *); - if (cm->cmsg_len == 0 || cm->cmsg_len > control->m_len) - return(EINVAL); + if (control->m_len < CMSG_LEN(0)) + return (EINVAL); + + for (cm = M_FIRST_CMSGHDR(control); cm; cm = M_NXT_CMSGHDR(control, cm)) { + int error; + + if (cm->cmsg_len < sizeof(struct cmsghdr) || cm->cmsg_len > control->m_len) + return (EINVAL); if (cm->cmsg_level != IPPROTO_IPV6) continue; + error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm), + cm->cmsg_len - CMSG_LEN(0), opt, 0, 1, uproto); + if (error) + return (error); + } + + return (0); +} +/* + * Set a particular packet option, as a sticky option or an ancillary data + * item. "len" can be 0 only when it's a sticky option. + * We have 4 cases of combination of "sticky" and "cmsg": + * "sticky=0, cmsg=0": impossible + * "sticky=0, cmsg=1": RFC2292 or RFC3542 ancillary data + * "sticky=1, cmsg=0": RFC3542 socket option + * "sticky=1, cmsg=1": RFC2292 socket option + */ +static int +ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt, + int sticky, int cmsg, int uproto) +{ + int minmtupolicy, preftemp; + int error; + + if (!sticky && !cmsg) { +#ifdef DIAGNOSTIC + printf("ip6_setpktopt: impossible case\n"); +#endif + return (EINVAL); + } + + /* + * IPV6_2292xxx is for backward compatibility to RFC2292, and should + * not be specified in the context of RFC3542. Conversely, + * RFC3542 types should not be specified in the context of RFC2292. 
+ */ + if (!cmsg) { + switch (optname) { + case IPV6_2292PKTINFO: + case IPV6_2292HOPLIMIT: + case IPV6_2292NEXTHOP: + case IPV6_2292HOPOPTS: + case IPV6_2292DSTOPTS: + case IPV6_2292RTHDR: + case IPV6_2292PKTOPTIONS: + return (ENOPROTOOPT); + } + } + if (sticky && cmsg) { + switch (optname) { + case IPV6_PKTINFO: + case IPV6_HOPLIMIT: + case IPV6_NEXTHOP: + case IPV6_HOPOPTS: + case IPV6_DSTOPTS: + case IPV6_RTHDRDSTOPTS: + case IPV6_RTHDR: + case IPV6_USE_MIN_MTU: + case IPV6_DONTFRAG: + case IPV6_TCLASS: + case IPV6_PREFER_TEMPADDR: /* XXX: not an RFC3542 option */ + return (ENOPROTOOPT); + } + } + + switch (optname) { + case IPV6_2292PKTINFO: + case IPV6_PKTINFO: + { + struct ifnet *ifp = NULL; + struct in6_pktinfo *pktinfo; + + if (len != sizeof(struct in6_pktinfo)) + return (EINVAL); + + pktinfo = (struct in6_pktinfo *)buf; + /* - * XXX should check if RFC2292 API is mixed with 2292bis API + * An application can clear any sticky IPV6_PKTINFO option by + * doing a "regular" setsockopt with ipi6_addr being + * in6addr_any and ipi6_ifindex being zero. + * [RFC 3542, Section 6] */ - switch (cm->cmsg_type) { - case IPV6_PKTINFO: - if (cm->cmsg_len != CMSG_LEN(sizeof(struct in6_pktinfo))) - return(EINVAL); - if (needcopy) { - /* XXX: Is it really WAITOK? 
*/ - opt->ip6po_pktinfo = - _MALLOC(sizeof(struct in6_pktinfo), - M_IP6OPT, M_WAITOK); - if (opt->ip6po_pktinfo == NULL) - return ENOBUFS; - bcopy(CMSG_DATA(cm), opt->ip6po_pktinfo, - sizeof(struct in6_pktinfo)); - } else - opt->ip6po_pktinfo = - (struct in6_pktinfo *)CMSG_DATA(cm); - if (opt->ip6po_pktinfo->ipi6_ifindex && - IN6_IS_ADDR_LINKLOCAL(&opt->ip6po_pktinfo->ipi6_addr)) - opt->ip6po_pktinfo->ipi6_addr.s6_addr16[1] = - htons(opt->ip6po_pktinfo->ipi6_ifindex); - - if (opt->ip6po_pktinfo->ipi6_ifindex > if_index) { - return(ENXIO); - } + if (optname == IPV6_PKTINFO && opt->ip6po_pktinfo && + pktinfo->ipi6_ifindex == 0 && + IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { + ip6_clearpktopts(opt, optname); + break; + } - /* - * Check if the requested source address is indeed a - * unicast address assigned to the node, and can be - * used as the packet's source address. - */ - if (!IN6_IS_ADDR_UNSPECIFIED(&opt->ip6po_pktinfo->ipi6_addr)) { - struct in6_ifaddr *ia6; - struct sockaddr_in6 sin6; - - bzero(&sin6, sizeof(sin6)); - sin6.sin6_len = sizeof(sin6); - sin6.sin6_family = AF_INET6; - sin6.sin6_addr = - opt->ip6po_pktinfo->ipi6_addr; - ia6 = (struct in6_ifaddr *)ifa_ifwithaddr(sin6tosa(&sin6)); - if (ia6 == NULL || - (ia6->ia6_flags & (IN6_IFF_ANYCAST | - IN6_IFF_NOTREADY)) != 0) { - if (ia6) ifafree(&ia6->ia_ifa); - return(EADDRNOTAVAIL); - } - ifafree(&ia6->ia_ifa); - ia6 = NULL; + if (uproto == IPPROTO_TCP && optname == IPV6_PKTINFO && + sticky && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { + return (EINVAL); + } + + /* validate the interface index if specified. 
*/ + ifnet_head_lock_shared(); + + if (pktinfo->ipi6_ifindex > if_index) { + ifnet_head_done(); + return (ENXIO); + } + + if (pktinfo->ipi6_ifindex) { + ifp = ifindex2ifnet[pktinfo->ipi6_ifindex]; + if (ifp == NULL) { + ifnet_head_done(); + return (ENXIO); } - break; + } + + ifnet_head_done(); - case IPV6_HOPLIMIT: - if (cm->cmsg_len != CMSG_LEN(sizeof(int))) - return(EINVAL); + /* + * We store the address anyway, and let in6_selectsrc() + * validate the specified address. This is because ipi6_addr + * may not have enough information about its scope zone, and + * we may need additional information (such as outgoing + * interface or the scope zone of a destination address) to + * disambiguate the scope. + * XXX: the delay of the validation may confuse the + * application when it is used as a sticky option. + */ + if (opt->ip6po_pktinfo == NULL) { + opt->ip6po_pktinfo = _MALLOC(sizeof(*pktinfo), + M_IP6OPT, M_NOWAIT); + if (opt->ip6po_pktinfo == NULL) + return (ENOBUFS); + } + bcopy(pktinfo, opt->ip6po_pktinfo, sizeof(*pktinfo)); + break; + } - opt->ip6po_hlim = *(int *)CMSG_DATA(cm); - if (opt->ip6po_hlim < -1 || opt->ip6po_hlim > 255) - return(EINVAL); - break; + case IPV6_2292HOPLIMIT: + case IPV6_HOPLIMIT: + { + int *hlimp; - case IPV6_TCLASS: - if (cm->cmsg_len != CMSG_LEN(sizeof(int))) - return(EINVAL); + /* + * RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT + * to simplify the ordering among hoplimit options. 
+ */ + if (optname == IPV6_HOPLIMIT && sticky) + return (ENOPROTOOPT); - opt->ip6po_tclass = *(int *)CMSG_DATA(cm); - if (opt->ip6po_tclass < -1 || opt->ip6po_tclass > 255) - return (EINVAL); - break; + if (len != sizeof(int)) + return (EINVAL); + hlimp = (int *)buf; + if (*hlimp < -1 || *hlimp > 255) + return (EINVAL); - case IPV6_NEXTHOP: - if (!priv) - return(EPERM); - - if (cm->cmsg_len < sizeof(u_char) || - /* check if cmsg_len is large enough for sa_len */ - cm->cmsg_len < CMSG_LEN(*CMSG_DATA(cm))) - return(EINVAL); - - if (needcopy) { - opt->ip6po_nexthop = - _MALLOC(*CMSG_DATA(cm), - M_IP6OPT, M_WAITOK); - if (opt->ip6po_nexthop == NULL) - return ENOBUFS; - bcopy(CMSG_DATA(cm), - opt->ip6po_nexthop, - *CMSG_DATA(cm)); - } else - opt->ip6po_nexthop = - (struct sockaddr *)CMSG_DATA(cm); + opt->ip6po_hlim = *hlimp; + break; + } + + case IPV6_TCLASS: + { + int tclass; + + if (len != sizeof(int)) + return (EINVAL); + tclass = *(int *)buf; + if (tclass < -1 || tclass > 255) + return (EINVAL); + + opt->ip6po_tclass = tclass; + break; + } + + case IPV6_2292NEXTHOP: + case IPV6_NEXTHOP: + error = suser(kauth_cred_get(), 0); + if (error) + return (EACCES); + + if (len == 0) { /* just remove the option */ + ip6_clearpktopts(opt, IPV6_NEXTHOP); break; + } - case IPV6_HOPOPTS: + /* check if cmsg_len is large enough for sa_len */ + if (len < sizeof(struct sockaddr) || len < *buf) + return (EINVAL); + + switch (((struct sockaddr *)buf)->sa_family) { + case AF_INET6: { - struct ip6_hbh *hbh; - int hbhlen; - - if (cm->cmsg_len < CMSG_LEN(sizeof(struct ip6_hbh))) - return(EINVAL); - hbh = (struct ip6_hbh *)CMSG_DATA(cm); - hbhlen = (hbh->ip6h_len + 1) << 3; - if (cm->cmsg_len != CMSG_LEN(hbhlen)) - return(EINVAL); - - if (needcopy) { - opt->ip6po_hbh = - _MALLOC(hbhlen, M_IP6OPT, M_WAITOK); - if (opt->ip6po_hbh == NULL) - return ENOBUFS; - bcopy(hbh, opt->ip6po_hbh, hbhlen); - } else - opt->ip6po_hbh = hbh; + struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)buf; + + if 
(sa6->sin6_len != sizeof(struct sockaddr_in6)) + return (EINVAL); + + if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) || + IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr)) { + return (EINVAL); + } + if ((error = sa6_embedscope(sa6, ip6_use_defzone)) + != 0) { + return (error); + } break; } + case AF_LINK: /* should eventually be supported */ + default: + return (EAFNOSUPPORT); + } - case IPV6_DSTOPTS: - { - struct ip6_dest *dest, **newdest; - int destlen; - - if (cm->cmsg_len < CMSG_LEN(sizeof(struct ip6_dest))) - return(EINVAL); - dest = (struct ip6_dest *)CMSG_DATA(cm); - destlen = (dest->ip6d_len + 1) << 3; - if (cm->cmsg_len != CMSG_LEN(destlen)) - return(EINVAL); - - /* - * The old advacned API is ambiguous on this - * point. Our approach is to determine the - * position based according to the existence - * of a routing header. Note, however, that - * this depends on the order of the extension - * headers in the ancillary data; the 1st part - * of the destination options header must - * appear before the routing header in the - * ancillary data, too. - * RFC2292bis solved the ambiguity by - * introducing separate cmsg types. + /* turn off the previous option, then set the new option. */ + ip6_clearpktopts(opt, IPV6_NEXTHOP); + opt->ip6po_nexthop = _MALLOC(*buf, M_IP6OPT, M_NOWAIT); + if (opt->ip6po_nexthop == NULL) + return (ENOBUFS); + bcopy(buf, opt->ip6po_nexthop, *buf); + break; + + case IPV6_2292HOPOPTS: + case IPV6_HOPOPTS: + { + struct ip6_hbh *hbh; + int hbhlen; + + /* + * XXX: We don't allow a non-privileged user to set ANY HbH + * options, since per-option restriction has too much + * overhead. 
+ */ + error = suser(kauth_cred_get(), 0); + if (error) + return (EACCES); + + if (len == 0) { + ip6_clearpktopts(opt, IPV6_HOPOPTS); + break; /* just remove the option */ + } + + /* message length validation */ + if (len < sizeof(struct ip6_hbh)) + return (EINVAL); + hbh = (struct ip6_hbh *)buf; + hbhlen = (hbh->ip6h_len + 1) << 3; + if (len != hbhlen) + return (EINVAL); + + /* turn off the previous option, then set the new option. */ + ip6_clearpktopts(opt, IPV6_HOPOPTS); + opt->ip6po_hbh = _MALLOC(hbhlen, M_IP6OPT, M_NOWAIT); + if (opt->ip6po_hbh == NULL) + return (ENOBUFS); + bcopy(hbh, opt->ip6po_hbh, hbhlen); + + break; + } + + case IPV6_2292DSTOPTS: + case IPV6_DSTOPTS: + case IPV6_RTHDRDSTOPTS: + { + struct ip6_dest *dest, **newdest = NULL; + int destlen; + + error = suser(kauth_cred_get(), 0); + if (error) + return (EACCES); + + if (len == 0) { + ip6_clearpktopts(opt, optname); + break; /* just remove the option */ + } + + /* message length validation */ + if (len < sizeof(struct ip6_dest)) + return (EINVAL); + dest = (struct ip6_dest *)buf; + destlen = (dest->ip6d_len + 1) << 3; + if (len != destlen) + return (EINVAL); + + /* + * Determine the position that the destination options header + * should be inserted; before or after the routing header. + */ + switch (optname) { + case IPV6_2292DSTOPTS: + /* + * The old advanced API is ambiguous on this point. + * Our approach is to determine the position based + * according to the existence of a routing header. + * Note, however, that this depends on the order of the + * extension headers in the ancillary data; the 1st + * part of the destination options header must appear + * before the routing header in the ancillary data, + * too. + * RFC3542 solved the ambiguity by introducing + * separate ancillary data or option types. 
*/ if (opt->ip6po_rthdr == NULL) newdest = &opt->ip6po_dest1; else newdest = &opt->ip6po_dest2; - - if (needcopy) { - *newdest = _MALLOC(destlen, M_IP6OPT, M_WAITOK); - if (*newdest == NULL) - return ENOBUFS; - bcopy(dest, *newdest, destlen); - } else - *newdest = dest; - + break; + case IPV6_RTHDRDSTOPTS: + newdest = &opt->ip6po_dest1; + break; + case IPV6_DSTOPTS: + newdest = &opt->ip6po_dest2; break; } - case IPV6_RTHDR: - { - struct ip6_rthdr *rth; - int rthlen; - - if (cm->cmsg_len < CMSG_LEN(sizeof(struct ip6_rthdr))) - return(EINVAL); - rth = (struct ip6_rthdr *)CMSG_DATA(cm); - rthlen = (rth->ip6r_len + 1) << 3; - if (cm->cmsg_len != CMSG_LEN(rthlen)) - return(EINVAL); - - switch (rth->ip6r_type) { - case IPV6_RTHDR_TYPE_0: - /* must contain one addr */ - if (rth->ip6r_len == 0) - return(EINVAL); - /* length must be even */ - if (rth->ip6r_len % 2) - return(EINVAL); - if (rth->ip6r_len / 2 != rth->ip6r_segleft) - return(EINVAL); - break; - default: - return(EINVAL); /* not supported */ - } + /* turn off the previous option, then set the new option. 
*/ + ip6_clearpktopts(opt, optname); + *newdest = _MALLOC(destlen, M_IP6OPT, M_NOWAIT); + if (*newdest == NULL) + return (ENOBUFS); + bcopy(dest, *newdest, destlen); - if (needcopy) { - opt->ip6po_rthdr = _MALLOC(rthlen, M_IP6OPT, - M_WAITOK); - if (opt->ip6po_rthdr == NULL) - return ENOBUFS; - bcopy(rth, opt->ip6po_rthdr, rthlen); - } else - opt->ip6po_rthdr = rth; + break; + } - break; + case IPV6_2292RTHDR: + case IPV6_RTHDR: + { + struct ip6_rthdr *rth; + int rthlen; + + if (len == 0) { + ip6_clearpktopts(opt, IPV6_RTHDR); + break; /* just remove the option */ } + /* message length validation */ + if (len < sizeof(struct ip6_rthdr)) + return (EINVAL); + rth = (struct ip6_rthdr *)buf; + rthlen = (rth->ip6r_len + 1) << 3; + if (len != rthlen) + return (EINVAL); + + switch (rth->ip6r_type) { + case IPV6_RTHDR_TYPE_0: + if (rth->ip6r_len == 0) /* must contain one addr */ + return (EINVAL); + if (rth->ip6r_len % 2) /* length must be even */ + return (EINVAL); + if (rth->ip6r_len / 2 != rth->ip6r_segleft) + return (EINVAL); + break; default: - return(ENOPROTOOPT); + return (EINVAL); /* not supported */ } + + /* turn off the previous option */ + ip6_clearpktopts(opt, IPV6_RTHDR); + opt->ip6po_rthdr = _MALLOC(rthlen, M_IP6OPT, M_NOWAIT); + if (opt->ip6po_rthdr == NULL) + return (ENOBUFS); + bcopy(rth, opt->ip6po_rthdr, rthlen); + + break; } - return(0); + case IPV6_USE_MIN_MTU: + if (len != sizeof(int)) + return (EINVAL); + minmtupolicy = *(int *)buf; + if (minmtupolicy != IP6PO_MINMTU_MCASTONLY && + minmtupolicy != IP6PO_MINMTU_DISABLE && + minmtupolicy != IP6PO_MINMTU_ALL) { + return (EINVAL); + } + opt->ip6po_minmtu = minmtupolicy; + break; + + case IPV6_DONTFRAG: + if (len != sizeof(int)) + return (EINVAL); + + if (uproto == IPPROTO_TCP || *(int *)buf == 0) { + /* + * we ignore this option for TCP sockets. + * (RFC3542 leaves this case unspecified.) 
+ */ + opt->ip6po_flags &= ~IP6PO_DONTFRAG; + } else + opt->ip6po_flags |= IP6PO_DONTFRAG; + break; + + case IPV6_PREFER_TEMPADDR: + if (len != sizeof(int)) + return (EINVAL); + preftemp = *(int *)buf; + if (preftemp != IP6PO_TEMPADDR_SYSTEM && + preftemp != IP6PO_TEMPADDR_NOTPREFER && + preftemp != IP6PO_TEMPADDR_PREFER) { + return (EINVAL); + } + opt->ip6po_prefer_tempaddr = preftemp; + break; + + default: + return (ENOPROTOOPT); + } /* end of switch */ + + return (0); } /* @@ -2927,28 +3386,28 @@ ip6_mloopback( #endif ip6 = mtod(copym, struct ip6_hdr *); -#ifndef SCOPEDROUTING /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); -#endif #ifdef __APPLE__ /* Makes sure the HW checksum flags are cleaned before sending the packet */ + if ((copym->m_pkthdr.csum_flags & CSUM_DELAY_IPV6_DATA) != 0) { + in6_delayed_cksum(copym, sizeof(struct ip6_hdr)); + copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_IPV6_DATA; + } copym->m_pkthdr.rcvif = 0; copym->m_pkthdr.csum_data = 0; copym->m_pkthdr.csum_flags = 0; if (lo_ifp) { copym->m_pkthdr.rcvif = ifp; - lck_mtx_unlock(ip6_mutex); dlil_output(lo_ifp, PF_INET6, copym, 0, (struct sockaddr *)dst, 0); - lck_mtx_lock(ip6_mutex); } else m_free(copym); #else @@ -3002,7 +3461,7 @@ ip6_optlen(in6p) len = 0; #define elen(x) \ - (((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0) + (((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0) len += elen(in6p->in6p_outputopts->ip6po_hbh); if (in6p->in6p_outputopts->ip6po_rthdr) @@ -3013,4 +3472,3 @@ ip6_optlen(in6p) return len; #undef elen } - diff --git a/bsd/netinet6/ip6_var.h b/bsd/netinet6/ip6_var.h index 9aa8e0e3f..acb9c3857 100644 --- a/bsd/netinet6/ip6_var.h +++ b/bsd/netinet6/ip6_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -97,37 +97,29 @@ #define _NETINET6_IP6_VAR_H_ #include -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE /* * IP6 reassembly queue structure. Each fragment * being reassembled is attached to one of these structures. */ struct ip6q { - u_int32_t ip6q_head; - u_int16_t ip6q_len; - u_int8_t ip6q_nxt; /* ip6f_nxt in first fragment */ - u_int8_t ip6q_hlim; struct ip6asfrag *ip6q_down; struct ip6asfrag *ip6q_up; u_int32_t ip6q_ident; - u_int8_t ip6q_arrive; + u_int8_t ip6q_nxt; + u_int8_t ip6q_ecn; u_int8_t ip6q_ttl; - struct in6_addr ip6q_src, ip6q_dst; + struct in6_addr ip6q_src, ip6q_dst; struct ip6q *ip6q_next; struct ip6q *ip6q_prev; int ip6q_unfrglen; /* len of unfragmentable part */ #if notyet u_char *ip6q_nxtp; #endif - int ip6q_nfrag; /* number of fragments */ + int ip6q_nfrag; /* # of fragments */ }; struct ip6asfrag { - u_int32_t ip6af_head; - u_int16_t ip6af_len; - u_int8_t ip6af_nxt; - u_int8_t ip6af_hlim; - /* must not override the above members during reassembling */ struct ip6asfrag *ip6af_down; struct ip6asfrag *ip6af_up; struct mbuf *ip6af_m; @@ -140,12 +132,49 @@ struct ip6asfrag { #define IP6_REASS_MBUF(ip6af) (*(struct mbuf **)&((ip6af)->ip6af_m)) struct ip6_moptions { + decl_lck_mtx_data(, im6o_lock); + uint32_t im6o_refcnt; /* ref count */ + uint32_t im6o_debug; /* see ifa_debug flags */ struct ifnet *im6o_multicast_ifp; /* ifp for outgoing multicasts */ u_char im6o_multicast_hlim; /* hoplimit for outgoing multicasts */ u_char im6o_multicast_loop; /* 1 >= hear sends if a member */ - LIST_HEAD(, in6_multi_mship) im6o_memberships; + u_short im6o_num_memberships; /* no. 
memberships this socket */ + u_short im6o_max_memberships; /* max memberships this socket */ + struct in6_multi **im6o_membership; /* group memberships */ + struct in6_mfilter *im6o_mfilters; /* source filters */ + void (*im6o_trace) /* callback fn for tracing refs */ + (struct ip6_moptions *, int); }; +#define IM6O_LOCK_ASSERT_HELD(_im6o) \ + lck_mtx_assert(&(_im6o)->im6o_lock, LCK_MTX_ASSERT_OWNED) + +#define IM6O_LOCK_ASSERT_NOTHELD(_im6o) \ + lck_mtx_assert(&(_im6o)->im6o_lock, LCK_MTX_ASSERT_NOTOWNED) + +#define IM6O_LOCK(_im6o) \ + lck_mtx_lock(&(_im6o)->im6o_lock) + +#define IM6O_LOCK_SPIN(_im6o) \ + lck_mtx_lock_spin(&(_im6o)->im6o_lock) + +#define IM6O_CONVERT_LOCK(_im6o) do { \ + IM6O_LOCK_ASSERT_HELD(_im6o); \ + lck_mtx_convert_spin(&(_im6o)->im6o_lock); \ +} while (0) + +#define IM6O_UNLOCK(_im6o) \ + lck_mtx_unlock(&(_im6o)->im6o_lock) + +#define IM6O_ADDREF(_im6o) \ + im6o_addref(_im6o, 0) + +#define IM6O_ADDREF_LOCKED(_im6o) \ + im6o_addref(_im6o, 1) + +#define IM6O_REMREF(_im6o) \ + im6o_remref(_im6o) + /* * Control options for outgoing packets */ @@ -158,6 +187,14 @@ struct ip6po_rhinfo { #define ip6po_rthdr ip6po_rhinfo.ip6po_rhi_rthdr #define ip6po_route ip6po_rhinfo.ip6po_rhi_route +/* Nexthop related info */ +struct ip6po_nhinfo { + struct sockaddr *ip6po_nhi_nexthop; + struct route_in6 ip6po_nhi_route; /* Route to the nexthop */ +}; +#define ip6po_nexthop ip6po_nhinfo.ip6po_nhi_nexthop +#define ip6po_nextroute ip6po_nhinfo.ip6po_nhi_route + struct ip6_pktopts { struct mbuf *ip6po_m; /* Pointer to mbuf storing the data */ int ip6po_hlim; /* Hoplimit for outgoing packets */ @@ -165,8 +202,9 @@ struct ip6_pktopts { /* Outgoing IF/address information */ struct in6_pktinfo *ip6po_pktinfo; - struct sockaddr *ip6po_nexthop; /* Next-hop address */ - + /* Next-hop address information */ + struct ip6po_nhinfo ip6po_nhinfo; + struct ip6_hbh *ip6po_hbh; /* Hop-by-Hop options header */ /* Destination options header (before a routing header) */ @@ -178,13 
+216,32 @@ struct ip6_pktopts { /* Destination options header (after a routing header) */ struct ip6_dest *ip6po_dest2; - int ip6po_tclass; /* traffic class */ + int ip6po_tclass; /* traffic class */ + + int ip6po_minmtu; /* fragment vs PMTU discovery policy */ +#define IP6PO_MINMTU_MCASTONLY -1 /* default; send at min MTU for multicast*/ +#define IP6PO_MINMTU_DISABLE 0 /* always perform pmtu disc */ +#define IP6PO_MINMTU_ALL 1 /* always send at min MTU */ + + int ip6po_prefer_tempaddr; /* whether temporary addresses are + preferred as source address */ +#define IP6PO_TEMPADDR_SYSTEM -1 /* follow the system default */ +#define IP6PO_TEMPADDR_NOTPREFER 0 /* not prefer temporary address */ +#define IP6PO_TEMPADDR_PREFER 1 /* prefer temporary address */ + + int ip6po_flags; +#if 0 /* parameters in this block are obsolete. do not reuse the values. */ +#define IP6PO_REACHCONF 0x01 /* upper-layer reachability confirmation. */ +#define IP6PO_MINMTU 0x02 /* use minimum MTU (IPV6_USE_MIN_MTU) */ +#endif +#define IP6PO_DONTFRAG 0x04 /* disable fragmentation (IPV6_DONTFRAG) */ +#define IP6PO_USECOA 0x08 /* use care of address */ }; /* * Control options for incoming packets */ -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ struct ip6stat { u_quad_t ip6s_total; /* total packets received */ @@ -201,7 +258,7 @@ struct ip6stat { u_quad_t ip6s_localout; /* total ip packets generated here */ u_quad_t ip6s_odropped; /* lost packets due to nobufs, etc. */ u_quad_t ip6s_reassembled; /* total packets reassembled ok */ - u_quad_t ip6s_fragmented; /* datagrams sucessfully fragmented */ + u_quad_t ip6s_fragmented; /* datagrams successfully fragmented */ u_quad_t ip6s_ofragments; /* output fragments created */ u_quad_t ip6s_cantfrag; /* don't fragment flag was set, etc. */ u_quad_t ip6s_badoptions; /* error in option processing */ @@ -240,11 +297,17 @@ struct ip6stat { * from the destination is chosen. 
*/ u_quad_t ip6s_sources_otherscope[16]; - /* number of times that an deprecated address is chosen */ + /* number of times that a deprecated address is chosen */ u_quad_t ip6s_sources_deprecated[16]; u_quad_t ip6s_forward_cachehit; u_quad_t ip6s_forward_cachemiss; + + /* number of times that each rule of source selection is applied. */ + u_quad_t ip6s_sources_rule[16]; +#ifdef PRIVATE + u_quad_t ip6s_pktdropcntrl; /* pkt dropped, no mbufs for control data */ +#endif /* PRIVATE */ }; #ifdef KERNEL_PRIVATE @@ -279,12 +342,28 @@ struct ip6aux { }; /* flags passed to ip6_output as last parameter */ -#define IPV6_DADOUTPUT 0x01 /* DAD */ +#define IPV6_UNSPECSRC 0x01 /* allow :: as the source address */ #define IPV6_FORWARDING 0x02 /* most of IPv6 header exists */ #define IPV6_MINMTU 0x04 /* use minimum MTU (IPV6_USE_MIN_MTU) */ +#define IPV6_FLAG_NOSRCIFSEL 0x80 /* bypass source address selection */ +#define IPV6_OUTARGS 0x100 /* has ancillary output info */ + +#ifdef __NO_STRICT_ALIGNMENT +#define IP6_HDR_ALIGNED_P(ip) 1 +#else +#define IP6_HDR_ALIGNED_P(ip) ((((intptr_t) (ip)) & 3) == 0) +#endif + +/* + * Extra information passed to ip6_output when IPV6_OUTARGS is set. + */ +struct ip6_out_args { + unsigned int ip6oa_boundif; /* bound outgoing interface */ + unsigned int ip6oa_nocell; /* don't use IFT_CELLULAR */ +}; extern struct ip6stat ip6stat; /* statistics */ -extern u_int32_t ip6_id; /* fragment identifier */ +extern u_int32_t ip6_id; /* fragment identifier */ extern int ip6_defhlim; /* default hop limit */ extern int ip6_defmcasthlim; /* default multicast hop limit */ extern int ip6_forwarding; /* act as router? */ @@ -293,7 +372,8 @@ extern int ip6_gif_hlim; /* Hop limit for gif encap packet */ extern int ip6_use_deprecated; /* allow deprecated addr as source */ extern int ip6_rr_prune; /* router renumbering prefix * walk list every 5 sec. */ -#define ip6_mapped_addr_on (!ip6_v6only) +extern int ip6_mcast_pmtu; /* enable pMTU discovery for multicast? 
*/ +#define ip6_mapped_addr_on (!ip6_v6only) extern int ip6_v6only; extern int ip6_neighborgcthresh; /* Threshold # of NDP entries for GC */ @@ -304,8 +384,8 @@ extern int ip6_maxdynroutes; /* Max # of routes created via redirect */ extern struct socket *ip6_mrouter; /* multicast routing daemon */ #endif extern int ip6_sendredirects; /* send IP redirects when forwarding? */ -extern int ip6_maxfragpackets; /* Maximum packets in reassembly queue */ -extern int ip6_maxfrags; /* Maximum fragments in reassembly queue */ +extern int ip6_maxfragpackets; /* Maximum packets in reassembly queue */ +extern int ip6_maxfrags; /* Maximum fragments in reassembly queue */ extern int ip6_sourcecheck; /* Verify source interface */ extern int ip6_sourcecheck_interval; /* Interval between log messages */ extern int ip6_accept_rtadv; /* Acts as a host not a router */ @@ -314,6 +394,7 @@ extern int ip6_log_interval; extern time_t ip6_log_time; extern int ip6_hdrnestlimit; /* upper limit of # of extension headers */ extern int ip6_dad_count; /* DupAddrDetectionTransmits */ +extern int ip6_only_allow_rfc4193_prefix; /* RFC4193 Unique Local Unicast Prefixes only */ extern u_int32_t ip6_flow_seq; extern int ip6_auto_flowlabel; @@ -325,9 +406,16 @@ extern int ip6_lowportmin; /* minimum reserved port */ extern int ip6_lowportmax; /* maximum reserved port */ extern int ip6_use_tempaddr; /* whether to use temporary addresses. 
*/ +extern int ip6_prefer_tempaddr; /* whether to prefer temporary addresses + in the source address selection */ +extern int ip6_use_defzone; /* whether to use the default scope zone + when unspecified */ extern struct pr_usrreqs rip6_usrreqs; extern struct pr_usrreqs icmp6_dgram_usrreqs; + +extern int ip6_doscopedroute; + struct sockopt; struct inpcb; @@ -340,51 +428,71 @@ int icmp6_dgram_attach(struct socket *, int , struct proc *); struct in6_ifaddr; void ip6_init(void); +void ip6_fin(void); void ip6_input(struct mbuf *); struct in6_ifaddr *ip6_getdstifaddr(struct mbuf *); void ip6_freepcbopts(struct ip6_pktopts *); -void ip6_freemoptions(struct ip6_moptions *); -int ip6_unknown_opt(u_int8_t *, struct mbuf *, int, int); +int ip6_unknown_opt(u_int8_t *, struct mbuf *, int); char * ip6_get_prevhdr(struct mbuf *, int); int ip6_nexthdr(struct mbuf *, int, int, int *); int ip6_lasthdr(struct mbuf *, int, int, int *); +extern void ip6_moptions_init(void); +extern struct ip6_moptions *ip6_allocmoptions(int); +extern void im6o_addref(struct ip6_moptions *, int); +extern void im6o_remref(struct ip6_moptions *); + struct ip6aux *ip6_addaux(struct mbuf *); struct ip6aux *ip6_findaux(struct mbuf *); void ip6_delaux(struct mbuf *); +extern void ip6_destroyaux(struct ip6aux *); +extern void ip6_copyaux(struct ip6aux *, struct ip6aux *); int ip6_mforward(struct ip6_hdr *, struct ifnet *, struct mbuf *); int ip6_process_hopopts(struct mbuf *, u_int8_t *, int, u_int32_t *, u_int32_t *); -void ip6_savecontrol(struct inpcb *, struct mbuf **, struct ip6_hdr *, - struct mbuf *); -void ip6_forward(struct mbuf *, struct route_in6 *, int, int); - +struct mbuf **ip6_savecontrol_v4(struct inpcb *, struct mbuf *, + struct mbuf **, int *); +int ip6_savecontrol(struct inpcb *, struct mbuf *, struct mbuf **); +void ip6_forward(struct mbuf *, struct route_in6 *, int); +void ip6_notify_pmtu __P((struct inpcb *, struct sockaddr_in6 *, + u_int32_t *)); void ip6_mloopback(struct ifnet *, struct 
mbuf *, struct sockaddr_in6 *); -int ip6_output(struct mbuf *, struct ip6_pktopts *, - struct route_in6 *, - int, - struct ip6_moptions *, struct ifnet **, int locked); +int ip6_output(struct mbuf *, struct ip6_pktopts *, struct route_in6 *, + int, struct ip6_moptions *, struct ifnet **, + struct ip6_out_args *); int ip6_ctloutput(struct socket *, struct sockopt *sopt); void ip6_initpktopts(struct ip6_pktopts *); int ip6_setpktoptions(struct mbuf *, struct ip6_pktopts *, int, int); -void ip6_clearpktopts(struct ip6_pktopts *, int, int); +void ip6_clearpktopts(struct ip6_pktopts *, int); struct ip6_pktopts *ip6_copypktopts(struct ip6_pktopts *, int); int ip6_optlen(struct inpcb *); -int route6_input(struct mbuf **, int *); +int route6_input(struct mbuf **, int *, int); void frag6_init(void); -int frag6_input(struct mbuf **, int *); +int frag6_input(struct mbuf **, int *, int); void frag6_slowtimo(void); void frag6_drain(void); -int rip6_input(struct mbuf **mp, int *offset); +int rip6_input(struct mbuf **, int *, int); void rip6_ctlinput(int, struct sockaddr *, void *); int rip6_ctloutput(struct socket *so, struct sockopt *sopt); -int rip6_output(struct mbuf *, struct socket *, struct sockaddr_in6 *, struct mbuf *); +int rip6_output(struct mbuf *, struct socket *, struct sockaddr_in6 *, struct mbuf *, int); + +int dest6_input(struct mbuf **, int *, int); +extern struct in6_addr *in6_selectsrc(struct sockaddr_in6 *, + struct ip6_pktopts *, struct inpcb *, struct route_in6 *, + struct ifnet **, struct in6_addr *, unsigned int, int *); +extern struct in6_addrpolicy * + in6_addrsel_lookup_policy(struct sockaddr_in6 *); +int in6_selectroute(struct sockaddr_in6 *, struct sockaddr_in6 *, + struct ip6_pktopts *, struct ip6_moptions *, struct route_in6 *, + struct ifnet **, struct rtentry **, int, unsigned int, unsigned int); +int ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt, struct ip6_pktopts *stickyopt, int uproto); +u_int32_t ip6_randomid(void); +u_int32_t 
ip6_randomflowlabel(void); -int dest6_input(struct mbuf **, int *); #endif /* KERNEL */ #endif /* KERNEL_PRIVATE */ diff --git a/bsd/netinet6/ip6protosw.h b/bsd/netinet6/ip6protosw.h index 303f61964..dbadffc81 100644 --- a/bsd/netinet6/ip6protosw.h +++ b/bsd/netinet6/ip6protosw.h @@ -1,6 +1,6 @@ /* $FreeBSD: src/sys/netinet6/ip6protosw.h,v 1.2.2.3 2001/07/03 11:01:54 ume Exp $ */ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -122,7 +122,7 @@ struct pr_usrreqs; * argument type for the last arg of pr_ctlinput(). * should be consulted only with AF_INET6 family. * - * IPv6 ICMP IPv6 [exthdrs] finalhdr paylaod + * IPv6 ICMP IPv6 [exthdrs] finalhdr payload * ^ ^ ^ ^ * | | ip6c_ip6 ip6c_off * | ip6c_icmp6 @@ -157,7 +157,7 @@ struct ip6protosw { short pr_protocol; /* protocol number */ unsigned int pr_flags; /* see below */ /* protocol-protocol hooks */ - int (*pr_input)(struct mbuf **, int *); + int (*pr_input)(struct mbuf **, int *, int); /* input to protocol (from below) */ int (*pr_output)(struct mbuf *m, struct socket *so, struct sockaddr_in6 *, struct mbuf *); @@ -173,8 +173,12 @@ struct ip6protosw { /* utility hooks */ void (*pr_init)(void); /* initialization hook */ +#if __APPLE__ + void (*pr_unused)(void); /* placeholder - fasttimo is removed */ +#else void (*pr_fasttimo)(void); /* fast timeout (200ms) */ +#endif void (*pr_slowtimo)(void); /* slow timeout (500ms) */ void (*pr_drain)(void); diff --git a/bsd/netinet6/ipcomp6.h b/bsd/netinet6/ipcomp6.h index 8fd6fdba9..2bd7b6678 100644 --- a/bsd/netinet6/ipcomp6.h +++ b/bsd/netinet6/ipcomp6.h @@ -40,7 +40,7 @@ #include #ifdef KERNEL_PRIVATE -extern int ipcomp6_input(struct mbuf **, int *); +extern int ipcomp6_input(struct mbuf **, int *, int); extern int ipcomp6_output(struct mbuf *, u_char *, struct mbuf *, struct secasvar *); #endif /* KERNEL_PRIVATE */ diff --git a/bsd/netinet6/ipcomp_input.c 
b/bsd/netinet6/ipcomp_input.c index c9473dd8b..3c6a9a43d 100644 --- a/bsd/netinet6/ipcomp_input.c +++ b/bsd/netinet6/ipcomp_input.c @@ -78,6 +78,7 @@ #include #include +#include #define IPLEN_FLIPPED @@ -214,6 +215,11 @@ ipcomp4_input(struct mbuf *m, int off) IPSEC_STAT_INCREMENT(ipsecstat.in_polvio); goto fail; } + + DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL, + struct ip *, ip, struct ifnet *, m->m_pkthdr.rcvif, + struct ip *, ip, struct ip6_hdr *, NULL); + ip_proto_dispatch_in(m, off, nxt, 0); } else m_freem(m); @@ -233,10 +239,9 @@ ipcomp4_input(struct mbuf *m, int off) #if INET6 int -ipcomp6_input(mp, offp) - struct mbuf **mp; - int *offp; +ipcomp6_input(struct mbuf **mp, int *offp, int proto) { +#pragma unused(proto) struct mbuf *m, *md; int off; struct ip6_hdr *ip6; diff --git a/bsd/netinet6/ipsec.c b/bsd/netinet6/ipsec.c index 6a7da3d2b..91fd6db6d 100644 --- a/bsd/netinet6/ipsec.c +++ b/bsd/netinet6/ipsec.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -143,7 +143,6 @@ int ipsec_debug = 0; #define DBG_FNC_IPSEC_OUT NETDBG_CODE(DBG_NETIPSEC, (3 << 8)) extern lck_mtx_t *sadb_mutex; -extern lck_mtx_t *ip6_mutex; struct ipsecstat ipsecstat; int ip4_ah_cleartos = 1; @@ -169,33 +168,33 @@ SYSCTL_DECL(_net_inet6_ipsec6); #endif /* net.inet.ipsec */ SYSCTL_STRUCT(_net_inet_ipsec, IPSECCTL_STATS, - stats, CTLFLAG_RD, &ipsecstat, ipsecstat, ""); -SYSCTL_PROC(_net_inet_ipsec, IPSECCTL_DEF_POLICY, def_policy, CTLTYPE_INT|CTLFLAG_RW, + stats, CTLFLAG_RD | CTLFLAG_LOCKED, &ipsecstat, ipsecstat, ""); +SYSCTL_PROC(_net_inet_ipsec, IPSECCTL_DEF_POLICY, def_policy, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED, &ip4_def_policy.policy, 0, &sysctl_def_policy, "I", ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_ESP_TRANSLEV, esp_trans_deflev, - CTLFLAG_RW, &ip4_esp_trans_deflev, 0, ""); + CTLFLAG_RW | CTLFLAG_LOCKED, &ip4_esp_trans_deflev, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_ESP_NETLEV, esp_net_deflev, - CTLFLAG_RW, &ip4_esp_net_deflev, 0, ""); + CTLFLAG_RW | CTLFLAG_LOCKED, &ip4_esp_net_deflev, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_AH_TRANSLEV, ah_trans_deflev, - CTLFLAG_RW, &ip4_ah_trans_deflev, 0, ""); + CTLFLAG_RW | CTLFLAG_LOCKED, &ip4_ah_trans_deflev, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_AH_NETLEV, ah_net_deflev, - CTLFLAG_RW, &ip4_ah_net_deflev, 0, ""); + CTLFLAG_RW | CTLFLAG_LOCKED, &ip4_ah_net_deflev, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_AH_CLEARTOS, - ah_cleartos, CTLFLAG_RW, &ip4_ah_cleartos, 0, ""); + ah_cleartos, CTLFLAG_RW | CTLFLAG_LOCKED, &ip4_ah_cleartos, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_AH_OFFSETMASK, - ah_offsetmask, CTLFLAG_RW, &ip4_ah_offsetmask, 0, ""); + ah_offsetmask, CTLFLAG_RW | CTLFLAG_LOCKED, &ip4_ah_offsetmask, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DFBIT, - dfbit, CTLFLAG_RW, &ip4_ipsec_dfbit, 0, ""); + dfbit, CTLFLAG_RW | CTLFLAG_LOCKED, &ip4_ipsec_dfbit, 0, ""); SYSCTL_INT(_net_inet_ipsec, 
IPSECCTL_ECN, - ecn, CTLFLAG_RW, &ip4_ipsec_ecn, 0, ""); + ecn, CTLFLAG_RW | CTLFLAG_LOCKED, &ip4_ipsec_ecn, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEBUG, - debug, CTLFLAG_RW, &ipsec_debug, 0, ""); + debug, CTLFLAG_RW | CTLFLAG_LOCKED, &ipsec_debug, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_ESP_RANDPAD, - esp_randpad, CTLFLAG_RW, &ip4_esp_randpad, 0, ""); + esp_randpad, CTLFLAG_RW | CTLFLAG_LOCKED, &ip4_esp_randpad, 0, ""); /* for performance, we bypass ipsec until a security policy is set */ int ipsec_bypass = 1; -SYSCTL_INT(_net_inet_ipsec, OID_AUTO, bypass, CTLFLAG_RD, &ipsec_bypass,0, ""); +SYSCTL_INT(_net_inet_ipsec, OID_AUTO, bypass, CTLFLAG_RD | CTLFLAG_LOCKED, &ipsec_bypass,0, ""); /* * NAT Traversal requires a UDP port for encapsulation, @@ -204,7 +203,7 @@ SYSCTL_INT(_net_inet_ipsec, OID_AUTO, bypass, CTLFLAG_RD, &ipsec_bypass,0, ""); * for nat traversal. */ SYSCTL_INT(_net_inet_ipsec, OID_AUTO, esp_port, - CTLFLAG_RW, &esp_udp_encap_port, 0, ""); + CTLFLAG_RW | CTLFLAG_LOCKED, &esp_udp_encap_port, 0, ""); #if INET6 struct ipsecstat ipsec6stat; @@ -218,23 +217,23 @@ int ip6_esp_randpad = -1; /* net.inet6.ipsec6 */ SYSCTL_STRUCT(_net_inet6_ipsec6, IPSECCTL_STATS, - stats, CTLFLAG_RD, &ipsec6stat, ipsecstat, ""); + stats, CTLFLAG_RD | CTLFLAG_LOCKED, &ipsec6stat, ipsecstat, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_POLICY, - def_policy, CTLFLAG_RW, &ip6_def_policy.policy, 0, ""); + def_policy, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_def_policy.policy, 0, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_ESP_TRANSLEV, esp_trans_deflev, - CTLFLAG_RW, &ip6_esp_trans_deflev, 0, ""); + CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_esp_trans_deflev, 0, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_ESP_NETLEV, esp_net_deflev, - CTLFLAG_RW, &ip6_esp_net_deflev, 0, ""); + CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_esp_net_deflev, 0, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_AH_TRANSLEV, ah_trans_deflev, - CTLFLAG_RW, &ip6_ah_trans_deflev, 0, ""); + CTLFLAG_RW | 
CTLFLAG_LOCKED, &ip6_ah_trans_deflev, 0, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_AH_NETLEV, ah_net_deflev, - CTLFLAG_RW, &ip6_ah_net_deflev, 0, ""); + CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_ah_net_deflev, 0, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_ECN, - ecn, CTLFLAG_RW, &ip6_ipsec_ecn, 0, ""); + ecn, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_ipsec_ecn, 0, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEBUG, - debug, CTLFLAG_RW, &ipsec_debug, 0, ""); + debug, CTLFLAG_RW | CTLFLAG_LOCKED, &ipsec_debug, 0, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_ESP_RANDPAD, - esp_randpad, CTLFLAG_RW, &ip6_esp_randpad, 0, ""); + esp_randpad, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_esp_randpad, 0, ""); #endif /* INET6 */ static int ipsec_setspidx_mbuf(struct secpolicyindex *, u_int, u_int, @@ -1717,7 +1716,7 @@ ipsec_get_reqlevel(isr) ? (ipsec_debug \ ? log(LOG_INFO, "fixed system default level " #lev ":%d->%d\n",\ (lev), IPSEC_LEVEL_REQUIRE) \ - : 0), \ + : (void)0), \ (lev) = IPSEC_LEVEL_REQUIRE, \ (lev) \ : (lev)) @@ -2961,13 +2960,19 @@ ipsec4_output( } ip = mtod(state->m, struct ip *); + // grab sadb_mutex, before updating sah's route cache + lck_mtx_lock(sadb_mutex); state->ro = &sav->sah->sa_route; state->dst = (struct sockaddr *)&state->ro->ro_dst; dst4 = (struct sockaddr_in *)state->dst; + if (state->ro->ro_rt != NULL) { + RT_LOCK(state->ro->ro_rt); + } if (state->ro->ro_rt != NULL && (state->ro->ro_rt->generation_id != route_generation || !(state->ro->ro_rt->rt_flags & RTF_UP) || dst4->sin_addr.s_addr != ip->ip_dst.s_addr)) { + RT_UNLOCK(state->ro->ro_rt); rtfree(state->ro->ro_rt); state->ro->ro_rt = NULL; } @@ -2976,11 +2981,14 @@ ipsec4_output( dst4->sin_len = sizeof(*dst4); dst4->sin_addr = ip->ip_dst; rtalloc(state->ro); - } - if (state->ro->ro_rt == 0) { - OSAddAtomic(1, &ipstat.ips_noroute); - error = EHOSTUNREACH; - goto bad; + if (state->ro->ro_rt == 0) { + OSAddAtomic(1, &ipstat.ips_noroute); + error = EHOSTUNREACH; + // release sadb_mutex, after updating sah's route 
cache + lck_mtx_unlock(sadb_mutex); + goto bad; + } + RT_LOCK(state->ro->ro_rt); } /* @@ -2996,6 +3004,9 @@ ipsec4_output( state->dst = (struct sockaddr *)state->ro->ro_rt->rt_gateway; dst4 = (struct sockaddr_in *)state->dst; } + RT_UNLOCK(state->ro->ro_rt); + // release sadb_mutex, after updating sah's route cache + lck_mtx_unlock(sadb_mutex); } state->m = ipsec4_splithdr(state->m); @@ -3384,7 +3395,8 @@ ipsec6_output_tunnel( struct ip *ip; struct sockaddr_in* dst4; struct route *ro4 = NULL; - struct ip_out_args ipoa = { IFSCOPE_NONE }; + struct route ro4_copy; + struct ip_out_args ipoa = { IFSCOPE_NONE, 0 }; /* * must be last isr because encapsulated IPv6 packet @@ -3406,12 +3418,18 @@ ipsec6_output_tunnel( /* Now we have an IPv4 packet */ ip = mtod(state->m, struct ip *); + // grab sadb_mutex, to update sah's route cache and get a local copy of it + lck_mtx_lock(sadb_mutex); ro4 = &sav->sah->sa_route; dst4 = (struct sockaddr_in *)&ro4->ro_dst; + if (ro4->ro_rt) { + RT_LOCK(ro4->ro_rt); + } if (ro4->ro_rt != NULL && (ro4->ro_rt->generation_id != route_generation || !(ro4->ro_rt->rt_flags & RTF_UP) || dst4->sin_addr.s_addr != ip->ip_dst.s_addr)) { + RT_UNLOCK(ro4->ro_rt); rtfree(ro4->ro_rt); ro4->ro_rt = NULL; } @@ -3419,10 +3437,18 @@ ipsec6_output_tunnel( dst4->sin_family = AF_INET; dst4->sin_len = sizeof(*dst4); dst4->sin_addr = ip->ip_dst; + } else { + RT_UNLOCK(ro4->ro_rt); } + route_copyout(&ro4_copy, ro4, sizeof(ro4_copy)); + // release sadb_mutex, after updating sah's route cache and getting a local copy + lck_mtx_unlock(sadb_mutex); state->m = ipsec4_splithdr(state->m); if (!state->m) { error = ENOMEM; + if (ro4_copy.ro_rt != NULL) { + rtfree(ro4_copy.ro_rt); + } goto bad; } switch (isr->saidx.proto) { @@ -3430,6 +3456,9 @@ ipsec6_output_tunnel( #if IPSEC_ESP if ((error = esp4_output(state->m, sav)) != 0) { state->m = NULL; + if (ro4_copy.ro_rt != NULL) { + rtfree(ro4_copy.ro_rt); + } goto bad; } break; @@ -3438,17 +3467,26 @@ ipsec6_output_tunnel( 
m_freem(state->m); state->m = NULL; error = EINVAL; + if (ro4_copy.ro_rt != NULL) { + rtfree(ro4_copy.ro_rt); + } goto bad; #endif case IPPROTO_AH: if ((error = ah4_output(state->m, sav)) != 0) { state->m = NULL; + if (ro4_copy.ro_rt != NULL) { + rtfree(ro4_copy.ro_rt); + } goto bad; } break; case IPPROTO_IPCOMP: if ((error = ipcomp4_output(state->m, sav)) != 0) { state->m = NULL; + if (ro4_copy.ro_rt != NULL) { + rtfree(ro4_copy.ro_rt); + } goto bad; } break; @@ -3459,17 +3497,27 @@ ipsec6_output_tunnel( m_freem(state->m); state->m = NULL; error = EINVAL; + if (ro4_copy.ro_rt != NULL) { + rtfree(ro4_copy.ro_rt); + } goto bad; } if (state->m == 0) { error = ENOMEM; + if (ro4_copy.ro_rt != NULL) { + rtfree(ro4_copy.ro_rt); + } goto bad; } ip = mtod(state->m, struct ip *); ip->ip_len = ntohs(ip->ip_len); /* flip len field before calling ip_output */ - error = ip_output(state->m, NULL, ro4, IP_OUTARGS, NULL, &ipoa); + error = ip_output(state->m, NULL, &ro4_copy, IP_OUTARGS, NULL, &ipoa); state->m = NULL; + // grab sadb_mutex, to synchronize the sah's route cache with the local copy + lck_mtx_lock(sadb_mutex); + route_copyin(&ro4_copy, ro4, sizeof(ro4_copy)); + lck_mtx_unlock(sadb_mutex); if (error != 0) goto bad; goto done; @@ -3481,14 +3529,20 @@ ipsec6_output_tunnel( error = EAFNOSUPPORT; goto bad; } - + + // grab sadb_mutex, before updating sah's route cache + lck_mtx_lock(sadb_mutex); state->ro = &sav->sah->sa_route; state->dst = (struct sockaddr *)&state->ro->ro_dst; dst6 = (struct sockaddr_in6 *)state->dst; + if (state->ro->ro_rt) { + RT_LOCK(state->ro->ro_rt); + } if (state->ro->ro_rt != NULL && (state->ro->ro_rt->generation_id != route_generation || !(state->ro->ro_rt->rt_flags & RTF_UP) || !IN6_ARE_ADDR_EQUAL(&dst6->sin6_addr, &ip6->ip6_dst))) { + RT_UNLOCK(state->ro->ro_rt); rtfree(state->ro->ro_rt); state->ro->ro_rt = NULL; } @@ -3498,11 +3552,16 @@ ipsec6_output_tunnel( dst6->sin6_len = sizeof(*dst6); dst6->sin6_addr = ip6->ip6_dst; rtalloc(state->ro); + 
if (state->ro->ro_rt) { + RT_LOCK(state->ro->ro_rt); + } } if (state->ro->ro_rt == 0) { ip6stat.ip6s_noroute++; IPSEC_STAT_INCREMENT(ipsec6stat.out_noroute); error = EHOSTUNREACH; + // release sadb_mutex, after updating sah's route cache + lck_mtx_unlock(sadb_mutex); goto bad; } @@ -3519,6 +3578,9 @@ ipsec6_output_tunnel( state->dst = (struct sockaddr *)state->ro->ro_rt->rt_gateway; dst6 = (struct sockaddr_in6 *)state->dst; } + RT_UNLOCK(state->ro->ro_rt); + // release sadb_mutex, after updating sah's route cache + lck_mtx_unlock(sadb_mutex); } state->m = ipsec6_splithdr(state->m); @@ -3982,8 +4044,8 @@ ipsec_addaux( struct ipsec_tag *itag; /* Allocate a tag */ - tag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPSEC, - IPSEC_TAG_SIZE, M_DONTWAIT); + tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPSEC, + IPSEC_TAG_SIZE, M_DONTWAIT, m); if (tag) { itag = (struct ipsec_tag*)(tag + 1); @@ -4128,7 +4190,8 @@ ipsec_send_natt_keepalive( struct udphdr *uh; struct ip *ip; int error; - struct ip_out_args ipoa = { IFSCOPE_NONE }; + struct ip_out_args ipoa = { IFSCOPE_NONE, 0 }; + struct route ro; lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_NOTOWNED); @@ -4168,8 +4231,23 @@ ipsec_send_natt_keepalive( uh->uh_ulen = htons(1 + sizeof(struct udphdr)); uh->uh_sum = 0; *(u_int8_t*)((char*)m_mtod(m) + sizeof(struct ip) + sizeof(struct udphdr)) = 0xFF; - - error = ip_output(m, NULL, &sav->sah->sa_route, IP_OUTARGS | IP_NOIPSEC, NULL, &ipoa); + + // grab sadb_mutex, to get a local copy of sah's route cache + lck_mtx_lock(sadb_mutex); + if (sav->sah->sa_route.ro_rt != NULL && + rt_key(sav->sah->sa_route.ro_rt)->sa_family != AF_INET) { + rtfree(sav->sah->sa_route.ro_rt); + sav->sah->sa_route.ro_rt = NULL; + } + route_copyout(&ro, &sav->sah->sa_route, sizeof(ro)); + lck_mtx_unlock(sadb_mutex); + + error = ip_output(m, NULL, &ro, IP_OUTARGS | IP_NOIPSEC, NULL, &ipoa); + + // grab sadb_mutex, to synchronize the sah's route cache with the local copy + 
lck_mtx_lock(sadb_mutex); + route_copyin(&ro, &sav->sah->sa_route, sizeof(ro)); + lck_mtx_unlock(sadb_mutex); if (error == 0) { sav->natt_last_activity = natt_now; return TRUE; diff --git a/bsd/netinet6/mld6.c b/bsd/netinet6/mld6.c index 7e9a882e8..90bcb94b4 100644 --- a/bsd/netinet6/mld6.c +++ b/bsd/netinet6/mld6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,13 +25,8 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ - -/* $FreeBSD: src/sys/netinet6/mld6.c,v 1.4.2.2 2001/07/03 11:01:54 ume Exp $ */ -/* $KAME: mld6.c,v 1.27 2001/04/04 05:17:30 itojun Exp $ */ - -/* - * Copyright (C) 1998 WIDE Project. - * All rights reserved. +/*- + * Copyright (c) 2009 Bruce Simpson. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -41,14 +36,14 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the project nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. * - * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) @@ -103,426 +98,3312 @@ * Version 2.0. */ +#include + #include #include #include #include #include -#include +#include +#include +#include +#include + +#include #include +#include #include #include +#include #include #include +#include #include +#include #include -#include +/* Lock group and attribute for mld6_mtx */ +static lck_attr_t *mld_mtx_attr; +static lck_grp_t *mld_mtx_grp; +static lck_grp_attr_t *mld_mtx_grp_attr; + +/* + * Locking and reference counting: + * + * mld_mtx mainly protects mli_head. In cases where both mld_mtx and + * in6_multihead_lock must be held, the former must be acquired first in order + * to maintain lock ordering. It is not a requirement that mld_mtx be + * acquired first before in6_multihead_lock, but in case both must be acquired + * in succession, the correct lock ordering must be followed. + * + * Instead of walking the if_multiaddrs list at the interface and returning + * the ifma_protospec value of a matching entry, we search the global list + * of in6_multi records and find it that way; this is done with in6_multihead + * lock held. Doing so avoids the race condition issues that many other BSDs + * suffer from (therefore in our implementation, ifma_protospec will never be + * NULL for as long as the in6_multi is valid.) + * + * The above creates a requirement for the in6_multi to stay in in6_multihead + * list even after the final MLD leave (in MLDv2 mode) until no longer needs + * be retransmitted (this is not required for MLDv1.) In order to handle + * this, the request and reference counts of the in6_multi are bumped up when + * the state changes to MLD_LEAVING_MEMBER, and later dropped in the timeout + * handler. 
Each in6_multi holds a reference to the underlying mld_ifinfo. + * + * Thus, the permitted lock order is: + * + * mld_mtx, in6_multihead_lock, in6m_lock, mli_lock + * + * Any may be taken independently, but if any are held at the same time, + * the above lock order must be followed. + */ +static decl_lck_mtx_data(, mld_mtx); + +static void mli_initvar(struct mld_ifinfo *, struct ifnet *, int); +static struct mld_ifinfo *mli_alloc(int); +static void mli_free(struct mld_ifinfo *); +static void mli_delete(const struct ifnet *); +static void mld_dispatch_packet(struct mbuf *); +static void mld_final_leave(struct in6_multi *, struct mld_ifinfo *); +static int mld_handle_state_change(struct in6_multi *, + struct mld_ifinfo *); +static int mld_initial_join(struct in6_multi *, struct mld_ifinfo *, + const int); +#ifdef MLD_DEBUG +static const char * mld_rec_type_to_str(const int); +#endif +static void mld_set_version(struct mld_ifinfo *, const int); +static void mld_flush_relq(struct mld_ifinfo *); +static void mld_dispatch_queue(struct mld_ifinfo *, struct ifqueue *, int); +static int mld_v1_input_query(struct ifnet *, const struct ip6_hdr *, + /*const*/ struct mld_hdr *); +static int mld_v1_input_report(struct ifnet *, const struct ip6_hdr *, + /*const*/ struct mld_hdr *); +static void mld_v1_process_group_timer(struct in6_multi *, const int); +static void mld_v1_process_querier_timers(struct mld_ifinfo *); +static int mld_v1_transmit_report(struct in6_multi *, const int); +static void mld_v1_update_group(struct in6_multi *, const int); +static void mld_v2_cancel_link_timers(struct mld_ifinfo *); +static void mld_v2_dispatch_general_query(struct mld_ifinfo *); +static struct mbuf * + mld_v2_encap_report(struct ifnet *, struct mbuf *); +static int mld_v2_enqueue_filter_change(struct ifqueue *, + struct in6_multi *); +static int mld_v2_enqueue_group_record(struct ifqueue *, + struct in6_multi *, const int, const int, const int, + const int); +static int 
mld_v2_input_query(struct ifnet *, const struct ip6_hdr *, + struct mbuf *, const int, const int); +static int mld_v2_merge_state_changes(struct in6_multi *, + struct ifqueue *); +static void mld_v2_process_group_timers(struct mld_ifinfo *, + struct ifqueue *, struct ifqueue *, + struct in6_multi *, const int); +static int mld_v2_process_group_query(struct in6_multi *, + int, struct mbuf *, const int); +static int sysctl_mld_gsr SYSCTL_HANDLER_ARGS; +static int sysctl_mld_ifinfo SYSCTL_HANDLER_ARGS; + +/* + * Normative references: RFC 2710, RFC 3590, RFC 3810. + * + * XXX LOR PREVENTION + * A special case for IPv6 is the in6_setscope() routine. ip6_output() + * will not accept an ifp; it wants an embedded scope ID, unlike + * ip_output(), which happily takes the ifp given to it. The embedded + * scope ID is only used by MLD to select the outgoing interface. + * + * As such, we exploit the fact that the scope ID is just the interface + * index, and embed it in the IPv6 destination address accordingly. + * This is potentially NOT VALID for MLDv1 reports, as they + * are always sent to the multicast group itself; as MLDv2 + * reports are always sent to ff02::16, this is not an issue + * when MLDv2 is in use. 
+ */ + +#define MLD_EMBEDSCOPE(pin6, zoneid) \ + (pin6)->s6_addr16[1] = htons((zoneid) & 0xFFFF) + +static struct timeval mld_gsrdelay = {10, 0}; +static LIST_HEAD(, mld_ifinfo) mli_head; + +static int interface_timers_running6; +static int state_change_timers_running6; +static int current_state_timers_running6; + +static decl_lck_mtx_data(, mld6_mtx); + +#define MLD_LOCK() \ + lck_mtx_lock(&mld6_mtx) +#define MLD_LOCK_ASSERT_HELD() \ + lck_mtx_assert(&mld6_mtx, LCK_MTX_ASSERT_OWNED) +#define MLD_LOCK_ASSERT_NOTHELD() \ + lck_mtx_assert(&mld6_mtx, LCK_MTX_ASSERT_NOTOWNED) +#define MLD_UNLOCK() \ + lck_mtx_unlock(&mld6_mtx) + +#define MLI_ZONE_MAX 64 /* maximum elements in zone */ +#define MLI_ZONE_NAME "mld_ifinfo" /* zone name */ + +static unsigned int mli_size; /* size of zone element */ +static struct zone *mli_zone; /* zone for mld_ifinfo */ + +SYSCTL_DECL(_net_inet6); /* Note: Not in any common header. */ + +SYSCTL_NODE(_net_inet6, OID_AUTO, mld, CTLFLAG_RW | CTLFLAG_LOCKED, 0, + "IPv6 Multicast Listener Discovery"); +SYSCTL_PROC(_net_inet6_mld, OID_AUTO, gsrdelay, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + &mld_gsrdelay.tv_sec, 0, sysctl_mld_gsr, "I", + "Rate limit for MLDv2 Group-and-Source queries in seconds"); + +SYSCTL_NODE(_net_inet6_mld, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_LOCKED, + sysctl_mld_ifinfo, "Per-interface MLDv2 state"); + +static int mld_v1enable = 1; +SYSCTL_INT(_net_inet6_mld, OID_AUTO, v1enable, CTLFLAG_RW | CTLFLAG_LOCKED, + &mld_v1enable, 0, "Enable fallback to MLDv1"); + +static int mld_use_allow = 1; +SYSCTL_INT(_net_inet6_mld, OID_AUTO, use_allow, CTLFLAG_RW | CTLFLAG_LOCKED, + &mld_use_allow, 0, "Use ALLOW/BLOCK for RFC 4604 SSM joins/leaves"); + +#ifdef MLD_DEBUG +int mld_debug = 0; +SYSCTL_INT(_net_inet6_mld, OID_AUTO, + debug, CTLFLAG_RW | CTLFLAG_LOCKED, &mld_debug, 0, ""); +#endif +/* + * Packed Router Alert option structure declaration. 
+ */ +struct mld_raopt { + struct ip6_hbh hbh; + struct ip6_opt pad; + struct ip6_opt_router ra; +} __packed; + +/* + * Router Alert hop-by-hop option header. + */ +static struct mld_raopt mld_ra = { + .hbh = { 0, 0 }, + .pad = { .ip6o_type = IP6OPT_PADN, 0 }, + .ra = { + .ip6or_type = (u_int8_t)IP6OPT_ROUTER_ALERT, + .ip6or_len = (u_int8_t)(IP6OPT_RTALERT_LEN - 2), + .ip6or_value = {((IP6OPT_RTALERT_MLD >> 8) & 0xFF), + (IP6OPT_RTALERT_MLD & 0xFF) } + } +}; +static struct ip6_pktopts mld_po; + +/* + * Retrieve or set threshold between group-source queries in seconds. + */ +static int +sysctl_mld_gsr SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int error; + int i; + + MLD_LOCK(); + + i = mld_gsrdelay.tv_sec; + + error = sysctl_handle_int(oidp, &i, 0, req); + if (error || !req->newptr) + goto out_locked; + + if (i < -1 || i >= 60) { + error = EINVAL; + goto out_locked; + } + + mld_gsrdelay.tv_sec = i; + +out_locked: + MLD_UNLOCK(); + return (error); +} +/* + * Expose struct mld_ifinfo to userland, keyed by ifindex. + * For use by ifmcstat(8). 
+ * + */ +static int +sysctl_mld_ifinfo SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp) + int *name; + int error; + u_int namelen; + struct ifnet *ifp; + struct mld_ifinfo *mli; + struct mld_ifinfo_u mli_u; + + name = (int *)arg1; + namelen = arg2; + + if (req->newptr != USER_ADDR_NULL) + return (EPERM); + + if (namelen != 1) + return (EINVAL); + + MLD_LOCK(); + + if (name[0] <= 0 || name[0] > (u_int)if_index) { + error = ENOENT; + goto out_locked; + } + + error = ENOENT; + + ifnet_head_lock_shared(); + ifp = ifindex2ifnet[name[0]]; + ifnet_head_done(); + if (ifp == NULL) + goto out_locked; + + bzero(&mli_u, sizeof (mli_u)); + + LIST_FOREACH(mli, &mli_head, mli_link) { + MLI_LOCK(mli); + if (ifp != mli->mli_ifp) { + MLI_UNLOCK(mli); + continue; + } + + mli_u.mli_ifindex = mli->mli_ifp->if_index; + mli_u.mli_version = mli->mli_version; + mli_u.mli_v1_timer = mli->mli_v1_timer; + mli_u.mli_v2_timer = mli->mli_v2_timer; + mli_u.mli_flags = mli->mli_flags; + mli_u.mli_rv = mli->mli_rv; + mli_u.mli_qi = mli->mli_qi; + mli_u.mli_qri = mli->mli_qri; + mli_u.mli_uri = mli->mli_uri; + MLI_UNLOCK(mli); + + error = SYSCTL_OUT(req, &mli_u, sizeof (mli_u)); + break; + } + +out_locked: + MLD_UNLOCK(); + return (error); +} + +/* + * Dispatch an entire queue of pending packet chains. + * + * Must not be called with in6m_lock held. + */ +static void +mld_dispatch_queue(struct mld_ifinfo *mli, struct ifqueue *ifq, int limit) +{ + struct mbuf *m; + + if (mli != NULL) + MLI_LOCK_ASSERT_HELD(mli); + + for (;;) { + IF_DEQUEUE(ifq, m); + if (m == NULL) + break; + MLD_PRINTF(("%s: dispatch %p from %p\n", __func__, ifq, m)); + if (mli != NULL) + MLI_UNLOCK(mli); + mld_dispatch_packet(m); + if (mli != NULL) + MLI_LOCK(mli); + if (--limit == 0) + break; + } -#if CONFIG_MACF_NET -#include -#endif /* MAC_NET */ + if (mli != NULL) + MLI_LOCK_ASSERT_HELD(mli); +} /* - * Protocol constants + * Filter outgoing MLD report state by group. 
+ * + * Reports are ALWAYS suppressed for ALL-HOSTS (ff02::1) + * and node-local addresses. However, kernel and socket consumers + * always embed the KAME scope ID in the address provided, so strip it + * when performing comparison. + * Note: This is not the same as the *multicast* scope. + * + * Return zero if the given group is one for which MLD reports + * should be suppressed, or non-zero if reports should be issued. */ +static __inline__ int +mld_is_addr_reported(const struct in6_addr *addr) +{ + + VERIFY(IN6_IS_ADDR_MULTICAST(addr)); + + if (IPV6_ADDR_MC_SCOPE(addr) == IPV6_ADDR_SCOPE_NODELOCAL) + return (0); + + if (IPV6_ADDR_MC_SCOPE(addr) == IPV6_ADDR_SCOPE_LINKLOCAL) { + struct in6_addr tmp = *addr; + in6_clearscope(&tmp); + if (IN6_ARE_ADDR_EQUAL(&tmp, &in6addr_linklocal_allnodes)) + return (0); + } + + return (1); +} -/* denotes that the MLD max response delay field specifies time in milliseconds */ -#define MLD6_TIMER_SCALE 1000 /* - * time between repetitions of a node's initial report of interest in a - * multicast address(in seconds) + * Attach MLD when PF_INET6 is attached to an interface. 
*/ -#define MLD6_UNSOLICITED_REPORT_INTERVAL 10 +struct mld_ifinfo * +mld_domifattach(struct ifnet *ifp, int how) +{ + struct mld_ifinfo *mli; + + MLD_PRINTF(("%s: called for ifp %p(%s%d)\n", + __func__, ifp, ifp->if_name, ifp->if_unit)); + + mli = mli_alloc(how); + if (mli == NULL) + return (NULL); + + MLD_LOCK(); + + MLI_LOCK(mli); + mli_initvar(mli, ifp, 0); + mli->mli_debug |= IFD_ATTACHED; + MLI_ADDREF_LOCKED(mli); /* hold a reference for mli_head */ + MLI_ADDREF_LOCKED(mli); /* hold a reference for caller */ + MLI_UNLOCK(mli); + + LIST_INSERT_HEAD(&mli_head, mli, mli_link); + + MLD_UNLOCK(); -extern lck_mtx_t *nd6_mutex; -static struct ip6_pktopts ip6_opts; -static int mld6_timers_are_running; -static int mld6_init_done = 0 ; -/* XXX: These are necessary for KAME's link-local hack */ -static struct in6_addr mld6_all_nodes_linklocal = IN6ADDR_LINKLOCAL_ALLNODES_INIT; -static struct in6_addr mld6_all_routers_linklocal = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; + MLD_PRINTF(("allocate mld_ifinfo for ifp %p(%s%d)\n", + ifp, ifp->if_name, ifp->if_unit)); -static void mld6_sendpkt(struct in6_multi *, int, const struct in6_addr *); + return (mli); +} +/* + * Attach MLD when PF_INET6 is reattached to an interface. Caller is + * expected to have an outstanding reference to the mli. 
+ */ void -mld6_init() +mld_domifreattach(struct mld_ifinfo *mli) { - static u_int8_t hbh_buf[8]; - struct ip6_hbh *hbh = (struct ip6_hbh *)hbh_buf; - u_int16_t rtalert_code = htons((u_int16_t)IP6OPT_RTALERT_MLD); + struct ifnet *ifp; - if (mld6_init_done) - return; + MLD_LOCK(); - mld6_init_done = 1; - mld6_timers_are_running = 0; + MLI_LOCK(mli); + VERIFY(!(mli->mli_debug & IFD_ATTACHED)); + ifp = mli->mli_ifp; + VERIFY(ifp != NULL); + mli_initvar(mli, ifp, 1); + mli->mli_debug |= IFD_ATTACHED; + MLI_ADDREF_LOCKED(mli); /* hold a reference for mli_head */ + MLI_UNLOCK(mli); - /* ip6h_nxt will be fill in later */ - hbh->ip6h_len = 0; /* (8 >> 3) - 1 */ + LIST_INSERT_HEAD(&mli_head, mli, mli_link); - /* XXX: grotty hard coding... */ - hbh_buf[2] = IP6OPT_PADN; /* 2 byte padding */ - hbh_buf[3] = 0; - hbh_buf[4] = IP6OPT_RTALERT; - hbh_buf[5] = IP6OPT_RTALERT_LEN - 2; - bcopy((caddr_t)&rtalert_code, &hbh_buf[6], sizeof(u_int16_t)); + MLD_UNLOCK(); - ip6_initpktopts(&ip6_opts); - ip6_opts.ip6po_hbh = hbh; + MLD_PRINTF(("reattached mld_ifinfo for ifp %p(%s%d)\n", + ifp, ifp->if_name, ifp->if_unit)); } +/* + * Hook for domifdetach. + */ void -mld6_start_listening( - struct in6_multi *in6m) -{ - /* - * RFC2710 page 10: - * The node never sends a Report or Done for the link-scope all-nodes - * address. - * MLD messages are never sent for multicast addresses whose scope is 0 - * (reserved) or 1 (node-local). 
- */ - mld6_all_nodes_linklocal.s6_addr16[1] = - htons(in6m->in6m_ifp->if_index); /* XXX */ - if (IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &mld6_all_nodes_linklocal) || - IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) { - in6m->in6m_timer = 0; - in6m->in6m_state = MLD6_OTHERLISTENER; - } else { - mld6_sendpkt(in6m, MLD6_LISTENER_REPORT, NULL); - in6m->in6m_timer = MLD6_RANDOM_DELAY( - MLD6_UNSOLICITED_REPORT_INTERVAL * PR_FASTHZ); - in6m->in6m_state = MLD6_IREPORTEDLAST; - mld6_timers_are_running = 1; +mld_domifdetach(struct ifnet *ifp) +{ + + MLD_PRINTF(("%s: called for ifp %p(%s%d)\n", + __func__, ifp, ifp->if_name, ifp->if_unit)); + + MLD_LOCK(); + mli_delete(ifp); + MLD_UNLOCK(); +} + +/* + * Called at interface detach time. Note that we only flush all deferred + * responses and record releases; all remaining inm records and their source + * entries related to this interface are left intact, in order to handle + * the reattach case. + */ +static void +mli_delete(const struct ifnet *ifp) +{ + struct mld_ifinfo *mli, *tmli; + + MLD_LOCK_ASSERT_HELD(); + + LIST_FOREACH_SAFE(mli, &mli_head, mli_link, tmli) { + MLI_LOCK(mli); + if (mli->mli_ifp == ifp) { + /* + * Free deferred General Query responses. 
+ */ + IF_DRAIN(&mli->mli_gq); + IF_DRAIN(&mli->mli_v1q); + mld_flush_relq(mli); + VERIFY(SLIST_EMPTY(&mli->mli_relinmhead)); + mli->mli_debug &= ~IFD_ATTACHED; + MLI_UNLOCK(mli); + + LIST_REMOVE(mli, mli_link); + MLI_REMREF(mli); /* release mli_head reference */ + return; + } + MLI_UNLOCK(mli); + } + panic("%s: mld_ifinfo not found for ifp %p\n", __func__, ifp); +} + +static void +mli_initvar(struct mld_ifinfo *mli, struct ifnet *ifp, int reattach) +{ + MLI_LOCK_ASSERT_HELD(mli); + + mli->mli_ifp = ifp; + mli->mli_version = MLD_VERSION_2; + mli->mli_flags = 0; + mli->mli_rv = MLD_RV_INIT; + mli->mli_qi = MLD_QI_INIT; + mli->mli_qri = MLD_QRI_INIT; + mli->mli_uri = MLD_URI_INIT; + + /* ifnet is not yet attached; no need to hold ifnet lock */ + if (!(ifp->if_flags & IFF_MULTICAST)) + mli->mli_flags |= MLIF_SILENT; + if (mld_use_allow) + mli->mli_flags |= MLIF_USEALLOW; + if (!reattach) + SLIST_INIT(&mli->mli_relinmhead); + + /* + * Responses to general queries are subject to bounds. + */ + mli->mli_gq.ifq_maxlen = MLD_MAX_RESPONSE_PACKETS; + mli->mli_v1q.ifq_maxlen = MLD_MAX_RESPONSE_PACKETS; +} + +static struct mld_ifinfo * +mli_alloc(int how) +{ + struct mld_ifinfo *mli; + + mli = (how == M_WAITOK) ? 
zalloc(mli_zone) : zalloc_noblock(mli_zone); + if (mli != NULL) { + bzero(mli, mli_size); + lck_mtx_init(&mli->mli_lock, mld_mtx_grp, mld_mtx_attr); + mli->mli_debug |= IFD_ALLOC; + } + return (mli); +} + +static void +mli_free(struct mld_ifinfo *mli) +{ + MLI_LOCK(mli); + if (mli->mli_debug & IFD_ATTACHED) { + panic("%s: attached mli=%p is being freed", __func__, mli); + /* NOTREACHED */ + } else if (mli->mli_ifp != NULL) { + panic("%s: ifp not NULL for mli=%p", __func__, mli); + /* NOTREACHED */ + } else if (!(mli->mli_debug & IFD_ALLOC)) { + panic("%s: mli %p cannot be freed", __func__, mli); + /* NOTREACHED */ + } else if (mli->mli_refcnt != 0) { + panic("%s: non-zero refcnt mli=%p", __func__, mli); + /* NOTREACHED */ } + mli->mli_debug &= ~IFD_ALLOC; + MLI_UNLOCK(mli); + + lck_mtx_destroy(&mli->mli_lock, mld_mtx_grp); + zfree(mli_zone, mli); } void -mld6_stop_listening( - struct in6_multi *in6m) +mli_addref(struct mld_ifinfo *mli, int locked) { - mld6_all_nodes_linklocal.s6_addr16[1] = - htons(in6m->in6m_ifp->if_index); /* XXX */ - mld6_all_routers_linklocal.s6_addr16[1] = - htons(in6m->in6m_ifp->if_index); /* XXX: necessary when mrouting */ + if (!locked) + MLI_LOCK_SPIN(mli); + else + MLI_LOCK_ASSERT_HELD(mli); - if (in6m->in6m_state == MLD6_IREPORTEDLAST && - (!IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &mld6_all_nodes_linklocal)) && - IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) > IPV6_ADDR_SCOPE_NODELOCAL) - mld6_sendpkt(in6m, MLD6_LISTENER_DONE, - &mld6_all_routers_linklocal); + if (++mli->mli_refcnt == 0) { + panic("%s: mli=%p wraparound refcnt", __func__, mli); + /* NOTREACHED */ + } + if (!locked) + MLI_UNLOCK(mli); } void -mld6_input( - struct mbuf *m, - int off) -{ - struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); - struct mld6_hdr *mldh; - struct ifnet *ifp = m->m_pkthdr.rcvif; - struct in6_multi *in6m; - struct in6_ifaddr *ia; - struct ifmultiaddr *ifma; - int timer; /* timer value in the MLD query header */ - -#ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, 
sizeof(*mldh), return); - mldh = (struct mld6_hdr *)(mtod(m, caddr_t) + off); -#else - IP6_EXTHDR_GET(mldh, struct mld6_hdr *, m, off, sizeof(*mldh)); - if (mldh == NULL) { - icmp6stat.icp6s_tooshort++; - return; +mli_remref(struct mld_ifinfo *mli) +{ + struct ifnet *ifp; + + MLI_LOCK_SPIN(mli); + + if (mli->mli_refcnt == 0) { + panic("%s: mli=%p negative refcnt", __func__, mli); + /* NOTREACHED */ } -#endif - /* source address validation */ - ip6 = mtod(m, struct ip6_hdr *);/* in case mpullup */ - if (!IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src)) { - log(LOG_ERR, - "mld6_input: src %s is not link-local (grp=%s)\n", - ip6_sprintf(&ip6->ip6_src), - ip6_sprintf(&mldh->mld6_addr)); - /* - * spec (RFC2710) does not explicitly - * specify to discard the packet from a non link-local - * source address. But we believe it's expected to do so. - * XXX: do we have to allow :: as source? - */ - m_freem(m); + --mli->mli_refcnt; + if (mli->mli_refcnt > 0) { + MLI_UNLOCK(mli); return; } + ifp = mli->mli_ifp; + mli->mli_ifp = NULL; + IF_DRAIN(&mli->mli_gq); + IF_DRAIN(&mli->mli_v1q); + mld_flush_relq(mli); + VERIFY(SLIST_EMPTY(&mli->mli_relinmhead)); + MLI_UNLOCK(mli); + + MLD_PRINTF(("%s: freeing mld_ifinfo for ifp %p(%s%d)\n", + __func__, ifp, ifp->if_name, ifp->if_unit)); + + mli_free(mli); +} + +/* + * Process a received MLDv1 general or address-specific query. + * Assumes that the query header has been pulled up to sizeof(mld_hdr). + * + * NOTE: Can't be fully const correct as we temporarily embed scope ID in + * mld_addr. This is OK as we own the mbuf chain. 
+ */ +static int +mld_v1_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, + /*const*/ struct mld_hdr *mld) +{ + struct mld_ifinfo *mli; + struct in6_multi *inm; + int is_general_query; + uint16_t timer; + + is_general_query = 0; + + if (!mld_v1enable) { + MLD_PRINTF(("ignore v1 query %s on ifp %p(%s%d)\n", + ip6_sprintf(&mld->mld_addr), + ifp, ifp->if_name, ifp->if_unit)); + return (0); + } + /* - * In the MLD6 specification, there are 3 states and a flag. - * - * In Non-Listener state, we simply don't have a membership record. - * In Delaying Listener state, our timer is running (in6m->in6m_timer) - * In Idle Listener state, our timer is not running (in6m->in6m_timer==0) - * - * The flag is in6m->in6m_state, it is set to MLD6_OTHERLISTENER if - * we have heard a report from another member, or MLD6_IREPORTEDLAST - * if we sent the last report. + * RFC3810 Section 6.2: MLD queries must originate from + * a router's link-local address. */ - switch(mldh->mld6_type) { - case MLD6_LISTENER_QUERY: - if (ifp->if_flags & IFF_LOOPBACK) - break; - - if (!IN6_IS_ADDR_UNSPECIFIED(&mldh->mld6_addr) && - !IN6_IS_ADDR_MULTICAST(&mldh->mld6_addr)) - break; /* print error or log stat? */ - if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld6_addr)) - mldh->mld6_addr.s6_addr16[1] = - htons(ifp->if_index); /* XXX */ + if (!IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) { + MLD_PRINTF(("ignore v1 query src %s on ifp %p(%s%d)\n", + ip6_sprintf(&ip6->ip6_src), + ifp, ifp->if_name, ifp->if_unit)); + return (0); + } + /* + * Do address field validation upfront before we accept + * the query. + */ + if (IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) { /* - * - Start the timers in all of our membership records - * that the query applies to for the interface on - * which the query arrived excl. those that belong - * to the "all-nodes" group (ff02::1). - * - Restart any timer that is already running but has - * A value longer than the requested timeout. 
- * - Use the value specified in the query message as - * the maximum timeout. + * MLDv1 General Query. + * If this was not sent to the all-nodes group, ignore it. */ - ifnet_lock_exclusive(ifp); - IFP_TO_IA6(ifp, ia); - if (ia == NULL) - break; + struct in6_addr dst; + dst = ip6->ip6_dst; + in6_clearscope(&dst); + if (!IN6_ARE_ADDR_EQUAL(&dst, &in6addr_linklocal_allnodes)) + return (EINVAL); + is_general_query = 1; + } else { /* - * XXX: System timer resolution is too low to handle Max - * Response Delay, so set 1 to the internal timer even if - * the calculated value equals to zero when Max Response - * Delay is positive. + * Embed scope ID of receiving interface in MLD query for + * lookup whilst we don't hold other locks. */ - timer = ntohs(mldh->mld6_maxdelay)*PR_FASTHZ/MLD6_TIMER_SCALE; - if (timer == 0 && mldh->mld6_maxdelay) - timer = 1; - mld6_all_nodes_linklocal.s6_addr16[1] = - htons(ifp->if_index); /* XXX */ - - LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) - { - if (ifma->ifma_addr->sa_family != AF_INET6) - continue; - in6m = (struct in6_multi *)ifma->ifma_protospec; - if (IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, - &mld6_all_nodes_linklocal) || - IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) < - IPV6_ADDR_SCOPE_LINKLOCAL) - continue; + in6_setscope(&mld->mld_addr, ifp, NULL); + } - if (IN6_IS_ADDR_UNSPECIFIED(&mldh->mld6_addr) || - IN6_ARE_ADDR_EQUAL(&mldh->mld6_addr, - &in6m->in6m_addr)) - { - if (timer == 0) { - /* send a report immediately */ - mld6_sendpkt(in6m, MLD6_LISTENER_REPORT, - NULL); - in6m->in6m_timer = 0; /* reset timer */ - in6m->in6m_state = MLD6_IREPORTEDLAST; - } - else if (in6m->in6m_timer == 0 || /*idle state*/ - in6m->in6m_timer > timer) { - in6m->in6m_timer = - MLD6_RANDOM_DELAY(timer); - mld6_timers_are_running = 1; - } - } - } - ifnet_lock_done(ifp); + /* + * Switch to MLDv1 host compatibility mode. 
+ */ + mli = MLD_IFINFO(ifp); + VERIFY(mli != NULL); - if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld6_addr)) - mldh->mld6_addr.s6_addr16[1] = 0; /* XXX */ - break; - case MLD6_LISTENER_REPORT: - /* - * For fast leave to work, we have to know that we are the - * last person to send a report for this group. Reports - * can potentially get looped back if we are a multicast - * router, so discard reports sourced by me. - * Note that it is impossible to check IFF_LOOPBACK flag of - * ifp for this purpose, since ip6_mloopback pass the physical - * interface to looutput. - */ - if (m->m_flags & M_LOOP) /* XXX: grotty flag, but efficient */ - break; + MLI_LOCK(mli); + mld_set_version(mli, MLD_VERSION_1); + MLI_UNLOCK(mli); - if (!IN6_IS_ADDR_MULTICAST(&mldh->mld6_addr)) - break; + timer = (ntohs(mld->mld_maxdelay) * PR_SLOWHZ) / MLD_TIMER_SCALE; + if (timer == 0) + timer = 1; + + if (is_general_query) { + struct in6_multistep step; - if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld6_addr)) - mldh->mld6_addr.s6_addr16[1] = - htons(ifp->if_index); /* XXX */ + MLD_PRINTF(("process v1 general query on ifp %p(%s%d)\n", + ifp, ifp->if_name, ifp->if_unit)); /* - * If we belong to the group being reported, stop - * our timer for that group. + * For each reporting group joined on this + * interface, kick the report timer. */ - ifnet_lock_shared(ifp); - IN6_LOOKUP_MULTI(mldh->mld6_addr, ifp, in6m); - if (in6m) { - in6m->in6m_timer = 0; /* transit to idle state */ - in6m->in6m_state = MLD6_OTHERLISTENER; /* clear flag */ + in6_multihead_lock_shared(); + IN6_FIRST_MULTI(step, inm); + while (inm != NULL) { + IN6M_LOCK(inm); + if (inm->in6m_ifp == ifp) + mld_v1_update_group(inm, timer); + IN6M_UNLOCK(inm); + IN6_NEXT_MULTI(step, inm); } - ifnet_lock_done(ifp); + in6_multihead_lock_done(); + } else { + /* + * MLDv1 Group-Specific Query. + * If this is a group-specific MLDv1 query, we need only + * look up the single group to process it. 
+ */ + in6_multihead_lock_shared(); + IN6_LOOKUP_MULTI(&mld->mld_addr, ifp, inm); + in6_multihead_lock_done(); - if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld6_addr)) - mldh->mld6_addr.s6_addr16[1] = 0; /* XXX */ - break; - default: /* this is impossible */ - log(LOG_ERR, "mld6_input: illegal type(%d)", mldh->mld6_type); - break; + if (inm != NULL) { + IN6M_LOCK(inm); + MLD_PRINTF(("process v1 query %s on ifp %p(%s%d)\n", + ip6_sprintf(&mld->mld_addr), + ifp, ifp->if_name, ifp->if_unit)); + mld_v1_update_group(inm, timer); + IN6M_UNLOCK(inm); + IN6M_REMREF(inm); /* from IN6_LOOKUP_MULTI */ + } + /* XXX Clear embedded scope ID as userland won't expect it. */ + in6_clearscope(&mld->mld_addr); } - m_freem(m); + return (0); } -void -mld6_fasttimeo() +/* + * Update the report timer on a group in response to an MLDv1 query. + * + * If we are becoming the reporting member for this group, start the timer. + * If we already are the reporting member for this group, and timer is + * below the threshold, reset it. + * + * We may be updating the group for the first time since we switched + * to MLDv2. If we are, then we must clear any recorded source lists, + * and transition to REPORTING state; the group timer is overloaded + * for group and group-source query responses. + * + * Unlike MLDv2, the delay per group should be jittered + * to avoid bursts of MLDv1 reports. + */ +static void +mld_v1_update_group(struct in6_multi *inm, const int timer) { - struct in6_multi *in6m; - struct in6_multistep step; + IN6M_LOCK_ASSERT_HELD(inm); - /* - * Quick check to see if any work needs to be done, in order - * to minimize the overhead of fasttimo processing. 
- */ - if (!mld6_timers_are_running) - return; + MLD_PRINTF(("%s: %s/%s%d timer=%d\n", __func__, + ip6_sprintf(&inm->in6m_addr), + inm->in6m_ifp->if_name, inm->in6m_ifp->if_unit, timer)); - lck_mtx_lock(nd6_mutex); - mld6_timers_are_running = 0; - IN6_FIRST_MULTI(step, in6m); - while (in6m != NULL) { - if (in6m->in6m_timer == 0) { - /* do nothing */ - } else if (--in6m->in6m_timer == 0) { - mld6_sendpkt(in6m, MLD6_LISTENER_REPORT, NULL); - in6m->in6m_state = MLD6_IREPORTEDLAST; - } else { - mld6_timers_are_running = 1; + switch (inm->in6m_state) { + case MLD_NOT_MEMBER: + case MLD_SILENT_MEMBER: + break; + case MLD_REPORTING_MEMBER: + if (inm->in6m_timer != 0 && + inm->in6m_timer <= timer) { + MLD_PRINTF(("%s: REPORTING and timer running, " + "skipping.\n", __func__)); + break; } - IN6_NEXT_MULTI(step, in6m); + /* FALLTHROUGH */ + case MLD_SG_QUERY_PENDING_MEMBER: + case MLD_G_QUERY_PENDING_MEMBER: + case MLD_IDLE_MEMBER: + case MLD_LAZY_MEMBER: + case MLD_AWAKENING_MEMBER: + MLD_PRINTF(("%s: ->REPORTING\n", __func__)); + inm->in6m_state = MLD_REPORTING_MEMBER; + inm->in6m_timer = MLD_RANDOM_DELAY(timer); + current_state_timers_running6 = 1; + break; + case MLD_SLEEPING_MEMBER: + MLD_PRINTF(("%s: ->AWAKENING\n", __func__)); + inm->in6m_state = MLD_AWAKENING_MEMBER; + break; + case MLD_LEAVING_MEMBER: + break; } - lck_mtx_unlock(nd6_mutex); } -static void -mld6_sendpkt( - struct in6_multi *in6m, - int type, - const struct in6_addr *dst) +/* + * Process a received MLDv2 general, group-specific or + * group-and-source-specific query. + * + * Assumes that the query header has been pulled up to sizeof(mldv2_query). + * + * Return 0 if successful, otherwise an appropriate error code is returned. 
+ */ +static int +mld_v2_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, + struct mbuf *m, const int off, const int icmp6len) { - struct mbuf *mh, *md; - struct mld6_hdr *mldh; - struct ip6_hdr *ip6; - struct ip6_moptions im6o; - struct in6_ifaddr *ia; - struct ifnet *ifp = in6m->in6m_ifp; - struct ifnet *outif = NULL; + struct mld_ifinfo *mli; + struct mldv2_query *mld; + struct in6_multi *inm; + uint32_t maxdelay, nsrc, qqi; + int is_general_query; + uint16_t timer; + uint8_t qrv; - /* - * At first, find a link local address on the outgoing interface - * to use as the source address of the MLD packet. - */ - if ((ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST)) - == NULL) - return; + is_general_query = 0; /* - * Allocate mbufs to store ip6 header and MLD header. - * We allocate 2 mbufs and make chain in advance because - * it is more convenient when inserting the hop-by-hop option later. + * RFC3810 Section 6.2: MLD queries must originate from + * a router's link-local address. 
*/ - MGETHDR(mh, M_DONTWAIT, MT_HEADER); - if (mh == NULL) { - ifafree(&ia->ia_ifa); - return; - } - MGET(md, M_DONTWAIT, MT_DATA); - if (md == NULL) { - m_free(mh); - ifafree(&ia->ia_ifa); - return; + if (!IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) { + MLD_PRINTF(("ignore v1 query src %s on ifp %p(%s%d)\n", + ip6_sprintf(&ip6->ip6_src), + ifp, ifp->if_name, ifp->if_unit)); + return (0); } - mh->m_next = md; - mh->m_pkthdr.rcvif = NULL; -#ifdef __darwin8_notyet -#if CONFIG_MACF_NET - mac_create_mbuf_linklayer(in6m->in6m_ifp, m); -#endif -#endif - mh->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mld6_hdr); - mh->m_len = sizeof(struct ip6_hdr); - MH_ALIGN(mh, sizeof(struct ip6_hdr)); + MLD_PRINTF(("input v2 query on ifp %p(%s%d)\n", ifp, ifp->if_name, + ifp->if_unit)); - /* fill in the ip6 header */ - ip6 = mtod(mh, struct ip6_hdr *); - ip6->ip6_flow = 0; - ip6->ip6_vfc &= ~IPV6_VERSION_MASK; - ip6->ip6_vfc |= IPV6_VERSION; - /* ip6_plen will be set later */ - ip6->ip6_nxt = IPPROTO_ICMPV6; - /* ip6_hlim will be set by im6o.im6o_multicast_hlim */ - ip6->ip6_src = ia->ia_addr.sin6_addr; - ip6->ip6_dst = dst ? *dst : in6m->in6m_addr; - - /* fill in the MLD header */ - md->m_len = sizeof(struct mld6_hdr); - mldh = mtod(md, struct mld6_hdr *); - mldh->mld6_type = type; - mldh->mld6_code = 0; - mldh->mld6_cksum = 0; - /* XXX: we assume the function will not be called for query messages */ - mldh->mld6_maxdelay = 0; - mldh->mld6_reserved = 0; - mldh->mld6_addr = in6m->in6m_addr; - if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld6_addr)) - mldh->mld6_addr.s6_addr16[1] = 0; /* XXX */ - mldh->mld6_cksum = in6_cksum(mh, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), - sizeof(struct mld6_hdr)); - - /* construct multicast option */ - bzero(&im6o, sizeof(im6o)); - im6o.im6o_multicast_ifp = ifp; - im6o.im6o_multicast_hlim = 1; - - /* - * Request loopback of the report if we are acting as a multicast - * router, so that the process-level routing daemon can hear it. 
- */ -#if MROUTING - im6o.im6o_multicast_loop = (ip6_mrouter != NULL); -#else - im6o.im6o_multicast_loop = 0; -#endif + mld = (struct mldv2_query *)(mtod(m, uint8_t *) + off); - /* increment output statictics */ - icmp6stat.icp6s_outhist[type]++; + maxdelay = ntohs(mld->mld_maxdelay); /* in 1/10ths of a second */ + if (maxdelay >= 32678) { + maxdelay = (MLD_MRC_MANT(maxdelay) | 0x1000) << + (MLD_MRC_EXP(maxdelay) + 3); + } + timer = (maxdelay * PR_SLOWHZ) / MLD_TIMER_SCALE; + if (timer == 0) + timer = 1; - ip6_output(mh, &ip6_opts, NULL, 0, &im6o, &outif, 0); - if (outif) { - icmp6_ifstat_inc(outif, ifs6_out_msg); - switch (type) { - case MLD6_LISTENER_QUERY: - icmp6_ifstat_inc(outif, ifs6_out_mldquery); - break; - case MLD6_LISTENER_REPORT: - icmp6_ifstat_inc(outif, ifs6_out_mldreport); - break; - case MLD6_LISTENER_DONE: - icmp6_ifstat_inc(outif, ifs6_out_mlddone); - break; - } + qrv = MLD_QRV(mld->mld_misc); + if (qrv < 2) { + MLD_PRINTF(("%s: clamping qrv %d to %d\n", __func__, + qrv, MLD_RV_INIT)); + qrv = MLD_RV_INIT; } - ifafree(&ia->ia_ifa); -} + qqi = mld->mld_qqi; + if (qqi >= 128) { + qqi = MLD_QQIC_MANT(mld->mld_qqi) << + (MLD_QQIC_EXP(mld->mld_qqi) + 3); + } + + nsrc = ntohs(mld->mld_numsrc); + if (nsrc > MLD_MAX_GS_SOURCES) + return (EMSGSIZE); + if (icmp6len < sizeof(struct mldv2_query) + + (nsrc * sizeof(struct in6_addr))) + return (EMSGSIZE); + + /* + * Do further input validation upfront to avoid resetting timers + * should we need to discard this query. + */ + if (IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) { + /* + * General Queries SHOULD be directed to ff02::1. + * A general query with a source list has undefined + * behaviour; discard it. 
+ */ + struct in6_addr dst; + + dst = ip6->ip6_dst; + in6_clearscope(&dst); + if (!IN6_ARE_ADDR_EQUAL(&dst, &in6addr_linklocal_allnodes) || + nsrc > 0) + return (EINVAL); + is_general_query = 1; + } else { + /* + * Embed scope ID of receiving interface in MLD query for + * lookup whilst we don't hold other locks (due to KAME + * locking lameness). We own this mbuf chain just now. + */ + in6_setscope(&mld->mld_addr, ifp, NULL); + } + + mli = MLD_IFINFO(ifp); + VERIFY(mli != NULL); + + MLI_LOCK(mli); + /* + * Discard the v2 query if we're in Compatibility Mode. + * The RFC is pretty clear that hosts need to stay in MLDv1 mode + * until the Old Version Querier Present timer expires. + */ + if (mli->mli_version != MLD_VERSION_2) { + MLI_UNLOCK(mli); + return (0); + } + + mld_set_version(mli, MLD_VERSION_2); + mli->mli_rv = qrv; + mli->mli_qi = qqi; + mli->mli_qri = maxdelay; + + MLD_PRINTF(("%s: qrv %d qi %d maxdelay %d\n", __func__, qrv, qqi, + maxdelay)); + + if (is_general_query) { + /* + * MLDv2 General Query. + * + * Schedule a current-state report on this ifp for + * all groups, possibly containing source lists. + * + * If there is a pending General Query response + * scheduled earlier than the selected delay, do + * not schedule any other reports. + * Otherwise, reset the interface timer. + */ + MLD_PRINTF(("process v2 general query on ifp %p(%s%d)\n", + ifp, ifp->if_name, ifp->if_unit)); + if (mli->mli_v2_timer == 0 || mli->mli_v2_timer >= timer) { + mli->mli_v2_timer = MLD_RANDOM_DELAY(timer); + interface_timers_running6 = 1; + } + MLI_UNLOCK(mli); + } else { + MLI_UNLOCK(mli); + /* + * MLDv2 Group-specific or Group-and-source-specific Query. + * + * Group-source-specific queries are throttled on + * a per-group basis to defeat denial-of-service attempts. + * Queries for groups we are not a member of on this + * link are simply ignored. 
+ */ + in6_multihead_lock_shared(); + IN6_LOOKUP_MULTI(&mld->mld_addr, ifp, inm); + in6_multihead_lock_done(); + if (inm == NULL) + return (0); + + IN6M_LOCK(inm); +#ifndef __APPLE__ + /* TODO: need ratecheck equivalent */ + if (nsrc > 0) { + if (!ratecheck(&inm->in6m_lastgsrtv, + &mld_gsrdelay)) { + MLD_PRINTF(("%s: GS query throttled.\n", + __func__)); + IN6M_UNLOCK(inm); + IN6M_REMREF(inm); /* from IN6_LOOKUP_MULTI */ + return (0); + } + } +#endif + MLD_PRINTF(("process v2 group query on ifp %p(%s%d)\n", + ifp, ifp->if_name, ifp->if_unit)); + /* + * If there is a pending General Query response + * scheduled sooner than the selected delay, no + * further report need be scheduled. + * Otherwise, prepare to respond to the + * group-specific or group-and-source query. + */ + MLI_LOCK(mli); + if (mli->mli_v2_timer == 0 || mli->mli_v2_timer >= timer) { + MLI_UNLOCK(mli); + mld_v2_process_group_query(inm, timer, m, off); + } else { + MLI_UNLOCK(mli); + } + IN6M_UNLOCK(inm); + IN6M_REMREF(inm); /* from IN6_LOOKUP_MULTI */ + /* XXX Clear embedded scope ID as userland won't expect it. */ + in6_clearscope(&mld->mld_addr); + } + + return (0); +} + +/* + * Process a recieved MLDv2 group-specific or group-and-source-specific + * query. + * Return <0 if any error occured. Currently this is ignored. 
+ */ +static int +mld_v2_process_group_query(struct in6_multi *inm, int timer, struct mbuf *m0, + const int off) +{ + struct mldv2_query *mld; + int retval; + uint16_t nsrc; + + IN6M_LOCK_ASSERT_HELD(inm); + + retval = 0; + mld = (struct mldv2_query *)(mtod(m0, uint8_t *) + off); + + switch (inm->in6m_state) { + case MLD_NOT_MEMBER: + case MLD_SILENT_MEMBER: + case MLD_SLEEPING_MEMBER: + case MLD_LAZY_MEMBER: + case MLD_AWAKENING_MEMBER: + case MLD_IDLE_MEMBER: + case MLD_LEAVING_MEMBER: + return (retval); + break; + case MLD_REPORTING_MEMBER: + case MLD_G_QUERY_PENDING_MEMBER: + case MLD_SG_QUERY_PENDING_MEMBER: + break; + } + + nsrc = ntohs(mld->mld_numsrc); + + /* + * Deal with group-specific queries upfront. + * If any group query is already pending, purge any recorded + * source-list state if it exists, and schedule a query response + * for this group-specific query. + */ + if (nsrc == 0) { + if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER || + inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER) { + in6m_clear_recorded(inm); + timer = min(inm->in6m_timer, timer); + } + inm->in6m_state = MLD_G_QUERY_PENDING_MEMBER; + inm->in6m_timer = MLD_RANDOM_DELAY(timer); + current_state_timers_running6 = 1; + return (retval); + } + + /* + * Deal with the case where a group-and-source-specific query has + * been received but a group-specific query is already pending. + */ + if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER) { + timer = min(inm->in6m_timer, timer); + inm->in6m_timer = MLD_RANDOM_DELAY(timer); + current_state_timers_running6 = 1; + return (retval); + } + + /* + * Finally, deal with the case where a group-and-source-specific + * query has been received, where a response to a previous g-s-r + * query exists, or none exists. + * In this case, we need to parse the source-list which the Querier + * has provided us with and check if we have any source list filter + * entries at T1 for these sources. 
If we do not, there is no need + * schedule a report and the query may be dropped. + * If we do, we must record them and schedule a current-state + * report for those sources. + */ + if (inm->in6m_nsrc > 0) { + struct mbuf *m; + uint8_t *sp; + int i, nrecorded; + int soff; + + m = m0; + soff = off + sizeof(struct mldv2_query); + nrecorded = 0; + for (i = 0; i < nsrc; i++) { + sp = mtod(m, uint8_t *) + soff; + retval = in6m_record_source(inm, + (const struct in6_addr *)sp); + if (retval < 0) + break; + nrecorded += retval; + soff += sizeof(struct in6_addr); + if (soff >= m->m_len) { + soff = soff - m->m_len; + m = m->m_next; + if (m == NULL) + break; + } + } + if (nrecorded > 0) { + MLD_PRINTF(( "%s: schedule response to SG query\n", + __func__)); + inm->in6m_state = MLD_SG_QUERY_PENDING_MEMBER; + inm->in6m_timer = MLD_RANDOM_DELAY(timer); + current_state_timers_running6 = 1; + } + } + + return (retval); +} + +/* + * Process a received MLDv1 host membership report. + * Assumes mld points to mld_hdr in pulled up mbuf chain. + * + * NOTE: Can't be fully const correct as we temporarily embed scope ID in + * mld_addr. This is OK as we own the mbuf chain. + */ +static int +mld_v1_input_report(struct ifnet *ifp, const struct ip6_hdr *ip6, + /*const*/ struct mld_hdr *mld) +{ + struct in6_addr src, dst; + struct in6_ifaddr *ia; + struct in6_multi *inm; + + if (!mld_v1enable) { + MLD_PRINTF(("ignore v1 report %s on ifp %p(%s%d)\n", + ip6_sprintf(&mld->mld_addr), + ifp, ifp->if_name, ifp->if_unit)); + return (0); + } + + if (ifp->if_flags & IFF_LOOPBACK) + return (0); + + /* + * MLDv1 reports must originate from a host's link-local address, + * or the unspecified address (when booting). 
+ */ + src = ip6->ip6_src; + in6_clearscope(&src); + if (!IN6_IS_SCOPE_LINKLOCAL(&src) && !IN6_IS_ADDR_UNSPECIFIED(&src)) { + MLD_PRINTF(("ignore v1 query src %s on ifp %p(%s%d)\n", + ip6_sprintf(&ip6->ip6_src), + ifp, ifp->if_name, ifp->if_unit)); + return (EINVAL); + } + + /* + * RFC2710 Section 4: MLDv1 reports must pertain to a multicast + * group, and must be directed to the group itself. + */ + dst = ip6->ip6_dst; + in6_clearscope(&dst); + if (!IN6_IS_ADDR_MULTICAST(&mld->mld_addr) || + !IN6_ARE_ADDR_EQUAL(&mld->mld_addr, &dst)) { + MLD_PRINTF(("ignore v1 query dst %s on ifp %p(%s%d)\n", + ip6_sprintf(&ip6->ip6_dst), + ifp, ifp->if_name, ifp->if_unit)); + return (EINVAL); + } + + /* + * Make sure we don't hear our own membership report, as fast + * leave requires knowing that we are the only member of a + * group. Assume we used the link-local address if available, + * otherwise look for ::. + * + * XXX Note that scope ID comparison is needed for the address + * returned by in6ifa_ifpforlinklocal(), but SHOULD NOT be + * performed for the on-wire address. + */ + ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); + if (ia != NULL) { + IFA_LOCK(&ia->ia_ifa); + if ((IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, IA6_IN6(ia)))){ + IFA_UNLOCK(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); + return (0); + } + IFA_UNLOCK(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); + } else if (IN6_IS_ADDR_UNSPECIFIED(&src)) { + return (0); + } + + MLD_PRINTF(("process v1 report %s on ifp %p(%s%d)\n", + ip6_sprintf(&mld->mld_addr), ifp, ifp->if_name, ifp->if_unit)); + + /* + * Embed scope ID of receiving interface in MLD query for lookup + * whilst we don't hold other locks (due to KAME locking lameness). + */ + if (!IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) + in6_setscope(&mld->mld_addr, ifp, NULL); + + /* + * MLDv1 report suppression. 
+ * If we are a member of this group, and our membership should be + * reported, and our group timer is pending or about to be reset, + * stop our group timer by transitioning to the 'lazy' state. + */ + in6_multihead_lock_shared(); + IN6_LOOKUP_MULTI(&mld->mld_addr, ifp, inm); + in6_multihead_lock_done(); + + if (inm != NULL) { + struct mld_ifinfo *mli; + + IN6M_LOCK(inm); + mli = inm->in6m_mli; + VERIFY(mli != NULL); + + MLI_LOCK(mli); + /* + * If we are in MLDv2 host mode, do not allow the + * other host's MLDv1 report to suppress our reports. + */ + if (mli->mli_version == MLD_VERSION_2) { + MLI_UNLOCK(mli); + IN6M_UNLOCK(inm); + IN6M_REMREF(inm); /* from IN6_LOOKUP_MULTI */ + goto out; + } + MLI_UNLOCK(mli); + + inm->in6m_timer = 0; + + switch (inm->in6m_state) { + case MLD_NOT_MEMBER: + case MLD_SILENT_MEMBER: + case MLD_SLEEPING_MEMBER: + break; + case MLD_REPORTING_MEMBER: + case MLD_IDLE_MEMBER: + case MLD_AWAKENING_MEMBER: + MLD_PRINTF(("report suppressed for %s on ifp %p(%s%d)\n", + ip6_sprintf(&mld->mld_addr), + ifp, ifp->if_name, ifp->if_unit)); + case MLD_LAZY_MEMBER: + inm->in6m_state = MLD_LAZY_MEMBER; + break; + case MLD_G_QUERY_PENDING_MEMBER: + case MLD_SG_QUERY_PENDING_MEMBER: + case MLD_LEAVING_MEMBER: + break; + } + IN6M_UNLOCK(inm); + IN6M_REMREF(inm); /* from IN6_LOOKUP_MULTI */ + } + +out: + /* XXX Clear embedded scope ID as userland won't expect it. */ + in6_clearscope(&mld->mld_addr); + + return (0); +} + +/* + * MLD input path. + * + * Assume query messages which fit in a single ICMPv6 message header + * have been pulled up. + * Assume that userland will want to see the message, even if it + * otherwise fails kernel input validation; do not free it. + * Pullup may however free the mbuf chain m if it fails. + * + * Return IPPROTO_DONE if we freed m. Otherwise, return 0. 
+ */ +int +mld_input(struct mbuf *m, int off, int icmp6len) +{ + struct ifnet *ifp; + struct ip6_hdr *ip6; + struct mld_hdr *mld; + int mldlen; + + MLD_PRINTF(("%s: called w/mbuf (%p,%d)\n", __func__, m, off)); + + ifp = m->m_pkthdr.rcvif; + + ip6 = mtod(m, struct ip6_hdr *); + + /* Pullup to appropriate size. */ + mld = (struct mld_hdr *)(mtod(m, uint8_t *) + off); + if (mld->mld_type == MLD_LISTENER_QUERY && + icmp6len >= sizeof(struct mldv2_query)) { + mldlen = sizeof(struct mldv2_query); + } else { + mldlen = sizeof(struct mld_hdr); + } + IP6_EXTHDR_GET(mld, struct mld_hdr *, m, off, mldlen); + if (mld == NULL) { + icmp6stat.icp6s_badlen++; + return (IPPROTO_DONE); + } + + /* + * Userland needs to see all of this traffic for implementing + * the endpoint discovery portion of multicast routing. + */ + switch (mld->mld_type) { + case MLD_LISTENER_QUERY: + icmp6_ifstat_inc(ifp, ifs6_in_mldquery); + if (icmp6len == sizeof(struct mld_hdr)) { + if (mld_v1_input_query(ifp, ip6, mld) != 0) + return (0); + } else if (icmp6len >= sizeof(struct mldv2_query)) { + if (mld_v2_input_query(ifp, ip6, m, off, + icmp6len) != 0) + return (0); + } + break; + case MLD_LISTENER_REPORT: + icmp6_ifstat_inc(ifp, ifs6_in_mldreport); + if (mld_v1_input_report(ifp, ip6, mld) != 0) + return (0); + break; + case MLDV2_LISTENER_REPORT: + icmp6_ifstat_inc(ifp, ifs6_in_mldreport); + break; + case MLD_LISTENER_DONE: + icmp6_ifstat_inc(ifp, ifs6_in_mlddone); + break; + default: + break; + } + + return (0); +} + +/* + * MLD6 slowtimo handler. + * Combiles both the slow and fast timer into one. We loose some responsivness but + * allows the system to avoid having a pr_fasttimo, thus allowing for power savings. 
+ */ +void +mld_slowtimo(void) +{ + struct ifqueue scq; /* State-change packets */ + struct ifqueue qrq; /* Query response packets */ + struct ifnet *ifp; + struct mld_ifinfo *mli; + struct in6_multi *inm; + int uri_fasthz = 0; + + MLD_LOCK(); + + LIST_FOREACH(mli, &mli_head, mli_link) { + MLI_LOCK(mli); + mld_v1_process_querier_timers(mli); + MLI_UNLOCK(mli); + } + + /* + * Quick check to see if any work needs to be done, in order to + * minimize the overhead of fasttimo processing. + */ + if (!current_state_timers_running6 && + !interface_timers_running6 && + !state_change_timers_running6) { + MLD_UNLOCK(); + return; + } + + /* + * MLDv2 General Query response timer processing. + */ + if (interface_timers_running6) { +#if 0 + MLD_PRINTF(("%s: interface timers running\n", __func__)); +#endif + interface_timers_running6 = 0; + LIST_FOREACH(mli, &mli_head, mli_link) { + MLI_LOCK(mli); + if (mli->mli_v2_timer == 0) { + /* Do nothing. */ + } else if (--mli->mli_v2_timer == 0) { + mld_v2_dispatch_general_query(mli); + } else { + interface_timers_running6 = 1; + } + MLI_UNLOCK(mli); + } + } + + if (!current_state_timers_running6 && + !state_change_timers_running6) + goto out_locked; + + current_state_timers_running6 = 0; + state_change_timers_running6 = 0; +#if 0 + MLD_PRINTF(("%s: state change timers running\n", __func__)); +#endif + + memset(&qrq, 0, sizeof(struct ifqueue)); + qrq.ifq_maxlen = MLD_MAX_G_GS_PACKETS; + + memset(&scq, 0, sizeof(struct ifqueue)); + scq.ifq_maxlen = MLD_MAX_STATE_CHANGE_PACKETS; + + /* + * MLD host report and state-change timer processing. + * Note: Processing a v2 group timer may remove a node. 
+ */ + LIST_FOREACH(mli, &mli_head, mli_link) { + struct in6_multistep step; + + MLI_LOCK(mli); + ifp = mli->mli_ifp; + uri_fasthz = MLD_RANDOM_DELAY(mli->mli_uri * PR_SLOWHZ); + MLI_UNLOCK(mli); + + in6_multihead_lock_shared(); + IN6_FIRST_MULTI(step, inm); + while (inm != NULL) { + IN6M_LOCK(inm); + if (inm->in6m_ifp != ifp) + goto next; + + MLI_LOCK(mli); + switch (mli->mli_version) { + case MLD_VERSION_1: + mld_v1_process_group_timer(inm, + mli->mli_version); + break; + case MLD_VERSION_2: + mld_v2_process_group_timers(mli, &qrq, + &scq, inm, uri_fasthz); + break; + } + MLI_UNLOCK(mli); +next: + IN6M_UNLOCK(inm); + IN6_NEXT_MULTI(step, inm); + } + in6_multihead_lock_done(); + + MLI_LOCK(mli); + if (mli->mli_version == MLD_VERSION_1) { + mld_dispatch_queue(mli, &mli->mli_v1q, 0); + } else if (mli->mli_version == MLD_VERSION_2) { + MLI_UNLOCK(mli); + mld_dispatch_queue(NULL, &qrq, 0); + mld_dispatch_queue(NULL, &scq, 0); + VERIFY(qrq.ifq_len == 0); + VERIFY(scq.ifq_len == 0); + MLI_LOCK(mli); + } + /* + * In case there are still any pending membership reports + * which didn't get drained at version change time. + */ + IF_DRAIN(&mli->mli_v1q); + /* + * Release all deferred inm records, and drain any locally + * enqueued packets; do it even if the current MLD version + * for the link is no longer MLDv2, in order to handle the + * version change case. + */ + mld_flush_relq(mli); + VERIFY(SLIST_EMPTY(&mli->mli_relinmhead)); + MLI_UNLOCK(mli); + + IF_DRAIN(&qrq); + IF_DRAIN(&scq); + } + +out_locked: + MLD_UNLOCK(); +} + +/* + * Free the in6_multi reference(s) for this MLD lifecycle. + * + * Caller must be holding mli_lock. 
+ */ +static void +mld_flush_relq(struct mld_ifinfo *mli) +{ + struct in6_multi *inm; + +again: + MLI_LOCK_ASSERT_HELD(mli); + inm = SLIST_FIRST(&mli->mli_relinmhead); + if (inm != NULL) { + int lastref; + + SLIST_REMOVE_HEAD(&mli->mli_relinmhead, in6m_nrele); + MLI_UNLOCK(mli); + + in6_multihead_lock_exclusive(); + IN6M_LOCK(inm); + VERIFY(inm->in6m_nrelecnt != 0); + inm->in6m_nrelecnt--; + lastref = in6_multi_detach(inm); + VERIFY(!lastref || (!(inm->in6m_debug & IFD_ATTACHED) && + inm->in6m_reqcnt == 0)); + IN6M_UNLOCK(inm); + in6_multihead_lock_done(); + /* from mli_relinmhead */ + IN6M_REMREF(inm); + /* from in6_multihead_list */ + if (lastref) + IN6M_REMREF(inm); + + MLI_LOCK(mli); + goto again; + } +} + +/* + * Update host report group timer. + * Will update the global pending timer flags. + */ +static void +mld_v1_process_group_timer(struct in6_multi *inm, const int mld_version) +{ +#pragma unused(mld_version) + int report_timer_expired; + + IN6M_LOCK_ASSERT_HELD(inm); + MLI_LOCK_ASSERT_HELD(inm->in6m_mli); + + if (inm->in6m_timer == 0) { + report_timer_expired = 0; + } else if (--inm->in6m_timer == 0) { + report_timer_expired = 1; + } else { + current_state_timers_running6 = 1; + return; + } + + switch (inm->in6m_state) { + case MLD_NOT_MEMBER: + case MLD_SILENT_MEMBER: + case MLD_IDLE_MEMBER: + case MLD_LAZY_MEMBER: + case MLD_SLEEPING_MEMBER: + case MLD_AWAKENING_MEMBER: + break; + case MLD_REPORTING_MEMBER: + if (report_timer_expired) { + inm->in6m_state = MLD_IDLE_MEMBER; + (void) mld_v1_transmit_report(inm, + MLD_LISTENER_REPORT); + IN6M_LOCK_ASSERT_HELD(inm); + MLI_LOCK_ASSERT_HELD(inm->in6m_mli); + } + break; + case MLD_G_QUERY_PENDING_MEMBER: + case MLD_SG_QUERY_PENDING_MEMBER: + case MLD_LEAVING_MEMBER: + break; + } +} + +/* + * Update a group's timers for MLDv2. + * Will update the global pending timer flags. + * Note: Unlocked read from mli. 
+ */ +static void +mld_v2_process_group_timers(struct mld_ifinfo *mli, + struct ifqueue *qrq, struct ifqueue *scq, + struct in6_multi *inm, const int uri_fasthz) +{ + int query_response_timer_expired; + int state_change_retransmit_timer_expired; + + IN6M_LOCK_ASSERT_HELD(inm); + MLI_LOCK_ASSERT_HELD(mli); + VERIFY(mli == inm->in6m_mli); + + query_response_timer_expired = 0; + state_change_retransmit_timer_expired = 0; + + /* + * During a transition from compatibility mode back to MLDv2, + * a group record in REPORTING state may still have its group + * timer active. This is a no-op in this function; it is easier + * to deal with it here than to complicate the slow-timeout path. + */ + if (inm->in6m_timer == 0) { + query_response_timer_expired = 0; + } else if (--inm->in6m_timer == 0) { + query_response_timer_expired = 1; + } else { + current_state_timers_running6 = 1; + } + + if (inm->in6m_sctimer == 0) { + state_change_retransmit_timer_expired = 0; + } else if (--inm->in6m_sctimer == 0) { + state_change_retransmit_timer_expired = 1; + } else { + state_change_timers_running6 = 1; + } + + /* We are in fasttimo, so be quick about it. */ + if (!state_change_retransmit_timer_expired && + !query_response_timer_expired) + return; + + switch (inm->in6m_state) { + case MLD_NOT_MEMBER: + case MLD_SILENT_MEMBER: + case MLD_SLEEPING_MEMBER: + case MLD_LAZY_MEMBER: + case MLD_AWAKENING_MEMBER: + case MLD_IDLE_MEMBER: + break; + case MLD_G_QUERY_PENDING_MEMBER: + case MLD_SG_QUERY_PENDING_MEMBER: + /* + * Respond to a previously pending Group-Specific + * or Group-and-Source-Specific query by enqueueing + * the appropriate Current-State report for + * immediate transmission. 
+ */ + if (query_response_timer_expired) { + int retval; + + retval = mld_v2_enqueue_group_record(qrq, inm, 0, 1, + (inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER), + 0); + MLD_PRINTF(("%s: enqueue record = %d\n", + __func__, retval)); + inm->in6m_state = MLD_REPORTING_MEMBER; + in6m_clear_recorded(inm); + } + /* FALLTHROUGH */ + case MLD_REPORTING_MEMBER: + case MLD_LEAVING_MEMBER: + if (state_change_retransmit_timer_expired) { + /* + * State-change retransmission timer fired. + * If there are any further pending retransmissions, + * set the global pending state-change flag, and + * reset the timer. + */ + if (--inm->in6m_scrv > 0) { + inm->in6m_sctimer = uri_fasthz; + state_change_timers_running6 = 1; + } + /* + * Retransmit the previously computed state-change + * report. If there are no further pending + * retransmissions, the mbuf queue will be consumed. + * Update T0 state to T1 as we have now sent + * a state-change. + */ + (void) mld_v2_merge_state_changes(inm, scq); + + in6m_commit(inm); + MLD_PRINTF(("%s: T1 -> T0 for %s/%s%d\n", __func__, + ip6_sprintf(&inm->in6m_addr), + inm->in6m_ifp->if_name, inm->in6m_ifp->if_unit)); + + /* + * If we are leaving the group for good, make sure + * we release MLD's reference to it. + * This release must be deferred using a SLIST, + * as we are called from a loop which traverses + * the in_ifmultiaddr TAILQ. + */ + if (inm->in6m_state == MLD_LEAVING_MEMBER && + inm->in6m_scrv == 0) { + inm->in6m_state = MLD_NOT_MEMBER; + /* + * A reference has already been held in + * mld_final_leave() for this inm, so + * no need to hold another one. We also + * bumped up its request count then, so + * that it stays in in6_multihead. Both + * of them will be released when it is + * dequeued later on. + */ + VERIFY(inm->in6m_nrelecnt != 0); + SLIST_INSERT_HEAD(&mli->mli_relinmhead, + inm, in6m_nrele); + } + } + break; + } +} + +/* + * Switch to a different version on the given interface, + * as per Section 9.12. 
+ */
+static void
+mld_set_version(struct mld_ifinfo *mli, const int mld_version)
+{
+	int old_version_timer;
+
+	MLI_LOCK_ASSERT_HELD(mli);	/* caller must already hold the mli lock */
+
+	MLD_PRINTF(("%s: switching to v%d on ifp %p(%s%d)\n", __func__,
+	    mld_version, mli->mli_ifp, mli->mli_ifp->if_name,
+	    mli->mli_ifp->if_unit));
+
+	if (mld_version == MLD_VERSION_1) {
+		/*
+		 * Compute the "Older Version Querier Present" timer as per
+		 * Section 9.12: (mli_rv * mli_qi) + mli_qri, scaled by
+		 * PR_SLOWHZ.  The timer is re-armed on every call that
+		 * requests MLDv1, even if the link is already in v1 mode.
+		 */
+		old_version_timer = (mli->mli_rv * mli->mli_qi) + mli->mli_qri;
+		old_version_timer *= PR_SLOWHZ;
+		mli->mli_v1_timer = old_version_timer;
+	}
+
+	if (mli->mli_v1_timer > 0 && mli->mli_version != MLD_VERSION_1) {
+		mli->mli_version = MLD_VERSION_1;
+		mld_v2_cancel_link_timers(mli);	/* may drop and re-take the mli lock */
+	}
+
+	MLI_LOCK_ASSERT_HELD(mli);	/* lock is held again on return */
+}
+
+/*
+ * Cancel pending MLDv2 timers for the given link and all groups
+ * joined on it; state-change, general-query, and group-query timers.
+ *
+ * Called with the mli lock held, and returns with it held.  If none of
+ * the global timer flags is set the function returns at once without
+ * touching any state.  Otherwise the mli lock is dropped while the
+ * in6_multi list is walked under the shared in6_multihead lock, and is
+ * re-acquired just before returning.  Every group on this interface that
+ * is in the LEAVING, G/SG_QUERY_PENDING or REPORTING state ends up in
+ * MLD_REPORTING_MEMBER with both timers cleared and its state-change
+ * queue drained.
+ */
+static void
+mld_v2_cancel_link_timers(struct mld_ifinfo *mli)
+{
+	struct ifnet *ifp;
+	struct in6_multi *inm;
+	struct in6_multistep step;
+
+	MLI_LOCK_ASSERT_HELD(mli);
+
+	MLD_PRINTF(("%s: cancel v2 timers on ifp %p(%s%d)\n", __func__,
+	    mli->mli_ifp, mli->mli_ifp->if_name, mli->mli_ifp->if_unit));
+
+	/*
+	 * Fast-track this potentially expensive operation
+	 * by checking all the global 'timer pending' flags.
+	 */
+	if (!interface_timers_running6 &&
+	    !state_change_timers_running6 &&
+	    !current_state_timers_running6)
+		return;
+
+	mli->mli_v2_timer = 0;
+	ifp = mli->mli_ifp;	/* snapshot taken before the lock is dropped */
+	MLI_UNLOCK(mli);
+
+	in6_multihead_lock_shared();
+	IN6_FIRST_MULTI(step, inm);
+	while (inm != NULL) {
+		IN6M_LOCK(inm);
+		if (inm->in6m_ifp != ifp)
+			goto next;	/* group belongs to another interface */
+
+		switch (inm->in6m_state) {
+		case MLD_NOT_MEMBER:
+		case MLD_SILENT_MEMBER:
+		case MLD_IDLE_MEMBER:
+		case MLD_LAZY_MEMBER:
+		case MLD_SLEEPING_MEMBER:
+		case MLD_AWAKENING_MEMBER:
+			break;
+		case MLD_LEAVING_MEMBER:
+			/*
+			 * If we are leaving the group and switching
+			 * version, we need to release the final
+			 * reference held for issuing the INCLUDE {}.
+			 * During mld_final_leave(), we bumped up both the
+			 * request and reference counts. Since we cannot
+			 * call in6_multi_detach() here, defer this task to
+			 * the timer routine.
+			 */
+			VERIFY(inm->in6m_nrelecnt != 0);
+			MLI_LOCK(mli);	/* mli lock protects mli_relinmhead */
+			SLIST_INSERT_HEAD(&mli->mli_relinmhead, inm,
+			    in6m_nrele);
+			MLI_UNLOCK(mli);
+			/* FALLTHROUGH */
+		case MLD_G_QUERY_PENDING_MEMBER:
+		case MLD_SG_QUERY_PENDING_MEMBER:
+			in6m_clear_recorded(inm);
+			/* FALLTHROUGH */
+		case MLD_REPORTING_MEMBER:
+			inm->in6m_sctimer = 0;
+			inm->in6m_timer = 0;
+			inm->in6m_state = MLD_REPORTING_MEMBER;
+			/*
+			 * Free any pending MLDv2 state-change records.
+			 */
+			IF_DRAIN(&inm->in6m_scq);
+			break;
+		}
+next:
+		IN6M_UNLOCK(inm);
+		IN6_NEXT_MULTI(step, inm);
+	}
+	in6_multihead_lock_done();
+
+	MLI_LOCK(mli);	/* restore the caller's locking state */
+}
+
+/*
+ * Update the Older Version Querier Present timers for a link.
+ * See Section 9.12 of RFC 3810.
+ *
+ * Called with the mli lock held.  While the link is not in MLDv2 mode,
+ * each call decrements mli_v1_timer; when the decrement reaches zero the
+ * link's mli_version is switched back to MLD_VERSION_2.
+ */
+static void
+mld_v1_process_querier_timers(struct mld_ifinfo *mli)
+{
+	MLI_LOCK_ASSERT_HELD(mli);
+
+	if (mli->mli_version != MLD_VERSION_2 && --mli->mli_v1_timer == 0) {
+		/*
+		 * MLDv1 Querier Present timer expired; revert to MLDv2.
+		 */
+		MLD_PRINTF(("%s: transition from v%d -> v%d on %p(%s%d)\n",
+		    __func__, mli->mli_version, MLD_VERSION_2,
+		    mli->mli_ifp, mli->mli_ifp->if_name, mli->mli_ifp->if_unit));
+		mli->mli_version = MLD_VERSION_2;
+	}
+}
+
+/*
+ * Transmit an MLDv1 report immediately.
+ *
+ * Despite the name, the packet is not sent from this function: it is
+ * built here and appended to the link's mli_v1q queue (see the comment
+ * above the enqueue below).
+ *
+ * Called with both the in6_multi lock and the mli lock held.  'type' is
+ * stored verbatim in the MLD header's mld_type field.  The packet is a
+ * two-mbuf chain: an IPv6 header followed by the MLD header.  The source
+ * address is the address returned by in6ifa_ifpforlinklocal(), or
+ * in6addr_any when that returns NULL.
+ *
+ * Returns 0 on success, or ENOMEM if an mbuf cannot be allocated or if
+ * mli_v1q is full (in which case the packet is freed).
+ */
+static int
+mld_v1_transmit_report(struct in6_multi *in6m, const int type)
+{
+	struct ifnet *ifp;
+	struct in6_ifaddr *ia;
+	struct ip6_hdr *ip6;
+	struct mbuf *mh, *md;
+	struct mld_hdr *mld;
+	int error = 0;
+
+	IN6M_LOCK_ASSERT_HELD(in6m);
+	MLI_LOCK_ASSERT_HELD(in6m->in6m_mli);
+
+	ifp = in6m->in6m_ifp;
+	/* ia may be NULL if link-local address is tentative. */
+	ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST);
+
+	MGETHDR(mh, M_DONTWAIT, MT_HEADER);
+	if (mh == NULL) {
+		if (ia != NULL)
+			IFA_REMREF(&ia->ia_ifa);	/* drop the address reference on every exit path */
+		return (ENOMEM);
+	}
+	MGET(md, M_DONTWAIT, MT_DATA);
+	if (md == NULL) {
+		m_free(mh);
+		if (ia != NULL)
+			IFA_REMREF(&ia->ia_ifa);
+		return (ENOMEM);
+	}
+	mh->m_next = md;
+
+	/*
+	 * FUTURE: Consider increasing alignment by ETHER_HDR_LEN, so
+	 * that ether_output() does not need to allocate another mbuf
+	 * for the header in the most common case.
+	 */
+	MH_ALIGN(mh, sizeof(struct ip6_hdr));
+	mh->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mld_hdr);
+	mh->m_len = sizeof(struct ip6_hdr);
+
+	ip6 = mtod(mh, struct ip6_hdr *);
+	ip6->ip6_flow = 0;
+	ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
+	ip6->ip6_vfc |= IPV6_VERSION;
+	ip6->ip6_nxt = IPPROTO_ICMPV6;
+	if (ia != NULL)
+		IFA_LOCK(&ia->ia_ifa);	/* address is copied under the ifaddr lock */
+	ip6->ip6_src = ia ? ia->ia_addr.sin6_addr : in6addr_any;
+	if (ia != NULL) {
+		IFA_UNLOCK(&ia->ia_ifa);
+		IFA_REMREF(&ia->ia_ifa);
+		ia = NULL;
+	}
+	ip6->ip6_dst = in6m->in6m_addr;
+
+	md->m_len = sizeof(struct mld_hdr);
+	mld = mtod(md, struct mld_hdr *);
+	mld->mld_type = type;
+	mld->mld_code = 0;
+	mld->mld_cksum = 0;	/* zeroed before the checksum is computed below */
+	mld->mld_maxdelay = 0;
+	mld->mld_reserved = 0;
+	mld->mld_addr = in6m->in6m_addr;
+	in6_clearscope(&mld->mld_addr);
+	mld->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6,
+	    sizeof(struct ip6_hdr), sizeof(struct mld_hdr));
+
+	mh->m_flags |= M_MLDV1;
+
+
+	/*
+	 * Due to the fact that at this point we are possibly holding
+	 * in6_multihead_lock in shared or exclusive mode, we can't call
+	 * mld_dispatch_packet() here since that will eventually call
+	 * ip6_output(), which will try to lock in6_multihead_lock and cause
+	 * a deadlock.
+	 * Instead we defer the work to the mld_slowtimo() thread, thus
+	 * avoiding unlocking in6_multihead_lock here.
+	 */
+	if (IF_QFULL(&in6m->in6m_mli->mli_v1q)) {
+		MLD_PRINTF(("%s: v1 outbound queue full\n", __func__));
+		error = ENOMEM;
+		m_freem(mh);
+	} else
+		IF_ENQUEUE(&in6m->in6m_mli->mli_v1q, mh);
+
+	return (error);
+}
+
+/*
+ * Process a state change from the upper layer for the given IPv6 group.
+ *
+ * Each socket holds a reference on the in6_multi in its own ip_moptions.
+ * The socket layer will have made the necessary updates to.the group
+ * state, it is now up to MLD to issue a state change report if there
+ * has been any change between T0 (when the last state-change was issued)
+ * and T1 (now).
+ *
+ * We use the MLDv2 state machine at group level. The MLd module
+ * however makes the decision as to which MLD protocol version to speak.
+ * A state change *from* INCLUDE {} always means an initial join.
+ * A state change *to* INCLUDE {} always means a final leave.
+ * + * If delay is non-zero, and the state change is an initial multicast + * join, the state change report will be delayed by 'delay' ticks + * in units of PR_FASTHZ if MLDv1 is active on the link; otherwise + * the initial MLDv2 state change report will be delayed by whichever + * is sooner, a pending state-change timer or delay itself. + */ +int +mld_change_state(struct in6_multi *inm, const int delay) +{ + struct mld_ifinfo *mli; + struct ifnet *ifp; + int error = 0; + + IN6M_LOCK_ASSERT_HELD(inm); + VERIFY(inm->in6m_mli != NULL); + MLI_LOCK_ASSERT_NOTHELD(inm->in6m_mli); + + /* + * Try to detect if the upper layer just asked us to change state + * for an interface which has now gone away. + */ + VERIFY(inm->in6m_ifma != NULL); + ifp = inm->in6m_ifma->ifma_ifp; + /* + * Sanity check that netinet6's notion of ifp is the same as net's. + */ + VERIFY(inm->in6m_ifp == ifp); + + mli = MLD_IFINFO(ifp); + VERIFY(mli != NULL); + + /* + * If we detect a state transition to or from MCAST_UNDEFINED + * for this group, then we are starting or finishing an MLD + * life cycle for this group. + */ + if (inm->in6m_st[1].iss_fmode != inm->in6m_st[0].iss_fmode) { + MLD_PRINTF(("%s: inm transition %d -> %d\n", __func__, + inm->in6m_st[0].iss_fmode, inm->in6m_st[1].iss_fmode)); + if (inm->in6m_st[0].iss_fmode == MCAST_UNDEFINED) { + MLD_PRINTF(("%s: initial join\n", __func__)); + error = mld_initial_join(inm, mli, delay); + goto out; + } else if (inm->in6m_st[1].iss_fmode == MCAST_UNDEFINED) { + MLD_PRINTF(("%s: final leave\n", __func__)); + mld_final_leave(inm, mli); + goto out; + } + } else { + MLD_PRINTF(("%s: filter set change\n", __func__)); + } + + error = mld_handle_state_change(inm, mli); + +out: + return (error); +} + +/* + * Perform the initial join for an MLD group. + * + * When joining a group: + * If the group should have its MLD traffic suppressed, do nothing. + * MLDv1 starts sending MLDv1 host membership reports. 
+ * MLDv2 will schedule an MLDv2 state-change report containing the
+ * initial state of the membership.
+ *
+ * If the delay argument is non-zero, then we must delay sending the
+ * initial state change for delay ticks (in units of PR_FASTHZ).
+ */
+static int
+mld_initial_join(struct in6_multi *inm, struct mld_ifinfo *mli,
+    const int delay)
+{
+	struct ifnet *ifp;
+	struct ifqueue *ifq;
+	int error, retval, syncstates;
+	int odelay;
+
+	IN6M_LOCK_ASSERT_HELD(inm);
+	MLI_LOCK_ASSERT_NOTHELD(mli);	/* the mli lock is taken and released below */
+
+	MLD_PRINTF(("%s: initial join %s on ifp %p(%s%d)\n",
+	    __func__, ip6_sprintf(&inm->in6m_addr),
+	    inm->in6m_ifp, inm->in6m_ifp->if_name, inm->in6m_ifp->if_unit));
+
+	error = 0;
+	syncstates = 1;	/* commit T1 -> T0 on exit unless the MLDv2 path clears this */
+
+	ifp = inm->in6m_ifp;
+
+	MLI_LOCK(mli);
+	VERIFY(mli->mli_ifp == ifp);
+
+	/*
+	 * Groups joined on loopback or marked as 'not reported',
+	 * enter the MLD_SILENT_MEMBER state and
+	 * are never reported in any protocol exchanges.
+	 * All other groups enter the appropriate state machine
+	 * for the version in use on this link.
+	 * A link marked as MLIF_SILENT causes MLD to be completely
+	 * disabled for the link.
+	 */
+	if ((ifp->if_flags & IFF_LOOPBACK) ||
+	    (mli->mli_flags & MLIF_SILENT) ||
+	    !mld_is_addr_reported(&inm->in6m_addr)) {
+		MLD_PRINTF(("%s: not kicking state machine for silent group\n",
+		    __func__));
+		inm->in6m_state = MLD_SILENT_MEMBER;
+		inm->in6m_timer = 0;
+	} else {
+		/*
+		 * Deal with overlapping in6_multi lifecycle.
+		 * If this group was LEAVING, then make sure
+		 * we drop the reference we picked up to keep the
+		 * group around for the final INCLUDE {} enqueue.
+		 * Since we cannot call in6_multi_detach() here,
+		 * defer this task to the timer routine.
+		 */
+		if (mli->mli_version == MLD_VERSION_2 &&
+		    inm->in6m_state == MLD_LEAVING_MEMBER) {
+			VERIFY(inm->in6m_nrelecnt != 0);
+			SLIST_INSERT_HEAD(&mli->mli_relinmhead, inm,
+			    in6m_nrele);
+		}
+
+		inm->in6m_state = MLD_REPORTING_MEMBER;
+
+		switch (mli->mli_version) {
+		case MLD_VERSION_1:
+			/*
+			 * If a delay was provided, only use it if
+			 * it is greater than the delay normally
+			 * used for an MLDv1 state change report,
+			 * and delay sending the initial MLDv1 report
+			 * by not transitioning to the IDLE state.
+			 */
+			odelay = MLD_RANDOM_DELAY(MLD_V1_MAX_RI * PR_SLOWHZ);
+			if (delay) {
+				inm->in6m_timer = max(delay, odelay);
+				current_state_timers_running6 = 1;
+			} else {
+				inm->in6m_state = MLD_IDLE_MEMBER;
+				error = mld_v1_transmit_report(inm,
+				    MLD_LISTENER_REPORT);
+
+				IN6M_LOCK_ASSERT_HELD(inm);
+				MLI_LOCK_ASSERT_HELD(mli);
+
+				if (error == 0) {	/* timer is armed only if the report was queued */
+					inm->in6m_timer = odelay;
+					current_state_timers_running6 = 1;
+				}
+			}
+			break;
+
+		case MLD_VERSION_2:
+			/*
+			 * Defer update of T0 to T1, until the first copy
+			 * of the state change has been transmitted.
+			 */
+			syncstates = 0;
+
+			/*
+			 * Immediately enqueue a State-Change Report for
+			 * this interface, freeing any previous reports.
+			 * Don't kick the timers if there is nothing to do,
+			 * or if an error occurred.
+			 */
+			ifq = &inm->in6m_scq;
+			IF_DRAIN(ifq);
+			retval = mld_v2_enqueue_group_record(ifq, inm, 1,
+			    0, 0, (mli->mli_flags & MLIF_USEALLOW));
+			MLD_PRINTF(("%s: enqueue record = %d\n",
+			    __func__, retval));
+			if (retval <= 0) {
+				error = retval * -1;	/* negative errno from the callee becomes a positive errno */
+				break;
+			}
+
+			/*
+			 * Schedule transmission of pending state-change
+			 * report up to RV times for this link. The timer
+			 * will fire at the next mld_fasttimo (~200ms),
+			 * giving us an opportunity to merge the reports.
+			 *
+			 * If a delay was provided to this function, only
+			 * use this delay if sooner than the existing one.
+			 */
+			VERIFY(mli->mli_rv > 1);
+			inm->in6m_scrv = mli->mli_rv;
+			if (delay) {
+				if (inm->in6m_sctimer > 1) {
+					inm->in6m_sctimer =
+					    min(inm->in6m_sctimer, delay);
+				} else
+					inm->in6m_sctimer = delay;
+			} else
+				inm->in6m_sctimer = 1;
+			state_change_timers_running6 = 1;
+
+			error = 0;
+			break;
+		}
+	}
+	MLI_UNLOCK(mli);
+
+	/*
+	 * Only update the T0 state if state change is atomic,
+	 * i.e. we don't need to wait for a timer to fire before we
+	 * can consider the state change to have been communicated.
+	 */
+	if (syncstates) {
+		in6m_commit(inm);
+		MLD_PRINTF(("%s: T1 -> T0 for %s/%s%d\n", __func__,
+		    ip6_sprintf(&inm->in6m_addr),
+		    inm->in6m_ifp->if_name, ifp->if_unit));
+	}
+
+	return (error);
+}
+
+/*
+ * Issue an intermediate state change during the life-cycle.
+ *
+ * Called with the in6_multi lock held and the mli lock not held; the
+ * mli lock is taken and released inside.
+ *
+ * On loopback interfaces, links flagged MLIF_SILENT, addresses for which
+ * mld_is_addr_reported() is false, or links not running MLDv2, nothing
+ * is sent: the pending state is committed (T1 -> T0) and 0 is returned.
+ * Otherwise the group's state-change queue is drained and refilled by
+ * mld_v2_enqueue_group_record().  If that enqueues something, the
+ * retransmit count and state-change timer are armed and 0 is returned;
+ * if it returns zero or a negative errno, the timer is not armed and
+ * the negated value (0 or a positive errno) is returned.
+ */
+static int
+mld_handle_state_change(struct in6_multi *inm, struct mld_ifinfo *mli)
+{
+	struct ifnet *ifp;
+	int retval;
+
+	IN6M_LOCK_ASSERT_HELD(inm);
+	MLI_LOCK_ASSERT_NOTHELD(mli);
+
+	MLD_PRINTF(("%s: state change for %s on ifp %p(%s%d)\n",
+	    __func__, ip6_sprintf(&inm->in6m_addr),
+	    inm->in6m_ifp, inm->in6m_ifp->if_name, inm->in6m_ifp->if_unit));
+
+	ifp = inm->in6m_ifp;
+
+	MLI_LOCK(mli);
+	VERIFY(mli->mli_ifp == ifp);
+
+	if ((ifp->if_flags & IFF_LOOPBACK) ||
+	    (mli->mli_flags & MLIF_SILENT) ||
+	    !mld_is_addr_reported(&inm->in6m_addr) ||
+	    (mli->mli_version != MLD_VERSION_2)) {
+		MLI_UNLOCK(mli);
+		if (!mld_is_addr_reported(&inm->in6m_addr)) {
+			MLD_PRINTF(("%s: not kicking state machine for silent "
+			    "group\n", __func__));
+		}
+		MLD_PRINTF(("%s: nothing to do\n", __func__));
+		in6m_commit(inm);
+		MLD_PRINTF(("%s: T1 -> T0 for %s/%s%d\n", __func__,
+		    ip6_sprintf(&inm->in6m_addr),
+		    inm->in6m_ifp->if_name, inm->in6m_ifp->if_unit));
+		return (0);
+	}
+
+	IF_DRAIN(&inm->in6m_scq);	/* discard any previously queued state-change reports */
+
+	retval = mld_v2_enqueue_group_record(&inm->in6m_scq, inm, 1, 0, 0,
+	    (mli->mli_flags & MLIF_USEALLOW));
+	MLD_PRINTF(("%s: enqueue record = %d\n", __func__, retval));
+	if (retval <= 0) {
+		MLI_UNLOCK(mli);
+		return (-retval);
+	}
+	/*
+	 * If record(s) were enqueued, start the state-change
+	 * report timer for this group.
+	 */
+	inm->in6m_scrv = mli->mli_rv;
+	inm->in6m_sctimer = 1;
+	state_change_timers_running6 = 1;
+	MLI_UNLOCK(mli);
+
+	return (0);
+}
+
+/*
+ * Perform the final leave for a multicast address.
+ *
+ * When leaving a group:
+ *  MLDv1 sends a DONE message, if and only if we are the reporter.
+ *  MLDv2 enqueues a state-change report containing a transition
+ *  to INCLUDE {} for immediate transmission.
+ *
+ * Called with the in6_multi lock held and the mli lock not held; the
+ * mli lock is taken and released inside for the states that need it.
+ *
+ * On the MLDv2 path with a non-zero retransmit count the group takes an
+ * extra reference and bumps in6m_reqcnt and in6m_nrelecnt, moves to
+ * MLD_LEAVING_MEMBER, and the T1 -> T0 commit is skipped here.  In every
+ * other case the state is committed and T1's filter mode is set to
+ * MCAST_UNDEFINED before returning.
+ *
+ * NOTE(review): the result of mld_v1_transmit_report() is not checked
+ * on the MLDv1 path; the group becomes MLD_NOT_MEMBER regardless.
+ */
+static void
+mld_final_leave(struct in6_multi *inm, struct mld_ifinfo *mli)
+{
+	int syncstates = 1;
+
+	IN6M_LOCK_ASSERT_HELD(inm);
+	MLI_LOCK_ASSERT_NOTHELD(mli);
+
+	MLD_PRINTF(("%s: final leave %s on ifp %p(%s%d)\n",
+	    __func__, ip6_sprintf(&inm->in6m_addr),
+	    inm->in6m_ifp, inm->in6m_ifp->if_name, inm->in6m_ifp->if_unit));
+
+	switch (inm->in6m_state) {
+	case MLD_NOT_MEMBER:
+	case MLD_SILENT_MEMBER:
+	case MLD_LEAVING_MEMBER:
+		/* Already leaving or left; do nothing. */
+		MLD_PRINTF(("%s: not kicking state machine for silent group\n",
+		    __func__));
+		break;
+	case MLD_REPORTING_MEMBER:
+	case MLD_IDLE_MEMBER:
+	case MLD_G_QUERY_PENDING_MEMBER:
+	case MLD_SG_QUERY_PENDING_MEMBER:
+		MLI_LOCK(mli);
+		if (mli->mli_version == MLD_VERSION_1) {
+			if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER ||
+			    inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER) {
+				panic("%s: MLDv2 state reached, not MLDv2 "
+				    "mode\n", __func__);
+				/* NOTREACHED */
+			}
+			mld_v1_transmit_report(inm, MLD_LISTENER_DONE);
+
+			IN6M_LOCK_ASSERT_HELD(inm);
+			MLI_LOCK_ASSERT_HELD(mli);
+
+			inm->in6m_state = MLD_NOT_MEMBER;
+		} else if (mli->mli_version == MLD_VERSION_2) {
+			/*
+			 * Stop group timer and all pending reports.
+			 * Immediately enqueue a state-change report
+			 * TO_IN {} to be sent on the next fast timeout,
+			 * giving us an opportunity to merge reports.
+			 */
+			IF_DRAIN(&inm->in6m_scq);
+			inm->in6m_timer = 0;
+			inm->in6m_scrv = mli->mli_rv;
+			MLD_PRINTF(("%s: Leaving %s/%s%d with %d "
+			    "pending retransmissions.\n", __func__,
+			    ip6_sprintf(&inm->in6m_addr),
+			    inm->in6m_ifp->if_name, inm->in6m_ifp->if_unit,
+			    inm->in6m_scrv));
+			if (inm->in6m_scrv == 0) {
+				inm->in6m_state = MLD_NOT_MEMBER;
+				inm->in6m_sctimer = 0;
+			} else {
+				int retval;
+				/*
+				 * Stick around in the in6_multihead list;
+				 * the final detach will be issued by
+				 * mld_v2_process_group_timers() when
+				 * the retransmit timer expires.
+				 */
+				IN6M_ADDREF_LOCKED(inm);
+				VERIFY(inm->in6m_debug & IFD_ATTACHED);
+				inm->in6m_reqcnt++;
+				VERIFY(inm->in6m_reqcnt >= 1);
+				inm->in6m_nrelecnt++;
+				VERIFY(inm->in6m_nrelecnt != 0);
+
+				retval = mld_v2_enqueue_group_record(
+				    &inm->in6m_scq, inm, 1, 0, 0,
+				    (mli->mli_flags & MLIF_USEALLOW));
+				KASSERT(retval != 0,
+				    ("%s: enqueue record = %d\n", __func__,
+				    retval));
+
+				inm->in6m_state = MLD_LEAVING_MEMBER;
+				inm->in6m_sctimer = 1;
+				state_change_timers_running6 = 1;
+				syncstates = 0;	/* commit is deferred while retransmissions are pending */
+			}
+		}
+		MLI_UNLOCK(mli);
+		break;
+	case MLD_LAZY_MEMBER:
+	case MLD_SLEEPING_MEMBER:
+	case MLD_AWAKENING_MEMBER:
+		/* Our reports are suppressed; do nothing. */
+		break;
+	}
+
+	if (syncstates) {
+		in6m_commit(inm);
+		MLD_PRINTF(("%s: T1 -> T0 for %s/%s%d\n", __func__,
+		    ip6_sprintf(&inm->in6m_addr),
+		    inm->in6m_ifp->if_name, inm->in6m_ifp->if_unit));
+		inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED;
+		MLD_PRINTF(("%s: T1 now MCAST_UNDEFINED for %p/%s%d\n",
+		    __func__, &inm->in6m_addr, inm->in6m_ifp->if_name,
+		    inm->in6m_ifp->if_unit));
+	}
+}
+
+/*
+ * Enqueue an MLDv2 group record to the given output queue.
+ *
+ * If is_state_change is zero, a current-state record is appended.
+ * If is_state_change is non-zero, a state-change report is appended.
+ *
+ * If is_group_query is non-zero, an mbuf packet chain is allocated.
+ * If is_group_query is zero, and if there is a packet with free space + * at the tail of the queue, it will be appended to providing there + * is enough free space. + * Otherwise a new mbuf packet chain is allocated. + * + * If is_source_query is non-zero, each source is checked to see if + * it was recorded for a Group-Source query, and will be omitted if + * it is not both in-mode and recorded. + * + * If use_block_allow is non-zero, state change reports for initial join + * and final leave, on an inclusive mode group with a source list, will be + * rewritten to use the ALLOW_NEW and BLOCK_OLD record types, respectively. + * + * The function will attempt to allocate leading space in the packet + * for the IPv6+ICMP headers to be prepended without fragmenting the chain. + * + * If successful the size of all data appended to the queue is returned, + * otherwise an error code less than zero is returned, or zero if + * no record(s) were appended. + */ +static int +mld_v2_enqueue_group_record(struct ifqueue *ifq, struct in6_multi *inm, + const int is_state_change, const int is_group_query, + const int is_source_query, const int use_block_allow) +{ + struct mldv2_record mr; + struct mldv2_record *pmr; + struct ifnet *ifp; + struct ip6_msource *ims, *nims; + struct mbuf *m0, *m, *md; + int error, is_filter_list_change; + int minrec0len, m0srcs, msrcs, nbytes, off; + int record_has_sources; + int now; + int type; + uint8_t mode; + + IN6M_LOCK_ASSERT_HELD(inm); + MLI_LOCK_ASSERT_HELD(inm->in6m_mli); + + error = 0; + ifp = inm->in6m_ifp; + is_filter_list_change = 0; + m = NULL; + m0 = NULL; + m0srcs = 0; + msrcs = 0; + nbytes = 0; + nims = NULL; + record_has_sources = 1; + pmr = NULL; + type = MLD_DO_NOTHING; + mode = inm->in6m_st[1].iss_fmode; + + /* + * If we did not transition out of ASM mode during t0->t1, + * and there are no source nodes to process, we can skip + * the generation of source records. 
+ */ + if (inm->in6m_st[0].iss_asm > 0 && inm->in6m_st[1].iss_asm > 0 && + inm->in6m_nsrc == 0) + record_has_sources = 0; + + if (is_state_change) { + /* + * Queue a state change record. + * If the mode did not change, and there are non-ASM + * listeners or source filters present, + * we potentially need to issue two records for the group. + * If there are ASM listeners, and there was no filter + * mode transition of any kind, do nothing. + * + * If we are transitioning to MCAST_UNDEFINED, we need + * not send any sources. A transition to/from this state is + * considered inclusive with some special treatment. + * + * If we are rewriting initial joins/leaves to use + * ALLOW/BLOCK, and the group's membership is inclusive, + * we need to send sources in all cases. + */ + if (mode != inm->in6m_st[0].iss_fmode) { + if (mode == MCAST_EXCLUDE) { + MLD_PRINTF(("%s: change to EXCLUDE\n", + __func__)); + type = MLD_CHANGE_TO_EXCLUDE_MODE; + } else { + MLD_PRINTF(("%s: change to INCLUDE\n", + __func__)); + if (use_block_allow) { + /* + * XXX + * Here we're interested in state + * edges either direction between + * MCAST_UNDEFINED and MCAST_INCLUDE. + * Perhaps we should just check + * the group state, rather than + * the filter mode. + */ + if (mode == MCAST_UNDEFINED) { + type = MLD_BLOCK_OLD_SOURCES; + } else { + type = MLD_ALLOW_NEW_SOURCES; + } + } else { + type = MLD_CHANGE_TO_INCLUDE_MODE; + if (mode == MCAST_UNDEFINED) + record_has_sources = 0; + } + } + } else { + if (record_has_sources) { + is_filter_list_change = 1; + } else { + type = MLD_DO_NOTHING; + } + } + } else { + /* + * Queue a current state record. + */ + if (mode == MCAST_EXCLUDE) { + type = MLD_MODE_IS_EXCLUDE; + } else if (mode == MCAST_INCLUDE) { + type = MLD_MODE_IS_INCLUDE; + VERIFY(inm->in6m_st[1].iss_asm == 0); + } + } + + /* + * Generate the filter list changes using a separate function. 
+ */ + if (is_filter_list_change) + return (mld_v2_enqueue_filter_change(ifq, inm)); + + if (type == MLD_DO_NOTHING) { + MLD_PRINTF(("%s: nothing to do for %s/%s%d\n", + __func__, ip6_sprintf(&inm->in6m_addr), + inm->in6m_ifp->if_name, inm->in6m_ifp->if_unit)); + return (0); + } + + /* + * If any sources are present, we must be able to fit at least + * one in the trailing space of the tail packet's mbuf, + * ideally more. + */ + minrec0len = sizeof(struct mldv2_record); + if (record_has_sources) + minrec0len += sizeof(struct in6_addr); + MLD_PRINTF(("%s: queueing %s for %s/%s%d\n", __func__, + mld_rec_type_to_str(type), + ip6_sprintf(&inm->in6m_addr), + inm->in6m_ifp->if_name, inm->in6m_ifp->if_unit)); + + /* + * Check if we have a packet in the tail of the queue for this + * group into which the first group record for this group will fit. + * Otherwise allocate a new packet. + * Always allocate leading space for IP6+RA+ICMPV6+REPORT. + * Note: Group records for G/GSR query responses MUST be sent + * in their own packet. + */ + m0 = ifq->ifq_tail; + if (!is_group_query && + m0 != NULL && + (m0->m_pkthdr.vt_nrecs + 1 <= MLD_V2_REPORT_MAXRECS) && + (m0->m_pkthdr.len + minrec0len) < + (ifp->if_mtu - MLD_MTUSPACE)) { + m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - + sizeof(struct mldv2_record)) / + sizeof(struct in6_addr); + m = m0; + MLD_PRINTF(("%s: use existing packet\n", __func__)); + } else { + if (IF_QFULL(ifq)) { + MLD_PRINTF(("%s: outbound queue full\n", __func__)); + return (-ENOMEM); + } + m = NULL; + m0srcs = (ifp->if_mtu - MLD_MTUSPACE - + sizeof(struct mldv2_record)) / sizeof(struct in6_addr); + if (!is_state_change && !is_group_query) + m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR); + if (m == NULL) + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m == NULL) + return (-ENOMEM); + + MLD_PRINTF(("%s: allocated first packet\n", __func__)); + } + + /* + * Append group record. + * If we have sources, we don't know how many yet. 
+ */ + mr.mr_type = type; + mr.mr_datalen = 0; + mr.mr_numsrc = 0; + mr.mr_addr = inm->in6m_addr; + in6_clearscope(&mr.mr_addr); + if (!m_append(m, sizeof(struct mldv2_record), (void *)&mr)) { + if (m != m0) + m_freem(m); + MLD_PRINTF(("%s: m_append() failed.\n", __func__)); + return (-ENOMEM); + } + nbytes += sizeof(struct mldv2_record); + + /* + * Append as many sources as will fit in the first packet. + * If we are appending to a new packet, the chain allocation + * may potentially use clusters; use m_getptr() in this case. + * If we are appending to an existing packet, we need to obtain + * a pointer to the group record after m_append(), in case a new + * mbuf was allocated. + * + * Only append sources which are in-mode at t1. If we are + * transitioning to MCAST_UNDEFINED state on the group, and + * use_block_allow is zero, do not include source entries. + * Otherwise, we need to include this source in the report. + * + * Only report recorded sources in our filter set when responding + * to a group-source query. 
+ */ + if (record_has_sources) { + if (m == m0) { + md = m_last(m); + pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + + md->m_len - nbytes); + } else { + md = m_getptr(m, 0, &off); + pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + + off); + } + msrcs = 0; + RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, + nims) { + MLD_PRINTF(("%s: visit node %s\n", __func__, + ip6_sprintf(&ims->im6s_addr))); + now = im6s_get_mode(inm, ims, 1); + MLD_PRINTF(("%s: node is %d\n", __func__, now)); + if ((now != mode) || + (now == mode && + (!use_block_allow && mode == MCAST_UNDEFINED))) { + MLD_PRINTF(("%s: skip node\n", __func__)); + continue; + } + if (is_source_query && ims->im6s_stp == 0) { + MLD_PRINTF(("%s: skip unrecorded node\n", + __func__)); + continue; + } + MLD_PRINTF(("%s: append node\n", __func__)); + if (!m_append(m, sizeof(struct in6_addr), + (void *)&ims->im6s_addr)) { + if (m != m0) + m_freem(m); + MLD_PRINTF(("%s: m_append() failed.\n", + __func__)); + return (-ENOMEM); + } + nbytes += sizeof(struct in6_addr); + ++msrcs; + if (msrcs == m0srcs) + break; + } + MLD_PRINTF(("%s: msrcs is %d this packet\n", __func__, + msrcs)); + pmr->mr_numsrc = htons(msrcs); + nbytes += (msrcs * sizeof(struct in6_addr)); + } + + if (is_source_query && msrcs == 0) { + MLD_PRINTF(("%s: no recorded sources to report\n", __func__)); + if (m != m0) + m_freem(m); + return (0); + } + + /* + * We are good to go with first packet. + */ + if (m != m0) { + MLD_PRINTF(("%s: enqueueing first packet\n", __func__)); + m->m_pkthdr.vt_nrecs = 1; + m->m_pkthdr.rcvif = ifp; + IF_ENQUEUE(ifq, m); + } else { + m->m_pkthdr.vt_nrecs++; + } + /* + * No further work needed if no source list in packet(s). + */ + if (!record_has_sources) + return (nbytes); + + /* + * Whilst sources remain to be announced, we need to allocate + * a new packet and fill out as many sources as will fit. + * Always try for a cluster first. 
+ */ + while (nims != NULL) { + if (IF_QFULL(ifq)) { + MLD_PRINTF(("%s: outbound queue full\n", __func__)); + return (-ENOMEM); + } + m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR); + if (m == NULL) + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m == NULL) + return (-ENOMEM); + md = m_getptr(m, 0, &off); + pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + off); + MLD_PRINTF(("%s: allocated next packet\n", __func__)); + + if (!m_append(m, sizeof(struct mldv2_record), (void *)&mr)) { + if (m != m0) + m_freem(m); + MLD_PRINTF(("%s: m_append() failed.\n", __func__)); + return (-ENOMEM); + } + m->m_pkthdr.vt_nrecs = 1; + nbytes += sizeof(struct mldv2_record); + + m0srcs = (ifp->if_mtu - MLD_MTUSPACE - + sizeof(struct mldv2_record)) / sizeof(struct in6_addr); + + msrcs = 0; + RB_FOREACH_FROM(ims, ip6_msource_tree, nims) { + MLD_PRINTF(("%s: visit node %s\n", + __func__, ip6_sprintf(&ims->im6s_addr))); + now = im6s_get_mode(inm, ims, 1); + if ((now != mode) || + (now == mode && + (!use_block_allow && mode == MCAST_UNDEFINED))) { + MLD_PRINTF(("%s: skip node\n", __func__)); + continue; + } + if (is_source_query && ims->im6s_stp == 0) { + MLD_PRINTF(("%s: skip unrecorded node\n", + __func__)); + continue; + } + MLD_PRINTF(("%s: append node\n", __func__)); + if (!m_append(m, sizeof(struct in6_addr), + (void *)&ims->im6s_addr)) { + if (m != m0) + m_freem(m); + MLD_PRINTF(("%s: m_append() failed.\n", + __func__)); + return (-ENOMEM); + } + ++msrcs; + if (msrcs == m0srcs) + break; + } + pmr->mr_numsrc = htons(msrcs); + nbytes += (msrcs * sizeof(struct in6_addr)); + + MLD_PRINTF(("%s: enqueueing next packet\n", __func__)); + m->m_pkthdr.rcvif = ifp; + IF_ENQUEUE(ifq, m); + } + + return (nbytes); +} + +/* + * Type used to mark record pass completion. + * We exploit the fact we can cast to this easily from the + * current filter modes on each ip_msource node. 
+ */
+typedef enum {
+	REC_NONE = 0x00,	/* MCAST_UNDEFINED; no record type selected */
+	REC_ALLOW = 0x01,	/* MCAST_INCLUDE; an ALLOW_NEW_SOURCES pass */
+	REC_BLOCK = 0x02,	/* MCAST_EXCLUDE; a BLOCK_OLD_SOURCES pass */
+	REC_FULL = REC_ALLOW | REC_BLOCK	/* mask of both record types */
+} rectype_t;
+
+/*
+ * Enqueue an MLDv2 filter list change to the given output queue.
+ *
+ * Source list filter state is held in an RB-tree. When the filter list
+ * for a group is changed without changing its mode, we need to compute
+ * the deltas between T0 and T1 for each source in the filter set,
+ * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records.
+ *
+ * As we may potentially queue two record types, and the entire R-B tree
+ * needs to be walked at once, we break this out into its own function
+ * so we can generate a tightly packed queue of packets.
+ *
+ * XXX This could be written to only use one tree walk, although that makes
+ * serializing into the mbuf chains a bit harder. For now we do two walks
+ * which makes things easier on us, and it may or may not be harder on
+ * the L2 cache.
+ *
+ * If successful the size of all data appended to the queue is returned,
+ * otherwise an error code less than zero is returned, or zero if
+ * no record(s) were appended.
+ */
+static int
+mld_v2_enqueue_filter_change(struct ifqueue *ifq, struct in6_multi *inm)
+{
+	static const int MINRECLEN =
+	    sizeof(struct mldv2_record) + sizeof(struct in6_addr);
+	struct ifnet *ifp;
+	struct mldv2_record mr;
+	struct mldv2_record *pmr;
+	struct ip6_msource *ims, *nims;
+	struct mbuf *m, *m0, *md;
+	int m0srcs, nbytes, npbytes, off, rsrcs, schanged;
+	int nallow, nblock;
+	uint8_t mode, now, then;
+	rectype_t crt, drt, nrt;
+
+	IN6M_LOCK_ASSERT_HELD(inm);
+
+	/* Nothing to report without sources, or while ASM at both t0 and t1. */
+	if (inm->in6m_nsrc == 0 ||
+	    (inm->in6m_st[0].iss_asm > 0 && inm->in6m_st[1].iss_asm > 0))
+		return (0);
+
+	ifp = inm->in6m_ifp;			/* interface */
+	mode = inm->in6m_st[1].iss_fmode;	/* filter mode at t1 */
+	crt = REC_NONE;	/* current group record type */
+	drt = REC_NONE;	/* mask of completed group record types */
+	nrt = REC_NONE;	/* record type for current node */
+	m0srcs = 0;	/* # source which will fit in current mbuf chain */
+	npbytes = 0;	/* # of bytes appended this packet */
+	nbytes = 0;	/* # of bytes appended to group's state-change queue */
+	rsrcs = 0;	/* # sources encoded in current record */
+	schanged = 0;	/* # nodes encoded in overall filter change */
+	nallow = 0;	/* # of source entries in ALLOW_NEW */
+	nblock = 0;	/* # of source entries in BLOCK_OLD */
+	nims = NULL;	/* next tree node pointer */
+
+	/*
+	 * For each possible filter record mode.
+	 * The first kind of source we encounter tells us which
+	 * is the first kind of record we start appending.
+	 * If a node transitioned to UNDEFINED at t1, its mode is treated
+	 * as the inverse of the group's filter mode.
+	 *
+	 * After each pass crt is OR-ed into drt and then complemented
+	 * within REC_FULL; the outer loop ends once drt equals REC_FULL.
+	 */
+	while (drt != REC_FULL) {
+		do {
+			m0 = ifq->ifq_tail;
+			if (m0 != NULL &&
+			    (m0->m_pkthdr.vt_nrecs + 1 <=
+			    MLD_V2_REPORT_MAXRECS) &&
+			    (m0->m_pkthdr.len + MINRECLEN) <
+			    (ifp->if_mtu - MLD_MTUSPACE)) {
+				m = m0;
+				m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
+				    sizeof(struct mldv2_record)) /
+				    sizeof(struct in6_addr);
+				MLD_PRINTF(("%s: use previous packet\n",
+				    __func__));
+			} else {
+				m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
+				if (m == NULL)
+					m = m_gethdr(M_DONTWAIT, MT_DATA);
+				if (m == NULL) {
+					MLD_PRINTF(("%s: m_get*() failed\n",
+					    __func__));
+					return (-ENOMEM);
+				}
+				m->m_pkthdr.vt_nrecs = 0;
+				m0srcs = (ifp->if_mtu - MLD_MTUSPACE -
+				    sizeof(struct mldv2_record)) /
+				    sizeof(struct in6_addr);
+				npbytes = 0;
+				MLD_PRINTF(("%s: allocated new packet\n",
+				    __func__));
+			}
+			/*
+			 * Append the MLD group record header to the
+			 * current packet's data area.
+			 * Recalculate pointer to free space for next
+			 * group record, in case m_append() allocated
+			 * a new mbuf or cluster.
+			 */
+			memset(&mr, 0, sizeof(mr));
+			mr.mr_addr = inm->in6m_addr;
+			in6_clearscope(&mr.mr_addr);
+			if (!m_append(m, sizeof(mr), (void *)&mr)) {
+				if (m != m0)
+					m_freem(m);
+				MLD_PRINTF(("%s: m_append() failed\n",
+				    __func__));
+				return (-ENOMEM);
+			}
+			npbytes += sizeof(struct mldv2_record);
+			if (m != m0) {
+				/* new packet; offset in chain */
+				md = m_getptr(m, npbytes -
+				    sizeof(struct mldv2_record), &off);
+				pmr = (struct mldv2_record *)(mtod(md,
+				    uint8_t *) + off);
+			} else {
+				/* current packet; offset from last append */
+				md = m_last(m);
+				pmr = (struct mldv2_record *)(mtod(md,
+				    uint8_t *) + md->m_len -
+				    sizeof(struct mldv2_record));
+			}
+			/*
+			 * Begin walking the tree for this record type
+			 * pass, or continue from where we left off
+			 * previously if we had to allocate a new packet.
+			 * Only report deltas in-mode at t1.
+			 * We need not report included sources as allowed
+			 * if we are in inclusive mode on the group,
+			 * however the converse is not true.
+			 */
+			rsrcs = 0;
+			if (nims == NULL) {
+				nims = RB_MIN(ip6_msource_tree,
+				    &inm->in6m_srcs);
+			}
+			RB_FOREACH_FROM(ims, ip6_msource_tree, nims) {
+				MLD_PRINTF(("%s: visit node %s\n", __func__,
+				    ip6_sprintf(&ims->im6s_addr)));
+				now = im6s_get_mode(inm, ims, 1);
+				then = im6s_get_mode(inm, ims, 0);
+				MLD_PRINTF(("%s: mode: t0 %d, t1 %d\n",
+				    __func__, then, now));
+				if (now == then) {
+					MLD_PRINTF(("%s: skip unchanged\n",
+					    __func__));
+					continue;
+				}
+				if (mode == MCAST_EXCLUDE &&
+				    now == MCAST_INCLUDE) {
+					MLD_PRINTF(("%s: skip IN src on EX "
+					    "group\n", __func__));
+					continue;
+				}
+				nrt = (rectype_t)now;
+				if (nrt == REC_NONE)
+					nrt = (rectype_t)(~mode & REC_FULL);
+				if (schanged++ == 0) {
+					crt = nrt;	/* first changed node picks the first pass type */
+				} else if (crt != nrt)
+					continue;	/* belongs to the other pass */
+				if (!m_append(m, sizeof(struct in6_addr),
+				    (void *)&ims->im6s_addr)) {
+					if (m != m0)
+						m_freem(m);
+					MLD_PRINTF(("%s: m_append() failed\n",
+					    __func__));
+					return (-ENOMEM);
+				}
+				nallow += !!(crt == REC_ALLOW);
+				nblock += !!(crt == REC_BLOCK);
+				if (++rsrcs == m0srcs)
+					break;
+			}
+			/*
+			 * If we did not append any tree nodes on this
+			 * pass, back out of allocations.
+			 */
+			if (rsrcs == 0) {
+				npbytes -= sizeof(struct mldv2_record);
+				if (m != m0) {
+					MLD_PRINTF(("%s: m_free(m)\n",
+					    __func__));
+					m_freem(m);
+				} else {
+					MLD_PRINTF(("%s: m_adj(m, -mr)\n",
+					    __func__));
+					m_adj(m, -((int)sizeof(
+					    struct mldv2_record)));
+				}
+				continue;
+			}
+			npbytes += (rsrcs * sizeof(struct in6_addr));
+			if (crt == REC_ALLOW)
+				pmr->mr_type = MLD_ALLOW_NEW_SOURCES;
+			else if (crt == REC_BLOCK)
+				pmr->mr_type = MLD_BLOCK_OLD_SOURCES;
+			pmr->mr_numsrc = htons(rsrcs);
+			/*
+			 * Count the new group record, and enqueue this
+			 * packet if it wasn't already queued.
+			 */
+			m->m_pkthdr.vt_nrecs++;
+			m->m_pkthdr.rcvif = ifp;
+			if (m != m0)
+				IF_ENQUEUE(ifq, m);
+			nbytes += npbytes;
+		} while (nims != NULL);
+		drt |= crt;
+		crt = (~crt & REC_FULL);
+	}
+
+	MLD_PRINTF(("%s: queued %d ALLOW_NEW, %d BLOCK_OLD\n", __func__,
+	    nallow, nblock));
+
+	return (nbytes);
+}
+/*
+ * Move or copy the pending state-change packets of a group
+ * (inm->in6m_scq) onto the queue 'ifscq'.
+ *
+ * Called with the in6_multi lock held.  If the group still has
+ * retransmissions pending (in6m_scrv > 0) each packet is duplicated
+ * with m_dup() and the original stays on the group queue; otherwise the
+ * packet is removed from the group queue and moved.  A packet is
+ * concatenated onto the tail packet of ifscq when the combined record
+ * count and length fit, and is enqueued as a separate packet otherwise.
+ *
+ * Returns 0, or ENOMEM if m_dup() fails.
+ *
+ * NOTE(review): the queue-full test below is made on the group queue
+ * (gq) although the packet is subsequently enqueued on ifscq - confirm
+ * that this is intended.
+ */
+static int
+mld_v2_merge_state_changes(struct in6_multi *inm, struct ifqueue *ifscq)
+{
+	struct ifqueue *gq;
+	struct mbuf *m;		/* pending state-change */
+	struct mbuf *m0;	/* copy of pending state-change */
+	struct mbuf *mt;	/* last state-change in packet */
+	struct mbuf *n;
+	int docopy, domerge;
+	u_int recslen;
+
+	IN6M_LOCK_ASSERT_HELD(inm);
+
+	docopy = 0;
+	domerge = 0;
+	recslen = 0;
+
+	/*
+	 * If there are further pending retransmissions, make a writable
+	 * copy of each queued state-change message before merging.
+	 */
+	if (inm->in6m_scrv > 0)
+		docopy = 1;
+
+	gq = &inm->in6m_scq;
+#ifdef MLD_DEBUG
+	if (gq->ifq_head == NULL) {
+		MLD_PRINTF(("%s: WARNING: queue for inm %p is empty\n",
+		    __func__, inm));
+	}
+#endif
+
+	/*
+	 * Use IF_REMQUEUE() instead of IF_DEQUEUE() below, since the
+	 * packet might not always be at the head of the ifqueue.
+	 */
+	m = gq->ifq_head;
+	while (m != NULL) {
+		/*
+		 * Only merge the report into the current packet if
+		 * there is sufficient space to do so; an MLDv2 report
+		 * packet may only contain 65,535 group records.
+		 * Always use a simple mbuf chain concatenation to do this,
+		 * as large state changes for single groups may have
+		 * allocated clusters.
+		 */
+		domerge = 0;
+		mt = ifscq->ifq_tail;
+		if (mt != NULL) {
+			recslen = m_length(m);
+
+			if ((mt->m_pkthdr.vt_nrecs +
+			    m->m_pkthdr.vt_nrecs <=
+			    MLD_V2_REPORT_MAXRECS) &&
+			    (mt->m_pkthdr.len + recslen <=
+			    (inm->in6m_ifp->if_mtu - MLD_MTUSPACE)))
+				domerge = 1;
+		}
+
+		if (!domerge && IF_QFULL(gq)) {
+			MLD_PRINTF(("%s: outbound queue full, skipping whole "
+			    "packet %p\n", __func__, m));
+			n = m->m_nextpkt;
+			if (!docopy) {
+				IF_REMQUEUE(gq, m);
+				m_freem(m);
+			}
+			m = n;
+			continue;
+		}
+
+		if (!docopy) {
+			MLD_PRINTF(("%s: dequeueing %p\n", __func__, m));
+			n = m->m_nextpkt;	/* saved before m is unlinked */
+			IF_REMQUEUE(gq, m);
+			m0 = m;
+			m = n;
+		} else {
+			MLD_PRINTF(("%s: copying %p\n", __func__, m));
+			m0 = m_dup(m, M_NOWAIT);
+			if (m0 == NULL)
+				return (ENOMEM);
+			m0->m_nextpkt = NULL;
+			m = m->m_nextpkt;
+		}
+
+		if (!domerge) {
+			MLD_PRINTF(("%s: queueing %p to ifscq %p)\n",
+			    __func__, m0, ifscq));
+			m0->m_pkthdr.rcvif = inm->in6m_ifp;
+			IF_ENQUEUE(ifscq, m0);
+		} else {
+			struct mbuf *mtl;	/* last mbuf of packet mt */
+
+			MLD_PRINTF(("%s: merging %p with ifscq tail %p)\n",
+			    __func__, m0, mt));
+
+			mtl = m_last(mt);
+			m0->m_flags &= ~M_PKTHDR;	/* m0 becomes a plain continuation of mt */
+			mt->m_pkthdr.len += recslen;
+			mt->m_pkthdr.vt_nrecs +=
+			    m0->m_pkthdr.vt_nrecs;
+
+			mtl->m_next = m0;
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Respond to a pending MLDv2 General Query.
+ */ +static void +mld_v2_dispatch_general_query(struct mld_ifinfo *mli) +{ + struct ifnet *ifp; + struct in6_multi *inm; + struct in6_multistep step; + int retval; + + MLI_LOCK_ASSERT_HELD(mli); + + VERIFY(mli->mli_version == MLD_VERSION_2); + + ifp = mli->mli_ifp; + MLI_UNLOCK(mli); + + in6_multihead_lock_shared(); + IN6_FIRST_MULTI(step, inm); + while (inm != NULL) { + IN6M_LOCK(inm); + if (inm->in6m_ifp != ifp) + goto next; + + switch (inm->in6m_state) { + case MLD_NOT_MEMBER: + case MLD_SILENT_MEMBER: + break; + case MLD_REPORTING_MEMBER: + case MLD_IDLE_MEMBER: + case MLD_LAZY_MEMBER: + case MLD_SLEEPING_MEMBER: + case MLD_AWAKENING_MEMBER: + inm->in6m_state = MLD_REPORTING_MEMBER; + MLI_LOCK(mli); + retval = mld_v2_enqueue_group_record(&mli->mli_gq, + inm, 0, 0, 0, 0); + MLI_UNLOCK(mli); + MLD_PRINTF(("%s: enqueue record = %d\n", + __func__, retval)); + break; + case MLD_G_QUERY_PENDING_MEMBER: + case MLD_SG_QUERY_PENDING_MEMBER: + case MLD_LEAVING_MEMBER: + break; + } +next: + IN6M_UNLOCK(inm); + IN6_NEXT_MULTI(step, inm); + } + in6_multihead_lock_done(); + + MLI_LOCK(mli); + mld_dispatch_queue(mli, &mli->mli_gq, MLD_MAX_RESPONSE_BURST); + MLI_LOCK_ASSERT_HELD(mli); + + /* + * Slew transmission of bursts over 500ms intervals. + */ + if (mli->mli_gq.ifq_head != NULL) { + mli->mli_v2_timer = 1 + MLD_RANDOM_DELAY( + MLD_RESPONSE_BURST_INTERVAL); + interface_timers_running6 = 1; + } +} + +/* + * Transmit the next pending message in the output queue. + * + * Must not be called with in6m_lock or mli_lock held. + */ +static void +mld_dispatch_packet(struct mbuf *m) +{ + struct ip6_moptions *im6o; + struct ifnet *ifp; + struct ifnet *oifp = NULL; /* filled in by ip6_output(); released below */ + struct mbuf *m0; + struct mbuf *md; + struct ip6_hdr *ip6; + struct mld_hdr *mld; + int error; + int off; + int type; + + MLD_PRINTF(("%s: transmit %p\n", __func__, m)); + + /* + * Check if the ifnet is still attached. 
+ */ + ifp = m->m_pkthdr.rcvif; + if (ifp == NULL || !ifnet_is_attached(ifp, 0)) { + MLD_PRINTF(("%s: dropped %p as ifindex %u went away.\n", + __func__, m, (u_int)if_index)); + m_freem(m); + ip6stat.ip6s_noroute++; + return; + } + + im6o = ip6_allocmoptions(M_WAITOK); + if (im6o == NULL) { + m_freem(m); + return; + } + + im6o->im6o_multicast_hlim = 1; +#if MROUTING + im6o->im6o_multicast_loop = (ip6_mrouter != NULL); +#else + im6o->im6o_multicast_loop = 0; +#endif + im6o->im6o_multicast_ifp = ifp; + + if (m->m_flags & M_MLDV1) { + m0 = m; + } else { + m0 = mld_v2_encap_report(ifp, m); + if (m0 == NULL) { + MLD_PRINTF(("%s: dropped %p\n", __func__, m)); + /* + * mld_v2_encap_report() has already freed our mbuf. + */ + IM6O_REMREF(im6o); + ip6stat.ip6s_odropped++; + return; + } + } + + m->m_flags &= ~(M_PROTOFLAGS); + m0->m_pkthdr.rcvif = lo_ifp; + + ip6 = mtod(m0, struct ip6_hdr *); +#if 0 + (void) in6_setscope(&ip6->ip6_dst, ifp, NULL); /* XXX LOR */ +#else + /* + * XXX XXX Break some KPI rules to prevent an LOR which would + * occur if we called in6_setscope() at transmission. + * See comments at top of file. + */ + MLD_EMBEDSCOPE(&ip6->ip6_dst, ifp->if_index); +#endif + + /* + * Retrieve the ICMPv6 type before handoff to ip6_output(), + * so we can bump the stats. 
+ */ + md = m_getptr(m0, sizeof(struct ip6_hdr), &off); + mld = (struct mld_hdr *)(mtod(md, uint8_t *) + off); + type = mld->mld_type; + + error = ip6_output(m0, &mld_po, NULL, IPV6_UNSPECSRC, im6o, + &oifp, NULL); + + IM6O_REMREF(im6o); + + if (error) { + MLD_PRINTF(("%s: ip6_output(%p) = %d\n", __func__, m0, error)); + if (oifp != NULL) + ifnet_release(oifp); + return; + } + + icmp6stat.icp6s_outhist[type]++; + if (oifp != NULL) { + icmp6_ifstat_inc(oifp, ifs6_out_msg); + switch (type) { + case MLD_LISTENER_REPORT: + case MLDV2_LISTENER_REPORT: + icmp6_ifstat_inc(oifp, ifs6_out_mldreport); + break; + case MLD_LISTENER_DONE: + icmp6_ifstat_inc(oifp, ifs6_out_mlddone); + break; + } + ifnet_release(oifp); + } +} + +/* + * Encapsulate an MLDv2 report. + * + * KAME IPv6 requires that hop-by-hop options be passed separately, + * and that the IPv6 header be prepended in a separate mbuf. + * + * Returns a pointer to the new mbuf chain head, or NULL if the + * allocation failed. + */ +static struct mbuf * +mld_v2_encap_report(struct ifnet *ifp, struct mbuf *m) +{ + struct mbuf *mh; + struct mldv2_report *mld; + struct ip6_hdr *ip6; + struct in6_ifaddr *ia; + int mldreclen; + + VERIFY(m->m_flags & M_PKTHDR); + + /* + * RFC3590: OK to send as :: or tentative during DAD. 
+ */ + ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); + if (ia == NULL) + MLD_PRINTF(("%s: warning: ia is NULL\n", __func__)); + + MGETHDR(mh, M_DONTWAIT, MT_HEADER); + if (mh == NULL) { + if (ia != NULL) + IFA_REMREF(&ia->ia_ifa); + m_freem(m); + return (NULL); + } + MH_ALIGN(mh, sizeof(struct ip6_hdr) + sizeof(struct mldv2_report)); + + mldreclen = m_length(m); + MLD_PRINTF(("%s: mldreclen is %d\n", __func__, mldreclen)); + + mh->m_len = sizeof(struct ip6_hdr) + sizeof(struct mldv2_report); + mh->m_pkthdr.len = sizeof(struct ip6_hdr) + + sizeof(struct mldv2_report) + mldreclen; + + ip6 = mtod(mh, struct ip6_hdr *); + ip6->ip6_flow = 0; + ip6->ip6_vfc &= ~IPV6_VERSION_MASK; + ip6->ip6_vfc |= IPV6_VERSION; + ip6->ip6_nxt = IPPROTO_ICMPV6; + if (ia != NULL) + IFA_LOCK(&ia->ia_ifa); + ip6->ip6_src = ia ? ia->ia_addr.sin6_addr : in6addr_any; + if (ia != NULL) { + IFA_UNLOCK(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); + ia = NULL; + } + ip6->ip6_dst = in6addr_linklocal_allv2routers; + /* scope ID is embedded later by mld_dispatch_packet() */ + + mld = (struct mldv2_report *)(ip6 + 1); + mld->mld_type = MLDV2_LISTENER_REPORT; + mld->mld_code = 0; + mld->mld_cksum = 0; + mld->mld_v2_reserved = 0; + mld->mld_v2_numrecs = htons(m->m_pkthdr.vt_nrecs); + m->m_pkthdr.vt_nrecs = 0; + m->m_flags &= ~M_PKTHDR; + + mh->m_next = m; + mld->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6, + sizeof(struct ip6_hdr), sizeof(struct mldv2_report) + mldreclen); + return (mh); +} + +#ifdef MLD_DEBUG +static const char * +mld_rec_type_to_str(const int type) +{ + switch (type) { + case MLD_CHANGE_TO_EXCLUDE_MODE: + return "TO_EX"; + break; + case MLD_CHANGE_TO_INCLUDE_MODE: + return "TO_IN"; + break; + case MLD_MODE_IS_EXCLUDE: + return "MODE_EX"; + break; + case MLD_MODE_IS_INCLUDE: + return "MODE_IN"; + break; + case MLD_ALLOW_NEW_SOURCES: + return "ALLOW_NEW"; + break; + case MLD_BLOCK_OLD_SOURCES: + return "BLOCK_OLD"; + break; + default: + break; + } + return "unknown"; +} +#endif + +void 
+mld_init(void) +{ + /* One-time MLD setup: global mutex, default output options, mld_ifinfo zone. */ + MLD_PRINTF(("%s: initializing\n", __func__)); + + /* Setup lock group and attribute for mld_mtx */ + mld_mtx_grp_attr = lck_grp_attr_alloc_init(); + mld_mtx_grp = lck_grp_alloc_init("mld_mtx", mld_mtx_grp_attr); /* group name: no trailing newline */ + mld_mtx_attr = lck_attr_alloc_init(); + lck_mtx_init(&mld_mtx, mld_mtx_grp, mld_mtx_attr); + + ip6_initpktopts(&mld_po); /* mld_po is the option block handed to ip6_output() */ + mld_po.ip6po_hlim = 1; + mld_po.ip6po_hbh = &mld_ra.hbh; + mld_po.ip6po_prefer_tempaddr = IP6PO_TEMPADDR_NOTPREFER; + mld_po.ip6po_flags = IP6PO_DONTFRAG; + LIST_INIT(&mli_head); + + mli_size = sizeof (struct mld_ifinfo); + mli_zone = zinit(mli_size, MLI_ZONE_MAX * mli_size, + 0, MLI_ZONE_NAME); + if (mli_zone == NULL) { + panic("%s: failed allocating %s", __func__, MLI_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(mli_zone, Z_EXPAND, TRUE); + zone_change(mli_zone, Z_CALLERACCT, FALSE); +} diff --git a/bsd/netinet6/mld6.h b/bsd/netinet6/mld6.h new file mode 100644 index 000000000..ceb41365c --- /dev/null +++ b/bsd/netinet6/mld6.h @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2010 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/*- + * Copyright (c) 2009 Bruce Simpson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NETINET6_MLD6_H_ +#define _NETINET6_MLD6_H_ + +/* + * Multicast Listener Discovery (MLD) definitions. + */ + +/* Minimum length of any MLD protocol message. */ +#define MLD_MINLEN sizeof(struct icmp6_hdr) + +/* + * MLD v2 query format. + * See <netinet/icmp6.h> for struct mld_hdr + * (MLDv1 query and host report format). + */ +struct mldv2_query { + struct icmp6_hdr mld_icmp6_hdr; /* ICMPv6 header */ + struct in6_addr mld_addr; /* address being queried */ + uint8_t mld_misc; /* reserved/suppress/robustness */ + uint8_t mld_qqi; /* querier's query interval */ + uint16_t mld_numsrc; /* number of sources */ + /* followed by 1..numsrc source addresses */ +} __attribute__((__packed__)); +#define MLD_V2_QUERY_MINLEN sizeof(struct mldv2_query) +#define MLD_MRC_EXP(x) ((ntohs((x)) >> 12) & 0x0007) +#define MLD_MRC_MANT(x) (ntohs((x)) & 0x0fff) +#define MLD_QQIC_EXP(x) (((x) >> 4) & 0x07) +#define MLD_QQIC_MANT(x) ((x) & 0x0f) +#define MLD_QRESV(x) (((x) >> 4) & 0x0f) +#define MLD_SFLAG(x) (((x) >> 3) & 0x01) +#define MLD_QRV(x) ((x) & 0x07) + +/* + * MLDv2 host membership report header. + * mld_type: MLDV2_LISTENER_REPORT + */ +struct mldv2_report { + struct icmp6_hdr mld_icmp6_hdr; + /* followed by 1..numgrps records */ +} __attribute__((__packed__)); +/* overlaid on struct icmp6_hdr. 
*/ +#define mld_numrecs mld_icmp6_hdr.icmp6_data16[1] + +struct mldv2_record { + uint8_t mr_type; /* record type */ + uint8_t mr_datalen; /* length of auxiliary data */ + uint16_t mr_numsrc; /* number of sources */ + struct in6_addr mr_addr; /* address being reported */ + /* followed by 1..numsrc source addresses */ +} __attribute__((__packed__)); +#define MLD_V2_REPORT_MAXRECS 65535 + +/* + * MLDv2 report modes. + */ +#define MLD_DO_NOTHING 0 /* don't send a record */ +#define MLD_MODE_IS_INCLUDE 1 /* MODE_IN */ +#define MLD_MODE_IS_EXCLUDE 2 /* MODE_EX */ +#define MLD_CHANGE_TO_INCLUDE_MODE 3 /* TO_IN */ +#define MLD_CHANGE_TO_EXCLUDE_MODE 4 /* TO_EX */ +#define MLD_ALLOW_NEW_SOURCES 5 /* ALLOW_NEW */ +#define MLD_BLOCK_OLD_SOURCES 6 /* BLOCK_OLD */ + +/* + * MLDv2 query types. + */ +#define MLD_V2_GENERAL_QUERY 1 +#define MLD_V2_GROUP_QUERY 2 +#define MLD_V2_GROUP_SOURCE_QUERY 3 + +/* + * Maximum report interval for MLDv1 host membership reports. + */ +#define MLD_V1_MAX_RI 10 + +/* + * MLD_TIMER_SCALE denotes that the MLD code field specifies + * time in milliseconds. + */ +#define MLD_TIMER_SCALE 1000 + +#endif /* _NETINET6_MLD6_H_ */ diff --git a/bsd/netinet6/mld6_var.h b/bsd/netinet6/mld6_var.h index bbeda1ff9..7652cdca9 100644 --- a/bsd/netinet6/mld6_var.h +++ b/bsd/netinet6/mld6_var.h @@ -1,3 +1,31 @@ +/* + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + /* $FreeBSD: src/sys/netinet6/mld6_var.h,v 1.1.2.1 2000/07/15 07:14:36 kris Exp $ */ /* $KAME: mld6_var.h,v 1.4 2000/03/25 07:23:54 sumikawa Exp $ */ @@ -34,21 +62,186 @@ #define _NETINET6_MLD6_VAR_H_ #include <sys/appleapiopts.h> -#ifdef KERNEL_PRIVATE +/* + * Multicast Listener Discovery (MLD) + * implementation-specific definitions. + */ + +#ifdef PRIVATE +/* + * Per-link MLD state. 
+ */ +#ifndef XNU_KERNEL_PRIVATE +struct mld_ifinfo { +#else +struct mld_ifinfo_u { +#endif /* XNU_KERNEL_PRIVATE */ + uint32_t mli_ifindex; /* interface this instance belongs to */ + uint32_t mli_version; /* MLDv1 Host Compatibility Mode */ + uint32_t mli_v1_timer; /* MLDv1 Querier Present timer (s) */ + uint32_t mli_v2_timer; /* MLDv2 General Query (interface) timer (s)*/ + uint32_t mli_flags; /* MLD per-interface flags */ + uint32_t mli_rv; /* MLDv2 Robustness Variable */ + uint32_t mli_qi; /* MLDv2 Query Interval (s) */ + uint32_t mli_qri; /* MLDv2 Query Response Interval (s) */ + uint32_t mli_uri; /* MLDv2 Unsolicited Report Interval (s) */ + uint32_t _pad; +}; + +#define MLIF_SILENT 0x00000001 /* Do not use MLD on this ifp */ +#define MLIF_USEALLOW 0x00000002 /* Use ALLOW/BLOCK for joins/leaves */ + +/* + * MLD version tag. + */ +#define MLD_VERSION_NONE 0 /* Invalid */ +#define MLD_VERSION_1 1 +#define MLD_VERSION_2 2 /* Default */ +#endif /* PRIVATE */ + +#ifdef XNU_KERNEL_PRIVATE +#include + +#define MLD_DEBUG 1 +#ifdef MLD_DEBUG +extern int mld_debug; +#define MLD_PRINTF(x) do { if (mld_debug) printf x; } while (0) +#else +#define MLD_PRINTF(x) +#endif + +#define MLD_RANDOM_DELAY(X) (random() % (X) + 1) +#define MLD_MAX_STATE_CHANGES 24 /* Max pending changes per group */ + +/* + * MLD per-group states. 
+ */ +#define MLD_NOT_MEMBER 0 /* Can garbage collect group */ +#define MLD_SILENT_MEMBER 1 /* Do not perform MLD for group */ +#define MLD_REPORTING_MEMBER 2 /* MLDv1 we are reporter */ +#define MLD_IDLE_MEMBER 3 /* MLDv1 we reported last */ +#define MLD_LAZY_MEMBER 4 /* MLDv1 other member reporting */ +#define MLD_SLEEPING_MEMBER 5 /* MLDv1 start query response */ +#define MLD_AWAKENING_MEMBER 6 /* MLDv1 group timer will start */ +#define MLD_G_QUERY_PENDING_MEMBER 7 /* MLDv2 group query pending */ +#define MLD_SG_QUERY_PENDING_MEMBER 8 /* MLDv2 source query pending */ +#define MLD_LEAVING_MEMBER 9 /* MLDv2 dying gasp (pending last */ + /* retransmission of INCLUDE {}) */ +/* + * MLDv2 protocol control variables. + */ +#define MLD_RV_INIT 2 /* Robustness Variable */ +#define MLD_RV_MIN 1 +#define MLD_RV_MAX 7 + +#define MLD_QI_INIT 125 /* Query Interval (s) */ +#define MLD_QI_MIN 1 +#define MLD_QI_MAX 255 -#define MLD6_RANDOM_DELAY(X) (random() % (X) + 1) +#define MLD_QRI_INIT 10 /* Query Response Interval (s) */ +#define MLD_QRI_MIN 1 +#define MLD_QRI_MAX 255 + +#define MLD_URI_INIT 3 /* Unsolicited Report Interval (s) */ +#define MLD_URI_MIN 0 +#define MLD_URI_MAX 10 + +#define MLD_MAX_GS_SOURCES 256 /* # of sources in rx GS query */ +#define MLD_MAX_G_GS_PACKETS 8 /* # of packets to answer G/GS */ +#define MLD_MAX_STATE_CHANGE_PACKETS 8 /* # of packets per state change */ +#define MLD_MAX_RESPONSE_PACKETS 16 /* # of packets for general query */ +#define MLD_MAX_RESPONSE_BURST 4 /* # of responses to send at once */ +#define MLD_RESPONSE_BURST_INTERVAL (PR_SLOWHZ) /* 500ms */ + +/* + * MLD-specific mbuf flags. + */ +#define M_MLDV1 M_PROTO1 /* Packet is MLDv1 */ +#define M_GROUPREC M_PROTO3 /* mbuf chain is a group record */ + +/* + * Leading space for MLDv2 reports inside MTU. + * + * NOTE: This differs from IGMPv3 significantly. 
KAME IPv6 requires + * that a fully formed mbuf chain *without* the Router Alert option + * is passed to ip6_output(), however we must account for it in the + * MTU if we need to split an MLDv2 report into several packets. + * + * We now put the MLDv2 report header in the initial mbuf containing + * the IPv6 header. + */ +#define MLD_MTUSPACE (sizeof(struct ip6_hdr) + sizeof(struct mld_raopt) + \ + sizeof(struct icmp6_hdr)) + +struct mld_ifinfo { + decl_lck_mtx_data(, mli_lock); + uint32_t mli_refcnt; /* reference count */ + uint32_t mli_debug; /* see ifa_debug flags */ + LIST_ENTRY(mld_ifinfo) mli_link; + struct ifnet *mli_ifp; /* interface this instance belongs to */ + uint32_t mli_version; /* MLDv1 Host Compatibility Mode */ + uint32_t mli_v1_timer; /* MLDv1 Querier Present timer (s) */ + uint32_t mli_v2_timer; /* MLDv2 General Query (interface) timer (s)*/ + uint32_t mli_flags; /* MLD per-interface flags */ + uint32_t mli_rv; /* MLDv2 Robustness Variable */ + uint32_t mli_qi; /* MLDv2 Query Interval (s) */ + uint32_t mli_qri; /* MLDv2 Query Response Interval (s) */ + uint32_t mli_uri; /* MLDv2 Unsolicited Report Interval (s) */ + SLIST_HEAD(,in6_multi) mli_relinmhead; /* released groups */ + struct ifqueue mli_gq; /* queue of general query responses */ + struct ifqueue mli_v1q; /* MLDv1 message queue */ +}; + +#define MLI_LOCK_ASSERT_HELD(_mli) \ + lck_mtx_assert(&(_mli)->mli_lock, LCK_MTX_ASSERT_OWNED) + +#define MLI_LOCK_ASSERT_NOTHELD(_mli) \ + lck_mtx_assert(&(_mli)->mli_lock, LCK_MTX_ASSERT_NOTOWNED) + +#define MLI_LOCK(_mli) \ + lck_mtx_lock(&(_mli)->mli_lock) + +#define MLI_LOCK_SPIN(_mli) \ + lck_mtx_lock_spin(&(_mli)->mli_lock) + +#define MLI_CONVERT_LOCK(_mli) do { \ + MLI_LOCK_ASSERT_HELD(_mli); \ + lck_mtx_convert_spin(&(_mli)->mli_lock); \ +} while (0) + +#define MLI_UNLOCK(_mli) \ + lck_mtx_unlock(&(_mli)->mli_lock) + +#define MLI_ADDREF(_mli) \ + mli_addref(_mli, 0) + +#define MLI_ADDREF_LOCKED(_mli) \ + mli_addref(_mli, 1) + +#define 
MLI_REMREF(_mli) \ + mli_remref(_mli) /* - * States for MLD stop-listening processing + * Per-link MLD context. */ -#define MLD6_OTHERLISTENER 0 -#define MLD6_IREPORTEDLAST 1 +#define MLD_IFINFO(ifp) ((ifp)->if_mli) + +extern int mld_change_state(struct in6_multi *, const int); +extern struct mld_ifinfo *mld_domifattach(struct ifnet *, int); +extern void mld_domifreattach(struct mld_ifinfo *); +extern void mld_domifdetach(struct ifnet *); +extern void mld_fasttimo(void); +extern void mld_ifdetach(struct ifnet *); +extern int mld_input(struct mbuf *, int, int); +extern void mld_slowtimo(void); +extern void mld_init(void); +extern void mli_addref(struct mld_ifinfo *, int); +extern void mli_remref(struct mld_ifinfo *); + +#ifdef SYSCTL_DECL +SYSCTL_DECL(_net_inet6_mld); +#endif -void mld6_init(void); -void mld6_input(struct mbuf *, int); -void mld6_start_listening(struct in6_multi *); -void mld6_stop_listening(struct in6_multi *); -void mld6_fasttimeo(void); -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #endif /* _NETINET6_MLD6_VAR_H_ */ diff --git a/bsd/netinet6/nd6.c b/bsd/netinet6/nd6.c index d71746042..77ab7630a 100644 --- a/bsd/netinet6/nd6.c +++ b/bsd/netinet6/nd6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -79,6 +79,8 @@ #include #include #include +#include + #include #include @@ -86,19 +88,19 @@ #include #include #include -#include +#include #include #include +#include #include #include #include -#include #include #include #include #include -#include +#include #include #include "loop.h" @@ -125,6 +127,7 @@ int nd6_gctimer = (60 * 60 * 24); /* 1 day: garbage collection timer */ int nd6_maxndopt = 10; /* max # of ND options allowed */ int nd6_maxnudhint = 0; /* max # of subsequent upper layer hints */ +int nd6_maxqueuelen = 1; /* max # of packets cached in unresolved ND entries */ #if ND6_DEBUG int nd6_debug = 1; @@ -132,6 +135,8 @@ int nd6_debug = 1; int nd6_debug = 0; #endif +static int nd6_is_new_addr_neighbor (struct sockaddr_in6 *, struct ifnet *); + /* for debugging? */ static int nd6_inuse, nd6_allocated; @@ -151,7 +156,8 @@ static int nd6_inuse, nd6_allocated; * * - Routing lock (rnh_lock) * - * ln_hold, ln_asked, ln_expire, ln_state, ln_router, ln_byhint, ln_flags + * ln_hold, ln_asked, ln_expire, ln_state, ln_router, ln_byhint, ln_flags, + * ln_llreach, ln_lastused * * - Routing entry lock (rt_lock) * @@ -161,7 +167,7 @@ static int nd6_inuse, nd6_allocated; * freed until the route itself is freed. 
*/ struct llinfo_nd6 llinfo_nd6 = { - &llinfo_nd6, &llinfo_nd6, NULL, NULL, 0, 0, 0, 0, 0, 0 + &llinfo_nd6, &llinfo_nd6, NULL, NULL, 0, 0, 0, 0, 0, 0, NULL, 0 }; /* Protected by nd_if_rwlock */ @@ -177,16 +183,22 @@ lck_rw_t *nd_if_rwlock; struct nd_drhead nd_defrouter; struct nd_prhead nd_prefix = { 0 }; +/* Serialization variables for nd6_drain() */ +static boolean_t nd6_drain_busy; +static void *nd6_drain_waitchan = &nd6_drain_busy; +static int nd6_drain_waiters = 0; + int nd6_recalc_reachtm_interval = ND6_RECALC_REACHTM_INTERVAL; static struct sockaddr_in6 all1_sa; static int regen_tmpaddr(struct in6_ifaddr *); -extern lck_mtx_t *ip6_mutex; extern lck_mtx_t *nd6_mutex; static void nd6_slowtimo(void *ignored_arg); static struct llinfo_nd6 *nd6_llinfo_alloc(void); static void nd6_llinfo_free(void *); +static void nd6_llinfo_purge(struct rtentry *); +static void nd6_llinfo_get_ri(struct rtentry *, struct rt_reach_info *); static void nd6_siocgdrlst(void *, int); static void nd6_siocgprlst(void *, int); @@ -249,6 +261,10 @@ nd6_init() panic("%s: failed allocating llinfo_nd6_zone", __func__); zone_change(llinfo_nd6_zone, Z_EXPAND, TRUE); + zone_change(llinfo_nd6_zone, Z_CALLERACCT, FALSE); + + nd6_nbr_init(); + nd6_rtr_init(); nd6_init_done = 1; @@ -278,9 +294,48 @@ nd6_llinfo_free(void *arg) ln->ln_hold = NULL; } + /* Purge any link-layer info caching */ + VERIFY(ln->ln_rt->rt_llinfo == ln); + if (ln->ln_rt->rt_llinfo_purge != NULL) + ln->ln_rt->rt_llinfo_purge(ln->ln_rt); + zfree(llinfo_nd6_zone, ln); } +static void +nd6_llinfo_purge(struct rtentry *rt) +{ + struct llinfo_nd6 *ln = rt->rt_llinfo; + + RT_LOCK_ASSERT_HELD(rt); + VERIFY(rt->rt_llinfo_purge == nd6_llinfo_purge && ln != NULL); + + if (ln->ln_llreach != NULL) { + RT_CONVERT_LOCK(rt); + ifnet_llreach_free(ln->ln_llreach); + ln->ln_llreach = NULL; + } + ln->ln_lastused = 0; +} + +static void +nd6_llinfo_get_ri(struct rtentry *rt, struct rt_reach_info *ri) +{ + struct llinfo_nd6 *ln = rt->rt_llinfo; + 
struct if_llreach *lr = ln->ln_llreach; + + if (lr == NULL) { + bzero(ri, sizeof (*ri)); + } else { + IFLR_LOCK(lr); + /* Export to rt_reach_info structure */ + ifnet_lr2ri(lr, ri); + /* Export ND6 send expiration time */ + ri->ri_snd_expire = ifnet_llreach_up2cal(lr, ln->ln_lastused); + IFLR_UNLOCK(lr); + } +} + int nd6_ifattach(struct ifnet *ifp) { @@ -338,7 +393,6 @@ nd6_ifattach(struct ifnet *ifp) ND.basereachable = REACHABLE_TIME; ND.reachable = ND_COMPUTE_RTIME(ND.basereachable); ND.retrans = RETRANS_TIMER; - ND.receivedra = 0; ND.flags = ND6_IFF_PERFORMNUD; lck_rw_done(nd_if_rwlock); nd6_setmtu(ifp); @@ -378,8 +432,8 @@ nd6_setmtu(struct ifnet *ifp) * the sanity checks related to the maximum MTU allowed for the * interface (a value that is known only by the interface layer), * by sending the request down via ifnet_ioctl(). The use of the - * ND level maxmtu and linkmtu (the latter obtained via RA) are done - * via IN6_LINKMTU() which does further checking against if_mtu. + * ND level maxmtu and linkmtu are done via IN6_LINKMTU() which + * does further checking against if_mtu. */ maxmtu = ndi->maxmtu = ifp->if_mtu; @@ -394,6 +448,7 @@ nd6_setmtu(struct ifnet *ifp) "new link MTU on %s%d (%u) is too small for IPv6\n", ifp->if_name, ifp->if_unit, (uint32_t)ndi->maxmtu); } + ndi->linkmtu = ifp->if_mtu; lck_rw_done(nd_if_rwlock); /* also adjust in6_maxmtu if necessary. 
*/ @@ -480,16 +535,16 @@ nd6_options( struct nd_opt_hdr *nd_opt; int i = 0; - if (!ndopts) - panic("ndopts == NULL in nd6_options\n"); - if (!ndopts->nd_opts_last) - panic("uninitialized ndopts in nd6_options\n"); - if (!ndopts->nd_opts_search) + if (ndopts == NULL) + panic("ndopts == NULL in nd6_options"); + if (ndopts->nd_opts_last == NULL) + panic("uninitialized ndopts in nd6_options"); + if (ndopts->nd_opts_search == NULL) return 0; while (1) { nd_opt = nd6_option(ndopts); - if (!nd_opt && !ndopts->nd_opts_last) { + if (nd_opt == NULL && ndopts->nd_opts_last == NULL) { /* * Message validation requires that all included * options have a length that is greater than zero. @@ -499,7 +554,7 @@ nd6_options( return -1; } - if (!nd_opt) + if (nd_opt == NULL) goto skip1; switch (nd_opt->nd_opt_type) { @@ -525,6 +580,9 @@ nd6_options( ndopts->nd_opts_pi_end = (struct nd_opt_prefix_info *)nd_opt; break; + case ND_OPT_RDNSS: + /* ignore */ + break; default: /* * Unknown options must be silently ignored, @@ -581,7 +639,6 @@ nd6_drain(__unused void *ignored_arg) struct rtentry *rt; struct sockaddr_in6 *dst; struct llinfo_nd6 *next; - struct nd_ifinfo ndi; /* ln_next/prev/rt is protected by rnh_lock */ next = ln->ln_next; @@ -634,7 +691,6 @@ nd6_drain(__unused void *ignored_arg) ln = next; continue; } - ndi = nd_ifinfo[ifp->if_index]; lck_rw_done(nd_if_rwlock); RT_LOCK_ASSERT_HELD(rt); @@ -643,13 +699,15 @@ nd6_drain(__unused void *ignored_arg) case ND6_LLINFO_INCOMPLETE: if (ln->ln_asked < nd6_mmaxtries) { ln->ln_asked++; + lck_rw_lock_shared(nd_if_rwlock); ln->ln_expire = timenow.tv_sec + - ndi.retrans / 1000; + nd_ifinfo[ifp->if_index].retrans / 1000; + lck_rw_done(nd_if_rwlock); RT_ADDREF_LOCKED(rt); RT_UNLOCK(rt); lck_mtx_unlock(rnh_lock); nd6_ns_output(ifp, NULL, &dst->sin6_addr, - ln, 0, 0); + ln, 0); RT_REMREF(rt); } else { struct mbuf *m = ln->ln_hold; @@ -701,22 +759,26 @@ nd6_drain(__unused void *ignored_arg) break; case ND6_LLINFO_DELAY: - if ((ndi.flags & 
ND6_IFF_PERFORMNUD) != 0) { + lck_rw_lock_shared(nd_if_rwlock); + if ((nd_ifinfo[ifp->if_index].flags & + ND6_IFF_PERFORMNUD) != 0) { /* We need NUD */ ln->ln_asked = 1; ln->ln_state = ND6_LLINFO_PROBE; ln->ln_expire = timenow.tv_sec + - ndi.retrans / 1000; + nd_ifinfo[ifp->if_index].retrans / 1000; + lck_rw_done(nd_if_rwlock); RT_ADDREF_LOCKED(rt); RT_UNLOCK(rt); lck_mtx_unlock(rnh_lock); nd6_ns_output(ifp, &dst->sin6_addr, - &dst->sin6_addr, ln, 0, 0); + &dst->sin6_addr, ln, 0); lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_NOTOWNED); RT_REMREF(rt); goto again; } + lck_rw_done(nd_if_rwlock); ln->ln_state = ND6_LLINFO_STALE; /* XXX */ ln->ln_expire = rt_expiry(rt, timenow.tv_sec, nd6_gctimer); @@ -726,13 +788,15 @@ nd6_drain(__unused void *ignored_arg) case ND6_LLINFO_PROBE: if (ln->ln_asked < nd6_umaxtries) { ln->ln_asked++; + lck_rw_lock_shared(nd_if_rwlock); ln->ln_expire = timenow.tv_sec + - ndi.retrans / 1000; + nd_ifinfo[ifp->if_index].retrans / 1000; + lck_rw_done(nd_if_rwlock); RT_ADDREF_LOCKED(rt); RT_UNLOCK(rt); lck_mtx_unlock(rnh_lock); nd6_ns_output(ifp, &dst->sin6_addr, - &dst->sin6_addr, ln, 0, 0); + &dst->sin6_addr, ln, 0); RT_REMREF(rt); } else { RT_UNLOCK(rt); @@ -771,12 +835,13 @@ nd6_drain(__unused void *ignored_arg) if (dr->expire && dr->expire < timenow.tv_sec) { struct nd_defrouter *t; t = TAILQ_NEXT(dr, dr_entry); - defrtrlist_del(dr, 1); + defrtrlist_del(dr); dr = t; } else { dr = TAILQ_NEXT(dr, dr_entry); } } + lck_mtx_unlock(nd6_mutex); /* * expire interface addresses. @@ -784,22 +849,21 @@ nd6_drain(__unused void *ignored_arg) * However, from a stricter speci-confrmance standpoint, we should * rather separate address lifetimes and prefix lifetimes. 
*/ - addrloop: +addrloop: + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); for (ia6 = in6_ifaddrs; ia6; ia6 = nia6) { nia6 = ia6->ia_next; + IFA_LOCK(&ia6->ia_ifa); + /* + * Extra reference for ourselves; it's no-op if + * we don't have to regenerate temporary address, + * otherwise it protects the address from going + * away since we drop in6_ifaddr_rwlock below. + */ + IFA_ADDREF_LOCKED(&ia6->ia_ifa); /* check address lifetime */ lt6 = &ia6->ia6_lifetime; if (IFA6_IS_INVALID(ia6)) { - int regen = 0; - - /* - * Extra reference for ourselves; it's no-op if - * we don't have to regenerate temporary address, - * otherwise it protects the address from going - * away since we drop nd6_mutex below. - */ - ifaref(&ia6->ia_ifa); - /* * If the expiring address is temporary, try * regenerating a new one. This would be useful when @@ -818,20 +882,27 @@ nd6_drain(__unused void *ignored_arg) * hang. This is safe because the goto addrloop * leads to a reevaluation of the in6_ifaddrs list */ - lck_mtx_unlock(nd6_mutex); - if (regen_tmpaddr(ia6) == 0) - regen = 1; - lck_mtx_lock(nd6_mutex); + IFA_UNLOCK(&ia6->ia_ifa); + lck_rw_done(&in6_ifaddr_rwlock); + (void) regen_tmpaddr(ia6); + } else { + IFA_UNLOCK(&ia6->ia_ifa); + lck_rw_done(&in6_ifaddr_rwlock); } - in6_purgeaddr(&ia6->ia_ifa, 1); + /* + * Purging the address would have caused + * in6_ifaddr_rwlock to be dropped and reacquired; + * therefore search again from the beginning + * of in6_ifaddrs list. 
+ */ + in6_purgeaddr(&ia6->ia_ifa); /* Release extra reference taken above */ - ifafree(&ia6->ia_ifa); - - if (regen) - goto addrloop; /* XXX: see below */ + IFA_REMREF(&ia6->ia_ifa); + goto addrloop; } + IFA_LOCK_ASSERT_HELD(&ia6->ia_ifa); if (IFA6_IS_DEPRECATED(ia6)) { int oldflags = ia6->ia6_flags; @@ -846,7 +917,8 @@ nd6_drain(__unused void *ignored_arg) (oldflags & IN6_IFF_DEPRECATED) == 0) { /* see NOTE above */ - lck_mtx_unlock(nd6_mutex); + IFA_UNLOCK(&ia6->ia_ifa); + lck_rw_done(&in6_ifaddr_rwlock); if (regen_tmpaddr(ia6) == 0) { /* * A new temporary address is @@ -860,10 +932,13 @@ nd6_drain(__unused void *ignored_arg) * loop just for safety. Or does this * significantly reduce performance?? */ - lck_mtx_lock(nd6_mutex); + /* Release extra reference */ + IFA_REMREF(&ia6->ia_ifa); goto addrloop; } - lck_mtx_lock(nd6_mutex); + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); + } else { + IFA_UNLOCK(&ia6->ia_ifa); } } else { /* @@ -871,8 +946,26 @@ nd6_drain(__unused void *ignored_arg) * preferred. */ ia6->ia6_flags &= ~IN6_IFF_DEPRECATED; + IFA_UNLOCK(&ia6->ia_ifa); } + lck_rw_assert(&in6_ifaddr_rwlock, LCK_RW_ASSERT_EXCLUSIVE); + /* Release extra reference taken above */ + IFA_REMREF(&ia6->ia_ifa); } + lck_rw_done(&in6_ifaddr_rwlock); + + lck_mtx_lock(nd6_mutex); + /* + * Since we drop the nd6_mutex in prelist_remove, we want to run this + * section single threaded. + */ + while (nd6_drain_busy) { + nd6_drain_waiters++; + msleep(nd6_drain_waitchan, nd6_mutex, (PZERO-1), + __func__, NULL); + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + } + nd6_drain_busy = TRUE; /* expire prefix list */ pr = nd_prefix.lh_first; @@ -882,19 +975,38 @@ nd6_drain(__unused void *ignored_arg) * since pltime is just for autoconf, pltime processing for * prefix is not necessary. 
*/ + NDPR_LOCK(pr); + if (pr->ndpr_stateflags & NDPRF_PROCESSED) { + NDPR_UNLOCK(pr); + pr = pr->ndpr_next; + continue; + } if (pr->ndpr_expire && pr->ndpr_expire < timenow.tv_sec) { - struct nd_prefix *t; - t = pr->ndpr_next; - /* * address expiration and prefix expiration are * separate. NEVER perform in6_purgeaddr here. */ - - prelist_remove(pr, 1); - pr = t; - } else + pr->ndpr_stateflags |= NDPRF_PROCESSED; + NDPR_ADDREF_LOCKED(pr); + prelist_remove(pr); + NDPR_UNLOCK(pr); + NDPR_REMREF(pr); + pr = nd_prefix.lh_first; + } else { + pr->ndpr_stateflags |= NDPRF_PROCESSED; + NDPR_UNLOCK(pr); pr = pr->ndpr_next; + } + } + LIST_FOREACH(pr, &nd_prefix, ndpr_entry) { + NDPR_LOCK(pr); + pr->ndpr_stateflags &= ~NDPRF_PROCESSED; + NDPR_UNLOCK(pr); + } + nd6_drain_busy = FALSE; + if (nd6_drain_waiters > 0) { + nd6_drain_waiters = 0; + wakeup(nd6_drain_waitchan); } lck_mtx_unlock(nd6_mutex); } @@ -921,25 +1033,29 @@ regen_tmpaddr( getmicrotime(&timenow); ifp = ia6->ia_ifa.ifa_ifp; - ifnet_lock_exclusive(ifp); + ifnet_lock_shared(ifp); for (ifa = ifp->if_addrlist.tqh_first; ifa; ifa = ifa->ifa_list.tqe_next) { struct in6_ifaddr *it6; - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; - + } it6 = (struct in6_ifaddr *)ifa; /* ignore no autoconf addresses. */ - if ((it6->ia6_flags & IN6_IFF_AUTOCONF) == 0) + if ((it6->ia6_flags & IN6_IFF_AUTOCONF) == 0) { + IFA_UNLOCK(ifa); continue; - + } /* ignore autoconf addresses with different prefixes. */ - if (it6->ia6_ndpr == NULL || it6->ia6_ndpr != ia6->ia6_ndpr) + if (it6->ia6_ndpr == NULL || it6->ia6_ndpr != ia6->ia6_ndpr) { + IFA_UNLOCK(ifa); continue; - + } /* * Now we are looking at an autoconf address with the same * prefix as ours. 
If the address is temporary and is still @@ -949,6 +1065,9 @@ regen_tmpaddr( */ if ((it6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && !IFA6_IS_DEPRECATED(it6)) { + IFA_UNLOCK(ifa); + if (public_ifa6 != NULL) + IFA_REMREF(&public_ifa6->ia_ifa); public_ifa6 = NULL; break; } @@ -959,8 +1078,15 @@ regen_tmpaddr( * loop here, because there may be a still-preferred temporary * address with the prefix. */ - if (!IFA6_IS_DEPRECATED(it6)) - public_ifa6 = it6; + if (!IFA6_IS_DEPRECATED(it6)) { + IFA_ADDREF_LOCKED(ifa); /* for public_ifa6 */ + IFA_UNLOCK(ifa); + if (public_ifa6 != NULL) + IFA_REMREF(&public_ifa6->ia_ifa); + public_ifa6 = it6; + } else { + IFA_UNLOCK(ifa); + } } ifnet_lock_done(ifp); @@ -970,8 +1096,10 @@ regen_tmpaddr( if ((e = in6_tmpifadd(public_ifa6, 0, M_WAITOK)) != 0) { log(LOG_NOTICE, "regen_tmpaddr: failed to create a new" " tmp addr,errno=%d\n", e); + IFA_REMREF(&public_ifa6->ia_ifa); return(-1); } + IFA_REMREF(&public_ifa6->ia_ifa); return(0); } @@ -987,7 +1115,7 @@ nd6_purge( struct ifnet *ifp) { struct llinfo_nd6 *ln; - struct nd_defrouter *dr, *ndr, drany; + struct nd_defrouter *dr, *ndr; struct nd_prefix *pr, *npr; /* Nuke default router list entries toward ifp */ @@ -999,18 +1127,38 @@ nd6_purge( */ for (dr = TAILQ_NEXT(dr, dr_entry); dr; dr = ndr) { ndr = TAILQ_NEXT(dr, dr_entry); + if (dr->stateflags & NDDRF_INSTALLED) + continue; if (dr->ifp == ifp) - defrtrlist_del(dr, 1); + defrtrlist_del(dr); } dr = TAILQ_FIRST(&nd_defrouter); if (dr->ifp == ifp) - defrtrlist_del(dr, 1); + defrtrlist_del(dr); + } + + for (dr = TAILQ_FIRST(&nd_defrouter); dr; dr = ndr) { + ndr = TAILQ_NEXT(dr, dr_entry); + if (!(dr->stateflags & NDDRF_INSTALLED)) + continue; + + if (dr->ifp == ifp) + defrtrlist_del(dr); } /* Nuke prefix list entries toward ifp */ for (pr = nd_prefix.lh_first; pr; pr = npr) { npr = pr->ndpr_next; + NDPR_LOCK(pr); if (pr->ndpr_ifp == ifp) { + /* + * Because if_detach() does *not* release prefixes + * while purging addresses the reference count 
will + * still be above zero. We therefore reset it to + * make sure that the prefix really gets purged. + */ + pr->ndpr_addrcnt = 0; + /* * Previously, pr->ndpr_addr is removed as well, * but I strongly believe we don't have to do it. @@ -1019,27 +1167,28 @@ nd6_purge( * by itself. * (jinmei@kame.net 20010129) */ - prelist_remove(pr, 1); + NDPR_ADDREF_LOCKED(pr); + prelist_remove(pr); + NDPR_UNLOCK(pr); + NDPR_REMREF(pr); + } else { + NDPR_UNLOCK(pr); } } + lck_mtx_unlock(nd6_mutex); /* cancel default outgoing interface setting */ if (nd6_defifindex == ifp->if_index) { - /* Release nd6_mutex as it will be acquired - * during nd6_setdefaultiface again - */ - lck_mtx_unlock(nd6_mutex); nd6_setdefaultiface(0); - lck_mtx_lock(nd6_mutex); } if (!ip6_forwarding && (ip6_accept_rtadv || (ifp->if_eflags & IFEF_ACCEPT_RTADVD))) { + lck_mtx_lock(nd6_mutex); /* refresh default router list */ - bzero(&drany, sizeof(drany)); - defrouter_delreq(&drany, 0); - defrouter_select(); + defrouter_reset(); + defrouter_select(ifp); + lck_mtx_unlock(nd6_mutex); } - lck_mtx_unlock(nd6_mutex); /* * Nuke neighbor cache entries for the ifp. @@ -1098,28 +1247,31 @@ nd6_lookup( { struct rtentry *rt; struct sockaddr_in6 sin6; + unsigned int ifscope; bzero(&sin6, sizeof(sin6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_addr = *addr6; -#if SCOPEDROUTING - sin6.sin6_scope_id = in6_addr2scopeid(ifp, addr6); -#endif - if (rt_locked) - lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); - rt = rt_locked ? rtalloc1_locked((struct sockaddr *)&sin6, create, 0) : - rtalloc1((struct sockaddr *)&sin6, create, 0); + ifscope = (ifp != NULL) ? 
ifp->if_index : IFSCOPE_NONE; + if (rt_locked) { + lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); + rt = rtalloc1_scoped_locked((struct sockaddr *)&sin6, + create, 0, ifscope); + } else { + rt = rtalloc1_scoped((struct sockaddr *)&sin6, + create, 0, ifscope); + } if (rt != NULL) { RT_LOCK(rt); if ((rt->rt_flags & RTF_LLINFO) == 0) { /* - * This is the case for the default route. If we - * want to create a neighbor cache for the address, - * we should free the route for the destination and - * allocate an interface route. + * This is the case for the default route. + * If we want to create a neighbor cache for the + * address, we should free the route for the + * destination and allocate an interface route. */ if (create) { RT_UNLOCK(rt); @@ -1134,6 +1286,7 @@ nd6_lookup( if (rt == NULL) { if (create && ifp) { struct ifaddr *ifa; + u_int32_t ifa_flags; int e; /* @@ -1155,11 +1308,14 @@ nd6_lookup( */ if (!rt_locked) lck_mtx_lock(rnh_lock); - if ((e = rtrequest_locked(RTM_ADD, + IFA_LOCK_SPIN(ifa); + ifa_flags = ifa->ifa_flags; + IFA_UNLOCK(ifa); + if ((e = rtrequest_scoped_locked(RTM_ADD, (struct sockaddr *)&sin6, ifa->ifa_addr, (struct sockaddr *)&all1_sa, - (ifa->ifa_flags | RTF_HOST | RTF_LLINFO) & - ~RTF_CLONING, &rt)) != 0) { + (ifa_flags | RTF_HOST | RTF_LLINFO) & + ~RTF_CLONING, &rt, ifscope)) != 0) { if (e != EEXIST) log(LOG_ERR, "%s: failed to add route " "for a neighbor(%s), errno=%d\n", @@ -1167,7 +1323,7 @@ nd6_lookup( } if (!rt_locked) lck_mtx_unlock(rnh_lock); - ifafree(ifa); + IFA_REMREF(ifa); if (rt == NULL) return(NULL); @@ -1191,10 +1347,13 @@ nd6_lookup( * it might be the loopback interface if the entry is for our * own address on a non-loopback interface. Instead, we should * use rt->rt_ifa->ifa_ifp, which would specify the REAL - * interface. + * interface. 
+ * Note also that ifa_ifp and ifp may differ when we connect two + * interfaces to a same link, install a link prefix to an interface, + * and try to install a neighbor cache on an interface that does not + * have a route to the prefix. */ - if (ifp == NULL || (ifp->if_type == IFT_PPP) || - (ifp->if_eflags & IFEF_NOAUTOIPV6LL) || + if (ifp == NULL || (rt->rt_flags & RTF_GATEWAY) || (rt->rt_flags & RTF_LLINFO) == 0 || rt->rt_gateway->sa_family != AF_LINK || rt->rt_llinfo == NULL || (ifp && rt->rt_ifa->ifa_ifp != ifp)) { @@ -1215,73 +1374,132 @@ nd6_lookup( } /* - * Detect if a given IPv6 address identifies a neighbor on a given link. - * XXX: should take care of the destination of a p2p link? + * Test whether a given IPv6 address is a neighbor or not, ignoring + * the actual neighbor cache. The neighbor cache is ignored in order + * to not reenter the routing code from within itself. */ -int -nd6_is_addr_neighbor( +static int +nd6_is_new_addr_neighbor( struct sockaddr_in6 *addr, - struct ifnet *ifp, - int rt_locked) + struct ifnet *ifp) { - struct ifaddr *ifa; - struct rtentry *rt; - int i; + struct nd_prefix *pr; + struct ifaddr *dstaddr; -#define IFADDR6(a) ((((struct in6_ifaddr *)(a))->ia_addr).sin6_addr) -#define IFMASK6(a) ((((struct in6_ifaddr *)(a))->ia_prefixmask).sin6_addr) + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); /* * A link-local address is always a neighbor. - * XXX: we should use the sin6_scope_id field rather than the embedded - * interface index. + * XXX: a link does not necessarily specify a single interface. */ - if (IN6_IS_ADDR_LINKLOCAL(&addr->sin6_addr) && - ntohs(*(u_int16_t *)&addr->sin6_addr.s6_addr[2]) == ifp->if_index) - return(1); + if (IN6_IS_ADDR_LINKLOCAL(&addr->sin6_addr)) { + struct sockaddr_in6 sin6_copy; + u_int32_t zone; + + /* + * We need sin6_copy since sa6_recoverscope() may modify the + * content (XXX). 
+ */ + sin6_copy = *addr; + if (sa6_recoverscope(&sin6_copy)) + return (0); /* XXX: should be impossible */ + if (in6_setscope(&sin6_copy.sin6_addr, ifp, &zone)) + return (0); + if (sin6_copy.sin6_scope_id == zone) + return (1); + else + return (0); + } /* * If the address matches one of our addresses, * it should be a neighbor. + * If the address matches one of our on-link prefixes, it should be a + * neighbor. */ - ifnet_lock_shared(ifp); - for (ifa = ifp->if_addrlist.tqh_first; - ifa; - ifa = ifa->ifa_list.tqe_next) - { - if (ifa->ifa_addr->sa_family != AF_INET6) + for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { + NDPR_LOCK(pr); + if (pr->ndpr_ifp != ifp) { + NDPR_UNLOCK(pr); + continue; + } + if (!(pr->ndpr_stateflags & NDPRF_ONLINK)) { + NDPR_UNLOCK(pr); continue; + } + if (IN6_ARE_MASKED_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr, + &addr->sin6_addr, &pr->ndpr_mask)) { + NDPR_UNLOCK(pr); + return (1); + } + NDPR_UNLOCK(pr); + } - for (i = 0; i < 4; i++) { - if ((IFADDR6(ifa).s6_addr32[i] ^ - addr->sin6_addr.s6_addr32[i]) & - IFMASK6(ifa).s6_addr32[i]) - continue; + /* + * If the address is assigned on the node of the other side of + * a p2p interface, the address should be a neighbor. + */ + dstaddr = ifa_ifwithdstaddr((struct sockaddr *)addr); + if (dstaddr != NULL) { + if (dstaddr->ifa_ifp == ifp) { + IFA_REMREF(dstaddr); + return (1); } - ifnet_lock_done(ifp); - return(1); + IFA_REMREF(dstaddr); + dstaddr = NULL; } - ifnet_lock_done(ifp); + + /* + * If the default router list is empty, all addresses are regarded + * as on-link, and thus, as a neighbor. + * XXX: we restrict the condition to hosts, because routers usually do + * not have the "default router list". + */ + if (!ip6_forwarding && TAILQ_FIRST(&nd_defrouter) == NULL && + nd6_defifindex == ifp->if_index) { + return (1); + } + + return (0); +} + + +/* + * Detect if a given IPv6 address identifies a neighbor on a given link. + * XXX: should take care of the destination of a p2p link? 
+ */ +int +nd6_is_addr_neighbor(struct sockaddr_in6 *addr, struct ifnet *ifp, int rt_locked) +{ + struct rtentry *rt; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_lock(nd6_mutex); + if (nd6_is_new_addr_neighbor(addr, ifp)) { + lck_mtx_unlock(nd6_mutex); + return (1); + } + lck_mtx_unlock(nd6_mutex); /* * Even if the address matches none of our addresses, it might be - * in the neighbor cache. Callee returns a locked route upon - * success. + * in the neighbor cache. */ if ((rt = nd6_lookup(&addr->sin6_addr, 0, ifp, rt_locked)) != NULL) { RT_LOCK_ASSERT_HELD(rt); RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); - return(1); + return (1); } - return(0); -#undef IFADDR6 -#undef IFMASK6 + return (0); } /* * Free an nd6 llinfo entry. + * Since the function would cause significant changes in the kernel, DO NOT + * make it global, unless you have a strong reason for the change, and are sure + * that the change is safe. */ void nd6_free( @@ -1324,12 +1542,15 @@ nd6_free( * See a corresponding comment in nd6_na_input(). */ RT_UNLOCK(rt); + lck_mtx_unlock(nd6_mutex); rt6_flush(&in6, rt->rt_ifp); + lck_mtx_lock(nd6_mutex); } else { RT_UNLOCK(rt); } if (dr) { + NDDR_REMREF(dr); /* * Unreachablity of a router might affect the default * router selection and on-link detection of advertised @@ -1353,21 +1574,12 @@ nd6_free( * the check now. */ RT_UNLOCK(rt); - pfxlist_onlink_check(1); + pfxlist_onlink_check(); - if (dr == TAILQ_FIRST(&nd_defrouter)) { - /* - * It is used as the current default router, - * so we have to move it to the end of the - * list and choose a new one. - * XXX: it is not very efficient if this is - * the only router. 
- */ - TAILQ_REMOVE(&nd_defrouter, dr, dr_entry); - TAILQ_INSERT_TAIL(&nd_defrouter, dr, dr_entry); - - defrouter_select(); - } + /* + * refresh default router list + */ + defrouter_select(rt->rt_ifp); } RT_LOCK_ASSERT_NOTHELD(rt); } else { @@ -1390,7 +1602,7 @@ nd6_free( /* * Upper-layer reachability hint for Neighbor Unreachability Detection. * - * XXX cost-effective metods? + * XXX cost-effective methods? */ void nd6_nud_hint( @@ -1444,8 +1656,8 @@ nd6_nud_hint( ln->ln_state = ND6_LLINFO_REACHABLE; if (ln->ln_expire) { lck_rw_lock_shared(nd_if_rwlock); - ln->ln_expire = rt_expiry(rt, timenow.tv_sec, - nd_ifinfo[rt->rt_ifp->if_index].reachable); + ln->ln_expire = timenow.tv_sec + + nd_ifinfo[rt->rt_ifp->if_index].reachable; lck_rw_done(nd_if_rwlock); } done: @@ -1490,17 +1702,17 @@ nd6_rtrequest( if (!nd6_need_cache(ifp)) { /* stf case */ no_nd_cache = 1; } else { + struct sockaddr_in6 sin6; + + rtkey_to_sa6(rt, &sin6); /* * nd6_is_addr_neighbor() may call nd6_lookup(), * therefore we drop rt_lock to avoid deadlock - * during the lookup. Using rt_key(rt) is still - * safe because it won't change while rnh_lock - * is held. + * during the lookup. */ RT_ADDREF_LOCKED(rt); RT_UNLOCK(rt); - no_nd_cache = !nd6_is_addr_neighbor( - (struct sockaddr_in6 *)rt_key(rt), ifp, 1); + no_nd_cache = !nd6_is_addr_neighbor(&sin6, ifp, 1); RT_LOCK(rt); RT_REMREF_LOCKED(rt); } @@ -1535,12 +1747,13 @@ nd6_rtrequest( * SIN(rt_mask(rt))->sin_addr.s_addr != 0xffffffff) * rt->rt_flags |= RTF_CLONING; */ - if (rt->rt_flags & (RTF_CLONING | RTF_LLINFO)) { + if ((rt->rt_flags & RTF_CLONING) || + ((rt->rt_flags & RTF_LLINFO) && ln == NULL)) { /* - * Case 1: This route should come from - * a route to interface. RTF_LLINFO flag is set - * for a host route whose destination should be - * treated as on-link. + * Case 1: This route should come from a route to + * interface (RTF_CLONING case) or the route should be + * treated as on-link but is currently not + * (RTF_LLINFO && ln == NULL case). 
*/ if (rt_setgate(rt, rt_key(rt), (struct sockaddr *)&null_sdl) == 0) { @@ -1575,15 +1788,6 @@ nd6_rtrequest( * (or should we allow proxy ND configuration only for * routers? there's no mention about proxy ND from hosts) */ -#if 0 - /* XXX it does not work */ - if (rt->rt_flags & RTF_ANNOUNCE) - nd6_na_output(ifp, - &SIN6(rt_key(rt))->sin6_addr, - &SIN6(rt_key(rt))->sin6_addr, - ip6_forwarding ? ND_NA_FLAG_ROUTER : 0, - 1, NULL); -#endif /* FALLTHROUGH */ case RTM_RESOLVE: if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) == 0) { @@ -1593,9 +1797,12 @@ nd6_rtrequest( */ if (gate->sa_family != AF_LINK || gate->sa_len < sizeof(null_sdl)) { - log(LOG_DEBUG, - "nd6_rtrequest: bad gateway value: %s\n", - if_name(ifp)); + /* Don't complain in case of RTM_ADD */ + if (req == RTM_RESOLVE) { + log(LOG_DEBUG, + "nd6_rtrequest: bad gateway " + "value: %s\n", if_name(ifp)); + } break; } SDL(gate)->sdl_type = ifp->if_type; @@ -1612,6 +1819,8 @@ nd6_rtrequest( log(LOG_DEBUG, "nd6_rtrequest: malloc failed\n"); break; } + rt->rt_llinfo_get_ri = nd6_llinfo_get_ri; + rt->rt_llinfo_purge = nd6_llinfo_purge; rt->rt_llinfo_free = nd6_llinfo_free; nd6_inuse++; @@ -1688,14 +1897,22 @@ nd6_rtrequest( SDL(gate)->sdl_alen = ifp->if_addrlen; } if (nd6_useloopback) { -#if IFNET_ROUTE_REFCNT - /* Adjust route ref count for the interfaces */ - if (rt->rt_if_ref_fn != NULL && - rt->rt_ifp != lo_ifp) { - rt->rt_if_ref_fn(lo_ifp, 1); - rt->rt_if_ref_fn(rt->rt_ifp, -1); + if (rt->rt_ifp != lo_ifp) { + /* + * Purge any link-layer info caching. + */ + if (rt->rt_llinfo_purge != NULL) + rt->rt_llinfo_purge(rt); + + /* + * Adjust route ref count for the + * interfaces. 
+ */ + if (rt->rt_if_ref_fn != NULL) { + rt->rt_if_ref_fn(lo_ifp, 1); + rt->rt_if_ref_fn(rt->rt_ifp, -1); + } } -#endif /* IFNET_ROUTE_REFCNT */ rt->rt_ifp = lo_ifp; /* XXX */ /* * Make sure rt_ifa be equal to the ifaddr @@ -1709,7 +1926,7 @@ nd6_rtrequest( rtsetifa(rt, ifa); } } - ifafree(ifa); + IFA_REMREF(ifa); } else if (rt->rt_flags & RTF_ANNOUNCE) { ln->ln_expire = 0; ln->ln_state = ND6_LLINFO_REACHABLE; @@ -1718,26 +1935,30 @@ nd6_rtrequest( /* join solicited node multicast for proxy ND */ if (ifp->if_flags & IFF_MULTICAST) { struct in6_addr llsol; + struct in6_multi *in6m; int error; llsol = SIN6(rt_key(rt))->sin6_addr; - llsol.s6_addr16[0] = htons(0xff02); - llsol.s6_addr16[1] = htons(ifp->if_index); + llsol.s6_addr32[0] = IPV6_ADDR_INT32_MLL; llsol.s6_addr32[1] = 0; llsol.s6_addr32[2] = htonl(1); llsol.s6_addr8[12] = 0xff; - - if (!in6_addmulti(&llsol, ifp, &error, 0)) { + if (in6_setscope(&llsol, ifp, NULL)) + break; + error = in6_mc_join(ifp, &llsol, NULL, &in6m, 0); + if (error) { nd6log((LOG_ERR, "%s: failed to join " "%s (errno=%d)\n", if_name(ifp), ip6_sprintf(&llsol), error)); + } else { + IN6M_REMREF(in6m); } } } break; case RTM_DELETE: - if (!ln) + if (ln == NULL) break; /* leave from solicited node multicast for proxy ND */ if ((rt->rt_flags & RTF_ANNOUNCE) != 0 && @@ -1746,17 +1967,19 @@ nd6_rtrequest( struct in6_multi *in6m; llsol = SIN6(rt_key(rt))->sin6_addr; - llsol.s6_addr16[0] = htons(0xff02); - llsol.s6_addr16[1] = htons(ifp->if_index); + llsol.s6_addr32[0] = IPV6_ADDR_INT32_MLL; llsol.s6_addr32[1] = 0; llsol.s6_addr32[2] = htonl(1); llsol.s6_addr8[12] = 0xff; - - ifnet_lock_shared(ifp); - IN6_LOOKUP_MULTI(llsol, ifp, in6m); - ifnet_lock_done(ifp); - if (in6m) - in6_delmulti(in6m, 0); + if (in6_setscope(&llsol, ifp, NULL) == 0) { + in6_multihead_lock_shared(); + IN6_LOOKUP_MULTI(&llsol, ifp, in6m); + in6_multihead_lock_done(); + if (in6m != NULL) { + in6_mc_leave(in6m, NULL); + IN6M_REMREF(in6m); + } + } } nd6_inuse--; /* @@ -1767,10 
+1990,18 @@ nd6_rtrequest( */ if (ln->ln_flags & ND6_LNF_IN_USE) LN_DEQUEUE(ln); + + /* + * Purge any link-layer info caching. + */ + if (rt->rt_llinfo_purge != NULL) + rt->rt_llinfo_purge(rt); + rt->rt_flags &= ~RTF_LLINFO; - if (ln->ln_hold != NULL) + if (ln->ln_hold != NULL) { m_freem(ln->ln_hold); - ln->ln_hold = NULL; + ln->ln_hold = NULL; + } } } @@ -1835,7 +2066,6 @@ nd6_siocgprlst(void *data, int data_is_64) struct in6_prlist_64 *prl_64 = (struct in6_prlist_64 *)data; struct in6_prlist_32 *prl_32 = (struct in6_prlist_32 *)data; struct nd_prefix *pr; - struct rr_prefix *rpp; int i = 0; lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); @@ -1852,8 +2082,9 @@ nd6_siocgprlst(void *data, int data_is_64) struct nd_pfxrouter *pfr; int j; + NDPR_LOCK(pr); (void) in6_embedscope(&prl_64->prefix[i].prefix, - &pr->ndpr_prefix, NULL, NULL); + &pr->ndpr_prefix, NULL, NULL, NULL); prl_64->prefix[i].raflags = pr->ndpr_raf; prl_64->prefix[i].prefixlen = pr->ndpr_plen; prl_64->prefix[i].vltime = pr->ndpr_vltime; @@ -1884,27 +2115,12 @@ nd6_siocgprlst(void *data, int data_is_64) } prl_64->prefix[i].advrtrs = j; prl_64->prefix[i].origin = PR_ORIG_RA; + NDPR_UNLOCK(pr); i++; pr = pr->ndpr_next; } - for (rpp = LIST_FIRST(&rr_prefix); rpp; - rpp = LIST_NEXT(rpp, rp_entry)) { - if (i >= PRLSTSIZ) - break; - (void) in6_embedscope(&prl_64->prefix[i].prefix, - &pr->ndpr_prefix, NULL, NULL); - prl_64->prefix[i].raflags = rpp->rp_raf; - prl_64->prefix[i].prefixlen = rpp->rp_plen; - prl_64->prefix[i].vltime = rpp->rp_vltime; - prl_64->prefix[i].pltime = rpp->rp_pltime; - prl_64->prefix[i].if_index = rpp->rp_ifp->if_index; - prl_64->prefix[i].expire = rpp->rp_expire; - prl_64->prefix[i].advrtrs = 0; - prl_64->prefix[i].origin = rpp->rp_origin; - i++; - } return; } /* For 32-bit process */ @@ -1912,8 +2128,9 @@ nd6_siocgprlst(void *data, int data_is_64) struct nd_pfxrouter *pfr; int j; + NDPR_LOCK(pr); (void) in6_embedscope(&prl_32->prefix[i].prefix, - &pr->ndpr_prefix, NULL, NULL); + 
&pr->ndpr_prefix, NULL, NULL, NULL); prl_32->prefix[i].raflags = pr->ndpr_raf; prl_32->prefix[i].prefixlen = pr->ndpr_plen; prl_32->prefix[i].vltime = pr->ndpr_vltime; @@ -1944,27 +2161,11 @@ nd6_siocgprlst(void *data, int data_is_64) } prl_32->prefix[i].advrtrs = j; prl_32->prefix[i].origin = PR_ORIG_RA; + NDPR_UNLOCK(pr); i++; pr = pr->ndpr_next; } - - for (rpp = LIST_FIRST(&rr_prefix); rpp; - rpp = LIST_NEXT(rpp, rp_entry)) { - if (i >= PRLSTSIZ) - break; - (void) in6_embedscope(&prl_32->prefix[i].prefix, - &pr->ndpr_prefix, NULL, NULL); - prl_32->prefix[i].raflags = rpp->rp_raf; - prl_32->prefix[i].prefixlen = rpp->rp_plen; - prl_32->prefix[i].vltime = rpp->rp_vltime; - prl_32->prefix[i].pltime = rpp->rp_pltime; - prl_32->prefix[i].if_index = rpp->rp_ifp->if_index; - prl_32->prefix[i].expire = rpp->rp_expire; - prl_32->prefix[i].advrtrs = 0; - prl_32->prefix[i].origin = rpp->rp_origin; - i++; - } } int @@ -1972,7 +2173,7 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) { struct in6_ndireq *ndi = (struct in6_ndireq *)data; struct in6_ondireq *ondi = (struct in6_ondireq *)data; - struct nd_defrouter *dr, any; + struct nd_defrouter *dr; struct nd_prefix *pr; struct rtentry *rt; int i = ifp->if_index, error = 0; @@ -2018,7 +2219,6 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) ondi->ndi.flags = nd_ifinfo[i].flags; ondi->ndi.recalctm = nd_ifinfo[i].recalctm; ondi->ndi.chlim = nd_ifinfo[i].chlim; - ondi->ndi.receivedra = nd_ifinfo[i].receivedra; lck_rw_done(nd_if_rwlock); break; @@ -2040,10 +2240,9 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) * xxx sumikawa: should not delete route if default * route equals to the top of default router list */ - bzero(&any, sizeof(any)); lck_mtx_lock(nd6_mutex); - defrouter_delreq(&any, 1); - defrouter_select(); + defrouter_reset(); + defrouter_select(ifp); lck_mtx_unlock(nd6_mutex); /* xxx sumikawa: flush prefix list */ break; @@ -2051,28 +2250,75 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet 
*ifp) case SIOCSPFXFLUSH_IN6: { /* flush all the prefix advertised by routers */ struct nd_prefix *next; - lck_mtx_lock(nd6_mutex); + lck_mtx_lock(nd6_mutex); for (pr = nd_prefix.lh_first; pr; pr = next) { - struct in6_ifaddr *ia, *ia_next; + struct in6_ifaddr *ia; next = pr->ndpr_next; - if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) + NDPR_LOCK(pr); + if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) { + NDPR_UNLOCK(pr); continue; /* XXX */ - + } + if (ifp != lo_ifp && pr->ndpr_ifp != ifp) { + NDPR_UNLOCK(pr); + continue; + } /* do we really have to remove addresses as well? */ - for (ia = in6_ifaddrs; ia; ia = ia_next) { - /* ia might be removed. keep the next ptr. */ - ia_next = ia->ia_next; + NDPR_ADDREF_LOCKED(pr); + NDPR_UNLOCK(pr); + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); + ia = in6_ifaddrs; + while (ia != NULL) { + IFA_LOCK(&ia->ia_ifa); + if ((ia->ia6_flags & IN6_IFF_AUTOCONF) == 0) { + IFA_UNLOCK(&ia->ia_ifa); + ia = ia->ia_next; + continue; + } - if ((ia->ia6_flags & IN6_IFF_AUTOCONF) == 0) + if (ia->ia6_ndpr == pr) { + IFA_ADDREF_LOCKED(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); + lck_rw_done(&in6_ifaddr_rwlock); + lck_mtx_unlock(nd6_mutex); + in6_purgeaddr(&ia->ia_ifa); + lck_mtx_lock(nd6_mutex); + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); + IFA_REMREF(&ia->ia_ifa); + /* + * Purging the address caused + * in6_ifaddr_rwlock to be + * dropped and + * reacquired; therefore search again + * from the beginning of in6_ifaddrs. + * The same applies for the prefix list. + */ + ia = in6_ifaddrs; + next = nd_prefix.lh_first; continue; - if (ia->ia6_ndpr == pr) - in6_purgeaddr(&ia->ia_ifa, 1); + } + IFA_UNLOCK(&ia->ia_ifa); + ia = ia->ia_next; } - prelist_remove(pr, 1); + lck_rw_done(&in6_ifaddr_rwlock); + NDPR_LOCK(pr); + prelist_remove(pr); + NDPR_UNLOCK(pr); + /* + * If we were trying to restart this loop + * above by changing the value of 'next', we might + * end up freeing the only element on the list + * when we call NDPR_REMREF(). 
+ * When this happens, we also have get out of this + * loop because we have nothing else to do. + */ + if (pr == next) + next = NULL; + NDPR_REMREF(pr); } lck_mtx_unlock(nd6_mutex); break; @@ -2090,9 +2336,12 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) */ for (dr = TAILQ_NEXT(dr, dr_entry); dr; dr = next) { next = TAILQ_NEXT(dr, dr_entry); - defrtrlist_del(dr, 1); + if (ifp == lo_ifp || dr->ifp == ifp) + defrtrlist_del(dr); } - defrtrlist_del(TAILQ_FIRST(&nd_defrouter), 1); + if (ifp == lo_ifp || + TAILQ_FIRST(&nd_defrouter)->ifp == ifp) + defrtrlist_del(TAILQ_FIRST(&nd_defrouter)); } lck_mtx_unlock(nd6_mutex); break; @@ -2183,8 +2432,9 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) struct in6_ndifreq_64 *ndif_64 = (struct in6_ndifreq_64 *)data; struct in6_ndifreq_32 *ndif_32 = (struct in6_ndifreq_32 *)data; - return (nd6_setdefaultiface(cmd == SIOCSDEFIFACE_IN6_64 ? - ndif_64->ifindex : ndif_32->ifindex)); + error = nd6_setdefaultiface(cmd == SIOCSDEFIFACE_IN6_64 ? 
+ ndif_64->ifindex : ndif_32->ifindex); + return (error); /* NOTREACHED */ } } @@ -2214,9 +2464,9 @@ nd6_cache_lladdr( int newstate = 0; struct timeval timenow; - if (!ifp) + if (ifp == NULL) panic("ifp == NULL in nd6_cache_lladdr"); - if (!from) + if (from == NULL) panic("from == NULL in nd6_cache_lladdr"); /* nothing must be updated for unspecified address */ @@ -2236,12 +2486,6 @@ nd6_cache_lladdr( rt = nd6_lookup(from, 0, ifp, 0); if (rt == NULL) { -#if 0 - /* nothing must be done if there's no lladdr */ - if (!lladdr || !lladdrlen) - return; -#endif - if ((rt = nd6_lookup(from, 1, ifp, 0)) == NULL) return; RT_LOCK_ASSERT_HELD(rt); @@ -2257,6 +2501,8 @@ nd6_cache_lladdr( is_newentry = 0; } + if (rt == NULL) + return; if ((rt->rt_flags & (RTF_GATEWAY | RTF_LLINFO)) != RTF_LLINFO) { fail: RT_UNLOCK(rt); @@ -2264,10 +2510,10 @@ nd6_cache_lladdr( rtfree(rt); return; } - ln = rt->rt_llinfo; - if (!ln) + ln = (struct llinfo_nd6 *)rt->rt_llinfo; + if (ln == NULL) goto fail; - if (!rt->rt_gateway) + if (rt->rt_gateway == NULL) goto fail; if (rt->rt_gateway->sa_family != AF_LINK) goto fail; @@ -2300,18 +2546,21 @@ nd6_cache_lladdr( */ sdl->sdl_alen = ifp->if_addrlen; bcopy(lladdr, LLADDR(sdl), ifp->if_addrlen); + + /* cache the gateway (sender HW) address */ + nd6_llreach_alloc(rt, ifp, LLADDR(sdl), sdl->sdl_alen, FALSE); } if (!is_newentry) { - if ((!olladdr && lladdr) /* (3) */ - || (olladdr && lladdr && llchange)) { /* (5) */ + if ((!olladdr && lladdr != NULL) || /* (3) */ + (olladdr && lladdr != NULL && llchange)) { /* (5) */ do_update = 1; newstate = ND6_LLINFO_STALE; } else /* (1-2,4) */ do_update = 0; } else { do_update = 1; - if (!lladdr) /* (6) */ + if (lladdr == NULL) /* (6) */ newstate = ND6_LLINFO_NOSTATE; else /* (7) */ newstate = ND6_LLINFO_STALE; @@ -2331,18 +2580,19 @@ nd6_cache_lladdr( * we must set the timer now, although it is actually * meaningless. 
*/ - ln->ln_expire = rt_expiry(rt, timenow.tv_sec, - nd6_gctimer); + ln->ln_expire = timenow.tv_sec + nd6_gctimer; ln->ln_hold = NULL; if (m != NULL) { + struct sockaddr_in6 sin6; + + rtkey_to_sa6(rt, &sin6); /* * we assume ifp is not a p2p here, so just * set the 2nd argument as the 1st one. */ RT_UNLOCK(rt); - nd6_output(ifp, ifp, m, - (struct sockaddr_in6 *)rt_key(rt), rt, 0); + nd6_output(ifp, ifp, m, &sin6, rt); RT_LOCK(rt); } } else if (ln->ln_state == ND6_LLINFO_INCOMPLETE) { @@ -2375,7 +2625,7 @@ nd6_cache_lladdr( * 0 n y -- (3) c s s * 0 y y n (4) c s s * 0 y y y (5) c s s - * 1 -- n -- (6) c c c s + * 1 -- n -- (6) c c c s * 1 -- y -- (7) c c s c s * * (c=clear s=set) @@ -2391,8 +2641,8 @@ nd6_cache_lladdr( case ND_REDIRECT: /* * If the icmp is a redirect to a better router, always set the - * is_router flag. Otherwise, if the entry is newly created, - * clear the flag. [RFC 2461, sec 8.3] + * is_router flag. Otherwise, if the entry is newly created, + * clear the flag. [RFC 2461, sec 8.3] */ if (code == ND_REDIRECT_ROUTER) ln->ln_router = 1; @@ -2409,8 +2659,8 @@ nd6_cache_lladdr( /* * Mark an entry with lladdr as a router. 
*/ - if ((!is_newentry && (olladdr || lladdr)) /* (2-5) */ - || (is_newentry && lladdr)) { /* (7) */ + if ((!is_newentry && (olladdr || lladdr)) || /* (2-5) */ + (is_newentry && lladdr)) { /* (7) */ ln->ln_router = 1; } break; @@ -2436,7 +2686,7 @@ nd6_cache_lladdr( RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); lck_mtx_lock(nd6_mutex); - defrouter_select(); + defrouter_select(ifp); lck_mtx_unlock(nd6_mutex); } else { RT_REMREF_LOCKED(rt); @@ -2475,7 +2725,7 @@ nd6_slowtimo( #define senderr(e) { error = (e); goto bad;} int nd6_output(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, - struct sockaddr_in6 *dst, struct rtentry *hint0, int locked) + struct sockaddr_in6 *dst, struct rtentry *hint0) { struct mbuf *m = m0; struct rtentry *rt = hint0, *hint = hint0; @@ -2520,14 +2770,14 @@ nd6_output(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, if (!(rt->rt_flags & RTF_UP)) { RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); - if ((hint = rt = rtalloc1((struct sockaddr *)dst, - 1, 0)) != NULL) { + if ((hint = rt = rtalloc1_scoped((struct sockaddr *)dst, + 1, 0, ifp->if_index)) != NULL) { RT_LOCK_SPIN(rt); if (rt->rt_ifp != ifp) { /* XXX: loop care? */ RT_UNLOCK(rt); error = nd6_output(ifp, origifp, m0, - dst, rt, locked); + dst, rt); rtfree(rt); return (error); } @@ -2541,7 +2791,7 @@ nd6_output(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, struct in6_ifaddr *ia6 = NULL; struct sockaddr_in6 gw6; - gw6 = *((struct sockaddr_in6 *)rt->rt_gateway); + rtgw_to_sa6(rt, &gw6); /* * Must drop rt_lock since nd6_is_addr_neighbor() * calls nd6_lookup() and acquires rnh_lock. @@ -2564,7 +2814,7 @@ nd6_output(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, * XXX: we may need a more generic rule here. 
*/ if (ia6 != NULL) - ifafree(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); if ((ifp->if_flags & IFF_POINTOPOINT) == 0) senderr(EHOSTUNREACH); goto sendpkt; @@ -2601,7 +2851,8 @@ nd6_output(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, RT_UNLOCK(rt); rtfree(gwrt); lookup: - gwrt = rtalloc1((struct sockaddr *)&gw6, 1, 0); + gwrt = rtalloc1_scoped((struct sockaddr *)&gw6, + 1, 0, ifp->if_index); RT_LOCK(rt); /* @@ -2680,6 +2931,12 @@ nd6_output(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, if (rt && (rt->rt_flags & RTF_LLINFO) != 0) { ln = rt->rt_llinfo; } else { + struct sockaddr_in6 sin6; + /* + * Clear out Scope ID field in case it is set. + */ + sin6 = *dst; + sin6.sin6_scope_id = 0; /* * Since nd6_is_addr_neighbor() internally calls nd6_lookup(), * the condition below is not very efficient. But we believe @@ -2689,7 +2946,7 @@ nd6_output(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, */ if (rt != NULL) RT_UNLOCK(rt); - if (nd6_is_addr_neighbor(dst, ifp, 0)) { + if (nd6_is_addr_neighbor(&sin6, ifp, 0)) { /* "rtrele" may have been used, so clean up "rt" now */ if (rt != NULL) { /* Don't free "hint0" */ @@ -2795,7 +3052,7 @@ nd6_output(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, lck_rw_done(nd_if_rwlock); RT_UNLOCK(rt); /* We still have a reference on rt (for ln) */ - nd6_ns_output(ifp, NULL, &dst->sin6_addr, ln, 0, locked); + nd6_ns_output(ifp, NULL, &dst->sin6_addr, ln, 0); } else { RT_UNLOCK(rt); } @@ -2828,19 +3085,20 @@ nd6_output(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, if (rt != NULL) RT_LOCK_ASSERT_NOTHELD(rt); - /* Clean up HW checksum flags before sending the packet */ - m->m_pkthdr.csum_data = 0; - m->m_pkthdr.csum_flags = 0; + /* discard the packet if IPv6 operation is disabled on the interface */ + lck_rw_lock_shared(nd_if_rwlock); + if ((nd_ifinfo[ifp->if_index].flags & ND6_IFF_IFDISABLED)) { + lck_rw_done(nd_if_rwlock); + error = ENETDOWN; /* better error? 
*/ + goto bad; + } + lck_rw_done(nd_if_rwlock); if ((ifp->if_flags & IFF_LOOPBACK) != 0) { /* forwarding rules require the original scope_id */ m->m_pkthdr.rcvif = origifp; - if (locked) - lck_mtx_unlock(ip6_mutex); error = dlil_output(origifp, PF_INET6, m, (caddr_t)rt, (struct sockaddr *)dst, 0); - if (locked) - lck_mtx_lock(ip6_mutex); goto release; } else { /* Do not allow loopback address to wind up on a wire */ @@ -2862,13 +3120,20 @@ nd6_output(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, } } + if (rt != NULL) { + RT_LOCK_SPIN(rt); + /* Mark use timestamp */ + if (rt->rt_llinfo != NULL) + nd6_llreach_use(rt->rt_llinfo); + RT_UNLOCK(rt); + } + + if (hint && nstat_collect) + nstat_route_tx(hint, 1, m->m_pkthdr.len, 0); + m->m_pkthdr.rcvif = NULL; - if (locked) - lck_mtx_unlock(ip6_mutex); error = dlil_output(ifp, PF_INET6, m, (caddr_t)rt, (struct sockaddr *)dst, 0); - if (locked) - lck_mtx_lock(ip6_mutex); goto release; bad: @@ -2923,8 +3188,13 @@ nd6_need_cache( #if IFT_IEEE80211 case IFT_IEEE80211: #endif - case IFT_BRIDGE: case IFT_GIF: /* XXX need more cases? 
*/ + case IFT_PPP: +#if IFT_TUNNEL + case IFT_TUNNEL: +#endif + case IFT_BRIDGE: + case IFT_CELLULAR: return(1); default: return(0); @@ -3110,6 +3380,8 @@ nd6_sysctl_drlist SYSCTL_HANDLER_ARGS "default router list (%s)\n", ip6_sprintf(&dr->rtaddr)); d->flags = dr->flags; + d->stateflags = dr->stateflags; + d->stateflags &= ~NDDRF_PROCESSED; d->rtlifetime = dr->rtlifetime; d->expire = dr->expire; d->if_index = dr->ifp->if_index; @@ -3140,6 +3412,8 @@ nd6_sysctl_drlist SYSCTL_HANDLER_ARGS "default router list (%s)\n", ip6_sprintf(&dr->rtaddr)); d_32->flags = dr->flags; + d_32->stateflags = dr->stateflags; + d_32->stateflags &= ~NDDRF_PROCESSED; d_32->rtlifetime = dr->rtlifetime; d_32->expire = dr->expire; d_32->if_index = dr->ifp->if_index; @@ -3184,6 +3458,7 @@ nd6_sysctl_prlist SYSCTL_HANDLER_ARGS bzero(p, sizeof (*p)); sin6 = (struct sockaddr_in6 *)(p + 1); + NDPR_LOCK(pr); p->prefix = pr->ndpr_prefix; if (in6_recoverscope(&p->prefix, &p->prefix.sin6_addr, pr->ndpr_ifp) != 0) @@ -3196,7 +3471,7 @@ nd6_sysctl_prlist SYSCTL_HANDLER_ARGS p->pltime = pr->ndpr_pltime; p->if_index = pr->ndpr_ifp->if_index; p->expire = pr->ndpr_expire; - p->refcnt = pr->ndpr_refcnt; + p->refcnt = pr->ndpr_addrcnt; p->flags = pr->ndpr_stateflags; p->origin = PR_ORIG_RA; advrtrs = 0; @@ -3222,6 +3497,7 @@ nd6_sysctl_prlist SYSCTL_HANDLER_ARGS advrtrs++; } p->advrtrs = advrtrs; + NDPR_UNLOCK(pr); } else { panic("buffer too short"); } @@ -3246,6 +3522,7 @@ nd6_sysctl_prlist SYSCTL_HANDLER_ARGS bzero(p_32, sizeof (*p_32)); sin6 = (struct sockaddr_in6 *)(p_32 + 1); + NDPR_LOCK(pr); p_32->prefix = pr->ndpr_prefix; if (in6_recoverscope(&p_32->prefix, &p_32->prefix.sin6_addr, pr->ndpr_ifp) != 0) @@ -3258,7 +3535,7 @@ nd6_sysctl_prlist SYSCTL_HANDLER_ARGS p_32->pltime = pr->ndpr_pltime; p_32->if_index = pr->ndpr_ifp->if_index; p_32->expire = pr->ndpr_expire; - p_32->refcnt = pr->ndpr_refcnt; + p_32->refcnt = pr->ndpr_addrcnt; p_32->flags = pr->ndpr_stateflags; p_32->origin = PR_ORIG_RA; advrtrs = 
0; @@ -3284,6 +3561,7 @@ nd6_sysctl_prlist SYSCTL_HANDLER_ARGS advrtrs++; } p_32->advrtrs = advrtrs; + NDPR_UNLOCK(pr); } else { panic("buffer too short"); } @@ -3297,7 +3575,7 @@ nd6_sysctl_prlist SYSCTL_HANDLER_ARGS return (error); } SYSCTL_PROC(_net_inet6_icmp6, ICMPV6CTL_ND6_DRLIST, nd6_drlist, - CTLFLAG_RD, 0, 0, nd6_sysctl_drlist, "S,in6_defrouter",""); + CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, nd6_sysctl_drlist, "S,in6_defrouter",""); SYSCTL_PROC(_net_inet6_icmp6, ICMPV6CTL_ND6_PRLIST, nd6_prlist, - CTLFLAG_RD, 0, 0, nd6_sysctl_prlist, "S,in6_defrouter",""); + CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, nd6_sysctl_prlist, "S,in6_defrouter",""); diff --git a/bsd/netinet6/nd6.h b/bsd/netinet6/nd6.h index 26fca3c11..601e075aa 100644 --- a/bsd/netinet6/nd6.h +++ b/bsd/netinet6/nd6.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -69,7 +69,7 @@ #include -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #include struct llinfo_nd6 { @@ -89,12 +89,14 @@ struct llinfo_nd6 { short ln_router; /* 2^0: ND6 router bit */ int ln_byhint; /* # of times we made it reachable by UL hint */ u_int32_t ln_flags; /* flags; see below */ + struct if_llreach *ln_llreach; /* link-layer reachability record */ + u_int64_t ln_lastused; /* last used timestamp */ }; /* Values for ln_flags */ #define ND6_LNF_TIMER_SKIP 0x1 /* modified by nd6_timer() */ #define ND6_LNF_IN_USE 0x2 /* currently in llinfo_nd6 list */ -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #define ND6_LLINFO_PURGE -3 #define ND6_LLINFO_NOSTATE -2 @@ -112,16 +114,27 @@ struct llinfo_nd6 { #define ND6_LLINFO_DELAY 3 #define ND6_LLINFO_PROBE 4 -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #define ND6_IS_LLINFO_PROBREACH(n) ((n)->ln_state > ND6_LLINFO_INCOMPLETE) -#endif /* KERNEL_PRIVATE */ - -#if !defined(KERNEL_PRIVATE) +#define ND6_LLINFO_PERMANENT(n) (((n)->ln_expire == 0) && 
((n)->ln_state > ND6_LLINFO_INCOMPLETE)) +#define ND6_IFF_PERFORMNUD 0x1 +#define ND6_IFF_ACCEPT_RTADV 0x2 /* APPLE: not used. Interface-specific router advertisements are + * handled with a specific ifnet flag: IFEF_ACCEPT_RTADVD + */ +#define ND6_IFF_PREFER_SOURCE 0x4 /* APPLE: NOT USED not related to ND. */ +#define ND6_IFF_IFDISABLED 0x8 /* IPv6 operation is disabled due to + * DAD failure. (XXX: not ND-specific) + */ +#define ND6_IFF_DONT_SET_IFROUTE 0x10 /* NOT USED */ + +#endif /* XNU_KERNEL_PRIVATE */ + +#if !defined(XNU_KERNEL_PRIVATE) struct nd_ifinfo { #else /* For binary compatibility, this structure must not change */ struct nd_ifinfo_compat { -#endif /* !KERNEL_PRIVATE */ +#endif /* !XNU_KERNEL_PRIVATE */ u_int32_t linkmtu; /* LinkMTU */ u_int32_t maxmtu; /* Upper bound of LinkMTU */ u_int32_t basereachable; /* BaseReachableTime */ @@ -137,7 +150,7 @@ struct nd_ifinfo_compat { u_int8_t randomid[8]; /* current random ID */ }; -#if defined(KERNEL_PRIVATE) +#if defined(XNU_KERNEL_PRIVATE) struct nd_ifinfo { u_int32_t linkmtu; /* LinkMTU */ u_int32_t maxmtu; /* Upper bound of LinkMTU */ @@ -147,7 +160,7 @@ struct nd_ifinfo { u_int32_t flags; /* Flags */ int recalctm; /* BaseReacable re-calculation timer */ u_int8_t chlim; /* CurHopLimit */ - u_int8_t receivedra; + u_int8_t initialized; /* Flag to see the entry is initialized */ /* the following 3 members are for privacy extension for addrconf */ u_int8_t randomseed0[8]; /* upper 64 bits of MD5 digest */ u_int8_t randomseed1[8]; /* lower 64 bits (usually the EUI64 IFID) */ @@ -156,7 +169,7 @@ struct nd_ifinfo { int32_t nprefixes; int32_t ndefrouters; }; -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #define ND6_IFF_PERFORMNUD 0x1 @@ -169,7 +182,7 @@ struct in6_nbrinfo { int expire; /* lifetime for NDP state transition */ }; -#if defined(KERNEL_PRIVATE) +#if defined(XNU_KERNEL_PRIVATE) struct in6_nbrinfo_32 { char ifname[IFNAMSIZ]; struct in6_addr addr; @@ -187,7 +200,7 @@ struct 
in6_nbrinfo_64 { int state; int expire; } __attribute__((aligned(8))); -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #define DRLSTSIZ 10 #define PRLSTSIZ 10 @@ -203,7 +216,7 @@ struct in6_drlist { } defrouter[DRLSTSIZ]; }; -#if defined(KERNEL_PRIVATE) +#if defined(XNU_KERNEL_PRIVATE) struct in6_drlist_32 { char ifname[IFNAMSIZ]; struct { @@ -225,20 +238,30 @@ struct in6_drlist_64 { u_short if_index __attribute__((aligned(8))); } defrouter[DRLSTSIZ] __attribute__((aligned(8))); }; -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ + +/* valid values for stateflags */ +#define NDDRF_INSTALLED 0x1 /* installed in the routing table */ +#define NDDRF_IFSCOPE 0x2 /* installed as a scoped route */ +#define NDDRF_STATIC 0x4 /* for internal use only */ +#ifdef XNU_KERNEL_PRIVATE +#define NDDRF_PROCESSED 0x10 +#endif struct in6_defrouter { struct sockaddr_in6 rtaddr; u_char flags; + u_char stateflags; u_short rtlifetime; u_long expire; u_short if_index; }; -#if defined(KERNEL_PRIVATE) +#if defined(XNU_KERNEL_PRIVATE) struct in6_defrouter_32 { struct sockaddr_in6 rtaddr; u_char flags; + u_char stateflags; u_short rtlifetime; u_int32_t expire; u_short if_index; @@ -247,11 +270,12 @@ struct in6_defrouter_32 { struct in6_defrouter_64 { struct sockaddr_in6 rtaddr; u_char flags; + u_char stateflags; u_short rtlifetime; u_long expire __attribute__((aligned(8))); u_short if_index __attribute__((aligned(8))); } __attribute__((aligned(8))); -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ struct in6_prlist { char ifname[IFNAMSIZ]; @@ -269,7 +293,7 @@ struct in6_prlist { } prefix[PRLSTSIZ]; }; -#if defined(KERNEL_PRIVATE) +#if defined(XNU_KERNEL_PRIVATE) struct in6_prlist_32 { char ifname[IFNAMSIZ]; struct { @@ -302,7 +326,7 @@ struct in6_prlist_64 { struct in6_addr advrtr[DRLSTSIZ]; } prefix[PRLSTSIZ]; }; -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ struct in6_prefix { struct sockaddr_in6 prefix; @@ -319,7 +343,7 @@ struct 
in6_prefix { /* struct sockaddr_in6 advrtr[] */ }; -#if defined(KERNEL_PRIVATE) +#if defined(XNU_KERNEL_PRIVATE) struct in6_prefix_32 { struct sockaddr_in6 prefix; struct prf_ra raflags; @@ -331,7 +355,7 @@ struct in6_prefix_32 { u_int32_t flags; int refcnt; u_short if_index; - u_short advrtrs; + u_short advrtrs; /* number of advertisement routers */ /* struct sockaddr_in6 advrtr[] */ }; @@ -349,7 +373,7 @@ struct in6_prefix_64 { u_short advrtrs; /* struct sockaddr_in6 advrtr[] */ }; -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ struct in6_ondireq { char ifname[IFNAMSIZ]; @@ -366,7 +390,7 @@ struct in6_ondireq { } ndi; }; -#if !defined(KERNEL_PRIVATE) +#if !defined(XNU_KERNEL_PRIVATE) struct in6_ndireq { char ifname[IFNAMSIZ]; struct nd_ifinfo ndi; @@ -376,14 +400,17 @@ struct in6_ndireq { char ifname[IFNAMSIZ]; struct nd_ifinfo_compat ndi; }; -#endif /* !KERNEL_PRIVATE */ +#endif /* !XNU_KERNEL_PRIVATE */ struct in6_ndifreq { char ifname[IFNAMSIZ]; u_long ifindex; }; -#if defined(KERNEL_PRIVATE) +#define MAX_RTR_SOLICITATION_DELAY 1 /* 1sec */ +#define RTR_SOLICITATION_INTERVAL 4 /* 4sec */ + +#if defined(XNU_KERNEL_PRIVATE) struct in6_ndifreq_32 { char ifname[IFNAMSIZ]; u_int32_t ifindex; @@ -393,11 +420,16 @@ struct in6_ndifreq_64 { char ifname[IFNAMSIZ]; u_long ifindex __attribute__((aligned(8))); }; -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ /* Prefix status */ #define NDPRF_ONLINK 0x1 #define NDPRF_DETACHED 0x2 +#define NDPRF_STATIC 0x100 +#define NDPRF_IFSCOPE 0x1000 +#ifdef XNU_KERNEL_PRIVATE +#define NDPRF_PROCESSED 0x08000 +#endif /* protocol constants */ #define MAX_RTR_SOLICITATION_DELAY 1 /*1sec*/ @@ -405,8 +437,9 @@ struct in6_ndifreq_64 { #define MAX_RTR_SOLICITATIONS 3 #define ND6_INFINITE_LIFETIME 0xffffffff +#define ND6_MAX_LIFETIME 0x7fffffff -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE /* * Protects nd_ifinfo[] */ @@ -448,18 +481,59 @@ __private_extern__ lck_rw_t *nd_if_rwlock; TAILQ_HEAD(nd_drhead, 
nd_defrouter); struct nd_defrouter { + decl_lck_mtx_data(, nddr_lock); + uint32_t nddr_refcount; + uint32_t nddr_debug; TAILQ_ENTRY(nd_defrouter) dr_entry; - struct in6_addr rtaddr; - u_char flags; /* flags on RA message */ - u_short rtlifetime; + struct in6_addr rtaddr; + u_char flags; /* flags on RA message */ + u_char stateflags; + u_short rtlifetime; u_int32_t expire; - u_int32_t advint; /* Mobile IPv6 addition (milliseconds) */ - u_int32_t advint_expire; /* Mobile IPv6 addition */ - int advints_lost; /* Mobile IPv6 addition */ - struct ifnet *ifp; + struct ifnet *ifp; + unsigned int genid; + int err; + void (*nddr_trace) /* callback fn for tracing refs */ + (struct nd_defrouter *, int); }; +#define NDDR_LOCK_ASSERT_HELD(_nddr) \ + lck_mtx_assert(&(_nddr)->nddr_lock, LCK_MTX_ASSERT_OWNED) + +#define NDDR_LOCK_ASSERT_NOTHELD(_nddr) \ + lck_mtx_assert(&(_nddr)->nddr_lock, LCK_MTX_ASSERT_NOTOWNED) + +#define NDDR_LOCK(_nddr) \ + lck_mtx_lock(&(_nddr)->nddr_lock) + +#define NDDR_LOCK_SPIN(_nddr) \ + lck_mtx_lock_spin(&(_nddr)->nddr_lock) + +#define NDDR_CONVERT_LOCK(_nddr) do { \ + NDDR_LOCK_ASSERT_HELD(_nddr); /* caller must own nddr_lock */ \ + lck_mtx_convert_spin(&(_nddr)->nddr_lock); \ +} while (0) + +#define NDDR_UNLOCK(_nddr) \ + lck_mtx_unlock(&(_nddr)->nddr_lock) + +#define NDDR_ADDREF(_nddr) \ + nddr_addref(_nddr, 0) + +#define NDDR_ADDREF_LOCKED(_nddr) \ + nddr_addref(_nddr, 1) + +#define NDDR_REMREF(_nddr) do { \ + (void) nddr_remref(_nddr, 0); \ +} while (0) + +#define NDDR_REMREF_LOCKED(_nddr) \ + nddr_remref(_nddr, 1) + struct nd_prefix { + decl_lck_mtx_data(, ndpr_lock); + u_int32_t ndpr_refcount; /* reference count */ + u_int32_t ndpr_debug; /* see ifa_debug flags */ struct ifnet *ndpr_ifp; LIST_ENTRY(nd_prefix) ndpr_entry; struct sockaddr_in6 ndpr_prefix; /* prefix */ @@ -467,15 +541,17 @@ struct nd_prefix { struct in6_addr ndpr_addr; /* address that is derived from the prefix */ u_int32_t ndpr_vltime; /* advertised valid lifetime */ u_int32_t ndpr_pltime; /* advertised 
preferred lifetime */ - time_t ndpr_expire; /* expiration time of the prefix */ time_t ndpr_preferred; /* preferred time of the prefix */ + time_t ndpr_expire; /* expiration time of the prefix */ + time_t ndpr_lastupdate; /* reception time of last advertisement */ struct prf_ra ndpr_flags; u_int32_t ndpr_stateflags; /* actual state flags */ /* list of routers that advertise the prefix: */ LIST_HEAD(pr_rtrhead, nd_pfxrouter) ndpr_advrtrs; u_char ndpr_plen; - int ndpr_refcnt; /* reference counter from addresses */ - int ndpr_usecnt; /* actual use count; prevents free */ + int ndpr_addrcnt; /* reference counter from addresses */ + void (*ndpr_trace) /* callback fn for tracing refs */ + (struct nd_prefix *, int); }; #define ndpr_next ndpr_entry.le_next @@ -483,13 +559,46 @@ struct nd_prefix { #define ndpr_raf ndpr_flags #define ndpr_raf_onlink ndpr_flags.onlink #define ndpr_raf_auto ndpr_flags.autonomous - +#define ndpr_raf_router ndpr_flags.router /* * We keep expired prefix for certain amount of time, for validation purposes. 
* 1800s = MaxRtrAdvInterval */ #define NDPR_KEEP_EXPIRED (1800 * 2) +#define NDPR_LOCK_ASSERT_HELD(_ndpr) \ + lck_mtx_assert(&(_ndpr)->ndpr_lock, LCK_MTX_ASSERT_OWNED) + +#define NDPR_LOCK_ASSERT_NOTHELD(_ndpr) \ + lck_mtx_assert(&(_ndpr)->ndpr_lock, LCK_MTX_ASSERT_NOTOWNED) + +#define NDPR_LOCK(_ndpr) \ + lck_mtx_lock(&(_ndpr)->ndpr_lock) + +#define NDPR_LOCK_SPIN(_ndpr) \ + lck_mtx_lock_spin(&(_ndpr)->ndpr_lock) + +#define NDPR_CONVERT_LOCK(_ndpr) do { \ + NDPR_LOCK_ASSERT_HELD(_ndpr); \ + lck_mtx_convert_spin(&(_ndpr)->ndpr_lock); \ +} while (0) + +#define NDPR_UNLOCK(_ndpr) \ + lck_mtx_unlock(&(_ndpr)->ndpr_lock) + +#define NDPR_ADDREF(_ndpr) \ + ndpr_addref(_ndpr, 0) + +#define NDPR_ADDREF_LOCKED(_ndpr) \ + ndpr_addref(_ndpr, 1) + +#define NDPR_REMREF(_ndpr) do { \ + (void) ndpr_remref(_ndpr, 0); \ +} while (0) + +#define NDPR_REMREF_LOCKED(_ndpr) \ + ndpr_remref(_ndpr, 1) + /* * Message format for use in obtaining information about prefixes * from inet6 sysctl function @@ -533,6 +642,7 @@ extern int nd6_delay; extern int nd6_umaxtries; extern int nd6_mmaxtries; extern int nd6_useloopback; +extern int nd6_accept_6to4; extern int nd6_maxnudhint; extern int nd6_gctimer; extern struct llinfo_nd6 llinfo_nd6; @@ -541,20 +651,21 @@ extern struct nd_drhead nd_defrouter; extern struct nd_prhead nd_prefix; extern int nd6_debug; extern size_t nd_ifinfo_indexlim; +extern int nd6_onlink_ns_rfc4861; -#define nd6log(x) do { if (nd6_debug) log x; } while (0) - -extern struct callout nd6_timer_ch; +#define nd6log(x) do { if (nd6_debug >= 1) log x; } while (0) +#define nd6log2(x) do { if (nd6_debug >= 2) log x; } while (0) /* nd6_rtr.c */ extern int nd6_defifindex; extern int ip6_desync_factor; /* seconds */ +/* ND6_INFINITE_LIFETIME does not apply to temporary addresses */ extern u_int32_t ip6_temp_preferred_lifetime; /* seconds */ extern u_int32_t ip6_temp_valid_lifetime; /* seconds */ extern int ip6_temp_regen_advance; /* seconds */ union nd_opts { - struct nd_opt_hdr 
*nd_opt_array[9]; /*max = home agent info*/ + struct nd_opt_hdr *nd_opt_array[8]; /* max = target address list */ struct { struct nd_opt_hdr *zero; struct nd_opt_hdr *src_lladdr; @@ -562,9 +673,6 @@ union nd_opts { struct nd_opt_prefix_info *pi_beg; /* multiple opts, start */ struct nd_opt_rd_hdr *rh; struct nd_opt_mtu *mtu; - struct nd_opt_hdr *six; - struct nd_opt_advint *adv; - struct nd_opt_hai *hai; struct nd_opt_hdr *search; /* multiple opts */ struct nd_opt_hdr *last; /* multiple opts */ int done; @@ -577,8 +685,6 @@ union nd_opts { #define nd_opts_pi_end nd_opt_each.pi_end #define nd_opts_rh nd_opt_each.rh #define nd_opts_mtu nd_opt_each.mtu -#define nd_opts_adv nd_opt_each.adv -#define nd_opts_hai nd_opt_each.hai #define nd_opts_search nd_opt_each.search #define nd_opts_last nd_opt_each.last #define nd_opts_done nd_opt_each.done @@ -604,49 +710,62 @@ extern int nd6_ioctl(u_long, caddr_t, struct ifnet *); extern void nd6_cache_lladdr(struct ifnet *, struct in6_addr *, char *, int, int, int); extern int nd6_output(struct ifnet *, struct ifnet *, struct mbuf *, - struct sockaddr_in6 *, struct rtentry *, int); + struct sockaddr_in6 *, struct rtentry *); extern int nd6_storelladdr(struct ifnet *, struct rtentry *, struct mbuf *, struct sockaddr *, u_char *); extern int nd6_need_cache(struct ifnet *); extern void nd6_drain(void *); /* nd6_nbr.c */ +extern void nd6_nbr_init(void); extern void nd6_na_input(struct mbuf *, int, int); extern void nd6_na_output(struct ifnet *, const struct in6_addr *, const struct in6_addr *, u_int32_t, int, struct sockaddr *); extern void nd6_ns_input(struct mbuf *, int, int); extern void nd6_ns_output(struct ifnet *, const struct in6_addr *, - const struct in6_addr *, struct llinfo_nd6 *, int, int); + const struct in6_addr *, struct llinfo_nd6 *, int); extern caddr_t nd6_ifptomac(struct ifnet *); extern void nd6_dad_start(struct ifaddr *, int *); extern void nd6_dad_stop(struct ifaddr *); -extern void nd6_dad_duplicated(struct 
ifaddr *); +extern void nd6_dad_duplicated(struct ifaddr *, boolean_t); +extern void nd6_llreach_alloc(struct rtentry *, struct ifnet *, void *, + unsigned int, boolean_t); +extern void nd6_llreach_set_reachable(struct ifnet *, void *, unsigned int); +extern void nd6_llreach_use(struct llinfo_nd6 *); /* nd6_rtr.c */ +extern void nd6_rtr_init(void); extern void nd6_rs_input(struct mbuf *, int, int); extern void nd6_ra_input(struct mbuf *, int, int); extern void prelist_del(struct nd_prefix *); -extern void defrouter_addreq(struct nd_defrouter *); -extern void defrouter_delreq(struct nd_defrouter *, int); -extern void defrouter_select(void); -extern void defrtrlist_del(struct nd_defrouter *, int); -extern void prelist_remove(struct nd_prefix *, int); +extern void defrouter_addreq(struct nd_defrouter *, boolean_t); +extern void defrouter_delreq(struct nd_defrouter *); +extern void defrouter_select(struct ifnet *); +extern void defrouter_reset(void); +extern int defrtrlist_ioctl(u_long, caddr_t); +extern void defrtrlist_del(struct nd_defrouter *); +extern int defrtrlist_add_static(struct nd_defrouter *); +extern int defrtrlist_del_static(struct nd_defrouter *); +extern void prelist_remove(struct nd_prefix *); extern int prelist_update(struct nd_prefix *, struct nd_defrouter *, - struct mbuf *); + struct mbuf *, int); extern int nd6_prelist_add(struct nd_prefix *, struct nd_defrouter *, - struct nd_prefix **); -extern int nd6_prefix_onlink(struct nd_prefix *, int, int); + struct nd_prefix **, boolean_t); +extern int nd6_prefix_onlink(struct nd_prefix *); +extern int nd6_prefix_onlink_scoped(struct nd_prefix *, unsigned int); extern int nd6_prefix_offlink(struct nd_prefix *); -extern void pfxlist_onlink_check(int); +extern void pfxlist_onlink_check(void); extern struct nd_defrouter *defrouter_lookup(struct in6_addr *, struct ifnet *); extern struct nd_prefix *nd6_prefix_lookup(struct nd_prefix *); extern int in6_init_prefix_ltimes(struct nd_prefix *ndpr); extern void 
rt6_flush(struct in6_addr *, struct ifnet *); extern int nd6_setdefaultiface(int); extern int in6_tmpifadd(const struct in6_ifaddr *, int, int); -extern void ndpr_hold(struct nd_prefix *, boolean_t); -extern void ndpr_rele(struct nd_prefix *, boolean_t); -#endif /* KERNEL_PRIVATE */ +extern void nddr_addref(struct nd_defrouter *, int); +extern struct nd_defrouter *nddr_remref(struct nd_defrouter *, int); +extern void ndpr_addref(struct nd_prefix *, int); +extern struct nd_prefix *ndpr_remref(struct nd_prefix *, int); +#endif /* XNU_KERNEL_PRIVATE */ #ifdef KERNEL diff --git a/bsd/netinet6/nd6_nbr.c b/bsd/netinet6/nd6_nbr.c index 5b0d744a6..b2abd8169 100644 --- a/bsd/netinet6/nd6_nbr.c +++ b/bsd/netinet6/nd6_nbr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -67,11 +67,19 @@ #include #include #include +#include +#include +#include #include +#include +#include + #include +#include #include #include +#include #include #include @@ -80,6 +88,7 @@ #include #include #include +#include #include #if IPSEC @@ -96,27 +105,147 @@ extern int ipsec_bypass; struct dadq; static struct dadq *nd6_dad_find(struct ifaddr *); -#ifndef __APPLE__ -static void nd6_dad_starttimer(struct dadq *, int); -static void nd6_dad_stoptimer(struct dadq *); -#else void nd6_dad_stoptimer(struct ifaddr *); -#endif static void nd6_dad_timer(struct ifaddr *); static void nd6_dad_ns_output(struct dadq *, struct ifaddr *); static void nd6_dad_ns_input(struct ifaddr *); -static void nd6_dad_na_input(struct ifaddr *); +static void nd6_dad_na_input(struct ifaddr *, caddr_t, int); +static void dad_addref(struct dadq *, int); +static void dad_remref(struct dadq *); +static struct dadq *nd6_dad_attach(struct dadq *, struct ifaddr *); +static void nd6_dad_detach(struct dadq *, struct ifaddr *); static int dad_ignore_ns = 0; /* ignore NS in DAD - specwise incorrect*/ static int 
dad_maxtry = 15; /* max # of *tries* to transmit DAD packet */ +static unsigned int dad_size; /* size of zone element */ +static struct zone *dad_zone; /* zone for dadq */ + +#define DAD_ZONE_MAX 64 /* maximum elements in zone */ +#define DAD_ZONE_NAME "nd6_dad" /* zone name */ + +#define DAD_LOCK_ASSERT_HELD(_dp) \ + lck_mtx_assert(&(_dp)->dad_lock, LCK_MTX_ASSERT_OWNED) + +#define DAD_LOCK_ASSERT_NOTHELD(_dp) \ + lck_mtx_assert(&(_dp)->dad_lock, LCK_MTX_ASSERT_NOTOWNED) + +#define DAD_LOCK(_dp) \ + lck_mtx_lock(&(_dp)->dad_lock) + +#define DAD_LOCK_SPIN(_dp) \ + lck_mtx_lock_spin(&(_dp)->dad_lock) + +#define DAD_CONVERT_LOCK(_dp) do { \ + DAD_LOCK_ASSERT_HELD(_dp); \ + lck_mtx_convert_spin(&(_dp)->dad_lock); \ +} while (0) + +#define DAD_UNLOCK(_dp) \ + lck_mtx_unlock(&(_dp)->dad_lock) + +#define DAD_ADDREF(_dp) \ + dad_addref(_dp, 0) + +#define DAD_ADDREF_LOCKED(_dp) \ + dad_addref(_dp, 1) + +#define DAD_REMREF(_dp) \ + dad_remref(_dp) + extern lck_mtx_t *dad6_mutex; extern lck_mtx_t *nd6_mutex; +extern int in6_get_hw_ifid(struct ifnet *, struct in6_addr *); + +static int nd6_llreach_base = (LL_BASE_REACHABLE / 1000); /* seconds */ + +SYSCTL_DECL(_net_inet6_icmp6); + +SYSCTL_INT(_net_inet6_icmp6, OID_AUTO, nd6_llreach_base, + CTLFLAG_RW | CTLFLAG_LOCKED, &nd6_llreach_base, LL_BASE_REACHABLE, + "default ND6 link-layer reachability max lifetime (in seconds)"); + +#define SIN6(s) ((struct sockaddr_in6 *)s) + /* - * Input an Neighbor Solicitation Message. + * Obtain a link-layer source cache entry for the sender. + * + * NOTE: This is currently only for ND6/Ethernet. 
+ */ +void +nd6_llreach_alloc(struct rtentry *rt, struct ifnet *ifp, void *addr, + unsigned int alen, boolean_t solicited) +{ + struct llinfo_nd6 *ln = rt->rt_llinfo; + + if (nd6_llreach_base != 0 && + ln->ln_expire != 0 && rt->rt_ifp != lo_ifp && + ifp->if_addrlen == IF_LLREACH_MAXLEN && /* Ethernet */ + alen == ifp->if_addrlen) { + struct if_llreach *lr; + const char *why = NULL, *type = ""; + + /* Become a regular mutex, just in case */ + RT_CONVERT_LOCK(rt); + + if ((lr = ln->ln_llreach) != NULL) { + type = (solicited ? "ND6 advertisement" : + "ND6 unsolicited announcement"); + /* + * If target has changed, create a new record; + * otherwise keep existing record. + */ + IFLR_LOCK(lr); + if (bcmp(addr, lr->lr_key.addr, alen) != 0) { + IFLR_UNLOCK(lr); + /* Purge any link-layer info caching */ + VERIFY(rt->rt_llinfo_purge != NULL); + rt->rt_llinfo_purge(rt); + lr = NULL; + why = " for different target HW address; " + "using new llreach record"; + } else { + lr->lr_probes = 0; /* reset probe count */ + IFLR_UNLOCK(lr); + if (solicited) { + why = " for same target HW address; " + "keeping existing llreach record"; + } + } + } + + if (lr == NULL) { + lr = ln->ln_llreach = ifnet_llreach_alloc(ifp, + ETHERTYPE_IPV6, addr, alen, nd6_llreach_base); + if (lr != NULL) { + lr->lr_probes = 0; /* reset probe count */ + if (why == NULL) + why = "creating new llreach record"; + } + } + + if (nd6_debug && lr != NULL && why != NULL) { + char tmp[MAX_IPv6_STR_LEN]; + + nd6log((LOG_DEBUG, "%s%d: %s%s for %s\n", ifp->if_name, + ifp->if_unit, type, why, inet_ntop(AF_INET6, + &SIN6(rt_key(rt))->sin6_addr, tmp, sizeof (tmp)))); + } + } +} + +void +nd6_llreach_use(struct llinfo_nd6 *ln) +{ + if (ln->ln_llreach != NULL) + ln->ln_lastused = net_uptime(); +} + +/* + * Input a Neighbor Solicitation Message. 
* * Based on RFC 2461 - * Based on RFC 2462 (duplicated address detection) + * Based on RFC 2462 (duplicate address detection) */ void nd6_ns_input( @@ -151,6 +280,8 @@ nd6_ns_input( #endif ip6 = mtod(m, struct ip6_hdr *); /* adjust pointer for safety */ taddr6 = nd_ns->nd_ns_target; + if (in6_setscope(&taddr6, ifp, NULL) != 0) + goto bad; if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, @@ -161,18 +292,36 @@ nd6_ns_input( } if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) { - /* dst has to be solicited node multicast address. */ - if (daddr6.s6_addr16[0] == IPV6_ADDR_INT16_MLL + /* dst has to be a solicited node multicast address. */ + if (daddr6.s6_addr16[0] == IPV6_ADDR_INT16_MLL && /* don't check ifindex portion */ - && daddr6.s6_addr32[1] == 0 - && daddr6.s6_addr32[2] == IPV6_ADDR_INT32_ONE - && daddr6.s6_addr8[12] == 0xff) { + daddr6.s6_addr32[1] == 0 && + daddr6.s6_addr32[2] == IPV6_ADDR_INT32_ONE && + daddr6.s6_addr8[12] == 0xff) { ; /* good */ } else { nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet " "(wrong ip6 dst)\n")); goto bad; } + } else if (!nd6_onlink_ns_rfc4861) { + struct sockaddr_in6 src_sa6; + + /* + * According to recent IETF discussions, it is not a good idea + * to accept a NS from an address which would not be deemed + * to be a neighbor otherwise. This point is expected to be + * clarified in future revisions of the specification. 
+ */ + bzero(&src_sa6, sizeof(src_sa6)); + src_sa6.sin6_family = AF_INET6; + src_sa6.sin6_len = sizeof(src_sa6); + src_sa6.sin6_addr = saddr6; + if (!nd6_is_addr_neighbor(&src_sa6, ifp, 0)) { + nd6log((LOG_INFO, "nd6_ns_input: " + "NS packet from non-neighbor\n")); + goto bad; + } } if (IN6_IS_ADDR_MULTICAST(&taddr6)) { @@ -180,9 +329,6 @@ nd6_ns_input( goto bad; } - if (IN6_IS_SCOPE_LINKLOCAL(&taddr6)) - taddr6.s6_addr16[1] = htons(ifp->if_index); - icmp6len -= sizeof(*nd_ns); nd6_option_init(nd_ns + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { @@ -196,7 +342,7 @@ nd6_ns_input( lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1); lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3; } - + if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) && lladdr) { nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet " "(link-layer address option)\n")); @@ -213,12 +359,6 @@ nd6_ns_input( * In implementation, we add target link-layer address by default. * We do not add one in MUST NOT cases. */ -#if 0 /* too much! */ - ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &daddr6); - if (ifa && (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST)) - tlladdr = 0; - else -#endif if (!IN6_IS_ADDR_MULTICAST(&daddr6)) tlladdr = 0; else @@ -234,16 +374,18 @@ nd6_ns_input( ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6); /* (2) check. */ - if (!ifa) { + if (ifa == NULL) { struct rtentry *rt; struct sockaddr_in6 tsin6; - bzero(&tsin6, sizeof tsin6); + bzero(&tsin6, sizeof tsin6); tsin6.sin6_len = sizeof(struct sockaddr_in6); tsin6.sin6_family = AF_INET6; tsin6.sin6_addr = taddr6; - rt = rtalloc1((struct sockaddr *)&tsin6, 0, 0); + rt = rtalloc1_scoped((struct sockaddr *)&tsin6, 0, 0, + ifp->if_index); + if (rt != NULL) { RT_LOCK(rt); if ((rt->rt_flags & RTF_ANNOUNCE) != 0 && @@ -262,7 +404,7 @@ nd6_ns_input( rtfree(rt); } } - if (!ifa) { + if (ifa == NULL) { /* * We've got an NS packet, and we don't have that adddress * assigned for us. We MUST silently ignore it. 
@@ -270,11 +412,15 @@ nd6_ns_input( */ goto freeit; } + IFA_LOCK(ifa); myaddr6 = *IFA_IN6(ifa); anycast = ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST; tentative = ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE; - if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DUPLICATED) + if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DUPLICATED) { + IFA_UNLOCK(ifa); goto freeit; + } + IFA_UNLOCK(ifa); if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, @@ -306,7 +452,7 @@ nd6_ns_input( if (tentative) { /* * If source address is unspecified address, it is for - * duplicated address detection. + * duplicate address detection. * * If not, the packet is for addess resolution; * silently ignore it. @@ -327,7 +473,8 @@ nd6_ns_input( */ if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) { saddr6 = in6addr_linklocal_allnodes; - saddr6.s6_addr16[1] = htons(ifp->if_index); + if (in6_setscope(&saddr6, ifp, NULL) != 0) + goto bad; nd6_na_output(ifp, &saddr6, &taddr6, ((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) @@ -346,7 +493,7 @@ nd6_ns_input( freeit: m_freem(m); if (ifa != NULL) - ifafree(ifa); + IFA_REMREF(ifa); return; bad: @@ -356,17 +503,17 @@ nd6_ns_input( icmp6stat.icp6s_badns++; m_freem(m); if (ifa != NULL) - ifafree(ifa); + IFA_REMREF(ifa); } /* - * Output an Neighbor Solicitation Message. Caller specifies: + * Output a Neighbor Solicitation Message. Caller specifies: * - ICMP6 header source IP6 address * - ND6 header target IP6 address * - ND6 header source datalink address * * Based on RFC 2461 - * Based on RFC 2462 (duplicated address detection) + * Based on RFC 2462 (duplicate address detection) * * Caller must bump up ln->ln_rt refcnt to make sure 'ln' doesn't go * away if there is a llinfo_nd6 passed in. 
@@ -377,22 +524,29 @@ nd6_ns_output( const struct in6_addr *daddr6, const struct in6_addr *taddr6, struct llinfo_nd6 *ln, /* for source address determination */ - int dad, /* duplicated address detection */ - int locked) + int dad) /* duplicated address detection */ { struct mbuf *m; struct ip6_hdr *ip6; struct nd_neighbor_solicit *nd_ns; struct in6_ifaddr *ia = NULL; - struct ip6_moptions im6o; + struct in6_addr *src, src_in, src_storage; + struct ip6_moptions *im6o = NULL; + struct ifnet *outif = NULL; int icmp6len; int maxlen; + int flags; caddr_t mac; - struct ifnet *outif = NULL; - + struct route_in6 ro; + struct ip6_out_args ip6oa = { IFSCOPE_NONE, 0 }; + + bzero(&ro, sizeof(ro)); + if (IN6_IS_ADDR_MULTICAST(taddr6)) return; + ip6oa.ip6oa_boundif = ifp->if_index; + /* estimate the size of message */ maxlen = sizeof(*ip6) + sizeof(*nd_ns); maxlen += (sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7; @@ -418,9 +572,16 @@ nd6_ns_output( if (daddr6 == NULL || IN6_IS_ADDR_MULTICAST(daddr6)) { m->m_flags |= M_MCAST; - im6o.im6o_multicast_ifp = ifp; - im6o.im6o_multicast_hlim = 255; - im6o.im6o_multicast_loop = 0; + + im6o = ip6_allocmoptions(M_DONTWAIT); + if (im6o == NULL) { + m_freem(m); + return; + } + + im6o->im6o_multicast_ifp = ifp; + im6o->im6o_multicast_hlim = 255; + im6o->im6o_multicast_loop = 0; } icmp6len = sizeof(*nd_ns); @@ -439,27 +600,15 @@ nd6_ns_output( ip6->ip6_dst = *daddr6; else { ip6->ip6_dst.s6_addr16[0] = IPV6_ADDR_INT16_MLL; - ip6->ip6_dst.s6_addr16[1] = htons(ifp->if_index); + ip6->ip6_dst.s6_addr16[1] = 0; ip6->ip6_dst.s6_addr32[1] = 0; ip6->ip6_dst.s6_addr32[2] = IPV6_ADDR_INT32_ONE; ip6->ip6_dst.s6_addr32[3] = taddr6->s6_addr32[3]; ip6->ip6_dst.s6_addr8[12] = 0xff; + if (in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0) + goto bad; } if (!dad) { -#if 0 /* KAME way, exact address scope match */ - /* - * Select a source whose scope is the same as that of the dest. - * Typically, the dest is link-local solicitation multicast - * (i.e. 
neighbor discovery) or link-local/global unicast - * (i.e. neighbor un-reachability detection). - */ - ia = in6_ifawithifp(ifp, &ip6->ip6_dst); - if (ia == NULL) { - m_freem(m); - return; - } - ip6->ip6_src = ia->ia_addr.sin6_addr; -#else /* spec-wise correct */ /* * RFC2461 7.2.2: * "If the source address of the packet prompting the @@ -473,72 +622,82 @@ nd6_ns_output( * (saddr6), if: * - saddr6 is given from the caller (by giving "ln"), and * - saddr6 belongs to the outgoing interface. - * Otherwise, we perform a scope-wise match. + * Otherwise, we perform the source address selection as usual. */ - struct ip6_hdr *hip6 = NULL; /* hold ip6 */ - struct in6_addr saddr6; + struct ip6_hdr *hip6; /* hold ip6 */ + struct in6_addr *hsrc = NULL; /* Caller holds ref on this route */ if (ln != NULL) { RT_LOCK(ln->ln_rt); + /* + * assuming every packet in ln_hold has the same IP + * header + */ if (ln->ln_hold != NULL) { hip6 = mtod(ln->ln_hold, struct ip6_hdr *); /* XXX pullup? */ if (sizeof (*hip6) < ln->ln_hold->m_len) - saddr6 = hip6->ip6_src; + hsrc = &hip6->ip6_src; else - hip6 = NULL; + hsrc = NULL; + } + /* Update probe count, if applicable */ + if (ln->ln_llreach != NULL) { + IFLR_LOCK_SPIN(ln->ln_llreach); + ln->ln_llreach->lr_probes++; + IFLR_UNLOCK(ln->ln_llreach); } - /* - * hip6 is used only to indicate whether or - * not there is a valid source address from - * the held packet in ln_hold. For obvious - * reasons we should not dereference it after - * releasing the lock though we can simply - * test if it's non-NULL. 
- */ RT_UNLOCK(ln->ln_rt); - } - if (ia != NULL) - ifafree(&ia->ia_ifa); - if (hip6 != NULL && (ia = in6ifa_ifpwithaddr(ifp, &saddr6))) { - bcopy(&saddr6, &ip6->ip6_src, sizeof (saddr6)); - } else { - ia = in6_ifawithifp(ifp, &ip6->ip6_dst); - if (ia == NULL) { - if (ln != NULL) { - RT_LOCK(ln->ln_rt); - if (ln->ln_hold != NULL) - m_freem(ln->ln_hold); - ln->ln_hold = NULL; - RT_UNLOCK(ln->ln_rt); - } - m_freem(m); - return; - } - ip6->ip6_src = ia->ia_addr.sin6_addr; } if (ia != NULL) { - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); ia = NULL; } -#endif + if (hsrc != NULL && (ia = in6ifa_ifpwithaddr(ifp, hsrc))) { + src = hsrc; + IFA_REMREF(&ia->ia_ifa); + ia = NULL; + } else { + int error; + struct sockaddr_in6 dst_sa; + + bzero(&dst_sa, sizeof(dst_sa)); + dst_sa.sin6_family = AF_INET6; + dst_sa.sin6_len = sizeof(dst_sa); + dst_sa.sin6_addr = ip6->ip6_dst; + + src = in6_selectsrc(&dst_sa, NULL, + NULL, &ro, NULL, &src_storage, ip6oa.ip6oa_boundif, + &error); + if (src == NULL) { + nd6log((LOG_DEBUG, + "nd6_ns_output: source can't be " + "determined: dst=%s, error=%d\n", + ip6_sprintf(&dst_sa.sin6_addr), + error)); + goto bad; + } + } } else { /* * Source address for DAD packet must always be IPv6 * unspecified address. (0::0) + * We actually don't have to 0-clear the address (we did it + * above), but we do so here explicitly to make the intention + * clearer. */ - bzero(&ip6->ip6_src, sizeof(ip6->ip6_src)); + bzero(&src_in, sizeof(src_in)); + src = &src_in; } + ip6->ip6_src = *src; nd_ns = (struct nd_neighbor_solicit *)(ip6 + 1); nd_ns->nd_ns_type = ND_NEIGHBOR_SOLICIT; nd_ns->nd_ns_code = 0; nd_ns->nd_ns_reserved = 0; nd_ns->nd_ns_target = *taddr6; - - if (IN6_IS_SCOPE_LINKLOCAL(&nd_ns->nd_ns_target)) - nd_ns->nd_ns_target.s6_addr16[1] = 0; + in6_clearscope(&nd_ns->nd_ns_target); /* XXX */ /* * Add source link-layer address option. 
@@ -577,19 +736,43 @@ nd6_ns_output( if (ipsec_bypass == 0) (void)ipsec_setsocket(m, NULL); #endif - ip6_output(m, NULL, NULL, dad ? IPV6_DADOUTPUT : 0, &im6o, &outif, locked); + flags = dad ? IPV6_UNSPECSRC : 0; + flags |= IPV6_OUTARGS; + + ip6_output(m, NULL, NULL, flags, im6o, &outif, &ip6oa); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); icmp6_ifstat_inc(outif, ifs6_out_neighborsolicit); + ifnet_release(outif); } icmp6stat.icp6s_outhist[ND_NEIGHBOR_SOLICIT]++; + + if (im6o != NULL) + IM6O_REMREF(im6o); + if (ro.ro_rt) { /* we don't cache this route. */ + rtfree(ro.ro_rt); + } + if (ia != NULL) + IFA_REMREF(&ia->ia_ifa); + return; + +bad: + if (im6o != NULL) + IM6O_REMREF(im6o); + if (ro.ro_rt) { + rtfree(ro.ro_rt); + } + m_freem(m); + if (ia != NULL) + IFA_REMREF(&ia->ia_ifa); + return; } /* * Neighbor advertisement input handling. * * Based on RFC 2461 - * Based on RFC 2462 (duplicated address detection) + * Based on RFC 2462 (duplicate address detection) * * the following items are not implemented yet: * - proxy advertisement delay rule (RFC2461 7.2.8, last paragraph, SHOULD) @@ -604,9 +787,6 @@ nd6_na_input( struct ifnet *ifp = m->m_pkthdr.rcvif; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_neighbor_advert *nd_na; -#if 0 - struct in6_addr saddr6 = ip6->ip6_src; -#endif struct in6_addr daddr6 = ip6->ip6_dst; struct in6_addr taddr6; int flags; @@ -640,14 +820,15 @@ nd6_na_input( return; } #endif - taddr6 = nd_na->nd_na_target; + flags = nd_na->nd_na_flags_reserved; is_router = ((flags & ND_NA_FLAG_ROUTER) != 0); is_solicited = ((flags & ND_NA_FLAG_SOLICITED) != 0); is_override = ((flags & ND_NA_FLAG_OVERRIDE) != 0); - if (IN6_IS_SCOPE_LINKLOCAL(&taddr6)) - taddr6.s6_addr16[1] = htons(ifp->if_index); + taddr6 = nd_na->nd_na_target; + if (in6_setscope(&taddr6, ifp, NULL)) + goto bad; /* XXX: impossible */ if (IN6_IS_ADDR_MULTICAST(&taddr6)) { nd6log((LOG_ERR, @@ -687,10 +868,14 @@ nd6_na_input( * * Otherwise, process as defined in RFC 2461. 
*/ - if (ifa - && (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE)) { - nd6_dad_na_input(ifa); - goto freeit; + if (ifa != NULL) { + IFA_LOCK(ifa); + if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE) { + IFA_UNLOCK(ifa); + nd6_dad_na_input(ifa, lladdr, lladdrlen); + goto freeit; + } + IFA_UNLOCK(ifa); } /* Just for safety, maybe unnecessary. */ @@ -710,8 +895,8 @@ nd6_na_input( } /* - * If no neighbor cache entry is found, NA SHOULD silently be discarded. - * Callee returns a locked route upon success. + * If no neighbor cache entry is found, NA SHOULD silently be + * discarded. */ if ((rt = nd6_lookup(&taddr6, 0, ifp, 0)) == NULL) goto freeit; @@ -762,7 +947,9 @@ nd6_na_input( * affect the status of associated prefixes.. */ RT_UNLOCK(rt); - pfxlist_onlink_check(0); + lck_mtx_lock(nd6_mutex); + pfxlist_onlink_check(); + lck_mtx_unlock(nd6_mutex); RT_LOCK(rt); } } else { @@ -802,7 +989,7 @@ nd6_na_input( * 1 1 y n (2a) L *->REACHABLE * 1 1 y y (2a) L *->REACHABLE */ - if (!is_override && (lladdr && llchange)) { /* (1) */ + if (!is_override && (lladdr != NULL && llchange)) { /* (1) */ /* * If state is REACHABLE, make it STALE. * no other updates should be done. @@ -866,13 +1053,14 @@ nd6_na_input( * Lock to protect the default router list. * XXX: this might be unnecessary, since this function * is only called under the network software interrupt - * context. However, we keep it just for safety. + * context. However, we keep it just for safety. 
*/ RT_UNLOCK(rt); lck_mtx_lock(nd6_mutex); dr = defrouter_lookup(in6, rt_ifp); if (dr) { - defrtrlist_del(dr, 1); + defrtrlist_del(dr); + NDDR_REMREF(dr); lck_mtx_unlock(nd6_mutex); } else { @@ -894,33 +1082,51 @@ nd6_na_input( } RT_LOCK_ASSERT_HELD(rt); rt->rt_flags &= ~RTF_REJECT; + + /* cache the gateway (sender HW) address */ + nd6_llreach_alloc(rt, ifp, LLADDR(sdl), sdl->sdl_alen, TRUE); + + /* update the llinfo, send a queued packet if there is one */ ln->ln_asked = 0; if (ln->ln_hold != NULL) { - struct mbuf *n = ln->ln_hold; - ln->ln_hold = NULL; + struct mbuf *m_hold, *m_hold_next; + struct sockaddr_in6 sin6; + + rtkey_to_sa6(rt, &sin6); /* - * we assume ifp is not a loopback here, so just set the 2nd - * argument as the 1st one. + * reset the ln_hold in advance, to explicitly + * prevent a ln_hold lookup in nd6_output() + * (wouldn't happen, though...) */ - RT_UNLOCK(rt); - nd6_output(ifp, ifp, n, (struct sockaddr_in6 *)rt_key(rt), - rt, 0); - RT_LOCK_SPIN(rt); + for (m_hold = ln->ln_hold; + m_hold; m_hold = m_hold_next) { + m_hold_next = m_hold->m_nextpkt; + m_hold->m_nextpkt = NULL; + /* + * we assume ifp is not a loopback here, so just set + * the 2nd argument as the 1st one. 
+ */ + RT_UNLOCK(rt); + nd6_output(ifp, ifp, m_hold, &sin6, rt); + RT_LOCK_SPIN(rt); + } + ln->ln_hold = NULL; + } RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); - freeit: +freeit: m_freem(m); if (ifa != NULL) - ifafree(ifa); + IFA_REMREF(ifa); return; - bad: +bad: icmp6stat.icp6s_badna++; m_freem(m); if (ifa != NULL) - ifafree(ifa); + IFA_REMREF(ifa); } /* @@ -931,11 +1137,14 @@ nd6_na_input( * the following items are not implemented yet: * - proxy advertisement delay rule (RFC2461 7.2.8, last paragraph, SHOULD) * - anycast advertisement delay rule (RFC2461 7.2.7, SHOULD) + * + * tlladdr - 1 if include target link-layer address + * sdl0 - sockaddr_dl (= proxy NA) or NULL */ void nd6_na_output( struct ifnet *ifp, - const struct in6_addr *daddr6, + const struct in6_addr *daddr6_0, const struct in6_addr *taddr6, uint32_t flags, int tlladdr, /* 1 if include target link-layer address */ @@ -944,12 +1153,20 @@ nd6_na_output( struct mbuf *m; struct ip6_hdr *ip6; struct nd_neighbor_advert *nd_na; - struct in6_ifaddr *ia = NULL; - struct ip6_moptions im6o; - int icmp6len; - int maxlen; + struct ip6_moptions *im6o = NULL; caddr_t mac = NULL; - struct ifnet *outif = NULL; + struct route_in6 ro; + struct in6_addr *src, src_storage, daddr6; + struct sockaddr_in6 dst_sa; + int icmp6len, maxlen, error; + struct ifnet *outif = NULL; + struct ip6_out_args ip6oa = { IFSCOPE_NONE, 0 }; + + bzero(&ro, sizeof(ro)); + + daddr6 = *daddr6_0; /* make a local copy for modification */ + + ip6oa.ip6oa_boundif = ifp->if_index; /* estimate the size of message */ maxlen = sizeof(*ip6) + sizeof(*nd_na); @@ -974,11 +1191,18 @@ nd6_na_output( return; m->m_pkthdr.rcvif = NULL; - if (IN6_IS_ADDR_MULTICAST(daddr6)) { + if (IN6_IS_ADDR_MULTICAST(&daddr6)) { m->m_flags |= M_MCAST; - im6o.im6o_multicast_ifp = ifp; - im6o.im6o_multicast_hlim = 255; - im6o.im6o_multicast_loop = 0; + + im6o = ip6_allocmoptions(M_DONTWAIT); + if (im6o == NULL) { + m_freem(m); + return; + } + + im6o->im6o_multicast_ifp = ifp; + 
im6o->im6o_multicast_hlim = 255; + im6o->im6o_multicast_loop = 0; } icmp6len = sizeof(*nd_na); @@ -992,35 +1216,44 @@ nd6_na_output( ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; - if (IN6_IS_ADDR_UNSPECIFIED(daddr6)) { + if (IN6_IS_ADDR_UNSPECIFIED(&daddr6)) { /* reply to DAD */ - ip6->ip6_dst.s6_addr16[0] = IPV6_ADDR_INT16_MLL; - ip6->ip6_dst.s6_addr16[1] = htons(ifp->if_index); - ip6->ip6_dst.s6_addr32[1] = 0; - ip6->ip6_dst.s6_addr32[2] = 0; - ip6->ip6_dst.s6_addr32[3] = IPV6_ADDR_INT32_ONE; + daddr6.s6_addr16[0] = IPV6_ADDR_INT16_MLL; + daddr6.s6_addr16[1] = 0; + daddr6.s6_addr32[1] = 0; + daddr6.s6_addr32[2] = 0; + daddr6.s6_addr32[3] = IPV6_ADDR_INT32_ONE; + if (in6_setscope(&daddr6, ifp, NULL)) + goto bad; + flags &= ~ND_NA_FLAG_SOLICITED; } else - ip6->ip6_dst = *daddr6; + ip6->ip6_dst = daddr6; + + bzero(&dst_sa, sizeof(struct sockaddr_in6)); + dst_sa.sin6_family = AF_INET6; + dst_sa.sin6_len = sizeof(struct sockaddr_in6); + dst_sa.sin6_addr = daddr6; /* * Select a source whose scope is the same as that of the dest. */ - ia = in6_ifawithifp(ifp, &ip6->ip6_dst); - if (ia == NULL) { - m_freem(m); - return; + bcopy(&dst_sa, &ro.ro_dst, sizeof(dst_sa)); + src = in6_selectsrc(&dst_sa, NULL, NULL, &ro, NULL, &src_storage, + ip6oa.ip6oa_boundif, &error); + if (src == NULL) { + nd6log((LOG_DEBUG, "nd6_na_output: source can't be " + "determined: dst=%s, error=%d\n", + ip6_sprintf(&dst_sa.sin6_addr), error)); + goto bad; } - ip6->ip6_src = ia->ia_addr.sin6_addr; - ifafree(&ia->ia_ifa); - ia = NULL; + ip6->ip6_src = *src; nd_na = (struct nd_neighbor_advert *)(ip6 + 1); nd_na->nd_na_type = ND_NEIGHBOR_ADVERT; nd_na->nd_na_code = 0; nd_na->nd_na_target = *taddr6; - if (IN6_IS_SCOPE_LINKLOCAL(&nd_na->nd_na_target)) - nd_na->nd_na_target.s6_addr16[1] = 0; + in6_clearscope(&nd_na->nd_na_target); /* XXX */ /* * "tlladdr" indicates NS's condition for adding tlladdr or not. 
@@ -1046,7 +1279,7 @@ nd6_na_output( if (tlladdr && mac) { int optlen = sizeof(struct nd_opt_hdr) + ifp->if_addrlen; struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_na + 1); - + /* roundup to 8 bytes alignment! */ optlen = (optlen + 7) & ~7; @@ -1071,23 +1304,63 @@ nd6_na_output( if (ipsec_bypass == 0) (void)ipsec_setsocket(m, NULL); #endif - ip6_output(m, NULL, NULL, 0, &im6o, &outif, 0); + ip6_output(m, NULL, NULL, IPV6_OUTARGS, im6o, &outif, &ip6oa); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); icmp6_ifstat_inc(outif, ifs6_out_neighboradvert); + ifnet_release(outif); } icmp6stat.icp6s_outhist[ND_NEIGHBOR_ADVERT]++; + + if (im6o != NULL) + IM6O_REMREF(im6o); + if (ro.ro_rt) { + rtfree(ro.ro_rt); + } + return; + +bad: + if (im6o != NULL) + IM6O_REMREF(im6o); + if (ro.ro_rt) { + rtfree(ro.ro_rt); + } + m_freem(m); + return; } caddr_t nd6_ifptomac( struct ifnet *ifp) { - return ((caddr_t)ifnet_lladdr(ifp)); + switch (ifp->if_type) { + case IFT_ARCNET: + case IFT_ETHER: + case IFT_IEEE8023ADLAG: + case IFT_FDDI: + case IFT_IEEE1394: +#ifdef IFT_L2VLAN + case IFT_L2VLAN: +#endif +#ifdef IFT_IEEE80211 + case IFT_IEEE80211: +#endif +#ifdef IFT_CARP + case IFT_CARP: +#endif + case IFT_BRIDGE: + case IFT_ISO88025: + return ((caddr_t)ifnet_lladdr(ifp)); + default: + return NULL; + } } TAILQ_HEAD(dadq_head, dadq); struct dadq { + decl_lck_mtx_data(, dad_lock); + u_int32_t dad_refcount; /* reference count */ + int dad_attached; TAILQ_ENTRY(dadq) dad_list; struct ifaddr *dad_ifa; int dad_count; /* max NS to send */ @@ -1095,28 +1368,46 @@ struct dadq { int dad_ns_ocount; /* NS sent so far */ int dad_ns_icount; int dad_na_icount; + int dad_na_ixcount; /* Count of IFDISABLED eligible NA rx'd */ }; static struct dadq_head dadq; -static int dad_init = 0; + +void +nd6_nbr_init(void) +{ + TAILQ_INIT(&dadq); + + dad_size = sizeof (struct dadq); + dad_zone = zinit(dad_size, DAD_ZONE_MAX * dad_size, 0, DAD_ZONE_NAME); + if (dad_zone == NULL) { + panic("%s: failed 
allocating %s", __func__, DAD_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(dad_zone, Z_EXPAND, TRUE); + zone_change(dad_zone, Z_CALLERACCT, FALSE); +} static struct dadq * -nd6_dad_find( - struct ifaddr *ifa) +nd6_dad_find(struct ifaddr *ifa) { struct dadq *dp; + lck_mtx_lock(dad6_mutex); for (dp = dadq.tqh_first; dp; dp = dp->dad_list.tqe_next) { + DAD_LOCK_SPIN(dp); if (dp->dad_ifa == ifa) { + DAD_ADDREF_LOCKED(dp); + DAD_UNLOCK(dp); lck_mtx_unlock(dad6_mutex); - return dp; + return (dp); } + DAD_UNLOCK(dp); } lck_mtx_unlock(dad6_mutex); - return NULL; + return (NULL); } -#ifdef __APPLE__ void nd6_dad_stoptimer( struct ifaddr *ifa) @@ -1124,28 +1415,9 @@ nd6_dad_stoptimer( untimeout((void (*)(void *))nd6_dad_timer, (void *)ifa); } -#else -static void -nd6_dad_starttimer( - struct dadq *dp, - int ticks) -{ - - callout_reset(&dp->dad_timer_ch, ticks, - (void (*)(void *))nd6_dad_timer, (void *)dp->dad_ifa); -} - -static void -nd6_dad_stoptimer( - struct dadq *dp) -{ - - callout_stop(&dp->dad_timer_ch); -} -#endif /* - * Start Duplicated Address Detection (DAD) for specified interface address. + * Start Duplicate Address Detection (DAD) for specified interface address. */ void nd6_dad_start( @@ -1155,43 +1427,45 @@ nd6_dad_start( struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct dadq *dp; - if (!dad_init) { - TAILQ_INIT(&dadq); - dad_init++; - } - /* * If we don't need DAD, don't do it. * There are several cases: * - DAD is disabled (ip6_dad_count == 0) * - the interface address is anycast */ + IFA_LOCK(&ia->ia_ifa); if (!(ia->ia6_flags & IN6_IFF_TENTATIVE)) { log(LOG_DEBUG, "nd6_dad_start: called with non-tentative address " "%s(%s)\n", ip6_sprintf(&ia->ia_addr.sin6_addr), ifa->ifa_ifp ? 
if_name(ifa->ifa_ifp) : "???"); + IFA_UNLOCK(&ia->ia_ifa); return; } if (ia->ia6_flags & IN6_IFF_ANYCAST) { ia->ia6_flags &= ~IN6_IFF_TENTATIVE; + IFA_UNLOCK(&ia->ia_ifa); return; } if (!ip6_dad_count) { ia->ia6_flags &= ~IN6_IFF_TENTATIVE; + IFA_UNLOCK(&ia->ia_ifa); return; } - if (!ifa->ifa_ifp) + IFA_UNLOCK(&ia->ia_ifa); + if (ifa->ifa_ifp == NULL) panic("nd6_dad_start: ifa->ifa_ifp == NULL"); - if (!(ifa->ifa_ifp->if_flags & IFF_UP)) + if (!(ifa->ifa_ifp->if_flags & IFF_UP)) { return; - if (nd6_dad_find(ifa) != NULL) { + } + if ((dp = nd6_dad_find(ifa)) != NULL) { + DAD_REMREF(dp); /* DAD already in progress */ return; } - dp = _MALLOC(sizeof(*dp), M_IP6NDP, M_NOWAIT); + dp = zalloc(dad_zone); if (dp == NULL) { log(LOG_ERR, "nd6_dad_start: memory allocation failed for " "%s(%s)\n", @@ -1199,10 +1473,11 @@ nd6_dad_start( ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); return; } - bzero(dp, sizeof(*dp)); - lck_mtx_lock(dad6_mutex); - TAILQ_INSERT_TAIL(&dadq, (struct dadq *)dp, dad_list); - lck_mtx_unlock(dad6_mutex); + bzero(dp, dad_size); + lck_mtx_init(&dp->dad_lock, ifa_mtx_grp, ifa_mtx_attr); + + /* Callee adds one reference for us */ + dp = nd6_dad_attach(dp, ifa); nd6log((LOG_DEBUG, "%s: starting DAD for %s\n", if_name(ifa->ifa_ifp), ip6_sprintf(&ia->ia_addr.sin6_addr))); @@ -1213,11 +1488,6 @@ nd6_dad_start( * first packet to be sent from the interface after interface * (re)initialization. 
*/ - dp->dad_ifa = ifa; - ifaref(ifa); /*just for safety*/ - dp->dad_count = ip6_dad_count; - dp->dad_ns_icount = dp->dad_na_icount = 0; - dp->dad_ns_ocount = dp->dad_ns_tcount = 0; if (tick_delay == NULL) { u_int32_t retrans; nd6_dad_ns_output(dp, ifa); @@ -1236,19 +1506,61 @@ nd6_dad_start( timeout((void (*)(void *))nd6_dad_timer, (void *)ifa, ntick); } + + DAD_REMREF(dp); /* drop our reference */ +} + +static struct dadq * +nd6_dad_attach(struct dadq *dp, struct ifaddr *ifa) +{ + lck_mtx_lock(dad6_mutex); + DAD_LOCK(dp); + dp->dad_ifa = ifa; + IFA_ADDREF(ifa); /* for dad_ifa */ + dp->dad_count = ip6_dad_count; + dp->dad_ns_icount = dp->dad_na_icount = 0; + dp->dad_ns_ocount = dp->dad_ns_tcount = 0; + dp->dad_na_ixcount = 0; + VERIFY(!dp->dad_attached); + dp->dad_attached = 1; + DAD_ADDREF_LOCKED(dp); /* for caller */ + DAD_ADDREF_LOCKED(dp); /* for dadq_head list */ + TAILQ_INSERT_TAIL(&dadq, (struct dadq *)dp, dad_list); + DAD_UNLOCK(dp); + lck_mtx_unlock(dad6_mutex); + + return (dp); +} + +static void +nd6_dad_detach(struct dadq *dp, struct ifaddr *ifa) +{ + int detached; + + lck_mtx_lock(dad6_mutex); + DAD_LOCK(dp); + if ((detached = dp->dad_attached)) { + VERIFY(dp->dad_ifa == ifa); + TAILQ_REMOVE(&dadq, (struct dadq *)dp, dad_list); + dp->dad_list.tqe_next = NULL; + dp->dad_list.tqe_prev = NULL; + dp->dad_attached = 0; + } + DAD_UNLOCK(dp); + lck_mtx_unlock(dad6_mutex); + if (detached) { + DAD_REMREF(dp); /* drop dadq_head reference */ + } } /* * terminate DAD unconditionally. used for address removals. 
*/ void -nd6_dad_stop( - struct ifaddr *ifa) +nd6_dad_stop(struct ifaddr *ifa) { struct dadq *dp; - if (!dad_init) - return; dp = nd6_dad_find(ifa); if (!dp) { /* DAD wasn't started yet */ @@ -1257,21 +1569,42 @@ nd6_dad_stop( untimeout((void (*)(void *))nd6_dad_timer, (void *)ifa); - lck_mtx_lock(dad6_mutex); - TAILQ_REMOVE(&dadq, (struct dadq *)dp, dad_list); - lck_mtx_unlock(dad6_mutex); - FREE(dp, M_IP6NDP); - dp = NULL; - ifafree(ifa); + nd6_dad_detach(dp, ifa); + DAD_REMREF(dp); /* drop our reference */ } static void -nd6_dad_timer( - struct ifaddr *ifa) +nd6_unsol_na_output(struct ifaddr *ifa) { struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; - struct dadq *dp; + struct ifnet *ifp = ifa->ifa_ifp; + struct in6_addr saddr6, taddr6; + + if ((ifp->if_flags & IFF_UP) == 0 || + (ifp->if_flags & IFF_RUNNING) == 0) + return; + + IFA_LOCK_SPIN(&ia->ia_ifa); + taddr6 = ia->ia_addr.sin6_addr; + IFA_UNLOCK(&ia->ia_ifa); + if (in6_setscope(&taddr6, ifp, NULL) != 0) + return; + saddr6 = in6addr_linklocal_allnodes; + if (in6_setscope(&saddr6, ifp, NULL) != 0) + return; + + nd6log((LOG_INFO, "%s: sending unsolicited NA\n", + if_name(ifa->ifa_ifp))); + + nd6_na_output(ifp, &saddr6, &taddr6, ND_NA_FLAG_OVERRIDE, 1, NULL); +} + +static void +nd6_dad_timer(struct ifaddr *ifa) +{ + struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; + struct dadq *dp = NULL; /* Sanity check */ if (ia == NULL) { @@ -1283,11 +1616,13 @@ nd6_dad_timer( log(LOG_ERR, "nd6_dad_timer: DAD structure not found\n"); goto done; } + IFA_LOCK(&ia->ia_ifa); if (ia->ia6_flags & IN6_IFF_DUPLICATED) { log(LOG_ERR, "nd6_dad_timer: called with duplicated address " "%s(%s)\n", ip6_sprintf(&ia->ia_addr.sin6_addr), ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); + IFA_UNLOCK(&ia->ia_ifa); goto done; } if ((ia->ia6_flags & IN6_IFF_TENTATIVE) == 0) { @@ -1295,26 +1630,26 @@ nd6_dad_timer( "%s(%s)\n", ip6_sprintf(&ia->ia_addr.sin6_addr), ifa->ifa_ifp ? 
if_name(ifa->ifa_ifp) : "???"); + IFA_UNLOCK(&ia->ia_ifa); goto done; } + IFA_UNLOCK(&ia->ia_ifa); /* timeouted with IFF_{RUNNING,UP} check */ + DAD_LOCK(dp); if (dp->dad_ns_tcount > dad_maxtry) { + DAD_UNLOCK(dp); nd6log((LOG_INFO, "%s: could not run DAD, driver problem?\n", if_name(ifa->ifa_ifp))); - lck_mtx_lock(dad6_mutex); - TAILQ_REMOVE(&dadq, (struct dadq *)dp, dad_list); - lck_mtx_unlock(dad6_mutex); - FREE(dp, M_IP6NDP); - dp = NULL; - ifafree(ifa); + nd6_dad_detach(dp, ifa); goto done; } /* Need more checks? */ if (dp->dad_ns_ocount < dp->dad_count) { u_int32_t retrans; + DAD_UNLOCK(dp); /* * We have more NS to go. Send NS packet for DAD. */ @@ -1341,71 +1676,50 @@ nd6_dad_timer( } if (dp->dad_ns_icount) { -#if 0 /* heuristics */ - /* - * if - * - we have sent many(?) DAD NS, and - * - the number of NS we sent equals to the - * number of NS we've got, and - * - we've got no NA - * we may have a faulty network card/driver which - * loops back multicasts to myself. - */ - if (3 < dp->dad_count - && dp->dad_ns_icount == dp->dad_count - && dp->dad_na_icount == 0) { - log(LOG_INFO, "DAD questionable for %s(%s): " - "network card loops back multicast?\n", - ip6_sprintf(&ia->ia_addr.sin6_addr), - if_name(ifa->ifa_ifp)); - /* XXX consider it a duplicate or not? */ - /* duplicate++; */ - } else { - /* We've seen NS, means DAD has failed. */ - duplicate++; - } -#else /* We've seen NS, means DAD has failed. */ duplicate++; -#endif } + DAD_UNLOCK(dp); if (duplicate) { /* (*dp) will be freed in nd6_dad_duplicated() */ - dp = NULL; - nd6_dad_duplicated(ifa); + nd6_dad_duplicated(ifa, TRUE); } else { /* * We are done with DAD. No NA came, no NS came. - * duplicated address found. + * No duplicate address found. 
*/ + IFA_LOCK_SPIN(&ia->ia_ifa); ia->ia6_flags &= ~IN6_IFF_TENTATIVE; + IFA_UNLOCK(&ia->ia_ifa); nd6log((LOG_DEBUG, "%s: DAD complete for %s - no duplicates found\n", if_name(ifa->ifa_ifp), ip6_sprintf(&ia->ia_addr.sin6_addr))); - - lck_mtx_lock(dad6_mutex); - TAILQ_REMOVE(&dadq, (struct dadq *)dp, dad_list); - lck_mtx_unlock(dad6_mutex); + /* + * Send an Unsolicited Neighbor Advertisement so that + * other machines on the network are aware of us + * (important when we are waking from sleep). + */ + nd6_unsol_na_output(ifa); in6_post_msg(ia->ia_ifp, KEV_INET6_NEW_USER_ADDR, ia); - FREE(dp, M_IP6NDP); - dp = NULL; - ifafree(ifa); + nd6_dad_detach(dp, ifa); } } done: - return; + if (dp != NULL) + DAD_REMREF(dp); /* drop our reference */ } void -nd6_dad_duplicated( - struct ifaddr *ifa) +nd6_dad_duplicated(struct ifaddr *ifa, boolean_t dontignhwdup) { struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct dadq *dp; + struct ifnet *ifp = ifa->ifa_ifp; + int hwdupposs; dp = nd6_dad_find(ifa); if (dp == NULL) { @@ -1413,80 +1727,99 @@ nd6_dad_duplicated( return; } + hwdupposs = 0; + IFA_LOCK(&ia->ia_ifa); + DAD_LOCK(dp); log(LOG_ERR, "%s: DAD detected duplicate IPv6 address %s: " - "NS in/out=%d/%d, NA in=%d\n", - if_name(ifa->ifa_ifp), ip6_sprintf(&ia->ia_addr.sin6_addr), - dp->dad_ns_icount, dp->dad_ns_ocount, dp->dad_na_icount); - + "NS in/out=%d/%d, NA in=%d inx=%d\n", + if_name(ifp), ip6_sprintf(&ia->ia_addr.sin6_addr), + dp->dad_ns_icount, dp->dad_ns_ocount, dp->dad_na_icount, + dp->dad_na_ixcount); + hwdupposs = dp->dad_na_ixcount; + DAD_UNLOCK(dp); ia->ia6_flags &= ~IN6_IFF_TENTATIVE; ia->ia6_flags |= IN6_IFF_DUPLICATED; + IFA_UNLOCK(&ia->ia_ifa); /* We are done with DAD, with duplicated address found. 
(failure) */ untimeout((void (*)(void *))nd6_dad_timer, (void *)ifa); - + IFA_LOCK(&ia->ia_ifa); log(LOG_ERR, "%s: DAD complete for %s - duplicate found\n", - if_name(ifa->ifa_ifp), ip6_sprintf(&ia->ia_addr.sin6_addr)); + if_name(ifp), ip6_sprintf(&ia->ia_addr.sin6_addr)); log(LOG_ERR, "%s: manual intervention required\n", - if_name(ifa->ifa_ifp)); - - lck_mtx_lock(dad6_mutex); - TAILQ_REMOVE(&dadq, (struct dadq *)dp, dad_list); - lck_mtx_unlock(dad6_mutex); - FREE(dp, M_IP6NDP); - dp = NULL; - ifafree(ifa); + if_name(ifp)); + IFA_UNLOCK(&ia->ia_ifa); + + if (hwdupposs || + (dontignhwdup && IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr))) { + log(LOG_ERR, "%s: possible hardware address duplication " + "detected, disable IPv6\n", if_name(ifp)); + + lck_rw_lock_shared(nd_if_rwlock); + nd_ifinfo[ifp->if_index].flags |= + ND6_IFF_IFDISABLED; + lck_rw_done(nd_if_rwlock); + } + + /* Send an event to the configuration agent so that the + * duplicate address will be notified to the user and will + * be removed. 
+ */ + in6_post_msg(ifp, KEV_INET6_NEW_USER_ADDR, ia); + nd6_dad_detach(dp, ifa); + DAD_REMREF(dp); /* drop our reference */ } static void -nd6_dad_ns_output( - struct dadq *dp, - struct ifaddr *ifa) +nd6_dad_ns_output(struct dadq *dp, struct ifaddr *ifa) { struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct ifnet *ifp = ifa->ifa_ifp; + struct in6_addr taddr6; + DAD_LOCK(dp); dp->dad_ns_tcount++; if ((ifp->if_flags & IFF_UP) == 0) { -#if 0 - printf("%s: interface down?\n", if_name(ifp)); -#endif + DAD_UNLOCK(dp); return; } if ((ifp->if_flags & IFF_RUNNING) == 0) { -#if 0 - printf("%s: interface not running?\n", if_name(ifp)); -#endif + DAD_UNLOCK(dp); return; } dp->dad_ns_ocount++; - nd6_ns_output(ifp, NULL, &ia->ia_addr.sin6_addr, NULL, 1, 0); + DAD_UNLOCK(dp); + IFA_LOCK_SPIN(&ia->ia_ifa); + taddr6 = ia->ia_addr.sin6_addr; + IFA_UNLOCK(&ia->ia_ifa); + nd6_ns_output(ifp, NULL, &taddr6, NULL, 1); } static void -nd6_dad_ns_input( - struct ifaddr *ifa) +nd6_dad_ns_input(struct ifaddr *ifa) { - struct in6_ifaddr *ia; - const struct in6_addr *taddr6; struct dadq *dp; int duplicate; + struct ifnet *ifp; - if (!ifa) + if (ifa == NULL) panic("ifa == NULL in nd6_dad_ns_input"); - ia = (struct in6_ifaddr *)ifa; - taddr6 = &ia->ia_addr.sin6_addr; + ifp = ifa->ifa_ifp; duplicate = 0; dp = nd6_dad_find(ifa); /* Quickhack - completely ignore DAD NS packets */ if (dad_ignore_ns) { + struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; + IFA_LOCK(&ia->ia_ifa); nd6log((LOG_INFO, "nd6_dad_ns_input: ignoring DAD NS packet for " - "address %s(%s)\n", ip6_sprintf(taddr6), + "address %s(%s)\n", ip6_sprintf(&ia->ia_addr.sin6_addr), if_name(ifa->ifa_ifp))); + IFA_UNLOCK(&ia->ia_ifa); return; } @@ -1494,37 +1827,184 @@ nd6_dad_ns_input( * if I'm yet to start DAD, someone else started using this address * first. I have a duplicate and you win. 
*/ - if (!dp || dp->dad_ns_ocount == 0) + if (dp != NULL) + DAD_LOCK(dp); + if (dp == NULL || dp->dad_ns_ocount == 0) duplicate++; /* XXX more checks for loopback situation - see nd6_dad_timer too */ if (duplicate) { - dp = NULL; /* will be freed in nd6_dad_duplicated() */ - nd6_dad_duplicated(ifa); - } else { + if (dp != NULL) { + DAD_UNLOCK(dp); + DAD_REMREF(dp); + dp = NULL; + } + nd6_dad_duplicated(ifa, TRUE); + } else if (dp != NULL) { /* * not sure if I got a duplicate. * increment ns count and see what happens. */ - if (dp) - dp->dad_ns_icount++; + dp->dad_ns_icount++; + DAD_UNLOCK(dp); + DAD_REMREF(dp); } } static void -nd6_dad_na_input( - struct ifaddr *ifa) +nd6_dad_na_input(struct ifaddr *ifa, caddr_t lladdr, int lladdrlen) { + struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct dadq *dp; + int hwdupposs; - if (!ifa) + if (ifa == NULL) panic("ifa == NULL in nd6_dad_na_input"); dp = nd6_dad_find(ifa); - if (dp) - dp->dad_na_icount++; - + if (dp == NULL) { + log(LOG_ERR, "nd6_dad_na_input: DAD structure not found\n"); + return; + } + + /* + * If the address is a link-local address formed from an interface + * identifier based on the hardware address which is supposed to be + * uniquely assigned (e.g., EUI-64 for an Ethernet interface), IP + * operation on the interface SHOULD be disabled according to RFC 4862, + * section 5.4.5, but here we decide not to disable if the target + * hardware address is not also ours, which is a transitory possibility + * in the presence of network-resident sleep proxies on the local link. + */ + hwdupposs = 0; + IFA_LOCK(ifa); + if (IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr)) { + struct ifnet *ifp; + struct in6_addr in6; + + IFA_UNLOCK(ifa); + ifp = ifa->ifa_ifp; + + /* + * To avoid over-reaction, we only apply this logic when we are + * very sure that hardware addresses are supposed to be unique. 
+ */ + switch (ifp->if_type) { + case IFT_BRIDGE: + case IFT_ETHER: + case IFT_FDDI: + case IFT_ATM: + case IFT_IEEE1394: +#ifdef IFT_IEEE80211 + case IFT_IEEE80211: +#endif + /* Check if our hardware address matches the target */ + if (lladdr != NULL && lladdrlen > 0) { + struct ifaddr *llifa; + struct sockaddr_dl *sdl; + + llifa = ifp->if_lladdr; + IFA_LOCK(llifa); + sdl = (struct sockaddr_dl *)llifa->ifa_addr; + if (lladdrlen == sdl->sdl_alen || + bcmp(lladdr, LLADDR(sdl), lladdrlen) == 0) + hwdupposs = 1; + IFA_UNLOCK(llifa); + } + in6 = ia->ia_addr.sin6_addr; + if (in6_get_hw_ifid(ifp, &in6) != 0) + break; + /* + * Apply this logic only to the EUI-64 form of + * link-local interface identifiers. + */ + IFA_LOCK(ifa); + if (hwdupposs && + !IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, &in6)) { + hwdupposs = 0; + } else if (lladdr == NULL && + IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, &in6)) { + /* + * We received a NA with no target link-layer + * address option. This means that someone else + * has our address. Mark it as a hardware + * duplicate so we disable IPv6 later on. + */ + hwdupposs = 1; + } + IFA_UNLOCK(ifa); + break; + default: + break; + } + } else { + IFA_UNLOCK(ifa); + } + + DAD_LOCK_SPIN(dp); + dp->dad_na_icount++; + if (hwdupposs) + dp->dad_na_ixcount++; + DAD_UNLOCK(dp); + DAD_REMREF(dp); + /* remove the address. 
*/ - nd6_dad_duplicated(ifa); + nd6_dad_duplicated(ifa, FALSE); +} + +static void +dad_addref(struct dadq *dp, int locked) +{ + if (!locked) + DAD_LOCK_SPIN(dp); + else + DAD_LOCK_ASSERT_HELD(dp); + + if (++dp->dad_refcount == 0) { + panic("%s: dad %p wraparound refcnt\n", __func__, dp); + /* NOTREACHED */ + } + if (!locked) + DAD_UNLOCK(dp); +} + +static void +dad_remref(struct dadq *dp) +{ + struct ifaddr *ifa; + + DAD_LOCK_SPIN(dp); + if (dp->dad_refcount == 0) + panic("%s: dad %p negative refcnt\n", __func__, dp); + --dp->dad_refcount; + if (dp->dad_refcount > 0) { + DAD_UNLOCK(dp); + return; + } + DAD_UNLOCK(dp); + + if (dp->dad_attached || + dp->dad_list.tqe_next != NULL || dp->dad_list.tqe_prev != NULL) { + panic("%s: attached dad=%p is being freed", __func__, dp); + /* NOTREACHED */ + } + + if ((ifa = dp->dad_ifa) != NULL) { + IFA_REMREF(ifa); /* drop dad_ifa reference */ + dp->dad_ifa = NULL; + } + + lck_mtx_destroy(&dp->dad_lock, ifa_mtx_grp); + zfree(dad_zone, dp); +} + +void +nd6_llreach_set_reachable(struct ifnet *ifp, void *addr, unsigned int alen) +{ + /* Nothing more to do if it's disabled */ + if (nd6_llreach_base == 0) + return; + + ifnet_llreach_set_reachable(ifp, ETHERTYPE_IPV6, addr, alen); } diff --git a/bsd/netinet6/nd6_rtr.c b/bsd/netinet6/nd6_rtr.c index 10e965185..2e5c5eae5 100644 --- a/bsd/netinet6/nd6_rtr.c +++ b/bsd/netinet6/nd6_rtr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2008 Apple Inc. All rights reserved. + * Copyright (c) 2003-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -70,7 +70,11 @@ #include #include #include +#include + #include +#include +#include #include #include @@ -91,41 +95,170 @@ #define SDL(s) ((struct sockaddr_dl *)s) +static struct nd_defrouter *defrtrlist_update_common(struct nd_defrouter *, + boolean_t); static struct nd_defrouter *defrtrlist_update(struct nd_defrouter *); -static struct in6_ifaddr *in6_ifadd(struct nd_prefix *, - struct in6_addr *); + +static struct in6_ifaddr *in6_ifadd(struct nd_prefix *, int); +static void defrtrlist_sync(struct ifnet *); + +static void defrouter_select_common(struct ifnet *, int); + static struct nd_pfxrouter *pfxrtr_lookup(struct nd_prefix *, struct nd_defrouter *); static void pfxrtr_add(struct nd_prefix *, struct nd_defrouter *); static void pfxrtr_del(struct nd_pfxrouter *); static struct nd_pfxrouter *find_pfxlist_reachable_router(struct nd_prefix *); -static void defrouter_addifreq(struct ifnet *); static void nd6_rtmsg(int, struct rtentry *); -static void in6_init_address_ltimes(struct nd_prefix *ndpr, - struct in6_addrlifetime *lt6); +static int nd6_prefix_onlink_common(struct nd_prefix *, boolean_t, + unsigned int); +static struct nd_prefix *nd6_prefix_equal_lookup(struct nd_prefix *, boolean_t); +static void nd6_prefix_sync(struct ifnet *); + +static void in6_init_address_ltimes(struct nd_prefix *, + struct in6_addrlifetime *, boolean_t); static int rt6_deleteroute(struct radix_node *, void *); +static struct nd_defrouter *nddr_alloc(int); +static void nddr_free(struct nd_defrouter *); +static void nddr_trace(struct nd_defrouter *, int); + +static struct nd_prefix *ndpr_alloc(int); +static void ndpr_free(struct nd_prefix *); +static void ndpr_trace(struct nd_prefix *, int); + extern int nd6_recalc_reachtm_interval; static struct ifnet *nd6_defifp; int nd6_defifindex; +static unsigned int nd6_defrouter_genid; + +int ip6_use_tempaddr = 1; /* use temp addr by default for testing now */ -int ip6_use_tempaddr = 0; +int 
nd6_accept_6to4 = 1; int ip6_desync_factor; u_int32_t ip6_temp_preferred_lifetime = DEF_TEMP_PREFERRED_LIFETIME; u_int32_t ip6_temp_valid_lifetime = DEF_TEMP_VALID_LIFETIME; /* * shorter lifetimes for debugging purposes. -int ip6_temp_preferred_lifetime = 800; -static int ip6_temp_valid_lifetime = 1800; +u_int32_t ip6_temp_preferred_lifetime = 800; +static u_int32_t ip6_temp_valid_lifetime = 1800; */ int ip6_temp_regen_advance = TEMPADDR_REGEN_ADVANCE; extern lck_mtx_t *nd6_mutex; +/* Serialization variables for single thread access to nd_prefix */ +static boolean_t nd_prefix_busy; +static void *nd_prefix_waitchan = &nd_prefix_busy; +static int nd_prefix_waiters = 0; + +/* Serialization variables for single thread access to nd_defrouter */ +static boolean_t nd_defrouter_busy; +static void *nd_defrouter_waitchan = &nd_defrouter_busy; +static int nd_defrouter_waiters = 0; + +/* RTPREF_MEDIUM has to be 0! */ +#define RTPREF_HIGH 1 +#define RTPREF_MEDIUM 0 +#define RTPREF_LOW (-1) +#define RTPREF_RESERVED (-2) +#define RTPREF_INVALID (-3) /* internal */ + +#define NDPR_TRACE_HIST_SIZE 32 /* size of trace history */ + +/* For gdb */ +__private_extern__ unsigned int ndpr_trace_hist_size = NDPR_TRACE_HIST_SIZE; + +struct nd_prefix_dbg { + struct nd_prefix ndpr_pr; /* nd_prefix */ + u_int16_t ndpr_refhold_cnt; /* # of ref */ + u_int16_t ndpr_refrele_cnt; /* # of rele */ + /* + * Circular lists of ndpr_addref and ndpr_remref callers. 
+ */ + ctrace_t ndpr_refhold[NDPR_TRACE_HIST_SIZE]; + ctrace_t ndpr_refrele[NDPR_TRACE_HIST_SIZE]; +}; + +static unsigned int ndpr_debug; /* debug flags */ +static unsigned int ndpr_size; /* size of zone element */ +static struct zone *ndpr_zone; /* zone for nd_prefix */ + +#define NDPR_ZONE_MAX 64 /* maximum elements in zone */ +#define NDPR_ZONE_NAME "nd6_prefix" /* zone name */ + +#define NDDR_TRACE_HIST_SIZE 32 /* size of trace history */ + +/* For gdb */ +__private_extern__ unsigned int nddr_trace_hist_size = NDDR_TRACE_HIST_SIZE; + +struct nd_defrouter_dbg { + struct nd_defrouter nddr_dr; /* nd_defrouter */ + uint16_t nddr_refhold_cnt; /* # of ref */ + uint16_t nddr_refrele_cnt; /* # of rele */ + /* + * Circular lists of ndpr_addref and ndpr_remref callers. + */ + ctrace_t nddr_refhold[NDDR_TRACE_HIST_SIZE]; + ctrace_t nddr_refrele[NDDR_TRACE_HIST_SIZE]; +}; + +static unsigned int nddr_debug; /* debug flags */ +static unsigned int nddr_size; /* size of zone element */ +static struct zone *nddr_zone; /* zone for nd_defrouter */ + +#define NDDR_ZONE_MAX 64 /* maximum elements in zone */ +#define NDDR_ZONE_NAME "nd6_defrouter" /* zone name */ + +static unsigned int ndprtr_size; /* size of zone element */ +static struct zone *ndprtr_zone; /* zone for nd_pfxrouter */ + +#define NDPRTR_ZONE_MAX 64 /* maximum elements in zone */ +#define NDPRTR_ZONE_NAME "nd6_pfxrouter" /* zone name */ + +void +nd6_rtr_init(void) +{ + PE_parse_boot_argn("ifa_debug", &ndpr_debug, sizeof (ndpr_debug)); + PE_parse_boot_argn("ifa_debug", &nddr_debug, sizeof (nddr_debug)); + + ndpr_size = (ndpr_debug == 0) ? sizeof (struct nd_prefix) : + sizeof (struct nd_prefix_dbg); + ndpr_zone = zinit(ndpr_size, NDPR_ZONE_MAX * ndpr_size, 0, + NDPR_ZONE_NAME); + if (ndpr_zone == NULL) { + panic("%s: failed allocating %s", __func__, NDPR_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(ndpr_zone, Z_EXPAND, TRUE); + zone_change(ndpr_zone, Z_CALLERACCT, FALSE); + + nddr_size = (nddr_debug == 0) ? 
sizeof (struct nd_defrouter) : + sizeof (struct nd_defrouter_dbg); + nddr_zone = zinit(nddr_size, NDDR_ZONE_MAX * nddr_size, 0, + NDDR_ZONE_NAME); + if (nddr_zone == NULL) { + panic("%s: failed allocating %s", __func__, NDDR_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(nddr_zone, Z_EXPAND, TRUE); + zone_change(nddr_zone, Z_CALLERACCT, FALSE); + + ndprtr_size = sizeof (struct nd_pfxrouter); + ndprtr_zone = zinit(ndprtr_size, NDPRTR_ZONE_MAX * ndprtr_size, 0, + NDPRTR_ZONE_NAME); + if (ndprtr_zone == NULL) { + panic("%s: failed allocating %s", __func__, NDPRTR_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(ndprtr_zone, Z_EXPAND, TRUE); + zone_change(ndprtr_zone, Z_CALLERACCT, FALSE); +} + /* * Receive Router Solicitation Message - just for routers. * Router solicitation/advertisement is mostly managed by userland program @@ -143,17 +276,8 @@ nd6_rs_input( struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_router_solicit *nd_rs; struct in6_addr saddr6 = ip6->ip6_src; -#if 0 - struct in6_addr daddr6 = ip6->ip6_dst; -#endif char *lladdr = NULL; int lladdrlen = 0; -#if 0 - struct sockaddr_dl *sdl = (struct sockaddr_dl *)NULL; - struct llinfo_nd6 *ln = (struct llinfo_nd6 *)NULL; - struct rtentry *rt = NULL; - int is_newentry; -#endif union nd_opts ndopts; /* If I'm not a router, ignore it. */ @@ -170,11 +294,25 @@ nd6_rs_input( } /* - * Don't update the neighbor cache, if src = ::. - * This indicates that the src has no IP address assigned yet. - */ - if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) + * Don't update the neighbor cache, if src = :: or a non-neighbor. + * The former case indicates that the src has no IP address assigned + * yet. See nd6_ns_input() for the latter case. 
+ */ + if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) goto freeit; + else { + struct sockaddr_in6 src_sa6; + + bzero(&src_sa6, sizeof(src_sa6)); + src_sa6.sin6_family = AF_INET6; + src_sa6.sin6_len = sizeof(src_sa6); + src_sa6.sin6_addr = ip6->ip6_src; + if (!nd6_is_addr_neighbor(&src_sa6, ifp, 0)) { + nd6log((LOG_INFO, "nd6_rs_input: " + "RS packet from non-neighbor\n")); + goto freeit; + } + } #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len, return); @@ -238,14 +376,9 @@ nd6_ra_input( struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_router_advert *nd_ra; struct in6_addr saddr6 = ip6->ip6_src; -#if 0 - struct in6_addr daddr6 = ip6->ip6_dst; - int flags; /* = nd_ra->nd_ra_flags_reserved; */ - int is_managed = ((flags & ND_RA_FLAG_MANAGED) != 0); - int is_other = ((flags & ND_RA_FLAG_OTHER) != 0); -#endif + int mcast = 0; union nd_opts ndopts; - struct nd_defrouter *dr; + struct nd_defrouter *dr = NULL; struct timeval timenow; getmicrotime(&timenow); @@ -292,20 +425,22 @@ nd6_ra_input( struct nd_defrouter dr0; u_int32_t advreachable = nd_ra->nd_ra_reachable; + /* remember if this is a multicasted advertisement */ + if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) + mcast = 1; + lck_rw_lock_shared(nd_if_rwlock); if (ifp->if_index >= nd_ifinfo_indexlim) { lck_rw_done(nd_if_rwlock); goto freeit; } ndi = &nd_ifinfo[ifp->if_index]; + bzero(&dr0, sizeof (dr0)); dr0.rtaddr = saddr6; dr0.flags = nd_ra->nd_ra_flags_reserved; dr0.rtlifetime = ntohs(nd_ra->nd_ra_router_lifetime); dr0.expire = timenow.tv_sec + dr0.rtlifetime; dr0.ifp = ifp; - dr0.advint = 0; /* Mobile IPv6 */ - dr0.advint_expire = 0; /* Mobile IPv6 */ - dr0.advints_lost = 0; /* Mobile IPv6 */ /* unspecified or not? 
(RFC 2461 6.3.4) */ if (advreachable) { advreachable = ntohl(advreachable); @@ -322,7 +457,9 @@ nd6_ra_input( ndi->chlim = nd_ra->nd_ra_curhoplimit; lck_rw_done(nd_if_rwlock); ndi = NULL; + lck_mtx_lock(nd6_mutex); dr = defrtrlist_update(&dr0); + lck_mtx_unlock(nd6_mutex); } /* @@ -366,18 +503,9 @@ nd6_ra_input( continue; } - /* aggregatable unicast address, rfc2374 */ - if ((pi->nd_opt_pi_prefix.s6_addr8[0] & 0xe0) == 0x20 - && pi->nd_opt_pi_prefix_len != 64) { - nd6log((LOG_INFO, - "nd6_ra_input: invalid prefixlen " - "%d for rfc2374 prefix %s, ignored\n", - pi->nd_opt_pi_prefix_len, - ip6_sprintf(&pi->nd_opt_pi_prefix))); - continue; - } - bzero(&pr, sizeof(pr)); + lck_mtx_init(&pr.ndpr_lock, ifa_mtx_grp, ifa_mtx_attr); + NDPR_LOCK(&pr); pr.ndpr_prefix.sin6_family = AF_INET6; pr.ndpr_prefix.sin6_len = sizeof(pr.ndpr_prefix); pr.ndpr_prefix.sin6_addr = pi->nd_opt_pi_prefix; @@ -392,10 +520,35 @@ nd6_ra_input( pr.ndpr_pltime = ntohl(pi->nd_opt_pi_preferred_time); - if (in6_init_prefix_ltimes(&pr)) - continue; /* prefix lifetime init failed */ + /* + * Exceptions to stateless autoconfiguration processing: + * + nd6_accept_6to4 == 0 && address has 6to4 prefix + * + ip6_only_allow_rfc4193_prefix != 0 && address not RFC 4193 + */ + if (ip6_only_allow_rfc4193_prefix && + !IN6_IS_ADDR_UNIQUE_LOCAL(&pi->nd_opt_pi_prefix)) { + nd6log((LOG_INFO, + "nd6_ra_input: no SLAAC on prefix %s [not RFC 4193]\n", + ip6_sprintf(&pi->nd_opt_pi_prefix))); + pr.ndpr_raf_auto = 0; + } + else if (!nd6_accept_6to4 && + IN6_IS_ADDR_6TO4(&pi->nd_opt_pi_prefix)) { + nd6log((LOG_INFO, + "nd6_ra_input: no SLAAC on prefix %s [6to4]\n", + ip6_sprintf(&pi->nd_opt_pi_prefix))); + pr.ndpr_raf_auto = 0; + } - (void)prelist_update(&pr, dr, m); + if (in6_init_prefix_ltimes(&pr)) { + NDPR_UNLOCK(&pr); + lck_mtx_destroy(&pr.ndpr_lock, ifa_mtx_grp); + continue; /* prefix lifetime init failed */ + } else { + NDPR_UNLOCK(&pr); + } + (void)prelist_update(&pr, dr, m, mcast); + lck_mtx_destroy(&pr.ndpr_lock, 
ifa_mtx_grp); } } @@ -475,16 +628,20 @@ nd6_ra_input( * router's neighbor cache, which might also affect our on-link * detection of adveritsed prefixes. */ - pfxlist_onlink_check(0); + lck_mtx_lock(nd6_mutex); + pfxlist_onlink_check(); + lck_mtx_unlock(nd6_mutex); } freeit: m_freem(m); + if (dr) + NDDR_REMREF(dr); return; bad: icmp6stat.icp6s_badra++; - m_freem(m); + goto freeit; } /* @@ -503,13 +660,16 @@ nd6_rtmsg(cmd, rt) RT_LOCK_ASSERT_HELD(rt); bzero((caddr_t)&info, sizeof(info)); - /* Lock ifp for if_addrlist */ + /* Lock ifp for if_lladdr */ ifnet_lock_shared(ifp); info.rti_info[RTAX_DST] = rt_key(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rt_mask(rt); - info.rti_info[RTAX_IFP] = - TAILQ_FIRST(&ifp->if_addrlist)->ifa_addr; + /* + * ifa_addr pointers for both should always be valid + * in this context; no need to hold locks. + */ + info.rti_info[RTAX_IFP] = ifp->if_lladdr->ifa_addr; info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; rt_missmsg(cmd, &info, rt->rt_flags, 0); @@ -517,11 +677,21 @@ nd6_rtmsg(cmd, rt) } void -defrouter_addreq( - struct nd_defrouter *new) +defrouter_addreq(struct nd_defrouter *new, boolean_t scoped) { struct sockaddr_in6 def, mask, gate; struct rtentry *newrt = NULL; + unsigned int ifscope; + int err; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_NOTOWNED); + + if (new->stateflags & NDDRF_INSTALLED) + return; + + nd6log2((LOG_INFO, "%s: adding default router %s, scoped=%d, " + "static=%d\n", if_name(new->ifp), ip6_sprintf(&new->rtaddr), + scoped, (new->stateflags & NDDRF_STATIC) ? 1 : 0)); Bzero(&def, sizeof(def)); Bzero(&mask, sizeof(mask)); @@ -532,65 +702,28 @@ defrouter_addreq( def.sin6_family = mask.sin6_family = gate.sin6_family = AF_INET6; gate.sin6_addr = new->rtaddr; - (void) rtrequest(RTM_ADD, (struct sockaddr *)&def, + ifscope = scoped ? 
new->ifp->if_index : IFSCOPE_NONE; + + err = rtrequest_scoped(RTM_ADD, (struct sockaddr *)&def, (struct sockaddr *)&gate, (struct sockaddr *)&mask, - RTF_GATEWAY, &newrt); + RTF_GATEWAY, &newrt, ifscope); + if (newrt) { RT_LOCK(newrt); nd6_rtmsg(RTM_ADD, newrt); /* tell user process */ RT_REMREF_LOCKED(newrt); RT_UNLOCK(newrt); - } - return; -} - -/* Add a route to a given interface as default */ -void -defrouter_addifreq( - struct ifnet *ifp) -{ - struct sockaddr_in6 def, mask; - struct ifaddr *ifa = NULL; - struct rtentry *newrt = NULL; - int error; - u_int32_t flags; - - bzero(&def, sizeof(def)); - bzero(&mask, sizeof(mask)); - - def.sin6_len = mask.sin6_len = sizeof(struct sockaddr_in6); - def.sin6_family = mask.sin6_family = AF_INET6; - - /* - * Search for an ifaddr beloging to the specified interface. - * XXX: An IPv6 address are required to be assigned on the interface. - */ - if ((ifa = ifaof_ifpforaddr((struct sockaddr *)&def, ifp)) == NULL) { - nd6log((LOG_ERR, /* better error? 
*/ - "defrouter_addifreq: failed to find an ifaddr " - "to install a route to interface %s\n", - if_name(ifp))); - return; - } - - flags = ifa->ifa_flags; - error = rtrequest(RTM_ADD, (struct sockaddr *)&def, ifa->ifa_addr, - (struct sockaddr *)&mask, flags, &newrt); - if (error != 0) { - nd6log((LOG_ERR, - "defrouter_addifreq: failed to install a route to " - "interface %s (errno = %d)\n", - if_name(ifp), error)); + new->stateflags |= NDDRF_INSTALLED; + if (ifscope != IFSCOPE_NONE) + new->stateflags |= NDDRF_IFSCOPE; + new->genid = nd6_defrouter_genid; } else { - if (newrt) { - RT_LOCK(newrt); - nd6_rtmsg(RTM_ADD, newrt); - RT_REMREF_LOCKED(newrt); - RT_UNLOCK(newrt); - } - in6_post_msg(ifp, KEV_INET6_DEFROUTER, (struct in6_ifaddr *)ifa); + nd6log((LOG_ERR, "%s: failed to add default router " + "%s on %s scoped %d (errno = %d)\n", __func__, + ip6_sprintf(&gate.sin6_addr), if_name(new->ifp), + (ifscope != IFSCOPE_NONE), err)); } - ifafree(ifa); + new->err = err; } struct nd_defrouter * @@ -600,25 +733,47 @@ defrouter_lookup( { struct nd_defrouter *dr; - lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); for (dr = TAILQ_FIRST(&nd_defrouter); dr; dr = TAILQ_NEXT(dr, dr_entry)) { - if (dr->ifp == ifp && IN6_ARE_ADDR_EQUAL(addr, &dr->rtaddr)) + NDDR_LOCK(dr); + if (dr->ifp == ifp && IN6_ARE_ADDR_EQUAL(addr, &dr->rtaddr)) { + NDDR_ADDREF_LOCKED(dr); + NDDR_UNLOCK(dr); return(dr); + } + NDDR_UNLOCK(dr); } - return(NULL); /* search failed */ + return (NULL); /* search failed */ } +/* + * Remove the default route for a given router. + * This is just a subroutine function for defrouter_select(), and should + * not be called from anywhere else. 
+ */ void -defrouter_delreq( - struct nd_defrouter *dr, - int dofree) +defrouter_delreq(struct nd_defrouter *dr) { struct sockaddr_in6 def, mask, gate; struct rtentry *oldrt = NULL; + unsigned int ifscope; + int err; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_NOTOWNED); + + /* ifp would be NULL for the "drany" case */ + if (dr->ifp != NULL && !(dr->stateflags & NDDRF_INSTALLED)) + return; + + NDDR_LOCK_ASSERT_HELD(dr); + + nd6log2((LOG_INFO, "%s: removing default router %s, scoped=%d, " + "static=%d\n", dr->ifp != NULL ? if_name(dr->ifp) : "ANY", + ip6_sprintf(&dr->rtaddr), (dr->stateflags & NDDRF_IFSCOPE) ? 1 : 0, + (dr->stateflags & NDDRF_STATIC) ? 1 : 0)); Bzero(&def, sizeof(def)); Bzero(&mask, sizeof(mask)); @@ -629,28 +784,155 @@ defrouter_delreq( def.sin6_family = mask.sin6_family = gate.sin6_family = AF_INET6; gate.sin6_addr = dr->rtaddr; - (void) rtrequest(RTM_DELETE, (struct sockaddr *)&def, - (struct sockaddr *)&gate, (struct sockaddr *)&mask, - RTF_GATEWAY, &oldrt); + if (dr->ifp != NULL) { + ifscope = (dr->stateflags & NDDRF_IFSCOPE) ? + dr->ifp->if_index : IFSCOPE_NONE; + } else { + ifscope = IFSCOPE_NONE; + } + err = rtrequest_scoped(RTM_DELETE, + (struct sockaddr *)&def, (struct sockaddr *)&gate, + (struct sockaddr *)&mask, RTF_GATEWAY, &oldrt, ifscope); + if (oldrt) { RT_LOCK(oldrt); nd6_rtmsg(RTM_DELETE, oldrt); RT_UNLOCK(oldrt); rtfree(oldrt); + } else if (err != ESRCH) { + nd6log((LOG_ERR, "%s: failed to delete default router " + "%s on %s scoped %d (errno = %d)\n", __func__, + ip6_sprintf(&gate.sin6_addr), dr->ifp != NULL ? 
+ if_name(dr->ifp) : "ANY", (ifscope != IFSCOPE_NONE), err)); + } + /* ESRCH means it's no longer in the routing table; ignore it */ + if (oldrt != NULL || err == ESRCH) { + dr->stateflags &= ~NDDRF_INSTALLED; + if (ifscope != IFSCOPE_NONE) + dr->stateflags &= ~NDDRF_IFSCOPE; + } + dr->err = 0; +} + + +/* + * remove all default routes from default router list + */ +void +defrouter_reset(void) +{ + struct nd_defrouter *dr, drany; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + + dr = TAILQ_FIRST(&nd_defrouter); + while (dr) { + NDDR_LOCK(dr); + if (dr->stateflags & NDDRF_INSTALLED) { + NDDR_ADDREF_LOCKED(dr); + NDDR_UNLOCK(dr); + lck_mtx_unlock(nd6_mutex); + NDDR_LOCK(dr); + defrouter_delreq(dr); + NDDR_UNLOCK(dr); + lck_mtx_lock(nd6_mutex); + NDDR_REMREF(dr); + dr = TAILQ_FIRST(&nd_defrouter); + } else { + NDDR_UNLOCK(dr); + dr = TAILQ_NEXT(dr, dr_entry); + } + } + + /* Nuke primary (non-scoped) default router */ + if (ip6_doscopedroute) { + bzero(&drany, sizeof (drany)); + lck_mtx_init(&drany.nddr_lock, ifa_mtx_grp, ifa_mtx_attr); + lck_mtx_unlock(nd6_mutex); + NDDR_LOCK(&drany); + defrouter_delreq(&drany); + NDDR_UNLOCK(&drany); + lck_mtx_destroy(&drany.nddr_lock, ifa_mtx_grp); + lck_mtx_lock(nd6_mutex); + } + +} + +int +defrtrlist_ioctl(u_long cmd, caddr_t data) +{ + struct in6_defrouter_32 *r_32 = (struct in6_defrouter_32 *)data; + struct in6_defrouter_64 *r_64 = (struct in6_defrouter_64 *)data; + struct nd_defrouter dr0; + unsigned int ifindex; + struct ifnet *dr_ifp; + int error = 0, add = 0; + + switch (cmd) { + case SIOCDRADD_IN6_32: + case SIOCDRADD_IN6_64: + ++add; + /* FALLTHRU */ + case SIOCDRDEL_IN6_32: + case SIOCDRDEL_IN6_64: + bzero(&dr0, sizeof (dr0)); + if (cmd == SIOCDRADD_IN6_64 || cmd == SIOCDRDEL_IN6_64) { + dr0.rtaddr = r_64->rtaddr.sin6_addr; + dr0.flags = r_64->flags; + ifindex = r_64->if_index; + } else { + dr0.rtaddr = r_32->rtaddr.sin6_addr; + dr0.flags = r_32->flags; + ifindex = r_32->if_index; + } + ifnet_head_lock_shared(); + 
/* Don't need to check is ifindex is < 0 since it's unsigned */ + if (if_index < ifindex || + (dr_ifp = ifindex2ifnet[ifindex]) == NULL) { + ifnet_head_done(); + error = EINVAL; + break; + } + dr0.ifp = dr_ifp; + ifnet_head_done(); + + if (IN6_IS_SCOPE_EMBED(&dr0.rtaddr)) { + uint16_t *scope = &dr0.rtaddr.s6_addr16[1]; + + if (*scope == 0) { + *scope = htons(dr_ifp->if_index); + } else if (*scope != htons(dr_ifp->if_index)) { + error = EINVAL; + break; + } + } + + if (add) + error = defrtrlist_add_static(&dr0); + if (!add || error != 0) { + int err = defrtrlist_del_static(&dr0); + if (!add) + error = err; + } + break; + + default: + error = EOPNOTSUPP; /* check for safety */ + break; } - if (dofree) /* XXX: necessary? */ - FREE(dr, M_IP6NDP); + return (error); } void -defrtrlist_del( - struct nd_defrouter *dr, int nd6locked) +defrtrlist_del(struct nd_defrouter *dr) { struct nd_defrouter *deldr = NULL; struct nd_prefix *pr; struct ifnet *ifp = dr->ifp; + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + /* * Flush all the routing table entries that use the router * as a next hop. @@ -658,38 +940,60 @@ defrtrlist_del( if (!ip6_forwarding && (ip6_accept_rtadv || (ifp->if_eflags & IFEF_ACCEPT_RTADVD))) { /* above is a good condition? */ + NDDR_ADDREF(dr); + lck_mtx_unlock(nd6_mutex); rt6_flush(&dr->rtaddr, ifp); + lck_mtx_lock(nd6_mutex); + NDDR_REMREF(dr); } - if (nd6locked == 0) - lck_mtx_lock(nd6_mutex); if (dr == TAILQ_FIRST(&nd_defrouter)) deldr = dr; /* The router is primary. */ TAILQ_REMOVE(&nd_defrouter, dr, dr_entry); + ++nd6_defrouter_genid; + + nd6log2((LOG_INFO, "%s: freeing defrouter %s\n", if_name(dr->ifp), + ip6_sprintf(&dr->rtaddr))); + + /* + * Delete it from the routing table. + */ + NDDR_ADDREF(dr); + lck_mtx_unlock(nd6_mutex); + NDDR_LOCK(dr); + defrouter_delreq(dr); + NDDR_UNLOCK(dr); + lck_mtx_lock(nd6_mutex); + NDDR_REMREF(dr); /* * Also delete all the pointers to the router in each prefix lists. 
*/ for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { struct nd_pfxrouter *pfxrtr; + + NDPR_LOCK(pr); if ((pfxrtr = pfxrtr_lookup(pr, dr)) != NULL) pfxrtr_del(pfxrtr); + NDPR_UNLOCK(pr); } - pfxlist_onlink_check(1); + + pfxlist_onlink_check(); /* - * If the router is the primary one, choose a new one. - * Note that defrouter_select() will remove the current gateway - * from the routing table. + * If the router is the primary one, choose a new one. If Scoped + * Routing is enabled, always try to pick another eligible router + * on this interface. */ - if (deldr) - defrouter_select(); + if ((deldr || ip6_doscopedroute) && !ip6_forwarding && + (ip6_accept_rtadv || (ifp->if_eflags & IFEF_ACCEPT_RTADVD))) + defrouter_select(ifp); lck_rw_lock_shared(nd_if_rwlock); if (ifp->if_index < nd_ifinfo_indexlim) { struct nd_ifinfo *ndi = &nd_ifinfo[ifp->if_index]; - ndi->ndefrouters--; + atomic_add_32(&ndi->ndefrouters, -1); if (ndi->ndefrouters < 0) { log(LOG_WARNING, "defrtrlist_del: negative " "count on %s\n", if_name(ifp)); @@ -697,177 +1001,822 @@ defrtrlist_del( } lck_rw_done(nd_if_rwlock); - if (nd6locked == 0) - lck_mtx_unlock(nd6_mutex); + NDDR_REMREF(dr); /* remove list reference */ +} + +int +defrtrlist_add_static(struct nd_defrouter *new) +{ + struct nd_defrouter *dr; + int err = 0; + + new->rtlifetime = -1; + new->stateflags |= NDDRF_STATIC; + + /* we only want the preference level */ + new->flags &= ND_RA_FLAG_RTPREF_MASK; + + lck_mtx_lock(nd6_mutex); + dr = defrouter_lookup(&new->rtaddr, new->ifp); + if (dr != NULL && !(dr->stateflags & NDDRF_STATIC)) { + err = EINVAL; + } else { + if (dr != NULL) + NDDR_REMREF(dr); + dr = defrtrlist_update(new); + if (dr != NULL) + err = dr->err; + else + err = ENOMEM; + } + if (dr != NULL) + NDDR_REMREF(dr); + lck_mtx_unlock(nd6_mutex); + + return (err); +} + +int +defrtrlist_del_static(struct nd_defrouter *new) +{ + struct nd_defrouter *dr; + + lck_mtx_lock(nd6_mutex); + dr = defrouter_lookup(&new->rtaddr, new->ifp); + if 
(dr == NULL || !(dr->stateflags & NDDRF_STATIC)) { + if (dr != NULL) + NDDR_REMREF(dr); + dr = NULL; + } else { + defrtrlist_del(dr); + NDDR_REMREF(dr); + } + lck_mtx_unlock(nd6_mutex); - FREE(dr, M_IP6NDP); + return (dr != NULL ? 0 : EINVAL); +} + +/* + * for default router selection + * regards router-preference field as a 2-bit signed integer + */ +static int +rtpref(struct nd_defrouter *dr) +{ + switch (dr->flags & ND_RA_FLAG_RTPREF_MASK) { + case ND_RA_FLAG_RTPREF_HIGH: + return (RTPREF_HIGH); + case ND_RA_FLAG_RTPREF_MEDIUM: + case ND_RA_FLAG_RTPREF_RSV: + return (RTPREF_MEDIUM); + case ND_RA_FLAG_RTPREF_LOW: + return (RTPREF_LOW); + default: + /* + * This case should never happen. If it did, it would mean a + * serious bug of kernel internal. We thus always bark here. + * Or, can we even panic? + */ + log(LOG_ERR, "rtpref: impossible RA flag %x\n", dr->flags); + return (RTPREF_INVALID); + } + /* NOTREACHED */ } /* - * Default Router Selection according to Section 6.3.6 of RFC 2461: - * 1) Routers that are reachable or probably reachable should be - * preferred. + * Default Router Selection according to Section 6.3.6 of RFC 2461 and + * draft-ietf-ipngwg-router-selection: + * + * 1) Routers that are reachable or probably reachable should be preferred. + * If we have more than one (probably) reachable router, prefer ones + * with the highest router preference. * 2) When no routers on the list are known to be reachable or * probably reachable, routers SHOULD be selected in a round-robin - * fashion. + * fashion, regardless of router preference values. * 3) If the Default Router List is empty, assume that all * destinations are on-link. + * + * When Scoped Routing is enabled, the selection logic is amended as follows: + * + * a) When a default interface is specified, the primary/non-scoped default + * router will be set to the reachable router on that link (if any) with + * the highest router preference. 
+ * b) When there are more than one routers on the same link, the one with + * the highest router preference will be installed, either as scoped or + * non-scoped route entry. If they all share the same preference value, + * the one installed will be the static or the first encountered reachable + * router, i.e. static one wins over dynamic. + * c) When no routers on the list are known to be reachable, or probably + * reachable, no round-robin selection will take place when the default + * interface is set. + * + * We assume nd_defrouter is sorted by router preference value. + * Since the code below covers both with and without router preference cases, + * we do not need to classify the cases by ifdef. */ -void -defrouter_select() +static void +defrouter_select_common(struct ifnet *ifp, int ignore) { - struct nd_defrouter *dr, anydr; + struct nd_defrouter *dr, *selected_dr = NULL, *installed_dr = NULL; + struct nd_defrouter *installed_dr0 = NULL; struct rtentry *rt = NULL; struct llinfo_nd6 *ln = NULL; + int update = 0; + boolean_t found_installedrt = FALSE; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); /* - * Search for a (probably) reachable router from the list. + * This function should be called only when acting as an autoconfigured + * host. Although the remaining part of this function is not effective + * if the node is not an autoconfigured host, we explicitly exclude + * such cases here for safety. */ - lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + if (ip6_forwarding || (!ignore && !ip6_accept_rtadv && + !(ifp->if_eflags & IFEF_ACCEPT_RTADVD))) { + nd6log((LOG_WARNING, + "defrouter_select: called unexpectedly (forwarding=%d, " + "accept_rtadv=%d)\n", ip6_forwarding, ip6_accept_rtadv)); + return; + } + + /* + * Let's handle easy case (3) first: + * If default router list is empty, there's nothing to be done. 
+ */ + if (!TAILQ_FIRST(&nd_defrouter)) + return; + + /* + * Due to the number of times we drop nd6_mutex, we need to + * serialize this function. + */ + while (nd_defrouter_busy) { + nd_defrouter_waiters++; + msleep(nd_defrouter_waitchan, nd6_mutex, (PZERO-1), + __func__, NULL); + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + } + nd_defrouter_busy = TRUE; + /* + * Search for a (probably) reachable router from the list. + * We just pick up the first reachable one (if any), assuming that + * the ordering rule of the list described in defrtrlist_update(). + * + * For all intents and purposes of Scoped Routing: + * selected_dr = candidate for primary router + * installed_dr = currently installed primary router + */ for (dr = TAILQ_FIRST(&nd_defrouter); dr; dr = TAILQ_NEXT(dr, dr_entry)) { + boolean_t reachable; + /* Callee returns a locked route upon success */ + reachable = FALSE; + NDDR_ADDREF(dr); /* for this for loop */ + lck_mtx_unlock(nd6_mutex); if ((rt = nd6_lookup(&dr->rtaddr, 0, dr->ifp, 0)) != NULL) { RT_LOCK_ASSERT_HELD(rt); if ((ln = rt->rt_llinfo) != NULL && ND6_IS_LLINFO_PROBREACH(ln)) { - RT_REMREF_LOCKED(rt); - RT_UNLOCK(rt); - /* Got it, and move it to the head */ - TAILQ_REMOVE(&nd_defrouter, dr, dr_entry); - TAILQ_INSERT_HEAD(&nd_defrouter, dr, dr_entry); - break; + reachable = TRUE; + if (selected_dr == NULL && + (!ip6_doscopedroute || + dr->ifp == nd6_defifp)) { + selected_dr = dr; + NDDR_ADDREF(selected_dr); + } } RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); + rt = NULL; } - } + lck_mtx_lock(nd6_mutex); - if ((dr = TAILQ_FIRST(&nd_defrouter))) { - /* - * De-install the previous default gateway and install - * a new one. - * Note that if there is no reachable router in the list, - * the head entry will be used anyway. - * XXX: do we have to check the current routing table entry? 
- */ - bzero(&anydr, sizeof(anydr)); - defrouter_delreq(&anydr, 0); - defrouter_addreq(dr); - } - else { - /* - * The Default Router List is empty, so install the default - * route to an inteface. - * XXX: The specification does not say this mechanism should - * be restricted to hosts, but this would be not useful - * (even harmful) for routers. - */ - if (!ip6_forwarding) { + /* Handle case (b) */ + if (ip6_doscopedroute && dr->ifp == nd6_defifp && + (selected_dr == NULL || rtpref(dr) > rtpref(selected_dr) || + (rtpref(dr) == rtpref(selected_dr) && + (dr->stateflags & NDDRF_STATIC) && + !(selected_dr->stateflags & NDDRF_STATIC)))) { + if (selected_dr) + NDDR_REMREF(selected_dr); + selected_dr = dr; + NDDR_ADDREF(selected_dr); + } + + if (!(dr->stateflags & NDDRF_INSTALLED)) { /* - * De-install the current default route - * in advance. + * If the router hasn't been installed and it is + * reachable, try to install it later on below. + * If it's static, try to install it anyway. */ - bzero(&anydr, sizeof(anydr)); - defrouter_delreq(&anydr, 0); - if (nd6_defifp) { - /* - * Install a route to the default interface - * as default route. - * XXX: we enable this for host only, because - * this may override a default route installed - * a user process (e.g. routing daemon) in a - * router case. - */ - defrouter_addifreq(nd6_defifp); + if (reachable || (dr->stateflags & NDDRF_STATIC)) { + dr->genid = -1; + ++update; + nd6log2((LOG_INFO, "%s: possible router %s, " + "scoped=%d, static=%d\n", if_name(dr->ifp), + ip6_sprintf(&dr->rtaddr), + (dr->stateflags & NDDRF_IFSCOPE) ? 1 : 0, + (dr->stateflags & NDDRF_STATIC) ? 
1 : 0)); + } + NDDR_REMREF(dr); /* for this for loop */ + continue; + } + + /* Record the currently installed primary/non-scoped router */ + if (!ip6_doscopedroute || !(dr->stateflags & NDDRF_IFSCOPE)) { + if (installed_dr == NULL) { + installed_dr = dr; + NDDR_ADDREF(installed_dr); } else { - nd6log((LOG_INFO, "defrouter_select: " - "there's no default router and no default" - " interface\n")); + /* this should not happen; warn for diagnosis */ + log(LOG_ERR, "defrouter_select: more than one " + "%s default router is installed\n", + ip6_doscopedroute ? "non-scoped" : ""); } } + NDDR_REMREF(dr); /* for this for loop */ } - return; -} + /* If none was selected, use the currently installed one */ + if (ip6_doscopedroute && selected_dr == NULL && installed_dr != NULL) { + selected_dr = installed_dr; + NDDR_ADDREF(selected_dr); + } -static struct nd_defrouter * -defrtrlist_update( - struct nd_defrouter *new) -{ - struct nd_defrouter *dr, *n; - struct ifnet *ifp = new->ifp; - struct nd_ifinfo *ndi; + /* + * Install the unreachable one(s) if necesssary. 
+ */ + for (dr = TAILQ_FIRST(&nd_defrouter); dr; + dr = TAILQ_NEXT(dr, dr_entry)) { + struct nd_defrouter *_dr; - lck_mtx_lock(nd6_mutex); - if ((dr = defrouter_lookup(&new->rtaddr, ifp)) != NULL) { - /* entry exists */ - if (new->rtlifetime == 0) { - defrtrlist_del(dr, 1); - dr = NULL; - } else { - /* override */ - dr->flags = new->flags; /* xxx flag check */ + if (!ip6_doscopedroute) + break; + + NDDR_LOCK(dr); + + /* If already (or will be) installed, skip */ + if ((dr->stateflags & NDDRF_INSTALLED) || dr->genid == -1) { + NDDR_UNLOCK(dr); + continue; + } + + /* See if there is already a default router for the link */ + for (_dr = TAILQ_FIRST(&nd_defrouter); _dr; + _dr = TAILQ_NEXT(_dr, dr_entry)) { + if (_dr != dr) + NDDR_LOCK(_dr); + if (_dr == dr || _dr->ifp != dr->ifp) { + if (_dr != dr) + NDDR_UNLOCK(_dr); + continue; + } + + if ((_dr->stateflags & NDDRF_INSTALLED) || + _dr->genid == -1) { + if (_dr != dr) + NDDR_UNLOCK(_dr); + break; + } + if (_dr != dr) + NDDR_UNLOCK(_dr); + } + + /* If none so far, schedule it to be installed below */ + if (_dr == NULL) { + dr->genid = -1; + ++update; + nd6log2((LOG_INFO, "%s: possible router %s, " + "static=%d (unreachable)\n", if_name(dr->ifp), + ip6_sprintf(&dr->rtaddr), + (dr->stateflags & NDDRF_STATIC) ? 1 : 0)); + } + NDDR_UNLOCK(dr); + } + + dr = selected_dr; + if (dr != NULL) { + nd6log2((LOG_INFO, "%s: considering primary default router %s, " + "static=%d [round 1]\n", if_name(dr->ifp), + ip6_sprintf(&dr->rtaddr), + (dr->stateflags & NDDRF_STATIC) ? 1 : 0)); + } + + /* + * If none of the default routers was found to be reachable, + * round-robin the list regardless of preference, except when + * Scoped Routing is enabled per case (c). + * + * Otherwise, if we have an installed router, check if the selected + * (reachable) router should really be preferred to the installed one. + * We only prefer the new router when the old one is not reachable + * or when the new one has a really higher preference value. 
+ */ + if (!ip6_doscopedroute && selected_dr == NULL) { + if (installed_dr == NULL || + !TAILQ_NEXT(installed_dr, dr_entry)) { + selected_dr = TAILQ_FIRST(&nd_defrouter); + if (selected_dr) + NDDR_ADDREF(selected_dr); + } else { + selected_dr = TAILQ_NEXT(installed_dr, dr_entry); + if (selected_dr) + NDDR_ADDREF(selected_dr); + } + } else if (selected_dr != NULL && installed_dr != NULL) { + lck_mtx_unlock(nd6_mutex); + rt = nd6_lookup(&installed_dr->rtaddr, 0, installed_dr->ifp, 0); + if (rt) { + RT_LOCK_ASSERT_HELD(rt); + if ((ln = (struct llinfo_nd6 *)rt->rt_llinfo) && + ND6_IS_LLINFO_PROBREACH(ln) && + (!ip6_doscopedroute || + installed_dr->ifp == nd6_defifp) && + rtpref(selected_dr) <= rtpref(installed_dr)) { + NDDR_REMREF(selected_dr); + selected_dr = installed_dr; + NDDR_ADDREF(selected_dr); + } + RT_REMREF_LOCKED(rt); + RT_UNLOCK(rt); + rt = NULL; + found_installedrt = TRUE; + } + lck_mtx_lock(nd6_mutex); + } + + if (ip6_doscopedroute) { + /* + * If the installed primary router is not on the current + * IPv6 default interface, demote it to a scoped entry. + */ + if (installed_dr != NULL && installed_dr->ifp != nd6_defifp && + !(installed_dr->stateflags & NDDRF_IFSCOPE)) { + if (selected_dr != NULL && + selected_dr->ifp != nd6_defifp) { + NDDR_REMREF(selected_dr); + selected_dr = NULL; + } + ++update; + } + + /* + * If the selected router is currently scoped, make sure + * we update (it needs to be promoted to primary.) + */ + if (selected_dr != NULL && + (selected_dr->stateflags & NDDRF_IFSCOPE)) + ++update; + + /* + * If the installed router is no longe reachable, remove + * it and install the selected router instead. 
+ */ + if (installed_dr != NULL && selected_dr != NULL && + installed_dr != selected_dr && found_installedrt == FALSE) { + installed_dr0 = installed_dr; /* skip it below */ + /* NB: we previousled referenced installed_dr */ + installed_dr = NULL; + selected_dr->genid = -1; + ++update; + } + } + + /* + * If Scoped Routing is enabled and there's nothing to update, + * just return. Otherwise, if Scoped Routing is disabled and if + * the selected router is different than the installed one, + * remove the installed router and install the selected one. + */ + dr = selected_dr; + VERIFY(dr != NULL || ip6_doscopedroute); + if (!ip6_doscopedroute || !update) { + if (dr == NULL) + goto out; + + if (dr != installed_dr) { + nd6log2((LOG_INFO, "%s: no update, selected router %s, " + "installed router %s\n", if_name(dr->ifp), + ip6_sprintf(&dr->rtaddr), installed_dr != NULL ? + ip6_sprintf(&installed_dr->rtaddr) : "NONE")); + } else { + nd6log2((LOG_INFO, "%s: no update, router is %s\n", + if_name(dr->ifp), ip6_sprintf(&dr->rtaddr))); + } + if (!ip6_doscopedroute && installed_dr != dr) { + /* + * No need to ADDREF dr because at this point + * dr points to selected_dr, which already holds + * a reference. + */ + lck_mtx_unlock(nd6_mutex); + if (installed_dr) { + NDDR_LOCK(installed_dr); + defrouter_delreq(installed_dr); + NDDR_UNLOCK(installed_dr); + } + NDDR_LOCK(dr); + defrouter_addreq(dr, FALSE); + NDDR_UNLOCK(dr); + lck_mtx_lock(nd6_mutex); + } + goto out; + } + + /* + * Scoped Routing is enabled and we need to update. The selected + * router needs to be installed as primary/non-scoped entry. If + * there is any existing entry that is non-scoped, remove it from + * the routing table and reinstall it as scoped entry. + */ + if (dr != NULL) { + nd6log2((LOG_INFO, "%s: considering primary default router %s, " + "static=%d [round 2]\n", if_name(dr->ifp), + ip6_sprintf(&dr->rtaddr), + (dr->stateflags & NDDRF_STATIC) ? 
1 : 0)); + } + + /* + * On the following while loops we use two flags: + * dr->genid + * NDDRF_PROCESSED + * + * genid is used to skip entries that are not to be added/removed on the + * second while loop. + * NDDRF_PROCESSED is used to skip entries that were already processed. + * This is necessary because we drop the nd6_mutex and start the while + * loop again. + */ + TAILQ_FOREACH(dr, &nd_defrouter, dr_entry) { + NDDR_LOCK(dr); + VERIFY((dr->stateflags & NDDRF_PROCESSED) == 0); + NDDR_UNLOCK(dr); + } + /* Remove conflicting entries */ + dr = TAILQ_FIRST(&nd_defrouter); + while (dr) { + NDDR_LOCK(dr); + if (!(dr->stateflags & NDDRF_INSTALLED) || + dr->stateflags & NDDRF_PROCESSED) { + NDDR_UNLOCK(dr); + dr = TAILQ_NEXT(dr, dr_entry); + continue; + } + dr->stateflags |= NDDRF_PROCESSED; + + /* A NULL selected_dr will remove primary default route */ + if ((dr == selected_dr && (dr->stateflags & NDDRF_IFSCOPE)) || + (dr != selected_dr && !(dr->stateflags & NDDRF_IFSCOPE))) { + NDDR_ADDREF_LOCKED(dr); + NDDR_UNLOCK(dr); + lck_mtx_unlock(nd6_mutex); + NDDR_LOCK(dr); + defrouter_delreq(dr); + NDDR_UNLOCK(dr); + lck_mtx_lock(nd6_mutex); + NDDR_LOCK(dr); + if (dr && dr != installed_dr0) + dr->genid = -1; + NDDR_UNLOCK(dr); + NDDR_REMREF(dr); + /* + * Since we lost nd6_mutex, we have to start over. 
+ */ + dr = TAILQ_FIRST(&nd_defrouter); + continue; + } + NDDR_UNLOCK(dr); + dr = TAILQ_NEXT(dr, dr_entry); + } + + /* -1 is a special number, make sure we don't use it for genid */ + if (++nd6_defrouter_genid == -1) + nd6_defrouter_genid = 1; + + TAILQ_FOREACH(dr, &nd_defrouter, dr_entry) { + NDDR_LOCK(dr); + dr->stateflags &= ~NDDRF_PROCESSED; + NDDR_UNLOCK(dr); + } + /* Add the entries back */ + dr = TAILQ_FIRST(&nd_defrouter); + while (dr) { + struct nd_defrouter *_dr; + + NDDR_LOCK(dr); + if (dr->stateflags & NDDRF_PROCESSED || + dr->genid != -1) { + NDDR_UNLOCK(dr); + dr = TAILQ_NEXT(dr, dr_entry); + continue; + } + dr->stateflags |= NDDRF_PROCESSED; + + /* Handle case (b) */ + for (_dr = TAILQ_FIRST(&nd_defrouter); _dr; + _dr = TAILQ_NEXT(_dr, dr_entry)) { + if (_dr == dr) + continue; + /* + * This is safe because we previously checked if + * _dr == dr. + */ + NDDR_LOCK(_dr); + if (_dr->ifp == dr->ifp && rtpref(_dr) >= rtpref(dr) && + (_dr->stateflags & NDDRF_INSTALLED)) { + NDDR_ADDREF_LOCKED(_dr); + NDDR_UNLOCK(_dr); + break; + } + NDDR_UNLOCK(_dr); + } + + /* If same preference and i/f, static entry takes precedence */ + if (_dr != NULL && rtpref(_dr) == rtpref(dr) && + !(_dr->stateflags & NDDRF_STATIC) && + (dr->stateflags & NDDRF_STATIC)) { + lck_mtx_unlock(nd6_mutex); + NDDR_LOCK(_dr); + defrouter_delreq(_dr); + NDDR_UNLOCK(_dr); + lck_mtx_lock(nd6_mutex); + NDDR_REMREF(_dr); + _dr = NULL; + } + + if (_dr == NULL && !(dr->stateflags & NDDRF_INSTALLED)) { + NDDR_ADDREF_LOCKED(dr); + NDDR_UNLOCK(dr); + lck_mtx_unlock(nd6_mutex); + NDDR_LOCK(dr); + defrouter_addreq(dr, (selected_dr == NULL || + dr->ifp != selected_dr->ifp)); + dr->genid = nd6_defrouter_genid; + NDDR_UNLOCK(dr); + lck_mtx_lock(nd6_mutex); + NDDR_REMREF(dr); + /* + * Since we lost nd6_mutex, we have to start over. 
+ */ + dr = TAILQ_FIRST(&nd_defrouter); + continue; + } + NDDR_UNLOCK(dr); + dr = TAILQ_NEXT(dr, dr_entry); + } +out: + TAILQ_FOREACH(dr, &nd_defrouter, dr_entry) { + NDDR_LOCK(dr); + dr->stateflags &= ~NDDRF_PROCESSED; + NDDR_UNLOCK(dr); + } + if (selected_dr) + NDDR_REMREF(selected_dr); + if (installed_dr) + NDDR_REMREF(installed_dr); + if (installed_dr0) + NDDR_REMREF(installed_dr0); + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + VERIFY(nd_defrouter_busy); + nd_defrouter_busy = FALSE; + if (nd_defrouter_waiters > 0) { + nd_defrouter_waiters = 0; + wakeup(nd_defrouter_waitchan); + } +} + +void +defrouter_select(struct ifnet *ifp) +{ + return (defrouter_select_common(ifp, 0)); +} + +static struct nd_defrouter * +defrtrlist_update_common(struct nd_defrouter *new, boolean_t scoped) +{ + struct nd_defrouter *dr, *n; + struct ifnet *ifp = new->ifp; + struct nd_ifinfo *ndi; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + + if ((dr = defrouter_lookup(&new->rtaddr, ifp)) != NULL) { + /* entry exists */ + if (new->rtlifetime == 0) { + defrtrlist_del(dr); + NDDR_REMREF(dr); + dr = NULL; + } else { + int oldpref = rtpref(dr); + + /* override */ + dr->flags = new->flags; /* xxx flag check */ dr->rtlifetime = new->rtlifetime; dr->expire = new->expire; + + /* + * If the preference does not change, there's no need + * to sort the entries. If Scoped Routing is enabled, + * put the primary/non-scoped router at the top of the + * list of routers in the same preference band, unless + * it's already at that position. 
+ */ + if (ip6_doscopedroute) { + struct nd_defrouter *p = NULL; + + /* same preference and scoped; just return */ + if (rtpref(new) == oldpref && scoped) + return (dr); + + n = TAILQ_FIRST(&nd_defrouter); + while (n != NULL) { + /* preference changed; sort it */ + if (rtpref(new) != oldpref) + break; + + /* not at the top of band; sort it */ + if (n != dr && rtpref(n) == oldpref && + (!p || rtpref(p) > rtpref(n))) + break; + + p = n; + n = TAILQ_NEXT(n, dr_entry); + } + + /* nothing has changed, just return */ + if (n == NULL && (scoped || + !(dr->stateflags & NDDRF_IFSCOPE))) + return (dr); + } else if (rtpref(new) == oldpref) { + return (dr); + } + + /* + * preferred router may be changed, so relocate + * this router. + * XXX: calling TAILQ_REMOVE directly is a bad manner. + * However, since defrtrlist_del() has many side + * effects, we intentionally do so here. + * defrouter_select() below will handle routing + * changes later. + */ + TAILQ_REMOVE(&nd_defrouter, dr, dr_entry); + new->stateflags = dr->stateflags; + new->stateflags &= ~NDDRF_PROCESSED; + + lck_rw_lock_shared(nd_if_rwlock); + VERIFY(ifp->if_index < nd_ifinfo_indexlim); + ndi = &nd_ifinfo[ifp->if_index]; + lck_rw_done(nd_if_rwlock); + n = dr; + goto insert; } - lck_mtx_unlock(nd6_mutex); - return(dr); + return (dr); } + VERIFY(dr == NULL); + /* entry does not exist */ if (new->rtlifetime == 0) { - lck_mtx_unlock(nd6_mutex); return(NULL); } - n = (struct nd_defrouter *)_MALLOC(sizeof(*n), M_IP6NDP, M_NOWAIT); + n = nddr_alloc(M_WAITOK); if (n == NULL) { - lck_mtx_unlock(nd6_mutex); return(NULL); } lck_rw_lock_shared(nd_if_rwlock); + ndi = &nd_ifinfo[ifp->if_index]; if (ifp->if_index >= nd_ifinfo_indexlim) goto freeit; - ndi = &nd_ifinfo[ifp->if_index]; if (ip6_maxifdefrouters >= 0 && ndi->ndefrouters >= ip6_maxifdefrouters) { freeit: lck_rw_done(nd_if_rwlock); - lck_mtx_unlock(nd6_mutex); - FREE(n, M_IP6NDP); + nddr_free(n); return (NULL); } - ndi->ndefrouters++; + + NDDR_ADDREF(n); /* for the 
nd_defrouter list */ + NDDR_ADDREF(n); /* for the caller */ + + ++nd6_defrouter_genid; + atomic_add_32(&ndi->ndefrouters, 1); lck_rw_done(nd_if_rwlock); - bzero(n, sizeof(*n)); - *n = *new; + nd6log2((LOG_INFO, "%s: allocating defrouter %s\n", if_name(ifp), + ip6_sprintf(&new->rtaddr))); + + NDDR_LOCK(n); + memcpy(&n->rtaddr, &new->rtaddr, sizeof(n->rtaddr)); + n->flags = new->flags; + n->stateflags = new->stateflags; + n->stateflags &= ~NDDRF_PROCESSED; + n->rtlifetime = new->rtlifetime; + n->expire = new->expire; + n->ifp = new->ifp; + n->genid = new->genid; + n->err = new->err; + NDDR_UNLOCK(n); +insert: /* - * Insert the new router at the end of the Default Router List. - * If there is no other router, install it anyway. Otherwise, - * just continue to use the current default router. + * Insert the new router in the Default Router List; + * The Default Router List should be in the descending order + * of router-preferece. When Scoped Routing is disabled, routers + * with the same preference are sorted in the arriving time order; + * otherwise, the first entry in the list of routers having the same + * preference is the primary default router, when the interface used + * by the entry is the default interface. 
*/ - TAILQ_INSERT_TAIL(&nd_defrouter, n, dr_entry); - if (TAILQ_FIRST(&nd_defrouter) == n) - defrouter_select(); - lck_mtx_unlock(nd6_mutex); - return(n); + /* insert at the end of the group */ + for (dr = TAILQ_FIRST(&nd_defrouter); dr; + dr = TAILQ_NEXT(dr, dr_entry)) { + if (rtpref(n) > rtpref(dr) || + (ip6_doscopedroute && !scoped && rtpref(n) == rtpref(dr))) + break; + } + if (dr) + TAILQ_INSERT_BEFORE(dr, n, dr_entry); + else + TAILQ_INSERT_TAIL(&nd_defrouter, n, dr_entry); + + /* Ignore auto-configuration checks for static route entries */ + defrouter_select_common(ifp, (n->stateflags & NDDRF_STATIC)); + + return (n); +} + +static struct nd_defrouter * +defrtrlist_update(struct nd_defrouter *new) +{ + struct nd_defrouter *dr; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + dr = defrtrlist_update_common(new, + (nd6_defifp != NULL && new->ifp != nd6_defifp)); + + return (dr); +} + +static void +defrtrlist_sync(struct ifnet *ifp) +{ + struct nd_defrouter *dr, new; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + + if (!ip6_doscopedroute) { + defrouter_select(ifp); + return; + } + + for (dr = TAILQ_FIRST(&nd_defrouter); dr; + dr = TAILQ_NEXT(dr, dr_entry)) { + NDDR_LOCK(dr); + if (dr->ifp == ifp && (dr->stateflags & NDDRF_INSTALLED)) + break; + NDDR_UNLOCK(dr); + } + + if (dr == NULL) { + /* + * Set ignore flag; the chosen default interface might + * not be configured to accept RAs. 
+ */ + defrouter_select_common(ifp, 1); + } else { + memcpy(&new.rtaddr, &dr->rtaddr, sizeof(new.rtaddr)); + new.flags = dr->flags; + new.stateflags = dr->stateflags; + new.stateflags &= ~NDDRF_PROCESSED; + new.rtlifetime = dr->rtlifetime; + new.expire = dr->expire; + new.ifp = dr->ifp; + new.genid = dr->genid; + new.err = dr->err; + NDDR_UNLOCK(dr); + dr = defrtrlist_update_common(&new, FALSE); + if (dr) + NDDR_REMREF(dr); + } } static struct nd_pfxrouter * -pfxrtr_lookup( - struct nd_prefix *pr, - struct nd_defrouter *dr) +pfxrtr_lookup(struct nd_prefix *pr, struct nd_defrouter *dr) { struct nd_pfxrouter *search; - + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); - for (search = pr->ndpr_advrtrs.lh_first; search; search = search->pfr_next) { + NDPR_LOCK_ASSERT_HELD(pr); + + for (search = pr->ndpr_advrtrs.lh_first; search; + search = search->pfr_next) { if (search->router == dr) break; } @@ -876,23 +1825,24 @@ pfxrtr_lookup( } static void -pfxrtr_add( - struct nd_prefix *pr, - struct nd_defrouter *dr) +pfxrtr_add(struct nd_prefix *pr, struct nd_defrouter *dr) { struct nd_pfxrouter *new; lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + NDPR_LOCK_ASSERT_NOTHELD(pr); - new = (struct nd_pfxrouter *)_MALLOC(sizeof(*new), M_IP6NDP, M_NOWAIT); + new = zalloc(ndprtr_zone); if (new == NULL) return; bzero(new, sizeof(*new)); new->router = dr; + NDPR_LOCK(pr); LIST_INSERT_HEAD(&pr->ndpr_advrtrs, new, pfr_entry); - - pfxlist_onlink_check(1); + NDPR_UNLOCK(pr); + + pfxlist_onlink_check(); } static void @@ -901,65 +1851,32 @@ pfxrtr_del( { lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); LIST_REMOVE(pfr, pfr_entry); - FREE(pfr, M_IP6NDP); + zfree(ndprtr_zone, pfr); } struct nd_prefix * -nd6_prefix_lookup( - struct nd_prefix *pr) +nd6_prefix_lookup(struct nd_prefix *pr) { struct nd_prefix *search; lck_mtx_lock(nd6_mutex); for (search = nd_prefix.lh_first; search; search = search->ndpr_next) { + NDPR_LOCK(search); if (pr->ndpr_ifp == search->ndpr_ifp && pr->ndpr_plen == 
search->ndpr_plen && in6_are_prefix_equal(&pr->ndpr_prefix.sin6_addr, - &search->ndpr_prefix.sin6_addr, - pr->ndpr_plen) - ) { + &search->ndpr_prefix.sin6_addr, pr->ndpr_plen)) { + NDPR_ADDREF_LOCKED(search); + NDPR_UNLOCK(search); break; } + NDPR_UNLOCK(search); } - if (search != NULL) - ndpr_hold(search, TRUE); lck_mtx_unlock(nd6_mutex); return(search); } -void -ndpr_hold(struct nd_prefix *pr, boolean_t locked) -{ - if (!locked) - lck_mtx_lock(nd6_mutex); - - if (pr->ndpr_usecnt < 0) - panic("%s: bad usecnt %d for pr %p\n", __func__, - pr->ndpr_usecnt, pr); - - pr->ndpr_usecnt++; - - if (!locked) - lck_mtx_unlock(nd6_mutex); -} - -void -ndpr_rele(struct nd_prefix *pr, boolean_t locked) -{ - if (!locked) - lck_mtx_lock(nd6_mutex); - - if (pr->ndpr_usecnt <= 0) - panic("%s: bad usecnt %d for pr %p\n", __func__, - pr->ndpr_usecnt, pr); - - pr->ndpr_usecnt--; - - if (!locked) - lck_mtx_unlock(nd6_mutex); -} - static void purge_detached(struct ifnet *ifp) { @@ -969,52 +1886,75 @@ purge_detached(struct ifnet *ifp) lck_mtx_lock(nd6_mutex); - for (pr = nd_prefix.lh_first; pr; pr = pr_next) { + pr = nd_prefix.lh_first; +repeat: + while (pr) { pr_next = pr->ndpr_next; + NDPR_LOCK(pr); if (pr->ndpr_ifp != ifp || IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr) || ((pr->ndpr_stateflags & NDPRF_DETACHED) == 0 && - !LIST_EMPTY(&pr->ndpr_advrtrs))) + !LIST_EMPTY(&pr->ndpr_advrtrs))) { + NDPR_UNLOCK(pr); + pr = pr_next; continue; -repeat: + } + NDPR_UNLOCK(pr); ifnet_lock_shared(ifp); for (ifa = ifp->if_addrlist.tqh_first; ifa; ifa = ifa_next) { ifa_next = ifa->ifa_list.tqe_next; - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; + } ia = (struct in6_ifaddr *)ifa; if ((ia->ia6_flags & IN6_IFF_AUTOCONF) == IN6_IFF_AUTOCONF && ia->ia6_ndpr == pr) { - ifaref(ifa); + IFA_ADDREF_LOCKED(ifa); /* for us */ + IFA_UNLOCK(ifa); /* * Purging the address requires writer access * to the address list, so 
drop the ifnet lock * now and repeat from beginning. */ ifnet_lock_done(ifp); - in6_purgeaddr(ifa, 1); - ifafree(ifa); + lck_mtx_unlock(nd6_mutex); + in6_purgeaddr(ifa); + lck_mtx_lock(nd6_mutex); + IFA_REMREF(ifa); /* drop ours */ + pr = nd_prefix.lh_first; goto repeat; } + IFA_UNLOCK(ifa); } ifnet_lock_done(ifp); - if (pr->ndpr_refcnt == 0) - prelist_remove(pr, 1); + NDPR_LOCK(pr); + if (pr->ndpr_addrcnt == 0) { + NDPR_ADDREF_LOCKED(pr); + prelist_remove(pr); + NDPR_UNLOCK(pr); + NDPR_REMREF(pr); + } else { + NDPR_UNLOCK(pr); + } + pr = pr_next; } lck_mtx_unlock(nd6_mutex); } int -nd6_prelist_add( - struct nd_prefix *pr, - struct nd_defrouter *dr, - struct nd_prefix **newp) +nd6_prelist_add(struct nd_prefix *pr, struct nd_defrouter *dr, + struct nd_prefix **newp, boolean_t force_scoped) { struct nd_prefix *new = NULL; struct ifnet *ifp = pr->ndpr_ifp; struct nd_ifinfo *ndi = NULL; - int i; + int i, error; + struct timeval timenow; + + getmicrotime(&timenow); if (ip6_maxifprefixes >= 0) { lck_rw_lock_shared(nd_if_rwlock); @@ -1041,15 +1981,32 @@ nd6_prelist_add( lck_rw_done(nd_if_rwlock); } - new = (struct nd_prefix *)_MALLOC(sizeof(*new), M_IP6NDP, M_NOWAIT); + new = ndpr_alloc(M_WAITOK); if (new == NULL) return ENOMEM; - bzero(new, sizeof(*new)); - *new = *pr; - if (newp != NULL) - *newp = new; - /* initilization */ + NDPR_LOCK(new); + NDPR_LOCK(pr); + new->ndpr_ifp = pr->ndpr_ifp; + new->ndpr_prefix = pr->ndpr_prefix; + new->ndpr_plen = pr->ndpr_plen; + new->ndpr_vltime = pr->ndpr_vltime; + new->ndpr_pltime = pr->ndpr_pltime; + new->ndpr_flags = pr->ndpr_flags; + if (pr->ndpr_stateflags & NDPRF_STATIC) + new->ndpr_stateflags |= NDPRF_STATIC; + NDPR_UNLOCK(pr); + if ((error = in6_init_prefix_ltimes(new)) != 0) { + NDPR_UNLOCK(new); + ndpr_free(new); + return(error); + } + new->ndpr_lastupdate = timenow.tv_sec; + if (newp != NULL) { + *newp = new; + NDPR_ADDREF_LOCKED(new); /* for caller */ + } + /* initialization */ LIST_INIT(&new->ndpr_advrtrs); 
in6_prefixlen2mask(&new->ndpr_mask, new->ndpr_plen); /* make prefix in the canonical form */ @@ -1057,22 +2014,25 @@ nd6_prelist_add( new->ndpr_prefix.sin6_addr.s6_addr32[i] &= new->ndpr_mask.s6_addr32[i]; - /* link ndpr_entry to nd_prefix list */ + NDPR_UNLOCK(new); + lck_mtx_lock(nd6_mutex); + /* link ndpr_entry to nd_prefix list */ LIST_INSERT_HEAD(&nd_prefix, new, ndpr_entry); - - new->ndpr_usecnt = 0; - ndpr_hold(new, TRUE); + new->ndpr_debug |= IFD_ATTACHED; + NDPR_ADDREF(new); /* for nd_prefix list */ /* ND_OPT_PI_FLAG_ONLINK processing */ if (new->ndpr_raf_onlink) { int e; - if ((e = nd6_prefix_onlink(new, 0, 1)) != 0) { + if ((e = nd6_prefix_onlink_common(new, force_scoped, + new->ndpr_ifp->if_index)) != 0) { nd6log((LOG_ERR, "nd6_prelist_add: failed to make " - "the prefix %s/%d on-link on %s (errno=%d)\n", - ip6_sprintf(&pr->ndpr_prefix.sin6_addr), - pr->ndpr_plen, if_name(ifp), e)); + "the prefix %s/%d on-link %s on %s (errno=%d)\n", + ip6_sprintf(&new->ndpr_prefix.sin6_addr), + new->ndpr_plen, force_scoped ? "scoped" : + "non-scoped", if_name(ifp), e)); /* proceed anyway. XXX: is it correct? */ } } @@ -1088,7 +2048,7 @@ nd6_prelist_add( * isn't necessary since the array never shrinks. */ ndi = &nd_ifinfo[ifp->if_index]; - ndi->nprefixes++; + atomic_add_32(&ndi->nprefixes, 1); lck_rw_done(nd_if_rwlock); lck_mtx_unlock(nd6_mutex); @@ -1096,54 +2056,63 @@ nd6_prelist_add( return 0; } +/* + * Caller must have held an extra reference on nd_prefix. + */ void -prelist_remove( - struct nd_prefix *pr, int nd6locked) +prelist_remove(struct nd_prefix *pr) { struct nd_pfxrouter *pfr, *next; struct ifnet *ifp = pr->ndpr_ifp; int e; + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + NDPR_LOCK_ASSERT_HELD(pr); + /* make sure to invalidate the prefix until it is really freed. */ pr->ndpr_vltime = 0; pr->ndpr_pltime = 0; -#if 0 + /* * Though these flags are now meaningless, we'd rather keep the value - * not to confuse users when executing "ndp -p". 
+ * of pr->ndpr_raf_onlink and pr->ndpr_raf_auto not to confuse users + * when executing "ndp -p". */ - pr->ndpr_raf_onlink = 0; - pr->ndpr_raf_auto = 0; -#endif - if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0 && - (e = nd6_prefix_offlink(pr)) != 0) { - nd6log((LOG_ERR, "prelist_remove: failed to make %s/%d offlink " - "on %s, errno=%d\n", - ip6_sprintf(&pr->ndpr_prefix.sin6_addr), - pr->ndpr_plen, if_name(ifp), e)); - /* what should we do? */ - } - if (nd6locked == 0) + if ((pr->ndpr_stateflags & NDPRF_ONLINK)) { + NDPR_ADDREF_LOCKED(pr); + NDPR_UNLOCK(pr); + lck_mtx_unlock(nd6_mutex); + if ((e = nd6_prefix_offlink(pr)) != 0) { + nd6log((LOG_ERR, "prelist_remove: failed to make " + "%s/%d offlink on %s, errno=%d\n", + ip6_sprintf(&pr->ndpr_prefix.sin6_addr), + pr->ndpr_plen, if_name(ifp), e)); + /* what should we do? */ + } lck_mtx_lock(nd6_mutex); + NDPR_LOCK(pr); + if (NDPR_REMREF_LOCKED(pr) == NULL) + return; + } - if (pr->ndpr_usecnt > 0 || pr->ndpr_refcnt > 0) - goto done; /* notice here? */ + if (pr->ndpr_addrcnt > 0) + return; /* notice here? 
*/ /* unlink ndpr_entry from nd_prefix list */ LIST_REMOVE(pr, ndpr_entry); + pr->ndpr_debug &= ~IFD_ATTACHED; /* free list of routers that adversed the prefix */ for (pfr = pr->ndpr_advrtrs.lh_first; pfr; pfr = next) { next = pfr->pfr_next; - - FREE(pfr, M_IP6NDP); + pfxrtr_del(pfr); } lck_rw_lock_shared(nd_if_rwlock); if (ifp->if_index < nd_ifinfo_indexlim) { struct nd_ifinfo *ndi = &nd_ifinfo[ifp->if_index]; - ndi->nprefixes--; + atomic_add_32(&ndi->nprefixes, -1); if (ndi->nprefixes < 0) { log(LOG_WARNING, "prelist_remove: negative " "count on %s\n", if_name(ifp)); @@ -1151,19 +2120,21 @@ prelist_remove( } lck_rw_done(nd_if_rwlock); - FREE(pr, M_IP6NDP); + /* This must not be the last reference to the nd_prefix */ + if (NDPR_REMREF_LOCKED(pr) == NULL) { + panic("%s: unexpected (missing) refcnt ndpr=%p", __func__, pr); + /* NOTREACHED */ + } - pfxlist_onlink_check(1); -done: - if (nd6locked == 0) - lck_mtx_unlock(nd6_mutex); + pfxlist_onlink_check(); } int prelist_update( struct nd_prefix *new, struct nd_defrouter *dr, /* may be NULL */ - struct mbuf *m) + struct mbuf *m, + int mcast) { struct in6_ifaddr *ia6 = NULL, *ia6_match = NULL; struct ifaddr *ifa; @@ -1175,6 +2146,9 @@ prelist_update( struct in6_addrlifetime lt6_tmp; struct timeval timenow; + /* no need to lock "new" here, as it is local to the caller */ + NDPR_LOCK_ASSERT_NOTHELD(new); + auth = 0; if (m) { /* @@ -1199,6 +2173,8 @@ prelist_update( * and the autonomous (A) bit should NOT be changed from 1 * to 0. 
*/ + lck_mtx_lock(nd6_mutex); + NDPR_LOCK(pr); if (new->ndpr_raf_onlink == 1) pr->ndpr_raf_onlink = 1; if (new->ndpr_raf_auto == 1) @@ -1214,7 +2190,8 @@ prelist_update( (pr->ndpr_stateflags & NDPRF_ONLINK) == 0) { int e; - if ((e = nd6_prefix_onlink(pr, 0, 0)) != 0) { + NDPR_UNLOCK(pr); + if ((e = nd6_prefix_onlink(pr)) != 0) { nd6log((LOG_ERR, "prelist_update: failed to make " "the prefix %s/%d on-link on %s " @@ -1223,11 +2200,15 @@ prelist_update( pr->ndpr_plen, if_name(pr->ndpr_ifp), e)); /* proceed anyway. XXX: is it correct? */ } + NDPR_LOCK(pr); } - - lck_mtx_lock(nd6_mutex); - if (dr && pfxrtr_lookup(pr, dr) == NULL) + + if (dr && pfxrtr_lookup(pr, dr) == NULL) { + NDPR_UNLOCK(pr); pfxrtr_add(pr, dr); + } else { + NDPR_UNLOCK(pr); + } lck_mtx_unlock(nd6_mutex); } else { struct nd_prefix *newpr = NULL; @@ -1241,7 +2222,7 @@ prelist_update( bzero(&new->ndpr_addr, sizeof(struct in6_addr)); - error = nd6_prelist_add(new, dr, &newpr); + error = nd6_prelist_add(new, dr, &newpr, FALSE); if (error != 0 || newpr == NULL) { nd6log((LOG_NOTICE, "prelist_update: " "nd6_prelist_add failed for %s/%d on %s " @@ -1256,9 +2237,10 @@ prelist_update( * XXX: from the ND point of view, we can ignore a prefix * with the on-link bit being zero. However, we need a * prefix structure for references from autoconfigured - * addresses. Thus, we explicitly make suret that the prefix + * addresses. Thus, we explicitly make sure that the prefix * itself expires now. */ + NDPR_LOCK(newpr); if (newpr->ndpr_raf_onlink == 0) { newpr->ndpr_vltime = 0; newpr->ndpr_pltime = 0; @@ -1266,6 +2248,7 @@ prelist_update( } pr = newpr; + NDPR_UNLOCK(newpr); } /* @@ -1282,84 +2265,109 @@ prelist_update( * nd6_ra_input. */ + /* 5.5.3 (c). Consistency check on lifetimes: pltime <= vltime. */ + if (new->ndpr_pltime > new->ndpr_vltime) { + error = EINVAL; /* XXX: won't be used */ + goto end; + } + /* - * 5.5.3 (c). Consistency check on lifetimes: pltime <= vltime. 
- * This should have been done in nd6_ra_input. + * 5.5.3 (d). If the prefix advertised is not equal to the prefix of + * an address configured by stateless autoconfiguration already in the + * list of addresses associated with the interface, and the Valid + * Lifetime is not 0, form an address. We first check if we have + * a matching prefix. + * Note: we apply a clarification in rfc2462bis-02 here. We only + * consider autoconfigured addresses while RFC2462 simply said + * "address". */ - /* - * 5.5.3 (d). If the prefix advertised does not match the prefix of an - * address already in the list, and the Valid Lifetime is not 0, - * form an address. Note that even a manually configured address - * should reject autoconfiguration of a new address. - */ - getmicrotime(&timenow); + getmicrotime(&timenow); - ifnet_lock_exclusive(ifp); + ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { struct in6_ifaddr *ifa6; - int ifa_plen; - u_int32_t storedlifetime; + u_int32_t remaininglifetime; - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; - + } ifa6 = (struct in6_ifaddr *)ifa; + /* + * We only consider autoconfigured addresses as per rfc2462bis. + */ + if (!(ifa6->ia6_flags & IN6_IFF_AUTOCONF)) { + IFA_UNLOCK(ifa); + continue; + } /* * Spec is not clear here, but I believe we should concentrate * on unicast (i.e. not anycast) addresses. * XXX: other ia6_flags? detached or duplicated? */ - if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0) + if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0) { + IFA_UNLOCK(ifa); continue; - - ifa_plen = in6_mask2len(&ifa6->ia_prefixmask.sin6_addr, NULL); - if (ifa_plen != new->ndpr_plen || - !in6_are_prefix_equal(&ifa6->ia_addr.sin6_addr, - &new->ndpr_prefix.sin6_addr, - ifa_plen)) + } + /* + * Ignore the address if it is not associated with a prefix + * or is associated with a prefix that is different from this + * one. 
(pr is never NULL here) + */ + if (ifa6->ia6_ndpr != pr) { + IFA_UNLOCK(ifa); continue; + } - if (ia6_match == NULL) /* remember the first one */ + if (ia6_match == NULL) { /* remember the first one */ ia6_match = ifa6; - - if ((ifa6->ia6_flags & IN6_IFF_AUTOCONF) == 0) - continue; + IFA_ADDREF_LOCKED(ifa); /* for ia6_match */ + } /* * An already autoconfigured address matched. Now that we * are sure there is at least one matched address, we can * proceed to 5.5.3. (e): update the lifetimes according to the * "two hours" rule and the privacy extension. + * We apply some clarifications in rfc2462bis: + * - use remaininglifetime instead of storedlifetime as a + * variable name + * - remove the dead code in the "two-hour" rule */ #define TWOHOUR (120*60) lt6_tmp = ifa6->ia6_lifetime; - storedlifetime = IFA6_IS_INVALID(ifa6) ? 0 : - (lt6_tmp.ia6t_expire - timenow.tv_sec); + if (lt6_tmp.ia6t_vltime == ND6_INFINITE_LIFETIME) + remaininglifetime = ND6_INFINITE_LIFETIME; + else if (timenow.tv_sec - ifa6->ia6_updatetime > + lt6_tmp.ia6t_vltime) { + /* + * The case of "invalid" address. We should usually + * not see this case. + */ + remaininglifetime = 0; + } else + remaininglifetime = lt6_tmp.ia6t_vltime - + (timenow.tv_sec - ifa6->ia6_updatetime); + + /* when not updating, keep the current stored lifetime. */ + lt6_tmp.ia6t_vltime = remaininglifetime; if (TWOHOUR < new->ndpr_vltime || - storedlifetime < new->ndpr_vltime) { + remaininglifetime < new->ndpr_vltime) { lt6_tmp.ia6t_vltime = new->ndpr_vltime; - } else if (storedlifetime <= TWOHOUR -#if 0 - /* - * This condition is logically redundant, so we just - * omit it. - * See IPng 6712, 6717, and 6721. 
- */ - && new->ndpr_vltime <= storedlifetime -#endif - ) { + } else if (remaininglifetime <= TWOHOUR) { if (auth) { lt6_tmp.ia6t_vltime = new->ndpr_vltime; } } else { /* * new->ndpr_vltime <= TWOHOUR && - * TWOHOUR < storedlifetime + * TWOHOUR < remaininglifetime */ lt6_tmp.ia6t_vltime = TWOHOUR; } @@ -1367,57 +2375,108 @@ prelist_update( /* The 2 hour rule is not imposed for preferred lifetime. */ lt6_tmp.ia6t_pltime = new->ndpr_pltime; - in6_init_address_ltimes(pr, &lt6_tmp); - - /* - * When adjusting the lifetimes of an existing temporary - * address, only lower the lifetimes. - * RFC 3041 3.3. (1). - * XXX: how should we modify ia6t_[pv]ltime? - */ + /* Special handling for lifetimes of temporary addresses. */ if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0) { - if (lt6_tmp.ia6t_expire == 0 || /* no expire */ - lt6_tmp.ia6t_expire > - ifa6->ia6_lifetime.ia6t_expire) { - lt6_tmp.ia6t_expire = - ifa6->ia6_lifetime.ia6t_expire; - } - if (lt6_tmp.ia6t_preferred == 0 || /* no expire */ - lt6_tmp.ia6t_preferred > - ifa6->ia6_lifetime.ia6t_preferred) { - lt6_tmp.ia6t_preferred = - ifa6->ia6_lifetime.ia6t_preferred; - } + u_int32_t maxvltime, maxpltime; + + /* Constrain lifetimes to system limits. */ + if (lt6_tmp.ia6t_vltime > ip6_temp_valid_lifetime) + lt6_tmp.ia6t_vltime = ip6_temp_valid_lifetime; + if (lt6_tmp.ia6t_pltime > ip6_temp_preferred_lifetime) + lt6_tmp.ia6t_pltime = + ip6_temp_preferred_lifetime - + ip6_desync_factor; + + /* + * According to RFC 4941, section 3.3 (1), we only + * update the lifetimes when they are in the maximum + * intervals. 
+ */ + if (ip6_temp_valid_lifetime > + (u_int32_t)((timenow.tv_sec - ifa6->ia6_createtime) + + ip6_desync_factor)) { + maxvltime = ip6_temp_valid_lifetime - + (timenow.tv_sec - ifa6->ia6_createtime) - + ip6_desync_factor; + } else + maxvltime = 0; + if (ip6_temp_preferred_lifetime > + (u_int32_t)((timenow.tv_sec - ifa6->ia6_createtime) + + ip6_desync_factor)) { + maxpltime = ip6_temp_preferred_lifetime - + (timenow.tv_sec - ifa6->ia6_createtime) - + ip6_desync_factor; + } else + maxpltime = 0; + + if (lt6_tmp.ia6t_vltime > maxvltime) + lt6_tmp.ia6t_vltime = maxvltime; + if (lt6_tmp.ia6t_pltime > maxpltime) + lt6_tmp.ia6t_pltime = maxpltime; } + in6_init_address_ltimes(pr, <6_tmp, + !!(ifa6->ia6_flags & IN6_IFF_TEMPORARY)); + ifa6->ia6_lifetime = lt6_tmp; + ifa6->ia6_updatetime = timenow.tv_sec; + IFA_UNLOCK(ifa); } ifnet_lock_done(ifp); if (ia6_match == NULL && new->ndpr_vltime) { + int ifidlen; + /* + * 5.5.3 (d) (continued) * No address matched and the valid lifetime is non-zero. * Create a new address. */ - if ((ia6 = in6_ifadd(new, NULL)) != NULL) { + + /* + * Prefix Length check: + * If the sum of the prefix length and interface identifier + * length does not equal 128 bits, the Prefix Information + * option MUST be ignored. The length of the interface + * identifier is defined in a separate link-type specific + * document. + */ + ifidlen = in6_if2idlen(ifp); + if (ifidlen < 0) { + /* this should not happen, so we always log it. */ + log(LOG_ERR, "prelist_update: IFID undefined (%s)\n", + if_name(ifp)); + goto end; + } + NDPR_LOCK(pr); + if (ifidlen + pr->ndpr_plen != 128) { + nd6log((LOG_INFO, + "prelist_update: invalid prefixlen " + "%d for %s, ignored\n", + pr->ndpr_plen, if_name(ifp))); + NDPR_UNLOCK(pr); + goto end; + } + NDPR_UNLOCK(pr); + + if ((ia6 = in6_ifadd(new, mcast)) != NULL) { /* * note that we should use pr (not new) for reference. 
*/ - lck_mtx_lock(nd6_mutex); - pr->ndpr_refcnt++; - lck_mtx_unlock(nd6_mutex); + IFA_LOCK(&ia6->ia_ifa); + NDPR_LOCK(pr); ia6->ia6_ndpr = pr; - -#if 0 - /* XXXYYY Don't do this, according to Jinmei. */ - pr->ndpr_addr = new->ndpr_addr; -#endif + NDPR_ADDREF_LOCKED(pr); /* for addr reference */ + pr->ndpr_addrcnt++; + VERIFY(pr->ndpr_addrcnt != 0); + NDPR_UNLOCK(pr); + IFA_UNLOCK(&ia6->ia_ifa); /* - * RFC 3041 3.3 (2). + * RFC 4941 3.3 (2). * When a new public address is created as described * in RFC2462, also create a new temporary address. * - * RFC 3041 3.5. + * RFC 4941 3.5. * When an interface connects to a new link, a new * randomized interface identifier should be generated * immediately together with a new set of temporary @@ -1426,35 +2485,264 @@ prelist_update( */ if (ip6_use_tempaddr) { int e; - if ((e = in6_tmpifadd(ia6, 1, M_NOWAIT)) != 0) { + if ((e = in6_tmpifadd(ia6, 1, M_WAITOK)) != 0) { nd6log((LOG_NOTICE, "prelist_update: " "failed to create a temporary " "address, errno=%d\n", e)); } } - ifafree(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); ia6 = NULL; - /* - * A newly added address might affect the status - * of other addresses, so we check and update it. - * XXX: what if address duplication happens? - */ - pfxlist_onlink_check(0); - } else { - /* just set an error. do not bark here. */ - error = EADDRNOTAVAIL; /* XXX: might be unused. */ - } + /* + * A newly added address might affect the status + * of other addresses, so we check and update it. + * XXX: what if address duplication happens? + */ + lck_mtx_lock(nd6_mutex); + pfxlist_onlink_check(); + lck_mtx_unlock(nd6_mutex); + } else { + /* just set an error. do not bark here. */ + error = EADDRNOTAVAIL; /* XXX: might be unused. */ + } + } + +afteraddrconf: + +end: + if (pr != NULL) + NDPR_REMREF(pr); + if (ia6_match != NULL) + IFA_REMREF(&ia6_match->ia_ifa); + return error; +} + +/* + * Neighbor Discover Default Router structure reference counting routines. 
+ */ +static struct nd_defrouter * +nddr_alloc(int how) +{ + struct nd_defrouter *dr; + + dr = (how == M_WAITOK) ? zalloc(nddr_zone) : zalloc_noblock(nddr_zone); + if (dr != NULL) { + bzero(dr, nddr_size); + lck_mtx_init(&dr->nddr_lock, ifa_mtx_grp, ifa_mtx_attr); + dr->nddr_debug |= IFD_ALLOC; + if (nddr_debug != 0) { + dr->nddr_debug |= IFD_DEBUG; + dr->nddr_trace = nddr_trace; + } + } + return (dr); +} + +static void +nddr_free(struct nd_defrouter *dr) +{ + NDDR_LOCK(dr); + if (dr->nddr_debug & IFD_ATTACHED) { + panic("%s: attached nddr %p is being freed", __func__, dr); + /* NOTREACHED */ + } else if (!(dr->nddr_debug & IFD_ALLOC)) { + panic("%s: nddr %p cannot be freed", __func__, dr); + /* NOTREACHED */ + } + dr->nddr_debug &= ~IFD_ALLOC; + NDDR_UNLOCK(dr); + + lck_mtx_destroy(&dr->nddr_lock, ifa_mtx_grp); + zfree(nddr_zone, dr); +} + +static void +nddr_trace(struct nd_defrouter *dr, int refhold) +{ + struct nd_defrouter_dbg *dr_dbg = (struct nd_defrouter_dbg *)dr; + ctrace_t *tr; + uint32_t idx; + uint16_t *cnt; + + if (!(dr->nddr_debug & IFD_DEBUG)) { + panic("%s: nddr %p has no debug structure", __func__, dr); + /* NOTREACHED */ + } + if (refhold) { + cnt = &dr_dbg->nddr_refhold_cnt; + tr = dr_dbg->nddr_refhold; + } else { + cnt = &dr_dbg->nddr_refrele_cnt; + tr = dr_dbg->nddr_refrele; + } + + idx = atomic_add_16_ov(cnt, 1) % NDDR_TRACE_HIST_SIZE; + ctrace_record(&tr[idx]); +} + +void +nddr_addref(struct nd_defrouter *nddr, int locked) +{ + + if (!locked) + NDDR_LOCK_SPIN(nddr); + else + NDDR_LOCK_ASSERT_HELD(nddr); + + if (++nddr->nddr_refcount == 0) { + panic("%s: nddr %p wraparound refcnt\n", __func__, nddr); + /* NOTREACHED */ + } else if (nddr->nddr_trace != NULL) { + (*nddr->nddr_trace)(nddr, TRUE); + } + + if (!locked) + NDDR_UNLOCK(nddr); +} + +struct nd_defrouter * +nddr_remref(struct nd_defrouter *nddr, int locked) +{ + + if (!locked) + NDDR_LOCK_SPIN(nddr); + else + NDDR_LOCK_ASSERT_HELD(nddr); + + if (nddr->nddr_refcount == 0) { + panic("%s: 
nddr %p negative refcnt\n", __func__, nddr); + /* NOTREACHED */ + } else if (nddr->nddr_trace != NULL) { + (*nddr->nddr_trace)(nddr, FALSE); + } + + if (--nddr->nddr_refcount == 0) { + NDDR_UNLOCK(nddr); + nddr_free(nddr); + nddr = NULL; + } + + if (!locked && nddr != NULL) + NDDR_UNLOCK(nddr); + + return (nddr); +} + +/* + * Neighbor Discover Prefix structure reference counting routines. + */ +static struct nd_prefix * +ndpr_alloc(int how) +{ + struct nd_prefix *pr; + + pr = (how == M_WAITOK) ? zalloc(ndpr_zone) : zalloc_noblock(ndpr_zone); + if (pr != NULL) { + bzero(pr, ndpr_size); + lck_mtx_init(&pr->ndpr_lock, ifa_mtx_grp, ifa_mtx_attr); + pr->ndpr_debug |= IFD_ALLOC; + if (ndpr_debug != 0) { + pr->ndpr_debug |= IFD_DEBUG; + pr->ndpr_trace = ndpr_trace; + } + } + return (pr); +} + +static void +ndpr_free(struct nd_prefix *pr) +{ + NDPR_LOCK(pr); + if (pr->ndpr_debug & IFD_ATTACHED) { + panic("%s: attached ndpr %p is being freed", __func__, pr); + /* NOTREACHED */ + } else if (!(pr->ndpr_debug & IFD_ALLOC)) { + panic("%s: ndpr %p cannot be freed", __func__, pr); + /* NOTREACHED */ + } + pr->ndpr_debug &= ~IFD_ALLOC; + NDPR_UNLOCK(pr); + + lck_mtx_destroy(&pr->ndpr_lock, ifa_mtx_grp); + zfree(ndpr_zone, pr); +} + +static void +ndpr_trace(struct nd_prefix *pr, int refhold) +{ + struct nd_prefix_dbg *pr_dbg = (struct nd_prefix_dbg *)pr; + ctrace_t *tr; + u_int32_t idx; + u_int16_t *cnt; + + if (!(pr->ndpr_debug & IFD_DEBUG)) { + panic("%s: ndpr %p has no debug structure", __func__, pr); + /* NOTREACHED */ + } + if (refhold) { + cnt = &pr_dbg->ndpr_refhold_cnt; + tr = pr_dbg->ndpr_refhold; + } else { + cnt = &pr_dbg->ndpr_refrele_cnt; + tr = pr_dbg->ndpr_refrele; + } + + idx = atomic_add_16_ov(cnt, 1) % NDPR_TRACE_HIST_SIZE; + ctrace_record(&tr[idx]); +} + +void +ndpr_addref(struct nd_prefix *ndpr, int locked) +{ + if (!locked) + NDPR_LOCK_SPIN(ndpr); + else + NDPR_LOCK_ASSERT_HELD(ndpr); + + if (++ndpr->ndpr_refcount == 0) { + panic("%s: ndpr %p wraparound 
refcnt\n", __func__, ndpr); + /* NOTREACHED */ + } else if (ndpr->ndpr_trace != NULL) { + (*ndpr->ndpr_trace)(ndpr, TRUE); + } + + if (!locked) + NDPR_UNLOCK(ndpr); +} + +struct nd_prefix * +ndpr_remref(struct nd_prefix *ndpr, int locked) +{ + if (!locked) + NDPR_LOCK_SPIN(ndpr); + else + NDPR_LOCK_ASSERT_HELD(ndpr); + + if (ndpr->ndpr_refcount == 0) { + panic("%s: ndpr %p negative refcnt\n", __func__, ndpr); + /* NOTREACHED */ + } else if (ndpr->ndpr_trace != NULL) { + (*ndpr->ndpr_trace)(ndpr, FALSE); } -afteraddrconf: + if (--ndpr->ndpr_refcount == 0) { + if (ndpr->ndpr_addrcnt != 0) { + panic("%s: freeing ndpr %p with outstanding address " + "reference (%d)", __func__, ndpr, + ndpr->ndpr_addrcnt); + /* NOTREACHED */ + } + NDPR_UNLOCK(ndpr); + ndpr_free(ndpr); + ndpr = NULL; + } -end: - if (pr != NULL) - ndpr_rele(pr, FALSE); + if (!locked && ndpr != NULL) + NDPR_UNLOCK(ndpr); - return error; + return (ndpr); } /* @@ -1463,17 +2751,19 @@ prelist_update( * XXX: lengthy function name... */ static struct nd_pfxrouter * -find_pfxlist_reachable_router( - struct nd_prefix *pr) +find_pfxlist_reachable_router(struct nd_prefix *pr) { struct nd_pfxrouter *pfxrtr; struct rtentry *rt; struct llinfo_nd6 *ln; lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + NDPR_LOCK_ASSERT_HELD(pr); for (pfxrtr = LIST_FIRST(&pr->ndpr_advrtrs); pfxrtr; pfxrtr = LIST_NEXT(pfxrtr, pfr_entry)) { + NDPR_UNLOCK(pr); + lck_mtx_unlock(nd6_mutex); /* Callee returns a locked route upon success */ if ((rt = nd6_lookup(&pfxrtr->router->rtaddr, 0, pfxrtr->router->ifp, 0)) != NULL) { @@ -1482,14 +2772,19 @@ find_pfxlist_reachable_router( ND6_IS_LLINFO_PROBREACH(ln)) { RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); + lck_mtx_lock(nd6_mutex); + NDPR_LOCK(pr); break; /* found */ } RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); } + lck_mtx_lock(nd6_mutex); + NDPR_LOCK(pr); } + NDPR_LOCK_ASSERT_HELD(pr); - return(pfxrtr); + return (pfxrtr); } @@ -1507,61 +2802,150 @@ find_pfxlist_reachable_router( * is no router around us. 
*/ void -pfxlist_onlink_check(int nd6locked) +pfxlist_onlink_check(void) { - struct nd_prefix *pr; + struct nd_prefix *pr, *prclear; struct in6_ifaddr *ifa; + struct nd_defrouter *dr; + struct nd_pfxrouter *pfxrtr = NULL; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + + while (nd_prefix_busy) { + nd_prefix_waiters++; + msleep(nd_prefix_waitchan, nd6_mutex, (PZERO-1), + __func__, NULL); + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + } + nd_prefix_busy = TRUE; /* * Check if there is a prefix that has a reachable advertising * router. */ - if (nd6locked == 0) - lck_mtx_lock(nd6_mutex); - lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); - for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { - if (pr->ndpr_raf_onlink && find_pfxlist_reachable_router(pr)) + pr = nd_prefix.lh_first; + while (pr) { + NDPR_LOCK(pr); + if (pr->ndpr_stateflags & NDPRF_PROCESSED) { + NDPR_UNLOCK(pr); + pr = pr->ndpr_next; + continue; + } + NDPR_ADDREF_LOCKED(pr); + if (pr->ndpr_raf_onlink && find_pfxlist_reachable_router(pr) && + (pr->ndpr_debug & IFD_ATTACHED)) { + NDPR_UNLOCK(pr); + NDPR_REMREF(pr); break; + } + pr->ndpr_stateflags |= NDPRF_PROCESSED; + NDPR_UNLOCK(pr); + NDPR_REMREF(pr); + /* + * Since find_pfxlist_reachable_router() drops the nd6_mutex, we + * have to start over, but the NDPRF_PROCESSED flag will stop + * us from checking the same prefix twice. + */ + pr = nd_prefix.lh_first; + } + LIST_FOREACH(prclear, &nd_prefix, ndpr_entry) { + NDPR_LOCK(prclear); + prclear->ndpr_stateflags &= ~NDPRF_PROCESSED; + NDPR_UNLOCK(prclear); } - if (pr) { + /* + * If we have no such prefix, check whether we still have a router + * that does not advertise any prefixes. 
+ */ + if (pr == NULL) { + for (dr = TAILQ_FIRST(&nd_defrouter); dr; + dr = TAILQ_NEXT(dr, dr_entry)) { + struct nd_prefix *pr0; + + for (pr0 = nd_prefix.lh_first; pr0; + pr0 = pr0->ndpr_next) { + NDPR_LOCK(pr0); + if ((pfxrtr = pfxrtr_lookup(pr0, dr)) != NULL) { + NDPR_UNLOCK(pr0); + break; + } + NDPR_UNLOCK(pr0); + } + if (pfxrtr != NULL) + break; + } + } + if (pr != NULL || (TAILQ_FIRST(&nd_defrouter) && pfxrtr == NULL)) { /* - * There is at least one prefix that has a reachable router. + * There is at least one prefix that has a reachable router, + * or at least a router which probably does not advertise + * any prefixes. The latter would be the case when we move + * to a new link where we have a router that does not provide + * prefixes and we configure an address by hand. * Detach prefixes which have no reachable advertising * router, and attach other prefixes. */ - for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { - /* XXX: a link-local prefix should never be detached */ - if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) - continue; - + pr = nd_prefix.lh_first; + while (pr) { + NDPR_LOCK(pr); /* - * we aren't interested in prefixes without the L bit - * set. 
+ * We aren't interested prefixes already processed, + * nor in prefixes without the L bit + * set nor in static prefixes */ - if (pr->ndpr_raf_onlink == 0) + if (pr->ndpr_raf_onlink == 0 || + pr->ndpr_stateflags & NDPRF_PROCESSED || + pr->ndpr_stateflags & NDPRF_STATIC) { + NDPR_UNLOCK(pr); + pr = pr->ndpr_next; continue; - + } + NDPR_ADDREF_LOCKED(pr); if ((pr->ndpr_stateflags & NDPRF_DETACHED) == 0 && - find_pfxlist_reachable_router(pr) == NULL) + find_pfxlist_reachable_router(pr) == NULL && + (pr->ndpr_debug & IFD_ATTACHED)) pr->ndpr_stateflags |= NDPRF_DETACHED; if ((pr->ndpr_stateflags & NDPRF_DETACHED) != 0 && - find_pfxlist_reachable_router(pr) != 0) + find_pfxlist_reachable_router(pr) != NULL && + (pr->ndpr_debug & IFD_ATTACHED)) pr->ndpr_stateflags &= ~NDPRF_DETACHED; + pr->ndpr_stateflags |= NDPRF_PROCESSED; + NDPR_UNLOCK(pr); + NDPR_REMREF(pr); + /* + * Since find_pfxlist_reachable_router() drops the + * nd6_mutex, we have to start over, but the + * NDPRF_PROCESSED flag will stop us from checking + * the same prefix twice. + */ + pr = nd_prefix.lh_first; } } else { /* there is no prefix that has a reachable router */ for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { - if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) + NDPR_LOCK(pr); + if (pr->ndpr_raf_onlink == 0 || + pr->ndpr_stateflags & NDPRF_STATIC) { + NDPR_UNLOCK(pr); continue; - - if (pr->ndpr_raf_onlink == 0) - continue; - + } if ((pr->ndpr_stateflags & NDPRF_DETACHED) != 0) pr->ndpr_stateflags &= ~NDPRF_DETACHED; + NDPR_UNLOCK(pr); } } + LIST_FOREACH(prclear, &nd_prefix, ndpr_entry) { + NDPR_LOCK(prclear); + prclear->ndpr_stateflags &= ~NDPRF_PROCESSED; + NDPR_UNLOCK(prclear); + } + VERIFY(nd_prefix_busy); + nd_prefix_busy = FALSE; + if (nd_prefix_waiters > 0) { + nd_prefix_waiters = 0; + wakeup(nd_prefix_waitchan); + } /* * Remove each interface route associated with a (just) detached @@ -1571,17 +2955,21 @@ pfxlist_onlink_check(int nd6locked) * interfaces. 
Such cases will be handled in nd6_prefix_onlink, * so we don't have to care about them. */ - for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { + pr = nd_prefix.lh_first; + while (pr) { int e; - if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) - continue; - - if (pr->ndpr_raf_onlink == 0) + NDPR_LOCK(pr); + if (pr->ndpr_raf_onlink == 0 || + pr->ndpr_stateflags & NDPRF_STATIC) { + NDPR_UNLOCK(pr); + pr = pr->ndpr_next; continue; - + } if ((pr->ndpr_stateflags & NDPRF_DETACHED) != 0 && (pr->ndpr_stateflags & NDPRF_ONLINK) != 0) { + NDPR_UNLOCK(pr); + lck_mtx_unlock(nd6_mutex); if ((e = nd6_prefix_offlink(pr)) != 0) { nd6log((LOG_ERR, "pfxlist_onlink_check: failed to " @@ -1589,18 +2977,25 @@ pfxlist_onlink_check(int nd6locked) ip6_sprintf(&pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, e)); } + lck_mtx_lock(nd6_mutex); + pr = nd_prefix.lh_first; + continue; } if ((pr->ndpr_stateflags & NDPRF_DETACHED) == 0 && (pr->ndpr_stateflags & NDPRF_ONLINK) == 0 && pr->ndpr_raf_onlink) { - if ((e = nd6_prefix_onlink(pr, 0, 1)) != 0) { + NDPR_UNLOCK(pr); + if ((e = nd6_prefix_onlink(pr)) != 0) { nd6log((LOG_ERR, "pfxlist_onlink_check: failed to " "make %s/%d offlink, errno=%d\n", ip6_sprintf(&pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, e)); } + } else { + NDPR_UNLOCK(pr); } + pr = pr->ndpr_next; } /* @@ -1611,118 +3006,298 @@ pfxlist_onlink_check(int nd6locked) * always be attached. * The precise detection logic is same as the one for prefixes. */ + lck_rw_lock_shared(&in6_ifaddr_rwlock); for (ifa = in6_ifaddrs; ifa; ifa = ifa->ia_next) { - if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0) - continue; + struct nd_prefix *ndpr; - if (ifa->ia6_ndpr == NULL) { + IFA_LOCK(&ifa->ia_ifa); + if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0) { + IFA_UNLOCK(&ifa->ia_ifa); + continue; + } + if ((ndpr = ifa->ia6_ndpr) == NULL) { /* * This can happen when we first configure the address * (i.e. the address exists, but the prefix does not). * XXX: complicated relationships... 
*/ + IFA_UNLOCK(&ifa->ia_ifa); continue; } + NDPR_ADDREF(ndpr); + IFA_UNLOCK(&ifa->ia_ifa); - if (find_pfxlist_reachable_router(ifa->ia6_ndpr)) + NDPR_LOCK(ndpr); + if (find_pfxlist_reachable_router(ndpr)) { + NDPR_UNLOCK(ndpr); + NDPR_REMREF(ndpr); break; + } + NDPR_UNLOCK(ndpr); + NDPR_REMREF(ndpr); } if (ifa) { for (ifa = in6_ifaddrs; ifa; ifa = ifa->ia_next) { - if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0) - continue; + struct nd_prefix *ndpr; - if (ifa->ia6_ndpr == NULL) /* XXX: see above. */ + IFA_LOCK(&ifa->ia_ifa); + if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0) { + IFA_UNLOCK(&ifa->ia_ifa); continue; - - if (find_pfxlist_reachable_router(ifa->ia6_ndpr)) - ifa->ia6_flags &= ~IN6_IFF_DETACHED; - else + } + if ((ndpr = ifa->ia6_ndpr) == NULL) { + /* XXX: see above. */ + IFA_UNLOCK(&ifa->ia_ifa); + continue; + } + NDPR_ADDREF(ndpr); + IFA_UNLOCK(&ifa->ia_ifa); + NDPR_LOCK(ndpr); + if (find_pfxlist_reachable_router(ndpr)) { + NDPR_UNLOCK(ndpr); + IFA_LOCK(&ifa->ia_ifa); + if (ifa->ia6_flags & IN6_IFF_DETACHED) { + ifa->ia6_flags &= ~IN6_IFF_DETACHED; + ifa->ia6_flags |= IN6_IFF_TENTATIVE; + IFA_UNLOCK(&ifa->ia_ifa); + nd6_dad_start((struct ifaddr *)ifa, 0); + } else { + IFA_UNLOCK(&ifa->ia_ifa); + } + } else { + NDPR_UNLOCK(ndpr); + IFA_LOCK(&ifa->ia_ifa); ifa->ia6_flags |= IN6_IFF_DETACHED; + IFA_UNLOCK(&ifa->ia_ifa); + } + NDPR_REMREF(ndpr); } } else { for (ifa = in6_ifaddrs; ifa; ifa = ifa->ia_next) { - if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0) + IFA_LOCK(&ifa->ia_ifa); + if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0) { + IFA_UNLOCK(&ifa->ia_ifa); continue; + } + if (ifa->ia6_flags & IN6_IFF_DETACHED) { + ifa->ia6_flags &= ~IN6_IFF_DETACHED; + ifa->ia6_flags |= IN6_IFF_TENTATIVE; + IFA_UNLOCK(&ifa->ia_ifa); + /* Do we need a delay in this case? 
*/ + nd6_dad_start((struct ifaddr *)ifa, 0); + } else { + IFA_UNLOCK(&ifa->ia_ifa); + } + } + } + lck_rw_done(&in6_ifaddr_rwlock); +} + +static struct nd_prefix * +nd6_prefix_equal_lookup(struct nd_prefix *pr, boolean_t primary_only) +{ + struct nd_prefix *opr; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + + for (opr = nd_prefix.lh_first; opr; opr = opr->ndpr_next) { + if (opr == pr) + continue; + + NDPR_LOCK(opr); + if ((opr->ndpr_stateflags & NDPRF_ONLINK) == 0) { + NDPR_UNLOCK(opr); + continue; + } + if (opr->ndpr_plen == pr->ndpr_plen && + in6_are_prefix_equal(&pr->ndpr_prefix.sin6_addr, + &opr->ndpr_prefix.sin6_addr, pr->ndpr_plen) && + (!primary_only || + !(opr->ndpr_stateflags & NDPRF_IFSCOPE))) { + NDPR_ADDREF_LOCKED(opr); + NDPR_UNLOCK(opr); + return (opr); + } + NDPR_UNLOCK(opr); + } + return (NULL); +} + +/* + * Synchronize the interface routes of similar prefixes on different + * interfaces; the one using the default interface would be (re)installed + * as a primary/non-scoped entry, and the rest as scoped entri(es). 
+ */ +static void +nd6_prefix_sync(struct ifnet *ifp) +{ + struct nd_prefix *pr, *opr; + int err = 0; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + + if (!ip6_doscopedroute || ifp == NULL) + return; - ifa->ia6_flags &= ~IN6_IFF_DETACHED; + for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { + NDPR_LOCK(pr); + if (!(pr->ndpr_stateflags & NDPRF_ONLINK)) { + NDPR_UNLOCK(pr); + continue; + } + if (pr->ndpr_ifp == ifp && + (pr->ndpr_stateflags & NDPRF_IFSCOPE) && + !IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) { + NDPR_UNLOCK(pr); + break; } + NDPR_UNLOCK(pr); } - if (nd6locked == 0) + + if (pr == NULL) + return; + + /* Remove conflicting entries */ + opr = nd6_prefix_equal_lookup(pr, TRUE); + if (opr != NULL) { lck_mtx_unlock(nd6_mutex); + err = nd6_prefix_offlink(opr); + lck_mtx_lock(nd6_mutex); + if (err != 0) { + nd6log((LOG_ERR, + "%s: failed to make %s/%d offlink on %s, " + "errno=%d\n", __func__, + ip6_sprintf(&opr->ndpr_prefix.sin6_addr), + opr->ndpr_plen, if_name(opr->ndpr_ifp), err)); + } + } else { + nd6log((LOG_ERR, + "%s: scoped %s/%d on %s has no matching unscoped prefix\n", + __func__, ip6_sprintf(&pr->ndpr_prefix.sin6_addr), + pr->ndpr_plen, if_name(pr->ndpr_ifp))); + } + + lck_mtx_unlock(nd6_mutex); + err = nd6_prefix_offlink(pr); + lck_mtx_lock(nd6_mutex); + if (err != 0) { + nd6log((LOG_ERR, + "%s: failed to make %s/%d offlink on %s, errno=%d\n", + __func__, ip6_sprintf(&pr->ndpr_prefix.sin6_addr), + pr->ndpr_plen, if_name(pr->ndpr_ifp), err)); + } + + /* Add the entries back */ + if (opr != NULL) { + err = nd6_prefix_onlink_scoped(opr, opr->ndpr_ifp->if_index); + if (err != 0) { + nd6log((LOG_ERR, + "%s: failed to make %s/%d scoped onlink on %s, " + "errno=%d\n", __func__, + ip6_sprintf(&opr->ndpr_prefix.sin6_addr), + opr->ndpr_plen, if_name(opr->ndpr_ifp), err)); + } + } + + err = nd6_prefix_onlink_scoped(pr, IFSCOPE_NONE); + if (err != 0) { + nd6log((LOG_ERR, + "%s: failed to make %s/%d onlink on %s, errno=%d\n", + __func__, 
ip6_sprintf(&pr->ndpr_prefix.sin6_addr), + pr->ndpr_plen, if_name(pr->ndpr_ifp), err)); + } + + if (err != 0) { + nd6log((LOG_ERR, + "%s: error promoting %s/%d to %s from %s\n", + __func__, ip6_sprintf(&pr->ndpr_prefix.sin6_addr), + pr->ndpr_plen, if_name(pr->ndpr_ifp), + (opr != NULL) ? if_name(opr->ndpr_ifp) : "NONE")); + } else { + nd6log2((LOG_INFO, + "%s: %s/%d promoted, previously on %s\n", + if_name(pr->ndpr_ifp), + ip6_sprintf(&pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, + (opr != NULL) ? if_name(opr->ndpr_ifp) : "NONE")); + } + + if (opr != NULL) + NDPR_REMREF(opr); } -int -nd6_prefix_onlink( - struct nd_prefix *pr, int rtlocked, int nd6locked) +static int +nd6_prefix_onlink_common(struct nd_prefix *pr, boolean_t force_scoped, + unsigned int ifscope) { struct ifaddr *ifa; struct ifnet *ifp = pr->ndpr_ifp; - struct sockaddr_in6 mask6; + struct sockaddr_in6 mask6, prefix; struct nd_prefix *opr; u_int32_t rtflags; int error = 0; struct rtentry *rt = NULL; + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ + NDPR_LOCK(pr); if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0) { nd6log((LOG_ERR, - "nd6_prefix_onlink: %s/%d is already on-link\n", - ip6_sprintf(&pr->ndpr_prefix.sin6_addr), pr->ndpr_plen); - return(EEXIST)); + "nd6_prefix_onlink: %s/%d on %s scoped=%d is already " + "on-link\n", ip6_sprintf(&pr->ndpr_prefix.sin6_addr), + pr->ndpr_plen, if_name(pr->ndpr_ifp), + (pr->ndpr_stateflags & NDPRF_IFSCOPE) ? 1 : 0); + NDPR_UNLOCK(pr); + return (EEXIST)); } + NDPR_UNLOCK(pr); /* * Add the interface route associated with the prefix. Before * installing the route, check if there's the same prefix on another * interface, and the prefix has already installed the interface route. - * Although such a configuration is expected to be rare, we explicitly - * allow it. 
*/ - if (nd6locked == 0) - lck_mtx_lock(nd6_mutex); - else - lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); - for (opr = nd_prefix.lh_first; opr; opr = opr->ndpr_next) { - if (opr == pr) - continue; - - if ((opr->ndpr_stateflags & NDPRF_ONLINK) == 0) - continue; - - if (opr->ndpr_plen == pr->ndpr_plen && - in6_are_prefix_equal(&pr->ndpr_prefix.sin6_addr, - &opr->ndpr_prefix.sin6_addr, - pr->ndpr_plen)) { - if (nd6locked == 0) - lck_mtx_unlock(nd6_mutex); - return(0); - } + opr = nd6_prefix_equal_lookup(pr, FALSE); + if (opr != NULL) + NDPR_REMREF(opr); + + if (!ip6_doscopedroute) { + /* if an interface route already exists, just return */ + if (opr != NULL) + return (0); + ifscope = IFSCOPE_NONE; + } else if (!force_scoped) { + /* + * If a primary/non-scoped interface route already exists, + * install the new one as a scoped entry. If the existing + * interface route is scoped, install new as non-scoped. + */ + ifscope = (opr != NULL) ? ifp->if_index : IFSCOPE_NONE; + opr = nd6_prefix_equal_lookup(pr, TRUE); + if (opr != NULL) + NDPR_REMREF(opr); + else if (ifscope != IFSCOPE_NONE) + ifscope = IFSCOPE_NONE; } - if (nd6locked == 0) - lck_mtx_unlock(nd6_mutex); /* - * We prefer link-local addresses as the associated interface address. + * We prefer link-local addresses as the associated interface address. */ /* search for a link-local addr */ ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY| IN6_IFF_ANYCAST); if (ifa == NULL) { - /* XXX: freebsd does not have ifa_ifwithaf */ - ifnet_lock_exclusive(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) - { - if (ifa->ifa_addr->sa_family == AF_INET6) - break; - } - if (ifa != NULL) - ifaref(ifa); + struct in6_ifaddr *ia6; + ifnet_lock_shared(ifp); + IFP_TO_IA6(ifp, ia6); ifnet_lock_done(ifp); + if (ia6 != NULL) + ifa = &ia6->ia_ifa; /* should we care about ia6_flags? 
*/ } + NDPR_LOCK(pr); if (ifa == NULL) { /* * This can still happen, when, for example, we receive an RA @@ -1735,7 +3310,8 @@ nd6_prefix_onlink( " to add route for a prefix(%s/%d) on %s\n", ip6_sprintf(&pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(ifp))); - return(0); + NDPR_UNLOCK(pr); + return (0); } /* @@ -1745,11 +3321,12 @@ nd6_prefix_onlink( bzero(&mask6, sizeof(mask6)); mask6.sin6_len = sizeof(mask6); mask6.sin6_addr = pr->ndpr_mask; + prefix = pr->ndpr_prefix; + NDPR_UNLOCK(pr); - if (rtlocked == 0) - lck_mtx_lock(rnh_lock); - + IFA_LOCK_SPIN(ifa); rtflags = ifa->ifa_flags | RTF_CLONING | RTF_UP; + IFA_UNLOCK(ifa); if (nd6_need_cache(ifp)) { /* explicitly set in case ifa_flags does not set the flag. */ rtflags |= RTF_CLONING; @@ -1759,54 +3336,81 @@ nd6_prefix_onlink( */ rtflags &= ~RTF_CLONING; } - error = rtrequest_locked(RTM_ADD, (struct sockaddr *)&pr->ndpr_prefix, - ifa->ifa_addr, (struct sockaddr *)&mask6, - rtflags, &rt); - if (error == 0) { - if (rt != NULL) { /* this should be non NULL, though */ - RT_LOCK(rt); - nd6_rtmsg(RTM_ADD, rt); - RT_UNLOCK(rt); - } - pr->ndpr_stateflags |= NDPRF_ONLINK; - } - else { + + lck_mtx_unlock(nd6_mutex); + + error = rtrequest_scoped(RTM_ADD, (struct sockaddr *)&prefix, + ifa->ifa_addr, (struct sockaddr *)&mask6, rtflags, &rt, + ifscope); + + if (rt != NULL) { + RT_LOCK(rt); + nd6_rtmsg(RTM_ADD, rt); + RT_UNLOCK(rt); + RT_REMREF(rt); + } else { + NDPR_LOCK(pr); nd6log((LOG_ERR, "nd6_prefix_onlink: failed to add route for a" - " prefix (%s/%d) on %s, gw=%s, mask=%s, flags=%lx " - "errno = %d\n", + " prefix (%s/%d) on %s, gw=%s, mask=%s, flags=%lx," + " scoped=%d, errno = %d\n", ip6_sprintf(&pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(ifp), ip6_sprintf(&((struct sockaddr_in6 *)ifa->ifa_addr)->sin6_addr), - ip6_sprintf(&mask6.sin6_addr), rtflags, error)); + ip6_sprintf(&mask6.sin6_addr), rtflags, + (ifscope != IFSCOPE_NONE), error)); + NDPR_UNLOCK(pr); } - if (rt != NULL) - RT_REMREF(rt); + 
lck_mtx_lock(nd6_mutex); + + NDPR_LOCK(pr); + pr->ndpr_stateflags &= ~NDPRF_IFSCOPE; + if (rt != NULL || error == EEXIST) { + pr->ndpr_stateflags |= NDPRF_ONLINK; + if (ifscope != IFSCOPE_NONE) + pr->ndpr_stateflags |= NDPRF_IFSCOPE; + } + NDPR_UNLOCK(pr); - if (rtlocked == 0) - lck_mtx_unlock(rnh_lock); + IFA_REMREF(ifa); - ifafree(ifa); + return (error); +} - return(error); +int +nd6_prefix_onlink(struct nd_prefix *pr) +{ + return (nd6_prefix_onlink_common(pr, FALSE, IFSCOPE_NONE)); } int -nd6_prefix_offlink( - struct nd_prefix *pr) +nd6_prefix_onlink_scoped(struct nd_prefix *pr, unsigned int ifscope) { - int error = 0; + return (nd6_prefix_onlink_common(pr, TRUE, ifscope)); +} + +int +nd6_prefix_offlink(struct nd_prefix *pr) +{ + int plen, error = 0; struct ifnet *ifp = pr->ndpr_ifp; struct nd_prefix *opr; - struct sockaddr_in6 sa6, mask6; + struct sockaddr_in6 sa6, mask6, prefix; struct rtentry *rt = NULL; + unsigned int ifscope; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_NOTOWNED); /* sanity check */ + NDPR_LOCK(pr); if ((pr->ndpr_stateflags & NDPRF_ONLINK) == 0) { nd6log((LOG_ERR, - "nd6_prefix_offlink: %s/%d is already off-link\n", - ip6_sprintf(&pr->ndpr_prefix.sin6_addr), pr->ndpr_plen)); - return(EEXIST); + "nd6_prefix_offlink: %s/%d on %s scoped=%d is already " + "off-link\n", ip6_sprintf(&pr->ndpr_prefix.sin6_addr), + pr->ndpr_plen, if_name(pr->ndpr_ifp), + (pr->ndpr_stateflags & NDPRF_IFSCOPE) ? 1 : 0)); + NDPR_UNLOCK(pr); + return (EEXIST); } bzero(&sa6, sizeof(sa6)); @@ -1818,48 +3422,66 @@ nd6_prefix_offlink( mask6.sin6_family = AF_INET6; mask6.sin6_len = sizeof(sa6); bcopy(&pr->ndpr_mask, &mask6.sin6_addr, sizeof(struct in6_addr)); - lck_mtx_lock(rnh_lock); - error = rtrequest_locked(RTM_DELETE, (struct sockaddr *)&sa6, NULL, - (struct sockaddr *)&mask6, 0, &rt); - if (error == 0) { - pr->ndpr_stateflags &= ~NDPRF_ONLINK; + prefix = pr->ndpr_prefix; + plen = pr->ndpr_plen; + NDPR_UNLOCK(pr); + ifscope = (pr->ndpr_stateflags & NDPRF_IFSCOPE) ? 
+ ifp->if_index : IFSCOPE_NONE; + + error = rtrequest_scoped(RTM_DELETE, (struct sockaddr *)&sa6, + NULL, (struct sockaddr *)&mask6, 0, &rt, ifscope); + + if (rt != NULL) { /* report the route deletion to the routing socket. */ - if (rt != NULL) { - RT_LOCK(rt); - nd6_rtmsg(RTM_DELETE, rt); - RT_UNLOCK(rt); - } + RT_LOCK(rt); + nd6_rtmsg(RTM_DELETE, rt); + RT_UNLOCK(rt); + rtfree(rt); /* - * There might be the same prefix on another interface, - * the prefix which could not be on-link just because we have - * the interface route (see comments in nd6_prefix_onlink). - * If there's one, try to make the prefix on-link on the - * interface. + * The following check takes place only when Scoped Routing + * is not enabled. There might be the same prefix on another + * interface, the prefix which could not be on-link just + * because we have the interface route (see comments in + * nd6_prefix_onlink). If there's one, try to make the prefix + * on-link on the interface. */ - lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); - for (opr = nd_prefix.lh_first; opr; opr = opr->ndpr_next) { - if (opr == pr) - continue; + lck_mtx_lock(nd6_mutex); + opr = nd_prefix.lh_first; + while (opr) { + /* does not apply in the Scoped Routing case */ + if (ip6_doscopedroute) + break; - if ((opr->ndpr_stateflags & NDPRF_ONLINK) != 0) + if (opr == pr) { + opr = opr->ndpr_next; continue; + } + NDPR_LOCK(opr); + if ((opr->ndpr_stateflags & NDPRF_ONLINK) != 0) { + NDPR_UNLOCK(opr); + opr = opr->ndpr_next; + continue; + } /* * KAME specific: detached prefixes should not be * on-link. 
*/ - if ((opr->ndpr_stateflags & NDPRF_DETACHED) != 0) + if ((opr->ndpr_stateflags & NDPRF_DETACHED) != 0) { + NDPR_UNLOCK(opr); + opr = opr->ndpr_next; continue; - - if (opr->ndpr_plen == pr->ndpr_plen && - in6_are_prefix_equal(&pr->ndpr_prefix.sin6_addr, - &opr->ndpr_prefix.sin6_addr, - pr->ndpr_plen)) { + } + if (opr->ndpr_plen == plen && + in6_are_prefix_equal(&prefix.sin6_addr, + &opr->ndpr_prefix.sin6_addr, plen)) { int e; - if ((e = nd6_prefix_onlink(opr, 1, 1)) != 0) { + NDPR_UNLOCK(opr); + lck_mtx_unlock(nd6_mutex); + if ((e = nd6_prefix_onlink(opr)) != 0) { nd6log((LOG_ERR, "nd6_prefix_offlink: failed to " "recover a prefix %s/%d from %s " @@ -1868,40 +3490,41 @@ nd6_prefix_offlink( opr->ndpr_plen, if_name(ifp), if_name(opr->ndpr_ifp), e)); } + lck_mtx_lock(nd6_mutex); + opr = nd_prefix.lh_first; + } else { + NDPR_UNLOCK(opr); + opr = opr->ndpr_next; } } - } - else { - /* XXX: can we still set the NDPRF_ONLINK flag? */ + lck_mtx_unlock(nd6_mutex); + } else { nd6log((LOG_ERR, "nd6_prefix_offlink: failed to delete route: " - "%s/%d on %s (errno = %d)\n", - ip6_sprintf(&sa6.sin6_addr), pr->ndpr_plen, if_name(ifp), - error)); + "%s/%d on %s, scoped %d, (errno = %d)\n", + ip6_sprintf(&sa6.sin6_addr), plen, if_name(ifp), + (ifscope != IFSCOPE_NONE), error)); } - if (rt != NULL) - rtfree_locked(rt); - - lck_mtx_unlock(rnh_lock); + NDPR_LOCK(pr); + pr->ndpr_stateflags &= ~(NDPRF_ONLINK | NDPRF_IFSCOPE); + NDPR_UNLOCK(pr); - return(error); + return (error); } static struct in6_ifaddr * in6_ifadd( struct nd_prefix *pr, - struct in6_addr *ifid) /* Mobile IPv6 addition */ + int mcast) { struct ifnet *ifp = pr->ndpr_ifp; - struct ifaddr *ifa; struct in6_aliasreq ifra; struct in6_ifaddr *ia, *ib; int error, plen0; + int updateflags; struct in6_addr mask; - int prefixlen = pr->ndpr_plen; - - in6_len2mask(&mask, prefixlen); + int prefixlen; /* * find a link-local address (will be interface ID). 
@@ -1915,41 +3538,32 @@ in6_ifadd( * (2) RFC2462 5.4 suggesting the use of the same interface identifier * for multiple addresses on a single interface, and possible shortcut * of DAD. we omitted DAD for this reason in the past. - * (3) a user can prevent autoconfiguration of global address + * (3) a user can prevent autoconfiguration of global address * by removing link-local address by hand (this is partly because we - * don't have other way to control the use of IPv6 on a interface. + * don't have other way to control the use of IPv6 on an interface. * this has been our design choice - cf. NRL's "ifconfig auto"). * (4) it is easier to manage when an interface has addresses * with the same interface identifier, than to have multiple addresses * with different interface identifiers. - * - * Mobile IPv6 addition: allow for caller to specify a wished interface - * ID. This is to not break connections when moving addresses between - * interfaces. */ - ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, 0);/* 0 is OK? */ - if (ifa) - ib = (struct in6_ifaddr *)ifa; - else - return NULL; - -#if 0 /* don't care link local addr state, and always do DAD */ - /* if link-local address is not eligible, do not autoconfigure. */ - if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY) { - printf("in6_ifadd: link-local address not ready\n"); - ifafree(ifa); - return NULL; - } -#endif + ib = in6ifa_ifpforlinklocal(ifp, 0);/* 0 is OK? 
*/ + if (ib == NULL) + return (NULL); - /* prefixlen + ifidlen must be equal to 128 */ + IFA_LOCK(&ib->ia_ifa); + NDPR_LOCK(pr); + prefixlen = pr->ndpr_plen; + in6_len2mask(&mask, prefixlen); plen0 = in6_mask2len(&ib->ia_prefixmask.sin6_addr, NULL); + /* prefixlen + ifidlen must be equal to 128 */ if (prefixlen != plen0) { nd6log((LOG_INFO, "in6_ifadd: wrong prefixlen for %s " "(prefix=%d ifid=%d)\n", if_name(ifp), prefixlen, 128 - plen0)); - ifafree(ifa); - return NULL; + NDPR_UNLOCK(pr); + IFA_UNLOCK(&ib->ia_ifa); + IFA_REMREF(&ib->ia_ifa); + return (NULL); } /* make ifaddr */ @@ -1971,53 +3585,57 @@ in6_ifadd( ifra.ifra_addr.sin6_addr.s6_addr32[3] &= mask.s6_addr32[3]; /* interface ID */ - if (ifid == NULL || IN6_IS_ADDR_UNSPECIFIED(ifid)) - ifid = &ib->ia_addr.sin6_addr; - ifra.ifra_addr.sin6_addr.s6_addr32[0] - |= (ifid->s6_addr32[0] & ~mask.s6_addr32[0]); - ifra.ifra_addr.sin6_addr.s6_addr32[1] - |= (ifid->s6_addr32[1] & ~mask.s6_addr32[1]); - ifra.ifra_addr.sin6_addr.s6_addr32[2] - |= (ifid->s6_addr32[2] & ~mask.s6_addr32[2]); - ifra.ifra_addr.sin6_addr.s6_addr32[3] - |= (ifid->s6_addr32[3] & ~mask.s6_addr32[3]); - + ifra.ifra_addr.sin6_addr.s6_addr32[0] |= + (ib->ia_addr.sin6_addr.s6_addr32[0] & ~mask.s6_addr32[0]); + ifra.ifra_addr.sin6_addr.s6_addr32[1] |= + (ib->ia_addr.sin6_addr.s6_addr32[1] & ~mask.s6_addr32[1]); + ifra.ifra_addr.sin6_addr.s6_addr32[2] |= + (ib->ia_addr.sin6_addr.s6_addr32[2] & ~mask.s6_addr32[2]); + ifra.ifra_addr.sin6_addr.s6_addr32[3] |= + (ib->ia_addr.sin6_addr.s6_addr32[3] & ~mask.s6_addr32[3]); + /* new prefix mask. */ ifra.ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6); ifra.ifra_prefixmask.sin6_family = AF_INET6; bcopy(&mask, &ifra.ifra_prefixmask.sin6_addr, sizeof(ifra.ifra_prefixmask.sin6_addr)); - /* - * lifetime. - * XXX: in6_init_address_ltimes would override these values later. - * We should reconsider this logic. - */ + /* lifetimes. 
*/ ifra.ifra_lifetime.ia6t_vltime = pr->ndpr_vltime; ifra.ifra_lifetime.ia6t_pltime = pr->ndpr_pltime; /* XXX: scope zone ID? */ ifra.ifra_flags |= IN6_IFF_AUTOCONF; /* obey autoconf */ + + NDPR_UNLOCK(pr); + IFA_UNLOCK(&ib->ia_ifa); + IFA_REMREF(&ib->ia_ifa); + /* - * temporarily set the nopfx flag to avoid conflict. - * XXX: we should reconsider the entire mechanism about prefix - * manipulation. + * Make sure that we do not have this address already. This should + * usually not happen, but we can still see this case, e.g., if we + * have manually configured the exact address to be configured. */ - ifra.ifra_flags |= IN6_IFF_NOPFX; + if ((ib = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr)) != NULL) { + IFA_REMREF(&ib->ia_ifa); + /* this should be rare enough to make an explicit log */ + log(LOG_INFO, "in6_ifadd: %s is already configured\n", + ip6_sprintf(&ifra.ifra_addr.sin6_addr)); + return (NULL); + } /* - * keep the new address, regardless of the result of in6_update_ifa. - * XXX: this address is now meaningless. - * We should reconsider its role. + * Allocate ifaddr structure, link into chain, etc. + * If we are going to create a new address upon receiving a multicasted + * RA, we need to impose a random delay before starting DAD. + * [draft-ietf-ipv6-rfc2462bis-02.txt, Section 5.4.2] */ - pr->ndpr_addr = ifra.ifra_addr.sin6_addr; - - ifafree(ifa); - ifa = NULL; - - /* allocate ifaddr structure, link into chain, etc. */ - if ((error = in6_update_ifa(ifp, &ifra, NULL, M_NOWAIT)) != 0) { + updateflags = 0; + if (mcast) + updateflags |= IN6_IFAUPDATE_DADDELAY; + error = in6_update_ifa(ifp, &ifra, NULL, updateflags, M_WAITOK); + if (error != 0) { nd6log((LOG_ERR, "in6_ifadd: failed to make ifaddr %s on %s (errno=%d)\n", ip6_sprintf(&ifra.ifra_addr.sin6_addr), if_name(ifp), @@ -2032,6 +3650,8 @@ in6_ifadd( return(ia); /* this must NOT be NULL. 
*/ } +#define IA6_NONCONST(i) ((struct in6_ifaddr *)(uintptr_t)(i)) + int in6_tmpifadd( const struct in6_ifaddr *ia0, /* corresponding public address */ @@ -2043,14 +3663,18 @@ in6_tmpifadd( struct in6_aliasreq ifra; int i, error; int trylimit = 3; /* XXX: adhoc value */ + int updateflags; u_int32_t randid[2]; time_t vltime0, pltime0; struct timeval timenow; + struct in6_addr addr; + struct nd_prefix *ndpr; getmicrotime(&timenow); bzero(&ifra, sizeof(ifra)); strncpy(ifra.ifra_name, if_name(ifp), sizeof(ifra.ifra_name)); + IFA_LOCK(&IA6_NONCONST(ia0)->ia_ifa); ifra.ifra_addr = ia0->ia_addr; /* copy prefix mask */ ifra.ifra_prefixmask = ia0->ia_prefixmask; @@ -2059,24 +3683,26 @@ in6_tmpifadd( ifra.ifra_addr.sin6_addr.s6_addr32[i] &= ifra.ifra_prefixmask.sin6_addr.s6_addr32[i]; } + addr = ia0->ia_addr.sin6_addr; + IFA_UNLOCK(&IA6_NONCONST(ia0)->ia_ifa); - again: +again: in6_get_tmpifid(ifp, (u_int8_t *)randid, - (const u_int8_t *)&ia0->ia_addr.sin6_addr.s6_addr[8], - forcegen); - ifra.ifra_addr.sin6_addr.s6_addr32[2] - |= (randid[0] & ~(ifra.ifra_prefixmask.sin6_addr.s6_addr32[2])); - ifra.ifra_addr.sin6_addr.s6_addr32[3] - |= (randid[1] & ~(ifra.ifra_prefixmask.sin6_addr.s6_addr32[3])); + (const u_int8_t *)&addr.s6_addr[8], forcegen); + + ifra.ifra_addr.sin6_addr.s6_addr32[2] |= + (randid[0] & ~(ifra.ifra_prefixmask.sin6_addr.s6_addr32[2])); + ifra.ifra_addr.sin6_addr.s6_addr32[3] |= + (randid[1] & ~(ifra.ifra_prefixmask.sin6_addr.s6_addr32[3])); /* - * If by chance the new temporary address is the same as an address - * already assigned to the interface, generate a new randomized - * interface identifier and repeat this step. - * RFC 3041 3.3 (4). + * in6_get_tmpifid() quite likely provided a unique interface ID. + * However, we may still have a chance to see collision, because + * there may be a time lag between generation of the ID and generation + * of the address. So, we'll do one more sanity check. 
*/ if ((ia = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr)) != NULL) { - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); if (trylimit-- == 0) { nd6log((LOG_NOTICE, "in6_tmpifadd: failed to find " "a unique random IFID\n")); @@ -2093,25 +3719,22 @@ in6_tmpifadd( * of the public address or TEMP_PREFERRED_LIFETIME - * DESYNC_FACTOR. */ - if (ia0->ia6_lifetime.ia6t_expire != 0) { - vltime0 = IFA6_IS_INVALID(ia0) ? 0 : - (ia0->ia6_lifetime.ia6t_expire - timenow.tv_sec); - if (vltime0 > ip6_temp_valid_lifetime) - vltime0 = ip6_temp_valid_lifetime; - } else + IFA_LOCK(&IA6_NONCONST(ia0)->ia_ifa); + vltime0 = IFA6_IS_INVALID(ia0) + ? 0 + : (ia0->ia6_lifetime.ia6t_vltime - + (timenow.tv_sec - ia0->ia6_updatetime)); + if (vltime0 > ip6_temp_valid_lifetime) vltime0 = ip6_temp_valid_lifetime; - if (ia0->ia6_lifetime.ia6t_preferred != 0) { - pltime0 = IFA6_IS_DEPRECATED(ia0) ? 0 : - (ia0->ia6_lifetime.ia6t_preferred - timenow.tv_sec); - if (pltime0 > ip6_temp_preferred_lifetime - ip6_desync_factor){ - pltime0 = ip6_temp_preferred_lifetime - - ip6_desync_factor; - } - } else + pltime0 = IFA6_IS_DEPRECATED(ia0) + ? 0 + : (ia0->ia6_lifetime.ia6t_pltime - + (timenow.tv_sec - ia0->ia6_updatetime)); + if (pltime0 > ip6_temp_preferred_lifetime - ip6_desync_factor) pltime0 = ip6_temp_preferred_lifetime - ip6_desync_factor; ifra.ifra_lifetime.ia6t_vltime = vltime0; ifra.ifra_lifetime.ia6t_pltime = pltime0; - + IFA_UNLOCK(&IA6_NONCONST(ia0)->ia_ifa); /* * A temporary address is created only if this calculated Preferred * Lifetime is greater than REGEN_ADVANCE time units. @@ -2124,8 +3747,13 @@ in6_tmpifadd( ifra.ifra_flags |= (IN6_IFF_AUTOCONF|IN6_IFF_TEMPORARY); /* allocate ifaddr structure, link into chain, etc. 
*/ - if ((error = in6_update_ifa(ifp, &ifra, NULL, how)) != 0) - return(error); + updateflags = 0; + + if (how) + updateflags |= IN6_IFAUPDATE_DADDELAY; + + if ((error = in6_update_ifa(ifp, &ifra, NULL, updateflags, how)) != 0) + return (error); newia = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr); if (newia == NULL) { /* XXX: can it happen? */ @@ -2134,9 +3762,37 @@ in6_tmpifadd( "no ifaddr\n")); return(EINVAL); /* XXX */ } - lck_mtx_lock(nd6_mutex); - newia->ia6_ndpr = ia0->ia6_ndpr; - newia->ia6_ndpr->ndpr_refcnt++; + IFA_LOCK(&IA6_NONCONST(ia0)->ia_ifa); + ndpr = ia0->ia6_ndpr; + if (ndpr == NULL) { + /* + * We lost the race with another thread that has purged + * ia0 address; in this case, purge the tmp addr as well. + */ + nd6log((LOG_ERR, "in6_tmpifadd: no public address\n")); + VERIFY(!(ia0->ia6_flags & IN6_IFF_AUTOCONF)); + IFA_UNLOCK(&IA6_NONCONST(ia0)->ia_ifa); + in6_purgeaddr(&newia->ia_ifa); + IFA_REMREF(&newia->ia_ifa); + return (EADDRNOTAVAIL); + } + NDPR_ADDREF(ndpr); /* for us */ + IFA_UNLOCK(&IA6_NONCONST(ia0)->ia_ifa); + IFA_LOCK(&newia->ia_ifa); + if (newia->ia6_ndpr != NULL) { + NDPR_LOCK(newia->ia6_ndpr); + VERIFY(newia->ia6_ndpr->ndpr_addrcnt != 0); + newia->ia6_ndpr->ndpr_addrcnt--; + NDPR_UNLOCK(newia->ia6_ndpr); + NDPR_REMREF(newia->ia6_ndpr); /* release addr reference */ + } + newia->ia6_ndpr = ndpr; + NDPR_LOCK(newia->ia6_ndpr); + newia->ia6_ndpr->ndpr_addrcnt++; + VERIFY(newia->ia6_ndpr->ndpr_addrcnt != 0); + NDPR_ADDREF_LOCKED(newia->ia6_ndpr); /* for addr reference */ + NDPR_UNLOCK(newia->ia6_ndpr); + IFA_UNLOCK(&newia->ia_ifa); /* * A newly added address might affect the status of other addresses. * XXX: when the temporary address is generated with a new public @@ -2145,18 +3801,25 @@ in6_tmpifadd( * and, in fact, we surely need the check when we create a new * temporary address due to deprecation of an old temporary address. 
*/ - pfxlist_onlink_check(1); + lck_mtx_lock(nd6_mutex); + pfxlist_onlink_check(); lck_mtx_unlock(nd6_mutex); - ifafree(&newia->ia_ifa); + IFA_REMREF(&newia->ia_ifa); + + /* remove our reference */ + NDPR_REMREF(ndpr); return(0); -} +} +#undef IA6_NONCONST int in6_init_prefix_ltimes(struct nd_prefix *ndpr) { struct timeval timenow; + NDPR_LOCK_ASSERT_HELD(ndpr); + getmicrotime(&timenow); /* check if preferred lifetime > valid lifetime. RFC2462 5.5.3 (c) */ if (ndpr->ndpr_pltime > ndpr->ndpr_vltime) { @@ -2178,14 +3841,15 @@ in6_init_prefix_ltimes(struct nd_prefix *ndpr) } static void -in6_init_address_ltimes(__unused struct nd_prefix *new, struct in6_addrlifetime *lt6) +in6_init_address_ltimes(__unused struct nd_prefix *new, + struct in6_addrlifetime *lt6, boolean_t is_temporary) { struct timeval timenow; getmicrotime(&timenow); /* Valid lifetime must not be updated unless explicitly specified. */ /* init ia6t_expire */ - if (lt6->ia6t_vltime == ND6_INFINITE_LIFETIME) + if (!is_temporary && lt6->ia6t_vltime == ND6_INFINITE_LIFETIME) lt6->ia6t_expire = 0; else { lt6->ia6t_expire = timenow.tv_sec; @@ -2193,7 +3857,7 @@ in6_init_address_ltimes(__unused struct nd_prefix *new, struct in6_addrlifetime } /* init ia6t_preferred */ - if (lt6->ia6t_pltime == ND6_INFINITE_LIFETIME) + if (!is_temporary && lt6->ia6t_pltime == ND6_INFINITE_LIFETIME) lt6->ia6t_preferred = 0; else { lt6->ia6t_preferred = timenow.tv_sec; @@ -2281,6 +3945,8 @@ nd6_setdefaultiface( { int error = 0; ifnet_t def_ifp = NULL; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_NOTOWNED); ifnet_head_lock_shared(); if (ifindex < 0 || if_index < ifindex) { @@ -2292,12 +3958,21 @@ nd6_setdefaultiface( lck_mtx_lock(nd6_mutex); if (nd6_defifindex != ifindex) { + struct ifnet *odef_ifp = nd6_defifp; + nd6_defifindex = ifindex; if (nd6_defifindex > 0) nd6_defifp = def_ifp; else nd6_defifp = NULL; + if (nd6_defifp != NULL) + nd6log((LOG_INFO, "%s: is now the default " + "interface (was %s)\n", if_name(nd6_defifp), + 
odef_ifp != NULL ? if_name(odef_ifp) : "NONE")); + else + nd6log((LOG_INFO, "No default interface set\n")); + /* * If the Default Router List is empty, install a route * to the specified interface as default or remove the default @@ -2306,8 +3981,10 @@ nd6_setdefaultiface( * we do this here to avoid re-install the default route * if the list is NOT empty. */ - if (TAILQ_FIRST(&nd_defrouter) == NULL) - defrouter_select(); + if (ip6_doscopedroute || TAILQ_FIRST(&nd_defrouter) == NULL) { + defrtrlist_sync(nd6_defifp); + nd6_prefix_sync(nd6_defifp); + } /* * Our current implementation assumes one-to-one maping between @@ -2316,7 +3993,7 @@ nd6_setdefaultiface( */ scope6_setdefault(nd6_defifp); } - lck_mtx_unlock(nd6_mutex); + return(error); } diff --git a/bsd/netinet6/raw_ip6.c b/bsd/netinet6/raw_ip6.c index 169b7992d..f5c48648e 100644 --- a/bsd/netinet6/raw_ip6.c +++ b/bsd/netinet6/raw_ip6.c @@ -138,6 +138,7 @@ extern struct inpcbhead ripcb; extern struct inpcbinfo ripcbinfo; extern u_int32_t rip_sendspace; extern u_int32_t rip_recvspace; +extern int ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt); struct rip6stat rip6stat; @@ -149,7 +150,8 @@ struct rip6stat rip6stat; int rip6_input( struct mbuf **mp, - int *offp) + int *offp, + int proto) { struct mbuf *m = *mp; register struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); @@ -157,7 +159,7 @@ rip6_input( struct inpcb *last = 0; struct mbuf *opts = NULL; struct sockaddr_in6 rip6src; - int proto = ip6->ip6_nxt; + int ret; rip6stat.rip6s_ipackets++; @@ -206,11 +208,20 @@ rip6_input( } else #endif /*IPSEC*/ if (n) { - if (last->in6p_flags & IN6P_CONTROLOPTS || - last->in6p_socket->so_options & SO_TIMESTAMP) - ip6_savecontrol(last, &opts, ip6, n); + if ((last->in6p_flags & IN6P_CONTROLOPTS) != 0 || + (last->in6p_socket->so_options & SO_TIMESTAMP) != 0 || + (last->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + ret = ip6_savecontrol(last, n, &opts); + if (ret != 0) { + m_freem(n); + m_freem(opts); + 
last = in6p; + continue; + } + } /* strip intermediate headers */ m_adj(n, *offp); + so_recv_data_stat(last->in6p_socket, m, 0); if (sbappendaddr(&last->in6p_socket->so_rcv, (struct sockaddr *)&rip6src, n, opts, NULL) == 0) { @@ -222,7 +233,7 @@ rip6_input( } last = in6p; } - lck_rw_done(ripcbinfo.mtx); + #if IPSEC /* * Check AH/ESP integrity. @@ -235,11 +246,21 @@ rip6_input( } else #endif /*IPSEC*/ if (last) { - if (last->in6p_flags & IN6P_CONTROLOPTS || - last->in6p_socket->so_options & SO_TIMESTAMP) - ip6_savecontrol(last, &opts, ip6, m); + if ((last->in6p_flags & IN6P_CONTROLOPTS) != 0 || + (last->in6p_socket->so_options & SO_TIMESTAMP) != 0 || + (last->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + ret = ip6_savecontrol(last, m, &opts); + if (ret != 0) { + m_freem(m); + m_freem(opts); + ip6stat.ip6s_delivered--; + goto unlock; + } + + } /* strip intermediate headers */ m_adj(m, *offp); + so_recv_data_stat(last->in6p_socket, m, 0); if (sbappendaddr(&last->in6p_socket->so_rcv, (struct sockaddr *)&rip6src, m, opts, NULL) == 0) { rip6stat.rip6s_fullsock++; @@ -259,6 +280,10 @@ rip6_input( } ip6stat.ip6s_delivered--; } + +unlock: + lck_rw_done(ripcbinfo.mtx); + return IPPROTO_DONE; } @@ -270,6 +295,7 @@ rip6_ctlinput( { struct ip6_hdr *ip6; struct mbuf *m; + void *cmdarg = NULL; int off = 0; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; @@ -294,6 +320,7 @@ rip6_ctlinput( m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; + cmdarg = ip6cp->ip6c_cmdarg; sa6_src = ip6cp->ip6c_src; } else { m = NULL; @@ -302,7 +329,7 @@ rip6_ctlinput( } (void) in6_pcbnotify(&ripcbinfo, sa, 0, (const struct sockaddr *)sa6_src, - 0, cmd, notify); + 0, cmd, cmdarg, notify); } /* @@ -314,7 +341,8 @@ rip6_output( register struct mbuf *m, struct socket *so, struct sockaddr_in6 *dstsock, - struct mbuf *control) + struct mbuf *control, + int israw) { struct in6_addr *dst; struct ip6_hdr *ip6; @@ -322,25 +350,29 @@ rip6_output( u_int 
plen = m->m_pkthdr.len; int error = 0; struct ip6_pktopts opt, *optp = 0; + struct ip6_moptions *im6o = NULL; struct ifnet *oifp = NULL; int type = 0, code = 0; /* for ICMPv6 output statistics only */ - int priv = 0; -#if PKT_PRIORITY - mbuf_traffic_class_t mtc = MBUF_TC_NONE; -#endif /* PKT_PRIORITY */ + mbuf_traffic_class_t mtc = MBUF_TC_UNSPEC; + struct ip6_out_args ip6oa = { IFSCOPE_NONE, 0 }; + int flags = IPV6_OUTARGS; + + if (dstsock && IN6_IS_ADDR_V4MAPPED(&dstsock->sin6_addr)) { + m_freem(m); + return (EINVAL); + } in6p = sotoin6pcb(so); - priv = 0; - if (so->so_uid == 0) - priv = 1; + ip6oa.ip6oa_boundif = (in6p->inp_flags & INP_BOUND_IF) ? + in6p->inp_boundif : IFSCOPE_NONE; + ip6oa.ip6oa_nocell = (in6p->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; + dst = &dstsock->sin6_addr; if (control) { -#if PKT_PRIORITY mtc = mbuf_traffic_class_from_control(control); -#endif /* PKT_PRIORITY */ - if ((error = ip6_setpktoptions(control, &opt, priv, 0)) != 0) + if ((error = ip6_setpktopts(control, &opt, NULL, so->so_proto->pr_protocol)) != 0) goto bad; optp = &opt; } else @@ -374,6 +406,8 @@ rip6_output( */ ip6->ip6_dst = *dst; + im6o = in6p->in6p_moptions; + /* * If the scope of the destination is link-local, embed the interface * index in the address. @@ -382,7 +416,13 @@ rip6_output( */ if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst)) { struct in6_pktinfo *pi; + struct ifnet *im6o_multicast_ifp = NULL; + if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && im6o != NULL) { + IM6O_LOCK(im6o); + im6o_multicast_ifp = im6o->im6o_multicast_ifp; + IM6O_UNLOCK(im6o); + } /* * XXX Boundary check is assumed to be already done in * ip6_setpktoptions(). 
@@ -391,10 +431,12 @@ rip6_output( if (optp && (pi = optp->ip6po_pktinfo) && pi->ipi6_ifindex) { ip6->ip6_dst.s6_addr16[1] = htons(pi->ipi6_ifindex); oifp = ifindex2ifnet[pi->ipi6_ifindex]; + if (oifp != NULL) + ifnet_reference(oifp); } else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && - in6p->in6p_moptions && - in6p->in6p_moptions->im6o_multicast_ifp) { - oifp = in6p->in6p_moptions->im6o_multicast_ifp; + im6o != NULL && im6o_multicast_ifp != NULL) { + oifp = im6o_multicast_ifp; + ifnet_reference(oifp); ip6->ip6_dst.s6_addr16[1] = htons(oifp->if_index); } else if (dstsock->sin6_scope_id) { /* @@ -421,11 +463,13 @@ rip6_output( struct in6_addr *in6a; struct in6_addr storage; u_short index = 0; - if ((in6a = in6_selectsrc(dstsock, optp, - in6p->in6p_moptions, - &in6p->in6p_route, - &in6p->in6p_laddr, - &storage, &error)) == 0) { + + if (israw != 0 && optp && optp->ip6po_pktinfo && !IN6_IS_ADDR_UNSPECIFIED(&optp->ip6po_pktinfo->ipi6_addr)) { + in6a = &optp->ip6po_pktinfo->ipi6_addr; + flags |= IPV6_FLAG_NOSRCIFSEL; + } else if ((in6a = in6_selectsrc(dstsock, optp, in6p, + &in6p->in6p_route, NULL, &storage, ip6oa.ip6oa_boundif, + &error)) == 0) { if (error == 0) error = EADDRNOTAVAIL; goto bad; @@ -436,11 +480,15 @@ rip6_output( if (in6p->in6p_route.ro_rt->rt_ifp != NULL) index = in6p->in6p_route.ro_rt->rt_ifp->if_index; RT_UNLOCK(in6p->in6p_route.ro_rt); + if (oifp != NULL) + ifnet_release(oifp); ifnet_head_lock_shared(); if (index == 0 || if_index < index) { panic("bad if_index on interface from route"); } oifp = ifindex2ifnet[index]; + if (oifp != NULL) + ifnet_reference(oifp); ifnet_head_done(); } } @@ -463,7 +511,7 @@ rip6_output( off = offsetof(struct icmp6_hdr, icmp6_cksum); else off = in6p->in6p_cksum; - if (plen < off + 1) { + if (plen < (unsigned int)(off + 1)) { error = EINVAL; goto bad; } @@ -494,26 +542,48 @@ rip6_output( in6p->in6p_route.ro_rt = NULL; } -#if PKT_PRIORITY - set_traffic_class(m, so, mtc); -#endif /* PKT_PRIORITY */ + if (oifp != NULL) { + 
ifnet_release(oifp); + oifp = NULL; + } - error = ip6_output(m, optp, &in6p->in6p_route, 0, - in6p->in6p_moptions, &oifp, 0); + set_packet_tclass(m, so, mtc, 1); + + if (im6o != NULL) + IM6O_ADDREF(im6o); -#if IFNET_ROUTE_REFCNT - /* - * Always discard the cached route for unconnected socket - * or if it is a multicast route. - */ - if (in6p->in6p_route.ro_rt != NULL && - ((in6p->in6p_route.ro_rt->rt_flags & RTF_MULTICAST) || - in6p->in6p_socket == NULL || - in6p->in6p_socket->so_state != SS_ISCONNECTED)) { - rtfree(in6p->in6p_route.ro_rt); - in6p->in6p_route.ro_rt = NULL; + error = ip6_output(m, optp, &in6p->in6p_route, flags, im6o, + &oifp, &ip6oa); + + if (im6o != NULL) + IM6O_REMREF(im6o); + + if (in6p->in6p_route.ro_rt != NULL) { + struct rtentry *rt = in6p->in6p_route.ro_rt; + unsigned int outif; + + if ((rt->rt_flags & RTF_MULTICAST) || + in6p->in6p_socket == NULL || + !(in6p->in6p_socket->so_state & SS_ISCONNECTED)) { + rt = NULL; /* unusable */ + } + /* + * Always discard the cached route for unconnected + * socket or if it is a multicast route. + */ + if (rt == NULL) { + rtfree(in6p->in6p_route.ro_rt); + in6p->in6p_route.ro_rt = NULL; + } + /* + * If this is a connected socket and the destination + * route is not multicast, update outif with that of + * the route interface index used by IP. 
+ */ + if (rt != NULL && + (outif = rt->rt_ifp->if_index) != in6p->in6p_last_outif) + in6p->in6p_last_outif = outif; } -#endif /* IFNET_ROUTE_REFCNT */ if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { if (oifp) @@ -535,9 +605,11 @@ rip6_output( } if (control) { if (optp == &opt) - ip6_clearpktopts(optp, 0, -1); + ip6_clearpktopts(optp, -1); m_freem(control); } + if (oifp != NULL) + ifnet_release(oifp); return(error); } @@ -594,8 +666,13 @@ rip6_ctloutput( case MRT6_PIM: #if MROUTING error = ip6_mrouter_get(so, sopt); +#else + error = ENOPROTOOPT; +#endif /* MROUTING */ + break; + case IPV6_CHECKSUM: + error = ip6_raw_ctloutput(so, sopt); break; -#endif default: error = ip6_ctloutput(so, sopt); break; @@ -627,8 +704,13 @@ rip6_ctloutput( case MRT6_PIM: #if MROUTING error = ip6_mrouter_set(so, sopt); - break; +#else + error = ENOPROTOOPT; #endif + break; + case IPV6_CHECKSUM: + error = ip6_raw_ctloutput(so, sopt); + break; default: error = ip6_ctloutput(so, sopt); break; @@ -714,7 +796,8 @@ rip6_bind(struct socket *so, struct sockaddr *nam, __unused struct proc *p) { struct inpcb *inp = sotoinpcb(so); struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam; - struct ifaddr *ia = NULL; + struct ifaddr *ifa = NULL; + unsigned int outif = 0; if (nam->sa_len != sizeof(*addr)) return EINVAL; @@ -727,18 +810,23 @@ rip6_bind(struct socket *so, struct sockaddr *nam, __unused struct proc *p) } #endif if (!IN6_IS_ADDR_UNSPECIFIED(&addr->sin6_addr) && - (ia = ifa_ifwithaddr((struct sockaddr *)addr)) == 0) + (ifa = ifa_ifwithaddr((struct sockaddr *)addr)) == 0) return EADDRNOTAVAIL; - if (ia && - ((struct in6_ifaddr *)ia)->ia6_flags & - (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY| - IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) { - if (ia) ifafree(ia); - return(EADDRNOTAVAIL); + if (ifa != NULL) { + IFA_LOCK(ifa); + if (((struct in6_ifaddr *)ifa)->ia6_flags & + (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY| + IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) { + IFA_UNLOCK(ifa); + IFA_REMREF(ifa); + 
return(EADDRNOTAVAIL); + } + outif = ifa->ifa_ifp->if_index; + IFA_UNLOCK(ifa); + IFA_REMREF(ifa); } - if (ia != NULL) - ifafree(ia); inp->in6p_laddr = addr->sin6_addr; + inp->in6p_last_outif = outif; return 0; } @@ -753,6 +841,7 @@ rip6_connect(struct socket *so, struct sockaddr *nam, __unused struct proc *p) #if ENABLE_DEFAULT_SCOPE struct sockaddr_in6 tmp; #endif + unsigned int outif = 0, ifscope; if (nam->sa_len != sizeof(*addr)) return EINVAL; @@ -768,14 +857,20 @@ rip6_connect(struct socket *so, struct sockaddr *nam, __unused struct proc *p) addr->sin6_scope_id = scope6_addr2default(&addr->sin6_addr); } #endif + + ifscope = (inp->inp_flags & INP_BOUND_IF) ? + inp->inp_boundif : IFSCOPE_NONE; + /* Source address selection. XXX: need pcblookup? */ - in6a = in6_selectsrc(addr, inp->in6p_outputopts, - inp->in6p_moptions, &inp->in6p_route, - &inp->in6p_laddr, &storage, &error); + in6a = in6_selectsrc(addr, inp->in6p_outputopts, inp, &inp->in6p_route, + NULL, &storage, ifscope, &error); if (in6a == NULL) return (error ? 
error : EADDRNOTAVAIL); inp->in6p_laddr = *in6a; inp->in6p_faddr = addr->sin6_addr; + if (inp->in6p_route.ro_rt != NULL) + outif = inp->in6p_route.ro_rt->rt_ifp->if_index; + inp->in6p_last_outif = outif; soisconnected(so); return 0; } @@ -788,12 +883,13 @@ rip6_shutdown(struct socket *so) } static int -rip6_send(struct socket *so, __unused int flags, struct mbuf *m, struct sockaddr *nam, - struct mbuf *control, __unused struct proc *p) +rip6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, + struct mbuf *control, struct proc *p) { +#pragma unused(flags, p) struct inpcb *inp = sotoinpcb(so); struct sockaddr_in6 tmp; - struct sockaddr_in6 *dst; + struct sockaddr_in6 *dst = (struct sockaddr_in6 *)nam; /* always copy sockaddr to avoid overwrites */ if (so->so_state & SS_ISCONNECTED) { @@ -821,7 +917,7 @@ rip6_send(struct socket *so, __unused int flags, struct mbuf *m, struct sockaddr dst->sin6_scope_id = scope6_addr2default(&dst->sin6_addr); } #endif - return rip6_output(m, so, dst, control); + return rip6_output(m, so, dst, control, 1); } struct pr_usrreqs rip6_usrreqs = { diff --git a/bsd/netinet6/route6.c b/bsd/netinet6/route6.c index 36617f2d0..a0dc6c6a6 100644 --- a/bsd/netinet6/route6.c +++ b/bsd/netinet6/route6.c @@ -1,3 +1,31 @@ +/* + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. 
+ * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + /* $FreeBSD: src/sys/netinet6/route6.c,v 1.1.2.3 2001/07/03 11:01:55 ume Exp $ */ /* $KAME: route6.c,v 1.24 2001/03/14 03:07:05 itojun Exp $ */ @@ -52,8 +80,9 @@ static int ip6_rthdr0(struct mbuf *, struct ip6_hdr *, #endif /* IP6_RTHDR0_ALLOWED */ int -route6_input(struct mbuf **mp, int *offp) +route6_input(struct mbuf **mp, int *offp, int proto) { +#pragma unused(proto) struct ip6_hdr *ip6; struct mbuf *m = *mp; struct ip6_rthdr *rh; @@ -143,7 +172,7 @@ ip6_rthdr0(m, ip6, rh0) struct ip6_rthdr0 *rh0; { int addrs, index; - struct in6_addr *nextaddr, tmpaddr; + struct in6_addr *nextaddr, tmpaddr, ia6 = NULL; struct route_in6 ip6forward_rt; if (rh0->ip6r0_segleft == 0) @@ -156,20 +185,20 @@ ip6_rthdr0(m, ip6, rh0) ) { /* * Type 0 routing header can't contain more than 23 addresses. - * RFC 2462: this limitation was removed since stict/loose + * RFC 2462: this limitation was removed since strict/loose * bitmap field was deleted. 
*/ ip6stat.ip6s_badoptions++; icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, (caddr_t)&rh0->ip6r0_len - (caddr_t)ip6); - return(-1); + return (-1); } if ((addrs = rh0->ip6r0_len / 2) < rh0->ip6r0_segleft) { ip6stat.ip6s_badoptions++; icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, (caddr_t)&rh0->ip6r0_segleft - (caddr_t)ip6); - return(-1); + return (-1); } index = addrs - rh0->ip6r0_segleft; @@ -188,7 +217,7 @@ ip6_rthdr0(m, ip6, rh0) IN6_IS_ADDR_V4COMPAT(nextaddr)) { ip6stat.ip6s_badoptions++; m_freem(m); - return(-1); + return (-1); } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst) || @@ -196,16 +225,31 @@ ip6_rthdr0(m, ip6, rh0) IN6_IS_ADDR_V4COMPAT(&ip6->ip6_dst)) { ip6stat.ip6s_badoptions++; m_freem(m); - return(-1); + return (-1); + } + + /* + * Determine the scope zone of the next hop, based on the interface + * of the current hop. [RFC4007, Section 9] + * Then disambiguate the scope zone for the next hop (if necessary). + */ + if ((ia6 = ip6_getdstifaddr(m)) == NULL) + goto bad; + if (in6_setscope(nextaddr, ia6->ia_ifp, NULL) != 0) { + ip6stat.ip6s_badscope++; + IFA_REMREF(&ia6->ia_ifa); + ia6 = NULL; + goto bad; } + IFA_REMREF(&ia6->ia_ifa); + ia6 = NULL; /* * Swap the IPv6 destination address and nextaddr. Forward the packet. */ tmpaddr = *nextaddr; *nextaddr = ip6->ip6_dst; - if (IN6_IS_ADDR_LINKLOCAL(nextaddr)) - nextaddr->s6_addr16[1] = 0; + in6_clearscope(nextaddr); /* XXX */ ip6->ip6_dst = tmpaddr; if (IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_dst)) ip6->ip6_dst.s6_addr16[1] = htons(m->m_pkthdr.rcvif->if_index); diff --git a/bsd/netinet6/scope6.c b/bsd/netinet6/scope6.c index 70e90dfa9..2d4eedf76 100644 --- a/bsd/netinet6/scope6.c +++ b/bsd/netinet6/scope6.c @@ -1,3 +1,31 @@ +/* + * Copyright (c) 2009-2010 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + /* $FreeBSD: src/sys/netinet6/scope6.c,v 1.3 2002/03/25 10:12:51 ume Exp $ */ /* $KAME: scope6.c,v 1.10 2000/07/24 13:29:31 itojun Exp $ */ @@ -36,6 +64,8 @@ #include #include #include +#include +#include #include #include @@ -47,13 +77,12 @@ extern lck_mtx_t *scope6_mutex; -struct scope6_id { - /* - * 16 is correspondent to 4bit multicast scope field. - * i.e. from node-local to global with some reserved/unassigned types. 
- */ - u_int32_t s6id_list[16]; -}; +#ifdef ENABLE_DEFAULT_SCOPE +int ip6_use_defzone = 1; +#else +int ip6_use_defzone = 0; +#endif + static size_t if_scope_indexlim = 8; struct scope6_id *scope6_ids = NULL; @@ -103,6 +132,7 @@ scope6_ifattach( * XXX: IPV6_ADDR_SCOPE_xxx macros are not standard. * Should we rather hardcode here? */ + SID.s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL] = ifp->if_index; SID.s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL] = ifp->if_index; #if MULTI_SCOPE /* by default, we don't care about scope boundary for these scopes. */ @@ -133,14 +163,14 @@ scope6_set( /* * TODO(XXX): after setting, we should reflect the changes to - * interface addresses, routing table entries, PCB entries... + * interface addresses, routing table entries, PCB entries... */ lck_mtx_lock(scope6_mutex); for (i = 0; i < 16; i++) { if (idlist[i] && idlist[i] != scope6_ids[ifp->if_index].s6id_list[i]) { - if (i == IPV6_ADDR_SCOPE_LINKLOCAL && + if (i == IPV6_ADDR_SCOPE_INTFACELOCAL && idlist[i] > if_index) { /* * XXX: theoretically, there should be no @@ -216,8 +246,8 @@ struct in6_addr *addr; * return scope doesn't work. */ switch (scope) { - case IPV6_ADDR_SCOPE_NODELOCAL: - return IPV6_ADDR_SCOPE_NODELOCAL; + case IPV6_ADDR_SCOPE_INTFACELOCAL: + return IPV6_ADDR_SCOPE_INTFACELOCAL; break; case IPV6_ADDR_SCOPE_LINKLOCAL: return IPV6_ADDR_SCOPE_LINKLOCAL; @@ -231,11 +261,15 @@ struct in6_addr *addr; } } + /* + * Regard loopback and unspecified addresses as global, since + * they have no ambiguity. + */ if (bcmp(&in6addr_loopback, addr, sizeof(*addr) - 1) == 0) { if (addr->s6_addr8[15] == 1) /* loopback */ - return IPV6_ADDR_SCOPE_NODELOCAL; - if (addr->s6_addr8[15] == 0) /* unspecified */ return IPV6_ADDR_SCOPE_LINKLOCAL; + if (addr->s6_addr8[15] == 0) /* unspecified */ + return IPV6_ADDR_SCOPE_GLOBAL; /* XXX: correct? */ } return IPV6_ADDR_SCOPE_GLOBAL; @@ -282,6 +316,106 @@ in6_addr2scopeid( return retid; } +/* + * Validate the specified scope zone ID in the sin6_scope_id field. 
If the ID + * is unspecified (=0), needs to be specified, and the default zone ID can be + * used, the default value will be used. + * This routine then generates the kernel-internal form: if the address scope + * of is interface-local or link-local, embed the interface index in the + * address. + */ +int +sa6_embedscope(struct sockaddr_in6 *sin6, int defaultok) +{ + struct ifnet *ifp; + u_int32_t zoneid; + + if ((zoneid = sin6->sin6_scope_id) == 0 && defaultok) + zoneid = scope6_addr2default(&sin6->sin6_addr); + + if (zoneid != 0 && + (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) || + IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr))) { + /* + * At this moment, we only check interface-local and + * link-local scope IDs, and use interface indices as the + * zone IDs assuming a one-to-one mapping between interfaces + * and links. + */ + if (if_index < zoneid) + return (ENXIO); + ifnet_head_lock_shared(); + ifp = ifindex2ifnet[zoneid]; + if (ifp == NULL) {/* XXX: this can happen for some OS */ + ifnet_head_done(); + return (ENXIO); + } + ifnet_head_done(); + /* XXX assignment to 16bit from 32bit variable */ + sin6->sin6_addr.s6_addr16[1] = htons(zoneid & 0xffff); + + sin6->sin6_scope_id = 0; + } + + return 0; +} + +void +rtkey_to_sa6(struct rtentry *rt, struct sockaddr_in6 *sin6) +{ + VERIFY(rt_key(rt)->sa_family == AF_INET6); + + *sin6 = *((struct sockaddr_in6 *)rt_key(rt)); + sin6->sin6_scope_id = 0; +} + +void +rtgw_to_sa6(struct rtentry *rt, struct sockaddr_in6 *sin6) +{ + VERIFY(rt->rt_flags & RTF_GATEWAY); + + *sin6 = *((struct sockaddr_in6 *)rt->rt_gateway); + sin6->sin6_scope_id = 0; +} + +/* + * generate standard sockaddr_in6 from embedded form. + */ +int +sa6_recoverscope(struct sockaddr_in6 *sin6) +{ + u_int32_t zoneid; + + if (sin6->sin6_scope_id != 0) { + log(LOG_NOTICE, + "sa6_recoverscope: assumption failure (non 0 ID): %s%%%d\n", + ip6_sprintf(&sin6->sin6_addr), sin6->sin6_scope_id); + /* XXX: proceed anyway... 
*/ + } + if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) || + IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr)) { + /* + * KAME assumption: link id == interface id + */ + zoneid = ntohs(sin6->sin6_addr.s6_addr16[1]); + if (zoneid) { + /* sanity check */ + if (if_index < zoneid) + return (ENXIO); + ifnet_head_lock_shared(); + if (ifindex2ifnet[zoneid] == NULL) { + ifnet_head_done(); + return (ENXIO); + } + ifnet_head_done(); + sin6->sin6_addr.s6_addr16[1] = 0; + sin6->sin6_scope_id = zoneid; + } + } + + return 0; +} + void scope6_setdefault( struct ifnet *ifp) /* note that this might be NULL */ @@ -294,11 +428,14 @@ scope6_setdefault( */ lck_mtx_lock(scope6_mutex); if (ifp) { + scope6_ids[0].s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL] = + ifp->if_index; scope6_ids[0].s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL] = ifp->if_index; - } - else + } else { + scope6_ids[0].s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL] = 0; scope6_ids[0].s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL] = 0; + } lck_mtx_unlock(scope6_mutex); } @@ -328,3 +465,93 @@ scope6_addr2default( lck_mtx_unlock(scope6_mutex); return (id); } + +/* + * Determine the appropriate scope zone ID for in6 and ifp. If ret_id is + * non NULL, it is set to the zone ID. If the zone ID needs to be embedded + * in the in6_addr structure, in6 will be modified. + * + * ret_id - unnecessary? + */ +int +in6_setscope(struct in6_addr *in6, struct ifnet *ifp, u_int32_t *ret_id) +{ + int scope; + u_int32_t zoneid = 0; + int index = ifp->if_index; + +#ifdef DIAGNOSTIC + if (scope6_ids == NULL) { /* should not happen */ + panic("in6_setscope: scope array is NULL"); + /* NOTREACHED */ + } +#endif + + /* + * special case: the loopback address can only belong to a loopback + * interface. 
+ */ + if (IN6_IS_ADDR_LOOPBACK(in6)) { + if (!(ifp->if_flags & IFF_LOOPBACK)) { + return (EINVAL); + } else { + if (ret_id != NULL) + *ret_id = 0; /* there's no ambiguity */ + return (0); + } + } + + scope = in6_addrscope(in6); + +#define SID scope6_ids[index] + lck_mtx_lock(scope6_mutex); + switch (scope) { + case IPV6_ADDR_SCOPE_INTFACELOCAL: /* should be interface index */ + zoneid = SID.s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL]; + break; + + case IPV6_ADDR_SCOPE_LINKLOCAL: + zoneid = SID.s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL]; + break; + + case IPV6_ADDR_SCOPE_SITELOCAL: + zoneid = SID.s6id_list[IPV6_ADDR_SCOPE_SITELOCAL]; + break; + + case IPV6_ADDR_SCOPE_ORGLOCAL: + zoneid = SID.s6id_list[IPV6_ADDR_SCOPE_ORGLOCAL]; + break; +#undef SID + default: + zoneid = 0; /* XXX: treat as global. */ + break; + } + lck_mtx_unlock(scope6_mutex); + + if (ret_id != NULL) + *ret_id = zoneid; + + if (IN6_IS_SCOPE_LINKLOCAL(in6) || IN6_IS_ADDR_MC_INTFACELOCAL(in6)) + in6->s6_addr16[1] = htons(zoneid & 0xffff); /* XXX */ + + return (0); +} + +/* + * Just clear the embedded scope identifier. Return 0 if the original address + * is intact; return non 0 if the address is modified. + */ +int +in6_clearscope(struct in6_addr *in6) +{ + int modified = 0; + + if (IN6_IS_SCOPE_LINKLOCAL(in6) || IN6_IS_ADDR_MC_INTFACELOCAL(in6)) { + if (in6->s6_addr16[1] != 0) + modified = 1; + in6->s6_addr16[1] = 0; + } + + return (modified); +} + diff --git a/bsd/netinet6/scope6_var.h b/bsd/netinet6/scope6_var.h index 2b3a9954a..d028aefb8 100644 --- a/bsd/netinet6/scope6_var.h +++ b/bsd/netinet6/scope6_var.h @@ -1,3 +1,31 @@ +/* + * Copyright (c) 2009-2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + /* $FreeBSD: src/sys/netinet6/scope6_var.h,v 1.1.2.1 2000/07/15 07:14:38 kris Exp $ */ /* $KAME: scope6_var.h,v 1.4 2000/05/18 15:03:27 jinmei Exp $ */ @@ -35,13 +63,31 @@ #include #ifdef KERNEL_PRIVATE + +struct scope6_id { + /* + * 16 is correspondent to 4bit multicast scope field. + * i.e. from node-local to global with some reserved/unassigned types. 
+ */ + u_int32_t s6id_list[16]; +}; + +void scope6_init (void); int scope6_ifattach(struct ifnet *); +void scope6_ifdetach (struct scope6_id *); int scope6_set(struct ifnet *, u_int32_t *); int scope6_get(struct ifnet *, u_int32_t *); void scope6_setdefault(struct ifnet *); int scope6_get_default(u_int32_t *); u_int32_t scope6_in6_addrscope(struct in6_addr *); u_int32_t scope6_addr2default(struct in6_addr *); +int sa6_embedscope (struct sockaddr_in6 *, int); +int sa6_recoverscope (struct sockaddr_in6 *); +int in6_setscope (struct in6_addr *, struct ifnet *, u_int32_t *); +int in6_clearscope (struct in6_addr *); +extern void rtkey_to_sa6(struct rtentry *, struct sockaddr_in6 *); +extern void rtgw_to_sa6(struct rtentry *, struct sockaddr_in6 *); + #endif /* KERNEL_PRIVATE */ #endif /* _NETINET6_SCOPE6_VAR_H_ */ diff --git a/bsd/netinet6/tcp6_var.h b/bsd/netinet6/tcp6_var.h index 9d7c44968..5fded19e6 100644 --- a/bsd/netinet6/tcp6_var.h +++ b/bsd/netinet6/tcp6_var.h @@ -1,3 +1,31 @@ +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. @@ -79,8 +107,8 @@ extern int tcp_v6mssdflt; /* XXX */ struct ip6_hdr; void tcp6_ctlinput(int, struct sockaddr *, void *); void tcp6_init(void); -int tcp6_input(struct mbuf **, int *); -struct rtentry *tcp_rtlookup6(struct inpcb *); +int tcp6_input(struct mbuf **, int *, int); +struct rtentry *tcp_rtlookup6(struct inpcb *, unsigned int); extern struct pr_usrreqs tcp6_usrreqs; diff --git a/bsd/netinet6/udp6_output.c b/bsd/netinet6/udp6_output.c index e3d3198f4..0fb9a6993 100644 --- a/bsd/netinet6/udp6_output.c +++ b/bsd/netinet6/udp6_output.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -107,9 +107,12 @@ #include #include +#include + #include #include #include +#include #include #include @@ -177,27 +180,28 @@ udp6_output(in6p, m, addr6, control, p) struct in6_addr *laddr, *faddr; u_short fport; int error = 0; - struct ip6_pktopts opt, *stickyopt = in6p->in6p_outputopts; - int priv; + struct ip6_pktopts opt, *optp = NULL; + struct ip6_moptions *im6o; int af = AF_INET6, hlen = sizeof(struct ip6_hdr); int flags; struct sockaddr_in6 tmp; struct in6_addr storage; -#if PKT_PRIORITY - mbuf_traffic_class_t mtc = MBUF_TC_NONE; -#endif /* PKT_PRIORITY */ + mbuf_traffic_class_t mtc = MBUF_TC_UNSPEC; + struct ip6_out_args ip6oa = { IFSCOPE_NONE, 0 }; + + if (in6p->inp_flags & INP_BOUND_IF) + ip6oa.ip6oa_boundif = in6p->inp_boundif; - priv = (proc_suser(p) == 0); + ip6oa.ip6oa_nocell = (in6p->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; if (control) { -#if PKT_PRIORITY mtc = mbuf_traffic_class_from_control(control); -#endif /* PKT_PRIORITY */ - if ((error = ip6_setpktoptions(control, &opt, priv, 0)) != 0) + if ((error = ip6_setpktopts(control, &opt, NULL, IPPROTO_UDP)) != 0) goto release; - in6p->in6p_outputopts = &opt; - } + optp = &opt; + } else + optp = in6p->in6p_outputopts; if (addr6) { /* @@ -246,16 +250,16 @@ udp6_output(in6p, m, addr6, control, p) } /* KAME hack: embed scopeid */ - if (in6_embedscope(&sin6->sin6_addr, sin6, in6p, NULL) != 0) { + if (in6_embedscope(&sin6->sin6_addr, sin6, in6p, NULL, + optp) != 0) { error = EINVAL; goto release; } if (!IN6_IS_ADDR_V4MAPPED(faddr)) { - laddr = in6_selectsrc(sin6, in6p->in6p_outputopts, - in6p->in6p_moptions, - &in6p->in6p_route, - &in6p->in6p_laddr, &storage, &error); + laddr = in6_selectsrc(sin6, optp, + in6p, &in6p->in6p_route, NULL, &storage, + ip6oa.ip6oa_boundif, &error); } else laddr = &in6p->in6p_laddr; /* XXX */ if (laddr == NULL) { @@ -333,12 +337,12 @@ udp6_output(in6p, m, addr6, control, p) ip6->ip6_src = *laddr; ip6->ip6_dst = *faddr; - if 
((udp6->uh_sum = in6_cksum(m, IPPROTO_UDP, - sizeof(struct ip6_hdr), plen)) == 0) { - udp6->uh_sum = 0xffff; - } + udp6->uh_sum = in6_cksum_phdr(laddr, faddr, + htonl(plen), htonl(IPPROTO_UDP)); + m->m_pkthdr.csum_flags = CSUM_UDPIPV6; + m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); - flags = 0; + flags = IPV6_OUTARGS; udp6stat.udp6s_opackets++; #ifdef IPSEC @@ -348,26 +352,50 @@ udp6_output(in6p, m, addr6, control, p) } #endif /*IPSEC*/ m->m_pkthdr.socket_id = get_socket_id(in6p->in6p_socket); + + set_packet_tclass(m, in6p->in6p_socket, mtc, 1); + + im6o = in6p->in6p_moptions; + if (im6o != NULL) + IM6O_ADDREF(im6o); + + error = ip6_output(m, optp, &in6p->in6p_route, + flags, im6o, NULL, &ip6oa); + + if (im6o != NULL) + IM6O_REMREF(im6o); -#if PKT_PRIORITY - set_traffic_class(m, in6p->in6p_socket, mtc); -#endif /* PKT_PRIORITY */ - error = ip6_output(m, in6p->in6p_outputopts, &in6p->in6p_route, - flags, in6p->in6p_moptions, NULL, 0); + if (error == 0 && nstat_collect) { + locked_add_64(&in6p->inp_stat->txpackets, 1); + locked_add_64(&in6p->inp_stat->txbytes, ulen); + } -#if IFNET_ROUTE_REFCNT - /* - * Always discard the cached route for unconnected socket - * or if it is a multicast route. - */ - if (in6p->in6p_route.ro_rt != NULL && - ((in6p->in6p_route.ro_rt->rt_flags & RTF_MULTICAST) || - in6p->in6p_socket == NULL || - in6p->in6p_socket->so_state != SS_ISCONNECTED)) { - rtfree(in6p->in6p_route.ro_rt); - in6p->in6p_route.ro_rt = NULL; + if (in6p->in6p_route.ro_rt != NULL) { + struct rtentry *rt = in6p->in6p_route.ro_rt; + unsigned int outif; + + if ((rt->rt_flags & RTF_MULTICAST) || + in6p->in6p_socket == NULL || + !(in6p->in6p_socket->so_state & SS_ISCONNECTED)) { + rt = NULL; /* unusable */ + } + /* + * Always discard the cached route for unconnected + * socket or if it is a multicast route. 
+ */ + if (rt == NULL) { + rtfree(in6p->in6p_route.ro_rt); + in6p->in6p_route.ro_rt = NULL; + } + /* + * If this is a connected socket and the destination + * route is not multicast, update outif with that of + * the route interface index used by IP. + */ + if (rt != NULL && (outif = rt->rt_ifp->if_index) != + in6p->in6p_last_outif) + in6p->in6p_last_outif = outif; } -#endif /* IFNET_ROUTE_REFCNT */ break; case AF_INET: error = EAFNOSUPPORT; @@ -380,8 +408,8 @@ udp6_output(in6p, m, addr6, control, p) releaseopt: if (control) { - ip6_clearpktopts(in6p->in6p_outputopts, 0, -1); - in6p->in6p_outputopts = stickyopt; + if (optp == &opt) + ip6_clearpktopts(optp, -1); m_freem(control); } return(error); diff --git a/bsd/netinet6/udp6_usrreq.c b/bsd/netinet6/udp6_usrreq.c index fed294b90..c88c0d169 100644 --- a/bsd/netinet6/udp6_usrreq.c +++ b/bsd/netinet6/udp6_usrreq.c @@ -1,3 +1,31 @@ +/* + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + /* $FreeBSD: src/sys/netinet6/udp6_usrreq.c,v 1.6.2.6 2001/07/29 19:32:40 ume Exp $ */ /* $KAME: udp6_usrreq.c,v 1.27 2001/05/21 05:45:10 jinmei Exp $ */ @@ -83,6 +111,7 @@ #include #include #include +#include #include #include @@ -104,7 +133,6 @@ #include extern int ipsec_bypass; #endif /*IPSEC*/ -extern lck_mtx_t *nd6_mutex; /* * UDP protocol inplementation. @@ -112,7 +140,6 @@ extern lck_mtx_t *nd6_mutex; */ extern struct protosw inetsw[]; -static int in6_mcmatch(struct inpcb *, struct in6_addr *, struct ifnet *); static int udp6_detach(struct socket *so); static void udp6_append(struct inpcb *, struct ip6_hdr *, struct sockaddr_in6 *, struct mbuf *, int); @@ -131,53 +158,37 @@ extern int fw_verbose; #define log_in_vain_log( a ) { log a; } #endif -static int -in6_mcmatch( - struct inpcb *in6p, - register struct in6_addr *ia6, - struct ifnet *ifp) -{ - struct ip6_moptions *im6o = in6p->in6p_moptions; - struct in6_multi_mship *imm; - - if (im6o == NULL) - return 0; - - lck_mtx_lock(nd6_mutex); - for (imm = im6o->im6o_memberships.lh_first; imm != NULL; - imm = imm->i6mm_chain.le_next) { - if ((ifp == NULL || - imm->i6mm_maddr->in6m_ifp == ifp) && - IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr, - ia6)) { - lck_mtx_unlock(nd6_mutex); - return 1; - } - } - lck_mtx_unlock(nd6_mutex); - return 0; -} - /* * subroutine of udp6_input(), mainly for source code readability. 
*/ static void -udp6_append(struct inpcb *last, struct ip6_hdr *ip6, +udp6_append(struct inpcb *last, __unused struct ip6_hdr *ip6, struct sockaddr_in6 *udp_in6, struct mbuf *n, int off) { struct mbuf *opts = NULL; - + int ret = 0; #if CONFIG_MACF_NET if (mac_inpcb_check_deliver(last, n, AF_INET6, SOCK_DGRAM) != 0) { m_freem(n); return; } #endif - if (last->in6p_flags & IN6P_CONTROLOPTS || - last->in6p_socket->so_options & SO_TIMESTAMP) - ip6_savecontrol(last, &opts, ip6, n); - + if ((last->in6p_flags & IN6P_CONTROLOPTS) != 0 || + (last->in6p_socket->so_options & SO_TIMESTAMP) != 0 || + (last->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + ret = ip6_savecontrol(last, n, &opts); + if (ret != 0) { + m_freem(n); + m_freem(opts); + return; + } + } m_adj(n, off); + if (nstat_collect) { + locked_add_64(&last->inp_stat->rxpackets, 1); + locked_add_64(&last->inp_stat->rxbytes, n->m_pkthdr.len); + } + so_recv_data_stat(last->in6p_socket, n, 0); if (sbappendaddr(&last->in6p_socket->so_rcv, (struct sockaddr *)udp_in6, n, opts, NULL) == 0) udpstat.udps_fullsock++; @@ -188,20 +199,25 @@ udp6_append(struct inpcb *last, struct ip6_hdr *ip6, int udp6_input( struct mbuf **mp, - int *offp) + int *offp, + int proto) { +#pragma unused(proto) struct mbuf *m = *mp; + struct ifnet *ifp; register struct ip6_hdr *ip6; register struct udphdr *uh; register struct inpcb *in6p; struct mbuf *opts = NULL; int off = *offp; - int plen, ulen; + int plen, ulen, ret = 0; struct sockaddr_in6 udp_in6; struct inpcbinfo *pcbinfo = &udbinfo; + struct sockaddr_in6 fromsa; IP6_EXTHDR_CHECK(m, off, sizeof(struct udphdr), return IPPROTO_DONE); + ifp = m->m_pkthdr.rcvif; ip6 = mtod(m, struct ip6_hdr *); #if defined(NFAITH) && 0 < NFAITH @@ -223,20 +239,40 @@ udp6_input( goto bad; } + /* destination port of 0 is illegal, based on RFC768. */ + if (uh->uh_dport == 0) + goto bad; + /* * Checksum extended UDP header and data. 
*/ + if (uh->uh_sum) { + if ((apple_hwcksum_rx != 0) && (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) { + uh->uh_sum = m->m_pkthdr.csum_data; + uh->uh_sum ^= 0xffff; + } + else { + if (in6_cksum(m, IPPROTO_UDP, off, ulen) != 0) { + udpstat.udps_badsum++; + goto bad; + } + } + } #ifndef __APPLE__ - if (uh->uh_sum == 0) + else udpstat.udps_nosum++; #endif - else if (in6_cksum(m, IPPROTO_UDP, off, ulen) != 0) { - udpstat.udps_badsum++; - goto bad; - } + + /* + * Construct sockaddr format source address. + */ + init_sin6(&fromsa, m); + fromsa.sin6_port = uh->uh_sport; + if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { int reuse_sock = 0, mcast_delivered = 0; + struct ip6_moptions *imo; struct mbuf *n = NULL; /* @@ -299,11 +335,27 @@ udp6_input( udp_unlock(in6p->in6p_socket, 1, 0); continue; } - if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr)) { - if (!IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, - &ip6->ip6_dst) && - !in6_mcmatch(in6p, &ip6->ip6_dst, - m->m_pkthdr.rcvif)) { + + /* + * Handle socket delivery policy for any-source + * and source-specific multicast. 
[RFC3678] + */ + imo = in6p->in6p_moptions; + if (imo && IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { + struct sockaddr_in6 mcaddr; + int blocked; + + IM6O_LOCK(imo); + bzero(&mcaddr, sizeof(struct sockaddr_in6)); + mcaddr.sin6_len = sizeof(struct sockaddr_in6); + mcaddr.sin6_family = AF_INET6; + mcaddr.sin6_addr = ip6->ip6_dst; + + blocked = im6o_mc_filter(imo, ifp, + (struct sockaddr *)&mcaddr, + (struct sockaddr *)&fromsa); + IM6O_UNLOCK(imo); + if (blocked != MCAST_PASS) { udp_unlock(in6p->in6p_socket, 1, 0); continue; } @@ -444,10 +496,21 @@ udp6_input( init_sin6(&udp_in6, m); /* general init */ udp_in6.sin6_port = uh->uh_sport; - if (in6p->in6p_flags & IN6P_CONTROLOPTS - || in6p->in6p_socket->so_options & SO_TIMESTAMP) - ip6_savecontrol(in6p, &opts, ip6, m); + if ((in6p->in6p_flags & IN6P_CONTROLOPTS) != 0 || + (in6p->in6p_socket->so_options & SO_TIMESTAMP) != 0 || + (in6p->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + ret = ip6_savecontrol(in6p, m, &opts); + if (ret != 0) { + udp_unlock(in6p->in6p_socket, 1, 0); + goto bad; + } + } m_adj(m, off + sizeof(struct udphdr)); + if (nstat_collect) { + locked_add_64(&in6p->inp_stat->rxpackets, 1); + locked_add_64(&in6p->inp_stat->rxbytes, m->m_pkthdr.len); + } + so_recv_data_stat(in6p->in6p_socket, m, 0); if (sbappendaddr(&in6p->in6p_socket->so_rcv, (struct sockaddr *)&udp_in6, m, opts, NULL) == 0) { @@ -527,10 +590,10 @@ udp6_ctlinput( (void) in6_pcbnotify(&udbinfo, sa, uh.uh_dport, (struct sockaddr*)ip6cp->ip6c_src, - uh.uh_sport, cmd, notify); + uh.uh_sport, cmd, NULL, notify); } else (void) in6_pcbnotify(&udbinfo, sa, 0, (struct sockaddr *)&sa6_src, - 0, cmd, notify); + 0, cmd, NULL, notify); } #ifndef __APPLE__ @@ -561,6 +624,12 @@ udp6_getcred SYSCTL_HANDLER_ARGS error = ENOENT; goto out; } + /* + * XXX This should not be copying out a credential!!!! 
This + * XXX is an opaque type, and is not intended to be introspected, + * XXX and the size of this structure *WILL* change as planned MACF + * XXX and kauth changes go forward. + */ error = SYSCTL_OUT(req, inp->inp_socket->so_cred->pc_ucred, sizeof(*(kauth_cred_t)0)); @@ -619,6 +688,7 @@ udp6_attach(struct socket *so, __unused int proto, struct proc *p) * which may match an IPv4-mapped IPv6 address. */ inp->inp_ip_ttl = ip_defttl; + nstat_udp_new_pcb(inp); return 0; } @@ -676,7 +746,7 @@ udp6_connect(struct socket *so, struct sockaddr *nam, struct proc *p) if (inp->inp_faddr.s_addr != INADDR_ANY) return EISCONN; in6_sin6_2_sin(&sin, sin6_p); - error = in_pcbconnect(inp, (struct sockaddr *)&sin, p); + error = in_pcbconnect(inp, (struct sockaddr *)&sin, p, NULL); if (error == 0) { inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; @@ -732,6 +802,7 @@ udp6_disconnect(struct socket *so) in6_pcbdisconnect(inp); inp->in6p_laddr = in6addr_any; + inp->in6p_last_outif = 0; so->so_state &= ~SS_ISCONNECTED; /* XXX */ return 0; } diff --git a/bsd/netinet6/udp6_var.h b/bsd/netinet6/udp6_var.h index 18274d10f..bd6916e4e 100644 --- a/bsd/netinet6/udp6_var.h +++ b/bsd/netinet6/udp6_var.h @@ -72,7 +72,7 @@ SYSCTL_DECL(_net_inet6_udp6); extern struct pr_usrreqs udp6_usrreqs; void udp6_ctlinput(int, struct sockaddr *, void *); -int udp6_input(struct mbuf **, int *); +int udp6_input(struct mbuf **, int *, int); int udp6_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct proc *p); diff --git a/bsd/netkey/Makefile b/bsd/netkey/Makefile index def3c0629..1a68c8a44 100644 --- a/bsd/netkey/Makefile +++ b/bsd/netkey/Makefile @@ -9,14 +9,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ DATAFILES = \ diff --git a/bsd/netkey/key.c b/bsd/netkey/key.c index 73a605869..457f772ec 100644 --- a/bsd/netkey/key.c +++ 
b/bsd/netkey/key.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -78,6 +78,7 @@ #include #include #include +#include #include @@ -152,9 +153,6 @@ lck_grp_attr_t *pfkey_stat_mutex_grp_attr; lck_attr_t *pfkey_stat_mutex_attr; lck_mtx_t *pfkey_stat_mutex; - -extern lck_mtx_t *nd6_mutex; - /* * Note on SA reference counting: * - SAs that are not in DEAD state will have (total external reference + 1) @@ -270,61 +268,61 @@ static int ipsec_esp_auth = 0; static int ipsec_ah_keymin = 128; SYSCTL_DECL(_net_key); - -SYSCTL_INT(_net_key, KEYCTL_DEBUG_LEVEL, debug, CTLFLAG_RW, \ +/* Thread safe: no accumulated state */ +SYSCTL_INT(_net_key, KEYCTL_DEBUG_LEVEL, debug, CTLFLAG_RW | CTLFLAG_LOCKED, \ &key_debug_level, 0, ""); /* max count of trial for the decision of spi value */ -SYSCTL_INT(_net_key, KEYCTL_SPI_TRY, spi_trycnt, CTLFLAG_RW, \ +SYSCTL_INT(_net_key, KEYCTL_SPI_TRY, spi_trycnt, CTLFLAG_RW | CTLFLAG_LOCKED, \ &key_spi_trycnt, 0, ""); /* minimum spi value to allocate automatically. */ -SYSCTL_INT(_net_key, KEYCTL_SPI_MIN_VALUE, spi_minval, CTLFLAG_RW, \ +SYSCTL_INT(_net_key, KEYCTL_SPI_MIN_VALUE, spi_minval, CTLFLAG_RW | CTLFLAG_LOCKED, \ &key_spi_minval, 0, ""); /* maximun spi value to allocate automatically. 
*/ -SYSCTL_INT(_net_key, KEYCTL_SPI_MAX_VALUE, spi_maxval, CTLFLAG_RW, \ +SYSCTL_INT(_net_key, KEYCTL_SPI_MAX_VALUE, spi_maxval, CTLFLAG_RW | CTLFLAG_LOCKED, \ &key_spi_maxval, 0, ""); /* interval to initialize randseed */ -SYSCTL_INT(_net_key, KEYCTL_RANDOM_INT, int_random, CTLFLAG_RW, \ +SYSCTL_INT(_net_key, KEYCTL_RANDOM_INT, int_random, CTLFLAG_RW | CTLFLAG_LOCKED, \ &key_int_random, 0, ""); -/* lifetime for larval SA */ -SYSCTL_INT(_net_key, KEYCTL_LARVAL_LIFETIME, larval_lifetime, CTLFLAG_RW, \ +/* lifetime for larval SA; thread safe due to > compare */ +SYSCTL_INT(_net_key, KEYCTL_LARVAL_LIFETIME, larval_lifetime, CTLFLAG_RW | CTLFLAG_LOCKED, \ &key_larval_lifetime, 0, ""); /* counter for blocking to send SADB_ACQUIRE to IKEd */ -SYSCTL_INT(_net_key, KEYCTL_BLOCKACQ_COUNT, blockacq_count, CTLFLAG_RW, \ +SYSCTL_INT(_net_key, KEYCTL_BLOCKACQ_COUNT, blockacq_count, CTLFLAG_RW | CTLFLAG_LOCKED, \ &key_blockacq_count, 0, ""); -/* lifetime for blocking to send SADB_ACQUIRE to IKEd */ -SYSCTL_INT(_net_key, KEYCTL_BLOCKACQ_LIFETIME, blockacq_lifetime, CTLFLAG_RW, \ +/* lifetime for blocking to send SADB_ACQUIRE to IKEd: Thread safe, > compare */ +SYSCTL_INT(_net_key, KEYCTL_BLOCKACQ_LIFETIME, blockacq_lifetime, CTLFLAG_RW | CTLFLAG_LOCKED, \ &key_blockacq_lifetime, 0, ""); /* ESP auth */ -SYSCTL_INT(_net_key, KEYCTL_ESP_AUTH, esp_auth, CTLFLAG_RW, \ +SYSCTL_INT(_net_key, KEYCTL_ESP_AUTH, esp_auth, CTLFLAG_RW | CTLFLAG_LOCKED, \ &ipsec_esp_auth, 0, ""); /* minimum ESP key length */ -SYSCTL_INT(_net_key, KEYCTL_ESP_KEYMIN, esp_keymin, CTLFLAG_RW, \ +SYSCTL_INT(_net_key, KEYCTL_ESP_KEYMIN, esp_keymin, CTLFLAG_RW | CTLFLAG_LOCKED, \ &ipsec_esp_keymin, 0, ""); /* minimum AH key length */ -SYSCTL_INT(_net_key, KEYCTL_AH_KEYMIN, ah_keymin, CTLFLAG_RW, \ +SYSCTL_INT(_net_key, KEYCTL_AH_KEYMIN, ah_keymin, CTLFLAG_RW | CTLFLAG_LOCKED, \ &ipsec_ah_keymin, 0, ""); /* perfered old SA rather than new SA */ -SYSCTL_INT(_net_key, KEYCTL_PREFERED_OLDSA, prefered_oldsa, CTLFLAG_RW,\ 
+SYSCTL_INT(_net_key, KEYCTL_PREFERED_OLDSA, prefered_oldsa, CTLFLAG_RW | CTLFLAG_LOCKED,\ &key_preferred_oldsa, 0, ""); /* time between NATT keepalives in seconds, 0 disabled */ -SYSCTL_INT(_net_key, KEYCTL_NATT_KEEPALIVE_INTERVAL, natt_keepalive_interval, CTLFLAG_RW,\ +SYSCTL_INT(_net_key, KEYCTL_NATT_KEEPALIVE_INTERVAL, natt_keepalive_interval, CTLFLAG_RW | CTLFLAG_LOCKED,\ &natt_keepalive_interval, 0, ""); /* PF_KEY statistics */ -SYSCTL_STRUCT(_net_key, KEYCTL_PFKEYSTAT, pfkeystat, CTLFLAG_RD,\ +SYSCTL_STRUCT(_net_key, KEYCTL_PFKEYSTAT, pfkeystat, CTLFLAG_RD | CTLFLAG_LOCKED,\ &pfkeystat, pfkeystat, ""); #ifndef LIST_FOREACH @@ -566,7 +564,7 @@ void key_init(void); /* * PF_KEY init - * setup locks and call raw_init() + * setup locks, call raw_init(), and then init timer and associated data * */ void @@ -597,7 +595,46 @@ key_init(void) LIST_INIT(&spihash[i]); raw_init(); + + bzero((caddr_t)&key_cb, sizeof(key_cb)); + for (i = 0; i < IPSEC_DIR_MAX; i++) { + LIST_INIT(&sptree[i]); + } + ipsec_policy_count = 0; + + LIST_INIT(&sahtree); + + for (i = 0; i <= SADB_SATYPE_MAX; i++) { + LIST_INIT(&regtree[i]); + } + ipsec_sav_count = 0; + +#ifndef IPSEC_NONBLOCK_ACQUIRE + LIST_INIT(&acqtree); +#endif + LIST_INIT(&spacqtree); + + /* system default */ +#if INET + ip4_def_policy.policy = IPSEC_POLICY_NONE; + ip4_def_policy.refcnt++; /*never reclaim this*/ +#endif +#if INET6 + ip6_def_policy.policy = IPSEC_POLICY_NONE; + ip6_def_policy.refcnt++; /*never reclaim this*/ +#endif + +#ifndef IPSEC_DEBUG2 + timeout((void *)key_timehandler, (void *)0, hz); +#endif /*IPSEC_DEBUG2*/ + + /* initialize key statistics */ + keystat.getspi_count = 1; + +#ifndef __APPLE__ + printf("IPsec: Initialized Security Association Processing.\n"); +#endif } @@ -609,9 +646,9 @@ key_init(void) * others: found and return the pointer.
*/ struct secpolicy * -key_allocsp(spidx, dir) - struct secpolicyindex *spidx; - u_int dir; +key_allocsp( + struct secpolicyindex *spidx, + u_int dir) { struct secpolicy *sp; struct timeval tv; @@ -670,8 +707,11 @@ key_allocsp(spidx, dir) * XXX slow */ struct secpolicy * -key_gettunnel(osrc, odst, isrc, idst) - struct sockaddr *osrc, *odst, *isrc, *idst; +key_gettunnel( + struct sockaddr *osrc, + struct sockaddr *odst, + struct sockaddr *isrc, + struct sockaddr *idst) { struct secpolicy *sp; const int dir = IPSEC_DIR_INBOUND; @@ -744,10 +784,10 @@ key_gettunnel(osrc, odst, isrc, idst) * ENOENT: policy may be valid, but SA with REQUIRE is on acquiring. */ int -key_checkrequest(isr, saidx, sav) - struct ipsecrequest *isr; - struct secasindex *saidx; - struct secasvar **sav; +key_checkrequest( + struct ipsecrequest *isr, + struct secasindex *saidx, + struct secasvar **sav) { u_int level; int error; @@ -814,8 +854,8 @@ key_checkrequest(isr, saidx, sav) u_int32_t sah_search_calls = 0; u_int32_t sah_search_count = 0; struct secasvar * -key_allocsa_policy(saidx) - struct secasindex *saidx; +key_allocsa_policy( + struct secasindex *saidx) { struct secashead *sah; struct secasvar *sav; @@ -879,10 +919,10 @@ key_allocsa_policy(saidx) * others : found, pointer to a SA. */ static struct secasvar * -key_do_allocsa_policy(sah, state, dstport) - struct secashead *sah; - u_int state; - u_int16_t dstport; +key_do_allocsa_policy( + struct secashead *sah, + u_int state, + u_int16_t dstport) { struct secasvar *sav, *nextsav, *candidate, *natt_candidate, *no_natt_candidate, *d; @@ -1060,10 +1100,12 @@ key_do_allocsa_policy(sah, state, dstport) * keep source address in IPsec SA. We see a tricky situation here. 
*/ struct secasvar * -key_allocsa(family, src, dst, proto, spi) - u_int family, proto; - caddr_t src, dst; - u_int32_t spi; +key_allocsa( + u_int family, + caddr_t src, + caddr_t dst, + u_int proto, + u_int32_t spi) { struct secasvar *sav, *match; u_int stateidx, state, tmpidx, matchidx; @@ -1214,8 +1256,8 @@ key_allocsa(family, src, dst, proto, spi) } u_int16_t -key_natt_get_translated_port(outsav) - struct secasvar *outsav; +key_natt_get_translated_port( + struct secasvar *outsav) { struct secasindex saidx; @@ -1271,10 +1313,10 @@ key_natt_get_translated_port(outsav) } static int -key_do_get_translated_port(sah, outsav, state) - struct secashead *sah; - struct secasvar *outsav; - u_int state; +key_do_get_translated_port( + struct secashead *sah, + struct secasvar *outsav, + u_int state) { struct secasvar *currsav, *nextsav, *candidate; @@ -1338,9 +1380,9 @@ key_do_get_translated_port(sah, outsav, state) * For both the packet without socket and key_freeso(). */ void -key_freesp(sp, locked) - struct secpolicy *sp; - int locked; +key_freesp( + struct secpolicy *sp, + int locked) { /* sanity check */ @@ -1371,8 +1413,8 @@ static void key_freesp_so(struct secpolicy **); * For the packet with socket. */ void -key_freeso(so) - struct socket *so; +key_freeso( + struct socket *so) { /* sanity check */ @@ -1429,8 +1471,8 @@ key_freeso(so) } static void -key_freesp_so(sp) - struct secpolicy **sp; +key_freesp_so( + struct secpolicy **sp) { /* sanity check */ @@ -1464,9 +1506,9 @@ key_freesp_so(sp) * for a policy. */ void -key_freesav(sav, locked) - struct secasvar *sav; - int locked; +key_freesav( + struct secasvar *sav, + int locked) { /* sanity check */ @@ -1494,8 +1536,8 @@ key_freesav(sav, locked) * free security policy entry. */ static void -key_delsp(sp) - struct secpolicy *sp; +key_delsp( + struct secpolicy *sp) { /* sanity check */ @@ -1534,8 +1576,8 @@ key_delsp(sp) * others : found, pointer to a SP. 
*/ static struct secpolicy * -key_getsp(spidx) - struct secpolicyindex *spidx; +key_getsp( + struct secpolicyindex *spidx) { struct secpolicy *sp; @@ -1563,8 +1605,8 @@ key_getsp(spidx) * others : found, pointer to a SP. */ static struct secpolicy * -key_getspbyid(id) - u_int32_t id; +key_getspbyid( + u_int32_t id) { struct secpolicy *sp; @@ -1592,7 +1634,7 @@ key_getspbyid(id) } struct secpolicy * -key_newsp() +key_newsp(void) { struct secpolicy *newsp = NULL; @@ -1613,10 +1655,10 @@ key_newsp() * so must be set properly later. */ struct secpolicy * -key_msg2sp(xpl0, len, error) - struct sadb_x_policy *xpl0; - size_t len; - int *error; +key_msg2sp( + struct sadb_x_policy *xpl0, + size_t len, + int *error) { struct secpolicy *newsp; @@ -1835,7 +1877,7 @@ key_msg2sp(xpl0, len, error) } static u_int32_t -key_newreqid() +key_newreqid(void) { lck_mtx_lock(sadb_mutex); static u_int32_t auto_reqid = IPSEC_MANUAL_REQID_MAX + 1; @@ -1853,8 +1895,8 @@ key_newreqid() * copy secpolicy struct to sadb_x_policy structure indicated. */ struct mbuf * -key_sp2msg(sp) - struct secpolicy *sp; +key_sp2msg( + struct secpolicy *sp) { struct sadb_x_policy *xpl; int tlen; @@ -2006,10 +2048,10 @@ key_gather_mbuf(struct mbuf *m, const struct sadb_msghdr *mhp, * m will always be freed. */ static int -key_spdadd(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_spdadd( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { struct sadb_address *src0, *dst0; struct sadb_x_policy *xpl0, *xpl; @@ -2266,7 +2308,7 @@ key_spdadd(so, m, mhp) * others: success. */ static u_int32_t -key_getnewspid() +key_getnewspid(void) { u_int32_t newid = 0; int count = key_spi_trycnt; /* XXX */ @@ -2304,10 +2346,10 @@ key_getnewspid() * m will always be freed. 
*/ static int -key_spddelete(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_spddelete( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { struct sadb_address *src0, *dst0; struct sadb_x_policy *xpl0; @@ -2406,10 +2448,10 @@ key_spddelete(so, m, mhp) * m will always be freed. */ static int -key_spddelete2(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_spddelete2( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { u_int32_t id; struct secpolicy *sp; @@ -2507,10 +2549,10 @@ key_spddelete2(so, m, mhp) * m will always be freed. */ static int -key_spdget(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_spdget( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { u_int32_t id; struct secpolicy *sp; @@ -2562,8 +2604,8 @@ key_spdget(so, m, mhp) * others: error number */ int -key_spdacquire(sp) - struct secpolicy *sp; +key_spdacquire( + struct secpolicy *sp) { struct mbuf *result = NULL, *m; struct secspacq *newspacq; @@ -2637,10 +2679,10 @@ key_spdacquire(sp) * m will always be freed. 
*/ static int -key_spdflush(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_spdflush( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { struct sadb_msg *newmsg; struct secpolicy *sp; @@ -2690,10 +2732,10 @@ key_spdflush(so, m, mhp) */ static int -key_spddump(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_spddump( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { struct secpolicy *sp, **spbuf = NULL, **sp_ptr; int cnt = 0, bufcount; @@ -2762,10 +2804,11 @@ key_spddump(so, m, mhp) } static struct mbuf * -key_setdumpsp(sp, type, seq, pid) - struct secpolicy *sp; - u_int8_t type; - u_int32_t seq, pid; +key_setdumpsp( + struct secpolicy *sp, + u_int8_t type, + u_int32_t seq, + u_int32_t pid) { struct mbuf *result = NULL, *m; @@ -2820,8 +2863,8 @@ key_setdumpsp(sp, type, seq, pid) * get PFKEY message length for security policy and request. */ static u_int -key_getspreqmsglen(sp) - struct secpolicy *sp; +key_getspreqmsglen( + struct secpolicy *sp) { u_int tlen; @@ -2858,12 +2901,12 @@ key_getspreqmsglen(sp) * others : error number */ static int -key_spdexpire(sp) - struct secpolicy *sp; +key_spdexpire( + struct secpolicy *sp) { struct mbuf *result = NULL, *m; int len; - int error = -1; + int error = EINVAL; struct sadb_lifetime *lt; lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_NOTOWNED); @@ -2969,9 +3012,9 @@ key_spdexpire(sp) * others : pointer to new SA head. */ static struct secashead * -key_newsah(saidx, dir) - struct secasindex *saidx; - u_int8_t dir; +key_newsah( + struct secasindex *saidx, + u_int8_t dir) { struct secashead *newsah; @@ -3019,8 +3062,8 @@ key_newsah(saidx, dir) * delete SA index and all SA registerd. */ static void -key_delsah(sah) - struct secashead *sah; +key_delsah( + struct secashead *sah) { struct secasvar *sav, *nextsav; u_int stateidx, state; @@ -3092,11 +3135,11 @@ key_delsah(sah) * does not modify mbuf. 
does not free mbuf on error. */ static struct secasvar * -key_newsav(m, mhp, sah, errp) - struct mbuf *m; - const struct sadb_msghdr *mhp; - struct secashead *sah; - int *errp; +key_newsav( + struct mbuf *m, + const struct sadb_msghdr *mhp, + struct secashead *sah, + int *errp) { struct secasvar *newsav; const struct sadb_sa *xsa; @@ -3187,8 +3230,8 @@ key_newsav(m, mhp, sah, errp) * free() SA variable entry. */ static void -key_delsav(sav) - struct secasvar *sav; +key_delsav( + struct secasvar *sav) { lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); @@ -3256,8 +3299,8 @@ key_delsav(sav) * others : found, pointer to a SA. */ static struct secashead * -key_getsah(saidx) - struct secasindex *saidx; +key_getsah( + struct secasindex *saidx) { struct secashead *sah; @@ -3281,9 +3324,9 @@ key_getsah(saidx) * others : found, pointer to a SA. */ static struct secasvar * -key_checkspidup(saidx, spi) - struct secasindex *saidx; - u_int32_t spi; +key_checkspidup( + struct secasindex *saidx, + u_int32_t spi) { struct secasvar *sav; u_int stateidx, state; @@ -3314,9 +3357,9 @@ key_checkspidup(saidx, spi) } static void -key_setspi(sav, spi) - struct secasvar *sav; - u_int32_t spi; +key_setspi( + struct secasvar *sav, + u_int32_t spi) { lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); sav->spi = spi; @@ -3333,9 +3376,9 @@ key_setspi(sav, spi) * others : found, pointer to a SA. */ static struct secasvar * -key_getsavbyspi(sah, spi) - struct secashead *sah; - u_int32_t spi; +key_getsavbyspi( + struct secashead *sah, + u_int32_t spi) { struct secasvar *sav, *match; u_int stateidx, state, matchidx; @@ -3370,10 +3413,10 @@ key_getsavbyspi(sah, spi) * does not modify mbuf. does not free mbuf on error. 
*/ static int -key_setsaval(sav, m, mhp) - struct secasvar *sav; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_setsaval( + struct secasvar *sav, + struct mbuf *m, + const struct sadb_msghdr *mhp) { #if IPSEC_ESP const struct esp_algorithm *algo; @@ -3688,8 +3731,8 @@ key_setsaval(sav, m, mhp) * other: errno */ static int -key_mature(sav) - struct secasvar *sav; +key_mature( + struct secasvar *sav) { int mature; int checkmask = 0; /* 2^0: ealg 2^1: aalg 2^2: calg */ @@ -3867,10 +3910,12 @@ key_mature(sav) * subroutine for SADB_GET and SADB_DUMP. */ static struct mbuf * -key_setdumpsa(sav, type, satype, seq, pid) - struct secasvar *sav; - u_int8_t type, satype; - u_int32_t seq, pid; +key_setdumpsa( + struct secasvar *sav, + u_int8_t type, + u_int8_t satype, + u_int32_t seq, + u_int32_t pid) { struct mbuf *result = NULL, *tres = NULL, *m; int l = 0; @@ -4016,12 +4061,13 @@ key_setdumpsa(sav, type, satype, seq, pid) * set data into sadb_msg. */ static struct mbuf * -key_setsadbmsg(type, tlen, satype, seq, pid, reserved) - u_int8_t type, satype; - u_int16_t tlen; - u_int32_t seq; - pid_t pid; - u_int16_t reserved; +key_setsadbmsg( + u_int8_t type, + u_int16_t tlen, + u_int8_t satype, + u_int32_t seq, + pid_t pid, + u_int16_t reserved) { struct mbuf *m; struct sadb_msg *p; @@ -4062,8 +4108,8 @@ key_setsadbmsg(type, tlen, satype, seq, pid, reserved) * copy secasvar data into sadb_address. */ static struct mbuf * -key_setsadbsa(sav) - struct secasvar *sav; +key_setsadbsa( + struct secasvar *sav) { struct mbuf *m; struct sadb_sa *p; @@ -4096,11 +4142,11 @@ key_setsadbsa(sav) * set data into sadb_address. 
*/ static struct mbuf * -key_setsadbaddr(exttype, saddr, prefixlen, ul_proto) - u_int16_t exttype; - struct sockaddr *saddr; - u_int8_t prefixlen; - u_int16_t ul_proto; +key_setsadbaddr( + u_int16_t exttype, + struct sockaddr *saddr, + u_int8_t prefixlen, + u_int16_t ul_proto) { struct mbuf *m; struct sadb_address *p; @@ -4218,11 +4264,12 @@ key_setsadbsastat (u_int32_t dir, * set data into sadb_ident. */ static struct mbuf * -key_setsadbident(exttype, idtype, string, stringlen, id) - u_int16_t exttype, idtype; - caddr_t string; - int stringlen; - u_int64_t id; +key_setsadbident( + u_int16_t exttype, + u_int16_t idtype, + caddr_t string, + int stringlen, + u_int64_t id) { struct mbuf *m; struct sadb_ident *p; @@ -4257,9 +4304,10 @@ key_setsadbident(exttype, idtype, string, stringlen, id) * set data into sadb_x_sa2. */ static struct mbuf * -key_setsadbxsa2(mode, seq, reqid) - u_int8_t mode; - u_int32_t seq, reqid; +key_setsadbxsa2( + u_int8_t mode, + u_int32_t seq, + u_int32_t reqid) { struct mbuf *m; struct sadb_x_sa2 *p; @@ -4291,10 +4339,10 @@ key_setsadbxsa2(mode, seq, reqid) * set data into sadb_x_policy */ static struct mbuf * -key_setsadbxpolicy(type, dir, id) - u_int16_t type; - u_int8_t dir; - u_int32_t id; +key_setsadbxpolicy( + u_int16_t type, + u_int8_t dir, + u_int32_t id) { struct mbuf *m; struct sadb_x_policy *p; @@ -4325,9 +4373,9 @@ key_setsadbxpolicy(type, dir, id) * copy a buffer into the new buffer allocated. 
*/ static void * -key_newbuf(src, len) - const void *src; - u_int len; +key_newbuf( + const void *src, + u_int len) { caddr_t new; @@ -4352,8 +4400,8 @@ key_newbuf(src, len) * 0: false */ int -key_ismyaddr(sa) - struct sockaddr *sa; +key_ismyaddr( + struct sockaddr *sa) { #if INET struct sockaddr_in *sin; @@ -4370,15 +4418,17 @@ key_ismyaddr(sa) lck_rw_lock_shared(in_ifaddr_rwlock); sin = (struct sockaddr_in *)sa; for (ia = in_ifaddrhead.tqh_first; ia; - ia = ia->ia_link.tqe_next) - { + ia = ia->ia_link.tqe_next) { + IFA_LOCK_SPIN(&ia->ia_ifa); if (sin->sin_family == ia->ia_addr.sin_family && sin->sin_len == ia->ia_addr.sin_len && sin->sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr) { + IFA_UNLOCK(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); return 1; } + IFA_UNLOCK(&ia->ia_ifa); } lck_rw_done(in_ifaddr_rwlock); break; @@ -4402,19 +4452,22 @@ key_ismyaddr(sa) #include <netinet6/in6_var.h> static int -key_ismyaddr6(sin6) - struct sockaddr_in6 *sin6; +key_ismyaddr6( + struct sockaddr_in6 *sin6) { struct in6_ifaddr *ia; struct in6_multi *in6m; - lck_mtx_lock(nd6_mutex); + lck_rw_lock_shared(&in6_ifaddr_rwlock); for (ia = in6_ifaddrs; ia; ia = ia->ia_next) { + IFA_LOCK(&ia->ia_ifa); if (key_sockaddrcmp((struct sockaddr *)&sin6, (struct sockaddr *)&ia->ia_addr, 0) == 0) { - lck_mtx_unlock(nd6_mutex); + IFA_UNLOCK(&ia->ia_ifa); + lck_rw_done(&in6_ifaddr_rwlock); return 1; } + IFA_UNLOCK(&ia->ia_ifa); /* * XXX Multicast @@ -4423,15 +4476,16 @@ key_ismyaddr6(sin6) * XXX scope */ in6m = NULL; - ifnet_lock_shared(ia->ia_ifp); - IN6_LOOKUP_MULTI(sin6->sin6_addr, ia->ia_ifp, in6m); - ifnet_lock_done(ia->ia_ifp); - if (in6m) { - lck_mtx_unlock(nd6_mutex); + in6_multihead_lock_shared(); + IN6_LOOKUP_MULTI(&sin6->sin6_addr, ia->ia_ifp, in6m); + in6_multihead_lock_done(); + if (in6m != NULL) { + lck_rw_done(&in6_ifaddr_rwlock); + IN6M_REMREF(in6m); return 1; } } - lck_mtx_unlock(nd6_mutex); + lck_rw_done(&in6_ifaddr_rwlock); /* loopback, just for safety */ if (IN6_IS_ADDR_LOOPBACK(&sin6->sin6_addr)) 
@@ -4454,9 +4508,10 @@ key_ismyaddr6(sin6) * 0 : not equal */ static int -key_cmpsaidx(saidx0, saidx1, flag) - struct secasindex *saidx0, *saidx1; - int flag; +key_cmpsaidx( + struct secasindex *saidx0, + struct secasindex *saidx1, + int flag) { /* sanity */ if (saidx0 == NULL && saidx1 == NULL) @@ -4517,8 +4572,9 @@ key_cmpsaidx(saidx0, saidx1, flag) * 0 : not equal */ static int -key_cmpspidx_exactly(spidx0, spidx1) - struct secpolicyindex *spidx0, *spidx1; +key_cmpspidx_exactly( + struct secpolicyindex *spidx0, + struct secpolicyindex *spidx1) { /* sanity */ if (spidx0 == NULL && spidx1 == NULL) @@ -4554,8 +4610,9 @@ key_cmpspidx_exactly(spidx0, spidx1) * 0 : not equal */ static int -key_cmpspidx_withmask(spidx0, spidx1) - struct secpolicyindex *spidx0, *spidx1; +key_cmpspidx_withmask( + struct secpolicyindex *spidx0, + struct secpolicyindex *spidx1) { /* sanity */ if (spidx0 == NULL && spidx1 == NULL) @@ -4652,10 +4709,10 @@ key_cmpspidx_withmask(spidx0, spidx1) /* returns 0 on match */ static int -key_sockaddrcmp(sa1, sa2, port) - struct sockaddr *sa1; - struct sockaddr *sa2; - int port; +key_sockaddrcmp( + struct sockaddr *sa1, + struct sockaddr *sa2, + int port) { if (sa1->sa_family != sa2->sa_family || sa1->sa_len != sa2->sa_len) return 1; @@ -4707,9 +4764,10 @@ key_sockaddrcmp(sa1, sa2, port) * 0 : not equal */ static int -key_bbcmp(p1, p2, bits) - caddr_t p1, p2; - u_int bits; +key_bbcmp( + caddr_t p1, + caddr_t p2, + u_int bits) { u_int8_t mask; @@ -5154,7 +5212,7 @@ key_timehandler(void) * to initialize a seed for random() */ static void -key_srandom() +key_srandom(void) { #ifdef __APPLE__ /* Our PRNG is based on Yarrow and doesn't need to be seeded */ @@ -5171,7 +5229,7 @@ key_srandom() } u_int32_t -key_random() +key_random(void) { u_int32_t value; @@ -5180,9 +5238,9 @@ key_random() } void -key_randomfill(p, l) - void *p; - size_t l; +key_randomfill( + void *p, + size_t l) { #ifdef __APPLE__ @@ -5217,8 +5275,8 @@ key_randomfill(p, l) * 0: invalid 
satype. */ static u_int16_t -key_satype2proto(satype) - u_int8_t satype; +key_satype2proto( + u_int8_t satype) { switch (satype) { case SADB_SATYPE_UNSPEC: @@ -5242,8 +5300,8 @@ key_satype2proto(satype) * 0: invalid protocol type. */ static u_int8_t -key_proto2satype(proto) - u_int16_t proto; +key_proto2satype( + u_int16_t proto) { switch (proto) { case IPPROTO_AH: @@ -5273,10 +5331,10 @@ key_proto2satype(proto) * other if success, return pointer to the message to send. */ static int -key_getspi(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_getspi( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { struct sadb_address *src0, *dst0; struct secasindex saidx; @@ -5483,9 +5541,9 @@ key_getspi(so, m, mhp) * others: success. */ static u_int32_t -key_do_getnewspi(spirange, saidx) - struct sadb_spirange *spirange; - struct secasindex *saidx; +key_do_getnewspi( + struct sadb_spirange *spirange, + struct secasindex *saidx) { u_int32_t newspi; u_int32_t keymin, keymax; @@ -5567,10 +5625,10 @@ key_do_getnewspi(spirange, saidx) * m will always be freed. */ static int -key_update(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_update( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { struct sadb_sa *sa0; struct sadb_address *src0, *dst0; @@ -5743,9 +5801,9 @@ key_update(so, m, mhp) */ #if IPSEC_DOSEQCHECK static struct secasvar * -key_getsavbyseq(sah, seq) - struct secashead *sah; - u_int32_t seq; +key_getsavbyseq( + struct secashead *sah, + u_int32_t seq) { struct secasvar *sav; u_int state; @@ -5789,10 +5847,10 @@ key_getsavbyseq(sah, seq) * m will always be freed. 
*/ static int -key_add(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_add( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { struct sadb_sa *sa0; struct sadb_address *src0, *dst0; @@ -5925,10 +5983,10 @@ key_add(so, m, mhp) /* m is retained */ static int -key_setident(sah, m, mhp) - struct secashead *sah; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_setident( + struct secashead *sah, + struct mbuf *m, + const struct sadb_msghdr *mhp) { const struct sadb_ident *idsrc, *iddst; int idsrclen, iddstlen; @@ -6009,9 +6067,9 @@ key_setident(sah, m, mhp) * it is caller's responsibility to free the result. */ static struct mbuf * -key_getmsgbuf_x1(m, mhp) - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_getmsgbuf_x1( + struct mbuf *m, + const struct sadb_msghdr *mhp) { struct mbuf *n; int mbufItems[] = {SADB_EXT_RESERVED, SADB_EXT_SA, @@ -6056,10 +6114,10 @@ static int key_delete_all(struct socket *, struct mbuf *, * m will always be freed. */ static int -key_delete(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_delete( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { struct sadb_sa *sa0; struct sadb_address *src0, *dst0; @@ -6169,11 +6227,11 @@ key_delete(so, m, mhp) * delete all SAs for src/dst. Called from key_delete(). */ static int -key_delete_all(so, m, mhp, proto) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; - u_int16_t proto; +key_delete_all( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp, + u_int16_t proto) { struct sadb_address *src0, *dst0; struct secasindex saidx; @@ -6259,10 +6317,10 @@ key_delete_all(so, m, mhp, proto) * m will always be freed. 
*/ static int -key_get(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_get( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { struct sadb_sa *sa0; struct sadb_address *src0, *dst0; @@ -6427,8 +6485,8 @@ key_getsastatbyspi (struct sastat *stat_arg, /* XXX make it sysctl-configurable? */ static void -key_getcomb_setlifetime(comb) - struct sadb_comb *comb; +key_getcomb_setlifetime( + struct sadb_comb *comb) { comb->sadb_comb_soft_allocations = 1; @@ -6447,7 +6505,7 @@ key_getcomb_setlifetime(comb) * XXX no idea if the user wants ESP authentication or not */ static struct mbuf * -key_getcomb_esp() +key_getcomb_esp(void) { struct sadb_comb *comb; const struct esp_algorithm *algo; @@ -6529,7 +6587,7 @@ key_getcomb_esp() * XXX reorder combinations by preference */ static struct mbuf * -key_getcomb_ah() +key_getcomb_ah(void) { struct sadb_comb *comb; const struct ah_algorithm *algo; @@ -6588,7 +6646,7 @@ key_getcomb_ah() * XXX reorder combinations by preference */ static struct mbuf * -key_getcomb_ipcomp() +key_getcomb_ipcomp(void) { struct sadb_comb *comb; const struct ipcomp_algorithm *algo; @@ -6634,8 +6692,8 @@ key_getcomb_ipcomp() * XXX sysctl interface to ipsec_{ah,esp}_keymin */ static struct mbuf * -key_getprop(saidx) - const struct secasindex *saidx; +key_getprop( + const struct secasindex *saidx) { struct sadb_prop *prop; struct mbuf *m, *n; @@ -6698,9 +6756,9 @@ key_getprop(saidx) * others: error number */ static int -key_acquire(saidx, sp) - struct secasindex *saidx; - struct secpolicy *sp; +key_acquire( + struct secasindex *saidx, + struct secpolicy *sp) { struct mbuf *result = NULL, *m; #ifndef IPSEC_NONBLOCK_ACQUIRE @@ -6883,8 +6941,8 @@ key_acquire(saidx, sp) #ifndef IPSEC_NONBLOCK_ACQUIRE static struct secacq * -key_newacq(saidx) - struct secasindex *saidx; +key_newacq( + struct secasindex *saidx) { struct secacq *newacq; struct timeval tv; @@ -6913,8 +6971,8 @@ key_newacq(saidx) } static 
struct secacq * -key_getacq(saidx) - struct secasindex *saidx; +key_getacq( + struct secasindex *saidx) { struct secacq *acq; @@ -6929,8 +6987,8 @@ key_getacq(saidx) } static struct secacq * -key_getacqbyseq(seq) - u_int32_t seq; +key_getacqbyseq( + u_int32_t seq) { struct secacq *acq; @@ -6946,8 +7004,8 @@ key_getacqbyseq(seq) #endif static struct secspacq * -key_newspacq(spidx) - struct secpolicyindex *spidx; +key_newspacq( + struct secpolicyindex *spidx) { struct secspacq *acq; struct timeval tv; @@ -6975,8 +7033,8 @@ key_newspacq(spidx) } static struct secspacq * -key_getspacq(spidx) - struct secpolicyindex *spidx; +key_getspacq( + struct secpolicyindex *spidx) { struct secspacq *acq; @@ -7005,10 +7063,10 @@ key_getspacq(spidx) * m will always be freed. */ static int -key_acquire2(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_acquire2( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { const struct sadb_address *src0, *dst0; struct secasindex saidx; @@ -7134,10 +7192,10 @@ key_acquire2(so, m, mhp) * m will always be freed. */ static int -key_register(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_register( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { struct secreg *reg, *newreg = 0; @@ -7303,8 +7361,8 @@ key_register(so, m, mhp) * XXX: I want to do free a socket marked done SADB_RESIGER to socket. */ void -key_freereg(so) - struct socket *so; +key_freereg( + struct socket *so) { struct secreg *reg; int i; @@ -7344,8 +7402,8 @@ key_freereg(so) * others : error number */ static int -key_expire(sav) - struct secasvar *sav; +key_expire( + struct secasvar *sav) { int satype; struct mbuf *result = NULL, *m; @@ -7471,10 +7529,10 @@ key_expire(sav) * m will always be freed. 
*/ static int -key_flush(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_flush( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { struct sadb_msg *newmsg; struct secashead *sah, *nextsah; @@ -7560,10 +7618,10 @@ struct sav_dump_elem { }; static int -key_dump(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_dump( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { struct secashead *sah; struct secasvar *sav; @@ -7679,10 +7737,10 @@ key_dump(so, m, mhp) * m will always be freed. */ static int -key_promisc(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_promisc( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { int olen; @@ -7771,9 +7829,9 @@ static int (*key_typesw[])(struct socket *, struct mbuf *, * length for buffer to send to user process. */ int -key_parse(m, so) - struct mbuf *m; - struct socket *so; +key_parse( + struct mbuf *m, + struct socket *so) { struct sadb_msg *msg; struct sadb_msghdr mh; @@ -8026,10 +8084,10 @@ key_parse(m, so) } static int -key_senderror(so, m, code) - struct socket *so; - struct mbuf *m; - int code; +key_senderror( + struct socket *so, + struct mbuf *m, + int code) { struct sadb_msg *msg; @@ -8049,9 +8107,9 @@ key_senderror(so, m, code) * XXX larger-than-MCLBYTES extension? 
*/ static int -key_align(m, mhp) - struct mbuf *m; - struct sadb_msghdr *mhp; +key_align( + struct mbuf *m, + struct sadb_msghdr *mhp) { struct mbuf *n; struct sadb_ext *ext; @@ -8156,9 +8214,9 @@ key_align(m, mhp) } static int -key_validate_ext(ext, len) - const struct sadb_ext *ext; - int len; +key_validate_ext( + const struct sadb_ext *ext, + int len) { struct sockaddr *sa; enum { NONE, ADDR } checktype = NONE; @@ -8216,50 +8274,8 @@ key_validate_ext(ext, len) } void -key_domain_init() +key_domain_init(void) { - int i; - - bzero((caddr_t)&key_cb, sizeof(key_cb)); - - for (i = 0; i < IPSEC_DIR_MAX; i++) { - LIST_INIT(&sptree[i]); - } - ipsec_policy_count = 0; - - LIST_INIT(&sahtree); - - for (i = 0; i <= SADB_SATYPE_MAX; i++) { - LIST_INIT(&regtree[i]); - } - ipsec_sav_count = 0; - -#ifndef IPSEC_NONBLOCK_ACQUIRE - LIST_INIT(&acqtree); -#endif - LIST_INIT(&spacqtree); - - /* system default */ -#if INET - ip4_def_policy.policy = IPSEC_POLICY_NONE; - ip4_def_policy.refcnt++; /*never reclaim this*/ -#endif -#if INET6 - ip6_def_policy.policy = IPSEC_POLICY_NONE; - ip6_def_policy.refcnt++; /*never reclaim this*/ -#endif - -#ifndef IPSEC_DEBUG2 - timeout((void *)key_timehandler, (void *)0, hz); -#endif /*IPSEC_DEBUG2*/ - - /* initialize key statistics */ - keystat.getspi_count = 1; - -#ifndef __APPLE__ - printf("IPsec: Initialized Security Association Processing.\n"); -#endif - return; } @@ -8290,9 +8306,9 @@ key_checktunnelsanity( /* record data transfer on SA, and update timestamps */ void -key_sa_recordxfer(sav, m) - struct secasvar *sav; - struct mbuf *m; +key_sa_recordxfer( + struct secasvar *sav, + struct mbuf *m) { @@ -8343,8 +8359,8 @@ key_sa_recordxfer(sav, m) /* dumb version */ void -key_sa_routechange(dst) - struct sockaddr *dst; +key_sa_routechange( + struct sockaddr *dst) { struct secashead *sah; struct route *ro; @@ -8364,9 +8380,9 @@ key_sa_routechange(dst) } static void -key_sa_chgstate(sav, state) - struct secasvar *sav; - u_int8_t state; 
+key_sa_chgstate( + struct secasvar *sav, + u_int8_t state) { if (sav == NULL) @@ -8386,8 +8402,8 @@ key_sa_chgstate(sav, state) } void -key_sa_stir_iv(sav) - struct secasvar *sav; +key_sa_stir_iv( + struct secasvar *sav) { lck_mtx_lock(sadb_mutex); if (!sav->iv) @@ -8398,8 +8414,8 @@ key_sa_stir_iv(sav) /* XXX too much? */ static struct mbuf * -key_alloc_mbuf(l) - int l; +key_alloc_mbuf( + int l) { struct mbuf *m = NULL, *n; int len, t; diff --git a/bsd/nfs/Makefile b/bsd/nfs/Makefile index 10e246402..d4c4ce3cb 100644 --- a/bsd/nfs/Makefile +++ b/bsd/nfs/Makefile @@ -9,14 +9,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ DATAFILES = \ diff --git a/bsd/nfs/krpc.h b/bsd/nfs/krpc.h index 16fde5248..5f3b87677 100644 --- a/bsd/nfs/krpc.h +++ b/bsd/nfs/krpc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -44,7 +44,7 @@ int krpc_portmap(struct sockaddr_in *sin, /* - * RPC definitions for the portmapper + * RPC definitions for the portmapper (portmap and rpcbind) */ #define PMAPPORT 111 #define PMAPPROG 100000 @@ -56,6 +56,24 @@ int krpc_portmap(struct sockaddr_in *sin, #define PMAPPROC_DUMP 4 #define PMAPPROC_CALLIT 5 +#define RPCBPROG PMAPPROG +#define RPCBVERS3 3 +#define RPCBVERS4 4 +#define RPCBPROC_NULL 0 +#define RPCBPROC_SET 1 +#define RPCBPROC_UNSET 2 +#define RPCBPROC_GETADDR 3 +#define RPCBPROC_DUMP 4 +#define RPCBPROC_CALLIT 5 +#define RPCBPROC_BCAST RPCBPROC_CALLIT +#define RPCBPROC_GETTIME 6 +#define RPCBPROC_UADDR2TADDR 7 +#define RPCBPROC_TADDR2UADDR 8 +#define RPCBPROC_GETVERSADDR 9 +#define RPCBPROC_INDIRECT 10 +#define RPCBPROC_GETADDRLIST 11 +#define RPCBPROC_GETSTAT 12 + /* * RPC definitions for bootparamd diff --git a/bsd/nfs/nfs.h b/bsd/nfs/nfs.h index 821af8e5b..41b025389 100644 --- a/bsd/nfs/nfs.h +++ b/bsd/nfs/nfs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -146,14 +146,71 @@ __private_extern__ int nfs_ticks; * (These sizes are always a power of 2. If the kernel malloc() changes * to one that does not allocate space in powers of 2 size, then this all * becomes bunk!). - * Note that some of these structures come out of there own nfs zones. -*/ + * Note that some of these structures come out of their own nfs zones. 
+ */ #define NFS_NODEALLOC 1024 #define NFS_MNTALLOC 1024 #define NFS_SVCALLOC 512 +#define NFS_ARGSVERSION_XDR 88 /* NFS mount args are in XDR format */ + +#define NFS_XDRARGS_VERSION_0 0 +#define NFS_MATTR_BITMAP_LEN 1 /* length of mount attributes bitmap */ +#define NFS_MFLAG_BITMAP_LEN 1 /* length of mount flags bitmap */ + +/* NFS mount attributes */ +#define NFS_MATTR_FLAGS 0 /* mount flags (NFS_MATTR_*) */ +#define NFS_MATTR_NFS_VERSION 1 /* NFS protocol version */ +#define NFS_MATTR_NFS_MINOR_VERSION 2 /* NFS protocol minor version */ +#define NFS_MATTR_READ_SIZE 3 /* READ RPC size */ +#define NFS_MATTR_WRITE_SIZE 4 /* WRITE RPC size */ +#define NFS_MATTR_READDIR_SIZE 5 /* READDIR RPC size */ +#define NFS_MATTR_READAHEAD 6 /* block readahead count */ +#define NFS_MATTR_ATTRCACHE_REG_MIN 7 /* minimum attribute cache time */ +#define NFS_MATTR_ATTRCACHE_REG_MAX 8 /* maximum attribute cache time */ +#define NFS_MATTR_ATTRCACHE_DIR_MIN 9 /* minimum attribute cache time for dirs */ +#define NFS_MATTR_ATTRCACHE_DIR_MAX 10 /* maximum attribute cache time for dirs */ +#define NFS_MATTR_LOCK_MODE 11 /* advisory file locking mode (NFS_LOCK_MODE_*) */ +#define NFS_MATTR_SECURITY 12 /* RPC security flavors to use */ +#define NFS_MATTR_MAX_GROUP_LIST 13 /* max # of RPC AUTH_SYS groups */ +#define NFS_MATTR_SOCKET_TYPE 14 /* socket transport type as a netid-like string */ +#define NFS_MATTR_NFS_PORT 15 /* port # to use for NFS protocol */ +#define NFS_MATTR_MOUNT_PORT 16 /* port # to use for MOUNT protocol */ +#define NFS_MATTR_REQUEST_TIMEOUT 17 /* initial RPC request timeout value */ +#define NFS_MATTR_SOFT_RETRY_COUNT 18 /* max RPC retransmissions for soft mounts */ +#define NFS_MATTR_DEAD_TIMEOUT 19 /* how long until unresponsive mount is considered dead */ +#define NFS_MATTR_FH 20 /* file handle for mount directory */ +#define NFS_MATTR_FS_LOCATIONS 21 /* list of locations for the file system */ +#define NFS_MATTR_MNTFLAGS 22 /* VFS mount flags (MNT_*) */ +#define 
NFS_MATTR_MNTFROM 23 /* fixed string to use for "f_mntfromname" */ + +/* NFS mount flags */ +#define NFS_MFLAG_SOFT 0 /* soft mount (requests fail if unresponsive) */ +#define NFS_MFLAG_INTR 1 /* allow operations to be interrupted */ +#define NFS_MFLAG_RESVPORT 2 /* use a reserved port */ +#define NFS_MFLAG_NOCONNECT 3 /* don't connect the socket (UDP) */ +#define NFS_MFLAG_DUMBTIMER 4 /* don't estimate RTT dynamically */ +#define NFS_MFLAG_CALLUMNT 5 /* call MOUNTPROC_UMNT on unmount */ +#define NFS_MFLAG_RDIRPLUS 6 /* request additional info when reading directories */ +#define NFS_MFLAG_NONEGNAMECACHE 7 /* don't do negative name caching */ +#define NFS_MFLAG_MUTEJUKEBOX 8 /* don't treat jukebox errors as unresponsive */ +#define NFS_MFLAG_EPHEMERAL 9 /* ephemeral (mirror) mount */ +#define NFS_MFLAG_NOCALLBACK 10 /* don't provide callback RPC service */ +#define NFS_MFLAG_NONAMEDATTR 11 /* don't use named attributes */ +#define NFS_MFLAG_NOACL 12 /* don't support ACLs */ +#define NFS_MFLAG_ACLONLY 13 /* only support ACLs - not mode */ +#define NFS_MFLAG_NFC 14 /* send NFC strings */ +#define NFS_MFLAG_NOQUOTA 15 /* don't support QUOTA requests */ +#define NFS_MFLAG_MNTUDP 16 /* MOUNT protocol should use UDP */ +#define NFS_MFLAG_MNTQUICK 17 /* use short timeouts while mounting */ + +/* NFS advisory file locking modes */ +#define NFS_LOCK_MODE_ENABLED 0 /* advisory file locking enabled */ +#define NFS_LOCK_MODE_DISABLED 1 /* do not support advisory file locking */ +#define NFS_LOCK_MODE_LOCAL 2 /* perform advisory file locking locally */ + /* - * Arguments to mount NFS + * Old-style arguments to mount NFS */ #define NFS_ARGSVERSION 6 /* change when nfs_args changes */ struct nfs_args { @@ -197,115 +254,11 @@ struct nfs_args { /* NFS_ARGSVERSION 5 ends here */ uint32_t deadtimeout; /* secs until unresponsive mount considered dead */ }; -struct nfs_args5 { - int version; /* args structure version number */ -#ifdef KERNEL - user32_addr_t addr; /* file server address 
*/ -#else - struct sockaddr *addr; /* file server address */ -#endif - int addrlen; /* length of address */ - int sotype; /* Socket type */ - int proto; /* and Protocol */ -#ifdef KERNEL - user32_addr_t fh; /* File handle to be mounted */ -#else - u_char *fh; /* File handle to be mounted */ -#endif - int fhsize; /* Size, in bytes, of fh */ - int flags; /* flags */ - int wsize; /* write size in bytes */ - int rsize; /* read size in bytes */ - int readdirsize; /* readdir size in bytes */ - int timeo; /* initial timeout in .1 secs */ - int retrans; /* times to retry send */ - int maxgrouplist; /* Max. size of group list */ - int readahead; /* # of blocks to readahead */ - int leaseterm; /* obsolete: Term (sec) of lease */ - int deadthresh; /* obsolete: Retrans threshold */ -#ifdef KERNEL - user32_addr_t hostname; /* server's name */ -#else - char *hostname; /* server's name */ -#endif - /* NFS_ARGSVERSION 3 ends here */ - int acregmin; /* reg file min attr cache timeout */ - int acregmax; /* reg file max attr cache timeout */ - int acdirmin; /* dir min attr cache timeout */ - int acdirmax; /* dir max attr cache timeout */ - /* NFS_ARGSVERSION 4 ends here */ - uint32_t auth; /* security mechanism flavor */ -}; -struct nfs_args4 { - int version; /* args structure version number */ -#ifdef KERNEL - user32_addr_t addr; /* file server address */ -#else - struct sockaddr *addr; /* file server address */ -#endif - int addrlen; /* length of address */ - int sotype; /* Socket type */ - int proto; /* and Protocol */ -#ifdef KERNEL - user32_addr_t fh; /* File handle to be mounted */ -#else - u_char *fh; /* File handle to be mounted */ -#endif - int fhsize; /* Size, in bytes, of fh */ - int flags; /* flags */ - int wsize; /* write size in bytes */ - int rsize; /* read size in bytes */ - int readdirsize; /* readdir size in bytes */ - int timeo; /* initial timeout in .1 secs */ - int retrans; /* times to retry send */ - int maxgrouplist; /* Max. 
size of group list */ - int readahead; /* # of blocks to readahead */ - int leaseterm; /* obsolete: Term (sec) of lease */ - int deadthresh; /* obsolete: Retrans threshold */ -#ifdef KERNEL - user32_addr_t hostname; /* server's name */ -#else - char *hostname; /* server's name */ -#endif - /* NFS_ARGSVERSION 3 ends here */ - int acregmin; /* reg file min attr cache timeout */ - int acregmax; /* reg file max attr cache timeout */ - int acdirmin; /* dir min attr cache timeout */ - int acdirmax; /* dir max attr cache timeout */ -}; -struct nfs_args3 { - int version; /* args structure version number */ -#ifdef KERNEL - user32_addr_t addr; /* file server address */ -#else - struct sockaddr *addr; /* file server address */ -#endif - int addrlen; /* length of address */ - int sotype; /* Socket type */ - int proto; /* and Protocol */ -#ifdef KERNEL - user32_addr_t fh; /* File handle to be mounted */ -#else - u_char *fh; /* File handle to be mounted */ -#endif - int fhsize; /* Size, in bytes, of fh */ - int flags; /* flags */ - int wsize; /* write size in bytes */ - int rsize; /* read size in bytes */ - int readdirsize; /* readdir size in bytes */ - int timeo; /* initial timeout in .1 secs */ - int retrans; /* times to retry send */ - int maxgrouplist; /* Max. size of group list */ - int readahead; /* # of blocks to readahead */ - int leaseterm; /* obsolete: Term (sec) of lease */ - int deadthresh; /* obsolete: Retrans threshold */ -#ifdef KERNEL - user32_addr_t hostname; /* server's name */ -#else - char *hostname; /* server's name */ -#endif -}; +/* incremental size additions in each version of nfs_args */ +#define NFS_ARGSVERSION4_INCSIZE (4 * sizeof(int)) +#define NFS_ARGSVERSION5_INCSIZE (sizeof(uint32_t)) +#define NFS_ARGSVERSION6_INCSIZE (sizeof(uint32_t)) #ifdef KERNEL /* LP64 version of nfs_args. 
all pointers and longs @@ -341,83 +294,10 @@ struct user_nfs_args { /* NFS_ARGSVERSION 5 ends here */ uint32_t deadtimeout; /* secs until unresponsive mount considered dead */ }; -struct user_nfs_args5 { - int version; /* args structure version number */ - user_addr_t addr __attribute((aligned(8))); /* file server address */ - int addrlen; /* length of address */ - int sotype; /* Socket type */ - int proto; /* and Protocol */ - user_addr_t fh __attribute((aligned(8))); /* File handle to be mounted */ - int fhsize; /* Size, in bytes, of fh */ - int flags; /* flags */ - int wsize; /* write size in bytes */ - int rsize; /* read size in bytes */ - int readdirsize; /* readdir size in bytes */ - int timeo; /* initial timeout in .1 secs */ - int retrans; /* times to retry send */ - int maxgrouplist; /* Max. size of group list */ - int readahead; /* # of blocks to readahead */ - int leaseterm; /* obsolete: Term (sec) of lease */ - int deadthresh; /* obsolete: Retrans threshold */ - user_addr_t hostname __attribute((aligned(8))); /* server's name */ - /* NFS_ARGSVERSION 3 ends here */ - int acregmin; /* reg file min attr cache timeout */ - int acregmax; /* reg file max attr cache timeout */ - int acdirmin; /* dir min attr cache timeout */ - int acdirmax; /* dir max attr cache timeout */ - /* NFS_ARGSVERSION 4 ends here */ - uint32_t auth; /* security mechanism flavor */ -}; -struct user_nfs_args4 { - int version; /* args structure version number */ - user_addr_t addr __attribute((aligned(8))); /* file server address */ - int addrlen; /* length of address */ - int sotype; /* Socket type */ - int proto; /* and Protocol */ - user_addr_t fh __attribute((aligned(8))); /* File handle to be mounted */ - int fhsize; /* Size, in bytes, of fh */ - int flags; /* flags */ - int wsize; /* write size in bytes */ - int rsize; /* read size in bytes */ - int readdirsize; /* readdir size in bytes */ - int timeo; /* initial timeout in .1 secs */ - int retrans; /* times to retry send */ - int 
maxgrouplist; /* Max. size of group list */ - int readahead; /* # of blocks to readahead */ - int leaseterm; /* obsolete: Term (sec) of lease */ - int deadthresh; /* obsolete: Retrans threshold */ - user_addr_t hostname __attribute((aligned(8))); /* server's name */ - /* NFS_ARGSVERSION 3 ends here */ - int acregmin; /* reg file min attr cache timeout */ - int acregmax; /* reg file max attr cache timeout */ - int acdirmin; /* dir min attr cache timeout */ - int acdirmax; /* dir max attr cache timeout */ -}; -struct user_nfs_args3 { - int version; /* args structure version number */ - user_addr_t addr __attribute((aligned(8))); /* file server address */ - int addrlen; /* length of address */ - int sotype; /* Socket type */ - int proto; /* and Protocol */ - user_addr_t fh __attribute((aligned(8))); /* File handle to be mounted */ - int fhsize; /* Size, in bytes, of fh */ - int flags; /* flags */ - int wsize; /* write size in bytes */ - int rsize; /* read size in bytes */ - int readdirsize; /* readdir size in bytes */ - int timeo; /* initial timeout in .1 secs */ - int retrans; /* times to retry send */ - int maxgrouplist; /* Max. 
size of group list */ - int readahead; /* # of blocks to readahead */ - int leaseterm; /* obsolete: Term (sec) of lease */ - int deadthresh; /* obsolete: Retrans threshold */ - user_addr_t hostname __attribute((aligned(8))); /* server's name */ -}; - #endif // KERNEL /* - * NFS mount option flags + * Old-style NFS mount option flags */ #define NFSMNT_SOFT 0x00000001 /* soft mount (hard is default) */ #define NFSMNT_WSIZE 0x00000002 /* set write size */ @@ -446,6 +326,27 @@ struct user_nfs_args3 { #define NFSMNT_SECFLAVOR 0x01000000 /* Use security flavor */ #define NFSMNT_SECSYSOK 0x02000000 /* Server can support auth sys */ #define NFSMNT_MUTEJUKEBOX 0x04000000 /* don't treat jukebox errors as unresponsive */ +#define NFSMNT_NOQUOTA 0x08000000 /* don't support QUOTA requests */ + + +/* + * fs.nfs sysctl(3) NFS_MOUNTINFO defines + */ +#define NFS_MOUNT_INFO_VERSION 0 /* nfsstat mount information version */ +#define NFS_MIATTR_BITMAP_LEN 1 /* length of mount info attributes bitmap */ +#define NFS_MIFLAG_BITMAP_LEN 1 /* length of mount info flags bitmap */ + +/* NFS mount info attributes */ +#define NFS_MIATTR_FLAGS 0 /* mount info flags bitmap (MIFLAG_*) */ +#define NFS_MIATTR_ORIG_ARGS 1 /* original mount args passed into mount call */ +#define NFS_MIATTR_CUR_ARGS 2 /* current mount args values */ +#define NFS_MIATTR_CUR_LOC_INDEX 3 /* current fs location index */ + +/* NFS mount info flags */ +#define NFS_MIFLAG_DEAD 0 /* mount is dead */ +#define NFS_MIFLAG_NOTRESP 1 /* server is unresponsive */ +#define NFS_MIFLAG_RECOVERY 2 /* mount in recovery */ + /* * Structures for the nfssvc(2) syscall. Not that anyone but nfsd @@ -831,6 +732,7 @@ struct nfsstats { * Flags for nfsclnt() system call. 
*/ #define NFSCLNT_LOCKDANS 0x200 +#define NFSCLNT_LOCKDNOTIFY 0x400 /* * fs.nfs sysctl(3) identifiers @@ -839,6 +741,7 @@ struct nfsstats { #define NFS_EXPORTSTATS 3 /* gets exported directory stats */ #define NFS_USERSTATS 4 /* gets exported directory active user stats */ #define NFS_USERCOUNT 5 /* gets current count of active nfs users */ +#define NFS_MOUNTINFO 6 /* gets information about an NFS mount */ #ifndef NFS_WDELAYHASHSIZ #define NFS_WDELAYHASHSIZ 16 /* and with this */ @@ -882,6 +785,11 @@ struct nfs_open_file; struct nfs_lock_owner; struct nfs_file_lock; struct nfsreq; +struct nfs_rpc_record_state; +struct nfs_fs_locations; +struct nfs_location_index; +struct nfs_socket; +struct nfs_socket_search; /* * The set of signals the interrupt an I/O in progress for NFSMNT_INT mounts. @@ -926,6 +834,28 @@ struct nfsreq_cbinfo { uint32_t rcb_args[3]; /* additional callback args */ }; +/* + * Arguments to use if a request needs to call SECINFO to handle a WRONGSEC error + * + * If only node is set, use the parent file handle and this node's name; otherwise, + * use any file handle and name provided. 
+ */ +struct nfsreq_secinfo_args { + nfsnode_t rsia_np; /* the node */ + const char *rsia_name; /* alternate name string */ + u_char *rsia_fh; /* alternate file handle */ + uint32_t rsia_namelen; /* length of string */ + uint32_t rsia_fhsize; /* length of fh */ +}; +#define NFSREQ_SECINFO_SET(SI, NP, FH, FHSIZE, NAME, NAMELEN) \ + do { \ + (SI)->rsia_np = (NP); \ + (SI)->rsia_fh = (FH); \ + (SI)->rsia_fhsize = (FHSIZE); \ + (SI)->rsia_name = (NAME); \ + (SI)->rsia_namelen = (NAMELEN); \ + } while (0) + /* * NFS outstanding request list element */ @@ -959,8 +889,11 @@ struct nfsreq { SLIST_HEAD(, gss_seq) r_gss_seqlist; /* RPCSEC_GSS sequence numbers */ uint32_t r_gss_argoff; /* RPCSEC_GSS offset to args */ uint32_t r_gss_arglen; /* RPCSEC_GSS arg length */ + uint32_t r_auth; /* security flavor request sent with */ + uint32_t *r_wrongsec; /* wrongsec: other flavors to try */ int r_error; /* request error */ struct nfsreq_cbinfo r_callback; /* callback info */ + struct nfsreq_secinfo_args r_secinfo; /* secinfo args */ }; /* @@ -992,9 +925,10 @@ __private_extern__ lck_grp_t *nfs_request_grp; #define R_RESENDQ 0x00004000 /* async request currently on resendq */ #define R_SENDING 0x00008000 /* request currently being sent */ +#define R_NOINTR 0x20000000 /* request should not be interupted by a signal */ #define R_RECOVER 0x40000000 /* a state recovery RPC - during NFSSTA_RECOVER */ #define R_SETUP 0x80000000 /* a setup RPC - during (re)connection */ -#define R_OPTMASK 0xc0000000 /* mask of all RPC option flags */ +#define R_OPTMASK 0xe0000000 /* mask of all RPC option flags */ /* Flag values for r_lflags */ #define RL_BUSY 0x0001 /* Locked. 
*/ @@ -1002,10 +936,21 @@ __private_extern__ lck_grp_t *nfs_request_grp; #define RL_QUEUED 0x0004 /* request is on the queue */ __private_extern__ u_int32_t nfs_xid, nfs_xidwrap; -__private_extern__ int nfs_iosize, nfs_access_cache_timeout, nfs_access_delete, nfs_allow_async, nfs_statfs_rate_limit; +__private_extern__ int nfs_iosize, nfs_allow_async, nfs_statfs_rate_limit; +__private_extern__ int nfs_access_cache_timeout, nfs_access_delete, nfs_access_dotzfs, nfs_access_for_getattr; __private_extern__ int nfs_lockd_mounts, nfs_lockd_request_sent, nfs_single_des; __private_extern__ int nfs_tprintf_initial_delay, nfs_tprintf_delay; __private_extern__ int nfsiod_thread_count, nfsiod_thread_max, nfs_max_async_writes; +__private_extern__ int nfs_idmap_ctrl, nfs_callback_port; + +/* bits for nfs_idmap_ctrl: */ +#define NFS_IDMAP_CTRL_USE_IDMAP_SERVICE 0x00000001 /* use the ID mapping service */ +#define NFS_IDMAP_CTRL_FALLBACK_NO_COMMON_IDS 0x00000002 /* fallback should NOT handle common IDs like "root" and "nobody" */ +#define NFS_IDMAP_CTRL_FALLBACK_NO_WELLKNOWN_IDS 0x00000004 /* fallback should NOT handle the well known "XXX@" IDs */ +#define NFS_IDMAP_CTRL_UNKNOWN_IS_99 0x00000008 /* for unknown IDs use uid/gid 99 instead of -2/nobody */ +#define NFS_IDMAP_CTRL_COMPARE_RESULTS 0x00000010 /* compare results of ID mapping service and fallback */ +#define NFS_IDMAP_CTRL_LOG_FAILED_MAPPINGS 0x00000020 /* log failed ID mapping attempts */ +#define NFS_IDMAP_CTRL_LOG_SUCCESSFUL_MAPPINGS 0x00000040 /* log successful ID mapping attempts */ #define NFSIOD_MAX (MIN(nfsiod_thread_max, NFS_MAXASYNCTHREAD)) @@ -1018,14 +963,6 @@ struct nfs_dulookup { char du_smallname[48]; /* buffer for small names */ }; -/* - * Network address hash list element - */ -union nethostaddr { - u_int32_t had_inetaddr; - mbuf_t had_nam; -}; - /* * One nfsrv_sock structure is maintained for each socket the * server is servicing requests on. 
@@ -1071,7 +1008,7 @@ struct nfsrv_sock { #define SLPNOLIST ((struct nfsrv_sock *)0xdeadbeef) /* sentinel value for sockets not in the nfsrv_sockwg list */ -__private_extern__ struct nfsrv_sock *nfsrv_udpsock; +__private_extern__ struct nfsrv_sock *nfsrv_udpsock, *nfsrv_udp6sock; /* * global NFS server socket lists: @@ -1148,7 +1085,7 @@ __private_extern__ lck_mtx_t *nfs_global_mutex; /* NFSv4 callback globals */ __private_extern__ int nfs4_callback_timer_on; -__private_extern__ in_port_t nfs4_cb_port; +__private_extern__ in_port_t nfs4_cb_port, nfs4_cb_port6; /* nfs timer call structures */ __private_extern__ thread_call_t nfs_request_timer_call; @@ -1180,15 +1117,23 @@ void nfs4_mount_callback_shutdown(struct nfsmount *); void nfs4_cb_accept(socket_t, void *, int); void nfs4_cb_rcv(socket_t, void *, int); void nfs4_callback_timer(void *, void *); +int nfs4_secinfo_rpc(struct nfsmount *, struct nfsreq_secinfo_args *, kauth_cred_t, uint32_t *, int *); +int nfs4_get_fs_locations(struct nfsmount *, nfsnode_t, u_char *, int, const char *, vfs_context_t, struct nfs_fs_locations *); +void nfs_fs_locations_cleanup(struct nfs_fs_locations *); +void nfs4_default_attrs_for_referral_trigger(nfsnode_t, char *, int, struct nfs_vattr *, fhandle_t *); -int nfs_connect(struct nfsmount *, int); +int nfs_sockaddr_cmp(struct sockaddr *, struct sockaddr *); +int nfs_connect(struct nfsmount *, int, int); void nfs_disconnect(struct nfsmount *); void nfs_need_reconnect(struct nfsmount *); void nfs_mount_sock_thread_wake(struct nfsmount *); void nfs_mount_check_dead_timeout(struct nfsmount *); +void nfs_rpc_record_state_init(struct nfs_rpc_record_state *); +void nfs_rpc_record_state_cleanup(struct nfs_rpc_record_state *); +int nfs_rpc_record_read(socket_t, struct nfs_rpc_record_state *, int, int *, mbuf_t *); int nfs_getattr(nfsnode_t, struct nfs_vattr *, vfs_context_t, int); -int nfs_getattrcache(nfsnode_t, struct nfs_vattr *); +int nfs_getattrcache(nfsnode_t, struct nfs_vattr *, int); 
int nfs_loadattrcache(nfsnode_t, struct nfs_vattr *, u_int64_t *, int); int nfs_attrcachetimeout(nfsnode_t); @@ -1196,6 +1141,7 @@ int nfs_buf_page_inval(vnode_t vp, off_t offset); int nfs_vinvalbuf(vnode_t, int, vfs_context_t, int); int nfs_vinvalbuf2(vnode_t, int, thread_t, kauth_cred_t, int); int nfs_vinvalbuf_internal(nfsnode_t, int, thread_t, kauth_cred_t, int, int); +void nfs_wait_bufs(nfsnode_t); int nfs_request_create(nfsnode_t, mount_t, struct nfsm_chain *, int, thread_t, kauth_cred_t, struct nfsreq **); void nfs_request_destroy(struct nfsreq *); @@ -1205,14 +1151,14 @@ int nfs_request_add_header(struct nfsreq *); int nfs_request_send(struct nfsreq *, int); void nfs_request_wait(struct nfsreq *); int nfs_request_finish(struct nfsreq *, struct nfsm_chain *, int *); -int nfs_request(nfsnode_t, mount_t, struct nfsm_chain *, int, vfs_context_t, struct nfsm_chain *, u_int64_t *, int *); -int nfs_request2(nfsnode_t, mount_t, struct nfsm_chain *, int, thread_t, kauth_cred_t, int, struct nfsm_chain *, u_int64_t *, int *); +int nfs_request(nfsnode_t, mount_t, struct nfsm_chain *, int, vfs_context_t, struct nfsreq_secinfo_args *, struct nfsm_chain *, u_int64_t *, int *); +int nfs_request2(nfsnode_t, mount_t, struct nfsm_chain *, int, thread_t, kauth_cred_t, struct nfsreq_secinfo_args *, int, struct nfsm_chain *, u_int64_t *, int *); int nfs_request_gss(mount_t, struct nfsm_chain *, thread_t, kauth_cred_t, int, struct nfs_gss_clnt_ctx *, struct nfsm_chain *, int *); -int nfs_request_async(nfsnode_t, mount_t, struct nfsm_chain *, int, thread_t, kauth_cred_t, struct nfsreq_cbinfo *cb, struct nfsreq **); +int nfs_request_async(nfsnode_t, mount_t, struct nfsm_chain *, int, thread_t, kauth_cred_t, struct nfsreq_secinfo_args *, int, struct nfsreq_cbinfo *, struct nfsreq **); int nfs_request_async_finish(struct nfsreq *, struct nfsm_chain *, u_int64_t *, int *); void nfs_request_async_cancel(struct nfsreq *); void nfs_request_timer(void *, void *); -int 
nfs_aux_request(struct nfsmount *, thread_t, struct sockaddr_in *, mbuf_t, uint32_t, int, int, struct nfsm_chain *); +int nfs_request_using_gss(struct nfsreq *); void nfs_get_xid(uint64_t *); int nfs_sigintr(struct nfsmount *, struct nfsreq *, thread_t, int); int nfs_noremotehang(thread_t); @@ -1221,6 +1167,24 @@ int nfs_send(struct nfsreq *, int); int nfs_sndlock(struct nfsreq *); void nfs_sndunlock(struct nfsreq *); +int nfs_uaddr2sockaddr(const char *, struct sockaddr *); + +int nfs_aux_request(struct nfsmount *, thread_t, struct sockaddr *, socket_t, int, mbuf_t, uint32_t, int, int, struct nfsm_chain *); +int nfs_portmap_lookup(struct nfsmount *, vfs_context_t, struct sockaddr *, socket_t, uint32_t, uint32_t, uint32_t, int); + +void nfs_location_next(struct nfs_fs_locations *, struct nfs_location_index *); +int nfs_location_index_cmp(struct nfs_location_index *, struct nfs_location_index *); +void nfs_location_mntfromname(struct nfs_fs_locations *, struct nfs_location_index, char *, int, int); +int nfs_socket_create(struct nfsmount *, struct sockaddr *, int, in_port_t, uint32_t, uint32_t, int, struct nfs_socket **); +void nfs_socket_destroy(struct nfs_socket *); +void nfs_socket_options(struct nfsmount *, struct nfs_socket *); +void nfs_connect_upcall(socket_t, void *, int); +int nfs_connect_error_class(int); +int nfs_connect_search_loop(struct nfsmount *, struct nfs_socket_search *); +void nfs_socket_search_update_error(struct nfs_socket_search *, int); +void nfs_socket_search_cleanup(struct nfs_socket_search *); +void nfs_mount_connect_thread(void *, __unused wait_result_t); + int nfs_lookitup(nfsnode_t, char *, int, vfs_context_t, nfsnode_t *); void nfs_dulookup_init(struct nfs_dulookup *, nfsnode_t, const char *, int, vfs_context_t); void nfs_dulookup_start(struct nfs_dulookup *, nfsnode_t, vfs_context_t); @@ -1229,16 +1193,28 @@ int nfs_dir_buf_cache_lookup(nfsnode_t, nfsnode_t *, struct componentname *, vfs int nfs_dir_buf_search(struct nfsbuf *, struct 
componentname *, fhandle_t *, struct nfs_vattr *, uint64_t *, time_t *, daddr64_t *, int); void nfs_name_cache_purge(nfsnode_t, nfsnode_t, struct componentname *, vfs_context_t); +uint32_t nfs4_ace_nfstype_to_vfstype(uint32_t, int *); +uint32_t nfs4_ace_vfstype_to_nfstype(uint32_t, int *); +uint32_t nfs4_ace_nfsflags_to_vfsflags(uint32_t); +uint32_t nfs4_ace_vfsflags_to_nfsflags(uint32_t); +uint32_t nfs4_ace_nfsmask_to_vfsrights(uint32_t); +uint32_t nfs4_ace_vfsrights_to_nfsmask(uint32_t); +int nfs4_id2guid(char *, guid_t *, int); +int nfs4_guid2id(guid_t *, char *, int *, int); + int nfs_parsefattr(struct nfsm_chain *, int, struct nfs_vattr *); -int nfs4_parsefattr(struct nfsm_chain *, struct nfs_fsattr *, struct nfs_vattr *, fhandle_t *, struct dqblk *); +int nfs4_parsefattr(struct nfsm_chain *, struct nfs_fsattr *, struct nfs_vattr *, fhandle_t *, struct dqblk *, struct nfs_fs_locations *); void nfs_vattr_set_supported(uint32_t *, struct vnode_attr *); +void nfs_vattr_set_bitmap(struct nfsmount *, uint32_t *, struct vnode_attr *); void nfs3_pathconf_cache(struct nfsmount *, struct nfs_fsattr *); +int nfs3_mount_rpc(struct nfsmount *, struct sockaddr *, int, int, char *, vfs_context_t, int, fhandle_t *, struct nfs_sec *); void nfs3_umount_rpc(struct nfsmount *, vfs_context_t, int); -int nfs_node_mode_slot(nfsnode_t, uid_t, int); +int nfs_node_access_slot(nfsnode_t, uid_t, int); +void nfs_vnode_notify(nfsnode_t, uint32_t); void nfs_avoid_needless_id_setting_on_create(nfsnode_t, struct vnode_attr *, vfs_context_t); int nfs4_create_rpc(vfs_context_t, nfsnode_t, struct componentname *, struct vnode_attr *, int, char *, nfsnode_t *); -int nfs_open_state_set_busy(nfsnode_t, vfs_context_t); +int nfs_open_state_set_busy(nfsnode_t, thread_t); void nfs_open_state_clear_busy(nfsnode_t); struct nfs_open_owner *nfs_open_owner_find(struct nfsmount *, kauth_cred_t, int); void nfs_open_owner_destroy(struct nfs_open_owner *); @@ -1248,21 +1224,34 @@ int 
nfs_open_owner_set_busy(struct nfs_open_owner *, thread_t); void nfs_open_owner_clear_busy(struct nfs_open_owner *); void nfs_owner_seqid_increment(struct nfs_open_owner *, struct nfs_lock_owner *, int); int nfs_open_file_find(nfsnode_t, struct nfs_open_owner *, struct nfs_open_file **, uint32_t, uint32_t, int); +int nfs_open_file_find_internal(nfsnode_t, struct nfs_open_owner *, struct nfs_open_file **, uint32_t, uint32_t, int); void nfs_open_file_destroy(struct nfs_open_file *); int nfs_open_file_set_busy(struct nfs_open_file *, thread_t); void nfs_open_file_clear_busy(struct nfs_open_file *); +void nfs_open_file_add_open(struct nfs_open_file *, uint32_t, uint32_t, int); +void nfs_open_file_remove_open_find(struct nfs_open_file *, uint32_t, uint32_t, uint32_t *, uint32_t *, int*); +void nfs_open_file_remove_open(struct nfs_open_file *, uint32_t, uint32_t); void nfs_get_stateid(nfsnode_t, thread_t, kauth_cred_t, nfs_stateid *); int nfs4_open(nfsnode_t, struct nfs_open_file *, uint32_t, uint32_t, vfs_context_t); -int nfs4_close(nfsnode_t, struct nfs_open_file *, uint32_t, uint32_t, vfs_context_t); -int nfs4_check_for_locks(struct nfs_open_owner *, struct nfs_open_file *); -void nfs4_reopen(struct nfs_open_file *, thread_t); +int nfs4_open_delegated(nfsnode_t, struct nfs_open_file *, uint32_t, uint32_t, vfs_context_t); +int nfs_close(nfsnode_t, struct nfs_open_file *, uint32_t, uint32_t, vfs_context_t); +int nfs_check_for_locks(struct nfs_open_owner *, struct nfs_open_file *); +int nfs4_reopen(struct nfs_open_file *, thread_t); int nfs4_open_rpc(struct nfs_open_file *, vfs_context_t, struct componentname *, struct vnode_attr *, vnode_t, vnode_t *, int, int, int); int nfs4_open_rpc_internal(struct nfs_open_file *, vfs_context_t, thread_t, kauth_cred_t, struct componentname *, struct vnode_attr *, vnode_t, vnode_t *, int, int, int); +int nfs4_open_confirm_rpc(struct nfsmount *, nfsnode_t, u_char *, int, struct nfs_open_owner *, nfs_stateid *, thread_t, kauth_cred_t, 
struct nfs_vattr *, uint64_t *); int nfs4_open_reopen_rpc(struct nfs_open_file *, thread_t, kauth_cred_t, struct componentname *, vnode_t, vnode_t *, int, int); int nfs4_open_reclaim_rpc(struct nfs_open_file *, int, int); +int nfs4_claim_delegated_open_rpc(struct nfs_open_file *, int, int, int); +int nfs4_claim_delegated_state_for_open_file(struct nfs_open_file *, int); +int nfs4_claim_delegated_state_for_node(nfsnode_t, int); int nfs4_open_downgrade_rpc(nfsnode_t, struct nfs_open_file *, vfs_context_t); int nfs4_close_rpc(nfsnode_t, struct nfs_open_file *, thread_t, kauth_cred_t, int); -int nfs4_delegreturn_rpc(struct nfsmount *, u_char *, int, struct nfs_stateid *, thread_t, kauth_cred_t); +void nfs4_delegation_return_enqueue(nfsnode_t); +int nfs4_delegation_return(nfsnode_t, int, thread_t, kauth_cred_t); +int nfs4_delegreturn_rpc(struct nfsmount *, u_char *, int, struct nfs_stateid *, int, thread_t, kauth_cred_t); +void nfs_release_open_state_for_node(nfsnode_t, int); +void nfs_revoke_open_state_for_node(nfsnode_t); struct nfs_lock_owner *nfs_lock_owner_find(nfsnode_t, proc_t, int); void nfs_lock_owner_destroy(struct nfs_lock_owner *); void nfs_lock_owner_ref(struct nfs_lock_owner *); @@ -1273,37 +1262,52 @@ void nfs_lock_owner_insert_held_lock(struct nfs_lock_owner *, struct nfs_file_lo struct nfs_file_lock *nfs_file_lock_alloc(struct nfs_lock_owner *); void nfs_file_lock_destroy(struct nfs_file_lock *); int nfs_file_lock_conflict(struct nfs_file_lock *, struct nfs_file_lock *, int *); -int nfs4_lock_rpc(nfsnode_t, struct nfs_open_file *, struct nfs_file_lock *, int, thread_t, kauth_cred_t); -int nfs4_unlock_rpc(nfsnode_t, struct nfs_lock_owner *, int, uint64_t, uint64_t, vfs_context_t); -int nfs4_getlock(nfsnode_t, struct nfs_lock_owner *, struct flock *, uint64_t, uint64_t, vfs_context_t); -int nfs4_setlock(nfsnode_t, struct nfs_open_file *, struct nfs_lock_owner *, int, uint64_t, uint64_t, int, short, vfs_context_t); -int nfs4_unlock(nfsnode_t, struct 
nfs_open_file *, struct nfs_lock_owner *, uint64_t, uint64_t, int, vfs_context_t); +int nfs4_lock_rpc(nfsnode_t, struct nfs_open_file *, struct nfs_file_lock *, int, int, thread_t, kauth_cred_t); +int nfs_unlock_rpc(nfsnode_t, struct nfs_lock_owner *, int, uint64_t, uint64_t, thread_t, kauth_cred_t, int); +int nfs_advlock_getlock(nfsnode_t, struct nfs_lock_owner *, struct flock *, uint64_t, uint64_t, vfs_context_t); +int nfs_advlock_setlock(nfsnode_t, struct nfs_open_file *, struct nfs_lock_owner *, int, uint64_t, uint64_t, int, short, vfs_context_t); +int nfs_advlock_unlock(nfsnode_t, struct nfs_open_file *, struct nfs_lock_owner *, uint64_t, uint64_t, int, vfs_context_t); + +nfsnode_t nfs4_named_attr_dir_get(nfsnode_t, int, vfs_context_t); +int nfs4_named_attr_get(nfsnode_t, struct componentname *, uint32_t, int, vfs_context_t, nfsnode_t *, struct nfs_open_file **); +int nfs4_named_attr_remove(nfsnode_t, nfsnode_t, const char *, vfs_context_t); -int nfs_mount_state_in_use_start(struct nfsmount *); +int nfs_mount_state_in_use_start(struct nfsmount *, thread_t); int nfs_mount_state_in_use_end(struct nfsmount *, int); int nfs_mount_state_error_should_restart(int); +int nfs_mount_state_error_delegation_lost(int); uint nfs_mount_state_max_restarts(struct nfsmount *); int nfs_mount_state_wait_for_recovery(struct nfsmount *); -void nfs4_recover(struct nfsmount *); +void nfs_need_recover(struct nfsmount *nmp, int error); +void nfs_recover(struct nfsmount *); int nfs_vnop_access(struct vnop_access_args *); - -int nfs3_vnop_open(struct vnop_open_args *); -int nfs3_vnop_close(struct vnop_close_args *); +int nfs_vnop_remove(struct vnop_remove_args *); +int nfs_vnop_read(struct vnop_read_args *); +int nfs_vnop_write(struct vnop_write_args *); +int nfs_vnop_open(struct vnop_open_args *); +int nfs_vnop_close(struct vnop_close_args *); +int nfs_vnop_advlock(struct vnop_advlock_args *); +int nfs_vnop_mmap(struct vnop_mmap_args *); +int nfs_vnop_mnomap(struct vnop_mnomap_args *); 
int nfs4_vnop_create(struct vnop_create_args *); int nfs4_vnop_mknod(struct vnop_mknod_args *); -int nfs4_vnop_open(struct vnop_open_args *); int nfs4_vnop_close(struct vnop_close_args *); -int nfs4_vnop_mmap(struct vnop_mmap_args *); -int nfs4_vnop_mnomap(struct vnop_mnomap_args *); int nfs4_vnop_getattr(struct vnop_getattr_args *); -int nfs4_vnop_read(struct vnop_read_args *); int nfs4_vnop_link(struct vnop_link_args *); int nfs4_vnop_mkdir(struct vnop_mkdir_args *); int nfs4_vnop_rmdir(struct vnop_rmdir_args *); int nfs4_vnop_symlink(struct vnop_symlink_args *); -int nfs4_vnop_advlock(struct vnop_advlock_args *ap); +int nfs4_vnop_getxattr(struct vnop_getxattr_args *); +int nfs4_vnop_setxattr(struct vnop_setxattr_args *); +int nfs4_vnop_removexattr(struct vnop_removexattr_args *); +int nfs4_vnop_listxattr(struct vnop_listxattr_args *); +#if NAMEDSTREAMS +int nfs4_vnop_getnamedstream(struct vnop_getnamedstream_args *); +int nfs4_vnop_makenamedstream(struct vnop_makenamedstream_args *); +int nfs4_vnop_removenamedstream(struct vnop_removenamedstream_args *); +#endif int nfs_read_rpc(nfsnode_t, uio_t, vfs_context_t); int nfs_write_rpc(nfsnode_t, uio_t, vfs_context_t, int *, uint64_t *); @@ -1311,8 +1315,8 @@ int nfs_write_rpc2(nfsnode_t, uio_t, thread_t, kauth_cred_t, int *, uint64_t *); int nfs3_access_rpc(nfsnode_t, u_int32_t *, vfs_context_t); int nfs4_access_rpc(nfsnode_t, u_int32_t *, vfs_context_t); -int nfs3_getattr_rpc(nfsnode_t, mount_t, u_char *, size_t, vfs_context_t, struct nfs_vattr *, u_int64_t *); -int nfs4_getattr_rpc(nfsnode_t, mount_t, u_char *, size_t, vfs_context_t, struct nfs_vattr *, u_int64_t *); +int nfs3_getattr_rpc(nfsnode_t, mount_t, u_char *, size_t, int, vfs_context_t, struct nfs_vattr *, u_int64_t *); +int nfs4_getattr_rpc(nfsnode_t, mount_t, u_char *, size_t, int, vfs_context_t, struct nfs_vattr *, u_int64_t *); int nfs3_setattr_rpc(nfsnode_t, struct vnode_attr *, vfs_context_t); int nfs4_setattr_rpc(nfsnode_t, struct vnode_attr *, 
vfs_context_t); int nfs3_read_rpc_async(nfsnode_t, off_t, size_t, thread_t, kauth_cred_t, struct nfsreq_cbinfo *, struct nfsreq **); @@ -1327,18 +1331,24 @@ int nfs3_readdir_rpc(nfsnode_t, struct nfsbuf *, vfs_context_t); int nfs4_readdir_rpc(nfsnode_t, struct nfsbuf *, vfs_context_t); int nfs3_readlink_rpc(nfsnode_t, char *, uint32_t *, vfs_context_t); int nfs4_readlink_rpc(nfsnode_t, char *, uint32_t *, vfs_context_t); -int nfs3_commit_rpc(nfsnode_t, u_int64_t, u_int64_t, kauth_cred_t); -int nfs4_commit_rpc(nfsnode_t, u_int64_t, u_int64_t, kauth_cred_t); +int nfs3_commit_rpc(nfsnode_t, uint64_t, uint64_t, kauth_cred_t, uint64_t); +int nfs4_commit_rpc(nfsnode_t, uint64_t, uint64_t, kauth_cred_t, uint64_t); int nfs3_lookup_rpc_async(nfsnode_t, char *, int, vfs_context_t, struct nfsreq **); int nfs4_lookup_rpc_async(nfsnode_t, char *, int, vfs_context_t, struct nfsreq **); -int nfs3_lookup_rpc_async_finish(nfsnode_t, vfs_context_t, struct nfsreq *, u_int64_t *, fhandle_t *, struct nfs_vattr *); -int nfs4_lookup_rpc_async_finish(nfsnode_t, vfs_context_t, struct nfsreq *, u_int64_t *, fhandle_t *, struct nfs_vattr *); +int nfs3_lookup_rpc_async_finish(nfsnode_t, char *, int, vfs_context_t, struct nfsreq *, u_int64_t *, fhandle_t *, struct nfs_vattr *); +int nfs4_lookup_rpc_async_finish(nfsnode_t, char *, int, vfs_context_t, struct nfsreq *, u_int64_t *, fhandle_t *, struct nfs_vattr *); int nfs3_remove_rpc(nfsnode_t, char *, int, thread_t, kauth_cred_t); int nfs4_remove_rpc(nfsnode_t, char *, int, thread_t, kauth_cred_t); int nfs3_rename_rpc(nfsnode_t, char *, int, nfsnode_t, char *, int, vfs_context_t); int nfs4_rename_rpc(nfsnode_t, char *, int, nfsnode_t, char *, int, vfs_context_t); int nfs3_pathconf_rpc(nfsnode_t, struct nfs_fsattr *, vfs_context_t); int nfs4_pathconf_rpc(nfsnode_t, struct nfs_fsattr *, vfs_context_t); +int nfs3_setlock_rpc(nfsnode_t, struct nfs_open_file *, struct nfs_file_lock *, int, int, thread_t, kauth_cred_t); +int 
nfs4_setlock_rpc(nfsnode_t, struct nfs_open_file *, struct nfs_file_lock *, int, int, thread_t, kauth_cred_t); +int nfs3_unlock_rpc(nfsnode_t, struct nfs_lock_owner *, int, uint64_t, uint64_t, int, thread_t, kauth_cred_t); +int nfs4_unlock_rpc(nfsnode_t, struct nfs_lock_owner *, int, uint64_t, uint64_t, int, thread_t, kauth_cred_t); +int nfs3_getlock_rpc(nfsnode_t, struct nfs_lock_owner *, struct flock *, uint64_t, uint64_t, vfs_context_t); +int nfs4_getlock_rpc(nfsnode_t, struct nfs_lock_owner *, struct flock *, uint64_t, uint64_t, vfs_context_t); void nfsrv_active_user_list_reclaim(void); void nfsrv_cleancache(void); @@ -1363,7 +1373,7 @@ int nfsrv_is_initialized(void); int nfsrv_namei(struct nfsrv_descript *, vfs_context_t, struct nameidata *, struct nfs_filehandle *, vnode_t *, struct nfs_export **, struct nfs_export_options **); -void nfsrv_rcv(socket_t, caddr_t, int); +void nfsrv_rcv(socket_t, void *, int); void nfsrv_rcv_locked(socket_t, struct nfsrv_sock *, int); int nfsrv_rephead(struct nfsrv_descript *, struct nfsrv_sock *, struct nfsm_chain *, size_t); int nfsrv_send(struct nfsrv_sock *, mbuf_t, mbuf_t); @@ -1410,6 +1420,15 @@ struct nfs_diskless; int nfs_boot_init(struct nfs_diskless *); int nfs_boot_getfh(struct nfs_diskless *, int, int); +#if CONFIG_TRIGGERS +resolver_result_t nfs_mirror_mount_trigger_resolve(vnode_t, const struct componentname *, enum path_operation, int, void *, vfs_context_t); +resolver_result_t nfs_mirror_mount_trigger_unresolve(vnode_t, int, void *, vfs_context_t); +resolver_result_t nfs_mirror_mount_trigger_rearm(vnode_t, int, void *, vfs_context_t); +int nfs_mirror_mount_domount(vnode_t, vnode_t, vfs_context_t); +void nfs_ephemeral_mount_harvester_start(void); +void nfs_ephemeral_mount_harvester(__unused void *arg, __unused wait_result_t wr); +#endif + __END_DECLS #endif /* KERNEL */ diff --git a/bsd/nfs/nfs4_subs.c b/bsd/nfs/nfs4_subs.c index 6b1786cda..3d65f7985 100644 --- a/bsd/nfs/nfs4_subs.c +++ b/bsd/nfs/nfs4_subs.c @@ 
-1,5 +1,5 @@ /* - * Copyright (c) 2006-2009 Apple Inc. All rights reserved. + * Copyright (c) 2006-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -87,7 +87,11 @@ * * In an attempt to differentiate mounts we include the mntfromname and mntonname * strings to the client ID (as long as they fit). We also make sure that the - * value does not conflict with any existing values in use. + * value does not conflict with any existing values in use (changing the unique ID). + * + * Note that info such as the server's address may change over the lifetime of the + * mount. But the client ID will not be updated because we don't want it changing + * simply because we switched to a different server address. */ int nfs4_init_clientid(struct nfsmount *nmp) @@ -120,7 +124,7 @@ nfs4_init_clientid(struct nfsmount *nmp) return (ENOMEM); vsfs = vfs_statfs(nmp->nm_mountp); - saddr = mbuf_data(nmp->nm_nam); + saddr = nmp->nm_saddr; ncip->nci_idlen = sizeof(uint32_t) + sizeof(en0addr) + saddr->sa_len + strlen(vsfs->f_mntfromname) + 1 + strlen(vsfs->f_mntonname) + 1; if (ncip->nci_idlen > NFS4_OPAQUE_LIMIT) @@ -199,10 +203,12 @@ nfs4_setclientid(struct nfsmount *nmp) thread_t thd; kauth_cred_t cred; struct nfsm_chain nmreq, nmrep; - struct sockaddr_in sin; - uint8_t *addr; - char raddr[32]; - int ralen = 0; + struct sockaddr_storage ss; + void *sinaddr = NULL; + char raddr[MAX_IPv6_STR_LEN]; + char uaddr[MAX_IPv6_STR_LEN+16]; + int ualen = 0; + in_port_t port; thd = current_thread(); cred = IS_VALID_CRED(nmp->nm_mcred) ? 
nmp->nm_mcred : vfs_context_ucred(vfs_context_kernel()); @@ -224,26 +230,35 @@ nfs4_setclientid(struct nfsmount *nmp) nfsm_chain_add_64(error, &nmreq, nmp->nm_mounttime); nfsm_chain_add_32(error, &nmreq, nmp->nm_longid->nci_idlen); nfsm_chain_add_opaque(error, &nmreq, nmp->nm_longid->nci_id, nmp->nm_longid->nci_idlen); + nfsmout_if(error); /* cb_client4 callback; */ - if (nmp->nm_cbid && nfs4_cb_port && - !(error = sock_getsockname(nmp->nm_so, (struct sockaddr*)&sin, sizeof(sin)))) { - /* assemble r_addr = h1.h2.h3.h4.p1.p2 */ - /* h = source address of nmp->nm_so */ - /* p = nfs4_cb_port */ - addr = (uint8_t*)&sin.sin_addr.s_addr; - ralen = snprintf(raddr, sizeof(raddr), "%d.%d.%d.%d.%d.%d", - addr[0], addr[1], addr[2], addr[3], - ((nfs4_cb_port >> 8) & 0xff), - (nfs4_cb_port & 0xff)); - /* make sure it fit, give up if it didn't */ - if (ralen >= (int)sizeof(raddr)) - ralen = 0; - } - if (ralen > 0) { + if (!NMFLAG(nmp, NOCALLBACK) && nmp->nm_cbid && nfs4_cb_port && + !sock_getsockname(nmp->nm_nso->nso_so, (struct sockaddr*)&ss, sizeof(ss))) { + if (ss.ss_family == AF_INET) { + sinaddr = &((struct sockaddr_in*)&ss)->sin_addr; + port = nfs4_cb_port; + } else if (ss.ss_family == AF_INET6) { + sinaddr = &((struct sockaddr_in6*)&ss)->sin6_addr; + port = nfs4_cb_port6; + } + if (sinaddr && port && (inet_ntop(ss.ss_family, sinaddr, raddr, sizeof(raddr)) == raddr)) { + /* assemble r_addr = universal address (nmp->nm_nso->nso_so source IP addr + port) */ + ualen = snprintf(uaddr, sizeof(uaddr), "%s.%d.%d", raddr, + ((port >> 8) & 0xff), + (port & 0xff)); + /* make sure it fit, give up if it didn't */ + if (ualen >= (int)sizeof(uaddr)) + ualen = 0; + } + } + if (ualen > 0) { /* add callback info */ nfsm_chain_add_32(error, &nmreq, NFS4_CALLBACK_PROG); /* callback program */ - nfsm_chain_add_string(error, &nmreq, "tcp", 3); /* callback r_netid */ - nfsm_chain_add_string(error, &nmreq, raddr, ralen); /* callback r_addr */ + if (ss.ss_family == AF_INET) + 
nfsm_chain_add_string(error, &nmreq, "tcp", 3); /* callback r_netid */ + else if (ss.ss_family == AF_INET6) + nfsm_chain_add_string(error, &nmreq, "tcp6", 4); /* callback r_netid */ + nfsm_chain_add_string(error, &nmreq, uaddr, ualen); /* callback r_addr */ nfsm_chain_add_32(error, &nmreq, nmp->nm_cbid); /* callback_ident */ } else { /* don't provide valid callback info */ @@ -255,9 +270,11 @@ nfs4_setclientid(struct nfsmount *nmp) nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request2(NULL, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, thd, cred, R_SETUP, &nmrep, &xid, &status); + error = nfs_request2(NULL, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, thd, cred, NULL, R_SETUP, &nmrep, &xid, &status); nfsm_chain_skip_tag(error, &nmrep); nfsm_chain_get_32(error, &nmrep, numops); + if (!error && (numops != 1) && status) + error = status; nfsm_chain_op_check(error, &nmrep, NFS_OP_SETCLIENTID); if (error == NFSERR_CLID_INUSE) printf("nfs4_setclientid: client ID in use?\n"); @@ -267,43 +284,57 @@ nfs4_setclientid(struct nfsmount *nmp) nfsm_chain_cleanup(&nmreq); nfsm_chain_cleanup(&nmrep); - // SETCLIENTID_CONFIRM, PUTFH, GETATTR(FS) - numops = nmp->nm_dnp ? 
3 : 1; - nfsm_chain_build_alloc_init(error, &nmreq, 28 * NFSX_UNSIGNED); + // SETCLIENTID_CONFIRM + numops = 1; + nfsm_chain_build_alloc_init(error, &nmreq, 15 * NFSX_UNSIGNED); nfsm_chain_add_compound_header(error, &nmreq, "setclid_conf", numops); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_SETCLIENTID_CONFIRM); nfsm_chain_add_64(error, &nmreq, nmp->nm_clientid); nfsm_chain_add_64(error, &nmreq, verifier); - if (nmp->nm_dnp) { - /* refresh fs attributes too */ - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); - nfsm_chain_add_fh(error, &nmreq, nmp->nm_vers, nmp->nm_dnp->n_fhp, nmp->nm_dnp->n_fhsize); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - NFS_CLEAR_ATTRIBUTES(bitmap); - NFS4_PER_FS_ATTRIBUTES(bitmap); - nfsm_chain_add_bitmap(error, &nmreq, bitmap, NFS_ATTR_BITMAP_LEN); - } nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request2(NULL, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, thd, cred, R_SETUP, &nmrep, &xid, &status); + error = nfs_request2(NULL, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, thd, cred, NULL, R_SETUP, &nmrep, &xid, &status); nfsm_chain_skip_tag(error, &nmrep); nfsm_chain_get_32(error, &nmrep, numops); nfsm_chain_op_check(error, &nmrep, NFS_OP_SETCLIENTID_CONFIRM); if (error) printf("nfs4_setclientid: confirm error %d\n", error); - if (nmp->nm_dnp) { - nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); - nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsmout_if(error); - lck_mtx_lock(&nmp->nm_lock); - error = nfs4_parsefattr(&nmrep, &nmp->nm_fsattr, NULL, NULL, NULL); - lck_mtx_unlock(&nmp->nm_lock); - } + lck_mtx_lock(&nmp->nm_lock); + if (!error) + nmp->nm_state |= NFSSTA_CLIENTID; + lck_mtx_unlock(&nmp->nm_lock); + nfsmout_if(error || !nmp->nm_dnp); + + /* take the opportunity to refresh fs attributes too */ + // PUTFH, GETATTR(FS) + numops = 2; + nfsm_chain_build_alloc_init(error, &nmreq, 23 * NFSX_UNSIGNED); + 
nfsm_chain_add_compound_header(error, &nmreq, "setclid_attr", numops); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, nmp->nm_vers, nmp->nm_dnp->n_fhp, nmp->nm_dnp->n_fhsize); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); + NFS_CLEAR_ATTRIBUTES(bitmap); + NFS4_PER_FS_ATTRIBUTES(bitmap); + nfsm_chain_add_bitmap(error, &nmreq, bitmap, NFS_ATTR_BITMAP_LEN); + nfsm_chain_build_done(error, &nmreq); + nfsm_assert(error, (numops == 0), EPROTO); + nfsmout_if(error); + error = nfs_request2(NULL, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, thd, cred, NULL, R_SETUP, &nmrep, &xid, &status); + nfsm_chain_skip_tag(error, &nmrep); + nfsm_chain_get_32(error, &nmrep, numops); + lck_mtx_lock(&nmp->nm_lock); + nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); + nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); + if (!error) + error = nfs4_parsefattr(&nmrep, &nmp->nm_fsattr, NULL, NULL, NULL, NULL); + lck_mtx_unlock(&nmp->nm_lock); + if (error) /* ignore any error from the getattr */ + error = 0; nfsmout: nfsm_chain_cleanup(&nmreq); nfsm_chain_cleanup(&nmrep); @@ -341,7 +372,7 @@ nfs4_renew(struct nfsmount *nmp, int rpcflag) nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); error = nfs_request2(NULL, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, - current_thread(), cred, rpcflag, &nmrep, &xid, &status); + current_thread(), cred, NULL, rpcflag, &nmrep, &xid, &status); nfsm_chain_skip_tag(error, &nmrep); nfsm_chain_get_32(error, &nmrep, numops); nfsm_chain_op_check(error, &nmrep, NFS_OP_RENEW); @@ -381,8 +412,7 @@ nfs4_renew_timer(void *param0, __unused void *param1) if (error && (error != ETIMEDOUT) && (nmp->nm_clientid == clientid) && !(nmp->nm_state & NFSSTA_RECOVER)) { printf("nfs4_renew_timer: error %d, initiating recovery\n", error); - nmp->nm_state |= NFSSTA_RECOVER; - nfs_mount_sock_thread_wake(nmp); + nfs_need_recover(nmp, error); } interval = nmp->nm_fsattr.nfsa_lease / (error ? 
4 : 2); @@ -392,6 +422,1034 @@ nfs4_renew_timer(void *param0, __unused void *param1) nfs_interval_timer_start(nmp->nm_renew_timer, interval * 1000); } +/* + * get the list of supported security flavors + * + * How we get them depends on what args we are given: + * + * FH? Name? Action + * ----- ----- ------ + * YES YES Use the fh and name provided + * YES NO 4.1-only just use the fh provided + * NO YES Use the node's (or root) fh and the name provided + * NO NO Use the node's parent and the node's name (4.1 will just use node's fh) + */ +int +nfs4_secinfo_rpc(struct nfsmount *nmp, struct nfsreq_secinfo_args *siap, kauth_cred_t cred, uint32_t *sec, int *seccountp) +{ + int error = 0, status, nfsvers, numops, namelen, fhsize; + vnode_t dvp = NULLVP; + nfsnode_t np, dnp; + u_char *fhp; + const char *vname = NULL, *name; + uint64_t xid; + struct nfsm_chain nmreq, nmrep; + + *seccountp = 0; + if (!nmp) + return (ENXIO); + nfsvers = nmp->nm_vers; + np = siap->rsia_np; + + nfsm_chain_null(&nmreq); + nfsm_chain_null(&nmrep); + + fhp = siap->rsia_fh; + fhsize = fhp ? siap->rsia_fhsize : 0; + name = siap->rsia_name; + namelen = name ? siap->rsia_namelen : 0; + if (name && !namelen) + namelen = strlen(name); + if (!fhp && name) { + if (!np) /* use PUTROOTFH */ + goto gotargs; + fhp = np->n_fhp; + fhsize = np->n_fhsize; + } + if (fhp && name) + goto gotargs; + + if (!np) + return (EIO); + nfs_node_lock_force(np); + if ((vnode_vtype(NFSTOV(np)) != VDIR) && np->n_sillyrename) { + /* + * The node's been sillyrenamed, so we need to use + * the sillyrename directory/name to do the open. + */ + struct nfs_sillyrename *nsp = np->n_sillyrename; + dnp = nsp->nsr_dnp; + dvp = NFSTOV(dnp); + if ((error = vnode_get(dvp))) { + nfs_node_unlock(np); + goto nfsmout; + } + fhp = dnp->n_fhp; + fhsize = dnp->n_fhsize; + name = nsp->nsr_name; + namelen = nsp->nsr_namlen; + } else { + /* + * [sigh] We can't trust VFS to get the parent right for named + * attribute nodes. 
(It likes to reparent the nodes after we've + * created them.) Luckily we can probably get the right parent + * from the n_parent we have stashed away. + */ + if ((np->n_vattr.nva_flags & NFS_FFLAG_IS_ATTR) && + (((dvp = np->n_parent)) && (error = vnode_get(dvp)))) + dvp = NULL; + if (!dvp) + dvp = vnode_getparent(NFSTOV(np)); + vname = vnode_getname(NFSTOV(np)); + if (!dvp || !vname) { + if (!error) + error = EIO; + nfs_node_unlock(np); + goto nfsmout; + } + dnp = VTONFS(dvp); + fhp = dnp->n_fhp; + fhsize = dnp->n_fhsize; + name = vname; + namelen = strnlen(vname, MAXPATHLEN); + } + nfs_node_unlock(np); + +gotargs: + // PUT(ROOT)FH + SECINFO + numops = 2; + nfsm_chain_build_alloc_init(error, &nmreq, + 4 * NFSX_UNSIGNED + NFSX_FH(nfsvers) + nfsm_rndup(namelen)); + nfsm_chain_add_compound_header(error, &nmreq, "secinfo", numops); + numops--; + if (fhp) { + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, nfsvers, fhp, fhsize); + } else { + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTROOTFH); + } + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_SECINFO); + nfsm_chain_add_name(error, &nmreq, name, namelen, nmp); + nfsm_chain_build_done(error, &nmreq); + nfsm_assert(error, (numops == 0), EPROTO); + nfsmout_if(error); + error = nfs_request2(np, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, + current_thread(), cred, NULL, 0, &nmrep, &xid, &status); + nfsm_chain_skip_tag(error, &nmrep); + nfsm_chain_get_32(error, &nmrep, numops); + nfsm_chain_op_check(error, &nmrep, fhp ? NFS_OP_PUTFH : NFS_OP_PUTROOTFH); + nfsm_chain_op_check(error, &nmrep, NFS_OP_SECINFO); + nfsmout_if(error); + error = nfsm_chain_get_secinfo(&nmrep, sec, seccountp); +nfsmout: + nfsm_chain_cleanup(&nmreq); + nfsm_chain_cleanup(&nmrep); + if (vname) + vnode_putname(vname); + if (dvp != NULLVP) + vnode_put(dvp); + return (error); +} + +/* + * Parse an NFSv4 SECINFO array to an array of pseudo flavors. + * (Note: also works for MOUNTv3 security arrays.) 
+ */ +int +nfsm_chain_get_secinfo(struct nfsm_chain *nmc, uint32_t *sec, int *seccountp) +{ + int error = 0, secmax, seccount, srvcount; + uint32_t flavor, val; + u_char oid[12]; + + seccount = srvcount = 0; + secmax = *seccountp; + *seccountp = 0; + + nfsm_chain_get_32(error, nmc, srvcount); + while (!error && (srvcount > 0) && (seccount < secmax)) { + nfsm_chain_get_32(error, nmc, flavor); + nfsmout_if(error); + switch (flavor) { + case RPCAUTH_NONE: + case RPCAUTH_SYS: + case RPCAUTH_KRB5: + case RPCAUTH_KRB5I: + case RPCAUTH_KRB5P: + sec[seccount++] = flavor; + break; + case RPCSEC_GSS: + /* we only recognize KRB5, KRB5I, KRB5P */ + nfsm_chain_get_32(error, nmc, val); /* OID length */ + nfsmout_if(error); + if (val != sizeof(krb5_mech)) { + nfsm_chain_adv(error, nmc, val); + nfsm_chain_adv(error, nmc, 2*NFSX_UNSIGNED); + break; + } + nfsm_chain_get_opaque(error, nmc, val, oid); /* OID bytes */ + nfsmout_if(error); + if (bcmp(oid, krb5_mech, sizeof(krb5_mech))) { + nfsm_chain_adv(error, nmc, 2*NFSX_UNSIGNED); + break; + } + nfsm_chain_get_32(error, nmc, val); /* QOP */ + nfsm_chain_get_32(error, nmc, val); /* SERVICE */ + nfsmout_if(error); + switch (val) { + case RPCSEC_GSS_SVC_NONE: + sec[seccount++] = RPCAUTH_KRB5; + break; + case RPCSEC_GSS_SVC_INTEGRITY: + sec[seccount++] = RPCAUTH_KRB5I; + break; + case RPCSEC_GSS_SVC_PRIVACY: + sec[seccount++] = RPCAUTH_KRB5P; + break; + } + break; + } + srvcount--; + } +nfsmout: + if (!error) + *seccountp = seccount; + return (error); +} + + +/* + * Fetch the FS_LOCATIONS attribute for the node found at directory/name. 
+ *
+ * The directory is identified by fhp/fhsize; if fhp is NULL the file
+ * handle of dnp is used instead.  Returns EINVAL if neither yields a
+ * file handle.  On success the parsed locations are stored in *nfslsp.
+ */
+int
+nfs4_get_fs_locations(
+	struct nfsmount *nmp,
+	nfsnode_t dnp,
+	u_char *fhp,
+	int fhsize,
+	const char *name,
+	vfs_context_t ctx,
+	struct nfs_fs_locations *nfslsp)
+{
+	int error = 0, numops, status;
+	uint32_t bitmap[NFS_ATTR_BITMAP_LEN];
+	struct nfsreq rq, *req = &rq;
+	struct nfsreq_secinfo_args si;
+	struct nfsm_chain nmreq, nmrep;
+	uint64_t xid;
+
+	/* fall back to the directory node's file handle */
+	if (!fhp && dnp) {
+		fhp = dnp->n_fhp;
+		fhsize = dnp->n_fhsize;
+	}
+	if (!fhp)
+		return (EINVAL);
+
+	nfsm_chain_null(&nmreq);
+	nfsm_chain_null(&nmrep);
+
+	NFSREQ_SECINFO_SET(&si, NULL, fhp, fhsize, name, 0);
+	/* PUTFH, LOOKUP, GETATTR(FS_LOCATIONS) */
+	numops = 3;
+	nfsm_chain_build_alloc_init(error, &nmreq, 18 * NFSX_UNSIGNED);
+	nfsm_chain_add_compound_header(error, &nmreq, "fs_locations", numops);
+	numops--;
+	nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH);
+	nfsm_chain_add_fh(error, &nmreq, NFS_VER4, fhp, fhsize);
+	numops--;
+	nfsm_chain_add_32(error, &nmreq, NFS_OP_LOOKUP);
+	nfsm_chain_add_name(error, &nmreq, name, strlen(name), nmp);
+	numops--;
+	nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR);
+	/* request only the FS_LOCATIONS attribute */
+	NFS_CLEAR_ATTRIBUTES(bitmap);
+	NFS_BITMAP_SET(bitmap, NFS_FATTR_FS_LOCATIONS);
+	nfsm_chain_add_bitmap(error, &nmreq, bitmap, NFS_ATTR_BITMAP_LEN);
+	nfsm_chain_build_done(error, &nmreq);
+	nfsm_assert(error, (numops == 0), EPROTO);
+	nfsmout_if(error);
+	error = nfs_request_async(dnp, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND,
+			vfs_context_thread(ctx), vfs_context_ucred(ctx), &si, 0, NULL, &req);
+	if (!error)
+		error = nfs_request_async_finish(req, &nmrep, &xid, &status);
+	nfsm_chain_skip_tag(error, &nmrep);
+	nfsm_chain_get_32(error, &nmrep, numops);
+	nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH);
+	nfsm_chain_op_check(error, &nmrep, NFS_OP_LOOKUP);
+	nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR);
+	nfsmout_if(error);
+	/* only the fs locations output is wanted from the attribute parser */
+	error = nfs4_parsefattr(&nmrep, NULL, NULL, NULL, NULL, nfslsp);
+nfsmout:
+	nfsm_chain_cleanup(&nmrep);
+	nfsm_chain_cleanup(&nmreq);
+	return (error);
+}
+
+/*
+ * Referral trigger nodes may not
have many attributes provided by the
+ * server, so put some default values in place.
+ *
+ * nva_flags is overwritten with the trigger/referral flags.  Every other
+ * attribute is filled in only if its bit is not already set in
+ * nvap->nva_bitmap, and the bit is then set.  Owner and group defaults are
+ * copied from dnp when it is given.  A fake file handle is built in *fhp
+ * only when dnp, name and fhp are all non-NULL.
+ */
+void
+nfs4_default_attrs_for_referral_trigger(
+	nfsnode_t dnp,
+	char *name,
+	int namelen,
+	struct nfs_vattr *nvap,
+	fhandle_t *fhp)
+{
+	struct timeval now;
+	microtime(&now);
+	int len;
+
+	nvap->nva_flags = NFS_FFLAG_TRIGGER | NFS_FFLAG_TRIGGER_REFERRAL;
+	if (!NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_TYPE)) {
+		NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_TYPE);
+		nvap->nva_type = VDIR;
+	}
+	if (!NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_FSID)) {
+		NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_FSID);
+		nvap->nva_fsid.major = 0;
+		nvap->nva_fsid.minor = 0;
+	}
+	if (!NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_OWNER) && dnp) {
+		NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_OWNER);
+		nvap->nva_uid = dnp->n_vattr.nva_uid;
+		nvap->nva_uuuid = dnp->n_vattr.nva_uuuid;
+	}
+	if (!NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_OWNER_GROUP) && dnp) {
+		NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_OWNER_GROUP);
+		nvap->nva_gid = dnp->n_vattr.nva_gid;
+		nvap->nva_guuid = dnp->n_vattr.nva_guuid;
+	}
+	if (!NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_MODE)) {
+		NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_MODE);
+		nvap->nva_mode = 0777;
+	}
+	if (!NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_SIZE)) {
+		NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_SIZE);
+		nvap->nva_size = 0;
+	}
+	if (!NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_SPACE_USED)) {
+		NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_SPACE_USED);
+		nvap->nva_bytes = 0;
+	}
+	if (!NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_NUMLINKS)) {
+		NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_NUMLINKS);
+		nvap->nva_nlink = 2;
+	}
+	/* missing timestamps default to the current time (usec -> nsec) */
+	if (!NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_TIME_ACCESS)) {
+		NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_TIME_ACCESS);
+		nvap->nva_timesec[NFSTIME_ACCESS] = now.tv_sec;
+		nvap->nva_timensec[NFSTIME_ACCESS] = now.tv_usec * 1000;
+	}
+	if (!NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_TIME_MODIFY)) {
+		NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_TIME_MODIFY);
+		nvap->nva_timesec[NFSTIME_MODIFY] = now.tv_sec;
+		nvap->nva_timensec[NFSTIME_MODIFY] = now.tv_usec * 1000;
+	}
+	if (!NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_TIME_METADATA)) {
+		NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_TIME_METADATA);
+		nvap->nva_timesec[NFSTIME_CHANGE] = now.tv_sec;
+		nvap->nva_timensec[NFSTIME_CHANGE] = now.tv_usec * 1000;
+	}
+	if (!NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_FILEID)) {
+		NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_FILEID);
+		nvap->nva_fileid = 42;
+	}
+	if (!NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_FILEHANDLE) && dnp && name && fhp) {
+		/* Build a fake filehandle made up of parent node pointer and name */
+		NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_FILEHANDLE);
+		bcopy(&dnp, &fhp->fh_data[0], sizeof(dnp));
+		/* room left for the name after the pointer; the copy is truncated to fit */
+		len = sizeof(fhp->fh_data) - sizeof(dnp);
+		bcopy(name, &fhp->fh_data[0] + sizeof(dnp), MIN(len, namelen));
+		/* fh_len is clamped to the size of fh_data */
+		fhp->fh_len = sizeof(dnp) + namelen;
+		if (fhp->fh_len > (int)sizeof(fhp->fh_data))
+			fhp->fh_len = sizeof(fhp->fh_data);
+	}
+}
+
+/*
+ * Set NFS bitmap according to what's set in vnode_attr (and supported by the server).
+ *
+ * The bitmap is cleared first, then one or more NFS attribute bits are set
+ * for each active vnode_attr field, and finally the result is masked with
+ * nmp->nm_fsattr.nfsa_supp_attr.
+ */
+void
+nfs_vattr_set_bitmap(struct nfsmount *nmp, uint32_t *bitmap, struct vnode_attr *vap)
+{
+	int i;
+
+	NFS_CLEAR_ATTRIBUTES(bitmap);
+	if (VATTR_IS_ACTIVE(vap, va_data_size))
+		NFS_BITMAP_SET(bitmap, NFS_FATTR_SIZE);
+	if (VATTR_IS_ACTIVE(vap, va_acl) && (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_ACL))
+		NFS_BITMAP_SET(bitmap, NFS_FATTR_ACL);
+	/* va_flags maps onto two NFS attributes */
+	if (VATTR_IS_ACTIVE(vap, va_flags)) {
+		NFS_BITMAP_SET(bitmap, NFS_FATTR_ARCHIVE);
+		NFS_BITMAP_SET(bitmap, NFS_FATTR_HIDDEN);
+	}
+	// NFS_BITMAP_SET(bitmap, NFS_FATTR_MIMETYPE)
+	if (VATTR_IS_ACTIVE(vap, va_mode) && !NMFLAG(nmp, ACLONLY))
+		NFS_BITMAP_SET(bitmap, NFS_FATTR_MODE);
+	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_uuuid))
+		NFS_BITMAP_SET(bitmap, NFS_FATTR_OWNER);
+	if (VATTR_IS_ACTIVE(vap, va_gid) || VATTR_IS_ACTIVE(vap, va_guuid))
+		NFS_BITMAP_SET(bitmap, NFS_FATTR_OWNER_GROUP);
+	// NFS_BITMAP_SET(bitmap, NFS_FATTR_SYSTEM)
+	/* VA_UTIMES_NULL requests both access and modify times */
+	if (vap->va_vaflags & VA_UTIMES_NULL) {
+		NFS_BITMAP_SET(bitmap, NFS_FATTR_TIME_ACCESS_SET);
+		NFS_BITMAP_SET(bitmap, NFS_FATTR_TIME_MODIFY_SET);
+	} else {
+		if (VATTR_IS_ACTIVE(vap, va_access_time))
+			NFS_BITMAP_SET(bitmap, NFS_FATTR_TIME_ACCESS_SET);
+		if (VATTR_IS_ACTIVE(vap, va_modify_time))
+			NFS_BITMAP_SET(bitmap, NFS_FATTR_TIME_MODIFY_SET);
+	}
+	if (VATTR_IS_ACTIVE(vap, va_backup_time))
+		NFS_BITMAP_SET(bitmap, NFS_FATTR_TIME_BACKUP);
+	if (VATTR_IS_ACTIVE(vap, va_create_time))
+		NFS_BITMAP_SET(bitmap, NFS_FATTR_TIME_CREATE);
+	/* and limit to what is supported by server */
+	for (i=0; i < NFS_ATTR_BITMAP_LEN; i++)
+		bitmap[i] &= nmp->nm_fsattr.nfsa_supp_attr[i];
+}
+
+/*
+ * Convert between NFSv4 and VFS ACE types
+ */
+/* Unknown NFS ACE types set *errorp to EBADRPC and return 0; *errorp is untouched otherwise. */
+uint32_t
+nfs4_ace_nfstype_to_vfstype(uint32_t nfsacetype, int *errorp)
+{
+	switch (nfsacetype) {
+	case NFS_ACE_ACCESS_ALLOWED_ACE_TYPE:
+		return KAUTH_ACE_PERMIT;
+	case NFS_ACE_ACCESS_DENIED_ACE_TYPE:
+		return KAUTH_ACE_DENY;
+	case NFS_ACE_SYSTEM_AUDIT_ACE_TYPE:
+		return KAUTH_ACE_AUDIT;
+	case NFS_ACE_SYSTEM_ALARM_ACE_TYPE:
+		return KAUTH_ACE_ALARM;
+	}
+	*errorp = EBADRPC;
+	return 0;
+}
+
+/* Unknown VFS ACE types set *errorp to EINVAL and return 0; *errorp is untouched otherwise. */
+uint32_t
+nfs4_ace_vfstype_to_nfstype(uint32_t vfstype, int *errorp)
+{
+	switch (vfstype) {
+	case KAUTH_ACE_PERMIT:
+		return NFS_ACE_ACCESS_ALLOWED_ACE_TYPE;
+	case KAUTH_ACE_DENY:
+		return NFS_ACE_ACCESS_DENIED_ACE_TYPE;
+	case KAUTH_ACE_AUDIT:
+		return NFS_ACE_SYSTEM_AUDIT_ACE_TYPE;
+	case KAUTH_ACE_ALARM:
+		return NFS_ACE_SYSTEM_ALARM_ACE_TYPE;
+	}
+	*errorp = EINVAL;
+	return 0;
+}
+
+/*
+ * Convert between NFSv4 and VFS ACE flags
+ */
+/* Flag bits with no mapping below are dropped. */
+uint32_t
+nfs4_ace_nfsflags_to_vfsflags(uint32_t nfsflags)
+{
+	uint32_t vfsflags = 0;
+
+	if (nfsflags & NFS_ACE_FILE_INHERIT_ACE)
+		vfsflags |= KAUTH_ACE_FILE_INHERIT;
+	if (nfsflags & NFS_ACE_DIRECTORY_INHERIT_ACE)
+		vfsflags |= KAUTH_ACE_DIRECTORY_INHERIT;
+	if (nfsflags & NFS_ACE_NO_PROPAGATE_INHERIT_ACE)
+		vfsflags |= KAUTH_ACE_LIMIT_INHERIT;
+	if (nfsflags & NFS_ACE_INHERIT_ONLY_ACE)
+		vfsflags |= KAUTH_ACE_ONLY_INHERIT;
+	if (nfsflags & NFS_ACE_SUCCESSFUL_ACCESS_ACE_FLAG)
+		vfsflags |= KAUTH_ACE_SUCCESS;
+	if (nfsflags & NFS_ACE_FAILED_ACCESS_ACE_FLAG)
+		vfsflags |= KAUTH_ACE_FAILURE;
+	if (nfsflags & NFS_ACE_INHERITED_ACE)
+		vfsflags |= KAUTH_ACE_INHERITED;
+
+	return (vfsflags);
+}
+
+/* Inverse of the mapping above; flag bits with no mapping are dropped. */
+uint32_t
+nfs4_ace_vfsflags_to_nfsflags(uint32_t vfsflags)
+{
+	uint32_t nfsflags = 0;
+
+	if (vfsflags & KAUTH_ACE_FILE_INHERIT)
+		nfsflags |= NFS_ACE_FILE_INHERIT_ACE;
+	if (vfsflags & KAUTH_ACE_DIRECTORY_INHERIT)
+		nfsflags |= NFS_ACE_DIRECTORY_INHERIT_ACE;
+	if (vfsflags & KAUTH_ACE_LIMIT_INHERIT)
+		nfsflags |= NFS_ACE_NO_PROPAGATE_INHERIT_ACE;
+	if (vfsflags & KAUTH_ACE_ONLY_INHERIT)
+		nfsflags |= NFS_ACE_INHERIT_ONLY_ACE;
+	if (vfsflags & KAUTH_ACE_SUCCESS)
+		nfsflags |= NFS_ACE_SUCCESSFUL_ACCESS_ACE_FLAG;
+	if (vfsflags & KAUTH_ACE_FAILURE)
+		nfsflags |= NFS_ACE_FAILED_ACCESS_ACE_FLAG;
+	if (vfsflags & KAUTH_ACE_INHERITED)
+		nfsflags |= NFS_ACE_INHERITED_ACE;
+
+	return (nfsflags);
+}
+
+/*
+ * Convert between NFSv4 ACE access masks and VFS access rights
+ */
+uint32_t
+nfs4_ace_nfsmask_to_vfsrights(uint32_t nfsmask) +{ + uint32_t vfsrights = 0; + + if (nfsmask & NFS_ACE_READ_DATA) + vfsrights |= KAUTH_VNODE_READ_DATA; + if (nfsmask & NFS_ACE_LIST_DIRECTORY) + vfsrights |= KAUTH_VNODE_LIST_DIRECTORY; + if (nfsmask & NFS_ACE_WRITE_DATA) + vfsrights |= KAUTH_VNODE_WRITE_DATA; + if (nfsmask & NFS_ACE_ADD_FILE) + vfsrights |= KAUTH_VNODE_ADD_FILE; + if (nfsmask & NFS_ACE_APPEND_DATA) + vfsrights |= KAUTH_VNODE_APPEND_DATA; + if (nfsmask & NFS_ACE_ADD_SUBDIRECTORY) + vfsrights |= KAUTH_VNODE_ADD_SUBDIRECTORY; + if (nfsmask & NFS_ACE_READ_NAMED_ATTRS) + vfsrights |= KAUTH_VNODE_READ_EXTATTRIBUTES; + if (nfsmask & NFS_ACE_WRITE_NAMED_ATTRS) + vfsrights |= KAUTH_VNODE_WRITE_EXTATTRIBUTES; + if (nfsmask & NFS_ACE_EXECUTE) + vfsrights |= KAUTH_VNODE_EXECUTE; + if (nfsmask & NFS_ACE_DELETE_CHILD) + vfsrights |= KAUTH_VNODE_DELETE_CHILD; + if (nfsmask & NFS_ACE_READ_ATTRIBUTES) + vfsrights |= KAUTH_VNODE_READ_ATTRIBUTES; + if (nfsmask & NFS_ACE_WRITE_ATTRIBUTES) + vfsrights |= KAUTH_VNODE_WRITE_ATTRIBUTES; + if (nfsmask & NFS_ACE_DELETE) + vfsrights |= KAUTH_VNODE_DELETE; + if (nfsmask & NFS_ACE_READ_ACL) + vfsrights |= KAUTH_VNODE_READ_SECURITY; + if (nfsmask & NFS_ACE_WRITE_ACL) + vfsrights |= KAUTH_VNODE_WRITE_SECURITY; + if (nfsmask & NFS_ACE_WRITE_OWNER) + vfsrights |= KAUTH_VNODE_CHANGE_OWNER; + if (nfsmask & NFS_ACE_SYNCHRONIZE) + vfsrights |= KAUTH_VNODE_SYNCHRONIZE; + if ((nfsmask & NFS_ACE_GENERIC_READ) == NFS_ACE_GENERIC_READ) + vfsrights |= KAUTH_ACE_GENERIC_READ; + if ((nfsmask & NFS_ACE_GENERIC_WRITE) == NFS_ACE_GENERIC_WRITE) + vfsrights |= KAUTH_ACE_GENERIC_WRITE; + if ((nfsmask & NFS_ACE_GENERIC_EXECUTE) == NFS_ACE_GENERIC_EXECUTE) + vfsrights |= KAUTH_ACE_GENERIC_EXECUTE; + + return (vfsrights); +} + +uint32_t +nfs4_ace_vfsrights_to_nfsmask(uint32_t vfsrights) +{ + uint32_t nfsmask = 0; + + if (vfsrights & KAUTH_VNODE_READ_DATA) + nfsmask |= NFS_ACE_READ_DATA; + if (vfsrights & KAUTH_VNODE_LIST_DIRECTORY) + nfsmask |= 
NFS_ACE_LIST_DIRECTORY; + if (vfsrights & KAUTH_VNODE_WRITE_DATA) + nfsmask |= NFS_ACE_WRITE_DATA; + if (vfsrights & KAUTH_VNODE_ADD_FILE) + nfsmask |= NFS_ACE_ADD_FILE; + if (vfsrights & KAUTH_VNODE_APPEND_DATA) + nfsmask |= NFS_ACE_APPEND_DATA; + if (vfsrights & KAUTH_VNODE_ADD_SUBDIRECTORY) + nfsmask |= NFS_ACE_ADD_SUBDIRECTORY; + if (vfsrights & KAUTH_VNODE_READ_EXTATTRIBUTES) + nfsmask |= NFS_ACE_READ_NAMED_ATTRS; + if (vfsrights & KAUTH_VNODE_WRITE_EXTATTRIBUTES) + nfsmask |= NFS_ACE_WRITE_NAMED_ATTRS; + if (vfsrights & KAUTH_VNODE_EXECUTE) + nfsmask |= NFS_ACE_EXECUTE; + if (vfsrights & KAUTH_VNODE_DELETE_CHILD) + nfsmask |= NFS_ACE_DELETE_CHILD; + if (vfsrights & KAUTH_VNODE_READ_ATTRIBUTES) + nfsmask |= NFS_ACE_READ_ATTRIBUTES; + if (vfsrights & KAUTH_VNODE_WRITE_ATTRIBUTES) + nfsmask |= NFS_ACE_WRITE_ATTRIBUTES; + if (vfsrights & KAUTH_VNODE_DELETE) + nfsmask |= NFS_ACE_DELETE; + if (vfsrights & KAUTH_VNODE_READ_SECURITY) + nfsmask |= NFS_ACE_READ_ACL; + if (vfsrights & KAUTH_VNODE_WRITE_SECURITY) + nfsmask |= NFS_ACE_WRITE_ACL; + if (vfsrights & KAUTH_VNODE_CHANGE_OWNER) + nfsmask |= NFS_ACE_WRITE_OWNER; + if (vfsrights & KAUTH_VNODE_SYNCHRONIZE) + nfsmask |= NFS_ACE_SYNCHRONIZE; + if (vfsrights & KAUTH_ACE_GENERIC_READ) + nfsmask |= NFS_ACE_GENERIC_READ; + if (vfsrights & KAUTH_ACE_GENERIC_WRITE) + nfsmask |= NFS_ACE_GENERIC_WRITE; + if (vfsrights & KAUTH_ACE_GENERIC_EXECUTE) + nfsmask |= NFS_ACE_GENERIC_EXECUTE; + if (vfsrights & KAUTH_ACE_GENERIC_ALL) + nfsmask |= (KAUTH_ACE_GENERIC_READ|KAUTH_ACE_GENERIC_WRITE|NFS_ACE_GENERIC_EXECUTE); + + return (nfsmask); +} + +/* + * Map an NFSv4 ID string to a VFS guid. + * + * Try to use the ID mapping service... but we may fallback to trying to do it ourselves. 
+ */ +int +nfs4_id2guid(/*const*/ char *id, guid_t *guidp, int isgroup) +{ + int error1 = 0, error = 0, compare; + guid_t guid1, guid2, *gp; + ntsid_t sid; + long num, unknown; + const char *p, *at; + + *guidp = kauth_null_guid; + compare = ((nfs_idmap_ctrl & NFS_IDMAP_CTRL_USE_IDMAP_SERVICE) && + (nfs_idmap_ctrl & NFS_IDMAP_CTRL_COMPARE_RESULTS)); + unknown = (nfs_idmap_ctrl & NFS_IDMAP_CTRL_UNKNOWN_IS_99) ? 99 : -2; + + /* + * First check if it is just a simple numeric ID string or a special "XXX@" name. + * If it's a number, there's no need trying to ask the IDMAP service to map it. + * If it's a special "XXX@" name, we want to make sure to treat it as a group. + */ + num = 1; + at = NULL; + p = id; + while (*p) { + if ((*p < '0') || (*p > '9')) + num = 0; + if (*p == '@') + at = p; + p++; + } + if (at && !at[1] && !isgroup) + isgroup = 1; /* special "XXX@" names should always be treated as groups */ + if (num) { + /* must be numeric ID (or empty) */ + num = *id ? strtol(id, NULL, 10) : unknown; + gp = guidp; + goto gotnumid; + } + + if (nfs_idmap_ctrl & NFS_IDMAP_CTRL_USE_IDMAP_SERVICE) { + /* + * Ask the ID mapping service to map the ID string to a GUID. + * + * [sigh] this isn't a "pwnam/grnam" it's an NFS ID string! + */ + gp = compare ? &guid1 : guidp; + if (isgroup) + error = kauth_cred_grnam2guid(id, gp); + else + error = kauth_cred_pwnam2guid(id, gp); + if (error && (nfs_idmap_ctrl & NFS_IDMAP_CTRL_LOG_FAILED_MAPPINGS)) + printf("nfs4_id2guid: idmap failed for %s %s error %d\n", id, isgroup ? "G" : " ", error); + if (!error && (nfs_idmap_ctrl & NFS_IDMAP_CTRL_LOG_SUCCESSFUL_MAPPINGS)) + printf("nfs4_id2guid: idmap for %s %s got guid " + "%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x\n", + id, isgroup ? 
"G" : " ", + gp->g_guid[0], gp->g_guid[1], gp->g_guid[2], gp->g_guid[3], + gp->g_guid[4], gp->g_guid[5], gp->g_guid[6], gp->g_guid[7], + gp->g_guid[8], gp->g_guid[9], gp->g_guid[10], gp->g_guid[11], + gp->g_guid[12], gp->g_guid[13], gp->g_guid[14], gp->g_guid[15]); + error1 = error; + } + if (error || compare || !(nfs_idmap_ctrl & NFS_IDMAP_CTRL_USE_IDMAP_SERVICE)) { + /* + * fallback path... see if we can come up with an answer ourselves. + */ + gp = compare ? &guid2 : guidp; + + if (!(nfs_idmap_ctrl & NFS_IDMAP_CTRL_FALLBACK_NO_WELLKNOWN_IDS) && at && !at[1]) { + /* must be a special ACE "who" ID */ + bzero(&sid, sizeof(sid)); + sid.sid_kind = 1; + sid.sid_authcount = 1; + if (!strcmp(id, "OWNER@")) { + // S-1-3-0 + sid.sid_authority[5] = 3; + sid.sid_authorities[0] = 0; + } else if (!strcmp(id, "GROUP@")) { + // S-1-3-1 + sid.sid_authority[5] = 3; + sid.sid_authorities[0] = 1; + } else if (!strcmp(id, "EVERYONE@")) { + // S-1-1-0 + sid.sid_authority[5] = 1; + sid.sid_authorities[0] = 0; + } else if (!strcmp(id, "INTERACTIVE@")) { + // S-1-5-4 + sid.sid_authority[5] = 5; + sid.sid_authorities[0] = 4; + } else if (!strcmp(id, "NETWORK@")) { + // S-1-5-2 + sid.sid_authority[5] = 5; + sid.sid_authorities[0] = 2; + } else if (!strcmp(id, "DIALUP@")) { + // S-1-5-1 + sid.sid_authority[5] = 5; + sid.sid_authorities[0] = 1; + } else if (!strcmp(id, "BATCH@")) { + // S-1-5-3 + sid.sid_authority[5] = 5; + sid.sid_authorities[0] = 3; + } else if (!strcmp(id, "ANONYMOUS@")) { + // S-1-5-7 + sid.sid_authority[5] = 5; + sid.sid_authorities[0] = 7; + } else if (!strcmp(id, "AUTHENTICATED@")) { + // S-1-5-11 + sid.sid_authority[5] = 5; + sid.sid_authorities[0] = 11; + } else if (!strcmp(id, "SERVICE@")) { + // S-1-5-6 + sid.sid_authority[5] = 5; + sid.sid_authorities[0] = 6; + } else { + // S-1-0-0 "NOBODY" + sid.sid_authority[5] = 0; + sid.sid_authorities[0] = 0; + } + error = kauth_cred_ntsid2guid(&sid, gp); + } else { + if (!(nfs_idmap_ctrl & 
NFS_IDMAP_CTRL_FALLBACK_NO_COMMON_IDS) && at) { + /* must be user@domain */ + /* try to identify some well-known IDs */ + if (!strncmp(id, "root@", 5)) + num = 0; + else if (!strncmp(id, "wheel@", 6)) + num = 0; + else if (!strncmp(id, "nobody@", 7)) + num = -2; + else if (!strncmp(id, "nfsnobody@", 10)) + num = -2; + else + num = unknown; + } else if (!(nfs_idmap_ctrl & NFS_IDMAP_CTRL_FALLBACK_NO_COMMON_IDS) && !strcmp(id, "nobody")) { + num = -2; + } else { + num = unknown; + } +gotnumid: + if (isgroup) + error = kauth_cred_gid2guid((gid_t)num, gp); + else + error = kauth_cred_uid2guid((uid_t)num, gp); + } + if (error && (nfs_idmap_ctrl & NFS_IDMAP_CTRL_LOG_FAILED_MAPPINGS)) + printf("nfs4_id2guid: fallback map failed for %s %s error %d\n", id, isgroup ? "G" : " ", error); + if (!error && (nfs_idmap_ctrl & NFS_IDMAP_CTRL_LOG_SUCCESSFUL_MAPPINGS)) + printf("nfs4_id2guid: fallback map for %s %s got guid " + "%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x\n", + id, isgroup ? "G" : " ", + gp->g_guid[0], gp->g_guid[1], gp->g_guid[2], gp->g_guid[3], + gp->g_guid[4], gp->g_guid[5], gp->g_guid[6], gp->g_guid[7], + gp->g_guid[8], gp->g_guid[9], gp->g_guid[10], gp->g_guid[11], + gp->g_guid[12], gp->g_guid[13], gp->g_guid[14], gp->g_guid[15]); + } + + if (compare) { + /* compare the results, log if different */ + if (!error1 && !error) { + if (!kauth_guid_equal(&guid1, &guid2)) + printf("nfs4_id2guid: idmap/fallback results differ for %s %s - " + "idmap %02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x " + "fallback %02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x\n", + id, isgroup ? 
"G" : " ", + guid1.g_guid[0], guid1.g_guid[1], guid1.g_guid[2], guid1.g_guid[3], + guid1.g_guid[4], guid1.g_guid[5], guid1.g_guid[6], guid1.g_guid[7], + guid1.g_guid[8], guid1.g_guid[9], guid1.g_guid[10], guid1.g_guid[11], + guid1.g_guid[12], guid1.g_guid[13], guid1.g_guid[14], guid1.g_guid[15], + guid2.g_guid[0], guid2.g_guid[1], guid2.g_guid[2], guid2.g_guid[3], + guid2.g_guid[4], guid2.g_guid[5], guid2.g_guid[6], guid2.g_guid[7], + guid2.g_guid[8], guid2.g_guid[9], guid2.g_guid[10], guid2.g_guid[11], + guid2.g_guid[12], guid2.g_guid[13], guid2.g_guid[14], guid2.g_guid[15]); + /* copy idmap result to output guid */ + *guidp = guid1; + } else if (error1 && !error) { + printf("nfs4_id2guid: idmap/fallback results differ for %s %s - " + "idmap error %d " + "fallback %02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x\n", + id, isgroup ? "G" : " ", + error1, + guid2.g_guid[0], guid2.g_guid[1], guid2.g_guid[2], guid2.g_guid[3], + guid2.g_guid[4], guid2.g_guid[5], guid2.g_guid[6], guid2.g_guid[7], + guid2.g_guid[8], guid2.g_guid[9], guid2.g_guid[10], guid2.g_guid[11], + guid2.g_guid[12], guid2.g_guid[13], guid2.g_guid[14], guid2.g_guid[15]); + /* copy fallback result to output guid */ + *guidp = guid2; + } else if (!error1 && error) { + printf("nfs4_id2guid: idmap/fallback results differ for %s %s - " + "idmap %02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x " + "fallback error %d\n", + id, isgroup ? "G" : " ", + guid1.g_guid[0], guid1.g_guid[1], guid1.g_guid[2], guid1.g_guid[3], + guid1.g_guid[4], guid1.g_guid[5], guid1.g_guid[6], guid1.g_guid[7], + guid1.g_guid[8], guid1.g_guid[9], guid1.g_guid[10], guid1.g_guid[11], + guid1.g_guid[12], guid1.g_guid[13], guid1.g_guid[14], guid1.g_guid[15], + error); + /* copy idmap result to output guid */ + *guidp = guid1; + error = 0; + } else { + if (error1 != error) + printf("nfs4_id2guid: idmap/fallback results differ for %s %s - " + "idmap error %d fallback error %d\n", + id, isgroup ? 
"G" : " ", error1, error); + } + } + + return (error); +} + +/* + * Map a VFS guid to an NFSv4 ID string. + * + * Try to use the ID mapping service... but we may fallback to trying to do it ourselves. + */ +int +nfs4_guid2id(guid_t *guidp, char *id, int *idlen, int isgroup) +{ + int error1 = 0, error = 0, compare; + int id1len, id2len, len; + char *id1buf, *id1; + char numbuf[32]; + const char *id2 = NULL; + + id1buf = id1 = NULL; + id1len = id2len = 0; + compare = ((nfs_idmap_ctrl & NFS_IDMAP_CTRL_USE_IDMAP_SERVICE) && + (nfs_idmap_ctrl & NFS_IDMAP_CTRL_COMPARE_RESULTS)); + + if (nfs_idmap_ctrl & NFS_IDMAP_CTRL_USE_IDMAP_SERVICE) { + /* + * Ask the ID mapping service to map the GUID to an ID string. + * + * [sigh] this isn't a "pwnam" it's an NFS id string! + */ + + /* + * Stupid kauth_cred_guid2pwnam() function requires that the buffer + * be at least MAXPATHLEN bytes long even though most if not all ID + * strings will be much much shorter than that. + */ + if (compare || (*idlen < MAXPATHLEN)) { + MALLOC_ZONE(id1buf, char*, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (!id1buf) + return (ENOMEM); + id1 = id1buf; + id1len = MAXPATHLEN; + } else { + id1 = id; + id1len = *idlen; + } + + if (isgroup) + error = kauth_cred_guid2grnam(guidp, id1); + else + error = kauth_cred_guid2pwnam(guidp, id1); + if (error && (nfs_idmap_ctrl & NFS_IDMAP_CTRL_LOG_FAILED_MAPPINGS)) + printf("nfs4_guid2id: idmap failed for " + "%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x %s " + "error %d\n", + guidp->g_guid[0], guidp->g_guid[1], guidp->g_guid[2], guidp->g_guid[3], + guidp->g_guid[4], guidp->g_guid[5], guidp->g_guid[6], guidp->g_guid[7], + guidp->g_guid[8], guidp->g_guid[9], guidp->g_guid[10], guidp->g_guid[11], + guidp->g_guid[12], guidp->g_guid[13], guidp->g_guid[14], guidp->g_guid[15], + isgroup ? 
"G" : " ", error); + if (!error && (nfs_idmap_ctrl & NFS_IDMAP_CTRL_LOG_SUCCESSFUL_MAPPINGS)) + printf("nfs4_guid2id: idmap for " + "%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x %s " + "got ID %s\n", + guidp->g_guid[0], guidp->g_guid[1], guidp->g_guid[2], guidp->g_guid[3], + guidp->g_guid[4], guidp->g_guid[5], guidp->g_guid[6], guidp->g_guid[7], + guidp->g_guid[8], guidp->g_guid[9], guidp->g_guid[10], guidp->g_guid[11], + guidp->g_guid[12], guidp->g_guid[13], guidp->g_guid[14], guidp->g_guid[15], + isgroup ? "G" : " ", id1); + error1 = error; + if (!error) { + if (compare) { + id1len = strnlen(id1, id1len); + } else if (id1 == id1buf) { + /* copy idmap result to output buffer */ + len = strlcpy(id, id1, *idlen); + if (len >= *idlen) + error = ENOSPC; + else + *idlen = len; + } + } + } + if (error || compare || !(nfs_idmap_ctrl & NFS_IDMAP_CTRL_USE_IDMAP_SERVICE)) { + /* + * fallback path... see if we can come up with an answer ourselves. + */ + ntsid_t sid; + uid_t uid; + + if (!(nfs_idmap_ctrl & NFS_IDMAP_CTRL_FALLBACK_NO_WELLKNOWN_IDS)) { + error = kauth_cred_guid2ntsid(guidp, &sid); + if (!error && (sid.sid_kind == 1) && (sid.sid_authcount == 1)) { + /* check if it's one of our well-known ACE WHO names */ + if (sid.sid_authority[5] == 0) { + if (sid.sid_authorities[0] == 0) // S-1-0-0 + id2 = "nobody@localdomain"; + } else if (sid.sid_authority[5] == 1) { + if (sid.sid_authorities[0] == 0) // S-1-1-0 + id2 = "EVERYONE@"; + } else if (sid.sid_authority[5] == 3) { + if (sid.sid_authorities[0] == 0) // S-1-3-0 + id2 = "OWNER@"; + else if (sid.sid_authorities[0] == 1) // S-1-3-1 + id2 = "GROUP@"; + } else if (sid.sid_authority[5] == 5) { + if (sid.sid_authorities[0] == ntohl(1)) // S-1-5-1 + id2 = "DIALUP@"; + else if (sid.sid_authorities[0] == ntohl(2)) // S-1-5-2 + id2 = "NETWORK@"; + else if (sid.sid_authorities[0] == ntohl(3)) // S-1-5-3 + id2 = "BATCH@"; + else if (sid.sid_authorities[0] == ntohl(4)) // S-1-5-4 + id2 = "INTERACTIVE@"; + 
else if (sid.sid_authorities[0] == ntohl(6)) // S-1-5-6 + id2 = "SERVICE@"; + else if (sid.sid_authorities[0] == ntohl(7)) // S-1-5-7 + id2 = "ANONYMOUS@"; + else if (sid.sid_authorities[0] == ntohl(11)) // S-1-5-11 + id2 = "AUTHENTICATED@"; + } + } + } + if (!id2) { + /* OK, let's just try mapping it to a UID/GID */ + if (isgroup) + error = kauth_cred_guid2gid(guidp, (gid_t*)&uid); + else + error = kauth_cred_guid2uid(guidp, &uid); + if (!error) { + if (!(nfs_idmap_ctrl & NFS_IDMAP_CTRL_FALLBACK_NO_COMMON_IDS)) { + /* map well known uid's to strings */ + if (uid == 0) + id2 = isgroup ? "wheel@localdomain" : "root@localdomain"; + else if (uid == (uid_t)-2) + id2 = "nobody@localdomain"; + } + if (!id2) { + /* or just use a decimal number string. */ + snprintf(numbuf, sizeof(numbuf), "%d", uid); + id2 = numbuf; + } + } + } + if (error && (nfs_idmap_ctrl & NFS_IDMAP_CTRL_LOG_FAILED_MAPPINGS)) + printf("nfs4_guid2id: fallback map failed for " + "%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x %s " + "error %d\n", + guidp->g_guid[0], guidp->g_guid[1], guidp->g_guid[2], guidp->g_guid[3], + guidp->g_guid[4], guidp->g_guid[5], guidp->g_guid[6], guidp->g_guid[7], + guidp->g_guid[8], guidp->g_guid[9], guidp->g_guid[10], guidp->g_guid[11], + guidp->g_guid[12], guidp->g_guid[13], guidp->g_guid[14], guidp->g_guid[15], + isgroup ? "G" : " ", error); + if (!error && (nfs_idmap_ctrl & NFS_IDMAP_CTRL_LOG_SUCCESSFUL_MAPPINGS)) + printf("nfs4_guid2id: fallback map for " + "%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x %s " + "got ID %s\n", + guidp->g_guid[0], guidp->g_guid[1], guidp->g_guid[2], guidp->g_guid[3], + guidp->g_guid[4], guidp->g_guid[5], guidp->g_guid[6], guidp->g_guid[7], + guidp->g_guid[8], guidp->g_guid[9], guidp->g_guid[10], guidp->g_guid[11], + guidp->g_guid[12], guidp->g_guid[13], guidp->g_guid[14], guidp->g_guid[15], + isgroup ? 
"G" : " ", id2); + if (!error && id2) { + if (compare) { + id2len = strnlen(id2, MAXPATHLEN); + } else { + /* copy fallback result to output buffer */ + len = strlcpy(id, id2, *idlen); + if (len >= *idlen) + error = ENOSPC; + else + *idlen = len; + } + } + } + + if (compare) { + /* compare the results, log if different */ + if (!error1 && !error) { + if ((id1len != id2len) || strncmp(id1, id2, id1len)) + printf("nfs4_guid2id: idmap/fallback results differ for " + "%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x %s " + "idmap %s fallback %s\n", + guidp->g_guid[0], guidp->g_guid[1], guidp->g_guid[2], guidp->g_guid[3], + guidp->g_guid[4], guidp->g_guid[5], guidp->g_guid[6], guidp->g_guid[7], + guidp->g_guid[8], guidp->g_guid[9], guidp->g_guid[10], guidp->g_guid[11], + guidp->g_guid[12], guidp->g_guid[13], guidp->g_guid[14], guidp->g_guid[15], + isgroup ? "G" : " ", id1, id2); + if (id1 == id1buf) { + /* copy idmap result to output buffer */ + len = strlcpy(id, id1, *idlen); + if (len >= *idlen) + error = ENOSPC; + else + *idlen = len; + } + } else if (error1 && !error) { + printf("nfs4_guid2id: idmap/fallback results differ for " + "%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x %s " + "idmap error %d fallback %s\n", + guidp->g_guid[0], guidp->g_guid[1], guidp->g_guid[2], guidp->g_guid[3], + guidp->g_guid[4], guidp->g_guid[5], guidp->g_guid[6], guidp->g_guid[7], + guidp->g_guid[8], guidp->g_guid[9], guidp->g_guid[10], guidp->g_guid[11], + guidp->g_guid[12], guidp->g_guid[13], guidp->g_guid[14], guidp->g_guid[15], + isgroup ? 
"G" : " ", error1, id2); + /* copy fallback result to output buffer */ + len = strlcpy(id, id2, *idlen); + if (len >= *idlen) + error = ENOSPC; + else + *idlen = len; + } else if (!error1 && error) { + printf("nfs4_guid2id: idmap/fallback results differ for " + "%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x %s " + "idmap %s fallback error %d\n", + guidp->g_guid[0], guidp->g_guid[1], guidp->g_guid[2], guidp->g_guid[3], + guidp->g_guid[4], guidp->g_guid[5], guidp->g_guid[6], guidp->g_guid[7], + guidp->g_guid[8], guidp->g_guid[9], guidp->g_guid[10], guidp->g_guid[11], + guidp->g_guid[12], guidp->g_guid[13], guidp->g_guid[14], guidp->g_guid[15], + isgroup ? "G" : " ", id1, error); + if (id1 == id1buf) { + /* copy idmap result to output buffer */ + len = strlcpy(id, id1, *idlen); + if (len >= *idlen) + error = ENOSPC; + else + *idlen = len; + } + error = 0; + } else { + if (error1 != error) + printf("nfs4_guid2id: idmap/fallback results differ for %s %s - " + "idmap error %d fallback error %d\n", + id, isgroup ? 
"G" : " ", error1, error); + } + } + if (id1buf) + FREE_ZONE(id1buf, MAXPATHLEN, M_NAMEI); + return (error); +} + + /* * Set a vnode attr's supported bits according to the given bitmap */ @@ -403,11 +1461,10 @@ nfs_vattr_set_supported(uint32_t *bitmap, struct vnode_attr *vap) // if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_CHANGE)) if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_SIZE)) VATTR_SET_SUPPORTED(vap, va_data_size); - // if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_NAMED_ATTR)) if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_FSID)) VATTR_SET_SUPPORTED(vap, va_fsid); -// if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_ACL)) -// VATTR_SET_SUPPORTED(vap, va_acl); + if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_ACL)) + VATTR_SET_SUPPORTED(vap, va_acl); if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_ARCHIVE)) VATTR_SET_SUPPORTED(vap, va_flags); if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_FILEID)) @@ -419,10 +1476,14 @@ nfs_vattr_set_supported(uint32_t *bitmap, struct vnode_attr *vap) VATTR_SET_SUPPORTED(vap, va_mode); if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_NUMLINKS)) VATTR_SET_SUPPORTED(vap, va_nlink); - if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_OWNER)) + if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_OWNER)) { VATTR_SET_SUPPORTED(vap, va_uid); - if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_OWNER_GROUP)) + VATTR_SET_SUPPORTED(vap, va_uuuid); + } + if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_OWNER_GROUP)) { VATTR_SET_SUPPORTED(vap, va_gid); + VATTR_SET_SUPPORTED(vap, va_guuid); + } if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_RAWDEV)) VATTR_SET_SUPPORTED(vap, va_rdev); if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_SPACE_USED)) @@ -450,15 +1511,20 @@ nfs4_parsefattr( struct nfs_fsattr *nfsap, struct nfs_vattr *nvap, fhandle_t *fhp, - struct dqblk *dqbp) + struct dqblk *dqbp, + struct nfs_fs_locations *nfslsp) { - int error = 0, attrbytes; - uint32_t val, val2, val3, i, j; - uint32_t bitmap[NFS_ATTR_BITMAP_LEN], len; - char *s; + int error = 0, error2, rderror = 0, attrbytes; + uint32_t val, val2, val3, i; + uint32_t bitmap[NFS_ATTR_BITMAP_LEN], len, slen; 
+ char sbuf[64], *s; struct nfs_fsattr nfsa_dummy; struct nfs_vattr nva_dummy; struct dqblk dqb_dummy; + kauth_acl_t acl = NULL; + uint32_t ace_type, ace_flags, ace_mask; + struct nfs_fs_locations nfsls_dummy; + struct sockaddr_storage ss; /* if not interested in some values... throw 'em into a local dummy variable */ if (!nfsap) @@ -467,8 +1533,14 @@ nfs4_parsefattr( nvap = &nva_dummy; if (!dqbp) dqbp = &dqb_dummy; + if (!nfslsp) + nfslsp = &nfsls_dummy; + bzero(nfslsp, sizeof(*nfslsp)); attrbytes = val = val2 = val3 = 0; + s = sbuf; + slen = sizeof(sbuf); + NVATTR_INIT(nvap); len = NFS_ATTR_BITMAP_LEN; nfsm_chain_get_bitmap(error, nmc, bitmap, len); @@ -489,17 +1561,19 @@ nfs4_parsefattr( if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_TYPE)) { nfsm_chain_get_32(error, nmc, val); nvap->nva_type = nfstov_type(val, NFS_VER4); + if ((val == NFATTRDIR) || (val == NFNAMEDATTR)) + nvap->nva_flags |= NFS_FFLAG_IS_ATTR; + else + nvap->nva_flags &= ~NFS_FFLAG_IS_ATTR; attrbytes -= NFSX_UNSIGNED; } if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_FH_EXPIRE_TYPE)) { nfsm_chain_get_32(error, nmc, val); nfsmout_if(error); - if (val != NFS_FH_PERSISTENT) - printf("nfs: warning: non-persistent file handles!\n"); - if (val & ~0xff) - printf("nfs: warning unknown fh type: 0x%x\n", val); nfsap->nfsa_flags &= ~NFS_FSFLAG_FHTYPE_MASK; nfsap->nfsa_flags |= val << NFS_FSFLAG_FHTYPE_SHIFT; + if (val & ~0xff) + printf("nfs: warning unknown fh type: 0x%x\n", val); attrbytes -= NFSX_UNSIGNED; } if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_CHANGE)) { @@ -529,9 +1603,9 @@ nfs4_parsefattr( if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_NAMED_ATTR)) { nfsm_chain_get_32(error, nmc, val); if (val) - nvap->nva_flags |= NFS_FFLAG_NAMED_ATTR; + nvap->nva_flags |= NFS_FFLAG_HAS_NAMED_ATTRS; else - nvap->nva_flags &= ~NFS_FFLAG_NAMED_ATTR; + nvap->nva_flags &= ~NFS_FFLAG_HAS_NAMED_ATTRS; attrbytes -= NFSX_UNSIGNED; } if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_FSID)) { @@ -552,26 +1626,79 @@ nfs4_parsefattr( attrbytes -= NFSX_UNSIGNED; 
} if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_RDATTR_ERROR)) { - nfsm_chain_get_32(error, nmc, error); + nfsm_chain_get_32(error, nmc, rderror); attrbytes -= NFSX_UNSIGNED; - nfsmout_if(error); + if (!rderror) { /* no error */ + NFS_BITMAP_CLR(bitmap, NFS_FATTR_RDATTR_ERROR); + NFS_BITMAP_CLR(nvap->nva_bitmap, NFS_FATTR_RDATTR_ERROR); + } } - if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_ACL)) { /* skip for now */ + if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_ACL)) { + error2 = 0; + ace_type = ace_flags = ace_mask = 0; nfsm_chain_get_32(error, nmc, val); /* ACE count */ + if (!error && (val > KAUTH_ACL_MAX_ENTRIES)) + error = EOVERFLOW; + if (!error && !((acl = kauth_acl_alloc(val)))) + error = ENOMEM; + if (!error && acl) { + acl->acl_entrycount = val; + acl->acl_flags = 0; + } + attrbytes -= NFSX_UNSIGNED; + nfsm_assert(error, (attrbytes >= 0), EBADRPC); for (i=0; !error && (i < val); i++) { - nfsm_chain_adv(error, nmc, 3 * NFSX_UNSIGNED); - nfsm_chain_get_32(error, nmc, val2); /* string length */ - nfsm_chain_adv(error, nmc, nfsm_rndup(val2)); - attrbytes -= 4*NFSX_UNSIGNED + nfsm_rndup(val2); + nfsm_chain_get_32(error, nmc, ace_type); + nfsm_chain_get_32(error, nmc, ace_flags); + nfsm_chain_get_32(error, nmc, ace_mask); + nfsm_chain_get_32(error, nmc, len); + acl->acl_ace[i].ace_flags = nfs4_ace_nfstype_to_vfstype(ace_type, &error); + acl->acl_ace[i].ace_flags |= nfs4_ace_nfsflags_to_vfsflags(ace_flags); + acl->acl_ace[i].ace_rights = nfs4_ace_nfsmask_to_vfsrights(ace_mask); + if (!error && !error2 && (len >= slen)) { + if (s != sbuf) { + FREE(s, M_TEMP); + s = sbuf; + slen = sizeof(sbuf); + } + MALLOC(s, char*, len+16, M_TEMP, M_WAITOK); + if (s) + slen = len+16; + else + error2 = ENOMEM; + } + if (error2) + nfsm_chain_adv(error, nmc, nfsm_rndup(len)); + else + nfsm_chain_get_opaque(error, nmc, len, s); + if (!error && !error2) { + s[len] = '\0'; + error2 = nfs4_id2guid(s, &acl->acl_ace[i].ace_applicable, + (ace_flags & NFS_ACE_IDENTIFIER_GROUP)); + if (error2 && 
(nfs_idmap_ctrl & NFS_IDMAP_CTRL_LOG_FAILED_MAPPINGS)) + printf("nfs4_parsefattr: ACE WHO %s is no one, no guid?, error %d\n", s, error2); + } + attrbytes -= 4*NFSX_UNSIGNED + nfsm_rndup(len); nfsm_assert(error, (attrbytes >= 0), EBADRPC); } + nfsmout_if(error); + if ((nvap != &nva_dummy) && !error2) { + nvap->nva_acl = acl; + acl = NULL; + } } if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_ACLSUPPORT)) { + /* + * Support ACLs if: the server supports DENY/ALLOC ACEs and + * (just to be safe) FATTR_ACL is in the supported list too. + */ nfsm_chain_get_32(error, nmc, val); - if (val) + if ((val & (NFS_ACL_SUPPORT_ALLOW_ACL|NFS_ACL_SUPPORT_DENY_ACL)) && + NFS_BITMAP_ISSET(nfsap->nfsa_supp_attr, NFS_FATTR_ACL)) { nfsap->nfsa_flags |= NFS_FSFLAG_ACL; - else + } else { nfsap->nfsa_flags &= ~NFS_FSFLAG_ACL; + } attrbytes -= NFSX_UNSIGNED; } if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_ARCHIVE)) { /* SF_ARCHIVED */ @@ -640,23 +1767,151 @@ nfs4_parsefattr( nfsm_chain_get_64(error, nmc, nfsap->nfsa_files_total); attrbytes -= 2 * NFSX_UNSIGNED; } - if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_FS_LOCATIONS)) { /* skip for now */ - nfsm_chain_get_32(error, nmc, val); /* root path length */ - nfsm_chain_adv(error, nmc, nfsm_rndup(val)); /* root path */ - attrbytes -= (2 * NFSX_UNSIGNED) + nfsm_rndup(val); - nfsm_chain_get_32(error, nmc, val); /* location count */ - for (i=0; !error && (i < val); i++) { - nfsm_chain_get_32(error, nmc, val2); /* server string length */ - nfsm_chain_adv(error, nmc, nfsm_rndup(val2)); /* server string */ - attrbytes -= (2 * NFSX_UNSIGNED) + nfsm_rndup(val2); - nfsm_chain_get_32(error, nmc, val2); /* pathname component count */ - for (j=0; !error && (j < val2); j++) { - nfsm_chain_get_32(error, nmc, val3); /* component length */ - nfsm_chain_adv(error, nmc, nfsm_rndup(val3)); /* component */ - attrbytes -= NFSX_UNSIGNED + nfsm_rndup(val3); - nfsm_assert(error, (attrbytes >= 0), EBADRPC); + if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_FS_LOCATIONS)) { + uint32_t loc, serv, 
comp; + struct nfs_fs_location *fsl; + struct nfs_fs_server *fss; + struct nfs_fs_path *fsp; + + /* get root pathname */ + fsp = &nfslsp->nl_root; + nfsm_chain_get_32(error, nmc, fsp->np_compcount); /* component count */ + attrbytes -= NFSX_UNSIGNED; + /* sanity check component count */ + if (!error && (fsp->np_compcount > MAXPATHLEN)) + error = EBADRPC; + nfsmout_if(error); + if (fsp->np_compcount) { + MALLOC(fsp->np_components, char **, fsp->np_compcount * sizeof(char*), M_TEMP, M_WAITOK|M_ZERO); + if (!fsp->np_components) + error = ENOMEM; + } + for (comp = 0; comp < fsp->np_compcount; comp++) { + nfsm_chain_get_32(error, nmc, val); /* component length */ + /* sanity check component length */ + if (!error && (val == 0)) { + /* + * Apparently some people think a path with zero components should + * be encoded with one zero-length component. So, just ignore any + * zero length components. + */ + comp--; + fsp->np_compcount--; + if (fsp->np_compcount == 0) { + FREE(fsp->np_components, M_TEMP); + fsp->np_components = NULL; + } + attrbytes -= NFSX_UNSIGNED; + continue; + } + if (!error && ((val < 1) || (val > MAXPATHLEN))) + error = EBADRPC; + nfsmout_if(error); + MALLOC(fsp->np_components[comp], char *, val+1, M_TEMP, M_WAITOK|M_ZERO); + if (!fsp->np_components[comp]) + error = ENOMEM; + nfsmout_if(error); + nfsm_chain_get_opaque(error, nmc, val, fsp->np_components[comp]); /* component */ + attrbytes -= NFSX_UNSIGNED + nfsm_rndup(val); + } + nfsm_chain_get_32(error, nmc, nfslsp->nl_numlocs); /* fs location count */ + attrbytes -= NFSX_UNSIGNED; + /* sanity check location count */ + if (!error && (nfslsp->nl_numlocs > 256)) + error = EBADRPC; + nfsmout_if(error); + if (nfslsp->nl_numlocs > 0) { + MALLOC(nfslsp->nl_locations, struct nfs_fs_location **, nfslsp->nl_numlocs * sizeof(struct nfs_fs_location*), M_TEMP, M_WAITOK|M_ZERO); + if (!nfslsp->nl_locations) + error = ENOMEM; + } + nfsmout_if(error); + for (loc = 0; loc < nfslsp->nl_numlocs; loc++) { + 
nfsmout_if(error); + MALLOC(fsl, struct nfs_fs_location *, sizeof(struct nfs_fs_location), M_TEMP, M_WAITOK|M_ZERO); + if (!fsl) + error = ENOMEM; + nfslsp->nl_locations[loc] = fsl; + nfsm_chain_get_32(error, nmc, fsl->nl_servcount); /* server count */ + attrbytes -= NFSX_UNSIGNED; + /* sanity check server count */ + if (!error && ((fsl->nl_servcount < 1) || (fsl->nl_servcount > 256))) + error = EBADRPC; + nfsmout_if(error); + MALLOC(fsl->nl_servers, struct nfs_fs_server **, fsl->nl_servcount * sizeof(struct nfs_fs_server*), M_TEMP, M_WAITOK|M_ZERO); + if (!fsl->nl_servers) + error = ENOMEM; + for (serv = 0; serv < fsl->nl_servcount; serv++) { + nfsmout_if(error); + MALLOC(fss, struct nfs_fs_server *, sizeof(struct nfs_fs_server), M_TEMP, M_WAITOK|M_ZERO); + if (!fss) + error = ENOMEM; + fsl->nl_servers[serv] = fss; + nfsm_chain_get_32(error, nmc, val); /* server name length */ + /* sanity check server name length */ + if (!error && ((val < 1) || (val > MAXPATHLEN))) + error = EINVAL; + nfsmout_if(error); + MALLOC(fss->ns_name, char *, val+1, M_TEMP, M_WAITOK|M_ZERO); + if (!fss->ns_name) + error = ENOMEM; + nfsm_chain_get_opaque(error, nmc, val, fss->ns_name); /* server name */ + attrbytes -= NFSX_UNSIGNED + nfsm_rndup(val); + nfsmout_if(error); + /* copy name to address if it converts to a sockaddr */ + if (nfs_uaddr2sockaddr(fss->ns_name, (struct sockaddr*)&ss)) { + fss->ns_addrcount = 1; + MALLOC(fss->ns_addresses, char **, sizeof(char *), M_TEMP, M_WAITOK|M_ZERO); + if (!fss->ns_addresses) + error = ENOMEM; + nfsmout_if(error); + MALLOC(fss->ns_addresses[0], char *, val+1, M_TEMP, M_WAITOK|M_ZERO); + if (!fss->ns_addresses[0]) + error = ENOMEM; + nfsmout_if(error); + strlcpy(fss->ns_addresses[0], fss->ns_name, val+1); + } + } + /* get pathname */ + fsp = &fsl->nl_path; + nfsm_chain_get_32(error, nmc, fsp->np_compcount); /* component count */ + attrbytes -= NFSX_UNSIGNED; + /* sanity check component count */ + if (!error && (fsp->np_compcount > MAXPATHLEN)) + 
error = EINVAL; + nfsmout_if(error); + if (fsp->np_compcount) { + MALLOC(fsp->np_components, char **, fsp->np_compcount * sizeof(char*), M_TEMP, M_WAITOK|M_ZERO); + if (!fsp->np_components) + error = ENOMEM; + } + for (comp = 0; comp < fsp->np_compcount; comp++) { + nfsm_chain_get_32(error, nmc, val); /* component length */ + /* sanity check component length */ + if (!error && (val == 0)) { + /* + * Apparently some people think a path with zero components should + * be encoded with one zero-length component. So, just ignore any + * zero length components. + */ + comp--; + fsp->np_compcount--; + if (fsp->np_compcount == 0) { + FREE(fsp->np_components, M_TEMP); + fsp->np_components = NULL; + } + attrbytes -= NFSX_UNSIGNED; + continue; + } + if (!error && ((val < 1) || (val > MAXPATHLEN))) + error = EINVAL; + nfsmout_if(error); + MALLOC(fsp->np_components[comp], char *, val+1, M_TEMP, M_WAITOK|M_ZERO); + if (!fsp->np_components[comp]) + error = ENOMEM; + nfsm_chain_get_opaque(error, nmc, val, fsp->np_components[comp]); /* component */ + attrbytes -= NFSX_UNSIGNED + nfsm_rndup(val); } - nfsm_assert(error, (attrbytes >= 0), EBADRPC); } nfsm_assert(error, (attrbytes >= 0), EBADRPC); } @@ -724,34 +1979,68 @@ nfs4_parsefattr( attrbytes -= NFSX_UNSIGNED; } if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_OWNER)) { - /* XXX Need ID mapping infrastructure - use ugly hack for now */ nfsm_chain_get_32(error, nmc, len); - nfsm_chain_get_opaque_pointer(error, nmc, len, s); + if (!error && (len >= slen)) { + if (s != sbuf) { + FREE(s, M_TEMP); + s = sbuf; + slen = sizeof(sbuf); + } + MALLOC(s, char*, len+16, M_TEMP, M_WAITOK); + if (s) + slen = len+16; + else + error = ENOMEM; + } + nfsm_chain_get_opaque(error, nmc, len, s); + if (!error) { + s[len] = '\0'; + error = nfs4_id2guid(s, &nvap->nva_uuuid, 0); + if (!error) + error = kauth_cred_guid2uid(&nvap->nva_uuuid, &nvap->nva_uid); + if (error) { + /* unable to get either GUID or UID, set to default */ + nvap->nva_uid = 
(uid_t)((nfs_idmap_ctrl & NFS_IDMAP_CTRL_UNKNOWN_IS_99) ? 99 : -2); + if (nfs_idmap_ctrl & NFS_IDMAP_CTRL_LOG_FAILED_MAPPINGS) + printf("nfs4_parsefattr: owner %s is no one, no %s?, error %d\n", s, + kauth_guid_equal(&nvap->nva_uuuid, &kauth_null_guid) ? "guid" : "uid", + error); + error = 0; + } + } attrbytes -= NFSX_UNSIGNED + nfsm_rndup(len); - nfsmout_if(error); - if ((*s >= '0') && (*s <= '9')) - nvap->nva_uid = strtol(s, NULL, 10); - else if (!strncmp(s, "nobody@", 7)) - nvap->nva_uid = -2; - else if (!strncmp(s, "root@", 5)) - nvap->nva_uid = 0; - else - nvap->nva_uid = 99; /* unknown */ } if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_OWNER_GROUP)) { - /* XXX Need ID mapping infrastructure - use ugly hack for now */ nfsm_chain_get_32(error, nmc, len); - nfsm_chain_get_opaque_pointer(error, nmc, len, s); + if (!error && (len >= slen)) { + if (s != sbuf) { + FREE(s, M_TEMP); + s = sbuf; + slen = sizeof(sbuf); + } + MALLOC(s, char*, len+16, M_TEMP, M_WAITOK); + if (s) + slen = len+16; + else + error = ENOMEM; + } + nfsm_chain_get_opaque(error, nmc, len, s); + if (!error) { + s[len] = '\0'; + error = nfs4_id2guid(s, &nvap->nva_guuid, 1); + if (!error) + error = kauth_cred_guid2gid(&nvap->nva_guuid, &nvap->nva_gid); + if (error) { + /* unable to get either GUID or GID, set to default */ + nvap->nva_gid = (gid_t)((nfs_idmap_ctrl & NFS_IDMAP_CTRL_UNKNOWN_IS_99) ? 99 : -2); + if (nfs_idmap_ctrl & NFS_IDMAP_CTRL_LOG_FAILED_MAPPINGS) + printf("nfs4_parsefattr: group %s is no one, no %s?, error %d\n", s, + kauth_guid_equal(&nvap->nva_guuid, &kauth_null_guid) ? 
"guid" : "gid", + error); + error = 0; + } + } attrbytes -= NFSX_UNSIGNED + nfsm_rndup(len); - nfsmout_if(error); - if ((*s >= '0') && (*s <= '9')) - nvap->nva_gid = strtol(s, NULL, 10); - else if (!strncmp(s, "nobody@", 7)) - nvap->nva_gid = -2; - else if (!strncmp(s, "root@", 5)) - nvap->nva_uid = 0; - else - nvap->nva_gid = 99; /* unknown */ } if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_QUOTA_AVAIL_HARD)) { nfsm_chain_get_64(error, nmc, dqbp->dqb_bhardlimit); @@ -828,14 +2117,32 @@ nfs4_parsefattr( nfsm_chain_adv(error, nmc, 4*NFSX_UNSIGNED); /* just skip it */ attrbytes -= 4 * NFSX_UNSIGNED; } - if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_MOUNTED_ON_FILEID)) { /* skip for now */ + if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_MOUNTED_ON_FILEID)) { +#if CONFIG_TRIGGERS + /* we prefer the mounted on file ID, so just replace the fileid */ + nfsm_chain_get_64(error, nmc, nvap->nva_fileid); +#else nfsm_chain_adv(error, nmc, 2*NFSX_UNSIGNED); +#endif attrbytes -= 2 * NFSX_UNSIGNED; } /* advance over any leftover attrbytes */ nfsm_assert(error, (attrbytes >= 0), EBADRPC); nfsm_chain_adv(error, nmc, nfsm_rndup(attrbytes)); nfsmout: + if (error) + nfs_fs_locations_cleanup(nfslsp); + if (!error && rderror) + error = rderror; + /* free up temporary resources */ + if (s && (s != sbuf)) + FREE(s, M_TEMP); + if (acl) + kauth_acl_free(acl); + if (error && nvap->nva_acl) { + kauth_acl_free(nvap->nva_acl); + nvap->nva_acl = NULL; + } return (error); } @@ -845,51 +2152,18 @@ nfs4_parsefattr( int nfsm_chain_add_fattr4_f(struct nfsm_chain *nmc, struct vnode_attr *vap, struct nfsmount *nmp) { - int error = 0, attrbytes, slen, i; - uint32_t *pattrbytes; + int error = 0, attrbytes, slen, len, i, isgroup; + uint32_t *pattrbytes, val, acecount;; uint32_t bitmap[NFS_ATTR_BITMAP_LEN]; - char s[32]; + char sbuf[64], *s; + kauth_acl_t acl; + gid_t gid; - /* - * Do this in two passes. - * First calculate the bitmap, then pack - * everything together and set the size. 
- */ + s = sbuf; + slen = sizeof(sbuf); - NFS_CLEAR_ATTRIBUTES(bitmap); - if (VATTR_IS_ACTIVE(vap, va_data_size)) - NFS_BITMAP_SET(bitmap, NFS_FATTR_SIZE); - if (VATTR_IS_ACTIVE(vap, va_acl)) { - // NFS_BITMAP_SET(bitmap, NFS_FATTR_ACL) - } - if (VATTR_IS_ACTIVE(vap, va_flags)) { - NFS_BITMAP_SET(bitmap, NFS_FATTR_ARCHIVE); - NFS_BITMAP_SET(bitmap, NFS_FATTR_HIDDEN); - } - // NFS_BITMAP_SET(bitmap, NFS_FATTR_MIMETYPE) - if (VATTR_IS_ACTIVE(vap, va_mode)) - NFS_BITMAP_SET(bitmap, NFS_FATTR_MODE); - if (VATTR_IS_ACTIVE(vap, va_uid)) - NFS_BITMAP_SET(bitmap, NFS_FATTR_OWNER); - if (VATTR_IS_ACTIVE(vap, va_gid)) - NFS_BITMAP_SET(bitmap, NFS_FATTR_OWNER_GROUP); - // NFS_BITMAP_SET(bitmap, NFS_FATTR_SYSTEM) - if (vap->va_vaflags & VA_UTIMES_NULL) { - NFS_BITMAP_SET(bitmap, NFS_FATTR_TIME_ACCESS_SET); - NFS_BITMAP_SET(bitmap, NFS_FATTR_TIME_MODIFY_SET); - } else { - if (VATTR_IS_ACTIVE(vap, va_access_time)) - NFS_BITMAP_SET(bitmap, NFS_FATTR_TIME_ACCESS_SET); - if (VATTR_IS_ACTIVE(vap, va_modify_time)) - NFS_BITMAP_SET(bitmap, NFS_FATTR_TIME_MODIFY_SET); - } - if (VATTR_IS_ACTIVE(vap, va_backup_time)) - NFS_BITMAP_SET(bitmap, NFS_FATTR_TIME_BACKUP); - if (VATTR_IS_ACTIVE(vap, va_create_time)) - NFS_BITMAP_SET(bitmap, NFS_FATTR_TIME_CREATE); - /* and limit to what is supported by server */ - for (i=0; i < NFS_ATTR_BITMAP_LEN; i++) - bitmap[i] &= nmp->nm_fsattr.nfsa_supp_attr[i]; + /* First calculate the bitmap... 
*/ + nfs_vattr_set_bitmap(nmp, bitmap, vap); /* * Now pack it all together: @@ -905,7 +2179,43 @@ nfsm_chain_add_fattr4_f(struct nfsm_chain *nmc, struct vnode_attr *vap, struct n nfsm_chain_add_64(error, nmc, vap->va_data_size); attrbytes += 2*NFSX_UNSIGNED; } - // NFS_BITMAP_ISSET(bitmap, NFS_FATTR_ACL) + if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_ACL)) { + acl = vap->va_acl; + if (!acl || (acl->acl_entrycount == KAUTH_FILESEC_NOACL)) + acecount = 0; + else + acecount = acl->acl_entrycount; + nfsm_chain_add_32(error, nmc, acecount); + attrbytes += NFSX_UNSIGNED; + for (i=0; !error && (i < (int)acecount); i++) { + val = (acl->acl_ace[i].ace_flags & KAUTH_ACE_KINDMASK); + val = nfs4_ace_vfstype_to_nfstype(val, &error); + nfsm_chain_add_32(error, nmc, val); + val = nfs4_ace_vfsflags_to_nfsflags(acl->acl_ace[i].ace_flags); + nfsm_chain_add_32(error, nmc, val); + val = nfs4_ace_vfsrights_to_nfsmask(acl->acl_ace[i].ace_rights); + nfsm_chain_add_32(error, nmc, val); + len = slen; + isgroup = (kauth_cred_guid2gid(&acl->acl_ace[i].ace_applicable, &gid) == 0); + error = nfs4_guid2id(&acl->acl_ace[i].ace_applicable, s, &len, isgroup); + if (error == ENOSPC) { + if (s != sbuf) { + FREE(s, M_TEMP); + s = sbuf; + } + len += 8; + MALLOC(s, char*, len, M_TEMP, M_WAITOK); + if (s) { + slen = len; + error = nfs4_guid2id(&acl->acl_ace[i].ace_applicable, s, &len, isgroup); + } else { + error = ENOMEM; + } + } + nfsm_chain_add_name(error, nmc, s, len, nmp); + attrbytes += 4*NFSX_UNSIGNED + nfsm_rndup(len); + } + } if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_ARCHIVE)) { nfsm_chain_add_32(error, nmc, (vap->va_flags & SF_ARCHIVED) ? 
1 : 0); attrbytes += NFSX_UNSIGNED; @@ -920,26 +2230,56 @@ nfsm_chain_add_fattr4_f(struct nfsm_chain *nmc, struct vnode_attr *vap, struct n attrbytes += NFSX_UNSIGNED; } if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_OWNER)) { - /* XXX Need ID mapping infrastructure - use ugly hack for now */ - if (vap->va_uid == 0) - slen = snprintf(s, sizeof(s), "root@localdomain"); - else if (vap->va_uid == (uid_t)-2) - slen = snprintf(s, sizeof(s), "nobody@localdomain"); - else - slen = snprintf(s, sizeof(s), "%d", vap->va_uid); - nfsm_chain_add_string(error, nmc, s, slen); - attrbytes += NFSX_UNSIGNED + nfsm_rndup(slen); + nfsmout_if(error); + /* if we have va_uuuid use it, otherwise use uid */ + if (!VATTR_IS_ACTIVE(vap, va_uuuid)) { + error = kauth_cred_uid2guid(vap->va_uid, &vap->va_uuuid); + nfsmout_if(error); + } + len = slen; + error = nfs4_guid2id(&vap->va_uuuid, s, &len, 0); + if (error == ENOSPC) { + if (s != sbuf) { + FREE(s, M_TEMP); + s = sbuf; + } + len += 8; + MALLOC(s, char*, len, M_TEMP, M_WAITOK); + if (s) { + slen = len; + error = nfs4_guid2id(&vap->va_uuuid, s, &len, 0); + } else { + error = ENOMEM; + } + } + nfsm_chain_add_name(error, nmc, s, len, nmp); + attrbytes += NFSX_UNSIGNED + nfsm_rndup(len); } if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_OWNER_GROUP)) { - /* XXX Need ID mapping infrastructure - use ugly hack for now */ - if (vap->va_gid == 0) - slen = snprintf(s, sizeof(s), "root@localdomain"); - else if (vap->va_gid == (gid_t)-2) - slen = snprintf(s, sizeof(s), "nobody@localdomain"); - else - slen = snprintf(s, sizeof(s), "%d", vap->va_gid); - nfsm_chain_add_string(error, nmc, s, slen); - attrbytes += NFSX_UNSIGNED + nfsm_rndup(slen); + nfsmout_if(error); + /* if we have va_guuid use it, otherwise use gid */ + if (!VATTR_IS_ACTIVE(vap, va_guuid)) { + error = kauth_cred_gid2guid(vap->va_gid, &vap->va_guuid); + nfsmout_if(error); + } + len = slen; + error = nfs4_guid2id(&vap->va_guuid, s, &len, 1); + if (error == ENOSPC) { + if (s != sbuf) { + FREE(s, M_TEMP); + 
s = sbuf; + } + len += 8; + MALLOC(s, char*, len, M_TEMP, M_WAITOK); + if (s) { + slen = len; + error = nfs4_guid2id(&vap->va_guuid, s, &len, 1); + } else { + error = ENOMEM; + } + } + nfsm_chain_add_name(error, nmc, s, len, nmp); + attrbytes += NFSX_UNSIGNED + nfsm_rndup(len); } // NFS_BITMAP_SET(bitmap, NFS_FATTR_SYSTEM) if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_TIME_ACCESS_SET)) { @@ -978,16 +2318,100 @@ nfsm_chain_add_fattr4_f(struct nfsm_chain *nmc, struct vnode_attr *vap, struct n /* Now, set the attribute data length */ *pattrbytes = txdr_unsigned(attrbytes); nfsmout: + if (s && (s != sbuf)) + FREE(s, M_TEMP); return (error); } +/* + * Got the given error and need to start recovery (if not already started). + * Note: nmp must be locked! + */ +void +nfs_need_recover(struct nfsmount *nmp, int error) +{ + int wake = !(nmp->nm_state & NFSSTA_RECOVER); + + nmp->nm_state |= NFSSTA_RECOVER; + if ((error == NFSERR_ADMIN_REVOKED) || + (error == NFSERR_EXPIRED) || + (error == NFSERR_STALE_CLIENTID)) + nmp->nm_state |= NFSSTA_RECOVER_EXPIRED; + if (wake) + nfs_mount_sock_thread_wake(nmp); +} + +/* + * After recovery due to state expiry, check each node and + * drop any lingering delegation we thought we had. + * + * If a node has an open that is not lost and is not marked + * for reopen, then we hold onto any delegation because it is + * likely newly-granted. 
+ */ +static void +nfs4_expired_check_delegation(nfsnode_t np, vfs_context_t ctx) +{ + struct nfsmount *nmp = NFSTONMP(np); + struct nfs_open_file *nofp; + int drop = 1; + + if ((np->n_flag & NREVOKE) || !(np->n_openflags & N_DELEG_MASK)) + return; + + lck_mtx_lock(&np->n_openlock); + + TAILQ_FOREACH(nofp, &np->n_opens, nof_link) { + if (!nofp->nof_opencnt) + continue; + if (nofp->nof_flags & NFS_OPEN_FILE_LOST) + continue; + if (nofp->nof_flags & NFS_OPEN_FILE_REOPEN) + continue; + /* we have an open that is not lost and not marked for reopen */ + // XXX print out what's keeping this node from dropping the delegation. + NP(nofp->nof_np, "nfs4_expired_check_delegation: !drop: opencnt %d flags 0x%x access %d %d mmap %d %d", + nofp->nof_opencnt, nofp->nof_flags, + nofp->nof_access, nofp->nof_deny, + nofp->nof_mmap_access, nofp->nof_mmap_deny); + drop = 0; + break; + } + + if (drop) { + /* need to drop a delegation */ + if (np->n_dreturn.tqe_next != NFSNOLIST) { + /* remove this node from the delegation return list */ + lck_mtx_lock(&nmp->nm_lock); + if (np->n_dreturn.tqe_next != NFSNOLIST) { + TAILQ_REMOVE(&nmp->nm_dreturnq, np, n_dreturn); + np->n_dreturn.tqe_next = NFSNOLIST; + } + lck_mtx_unlock(&nmp->nm_lock); + } + if (np->n_openflags & N_DELEG_MASK) { + np->n_openflags &= ~N_DELEG_MASK; + lck_mtx_lock(&nmp->nm_lock); + if (np->n_dlink.tqe_next != NFSNOLIST) { + TAILQ_REMOVE(&nmp->nm_delegations, np, n_dlink); + np->n_dlink.tqe_next = NFSNOLIST; + } + lck_mtx_unlock(&nmp->nm_lock); + nfs4_delegreturn_rpc(nmp, np->n_fhp, np->n_fhsize, &np->n_dstateid, + 0, vfs_context_thread(ctx), vfs_context_ucred(ctx)); + } + } + + lck_mtx_unlock(&np->n_openlock); +} + /* * Recover state for an NFS mount. * * Iterates over all open files, reclaiming opens and lock state. 
*/ void -nfs4_recover(struct nfsmount *nmp) +nfs_recover(struct nfsmount *nmp) { struct timespec ts = { 1, 0 }; int error, lost, reopen; @@ -996,6 +2420,8 @@ nfs4_recover(struct nfsmount *nmp) struct nfs_file_lock *nflp, *nextnflp; struct nfs_lock_owner *nlop; thread_t thd = current_thread(); + nfsnode_t np, nextnp; + struct timeval now; restart: error = 0; @@ -1020,25 +2446,36 @@ nfs4_recover(struct nfsmount *nmp) } while (nmp->nm_stateinuse); if (error) { if (error == EPIPE) - printf("nfs recovery reconnecting\n"); + printf("nfs recovery reconnecting for %s, 0x%x\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname, nmp->nm_stategenid); else - printf("nfs recovery aborted\n"); + printf("nfs recovery aborted for %s, 0x%x\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname, nmp->nm_stategenid); lck_mtx_unlock(&nmp->nm_lock); return; } - printf("nfs recovery started\n"); + microuptime(&now); + if (now.tv_sec == nmp->nm_recover_start) { + printf("nfs recovery throttled for %s, 0x%x\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname, nmp->nm_stategenid); + lck_mtx_unlock(&nmp->nm_lock); + tsleep(&lbolt, (PZERO-1), "nfsrecoverrestart", hz); + goto restart; + } + nmp->nm_recover_start = now.tv_sec; if (++nmp->nm_stategenid == 0) ++nmp->nm_stategenid; + printf("nfs recovery started for %s, 0x%x\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname, nmp->nm_stategenid); lck_mtx_unlock(&nmp->nm_lock); /* for each open owner... */ TAILQ_FOREACH(noop, &nmp->nm_open_owners, noo_link) { /* for each of its opens... 
*/ TAILQ_FOREACH(nofp, &noop->noo_opens, nof_oolink) { - if (!nofp->nof_access || (nofp->nof_flags & NFS_OPEN_FILE_LOST)) + if (!nofp->nof_access || (nofp->nof_flags & NFS_OPEN_FILE_LOST) || (nofp->nof_np->n_flag & NREVOKE)) continue; lost = reopen = 0; + /* for NFSv2/v3, just skip straight to lock reclaim */ + if (nmp->nm_vers < NFS_VER4) + goto reclaim_locks; if (nofp->nof_rw_drw) error = nfs4_open_reclaim_rpc(nofp, NFS_OPEN_SHARE_ACCESS_BOTH, NFS_OPEN_SHARE_DENY_BOTH); if (!error && nofp->nof_w_drw) @@ -1056,45 +2493,80 @@ nfs4_recover(struct nfsmount *nmp) */ if (!error && nofp->nof_rw) { error = nfs4_open_reclaim_rpc(nofp, NFS_OPEN_SHARE_ACCESS_BOTH, NFS_OPEN_SHARE_DENY_NONE); - if ((error == NFSERR_ADMIN_REVOKED) || (error == NFSERR_EXPIRED) || (error == NFSERR_NO_GRACE)) - reopen = 1; + if ((error == NFSERR_ADMIN_REVOKED) || (error == NFSERR_EXPIRED) || (error == NFSERR_NO_GRACE)) { + reopen = error; + error = 0; + } } - if (!error && nofp->nof_w) { + if (!error && !reopen && nofp->nof_w) { error = nfs4_open_reclaim_rpc(nofp, NFS_OPEN_SHARE_ACCESS_WRITE, NFS_OPEN_SHARE_DENY_NONE); - if ((error == NFSERR_ADMIN_REVOKED) || (error == NFSERR_EXPIRED) || (error == NFSERR_NO_GRACE)) - reopen = 1; + if ((error == NFSERR_ADMIN_REVOKED) || (error == NFSERR_EXPIRED) || (error == NFSERR_NO_GRACE)) { + reopen = error; + error = 0; + } } - if (!error && nofp->nof_r) { + if (!error && !reopen && nofp->nof_r) { error = nfs4_open_reclaim_rpc(nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_NONE); - if ((error == NFSERR_ADMIN_REVOKED) || (error == NFSERR_EXPIRED) || (error == NFSERR_NO_GRACE)) - reopen = 1; + if ((error == NFSERR_ADMIN_REVOKED) || (error == NFSERR_EXPIRED) || (error == NFSERR_NO_GRACE)) { + reopen = error; + error = 0; + } } - if (error) { + /* + * If we hold delegated state but we don't have any non-delegated opens, + * then we should attempt to claim that state now (but don't return the + * delegation unless asked to). 
+ */ + if ((nofp->nof_d_rw_drw || nofp->nof_d_w_drw || nofp->nof_d_r_drw || + nofp->nof_d_rw_dw || nofp->nof_d_w_dw || nofp->nof_d_r_dw || + nofp->nof_d_rw || nofp->nof_d_w || nofp->nof_d_r) && + (!nofp->nof_rw_drw && !nofp->nof_w_drw && !nofp->nof_r_drw && + !nofp->nof_rw_dw && !nofp->nof_w_dw && !nofp->nof_r_dw && + !nofp->nof_rw && !nofp->nof_w && !nofp->nof_r)) { + if (!error && !nfs_open_state_set_busy(nofp->nof_np, NULL)) { + error = nfs4_claim_delegated_state_for_node(nofp->nof_np, R_RECOVER); + if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) + reopen = EAGAIN; + nfs_open_state_clear_busy(nofp->nof_np); + /* if claim didn't go well, we may need to return delegation now */ + if (nofp->nof_np->n_openflags & N_DELEG_RETURN) { + nfs4_delegation_return(nofp->nof_np, R_RECOVER, thd, noop->noo_cred); + if (!(nmp->nm_sockflags & NMSOCK_READY)) + error = ETIMEDOUT; /* looks like we need a reconnect */ + } + } + } + + /* + * Handle any issue claiming open state. + * Potential reopens need to first confirm that there are no locks. + */ + if (error || reopen) { /* restart recovery? */ if ((error == ETIMEDOUT) || nfs_mount_state_error_should_restart(error)) { if (error == ETIMEDOUT) nfs_need_reconnect(nmp); tsleep(&lbolt, (PZERO-1), "nfsrecoverrestart", 0); - printf("nfs recovery restarting %d\n", error); + printf("nfs recovery restarting for %s, 0x%x, error %d\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nmp->nm_stategenid, error); goto restart; } - if (reopen && (nfs4_check_for_locks(noop, nofp) == 0)) { + if (reopen && (nfs_check_for_locks(noop, nofp) == 0)) { /* just reopen the file on next access */ - const char *vname = vnode_getname(NFSTOV(nofp->nof_np)); - printf("nfs4_recover: %d, need reopen for %s\n", error, vname ? 
vname : "???"); - vnode_putname(vname); + NP(nofp->nof_np, "nfs_recover: %d, need reopen for %d %p 0x%x", reopen, + kauth_cred_getuid(noop->noo_cred), nofp->nof_np, nofp->nof_np->n_flag); lck_mtx_lock(&nofp->nof_lock); nofp->nof_flags |= NFS_OPEN_FILE_REOPEN; lck_mtx_unlock(&nofp->nof_lock); - error = 0; } else { /* open file state lost */ + if (reopen) + NP(nofp->nof_np, "nfs_recover: %d, can't reopen because of locks %d %p", reopen, + kauth_cred_getuid(noop->noo_cred), nofp->nof_np); lost = 1; error = 0; - lck_mtx_lock(&nofp->nof_lock); - nofp->nof_flags &= ~NFS_OPEN_FILE_REOPEN; - lck_mtx_unlock(&nofp->nof_lock); + reopen = 0; } } else { /* no error, so make sure the reopen flag isn't set */ @@ -1102,83 +2574,97 @@ nfs4_recover(struct nfsmount *nmp) nofp->nof_flags &= ~NFS_OPEN_FILE_REOPEN; lck_mtx_unlock(&nofp->nof_lock); } + /* * Scan this node's lock owner list for entries with this open owner, * then walk the lock owner's held lock list recovering each lock. */ -rescanlocks: +reclaim_locks: TAILQ_FOREACH(nlop, &nofp->nof_np->n_lock_owners, nlo_link) { + if (lost || reopen) + break; if (nlop->nlo_open_owner != noop) continue; TAILQ_FOREACH_SAFE(nflp, &nlop->nlo_locks, nfl_lolink, nextnflp) { + /* skip dead & blocked lock requests (shouldn't be any in the held lock list) */ if (nflp->nfl_flags & (NFS_FILE_LOCK_DEAD|NFS_FILE_LOCK_BLOCKED)) continue; - if (!lost) { - error = nfs4_lock_rpc(nofp->nof_np, nofp, nflp, 1, thd, noop->noo_cred); - if (!error) - continue; - /* restart recovery? 
*/ - if ((error == ETIMEDOUT) || nfs_mount_state_error_should_restart(error)) { - if (error == ETIMEDOUT) - nfs_need_reconnect(nmp); - tsleep(&lbolt, (PZERO-1), "nfsrecoverrestart", 0); - printf("nfs recovery restarting %d\n", error); - goto restart; - } - /* lock state lost - attempt to close file */ - lost = 1; - error = nfs4_close_rpc(nofp->nof_np, nofp, NULL, noop->noo_cred, R_RECOVER); - if ((error == ETIMEDOUT) || nfs_mount_state_error_should_restart(error)) { - if (error == ETIMEDOUT) - nfs_need_reconnect(nmp); - tsleep(&lbolt, (PZERO-1), "nfsrecoverrestart", 0); - printf("nfs recovery restarting %d\n", error); - goto restart; - } - error = 0; - /* rescan locks so we can drop them all */ - goto rescanlocks; - } - if (lost) { - /* kill/remove the lock */ - lck_mtx_lock(&nofp->nof_np->n_openlock); - nflp->nfl_flags |= NFS_FILE_LOCK_DEAD; - lck_mtx_lock(&nlop->nlo_lock); - nextnflp = TAILQ_NEXT(nflp, nfl_lolink); - TAILQ_REMOVE(&nlop->nlo_locks, nflp, nfl_lolink); - lck_mtx_unlock(&nlop->nlo_lock); - if (nflp->nfl_blockcnt) { - /* wake up anyone blocked on this lock */ - wakeup(nflp); - } else { - /* remove nflp from lock list and destroy */ - TAILQ_REMOVE(&nofp->nof_np->n_locks, nflp, nfl_link); - nfs_file_lock_destroy(nflp); - } - lck_mtx_unlock(&nofp->nof_np->n_openlock); + /* skip delegated locks */ + if (nflp->nfl_flags & NFS_FILE_LOCK_DELEGATED) + continue; + error = nmp->nm_funcs->nf_setlock_rpc(nofp->nof_np, nofp, nflp, 1, R_RECOVER, thd, noop->noo_cred); + if (error) + NP(nofp->nof_np, "nfs: lock reclaim (0x%llx, 0x%llx) %s %d", + nflp->nfl_start, nflp->nfl_end, + error ? "failed" : "succeeded", error); + if (!error) + continue; + /* restart recovery? 
*/ + if ((error == ETIMEDOUT) || nfs_mount_state_error_should_restart(error)) { + if (error == ETIMEDOUT) + nfs_need_reconnect(nmp); + tsleep(&lbolt, (PZERO-1), "nfsrecoverrestart", 0); + printf("nfs recovery restarting for %s, 0x%x, error %d\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nmp->nm_stategenid, error); + goto restart; } + /* lock state lost - attempt to close file */ + lost = 1; + error = 0; + break; } } + + /* + * If we've determined that we need to reopen the file then we probably + * didn't receive any delegation we think we hold. We should attempt to + * return that delegation (and claim any delegated state). + * + * If we hold a delegation that is marked for return, then we should + * return it now. + */ + if ((nofp->nof_np->n_openflags & N_DELEG_RETURN) || + (reopen && (nofp->nof_np->n_openflags & N_DELEG_MASK))) { + nfs4_delegation_return(nofp->nof_np, R_RECOVER, thd, noop->noo_cred); + if (!(nmp->nm_sockflags & NMSOCK_READY)) { + /* looks like we need a reconnect */ + tsleep(&lbolt, (PZERO-1), "nfsrecoverrestart", 0); + printf("nfs recovery restarting for %s, 0x%x, error %d\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nmp->nm_stategenid, error); + goto restart; + } + } + if (lost) { /* revoke open file state */ - lck_mtx_lock(&nofp->nof_lock); - nofp->nof_flags |= NFS_OPEN_FILE_LOST; - lck_mtx_unlock(&nofp->nof_lock); - const char *vname = vnode_getname(NFSTOV(nofp->nof_np)); - printf("nfs4_recover: state lost for %s\n", vname ? 
vname : "???"); - vnode_putname(vname); + NP(nofp->nof_np, "nfs_recover: state lost for %d %p 0x%x", + kauth_cred_getuid(noop->noo_cred), nofp->nof_np, nofp->nof_np->n_flag); + nfs_revoke_open_state_for_node(nofp->nof_np); } } } if (!error) { + /* If state expired, make sure we're not holding onto any stale delegations */ lck_mtx_lock(&nmp->nm_lock); - nmp->nm_state &= ~NFSSTA_RECOVER; + if ((nmp->nm_vers >= NFS_VER4) && (nmp->nm_state & NFSSTA_RECOVER_EXPIRED)) { +recheckdeleg: + TAILQ_FOREACH_SAFE(np, &nmp->nm_delegations, n_dlink, nextnp) { + lck_mtx_unlock(&nmp->nm_lock); + nfs4_expired_check_delegation(np, vfs_context_kernel()); + lck_mtx_lock(&nmp->nm_lock); + if (nextnp == NFSNOLIST) + goto recheckdeleg; + } + } + nmp->nm_state &= ~(NFSSTA_RECOVER|NFSSTA_RECOVER_EXPIRED); wakeup(&nmp->nm_state); - printf("nfs recovery completed\n"); + printf("nfs recovery completed for %s, 0x%x\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nmp->nm_stategenid); lck_mtx_unlock(&nmp->nm_lock); } else { - printf("nfs recovery failed %d\n", error); + printf("nfs recovery failed for %s, 0x%x, error %d\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nmp->nm_stategenid, error); } } diff --git a/bsd/nfs/nfs4_vnops.c b/bsd/nfs/nfs4_vnops.c index ffd12d88f..ca874aa7c 100644 --- a/bsd/nfs/nfs4_vnops.c +++ b/bsd/nfs/nfs4_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006-2009 Apple Inc. All rights reserved. + * Copyright (c) 2006-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -46,7 +46,9 @@ #include #include #include -#include +#include +#include +#include #include @@ -78,17 +80,22 @@ #include int -nfs4_access_rpc(nfsnode_t np, u_int32_t *mode, vfs_context_t ctx) +nfs4_access_rpc(nfsnode_t np, u_int32_t *access, vfs_context_t ctx) { int error = 0, lockerror = ENOENT, status, numops, slot; u_int64_t xid; struct nfsm_chain nmreq, nmrep; struct timeval now; - uint32_t access = 0, supported = 0, missing; + uint32_t access_result = 0, supported = 0, missing; struct nfsmount *nmp = NFSTONMP(np); int nfsvers = nmp->nm_vers; uid_t uid; + struct nfsreq_secinfo_args si; + if (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (0); + + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -101,15 +108,14 @@ nfs4_access_rpc(nfsnode_t np, u_int32_t *mode, vfs_context_t ctx) nfsm_chain_add_fh(error, &nmreq, nfsvers, np->n_fhp, np->n_fhsize); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_ACCESS); - nfsm_chain_add_32(error, &nmreq, *mode); + nfsm_chain_add_32(error, &nmreq, *access); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, np); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request(np, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &nmrep, &xid, &status); + error = nfs_request(np, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &si, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; @@ -118,9 +124,9 @@ nfs4_access_rpc(nfsnode_t np, u_int32_t *mode, vfs_context_t ctx) nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); nfsm_chain_op_check(error, &nmrep, NFS_OP_ACCESS); nfsm_chain_get_32(error, &nmrep, supported); - nfsm_chain_get_32(error, &nmrep, access); + 
nfsm_chain_get_32(error, &nmrep, access_result); nfsmout_if(error); - if ((missing = (*mode & ~supported))) { + if ((missing = (*access & ~supported))) { /* missing support for something(s) we wanted */ if (missing & NFS_ACCESS_DELETE) { /* @@ -129,25 +135,35 @@ nfs4_access_rpc(nfsnode_t np, u_int32_t *mode, vfs_context_t ctx) * and just let any subsequent delete action fail * if it really isn't deletable. */ - access |= NFS_ACCESS_DELETE; + access_result |= NFS_ACCESS_DELETE; } } + /* ".zfs" subdirectories may erroneously give a denied answer for modify/delete */ + if (nfs_access_dotzfs) { + vnode_t dvp = NULLVP; + if (np->n_flag & NISDOTZFSCHILD) /* may be able to create/delete snapshot dirs */ + access_result |= (NFS_ACCESS_MODIFY|NFS_ACCESS_EXTEND|NFS_ACCESS_DELETE); + else if (((dvp = vnode_getparent(NFSTOV(np))) != NULLVP) && (VTONFS(dvp)->n_flag & NISDOTZFSCHILD)) + access_result |= NFS_ACCESS_DELETE; /* may be able to delete snapshot dirs */ + if (dvp != NULLVP) + vnode_put(dvp); + } /* Some servers report DELETE support but erroneously give a denied answer. 
*/ - if ((*mode & NFS_ACCESS_DELETE) && nfs_access_delete && !(access & NFS_ACCESS_DELETE)) - access |= NFS_ACCESS_DELETE; + if (nfs_access_delete && (*access & NFS_ACCESS_DELETE) && !(access_result & NFS_ACCESS_DELETE)) + access_result |= NFS_ACCESS_DELETE; nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsm_chain_loadattr(error, &nmrep, np, nfsvers, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, np, nfsvers, &xid); nfsmout_if(error); uid = kauth_cred_getuid(vfs_context_ucred(ctx)); - slot = nfs_node_mode_slot(np, uid, 1); - np->n_modeuid[slot] = uid; + slot = nfs_node_access_slot(np, uid, 1); + np->n_accessuid[slot] = uid; microuptime(&now); - np->n_modestamp[slot] = now.tv_sec; - np->n_mode[slot] = access; + np->n_accessstamp[slot] = now.tv_sec; + np->n_access[slot] = access_result; - /* pass back the mode returned with this request */ - *mode = np->n_mode[slot]; + /* pass back the access returned with this request */ + *access = np->n_access[slot]; nfsmout: if (!lockerror) nfs_node_unlock(np); @@ -162,18 +178,31 @@ nfs4_getattr_rpc( mount_t mp, u_char *fhp, size_t fhsize, + int flags, vfs_context_t ctx, struct nfs_vattr *nvap, u_int64_t *xidp) { struct nfsmount *nmp = mp ? 
VFSTONFS(mp) : NFSTONMP(np); - int error = 0, status, nfsvers, numops; + int error = 0, status, nfsvers, numops, rpcflags = 0, acls; + uint32_t bitmap[NFS_ATTR_BITMAP_LEN]; struct nfsm_chain nmreq, nmrep; + struct nfsreq_secinfo_args si; if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; + acls = (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_ACL); + + if (np && (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL)) { + nfs4_default_attrs_for_referral_trigger(VTONFS(np->n_parent), NULL, 0, nvap, NULL); + return (0); + } + + if (flags & NGA_MONITOR) /* vnode monitor requests should be soft */ + rpcflags = R_RECOVER; + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -186,20 +215,29 @@ nfs4_getattr_rpc( nfsm_chain_add_fh(error, &nmreq, nfsvers, fhp, fhsize); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + NFS_COPY_ATTRIBUTES(nfs_getattr_bitmap, bitmap); + if ((flags & NGA_ACL) && acls) + NFS_BITMAP_SET(bitmap, NFS_FATTR_ACL); + nfsm_chain_add_bitmap_supported(error, &nmreq, bitmap, nmp, np); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request(np, mp, &nmreq, NFSPROC4_COMPOUND, ctx, &nmrep, xidp, &status); + error = nfs_request2(np, mp, &nmreq, NFSPROC4_COMPOUND, + vfs_context_thread(ctx), vfs_context_ucred(ctx), + NULL, rpcflags, &nmrep, xidp, &status); nfsm_chain_skip_tag(error, &nmrep); nfsm_chain_get_32(error, &nmrep, numops); nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); nfsmout_if(error); - NFS_CLEAR_ATTRIBUTES(nvap->nva_bitmap); - error = nfs4_parsefattr(&nmrep, NULL, nvap, NULL, NULL); + error = nfs4_parsefattr(&nmrep, NULL, nvap, NULL, NULL, NULL); + nfsmout_if(error); + if ((flags & NGA_ACL) && acls && !NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_ACL)) { + /* 
we asked for the ACL but didn't get one... assume there isn't one */ + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_ACL); + nvap->nva_acl = NULL; + } nfsmout: nfsm_chain_cleanup(&nmreq); nfsm_chain_cleanup(&nmrep); @@ -214,10 +252,14 @@ nfs4_readlink_rpc(nfsnode_t np, char *buf, uint32_t *buflenp, vfs_context_t ctx) uint32_t len = 0; u_int64_t xid; struct nfsm_chain nmreq, nmrep; + struct nfsreq_secinfo_args si; nmp = NFSTONMP(np); if (!nmp) return (ENXIO); + if (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -230,14 +272,13 @@ nfs4_readlink_rpc(nfsnode_t np, char *buf, uint32_t *buflenp, vfs_context_t ctx) nfsm_chain_add_fh(error, &nmreq, NFS_VER4, np->n_fhp, np->n_fhsize); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, np); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_READLINK); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request(np, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &nmrep, &xid, &status); + error = nfs_request(np, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &si, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; @@ -245,7 +286,7 @@ nfs4_readlink_rpc(nfsnode_t np, char *buf, uint32_t *buflenp, vfs_context_t ctx) nfsm_chain_get_32(error, &nmrep, numops); nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsm_chain_loadattr(error, &nmrep, np, NFS_VER4, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, np, NFS_VER4, &xid); nfsm_chain_op_check(error, &nmrep, NFS_OP_READLINK); nfsm_chain_get_32(error, &nmrep, len); nfsmout_if(error); @@ -280,12 +321,16 @@ nfs4_read_rpc_async( int error = 
0, nfsvers, numops; nfs_stateid stateid; struct nfsm_chain nmreq; + struct nfsreq_secinfo_args si; nmp = NFSTONMP(np); if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; + if (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); // PUTFH, READ, GETATTR @@ -303,12 +348,11 @@ nfs4_read_rpc_async( nfsm_chain_add_32(error, &nmreq, len); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, np); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request_async(np, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, cb, reqp); + error = nfs_request_async(np, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, &si, 0, cb, reqp); nfsmout: nfsm_chain_cleanup(&nmreq); return (error); @@ -354,7 +398,7 @@ nfs4_read_rpc_async_finish( error = nfsm_chain_get_uio(&nmrep, *lenp, uio); } nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsm_chain_loadattr(error, &nmrep, np, nfsvers, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, np, nfsvers, &xid); if (!lockerror) nfs_node_unlock(np); if (eofp) { @@ -363,6 +407,8 @@ nfs4_read_rpc_async_finish( *eofp = eof; } nfsm_chain_cleanup(&nmrep); + if (np->n_vattr.nva_flags & NFS_FFLAG_IS_ATTR) + microuptime(&np->n_lastio); return (error); } @@ -378,15 +424,25 @@ nfs4_write_rpc_async( struct nfsreq **reqp) { struct nfsmount *nmp; + mount_t mp; int error = 0, nfsvers, numops; nfs_stateid stateid; struct nfsm_chain nmreq; + struct nfsreq_secinfo_args si; nmp = NFSTONMP(np); if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; + if (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); + + /* for async mounts, don't bother sending sync write requests */ + if ((iomode != NFS_WRITE_UNSTABLE) && 
nfs_allow_async && + ((mp = NFSTOMP(np))) && (vfs_flags(mp) & MNT_ASYNC)) + iomode = NFS_WRITE_UNSTABLE; + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); // PUTFH, WRITE, GETATTR @@ -407,13 +463,12 @@ nfs4_write_rpc_async( error = nfsm_chain_add_uio(&nmreq, uio, len); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, np); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request_async(np, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, cb, reqp); + error = nfs_request_async(np, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, &si, 0, cb, reqp); nfsmout: nfsm_chain_cleanup(&nmreq); return (error); @@ -475,7 +530,7 @@ nfs4_write_rpc_async_finish( } lck_mtx_unlock(&nmp->nm_lock); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsm_chain_loadattr(error, &nmrep, np, nfsvers, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, np, nfsvers, &xid); nfsmout: if (!lockerror) nfs_node_unlock(np); @@ -484,6 +539,8 @@ nfs4_write_rpc_async_finish( ((mp = NFSTOMP(np))) && (vfs_flags(mp) & MNT_ASYNC)) committed = NFS_WRITE_FILESYNC; *iomodep = committed; + if (np->n_vattr.nva_flags & NFS_FFLAG_IS_ATTR) + microuptime(&np->n_lastio); return (error); } @@ -500,11 +557,15 @@ nfs4_remove_rpc( int nfsvers, numops; u_int64_t xid; struct nfsm_chain nmreq, nmrep; + struct nfsreq_secinfo_args si; nmp = NFSTONMP(dnp); if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; + if (dnp->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); + NFSREQ_SECINFO_SET(&si, dnp, NULL, 0, NULL, 0); restart: nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -518,16 +579,15 @@ nfs4_remove_rpc( nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize); numops--; nfsm_chain_add_32(error, &nmreq, 
NFS_OP_REMOVE); - nfsm_chain_add_string(error, &nmreq, name, namelen); + nfsm_chain_add_name(error, &nmreq, name, namelen, nmp); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, dnp); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request2(dnp, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, 0, &nmrep, &xid, &status); + error = nfs_request2(dnp, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, &si, 0, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(dnp))) error = lockerror; @@ -538,7 +598,7 @@ nfs4_remove_rpc( remove_error = error; nfsm_chain_check_change_info(error, &nmrep, dnp); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsm_chain_loadattr(error, &nmrep, dnp, nfsvers, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, dnp, nfsvers, &xid); if (error && !lockerror) NATTRINVALIDATE(dnp); nfsmout: @@ -571,12 +631,18 @@ nfs4_rename_rpc( struct nfsmount *nmp; u_int64_t xid, savedxid; struct nfsm_chain nmreq, nmrep; + struct nfsreq_secinfo_args si; nmp = NFSTONMP(fdnp); if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; + if (fdnp->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); + if (tdnp->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); + NFSREQ_SECINFO_SET(&si, fdnp, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -594,23 +660,21 @@ nfs4_rename_rpc( nfsm_chain_add_fh(error, &nmreq, nfsvers, tdnp->n_fhp, tdnp->n_fhsize); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_RENAME); - nfsm_chain_add_string(error, &nmreq, fnameptr, fnamelen); - nfsm_chain_add_string(error, &nmreq, tnameptr, tnamelen); + nfsm_chain_add_name(error, &nmreq, fnameptr, fnamelen, nmp); + nfsm_chain_add_name(error, &nmreq, tnameptr, tnamelen, nmp); numops--; 
nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, tdnp); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_RESTOREFH); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, fdnp); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request(fdnp, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &nmrep, &xid, &status); + error = nfs_request(fdnp, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &si, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock2(fdnp, tdnp))) error = lockerror; @@ -625,13 +689,13 @@ nfs4_rename_rpc( /* directory attributes: if we don't get them, make sure to invalidate */ nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); savedxid = xid; - nfsm_chain_loadattr(error, &nmrep, tdnp, nfsvers, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, tdnp, nfsvers, &xid); if (error && !lockerror) NATTRINVALIDATE(tdnp); nfsm_chain_op_check(error, &nmrep, NFS_OP_RESTOREFH); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); xid = savedxid; - nfsm_chain_loadattr(error, &nmrep, fdnp, nfsvers, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, fdnp, nfsvers, &xid); if (error && !lockerror) NATTRINVALIDATE(fdnp); nfsmout: @@ -642,9 +706,6 @@ nfs4_rename_rpc( tdnp->n_flag |= NMODIFIED; nfs_node_unlock2(fdnp, tdnp); } - /* Kludge: Map EEXIST => 0 assuming that it is a reply to a retry. 
*/ - if (error == EEXIST) - error = 0; return (error); } @@ -655,7 +716,7 @@ int nfs4_readdir_rpc(nfsnode_t dnp, struct nfsbuf *bp, vfs_context_t ctx) { struct nfsmount *nmp; - int error = 0, lockerror, nfsvers, rdirplus, bigcookies, numops; + int error = 0, lockerror, nfsvers, namedattr, rdirplus, bigcookies, numops; int i, status, more_entries = 1, eof, bp_dropped = 0; uint32_t nmreaddirsize, nmrsize; uint32_t namlen, skiplen, fhlen, xlen, attrlen, reclen, space_free, space_needed; @@ -669,6 +730,7 @@ nfs4_readdir_rpc(nfsnode_t dnp, struct nfsbuf *bp, vfs_context_t ctx) const char *tag; uint32_t entry_attrs[NFS_ATTR_BITMAP_LEN]; struct timeval now; + struct nfsreq_secinfo_args si; nmp = NFSTONMP(dnp); if (!nmp) @@ -677,7 +739,11 @@ nfs4_readdir_rpc(nfsnode_t dnp, struct nfsbuf *bp, vfs_context_t ctx) nmreaddirsize = nmp->nm_readdirsize; nmrsize = nmp->nm_rsize; bigcookies = nmp->nm_state & NFSSTA_BIGCOOKIES; - rdirplus = ((nfsvers > NFS_VER2) && (nmp->nm_flag & NFSMNT_RDIRPLUS)) ? 1 : 0; + namedattr = (dnp->n_vattr.nva_flags & NFS_FFLAG_IS_ATTR) ? 1 : 0; + rdirplus = (NMFLAG(nmp, RDIRPLUS) || namedattr) ? 1 : 0; + if (dnp->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); + NFSREQ_SECINFO_SET(&si, dnp, NULL, 0, NULL, 0); /* * Set up attribute request for entries. 
@@ -686,18 +752,15 @@ nfs4_readdir_rpc(nfsnode_t dnp, struct nfsbuf *bp, vfs_context_t ctx) */ if (rdirplus) { tag = "readdirplus"; - for (i=0; i < NFS_ATTR_BITMAP_LEN; i++) - entry_attrs[i] = - nfs_getattr_bitmap[i] & - nmp->nm_fsattr.nfsa_supp_attr[i]; + NFS_COPY_ATTRIBUTES(nfs_getattr_bitmap, entry_attrs); NFS_BITMAP_SET(entry_attrs, NFS_FATTR_FILEHANDLE); } else { tag = "readdir"; NFS_CLEAR_ATTRIBUTES(entry_attrs); NFS_BITMAP_SET(entry_attrs, NFS_FATTR_TYPE); NFS_BITMAP_SET(entry_attrs, NFS_FATTR_FILEID); + NFS_BITMAP_SET(entry_attrs, NFS_FATTR_MOUNTED_ON_FILEID); } - /* XXX NFS_BITMAP_SET(entry_attrs, NFS_FATTR_MOUNTED_ON_FILEID); */ NFS_BITMAP_SET(entry_attrs, NFS_FATTR_RDATTR_ERROR); /* lock to protect access to cookie verifier */ @@ -722,8 +785,10 @@ nfs4_readdir_rpc(nfsnode_t dnp, struct nfsbuf *bp, vfs_context_t ctx) /* * The NFS client is responsible for the "." and ".." entries in the * directory. So, we put them at the start of the first buffer. + * Don't bother for attribute directories. */ - if ((bp->nb_lblkno == 0) && (ndbhp->ndbh_count == 0)) { + if (((bp->nb_lblkno == 0) && (ndbhp->ndbh_count == 0)) && + !(dnp->n_vattr.nva_flags & NFS_FFLAG_IS_ATTR)) { fh.fh_len = 0; fhlen = rdirplus ? fh.fh_len + 1 : 0; xlen = rdirplus ? (fhlen + sizeof(time_t)) : 0; @@ -790,20 +855,19 @@ nfs4_readdir_rpc(nfsnode_t dnp, struct nfsbuf *bp, vfs_context_t ctx) nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, dnp); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_READDIR); nfsm_chain_add_64(error, &nmreq, (cookie <= 2) ? 
0 : cookie); nfsm_chain_add_64(error, &nmreq, dnp->n_cookieverf); nfsm_chain_add_32(error, &nmreq, nmreaddirsize); nfsm_chain_add_32(error, &nmreq, nmrsize); - nfsm_chain_add_bitmap(error, &nmreq, entry_attrs, NFS_ATTR_BITMAP_LEN); + nfsm_chain_add_bitmap_supported(error, &nmreq, entry_attrs, nmp, dnp); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfs_node_unlock(dnp); nfsmout_if(error); - error = nfs_request(dnp, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &nmrep, &xid, &status); + error = nfs_request(dnp, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &si, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(dnp))) error = lockerror; @@ -813,7 +877,7 @@ nfs4_readdir_rpc(nfsnode_t dnp, struct nfsbuf *bp, vfs_context_t ctx) nfsm_chain_get_32(error, &nmrep, numops); nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsm_chain_loadattr(error, &nmrep, dnp, nfsvers, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, dnp, nfsvers, &xid); nfsm_chain_op_check(error, &nmrep, NFS_OP_READDIR); nfsm_chain_get_64(error, &nmrep, dnp->n_cookieverf); nfsm_chain_get_32(error, &nmrep, more_entries); @@ -898,14 +962,21 @@ nfs4_readdir_rpc(nfsnode_t dnp, struct nfsbuf *bp, vfs_context_t ctx) nfsm_rndup(namlen + skiplen) - nfsm_rndup(namlen)); nfsmout_if(error); nvattrp = rdirplus ? NFS_DIR_BUF_NVATTR(bp, ndbhp->ndbh_count) : &nvattr; - NFS_CLEAR_ATTRIBUTES(nvattrp->nva_bitmap); - error = nfs4_parsefattr(&nmrep, NULL, nvattrp, &fh, NULL); + error = nfs4_parsefattr(&nmrep, NULL, nvattrp, &fh, NULL, NULL); + if (!error && NFS_BITMAP_ISSET(nvattrp->nva_bitmap, NFS_FATTR_ACL)) { + /* we do NOT want ACLs returned to us here */ + NFS_BITMAP_CLR(nvattrp->nva_bitmap, NFS_FATTR_ACL); + if (nvattrp->nva_acl) { + kauth_acl_free(nvattrp->nva_acl); + nvattrp->nva_acl = NULL; + } + } if (error && NFS_BITMAP_ISSET(nvattrp->nva_bitmap, NFS_FATTR_RDATTR_ERROR)) { - /* OK, we didn't get attributes, whatever... 
*/ - if (rdirplus) /* mark the attributes invalid */ - bzero(nvattrp, sizeof(struct nfs_vattr)); - else - NFS_CLEAR_ATTRIBUTES(nvattrp->nva_bitmap); + /* OK, we may not have gotten all of the attributes but we will use what we can. */ + if ((error == NFSERR_MOVED) || (error == NFSERR_INVAL)) { + /* set this up to look like a referral trigger */ + nfs4_default_attrs_for_referral_trigger(dnp, dp->d_name, namlen, nvattrp, &fh); + } error = 0; } /* check for more entries after this one */ @@ -913,7 +984,9 @@ nfs4_readdir_rpc(nfsnode_t dnp, struct nfsbuf *bp, vfs_context_t ctx) nfsmout_if(error); /* Skip any "." and ".." entries returned from server. */ - if ((dp->d_name[0] == '.') && ((namlen == 1) || ((namlen == 2) && (dp->d_name[1] == '.')))) { + /* Also skip any bothersome named attribute entries. */ + if (((dp->d_name[0] == '.') && ((namlen == 1) || ((namlen == 2) && (dp->d_name[1] == '.')))) || + (namedattr && (namlen == 11) && (!strcmp(dp->d_name, "SUNWattr_ro") || !strcmp(dp->d_name, "SUNWattr_rw")))) { lastcookie = cookie; continue; } @@ -1001,23 +1074,30 @@ nfs4_lookup_rpc_async( vfs_context_t ctx, struct nfsreq **reqp) { - int error = 0, isdotdot = 0, getattrs = 1, nfsvers, numops; + int error = 0, isdotdot = 0, nfsvers, numops; struct nfsm_chain nmreq; uint32_t bitmap[NFS_ATTR_BITMAP_LEN]; struct nfsmount *nmp; + struct nfsreq_secinfo_args si; nmp = NFSTONMP(dnp); if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; + if (dnp->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); - if ((name[0] == '.') && (name[1] == '.') && (namelen == 2)) + if ((name[0] == '.') && (name[1] == '.') && (namelen == 2)) { isdotdot = 1; + NFSREQ_SECINFO_SET(&si, dnp, NULL, 0, NULL, 0); + } else { + NFSREQ_SECINFO_SET(&si, dnp, dnp->n_fhp, dnp->n_fhsize, name, namelen); + } nfsm_chain_null(&nmreq); - // PUTFH, GETATTR, LOOKUP(P), GETATTR (FH) - numops = getattrs ? 
4 : 3; + // PUTFH, GETATTR, LOOKUP(P), GETFH, GETATTR (FH) + numops = 5; nfsm_chain_build_alloc_init(error, &nmreq, 20 * NFSX_UNSIGNED + namelen); nfsm_chain_add_compound_header(error, &nmreq, "lookup", numops); numops--; @@ -1025,50 +1105,59 @@ nfs4_lookup_rpc_async( nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, dnp); numops--; if (isdotdot) { nfsm_chain_add_32(error, &nmreq, NFS_OP_LOOKUPP); } else { nfsm_chain_add_32(error, &nmreq, NFS_OP_LOOKUP); - nfsm_chain_add_string(error, &nmreq, name, namelen); - } - if (getattrs) { - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - NFS_COPY_ATTRIBUTES(nfs_getattr_bitmap, bitmap); - NFS_BITMAP_SET(bitmap, NFS_FATTR_FILEHANDLE); - nfsm_chain_add_bitmap_masked(error, &nmreq, bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_name(error, &nmreq, name, namelen, nmp); } + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETFH); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); + NFS_COPY_ATTRIBUTES(nfs_getattr_bitmap, bitmap); + /* some ".zfs" directories can't handle being asked for some attributes */ + if ((dnp->n_flag & NISDOTZFS) && !isdotdot) + NFS_BITMAP_CLR(bitmap, NFS_FATTR_NAMED_ATTR); + if ((dnp->n_flag & NISDOTZFSCHILD) && isdotdot) + NFS_BITMAP_CLR(bitmap, NFS_FATTR_NAMED_ATTR); + if (((namelen == 4) && (name[0] == '.') && (name[1] == 'z') && (name[2] == 'f') && (name[3] == 's'))) + NFS_BITMAP_CLR(bitmap, NFS_FATTR_NAMED_ATTR); + nfsm_chain_add_bitmap_supported(error, &nmreq, bitmap, nmp, NULL); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC4_COMPOUND, - 
vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, reqp); + vfs_context_thread(ctx), vfs_context_ucred(ctx), &si, 0, NULL, reqp); nfsmout: nfsm_chain_cleanup(&nmreq); return (error); } + int nfs4_lookup_rpc_async_finish( nfsnode_t dnp, - __unused vfs_context_t ctx, + char *name, + int namelen, + vfs_context_t ctx, struct nfsreq *req, u_int64_t *xidp, fhandle_t *fhp, struct nfs_vattr *nvap) { - int error = 0, lockerror = ENOENT, status, nfsvers, numops; - uint32_t val = 0; + int error = 0, lockerror = ENOENT, status, nfsvers, numops, isdotdot = 0; + uint32_t op = NFS_OP_LOOKUP; u_int64_t xid; struct nfsmount *nmp; struct nfsm_chain nmrep; nmp = NFSTONMP(dnp); nfsvers = nmp->nm_vers; + if ((name[0] == '.') && (name[1] == '.') && (namelen == 2)) + isdotdot = 1; nfsm_chain_null(&nmrep); @@ -1082,47 +1171,69 @@ nfs4_lookup_rpc_async_finish( nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); if (xidp) *xidp = xid; - nfsm_chain_loadattr(error, &nmrep, dnp, nfsvers, NULL, &xid); - - // nfsm_chain_op_check(error, &nmrep, (isdotdot ? NFS_OP_LOOKUPP : NFS_OP_LOOKUP)); - nfsm_chain_get_32(error, &nmrep, val); - nfsm_assert(error, (val == NFS_OP_LOOKUPP) || (val == NFS_OP_LOOKUP), EBADRPC); - nfsm_chain_get_32(error, &nmrep, val); - nfsm_assert(error, (val == NFS_OK), val); + nfsm_chain_loadattr(error, &nmrep, dnp, nfsvers, &xid); + nfsm_chain_op_check(error, &nmrep, (isdotdot ? 
NFS_OP_LOOKUPP : NFS_OP_LOOKUP)); nfsmout_if(error || !fhp || !nvap); + nfsm_chain_op_check(error, &nmrep, NFS_OP_GETFH); + nfsm_chain_get_32(error, &nmrep, fhp->fh_len); + nfsm_chain_get_opaque(error, &nmrep, fhp->fh_len, fhp->fh_data); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsmout_if(error); - NFS_CLEAR_ATTRIBUTES(nvap->nva_bitmap); - error = nfs4_parsefattr(&nmrep, NULL, nvap, fhp, NULL); - if (!NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_FILEHANDLE)) { - error = EBADRPC; - goto nfsmout; + if ((error == NFSERR_MOVED) || (error == NFSERR_INVAL)) { + /* set this up to look like a referral trigger */ + nfs4_default_attrs_for_referral_trigger(dnp, name, namelen, nvap, fhp); + error = 0; + } else { + nfsmout_if(error); + error = nfs4_parsefattr(&nmrep, NULL, nvap, NULL, NULL, NULL); } nfsmout: if (!lockerror) nfs_node_unlock(dnp); nfsm_chain_cleanup(&nmrep); + if (!error && (op == NFS_OP_LOOKUP) && (nmp->nm_state & NFSSTA_NEEDSECINFO)) { + /* We still need to get SECINFO to set default for mount. */ + /* Do so for the first LOOKUP that returns successfully. 
*/ + struct nfs_sec sec; + + sec.count = NX_MAX_SEC_FLAVORS; + error = nfs4_secinfo_rpc(nmp, &req->r_secinfo, vfs_context_ucred(ctx), sec.flavors, &sec.count); + /* [sigh] some implementations return "illegal" error for unsupported ops */ + if (error == NFSERR_OP_ILLEGAL) + error = 0; + if (!error) { + /* set our default security flavor to the first in the list */ + lck_mtx_lock(&nmp->nm_lock); + if (sec.count) + nmp->nm_auth = sec.flavors[0]; + nmp->nm_state &= ~NFSSTA_NEEDSECINFO; + lck_mtx_unlock(&nmp->nm_lock); + } + } return (error); } int nfs4_commit_rpc( nfsnode_t np, - u_int64_t offset, - u_int64_t count, - kauth_cred_t cred) + uint64_t offset, + uint64_t count, + kauth_cred_t cred, + uint64_t wverf) { struct nfsmount *nmp; int error = 0, lockerror, status, nfsvers, numops; - u_int64_t xid, wverf; + u_int64_t xid, newwverf; uint32_t count32; struct nfsm_chain nmreq, nmrep; + struct nfsreq_secinfo_args si; nmp = NFSTONMP(np); FSDBG(521, np, offset, count, nmp ? nmp->nm_state : 0); if (!nmp) return (ENXIO); + if (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); if (!(nmp->nm_state & NFSSTA_HASWRITEVERF)) return (0); nfsvers = nmp->nm_vers; @@ -1132,6 +1243,7 @@ nfs4_commit_rpc( else count32 = count; + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -1148,13 +1260,12 @@ nfs4_commit_rpc( nfsm_chain_add_32(error, &nmreq, count32); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, np); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); error = nfs_request2(np, NULL, &nmreq, NFSPROC4_COMPOUND, - current_thread(), cred, 0, &nmrep, &xid, &status); + current_thread(), cred, &si, 0, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) 
error = lockerror; @@ -1162,17 +1273,17 @@ nfs4_commit_rpc( nfsm_chain_get_32(error, &nmrep, numops); nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); nfsm_chain_op_check(error, &nmrep, NFS_OP_COMMIT); - nfsm_chain_get_64(error, &nmrep, wverf); + nfsm_chain_get_64(error, &nmrep, newwverf); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsm_chain_loadattr(error, &nmrep, np, nfsvers, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, np, nfsvers, &xid); if (!lockerror) nfs_node_unlock(np); nfsmout_if(error); lck_mtx_lock(&nmp->nm_lock); - if (nmp->nm_verf != wverf) { - nmp->nm_verf = wverf; + if (nmp->nm_verf != newwverf) + nmp->nm_verf = newwverf; + if (wverf != newwverf) error = NFSERR_STALEWRITEVERF; - } lck_mtx_unlock(&nmp->nm_lock); nfsmout: nfsm_chain_cleanup(&nmreq); @@ -1192,11 +1303,16 @@ nfs4_pathconf_rpc( struct nfsmount *nmp = NFSTONMP(np); uint32_t bitmap[NFS_ATTR_BITMAP_LEN]; struct nfs_vattr nvattr; + struct nfsreq_secinfo_args si; if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; + if (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); + NVATTR_INIT(&nvattr); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -1217,20 +1333,18 @@ nfs4_pathconf_rpc( NFS_BITMAP_SET(bitmap, NFS_FATTR_CHOWN_RESTRICTED); NFS_BITMAP_SET(bitmap, NFS_FATTR_CASE_INSENSITIVE); NFS_BITMAP_SET(bitmap, NFS_FATTR_CASE_PRESERVING); - nfsm_chain_add_bitmap_masked(error, &nmreq, bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, bitmap, nmp, np); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request(np, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &nmrep, &xid, &status); + error = nfs_request(np, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &si, &nmrep, &xid, &status); nfsm_chain_skip_tag(error, &nmrep); nfsm_chain_get_32(error, &nmrep, numops); nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); 
nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); nfsmout_if(error); - NFS_CLEAR_ATTRIBUTES(nvattr.nva_bitmap); - error = nfs4_parsefattr(&nmrep, nfsap, &nvattr, NULL, NULL); + error = nfs4_parsefattr(&nmrep, nfsap, &nvattr, NULL, NULL, NULL); nfsmout_if(error); if ((lockerror = nfs_node_lock(np))) error = lockerror; @@ -1239,6 +1353,7 @@ nfs4_pathconf_rpc( if (!lockerror) nfs_node_unlock(np); nfsmout: + NVATTR_CLEANUP(&nvattr); nfsm_chain_cleanup(&nmreq); nfsm_chain_cleanup(&nmrep); return (error); @@ -1254,79 +1369,102 @@ nfs4_vnop_getattr( } */ *ap) { struct vnode_attr *vap = ap->a_vap; + struct nfsmount *nmp; struct nfs_vattr nva; - int error; + int error, acls, ngaflags; + + if (!(nmp = VTONMP(ap->a_vp))) + return (ENXIO); + acls = (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_ACL); - error = nfs_getattr(VTONFS(ap->a_vp), &nva, ap->a_context, NGA_CACHED); + ngaflags = NGA_CACHED; + if (VATTR_IS_ACTIVE(vap, va_acl) && acls) + ngaflags |= NGA_ACL; + error = nfs_getattr(VTONFS(ap->a_vp), &nva, ap->a_context, ngaflags); if (error) return (error); /* copy what we have in nva to *a_vap */ - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_RAWDEV)) { + if (VATTR_IS_ACTIVE(vap, va_rdev) && NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_RAWDEV)) { dev_t rdev = makedev(nva.nva_rawdev.specdata1, nva.nva_rawdev.specdata2); VATTR_RETURN(vap, va_rdev, rdev); } - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_NUMLINKS)) + if (VATTR_IS_ACTIVE(vap, va_nlink) && NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_NUMLINKS)) VATTR_RETURN(vap, va_nlink, nva.nva_nlink); - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_SIZE)) + if (VATTR_IS_ACTIVE(vap, va_data_size) && NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_SIZE)) VATTR_RETURN(vap, va_data_size, nva.nva_size); // VATTR_RETURN(vap, va_data_alloc, ???); // VATTR_RETURN(vap, va_total_size, ???); - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_SPACE_USED)) + if (VATTR_IS_ACTIVE(vap, va_total_alloc) && NFS_BITMAP_ISSET(nva.nva_bitmap, 
NFS_FATTR_SPACE_USED)) VATTR_RETURN(vap, va_total_alloc, nva.nva_bytes); - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_OWNER)) + if (VATTR_IS_ACTIVE(vap, va_uid) && NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_OWNER)) VATTR_RETURN(vap, va_uid, nva.nva_uid); - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_OWNER_GROUP)) + if (VATTR_IS_ACTIVE(vap, va_uuuid) && NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_OWNER)) + VATTR_RETURN(vap, va_uuuid, nva.nva_uuuid); + if (VATTR_IS_ACTIVE(vap, va_gid) && NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_OWNER_GROUP)) VATTR_RETURN(vap, va_gid, nva.nva_gid); - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_MODE)) - VATTR_RETURN(vap, va_mode, nva.nva_mode); - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_ARCHIVE) || - NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_HIDDEN)) { + if (VATTR_IS_ACTIVE(vap, va_guuid) && NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_OWNER_GROUP)) + VATTR_RETURN(vap, va_guuid, nva.nva_guuid); + if (VATTR_IS_ACTIVE(vap, va_mode)) { + if (NMFLAG(nmp, ACLONLY) || !NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_MODE)) + VATTR_RETURN(vap, va_mode, 0777); + else + VATTR_RETURN(vap, va_mode, nva.nva_mode); + } + if (VATTR_IS_ACTIVE(vap, va_flags) && + (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_ARCHIVE) || + NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_HIDDEN) || + (nva.nva_flags & NFS_FFLAG_TRIGGER))) { uint32_t flags = 0; - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_ARCHIVE)) + if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_ARCHIVE) && + (nva.nva_flags & NFS_FFLAG_ARCHIVED)) flags |= SF_ARCHIVED; - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_HIDDEN)) + if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_HIDDEN) && + (nva.nva_flags & NFS_FFLAG_HIDDEN)) flags |= UF_HIDDEN; VATTR_RETURN(vap, va_flags, flags); } - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_TIME_CREATE)) { + if (VATTR_IS_ACTIVE(vap, va_create_time) && NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_TIME_CREATE)) { vap->va_create_time.tv_sec = 
nva.nva_timesec[NFSTIME_CREATE]; vap->va_create_time.tv_nsec = nva.nva_timensec[NFSTIME_CREATE]; VATTR_SET_SUPPORTED(vap, va_create_time); } - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_TIME_ACCESS)) { + if (VATTR_IS_ACTIVE(vap, va_access_time) && NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_TIME_ACCESS)) { vap->va_access_time.tv_sec = nva.nva_timesec[NFSTIME_ACCESS]; vap->va_access_time.tv_nsec = nva.nva_timensec[NFSTIME_ACCESS]; VATTR_SET_SUPPORTED(vap, va_access_time); } - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_TIME_MODIFY)) { + if (VATTR_IS_ACTIVE(vap, va_modify_time) && NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_TIME_MODIFY)) { vap->va_modify_time.tv_sec = nva.nva_timesec[NFSTIME_MODIFY]; vap->va_modify_time.tv_nsec = nva.nva_timensec[NFSTIME_MODIFY]; VATTR_SET_SUPPORTED(vap, va_modify_time); } - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_TIME_METADATA)) { + if (VATTR_IS_ACTIVE(vap, va_change_time) && NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_TIME_METADATA)) { vap->va_change_time.tv_sec = nva.nva_timesec[NFSTIME_CHANGE]; vap->va_change_time.tv_nsec = nva.nva_timensec[NFSTIME_CHANGE]; VATTR_SET_SUPPORTED(vap, va_change_time); } - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_TIME_BACKUP)) { + if (VATTR_IS_ACTIVE(vap, va_backup_time) && NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_TIME_BACKUP)) { vap->va_backup_time.tv_sec = nva.nva_timesec[NFSTIME_BACKUP]; vap->va_backup_time.tv_nsec = nva.nva_timensec[NFSTIME_BACKUP]; VATTR_SET_SUPPORTED(vap, va_backup_time); } - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_FILEID)) + if (VATTR_IS_ACTIVE(vap, va_fileid) && NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_FILEID)) VATTR_RETURN(vap, va_fileid, nva.nva_fileid); - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_TYPE)) + if (VATTR_IS_ACTIVE(vap, va_type) && NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_TYPE)) VATTR_RETURN(vap, va_type, nva.nva_type); - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_CHANGE)) + if (VATTR_IS_ACTIVE(vap, va_filerev) && 
NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_CHANGE)) VATTR_RETURN(vap, va_filerev, nva.nva_change); + if (VATTR_IS_ACTIVE(vap, va_acl) && acls) { + VATTR_RETURN(vap, va_acl, nva.nva_acl); + nva.nva_acl = NULL; + } + // other attrs we might support someday: // VATTR_RETURN(vap, va_encoding, ??? /* potentially unnormalized UTF-8? */); - // struct kauth_acl *va_acl; /* access control list */ - // guid_t va_uuuid; /* file owner UUID */ - // guid_t va_guuid; /* file group UUID */ + NVATTR_CLEANUP(&nva); return (error); } @@ -1337,15 +1475,20 @@ nfs4_setattr_rpc( vfs_context_t ctx) { struct nfsmount *nmp = NFSTONMP(np); - int error = 0, lockerror = ENOENT, status, nfsvers, numops; + int error = 0, setattr_error = 0, lockerror = ENOENT, status, nfsvers, numops; u_int64_t xid, nextxid; struct nfsm_chain nmreq, nmrep; uint32_t bitmap[NFS_ATTR_BITMAP_LEN], bmlen; + uint32_t getbitmap[NFS_ATTR_BITMAP_LEN]; + uint32_t setbitmap[NFS_ATTR_BITMAP_LEN]; nfs_stateid stateid; + struct nfsreq_secinfo_args si; if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; + if (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); if (VATTR_IS_ACTIVE(vap, va_flags) && (vap->va_flags & ~(SF_ARCHIVED|UF_HIDDEN))) { /* we don't support setting unsupported flags (duh!) 
*/ @@ -1355,9 +1498,39 @@ nfs4_setattr_rpc( return (ENOTSUP); /* return ENOTSUP for chflags(2) */ } + /* don't bother requesting some changes if they don't look like they are changing */ + if (VATTR_IS_ACTIVE(vap, va_uid) && (vap->va_uid == np->n_vattr.nva_uid)) + VATTR_CLEAR_ACTIVE(vap, va_uid); + if (VATTR_IS_ACTIVE(vap, va_gid) && (vap->va_gid == np->n_vattr.nva_gid)) + VATTR_CLEAR_ACTIVE(vap, va_gid); + if (VATTR_IS_ACTIVE(vap, va_uuuid) && kauth_guid_equal(&vap->va_uuuid, &np->n_vattr.nva_uuuid)) + VATTR_CLEAR_ACTIVE(vap, va_uuuid); + if (VATTR_IS_ACTIVE(vap, va_guuid) && kauth_guid_equal(&vap->va_guuid, &np->n_vattr.nva_guuid)) + VATTR_CLEAR_ACTIVE(vap, va_guuid); + +tryagain: + /* do nothing if no attributes will be sent */ + nfs_vattr_set_bitmap(nmp, bitmap, vap); + if (!bitmap[0] && !bitmap[1]) + return (0); + + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); + /* + * Prepare GETATTR bitmap: if we are setting the ACL or mode, we + * need to invalidate any cached ACL. And if we had an ACL cached, + * we might as well also fetch the new value. 
+ */ + NFS_COPY_ATTRIBUTES(nfs_getattr_bitmap, getbitmap); + if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_ACL) || + NFS_BITMAP_ISSET(bitmap, NFS_FATTR_MODE)) { + if (NACLVALID(np)) + NFS_BITMAP_SET(getbitmap, NFS_FATTR_ACL); + NACLINVALIDATE(np); + } + // PUTFH, SETATTR, GETATTR numops = 3; nfsm_chain_build_alloc_init(error, &nmreq, 40 * NFSX_UNSIGNED); @@ -1375,25 +1548,32 @@ nfs4_setattr_rpc( nfsm_chain_add_fattr4(error, &nmreq, vap, nmp); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, getbitmap, nmp, np); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request(np, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &nmrep, &xid, &status); + error = nfs_request(np, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &si, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; nfsm_chain_skip_tag(error, &nmrep); nfsm_chain_get_32(error, &nmrep, numops); nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); + nfsmout_if(error); nfsm_chain_op_check(error, &nmrep, NFS_OP_SETATTR); + nfsmout_if(error == EBADRPC); + setattr_error = error; + error = 0; bmlen = NFS_ATTR_BITMAP_LEN; - nfsm_chain_get_bitmap(error, &nmrep, bitmap, bmlen); - nfsmout_if(error); - nfs_vattr_set_supported(bitmap, vap); + nfsm_chain_get_bitmap(error, &nmrep, setbitmap, bmlen); + if (!error) { + if (VATTR_IS_ACTIVE(vap, va_data_size) && (np->n_vattr.nva_flags & NFS_FFLAG_IS_ATTR)) + microuptime(&np->n_lastio); + nfs_vattr_set_supported(setbitmap, vap); + error = setattr_error; + } nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsm_chain_loadattr(error, &nmrep, np, nfsvers, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, np, nfsvers, &xid); if (error) NATTRINVALIDATE(np); /* @@ -1416,6 +1596,20 @@ nfs4_setattr_rpc( nfs_node_unlock(np); 
nfsm_chain_cleanup(&nmreq); nfsm_chain_cleanup(&nmrep); + if ((setattr_error == EINVAL) && VATTR_IS_ACTIVE(vap, va_acl) && VATTR_IS_ACTIVE(vap, va_mode) && !NMFLAG(nmp, ACLONLY)) { + /* + * Some server's may not like ACL/mode combos that get sent. + * If it looks like that's what the server choked on, try setting + * just the ACL and not the mode (unless it looks like everything + * but mode was already successfully set). + */ + if (((bitmap[0] & setbitmap[0]) != bitmap[0]) || + ((bitmap[1] & (setbitmap[1]|NFS_FATTR_MODE)) != bitmap[1])) { + VATTR_CLEAR_ACTIVE(vap, va_mode); + error = 0; + goto tryagain; + } + } return (error); } @@ -1426,7 +1620,7 @@ int nfs_mount_state_wait_for_recovery(struct nfsmount *nmp) { struct timespec ts = { 1, 0 }; - int error = 0, slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0; + int error = 0, slpflag = NMFLAG(nmp, INTR) ? PCATCH : 0; lck_mtx_lock(&nmp->nm_lock); while (nmp->nm_state & NFSSTA_RECOVER) { @@ -1434,6 +1628,7 @@ nfs_mount_state_wait_for_recovery(struct nfsmount *nmp) break; nfs_mount_sock_thread_wake(nmp); msleep(&nmp->nm_state, &nmp->nm_lock, slpflag|(PZERO-1), "nfsrecoverwait", &ts); + slpflag = 0; } lck_mtx_unlock(&nmp->nm_lock); @@ -1447,19 +1642,24 @@ nfs_mount_state_wait_for_recovery(struct nfsmount *nmp) * the recovery thread until we're done). */ int -nfs_mount_state_in_use_start(struct nfsmount *nmp) +nfs_mount_state_in_use_start(struct nfsmount *nmp, thread_t thd) { struct timespec ts = { 1, 0 }; - int error = 0, slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0; + int error = 0, slpflag = (NMFLAG(nmp, INTR) && thd) ? 
PCATCH : 0; if (!nmp) return (ENXIO); lck_mtx_lock(&nmp->nm_lock); + if (nmp->nm_state & (NFSSTA_FORCE|NFSSTA_DEAD)) { + lck_mtx_unlock(&nmp->nm_lock); + return (ENXIO); + } while (nmp->nm_state & NFSSTA_RECOVER) { - if ((error = nfs_sigintr(nmp, NULL, current_thread(), 1))) + if ((error = nfs_sigintr(nmp, NULL, thd, 1))) break; nfs_mount_sock_thread_wake(nmp); msleep(&nmp->nm_state, &nmp->nm_lock, slpflag|(PZERO-1), "nfsrecoverwait", &ts); + slpflag = 0; } if (!error) nmp->nm_stateinuse++; @@ -1482,11 +1682,9 @@ nfs_mount_state_in_use_end(struct nfsmount *nmp, int error) return (restart); lck_mtx_lock(&nmp->nm_lock); if (restart && (error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE)) { - if (!(nmp->nm_state & NFSSTA_RECOVER)) { - printf("nfs_mount_state_in_use_end: error %d, initiating recovery\n", error); - nmp->nm_state |= NFSSTA_RECOVER; - nfs_mount_sock_thread_wake(nmp); - } + printf("nfs_mount_state_in_use_end: error %d, initiating recovery for %s, 0x%x\n", + error, vfs_statfs(nmp->nm_mountp)->f_mntfromname, nmp->nm_stategenid); + nfs_need_recover(nmp, error); } if (nmp->nm_stateinuse > 0) nmp->nm_stateinuse--; @@ -1531,22 +1729,39 @@ nfs_mount_state_max_restarts(struct nfsmount *nmp) return (MAX(nmp->nm_fsattr.nfsa_lease, 60)); } +/* + * Does the error mean we probably lost a delegation? + */ +int +nfs_mount_state_error_delegation_lost(int error) +{ + switch (error) { + case NFSERR_STALE_STATEID: + case NFSERR_ADMIN_REVOKED: + case NFSERR_EXPIRED: + case NFSERR_OLD_STATEID: + case NFSERR_BAD_STATEID: + case NFSERR_GRACE: /* ugh! (stupid) RFC 3530 specifically disallows CLAIM_DELEGATE_CUR during grace period? */ + return (1); + } + return (0); +} + /* * Mark an NFS node's open state as busy. 
*/ int -nfs_open_state_set_busy(nfsnode_t np, vfs_context_t ctx) +nfs_open_state_set_busy(nfsnode_t np, thread_t thd) { struct nfsmount *nmp; - thread_t thd = vfs_context_thread(ctx); struct timespec ts = {2, 0}; int error = 0, slpflag; nmp = NFSTONMP(np); if (!nmp) return (ENXIO); - slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0; + slpflag = (NMFLAG(nmp, INTR) && thd) ? PCATCH : 0; lck_mtx_lock(&np->n_openlock); while (np->n_openflags & N_OPENBUSY) { @@ -1554,6 +1769,7 @@ nfs_open_state_set_busy(nfsnode_t np, vfs_context_t ctx) break; np->n_openflags |= N_OPENWANT; msleep(&np->n_openflags, &np->n_openlock, slpflag, "nfs_open_state_set_busy", &ts); + slpflag = 0; } if (!error) np->n_openflags |= N_OPENBUSY; @@ -1688,7 +1904,7 @@ nfs_open_owner_set_busy(struct nfs_open_owner *noop, thread_t thd) nmp = noop->noo_mount; if (!nmp) return (ENXIO); - slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0; + slpflag = (NMFLAG(nmp, INTR) && thd) ? PCATCH : 0; lck_mtx_lock(&noop->noo_lock); while (noop->noo_flags & NFS_OPEN_OWNER_BUSY) { @@ -1696,6 +1912,7 @@ nfs_open_owner_set_busy(struct nfs_open_owner *noop, thread_t thd) break; noop->noo_flags |= NFS_OPEN_OWNER_WANT; msleep(noop, &noop->noo_lock, slpflag, "nfs_open_owner_set_busy", &ts); + slpflag = 0; } if (!error) noop->noo_flags |= NFS_OPEN_OWNER_BUSY; @@ -1761,6 +1978,24 @@ nfs_open_file_find( uint32_t accessMode, uint32_t denyMode, int alloc) +{ + *nofpp = NULL; + return nfs_open_file_find_internal(np, noop, nofpp, accessMode, denyMode, alloc); +} + +/* + * Internally, allow using a provisional nodeless nofp (passed in via *nofpp) + * if an existing one is not found. This is used in "create" scenarios to + * officially add the provisional nofp to the node once the node is created. 
+ */ +int +nfs_open_file_find_internal( + nfsnode_t np, + struct nfs_open_owner *noop, + struct nfs_open_file **nofpp, + uint32_t accessMode, + uint32_t denyMode, + int alloc) { struct nfs_open_file *nofp = NULL, *nofp2, *newnofp = NULL; @@ -1777,7 +2012,6 @@ nfs_open_file_find( if ((accessMode & nofp2->nof_deny) || (denyMode & nofp2->nof_access)) { /* This request conflicts with an existing open on this client. */ lck_mtx_unlock(&np->n_openlock); - *nofpp = NULL; return (EACCES); } } @@ -1786,14 +2020,12 @@ nfs_open_file_find( * If this open owner doesn't have an open * file structure yet, we create one for it. */ - if (!nofp && !newnofp && alloc) { + if (!nofp && !*nofpp && !newnofp && alloc) { lck_mtx_unlock(&np->n_openlock); alloc: MALLOC(newnofp, struct nfs_open_file *, sizeof(struct nfs_open_file), M_TEMP, M_WAITOK); - if (!newnofp) { - *nofpp = NULL; + if (!newnofp) return (ENOMEM); - } bzero(newnofp, sizeof(*newnofp)); lck_mtx_init(&newnofp->nof_lock, nfs_open_grp, LCK_ATTR_NULL); newnofp->nof_owner = noop; @@ -1805,15 +2037,20 @@ nfs_open_file_find( if (np) goto tryagain; } - if (!nofp && newnofp) { - if (np) - TAILQ_INSERT_HEAD(&np->n_opens, newnofp, nof_link); - nofp = newnofp; + if (!nofp) { + if (*nofpp) { + (*nofpp)->nof_np = np; + nofp = *nofpp; + } else { + nofp = newnofp; + } + if (nofp && np) + TAILQ_INSERT_HEAD(&np->n_opens, nofp, nof_link); } if (np) lck_mtx_unlock(&np->n_openlock); - if (newnofp && (nofp != newnofp)) + if (alloc && newnofp && (nofp != newnofp)) nfs_open_file_destroy(newnofp); *nofpp = nofp; @@ -1848,7 +2085,7 @@ nfs_open_file_set_busy(struct nfs_open_file *nofp, thread_t thd) nmp = nofp->nof_owner->noo_mount; if (!nmp) return (ENXIO); - slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0; + slpflag = (NMFLAG(nmp, INTR) && thd) ? 
PCATCH : 0; lck_mtx_lock(&nofp->nof_lock); while (nofp->nof_flags & NFS_OPEN_FILE_BUSY) { @@ -1856,6 +2093,7 @@ nfs_open_file_set_busy(struct nfs_open_file *nofp, thread_t thd) break; nofp->nof_flags |= NFS_OPEN_FILE_WANT; msleep(nofp, &nofp->nof_lock, slpflag, "nfs_open_file_set_busy", &ts); + slpflag = 0; } if (!error) nofp->nof_flags |= NFS_OPEN_FILE_BUSY; @@ -1884,147 +2122,525 @@ nfs_open_file_clear_busy(struct nfs_open_file *nofp) } /* - * Get the current (delegation, lock, open, default) stateid for this node. - * If node has a delegation, use that stateid. - * If pid has a lock, use the lockowner's stateid. - * Or use the open file's stateid. - * If no open file, use a default stateid of all ones. + * Add the open state for the given access/deny modes to this open file. */ void -nfs_get_stateid(nfsnode_t np, thread_t thd, kauth_cred_t cred, nfs_stateid *sid) +nfs_open_file_add_open(struct nfs_open_file *nofp, uint32_t accessMode, uint32_t denyMode, int delegated) { - struct nfsmount *nmp = NFSTONMP(np); - proc_t p = thd ? 
get_bsdthreadtask_info(thd) : current_thread(); // XXX async I/O requests don't have a thread - struct nfs_open_owner *noop = NULL; - struct nfs_open_file *nofp = NULL; - struct nfs_lock_owner *nlop = NULL; - nfs_stateid *s = NULL; - - if (np->n_openflags & N_DELEG_MASK) - s = &np->n_dstateid; - else if (p) - nlop = nfs_lock_owner_find(np, p, 0); - if (nlop && !TAILQ_EMPTY(&nlop->nlo_locks)) { - /* we hold locks, use lock stateid */ - s = &nlop->nlo_stateid; - } else if (((noop = nfs_open_owner_find(nmp, cred, 0))) && - (nfs_open_file_find(np, noop, &nofp, 0, 0, 0) == 0) && - !(nofp->nof_flags & NFS_OPEN_FILE_LOST) && - nofp->nof_access) { - /* we (should) have the file open, use open stateid */ - if (nofp->nof_flags & NFS_OPEN_FILE_REOPEN) - nfs4_reopen(nofp, thd); - if (!(nofp->nof_flags & NFS_OPEN_FILE_LOST)) - s = &nofp->nof_stateid; - } + lck_mtx_lock(&nofp->nof_lock); + nofp->nof_access |= accessMode; + nofp->nof_deny |= denyMode; - if (s) { - sid->seqid = s->seqid; - sid->other[0] = s->other[0]; - sid->other[1] = s->other[1]; - sid->other[2] = s->other[2]; + if (delegated) { + if (denyMode == NFS_OPEN_SHARE_DENY_NONE) { + if (accessMode == NFS_OPEN_SHARE_ACCESS_READ) + nofp->nof_d_r++; + else if (accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) + nofp->nof_d_w++; + else if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) + nofp->nof_d_rw++; + } else if (denyMode == NFS_OPEN_SHARE_DENY_WRITE) { + if (accessMode == NFS_OPEN_SHARE_ACCESS_READ) + nofp->nof_d_r_dw++; + else if (accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) + nofp->nof_d_w_dw++; + else if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) + nofp->nof_d_rw_dw++; + } else { /* NFS_OPEN_SHARE_DENY_BOTH */ + if (accessMode == NFS_OPEN_SHARE_ACCESS_READ) + nofp->nof_d_r_drw++; + else if (accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) + nofp->nof_d_w_drw++; + else if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) + nofp->nof_d_rw_drw++; + } } else { - const char *vname = vnode_getname(NFSTOV(np)); - printf("nfs_get_stateid: no 
stateid for %s\n", vname ? vname : "???"); - vnode_putname(vname); - sid->seqid = sid->other[0] = sid->other[1] = sid->other[2] = 0xffffffff; + if (denyMode == NFS_OPEN_SHARE_DENY_NONE) { + if (accessMode == NFS_OPEN_SHARE_ACCESS_READ) + nofp->nof_r++; + else if (accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) + nofp->nof_w++; + else if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) + nofp->nof_rw++; + } else if (denyMode == NFS_OPEN_SHARE_DENY_WRITE) { + if (accessMode == NFS_OPEN_SHARE_ACCESS_READ) + nofp->nof_r_dw++; + else if (accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) + nofp->nof_w_dw++; + else if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) + nofp->nof_rw_dw++; + } else { /* NFS_OPEN_SHARE_DENY_BOTH */ + if (accessMode == NFS_OPEN_SHARE_ACCESS_READ) + nofp->nof_r_drw++; + else if (accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) + nofp->nof_w_drw++; + else if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) + nofp->nof_rw_drw++; + } } - if (nlop) - nfs_lock_owner_rele(nlop); - if (noop) - nfs_open_owner_rele(noop); + + nofp->nof_opencnt++; + lck_mtx_unlock(&nofp->nof_lock); } /* - * We always send the open RPC even if this open's mode is a subset of all - * the existing opens. This makes sure that we will always be able to do a - * downgrade to any of the open modes. - * - * Note: local conflicts should have already been checked. (nfs_open_file_find) + * Find which particular open combo will be closed and report what + * the new modes will be and whether the open was delegated. 
*/ -int -nfs4_open( - nfsnode_t np, +void +nfs_open_file_remove_open_find( struct nfs_open_file *nofp, uint32_t accessMode, uint32_t denyMode, - vfs_context_t ctx) + uint32_t *newAccessMode, + uint32_t *newDenyMode, + int *delegated) { - vnode_t vp = NFSTOV(np); - vnode_t dvp = NULL; - struct componentname cn; - const char *vname = NULL; - size_t namelen; - char smallname[128]; - char *filename = NULL; - int error = 0, readtoo = 0; - - dvp = vnode_getparent(vp); - vname = vnode_getname(vp); - if (!dvp || !vname) { - error = EIO; - goto out; - } - filename = &smallname[0]; - namelen = snprintf(filename, sizeof(smallname), "%s", vname); - if (namelen >= sizeof(smallname)) { - namelen++; /* snprintf result doesn't include '\0' */ - MALLOC(filename, char *, namelen, M_TEMP, M_WAITOK); - if (!filename) { - error = ENOMEM; - goto out; - } - snprintf(filename, namelen, "%s", vname); - } - bzero(&cn, sizeof(cn)); - cn.cn_nameptr = filename; - cn.cn_namelen = namelen; - - if (!(accessMode & NFS_OPEN_SHARE_ACCESS_READ)) { - /* - * Try to open it for read access too, - * so the buffer cache can read data. 
- */ - readtoo = 1; - accessMode |= NFS_OPEN_SHARE_ACCESS_READ; - } -tryagain: - error = nfs4_open_rpc(nofp, ctx, &cn, NULL, dvp, &vp, NFS_OPEN_NOCREATE, accessMode, denyMode); - if (error) { - if (!nfs_mount_state_error_should_restart(error) && readtoo) { - /* try again without the extra read access */ - accessMode &= ~NFS_OPEN_SHARE_ACCESS_READ; - readtoo = 0; - goto tryagain; - } - goto out; - } - nofp->nof_access |= accessMode; - nofp->nof_deny |= denyMode; + /* + * Calculate new modes: a mode bit gets removed when there's only + * one count in all the corresponding counts + */ + *newAccessMode = nofp->nof_access; + *newDenyMode = nofp->nof_deny; + if ((accessMode & NFS_OPEN_SHARE_ACCESS_READ) && + (nofp->nof_access & NFS_OPEN_SHARE_ACCESS_READ) && + ((nofp->nof_r + nofp->nof_d_r + + nofp->nof_rw + nofp->nof_d_rw + + nofp->nof_r_dw + nofp->nof_d_r_dw + + nofp->nof_rw_dw + nofp->nof_d_rw_dw + + nofp->nof_r_drw + nofp->nof_d_r_drw + + nofp->nof_rw_dw + nofp->nof_d_rw_dw) == 1)) + *newAccessMode &= ~NFS_OPEN_SHARE_ACCESS_READ; + if ((accessMode & NFS_OPEN_SHARE_ACCESS_WRITE) && + (nofp->nof_access & NFS_OPEN_SHARE_ACCESS_WRITE) && + ((nofp->nof_w + nofp->nof_d_w + + nofp->nof_rw + nofp->nof_d_rw + + nofp->nof_w_dw + nofp->nof_d_w_dw + + nofp->nof_rw_dw + nofp->nof_d_rw_dw + + nofp->nof_w_drw + nofp->nof_d_w_drw + + nofp->nof_rw_dw + nofp->nof_d_rw_dw) == 1)) + *newAccessMode &= ~NFS_OPEN_SHARE_ACCESS_WRITE; + if ((denyMode & NFS_OPEN_SHARE_DENY_READ) && + (nofp->nof_deny & NFS_OPEN_SHARE_DENY_READ) && + ((nofp->nof_r_drw + nofp->nof_d_r_drw + + nofp->nof_w_drw + nofp->nof_d_w_drw + + nofp->nof_rw_drw + nofp->nof_d_rw_drw) == 1)) + *newDenyMode &= ~NFS_OPEN_SHARE_DENY_READ; + if ((denyMode & NFS_OPEN_SHARE_DENY_WRITE) && + (nofp->nof_deny & NFS_OPEN_SHARE_DENY_WRITE) && + ((nofp->nof_r_drw + nofp->nof_d_r_drw + + nofp->nof_w_drw + nofp->nof_d_w_drw + + nofp->nof_rw_drw + nofp->nof_d_rw_drw + + nofp->nof_r_dw + nofp->nof_d_r_dw + + nofp->nof_w_dw + nofp->nof_d_w_dw 
+ + nofp->nof_rw_dw + nofp->nof_d_rw_dw) == 1)) + *newDenyMode &= ~NFS_OPEN_SHARE_DENY_WRITE; + + /* Find the corresponding open access/deny mode counter. */ if (denyMode == NFS_OPEN_SHARE_DENY_NONE) { if (accessMode == NFS_OPEN_SHARE_ACCESS_READ) - nofp->nof_r++; + *delegated = (nofp->nof_d_r != 0); else if (accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) - nofp->nof_w++; + *delegated = (nofp->nof_d_w != 0); else if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) - nofp->nof_rw++; + *delegated = (nofp->nof_d_rw != 0); + else + *delegated = 0; } else if (denyMode == NFS_OPEN_SHARE_DENY_WRITE) { if (accessMode == NFS_OPEN_SHARE_ACCESS_READ) - nofp->nof_r_dw++; + *delegated = (nofp->nof_d_r_dw != 0); else if (accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) - nofp->nof_w_dw++; + *delegated = (nofp->nof_d_w_dw != 0); else if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) - nofp->nof_rw_dw++; + *delegated = (nofp->nof_d_rw_dw != 0); + else + *delegated = 0; } else { /* NFS_OPEN_SHARE_DENY_BOTH */ if (accessMode == NFS_OPEN_SHARE_ACCESS_READ) - nofp->nof_r_drw++; + *delegated = (nofp->nof_d_r_drw != 0); else if (accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) - nofp->nof_w_drw++; + *delegated = (nofp->nof_d_w_drw != 0); else if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) - nofp->nof_rw_drw++; + *delegated = (nofp->nof_d_rw_drw != 0); + else + *delegated = 0; } - nofp->nof_opencnt++; +} + +/* + * Remove the open state for the given access/deny modes to this open file. + */ +void +nfs_open_file_remove_open(struct nfs_open_file *nofp, uint32_t accessMode, uint32_t denyMode) +{ + uint32_t newAccessMode, newDenyMode; + int delegated = 0; + + lck_mtx_lock(&nofp->nof_lock); + nfs_open_file_remove_open_find(nofp, accessMode, denyMode, &newAccessMode, &newDenyMode, &delegated); + + /* Decrement the corresponding open access/deny mode counter. 
*/ + if (denyMode == NFS_OPEN_SHARE_DENY_NONE) { + if (accessMode == NFS_OPEN_SHARE_ACCESS_READ) { + if (delegated) { + if (nofp->nof_d_r == 0) + NP(nofp->nof_np, "nfs: open(R) delegated count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_d_r--; + } else { + if (nofp->nof_r == 0) + NP(nofp->nof_np, "nfs: open(R) count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_r--; + } + } else if (accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) { + if (delegated) { + if (nofp->nof_d_w == 0) + NP(nofp->nof_np, "nfs: open(W) delegated count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_d_w--; + } else { + if (nofp->nof_w == 0) + NP(nofp->nof_np, "nfs: open(W) count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_w--; + } + } else if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) { + if (delegated) { + if (nofp->nof_d_rw == 0) + NP(nofp->nof_np, "nfs: open(RW) delegated count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_d_rw--; + } else { + if (nofp->nof_rw == 0) + NP(nofp->nof_np, "nfs: open(RW) count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_rw--; + } + } + } else if (denyMode == NFS_OPEN_SHARE_DENY_WRITE) { + if (accessMode == NFS_OPEN_SHARE_ACCESS_READ) { + if (delegated) { + if (nofp->nof_d_r_dw == 0) + NP(nofp->nof_np, "nfs: open(R,DW) delegated count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_d_r_dw--; + } else { + if (nofp->nof_r_dw == 0) + NP(nofp->nof_np, "nfs: open(R,DW) count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_r_dw--; + } + } else if (accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) { + if (delegated) { + if (nofp->nof_d_w_dw == 0) + NP(nofp->nof_np, "nfs: open(W,DW) delegated count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_d_w_dw--; + } else { + if 
(nofp->nof_w_dw == 0) + NP(nofp->nof_np, "nfs: open(W,DW) count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_w_dw--; + } + } else if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) { + if (delegated) { + if (nofp->nof_d_rw_dw == 0) + NP(nofp->nof_np, "nfs: open(RW,DW) delegated count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_d_rw_dw--; + } else { + if (nofp->nof_rw_dw == 0) + NP(nofp->nof_np, "nfs: open(RW,DW) count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_rw_dw--; + } + } + } else { /* NFS_OPEN_SHARE_DENY_BOTH */ + if (accessMode == NFS_OPEN_SHARE_ACCESS_READ) { + if (delegated) { + if (nofp->nof_d_r_drw == 0) + NP(nofp->nof_np, "nfs: open(R,DRW) delegated count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_d_r_drw--; + } else { + if (nofp->nof_r_drw == 0) + NP(nofp->nof_np, "nfs: open(R,DRW) count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_r_drw--; + } + } else if (accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) { + if (delegated) { + if (nofp->nof_d_w_drw == 0) + NP(nofp->nof_np, "nfs: open(W,DRW) delegated count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_d_w_drw--; + } else { + if (nofp->nof_w_drw == 0) + NP(nofp->nof_np, "nfs: open(W,DRW) count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_w_drw--; + } + } else if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) { + if (delegated) { + if (nofp->nof_d_rw_drw == 0) + NP(nofp->nof_np, "nfs: open(RW,DRW) delegated count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_d_rw_drw--; + } else { + if (nofp->nof_rw_drw == 0) + NP(nofp->nof_np, "nfs: open(RW,DRW) count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_rw_drw--; + } + } + } + + /* update the modes */ + nofp->nof_access = newAccessMode; + nofp->nof_deny 
= newDenyMode; + nofp->nof_opencnt--; + lck_mtx_unlock(&nofp->nof_lock); +} + + +/* + * Get the current (delegation, lock, open, default) stateid for this node. + * If node has a delegation, use that stateid. + * If pid has a lock, use the lockowner's stateid. + * Or use the open file's stateid. + * If no open file, use a default stateid of all ones. + */ +void +nfs_get_stateid(nfsnode_t np, thread_t thd, kauth_cred_t cred, nfs_stateid *sid) +{ + struct nfsmount *nmp = NFSTONMP(np); + proc_t p = thd ? get_bsdthreadtask_info(thd) : current_proc(); // XXX async I/O requests don't have a thread + struct nfs_open_owner *noop = NULL; + struct nfs_open_file *nofp = NULL; + struct nfs_lock_owner *nlop = NULL; + nfs_stateid *s = NULL; + + if (np->n_openflags & N_DELEG_MASK) { + s = &np->n_dstateid; + } else { + if (p) + nlop = nfs_lock_owner_find(np, p, 0); + if (nlop && !TAILQ_EMPTY(&nlop->nlo_locks)) { + /* we hold locks, use lock stateid */ + s = &nlop->nlo_stateid; + } else if (((noop = nfs_open_owner_find(nmp, cred, 0))) && + (nfs_open_file_find(np, noop, &nofp, 0, 0, 0) == 0) && + !(nofp->nof_flags & NFS_OPEN_FILE_LOST) && + nofp->nof_access) { + /* we (should) have the file open, use open stateid */ + if (nofp->nof_flags & NFS_OPEN_FILE_REOPEN) + nfs4_reopen(nofp, thd); + if (!(nofp->nof_flags & NFS_OPEN_FILE_LOST)) + s = &nofp->nof_stateid; + } + } + + if (s) { + sid->seqid = s->seqid; + sid->other[0] = s->other[0]; + sid->other[1] = s->other[1]; + sid->other[2] = s->other[2]; + } else { + /* named attributes may not have a stateid for reads, so don't complain for them */ + if (!(np->n_vattr.nva_flags & NFS_FFLAG_IS_ATTR)) + NP(np, "nfs_get_stateid: no stateid"); + sid->seqid = sid->other[0] = sid->other[1] = sid->other[2] = 0xffffffff; + } + if (nlop) + nfs_lock_owner_rele(nlop); + if (noop) + nfs_open_owner_rele(noop); +} + + +/* + * When we have a delegation, we may be able to perform the OPEN locally. 
+ * Perform the OPEN by checking the delegation ACE and/or checking via ACCESS. + */ +int +nfs4_open_delegated( + nfsnode_t np, + struct nfs_open_file *nofp, + uint32_t accessMode, + uint32_t denyMode, + vfs_context_t ctx) +{ + int error = 0, ismember, readtoo = 0, authorized = 0; + uint32_t action; + struct kauth_acl_eval eval; + kauth_cred_t cred = vfs_context_ucred(ctx); + + if (!(accessMode & NFS_OPEN_SHARE_ACCESS_READ)) { + /* + * Try to open it for read access too, + * so the buffer cache can read data. + */ + readtoo = 1; + accessMode |= NFS_OPEN_SHARE_ACCESS_READ; + } + +tryagain: + action = 0; + if (accessMode & NFS_OPEN_SHARE_ACCESS_READ) + action |= KAUTH_VNODE_READ_DATA; + if (accessMode & NFS_OPEN_SHARE_ACCESS_WRITE) + action |= KAUTH_VNODE_WRITE_DATA; + + /* evaluate ACE (if we have one) */ + if (np->n_dace.ace_flags) { + eval.ae_requested = action; + eval.ae_acl = &np->n_dace; + eval.ae_count = 1; + eval.ae_options = 0; + if (np->n_vattr.nva_uid == kauth_cred_getuid(cred)) + eval.ae_options |= KAUTH_AEVAL_IS_OWNER; + error = kauth_cred_ismember_gid(cred, np->n_vattr.nva_gid, &ismember); + if (!error && ismember) + eval.ae_options |= KAUTH_AEVAL_IN_GROUP; + + eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS; + eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS; + eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS; + eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS; + + error = kauth_acl_evaluate(cred, &eval); + + if (!error && (eval.ae_result == KAUTH_RESULT_ALLOW)) + authorized = 1; + } + + if (!authorized) { + /* need to ask the server via ACCESS */ + struct vnop_access_args naa; + naa.a_desc = &vnop_access_desc; + naa.a_vp = NFSTOV(np); + naa.a_action = action; + naa.a_context = ctx; + if (!(error = nfs_vnop_access(&naa))) + authorized = 1; + } + + if (!authorized) { + if (readtoo) { + /* try again without the extra read access */ + accessMode &= ~NFS_OPEN_SHARE_ACCESS_READ; + readtoo = 0; + goto tryagain; + } + return (error ? 
error : EACCES); + } + + nfs_open_file_add_open(nofp, accessMode, denyMode, 1); + + return (0); +} + + +/* + * Open a file with the given access/deny modes. + * + * If we have a delegation, we may be able to handle the open locally. + * Otherwise, we will always send the open RPC even if this open's mode is + * a subset of all the existing opens. This makes sure that we will always + * be able to do a downgrade to any of the open modes. + * + * Note: local conflicts should have already been checked in nfs_open_file_find(). + */ +int +nfs4_open( + nfsnode_t np, + struct nfs_open_file *nofp, + uint32_t accessMode, + uint32_t denyMode, + vfs_context_t ctx) +{ + vnode_t vp = NFSTOV(np); + vnode_t dvp = NULL; + struct componentname cn; + const char *vname = NULL; + size_t namelen; + char smallname[128]; + char *filename = NULL; + int error = 0, readtoo = 0; + + /* + * We can handle the OPEN ourselves if we have a delegation, + * unless it's a read delegation and the open is asking for + * either write access or deny read. We also don't bother to + * use the delegation if it's being returned. + */ + if (np->n_openflags & N_DELEG_MASK) { + if ((error = nfs_open_state_set_busy(np, vfs_context_thread(ctx)))) + return (error); + if ((np->n_openflags & N_DELEG_MASK) && !(np->n_openflags & N_DELEG_RETURN) && + (((np->n_openflags & N_DELEG_MASK) == N_DELEG_WRITE) || + (!(accessMode & NFS_OPEN_SHARE_ACCESS_WRITE) && !(denyMode & NFS_OPEN_SHARE_DENY_READ)))) { + error = nfs4_open_delegated(np, nofp, accessMode, denyMode, ctx); + nfs_open_state_clear_busy(np); + return (error); + } + nfs_open_state_clear_busy(np); + } + + /* + * [sigh] We can't trust VFS to get the parent right for named + * attribute nodes. (It likes to reparent the nodes after we've + * created them.) Luckily we can probably get the right parent + * from the n_parent we have stashed away. 
+ */ + if ((np->n_vattr.nva_flags & NFS_FFLAG_IS_ATTR) && + (((dvp = np->n_parent)) && (error = vnode_get(dvp)))) + dvp = NULL; + if (!dvp) + dvp = vnode_getparent(vp); + vname = vnode_getname(vp); + if (!dvp || !vname) { + if (!error) + error = EIO; + goto out; + } + filename = &smallname[0]; + namelen = snprintf(filename, sizeof(smallname), "%s", vname); + if (namelen >= sizeof(smallname)) { + MALLOC(filename, char *, namelen+1, M_TEMP, M_WAITOK); + if (!filename) { + error = ENOMEM; + goto out; + } + snprintf(filename, namelen+1, "%s", vname); + } + bzero(&cn, sizeof(cn)); + cn.cn_nameptr = filename; + cn.cn_namelen = namelen; + + if (!(accessMode & NFS_OPEN_SHARE_ACCESS_READ)) { + /* + * Try to open it for read access too, + * so the buffer cache can read data. + */ + readtoo = 1; + accessMode |= NFS_OPEN_SHARE_ACCESS_READ; + } +tryagain: + error = nfs4_open_rpc(nofp, ctx, &cn, NULL, dvp, &vp, NFS_OPEN_NOCREATE, accessMode, denyMode); + if (error) { + if (!nfs_mount_state_error_should_restart(error) && + (error != EINTR) && (error != ERESTART) && readtoo) { + /* try again without the extra read access */ + accessMode &= ~NFS_OPEN_SHARE_ACCESS_READ; + readtoo = 0; + goto tryagain; + } + goto out; + } + nfs_open_file_add_open(nofp, accessMode, denyMode, 0); out: if (filename && (filename != &smallname[0])) FREE(filename, M_TEMP); @@ -2035,142 +2651,176 @@ nfs4_open( return (error); } - int -nfs4_vnop_open( - struct vnop_open_args /* { +nfs_vnop_mmap( + struct vnop_mmap_args /* { struct vnodeop_desc *a_desc; vnode_t a_vp; - int a_mode; + int a_fflags; vfs_context_t a_context; } */ *ap) { vfs_context_t ctx = ap->a_context; vnode_t vp = ap->a_vp; nfsnode_t np = VTONFS(vp); + int error = 0, accessMode, denyMode, delegated; struct nfsmount *nmp; - int error, accessMode, denyMode, opened = 0; struct nfs_open_owner *noop = NULL; struct nfs_open_file *nofp = NULL; - if (!(ap->a_mode & (FREAD|FWRITE))) - return (EINVAL); - nmp = VTONMP(vp); if (!nmp) return (ENXIO); - /* 
First, call the common code */ - if ((error = nfs3_vnop_open(ap))) - return (error); - - if (!vnode_isreg(vp)) { - /* Just mark that it was opened */ - lck_mtx_lock(&np->n_openlock); - np->n_openrefcnt++; - lck_mtx_unlock(&np->n_openlock); - return (0); - } + if (!vnode_isreg(vp) || !(ap->a_fflags & (PROT_READ|PROT_WRITE))) + return (EINVAL); + if (np->n_flag & NREVOKE) + return (EIO); - /* mode contains some combination of: FREAD, FWRITE, O_SHLOCK, O_EXLOCK */ - accessMode = 0; - if (ap->a_mode & FREAD) - accessMode |= NFS_OPEN_SHARE_ACCESS_READ; - if (ap->a_mode & FWRITE) + /* + * fflags contains some combination of: PROT_READ, PROT_WRITE + * Since it's not possible to mmap() without having the file open for reading, + * read access is always there (regardless if PROT_READ is not set). + */ + accessMode = NFS_OPEN_SHARE_ACCESS_READ; + if (ap->a_fflags & PROT_WRITE) accessMode |= NFS_OPEN_SHARE_ACCESS_WRITE; - if (ap->a_mode & O_EXLOCK) - denyMode = NFS_OPEN_SHARE_DENY_BOTH; - else if (ap->a_mode & O_SHLOCK) - denyMode = NFS_OPEN_SHARE_DENY_WRITE; - else - denyMode = NFS_OPEN_SHARE_DENY_NONE; + denyMode = NFS_OPEN_SHARE_DENY_NONE; noop = nfs_open_owner_find(nmp, vfs_context_ucred(ctx), 1); if (!noop) return (ENOMEM); restart: - error = nfs_mount_state_in_use_start(nmp); + error = nfs_mount_state_in_use_start(nmp, NULL); if (error) { nfs_open_owner_rele(noop); return (error); } - - error = nfs_open_file_find(np, noop, &nofp, accessMode, denyMode, 1); - if (!error && (nofp->nof_flags & NFS_OPEN_FILE_LOST)) { - const char *vname = vnode_getname(NFSTOV(np)); - printf("nfs_vnop_open: LOST %s\n", vname); - vnode_putname(vname); + if (np->n_flag & NREVOKE) { error = EIO; + nfs_mount_state_in_use_end(nmp, 0); + nfs_open_owner_rele(noop); + return (error); + } + + error = nfs_open_file_find(np, noop, &nofp, 0, 0, 1); + if (error || (!error && (nofp->nof_flags & NFS_OPEN_FILE_LOST))) { + NP(np, "nfs_vnop_mmap: no open file for owner, error %d, %d", error, 
kauth_cred_getuid(noop->noo_cred)); + error = EPERM; } if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { nfs_mount_state_in_use_end(nmp, 0); - nfs4_reopen(nofp, vfs_context_thread(ctx)); + error = nfs4_reopen(nofp, NULL); nofp = NULL; - goto restart; + if (!error) + goto restart; } if (!error) - error = nfs_open_file_set_busy(nofp, vfs_context_thread(ctx)); + error = nfs_open_file_set_busy(nofp, NULL); if (error) { nofp = NULL; goto out; } /* - * If we just created the file and the modes match, then we simply use - * the open performed in the create. Otherwise, send the request. + * The open reference for mmap must mirror an existing open because + * we may need to reclaim it after the file is closed. + * So grab another open count matching the accessMode passed in. + * If we already had an mmap open, prefer read/write without deny mode. + * This means we may have to drop the current mmap open first. */ - if ((nofp->nof_flags & NFS_OPEN_FILE_CREATE) && - (nofp->nof_creator == current_thread()) && - (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) && - (denyMode == NFS_OPEN_SHARE_DENY_NONE)) { - nofp->nof_flags &= ~NFS_OPEN_FILE_CREATE; - nofp->nof_creator = NULL; - } else { - if (!opened) - error = nfs4_open(np, nofp, accessMode, denyMode, ctx); - if ((error == EACCES) && (nofp->nof_flags & NFS_OPEN_FILE_CREATE) && - (nofp->nof_creator == current_thread())) { - /* - * Ugh. This can happen if we just created the file with read-only - * perms and we're trying to open it for real with different modes - * (e.g. write-only or with a deny mode) and the server decides to - * not allow the second open because of the read-only perms. - * The best we can do is to just use the create's open. - * We may have access we don't need or we may not have a requested - * deny mode. We may log complaints later, but we'll try to avoid it. 
- */ - if (denyMode != NFS_OPEN_SHARE_DENY_NONE) { - const char *vname = vnode_getname(NFSTOV(np)); - printf("nfs4_vnop_open: deny mode foregone on create, %s\n", vname); - vnode_putname(vname); - } - nofp->nof_creator = NULL; + + if (!nofp->nof_access) { + if (accessMode != NFS_OPEN_SHARE_ACCESS_READ) { + /* not asking for just read access -> fail */ + error = EPERM; + goto out; + } + /* we don't have the file open, so open it for read access */ + if (nmp->nm_vers < NFS_VER4) { + /* NFS v2/v3 opens are always allowed - so just add it. */ + nfs_open_file_add_open(nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_NONE, 0); error = 0; + } else { + error = nfs4_open(np, nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_NONE, ctx); } + if (!error) + nofp->nof_flags |= NFS_OPEN_FILE_NEEDCLOSE; if (error) goto out; - opened = 1; - /* - * If we had just created the file, we already had it open. - * If the actual open mode is less than what we grabbed at - * create time, then we'll downgrade the open here. 
- */ - if ((nofp->nof_flags & NFS_OPEN_FILE_CREATE) && - (nofp->nof_creator == current_thread())) { - error = nfs4_close(np, nofp, NFS_OPEN_SHARE_ACCESS_BOTH, NFS_OPEN_SHARE_DENY_NONE, ctx); - if (error) { - const char *vname = vnode_getname(NFSTOV(np)); - printf("nfs_vnop_open: create close error %d, %s\n", error, vname); - vnode_putname(vname); - } - if (!nfs_mount_state_error_should_restart(error)) { - error = 0; - nofp->nof_flags &= ~NFS_OPEN_FILE_CREATE; - } + } + + /* determine deny mode for open */ + if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) { + if (nofp->nof_d_rw || nofp->nof_d_rw_dw || nofp->nof_d_rw_drw) { + delegated = 1; + if (nofp->nof_d_rw) + denyMode = NFS_OPEN_SHARE_DENY_NONE; + else if (nofp->nof_d_rw_dw) + denyMode = NFS_OPEN_SHARE_DENY_WRITE; + else if (nofp->nof_d_rw_drw) + denyMode = NFS_OPEN_SHARE_DENY_BOTH; + } else if (nofp->nof_rw || nofp->nof_rw_dw || nofp->nof_rw_drw) { + delegated = 0; + if (nofp->nof_rw) + denyMode = NFS_OPEN_SHARE_DENY_NONE; + else if (nofp->nof_rw_dw) + denyMode = NFS_OPEN_SHARE_DENY_WRITE; + else if (nofp->nof_rw_drw) + denyMode = NFS_OPEN_SHARE_DENY_BOTH; + } else { + error = EPERM; + } + } else { /* NFS_OPEN_SHARE_ACCESS_READ */ + if (nofp->nof_d_r || nofp->nof_d_r_dw || nofp->nof_d_r_drw) { + delegated = 1; + if (nofp->nof_d_r) + denyMode = NFS_OPEN_SHARE_DENY_NONE; + else if (nofp->nof_d_r_dw) + denyMode = NFS_OPEN_SHARE_DENY_WRITE; + else if (nofp->nof_d_r_drw) + denyMode = NFS_OPEN_SHARE_DENY_BOTH; + } else if (nofp->nof_r || nofp->nof_r_dw || nofp->nof_r_drw) { + delegated = 0; + if (nofp->nof_r) + denyMode = NFS_OPEN_SHARE_DENY_NONE; + else if (nofp->nof_r_dw) + denyMode = NFS_OPEN_SHARE_DENY_WRITE; + else if (nofp->nof_r_drw) + denyMode = NFS_OPEN_SHARE_DENY_BOTH; + } else { + error = EPERM; + } + } + if (error) /* mmap mode without proper open mode */ + goto out; + + /* + * If the existing mmap access is more than the new access OR the + * existing access is the same and the existing deny mode is less, 
+ * then we'll stick with the existing mmap open mode. + */ + if ((nofp->nof_mmap_access > accessMode) || + ((nofp->nof_mmap_access == accessMode) && (nofp->nof_mmap_deny <= denyMode))) + goto out; + + /* update mmap open mode */ + if (nofp->nof_mmap_access) { + error = nfs_close(np, nofp, nofp->nof_mmap_access, nofp->nof_mmap_deny, ctx); + if (error) { + if (!nfs_mount_state_error_should_restart(error)) + NP(np, "nfs_vnop_mmap: close of previous mmap mode failed: %d, %d", error, kauth_cred_getuid(nofp->nof_owner->noo_cred)); + NP(np, "nfs_vnop_mmap: update, close error %d, %d", error, kauth_cred_getuid(nofp->nof_owner->noo_cred)); + goto out; } + nofp->nof_mmap_access = nofp->nof_mmap_deny = 0; } + nfs_open_file_add_open(nofp, accessMode, denyMode, delegated); + nofp->nof_mmap_access = accessMode; + nofp->nof_mmap_deny = denyMode; + out: if (nofp) nfs_open_file_clear_busy(nofp); @@ -2180,601 +2830,134 @@ nfs4_vnop_open( } if (noop) nfs_open_owner_rele(noop); - if (error) { - const char *vname = vnode_getname(NFSTOV(np)); - printf("nfs_vnop_open: error %d, %s\n", error, vname); - vnode_putname(vname); - } return (error); } -int -nfs4_close( - nfsnode_t np, - struct nfs_open_file *nofp, - uint32_t accessMode, - uint32_t denyMode, - vfs_context_t ctx) -{ - struct nfs_lock_owner *nlop; - int error = 0, changed = 0, closed = 0; - uint32_t newAccessMode, newDenyMode; - - /* warn if modes don't match current state */ - if (((accessMode & nofp->nof_access) != accessMode) || ((denyMode & nofp->nof_deny) != denyMode)) { - const char *vname = vnode_getname(NFSTOV(np)); - printf("nfs4_close: mode mismatch %d %d, current %d %d, %s\n", - accessMode, denyMode, nofp->nof_access, nofp->nof_deny, vname); - vnode_putname(vname); - } - - /* - * If we're closing a write-only open, we may not have a write-only count - * if we also grabbed read access. So, check the read-write count. 
- */ - if (denyMode == NFS_OPEN_SHARE_DENY_NONE) { - if ((accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) && - (nofp->nof_w == 0) && nofp->nof_rw) - accessMode = NFS_OPEN_SHARE_ACCESS_BOTH; - } else if (denyMode == NFS_OPEN_SHARE_DENY_WRITE) { - if ((accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) && - (nofp->nof_w_dw == 0) && nofp->nof_rw_dw) - accessMode = NFS_OPEN_SHARE_ACCESS_BOTH; - } else { /* NFS_OPEN_SHARE_DENY_BOTH */ - if ((accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) && - (nofp->nof_w_drw == 0) && nofp->nof_rw_drw) - accessMode = NFS_OPEN_SHARE_ACCESS_BOTH; - } - - /* - * Calculate new modes: a mode bit gets removed when there's only - * one count in all the corresponding counts - */ - newAccessMode = nofp->nof_access; - newDenyMode = nofp->nof_deny; - if ((accessMode & NFS_OPEN_SHARE_ACCESS_READ) && - (newAccessMode & NFS_OPEN_SHARE_ACCESS_READ) && - ((nofp->nof_r + nofp->nof_rw + nofp->nof_r_dw + - nofp->nof_rw_dw + nofp->nof_r_drw + nofp->nof_rw_dw) == 1)) { - newAccessMode &= ~NFS_OPEN_SHARE_ACCESS_READ; - changed = 1; - } - if ((accessMode & NFS_OPEN_SHARE_ACCESS_WRITE) && - (newAccessMode & NFS_OPEN_SHARE_ACCESS_WRITE) && - ((nofp->nof_w + nofp->nof_rw + nofp->nof_w_dw + - nofp->nof_rw_dw + nofp->nof_w_drw + nofp->nof_rw_dw) == 1)) { - newAccessMode &= ~NFS_OPEN_SHARE_ACCESS_WRITE; - changed = 1; - } - if ((denyMode & NFS_OPEN_SHARE_DENY_READ) && - (newDenyMode & NFS_OPEN_SHARE_DENY_READ) && - ((nofp->nof_r_drw + nofp->nof_w_drw + nofp->nof_rw_drw) == 1)) { - newDenyMode &= ~NFS_OPEN_SHARE_DENY_READ; - changed = 1; - } - if ((denyMode & NFS_OPEN_SHARE_DENY_WRITE) && - (newDenyMode & NFS_OPEN_SHARE_DENY_WRITE) && - ((nofp->nof_r_drw + nofp->nof_w_drw + nofp->nof_rw_drw + - nofp->nof_r_dw + nofp->nof_w_dw + nofp->nof_rw_dw) == 1)) { - newDenyMode &= ~NFS_OPEN_SHARE_DENY_WRITE; - changed = 1; - } - - - if ((newAccessMode == 0) || (nofp->nof_opencnt == 1)) { - /* - * No more access after this close, so clean up and close it. 
- */ - closed = 1; - if (!(nofp->nof_flags & NFS_OPEN_FILE_LOST)) - error = nfs4_close_rpc(np, nofp, vfs_context_thread(ctx), vfs_context_ucred(ctx), 0); - if (error == NFSERR_LOCKS_HELD) { - /* - * Hmm... the server says we have locks we need to release first - * Find the lock owner and try to unlock everything. - */ - nlop = nfs_lock_owner_find(np, vfs_context_proc(ctx), 0); - if (nlop) { - nfs4_unlock_rpc(np, nlop, F_WRLCK, 0, UINT64_MAX, ctx); - nfs_lock_owner_rele(nlop); - } - error = nfs4_close_rpc(np, nofp, vfs_context_thread(ctx), vfs_context_ucred(ctx), 0); - } - } else if (changed) { - /* - * File is still open but with less access, so downgrade the open. - */ - if (!(nofp->nof_flags & NFS_OPEN_FILE_LOST)) - error = nfs4_open_downgrade_rpc(np, nofp, ctx); - } - - if (error) { - const char *vname = vnode_getname(NFSTOV(np)); - printf("nfs4_close: error %d, %s\n", error, vname); - vnode_putname(vname); - return (error); - } - - /* Decrement the corresponding open access/deny mode counter. 
*/ - if (denyMode == NFS_OPEN_SHARE_DENY_NONE) { - if (accessMode == NFS_OPEN_SHARE_ACCESS_READ) { - if (nofp->nof_r == 0) - printf("nfs4_close: open(R) count underrun\n"); - else - nofp->nof_r--; - } else if (accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) { - if (nofp->nof_w == 0) - printf("nfs4_close: open(W) count underrun\n"); - else - nofp->nof_w--; - } else if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) { - if (nofp->nof_rw == 0) - printf("nfs4_close: open(RW) count underrun\n"); - else - nofp->nof_rw--; - } - } else if (denyMode == NFS_OPEN_SHARE_DENY_WRITE) { - if (accessMode == NFS_OPEN_SHARE_ACCESS_READ) { - if (nofp->nof_r_dw == 0) - printf("nfs4_close: open(R,DW) count underrun\n"); - else - nofp->nof_r_dw--; - } else if (accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) { - if (nofp->nof_w_dw == 0) - printf("nfs4_close: open(W,DW) count underrun\n"); - else - nofp->nof_w_dw--; - } else if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) { - if (nofp->nof_rw_dw == 0) - printf("nfs4_close: open(RW,DW) count underrun\n"); - else - nofp->nof_rw_dw--; - } - } else { /* NFS_OPEN_SHARE_DENY_BOTH */ - if (accessMode == NFS_OPEN_SHARE_ACCESS_READ) { - if (nofp->nof_r_drw == 0) - printf("nfs4_close: open(R,DRW) count underrun\n"); - else - nofp->nof_r_drw--; - } else if (accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) { - if (nofp->nof_w_drw == 0) - printf("nfs4_close: open(W,DRW) count underrun\n"); - else - nofp->nof_w_drw--; - } else if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) { - if (nofp->nof_rw_drw == 0) - printf("nfs4_close: open(RW,DRW) count underrun\n"); - else - nofp->nof_rw_drw--; - } - } - /* update the modes */ - nofp->nof_access = newAccessMode; - nofp->nof_deny = newDenyMode; - if (closed) { - if (nofp->nof_r || nofp->nof_w || - (nofp->nof_rw && !((nofp->nof_flags & NFS_OPEN_FILE_CREATE) && !nofp->nof_creator && (nofp->nof_rw == 1))) || - nofp->nof_r_dw || nofp->nof_w_dw || nofp->nof_rw_dw || - nofp->nof_r_drw || nofp->nof_w_drw || nofp->nof_rw_drw) - 
printf("nfs4_close: unexpected count: %u %u %u dw %u %u %u drw %u %u %u flags 0x%x\n", - nofp->nof_r, nofp->nof_w, nofp->nof_rw, - nofp->nof_r_dw, nofp->nof_w_dw, nofp->nof_rw_dw, - nofp->nof_r_drw, nofp->nof_w_drw, nofp->nof_rw_drw, - nofp->nof_flags); - /* clear out all open info, just to be safe */ - nofp->nof_access = nofp->nof_deny = 0; - nofp->nof_mmap_access = nofp->nof_mmap_deny = 0; - nofp->nof_r = nofp->nof_w = nofp->nof_rw = 0; - nofp->nof_r_dw = nofp->nof_w_dw = nofp->nof_rw_dw = 0; - nofp->nof_r_drw = nofp->nof_w_drw = nofp->nof_rw_drw = 0; - nofp->nof_flags &= ~NFS_OPEN_FILE_CREATE; - /* XXX we may potentially want to clean up idle/unused open file structures */ - } - nofp->nof_opencnt--; - if (nofp->nof_flags & NFS_OPEN_FILE_LOST) { - error = EIO; - if (!nofp->nof_opencnt) - nofp->nof_flags &= ~NFS_OPEN_FILE_LOST; - const char *vname = vnode_getname(NFSTOV(np)); - printf("nfs_close: LOST%s, %s\n", !(nofp->nof_flags & NFS_OPEN_FILE_LOST) ? " (last)" : "", vname); - vnode_putname(vname); - } - return (error); -} int -nfs4_vnop_close( - struct vnop_close_args /* { +nfs_vnop_mnomap( + struct vnop_mnomap_args /* { struct vnodeop_desc *a_desc; vnode_t a_vp; - int a_fflag; vfs_context_t a_context; } */ *ap) { vfs_context_t ctx = ap->a_context; vnode_t vp = ap->a_vp; - int fflag = ap->a_fflag; - int error, common_error, accessMode, denyMode; nfsnode_t np = VTONFS(vp); struct nfsmount *nmp; - struct nfs_open_owner *noop = NULL; struct nfs_open_file *nofp = NULL; + off_t size; + int error; nmp = VTONMP(vp); if (!nmp) return (ENXIO); - /* First, call the common code */ - common_error = nfs3_vnop_close(ap); + /* flush buffers/ubc before we drop the open (in case it's our last open) */ + nfs_flush(np, MNT_WAIT, vfs_context_thread(ctx), V_IGNORE_WRITEERR); + if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) + ubc_msync(vp, 0, size, NULL, UBC_PUSHALL | UBC_SYNC); - if (!vnode_isreg(vp)) { - /* Just mark that it was closed */ - lck_mtx_lock(&np->n_openlock); - 
np->n_openrefcnt--; + /* walk all open files and close all mmap opens */ +loop: + error = nfs_mount_state_in_use_start(nmp, NULL); + if (error) + return (error); + lck_mtx_lock(&np->n_openlock); + TAILQ_FOREACH(nofp, &np->n_opens, nof_link) { + if (!nofp->nof_mmap_access) + continue; lck_mtx_unlock(&np->n_openlock); - return (common_error); + if (nofp->nof_flags & NFS_OPEN_FILE_REOPEN) { + nfs_mount_state_in_use_end(nmp, 0); + error = nfs4_reopen(nofp, NULL); + if (!error) + goto loop; + } + if (!error) + error = nfs_open_file_set_busy(nofp, NULL); + if (error) { + lck_mtx_lock(&np->n_openlock); + break; + } + if (nofp->nof_mmap_access) { + error = nfs_close(np, nofp, nofp->nof_mmap_access, nofp->nof_mmap_deny, ctx); + if (!nfs_mount_state_error_should_restart(error)) { + if (error) /* not a state-operation-restarting error, so just clear the access */ + NP(np, "nfs_vnop_mnomap: close of mmap mode failed: %d, %d", error, kauth_cred_getuid(nofp->nof_owner->noo_cred)); + nofp->nof_mmap_access = nofp->nof_mmap_deny = 0; + } + if (error) + NP(np, "nfs_vnop_mnomap: error %d, %d", error, kauth_cred_getuid(nofp->nof_owner->noo_cred)); + } + nfs_open_file_clear_busy(nofp); + nfs_mount_state_in_use_end(nmp, error); + goto loop; } + lck_mtx_unlock(&np->n_openlock); + nfs_mount_state_in_use_end(nmp, error); + return (error); +} - noop = nfs_open_owner_find(nmp, vfs_context_ucred(ctx), 0); - if (!noop) { - printf("nfs4_vnop_close: can't get open owner!\n"); - return (EIO); - } +/* + * Search a node's lock owner list for the owner for this process. + * If not found and "alloc" is set, then allocate a new one. 
+ */ +struct nfs_lock_owner * +nfs_lock_owner_find(nfsnode_t np, proc_t p, int alloc) +{ + pid_t pid = proc_pid(p); + struct nfs_lock_owner *nlop, *newnlop = NULL; -restart: - error = nfs_mount_state_in_use_start(nmp); - if (error) { - nfs_open_owner_rele(noop); - return (error); +tryagain: + lck_mtx_lock(&np->n_openlock); + TAILQ_FOREACH(nlop, &np->n_lock_owners, nlo_link) { + if (nlop->nlo_pid != pid) + continue; + if (timevalcmp(&nlop->nlo_pid_start, &p->p_start, ==)) + break; + /* stale lock owner... reuse it if we can */ + if (nlop->nlo_refcnt) { + TAILQ_REMOVE(&np->n_lock_owners, nlop, nlo_link); + nlop->nlo_flags &= ~NFS_LOCK_OWNER_LINK; + lck_mtx_unlock(&np->n_openlock); + goto tryagain; + } + nlop->nlo_pid_start = p->p_start; + nlop->nlo_seqid = 0; + nlop->nlo_stategenid = 0; + break; } - error = nfs_open_file_find(np, noop, &nofp, 0, 0, 0); - if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { - nfs_mount_state_in_use_end(nmp, 0); - nfs4_reopen(nofp, vfs_context_thread(ctx)); - nofp = NULL; - goto restart; - } - if (error) { - const char *vname = vnode_getname(NFSTOV(np)); - printf("nfs4_vnop_close: no open file for owner %d, %s\n", error, vname); - vnode_putname(vname); - error = EBADF; - goto out; + if (!nlop && !newnlop && alloc) { + lck_mtx_unlock(&np->n_openlock); + MALLOC(newnlop, struct nfs_lock_owner *, sizeof(struct nfs_lock_owner), M_TEMP, M_WAITOK); + if (!newnlop) + return (NULL); + bzero(newnlop, sizeof(*newnlop)); + lck_mtx_init(&newnlop->nlo_lock, nfs_open_grp, LCK_ATTR_NULL); + newnlop->nlo_pid = pid; + newnlop->nlo_pid_start = p->p_start; + newnlop->nlo_name = OSAddAtomic(1, &nfs_lock_owner_seqnum); + TAILQ_INIT(&newnlop->nlo_locks); + goto tryagain; } - error = nfs_open_file_set_busy(nofp, vfs_context_thread(ctx)); - if (error) { - nofp = NULL; - goto out; + if (!nlop && newnlop) { + newnlop->nlo_flags |= NFS_LOCK_OWNER_LINK; + TAILQ_INSERT_HEAD(&np->n_lock_owners, newnlop, nlo_link); + nlop = newnlop; } + 
lck_mtx_unlock(&np->n_openlock); - /* fflag contains some combination of: FREAD, FWRITE, FHASLOCK */ - accessMode = 0; - if (fflag & FREAD) - accessMode |= NFS_OPEN_SHARE_ACCESS_READ; - if (fflag & FWRITE) - accessMode |= NFS_OPEN_SHARE_ACCESS_WRITE; -// XXX It would be nice if we still had the O_EXLOCK/O_SHLOCK flags that were on the open -// if (fflag & O_EXLOCK) -// denyMode = NFS_OPEN_SHARE_DENY_BOTH; -// else if (fflag & O_SHLOCK) -// denyMode = NFS_OPEN_SHARE_DENY_WRITE; -// else -// denyMode = NFS_OPEN_SHARE_DENY_NONE; - if (fflag & FHASLOCK) { - /* XXX assume FHASLOCK is for the deny mode and not flock */ - /* FHASLOCK flock will be unlocked in the close path, but the flag is not cleared. */ - if (nofp->nof_deny & NFS_OPEN_SHARE_DENY_READ) - denyMode = NFS_OPEN_SHARE_DENY_BOTH; - else if (nofp->nof_deny & NFS_OPEN_SHARE_DENY_WRITE) - denyMode = NFS_OPEN_SHARE_DENY_WRITE; - else - denyMode = NFS_OPEN_SHARE_DENY_NONE; - } else { - denyMode = NFS_OPEN_SHARE_DENY_NONE; - } + if (newnlop && (nlop != newnlop)) + nfs_lock_owner_destroy(newnlop); - if (!accessMode) { - error = EINVAL; - goto out; - } + if (nlop) + nfs_lock_owner_ref(nlop); - error = nfs4_close(np, nofp, accessMode, denyMode, ctx); - if (error) { - const char *vname = vnode_getname(NFSTOV(np)); - printf("nfs_vnop_close: close error %d, %s\n", error, vname); - vnode_putname(vname); - } - -out: - if (nofp) - nfs_open_file_clear_busy(nofp); - if (nfs_mount_state_in_use_end(nmp, error)) { - nofp = NULL; - goto restart; - } - if (noop) - nfs_open_owner_rele(noop); - if (error) { - const char *vname = vnode_getname(NFSTOV(np)); - printf("nfs_vnop_close: error %d, %s\n", error, vname); - vnode_putname(vname); - } - if (!error) - error = common_error; - return (error); -} - -int -nfs4_vnop_mmap( - struct vnop_mmap_args /* { - struct vnodeop_desc *a_desc; - vnode_t a_vp; - int a_fflags; - vfs_context_t a_context; - } */ *ap) -{ - vfs_context_t ctx = ap->a_context; - vnode_t vp = ap->a_vp; - nfsnode_t np = 
VTONFS(vp); - int error = 0, accessMode, denyMode; - struct nfsmount *nmp; - struct nfs_open_owner *noop = NULL; - struct nfs_open_file *nofp = NULL; - - nmp = VTONMP(vp); - if (!nmp) - return (ENXIO); - - if (!vnode_isreg(vp) || !(ap->a_fflags & (PROT_READ|PROT_WRITE))) - return (EINVAL); - - /* - * fflags contains some combination of: PROT_READ, PROT_WRITE - * Since it's not possible to mmap() without having the file open for reading, - * read access is always there (regardless if PROT_READ is not set). - */ - accessMode = NFS_OPEN_SHARE_ACCESS_READ; - if (ap->a_fflags & PROT_WRITE) - accessMode |= NFS_OPEN_SHARE_ACCESS_WRITE; - denyMode = NFS_OPEN_SHARE_DENY_NONE; - - noop = nfs_open_owner_find(nmp, vfs_context_ucred(ctx), 0); - if (!noop) { - printf("nfs4_vnop_mmap: no open owner\n"); - return (EPERM); - } - -restart: - error = nfs_mount_state_in_use_start(nmp); - if (error) { - nfs_open_owner_rele(noop); - return (error); - } - - error = nfs_open_file_find(np, noop, &nofp, 0, 0, 1); - if (error || (!error && (nofp->nof_flags & NFS_OPEN_FILE_LOST))) { - printf("nfs4_vnop_mmap: no open file for owner %d\n", error); - error = EPERM; - } - if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { - nfs_mount_state_in_use_end(nmp, 0); - nfs4_reopen(nofp, vfs_context_thread(ctx)); - nofp = NULL; - goto restart; - } - if (!error) - error = nfs_open_file_set_busy(nofp, vfs_context_thread(ctx)); - if (error) { - nofp = NULL; - goto out; - } - - /* - * The open reference for mmap must mirror an existing open because - * we may need to reclaim it after the file is closed. - * So grab another open count matching the accessMode passed in. - * If we already had an mmap open, prefer read/write without deny mode. - * This means we may have to drop the current mmap open first. 
- */ - - /* determine deny mode for open */ - if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) { - if (nofp->nof_rw) - denyMode = NFS_OPEN_SHARE_DENY_NONE; - else if (nofp->nof_rw_dw) - denyMode = NFS_OPEN_SHARE_DENY_WRITE; - else if (nofp->nof_rw_drw) - denyMode = NFS_OPEN_SHARE_DENY_BOTH; - else - error = EPERM; - } else { /* NFS_OPEN_SHARE_ACCESS_READ */ - if (nofp->nof_r) - denyMode = NFS_OPEN_SHARE_DENY_NONE; - else if (nofp->nof_r_dw) - denyMode = NFS_OPEN_SHARE_DENY_WRITE; - else if (nofp->nof_r_drw) - denyMode = NFS_OPEN_SHARE_DENY_BOTH; - else - error = EPERM; - } - if (error) /* mmap mode without proper open mode */ - goto out; - - /* - * If the existing mmap access is more than the new access OR the - * existing access is the same and the existing deny mode is less, - * then we'll stick with the existing mmap open mode. - */ - if ((nofp->nof_mmap_access > accessMode) || - ((nofp->nof_mmap_access == accessMode) && (nofp->nof_mmap_deny <= denyMode))) - goto out; - - /* update mmap open mode */ - if (nofp->nof_mmap_access) { - error = nfs4_close(np, nofp, nofp->nof_mmap_access, nofp->nof_mmap_deny, ctx); - if (error) { - if (!nfs_mount_state_error_should_restart(error)) - printf("nfs_vnop_mmap: close of previous mmap mode failed: %d\n", error); - const char *vname = vnode_getname(NFSTOV(np)); - printf("nfs_vnop_mmap: update, close error %d, %s\n", error, vname); - vnode_putname(vname); - goto out; - } - nofp->nof_mmap_access = nofp->nof_mmap_deny = 0; - } - - if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) { - if (denyMode == NFS_OPEN_SHARE_DENY_NONE) - nofp->nof_rw++; - else if (denyMode == NFS_OPEN_SHARE_DENY_WRITE) - nofp->nof_rw_dw++; - else /* NFS_OPEN_SHARE_DENY_BOTH */ - nofp->nof_rw_drw++; - } else if (accessMode == NFS_OPEN_SHARE_ACCESS_READ) { - if (denyMode == NFS_OPEN_SHARE_DENY_NONE) - nofp->nof_r++; - else if (denyMode == NFS_OPEN_SHARE_DENY_WRITE) - nofp->nof_r_dw++; - else /* NFS_OPEN_SHARE_DENY_BOTH */ - nofp->nof_r_drw++; - } - 
nofp->nof_mmap_access = accessMode; - nofp->nof_mmap_deny = denyMode; - nofp->nof_opencnt++; - -out: - if (nofp) - nfs_open_file_clear_busy(nofp); - if (nfs_mount_state_in_use_end(nmp, error)) { - nofp = NULL; - goto restart; - } - if (noop) - nfs_open_owner_rele(noop); - return (error); -} - - -int -nfs4_vnop_mnomap( - struct vnop_mnomap_args /* { - struct vnodeop_desc *a_desc; - vnode_t a_vp; - vfs_context_t a_context; - } */ *ap) -{ - vfs_context_t ctx = ap->a_context; - vnode_t vp = ap->a_vp; - nfsnode_t np = VTONFS(vp); - struct nfsmount *nmp; - struct nfs_open_file *nofp = NULL; - int error; - - nmp = VTONMP(vp); - if (!nmp) - return (ENXIO); - - /* walk all open files and close all mmap opens */ -loop: - error = nfs_mount_state_in_use_start(nmp); - if (error) - return (error); - lck_mtx_lock(&np->n_openlock); - TAILQ_FOREACH(nofp, &np->n_opens, nof_link) { - if (!nofp->nof_mmap_access) - continue; - lck_mtx_unlock(&np->n_openlock); - if (nofp->nof_flags & NFS_OPEN_FILE_REOPEN) { - nfs_mount_state_in_use_end(nmp, 0); - nfs4_reopen(nofp, vfs_context_thread(ctx)); - goto loop; - } - error = nfs_open_file_set_busy(nofp, vfs_context_thread(ctx)); - if (error) { - lck_mtx_lock(&np->n_openlock); - break; - } - if (nofp->nof_mmap_access) { - error = nfs4_close(np, nofp, nofp->nof_mmap_access, nofp->nof_mmap_deny, ctx); - if (!nfs_mount_state_error_should_restart(error)) { - if (error) /* not a state-operation-restarting error, so just clear the access */ - printf("nfs_vnop_mnomap: close of mmap mode failed: %d\n", error); - nofp->nof_mmap_access = nofp->nof_mmap_deny = 0; - } - if (error) { - const char *vname = vnode_getname(NFSTOV(np)); - printf("nfs_vnop_mnomap: error %d, %s\n", error, vname); - vnode_putname(vname); - } - } - nfs_open_file_clear_busy(nofp); - nfs_mount_state_in_use_end(nmp, error); - goto loop; - } - lck_mtx_unlock(&np->n_openlock); - nfs_mount_state_in_use_end(nmp, error); - return (error); -} - -/* - * Search a node's lock owner list for the 
owner for this process. - * If not found and "alloc" is set, then allocate a new one. - */ -struct nfs_lock_owner * -nfs_lock_owner_find(nfsnode_t np, proc_t p, int alloc) -{ - pid_t pid = proc_pid(p); - struct nfs_lock_owner *nlop, *newnlop = NULL; - -tryagain: - lck_mtx_lock(&np->n_openlock); - TAILQ_FOREACH(nlop, &np->n_lock_owners, nlo_link) { - if (nlop->nlo_pid != pid) - continue; - if (timevalcmp(&nlop->nlo_pid_start, &p->p_start, ==)) - break; - /* stale lock owner... reuse it if we can */ - if (nlop->nlo_refcnt) { - TAILQ_REMOVE(&np->n_lock_owners, nlop, nlo_link); - nlop->nlo_flags &= ~NFS_LOCK_OWNER_LINK; - lck_mtx_unlock(&np->n_openlock); - goto tryagain; - } - nlop->nlo_pid_start = p->p_start; - nlop->nlo_seqid = 0; - nlop->nlo_stategenid = 0; - break; - } - - if (!nlop && !newnlop && alloc) { - lck_mtx_unlock(&np->n_openlock); - MALLOC(newnlop, struct nfs_lock_owner *, sizeof(struct nfs_lock_owner), M_TEMP, M_WAITOK); - if (!newnlop) - return (NULL); - bzero(newnlop, sizeof(*newnlop)); - lck_mtx_init(&newnlop->nlo_lock, nfs_open_grp, LCK_ATTR_NULL); - newnlop->nlo_pid = pid; - newnlop->nlo_pid_start = p->p_start; - newnlop->nlo_name = OSAddAtomic(1, &nfs_lock_owner_seqnum); - TAILQ_INIT(&newnlop->nlo_locks); - goto tryagain; - } - if (!nlop && newnlop) { - newnlop->nlo_flags |= NFS_LOCK_OWNER_LINK; - TAILQ_INSERT_HEAD(&np->n_lock_owners, newnlop, nlo_link); - nlop = newnlop; - } - lck_mtx_unlock(&np->n_openlock); - - if (newnlop && (nlop != newnlop)) - nfs_lock_owner_destroy(newnlop); - - if (nlop) - nfs_lock_owner_ref(nlop); - - return (nlop); -} + return (nlop); +} /* * destroy a lock owner that's no longer needed @@ -2838,7 +3021,7 @@ nfs_lock_owner_set_busy(struct nfs_lock_owner *nlop, thread_t thd) nmp = nlop->nlo_open_owner->noo_mount; if (!nmp) return (ENXIO); - slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0; + slpflag = (NMFLAG(nmp, INTR) && thd) ? 
PCATCH : 0; lck_mtx_lock(&nlop->nlo_lock); while (nlop->nlo_flags & NFS_LOCK_OWNER_BUSY) { @@ -2846,6 +3029,7 @@ nfs_lock_owner_set_busy(struct nfs_lock_owner *nlop, thread_t thd) break; nlop->nlo_flags |= NFS_LOCK_OWNER_WANT; msleep(nlop, &nlop->nlo_lock, slpflag, "nfs_lock_owner_set_busy", &ts); + slpflag = 0; } if (!error) nlop->nlo_flags |= NFS_LOCK_OWNER_BUSY; @@ -2977,11 +3161,12 @@ nfs_file_lock_conflict(struct nfs_file_lock *nflp1, struct nfs_file_lock *nflp2, * Send an NFSv4 LOCK RPC to the server. */ int -nfs4_lock_rpc( +nfs4_setlock_rpc( nfsnode_t np, struct nfs_open_file *nofp, struct nfs_file_lock *nflp, int reclaim, + int flags, thread_t thd, kauth_cred_t cred) { @@ -2991,10 +3176,13 @@ nfs4_lock_rpc( uint64_t xid; uint32_t locktype; int error = 0, lockerror = ENOENT, newlocker, numops, status; + struct nfsreq_secinfo_args si; nmp = NFSTONMP(np); if (!nmp) return (ENXIO); + if (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); newlocker = (nlop->nlo_stategenid != nmp->nm_stategenid); locktype = (nflp->nfl_flags & NFS_FILE_LOCK_WAIT) ? @@ -3027,6 +3215,7 @@ nfs4_lock_rpc( return (error); } + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -3039,8 +3228,7 @@ nfs4_lock_rpc( nfsm_chain_add_fh(error, &nmreq, NFS_VER4, np->n_fhp, np->n_fhsize); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, np); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_LOCK); nfsm_chain_add_32(error, &nmreq, locktype); @@ -3061,7 +3249,7 @@ nfs4_lock_rpc( nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request2(np, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, (reclaim ? 
R_RECOVER : 0), &nmrep, &xid, &status); + error = nfs_request2(np, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, &si, flags|R_NOINTR, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; @@ -3070,7 +3258,7 @@ nfs4_lock_rpc( nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); nfsmout_if(error); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsm_chain_loadattr(error, &nmrep, np, NFS_VER4, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, np, NFS_VER4, &xid); nfsmout_if(error); nfsm_chain_op_check(error, &nmrep, NFS_OP_LOCK); nfs_owner_seqid_increment(newlocker ? nofp->nof_owner : NULL, nlop, error); @@ -3103,21 +3291,27 @@ nfs4_unlock_rpc( int type, uint64_t start, uint64_t end, - vfs_context_t ctx) + int flags, + thread_t thd, + kauth_cred_t cred) { struct nfsmount *nmp; struct nfsm_chain nmreq, nmrep; uint64_t xid; int error = 0, lockerror = ENOENT, numops, status; + struct nfsreq_secinfo_args si; nmp = NFSTONMP(np); if (!nmp) return (ENXIO); + if (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); - error = nfs_lock_owner_set_busy(nlop, vfs_context_thread(ctx)); + error = nfs_lock_owner_set_busy(nlop, NULL); if (error) return (error); + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -3130,8 +3324,7 @@ nfs4_unlock_rpc( nfsm_chain_add_fh(error, &nmreq, NFS_VER4, np->n_fhp, np->n_fhsize); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, np); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_LOCKU); nfsm_chain_add_32(error, &nmreq, (type == F_WRLCK) ? 
NFS_LOCK_TYPE_WRITE : NFS_LOCK_TYPE_READ); @@ -3143,7 +3336,7 @@ nfs4_unlock_rpc( nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request(np, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &nmrep, &xid, &status); + error = nfs_request2(np, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, &si, flags|R_NOINTR, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; @@ -3152,7 +3345,7 @@ nfs4_unlock_rpc( nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); nfsmout_if(error); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsm_chain_loadattr(error, &nmrep, np, NFS_VER4, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, np, NFS_VER4, &xid); nfsmout_if(error); nfsm_chain_op_check(error, &nmrep, NFS_OP_LOCKU); nfs_owner_seqid_increment(NULL, nlop, error); @@ -3167,14 +3360,10 @@ nfs4_unlock_rpc( } /* - * Check for any conflicts with the given lock. - * - * Checking for a lock doesn't require the file to be opened. - * So we skip all the open owner, open file, lock owner work - * and just check for a conflicting lock. + * Send an NFSv4 LOCKT RPC to the server. 
*/ int -nfs4_getlock( +nfs4_getlock_rpc( nfsnode_t np, struct nfs_lock_owner *nlop, struct flock *fl, @@ -3183,39 +3372,20 @@ nfs4_getlock( vfs_context_t ctx) { struct nfsmount *nmp; - struct nfs_file_lock *nflp; struct nfsm_chain nmreq, nmrep; uint64_t xid, val64 = 0; uint32_t val = 0; - int error = 0, lockerror = ENOENT, numops, status; + int error = 0, lockerror, numops, status; + struct nfsreq_secinfo_args si; nmp = NFSTONMP(np); if (!nmp) return (ENXIO); + if (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); - lck_mtx_lock(&np->n_openlock); - /* scan currently held locks for conflict */ - TAILQ_FOREACH(nflp, &np->n_locks, nfl_link) { - if (nflp->nfl_flags & NFS_FILE_LOCK_BLOCKED) - continue; - if ((start <= nflp->nfl_end) && (end >= nflp->nfl_start) && - ((fl->l_type == F_WRLCK) || (nflp->nfl_type == F_WRLCK))) - break; - } - if (nflp) { - /* found a conflicting lock */ - fl->l_type = nflp->nfl_type; - fl->l_pid = (nflp->nfl_flags & NFS_FILE_LOCK_STYLE_FLOCK) ? -1 : nflp->nfl_owner->nlo_pid; - fl->l_start = nflp->nfl_start; - fl->l_len = NFS_FLOCK_LENGTH(nflp->nfl_start, nflp->nfl_end); - fl->l_whence = SEEK_SET; - } - lck_mtx_unlock(&np->n_openlock); - if (nflp) - return (0); - - /* no conflict found locally, so ask the server */ - + lockerror = ENOENT; + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -3228,8 +3398,7 @@ nfs4_getlock( nfsm_chain_add_fh(error, &nmreq, NFS_VER4, np->n_fhp, np->n_fhsize); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, np); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_LOCKT); nfsm_chain_add_32(error, &nmreq, (fl->l_type == F_WRLCK) ? 
NFS_LOCK_TYPE_WRITE : NFS_LOCK_TYPE_READ); @@ -3240,7 +3409,7 @@ nfs4_getlock( nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request(np, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &nmrep, &xid, &status); + error = nfs_request(np, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &si, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; @@ -3249,7 +3418,7 @@ nfs4_getlock( nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); nfsmout_if(error); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsm_chain_loadattr(error, &nmrep, np, NFS_VER4, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, np, NFS_VER4, &xid); nfsmout_if(error); nfsm_chain_op_check(error, &nmrep, NFS_OP_LOCKT); if (error == NFSERR_DENIED) { @@ -3272,6 +3441,74 @@ nfs4_getlock( return (error); } + +/* + * Check for any conflicts with the given lock. + * + * Checking for a lock doesn't require the file to be opened. + * So we skip all the open owner, open file, lock owner work + * and just check for a conflicting lock. + */ +int +nfs_advlock_getlock( + nfsnode_t np, + struct nfs_lock_owner *nlop, + struct flock *fl, + uint64_t start, + uint64_t end, + vfs_context_t ctx) +{ + struct nfsmount *nmp; + struct nfs_file_lock *nflp; + int error = 0, answered = 0; + + nmp = NFSTONMP(np); + if (!nmp) + return (ENXIO); + +restart: + if ((error = nfs_mount_state_in_use_start(nmp, vfs_context_thread(ctx)))) + return (error); + + lck_mtx_lock(&np->n_openlock); + /* scan currently held locks for conflict */ + TAILQ_FOREACH(nflp, &np->n_locks, nfl_link) { + if (nflp->nfl_flags & (NFS_FILE_LOCK_BLOCKED|NFS_FILE_LOCK_DEAD)) + continue; + if ((start <= nflp->nfl_end) && (end >= nflp->nfl_start) && + ((fl->l_type == F_WRLCK) || (nflp->nfl_type == F_WRLCK))) + break; + } + if (nflp) { + /* found a conflicting lock */ + fl->l_type = nflp->nfl_type; + fl->l_pid = (nflp->nfl_flags & NFS_FILE_LOCK_STYLE_FLOCK) ? 
-1 : nflp->nfl_owner->nlo_pid; + fl->l_start = nflp->nfl_start; + fl->l_len = NFS_FLOCK_LENGTH(nflp->nfl_start, nflp->nfl_end); + fl->l_whence = SEEK_SET; + answered = 1; + } else if ((np->n_openflags & N_DELEG_WRITE) && !(np->n_openflags & N_DELEG_RETURN)) { + /* + * If we have a write delegation, we know there can't be other + * locks on the server. So the answer is no conflicting lock found. + */ + fl->l_type = F_UNLCK; + answered = 1; + } + lck_mtx_unlock(&np->n_openlock); + if (answered) { + nfs_mount_state_in_use_end(nmp, 0); + return (0); + } + + /* no conflict found locally, so ask the server */ + error = nmp->nm_funcs->nf_getlock_rpc(np, nlop, fl, start, end, ctx); + + if (nfs_mount_state_in_use_end(nmp, error)) + goto restart; + return (error); +} + /* * Acquire a file lock for the given range. * @@ -3284,7 +3521,7 @@ nfs4_getlock( * queue again to coalesce any locks adjacent to the new one. */ int -nfs4_setlock( +nfs_advlock_setlock( nfsnode_t np, struct nfs_open_file *nofp, struct nfs_lock_owner *nlop, @@ -3304,7 +3541,10 @@ nfs4_setlock( nmp = NFSTONMP(np); if (!nmp) return (ENXIO); - slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0; + slpflag = NMFLAG(nmp, INTR) ? 
PCATCH : 0; + + if ((type != F_RDLCK) && (type != F_WRLCK)) + return (EINVAL); /* allocate a new lock */ newnflp = nfs_file_lock_alloc(nlop); @@ -3335,14 +3575,22 @@ nfs4_setlock( restart: restart = 0; - error = nfs_mount_state_in_use_start(nmp); + error = nfs_mount_state_in_use_start(nmp, vfs_context_thread(ctx)); if (error) goto error_out; inuse = 1; + if (np->n_flag & NREVOKE) { + error = EIO; + nfs_mount_state_in_use_end(nmp, 0); + inuse = 0; + goto error_out; + } if (nofp->nof_flags & NFS_OPEN_FILE_REOPEN) { nfs_mount_state_in_use_end(nmp, 0); inuse = 0; - nfs4_reopen(nofp, vfs_context_thread(ctx)); + error = nfs4_reopen(nofp, vfs_context_thread(ctx)); + if (error) + goto error_out; goto restart; } @@ -3354,7 +3602,8 @@ nfs4_setlock( } /* scan current list of locks (held and pending) for conflicts */ - for (nflp = TAILQ_NEXT(newnflp, nfl_link); nflp; nflp = TAILQ_NEXT(nflp, nfl_link)) { + for (nflp = TAILQ_NEXT(newnflp, nfl_link); nflp; nflp = nextnflp) { + nextnflp = TAILQ_NEXT(nflp, nfl_link); if (!nfs_file_lock_conflict(newnflp, nflp, &willsplit)) continue; /* Conflict */ @@ -3374,10 +3623,10 @@ nfs4_setlock( lck_mtx_unlock(&np->n_openlock); nfs_mount_state_in_use_end(nmp, 0); inuse = 0; - error = nfs4_unlock(np, nofp, nlop, 0, UINT64_MAX, NFS_FILE_LOCK_STYLE_FLOCK, ctx); + error = nfs_advlock_unlock(np, nofp, nlop, 0, UINT64_MAX, NFS_FILE_LOCK_STYLE_FLOCK, ctx); flocknflp = NULL; if (!error) - error = nfs_mount_state_in_use_start(nmp); + error = nfs_mount_state_in_use_start(nmp, vfs_context_thread(ctx)); if (error) { lck_mtx_lock(&np->n_openlock); break; @@ -3388,7 +3637,8 @@ nfs4_setlock( if (!nfs_file_lock_conflict(newnflp, nflp, NULL)) break; } - msleep(nflp, &np->n_openlock, slpflag, "nfs4_setlock_blocked", &ts); + msleep(nflp, &np->n_openlock, slpflag, "nfs_advlock_setlock_blocked", &ts); + slpflag = 0; error = nfs_sigintr(NFSTONMP(np), NULL, vfs_context_thread(ctx), 0); if (!error && (nmp->nm_state & NFSSTA_RECOVER)) { /* looks like we have a recover 
pending... restart */ @@ -3399,6 +3649,8 @@ nfs4_setlock( lck_mtx_lock(&np->n_openlock); break; } + if (!error && (np->n_flag & NREVOKE)) + error = EIO; } while (!error && nfs_file_lock_conflict(newnflp, nflp, NULL)); nflp->nfl_blockcnt--; if ((nflp->nfl_flags & NFS_FILE_LOCK_DEAD) && !nflp->nfl_blockcnt) { @@ -3407,6 +3659,9 @@ nfs4_setlock( } if (error || restart) break; + /* We have released n_openlock and we can't trust that nextnflp is still valid. */ + /* So, start this lock-scanning loop over from where it started. */ + nextnflp = TAILQ_NEXT(newnflp, nfl_link); } lck_mtx_unlock(&np->n_openlock); if (restart) @@ -3428,16 +3683,50 @@ nfs4_setlock( } /* once scan for local conflicts is clear, send request to server */ - if ((error = nfs_open_state_set_busy(np, ctx))) + if ((error = nfs_open_state_set_busy(np, vfs_context_thread(ctx)))) goto error_out; busy = 1; delay = 0; do { - error = nfs4_lock_rpc(np, nofp, newnflp, 0, vfs_context_thread(ctx), vfs_context_ucred(ctx)); + /* do we have a delegation? (that we're not returning?) */ + if ((np->n_openflags & N_DELEG_MASK) && !(np->n_openflags & N_DELEG_RETURN)) { + if (np->n_openflags & N_DELEG_WRITE) { + /* with a write delegation, just take the lock delegated */ + newnflp->nfl_flags |= NFS_FILE_LOCK_DELEGATED; + error = 0; + /* make sure the lock owner knows its open owner */ + if (!nlop->nlo_open_owner) { + nfs_open_owner_ref(nofp->nof_owner); + nlop->nlo_open_owner = nofp->nof_owner; + } + break; + } else { + /* + * If we don't have any non-delegated opens but we do have + * delegated opens, then we need to first claim the delegated + * opens so that the lock request on the server can be associated + * with an open it knows about. 
+ */ + if ((!nofp->nof_rw_drw && !nofp->nof_w_drw && !nofp->nof_r_drw && + !nofp->nof_rw_dw && !nofp->nof_w_dw && !nofp->nof_r_dw && + !nofp->nof_rw && !nofp->nof_w && !nofp->nof_r) && + (nofp->nof_d_rw_drw || nofp->nof_d_w_drw || nofp->nof_d_r_drw || + nofp->nof_d_rw_dw || nofp->nof_d_w_dw || nofp->nof_d_r_dw || + nofp->nof_d_rw || nofp->nof_d_w || nofp->nof_d_r)) { + error = nfs4_claim_delegated_state_for_open_file(nofp, 0); + if (error) + break; + } + } + } + if (np->n_flag & NREVOKE) + error = EIO; + if (!error) + error = nmp->nm_funcs->nf_setlock_rpc(np, nofp, newnflp, 0, 0, vfs_context_thread(ctx), vfs_context_ucred(ctx)); if (!error || ((error != NFSERR_DENIED) && (error != NFSERR_GRACE))) break; /* request was denied due to either conflict or grace period */ - if ((error != NFSERR_GRACE) && !(newnflp->nfl_flags & NFS_FILE_LOCK_WAIT)) { + if ((error == NFSERR_DENIED) && !(newnflp->nfl_flags & NFS_FILE_LOCK_WAIT)) { error = EAGAIN; break; } @@ -3447,13 +3736,13 @@ nfs4_setlock( busy = 0; nfs_mount_state_in_use_end(nmp, 0); inuse = 0; - error2 = nfs4_unlock(np, nofp, nlop, 0, UINT64_MAX, NFS_FILE_LOCK_STYLE_FLOCK, ctx); + error2 = nfs_advlock_unlock(np, nofp, nlop, 0, UINT64_MAX, NFS_FILE_LOCK_STYLE_FLOCK, ctx); flocknflp = NULL; if (!error2) - error2 = nfs_mount_state_in_use_start(nmp); + error2 = nfs_mount_state_in_use_start(nmp, vfs_context_thread(ctx)); if (!error2) { inuse = 1; - error2 = nfs_open_state_set_busy(np, ctx); + error2 = nfs_open_state_set_busy(np, vfs_context_thread(ctx)); } if (error2) { error = error2; @@ -3461,12 +3750,18 @@ nfs4_setlock( } busy = 1; } - /* wait a little bit and send the request again */ - if (error == NFSERR_GRACE) - delay = 4; - if (delay < 4) - delay++; - tsleep(newnflp, slpflag, "nfs4_setlock_delay", delay * (hz/2)); + /* + * Wait a little bit and send the request again. + * Except for retries of blocked v2/v3 request where we've already waited a bit. 
+ */ + if ((nmp->nm_vers >= NFS_VER4) || (error == NFSERR_GRACE)) { + if (error == NFSERR_GRACE) + delay = 4; + if (delay < 4) + delay++; + tsleep(newnflp, slpflag, "nfs_advlock_setlock_delay", delay * (hz/2)); + slpflag = 0; + } error = nfs_sigintr(NFSTONMP(np), NULL, vfs_context_thread(ctx), 0); if (!error && (nmp->nm_state & NFSSTA_RECOVER)) { /* looks like we have a recover pending... restart */ @@ -3476,6 +3771,8 @@ nfs4_setlock( inuse = 0; goto restart; } + if (!error && (np->n_flag & NREVOKE)) + error = EIO; } while (!error); error_out: @@ -3545,7 +3842,7 @@ nfs4_setlock( /* We're replacing a range in the middle of a lock. */ /* The current lock will be split into two locks. */ /* Update locks and insert new lock after current lock. */ - nflp2->nfl_flags |= (nflp->nfl_flags & NFS_FILE_LOCK_STYLE_MASK); + nflp2->nfl_flags |= (nflp->nfl_flags & (NFS_FILE_LOCK_STYLE_MASK|NFS_FILE_LOCK_DELEGATED)); nflp2->nfl_type = nflp->nfl_type; nflp2->nfl_start = newnflp->nfl_end + 1; nflp2->nfl_end = nflp->nfl_end; @@ -3635,8 +3932,11 @@ nfs4_setlock( return (error); } +/* + * Release all (same style) locks within the given range. 
+ */ int -nfs4_unlock( +nfs_advlock_unlock( nfsnode_t np, struct nfs_open_file *nofp, struct nfs_lock_owner *nlop, @@ -3654,14 +3954,16 @@ nfs4_unlock( return (ENXIO); restart: - if ((error = nfs_mount_state_in_use_start(nmp))) + if ((error = nfs_mount_state_in_use_start(nmp, NULL))) return (error); if (nofp->nof_flags & NFS_OPEN_FILE_REOPEN) { nfs_mount_state_in_use_end(nmp, 0); - nfs4_reopen(nofp, vfs_context_thread(ctx)); + error = nfs4_reopen(nofp, NULL); + if (error) + return (error); goto restart; } - if ((error = nfs_open_state_set_busy(np, ctx))) { + if ((error = nfs_open_state_set_busy(np, NULL))) { nfs_mount_state_in_use_end(nmp, error); return (error); } @@ -3725,11 +4027,13 @@ nfs4_unlock( ((nflp->nfl_flags & NFS_FILE_LOCK_STYLE_MASK) == NFS_FILE_LOCK_STYLE_POSIX)) { uint64_t s = 0; int type = TAILQ_FIRST(&nlop->nlo_locks)->nfl_type; - while (nflp) { + int delegated = (TAILQ_FIRST(&nlop->nlo_locks)->nfl_flags & NFS_FILE_LOCK_DELEGATED); + while (!delegated && nflp) { if ((nflp->nfl_flags & NFS_FILE_LOCK_STYLE_MASK) == NFS_FILE_LOCK_STYLE_POSIX) { /* unlock the range preceding this lock */ lck_mtx_unlock(&np->n_openlock); - error = nfs4_unlock_rpc(np, nlop, type, s, nflp->nfl_start-1, ctx); + error = nmp->nm_funcs->nf_unlock_rpc(np, nlop, type, s, nflp->nfl_start-1, 0, + vfs_context_thread(ctx), vfs_context_ucred(ctx)); if (nfs_mount_state_error_should_restart(error)) { nfs_open_state_clear_busy(np); nfs_mount_state_in_use_end(nmp, error); @@ -3742,16 +4046,19 @@ nfs4_unlock( } nflp = TAILQ_NEXT(nflp, nfl_lolink); } - lck_mtx_unlock(&np->n_openlock); - error = nfs4_unlock_rpc(np, nlop, type, s, end, ctx); - if (nfs_mount_state_error_should_restart(error)) { - nfs_open_state_clear_busy(np); - nfs_mount_state_in_use_end(nmp, error); - goto restart; + if (!delegated) { + lck_mtx_unlock(&np->n_openlock); + error = nmp->nm_funcs->nf_unlock_rpc(np, nlop, type, s, end, 0, + vfs_context_thread(ctx), vfs_context_ucred(ctx)); + if 
(nfs_mount_state_error_should_restart(error)) { + nfs_open_state_clear_busy(np); + nfs_mount_state_in_use_end(nmp, error); + goto restart; + } + lck_mtx_lock(&np->n_openlock); + if (error) + goto out; } - lck_mtx_lock(&np->n_openlock); - if (error) - goto out; send_unlock_rpcs = 0; } @@ -3767,9 +4074,10 @@ nfs4_unlock( /* here's one to unlock */ if ((start <= nflp->nfl_start) && (end >= nflp->nfl_end)) { /* The entire lock is being unlocked. */ - if (send_unlock_rpcs) { + if (send_unlock_rpcs && !(nflp->nfl_flags & NFS_FILE_LOCK_DELEGATED)) { lck_mtx_unlock(&np->n_openlock); - error = nfs4_unlock_rpc(np, nlop, nflp->nfl_type, nflp->nfl_start, nflp->nfl_end, ctx); + error = nmp->nm_funcs->nf_unlock_rpc(np, nlop, nflp->nfl_type, nflp->nfl_start, nflp->nfl_end, 0, + vfs_context_thread(ctx), vfs_context_ucred(ctx)); if (nfs_mount_state_error_should_restart(error)) { nfs_open_state_clear_busy(np); nfs_mount_state_in_use_end(nmp, error); @@ -3788,9 +4096,10 @@ nfs4_unlock( } else if ((start > nflp->nfl_start) && (end < nflp->nfl_end)) { /* We're unlocking a range in the middle of a lock. */ /* The current lock will be split into two locks. 
*/ - if (send_unlock_rpcs) { + if (send_unlock_rpcs && !(nflp->nfl_flags & NFS_FILE_LOCK_DELEGATED)) { lck_mtx_unlock(&np->n_openlock); - error = nfs4_unlock_rpc(np, nlop, nflp->nfl_type, start, end, ctx); + error = nmp->nm_funcs->nf_unlock_rpc(np, nlop, nflp->nfl_type, start, end, 0, + vfs_context_thread(ctx), vfs_context_ucred(ctx)); if (nfs_mount_state_error_should_restart(error)) { nfs_open_state_clear_busy(np); nfs_mount_state_in_use_end(nmp, error); @@ -3801,7 +4110,7 @@ nfs4_unlock( if (error) break; /* update locks and insert new lock after current lock */ - newnflp->nfl_flags |= (nflp->nfl_flags & NFS_FILE_LOCK_STYLE_MASK); + newnflp->nfl_flags |= (nflp->nfl_flags & (NFS_FILE_LOCK_STYLE_MASK|NFS_FILE_LOCK_DELEGATED)); newnflp->nfl_type = nflp->nfl_type; newnflp->nfl_start = end + 1; newnflp->nfl_end = nflp->nfl_end; @@ -3812,9 +4121,10 @@ nfs4_unlock( newnflp = NULL; } else if (start > nflp->nfl_start) { /* We're unlocking the end of a lock. */ - if (send_unlock_rpcs) { + if (send_unlock_rpcs && !(nflp->nfl_flags & NFS_FILE_LOCK_DELEGATED)) { lck_mtx_unlock(&np->n_openlock); - error = nfs4_unlock_rpc(np, nlop, nflp->nfl_type, start, nflp->nfl_end, ctx); + error = nmp->nm_funcs->nf_unlock_rpc(np, nlop, nflp->nfl_type, start, nflp->nfl_end, 0, + vfs_context_thread(ctx), vfs_context_ucred(ctx)); if (nfs_mount_state_error_should_restart(error)) { nfs_open_state_clear_busy(np); nfs_mount_state_in_use_end(nmp, error); @@ -3828,9 +4138,10 @@ nfs4_unlock( nflp->nfl_end = start - 1; } else if (end < nflp->nfl_end) { /* We're unlocking the start of a lock. 
*/ - if (send_unlock_rpcs) { + if (send_unlock_rpcs && !(nflp->nfl_flags & NFS_FILE_LOCK_DELEGATED)) { lck_mtx_unlock(&np->n_openlock); - error = nfs4_unlock_rpc(np, nlop, nflp->nfl_type, nflp->nfl_start, end, ctx); + error = nmp->nm_funcs->nf_unlock_rpc(np, nlop, nflp->nfl_type, nflp->nfl_start, end, 0, + vfs_context_thread(ctx), vfs_context_ucred(ctx)); if (nfs_mount_state_error_should_restart(error)) { nfs_open_state_clear_busy(np); nfs_mount_state_in_use_end(nmp, error); @@ -3866,7 +4177,7 @@ nfs4_unlock( * NFSv4 advisory file locking */ int -nfs4_vnop_advlock( +nfs_vnop_advlock( struct vnop_advlock_args /* { struct vnodeop_desc *a_desc; vnode_t a_vp; @@ -3884,19 +4195,34 @@ nfs4_vnop_advlock( int flags = ap->a_flags; vfs_context_t ctx = ap->a_context; struct nfsmount *nmp; - struct nfs_vattr nvattr; struct nfs_open_owner *noop = NULL; struct nfs_open_file *nofp = NULL; struct nfs_lock_owner *nlop = NULL; off_t lstart; uint64_t start, end; int error = 0, modified, style; + enum vtype vtype; #define OFF_MAX QUAD_MAX nmp = VTONMP(ap->a_vp); if (!nmp) return (ENXIO); + lck_mtx_lock(&nmp->nm_lock); + if ((nmp->nm_vers <= NFS_VER3) && (nmp->nm_lockmode == NFS_LOCK_MODE_DISABLED)) { + lck_mtx_unlock(&nmp->nm_lock); + return (ENOTSUP); + } + lck_mtx_unlock(&nmp->nm_lock); + if (np->n_flag & NREVOKE) + return (EIO); + vtype = vnode_vtype(ap->a_vp); + if (vtype == VDIR) /* ignore lock requests on directories */ + return (0); + if (vtype != VREG) /* anything other than regular files is invalid */ + return (EINVAL); + + /* Convert the flock structure into a start and end. 
*/ switch (fl->l_whence) { case SEEK_SET: case SEEK_CUR: @@ -3915,7 +4241,7 @@ nfs4_vnop_advlock( nfs_node_unlock(np); if (modified && ((error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1)))) return (error); - if ((error = nfs_getattr(np, &nvattr, ctx, NGA_UNCACHED))) + if ((error = nfs_getattr(np, NULL, ctx, NGA_UNCACHED))) return (error); nfs_data_lock(np, NFS_DATA_LOCK_SHARED); if ((np->n_size > OFF_MAX) || @@ -3944,8 +4270,8 @@ nfs4_vnop_advlock( end = start - 1; start += fl->l_len; } - if (error) - return (error); + if ((nmp->nm_vers == NFS_VER2) && ((start > INT32_MAX) || (fl->l_len && (end > INT32_MAX)))) + return (EINVAL); style = (flags & F_FLOCK) ? NFS_FILE_LOCK_STYLE_FLOCK : NFS_FILE_LOCK_STYLE_POSIX; if ((style == NFS_FILE_LOCK_STYLE_FLOCK) && ((start != 0) || (end != UINT64_MAX))) @@ -3956,17 +4282,17 @@ nfs4_vnop_advlock( if (!nlop) { error = (op == F_UNLCK) ? 0 : ENOMEM; if (error) - printf("nfs4_vnop_advlock: no lock owner %d\n", error); + NP(np, "nfs_vnop_advlock: no lock owner, error %d", error); goto out; } if (op == F_GETLK) { - error = nfs4_getlock(np, nlop, fl, start, end, ctx); + error = nfs_advlock_getlock(np, nlop, fl, start, end, ctx); } else { /* find the open owner */ noop = nfs_open_owner_find(nmp, vfs_context_ucred(ctx), 0); if (!noop) { - printf("nfs4_vnop_advlock: no open owner\n"); + NP(np, "nfs_vnop_advlock: no open owner %d", kauth_cred_getuid(vfs_context_ucred(ctx))); error = EPERM; goto out; } @@ -3976,24 +4302,25 @@ nfs4_vnop_advlock( if (error) error = EBADF; if (!error && (nofp->nof_flags & NFS_OPEN_FILE_LOST)) { - printf("nfs_vnop_advlock: LOST\n"); + NP(np, "nfs_vnop_advlock: LOST %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); error = EIO; } if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { - nfs4_reopen(nofp, vfs_context_thread(ctx)); + error = nfs4_reopen(nofp, ((op == F_UNLCK) ? 
NULL : vfs_context_thread(ctx))); nofp = NULL; - goto restart; + if (!error) + goto restart; } if (error) { - printf("nfs4_vnop_advlock: no open file %d\n", error); + NP(np, "nfs_vnop_advlock: no open file %d, %d", error, kauth_cred_getuid(noop->noo_cred)); goto out; } if (op == F_UNLCK) { - error = nfs4_unlock(np, nofp, nlop, start, end, style, ctx); + error = nfs_advlock_unlock(np, nofp, nlop, start, end, style, ctx); } else if ((op == F_SETLK) || (op == F_SETLKW)) { if ((op == F_SETLK) && (flags & F_WAIT)) op = F_SETLKW; - error = nfs4_setlock(np, nofp, nlop, op, start, end, style, fl->l_type, ctx); + error = nfs_advlock_setlock(np, nofp, nlop, op, start, end, style, fl->l_type, ctx); } else { /* not getlk, unlock or lock? */ error = EINVAL; @@ -4012,7 +4339,7 @@ nfs4_vnop_advlock( * Check if an open owner holds any locks on a file. */ int -nfs4_check_for_locks(struct nfs_open_owner *noop, struct nfs_open_file *nofp) +nfs_check_for_locks(struct nfs_open_owner *noop, struct nfs_open_file *nofp) { struct nfs_lock_owner *nlop; @@ -4028,19 +4355,21 @@ nfs4_check_for_locks(struct nfs_open_owner *noop, struct nfs_open_file *nofp) /* * Reopen simple (no deny, no locks) open state that was lost. */ -void +int nfs4_reopen(struct nfs_open_file *nofp, thread_t thd) { struct nfs_open_owner *noop = nofp->nof_owner; struct nfsmount *nmp = NFSTONMP(nofp->nof_np); - vnode_t vp = NFSTOV(nofp->nof_np); + nfsnode_t np = nofp->nof_np; + vnode_t vp = NFSTOV(np); vnode_t dvp = NULL; struct componentname cn; const char *vname = NULL; + const char *name = NULL; size_t namelen; char smallname[128]; char *filename = NULL; - int error = 0, done = 0, slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0; + int error = 0, done = 0, slpflag = NMFLAG(nmp, INTR) ? 
PCATCH : 0; struct timespec ts = { 1, 0 }; lck_mtx_lock(&nofp->nof_lock); @@ -4048,38 +4377,67 @@ nfs4_reopen(struct nfs_open_file *nofp, thread_t thd) if ((error = nfs_sigintr(nmp, NULL, thd, 0))) break; msleep(&nofp->nof_flags, &nofp->nof_lock, slpflag|(PZERO-1), "nfsreopenwait", &ts); + slpflag = 0; } - if (!(nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { + if (error || !(nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { lck_mtx_unlock(&nofp->nof_lock); - return; + return (error); } nofp->nof_flags |= NFS_OPEN_FILE_REOPENING; lck_mtx_unlock(&nofp->nof_lock); - dvp = vnode_getparent(vp); - vname = vnode_getname(vp); - if (!dvp || !vname) { - error = EIO; - goto out; + nfs_node_lock_force(np); + if ((vnode_vtype(vp) != VDIR) && np->n_sillyrename) { + /* + * The node's been sillyrenamed, so we need to use + * the sillyrename directory/name to do the open. + */ + struct nfs_sillyrename *nsp = np->n_sillyrename; + dvp = NFSTOV(nsp->nsr_dnp); + if ((error = vnode_get(dvp))) { + nfs_node_unlock(np); + goto out; + } + name = nsp->nsr_name; + } else { + /* + * [sigh] We can't trust VFS to get the parent right for named + * attribute nodes. (It likes to reparent the nodes after we've + * created them.) Luckily we can probably get the right parent + * from the n_parent we have stashed away. 
+ */ + if ((np->n_vattr.nva_flags & NFS_FFLAG_IS_ATTR) && + (((dvp = np->n_parent)) && (error = vnode_get(dvp)))) + dvp = NULL; + if (!dvp) + dvp = vnode_getparent(vp); + vname = vnode_getname(vp); + if (!dvp || !vname) { + if (!error) + error = EIO; + nfs_node_unlock(np); + goto out; + } + name = vname; } filename = &smallname[0]; - namelen = snprintf(filename, sizeof(smallname), "%s", vname); + namelen = snprintf(filename, sizeof(smallname), "%s", name); if (namelen >= sizeof(smallname)) { - namelen++; /* snprintf result doesn't include '\0' */ - MALLOC(filename, char *, namelen, M_TEMP, M_WAITOK); + MALLOC(filename, char *, namelen+1, M_TEMP, M_WAITOK); if (!filename) { error = ENOMEM; goto out; } - snprintf(filename, namelen, "%s", vname); + snprintf(filename, namelen+1, "%s", name); } + nfs_node_unlock(np); bzero(&cn, sizeof(cn)); cn.cn_nameptr = filename; cn.cn_namelen = namelen; restart: done = 0; - if ((error = nfs_mount_state_in_use_start(nmp))) + if ((error = nfs_mount_state_in_use_start(nmp, thd))) goto out; if (nofp->nof_rw) @@ -4092,19 +4450,22 @@ nfs4_reopen(struct nfs_open_file *nofp, thread_t thd) if (nfs_mount_state_in_use_end(nmp, error)) { if (error == NFSERR_GRACE) goto restart; + printf("nfs4_reopen: RPC failed, error %d, lost %d, %s\n", error, + (nofp->nof_flags & NFS_OPEN_FILE_LOST) ? 1 : 0, name ? name : "???"); error = 0; goto out; } done = 1; out: + if (error && (error != EINTR) && (error != ERESTART)) + nfs_revoke_open_state_for_node(np); lck_mtx_lock(&nofp->nof_lock); nofp->nof_flags &= ~NFS_OPEN_FILE_REOPENING; - if (error) - nofp->nof_flags |= NFS_OPEN_FILE_LOST; if (done) nofp->nof_flags &= ~NFS_OPEN_FILE_REOPEN; - else - printf("nfs4_reopen: failed, error %d, lost %d\n", error, (nofp->nof_flags & NFS_OPEN_FILE_LOST) ? 1 : 0); + else if (error) + printf("nfs4_reopen: failed, error %d, lost %d, %s\n", error, + (nofp->nof_flags & NFS_OPEN_FILE_LOST) ? 1 : 0, name ? 
name : "???"); lck_mtx_unlock(&nofp->nof_lock); if (filename && (filename != &smallname[0])) FREE(filename, M_TEMP); @@ -4112,6 +4473,7 @@ nfs4_reopen(struct nfs_open_file *nofp, thread_t thd) vnode_putname(vname); if (dvp != NULLVP) vnode_put(dvp); + return (error); } /* @@ -4147,13 +4509,73 @@ nfs4_open_reopen_rpc( int share_access, int share_deny) { - return (nfs4_open_rpc_internal(nofp, NULL, thd, cred, cnp, NULL, dvp, vpp, 0, share_access, share_deny)); + return (nfs4_open_rpc_internal(nofp, NULL, thd, cred, cnp, NULL, dvp, vpp, NFS_OPEN_NOCREATE, share_access, share_deny)); +} + +/* + * Send an OPEN_CONFIRM RPC to confirm an OPEN. + */ +int +nfs4_open_confirm_rpc( + struct nfsmount *nmp, + nfsnode_t dnp, + u_char *fhp, + int fhlen, + struct nfs_open_owner *noop, + nfs_stateid *sid, + thread_t thd, + kauth_cred_t cred, + struct nfs_vattr *nvap, + uint64_t *xidp) +{ + struct nfsm_chain nmreq, nmrep; + int error = 0, status, numops; + struct nfsreq_secinfo_args si; + + NFSREQ_SECINFO_SET(&si, dnp, NULL, 0, NULL, 0); + nfsm_chain_null(&nmreq); + nfsm_chain_null(&nmrep); + + // PUTFH, OPEN_CONFIRM, GETATTR + numops = 3; + nfsm_chain_build_alloc_init(error, &nmreq, 23 * NFSX_UNSIGNED); + nfsm_chain_add_compound_header(error, &nmreq, "open_confirm", numops); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, nmp->nm_vers, fhp, fhlen); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_OPEN_CONFIRM); + nfsm_chain_add_stateid(error, &nmreq, sid); + nfsm_chain_add_32(error, &nmreq, noop->noo_seqid); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, dnp); + nfsm_chain_build_done(error, &nmreq); + nfsm_assert(error, (numops == 0), EPROTO); + nfsmout_if(error); + error = nfs_request2(dnp, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, &si, R_NOINTR, &nmrep, xidp, &status); + + nfsm_chain_skip_tag(error, &nmrep); + nfsm_chain_get_32(error, 
&nmrep, numops); + nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); + nfsmout_if(error); + nfsm_chain_op_check(error, &nmrep, NFS_OP_OPEN_CONFIRM); + nfs_owner_seqid_increment(noop, NULL, error); + nfsm_chain_get_stateid(error, &nmrep, sid); + nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); + nfsmout_if(error); + error = nfs4_parsefattr(&nmrep, NULL, nvap, NULL, NULL, NULL); +nfsmout: + nfsm_chain_cleanup(&nmreq); + nfsm_chain_cleanup(&nmrep); + return (error); } /* * common OPEN RPC code * * If create is set, ctx must be passed in. + * Returns a node on success if no node passed in. */ int nfs4_open_rpc_internal( @@ -4171,20 +4593,24 @@ nfs4_open_rpc_internal( { struct nfsmount *nmp; struct nfs_open_owner *noop = nofp->nof_owner; - struct nfs_vattr nvattr, dnvattr; + struct nfs_vattr nvattr; int error = 0, open_error = EIO, lockerror = ENOENT, busyerror = ENOENT, status; - int nfsvers, numops, exclusive = 0, gotuid, gotgid; + int nfsvers, namedattrs, numops, exclusive = 0, gotuid, gotgid; u_int64_t xid, savedxid = 0; nfsnode_t dnp = VTONFS(dvp); nfsnode_t np, newnp = NULL; vnode_t newvp = NULL; struct nfsm_chain nmreq, nmrep; uint32_t bitmap[NFS_ATTR_BITMAP_LEN], bmlen; - uint32_t rflags, delegation = 0, recall = 0, val; + uint32_t rflags, delegation, recall; struct nfs_stateid stateid, dstateid, *sid; fhandle_t fh; - struct nfsreq *req = NULL; + struct nfsreq rq, *req = &rq; struct nfs_dulookup dul; + char sbuf[64], *s; + uint32_t ace_type, ace_flags, ace_mask, len, slen; + struct kauth_ace ace; + struct nfsreq_secinfo_args si; if (create && !ctx) return (EINVAL); @@ -4193,6 +4619,9 @@ nfs4_open_rpc_internal( if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; + namedattrs = (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR); + if (dnp->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); np = *vpp ? 
VTONFS(*vpp) : NULL; if (create && vap) { @@ -4200,6 +4629,8 @@ nfs4_open_rpc_internal( nfs_avoid_needless_id_setting_on_create(dnp, vap, ctx); gotuid = VATTR_IS_ACTIVE(vap, va_uid); gotgid = VATTR_IS_ACTIVE(vap, va_gid); + if (exclusive && (!VATTR_IS_ACTIVE(vap, va_access_time) || !VATTR_IS_ACTIVE(vap, va_modify_time))) + vap->va_vaflags |= VA_UTIMES_NULL; } else { exclusive = gotuid = gotgid = 0; } @@ -4213,7 +4644,12 @@ nfs4_open_rpc_internal( if ((error = nfs_open_owner_set_busy(noop, thd))) return (error); again: - rflags = 0; + rflags = delegation = recall = 0; + ace.ace_flags = 0; + s = sbuf; + slen = sizeof(sbuf); + NVATTR_INIT(&nvattr); + NFSREQ_SECINFO_SET(&si, dnp, NULL, 0, cnp->cn_nameptr, cnp->cn_namelen); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -4232,13 +4668,9 @@ nfs4_open_rpc_internal( nfsm_chain_add_32(error, &nmreq, noop->noo_seqid); nfsm_chain_add_32(error, &nmreq, share_access); nfsm_chain_add_32(error, &nmreq, share_deny); - - // open owner: clientid + uid - nfsm_chain_add_64(error, &nmreq, nmp->nm_clientid); // open_owner4.clientid + nfsm_chain_add_64(error, &nmreq, nmp->nm_clientid); nfsm_chain_add_32(error, &nmreq, NFSX_UNSIGNED); - nfsm_chain_add_32(error, &nmreq, kauth_cred_getuid(noop->noo_cred)); // open_owner4.owner - - // openflag4 + nfsm_chain_add_32(error, &nmreq, kauth_cred_getuid(noop->noo_cred)); nfsm_chain_add_32(error, &nmreq, create); if (create) { if (exclusive) { @@ -4253,40 +4685,36 @@ nfs4_open_rpc_internal( nfsm_chain_add_fattr4(error, &nmreq, vap, nmp); } } - - // open_claim4 nfsm_chain_add_32(error, &nmreq, NFS_CLAIM_NULL); - nfsm_chain_add_string(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen); + nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); NFS_COPY_ATTRIBUTES(nfs_getattr_bitmap, bitmap); NFS_BITMAP_SET(bitmap, NFS_FATTR_FILEHANDLE); - nfsm_chain_add_bitmap_masked(error, &nmreq, bitmap, - NFS_ATTR_BITMAP_LEN, 
nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, bitmap, nmp, np); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_RESTOREFH); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, dnp); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); if (!error) error = busyerror = nfs_node_set_busy(dnp, thd); nfsmout_if(error); - if (create) + if (create && !namedattrs) nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx); - error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, NULL, &req); + error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, &si, R_NOINTR, NULL, &req); if (!error) { - if (create) + if (create && !namedattrs) nfs_dulookup_start(&dul, dnp, ctx); error = nfs_request_async_finish(req, &nmrep, &xid, &status); savedxid = xid; } - if (create) + if (create && !namedattrs) nfs_dulookup_finish(&dul, dnp, ctx); if ((lockerror = nfs_node_lock(dnp))) @@ -4309,51 +4737,69 @@ nfs4_open_rpc_internal( case NFS_OPEN_DELEGATE_NONE: break; case NFS_OPEN_DELEGATE_READ: - nfsm_chain_get_stateid(error, &nmrep, &dstateid); - nfsm_chain_get_32(error, &nmrep, recall); - // ACE: (skip) XXX - nfsm_chain_adv(error, &nmrep, 3 * NFSX_UNSIGNED); - nfsm_chain_get_32(error, &nmrep, val); /* string length */ - nfsm_chain_adv(error, &nmrep, nfsm_rndup(val)); - break; case NFS_OPEN_DELEGATE_WRITE: nfsm_chain_get_stateid(error, &nmrep, &dstateid); nfsm_chain_get_32(error, &nmrep, recall); - // space (skip) XXX - nfsm_chain_adv(error, &nmrep, 3 * NFSX_UNSIGNED); - // ACE: (skip) XXX - nfsm_chain_adv(error, &nmrep, 3 * NFSX_UNSIGNED); - nfsm_chain_get_32(error, &nmrep, val); /* string length */ - nfsm_chain_adv(error, &nmrep, nfsm_rndup(val)); + if (delegation == 
NFS_OPEN_DELEGATE_WRITE) // space (skip) XXX + nfsm_chain_adv(error, &nmrep, 3 * NFSX_UNSIGNED); + /* if we have any trouble accepting the ACE, just invalidate it */ + ace_type = ace_flags = ace_mask = len = 0; + nfsm_chain_get_32(error, &nmrep, ace_type); + nfsm_chain_get_32(error, &nmrep, ace_flags); + nfsm_chain_get_32(error, &nmrep, ace_mask); + nfsm_chain_get_32(error, &nmrep, len); + ace.ace_flags = nfs4_ace_nfstype_to_vfstype(ace_type, &error); + ace.ace_flags |= nfs4_ace_nfsflags_to_vfsflags(ace_flags); + ace.ace_rights = nfs4_ace_nfsmask_to_vfsrights(ace_mask); + if (!error && (len >= slen)) { + MALLOC(s, char*, len+1, M_TEMP, M_WAITOK); + if (s) + slen = len+1; + else + ace.ace_flags = 0; + } + if (s) + nfsm_chain_get_opaque(error, &nmrep, len, s); + else + nfsm_chain_adv(error, &nmrep, nfsm_rndup(len)); + if (!error && s) { + s[len] = '\0'; + if (nfs4_id2guid(s, &ace.ace_applicable, (ace_flags & NFS_ACE_IDENTIFIER_GROUP))) + ace.ace_flags = 0; + } + if (error || !s) + ace.ace_flags = 0; + if (s && (s != sbuf)) + FREE(s, M_TEMP); break; default: error = EBADRPC; break; } /* At this point if we have no error, the object was created/opened. */ - /* if we don't get attributes, then we should lookitup. */ open_error = error; nfsmout_if(error); - if (create && !exclusive) + if (create && vap && !exclusive) nfs_vattr_set_supported(bitmap, vap); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); nfsmout_if(error); - NFS_CLEAR_ATTRIBUTES(nvattr.nva_bitmap); - error = nfs4_parsefattr(&nmrep, NULL, &nvattr, &fh, NULL); + error = nfs4_parsefattr(&nmrep, NULL, &nvattr, &fh, NULL, NULL); nfsmout_if(error); if (!NFS_BITMAP_ISSET(nvattr.nva_bitmap, NFS_FATTR_FILEHANDLE)) { - printf("nfs: open/create didn't return filehandle?\n"); + printf("nfs: open/create didn't return filehandle? 
%s\n", cnp->cn_nameptr); error = EBADRPC; goto nfsmout; } if (!create && np && !NFS_CMPFH(np, fh.fh_data, fh.fh_len)) { // XXX for the open case, what if fh doesn't match the vnode we think we're opening? - printf("nfs4_open_rpc: warning: file handle mismatch\n"); + // Solaris Named Attributes may do this due to a bug.... so don't warn for named attributes. + if (!(np->n_vattr.nva_flags & NFS_FFLAG_IS_ATTR)) + NP(np, "nfs4_open_rpc: warning: file handle mismatch"); } /* directory attributes: if we don't get them, make sure to invalidate */ nfsm_chain_op_check(error, &nmrep, NFS_OP_RESTOREFH); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsm_chain_loadattr(error, &nmrep, dnp, nfsvers, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, dnp, nfsvers, &xid); if (error) NATTRINVALIDATE(dnp); nfsmout_if(error); @@ -4364,39 +4810,8 @@ nfs4_open_rpc_internal( if (rflags & NFS_OPEN_RESULT_CONFIRM) { nfs_node_unlock(dnp); lockerror = ENOENT; - nfsm_chain_cleanup(&nmreq); - nfsm_chain_cleanup(&nmrep); - // PUTFH, OPEN_CONFIRM, GETATTR - numops = 3; - nfsm_chain_build_alloc_init(error, &nmreq, 23 * NFSX_UNSIGNED); - nfsm_chain_add_compound_header(error, &nmreq, "open_confirm", numops); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); - nfsm_chain_add_fh(error, &nmreq, nfsvers, fh.fh_data, fh.fh_len); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_OPEN_CONFIRM); - nfsm_chain_add_stateid(error, &nmreq, sid); - nfsm_chain_add_32(error, &nmreq, noop->noo_seqid); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); - nfsm_chain_build_done(error, &nmreq); - nfsm_assert(error, (numops == 0), EPROTO); - nfsmout_if(error); - error = nfs_request2(dnp, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, 0, &nmrep, &xid, &status); - - nfsm_chain_skip_tag(error, &nmrep); - nfsm_chain_get_32(error, &nmrep, numops); - 
nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); - nfsmout_if(error); - nfsm_chain_op_check(error, &nmrep, NFS_OP_OPEN_CONFIRM); - nfs_owner_seqid_increment(noop, NULL, error); - nfsm_chain_get_stateid(error, &nmrep, sid); - nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsmout_if(error); - NFS_CLEAR_ATTRIBUTES(nvattr.nva_bitmap); - error = nfs4_parsefattr(&nmrep, NULL, &nvattr, NULL, NULL); + NVATTR_CLEANUP(&nvattr); + error = nfs4_open_confirm_rpc(nmp, dnp, fh.fh_data, fh.fh_len, noop, sid, thd, cred, &nvattr, &xid); nfsmout_if(error); savedxid = xid; if ((lockerror = nfs_node_lock(dnp))) @@ -4415,17 +4830,18 @@ nfs4_open_rpc_internal( dnp->n_flag |= NMODIFIED; nfs_node_unlock(dnp); lockerror = ENOENT; - nfs_getattr(dnp, &dnvattr, ctx, NGA_CACHED); + nfs_getattr(dnp, NULL, ctx, NGA_CACHED); } if (!lockerror) nfs_node_unlock(dnp); - if (!error && create && fh.fh_len) { + if (!error && !np && fh.fh_len) { /* create the vnode with the filehandle and attributes */ xid = savedxid; - error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, NG_MAKEENTRY, &newnp); + error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, rq.r_auth, NG_MAKEENTRY, &newnp); if (!error) newvp = NFSTOV(newnp); } + NVATTR_CLEANUP(&nvattr); if (!busyerror) nfs_node_clear_busy(dnp); if ((delegation == NFS_OPEN_DELEGATE_READ) || (delegation == NFS_OPEN_DELEGATE_WRITE)) { @@ -4437,15 +4853,39 @@ nfs4_open_rpc_internal( np->n_openflags &= ~N_DELEG_MASK; np->n_openflags |= ((delegation == NFS_OPEN_DELEGATE_READ) ? 
N_DELEG_READ : N_DELEG_WRITE); np->n_dstateid = dstateid; + np->n_dace = ace; + if (np->n_dlink.tqe_next == NFSNOLIST) { + lck_mtx_lock(&nmp->nm_lock); + if (np->n_dlink.tqe_next == NFSNOLIST) + TAILQ_INSERT_TAIL(&nmp->nm_delegations, np, n_dlink); + lck_mtx_unlock(&nmp->nm_lock); + } lck_mtx_unlock(&np->n_openlock); - } - if (recall) { - nfs4_delegreturn_rpc(nmp, fh.fh_data, fh.fh_len, &dstateid, thd, cred); + } else { + /* give the delegation back */ if (np) { - lck_mtx_lock(&np->n_openlock); - np->n_openflags &= ~N_DELEG_MASK; - lck_mtx_unlock(&np->n_openlock); + if (NFS_CMPFH(np, fh.fh_data, fh.fh_len)) { + /* update delegation state and return it */ + lck_mtx_lock(&np->n_openlock); + np->n_openflags &= ~N_DELEG_MASK; + np->n_openflags |= ((delegation == NFS_OPEN_DELEGATE_READ) ? N_DELEG_READ : N_DELEG_WRITE); + np->n_dstateid = dstateid; + np->n_dace = ace; + if (np->n_dlink.tqe_next == NFSNOLIST) { + lck_mtx_lock(&nmp->nm_lock); + if (np->n_dlink.tqe_next == NFSNOLIST) + TAILQ_INSERT_TAIL(&nmp->nm_delegations, np, n_dlink); + lck_mtx_unlock(&nmp->nm_lock); + } + lck_mtx_unlock(&np->n_openlock); + /* don't need to send a separate delegreturn for fh */ + fh.fh_len = 0; + } + /* return np's current delegation */ + nfs4_delegation_return(np, 0, thd, cred); } + if (fh.fh_len) /* return fh's delegation if it wasn't for np */ + nfs4_delegreturn_rpc(nmp, fh.fh_data, fh.fh_len, &dstateid, 0, thd, cred); } } if (error) { @@ -4478,6 +4918,266 @@ nfs4_open_rpc_internal( return (error); } + +/* + * Send an OPEN RPC to claim a delegated open for a file + */ +int +nfs4_claim_delegated_open_rpc( + struct nfs_open_file *nofp, + int share_access, + int share_deny, + int flags) +{ + struct nfsmount *nmp; + struct nfs_open_owner *noop = nofp->nof_owner; + struct nfs_vattr nvattr; + int error = 0, lockerror = ENOENT, status; + int nfsvers, numops; + u_int64_t xid; + nfsnode_t np = nofp->nof_np; + struct nfsm_chain nmreq, nmrep; + uint32_t bitmap[NFS_ATTR_BITMAP_LEN], bmlen; + 
uint32_t rflags = 0, delegation, recall = 0; + fhandle_t fh; + struct nfs_stateid dstateid; + char sbuf[64], *s = sbuf; + uint32_t ace_type, ace_flags, ace_mask, len, slen = sizeof(sbuf); + struct kauth_ace ace; + vnode_t dvp = NULL; + const char *vname = NULL; + const char *name = NULL; + size_t namelen; + char smallname[128]; + char *filename = NULL; + struct nfsreq_secinfo_args si; + + nmp = NFSTONMP(np); + if (!nmp) + return (ENXIO); + nfsvers = nmp->nm_vers; + + nfs_node_lock_force(np); + if ((vnode_vtype(NFSTOV(np)) != VDIR) && np->n_sillyrename) { + /* + * The node's been sillyrenamed, so we need to use + * the sillyrename directory/name to do the open. + */ + struct nfs_sillyrename *nsp = np->n_sillyrename; + dvp = NFSTOV(nsp->nsr_dnp); + if ((error = vnode_get(dvp))) { + nfs_node_unlock(np); + goto out; + } + name = nsp->nsr_name; + } else { + /* + * [sigh] We can't trust VFS to get the parent right for named + * attribute nodes. (It likes to reparent the nodes after we've + * created them.) Luckily we can probably get the right parent + * from the n_parent we have stashed away. 
+ */ + if ((np->n_vattr.nva_flags & NFS_FFLAG_IS_ATTR) && + (((dvp = np->n_parent)) && (error = vnode_get(dvp)))) + dvp = NULL; + if (!dvp) + dvp = vnode_getparent(NFSTOV(np)); + vname = vnode_getname(NFSTOV(np)); + if (!dvp || !vname) { + if (!error) + error = EIO; + nfs_node_unlock(np); + goto out; + } + name = vname; + } + filename = &smallname[0]; + namelen = snprintf(filename, sizeof(smallname), "%s", name); + if (namelen >= sizeof(smallname)) { + MALLOC(filename, char *, namelen+1, M_TEMP, M_WAITOK); + if (!filename) { + error = ENOMEM; + goto out; + } + snprintf(filename, namelen+1, "%s", name); + } + nfs_node_unlock(np); + + if ((error = nfs_open_owner_set_busy(noop, NULL))) + return (error); + + NVATTR_INIT(&nvattr); + delegation = NFS_OPEN_DELEGATE_NONE; + dstateid = np->n_dstateid; + NFSREQ_SECINFO_SET(&si, VTONFS(dvp), NULL, 0, filename, namelen); + + nfsm_chain_null(&nmreq); + nfsm_chain_null(&nmrep); + + // PUTFH, OPEN, GETATTR(FH) + numops = 3; + nfsm_chain_build_alloc_init(error, &nmreq, 48 * NFSX_UNSIGNED); + nfsm_chain_add_compound_header(error, &nmreq, "open_claim_d", numops); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, nfsvers, VTONFS(dvp)->n_fhp, VTONFS(dvp)->n_fhsize); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_OPEN); + nfsm_chain_add_32(error, &nmreq, noop->noo_seqid); + nfsm_chain_add_32(error, &nmreq, share_access); + nfsm_chain_add_32(error, &nmreq, share_deny); + // open owner: clientid + uid + nfsm_chain_add_64(error, &nmreq, nmp->nm_clientid); // open_owner4.clientid + nfsm_chain_add_32(error, &nmreq, NFSX_UNSIGNED); + nfsm_chain_add_32(error, &nmreq, kauth_cred_getuid(noop->noo_cred)); // open_owner4.owner + // openflag4 + nfsm_chain_add_32(error, &nmreq, NFS_OPEN_NOCREATE); + // open_claim4 + nfsm_chain_add_32(error, &nmreq, NFS_CLAIM_DELEGATE_CUR); + nfsm_chain_add_stateid(error, &nmreq, &np->n_dstateid); + nfsm_chain_add_name(error, &nmreq, filename, namelen, nmp); + 
numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); + NFS_COPY_ATTRIBUTES(nfs_getattr_bitmap, bitmap); + NFS_BITMAP_SET(bitmap, NFS_FATTR_FILEHANDLE); + nfsm_chain_add_bitmap_supported(error, &nmreq, bitmap, nmp, np); + nfsm_chain_build_done(error, &nmreq); + nfsm_assert(error, (numops == 0), EPROTO); + nfsmout_if(error); + + error = nfs_request2(np, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, current_thread(), + noop->noo_cred, &si, flags|R_NOINTR, &nmrep, &xid, &status); + + if ((lockerror = nfs_node_lock(np))) + error = lockerror; + nfsm_chain_skip_tag(error, &nmrep); + nfsm_chain_get_32(error, &nmrep, numops); + nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); + nfsmout_if(error); + nfsm_chain_op_check(error, &nmrep, NFS_OP_OPEN); + nfs_owner_seqid_increment(noop, NULL, error); + nfsm_chain_get_stateid(error, &nmrep, &nofp->nof_stateid); + nfsm_chain_check_change_info(error, &nmrep, np); + nfsm_chain_get_32(error, &nmrep, rflags); + bmlen = NFS_ATTR_BITMAP_LEN; + nfsm_chain_get_bitmap(error, &nmrep, bitmap, bmlen); + nfsm_chain_get_32(error, &nmrep, delegation); + if (!error) + switch (delegation) { + case NFS_OPEN_DELEGATE_NONE: + // if (!(np->n_openflags & N_DELEG_RETURN)) /* don't warn if delegation is being returned */ + // printf("nfs: open delegated claim didn't return a delegation %s\n", filename ? filename : "???"); + break; + case NFS_OPEN_DELEGATE_READ: + case NFS_OPEN_DELEGATE_WRITE: + if ((((np->n_openflags & N_DELEG_MASK) == N_DELEG_READ) && + (delegation == NFS_OPEN_DELEGATE_WRITE)) || + (((np->n_openflags & N_DELEG_MASK) == N_DELEG_WRITE) && + (delegation == NFS_OPEN_DELEGATE_READ))) + printf("nfs: open delegated claim returned a different delegation type! have %s got %s %s\n", + ((np->n_openflags & N_DELEG_MASK) == N_DELEG_WRITE) ? "W" : "R", + (delegation == NFS_OPEN_DELEGATE_WRITE) ? "W" : "R", filename ? 
filename : "???"); + nfsm_chain_get_stateid(error, &nmrep, &dstateid); + nfsm_chain_get_32(error, &nmrep, recall); + if (delegation == NFS_OPEN_DELEGATE_WRITE) // space (skip) XXX + nfsm_chain_adv(error, &nmrep, 3 * NFSX_UNSIGNED); + /* if we have any trouble accepting the ACE, just invalidate it */ + ace_type = ace_flags = ace_mask = len = 0; + nfsm_chain_get_32(error, &nmrep, ace_type); + nfsm_chain_get_32(error, &nmrep, ace_flags); + nfsm_chain_get_32(error, &nmrep, ace_mask); + nfsm_chain_get_32(error, &nmrep, len); + ace.ace_flags = nfs4_ace_nfstype_to_vfstype(ace_type, &error); + ace.ace_flags |= nfs4_ace_nfsflags_to_vfsflags(ace_flags); + ace.ace_rights = nfs4_ace_nfsmask_to_vfsrights(ace_mask); + if (!error && (len >= slen)) { + MALLOC(s, char*, len+1, M_TEMP, M_WAITOK); + if (s) + slen = len+1; + else + ace.ace_flags = 0; + } + if (s) + nfsm_chain_get_opaque(error, &nmrep, len, s); + else + nfsm_chain_adv(error, &nmrep, nfsm_rndup(len)); + if (!error && s) { + s[len] = '\0'; + if (nfs4_id2guid(s, &ace.ace_applicable, (ace_flags & NFS_ACE_IDENTIFIER_GROUP))) + ace.ace_flags = 0; + } + if (error || !s) + ace.ace_flags = 0; + if (s && (s != sbuf)) + FREE(s, M_TEMP); + if (!error) { + /* stuff the latest delegation state in the node */ + lck_mtx_lock(&np->n_openlock); + np->n_openflags &= ~N_DELEG_MASK; + np->n_openflags |= ((delegation == NFS_OPEN_DELEGATE_READ) ? 
N_DELEG_READ : N_DELEG_WRITE); + np->n_dstateid = dstateid; + np->n_dace = ace; + if (np->n_dlink.tqe_next == NFSNOLIST) { + lck_mtx_lock(&nmp->nm_lock); + if (np->n_dlink.tqe_next == NFSNOLIST) + TAILQ_INSERT_TAIL(&nmp->nm_delegations, np, n_dlink); + lck_mtx_unlock(&nmp->nm_lock); + } + lck_mtx_unlock(&np->n_openlock); + } + break; + default: + error = EBADRPC; + break; + } + nfsmout_if(error); + nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); + error = nfs4_parsefattr(&nmrep, NULL, &nvattr, &fh, NULL, NULL); + nfsmout_if(error); + if (!NFS_BITMAP_ISSET(nvattr.nva_bitmap, NFS_FATTR_FILEHANDLE)) { + printf("nfs: open reclaim didn't return filehandle? %s\n", filename ? filename : "???"); + error = EBADRPC; + goto nfsmout; + } + if (!NFS_CMPFH(np, fh.fh_data, fh.fh_len)) { + // XXX what if fh doesn't match the vnode we think we're re-opening? + // Solaris Named Attributes may do this due to a bug.... so don't warn for named attributes. + if (!(np->n_vattr.nva_flags & NFS_FFLAG_IS_ATTR)) + printf("nfs4_claim_delegated_open_rpc: warning: file handle mismatch %s\n", filename ? filename : "???"); + } + error = nfs_loadattrcache(np, &nvattr, &xid, 1); + nfsmout_if(error); + if (rflags & NFS_OPEN_RESULT_LOCKTYPE_POSIX) + nofp->nof_flags |= NFS_OPEN_FILE_POSIXLOCK; +nfsmout: + NVATTR_CLEANUP(&nvattr); + nfsm_chain_cleanup(&nmreq); + nfsm_chain_cleanup(&nmrep); + if (!lockerror) + nfs_node_unlock(np); + nfs_open_owner_clear_busy(noop); + if ((delegation == NFS_OPEN_DELEGATE_READ) || (delegation == NFS_OPEN_DELEGATE_WRITE)) { + if (recall) { + /* + * We're making a delegated claim. + * Don't return the delegation here in case we have more to claim. + * Just make sure it's queued up to be returned. + */ + nfs4_delegation_return_enqueue(np); + } + } +out: + // if (!error) + // printf("nfs: open claim delegated (%d, %d) succeeded for %s\n", share_access, share_deny, filename ? 
filename : "???"); + if (filename && (filename != &smallname[0])) + FREE(filename, M_TEMP); + if (vname) + vnode_putname(vname); + if (dvp != NULLVP) + vnode_put(dvp); + return (error); +} + /* * Send an OPEN RPC to reclaim an open file. */ @@ -4496,19 +5196,26 @@ nfs4_open_reclaim_rpc( nfsnode_t np = nofp->nof_np; struct nfsm_chain nmreq, nmrep; uint32_t bitmap[NFS_ATTR_BITMAP_LEN], bmlen; - uint32_t rflags = 0, delegation, recall = 0, val; + uint32_t rflags = 0, delegation, recall = 0; fhandle_t fh; struct nfs_stateid dstateid; + char sbuf[64], *s = sbuf; + uint32_t ace_type, ace_flags, ace_mask, len, slen = sizeof(sbuf); + struct kauth_ace ace; + struct nfsreq_secinfo_args si; nmp = NFSTONMP(np); if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; - if ((error = nfs_open_owner_set_busy(noop, current_thread()))) + if ((error = nfs_open_owner_set_busy(noop, NULL))) return (error); + NVATTR_INIT(&nvattr); delegation = NFS_OPEN_DELEGATE_NONE; + dstateid = np->n_dstateid; + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -4542,13 +5249,13 @@ nfs4_open_reclaim_rpc( nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); NFS_COPY_ATTRIBUTES(nfs_getattr_bitmap, bitmap); NFS_BITMAP_SET(bitmap, NFS_FATTR_FILEHANDLE); - nfsm_chain_add_bitmap_masked(error, &nmreq, bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, bitmap, nmp, np); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request2(np, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, current_thread(), noop->noo_cred, R_RECOVER, &nmrep, &xid, &status); + error = nfs_request2(np, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, current_thread(), + noop->noo_cred, &si, R_RECOVER|R_NOINTR, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; @@ -4567,38 +5274,66 @@ nfs4_open_reclaim_rpc( if (!error) switch (delegation) { case 
NFS_OPEN_DELEGATE_NONE: - break; - case NFS_OPEN_DELEGATE_READ: - nfsm_chain_get_stateid(error, &nmrep, &dstateid); - nfsm_chain_get_32(error, &nmrep, recall); - // ACE: (skip) XXX - nfsm_chain_adv(error, &nmrep, 3 * NFSX_UNSIGNED); - nfsm_chain_get_32(error, &nmrep, val); /* string length */ - nfsm_chain_adv(error, &nmrep, nfsm_rndup(val)); - if (!error) { - /* stuff the delegation state in the node */ - lck_mtx_lock(&np->n_openlock); - np->n_openflags &= ~N_DELEG_MASK; - np->n_openflags |= N_DELEG_READ; - np->n_dstateid = dstateid; - lck_mtx_unlock(&np->n_openlock); + if (np->n_openflags & N_DELEG_MASK) { + /* + * Hey! We were supposed to get our delegation back even + * if it was getting immediately recalled. Bad server! + * + * Just try to return the existing delegation. + */ + // NP(np, "nfs: open reclaim didn't return delegation?"); + delegation = (np->n_openflags & N_DELEG_WRITE) ? NFS_OPEN_DELEGATE_WRITE : NFS_OPEN_DELEGATE_READ; + recall = 1; } break; + case NFS_OPEN_DELEGATE_READ: case NFS_OPEN_DELEGATE_WRITE: nfsm_chain_get_stateid(error, &nmrep, &dstateid); nfsm_chain_get_32(error, &nmrep, recall); - // space (skip) XXX - nfsm_chain_adv(error, &nmrep, 3 * NFSX_UNSIGNED); - // ACE: (skip) XXX - nfsm_chain_adv(error, &nmrep, 3 * NFSX_UNSIGNED); - nfsm_chain_get_32(error, &nmrep, val); /* string length */ - nfsm_chain_adv(error, &nmrep, nfsm_rndup(val)); + if (delegation == NFS_OPEN_DELEGATE_WRITE) // space (skip) XXX + nfsm_chain_adv(error, &nmrep, 3 * NFSX_UNSIGNED); + /* if we have any trouble accepting the ACE, just invalidate it */ + ace_type = ace_flags = ace_mask = len = 0; + nfsm_chain_get_32(error, &nmrep, ace_type); + nfsm_chain_get_32(error, &nmrep, ace_flags); + nfsm_chain_get_32(error, &nmrep, ace_mask); + nfsm_chain_get_32(error, &nmrep, len); + ace.ace_flags = nfs4_ace_nfstype_to_vfstype(ace_type, &error); + ace.ace_flags |= nfs4_ace_nfsflags_to_vfsflags(ace_flags); + ace.ace_rights = nfs4_ace_nfsmask_to_vfsrights(ace_mask); + if (!error && 
(len >= slen)) { + MALLOC(s, char*, len+1, M_TEMP, M_WAITOK); + if (s) + slen = len+1; + else + ace.ace_flags = 0; + } + if (s) + nfsm_chain_get_opaque(error, &nmrep, len, s); + else + nfsm_chain_adv(error, &nmrep, nfsm_rndup(len)); + if (!error && s) { + s[len] = '\0'; + if (nfs4_id2guid(s, &ace.ace_applicable, (ace_flags & NFS_ACE_IDENTIFIER_GROUP))) + ace.ace_flags = 0; + } + if (error || !s) + ace.ace_flags = 0; + if (s && (s != sbuf)) + FREE(s, M_TEMP); if (!error) { /* stuff the delegation state in the node */ lck_mtx_lock(&np->n_openlock); np->n_openflags &= ~N_DELEG_MASK; - np->n_openflags |= N_DELEG_WRITE; + np->n_openflags |= ((delegation == NFS_OPEN_DELEGATE_READ) ? N_DELEG_READ : N_DELEG_WRITE); np->n_dstateid = dstateid; + np->n_dace = ace; + if (np->n_dlink.tqe_next == NFSNOLIST) { + lck_mtx_lock(&nmp->nm_lock); + if (np->n_dlink.tqe_next == NFSNOLIST) + TAILQ_INSERT_TAIL(&nmp->nm_delegations, np, n_dlink); + lck_mtx_unlock(&nmp->nm_lock); + } lck_mtx_unlock(&np->n_openlock); } break; @@ -4608,35 +5343,37 @@ nfs4_open_reclaim_rpc( } nfsmout_if(error); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - NFS_CLEAR_ATTRIBUTES(nvattr.nva_bitmap); - error = nfs4_parsefattr(&nmrep, NULL, &nvattr, &fh, NULL); + error = nfs4_parsefattr(&nmrep, NULL, &nvattr, &fh, NULL, NULL); nfsmout_if(error); if (!NFS_BITMAP_ISSET(nvattr.nva_bitmap, NFS_FATTR_FILEHANDLE)) { - printf("nfs: open reclaim didn't return filehandle?\n"); + NP(np, "nfs: open reclaim didn't return filehandle?"); error = EBADRPC; goto nfsmout; } if (!NFS_CMPFH(np, fh.fh_data, fh.fh_len)) { // XXX what if fh doesn't match the vnode we think we're re-opening? - printf("nfs4_open_reclaim_rpc: warning: file handle mismatch\n"); + // That should be pretty hard in this case, given that we are doing + // the open reclaim using the file handle (and not a dir/name pair). + // Solaris Named Attributes may do this due to a bug.... so don't warn for named attributes. 
+ if (!(np->n_vattr.nva_flags & NFS_FFLAG_IS_ATTR)) + NP(np, "nfs4_open_reclaim_rpc: warning: file handle mismatch"); } error = nfs_loadattrcache(np, &nvattr, &xid, 1); nfsmout_if(error); if (rflags & NFS_OPEN_RESULT_LOCKTYPE_POSIX) nofp->nof_flags |= NFS_OPEN_FILE_POSIXLOCK; nfsmout: + // if (!error) + // NP(np, "nfs: open reclaim (%d, %d) succeeded", share_access, share_deny); + NVATTR_CLEANUP(&nvattr); nfsm_chain_cleanup(&nmreq); nfsm_chain_cleanup(&nmrep); if (!lockerror) nfs_node_unlock(np); nfs_open_owner_clear_busy(noop); if ((delegation == NFS_OPEN_DELEGATE_READ) || (delegation == NFS_OPEN_DELEGATE_WRITE)) { - if (recall) { - nfs4_delegreturn_rpc(nmp, fh.fh_data, fh.fh_len, &dstateid, current_thread(), noop->noo_cred); - lck_mtx_lock(&np->n_openlock); - np->n_openflags &= ~N_DELEG_MASK; - lck_mtx_unlock(&np->n_openlock); - } + if (recall) + nfs4_delegation_return_enqueue(np); } return (error); } @@ -4652,15 +5389,17 @@ nfs4_open_downgrade_rpc( int error, lockerror = ENOENT, status, nfsvers, numops; struct nfsm_chain nmreq, nmrep; u_int64_t xid; + struct nfsreq_secinfo_args si; nmp = NFSTONMP(np); if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; - if ((error = nfs_open_owner_set_busy(noop, vfs_context_thread(ctx)))) + if ((error = nfs_open_owner_set_busy(noop, NULL))) return (error); + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -4679,12 +5418,13 @@ nfs4_open_downgrade_rpc( nfsm_chain_add_32(error, &nmreq, nofp->nof_deny); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, np); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request(np, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &nmrep, &xid, &status); + error = nfs_request2(np, NULL, &nmreq, 
NFSPROC4_COMPOUND, + vfs_context_thread(ctx), vfs_context_ucred(ctx), + &si, R_NOINTR, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; @@ -4696,7 +5436,7 @@ nfs4_open_downgrade_rpc( nfs_owner_seqid_increment(noop, NULL, error); nfsm_chain_get_stateid(error, &nmrep, &nofp->nof_stateid); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsm_chain_loadattr(error, &nmrep, np, nfsvers, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, np, nfsvers, &xid); nfsmout: if (!lockerror) nfs_node_unlock(np); @@ -4712,26 +5452,28 @@ nfs4_close_rpc( struct nfs_open_file *nofp, thread_t thd, kauth_cred_t cred, - int flag) + int flags) { struct nfs_open_owner *noop = nofp->nof_owner; struct nfsmount *nmp; int error, lockerror = ENOENT, status, nfsvers, numops; struct nfsm_chain nmreq, nmrep; u_int64_t xid; + struct nfsreq_secinfo_args si; nmp = NFSTONMP(np); if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; - if ((error = nfs_open_owner_set_busy(noop, thd))) + if ((error = nfs_open_owner_set_busy(noop, NULL))) return (error); + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); - // PUTFH, CLOSE, GETFH + // PUTFH, CLOSE, GETATTR numops = 3; nfsm_chain_build_alloc_init(error, &nmreq, 23 * NFSX_UNSIGNED); nfsm_chain_add_compound_header(error, &nmreq, "close", numops); @@ -4744,12 +5486,11 @@ nfs4_close_rpc( nfsm_chain_add_stateid(error, &nmreq, &nofp->nof_stateid); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, np); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request2(np, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, flag, &nmrep, &xid, &status); + error = nfs_request2(np, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, &si, 
flags|R_NOINTR, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; @@ -4761,7 +5502,7 @@ nfs4_close_rpc( nfs_owner_seqid_increment(noop, NULL, error); nfsm_chain_get_stateid(error, &nmrep, &nofp->nof_stateid); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsm_chain_loadattr(error, &nmrep, np, nfsvers, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, np, nfsvers, &xid); nfsmout: if (!lockerror) nfs_node_unlock(np); @@ -4772,685 +5513,2630 @@ nfs4_close_rpc( } -int -nfs4_delegreturn_rpc(struct nfsmount *nmp, u_char *fhp, int fhlen, struct nfs_stateid *sid, thread_t thd, kauth_cred_t cred) -{ - int error = 0, status, numops; - uint64_t xid; - struct nfsm_chain nmreq, nmrep; - - nfsm_chain_null(&nmreq); - nfsm_chain_null(&nmrep); - - // PUTFH, DELEGRETURN - numops = 2; - nfsm_chain_build_alloc_init(error, &nmreq, 16 * NFSX_UNSIGNED); - nfsm_chain_add_compound_header(error, &nmreq, "delegreturn", numops); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); - nfsm_chain_add_fh(error, &nmreq, nmp->nm_vers, fhp, fhlen); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_DELEGRETURN); - nfsm_chain_add_stateid(error, &nmreq, sid); - nfsm_chain_build_done(error, &nmreq); - nfsm_assert(error, (numops == 0), EPROTO); - nfsmout_if(error); - error = nfs_request2(NULL, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, thd, cred, R_RECOVER, &nmrep, &xid, &status); - nfsm_chain_skip_tag(error, &nmrep); - nfsm_chain_get_32(error, &nmrep, numops); - nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); - nfsm_chain_op_check(error, &nmrep, NFS_OP_DELEGRETURN); -nfsmout: - nfsm_chain_cleanup(&nmreq); - nfsm_chain_cleanup(&nmrep); - return (error); -} - - /* - * NFSv4 read call. - * Just call nfs_bioread() to do the work. - * - * Note: the exec code paths have a tendency to call VNOP_READ (and VNOP_MMAP) - * without first calling VNOP_OPEN, so we make sure the file is open here. + * Claim the delegated open combinations this open file holds. 
*/ int -nfs4_vnop_read( - struct vnop_read_args /* { - struct vnodeop_desc *a_desc; - vnode_t a_vp; - struct uio *a_uio; - int a_ioflag; - vfs_context_t a_context; - } */ *ap) +nfs4_claim_delegated_state_for_open_file(struct nfs_open_file *nofp, int flags) { - vnode_t vp = ap->a_vp; - vfs_context_t ctx = ap->a_context; - nfsnode_t np; + struct nfs_open_owner *noop = nofp->nof_owner; + struct nfs_lock_owner *nlop; + struct nfs_file_lock *nflp, *nextnflp; struct nfsmount *nmp; - struct nfs_open_owner *noop; - struct nfs_open_file *nofp; - int error; - - if (vnode_vtype(ap->a_vp) != VREG) - return (EPERM); - - np = VTONFS(vp); - nmp = NFSTONMP(np); - if (!nmp) - return (ENXIO); + int error = 0, reopen = 0; - noop = nfs_open_owner_find(nmp, vfs_context_ucred(ctx), 1); - if (!noop) - return (ENOMEM); -restart: - error = nfs_open_file_find(np, noop, &nofp, 0, 0, 1); - if (!error && (nofp->nof_flags & NFS_OPEN_FILE_LOST)) { - printf("nfs_vnop_read: LOST\n"); - error = EIO; + if (nofp->nof_d_rw_drw) { + error = nfs4_claim_delegated_open_rpc(nofp, NFS_OPEN_SHARE_ACCESS_BOTH, NFS_OPEN_SHARE_DENY_BOTH, flags); + if (!error) { + lck_mtx_lock(&nofp->nof_lock); + nofp->nof_rw_drw += nofp->nof_d_rw_drw; + nofp->nof_d_rw_drw = 0; + lck_mtx_unlock(&nofp->nof_lock); + } } - if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { - nfs4_reopen(nofp, vfs_context_thread(ctx)); - nofp = NULL; - goto restart; + if (!error && nofp->nof_d_w_drw) { + error = nfs4_claim_delegated_open_rpc(nofp, NFS_OPEN_SHARE_ACCESS_WRITE, NFS_OPEN_SHARE_DENY_BOTH, flags); + if (!error) { + lck_mtx_lock(&nofp->nof_lock); + nofp->nof_w_drw += nofp->nof_d_w_drw; + nofp->nof_d_w_drw = 0; + lck_mtx_unlock(&nofp->nof_lock); + } } - if (error) { - nfs_open_owner_rele(noop); + if (!error && nofp->nof_d_r_drw) { + error = nfs4_claim_delegated_open_rpc(nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_BOTH, flags); + if (!error) { + lck_mtx_lock(&nofp->nof_lock); + nofp->nof_r_drw += nofp->nof_d_r_drw; + 
nofp->nof_d_r_drw = 0; + lck_mtx_unlock(&nofp->nof_lock); + } + } + if (!error && nofp->nof_d_rw_dw) { + error = nfs4_claim_delegated_open_rpc(nofp, NFS_OPEN_SHARE_ACCESS_BOTH, NFS_OPEN_SHARE_DENY_WRITE, flags); + if (!error) { + lck_mtx_lock(&nofp->nof_lock); + nofp->nof_rw_dw += nofp->nof_d_rw_dw; + nofp->nof_d_rw_dw = 0; + lck_mtx_unlock(&nofp->nof_lock); + } + } + if (!error && nofp->nof_d_w_dw) { + error = nfs4_claim_delegated_open_rpc(nofp, NFS_OPEN_SHARE_ACCESS_WRITE, NFS_OPEN_SHARE_DENY_WRITE, flags); + if (!error) { + lck_mtx_lock(&nofp->nof_lock); + nofp->nof_w_dw += nofp->nof_d_w_dw; + nofp->nof_d_w_dw = 0; + lck_mtx_unlock(&nofp->nof_lock); + } + } + if (!error && nofp->nof_d_r_dw) { + error = nfs4_claim_delegated_open_rpc(nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_WRITE, flags); + if (!error) { + lck_mtx_lock(&nofp->nof_lock); + nofp->nof_r_dw += nofp->nof_d_r_dw; + nofp->nof_d_r_dw = 0; + lck_mtx_unlock(&nofp->nof_lock); + } + } + /* non-deny-mode opens may be reopened if no locks are held */ + if (!error && nofp->nof_d_rw) { + error = nfs4_claim_delegated_open_rpc(nofp, NFS_OPEN_SHARE_ACCESS_BOTH, NFS_OPEN_SHARE_DENY_NONE, flags); + /* for some errors, we should just try reopening the file */ + if (nfs_mount_state_error_delegation_lost(error)) + reopen = error; + if (!error || reopen) { + lck_mtx_lock(&nofp->nof_lock); + nofp->nof_rw += nofp->nof_d_rw; + nofp->nof_d_rw = 0; + lck_mtx_unlock(&nofp->nof_lock); + } + } + /* if we've already set reopen, we should move these other two opens from delegated to not delegated */ + if ((!error || reopen) && nofp->nof_d_w) { + if (!error) { + error = nfs4_claim_delegated_open_rpc(nofp, NFS_OPEN_SHARE_ACCESS_WRITE, NFS_OPEN_SHARE_DENY_NONE, flags); + /* for some errors, we should just try reopening the file */ + if (nfs_mount_state_error_delegation_lost(error)) + reopen = error; + } + if (!error || reopen) { + lck_mtx_lock(&nofp->nof_lock); + nofp->nof_w += nofp->nof_d_w; + nofp->nof_d_w = 0; + 
lck_mtx_unlock(&nofp->nof_lock); + } + } + if ((!error || reopen) && nofp->nof_d_r) { + if (!error) { + error = nfs4_claim_delegated_open_rpc(nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_NONE, flags); + /* for some errors, we should just try reopening the file */ + if (nfs_mount_state_error_delegation_lost(error)) + reopen = error; + } + if (!error || reopen) { + lck_mtx_lock(&nofp->nof_lock); + nofp->nof_r += nofp->nof_d_r; + nofp->nof_d_r = 0; + lck_mtx_unlock(&nofp->nof_lock); + } + } + + if (reopen) { + /* + * Any problems with the delegation probably indicates that we + * should review/return all of our current delegation state. + */ + if ((nmp = NFSTONMP(nofp->nof_np))) { + nfs4_delegation_return_enqueue(nofp->nof_np); + lck_mtx_lock(&nmp->nm_lock); + nfs_need_recover(nmp, NFSERR_EXPIRED); + lck_mtx_unlock(&nmp->nm_lock); + } + if (reopen && (nfs_check_for_locks(noop, nofp) == 0)) { + /* just reopen the file on next access */ + NP(nofp->nof_np, "nfs4_claim_delegated_state_for_open_file: %d, need reopen, %d", + reopen, kauth_cred_getuid(nofp->nof_owner->noo_cred)); + lck_mtx_lock(&nofp->nof_lock); + nofp->nof_flags |= NFS_OPEN_FILE_REOPEN; + lck_mtx_unlock(&nofp->nof_lock); + return (0); + } + if (reopen) + NP(nofp->nof_np, "nfs4_claim_delegated_state_for_open_file: %d, locks prevent reopen, %d", + reopen, kauth_cred_getuid(nofp->nof_owner->noo_cred)); + } + + if (!error && ((nmp = NFSTONMP(nofp->nof_np)))) { + /* claim delegated locks */ + TAILQ_FOREACH(nlop, &nofp->nof_np->n_lock_owners, nlo_link) { + if (nlop->nlo_open_owner != noop) + continue; + TAILQ_FOREACH_SAFE(nflp, &nlop->nlo_locks, nfl_lolink, nextnflp) { + /* skip dead & blocked lock requests (shouldn't be any in the held lock list) */ + if (nflp->nfl_flags & (NFS_FILE_LOCK_DEAD|NFS_FILE_LOCK_BLOCKED)) + continue; + /* skip non-delegated locks */ + if (!(nflp->nfl_flags & NFS_FILE_LOCK_DELEGATED)) + continue; + error = nmp->nm_funcs->nf_setlock_rpc(nofp->nof_np, nofp, nflp, 0, flags, 
current_thread(), noop->noo_cred); + if (error) { + NP(nofp->nof_np, "nfs: delegated lock claim (0x%llx, 0x%llx) failed %d, %d", + nflp->nfl_start, nflp->nfl_end, error, kauth_cred_getuid(nofp->nof_owner->noo_cred)); + break; + } + // else { + // NP(nofp->nof_np, "nfs: delegated lock claim (0x%llx, 0x%llx) succeeded, %d", + // nflp->nfl_start, nflp->nfl_end, kauth_cred_getuid(nofp->nof_owner->noo_cred)); + // } + } + if (error) + break; + } + } + + if (!error) /* all state claimed successfully! */ + return (0); + + /* restart if it looks like a problem more than just losing the delegation */ + if (!nfs_mount_state_error_delegation_lost(error) && + ((error == ETIMEDOUT) || nfs_mount_state_error_should_restart(error))) { + NP(nofp->nof_np, "nfs delegated lock claim error %d, %d", error, kauth_cred_getuid(nofp->nof_owner->noo_cred)); + if ((error == ETIMEDOUT) && ((nmp = NFSTONMP(nofp->nof_np)))) + nfs_need_reconnect(nmp); return (error); } - if (!nofp->nof_access) { - /* we don't have the file open, so open it for read access */ - error = nfs_mount_state_in_use_start(nmp); - if (error) { - nfs_open_owner_rele(noop); - return (error); + + /* delegated state lost (once held but now not claimable) */ + NP(nofp->nof_np, "nfs delegated state claim error %d, state lost, %d", error, kauth_cred_getuid(nofp->nof_owner->noo_cred)); + + /* + * Any problems with the delegation probably indicates that we + * should review/return all of our current delegation state. + */ + if ((nmp = NFSTONMP(nofp->nof_np))) { + nfs4_delegation_return_enqueue(nofp->nof_np); + lck_mtx_lock(&nmp->nm_lock); + nfs_need_recover(nmp, NFSERR_EXPIRED); + lck_mtx_unlock(&nmp->nm_lock); + } + + /* revoke all open file state */ + nfs_revoke_open_state_for_node(nofp->nof_np); + + return (error); +} + +/* + * Release all open state for the given node. 
+ */ +void +nfs_release_open_state_for_node(nfsnode_t np, int force) +{ + struct nfsmount *nmp = NFSTONMP(np); + struct nfs_open_file *nofp; + struct nfs_file_lock *nflp, *nextnflp; + + /* drop held locks */ + TAILQ_FOREACH_SAFE(nflp, &np->n_locks, nfl_link, nextnflp) { + /* skip dead & blocked lock requests */ + if (nflp->nfl_flags & (NFS_FILE_LOCK_DEAD|NFS_FILE_LOCK_BLOCKED)) + continue; + /* send an unlock if not a delegated lock */ + if (!force && nmp && !(nflp->nfl_flags & NFS_FILE_LOCK_DELEGATED)) + nmp->nm_funcs->nf_unlock_rpc(np, nflp->nfl_owner, F_WRLCK, nflp->nfl_start, nflp->nfl_end, R_RECOVER, + NULL, nflp->nfl_owner->nlo_open_owner->noo_cred); + /* kill/remove the lock */ + lck_mtx_lock(&np->n_openlock); + nflp->nfl_flags |= NFS_FILE_LOCK_DEAD; + lck_mtx_lock(&nflp->nfl_owner->nlo_lock); + TAILQ_REMOVE(&nflp->nfl_owner->nlo_locks, nflp, nfl_lolink); + lck_mtx_unlock(&nflp->nfl_owner->nlo_lock); + if (nflp->nfl_blockcnt) { + /* wake up anyone blocked on this lock */ + wakeup(nflp); + } else { + /* remove nflp from lock list and destroy */ + TAILQ_REMOVE(&np->n_locks, nflp, nfl_link); + nfs_file_lock_destroy(nflp); } - error = nfs_open_file_set_busy(nofp, vfs_context_thread(ctx)); + lck_mtx_unlock(&np->n_openlock); + } + + lck_mtx_lock(&np->n_openlock); + + /* drop all opens */ + TAILQ_FOREACH(nofp, &np->n_opens, nof_link) { + if (nofp->nof_flags & NFS_OPEN_FILE_LOST) + continue; + /* mark open state as lost */ + lck_mtx_lock(&nofp->nof_lock); + nofp->nof_flags &= ~NFS_OPEN_FILE_REOPEN; + nofp->nof_flags |= NFS_OPEN_FILE_LOST; + lck_mtx_unlock(&nofp->nof_lock); + if (!force && nmp && (nmp->nm_vers >= NFS_VER4)) + nfs4_close_rpc(np, nofp, NULL, nofp->nof_owner->noo_cred, R_RECOVER); + } + + lck_mtx_unlock(&np->n_openlock); +} + +/* + * State for a node has been lost, drop it, and revoke the node. + * Attempt to return any state if possible in case the server + * might somehow think we hold it. 
+ */ +void +nfs_revoke_open_state_for_node(nfsnode_t np) +{ + struct nfsmount *nmp; + + /* mark node as needing to be revoked */ + nfs_node_lock_force(np); + if (np->n_flag & NREVOKE) /* already revoked? */ + { + NP(np, "nfs_revoke_open_state_for_node(): already revoked"); + nfs_node_unlock(np); + return; + } + np->n_flag |= NREVOKE; + nfs_node_unlock(np); + + nfs_release_open_state_for_node(np, 0); + NP(np, "nfs: state lost for %p 0x%x", np, np->n_flag); + + /* mark mount as needing a revoke scan and have the socket thread do it. */ + if ((nmp = NFSTONMP(np))) { + lck_mtx_lock(&nmp->nm_lock); + nmp->nm_state |= NFSSTA_REVOKE; + nfs_mount_sock_thread_wake(nmp); + lck_mtx_unlock(&nmp->nm_lock); + } +} + +/* + * Claim the delegated open combinations that each of this node's open files hold. + */ +int +nfs4_claim_delegated_state_for_node(nfsnode_t np, int flags) +{ + struct nfs_open_file *nofp; + int error = 0; + + lck_mtx_lock(&np->n_openlock); + + /* walk the open file list looking for opens with delegated state to claim */ +restart: + TAILQ_FOREACH(nofp, &np->n_opens, nof_link) { + if (!nofp->nof_d_rw_drw && !nofp->nof_d_w_drw && !nofp->nof_d_r_drw && + !nofp->nof_d_rw_dw && !nofp->nof_d_w_dw && !nofp->nof_d_r_dw && + !nofp->nof_d_rw && !nofp->nof_d_w && !nofp->nof_d_r) + continue; + lck_mtx_unlock(&np->n_openlock); + error = nfs4_claim_delegated_state_for_open_file(nofp, flags); + lck_mtx_lock(&np->n_openlock); if (error) - nofp = NULL; - if (!error) - error = nfs4_open(np, nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_NONE, ctx); - if (!error) - nofp->nof_flags |= NFS_OPEN_FILE_NEEDCLOSE; - if (nofp) - nfs_open_file_clear_busy(nofp); - if (nfs_mount_state_in_use_end(nmp, error)) { - nofp = NULL; - goto restart; + break; + goto restart; + } + + lck_mtx_unlock(&np->n_openlock); + + return (error); +} + +/* + * Mark a node as needed to have its delegation returned. + * Queue it up on the delegation return queue. + * Make sure the thread is running. 
+ */ +void +nfs4_delegation_return_enqueue(nfsnode_t np) +{ + struct nfsmount *nmp; + + nmp = NFSTONMP(np); + if (!nmp) + return; + + lck_mtx_lock(&np->n_openlock); + np->n_openflags |= N_DELEG_RETURN; + lck_mtx_unlock(&np->n_openlock); + + lck_mtx_lock(&nmp->nm_lock); + if (np->n_dreturn.tqe_next == NFSNOLIST) + TAILQ_INSERT_TAIL(&nmp->nm_dreturnq, np, n_dreturn); + nfs_mount_sock_thread_wake(nmp); + lck_mtx_unlock(&nmp->nm_lock); +} + +/* + * return any delegation we may have for the given node + */ +int +nfs4_delegation_return(nfsnode_t np, int flags, thread_t thd, kauth_cred_t cred) +{ + struct nfsmount *nmp; + fhandle_t fh; + nfs_stateid dstateid; + int error; + + nmp = NFSTONMP(np); + if (!nmp) + return (ENXIO); + + /* first, make sure the node's marked for delegation return */ + lck_mtx_lock(&np->n_openlock); + np->n_openflags |= (N_DELEG_RETURN|N_DELEG_RETURNING); + lck_mtx_unlock(&np->n_openlock); + + /* make sure nobody else is using the delegation state */ + if ((error = nfs_open_state_set_busy(np, NULL))) + goto out; + + /* claim any delegated state */ + if ((error = nfs4_claim_delegated_state_for_node(np, flags))) + goto out; + + /* return the delegation */ + lck_mtx_lock(&np->n_openlock); + dstateid = np->n_dstateid; + fh.fh_len = np->n_fhsize; + bcopy(np->n_fhp, &fh.fh_data, fh.fh_len); + lck_mtx_unlock(&np->n_openlock); + error = nfs4_delegreturn_rpc(NFSTONMP(np), fh.fh_data, fh.fh_len, &dstateid, flags, thd, cred); + /* assume delegation is gone for all errors except ETIMEDOUT, NFSERR_*MOVED */ + if ((error != ETIMEDOUT) && (error != NFSERR_MOVED) && (error != NFSERR_LEASE_MOVED)) { + lck_mtx_lock(&np->n_openlock); + np->n_openflags &= ~N_DELEG_MASK; + lck_mtx_lock(&nmp->nm_lock); + if (np->n_dlink.tqe_next != NFSNOLIST) { + TAILQ_REMOVE(&nmp->nm_delegations, np, n_dlink); + np->n_dlink.tqe_next = NFSNOLIST; + } + lck_mtx_unlock(&nmp->nm_lock); + lck_mtx_unlock(&np->n_openlock); + } + +out: + /* make sure it's no longer on the return queue and 
clear the return flags */ + lck_mtx_lock(&nmp->nm_lock); + if (np->n_dreturn.tqe_next != NFSNOLIST) { + TAILQ_REMOVE(&nmp->nm_dreturnq, np, n_dreturn); + np->n_dreturn.tqe_next = NFSNOLIST; + } + lck_mtx_unlock(&nmp->nm_lock); + lck_mtx_lock(&np->n_openlock); + np->n_openflags &= ~(N_DELEG_RETURN|N_DELEG_RETURNING); + lck_mtx_unlock(&np->n_openlock); + + if (error) { + NP(np, "nfs4_delegation_return, error %d", error); + if (error == ETIMEDOUT) + nfs_need_reconnect(nmp); + if (nfs_mount_state_error_should_restart(error)) { + /* make sure recovery happens */ + lck_mtx_lock(&nmp->nm_lock); + nfs_need_recover(nmp, nfs_mount_state_error_delegation_lost(error) ? NFSERR_EXPIRED : 0); + lck_mtx_unlock(&nmp->nm_lock); } } - nfs_open_owner_rele(noop); - if (error) - return (error); - return (nfs_bioread(VTONFS(ap->a_vp), ap->a_uio, ap->a_ioflag, ap->a_context)); + + nfs_open_state_clear_busy(np); + + return (error); } /* - * Note: the NFSv4 CREATE RPC is for everything EXCEPT regular files. - * Files are created using the NFSv4 OPEN RPC. So we must open the - * file to create it and then close it. 
+ * RPC to return a delegation for a file handle + */ +int +nfs4_delegreturn_rpc(struct nfsmount *nmp, u_char *fhp, int fhlen, struct nfs_stateid *sid, int flags, thread_t thd, kauth_cred_t cred) +{ + int error = 0, status, numops; + uint64_t xid; + struct nfsm_chain nmreq, nmrep; + struct nfsreq_secinfo_args si; + + NFSREQ_SECINFO_SET(&si, NULL, fhp, fhlen, NULL, 0); + nfsm_chain_null(&nmreq); + nfsm_chain_null(&nmrep); + + // PUTFH, DELEGRETURN + numops = 2; + nfsm_chain_build_alloc_init(error, &nmreq, 16 * NFSX_UNSIGNED); + nfsm_chain_add_compound_header(error, &nmreq, "delegreturn", numops); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, nmp->nm_vers, fhp, fhlen); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_DELEGRETURN); + nfsm_chain_add_stateid(error, &nmreq, sid); + nfsm_chain_build_done(error, &nmreq); + nfsm_assert(error, (numops == 0), EPROTO); + nfsmout_if(error); + error = nfs_request2(NULL, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, thd, cred, &si, flags, &nmrep, &xid, &status); + nfsm_chain_skip_tag(error, &nmrep); + nfsm_chain_get_32(error, &nmrep, numops); + nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); + nfsm_chain_op_check(error, &nmrep, NFS_OP_DELEGRETURN); +nfsmout: + nfsm_chain_cleanup(&nmreq); + nfsm_chain_cleanup(&nmrep); + return (error); +} + + +/* + * NFS read call. + * Just call nfs_bioread() to do the work. + * + * Note: the exec code paths have a tendency to call VNOP_READ (and VNOP_MMAP) + * without first calling VNOP_OPEN, so we make sure the file is open here. 
+ */ +int +nfs_vnop_read( + struct vnop_read_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + struct uio *a_uio; + int a_ioflag; + vfs_context_t a_context; + } */ *ap) +{ + vnode_t vp = ap->a_vp; + vfs_context_t ctx = ap->a_context; + nfsnode_t np; + struct nfsmount *nmp; + struct nfs_open_owner *noop; + struct nfs_open_file *nofp; + int error; + + if (vnode_vtype(ap->a_vp) != VREG) + return (EPERM); + + np = VTONFS(vp); + nmp = NFSTONMP(np); + if (!nmp) + return (ENXIO); + if (np->n_flag & NREVOKE) + return (EIO); + + noop = nfs_open_owner_find(nmp, vfs_context_ucred(ctx), 1); + if (!noop) + return (ENOMEM); +restart: + error = nfs_open_file_find(np, noop, &nofp, 0, 0, 1); + if (!error && (nofp->nof_flags & NFS_OPEN_FILE_LOST)) { + NP(np, "nfs_vnop_read: LOST %d", kauth_cred_getuid(noop->noo_cred)); + error = EIO; + } + if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { + error = nfs4_reopen(nofp, vfs_context_thread(ctx)); + nofp = NULL; + if (!error) + goto restart; + } + if (error) { + nfs_open_owner_rele(noop); + return (error); + } + if (!nofp->nof_access) { + /* we don't have the file open, so open it for read access */ + error = nfs_mount_state_in_use_start(nmp, vfs_context_thread(ctx)); + if (error) { + nfs_open_owner_rele(noop); + return (error); + } + if (np->n_flag & NREVOKE) { + error = EIO; + nfs_mount_state_in_use_end(nmp, 0); + nfs_open_owner_rele(noop); + return (error); + } + error = nfs_open_file_set_busy(nofp, vfs_context_thread(ctx)); + if (error) + nofp = NULL; + if (!error) { + if (nmp->nm_vers < NFS_VER4) { + /* NFS v2/v3 opens are always allowed - so just add it. 
*/ + nfs_open_file_add_open(nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_NONE, 0); + } else { + error = nfs4_open(np, nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_NONE, ctx); + } + } + if (!error) + nofp->nof_flags |= NFS_OPEN_FILE_NEEDCLOSE; + if (nofp) + nfs_open_file_clear_busy(nofp); + if (nfs_mount_state_in_use_end(nmp, error)) { + nofp = NULL; + goto restart; + } + } + nfs_open_owner_rele(noop); + if (error) + return (error); + return (nfs_bioread(VTONFS(ap->a_vp), ap->a_uio, ap->a_ioflag, ap->a_context)); +} + +/* + * Note: the NFSv4 CREATE RPC is for everything EXCEPT regular files. + * Files are created using the NFSv4 OPEN RPC. So we must open the + * file to create it and then close it. + */ +int +nfs4_vnop_create( + struct vnop_create_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; + vnode_t *a_vpp; + struct componentname *a_cnp; + struct vnode_attr *a_vap; + vfs_context_t a_context; + } */ *ap) +{ + vfs_context_t ctx = ap->a_context; + struct componentname *cnp = ap->a_cnp; + struct vnode_attr *vap = ap->a_vap; + vnode_t dvp = ap->a_dvp; + vnode_t *vpp = ap->a_vpp; + struct nfsmount *nmp; + nfsnode_t np; + int error = 0, busyerror = 0, accessMode, denyMode; + struct nfs_open_owner *noop = NULL; + struct nfs_open_file *newnofp = NULL, *nofp = NULL; + + nmp = VTONMP(dvp); + if (!nmp) + return (ENXIO); + + if (vap) + nfs_avoid_needless_id_setting_on_create(VTONFS(dvp), vap, ctx); + + noop = nfs_open_owner_find(nmp, vfs_context_ucred(ctx), 1); + if (!noop) + return (ENOMEM); + +restart: + error = nfs_mount_state_in_use_start(nmp, vfs_context_thread(ctx)); + if (error) { + nfs_open_owner_rele(noop); + return (error); + } + + /* grab a provisional, nodeless open file */ + error = nfs_open_file_find(NULL, noop, &newnofp, 0, 0, 1); + if (!error && (newnofp->nof_flags & NFS_OPEN_FILE_LOST)) { + printf("nfs_vnop_create: LOST\n"); + error = EIO; + } + if (!error && (newnofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { + /* This shouldn't 
happen given that this is a new, nodeless nofp */ + nfs_mount_state_in_use_end(nmp, 0); + error = nfs4_reopen(newnofp, vfs_context_thread(ctx)); + nfs_open_file_destroy(newnofp); + newnofp = NULL; + if (!error) + goto restart; + } + if (!error) + error = nfs_open_file_set_busy(newnofp, vfs_context_thread(ctx)); + if (error) { + if (newnofp) + nfs_open_file_destroy(newnofp); + newnofp = NULL; + goto out; + } + + /* + * We're just trying to create the file. + * We'll create/open it RW, and set NFS_OPEN_FILE_CREATE. + */ + accessMode = NFS_OPEN_SHARE_ACCESS_BOTH; + denyMode = NFS_OPEN_SHARE_DENY_NONE; + + /* Do the open/create */ + error = nfs4_open_rpc(newnofp, ctx, cnp, vap, dvp, vpp, NFS_OPEN_CREATE, accessMode, denyMode); + if ((error == EACCES) && vap && !(vap->va_vaflags & VA_EXCLUSIVE) && + VATTR_IS_ACTIVE(vap, va_mode) && !(vap->va_mode & S_IWUSR)) { + /* + * Hmm... it looks like we may have a situation where the request was + * retransmitted because we didn't get the first response which successfully + * created/opened the file and then the second time we were denied the open + * because the mode the file was created with doesn't allow write access. + * + * We'll try to work around this by temporarily updating the mode and + * retrying the open. 
+ */ + struct vnode_attr vattr; + + /* first make sure it's there */ + int error2 = nfs_lookitup(VTONFS(dvp), cnp->cn_nameptr, cnp->cn_namelen, ctx, &np); + if (!error2 && np) { + nfs_node_unlock(np); + *vpp = NFSTOV(np); + if (vnode_vtype(NFSTOV(np)) == VREG) { + VATTR_INIT(&vattr); + VATTR_SET(&vattr, va_mode, (vap->va_mode | S_IWUSR)); + if (!nfs4_setattr_rpc(np, &vattr, ctx)) { + error2 = nfs4_open_rpc(newnofp, ctx, cnp, NULL, dvp, vpp, NFS_OPEN_NOCREATE, accessMode, denyMode); + VATTR_INIT(&vattr); + VATTR_SET(&vattr, va_mode, vap->va_mode); + nfs4_setattr_rpc(np, &vattr, ctx); + if (!error2) + error = 0; + } + } + if (error) { + vnode_put(*vpp); + *vpp = NULL; + } + } + } + if (!error && !*vpp) { + printf("nfs4_open_rpc returned without a node?\n"); + /* Hmmm... with no node, we have no filehandle and can't close it */ + error = EIO; + } + if (error) { + /* need to cleanup our temporary nofp */ + nfs_open_file_clear_busy(newnofp); + nfs_open_file_destroy(newnofp); + newnofp = NULL; + goto out; + } + /* After we have a node, add our open file struct to the node */ + np = VTONFS(*vpp); + nfs_open_file_add_open(newnofp, accessMode, denyMode, 0); + nofp = newnofp; + error = nfs_open_file_find_internal(np, noop, &nofp, 0, 0, 0); + if (error) { + /* This shouldn't happen, because we passed in a new nofp to use. */ + printf("nfs_open_file_find_internal failed! %d\n", error); + goto out; + } else if (nofp != newnofp) { + /* + * Hmm... an open file struct already exists. + * Mark the existing one busy and merge our open into it. + * Then destroy the one we created. + * Note: there's no chance of an open conflict because the + * open has already been granted.
+ */ + busyerror = nfs_open_file_set_busy(nofp, NULL); + nfs_open_file_add_open(nofp, accessMode, denyMode, 0); + nofp->nof_stateid = newnofp->nof_stateid; + if (newnofp->nof_flags & NFS_OPEN_FILE_POSIXLOCK) + nofp->nof_flags |= NFS_OPEN_FILE_POSIXLOCK; + nfs_open_file_clear_busy(newnofp); + nfs_open_file_destroy(newnofp); + } + newnofp = NULL; + /* mark the node as holding a create-initiated open */ + nofp->nof_flags |= NFS_OPEN_FILE_CREATE; + nofp->nof_creator = current_thread(); +out: + if (nofp && !busyerror) + nfs_open_file_clear_busy(nofp); + if (nfs_mount_state_in_use_end(nmp, error)) { + nofp = newnofp = NULL; + busyerror = 0; + goto restart; + } + if (noop) + nfs_open_owner_rele(noop); + return (error); +} + +/* + * Note: the NFSv4 CREATE RPC is for everything EXCEPT regular files. + */ +int +nfs4_create_rpc( + vfs_context_t ctx, + nfsnode_t dnp, + struct componentname *cnp, + struct vnode_attr *vap, + int type, + char *link, + nfsnode_t *npp) +{ + struct nfsmount *nmp; + struct nfs_vattr nvattr; + int error = 0, create_error = EIO, lockerror = ENOENT, busyerror = ENOENT, status; + int nfsvers, namedattrs, numops; + u_int64_t xid, savedxid = 0; + nfsnode_t np = NULL; + vnode_t newvp = NULL; + struct nfsm_chain nmreq, nmrep; + uint32_t bitmap[NFS_ATTR_BITMAP_LEN], bmlen; + const char *tag; + nfs_specdata sd; + fhandle_t fh; + struct nfsreq rq, *req = &rq; + struct nfs_dulookup dul; + struct nfsreq_secinfo_args si; + + nmp = NFSTONMP(dnp); + if (!nmp) + return (ENXIO); + nfsvers = nmp->nm_vers; + namedattrs = (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR); + if (dnp->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); + + sd.specdata1 = sd.specdata2 = 0; + + switch (type) { + case NFLNK: + tag = "symlink"; + break; + case NFBLK: + case NFCHR: + tag = "mknod"; + if (!VATTR_IS_ACTIVE(vap, va_rdev)) + return (EINVAL); + sd.specdata1 = major(vap->va_rdev); + sd.specdata2 = minor(vap->va_rdev); + break; + case NFSOCK: + case NFFIFO: + tag = 
"mknod"; + break; + case NFDIR: + tag = "mkdir"; + break; + default: + return (EINVAL); + } + + nfs_avoid_needless_id_setting_on_create(dnp, vap, ctx); + + error = busyerror = nfs_node_set_busy(dnp, vfs_context_thread(ctx)); + if (!namedattrs) + nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx); + + NFSREQ_SECINFO_SET(&si, dnp, NULL, 0, NULL, 0); + NVATTR_INIT(&nvattr); + nfsm_chain_null(&nmreq); + nfsm_chain_null(&nmrep); + + // PUTFH, SAVEFH, CREATE, GETATTR(FH), RESTOREFH, GETATTR + numops = 6; + nfsm_chain_build_alloc_init(error, &nmreq, 66 * NFSX_UNSIGNED); + nfsm_chain_add_compound_header(error, &nmreq, tag, numops); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_SAVEFH); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_CREATE); + nfsm_chain_add_32(error, &nmreq, type); + if (type == NFLNK) { + nfsm_chain_add_name(error, &nmreq, link, strlen(link), nmp); + } else if ((type == NFBLK) || (type == NFCHR)) { + nfsm_chain_add_32(error, &nmreq, sd.specdata1); + nfsm_chain_add_32(error, &nmreq, sd.specdata2); + } + nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp); + nfsm_chain_add_fattr4(error, &nmreq, vap, nmp); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); + NFS_COPY_ATTRIBUTES(nfs_getattr_bitmap, bitmap); + NFS_BITMAP_SET(bitmap, NFS_FATTR_FILEHANDLE); + nfsm_chain_add_bitmap_supported(error, &nmreq, bitmap, nmp, NULL); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_RESTOREFH); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, dnp); + nfsm_chain_build_done(error, &nmreq); + nfsm_assert(error, (numops == 0), EPROTO); + nfsmout_if(error); + + error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC4_COMPOUND, + vfs_context_thread(ctx), vfs_context_ucred(ctx), &si, 0, NULL, 
&req); + if (!error) { + if (!namedattrs) + nfs_dulookup_start(&dul, dnp, ctx); + error = nfs_request_async_finish(req, &nmrep, &xid, &status); + } + + if ((lockerror = nfs_node_lock(dnp))) + error = lockerror; + nfsm_chain_skip_tag(error, &nmrep); + nfsm_chain_get_32(error, &nmrep, numops); + nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); + nfsm_chain_op_check(error, &nmrep, NFS_OP_SAVEFH); + nfsmout_if(error); + nfsm_chain_op_check(error, &nmrep, NFS_OP_CREATE); + nfsm_chain_check_change_info(error, &nmrep, dnp); + bmlen = NFS_ATTR_BITMAP_LEN; + nfsm_chain_get_bitmap(error, &nmrep, bitmap, bmlen); + /* At this point if we have no error, the object was created. */ + /* if we don't get attributes, then we should lookitup. */ + create_error = error; + nfsmout_if(error); + nfs_vattr_set_supported(bitmap, vap); + nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); + nfsmout_if(error); + error = nfs4_parsefattr(&nmrep, NULL, &nvattr, &fh, NULL, NULL); + nfsmout_if(error); + if (!NFS_BITMAP_ISSET(nvattr.nva_bitmap, NFS_FATTR_FILEHANDLE)) { + printf("nfs: create/%s didn't return filehandle? 
%s\n", tag, cnp->cn_nameptr); + error = EBADRPC; + goto nfsmout; + } + /* directory attributes: if we don't get them, make sure to invalidate */ + nfsm_chain_op_check(error, &nmrep, NFS_OP_RESTOREFH); + nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); + savedxid = xid; + nfsm_chain_loadattr(error, &nmrep, dnp, nfsvers, &xid); + if (error) + NATTRINVALIDATE(dnp); + +nfsmout: + nfsm_chain_cleanup(&nmreq); + nfsm_chain_cleanup(&nmrep); + + if (!lockerror) { + if (!create_error && (dnp->n_flag & NNEGNCENTRIES)) { + dnp->n_flag &= ~NNEGNCENTRIES; + cache_purge_negatives(NFSTOV(dnp)); + } + dnp->n_flag |= NMODIFIED; + nfs_node_unlock(dnp); + /* nfs_getattr() will check changed and purge caches */ + nfs_getattr(dnp, NULL, ctx, NGA_CACHED); + } + + if (!error && fh.fh_len) { + /* create the vnode with the filehandle and attributes */ + xid = savedxid; + error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, rq.r_auth, NG_MAKEENTRY, &np); + if (!error) + newvp = NFSTOV(np); + } + NVATTR_CLEANUP(&nvattr); + + if (!namedattrs) + nfs_dulookup_finish(&dul, dnp, ctx); + + /* + * Kludge: Map EEXIST => 0 assuming that you have a reply to a retry + * if we can succeed in looking up the object. 
+ */ + if ((create_error == EEXIST) || (!create_error && !newvp)) { + error = nfs_lookitup(dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx, &np); + if (!error) { + newvp = NFSTOV(np); + if (vnode_vtype(newvp) != nfstov_type(type, nfsvers)) + error = EEXIST; + } + } + if (!busyerror) + nfs_node_clear_busy(dnp); + if (error) { + if (newvp) { + nfs_node_unlock(np); + vnode_put(newvp); + } + } else { + nfs_node_unlock(np); + *npp = np; + } + return (error); +} + +int +nfs4_vnop_mknod( + struct vnop_mknod_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; + vnode_t *a_vpp; + struct componentname *a_cnp; + struct vnode_attr *a_vap; + vfs_context_t a_context; + } */ *ap) +{ + nfsnode_t np = NULL; + struct nfsmount *nmp; + int error; + + nmp = VTONMP(ap->a_dvp); + if (!nmp) + return (ENXIO); + + if (!VATTR_IS_ACTIVE(ap->a_vap, va_type)) + return (EINVAL); + switch (ap->a_vap->va_type) { + case VBLK: + case VCHR: + case VFIFO: + case VSOCK: + break; + default: + return (ENOTSUP); + } + + error = nfs4_create_rpc(ap->a_context, VTONFS(ap->a_dvp), ap->a_cnp, ap->a_vap, + vtonfs_type(ap->a_vap->va_type, nmp->nm_vers), NULL, &np); + if (!error) + *ap->a_vpp = NFSTOV(np); + return (error); +} + +int +nfs4_vnop_mkdir( + struct vnop_mkdir_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; + vnode_t *a_vpp; + struct componentname *a_cnp; + struct vnode_attr *a_vap; + vfs_context_t a_context; + } */ *ap) +{ + nfsnode_t np = NULL; + int error; + + error = nfs4_create_rpc(ap->a_context, VTONFS(ap->a_dvp), ap->a_cnp, ap->a_vap, + NFDIR, NULL, &np); + if (!error) + *ap->a_vpp = NFSTOV(np); + return (error); +} + +int +nfs4_vnop_symlink( + struct vnop_symlink_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; + vnode_t *a_vpp; + struct componentname *a_cnp; + struct vnode_attr *a_vap; + char *a_target; + vfs_context_t a_context; + } */ *ap) +{ + nfsnode_t np = NULL; + int error; + + error = nfs4_create_rpc(ap->a_context, VTONFS(ap->a_dvp), ap->a_cnp, ap->a_vap, + NFLNK, 
ap->a_target, &np); + if (!error) + *ap->a_vpp = NFSTOV(np); + return (error); +} + +int +nfs4_vnop_link( + struct vnop_link_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + vnode_t a_tdvp; + struct componentname *a_cnp; + vfs_context_t a_context; + } */ *ap) +{ + vfs_context_t ctx = ap->a_context; + vnode_t vp = ap->a_vp; + vnode_t tdvp = ap->a_tdvp; + struct componentname *cnp = ap->a_cnp; + int error = 0, lockerror = ENOENT, status; + struct nfsmount *nmp; + nfsnode_t np = VTONFS(vp); + nfsnode_t tdnp = VTONFS(tdvp); + int nfsvers, numops; + u_int64_t xid, savedxid; + struct nfsm_chain nmreq, nmrep; + struct nfsreq_secinfo_args si; + + if (vnode_mount(vp) != vnode_mount(tdvp)) + return (EXDEV); + + nmp = VTONMP(vp); + if (!nmp) + return (ENXIO); + nfsvers = nmp->nm_vers; + if (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); + if (tdnp->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); + + /* + * Push all writes to the server, so that the attribute cache + * doesn't get "out of sync" with the server. + * XXX There should be a better way! 
+ */ + nfs_flush(np, MNT_WAIT, vfs_context_thread(ctx), V_IGNORE_WRITEERR); + + if ((error = nfs_node_set_busy2(tdnp, np, vfs_context_thread(ctx)))) + return (error); + + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); + nfsm_chain_null(&nmreq); + nfsm_chain_null(&nmrep); + + // PUTFH(SOURCE), SAVEFH, PUTFH(DIR), LINK, GETATTR(DIR), RESTOREFH, GETATTR + numops = 7; + nfsm_chain_build_alloc_init(error, &nmreq, 29 * NFSX_UNSIGNED + cnp->cn_namelen); + nfsm_chain_add_compound_header(error, &nmreq, "link", numops); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, nfsvers, np->n_fhp, np->n_fhsize); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_SAVEFH); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, nfsvers, tdnp->n_fhp, tdnp->n_fhsize); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_LINK); + nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, tdnp); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_RESTOREFH); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, np); + nfsm_chain_build_done(error, &nmreq); + nfsm_assert(error, (numops == 0), EPROTO); + nfsmout_if(error); + error = nfs_request(tdnp, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &si, &nmrep, &xid, &status); + + if ((lockerror = nfs_node_lock2(tdnp, np))) { + error = lockerror; + goto nfsmout; + } + nfsm_chain_skip_tag(error, &nmrep); + nfsm_chain_get_32(error, &nmrep, numops); + nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); + nfsm_chain_op_check(error, &nmrep, NFS_OP_SAVEFH); + nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); + nfsm_chain_op_check(error, &nmrep, NFS_OP_LINK); + nfsm_chain_check_change_info(error, &nmrep, tdnp); + /* directory 
attributes: if we don't get them, make sure to invalidate */ + nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); + savedxid = xid; + nfsm_chain_loadattr(error, &nmrep, tdnp, nfsvers, &xid); + if (error) + NATTRINVALIDATE(tdnp); + /* link attributes: if we don't get them, make sure to invalidate */ + nfsm_chain_op_check(error, &nmrep, NFS_OP_RESTOREFH); + nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); + xid = savedxid; + nfsm_chain_loadattr(error, &nmrep, np, nfsvers, &xid); + if (error) + NATTRINVALIDATE(np); +nfsmout: + nfsm_chain_cleanup(&nmreq); + nfsm_chain_cleanup(&nmrep); + if (!lockerror) + tdnp->n_flag |= NMODIFIED; + /* Kludge: Map EEXIST => 0 assuming that it is a reply to a retry. */ + if (error == EEXIST) + error = 0; + if (!error && (tdnp->n_flag & NNEGNCENTRIES)) { + tdnp->n_flag &= ~NNEGNCENTRIES; + cache_purge_negatives(tdvp); + } + if (!lockerror) + nfs_node_unlock2(tdnp, np); + nfs_node_clear_busy2(tdnp, np); + return (error); +} + +int +nfs4_vnop_rmdir( + struct vnop_rmdir_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; + vnode_t a_vp; + struct componentname *a_cnp; + vfs_context_t a_context; + } */ *ap) +{ + vfs_context_t ctx = ap->a_context; + vnode_t vp = ap->a_vp; + vnode_t dvp = ap->a_dvp; + struct componentname *cnp = ap->a_cnp; + struct nfsmount *nmp; + int error = 0, namedattrs; + nfsnode_t np = VTONFS(vp); + nfsnode_t dnp = VTONFS(dvp); + struct nfs_dulookup dul; + + if (vnode_vtype(vp) != VDIR) + return (EINVAL); + + nmp = NFSTONMP(dnp); + if (!nmp) + return (ENXIO); + namedattrs = (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR); + + if ((error = nfs_node_set_busy2(dnp, np, vfs_context_thread(ctx)))) + return (error); + + if (!namedattrs) { + nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx); + nfs_dulookup_start(&dul, dnp, ctx); + } + + error = nfs4_remove_rpc(dnp, cnp->cn_nameptr, cnp->cn_namelen, + vfs_context_thread(ctx), vfs_context_ucred(ctx)); + + nfs_name_cache_purge(dnp, np, cnp, ctx); + 
/* nfs_getattr() will check changed and purge caches */ + nfs_getattr(dnp, NULL, ctx, NGA_CACHED); + if (!namedattrs) + nfs_dulookup_finish(&dul, dnp, ctx); + nfs_node_clear_busy2(dnp, np); + + /* + * Kludge: Map ENOENT => 0 assuming that you have a reply to a retry. + */ + if (error == ENOENT) + error = 0; + if (!error) { + /* + * remove nfsnode from hash now so we can't accidentally find it + * again if another object gets created with the same filehandle + * before this vnode gets reclaimed + */ + lck_mtx_lock(nfs_node_hash_mutex); + if (np->n_hflag & NHHASHED) { + LIST_REMOVE(np, n_hash); + np->n_hflag &= ~NHHASHED; + FSDBG(266, 0, np, np->n_flag, 0xb1eb1e); + } + lck_mtx_unlock(nfs_node_hash_mutex); + } + return (error); +} + +/* + * NFSv4 Named Attributes + * + * Both the extended attributes interface and the named streams interface + * are backed by NFSv4 named attributes. The implementations for both use + * a common set of routines in an attempt to reduce code duplication, to + * increase efficiency, to increase caching of both names and data, and to + * confine the complexity. + * + * Each NFS node caches its named attribute directory's file handle. + * The directory nodes for the named attribute directories are handled + * exactly like regular directories (with a couple minor exceptions). + * Named attribute nodes are also treated as much like regular files as + * possible. + * + * Most of the heavy lifting is done by nfs4_named_attr_get(). + */ + +/* + * Get the given node's attribute directory node. + * If !fetch, then only return a cached node. + * Otherwise, we will attempt to fetch the node from the server. + * (Note: the node should be marked busy.) 
*/ -int -nfs4_vnop_create( - struct vnop_create_args /* { - struct vnodeop_desc *a_desc; - vnode_t a_dvp; - vnode_t *a_vpp; - struct componentname *a_cnp; - struct vnode_attr *a_vap; - vfs_context_t a_context; - } */ *ap) +nfsnode_t +nfs4_named_attr_dir_get(nfsnode_t np, int fetch, vfs_context_t ctx) { - vfs_context_t ctx = ap->a_context; - struct componentname *cnp = ap->a_cnp; - struct vnode_attr *vap = ap->a_vap; - vnode_t dvp = ap->a_dvp; - vnode_t *vpp = ap->a_vpp; + nfsnode_t adnp = NULL; struct nfsmount *nmp; - nfsnode_t np; - int error = 0; - struct nfs_open_owner *noop = NULL; - struct nfs_open_file *nofp = NULL; + int error = 0, status, numops; + struct nfsm_chain nmreq, nmrep; + u_int64_t xid; + uint32_t bitmap[NFS_ATTR_BITMAP_LEN]; + fhandle_t fh; + struct nfs_vattr nvattr; + struct componentname cn; + struct nfsreq rq, *req = &rq; + struct nfsreq_secinfo_args si; - nmp = VTONMP(dvp); + nmp = NFSTONMP(np); if (!nmp) - return (ENXIO); - - nfs_avoid_needless_id_setting_on_create(VTONFS(dvp), vap, ctx); + return (NULL); + if (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (NULL); - noop = nfs_open_owner_find(nmp, vfs_context_ucred(ctx), 1); - if (!noop) - return (ENOMEM); + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); + NVATTR_INIT(&nvattr); + nfsm_chain_null(&nmreq); + nfsm_chain_null(&nmrep); -restart: - error = nfs_mount_state_in_use_start(nmp); - if (error) { - nfs_open_owner_rele(noop); - return (error); + bzero(&cn, sizeof(cn)); + cn.cn_nameptr = __CAST_AWAY_QUALIFIER(_PATH_FORKSPECIFIER, const, char *); /* "/..namedfork/" */ + cn.cn_namelen = strlen(_PATH_FORKSPECIFIER); + cn.cn_nameiop = LOOKUP; + + if (np->n_attrdirfh) { + // XXX can't set parent correctly (to np) yet + error = nfs_nget(nmp->nm_mountp, NULL, &cn, np->n_attrdirfh+1, *np->n_attrdirfh, + NULL, NULL, RPCAUTH_UNKNOWN, NG_NOCREATE, &adnp); + if (adnp) + goto nfsmout; + } + if (!fetch) { + error = ENOENT; + goto nfsmout; } - error = nfs_open_file_find(NULL, noop, &nofp, 
0, 0, 1); - if (!error && (nofp->nof_flags & NFS_OPEN_FILE_LOST)) { - printf("nfs_vnop_create: LOST\n"); - error = EIO; - } - if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { - nfs_mount_state_in_use_end(nmp, 0); - nfs4_reopen(nofp, vfs_context_thread(ctx)); - nofp = NULL; - goto restart; - } + // PUTFH, OPENATTR, GETATTR + numops = 3; + nfsm_chain_build_alloc_init(error, &nmreq, 22 * NFSX_UNSIGNED); + nfsm_chain_add_compound_header(error, &nmreq, "openattr", numops); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, nmp->nm_vers, np->n_fhp, np->n_fhsize); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_OPENATTR); + nfsm_chain_add_32(error, &nmreq, 0); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); + NFS_COPY_ATTRIBUTES(nfs_getattr_bitmap, bitmap); + NFS_BITMAP_SET(bitmap, NFS_FATTR_FILEHANDLE); + nfsm_chain_add_bitmap_masked(error, &nmreq, bitmap, + NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_build_done(error, &nmreq); + nfsm_assert(error, (numops == 0), EPROTO); + nfsmout_if(error); + error = nfs_request_async(np, NULL, &nmreq, NFSPROC4_COMPOUND, + vfs_context_thread(ctx), vfs_context_ucred(ctx), &si, 0, NULL, &req); if (!error) - error = nfs_open_file_set_busy(nofp, vfs_context_thread(ctx)); - if (error) { - nofp = NULL; - goto out; - } - - nofp->nof_opencnt++; - nofp->nof_access = NFS_OPEN_SHARE_ACCESS_BOTH; - nofp->nof_deny = NFS_OPEN_SHARE_DENY_NONE; - nofp->nof_rw++; + error = nfs_request_async_finish(req, &nmrep, &xid, &status); - error = nfs4_open_rpc(nofp, ctx, cnp, vap, dvp, vpp, NFS_OPEN_CREATE, - NFS_OPEN_SHARE_ACCESS_BOTH, NFS_OPEN_SHARE_DENY_NONE); - if (!error && !*vpp) { - printf("nfs4_open_rpc returned without a node?\n"); - /* Hmmm... 
with no node, we have no filehandle and can't close it */ - error = EIO; + nfsm_chain_skip_tag(error, &nmrep); + nfsm_chain_get_32(error, &nmrep, numops); + nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); + nfsm_chain_op_check(error, &nmrep, NFS_OP_OPENATTR); + nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); + nfsmout_if(error); + error = nfs4_parsefattr(&nmrep, NULL, &nvattr, &fh, NULL, NULL); + nfsmout_if(error); + if (!NFS_BITMAP_ISSET(nvattr.nva_bitmap, NFS_FATTR_FILEHANDLE) || !fh.fh_len) { + error = ENOENT; + goto nfsmout; } - if (error) { - nofp->nof_rw--; - nofp->nof_access = 0; - nofp->nof_deny = 0; - nofp->nof_opencnt--; - } - if (*vpp) { - nofp->nof_np = np = VTONFS(*vpp); - /* insert nofp onto np's open list */ - TAILQ_INSERT_HEAD(&np->n_opens, nofp, nof_link); - if (!error) { - nofp->nof_flags |= NFS_OPEN_FILE_CREATE; - nofp->nof_creator = current_thread(); - } + if (!np->n_attrdirfh || (*np->n_attrdirfh != fh.fh_len)) { + /* (re)allocate attrdir fh buffer */ + if (np->n_attrdirfh) + FREE(np->n_attrdirfh, M_TEMP); + MALLOC(np->n_attrdirfh, u_char*, fh.fh_len+1, M_TEMP, M_WAITOK); } -out: - if (nofp) - nfs_open_file_clear_busy(nofp); - if (nfs_mount_state_in_use_end(nmp, error)) { - nofp = NULL; - goto restart; + if (!np->n_attrdirfh) { + error = ENOMEM; + goto nfsmout; } - if (noop) - nfs_open_owner_rele(noop); - return (error); -} + /* cache the attrdir fh in the node */ + *np->n_attrdirfh = fh.fh_len; + bcopy(fh.fh_data, np->n_attrdirfh+1, fh.fh_len); + /* create node for attrdir */ + // XXX can't set parent correctly (to np) yet + error = nfs_nget(NFSTOMP(np), NULL, &cn, fh.fh_data, fh.fh_len, &nvattr, &xid, rq.r_auth, 0, &adnp); +nfsmout: + NVATTR_CLEANUP(&nvattr); + nfsm_chain_cleanup(&nmreq); + nfsm_chain_cleanup(&nmrep); -void -nfs_avoid_needless_id_setting_on_create(nfsnode_t dnp, struct vnode_attr *vap, vfs_context_t ctx) -{ - /* - * Don't bother setting UID if it's the same as the credential performing the create. 
- * Don't bother setting GID if it's the same as the directory or credential. - */ - if (VATTR_IS_ACTIVE(vap, va_uid)) { - if (kauth_cred_getuid(vfs_context_ucred(ctx)) == vap->va_uid) - VATTR_CLEAR_ACTIVE(vap, va_uid); - } - if (VATTR_IS_ACTIVE(vap, va_gid)) { - if ((vap->va_gid == dnp->n_vattr.nva_gid) || - (kauth_cred_getgid(vfs_context_ucred(ctx)) == vap->va_gid)) - VATTR_CLEAR_ACTIVE(vap, va_gid); + if (adnp) { + /* sanity check that this node is an attribute directory */ + if (adnp->n_vattr.nva_type != VDIR) + error = EINVAL; + if (!(adnp->n_vattr.nva_flags & NFS_FFLAG_IS_ATTR)) + error = EINVAL; + nfs_node_unlock(adnp); + if (error) + vnode_put(NFSTOV(adnp)); } + return (error ? NULL : adnp); } /* - * Note: the NFSv4 CREATE RPC is for everything EXCEPT regular files. + * Get the given node's named attribute node for the name given. + * + * In an effort to increase the performance of named attribute access, we try + * to reduce server requests by doing the following: + * + * - cache the node's named attribute directory file handle in the node + * - maintain a directory vnode for the attribute directory + * - use name cache entries (positive and negative) to speed up lookups + * - optionally open the named attribute (with the given accessMode) in the same RPC + * - combine attribute directory retrieval with the lookup/open RPC + * - optionally prefetch the named attribute's first block of data in the same RPC + * + * Also, in an attempt to reduce the number of copies/variations of this code, + * parts of the RPC building/processing code are conditionalized on what is + * needed for any particular request (openattr, lookup vs. open, read). + * + * Note that because we may not have the attribute directory node when we start + * the lookup/open, we lock both the node and the attribute directory node. 
*/ + +#define NFS_GET_NAMED_ATTR_CREATE 0x1 +#define NFS_GET_NAMED_ATTR_CREATE_GUARDED 0x2 +#define NFS_GET_NAMED_ATTR_TRUNCATE 0x4 +#define NFS_GET_NAMED_ATTR_PREFETCH 0x8 + int -nfs4_create_rpc( - vfs_context_t ctx, - nfsnode_t dnp, +nfs4_named_attr_get( + nfsnode_t np, struct componentname *cnp, - struct vnode_attr *vap, - int type, - char *link, - nfsnode_t *npp) + uint32_t accessMode, + int flags, + vfs_context_t ctx, + nfsnode_t *anpp, + struct nfs_open_file **nofpp) { struct nfsmount *nmp; - struct nfs_vattr nvattr, dnvattr; - int error = 0, create_error = EIO, lockerror = ENOENT, busyerror = ENOENT, status; - int nfsvers, numops; + int error = 0, open_error = EIO; + int inuse = 0, adlockerror = ENOENT, busyerror = ENOENT, adbusyerror = ENOENT, nofpbusyerror = ENOENT; + int create, guarded, prefetch, truncate, noopbusy = 0; + int open, status, numops, hadattrdir, negnamecache; + struct nfs_vattr nvattr; + struct vnode_attr vattr; + nfsnode_t adnp = NULL, anp = NULL; + vnode_t avp = NULL; u_int64_t xid, savedxid = 0; - nfsnode_t np = NULL; - vnode_t newvp = NULL; struct nfsm_chain nmreq, nmrep; uint32_t bitmap[NFS_ATTR_BITMAP_LEN], bmlen; - const char *tag; - nfs_specdata sd; + uint32_t denyMode, rflags, delegation, recall, eof, rlen, retlen; + nfs_stateid stateid, dstateid; fhandle_t fh; - struct nfsreq *req = NULL; - struct nfs_dulookup dul; + struct nfs_open_owner *noop = NULL; + struct nfs_open_file *newnofp = NULL, *nofp = NULL; + struct vnop_access_args naa; + thread_t thd; + kauth_cred_t cred; + struct timeval now; + char sbuf[64], *s; + uint32_t ace_type, ace_flags, ace_mask, len, slen; + struct kauth_ace ace; + struct nfsreq rq, *req = &rq; + struct nfsreq_secinfo_args si; + + *anpp = NULL; + fh.fh_len = 0; + rflags = delegation = recall = eof = rlen = retlen = 0; + ace.ace_flags = 0; + s = sbuf; + slen = sizeof(sbuf); - nmp = NFSTONMP(dnp); + nmp = NFSTONMP(np); if (!nmp) return (ENXIO); - nfsvers = nmp->nm_vers; - - sd.specdata1 = sd.specdata2 = 0; 
+ NVATTR_INIT(&nvattr); + negnamecache = !NMFLAG(nmp, NONEGNAMECACHE); + thd = vfs_context_thread(ctx); + cred = vfs_context_ucred(ctx); + create = (flags & NFS_GET_NAMED_ATTR_CREATE) ? NFS_OPEN_CREATE : NFS_OPEN_NOCREATE; + guarded = (flags & NFS_GET_NAMED_ATTR_CREATE_GUARDED) ? NFS_CREATE_GUARDED : NFS_CREATE_UNCHECKED; + truncate = (flags & NFS_GET_NAMED_ATTR_TRUNCATE); + prefetch = (flags & NFS_GET_NAMED_ATTR_PREFETCH); + + if (!create) { + error = nfs_getattr(np, &nvattr, ctx, NGA_CACHED); + if (error) + return (error); + if (NFS_BITMAP_ISSET(nvattr.nva_bitmap, NFS_FATTR_NAMED_ATTR) && + !(nvattr.nva_flags & NFS_FFLAG_HAS_NAMED_ATTRS)) + return (ENOATTR); + } else if (accessMode == NFS_OPEN_SHARE_ACCESS_NONE) { + /* shouldn't happen... but just be safe */ + printf("nfs4_named_attr_get: create with no access %s\n", cnp->cn_nameptr); + accessMode = NFS_OPEN_SHARE_ACCESS_READ; + } + open = (accessMode != NFS_OPEN_SHARE_ACCESS_NONE); + if (open) { + /* + * We're trying to open the file. + * We'll create/open it with the given access mode, + * and set NFS_OPEN_FILE_CREATE. 
+ */ + denyMode = NFS_OPEN_SHARE_DENY_NONE; + if (prefetch && guarded) + prefetch = 0; /* no sense prefetching data that can't be there */ - switch (type) { - case NFLNK: - tag = "symlink"; - break; - case NFBLK: - case NFCHR: - tag = "mknod"; - if (!VATTR_IS_ACTIVE(vap, va_rdev)) - return (EINVAL); - sd.specdata1 = major(vap->va_rdev); - sd.specdata2 = minor(vap->va_rdev); - break; - case NFSOCK: - case NFFIFO: - tag = "mknod"; - break; - case NFDIR: - tag = "mkdir"; - break; - default: - return (EINVAL); + noop = nfs_open_owner_find(nmp, vfs_context_ucred(ctx), 1); + if (!noop) + return (ENOMEM); } - nfs_avoid_needless_id_setting_on_create(dnp, vap, ctx); - - error = busyerror = nfs_node_set_busy(dnp, vfs_context_thread(ctx)); - nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx); + if ((error = busyerror = nfs_node_set_busy(np, vfs_context_thread(ctx)))) + return (error); + adnp = nfs4_named_attr_dir_get(np, 0, ctx); + hadattrdir = (adnp != NULL); + if (prefetch) { + microuptime(&now); + /* use the special state ID because we don't have a real one to send */ + stateid.seqid = stateid.other[0] = stateid.other[1] = stateid.other[2] = 0; + rlen = MIN(nmp->nm_rsize, nmp->nm_biosize); + } + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); - // PUTFH, SAVEFH, CREATE, GETATTR(FH), RESTOREFH, GETATTR - numops = 6; - nfsm_chain_build_alloc_init(error, &nmreq, 66 * NFSX_UNSIGNED); - nfsm_chain_add_compound_header(error, &nmreq, tag, numops); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); - nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_SAVEFH); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_CREATE); - nfsm_chain_add_32(error, &nmreq, type); - if (type == NFLNK) { - nfsm_chain_add_string(error, &nmreq, link, strlen(link)); - } else if ((type == NFBLK) || (type == NFCHR)) { - nfsm_chain_add_32(error, &nmreq, 
sd.specdata1); - nfsm_chain_add_32(error, &nmreq, sd.specdata2); + if (hadattrdir) { + if ((error = adbusyerror = nfs_node_set_busy(adnp, vfs_context_thread(ctx)))) + goto nfsmout; + /* nfs_getattr() will check changed and purge caches */ + error = nfs_getattr(adnp, NULL, ctx, NGA_CACHED); + nfsmout_if(error); + error = cache_lookup(NFSTOV(adnp), &avp, cnp); + switch (error) { + case ENOENT: + /* negative cache entry */ + goto nfsmout; + case 0: + /* cache miss */ + /* try dir buf cache lookup */ + error = nfs_dir_buf_cache_lookup(adnp, &anp, cnp, ctx, 0); + if (!error && anp) { + /* dir buf cache hit */ + *anpp = anp; + error = -1; + } + if (error != -1) /* cache miss */ + break; + /* FALLTHROUGH */ + case -1: + /* cache hit, not really an error */ + OSAddAtomic(1, &nfsstats.lookupcache_hits); + if (!anp && avp) + *anpp = anp = VTONFS(avp); + + nfs_node_clear_busy(adnp); + adbusyerror = ENOENT; + + /* check for directory access */ + naa.a_desc = &vnop_access_desc; + naa.a_vp = NFSTOV(adnp); + naa.a_action = KAUTH_VNODE_SEARCH; + naa.a_context = ctx; + + /* compute actual success/failure based on accessibility */ + error = nfs_vnop_access(&naa); + /* FALLTHROUGH */ + default: + /* we either found it, or hit an error */ + if (!error && guarded) { + /* found cached entry but told not to use it */ + error = EEXIST; + vnode_put(NFSTOV(anp)); + *anpp = anp = NULL; + } + /* we're done if error or we don't need to open */ + if (error || !open) + goto nfsmout; + /* no error and we need to open... 
*/ + } + } + + if (open) { +restart: + error = nfs_mount_state_in_use_start(nmp, vfs_context_thread(ctx)); + if (error) { + nfs_open_owner_rele(noop); + noop = NULL; + goto nfsmout; + } + inuse = 1; + + /* grab an open file - possibly provisional/nodeless if cache_lookup() failed */ + error = nfs_open_file_find(anp, noop, &newnofp, 0, 0, 1); + if (!error && (newnofp->nof_flags & NFS_OPEN_FILE_LOST)) { + printf("nfs4_named_attr_get: LOST %d %s\n", kauth_cred_getuid(noop->noo_cred), cnp->cn_nameptr); + error = EIO; + } + if (!error && (newnofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { + nfs_mount_state_in_use_end(nmp, 0); + error = nfs4_reopen(newnofp, vfs_context_thread(ctx)); + nfs_open_file_destroy(newnofp); + newnofp = NULL; + if (!error) + goto restart; + } + if (!error) + error = nfs_open_file_set_busy(newnofp, vfs_context_thread(ctx)); + if (error) { + if (newnofp) + nfs_open_file_destroy(newnofp); + newnofp = NULL; + goto nfsmout; + } + if (anp) { + /* + * We already have the node. So we just need to open + * it - which we may be able to do with a delegation. + */ + open_error = error = nfs4_open(anp, newnofp, accessMode, denyMode, ctx); + if (!error) { + /* open succeeded, so our open file is no longer temporary */ + nofp = newnofp; + nofpbusyerror = 0; + newnofp = NULL; + if (nofpp) + *nofpp = nofp; + } + goto nfsmout; + } + } + + /* + * We either don't have the attrdir or we didn't find the attribute + * in the name cache, so we need to talk to the server. + * + * If we don't have the attrdir, we'll need to ask the server for that too. + * If the caller is requesting that the attribute be created, we need to + * make sure the attrdir is created. + * The caller may also request that the first block of an existing attribute + * be retrieved at the same time. 
+ */ + + if (open) { + /* need to mark the open owner busy during the RPC */ + if ((error = nfs_open_owner_set_busy(noop, thd))) + goto nfsmout; + noopbusy = 1; + } + + /* + * We'd like to get updated post-open/lookup attributes for the + * directory and we may also want to prefetch some data via READ. + * We'd like the READ results to be last so that we can leave the + * data in the mbufs until the end. + * + * At a minimum we're sending: PUTFH, LOOKUP/OPEN, GETATTR, PUTFH, GETATTR + */ + numops = 5; + if (!hadattrdir) + numops += 3; // also sending: OPENATTR, GETATTR, OPENATTR + if (prefetch) + numops += 4; // also sending: SAVEFH, RESTOREFH, NVERIFY, READ + nfsm_chain_build_alloc_init(error, &nmreq, 64 * NFSX_UNSIGNED + cnp->cn_namelen); + nfsm_chain_add_compound_header(error, &nmreq, "getnamedattr", numops); + if (hadattrdir) { + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, nmp->nm_vers, adnp->n_fhp, adnp->n_fhsize); + } else { + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, nmp->nm_vers, np->n_fhp, np->n_fhsize); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_OPENATTR); + nfsm_chain_add_32(error, &nmreq, create ? 
1 : 0); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); + NFS_COPY_ATTRIBUTES(nfs_getattr_bitmap, bitmap); + NFS_BITMAP_SET(bitmap, NFS_FATTR_FILEHANDLE); + nfsm_chain_add_bitmap_masked(error, &nmreq, bitmap, + NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + } + if (open) { + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_OPEN); + nfsm_chain_add_32(error, &nmreq, noop->noo_seqid); + nfsm_chain_add_32(error, &nmreq, accessMode); + nfsm_chain_add_32(error, &nmreq, denyMode); + nfsm_chain_add_64(error, &nmreq, nmp->nm_clientid); + nfsm_chain_add_32(error, &nmreq, NFSX_UNSIGNED); + nfsm_chain_add_32(error, &nmreq, kauth_cred_getuid(noop->noo_cred)); + nfsm_chain_add_32(error, &nmreq, create); + if (create) { + nfsm_chain_add_32(error, &nmreq, guarded); + VATTR_INIT(&vattr); + if (truncate) + VATTR_SET(&vattr, va_data_size, 0); + nfsm_chain_add_fattr4(error, &nmreq, &vattr, nmp); + } + nfsm_chain_add_32(error, &nmreq, NFS_CLAIM_NULL); + nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp); + } else { + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_LOOKUP); + nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp); } - nfsm_chain_add_string(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen); - nfsm_chain_add_fattr4(error, &nmreq, vap, nmp); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); NFS_COPY_ATTRIBUTES(nfs_getattr_bitmap, bitmap); NFS_BITMAP_SET(bitmap, NFS_FATTR_FILEHANDLE); nfsm_chain_add_bitmap_masked(error, &nmreq, bitmap, NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_RESTOREFH); + if (prefetch) { + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_SAVEFH); + } + if (hadattrdir) { + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, nmp->nm_vers, adnp->n_fhp, adnp->n_fhsize); + } else { + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, 
&nmreq, nmp->nm_vers, np->n_fhp, np->n_fhsize); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_OPENATTR); + nfsm_chain_add_32(error, &nmreq, 0); + } numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + if (prefetch) { + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_RESTOREFH); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_NVERIFY); + VATTR_INIT(&vattr); + VATTR_SET(&vattr, va_data_size, 0); + nfsm_chain_add_fattr4(error, &nmreq, &vattr, nmp); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_READ); + nfsm_chain_add_stateid(error, &nmreq, &stateid); + nfsm_chain_add_64(error, &nmreq, 0); + nfsm_chain_add_32(error, &nmreq, rlen); + } nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - - error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC4_COMPOUND, - vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, &req); - if (!error) { - nfs_dulookup_start(&dul, dnp, ctx); + error = nfs_request_async(hadattrdir ? adnp : np, NULL, &nmreq, NFSPROC4_COMPOUND, + vfs_context_thread(ctx), vfs_context_ucred(ctx), &si, open ? R_NOINTR: 0, NULL, &req); + if (!error) error = nfs_request_async_finish(req, &nmrep, &xid, &status); - } - if ((lockerror = nfs_node_lock(dnp))) - error = lockerror; + if (hadattrdir && ((adlockerror = nfs_node_lock(adnp)))) + error = adlockerror; + savedxid = xid; nfsm_chain_skip_tag(error, &nmrep); nfsm_chain_get_32(error, &nmrep, numops); nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); - nfsm_chain_op_check(error, &nmrep, NFS_OP_SAVEFH); - nfsmout_if(error); - nfsm_chain_op_check(error, &nmrep, NFS_OP_CREATE); - nfsm_chain_check_change_info(error, &nmrep, dnp); - bmlen = NFS_ATTR_BITMAP_LEN; - nfsm_chain_get_bitmap(error, &nmrep, bitmap, bmlen); - /* At this point if we have no error, the object was created. 
*/ - /* if we don't get attributes, then we should lookitup. */ - create_error = error; - nfsmout_if(error); - nfs_vattr_set_supported(bitmap, vap); + if (!hadattrdir) { + nfsm_chain_op_check(error, &nmrep, NFS_OP_OPENATTR); + nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); + nfsmout_if(error); + error = nfs4_parsefattr(&nmrep, NULL, &nvattr, &fh, NULL, NULL); + nfsmout_if(error); + if (NFS_BITMAP_ISSET(nvattr.nva_bitmap, NFS_FATTR_FILEHANDLE) && fh.fh_len) { + if (!np->n_attrdirfh || (*np->n_attrdirfh != fh.fh_len)) { + /* (re)allocate attrdir fh buffer */ + if (np->n_attrdirfh) + FREE(np->n_attrdirfh, M_TEMP); + MALLOC(np->n_attrdirfh, u_char*, fh.fh_len+1, M_TEMP, M_WAITOK); + } + if (np->n_attrdirfh) { + /* remember the attrdir fh in the node */ + *np->n_attrdirfh = fh.fh_len; + bcopy(fh.fh_data, np->n_attrdirfh+1, fh.fh_len); + /* create busied node for attrdir */ + struct componentname cn; + bzero(&cn, sizeof(cn)); + cn.cn_nameptr = __CAST_AWAY_QUALIFIER(_PATH_FORKSPECIFIER, const, char *); /* "/..namedfork/" */ + cn.cn_namelen = strlen(_PATH_FORKSPECIFIER); + cn.cn_nameiop = LOOKUP; + // XXX can't set parent correctly (to np) yet + error = nfs_nget(NFSTOMP(np), NULL, &cn, fh.fh_data, fh.fh_len, &nvattr, &xid, rq.r_auth, 0, &adnp); + if (!error) { + adlockerror = 0; + /* set the node busy */ + SET(adnp->n_flag, NBUSY); + adbusyerror = 0; + } + /* if no adnp, oh well... 
*/ + error = 0; + } + } + NVATTR_CLEANUP(&nvattr); + fh.fh_len = 0; + } + if (open) { + nfsm_chain_op_check(error, &nmrep, NFS_OP_OPEN); + nfs_owner_seqid_increment(noop, NULL, error); + nfsm_chain_get_stateid(error, &nmrep, &newnofp->nof_stateid); + nfsm_chain_check_change_info(error, &nmrep, adnp); + nfsm_chain_get_32(error, &nmrep, rflags); + bmlen = NFS_ATTR_BITMAP_LEN; + nfsm_chain_get_bitmap(error, &nmrep, bitmap, bmlen); + nfsm_chain_get_32(error, &nmrep, delegation); + if (!error) + switch (delegation) { + case NFS_OPEN_DELEGATE_NONE: + break; + case NFS_OPEN_DELEGATE_READ: + case NFS_OPEN_DELEGATE_WRITE: + nfsm_chain_get_stateid(error, &nmrep, &dstateid); + nfsm_chain_get_32(error, &nmrep, recall); + if (delegation == NFS_OPEN_DELEGATE_WRITE) // space (skip) XXX + nfsm_chain_adv(error, &nmrep, 3 * NFSX_UNSIGNED); + /* if we have any trouble accepting the ACE, just invalidate it */ + ace_type = ace_flags = ace_mask = len = 0; + nfsm_chain_get_32(error, &nmrep, ace_type); + nfsm_chain_get_32(error, &nmrep, ace_flags); + nfsm_chain_get_32(error, &nmrep, ace_mask); + nfsm_chain_get_32(error, &nmrep, len); + ace.ace_flags = nfs4_ace_nfstype_to_vfstype(ace_type, &error); + ace.ace_flags |= nfs4_ace_nfsflags_to_vfsflags(ace_flags); + ace.ace_rights = nfs4_ace_nfsmask_to_vfsrights(ace_mask); + if (!error && (len >= slen)) { + MALLOC(s, char*, len+1, M_TEMP, M_WAITOK); + if (s) + slen = len+1; + else + ace.ace_flags = 0; + } + if (s) + nfsm_chain_get_opaque(error, &nmrep, len, s); + else + nfsm_chain_adv(error, &nmrep, nfsm_rndup(len)); + if (!error && s) { + s[len] = '\0'; + if (nfs4_id2guid(s, &ace.ace_applicable, (ace_flags & NFS_ACE_IDENTIFIER_GROUP))) + ace.ace_flags = 0; + } + if (error || !s) + ace.ace_flags = 0; + if (s && (s != sbuf)) + FREE(s, M_TEMP); + break; + default: + error = EBADRPC; + break; + } + /* At this point if we have no error, the object was created/opened. 
*/ + open_error = error; + } else { + nfsm_chain_op_check(error, &nmrep, NFS_OP_LOOKUP); + } nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); nfsmout_if(error); - NFS_CLEAR_ATTRIBUTES(nvattr.nva_bitmap); - error = nfs4_parsefattr(&nmrep, NULL, &nvattr, &fh, NULL); + error = nfs4_parsefattr(&nmrep, NULL, &nvattr, &fh, NULL, NULL); nfsmout_if(error); - if (!NFS_BITMAP_ISSET(nvattr.nva_bitmap, NFS_FATTR_FILEHANDLE)) { - printf("nfs: create/%s didn't return filehandle?\n", tag); - error = EBADRPC; + if (!NFS_BITMAP_ISSET(nvattr.nva_bitmap, NFS_FATTR_FILEHANDLE) || !fh.fh_len) { + error = EIO; goto nfsmout; } - /* directory attributes: if we don't get them, make sure to invalidate */ - nfsm_chain_op_check(error, &nmrep, NFS_OP_RESTOREFH); + if (prefetch) + nfsm_chain_op_check(error, &nmrep, NFS_OP_SAVEFH); + nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); + if (!hadattrdir) + nfsm_chain_op_check(error, &nmrep, NFS_OP_OPENATTR); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - savedxid = xid; - nfsm_chain_loadattr(error, &nmrep, dnp, nfsvers, NULL, &xid); - if (error) - NATTRINVALIDATE(dnp); - -nfsmout: - nfsm_chain_cleanup(&nmreq); - nfsm_chain_cleanup(&nmrep); + nfsmout_if(error); + xid = savedxid; + nfsm_chain_loadattr(error, &nmrep, adnp, nmp->nm_vers, &xid); + nfsmout_if(error); - if (!lockerror) { - if (!create_error && (dnp->n_flag & NNEGNCENTRIES)) { - dnp->n_flag &= ~NNEGNCENTRIES; - cache_purge_negatives(NFSTOV(dnp)); + if (open) { + if (rflags & NFS_OPEN_RESULT_LOCKTYPE_POSIX) + newnofp->nof_flags |= NFS_OPEN_FILE_POSIXLOCK; + if (rflags & NFS_OPEN_RESULT_CONFIRM) { + if (adnp) { + nfs_node_unlock(adnp); + adlockerror = ENOENT; + } + NVATTR_CLEANUP(&nvattr); + error = nfs4_open_confirm_rpc(nmp, adnp ? 
adnp : np, fh.fh_data, fh.fh_len, noop, &newnofp->nof_stateid, thd, cred, &nvattr, &xid); + nfsmout_if(error); + savedxid = xid; + if ((adlockerror = nfs_node_lock(adnp))) + error = adlockerror; } - dnp->n_flag |= NMODIFIED; - nfs_node_unlock(dnp); - /* nfs_getattr() will check changed and purge caches */ - nfs_getattr(dnp, &dnvattr, ctx, NGA_CACHED); } - if (!error && fh.fh_len) { +nfsmout: + if (open && adnp && !adlockerror) { + if (!open_error && (adnp->n_flag & NNEGNCENTRIES)) { + adnp->n_flag &= ~NNEGNCENTRIES; + cache_purge_negatives(NFSTOV(adnp)); + } + adnp->n_flag |= NMODIFIED; + nfs_node_unlock(adnp); + adlockerror = ENOENT; + nfs_getattr(adnp, NULL, ctx, NGA_CACHED); + } + if (adnp && !adlockerror && (error == ENOENT) && + (cnp->cn_flags & MAKEENTRY) && (cnp->cn_nameiop != CREATE) && negnamecache) { + /* add a negative entry in the name cache */ + cache_enter(NFSTOV(adnp), NULL, cnp); + adnp->n_flag |= NNEGNCENTRIES; + } + if (adnp && !adlockerror) { + nfs_node_unlock(adnp); + adlockerror = ENOENT; + } + if (!error && !anp && fh.fh_len) { /* create the vnode with the filehandle and attributes */ xid = savedxid; - error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, NG_MAKEENTRY, &np); - if (!error) - newvp = NFSTOV(np); + error = nfs_nget(NFSTOMP(np), adnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, rq.r_auth, NG_MAKEENTRY, &anp); + if (!error) { + *anpp = anp; + nfs_node_unlock(anp); + } + if (!error && open) { + nfs_open_file_add_open(newnofp, accessMode, denyMode, 0); + /* After we have a node, add our open file struct to the node */ + nofp = newnofp; + error = nfs_open_file_find_internal(anp, noop, &nofp, 0, 0, 0); + if (error) { + /* This shouldn't happen, because we passed in a new nofp to use. */ + printf("nfs_open_file_find_internal failed! %d\n", error); + nofp = NULL; + } else if (nofp != newnofp) { + /* + * Hmm... an open file struct already exists. + * Mark the existing one busy and merge our open into it. 
+ * Then destroy the one we created. + * Note: there's no chance of an open confict because the + * open has already been granted. + */ + nofpbusyerror = nfs_open_file_set_busy(nofp, NULL); + nfs_open_file_add_open(nofp, accessMode, denyMode, 0); + nofp->nof_stateid = newnofp->nof_stateid; + if (newnofp->nof_flags & NFS_OPEN_FILE_POSIXLOCK) + nofp->nof_flags |= NFS_OPEN_FILE_POSIXLOCK; + nfs_open_file_clear_busy(newnofp); + nfs_open_file_destroy(newnofp); + newnofp = NULL; + } + if (!error) { + newnofp = NULL; + nofpbusyerror = 0; + /* mark the node as holding a create-initiated open */ + nofp->nof_flags |= NFS_OPEN_FILE_CREATE; + nofp->nof_creator = current_thread(); + if (nofpp) + *nofpp = nofp; + } + } } + NVATTR_CLEANUP(&nvattr); + if (open && ((delegation == NFS_OPEN_DELEGATE_READ) || (delegation == NFS_OPEN_DELEGATE_WRITE))) { + if (!error && anp && !recall) { + /* stuff the delegation state in the node */ + lck_mtx_lock(&anp->n_openlock); + anp->n_openflags &= ~N_DELEG_MASK; + anp->n_openflags |= ((delegation == NFS_OPEN_DELEGATE_READ) ? N_DELEG_READ : N_DELEG_WRITE); + anp->n_dstateid = dstateid; + anp->n_dace = ace; + if (anp->n_dlink.tqe_next == NFSNOLIST) { + lck_mtx_lock(&nmp->nm_lock); + if (anp->n_dlink.tqe_next == NFSNOLIST) + TAILQ_INSERT_TAIL(&nmp->nm_delegations, anp, n_dlink); + lck_mtx_unlock(&nmp->nm_lock); + } + lck_mtx_unlock(&anp->n_openlock); + } else { + /* give the delegation back */ + if (anp) { + if (NFS_CMPFH(anp, fh.fh_data, fh.fh_len)) { + /* update delegation state and return it */ + lck_mtx_lock(&anp->n_openlock); + anp->n_openflags &= ~N_DELEG_MASK; + anp->n_openflags |= ((delegation == NFS_OPEN_DELEGATE_READ) ? 
N_DELEG_READ : N_DELEG_WRITE); + anp->n_dstateid = dstateid; + anp->n_dace = ace; + if (anp->n_dlink.tqe_next == NFSNOLIST) { + lck_mtx_lock(&nmp->nm_lock); + if (anp->n_dlink.tqe_next == NFSNOLIST) + TAILQ_INSERT_TAIL(&nmp->nm_delegations, anp, n_dlink); + lck_mtx_unlock(&nmp->nm_lock); + } + lck_mtx_unlock(&anp->n_openlock); + /* don't need to send a separate delegreturn for fh */ + fh.fh_len = 0; + } + /* return anp's current delegation */ + nfs4_delegation_return(anp, 0, thd, cred); + } + if (fh.fh_len) /* return fh's delegation if it wasn't for anp */ + nfs4_delegreturn_rpc(nmp, fh.fh_data, fh.fh_len, &dstateid, 0, thd, cred); + } + } + if (open) { + if (newnofp) { + /* need to cleanup our temporary nofp */ + nfs_open_file_clear_busy(newnofp); + nfs_open_file_destroy(newnofp); + newnofp = NULL; + } else if (nofp && !nofpbusyerror) { + nfs_open_file_clear_busy(nofp); + nofpbusyerror = ENOENT; + } + if (inuse && nfs_mount_state_in_use_end(nmp, error)) { + inuse = 0; + nofp = newnofp = NULL; + rflags = delegation = recall = eof = rlen = retlen = 0; + ace.ace_flags = 0; + s = sbuf; + slen = sizeof(sbuf); + nfsm_chain_cleanup(&nmreq); + nfsm_chain_cleanup(&nmrep); + if (anp) { + vnode_put(NFSTOV(anp)); + *anpp = anp = NULL; + } + hadattrdir = (adnp != NULL); + if (noopbusy) { + nfs_open_owner_clear_busy(noop); + noopbusy = 0; + } + goto restart; + } + if (noop) { + if (noopbusy) { + nfs_open_owner_clear_busy(noop); + noopbusy = 0; + } + nfs_open_owner_rele(noop); + } + } + if (!error && prefetch && nmrep.nmc_mhead) { + nfsm_chain_op_check(error, &nmrep, NFS_OP_RESTOREFH); + nfsm_chain_op_check(error, &nmrep, NFS_OP_NVERIFY); + nfsm_chain_op_check(error, &nmrep, NFS_OP_READ); + nfsm_chain_get_32(error, &nmrep, eof); + nfsm_chain_get_32(error, &nmrep, retlen); + if (!error && anp) { + /* + * There can be one problem with doing the prefetch. + * Because we don't have the node before we start the RPC, we + * can't have the buffer busy while the READ is performed. 
+ * So there is a chance that other I/O occured on the same + * range of data while we were performing this RPC. If that + * happens, then it's possible the data we have in the READ + * response is no longer up to date. + * Once we have the node and the buffer, we need to make sure + * that there's no chance we could be putting stale data in + * the buffer. + * So, we check if the range read is dirty or if any I/O may + * have occured on it while we were performing our RPC. + */ + struct nfsbuf *bp = NULL; + int lastpg; + uint32_t pagemask; + + retlen = MIN(retlen, rlen); + + /* check if node needs size update or invalidation */ + if (ISSET(anp->n_flag, NUPDATESIZE)) + nfs_data_update_size(anp, 0); + if (!(error = nfs_node_lock(anp))) { + if (anp->n_flag & NNEEDINVALIDATE) { + anp->n_flag &= ~NNEEDINVALIDATE; + nfs_node_unlock(anp); + error = nfs_vinvalbuf(NFSTOV(anp), V_SAVE|V_IGNORE_WRITEERR, ctx, 1); + if (!error) /* lets play it safe and just drop the data */ + error = EIO; + } else { + nfs_node_unlock(anp); + } + } - nfs_dulookup_finish(&dul, dnp, ctx); - - /* - * Kludge: Map EEXIST => 0 assuming that you have a reply to a retry - * if we can succeed in looking up the object. 
- */ - if ((create_error == EEXIST) || (!create_error && !newvp)) { - error = nfs_lookitup(dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx, &np); - if (!error) { - newvp = NFSTOV(np); - if (vnode_vtype(newvp) != VLNK) - error = EEXIST; + /* calculate page mask for the range of data read */ + lastpg = (trunc_page_32(retlen) - 1) / PAGE_SIZE; + pagemask = ((1 << (lastpg + 1)) - 1); + + if (!error) + error = nfs_buf_get(anp, 0, nmp->nm_biosize, thd, NBLK_READ|NBLK_NOWAIT, &bp); + /* don't save the data if dirty or potential I/O conflict */ + if (!error && bp && !bp->nb_dirtyoff && !(bp->nb_dirty & pagemask) && + timevalcmp(&anp->n_lastio, &now, <)) { + OSAddAtomic(1, &nfsstats.read_bios); + CLR(bp->nb_flags, (NB_DONE|NB_ASYNC)); + SET(bp->nb_flags, NB_READ); + NFS_BUF_MAP(bp); + nfsm_chain_get_opaque(error, &nmrep, retlen, bp->nb_data); + if (error) { + bp->nb_error = error; + SET(bp->nb_flags, NB_ERROR); + } else { + bp->nb_offio = 0; + bp->nb_endio = rlen; + if ((retlen > 0) && (bp->nb_endio < (int)retlen)) + bp->nb_endio = retlen; + if (eof || (retlen == 0)) { + /* zero out the remaining data (up to EOF) */ + off_t rpcrem, eofrem, rem; + rpcrem = (rlen - retlen); + eofrem = anp->n_size - (NBOFF(bp) + retlen); + rem = (rpcrem < eofrem) ? rpcrem : eofrem; + if (rem > 0) + bzero(bp->nb_data + retlen, rem); + } else if ((retlen < rlen) && !ISSET(bp->nb_flags, NB_ERROR)) { + /* ugh... short read ... just invalidate for now... 
*/ + SET(bp->nb_flags, NB_INVAL); + } + } + nfs_buf_read_finish(bp); + microuptime(&anp->n_lastio); + } + if (bp) + nfs_buf_release(bp, 1); } + error = 0; /* ignore any transient error in processing the prefetch */ } - if (!busyerror) - nfs_node_clear_busy(dnp); - if (error) { - if (newvp) { - nfs_node_unlock(np); - vnode_put(newvp); + if (adnp && !adbusyerror) { + nfs_node_clear_busy(adnp); + adbusyerror = ENOENT; + } + if (!busyerror) { + nfs_node_clear_busy(np); + busyerror = ENOENT; + } + if (adnp) + vnode_put(NFSTOV(adnp)); + if (error && *anpp) { + vnode_put(NFSTOV(*anpp)); + *anpp = NULL; + } + nfsm_chain_cleanup(&nmreq); + nfsm_chain_cleanup(&nmrep); + return (error); +} + +/* + * Remove a named attribute. + */ +int +nfs4_named_attr_remove(nfsnode_t np, nfsnode_t anp, const char *name, vfs_context_t ctx) +{ + nfsnode_t adnp = NULL; + struct nfsmount *nmp; + struct componentname cn; + struct vnop_remove_args vra; + int error, putanp = 0; + + nmp = NFSTONMP(np); + if (!nmp) + return (ENXIO); + + bzero(&cn, sizeof(cn)); + cn.cn_nameptr = __CAST_AWAY_QUALIFIER(name, const, char *); + cn.cn_namelen = strlen(name); + cn.cn_nameiop = DELETE; + cn.cn_flags = 0; + + if (!anp) { + error = nfs4_named_attr_get(np, &cn, NFS_OPEN_SHARE_ACCESS_NONE, + 0, ctx, &anp, NULL); + if ((!error && !anp) || (error == ENOATTR)) + error = ENOENT; + if (error) { + if (anp) { + vnode_put(NFSTOV(anp)); + anp = NULL; + } + goto out; } - } else { - nfs_node_unlock(np); - *npp = np; + putanp = 1; + } + + if ((error = nfs_node_set_busy(np, vfs_context_thread(ctx)))) + goto out; + adnp = nfs4_named_attr_dir_get(np, 1, ctx); + nfs_node_clear_busy(np); + if (!adnp) { + error = ENOENT; + goto out; } + + vra.a_desc = &vnop_remove_desc; + vra.a_dvp = NFSTOV(adnp); + vra.a_vp = NFSTOV(anp); + vra.a_cnp = &cn; + vra.a_flags = 0; + vra.a_context = ctx; + error = nfs_vnop_remove(&vra); +out: + if (adnp) + vnode_put(NFSTOV(adnp)); + if (putanp) + vnode_put(NFSTOV(anp)); return (error); } int 
-nfs4_vnop_mknod( - struct vnop_mknod_args /* { +nfs4_vnop_getxattr( + struct vnop_getxattr_args /* { struct vnodeop_desc *a_desc; - vnode_t a_dvp; - vnode_t *a_vpp; - struct componentname *a_cnp; - struct vnode_attr *a_vap; + vnode_t a_vp; + const char * a_name; + uio_t a_uio; + size_t *a_size; + int a_options; vfs_context_t a_context; } */ *ap) { - nfsnode_t np = NULL; + vfs_context_t ctx = ap->a_context; struct nfsmount *nmp; - int error; + struct nfs_vattr nvattr; + struct componentname cn; + nfsnode_t anp; + int error = 0, isrsrcfork; - nmp = VTONMP(ap->a_dvp); + nmp = VTONMP(ap->a_vp); if (!nmp) return (ENXIO); - if (!VATTR_IS_ACTIVE(ap->a_vap, va_type)) - return (EINVAL); - switch (ap->a_vap->va_type) { - case VBLK: - case VCHR: - case VFIFO: - case VSOCK: - break; - default: + if (!(nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR)) return (ENOTSUP); + error = nfs_getattr(VTONFS(ap->a_vp), &nvattr, ctx, NGA_CACHED); + if (error) + return (error); + if (NFS_BITMAP_ISSET(nvattr.nva_bitmap, NFS_FATTR_NAMED_ATTR) && + !(nvattr.nva_flags & NFS_FFLAG_HAS_NAMED_ATTRS)) + return (ENOATTR); + + bzero(&cn, sizeof(cn)); + cn.cn_nameptr = __CAST_AWAY_QUALIFIER(ap->a_name, const, char *); + cn.cn_namelen = strlen(ap->a_name); + cn.cn_nameiop = LOOKUP; + cn.cn_flags = MAKEENTRY; + + /* we'll normally try to prefetch data for xattrs... the resource fork is really a stream */ + isrsrcfork = (bcmp(ap->a_name, XATTR_RESOURCEFORK_NAME, sizeof(XATTR_RESOURCEFORK_NAME)) == 0); + + error = nfs4_named_attr_get(VTONFS(ap->a_vp), &cn, NFS_OPEN_SHARE_ACCESS_NONE, + !isrsrcfork ? 
NFS_GET_NAMED_ATTR_PREFETCH : 0, ctx, &anp, NULL); + if ((!error && !anp) || (error == ENOENT)) + error = ENOATTR; + if (!error) { + if (ap->a_uio) + error = nfs_bioread(anp, ap->a_uio, 0, ctx); + else + *ap->a_size = anp->n_size; } + if (anp) + vnode_put(NFSTOV(anp)); + return (error); +} - error = nfs4_create_rpc(ap->a_context, VTONFS(ap->a_dvp), ap->a_cnp, ap->a_vap, - vtonfs_type(ap->a_vap->va_type, nmp->nm_vers), NULL, &np); +int +nfs4_vnop_setxattr( + struct vnop_setxattr_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + const char * a_name; + uio_t a_uio; + int a_options; + vfs_context_t a_context; + } */ *ap) +{ + vfs_context_t ctx = ap->a_context; + int options = ap->a_options; + uio_t uio = ap->a_uio; + const char *name = ap->a_name; + struct nfsmount *nmp; + struct componentname cn; + nfsnode_t anp = NULL; + int error = 0, closeerror = 0, flags, isrsrcfork, isfinderinfo, empty = 0, i; +#define FINDERINFOSIZE 32 + uint8_t finfo[FINDERINFOSIZE]; + uint32_t *finfop; + struct nfs_open_file *nofp = NULL; + char uio_buf [ UIO_SIZEOF(1) ]; + uio_t auio; + struct vnop_write_args vwa; + + nmp = VTONMP(ap->a_vp); + if (!nmp) + return (ENXIO); + + if (!(nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR)) + return (ENOTSUP); + + if ((options & XATTR_CREATE) && (options & XATTR_REPLACE)) + return (EINVAL); + + /* XXX limitation based on need to back up uio on short write */ + if (uio_iovcnt(uio) > 1) { + printf("nfs4_vnop_setxattr: iovcnt > 1\n"); + return (EINVAL); + } + + bzero(&cn, sizeof(cn)); + cn.cn_nameptr = __CAST_AWAY_QUALIFIER(name, const, char *); + cn.cn_namelen = strlen(name); + cn.cn_nameiop = CREATE; + cn.cn_flags = MAKEENTRY; + + isfinderinfo = (bcmp(name, XATTR_FINDERINFO_NAME, sizeof(XATTR_FINDERINFO_NAME)) == 0); + isrsrcfork = isfinderinfo ? 
0 : (bcmp(name, XATTR_RESOURCEFORK_NAME, sizeof(XATTR_RESOURCEFORK_NAME)) == 0); + if (!isrsrcfork) + uio_setoffset(uio, 0); + if (isfinderinfo) { + if (uio_resid(uio) != sizeof(finfo)) + return (ERANGE); + error = uiomove((char*)&finfo, sizeof(finfo), uio); + if (error) + return (error); + /* setting a FinderInfo of all zeroes means remove the FinderInfo */ + empty = 1; + for (i=0, finfop=(uint32_t*)&finfo; i < (int)(sizeof(finfo)/sizeof(uint32_t)); i++) + if (finfop[i]) { + empty = 0; + break; + } + if (empty && !(options & (XATTR_CREATE|XATTR_REPLACE))) { + error = nfs4_named_attr_remove(VTONFS(ap->a_vp), anp, name, ctx); + if (error == ENOENT) + error = 0; + return (error); + } + /* first, let's see if we get a create/replace error */ + } + + /* + * create/open the xattr + * + * We need to make sure not to create it if XATTR_REPLACE. + * For all xattrs except the resource fork, we also want to + * truncate the xattr to remove any current data. We'll do + * that by setting the size to 0 on create/open. + */ + flags = 0; + if (!(options & XATTR_REPLACE)) + flags |= NFS_GET_NAMED_ATTR_CREATE; + if (options & XATTR_CREATE) + flags |= NFS_GET_NAMED_ATTR_CREATE_GUARDED; + if (!isrsrcfork) + flags |= NFS_GET_NAMED_ATTR_TRUNCATE; + + error = nfs4_named_attr_get(VTONFS(ap->a_vp), &cn, NFS_OPEN_SHARE_ACCESS_BOTH, + flags, ctx, &anp, &nofp); + if (!error && !anp) + error = ENOATTR; + if (error) + goto out; + /* grab the open state from the get/create/open */ + if (nofp && !(error = nfs_open_file_set_busy(nofp, NULL))) { + nofp->nof_flags &= ~NFS_OPEN_FILE_CREATE; + nofp->nof_creator = NULL; + nfs_open_file_clear_busy(nofp); + } + + /* Setting an empty FinderInfo really means remove it, skip to the close/remove */ + if (isfinderinfo && empty) + goto doclose; + + /* + * Write the data out and flush. + * + * For FinderInfo, we've already copied the data to finfo, so do I/O from there. 
+ */ + vwa.a_desc = &vnop_write_desc; + vwa.a_vp = NFSTOV(anp); + vwa.a_uio = NULL; + vwa.a_ioflag = 0; + vwa.a_context = ctx; + if (isfinderinfo) { + auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_WRITE, &uio_buf, sizeof(uio_buf)); + uio_addiov(auio, (uintptr_t)&finfo, sizeof(finfo)); + vwa.a_uio = auio; + } else if (uio_resid(uio) > 0) { + vwa.a_uio = uio; + } + if (vwa.a_uio) { + error = nfs_vnop_write(&vwa); + if (!error) + error = nfs_flush(anp, MNT_WAIT, vfs_context_thread(ctx), 0); + } +doclose: + /* Close the xattr. */ + if (nofp) { + int busyerror = nfs_open_file_set_busy(nofp, NULL); + closeerror = nfs_close(anp, nofp, NFS_OPEN_SHARE_ACCESS_BOTH, NFS_OPEN_SHARE_DENY_NONE, ctx); + if (!busyerror) + nfs_open_file_clear_busy(nofp); + } + if (!error && isfinderinfo && empty) { /* Setting an empty FinderInfo really means remove it */ + error = nfs4_named_attr_remove(VTONFS(ap->a_vp), anp, name, ctx); + if (error == ENOENT) + error = 0; + } if (!error) - *ap->a_vpp = NFSTOV(np); + error = closeerror; +out: + if (anp) + vnode_put(NFSTOV(anp)); + if (error == ENOENT) + error = ENOATTR; return (error); } int -nfs4_vnop_mkdir( - struct vnop_mkdir_args /* { +nfs4_vnop_removexattr( + struct vnop_removexattr_args /* { struct vnodeop_desc *a_desc; - vnode_t a_dvp; - vnode_t *a_vpp; - struct componentname *a_cnp; - struct vnode_attr *a_vap; + vnode_t a_vp; + const char * a_name; + int a_options; vfs_context_t a_context; } */ *ap) { - nfsnode_t np = NULL; + struct nfsmount *nmp = VTONMP(ap->a_vp); int error; - error = nfs4_create_rpc(ap->a_context, VTONFS(ap->a_dvp), ap->a_cnp, ap->a_vap, - NFDIR, NULL, &np); - if (!error) - *ap->a_vpp = NFSTOV(np); + if (!nmp) + return (ENXIO); + if (!(nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR)) + return (ENOTSUP); + + error = nfs4_named_attr_remove(VTONFS(ap->a_vp), NULL, ap->a_name, ap->a_context); + if (error == ENOENT) + error = ENOATTR; return (error); } int -nfs4_vnop_symlink( - struct vnop_symlink_args /* { 
+nfs4_vnop_listxattr( + struct vnop_listxattr_args /* { struct vnodeop_desc *a_desc; - vnode_t a_dvp; - vnode_t *a_vpp; - struct componentname *a_cnp; - struct vnode_attr *a_vap; - char *a_target; + vnode_t a_vp; + uio_t a_uio; + size_t *a_size; + int a_options; vfs_context_t a_context; } */ *ap) { - nfsnode_t np = NULL; - int error; + vfs_context_t ctx = ap->a_context; + nfsnode_t np = VTONFS(ap->a_vp); + uio_t uio = ap->a_uio; + nfsnode_t adnp = NULL; + struct nfsmount *nmp; + int error, done, i; + struct nfs_vattr nvattr; + uint64_t cookie, nextcookie, lbn = 0; + struct nfsbuf *bp = NULL; + struct nfs_dir_buf_header *ndbhp; + struct direntry *dp; - error = nfs4_create_rpc(ap->a_context, VTONFS(ap->a_dvp), ap->a_cnp, ap->a_vap, - NFLNK, ap->a_target, &np); - if (!error) - *ap->a_vpp = NFSTOV(np); + nmp = VTONMP(ap->a_vp); + if (!nmp) + return (ENXIO); + + if (!(nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR)) + return (ENOTSUP); + + error = nfs_getattr(np, &nvattr, ctx, NGA_CACHED); + if (error) + return (error); + if (NFS_BITMAP_ISSET(nvattr.nva_bitmap, NFS_FATTR_NAMED_ATTR) && + !(nvattr.nva_flags & NFS_FFLAG_HAS_NAMED_ATTRS)) + return (0); + + if ((error = nfs_node_set_busy(np, vfs_context_thread(ctx)))) + return (error); + adnp = nfs4_named_attr_dir_get(np, 1, ctx); + nfs_node_clear_busy(np); + if (!adnp) + goto out; + + if ((error = nfs_node_lock(adnp))) + goto out; + + if (adnp->n_flag & NNEEDINVALIDATE) { + adnp->n_flag &= ~NNEEDINVALIDATE; + nfs_invaldir(adnp); + nfs_node_unlock(adnp); + error = nfs_vinvalbuf(NFSTOV(adnp), 0, ctx, 1); + if (!error) + error = nfs_node_lock(adnp); + if (error) + goto out; + } + + /* + * check for need to invalidate when (re)starting at beginning + */ + if (adnp->n_flag & NMODIFIED) { + nfs_invaldir(adnp); + nfs_node_unlock(adnp); + if ((error = nfs_vinvalbuf(NFSTOV(adnp), 0, ctx, 1))) + goto out; + } else { + nfs_node_unlock(adnp); + } + /* nfs_getattr() will check changed and purge caches */ + if ((error = 
nfs_getattr(adnp, &nvattr, ctx, NGA_UNCACHED))) + goto out; + + if (uio && (uio_resid(uio) == 0)) + goto out; + + done = 0; + nextcookie = lbn = 0; + + while (!error && !done) { + OSAddAtomic(1, &nfsstats.biocache_readdirs); + cookie = nextcookie; +getbuffer: + error = nfs_buf_get(adnp, lbn, NFS_DIRBLKSIZ, vfs_context_thread(ctx), NBLK_READ, &bp); + if (error) + goto out; + ndbhp = (struct nfs_dir_buf_header*)bp->nb_data; + if (!ISSET(bp->nb_flags, NB_CACHE) || !ISSET(ndbhp->ndbh_flags, NDB_FULL)) { + if (!ISSET(bp->nb_flags, NB_CACHE)) { /* initialize the buffer */ + ndbhp->ndbh_flags = 0; + ndbhp->ndbh_count = 0; + ndbhp->ndbh_entry_end = sizeof(*ndbhp); + ndbhp->ndbh_ncgen = adnp->n_ncgen; + } + error = nfs_buf_readdir(bp, ctx); + if (error == NFSERR_DIRBUFDROPPED) + goto getbuffer; + if (error) + nfs_buf_release(bp, 1); + if (error && (error != ENXIO) && (error != ETIMEDOUT) && (error != EINTR) && (error != ERESTART)) { + if (!nfs_node_lock(adnp)) { + nfs_invaldir(adnp); + nfs_node_unlock(adnp); + } + nfs_vinvalbuf(NFSTOV(adnp), 0, ctx, 1); + if (error == NFSERR_BAD_COOKIE) + error = ENOENT; + } + if (error) + goto out; + } + + /* go through all the entries copying/counting */ + dp = NFS_DIR_BUF_FIRST_DIRENTRY(bp); + for (i=0; i < ndbhp->ndbh_count; i++) { + if (!xattr_protected(dp->d_name)) { + if (uio == NULL) { + *ap->a_size += dp->d_namlen + 1; + } else if (uio_resid(uio) < (dp->d_namlen + 1)) { + error = ERANGE; + } else { + error = uiomove(dp->d_name, dp->d_namlen+1, uio); + if (error && (error != EFAULT)) + error = ERANGE; + } + } + nextcookie = dp->d_seekoff; + dp = NFS_DIRENTRY_NEXT(dp); + } + + if (i == ndbhp->ndbh_count) { + /* hit end of buffer, move to next buffer */ + lbn = nextcookie; + /* if we also hit EOF, we're done */ + if (ISSET(ndbhp->ndbh_flags, NDB_EOF)) + done = 1; + } + if (!error && !done && (nextcookie == cookie)) { + printf("nfs readdir cookie didn't change 0x%llx, %d/%d\n", cookie, i, ndbhp->ndbh_count); + error = EIO; + } + 
nfs_buf_release(bp, 1); + } +out: + if (adnp) + vnode_put(NFSTOV(adnp)); return (error); } +#if NAMEDSTREAMS int -nfs4_vnop_link( - struct vnop_link_args /* { +nfs4_vnop_getnamedstream( + struct vnop_getnamedstream_args /* { struct vnodeop_desc *a_desc; vnode_t a_vp; - vnode_t a_tdvp; - struct componentname *a_cnp; + vnode_t *a_svpp; + const char *a_name; + enum nsoperation a_operation; + int a_flags; vfs_context_t a_context; } */ *ap) { vfs_context_t ctx = ap->a_context; - vnode_t vp = ap->a_vp; - vnode_t tdvp = ap->a_tdvp; - struct componentname *cnp = ap->a_cnp; - int error = 0, lockerror = ENOENT, status; struct nfsmount *nmp; - nfsnode_t np = VTONFS(vp); - nfsnode_t tdnp = VTONFS(tdvp); - int nfsvers, numops; - u_int64_t xid, savedxid; - struct nfsm_chain nmreq, nmrep; - - if (vnode_mount(vp) != vnode_mount(tdvp)) - return (EXDEV); + struct nfs_vattr nvattr; + struct componentname cn; + nfsnode_t anp; + int error = 0; - nmp = VTONMP(vp); + nmp = VTONMP(ap->a_vp); if (!nmp) return (ENXIO); - nfsvers = nmp->nm_vers; - - /* - * Push all writes to the server, so that the attribute cache - * doesn't get "out of sync" with the server. - * XXX There should be a better way! 
- */ - nfs_flush(np, MNT_WAIT, vfs_context_thread(ctx), V_IGNORE_WRITEERR); - if ((error = nfs_node_set_busy2(tdnp, np, vfs_context_thread(ctx)))) + if (!(nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR)) + return (ENOTSUP); + error = nfs_getattr(VTONFS(ap->a_vp), &nvattr, ctx, NGA_CACHED); + if (error) return (error); + if (NFS_BITMAP_ISSET(nvattr.nva_bitmap, NFS_FATTR_NAMED_ATTR) && + !(nvattr.nva_flags & NFS_FFLAG_HAS_NAMED_ATTRS)) + return (ENOATTR); - nfsm_chain_null(&nmreq); - nfsm_chain_null(&nmrep); - - // PUTFH(SOURCE), SAVEFH, PUTFH(DIR), LINK, GETATTR(DIR), RESTOREFH, GETATTR - numops = 7; - nfsm_chain_build_alloc_init(error, &nmreq, 29 * NFSX_UNSIGNED + cnp->cn_namelen); - nfsm_chain_add_compound_header(error, &nmreq, "link", numops); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); - nfsm_chain_add_fh(error, &nmreq, nfsvers, np->n_fhp, np->n_fhsize); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_SAVEFH); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); - nfsm_chain_add_fh(error, &nmreq, nfsvers, tdnp->n_fhp, tdnp->n_fhsize); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_LINK); - nfsm_chain_add_string(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_RESTOREFH); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); - nfsm_chain_build_done(error, &nmreq); - nfsm_assert(error, (numops == 0), EPROTO); - nfsmout_if(error); - error = nfs_request(tdnp, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &nmrep, &xid, &status); - - if ((lockerror = nfs_node_lock2(tdnp, np))) { - error = lockerror; - goto nfsmout; - } - nfsm_chain_skip_tag(error, &nmrep); - 
nfsm_chain_get_32(error, &nmrep, numops); - nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); - nfsm_chain_op_check(error, &nmrep, NFS_OP_SAVEFH); - nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); - nfsm_chain_op_check(error, &nmrep, NFS_OP_LINK); - nfsm_chain_check_change_info(error, &nmrep, tdnp); - /* directory attributes: if we don't get them, make sure to invalidate */ - nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - savedxid = xid; - nfsm_chain_loadattr(error, &nmrep, tdnp, nfsvers, NULL, &xid); - if (error) - NATTRINVALIDATE(tdnp); - /* link attributes: if we don't get them, make sure to invalidate */ - nfsm_chain_op_check(error, &nmrep, NFS_OP_RESTOREFH); - nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - xid = savedxid; - nfsm_chain_loadattr(error, &nmrep, np, nfsvers, NULL, &xid); - if (error) - NATTRINVALIDATE(np); -nfsmout: - nfsm_chain_cleanup(&nmreq); - nfsm_chain_cleanup(&nmrep); - if (!lockerror) - tdnp->n_flag |= NMODIFIED; - /* Kludge: Map EEXIST => 0 assuming that it is a reply to a retry. 
*/ - if (error == EEXIST) - error = 0; - if (!error && (tdnp->n_flag & NNEGNCENTRIES)) { - tdnp->n_flag &= ~NNEGNCENTRIES; - cache_purge_negatives(tdvp); - } - if (!lockerror) - nfs_node_unlock2(tdnp, np); - nfs_node_clear_busy2(tdnp, np); + bzero(&cn, sizeof(cn)); + cn.cn_nameptr = __CAST_AWAY_QUALIFIER(ap->a_name, const, char *); + cn.cn_namelen = strlen(ap->a_name); + cn.cn_nameiop = LOOKUP; + cn.cn_flags = MAKEENTRY; + + error = nfs4_named_attr_get(VTONFS(ap->a_vp), &cn, NFS_OPEN_SHARE_ACCESS_NONE, + 0, ctx, &anp, NULL); + if ((!error && !anp) || (error == ENOENT)) + error = ENOATTR; + if (!error && anp) + *ap->a_svpp = NFSTOV(anp); + else if (anp) + vnode_put(NFSTOV(anp)); return (error); } int -nfs4_vnop_rmdir( - struct vnop_rmdir_args /* { +nfs4_vnop_makenamedstream( + struct vnop_makenamedstream_args /* { struct vnodeop_desc *a_desc; - vnode_t a_dvp; + vnode_t *a_svpp; vnode_t a_vp; - struct componentname *a_cnp; + const char *a_name; + int a_flags; vfs_context_t a_context; } */ *ap) { vfs_context_t ctx = ap->a_context; - vnode_t vp = ap->a_vp; - vnode_t dvp = ap->a_dvp; - struct componentname *cnp = ap->a_cnp; + struct nfsmount *nmp; + struct componentname cn; + nfsnode_t anp; int error = 0; - nfsnode_t np = VTONFS(vp); - nfsnode_t dnp = VTONFS(dvp); - struct nfs_vattr dnvattr; - struct nfs_dulookup dul; - if (vnode_vtype(vp) != VDIR) - return (EINVAL); + nmp = VTONMP(ap->a_vp); + if (!nmp) + return (ENXIO); - if ((error = nfs_node_set_busy2(dnp, np, vfs_context_thread(ctx)))) - return (error); + if (!(nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR)) + return (ENOTSUP); - nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx); - nfs_dulookup_start(&dul, dnp, ctx); + bzero(&cn, sizeof(cn)); + cn.cn_nameptr = __CAST_AWAY_QUALIFIER(ap->a_name, const, char *); + cn.cn_namelen = strlen(ap->a_name); + cn.cn_nameiop = CREATE; + cn.cn_flags = MAKEENTRY; + + error = nfs4_named_attr_get(VTONFS(ap->a_vp), &cn, NFS_OPEN_SHARE_ACCESS_BOTH, + 
NFS_GET_NAMED_ATTR_CREATE, ctx, &anp, NULL); + if ((!error && !anp) || (error == ENOENT)) + error = ENOATTR; + if (!error && anp) + *ap->a_svpp = NFSTOV(anp); + else if (anp) + vnode_put(NFSTOV(anp)); + return (error); +} - error = nfs4_remove_rpc(dnp, cnp->cn_nameptr, cnp->cn_namelen, - vfs_context_thread(ctx), vfs_context_ucred(ctx)); +int +nfs4_vnop_removenamedstream( + struct vnop_removenamedstream_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + vnode_t a_svp; + const char *a_name; + int a_flags; + vfs_context_t a_context; + } */ *ap) +{ + struct nfsmount *nmp = VTONMP(ap->a_vp); + nfsnode_t np = ap->a_vp ? VTONFS(ap->a_vp) : NULL; + nfsnode_t anp = ap->a_svp ? VTONFS(ap->a_svp) : NULL; - nfs_name_cache_purge(dnp, np, cnp, ctx); - /* nfs_getattr() will check changed and purge caches */ - nfs_getattr(dnp, &dnvattr, ctx, NGA_CACHED); - nfs_dulookup_finish(&dul, dnp, ctx); - nfs_node_clear_busy2(dnp, np); + if (!nmp) + return (ENXIO); /* - * Kludge: Map ENOENT => 0 assuming that you have a reply to a retry. + * Given that a_svp is a named stream, checking for + * named attribute support is kinda pointless. */ - if (error == ENOENT) - error = 0; - if (!error) { - /* - * remove nfsnode from hash now so we can't accidentally find it - * again if another object gets created with the same filehandle - * before this vnode gets reclaimed - */ - lck_mtx_lock(nfs_node_hash_mutex); - if (np->n_hflag & NHHASHED) { - LIST_REMOVE(np, n_hash); - np->n_hflag &= ~NHHASHED; - FSDBG(266, 0, np, np->n_flag, 0xb1eb1e); - } - lck_mtx_unlock(nfs_node_hash_mutex); - } - return (error); + if (!(nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR)) + return (ENOTSUP); + + return (nfs4_named_attr_remove(np, anp, ap->a_name, ap->a_context)); } +#endif diff --git a/bsd/nfs/nfs_bio.c b/bsd/nfs/nfs_bio.c index 1c1c19123..4bd1bff61 100644 --- a/bsd/nfs/nfs_bio.c +++ b/bsd/nfs/nfs_bio.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. 
+ * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -77,6 +77,7 @@ #include #include #include +#include #include #include @@ -684,6 +685,21 @@ nfs_buf_get( loop: lck_mtx_lock(nfs_buf_mutex); + /* wait for any buffer invalidation/flushing to complete */ + while (np->n_bflag & NBINVALINPROG) { + np->n_bflag |= NBINVALWANT; + ts.tv_sec = 2; + ts.tv_nsec = 0; + msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_buf_get_invalwait", &ts); + if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) { + lck_mtx_unlock(nfs_buf_mutex); + FSDBG_BOT(541, np, blkno, 0, error); + return (error); + } + if (np->n_bflag & NBINVALINPROG) + slpflag = 0; + } + /* check for existence of nfsbuf in cache */ if ((bp = nfs_buf_incore(np, blkno))) { /* if busy, set wanted and wait */ @@ -1041,8 +1057,8 @@ nfs_buf_release(struct nfsbuf *bp, int freeup) if (start < NBOFF(bp)) start = NBOFF(bp); if (end > start) { - if (!(rv = ubc_sync_range(vp, start, end, UBC_INVALIDATE))) - printf("nfs_buf_release(): ubc_sync_range failed!\n"); + if ((rv = ubc_msync(vp, start, end, NULL, UBC_INVALIDATE))) + printf("nfs_buf_release(): ubc_msync failed!, error %d\n", rv); } } CLR(bp->nb_flags, NB_PAGELIST); @@ -1508,7 +1524,7 @@ nfs_buf_read_finish(struct nfsbuf *bp) bp->nb_valid = (1 << (round_page_32(bp->nb_validend) / PAGE_SIZE)) - 1; if (bp->nb_validend & PAGE_MASK) { /* zero-fill remainder of last page */ - bzero(bp->nb_data + bp->nb_validend, bp->nb_bufsize - bp->nb_validend); + bzero(bp->nb_data + bp->nb_validend, PAGE_SIZE - (bp->nb_validend & PAGE_MASK)); } } nfs_buf_iodone(bp); @@ -1649,6 +1665,8 @@ nfs_buf_read_rpc_finish(struct nfsreq *req) kauth_cred_ref(cred); cb = req->r_callback; bp = cb.rcb_bp; + if (cb.rcb_func) /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */ + nfs_request_ref(req, 0); nmp = NFSTONMP(np); if (!nmp) { @@ -1673,23 +1691,55 @@ nfs_buf_read_rpc_finish(struct nfsreq *req) error = 
nmp->nm_funcs->nf_read_rpc_async_finish(np, req, auio, &rlen, &eof); if ((error == EINPROGRESS) && cb.rcb_func) { /* async request restarted */ + if (cb.rcb_func) + nfs_request_rele(req); if (IS_VALID_CRED(cred)) kauth_cred_unref(&cred); return; } if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) { lck_mtx_lock(&nmp->nm_lock); - if ((error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid) && !(nmp->nm_state & NFSSTA_RECOVER)) { - printf("nfs_buf_read_rpc_finish: error %d, initiating recovery\n", error); - nmp->nm_state |= NFSSTA_RECOVER; - nfs_mount_sock_thread_wake(nmp); + if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) { + NP(np, "nfs_buf_read_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery", + error, NBOFF(bp)+offset, cb.rcb_args[2], nmp->nm_stategenid); + nfs_need_recover(nmp, error); } lck_mtx_unlock(&nmp->nm_lock); - if (error == NFSERR_GRACE) - tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); - if (!(error = nfs_mount_state_wait_for_recovery(nmp))) { - rlen = 0; - goto readagain; + if (np->n_flag & NREVOKE) { + error = EIO; + } else { + if (error == NFSERR_GRACE) { + if (cb.rcb_func) { + /* + * For an async I/O request, handle a grace delay just like + * jukebox errors. Set the resend time and queue it up. 
+ */ + struct timeval now; + if (req->r_nmrep.nmc_mhead) { + mbuf_freem(req->r_nmrep.nmc_mhead); + req->r_nmrep.nmc_mhead = NULL; + } + req->r_error = 0; + microuptime(&now); + lck_mtx_lock(&req->r_mtx); + req->r_resendtime = now.tv_sec + 2; + req->r_xid = 0; // get a new XID + req->r_flags |= R_RESTART; + req->r_start = 0; + nfs_asyncio_resend(req); + lck_mtx_unlock(&req->r_mtx); + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + /* Note: nfsreq reference taken will be dropped later when finished */ + return; + } + /* otherwise, just pause a couple seconds and retry */ + tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); + } + if (!(error = nfs_mount_state_wait_for_recovery(nmp))) { + rlen = 0; + goto readagain; + } } } if (error) { @@ -1734,6 +1784,7 @@ nfs_buf_read_rpc_finish(struct nfsreq *req) rreq = NULL; goto finish; } + nfs_request_rele(req); /* * We're done here. * Outstanding RPC count is unchanged. @@ -1746,6 +1797,8 @@ nfs_buf_read_rpc_finish(struct nfsreq *req) } out: + if (cb.rcb_func) + nfs_request_rele(req); if (IS_VALID_CRED(cred)) kauth_cred_unref(&cred); @@ -1786,7 +1839,8 @@ nfs_buf_readahead(nfsnode_t np, int ioflag, daddr64_t *rabnp, daddr64_t lastrabn { struct nfsmount *nmp = NFSTONMP(np); struct nfsbuf *bp; - int error = 0, nra; + int error = 0; + uint32_t nra; if (!nmp) return (ENXIO); @@ -1842,7 +1896,6 @@ nfs_bioread(nfsnode_t np, uio_t uio, int ioflag, vfs_context_t ctx) { vnode_t vp = NFSTOV(np); struct nfsbuf *bp = NULL; - struct nfs_vattr nvattr; struct nfsmount *nmp = VTONMP(vp); daddr64_t lbn, rabn = 0, lastrabn, maxrabn = -1; off_t diff; @@ -1903,7 +1956,7 @@ nfs_bioread(nfsnode_t np, uio_t uio, int ioflag, vfs_context_t ctx) modified = (np->n_flag & NMODIFIED); nfs_node_unlock(np); /* nfs_getattr() will check changed and purge caches */ - error = nfs_getattr(np, &nvattr, ctx, modified ? NGA_UNCACHED : NGA_CACHED); + error = nfs_getattr(np, NULL, ctx, modified ? 
NGA_UNCACHED : NGA_CACHED); if (error) { FSDBG_BOT(514, np, 0xd1e0004, 0, error); return (error); @@ -1986,6 +2039,12 @@ nfs_bioread(nfsnode_t np, uio_t uio, int ioflag, vfs_context_t ctx) np->n_lastread = (uio_offset(uio) - 1) / biosize; nfs_node_unlock(np); + if ((uio_resid(uio) <= 0) || (uio_offset(uio) >= (off_t)np->n_size)) { + nfs_data_unlock(np); + FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), 0xaaaaaaaa); + return (0); + } + /* adjust readahead block number, if necessary */ if (rabn < lbn) rabn = lbn; @@ -2000,12 +2059,6 @@ nfs_bioread(nfsnode_t np, uio_t uio, int ioflag, vfs_context_t ctx) readaheads = 1; } - if ((uio_resid(uio) <= 0) || (uio_offset(uio) >= (off_t)np->n_size)) { - nfs_data_unlock(np); - FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), 0xaaaaaaaa); - return (0); - } - OSAddAtomic(1, &nfsstats.biocache_reads); /* @@ -2182,7 +2235,7 @@ nfs_bioread(nfsnode_t np, uio_t uio, int ioflag, vfs_context_t ctx) int nfs_async_write_start(struct nfsmount *nmp) { - int error = 0, slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0; + int error = 0, slpflag = NMFLAG(nmp, INTR) ? 
PCATCH : 0; struct timespec ts = {1, 0}; if (nfs_max_async_writes <= 0) @@ -2301,7 +2354,7 @@ nfs_buf_write(struct nfsbuf *bp) } SET(bp->nb_flags, NB_WRITEINPROG); error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp) + bp->nb_dirtyoff, - bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred); + bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred, bp->nb_verf); CLR(bp->nb_flags, NB_WRITEINPROG); if (error) { if (error != NFSERR_STALEWRITEVERF) { @@ -2610,7 +2663,7 @@ nfs_buf_write_dirty_pages(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) CLR(bp->nb_flags, NB_WRITEINPROG); if (!error && (commit != NFS_WRITE_FILESYNC)) { - error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred); + error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred, wverf); if (error == NFSERR_STALEWRITEVERF) { /* verifier changed, so we need to restart all the writes */ iomode = NFS_WRITE_FILESYNC; @@ -2731,6 +2784,9 @@ nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred } else { nfs_buf_write_finish(bp, thd, cred); } + /* It may have just been an interrupt... 
that's OK */ + if (!ISSET(bp->nb_flags, NB_ERROR)) + error = 0; } return (error); @@ -2765,6 +2821,8 @@ nfs_buf_write_rpc_finish(struct nfsreq *req) kauth_cred_ref(cred); cb = req->r_callback; bp = cb.rcb_bp; + if (cb.rcb_func) /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */ + nfs_request_ref(req, 0); nmp = NFSTONMP(np); if (!nmp) { @@ -2785,23 +2843,55 @@ nfs_buf_write_rpc_finish(struct nfsreq *req) error = nmp->nm_funcs->nf_write_rpc_async_finish(np, req, &committed, &rlen, &wverf); if ((error == EINPROGRESS) && cb.rcb_func) { /* async request restarted */ + if (cb.rcb_func) + nfs_request_rele(req); if (IS_VALID_CRED(cred)) kauth_cred_unref(&cred); return; } if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) { lck_mtx_lock(&nmp->nm_lock); - if ((error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid) && !(nmp->nm_state & NFSSTA_RECOVER)) { - printf("nfs_buf_write_rpc_finish: error %d, initiating recovery\n", error); - nmp->nm_state |= NFSSTA_RECOVER; - nfs_mount_sock_thread_wake(nmp); + if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) { + NP(np, "nfs_buf_write_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery", + error, NBOFF(bp)+offset, cb.rcb_args[2], nmp->nm_stategenid); + nfs_need_recover(nmp, error); } lck_mtx_unlock(&nmp->nm_lock); - if (error == NFSERR_GRACE) - tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); - if (!(error = nfs_mount_state_wait_for_recovery(nmp))) { - rlen = 0; - goto writeagain; + if (np->n_flag & NREVOKE) { + error = EIO; + } else { + if (error == NFSERR_GRACE) { + if (cb.rcb_func) { + /* + * For an async I/O request, handle a grace delay just like + * jukebox errors. Set the resend time and queue it up. 
+ */ + struct timeval now; + if (req->r_nmrep.nmc_mhead) { + mbuf_freem(req->r_nmrep.nmc_mhead); + req->r_nmrep.nmc_mhead = NULL; + } + req->r_error = 0; + microuptime(&now); + lck_mtx_lock(&req->r_mtx); + req->r_resendtime = now.tv_sec + 2; + req->r_xid = 0; // get a new XID + req->r_flags |= R_RESTART; + req->r_start = 0; + nfs_asyncio_resend(req); + lck_mtx_unlock(&req->r_mtx); + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + /* Note: nfsreq reference taken will be dropped later when finished */ + return; + } + /* otherwise, just pause a couple seconds and retry */ + tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); + } + if (!(error = nfs_mount_state_wait_for_recovery(nmp))) { + rlen = 0; + goto writeagain; + } } } if (error) { @@ -2863,6 +2953,7 @@ nfs_buf_write_rpc_finish(struct nfsreq *req) wreq = NULL; goto finish; } + nfs_request_rele(req); /* * We're done here. * Outstanding RPC count is unchanged. @@ -2875,8 +2966,10 @@ nfs_buf_write_rpc_finish(struct nfsreq *req) } out: - if (cb.rcb_func) + if (cb.rcb_func) { nfs_async_write_done(nmp); + nfs_request_rele(req); + } /* * Decrement outstanding RPC count on buffer * and call nfs_buf_write_finish on last RPC. 
@@ -2918,6 +3011,7 @@ nfs_flushcommits(nfsnode_t np, int nowait) struct nfsbuflists blist, commitlist; int error = 0, retv, wcred_set, flags, dirty; u_quad_t off, endoff, toff; + uint64_t wverf; u_int32_t count; kauth_cred_t wcred = NULL; @@ -2956,6 +3050,7 @@ nfs_flushcommits(nfsnode_t np, int nowait) if (nowait) flags |= NBI_NOWAIT; lck_mtx_lock(nfs_buf_mutex); + wverf = nmp->nm_verf; if (!nfs_buf_iterprepare(np, &blist, flags)) { while ((bp = LIST_FIRST(&blist))) { LIST_REMOVE(bp, nb_vnbufs); @@ -2965,8 +3060,8 @@ nfs_flushcommits(nfsnode_t np, int nowait) continue; if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) nfs_buf_check_write_verifier(np, bp); - if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT)) - != (NB_DELWRI | NB_NEEDCOMMIT))) { + if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT)) != (NB_DELWRI | NB_NEEDCOMMIT)) || + (bp->nb_verf != wverf)) { nfs_buf_drop(bp); continue; } @@ -3066,13 +3161,13 @@ nfs_flushcommits(nfsnode_t np, int nowait) count = 0; else count = (endoff - off); - retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred); + retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred, wverf); } else { retv = 0; LIST_FOREACH(bp, &commitlist, nb_vnbufs) { toff = NBOFF(bp) + bp->nb_dirtyoff; count = bp->nb_dirtyend - bp->nb_dirtyoff; - retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred); + retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred, wverf); if (retv) break; } @@ -3161,7 +3256,7 @@ nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr) goto out; } nfsvers = nmp->nm_vers; - if (nmp->nm_flag & NFSMNT_INT) + if (NMFLAG(nmp, INTR)) slpflag = PCATCH; if (!LIST_EMPTY(&np->n_dirtyblkhd)) { @@ -3173,8 +3268,9 @@ nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr) lck_mtx_lock(nfs_buf_mutex); while (np->n_bflag & NBFLUSHINPROG) { np->n_bflag |= NBFLUSHWANT; - msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_flush", NULL); - if ((error = nfs_sigintr(NFSTONMP(np), NULL, thd, 0))) { + error 
= msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_flush", NULL); + if ((error && (error != EWOULDBLOCK)) || + ((error = nfs_sigintr(NFSTONMP(np), NULL, thd, 0)))) { lck_mtx_unlock(nfs_buf_mutex); goto out; } @@ -3458,8 +3554,10 @@ nfs_vinvalbuf_internal( if (error) { FSDBG(554, bp, 0xd00dee, 0xbad, error); nfs_node_lock_force(np); - np->n_error = error; - np->n_flag |= NWRITEERR; + if ((error != EINTR) && (error != ERESTART)) { + np->n_error = error; + np->n_flag |= NWRITEERR; + } /* * There was a write error and we need to * invalidate attrs to sync with server. @@ -3468,7 +3566,7 @@ nfs_vinvalbuf_internal( */ NATTRINVALIDATE(np); nfs_node_unlock(np); - if (error == EINTR) { + if ((error == EINTR) || (error == ERESTART)) { /* * Abort on EINTR. If we don't, we could * be stuck in this loop forever because @@ -3521,12 +3619,13 @@ nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrf { nfsnode_t np = VTONFS(vp); struct nfsmount *nmp = VTONMP(vp); - int error, rv, slpflag, slptimeo, nflags; + int error, slpflag, slptimeo, nflags, retry = 0; + struct timespec ts = { 2, 0 }; off_t size; FSDBG_TOP(554, np, flags, intrflg, 0); - if (nmp && !(nmp->nm_flag & NFSMNT_INT)) + if (nmp && !NMFLAG(nmp, INTR)) intrflg = 0; if (intrflg) { slpflag = PCATCH; @@ -3540,16 +3639,19 @@ nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrf lck_mtx_lock(nfs_buf_mutex); while (np->n_bflag & NBINVALINPROG) { np->n_bflag |= NBINVALWANT; - msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", NULL); + msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", &ts); if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) { lck_mtx_unlock(nfs_buf_mutex); return (error); } + if (np->n_bflag & NBINVALINPROG) + slpflag = 0; } np->n_bflag |= NBINVALINPROG; lck_mtx_unlock(nfs_buf_mutex); /* Now, flush as required. 
*/ +again: error = nfs_vinvalbuf_internal(np, flags, thd, cred, slpflag, 0); while (error) { FSDBG(554, np, 0, 0, error); @@ -3560,8 +3662,15 @@ nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrf /* get the pages out of vm also */ if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) - if (!(rv = ubc_sync_range(vp, 0, size, UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE))) - panic("nfs_vinvalbuf(): ubc_sync_range failed!"); + if ((error = ubc_msync(vp, 0, size, NULL, UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE))) { + if (error == EINVAL) + panic("nfs_vinvalbuf(): ubc_msync failed!, error %d", error); + if (retry++ < 10) /* retry invalidating a few times */ + goto again; + /* give up */ + printf("nfs_vinvalbuf(): ubc_msync failed!, error %d", error); + + } done: lck_mtx_lock(nfs_buf_mutex); nflags = np->n_bflag; @@ -3574,6 +3683,57 @@ nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrf return (error); } +/* + * Wait for any busy buffers to complete. 
+ */ +void +nfs_wait_bufs(nfsnode_t np) +{ + struct nfsbuf *bp; + struct nfsbuflists blist; + int error = 0; + + lck_mtx_lock(nfs_buf_mutex); + if (!nfs_buf_iterprepare(np, &blist, NBI_CLEAN)) { + while ((bp = LIST_FIRST(&blist))) { + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); + nfs_buf_refget(bp); + while ((error = nfs_buf_acquire(bp, 0, 0, 0))) { + if (error != EAGAIN) { + nfs_buf_refrele(bp); + nfs_buf_itercomplete(np, &blist, NBI_CLEAN); + lck_mtx_unlock(nfs_buf_mutex); + return; + } + } + nfs_buf_refrele(bp); + nfs_buf_drop(bp); + } + nfs_buf_itercomplete(np, &blist, NBI_CLEAN); + } + if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) { + while ((bp = LIST_FIRST(&blist))) { + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); + nfs_buf_refget(bp); + while ((error = nfs_buf_acquire(bp, 0, 0, 0))) { + if (error != EAGAIN) { + nfs_buf_refrele(bp); + nfs_buf_itercomplete(np, &blist, NBI_DIRTY); + lck_mtx_unlock(nfs_buf_mutex); + return; + } + } + nfs_buf_refrele(bp); + nfs_buf_drop(bp); + } + nfs_buf_itercomplete(np, &blist, NBI_DIRTY); + } + lck_mtx_unlock(nfs_buf_mutex); +} + + /* * Add an async I/O request to the mount's async I/O queue and make * sure that an nfsiod will service it. diff --git a/bsd/nfs/nfs_boot.c b/bsd/nfs/nfs_boot.c index 33bc25128..7fcd73bee 100644 --- a/bsd/nfs/nfs_boot.c +++ b/bsd/nfs/nfs_boot.c @@ -177,13 +177,7 @@ static int get_file_handle(struct nfs_dlmount *ndmntp); #define IP_CH(ip) ((u_char *)ip) #define IP_LIST(ip) IP_CH(ip)[0],IP_CH(ip)[1],IP_CH(ip)[2],IP_CH(ip)[3] -extern boolean_t -netboot_iaddr(struct in_addr * iaddr_p); - -extern boolean_t -netboot_rootpath(struct in_addr * server_ip, - char * name, int name_len, - char * path, int path_len); +#include /* * Called with an empty nfs_diskless struct to be filled in. 
diff --git a/bsd/nfs/nfs_gss.c b/bsd/nfs/nfs_gss.c index b8dbbb4a2..c848bfae6 100644 --- a/bsd/nfs/nfs_gss.c +++ b/bsd/nfs/nfs_gss.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2009 Apple Inc. All rights reserved. + * Copyright (c) 2007-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -147,8 +147,8 @@ int nfs_single_des; * These octet strings are used to encode/decode ASN.1 tokens * in the RPCSEC_GSS verifiers. */ -static u_char krb5_tokhead[] = { 0x60, 0x23 }; -static u_char krb5_mech[] = { 0x06, 0x09, 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x12, 0x01, 0x02, 0x02 }; +static u_char krb5_tokhead[] __attribute__((unused)) = { 0x60, 0x23 }; + u_char krb5_mech[11] = { 0x06, 0x09, 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x12, 0x01, 0x02, 0x02 }; static u_char krb5_mic[] = { 0x01, 0x01, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff }; static u_char krb5_mic3[] = { 0x01, 0x01, 0x04, 0x00, 0xff, 0xff, 0xff, 0xff }; static u_char krb5_wrap[] = { 0x02, 0x01, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff }; @@ -184,11 +184,11 @@ static u_char iv0[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; // DES static int nfs_gss_clnt_ctx_find(struct nfsreq *); static int nfs_gss_clnt_ctx_failover(struct nfsreq *); static int nfs_gss_clnt_ctx_init(struct nfsreq *, struct nfs_gss_clnt_ctx *); +static int nfs_gss_clnt_ctx_init_retry(struct nfsreq *, struct nfs_gss_clnt_ctx *); static int nfs_gss_clnt_ctx_callserver(struct nfsreq *, struct nfs_gss_clnt_ctx *); static char *nfs_gss_clnt_svcname(struct nfsmount *); static int nfs_gss_clnt_gssd_upcall(struct nfsreq *, struct nfs_gss_clnt_ctx *); static void nfs_gss_clnt_ctx_remove(struct nfsmount *, struct nfs_gss_clnt_ctx *); -static int nfs_gss_clnt_ctx_delay(struct nfsreq *, int *); #endif /* NFSCLIENT */ #if NFSSERVER @@ -253,6 +253,25 @@ nfs_gss_init(void) #if NFSCLIENT +/* + * Is it OK to fall back to using AUTH_SYS? 
+ */ +static int +nfs_gss_sysok(struct nfsreq *req) +{ + struct nfsmount *nmp = req->r_nmp; + int i; + + if (req->r_wrongsec) /* Not OK if we're trying to handle a wrongsec error */ + return (0); + if (!nmp->nm_sec.count) /* assume it's OK if we don't have a set of flavors */ + return (1); + for (i=0; i < nmp->nm_sec.count; i++) + if (nmp->nm_sec.flavors[i] == RPCAUTH_SYS) + return (1); + return (0); +} + /* * Find the context for a particular user. * @@ -269,15 +288,14 @@ nfs_gss_clnt_ctx_find(struct nfsreq *req) struct nfs_gss_clnt_ctx *cp; uid_t uid = kauth_cred_getuid(req->r_cred); int error = 0; - int retrycnt = 0; lck_mtx_lock(&nmp->nm_lock); TAILQ_FOREACH(cp, &nmp->nm_gsscl, gss_clnt_entries) { if (cp->gss_clnt_uid == uid) { if (cp->gss_clnt_flags & GSS_CTX_INVAL) continue; - lck_mtx_unlock(&nmp->nm_lock); nfs_gss_clnt_ctx_ref(req, cp); + lck_mtx_unlock(&nmp->nm_lock); return (0); } } @@ -292,8 +310,8 @@ nfs_gss_clnt_ctx_find(struct nfsreq *req) */ TAILQ_FOREACH(cp, &nmp->nm_gsscl, gss_clnt_entries) { if (!(cp->gss_clnt_flags & GSS_CTX_INVAL)) { - lck_mtx_unlock(&nmp->nm_lock); nfs_gss_clnt_ctx_ref(req, cp); + lck_mtx_unlock(&nmp->nm_lock); return (0); } } @@ -310,7 +328,7 @@ nfs_gss_clnt_ctx_find(struct nfsreq *req) * to failover to sec=sys. 
*/ if (req->r_thread == NULL) { - if (nmp->nm_flag & NFSMNT_SECSYSOK) { + if (nfs_gss_sysok(req)) { error = nfs_gss_clnt_ctx_failover(req); } else { printf("nfs_gss_clnt_ctx_find: no context for async\n"); @@ -334,29 +352,7 @@ nfs_gss_clnt_ctx_find(struct nfsreq *req) TAILQ_INSERT_TAIL(&nmp->nm_gsscl, cp, gss_clnt_entries); lck_mtx_unlock(&nmp->nm_lock); -retry: - error = nfs_gss_clnt_ctx_init(req, cp); - if (error == ENEEDAUTH) { - error = nfs_gss_clnt_ctx_delay(req, &retrycnt); - if (!error) - goto retry; - - /* Giving up on this context */ - cp->gss_clnt_flags |= GSS_CTX_INVAL; - - /* - * Wake any threads waiting to use the context - */ - lck_mtx_lock(cp->gss_clnt_mtx); - cp->gss_clnt_thread = NULL; - if (cp->gss_clnt_flags & GSS_NEEDCTX) { - cp->gss_clnt_flags &= ~GSS_NEEDCTX; - wakeup(cp); - } - lck_mtx_unlock(cp->gss_clnt_mtx); - - } - + error = nfs_gss_clnt_ctx_init_retry(req, cp); // Initialize new context if (error) nfs_gss_clnt_ctx_unref(req); @@ -367,7 +363,7 @@ nfs_gss_clnt_ctx_find(struct nfsreq *req) * up a dummy context that allows this user to attempt * sec=sys calls. */ - if (error && (nmp->nm_flag & NFSMNT_SECSYSOK) && + if (error && nfs_gss_sysok(req) && (error != ENXIO) && (error != ETIMEDOUT)) { lck_mtx_lock(&nmp->nm_lock); error = nfs_gss_clnt_ctx_failover(req); @@ -433,7 +429,7 @@ nfs_gss_clnt_cred_put(struct nfsreq *req, struct nfsm_chain *nmc, mbuf_t args) slpflag = (PZERO-1); if (req->r_nmp) { - slpflag |= ((req->r_nmp->nm_flag & NFSMNT_INT) && req->r_thread) ? PCATCH : 0; + slpflag |= (NMFLAG(req->r_nmp, INTR) && req->r_thread && !(req->r_flags & R_NOINTR)) ? 
PCATCH : 0; recordmark = (req->r_nmp->nm_sotype == SOCK_STREAM); } retry: @@ -483,6 +479,7 @@ nfs_gss_clnt_cred_put(struct nfsreq *req, struct nfsm_chain *nmc, mbuf_t args) if (cp->gss_clnt_thread && cp->gss_clnt_thread != current_thread()) { cp->gss_clnt_flags |= GSS_NEEDCTX; msleep(cp, cp->gss_clnt_mtx, slpflag | PDROP, "ctxwait", NULL); + slpflag &= ~PCATCH; if ((error = nfs_sigintr(req->r_nmp, req, req->r_thread, 0))) return (error); nfs_gss_clnt_ctx_unref(req); @@ -504,6 +501,7 @@ nfs_gss_clnt_cred_put(struct nfsreq *req, struct nfsm_chain *nmc, mbuf_t args) ((cp->gss_clnt_seqnum - cp->gss_clnt_seqwin) + 1) % cp->gss_clnt_seqwin)) { cp->gss_clnt_flags |= GSS_NEEDSEQ; msleep(cp, cp->gss_clnt_mtx, slpflag, "seqwin", NULL); + slpflag &= ~PCATCH; if ((error = nfs_sigintr(req->r_nmp, req, req->r_thread, 0))) { lck_mtx_unlock(cp->gss_clnt_mtx); return (error); @@ -995,9 +993,9 @@ nfs_gss_clnt_ctx_init(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) cp->gss_clnt_proc = RPCSEC_GSS_INIT; cp->gss_clnt_service = - nmp->nm_auth == RPCAUTH_KRB5 ? RPCSEC_GSS_SVC_NONE : - nmp->nm_auth == RPCAUTH_KRB5I ? RPCSEC_GSS_SVC_INTEGRITY : - nmp->nm_auth == RPCAUTH_KRB5P ? RPCSEC_GSS_SVC_PRIVACY : 0; + req->r_auth == RPCAUTH_KRB5 ? RPCSEC_GSS_SVC_NONE : + req->r_auth == RPCAUTH_KRB5I ? RPCSEC_GSS_SVC_INTEGRITY : + req->r_auth == RPCAUTH_KRB5P ? RPCSEC_GSS_SVC_PRIVACY : 0; cp->gss_clnt_gssd_flags = (nfs_single_des ? GSSD_NFS_1DES : 0); /* @@ -1055,7 +1053,9 @@ nfs_gss_clnt_ctx_init(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) /* * The context is apparently established successfully */ + lck_mtx_lock(cp->gss_clnt_mtx); cp->gss_clnt_flags |= GSS_CTX_COMPLETE; + lck_mtx_unlock(cp->gss_clnt_mtx); cp->gss_clnt_proc = RPCSEC_GSS_DATA; microuptime(&now); cp->gss_clnt_ctime = now.tv_sec; // time stamp @@ -1110,13 +1110,13 @@ nfs_gss_clnt_ctx_init(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) * It will be removed when the reference count * drops to zero. 
*/ + lck_mtx_lock(cp->gss_clnt_mtx); if (error) cp->gss_clnt_flags |= GSS_CTX_INVAL; /* * Wake any threads waiting to use the context */ - lck_mtx_lock(cp->gss_clnt_mtx); cp->gss_clnt_thread = NULL; if (cp->gss_clnt_flags & GSS_NEEDCTX) { cp->gss_clnt_flags &= ~GSS_NEEDCTX; @@ -1127,6 +1127,77 @@ nfs_gss_clnt_ctx_init(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) return (error); } +/* + * This function calls nfs_gss_clnt_ctx_init() to set up a new context. + * But if there's a failure in trying to establish the context it keeps + * retrying at progressively longer intervals in case the failure is + * due to some transient condition. For instance, the server might be + * failing the context setup because directory services is not coming + * up in a timely fashion. + */ +static int +nfs_gss_clnt_ctx_init_retry(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) +{ + struct nfsmount *nmp = req->r_nmp; + struct timeval now; + time_t waituntil; + int error, slpflag; + int retries = 0; + int timeo = NFS_TRYLATERDEL; + + if (nmp == NULL) { + error = ENXIO; + goto bad; + } + + /* For an "intr" mount allow a signal to interrupt the retries */ + slpflag = (NMFLAG(nmp, INTR) && !(req->r_flags & R_NOINTR)) ? 
PCATCH : 0; + + while ((error = nfs_gss_clnt_ctx_init(req, cp)) == ENEEDAUTH) { + microuptime(&now); + waituntil = now.tv_sec + timeo; + while (now.tv_sec < waituntil) { + tsleep(&lbolt, PSOCK | slpflag, "nfs_gss_clnt_ctx_init_retry", 0); + slpflag = 0; + error = nfs_sigintr(req->r_nmp, req, current_thread(), 0); + if (error) + goto bad; + microuptime(&now); + } + + retries++; + /* If it's a soft mount just give up after a while */ + if (NMFLAG(nmp, SOFT) && (retries > nmp->nm_retry)) { + error = ETIMEDOUT; + goto bad; + } + timeo *= 2; + if (timeo > 60) + timeo = 60; + } + + if (error == 0) + return 0; // success +bad: + /* + * Give up on this context + */ + lck_mtx_lock(cp->gss_clnt_mtx); + cp->gss_clnt_flags |= GSS_CTX_INVAL; + + /* + * Wake any threads waiting to use the context + */ + cp->gss_clnt_thread = NULL; + if (cp->gss_clnt_flags & GSS_NEEDCTX) { + cp->gss_clnt_flags &= ~GSS_NEEDCTX; + wakeup(cp); + } + lck_mtx_unlock(cp->gss_clnt_mtx); + + return error; +} + /* * Call the NFS server using a null procedure for context setup. * Even though it's a null procedure and nominally has no arguments @@ -1260,11 +1331,11 @@ static int nfs_gss_clnt_gssd_upcall(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) { kern_return_t kr; - byte_buffer okey = NULL; + gssd_byte_buffer okey = NULL; uint32_t skeylen = 0; int retry_cnt = 0; vm_map_copy_t itoken = NULL; - byte_buffer otoken = NULL; + gssd_byte_buffer otoken = NULL; mach_msg_type_number_t otokenlen; int error = 0; char uprinc[1]; @@ -1279,7 +1350,7 @@ nfs_gss_clnt_gssd_upcall(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) * the kernel is being compiled with -Wwrite-strings. 
*/ uprinc[0] = '\0'; - if (cp->gss_clnt_mport == NULL) { + if (!IPC_PORT_VALID(cp->gss_clnt_mport)) { kr = task_get_gssd_port(get_threadtask(req->r_thread), &cp->gss_clnt_mport); if (kr != KERN_SUCCESS) { printf("nfs_gss_clnt_gssd_upcall: can't get gssd port, status %x (%d)\n", kr, kr); @@ -1298,8 +1369,8 @@ nfs_gss_clnt_gssd_upcall(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) retry: kr = mach_gss_init_sec_context( cp->gss_clnt_mport, - KRB5_MECH, - (byte_buffer) itoken, (mach_msg_type_number_t) cp->gss_clnt_tokenlen, + GSSD_KRB5_MECH, + (gssd_byte_buffer) itoken, (mach_msg_type_number_t) cp->gss_clnt_tokenlen, cp->gss_clnt_uid, uprinc, cp->gss_clnt_svcname, @@ -1512,8 +1583,8 @@ nfs_gss_clnt_ctx_remove(struct nfsmount *nmp, struct nfs_gss_clnt_ctx *cp) if (nmp != NULL) TAILQ_REMOVE(&nmp->nm_gsscl, cp, gss_clnt_entries); - if (cp->gss_clnt_mport) - task_release_special_port(cp->gss_clnt_mport); + task_release_special_port(cp->gss_clnt_mport); + if (cp->gss_clnt_mtx) lck_mtx_destroy(cp->gss_clnt_mtx, nfs_gss_clnt_grp); if (cp->gss_clnt_handle) @@ -1541,7 +1612,6 @@ nfs_gss_clnt_ctx_renew(struct nfsreq *req) int error = 0; uid_t saved_uid; mach_port_t saved_mport; - int retrycnt = 0; if (cp == NULL) return (0); @@ -1590,13 +1660,7 @@ nfs_gss_clnt_ctx_renew(struct nfsreq *req) nfs_gss_clnt_ctx_unref(req); nfs_gss_clnt_ctx_ref(req, ncp); -retry: - error = nfs_gss_clnt_ctx_init(req, ncp); // Initialize new context - if (error == ENEEDAUTH) { - error = nfs_gss_clnt_ctx_delay(req, &retrycnt); - if (!error) - goto retry; - } + error = nfs_gss_clnt_ctx_init_retry(req, ncp); // Initialize new context out: task_release_special_port(saved_mport); if (error) @@ -1610,17 +1674,13 @@ nfs_gss_clnt_ctx_renew(struct nfsreq *req) * The contexts are also destroyed by the server. 
*/ void -nfs_gss_clnt_ctx_unmount(struct nfsmount *nmp, int mntflags) +nfs_gss_clnt_ctx_unmount(struct nfsmount *nmp) { struct nfs_gss_clnt_ctx *cp; - struct ucred temp_cred; - kauth_cred_t cred; struct nfsm_chain nmreq, nmrep; int error, status; struct nfsreq req; - bzero((caddr_t) &temp_cred, sizeof(temp_cred)); - temp_cred.cr_ngroups = 1; req.r_nmp = nmp; for (;;) { @@ -1637,9 +1697,14 @@ nfs_gss_clnt_ctx_unmount(struct nfsmount *nmp, int mntflags) * But don't bother if it's a forced unmount * or if it's a dummy sec=sys context. */ - if (!(mntflags & MNT_FORCE) && cp->gss_clnt_service != RPCSEC_GSS_SVC_SYS) { - temp_cred.cr_uid = cp->gss_clnt_uid; - cred = kauth_cred_create(&temp_cred); + if (!(nmp->nm_state & NFSSTA_FORCE) && (cp->gss_clnt_service != RPCSEC_GSS_SVC_SYS)) { + kauth_cred_t cred; + struct posix_cred temp_pcred; + + bzero((caddr_t) &temp_pcred, sizeof(temp_pcred)); + temp_pcred.cr_ngroups = 1; + temp_pcred.cr_uid = cp->gss_clnt_uid; + cred = posix_cred_create(&temp_pcred); cp->gss_clnt_proc = RPCSEC_GSS_DESTROY; error = 0; @@ -1660,48 +1725,13 @@ nfs_gss_clnt_ctx_unmount(struct nfsmount *nmp, int mntflags) * the reference to remove it if its * refcount is zero. */ + lck_mtx_lock(cp->gss_clnt_mtx); cp->gss_clnt_flags |= GSS_CTX_INVAL; + lck_mtx_unlock(cp->gss_clnt_mtx); nfs_gss_clnt_ctx_unref(&req); } } -/* - * If we get a failure in trying to establish a context we need to wait a - * little while to see if the server is feeling better. In our case this is - * probably a failure in directory services not coming up in a timely fashion. - * This routine sort of mimics receiving a jukebox error. 
- */ -static int -nfs_gss_clnt_ctx_delay(struct nfsreq *req, int *retry) -{ - int timeo = (1 << *retry) * NFS_TRYLATERDEL; - int error = 0; - struct nfsmount *nmp = req->r_nmp; - struct timeval now; - time_t waituntil; - - if (!nmp) - return (ENXIO); - if ((nmp->nm_flag & NFSMNT_SOFT) && *retry > nmp->nm_retry) - return (ETIMEDOUT); - if (timeo > 60) - timeo = 60; - - microuptime(&now); - waituntil = now.tv_sec + timeo; - while (now.tv_sec < waituntil) { - tsleep(&lbolt, PSOCK, "nfs_gss_clnt_ctx_delay", 0); - error = nfs_sigintr(req->r_nmp, req, current_thread(), 0); - if (error) - break; - microuptime(&now); - } - *retry += 1; - - return (error); -} - - #endif /* NFSCLIENT */ /************* @@ -1733,7 +1763,7 @@ nfs_gss_svc_ctx_find(uint32_t handle) lck_mtx_lock(nfs_gss_svc_ctx_mutex); - LIST_FOREACH(cp, head, gss_svc_entries) + LIST_FOREACH(cp, head, gss_svc_entries) { if (cp->gss_svc_handle == handle) { if (timenow > cp->gss_svc_incarnation + GSS_SVC_CTX_TTL) { /* @@ -1743,14 +1773,20 @@ nfs_gss_svc_ctx_find(uint32_t handle) */ cp->gss_svc_handle = 0; /* - * Make sure though that we stay around for GSS_CTC_PEND seconds + * Make sure though that we stay around for GSS_CTX_PEND seconds * for other threads that might be using the context. */ cp->gss_svc_incarnation = timenow; + cp = NULL; + break; } + lck_mtx_lock(cp->gss_svc_mtx); + cp->gss_svc_refcnt++; + lck_mtx_unlock(cp->gss_svc_mtx); break; } + } lck_mtx_unlock(nfs_gss_svc_ctx_mutex); @@ -1765,10 +1801,26 @@ static void nfs_gss_svc_ctx_insert(struct nfs_gss_svc_ctx *cp) { struct nfs_gss_svc_ctx_hashhead *head; + struct nfs_gss_svc_ctx *p; + lck_mtx_lock(nfs_gss_svc_ctx_mutex); + + /* + * Give the client a random handle so that if we reboot + * it's unlikely the client will get a bad context match. + * Make sure it's not zero or already assigned. 
+ */ +retry: + cp->gss_svc_handle = random(); + if (cp->gss_svc_handle == 0) + goto retry; head = &nfs_gss_svc_ctx_hashtbl[SVC_CTX_HASH(cp->gss_svc_handle)]; + LIST_FOREACH(p, head, gss_svc_entries) + if (p->gss_svc_handle == cp->gss_svc_handle) + goto retry; - lck_mtx_lock(nfs_gss_svc_ctx_mutex); + clock_interval_to_deadline(GSS_CTX_PEND, NSEC_PER_SEC, + &cp->gss_svc_incarnation); LIST_INSERT_HEAD(head, cp, gss_svc_entries); nfs_gss_ctx_count++; @@ -1776,7 +1828,7 @@ nfs_gss_svc_ctx_insert(struct nfs_gss_svc_ctx *cp) nfs_gss_timer_on = 1; nfs_interval_timer_start(nfs_gss_svc_ctx_timer_call, - min(GSS_TIMER_PERIOD, max(GSS_CTX_TTL_MIN, GSS_SVC_CTX_TTL)) * MSECS_PER_SEC); + min(GSS_TIMER_PERIOD, max(GSS_CTX_TTL_MIN, nfsrv_gss_context_ttl)) * MSECS_PER_SEC); } lck_mtx_unlock(nfs_gss_svc_ctx_mutex); @@ -1790,7 +1842,6 @@ nfs_gss_svc_ctx_insert(struct nfs_gss_svc_ctx *cp) void nfs_gss_svc_ctx_timer(__unused void *param1, __unused void *param2) { - struct nfs_gss_svc_ctx_hashhead *head; struct nfs_gss_svc_ctx *cp, *next; uint64_t timenow; int contexts = 0; @@ -1801,19 +1852,17 @@ nfs_gss_svc_ctx_timer(__unused void *param1, __unused void *param2) /* * Scan all the hash chains - * Assume nfs_gss_svc_ctx_mutex is held */ for (i = 0; i < SVC_CTX_HASHSZ; i++) { /* * For each hash chain, look for entries * that haven't been used in a while. */ - head = &nfs_gss_svc_ctx_hashtbl[i]; - for (cp = LIST_FIRST(head); cp; cp = next) { + LIST_FOREACH_SAFE(cp, &nfs_gss_svc_ctx_hashtbl[i], gss_svc_entries, next) { contexts++; - next = LIST_NEXT(cp, gss_svc_entries); - if (timenow > cp->gss_svc_incarnation + - (cp->gss_svc_handle ? GSS_SVC_CTX_TTL : 0)) { + if (timenow > cp->gss_svc_incarnation + + (cp->gss_svc_handle ? 
GSS_SVC_CTX_TTL : 0) + && cp->gss_svc_refcnt == 0) { /* * A stale context - remove it */ @@ -1836,7 +1885,7 @@ nfs_gss_svc_ctx_timer(__unused void *param1, __unused void *param2) nfs_gss_timer_on = nfs_gss_ctx_count > 0; if (nfs_gss_timer_on) nfs_interval_timer_start(nfs_gss_svc_ctx_timer_call, - min(GSS_TIMER_PERIOD, max(GSS_CTX_TTL_MIN, GSS_SVC_CTX_TTL)) * MSECS_PER_SEC); + min(GSS_TIMER_PERIOD, max(GSS_CTX_TTL_MIN, nfsrv_gss_context_ttl)) * MSECS_PER_SEC); lck_mtx_unlock(nfs_gss_svc_ctx_mutex); } @@ -1921,6 +1970,8 @@ nfs_gss_svc_cred_get(struct nfsrv_descript *nd, struct nfsm_chain *nmc) error = ENOMEM; goto nfsmout; } + cp->gss_svc_mtx = lck_mtx_alloc_init(nfs_gss_svc_grp, LCK_ATTR_NULL); + cp->gss_svc_refcnt = 1; } else { /* @@ -1944,7 +1995,7 @@ nfs_gss_svc_cred_get(struct nfsrv_descript *nd, struct nfsm_chain *nmc) ki = &cp->gss_svc_kinfo; if (proc == RPCSEC_GSS_DATA || proc == RPCSEC_GSS_DESTROY) { - struct ucred temp_cred; + struct posix_cred temp_pcred; if (cp->gss_svc_seqwin == 0) { /* @@ -1975,6 +2026,8 @@ nfs_gss_svc_cred_get(struct nfsrv_descript *nd, struct nfsm_chain *nmc) */ nfsm_chain_get_32(error, nmc, flavor); nfsm_chain_get_32(error, nmc, verflen); + if (error) + goto nfsmout; if (flavor != RPCSEC_GSS || verflen != KRB5_SZ_TOKEN(ki->hash_len)) error = NFSERR_AUTHERR | AUTH_BADVERF; nfsm_chain_get_opaque(error, nmc, verflen, tokbuf); @@ -1997,13 +2050,13 @@ nfs_gss_svc_cred_get(struct nfsrv_descript *nd, struct nfsm_chain *nmc) /* * Set up the user's cred */ - bzero(&temp_cred, sizeof(temp_cred)); - temp_cred.cr_uid = cp->gss_svc_uid; - bcopy(cp->gss_svc_gids, temp_cred.cr_groups, + bzero(&temp_pcred, sizeof(temp_pcred)); + temp_pcred.cr_uid = cp->gss_svc_uid; + bcopy(cp->gss_svc_gids, temp_pcred.cr_groups, sizeof(gid_t) * cp->gss_svc_ngroups); - temp_cred.cr_ngroups = cp->gss_svc_ngroups; + temp_pcred.cr_ngroups = cp->gss_svc_ngroups; - nd->nd_cr = kauth_cred_create(&temp_cred); + nd->nd_cr = posix_cred_create(&temp_pcred); if (nd->nd_cr == 
NULL) { error = ENOMEM; goto nfsmout; @@ -2135,12 +2188,21 @@ nfs_gss_svc_cred_get(struct nfsrv_descript *nd, struct nfsm_chain *nmc) nfsm_chain_get_32(error, nmc, verflen); if (error || flavor != RPCAUTH_NULL || verflen > 0) error = NFSERR_AUTHERR | RPCSEC_GSS_CREDPROBLEM; - if (error) + if (error) { + if (proc == RPCSEC_GSS_INIT) { + lck_mtx_destroy(cp->gss_svc_mtx, nfs_gss_svc_grp); + FREE(cp, M_TEMP); + cp = NULL; + } goto nfsmout; + } } nd->nd_gss_context = cp; + return 0; nfsmout: + if (cp) + nfs_gss_svc_ctx_deref(cp); return (error); } @@ -2341,7 +2403,6 @@ int nfs_gss_svc_ctx_init(struct nfsrv_descript *nd, struct nfsrv_sock *slp, mbuf_t *mrepp) { struct nfs_gss_svc_ctx *cp = NULL; - uint32_t handle = 0; int error = 0; int autherr = 0; struct nfsm_chain *nmreq, nmrep; @@ -2355,22 +2416,7 @@ nfs_gss_svc_ctx_init(struct nfsrv_descript *nd, struct nfsrv_sock *slp, mbuf_t * switch (cp->gss_svc_proc) { case RPCSEC_GSS_INIT: - /* - * Give the client a random handle so that - * if we reboot it's unlikely the client - * will get a bad context match. - * Make sure it's not zero, or already assigned. 
- */ - do { - handle = random(); - } while (nfs_gss_svc_ctx_find(handle) != NULL || handle == 0); - cp->gss_svc_handle = handle; - cp->gss_svc_mtx = lck_mtx_alloc_init(nfs_gss_svc_grp, LCK_ATTR_NULL); - clock_interval_to_deadline(GSS_CTX_PEND, NSEC_PER_SEC, - &cp->gss_svc_incarnation); - nfs_gss_svc_ctx_insert(cp); - /* FALLTHRU */ case RPCSEC_GSS_CONTINUE_INIT: @@ -2502,11 +2548,11 @@ nfs_gss_svc_gssd_upcall(struct nfs_gss_svc_ctx *cp) kern_return_t kr; mach_port_t mp; int retry_cnt = 0; - byte_buffer okey = NULL; + gssd_byte_buffer okey = NULL; uint32_t skeylen = 0; uint32_t ret_flags; vm_map_copy_t itoken = NULL; - byte_buffer otoken = NULL; + gssd_byte_buffer otoken = NULL; mach_msg_type_number_t otokenlen; int error = 0; char svcname[] = "nfs"; @@ -2527,7 +2573,7 @@ nfs_gss_svc_gssd_upcall(struct nfs_gss_svc_ctx *cp) retry: kr = mach_gss_accept_sec_context( mp, - (byte_buffer) itoken, (mach_msg_type_number_t) cp->gss_svc_tokenlen, + (gssd_byte_buffer) itoken, (mach_msg_type_number_t) cp->gss_svc_tokenlen, svcname, 0, &cp->gss_svc_context, @@ -2661,6 +2707,24 @@ nfs_gss_svc_seqnum_valid(struct nfs_gss_svc_ctx *cp, uint32_t seq) return (1); } +/* + * Drop a reference to a context + * + * Note that it's OK for the context to exist + * with a refcount of zero. The refcount isn't + * checked until we're about to reap an expired one. 
+ */ +void +nfs_gss_svc_ctx_deref(struct nfs_gss_svc_ctx *cp) +{ + lck_mtx_lock(cp->gss_svc_mtx); + if (cp->gss_svc_refcnt > 0) + cp->gss_svc_refcnt--; + else + printf("nfs_gss_ctx_deref: zero refcount\n"); + lck_mtx_unlock(cp->gss_svc_mtx); +} + /* * Called at NFS server shutdown - destroy all contexts */ @@ -2713,8 +2777,8 @@ extern ipc_port_t ipc_port_copy_send(ipc_port_t); static void task_release_special_port(mach_port_t mp) { - - ipc_port_release_send(mp); + if (IPC_PORT_VALID(mp)) + ipc_port_release_send(mp); } static mach_port_t diff --git a/bsd/nfs/nfs_gss.h b/bsd/nfs/nfs_gss.h index aa6d55e96..ad056e7f2 100644 --- a/bsd/nfs/nfs_gss.h +++ b/bsd/nfs/nfs_gss.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2008 Apple Inc. All rights reserved. + * Copyright (c) 2007-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -50,6 +50,9 @@ enum rpcsec_gss_service { RPCSEC_GSS_SVC_SYS = 4 // sec=sys (fallback) }; +/* encoded krb5 OID */ +extern u_char krb5_mech[11]; + /* * GSS-API things */ @@ -111,8 +114,8 @@ struct nfs_gss_clnt_ctx { mach_port_t gss_clnt_mport; // Mach port for gssd upcall u_char *gss_clnt_verf; // RPC verifier from server char *gss_clnt_svcname; // Service name e.g. 
"nfs/big.apple.com" - gss_cred gss_clnt_cred_handle; // Opaque cred handle from gssd - gss_ctx gss_clnt_context; // Opaque context handle from gssd + gssd_cred gss_clnt_cred_handle; // Opaque cred handle from gssd + gssd_ctx gss_clnt_context; // Opaque context handle from gssd u_char *gss_clnt_token; // GSS token exchanged via gssd & server uint32_t gss_clnt_tokenlen; // Length of token gss_key_info gss_clnt_kinfo; // GSS key info @@ -136,6 +139,7 @@ struct nfs_gss_svc_ctx { lck_mtx_t *gss_svc_mtx; LIST_ENTRY(nfs_gss_svc_ctx) gss_svc_entries; uint32_t gss_svc_handle; // Identifies server context to client + uint32_t gss_svc_refcnt; // Reference count uint32_t gss_svc_proc; // Current GSS proc from cred uid_t gss_svc_uid; // UID of this user gid_t gss_svc_gids[NGROUPS]; // GIDs of this user @@ -144,8 +148,8 @@ struct nfs_gss_svc_ctx { uint32_t gss_svc_seqmax; // Current max GSS sequence number uint32_t gss_svc_seqwin; // GSS sequence number window uint32_t *gss_svc_seqbits; // Bitmap to track seq numbers - gss_cred gss_svc_cred_handle; // Opaque cred handle from gssd - gss_ctx gss_svc_context; // Opaque context handle from gssd + gssd_cred gss_svc_cred_handle; // Opaque cred handle from gssd + gssd_ctx gss_svc_context; // Opaque context handle from gssd u_char *gss_svc_token; // GSS token exchanged via gssd & client uint32_t gss_svc_tokenlen; // Length of token gss_key_info gss_svc_kinfo; // Session key info @@ -184,12 +188,13 @@ int nfs_gss_clnt_args_restore(struct nfsreq *); int nfs_gss_clnt_ctx_renew(struct nfsreq *); void nfs_gss_clnt_ctx_ref(struct nfsreq *, struct nfs_gss_clnt_ctx *); void nfs_gss_clnt_ctx_unref(struct nfsreq *); -void nfs_gss_clnt_ctx_unmount(struct nfsmount *, int); +void nfs_gss_clnt_ctx_unmount(struct nfsmount *); int nfs_gss_svc_cred_get(struct nfsrv_descript *, struct nfsm_chain *); int nfs_gss_svc_verf_put(struct nfsrv_descript *, struct nfsm_chain *); int nfs_gss_svc_ctx_init(struct nfsrv_descript *, struct nfsrv_sock *, mbuf_t *); int 
nfs_gss_svc_prepare_reply(struct nfsrv_descript *, struct nfsm_chain *); int nfs_gss_svc_protect_reply(struct nfsrv_descript *, mbuf_t); +void nfs_gss_svc_ctx_deref(struct nfs_gss_svc_ctx *); void nfs_gss_svc_cleanup(void); __END_DECLS diff --git a/bsd/nfs/nfs_lock.c b/bsd/nfs/nfs_lock.c index 590a70619..f76a9b6d0 100644 --- a/bsd/nfs/nfs_lock.c +++ b/bsd/nfs/nfs_lock.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002-2008 Apple Inc. All rights reserved. + * Copyright (c) 2002-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -95,8 +95,6 @@ extern void ipc_port_release_send(ipc_port_t); -#define OFF_MAX QUAD_MAX - /* * pending lock request messages are kept in this queue which is * kept sorted by transaction ID (xid). @@ -104,28 +102,8 @@ extern void ipc_port_release_send(ipc_port_t); static uint64_t nfs_lockxid = 0; static LOCKD_MSG_QUEUE nfs_pendlockq; -/* - * This structure is used to identify processes which have acquired NFS locks. - * Knowing which processes have ever acquired locks allows us to short-circuit - * unlock requests for processes that have never had an NFS file lock. Thus - * avoiding a costly and unnecessary lockd request. - */ -struct nfs_lock_pid { - TAILQ_ENTRY(nfs_lock_pid) lp_lru; /* LRU list */ - LIST_ENTRY(nfs_lock_pid) lp_hash; /* hash chain */ - int lp_valid; /* valid entry? */ - int lp_time; /* last time seen valid */ - pid_t lp_pid; /* The process ID. 
*/ - struct timeval lp_pid_start; /* Start time of process id */ -}; - -#define NFS_LOCK_PID_HASH_SIZE 64 // XXX tune me -#define NFS_LOCK_PID_HASH(pid) \ - (&nfs_lock_pid_hash_tbl[(pid) & nfs_lock_pid_hash]) -static LIST_HEAD(, nfs_lock_pid) *nfs_lock_pid_hash_tbl; -static TAILQ_HEAD(, nfs_lock_pid) nfs_lock_pid_lru; -static u_long nfs_lock_pid_hash; -static uint32_t nfs_lock_pid_hash_trusted; +/* list of mounts that are (potentially) making lockd requests */ +TAILQ_HEAD(nfs_lockd_mount_list,nfsmount) nfs_lockd_mount_list; static lck_grp_t *nfs_lock_lck_grp; static lck_mtx_t *nfs_lock_mutex; @@ -136,7 +114,6 @@ int nfs_lockdmsg_compare_to_answer(LOCKD_MSG_REQUEST *, struct lockd_ans *); LOCKD_MSG_REQUEST *nfs_lockdmsg_find_by_answer(struct lockd_ans *); LOCKD_MSG_REQUEST *nfs_lockdmsg_find_by_xid(uint64_t); uint64_t nfs_lockxid_get(void); -int nfs_lock_pid_check(proc_t, int); int nfs_lockd_send_request(LOCKD_MSG *, int); /* @@ -146,31 +123,40 @@ void nfs_lockinit(void) { TAILQ_INIT(&nfs_pendlockq); - nfs_lock_pid_hash_trusted = 1; - nfs_lock_pid_hash_tbl = hashinit(NFS_LOCK_PID_HASH_SIZE, - M_TEMP, &nfs_lock_pid_hash); - TAILQ_INIT(&nfs_lock_pid_lru); + TAILQ_INIT(&nfs_lockd_mount_list); nfs_lock_lck_grp = lck_grp_alloc_init("nfs_lock", LCK_GRP_ATTR_NULL); nfs_lock_mutex = lck_mtx_alloc_init(nfs_lock_lck_grp, LCK_ATTR_NULL); } /* - * change the count of NFS mounts that may need to make lockd requests + * Register a mount as (potentially) making lockd requests. + */ +void +nfs_lockd_mount_register(struct nfsmount *nmp) +{ + lck_mtx_lock(nfs_lock_mutex); + TAILQ_INSERT_HEAD(&nfs_lockd_mount_list, nmp, nm_ldlink); + nfs_lockd_mounts++; + lck_mtx_unlock(nfs_lock_mutex); +} + +/* + * Unregister a mount as (potentially) making lockd requests. * - * If the mount count drops to zero, then send a shutdown request to + * When the lockd mount count drops to zero, then send a shutdown request to * lockd if we've sent any requests to it. 
*/ void -nfs_lockd_mount_change(int i) +nfs_lockd_mount_unregister(struct nfsmount *nmp) { + int send_shutdown; mach_port_t lockd_port = IPC_PORT_NULL; kern_return_t kr; - int send_shutdown; lck_mtx_lock(nfs_lock_mutex); - - nfs_lockd_mounts += i; + TAILQ_REMOVE(&nfs_lockd_mount_list, nmp, nm_ldlink); + nfs_lockd_mounts--; /* send a shutdown request if there are no more lockd mounts */ send_shutdown = ((nfs_lockd_mounts == 0) && nfs_lockd_request_sent); @@ -183,7 +169,7 @@ nfs_lockd_mount_change(int i) return; /* - * Let lockd know that it is no longer need for any NFS mounts + * Let lockd know that it is no longer needed for any NFS mounts */ kr = host_get_lockd_port(host_priv_self(), &lockd_port); if ((kr != KERN_SUCCESS) || !IPC_PORT_VALID(lockd_port)) { @@ -204,7 +190,7 @@ nfs_lockd_mount_change(int i) * insert a lock request message into the pending queue * (nfs_lock_mutex must be held) */ -inline void +void nfs_lockdmsg_enqueue(LOCKD_MSG_REQUEST *msgreq) { LOCKD_MSG_REQUEST *mr; @@ -230,7 +216,7 @@ nfs_lockdmsg_enqueue(LOCKD_MSG_REQUEST *msgreq) * remove a lock request message from the pending queue * (nfs_lock_mutex must be held) */ -inline void +void nfs_lockdmsg_dequeue(LOCKD_MSG_REQUEST *msgreq) { TAILQ_REMOVE(&nfs_pendlockq, msgreq, lmr_next); @@ -248,7 +234,7 @@ nfs_lockdmsg_dequeue(LOCKD_MSG_REQUEST *msgreq) * * (nfs_lock_mutex must be held) */ -inline LOCKD_MSG_REQUEST * +LOCKD_MSG_REQUEST * nfs_lockdmsg_find_by_xid(uint64_t lockxid) { LOCKD_MSG_REQUEST *mr; @@ -264,8 +250,8 @@ nfs_lockdmsg_find_by_xid(uint64_t lockxid) /* * Because we can't depend on nlm_granted messages containing the same - * cookie we sent with the original lock request, we need code test if - * an nlm_granted answer matches the lock request. We also need code + * cookie we sent with the original lock request, we need code to test + * if an nlm_granted answer matches the lock request. We also need code * that can find a lockd message based solely on the nlm_granted answer. 
*/ @@ -274,7 +260,7 @@ nfs_lockdmsg_find_by_xid(uint64_t lockxid) * * returns 0 on equality and 1 if different */ -inline int +int nfs_lockdmsg_compare_to_answer(LOCKD_MSG_REQUEST *msgreq, struct lockd_ans *ansp) { if (!(ansp->la_flags & LOCKD_ANS_LOCK_INFO)) @@ -307,7 +293,7 @@ nfs_lockdmsg_compare_to_answer(LOCKD_MSG_REQUEST *msgreq, struct lockd_ans *ansp * * (nfs_lock_mutex must be held) */ -inline LOCKD_MSG_REQUEST * +LOCKD_MSG_REQUEST * nfs_lockdmsg_find_by_answer(struct lockd_ans *ansp) { LOCKD_MSG_REQUEST *mr; @@ -325,7 +311,7 @@ nfs_lockdmsg_find_by_answer(struct lockd_ans *ansp) * return the next unique lock request transaction ID * (nfs_lock_mutex must be held) */ -inline uint64_t +uint64_t nfs_lockxid_get(void) { LOCKD_MSG_REQUEST *mr; @@ -359,143 +345,6 @@ nfs_lockxid_get(void) return nfs_lockxid; } - -/* - * Check the nfs_lock_pid hash table for an entry and, if requested, - * add the entry if it is not found. - * - * (Also, if adding, try to clean up some stale entries.) - * (nfs_lock_mutex must be held) - */ -int -nfs_lock_pid_check(proc_t p, int addflag) -{ - struct nfs_lock_pid *lp, *lplru, *lplru_next, *mlp; - TAILQ_HEAD(, nfs_lock_pid) nfs_lock_pid_free; - proc_t plru = PROC_NULL; - pid_t pid; - int error = 0; - struct timeval now; - - TAILQ_INIT(&nfs_lock_pid_free); - mlp = NULL; - -loop: - /* Search hash chain */ - pid = proc_pid(p); - error = ENOENT; - lp = NFS_LOCK_PID_HASH(pid)->lh_first; - for (; lp != NULL; lp = lp->lp_hash.le_next) - if (lp->lp_pid == pid) { - /* found pid... 
*/ - if (timevalcmp(&lp->lp_pid_start, &p->p_start, ==)) { - /* ...and it's valid */ - /* move to tail of LRU */ - TAILQ_REMOVE(&nfs_lock_pid_lru, lp, lp_lru); - microuptime(&now); - lp->lp_time = now.tv_sec; - TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lp, lp_lru); - error = 0; - break; - } - /* ...but it's no longer valid */ - /* remove from hash, invalidate, and move to lru head */ - LIST_REMOVE(lp, lp_hash); - lp->lp_valid = 0; - TAILQ_REMOVE(&nfs_lock_pid_lru, lp, lp_lru); - TAILQ_INSERT_HEAD(&nfs_lock_pid_lru, lp, lp_lru); - lp = NULL; - break; - } - - /* if we didn't find it (valid), use any newly allocated one */ - if (!lp) - lp = mlp; - - /* if we don't have an lp and we've been asked to add it */ - if ((error == ENOENT) && addflag && !lp) { - /* scan lru list for invalid, stale entries to reuse/free */ - int lrucnt = 0; - microuptime(&now); - for (lplru = TAILQ_FIRST(&nfs_lock_pid_lru); lplru; lplru = lplru_next) { - lplru_next = TAILQ_NEXT(lplru, lp_lru); - if (lplru->lp_valid && (lplru->lp_time >= (now.tv_sec - 2))) { - /* - * If the oldest LRU entry is relatively new, then don't - * bother scanning any further. 
- */ - break; - } - /* remove entry from LRU, and check if it's still in use */ - TAILQ_REMOVE(&nfs_lock_pid_lru, lplru, lp_lru); - if (!lplru->lp_valid || !(plru = proc_find(lplru->lp_pid)) || - timevalcmp(&lplru->lp_pid_start, &plru->p_start, !=)) { - if (plru != PROC_NULL) { - proc_rele(plru); - plru = PROC_NULL; - } - /* no longer in use */ - LIST_REMOVE(lplru, lp_hash); - if (!lp) { - /* we'll reuse this one */ - lp = lplru; - } else { - /* queue it up for freeing */ - TAILQ_INSERT_HEAD(&nfs_lock_pid_free, lplru, lp_lru); - } - } else { - /* still in use */ - if (plru != PROC_NULL) { - proc_rele(plru); - plru = PROC_NULL; - } - lplru->lp_time = now.tv_sec; - TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lplru, lp_lru); - } - /* don't check too many entries at once */ - if (++lrucnt > 8) - break; - } - if (!lp) { - /* we need to allocate a new one */ - lck_mtx_unlock(nfs_lock_mutex); - MALLOC(mlp, struct nfs_lock_pid *, sizeof(struct nfs_lock_pid), - M_TEMP, M_WAITOK | M_ZERO); - lck_mtx_lock(nfs_lock_mutex); - if (mlp) /* make sure somebody hasn't already added this guy */ - goto loop; - error = ENOMEM; - } - } - if ((error == ENOENT) && addflag && lp) { - /* (re)initialize nfs_lock_pid info */ - lp->lp_pid = pid; - lp->lp_pid_start = p->p_start; - /* insert pid in hash */ - LIST_INSERT_HEAD(NFS_LOCK_PID_HASH(lp->lp_pid), lp, lp_hash); - lp->lp_valid = 1; - lp->lp_time = now.tv_sec; - TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lp, lp_lru); - error = 0; - } - - if ((mlp && (lp != mlp)) || TAILQ_FIRST(&nfs_lock_pid_free)) { - lck_mtx_unlock(nfs_lock_mutex); - if (mlp && (lp != mlp)) { - /* we didn't need this one, so we can free it */ - FREE(mlp, M_TEMP); - } - /* free up any stale entries */ - while ((lp = TAILQ_FIRST(&nfs_lock_pid_free))) { - TAILQ_REMOVE(&nfs_lock_pid_free, lp, lp_lru); - FREE(lp, M_TEMP); - } - lck_mtx_lock(nfs_lock_mutex); - } - - return (error); -} - #define MACH_MAX_TRIES 3 int @@ -551,186 +400,49 @@ nfs_lockd_send_request(LOCKD_MSG *msg, int 
interruptable) * NFS advisory byte-level locks (client) */ int -nfs3_vnop_advlock( - struct vnop_advlock_args /* { - struct vnodeop_desc *a_desc; - vnode_t a_vp; - caddr_t a_id; - int a_op; - struct flock *a_fl; - int a_flags; - vfs_context_t a_context; - } */ *ap) +nfs3_lockd_request( + nfsnode_t np, + int type, + LOCKD_MSG_REQUEST *msgreq, + int flags, + thread_t thd) { - vfs_context_t ctx; - proc_t p; - LOCKD_MSG_REQUEST msgreq; - LOCKD_MSG *msg; - vnode_t vp; - nfsnode_t np; + LOCKD_MSG *msg = &msgreq->lmr_msg; int error, error2; - int interruptable, modified; - struct flock *fl; + int interruptable, slpflag; struct nfsmount *nmp; - struct nfs_vattr nvattr; - off_t start, end; struct timeval now; - int timeo, endtime, lastmsg, wentdown = 0; - int lockpidcheck, nfsvers; - struct sockaddr *saddr; + int timeo, starttime, endtime, lastmsg, wentdown = 0; struct timespec ts; + struct sockaddr *saddr; - ctx = ap->a_context; - p = vfs_context_proc(ctx); - vp = ap->a_vp; - fl = ap->a_fl; - np = VTONFS(vp); - - nmp = VTONMP(vp); - if (!nmp) - return (ENXIO); - lck_mtx_lock(&nmp->nm_lock); - if (nmp->nm_flag & NFSMNT_NOLOCKS) { - lck_mtx_unlock(&nmp->nm_lock); - return (ENOTSUP); - } - nfsvers = nmp->nm_vers; - lck_mtx_unlock(&nmp->nm_lock); - - /* - * The NLM protocol doesn't allow the server to return an error - * on ranges, so we do it. Pre LFS (Large File Summit) - * standards required EINVAL for the range errors. More recent - * standards use EOVERFLOW, but their EINVAL wording still - * encompasses these errors. - * Any code sensitive to this is either: - * 1) written pre-LFS and so can handle only EINVAL, or - * 2) written post-LFS and thus ought to be tolerant of pre-LFS - * implementations. - * Since returning EOVERFLOW certainly breaks 1), we return EINVAL. 
- */ - if (fl->l_whence != SEEK_END) { - if ((fl->l_whence != SEEK_CUR && fl->l_whence != SEEK_SET) || - fl->l_start < 0 || - (fl->l_len > 0 && fl->l_len - 1 > OFF_MAX - fl->l_start) || - (fl->l_len < 0 && fl->l_start + fl->l_len < 0)) - return (EINVAL); - } - - lck_mtx_lock(nfs_lock_mutex); - - /* - * Need to check if this process has successfully acquired an NFS lock before. - * If not, and this is an unlock request we can simply return success here. - */ - lockpidcheck = nfs_lock_pid_check(p, 0); - lck_mtx_unlock(nfs_lock_mutex); - if (lockpidcheck) { - if (lockpidcheck != ENOENT) - return (lockpidcheck); - if ((ap->a_op == F_UNLCK) && nfs_lock_pid_hash_trusted) - return (0); - } - - /* - * The NFS Lock Manager protocol doesn't directly handle - * negative lengths or SEEK_END, so we need to normalize - * things here where we have all the info. - * (Note: SEEK_CUR is already adjusted for at this point) - */ - /* Convert the flock structure into a start and end. */ - switch (fl->l_whence) { - case SEEK_SET: - case SEEK_CUR: - /* - * Caller is responsible for adding any necessary offset - * to fl->l_start when SEEK_CUR is used. 
- */ - start = fl->l_start; - break; - case SEEK_END: - /* need to flush, and refetch attributes to make */ - /* sure we have the correct end of file offset */ - if ((error = nfs_node_lock(np))) - return (error); - modified = (np->n_flag & NMODIFIED); - nfs_node_unlock(np); - if (modified && ((error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1)))) - return (error); - if ((error = nfs_getattr(np, &nvattr, ctx, NGA_UNCACHED))) - return (error); - nfs_data_lock(np, NFS_DATA_LOCK_SHARED); - start = np->n_size + fl->l_start; - nfs_data_unlock(np); - break; - default: - return (EINVAL); - } - if (fl->l_len == 0) - end = -1; - else if (fl->l_len > 0) - end = start + fl->l_len - 1; - else { /* l_len is negative */ - end = start - 1; - start += fl->l_len; - } - if (start < 0) - return (EINVAL); - - if ((nfsvers == NFS_VER2) && - ((start >= 0x80000000) || (end >= 0x80000000))) - return (EINVAL); - - /* - * Fill in the information structure. - * We set all values to zero with bzero to clear - * out any information in the sockaddr_storage - * and nfs_filehandle contained in msgreq so that - * we will not leak extraneous information out of - * the kernel when calling up to lockd via our mig - * generated routine. - */ - bzero(&msgreq, sizeof(msgreq)); - msg = &msgreq.lmr_msg; - msg->lm_version = LOCKD_MSG_VERSION; - msg->lm_flags = 0; - - msg->lm_fl = *fl; - msg->lm_fl.l_start = start; - if (end != -1) - msg->lm_fl.l_len = end - start + 1; - msg->lm_fl.l_pid = vfs_context_pid(ctx); - - if (ap->a_flags & F_WAIT) - msg->lm_flags |= LOCKD_MSG_BLOCK; - if (ap->a_op == F_GETLK) - msg->lm_flags |= LOCKD_MSG_TEST; - - nmp = VTONMP(vp); - if (!nmp) + nmp = NFSTONMP(np); + if (!nmp || !nmp->nm_saddr) return (ENXIO); lck_mtx_lock(&nmp->nm_lock); - saddr = mbuf_data(nmp->nm_nam); + saddr = nmp->nm_saddr; bcopy(saddr, &msg->lm_addr, min(sizeof msg->lm_addr, saddr->sa_len)); - msg->lm_fh_len = (nfsvers == NFS_VER2) ? 
NFSX_V2FH : np->n_fhsize; - bcopy(np->n_fhp, msg->lm_fh, msg->lm_fh_len); - if (nfsvers == NFS_VER3) + if (nmp->nm_vers == NFS_VER3) msg->lm_flags |= LOCKD_MSG_NFSV3; - cru2x(vfs_context_ucred(ctx), &msg->lm_cred); +#if 0 /* not yet */ + if (nmp->nm_sotype != SOCK_DGRAM) + msg->lm_flags |= LOCKD_MSG_TCP; +#endif microuptime(&now); + starttime = now.tv_sec; lastmsg = now.tv_sec - ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay)); - interruptable = nmp->nm_flag & NFSMNT_INT; + interruptable = NMFLAG(nmp, INTR); lck_mtx_unlock(&nmp->nm_lock); lck_mtx_lock(nfs_lock_mutex); /* allocate unique xid */ msg->lm_xid = nfs_lockxid_get(); - nfs_lockdmsg_enqueue(&msgreq); + nfs_lockdmsg_enqueue(msgreq); - timeo = 2; + timeo = 4; for (;;) { nfs_lockd_request_sent = 1; @@ -751,7 +463,7 @@ nfs3_vnop_advlock( * Retry if it takes too long to get a response. * * The timeout numbers were picked out of thin air... they start - * at 2 and double each timeout with a max of 60 seconds. + * at 4 and double each timeout with a max of 30 seconds. * * In order to maintain responsiveness, we pass a small timeout * to msleep and calculate the timeouts ourselves. This allows @@ -759,15 +471,18 @@ nfs3_vnop_advlock( */ wait_for_granted: error = EWOULDBLOCK; + slpflag = (interruptable && (type != F_UNLCK)) ? PCATCH : 0; ts.tv_sec = 2; ts.tv_nsec = 0; microuptime(&now); endtime = now.tv_sec + timeo; while (now.tv_sec < endtime) { error = error2 = 0; - if (!msgreq.lmr_answered) - error = msleep(&msgreq, nfs_lock_mutex, PCATCH | PUSER, "lockd", &ts); - if (msgreq.lmr_answered) { + if (!msgreq->lmr_answered) { + error = msleep(msgreq, nfs_lock_mutex, slpflag | PUSER, "lockd", &ts); + slpflag = 0; + } + if (msgreq->lmr_answered) { /* * Note: it's possible to have a lock granted at * essentially the same time that we get interrupted. @@ -775,8 +490,8 @@ nfs3_vnop_advlock( * error from this request or we might not unlock the * lock that's been granted. 
*/ - nmp = VTONMP(vp); - if ((msgreq.lmr_errno == ENOTSUP) && nmp && + nmp = NFSTONMP(np); + if ((msgreq->lmr_errno == ENOTSUP) && nmp && (nmp->nm_state & NFSSTA_LOCKSWORK)) { /* * We have evidence that locks work, yet lockd @@ -797,58 +512,81 @@ nfs3_vnop_advlock( break; /* check that we still have our mount... */ /* ...and that we still support locks */ - nmp = VTONMP(vp); - if ((error2 = nfs_sigintr(nmp, NULL, vfs_context_thread(ctx), 0))) { + /* ...and that there isn't a recovery pending */ + nmp = NFSTONMP(np); + if ((error2 = nfs_sigintr(nmp, NULL, NULL, 0))) { error = error2; - if (fl->l_type == F_UNLCK) - printf("nfs_vnop_advlock: aborting unlock request, error %d\n", error); + if (type == F_UNLCK) + printf("nfs3_lockd_request: aborting unlock request, error %d\n", error); break; } lck_mtx_lock(&nmp->nm_lock); - if (nmp->nm_flag & NFSMNT_NOLOCKS) { + if (nmp->nm_lockmode == NFS_LOCK_MODE_DISABLED) { + lck_mtx_unlock(&nmp->nm_lock); + break; + } + if ((nmp->nm_state & NFSSTA_RECOVER) && !(flags & R_RECOVER)) { + /* recovery pending... return an error that'll get this operation restarted */ + error = NFSERR_GRACE; lck_mtx_unlock(&nmp->nm_lock); break; } - interruptable = nmp->nm_flag & NFSMNT_INT; + interruptable = NMFLAG(nmp, INTR); lck_mtx_unlock(&nmp->nm_lock); microuptime(&now); } if (error) { /* check that we still have our mount... 
*/ - nmp = VTONMP(vp); - if ((error2 = nfs_sigintr(nmp, NULL, vfs_context_thread(ctx), 0))) { + nmp = NFSTONMP(np); + if ((error2 = nfs_sigintr(nmp, NULL, NULL, 0))) { error = error2; if (error2 != EINTR) { - if (fl->l_type == F_UNLCK) - printf("nfs_vnop_advlock: aborting unlock request, error %d\n", error); + if (type == F_UNLCK) + printf("nfs3_lockd_request: aborting unlock request, error %d\n", error); break; } } /* ...and that we still support locks */ lck_mtx_lock(&nmp->nm_lock); - if (nmp->nm_flag & NFSMNT_NOLOCKS) { + if (nmp->nm_lockmode == NFS_LOCK_MODE_DISABLED) { if (error == EWOULDBLOCK) error = ENOTSUP; lck_mtx_unlock(&nmp->nm_lock); break; } - interruptable = nmp->nm_flag & NFSMNT_INT; - if (error != EWOULDBLOCK) { + /* ...and that there isn't a recovery pending */ + if ((error == EWOULDBLOCK) && (nmp->nm_state & NFSSTA_RECOVER) && !(flags & R_RECOVER)) { + /* recovery pending... return to allow recovery to occur */ + error = NFSERR_DENIED; + lck_mtx_unlock(&nmp->nm_lock); + break; + } + interruptable = NMFLAG(nmp, INTR); + if ((error != EWOULDBLOCK) || + ((nmp->nm_state & NFSSTA_RECOVER) && !(flags & R_RECOVER)) || + ((flags & R_RECOVER) && ((now.tv_sec - starttime) > 30))) { + if ((error == EWOULDBLOCK) && (flags & R_RECOVER)) { + /* give up if this is for recovery and taking too long */ + error = ETIMEDOUT; + } else if ((nmp->nm_state & NFSSTA_RECOVER) && !(flags & R_RECOVER)) { + /* recovery pending... return an error that'll get this operation restarted */ + error = NFSERR_GRACE; + } lck_mtx_unlock(&nmp->nm_lock); /* * We're going to bail on this request. * If we were a blocked lock request, send a cancel. 
*/ - if ((msgreq.lmr_errno == EINPROGRESS) && + if ((msgreq->lmr_errno == EINPROGRESS) && !(msg->lm_flags & LOCKD_MSG_CANCEL)) { /* set this request up as a cancel */ msg->lm_flags |= LOCKD_MSG_CANCEL; - nfs_lockdmsg_dequeue(&msgreq); + nfs_lockdmsg_dequeue(msgreq); msg->lm_xid = nfs_lockxid_get(); - nfs_lockdmsg_enqueue(&msgreq); - msgreq.lmr_saved_errno = error; - msgreq.lmr_errno = 0; - msgreq.lmr_answered = 0; + nfs_lockdmsg_enqueue(msgreq); + msgreq->lmr_saved_errno = error; + msgreq->lmr_errno = 0; + msgreq->lmr_answered = 0; /* reset timeout */ timeo = 2; /* send cancel request */ @@ -859,18 +597,18 @@ nfs3_vnop_advlock( /* warn if we're not getting any response */ microuptime(&now); - if ((msgreq.lmr_errno != EINPROGRESS) && + if ((msgreq->lmr_errno != EINPROGRESS) && !(msg->lm_flags & LOCKD_MSG_DENIED_GRACE) && (nmp->nm_tprintf_initial_delay != 0) && ((lastmsg + nmp->nm_tprintf_delay) < now.tv_sec)) { lck_mtx_unlock(&nmp->nm_lock); lastmsg = now.tv_sec; - nfs_down(nmp, vfs_context_thread(ctx), 0, NFSSTA_LOCKTIMEO, "lockd not responding"); + nfs_down(nmp, thd, 0, NFSSTA_LOCKTIMEO, "lockd not responding"); wentdown = 1; } else lck_mtx_unlock(&nmp->nm_lock); - if (msgreq.lmr_errno == EINPROGRESS) { + if (msgreq->lmr_errno == EINPROGRESS) { /* * We've got a blocked lock request that we are * going to retry. First, we'll want to try to @@ -883,95 +621,63 @@ nfs3_vnop_advlock( * it is NLM_BLOCKED). 
*/ msg->lm_flags |= LOCKD_MSG_CANCEL; - nfs_lockdmsg_dequeue(&msgreq); + nfs_lockdmsg_dequeue(msgreq); msg->lm_xid = nfs_lockxid_get(); - nfs_lockdmsg_enqueue(&msgreq); - msgreq.lmr_saved_errno = msgreq.lmr_errno; - msgreq.lmr_errno = 0; - msgreq.lmr_answered = 0; + nfs_lockdmsg_enqueue(msgreq); + msgreq->lmr_saved_errno = msgreq->lmr_errno; + msgreq->lmr_errno = 0; + msgreq->lmr_answered = 0; timeo = 2; /* send cancel then resend request */ continue; } - if (msg->lm_flags & LOCKD_MSG_DENIED_GRACE) { - /* - * Time to resend a request previously denied due to a grace period. - */ - msg->lm_flags &= ~LOCKD_MSG_DENIED_GRACE; - nfs_lockdmsg_dequeue(&msgreq); - msg->lm_xid = nfs_lockxid_get(); - nfs_lockdmsg_enqueue(&msgreq); - msgreq.lmr_saved_errno = 0; - msgreq.lmr_errno = 0; - msgreq.lmr_answered = 0; - timeo = 2; - /* resend request */ - continue; - } - /* * We timed out, so we will resend the request. */ - timeo *= 2; - if (timeo > 60) - timeo = 60; + if (!(flags & R_RECOVER)) + timeo *= 2; + if (timeo > 30) + timeo = 30; /* resend request */ continue; } /* we got a reponse, so the server's lockd is OK */ - nfs_up(VTONMP(vp), vfs_context_thread(ctx), NFSSTA_LOCKTIMEO, + nfs_up(NFSTONMP(np), thd, NFSSTA_LOCKTIMEO, wentdown ? "lockd alive again" : NULL); wentdown = 0; - if (msgreq.lmr_answered && (msg->lm_flags & LOCKD_MSG_DENIED_GRACE)) { + if (msgreq->lmr_answered && (msg->lm_flags & LOCKD_MSG_DENIED_GRACE)) { /* * The lock request was denied because the server lockd is * still in its grace period. So, we need to try the - * request again in a little bit. + * request again in a little bit. Return the GRACE error so + * the higher levels can perform the retry. 
*/ - timeo = 4; - msgreq.lmr_answered = 0; - goto wait_for_granted; + msgreq->lmr_saved_errno = msgreq->lmr_errno = error = NFSERR_GRACE; } - if (msgreq.lmr_errno == EINPROGRESS) { + if (msgreq->lmr_errno == EINPROGRESS) { /* got NLM_BLOCKED response */ /* need to wait for NLM_GRANTED */ - timeo = 60; - msgreq.lmr_answered = 0; + timeo = 30; + msgreq->lmr_answered = 0; goto wait_for_granted; } if ((msg->lm_flags & LOCKD_MSG_CANCEL) && - (msgreq.lmr_saved_errno == EINPROGRESS)) { + (msgreq->lmr_saved_errno == EINPROGRESS)) { /* * We just got a successful reply to the * cancel of the previous blocked lock request. - * Now, go ahead and resend the request. + * Now, go ahead and return a DENIED error so the + * higher levels can resend the request. */ msg->lm_flags &= ~LOCKD_MSG_CANCEL; - nfs_lockdmsg_dequeue(&msgreq); - msg->lm_xid = nfs_lockxid_get(); - nfs_lockdmsg_enqueue(&msgreq); - msgreq.lmr_saved_errno = 0; - msgreq.lmr_errno = 0; - msgreq.lmr_answered = 0; - timeo = 2; - /* resend request */ - continue; - } - - if ((msg->lm_flags & LOCKD_MSG_TEST) && msgreq.lmr_errno == 0) { - if (msg->lm_fl.l_type != F_UNLCK) { - fl->l_type = msg->lm_fl.l_type; - fl->l_pid = msg->lm_fl.l_pid; - fl->l_start = msg->lm_fl.l_start; - fl->l_len = msg->lm_fl.l_len; - fl->l_whence = SEEK_SET; - } else - fl->l_type = F_UNLCK; + nfs_lockdmsg_dequeue(msgreq); + error = NFSERR_DENIED; + break; } /* @@ -981,11 +687,12 @@ nfs3_vnop_advlock( */ if (msg->lm_flags & LOCKD_MSG_CANCEL) { msg->lm_flags &= ~LOCKD_MSG_CANCEL; - error = msgreq.lmr_saved_errno; - } else - error = msgreq.lmr_errno; + error = msgreq->lmr_saved_errno; + } else { + error = msgreq->lmr_errno; + } - nmp = VTONMP(vp); + nmp = NFSTONMP(np); if ((error == ENOTSUP) && nmp && !(nmp->nm_state & NFSSTA_LOCKSWORK)) { /* * We have NO evidence that locks work and lockd @@ -993,12 +700,18 @@ nfs3_vnop_advlock( * that locks aren't supported and disable them * for this mount. 
*/ + nfs_lockdmsg_dequeue(msgreq); + lck_mtx_unlock(nfs_lock_mutex); lck_mtx_lock(&nmp->nm_lock); - nmp->nm_flag |= NFSMNT_NOLOCKS; + if (nmp->nm_lockmode == NFS_LOCK_MODE_ENABLED) { + nmp->nm_lockmode = NFS_LOCK_MODE_DISABLED; + nfs_lockd_mount_unregister(nmp); + } nmp->nm_state &= ~NFSSTA_LOCKTIMEO; lck_mtx_unlock(&nmp->nm_lock); printf("lockd returned ENOTSUP, disabling locks for nfs server: %s\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname); + return (error); } if (!error) { /* record that NFS file locking has worked on this mount */ @@ -1008,35 +721,162 @@ nfs3_vnop_advlock( nmp->nm_state |= NFSSTA_LOCKSWORK; lck_mtx_unlock(&nmp->nm_lock); } - /* - * If we successfully acquired a lock, make sure this pid - * is in the nfs_lock_pid hash table so we know we can't - * short-circuit unlock requests. - */ - if ((lockpidcheck == ENOENT) && - ((ap->a_op == F_SETLK) || (ap->a_op == F_SETLKW))) { - error = nfs_lock_pid_check(p, 1); - if (error) { - /* - * We couldn't add the pid to the table, - * so we can no longer trust that a pid - * not in the table has no locks. 
- */ - nfs_lock_pid_hash_trusted = 0; - printf("nfs_vnop_advlock: pid add failed - no longer trusted\n"); - } - } } break; } - nfs_lockdmsg_dequeue(&msgreq); + nfs_lockdmsg_dequeue(msgreq); lck_mtx_unlock(nfs_lock_mutex); return (error); } +/* + * Send an NLM LOCK message to the server + */ +int +nfs3_setlock_rpc( + nfsnode_t np, + struct nfs_open_file *nofp, + struct nfs_file_lock *nflp, + int reclaim, + int flags, + thread_t thd, + kauth_cred_t cred) +{ + struct nfs_lock_owner *nlop = nflp->nfl_owner; + struct nfsmount *nmp; + int error; + LOCKD_MSG_REQUEST msgreq; + LOCKD_MSG *msg; + + nmp = NFSTONMP(np); + if (!nmp) + return (ENXIO); + + if (!nlop->nlo_open_owner) { + nfs_open_owner_ref(nofp->nof_owner); + nlop->nlo_open_owner = nofp->nof_owner; + } + if ((error = nfs_lock_owner_set_busy(nlop, thd))) + return (error); + + /* set up lock message request structure */ + bzero(&msgreq, sizeof(msgreq)); + msg = &msgreq.lmr_msg; + msg->lm_version = LOCKD_MSG_VERSION; + if ((nflp->nfl_flags & NFS_FILE_LOCK_WAIT) && !reclaim) + msg->lm_flags |= LOCKD_MSG_BLOCK; + if (reclaim) + msg->lm_flags |= LOCKD_MSG_RECLAIM; + msg->lm_fh_len = (nmp->nm_vers == NFS_VER2) ? 
NFSX_V2FH : np->n_fhsize; + bcopy(np->n_fhp, msg->lm_fh, msg->lm_fh_len); + cru2x(cred, &msg->lm_cred); + + msg->lm_fl.l_whence = SEEK_SET; + msg->lm_fl.l_start = nflp->nfl_start; + msg->lm_fl.l_len = NFS_FLOCK_LENGTH(nflp->nfl_start, nflp->nfl_end); + msg->lm_fl.l_type = nflp->nfl_type; + msg->lm_fl.l_pid = nlop->nlo_pid; + + error = nfs3_lockd_request(np, 0, &msgreq, flags, thd); + + nfs_lock_owner_clear_busy(nlop); + return (error); +} + +/* + * Send an NLM UNLOCK message to the server + */ +int +nfs3_unlock_rpc( + nfsnode_t np, + struct nfs_lock_owner *nlop, + __unused int type, + uint64_t start, + uint64_t end, + int flags, + thread_t thd, + kauth_cred_t cred) +{ + struct nfsmount *nmp; + LOCKD_MSG_REQUEST msgreq; + LOCKD_MSG *msg; + + nmp = NFSTONMP(np); + if (!nmp) + return (ENXIO); + + /* set up lock message request structure */ + bzero(&msgreq, sizeof(msgreq)); + msg = &msgreq.lmr_msg; + msg->lm_version = LOCKD_MSG_VERSION; + msg->lm_fh_len = (nmp->nm_vers == NFS_VER2) ? NFSX_V2FH : np->n_fhsize; + bcopy(np->n_fhp, msg->lm_fh, msg->lm_fh_len); + cru2x(cred, &msg->lm_cred); + + msg->lm_fl.l_whence = SEEK_SET; + msg->lm_fl.l_start = start; + msg->lm_fl.l_len = NFS_FLOCK_LENGTH(start, end); + msg->lm_fl.l_type = F_UNLCK; + msg->lm_fl.l_pid = nlop->nlo_pid; + + return (nfs3_lockd_request(np, F_UNLCK, &msgreq, flags, thd)); +} + +/* + * Send an NLM LOCK TEST message to the server + */ +int +nfs3_getlock_rpc( + nfsnode_t np, + struct nfs_lock_owner *nlop, + struct flock *fl, + uint64_t start, + uint64_t end, + vfs_context_t ctx) +{ + struct nfsmount *nmp; + int error; + LOCKD_MSG_REQUEST msgreq; + LOCKD_MSG *msg; + + nmp = NFSTONMP(np); + if (!nmp) + return (ENXIO); + + /* set up lock message request structure */ + bzero(&msgreq, sizeof(msgreq)); + msg = &msgreq.lmr_msg; + msg->lm_version = LOCKD_MSG_VERSION; + msg->lm_flags |= LOCKD_MSG_TEST; + msg->lm_fh_len = (nmp->nm_vers == NFS_VER2) ? 
NFSX_V2FH : np->n_fhsize; + bcopy(np->n_fhp, msg->lm_fh, msg->lm_fh_len); + cru2x(vfs_context_ucred(ctx), &msg->lm_cred); + + msg->lm_fl.l_whence = SEEK_SET; + msg->lm_fl.l_start = start; + msg->lm_fl.l_len = NFS_FLOCK_LENGTH(start, end); + msg->lm_fl.l_type = fl->l_type; + msg->lm_fl.l_pid = nlop->nlo_pid; + + error = nfs3_lockd_request(np, 0, &msgreq, 0, vfs_context_thread(ctx)); + + if (!error && (msg->lm_flags & LOCKD_MSG_TEST) && !msgreq.lmr_errno) { + if (msg->lm_fl.l_type != F_UNLCK) { + fl->l_type = msg->lm_fl.l_type; + fl->l_pid = msg->lm_fl.l_pid; + fl->l_start = msg->lm_fl.l_start; + fl->l_len = msg->lm_fl.l_len; + fl->l_whence = SEEK_SET; + } else + fl->l_type = F_UNLCK; + } + + return (error); +} + /* * nfslockdans -- * NFS advisory byte-level locks answer from the lock daemon. @@ -1105,3 +945,58 @@ nfslockdans(proc_t p, struct lockd_ans *ansp) return (0); } +/* + * nfslockdnotify -- + * NFS host restart notification from the lock daemon. + * + * Used to initiate reclaiming of held locks when a server we + * have mounted reboots. + */ +int +nfslockdnotify(proc_t p, user_addr_t argp) +{ + int error, i, headsize; + struct lockd_notify ln; + struct nfsmount *nmp; + struct sockaddr *saddr; + + /* Let root make this call. 
*/ + error = proc_suser(p); + if (error) + return (error); + + headsize = (char*)&ln.ln_addr[0] - (char*)&ln.ln_version; + error = copyin(argp, &ln, headsize); + if (error) + return (error); + if (ln.ln_version != LOCKD_NOTIFY_VERSION) + return (EINVAL); + if ((ln.ln_addrcount < 1) || (ln.ln_addrcount > 128)) + return (EINVAL); + argp += headsize; + saddr = (struct sockaddr *)&ln.ln_addr[0]; + + lck_mtx_lock(nfs_lock_mutex); + + for (i=0; i < ln.ln_addrcount; i++) { + error = copyin(argp, &ln.ln_addr[0], sizeof(ln.ln_addr[0])); + if (error) + break; + argp += sizeof(ln.ln_addr[0]); + /* scan lockd mount list for match to this address */ + TAILQ_FOREACH(nmp, &nfs_lockd_mount_list, nm_ldlink) { + /* check if address matches this mount's server address */ + if (!nmp->nm_saddr || nfs_sockaddr_cmp(saddr, nmp->nm_saddr)) + continue; + /* We have a match! Mark it as needing recovery. */ + lck_mtx_lock(&nmp->nm_lock); + nfs_need_recover(nmp, 0); + lck_mtx_unlock(&nmp->nm_lock); + } + } + + lck_mtx_unlock(nfs_lock_mutex); + + return (error); +} + diff --git a/bsd/nfs/nfs_lock.h b/bsd/nfs/nfs_lock.h index 7bd4e91a8..5a5efe3e4 100644 --- a/bsd/nfs/nfs_lock.h +++ b/bsd/nfs/nfs_lock.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002-2008 Apple Inc. All rights reserved. + * Copyright (c) 2002-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -91,6 +91,8 @@ typedef struct nfs_lock_msg { #define LOCKD_MSG_NFSV3 0x0004 /* NFSv3 request */ #define LOCKD_MSG_CANCEL 0x0008 /* cancelling blocked request */ #define LOCKD_MSG_DENIED_GRACE 0x0010 /* lock denied due to grace period */ +#define LOCKD_MSG_RECLAIM 0x0020 /* lock reclaim request */ +#define LOCKD_MSG_TCP 0x0040 /* (try to) use TCP for request */ /* The structure used to maintain the pending request queue */ typedef struct nfs_lock_msg_request { @@ -128,11 +130,26 @@ struct lockd_ans { #define LOCKD_ANS_DENIED_GRACE 0x0008 /* lock denied due to grace period */ +/* + * The structure that lockd hands the kernel for each notify. + */ +#define LOCKD_NOTIFY_VERSION 1 +struct lockd_notify { + int ln_version; /* lockd_notify version */ + int ln_flags; /* notify flags */ + int ln_pad; /* (for alignment) */ + int ln_addrcount; /* # of addresss */ + struct sockaddr_storage ln_addr[1]; /* List of addresses. */ +}; + + #ifdef KERNEL void nfs_lockinit(void); -void nfs_lockd_mount_change(int); -int nfs3_vnop_advlock(struct vnop_advlock_args *ap); +void nfs_lockd_mount_register(struct nfsmount *); +void nfs_lockd_mount_unregister(struct nfsmount *); +int nfs3_lockd_request(nfsnode_t, int, LOCKD_MSG_REQUEST *, int, thread_t); int nfslockdans(proc_t p, struct lockd_ans *ansp); +int nfslockdnotify(proc_t p, user_addr_t argp); #endif #endif /* __APPLE_API_PRIVATE */ diff --git a/bsd/nfs/nfs_node.c b/bsd/nfs/nfs_node.c index 7d1926787..b3f2a47b9 100644 --- a/bsd/nfs/nfs_node.c +++ b/bsd/nfs/nfs_node.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -75,6 +75,7 @@ #include #include #include +#include #include #include @@ -145,6 +146,7 @@ nfs_nget( int fhsize, struct nfs_vattr *nvap, u_int64_t *xidp, + uint32_t auth, int flags, nfsnode_t *npp) { @@ -175,6 +177,21 @@ nfs_nget( if (mp != mp2 || np->n_fhsize != fhsize || bcmp(fhp, np->n_fhp, fhsize)) continue; + if (nvap && (nvap->nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) && + cnp && (cnp->cn_namelen > (fhsize - (int)sizeof(dnp)))) { + /* The name was too long to fit in the file handle. Check it against the node's name. */ + int namecmp = 0; + const char *vname = vnode_getname(NFSTOV(np)); + if (vname) { + if (cnp->cn_namelen != (int)strlen(vname)) + namecmp = 1; + else + namecmp = strncmp(vname, cnp->cn_nameptr, cnp->cn_namelen); + vnode_putname(vname); + } + if (namecmp) /* full name didn't match */ + continue; + } FSDBG(263, dnp, np, np->n_flag, 0xcace0000); /* if the node is locked, sleep on it */ if ((np->n_hflag & NHLOCKED) && !(flags & NG_NOCREATE)) { @@ -246,10 +263,21 @@ nfs_nget( bzero(np, sizeof *np); np->n_hflag |= (NHINIT | NHLOCKED); np->n_mount = mp; + np->n_auth = auth; TAILQ_INIT(&np->n_opens); TAILQ_INIT(&np->n_lock_owners); TAILQ_INIT(&np->n_locks); np->n_dlink.tqe_next = NFSNOLIST; + np->n_dreturn.tqe_next = NFSNOLIST; + np->n_monlink.le_next = NFSNOLIST; + + /* ugh... 
need to keep track of ".zfs" directories to workaround server bugs */ + if ((nvap->nva_type == VDIR) && cnp && (cnp->cn_namelen == 4) && + (cnp->cn_nameptr[0] == '.') && (cnp->cn_nameptr[1] == 'z') && + (cnp->cn_nameptr[2] == 'f') && (cnp->cn_nameptr[3] == 's')) + np->n_flag |= NISDOTZFS; + if (dnp && (dnp->n_flag & NISDOTZFS)) + np->n_flag |= NISDOTZFSCHILD; if (dnp && cnp && ((cnp->cn_namelen != 2) || (cnp->cn_nameptr[0] != '.') || (cnp->cn_nameptr[1] != '.'))) { @@ -293,6 +321,8 @@ nfs_nget( lck_mtx_unlock(nfs_node_hash_mutex); /* do initial loading of attributes */ + NACLINVALIDATE(np); + NACCESSINVALIDATE(np); error = nfs_loadattrcache(np, nvap, xidp, 1); if (error) { FSDBG(266, 0, np, np->n_flag, 0xb1eb1e); @@ -325,7 +355,6 @@ nfs_nget( NFS_CHANGED_UPDATE(nfsvers, np, nvap); if (nvap->nva_type == VDIR) NFS_CHANGED_UPDATE_NC(nfsvers, np, nvap); - NMODEINVALIDATE(np); /* now, attempt to get a new vnode */ vfsp.vnfs_mp = mp; @@ -363,7 +392,21 @@ nfs_nget( if (!dnp || !cnp || !(flags & NG_MAKEENTRY)) vfsp.vnfs_flags |= VNFS_NOCACHE; - error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &np->n_vnode); +#if CONFIG_TRIGGERS + if ((nfsvers >= NFS_VER4) && (nvap->nva_type == VDIR) && (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER)) { + struct vnode_trigger_param vtp; + bzero(&vtp, sizeof(vtp)); + bcopy(&vfsp, &vtp.vnt_params, sizeof(vfsp)); + vtp.vnt_resolve_func = nfs_mirror_mount_trigger_resolve; + vtp.vnt_unresolve_func = nfs_mirror_mount_trigger_unresolve; + vtp.vnt_rearm_func = nfs_mirror_mount_trigger_rearm; + vtp.vnt_flags = VNT_AUTO_REARM; + error = vnode_create(VNCREATE_TRIGGER, VNCREATE_TRIGGER_SIZE, &vtp, &np->n_vnode); + } else +#endif + { + error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &np->n_vnode); + } if (error) { FSDBG(266, 0, np, np->n_flag, 0xb1eb1e); nfs_node_unlock(np); @@ -425,57 +468,58 @@ nfs_vnop_inactive(ap) nfsnode_t np = VTONFS(ap->a_vp); struct nfs_sillyrename *nsp; struct nfs_vattr nvattr; - int unhash, attrerr, busyerror, 
error, inuse, busied; + int unhash, attrerr, busyerror, error, inuse, busied, force; struct nfs_open_file *nofp; - const char *vname = NULL; struct componentname cn; struct nfsmount *nmp = NFSTONMP(np); + mount_t mp = vnode_mount(vp); restart: + force = (!mp || (mp->mnt_kern_flag & MNTK_FRCUNMOUNT)); error = 0; - inuse = ((nmp->nm_vers >= NFS_VER4) && (nfs_mount_state_in_use_start(nmp) == 0)); + inuse = (nfs_mount_state_in_use_start(nmp, NULL) == 0); /* There shouldn't be any open or lock state at this point */ lck_mtx_lock(&np->n_openlock); - if (np->n_openrefcnt) { - vname = vnode_getname(vp); - printf("nfs_vnop_inactive: still open: %d %s\n", np->n_openrefcnt, vname ? vname : "//"); - } + if (np->n_openrefcnt && !force) + NP(np, "nfs_vnop_inactive: still open: %d", np->n_openrefcnt); TAILQ_FOREACH(nofp, &np->n_opens, nof_link) { lck_mtx_lock(&nofp->nof_lock); if (nofp->nof_flags & NFS_OPEN_FILE_BUSY) { - if (!vname) - vname = vnode_getname(vp); - printf("nfs_vnop_inactive: open file busy: %s\n", vname ? vname : "//"); + if (!force) + NP(np, "nfs_vnop_inactive: open file busy"); busied = 0; } else { nofp->nof_flags |= NFS_OPEN_FILE_BUSY; busied = 1; } lck_mtx_unlock(&nofp->nof_lock); + if ((np->n_flag & NREVOKE) || (nofp->nof_flags & NFS_OPEN_FILE_LOST)) { + if (busied) + nfs_open_file_clear_busy(nofp); + continue; + } /* * If we just created the file, we already had it open in * anticipation of getting a subsequent open call. If the * node has gone inactive without being open, we need to * clean up (close) the open done in the create. 
*/ - if ((nofp->nof_flags & NFS_OPEN_FILE_CREATE) && nofp->nof_creator) { + if ((nofp->nof_flags & NFS_OPEN_FILE_CREATE) && nofp->nof_creator && !force) { if (nofp->nof_flags & NFS_OPEN_FILE_REOPEN) { lck_mtx_unlock(&np->n_openlock); if (busied) nfs_open_file_clear_busy(nofp); if (inuse) nfs_mount_state_in_use_end(nmp, 0); - nfs4_reopen(nofp, vfs_context_thread(ctx)); - goto restart; + if (!nfs4_reopen(nofp, NULL)) + goto restart; } nofp->nof_flags &= ~NFS_OPEN_FILE_CREATE; lck_mtx_unlock(&np->n_openlock); - error = nfs4_close(np, nofp, NFS_OPEN_SHARE_ACCESS_BOTH, NFS_OPEN_SHARE_DENY_NONE, ctx); + error = nfs_close(np, nofp, NFS_OPEN_SHARE_ACCESS_BOTH, NFS_OPEN_SHARE_DENY_NONE, ctx); if (error) { - if (!vname) - vname = vnode_getname(vp); - printf("nfs_vnop_inactive: create close error: %d, %s\n", error, vname); + NP(np, "nfs_vnop_inactive: create close error: %d", error); nofp->nof_flags |= NFS_OPEN_FILE_CREATE; } if (busied) @@ -495,21 +539,19 @@ nfs_vnop_inactive(ap) nofp->nof_r--; nofp->nof_opencnt--; nofp->nof_access = 0; - } else { + } else if (!force) { lck_mtx_unlock(&np->n_openlock); if (nofp->nof_flags & NFS_OPEN_FILE_REOPEN) { if (busied) nfs_open_file_clear_busy(nofp); if (inuse) nfs_mount_state_in_use_end(nmp, 0); - nfs4_reopen(nofp, vfs_context_thread(ctx)); - goto restart; + if (!nfs4_reopen(nofp, NULL)) + goto restart; } - error = nfs4_close(np, nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_NONE, ctx); + error = nfs_close(np, nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_NONE, ctx); if (error) { - if (!vname) - vname = vnode_getname(vp); - printf("nfs_vnop_inactive: need close error: %d, %s\n", error, vname); + NP(np, "nfs_vnop_inactive: need close error: %d", error); nofp->nof_flags |= NFS_OPEN_FILE_NEEDCLOSE; } if (busied) @@ -519,32 +561,33 @@ nfs_vnop_inactive(ap) goto restart; } } - if (nofp->nof_opencnt) { - if (!vname) - vname = vnode_getname(vp); - printf("nfs_vnop_inactive: file still open: %d %s\n", nofp->nof_opencnt, vname 
? vname : "//"); - } - if (nofp->nof_access || nofp->nof_deny || + if (nofp->nof_opencnt && !force) + NP(np, "nfs_vnop_inactive: file still open: %d", nofp->nof_opencnt); + if (!force && (nofp->nof_access || nofp->nof_deny || nofp->nof_mmap_access || nofp->nof_mmap_deny || nofp->nof_r || nofp->nof_w || nofp->nof_rw || nofp->nof_r_dw || nofp->nof_w_dw || nofp->nof_rw_dw || - nofp->nof_r_drw || nofp->nof_w_drw || nofp->nof_rw_drw) { - if (!vname) - vname = vnode_getname(vp); - printf("nfs_vnop_inactive: non-zero access: %d %d %d %d # %u %u %u dw %u %u %u drw %u %u %u %s\n", + nofp->nof_r_drw || nofp->nof_w_drw || nofp->nof_rw_drw || + nofp->nof_d_r || nofp->nof_d_w || nofp->nof_d_rw || + nofp->nof_d_r_dw || nofp->nof_d_w_dw || nofp->nof_d_rw_dw || + nofp->nof_d_r_drw || nofp->nof_d_w_drw || nofp->nof_d_rw_drw)) { + NP(np, "nfs_vnop_inactive: non-zero access: %d %d %d %d # %u.%u %u.%u %u.%u dw %u.%u %u.%u %u.%u drw %u.%u %u.%u %u.%u", nofp->nof_access, nofp->nof_deny, nofp->nof_mmap_access, nofp->nof_mmap_deny, - nofp->nof_r, nofp->nof_w, nofp->nof_rw, - nofp->nof_r_dw, nofp->nof_w_dw, nofp->nof_rw_dw, - nofp->nof_r_drw, nofp->nof_w_drw, nofp->nof_rw_drw, - vname ? vname : "//"); + nofp->nof_r, nofp->nof_d_r, + nofp->nof_w, nofp->nof_d_w, + nofp->nof_rw, nofp->nof_d_rw, + nofp->nof_r_dw, nofp->nof_d_r_dw, + nofp->nof_w_dw, nofp->nof_d_w_dw, + nofp->nof_rw_dw, nofp->nof_d_rw_dw, + nofp->nof_r_drw, nofp->nof_d_r_drw, + nofp->nof_w_drw, nofp->nof_d_w_drw, + nofp->nof_rw_drw, nofp->nof_d_rw_drw); } if (busied) nfs_open_file_clear_busy(nofp); } lck_mtx_unlock(&np->n_openlock); - if (vname) - vnode_putname(vname); if (inuse && nfs_mount_state_in_use_end(nmp, error)) goto restart; @@ -673,42 +716,59 @@ nfs_vnop_reclaim(ap) struct nfs_open_file *nofp, *nextnofp; struct nfs_file_lock *nflp, *nextnflp; struct nfs_lock_owner *nlop, *nextnlop; - const char *vname = NULL; struct nfsmount *nmp = np->n_mount ? 
VFSTONFS(np->n_mount) : NFSTONMP(np); + mount_t mp = vnode_mount(vp); + int force; FSDBG_TOP(265, vp, np, np->n_flag, 0); + force = (!mp || (mp->mnt_kern_flag & MNTK_FRCUNMOUNT)); /* There shouldn't be any open or lock state at this point */ lck_mtx_lock(&np->n_openlock); if (nmp && (nmp->nm_vers >= NFS_VER4)) { /* need to drop a delegation */ + if (np->n_dreturn.tqe_next != NFSNOLIST) { + /* remove this node from the delegation return list */ + lck_mtx_lock(&nmp->nm_lock); + if (np->n_dreturn.tqe_next != NFSNOLIST) { + TAILQ_REMOVE(&nmp->nm_dreturnq, np, n_dreturn); + np->n_dreturn.tqe_next = NFSNOLIST; + } + lck_mtx_unlock(&nmp->nm_lock); + } if (np->n_dlink.tqe_next != NFSNOLIST) { - /* remove this node from the recall list */ + /* remove this node from the delegation list */ lck_mtx_lock(&nmp->nm_lock); if (np->n_dlink.tqe_next != NFSNOLIST) { - TAILQ_REMOVE(&nmp->nm_recallq, np, n_dlink); + TAILQ_REMOVE(&nmp->nm_delegations, np, n_dlink); np->n_dlink.tqe_next = NFSNOLIST; } lck_mtx_unlock(&nmp->nm_lock); } - if (np->n_openflags & N_DELEG_MASK) { + if ((np->n_openflags & N_DELEG_MASK) && !force) { + /* try to return the delegation */ np->n_openflags &= ~N_DELEG_MASK; nfs4_delegreturn_rpc(nmp, np->n_fhp, np->n_fhsize, &np->n_dstateid, - vfs_context_thread(ctx), vfs_context_ucred(ctx)); + R_RECOVER, vfs_context_thread(ctx), vfs_context_ucred(ctx)); + } + if (np->n_attrdirfh) { + FREE(np->n_attrdirfh, M_TEMP); + np->n_attrdirfh = NULL; } } /* clean up file locks */ TAILQ_FOREACH_SAFE(nflp, &np->n_locks, nfl_link, nextnflp) { - if (!(nflp->nfl_flags & NFS_FILE_LOCK_DEAD)) { - if (!vname) - vname = vnode_getname(vp); - printf("nfs_vnop_reclaim: lock 0x%llx 0x%llx 0x%x (bc %d) %s\n", - nflp->nfl_start, nflp->nfl_end, nflp->nfl_flags, - nflp->nfl_blockcnt, vname ? 
vname : "//"); + if (!(nflp->nfl_flags & NFS_FILE_LOCK_DEAD) && !force) { + NP(np, "nfs_vnop_reclaim: lock 0x%llx 0x%llx 0x%x (bc %d)", + nflp->nfl_start, nflp->nfl_end, nflp->nfl_flags, nflp->nfl_blockcnt); } - if (!(nflp->nfl_flags & NFS_FILE_LOCK_BLOCKED)) { + if (!(nflp->nfl_flags & (NFS_FILE_LOCK_BLOCKED|NFS_FILE_LOCK_DEAD))) { + /* try sending an unlock RPC if it wasn't delegated */ + if (!(nflp->nfl_flags & NFS_FILE_LOCK_DELEGATED) && !force) + nmp->nm_funcs->nf_unlock_rpc(np, nflp->nfl_owner, F_WRLCK, nflp->nfl_start, nflp->nfl_end, R_RECOVER, + NULL, nflp->nfl_owner->nlo_open_owner->noo_cred); lck_mtx_lock(&nflp->nfl_owner->nlo_lock); TAILQ_REMOVE(&nflp->nfl_owner->nlo_locks, nflp, nfl_lolink); lck_mtx_unlock(&nflp->nfl_owner->nlo_lock); @@ -718,72 +778,79 @@ nfs_vnop_reclaim(ap) } /* clean up lock owners */ TAILQ_FOREACH_SAFE(nlop, &np->n_lock_owners, nlo_link, nextnlop) { - if (!TAILQ_EMPTY(&nlop->nlo_locks)) { - if (!vname) - vname = vnode_getname(vp); - printf("nfs_vnop_reclaim: lock owner with locks %s\n", - vname ? vname : "//"); - } + if (!TAILQ_EMPTY(&nlop->nlo_locks) && !force) + NP(np, "nfs_vnop_reclaim: lock owner with locks"); TAILQ_REMOVE(&np->n_lock_owners, nlop, nlo_link); nfs_lock_owner_destroy(nlop); } /* clean up open state */ - if (np->n_openrefcnt) { - if (!vname) - vname = vnode_getname(vp); - printf("nfs_vnop_reclaim: still open: %d %s\n", - np->n_openrefcnt, vname ? vname : "//"); - } + if (np->n_openrefcnt && !force) + NP(np, "nfs_vnop_reclaim: still open: %d", np->n_openrefcnt); TAILQ_FOREACH_SAFE(nofp, &np->n_opens, nof_link, nextnofp) { - if (nofp->nof_flags & NFS_OPEN_FILE_BUSY) { - if (!vname) - vname = vnode_getname(vp); - printf("nfs_vnop_reclaim: open file busy: %s\n", - vname ? vname : "//"); - } - if (nofp->nof_opencnt) { - if (!vname) - vname = vnode_getname(vp); - printf("nfs_vnop_reclaim: file still open: %d %s\n", - nofp->nof_opencnt, vname ? 
vname : "//"); - } - if (nofp->nof_access || nofp->nof_deny || - nofp->nof_mmap_access || nofp->nof_mmap_deny || - nofp->nof_r || nofp->nof_w || nofp->nof_rw || - nofp->nof_r_dw || nofp->nof_w_dw || nofp->nof_rw_dw || - nofp->nof_r_drw || nofp->nof_w_drw || nofp->nof_rw_drw) { - if (!vname) - vname = vnode_getname(vp); - printf("nfs_vnop_reclaim: non-zero access: %d %d %d %d # %u %u %u dw %u %u %u drw %u %u %u %s\n", - nofp->nof_access, nofp->nof_deny, - nofp->nof_mmap_access, nofp->nof_mmap_deny, - nofp->nof_r, nofp->nof_w, nofp->nof_rw, - nofp->nof_r_dw, nofp->nof_w_dw, nofp->nof_rw_dw, - nofp->nof_r_drw, nofp->nof_w_drw, nofp->nof_rw_drw, - vname ? vname : "//"); + if (nofp->nof_flags & NFS_OPEN_FILE_BUSY) + NP(np, "nfs_vnop_reclaim: open file busy"); + if (!(np->n_flag & NREVOKE) && !(nofp->nof_flags & NFS_OPEN_FILE_LOST)) { + if (nofp->nof_opencnt && !force) + NP(np, "nfs_vnop_reclaim: file still open: %d", nofp->nof_opencnt); + if (!force && (nofp->nof_access || nofp->nof_deny || + nofp->nof_mmap_access || nofp->nof_mmap_deny || + nofp->nof_r || nofp->nof_w || nofp->nof_rw || + nofp->nof_r_dw || nofp->nof_w_dw || nofp->nof_rw_dw || + nofp->nof_r_drw || nofp->nof_w_drw || nofp->nof_rw_drw || + nofp->nof_d_r || nofp->nof_d_w || nofp->nof_d_rw || + nofp->nof_d_r_dw || nofp->nof_d_w_dw || nofp->nof_d_rw_dw || + nofp->nof_d_r_drw || nofp->nof_d_w_drw || nofp->nof_d_rw_drw)) { + NP(np, "nfs_vnop_reclaim: non-zero access: %d %d %d %d # %u.%u %u.%u %u.%u dw %u.%u %u.%u %u.%u drw %u.%u %u.%u %u.%u", + nofp->nof_access, nofp->nof_deny, + nofp->nof_mmap_access, nofp->nof_mmap_deny, + nofp->nof_r, nofp->nof_d_r, + nofp->nof_w, nofp->nof_d_w, + nofp->nof_rw, nofp->nof_d_rw, + nofp->nof_r_dw, nofp->nof_d_r_dw, + nofp->nof_w_dw, nofp->nof_d_w_dw, + nofp->nof_rw_dw, nofp->nof_d_rw_dw, + nofp->nof_r_drw, nofp->nof_d_r_drw, + nofp->nof_w_drw, nofp->nof_d_w_drw, + nofp->nof_rw_drw, nofp->nof_d_rw_drw); + /* try sending a close RPC if it wasn't delegated */ + if (nofp->nof_r || 
nofp->nof_w || nofp->nof_rw || + nofp->nof_r_dw || nofp->nof_w_dw || nofp->nof_rw_dw || + nofp->nof_r_drw || nofp->nof_w_drw || nofp->nof_rw_drw) + nfs4_close_rpc(np, nofp, NULL, nofp->nof_owner->noo_cred, R_RECOVER); + } } TAILQ_REMOVE(&np->n_opens, nofp, nof_link); nfs_open_file_destroy(nofp); } lck_mtx_unlock(&np->n_openlock); - lck_mtx_lock(nfs_buf_mutex); - if (!LIST_EMPTY(&np->n_dirtyblkhd) || !LIST_EMPTY(&np->n_cleanblkhd)) { - if (!vname) - vname = vnode_getname(vp); - printf("nfs_reclaim: dropping %s buffers for file %s\n", - (!LIST_EMPTY(&np->n_dirtyblkhd) ? "dirty" : "clean"), - (vname ? vname : "//")); + if (np->n_monlink.le_next != NFSNOLIST) { + /* Wait for any in-progress getattr to complete, */ + /* then remove this node from the monitored node list. */ + lck_mtx_lock(&nmp->nm_lock); + while (np->n_mflag & NMMONSCANINPROG) { + struct timespec ts = { 1, 0 }; + np->n_mflag |= NMMONSCANWANT; + msleep(&np->n_mflag, &nmp->nm_lock, PZERO-1, "nfswaitmonscan", &ts); + } + if (np->n_monlink.le_next != NFSNOLIST) { + LIST_REMOVE(np, n_monlink); + np->n_monlink.le_next = NFSNOLIST; + } + lck_mtx_unlock(&nmp->nm_lock); } + + lck_mtx_lock(nfs_buf_mutex); + if (!force && (!LIST_EMPTY(&np->n_dirtyblkhd) || !LIST_EMPTY(&np->n_cleanblkhd))) + NP(np, "nfs_reclaim: dropping %s buffers", (!LIST_EMPTY(&np->n_dirtyblkhd) ? 
"dirty" : "clean")); lck_mtx_unlock(nfs_buf_mutex); - if (vname) - vnode_putname(vname); nfs_vinvalbuf(vp, V_IGNORE_WRITEERR, ap->a_context, 0); lck_mtx_lock(nfs_node_hash_mutex); if ((vnode_vtype(vp) != VDIR) && np->n_sillyrename) { - printf("nfs_reclaim: leaving unlinked file %s\n", np->n_sillyrename->nsr_name); + if (!force) + NP(np, "nfs_reclaim: leaving unlinked file %s", np->n_sillyrename->nsr_name); if (np->n_sillyrename->nsr_cred != NOCRED) kauth_cred_unref(&np->n_sillyrename->nsr_cred); vnode_rele(NFSTOV(np->n_sillyrename->nsr_dnp)); @@ -808,6 +875,8 @@ nfs_vnop_reclaim(ap) FREE_ZONE(np->n_cookiecache, sizeof(struct nfsdmap), M_NFSDIROFF); if (np->n_fhsize > NFS_SMALLFH) FREE_ZONE(np->n_fhp, np->n_fhsize, M_NFSBIGFH); + if (np->n_vattr.nva_acl) + kauth_acl_free(np->n_vattr.nva_acl); nfs_node_unlock(np); vnode_clearfsnode(vp); diff --git a/bsd/nfs/nfs_serv.c b/bsd/nfs/nfs_serv.c index e224a921d..956cc9285 100644 --- a/bsd/nfs/nfs_serv.c +++ b/bsd/nfs/nfs_serv.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -89,6 +89,8 @@ #include #include +#include + #include #include #include @@ -114,6 +116,7 @@ lck_grp_t *nfsrv_slp_mutex_group; struct nfsrv_sockhead nfsrv_socklist, nfsrv_deadsocklist, nfsrv_sockwg, nfsrv_sockwait, nfsrv_sockwork; struct nfsrv_sock *nfsrv_udpsock = NULL; +struct nfsrv_sock *nfsrv_udp6sock = NULL; /* NFS exports */ struct nfsrv_expfs_list nfsrv_exports; @@ -232,6 +235,7 @@ nfsrv_init(void) TAILQ_INIT(&nfsd_head); TAILQ_INIT(&nfsd_queue); nfsrv_udpsock = NULL; + nfsrv_udp6sock = NULL; /* initialization complete */ nfsrv_initted = NFSRV_INITIALIZED; @@ -312,15 +316,7 @@ nfsrv_access( * obtain good performance in the optimistic mode. 
*/ if (nfsmode & NFS_ACCESS_READ) { - if (vnode_isdir(vp)) { - testaction = - KAUTH_VNODE_LIST_DIRECTORY | - KAUTH_VNODE_READ_EXTATTRIBUTES; - } else { - testaction = - KAUTH_VNODE_READ_DATA | - KAUTH_VNODE_READ_EXTATTRIBUTES; - } + testaction = vnode_isdir(vp) ? KAUTH_VNODE_LIST_DIRECTORY : KAUTH_VNODE_READ_DATA; if (nfsrv_authorize(vp, NULL, testaction, ctx, nxo, 0)) nfsmode &= ~NFS_ACCESS_READ; } @@ -617,6 +613,9 @@ nfsrv_lookup( nfsmerr_if(error); ni.ni_cnd.cn_nameiop = LOOKUP; +#if CONFIG_TRIGGERS + ni.ni_op = OP_LOOKUP; +#endif ni.ni_cnd.cn_flags = LOCKLEAF; error = nfsm_chain_get_path_namei(nmreq, len, &ni); isdotdot = ((len == 2) && (ni.ni_cnd.cn_pnbuf[0] == '.') && (ni.ni_cnd.cn_pnbuf[1] == '.')); @@ -1052,10 +1051,12 @@ nfsrv_fmod_timer(__unused void *param0, __unused void *param1) * entry and free it. */ LIST_FOREACH_SAFE(fp, &firehead, fm_link, nfp) { - if (nfsrv_fsevents_enabled) + if (nfsrv_fsevents_enabled) { + fp->fm_context.vc_thread = current_thread(); add_fsevent(FSE_CONTENT_MODIFIED, &fp->fm_context, FSE_ARG_VNODE, fp->fm_vp, FSE_ARG_DONE); + } vnode_put(fp->fm_vp); kauth_cred_unref(&fp->fm_context.vc_ucred); LIST_REMOVE(fp, fm_link); @@ -1829,10 +1830,6 @@ nfsrv_create( ni.ni_cnd.cn_nameiop = 0; rdev = 0; - /* - * Save the original credential UID in case they are - * mapped and we need to map the IDs in the attributes. - */ saved_uid = kauth_cred_getuid(nd->nd_cr); nfsm_chain_get_fh_ptr(error, nmreq, nd->nd_vers, nfh.nfh_fhp, nfh.nfh_len); @@ -1841,6 +1838,9 @@ nfsrv_create( nfsmerr_if(error); ni.ni_cnd.cn_nameiop = CREATE; +#if CONFIG_TRIGGERS + ni.ni_op = OP_LINK; +#endif ni.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF; error = nfsm_chain_get_path_namei(nmreq, len, &ni); if (!error) { @@ -1923,17 +1923,6 @@ nfsrv_create( if (vp == NULL) { kauth_acl_t xacl = NULL; - /* - * If the credentials were mapped, we should - * map the same values in the attributes. 
- */ - if ((vap->va_uid == saved_uid) && (kauth_cred_getuid(nd->nd_cr) != saved_uid)) { - int ismember; - VATTR_SET(vap, va_uid, kauth_cred_getuid(nd->nd_cr)); - if (kauth_cred_ismember_gid(nd->nd_cr, vap->va_gid, &ismember) || !ismember) - VATTR_SET(vap, va_gid, kauth_cred_getgid(nd->nd_cr)); - } - /* authorize before creating */ error = nfsrv_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx, nxo, 0); @@ -1950,20 +1939,17 @@ nfsrv_create( } VATTR_CLEAR_ACTIVE(vap, va_data_size); VATTR_CLEAR_ACTIVE(vap, va_access_time); + /* + * Server policy is to alway use the mapped rpc credential for + * file system object creation. This has the nice side effect of + * enforcing BSD creation semantics + */ + VATTR_CLEAR_ACTIVE(vap, va_uid); + VATTR_CLEAR_ACTIVE(vap, va_gid); /* validate new-file security information */ - if (!error) { + if (!error) error = vnode_authattr_new(dvp, vap, 0, ctx); - if (error && (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid))) { - /* - * Most NFS servers just ignore the UID/GID attributes, so we - * try ignoring them if that'll help the request succeed. - */ - VATTR_CLEAR_ACTIVE(vap, va_uid); - VATTR_CLEAR_ACTIVE(vap, va_gid); - error = vnode_authattr_new(dvp, vap, 0, ctx); - } - } if (vap->va_type == VREG || vap->va_type == VSOCK) { @@ -2024,6 +2010,9 @@ nfsrv_create( vp = NULL; } ni.ni_cnd.cn_nameiop = LOOKUP; +#if CONFIG_TRIGGERS + ni.ni_op = OP_LOOKUP; +#endif ni.ni_cnd.cn_flags &= ~LOCKPARENT; ni.ni_cnd.cn_context = ctx; ni.ni_startdir = dvp; @@ -2168,10 +2157,6 @@ nfsrv_mknod( vp = dvp = dirp = NULL; ni.ni_cnd.cn_nameiop = 0; - /* - * Save the original credential UID in case they are - * mapped and we need to map the IDs in the attributes. 
- */ saved_uid = kauth_cred_getuid(nd->nd_cr); nfsm_chain_get_fh_ptr(error, nmreq, NFS_VER3, nfh.nfh_fhp, nfh.nfh_len); @@ -2180,6 +2165,9 @@ nfsrv_mknod( nfsmerr_if(error); ni.ni_cnd.cn_nameiop = CREATE; +#if CONFIG_TRIGGERS + ni.ni_op = OP_LINK; +#endif ni.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF; error = nfsm_chain_get_path_namei(nmreq, len, &ni); if (!error) { @@ -2231,17 +2219,6 @@ nfsrv_mknod( } VATTR_SET(vap, va_type, vtyp); - /* - * If the credentials were mapped, we should - * map the same values in the attributes. - */ - if ((vap->va_uid == saved_uid) && (kauth_cred_getuid(nd->nd_cr) != saved_uid)) { - int ismember; - VATTR_SET(vap, va_uid, kauth_cred_getuid(nd->nd_cr)); - if (kauth_cred_ismember_gid(nd->nd_cr, vap->va_gid, &ismember) || !ismember) - VATTR_SET(vap, va_gid, kauth_cred_getgid(nd->nd_cr)); - } - /* authorize before creating */ error = nfsrv_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx, nxo, 0); @@ -2258,20 +2235,18 @@ nfsrv_mknod( } VATTR_CLEAR_ACTIVE(vap, va_data_size); VATTR_CLEAR_ACTIVE(vap, va_access_time); + /* + * Server policy is to alway use the mapped rpc credential for + * file system object creation. This has the nice side effect of + * enforcing BSD creation semantics + */ + VATTR_CLEAR_ACTIVE(vap, va_uid); + VATTR_CLEAR_ACTIVE(vap, va_gid); /* validate new-file security information */ - if (!error) { + if (!error) error = vnode_authattr_new(dvp, vap, 0, ctx); - if (error && (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid))) { - /* - * Most NFS servers just ignore the UID/GID attributes, so we - * try ignoring them if that'll help the request succeed. 
- */ - VATTR_CLEAR_ACTIVE(vap, va_uid); - VATTR_CLEAR_ACTIVE(vap, va_gid); - error = vnode_authattr_new(dvp, vap, 0, ctx); - } - } + if (error) goto out1; @@ -2295,6 +2270,9 @@ nfsrv_mknod( vp = NULL; } ni.ni_cnd.cn_nameiop = LOOKUP; +#if CONFIG_TRIGGERS + ni.ni_op = OP_LOOKUP; +#endif ni.ni_cnd.cn_flags &= ~LOCKPARENT; ni.ni_cnd.cn_context = vfs_context_current(); ni.ni_startdir = dvp; @@ -2416,6 +2394,9 @@ nfsrv_remove( nfsmerr_if(error); ni.ni_cnd.cn_nameiop = DELETE; +#if CONFIG_TRIGGERS + ni.ni_op = OP_UNLINK; +#endif ni.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF; error = nfsm_chain_get_path_namei(nmreq, len, &ni); if (!error) { @@ -2596,6 +2577,9 @@ nfsrv_rename( kauth_cred_ref(saved_cred); retry: fromni.ni_cnd.cn_nameiop = DELETE; +#if CONFIG_TRIGGERS + fromni.ni_op = OP_UNLINK; +#endif fromni.ni_cnd.cn_flags = WANTPARENT; fromni.ni_cnd.cn_pnbuf = frompath; @@ -2628,6 +2612,9 @@ nfsrv_rename( } toni.ni_cnd.cn_nameiop = RENAME; +#if CONFIG_TRIGGERS + toni.ni_op = OP_RENAME; +#endif toni.ni_cnd.cn_flags = WANTPARENT; toni.ni_cnd.cn_pnbuf = topath; @@ -3175,6 +3162,9 @@ nfsrv_link( goto out; ni.ni_cnd.cn_nameiop = CREATE; +#if CONFIG_TRIGGERS + ni.ni_op = OP_LINK; +#endif ni.ni_cnd.cn_flags = LOCKPARENT; error = nfsm_chain_get_path_namei(nmreq, len, &ni); if (!error) @@ -3307,10 +3297,6 @@ nfsrv_symlink( linkdata = NULL; dirp = NULL; - /* - * Save the original credential UID in case they are - * mapped and we need to map the IDs in the attributes. - */ saved_uid = kauth_cred_getuid(nd->nd_cr); ni.ni_cnd.cn_nameiop = 0; @@ -3322,6 +3308,9 @@ nfsrv_symlink( nfsmerr_if(error); ni.ni_cnd.cn_nameiop = CREATE; +#if CONFIG_TRIGGERS + ni.ni_op = OP_LINK; +#endif ni.ni_cnd.cn_flags = LOCKPARENT; error = nfsm_chain_get_path_namei(nmreq, len, &ni); if (!error) { @@ -3377,42 +3366,33 @@ nfsrv_symlink( goto out; } - /* - * If the credentials were mapped, we should - * map the same values in the attributes. 
- */ - if ((vap->va_uid == saved_uid) && (kauth_cred_getuid(nd->nd_cr) != saved_uid)) { - int ismember; - VATTR_SET(vap, va_uid, kauth_cred_getuid(nd->nd_cr)); - if (kauth_cred_ismember_gid(nd->nd_cr, vap->va_gid, &ismember) || !ismember) - VATTR_SET(vap, va_gid, kauth_cred_getgid(nd->nd_cr)); - } VATTR_SET(vap, va_type, VLNK); VATTR_CLEAR_ACTIVE(vap, va_data_size); VATTR_CLEAR_ACTIVE(vap, va_access_time); + /* + * Server policy is to alway use the mapped rpc credential for + * file system object creation. This has the nice side effect of + * enforcing BSD creation semantics + */ + VATTR_CLEAR_ACTIVE(vap, va_uid); + VATTR_CLEAR_ACTIVE(vap, va_gid); /* authorize before creating */ error = nfsrv_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx, nxo, 0); /* validate given attributes */ - if (!error) { + if (!error) error = vnode_authattr_new(dvp, vap, 0, ctx); - if (error && (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid))) { - /* - * Most NFS servers just ignore the UID/GID attributes, so we - * try ignoring them if that'll help the request succeed. - */ - VATTR_CLEAR_ACTIVE(vap, va_uid); - VATTR_CLEAR_ACTIVE(vap, va_gid); - error = vnode_authattr_new(dvp, vap, 0, ctx); - } - } + if (!error) error = VNOP_SYMLINK(dvp, &vp, &ni.ni_cnd, vap, linkdata, ctx); if (!error && (nd->nd_vers == NFS_VER3)) { if (vp == NULL) { ni.ni_cnd.cn_nameiop = LOOKUP; +#if CONFIG_TRIGGERS + ni.ni_op = OP_LOOKUP; +#endif ni.ni_cnd.cn_flags &= ~(LOCKPARENT | FOLLOW); ni.ni_cnd.cn_flags |= (NOFOLLOW | LOCKLEAF); ni.ni_cnd.cn_context = ctx; @@ -3508,6 +3488,7 @@ nfsrv_symlink( /* * nfs mkdir service */ + int nfsrv_mkdir( struct nfsrv_descript *nd, @@ -3533,10 +3514,6 @@ nfsrv_mkdir( nmreq = &nd->nd_nmreq; nfsm_chain_null(&nmrep); - /* - * Save the original credential UID in case they are - * mapped and we need to map the IDs in the attributes. 
- */ saved_uid = kauth_cred_getuid(nd->nd_cr); ni.ni_cnd.cn_nameiop = 0; @@ -3548,6 +3525,9 @@ nfsrv_mkdir( nfsmerr_if(error); ni.ni_cnd.cn_nameiop = CREATE; +#if CONFIG_TRIGGERS + ni.ni_op = OP_LINK; +#endif ni.ni_cnd.cn_flags = LOCKPARENT; error = nfsm_chain_get_path_namei(nmreq, len, &ni); if (!error) { @@ -3593,17 +3573,6 @@ nfsrv_mkdir( goto out; } - /* - * If the credentials were mapped, we should - * map the same values in the attributes. - */ - if ((vap->va_uid == saved_uid) && (kauth_cred_getuid(nd->nd_cr) != saved_uid)) { - int ismember; - VATTR_SET(vap, va_uid, kauth_cred_getuid(nd->nd_cr)); - if (kauth_cred_ismember_gid(nd->nd_cr, vap->va_gid, &ismember) || !ismember) - VATTR_SET(vap, va_gid, kauth_cred_getgid(nd->nd_cr)); - } - error = nfsrv_authorize(dvp, NULL, KAUTH_VNODE_ADD_SUBDIRECTORY, ctx, nxo, 0); /* construct ACL and handle inheritance */ @@ -3617,22 +3586,33 @@ nfsrv_mkdir( if (!error && xacl != NULL) VATTR_SET(vap, va_acl, xacl); } + VATTR_CLEAR_ACTIVE(vap, va_data_size); VATTR_CLEAR_ACTIVE(vap, va_access_time); + /* + * We don't support the S_ISGID bit for directories. Solaris and other + * SRV4 derived systems might set this to get BSD semantics, which we enforce + * any ways. + */ + if (VATTR_IS_ACTIVE(vap, va_mode)) + vap->va_mode &= ~S_ISGID; + /* + * Server policy is to alway use the mapped rpc credential for + * file system object creation. This has the nice side effect of + * enforcing BSD creation semantics + */ + VATTR_CLEAR_ACTIVE(vap, va_uid); + VATTR_CLEAR_ACTIVE(vap, va_gid); /* validate new-file security information */ - if (!error) { + if (!error) error = vnode_authattr_new(dvp, vap, 0, ctx); - if (error && (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid))) { - /* - * Most NFS servers just ignore the UID/GID attributes, so we - * try ignoring them if that'll help the request succeed. 
- */ - VATTR_CLEAR_ACTIVE(vap, va_uid); - VATTR_CLEAR_ACTIVE(vap, va_gid); - error = vnode_authattr_new(dvp, vap, 0, ctx); - } - } + /* + * vnode_authattr_new can return errors other than EPERM, but that's not going to + * sit well with our clients so we map all errors to EPERM. + */ + if (error) + error = EPERM; if (!error) error = VNOP_MKDIR(dvp, &vp, &ni.ni_cnd, vap, ctx); @@ -3755,6 +3735,9 @@ nfsrv_rmdir( nfsmerr_if(error); ni.ni_cnd.cn_nameiop = DELETE; +#if CONFIG_TRIGGERS + ni.ni_op = OP_UNLINK; +#endif ni.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF; error = nfsm_chain_get_path_namei(nmreq, len, &ni); if (!error) { diff --git a/bsd/nfs/nfs_socket.c b/bsd/nfs/nfs_socket.c index 8d6747009..71b6e5c44 100644 --- a/bsd/nfs/nfs_socket.c +++ b/bsd/nfs/nfs_socket.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -72,6 +72,7 @@ #include #include #include +#include #include #include #include @@ -91,11 +92,13 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -117,6 +120,29 @@ int nfsrv_getreq(struct nfsrv_descript *); extern int nfsv3_procid[NFS_NPROCS]; #endif /* NFSSERVER */ +/* + * compare two sockaddr structures + */ +int +nfs_sockaddr_cmp(struct sockaddr *sa1, struct sockaddr *sa2) +{ + if (!sa1) + return (-1); + if (!sa2) + return (1); + if (sa1->sa_family != sa2->sa_family) + return ((sa1->sa_family < sa2->sa_family) ? -1 : 1); + if (sa1->sa_len != sa2->sa_len) + return ((sa1->sa_len < sa2->sa_len) ? 
-1 : 1); + if (sa1->sa_family == AF_INET) + return (bcmp(&((struct sockaddr_in*)sa1)->sin_addr, + &((struct sockaddr_in*)sa2)->sin_addr, sizeof(((struct sockaddr_in*)sa1)->sin_addr))); + if (sa1->sa_family == AF_INET6) + return (bcmp(&((struct sockaddr_in6*)sa1)->sin6_addr, + &((struct sockaddr_in6*)sa2)->sin6_addr, sizeof(((struct sockaddr_in6*)sa1)->sin6_addr))); + return (-1); +} + #if NFSCLIENT int nfs_reconnect(struct nfsmount *); @@ -188,214 +214,1270 @@ static int proct[NFS_NPROCS] = { static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, }; /* - * Initialize socket state and perform setup for a new NFS connection. + * Increment location index to next address/server/location. */ -int -nfs_connect(struct nfsmount *nmp, int verbose) +void +nfs_location_next(struct nfs_fs_locations *nlp, struct nfs_location_index *nlip) { - socket_t so; - int error, on = 1, proto; - sock_upcall upcall; - struct sockaddr *saddr; - struct sockaddr_in sin; - struct timeval timeo; - - lck_mtx_lock(&nmp->nm_lock); - nmp->nm_sockflags |= NMSOCK_CONNECTING; - saddr = mbuf_data(nmp->nm_nam); - upcall = (nmp->nm_sotype == SOCK_STREAM) ? nfs_tcp_rcv : nfs_udp_rcv; - lck_mtx_unlock(&nmp->nm_lock); - error = sock_socket(saddr->sa_family, nmp->nm_sotype, - nmp->nm_soproto, upcall, nmp, &nmp->nm_so); - if (error) - goto bad; - lck_mtx_lock(&nmp->nm_lock); - so = nmp->nm_so; - - /* - * Some servers require that the client port be a reserved port number. 
- */ - if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) { - int portrange = IP_PORTRANGE_LOW; - error = sock_setsockopt(so, IPPROTO_IP, IP_PORTRANGE, &portrange, sizeof(portrange)); - if (!error) { /* bind now to check for failure */ - sin.sin_len = sizeof (struct sockaddr_in); - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = INADDR_ANY; - sin.sin_port = 0; - error = sock_bind(so, (struct sockaddr *) &sin); - } - if (error) { - lck_mtx_unlock(&nmp->nm_lock); - goto bad; + uint8_t loc = nlip->nli_loc; + uint8_t serv = nlip->nli_serv; + uint8_t addr = nlip->nli_addr; + + /* move to next address */ + addr++; + if (addr >= nlp->nl_locations[loc]->nl_servers[serv]->ns_addrcount) { + /* no more addresses on current server, go to first address of next server */ +next_server: + addr = 0; + serv++; + if (serv >= nlp->nl_locations[loc]->nl_servcount) { + /* no more servers on current location, go to first server of next location */ + serv = 0; + loc++; + if (loc >= nlp->nl_numlocs) + loc = 0; /* after last location, wrap back around to first location */ } } - /* - * Protocols that do not require connections may be optionally left - * unconnected for servers that reply from a different address/port. + * It's possible for this next server to not have any addresses. + * Check for that here and go to the next server. + * But bail out if we've managed to come back around to the original + * location that was passed in. (That would mean no servers had any + * addresses. And we don't want to spin here forever.) */ - if (nmp->nm_flag & NFSMNT_NOCONN) { - if (nmp->nm_sotype == SOCK_STREAM) { - error = ENOTCONN; - lck_mtx_unlock(&nmp->nm_lock); - goto bad; + if ((loc == nlip->nli_loc) && (serv == nlip->nli_serv) && (addr == nlip->nli_addr)) + return; + if (addr >= nlp->nl_locations[loc]->nl_servers[serv]->ns_addrcount) + goto next_server; + + nlip->nli_loc = loc; + nlip->nli_serv = serv; + nlip->nli_addr = addr; +} + +/* + * Compare two location indices. 
+ */ +int +nfs_location_index_cmp(struct nfs_location_index *nlip1, struct nfs_location_index *nlip2) +{ + if (nlip1->nli_loc != nlip2->nli_loc) + return (nlip1->nli_loc - nlip2->nli_loc); + if (nlip1->nli_serv != nlip2->nli_serv) + return (nlip1->nli_serv - nlip2->nli_serv); + return (nlip1->nli_addr - nlip2->nli_addr); +} + +/* + * Get the mntfromname (or path portion only) for a given location. + */ +void +nfs_location_mntfromname(struct nfs_fs_locations *locs, struct nfs_location_index idx, char *s, int size, int pathonly) +{ + struct nfs_fs_location *fsl = locs->nl_locations[idx.nli_loc]; + char *p; + int cnt, i; + + p = s; + if (!pathonly) { + cnt = snprintf(p, size, "%s:", fsl->nl_servers[idx.nli_serv]->ns_name); + p += cnt; + size -= cnt; + } + if (fsl->nl_path.np_compcount == 0) { + /* mounting root export on server */ + if (size > 0) { + *p++ = '/'; + *p++ = '\0'; } - } else { - int tocnt = 0, optlen = sizeof(error); - struct timespec ts = { 1, 0 }; + return; + } + /* append each server path component */ + for (i=0; (size > 0) && (i < (int)fsl->nl_path.np_compcount); i++) { + cnt = snprintf(p, size, "/%s", fsl->nl_path.np_components[i]); + p += cnt; + size -= cnt; + } +} - lck_mtx_unlock(&nmp->nm_lock); - error = sock_connect(so, mbuf_data(nmp->nm_nam), MSG_DONTWAIT); - if (error && (error != EINPROGRESS)) - goto bad; - lck_mtx_lock(&nmp->nm_lock); - while (!sock_isconnected(so)) { - nfs_mount_check_dead_timeout(nmp); - if ((tocnt++ == 30) && verbose) /* log a warning if connect is taking a while */ - log(LOG_INFO, "nfs_connect: socket connect taking a while for %s\n", - vfs_statfs(nmp->nm_mountp)->f_mntfromname); - /* check for error on socket */ - sock_getsockopt(so, SOL_SOCKET, SO_ERROR, &error, &optlen); - if (error) { - if (verbose) - log(LOG_INFO, "nfs_connect: socket error %d for %s\n", - error, vfs_statfs(nmp->nm_mountp)->f_mntfromname); - break; +/* + * NFS client connect socket upcall. + * (Used only during socket connect/search.) 
+ */ +void +nfs_connect_upcall(socket_t so, void *arg, __unused int waitflag) +{ + struct nfs_socket *nso = arg; + size_t rcvlen; + mbuf_t m; + int error = 0, recv = 1; + + if (nso->nso_flags & NSO_CONNECTING) { + NFS_SOCK_DBG(("nfs connect - socket %p upcall - connecting\n", nso)); + wakeup(nso->nso_wake); + return; + } + + lck_mtx_lock(&nso->nso_lock); + if ((nso->nso_flags & (NSO_UPCALL|NSO_DISCONNECTING|NSO_DEAD)) || !(nso->nso_flags & NSO_PINGING)) { + NFS_SOCK_DBG(("nfs connect - socket %p upcall - nevermind\n", nso)); + lck_mtx_unlock(&nso->nso_lock); + return; + } + NFS_SOCK_DBG(("nfs connect - socket %p upcall\n", nso)); + nso->nso_flags |= NSO_UPCALL; + + /* loop while we make error-free progress */ + while (!error && recv) { + /* make sure we're still interested in this socket */ + if (nso->nso_flags & (NSO_DISCONNECTING|NSO_DEAD)) + break; + lck_mtx_unlock(&nso->nso_lock); + m = NULL; + if (nso->nso_sotype == SOCK_STREAM) { + error = nfs_rpc_record_read(so, &nso->nso_rrs, MSG_DONTWAIT, &recv, &m); + } else { + rcvlen = 1000000; + error = sock_receivembuf(so, NULL, &m, MSG_DONTWAIT, &rcvlen); + recv = m ? 1 : 0; + } + lck_mtx_lock(&nso->nso_lock); + if (m) { + /* match response with request */ + struct nfsm_chain nmrep; + uint32_t reply = 0, rxid = 0, verf_type, verf_len; + uint32_t reply_status, rejected_status, accepted_status; + + nfsm_chain_dissect_init(error, &nmrep, m); + nfsm_chain_get_32(error, &nmrep, rxid); + nfsm_chain_get_32(error, &nmrep, reply); + if (!error && ((reply != RPC_REPLY) || (rxid != nso->nso_pingxid))) + error = EBADRPC; + nfsm_chain_get_32(error, &nmrep, reply_status); + if (!error && (reply_status == RPC_MSGDENIED)) { + nfsm_chain_get_32(error, &nmrep, rejected_status); + if (!error) + error = (rejected_status == RPC_MISMATCH) ? 
ERPCMISMATCH : EACCES; } - /* abort if this is taking too long or we're unmounting */ - if ((tocnt > 120) || (nmp->nm_sockflags & NMSOCK_UNMOUNT)) { - error = ENOTCONN; - break; + nfsm_chain_get_32(error, &nmrep, verf_type); /* verifier flavor */ + nfsm_chain_get_32(error, &nmrep, verf_len); /* verifier length */ + nfsmout_if(error); + if (verf_len) + nfsm_chain_adv(error, &nmrep, nfsm_rndup(verf_len)); + nfsm_chain_get_32(error, &nmrep, accepted_status); + nfsmout_if(error); + if ((accepted_status == RPC_PROGMISMATCH) && !nso->nso_version) { + uint32_t minvers, maxvers; + nfsm_chain_get_32(error, &nmrep, minvers); + nfsm_chain_get_32(error, &nmrep, maxvers); + nfsmout_if(error); + if (nso->nso_protocol == PMAPPROG) { + if ((minvers > RPCBVERS4) || (maxvers < PMAPVERS)) + error = EPROGMISMATCH; + else if ((nso->nso_saddr->sa_family == AF_INET) && + (PMAPVERS >= minvers) && (PMAPVERS <= maxvers)) + nso->nso_version = PMAPVERS; + else if (nso->nso_saddr->sa_family == AF_INET6) { + if ((RPCBVERS4 >= minvers) && (RPCBVERS4 <= maxvers)) + nso->nso_version = RPCBVERS4; + else if ((RPCBVERS3 >= minvers) && (RPCBVERS3 <= maxvers)) + nso->nso_version = RPCBVERS3; + } + } else if (nso->nso_protocol == NFS_PROG) { + if ((minvers > NFS_VER4) || (maxvers < NFS_VER2)) + error = EPROGMISMATCH; + else if ((NFS_VER3 >= minvers) && (NFS_VER3 <= maxvers)) + nso->nso_version = NFS_VER3; + else if ((NFS_VER2 >= minvers) && (NFS_VER2 <= maxvers)) + nso->nso_version = NFS_VER2; + else if ((NFS_VER4 >= minvers) && (NFS_VER4 <= maxvers)) + nso->nso_version = NFS_VER4; + } + if (!error && nso->nso_version) + accepted_status = RPC_SUCCESS; } - if ((error = nfs_sigintr(nmp, NULL, current_thread(), 1))) - break; - msleep(&nmp->nm_so, &nmp->nm_lock, PSOCK, "nfs_socket_connect", &ts); + if (!error) { + switch (accepted_status) { + case RPC_SUCCESS: + error = 0; + break; + case RPC_PROGUNAVAIL: + error = EPROGUNAVAIL; + break; + case RPC_PROGMISMATCH: + error = EPROGMISMATCH; + break; + case 
RPC_PROCUNAVAIL: + error = EPROCUNAVAIL; + break; + case RPC_GARBAGE: + error = EBADRPC; + break; + case RPC_SYSTEM_ERR: + default: + error = EIO; + break; + } + } +nfsmout: + nso->nso_flags &= ~NSO_PINGING; + if (error) { + nso->nso_error = error; + nso->nso_flags |= NSO_DEAD; + } else { + nso->nso_flags |= NSO_VERIFIED; + } + mbuf_freem(m); + /* wake up search thread */ + wakeup(nso->nso_wake); + break; } - if ((tocnt > 30) && verbose) - log(LOG_INFO, "nfs_connect: socket connect %s for %s\n", - error ? "aborted" : "completed", - vfs_statfs(nmp->nm_mountp)->f_mntfromname); - if (error) { - lck_mtx_unlock(&nmp->nm_lock); - goto bad; + } + + nso->nso_flags &= ~NSO_UPCALL; + if ((error != EWOULDBLOCK) && (error || !recv)) { + /* problems with the socket... */ + nso->nso_error = error ? error : EPIPE; + nso->nso_flags |= NSO_DEAD; + wakeup(nso->nso_wake); + } + if (nso->nso_flags & NSO_DISCONNECTING) + wakeup(&nso->nso_flags); + lck_mtx_unlock(&nso->nso_lock); +} + +/* + * Create/initialize an nfs_socket structure. + */ +int +nfs_socket_create( + __unused struct nfsmount *nmp, + struct sockaddr *sa, + int sotype, + in_port_t port, + uint32_t protocol, + uint32_t vers, + int resvport, + struct nfs_socket **nsop) +{ + struct nfs_socket *nso; + struct timeval now; + int error; +#ifdef NFS_SOCKET_DEBUGGING + char naddr[MAX_IPv6_STR_LEN]; + void *sinaddr; + + if (sa->sa_family == AF_INET) + sinaddr = &((struct sockaddr_in*)sa)->sin_addr; + else + sinaddr = &((struct sockaddr_in6*)sa)->sin6_addr; + if (inet_ntop(sa->sa_family, sinaddr, naddr, sizeof(naddr)) != naddr) + strlcpy(naddr, "", sizeof(naddr)); +#endif + + *nsop = NULL; + + /* Create the socket. 
*/ + MALLOC(nso, struct nfs_socket *, sizeof(struct nfs_socket), M_TEMP, M_WAITOK|M_ZERO); + if (nso) + MALLOC(nso->nso_saddr, struct sockaddr *, sa->sa_len, M_SONAME, M_WAITOK|M_ZERO); + if (!nso || !nso->nso_saddr) { + if (nso) + FREE(nso, M_TEMP); + return (ENOMEM); + } + lck_mtx_init(&nso->nso_lock, nfs_request_grp, LCK_ATTR_NULL); + nso->nso_sotype = sotype; + if (nso->nso_sotype == SOCK_STREAM) + nfs_rpc_record_state_init(&nso->nso_rrs); + microuptime(&now); + nso->nso_timestamp = now.tv_sec; + bcopy(sa, nso->nso_saddr, sa->sa_len); + if (sa->sa_family == AF_INET) + ((struct sockaddr_in*)nso->nso_saddr)->sin_port = htons(port); + else if (sa->sa_family == AF_INET6) + ((struct sockaddr_in6*)nso->nso_saddr)->sin6_port = htons(port); + nso->nso_protocol = protocol; + nso->nso_version = vers; + + error = sock_socket(sa->sa_family, nso->nso_sotype, 0, NULL, NULL, &nso->nso_so); + + /* Some servers require that the client port be a reserved port number. */ + if (!error && resvport && ((sa->sa_family == AF_INET) || (sa->sa_family == AF_INET6))) { + struct sockaddr_storage ss; + int level = (sa->sa_family == AF_INET) ? IPPROTO_IP : IPPROTO_IPV6; + int optname = (sa->sa_family == AF_INET) ? 
IP_PORTRANGE : IPV6_PORTRANGE; + int portrange = IP_PORTRANGE_LOW; + + error = sock_setsockopt(nso->nso_so, level, optname, &portrange, sizeof(portrange)); + if (!error) { /* bind now to check for failure */ + ss.ss_len = sa->sa_len; + ss.ss_family = sa->sa_family; + if (ss.ss_family == AF_INET) { + ((struct sockaddr_in*)&ss)->sin_addr.s_addr = INADDR_ANY; + ((struct sockaddr_in*)&ss)->sin_port = htons(0); + } else if (ss.ss_family == AF_INET6) { + ((struct sockaddr_in6*)&ss)->sin6_addr = in6addr_any; + ((struct sockaddr_in6*)&ss)->sin6_port = htons(0); + } else { + error = EINVAL; + } + if (!error) + error = sock_bind(nso->nso_so, (struct sockaddr*)&ss); } } + if (error) { + NFS_SOCK_DBG(("nfs connect %s error %d creating socket %p %s type %d%s port %d prot %d %d\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, error, nso, naddr, sotype, + resvport ? "r" : "", port, protocol, vers)); + nfs_socket_destroy(nso); + } else { + NFS_SOCK_DBG(("nfs connect %s created socket %p %s type %d%s port %d prot %d %d\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso, naddr, + sotype, resvport ? "r" : "", port, protocol, vers)); + *nsop = nso; + } + return (error); +} + +/* + * Destroy an nfs_socket structure. 
+ */ +void +nfs_socket_destroy(struct nfs_socket *nso) +{ + struct timespec ts = { 4, 0 }; + + lck_mtx_lock(&nso->nso_lock); + nso->nso_flags |= NSO_DISCONNECTING; + if (nso->nso_flags & NSO_UPCALL) /* give upcall a chance to complete */ + msleep(&nso->nso_flags, &nso->nso_lock, PZERO-1, "nfswaitupcall", &ts); + lck_mtx_unlock(&nso->nso_lock); + sock_shutdown(nso->nso_so, SHUT_RDWR); + sock_close(nso->nso_so); + if (nso->nso_sotype == SOCK_STREAM) + nfs_rpc_record_state_cleanup(&nso->nso_rrs); + lck_mtx_destroy(&nso->nso_lock, nfs_request_grp); + if (nso->nso_saddr) + FREE(nso->nso_saddr, M_SONAME); + if (nso->nso_saddr2) + FREE(nso->nso_saddr2, M_SONAME); + NFS_SOCK_DBG(("nfs connect - socket %p destroyed\n", nso)); + FREE(nso, M_TEMP); +} + +/* + * Set common socket options on an nfs_socket. + */ +void +nfs_socket_options(struct nfsmount *nmp, struct nfs_socket *nso) +{ /* * Set socket send/receive timeouts - * - Receive timeout shouldn't matter because all receives are performed + * - Receive timeout shouldn't matter because most receives are performed * in the socket upcall non-blocking. * - Send timeout should allow us to react to a blocked socket. * Soft mounts will want to abort sooner. */ - timeo.tv_usec = 0; - timeo.tv_sec = (nmp->nm_flag & NFSMNT_SOFT) ? 10 : 60; - error |= sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo)); - error |= sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo)); - if (error) { - log(LOG_INFO, "nfs_connect: socket timeout setting errors for %s\n", - vfs_statfs(nmp->nm_mountp)->f_mntfromname); - error = 0; - } + struct timeval timeo; + int on = 1, proto; - if (nmp->nm_sotype == SOCK_STREAM) { + timeo.tv_usec = 0; + timeo.tv_sec = NMFLAG(nmp, SOFT) ? 
5 : 60; + sock_setsockopt(nso->nso_so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo)); + sock_setsockopt(nso->nso_so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo)); + if (nso->nso_sotype == SOCK_STREAM) { /* Assume that SOCK_STREAM always requires a connection */ - sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)); + sock_setsockopt(nso->nso_so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)); /* set nodelay for TCP */ - sock_gettype(so, NULL, NULL, &proto); + sock_gettype(nso->nso_so, NULL, NULL, &proto); if (proto == IPPROTO_TCP) - sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on)); + sock_setsockopt(nso->nso_so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on)); } - - if (nmp->nm_sotype == SOCK_DGRAM) { /* set socket buffer sizes for UDP */ + if (nso->nso_sotype == SOCK_DGRAM) { /* set socket buffer sizes for UDP */ int reserve = NFS_UDPSOCKBUF; - error |= sock_setsockopt(so, SOL_SOCKET, SO_SNDBUF, &reserve, sizeof(reserve)); - error |= sock_setsockopt(so, SOL_SOCKET, SO_RCVBUF, &reserve, sizeof(reserve)); + sock_setsockopt(nso->nso_so, SOL_SOCKET, SO_SNDBUF, &reserve, sizeof(reserve)); + sock_setsockopt(nso->nso_so, SOL_SOCKET, SO_RCVBUF, &reserve, sizeof(reserve)); + } + /* set SO_NOADDRERR to detect network changes ASAP */ + sock_setsockopt(nso->nso_so, SOL_SOCKET, SO_NOADDRERR, &on, sizeof(on)); + /* just playin' it safe with upcalls */ + sock_setsockopt(nso->nso_so, SOL_SOCKET, SO_UPCALLCLOSEWAIT, &on, sizeof(on)); + /* socket should be interruptible if the mount is */ + if (!NMFLAG(nmp, INTR)) + sock_nointerrupt(nso->nso_so, 1); +} + +/* + * Release resources held in an nfs_socket_search. 
+ */ +void +nfs_socket_search_cleanup(struct nfs_socket_search *nss) +{ + struct nfs_socket *nso, *nsonext; + + TAILQ_FOREACH_SAFE(nso, &nss->nss_socklist, nso_link, nsonext) { + TAILQ_REMOVE(&nss->nss_socklist, nso, nso_link); + nss->nss_sockcnt--; + nfs_socket_destroy(nso); + } + if (nss->nss_sock) { + nfs_socket_destroy(nss->nss_sock); + nss->nss_sock = NULL; + } +} + +/* + * Prefer returning certain errors over others. + * This function returns a ranking of the given error. + */ +int +nfs_connect_error_class(int error) +{ + switch (error) { + case 0: + return (0); + case ETIMEDOUT: + case EAGAIN: + return (1); + case EPIPE: + case EADDRNOTAVAIL: + case ENETDOWN: + case ENETUNREACH: + case ENETRESET: + case ECONNABORTED: + case ECONNRESET: + case EISCONN: + case ENOTCONN: + case ESHUTDOWN: + case ECONNREFUSED: + case EHOSTDOWN: + case EHOSTUNREACH: + return (2); + case ERPCMISMATCH: + case EPROCUNAVAIL: + case EPROGMISMATCH: + case EPROGUNAVAIL: + return (3); + case EBADRPC: + return (4); + default: + return (5); + } +} + +/* + * Make sure a socket search returns the best error. + */ +void +nfs_socket_search_update_error(struct nfs_socket_search *nss, int error) +{ + if (nfs_connect_error_class(error) >= nfs_connect_error_class(nss->nss_error)) + nss->nss_error = error; +} + +/* + * Continue the socket search until we have something to report. + */ +int +nfs_connect_search_loop(struct nfsmount *nmp, struct nfs_socket_search *nss) +{ + struct nfs_socket *nso, *nsonext; + struct timeval now; + struct nfs_fs_location *fsl; + struct nfs_fs_server *fss; + struct sockaddr_storage ss; + char *addrstr; + int error, nomore = 0; + +loop: + microuptime(&now); + NFS_SOCK_DBG(("nfs connect %s search %ld\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname, now.tv_sec)); + + /* Time to start another socket? 
*/ + while ((nss->nss_last < 0) || (nss->nss_sockcnt == 0) || + ((nss->nss_sockcnt < 4) && (now.tv_sec >= (nss->nss_last + 2)))) { + if (nmp->nm_sockflags & NMSOCK_UNMOUNT) + return (EINTR); + /* Find the next address to try... */ + /* Have we run out of locations? */ + if (!nomore && (nss->nss_last != -1) && !nfs_location_index_cmp(&nss->nss_nextloc, &nss->nss_startloc)) + nomore = 1; + if (nomore) { + if (nss->nss_last < 0) + nss->nss_last = now.tv_sec; + break; + } + /* Can we convert the address to a sockaddr? */ + fsl = nmp->nm_locations.nl_locations[nss->nss_nextloc.nli_loc]; + fss = fsl->nl_servers[nss->nss_nextloc.nli_serv]; + addrstr = fss->ns_addresses[nss->nss_nextloc.nli_addr]; + if (!nfs_uaddr2sockaddr(addrstr, (struct sockaddr*)&ss)) { + nfs_location_next(&nmp->nm_locations, &nss->nss_nextloc); + nss->nss_last = -2; + continue; + } + /* Check that socket family is acceptable. */ + if (nmp->nm_sofamily && (ss.ss_family != nmp->nm_sofamily)) { + nfs_location_next(&nmp->nm_locations, &nss->nss_nextloc); + nss->nss_last = -2; + continue; + } + + /* Create the socket. 
*/ + error = nfs_socket_create(nmp, (struct sockaddr*)&ss, nss->nss_sotype, + nss->nss_port, nss->nss_protocol, nss->nss_version, + ((nss->nss_protocol == NFS_PROG) && NMFLAG(nmp, RESVPORT)), &nso); + if (error) + return (error); + + nso->nso_location = nss->nss_nextloc; + nso->nso_wake = nss; + error = sock_setupcall(nso->nso_so, nfs_connect_upcall, nso); if (error) { - log(LOG_INFO, "nfs_connect: socket buffer setting errors for %s\n", + lck_mtx_lock(&nso->nso_lock); + nso->nso_error = error; + nso->nso_flags |= NSO_DEAD; + lck_mtx_unlock(&nso->nso_lock); + } + + TAILQ_INSERT_TAIL(&nss->nss_socklist, nso, nso_link); + nss->nss_sockcnt++; + nfs_location_next(&nmp->nm_locations, &nss->nss_nextloc); + + nss->nss_last = now.tv_sec; + } + + /* check each active socket and try to push it along */ + TAILQ_FOREACH(nso, &nss->nss_socklist, nso_link) { + lck_mtx_lock(&nso->nso_lock); + if (!(nso->nso_flags & NSO_CONNECTED)) { + if ((nso->nso_sotype != SOCK_STREAM) && NMFLAG(nmp, NOCONNECT)) { + /* no connection needed, just say it's already connected */ + nso->nso_flags |= NSO_CONNECTED; + NFS_SOCK_DBG(("nfs connect %s UDP socket %p noconnect\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso)); + } else if (!(nso->nso_flags & NSO_CONNECTING)) { + /* initiate the connection */ + nso->nso_flags |= NSO_CONNECTING; + lck_mtx_unlock(&nso->nso_lock); + NFS_SOCK_DBG(("nfs connect %s connecting socket %p\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso)); + error = sock_connect(nso->nso_so, nso->nso_saddr, MSG_DONTWAIT); + lck_mtx_lock(&nso->nso_lock); + if (error && (error != EINPROGRESS)) { + nso->nso_error = error; + nso->nso_flags |= NSO_DEAD; + lck_mtx_unlock(&nso->nso_lock); + continue; + } + } + if (nso->nso_flags & NSO_CONNECTING) { + /* check the connection */ + if (sock_isconnected(nso->nso_so)) { + NFS_SOCK_DBG(("nfs connect %s socket %p is connected\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso)); + nso->nso_flags &= ~NSO_CONNECTING; + nso->nso_flags |= 
NSO_CONNECTED; + } else { + int optlen = sizeof(error); + error = 0; + sock_getsockopt(nso->nso_so, SOL_SOCKET, SO_ERROR, &error, &optlen); + if (error) { /* we got an error on the socket */ + NFS_SOCK_DBG(("nfs connect %s socket %p connection error %d\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso, error)); + if (nss->nss_flags & NSS_VERBOSE) + log(LOG_INFO, "nfs_connect: socket error %d for %s\n", + error, vfs_statfs(nmp->nm_mountp)->f_mntfromname); + nso->nso_error = error; + nso->nso_flags |= NSO_DEAD; + lck_mtx_unlock(&nso->nso_lock); + continue; + } + } + } + if (nso->nso_flags & NSO_CONNECTED) + nfs_socket_options(nmp, nso); + } + if (!(nso->nso_flags & NSO_CONNECTED)) { + lck_mtx_unlock(&nso->nso_lock); + continue; + } + if (!(nso->nso_flags & (NSO_PINGING|NSO_VERIFIED)) || + ((nso->nso_sotype == SOCK_DGRAM) && (now.tv_sec >= nso->nso_reqtimestamp+2))) { + /* initiate a NULL RPC request */ + uint64_t xid = nso->nso_pingxid; + mbuf_t m, mreq = NULL; + struct msghdr msg; + size_t reqlen, sentlen; + uint32_t vers; + + if (!(vers = nso->nso_version)) { + if (nso->nso_protocol == PMAPPROG) + vers = (nso->nso_saddr->sa_family == AF_INET) ? 
PMAPVERS : RPCBVERS4; + else if (nso->nso_protocol == NFS_PROG) + vers = NFS_VER3; + } + lck_mtx_unlock(&nso->nso_lock); + error = nfsm_rpchead2(nmp, nso->nso_sotype, nso->nso_protocol, vers, 0, RPCAUTH_SYS, + vfs_context_ucred(vfs_context_kernel()), NULL, NULL, &xid, &mreq); + lck_mtx_lock(&nso->nso_lock); + if (!error) { + nso->nso_flags |= NSO_PINGING; + nso->nso_pingxid = R_XID32(xid); + nso->nso_reqtimestamp = now.tv_sec; + bzero(&msg, sizeof(msg)); + if ((nso->nso_sotype != SOCK_STREAM) && !sock_isconnected(nso->nso_so)) { + msg.msg_name = nso->nso_saddr; + msg.msg_namelen = nso->nso_saddr->sa_len; + } + for (reqlen=0, m=mreq; m; m = mbuf_next(m)) + reqlen += mbuf_len(m); + lck_mtx_unlock(&nso->nso_lock); + error = sock_sendmbuf(nso->nso_so, &msg, mreq, 0, &sentlen); + NFS_SOCK_DBG(("nfs connect %s verifying socket %p send rv %d\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso, error)); + lck_mtx_lock(&nso->nso_lock); + if (!error && (sentlen != reqlen)) + error = ETIMEDOUT; + } + if (error) { + nso->nso_error = error; + nso->nso_flags |= NSO_DEAD; + lck_mtx_unlock(&nso->nso_lock); + continue; + } + } + if (nso->nso_flags & NSO_VERIFIED) { + /* WOOHOO!! This socket looks good! */ + NFS_SOCK_DBG(("nfs connect %s socket %p verified\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso)); + if (!nso->nso_version) { + /* If the version isn't set, the default must have worked. */ + if (nso->nso_protocol == PMAPPROG) + nso->nso_version = (nso->nso_saddr->sa_family == AF_INET) ? 
PMAPVERS : RPCBVERS4; + if (nso->nso_protocol == NFS_PROG) + nso->nso_version = NFS_VER3; + } + lck_mtx_unlock(&nso->nso_lock); + TAILQ_REMOVE(&nss->nss_socklist, nso, nso_link); + nss->nss_sockcnt--; + nss->nss_sock = nso; + break; + } + lck_mtx_unlock(&nso->nso_lock); + } + + TAILQ_FOREACH_SAFE(nso, &nss->nss_socklist, nso_link, nsonext) { + lck_mtx_lock(&nso->nso_lock); + if (now.tv_sec >= (nso->nso_timestamp + nss->nss_timeo)) { + /* took too long */ + NFS_SOCK_DBG(("nfs connect %s socket %p timed out\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso)); + nso->nso_error = ETIMEDOUT; + nso->nso_flags |= NSO_DEAD; + } + if (!(nso->nso_flags & NSO_DEAD)) { + lck_mtx_unlock(&nso->nso_lock); + continue; + } + lck_mtx_unlock(&nso->nso_lock); + NFS_SOCK_DBG(("nfs connect %s reaping socket %p %d\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso, nso->nso_error)); + nfs_socket_search_update_error(nss, nso->nso_error); + TAILQ_REMOVE(&nss->nss_socklist, nso, nso_link); + nss->nss_sockcnt--; + nfs_socket_destroy(nso); + if (!nomore) + nss->nss_last = -2; + } + + /* + * Keep looping if we haven't found a socket yet and we have more + * sockets to (continue to) try. + */ + error = 0; + if (!nss->nss_sock && (!TAILQ_EMPTY(&nss->nss_socklist) || !nomore)) { + /* log a warning if connect is taking a while */ + if (((now.tv_sec - nss->nss_timestamp) >= 30) && ((nss->nss_flags & (NSS_VERBOSE|NSS_WARNED)) == NSS_VERBOSE)) { + log(LOG_INFO, "nfs_connect: socket connect taking a while for %s\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname); + nss->nss_flags |= NSS_WARNED; + } + if (nmp->nm_sockflags & NMSOCK_UNMOUNT) + return (EINTR); + if ((error = nfs_sigintr(nmp, NULL, current_thread(), 0))) + return (error); + if (nss->nss_last >= 0) + tsleep(nss, PSOCK, "nfs_connect_search_wait", hz); + goto loop; + } + + NFS_SOCK_DBG(("nfs connect %s returning %d\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname, error)); + return (error); +} + +/* + * Initialize a new NFS connection. 
+ * + * Search for a location to connect a socket to and initialize the connection. + * + * An NFS mount may have multiple locations/servers/addresses available. + * We attempt to connect to each one asynchronously and will start + * several sockets in parallel if other locations are slow to answer. + * We'll use the first NFS socket we can successfully set up. + * + * The search may involve contacting the portmapper service first. + * + * A mount's initial connection may require negotiating some parameters such + * as socket type and NFS version. + */ +int +nfs_connect(struct nfsmount *nmp, int verbose, int timeo) +{ + struct nfs_socket_search nss; + struct nfs_socket *nso, *nsonfs; + struct sockaddr_storage ss; + struct sockaddr *saddr, *oldsaddr; + sock_upcall upcall; + struct timeval now, start; + int error, savederror, nfsvers; + uint8_t sotype = nmp->nm_sotype ? nmp->nm_sotype : SOCK_STREAM; + fhandle_t *fh = NULL; + char *path = NULL; + in_port_t port; + + /* paranoia... check that we have at least one address in the locations */ + uint32_t loc, serv; + for (loc=0; loc < nmp->nm_locations.nl_numlocs; loc++) { + for (serv=0; serv < nmp->nm_locations.nl_locations[loc]->nl_servcount; serv++) { + if (nmp->nm_locations.nl_locations[loc]->nl_servers[serv]->ns_addrcount) + break; + NFS_SOCK_DBG(("nfs connect %s search, server %s has no addresses\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, + nmp->nm_locations.nl_locations[loc]->nl_servers[serv]->ns_name)); + } + if (serv < nmp->nm_locations.nl_locations[loc]->nl_servcount) + break; + } + if (loc >= nmp->nm_locations.nl_numlocs) { + NFS_SOCK_DBG(("nfs connect %s search failed, no addresses\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname)); + return (EINVAL); + } + + lck_mtx_lock(&nmp->nm_lock); + nmp->nm_sockflags |= NMSOCK_CONNECTING; + nmp->nm_nss = &nss; + lck_mtx_unlock(&nmp->nm_lock); + microuptime(&start); + savederror = error = 0; + +tryagain: + /* initialize socket search state */ + bzero(&nss, 
sizeof(nss)); + nss.nss_error = savederror; + TAILQ_INIT(&nss.nss_socklist); + nss.nss_sotype = sotype; + nss.nss_startloc = nmp->nm_locations.nl_current; + nss.nss_timestamp = start.tv_sec; + nss.nss_timeo = timeo; + if (verbose) + nss.nss_flags |= NSS_VERBOSE; + + /* First time connecting, we may need to negotiate some things */ + if (!(nmp->nm_sockflags & NMSOCK_HASCONNECTED)) { + if (!nmp->nm_vers) { + /* No NFS version specified... */ + if (!nmp->nm_nfsport || (!NM_OMATTR_GIVEN(nmp, FH) && !nmp->nm_mountport)) { + /* ...connect to portmapper first if we (may) need any ports. */ + nss.nss_port = PMAPPORT; + nss.nss_protocol = PMAPPROG; + nss.nss_version = 0; + } else { + /* ...connect to NFS port first. */ + nss.nss_port = nmp->nm_nfsport; + nss.nss_protocol = NFS_PROG; + nss.nss_version = 0; + } + } else if (nmp->nm_vers >= NFS_VER4) { + /* For NFSv4, we use the given (or default) port. */ + nss.nss_port = nmp->nm_nfsport ? nmp->nm_nfsport : NFS_PORT; + nss.nss_protocol = NFS_PROG; + nss.nss_version = 4; + } else { + /* For NFSv3/v2... */ + if (!nmp->nm_nfsport || (!NM_OMATTR_GIVEN(nmp, FH) && !nmp->nm_mountport)) { + /* ...connect to portmapper first if we need any ports. */ + nss.nss_port = PMAPPORT; + nss.nss_protocol = PMAPPROG; + nss.nss_version = 0; + } else { + /* ...connect to NFS port first. 
*/ + nss.nss_port = nmp->nm_nfsport; + nss.nss_protocol = NFS_PROG; + nss.nss_version = nmp->nm_vers; + } + } + NFS_SOCK_DBG(("nfs connect first %s, so type %d port %d prot %d %d\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nss.nss_sotype, nss.nss_port, + nss.nss_protocol, nss.nss_version)); + } else { + /* we've connected before, just connect to NFS port */ + if (!nmp->nm_nfsport) { + /* need to ask portmapper which port that would be */ + nss.nss_port = PMAPPORT; + nss.nss_protocol = PMAPPROG; + nss.nss_version = 0; + } else { + nss.nss_port = nmp->nm_nfsport; + nss.nss_protocol = NFS_PROG; + nss.nss_version = nmp->nm_vers; + } + NFS_SOCK_DBG(("nfs connect %s, so type %d port %d prot %d %d\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nss.nss_sotype, nss.nss_port, + nss.nss_protocol, nss.nss_version)); + } + + /* Set next location to first valid location. */ + /* If start location is invalid, find next location. */ + nss.nss_nextloc = nss.nss_startloc; + if ((nss.nss_nextloc.nli_serv >= nmp->nm_locations.nl_locations[nss.nss_nextloc.nli_loc]->nl_servcount) || + (nss.nss_nextloc.nli_addr >= nmp->nm_locations.nl_locations[nss.nss_nextloc.nli_loc]->nl_servers[nss.nss_nextloc.nli_serv]->ns_addrcount)) { + nfs_location_next(&nmp->nm_locations, &nss.nss_nextloc); + if (!nfs_location_index_cmp(&nss.nss_nextloc, &nss.nss_startloc)) { + NFS_SOCK_DBG(("nfs connect %s search failed, couldn't find a valid location index\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname)); + return (ENOENT); + } + } + nss.nss_last = -1; + +keepsearching: + + error = nfs_connect_search_loop(nmp, &nss); + if (error || !nss.nss_sock) { + /* search failed */ + nfs_socket_search_cleanup(&nss); + if (!error && (nss.nss_sotype == SOCK_STREAM) && !nmp->nm_sotype && (nmp->nm_vers < NFS_VER4)) { + /* Try using UDP */ + sotype = SOCK_DGRAM; + savederror = nss.nss_error; + NFS_SOCK_DBG(("nfs connect %s TCP failed %d %d, trying UDP\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, error, 
nss.nss_error)); + goto tryagain; + } + if (!error) + error = nss.nss_error ? nss.nss_error : ETIMEDOUT; + lck_mtx_lock(&nmp->nm_lock); + nmp->nm_sockflags &= ~NMSOCK_CONNECTING; + nmp->nm_nss = NULL; + lck_mtx_unlock(&nmp->nm_lock); + if (nss.nss_flags & NSS_WARNED) + log(LOG_INFO, "nfs_connect: socket connect aborted for %s\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname); + if (fh) + FREE(fh, M_TEMP); + if (path) + FREE_ZONE(path, MAXPATHLEN, M_NAMEI); + NFS_SOCK_DBG(("nfs connect %s search failed, returning %d\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, error)); + return (error); + } + + /* try to use nss_sock */ + nso = nss.nss_sock; + nss.nss_sock = NULL; + + /* We may be speaking to portmap first... to determine port(s). */ + if (nso->nso_saddr->sa_family == AF_INET) + port = ntohs(((struct sockaddr_in*)nso->nso_saddr)->sin_port); + else + port = ntohs(((struct sockaddr_in6*)nso->nso_saddr)->sin6_port); + if (port == PMAPPORT) { + /* Use this portmapper port to get the port #s we need. */ + NFS_SOCK_DBG(("nfs connect %s got portmapper socket %p\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso)); + + /* remove the connect upcall so nfs_portmap_lookup() can use this socket */ + sock_setupcall(nso->nso_so, NULL, NULL); + + /* Set up socket address and port for NFS socket. */ + bcopy(nso->nso_saddr, &ss, nso->nso_saddr->sa_len); + + /* If NFS version not set, try NFSv3 then NFSv2. */ + nfsvers = nmp->nm_vers ? nmp->nm_vers : NFS_VER3; + + if (!(port = nmp->nm_nfsport)) { + if (ss.ss_family == AF_INET) + ((struct sockaddr_in*)&ss)->sin_port = htons(0); + else if (ss.ss_family == AF_INET6) + ((struct sockaddr_in6*)&ss)->sin6_port = htons(0); + error = nfs_portmap_lookup(nmp, vfs_context_current(), (struct sockaddr*)&ss, + nso->nso_so, NFS_PROG, nfsvers, + (nso->nso_sotype == SOCK_DGRAM) ? 
IPPROTO_UDP : IPPROTO_TCP, timeo); + if (!error) { + if (ss.ss_family == AF_INET) + port = ntohs(((struct sockaddr_in*)&ss)->sin_port); + else if (ss.ss_family == AF_INET6) + port = ntohs(((struct sockaddr_in6*)&ss)->sin6_port); + if (!port) + error = EPROGUNAVAIL; + } + if (error && !nmp->nm_vers) { + nfsvers = NFS_VER2; + error = nfs_portmap_lookup(nmp, vfs_context_current(), (struct sockaddr*)&ss, + nso->nso_so, NFS_PROG, nfsvers, + (nso->nso_sotype == SOCK_DGRAM) ? IPPROTO_UDP : IPPROTO_TCP, timeo); + if (!error) { + if (ss.ss_family == AF_INET) + port = ntohs(((struct sockaddr_in*)&ss)->sin_port); + else if (ss.ss_family == AF_INET6) + port = ntohs(((struct sockaddr_in6*)&ss)->sin6_port); + if (!port) + error = EPROGUNAVAIL; + } + } + if (error) { + nfs_socket_search_update_error(&nss, error); + nfs_socket_destroy(nso); + goto keepsearching; + } + } + /* Create NFS protocol socket and add it to the list of sockets. */ + error = nfs_socket_create(nmp, (struct sockaddr*)&ss, nso->nso_sotype, port, + NFS_PROG, nfsvers, NMFLAG(nmp, RESVPORT), &nsonfs); + if (error) { + nfs_socket_search_update_error(&nss, error); + nfs_socket_destroy(nso); + goto keepsearching; + } + nsonfs->nso_location = nso->nso_location; + nsonfs->nso_wake = &nss; + error = sock_setupcall(nsonfs->nso_so, nfs_connect_upcall, nsonfs); + if (error) { + nfs_socket_search_update_error(&nss, error); + nfs_socket_destroy(nsonfs); + nfs_socket_destroy(nso); + goto keepsearching; + } + TAILQ_INSERT_TAIL(&nss.nss_socklist, nsonfs, nso_link); + nss.nss_sockcnt++; + if ((nfsvers < NFS_VER4) && !(nmp->nm_sockflags & NMSOCK_HASCONNECTED) && !NM_OMATTR_GIVEN(nmp, FH)) { + /* Set up socket address and port for MOUNT socket. 
*/ error = 0; + bcopy(nso->nso_saddr, &ss, nso->nso_saddr->sa_len); + port = nmp->nm_mountport; + if (ss.ss_family == AF_INET) + ((struct sockaddr_in*)&ss)->sin_port = htons(port); + else if (ss.ss_family == AF_INET6) + ((struct sockaddr_in6*)&ss)->sin6_port = htons(port); + if (!port) { + /* Get port/sockaddr for MOUNT version corresponding to NFS version. */ + /* If NFS version is unknown, optimistically choose for NFSv3. */ + int mntvers = (nfsvers == NFS_VER2) ? RPCMNT_VER1 : RPCMNT_VER3; + int mntproto = (NM_OMFLAG(nmp, MNTUDP) || (nso->nso_sotype == SOCK_DGRAM)) ? IPPROTO_UDP : IPPROTO_TCP; + error = nfs_portmap_lookup(nmp, vfs_context_current(), (struct sockaddr*)&ss, + nso->nso_so, RPCPROG_MNT, mntvers, mntproto, timeo); + } + if (!error) { + if (ss.ss_family == AF_INET) + port = ntohs(((struct sockaddr_in*)&ss)->sin_port); + else if (ss.ss_family == AF_INET6) + port = ntohs(((struct sockaddr_in6*)&ss)->sin6_port); + if (!port) + error = EPROGUNAVAIL; + } + /* create sockaddr for MOUNT */ + if (!error) + MALLOC(nsonfs->nso_saddr2, struct sockaddr *, ss.ss_len, M_SONAME, M_WAITOK|M_ZERO); + if (!error && !nsonfs->nso_saddr2) + error = ENOMEM; + if (!error) + bcopy(&ss, nsonfs->nso_saddr2, ss.ss_len); + if (error) { + lck_mtx_lock(&nsonfs->nso_lock); + nsonfs->nso_error = error; + nsonfs->nso_flags |= NSO_DEAD; + lck_mtx_unlock(&nsonfs->nso_lock); + } } + nfs_socket_destroy(nso); + goto keepsearching; } - /* set SO_NOADDRERR to detect network changes ASAP */ - error = sock_setsockopt(so, SOL_SOCKET, SO_NOADDRERR, &on, sizeof(on)); + /* nso is an NFS socket */ + NFS_SOCK_DBG(("nfs connect %s got NFS socket %p\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso)); + + /* If NFS version wasn't specified, it was determined during the connect. */ + nfsvers = nmp->nm_vers ? nmp->nm_vers : (int)nso->nso_version; + + /* Perform MOUNT call for initial NFSv2/v3 connection/mount. 
*/ + if ((nfsvers < NFS_VER4) && !(nmp->nm_sockflags & NMSOCK_HASCONNECTED) && !NM_OMATTR_GIVEN(nmp, FH)) { + error = 0; + saddr = nso->nso_saddr2; + if (!saddr) { + /* Need sockaddr for MOUNT port */ + bcopy(nso->nso_saddr, &ss, nso->nso_saddr->sa_len); + port = nmp->nm_mountport; + if (ss.ss_family == AF_INET) + ((struct sockaddr_in*)&ss)->sin_port = htons(port); + else if (ss.ss_family == AF_INET6) + ((struct sockaddr_in6*)&ss)->sin6_port = htons(port); + if (!port) { + /* Get port/sockaddr for MOUNT version corresponding to NFS version. */ + int mntvers = (nfsvers == NFS_VER2) ? RPCMNT_VER1 : RPCMNT_VER3; + int mntproto = (NM_OMFLAG(nmp, MNTUDP) || (nso->nso_sotype == SOCK_DGRAM)) ? IPPROTO_UDP : IPPROTO_TCP; + error = nfs_portmap_lookup(nmp, vfs_context_current(), (struct sockaddr*)&ss, + NULL, RPCPROG_MNT, mntvers, mntproto, timeo); + if (ss.ss_family == AF_INET) + port = ntohs(((struct sockaddr_in*)&ss)->sin_port); + else if (ss.ss_family == AF_INET6) + port = ntohs(((struct sockaddr_in6*)&ss)->sin6_port); + } + if (!error) { + if (port) + saddr = (struct sockaddr*)&ss; + else + error = EPROGUNAVAIL; + } + } + if (saddr) + MALLOC(fh, fhandle_t *, sizeof(fhandle_t), M_TEMP, M_WAITOK|M_ZERO); + if (saddr && fh) + MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (!saddr || !fh || !path) { + if (!error) + error = ENOMEM; + if (fh) + FREE(fh, M_TEMP); + if (path) + FREE_ZONE(path, MAXPATHLEN, M_NAMEI); + fh = NULL; + path = NULL; + nfs_socket_search_update_error(&nss, error); + nfs_socket_destroy(nso); + goto keepsearching; + } + nfs_location_mntfromname(&nmp->nm_locations, nso->nso_location, path, MAXPATHLEN, 1); + error = nfs3_mount_rpc(nmp, saddr, nso->nso_sotype, nfsvers, + path, vfs_context_current(), timeo, fh, &nmp->nm_servsec); + NFS_SOCK_DBG(("nfs connect %s socket %p mount %d\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso, error)); + if (!error) { + /* Make sure we can agree on a security flavor. 
*/ + int o, s; /* indices into mount option and server security flavor lists */ + int found = 0; + + if ((nfsvers == NFS_VER3) && !nmp->nm_servsec.count) { + /* Some servers return an empty list to indicate RPCAUTH_SYS? */ + nmp->nm_servsec.count = 1; + nmp->nm_servsec.flavors[0] = RPCAUTH_SYS; + } + if (nmp->nm_sec.count) { + /* Choose the first flavor in our list that the server supports. */ + if (!nmp->nm_servsec.count) { + /* we don't know what the server supports, just use our first choice */ + nmp->nm_auth = nmp->nm_sec.flavors[0]; + found = 1; + } + for (o=0; !found && (o < nmp->nm_sec.count); o++) + for (s=0; !found && (s < nmp->nm_servsec.count); s++) + if (nmp->nm_sec.flavors[o] == nmp->nm_servsec.flavors[s]) { + nmp->nm_auth = nmp->nm_sec.flavors[o]; + found = 1; + } + } else { + /* Choose the first one we support from the server's list. */ + if (!nmp->nm_servsec.count) { + nmp->nm_auth = RPCAUTH_SYS; + found = 1; + } + for (s=0; s < nmp->nm_servsec.count; s++) + switch (nmp->nm_servsec.flavors[s]) { + case RPCAUTH_SYS: + /* prefer RPCAUTH_SYS to RPCAUTH_NONE */ + if (found && (nmp->nm_auth == RPCAUTH_NONE)) + found = 0; + case RPCAUTH_NONE: + case RPCAUTH_KRB5: + case RPCAUTH_KRB5I: + case RPCAUTH_KRB5P: + if (!found) { + nmp->nm_auth = nmp->nm_servsec.flavors[s]; + found = 1; + } + break; + } + } + error = !found ? EAUTH : 0; + } + FREE_ZONE(path, MAXPATHLEN, M_NAMEI); + path = NULL; + if (error) { + nfs_socket_search_update_error(&nss, error); + FREE(fh, M_TEMP); + fh = NULL; + nfs_socket_destroy(nso); + goto keepsearching; + } + if (nmp->nm_fh) + FREE(nmp->nm_fh, M_TEMP); + nmp->nm_fh = fh; + fh = NULL; + NFS_BITMAP_SET(nmp->nm_flags, NFS_MFLAG_CALLUMNT); + } + + /* put the real upcall in place */ + upcall = (nso->nso_sotype == SOCK_STREAM) ? 
nfs_tcp_rcv : nfs_udp_rcv; + error = sock_setupcall(nso->nso_so, upcall, nmp); if (error) { - lck_mtx_unlock(&nmp->nm_lock); - goto bad; + nfs_socket_search_update_error(&nss, error); + nfs_socket_destroy(nso); + goto keepsearching; } - /* just playin' it safe */ - sock_setsockopt(so, SOL_SOCKET, SO_UPCALLCLOSEWAIT, &on, sizeof(on)); - if (!(nmp->nm_flag & NFSMNT_INT)) - sock_nointerrupt(so, 1); + if (!(nmp->nm_sockflags & NMSOCK_HASCONNECTED)) { + /* set mntfromname to this location */ + if (!NM_OMATTR_GIVEN(nmp, MNTFROM)) + nfs_location_mntfromname(&nmp->nm_locations, nso->nso_location, + vfs_statfs(nmp->nm_mountp)->f_mntfromname, + sizeof(vfs_statfs(nmp->nm_mountp)->f_mntfromname), 0); + /* some negotiated values need to remain unchanged for the life of the mount */ + if (!nmp->nm_sotype) + nmp->nm_sotype = nso->nso_sotype; + if (!nmp->nm_vers) { + nmp->nm_vers = nfsvers; + /* If we negotiated NFSv4, set nm_nfsport if we ended up on the standard NFS port */ + if ((nfsvers >= NFS_VER4) && !NFS_BITMAP_ISSET(nmp->nm_mattrs, NFS_MATTR_NFS_PORT)) { + if (nso->nso_saddr->sa_family == AF_INET) + port = ((struct sockaddr_in*)nso->nso_saddr)->sin_port = htons(port); + else if (nso->nso_saddr->sa_family == AF_INET6) + port = ((struct sockaddr_in6*)nso->nso_saddr)->sin6_port = htons(port); + else + port = 0; + if (port == NFS_PORT) + nmp->nm_nfsport = NFS_PORT; + } + } + /* do some version-specific pre-mount set up */ + if (nmp->nm_vers >= NFS_VER4) { + microtime(&now); + nmp->nm_mounttime = ((uint64_t)now.tv_sec << 32) | now.tv_usec; + if (!NMFLAG(nmp, NOCALLBACK)) + nfs4_mount_callback_setup(nmp); + } + } - /* Initialize socket state variables */ + /* Initialize NFS socket state variables */ + lck_mtx_lock(&nmp->nm_lock); nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] = nmp->nm_srtt[3] = (NFS_TIMEO << 3); nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] = nmp->nm_sdrtt[3] = 0; - if (nmp->nm_sotype == SOCK_DGRAM) { - /* XXX do we really want to reset this on 
each reconnect? */ + if (nso->nso_sotype == SOCK_DGRAM) { nmp->nm_cwnd = NFS_MAXCWND / 2; /* Initial send window */ nmp->nm_sent = 0; - } else if (nmp->nm_sotype == SOCK_STREAM) { - nmp->nm_markerleft = sizeof(nmp->nm_fragleft); - nmp->nm_fragleft = nmp->nm_reclen = 0; + } else if (nso->nso_sotype == SOCK_STREAM) { nmp->nm_timeouts = 0; } nmp->nm_sockflags &= ~NMSOCK_CONNECTING; nmp->nm_sockflags |= NMSOCK_SETUP; - FSDBG(529, nmp, nmp->nm_state, nmp->nm_flag, nmp->nm_cwnd); + /* move the socket to the mount structure */ + nmp->nm_nso = nso; + oldsaddr = nmp->nm_saddr; + nmp->nm_saddr = nso->nso_saddr; lck_mtx_unlock(&nmp->nm_lock); error = nfs_connect_setup(nmp); -bad: lck_mtx_lock(&nmp->nm_lock); - nmp->nm_sockflags &= ~(NMSOCK_CONNECTING|NMSOCK_SETUP); + nmp->nm_sockflags &= ~NMSOCK_SETUP; if (!error) { nmp->nm_sockflags |= NMSOCK_READY; wakeup(&nmp->nm_sockflags); } + if (error) { + NFS_SOCK_DBG(("nfs connect %s socket %p setup failed %d\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso, error)); + nfs_socket_search_update_error(&nss, error); + nmp->nm_saddr = oldsaddr; + if (!(nmp->nm_sockflags & NMSOCK_HASCONNECTED)) { + /* undo settings made prior to setup */ + if (!NFS_BITMAP_ISSET(nmp->nm_mattrs, NFS_MATTR_SOCKET_TYPE)) + nmp->nm_sotype = 0; + if (!NFS_BITMAP_ISSET(nmp->nm_mattrs, NFS_MATTR_NFS_VERSION)) { + if (nmp->nm_vers >= NFS_VER4) { + if (!NFS_BITMAP_ISSET(nmp->nm_mattrs, NFS_MATTR_NFS_PORT)) + nmp->nm_nfsport = 0; + if (nmp->nm_cbid) + nfs4_mount_callback_shutdown(nmp); + if (IS_VALID_CRED(nmp->nm_mcred)) + kauth_cred_unref(&nmp->nm_mcred); + bzero(&nmp->nm_un, sizeof(nmp->nm_un)); + } + nmp->nm_vers = 0; + } + } + lck_mtx_unlock(&nmp->nm_lock); + nmp->nm_nso = NULL; + nfs_socket_destroy(nso); + goto keepsearching; + } + + /* update current location */ + if ((nmp->nm_locations.nl_current.nli_flags & NLI_VALID) && + (nmp->nm_locations.nl_current.nli_serv != nso->nso_location.nli_serv)) { + /* server has changed, we should initiate 
failover/recovery */ + // XXX + } + nmp->nm_locations.nl_current = nso->nso_location; + nmp->nm_locations.nl_current.nli_flags |= NLI_VALID; + + if (!(nmp->nm_sockflags & NMSOCK_HASCONNECTED)) { + /* We have now successfully connected... make a note of it. */ + nmp->nm_sockflags |= NMSOCK_HASCONNECTED; + } + lck_mtx_unlock(&nmp->nm_lock); - return (error); + if (oldsaddr) + FREE(oldsaddr, M_SONAME); + + if (nss.nss_flags & NSS_WARNED) + log(LOG_INFO, "nfs_connect: socket connect completed for %s\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname); + + nmp->nm_nss = NULL; + nfs_socket_search_cleanup(&nss); + if (fh) + FREE(fh, M_TEMP); + if (path) + FREE_ZONE(path, MAXPATHLEN, M_NAMEI); + NFS_SOCK_DBG(("nfs connect %s success\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname)); + return (0); } + /* setup & confirm socket connection is functional */ int nfs_connect_setup(struct nfsmount *nmp) { - struct nfsm_chain nmreq, nmrep; - int error = 0, status; - u_int64_t xid; + int error = 0; if (nmp->nm_vers >= NFS_VER4) { - error = nfs4_setclientid(nmp); - if (error) - return (error); - error = nfs4_renew(nmp, R_SETUP); - if ((error == NFSERR_ADMIN_REVOKED) || - (error == NFSERR_EXPIRED) || - (error == NFSERR_LEASE_MOVED) || - (error == NFSERR_STALE_CLIENTID)) { - lck_mtx_lock(&nmp->nm_lock); - nmp->nm_state |= NFSSTA_RECOVER; - lck_mtx_unlock(&nmp->nm_lock); + if (nmp->nm_state & NFSSTA_CLIENTID) { + /* first, try to renew our current state */ + error = nfs4_renew(nmp, R_SETUP); + if ((error == NFSERR_ADMIN_REVOKED) || + (error == NFSERR_CB_PATH_DOWN) || + (error == NFSERR_EXPIRED) || + (error == NFSERR_LEASE_MOVED) || + (error == NFSERR_STALE_CLIENTID)) { + lck_mtx_lock(&nmp->nm_lock); + nfs_need_recover(nmp, error); + lck_mtx_unlock(&nmp->nm_lock); + } } - } else { - /* verify connection's OK by sending a NULL request */ - nfsm_chain_null(&nmreq); - nfsm_chain_null(&nmrep); - nfsm_chain_build_alloc_init(error, &nmreq, 0); - nfsm_chain_build_done(error, &nmreq); - 
nfsmout_if(error); - error = nfs_request2(NULL, nmp->nm_mountp, &nmreq, NFSPROC_NULL, - current_thread(), NULL, R_SETUP, &nmrep, &xid, &status); - if (!error) - error = status; -nfsmout: - nfsm_chain_cleanup(&nmreq); - nfsm_chain_cleanup(&nmrep); + error = nfs4_setclientid(nmp); } return (error); } @@ -422,10 +1504,10 @@ nfs_reconnect(struct nfsmount *nmp) nfs_disconnect(nmp); - while ((error = nfs_connect(nmp, verbose))) { + while ((error = nfs_connect(nmp, verbose, 30))) { verbose = 0; nfs_disconnect(nmp); - if (error == EINTR || error == ERESTART) + if ((error == EINTR) || (error == ERESTART)) return (EINTR); if (error == EIO) return (EIO); @@ -485,19 +1567,32 @@ nfs_reconnect(struct nfsmount *nmp) void nfs_disconnect(struct nfsmount *nmp) { - socket_t so; + struct nfs_socket *nso; lck_mtx_lock(&nmp->nm_lock); - if ((nmp->nm_sotype == SOCK_STREAM) && nmp->nm_m) { - mbuf_freem(nmp->nm_m); - nmp->nm_m = nmp->nm_mlast = NULL; - } - if (nmp->nm_so) { - so = nmp->nm_so; - nmp->nm_so = NULL; +tryagain: + if (nmp->nm_nso) { + struct timespec ts = { 1, 0 }; + if (nmp->nm_state & NFSSTA_SENDING) { /* wait for sending to complete */ + nmp->nm_state |= NFSSTA_WANTSND; + msleep(&nmp->nm_state, &nmp->nm_lock, PZERO-1, "nfswaitsending", &ts); + goto tryagain; + } + if (nmp->nm_sockflags & NMSOCK_POKE) { /* wait for poking to complete */ + msleep(&nmp->nm_sockflags, &nmp->nm_lock, PZERO-1, "nfswaitpoke", &ts); + goto tryagain; + } + nmp->nm_sockflags |= NMSOCK_DISCONNECTING; + nmp->nm_sockflags &= ~NMSOCK_READY; + nso = nmp->nm_nso; + nmp->nm_nso = NULL; + if (nso->nso_saddr == nmp->nm_saddr) + nso->nso_saddr = NULL; + lck_mtx_unlock(&nmp->nm_lock); + nfs_socket_destroy(nso); + lck_mtx_lock(&nmp->nm_lock); + nmp->nm_sockflags &= ~NMSOCK_DISCONNECTING; lck_mtx_unlock(&nmp->nm_lock); - sock_shutdown(so, SHUT_RDWR); - sock_close(so); } else { lck_mtx_unlock(&nmp->nm_lock); } @@ -536,6 +1631,7 @@ nfs_need_reconnect(struct nfsmount *nmp) lck_mtx_unlock(nfs_request_mutex); } + /* * 
thread to handle miscellaneous async NFS socket work (reconnects/resends) */ @@ -547,24 +1643,22 @@ nfs_mount_sock_thread(void *arg, __unused wait_result_t wr) thread_t thd = current_thread(); struct nfsreq *req; struct timeval now; - int error, dofinish, force; + int error, dofinish; nfsnode_t np; - fhandle_t fh; - nfs_stateid dstateid; lck_mtx_lock(&nmp->nm_lock); while (!(nmp->nm_sockflags & NMSOCK_READY) || !TAILQ_EMPTY(&nmp->nm_resendq) || + !LIST_EMPTY(&nmp->nm_monlist) || nmp->nm_deadto_start || - ((nmp->nm_vers >= NFS_VER4) && - ((nmp->nm_state & NFSSTA_RECOVER) || !TAILQ_EMPTY(&nmp->nm_recallq)))) + (nmp->nm_state & NFSSTA_RECOVER) || + ((nmp->nm_vers >= NFS_VER4) && !TAILQ_EMPTY(&nmp->nm_dreturnq))) { if (nmp->nm_sockflags & NMSOCK_UNMOUNT) break; - force = (nmp->nm_state & NFSSTA_FORCE); /* do reconnect, if necessary */ - if (!(nmp->nm_sockflags & NMSOCK_READY) && !force) { + if (!(nmp->nm_sockflags & NMSOCK_READY) && !(nmp->nm_state & NFSSTA_FORCE)) { if (nmp->nm_reconnect_start <= 0) { microuptime(&now); nmp->nm_reconnect_start = now.tv_sec; @@ -577,38 +1671,27 @@ nfs_mount_sock_thread(void *arg, __unused wait_result_t wr) } if ((nmp->nm_sockflags & NMSOCK_READY) && (nmp->nm_state & NFSSTA_RECOVER) && - !(nmp->nm_sockflags & NMSOCK_UNMOUNT) && !force) { + !(nmp->nm_sockflags & NMSOCK_UNMOUNT) && + !(nmp->nm_state & NFSSTA_FORCE)) { /* perform state recovery */ lck_mtx_unlock(&nmp->nm_lock); - nfs4_recover(nmp); + nfs_recover(nmp); lck_mtx_lock(&nmp->nm_lock); } - /* handle NFSv4 delegation recalls */ - while ((nmp->nm_vers >= NFS_VER4) && !force && + /* handle NFSv4 delegation returns */ + while ((nmp->nm_vers >= NFS_VER4) && !(nmp->nm_state & NFSSTA_FORCE) && (nmp->nm_sockflags & NMSOCK_READY) && !(nmp->nm_state & NFSSTA_RECOVER) && - ((np = TAILQ_FIRST(&nmp->nm_recallq)))) { - TAILQ_REMOVE(&nmp->nm_recallq, np, n_dlink); - np->n_dlink.tqe_next = NFSNOLIST; + ((np = TAILQ_FIRST(&nmp->nm_dreturnq)))) { lck_mtx_unlock(&nmp->nm_lock); - 
lck_mtx_lock(&np->n_openlock); - dstateid = np->n_dstateid; - if (np->n_openflags & N_DELEG_MASK) { - fh.fh_len = np->n_fhsize; - bcopy(np->n_fhp, &fh.fh_data, fh.fh_len); - np->n_openflags &= ~N_DELEG_MASK; - lck_mtx_unlock(&np->n_openlock); - nfs4_delegreturn_rpc(nmp, fh.fh_data, fh.fh_len, &dstateid, thd, nmp->nm_mcred); - } else { - lck_mtx_unlock(&np->n_openlock); - } + nfs4_delegation_return(np, R_RECOVER, thd, nmp->nm_mcred); lck_mtx_lock(&nmp->nm_lock); } /* do resends, if necessary/possible */ - while ((((nmp->nm_sockflags & NMSOCK_READY) && !(nmp->nm_state & NFSSTA_RECOVER)) || force) && + while ((((nmp->nm_sockflags & NMSOCK_READY) && !(nmp->nm_state & NFSSTA_RECOVER)) || (nmp->nm_state & NFSSTA_FORCE)) && ((req = TAILQ_FIRST(&nmp->nm_resendq)))) { if (req->r_resendtime) microuptime(&now); - while (req && !force && req->r_resendtime && (now.tv_sec < req->r_resendtime)) + while (req && !(nmp->nm_state & NFSSTA_FORCE) && req->r_resendtime && (now.tv_sec < req->r_resendtime)) req = TAILQ_NEXT(req, r_rchain); if (!req) break; @@ -626,20 +1709,20 @@ nfs_mount_sock_thread(void *arg, __unused wait_result_t wr) lck_mtx_lock(&nmp->nm_lock); continue; } - if ((req->r_flags & R_RESTART) || req->r_gss_ctx) { + if ((req->r_flags & R_RESTART) || nfs_request_using_gss(req)) { req->r_flags &= ~R_RESTART; req->r_resendtime = 0; lck_mtx_unlock(&req->r_mtx); /* async RPCs on GSS mounts need to be rebuilt and resent. */ nfs_reqdequeue(req); - if (req->r_gss_ctx) { + if (nfs_request_using_gss(req)) { nfs_gss_clnt_rpcdone(req); error = nfs_gss_clnt_args_restore(req); if (error == ENEEDAUTH) req->r_xid = 0; } NFS_SOCK_DBG(("nfs async%s restart: p %d x 0x%llx f 0x%x rtt %d\n", - req->r_gss_ctx ? " gss" : "", req->r_procnum, req->r_xid, + nfs_request_using_gss(req) ? " gss" : "", req->r_procnum, req->r_xid, req->r_flags, req->r_rtt)); error = !req->r_nmp ? ENXIO : 0; /* unmounted? 
*/ if (!error) @@ -693,20 +1776,45 @@ nfs_mount_sock_thread(void *arg, __unused wait_result_t wr) } if (nmp->nm_deadto_start) nfs_mount_check_dead_timeout(nmp); - if (force || (nmp->nm_state & NFSSTA_DEAD)) + if (nmp->nm_state & (NFSSTA_FORCE|NFSSTA_DEAD)) break; - if ((nmp->nm_sockflags & NMSOCK_READY) || (nmp->nm_state & NFSSTA_RECOVER)) { + /* check monitored nodes, if necessary/possible */ + if (!LIST_EMPTY(&nmp->nm_monlist)) { + nmp->nm_state |= NFSSTA_MONITOR_SCAN; + LIST_FOREACH(np, &nmp->nm_monlist, n_monlink) { + if (!(nmp->nm_sockflags & NMSOCK_READY) || (nmp->nm_state & (NFSSTA_RECOVER|NFSSTA_UNMOUNTING|NFSSTA_FORCE))) + break; + np->n_mflag |= NMMONSCANINPROG; + lck_mtx_unlock(&nmp->nm_lock); + error = nfs_getattr(np, NULL, vfs_context_kernel(), (NGA_UNCACHED|NGA_MONITOR)); + if (!error && ISSET(np->n_flag, NUPDATESIZE)) /* update quickly to avoid multiple events */ + nfs_data_update_size(np, 0); + lck_mtx_lock(&nmp->nm_lock); + np->n_mflag &= ~NMMONSCANINPROG; + if (np->n_mflag & NMMONSCANWANT) { + np->n_mflag &= ~NMMONSCANWANT; + wakeup(&np->n_mflag); + } + if (error || !(nmp->nm_sockflags & NMSOCK_READY) || (nmp->nm_state & (NFSSTA_RECOVER|NFSSTA_UNMOUNTING|NFSSTA_FORCE))) + break; + } + nmp->nm_state &= ~NFSSTA_MONITOR_SCAN; + if (nmp->nm_state & NFSSTA_UNMOUNTING) + wakeup(&nmp->nm_state); /* let unmounting thread know scan is done */ + } + if ((nmp->nm_sockflags & NMSOCK_READY) || (nmp->nm_state & (NFSSTA_RECOVER|NFSSTA_UNMOUNTING))) { if (nmp->nm_deadto_start || !TAILQ_EMPTY(&nmp->nm_resendq) || (nmp->nm_state & NFSSTA_RECOVER)) ts.tv_sec = 1; else - ts.tv_sec = 30; + ts.tv_sec = 5; msleep(&nmp->nm_sockthd, &nmp->nm_lock, PSOCK, "nfssockthread", &ts); } } /* If we're unmounting, send the unmount RPC, if requested/appropriate. 
*/ - if ((nmp->nm_sockflags & NMSOCK_UNMOUNT) && (nmp->nm_flag & NFSMNT_CALLUMNT) && + if ((nmp->nm_sockflags & NMSOCK_UNMOUNT) && + (nmp->nm_state & NFSSTA_MOUNTED) && NMFLAG(nmp, CALLUMNT) && (nmp->nm_vers < NFS_VER4) && !(nmp->nm_state & (NFSSTA_FORCE|NFSSTA_DEAD))) { lck_mtx_unlock(&nmp->nm_lock); nfs3_umount_rpc(nmp, vfs_context_kernel(), @@ -741,7 +1849,7 @@ nfs_mount_check_dead_timeout(struct nfsmount *nmp) { struct timeval now; - if (!(nmp->nm_flag & NFSMNT_DEADTIMEOUT)) + if (nmp->nm_deadtimeout <= 0) return; if (nmp->nm_deadto_start == 0) return; @@ -755,20 +1863,6 @@ nfs_mount_check_dead_timeout(struct nfsmount *nmp) vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_DEAD, 0); } -/* - * RPC record marker parsing state - */ -struct nfs_rpc_record_state -{ - uint16_t nrrs_lastfrag; /* last fragment of record */ - uint16_t nrrs_markerleft; /* marker bytes remaining */ - uint32_t nrrs_fragleft; /* fragment bytes remaining */ - uint32_t nrrs_reclen; /* length of RPC record */ - mbuf_t nrrs_m; /* mbufs for current record */ - mbuf_t nrrs_mlast; -}; -int nfs_rpc_record_read(socket_t, struct nfs_rpc_record_state *, int *, mbuf_t *); - /* * NFS callback channel socket state */ @@ -776,7 +1870,7 @@ struct nfs_callback_socket { TAILQ_ENTRY(nfs_callback_socket) ncbs_link; socket_t ncbs_so; /* the socket */ - struct sockaddr_in ncbs_sin; /* socket address */ + struct sockaddr_storage ncbs_saddr; /* socket address */ struct nfs_rpc_record_state ncbs_rrs; /* RPC record parsing state */ time_t ncbs_stamp; /* last accessed at */ uint32_t ncbs_flags; /* see below */ @@ -795,7 +1889,9 @@ struct nfs_callback_socket * the requests up with mounts. 
*/ socket_t nfs4_cb_so = NULL; +socket_t nfs4_cb_so6 = NULL; in_port_t nfs4_cb_port = 0; +in_port_t nfs4_cb_port6 = 0; uint32_t nfs4_cb_id = 0; uint32_t nfs4_cb_so_usecount = 0; TAILQ_HEAD(nfs4_cb_sock_list,nfs_callback_socket) nfs4_cb_socks; @@ -813,9 +1909,12 @@ void nfs4_mount_callback_setup(struct nfsmount *nmp) { struct sockaddr_in sin; + struct sockaddr_in6 sin6; socket_t so = NULL; + socket_t so6 = NULL; struct timeval timeo; int error, on = 1; + in_port_t port; lck_mtx_lock(nfs_global_mutex); if (nfs4_cb_id == 0) { @@ -834,32 +1933,34 @@ nfs4_mount_callback_setup(struct nfsmount *nmp) return; } + /* IPv4 */ error = sock_socket(AF_INET, SOCK_STREAM, IPPROTO_TCP, nfs4_cb_accept, NULL, &nfs4_cb_so); if (error) { - log(LOG_INFO, "nfs callback setup: error %d creating listening socket\n", error); + log(LOG_INFO, "nfs callback setup: error %d creating listening IPv4 socket\n", error); goto fail; } so = nfs4_cb_so; + sock_setsockopt(so, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)); sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_addr.s_addr = htonl(INADDR_ANY); - sin.sin_port = 0; + sin.sin_port = htons(nfs_callback_port); /* try to use specified port */ error = sock_bind(so, (struct sockaddr *)&sin); if (error) { - log(LOG_INFO, "nfs callback setup: error %d binding listening socket\n", error); + log(LOG_INFO, "nfs callback setup: error %d binding listening IPv4 socket\n", error); goto fail; } error = sock_getsockname(so, (struct sockaddr *)&sin, sin.sin_len); if (error) { - log(LOG_INFO, "nfs callback setup: error %d getting listening socket port\n", error); + log(LOG_INFO, "nfs callback setup: error %d getting listening IPv4 socket port\n", error); goto fail; } nfs4_cb_port = ntohs(sin.sin_port); error = sock_listen(so, 32); if (error) { - log(LOG_INFO, "nfs callback setup: error %d on listen\n", error); + log(LOG_INFO, "nfs callback setup: error %d on IPv4 listen\n", error); goto fail; } @@ -868,23 +1969,81 @@ 
nfs4_mount_callback_setup(struct nfsmount *nmp) timeo.tv_sec = 60; error = sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo)); if (error) - log(LOG_INFO, "nfs callback setup: error %d setting socket rx timeout\n", error); + log(LOG_INFO, "nfs callback setup: error %d setting IPv4 socket rx timeout\n", error); error = sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo)); if (error) - log(LOG_INFO, "nfs callback setup: error %d setting socket tx timeout\n", error); + log(LOG_INFO, "nfs callback setup: error %d setting IPv4 socket tx timeout\n", error); sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on)); sock_setsockopt(so, SOL_SOCKET, SO_NOADDRERR, &on, sizeof(on)); sock_setsockopt(so, SOL_SOCKET, SO_UPCALLCLOSEWAIT, &on, sizeof(on)); error = 0; + /* IPv6 */ + error = sock_socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP, nfs4_cb_accept, NULL, &nfs4_cb_so6); + if (error) { + log(LOG_INFO, "nfs callback setup: error %d creating listening IPv6 socket\n", error); + goto fail; + } + so6 = nfs4_cb_so6; + + sock_setsockopt(so6, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)); + sock_setsockopt(so6, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on)); + /* try to use specified port or same port as IPv4 */ + port = nfs_callback_port ? 
nfs_callback_port : nfs4_cb_port; +ipv6_bind_again: + sin6.sin6_len = sizeof(struct sockaddr_in6); + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = in6addr_any; + sin6.sin6_port = htons(port); + error = sock_bind(so6, (struct sockaddr *)&sin6); + if (error) { + if (port != nfs_callback_port) { + /* if we simply tried to match the IPv4 port, then try any port */ + port = 0; + goto ipv6_bind_again; + } + log(LOG_INFO, "nfs callback setup: error %d binding listening IPv6 socket\n", error); + goto fail; + } + error = sock_getsockname(so6, (struct sockaddr *)&sin6, sin6.sin6_len); + if (error) { + log(LOG_INFO, "nfs callback setup: error %d getting listening IPv6 socket port\n", error); + goto fail; + } + nfs4_cb_port6 = ntohs(sin6.sin6_port); + + error = sock_listen(so6, 32); + if (error) { + log(LOG_INFO, "nfs callback setup: error %d on IPv6 listen\n", error); + goto fail; + } + + /* receive timeout shouldn't matter. If timeout on send, we'll want to drop the socket */ + timeo.tv_usec = 0; + timeo.tv_sec = 60; + error = sock_setsockopt(so6, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo)); + if (error) + log(LOG_INFO, "nfs callback setup: error %d setting IPv6 socket rx timeout\n", error); + error = sock_setsockopt(so6, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo)); + if (error) + log(LOG_INFO, "nfs callback setup: error %d setting IPv6 socket tx timeout\n", error); + sock_setsockopt(so6, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on)); + sock_setsockopt(so6, SOL_SOCKET, SO_NOADDRERR, &on, sizeof(on)); + sock_setsockopt(so6, SOL_SOCKET, SO_UPCALLCLOSEWAIT, &on, sizeof(on)); + error = 0; + fail: if (error) { - nfs4_cb_so = NULL; + nfs4_cb_so = nfs4_cb_so6 = NULL; lck_mtx_unlock(nfs_global_mutex); if (so) { sock_shutdown(so, SHUT_RDWR); sock_close(so); } + if (so6) { + sock_shutdown(so6, SHUT_RDWR); + sock_close(so6); + } } else { lck_mtx_unlock(nfs_global_mutex); } @@ -901,7 +2060,7 @@ void nfs4_mount_callback_shutdown(struct nfsmount *nmp) { struct nfs_callback_socket 
*ncbsp; - socket_t so; + socket_t so, so6; struct nfs4_cb_sock_list cb_socks; struct timespec ts = {1,0}; @@ -910,12 +2069,14 @@ nfs4_mount_callback_shutdown(struct nfsmount *nmp) /* wait for any callbacks in progress to complete */ while (nmp->nm_cbrefs) msleep(&nmp->nm_cbrefs, nfs_global_mutex, PSOCK, "cbshutwait", &ts); + nmp->nm_cbid = 0; if (--nfs4_cb_so_usecount) { lck_mtx_unlock(nfs_global_mutex); return; } so = nfs4_cb_so; - nfs4_cb_so = NULL; + so6 = nfs4_cb_so6; + nfs4_cb_so = nfs4_cb_so6 = NULL; TAILQ_INIT(&cb_socks); TAILQ_CONCAT(&cb_socks, &nfs4_cb_socks, ncbs_link); lck_mtx_unlock(nfs_global_mutex); @@ -923,10 +2084,15 @@ nfs4_mount_callback_shutdown(struct nfsmount *nmp) sock_shutdown(so, SHUT_RDWR); sock_close(so); } + if (so6) { + sock_shutdown(so6, SHUT_RDWR); + sock_close(so6); + } while ((ncbsp = TAILQ_FIRST(&cb_socks))) { TAILQ_REMOVE(&cb_socks, ncbsp, ncbs_link); sock_shutdown(ncbsp->ncbs_so, SHUT_RDWR); sock_close(ncbsp->ncbs_so); + nfs_rpc_record_state_cleanup(&ncbsp->ncbs_rrs); FREE(ncbsp, M_TEMP); } } @@ -958,6 +2124,7 @@ nfs4_callback_timer(__unused void *param0, __unused void *param1) lck_mtx_unlock(nfs_global_mutex); sock_shutdown(ncbsp->ncbs_so, SHUT_RDWR); sock_close(ncbsp->ncbs_so); + nfs_rpc_record_state_cleanup(&ncbsp->ncbs_rrs); FREE(ncbsp, M_TEMP); goto loop; } @@ -977,10 +2144,13 @@ nfs4_cb_accept(socket_t so, __unused void *arg, __unused int waitflag) struct nfs_callback_socket *ncbsp; struct nfsmount *nmp; struct timeval timeo, now; - struct sockaddr_in *saddr; - int error, on = 1; + int error, on = 1, ip; - if (so != nfs4_cb_so) + if (so == nfs4_cb_so) + ip = 4; + else if (so == nfs4_cb_so6) + ip = 6; + else return; /* allocate/initialize a new nfs_callback_socket */ @@ -990,15 +2160,15 @@ nfs4_cb_accept(socket_t so, __unused void *arg, __unused int waitflag) return; } bzero(ncbsp, sizeof(*ncbsp)); - ncbsp->ncbs_sin.sin_len = sizeof(struct sockaddr_in); - ncbsp->ncbs_rrs.nrrs_markerleft = 
sizeof(ncbsp->ncbs_rrs.nrrs_fragleft); + ncbsp->ncbs_saddr.ss_len = (ip == 4) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); + nfs_rpc_record_state_init(&ncbsp->ncbs_rrs); /* accept a new socket */ - error = sock_accept(so, (struct sockaddr*)&ncbsp->ncbs_sin, - ncbsp->ncbs_sin.sin_len, MSG_DONTWAIT, + error = sock_accept(so, (struct sockaddr*)&ncbsp->ncbs_saddr, + ncbsp->ncbs_saddr.ss_len, MSG_DONTWAIT, nfs4_cb_rcv, ncbsp, &newso); if (error) { - log(LOG_INFO, "nfs callback accept: error %d accepting socket\n", error); + log(LOG_INFO, "nfs callback accept: error %d accepting IPv%d socket\n", error, ip); FREE(ncbsp, M_TEMP); return; } @@ -1009,11 +2179,12 @@ nfs4_cb_accept(socket_t so, __unused void *arg, __unused int waitflag) timeo.tv_sec = 60; error = sock_setsockopt(newso, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo)); if (error) - log(LOG_INFO, "nfs callback socket: error %d setting socket rx timeout\n", error); + log(LOG_INFO, "nfs callback socket: error %d setting IPv%d socket rx timeout\n", error, ip); error = sock_setsockopt(newso, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo)); if (error) - log(LOG_INFO, "nfs callback socket: error %d setting socket tx timeout\n", error); + log(LOG_INFO, "nfs callback socket: error %d setting IPv%d socket tx timeout\n", error, ip); sock_setsockopt(newso, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on)); + sock_setsockopt(newso, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)); sock_setsockopt(newso, SOL_SOCKET, SO_NOADDRERR, &on, sizeof(on)); sock_setsockopt(newso, SOL_SOCKET, SO_UPCALLCLOSEWAIT, &on, sizeof(on)); @@ -1028,11 +2199,10 @@ nfs4_cb_accept(socket_t so, __unused void *arg, __unused int waitflag) /* verify it's from a host we have mounted */ TAILQ_FOREACH(nmp, &nfs4_cb_mounts, nm_cblink) { - /* check socket's source address matches this mount's server address */ - saddr = mbuf_data(nmp->nm_nam); - if ((ncbsp->ncbs_sin.sin_len == saddr->sin_len) && - (ncbsp->ncbs_sin.sin_family == saddr->sin_family) && - 
(ncbsp->ncbs_sin.sin_addr.s_addr == saddr->sin_addr.s_addr)) + /* check if socket's source address matches this mount's server address */ + if (!nmp->nm_saddr) + continue; + if (nfs_sockaddr_cmp((struct sockaddr*)&ncbsp->ncbs_saddr, nmp->nm_saddr) == 0) break; } if (!nmp) /* we don't want this socket, mark it dead */ @@ -1077,7 +2247,7 @@ nfs4_cb_rcv(socket_t so, void *arg, __unused int waitflag) /* loop while we make error-free progress */ while (!error && recv) { - error = nfs_rpc_record_read(so, &ncbsp->ncbs_rrs, &recv, &m); + error = nfs_rpc_record_read(so, &ncbsp->ncbs_rrs, MSG_DONTWAIT, &recv, &m); if (m) /* handle the request */ error = nfs4_cb_handler(ncbsp, m); } @@ -1111,7 +2281,6 @@ nfs4_cb_handler(struct nfs_callback_socket *ncbsp, mbuf_t mreq) socket_t so = ncbsp->ncbs_so; struct nfsm_chain nmreq, nmrep; mbuf_t mhead = NULL, mrest = NULL, m; - struct sockaddr_in *saddr; struct msghdr msg; struct nfsmount *nmp; fhandle_t fh; @@ -1203,12 +2372,10 @@ nfs4_cb_handler(struct nfs_callback_socket *ncbsp, mbuf_t mreq) if (nmp->nm_cbid != cbid) continue; /* verify socket's source address matches this mount's server address */ - saddr = mbuf_data(nmp->nm_nam); - if ((ncbsp->ncbs_sin.sin_len != saddr->sin_len) || - (ncbsp->ncbs_sin.sin_family != saddr->sin_family) || - (ncbsp->ncbs_sin.sin_addr.s_addr != saddr->sin_addr.s_addr)) + if (!nmp->nm_saddr) continue; - break; + if (nfs_sockaddr_cmp((struct sockaddr*)&ncbsp->ncbs_saddr, nmp->nm_saddr) == 0) + break; } /* mark the NFS mount as busy */ if (nmp) @@ -1240,7 +2407,7 @@ nfs4_cb_handler(struct nfs_callback_socket *ncbsp, mbuf_t mreq) numops = 0; /* don't process any more ops */ } else { /* find the node for the file handle */ - error = nfs_nget(nmp->nm_mountp, NULL, NULL, fh.fh_data, fh.fh_len, NULL, NULL, NG_NOCREATE, &np); + error = nfs_nget(nmp->nm_mountp, NULL, NULL, fh.fh_data, fh.fh_len, NULL, NULL, RPCAUTH_UNKNOWN, NG_NOCREATE, &np); if (error || !np) { status = NFSERR_BADHANDLE; error = 0; @@ -1301,7 
+2468,7 @@ nfs4_cb_handler(struct nfs_callback_socket *ncbsp, mbuf_t mreq) numops = 0; /* don't process any more ops */ } else { /* find the node for the file handle */ - error = nfs_nget(nmp->nm_mountp, NULL, NULL, fh.fh_data, fh.fh_len, NULL, NULL, NG_NOCREATE, &np); + error = nfs_nget(nmp->nm_mountp, NULL, NULL, fh.fh_data, fh.fh_len, NULL, NULL, RPCAUTH_UNKNOWN, NG_NOCREATE, &np); if (error || !np) { status = NFSERR_BADHANDLE; error = 0; @@ -1313,14 +2480,8 @@ nfs4_cb_handler(struct nfs_callback_socket *ncbsp, mbuf_t mreq) status = NFSERR_BAD_STATEID; numops = 0; /* don't process any more ops */ } - if (!status) { - /* add node to recall queue, and wake socket thread */ - lck_mtx_lock(&nmp->nm_lock); - if (np->n_dlink.tqe_next == NFSNOLIST) - TAILQ_INSERT_TAIL(&nmp->nm_recallq, np, n_dlink); - nfs_mount_sock_thread_wake(nmp); - lck_mtx_unlock(&nmp->nm_lock); - } + if (!status) /* add node to recall queue, and wake socket thread */ + nfs4_delegation_return_enqueue(np); if (np) { nfs_node_unlock(np); vnode_put(NFSTOV(np)); @@ -1456,6 +2617,28 @@ nfs4_cb_handler(struct nfs_callback_socket *ncbsp, mbuf_t mreq) } +/* + * Initialize an nfs_rpc_record_state structure. + */ +void +nfs_rpc_record_state_init(struct nfs_rpc_record_state *nrrsp) +{ + bzero(nrrsp, sizeof(*nrrsp)); + nrrsp->nrrs_markerleft = sizeof(nrrsp->nrrs_fragleft); +} + +/* + * Clean up an nfs_rpc_record_state structure. + */ +void +nfs_rpc_record_state_cleanup(struct nfs_rpc_record_state *nrrsp) +{ + if (nrrsp->nrrs_m) { + mbuf_freem(nrrsp->nrrs_m); + nrrsp->nrrs_m = nrrsp->nrrs_mlast = NULL; + } +} + /* * Read the next (marked) RPC record from the socket. 
* @@ -1463,7 +2646,7 @@ nfs4_cb_handler(struct nfs_callback_socket *ncbsp, mbuf_t mreq) * *mp returns the next complete RPC record */ int -nfs_rpc_record_read(socket_t so, struct nfs_rpc_record_state *nrrsp, int *recvp, mbuf_t *mp) +nfs_rpc_record_read(socket_t so, struct nfs_rpc_record_state *nrrsp, int flags, int *recvp, mbuf_t *mp) { struct iovec aio; struct msghdr msg; @@ -1482,7 +2665,7 @@ nfs_rpc_record_read(socket_t so, struct nfs_rpc_record_state *nrrsp, int *recvp, bzero(&msg, sizeof(msg)); msg.msg_iov = &aio; msg.msg_iovlen = 1; - error = sock_receive(so, &msg, MSG_DONTWAIT, &rcvlen); + error = sock_receive(so, &msg, flags, &rcvlen); if (error || !rcvlen) break; *recvp = 1; @@ -1497,10 +2680,7 @@ nfs_rpc_record_read(socket_t so, struct nfs_rpc_record_state *nrrsp, int *recvp, } nrrsp->nrrs_reclen += nrrsp->nrrs_fragleft; if (nrrsp->nrrs_reclen > NFS_MAXPACKET) { - /* - * This is SERIOUS! We are out of sync with the sender - * and forcing a disconnect/reconnect is all I can do. - */ + /* This is SERIOUS! We are out of sync with the sender. 
*/ log(LOG_ERR, "impossible RPC record length (%d) on callback", nrrsp->nrrs_reclen); error = EFBIG; } @@ -1510,7 +2690,7 @@ nfs_rpc_record_read(socket_t so, struct nfs_rpc_record_state *nrrsp, int *recvp, while (!error && !nrrsp->nrrs_markerleft && nrrsp->nrrs_fragleft) { m = NULL; rcvlen = nrrsp->nrrs_fragleft; - error = sock_receivembuf(so, NULL, &m, MSG_DONTWAIT, &rcvlen); + error = sock_receivembuf(so, NULL, &m, flags, &rcvlen); if (error || !rcvlen || !m) break; *recvp = 1; @@ -1579,7 +2759,7 @@ int nfs_send(struct nfsreq *req, int wait) { struct nfsmount *nmp; - socket_t so; + struct nfs_socket *nso; int error, error2, sotype, rexmit, slpflag = 0, needrecon; struct msghdr msg; struct sockaddr *sendnam; @@ -1597,7 +2777,7 @@ nfs_send(struct nfsreq *req, int wait) return (error); } - error = nfs_sigintr(req->r_nmp, req, req->r_thread, 0); + error = nfs_sigintr(req->r_nmp, req, NULL, 0); if (error) { nfs_sndunlock(req); lck_mtx_lock(&req->r_mtx); @@ -1629,7 +2809,7 @@ nfs_send(struct nfsreq *req, int wait) lck_mtx_lock(&nmp->nm_lock); if (!(nmp->nm_sockflags & NMSOCK_READY) && !((nmp->nm_sockflags & NMSOCK_SETUP) && (req->r_flags & R_SETUP))) { - if (nmp->nm_flag & NFSMNT_INT) + if (NMFLAG(nmp, INTR) && !(req->r_flags & R_NOINTR)) slpflag |= PCATCH; lck_mtx_unlock(&nmp->nm_lock); nfs_sndunlock(req); @@ -1653,7 +2833,7 @@ nfs_send(struct nfsreq *req, int wait) error = EIO; break; } - if ((nmp->nm_flag & NFSMNT_SOFT) && (nmp->nm_reconnect_start > 0)) { + if (NMFLAG(nmp, SOFT) && (nmp->nm_reconnect_start > 0)) { struct timeval now; microuptime(&now); if ((now.tv_sec - nmp->nm_reconnect_start) >= 8) { @@ -1681,9 +2861,11 @@ nfs_send(struct nfsreq *req, int wait) } goto again; } - so = nmp->nm_so; + nso = nmp->nm_nso; + /* note that we're using the mount's socket to do the send */ + nmp->nm_state |= NFSSTA_SENDING; /* will be cleared by nfs_sndunlock() */ lck_mtx_unlock(&nmp->nm_lock); - if (!so) { + if (!nso) { nfs_sndunlock(req); lck_mtx_lock(&req->r_mtx); 
req->r_flags &= ~R_SENDING; @@ -1700,7 +2882,7 @@ nfs_send(struct nfsreq *req, int wait) lck_mtx_lock(&nmp->nm_lock); if (!(req->r_flags & R_CWND) && (nmp->nm_sent >= nmp->nm_cwnd)) { /* if we can't send this out yet, wait on the cwnd queue */ - slpflag = ((nmp->nm_flag & NFSMNT_INT) && req->r_thread) ? PCATCH : 0; + slpflag = (NMFLAG(nmp, INTR) && req->r_thread) ? PCATCH : 0; lck_mtx_unlock(&nmp->nm_lock); nfs_sndunlock(req); req->r_flags &= ~R_SENDING; @@ -1764,13 +2946,11 @@ nfs_send(struct nfsreq *req, int wait) } bzero(&msg, sizeof(msg)); - if (nmp->nm_nam && (sotype != SOCK_STREAM) && !sock_isconnected(so)) { - if ((sendnam = mbuf_data(nmp->nm_nam))) { - msg.msg_name = (caddr_t)sendnam; - msg.msg_namelen = sendnam->sa_len; - } + if ((sotype != SOCK_STREAM) && !sock_isconnected(nso->nso_so) && ((sendnam = nmp->nm_saddr))) { + msg.msg_name = (caddr_t)sendnam; + msg.msg_namelen = sendnam->sa_len; } - error = sock_sendmbuf(so, &msg, mreqcopy, 0, &sentlen); + error = sock_sendmbuf(nso->nso_so, &msg, mreqcopy, 0, &sentlen); #ifdef NFS_SOCKET_DEBUGGING if (error || (sentlen != req->r_mreqlen)) NFS_SOCK_DBG(("nfs_send: 0x%llx sent %d/%d error %d\n", @@ -1820,9 +3000,9 @@ nfs_send(struct nfsreq *req, int wait) * For now, ignore them all */ if ((error != EINTR) && (error != ERESTART) && - (error != EWOULDBLOCK) && (error != EIO)) { + (error != EWOULDBLOCK) && (error != EIO) && (nso == nmp->nm_nso)) { int clearerror = 0, optlen = sizeof(clearerror); - sock_getsockopt(so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen); + sock_getsockopt(nso->nso_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen); #ifdef NFS_SOCKET_DEBUGGING if (clearerror) NFS_SOCK_DBG(("nfs_send: ignoring UDP socket error %d so %d\n", @@ -1852,7 +3032,7 @@ nfs_send(struct nfsreq *req, int wait) needrecon = 1; break; } - if (needrecon) { /* mark socket as needing reconnect */ + if (needrecon && (nso == nmp->nm_nso)) { /* mark socket as needing reconnect */ NFS_SOCK_DBG(("nfs_send: 0x%llx need reconnect %d\n", 
req->r_xid, error)); nfs_need_reconnect(nmp); } @@ -1902,20 +3082,19 @@ void nfs_udp_rcv(socket_t so, void *arg, __unused int waitflag) { struct nfsmount *nmp = arg; + struct nfs_socket *nso = nmp->nm_nso; size_t rcvlen; mbuf_t m; int error = 0; - if (nmp->nm_sockflags & NMSOCK_CONNECTING) { - wakeup(&nmp->nm_so); - return; - } - - /* make sure we're on the current socket */ - if (nmp->nm_so != so) + if (nmp->nm_sockflags & NMSOCK_CONNECTING) return; do { + /* make sure we're on the current socket */ + if (!nso || (nso->nso_so != so)) + return; + m = NULL; rcvlen = 1000000; error = sock_receivembuf(so, NULL, &m, MSG_DONTWAIT, &rcvlen); @@ -1935,123 +3114,54 @@ void nfs_tcp_rcv(socket_t so, void *arg, __unused int waitflag) { struct nfsmount *nmp = arg; - struct iovec aio; - struct msghdr msg; - size_t rcvlen; + struct nfs_socket *nso = nmp->nm_nso; + struct nfs_rpc_record_state nrrs; mbuf_t m; int error = 0; - int recv; + int recv = 1; - if (nmp->nm_sockflags & NMSOCK_CONNECTING) { - wakeup(&nmp->nm_so); + if (nmp->nm_sockflags & NMSOCK_CONNECTING) return; - } /* make sure we're on the current socket */ - if (nmp->nm_so != so) - return; - lck_mtx_lock(&nmp->nm_lock); - if (nmp->nm_sockflags & NMSOCK_UPCALL) { - /* upcall is already receiving data - just return */ + nso = nmp->nm_nso; + if (!nso || (nso->nso_so != so) || (nmp->nm_sockflags & (NMSOCK_DISCONNECTING))) { lck_mtx_unlock(&nmp->nm_lock); return; } - nmp->nm_sockflags |= NMSOCK_UPCALL; - -nextfrag: - recv = 0; - - /* read the TCP RPC record marker */ - while (!error && nmp->nm_markerleft) { - aio.iov_base = ((char*)&nmp->nm_fragleft + - sizeof(nmp->nm_fragleft) - nmp->nm_markerleft); - aio.iov_len = nmp->nm_markerleft; - bzero(&msg, sizeof(msg)); - msg.msg_iov = &aio; - msg.msg_iovlen = 1; - lck_mtx_unlock(&nmp->nm_lock); - error = sock_receive(so, &msg, MSG_DONTWAIT, &rcvlen); - lck_mtx_lock(&nmp->nm_lock); - if (error || !rcvlen) - break; - recv = 1; - nmp->nm_markerleft -= rcvlen; - if 
(nmp->nm_markerleft) - continue; - /* record marker complete */ - nmp->nm_fragleft = ntohl(nmp->nm_fragleft); - if (nmp->nm_fragleft & 0x80000000) { - nmp->nm_sockflags |= NMSOCK_LASTFRAG; - nmp->nm_fragleft &= ~0x80000000; - } - nmp->nm_reclen += nmp->nm_fragleft; - if (nmp->nm_reclen > NFS_MAXPACKET) { - /* - * This is SERIOUS! We are out of sync with the sender - * and forcing a disconnect/reconnect is all I can do. - */ - log(LOG_ERR, "%s (%d) from nfs server %s\n", - "impossible RPC record length", nmp->nm_reclen, - vfs_statfs(nmp->nm_mountp)->f_mntfromname); - error = EFBIG; - } - } + lck_mtx_unlock(&nmp->nm_lock); - /* read the TCP RPC record fragment */ - while (!error && !nmp->nm_markerleft && nmp->nm_fragleft) { - m = NULL; - rcvlen = nmp->nm_fragleft; - lck_mtx_unlock(&nmp->nm_lock); - error = sock_receivembuf(so, NULL, &m, MSG_DONTWAIT, &rcvlen); - lck_mtx_lock(&nmp->nm_lock); - if (error || !rcvlen || !m) - break; - recv = 1; - /* append mbufs to list */ - nmp->nm_fragleft -= rcvlen; - if (!nmp->nm_m) { - nmp->nm_m = m; - } else { - error = mbuf_setnext(nmp->nm_mlast, m); - if (error) { - printf("nfs_tcp_rcv: mbuf_setnext failed %d\n", error); - mbuf_freem(m); - break; - } - } - while (mbuf_next(m)) - m = mbuf_next(m); - nmp->nm_mlast = m; + /* make sure this upcall should be trying to do work */ + lck_mtx_lock(&nso->nso_lock); + if (nso->nso_flags & (NSO_UPCALL|NSO_DISCONNECTING|NSO_DEAD)) { + lck_mtx_unlock(&nso->nso_lock); + return; } + nso->nso_flags |= NSO_UPCALL; + nrrs = nso->nso_rrs; + lck_mtx_unlock(&nso->nso_lock); - /* done reading fragment? 
*/ - m = NULL; - if (!error && !nmp->nm_markerleft && !nmp->nm_fragleft) { - /* reset socket fragment parsing state */ - nmp->nm_markerleft = sizeof(nmp->nm_fragleft); - if (nmp->nm_sockflags & NMSOCK_LASTFRAG) { - /* RPC record complete */ - m = nmp->nm_m; - /* reset socket record parsing state */ - nmp->nm_reclen = 0; - nmp->nm_m = nmp->nm_mlast = NULL; - nmp->nm_sockflags &= ~NMSOCK_LASTFRAG; - } + /* loop while we make error-free progress */ + while (!error && recv) { + error = nfs_rpc_record_read(so, &nrrs, MSG_DONTWAIT, &recv, &m); + if (m) /* match completed response with request */ + nfs_request_match_reply(nmp, m); } - if (m) { /* match completed response with request */ + lck_mtx_lock(&nmp->nm_lock); + if (nmp->nm_nso == nso) { + /* still the same socket, so update socket's RPC parsing state */ + lck_mtx_unlock(&nmp->nm_lock); + lck_mtx_lock(&nso->nso_lock); + nso->nso_rrs = nrrs; + nso->nso_flags &= ~NSO_UPCALL; + lck_mtx_unlock(&nso->nso_lock); + if (nmp->nm_sockflags & NMSOCK_DISCONNECTING) + wakeup(&nmp->nm_sockflags); + } else { lck_mtx_unlock(&nmp->nm_lock); - nfs_request_match_reply(nmp, m); - lck_mtx_lock(&nmp->nm_lock); } - - /* loop if we've been making error-free progress */ - if (!error && recv) - goto nextfrag; - - nmp->nm_sockflags &= ~NMSOCK_UPCALL; - lck_mtx_unlock(&nmp->nm_lock); #ifdef NFS_SOCKET_DEBUGGING if (!recv && (error != EWOULDBLOCK)) NFS_SOCK_DBG(("nfs_tcp_rcv: got nothing, error %d, got FIN?\n", error)); @@ -2077,7 +3187,8 @@ nfs_sock_poke(struct nfsmount *nmp) int dummy; lck_mtx_lock(&nmp->nm_lock); - if ((nmp->nm_sockflags & NMSOCK_UNMOUNT) || !nmp->nm_so) { + if ((nmp->nm_sockflags & NMSOCK_UNMOUNT) || + !(nmp->nm_sockflags & NMSOCK_READY) || !nmp->nm_nso || !nmp->nm_nso->nso_so) { lck_mtx_unlock(&nmp->nm_lock); return; } @@ -2088,7 +3199,7 @@ nfs_sock_poke(struct nfsmount *nmp) bzero(&msg, sizeof(msg)); msg.msg_iov = &aio; msg.msg_iovlen = 1; - error = sock_send(nmp->nm_so, &msg, MSG_DONTWAIT, &len); + error = 
sock_send(nmp->nm_nso->nso_so, &msg, MSG_DONTWAIT, &len); NFS_SOCK_DBG(("nfs_sock_poke: error %d\n", error)); } @@ -2183,7 +3294,7 @@ nfs_request_match_reply(struct nfsmount *nmp, mbuf_t mrep) /* signal anyone waiting on this request */ wakeup(req); asyncioq = (req->r_callback.rcb_func != NULL); - if (req->r_gss_ctx != NULL) + if (nfs_request_using_gss(req)) nfs_gss_clnt_rpcdone(req); lck_mtx_unlock(&req->r_mtx); lck_mtx_unlock(nfs_request_mutex); @@ -2209,16 +3320,16 @@ int nfs_wait_reply(struct nfsreq *req) { struct timespec ts = { 2, 0 }; - int error = 0, slpflag; + int error = 0, slpflag, first = 1; - if (req->r_nmp && (req->r_nmp->nm_flag & NFSMNT_INT) && req->r_thread) + if (req->r_nmp && NMFLAG(req->r_nmp, INTR) && req->r_thread && !(req->r_flags & R_NOINTR)) slpflag = PCATCH; else slpflag = 0; lck_mtx_lock(&req->r_mtx); while (!req->r_nmrep.nmc_mhead) { - if ((error = nfs_sigintr(req->r_nmp, req, req->r_thread, 0))) + if ((error = nfs_sigintr(req->r_nmp, req, first ? NULL : req->r_thread, 0))) break; if (((error = req->r_error)) || req->r_nmrep.nmc_mhead) break; @@ -2228,9 +3339,9 @@ nfs_wait_reply(struct nfsreq *req) req->r_procnum, req->r_xid, req->r_flags, req->r_rtt)); req->r_flags |= R_SENDING; lck_mtx_unlock(&req->r_mtx); - if (req->r_gss_ctx) { + if (nfs_request_using_gss(req)) { /* - * It's an RPCSEC_GSS mount. + * It's an RPCSEC_GSS request. * Can't just resend the original request * without bumping the cred sequence number. * Go back and re-build the request. 
@@ -2253,7 +3364,7 @@ nfs_wait_reply(struct nfsreq *req) if (nfs_noremotehang(req->r_thread)) ts.tv_sec = 1; msleep(req, &req->r_mtx, slpflag | (PZERO - 1), "nfswaitreply", &ts); - slpflag = 0; + first = slpflag = 0; } lck_mtx_unlock(&req->r_mtx); @@ -2340,6 +3451,8 @@ nfs_request_create( req->r_nmp = nmp; req->r_np = np; req->r_thread = thd; + if (!thd) + req->r_flags |= R_NOINTR; if (IS_VALID_CRED(cred)) { kauth_cred_ref(cred); req->r_cred = cred; @@ -2353,6 +3466,14 @@ nfs_request_create( req->r_rchain.tqe_next = NFSREQNOLIST; req->r_cchain.tqe_next = NFSREQNOLIST; + /* set auth flavor to use for request */ + if (!req->r_cred) + req->r_auth = RPCAUTH_NONE; + else if (req->r_np && (req->r_np->n_auth != RPCAUTH_INVALID)) + req->r_auth = req->r_np->n_auth; + else + req->r_auth = nmp->nm_auth; + lck_mtx_unlock(&nmp->nm_lock); /* move the request mbuf chain to the nfsreq */ @@ -2394,6 +3515,18 @@ nfs_request_destroy(struct nfsreq *req) lck_mtx_lock(&req->r_mtx); if (nmp) { lck_mtx_lock(&nmp->nm_lock); + if (req->r_flags & R_CWND) { + /* Decrement the outstanding request count. 
*/ + req->r_flags &= ~R_CWND; + nmp->nm_sent -= NFS_CWNDSCALE; + if ((nmp->nm_sent < nmp->nm_cwnd) && !TAILQ_EMPTY(&nmp->nm_cwndq)) { + /* congestion window is open, poke the cwnd queue */ + struct nfsreq *req2 = TAILQ_FIRST(&nmp->nm_cwndq); + TAILQ_REMOVE(&nmp->nm_cwndq, req2, r_cchain); + req2->r_cchain.tqe_next = NFSREQNOLIST; + wakeup(req2); + } + } if (req->r_rchain.tqe_next != NFSREQNOLIST) { TAILQ_REMOVE(&nmp->nm_resendq, req, r_rchain); req->r_rchain.tqe_next = NFSREQNOLIST; @@ -2424,12 +3557,14 @@ nfs_request_destroy(struct nfsreq *req) mbuf_freem(req->r_nmrep.nmc_mhead); if (IS_VALID_CRED(req->r_cred)) kauth_cred_unref(&req->r_cred); - if (req->r_gss_ctx) + if (nfs_request_using_gss(req)) nfs_gss_clnt_rpcdone(req); SLIST_FOREACH_SAFE(gsp, &req->r_gss_seqlist, gss_seqnext, ngsp) FREE(gsp, M_TEMP); if (req->r_gss_ctx) nfs_gss_clnt_ctx_unref(req); + if (req->r_wrongsec) + FREE(req->r_wrongsec, M_TEMP); lck_mtx_destroy(&req->r_mtx, nfs_request_grp); if (req->r_flags & R_ALLOCATED) @@ -2471,7 +3606,7 @@ int nfs_request_add_header(struct nfsreq *req) { struct nfsmount *nmp; - int error = 0, auth_len = 0; + int error = 0; mbuf_t m; /* free up any previous header */ @@ -2485,24 +3620,7 @@ nfs_request_add_header(struct nfsreq *req) if (!nmp) return (ENXIO); - if (!req->r_cred) /* RPCAUTH_NULL */ - auth_len = 0; - else switch (nmp->nm_auth) { - case RPCAUTH_UNIX: - if (req->r_cred->cr_ngroups < 1) - return (EINVAL); - auth_len = ((((req->r_cred->cr_ngroups - 1) > nmp->nm_numgrps) ? 
- nmp->nm_numgrps : (req->r_cred->cr_ngroups - 1)) << 2) + - 5 * NFSX_UNSIGNED; - break; - case RPCAUTH_KRB5: - case RPCAUTH_KRB5I: - case RPCAUTH_KRB5P: - auth_len = 5 * NFSX_UNSIGNED + 0; // zero context handle for now - break; - } - - error = nfsm_rpchead(req, auth_len, req->r_mrest, &req->r_xid, &req->r_mhead); + error = nfsm_rpchead(req, req->r_mrest, &req->r_xid, &req->r_mhead); if (error) return (error); @@ -2511,7 +3629,7 @@ nfs_request_add_header(struct nfsreq *req) if (!nmp) return (ENXIO); lck_mtx_lock(&nmp->nm_lock); - if (nmp->nm_flag & NFSMNT_SOFT) + if (NMFLAG(nmp, SOFT)) req->r_retry = nmp->nm_retry; else req->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */ @@ -2598,7 +3716,7 @@ nfs_request_finish( uint32_t auth_status = 0; uint32_t accepted_status = 0; struct nfsm_chain nmrep; - int error, auth, clearjbtimeo; + int error, clearjbtimeo; error = req->r_error; @@ -2612,10 +3730,10 @@ nfs_request_finish( nmp = req->r_np ? NFSTONMP(req->r_np) : req->r_nmp; - /* - * Decrement the outstanding request count. - */ if ((req->r_flags & R_CWND) && nmp) { + /* + * Decrement the outstanding request count. + */ req->r_flags &= ~R_CWND; lck_mtx_lock(&nmp->nm_lock); FSDBG(273, R_XID32(req->r_xid), req, nmp->nm_sent, nmp->nm_cwnd); @@ -2630,9 +3748,9 @@ nfs_request_finish( lck_mtx_unlock(&nmp->nm_lock); } - if (req->r_gss_ctx) { // Using gss cred ? + if (nfs_request_using_gss(req)) { /* - * If the request had an RPCSEC_GSS credential + * If the request used an RPCSEC_GSS credential * then reset its sequence number bit in the * request window. 
*/ @@ -2665,7 +3783,7 @@ nfs_request_finish( */ if (!error) { if ((req->r_flags & R_TPRINTFMSG) || - (nmp && (nmp->nm_flag & NFSMNT_SOFT) && + (nmp && NMFLAG(nmp, SOFT) && ((nmp->nm_state & (NFSSTA_TIMEO|NFSSTA_FORCE)) == NFSSTA_TIMEO))) nfs_up(nmp, req->r_thread, NFSSTA_TIMEO, "is alive again"); else @@ -2725,11 +3843,10 @@ nfs_request_finish( nfsm_chain_get_32(error, &nmrep, verf_len); // verifier length nfsmout_if(error); - auth = !req->r_cred ? RPCAUTH_NULL : nmp->nm_auth; - switch (auth) { - case RPCAUTH_NULL: - case RPCAUTH_UNIX: - /* Any AUTH_UNIX verifier is ignored */ + switch (req->r_auth) { + case RPCAUTH_NONE: + case RPCAUTH_SYS: + /* Any AUTH_SYS verifier is ignored */ if (verf_len > 0) nfsm_chain_adv(error, &nmrep, nfsm_rndup(verf_len)); nfsm_chain_get_32(error, &nmrep, accepted_status); @@ -2760,7 +3877,7 @@ nfs_request_finish( /* * It's a JUKEBOX error - delay and try again */ - int delay, slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0; + int delay, slpflag = (NMFLAG(nmp, INTR) && !(req->r_flags & R_NOINTR)) ? PCATCH : 0; mbuf_freem(mrep); req->r_nmrep.nmc_mhead = NULL; @@ -2785,7 +3902,7 @@ nfs_request_finish( nfs_down(req->r_nmp, req->r_thread, 0, NFSSTA_JUKEBOXTIMEO, "resource temporarily unavailable (jukebox)"); } - if ((nmp->nm_flag & NFSMNT_SOFT) && (req->r_delay == 30)) { + if (NMFLAG(nmp, SOFT) && (req->r_delay == 30) && !(req->r_flags & R_NOINTR)) { /* for soft mounts, just give up after a short while */ OSAddAtomic(1, &nfsstats.rpctimeouts); nfs_softterm(req); @@ -2802,6 +3919,7 @@ nfs_request_finish( if ((error = nfs_sigintr(req->r_nmp, req, req->r_thread, 0))) goto nfsmout; tsleep(&lbolt, PSOCK|slpflag, "nfs_jukebox_trylater", 0); + slpflag = 0; } while (--delay > 0); } req->r_xid = 0; // get a new XID @@ -2820,6 +3938,96 @@ nfs_request_finish( nfs_up(nmp, req->r_thread, clearjbtimeo, "resource available again"); } + if ((nmp->nm_vers >= NFS_VER4) && (*status == NFSERR_WRONGSEC)) { + /* + * Hmmm... 
we need to try a different security flavor. + * The first time a request hits this, we will allocate an array + * to track flavors to try. We fill the array with the mount's + * preferred flavors or the server's preferred flavors or just the + * flavors we support. + */ + uint32_t srvflavors[NX_MAX_SEC_FLAVORS]; + int srvcount, i, j; + + /* Call SECINFO to try to get list of flavors from server. */ + srvcount = NX_MAX_SEC_FLAVORS; + nfs4_secinfo_rpc(nmp, &req->r_secinfo, req->r_cred, srvflavors, &srvcount); + + if (!req->r_wrongsec) { + /* first time... set up flavor array */ + MALLOC(req->r_wrongsec, uint32_t*, NX_MAX_SEC_FLAVORS*sizeof(uint32_t), M_TEMP, M_WAITOK); + if (!req->r_wrongsec) { + error = EACCES; + goto nfsmout; + } + i=0; + if (nmp->nm_sec.count) { /* use the mount's preferred list of flavors */ + for(; i < nmp->nm_sec.count; i++) + req->r_wrongsec[i] = nmp->nm_sec.flavors[i]; + } else if (srvcount) { /* otherwise use the server's list of flavors */ + for(; i < srvcount; i++) + req->r_wrongsec[i] = srvflavors[i]; + } else { /* otherwise, just try the flavors we support. 
*/ + req->r_wrongsec[i++] = RPCAUTH_KRB5P; + req->r_wrongsec[i++] = RPCAUTH_KRB5I; + req->r_wrongsec[i++] = RPCAUTH_KRB5; + req->r_wrongsec[i++] = RPCAUTH_SYS; + req->r_wrongsec[i++] = RPCAUTH_NONE; + } + for(; i < NX_MAX_SEC_FLAVORS; i++) /* invalidate any remaining slots */ + req->r_wrongsec[i] = RPCAUTH_INVALID; + } + + /* clear the current flavor from the list */ + for(i=0; i < NX_MAX_SEC_FLAVORS; i++) + if (req->r_wrongsec[i] == req->r_auth) + req->r_wrongsec[i] = RPCAUTH_INVALID; + + /* find the next flavor to try */ + for(i=0; i < NX_MAX_SEC_FLAVORS; i++) + if (req->r_wrongsec[i] != RPCAUTH_INVALID) { + if (((req->r_wrongsec[i] == RPCAUTH_KRB5P) || + (req->r_wrongsec[i] == RPCAUTH_KRB5I) || + (req->r_wrongsec[i] == RPCAUTH_KRB5)) && (req->r_gss_ctx && + (req->r_gss_ctx->gss_clnt_service == RPCSEC_GSS_SVC_SYS))) { + /* don't bother trying Kerberos if we've already got a fallback context */ + req->r_wrongsec[i] = RPCAUTH_INVALID; + continue; + } + if (!srvcount) /* no server list, just try it */ + break; + /* check that it's in the server's list */ + for(j=0; j < srvcount; j++) + if (req->r_wrongsec[i] == srvflavors[j]) + break; + if (j < srvcount) /* found */ + break; + /* not found in server list */ + req->r_wrongsec[i] = RPCAUTH_INVALID; + } + if (i == NX_MAX_SEC_FLAVORS) { + /* nothing left to try! */ + error = EACCES; + goto nfsmout; + } + + /* retry with the next auth flavor */ + req->r_auth = req->r_wrongsec[i]; + req->r_xid = 0; // get a new XID + req->r_flags |= R_RESTART; + req->r_start = 0; + FSDBG(273, R_XID32(req->r_xid), nmp, req, NFSERR_WRONGSEC); + return (0); + } + if ((nmp->nm_vers >= NFS_VER4) && req->r_wrongsec) { + /* + * We renegotiated security for this request; so update the + * default security flavor for the associated node. 
+ */ + if (req->r_np) + req->r_np->n_auth = req->r_auth; + } + if (*status == NFS_OK) { /* * Successful NFS request @@ -2834,8 +4042,12 @@ nfs_request_finish( * If the File Handle was stale, invalidate the * lookup cache, just in case. */ - if ((*status == ESTALE) && req->r_np) + if ((*status == ESTALE) && req->r_np) { cache_purge(NFSTOV(req->r_np)); + /* if monitored, also send delete event */ + if (vnode_ismonitored(NFSTOV(req->r_np))) + nfs_vnode_notify(req->r_np, (VNODE_EVENT_ATTRIB|VNODE_EVENT_DELETE)); + } if (nmp->nm_vers == NFS_VER2) mbuf_freem(mrep); else @@ -2875,6 +4087,22 @@ nfs_request_finish( return (error); } +/* + * NFS request using a GSS/Kerberos security flavor? + */ +int +nfs_request_using_gss(struct nfsreq *req) +{ + if (!req->r_gss_ctx) + return (0); + switch (req->r_auth) { + case RPCAUTH_KRB5: + case RPCAUTH_KRB5I: + case RPCAUTH_KRB5P: + return (1); + } + return (0); +} /* * Perform an NFS request synchronously. @@ -2887,13 +4115,14 @@ nfs_request( struct nfsm_chain *nmrest, int procnum, vfs_context_t ctx, + struct nfsreq_secinfo_args *si, struct nfsm_chain *nmrepp, u_int64_t *xidp, int *status) { return nfs_request2(np, mp, nmrest, procnum, vfs_context_thread(ctx), vfs_context_ucred(ctx), - 0, nmrepp, xidp, status); + si, 0, nmrepp, xidp, status); } int @@ -2904,6 +4133,7 @@ nfs_request2( int procnum, thread_t thd, kauth_cred_t cred, + struct nfsreq_secinfo_args *si, int flags, struct nfsm_chain *nmrepp, u_int64_t *xidp, @@ -2915,6 +4145,8 @@ nfs_request2( if ((error = nfs_request_create(np, mp, nmrest, procnum, thd, cred, &req))) return (error); req->r_flags |= (flags & R_OPTMASK); + if (si) + req->r_secinfo = *si; FSDBG_TOP(273, R_XID32(req->r_xid), np, procnum, 0); do { @@ -2998,10 +4230,13 @@ nfs_request_async( int procnum, thread_t thd, kauth_cred_t cred, + struct nfsreq_secinfo_args *si, + int flags, struct nfsreq_cbinfo *cb, struct nfsreq **reqp) { struct nfsreq *req; + struct nfsmount *nmp; int error, sent; error = 
nfs_request_create(np, mp, nmrest, procnum, thd, cred, reqp); @@ -3009,7 +4244,10 @@ nfs_request_async( FSDBG(274, (req ? R_XID32(req->r_xid) : 0), np, procnum, error); if (error) return (error); + req->r_flags |= (flags & R_OPTMASK); req->r_flags |= R_ASYNC; + if (si) + req->r_secinfo = *si; if (cb) req->r_callback = *cb; error = nfs_request_add_header(req); @@ -3021,9 +4259,32 @@ nfs_request_async( lck_mtx_lock(&req->r_mtx); if (!error && !(req->r_flags & R_SENT) && req->r_callback.rcb_func) { /* make sure to wait until this async I/O request gets sent */ - int slpflag = (req->r_nmp && (req->r_nmp->nm_flag & NFSMNT_INT) && req->r_thread) ? PCATCH : 0; + int slpflag = (req->r_nmp && NMFLAG(req->r_nmp, INTR) && req->r_thread && !(req->r_flags & R_NOINTR)) ? PCATCH : 0; struct timespec ts = { 2, 0 }; while (!(req->r_flags & R_SENT)) { + if ((req->r_flags & R_RESENDQ) && ((nmp = req->r_nmp))) { + lck_mtx_lock(&nmp->nm_lock); + if ((nmp->nm_state & NFSSTA_RECOVER) && (req->r_rchain.tqe_next != NFSREQNOLIST)) { + /* + * It's not going to get off the resend queue if we're in recovery. + * So, just take it off ourselves. We could be holding mount state + * busy and thus holding up the start of recovery. + */ + TAILQ_REMOVE(&nmp->nm_resendq, req, r_rchain); + req->r_rchain.tqe_next = NFSREQNOLIST; + if (req->r_flags & R_RESENDQ) + req->r_flags &= ~R_RESENDQ; + lck_mtx_unlock(&nmp->nm_lock); + req->r_flags |= R_SENDING; + lck_mtx_unlock(&req->r_mtx); + error = nfs_send(req, 1); + lck_mtx_lock(&req->r_mtx); + if (error) + break; + continue; + } + lck_mtx_unlock(&nmp->nm_lock); + } if ((error = nfs_sigintr(req->r_nmp, req, req->r_thread, 0))) break; msleep(req, &req->r_mtx, slpflag | (PZERO - 1), "nfswaitsent", &ts); @@ -3052,12 +4313,30 @@ nfs_request_async_finish( int *status) { int error = 0, asyncio = req->r_callback.rcb_func ? 
1 : 0; + struct nfsmount *nmp; lck_mtx_lock(&req->r_mtx); if (!asyncio) req->r_flags |= R_ASYNCWAIT; while (req->r_flags & R_RESENDQ) { /* wait until the request is off the resend queue */ struct timespec ts = { 2, 0 }; + if ((nmp = req->r_nmp)) { + lck_mtx_lock(&nmp->nm_lock); + if ((nmp->nm_state & NFSSTA_RECOVER) && (req->r_rchain.tqe_next != NFSREQNOLIST)) { + /* + * It's not going to get off the resend queue if we're in recovery. + * So, just take it off ourselves. We could be holding mount state + * busy and thus holding up the start of recovery. + */ + TAILQ_REMOVE(&nmp->nm_resendq, req, r_rchain); + req->r_rchain.tqe_next = NFSREQNOLIST; + if (req->r_flags & R_RESENDQ) + req->r_flags &= ~R_RESENDQ; + lck_mtx_unlock(&nmp->nm_lock); + break; + } + lck_mtx_unlock(&nmp->nm_lock); + } if ((error = nfs_sigintr(req->r_nmp, req, req->r_thread, 0))) break; msleep(req, &req->r_mtx, PZERO-1, "nfsresendqwait", &ts); @@ -3270,7 +4549,7 @@ nfs_request_timer(__unused void *param0, __unused void *param1) * Put a reasonable limit on the maximum timeout, * and reduce that limit when soft mounts get timeouts or are in reconnect. 
*/ - if (!(nmp->nm_flag & NFSMNT_SOFT)) + if (!NMFLAG(nmp, SOFT)) maxtime = NFS_MAXTIMEO; else if ((req->r_flags & (R_SETUP|R_RECOVER)) || ((nmp->nm_reconnect_start <= 0) || ((now.tv_sec - nmp->nm_reconnect_start) < 8))) @@ -3290,7 +4569,7 @@ nfs_request_timer(__unused void *param0, __unused void *param1) } else { if (req->r_procnum == NFSPROC_NULL && req->r_gss_ctx != NULL) timeo = NFS_MINIDEMTIMEO; // gss context setup - else if (nmp->nm_flag & NFSMNT_DUMBTIMR) + else if (NMFLAG(nmp, DUMBTIMER)) timeo = nmp->nm_timeo; else timeo = NFS_RTO(nmp, proct[req->r_procnum]); @@ -3320,7 +4599,8 @@ nfs_request_timer(__unused void *param0, __unused void *param1) /* if it's been a few seconds, try poking the socket */ if ((nmp->nm_sotype == SOCK_STREAM) && ((now.tv_sec - req->r_start) >= 3) && - !(nmp->nm_sockflags & NMSOCK_POKE)) { + !(nmp->nm_sockflags & (NMSOCK_POKE|NMSOCK_UNMOUNT)) && + (nmp->nm_sockflags & NMSOCK_READY)) { nmp->nm_sockflags |= NMSOCK_POKE; TAILQ_INSERT_TAIL(&nfs_mount_poke_queue, nmp, nm_pokeq); } @@ -3328,7 +4608,7 @@ nfs_request_timer(__unused void *param0, __unused void *param1) } /* For soft mounts (& SETUPs/RECOVERs), check for too many retransmits/timeout. 
*/ - if (((nmp->nm_flag & NFSMNT_SOFT) || (req->r_flags & (R_SETUP|R_RECOVER))) && + if ((NMFLAG(nmp, SOFT) || (req->r_flags & (R_SETUP|R_RECOVER))) && ((req->r_rexmit >= req->r_retry) || /* too many */ ((now.tv_sec - req->r_start)*NFS_HZ > maxtime))) { /* too long */ OSAddAtomic(1, &nfsstats.rpctimeouts); @@ -3344,6 +4624,11 @@ nfs_request_timer(__unused void *param0, __unused void *param1) } else { lck_mtx_unlock(&nmp->nm_lock); } + if (req->r_flags & R_NOINTR) { + /* don't terminate nointr requests on timeout */ + lck_mtx_unlock(&req->r_mtx); + continue; + } NFS_SOCK_DBG(("nfs timer TERMINATE: p %d x 0x%llx f 0x%x rtt %d t %ld\n", req->r_procnum, req->r_xid, req->r_flags, req->r_rtt, now.tv_sec - req->r_start)); @@ -3391,8 +4676,7 @@ nfs_request_timer(__unused void *param0, __unused void *param1) nfs_sock_poke(nmp); lck_mtx_lock(&nmp->nm_lock); nmp->nm_sockflags &= ~NMSOCK_POKE; - if (!(nmp->nm_state & NFSSTA_MOUNTED)) - wakeup(&nmp->nm_sockflags); + wakeup(&nmp->nm_sockflags); lck_mtx_unlock(&nmp->nm_lock); } @@ -3417,6 +4701,7 @@ nfs_noremotehang(thread_t thd) * and the mount is interruptable, or if we are a thread that is in the process * of cancellation (also SIGKILL posted). */ +extern int sigprop[NSIG+1]; int nfs_sigintr(struct nfsmount *nmp, struct nfsreq *req, thread_t thd, int nmplocked) { @@ -3428,19 +4713,17 @@ nfs_sigintr(struct nfsmount *nmp, struct nfsreq *req, thread_t thd, int nmplocke if (req && (req->r_flags & R_SOFTTERM)) return (ETIMEDOUT); /* request has been terminated. */ + if (req && (req->r_flags & R_NOINTR)) + thd = NULL; /* don't check for signal on R_NOINTR */ - /* - * If we're in the progress of a force unmount and there's - * been a timeout, we're dead and fail IO. - */ if (!nmplocked) lck_mtx_lock(&nmp->nm_lock); - if ((nmp->nm_state & NFSSTA_FORCE) && - (nmp->nm_state & (NFSSTA_TIMEO|NFSSTA_JUKEBOXTIMEO|NFSSTA_LOCKTIMEO))) { + if (nmp->nm_state & NFSSTA_FORCE) { + /* If a force unmount is in progress then fail. 
*/ error = EIO; } else if (nmp->nm_mountp->mnt_kern_flag & MNTK_FRCUNMOUNT) { /* Someone is unmounting us, go soft and mark it. */ - nmp->nm_flag |= NFSMNT_SOFT; + NFS_BITMAP_SET(nmp->nm_flags, NFS_MFLAG_SOFT); nmp->nm_state |= NFSSTA_FORCE; } @@ -3464,12 +4747,20 @@ nfs_sigintr(struct nfsmount *nmp, struct nfsreq *req, thread_t thd, int nmplocke if (thd == NULL) return (0); - /* If this thread belongs to kernel task; then abort check is not needed */ - if ((current_proc() != kernproc) && current_thread_aborted()) + /* + * Check if the process is aborted, but don't interrupt if we + * were killed by a signal and this is the exiting thread which + * is attempting to dump core. + */ + if (((p = current_proc()) != kernproc) && current_thread_aborted() && + (!(p->p_acflag & AXSIG) || (p->exit_thread != current_thread()) || + (p->p_sigacts == NULL) || + (p->p_sigacts->ps_sig < 1) || (p->p_sigacts->ps_sig > NSIG) || + !(sigprop[p->p_sigacts->ps_sig] & SA_CORE))) return (EINTR); /* mask off thread and process blocked signals. 
*/ - if ((nmp->nm_flag & NFSMNT_INT) && ((p = get_bsdthreadtask_info(thd))) && + if (NMFLAG(nmp, INTR) && ((p = get_bsdthreadtask_info(thd))) && proc_pendingsignals(p, NFSINT_SIGMASK)) return (EINTR); return (0); @@ -3495,7 +4786,7 @@ nfs_sndlock(struct nfsreq *req) lck_mtx_lock(&nmp->nm_lock); statep = &nmp->nm_state; - if ((nmp->nm_flag & NFSMNT_INT) && req->r_thread) + if (NMFLAG(nmp, INTR) && req->r_thread && !(req->r_flags & R_NOINTR)) slpflag = PCATCH; while (*statep & NFSSTA_SNDLOCK) { if ((error = nfs_sigintr(nmp, req, req->r_thread, 1))) @@ -3530,7 +4821,7 @@ nfs_sndunlock(struct nfsreq *req) statep = &nmp->nm_state; if ((*statep & NFSSTA_SNDLOCK) == 0) panic("nfs sndunlock"); - *statep &= ~NFSSTA_SNDLOCK; + *statep &= ~(NFSSTA_SNDLOCK|NFSSTA_SENDING); if (*statep & NFSSTA_WANTSND) { *statep &= ~NFSSTA_WANTSND; wake = 1; @@ -3544,62 +4835,113 @@ int nfs_aux_request( struct nfsmount *nmp, thread_t thd, - struct sockaddr_in *saddr, + struct sockaddr *saddr, + socket_t so, + int sotype, mbuf_t mreq, uint32_t xid, int bindresv, int timeo, struct nfsm_chain *nmrep) { - int error = 0, on = 1, try, sendat = 2; - socket_t so = NULL; - struct sockaddr_in sin; - struct timeval tv = { 1, 0 }; + int error = 0, on = 1, try, sendat = 2, soproto, recv, optlen, restoreto = 0; + socket_t newso = NULL; + struct sockaddr_storage ss; + struct timeval orig_rcvto, orig_sndto, tv = { 1, 0 }; mbuf_t m, mrep = NULL; struct msghdr msg; uint32_t rxid = 0, reply = 0, reply_status, rejected_status; uint32_t verf_type, verf_len, accepted_status; - size_t readlen; + size_t readlen, sentlen; + struct nfs_rpc_record_state nrrs; - /* create socket and set options */ - if (((error = sock_socket(saddr->sin_family, SOCK_DGRAM, IPPROTO_UDP, NULL, NULL, &so))) || - ((error = sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)))) || - ((error = sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)))) || - ((error = sock_setsockopt(so, SOL_SOCKET, SO_NOADDRERR, &on, sizeof(on))))) 
- goto nfsmout; - if (bindresv) { - int portrange = IP_PORTRANGE_LOW; - error = sock_setsockopt(so, IPPROTO_IP, IP_PORTRANGE, &portrange, sizeof(portrange)); - nfsmout_if(error); - /* bind now to check for failure */ - sin.sin_len = sizeof (struct sockaddr_in); - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = INADDR_ANY; - sin.sin_port = 0; - error = sock_bind(so, (struct sockaddr *) &sin); - nfsmout_if(error); + if (!so) { + /* create socket and set options */ + soproto = (sotype == SOCK_DGRAM) ? IPPROTO_UDP : IPPROTO_TCP; + if ((error = sock_socket(saddr->sa_family, sotype, soproto, NULL, NULL, &newso))) + goto nfsmout; + + if (bindresv) { + int level = (saddr->sa_family == AF_INET) ? IPPROTO_IP : IPPROTO_IPV6; + int optname = (saddr->sa_family == AF_INET) ? IP_PORTRANGE : IPV6_PORTRANGE; + int portrange = IP_PORTRANGE_LOW; + error = sock_setsockopt(newso, level, optname, &portrange, sizeof(portrange)); + nfsmout_if(error); + ss.ss_len = saddr->sa_len; + ss.ss_family = saddr->sa_family; + if (ss.ss_family == AF_INET) { + ((struct sockaddr_in*)&ss)->sin_addr.s_addr = INADDR_ANY; + ((struct sockaddr_in*)&ss)->sin_port = htons(0); + } else if (ss.ss_family == AF_INET6) { + ((struct sockaddr_in6*)&ss)->sin6_addr = in6addr_any; + ((struct sockaddr_in6*)&ss)->sin6_port = htons(0); + } else { + error = EINVAL; + } + if (!error) + error = sock_bind(newso, (struct sockaddr *)&ss); + nfsmout_if(error); + } + + if (sotype == SOCK_STREAM) { + on = 4; /* don't wait too long for the socket to connect */ + sock_setsockopt(newso, IPPROTO_TCP, TCP_CONNECTIONTIMEOUT, &on, sizeof(on)); + error = sock_connect(newso, saddr, 0); + nfsmout_if(error); + } + if (((error = sock_setsockopt(newso, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)))) || + ((error = sock_setsockopt(newso, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)))) || + ((error = sock_setsockopt(newso, SOL_SOCKET, SO_NOADDRERR, &on, sizeof(on))))) + goto nfsmout; + so = newso; + } else { + /* make sure socket is using a one second 
timeout in this function */ + optlen = sizeof(orig_rcvto); + error = sock_getsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &orig_rcvto, &optlen); + if (!error) { + optlen = sizeof(orig_sndto); + error = sock_getsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &orig_sndto, &optlen); + } + if (!error) { + sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)); + sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)); + restoreto = 1; + } + } + + if (sotype == SOCK_STREAM) { + sendat = 0; /* we only resend the request for UDP */ + nfs_rpc_record_state_init(&nrrs); } for (try=0; try < timeo; try++) { - if ((error = nfs_sigintr(nmp, NULL, thd, 0))) + if ((error = nfs_sigintr(nmp, NULL, !try ? NULL : thd, 0))) break; if (!try || (try == sendat)) { - /* send the request (resending periodically) */ + /* send the request (resending periodically for UDP) */ if ((error = mbuf_copym(mreq, 0, MBUF_COPYALL, MBUF_WAITOK, &m))) goto nfsmout; bzero(&msg, sizeof(msg)); - msg.msg_name = saddr; - msg.msg_namelen = saddr->sin_len; - if ((error = sock_sendmbuf(so, &msg, m, 0, NULL))) + if ((sotype == SOCK_DGRAM) && !sock_isconnected(so)) { + msg.msg_name = saddr; + msg.msg_namelen = saddr->sa_len; + } + if ((error = sock_sendmbuf(so, &msg, m, 0, &sentlen))) goto nfsmout; sendat *= 2; if (sendat > 30) sendat = 30; } /* wait for the response */ - readlen = 1<<18; - bzero(&msg, sizeof(msg)); - error = sock_receivembuf(so, &msg, &mrep, 0, &readlen); + if (sotype == SOCK_STREAM) { + /* try to read (more of) record */ + error = nfs_rpc_record_read(so, &nrrs, 0, &recv, &mrep); + /* if we don't have the whole record yet, we'll keep trying */ + } else { + readlen = 1<<18; + bzero(&msg, sizeof(msg)); + error = sock_receivembuf(so, &msg, &mrep, 0, &readlen); + } if (error == EWOULDBLOCK) continue; nfsmout_if(error); @@ -3615,7 +4957,7 @@ nfs_aux_request( if (reply_status == RPC_MSGDENIED) { nfsm_chain_get_32(error, nmrep, rejected_status); nfsmout_if(error); - error = (rejected_status == RPC_MISMATCH) ? 
ENOTSUP : EACCES; + error = (rejected_status == RPC_MISMATCH) ? ERPCMISMATCH : EACCES; goto nfsmout; } nfsm_chain_get_32(error, nmrep, verf_type); /* verifier flavor */ @@ -3624,18 +4966,159 @@ nfs_aux_request( if (verf_len) nfsm_chain_adv(error, nmrep, nfsm_rndup(verf_len)); nfsm_chain_get_32(error, nmrep, accepted_status); - nfsm_assert(error, (accepted_status == RPC_SUCCESS), EIO); + nfsmout_if(error); + switch (accepted_status) { + case RPC_SUCCESS: + error = 0; + break; + case RPC_PROGUNAVAIL: + error = EPROGUNAVAIL; + break; + case RPC_PROGMISMATCH: + error = EPROGMISMATCH; + break; + case RPC_PROCUNAVAIL: + error = EPROCUNAVAIL; + break; + case RPC_GARBAGE: + error = EBADRPC; + break; + case RPC_SYSTEM_ERR: + default: + error = EIO; + break; + } break; } nfsmout: - if (so) { - sock_shutdown(so, SHUT_RDWR); - sock_close(so); + if (restoreto) { + sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &orig_rcvto, sizeof(tv)); + sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &orig_sndto, sizeof(tv)); + } + if (newso) { + sock_shutdown(newso, SHUT_RDWR); + sock_close(newso); } mbuf_freem(mreq); return (error); } +int +nfs_portmap_lookup( + struct nfsmount *nmp, + vfs_context_t ctx, + struct sockaddr *sa, + socket_t so, + uint32_t protocol, + uint32_t vers, + uint32_t ipproto, + int timeo) +{ + thread_t thd = vfs_context_thread(ctx); + kauth_cred_t cred = vfs_context_ucred(ctx); + struct sockaddr_storage ss; + struct sockaddr *saddr = (struct sockaddr*)&ss; + struct nfsm_chain nmreq, nmrep; + mbuf_t mreq; + int error = 0, ip, pmprog, pmvers, pmproc, ualen = 0; + uint32_t port; + uint64_t xid = 0; + char uaddr[MAX_IPv6_STR_LEN+16]; + + bcopy(sa, saddr, min(sizeof(ss), sa->sa_len)); + if (saddr->sa_family == AF_INET) { + ip = 4; + pmprog = PMAPPROG; + pmvers = PMAPVERS; + pmproc = PMAPPROC_GETPORT; + } else if (saddr->sa_family == AF_INET6) { + ip = 6; + pmprog = RPCBPROG; + pmvers = RPCBVERS4; + pmproc = RPCBPROC_GETVERSADDR; + } else { + return (EINVAL); + } + 
nfsm_chain_null(&nmreq); + nfsm_chain_null(&nmrep); + +tryagain: + /* send portmapper request to get port/uaddr */ + if (ip == 4) + ((struct sockaddr_in*)saddr)->sin_port = htons(PMAPPORT); + else + ((struct sockaddr_in6*)saddr)->sin6_port = htons(PMAPPORT); + nfsm_chain_build_alloc_init(error, &nmreq, 8*NFSX_UNSIGNED); + nfsm_chain_add_32(error, &nmreq, protocol); + nfsm_chain_add_32(error, &nmreq, vers); + if (ip == 4) { + nfsm_chain_add_32(error, &nmreq, ipproto); + nfsm_chain_add_32(error, &nmreq, 0); + } else { + if (ipproto == IPPROTO_TCP) + nfsm_chain_add_string(error, &nmreq, "tcp6", 4); + else + nfsm_chain_add_string(error, &nmreq, "udp6", 4); + nfsm_chain_add_string(error, &nmreq, "", 0); /* uaddr */ + nfsm_chain_add_string(error, &nmreq, "", 0); /* owner */ + } + nfsm_chain_build_done(error, &nmreq); + nfsmout_if(error); + error = nfsm_rpchead2(nmp, (ipproto == IPPROTO_UDP) ? SOCK_DGRAM : SOCK_STREAM, + pmprog, pmvers, pmproc, RPCAUTH_SYS, cred, NULL, nmreq.nmc_mhead, + &xid, &mreq); + nfsmout_if(error); + nmreq.nmc_mhead = NULL; + error = nfs_aux_request(nmp, thd, saddr, so, (ipproto == IPPROTO_UDP) ? 
SOCK_DGRAM : SOCK_STREAM, + mreq, R_XID32(xid), 0, timeo, &nmrep); + + /* grab port from portmap response */ + if (ip == 4) { + nfsm_chain_get_32(error, &nmrep, port); + if (!error) + ((struct sockaddr_in*)sa)->sin_port = htons(port); + } else { + /* get uaddr string and convert to sockaddr */ + nfsm_chain_get_32(error, &nmrep, ualen); + if (!error) { + if (ualen > ((int)sizeof(uaddr)-1)) + error = EIO; + if (ualen < 1) { + /* program is not available, just return a zero port */ + bcopy(sa, saddr, min(sizeof(ss), sa->sa_len)); + ((struct sockaddr_in6*)saddr)->sin6_port = htons(0); + } else { + nfsm_chain_get_opaque(error, &nmrep, ualen, uaddr); + if (!error) { + uaddr[ualen] = '\0'; + if (!nfs_uaddr2sockaddr(uaddr, saddr)) + error = EIO; + } + } + } + if ((error == EPROGMISMATCH) || (error == EPROCUNAVAIL) || (error == EIO) || (error == EBADRPC)) { + /* remote doesn't support rpcbind version or proc (or we couldn't parse uaddr) */ + if (pmvers == RPCBVERS4) { + /* fall back to v3 and GETADDR */ + pmvers = RPCBVERS3; + pmproc = RPCBPROC_GETADDR; + nfsm_chain_cleanup(&nmreq); + nfsm_chain_cleanup(&nmrep); + bcopy(sa, saddr, min(sizeof(ss), sa->sa_len)); + xid = 0; + error = 0; + goto tryagain; + } + } + if (!error) + bcopy(saddr, sa, min(saddr->sa_len, sa->sa_len)); + } +nfsmout: + nfsm_chain_cleanup(&nmreq); + nfsm_chain_cleanup(&nmrep); + return (error); +} + int nfs_msg(thread_t thd, const char *server, @@ -3670,12 +5153,12 @@ nfs_down(struct nfsmount *nmp, thread_t thd, int error, int flags, const char *m lck_mtx_lock(&nmp->nm_lock); timeoutmask = NFSSTA_TIMEO | NFSSTA_LOCKTIMEO | NFSSTA_JUKEBOXTIMEO; - if (nmp->nm_flag & NFSMNT_MUTEJUKEBOX) /* jukebox timeouts don't count as unresponsive if muted */ + if (NMFLAG(nmp, MUTEJUKEBOX)) /* jukebox timeouts don't count as unresponsive if muted */ timeoutmask &= ~NFSSTA_JUKEBOXTIMEO; wasunresponsive = (nmp->nm_state & timeoutmask); /* XXX don't allow users to know about/disconnect unresponsive, soft, nobrowse mounts */ 
- softnobrowse = ((nmp->nm_flag & NFSMNT_SOFT) && (vfs_flags(nmp->nm_mountp) & MNT_DONTBROWSE)); + softnobrowse = (NMFLAG(nmp, SOFT) && (vfs_flags(nmp->nm_mountp) & MNT_DONTBROWSE)); if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) nmp->nm_state |= NFSSTA_TIMEO; @@ -3686,7 +5169,7 @@ nfs_down(struct nfsmount *nmp, thread_t thd, int error, int flags, const char *m unresponsive = (nmp->nm_state & timeoutmask); - if (unresponsive && (nmp->nm_flag & NFSMNT_DEADTIMEOUT)) { + if (unresponsive && (nmp->nm_deadtimeout > 0)) { microuptime(&now); if (!wasunresponsive) { nmp->nm_deadto_start = now.tv_sec; @@ -3726,12 +5209,12 @@ nfs_up(struct nfsmount *nmp, thread_t thd, int flags, const char *msg) lck_mtx_lock(&nmp->nm_lock); timeoutmask = NFSSTA_TIMEO | NFSSTA_LOCKTIMEO | NFSSTA_JUKEBOXTIMEO; - if (nmp->nm_flag & NFSMNT_MUTEJUKEBOX) /* jukebox timeouts don't count as unresponsive if muted */ + if (NMFLAG(nmp, MUTEJUKEBOX)) /* jukebox timeouts don't count as unresponsive if muted */ timeoutmask &= ~NFSSTA_JUKEBOXTIMEO; wasunresponsive = (nmp->nm_state & timeoutmask); /* XXX don't allow users to know about/disconnect unresponsive, soft, nobrowse mounts */ - softnobrowse = ((nmp->nm_flag & NFSMNT_SOFT) && (vfs_flags(nmp->nm_mountp) & MNT_DONTBROWSE)); + softnobrowse = (NMFLAG(nmp, SOFT) && (vfs_flags(nmp->nm_mountp) & MNT_DONTBROWSE)); if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) nmp->nm_state &= ~NFSSTA_TIMEO; @@ -3916,9 +5399,9 @@ nfsrv_send(struct nfsrv_sock *slp, mbuf_t nam, mbuf_t top) * be called with MBUF_WAITOK from an nfsd. 
*/ void -nfsrv_rcv(socket_t so, caddr_t arg, int waitflag) +nfsrv_rcv(socket_t so, void *arg, int waitflag) { - struct nfsrv_sock *slp = (struct nfsrv_sock *)arg; + struct nfsrv_sock *slp = arg; if (!nfsd_thread_count || !(slp->ns_flag & SLP_VALID)) return; @@ -4250,6 +5733,8 @@ nfsrv_dorec( if (error) { if (nam) mbuf_freem(nam); + if (nd->nd_gss_context) + nfs_gss_svc_ctx_deref(nd->nd_gss_context); FREE_ZONE(nd, sizeof(*nd), M_NFSRVDESC); return (error); } @@ -4274,7 +5759,6 @@ nfsrv_getreq(struct nfsrv_descript *nd) uid_t user_id; gid_t group_id; int ngroups; - struct ucred temp_cred; uint32_t val; nd->nd_cr = NULL; @@ -4331,10 +5815,11 @@ nfsrv_getreq(struct nfsrv_descript *nd) nfsmout_if(error); /* Handle authentication */ - if (auth_type == RPCAUTH_UNIX) { + if (auth_type == RPCAUTH_SYS) { + struct posix_cred temp_pcred; if (nd->nd_procnum == NFSPROC_NULL) return (0); - nd->nd_sec = RPCAUTH_UNIX; + nd->nd_sec = RPCAUTH_SYS; nfsm_chain_adv(error, nmreq, NFSX_UNSIGNED); // skip stamp nfsm_chain_get_32(error, nmreq, len); // hostname length if (len < 0 || len > NFS_MAXNAMLEN) @@ -4343,23 +5828,23 @@ nfsrv_getreq(struct nfsrv_descript *nd) nfsmout_if(error); /* create a temporary credential using the bits from the wire */ - bzero(&temp_cred, sizeof(temp_cred)); + bzero(&temp_pcred, sizeof(temp_pcred)); nfsm_chain_get_32(error, nmreq, user_id); nfsm_chain_get_32(error, nmreq, group_id); - temp_cred.cr_groups[0] = group_id; + temp_pcred.cr_groups[0] = group_id; nfsm_chain_get_32(error, nmreq, len); // extra GID count if ((len < 0) || (len > RPCAUTH_UNIXGIDS)) error = EBADRPC; nfsmout_if(error); for (i = 1; i <= len; i++) if (i < NGROUPS) - nfsm_chain_get_32(error, nmreq, temp_cred.cr_groups[i]); + nfsm_chain_get_32(error, nmreq, temp_pcred.cr_groups[i]); else nfsm_chain_adv(error, nmreq, NFSX_UNSIGNED); nfsmout_if(error); ngroups = (len >= NGROUPS) ? 
NGROUPS : (len + 1); if (ngroups > 1) - nfsrv_group_sort(&temp_cred.cr_groups[0], ngroups); + nfsrv_group_sort(&temp_pcred.cr_groups[0], ngroups); nfsm_chain_adv(error, nmreq, NFSX_UNSIGNED); // verifier flavor (should be AUTH_NONE) nfsm_chain_get_32(error, nmreq, len); // verifier length if (len < 0 || len > RPCAUTH_MAXSIZ) @@ -4368,9 +5853,9 @@ nfsrv_getreq(struct nfsrv_descript *nd) nfsm_chain_adv(error, nmreq, nfsm_rndup(len)); /* request creation of a real credential */ - temp_cred.cr_uid = user_id; - temp_cred.cr_ngroups = ngroups; - nd->nd_cr = kauth_cred_create(&temp_cred); + temp_pcred.cr_uid = user_id; + temp_pcred.cr_ngroups = ngroups; + nd->nd_cr = posix_cred_create(&temp_pcred); if (nd->nd_cr == NULL) { nd->nd_repstat = ENOMEM; nd->nd_procnum = NFSPROC_NOOP; diff --git a/bsd/nfs/nfs_srvcache.c b/bsd/nfs/nfs_srvcache.c index db1c6e6a7..7fde3da6b 100644 --- a/bsd/nfs/nfs_srvcache.c +++ b/bsd/nfs/nfs_srvcache.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -174,6 +174,7 @@ nfsrv_initcache(void) * If there is any doubt, return FALSE. * The AF_INET family is handled as a special case so that address mbufs * don't need to be saved to store "struct in_addr", which is only 4 bytes. + * Ditto for AF_INET6 which is only 16 bytes. 
*/ static int netaddr_match( @@ -182,17 +183,22 @@ netaddr_match( mbuf_t nam) { struct sockaddr_in *inetaddr; + struct sockaddr_in6 *inet6addr; switch (family) { case AF_INET: inetaddr = mbuf_data(nam); - if (inetaddr->sin_family == AF_INET && - inetaddr->sin_addr.s_addr == haddr->had_inetaddr) + if ((inetaddr->sin_family == AF_INET) && + (inetaddr->sin_addr.s_addr == haddr->had_inetaddr)) return (1); break; - default: + case AF_INET6: + inet6addr = mbuf_data(nam); + if ((inet6addr->sin6_family == AF_INET6) && + !bcmp(&inet6addr->sin6_addr, &haddr->had_inet6addr, sizeof(inet6addr->sin6_addr))) + return (1); break; - }; + } return (0); } @@ -218,7 +224,7 @@ nfsrv_getcache( { struct nfsrvcache *rp; struct nfsm_chain nmrep; - struct sockaddr_in *saddr; + struct sockaddr *saddr; int ret, error; /* @@ -232,7 +238,7 @@ nfsrv_getcache( for (rp = NFSRCHASH(nd->nd_retxid)->lh_first; rp != 0; rp = rp->rc_hash.le_next) { if (nd->nd_retxid == rp->rc_xid && nd->nd_procnum == rp->rc_proc && - netaddr_match(AF_INET, &rp->rc_haddr, nd->nd_nam)) { + netaddr_match(rp->rc_family, &rp->rc_haddr, nd->nd_nam)) { if ((rp->rc_flag & RC_LOCKED) != 0) { rp->rc_flag |= RC_WANTED; msleep(rp, nfsrv_reqcache_mutex, PZERO-1, "nfsrc", NULL); @@ -323,10 +329,15 @@ nfsrv_getcache( rp->rc_state = RC_INPROG; rp->rc_xid = nd->nd_retxid; saddr = mbuf_data(nd->nd_nam); - switch (saddr->sin_family) { + rp->rc_family = saddr->sa_family; + switch (saddr->sa_family) { case AF_INET: rp->rc_flag |= RC_INETADDR; - rp->rc_inetaddr = saddr->sin_addr.s_addr; + rp->rc_inetaddr = ((struct sockaddr_in*)saddr)->sin_addr.s_addr; + break; + case AF_INET6: + rp->rc_flag |= RC_INETADDR; + rp->rc_inet6addr = ((struct sockaddr_in6*)saddr)->sin6_addr; break; default: error = mbuf_copym(nd->nd_nam, 0, MBUF_COPYALL, MBUF_WAITOK, &rp->rc_nam); @@ -366,7 +377,7 @@ nfsrv_updatecache( for (rp = NFSRCHASH(nd->nd_retxid)->lh_first; rp != 0; rp = rp->rc_hash.le_next) { if (nd->nd_retxid == rp->rc_xid && nd->nd_procnum == rp->rc_proc 
&& - netaddr_match(AF_INET, &rp->rc_haddr, nd->nd_nam)) { + netaddr_match(rp->rc_family, &rp->rc_haddr, nd->nd_nam)) { if ((rp->rc_flag & RC_LOCKED) != 0) { rp->rc_flag |= RC_WANTED; msleep(rp, nfsrv_reqcache_mutex, PZERO-1, "nfsrc", NULL); diff --git a/bsd/nfs/nfs_subs.c b/bsd/nfs/nfs_subs.c index 40b55e86e..dccead918 100644 --- a/bsd/nfs/nfs_subs.c +++ b/bsd/nfs/nfs_subs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -99,6 +99,9 @@ #include #include #include +#if NFSCLIENT +#define _NFS_XDR_SUBS_FUNCS_ /* define this to get xdrbuf function definitions */ +#endif #include #include #include @@ -110,6 +113,8 @@ #include #include +#include + /* * NFS globals */ @@ -793,6 +798,33 @@ nfsm_chain_get_uio(struct nfsm_chain *nmc, uint32_t len, uio_t uio) #if NFSCLIENT +int +nfsm_chain_add_string_nfc(struct nfsm_chain *nmc, const uint8_t *s, uint32_t slen) +{ + uint8_t smallbuf[64]; + uint8_t *nfcname = smallbuf; + size_t buflen = sizeof(smallbuf), nfclen; + int error; + + error = utf8_normalizestr(s, slen, nfcname, &nfclen, buflen, UTF_PRECOMPOSED|UTF_NO_NULL_TERM); + if (error == ENAMETOOLONG) { + buflen = MAXPATHLEN; + MALLOC_ZONE(nfcname, uint8_t *, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (nfcname) + error = utf8_normalizestr(s, slen, nfcname, &nfclen, buflen, UTF_PRECOMPOSED|UTF_NO_NULL_TERM); + } + + /* if we got an error, just use the original string */ + if (error) + nfsm_chain_add_string(error, nmc, s, slen); + else + nfsm_chain_add_string(error, nmc, nfcname, nfclen); + + if (nfcname && (nfcname != smallbuf)) + FREE_ZONE(nfcname, MAXPATHLEN, M_NAMEI); + return (error); +} + /* * Add an NFSv2 "sattr" structure to an mbuf chain */ @@ -909,7 +941,7 @@ nfsm_chain_get_fh_attr( error = nfs_parsefattr(nmc, nfsvers, nvap); } else if (gotfh) { /* we need valid attributes in order to call nfs_nget() */ - if (nfs3_getattr_rpc(NULL, 
NFSTOMP(dnp), fhp->fh_data, fhp->fh_len, ctx, nvap, xidp)) { + if (nfs3_getattr_rpc(NULL, NFSTOMP(dnp), fhp->fh_data, fhp->fh_len, 0, ctx, nvap, xidp)) { gotattr = 0; fhp->fh_len = 0; } @@ -985,7 +1017,6 @@ nfs_get_xid(uint64_t *xidp) int nfsm_rpchead( struct nfsreq *req, - int auth_len, mbuf_t mrest, u_int64_t *xidp, mbuf_t *mreqp) @@ -993,23 +1024,55 @@ nfsm_rpchead( struct nfsmount *nmp = req->r_nmp; int nfsvers = nmp->nm_vers; int proc = ((nfsvers == NFS_VER2) ? nfsv2_procid[req->r_procnum] : (int)req->r_procnum); - int auth_type = (!auth_len && !req->r_cred) ? RPCAUTH_NULL : nmp->nm_auth; - return nfsm_rpchead2(nmp->nm_sotype, NFS_PROG, nfsvers, proc, - auth_type, auth_len, req->r_cred, req, mrest, xidp, mreqp); + return nfsm_rpchead2(nmp, nmp->nm_sotype, NFS_PROG, nfsvers, proc, + req->r_auth, req->r_cred, req, mrest, xidp, mreqp); } int -nfsm_rpchead2(int sotype, int prog, int vers, int proc, int auth_type, int auth_len, +nfsm_rpchead2(struct nfsmount *nmp, int sotype, int prog, int vers, int proc, int auth_type, kauth_cred_t cred, struct nfsreq *req, mbuf_t mrest, u_int64_t *xidp, mbuf_t *mreqp) { mbuf_t mreq, mb; - int error, i, grpsiz, authsiz, reqlen; + int error, i, grpsiz, auth_len = 0, authsiz, reqlen; size_t headlen; struct nfsm_chain nmreq; - /* allocate the packet */ + /* calculate expected auth length */ + switch (auth_type) { + case RPCAUTH_NONE: + auth_len = 0; + break; + case RPCAUTH_SYS: + { + gid_t grouplist[NGROUPS]; + int groupcount = NGROUPS; + + if (!cred) + return (EINVAL); + + (void)kauth_cred_getgroups(cred, grouplist, &groupcount); + if (groupcount < 1) + return (EINVAL); + + auth_len = ((((groupcount - 1) > nmp->nm_numgrps) ? 
+ nmp->nm_numgrps : (groupcount - 1)) << 2) + + 5 * NFSX_UNSIGNED; + break; + } + case RPCAUTH_KRB5: + case RPCAUTH_KRB5I: + case RPCAUTH_KRB5P: + if (!req || !cred) + return (EINVAL); + auth_len = 5 * NFSX_UNSIGNED + 0; // zero context handle for now + break; + default: + return (EINVAL); + } authsiz = nfsm_rndup(auth_len); + + /* allocate the packet */ headlen = authsiz + 10 * NFSX_UNSIGNED; if (sotype == SOCK_STREAM) /* also include room for any RPC Record Mark */ headlen += NFSX_UNSIGNED; @@ -1055,27 +1118,36 @@ nfsm_rpchead2(int sotype, int prog, int vers, int proc, int auth_type, int auth_ add_cred: switch (auth_type) { - case RPCAUTH_NULL: - nfsm_chain_add_32(error, &nmreq, RPCAUTH_NULL); /* auth */ + case RPCAUTH_NONE: + nfsm_chain_add_32(error, &nmreq, RPCAUTH_NONE); /* auth */ nfsm_chain_add_32(error, &nmreq, 0); /* length */ - nfsm_chain_add_32(error, &nmreq, RPCAUTH_NULL); /* verf */ + nfsm_chain_add_32(error, &nmreq, RPCAUTH_NONE); /* verf */ nfsm_chain_add_32(error, &nmreq, 0); /* length */ nfsm_chain_build_done(error, &nmreq); + /* Append the args mbufs */ + if (!error) + error = mbuf_setnext(nmreq.nmc_mcur, mrest); break; - case RPCAUTH_UNIX: - nfsm_chain_add_32(error, &nmreq, RPCAUTH_UNIX); + case RPCAUTH_SYS: { + gid_t grouplist[NGROUPS]; + int groupcount; + + nfsm_chain_add_32(error, &nmreq, RPCAUTH_SYS); nfsm_chain_add_32(error, &nmreq, authsiz); nfsm_chain_add_32(error, &nmreq, 0); /* stamp */ nfsm_chain_add_32(error, &nmreq, 0); /* zero-length hostname */ nfsm_chain_add_32(error, &nmreq, kauth_cred_getuid(cred)); /* UID */ - nfsm_chain_add_32(error, &nmreq, cred->cr_groups[0]); /* GID */ + nfsm_chain_add_32(error, &nmreq, kauth_cred_getgid(cred)); /* GID */ grpsiz = (auth_len >> 2) - 5; nfsm_chain_add_32(error, &nmreq, grpsiz);/* additional GIDs */ + memset(grouplist, 0, sizeof(grouplist)); + groupcount = grpsiz; + (void)kauth_cred_getgroups(cred, grouplist, &groupcount); for (i = 1; i <= grpsiz; i++) - nfsm_chain_add_32(error, &nmreq, 
cred->cr_groups[i]); + nfsm_chain_add_32(error, &nmreq, grouplist[i]); /* And the verifier... */ - nfsm_chain_add_32(error, &nmreq, RPCAUTH_NULL); /* flavor */ + nfsm_chain_add_32(error, &nmreq, RPCAUTH_NONE); /* flavor */ nfsm_chain_add_32(error, &nmreq, 0); /* length */ nfsm_chain_build_done(error, &nmreq); @@ -1083,16 +1155,24 @@ nfsm_rpchead2(int sotype, int prog, int vers, int proc, int auth_type, int auth_ if (!error) error = mbuf_setnext(nmreq.nmc_mcur, mrest); break; + } case RPCAUTH_KRB5: case RPCAUTH_KRB5I: case RPCAUTH_KRB5P: error = nfs_gss_clnt_cred_put(req, &nmreq, mrest); if (error == ENEEDAUTH) { + gid_t grouplist[NGROUPS]; + int groupcount = NGROUPS; /* * Use sec=sys for this user */ error = 0; - auth_type = RPCAUTH_UNIX; + req->r_auth = auth_type = RPCAUTH_SYS; + (void)kauth_cred_getgroups(cred, grouplist, &groupcount); + auth_len = ((((groupcount - 1) > nmp->nm_numgrps) ? + nmp->nm_numgrps : (groupcount - 1)) << 2) + + 5 * NFSX_UNSIGNED; + authsiz = nfsm_rndup(auth_len); goto add_cred; } break; @@ -1141,6 +1221,21 @@ nfs_parsefattr(struct nfsm_chain *nmc, int nfsvers, struct nfs_vattr *nvap) dev_t rdev; val = val2 = 0; + NVATTR_INIT(nvap); + + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_TYPE); + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_MODE); + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_NUMLINKS); + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_OWNER); + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_OWNER_GROUP); + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_SIZE); + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_SPACE_USED); + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_RAWDEV); + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_FSID); + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_FILEID); + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_TIME_ACCESS); + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_TIME_MODIFY); + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_TIME_METADATA); nfsm_chain_get_32(error, nmc, vtype); nfsm_chain_get_32(error, nmc, vmode); @@ -1241,6 +1336,12 @@ 
nfs_loadattrcache( vnode_t vp; struct timeval now; struct nfs_vattr *npnvap; + int xattr = np->n_vattr.nva_flags & NFS_FFLAG_IS_ATTR; + int referral = np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL; + int aclbit, monitored, error = 0; + kauth_acl_t acl; + struct nfsmount *nmp; + uint32_t events = np->n_events; if (np->n_hflag & NHINIT) { vp = NULL; @@ -1249,10 +1350,11 @@ nfs_loadattrcache( vp = NFSTOV(np); mp = vnode_mount(vp); } + monitored = vp ? vnode_ismonitored(vp) : 0; FSDBG_TOP(527, np, vp, *xidp >> 32, *xidp); - if (!VFSTONFS(mp)) { + if (!((nmp = VFSTONFS(mp)))) { FSDBG_BOT(527, ENXIO, 1, 0, *xidp); return (ENXIO); } @@ -1298,16 +1400,133 @@ nfs_loadattrcache( */ printf("nfs loadattrcache vnode changed type, was %d now %d\n", vnode_vtype(vp), nvap->nva_type); - FSDBG_BOT(527, ESTALE, 3, 0, *xidp); - return (ESTALE); + error = ESTALE; + if (monitored) + events |= VNODE_EVENT_DELETE; + goto out; } + npnvap = &np->n_vattr; + + /* + * The ACL cache needs special handling because it is not + * always updated. Save current ACL cache state so it can + * be restored after copying the new attributes into place. + */ + aclbit = NFS_BITMAP_ISSET(npnvap->nva_bitmap, NFS_FATTR_ACL); + acl = npnvap->nva_acl; + + if (monitored) { + /* + * For monitored nodes, check for attribute changes that should generate events. 
+ */ + if (NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_NUMLINKS) && + (nvap->nva_nlink != npnvap->nva_nlink)) + events |= VNODE_EVENT_ATTRIB | VNODE_EVENT_LINK; + if (events & VNODE_EVENT_PERMS) + /* no need to do all the checking if it's already set */; + else if (NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_MODE) && + (nvap->nva_mode != npnvap->nva_mode)) + events |= VNODE_EVENT_ATTRIB | VNODE_EVENT_PERMS; + else if (NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_OWNER) && + (nvap->nva_uid != npnvap->nva_uid)) + events |= VNODE_EVENT_ATTRIB | VNODE_EVENT_PERMS; + else if (NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_OWNER_GROUP) && + (nvap->nva_gid != npnvap->nva_gid)) + events |= VNODE_EVENT_ATTRIB | VNODE_EVENT_PERMS; + else if (nmp->nm_vers >= NFS_VER4) { + if (NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_OWNER) && + !kauth_guid_equal(&nvap->nva_uuuid, &npnvap->nva_uuuid)) + events |= VNODE_EVENT_ATTRIB | VNODE_EVENT_PERMS; + else if (NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_OWNER_GROUP) && + !kauth_guid_equal(&nvap->nva_guuid, &npnvap->nva_guuid)) + events |= VNODE_EVENT_ATTRIB | VNODE_EVENT_PERMS; + else if ((NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_ACL) && + nvap->nva_acl && npnvap->nva_acl && + ((nvap->nva_acl->acl_entrycount != npnvap->nva_acl->acl_entrycount) || + bcmp(nvap->nva_acl, npnvap->nva_acl, KAUTH_ACL_COPYSIZE(nvap->nva_acl))))) + events |= VNODE_EVENT_ATTRIB | VNODE_EVENT_PERMS; + } + if (((nmp->nm_vers >= NFS_VER4) && (nvap->nva_change != npnvap->nva_change)) || + (NFS_BITMAP_ISSET(npnvap->nva_bitmap, NFS_FATTR_TIME_MODIFY) && + ((nvap->nva_timesec[NFSTIME_MODIFY] != npnvap->nva_timesec[NFSTIME_MODIFY]) || + (nvap->nva_timensec[NFSTIME_MODIFY] != npnvap->nva_timensec[NFSTIME_MODIFY])))) + events |= VNODE_EVENT_ATTRIB | VNODE_EVENT_WRITE; + if (!events && NFS_BITMAP_ISSET(npnvap->nva_bitmap, NFS_FATTR_RAWDEV) && + ((nvap->nva_rawdev.specdata1 != npnvap->nva_rawdev.specdata1) || + (nvap->nva_rawdev.specdata2 != 
npnvap->nva_rawdev.specdata2))) + events |= VNODE_EVENT_ATTRIB; + if (!events && NFS_BITMAP_ISSET(npnvap->nva_bitmap, NFS_FATTR_FILEID) && + (nvap->nva_fileid != npnvap->nva_fileid)) + events |= VNODE_EVENT_ATTRIB; + if (!events && NFS_BITMAP_ISSET(npnvap->nva_bitmap, NFS_FATTR_ARCHIVE) && + ((nvap->nva_flags & NFS_FFLAG_ARCHIVED) != (npnvap->nva_flags & NFS_FFLAG_ARCHIVED))) + events |= VNODE_EVENT_ATTRIB; + if (!events && NFS_BITMAP_ISSET(npnvap->nva_bitmap, NFS_FATTR_HIDDEN) && + ((nvap->nva_flags & NFS_FFLAG_HIDDEN) != (npnvap->nva_flags & NFS_FFLAG_HIDDEN))) + events |= VNODE_EVENT_ATTRIB; + if (!events && NFS_BITMAP_ISSET(npnvap->nva_bitmap, NFS_FATTR_TIME_CREATE) && + ((nvap->nva_timesec[NFSTIME_CREATE] != npnvap->nva_timesec[NFSTIME_CREATE]) || + (nvap->nva_timensec[NFSTIME_CREATE] != npnvap->nva_timensec[NFSTIME_CREATE]))) + events |= VNODE_EVENT_ATTRIB; + if (!events && NFS_BITMAP_ISSET(npnvap->nva_bitmap, NFS_FATTR_TIME_BACKUP) && + ((nvap->nva_timesec[NFSTIME_BACKUP] != npnvap->nva_timesec[NFSTIME_BACKUP]) || + (nvap->nva_timensec[NFSTIME_BACKUP] != npnvap->nva_timensec[NFSTIME_BACKUP]))) + events |= VNODE_EVENT_ATTRIB; + } + + /* Copy the attributes to the attribute cache */ + bcopy((caddr_t)nvap, (caddr_t)npnvap, sizeof(*nvap)); + microuptime(&now); np->n_attrstamp = now.tv_sec; np->n_xid = *xidp; + /* NFS_FFLAG_IS_ATTR and NFS_FFLAG_TRIGGER_REFERRAL need to be sticky... 
*/ + if (vp && xattr) + nvap->nva_flags |= xattr; + if (vp && referral) + nvap->nva_flags |= referral; + + if (NFS_BITMAP_ISSET(npnvap->nva_bitmap, NFS_FATTR_ACL)) { + /* we're updating the ACL */ + if (nvap->nva_acl) { + /* make a copy of the acl for the cache */ + npnvap->nva_acl = kauth_acl_alloc(nvap->nva_acl->acl_entrycount); + if (npnvap->nva_acl) { + bcopy(nvap->nva_acl, npnvap->nva_acl, KAUTH_ACL_COPYSIZE(nvap->nva_acl)); + } else { + /* can't make a copy to cache, invalidate ACL cache */ + NFS_BITMAP_CLR(npnvap->nva_bitmap, NFS_FATTR_ACL); + NACLINVALIDATE(np); + aclbit = 0; + } + } + if (acl) { + kauth_acl_free(acl); + acl = NULL; + } + } + if (NFS_BITMAP_ISSET(npnvap->nva_bitmap, NFS_FATTR_ACL)) { + /* update the ACL timestamp */ + np->n_aclstamp = now.tv_sec; + } else { + /* we aren't updating the ACL, so restore original values */ + if (aclbit) + NFS_BITMAP_SET(npnvap->nva_bitmap, NFS_FATTR_ACL); + npnvap->nva_acl = acl; + } - npnvap = &np->n_vattr; - bcopy((caddr_t)nvap, (caddr_t)npnvap, sizeof(*nvap)); +#if CONFIG_TRIGGERS + /* + * For NFSv4, if the fsid doesn't match the fsid for the mount, then + * this node is for a different file system on the server. So we mark + * this node as a trigger node that will trigger the mirror mount. 
+ */ + if ((nmp->nm_vers >= NFS_VER4) && (nvap->nva_type == VDIR) && + ((np->n_vattr.nva_fsid.major != nmp->nm_fsid.major) || + (np->n_vattr.nva_fsid.minor != nmp->nm_fsid.minor))) + np->n_vattr.nva_flags |= NFS_FFLAG_TRIGGER; +#endif if (!vp || (nvap->nva_type != VREG)) { np->n_size = nvap->nva_size; @@ -1332,6 +1551,8 @@ nfs_loadattrcache( */ np->n_newsize = nvap->nva_size; SET(np->n_flag, NUPDATESIZE); + if (monitored) + events |= VNODE_EVENT_ATTRIB | VNODE_EVENT_EXTEND; } } @@ -1346,8 +1567,11 @@ nfs_loadattrcache( } } - FSDBG_BOT(527, 0, np, np->n_size, *xidp); - return (0); +out: + if (monitored && events) + nfs_vnode_notify(np, events); + FSDBG_BOT(527, error, np, np->n_size, *xidp); + return (error); } /* @@ -1359,16 +1583,22 @@ nfs_attrcachetimeout(nfsnode_t np) { struct nfsmount *nmp; struct timeval now; - int isdir, timeo; + int isdir; + uint32_t timeo; if (!(nmp = NFSTONMP(np))) return (0); isdir = vnode_isdir(NFSTOV(np)); - if ((np)->n_flag & NMODIFIED) + if ((nmp->nm_vers >= NFS_VER4) && (np->n_openflags & N_DELEG_MASK)) { + /* If we have a delegation, we always use the max timeout. */ + timeo = isdir ? nmp->nm_acdirmax : nmp->nm_acregmax; + } else if ((np)->n_flag & NMODIFIED) { + /* If we have modifications, we always use the min timeout. */ timeo = isdir ? nmp->nm_acdirmin : nmp->nm_acregmin; - else { + } else { + /* Otherwise, we base the timeout on how old the file seems. */ /* Note that if the client and server clocks are way out of sync, */ /* timeout will probably get clamped to a min or max value */ microtime(&now); @@ -1396,26 +1626,32 @@ nfs_attrcachetimeout(nfsnode_t np) * Must be called with the node locked. */ int -nfs_getattrcache(nfsnode_t np, struct nfs_vattr *nvaper) +nfs_getattrcache(nfsnode_t np, struct nfs_vattr *nvaper, int flags) { struct nfs_vattr *nvap; struct timeval nowup; int32_t timeo; - if (!NATTRVALID(np)) { + /* Check if the attributes are valid. 
*/ + if (!NATTRVALID(np) || ((flags & NGA_ACL) && !NACLVALID(np))) { FSDBG(528, np, 0, 0xffffff01, ENOENT); OSAddAtomic(1, &nfsstats.attrcache_misses); return (ENOENT); } + /* Verify the cached attributes haven't timed out. */ timeo = nfs_attrcachetimeout(np); - microuptime(&nowup); if ((nowup.tv_sec - np->n_attrstamp) >= timeo) { FSDBG(528, np, 0, 0xffffff02, ENOENT); OSAddAtomic(1, &nfsstats.attrcache_misses); return (ENOENT); } + if ((flags & NGA_ACL) && ((nowup.tv_sec - np->n_aclstamp) >= timeo)) { + FSDBG(528, np, 0, 0xffffff02, ENOENT); + OSAddAtomic(1, &nfsstats.attrcache_misses); + return (ENOENT); + } nvap = &np->n_vattr; FSDBG(528, np, nvap->nva_size, np->n_size, 0xcace); @@ -1451,9 +1687,257 @@ nfs_getattrcache(nfsnode_t np, struct nfs_vattr *nvaper) nvaper->nva_timensec[NFSTIME_MODIFY] = np->n_mtim.tv_nsec; } } + if (nvap->nva_acl) { + if (flags & NGA_ACL) { + nvaper->nva_acl = kauth_acl_alloc(nvap->nva_acl->acl_entrycount); + if (!nvaper->nva_acl) + return (ENOMEM); + bcopy(nvap->nva_acl, nvaper->nva_acl, KAUTH_ACL_COPYSIZE(nvap->nva_acl)); + } else { + nvaper->nva_acl = NULL; + } + } return (0); } +/* + * When creating file system objects: + * Don't bother setting UID if it's the same as the credential performing the create. + * Don't bother setting GID if it's the same as the directory or credential. + */ +void +nfs_avoid_needless_id_setting_on_create(nfsnode_t dnp, struct vnode_attr *vap, vfs_context_t ctx) +{ + if (VATTR_IS_ACTIVE(vap, va_uid)) { + if (kauth_cred_getuid(vfs_context_ucred(ctx)) == vap->va_uid) { + VATTR_CLEAR_ACTIVE(vap, va_uid); + VATTR_CLEAR_ACTIVE(vap, va_uuuid); + } + } + if (VATTR_IS_ACTIVE(vap, va_gid)) { + if ((vap->va_gid == dnp->n_vattr.nva_gid) || + (kauth_cred_getgid(vfs_context_ucred(ctx)) == vap->va_gid)) { + VATTR_CLEAR_ACTIVE(vap, va_gid); + VATTR_CLEAR_ACTIVE(vap, va_guuid); + } + } +} + +/* + * Convert a universal address string to a sockaddr structure. 
+ * + * Universal addresses can be in the following formats: + * + * d = decimal (IPv4) + * x = hexadecimal (IPv6) + * p = port (decimal) + * + * d.d.d.d + * d.d.d.d.p.p + * x:x:x:x:x:x:x:x + * x:x:x:x:x:x:x:x.p.p + * x:x:x:x:x:x:d.d.d.d + * x:x:x:x:x:x:d.d.d.d.p.p + * + * IPv6 strings can also have a series of zeroes elided + * IPv6 strings can also have a %scope suffix at the end (after any port) + * + * rules & exceptions: + * - value before : is hex + * - value before . is dec + * - once . hit, all values are dec + * - hex+port case means value before first dot is actually hex + * - . is always preceded by digits except if last hex was double-colon + * + * scan, converting #s to bytes + * first time a . is encountered, scan the rest to count them. + * 2 dots = just port + * 3 dots = just IPv4 no port + * 5 dots = IPv4 and port + */ + +#define IS_DIGIT(C) \ + (((C) >= '0') && ((C) <= '9')) + +#define IS_XDIGIT(C) \ + (IS_DIGIT(C) || \ + (((C) >= 'A') && ((C) <= 'F')) || \ + (((C) >= 'a') && ((C) <= 'f'))) + +int +nfs_uaddr2sockaddr(const char *uaddr, struct sockaddr *addr) +{ + const char *p, *pd; /* pointers to current character in scan */ + const char *pnum; /* pointer to current number to decode */ + const char *pscope; /* pointer to IPv6 scope ID */ + uint8_t a[18]; /* octet array to store address bytes */ + int i; /* index of next octet to decode */ + int dci; /* index of octet to insert double-colon zeroes */ + int dcount, xdcount; /* count of digits in current number */ + int needmore; /* set when we know we need more input (e.g. after colon, period) */ + int dots; /* # of dots */ + int hex; /* contains hex values */ + unsigned long val; /* decoded value */ + int s; /* index used for sliding array to insert elided zeroes */ + +#define HEXVALUE 0 +#define DECIMALVALUE 1 +#define GET(TYPE) \ + do { \ + if ((dcount <= 0) || (dcount > (((TYPE) == DECIMALVALUE) ? 
3 : 4))) \ + return (0); \ + if (((TYPE) == DECIMALVALUE) && xdcount) \ + return (0); \ + val = strtoul(pnum, NULL, ((TYPE) == DECIMALVALUE) ? 10 : 16); \ + if (((TYPE) == DECIMALVALUE) && (val >= 256)) \ + return (0); \ + /* check if there is room left in the array */ \ + if (i > (int)(sizeof(a) - (((TYPE) == HEXVALUE) ? 2 : 1) - ((dci != -1) ? 2 : 0))) \ + return (0); \ + if ((TYPE) == HEXVALUE) \ + a[i++] = ((val >> 8) & 0xff); \ + a[i++] = (val & 0xff); \ + } while (0) + + hex = 0; + dots = 0; + dci = -1; + i = dcount = xdcount = 0; + pnum = p = uaddr; + pscope = NULL; + needmore = 1; + if ((*p == ':') && (*++p != ':')) /* if it starts with colon, gotta be a double */ + return (0); + + while (*p) { + if (IS_XDIGIT(*p)) { + dcount++; + if (!IS_DIGIT(*p)) + xdcount++; + needmore = 0; + p++; + } else if (*p == '.') { + /* rest is decimal IPv4 dotted quad and/or port */ + if (!dots) { + /* this is the first, so count them */ + for (pd = p; *pd; pd++) { + if (*pd == '.') { + if (++dots > 5) + return (0); + } else if (hex && (*pd == '%')) { + break; + } else if ((*pd < '0') || (*pd > '9')) { + return (0); + } + } + if ((dots != 2) && (dots != 3) && (dots != 5)) + return (0); + if (hex && (dots == 2)) { /* hex+port */ + if (!dcount && needmore) + return (0); + if (dcount) /* last hex may be elided zero */ + GET(HEXVALUE); + } else { + GET(DECIMALVALUE); + } + } else { + GET(DECIMALVALUE); + } + dcount = xdcount = 0; + needmore = 1; + pnum = ++p; + } else if (*p == ':') { + hex = 1; + if (dots) + return (0); + if (!dcount) { /* missing number, probably double colon */ + if (dci >= 0) /* can only have one double colon */ + return (0); + dci = i; + needmore = 0; + } else { + GET(HEXVALUE); + dcount = xdcount = 0; + needmore = 1; + } + pnum = ++p; + } else if (*p == '%') { /* scope ID delimiter */ + if (!hex) + return (0); + p++; + pscope = p; + break; + } else { /* unexpected character */ + return (0); + } + } + if (needmore && !dcount) + return (0); + if (dcount) /* 
decode trailing number */ + GET(dots ? DECIMALVALUE : HEXVALUE); + if (dci >= 0) { /* got a double-colon at i, need to insert a range of zeroes */ + /* if we got a port, slide to end of array */ + /* otherwise, slide to end of address (non-port) values */ + int end = ((dots == 2) || (dots == 5)) ? sizeof(a) : (sizeof(a) - 2); + if (i % 2) /* length of zero range must be multiple of 2 */ + return (0); + if (i >= end) /* no room? */ + return (0); + /* slide (i-dci) numbers up from index dci */ + for (s=0; s < (i - dci); s++) + a[end-1-s] = a[i-1-s]; + /* zero (end-i) numbers at index dci */ + for (s=0; s < (end - i); s++) + a[dci+s] = 0; + i = end; + } + + /* copy out resulting socket address */ + if (hex) { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6*)addr; + if ((((dots == 0) || (dots == 3)) && (i != (sizeof(a)-2)))) + return (0); + if ((((dots == 2) || (dots == 5)) && (i != sizeof(a)))) + return (0); + bzero(sin6, sizeof(struct sockaddr_in6)); + sin6->sin6_len = sizeof(struct sockaddr_in6); + sin6->sin6_family = AF_INET6; + bcopy(a, &sin6->sin6_addr.s6_addr, sizeof(struct in6_addr)); + if ((dots == 5) || (dots == 2)) + sin6->sin6_port = htons((a[16] << 8) | a[17]); + if (pscope) { + for (p=pscope; IS_DIGIT(*p); p++) + ; + if (*p && !IS_DIGIT(*p)) { /* name */ + ifnet_t interface = NULL; + if (ifnet_find_by_name(pscope, &interface) == 0) + sin6->sin6_scope_id = ifnet_index(interface); + if (interface) + ifnet_release(interface); + } else { /* decimal number */ + sin6->sin6_scope_id = strtoul(pscope, NULL, 10); + } + /* XXX should we also embed scope id for linklocal? 
*/ + } + } else { + struct sockaddr_in *sin = (struct sockaddr_in*)addr; + if ((dots != 3) && (dots != 5)) + return (0); + if ((dots == 3) && (i != 4)) + return (0); + if ((dots == 5) && (i != 6)) + return (0); + bzero(sin, sizeof(struct sockaddr_in)); + sin->sin_len = sizeof(struct sockaddr_in); + sin->sin_family = AF_INET; + bcopy(a, &sin->sin_addr.s_addr, sizeof(struct in_addr)); + if (dots == 5) + sin->sin_port = htons((a[4] << 8) | a[5]); + } + return (1); +} + + #endif /* NFSCLIENT */ /* @@ -1478,8 +1962,7 @@ int nfsrv_free_netopt(struct radix_node *, void *); int nfsrv_free_addrlist(struct nfs_export *, struct user_nfs_export_args *); struct nfs_export_options *nfsrv_export_lookup(struct nfs_export *, mbuf_t); struct nfs_export *nfsrv_fhtoexport(struct nfs_filehandle *); -int nfsrv_cmp_sockaddr(struct sockaddr_storage *, struct sockaddr_storage *); -struct nfs_user_stat_node *nfsrv_get_user_stat_node(struct nfs_active_user_list *, struct sockaddr_storage *, uid_t); +struct nfs_user_stat_node *nfsrv_get_user_stat_node(struct nfs_active_user_list *, struct sockaddr *, uid_t); void nfsrv_init_user_list(struct nfs_active_user_list *); void nfsrv_free_user_list(struct nfs_active_user_list *); @@ -1939,7 +2422,6 @@ nfsrv_hang_addrlist(struct nfs_export *nx, struct user_nfs_export_args *unxa) unsigned int net; user_addr_t uaddr; kauth_cred_t cred; - struct ucred temp_cred; uaddr = unxa->nxa_nets; for (net = 0; net < unxa->nxa_netcount; net++, uaddr += sizeof(nxna)) { @@ -1948,12 +2430,13 @@ nfsrv_hang_addrlist(struct nfs_export *nx, struct user_nfs_export_args *unxa) return (error); if (nxna.nxna_flags & (NX_MAPROOT|NX_MAPALL)) { - bzero(&temp_cred, sizeof(temp_cred)); - temp_cred.cr_uid = nxna.nxna_cred.cr_uid; - temp_cred.cr_ngroups = nxna.nxna_cred.cr_ngroups; + struct posix_cred temp_pcred; + bzero(&temp_pcred, sizeof(temp_pcred)); + temp_pcred.cr_uid = nxna.nxna_cred.cr_uid; + temp_pcred.cr_ngroups = nxna.nxna_cred.cr_ngroups; for (i=0; i < 
nxna.nxna_cred.cr_ngroups && i < NGROUPS; i++) - temp_cred.cr_groups[i] = nxna.nxna_cred.cr_groups[i]; - cred = kauth_cred_create(&temp_cred); + temp_pcred.cr_groups[i] = nxna.nxna_cred.cr_groups[i]; + cred = posix_cred_create(&temp_pcred); if (!IS_VALID_CRED(cred)) return (ENOMEM); } else { @@ -2035,13 +2518,34 @@ nfsrv_hang_addrlist(struct nfs_export *nx, struct user_nfs_export_args *unxa) if (cred == cred2) { /* creds are same (or both NULL) */ matched = 1; - } else if (cred && cred2 && (cred->cr_uid == cred2->cr_uid) && - (cred->cr_ngroups == cred2->cr_ngroups)) { - for (i=0; i < cred2->cr_ngroups && i < NGROUPS; i++) - if (cred->cr_groups[i] != cred2->cr_groups[i]) - break; - if (i >= cred2->cr_ngroups || i >= NGROUPS) - matched = 1; + } else if (cred && cred2 && (kauth_cred_getuid(cred) == kauth_cred_getuid(cred2))) { + /* + * Now compare the effective and + * supplementary groups... + * + * Note: This comparison, as written, + * does not correctly indicate that + * the groups are equivalent, since + * other than the first supplementary + * group, which is also the effective + * group, order on the remaining groups + * doesn't matter, and this is an + * ordered compare. 
+ */ + gid_t groups[NGROUPS]; + gid_t groups2[NGROUPS]; + int groupcount = NGROUPS; + int group2count = NGROUPS; + + if (!kauth_cred_getgroups(cred, groups, &groupcount) && + !kauth_cred_getgroups(cred2, groups2, &group2count) && + groupcount == group2count) { + for (i=0; i < group2count; i++) + if (groups[i] != groups2[i]) + break; + if (i >= group2count || i >= NGROUPS) + matched = 1; + } } } if (IS_VALID_CRED(cred)) @@ -2167,7 +2671,8 @@ void enablequotas(struct mount *mp, vfs_context_t ctx); // XXX int nfsrv_export(struct user_nfs_export_args *unxa, vfs_context_t ctx) { - int error = 0, pathlen; + int error = 0; + size_t pathlen; struct nfs_exportfs *nxfs, *nxfs2, *nxfs3; struct nfs_export *nx, *nx2, *nx3; struct nfs_filehandle nfh; @@ -2179,10 +2684,10 @@ nfsrv_export(struct user_nfs_export_args *unxa, vfs_context_t ctx) if (unxa->nxa_flags == NXA_CHECK) { /* just check if the path is an NFS-exportable file system */ - error = copyinstr(unxa->nxa_fspath, path, MAXPATHLEN, (size_t *)&pathlen); + error = copyinstr(unxa->nxa_fspath, path, MAXPATHLEN, &pathlen); if (error) return (error); - NDINIT(&mnd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, + NDINIT(&mnd, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx); error = namei(&mnd); if (error) @@ -2215,8 +2720,11 @@ nfsrv_export(struct user_nfs_export_args *unxa, vfs_context_t ctx) lck_rw_lock_exclusive(&nfsrv_export_rwlock); while ((nxfs = LIST_FIRST(&nfsrv_exports))) { mp = vfs_getvfs_by_mntonname(nxfs->nxfs_path); - if (mp) + if (mp) { vfs_clearflags(mp, MNT_EXPORTED); + mount_iterdrop(mp); + mp = NULL; + } /* delete all exports on this file system */ while ((nx = LIST_FIRST(&nxfs->nxfs_exports))) { LIST_REMOVE(nx, nx_next); @@ -2245,7 +2753,7 @@ nfsrv_export(struct user_nfs_export_args *unxa, vfs_context_t ctx) return (0); } - error = copyinstr(unxa->nxa_fspath, path, MAXPATHLEN, (size_t *)&pathlen); + error = copyinstr(unxa->nxa_fspath, path, MAXPATHLEN, &pathlen); 
if (error) return (error); @@ -2272,8 +2780,12 @@ nfsrv_export(struct user_nfs_export_args *unxa, vfs_context_t ctx) if ((unxa->nxa_flags & (NXA_ADD|NXA_OFFLINE)) == NXA_ADD) { /* if adding, verify that the mount is still what we expect */ mp = vfs_getvfs_by_mntonname(nxfs->nxfs_path); + if (mp) { + mount_ref(mp, 0); + mount_iterdrop(mp); + } /* find exported FS root vnode */ - NDINIT(&mnd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, + NDINIT(&mnd, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, UIO_SYSSPACE, CAST_USER_ADDR_T(nxfs->nxfs_path), ctx); error = namei(&mnd); if (error) @@ -2298,7 +2810,7 @@ nfsrv_export(struct user_nfs_export_args *unxa, vfs_context_t ctx) } /* find exported FS root vnode */ - NDINIT(&mnd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, + NDINIT(&mnd, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx); error = namei(&mnd); if (error) { @@ -2318,6 +2830,7 @@ nfsrv_export(struct user_nfs_export_args *unxa, vfs_context_t ctx) mvp = NULL; } else { mp = vnode_mount(mvp); + mount_ref(mp, 0); /* make sure the file system is NFS-exportable */ nfh.nfh_len = NFSV3_MAX_FID_SIZE; @@ -2366,7 +2879,7 @@ nfsrv_export(struct user_nfs_export_args *unxa, vfs_context_t ctx) } if (unxa->nxa_exppath) { - error = copyinstr(unxa->nxa_exppath, path, MAXPATHLEN, (size_t *)&pathlen); + error = copyinstr(unxa->nxa_exppath, path, MAXPATHLEN, &pathlen); if (error) goto out; LIST_FOREACH(nx, &nxfs->nxfs_exports, nx_next) { @@ -2483,6 +2996,9 @@ nfsrv_export(struct user_nfs_export_args *unxa, vfs_context_t ctx) vnode_get(xvp); } else { xnd.ni_cnd.cn_nameiop = LOOKUP; +#if CONFIG_TRIGGERS + xnd.ni_op = OP_LOOKUP; +#endif xnd.ni_cnd.cn_flags = LOCKLEAF; xnd.ni_pathlen = pathlen - 1; xnd.ni_cnd.cn_nameptr = xnd.ni_cnd.cn_pnbuf = path; @@ -2600,6 +3116,8 @@ nfsrv_export(struct user_nfs_export_args *unxa, vfs_context_t ctx) nameidone(&mnd); } unlock_out: + if (mp) + mount_drop(mp, 0); lck_rw_done(&nfsrv_export_rwlock); return 
(error); } @@ -2736,6 +3254,12 @@ nfsrv_fhtovp( /* find mount structure */ mp = vfs_getvfs_by_mntonname((*nxp)->nx_fs->nxfs_path); + if (mp) { + error = vfs_busy(mp, LK_NOWAIT); + mount_iterdrop(mp); + if (error) + mp = NULL; + } if (!mp) { /* * We have an export, but no mount? @@ -2746,6 +3270,7 @@ nfsrv_fhtovp( fidp = nfhp->nfh_fhp + sizeof(*nxh); error = VFS_FHTOVP(mp, nxh->nxh_fidlen, fidp, vpp, NULL); + vfs_unbusy(mp); if (error) return (error); /* vnode pointer should be good at this point or ... */ @@ -2863,46 +3388,6 @@ nfsrv_fhmatch(struct nfs_filehandle *fh1, struct nfs_filehandle *fh2) * Functions for dealing with active user lists */ -/* - * Compare address fields of two sockaddr_storage structures. - * Returns zero if they match. - */ -int -nfsrv_cmp_sockaddr(struct sockaddr_storage *sock1, struct sockaddr_storage *sock2) -{ - struct sockaddr_in *ipv4_sock1, *ipv4_sock2; - struct sockaddr_in6 *ipv6_sock1, *ipv6_sock2; - - /* check for valid parameters */ - if (sock1 == NULL || sock2 == NULL) - return 1; - - /* check address length */ - if (sock1->ss_len != sock2->ss_len) - return 1; - - /* Check address family */ - if (sock1->ss_family != sock2->ss_family) - return 1; - - if (sock1->ss_family == AF_INET) { - /* IPv4 */ - ipv4_sock1 = (struct sockaddr_in *)sock1; - ipv4_sock2 = (struct sockaddr_in *)sock2; - - if (!bcmp(&ipv4_sock1->sin_addr, &ipv4_sock2->sin_addr, sizeof(struct in_addr))) - return 0; - } else { - /* IPv6 */ - ipv6_sock1 = (struct sockaddr_in6 *)sock1; - ipv6_sock2 = (struct sockaddr_in6 *)sock2; - - if (!bcmp(&ipv6_sock1->sin6_addr, &ipv6_sock2->sin6_addr, sizeof(struct in6_addr))) - return 0; - } - return 1; -} - /* * Search the hash table for a user node with a matching IP address and uid field. * If found, the node's tm_last timestamp is updated and the node is returned. @@ -2913,7 +3398,7 @@ nfsrv_cmp_sockaddr(struct sockaddr_storage *sock1, struct sockaddr_storage *sock * The list's user_mutex lock MUST be held. 
*/ struct nfs_user_stat_node * -nfsrv_get_user_stat_node(struct nfs_active_user_list *list, struct sockaddr_storage *sock, uid_t uid) +nfsrv_get_user_stat_node(struct nfs_active_user_list *list, struct sockaddr *saddr, uid_t uid) { struct nfs_user_stat_node *unode; struct timeval now; @@ -2922,7 +3407,7 @@ nfsrv_get_user_stat_node(struct nfs_active_user_list *list, struct sockaddr_stor /* seach the hash table */ head = NFS_USER_STAT_HASH(list->user_hashtbl, uid); LIST_FOREACH(unode, head, hash_link) { - if (uid == unode->uid && nfsrv_cmp_sockaddr(sock, &unode->sock) == 0) { + if ((uid == unode->uid) && (nfs_sockaddr_cmp(saddr, (struct sockaddr*)&unode->sock) == 0)) { /* found matching node */ break; } @@ -2964,7 +3449,7 @@ nfsrv_get_user_stat_node(struct nfs_active_user_list *list, struct sockaddr_stor /* Initialize the node */ unode->uid = uid; - bcopy(sock, &unode->sock, sock->ss_len); + bcopy(saddr, &unode->sock, saddr->sa_len); microtime(&now); unode->ops = 0; unode->bytes_read = 0; @@ -2984,15 +3469,15 @@ nfsrv_update_user_stat(struct nfs_export *nx, struct nfsrv_descript *nd, uid_t u { struct nfs_user_stat_node *unode; struct nfs_active_user_list *ulist; - struct sockaddr_storage *sock_stor; + struct sockaddr *saddr; if ((!nfsrv_user_stat_enabled) || (!nx) || (!nd) || (!nd->nd_nam)) return; - sock_stor = (struct sockaddr_storage *)mbuf_data(nd->nd_nam); + saddr = (struct sockaddr *)mbuf_data(nd->nd_nam); /* check address family before going any further */ - if ((sock_stor->ss_family != AF_INET) && (sock_stor->ss_family != AF_INET6)) + if ((saddr->sa_family != AF_INET) && (saddr->sa_family != AF_INET6)) return; ulist = &nx->nx_user_list; @@ -3001,7 +3486,7 @@ nfsrv_update_user_stat(struct nfs_export *nx, struct nfsrv_descript *nd, uid_t u lck_mtx_lock(&ulist->user_mutex); /* get the user node */ - unode = nfsrv_get_user_stat_node(ulist, sock_stor, uid); + unode = nfsrv_get_user_stat_node(ulist, saddr, uid); if (!unode) { lck_mtx_unlock(&ulist->user_mutex); 
diff --git a/bsd/nfs/nfs_syscalls.c b/bsd/nfs/nfs_syscalls.c index c28eac76c..d6de219ba 100644 --- a/bsd/nfs/nfs_syscalls.c +++ b/bsd/nfs/nfs_syscalls.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -150,36 +150,40 @@ SYSCTL_NODE(_vfs_generic, OID_AUTO, nfs, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "nfs hing #if NFSCLIENT SYSCTL_NODE(_vfs_generic_nfs, OID_AUTO, client, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "nfs client hinge"); -SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, initialdowndelay, CTLFLAG_RW, &nfs_tprintf_initial_delay, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, nextdowndelay, CTLFLAG_RW, &nfs_tprintf_delay, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, iosize, CTLFLAG_RW, &nfs_iosize, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_cache_timeout, CTLFLAG_RW, &nfs_access_cache_timeout, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, allow_async, CTLFLAG_RW, &nfs_allow_async, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, statfs_rate_limit, CTLFLAG_RW, &nfs_statfs_rate_limit, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, nfsiod_thread_max, CTLFLAG_RW, &nfsiod_thread_max, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, nfsiod_thread_count, CTLFLAG_RD, &nfsiod_thread_count, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, lockd_mounts, CTLFLAG_RD, &nfs_lockd_mounts, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, max_async_writes, CTLFLAG_RW, &nfs_max_async_writes, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, single_des, CTLFLAG_RW, &nfs_single_des, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_delete, CTLFLAG_RW, &nfs_access_delete, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, initialdowndelay, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_tprintf_initial_delay, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, nextdowndelay, 
CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_tprintf_delay, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_iosize, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_cache_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_cache_timeout, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, allow_async, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_allow_async, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, statfs_rate_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_statfs_rate_limit, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, nfsiod_thread_max, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsiod_thread_max, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, nfsiod_thread_count, CTLFLAG_RD | CTLFLAG_LOCKED, &nfsiod_thread_count, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, lockd_mounts, CTLFLAG_RD | CTLFLAG_LOCKED, &nfs_lockd_mounts, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, max_async_writes, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_max_async_writes, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, single_des, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_single_des, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_delete, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_delete, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_dotzfs, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_dotzfs, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_for_getattr, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_for_getattr, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, idmap_ctrl, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_idmap_ctrl, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, callback_port, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_callback_port, 0, ""); #endif /* NFSCLIENT */ #if NFSSERVER SYSCTL_NODE(_vfs_generic_nfs, OID_AUTO, server, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "nfs server hinge"); -SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, wg_delay, CTLFLAG_RW, &nfsrv_wg_delay, 0, ""); 
-SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, wg_delay_v3, CTLFLAG_RW, &nfsrv_wg_delay_v3, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, require_resv_port, CTLFLAG_RW, &nfsrv_require_resv_port, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, async, CTLFLAG_RW, &nfsrv_async, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, export_hash_size, CTLFLAG_RW, &nfsrv_export_hash_size, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, reqcache_size, CTLFLAG_RW, &nfsrv_reqcache_size, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, request_queue_length, CTLFLAG_RW, &nfsrv_sock_max_rec_queue_length, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, user_stats, CTLFLAG_RW, &nfsrv_user_stat_enabled, 0, ""); -SYSCTL_UINT(_vfs_generic_nfs_server, OID_AUTO, gss_context_ttl, CTLFLAG_RW, &nfsrv_gss_context_ttl, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, wg_delay, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_wg_delay, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, wg_delay_v3, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_wg_delay_v3, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, require_resv_port, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_require_resv_port, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, async, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_async, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, export_hash_size, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_export_hash_size, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, reqcache_size, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_reqcache_size, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, request_queue_length, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_sock_max_rec_queue_length, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, user_stats, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_user_stat_enabled, 0, ""); +SYSCTL_UINT(_vfs_generic_nfs_server, OID_AUTO, gss_context_ttl, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_gss_context_ttl, 0, ""); #if CONFIG_FSE 
-SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, fsevents, CTLFLAG_RW, &nfsrv_fsevents_enabled, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, fsevents, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_fsevents_enabled, 0, ""); #endif -SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, nfsd_thread_max, CTLFLAG_RW, &nfsd_thread_max, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, nfsd_thread_count, CTLFLAG_RD, &nfsd_thread_count, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, nfsd_thread_max, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsd_thread_max, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, nfsd_thread_count, CTLFLAG_RD | CTLFLAG_LOCKED, &nfsd_thread_count, 0, ""); #endif /* NFSSERVER */ @@ -191,11 +195,19 @@ nfsclnt(proc_t p, struct nfsclnt_args *uap, __unused int *retval) struct lockd_ans la; int error; - if (uap->flag == NFSCLNT_LOCKDANS) { + switch (uap->flag) { + case NFSCLNT_LOCKDANS: error = copyin(uap->argp, &la, sizeof(la)); - return (error != 0 ? error : nfslockdans(p, &la)); + if (!error) + error = nfslockdans(p, &la); + break; + case NFSCLNT_LOCKDNOTIFY: + error = nfslockdnotify(p, uap->argp); + break; + default: + error = EINVAL; } - return EINVAL; + return (error); } /* @@ -389,10 +401,10 @@ getfh(proc_t p, struct getfh_args *uap, __unused int *retval) { vnode_t vp; struct nfs_filehandle nfh; - int error; + int error, fhlen, fidlen; struct nameidata nd; char path[MAXPATHLEN], *ptr; - u_int pathlen; + size_t pathlen; struct nfs_exportfs *nxfs; struct nfs_export *nx; @@ -403,14 +415,20 @@ getfh(proc_t p, struct getfh_args *uap, __unused int *retval) if (error) return (error); - error = copyinstr(uap->fname, path, MAXPATHLEN, (size_t *)&pathlen); + error = copyinstr(uap->fname, path, MAXPATHLEN, &pathlen); + if (!error) + error = copyin(uap->fhp, &fhlen, sizeof(fhlen)); if (error) return (error); + /* limit fh size to length specified (or v3 size by default) */ + if ((fhlen != NFSV2_MAX_FH_SIZE) && (fhlen != NFSV3_MAX_FH_SIZE)) + fhlen = 
NFSV3_MAX_FH_SIZE; + fidlen = fhlen - sizeof(struct nfs_exphandle); if (!nfsrv_is_initialized()) return (EINVAL); - NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, UIO_SYSSPACE, CAST_USER_ADDR_T(path), vfs_context_current()); error = namei(&nd); if (error) @@ -452,9 +470,9 @@ getfh(proc_t p, struct getfh_args *uap, __unused int *retval) nfh.nfh_xh.nxh_expid = htonl(nx->nx_id); nfh.nfh_xh.nxh_flags = 0; nfh.nfh_xh.nxh_reserved = 0; - nfh.nfh_len = NFSV3_MAX_FID_SIZE; + nfh.nfh_len = fidlen; error = VFS_VPTOFH(vp, (int*)&nfh.nfh_len, &nfh.nfh_fid[0], NULL); - if (nfh.nfh_len > (int)NFSV3_MAX_FID_SIZE) + if (nfh.nfh_len > (uint32_t)fidlen) error = EOVERFLOW; nfh.nfh_xh.nxh_fidlen = nfh.nfh_len; nfh.nfh_len += sizeof(nfh.nfh_xh); @@ -465,7 +483,7 @@ getfh(proc_t p, struct getfh_args *uap, __unused int *retval) vnode_put(vp); if (error) return (error); - error = copyout((caddr_t)&nfh, uap->fhp, sizeof(nfh)); + error = copyout((caddr_t)&nfh, uap->fhp, sizeof(fhandle_t)); return (error); } @@ -564,7 +582,7 @@ fhopen( proc_t p, if ((error = VNOP_OPEN(vp, fmode, ctx))) goto bad; - if ((error = vnode_ref_ext(vp, fmode))) + if ((error = vnode_ref_ext(vp, fmode, 0))) goto bad; /* @@ -714,8 +732,12 @@ nfssvc_addsock(socket_t so, mbuf_t mynam) sock_gettype(so, &sodomain, &sotype, &soprotocol); - /* There should be only one UDP socket */ - if ((soprotocol == IPPROTO_UDP) && nfsrv_udpsock) { + /* There should be only one UDP socket for each of IPv4 and IPv6 */ + if ((sodomain == AF_INET) && (soprotocol == IPPROTO_UDP) && nfsrv_udpsock) { + mbuf_freem(mynam); + return (EEXIST); + } + if ((sodomain == AF_INET6) && (soprotocol == IPPROTO_UDP) && nfsrv_udp6sock) { mbuf_freem(mynam); return (EEXIST); } @@ -763,14 +785,26 @@ nfssvc_addsock(socket_t so, mbuf_t mynam) lck_mtx_lock(nfsd_mutex); if (soprotocol == IPPROTO_UDP) { - /* There should be only one UDP socket */ - if (nfsrv_udpsock) { - 
lck_mtx_unlock(nfsd_mutex); - nfsrv_slpfree(slp); - mbuf_freem(mynam); - return (EEXIST); + if (sodomain == AF_INET) { + /* There should be only one UDP/IPv4 socket */ + if (nfsrv_udpsock) { + lck_mtx_unlock(nfsd_mutex); + nfsrv_slpfree(slp); + mbuf_freem(mynam); + return (EEXIST); + } + nfsrv_udpsock = slp; + } + if (sodomain == AF_INET6) { + /* There should be only one UDP/IPv6 socket */ + if (nfsrv_udp6sock) { + lck_mtx_unlock(nfsd_mutex); + nfsrv_slpfree(slp); + mbuf_freem(mynam); + return (EEXIST); + } + nfsrv_udp6sock = slp; } - nfsrv_udpsock = slp; } /* add the socket to the list */ @@ -782,11 +816,7 @@ nfssvc_addsock(socket_t so, mbuf_t mynam) slp->ns_nam = mynam; /* set up the socket upcall */ - socket_lock(so, 1); - so->so_upcallarg = (caddr_t)slp; - so->so_upcall = nfsrv_rcv; - so->so_rcv.sb_flags |= SB_UPCALL; - socket_unlock(so, 1); + sock_setupcall(so, nfsrv_rcv, slp); /* just playin' it safe */ sock_setsockopt(so, SOL_SOCKET, SO_UPCALLCLOSEWAIT, &on, sizeof(on)); @@ -978,6 +1008,8 @@ nfssvc_nfsd(void) mbuf_freem(nd->nd_nam2); if (IS_VALID_CRED(nd->nd_cr)) kauth_cred_unref(&nd->nd_cr); + if (nd->nd_gss_context) + nfs_gss_svc_ctx_deref(nd->nd_gss_context); FREE_ZONE(nd, sizeof(*nd), M_NFSRVDESC); nd = NULL; } @@ -1000,21 +1032,17 @@ nfssvc_nfsd(void) if (nfsrv_require_resv_port) { /* Check if source port is a reserved port */ - u_short port; - struct sockaddr *nam = mbuf_data(nd->nd_nam); - struct sockaddr_in *sin; - - sin = (struct sockaddr_in *)nam; - port = ntohs(sin->sin_port); - if (port >= IPPORT_RESERVED && - nd->nd_procnum != NFSPROC_NULL) { - char strbuf[MAX_IPv4_STR_LEN]; + in_port_t port = 0; + struct sockaddr *saddr = mbuf_data(nd->nd_nam); + + if (saddr->sa_family == AF_INET) + port = ntohs(((struct sockaddr_in*)saddr)->sin_port); + else if (saddr->sa_family == AF_INET6) + port = ntohs(((struct sockaddr_in6*)saddr)->sin6_port); + if ((port >= IPPORT_RESERVED) && (nd->nd_procnum != NFSPROC_NULL)) { nd->nd_procnum = NFSPROC_NOOP; 
nd->nd_repstat = (NFSERR_AUTHERR | AUTH_TOOWEAK); cacherep = RC_DOIT; - printf("NFS request from unprivileged port (%s:%d)\n", - inet_ntop(AF_INET, &sin->sin_addr, strbuf, sizeof(strbuf)), - port); } } @@ -1130,6 +1158,8 @@ nfssvc_nfsd(void) nfsm_chain_cleanup(&nd->nd_nmreq); if (IS_VALID_CRED(nd->nd_cr)) kauth_cred_unref(&nd->nd_cr); + if (nd->nd_gss_context) + nfs_gss_svc_ctx_deref(nd->nd_gss_context); FREE_ZONE(nd, sizeof(*nd), M_NFSRVDESC); nfsrv_slpderef(slp); lck_mtx_lock(nfsd_mutex); @@ -1148,6 +1178,8 @@ nfssvc_nfsd(void) mbuf_freem(nd->nd_nam2); if (IS_VALID_CRED(nd->nd_cr)) kauth_cred_unref(&nd->nd_cr); + if (nd->nd_gss_context) + nfs_gss_svc_ctx_deref(nd->nd_gss_context); FREE_ZONE(nd, sizeof(*nd), M_NFSRVDESC); nd = NULL; } @@ -1294,6 +1326,8 @@ nfsrv_slpfree(struct nfsrv_sock *slp) mbuf_freem(nwp->nd_nam2); if (IS_VALID_CRED(nwp->nd_cr)) kauth_cred_unref(&nwp->nd_cr); + if (nwp->nd_gss_context) + nfs_gss_svc_ctx_deref(nwp->nd_gss_context); FREE_ZONE(nwp, sizeof(*nwp), M_NFSRVDESC); } LIST_INIT(&slp->ns_tq); @@ -1455,10 +1489,12 @@ nfsrv_cleanup(void) * Fire off the content modified fsevent for each * entry, remove it from the list, and free it. */ - if (nfsrv_fsevents_enabled) + if (nfsrv_fsevents_enabled) { + fp->fm_context.vc_thread = current_thread(); add_fsevent(FSE_CONTENT_MODIFIED, &fp->fm_context, FSE_ARG_VNODE, fp->fm_vp, FSE_ARG_DONE); + } vnode_put(fp->fm_vp); kauth_cred_unref(&fp->fm_context.vc_ucred); nfp = LIST_NEXT(fp, fm_link); @@ -1475,6 +1511,7 @@ nfsrv_cleanup(void) nfsrv_cleancache(); /* And clear out server cache */ nfsrv_udpsock = NULL; + nfsrv_udp6sock = NULL; } #endif /* NFS_NOSERVER */ diff --git a/bsd/nfs/nfs_vfsops.c b/bsd/nfs/nfs_vfsops.c index e92c58cdf..7a0323fde 100644 --- a/bsd/nfs/nfs_vfsops.c +++ b/bsd/nfs/nfs_vfsops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -88,6 +88,7 @@ #include #include #include +#include #include #include @@ -159,21 +160,29 @@ int nfs_max_async_writes = NFS_DEFMAXASYNCWRITES; int nfs_iosize = NFS_IOSIZE; int nfs_access_cache_timeout = NFS_MAXATTRTIMO; -int nfs_access_delete = 0; +int nfs_access_delete = 1; /* too many servers get this wrong - workaround on by default */ +int nfs_access_dotzfs = 1; +int nfs_access_for_getattr = 0; int nfs_allow_async = 0; int nfs_statfs_rate_limit = NFS_DEFSTATFSRATELIMIT; int nfs_lockd_mounts = 0; int nfs_lockd_request_sent = 0; +int nfs_idmap_ctrl = NFS_IDMAP_CTRL_USE_IDMAP_SERVICE; +int nfs_callback_port = 0; int nfs_tprintf_initial_delay = NFS_TPRINTF_INITIAL_DELAY; int nfs_tprintf_delay = NFS_TPRINTF_DELAY; -int mountnfs(struct user_nfs_args *,mount_t,mbuf_t,vfs_context_t,vnode_t *); +int mountnfs(char *, mount_t, vfs_context_t, vnode_t *); static int nfs_mount_diskless(struct nfs_dlmount *, const char *, int, vnode_t *, mount_t *, vfs_context_t); #if !defined(NO_MOUNT_PRIVATE) static int nfs_mount_diskless_private(struct nfs_dlmount *, const char *, int, vnode_t *, mount_t *, vfs_context_t); #endif /* NO_MOUNT_PRIVATE */ +int nfs_mount_connect(struct nfsmount *); +void nfs_mount_cleanup(struct nfsmount *); +int nfs_mountinfo_assemble(struct nfsmount *, struct xdrbuf *); +int nfs4_mount_update_path_with_symlink(struct nfsmount *, struct nfs_fs_path *, uint32_t, fhandle_t *, int *, fhandle_t *, vfs_context_t); /* * NFS VFS operations. 
@@ -218,8 +227,8 @@ struct vfsops nfs_vfsops = { /* * version-specific NFS functions */ -int nfs3_mount(struct nfsmount *, vfs_context_t, struct user_nfs_args *, nfsnode_t *); -int nfs4_mount(struct nfsmount *, vfs_context_t, struct user_nfs_args *, nfsnode_t *); +int nfs3_mount(struct nfsmount *, vfs_context_t, nfsnode_t *); +int nfs4_mount(struct nfsmount *, vfs_context_t, nfsnode_t *); int nfs3_fsinfo(struct nfsmount *, nfsnode_t, vfs_context_t); int nfs3_update_statfs(struct nfsmount *, vfs_context_t); int nfs4_update_statfs(struct nfsmount *, vfs_context_t); @@ -247,7 +256,10 @@ struct nfs_funcs nfs3_funcs = { nfs3_lookup_rpc_async, nfs3_lookup_rpc_async_finish, nfs3_remove_rpc, - nfs3_rename_rpc + nfs3_rename_rpc, + nfs3_setlock_rpc, + nfs3_unlock_rpc, + nfs3_getlock_rpc }; struct nfs_funcs nfs4_funcs = { nfs4_mount, @@ -265,7 +277,10 @@ struct nfs_funcs nfs4_funcs = { nfs4_lookup_rpc_async, nfs4_lookup_rpc_async_finish, nfs4_remove_rpc, - nfs4_rename_rpc + nfs4_rename_rpc, + nfs4_setlock_rpc, + nfs4_unlock_rpc, + nfs4_getlock_rpc }; /* @@ -358,8 +373,7 @@ nfs3_update_statfs(struct nfsmount *nmp, vfs_context_t ctx) nfsm_chain_add_fh(error, &nmreq, nfsvers, np->n_fhp, np->n_fhsize); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); - error = nfs_request(np, NULL, &nmreq, NFSPROC_FSSTAT, ctx, - &nmrep, &xid, &status); + error = nfs_request(np, NULL, &nmreq, NFSPROC_FSSTAT, ctx, NULL, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; if (nfsvers == NFS_VER3) @@ -418,6 +432,7 @@ nfs4_update_statfs(struct nfsmount *nmp, vfs_context_t ctx) struct nfsm_chain nmreq, nmrep; uint32_t bitmap[NFS_ATTR_BITMAP_LEN]; struct nfs_vattr nvattr; + struct nfsreq_secinfo_args si; nfsvers = nmp->nm_vers; np = nmp->nm_dnp; @@ -426,6 +441,8 @@ nfs4_update_statfs(struct nfsmount *nmp, vfs_context_t ctx) if ((error = vnode_get(NFSTOV(np)))) return (error); + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); + NVATTR_INIT(&nvattr); 
nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -440,12 +457,11 @@ nfs4_update_statfs(struct nfsmount *nmp, vfs_context_t ctx) nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); NFS_COPY_ATTRIBUTES(nfs_getattr_bitmap, bitmap); NFS4_STATFS_ATTRIBUTES(bitmap); - nfsm_chain_add_bitmap_masked(error, &nmreq, bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, bitmap, nmp, np); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request(np, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &nmrep, &xid, &status); + error = nfs_request(np, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &si, &nmrep, &xid, &status); nfsm_chain_skip_tag(error, &nmrep); nfsm_chain_get_32(error, &nmrep, numops); nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); @@ -453,8 +469,7 @@ nfs4_update_statfs(struct nfsmount *nmp, vfs_context_t ctx) nfsm_assert(error, NFSTONMP(np), ENXIO); nfsmout_if(error); lck_mtx_lock(&nmp->nm_lock); - NFS_CLEAR_ATTRIBUTES(nvattr.nva_bitmap); - error = nfs4_parsefattr(&nmrep, &nmp->nm_fsattr, &nvattr, NULL, NULL); + error = nfs4_parsefattr(&nmrep, &nmp->nm_fsattr, &nvattr, NULL, NULL, NULL); lck_mtx_unlock(&nmp->nm_lock); nfsmout_if(error); if ((lockerror = nfs_node_lock(np))) @@ -467,6 +482,7 @@ nfs4_update_statfs(struct nfsmount *nmp, vfs_context_t ctx) nfsmout_if(error); nmp->nm_fsattr.nfsa_bsize = NFS_FABLKSIZE; nfsmout: + NVATTR_CLEANUP(&nvattr); nfsm_chain_cleanup(&nmreq); nfsm_chain_cleanup(&nmrep); vnode_put(NFSTOV(np)); @@ -605,6 +621,8 @@ nfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t ctx) caps |= VOL_CAP_FMT_HIDDEN_FILES; valid |= VOL_CAP_FMT_HIDDEN_FILES; // VOL_CAP_FMT_OPENDENYMODES +// caps |= VOL_CAP_FMT_OPENDENYMODES; +// valid |= VOL_CAP_FMT_OPENDENYMODES; } fsap->f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] = // VOL_CAP_FMT_PERSISTENTOBJECTIDS | @@ -655,10 +673,18 @@ nfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, 
vfs_context_t ctx) if (nfsvers >= NFS_VER4) { caps = VOL_CAP_INT_ADVLOCK | VOL_CAP_INT_FLOCK; valid = VOL_CAP_INT_ADVLOCK | VOL_CAP_INT_FLOCK; - // VOL_CAP_INT_EXTENDED_SECURITY - // VOL_CAP_INT_NAMEDSTREAMS - // VOL_CAP_INT_EXTENDED_ATTR - } else if ((nmp->nm_flag & NFSMNT_NOLOCKS)) { + if (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_ACL) + caps |= VOL_CAP_INT_EXTENDED_SECURITY; + valid |= VOL_CAP_INT_EXTENDED_SECURITY; + if (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR) + caps |= VOL_CAP_INT_EXTENDED_ATTR; + valid |= VOL_CAP_INT_EXTENDED_ATTR; +#if NAMEDSTREAMS + if (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR) + caps |= VOL_CAP_INT_NAMEDSTREAMS; + valid |= VOL_CAP_INT_NAMEDSTREAMS; +#endif + } else if (nmp->nm_lockmode == NFS_LOCK_MODE_DISABLED) { /* locks disabled on this mount, so they definitely won't work */ valid = VOL_CAP_INT_ADVLOCK | VOL_CAP_INT_FLOCK; } else if (nmp->nm_state & NFSSTA_LOCKSWORK) { @@ -681,6 +707,7 @@ nfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t ctx) // VOL_CAP_INT_MANLOCK | // VOL_CAP_INT_NAMEDSTREAMS | // VOL_CAP_INT_EXTENDED_ATTR | + VOL_CAP_INT_REMOTE_EVENT | caps; fsap->f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] = VOL_CAP_INT_SEARCHFS | @@ -698,6 +725,7 @@ nfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t ctx) // VOL_CAP_INT_MANLOCK | // VOL_CAP_INT_NAMEDSTREAMS | // VOL_CAP_INT_EXTENDED_ATTR | + VOL_CAP_INT_REMOTE_EVENT | valid; fsap->f_capabilities.capabilities[VOL_CAPABILITIES_RESERVED1] = 0; @@ -749,8 +777,7 @@ nfs3_fsinfo(struct nfsmount *nmp, nfsnode_t np, vfs_context_t ctx) nfsm_chain_add_fh(error, &nmreq, nmp->nm_vers, np->n_fhp, np->n_fhsize); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); - error = nfs_request(np, NULL, &nmreq, NFSPROC_FSINFO, ctx, - &nmrep, &xid, &status); + error = nfs_request(np, NULL, &nmreq, NFSPROC_FSINFO, ctx, NULL, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; nfsm_chain_postop_attr_update(error, &nmrep, np, 
&xid); @@ -770,7 +797,7 @@ nfs3_fsinfo(struct nfsmount *nmp, nfsnode_t np, vfs_context_t ctx) if (prefsize < nmp->nm_rsize) nmp->nm_rsize = (prefsize + NFS_FABLKSIZE - 1) & ~(NFS_FABLKSIZE - 1); - if (maxsize < nmp->nm_rsize) { + if ((maxsize > 0) && (maxsize < nmp->nm_rsize)) { nmp->nm_rsize = maxsize & ~(NFS_FABLKSIZE - 1); if (nmp->nm_rsize == 0) nmp->nm_rsize = maxsize; @@ -784,7 +811,7 @@ nfs3_fsinfo(struct nfsmount *nmp, nfsnode_t np, vfs_context_t ctx) if (prefsize < nmp->nm_wsize) nmp->nm_wsize = (prefsize + NFS_FABLKSIZE - 1) & ~(NFS_FABLKSIZE - 1); - if (maxsize < nmp->nm_wsize) { + if ((maxsize > 0) && (maxsize < nmp->nm_wsize)) { nmp->nm_wsize = maxsize & ~(NFS_FABLKSIZE - 1); if (nmp->nm_wsize == 0) nmp->nm_wsize = maxsize; @@ -793,10 +820,11 @@ nfs3_fsinfo(struct nfsmount *nmp, nfsnode_t np, vfs_context_t ctx) nfsm_chain_get_32(error, &nmrep, prefsize); nfsmout_if(error); - if (prefsize < nmp->nm_readdirsize) + if ((prefsize > 0) && (prefsize < nmp->nm_readdirsize)) nmp->nm_readdirsize = prefsize; - if (maxsize < nmp->nm_readdirsize) - nmp->nm_readdirsize = maxsize; + if ((nmp->nm_fsattr.nfsa_maxread > 0) && + (nmp->nm_fsattr.nfsa_maxread < nmp->nm_readdirsize)) + nmp->nm_readdirsize = nmp->nm_fsattr.nfsa_maxread; nfsm_chain_get_64(error, &nmrep, nmp->nm_fsattr.nfsa_maxfilesize); @@ -846,7 +874,6 @@ int nfs_mountroot(void) { struct nfs_diskless nd; - struct nfs_vattr nvattr; mount_t mp = NULL; vnode_t vp = NULL; vfs_context_t ctx; @@ -864,9 +891,9 @@ nfs_mountroot(void) */ bzero((caddr_t) &nd, sizeof(nd)); error = nfs_boot_init(&nd); - if (error) { - panic("nfs_boot_init failed with %d\n", error); - } + if (error) + panic("nfs_boot_init: unable to initialize NFS root system information, " + "error %d, check configuration: %s\n", error, PE_boot_args()); /* * Try NFSv3 first, then fallback to NFSv2. 
@@ -895,27 +922,29 @@ nfs_mountroot(void) } if (v3) { if (sotype == SOCK_STREAM) { - printf("nfs_boot_getfh(v3,TCP) failed with %d, trying UDP...\n", error); + printf("NFS mount (v3,TCP) failed with error %d, trying UDP...\n", error); sotype = SOCK_DGRAM; goto tryagain; } - printf("nfs_boot_getfh(v3,UDP) failed with %d, trying v2...\n", error); + printf("NFS mount (v3,UDP) failed with error %d, trying v2...\n", error); v3 = 0; sotype = SOCK_STREAM; goto tryagain; } else if (sotype == SOCK_STREAM) { - printf("nfs_boot_getfh(v2,TCP) failed with %d, trying UDP...\n", error); + printf("NFS mount (v2,TCP) failed with error %d, trying UDP...\n", error); sotype = SOCK_DGRAM; goto tryagain; + } else { + printf("NFS mount (v2,UDP) failed with error %d, giving up...\n", error); } switch(error) { case EPROGUNAVAIL: - panic("nfs_boot_getfh(v2,UDP) failed: NFS server mountd not responding - check server configuration: %s", PE_boot_args()); + panic("NFS mount failed: NFS server mountd not responding, check server configuration: %s", PE_boot_args()); case EACCES: case EPERM: - panic("nfs_boot_getfh(v2,UDP) failed: NFS server refused mount - check server configuration: %s", PE_boot_args()); + panic("NFS mount failed: NFS server refused mount, check server configuration: %s", PE_boot_args()); default: - panic("nfs_boot_getfh(v2,UDP) failed with %d: %s", error, PE_boot_args()); + panic("NFS mount failed with error %d, check configuration: %s", error, PE_boot_args()); } } @@ -943,20 +972,22 @@ nfs_mountroot(void) { if (v3) { if (sotype == SOCK_STREAM) { - printf("nfs_mount_diskless(v3,TCP) failed with %d, trying UDP...\n", error); + printf("NFS root mount (v3,TCP) failed with %d, trying UDP...\n", error); sotype = SOCK_DGRAM; goto tryagain; } - printf("nfs_mount_diskless(v3,UDP) failed with %d, trying v2...\n", error); + printf("NFS root mount (v3,UDP) failed with %d, trying v2...\n", error); v3 = 0; sotype = SOCK_STREAM; goto tryagain; } else if (sotype == SOCK_STREAM) { - 
printf("nfs_mount_diskless(v2,TCP) failed with %d, trying UDP...\n", error); + printf("NFS root mount (v2,TCP) failed with %d, trying UDP...\n", error); sotype = SOCK_DGRAM; goto tryagain; + } else { + printf("NFS root mount (v2,UDP) failed with error %d, giving up...\n", error); } - panic("nfs_mount_diskless(v2,UDP) root failed with %d: %s\n", error, PE_boot_args()); + panic("NFS root mount failed with error %d, check configuration: %s\n", error, PE_boot_args()); } } printf("root on %s\n", nd.nd_root.ndm_mntfrom); @@ -969,9 +1000,8 @@ nfs_mountroot(void) if (nd.nd_private.ndm_saddr.sin_addr.s_addr) { error = nfs_mount_diskless_private(&nd.nd_private, "/private", 0, &vppriv, &mppriv, ctx); - if (error) { - panic("nfs_mount_diskless private failed with %d\n", error); - } + if (error) + panic("NFS /private mount failed with error %d, check configuration: %s\n", error, PE_boot_args()); printf("private on %s\n", nd.nd_private.ndm_mntfrom); vfs_unbusy(mppriv); @@ -990,8 +1020,9 @@ nfs_mountroot(void) FREE_ZONE(nd.nd_private.ndm_path, MAXPATHLEN, M_NAMEI); /* Get root attributes (for the time). 
*/ - error = nfs_getattr(VTONFS(vp), &nvattr, ctx, NGA_UNCACHED); - if (error) panic("nfs_mountroot: getattr for root"); + error = nfs_getattr(VTONFS(vp), NULL, ctx, NGA_UNCACHED); + if (error) + panic("NFS mount: failed to get attributes for root directory, error %d, check server", error); return (0); } @@ -1007,13 +1038,18 @@ nfs_mount_diskless( mount_t *mpp, vfs_context_t ctx) { - struct user_nfs_args args; mount_t mp; - mbuf_t m; - int error; + int error, numcomps; + char *xdrbuf, *p, *cp, *frompath, *endserverp; + char uaddr[MAX_IPv4_STR_LEN]; + struct xdrbuf xb; + uint32_t mattrs[NFS_MATTR_BITMAP_LEN]; + uint32_t mflags_mask[NFS_MFLAG_BITMAP_LEN]; + uint32_t mflags[NFS_MFLAG_BITMAP_LEN]; + uint32_t argslength_offset, attrslength_offset, end_offset; if ((error = vfs_rootmountalloc("nfs", ndmntp->ndm_mntfrom, &mp))) { - printf("nfs_mount_diskless: NFS not configured"); + printf("nfs_mount_diskless: NFS not configured\n"); return (error); } @@ -1021,26 +1057,112 @@ nfs_mount_diskless( if (!(mntflag & MNT_RDONLY)) mp->mnt_flag &= ~MNT_RDONLY; - /* Initialize mount args. 
*/ - bzero((caddr_t) &args, sizeof(args)); - args.addr = CAST_USER_ADDR_T(&ndmntp->ndm_saddr); - args.addrlen = ndmntp->ndm_saddr.sin_len; - args.sotype = ndmntp->ndm_sotype; - args.fh = CAST_USER_ADDR_T(&ndmntp->ndm_fh[0]); - args.fhsize = ndmntp->ndm_fhlen; - args.hostname = CAST_USER_ADDR_T(ndmntp->ndm_mntfrom); - args.flags = NFSMNT_RESVPORT; - if (ndmntp->ndm_nfsv3) - args.flags |= NFSMNT_NFSV3; - - error = mbuf_get(MBUF_WAITOK, MBUF_TYPE_SONAME, &m); + /* find the server-side path being mounted */ + frompath = ndmntp->ndm_mntfrom; + if (*frompath == '[') { /* skip IPv6 literal address */ + while (*frompath && (*frompath != ']')) + frompath++; + if (*frompath == ']') + frompath++; + } + while (*frompath && (*frompath != ':')) + frompath++; + endserverp = frompath; + while (*frompath && (*frompath == ':')) + frompath++; + /* count fs location path components */ + p = frompath; + while (*p && (*p == '/')) + p++; + numcomps = 0; + while (*p) { + numcomps++; + while (*p && (*p != '/')) + p++; + while (*p && (*p == '/')) + p++; + } + + /* convert address to universal address string */ + if (inet_ntop(AF_INET, &ndmntp->ndm_saddr.sin_addr, uaddr, sizeof(uaddr)) != uaddr) { + printf("nfs_mount_diskless: bad address\n"); + return (EINVAL); + } + + /* prepare mount attributes */ + NFS_BITMAP_ZERO(mattrs, NFS_MATTR_BITMAP_LEN); + NFS_BITMAP_SET(mattrs, NFS_MATTR_NFS_VERSION); + NFS_BITMAP_SET(mattrs, NFS_MATTR_SOCKET_TYPE); + NFS_BITMAP_SET(mattrs, NFS_MATTR_NFS_PORT); + NFS_BITMAP_SET(mattrs, NFS_MATTR_FH); + NFS_BITMAP_SET(mattrs, NFS_MATTR_FS_LOCATIONS); + NFS_BITMAP_SET(mattrs, NFS_MATTR_MNTFLAGS); + + /* prepare mount flags */ + NFS_BITMAP_ZERO(mflags_mask, NFS_MFLAG_BITMAP_LEN); + NFS_BITMAP_ZERO(mflags, NFS_MFLAG_BITMAP_LEN); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_RESVPORT); + NFS_BITMAP_SET(mflags, NFS_MFLAG_RESVPORT); + + /* build xdr buffer */ + xb_init_buffer(&xb, NULL, 0); + xb_add_32(error, &xb, NFS_ARGSVERSION_XDR); + argslength_offset = xb_offset(&xb); + 
xb_add_32(error, &xb, 0); // args length + xb_add_32(error, &xb, NFS_XDRARGS_VERSION_0); + xb_add_bitmap(error, &xb, mattrs, NFS_MATTR_BITMAP_LEN); + attrslength_offset = xb_offset(&xb); + xb_add_32(error, &xb, 0); // attrs length + xb_add_32(error, &xb, ndmntp->ndm_nfsv3 ? 3 : 2); // NFS version + xb_add_string(error, &xb, ((ndmntp->ndm_sotype == SOCK_DGRAM) ? "udp" : "tcp"), 3); + xb_add_32(error, &xb, ntohs(ndmntp->ndm_saddr.sin_port)); // NFS port + xb_add_fh(error, &xb, &ndmntp->ndm_fh[0], ndmntp->ndm_fhlen); + /* fs location */ + xb_add_32(error, &xb, 1); /* fs location count */ + xb_add_32(error, &xb, 1); /* server count */ + xb_add_string(error, &xb, ndmntp->ndm_mntfrom, (endserverp - ndmntp->ndm_mntfrom)); /* server name */ + xb_add_32(error, &xb, 1); /* address count */ + xb_add_string(error, &xb, uaddr, strlen(uaddr)); /* address */ + xb_add_32(error, &xb, 0); /* empty server info */ + xb_add_32(error, &xb, numcomps); /* pathname component count */ + p = frompath; + while (*p && (*p == '/')) + p++; + while (*p) { + cp = p; + while (*p && (*p != '/')) + p++; + xb_add_string(error, &xb, cp, (p - cp)); /* component */ + if (error) + break; + while (*p && (*p == '/')) + p++; + } + xb_add_32(error, &xb, 0); /* empty fsl info */ + xb_add_32(error, &xb, mntflag); /* MNT flags */ + xb_build_done(error, &xb); + + /* update opaque counts */ + end_offset = xb_offset(&xb); + if (!error) { + error = xb_seek(&xb, argslength_offset); + xb_add_32(error, &xb, end_offset - argslength_offset + XDRWORD/*version*/); + } + if (!error) { + error = xb_seek(&xb, attrslength_offset); + xb_add_32(error, &xb, end_offset - attrslength_offset - XDRWORD/*don't include length field*/); + } if (error) { - printf("nfs_mount_diskless: mbuf_get(soname) failed"); + printf("nfs_mount_diskless: error %d assembling mount args\n", error); + xb_cleanup(&xb); return (error); } - mbuf_setlen(m, ndmntp->ndm_saddr.sin_len); - bcopy(&ndmntp->ndm_saddr, mbuf_data(m), ndmntp->ndm_saddr.sin_len); - if 
((error = mountnfs(&args, mp, m, ctx, vpp))) { + /* grab the assembled buffer */ + xdrbuf = xb_buffer_base(&xb); + xb.xb_flags &= ~XB_CLEANUP; + + /* do the mount */ + if ((error = mountnfs(xdrbuf, mp, ctx, vpp))) { printf("nfs_mountroot: mount %s failed: %d\n", mntname, error); // XXX vfs_rootmountfailed(mp); mount_list_lock(); @@ -1052,10 +1174,11 @@ nfs_mount_diskless( mac_mount_label_destroy(mp); #endif FREE_ZONE(mp, sizeof(struct mount), M_MOUNT); - return (error); + } else { + *mpp = mp; } - *mpp = mp; - return (0); + xb_cleanup(&xb); + return (error); } #if !defined(NO_MOUNT_PRIVATE) @@ -1072,16 +1195,21 @@ nfs_mount_diskless_private( mount_t *mpp, vfs_context_t ctx) { - struct user_nfs_args args; mount_t mp; - mbuf_t m; - int error; + int error, numcomps; proc_t procp; struct vfstable *vfsp; struct nameidata nd; vnode_t vp; + char *xdrbuf = NULL, *p, *cp, *frompath, *endserverp; + char uaddr[MAX_IPv4_STR_LEN]; + struct xdrbuf xb; + uint32_t mattrs[NFS_MATTR_BITMAP_LEN]; + uint32_t mflags_mask[NFS_MFLAG_BITMAP_LEN], mflags[NFS_MFLAG_BITMAP_LEN]; + uint32_t argslength_offset, attrslength_offset, end_offset; procp = current_proc(); /* XXX */ + xb_init(&xb, 0); { /* @@ -1107,7 +1235,7 @@ nfs_mount_diskless_private( /* * Get vnode to be covered */ - NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, + NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, CAST_USER_ADDR_T(mntname), ctx); if ((error = namei(&nd))) { printf("nfs_mountroot: private namei failed!\n"); @@ -1189,26 +1317,112 @@ nfs_mount_diskless_private( mac_mount_label_associate(ctx, mp); #endif - /* Initialize mount args. 
*/ - bzero((caddr_t) &args, sizeof(args)); - args.addr = CAST_USER_ADDR_T(&ndmntp->ndm_saddr); - args.addrlen = ndmntp->ndm_saddr.sin_len; - args.sotype = ndmntp->ndm_sotype; - args.fh = CAST_USER_ADDR_T(ndmntp->ndm_fh); - args.fhsize = ndmntp->ndm_fhlen; - args.hostname = CAST_USER_ADDR_T(ndmntp->ndm_mntfrom); - args.flags = NFSMNT_RESVPORT; - if (ndmntp->ndm_nfsv3) - args.flags |= NFSMNT_NFSV3; - - error = mbuf_get(MBUF_WAITOK, MBUF_TYPE_SONAME, &m); + /* find the server-side path being mounted */ + frompath = ndmntp->ndm_mntfrom; + if (*frompath == '[') { /* skip IPv6 literal address */ + while (*frompath && (*frompath != ']')) + frompath++; + if (*frompath == ']') + frompath++; + } + while (*frompath && (*frompath != ':')) + frompath++; + endserverp = frompath; + while (*frompath && (*frompath == ':')) + frompath++; + /* count fs location path components */ + p = frompath; + while (*p && (*p == '/')) + p++; + numcomps = 0; + while (*p) { + numcomps++; + while (*p && (*p != '/')) + p++; + while (*p && (*p == '/')) + p++; + } + + /* convert address to universal address string */ + if (inet_ntop(AF_INET, &ndmntp->ndm_saddr.sin_addr, uaddr, sizeof(uaddr)) != uaddr) { + printf("nfs_mountroot: bad address\n"); + error = EINVAL; + goto out; + } + + /* prepare mount attributes */ + NFS_BITMAP_ZERO(mattrs, NFS_MATTR_BITMAP_LEN); + NFS_BITMAP_SET(mattrs, NFS_MATTR_NFS_VERSION); + NFS_BITMAP_SET(mattrs, NFS_MATTR_SOCKET_TYPE); + NFS_BITMAP_SET(mattrs, NFS_MATTR_NFS_PORT); + NFS_BITMAP_SET(mattrs, NFS_MATTR_FH); + NFS_BITMAP_SET(mattrs, NFS_MATTR_FS_LOCATIONS); + NFS_BITMAP_SET(mattrs, NFS_MATTR_MNTFLAGS); + + /* prepare mount flags */ + NFS_BITMAP_ZERO(mflags_mask, NFS_MFLAG_BITMAP_LEN); + NFS_BITMAP_ZERO(mflags, NFS_MFLAG_BITMAP_LEN); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_RESVPORT); + NFS_BITMAP_SET(mflags, NFS_MFLAG_RESVPORT); + + /* build xdr buffer */ + xb_init_buffer(&xb, NULL, 0); + xb_add_32(error, &xb, NFS_ARGSVERSION_XDR); + argslength_offset = xb_offset(&xb); 
+ xb_add_32(error, &xb, 0); // args length + xb_add_32(error, &xb, NFS_XDRARGS_VERSION_0); + xb_add_bitmap(error, &xb, mattrs, NFS_MATTR_BITMAP_LEN); + attrslength_offset = xb_offset(&xb); + xb_add_32(error, &xb, 0); // attrs length + xb_add_32(error, &xb, ndmntp->ndm_nfsv3 ? 3 : 2); // NFS version + xb_add_string(error, &xb, ((ndmntp->ndm_sotype == SOCK_DGRAM) ? "udp" : "tcp"), 3); + xb_add_32(error, &xb, ntohs(ndmntp->ndm_saddr.sin_port)); // NFS port + xb_add_fh(error, &xb, &ndmntp->ndm_fh[0], ndmntp->ndm_fhlen); + /* fs location */ + xb_add_32(error, &xb, 1); /* fs location count */ + xb_add_32(error, &xb, 1); /* server count */ + xb_add_string(error, &xb, ndmntp->ndm_mntfrom, (endserverp - ndmntp->ndm_mntfrom)); /* server name */ + xb_add_32(error, &xb, 1); /* address count */ + xb_add_string(error, &xb, uaddr, strlen(uaddr)); /* address */ + xb_add_32(error, &xb, 0); /* empty server info */ + xb_add_32(error, &xb, numcomps); /* pathname component count */ + p = frompath; + while (*p && (*p == '/')) + p++; + while (*p) { + cp = p; + while (*p && (*p != '/')) + p++; + xb_add_string(error, &xb, cp, (p - cp)); /* component */ + if (error) + break; + while (*p && (*p == '/')) + p++; + } + xb_add_32(error, &xb, 0); /* empty fsl info */ + xb_add_32(error, &xb, mntflag); /* MNT flags */ + xb_build_done(error, &xb); + + /* update opaque counts */ + end_offset = xb_offset(&xb); + if (!error) { + error = xb_seek(&xb, argslength_offset); + xb_add_32(error, &xb, end_offset - argslength_offset + XDRWORD/*version*/); + } + if (!error) { + error = xb_seek(&xb, attrslength_offset); + xb_add_32(error, &xb, end_offset - attrslength_offset - XDRWORD/*don't include length field*/); + } if (error) { - printf("nfs_mount_diskless_private: mbuf_get(soname) failed"); + printf("nfs_mountroot: error %d assembling mount args\n", error); goto out; } - mbuf_setlen(m, ndmntp->ndm_saddr.sin_len); - bcopy(&ndmntp->ndm_saddr, mbuf_data(m), ndmntp->ndm_saddr.sin_len); - if ((error = 
mountnfs(&args, mp, m, ctx, &vp))) { + /* grab the assembled buffer */ + xdrbuf = xb_buffer_base(&xb); + xb.xb_flags &= ~XB_CLEANUP; + + /* do the mount */ + if ((error = mountnfs(xdrbuf, mp, ctx, &vp))) { printf("nfs_mountroot: mount %s failed: %d\n", mntname, error); mount_list_lock(); vfsp->vfc_refcount--; @@ -1225,63 +1439,65 @@ nfs_mount_diskless_private( *mpp = mp; *vpp = vp; out: + xb_cleanup(&xb); return (error); } #endif /* NO_MOUNT_PRIVATE */ /* - * VFS Operations. - * - * mount system call + * Convert old style NFS mount args to XDR. */ -int -nfs_vfs_mount(mount_t mp, vnode_t vp, user_addr_t data, vfs_context_t ctx) +static int +nfs_convert_old_nfs_args(mount_t mp, user_addr_t data, vfs_context_t ctx, int argsversion, int inkernel, char **xdrbufp) { - int error, argsvers; + int error = 0, args64bit, argsize, numcomps; struct user_nfs_args args; struct nfs_args tempargs; - mbuf_t nam; + caddr_t argsp; size_t len; - u_char nfh[NFSX_V3FHMAX]; - char *mntfrom; - - error = copyin(data, (caddr_t)&argsvers, sizeof (argsvers)); - if (error) - return (error); - - switch (argsvers) { + u_char nfh[NFS4_FHSIZE]; + char *mntfrom, *endserverp, *frompath, *p, *cp; + struct sockaddr_storage ss; + void *sinaddr; + char uaddr[MAX_IPv6_STR_LEN]; + uint32_t mattrs[NFS_MATTR_BITMAP_LEN]; + uint32_t mflags_mask[NFS_MFLAG_BITMAP_LEN], mflags[NFS_MFLAG_BITMAP_LEN]; + uint32_t nfsvers, nfslockmode = 0, argslength_offset, attrslength_offset, end_offset; + struct xdrbuf xb; + + *xdrbufp = NULL; + + /* allocate a temporary buffer for mntfrom */ + MALLOC_ZONE(mntfrom, char*, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (!mntfrom) + return (ENOMEM); + + args64bit = (inkernel || vfs_context_is64bit(ctx)); + argsp = args64bit ? (void*)&args : (void*)&tempargs; + + argsize = args64bit ? 
sizeof(args) : sizeof(tempargs); + switch (argsversion) { case 3: - if (vfs_context_is64bit(ctx)) - error = copyin(data, (caddr_t)&args, sizeof (struct user_nfs_args3)); - else - error = copyin(data, (caddr_t)&tempargs, sizeof (struct nfs_args3)); - break; + argsize -= NFS_ARGSVERSION4_INCSIZE; case 4: - if (vfs_context_is64bit(ctx)) - error = copyin(data, (caddr_t)&args, sizeof (struct user_nfs_args4)); - else - error = copyin(data, (caddr_t)&tempargs, sizeof (struct nfs_args4)); - break; + argsize -= NFS_ARGSVERSION5_INCSIZE; case 5: - if (vfs_context_is64bit(ctx)) - error = copyin(data, (caddr_t)&args, sizeof (struct user_nfs_args5)); - else - error = copyin(data, (caddr_t)&tempargs, sizeof (struct nfs_args5)); - break; + argsize -= NFS_ARGSVERSION6_INCSIZE; case 6: - if (vfs_context_is64bit(ctx)) - error = copyin(data, (caddr_t)&args, sizeof (args)); - else - error = copyin(data, (caddr_t)&tempargs, sizeof (tempargs)); break; default: - return (EPROGMISMATCH); + error = EPROGMISMATCH; + goto nfsmout; } - if (error) - return (error); - if (!vfs_context_is64bit(ctx)) { - args.version = tempargs.version; + /* read in the structure */ + if (inkernel) + bcopy(CAST_DOWN(void *, data), argsp, argsize); + else + error = copyin(data, argsp, argsize); + nfsmout_if(error); + + if (!args64bit) { args.addrlen = tempargs.addrlen; args.sotype = tempargs.sotype; args.proto = tempargs.proto; @@ -1299,39 +1515,357 @@ nfs_vfs_mount(mount_t mp, vnode_t vp, user_addr_t data, vfs_context_t ctx) args.addr = CAST_USER_ADDR_T(tempargs.addr); args.fh = CAST_USER_ADDR_T(tempargs.fh); args.hostname = CAST_USER_ADDR_T(tempargs.hostname); - if (argsvers >= 4) { + if (args.version >= 4) { args.acregmin = tempargs.acregmin; args.acregmax = tempargs.acregmax; args.acdirmin = tempargs.acdirmin; args.acdirmax = tempargs.acdirmax; } - if (argsvers >= 5) + if (args.version >= 5) args.auth = tempargs.auth; - if (argsvers >= 6) + if (args.version >= 6) args.deadtimeout = tempargs.deadtimeout; } - if 
(args.fhsize < 0 || args.fhsize > NFSX_V3FHMAX) - return (EINVAL); + if ((args.fhsize < 0) || (args.fhsize > NFS4_FHSIZE)) { + error = EINVAL; + goto nfsmout; + } if (args.fhsize > 0) { - error = copyin(args.fh, (caddr_t)nfh, args.fhsize); - if (error) - return (error); + if (inkernel) + bcopy(CAST_DOWN(void *, args.fh), (caddr_t)nfh, args.fhsize); + else + error = copyin(args.fh, (caddr_t)nfh, args.fhsize); + nfsmout_if(error); } - mntfrom = &vfs_statfs(mp)->f_mntfromname[0]; - error = copyinstr(args.hostname, mntfrom, MAXPATHLEN-1, &len); - if (error) - return (error); + if (inkernel) + error = copystr(CAST_DOWN(void *, args.hostname), mntfrom, MAXPATHLEN-1, &len); + else + error = copyinstr(args.hostname, mntfrom, MAXPATHLEN-1, &len); + nfsmout_if(error); bzero(&mntfrom[len], MAXPATHLEN - len); - /* sockargs() call must be after above copyin() calls */ - error = sockargs(&nam, args.addr, args.addrlen, MBUF_TYPE_SONAME); - if (error) + /* find the server-side path being mounted */ + frompath = mntfrom; + if (*frompath == '[') { /* skip IPv6 literal address */ + while (*frompath && (*frompath != ']')) + frompath++; + if (*frompath == ']') + frompath++; + } + while (*frompath && (*frompath != ':')) + frompath++; + endserverp = frompath; + while (*frompath && (*frompath == ':')) + frompath++; + /* count fs location path components */ + p = frompath; + while (*p && (*p == '/')) + p++; + numcomps = 0; + while (*p) { + numcomps++; + while (*p && (*p != '/')) + p++; + while (*p && (*p == '/')) + p++; + } + + /* copy socket address */ + if (inkernel) + bcopy(CAST_DOWN(void *, args.addr), &ss, args.addrlen); + else + error = copyin(args.addr, &ss, args.addrlen); + nfsmout_if(error); + ss.ss_len = args.addrlen; + + /* convert address to universal address string */ + if (ss.ss_family == AF_INET) + sinaddr = &((struct sockaddr_in*)&ss)->sin_addr; + else if (ss.ss_family == AF_INET6) + sinaddr = &((struct sockaddr_in6*)&ss)->sin6_addr; + else + sinaddr = NULL; + if (!sinaddr 
|| (inet_ntop(ss.ss_family, sinaddr, uaddr, sizeof(uaddr)) != uaddr)) { + error = EINVAL; + goto nfsmout; + } + + /* prepare mount flags */ + NFS_BITMAP_ZERO(mflags_mask, NFS_MFLAG_BITMAP_LEN); + NFS_BITMAP_ZERO(mflags, NFS_MFLAG_BITMAP_LEN); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_SOFT); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_INTR); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_RESVPORT); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_NOCONNECT); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_DUMBTIMER); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_CALLUMNT); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_RDIRPLUS); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_NONEGNAMECACHE); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_MUTEJUKEBOX); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_NOQUOTA); + if (args.flags & NFSMNT_SOFT) + NFS_BITMAP_SET(mflags, NFS_MFLAG_SOFT); + if (args.flags & NFSMNT_INT) + NFS_BITMAP_SET(mflags, NFS_MFLAG_INTR); + if (args.flags & NFSMNT_RESVPORT) + NFS_BITMAP_SET(mflags, NFS_MFLAG_RESVPORT); + if (args.flags & NFSMNT_NOCONN) + NFS_BITMAP_SET(mflags, NFS_MFLAG_NOCONNECT); + if (args.flags & NFSMNT_DUMBTIMR) + NFS_BITMAP_SET(mflags, NFS_MFLAG_DUMBTIMER); + if (args.flags & NFSMNT_CALLUMNT) + NFS_BITMAP_SET(mflags, NFS_MFLAG_CALLUMNT); + if (args.flags & NFSMNT_RDIRPLUS) + NFS_BITMAP_SET(mflags, NFS_MFLAG_RDIRPLUS); + if (args.flags & NFSMNT_NONEGNAMECACHE) + NFS_BITMAP_SET(mflags, NFS_MFLAG_NONEGNAMECACHE); + if (args.flags & NFSMNT_MUTEJUKEBOX) + NFS_BITMAP_SET(mflags, NFS_MFLAG_MUTEJUKEBOX); + if (args.flags & NFSMNT_NOQUOTA) + NFS_BITMAP_SET(mflags, NFS_MFLAG_NOQUOTA); + + /* prepare mount attributes */ + NFS_BITMAP_ZERO(mattrs, NFS_MATTR_BITMAP_LEN); + NFS_BITMAP_SET(mattrs, NFS_MATTR_FLAGS); + NFS_BITMAP_SET(mattrs, NFS_MATTR_NFS_VERSION); + NFS_BITMAP_SET(mattrs, NFS_MATTR_SOCKET_TYPE); + NFS_BITMAP_SET(mattrs, NFS_MATTR_NFS_PORT); + NFS_BITMAP_SET(mattrs, NFS_MATTR_FH); + NFS_BITMAP_SET(mattrs, NFS_MATTR_FS_LOCATIONS); + NFS_BITMAP_SET(mattrs, NFS_MATTR_MNTFLAGS); + 
NFS_BITMAP_SET(mattrs, NFS_MATTR_MNTFROM); + if (args.flags & NFSMNT_NFSV4) + nfsvers = 4; + else if (args.flags & NFSMNT_NFSV3) + nfsvers = 3; + else + nfsvers = 2; + if ((args.flags & NFSMNT_RSIZE) && (args.rsize > 0)) + NFS_BITMAP_SET(mattrs, NFS_MATTR_READ_SIZE); + if ((args.flags & NFSMNT_WSIZE) && (args.wsize > 0)) + NFS_BITMAP_SET(mattrs, NFS_MATTR_WRITE_SIZE); + if ((args.flags & NFSMNT_TIMEO) && (args.timeo > 0)) + NFS_BITMAP_SET(mattrs, NFS_MATTR_REQUEST_TIMEOUT); + if ((args.flags & NFSMNT_RETRANS) && (args.retrans > 0)) + NFS_BITMAP_SET(mattrs, NFS_MATTR_SOFT_RETRY_COUNT); + if ((args.flags & NFSMNT_MAXGRPS) && (args.maxgrouplist > 0)) + NFS_BITMAP_SET(mattrs, NFS_MATTR_MAX_GROUP_LIST); + if ((args.flags & NFSMNT_READAHEAD) && (args.readahead > 0)) + NFS_BITMAP_SET(mattrs, NFS_MATTR_READAHEAD); + if ((args.flags & NFSMNT_READDIRSIZE) && (args.readdirsize > 0)) + NFS_BITMAP_SET(mattrs, NFS_MATTR_READDIR_SIZE); + if ((args.flags & NFSMNT_NOLOCKS) || + (args.flags & NFSMNT_LOCALLOCKS)) { + NFS_BITMAP_SET(mattrs, NFS_MATTR_LOCK_MODE); + if (args.flags & NFSMNT_NOLOCKS) + nfslockmode = NFS_LOCK_MODE_DISABLED; + else if (args.flags & NFSMNT_LOCALLOCKS) + nfslockmode = NFS_LOCK_MODE_LOCAL; + else + nfslockmode = NFS_LOCK_MODE_ENABLED; + } + if (args.version >= 4) { + if ((args.flags & NFSMNT_ACREGMIN) && (args.acregmin > 0)) + NFS_BITMAP_SET(mattrs, NFS_MATTR_ATTRCACHE_REG_MIN); + if ((args.flags & NFSMNT_ACREGMAX) && (args.acregmax > 0)) + NFS_BITMAP_SET(mattrs, NFS_MATTR_ATTRCACHE_REG_MAX); + if ((args.flags & NFSMNT_ACDIRMIN) && (args.acdirmin > 0)) + NFS_BITMAP_SET(mattrs, NFS_MATTR_ATTRCACHE_DIR_MIN); + if ((args.flags & NFSMNT_ACDIRMAX) && (args.acdirmax > 0)) + NFS_BITMAP_SET(mattrs, NFS_MATTR_ATTRCACHE_DIR_MAX); + } + if (args.version >= 5) { + if ((args.flags & NFSMNT_SECFLAVOR) || (args.flags & NFSMNT_SECSYSOK)) + NFS_BITMAP_SET(mattrs, NFS_MATTR_SECURITY); + } + if (args.version >= 6) { + if ((args.flags & NFSMNT_DEADTIMEOUT) && (args.deadtimeout > 
0)) + NFS_BITMAP_SET(mattrs, NFS_MATTR_DEAD_TIMEOUT); + } + + /* build xdr buffer */ + xb_init_buffer(&xb, NULL, 0); + xb_add_32(error, &xb, args.version); + argslength_offset = xb_offset(&xb); + xb_add_32(error, &xb, 0); // args length + xb_add_32(error, &xb, NFS_XDRARGS_VERSION_0); + xb_add_bitmap(error, &xb, mattrs, NFS_MATTR_BITMAP_LEN); + attrslength_offset = xb_offset(&xb); + xb_add_32(error, &xb, 0); // attrs length + xb_add_bitmap(error, &xb, mflags_mask, NFS_MFLAG_BITMAP_LEN); /* mask */ + xb_add_bitmap(error, &xb, mflags, NFS_MFLAG_BITMAP_LEN); /* value */ + xb_add_32(error, &xb, nfsvers); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_READ_SIZE)) + xb_add_32(error, &xb, args.rsize); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_WRITE_SIZE)) + xb_add_32(error, &xb, args.wsize); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_READDIR_SIZE)) + xb_add_32(error, &xb, args.readdirsize); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_READAHEAD)) + xb_add_32(error, &xb, args.readahead); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_ATTRCACHE_REG_MIN)) { + xb_add_32(error, &xb, args.acregmin); + xb_add_32(error, &xb, 0); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_ATTRCACHE_REG_MAX)) { + xb_add_32(error, &xb, args.acregmax); + xb_add_32(error, &xb, 0); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_ATTRCACHE_DIR_MIN)) { + xb_add_32(error, &xb, args.acdirmin); + xb_add_32(error, &xb, 0); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_ATTRCACHE_DIR_MAX)) { + xb_add_32(error, &xb, args.acdirmax); + xb_add_32(error, &xb, 0); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_LOCK_MODE)) + xb_add_32(error, &xb, nfslockmode); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SECURITY)) { + uint32_t flavors[2], i=0; + if (args.flags & NFSMNT_SECFLAVOR) + flavors[i++] = args.auth; + if ((args.flags & NFSMNT_SECSYSOK) && ((i == 0) || (flavors[0] != RPCAUTH_SYS))) + flavors[i++] = RPCAUTH_SYS; + xb_add_word_array(error, &xb, flavors, i); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_MAX_GROUP_LIST)) + xb_add_32(error, 
&xb, args.maxgrouplist); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SOCKET_TYPE)) + xb_add_string(error, &xb, ((args.sotype == SOCK_DGRAM) ? "udp" : "tcp"), 3); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_NFS_PORT)) + xb_add_32(error, &xb, ((ss.ss_family == AF_INET) ? + ntohs(((struct sockaddr_in*)&ss)->sin_port) : + ntohs(((struct sockaddr_in6*)&ss)->sin6_port))); + /* NFS_MATTR_MOUNT_PORT (not available in old args) */ + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_REQUEST_TIMEOUT)) { + /* convert from .1s increments to time */ + xb_add_32(error, &xb, args.timeo/10); + xb_add_32(error, &xb, (args.timeo%10)*100000000); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SOFT_RETRY_COUNT)) + xb_add_32(error, &xb, args.retrans); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_DEAD_TIMEOUT)) { + xb_add_32(error, &xb, args.deadtimeout); + xb_add_32(error, &xb, 0); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_FH)) + xb_add_fh(error, &xb, &nfh[0], args.fhsize); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_FS_LOCATIONS)) { + xb_add_32(error, &xb, 1); /* fs location count */ + xb_add_32(error, &xb, 1); /* server count */ + xb_add_string(error, &xb, mntfrom, (endserverp - mntfrom)); /* server name */ + xb_add_32(error, &xb, 1); /* address count */ + xb_add_string(error, &xb, uaddr, strlen(uaddr)); /* address */ + xb_add_32(error, &xb, 0); /* empty server info */ + xb_add_32(error, &xb, numcomps); /* pathname component count */ + nfsmout_if(error); + p = frompath; + while (*p && (*p == '/')) + p++; + while (*p) { + cp = p; + while (*p && (*p != '/')) + p++; + xb_add_string(error, &xb, cp, (p - cp)); /* component */ + nfsmout_if(error); + while (*p && (*p == '/')) + p++; + } + xb_add_32(error, &xb, 0); /* empty fsl info */ + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_MNTFLAGS)) + xb_add_32(error, &xb, (vfs_flags(mp) & MNT_VISFLAGMASK)); /* VFS MNT_* flags */ + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_MNTFROM)) + xb_add_string(error, &xb, mntfrom, strlen(mntfrom)); /* fixed f_mntfromname */ + 
xb_build_done(error, &xb); + + /* update opaque counts */ + end_offset = xb_offset(&xb); + error = xb_seek(&xb, argslength_offset); + xb_add_32(error, &xb, end_offset - argslength_offset + XDRWORD/*version*/); + nfsmout_if(error); + error = xb_seek(&xb, attrslength_offset); + xb_add_32(error, &xb, end_offset - attrslength_offset - XDRWORD/*don't include length field*/); + + if (!error) { + /* grab the assembled buffer */ + *xdrbufp = xb_buffer_base(&xb); + xb.xb_flags &= ~XB_CLEANUP; + } +nfsmout: + xb_cleanup(&xb); + FREE_ZONE(mntfrom, MAXPATHLEN, M_NAMEI); + return (error); +} + +/* + * VFS Operations. + * + * mount system call + */ +int +nfs_vfs_mount(mount_t mp, vnode_t vp, user_addr_t data, vfs_context_t ctx) +{ + int error = 0, inkernel = vfs_iskernelmount(mp); + uint32_t argsversion, argslength; + char *xdrbuf = NULL; + + /* read in version */ + if (inkernel) + bcopy(CAST_DOWN(void *, data), &argsversion, sizeof(argsversion)); + else if ((error = copyin(data, &argsversion, sizeof(argsversion)))) return (error); - args.fh = CAST_USER_ADDR_T(&nfh[0]); - error = mountnfs(&args, mp, nam, ctx, &vp); + /* If we have XDR args, then all values in the buffer are in network order */ + if (argsversion == htonl(NFS_ARGSVERSION_XDR)) + argsversion = NFS_ARGSVERSION_XDR; + + switch (argsversion) { + case 3: + case 4: + case 5: + case 6: + /* convert old-style args to xdr */ + error = nfs_convert_old_nfs_args(mp, data, ctx, argsversion, inkernel, &xdrbuf); + break; + case NFS_ARGSVERSION_XDR: + /* copy in xdr buffer */ + if (inkernel) + bcopy(CAST_DOWN(void *, (data + XDRWORD)), &argslength, XDRWORD); + else + error = copyin((data + XDRWORD), &argslength, XDRWORD); + if (error) + break; + argslength = ntohl(argslength); + /* put a reasonable limit on the size of the XDR args */ + if (argslength > 16*1024) { + error = E2BIG; + break; + } + /* allocate xdr buffer */ + xdrbuf = xb_malloc(xdr_rndup(argslength)); + if (!xdrbuf) { + error = ENOMEM; + break; + } + if (inkernel) + 
bcopy(CAST_DOWN(void *, data), xdrbuf, argslength); + else + error = copyin(data, xdrbuf, argslength); + break; + default: + error = EPROGMISMATCH; + } + + if (error) { + if (xdrbuf) + xb_free(xdrbuf); + return (error); + } + error = mountnfs(xdrbuf, mp, ctx, &vp); return (error); } @@ -1339,32 +1873,33 @@ nfs_vfs_mount(mount_t mp, vnode_t vp, user_addr_t data, vfs_context_t ctx) * Common code for mount and mountroot */ +/* Set up an NFSv2/v3 mount */ int nfs3_mount( struct nfsmount *nmp, vfs_context_t ctx, - struct user_nfs_args *argp, nfsnode_t *npp) { int error = 0; struct nfs_vattr nvattr; u_int64_t xid; - u_char *fhp; *npp = NULL; + if (!nmp->nm_fh) + return (EINVAL); + /* * Get file attributes for the mountpoint. These are needed * in order to properly create the root vnode. */ - fhp = CAST_DOWN(u_char *, argp->fh); - error = nfs3_getattr_rpc(NULL, nmp->nm_mountp, fhp, argp->fhsize, + error = nfs3_getattr_rpc(NULL, nmp->nm_mountp, nmp->nm_fh->fh_data, nmp->nm_fh->fh_len, 0, ctx, &nvattr, &xid); if (error) goto out; - error = nfs_nget(nmp->nm_mountp, NULL, NULL, fhp, argp->fhsize, - &nvattr, &xid, NG_MARKROOT, npp); + error = nfs_nget(nmp->nm_mountp, NULL, NULL, nmp->nm_fh->fh_data, nmp->nm_fh->fh_len, + &nvattr, &xid, RPCAUTH_UNKNOWN, NG_MARKROOT, npp); if (*npp) nfs_node_unlock(*npp); if (error) @@ -1403,325 +1938,1150 @@ nfs3_mount( return (error); } +/* + * Update an NFSv4 mount path with the contents of the symlink. + * + * Read the link for the given file handle. + * Insert the link's components into the path. 
+ */ int -nfs4_mount( - struct nfsmount *nmp, - vfs_context_t ctx, - __unused struct user_nfs_args *argp, - nfsnode_t *npp) +nfs4_mount_update_path_with_symlink(struct nfsmount *nmp, struct nfs_fs_path *nfsp, uint32_t curcomp, fhandle_t *dirfhp, int *depthp, fhandle_t *fhp, vfs_context_t ctx) { - struct nfsm_chain nmreq, nmrep; - int error = 0, numops, status, interval; - char *path = &vfs_statfs(nmp->nm_mountp)->f_mntfromname[0]; - char *name, *nextname; - fhandle_t fh; - struct nfs_vattr nvattr; + int error = 0, status, numops; + uint32_t len = 0, comp, newcomp, linkcompcount; u_int64_t xid; + struct nfsm_chain nmreq, nmrep; + struct nfsreq rq, *req = &rq; + struct nfsreq_secinfo_args si; + char *link = NULL, *p, *q, ch; + struct nfs_fs_path nfsp2; + + bzero(&nfsp2, sizeof(nfsp2)); + if (dirfhp->fh_len) + NFSREQ_SECINFO_SET(&si, NULL, dirfhp->fh_data, dirfhp->fh_len, nfsp->np_components[curcomp], 0); + else + NFSREQ_SECINFO_SET(&si, NULL, NULL, 0, nfsp->np_components[curcomp], 0); + nfsm_chain_null(&nmreq); + nfsm_chain_null(&nmrep); - *npp = NULL; - fh.fh_len = 0; - TAILQ_INIT(&nmp->nm_open_owners); - TAILQ_INIT(&nmp->nm_recallq); - nmp->nm_stategenid = 1; + MALLOC_ZONE(link, char *, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (!link) + error = ENOMEM; - /* look up path to get fh and attrs for mount point root */ - numops = 2; // PUTROOTFH + LOOKUP* + GETATTR - while (*path && (*path != '/')) - path++; - name = path; - while (*name) { - while (*name && (*name == '/')) - name++; - if (!*name) - break; - nextname = name; - while (*nextname && (*nextname != '/')) - nextname++; - numops++; - name = nextname; - } - nfsm_chain_build_alloc_init(error, &nmreq, 25 * NFSX_UNSIGNED); - nfsm_chain_add_compound_header(error, &nmreq, "mount", numops); + // PUTFH, READLINK + numops = 2; + nfsm_chain_build_alloc_init(error, &nmreq, 12 * NFSX_UNSIGNED); + nfsm_chain_add_compound_header(error, &nmreq, "readlink", numops); numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTROOTFH); - 
// (LOOKUP)* - name = path; - while (*name) { - while (*name && (*name == '/')) - name++; - if (!*name) - break; - nextname = name; - while (*nextname && (*nextname != '/')) - nextname++; - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_LOOKUP); - nfsm_chain_add_string(error, &nmreq, name, nextname - name); - name = nextname; - } + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, NFS_VER4, fhp->fh_data, fhp->fh_len); numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - NFS4_DEFAULT_ATTRIBUTES(nmp->nm_fsattr.nfsa_supp_attr); - NFS_BITMAP_SET(nmp->nm_fsattr.nfsa_supp_attr, NFS_FATTR_FILEHANDLE); - nfsm_chain_add_bitmap(error, &nmreq, nmp->nm_fsattr.nfsa_supp_attr, NFS_ATTR_BITMAP_LEN); + nfsm_chain_add_32(error, &nmreq, NFS_OP_READLINK); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request(NULL, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, ctx, &nmrep, &xid, &status); + + error = nfs_request_async(NULL, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, + vfs_context_thread(ctx), vfs_context_ucred(ctx), &si, 0, NULL, &req); + if (!error) + error = nfs_request_async_finish(req, &nmrep, &xid, &status); + nfsm_chain_skip_tag(error, &nmrep); nfsm_chain_get_32(error, &nmrep, numops); - nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTROOTFH); - name = path; - while (*name) { - while (*name && (*name == '/')) - name++; - if (!*name) - break; - nextname = name; - while (*nextname && (*nextname != '/')) - nextname++; - nfsm_chain_op_check(error, &nmrep, NFS_OP_LOOKUP); - name = nextname; - } - nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsmout_if(error); - NFS_CLEAR_ATTRIBUTES(nmp->nm_fsattr.nfsa_bitmap); - NFS_CLEAR_ATTRIBUTES(&nvattr.nva_bitmap); - error = nfs4_parsefattr(&nmrep, &nmp->nm_fsattr, &nvattr, &fh, NULL); - if (!error && !NFS_BITMAP_ISSET(&nvattr.nva_bitmap, NFS_FATTR_FILEHANDLE)) { - printf("nfs: mount didn't return filehandle?\n"); - error = 
EBADRPC; - } + nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); + nfsm_chain_op_check(error, &nmrep, NFS_OP_READLINK); + nfsm_chain_get_32(error, &nmrep, len); nfsmout_if(error); - - error = nfs_nget(nmp->nm_mountp, NULL, NULL, fh.fh_data, fh.fh_len, &nvattr, &xid, NG_MARKROOT, npp); + if (len == 0) + error = ENOENT; + else if (len >= MAXPATHLEN) + len = MAXPATHLEN - 1; + nfsm_chain_get_opaque(error, &nmrep, len, link); nfsmout_if(error); + /* make sure link string is terminated properly */ + link[len] = '\0'; + + /* count the number of components in link */ + p = link; + while (*p && (*p == '/')) + p++; + linkcompcount = 0; + while (*p) { + linkcompcount++; + while (*p && (*p != '/')) + p++; + while (*p && (*p == '/')) + p++; + } - /* adjust I/O sizes to server limits */ - if (NFS_BITMAP_ISSET(nmp->nm_fsattr.nfsa_bitmap, NFS_FATTR_MAXREAD)) { - if (nmp->nm_fsattr.nfsa_maxread < (uint64_t)nmp->nm_rsize) { - nmp->nm_rsize = nmp->nm_fsattr.nfsa_maxread & ~(NFS_FABLKSIZE - 1); - if (nmp->nm_rsize == 0) - nmp->nm_rsize = nmp->nm_fsattr.nfsa_maxread; + /* free up used components */ + for (comp=0; comp <= curcomp; comp++) { + if (nfsp->np_components[comp]) { + FREE(nfsp->np_components[comp], M_TEMP); + nfsp->np_components[comp] = NULL; } } - if (NFS_BITMAP_ISSET(nmp->nm_fsattr.nfsa_bitmap, NFS_FATTR_MAXWRITE)) { - if (nmp->nm_fsattr.nfsa_maxwrite < (uint64_t)nmp->nm_wsize) { - nmp->nm_wsize = nmp->nm_fsattr.nfsa_maxwrite & ~(NFS_FABLKSIZE - 1); - if (nmp->nm_wsize == 0) - nmp->nm_wsize = nmp->nm_fsattr.nfsa_maxwrite; + + /* set up new path */ + nfsp2.np_compcount = nfsp->np_compcount - curcomp - 1 + linkcompcount; + MALLOC(nfsp2.np_components, char **, nfsp2.np_compcount*sizeof(char*), M_TEMP, M_WAITOK|M_ZERO); + if (!nfsp2.np_components) { + error = ENOMEM; + goto nfsmout; + } + + /* add link components */ + p = link; + while (*p && (*p == '/')) + p++; + for (newcomp=0; newcomp < linkcompcount; newcomp++) { + /* find end of component */ + q = p; + while (*q && (*q != 
'/')) + q++; + MALLOC(nfsp2.np_components[newcomp], char *, q-p+1, M_TEMP, M_WAITOK|M_ZERO); + if (!nfsp2.np_components[newcomp]) { + error = ENOMEM; + break; } + ch = *q; + *q = '\0'; + strlcpy(nfsp2.np_components[newcomp], p, q-p+1); + *q = ch; + p = q; + while (*p && (*p == '/')) + p++; } + nfsmout_if(error); - /* set up lease renew timer */ - nmp->nm_renew_timer = thread_call_allocate(nfs4_renew_timer, nmp); - interval = nmp->nm_fsattr.nfsa_lease / 2; - if (interval < 1) - interval = 1; - nfs_interval_timer_start(nmp->nm_renew_timer, interval * 1000); + /* add remaining components */ + for(comp = curcomp + 1; comp < nfsp->np_compcount; comp++,newcomp++) { + nfsp2.np_components[newcomp] = nfsp->np_components[comp]; + nfsp->np_components[comp] = NULL; + } + + /* move new path into place */ + FREE(nfsp->np_components, M_TEMP); + nfsp->np_components = nfsp2.np_components; + nfsp->np_compcount = nfsp2.np_compcount; + nfsp2.np_components = NULL; + /* for absolute link, let the caller know that the next dirfh is root */ + if (link[0] == '/') { + dirfhp->fh_len = 0; + *depthp = 0; + } nfsmout: - if (*npp) - nfs_node_unlock(*npp); + if (link) + FREE_ZONE(link, MAXPATHLEN, M_NAMEI); + if (nfsp2.np_components) { + for (comp=0; comp < nfsp2.np_compcount; comp++) + if (nfsp2.np_components[comp]) + FREE(nfsp2.np_components[comp], M_TEMP); + FREE(nfsp2.np_components, M_TEMP); + } + nfsm_chain_cleanup(&nmreq); + nfsm_chain_cleanup(&nmrep); return (error); } +/* Set up an NFSv4 mount */ int -mountnfs( - struct user_nfs_args *argp, - mount_t mp, - mbuf_t nam, +nfs4_mount( + struct nfsmount *nmp, vfs_context_t ctx, - vnode_t *vpp) + nfsnode_t *npp) { - struct nfsmount *nmp; - nfsnode_t np; - int error; - uint32_t maxio, iosize; - struct vfsstatfs *sbp; - struct timespec ts = { 1, 0 }; + struct nfsm_chain nmreq, nmrep; + int error = 0, numops, status, interval, isdotdot, loopcnt = 0, depth = 0; + struct nfs_fs_path fspath, *nfsp, fspath2; + uint32_t bitmap[NFS_ATTR_BITMAP_LEN], 
comp, comp2; + fhandle_t fh, dirfh; + struct nfs_vattr nvattr; + u_int64_t xid; + struct nfsreq rq, *req = &rq; + struct nfsreq_secinfo_args si; + struct nfs_sec sec; + struct nfs_fs_locations nfsls; + + *npp = NULL; + fh.fh_len = dirfh.fh_len = 0; + TAILQ_INIT(&nmp->nm_open_owners); + TAILQ_INIT(&nmp->nm_delegations); + TAILQ_INIT(&nmp->nm_dreturnq); + nmp->nm_stategenid = 1; + NVATTR_INIT(&nvattr); + bzero(&nfsls, sizeof(nfsls)); + nfsm_chain_null(&nmreq); + nfsm_chain_null(&nmrep); /* - * Silently clear NFSMNT_NOCONN if it's a TCP mount, it makes - * no sense in that context. + * If no security flavors were specified we'll want to default to the server's + * preferred flavor. For NFSv4.0 we need a file handle and name to get that via + * SECINFO, so we'll do that on the last component of the server path we are + * mounting. If we are mounting the server's root, we'll need to defer the + * SECINFO call to the first successful LOOKUP request. */ - if (argp->sotype == SOCK_STREAM) - argp->flags &= ~NFSMNT_NOCONN; - - if (vfs_flags(mp) & MNT_UPDATE) { - nmp = VFSTONFS(mp); - /* update paths, file handles, etc, here XXX */ - mbuf_freem(nam); - return (0); - } else { - MALLOC_ZONE(nmp, struct nfsmount *, - sizeof (struct nfsmount), M_NFSMNT, M_WAITOK); - if (!nmp) { - mbuf_freem(nam); - return (ENOMEM); + if (!nmp->nm_sec.count) + nmp->nm_state |= NFSSTA_NEEDSECINFO; + + /* make a copy of the current location's path */ + nfsp = &nmp->nm_locations.nl_locations[nmp->nm_locations.nl_current.nli_loc]->nl_path; + bzero(&fspath, sizeof(fspath)); + fspath.np_compcount = nfsp->np_compcount; + if (fspath.np_compcount > 0) { + MALLOC(fspath.np_components, char **, fspath.np_compcount*sizeof(char*), M_TEMP, M_WAITOK|M_ZERO); + if (!fspath.np_components) { + error = ENOMEM; + goto nfsmout; } - bzero((caddr_t)nmp, sizeof (struct nfsmount)); - lck_mtx_init(&nmp->nm_lock, nfs_mount_grp, LCK_ATTR_NULL); - TAILQ_INIT(&nmp->nm_resendq); - TAILQ_INIT(&nmp->nm_iodq); - 
TAILQ_INIT(&nmp->nm_gsscl); - vfs_setfsprivate(mp, nmp); - - nfs_nhinit_finish(); + for (comp=0; comp < nfsp->np_compcount; comp++) { + int slen = strlen(nfsp->np_components[comp]); + MALLOC(fspath.np_components[comp], char *, slen+1, M_TEMP, M_WAITOK|M_ZERO); + if (!fspath.np_components[comp]) { + error = ENOMEM; + break; + } + strlcpy(fspath.np_components[comp], nfsp->np_components[comp], slen+1); + } + if (error) + goto nfsmout; } - lck_mtx_lock(&nmp->nm_lock); - /* setup defaults */ - nmp->nm_vers = NFS_VER2; - nmp->nm_timeo = NFS_TIMEO; - nmp->nm_retry = NFS_RETRANS; - if (argp->sotype == SOCK_DGRAM) { - nmp->nm_wsize = NFS_DGRAM_WSIZE; - nmp->nm_rsize = NFS_DGRAM_RSIZE; - } else { - nmp->nm_wsize = NFS_WSIZE; - nmp->nm_rsize = NFS_RSIZE; + /* for mirror mounts, we can just use the file handle passed in */ + if (nmp->nm_fh) { + dirfh.fh_len = nmp->nm_fh->fh_len; + bcopy(nmp->nm_fh->fh_data, dirfh.fh_data, dirfh.fh_len); + NFSREQ_SECINFO_SET(&si, NULL, dirfh.fh_data, dirfh.fh_len, NULL, 0); + goto gotfh; } - nmp->nm_readdirsize = NFS_READDIRSIZE; - nmp->nm_numgrps = NFS_MAXGRPS; - nmp->nm_readahead = NFS_DEFRAHEAD; - nmp->nm_tprintf_delay = nfs_tprintf_delay; - if (nmp->nm_tprintf_delay < 0) - nmp->nm_tprintf_delay = 0; - nmp->nm_tprintf_initial_delay = nfs_tprintf_initial_delay; - if (nmp->nm_tprintf_initial_delay < 0) - nmp->nm_tprintf_initial_delay = 0; - nmp->nm_acregmin = NFS_MINATTRTIMO; - nmp->nm_acregmax = NFS_MAXATTRTIMO; - nmp->nm_acdirmin = NFS_MINDIRATTRTIMO; - nmp->nm_acdirmax = NFS_MAXDIRATTRTIMO; - nmp->nm_auth = RPCAUTH_SYS; - nmp->nm_deadtimeout = 0; - - vfs_getnewfsid(mp); - nmp->nm_mountp = mp; - vfs_setauthopaque(mp); - nmp->nm_flag = argp->flags; - nmp->nm_nam = nam; - - if (argp->flags & NFSMNT_NFSV4) { - nmp->nm_vers = NFS_VER4; - /* NFSv4 is only allowed over TCP. 
*/ - if (argp->sotype != SOCK_STREAM) { - error = EINVAL; - goto bad; + + /* otherwise, we need to get the fh for the directory we are mounting */ + + /* if no components, just get root */ + if (fspath.np_compcount == 0) { +nocomponents: + // PUTROOTFH + GETATTR(FH) + NFSREQ_SECINFO_SET(&si, NULL, NULL, 0, NULL, 0); + numops = 2; + nfsm_chain_build_alloc_init(error, &nmreq, 9 * NFSX_UNSIGNED); + nfsm_chain_add_compound_header(error, &nmreq, "mount", numops); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTROOTFH); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); + NFS_CLEAR_ATTRIBUTES(bitmap); + NFS4_DEFAULT_ATTRIBUTES(bitmap); + NFS_BITMAP_SET(bitmap, NFS_FATTR_FILEHANDLE); + nfsm_chain_add_bitmap(error, &nmreq, bitmap, NFS_ATTR_BITMAP_LEN); + nfsm_chain_build_done(error, &nmreq); + nfsm_assert(error, (numops == 0), EPROTO); + nfsmout_if(error); + error = nfs_request_async(NULL, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, + vfs_context_thread(ctx), vfs_context_ucred(ctx), &si, 0, NULL, &req); + if (!error) + error = nfs_request_async_finish(req, &nmrep, &xid, &status); + nfsm_chain_skip_tag(error, &nmrep); + nfsm_chain_get_32(error, &nmrep, numops); + nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTROOTFH); + nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); + nfsmout_if(error); + NFS_CLEAR_ATTRIBUTES(nmp->nm_fsattr.nfsa_bitmap); + error = nfs4_parsefattr(&nmrep, &nmp->nm_fsattr, &nvattr, &dirfh, NULL, NULL); + if (!error && !NFS_BITMAP_ISSET(&nvattr.nva_bitmap, NFS_FATTR_FILEHANDLE)) { + printf("nfs: mount didn't return filehandle?\n"); + error = EBADRPC; } - } else if (argp->flags & NFSMNT_NFSV3) - nmp->nm_vers = NFS_VER3; + nfsmout_if(error); + nfsm_chain_cleanup(&nmrep); + nfsm_chain_null(&nmreq); + NVATTR_CLEANUP(&nvattr); + goto gotfh; + } - if (nmp->nm_vers == NFS_VER2) - nmp->nm_flag &= ~NFSMNT_RDIRPLUS; + /* look up each path component */ + for (comp=0; comp < fspath.np_compcount; ) { + isdotdot = 0; + if 
(fspath.np_components[comp][0] == '.') { + if (fspath.np_components[comp][1] == '\0') { + /* skip "." */ + comp++; + continue; + } + /* treat ".." specially */ + if ((fspath.np_components[comp][1] == '.') && + (fspath.np_components[comp][2] == '\0')) + isdotdot = 1; + if (isdotdot && (dirfh.fh_len == 0)) { + /* ".." in root directory is same as "." */ + comp++; + continue; + } + } + // PUT(ROOT)FH + LOOKUP(P) + GETFH + GETATTR + if (dirfh.fh_len == 0) + NFSREQ_SECINFO_SET(&si, NULL, NULL, 0, isdotdot ? NULL : fspath.np_components[comp], 0); + else + NFSREQ_SECINFO_SET(&si, NULL, dirfh.fh_data, dirfh.fh_len, isdotdot ? NULL : fspath.np_components[comp], 0); + numops = 4; + nfsm_chain_build_alloc_init(error, &nmreq, 18 * NFSX_UNSIGNED); + nfsm_chain_add_compound_header(error, &nmreq, "mount", numops); + numops--; + if (dirfh.fh_len) { + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, NFS_VER4, dirfh.fh_data, dirfh.fh_len); + } else { + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTROOTFH); + } + numops--; + if (isdotdot) { + nfsm_chain_add_32(error, &nmreq, NFS_OP_LOOKUPP); + } else { + nfsm_chain_add_32(error, &nmreq, NFS_OP_LOOKUP); + nfsm_chain_add_name(error, &nmreq, + fspath.np_components[comp], strlen(fspath.np_components[comp]), nmp); + } + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETFH); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); + NFS_CLEAR_ATTRIBUTES(bitmap); + NFS4_DEFAULT_ATTRIBUTES(bitmap); + /* if no namedattr support or component is ".zfs", clear NFS_FATTR_NAMED_ATTR */ + if (NMFLAG(nmp, NONAMEDATTR) || !strcmp(fspath.np_components[comp], ".zfs")) + NFS_BITMAP_CLR(bitmap, NFS_FATTR_NAMED_ATTR); + nfsm_chain_add_bitmap(error, &nmreq, bitmap, NFS_ATTR_BITMAP_LEN); + nfsm_chain_build_done(error, &nmreq); + nfsm_assert(error, (numops == 0), EPROTO); + nfsmout_if(error); + error = nfs_request_async(NULL, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, + vfs_context_thread(ctx), 
vfs_context_ucred(ctx), &si, 0, NULL, &req); + if (!error) + error = nfs_request_async_finish(req, &nmrep, &xid, &status); + nfsm_chain_skip_tag(error, &nmrep); + nfsm_chain_get_32(error, &nmrep, numops); + nfsm_chain_op_check(error, &nmrep, dirfh.fh_len ? NFS_OP_PUTFH : NFS_OP_PUTROOTFH); + nfsm_chain_op_check(error, &nmrep, isdotdot ? NFS_OP_LOOKUPP : NFS_OP_LOOKUP); + nfsmout_if(error); + nfsm_chain_op_check(error, &nmrep, NFS_OP_GETFH); + nfsm_chain_get_32(error, &nmrep, fh.fh_len); + nfsm_chain_get_opaque(error, &nmrep, fh.fh_len, fh.fh_data); + nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); + if (!error) { + NFS_CLEAR_ATTRIBUTES(nmp->nm_fsattr.nfsa_bitmap); + error = nfs4_parsefattr(&nmrep, &nmp->nm_fsattr, &nvattr, NULL, NULL, &nfsls); + } + nfsm_chain_cleanup(&nmrep); + nfsm_chain_null(&nmreq); + if (error) { + /* LOOKUP succeeded but GETATTR failed? This could be a referral. */ + /* Try the lookup again with a getattr for fs_locations. */ + nfs_fs_locations_cleanup(&nfsls); + error = nfs4_get_fs_locations(nmp, NULL, dirfh.fh_data, dirfh.fh_len, fspath.np_components[comp], ctx, &nfsls); + if (!error && (nfsls.nl_numlocs < 1)) + error = ENOENT; + nfsmout_if(error); + if (++loopcnt > MAXSYMLINKS) { + /* too many symlink/referral redirections */ + error = ELOOP; + goto nfsmout; + } + /* tear down the current connection */ + nfs_disconnect(nmp); + /* replace fs locations */ + nfs_fs_locations_cleanup(&nmp->nm_locations); + nmp->nm_locations = nfsls; + bzero(&nfsls, sizeof(nfsls)); + /* initiate a connection using the new fs locations */ + error = nfs_mount_connect(nmp); + if (!error && !(nmp->nm_locations.nl_current.nli_flags & NLI_VALID)) + error = EIO; + nfsmout_if(error); + /* add new server's remote path to beginning of our path and continue */ + nfsp = &nmp->nm_locations.nl_locations[nmp->nm_locations.nl_current.nli_loc]->nl_path; + bzero(&fspath2, sizeof(fspath2)); + fspath2.np_compcount = (fspath.np_compcount - comp - 1) + nfsp->np_compcount; + if 
(fspath2.np_compcount > 0) { + MALLOC(fspath2.np_components, char **, fspath2.np_compcount*sizeof(char*), M_TEMP, M_WAITOK|M_ZERO); + if (!fspath2.np_components) { + error = ENOMEM; + goto nfsmout; + } + for (comp2=0; comp2 < nfsp->np_compcount; comp2++) { + int slen = strlen(nfsp->np_components[comp2]); + MALLOC(fspath2.np_components[comp2], char *, slen+1, M_TEMP, M_WAITOK|M_ZERO); + if (!fspath2.np_components[comp2]) { + /* clean up fspath2, then error out */ + while (comp2 > 0) { + comp2--; + FREE(fspath2.np_components[comp2], M_TEMP); + } + FREE(fspath2.np_components, M_TEMP); + error = ENOMEM; + goto nfsmout; + } + strlcpy(fspath2.np_components[comp2], nfsp->np_components[comp2], slen+1); + } + if ((fspath.np_compcount - comp - 1) > 0) + bcopy(&fspath.np_components[comp+1], &fspath2.np_components[nfsp->np_compcount], (fspath.np_compcount - comp - 1)*sizeof(char*)); + /* free up unused parts of old path (prior components and component array) */ + do { + FREE(fspath.np_components[comp], M_TEMP); + } while (comp-- > 0); + FREE(fspath.np_components, M_TEMP); + /* put new path in place */ + fspath = fspath2; + } + /* reset dirfh and component index */ + dirfh.fh_len = 0; + comp = 0; + NVATTR_CLEANUP(&nvattr); + if (fspath.np_compcount == 0) + goto nocomponents; + continue; + } + nfsmout_if(error); + /* if file handle is for a symlink, then update the path with the symlink contents */ + if (NFS_BITMAP_ISSET(&nvattr.nva_bitmap, NFS_FATTR_TYPE) && (nvattr.nva_type == VLNK)) { + if (++loopcnt > MAXSYMLINKS) + error = ELOOP; + else + error = nfs4_mount_update_path_with_symlink(nmp, &fspath, comp, &dirfh, &depth, &fh, ctx); + nfsmout_if(error); + /* directory file handle is either left the same or reset to root (if link was absolute) */ + /* path traversal starts at beginning of the path again */ + comp = 0; + NVATTR_CLEANUP(&nvattr); + nfs_fs_locations_cleanup(&nfsls); + continue; + } + NVATTR_CLEANUP(&nvattr); + nfs_fs_locations_cleanup(&nfsls); + /* not a symlink... 
*/ + if ((nmp->nm_state & NFSSTA_NEEDSECINFO) && (comp == (fspath.np_compcount-1)) && !isdotdot) { + /* need to get SECINFO for the directory being mounted */ + if (dirfh.fh_len == 0) + NFSREQ_SECINFO_SET(&si, NULL, NULL, 0, isdotdot ? NULL : fspath.np_components[comp], 0); + else + NFSREQ_SECINFO_SET(&si, NULL, dirfh.fh_data, dirfh.fh_len, isdotdot ? NULL : fspath.np_components[comp], 0); + sec.count = NX_MAX_SEC_FLAVORS; + error = nfs4_secinfo_rpc(nmp, &si, vfs_context_ucred(ctx), sec.flavors, &sec.count); + /* [sigh] some implementations return "illegal" error for unsupported ops */ + if (error == NFSERR_OP_ILLEGAL) + error = 0; + nfsmout_if(error); + /* set our default security flavor to the first in the list */ + if (sec.count) + nmp->nm_auth = sec.flavors[0]; + nmp->nm_state &= ~NFSSTA_NEEDSECINFO; + } + /* advance directory file handle, component index, & update depth */ + dirfh = fh; + comp++; + if (!isdotdot) /* going down the hierarchy */ + depth++; + else if (--depth <= 0) /* going up the hierarchy */ + dirfh.fh_len = 0; /* clear dirfh when we hit root */ + } - if ((argp->flags & NFSMNT_TIMEO) && argp->timeo > 0) { - nmp->nm_timeo = (argp->timeo * NFS_HZ + 5) / 10; - if (nmp->nm_timeo < NFS_MINTIMEO) - nmp->nm_timeo = NFS_MINTIMEO; - else if (nmp->nm_timeo > NFS_MAXTIMEO) - nmp->nm_timeo = NFS_MAXTIMEO; +gotfh: + /* get attrs for mount point root */ + numops = NMFLAG(nmp, NONAMEDATTR) ? 
2 : 3; // PUTFH + GETATTR + OPENATTR + nfsm_chain_build_alloc_init(error, &nmreq, 25 * NFSX_UNSIGNED); + nfsm_chain_add_compound_header(error, &nmreq, "mount", numops); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, NFS_VER4, dirfh.fh_data, dirfh.fh_len); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); + NFS_CLEAR_ATTRIBUTES(bitmap); + NFS4_DEFAULT_ATTRIBUTES(bitmap); + /* if no namedattr support or last component is ".zfs", clear NFS_FATTR_NAMED_ATTR */ + if (NMFLAG(nmp, NONAMEDATTR) || ((fspath.np_compcount > 0) && !strcmp(fspath.np_components[fspath.np_compcount-1], ".zfs"))) + NFS_BITMAP_CLR(bitmap, NFS_FATTR_NAMED_ATTR); + nfsm_chain_add_bitmap(error, &nmreq, bitmap, NFS_ATTR_BITMAP_LEN); + if (!NMFLAG(nmp, NONAMEDATTR)) { + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_OPENATTR); + nfsm_chain_add_32(error, &nmreq, 0); + } + nfsm_chain_build_done(error, &nmreq); + nfsm_assert(error, (numops == 0), EPROTO); + nfsmout_if(error); + error = nfs_request_async(NULL, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, + vfs_context_thread(ctx), vfs_context_ucred(ctx), &si, 0, NULL, &req); + if (!error) + error = nfs_request_async_finish(req, &nmrep, &xid, &status); + nfsm_chain_skip_tag(error, &nmrep); + nfsm_chain_get_32(error, &nmrep, numops); + nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); + nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); + nfsmout_if(error); + NFS_CLEAR_ATTRIBUTES(nmp->nm_fsattr.nfsa_bitmap); + error = nfs4_parsefattr(&nmrep, &nmp->nm_fsattr, &nvattr, NULL, NULL, NULL); + nfsmout_if(error); + if (!NMFLAG(nmp, NONAMEDATTR)) { + nfsm_chain_op_check(error, &nmrep, NFS_OP_OPENATTR); + if (error == ENOENT) + error = 0; + /* [sigh] some implementations return "illegal" error for unsupported ops */ + if (error || !NFS_BITMAP_ISSET(nmp->nm_fsattr.nfsa_supp_attr, NFS_FATTR_NAMED_ATTR)) { + nmp->nm_fsattr.nfsa_flags &= ~NFS_FSFLAG_NAMED_ATTR; + } else { + nmp->nm_fsattr.nfsa_flags |= 
NFS_FSFLAG_NAMED_ATTR; + } + } else { + nmp->nm_fsattr.nfsa_flags &= ~NFS_FSFLAG_NAMED_ATTR; + } + if (NMFLAG(nmp, NOACL)) /* make sure ACL support is turned off */ + nmp->nm_fsattr.nfsa_flags &= ~NFS_FSFLAG_ACL; + if (NMFLAG(nmp, ACLONLY) && !(nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_ACL)) + NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_ACLONLY); + if (NFS_BITMAP_ISSET(nmp->nm_fsattr.nfsa_supp_attr, NFS_FATTR_FH_EXPIRE_TYPE)) { + uint32_t fhtype = ((nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_FHTYPE_MASK) >> NFS_FSFLAG_FHTYPE_SHIFT); + if (fhtype != NFS_FH_PERSISTENT) + printf("nfs: warning: non-persistent file handles! for %s\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname); } - if ((argp->flags & NFSMNT_RETRANS) && argp->retrans > 1) { - nmp->nm_retry = argp->retrans; - if (nmp->nm_retry > NFS_MAXREXMIT) - nmp->nm_retry = NFS_MAXREXMIT; + /* make sure it's a directory */ + if (!NFS_BITMAP_ISSET(&nvattr.nva_bitmap, NFS_FATTR_TYPE) || (nvattr.nva_type != VDIR)) { + error = ENOTDIR; + goto nfsmout; } - if (nmp->nm_vers != NFS_VER2) { - if (argp->sotype == SOCK_DGRAM) - maxio = NFS_MAXDGRAMDATA; - else - maxio = NFS_MAXDATA; - } else - maxio = NFS_V2MAXDATA; + /* save the NFS fsid */ + nmp->nm_fsid = nvattr.nva_fsid; + + /* create the root node */ + error = nfs_nget(nmp->nm_mountp, NULL, NULL, dirfh.fh_data, dirfh.fh_len, &nvattr, &xid, rq.r_auth, NG_MARKROOT, npp); + nfsmout_if(error); + + if (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_ACL) + vfs_setextendedsecurity(nmp->nm_mountp); - if ((argp->flags & NFSMNT_WSIZE) && argp->wsize > 0) { - nmp->nm_wsize = argp->wsize; - /* Round down to multiple of blocksize */ - nmp->nm_wsize &= ~(NFS_FABLKSIZE - 1); - if (nmp->nm_wsize <= 0) - nmp->nm_wsize = NFS_FABLKSIZE; + /* adjust I/O sizes to server limits */ + if (NFS_BITMAP_ISSET(nmp->nm_fsattr.nfsa_bitmap, NFS_FATTR_MAXREAD) && (nmp->nm_fsattr.nfsa_maxread > 0)) { + if (nmp->nm_fsattr.nfsa_maxread < (uint64_t)nmp->nm_rsize) { + nmp->nm_rsize = nmp->nm_fsattr.nfsa_maxread & ~(NFS_FABLKSIZE 
- 1); + if (nmp->nm_rsize == 0) + nmp->nm_rsize = nmp->nm_fsattr.nfsa_maxread; + } + } + if (NFS_BITMAP_ISSET(nmp->nm_fsattr.nfsa_bitmap, NFS_FATTR_MAXWRITE) && (nmp->nm_fsattr.nfsa_maxwrite > 0)) { + if (nmp->nm_fsattr.nfsa_maxwrite < (uint64_t)nmp->nm_wsize) { + nmp->nm_wsize = nmp->nm_fsattr.nfsa_maxwrite & ~(NFS_FABLKSIZE - 1); + if (nmp->nm_wsize == 0) + nmp->nm_wsize = nmp->nm_fsattr.nfsa_maxwrite; + } } - if (nmp->nm_wsize > maxio) - nmp->nm_wsize = maxio; - if (nmp->nm_wsize > NFS_MAXBSIZE) - nmp->nm_wsize = NFS_MAXBSIZE; - if ((argp->flags & NFSMNT_RSIZE) && argp->rsize > 0) { - nmp->nm_rsize = argp->rsize; - /* Round down to multiple of blocksize */ - nmp->nm_rsize &= ~(NFS_FABLKSIZE - 1); - if (nmp->nm_rsize <= 0) - nmp->nm_rsize = NFS_FABLKSIZE; + /* set up lease renew timer */ + nmp->nm_renew_timer = thread_call_allocate(nfs4_renew_timer, nmp); + interval = nmp->nm_fsattr.nfsa_lease / 2; + if (interval < 1) + interval = 1; + nfs_interval_timer_start(nmp->nm_renew_timer, interval * 1000); + +nfsmout: + if (fspath.np_components) { + for (comp=0; comp < fspath.np_compcount; comp++) + if (fspath.np_components[comp]) + FREE(fspath.np_components[comp], M_TEMP); + FREE(fspath.np_components, M_TEMP); } - if (nmp->nm_rsize > maxio) - nmp->nm_rsize = maxio; - if (nmp->nm_rsize > NFS_MAXBSIZE) - nmp->nm_rsize = NFS_MAXBSIZE; + NVATTR_CLEANUP(&nvattr); + nfs_fs_locations_cleanup(&nfsls); + if (*npp) + nfs_node_unlock(*npp); + nfsm_chain_cleanup(&nmreq); + nfsm_chain_cleanup(&nmrep); + return (error); +} + +/* + * Thread to handle initial NFS mount connection. + */ +void +nfs_mount_connect_thread(void *arg, __unused wait_result_t wr) +{ + struct nfsmount *nmp = arg; + int error = 0, savederror = 0, slpflag = (NMFLAG(nmp, INTR) ? 
PCATCH : 0); + int done = 0, timeo, tries, maxtries; - if ((argp->flags & NFSMNT_READDIRSIZE) && argp->readdirsize > 0) { - nmp->nm_readdirsize = argp->readdirsize; + if (NM_OMFLAG(nmp, MNTQUICK)) { + timeo = 8; + maxtries = 1; + } else { + timeo = 30; + maxtries = 2; } - if (nmp->nm_readdirsize > maxio) - nmp->nm_readdirsize = maxio; - if (nmp->nm_readdirsize > nmp->nm_rsize) - nmp->nm_readdirsize = nmp->nm_rsize; - if ((argp->flags & NFSMNT_MAXGRPS) && argp->maxgrouplist >= 0 && - argp->maxgrouplist <= NFS_MAXGRPS) - nmp->nm_numgrps = argp->maxgrouplist; - if ((argp->flags & NFSMNT_READAHEAD) && argp->readahead >= 0 && - argp->readahead <= NFS_MAXRAHEAD) - nmp->nm_readahead = argp->readahead; - if (argp->flags & NFSMNT_READAHEAD) - nmp->nm_readahead = argp->readahead; - if (nmp->nm_readahead < 0) - nmp->nm_readahead = 0; - else if (nmp->nm_readahead > NFS_MAXRAHEAD) - nmp->nm_readahead = NFS_MAXRAHEAD; + for (tries = 0; tries < maxtries; tries++) { + error = nfs_connect(nmp, 1, timeo); + switch (error) { + case ETIMEDOUT: + case EAGAIN: + case EPIPE: + case EADDRNOTAVAIL: + case ENETDOWN: + case ENETUNREACH: + case ENETRESET: + case ECONNABORTED: + case ECONNRESET: + case EISCONN: + case ENOTCONN: + case ESHUTDOWN: + case ECONNREFUSED: + case EHOSTDOWN: + case EHOSTUNREACH: + /* just keep retrying on any of these errors */ + break; + case 0: + default: + /* looks like we got an answer... 
*/ + done = 1; + break; + } - if (argp->version >= 4) { - if ((argp->flags & NFSMNT_ACREGMIN) && argp->acregmin >= 0) - nmp->nm_acregmin = argp->acregmin; - if ((argp->flags & NFSMNT_ACREGMAX) && argp->acregmax >= 0) - nmp->nm_acregmax = argp->acregmax; - if ((argp->flags & NFSMNT_ACDIRMIN) && argp->acdirmin >= 0) - nmp->nm_acdirmin = argp->acdirmin; - if ((argp->flags & NFSMNT_ACDIRMAX) && argp->acdirmax >= 0) - nmp->nm_acdirmax = argp->acdirmax; - if (nmp->nm_acregmin > nmp->nm_acregmax) - nmp->nm_acregmin = nmp->nm_acregmax; - if (nmp->nm_acdirmin > nmp->nm_acdirmax) - nmp->nm_acdirmin = nmp->nm_acdirmax; - } - if (argp->version >= 5) { - if (argp->flags & NFSMNT_SECFLAVOR) { - /* - * Check for valid security flavor - */ - switch (argp->auth) { + /* save the best error */ + if (nfs_connect_error_class(error) >= nfs_connect_error_class(savederror)) + savederror = error; + if (done) { + error = savederror; + break; + } + + /* pause before next attempt */ + if ((error = nfs_sigintr(nmp, NULL, current_thread(), 0))) + break; + error = tsleep(nmp, PSOCK|slpflag, "nfs_mount_connect_retry", 2*hz); + if (error && (error != EWOULDBLOCK)) + break; + error = savederror; + } + + /* update status of mount connect */ + lck_mtx_lock(&nmp->nm_lock); + if (!nmp->nm_mounterror) + nmp->nm_mounterror = error; + nmp->nm_state &= ~NFSSTA_MOUNT_THREAD; + lck_mtx_unlock(&nmp->nm_lock); + wakeup(&nmp->nm_nss); +} + +int +nfs_mount_connect(struct nfsmount *nmp) +{ + int error = 0, slpflag; + thread_t thd; + struct timespec ts = { 2, 0 }; + + /* + * Set up the socket. Perform initial search for a location/server/address to + * connect to and negotiate any unspecified mount parameters. This work is + * done on a kernel thread to satisfy reserved port usage needs. + */ + slpflag = NMFLAG(nmp, INTR) ? 
PCATCH : 0; + lck_mtx_lock(&nmp->nm_lock); + /* set flag that the thread is running */ + nmp->nm_state |= NFSSTA_MOUNT_THREAD; + if (kernel_thread_start(nfs_mount_connect_thread, nmp, &thd) != KERN_SUCCESS) { + nmp->nm_state &= ~NFSSTA_MOUNT_THREAD; + nmp->nm_mounterror = EIO; + printf("nfs mount %s start socket connect thread failed\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname); + } else { + thread_deallocate(thd); + } + + /* wait until mount connect thread is finished/gone */ + while (nmp->nm_state & NFSSTA_MOUNT_THREAD) { + error = msleep(&nmp->nm_nss, &nmp->nm_lock, slpflag|PSOCK, "nfsconnectthread", &ts); + if ((error && (error != EWOULDBLOCK)) || ((error = nfs_sigintr(nmp, NULL, current_thread(), 1)))) { + /* record error */ + if (!nmp->nm_mounterror) + nmp->nm_mounterror = error; + /* signal the thread that we are aborting */ + nmp->nm_sockflags |= NMSOCK_UNMOUNT; + if (nmp->nm_nss) + wakeup(nmp->nm_nss); + /* and continue waiting on it to finish */ + slpflag = 0; + } + } + lck_mtx_unlock(&nmp->nm_lock); + + /* grab mount connect status */ + error = nmp->nm_mounterror; + + return (error); +} + +/* + * Common code to mount an NFS file system. 
+ */ +int +mountnfs( + char *xdrbuf, + mount_t mp, + vfs_context_t ctx, + vnode_t *vpp) +{ + struct nfsmount *nmp; + nfsnode_t np; + int error = 0; + struct vfsstatfs *sbp; + struct xdrbuf xb; + uint32_t i, val, vers = 0, minorvers, maxio, iosize, len; + uint32_t *mattrs; + uint32_t *mflags_mask; + uint32_t *mflags; + uint32_t argslength, attrslength; + struct nfs_location_index firstloc = { NLI_VALID, 0, 0, 0 }; + + /* make sure mbuf constants are set up */ + if (!nfs_mbuf_mhlen) + nfs_mbuf_init(); + + if (vfs_flags(mp) & MNT_UPDATE) { + nmp = VFSTONFS(mp); + /* update paths, file handles, etc, here XXX */ + xb_free(xdrbuf); + return (0); + } else { + /* allocate an NFS mount structure for this mount */ + MALLOC_ZONE(nmp, struct nfsmount *, + sizeof (struct nfsmount), M_NFSMNT, M_WAITOK); + if (!nmp) { + xb_free(xdrbuf); + return (ENOMEM); + } + bzero((caddr_t)nmp, sizeof (struct nfsmount)); + lck_mtx_init(&nmp->nm_lock, nfs_mount_grp, LCK_ATTR_NULL); + TAILQ_INIT(&nmp->nm_resendq); + TAILQ_INIT(&nmp->nm_iodq); + TAILQ_INIT(&nmp->nm_gsscl); + LIST_INIT(&nmp->nm_monlist); + vfs_setfsprivate(mp, nmp); + vfs_getnewfsid(mp); + nmp->nm_mountp = mp; + vfs_setauthopaque(mp); + + nfs_nhinit_finish(); + + nmp->nm_args = xdrbuf; + + /* set up defaults */ + nmp->nm_vers = 0; + nmp->nm_timeo = NFS_TIMEO; + nmp->nm_retry = NFS_RETRANS; + nmp->nm_sotype = 0; + nmp->nm_sofamily = 0; + nmp->nm_nfsport = 0; + nmp->nm_wsize = NFS_WSIZE; + nmp->nm_rsize = NFS_RSIZE; + nmp->nm_readdirsize = NFS_READDIRSIZE; + nmp->nm_numgrps = NFS_MAXGRPS; + nmp->nm_readahead = NFS_DEFRAHEAD; + nmp->nm_tprintf_delay = nfs_tprintf_delay; + if (nmp->nm_tprintf_delay < 0) + nmp->nm_tprintf_delay = 0; + nmp->nm_tprintf_initial_delay = nfs_tprintf_initial_delay; + if (nmp->nm_tprintf_initial_delay < 0) + nmp->nm_tprintf_initial_delay = 0; + nmp->nm_acregmin = NFS_MINATTRTIMO; + nmp->nm_acregmax = NFS_MAXATTRTIMO; + nmp->nm_acdirmin = NFS_MINDIRATTRTIMO; + nmp->nm_acdirmax = NFS_MAXDIRATTRTIMO; + 
nmp->nm_auth = RPCAUTH_SYS; + nmp->nm_deadtimeout = 0; + NFS_BITMAP_SET(nmp->nm_flags, NFS_MFLAG_NOACL); + } + + mattrs = nmp->nm_mattrs; + mflags = nmp->nm_mflags; + mflags_mask = nmp->nm_mflags_mask; + + /* set up NFS mount with args */ + xb_init_buffer(&xb, xdrbuf, 2*XDRWORD); + xb_get_32(error, &xb, val); /* version */ + xb_get_32(error, &xb, argslength); /* args length */ + nfsmerr_if(error); + xb_init_buffer(&xb, xdrbuf, argslength); /* restart parsing with actual buffer length */ + xb_get_32(error, &xb, val); /* version */ + xb_get_32(error, &xb, argslength); /* args length */ + xb_get_32(error, &xb, val); /* XDR args version */ + if (val != NFS_XDRARGS_VERSION_0) + error = EINVAL; + len = NFS_MATTR_BITMAP_LEN; + xb_get_bitmap(error, &xb, mattrs, len); /* mount attribute bitmap */ + attrslength = 0; + xb_get_32(error, &xb, attrslength); /* attrs length */ + if (!error && (attrslength > (argslength - ((4+NFS_MATTR_BITMAP_LEN+1)*XDRWORD)))) + error = EINVAL; + nfsmerr_if(error); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_FLAGS)) { + len = NFS_MFLAG_BITMAP_LEN; + xb_get_bitmap(error, &xb, mflags_mask, len); /* mount flag mask */ + len = NFS_MFLAG_BITMAP_LEN; + xb_get_bitmap(error, &xb, mflags, len); /* mount flag values */ + if (!error) { + /* clear all mask bits and OR in all the ones that are set */ + nmp->nm_flags[0] &= ~mflags_mask[0]; + nmp->nm_flags[0] |= (mflags_mask[0] & mflags[0]); + } + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_NFS_VERSION)) { + xb_get_32(error, &xb, vers); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_NFS_MINOR_VERSION)) + xb_get_32(error, &xb, minorvers); + else + minorvers = 0; + nfsmerr_if(error); + switch (vers) { + case 2: + nmp->nm_vers = NFS_VER2; + break; + case 3: + nmp->nm_vers = NFS_VER3; + break; + case 4: + switch (minorvers) { + case 0: + nmp->nm_vers = NFS_VER4; + break; + default: + error = EINVAL; + } + break; + default: + error = EINVAL; + } + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_NFS_MINOR_VERSION)) { + /* should 
have also gotten NFS version (and already gotten minorvers) */ + if (!NFS_BITMAP_ISSET(mattrs, NFS_MATTR_NFS_VERSION)) + error = EINVAL; + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_READ_SIZE)) + xb_get_32(error, &xb, nmp->nm_rsize); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_WRITE_SIZE)) + xb_get_32(error, &xb, nmp->nm_wsize); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_READDIR_SIZE)) + xb_get_32(error, &xb, nmp->nm_readdirsize); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_READAHEAD)) + xb_get_32(error, &xb, nmp->nm_readahead); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_ATTRCACHE_REG_MIN)) { + xb_get_32(error, &xb, nmp->nm_acregmin); + xb_skip(error, &xb, XDRWORD); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_ATTRCACHE_REG_MAX)) { + xb_get_32(error, &xb, nmp->nm_acregmax); + xb_skip(error, &xb, XDRWORD); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_ATTRCACHE_DIR_MIN)) { + xb_get_32(error, &xb, nmp->nm_acdirmin); + xb_skip(error, &xb, XDRWORD); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_ATTRCACHE_DIR_MAX)) { + xb_get_32(error, &xb, nmp->nm_acdirmax); + xb_skip(error, &xb, XDRWORD); + } + nfsmerr_if(error); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_LOCK_MODE)) { + xb_get_32(error, &xb, val); + switch (val) { + case NFS_LOCK_MODE_DISABLED: + case NFS_LOCK_MODE_LOCAL: + if (nmp->nm_vers >= NFS_VER4) { + /* disabled/local lock mode only allowed on v2/v3 */ + error = EINVAL; + break; + } + /* FALLTHROUGH */ + case NFS_LOCK_MODE_ENABLED: + nmp->nm_lockmode = val; + break; + default: + error = EINVAL; + } + } + nfsmerr_if(error); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SECURITY)) { + uint32_t seccnt; + xb_get_32(error, &xb, seccnt); + if (!error && ((seccnt < 1) || (seccnt > NX_MAX_SEC_FLAVORS))) + error = EINVAL; + nfsmerr_if(error); + nmp->nm_sec.count = seccnt; + for (i=0; i < seccnt; i++) { + xb_get_32(error, &xb, nmp->nm_sec.flavors[i]); + /* Check for valid security flavor */ + switch (nmp->nm_sec.flavors[i]) { + case RPCAUTH_NONE: case RPCAUTH_SYS: case RPCAUTH_KRB5: case 
RPCAUTH_KRB5I: case RPCAUTH_KRB5P: - nmp->nm_auth = argp->auth; break; default: error = EINVAL; - goto bad; } } + /* start with the first flavor */ + nmp->nm_auth = nmp->nm_sec.flavors[0]; } - if (argp->version >= 6) { - if (argp->flags & NFSMNT_DEADTIMEOUT) - nmp->nm_deadtimeout = argp->deadtimeout; + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_MAX_GROUP_LIST)) + xb_get_32(error, &xb, nmp->nm_numgrps); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SOCKET_TYPE)) { + char sotype[6]; + + xb_get_32(error, &xb, val); + if (!error && ((val < 3) || (val > 5))) + error = EINVAL; + nfsmerr_if(error); + error = xb_get_bytes(&xb, sotype, val, 0); + nfsmerr_if(error); + sotype[val] = '\0'; + if (!strcmp(sotype, "tcp")) { + nmp->nm_sotype = SOCK_STREAM; + } else if (!strcmp(sotype, "udp")) { + nmp->nm_sotype = SOCK_DGRAM; + } else if (!strcmp(sotype, "tcp4")) { + nmp->nm_sotype = SOCK_STREAM; + nmp->nm_sofamily = AF_INET; + } else if (!strcmp(sotype, "udp4")) { + nmp->nm_sotype = SOCK_DGRAM; + nmp->nm_sofamily = AF_INET; + } else if (!strcmp(sotype, "tcp6")) { + nmp->nm_sotype = SOCK_STREAM; + nmp->nm_sofamily = AF_INET6; + } else if (!strcmp(sotype, "udp6")) { + nmp->nm_sotype = SOCK_DGRAM; + nmp->nm_sofamily = AF_INET6; + } else if (!strcmp(sotype, "inet4")) { + nmp->nm_sofamily = AF_INET; + } else if (!strcmp(sotype, "inet6")) { + nmp->nm_sofamily = AF_INET6; + } else if (!strcmp(sotype, "inet")) { + nmp->nm_sofamily = 0; /* ok */ + } else { + error = EINVAL; + } + if (!error && (nmp->nm_vers >= NFS_VER4) && nmp->nm_sotype && + (nmp->nm_sotype != SOCK_STREAM)) + error = EINVAL; /* NFSv4 is only allowed over TCP. 
*/ + nfsmerr_if(error); } - if ((nmp->nm_flag & NFSMNT_DEADTIMEOUT) && (nmp->nm_deadtimeout <= 0)) - nmp->nm_flag &= ~NFSMNT_DEADTIMEOUT; + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_NFS_PORT)) + xb_get_32(error, &xb, nmp->nm_nfsport); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_MOUNT_PORT)) + xb_get_32(error, &xb, nmp->nm_mountport); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_REQUEST_TIMEOUT)) { + /* convert from time to 0.1s units */ + xb_get_32(error, &xb, nmp->nm_timeo); + xb_get_32(error, &xb, val); + nfsmerr_if(error); + if (val >= 1000000000) + error = EINVAL; + nfsmerr_if(error); + nmp->nm_timeo *= 10; + nmp->nm_timeo += (val+100000000-1)/100000000; + /* now convert to ticks */ + nmp->nm_timeo = (nmp->nm_timeo * NFS_HZ + 5) / 10; + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SOFT_RETRY_COUNT)) { + xb_get_32(error, &xb, val); + if (!error && (val > 1)) + nmp->nm_retry = val; + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_DEAD_TIMEOUT)) { + xb_get_32(error, &xb, nmp->nm_deadtimeout); + xb_skip(error, &xb, XDRWORD); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_FH)) { + nfsmerr_if(error); + MALLOC(nmp->nm_fh, fhandle_t *, sizeof(fhandle_t), M_TEMP, M_WAITOK|M_ZERO); + if (!nmp->nm_fh) + error = ENOMEM; + xb_get_32(error, &xb, nmp->nm_fh->fh_len); + nfsmerr_if(error); + error = xb_get_bytes(&xb, (char*)&nmp->nm_fh->fh_data[0], nmp->nm_fh->fh_len, 0); + } + nfsmerr_if(error); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_FS_LOCATIONS)) { + uint32_t loc, serv, addr, comp; + struct nfs_fs_location *fsl; + struct nfs_fs_server *fss; + struct nfs_fs_path *fsp; + + xb_get_32(error, &xb, nmp->nm_locations.nl_numlocs); /* fs location count */ + /* sanity check location count */ + if (!error && ((nmp->nm_locations.nl_numlocs < 1) || (nmp->nm_locations.nl_numlocs > 256))) + error = EINVAL; + nfsmerr_if(error); + MALLOC(nmp->nm_locations.nl_locations, struct nfs_fs_location **, nmp->nm_locations.nl_numlocs * sizeof(struct nfs_fs_location*), M_TEMP, M_WAITOK|M_ZERO); + if 
(!nmp->nm_locations.nl_locations) + error = ENOMEM; + for (loc = 0; loc < nmp->nm_locations.nl_numlocs; loc++) { + nfsmerr_if(error); + MALLOC(fsl, struct nfs_fs_location *, sizeof(struct nfs_fs_location), M_TEMP, M_WAITOK|M_ZERO); + if (!fsl) + error = ENOMEM; + nmp->nm_locations.nl_locations[loc] = fsl; + xb_get_32(error, &xb, fsl->nl_servcount); /* server count */ + /* sanity check server count */ + if (!error && ((fsl->nl_servcount < 1) || (fsl->nl_servcount > 256))) + error = EINVAL; + nfsmerr_if(error); + MALLOC(fsl->nl_servers, struct nfs_fs_server **, fsl->nl_servcount * sizeof(struct nfs_fs_server*), M_TEMP, M_WAITOK|M_ZERO); + if (!fsl->nl_servers) + error = ENOMEM; + for (serv = 0; serv < fsl->nl_servcount; serv++) { + nfsmerr_if(error); + MALLOC(fss, struct nfs_fs_server *, sizeof(struct nfs_fs_server), M_TEMP, M_WAITOK|M_ZERO); + if (!fss) + error = ENOMEM; + fsl->nl_servers[serv] = fss; + xb_get_32(error, &xb, val); /* server name length */ + /* sanity check server name length */ + if (!error && ((val < 1) || (val > MAXPATHLEN))) + error = EINVAL; + nfsmerr_if(error); + MALLOC(fss->ns_name, char *, val+1, M_TEMP, M_WAITOK|M_ZERO); + if (!fss->ns_name) + error = ENOMEM; + nfsmerr_if(error); + error = xb_get_bytes(&xb, fss->ns_name, val, 0); /* server name */ + xb_get_32(error, &xb, fss->ns_addrcount); /* address count */ + /* sanity check address count (OK to be zero) */ + if (!error && (fss->ns_addrcount > 256)) + error = EINVAL; + nfsmerr_if(error); + if (fss->ns_addrcount > 0) { + MALLOC(fss->ns_addresses, char **, fss->ns_addrcount * sizeof(char *), M_TEMP, M_WAITOK|M_ZERO); + if (!fss->ns_addresses) + error = ENOMEM; + for (addr = 0; addr < fss->ns_addrcount; addr++) { + xb_get_32(error, &xb, val); /* address length */ + /* sanity check address length */ + if (!error && ((val < 1) || (val > 128))) + error = EINVAL; + nfsmerr_if(error); + MALLOC(fss->ns_addresses[addr], char *, val+1, M_TEMP, M_WAITOK|M_ZERO); + if (!fss->ns_addresses[addr]) + 
error = ENOMEM; + nfsmerr_if(error); + error = xb_get_bytes(&xb, fss->ns_addresses[addr], val, 0); /* address */ + } + } + xb_get_32(error, &xb, val); /* server info length */ + xb_skip(error, &xb, val); /* skip server info */ + } + /* get pathname */ + fsp = &fsl->nl_path; + xb_get_32(error, &xb, fsp->np_compcount); /* component count */ + /* sanity check component count */ + if (!error && (fsp->np_compcount > MAXPATHLEN)) + error = EINVAL; + nfsmerr_if(error); + if (fsp->np_compcount) { + MALLOC(fsp->np_components, char **, fsp->np_compcount * sizeof(char*), M_TEMP, M_WAITOK|M_ZERO); + if (!fsp->np_components) + error = ENOMEM; + } + for (comp = 0; comp < fsp->np_compcount; comp++) { + xb_get_32(error, &xb, val); /* component length */ + /* sanity check component length */ + if (!error && (val == 0)) { + /* + * Apparently some people think a path with zero components should + * be encoded with one zero-length component. So, just ignore any + * zero length components. + */ + comp--; + fsp->np_compcount--; + if (fsp->np_compcount == 0) { + FREE(fsp->np_components, M_TEMP); + fsp->np_components = NULL; + } + continue; + } + if (!error && ((val < 1) || (val > MAXPATHLEN))) + error = EINVAL; + nfsmerr_if(error); + MALLOC(fsp->np_components[comp], char *, val+1, M_TEMP, M_WAITOK|M_ZERO); + if (!fsp->np_components[comp]) + error = ENOMEM; + nfsmerr_if(error); + error = xb_get_bytes(&xb, fsp->np_components[comp], val, 0); /* component */ + } + xb_get_32(error, &xb, val); /* fs location info length */ + xb_skip(error, &xb, val); /* skip fs location info */ + } + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_MNTFLAGS)) + xb_skip(error, &xb, XDRWORD); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_MNTFROM)) { + xb_get_32(error, &xb, len); + nfsmerr_if(error); + val = len; + if (val >= sizeof(vfs_statfs(mp)->f_mntfromname)) + val = sizeof(vfs_statfs(mp)->f_mntfromname) - 1; + error = xb_get_bytes(&xb, vfs_statfs(mp)->f_mntfromname, val, 0); + if ((len - val) > 0) + xb_skip(error, 
&xb, len - val); + nfsmerr_if(error); + vfs_statfs(mp)->f_mntfromname[val] = '\0'; + } + nfsmerr_if(error); + + /* + * Sanity check/finalize settings. + */ + + if (nmp->nm_timeo < NFS_MINTIMEO) + nmp->nm_timeo = NFS_MINTIMEO; + else if (nmp->nm_timeo > NFS_MAXTIMEO) + nmp->nm_timeo = NFS_MAXTIMEO; + if (nmp->nm_retry > NFS_MAXREXMIT) + nmp->nm_retry = NFS_MAXREXMIT; + + if (nmp->nm_numgrps > NFS_MAXGRPS) + nmp->nm_numgrps = NFS_MAXGRPS; + if (nmp->nm_readahead > NFS_MAXRAHEAD) + nmp->nm_readahead = NFS_MAXRAHEAD; + if (nmp->nm_acregmin > nmp->nm_acregmax) + nmp->nm_acregmin = nmp->nm_acregmax; + if (nmp->nm_acdirmin > nmp->nm_acdirmax) + nmp->nm_acdirmin = nmp->nm_acdirmax; + + /* need at least one fs location */ + if (nmp->nm_locations.nl_numlocs < 1) + error = EINVAL; + nfsmerr_if(error); + + /* init mount's mntfromname to first location */ + if (!NM_OMATTR_GIVEN(nmp, MNTFROM)) + nfs_location_mntfromname(&nmp->nm_locations, firstloc, + vfs_statfs(mp)->f_mntfromname, sizeof(vfs_statfs(mp)->f_mntfromname), 0); + + /* Need to save the mounting credential for v4. */ + nmp->nm_mcred = vfs_context_ucred(ctx); + if (IS_VALID_CRED(nmp->nm_mcred)) + kauth_cred_ref(nmp->nm_mcred); + + /* + * If a reserved port is required, check for that privilege. + * (Note that mirror mounts are exempt because the privilege was + * already checked for the original mount.) 
+ */ + if (NMFLAG(nmp, RESVPORT) && !vfs_iskernelmount(mp)) + error = priv_check_cred(nmp->nm_mcred, PRIV_NETINET_RESERVEDPORT, 0); + nfsmerr_if(error); + + /* do mount's initial socket connection */ + error = nfs_mount_connect(nmp); + nfsmerr_if(error); /* set up the version-specific function tables */ if (nmp->nm_vers < NFS_VER4) @@ -1729,39 +3089,67 @@ mountnfs( else nmp->nm_funcs = &nfs4_funcs; - /* Set up the sockets and related info */ - nmp->nm_sotype = argp->sotype; - nmp->nm_soproto = argp->proto; - if (nmp->nm_sotype == SOCK_DGRAM) - TAILQ_INIT(&nmp->nm_cwndq); - - lck_mtx_unlock(&nmp->nm_lock); - - /* make sure mbuf constants are set up */ - if (!nfs_mbuf_mhlen) - nfs_mbuf_init(); - + /* sanity check settings now that version/connection is set */ + if (nmp->nm_vers == NFS_VER2) /* ignore RDIRPLUS on NFSv2 */ + NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_RDIRPLUS); if (nmp->nm_vers >= NFS_VER4) { - struct timeval now; - microtime(&now); - nmp->nm_mounttime = ((uint64_t)now.tv_sec << 32) | now.tv_usec; - nmp->nm_mcred = vfs_context_ucred(ctx); + if (NFS_BITMAP_ISSET(nmp->nm_flags, NFS_MFLAG_ACLONLY)) /* aclonly trumps noacl */ + NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_NOACL); + NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_CALLUMNT); + if (nmp->nm_lockmode != NFS_LOCK_MODE_ENABLED) + error = EINVAL; /* disabled/local lock mode only allowed on v2/v3 */ + } else { + /* ignore these if not v4 */ + NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_NOCALLBACK); + NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_NONAMEDATTR); + NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_NOACL); + NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_ACLONLY); if (IS_VALID_CRED(nmp->nm_mcred)) - kauth_cred_ref(nmp->nm_mcred); - nfs4_mount_callback_setup(nmp); + kauth_cred_unref(&nmp->nm_mcred); + } + nfsmerr_if(error); + + if (nmp->nm_sotype == SOCK_DGRAM) { + /* I/O size defaults for UDP are different */ + if (!NFS_BITMAP_ISSET(mattrs, NFS_MATTR_READ_SIZE)) + nmp->nm_rsize = NFS_DGRAM_RSIZE; + if (!NFS_BITMAP_ISSET(mattrs, 
NFS_MATTR_WRITE_SIZE)) + nmp->nm_wsize = NFS_DGRAM_WSIZE; } - /* set up the socket */ - if ((error = nfs_connect(nmp, 1))) - goto bad; + /* round down I/O sizes to multiple of NFS_FABLKSIZE */ + nmp->nm_rsize &= ~(NFS_FABLKSIZE - 1); + if (nmp->nm_rsize <= 0) + nmp->nm_rsize = NFS_FABLKSIZE; + nmp->nm_wsize &= ~(NFS_FABLKSIZE - 1); + if (nmp->nm_wsize <= 0) + nmp->nm_wsize = NFS_FABLKSIZE; + + /* and limit I/O sizes to maximum allowed */ + maxio = (nmp->nm_vers == NFS_VER2) ? NFS_V2MAXDATA : + (nmp->nm_sotype == SOCK_DGRAM) ? NFS_MAXDGRAMDATA : NFS_MAXDATA; + if (maxio > NFS_MAXBSIZE) + maxio = NFS_MAXBSIZE; + if (nmp->nm_rsize > maxio) + nmp->nm_rsize = maxio; + if (nmp->nm_wsize > maxio) + nmp->nm_wsize = maxio; + + if (nmp->nm_readdirsize > maxio) + nmp->nm_readdirsize = maxio; + if (nmp->nm_readdirsize > nmp->nm_rsize) + nmp->nm_readdirsize = nmp->nm_rsize; + + /* Set up the sockets and related info */ + if (nmp->nm_sotype == SOCK_DGRAM) + TAILQ_INIT(&nmp->nm_cwndq); /* * Get the root node/attributes from the NFS server and * do any basic, version-specific setup. 
*/ - error = nmp->nm_funcs->nf_mount(nmp, ctx, argp, &np); - if (error) - goto bad; + error = nmp->nm_funcs->nf_mount(nmp, ctx, &np); + nfsmerr_if(error); /* * A reference count is needed on the node representing the @@ -1776,7 +3164,7 @@ mountnfs( vnode_put(*vpp); if (error) { vnode_recycle(*vpp); - goto bad; + goto nfsmerr; } /* @@ -1788,151 +3176,877 @@ mountnfs( if (!error2) vnode_put(*vpp); vnode_recycle(*vpp); - goto bad; + goto nfsmerr; + } + sbp = vfs_statfs(mp); + sbp->f_bsize = nmp->nm_fsattr.nfsa_bsize; + sbp->f_blocks = nmp->nm_fsattr.nfsa_space_total / sbp->f_bsize; + sbp->f_bfree = nmp->nm_fsattr.nfsa_space_free / sbp->f_bsize; + sbp->f_bavail = nmp->nm_fsattr.nfsa_space_avail / sbp->f_bsize; + sbp->f_bused = (nmp->nm_fsattr.nfsa_space_total / sbp->f_bsize) - + (nmp->nm_fsattr.nfsa_space_free / sbp->f_bsize); + sbp->f_files = nmp->nm_fsattr.nfsa_files_total; + sbp->f_ffree = nmp->nm_fsattr.nfsa_files_free; + sbp->f_iosize = nfs_iosize; + + /* + * Calculate the size used for I/O buffers. Use the larger + * of the two sizes to minimise NFS requests but make sure + * that it is at least one VM page to avoid wasting buffer + * space and to allow easy mmapping of I/O buffers. + * The read/write RPC calls handle the splitting up of + * buffers into multiple requests if the buffer size is + * larger than the I/O size. + */ + iosize = max(nmp->nm_rsize, nmp->nm_wsize); + if (iosize < PAGE_SIZE) + iosize = PAGE_SIZE; + nmp->nm_biosize = trunc_page_32(iosize); + + /* For NFSv3 and greater, there is a (relatively) reliable ACCESS call. */ + if (nmp->nm_vers > NFS_VER2) + vfs_setauthopaqueaccess(mp); + + switch (nmp->nm_lockmode) { + case NFS_LOCK_MODE_DISABLED: + break; + case NFS_LOCK_MODE_LOCAL: + vfs_setlocklocal(nmp->nm_mountp); + break; + case NFS_LOCK_MODE_ENABLED: + default: + if (nmp->nm_vers <= NFS_VER3) + nfs_lockd_mount_register(nmp); + break; + } + + /* success! 
*/ + lck_mtx_lock(&nmp->nm_lock); + nmp->nm_state |= NFSSTA_MOUNTED; + lck_mtx_unlock(&nmp->nm_lock); + return (0); +nfsmerr: + nfs_mount_cleanup(nmp); + return (error); +} + +#if CONFIG_TRIGGERS + +/* + * We've detected a file system boundary on the server and + * need to mount a new file system so that our file systems + * MIRROR the file systems on the server. + * + * Build the mount arguments for the new mount and call kernel_mount(). + */ +int +nfs_mirror_mount_domount(vnode_t dvp, vnode_t vp, vfs_context_t ctx) +{ + nfsnode_t np = VTONFS(vp); + nfsnode_t dnp = VTONFS(dvp); + struct nfsmount *nmp = NFSTONMP(np); + char fstype[MFSTYPENAMELEN], *mntfromname = NULL, *path = NULL, *relpath, *p, *cp; + int error = 0, pathbuflen = MAXPATHLEN, i, mntflags = 0, referral, skipcopy = 0; + size_t nlen; + struct xdrbuf xb, xbnew; + uint32_t mattrs[NFS_MATTR_BITMAP_LEN]; + uint32_t newmattrs[NFS_MATTR_BITMAP_LEN]; + uint32_t newmflags[NFS_MFLAG_BITMAP_LEN]; + uint32_t newmflags_mask[NFS_MFLAG_BITMAP_LEN]; + uint32_t argslength = 0, val, count, mlen, mlen2, rlen, relpathcomps; + uint32_t argslength_offset, attrslength_offset, end_offset; + uint32_t numlocs, loc, numserv, serv, numaddr, addr, numcomp, comp; + char buf[XDRWORD]; + struct nfs_fs_locations nfsls; + + referral = (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL); + if (referral) + bzero(&nfsls, sizeof(nfsls)); + + xb_init(&xbnew, 0); + + if (!nmp || (nmp->nm_state & NFSSTA_FORCE)) + return (ENXIO); + + /* allocate a couple path buffers we need */ + MALLOC_ZONE(mntfromname, char *, pathbuflen, M_NAMEI, M_WAITOK); + if (!mntfromname) { + error = ENOMEM; + goto nfsmerr; + } + MALLOC_ZONE(path, char *, pathbuflen, M_NAMEI, M_WAITOK); + if (!path) { + error = ENOMEM; + goto nfsmerr; + } + + /* get the path for the directory being mounted on */ + error = vn_getpath(vp, path, &pathbuflen); + if (error) { + error = ENOMEM; + goto nfsmerr; + } + + /* + * Set up the mntfromname for the new mount based on the + * current 
mount's mntfromname and the directory's path + * relative to the current mount's mntonname. + * Set up relpath to point at the relative path on the current mount. + * Also, count the number of components in relpath. + * We'll be adding those to each fs location path in the new args. + */ + nlen = strlcpy(mntfromname, vfs_statfs(nmp->nm_mountp)->f_mntfromname, MAXPATHLEN); + if ((nlen > 0) && (mntfromname[nlen-1] == '/')) { /* avoid double '/' in new name */ + mntfromname[nlen-1] = '\0'; + nlen--; + } + relpath = mntfromname + nlen; + nlen = strlcat(mntfromname, path + strlen(vfs_statfs(nmp->nm_mountp)->f_mntonname), MAXPATHLEN); + if (nlen >= MAXPATHLEN) { + error = ENAMETOOLONG; + goto nfsmerr; + } + /* count the number of components in relpath */ + p = relpath; + while (*p && (*p == '/')) + p++; + relpathcomps = 0; + while (*p) { + relpathcomps++; + while (*p && (*p != '/')) + p++; + while (*p && (*p == '/')) + p++; + } + + /* grab a copy of the file system type */ + vfs_name(vnode_mount(vp), fstype); + + /* for referrals, fetch the fs locations */ + if (referral) { + const char *vname = vnode_getname(NFSTOV(np)); + if (!vname) { + error = ENOENT; + } else { + error = nfs4_get_fs_locations(nmp, dnp, NULL, 0, vname, ctx, &nfsls); + vnode_putname(vname); + if (!error && (nfsls.nl_numlocs < 1)) + error = ENOENT; + } + nfsmerr_if(error); + } + + /* set up NFS mount args based on current mount args */ + +#define xb_copy_32(E, XBSRC, XBDST, V) \ + do { \ + if (E) break; \ + xb_get_32((E), (XBSRC), (V)); \ + if (skipcopy) break; \ + xb_add_32((E), (XBDST), (V)); \ + } while (0) +#define xb_copy_opaque(E, XBSRC, XBDST) \ + do { \ + uint32_t __count, __val; \ + xb_copy_32((E), (XBSRC), (XBDST), __count); \ + if (E) break; \ + __count = nfsm_rndup(__count); \ + __count /= XDRWORD; \ + while (__count-- > 0) \ + xb_copy_32((E), (XBSRC), (XBDST), __val); \ + } while (0) + + xb_init_buffer(&xb, nmp->nm_args, 2*XDRWORD); + xb_get_32(error, &xb, val); /* version */ + 
xb_get_32(error, &xb, argslength); /* args length */ + xb_init_buffer(&xb, nmp->nm_args, argslength); + + xb_init_buffer(&xbnew, NULL, 0); + xb_copy_32(error, &xb, &xbnew, val); /* version */ + argslength_offset = xb_offset(&xbnew); + xb_copy_32(error, &xb, &xbnew, val); /* args length */ + xb_copy_32(error, &xb, &xbnew, val); /* XDR args version */ + count = NFS_MATTR_BITMAP_LEN; + xb_get_bitmap(error, &xb, mattrs, count); /* mount attribute bitmap */ + nfsmerr_if(error); + for (i = 0; i < NFS_MATTR_BITMAP_LEN; i++) + newmattrs[i] = mattrs[i]; + if (referral) + NFS_BITMAP_SET(newmattrs, NFS_MATTR_FS_LOCATIONS); + else + NFS_BITMAP_SET(newmattrs, NFS_MATTR_FH); + NFS_BITMAP_SET(newmattrs, NFS_MATTR_FLAGS); + NFS_BITMAP_SET(newmattrs, NFS_MATTR_MNTFLAGS); + NFS_BITMAP_CLR(newmattrs, NFS_MATTR_MNTFROM); + xb_add_bitmap(error, &xbnew, newmattrs, NFS_MATTR_BITMAP_LEN); + attrslength_offset = xb_offset(&xbnew); + xb_copy_32(error, &xb, &xbnew, val); /* attrs length */ + NFS_BITMAP_ZERO(newmflags_mask, NFS_MFLAG_BITMAP_LEN); + NFS_BITMAP_ZERO(newmflags, NFS_MFLAG_BITMAP_LEN); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_FLAGS)) { + count = NFS_MFLAG_BITMAP_LEN; + xb_get_bitmap(error, &xb, newmflags_mask, count); /* mount flag mask bitmap */ + count = NFS_MFLAG_BITMAP_LEN; + xb_get_bitmap(error, &xb, newmflags, count); /* mount flag bitmap */ + } + NFS_BITMAP_SET(newmflags_mask, NFS_MFLAG_EPHEMERAL); + NFS_BITMAP_SET(newmflags, NFS_MFLAG_EPHEMERAL); + xb_add_bitmap(error, &xbnew, newmflags_mask, NFS_MFLAG_BITMAP_LEN); + xb_add_bitmap(error, &xbnew, newmflags, NFS_MFLAG_BITMAP_LEN); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_NFS_VERSION)) + xb_copy_32(error, &xb, &xbnew, val); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_NFS_MINOR_VERSION)) + xb_copy_32(error, &xb, &xbnew, val); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_READ_SIZE)) + xb_copy_32(error, &xb, &xbnew, val); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_WRITE_SIZE)) + xb_copy_32(error, &xb, &xbnew, val); + if 
(NFS_BITMAP_ISSET(mattrs, NFS_MATTR_READDIR_SIZE)) + xb_copy_32(error, &xb, &xbnew, val); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_READAHEAD)) + xb_copy_32(error, &xb, &xbnew, val); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_ATTRCACHE_REG_MIN)) { + xb_copy_32(error, &xb, &xbnew, val); + xb_copy_32(error, &xb, &xbnew, val); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_ATTRCACHE_REG_MAX)) { + xb_copy_32(error, &xb, &xbnew, val); + xb_copy_32(error, &xb, &xbnew, val); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_ATTRCACHE_DIR_MIN)) { + xb_copy_32(error, &xb, &xbnew, val); + xb_copy_32(error, &xb, &xbnew, val); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_ATTRCACHE_DIR_MAX)) { + xb_copy_32(error, &xb, &xbnew, val); + xb_copy_32(error, &xb, &xbnew, val); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_LOCK_MODE)) + xb_copy_32(error, &xb, &xbnew, val); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SECURITY)) { + xb_copy_32(error, &xb, &xbnew, count); + while (!error && (count-- > 0)) + xb_copy_32(error, &xb, &xbnew, val); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_MAX_GROUP_LIST)) + xb_copy_32(error, &xb, &xbnew, val); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SOCKET_TYPE)) + xb_copy_opaque(error, &xb, &xbnew); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_NFS_PORT)) + xb_copy_32(error, &xb, &xbnew, val); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_MOUNT_PORT)) + xb_copy_32(error, &xb, &xbnew, val); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_REQUEST_TIMEOUT)) { + xb_copy_32(error, &xb, &xbnew, val); + xb_copy_32(error, &xb, &xbnew, val); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SOFT_RETRY_COUNT)) + xb_copy_32(error, &xb, &xbnew, val); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_DEAD_TIMEOUT)) { + xb_copy_32(error, &xb, &xbnew, val); + xb_copy_32(error, &xb, &xbnew, val); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_FH)) { + xb_get_32(error, &xb, count); + xb_skip(error, &xb, count); + } + if (!referral) { + /* set the initial file handle to the directory's file handle */ + 
xb_add_fh(error, &xbnew, np->n_fhp, np->n_fhsize); + } + /* copy/extend/skip fs locations */ + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_FS_LOCATIONS)) { + numlocs = numserv = numaddr = numcomp = 0; + if (referral) /* don't copy the fs locations for a referral */ + skipcopy = 1; + xb_copy_32(error, &xb, &xbnew, numlocs); /* location count */ + for (loc = 0; !error && (loc < numlocs); loc++) { + xb_copy_32(error, &xb, &xbnew, numserv); /* server count */ + for (serv = 0; !error && (serv < numserv); serv++) { + xb_copy_opaque(error, &xb, &xbnew); /* server name */ + xb_copy_32(error, &xb, &xbnew, numaddr); /* address count */ + for (addr = 0; !error && (addr < numaddr); addr++) + xb_copy_opaque(error, &xb, &xbnew); /* address */ + xb_copy_opaque(error, &xb, &xbnew); /* server info */ + } + /* pathname */ + xb_get_32(error, &xb, numcomp); /* component count */ + if (!skipcopy) + xb_add_32(error, &xbnew, numcomp+relpathcomps); /* new component count */ + for (comp = 0; !error && (comp < numcomp); comp++) + xb_copy_opaque(error, &xb, &xbnew); /* component */ + /* add additional components */ + for (comp = 0; !skipcopy && !error && (comp < relpathcomps); comp++) { + p = relpath; + while (*p && (*p == '/')) + p++; + while (*p && !error) { + cp = p; + while (*p && (*p != '/')) + p++; + xb_add_string(error, &xbnew, cp, (p - cp)); /* component */ + while (*p && (*p == '/')) + p++; + } + } + xb_copy_opaque(error, &xb, &xbnew); /* fs location info */ + } + if (referral) + skipcopy = 0; + } + if (referral) { + /* add referral's fs locations */ + xb_add_32(error, &xbnew, nfsls.nl_numlocs); /* FS_LOCATIONS */ + for (loc = 0; !error && (loc < nfsls.nl_numlocs); loc++) { + xb_add_32(error, &xbnew, nfsls.nl_locations[loc]->nl_servcount); + for (serv = 0; !error && (serv < nfsls.nl_locations[loc]->nl_servcount); serv++) { + xb_add_string(error, &xbnew, nfsls.nl_locations[loc]->nl_servers[serv]->ns_name, + strlen(nfsls.nl_locations[loc]->nl_servers[serv]->ns_name)); + xb_add_32(error, 
&xbnew, nfsls.nl_locations[loc]->nl_servers[serv]->ns_addrcount); + for (addr = 0; !error && (addr < nfsls.nl_locations[loc]->nl_servers[serv]->ns_addrcount); addr++) + xb_add_string(error, &xbnew, nfsls.nl_locations[loc]->nl_servers[serv]->ns_addresses[addr], + strlen(nfsls.nl_locations[loc]->nl_servers[serv]->ns_addresses[addr])); + xb_add_32(error, &xbnew, 0); /* empty server info */ + } + xb_add_32(error, &xbnew, nfsls.nl_locations[loc]->nl_path.np_compcount); + for (comp = 0; !error && (comp < nfsls.nl_locations[loc]->nl_path.np_compcount); comp++) + xb_add_string(error, &xbnew, nfsls.nl_locations[loc]->nl_path.np_components[comp], + strlen(nfsls.nl_locations[loc]->nl_path.np_components[comp])); + xb_add_32(error, &xbnew, 0); /* empty fs location info */ + } + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_MNTFLAGS)) + xb_get_32(error, &xb, mntflags); + /* + * We add the following mount flags to the ones for the mounted-on mount: + * MNT_DONTBROWSE - to keep the mount from showing up as a separate volume + * MNT_AUTOMOUNTED - to keep DiskArb from retriggering the mount after + * an unmount (looking for /.autodiskmounted) + */ + mntflags |= (MNT_AUTOMOUNTED | MNT_DONTBROWSE); + xb_add_32(error, &xbnew, mntflags); + if (!referral && NFS_BITMAP_ISSET(mattrs, NFS_MATTR_MNTFROM)) { + /* copy mntfrom string and add relpath */ + rlen = strlen(relpath); + xb_get_32(error, &xb, mlen); + nfsmerr_if(error); + mlen2 = mlen + ((relpath[0] != '/') ? 
1 : 0) + rlen; + xb_add_32(error, &xbnew, mlen2); + count = mlen/XDRWORD; + /* copy the original string */ + while (count-- > 0) + xb_copy_32(error, &xb, &xbnew, val); + if (!error && (mlen % XDRWORD)) { + error = xb_get_bytes(&xb, buf, mlen%XDRWORD, 0); + if (!error) + error = xb_add_bytes(&xbnew, buf, mlen%XDRWORD, 1); + } + /* insert a '/' if the relative path doesn't start with one */ + if (!error && (relpath[0] != '/')) { + buf[0] = '/'; + error = xb_add_bytes(&xbnew, buf, 1, 1); + } + /* add the additional relative path */ + if (!error) + error = xb_add_bytes(&xbnew, relpath, rlen, 1); + /* make sure the resulting string has the right number of pad bytes */ + if (!error && (mlen2 != nfsm_rndup(mlen2))) { + bzero(buf, sizeof(buf)); + count = nfsm_rndup(mlen2) - mlen2; + error = xb_add_bytes(&xbnew, buf, count, 1); + } + } + xb_build_done(error, &xbnew); + + /* update opaque counts */ + end_offset = xb_offset(&xbnew); + if (!error) { + error = xb_seek(&xbnew, argslength_offset); + argslength = end_offset - argslength_offset + XDRWORD/*version*/; + xb_add_32(error, &xbnew, argslength); + } + if (!error) { + error = xb_seek(&xbnew, attrslength_offset); + xb_add_32(error, &xbnew, end_offset - attrslength_offset - XDRWORD/*don't include length field*/); + } + nfsmerr_if(error); + + /* + * For kernel_mount() call, use the existing mount flags (instead of the + * original flags) because flags like MNT_NOSUID and MNT_NODEV may have + * been silently enforced. 
+ */ + mntflags = vnode_vfsvisflags(vp); + mntflags |= (MNT_AUTOMOUNTED | MNT_DONTBROWSE); + + /* do the mount */ + error = kernel_mount(fstype, dvp, vp, path, xb_buffer_base(&xbnew), argslength, + mntflags, KERNEL_MOUNT_PERMIT_UNMOUNT | KERNEL_MOUNT_NOAUTH, ctx); + +nfsmerr: + if (error) + printf("nfs: mirror mount of %s on %s failed (%d)\n", + mntfromname, path, error); + /* clean up */ + xb_cleanup(&xbnew); + if (referral) + nfs_fs_locations_cleanup(&nfsls); + if (path) + FREE_ZONE(path, MAXPATHLEN, M_NAMEI); + if (mntfromname) + FREE_ZONE(mntfromname, MAXPATHLEN, M_NAMEI); + if (!error) + nfs_ephemeral_mount_harvester_start(); + return (error); +} + +/* + * trigger vnode functions + */ + +resolver_result_t +nfs_mirror_mount_trigger_resolve( + vnode_t vp, + const struct componentname *cnp, + enum path_operation pop, + __unused int flags, + __unused void *data, + vfs_context_t ctx) +{ + nfsnode_t np = VTONFS(vp); + vnode_t pvp = NULLVP; + int error = 0; + resolver_result_t result; + + /* + * We have a trigger node that doesn't have anything mounted on it yet. + * We'll do the mount if either: + * (a) this isn't the last component of the path OR + * (b) this is an op that looks like it should trigger the mount. + */ + if (cnp->cn_flags & ISLASTCN) { + switch (pop) { + case OP_MOUNT: + case OP_UNMOUNT: + case OP_STATFS: + case OP_LINK: + case OP_UNLINK: + case OP_RENAME: + case OP_MKNOD: + case OP_MKFIFO: + case OP_SYMLINK: + case OP_ACCESS: + case OP_GETATTR: + case OP_MKDIR: + case OP_RMDIR: + case OP_REVOKE: + case OP_GETXATTR: + case OP_LISTXATTR: + /* don't perform the mount for these operations */ + result = vfs_resolver_result(np->n_trigseq, RESOLVER_NOCHANGE, 0); +#ifdef NFS_TRIGGER_DEBUG + NP(np, "nfs trigger RESOLVE: no change, last %d nameiop %d, seq %d", + (cnp->cn_flags & ISLASTCN) ? 
1 : 0, cnp->cn_nameiop, np->n_trigseq); +#endif + return (result); + case OP_OPEN: + case OP_CHDIR: + case OP_CHROOT: + case OP_TRUNCATE: + case OP_COPYFILE: + case OP_PATHCONF: + case OP_READLINK: + case OP_SETATTR: + case OP_EXCHANGEDATA: + case OP_SEARCHFS: + case OP_FSCTL: + case OP_SETXATTR: + case OP_REMOVEXATTR: + default: + /* go ahead and do the mount */ + break; + } + } + + if (vnode_mountedhere(vp) != NULL) { + /* + * Um... there's already something mounted. + * Been there. Done that. Let's just say it succeeded. + */ + error = 0; + goto skipmount; + } + + if ((error = nfs_node_set_busy(np, vfs_context_thread(ctx)))) { + result = vfs_resolver_result(np->n_trigseq, RESOLVER_ERROR, error); +#ifdef NFS_TRIGGER_DEBUG + NP(np, "nfs trigger RESOLVE: busy error %d, last %d nameiop %d, seq %d", + error, (cnp->cn_flags & ISLASTCN) ? 1 : 0, cnp->cn_nameiop, np->n_trigseq); +#endif + return (result); + } + + pvp = vnode_getparent(vp); + if (pvp == NULLVP) + error = EINVAL; + if (!error) + error = nfs_mirror_mount_domount(pvp, vp, ctx); +skipmount: + if (!error) + np->n_trigseq++; + result = vfs_resolver_result(np->n_trigseq, error ? RESOLVER_ERROR : RESOLVER_RESOLVED, error); +#ifdef NFS_TRIGGER_DEBUG + NP(np, "nfs trigger RESOLVE: %s %d, last %d nameiop %d, seq %d", + error ? "error" : "resolved", error, + (cnp->cn_flags & ISLASTCN) ? 
1 : 0, cnp->cn_nameiop, np->n_trigseq); +#endif + + if (pvp != NULLVP) + vnode_put(pvp); + nfs_node_clear_busy(np); + return (result); +} + +resolver_result_t +nfs_mirror_mount_trigger_unresolve( + vnode_t vp, + int flags, + __unused void *data, + vfs_context_t ctx) +{ + nfsnode_t np = VTONFS(vp); + mount_t mp; + int error; + resolver_result_t result; + + if ((error = nfs_node_set_busy(np, vfs_context_thread(ctx)))) { + result = vfs_resolver_result(np->n_trigseq, RESOLVER_ERROR, error); +#ifdef NFS_TRIGGER_DEBUG + NP(np, "nfs trigger UNRESOLVE: busy error %d, seq %d", error, np->n_trigseq); +#endif + return (result); + } + + mp = vnode_mountedhere(vp); + if (!mp) + error = EINVAL; + if (!error) + error = vfs_unmountbyfsid(&(vfs_statfs(mp)->f_fsid), flags, ctx); + if (!error) + np->n_trigseq++; + result = vfs_resolver_result(np->n_trigseq, error ? RESOLVER_ERROR : RESOLVER_UNRESOLVED, error); +#ifdef NFS_TRIGGER_DEBUG + NP(np, "nfs trigger UNRESOLVE: %s %d, seq %d", + error ? "error" : "unresolved", error, np->n_trigseq); +#endif + nfs_node_clear_busy(np); + return (result); +} + +resolver_result_t +nfs_mirror_mount_trigger_rearm( + vnode_t vp, + __unused int flags, + __unused void *data, + vfs_context_t ctx) +{ + nfsnode_t np = VTONFS(vp); + int error; + resolver_result_t result; + + if ((error = nfs_node_set_busy(np, vfs_context_thread(ctx)))) { + result = vfs_resolver_result(np->n_trigseq, RESOLVER_ERROR, error); +#ifdef NFS_TRIGGER_DEBUG + NP(np, "nfs trigger REARM: busy error %d, seq %d", error, np->n_trigseq); +#endif + return (result); + } + + np->n_trigseq++; + result = vfs_resolver_result(np->n_trigseq, + vnode_mountedhere(vp) ? RESOLVER_RESOLVED : RESOLVER_UNRESOLVED, 0); +#ifdef NFS_TRIGGER_DEBUG + NP(np, "nfs trigger REARM: %s, seq %d", + vnode_mountedhere(vp) ? 
"resolved" : "unresolved", np->n_trigseq); +#endif + nfs_node_clear_busy(np); + return (result); +} + +/* + * Periodically attempt to unmount ephemeral (mirror) mounts in an attempt to limit + * the number of unused mounts. + */ + +#define NFS_EPHEMERAL_MOUNT_HARVEST_INTERVAL 120 /* how often the harvester runs */ +struct nfs_ephemeral_mount_harvester_info { + fsid_t fsid; /* FSID that we need to try to unmount */ + uint32_t mountcount; /* count of ephemeral mounts seen in scan */ + }; +/* various globals for the harvester */ +static thread_call_t nfs_ephemeral_mount_harvester_timer = NULL; +static int nfs_ephemeral_mount_harvester_on = 0; + +kern_return_t thread_terminate(thread_t); + +static int +nfs_ephemeral_mount_harvester_callback(mount_t mp, void *arg) +{ + struct nfs_ephemeral_mount_harvester_info *hinfo = arg; + struct nfsmount *nmp; + struct timeval now; + + if (strcmp(mp->mnt_vfsstat.f_fstypename, "nfs")) + return (VFS_RETURNED); + nmp = VFSTONFS(mp); + if (!nmp || !NMFLAG(nmp, EPHEMERAL)) + return (VFS_RETURNED); + hinfo->mountcount++; + + /* avoid unmounting mounts that have been triggered within the last harvest interval */ + microtime(&now); + if ((nmp->nm_mounttime >> 32) > ((uint32_t)now.tv_sec - NFS_EPHEMERAL_MOUNT_HARVEST_INTERVAL)) + return (VFS_RETURNED); + + if (hinfo->fsid.val[0] || hinfo->fsid.val[1]) { + /* attempt to unmount previously-found ephemeral mount */ + vfs_unmountbyfsid(&hinfo->fsid, 0, vfs_context_kernel()); + hinfo->fsid.val[0] = hinfo->fsid.val[1] = 0; } - sbp = vfs_statfs(mp); - sbp->f_bsize = nmp->nm_fsattr.nfsa_bsize; - sbp->f_blocks = nmp->nm_fsattr.nfsa_space_total / sbp->f_bsize; - sbp->f_bfree = nmp->nm_fsattr.nfsa_space_free / sbp->f_bsize; - sbp->f_bavail = nmp->nm_fsattr.nfsa_space_avail / sbp->f_bsize; - sbp->f_bused = (nmp->nm_fsattr.nfsa_space_total / sbp->f_bsize) - - (nmp->nm_fsattr.nfsa_space_free / sbp->f_bsize); - sbp->f_files = nmp->nm_fsattr.nfsa_files_total; - sbp->f_ffree = nmp->nm_fsattr.nfsa_files_free; 
- sbp->f_iosize = nfs_iosize; /* - * Calculate the size used for I/O buffers. Use the larger - * of the two sizes to minimise NFS requests but make sure - * that it is at least one VM page to avoid wasting buffer - * space and to allow easy mmapping of I/O buffers. - * The read/write RPC calls handle the splitting up of - * buffers into multiple requests if the buffer size is - * larger than the I/O size. + * We can't call unmount here since we hold a mount iter ref + * on mp so save its fsid for the next call iteration to unmount. */ - iosize = max(nmp->nm_rsize, nmp->nm_wsize); - if (iosize < PAGE_SIZE) - iosize = PAGE_SIZE; - nmp->nm_biosize = trunc_page_32(iosize); + hinfo->fsid.val[0] = mp->mnt_vfsstat.f_fsid.val[0]; + hinfo->fsid.val[1] = mp->mnt_vfsstat.f_fsid.val[1]; - /* - * V3 mounts give us a (relatively) reliable remote access(2) - * call, so advertise the fact. - * - * XXX this may not be the best way to go, as the granularity - * offered isn't a good match to our needs. - */ - if (nmp->nm_vers != NFS_VER2) - vfs_setauthopaqueaccess(mp); + return (VFS_RETURNED); +} - if (nmp->nm_flag & NFSMNT_LOCALLOCKS) - vfs_setlocklocal(nmp->nm_mountp); - if (!(nmp->nm_flag & (NFSMNT_NOLOCKS|NFSMNT_LOCALLOCKS))) - nfs_lockd_mount_change(1); +/* + * Spawn a thread to do the ephemeral mount harvesting. 
+ */ +static void +nfs_ephemeral_mount_harvester_timer_func(void) +{ + thread_t thd; - lck_mtx_lock(&nmp->nm_lock); - nmp->nm_state |= NFSSTA_MOUNTED; - lck_mtx_unlock(&nmp->nm_lock); - return (0); -bad: - /* mark the socket for termination */ - lck_mtx_lock(&nmp->nm_lock); - nmp->nm_sockflags |= NMSOCK_UNMOUNT; - /* wait for any socket poking to complete */ - while (nmp->nm_sockflags & NMSOCK_POKE) - msleep(&nmp->nm_sockflags, &nmp->nm_lock, PZERO-1, "nfswaitpoke", &ts); - /* wait for the socket thread to terminate */ - while (nmp->nm_sockthd) { - wakeup(&nmp->nm_sockthd); - msleep(&nmp->nm_sockthd, &nmp->nm_lock, PZERO-1, "nfswaitsockthd", &ts); + if (kernel_thread_start(nfs_ephemeral_mount_harvester, NULL, &thd) == KERN_SUCCESS) + thread_deallocate(thd); +} + +/* + * Iterate all mounts looking for NFS ephemeral mounts to try to unmount. + */ +void +nfs_ephemeral_mount_harvester(__unused void *arg, __unused wait_result_t wr) +{ + struct nfs_ephemeral_mount_harvester_info hinfo; + uint64_t deadline; + + hinfo.mountcount = 0; + hinfo.fsid.val[0] = hinfo.fsid.val[1] = 0; + vfs_iterate(VFS_ITERATE_TAIL_FIRST, nfs_ephemeral_mount_harvester_callback, &hinfo); + if (hinfo.fsid.val[0] || hinfo.fsid.val[1]) { + /* attempt to unmount last found ephemeral mount */ + vfs_unmountbyfsid(&hinfo.fsid, 0, vfs_context_kernel()); } - /* tear down the socket */ - lck_mtx_unlock(&nmp->nm_lock); - nfs_disconnect(nmp); - if (nmp->nm_vers >= NFS_VER4) { - if (nmp->nm_cbid) - nfs4_mount_callback_shutdown(nmp); - if (nmp->nm_renew_timer) { - thread_call_cancel(nmp->nm_renew_timer); - thread_call_free(nmp->nm_renew_timer); - } - if (nmp->nm_longid) { - /* remove/deallocate the client ID data */ - lck_mtx_lock(nfs_global_mutex); - TAILQ_REMOVE(&nfsclientids, nmp->nm_longid, nci_link); - if (nmp->nm_longid->nci_id) - FREE(nmp->nm_longid->nci_id, M_TEMP); - FREE(nmp->nm_longid, M_TEMP); - lck_mtx_unlock(nfs_global_mutex); - } - if (IS_VALID_CRED(nmp->nm_mcred)) - 
kauth_cred_unref(&nmp->nm_mcred); + + lck_mtx_lock(nfs_global_mutex); + if (!hinfo.mountcount) { + /* no more ephemeral mounts - don't need timer */ + nfs_ephemeral_mount_harvester_on = 0; + } else { + /* re-arm the timer */ + clock_interval_to_deadline(NFS_EPHEMERAL_MOUNT_HARVEST_INTERVAL, NSEC_PER_SEC, &deadline); + thread_call_enter_delayed(nfs_ephemeral_mount_harvester_timer, deadline); + nfs_ephemeral_mount_harvester_on = 1; } - lck_mtx_destroy(&nmp->nm_lock, nfs_mount_grp); - FREE_ZONE((caddr_t)nmp, sizeof (struct nfsmount), M_NFSMNT); - mbuf_freem(nam); - return (error); + lck_mtx_unlock(nfs_global_mutex); + + /* thread done */ + thread_terminate(current_thread()); } +/* + * Make sure the NFS ephemeral mount harvester timer is running. + */ void -nfs3_umount_rpc(struct nfsmount *nmp, vfs_context_t ctx, int timeo) +nfs_ephemeral_mount_harvester_start(void) +{ + uint64_t deadline; + + lck_mtx_lock(nfs_global_mutex); + if (nfs_ephemeral_mount_harvester_on) { + lck_mtx_unlock(nfs_global_mutex); + return; + } + if (nfs_ephemeral_mount_harvester_timer == NULL) + nfs_ephemeral_mount_harvester_timer = thread_call_allocate((thread_call_func_t)nfs_ephemeral_mount_harvester_timer_func, NULL); + clock_interval_to_deadline(NFS_EPHEMERAL_MOUNT_HARVEST_INTERVAL, NSEC_PER_SEC, &deadline); + thread_call_enter_delayed(nfs_ephemeral_mount_harvester_timer, deadline); + nfs_ephemeral_mount_harvester_on = 1; + lck_mtx_unlock(nfs_global_mutex); +} + +#endif + +/* + * Send a MOUNT protocol MOUNT request to the server to get the initial file handle (and security). 
+ */ +int +nfs3_mount_rpc(struct nfsmount *nmp, struct sockaddr *sa, int sotype, int nfsvers, char *path, vfs_context_t ctx, int timeo, fhandle_t *fh, struct nfs_sec *sec) { - int error = 0, auth_len, slen; + int error = 0, slen, mntproto; thread_t thd = vfs_context_thread(ctx); kauth_cred_t cred = vfs_context_ucred(ctx); - char *path; uint64_t xid = 0; struct nfsm_chain nmreq, nmrep; mbuf_t mreq; - uint32_t mntport = 0; - struct sockaddr *nam = mbuf_data(nmp->nm_nam); - struct sockaddr_in saddr; - - bcopy(nam, &saddr, min(sizeof(saddr), nam->sa_len)); - auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ? - nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) + - 5 * NFSX_UNSIGNED; + uint32_t mntvers, mntport, val; + struct sockaddr_storage ss; + struct sockaddr *saddr = (struct sockaddr*)&ss; + nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); - /* send portmap request to get mountd port */ - saddr.sin_port = htons(PMAPPORT); - nfsm_chain_build_alloc_init(error, &nmreq, 4*NFSX_UNSIGNED); - nfsm_chain_add_32(error, &nmreq, RPCPROG_MNT); - nfsm_chain_add_32(error, &nmreq, RPCMNT_VER1); - nfsm_chain_add_32(error, &nmreq, IPPROTO_UDP); - nfsm_chain_add_32(error, &nmreq, 0); + mntvers = (nfsvers == NFS_VER2) ? RPCMNT_VER1 : RPCMNT_VER3; + mntproto = (NM_OMFLAG(nmp, MNTUDP) || (sotype == SOCK_DGRAM)) ? 
IPPROTO_UDP : IPPROTO_TCP; + sec->count = 0; + + bcopy(sa, saddr, min(sizeof(ss), sa->sa_len)); + if (saddr->sa_family == AF_INET) { + if (nmp->nm_mountport) + ((struct sockaddr_in*)saddr)->sin_port = htons(nmp->nm_mountport); + mntport = ntohs(((struct sockaddr_in*)saddr)->sin_port); + } else { + if (nmp->nm_mountport) + ((struct sockaddr_in6*)saddr)->sin6_port = htons(nmp->nm_mountport); + mntport = ntohs(((struct sockaddr_in6*)saddr)->sin6_port); + } + + while (!mntport) { + error = nfs_portmap_lookup(nmp, ctx, saddr, NULL, RPCPROG_MNT, mntvers, mntproto, timeo); + nfsmout_if(error); + if (saddr->sa_family == AF_INET) + mntport = ntohs(((struct sockaddr_in*)saddr)->sin_port); + else + mntport = ntohs(((struct sockaddr_in6*)saddr)->sin6_port); + if (!mntport) { + /* if not found and TCP, then retry with UDP */ + if (mntproto == IPPROTO_UDP) { + error = EPROGUNAVAIL; + break; + } + mntproto = IPPROTO_UDP; + bcopy(sa, saddr, min(sizeof(ss), sa->sa_len)); + } + } + nfsmout_if(error || !mntport); + + /* MOUNT protocol MOUNT request */ + slen = strlen(path); + nfsm_chain_build_alloc_init(error, &nmreq, NFSX_UNSIGNED + nfsm_rndup(slen)); + nfsm_chain_add_name(error, &nmreq, path, slen, nmp); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); - error = nfsm_rpchead2(SOCK_DGRAM, PMAPPROG, PMAPVERS, PMAPPROC_GETPORT, - RPCAUTH_SYS, auth_len, cred, NULL, nmreq.nmc_mhead, &xid, &mreq); + error = nfsm_rpchead2(nmp, (mntproto == IPPROTO_UDP) ? SOCK_DGRAM : SOCK_STREAM, + RPCPROG_MNT, mntvers, RPCMNT_MOUNT, + RPCAUTH_SYS, cred, NULL, nmreq.nmc_mhead, &xid, &mreq); nfsmout_if(error); nmreq.nmc_mhead = NULL; - error = nfs_aux_request(nmp, thd, &saddr, mreq, R_XID32(xid), 0, timeo, &nmrep); - nfsmout_if(error); - - /* grab mountd port from portmap response */ - nfsm_chain_get_32(error, &nmrep, mntport); + error = nfs_aux_request(nmp, thd, saddr, NULL, + ((mntproto == IPPROTO_UDP) ? 
SOCK_DGRAM : SOCK_STREAM), + mreq, R_XID32(xid), 1, timeo, &nmrep); nfsmout_if(error); + nfsm_chain_get_32(error, &nmrep, val); + if (!error && val) + error = val; + nfsm_chain_get_fh(error, &nmrep, nfsvers, fh); + if (!error && (nfsvers > NFS_VER2)) { + sec->count = NX_MAX_SEC_FLAVORS; + error = nfsm_chain_get_secinfo(&nmrep, &sec->flavors[0], &sec->count); + } +nfsmout: nfsm_chain_cleanup(&nmreq); nfsm_chain_cleanup(&nmrep); - xid = 0; + return (error); +} + + +/* + * Send a MOUNT protocol UNMOUNT request to tell the server we've unmounted it. + */ +void +nfs3_umount_rpc(struct nfsmount *nmp, vfs_context_t ctx, int timeo) +{ + int error = 0, slen, mntproto; + thread_t thd = vfs_context_thread(ctx); + kauth_cred_t cred = vfs_context_ucred(ctx); + char *path; + uint64_t xid = 0; + struct nfsm_chain nmreq, nmrep; + mbuf_t mreq; + uint32_t mntvers, mntport; + struct sockaddr_storage ss; + struct sockaddr *saddr = (struct sockaddr*)&ss; + + if (!nmp->nm_saddr) + return; + + nfsm_chain_null(&nmreq); + nfsm_chain_null(&nmrep); + + mntvers = (nmp->nm_vers == NFS_VER2) ? RPCMNT_VER1 : RPCMNT_VER3; + mntproto = (NM_OMFLAG(nmp, MNTUDP) || (nmp->nm_sotype == SOCK_DGRAM)) ? 
IPPROTO_UDP : IPPROTO_TCP; + mntport = nmp->nm_mountport; + + bcopy(nmp->nm_saddr, saddr, min(sizeof(ss), nmp->nm_saddr->sa_len)); + if (saddr->sa_family == AF_INET) + ((struct sockaddr_in*)saddr)->sin_port = htons(mntport); + else + ((struct sockaddr_in6*)saddr)->sin6_port = htons(mntport); + + while (!mntport) { + error = nfs_portmap_lookup(nmp, ctx, saddr, NULL, RPCPROG_MNT, mntvers, mntproto, timeo); + nfsmout_if(error); + if (saddr->sa_family == AF_INET) + mntport = ntohs(((struct sockaddr_in*)saddr)->sin_port); + else + mntport = ntohs(((struct sockaddr_in6*)saddr)->sin6_port); + /* if not found and mntvers > VER1, then retry with VER1 */ + if (!mntport) { + if (mntvers > RPCMNT_VER1) { + mntvers = RPCMNT_VER1; + } else if (mntproto == IPPROTO_TCP) { + mntproto = IPPROTO_UDP; + mntvers = (nmp->nm_vers == NFS_VER2) ? RPCMNT_VER1 : RPCMNT_VER3; + } else { + break; + } + bcopy(nmp->nm_saddr, saddr, min(sizeof(ss), nmp->nm_saddr->sa_len)); + } + } + nfsmout_if(!mntport); /* MOUNT protocol UNMOUNT request */ - saddr.sin_port = htons(mntport); path = &vfs_statfs(nmp->nm_mountp)->f_mntfromname[0]; while (*path && (*path != '/')) path++; slen = strlen(path); nfsm_chain_build_alloc_init(error, &nmreq, NFSX_UNSIGNED + nfsm_rndup(slen)); - nfsm_chain_add_string(error, &nmreq, path, slen); + nfsm_chain_add_name(error, &nmreq, path, slen, nmp); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); - error = nfsm_rpchead2(SOCK_DGRAM, RPCPROG_MNT, RPCMNT_VER1, RPCMNT_UMOUNT, - RPCAUTH_SYS, auth_len, cred, NULL, nmreq.nmc_mhead, &xid, &mreq); + error = nfsm_rpchead2(nmp, (mntproto == IPPROTO_UDP) ? SOCK_DGRAM : SOCK_STREAM, + RPCPROG_MNT, RPCMNT_VER1, RPCMNT_UMOUNT, + RPCAUTH_SYS, cred, NULL, nmreq.nmc_mhead, &xid, &mreq); nfsmout_if(error); nmreq.nmc_mhead = NULL; - error = nfs_aux_request(nmp, thd, &saddr, mreq, R_XID32(xid), 1, timeo, &nmrep); + error = nfs_aux_request(nmp, thd, saddr, NULL, + ((mntproto == IPPROTO_UDP) ? 
SOCK_DGRAM : SOCK_STREAM), + mreq, R_XID32(xid), 1, timeo, &nmrep); nfsmout: nfsm_chain_cleanup(&nmreq); nfsm_chain_cleanup(&nmrep); @@ -1949,15 +4063,15 @@ nfs_vfs_unmount( { struct nfsmount *nmp; vnode_t vp; - int error, flags = 0, docallback; - struct nfsreq *req, *treq; - struct nfs_reqqhead iodq; + int error, flags = 0; struct timespec ts = { 1, 0 }; - struct nfs_open_owner *noop, *nextnoop; - nfsnode_t np; nmp = VFSTONFS(mp); lck_mtx_lock(&nmp->nm_lock); + /* + * Set the flag indicating that an unmount attempt is in progress. + */ + nmp->nm_state |= NFSSTA_UNMOUNTING; /* * During a force unmount we want to... * Mark that we are doing a force unmount. @@ -1966,15 +4080,19 @@ nfs_vfs_unmount( if (mntflags & MNT_FORCE) { flags |= FORCECLOSE; nmp->nm_state |= NFSSTA_FORCE; - nmp->nm_flag |= NFSMNT_SOFT; + NFS_BITMAP_SET(nmp->nm_flags, NFS_MFLAG_SOFT); } + /* + * Wait for any in-progress monitored node scan to complete. + */ + while (nmp->nm_state & NFSSTA_MONITOR_SCAN) + msleep(&nmp->nm_state, &nmp->nm_lock, PZERO-1, "nfswaitmonscan", &ts); /* * Goes something like this.. * - Call vflush() to clear out vnodes for this file system, * except for the swap files. Deal with them in 2nd pass. * - Decrement reference on the vnode representing remote root. - * - Close the socket - * - Free up the data structures + * - Clean up the NFS mount structure. 
*/ vp = NFSTOV(nmp->nm_dnp); lck_mtx_unlock(&nmp->nm_lock); @@ -1989,14 +4107,18 @@ nfs_vfs_unmount( error = vflush(mp, NULLVP, flags); /* locks vp in the process */ } else { if (vnode_isinuse(vp, 1)) - return (EBUSY); - error = vflush(mp, vp, flags); + error = EBUSY; + else + error = vflush(mp, vp, flags); } - if (error) + if (error) { + lck_mtx_lock(&nmp->nm_lock); + nmp->nm_state &= ~NFSSTA_UNMOUNTING; + lck_mtx_unlock(&nmp->nm_lock); return (error); + } lck_mtx_lock(&nmp->nm_lock); - nmp->nm_state &= ~NFSSTA_MOUNTED; nmp->nm_dnp = NULL; lck_mtx_unlock(&nmp->nm_lock); @@ -2010,26 +4132,86 @@ nfs_vfs_unmount( vflush(mp, NULLVP, FORCECLOSE); - /* - * Destroy any RPCSEC_GSS contexts - */ - if (!TAILQ_EMPTY(&nmp->nm_gsscl)) - nfs_gss_clnt_ctx_unmount(nmp, mntflags); + nfs_mount_cleanup(nmp); + return (0); +} - /* mark the socket for termination */ - lck_mtx_lock(&nmp->nm_lock); - nmp->nm_sockflags |= NMSOCK_UNMOUNT; +/* + * cleanup/destroy NFS fs locations structure + */ +void +nfs_fs_locations_cleanup(struct nfs_fs_locations *nfslsp) +{ + struct nfs_fs_location *fsl; + struct nfs_fs_server *fss; + struct nfs_fs_path *fsp; + uint32_t loc, serv, addr, comp; + + /* free up fs locations */ + if (!nfslsp->nl_numlocs || !nfslsp->nl_locations) + return; + + for (loc = 0; loc < nfslsp->nl_numlocs; loc++) { + fsl = nfslsp->nl_locations[loc]; + if (!fsl) + continue; + if ((fsl->nl_servcount > 0) && fsl->nl_servers) { + for (serv = 0; serv < fsl->nl_servcount; serv++) { + fss = fsl->nl_servers[serv]; + if (!fss) + continue; + if ((fss->ns_addrcount > 0) && fss->ns_addresses) { + for (addr = 0; addr < fss->ns_addrcount; addr++) + FREE(fss->ns_addresses[addr], M_TEMP); + FREE(fss->ns_addresses, M_TEMP); + } + FREE(fss->ns_name, M_TEMP); + FREE(fss, M_TEMP); + } + FREE(fsl->nl_servers, M_TEMP); + } + fsp = &fsl->nl_path; + if (fsp->np_compcount && fsp->np_components) { + for (comp = 0; comp < fsp->np_compcount; comp++) + if (fsp->np_components[comp]) + 
FREE(fsp->np_components[comp], M_TEMP); + FREE(fsp->np_components, M_TEMP); + } + FREE(fsl, M_TEMP); + } + FREE(nfslsp->nl_locations, M_TEMP); + nfslsp->nl_numlocs = 0; + nfslsp->nl_locations = NULL; +} + +/* + * cleanup/destroy an nfsmount + */ +void +nfs_mount_cleanup(struct nfsmount *nmp) +{ + struct nfsreq *req, *treq; + struct nfs_reqqhead iodq; + struct timespec ts = { 1, 0 }; + struct nfs_open_owner *noop, *nextnoop; + nfsnode_t np; + int docallback; /* stop callbacks */ - if ((nmp->nm_vers >= NFS_VER4) && nmp->nm_cbid) + if ((nmp->nm_vers >= NFS_VER4) && !NMFLAG(nmp, NOCALLBACK) && nmp->nm_cbid) nfs4_mount_callback_shutdown(nmp); - /* wait for any socket poking to complete */ - while (nmp->nm_sockflags & NMSOCK_POKE) - msleep(&nmp->nm_sockflags, &nmp->nm_lock, PZERO-1, "nfswaitpoke", &ts); + /* Destroy any RPCSEC_GSS contexts */ + if (!TAILQ_EMPTY(&nmp->nm_gsscl)) + nfs_gss_clnt_ctx_unmount(nmp); + + /* mark the socket for termination */ + lck_mtx_lock(&nmp->nm_lock); + nmp->nm_sockflags |= NMSOCK_UNMOUNT; /* Have the socket thread send the unmount RPC, if requested/appropriate. 
*/ - if ((nmp->nm_vers < NFS_VER4) && !(mntflags & MNT_FORCE) && (nmp->nm_flag & NFSMNT_CALLUMNT)) + if ((nmp->nm_vers < NFS_VER4) && (nmp->nm_state & NFSSTA_MOUNTED) && + !(nmp->nm_state & NFSSTA_FORCE) && NMFLAG(nmp, CALLUMNT)) nfs_mount_sock_thread_wake(nmp); /* wait for the socket thread to terminate */ @@ -2043,15 +4225,16 @@ nfs_vfs_unmount( /* tear down the socket */ nfs_disconnect(nmp); - vfs_setfsprivate(mp, NULL); + if (nmp->nm_mountp) + vfs_setfsprivate(nmp->nm_mountp, NULL); lck_mtx_lock(&nmp->nm_lock); - if ((nmp->nm_vers >= NFS_VER4) && nmp->nm_cbid) { - /* clear out any pending recall requests */ - while ((np = TAILQ_FIRST(&nmp->nm_recallq))) { - TAILQ_REMOVE(&nmp->nm_recallq, np, n_dlink); - np->n_dlink.tqe_next = NFSNOLIST; + if ((nmp->nm_vers >= NFS_VER4) && !NMFLAG(nmp, NOCALLBACK) && nmp->nm_cbid) { + /* clear out any pending delegation return requests */ + while ((np = TAILQ_FIRST(&nmp->nm_dreturnq))) { + TAILQ_REMOVE(&nmp->nm_dreturnq, np, n_dreturn); + np->n_dreturn.tqe_next = NFSNOLIST; } } @@ -2061,11 +4244,23 @@ nfs_vfs_unmount( thread_call_free(nmp->nm_renew_timer); } - mbuf_freem(nmp->nm_nam); + if (nmp->nm_saddr) + FREE(nmp->nm_saddr, M_SONAME); + if ((nmp->nm_vers < NFS_VER4) && nmp->nm_rqsaddr) + FREE(nmp->nm_rqsaddr, M_SONAME); lck_mtx_unlock(&nmp->nm_lock); - if ((nmp->nm_vers < NFS_VER4) && !(nmp->nm_flag & (NFSMNT_NOLOCKS|NFSMNT_LOCALLOCKS))) - nfs_lockd_mount_change(-1); + if (nmp->nm_state & NFSSTA_MOUNTED) + switch (nmp->nm_lockmode) { + case NFS_LOCK_MODE_DISABLED: + case NFS_LOCK_MODE_LOCAL: + break; + case NFS_LOCK_MODE_ENABLED: + default: + if (nmp->nm_vers <= NFS_VER3) + nfs_lockd_mount_unregister(nmp); + break; + } if ((nmp->nm_vers >= NFS_VER4) && nmp->nm_longid) { /* remove/deallocate the client ID data */ @@ -2126,24 +4321,41 @@ nfs_vfs_unmount( req->r_callback.rcb_func(req); } - /* clean up open owner list */ + /* clean up common state */ + lck_mtx_lock(&nmp->nm_lock); + while ((np = LIST_FIRST(&nmp->nm_monlist))) { + 
LIST_REMOVE(np, n_monlink); + np->n_monlink.le_next = NFSNOLIST; + } + TAILQ_FOREACH_SAFE(noop, &nmp->nm_open_owners, noo_link, nextnoop) { + TAILQ_REMOVE(&nmp->nm_open_owners, noop, noo_link); + noop->noo_flags &= ~NFS_OPEN_OWNER_LINK; + if (noop->noo_refcnt) + continue; + nfs_open_owner_destroy(noop); + } + lck_mtx_unlock(&nmp->nm_lock); + + /* clean up NFSv4 state */ if (nmp->nm_vers >= NFS_VER4) { lck_mtx_lock(&nmp->nm_lock); - TAILQ_FOREACH_SAFE(noop, &nmp->nm_open_owners, noo_link, nextnoop) { - TAILQ_REMOVE(&nmp->nm_open_owners, noop, noo_link); - noop->noo_flags &= ~NFS_OPEN_OWNER_LINK; - if (noop->noo_refcnt) - continue; - nfs_open_owner_destroy(noop); + while ((np = TAILQ_FIRST(&nmp->nm_delegations))) { + TAILQ_REMOVE(&nmp->nm_delegations, np, n_dlink); + np->n_dlink.tqe_next = NFSNOLIST; } lck_mtx_unlock(&nmp->nm_lock); - if (IS_VALID_CRED(nmp->nm_mcred)) - kauth_cred_unref(&nmp->nm_mcred); } + if (IS_VALID_CRED(nmp->nm_mcred)) + kauth_cred_unref(&nmp->nm_mcred); + nfs_fs_locations_cleanup(&nmp->nm_locations); + + if (nmp->nm_args) + xb_free(nmp->nm_args); lck_mtx_destroy(&nmp->nm_lock, nfs_mount_grp); + if (nmp->nm_fh) + FREE(nmp->nm_fh, M_TEMP); FREE_ZONE((caddr_t)nmp, sizeof (struct nfsmount), M_NFSMNT); - return (0); } /* @@ -2192,8 +4404,8 @@ nfs_vfs_quotactl( int nfs3_getquota(struct nfsmount *nmp, vfs_context_t ctx, uid_t id, int type, struct dqblk *dqb) { - int error = 0, auth_len, slen, timeo; - int rqvers = (type == GRPQUOTA) ? RPCRQUOTA_EXT_VER : RPCRQUOTA_VER; + int error = 0, slen, timeo; + int rqport = 0, rqproto, rqvers = (type == GRPQUOTA) ? 
RPCRQUOTA_EXT_VER : RPCRQUOTA_VER; thread_t thd = vfs_context_thread(ctx); kauth_cred_t cred = vfs_context_ucred(ctx); char *path; @@ -2201,70 +4413,70 @@ nfs3_getquota(struct nfsmount *nmp, vfs_context_t ctx, uid_t id, int type, struc struct nfsm_chain nmreq, nmrep; mbuf_t mreq; uint32_t val = 0, bsize = 0; - struct sockaddr *nam = mbuf_data(nmp->nm_nam); - struct sockaddr_in saddr; + struct sockaddr *rqsaddr; struct timeval now; - bcopy(nam, &saddr, min(sizeof(saddr), nam->sa_len)); - auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ? - nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) + - 5 * NFSX_UNSIGNED; - timeo = (nmp->nm_flag & NFSMNT_SOFT) ? 10 : 60; - nfsm_chain_null(&nmreq); - nfsm_chain_null(&nmrep); + if (!nmp->nm_saddr) + return (ENXIO); - /* check if we have a recently cached rquota port */ - if (nmp->nm_rqport) { - microuptime(&now); - if ((nmp->nm_rqportstamp + 60) >= (uint32_t)now.tv_sec) - goto got_rqport; - } + if (NMFLAG(nmp, NOQUOTA)) + return (ENOTSUP); - /* send portmap request to get rquota port */ - saddr.sin_port = htons(PMAPPORT); - nfsm_chain_build_alloc_init(error, &nmreq, 4*NFSX_UNSIGNED); - nfsm_chain_add_32(error, &nmreq, RPCPROG_RQUOTA); - nfsm_chain_add_32(error, &nmreq, rqvers); - nfsm_chain_add_32(error, &nmreq, IPPROTO_UDP); - nfsm_chain_add_32(error, &nmreq, 0); - nfsm_chain_build_done(error, &nmreq); - nfsmout_if(error); - error = nfsm_rpchead2(SOCK_DGRAM, PMAPPROG, PMAPVERS, PMAPPROC_GETPORT, - RPCAUTH_SYS, auth_len, cred, NULL, nmreq.nmc_mhead, &xid, &mreq); - nfsmout_if(error); - nmreq.nmc_mhead = NULL; - error = nfs_aux_request(nmp, thd, &saddr, mreq, R_XID32(xid), 0, timeo, &nmrep); - nfsmout_if(error); + if (!nmp->nm_rqsaddr) + MALLOC(nmp->nm_rqsaddr, struct sockaddr *, sizeof(struct sockaddr_storage), M_SONAME, M_WAITOK|M_ZERO); + if (!nmp->nm_rqsaddr) + return (ENOMEM); + rqsaddr = nmp->nm_rqsaddr; + if (rqsaddr->sa_family == AF_INET6) + rqport = ntohs(((struct sockaddr_in6*)rqsaddr)->sin6_port); + else if 
(rqsaddr->sa_family == AF_INET) + rqport = ntohs(((struct sockaddr_in*)rqsaddr)->sin_port); - /* grab rquota port from portmap response */ - nfsm_chain_get_32(error, &nmrep, val); - nfsmout_if(error); - nmp->nm_rqport = val; + timeo = NMFLAG(nmp, SOFT) ? 10 : 60; + rqproto = IPPROTO_UDP; /* XXX should prefer TCP if mount is TCP */ + + /* check if we have a recently cached rquota port */ microuptime(&now); - nmp->nm_rqportstamp = now.tv_sec; - nfsm_chain_cleanup(&nmreq); - nfsm_chain_cleanup(&nmrep); - xid = 0; + if (!rqport || ((nmp->nm_rqsaddrstamp + 60) >= (uint32_t)now.tv_sec)) { + /* send portmap request to get rquota port */ + bcopy(nmp->nm_saddr, rqsaddr, min(sizeof(struct sockaddr_storage), nmp->nm_saddr->sa_len)); + error = nfs_portmap_lookup(nmp, ctx, rqsaddr, NULL, RPCPROG_RQUOTA, rqvers, rqproto, timeo); + if (error) + return (error); + if (rqsaddr->sa_family == AF_INET6) + rqport = ntohs(((struct sockaddr_in6*)rqsaddr)->sin6_port); + else if (rqsaddr->sa_family == AF_INET) + rqport = ntohs(((struct sockaddr_in*)rqsaddr)->sin_port); + else + return (EIO); + if (!rqport) + return (ENOTSUP); + microuptime(&now); + nmp->nm_rqsaddrstamp = now.tv_sec; + } -got_rqport: /* rquota request */ - saddr.sin_port = htons(nmp->nm_rqport); + nfsm_chain_null(&nmreq); + nfsm_chain_null(&nmrep); path = &vfs_statfs(nmp->nm_mountp)->f_mntfromname[0]; while (*path && (*path != '/')) path++; slen = strlen(path); nfsm_chain_build_alloc_init(error, &nmreq, 3 * NFSX_UNSIGNED + nfsm_rndup(slen)); - nfsm_chain_add_string(error, &nmreq, path, slen); + nfsm_chain_add_name(error, &nmreq, path, slen, nmp); if (type == GRPQUOTA) nfsm_chain_add_32(error, &nmreq, type); nfsm_chain_add_32(error, &nmreq, id); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); - error = nfsm_rpchead2(SOCK_DGRAM, RPCPROG_RQUOTA, rqvers, RPCRQUOTA_GET, - RPCAUTH_SYS, auth_len, cred, NULL, nmreq.nmc_mhead, &xid, &mreq); + error = nfsm_rpchead2(nmp, (rqproto == IPPROTO_UDP) ? 
SOCK_DGRAM : SOCK_STREAM, + RPCPROG_RQUOTA, rqvers, RPCRQUOTA_GET, + RPCAUTH_SYS, cred, NULL, nmreq.nmc_mhead, &xid, &mreq); nfsmout_if(error); nmreq.nmc_mhead = NULL; - error = nfs_aux_request(nmp, thd, &saddr, mreq, R_XID32(xid), 0, timeo, &nmrep); + error = nfs_aux_request(nmp, thd, rqsaddr, NULL, + (rqproto == IPPROTO_UDP) ? SOCK_DGRAM : SOCK_STREAM, + mreq, R_XID32(xid), 0, timeo, &nmrep); nfsmout_if(error); /* parse rquota response */ @@ -2311,6 +4523,7 @@ nfs4_getquota(struct nfsmount *nmp, vfs_context_t ctx, uid_t id, int type, struc uint32_t bitmap[NFS_ATTR_BITMAP_LEN]; thread_t thd = vfs_context_thread(ctx); kauth_cred_t cred = vfs_context_ucred(ctx); + struct nfsreq_secinfo_args si; if (type != USRQUOTA) /* NFSv4 only supports user quotas */ return (ENOTSUP); @@ -2326,12 +4539,13 @@ nfs4_getquota(struct nfsmount *nmp, vfs_context_t ctx, uid_t id, int type, struc * an effective uid that matches the given uid. */ if (id != kauth_cred_getuid(cred)) { - struct ucred temp_cred; - bzero(&temp_cred, sizeof(temp_cred)); - temp_cred.cr_uid = id; - temp_cred.cr_ngroups = cred->cr_ngroups; - bcopy(cred->cr_groups, temp_cred.cr_groups, sizeof(temp_cred.cr_groups)); - cred = kauth_cred_create(&temp_cred); + struct posix_cred temp_pcred; + posix_cred_t pcred = posix_cred_get(cred); + bzero(&temp_pcred, sizeof(temp_pcred)); + temp_pcred.cr_uid = id; + temp_pcred.cr_ngroups = pcred->cr_ngroups; + bcopy(pcred->cr_groups, temp_pcred.cr_groups, sizeof(temp_pcred.cr_groups)); + cred = posix_cred_create(&temp_pcred); if (!IS_VALID_CRED(cred)) return (ENOMEM); } else { @@ -2347,6 +4561,7 @@ nfs4_getquota(struct nfsmount *nmp, vfs_context_t ctx, uid_t id, int type, struc return(error); } + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -2363,19 +4578,18 @@ nfs4_getquota(struct nfsmount *nmp, vfs_context_t ctx, uid_t id, int type, struc NFS_BITMAP_SET(bitmap, NFS_FATTR_QUOTA_AVAIL_HARD); NFS_BITMAP_SET(bitmap, 
NFS_FATTR_QUOTA_AVAIL_SOFT); NFS_BITMAP_SET(bitmap, NFS_FATTR_QUOTA_USED); - nfsm_chain_add_bitmap_masked(error, &nmreq, bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, bitmap, nmp, NULL); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request2(np, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, 0, &nmrep, &xid, &status); + error = nfs_request2(np, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, &si, 0, &nmrep, &xid, &status); nfsm_chain_skip_tag(error, &nmrep); nfsm_chain_get_32(error, &nmrep, numops); nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); nfsm_assert(error, NFSTONMP(np), ENXIO); nfsmout_if(error); - error = nfs4_parsefattr(&nmrep, NULL, NULL, NULL, dqb); + error = nfs4_parsefattr(&nmrep, NULL, NULL, NULL, dqb, NULL); nfsmout_if(error); nfsm_assert(error, NFSTONMP(np), ENXIO); nfsmout: @@ -2391,7 +4605,7 @@ nfs_vfs_quotactl(mount_t mp, int cmds, uid_t uid, caddr_t datap, vfs_context_t c { struct nfsmount *nmp; int cmd, type, error, nfsvers; - uid_t ruid = vfs_context_ucred(ctx)->cr_ruid; + uid_t euid = kauth_cred_getuid(vfs_context_ucred(ctx)); struct dqblk *dqb = (struct dqblk*)datap; if (!(nmp = VFSTONFS(mp))) @@ -2399,7 +4613,7 @@ nfs_vfs_quotactl(mount_t mp, int cmds, uid_t uid, caddr_t datap, vfs_context_t c nfsvers = nmp->nm_vers; if (uid == ~0U) - uid = ruid; + uid = euid; /* we can only support Q_GETQUOTA */ cmd = cmds >> SUBCMDSHIFT; @@ -2420,7 +4634,7 @@ nfs_vfs_quotactl(mount_t mp, int cmds, uid_t uid, caddr_t datap, vfs_context_t c type = cmds & SUBCMDMASK; if ((u_int)type >= MAXQUOTAS) return (EINVAL); - if ((uid != ruid) && ((error = vfs_context_suser(ctx)))) + if ((uid != euid) && ((error = vfs_context_suser(ctx)))) return (error); if (vfs_busy(mp, LK_NOWAIT)) @@ -2438,7 +4652,7 @@ nfs_vfs_quotactl(mount_t mp, int cmds, uid_t uid, caddr_t datap, vfs_context_t c int 
nfs_sync_callout(vnode_t, void *); struct nfs_sync_cargs { - thread_t thd; + vfs_context_t ctx; int waitfor; int error; }; @@ -2447,16 +4661,22 @@ int nfs_sync_callout(vnode_t vp, void *arg) { struct nfs_sync_cargs *cargs = (struct nfs_sync_cargs*)arg; + nfsnode_t np = VTONFS(vp); int error; - if (LIST_EMPTY(&VTONFS(vp)->n_dirtyblkhd)) + if (np->n_flag & NREVOKE) { + vn_revoke(vp, REVOKEALL, cargs->ctx); + return (VNODE_RETURNED); + } + + if (LIST_EMPTY(&np->n_dirtyblkhd)) return (VNODE_RETURNED); - if (VTONFS(vp)->n_wrbusy > 0) + if (np->n_wrbusy > 0) return (VNODE_RETURNED); - if (VTONFS(vp)->n_bflag & (NBFLUSHINPROG|NBINVALINPROG)) + if (np->n_bflag & (NBFLUSHINPROG|NBINVALINPROG)) return (VNODE_RETURNED); - error = nfs_flush(VTONFS(vp), cargs->waitfor, cargs->thd, 0); + error = nfs_flush(np, cargs->waitfor, vfs_context_thread(cargs->ctx), 0); if (error) cargs->error = error; @@ -2469,7 +4689,7 @@ nfs_vfs_sync(mount_t mp, int waitfor, vfs_context_t ctx) struct nfs_sync_cargs cargs; cargs.waitfor = waitfor; - cargs.thd = vfs_context_thread(ctx); + cargs.ctx = ctx; cargs.error = 0; vnode_iterate(mp, 0, nfs_sync_callout, &cargs); @@ -2538,6 +4758,290 @@ nfs_vfs_start( return (0); } +/* + * Build the mount info buffer for NFS_MOUNTINFO. 
+ * Encodes, in order: info version, info length, info attr/flag bitmaps, the original mount args, the current mount args and the current location index. On success the buffer is handed to *xb (the caller cleans it up); returns 0 or a nonzero error. */
+int
+nfs_mountinfo_assemble(struct nfsmount *nmp, struct xdrbuf *xb)
+{
+	struct xdrbuf xbinfo, xborig;
+	char sotype[6];
+	uint32_t origargsvers, origargslength;
+	uint32_t infolength_offset, curargsopaquelength_offset, curargslength_offset, attrslength_offset, curargs_end_offset, end_offset;
+	uint32_t miattrs[NFS_MIATTR_BITMAP_LEN];
+	uint32_t miflags_mask[NFS_MIFLAG_BITMAP_LEN];
+	uint32_t miflags[NFS_MIFLAG_BITMAP_LEN];
+	uint32_t mattrs[NFS_MATTR_BITMAP_LEN];
+	uint32_t mflags_mask[NFS_MFLAG_BITMAP_LEN];
+	uint32_t mflags[NFS_MFLAG_BITMAP_LEN];
+	uint32_t loc, serv, addr, comp;
+	int i, timeo, error = 0;
+
+	/* set up mount info attr and flag bitmaps */
+	NFS_BITMAP_ZERO(miattrs, NFS_MIATTR_BITMAP_LEN);
+	NFS_BITMAP_SET(miattrs, NFS_MIATTR_FLAGS);
+	NFS_BITMAP_SET(miattrs, NFS_MIATTR_ORIG_ARGS);
+	NFS_BITMAP_SET(miattrs, NFS_MIATTR_CUR_ARGS);
+	NFS_BITMAP_SET(miattrs, NFS_MIATTR_CUR_LOC_INDEX);
+	NFS_BITMAP_ZERO(miflags_mask, NFS_MIFLAG_BITMAP_LEN);
+	NFS_BITMAP_ZERO(miflags, NFS_MIFLAG_BITMAP_LEN);
+	NFS_BITMAP_SET(miflags_mask, NFS_MIFLAG_DEAD);
+	NFS_BITMAP_SET(miflags_mask, NFS_MIFLAG_NOTRESP);
+	NFS_BITMAP_SET(miflags_mask, NFS_MIFLAG_RECOVERY);
+	if (nmp->nm_state & NFSSTA_DEAD)
+		NFS_BITMAP_SET(miflags, NFS_MIFLAG_DEAD);
+	if ((nmp->nm_state & (NFSSTA_TIMEO|NFSSTA_JUKEBOXTIMEO)) ||
+	    ((nmp->nm_state & NFSSTA_LOCKTIMEO) && (nmp->nm_lockmode == NFS_LOCK_MODE_ENABLED)))
+		NFS_BITMAP_SET(miflags, NFS_MIFLAG_NOTRESP);
+	if (nmp->nm_state & NFSSTA_RECOVER)
+		NFS_BITMAP_SET(miflags, NFS_MIFLAG_RECOVERY);
+
+	/* get original mount args version and length (the first two XDR words of nm_args) */
+	xb_init_buffer(&xbinfo, NULL, 0); /* must precede the first nfsmerr_if(): the nfsmerr path runs xb_cleanup(&xbinfo) */
+	xb_init_buffer(&xborig, nmp->nm_args, 2*XDRWORD);
+	xb_get_32(error, &xborig, origargsvers); /* version */
+	xb_get_32(error, &xborig, origargslength); /* args length */
+	nfsmerr_if(error);
+
+	/* set up current mount attributes bitmap */
+	NFS_BITMAP_ZERO(mattrs, NFS_MATTR_BITMAP_LEN);
+	NFS_BITMAP_SET(mattrs, NFS_MATTR_FLAGS);
+	NFS_BITMAP_SET(mattrs, NFS_MATTR_NFS_VERSION);
+	if (nmp->nm_vers >= NFS_VER4)
+		NFS_BITMAP_SET(mattrs, NFS_MATTR_NFS_MINOR_VERSION);
+	NFS_BITMAP_SET(mattrs, NFS_MATTR_READ_SIZE);
+	NFS_BITMAP_SET(mattrs, NFS_MATTR_WRITE_SIZE);
+	NFS_BITMAP_SET(mattrs, NFS_MATTR_READDIR_SIZE);
+	NFS_BITMAP_SET(mattrs, NFS_MATTR_READAHEAD);
+	NFS_BITMAP_SET(mattrs, NFS_MATTR_ATTRCACHE_REG_MIN);
+	NFS_BITMAP_SET(mattrs, NFS_MATTR_ATTRCACHE_REG_MAX);
+	NFS_BITMAP_SET(mattrs, NFS_MATTR_ATTRCACHE_DIR_MIN);
+	NFS_BITMAP_SET(mattrs, NFS_MATTR_ATTRCACHE_DIR_MAX);
+	NFS_BITMAP_SET(mattrs, NFS_MATTR_LOCK_MODE);
+	NFS_BITMAP_SET(mattrs, NFS_MATTR_SECURITY);
+	NFS_BITMAP_SET(mattrs, NFS_MATTR_MAX_GROUP_LIST);
+	NFS_BITMAP_SET(mattrs, NFS_MATTR_SOCKET_TYPE);
+	NFS_BITMAP_SET(mattrs, NFS_MATTR_NFS_PORT);
+	if ((nmp->nm_vers < NFS_VER4) && nmp->nm_mountport)
+		NFS_BITMAP_SET(mattrs, NFS_MATTR_MOUNT_PORT);
+	NFS_BITMAP_SET(mattrs, NFS_MATTR_REQUEST_TIMEOUT);
+	if (NMFLAG(nmp, SOFT))
+		NFS_BITMAP_SET(mattrs, NFS_MATTR_SOFT_RETRY_COUNT);
+	if (nmp->nm_deadtimeout)
+		NFS_BITMAP_SET(mattrs, NFS_MATTR_DEAD_TIMEOUT);
+	if (nmp->nm_fh)
+		NFS_BITMAP_SET(mattrs, NFS_MATTR_FH);
+	NFS_BITMAP_SET(mattrs, NFS_MATTR_FS_LOCATIONS);
+	NFS_BITMAP_SET(mattrs, NFS_MATTR_MNTFLAGS);
+	if (origargsvers < NFS_ARGSVERSION_XDR)
+		NFS_BITMAP_SET(mattrs, NFS_MATTR_MNTFROM);
+
+	/* set up current mount flags bitmap */
+	/* first set the flags that we will be setting - either on OR off */
+	NFS_BITMAP_ZERO(mflags_mask, NFS_MFLAG_BITMAP_LEN);
+	NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_SOFT);
+	NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_INTR);
+	NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_RESVPORT);
+	if (nmp->nm_sotype == SOCK_DGRAM)
+		NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_NOCONNECT);
+	NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_DUMBTIMER);
+	if (nmp->nm_vers < NFS_VER4)
+		NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_CALLUMNT);
+	if (nmp->nm_vers >= NFS_VER3)
+		NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_RDIRPLUS);
+	NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_NONEGNAMECACHE);
+	NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_MUTEJUKEBOX);
+	if (nmp->nm_vers >= NFS_VER4) {
+		NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_EPHEMERAL);
+		NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_NOCALLBACK);
+		NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_NONAMEDATTR);
+		NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_NOACL);
+		NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_ACLONLY);
+	}
+	NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_NFC);
+	NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_NOQUOTA);
+	if (nmp->nm_vers < NFS_VER4)
+		NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_MNTUDP);
+	NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_MNTQUICK);
+	/* now set the flags that should be set */
+	NFS_BITMAP_ZERO(mflags, NFS_MFLAG_BITMAP_LEN);
+	if (NMFLAG(nmp, SOFT))
+		NFS_BITMAP_SET(mflags, NFS_MFLAG_SOFT);
+	if (NMFLAG(nmp, INTR))
+		NFS_BITMAP_SET(mflags, NFS_MFLAG_INTR);
+	if (NMFLAG(nmp, RESVPORT))
+		NFS_BITMAP_SET(mflags, NFS_MFLAG_RESVPORT);
+	if ((nmp->nm_sotype == SOCK_DGRAM) && NMFLAG(nmp, NOCONNECT))
+		NFS_BITMAP_SET(mflags, NFS_MFLAG_NOCONNECT);
+	if (NMFLAG(nmp, DUMBTIMER))
+		NFS_BITMAP_SET(mflags, NFS_MFLAG_DUMBTIMER);
+	if ((nmp->nm_vers < NFS_VER4) && NMFLAG(nmp, CALLUMNT))
+		NFS_BITMAP_SET(mflags, NFS_MFLAG_CALLUMNT);
+	if ((nmp->nm_vers >= NFS_VER3) && NMFLAG(nmp, RDIRPLUS))
+		NFS_BITMAP_SET(mflags, NFS_MFLAG_RDIRPLUS);
+	if (NMFLAG(nmp, NONEGNAMECACHE))
+		NFS_BITMAP_SET(mflags, NFS_MFLAG_NONEGNAMECACHE);
+	if (NMFLAG(nmp, MUTEJUKEBOX))
+		NFS_BITMAP_SET(mflags, NFS_MFLAG_MUTEJUKEBOX);
+	if (nmp->nm_vers >= NFS_VER4) {
+		if (NMFLAG(nmp, EPHEMERAL))
+			NFS_BITMAP_SET(mflags, NFS_MFLAG_EPHEMERAL);
+		if (NMFLAG(nmp, NOCALLBACK))
+			NFS_BITMAP_SET(mflags, NFS_MFLAG_NOCALLBACK);
+		if (NMFLAG(nmp, NONAMEDATTR))
+			NFS_BITMAP_SET(mflags, NFS_MFLAG_NONAMEDATTR);
+		if (NMFLAG(nmp, NOACL))
+			NFS_BITMAP_SET(mflags, NFS_MFLAG_NOACL);
+		if (NMFLAG(nmp, ACLONLY))
+			NFS_BITMAP_SET(mflags, NFS_MFLAG_ACLONLY);
+	}
+	if (NMFLAG(nmp, NFC))
+		NFS_BITMAP_SET(mflags, NFS_MFLAG_NFC);
+	if (NMFLAG(nmp, NOQUOTA) || ((nmp->nm_vers >= NFS_VER4) &&
+	    !NFS_BITMAP_ISSET(nmp->nm_fsattr.nfsa_supp_attr, NFS_FATTR_QUOTA_AVAIL_HARD) &&
+	    !NFS_BITMAP_ISSET(nmp->nm_fsattr.nfsa_supp_attr, NFS_FATTR_QUOTA_AVAIL_SOFT) &&
+	    !NFS_BITMAP_ISSET(nmp->nm_fsattr.nfsa_supp_attr, NFS_FATTR_QUOTA_USED)))
+		NFS_BITMAP_SET(mflags, NFS_MFLAG_NOQUOTA);
+	if ((nmp->nm_vers < NFS_VER4) && NMFLAG(nmp, MNTUDP))
+		NFS_BITMAP_SET(mflags, NFS_MFLAG_MNTUDP);
+	if (NMFLAG(nmp, MNTQUICK))
+		NFS_BITMAP_SET(mflags, NFS_MFLAG_MNTQUICK);
+
+	/* assemble info buffer (xbinfo was initialized above); each length word is written as 0 here and patched once the final offsets are known */
+	xb_add_32(error, &xbinfo, NFS_MOUNT_INFO_VERSION);
+	infolength_offset = xb_offset(&xbinfo);
+	xb_add_32(error, &xbinfo, 0);
+	xb_add_bitmap(error, &xbinfo, miattrs, NFS_MIATTR_BITMAP_LEN);
+	xb_add_bitmap(error, &xbinfo, miflags, NFS_MIFLAG_BITMAP_LEN);
+	xb_add_32(error, &xbinfo, origargslength);
+	if (!error)
+		error = xb_add_bytes(&xbinfo, nmp->nm_args, origargslength, 0);
+
+	/* the opaque byte count for the current mount args values: */
+	curargsopaquelength_offset = xb_offset(&xbinfo);
+	xb_add_32(error, &xbinfo, 0);
+
+	/* Encode current mount args values */
+	xb_add_32(error, &xbinfo, NFS_ARGSVERSION_XDR);
+	curargslength_offset = xb_offset(&xbinfo);
+	xb_add_32(error, &xbinfo, 0);
+	xb_add_32(error, &xbinfo, NFS_XDRARGS_VERSION_0);
+	xb_add_bitmap(error, &xbinfo, mattrs, NFS_MATTR_BITMAP_LEN);
+	attrslength_offset = xb_offset(&xbinfo);
+	xb_add_32(error, &xbinfo, 0);
+	xb_add_bitmap(error, &xbinfo, mflags_mask, NFS_MFLAG_BITMAP_LEN);
+	xb_add_bitmap(error, &xbinfo, mflags, NFS_MFLAG_BITMAP_LEN);
+	xb_add_32(error, &xbinfo, nmp->nm_vers); /* NFS_VERSION */
+	if (nmp->nm_vers >= NFS_VER4)
+		xb_add_32(error, &xbinfo, 0); /* NFS_MINOR_VERSION */
+	xb_add_32(error, &xbinfo, nmp->nm_rsize); /* READ_SIZE */
+	xb_add_32(error, &xbinfo, nmp->nm_wsize); /* WRITE_SIZE */
+	xb_add_32(error, &xbinfo, nmp->nm_readdirsize); /* READDIR_SIZE */
+	xb_add_32(error, &xbinfo, nmp->nm_readahead); /* READAHEAD */
+	xb_add_32(error, &xbinfo, nmp->nm_acregmin); /* ATTRCACHE_REG_MIN */
+	xb_add_32(error, &xbinfo, 0); /* ATTRCACHE_REG_MIN */
+	xb_add_32(error, &xbinfo, nmp->nm_acregmax); /* ATTRCACHE_REG_MAX */
+	xb_add_32(error, &xbinfo, 0); /* ATTRCACHE_REG_MAX */
+	xb_add_32(error, &xbinfo, nmp->nm_acdirmin); /* ATTRCACHE_DIR_MIN */
+	xb_add_32(error, &xbinfo, 0); /* ATTRCACHE_DIR_MIN */
+	xb_add_32(error, &xbinfo, nmp->nm_acdirmax); /* ATTRCACHE_DIR_MAX */
+	xb_add_32(error, &xbinfo, 0); /* ATTRCACHE_DIR_MAX */
+	xb_add_32(error, &xbinfo, nmp->nm_lockmode); /* LOCK_MODE */
+	if (nmp->nm_sec.count) {
+		xb_add_32(error, &xbinfo, nmp->nm_sec.count); /* SECURITY */
+		nfsmerr_if(error);
+		for (i=0; i < nmp->nm_sec.count; i++)
+			xb_add_32(error, &xbinfo, nmp->nm_sec.flavors[i]);
+	} else if (nmp->nm_servsec.count) {
+		xb_add_32(error, &xbinfo, nmp->nm_servsec.count); /* SECURITY */
+		nfsmerr_if(error);
+		for (i=0; i < nmp->nm_servsec.count; i++)
+			xb_add_32(error, &xbinfo, nmp->nm_servsec.flavors[i]);
+	} else {
+		xb_add_32(error, &xbinfo, 1); /* SECURITY */
+		xb_add_32(error, &xbinfo, nmp->nm_auth);
+	}
+	xb_add_32(error, &xbinfo, nmp->nm_numgrps); /* MAX_GROUP_LIST */
+	nfsmerr_if(error);
+	snprintf(sotype, sizeof(sotype), "%s%s", (nmp->nm_sotype == SOCK_DGRAM) ? "udp" : "tcp",
+		nmp->nm_sofamily ? (nmp->nm_sofamily == AF_INET) ? "4" : "6" : "");
+	xb_add_string(error, &xbinfo, sotype, strlen(sotype)); /* SOCKET_TYPE */
+	xb_add_32(error, &xbinfo, ntohs(((struct sockaddr_in*)nmp->nm_saddr)->sin_port)); /* NFS_PORT */
+	if ((nmp->nm_vers < NFS_VER4) && nmp->nm_mountport)
+		xb_add_32(error, &xbinfo, nmp->nm_mountport); /* MOUNT_PORT */
+	timeo = (nmp->nm_timeo * 10) / NFS_HZ; /* presumably tenths of a second (assumes NFS_HZ is ticks/sec - confirm); emitted below as two words */
+	xb_add_32(error, &xbinfo, timeo/10); /* REQUEST_TIMEOUT */
+	xb_add_32(error, &xbinfo, (timeo%10)*100000000); /* REQUEST_TIMEOUT */
+	if (NMFLAG(nmp, SOFT))
+		xb_add_32(error, &xbinfo, nmp->nm_retry); /* SOFT_RETRY_COUNT */
+	if (nmp->nm_deadtimeout) {
+		xb_add_32(error, &xbinfo, nmp->nm_deadtimeout); /* DEAD_TIMEOUT */
+		xb_add_32(error, &xbinfo, 0); /* DEAD_TIMEOUT */
+	}
+	if (nmp->nm_fh)
+		xb_add_fh(error, &xbinfo, &nmp->nm_fh->fh_data[0], nmp->nm_fh->fh_len); /* FH */
+	xb_add_32(error, &xbinfo, nmp->nm_locations.nl_numlocs); /* FS_LOCATIONS */
+	for (loc = 0; !error && (loc < nmp->nm_locations.nl_numlocs); loc++) {
+		xb_add_32(error, &xbinfo, nmp->nm_locations.nl_locations[loc]->nl_servcount);
+		for (serv = 0; !error && (serv < nmp->nm_locations.nl_locations[loc]->nl_servcount); serv++) {
+			xb_add_string(error, &xbinfo, nmp->nm_locations.nl_locations[loc]->nl_servers[serv]->ns_name,
+				strlen(nmp->nm_locations.nl_locations[loc]->nl_servers[serv]->ns_name));
+			xb_add_32(error, &xbinfo, nmp->nm_locations.nl_locations[loc]->nl_servers[serv]->ns_addrcount);
+			for (addr = 0; !error && (addr < nmp->nm_locations.nl_locations[loc]->nl_servers[serv]->ns_addrcount); addr++)
+				xb_add_string(error, &xbinfo, nmp->nm_locations.nl_locations[loc]->nl_servers[serv]->ns_addresses[addr],
+					strlen(nmp->nm_locations.nl_locations[loc]->nl_servers[serv]->ns_addresses[addr]));
+			xb_add_32(error, &xbinfo, 0); /* empty server info */
+		}
+		xb_add_32(error, &xbinfo, nmp->nm_locations.nl_locations[loc]->nl_path.np_compcount);
+		for (comp = 0; !error && (comp < nmp->nm_locations.nl_locations[loc]->nl_path.np_compcount); comp++)
+			xb_add_string(error, &xbinfo, nmp->nm_locations.nl_locations[loc]->nl_path.np_components[comp],
+				strlen(nmp->nm_locations.nl_locations[loc]->nl_path.np_components[comp]));
+		xb_add_32(error, &xbinfo, 0); /* empty fs location info */
+	}
+	xb_add_32(error, &xbinfo, vfs_flags(nmp->nm_mountp)); /* MNTFLAGS */
+	if (origargsvers < NFS_ARGSVERSION_XDR)
+		xb_add_string(error, &xbinfo, vfs_statfs(nmp->nm_mountp)->f_mntfromname,
+			strlen(vfs_statfs(nmp->nm_mountp)->f_mntfromname)); /* MNTFROM */
+	curargs_end_offset = xb_offset(&xbinfo);
+
+	/* NFS_MIATTR_CUR_LOC_INDEX */
+	xb_add_32(error, &xbinfo, nmp->nm_locations.nl_current.nli_flags);
+	xb_add_32(error, &xbinfo, nmp->nm_locations.nl_current.nli_loc);
+	xb_add_32(error, &xbinfo, nmp->nm_locations.nl_current.nli_serv);
+	xb_add_32(error, &xbinfo, nmp->nm_locations.nl_current.nli_addr);
+
+	xb_build_done(error, &xbinfo);
+
+	/* update opaque counts: seek back to each recorded offset and overwrite its 0 placeholder */
+	end_offset = xb_offset(&xbinfo);
+	if (!error) {
+		error = xb_seek(&xbinfo, attrslength_offset);
+		xb_add_32(error, &xbinfo, curargs_end_offset - attrslength_offset - XDRWORD/*don't include length field*/);
+	}
+	if (!error) {
+		error = xb_seek(&xbinfo, curargslength_offset);
+		xb_add_32(error, &xbinfo, curargs_end_offset - curargslength_offset + XDRWORD/*version*/);
+	}
+	if (!error) {
+		error = xb_seek(&xbinfo, curargsopaquelength_offset);
+		xb_add_32(error, &xbinfo, curargs_end_offset - curargslength_offset + XDRWORD/*version*/);
+	}
+	if (!error) {
+		error = xb_seek(&xbinfo, infolength_offset);
+		xb_add_32(error, &xbinfo, end_offset - infolength_offset + XDRWORD/*version*/);
+	}
+	nfsmerr_if(error);
+
+	/* copy result xdrbuf to caller */
+	*xb = xbinfo;
+
+	/* and mark the local copy as not needing cleanup */
+	xbinfo.xb_flags &= ~XB_CLEANUP;
+nfsmerr:
+	xb_cleanup(&xbinfo);
+	return (error);
+}
+
 /*
  * Do that sysctl thang...
*/ @@ -2552,6 +5056,8 @@ nfs_vfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, struct nfsmount *nmp = NULL; struct vfsquery vq; boolean_t is_64_bit; + fsid_t fsid; + struct xdrbuf xb; #if NFSSERVER struct nfs_exportfs *nxfs; struct nfs_export *nx; @@ -2622,6 +5128,32 @@ nfs_vfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, if (newp) return copyin(newp, &nfsstats, sizeof nfsstats); return (0); + case NFS_MOUNTINFO: + /* read in the fsid */ + if (*oldlenp < sizeof(fsid)) + return (EINVAL); + if ((error = copyin(oldp, &fsid, sizeof(fsid)))) + return (error); + /* swizzle it back to host order */ + fsid.val[0] = ntohl(fsid.val[0]); + fsid.val[1] = ntohl(fsid.val[1]); + /* find mount and make sure it's NFS */ + if (((mp = vfs_getvfs(&fsid))) == NULL) + return (ENOENT); + if (strcmp(mp->mnt_vfsstat.f_fstypename, "nfs")) + return (EINVAL); + if (((nmp = VFSTONFS(mp))) == NULL) + return (ENOENT); + xb_init(&xb, 0); + if ((error = nfs_mountinfo_assemble(nmp, &xb))) + return (error); + if (*oldlenp < xb.xb_u.xb_buffer.xbb_len) + error = ENOMEM; + else + error = copyout(xb_buffer_base(&xb), oldp, xb.xb_u.xb_buffer.xbb_len); + *oldlenp = xb.xb_u.xb_buffer.xbb_len; + xb_cleanup(&xb); + break; #if NFSSERVER case NFS_EXPORTSTATS: /* setup export stat descriptor */ @@ -2866,7 +5398,7 @@ nfs_vfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, case VFS_CTL_NOLOCKS: if (req->oldptr != USER_ADDR_NULL) { lck_mtx_lock(&nmp->nm_lock); - val = (nmp->nm_flag & NFSMNT_NOLOCKS) ? 1 : 0; + val = (nmp->nm_lockmode == NFS_LOCK_MODE_DISABLED) ? 
1 : 0; lck_mtx_unlock(&nmp->nm_lock); error = SYSCTL_OUT(req, &val, sizeof(val)); if (error) @@ -2877,18 +5409,21 @@ nfs_vfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, if (error) return (error); lck_mtx_lock(&nmp->nm_lock); - if (nmp->nm_flag & NFSMNT_LOCALLOCKS) { + if (nmp->nm_lockmode == NFS_LOCK_MODE_LOCAL) { /* can't toggle locks when using local locks */ error = EINVAL; + } else if ((nmp->nm_vers >= NFS_VER4) && val) { + /* can't disable locks for NFSv4 */ + error = EINVAL; } else if (val) { - if (!(nmp->nm_flag & NFSMNT_NOLOCKS)) - nfs_lockd_mount_change(-1); - nmp->nm_flag |= NFSMNT_NOLOCKS; + if ((nmp->nm_vers <= NFS_VER3) && (nmp->nm_lockmode == NFS_LOCK_MODE_ENABLED)) + nfs_lockd_mount_unregister(nmp); + nmp->nm_lockmode = NFS_LOCK_MODE_DISABLED; nmp->nm_state &= ~NFSSTA_LOCKTIMEO; } else { - if (nmp->nm_flag & NFSMNT_NOLOCKS) - nfs_lockd_mount_change(1); - nmp->nm_flag &= ~NFSMNT_NOLOCKS; + if ((nmp->nm_vers <= NFS_VER3) && (nmp->nm_lockmode == NFS_LOCK_MODE_DISABLED)) + nfs_lockd_mount_register(nmp); + nmp->nm_lockmode = NFS_LOCK_MODE_ENABLED; } lck_mtx_unlock(&nmp->nm_lock); } @@ -2896,14 +5431,13 @@ nfs_vfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, case VFS_CTL_QUERY: lck_mtx_lock(&nmp->nm_lock); /* XXX don't allow users to know about/disconnect unresponsive, soft, nobrowse mounts */ - softnobrowse = ((nmp->nm_flag & NFSMNT_SOFT) && (vfs_flags(nmp->nm_mountp) & MNT_DONTBROWSE)); + softnobrowse = (NMFLAG(nmp, SOFT) && (vfs_flags(nmp->nm_mountp) & MNT_DONTBROWSE)); if (!softnobrowse && (nmp->nm_state & NFSSTA_TIMEO)) vq.vq_flags |= VQ_NOTRESP; - if (!softnobrowse && (nmp->nm_state & NFSSTA_JUKEBOXTIMEO) && - !(nmp->nm_flag & NFSMNT_MUTEJUKEBOX)) + if (!softnobrowse && (nmp->nm_state & NFSSTA_JUKEBOXTIMEO) && !NMFLAG(nmp, MUTEJUKEBOX)) vq.vq_flags |= VQ_NOTRESP; if (!softnobrowse && (nmp->nm_state & NFSSTA_LOCKTIMEO) && - !(nmp->nm_flag & (NFSMNT_NOLOCKS|NFSMNT_LOCALLOCKS))) + (nmp->nm_lockmode == 
NFS_LOCK_MODE_ENABLED)) vq.vq_flags |= VQ_NOTRESP; if (nmp->nm_state & NFSSTA_DEAD) vq.vq_flags |= VQ_DEAD; diff --git a/bsd/nfs/nfs_vnops.c b/bsd/nfs/nfs_vnops.c index 00199a6df..d1e130b88 100644 --- a/bsd/nfs/nfs_vnops.c +++ b/bsd/nfs/nfs_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -134,11 +134,7 @@ int nfsfifo_vnop_close(struct vnop_close_args *); int nfs_vnop_ioctl(struct vnop_ioctl_args *); int nfs_vnop_select(struct vnop_select_args *); int nfs_vnop_setattr(struct vnop_setattr_args *); -int nfs_vnop_read(struct vnop_read_args *); -int nfs_vnop_write(struct vnop_write_args *); -int nfs_vnop_mmap(struct vnop_mmap_args *); int nfs_vnop_fsync(struct vnop_fsync_args *); -int nfs_vnop_remove(struct vnop_remove_args *); int nfs_vnop_rename(struct vnop_rename_args *); int nfs_vnop_readdir(struct vnop_readdir_args *); int nfs_vnop_readlink(struct vnop_readlink_args *); @@ -148,6 +144,7 @@ int nfs_vnop_pageout(struct vnop_pageout_args *); int nfs_vnop_blktooff(struct vnop_blktooff_args *); int nfs_vnop_offtoblk(struct vnop_offtoblk_args *); int nfs_vnop_blockmap(struct vnop_blockmap_args *); +int nfs_vnop_monitor(struct vnop_monitor_args *); int nfs3_vnop_create(struct vnop_create_args *); int nfs3_vnop_mknod(struct vnop_mknod_args *); @@ -163,8 +160,8 @@ static struct vnodeopv_entry_desc nfsv2_vnodeop_entries[] = { { &vnop_lookup_desc, (vnop_t *)nfs_vnop_lookup }, /* lookup */ { &vnop_create_desc, (vnop_t *)nfs3_vnop_create }, /* create */ { &vnop_mknod_desc, (vnop_t *)nfs3_vnop_mknod }, /* mknod */ - { &vnop_open_desc, (vnop_t *)nfs3_vnop_open }, /* open */ - { &vnop_close_desc, (vnop_t *)nfs3_vnop_close }, /* close */ + { &vnop_open_desc, (vnop_t *)nfs_vnop_open }, /* open */ + { &vnop_close_desc, (vnop_t *)nfs_vnop_close }, /* close */ { &vnop_access_desc, (vnop_t *)nfs_vnop_access }, /* access */ { 
&vnop_getattr_desc, (vnop_t *)nfs3_vnop_getattr }, /* getattr */ { &vnop_setattr_desc, (vnop_t *)nfs_vnop_setattr }, /* setattr */ @@ -174,6 +171,7 @@ static struct vnodeopv_entry_desc nfsv2_vnodeop_entries[] = { { &vnop_select_desc, (vnop_t *)nfs_vnop_select }, /* select */ { &vnop_revoke_desc, (vnop_t *)nfs_vnop_revoke }, /* revoke */ { &vnop_mmap_desc, (vnop_t *)nfs_vnop_mmap }, /* mmap */ + { &vnop_mnomap_desc, (vnop_t *)nfs_vnop_mnomap }, /* mnomap */ { &vnop_fsync_desc, (vnop_t *)nfs_vnop_fsync }, /* fsync */ { &vnop_remove_desc, (vnop_t *)nfs_vnop_remove }, /* remove */ { &vnop_link_desc, (vnop_t *)nfs3_vnop_link }, /* link */ @@ -187,7 +185,7 @@ static struct vnodeopv_entry_desc nfsv2_vnodeop_entries[] = { { &vnop_reclaim_desc, (vnop_t *)nfs_vnop_reclaim }, /* reclaim */ { &vnop_strategy_desc, (vnop_t *)err_strategy }, /* strategy */ { &vnop_pathconf_desc, (vnop_t *)nfs_vnop_pathconf }, /* pathconf */ - { &vnop_advlock_desc, (vnop_t *)nfs3_vnop_advlock }, /* advlock */ + { &vnop_advlock_desc, (vnop_t *)nfs_vnop_advlock }, /* advlock */ { &vnop_bwrite_desc, (vnop_t *)err_bwrite }, /* bwrite */ { &vnop_pagein_desc, (vnop_t *)nfs_vnop_pagein }, /* Pagein */ { &vnop_pageout_desc, (vnop_t *)nfs_vnop_pageout }, /* Pageout */ @@ -195,6 +193,7 @@ static struct vnodeopv_entry_desc nfsv2_vnodeop_entries[] = { { &vnop_blktooff_desc, (vnop_t *)nfs_vnop_blktooff }, /* blktooff */ { &vnop_offtoblk_desc, (vnop_t *)nfs_vnop_offtoblk }, /* offtoblk */ { &vnop_blockmap_desc, (vnop_t *)nfs_vnop_blockmap }, /* blockmap */ + { &vnop_monitor_desc, (vnop_t *)nfs_vnop_monitor }, /* monitor */ { NULL, NULL } }; struct vnodeopv_desc nfsv2_vnodeop_opv_desc = @@ -206,18 +205,18 @@ static struct vnodeopv_entry_desc nfsv4_vnodeop_entries[] = { { &vnop_lookup_desc, (vnop_t *)nfs_vnop_lookup }, /* lookup */ { &vnop_create_desc, (vnop_t *)nfs4_vnop_create }, /* create */ { &vnop_mknod_desc, (vnop_t *)nfs4_vnop_mknod }, /* mknod */ - { &vnop_open_desc, (vnop_t *)nfs4_vnop_open }, /* open */ 
- { &vnop_close_desc, (vnop_t *)nfs4_vnop_close }, /* close */ + { &vnop_open_desc, (vnop_t *)nfs_vnop_open }, /* open */ + { &vnop_close_desc, (vnop_t *)nfs_vnop_close }, /* close */ { &vnop_access_desc, (vnop_t *)nfs_vnop_access }, /* access */ { &vnop_getattr_desc, (vnop_t *)nfs4_vnop_getattr }, /* getattr */ { &vnop_setattr_desc, (vnop_t *)nfs_vnop_setattr }, /* setattr */ - { &vnop_read_desc, (vnop_t *)nfs4_vnop_read }, /* read */ + { &vnop_read_desc, (vnop_t *)nfs_vnop_read }, /* read */ { &vnop_write_desc, (vnop_t *)nfs_vnop_write }, /* write */ { &vnop_ioctl_desc, (vnop_t *)nfs_vnop_ioctl }, /* ioctl */ { &vnop_select_desc, (vnop_t *)nfs_vnop_select }, /* select */ { &vnop_revoke_desc, (vnop_t *)nfs_vnop_revoke }, /* revoke */ - { &vnop_mmap_desc, (vnop_t *)nfs4_vnop_mmap }, /* mmap */ - { &vnop_mnomap_desc, (vnop_t *)nfs4_vnop_mnomap }, /* mnomap */ + { &vnop_mmap_desc, (vnop_t *)nfs_vnop_mmap }, /* mmap */ + { &vnop_mnomap_desc, (vnop_t *)nfs_vnop_mnomap }, /* mnomap */ { &vnop_fsync_desc, (vnop_t *)nfs_vnop_fsync }, /* fsync */ { &vnop_remove_desc, (vnop_t *)nfs_vnop_remove }, /* remove */ { &vnop_link_desc, (vnop_t *)nfs4_vnop_link }, /* link */ @@ -231,7 +230,7 @@ static struct vnodeopv_entry_desc nfsv4_vnodeop_entries[] = { { &vnop_reclaim_desc, (vnop_t *)nfs_vnop_reclaim }, /* reclaim */ { &vnop_strategy_desc, (vnop_t *)err_strategy }, /* strategy */ { &vnop_pathconf_desc, (vnop_t *)nfs_vnop_pathconf }, /* pathconf */ - { &vnop_advlock_desc, (vnop_t *)nfs4_vnop_advlock }, /* advlock */ + { &vnop_advlock_desc, (vnop_t *)nfs_vnop_advlock }, /* advlock */ { &vnop_bwrite_desc, (vnop_t *)err_bwrite }, /* bwrite */ { &vnop_pagein_desc, (vnop_t *)nfs_vnop_pagein }, /* Pagein */ { &vnop_pageout_desc, (vnop_t *)nfs_vnop_pageout }, /* Pageout */ @@ -239,6 +238,16 @@ static struct vnodeopv_entry_desc nfsv4_vnodeop_entries[] = { { &vnop_blktooff_desc, (vnop_t *)nfs_vnop_blktooff }, /* blktooff */ { &vnop_offtoblk_desc, (vnop_t *)nfs_vnop_offtoblk }, /* offtoblk 
*/ { &vnop_blockmap_desc, (vnop_t *)nfs_vnop_blockmap }, /* blockmap */ + { &vnop_getxattr_desc, (vnop_t *)nfs4_vnop_getxattr }, /* getxattr */ + { &vnop_setxattr_desc, (vnop_t *)nfs4_vnop_setxattr }, /* setxattr */ + { &vnop_removexattr_desc, (vnop_t *)nfs4_vnop_removexattr },/* removexattr */ + { &vnop_listxattr_desc, (vnop_t *)nfs4_vnop_listxattr },/* listxattr */ +#if NAMEDSTREAMS + { &vnop_getnamedstream_desc, (vnop_t *)nfs4_vnop_getnamedstream }, /* getnamedstream */ + { &vnop_makenamedstream_desc, (vnop_t *)nfs4_vnop_makenamedstream }, /* makenamedstream */ + { &vnop_removenamedstream_desc, (vnop_t *)nfs4_vnop_removenamedstream },/* removenamedstream */ +#endif + { &vnop_monitor_desc, (vnop_t *)nfs_vnop_monitor }, /* monitor */ { NULL, NULL } }; struct vnodeopv_desc nfsv4_vnodeop_opv_desc = @@ -283,6 +292,7 @@ static struct vnodeopv_entry_desc spec_nfsv2nodeop_entries[] = { { &vnop_blktooff_desc, (vnop_t *)nfs_vnop_blktooff }, /* blktooff */ { &vnop_offtoblk_desc, (vnop_t *)nfs_vnop_offtoblk }, /* offtoblk */ { &vnop_blockmap_desc, (vnop_t *)nfs_vnop_blockmap }, /* blockmap */ + { &vnop_monitor_desc, (vnop_t *)nfs_vnop_monitor }, /* monitor */ { NULL, NULL } }; struct vnodeopv_desc spec_nfsv2nodeop_opv_desc = @@ -323,6 +333,16 @@ static struct vnodeopv_entry_desc spec_nfsv4nodeop_entries[] = { { &vnop_blktooff_desc, (vnop_t *)nfs_vnop_blktooff }, /* blktooff */ { &vnop_offtoblk_desc, (vnop_t *)nfs_vnop_offtoblk }, /* offtoblk */ { &vnop_blockmap_desc, (vnop_t *)nfs_vnop_blockmap }, /* blockmap */ + { &vnop_getxattr_desc, (vnop_t *)nfs4_vnop_getxattr }, /* getxattr */ + { &vnop_setxattr_desc, (vnop_t *)nfs4_vnop_setxattr }, /* setxattr */ + { &vnop_removexattr_desc, (vnop_t *)nfs4_vnop_removexattr },/* removexattr */ + { &vnop_listxattr_desc, (vnop_t *)nfs4_vnop_listxattr },/* listxattr */ +#if NAMEDSTREAMS + { &vnop_getnamedstream_desc, (vnop_t *)nfs4_vnop_getnamedstream }, /* getnamedstream */ + { &vnop_makenamedstream_desc, (vnop_t 
*)nfs4_vnop_makenamedstream }, /* makenamedstream */ + { &vnop_removenamedstream_desc, (vnop_t *)nfs4_vnop_removenamedstream },/* removenamedstream */ +#endif + { &vnop_monitor_desc, (vnop_t *)nfs_vnop_monitor }, /* monitor */ { NULL, NULL } }; struct vnodeopv_desc spec_nfsv4nodeop_opv_desc = @@ -365,6 +385,7 @@ static struct vnodeopv_entry_desc fifo_nfsv2nodeop_entries[] = { { &vnop_blktooff_desc, (vnop_t *)nfs_vnop_blktooff }, /* blktooff */ { &vnop_offtoblk_desc, (vnop_t *)nfs_vnop_offtoblk }, /* offtoblk */ { &vnop_blockmap_desc, (vnop_t *)nfs_vnop_blockmap }, /* blockmap */ + { &vnop_monitor_desc, (vnop_t *)nfs_vnop_monitor }, /* monitor */ { NULL, NULL } }; struct vnodeopv_desc fifo_nfsv2nodeop_opv_desc = @@ -406,6 +427,16 @@ static struct vnodeopv_entry_desc fifo_nfsv4nodeop_entries[] = { { &vnop_blktooff_desc, (vnop_t *)nfs_vnop_blktooff }, /* blktooff */ { &vnop_offtoblk_desc, (vnop_t *)nfs_vnop_offtoblk }, /* offtoblk */ { &vnop_blockmap_desc, (vnop_t *)nfs_vnop_blockmap }, /* blockmap */ + { &vnop_getxattr_desc, (vnop_t *)nfs4_vnop_getxattr }, /* getxattr */ + { &vnop_setxattr_desc, (vnop_t *)nfs4_vnop_setxattr }, /* setxattr */ + { &vnop_removexattr_desc, (vnop_t *)nfs4_vnop_removexattr },/* removexattr */ + { &vnop_listxattr_desc, (vnop_t *)nfs4_vnop_listxattr },/* listxattr */ +#if NAMEDSTREAMS + { &vnop_getnamedstream_desc, (vnop_t *)nfs4_vnop_getnamedstream }, /* getnamedstream */ + { &vnop_makenamedstream_desc, (vnop_t *)nfs4_vnop_makenamedstream }, /* makenamedstream */ + { &vnop_removenamedstream_desc, (vnop_t *)nfs4_vnop_removenamedstream },/* removenamedstream */ +#endif + { &vnop_monitor_desc, (vnop_t *)nfs_vnop_monitor }, /* monitor */ { NULL, NULL } }; struct vnodeopv_desc fifo_nfsv4nodeop_opv_desc = @@ -418,30 +449,30 @@ int nfs_sillyrename(nfsnode_t,nfsnode_t,struct componentname *,vfs_context_t); /* * Find the slot in the access cache for this UID. * If adding and no existing slot is found, reuse slots in FIFO order. 
- * The index of the next slot to use is kept in the last entry of the n_mode array. + * The index of the next slot to use is kept in the last entry of the n_access array. */ int -nfs_node_mode_slot(nfsnode_t np, uid_t uid, int add) +nfs_node_access_slot(nfsnode_t np, uid_t uid, int add) { int slot; for (slot=0; slot < NFS_ACCESS_CACHE_SIZE; slot++) - if (np->n_modeuid[slot] == uid) + if (np->n_accessuid[slot] == uid) break; if (slot == NFS_ACCESS_CACHE_SIZE) { if (!add) return (-1); - slot = np->n_mode[NFS_ACCESS_CACHE_SIZE]; - np->n_mode[NFS_ACCESS_CACHE_SIZE] = (slot + 1) % NFS_ACCESS_CACHE_SIZE; + slot = np->n_access[NFS_ACCESS_CACHE_SIZE]; + np->n_access[NFS_ACCESS_CACHE_SIZE] = (slot + 1) % NFS_ACCESS_CACHE_SIZE; } return (slot); } int -nfs3_access_rpc(nfsnode_t np, u_int32_t *mode, vfs_context_t ctx) +nfs3_access_rpc(nfsnode_t np, u_int32_t *access, vfs_context_t ctx) { int error = 0, lockerror = ENOENT, status, slot; - uint32_t access = 0; + uint32_t access_result = 0; u_int64_t xid; struct nfsm_chain nmreq, nmrep; struct timeval now; @@ -452,25 +483,24 @@ nfs3_access_rpc(nfsnode_t np, u_int32_t *mode, vfs_context_t ctx) nfsm_chain_build_alloc_init(error, &nmreq, NFSX_FH(NFS_VER3) + NFSX_UNSIGNED); nfsm_chain_add_fh(error, &nmreq, NFS_VER3, np->n_fhp, np->n_fhsize); - nfsm_chain_add_32(error, &nmreq, *mode); + nfsm_chain_add_32(error, &nmreq, *access); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); - error = nfs_request(np, NULL, &nmreq, NFSPROC_ACCESS, ctx, - &nmrep, &xid, &status); + error = nfs_request(np, NULL, &nmreq, NFSPROC_ACCESS, ctx, NULL, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; nfsm_chain_postop_attr_update(error, &nmrep, np, &xid); if (!error) error = status; - nfsm_chain_get_32(error, &nmrep, access); + nfsm_chain_get_32(error, &nmrep, access_result); nfsmout_if(error); uid = kauth_cred_getuid(vfs_context_ucred(ctx)); - slot = nfs_node_mode_slot(np, uid, 1); - np->n_modeuid[slot] = uid; + slot = 
nfs_node_access_slot(np, uid, 1); + np->n_accessuid[slot] = uid; microuptime(&now); - np->n_modestamp[slot] = now.tv_sec; - np->n_mode[slot] = access; + np->n_accessstamp[slot] = now.tv_sec; + np->n_access[slot] = access_result; /* * If we asked for DELETE but didn't get it, the server @@ -479,11 +509,14 @@ nfs3_access_rpc(nfsnode_t np, u_int32_t *mode, vfs_context_t ctx) * and just let any subsequent delete action fail if it * really isn't deletable. */ - if ((*mode & NFS_ACCESS_DELETE) && - !(np->n_mode[slot] & NFS_ACCESS_DELETE)) - np->n_mode[slot] |= NFS_ACCESS_DELETE; - /* pass back the mode returned with this request */ - *mode = np->n_mode[slot]; + if ((*access & NFS_ACCESS_DELETE) && + !(np->n_access[slot] & NFS_ACCESS_DELETE)) + np->n_access[slot] |= NFS_ACCESS_DELETE; + /* ".zfs" subdirectories may erroneously give a denied answer for add/remove */ + if (nfs_access_dotzfs && (np->n_flag & NISDOTZFSCHILD)) + np->n_access[slot] |= (NFS_ACCESS_MODIFY|NFS_ACCESS_EXTEND|NFS_ACCESS_DELETE); + /* pass back the access returned with this request */ + *access = np->n_access[slot]; nfsmout: if (!lockerror) nfs_node_unlock(np); @@ -495,8 +528,8 @@ nfs3_access_rpc(nfsnode_t np, u_int32_t *mode, vfs_context_t ctx) /* * NFS access vnode op. * For NFS version 2, just return ok. File accesses may fail later. - * For NFS version 3+, use the access RPC to check accessibility. If file modes - * are changed on the server, accesses might still fail later. + * For NFS version 3+, use the access RPC to check accessibility. If file + * permissions are changed on the server, accesses might still fail later. */ int nfs_vnop_access( @@ -510,7 +543,7 @@ nfs_vnop_access( vfs_context_t ctx = ap->a_context; vnode_t vp = ap->a_vp; int error = 0, slot, dorpc; - u_int32_t mode, wmode; + u_int32_t access, waccess; nfsnode_t np = VTONFS(vp); struct nfsmount *nmp; int nfsvers; @@ -541,53 +574,53 @@ nfs_vnop_access( /* * Convert KAUTH primitives to NFS access rights. 
*/ - mode = 0; + access = 0; if (vnode_isdir(vp)) { /* directory */ if (ap->a_action & (KAUTH_VNODE_LIST_DIRECTORY | KAUTH_VNODE_READ_EXTATTRIBUTES)) - mode |= NFS_ACCESS_READ; + access |= NFS_ACCESS_READ; if (ap->a_action & KAUTH_VNODE_SEARCH) - mode |= NFS_ACCESS_LOOKUP; + access |= NFS_ACCESS_LOOKUP; if (ap->a_action & (KAUTH_VNODE_ADD_FILE | KAUTH_VNODE_ADD_SUBDIRECTORY)) - mode |= NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND; + access |= NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND; if (ap->a_action & KAUTH_VNODE_DELETE_CHILD) - mode |= NFS_ACCESS_MODIFY; + access |= NFS_ACCESS_MODIFY; } else { /* file */ if (ap->a_action & (KAUTH_VNODE_READ_DATA | KAUTH_VNODE_READ_EXTATTRIBUTES)) - mode |= NFS_ACCESS_READ; + access |= NFS_ACCESS_READ; if (ap->a_action & KAUTH_VNODE_WRITE_DATA) - mode |= NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND; + access |= NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND; if (ap->a_action & KAUTH_VNODE_APPEND_DATA) - mode |= NFS_ACCESS_EXTEND; + access |= NFS_ACCESS_EXTEND; if (ap->a_action & KAUTH_VNODE_EXECUTE) - mode |= NFS_ACCESS_EXECUTE; + access |= NFS_ACCESS_EXECUTE; } /* common */ if (ap->a_action & KAUTH_VNODE_DELETE) - mode |= NFS_ACCESS_DELETE; + access |= NFS_ACCESS_DELETE; if (ap->a_action & (KAUTH_VNODE_WRITE_ATTRIBUTES | KAUTH_VNODE_WRITE_EXTATTRIBUTES | KAUTH_VNODE_WRITE_SECURITY)) - mode |= NFS_ACCESS_MODIFY; + access |= NFS_ACCESS_MODIFY; /* XXX this is pretty dubious */ if (ap->a_action & KAUTH_VNODE_CHANGE_OWNER) - mode |= NFS_ACCESS_MODIFY; + access |= NFS_ACCESS_MODIFY; /* if caching, always ask for every right */ if (nfs_access_cache_timeout > 0) { - wmode = NFS_ACCESS_READ | NFS_ACCESS_MODIFY | + waccess = NFS_ACCESS_READ | NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND | NFS_ACCESS_EXECUTE | NFS_ACCESS_DELETE | NFS_ACCESS_LOOKUP; } else { - wmode = mode; + waccess = access; } if ((error = nfs_node_lock(np))) @@ -598,39 +631,44 @@ nfs_vnop_access( * this request? 
*/ uid = kauth_cred_getuid(vfs_context_ucred(ctx)); - slot = nfs_node_mode_slot(np, uid, 0); + slot = nfs_node_access_slot(np, uid, 0); dorpc = 1; - if (mode == 0) { + if (access == 0) { /* not asking for any rights understood by NFS, so don't bother doing an RPC */ /* OSAddAtomic(1, &nfsstats.accesscache_hits); */ dorpc = 0; - wmode = 0; - } else if (NMODEVALID(np, slot)) { + waccess = 0; + } else if (NACCESSVALID(np, slot)) { microuptime(&now); - if ((now.tv_sec < (np->n_modestamp[slot] + nfs_access_cache_timeout)) && - ((np->n_mode[slot] & mode) == mode)) { + if ((now.tv_sec < (np->n_accessstamp[slot] + nfs_access_cache_timeout)) && + ((np->n_access[slot] & access) == access)) { /* OSAddAtomic(1, &nfsstats.accesscache_hits); */ dorpc = 0; - wmode = np->n_mode[slot]; + waccess = np->n_access[slot]; } } nfs_node_unlock(np); if (dorpc) { /* Either a no, or a don't know. Go to the wire. */ /* OSAddAtomic(1, &nfsstats.accesscache_misses); */ - error = nmp->nm_funcs->nf_access_rpc(np, &wmode, ctx); + error = nmp->nm_funcs->nf_access_rpc(np, &waccess, ctx); } - if (!error && ((wmode & mode) != mode)) + if (!error && ((waccess & access) != access)) error = EACCES; return (error); } + /* * NFS open vnode op + * + * Perform various update/invalidation checks and then add the + * open to the node. Regular files will have an open file structure + * on the node and, for NFSv4, perform an OPEN request on the server. 
*/ int -nfs3_vnop_open( +nfs_vnop_open( struct vnop_open_args /* { struct vnodeop_desc *a_desc; vnode_t a_vp; @@ -642,17 +680,25 @@ nfs3_vnop_open( vnode_t vp = ap->a_vp; nfsnode_t np = VTONFS(vp); struct nfsmount *nmp; - struct nfs_vattr nvattr; + int error, accessMode, denyMode, opened = 0; + struct nfs_open_owner *noop = NULL; + struct nfs_open_file *nofp = NULL; enum vtype vtype; - int error; + + if (!(ap->a_mode & (FREAD|FWRITE))) + return (EINVAL); nmp = VTONMP(vp); if (!nmp) return (ENXIO); + if (np->n_flag & NREVOKE) + return (EIO); vtype = vnode_vtype(vp); if ((vtype != VREG) && (vtype != VDIR) && (vtype != VLNK)) return (EACCES); + + /* First, check if we need to update/invalidate */ if (ISSET(np->n_flag, NUPDATESIZE)) nfs_data_update_size(np, 0); if ((error = nfs_node_lock(np))) @@ -666,7 +712,7 @@ nfs3_vnop_open( if ((error = nfs_node_lock(np))) return (error); } - if (vnode_vtype(NFSTOV(np)) == VREG) + if (vtype == VREG) np->n_lastrahead = -1; if (np->n_flag & NMODIFIED) { if (vtype == VDIR) @@ -677,12 +723,145 @@ nfs3_vnop_open( } else { nfs_node_unlock(np); } + /* nfs_getattr() will check changed and purge caches */ - return (nfs_getattr(np, &nvattr, ctx, NGA_UNCACHED)); + if ((error = nfs_getattr(np, NULL, ctx, NGA_UNCACHED))) + return (error); + + if (vtype != VREG) { + /* Just mark that it was opened */ + lck_mtx_lock(&np->n_openlock); + np->n_openrefcnt++; + lck_mtx_unlock(&np->n_openlock); + return (0); + } + + /* mode contains some combination of: FREAD, FWRITE, O_SHLOCK, O_EXLOCK */ + accessMode = 0; + if (ap->a_mode & FREAD) + accessMode |= NFS_OPEN_SHARE_ACCESS_READ; + if (ap->a_mode & FWRITE) + accessMode |= NFS_OPEN_SHARE_ACCESS_WRITE; + if (ap->a_mode & O_EXLOCK) + denyMode = NFS_OPEN_SHARE_DENY_BOTH; + else if (ap->a_mode & O_SHLOCK) + denyMode = NFS_OPEN_SHARE_DENY_WRITE; + else + denyMode = NFS_OPEN_SHARE_DENY_NONE; + // XXX don't do deny modes just yet (and never do it for !v4) + denyMode = NFS_OPEN_SHARE_DENY_NONE; + + noop = 
nfs_open_owner_find(nmp, vfs_context_ucred(ctx), 1); + if (!noop) + return (ENOMEM); + +restart: + error = nfs_mount_state_in_use_start(nmp, vfs_context_thread(ctx)); + if (error) { + nfs_open_owner_rele(noop); + return (error); + } + if (np->n_flag & NREVOKE) { + error = EIO; + nfs_mount_state_in_use_end(nmp, 0); + nfs_open_owner_rele(noop); + return (error); + } + + error = nfs_open_file_find(np, noop, &nofp, accessMode, denyMode, 1); + if (!error && (nofp->nof_flags & NFS_OPEN_FILE_LOST)) { + NP(np, "nfs_vnop_open: LOST %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + error = EIO; + } + if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { + nfs_mount_state_in_use_end(nmp, 0); + error = nfs4_reopen(nofp, vfs_context_thread(ctx)); + nofp = NULL; + if (!error) + goto restart; + } + if (!error) + error = nfs_open_file_set_busy(nofp, vfs_context_thread(ctx)); + if (error) { + nofp = NULL; + goto out; + } + + if (nmp->nm_vers < NFS_VER4) { + /* + * NFS v2/v3 opens are always allowed - so just add it. + */ + nfs_open_file_add_open(nofp, accessMode, denyMode, 0); + goto out; + } + + /* + * If we just created the file and the modes match, then we simply use + * the open performed in the create. Otherwise, send the request. + */ + if ((nofp->nof_flags & NFS_OPEN_FILE_CREATE) && + (nofp->nof_creator == current_thread()) && + (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) && + (denyMode == NFS_OPEN_SHARE_DENY_NONE)) { + nofp->nof_flags &= ~NFS_OPEN_FILE_CREATE; + nofp->nof_creator = NULL; + } else { + if (!opened) + error = nfs4_open(np, nofp, accessMode, denyMode, ctx); + if ((error == EACCES) && (nofp->nof_flags & NFS_OPEN_FILE_CREATE) && + (nofp->nof_creator == current_thread())) { + /* + * Ugh. This can happen if we just created the file with read-only + * perms and we're trying to open it for real with different modes + * (e.g. write-only or with a deny mode) and the server decides to + * not allow the second open because of the read-only perms. 
+ * The best we can do is to just use the create's open. + * We may have access we don't need or we may not have a requested + * deny mode. We may log complaints later, but we'll try to avoid it. + */ + if (denyMode != NFS_OPEN_SHARE_DENY_NONE) + NP(np, "nfs_vnop_open: deny mode foregone on create, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + nofp->nof_creator = NULL; + error = 0; + } + if (error) + goto out; + opened = 1; + /* + * If we had just created the file, we already had it open. + * If the actual open mode is less than what we grabbed at + * create time, then we'll downgrade the open here. + */ + if ((nofp->nof_flags & NFS_OPEN_FILE_CREATE) && + (nofp->nof_creator == current_thread())) { + error = nfs_close(np, nofp, NFS_OPEN_SHARE_ACCESS_BOTH, NFS_OPEN_SHARE_DENY_NONE, ctx); + if (error) + NP(np, "nfs_vnop_open: create close error %d, %d", error, kauth_cred_getuid(nofp->nof_owner->noo_cred)); + if (!nfs_mount_state_error_should_restart(error)) { + error = 0; + nofp->nof_flags &= ~NFS_OPEN_FILE_CREATE; + } + } + } + +out: + if (nofp) + nfs_open_file_clear_busy(nofp); + if (nfs_mount_state_in_use_end(nmp, error)) { + nofp = NULL; + goto restart; + } + if (error) + NP(np, "nfs_vnop_open: error %d, %d", error, kauth_cred_getuid(noop->noo_cred)); + if (noop) + nfs_open_owner_rele(noop); + return (error); } + /* * NFS close vnode op + * * What an NFS client should do upon close after writing is a debatable issue. * Most NFS clients push delayed writes to the server upon close, basically for * two reasons: @@ -700,11 +879,11 @@ nfs3_vnop_open( * * The current code does the following: * for NFS Version 2 - play it safe and flush/invalidate all dirty buffers - * for NFS Version 3 - flush dirty buffers to the server but don't invalidate - * them. + * for NFS Version 3 - flush dirty buffers to the server but don't invalidate them. 
+ * for NFS Version 4 - basically the same as NFSv3 */ int -nfs3_vnop_close( +nfs_vnop_close( struct vnop_close_args /* { struct vnodeop_desc *a_desc; vnode_t a_vp; @@ -716,35 +895,36 @@ nfs3_vnop_close( vnode_t vp = ap->a_vp; nfsnode_t np = VTONFS(vp); struct nfsmount *nmp; - int nfsvers; - int error = 0; + int error = 0, error1, nfsvers; + int fflag = ap->a_fflag; + enum vtype vtype; + int accessMode, denyMode; + struct nfs_open_owner *noop = NULL; + struct nfs_open_file *nofp = NULL; - if (vnode_vtype(vp) != VREG) - return (0); nmp = VTONMP(vp); if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; + vtype = vnode_vtype(vp); + /* First, check if we need to update/flush/invalidate */ if (ISSET(np->n_flag, NUPDATESIZE)) nfs_data_update_size(np, 0); - if ((error = nfs_node_lock(np))) - return (error); + nfs_node_lock_force(np); if (np->n_flag & NNEEDINVALIDATE) { np->n_flag &= ~NNEEDINVALIDATE; nfs_node_unlock(np); nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, ctx, 1); - if ((error = nfs_node_lock(np))) - return (error); + nfs_node_lock_force(np); } - if (np->n_flag & NMODIFIED) { + if ((vtype == VREG) && (np->n_flag & NMODIFIED) && (fflag & FWRITE)) { + /* we're closing an open for write and the file is modified, so flush it */ nfs_node_unlock(np); if (nfsvers != NFS_VER2) error = nfs_flush(np, MNT_WAIT, vfs_context_thread(ctx), 0); else error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1); - if (error) - return (error); nfs_node_lock_force(np); NATTRINVALIDATE(np); } @@ -753,9 +933,266 @@ nfs3_vnop_close( error = np->n_error; } nfs_node_unlock(np); + + if (vtype != VREG) { + /* Just mark that it was closed */ + lck_mtx_lock(&np->n_openlock); + if (np->n_openrefcnt == 0) { + if (fflag & (FREAD|FWRITE)) { + NP(np, "nfs_vnop_close: open reference underrun"); + error = EINVAL; + } + } else if (fflag & (FREAD|FWRITE)) { + np->n_openrefcnt--; + } else { + /* No FREAD/FWRITE set - probably the final close */ + np->n_openrefcnt = 0; + } + lck_mtx_unlock(&np->n_openlock); + return 
(error); + } + error1 = error; + + /* fflag should contain some combination of: FREAD, FWRITE, FHASLOCK */ + accessMode = 0; + if (fflag & FREAD) + accessMode |= NFS_OPEN_SHARE_ACCESS_READ; + if (fflag & FWRITE) + accessMode |= NFS_OPEN_SHARE_ACCESS_WRITE; +// XXX It would be nice if we still had the O_EXLOCK/O_SHLOCK flags that were on the open +// if (fflag & O_EXLOCK) +// denyMode = NFS_OPEN_SHARE_DENY_BOTH; +// else if (fflag & O_SHLOCK) +// denyMode = NFS_OPEN_SHARE_DENY_WRITE; +// else +// denyMode = NFS_OPEN_SHARE_DENY_NONE; + if (fflag & FHASLOCK) { + /* XXX assume FHASLOCK is for the deny mode and not flock */ + /* FHASLOCK flock will be unlocked in the close path, but the flag is not cleared. */ + if (nofp->nof_deny & NFS_OPEN_SHARE_DENY_READ) + denyMode = NFS_OPEN_SHARE_DENY_BOTH; + else if (nofp->nof_deny & NFS_OPEN_SHARE_DENY_WRITE) + denyMode = NFS_OPEN_SHARE_DENY_WRITE; + else + denyMode = NFS_OPEN_SHARE_DENY_NONE; + } else { + denyMode = NFS_OPEN_SHARE_DENY_NONE; + } + // XXX don't do deny modes just yet (and never do it for !v4) + denyMode = NFS_OPEN_SHARE_DENY_NONE; + + if (!accessMode) { + /* + * No mode given to close? + * Guess this is the final close. + * We should unlock all locks and close all opens. 
+ */ + mount_t mp = vnode_mount(vp); + int force = (!mp || (mp->mnt_kern_flag & MNTK_FRCUNMOUNT)); + nfs_release_open_state_for_node(np, force); + return (error); + } + + noop = nfs_open_owner_find(nmp, vfs_context_ucred(ctx), 0); + if (!noop) { + // printf("nfs_vnop_close: can't get open owner!\n"); + return (EIO); + } + +restart: + error = nfs_mount_state_in_use_start(nmp, NULL); + if (error) { + nfs_open_owner_rele(noop); + return (error); + } + + error = nfs_open_file_find(np, noop, &nofp, 0, 0, 0); + if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { + nfs_mount_state_in_use_end(nmp, 0); + error = nfs4_reopen(nofp, NULL); + nofp = NULL; + if (!error) + goto restart; + } + if (error) { + NP(np, "nfs_vnop_close: no open file for owner, error %d, %d", error, kauth_cred_getuid(noop->noo_cred)); + error = EBADF; + goto out; + } + error = nfs_open_file_set_busy(nofp, NULL); + if (error) { + nofp = NULL; + goto out; + } + + error = nfs_close(np, nofp, accessMode, denyMode, ctx); + if (error) + NP(np, "nfs_vnop_close: close error %d, %d", error, kauth_cred_getuid(noop->noo_cred)); + +out: + if (nofp) + nfs_open_file_clear_busy(nofp); + if (nfs_mount_state_in_use_end(nmp, error)) { + nofp = NULL; + goto restart; + } + if (!error) + error = error1; + if (error) + NP(np, "nfs_vnop_close: error %d, %d", error, kauth_cred_getuid(noop->noo_cred)); + if (noop) + nfs_open_owner_rele(noop); return (error); } +/* + * nfs_close(): common function that does all the heavy lifting of file closure + * + * Takes an open file structure and a set of access/deny modes and figures out how + * to update the open file structure (and the state on the server) appropriately. 
+ */ +int +nfs_close( + nfsnode_t np, + struct nfs_open_file *nofp, + uint32_t accessMode, + uint32_t denyMode, + vfs_context_t ctx) +{ + struct nfs_lock_owner *nlop; + int error = 0, changed = 0, delegated = 0, closed = 0, downgrade = 0; + uint32_t newAccessMode, newDenyMode; + + /* warn if modes don't match current state */ + if (((accessMode & nofp->nof_access) != accessMode) || ((denyMode & nofp->nof_deny) != denyMode)) + NP(np, "nfs_close: mode mismatch %d %d, current %d %d, %d", + accessMode, denyMode, nofp->nof_access, nofp->nof_deny, + kauth_cred_getuid(nofp->nof_owner->noo_cred)); + + /* + * If we're closing a write-only open, we may not have a write-only count + * if we also grabbed read access. So, check the read-write count. + */ + if (denyMode == NFS_OPEN_SHARE_DENY_NONE) { + if ((accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) && + (nofp->nof_w == 0) && (nofp->nof_d_w == 0) && + (nofp->nof_rw || nofp->nof_d_rw)) + accessMode = NFS_OPEN_SHARE_ACCESS_BOTH; + } else if (denyMode == NFS_OPEN_SHARE_DENY_WRITE) { + if ((accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) && + (nofp->nof_w_dw == 0) && (nofp->nof_d_w_dw == 0) && + (nofp->nof_rw_dw || nofp->nof_d_rw_dw)) + accessMode = NFS_OPEN_SHARE_ACCESS_BOTH; + } else { /* NFS_OPEN_SHARE_DENY_BOTH */ + if ((accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) && + (nofp->nof_w_drw == 0) && (nofp->nof_d_w_drw == 0) && + (nofp->nof_rw_drw || nofp->nof_d_rw_drw)) + accessMode = NFS_OPEN_SHARE_ACCESS_BOTH; + } + + nfs_open_file_remove_open_find(nofp, accessMode, denyMode, &newAccessMode, &newDenyMode, &delegated); + if ((newAccessMode != nofp->nof_access) || (newDenyMode != nofp->nof_deny)) + changed = 1; + else + changed = 0; + + if (NFSTONMP(np)->nm_vers < NFS_VER4) /* NFS v2/v3 closes simply need to remove the open. */ + goto v3close; + + if ((newAccessMode == 0) || (nofp->nof_opencnt == 1)) { + /* + * No more access after this close, so clean up and close it. + * Don't send a close RPC if we're closing a delegated open. 
+ */ + nfs_wait_bufs(np); + closed = 1; + if (!delegated && !(nofp->nof_flags & NFS_OPEN_FILE_LOST)) + error = nfs4_close_rpc(np, nofp, vfs_context_thread(ctx), vfs_context_ucred(ctx), 0); + if (error == NFSERR_LOCKS_HELD) { + /* + * Hmm... the server says we have locks we need to release first + * Find the lock owner and try to unlock everything. + */ + nlop = nfs_lock_owner_find(np, vfs_context_proc(ctx), 0); + if (nlop) { + nfs4_unlock_rpc(np, nlop, F_WRLCK, 0, UINT64_MAX, + 0, vfs_context_thread(ctx), vfs_context_ucred(ctx)); + nfs_lock_owner_rele(nlop); + } + error = nfs4_close_rpc(np, nofp, vfs_context_thread(ctx), vfs_context_ucred(ctx), 0); + } + } else if (changed) { + /* + * File is still open but with less access, so downgrade the open. + * Don't send a downgrade RPC if we're closing a delegated open. + */ + if (!delegated && !(nofp->nof_flags & NFS_OPEN_FILE_LOST)) { + downgrade = 1; + /* + * If we have delegated opens, we should probably claim them before sending + * the downgrade because the server may not know the open we are downgrading to. + */ + if (nofp->nof_d_rw_drw || nofp->nof_d_w_drw || nofp->nof_d_r_drw || + nofp->nof_d_rw_dw || nofp->nof_d_w_dw || nofp->nof_d_r_dw || + nofp->nof_d_rw || nofp->nof_d_w || nofp->nof_d_r) + nfs4_claim_delegated_state_for_open_file(nofp, 0); + /* need to remove the open before sending the downgrade */ + nfs_open_file_remove_open(nofp, accessMode, denyMode); + error = nfs4_open_downgrade_rpc(np, nofp, ctx); + if (error) /* Hmm.. that didn't work. Add the open back in. 
*/ + nfs_open_file_add_open(nofp, accessMode, denyMode, delegated); + } + } + + if (error) { + NP(np, "nfs_close: error %d, %d", error, kauth_cred_getuid(nofp->nof_owner->noo_cred)); + return (error); + } + +v3close: + if (!downgrade) + nfs_open_file_remove_open(nofp, accessMode, denyMode); + + if (closed) { + lck_mtx_lock(&nofp->nof_lock); + if (nofp->nof_r || nofp->nof_d_r || nofp->nof_w || nofp->nof_d_w || nofp->nof_d_rw || + (nofp->nof_rw && !((nofp->nof_flags & NFS_OPEN_FILE_CREATE) && !nofp->nof_creator && (nofp->nof_rw == 1))) || + nofp->nof_r_dw || nofp->nof_d_r_dw || nofp->nof_w_dw || nofp->nof_d_w_dw || + nofp->nof_rw_dw || nofp->nof_d_rw_dw || nofp->nof_r_drw || nofp->nof_d_r_drw || + nofp->nof_w_drw || nofp->nof_d_w_drw || nofp->nof_rw_drw || nofp->nof_d_rw_drw) + NP(np, "nfs_close: unexpected count: %u.%u %u.%u %u.%u dw %u.%u %u.%u %u.%u drw %u.%u %u.%u %u.%u flags 0x%x, %d", + nofp->nof_r, nofp->nof_d_r, nofp->nof_w, nofp->nof_d_w, + nofp->nof_rw, nofp->nof_d_rw, nofp->nof_r_dw, nofp->nof_d_r_dw, + nofp->nof_w_dw, nofp->nof_d_w_dw, nofp->nof_rw_dw, nofp->nof_d_rw_dw, + nofp->nof_r_drw, nofp->nof_d_r_drw, nofp->nof_w_drw, nofp->nof_d_w_drw, + nofp->nof_rw_drw, nofp->nof_d_rw_drw, nofp->nof_flags, + kauth_cred_getuid(nofp->nof_owner->noo_cred)); + /* clear out all open info, just to be safe */ + nofp->nof_access = nofp->nof_deny = 0; + nofp->nof_mmap_access = nofp->nof_mmap_deny = 0; + nofp->nof_r = nofp->nof_d_r = 0; + nofp->nof_w = nofp->nof_d_w = 0; + nofp->nof_rw = nofp->nof_d_rw = 0; + nofp->nof_r_dw = nofp->nof_d_r_dw = 0; + nofp->nof_w_dw = nofp->nof_d_w_dw = 0; + nofp->nof_rw_dw = nofp->nof_d_rw_dw = 0; + nofp->nof_r_drw = nofp->nof_d_r_drw = 0; + nofp->nof_w_drw = nofp->nof_d_w_drw = 0; + nofp->nof_rw_drw = nofp->nof_d_rw_drw = 0; + nofp->nof_flags &= ~NFS_OPEN_FILE_CREATE; + lck_mtx_unlock(&nofp->nof_lock); + /* XXX we may potentially want to clean up idle/unused open file structures */ + } + if (nofp->nof_flags & NFS_OPEN_FILE_LOST) { + error 
= EIO; + NP(np, "nfs_close: LOST%s, %d", !nofp->nof_opencnt ? " (last)" : "", + kauth_cred_getuid(nofp->nof_owner->noo_cred)); + } + return (error); +} + + + int nfs3_getattr_rpc( @@ -763,18 +1200,22 @@ nfs3_getattr_rpc( mount_t mp, u_char *fhp, size_t fhsize, + int flags, vfs_context_t ctx, struct nfs_vattr *nvap, u_int64_t *xidp) { struct nfsmount *nmp = mp ? VFSTONFS(mp) : NFSTONMP(np); - int error = 0, status, nfsvers; + int error = 0, status, nfsvers, rpcflags = 0; struct nfsm_chain nmreq, nmrep; if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; + if (flags & NGA_MONITOR) /* vnode monitor requests should be soft */ + rpcflags = R_RECOVER; + nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -784,8 +1225,9 @@ nfs3_getattr_rpc( nfsm_chain_add_opaque(error, &nmreq, fhp, fhsize); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); - error = nfs_request(np, mp, &nmreq, NFSPROC_GETATTR, ctx, - &nmrep, xidp, &status); + error = nfs_request2(np, mp, &nmreq, NFSPROC_GETATTR, + vfs_context_thread(ctx), vfs_context_ucred(ctx), + NULL, rpcflags, &nmrep, xidp, &status); if (!error) error = status; nfsmout_if(error); @@ -798,10 +1240,11 @@ nfs3_getattr_rpc( int -nfs_getattr(nfsnode_t np, struct nfs_vattr *nvap, vfs_context_t ctx, int uncached) +nfs_getattr(nfsnode_t np, struct nfs_vattr *nvap, vfs_context_t ctx, int flags) { struct nfsmount *nmp; int error = 0, nfsvers, inprogset = 0, wanted = 0, avoidfloods; + struct nfs_vattr nvattr; struct timespec ts = { 2, 0 }; u_int64_t xid; @@ -811,6 +1254,10 @@ nfs_getattr(nfsnode_t np, struct nfs_vattr *nvap, vfs_context_t ctx, int uncache return (ENXIO); nfsvers = nmp->nm_vers; + if (!nvap) + nvap = &nvattr; + NVATTR_INIT(nvap); + /* Update local times for special files. 
*/ if (np->n_flag & (NACC | NUPD)) { nfs_node_lock_force(np); @@ -823,15 +1270,27 @@ nfs_getattr(nfsnode_t np, struct nfs_vattr *nvap, vfs_context_t ctx, int uncache error = nfs_node_lock(np); nfsmout_if(error); - if (!uncached) { + if (!(flags & (NGA_UNCACHED|NGA_MONITOR)) || ((nfsvers >= NFS_VER4) && (np->n_openflags & N_DELEG_MASK))) { + /* + * Use the cache or wait for any getattr in progress if: + * - it's a cached request, or + * - we have a delegation + */ while (1) { - error = nfs_getattrcache(np, nvap); + error = nfs_getattrcache(np, nvap, flags); if (!error || (error != ENOENT)) { nfs_node_unlock(np); goto nfsmout; } + error = 0; if (!ISSET(np->n_flag, NGETATTRINPROG)) break; + if (flags & NGA_MONITOR) { + /* no need to wait if a request is pending */ + error = EINPROGRESS; + nfs_node_unlock(np); + goto nfsmout; + } SET(np->n_flag, NGETATTRWANT); msleep(np, &np->n_lock, PZERO-1, "nfsgetattrwant", &ts); if ((error = nfs_sigintr(NFSTONMP(np), NULL, vfs_context_thread(ctx), 0))) { @@ -844,30 +1303,33 @@ nfs_getattr(nfsnode_t np, struct nfs_vattr *nvap, vfs_context_t ctx, int uncache } else if (!ISSET(np->n_flag, NGETATTRINPROG)) { SET(np->n_flag, NGETATTRINPROG); inprogset = 1; + } else if (flags & NGA_MONITOR) { + /* no need to make a request if one is pending */ + error = EINPROGRESS; } nfs_node_unlock(np); nmp = NFSTONMP(np); - if (!nmp) { + if (!nmp) error = ENXIO; + if (error) goto nfsmout; - } /* - * Try to get both the attributes and access info by making an - * ACCESS call and seeing if it returns updated attributes. + * We might want to try to get both the attributes and access info by + * making an ACCESS call and seeing if it returns updated attributes. * But don't bother if we aren't caching access info or if the * attributes returned wouldn't be cached. 
*/ - if ((nfsvers != NFS_VER2) && (nfs_access_cache_timeout > 0)) { + if (!(flags & NGA_ACL) && (nfsvers != NFS_VER2) && nfs_access_for_getattr && (nfs_access_cache_timeout > 0)) { if (nfs_attrcachetimeout(np) > 0) { /* OSAddAtomic(1, &nfsstats.accesscache_misses); */ - u_int32_t mode = NFS_ACCESS_ALL; - error = nmp->nm_funcs->nf_access_rpc(np, &mode, ctx); + u_int32_t access = NFS_ACCESS_ALL; + error = nmp->nm_funcs->nf_access_rpc(np, &access, ctx); if (error) goto nfsmout; nfs_node_lock_force(np); - error = nfs_getattrcache(np, nvap); + error = nfs_getattrcache(np, nvap, flags); nfs_node_unlock(np); if (!error || (error != ENOENT)) goto nfsmout; @@ -878,7 +1340,7 @@ nfs_getattr(nfsnode_t np, struct nfs_vattr *nvap, vfs_context_t ctx, int uncache avoidfloods = 0; tryagain: - error = nmp->nm_funcs->nf_getattr_rpc(np, NULL, np->n_fhp, np->n_fhsize, ctx, nvap, &xid); + error = nmp->nm_funcs->nf_getattr_rpc(np, NULL, np->n_fhp, np->n_fhsize, flags, ctx, nvap, &xid); if (!error) { nfs_node_lock_force(np); error = nfs_loadattrcache(np, nvap, &xid, 0); @@ -933,6 +1395,17 @@ nfs_getattr(nfsnode_t np, struct nfs_vattr *nvap, vfs_context_t ctx, int uncache if (wanted) wakeup(np); } + + if (nvap == &nvattr) { + NVATTR_CLEANUP(nvap); + } else if (!(flags & NGA_ACL)) { + /* make sure we don't return an ACL if it wasn't asked for */ + NFS_BITMAP_CLR(nvap->nva_bitmap, NFS_FATTR_ACL); + if (nvap->nva_acl) { + kauth_acl_free(nvap->nva_acl); + nvap->nva_acl = NULL; + } + } FSDBG_BOT(513, np->n_size, error, np->n_vattr.nva_size, np->n_flag); return (error); } @@ -1002,20 +1475,20 @@ nfs_vnop_setattr( struct nfsmount *nmp; struct vnode_attr *vap = ap->a_vap; int error = 0; - int biosize, nfsvers; - u_quad_t origsize; + int biosize, nfsvers, namedattrs; + u_quad_t origsize, vapsize; struct nfs_dulookup dul; nfsnode_t dnp = NULL; vnode_t dvp = NULL; const char *vname = NULL; struct nfs_open_owner *noop = NULL; struct nfs_open_file *nofp = NULL; - struct nfs_vattr nvattr; nmp = 
VTONMP(vp); if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; + namedattrs = (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR); biosize = nmp->nm_biosize; /* Disallow write attempts if the filesystem is mounted read-only. */ @@ -1058,46 +1531,52 @@ nfs_vnop_setattr( /* flush everything */ error = nfs_vinvalbuf(vp, (vap->va_data_size ? V_SAVE : 0) , ctx, 1); if (error) { - printf("nfs_setattr: nfs_vinvalbuf %d\n", error); + NP(np, "nfs_setattr: nfs_vinvalbuf %d", error); FSDBG_BOT(512, np->n_size, vap->va_data_size, np->n_vattr.nva_size, -1); return (error); } if (nfsvers >= NFS_VER4) { /* setting file size requires having the file open for write access */ + if (np->n_flag & NREVOKE) + return (EIO); noop = nfs_open_owner_find(nmp, vfs_context_ucred(ctx), 1); if (!noop) return (ENOMEM); -retryopen: +restart: + error = nfs_mount_state_in_use_start(nmp, vfs_context_thread(ctx)); + if (error) + return (error); + if (np->n_flag & NREVOKE) { + nfs_mount_state_in_use_end(nmp, 0); + return (EIO); + } error = nfs_open_file_find(np, noop, &nofp, 0, 0, 1); if (!error && (nofp->nof_flags & NFS_OPEN_FILE_LOST)) error = EIO; if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { - nfs4_reopen(nofp, vfs_context_thread(ctx)); + nfs_mount_state_in_use_end(nmp, 0); + error = nfs4_reopen(nofp, vfs_context_thread(ctx)); nofp = NULL; - goto retryopen; + if (!error) + goto restart; } + if (!error) + error = nfs_open_file_set_busy(nofp, vfs_context_thread(ctx)); if (error) { nfs_open_owner_rele(noop); return (error); } if (!(nofp->nof_access & NFS_OPEN_SHARE_ACCESS_WRITE)) { /* we don't have the file open for write access, so open it */ - error = nfs_mount_state_in_use_start(nmp); - if (!error) - error = nfs_open_file_set_busy(nofp, vfs_context_thread(ctx)); - if (error) { - nfs_open_owner_rele(noop); - return (error); - } error = nfs4_open(np, nofp, NFS_OPEN_SHARE_ACCESS_WRITE, NFS_OPEN_SHARE_DENY_NONE, ctx); if (!error) nofp->nof_flags |= NFS_OPEN_FILE_SETATTR; if 
(nfs_mount_state_error_should_restart(error)) { nfs_open_file_clear_busy(nofp); nofp = NULL; + if (nfs_mount_state_in_use_end(nmp, error)) + goto restart; } - if (nfs_mount_state_in_use_end(nmp, error)) - goto retryopen; } } nfs_data_lock(np, NFS_DATA_LOCK_EXCLUSIVE); @@ -1198,61 +1677,52 @@ nfs_vnop_setattr( nfs_node_unlock(np); } } - if (VATTR_IS_ACTIVE(vap, va_mode) || - VATTR_IS_ACTIVE(vap, va_uid) || - VATTR_IS_ACTIVE(vap, va_gid)) { - if ((error = nfs_node_lock(np))) { - if (VATTR_IS_ACTIVE(vap, va_data_size)) - nfs_data_unlock(np); - return (error); - } - NMODEINVALIDATE(np); + if ((VATTR_IS_ACTIVE(vap, va_mode) || VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid) || + VATTR_IS_ACTIVE(vap, va_acl) || VATTR_IS_ACTIVE(vap, va_uuuid) || VATTR_IS_ACTIVE(vap, va_guuid)) && + !(error = nfs_node_lock(np))) { + NACCESSINVALIDATE(np); nfs_node_unlock(np); - dvp = vnode_getparent(vp); - vname = vnode_getname(vp); - dnp = (dvp && vname) ? VTONFS(dvp) : NULL; - if (dnp) { - error = nfs_node_set_busy(dnp, vfs_context_thread(ctx)); - if (error) { - dnp = NULL; - error = 0; + if (!namedattrs) { + dvp = vnode_getparent(vp); + vname = vnode_getname(vp); + dnp = (dvp && vname) ? 
VTONFS(dvp) : NULL; + if (dnp) { + error = nfs_node_set_busy(dnp, vfs_context_thread(ctx)); + if (error) { + dnp = NULL; + error = 0; + } + } + if (dnp) { + nfs_dulookup_init(&dul, dnp, vname, strlen(vname), ctx); + nfs_dulookup_start(&dul, dnp, ctx); } - } - if (dnp) { - nfs_dulookup_init(&dul, dnp, vname, strlen(vname), ctx); - nfs_dulookup_start(&dul, dnp, ctx); } } -retrysetattr: - if (VATTR_IS_ACTIVE(vap, va_data_size) && (nfsvers >= NFS_VER4)) - error = nfs_mount_state_in_use_start(nmp); - - if (!error) { + if (!error) error = nmp->nm_funcs->nf_setattr_rpc(np, vap, ctx); - if (VATTR_IS_ACTIVE(vap, va_data_size) && (nfsvers >= NFS_VER4)) - if (nfs_mount_state_in_use_end(nmp, error)) - goto retrysetattr; - } - - if (VATTR_IS_ACTIVE(vap, va_mode) || - VATTR_IS_ACTIVE(vap, va_uid) || - VATTR_IS_ACTIVE(vap, va_gid)) { - if (dnp) { - nfs_dulookup_finish(&dul, dnp, ctx); - nfs_node_clear_busy(dnp); + if (VATTR_IS_ACTIVE(vap, va_mode) || VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid) || + VATTR_IS_ACTIVE(vap, va_acl) || VATTR_IS_ACTIVE(vap, va_uuuid) || VATTR_IS_ACTIVE(vap, va_guuid)) { + if (!namedattrs) { + if (dnp) { + nfs_dulookup_finish(&dul, dnp, ctx); + nfs_node_clear_busy(dnp); + } + if (dvp != NULLVP) + vnode_put(dvp); + if (vname != NULL) + vnode_putname(vname); } - if (dvp != NULLVP) - vnode_put(dvp); - if (vname != NULL) - vnode_putname(vname); } FSDBG_BOT(512, np->n_size, vap->va_data_size, np->n_vattr.nva_size, error); if (VATTR_IS_ACTIVE(vap, va_data_size)) { - if (error && (origsize != np->n_size)) { + if (error && (origsize != np->n_size) && + ((nfsvers < NFS_VER4) || !nfs_mount_state_error_should_restart(error))) { /* make every effort to resync file size w/ server... 
*/ + /* (don't bother if we'll be restarting the operation) */ int err; /* preserve "error" for return */ np->n_size = np->n_vattr.nva_size = origsize; nfs_node_lock_force(np); @@ -1260,10 +1730,12 @@ nfs_vnop_setattr( nfs_node_unlock(np); FSDBG(512, np, np->n_size, np->n_vattr.nva_size, 0xf00d0002); ubc_setsize(vp, (off_t)np->n_size); /* XXX check error */ + vapsize = vap->va_data_size; vap->va_data_size = origsize; err = nmp->nm_funcs->nf_setattr_rpc(np, vap, ctx); if (err) - printf("nfs_vnop_setattr: nfs%d_setattr_rpc %d %d\n", nfsvers, error, err); + NP(np, "nfs_vnop_setattr: nfs%d_setattr_rpc %d %d", nfsvers, error, err); + vap->va_data_size = vapsize; } nfs_node_lock_force(np); /* @@ -1276,22 +1748,26 @@ nfs_vnop_setattr( CLR(np->n_flag, NUPDATESIZE); NATTRINVALIDATE(np); nfs_node_unlock(np); - nfs_getattr(np, &nvattr, ctx, NGA_UNCACHED); + nfs_getattr(np, NULL, ctx, NGA_UNCACHED); } else { nfs_node_unlock(np); } nfs_data_unlock(np); if (nfsvers >= NFS_VER4) { - if (nofp->nof_flags & NFS_OPEN_FILE_SETATTR) { - int err = nfs4_close(np, nofp, NFS_OPEN_SHARE_ACCESS_WRITE, NFS_OPEN_SHARE_DENY_NONE, ctx); - if (err) { - vname = vnode_getname(NFSTOV(np)); - printf("nfs_vnop_setattr: close error: %d, %s\n", err, vname); - vnode_putname(vname); + if (nofp) { + /* don't close our setattr open if we'll be restarting... 
*/ + if (!nfs_mount_state_error_should_restart(error) && + (nofp->nof_flags & NFS_OPEN_FILE_SETATTR)) { + int err = nfs_close(np, nofp, NFS_OPEN_SHARE_ACCESS_WRITE, NFS_OPEN_SHARE_DENY_NONE, ctx); + if (err) + NP(np, "nfs_vnop_setattr: close error: %d", err); + nofp->nof_flags &= ~NFS_OPEN_FILE_SETATTR; } - nofp->nof_flags &= ~NFS_OPEN_FILE_SETATTR; nfs_open_file_clear_busy(nofp); + nofp = NULL; } + if (nfs_mount_state_in_use_end(nmp, error)) + goto restart; nfs_open_owner_rele(noop); } } @@ -1414,8 +1890,7 @@ nfs3_setattr_rpc( } nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); - error = nfs_request(np, NULL, &nmreq, NFSPROC_SETATTR, ctx, - &nmrep, &xid, &status); + error = nfs_request(np, NULL, &nmreq, NFSPROC_SETATTR, ctx, NULL, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; if (nfsvers == NFS_VER3) { @@ -1435,7 +1910,7 @@ nfs3_setattr_rpc( } else { if (!error) error = status; - nfsm_chain_loadattr(error, &nmrep, np, nfsvers, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, np, nfsvers, &xid); } /* * We just changed the attributes and we want to make sure that we @@ -1495,6 +1970,7 @@ nfs_vnop_lookup( *vpp = NULLVP; dnp = VTONFS(dvp); + NVATTR_INIT(&nvattr); mp = vnode_mount(dvp); nmp = VFSTONFS(mp); @@ -1503,12 +1979,12 @@ nfs_vnop_lookup( goto error_return; } nfsvers = nmp->nm_vers; - negnamecache = !(nmp->nm_flag & NFSMNT_NONEGNAMECACHE); + negnamecache = !NMFLAG(nmp, NONEGNAMECACHE); if ((error = busyerror = nfs_node_set_busy(dnp, vfs_context_thread(ctx)))) goto error_return; /* nfs_getattr() will check changed and purge caches */ - if ((error = nfs_getattr(dnp, &nvattr, ctx, NGA_CACHED))) + if ((error = nfs_getattr(dnp, NULL, ctx, NGA_CACHED))) goto error_return; error = cache_lookup(dvp, vpp, cnp); @@ -1518,7 +1994,7 @@ nfs_vnop_lookup( goto error_return; case 0: /* cache miss */ - if ((nfsvers > NFS_VER2) && (nmp->nm_flag & NFSMNT_RDIRPLUS)) { + if ((nfsvers > NFS_VER2) && NMFLAG(nmp, RDIRPLUS)) { /* if rdirplus, 
try dir buf cache lookup */ error = nfs_dir_buf_cache_lookup(dnp, &np, cnp, ctx, 0); if (!error && np) { @@ -1535,8 +2011,10 @@ nfs_vnop_lookup( OSAddAtomic(1, &nfsstats.lookupcache_hits); nfs_node_clear_busy(dnp); + busyerror = ENOENT; /* check for directory access */ + naa.a_desc = &vnop_access_desc; naa.a_vp = dvp; naa.a_action = KAUTH_VNODE_SEARCH; naa.a_context = ctx; @@ -1561,6 +2039,11 @@ nfs_vnop_lookup( fh.fh_len = 0; goto found; } + if ((nfsvers >= NFS_VER4) && (dnp->n_vattr.nva_flags & NFS_FFLAG_TRIGGER)) { + /* we should never be looking things up in a trigger directory, return nothing */ + error = ENOENT; + goto error_return; + } /* do we know this name is too long? */ nmp = VTONMP(dvp); @@ -1581,7 +2064,7 @@ nfs_vnop_lookup( error = nmp->nm_funcs->nf_lookup_rpc_async(dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx, &req); nfsmout_if(error); - error = nmp->nm_funcs->nf_lookup_rpc_async_finish(dnp, ctx, req, &xid, &fh, &nvattr); + error = nmp->nm_funcs->nf_lookup_rpc_async_finish(dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx, req, &xid, &fh, &nvattr); nfsmout_if(error); /* is the file handle the same as this directory's file handle? */ @@ -1620,7 +2103,7 @@ nfs_vnop_lookup( nfs_node_unlock(dnp); } else { ngflags = (cnp->cn_flags & MAKEENTRY) ? NG_MAKEENTRY : 0; - error = nfs_nget(mp, dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, ngflags, &np); + error = nfs_nget(mp, dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, rq.r_auth, ngflags, &np); if (error) goto error_return; newvp = NFSTOV(np); @@ -1647,6 +2130,7 @@ nfs_vnop_lookup( nfs_node_unlock(dnp); } error_return: + NVATTR_CLEANUP(&nvattr); if (!busyerror) nfs_node_clear_busy(dnp); if (error && *vpp) { @@ -1656,26 +2140,6 @@ nfs_vnop_lookup( return (error); } -/* - * NFS read call. - * Just call nfs_bioread() to do the work. 
- */ -int -nfs_vnop_read( - struct vnop_read_args /* { - struct vnodeop_desc *a_desc; - vnode_t a_vp; - struct uio *a_uio; - int a_ioflag; - vfs_context_t a_context; - } */ *ap) -{ - if (vnode_vtype(ap->a_vp) != VREG) - return (EPERM); - return (nfs_bioread(VTONFS(ap->a_vp), ap->a_uio, ap->a_ioflag, ap->a_context)); -} - - /* * NFS readlink call */ @@ -1694,7 +2158,6 @@ nfs_vnop_readlink( int error = 0, nfsvers; uint32_t buflen; uio_t uio = ap->a_uio; - struct nfs_vattr nvattr; struct nfsbuf *bp = NULL; if (vnode_vtype(ap->a_vp) != VLNK) @@ -1711,7 +2174,7 @@ nfs_vnop_readlink( nfsvers = nmp->nm_vers; /* nfs_getattr() will check changed and purge caches */ - if ((error = nfs_getattr(np, &nvattr, ctx, NGA_CACHED))) { + if ((error = nfs_getattr(np, NULL, ctx, NGA_CACHED))) { FSDBG(531, np, 0xd1e0001, 0, error); return (error); } @@ -1764,8 +2227,7 @@ nfs3_readlink_rpc(nfsnode_t np, char *buf, uint32_t *buflenp, vfs_context_t ctx) nfsm_chain_add_fh(error, &nmreq, nfsvers, np->n_fhp, np->n_fhsize); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); - error = nfs_request(np, NULL, &nmreq, NFSPROC_READLINK, ctx, - &nmrep, &xid, &status); + error = nfs_request(np, NULL, &nmreq, NFSPROC_READLINK, ctx, NULL, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; if (nfsvers == NFS_VER3) @@ -1827,6 +2289,10 @@ nfs_read_rpc(nfsnode_t np, uio_t uio, vfs_context_t ctx) while (tsiz > 0) { len = retlen = (tsiz > (user_ssize_t)nmrsize) ? 
nmrsize : (size_t)tsiz; FSDBG(536, np, txoffset, len, 0); + if (np->n_flag & NREVOKE) { + error = EIO; + break; + } if (nmp->nm_vers >= NFS_VER4) stategenid = nmp->nm_stategenid; error = nmp->nm_funcs->nf_read_rpc_async(np, txoffset, len, @@ -1836,16 +2302,19 @@ nfs_read_rpc(nfsnode_t np, uio_t uio, vfs_context_t ctx) if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && (++restart <= nfs_mount_state_max_restarts(nmp))) { /* guard against no progress */ lck_mtx_lock(&nmp->nm_lock); - if ((error != NFSERR_GRACE) && (stategenid == nmp->nm_stategenid) && !(nmp->nm_state & NFSSTA_RECOVER)) { - printf("nfs_read_rpc: error %d, initiating recovery\n", error); - nmp->nm_state |= NFSSTA_RECOVER; - nfs_mount_sock_thread_wake(nmp); + if ((error != NFSERR_GRACE) && (stategenid == nmp->nm_stategenid)) { + NP(np, "nfs_read_rpc: error %d, initiating recovery", error); + nfs_need_recover(nmp, error); } lck_mtx_unlock(&nmp->nm_lock); - if (error == NFSERR_GRACE) - tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); - if (!(error = nfs_mount_state_wait_for_recovery(nmp))) - continue; + if (np->n_flag & NREVOKE) { + error = EIO; + } else { + if (error == NFSERR_GRACE) + tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); + if (!(error = nfs_mount_state_wait_for_recovery(nmp))) + continue; + } } if (error) break; @@ -1894,7 +2363,7 @@ nfs3_read_rpc_async( } nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); - error = nfs_request_async(np, NULL, &nmreq, NFSPROC_READ, thd, cred, cb, reqp); + error = nfs_request_async(np, NULL, &nmreq, NFSPROC_READ, thd, cred, NULL, 0, cb, reqp); nfsmout: nfsm_chain_cleanup(&nmreq); return (error); @@ -1937,7 +2406,7 @@ nfs3_read_rpc_async_finish( nfsm_chain_adv(error, &nmrep, NFSX_UNSIGNED); nfsm_chain_get_32(error, &nmrep, eof); } else { - nfsm_chain_loadattr(error, &nmrep, np, nfsvers, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, np, nfsvers, &xid); } if (!lockerror) nfs_node_unlock(np); @@ -1980,7 +2449,6 @@ 
nfs_vnop_write( nfsnode_t np = VTONFS(vp); int ioflag = ap->a_ioflag; struct nfsbuf *bp; - struct nfs_vattr nvattr; struct nfsmount *nmp = VTONMP(vp); daddr64_t lbn; int biosize; @@ -2046,7 +2514,7 @@ nfs_vnop_write( if (ioflag & IO_APPEND) { nfs_data_unlock(np); /* nfs_getattr() will check changed and purge caches */ - error = nfs_getattr(np, &nvattr, ctx, NGA_UNCACHED); + error = nfs_getattr(np, NULL, ctx, NGA_UNCACHED); /* we'll be extending the file, so take the data lock exclusive */ nfs_data_lock(np, NFS_DATA_LOCK_EXCLUSIVE); if (error) { @@ -2272,12 +2740,14 @@ nfs_vnop_write( * * Notes: * We don't want to read anything we're just going to write over. + * We don't want to read anything we're just going drop when the + * I/O is complete (i.e. don't do reads for NOCACHE requests). * We don't want to issue multiple I/Os if we don't have to * (because they're synchronous rpcs). * We don't want to read anything we already have modified in the * page cache. */ - if (!ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, NB_CACHE) && (n < biosize)) { + if (!ISSET(bp->nb_flags, NB_CACHE) && (n < biosize)) { int firstpg, lastpg, dirtypg; int firstpgoff, lastpgoff; start = end = -1; @@ -2296,6 +2766,22 @@ nfs_vnop_write( start = (lastpg * PAGE_SIZE) + lastpgoff; end = (lastpg + 1) * PAGE_SIZE; } + if (ISSET(bp->nb_flags, NB_NOCACHE)) { + /* + * For nocache writes, if there is any partial page at the + * start or end of the write range, then we do the write + * synchronously to make sure that we can drop the data + * from the cache as soon as the WRITE finishes. Normally, + * we would do an unstable write and not drop the data until + * it was committed. But doing that here would risk allowing + * invalid data to be read from the cache between the WRITE + * and the COMMIT. 
+ * (NB_STABLE indicates that data writes should be FILESYNC) + */ + if (end > start) + SET(bp->nb_flags, NB_STABLE); + goto skipread; + } if (end > start) { /* need to read the data in range: start...end-1 */ @@ -2327,8 +2813,11 @@ nfs_vnop_write( uio_reset(auio, boff + start, UIO_SYSSPACE, UIO_READ); uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + start), on - start); error = nfs_read_rpc(np, auio, ctx); - if (error) /* couldn't read the data, so treat buffer as NOCACHE */ + if (error) { + /* couldn't read the data, so treat buffer as synchronous NOCACHE */ SET(bp->nb_flags, (NB_NOCACHE|NB_STABLE)); + goto skipread; + } if (uio_resid(auio) > 0) { FSDBG(516, bp, (caddr_t)uio_curriovbase(auio) - bp->nb_data, uio_resid(auio), 0xd00dee01); bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio)); @@ -2370,13 +2859,16 @@ nfs_vnop_write( FSDBG(516, bp, start, end - start, 0xd00dee00); bzero(bp->nb_data + start, end - start); error = 0; - } else if (!ISSET(bp->nb_flags, NB_NOCACHE)) { + } else { /* now we'll read the (rest of the) data */ uio_reset(auio, boff + start, UIO_SYSSPACE, UIO_READ); uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + start), end - start); error = nfs_read_rpc(np, auio, ctx); - if (error) /* couldn't read the data, so treat buffer as NOCACHE */ + if (error) { + /* couldn't read the data, so treat buffer as synchronous NOCACHE */ SET(bp->nb_flags, (NB_NOCACHE|NB_STABLE)); + goto skipread; + } if (uio_resid(auio) > 0) { FSDBG(516, bp, (caddr_t)uio_curriovbase(auio) - bp->nb_data, uio_resid(auio), 0xd00dee02); bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio)); @@ -2400,6 +2892,7 @@ nfs_vnop_write( /* Note: pages being written to will be validated when written */ } } +skipread: if (ISSET(bp->nb_flags, NB_ERROR)) { error = bp->nb_error; @@ -2554,6 +3047,10 @@ nfs_write_rpc2( while (tsiz > 0) { len = (tsiz > nmwsize) ? 
nmwsize : tsiz; FSDBG(537, np, uio_offset(uio), len, 0); + if (np->n_flag & NREVOKE) { + error = EIO; + break; + } if (nmp->nm_vers >= NFS_VER4) stategenid = nmp->nm_stategenid; error = nmp->nm_funcs->nf_write_rpc_async(np, uio, len, thd, cred, *iomodep, NULL, &req); @@ -2565,16 +3062,19 @@ nfs_write_rpc2( if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && (++restart <= nfs_mount_state_max_restarts(nmp))) { /* guard against no progress */ lck_mtx_lock(&nmp->nm_lock); - if ((error != NFSERR_GRACE) && (stategenid == nmp->nm_stategenid) && !(nmp->nm_state & NFSSTA_RECOVER)) { - printf("nfs_write_rpc: error %d, initiating recovery\n", error); - nmp->nm_state |= NFSSTA_RECOVER; - nfs_mount_sock_thread_wake(nmp); + if ((error != NFSERR_GRACE) && (stategenid == nmp->nm_stategenid)) { + NP(np, "nfs_write_rpc: error %d, initiating recovery", error); + nfs_need_recover(nmp, error); } lck_mtx_unlock(&nmp->nm_lock); - if (error == NFSERR_GRACE) - tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); - if (!(error = nfs_mount_state_wait_for_recovery(nmp))) - continue; + if (np->n_flag & NREVOKE) { + error = EIO; + } else { + if (error == NFSERR_GRACE) + tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); + if (!(error = nfs_mount_state_wait_for_recovery(nmp))) + continue; + } } if (error) break; @@ -2635,6 +3135,7 @@ nfs3_write_rpc_async( struct nfsreq **reqp) { struct nfsmount *nmp; + mount_t mp; int error = 0, nfsvers; struct nfsm_chain nmreq; @@ -2643,6 +3144,11 @@ nfs3_write_rpc_async( return (ENXIO); nfsvers = nmp->nm_vers; + /* for async mounts, don't bother sending sync write requests */ + if ((iomode != NFS_WRITE_UNSTABLE) && nfs_allow_async && + ((mp = NFSTOMP(np))) && (vfs_flags(mp) & MNT_ASYNC)) + iomode = NFS_WRITE_UNSTABLE; + nfsm_chain_null(&nmreq); nfsm_chain_build_alloc_init(error, &nmreq, NFSX_FH(nfsvers) + 5 * NFSX_UNSIGNED + nfsm_rndup(len)); @@ -2661,7 +3167,7 @@ nfs3_write_rpc_async( error = nfsm_chain_add_uio(&nmreq, uio, 
len); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); - error = nfs_request_async(np, NULL, &nmreq, NFSPROC_WRITE, thd, cred, cb, reqp); + error = nfs_request_async(np, NULL, &nmreq, NFSPROC_WRITE, thd, cred, NULL, 0, cb, reqp); nfsmout: nfsm_chain_cleanup(&nmreq); return (error); @@ -2727,7 +3233,7 @@ nfs3_write_rpc_async_finish( } else { if (!error) error = status; - nfsm_chain_loadattr(error, &nmrep, np, nfsvers, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, np, nfsvers, &xid); nfsmout_if(error); } if (updatemtime) @@ -2769,7 +3275,7 @@ nfs3_vnop_mknod( nfsnode_t np = NULL; struct nfsmount *nmp; nfsnode_t dnp = VTONFS(dvp); - struct nfs_vattr nvattr, dnvattr; + struct nfs_vattr nvattr; fhandle_t fh; int error = 0, lockerror = ENOENT, busyerror = ENOENT, status, wccpostattr = 0; struct timespec premtime = { 0, 0 }; @@ -2777,6 +3283,7 @@ nfs3_vnop_mknod( u_int64_t xid, dxid; int nfsvers, gotuid, gotgid; struct nfsm_chain nmreq, nmrep; + struct nfsreq rq, *req = &rq; nmp = VTONMP(dvp); if (!nmp) @@ -2797,6 +3304,8 @@ nfs3_vnop_mknod( if ((nfsvers == NFS_VER2) && (cnp->cn_namelen > NFS_MAXNAMLEN)) return (ENAMETOOLONG); + nfs_avoid_needless_id_setting_on_create(dnp, vap, ctx); + VATTR_SET_SUPPORTED(vap, va_mode); VATTR_SET_SUPPORTED(vap, va_uid); VATTR_SET_SUPPORTED(vap, va_gid); @@ -2813,7 +3322,7 @@ nfs3_vnop_mknod( NFSX_FH(nfsvers) + 4 * NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(nfsvers)); nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize); - nfsm_chain_add_string(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen); + nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp); if (nfsvers == NFS_VER3) { nfsm_chain_add_32(error, &nmreq, vtonfs_type(vap->va_type, nfsvers)); nfsm_chain_add_v3sattr(error, &nmreq, vap); @@ -2829,7 +3338,10 @@ nfs3_vnop_mknod( error = busyerror = nfs_node_set_busy(dnp, vfs_context_thread(ctx)); nfsmout_if(error); - error = nfs_request(dnp, NULL, &nmreq, NFSPROC_MKNOD, ctx, 
&nmrep, &xid, &status); + error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC_MKNOD, + vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, 0, NULL, &req); + if (!error) + error = nfs_request_async_finish(req, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(dnp))) error = lockerror; @@ -2857,11 +3369,11 @@ nfs3_vnop_mknod( NFS_CHANGED_UPDATE_NC(nfsvers, dnp, &dnp->n_vattr); nfs_node_unlock(dnp); /* nfs_getattr() will check changed and purge caches */ - nfs_getattr(dnp, &dnvattr, ctx, wccpostattr ? NGA_CACHED : NGA_UNCACHED); + nfs_getattr(dnp, NULL, ctx, wccpostattr ? NGA_CACHED : NGA_UNCACHED); } if (!error && fh.fh_len) - error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, NG_MAKEENTRY, &np); + error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, rq.r_auth, NG_MAKEENTRY, &np); if (!error && !np) error = nfs_lookitup(dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx, &np); if (!error && np) @@ -2870,7 +3382,7 @@ nfs3_vnop_mknod( nfs_node_clear_busy(dnp); if (!error && (gotuid || gotgid) && - (!newvp || nfs_getattrcache(np, &nvattr) || + (!newvp || nfs_getattrcache(np, &nvattr, 0) || (gotuid && (nvattr.nva_uid != vap->va_uid)) || (gotgid && (nvattr.nva_gid != vap->va_gid)))) { /* clear ID bits if server didn't use them (or we can't tell) */ @@ -2908,7 +3420,7 @@ nfs3_vnop_create( vnode_t dvp = ap->a_dvp; struct vnode_attr *vap = ap->a_vap; struct componentname *cnp = ap->a_cnp; - struct nfs_vattr nvattr, dnvattr; + struct nfs_vattr nvattr; fhandle_t fh; nfsnode_t np = NULL; struct nfsmount *nmp; @@ -2920,7 +3432,7 @@ nfs3_vnop_create( u_int64_t xid, dxid; uint32_t val; struct nfsm_chain nmreq, nmrep; - struct nfsreq *req; + struct nfsreq rq, *req = &rq; struct nfs_dulookup dul; nmp = VTONMP(dvp); @@ -2931,6 +3443,8 @@ nfs3_vnop_create( if ((nfsvers == NFS_VER2) && (cnp->cn_namelen > NFS_MAXNAMLEN)) return (ENAMETOOLONG); + nfs_avoid_needless_id_setting_on_create(dnp, vap, ctx); + VATTR_SET_SUPPORTED(vap, 
va_mode); VATTR_SET_SUPPORTED(vap, va_uid); VATTR_SET_SUPPORTED(vap, va_gid); @@ -2940,11 +3454,13 @@ nfs3_vnop_create( gotuid = VATTR_IS_ACTIVE(vap, va_uid); gotgid = VATTR_IS_ACTIVE(vap, va_gid); - if (vap->va_vaflags & VA_EXCLUSIVE) + if (vap->va_vaflags & VA_EXCLUSIVE) { fmode |= O_EXCL; + if (!VATTR_IS_ACTIVE(vap, va_access_time) || !VATTR_IS_ACTIVE(vap, va_modify_time)) + vap->va_vaflags |= VA_UTIMES_NULL; + } again: - req = NULL; error = busyerror = nfs_node_set_busy(dnp, vfs_context_thread(ctx)); nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx); @@ -2955,7 +3471,7 @@ nfs3_vnop_create( NFSX_FH(nfsvers) + 2 * NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(nfsvers)); nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize); - nfsm_chain_add_string(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen); + nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp); if (nfsvers == NFS_VER3) { if (fmode & O_EXCL) { nfsm_chain_add_32(error, &nmreq, NFS_CREATE_EXCLUSIVE); @@ -2979,7 +3495,7 @@ nfs3_vnop_create( nfsmout_if(error); error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC_CREATE, - vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, &req); + vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, 0, NULL, &req); if (!error) { nfs_dulookup_start(&dul, dnp, ctx); error = nfs_request_async_finish(req, &nmrep, &xid, &status); @@ -3010,11 +3526,11 @@ nfs3_vnop_create( NFS_CHANGED_UPDATE_NC(nfsvers, dnp, &dnp->n_vattr); nfs_node_unlock(dnp); /* nfs_getattr() will check changed and purge caches */ - nfs_getattr(dnp, &dnvattr, ctx, wccpostattr ? NGA_CACHED : NGA_UNCACHED); + nfs_getattr(dnp, NULL, ctx, wccpostattr ? 
NGA_CACHED : NGA_UNCACHED); } if (!error && fh.fh_len) - error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, NG_MAKEENTRY, &np); + error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, rq.r_auth, NG_MAKEENTRY, &np); if (!error && !np) error = nfs_lookitup(dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx, &np); if (!error && np) @@ -3051,7 +3567,7 @@ nfs3_vnop_create( if (!error) *ap->a_vpp = newvp; if (!error && (gotuid || gotgid) && - (!newvp || nfs_getattrcache(np, &nvattr) || + (!newvp || nfs_getattrcache(np, &nvattr, 0) || (gotuid && (nvattr.nva_uid != vap->va_uid)) || (gotgid && (nvattr.nva_gid != vap->va_gid)))) { /* clear ID bits if server didn't use them (or we can't tell) */ @@ -3091,7 +3607,7 @@ nfs_vnop_remove( struct componentname *cnp = ap->a_cnp; nfsnode_t dnp = VTONFS(dvp); nfsnode_t np = VTONFS(vp); - int error = 0, nfsvers, inuse, gotattr = 0, flushed = 0, setsize = 0; + int error = 0, nfsvers, namedattrs, inuse, gotattr = 0, flushed = 0, setsize = 0; struct nfs_vattr nvattr; struct nfsmount *nmp; struct nfs_dulookup dul; @@ -3102,6 +3618,7 @@ nfs_vnop_remove( if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; + namedattrs = (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR); again_relock: error = nfs_node_set_busy2(dnp, np, vfs_context_thread(ctx)); @@ -3117,7 +3634,8 @@ nfs_vnop_remove( np->n_hflag |= NHLOCKED; lck_mtx_unlock(nfs_node_hash_mutex); - nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx); + if (!namedattrs) + nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx); again: inuse = vnode_isinuse(vp, 0); if ((ap->a_flags & VNODE_REMOVE_NODELETEBUSY) && inuse) { @@ -3152,16 +3670,13 @@ nfs_vnop_remove( nfs_node_unlock(np); return (error); } + if (!namedattrs) + nfs_dulookup_finish(&dul, dnp, ctx); goto again_relock; } - if ((nmp->nm_vers >= NFS_VER4) && (np->n_openflags & N_DELEG_MASK)) { - lck_mtx_lock(&np->n_openlock); - np->n_openflags &= ~N_DELEG_MASK; - 
lck_mtx_unlock(&np->n_openlock); - nfs4_delegreturn_rpc(nmp, np->n_fhp, np->n_fhsize, &np->n_dstateid, - vfs_context_thread(ctx), vfs_context_ucred(ctx)); - } + if ((nmp->nm_vers >= NFS_VER4) && (np->n_openflags & N_DELEG_MASK)) + nfs4_delegation_return(np, 0, vfs_context_thread(ctx), vfs_context_ucred(ctx)); /* * Purge the name cache so that the chance of a lookup for @@ -3170,7 +3685,8 @@ nfs_vnop_remove( */ nfs_name_cache_purge(dnp, np, cnp, ctx); - nfs_dulookup_start(&dul, dnp, ctx); + if (!namedattrs) + nfs_dulookup_start(&dul, dnp, ctx); /* Do the rpc */ error = nmp->nm_funcs->nf_remove_rpc(dnp, cnp->cn_nameptr, cnp->cn_namelen, @@ -3213,7 +3729,8 @@ nfs_vnop_remove( nfs_node_unlock(np); } } else if (!np->n_sillyrename) { - nfs_dulookup_start(&dul, dnp, ctx); + if (!namedattrs) + nfs_dulookup_start(&dul, dnp, ctx); error = nfs_sillyrename(dnp, np, cnp, ctx); nfs_node_lock_force(np); NATTRINVALIDATE(np); @@ -3222,12 +3739,14 @@ nfs_vnop_remove( nfs_node_lock_force(np); NATTRINVALIDATE(np); nfs_node_unlock(np); - nfs_dulookup_start(&dul, dnp, ctx); + if (!namedattrs) + nfs_dulookup_start(&dul, dnp, ctx); } /* nfs_getattr() will check changed and purge caches */ - nfs_getattr(dnp, &nvattr, ctx, NGA_CACHED); - nfs_dulookup_finish(&dul, dnp, ctx); + nfs_getattr(dnp, NULL, ctx, NGA_CACHED); + if (!namedattrs) + nfs_dulookup_finish(&dul, dnp, ctx); out: /* unlock the node */ lck_mtx_lock(nfs_node_hash_mutex); @@ -3286,11 +3805,11 @@ nfs3_remove_rpc( nfsm_chain_build_alloc_init(error, &nmreq, NFSX_FH(nfsvers) + NFSX_UNSIGNED + nfsm_rndup(namelen)); nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize); - nfsm_chain_add_string(error, &nmreq, name, namelen); + nfsm_chain_add_name(error, &nmreq, name, namelen, nmp); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); - error = nfs_request2(dnp, NULL, &nmreq, NFSPROC_REMOVE, thd, cred, 0, &nmrep, &xid, &status); + error = nfs_request2(dnp, NULL, &nmreq, NFSPROC_REMOVE, thd, cred, NULL, 0, &nmrep, 
&xid, &status); if ((lockerror = nfs_node_lock(dnp))) error = lockerror; @@ -3398,11 +3917,7 @@ nfs_vnop_rename( tvp = NULL; } } else if (tvp && (nmp->nm_vers >= NFS_VER4) && (tnp->n_openflags & N_DELEG_MASK)) { - lck_mtx_lock(&tnp->n_openlock); - tnp->n_openflags &= ~N_DELEG_MASK; - lck_mtx_unlock(&tnp->n_openlock); - nfs4_delegreturn_rpc(nmp, tnp->n_fhp, tnp->n_fhsize, &tnp->n_dstateid, - vfs_context_thread(ctx), vfs_context_ucred(ctx)); + nfs4_delegation_return(tnp, 0, vfs_context_thread(ctx), vfs_context_ucred(ctx)); } error = nmp->nm_funcs->nf_rename_rpc(fdnp, fcnp->cn_nameptr, fcnp->cn_namelen, @@ -3417,7 +3932,7 @@ nfs_vnop_rename( if (tvp && (tvp != fvp) && !tnp->n_sillyrename) { nfs_node_lock_force(tnp); tvprecycle = (!error && !vnode_isinuse(tvp, 0) && - (nfs_getattrcache(tnp, &nvattr) || (nvattr.nva_nlink == 1))); + (nfs_getattrcache(tnp, &nvattr, 0) || (nvattr.nva_nlink == 1))); nfs_node_unlock(tnp); lck_mtx_lock(nfs_node_hash_mutex); if (tvprecycle && (tnp->n_hflag & NHHASHED)) { @@ -3474,8 +3989,8 @@ nfs_vnop_rename( } out: /* nfs_getattr() will check changed and purge caches */ - nfs_getattr(fdnp, &nvattr, ctx, NGA_CACHED); - nfs_getattr(tdnp, &nvattr, ctx, NGA_CACHED); + nfs_getattr(fdnp, NULL, ctx, NGA_CACHED); + nfs_getattr(tdnp, NULL, ctx, NGA_CACHED); if (locked) { /* unlock node */ lck_mtx_lock(nfs_node_hash_mutex); @@ -3525,13 +4040,13 @@ nfs3_rename_rpc( (NFSX_FH(nfsvers) + NFSX_UNSIGNED) * 2 + nfsm_rndup(fnamelen) + nfsm_rndup(tnamelen)); nfsm_chain_add_fh(error, &nmreq, nfsvers, fdnp->n_fhp, fdnp->n_fhsize); - nfsm_chain_add_string(error, &nmreq, fnameptr, fnamelen); + nfsm_chain_add_name(error, &nmreq, fnameptr, fnamelen, nmp); nfsm_chain_add_fh(error, &nmreq, nfsvers, tdnp->n_fhp, tdnp->n_fhsize); - nfsm_chain_add_string(error, &nmreq, tnameptr, tnamelen); + nfsm_chain_add_name(error, &nmreq, tnameptr, tnamelen, nmp); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); - error = nfs_request(fdnp, NULL, &nmreq, NFSPROC_RENAME, ctx, 
&nmrep, &xid, &status); + error = nfs_request(fdnp, NULL, &nmreq, NFSPROC_RENAME, ctx, NULL, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock2(fdnp, tdnp))) error = lockerror; @@ -3617,11 +4132,10 @@ nfs3_vnop_link( NFSX_FH(nfsvers)*2 + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen)); nfsm_chain_add_fh(error, &nmreq, nfsvers, np->n_fhp, np->n_fhsize); nfsm_chain_add_fh(error, &nmreq, nfsvers, tdnp->n_fhp, tdnp->n_fhsize); - nfsm_chain_add_string(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen); + nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); - error = nfs_request(np, NULL, &nmreq, NFSPROC_LINK, ctx, - &nmrep, &xid, &status); + error = nfs_request(np, NULL, &nmreq, NFSPROC_LINK, ctx, NULL, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock2(tdnp, np))) { error = lockerror; @@ -3680,7 +4194,7 @@ nfs3_vnop_symlink( vnode_t dvp = ap->a_dvp; struct vnode_attr *vap = ap->a_vap; struct componentname *cnp = ap->a_cnp; - struct nfs_vattr nvattr, dnvattr; + struct nfs_vattr nvattr; fhandle_t fh; int slen, error = 0, lockerror = ENOENT, busyerror = ENOENT, status, wccpostattr = 0; struct timespec premtime = { 0, 0 }; @@ -3691,7 +4205,7 @@ nfs3_vnop_symlink( nfsnode_t dnp = VTONFS(dvp); struct nfsmount *nmp; struct nfsm_chain nmreq, nmrep; - struct nfsreq *req = NULL; + struct nfsreq rq, *req = &rq; struct nfs_dulookup dul; nmp = VTONMP(dvp); @@ -3704,6 +4218,8 @@ nfs3_vnop_symlink( ((cnp->cn_namelen > NFS_MAXNAMLEN) || (slen > NFS_MAXPATHLEN))) return (ENAMETOOLONG); + nfs_avoid_needless_id_setting_on_create(dnp, vap, ctx); + VATTR_SET_SUPPORTED(vap, va_mode); VATTR_SET_SUPPORTED(vap, va_uid); VATTR_SET_SUPPORTED(vap, va_gid); @@ -3723,17 +4239,17 @@ nfs3_vnop_symlink( NFSX_FH(nfsvers) + 2 * NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen) + nfsm_rndup(slen) + NFSX_SATTR(nfsvers)); nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize); - nfsm_chain_add_string(error, &nmreq, 
cnp->cn_nameptr, cnp->cn_namelen); + nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp); if (nfsvers == NFS_VER3) nfsm_chain_add_v3sattr(error, &nmreq, vap); - nfsm_chain_add_string(error, &nmreq, ap->a_target, slen); + nfsm_chain_add_name(error, &nmreq, ap->a_target, slen, nmp); if (nfsvers == NFS_VER2) nfsm_chain_add_v2sattr(error, &nmreq, vap, -1); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC_SYMLINK, - vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, &req); + vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, 0, NULL, &req); if (!error) { nfs_dulookup_start(&dul, dnp, ctx); error = nfs_request_async_finish(req, &nmrep, &xid, &status); @@ -3767,11 +4283,11 @@ nfs3_vnop_symlink( NFS_CHANGED_UPDATE_NC(nfsvers, dnp, &dnp->n_vattr); nfs_node_unlock(dnp); /* nfs_getattr() will check changed and purge caches */ - nfs_getattr(dnp, &dnvattr, ctx, wccpostattr ? NGA_CACHED : NGA_UNCACHED); + nfs_getattr(dnp, NULL, ctx, wccpostattr ? 
NGA_CACHED : NGA_UNCACHED); } if (!error && fh.fh_len) - error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, NG_MAKEENTRY, &np); + error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, rq.r_auth, NG_MAKEENTRY, &np); if (!error && np) newvp = NFSTOV(np); @@ -3797,7 +4313,7 @@ nfs3_vnop_symlink( if (!busyerror) nfs_node_clear_busy(dnp); if (!error && (gotuid || gotgid) && - (!newvp || nfs_getattrcache(np, &nvattr) || + (!newvp || nfs_getattrcache(np, &nvattr, 0) || (gotuid && (nvattr.nva_uid != vap->va_uid)) || (gotgid && (nvattr.nva_gid != vap->va_gid)))) { /* clear ID bits if server didn't use them (or we can't tell) */ @@ -3834,7 +4350,7 @@ nfs3_vnop_mkdir( vnode_t dvp = ap->a_dvp; struct vnode_attr *vap = ap->a_vap; struct componentname *cnp = ap->a_cnp; - struct nfs_vattr nvattr, dnvattr; + struct nfs_vattr nvattr; nfsnode_t np = NULL; struct nfsmount *nmp; nfsnode_t dnp = VTONFS(dvp); @@ -3845,7 +4361,7 @@ nfs3_vnop_mkdir( u_int64_t xid, dxid; fhandle_t fh; struct nfsm_chain nmreq, nmrep; - struct nfsreq *req = NULL; + struct nfsreq rq, *req = &rq; struct nfs_dulookup dul; nmp = VTONMP(dvp); @@ -3855,6 +4371,8 @@ nfs3_vnop_mkdir( if ((nfsvers == NFS_VER2) && (cnp->cn_namelen > NFS_MAXNAMLEN)) return (ENAMETOOLONG); + nfs_avoid_needless_id_setting_on_create(dnp, vap, ctx); + VATTR_SET_SUPPORTED(vap, va_mode); VATTR_SET_SUPPORTED(vap, va_uid); VATTR_SET_SUPPORTED(vap, va_gid); @@ -3874,7 +4392,7 @@ nfs3_vnop_mkdir( NFSX_FH(nfsvers) + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(nfsvers)); nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize); - nfsm_chain_add_string(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen); + nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp); if (nfsvers == NFS_VER3) nfsm_chain_add_v3sattr(error, &nmreq, vap); else @@ -3883,7 +4401,7 @@ nfs3_vnop_mkdir( nfsmout_if(error); error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC_MKDIR, - 
vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, &req); + vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, 0, NULL, &req); if (!error) { nfs_dulookup_start(&dul, dnp, ctx); error = nfs_request_async_finish(req, &nmrep, &xid, &status); @@ -3914,11 +4432,11 @@ nfs3_vnop_mkdir( NFS_CHANGED_UPDATE_NC(nfsvers, dnp, &dnp->n_vattr); nfs_node_unlock(dnp); /* nfs_getattr() will check changed and purge caches */ - nfs_getattr(dnp, &dnvattr, ctx, wccpostattr ? NGA_CACHED : NGA_UNCACHED); + nfs_getattr(dnp, NULL, ctx, wccpostattr ? NGA_CACHED : NGA_UNCACHED); } if (!error && fh.fh_len) - error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, NG_MAKEENTRY, &np); + error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, rq.r_auth, NG_MAKEENTRY, &np); if (!error && np) newvp = NFSTOV(np); @@ -3944,7 +4462,7 @@ nfs3_vnop_mkdir( if (!busyerror) nfs_node_clear_busy(dnp); if (!error && (gotuid || gotgid) && - (!newvp || nfs_getattrcache(np, &nvattr) || + (!newvp || nfs_getattrcache(np, &nvattr, 0) || (gotuid && (nvattr.nva_uid != vap->va_uid)) || (gotgid && (nvattr.nva_gid != vap->va_gid)))) { /* clear ID bits if server didn't use them (or we can't tell) */ @@ -3985,11 +4503,10 @@ nfs3_vnop_rmdir( struct nfsmount *nmp; nfsnode_t np = VTONFS(vp); nfsnode_t dnp = VTONFS(dvp); - struct nfs_vattr dnvattr; int nfsvers; u_int64_t xid; struct nfsm_chain nmreq, nmrep; - struct nfsreq *req = NULL; + struct nfsreq rq, *req = &rq; struct nfs_dulookup dul; nmp = VTONMP(vp); @@ -4010,12 +4527,12 @@ nfs3_vnop_rmdir( nfsm_chain_build_alloc_init(error, &nmreq, NFSX_FH(nfsvers) + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen)); nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize); - nfsm_chain_add_string(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen); + nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); error = nfs_request_async(dnp, NULL, &nmreq, 
NFSPROC_RMDIR, - vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, &req); + vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, 0, NULL, &req); if (!error) { nfs_dulookup_start(&dul, dnp, ctx); error = nfs_request_async_finish(req, &nmrep, &xid, &status); @@ -4039,7 +4556,7 @@ nfs3_vnop_rmdir( nfs_node_unlock(dnp); nfs_name_cache_purge(dnp, np, cnp, ctx); /* nfs_getattr() will check changed and purge caches */ - nfs_getattr(dnp, &dnvattr, ctx, wccpostattr ? NGA_CACHED : NGA_UNCACHED); + nfs_getattr(dnp, NULL, ctx, wccpostattr ? NGA_CACHED : NGA_UNCACHED); } nfs_dulookup_finish(&dul, dnp, ctx); nfs_node_clear_busy2(dnp, np); @@ -4106,7 +4623,6 @@ nfs_vnop_readdir( struct nfsmount *nmp; uio_t uio = ap->a_uio; int error, nfsvers, extended, numdirent, bigcookies, ptc, done; - struct nfs_vattr nvattr; uint16_t i, iptc, rlen, nlen; uint64_t cookie, nextcookie, lbn = 0; struct nfsbuf *bp = NULL; @@ -4132,6 +4648,11 @@ nfs_vnop_readdir( if (uio_resid(uio) == 0) return (0); + if ((nfsvers >= NFS_VER4) && (dnp->n_vattr.nva_flags & NFS_FFLAG_TRIGGER)) { + /* trigger directories should never be read, return nothing */ + return (0); + } + thd = vfs_context_thread(ctx); numdirent = done = 0; nextcookie = uio_offset(uio); @@ -4164,7 +4685,7 @@ nfs_vnop_readdir( nfs_node_unlock(dnp); } /* nfs_getattr() will check changed and purge caches */ - if ((error = nfs_getattr(dnp, &nvattr, ctx, NGA_UNCACHED))) + if ((error = nfs_getattr(dnp, NULL, ctx, NGA_UNCACHED))) goto out; } else { nfs_node_unlock(dnp); @@ -4412,7 +4933,8 @@ int nfs_dir_cookie_to_lbn(nfsnode_t dnp, uint64_t cookie, int *ptc, uint64_t *lbnp) { struct nfsdmap *ndcc = dnp->n_cookiecache; - int8_t i, eofptc, iptc, found; + int8_t eofptc, found; + int i, iptc; struct nfsmount *nmp; struct nfsbuf *bp, *lastbp; struct nfsbuflists blist; @@ -4586,7 +5108,7 @@ nfs_dir_buf_search( nvattrp = NFS_DIR_BUF_NVATTR(bp, i); if ((ndbhp->ndbh_ncgen != bp->nb_np->n_ncgen) || (fhp->fh_len == 0) || (nvattrp->nva_type == VNON) || 
(nvattrp->nva_fileid == 0)) { - /* entry is no longer valid */ + /* entry is not valid */ error = ENOENT; break; } @@ -4633,7 +5155,7 @@ nfs_dir_buf_cache_lookup(nfsnode_t dnp, nfsnode_t *npp, struct componentname *cn { nfsnode_t newnp; struct nfsmount *nmp; - int error = 0, slpflag, slptimeo, i, found = 0, count = 0; + int error = 0, i, found = 0, count = 0; u_int64_t xid; struct nfs_vattr nvattr; fhandle_t fh; @@ -4646,8 +5168,6 @@ nfs_dir_buf_cache_lookup(nfsnode_t dnp, nfsnode_t *npp, struct componentname *cn if (!(nmp = NFSTONMP(dnp))) return (ENXIO); - slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0; - slptimeo = 0; if (!purge) *npp = NULL; @@ -4728,7 +5248,7 @@ nfs_dir_buf_cache_lookup(nfsnode_t dnp, nfsnode_t *npp, struct componentname *cn if (!error && found && !purge) { error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, - &nvattr, &xid, NG_MAKEENTRY, &newnp); + &nvattr, &xid, dnp->n_auth, NG_MAKEENTRY, &newnp); if (error) return (error); newnp->n_attrstamp = attrstamp; @@ -4762,7 +5282,7 @@ nfs_name_cache_purge(nfsnode_t dnp, nfsnode_t np, struct componentname *cnp, vfs struct nfsmount *nmp = NFSTONMP(dnp); cache_purge(NFSTOV(np)); - if (nmp && (nmp->nm_vers > NFS_VER2) && (nmp->nm_flag & NFSMNT_RDIRPLUS)) + if (nmp && (nmp->nm_vers > NFS_VER2) && NMFLAG(nmp, RDIRPLUS)) nfs_dir_buf_cache_lookup(dnp, NULL, cnp, ctx, 1); } @@ -4794,7 +5314,7 @@ nfs3_readdir_rpc(nfsnode_t dnp, struct nfsbuf *bp, vfs_context_t ctx) nmrsize = nmp->nm_rsize; bigcookies = nmp->nm_state & NFSSTA_BIGCOOKIES; noplus: - rdirplus = ((nfsvers > NFS_VER2) && (nmp->nm_flag & NFSMNT_RDIRPLUS)) ? 1 : 0; + rdirplus = ((nfsvers > NFS_VER2) && NMFLAG(nmp, RDIRPLUS)) ? 1 : 0; if ((lockerror = nfs_node_lock(dnp))) return (lockerror); @@ -4843,7 +5363,7 @@ nfs3_readdir_rpc(nfsnode_t dnp, struct nfsbuf *bp, vfs_context_t ctx) error = nfs_request(dnp, NULL, &nmreq, rdirplus ? 
NFSPROC_READDIRPLUS : NFSPROC_READDIR, - ctx, &nmrep, &xid, &status); + ctx, NULL, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(dnp))) error = lockerror; @@ -4864,7 +5384,7 @@ nfs3_readdir_rpc(nfsnode_t dnp, struct nfsbuf *bp, vfs_context_t ctx) if (error == NFSERR_NOTSUPP) { /* oops... it doesn't look like readdirplus is supported */ lck_mtx_lock(&nmp->nm_lock); - nmp->nm_flag &= ~NFSMNT_RDIRPLUS; + NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_RDIRPLUS); lck_mtx_unlock(&nmp->nm_lock); goto noplus; } @@ -5107,6 +5627,10 @@ nfs_sillyrename( /* now, do the rename */ error = nmp->nm_funcs->nf_rename_rpc(dnp, cnp->cn_nameptr, cnp->cn_namelen, dnp, nsp->nsr_name, nsp->nsr_namlen, ctx); + + /* Kludge: Map ENOENT => 0 assuming that it is a reply to a retry. */ + if (error == ENOENT) + error = 0; if (!error) { nfs_node_lock_force(dnp); if (dnp->n_flag & NNEGNCENTRIES) { @@ -5154,11 +5678,11 @@ nfs3_lookup_rpc_async( nfsm_chain_build_alloc_init(error, &nmreq, NFSX_FH(nfsvers) + NFSX_UNSIGNED + nfsm_rndup(namelen)); nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize); - nfsm_chain_add_string(error, &nmreq, name, namelen); + nfsm_chain_add_name(error, &nmreq, name, namelen, nmp); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC_LOOKUP, - vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, reqp); + vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, 0, NULL, reqp); nfsmout: nfsm_chain_cleanup(&nmreq); return (error); @@ -5167,6 +5691,8 @@ nfs3_lookup_rpc_async( int nfs3_lookup_rpc_async_finish( nfsnode_t dnp, + __unused char *name, + __unused int namelen, vfs_context_t ctx, struct nfsreq *req, u_int64_t *xidp, @@ -5206,7 +5732,7 @@ nfs3_lookup_rpc_async_finish( nfsm_chain_postop_attr_get(error, &nmrep, attrflag, nvap); nfsm_chain_postop_attr_update(error, &nmrep, dnp, &xid); if (!error && !attrflag) - error = nfs3_getattr_rpc(NULL, NFSTOMP(dnp), fhp->fh_data, fhp->fh_len, ctx, nvap, xidp); 
+ error = nfs3_getattr_rpc(NULL, NFSTOMP(dnp), fhp->fh_data, fhp->fh_len, 0, ctx, nvap, xidp); } else { error = nfs_parsefattr(&nmrep, nfsvers, nvap); } @@ -5249,6 +5775,8 @@ nfs_lookitup( (namelen > (int)nmp->nm_fsattr.nfsa_maxname)) return (ENAMETOOLONG); + NVATTR_INIT(&nvattr); + /* check for lookup of "." */ if ((name[0] == '.') && (namelen == 1)) { /* skip lookup, we know who we are */ @@ -5259,7 +5787,7 @@ nfs_lookitup( error = nmp->nm_funcs->nf_lookup_rpc_async(dnp, name, namelen, ctx, &req); nfsmout_if(error); - error = nmp->nm_funcs->nf_lookup_rpc_async_finish(dnp, ctx, req, &xid, &fh, &nvattr); + error = nmp->nm_funcs->nf_lookup_rpc_async_finish(dnp, name, namelen, ctx, req, &xid, &fh, &nvattr); nfsmout_if(!npp || error); if (*npp) { @@ -5299,7 +5827,7 @@ nfs_lookitup( cnp->cn_nameptr = name; cnp->cn_namelen = namelen; error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, - &nvattr, &xid, NG_MAKEENTRY, &np); + &nvattr, &xid, rq.r_auth, NG_MAKEENTRY, &np); nfsmout_if(error); newnp = np; } @@ -5307,6 +5835,7 @@ nfs_lookitup( nfsmout: if (npp && !*npp && !error) *npp = newnp; + NVATTR_CLEANUP(&nvattr); return (error); } @@ -5319,11 +5848,14 @@ nfs_dulookup_init(struct nfs_dulookup *dulp, nfsnode_t dnp, const char *name, in { int error, du_namelen; vnode_t du_vp; + struct nfsmount *nmp = NFSTONMP(dnp); /* check for ._ file in name cache */ dulp->du_flags = 0; bzero(&dulp->du_cn, sizeof(dulp->du_cn)); du_namelen = namelen + 2; + if (!nmp || NMFLAG(nmp, NONEGNAMECACHE)) + return; if ((namelen >= 2) && (name[0] == '.') && (name[1] == '_')) return; if (du_namelen >= (int)sizeof(dulp->du_smallname)) @@ -5342,8 +5874,8 @@ nfs_dulookup_init(struct nfs_dulookup *dulp, nfsnode_t dnp, const char *name, in if (error == -1) { vnode_put(du_vp); } else if (!error) { - struct nfsmount *nmp = NFSTONMP(dnp); - if (nmp && (nmp->nm_vers > NFS_VER2) && (nmp->nm_flag & NFSMNT_RDIRPLUS)) { + nmp = NFSTONMP(dnp); + if (nmp && (nmp->nm_vers > NFS_VER2) && NMFLAG(nmp, 
RDIRPLUS)) { /* if rdirplus, try dir buf cache lookup */ nfsnode_t du_np = NULL; if (!nfs_dir_buf_cache_lookup(dnp, &du_np, &dulp->du_cn, ctx, 0) && du_np) { @@ -5367,7 +5899,7 @@ nfs_dulookup_start(struct nfs_dulookup *dulp, nfsnode_t dnp, vfs_context_t ctx) struct nfsmount *nmp = NFSTONMP(dnp); struct nfsreq *req = &dulp->du_req; - if (!nmp || !(dulp->du_flags & NFS_DULOOKUP_DOIT)) + if (!nmp || !(dulp->du_flags & NFS_DULOOKUP_DOIT) || (dulp->du_flags & NFS_DULOOKUP_INPROG)) return; if (!nmp->nm_funcs->nf_lookup_rpc_async(dnp, dulp->du_cn.cn_nameptr, dulp->du_cn.cn_namelen, ctx, &req)) @@ -5390,7 +5922,9 @@ nfs_dulookup_finish(struct nfs_dulookup *dulp, nfsnode_t dnp, vfs_context_t ctx) if (!nmp || !(dulp->du_flags & NFS_DULOOKUP_INPROG)) goto out; - error = nmp->nm_funcs->nf_lookup_rpc_async_finish(dnp, ctx, &dulp->du_req, &xid, &fh, &nvattr); + NVATTR_INIT(&nvattr); + error = nmp->nm_funcs->nf_lookup_rpc_async_finish(dnp, dulp->du_cn.cn_nameptr, + dulp->du_cn.cn_namelen, ctx, &dulp->du_req, &xid, &fh, &nvattr); dulp->du_flags &= ~NFS_DULOOKUP_INPROG; if (error == ENOENT) { /* add a negative entry in the name cache */ @@ -5400,12 +5934,13 @@ nfs_dulookup_finish(struct nfs_dulookup *dulp, nfsnode_t dnp, vfs_context_t ctx) nfs_node_unlock(dnp); } else if (!error) { error = nfs_nget(NFSTOMP(dnp), dnp, &dulp->du_cn, fh.fh_data, fh.fh_len, - &nvattr, &xid, NG_MAKEENTRY, &du_np); + &nvattr, &xid, dulp->du_req.r_auth, NG_MAKEENTRY, &du_np); if (!error) { nfs_node_unlock(du_np); vnode_put(NFSTOV(du_np)); } } + NVATTR_CLEANUP(&nvattr); out: if (dulp->du_flags & NFS_DULOOKUP_INPROG) nfs_request_async_cancel(&dulp->du_req); @@ -5420,14 +5955,15 @@ nfs_dulookup_finish(struct nfs_dulookup *dulp, nfsnode_t dnp, vfs_context_t ctx) int nfs3_commit_rpc( nfsnode_t np, - u_int64_t offset, - u_int64_t count, - kauth_cred_t cred) + uint64_t offset, + uint64_t count, + kauth_cred_t cred, + uint64_t wverf) { struct nfsmount *nmp; int error = 0, lockerror, status, wccpostattr = 0, 
nfsvers; struct timespec premtime = { 0, 0 }; - u_int64_t xid, wverf; + u_int64_t xid, newwverf; uint32_t count32; struct nfsm_chain nmreq, nmrep; @@ -5454,7 +5990,7 @@ nfs3_commit_rpc( nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); error = nfs_request2(np, NULL, &nmreq, NFSPROC_COMMIT, - current_thread(), cred, 0, &nmrep, &xid, &status); + current_thread(), cred, NULL, 0, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; /* can we do anything useful with the wcc info? */ @@ -5463,13 +5999,13 @@ nfs3_commit_rpc( nfs_node_unlock(np); if (!error) error = status; - nfsm_chain_get_64(error, &nmrep, wverf); + nfsm_chain_get_64(error, &nmrep, newwverf); nfsmout_if(error); lck_mtx_lock(&nmp->nm_lock); - if (nmp->nm_verf != wverf) { - nmp->nm_verf = wverf; + if (nmp->nm_verf != newwverf) + nmp->nm_verf = newwverf; + if (wverf != newwverf) error = NFSERR_STALEWRITEVERF; - } lck_mtx_unlock(&nmp->nm_lock); nfsmout: nfsm_chain_cleanup(&nmreq); @@ -5494,23 +6030,6 @@ nfs_vnop_blockmap( return (ENOTSUP); } -/* - * Mmap a file - * - * NB Currently unsupported. - */ -/*ARGSUSED*/ -int -nfs_vnop_mmap( - __unused struct vnop_mmap_args /* { - struct vnodeop_desc *a_desc; - vnode_t a_vp; - int a_fflags; - vfs_context_t a_context; - } */ *ap) -{ - return (EINVAL); -} /* * fsync vnode op. Just call nfs_flush(). @@ -5556,8 +6075,7 @@ nfs3_pathconf_rpc( nfsm_chain_add_fh(error, &nmreq, nfsvers, np->n_fhp, np->n_fhsize); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); - error = nfs_request(np, NULL, &nmreq, NFSPROC_PATHCONF, ctx, - &nmrep, &xid, &status); + error = nfs_request(np, NULL, &nmreq, NFSPROC_PATHCONF, ctx, NULL, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; nfsm_chain_postop_attr_update(error, &nmrep, np, &xid); @@ -5653,6 +6171,12 @@ nfs_vnop_pathconf( return (0); } break; + case _PC_XATTR_SIZE_BITS: + /* Do we support xattrs natively? 
*/ + if (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR) + break; /* Yes */ + /* No... so just return an error */ + /* FALLTHROUGH */ default: /* don't bother contacting the server if we know the answer */ return (EINVAL); @@ -5738,6 +6262,7 @@ nfs_vnop_pathconf( else error = EINVAL; break; + case _PC_XATTR_SIZE_BITS: /* same as file size bits if named attrs supported */ case _PC_FILESIZEBITS: if (!NFS_BITMAP_ISSET(nfsap->nfsa_bitmap, NFS_FATTR_MAXFILESIZE)) { *ap->a_retval = 64; @@ -6007,7 +6532,7 @@ nfsfifo_vnop_close( /*ARGSUSED*/ int nfs_vnop_ioctl( - __unused struct vnop_ioctl_args /* { + struct vnop_ioctl_args /* { struct vnodeop_desc *a_desc; vnode_t a_vp; u_int32_t a_command; @@ -6016,12 +6541,23 @@ nfs_vnop_ioctl( vfs_context_t a_context; } */ *ap) { + vfs_context_t ctx = ap->a_context; + vnode_t vp = ap->a_vp; + int error = ENOTTY; - /* - * XXX we were once bogusly enoictl() which returned this (ENOTTY). - * Probably we should return ENODEV. - */ - return (ENOTTY); + switch (ap->a_command) { + + case F_FULLFSYNC: + if (vnode_vfsisrdonly(vp)) + return (EROFS); + if (!VTONMP(vp)) + return (ENXIO); + error = nfs_flush(VTONFS(vp), MNT_WAIT, vfs_context_thread(ctx), 0); + break; + + } + + return (error); } /*ARGSUSED*/ @@ -6135,6 +6671,10 @@ nfs_vnop_pagein( bzero(req, sizeof(req)); nextsend = nextwait = 0; do { + if (np->n_flag & NREVOKE) { + error = EIO; + break; + } /* send requests while we need to and have available slots */ while ((txsize > 0) && (req[nextsend] == NULL)) { iosize = MIN(nmrsize, txsize); @@ -6161,14 +6701,11 @@ nfs_vnop_pagein( nextwait = (nextwait + 1) % MAXPAGINGREQS; if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error)) { lck_mtx_lock(&nmp->nm_lock); - if ((error != NFSERR_GRACE) && (stategenid == nmp->nm_stategenid) && !(nmp->nm_state & NFSSTA_RECOVER)) { - printf("nfs_vnop_pagein: error %d, initiating recovery\n", error); - nmp->nm_state |= NFSSTA_RECOVER; - nfs_mount_sock_thread_wake(nmp); + if ((error != 
NFSERR_GRACE) && (stategenid == nmp->nm_stategenid)) { + NP(np, "nfs_vnop_pagein: error %d, initiating recovery", error); + nfs_need_recover(nmp, error); } lck_mtx_unlock(&nmp->nm_lock); - if (error == NFSERR_GRACE) - tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); restart++; goto cancel; } @@ -6200,11 +6737,17 @@ nfs_vnop_pagein( req[nextwait] = NULL; nextwait = (nextwait + 1) % MAXPAGINGREQS; } - if (restart) { - if ((restart <= nfs_mount_state_max_restarts(nmp)) && /* guard against no progress */ - (!(error = nfs_mount_state_wait_for_recovery(nmp)))) - goto tryagain; - printf("nfs_pagein: too many restarts, aborting.\n"); + if (np->n_flag & NREVOKE) { + error = EIO; + } else if (restart) { + if (restart <= nfs_mount_state_max_restarts(nmp)) { /* guard against no progress */ + if (error == NFSERR_GRACE) + tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); + if (!(error = nfs_mount_state_wait_for_recovery(nmp))) + goto tryagain; + } else { + NP(np, "nfs_pagein: too many restarts, aborting"); + } } } @@ -6579,6 +7122,10 @@ nfs_vnop_pageout( bzero(req, sizeof(req)); nextsend = nextwait = 0; do { + if (np->n_flag & NREVOKE) { + error = EIO; + break; + } /* send requests while we need to and have available slots */ while ((txsize > 0) && (req[nextsend] == NULL)) { iosize = MIN(nmwsize, txsize); @@ -6616,14 +7163,11 @@ nfs_vnop_pageout( nfs_node_unlock(np); if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error)) { lck_mtx_lock(&nmp->nm_lock); - if ((error != NFSERR_GRACE) && (stategenid == nmp->nm_stategenid) && !(nmp->nm_state & NFSSTA_RECOVER)) { - printf("nfs_vnop_pageout: error %d, initiating recovery\n", error); - nmp->nm_state |= NFSSTA_RECOVER; - nfs_mount_sock_thread_wake(nmp); + if ((error != NFSERR_GRACE) && (stategenid == nmp->nm_stategenid)) { + NP(np, "nfs_vnop_pageout: error %d, initiating recovery", error); + nfs_need_recover(nmp, error); } lck_mtx_unlock(&nmp->nm_lock); - if (error == NFSERR_GRACE) - tsleep(&nmp->nm_state, 
(PZERO-1), "nfsgrace", 2*hz); restart = 1; goto cancel; } @@ -6654,16 +7198,13 @@ nfs_vnop_pageout( iomode = NFS_WRITE_UNSTABLE; error = nfs_write_rpc2(np, auio, thd, cred, &iomode, &wverf2); if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error)) { - printf("nfs_vnop_pageout: restart: error %d\n", error); + NP(np, "nfs_vnop_pageout: restart: error %d", error); lck_mtx_lock(&nmp->nm_lock); - if ((error != NFSERR_GRACE) && (stategenid == nmp->nm_stategenid) && !(nmp->nm_state & NFSSTA_RECOVER)) { - printf("nfs_vnop_pageout: error %d, initiating recovery\n", error); - nmp->nm_state |= NFSSTA_RECOVER; - nfs_mount_sock_thread_wake(nmp); + if ((error != NFSERR_GRACE) && (stategenid == nmp->nm_stategenid)) { + NP(np, "nfs_vnop_pageout: error %d, initiating recovery", error); + nfs_need_recover(nmp, error); } lck_mtx_unlock(&nmp->nm_lock); - if (error == NFSERR_GRACE) - tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); restart = 1; goto cancel; } @@ -6690,7 +7231,7 @@ nfs_vnop_pageout( vrestart = 0; if (!error && (commit != NFS_WRITE_FILESYNC)) { - error = nmp->nm_funcs->nf_commit_rpc(np, f_offset, xsize, cred); + error = nmp->nm_funcs->nf_commit_rpc(np, f_offset, xsize, cred, wverf); if (error == NFSERR_STALEWRITEVERF) { vrestart = 1; error = EIO; @@ -6709,18 +7250,26 @@ nfs_vnop_pageout( np->n_numoutput--; nfs_node_unlock(np); } - if (vrestart) { - if (++vrestarts <= 100) /* guard against no progress */ - goto tryagain; - printf("nfs_pageout: too many restarts, aborting.\n"); - FSDBG(323, f_offset, xsize, ERESTART, -1); - } - if (restart) { - if ((restarts <= nfs_mount_state_max_restarts(nmp)) && /* guard against no progress */ - (!(error = nfs_mount_state_wait_for_recovery(nmp)))) - goto tryagain; - printf("nfs_pageout: too many restarts, aborting.\n"); - FSDBG(323, f_offset, xsize, ERESTART, -1); + if (np->n_flag & NREVOKE) { + error = EIO; + } else { + if (vrestart) { + if (++vrestarts <= 100) /* guard against no progress */ + goto tryagain; 
+ NP(np, "nfs_pageout: too many restarts, aborting"); + FSDBG(323, f_offset, xsize, ERESTART, -1); + } + if (restart) { + if (restarts <= nfs_mount_state_max_restarts(nmp)) { /* guard against no progress */ + if (error == NFSERR_GRACE) + tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); + if (!(error = nfs_mount_state_wait_for_recovery(nmp))) + goto tryagain; + } else { + NP(np, "nfs_pageout: too many restarts, aborting"); + FSDBG(323, f_offset, xsize, ERESTART, -1); + } + } } } @@ -6762,7 +7311,7 @@ nfs_vnop_pageout( abortflags = UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY; if (error <= NFS_ELAST) { if ((errorcount[error] % 100) == 0) - printf("nfs_pageout: unexpected error %d. dumping vm page\n", error); + NP(np, "nfs_pageout: unexpected error %d. dumping vm page", error); errorcount[error]++; } break; @@ -6776,7 +7325,7 @@ nfs_vnop_pageout( break; case SEVER: /* not implemented */ default: - printf("nfs_pageout: action %d not expected\n", action); + NP(np, "nfs_pageout: action %d not expected", action); break; } @@ -6837,3 +7386,84 @@ nfs_vnop_offtoblk( return (0); } +/* + * vnode change monitoring + */ +int +nfs_vnop_monitor( + struct vnop_monitor_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + uint32_t a_events; + uint32_t a_flags; + void *a_handle; + vfs_context_t a_context; + } */ *ap) +{ + nfsnode_t np = VTONFS(ap->a_vp); + struct nfsmount *nmp = VTONMP(ap->a_vp); + int error = 0; + + if (!nmp) + return (ENXIO); + + /* make sure that the vnode's monitoring status is up to date */ + lck_mtx_lock(&nmp->nm_lock); + if (vnode_ismonitored(ap->a_vp)) { + /* This vnode is currently being monitored, make sure we're tracking it. */ + if (np->n_monlink.le_next == NFSNOLIST) { + LIST_INSERT_HEAD(&nmp->nm_monlist, np, n_monlink); + nfs_mount_sock_thread_wake(nmp); + } + } else { + /* This vnode is no longer being monitored, make sure we're not tracking it. */ + /* Wait for any in-progress getattr to complete first. 
*/ + while (np->n_mflag & NMMONSCANINPROG) { + struct timespec ts = { 1, 0 }; + np->n_mflag |= NMMONSCANWANT; + msleep(&np->n_mflag, &nmp->nm_lock, PZERO-1, "nfswaitmonscan", &ts); + } + if (np->n_monlink.le_next != NFSNOLIST) { + LIST_REMOVE(np, n_monlink); + np->n_monlink.le_next = NFSNOLIST; + } + } + lck_mtx_unlock(&nmp->nm_lock); + + return (error); +} + +/* + * Send a vnode notification for the given events. + */ +void +nfs_vnode_notify(nfsnode_t np, uint32_t events) +{ + struct nfsmount *nmp = NFSTONMP(np); + struct nfs_vattr nvattr; + struct vnode_attr vattr, *vap = NULL; + struct timeval now; + + microuptime(&now); + if ((np->n_evtstamp == now.tv_sec) || !nmp) { + /* delay sending this notify */ + np->n_events |= events; + return; + } + events |= np->n_events; + np->n_events = 0; + np->n_evtstamp = now.tv_sec; + + vfs_get_notify_attributes(&vattr); + if (!nfs_getattrcache(np, &nvattr, 0)) { + vap = &vattr; + VATTR_INIT(vap); + VATTR_RETURN(vap, va_fsid, vfs_statfs(nmp->nm_mountp)->f_fsid.val[0]); + VATTR_RETURN(vap, va_fileid, nvattr.nva_fileid); + VATTR_RETURN(vap, va_mode, nvattr.nva_mode); + VATTR_RETURN(vap, va_uid, nvattr.nva_uid); + VATTR_RETURN(vap, va_gid, nvattr.nva_gid); + VATTR_RETURN(vap, va_nlink, nvattr.nva_nlink); + } + vnode_notify(NFSTOV(np), events, vap); +} diff --git a/bsd/nfs/nfsm_subs.h b/bsd/nfs/nfsm_subs.h index 910636f85..434d4f57a 100644 --- a/bsd/nfs/nfsm_subs.h +++ b/bsd/nfs/nfsm_subs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -73,8 +73,8 @@ #ifdef __APPLE_API_PRIVATE -int nfsm_rpchead(struct nfsreq *, int, mbuf_t, u_int64_t *, mbuf_t *); -int nfsm_rpchead2(int, int, int, int, int, int, kauth_cred_t, struct nfsreq *, mbuf_t, u_int64_t *, mbuf_t *); +int nfsm_rpchead(struct nfsreq *, mbuf_t, u_int64_t *, mbuf_t *); +int nfsm_rpchead2(struct nfsmount *, int, int, int, int, int, kauth_cred_t, struct nfsreq *, mbuf_t, u_int64_t *, mbuf_t *); int nfsm_chain_new_mbuf(struct nfsm_chain *, size_t); int nfsm_chain_add_opaque_f(struct nfsm_chain *, const u_char *, uint32_t); @@ -83,6 +83,7 @@ int nfsm_chain_add_uio(struct nfsm_chain *, uio_t, uint32_t); int nfsm_chain_add_fattr4_f(struct nfsm_chain *, struct vnode_attr *, struct nfsmount *); int nfsm_chain_add_v2sattr_f(struct nfsm_chain *, struct vnode_attr *, uint32_t); int nfsm_chain_add_v3sattr_f(struct nfsm_chain *, struct vnode_attr *); +int nfsm_chain_add_string_nfc(struct nfsm_chain *, const uint8_t *, uint32_t); int nfsm_chain_advance(struct nfsm_chain *, uint32_t); int nfsm_chain_offset(struct nfsm_chain *); @@ -93,6 +94,7 @@ int nfsm_chain_get_uio(struct nfsm_chain *, uint32_t, uio_t); int nfsm_chain_get_fh_attr(struct nfsm_chain *, nfsnode_t, vfs_context_t, int, uint64_t *, fhandle_t *, struct nfs_vattr *); int nfsm_chain_get_wcc_data_f(struct nfsm_chain *, nfsnode_t, struct timespec *, int *, u_int64_t *); +int nfsm_chain_get_secinfo(struct nfsm_chain *, uint32_t *, int *); #if NFSSERVER void nfsm_adj(mbuf_t, int, int); @@ -339,6 +341,16 @@ int nfsm_chain_trim_data(struct nfsm_chain *, int, int *); nfsm_chain_add_opaque((E), (NMC), (STR), (LEN)); \ } while (0) +/* add a name to an mbuf chain */ +#define nfsm_chain_add_name(E, NMC, STR, LEN, NMP) \ + do { \ + if (E) break; \ + if (NMFLAG((NMP), NFC)) \ + (E) = nfsm_chain_add_string_nfc((NMC), (const uint8_t*)(STR), (LEN)); \ + else \ + nfsm_chain_add_string((E), (NMC), (STR), (LEN)); \ + } while (0) + /* add an NFSv2 time to an mbuf 
chain */ #define nfsm_chain_add_v2time(E, NMC, TVP) \ do { \ @@ -454,6 +466,36 @@ int nfsm_chain_trim_data(struct nfsm_chain *, int, int *); nfsm_chain_add_32((E), (NMC), ((B)[__i] & (MASK)[__i])); \ } while (0) +/* add NFSv4 attr bitmap masked with the supported attributes for this mount/node */ +#define nfsm_chain_add_bitmap_supported(E, NMC, B, NMP, NP) \ + do { \ + uint32_t __bitmap[NFS_ATTR_BITMAP_LEN], *__bmp = (B); \ + int __nonamedattr = 0, __noacl = 0, __nomode = 0; \ + if (!((NMP)->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR) || \ + ((NP) && (((nfsnode_t)(NP))->n_flag & (NISDOTZFS|NISDOTZFSCHILD)))) \ + __nonamedattr = 1; \ + if (!((NMP)->nm_fsattr.nfsa_flags & NFS_FSFLAG_ACL)) \ + __noacl = 1; \ + if (NMFLAG((NMP), ACLONLY)) \ + __nomode = 1; \ + if (__nonamedattr || __noacl || __nomode) { \ + /* don't ask for attrs we're not supporting */ \ + /* some ".zfs" directories can't handle being asked for some attributes */ \ + int __ii; \ + NFS_CLEAR_ATTRIBUTES(__bitmap); \ + for (__ii=0; __ii < NFS_ATTR_BITMAP_LEN; __ii++) \ + __bitmap[__ii] = (B)[__ii]; \ + if (__nonamedattr) \ + NFS_BITMAP_CLR(__bitmap, NFS_FATTR_NAMED_ATTR); \ + if (__noacl) \ + NFS_BITMAP_CLR(__bitmap, NFS_FATTR_ACL); \ + if (__nomode) \ + NFS_BITMAP_CLR(__bitmap, NFS_FATTR_MODE); \ + __bmp = __bitmap; \ + } \ + nfsm_chain_add_bitmap_masked((E), (NMC), __bmp, NFS_ATTR_BITMAP_LEN, (NMP)->nm_fsattr.nfsa_supp_attr); \ + } while (0) + /* Add an NFSv4 "stateid" structure to an mbuf chain */ #define nfsm_chain_add_stateid(E, NMC, SID) \ do { \ @@ -642,19 +684,18 @@ int nfsm_chain_trim_data(struct nfsm_chain *, int, int *); } while (0) /* update a node's attribute cache with attributes from an mbuf chain */ -#define nfsm_chain_loadattr(E, NMC, NP, VERS, A, X) \ +#define nfsm_chain_loadattr(E, NMC, NP, VERS, X) \ do { \ - struct nfs_vattr ttvattr, *ttnvap; \ + struct nfs_vattr ttvattr; \ if (E) break; \ - ttnvap = (A) ? 
(A) : &ttvattr; \ if ((VERS) == NFS_VER4) { \ - NFS_CLEAR_ATTRIBUTES(ttnvap->nva_bitmap); \ - (E) = nfs4_parsefattr((NMC), NULL, ttnvap, NULL, NULL); \ + (E) = nfs4_parsefattr((NMC), NULL, &ttvattr, NULL, NULL, NULL); \ } else { \ - (E) = nfs_parsefattr((NMC), (VERS), ttnvap); \ + (E) = nfs_parsefattr((NMC), (VERS), &ttvattr); \ } \ - if (E) break; \ - (E) = nfs_loadattrcache((NP), ttnvap, (X), 0); \ + if (!(E) && (NP)) \ + (E) = nfs_loadattrcache((NP), &ttvattr, (X), 0); \ + NVATTR_CLEANUP(&ttvattr); \ } while (0) /* get NFSv4 attr bitmap */ @@ -693,7 +734,8 @@ int nfsm_chain_trim_data(struct nfsm_chain *, int, int *); do { \ uint32_t __val = 0; \ nfsm_chain_get_32((E), (NMC), __val); \ - nfsm_assert((E), (__val == (OP)), EBADRPC); \ + /* [sigh] some implementations return the "illegal" op for unsupported ops */ \ + nfsm_assert((E), ((__val == (OP)) || (__val == NFS_OP_ILLEGAL)), EBADRPC); \ nfsm_chain_get_32((E), (NMC), __val); \ nfsm_assert((E), (__val == NFS_OK), __val); \ } while (0) @@ -705,7 +747,7 @@ int nfsm_chain_trim_data(struct nfsm_chain *, int, int *); nfsm_chain_get_32((E), (NMC), __ci_atomic); \ nfsm_chain_get_64((E), (NMC), __ci_before); \ nfsm_chain_get_64((E), (NMC), __ci_after); \ - if (E) break; \ + if ((E) || !(DNP)) break; \ if (__ci_atomic && (__ci_before == (DNP)->n_ncchange)) { \ (DNP)->n_ncchange = __ci_after; \ } else { \ diff --git a/bsd/nfs/nfsmount.h b/bsd/nfs/nfsmount.h index 742c166c5..97f955e2f 100644 --- a/bsd/nfs/nfsmount.h +++ b/bsd/nfs/nfsmount.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -105,29 +105,130 @@ struct nfs_fsattr { #define NFS_FSFLAG_CHOWN_RESTRICTED 0x00000080 #define NFS_FSFLAG_HOMOGENEOUS 0x00000100 #define NFS_FSFLAG_NO_TRUNC 0x00000200 +#define NFS_FSFLAG_NAMED_ATTR 0x00000400 #define NFS_FSFLAG_FHTYPE_MASK 0xFF000000 #define NFS_FSFLAG_FHTYPE_SHIFT 24 +/* + * NFS file system location structures + */ +struct nfs_fs_server { + char * ns_name; /* name of server */ + char ** ns_addresses; /* array of addresses for server */ + uint32_t ns_addrcount; /* # of addresses */ +}; +struct nfs_fs_path { + char ** np_components; /* array of component pointers */ + uint32_t np_compcount; /* # components in path */ +}; +struct nfs_fs_location { + struct nfs_fs_server ** nl_servers; /* array of server pointers */ + struct nfs_fs_path nl_path; /* file system path */ + uint32_t nl_servcount; /* # of servers */ +}; + +struct nfs_location_index { + uint8_t nli_flags; /* misc flags */ + uint8_t nli_loc; /* location index */ + uint8_t nli_serv; /* server index */ + uint8_t nli_addr; /* address index */ +}; +#define NLI_VALID 0x01 /* index is valid */ + +struct nfs_fs_locations { + struct nfs_fs_path nl_root; /* current server's root file system path */ + uint32_t nl_numlocs; /* # of locations */ + struct nfs_location_index nl_current; /* index of current location/server/address */ + struct nfs_fs_location **nl_locations; /* array of fs locations */ +}; + +/* + * RPC record marker parsing state + */ +struct nfs_rpc_record_state { + mbuf_t nrrs_m; /* mbufs for current record */ + mbuf_t nrrs_mlast; + uint16_t nrrs_lastfrag; /* last fragment of record */ + uint16_t nrrs_markerleft; /* marker bytes remaining */ + uint32_t nrrs_fragleft; /* fragment bytes remaining */ + uint32_t nrrs_reclen; /* length of RPC record */ +}; + +/* + * NFS socket structures + */ +struct nfs_socket { + lck_mtx_t nso_lock; /* nfs socket lock */ + TAILQ_ENTRY(nfs_socket) nso_link; /* list of sockets */ + struct sockaddr * nso_saddr; 
/* socket address */ + struct sockaddr * nso_saddr2; /* additional socket address */ + void * nso_wake; /* address to wake up */ + time_t nso_timestamp; + time_t nso_reqtimestamp; /* last request sent */ + socket_t nso_so; /* socket */ + uint8_t nso_sotype; /* Type of socket */ + uint16_t nso_flags; /* NSO_* flags */ + struct nfs_location_index nso_location; /* location index */ + uint32_t nso_protocol; /* RPC protocol */ + uint32_t nso_version; /* RPC protocol version */ + uint32_t nso_pingxid; /* RPC XID of NULL ping request */ + int nso_error; /* saved error/status */ + struct nfs_rpc_record_state nso_rrs; /* RPC record parsing state (TCP) */ +}; +TAILQ_HEAD(nfssocketlist, nfs_socket); +/* nso_flags */ +#define NSO_UPCALL 0x0001 /* socket upcall in progress */ +#define NSO_DEAD 0x0002 /* socket is dead */ +#define NSO_CONNECTING 0x0004 /* socket is being connected */ +#define NSO_CONNECTED 0x0008 /* socket connection complete */ +#define NSO_PINGING 0x0010 /* socket is being tested */ +#define NSO_VERIFIED 0x0020 /* socket appears functional */ +#define NSO_DISCONNECTING 0x0040 /* socket is being disconnected */ + +/* NFS connect socket search state */ +struct nfs_socket_search { + struct nfs_location_index nss_startloc; /* starting location index */ + struct nfs_location_index nss_nextloc; /* next location index */ + struct nfssocketlist nss_socklist; /* list of active sockets */ + time_t nss_timestamp; /* search start time */ + time_t nss_last; /* timestamp of last socket */ + struct nfs_socket * nss_sock; /* found socket */ + uint8_t nss_sotype; /* TCP/UDP */ + uint8_t nss_sockcnt; /* # of active sockets */ + in_port_t nss_port; /* port # to connect to */ + uint32_t nss_protocol; /* RPC protocol */ + uint32_t nss_version; /* RPC protocol version */ + uint32_t nss_flags; /* (see below) */ + int nss_timeo; /* how long we are willing to wait */ + int nss_error; /* best error we've gotten so far */ +}; +/* nss_flags */ +#define NSS_VERBOSE 0x00000001 /* OK to log 
info about socket search */ +#define NSS_WARNED 0x00000002 /* logged warning about socket search taking a while */ + /* * function table for calling version-specific NFS functions */ struct nfs_funcs { - int (*nf_mount)(struct nfsmount *, vfs_context_t, struct user_nfs_args *, nfsnode_t *); + int (*nf_mount)(struct nfsmount *, vfs_context_t, nfsnode_t *); int (*nf_update_statfs)(struct nfsmount *, vfs_context_t); int (*nf_getquota)(struct nfsmount *, vfs_context_t, uid_t, int, struct dqblk *); int (*nf_access_rpc)(nfsnode_t, u_int32_t *, vfs_context_t); - int (*nf_getattr_rpc)(nfsnode_t, mount_t, u_char *, size_t, vfs_context_t, struct nfs_vattr *, u_int64_t *); + int (*nf_getattr_rpc)(nfsnode_t, mount_t, u_char *, size_t, int, vfs_context_t, struct nfs_vattr *, u_int64_t *); int (*nf_setattr_rpc)(nfsnode_t, struct vnode_attr *, vfs_context_t); int (*nf_read_rpc_async)(nfsnode_t, off_t, size_t, thread_t, kauth_cred_t, struct nfsreq_cbinfo *, struct nfsreq **); int (*nf_read_rpc_async_finish)(nfsnode_t, struct nfsreq *, uio_t, size_t *, int *); int (*nf_readlink_rpc)(nfsnode_t, char *, uint32_t *, vfs_context_t); int (*nf_write_rpc_async)(nfsnode_t, uio_t, size_t, thread_t, kauth_cred_t, int, struct nfsreq_cbinfo *, struct nfsreq **); int (*nf_write_rpc_async_finish)(nfsnode_t, struct nfsreq *, int *, size_t *, uint64_t *); - int (*nf_commit_rpc)(nfsnode_t, uint64_t, uint64_t, kauth_cred_t); + int (*nf_commit_rpc)(nfsnode_t, uint64_t, uint64_t, kauth_cred_t, uint64_t); int (*nf_lookup_rpc_async)(nfsnode_t, char *, int, vfs_context_t, struct nfsreq **); - int (*nf_lookup_rpc_async_finish)(nfsnode_t, vfs_context_t, struct nfsreq *, u_int64_t *, fhandle_t *, struct nfs_vattr *); + int (*nf_lookup_rpc_async_finish)(nfsnode_t, char *, int, vfs_context_t, struct nfsreq *, u_int64_t *, fhandle_t *, struct nfs_vattr *); int (*nf_remove_rpc)(nfsnode_t, char *, int, thread_t, kauth_cred_t); int (*nf_rename_rpc)(nfsnode_t, char *, int, nfsnode_t, char *, int, vfs_context_t); + 
int (*nf_setlock_rpc)(nfsnode_t, struct nfs_open_file *, struct nfs_file_lock *, int, int, thread_t, kauth_cred_t); + int (*nf_unlock_rpc)(nfsnode_t, struct nfs_lock_owner *, int, uint64_t, uint64_t, int, thread_t, kauth_cred_t); + int (*nf_getlock_rpc)(nfsnode_t, struct nfs_lock_owner *, struct flock *, uint64_t, uint64_t, vfs_context_t); }; /* @@ -148,12 +249,18 @@ __private_extern__ struct nfsclientidlist nfsclientids; */ struct nfsmount { lck_mtx_t nm_lock; /* nfs mount lock */ - int nm_flag; /* Flags for soft/hard... */ + char * nm_args; /* NFS mount args (XDR) */ + uint32_t nm_mattrs[NFS_MATTR_BITMAP_LEN]; /* mount attributes in mount args */ + uint32_t nm_mflags_mask[NFS_MFLAG_BITMAP_LEN]; /* mount flags mask in mount args */ + uint32_t nm_mflags[NFS_MFLAG_BITMAP_LEN]; /* mount flags in mount args */ + uint32_t nm_flags[NFS_MFLAG_BITMAP_LEN]; /* current mount flags (soft, intr, etc...) */ int nm_state; /* Internal state flags */ int nm_vers; /* NFS version */ struct nfs_funcs *nm_funcs; /* version-specific functions */ + kauth_cred_t nm_mcred; /* credential used for the mount (v4) */ mount_t nm_mountp; /* VFS structure for this filesystem */ nfsnode_t nm_dnp; /* root directory nfsnode pointer */ + struct nfs_fs_locations nm_locations; /* file system locations */ int nm_numgrps; /* Max. size of groupslist */ TAILQ_HEAD(, nfs_gss_clnt_ctx) nm_gsscl; /* GSS user contexts */ int nm_timeo; /* Init timer for NFSMNT_DUMBTIMR */ @@ -162,36 +269,48 @@ struct nfsmount { uint32_t nm_wsize; /* Max size of write rpc */ uint32_t nm_biosize; /* buffer I/O size */ uint32_t nm_readdirsize; /* Size of a readdir rpc */ - int nm_readahead; /* Num. of blocks to readahead */ - int nm_acregmin; /* reg file min attr cache timeout */ - int nm_acregmax; /* reg file max attr cache timeout */ - int nm_acdirmin; /* dir min attr cache timeout */ - int nm_acdirmax; /* dir max attr cache timeout */ - uint32_t nm_auth; /* security mechanism flavor */ + uint32_t nm_readahead; /* Num. 
of blocks to readahead */ + uint32_t nm_acregmin; /* reg file min attr cache timeout */ + uint32_t nm_acregmax; /* reg file max attr cache timeout */ + uint32_t nm_acdirmin; /* dir min attr cache timeout */ + uint32_t nm_acdirmax; /* dir max attr cache timeout */ + uint32_t nm_auth; /* security mechanism flavor being used */ + struct nfs_sec nm_sec; /* acceptable security mechanism flavors */ + struct nfs_sec nm_servsec; /* server's acceptable security mechanism flavors */ + fhandle_t *nm_fh; /* initial file handle */ + uint8_t nm_lockmode; /* advisory file locking mode */ /* mount info */ uint32_t nm_fsattrstamp; /* timestamp for fs attrs */ struct nfs_fsattr nm_fsattr; /* file system attributes */ uint64_t nm_verf; /* v3/v4 write verifier */ union { struct { /* v2/v3 specific fields */ - u_short rqport; /* cached rquota port */ - uint32_t rqportstamp; /* timestamp of rquota port */ + TAILQ_ENTRY(nfsmount) ldlink; /* chain of mounts registered for lockd use */ + int udp_sent; /* UDP request send count */ + int udp_cwnd; /* UDP request congestion window */ + struct nfs_reqqhead udp_cwndq; /* requests waiting on cwnd */ + struct sockaddr *rqsaddr;/* cached rquota socket address */ + uint32_t rqsaddrstamp; /* timestamp of rquota socket address */ } v3; struct { /* v4 specific fields */ struct nfs_client_id *longid; /* client ID, long form */ uint64_t mounttime; /* used as client ID verifier */ uint64_t clientid; /* client ID, short form */ thread_call_t renew_timer; /* RENEW timer call */ - TAILQ_HEAD(, nfs_open_owner) open_owners; /* list of open owners */ - TAILQ_HEAD(, nfsnode) recallq; /* list of nodes with recalled delegations */ + nfs_fsid fsid; /* NFS file system id */ + TAILQ_HEAD(, nfsnode) delegations; /* list of nodes with delegations */ + TAILQ_HEAD(, nfsnode) dreturnq; /* list of nodes with delegations to return */ TAILQ_ENTRY(nfsmount) cblink; /* chain of mounts registered for callbacks */ - uint32_t stateinuse; /* state in use counter */ - uint32_t 
stategenid; /* state generation counter */ - kauth_cred_t mcred; /* credential used for the mount */ uint32_t cbid; /* callback channel identifier */ uint32_t cbrefs; /* # callbacks using this mount */ } v4; } nm_un; + /* common state */ + TAILQ_HEAD(, nfs_open_owner) nm_open_owners; /* list of open owners */ + uint32_t nm_stateinuse; /* state in use counter */ + uint32_t nm_stategenid; /* state generation counter */ + time_t nm_recover_start; /* recover start time */ + LIST_HEAD(, nfsnode) nm_monlist; /* list of nodes being monitored */ /* async I/O queue */ struct nfs_reqqhead nm_resendq; /* async I/O resend queue */ struct nfs_reqqhead nm_iodq; /* async I/O request queue */ @@ -199,11 +318,14 @@ struct nfsmount { TAILQ_ENTRY(nfsmount) nm_iodlink; /* chain of mounts awaiting nfsiod */ int nm_asyncwrites; /* outstanding async I/O writes */ /* socket state */ - int nm_sotype; /* Type of socket */ - int nm_soproto; /* and protocol */ - mbuf_t nm_nam; /* Address of server */ + uint8_t nm_sofamily; /* (preferred) protocol family of socket */ + uint8_t nm_sotype; /* (preferred) type of socket */ + in_port_t nm_nfsport; /* NFS protocol port */ + in_port_t nm_mountport; /* MOUNT protocol port (v2/v3) */ + struct nfs_socket_search *nm_nss; /* current socket search structure */ + struct nfs_socket *nm_nso; /* current socket */ + struct sockaddr *nm_saddr; /* Address of server */ u_short nm_sockflags; /* socket state flags */ - socket_t nm_so; /* RPC socket */ time_t nm_deadto_start; /* dead timeout start time */ time_t nm_reconnect_start; /* reconnect start time */ int nm_tprintf_initial_delay; /* delay first "server down" */ @@ -213,27 +335,26 @@ struct nfsmount { int nm_sdrtt[4]; int nm_timeouts; /* Request timeouts */ int nm_jbreqs; /* # R_JBTPRINTFMSG requests */ - union { - struct { - int sent; /* Request send count */ - int cwnd; /* Request congestion window */ - struct nfs_reqqhead cwndq; /* requests waiting on cwnd */ - } udp; - struct { - u_int32_t mleft;/* marker 
bytes remaining */ - u_int32_t fleft;/* fragment bytes remaining */ - u_int32_t len; /* length of RPC record */ - mbuf_t m; /* mbufs for current record */ - mbuf_t mlast; - } tcp; - } nm_sockstate; + int nm_mounterror; /* status of mount connect */ TAILQ_ENTRY(nfsmount) nm_pokeq; /* mount poke queue chain */ thread_t nm_sockthd; /* socket thread for this mount */ }; +/* macro for checking current mount flags */ +#define NMFLAG(NMP, F) NFS_BITMAP_ISSET((NMP)->nm_flags, NFS_MFLAG_ ## F) +/* macros for checking (original) mount attributes/flags */ +#define NM_OMATTR_GIVEN(NMP, F) NFS_BITMAP_ISSET((NMP)->nm_mattrs, NFS_MATTR_ ## F) +#define NM_OMFLAG_GIVEN(NMP, F) NFS_BITMAP_ISSET((NMP)->nm_mflags_mask, NFS_MFLAG_ ## F) +#define NM_OMFLAG(NMP, F) NFS_BITMAP_ISSET((NMP)->nm_mflags, NFS_MFLAG_ ## F) + /* * NFS mount state flags (nm_state) */ +#define NFSSTA_MOUNT_THREAD 0x00000040 /* nfs_mount_connect_thread running */ +#define NFSSTA_MONITOR_SCAN 0x00000080 /* scan of monitored nodes in progress */ +#define NFSSTA_UNMOUNTING 0x00000100 /* an unmount attempt is in progress */ +#define NFSSTA_NEEDSECINFO 0x00000200 /* need to fetch security info */ +#define NFSSTA_CLIENTID 0x00000400 /* short client ID is valid */ #define NFSSTA_BIGCOOKIES 0x00000800 /* have seen >32bit dir cookies */ #define NFSSTA_JUKEBOXTIMEO 0x00001000 /* experienced a jukebox timeout */ #define NFSSTA_LOCKTIMEO 0x00002000 /* experienced a lock req timeout */ @@ -244,45 +365,41 @@ struct nfsmount { #define NFSSTA_HASWRITEVERF 0x00040000 /* Has write verifier for V3 */ #define NFSSTA_GOTPATHCONF 0x00080000 /* Got the V3 pathconf info */ #define NFSSTA_GOTFSINFO 0x00100000 /* Got the V3 fsinfo */ +#define NFSSTA_SENDING 0x00800000 /* Sending on socket */ #define NFSSTA_SNDLOCK 0x01000000 /* Send socket lock */ #define NFSSTA_WANTSND 0x02000000 /* Want above */ #define NFSSTA_DEAD 0x04000000 /* mount is dead */ #define NFSSTA_RECOVER 0x08000000 /* mount state needs to be recovered */ +#define 
NFSSTA_RECOVER_EXPIRED 0x10000000 /* mount state expired */ +#define NFSSTA_REVOKE 0x20000000 /* need to scan for revoked nodes */ /* flags for nm_sockflags */ #define NMSOCK_READY 0x0001 /* socket is ready for use */ #define NMSOCK_CONNECTING 0x0002 /* socket is being connect()ed */ #define NMSOCK_SETUP 0x0004 /* socket/connection is being set up */ #define NMSOCK_UNMOUNT 0x0008 /* unmounted, no more socket activity */ -#define NMSOCK_LASTFRAG 0x0010 /* on last fragment of RPC record */ +#define NMSOCK_HASCONNECTED 0x0010 /* socket has connected before */ #define NMSOCK_POKE 0x0020 /* socket needs to be poked */ -#define NMSOCK_UPCALL 0x0040 /* socket upcall in progress */ - -/* aliases for socket state variables */ -#define nm_sent nm_sockstate.udp.sent -#define nm_cwnd nm_sockstate.udp.cwnd -#define nm_cwndq nm_sockstate.udp.cwndq -#define nm_markerleft nm_sockstate.tcp.mleft -#define nm_fragleft nm_sockstate.tcp.fleft -#define nm_reclen nm_sockstate.tcp.len -#define nm_m nm_sockstate.tcp.m -#define nm_mlast nm_sockstate.tcp.mlast +#define NMSOCK_DISCONNECTING 0x0080 /* socket is being disconnected */ /* aliases for version-specific fields */ -#define nm_rqport nm_un.v3.rqport -#define nm_rqportstamp nm_un.v3.rqportstamp +#define nm_ldlink nm_un.v3.ldlink +#define nm_sent nm_un.v3.udp_sent +#define nm_cwnd nm_un.v3.udp_cwnd +#define nm_cwndq nm_un.v3.udp_cwndq +#define nm_rqproto nm_un.v3.rqproto +#define nm_rqsaddr nm_un.v3.rqsaddr +#define nm_rqsaddrstamp nm_un.v3.rqsaddrstamp #define nm_longid nm_un.v4.longid #define nm_clientid nm_un.v4.clientid #define nm_mounttime nm_un.v4.mounttime +#define nm_fsid nm_un.v4.fsid #define nm_renew_timer nm_un.v4.renew_timer -#define nm_open_owners nm_un.v4.open_owners -#define nm_stateinuse nm_un.v4.stateinuse -#define nm_stategenid nm_un.v4.stategenid -#define nm_mcred nm_un.v4.mcred #define nm_cbid nm_un.v4.cbid #define nm_cblink nm_un.v4.cblink #define nm_cbrefs nm_un.v4.cbrefs -#define nm_recallq nm_un.v4.recallq 
+#define nm_delegations nm_un.v4.delegations +#define nm_dreturnq nm_un.v4.dreturnq #if defined(KERNEL) /* diff --git a/bsd/nfs/nfsnode.h b/bsd/nfs/nfsnode.h index fa0d5bfc4..cce1399ca 100644 --- a/bsd/nfs/nfsnode.h +++ b/bsd/nfs/nfsnode.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -75,6 +75,7 @@ #ifndef _NFS_NFS_H_ #include #endif +#include /* * Silly rename structure that hangs off the nfsnode until the name @@ -255,6 +256,7 @@ struct nfs_dir_buf_header { /* ndbh_flags */ #define NDB_FULL 0x0001 /* buffer has been filled */ #define NDB_EOF 0x0002 /* buffer contains EOF */ +#define NDB_PLUS 0x0004 /* buffer contains RDIRPLUS data */ #define NFS_DIR_BUF_FIRST_DIRENTRY(BP) \ ((struct direntry*)((char*)((BP)->nb_data) + sizeof(*ndbhp))) @@ -313,11 +315,14 @@ struct nfsdmap { struct nfs_vattr { enum vtype nva_type; /* vnode type (for create) */ - uint32_t nva_mode; /* files access mode (and type) */ + uint32_t nva_mode; /* file's access mode (and type) */ uid_t nva_uid; /* owner user id */ gid_t nva_gid; /* owner group id */ + guid_t nva_uuuid; /* owner user UUID */ + guid_t nva_guuid; /* owner group UUID */ + kauth_acl_t nva_acl; /* access control list */ nfs_specdata nva_rawdev; /* device the special file represents */ - uint32_t nva_flags; /* file flags */ + uint32_t nva_flags; /* file flags (see below) */ uint32_t nva_maxlink; /* maximum # of links (v4) */ uint64_t nva_nlink; /* number of references to file */ uint64_t nva_fileid; /* file id */ @@ -330,13 +335,35 @@ struct nfs_vattr { uint32_t nva_bitmap[NFS_ATTR_BITMAP_LEN]; /* attributes that are valid */ }; -#define NFS_FFLAG_ARCHIVED 0x0001 -#define NFS_FFLAG_HIDDEN 0x0002 -#define NFS_FFLAG_NAMED_ATTR 0x0004 /* file has named attributes */ +/* nva_flags */ +#define NFS_FFLAG_ARCHIVED 0x0001 +#define NFS_FFLAG_HIDDEN 0x0002 +#define NFS_FFLAG_HAS_NAMED_ATTRS 0x0004 
/* file has named attributes */ +#define NFS_FFLAG_TRIGGER 0x0008 /* node is a trigger/mirror mount point */ +#define NFS_FFLAG_TRIGGER_REFERRAL 0x0010 /* trigger is a referral */ +#define NFS_FFLAG_IS_ATTR 0x8000 /* file is a named attribute file/directory */ /* flags for nfs_getattr() */ -#define NGA_CACHED 0 -#define NGA_UNCACHED 1 +#define NGA_CACHED 0x0001 /* use cached attributes (if still valid) */ +#define NGA_UNCACHED 0x0002 /* fetch new attributes */ +#define NGA_ACL 0x0004 /* fetch ACL */ +#define NGA_MONITOR 0x0008 /* vnode monitor attr update poll */ + +/* macros for initting/cleaning up nfs_vattr structures */ +#define NVATTR_INIT(NVAP) \ + do { \ + NFS_CLEAR_ATTRIBUTES((NVAP)->nva_bitmap); \ + (NVAP)->nva_flags = 0; \ + (NVAP)->nva_acl = NULL; \ + } while (0) +#define NVATTR_CLEANUP(NVAP) \ + do { \ + NFS_CLEAR_ATTRIBUTES((NVAP)->nva_bitmap); \ + if ((NVAP)->nva_acl) { \ + kauth_acl_free((NVAP)->nva_acl); \ + (NVAP)->nva_acl = NULL; \ + } \ + } while (0) /* * macros for detecting node changes @@ -416,17 +443,27 @@ struct nfs_open_file { uint32_t nof_rw; /* read/write opens (deny none) */ uint32_t nof_r_dw; /* read deny-write opens */ /* the rest of the counts have a max of 2 (1 for open + 1 for mmap) */ - uint32_t nof_w_dw:4; /* write deny-write opens (max 2) */ - uint32_t nof_rw_dw:4; /* read/write deny-write opens (max 2) */ - uint32_t nof_r_drw:4; /* read deny-read/write opens (max 2) */ - uint32_t nof_w_drw:4; /* write deny-read/write opens (max 2) */ - uint32_t nof_rw_drw:4; /* read/write deny-read/write opens (max 2) */ + uint32_t nof_w_dw:2; /* write deny-write opens (max 2) */ + uint32_t nof_rw_dw:2; /* read/write deny-write opens (max 2) */ + uint32_t nof_r_drw:2; /* read deny-read/write opens (max 2) */ + uint32_t nof_w_drw:2; /* write deny-read/write opens (max 2) */ + uint32_t nof_rw_drw:2; /* read/write deny-read/write opens (max 2) */ + /* counts of DELEGATED access/deny mode open combinations */ + uint32_t nof_d_w_dw:2; /* write 
deny-write opens (max 2) */ + uint32_t nof_d_rw_dw:2; /* read/write deny-write opens (max 2) */ + uint32_t nof_d_r_drw:2; /* read deny-read/write opens (max 2) */ + uint32_t nof_d_w_drw:2; /* write deny-read/write opens (max 2) */ + uint32_t nof_d_rw_drw:2; /* read/write deny-read/write opens (max 2) */ + uint32_t nof_d_r; /* read opens (deny none) */ + uint32_t nof_d_w; /* write opens (deny none) */ + uint32_t nof_d_rw; /* read/write opens (deny none) */ + uint32_t nof_d_r_dw; /* read deny-write opens */ }; /* nof_flags */ #define NFS_OPEN_FILE_BUSY 0x0001 /* open state-modifying operation in progress */ #define NFS_OPEN_FILE_WANT 0x0002 /* someone else wants to mark busy */ -#define NFS_OPEN_FILE_CREATE 0x0004 /* has an open(RW) from a VNOP_CREATE call */ -#define NFS_OPEN_FILE_NEEDCLOSE 0x0008 /* has an open(R) from an (unopen) VNOP_READ call */ +#define NFS_OPEN_FILE_CREATE 0x0004 /* has an open(RW) from a "CREATE" call */ +#define NFS_OPEN_FILE_NEEDCLOSE 0x0008 /* has an open(R) from an (unopen) VNOP_READ or VNOP_MMAP call */ #define NFS_OPEN_FILE_SETATTR 0x0020 /* has an open(W) to perform a SETATTR(size) */ #define NFS_OPEN_FILE_POSIXLOCK 0x0040 /* server supports POSIX locking semantics */ #define NFS_OPEN_FILE_LOST 0x0080 /* open state has been lost */ @@ -458,6 +495,7 @@ struct nfs_file_lock { #define NFS_FILE_LOCK_WAIT 0x08 /* may block on conflicting locks */ #define NFS_FILE_LOCK_BLOCKED 0x10 /* request is blocked */ #define NFS_FILE_LOCK_DEAD 0x20 /* lock (request) no longer exists */ +#define NFS_FILE_LOCK_DELEGATED 0x40 /* lock acquired via delegation */ TAILQ_HEAD(nfs_file_lock_queue, nfs_file_lock); @@ -514,14 +552,18 @@ struct nfsnode { lck_rw_t n_datalock; /* nfs node data lock */ void *n_datalockowner;/* nfs node data lock owner (exclusive) */ LIST_ENTRY(nfsnode) n_hash; /* Hash chain */ + LIST_ENTRY(nfsnode) n_monlink; /* list of monitored nodes */ u_quad_t n_size; /* Current size of file */ u_quad_t n_newsize; /* new size of file (pending 
update) */ u_int64_t n_xid; /* last xid to loadattr */ struct nfs_vattr n_vattr; /* Vnode attribute cache */ time_t n_attrstamp; /* Attr. cache timestamp */ - u_int8_t n_mode[NFS_ACCESS_CACHE_SIZE+1]; /* ACCESS mode cache */ - uid_t n_modeuid[NFS_ACCESS_CACHE_SIZE]; /* credentials having mode */ - time_t n_modestamp[NFS_ACCESS_CACHE_SIZE]; /* mode cache timestamp */ + time_t n_aclstamp; /* ACL cache timestamp */ + time_t n_evtstamp; /* last vnode event timestamp */ + uint32_t n_events; /* pending vnode events */ + u_int8_t n_access[NFS_ACCESS_CACHE_SIZE+1]; /* ACCESS cache */ + uid_t n_accessuid[NFS_ACCESS_CACHE_SIZE]; /* credentials having access */ + time_t n_accessstamp[NFS_ACCESS_CACHE_SIZE]; /* access cache timestamp */ union { struct { struct timespec n3_mtime; /* Prev modify time. */ @@ -530,6 +572,8 @@ struct nfsnode { struct { uint64_t n4_change; /* prev change attribute */ uint64_t n4_ncchange; /* namecache change attribute */ + u_char *n4_attrdirfh; /* associated attr directory fh */ + struct timeval n4_lastio; /* time of most recent I/O on attr */ } v4; } n_un4; vnode_t n_parent; /* this node's parent */ @@ -555,7 +599,9 @@ struct nfsnode { u_short n_flag; /* node flags */ u_short n_hflag; /* node hash flags */ u_short n_bflag; /* node buffer flags */ + u_short n_mflag; /* node mount flags */ u_char n_fh[NFS_SMALLFH];/* Small File Handle */ + uint32_t n_auth; /* security flavor used for this node */ struct nfsbuflists n_cleanblkhd; /* clean blocklist head */ struct nfsbuflists n_dirtyblkhd; /* dirty blocklist head */ union { @@ -567,7 +613,10 @@ struct nfsnode { daddr64_t nd_lastdbl; /* last dir buf lookup block# */ } n_un6; int n_bufiterflags; /* buf iterator flags */ - int n_numoutput; /* I/O in progress */ + union { + int nf_numoutput; /* write I/Os in progress */ + int nd_trigseq; /* vnode trigger seq# */ + } n_un7; /* open state */ lck_mtx_t n_openlock; /* nfs node open lock */ uint32_t n_openflags; /* open state flags */ @@ -578,7 +627,9 @@ struct 
nfsnode { struct nfs_file_lock_queue n_locks; /* list of locks */ /* delegation state */ nfs_stateid n_dstateid; /* delegation stateid */ - TAILQ_ENTRY(nfsnode) n_dlink; /* delegation recall list link */ + TAILQ_ENTRY(nfsnode) n_dlink; /* delegation list link */ + TAILQ_ENTRY(nfsnode) n_dreturn; /* delegation return list link */ + struct kauth_ace n_dace; /* delegation ACE */ }; #define NFS_DATA_LOCK_SHARED 1 @@ -604,20 +655,25 @@ struct nfsnode { #define n_sillyrename n_un3.nf_silly #define n_wrbusy n_un5.nf_wrbusy #define n_needcommitcnt n_un6.nf_needcommitcnt +#define n_numoutput n_un7.nf_numoutput #define n_cookieverf n_un1.nd_cookieverf #define n_eofcookie n_un2.nd_eofcookie #define n_cookiecache n_un3.nd_cookiecache #define n_ncgen n_un5.nd_ncgen #define n_lastdbl n_un6.nd_lastdbl +#define n_trigseq n_un7.nd_trigseq #define n_mtime n_un4.v3.n3_mtime #define n_ncmtime n_un4.v3.n3_ncmtime #define n_change n_un4.v4.n4_change #define n_ncchange n_un4.v4.n4_ncchange +#define n_attrdirfh n_un4.v4.n4_attrdirfh +#define n_lastio n_un4.v4.n4_lastio /* * Flags for n_flag */ #define NUPDATESIZE 0x0001 /* size of file needs updating */ +#define NREVOKE 0x0002 /* node revoked */ #define NMODIFIED 0x0004 /* Might have a modified buffer in bio */ #define NWRITEERR 0x0008 /* Flag write errors so close will know */ #define NNEEDINVALIDATE 0x0010 /* need to call vinvalbuf() */ @@ -629,6 +685,9 @@ struct nfsnode { #define NNEGNCENTRIES 0x0800 /* directory has negative name cache entries */ #define NBUSY 0x1000 /* node is busy */ #define NBUSYWANT 0x2000 /* waiting on busy node */ +#define NISDOTZFS 0x4000 /* a ".zfs" directory */ +#define NISDOTZFSCHILD 0x8000 /* a child of a ".zfs" directory */ + /* * Flags for n_hflag @@ -648,6 +707,13 @@ struct nfsnode { #define NBINVALINPROG 0x0004 /* Avoid multiple calls to nfs_vinvalbuf() */ #define NBINVALWANT 0x0008 /* waiting for nfs_vinvalbuf() to complete */ +/* + * Flags for n_mflag + * Note: protected by nfsmount's nm_lock + */ 
+#define NMMONSCANINPROG 0x0001 /* monitored node is currently updating attributes */ +#define NMMONSCANWANT 0x0002 /* waiting for attribute update to complete */ + /* * n_openflags * Note: protected by n_openlock @@ -657,18 +723,22 @@ struct nfsnode { #define N_DELEG_READ 0x0004 /* we have a read delegation */ #define N_DELEG_WRITE 0x0008 /* we have a write delegation */ #define N_DELEG_MASK 0x000c /* delegation mask */ +#define N_DELEG_RETURN 0x0010 /* delegation queued for return */ +#define N_DELEG_RETURNING 0x0020 /* delegation being returned */ -/* attr/mode timestamp macros */ +/* attr/access/ACL cache timestamp macros */ #define NATTRVALID(np) ((np)->n_attrstamp != ~0) #define NATTRINVALIDATE(np) ((np)->n_attrstamp = ~0) -#define NMODEVALID(np, slot) (((slot) >= 0) && ((slot) < 3) && ((np)->n_modestamp[(slot)] != ~0)) -#define NMODEINVALIDATE(np) \ +#define NACCESSVALID(np, slot) (((slot) >= 0) && ((slot) < NFS_ACCESS_CACHE_SIZE) && ((np)->n_accessstamp[(slot)] != ~0)) +#define NACCESSINVALIDATE(np) \ do { \ - (np)->n_modestamp[0] = ~0; \ - (np)->n_modestamp[1] = ~0; \ - (np)->n_modestamp[2] = ~0; \ - (np)->n_mode[3] = 0; \ + int __i; \ + for (__i=0; __i < NFS_ACCESS_CACHE_SIZE; __i++) \ + (np)->n_accessstamp[__i] = ~0; \ + (np)->n_access[NFS_ACCESS_CACHE_SIZE] = 0; \ } while (0) +#define NACLVALID(np) ((np)->n_aclstamp != ~0) +#define NACLINVALIDATE(np) ((np)->n_aclstamp = ~0) /* * NFS-specific flags for nfs_vinvalbuf/nfs_flush @@ -691,6 +761,16 @@ struct nfsnode { /* nfsnode hash table mutex */ __private_extern__ lck_mtx_t *nfs_node_hash_mutex; +/* + * printf-like helper macro that also outputs node name. + */ +#define NP(NP, FMT, ...) \ + do { \ + const char *__vname = (NP) ? vnode_getname(NFSTOV(NP)) : NULL; \ + printf(FMT " %s\n", ##__VA_ARGS__, __vname ? 
__vname : "???"); \ + if (__vname) vnode_putname(__vname); \ + } while (0) + /* * nfsiod structures */ @@ -743,7 +823,7 @@ void nfs_data_update_size(nfsnode_t, int); /* other stuff */ int nfs_removeit(struct nfs_sillyrename *); -int nfs_nget(mount_t,nfsnode_t,struct componentname *,u_char *,int,struct nfs_vattr *,u_int64_t *,int,nfsnode_t*); +int nfs_nget(mount_t,nfsnode_t,struct componentname *,u_char *,int,struct nfs_vattr *,u_int64_t *,uint32_t,int,nfsnode_t*); void nfs_dir_cookie_cache(nfsnode_t, uint64_t, uint64_t); int nfs_dir_cookie_to_lbn(nfsnode_t, uint64_t, int *, uint64_t *); void nfs_invaldir(nfsnode_t); diff --git a/bsd/nfs/nfsproto.h b/bsd/nfs/nfsproto.h index 9823531f4..ec6bc9311 100644 --- a/bsd/nfs/nfsproto.h +++ b/bsd/nfs/nfsproto.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -348,21 +348,21 @@ typedef enum { NFNON=0, NFREG=1, NFDIR=2, NFBLK=3, NFCHR=4, NFLNK=5, * NFS attribute management stuff */ #define NFS_ATTR_BITMAP_LEN 2 -#define NFS_BITMAP_SET(A, I) (((uint32_t *)(A))[(I)/32] |= 1<<((I)%32)) -#define NFS_BITMAP_CLR(A, I) (((uint32_t *)(A))[(I)/32] &= ~(1<<((I)%32))) -#define NFS_BITMAP_ISSET(A, I) (((uint32_t *)(A))[(I)/32] & (1<<((I)%32))) +#define NFS_BITMAP_SET(B, I) (((uint32_t *)(B))[(I)/32] |= 1<<((I)%32)) +#define NFS_BITMAP_CLR(B, I) (((uint32_t *)(B))[(I)/32] &= ~(1<<((I)%32))) +#define NFS_BITMAP_ISSET(B, I) (((uint32_t *)(B))[(I)/32] & (1<<((I)%32))) +#define NFS_BITMAP_ZERO(B, L) \ + do { \ + int __i; \ + for (__i=0; __i < (L); __i++) \ + ((uint32_t*)(B))[__i] = 0; \ + } while (0) __private_extern__ uint32_t nfs_fs_attr_bitmap[NFS_ATTR_BITMAP_LEN]; __private_extern__ uint32_t nfs_object_attr_bitmap[NFS_ATTR_BITMAP_LEN]; __private_extern__ uint32_t nfs_getattr_bitmap[NFS_ATTR_BITMAP_LEN]; -#define NFS_CLEAR_ATTRIBUTES(A) \ - do { \ - int __i; \ - for (__i=0; __i < NFS_ATTR_BITMAP_LEN; 
__i++) \ - ((uint32_t*)(A))[__i] = 0; \ - } while (0) - +#define NFS_CLEAR_ATTRIBUTES(A) NFS_BITMAP_ZERO((A), NFS_ATTR_BITMAP_LEN) #define NFS_COPY_ATTRIBUTES(SRC, DST) \ do { \ int __i; \ @@ -571,7 +571,7 @@ __private_extern__ uint32_t nfs_getattr_bitmap[NFS_ATTR_BITMAP_LEN]; /* NFS_BITMAP_SET((A), NFS_FATTR_FILEHANDLE); */ \ /* optional: */ \ /* NFS_BITMAP_SET((A), NFS_FATTR_ACL); */ \ - /* NFS_BITMAP_SET((A), NFS_FATTR_ACLSUPPORT); */ \ + NFS_BITMAP_SET((A), NFS_FATTR_ACLSUPPORT); \ NFS_BITMAP_SET((A), NFS_FATTR_ARCHIVE); \ /* NFS_BITMAP_SET((A), NFS_FATTR_CANSETTIME); */ \ NFS_BITMAP_SET((A), NFS_FATTR_CASE_INSENSITIVE); \ @@ -612,7 +612,7 @@ __private_extern__ uint32_t nfs_getattr_bitmap[NFS_ATTR_BITMAP_LEN]; NFS_BITMAP_SET((A), NFS_FATTR_TIME_METADATA); \ NFS_BITMAP_SET((A), NFS_FATTR_TIME_MODIFY); \ /* NFS_BITMAP_SET((A), NFS_FATTR_TIME_MODIFY_SET); */ \ - /* NFS_BITMAP_SET((A), NFS_FATTR_MOUNTED_ON_FILEID); */ \ + NFS_BITMAP_SET((A), NFS_FATTR_MOUNTED_ON_FILEID); \ } while (0) /* attributes requested when we want to do a "statfs" */ @@ -637,6 +637,7 @@ __private_extern__ uint32_t nfs_getattr_bitmap[NFS_ATTR_BITMAP_LEN]; #define NFS_LIMIT_SIZE 1 #define NFS_LIMIT_BLOCKS 2 /* access/deny modes */ +#define NFS_OPEN_SHARE_ACCESS_NONE 0x00000000 #define NFS_OPEN_SHARE_ACCESS_READ 0x00000001 #define NFS_OPEN_SHARE_ACCESS_WRITE 0x00000002 #define NFS_OPEN_SHARE_ACCESS_BOTH 0x00000003 @@ -740,6 +741,7 @@ __private_extern__ uint32_t nfs_getattr_bitmap[NFS_ATTR_BITMAP_LEN]; #define NFS_ACE_SUCCESSFUL_ACCESS_ACE_FLAG 0x00000010 #define NFS_ACE_FAILED_ACCESS_ACE_FLAG 0x00000020 #define NFS_ACE_IDENTIFIER_GROUP 0x00000040 +#define NFS_ACE_INHERITED_ACE 0x00000080 /* ACE mask flags */ #define NFS_ACE_READ_DATA 0x00000001 #define NFS_ACE_LIST_DIRECTORY 0x00000001 diff --git a/bsd/nfs/nfsrvcache.h b/bsd/nfs/nfsrvcache.h index fa23f1877..06bc9baeb 100644 --- a/bsd/nfs/nfsrvcache.h +++ b/bsd/nfs/nfsrvcache.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. 
All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -76,6 +76,13 @@ * Definitions for the server recent request cache */ +/* Network address hash list element */ +union nethostaddr { + in_addr_t had_inetaddr; + struct in6_addr had_inet6addr; + mbuf_t had_nam; +}; + #define NFSRVCACHESIZ 64 struct nfsrvcache { @@ -86,6 +93,7 @@ struct nfsrvcache { mbuf_t ru_repmb; /* Reply mbuf list OR */ int ru_repstat; /* Reply status */ } rc_un; + sa_family_t rc_family; /* address family */ union nethostaddr rc_haddr; /* Host address */ u_int32_t rc_proc; /* rpc proc number */ u_char rc_state; /* Current state of request */ @@ -95,6 +103,7 @@ struct nfsrvcache { #define rc_reply rc_un.ru_repmb #define rc_status rc_un.ru_repstat #define rc_inetaddr rc_haddr.had_inetaddr +#define rc_inet6addr rc_haddr.had_inet6addr #define rc_nam rc_haddr.had_nam /* Cache entry states */ diff --git a/bsd/nfs/rpcv2.h b/bsd/nfs/rpcv2.h index 510f5110b..3a288f203 100644 --- a/bsd/nfs/rpcv2.h +++ b/bsd/nfs/rpcv2.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -82,6 +82,7 @@ /* Authentication */ #define RPCAUTH_NULL 0 +#define RPCAUTH_NONE RPCAUTH_NULL #define RPCAUTH_UNIX 1 #define RPCAUTH_SYS RPCAUTH_UNIX #define RPCAUTH_SHORT 2 @@ -89,6 +90,8 @@ #define RPCAUTH_KRB5 390003 #define RPCAUTH_KRB5I 390004 #define RPCAUTH_KRB5P 390005 +#define RPCAUTH_INVALID ~0U +#define RPCAUTH_UNKNOWN RPCAUTH_INVALID #define RPCAUTH_MAXSIZ 400 #define RPCAUTH_UNIXGIDS 16 diff --git a/bsd/nfs/xdr_subs.h b/bsd/nfs/xdr_subs.h index 9a399db19..59356190a 100644 --- a/bsd/nfs/xdr_subs.h +++ b/bsd/nfs/xdr_subs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -64,8 +64,6 @@ * @(#)xdr_subs.h 8.3 (Berkeley) 3/30/95 * FreeBSD-Id: xdr_subs.h,v 1.9 1997/02/22 09:42:53 peter Exp $ */ - - #ifndef _NFS_XDR_SUBS_H_ #define _NFS_XDR_SUBS_H_ @@ -96,5 +94,418 @@ ((uint32_t *)(t))[1] = htonl(((uint32_t *)(f))[_QUAD_LOWWORD]); \ } + +/* + * xdrbuf + * + * generalized functionality for managing the building/dissecting of XDR data + */ +typedef enum xdrbuf_type { XDRBUF_BUFFER=1 } xdrbuf_type; + +struct xdrbuf { + union { + struct { + char * xbb_base; /* base address of buffer */ + uint32_t xbb_size; /* size of buffer */ + uint32_t xbb_len; /* length of data in buffer */ + } xb_buffer; + } xb_u; + char * xb_ptr; /* pointer to current position */ + size_t xb_left; /* bytes remaining in current buffer */ + size_t xb_growsize; /* bytes to allocate when growing */ + xdrbuf_type xb_type; /* type of xdr buffer */ + uint32_t xb_flags; /* XB_* (see below) */ +}; + +#define XB_CLEANUP 0x0001 /* needs cleanup */ + +#define XDRWORD 4 /* the basic XDR building block is a 4 byte (32 bit) word */ +#define xdr_rndup(a) (((a)+3)&(~0x3)) /* round up to XDRWORD size */ +#define xdr_pad(a) (xdr_rndup(a) - (a)) /* calculate round up padding */ + +void xb_init(struct xdrbuf *, xdrbuf_type); +void xb_init_buffer(struct xdrbuf *, char *, size_t); +void xb_cleanup(struct xdrbuf *); +void *xb_malloc(size_t); +void xb_free(void *); +int xb_grow(struct xdrbuf *); +void xb_set_cur_buf_len(struct xdrbuf *); +char *xb_buffer_base(struct xdrbuf *); +int xb_advance(struct xdrbuf *, uint32_t); +int xb_offset(struct xdrbuf *); +int xb_seek(struct xdrbuf *, uint32_t); +int xb_add_bytes(struct xdrbuf *, const char *, uint32_t, int); +int xb_get_bytes(struct xdrbuf *, char *, uint32_t, int); + +#ifdef _NFS_XDR_SUBS_FUNCS_ + +/* + * basic initialization of xdrbuf structure + */ +void +xb_init(struct xdrbuf *xbp, xdrbuf_type type) +{ + bzero(xbp, sizeof(*xbp)); + xbp->xb_type = type; + xbp->xb_flags |= XB_CLEANUP; +} + +/* + 
* initialize a single-buffer xdrbuf + */ +void +xb_init_buffer(struct xdrbuf *xbp, char *buf, size_t buflen) +{ + xb_init(xbp, XDRBUF_BUFFER); + xbp->xb_u.xb_buffer.xbb_base = buf; + xbp->xb_u.xb_buffer.xbb_size = buflen; + xbp->xb_u.xb_buffer.xbb_len = buflen; + xbp->xb_growsize = 512; + xbp->xb_ptr = buf; + xbp->xb_left = buflen; + if (buf) /* when using an existing buffer, xb code should skip cleanup */ + xbp->xb_flags &= ~XB_CLEANUP; +} + +/* + * get the pointer to the single-buffer xdrbuf's buffer + */ +char * +xb_buffer_base(struct xdrbuf *xbp) +{ + return (xbp->xb_u.xb_buffer.xbb_base); +} + +/* + * clean up any resources held by an xdrbuf + */ +void +xb_cleanup(struct xdrbuf *xbp) +{ + if (!(xbp->xb_flags & XB_CLEANUP)) + return; + switch (xbp->xb_type) { + case XDRBUF_BUFFER: + if (xbp->xb_u.xb_buffer.xbb_base) + xb_free(xbp->xb_u.xb_buffer.xbb_base); + break; + } + xbp->xb_flags &= ~XB_CLEANUP; +} + +/* + * set the length of valid data in the current buffer to + * be up to the current location within the buffer + */ +void +xb_set_cur_buf_len(struct xdrbuf *xbp) +{ + switch (xbp->xb_type) { + case XDRBUF_BUFFER: + xbp->xb_u.xb_buffer.xbb_len = xbp->xb_ptr - xbp->xb_u.xb_buffer.xbb_base; + break; + } +} + +/* + * advance forward through existing data in xdrbuf + */ +int +xb_advance(struct xdrbuf *xbp, uint32_t len) +{ + uint32_t tlen; + + while (len) { + if (xbp->xb_left <= 0) + return (EBADRPC); + tlen = MIN(xbp->xb_left, len); + if (tlen) { + xbp->xb_ptr += tlen; + xbp->xb_left -= tlen; + len -= tlen; + } + } + return (0); +} + +/* + * Calculate the current offset in the XDR buffer. + */ +int +xb_offset(struct xdrbuf *xbp) +{ + uint32_t offset = 0; + + switch (xbp->xb_type) { + case XDRBUF_BUFFER: + offset = xbp->xb_ptr - xbp->xb_u.xb_buffer.xbb_base; + break; + } + + return (offset); +} + +/* + * Seek to the given offset in the existing data in the XDR buffer. 
+ */ +int +xb_seek(struct xdrbuf *xbp, uint32_t offset) +{ + + switch (xbp->xb_type) { + case XDRBUF_BUFFER: + xbp->xb_ptr = xbp->xb_u.xb_buffer.xbb_base + offset; + xbp->xb_left = xbp->xb_u.xb_buffer.xbb_len - offset; /* NOTE(review): offset is not checked against xbb_len here; callers must keep it in range */ + break; + } + + return (0); /* always reports success */ +} + +/* + * Allocate "size" bytes: MALLOC() with M_TEMP and M_WAITOK when KERNEL is defined, malloc() otherwise. + */ +void * +xb_malloc(size_t size) +{ + void *buf = NULL; + +#ifdef KERNEL + MALLOC(buf, void *, size, M_TEMP, M_WAITOK); +#else + buf = malloc(size); +#endif + return (buf); +} +/* + * Free a chunk of memory allocated with xb_malloc() (FREE() with M_TEMP when KERNEL is defined, free() otherwise). + */ +void +xb_free(void *buf) +{ +#ifdef KERNEL + FREE(buf, M_TEMP); +#else + free(buf); +#endif +} + +/* + * Increase space available for new data in XDR buffer by xb_growsize bytes. Returns 0, or ENOMEM if the allocation fails. xbb_len is not updated here. NOTE(review): the old buffer is freed and XB_CLEANUP is left as is, even if that buffer was supplied by the caller of xb_init_buffer() -- confirm callers never grow a borrowed buffer. + */ +int +xb_grow(struct xdrbuf *xbp) +{ + char *newbuf, *oldbuf; + size_t newsize, oldsize; + + switch (xbp->xb_type) { + case XDRBUF_BUFFER: + oldsize = xbp->xb_u.xb_buffer.xbb_size; + oldbuf = xbp->xb_u.xb_buffer.xbb_base; + newsize = oldsize + xbp->xb_growsize; + newbuf = xb_malloc(newsize); + if (newbuf == NULL) + return (ENOMEM); + if (oldbuf != NULL) { + bcopy(oldbuf, newbuf, oldsize); /* carry over the whole old buffer */ + xb_free(oldbuf); + } + xbp->xb_u.xb_buffer.xbb_base = newbuf; + xbp->xb_u.xb_buffer.xbb_size = newsize; + xbp->xb_ptr = newbuf + oldsize; /* continue at the old end of the buffer */ + xbp->xb_left = xbp->xb_growsize; /* only the newly added space is available */ + break; + } + + return (0); +} + +/* + * xb_add_bytes() + * + * Add "count" bytes of opaque data pointed to by "buf" to the given XDR buffer. Unless "nopad" is set, zero bytes are appended up to xdr_rndup(count). The buffer is grown with xb_grow() as needed. Returns 0 or an error (ENOMEM or the error from xb_grow()). + */ +int +xb_add_bytes(struct xdrbuf *xbp, const char *buf, uint32_t count, int nopad) +{ + uint32_t len, tlen; + int error; + + len = nopad ? 
count : xdr_rndup(count); + + /* copy in "count" bytes and zero out any pad bytes */ + while (len) { + if (xbp->xb_left <= 0) { + /* need more space */ + if ((error = xb_grow(xbp))) + return (error); + if (xbp->xb_left <= 0) + return (ENOMEM); + } + tlen = MIN(xbp->xb_left, len); + if (tlen) { + if (count) { /* still copying the caller's data */ + if (tlen > count) /* do not read past the end of the data */ + tlen = count; + bcopy(buf, xbp->xb_ptr, tlen); + } else { /* data is done: what remains is padding */ + bzero(xbp->xb_ptr, tlen); + } + xbp->xb_ptr += tlen; + xbp->xb_left -= tlen; + len -= tlen; + if (count) { + buf += tlen; + count -= tlen; + } + } + } + return (0); +} + +/* + * xb_get_bytes() + * + * Get "count" bytes of opaque data from the given XDR buffer into "buf". Unless "nopad" is set, the pad bytes up to xdr_rndup(count) are skipped as well. Returns 0, or ENOMEM if the buffer runs out of data first. + */ +int +xb_get_bytes(struct xdrbuf *xbp, char *buf, uint32_t count, int nopad) +{ + uint32_t len, tlen; + + len = nopad ? count : xdr_rndup(count); + + /* copy out "count" bytes, then skip over any pad bytes */ + while (len) { + if (xbp->xb_left <= 0) + return (ENOMEM); + tlen = MIN(xbp->xb_left, len); + if (tlen) { + if (count) { /* still copying data out to the caller */ + if (tlen > count) /* do not write past "count" bytes of buf */ + tlen = count; + bcopy(xbp->xb_ptr, buf, tlen); + } + xbp->xb_ptr += tlen; + xbp->xb_left -= tlen; + len -= tlen; + if (count) { + buf += tlen; + count -= tlen; + } + } + } + return (0); +} + +#endif /* _NFS_XDR_SUBS_FUNCS_ */ + + +/* + * macros for building XDR data. Each takes an error variable E and does nothing once E is nonzero; the xb_add_* macros store the result of xb_add_bytes() in E. + */ + +/* finalize the data that has been added to the buffer (records the valid data length); skipped if E is set */ +#define xb_build_done(E, XB) \ + do { \ + if (E) break; \ + xb_set_cur_buf_len(XB); \ + } while (0) + +/* add a 32-bit value, converted with txdr_unsigned() */ +#define xb_add_32(E, XB, VAL) \ + do { \ + uint32_t __tmp; \ + if (E) break; \ + __tmp = txdr_unsigned(VAL); \ + (E) = xb_add_bytes((XB), (void*)&__tmp, XDRWORD, 0); \ + } while (0) + +/* add a 64-bit value, converted with txdr_hyper(), as two XDR words */ +#define xb_add_64(E, XB, VAL) \ + do { \ + uint64_t __tmp1, __tmp2; \ + if (E) break; \ + __tmp1 = (VAL); \ + txdr_hyper(&__tmp1, &__tmp2); \ + (E) = xb_add_bytes((XB), (char*)&__tmp2, 2 * XDRWORD, 0); \ + } while (0) + +/* add an array of XDR words: the count LEN followed by LEN 32-bit elements of A */ +#define xb_add_word_array(E, XB, A, LEN) \ + do { \ + uint32_t 
__i; \ + xb_add_32((E), (XB), (LEN)); \ + for (__i=0; __i < (uint32_t)(LEN); __i++) \ + xb_add_32((E), (XB), (A)[__i]); \ + } while (0) +#define xb_add_bitmap(E, XB, B, LEN) xb_add_word_array((E), (XB), (B), (LEN)) + +/* add a file handle: 32-bit length FHLEN, then FHLEN bytes from FHP (padded by xb_add_bytes) */ +#define xb_add_fh(E, XB, FHP, FHLEN) \ + do { \ + xb_add_32((E), (XB), (FHLEN)); \ + if (E) break; \ + (E) = xb_add_bytes((XB), (char*)(FHP), (FHLEN), 0); \ + } while (0) + +/* add a string: 32-bit length LEN, then LEN bytes from S (padded by xb_add_bytes); no terminator is added */ +#define xb_add_string(E, XB, S, LEN) \ + do { \ + xb_add_32((E), (XB), (LEN)); \ + if (E) break; \ + (E) = xb_add_bytes((XB), (const char*)(S), (LEN), 0); \ + } while (0) + + +/* + * macros for decoding XDR data. Each takes an error variable E and does nothing once E is nonzero. + */ + +/* skip past LEN bytes of data in the buffer (see xb_advance) */ +#define xb_skip(E, XB, LEN) \ + do { \ + if (E) break; \ + (E) = xb_advance((XB), (LEN)); \ + } while (0) + +/* get a 32-bit value into LVAL, converted with fxdr_unsigned(); LVAL is not assigned on error */ +#define xb_get_32(E, XB, LVAL) \ + do { \ + uint32_t __tmp; \ + if (E) break; \ + (E) = xb_get_bytes((XB), (char*)&__tmp, XDRWORD, 0); \ + if (E) break; \ + (LVAL) = fxdr_unsigned(uint32_t, __tmp); \ + } while (0) + +/* get a 64-bit value (two XDR words) into LVAL via fxdr_hyper(); LVAL is not assigned on error */ +#define xb_get_64(E, XB, LVAL) \ + do { \ + uint64_t __tmp; \ + if (E) break; \ + (E) = xb_get_bytes((XB), (char*)&__tmp, 2 * XDRWORD, 0); \ + if (E) break; \ + fxdr_hyper(&__tmp, &(LVAL)); \ + } while (0) + +/* get an array of XDR words (of a given expected/maximum length LEN): extra encoded words are skipped, unfilled slots of A are zeroed, and LEN (an lvalue) is set to the encoded count, which may exceed the capacity of A */ +#define xb_get_word_array(E, XB, A, LEN) \ + do { \ + uint32_t __len = 0, __i; \ + xb_get_32((E), (XB), __len); \ + if (E) break; \ + for (__i=0; __i < MIN(__len, (uint32_t)(LEN)); __i++) \ + xb_get_32((E), (XB), (A)[__i]); \ + if (E) break; \ + for (; __i < __len; __i++) \ + xb_skip((E), (XB), XDRWORD); \ + for (; __i < (uint32_t)(LEN); __i++) \ + (A)[__i] = 0; \ + (LEN) = __len; \ + } while (0) +#define xb_get_bitmap(E, XB, B, LEN) xb_get_word_array((E), (XB), (B), (LEN)) + #endif /* __APPLE_API_PRIVATE */ #endif /* _NFS_XDR_SUBS_H_ */ diff --git a/bsd/ppc/Makefile b/bsd/ppc/Makefile deleted file mode 100644 index 21878d7f3..000000000 --- 
a/bsd/ppc/Makefile +++ /dev/null @@ -1,33 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -DATAFILES = \ - endian.h fasttrap_isa.h param.h profile.h \ - setjmp.h signal.h limits.h _limits.h \ - types.h vmparam.h _structs.h _types.h _param.h - -KERNELFILES = \ - disklabel.h \ - endian.h param.h profile.h \ - signal.h limits.h _limits.h \ - types.h vmparam.h _structs.h _types.h _param.h - -INSTALL_MD_LIST = ${DATAFILES} -INSTALL_MD_LCL_LIST = ${DATAFILES} disklabel.h - -INSTALL_MD_DIR = ppc - -EXPORT_MD_LIST = ${KERNELFILES} - -EXPORT_MD_DIR = ppc - -include $(MakeInc_rule) -include $(MakeInc_dir) - - diff --git a/bsd/ppc/_limits.h b/bsd/ppc/_limits.h deleted file mode 100644 index d512ec411..000000000 --- a/bsd/ppc/_limits.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. 
- * - * @APPLE_LICENSE_HEADER_END@ - */ -#ifndef _PPC__LIMITS_H_ -#define _PPC__LIMITS_H_ - -#define __DARWIN_CLK_TCK 100 /* ticks per second */ - -#endif /* _PPC__LIMITS_H_ */ diff --git a/bsd/ppc/_param.h b/bsd/ppc/_param.h deleted file mode 100644 index 938fc499f..000000000 --- a/bsd/ppc/_param.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2008 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef _PPC__PARAM_H_ -#define _PPC__PARAM_H_ - -#include - -/* - * Round p (pointer or byte index) up to a correctly-aligned value for all - * data types (int, long, ...). The result is unsigned int and must be - * cast to any desired pointer type. 
- */ -#define __DARWIN_ALIGNBYTES (sizeof(__darwin_size_t) - 1) -#define __DARWIN_ALIGN(p) ((__darwin_size_t)((char *)(__darwin_size_t)(p) + __DARWIN_ALIGNBYTES) &~ __DARWIN_ALIGNBYTES) - -#define __DARWIN_ALIGNBYTES32 (sizeof(__uint32_t) - 1) -#define __DARWIN_ALIGN32(p) ((__darwin_size_t)((char *)(__darwin_size_t)(p) + __DARWIN_ALIGNBYTES32) &~ __DARWIN_ALIGNBYTES32) - - -#endif /* _PPC__PARAM_H_ */ diff --git a/bsd/ppc/_structs.h b/bsd/ppc/_structs.h deleted file mode 100644 index c028f7efb..000000000 --- a/bsd/ppc/_structs.h +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include - -#ifdef __need_mcontext_t -#ifndef __need_struct_mcontext -#define __need_struct_mcontext -#endif /* __need_struct_mcontext */ -#endif /* __need_mcontext_t */ - -#ifdef __need_mcontext64_t -#ifndef __need_struct_mcontext64 -#define __need_struct_mcontext64 -#endif /* __need_struct_mcontext64 */ -#endif /* __need_mcontext64_t */ - -#if defined(__need_struct_mcontext) || defined(__need_struct_mcontext64) -#include -#endif /* __need_struct_mcontext || __need_struct_mcontext64 */ - -#ifdef __need_struct_mcontext -#undef __need_struct_mcontext -#ifndef _STRUCT_MCONTEXT -#if __DARWIN_UNIX03 -#define _STRUCT_MCONTEXT struct __darwin_mcontext -_STRUCT_MCONTEXT -{ - _STRUCT_PPC_EXCEPTION_STATE __es; - _STRUCT_PPC_THREAD_STATE __ss; - _STRUCT_PPC_FLOAT_STATE __fs; - _STRUCT_PPC_VECTOR_STATE __vs; -}; -#else /* !__DARWIN_UNIX03 */ -#define _STRUCT_MCONTEXT struct mcontext -_STRUCT_MCONTEXT -{ - _STRUCT_PPC_EXCEPTION_STATE es; - _STRUCT_PPC_THREAD_STATE ss; - _STRUCT_PPC_FLOAT_STATE fs; - _STRUCT_PPC_VECTOR_STATE vs; -}; -#endif /* __DARWIN_UNIX03 */ -#endif /* _STRUCT_MCONTEXT */ -#endif /* __need_struct_mcontext */ - -#ifdef __need_struct_mcontext64 -#undef __need_struct_mcontext64 -#ifndef _STRUCT_MCONTEXT64 -#if __DARWIN_UNIX03 -#define _STRUCT_MCONTEXT64 struct __darwin_mcontext64 -_STRUCT_MCONTEXT64 -{ - _STRUCT_PPC_EXCEPTION_STATE64 __es; - _STRUCT_PPC_THREAD_STATE64 __ss; - _STRUCT_PPC_FLOAT_STATE __fs; - _STRUCT_PPC_VECTOR_STATE __vs; -}; -#else /* !__DARWIN_UNIX03 */ -#define _STRUCT_MCONTEXT64 struct mcontext64 -_STRUCT_MCONTEXT64 -{ - _STRUCT_PPC_EXCEPTION_STATE64 es; - _STRUCT_PPC_THREAD_STATE64 ss; - _STRUCT_PPC_FLOAT_STATE fs; - _STRUCT_PPC_VECTOR_STATE vs; -}; -#endif /* __DARWIN_UNIX03 */ -#endif /* _STRUCT_MCONTEXT64 */ -#endif /* __need_struct_mcontext64 */ - -#ifdef __need_mcontext_t -#undef __need_mcontext_t -#ifndef _MCONTEXT_T -#define _MCONTEXT_T -typedef _STRUCT_MCONTEXT *mcontext_t; 
-#endif /* _MCONTEXT_T */ -#endif /* __need_mcontext_t */ - -#ifdef __need_mcontext64_t -#undef __need_mcontext64_t -#ifndef _MCONTEXT64_T -#define _MCONTEXT64_T -typedef _STRUCT_MCONTEXT64 *mcontext64_t; -#endif /* _MCONTEXT64_T */ -#endif /* __need_mcontext64_t */ - -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -#ifndef PPC_MCONTEXT_SIZE -#define PPC_MCONTEXT_SIZE (PPC_THREAD_STATE_COUNT + PPC_FLOAT_STATE_COUNT + PPC_EXCEPTION_STATE_COUNT + PPC_VECTOR_STATE_COUNT) * sizeof(int) -#endif /* PPC_MCONTEXT_SIZE */ -#ifndef PPC_MCONTEXT64_SIZE -#define PPC_MCONTEXT64_SIZE (PPC_THREAD_STATE64_COUNT + PPC_FLOAT_STATE_COUNT + PPC_EXCEPTION_STATE_COUNT + PPC_VECTOR_STATE_COUNT) * sizeof(int) -#endif /* PPC_MCONTEXT64_SIZE */ -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ - -/* - * For now, just duplicate the 32-bit context as the generic one. - */ -#ifdef __need_struct_sigcontext -#undef __need_struct_sigcontext -#ifndef _STRUCT_SIGCONTEXT -#if __DARWIN_UNIX03 /* signal.h needs struct sigcontext visible */ -#define _STRUCT_SIGCONTEXT struct __darwin_sigcontext -_STRUCT_SIGCONTEXT -{ - int __sc_onstack; /* sigstack state to restore */ - int __sc_mask; /* signal mask to restore */ - int __sc_ir; /* pc */ - int __sc_psw; /* processor status word */ - int __sc_sp; /* stack pointer if sc_regs == NULL */ - void *__sc_regs; /* (kernel private) saved state */ -}; -#else /* !__DARWIN_UNIX03 */ -#define _STRUCT_SIGCONTEXT struct sigcontext -_STRUCT_SIGCONTEXT -{ - int sc_onstack; /* sigstack state to restore */ - int sc_mask; /* signal mask to restore */ - int sc_ir; /* pc */ - int sc_psw; /* processor status word */ - int sc_sp; /* stack pointer if sc_regs == NULL */ - void *sc_regs; /* (kernel private) saved state */ -}; -#endif /* __DARWIN_UNIX03 */ -#endif /* _STRUCT_SIGCONTEXT */ -#endif /* __need_struct_sigcontext */ - -/* - * Information pushed on stack when a signal is delivered. 
- * This is used by the kernel to restore state following - * execution of the signal handler. It is also made available - * to the handler to allow it to properly restore state if - * a non-standard exit is performed. - */ -#ifdef __need_struct_sigcontext32 -#undef __need_struct_sigcontext32 -#ifndef _STRUCT_SIGCONTEXT32 -#if __DARWIN_UNIX03 -#define _STRUCT_SIGCONTEXT32 struct __darwin_sigcontext32 -_STRUCT_SIGCONTEXT32 -{ - int __sc_onstack; /* sigstack state to restore */ - int __sc_mask; /* signal mask to restore */ - int __sc_ir; /* pc */ - int __sc_psw; /* processor status word */ - int __sc_sp; /* stack pointer if sc_regs == NULL */ - void *__sc_regs; /* (kernel private) saved state */ -}; -#else /* !__DARWIN_UNIX03 */ -#define _STRUCT_SIGCONTEXT32 struct sigcontext32 -_STRUCT_SIGCONTEXT32 -{ - int sc_onstack; /* sigstack state to restore */ - int sc_mask; /* signal mask to restore */ - int sc_ir; /* pc */ - int sc_psw; /* processor status word */ - int sc_sp; /* stack pointer if sc_regs == NULL */ - void *sc_regs; /* (kernel private) saved state */ -}; -#endif /* __DARWIN_UNIX03 */ -#endif /* _STRUCT_SIGCONTEXT32 */ -#endif /* __need_struct_sigcontext32 */ - -#ifdef __need_struct_sigcontext64 -#undef __need_struct_sigcontext64 -#ifndef _STRUCT_SIGCONTEXT64 -#if __DARWIN_UNIX03 -#define _STRUCT_SIGCONTEXT64 struct __darwin_sigcontext64 -_STRUCT_SIGCONTEXT64 -{ - int __sc_onstack; /* sigstack state to restore */ - int __sc_mask; /* signal mask to restore */ - long long __sc_ir; /* pc */ - long long __sc_psw; /* processor status word */ - long long __sc_sp; /* stack pointer if sc_regs == NULL */ - void *__sc_regs; /* (kernel private) saved state */ -}; -#else /* !__DARWIN_UNIX03 */ -#define _STRUCT_SIGCONTEXT64 struct sigcontext64 -_STRUCT_SIGCONTEXT64 -{ - int sc_onstack; /* sigstack state to restore */ - int sc_mask; /* signal mask to restore */ - long long sc_ir; /* pc */ - long long sc_psw; /* processor status word */ - long long sc_sp; /* stack pointer 
if sc_regs == NULL */ - void *sc_regs; /* (kernel private) saved state */ -}; -#endif /* __DARWIN_UNIX03 */ -#endif /* _STRUCT_SIGCONTEXT64 */ -#endif /* __need_struct_sigcontext64 */ diff --git a/bsd/ppc/_types.h b/bsd/ppc/_types.h deleted file mode 100644 index 4b7855988..000000000 --- a/bsd/ppc/_types.h +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#ifndef _BSD_PPC__TYPES_H_ -#define _BSD_PPC__TYPES_H_ - -/* - * This header file contains integer types. It's intended to also contain - * flotaing point and other arithmetic types, as needed, later. 
- */ - -#ifdef __GNUC__ -typedef __signed char __int8_t; -#else /* !__GNUC__ */ -typedef char __int8_t; -#endif /* !__GNUC__ */ -typedef unsigned char __uint8_t; -typedef short __int16_t; -typedef unsigned short __uint16_t; -typedef int __int32_t; -typedef unsigned int __uint32_t; -typedef long long __int64_t; -typedef unsigned long long __uint64_t; - -typedef long __darwin_intptr_t; -typedef unsigned int __darwin_natural_t; - -/* - * The rune type below is declared to be an ``int'' instead of the more natural - * ``unsigned long'' or ``long''. Two things are happening here. It is not - * unsigned so that EOF (-1) can be naturally assigned to it and used. Also, - * it looks like 10646 will be a 31 bit standard. This means that if your - * ints cannot hold 32 bits, you will be in trouble. The reason an int was - * chosen over a long is that the is*() and to*() routines take ints (says - * ANSI C), but they use __darwin_ct_rune_t instead of int. By changing it - * here, you lose a bit of ANSI conformance, but your programs will still - * work. - * - * NOTE: rune_t is not covered by ANSI nor other standards, and should not - * be instantiated outside of lib/libc/locale. Use wchar_t. wchar_t and - * rune_t must be the same type. Also wint_t must be no narrower than - * wchar_t, and should also be able to hold all members of the largest - * character set plus one extra value (WEOF). wint_t must be at least 16 bits. - */ - -typedef int __darwin_ct_rune_t; /* ct_rune_t */ - -/* - * mbstate_t is an opaque object to keep conversion state, during multibyte - * stream conversions. The content must not be referenced by user programs. 
- */ -typedef union { - char __mbstate8[128]; - long long _mbstateL; /* for alignment */ -} __mbstate_t; - -typedef __mbstate_t __darwin_mbstate_t; /* mbstate_t */ - -#if defined(__GNUC__) && defined(__PTRDIFF_TYPE__) -typedef __PTRDIFF_TYPE__ __darwin_ptrdiff_t; /* ptr1 - ptr2 */ -#else -typedef int __darwin_ptrdiff_t; /* ptr1 - ptr2 */ -#endif /* __GNUC__ */ - -#if defined(__GNUC__) && defined(__SIZE_TYPE__) -typedef __SIZE_TYPE__ __darwin_size_t; /* sizeof() */ -#else -typedef unsigned long __darwin_size_t; /* sizeof() */ -#endif - -#if (__GNUC__ > 2) -typedef __builtin_va_list __darwin_va_list; /* va_list */ -#else -typedef char * __darwin_va_list; /* va_list */ -#endif - -#if defined(__GNUC__) && defined(__WCHAR_TYPE__) -typedef __WCHAR_TYPE__ __darwin_wchar_t; /* wchar_t */ -#else -typedef __darwin_ct_rune_t __darwin_wchar_t; /* wchar_t */ -#endif - -typedef __darwin_wchar_t __darwin_rune_t; /* rune_t */ - -#if defined(__GNUC__) && defined(__WINT_TYPE__) -typedef __WINT_TYPE__ __darwin_wint_t; /* wint_t */ -#else -typedef __darwin_ct_rune_t __darwin_wint_t; /* wint_t */ -#endif - -typedef unsigned long __darwin_clock_t; /* clock() */ -typedef __uint32_t __darwin_socklen_t; /* socklen_t (duh) */ -typedef long __darwin_ssize_t; /* byte count or error */ -typedef long __darwin_time_t; /* time() */ - -#endif /* _BSD_PPC__TYPES_H_ */ diff --git a/bsd/ppc/decodePPC.h b/bsd/ppc/decodePPC.h deleted file mode 100644 index 8fb4756f6..000000000 --- a/bsd/ppc/decodePPC.h +++ /dev/null @@ -1,919 +0,0 @@ -/* - * Copyright (c) 2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -struct dcdtab { - - uint8_t dcdFlgs; /* Flags needed to decode */ -#define dcdStep 0x80 /* Step to next table entry on non-match */ -#define dcdJump 0x40 /* Jump to new entry in table. Index is in dcdMatch. */ -#define dcdMask 0x0F /* Index into mask table. 
0 matches everything */ - - uint8_t dcdType; /* Instruction type */ -#define diINV 0x00 -#define diTRP 0x01 -#define diSC 0x02 -#define diRFI 0x03 -#define diB 0x04 -#define diBC 0x05 -#define diBLR 0x06 -#define diBCTR 0x07 -#define diOR 0x08 -#define diSPR 0x09 -#define diCMN 0x0A -#define diPRV 0x0B - - uint16_t dcdMatch; /* Extended op code to match */ -}; - -typedef struct dcdtab dcdtab; - -static uint16_t masktab[] = {0x0000, 0x0003, 0x001C, 0x001E, 0x003E, /* Table of extended op masks */ - 0x003F, 0x03FE, 0x03FF, 0x07FC, 0x07FE, 0x07FF}; - -static dcdtab insts[] = { - { 0x40, 0, 64 }, // 0 Maj op = 0, jump to entry 64 - { 0x00, diINV, 0x0000 }, // 1 Maj op = 1, invalid - { 0x00, diTRP, 0x0000 }, // 2 Maj op = 2, tdi - { 0x00, diTRP, 0x0000 }, // 3 Maj op = 3, twi - { 0x40, 0, 65 }, // 4 Maj op = 4, jump to entry 65 - { 0x00, diINV, 0x0000 }, // 5 Maj op = 5, invalid - { 0x00, diINV, 0x0000 }, // 6 Maj op = 6, invalid - { 0x00, diCMN, 0x0000 }, // 7 Maj op = 7, mulli - { 0x00, diCMN, 0x0000 }, // 8 Maj op = 8, subfic - { 0x00, diINV, 0x0000 }, // 9 Maj op = 9, invalid - { 0x00, diCMN, 0x0000 }, // 10 Maj op = 10, cmpli - { 0x00, diCMN, 0x0000 }, // 11 Maj op = 11, cmpi - { 0x00, diCMN, 0x0000 }, // 12 Maj op = 12, addic - { 0x00, diCMN, 0x0000 }, // 13 Maj op = 13, addic. 
- { 0x00, diCMN, 0x0000 }, // 14 Maj op = 14, addi - { 0x00, diCMN, 0x0000 }, // 15 Maj op = 15, addis - { 0x00, diBC, 0x0000 }, // 16 Maj op = 16, bc - { 0x00, diSC, 0x0000 }, // 17 Maj op = 17, sc - { 0x00, diB, 0x0000 }, // 18 Maj op = 18, b - { 0x40, 0, 209 }, // 19 Maj op = 19, jump to entry 209 - { 0x00, diCMN, 0x0000 }, // 20 Maj op = 20, rlwimi - { 0x00, diCMN, 0x0000 }, // 21 Maj op = 21, rlwinm - { 0x00, diINV, 0x0000 }, // 22 Maj op = 22, invalid - { 0x00, diCMN, 0x0000 }, // 23 Maj op = 23, rlwnm - { 0x00, diOR, 0x0000 }, // 24 Maj op = 24, ori - { 0x00, diCMN, 0x0000 }, // 25 Maj op = 25, oris - { 0x00, diCMN, 0x0000 }, // 26 Maj op = 26, xori - { 0x00, diCMN, 0x0000 }, // 27 Maj op = 27, xoris - { 0x00, diCMN, 0x0000 }, // 28 Maj op = 28, andi. - { 0x00, diCMN, 0x0000 }, // 29 Maj op = 29, andis. - { 0x40, 0, 224 }, // 30 Maj op = 30, jump to entry 224 - { 0x40, 0, 230 }, // 31 Maj op = 31, jump to entry 230 - { 0x00, diCMN, 0x0000 }, // 32 Maj op = 32, lwz - { 0x00, diCMN, 0x0000 }, // 33 Maj op = 33, lwzu - { 0x00, diCMN, 0x0000 }, // 34 Maj op = 34, lbz - { 0x00, diCMN, 0x0000 }, // 35 Maj op = 35, lbzu - { 0x00, diCMN, 0x0000 }, // 36 Maj op = 36, stw - { 0x00, diCMN, 0x0000 }, // 37 Maj op = 37, stwu - { 0x00, diCMN, 0x0000 }, // 38 Maj op = 38, stb - { 0x00, diCMN, 0x0000 }, // 39 Maj op = 39, stbu - { 0x00, diCMN, 0x0000 }, // 40 Maj op = 40, lhz - { 0x00, diCMN, 0x0000 }, // 41 Maj op = 41, lhzu - { 0x00, diCMN, 0x0000 }, // 42 Maj op = 42, lha - { 0x00, diCMN, 0x0000 }, // 43 Maj op = 43, lhau - { 0x00, diCMN, 0x0000 }, // 44 Maj op = 44, sth - { 0x00, diCMN, 0x0000 }, // 45 Maj op = 45, sthu - { 0x00, diCMN, 0x0000 }, // 46 Maj op = 46, lmw - { 0x00, diCMN, 0x0000 }, // 47 Maj op = 47, stmw - { 0x00, diCMN, 0x0000 }, // 48 Maj op = 48, lfs - { 0x00, diCMN, 0x0000 }, // 49 Maj op = 49, lfsu - { 0x00, diCMN, 0x0000 }, // 50 Maj op = 50, lfd - { 0x00, diCMN, 0x0000 }, // 51 Maj op = 51, lfdu - { 0x00, diCMN, 0x0000 }, // 52 Maj op = 52, stfs - 
{ 0x00, diCMN, 0x0000 }, // 53 Maj op = 53, stfsu - { 0x00, diCMN, 0x0000 }, // 54 Maj op = 54, stfd - { 0x00, diCMN, 0x0000 }, // 55 Maj op = 55, stfdu - { 0x00, diINV, 0x0000 }, // 56 Maj op = 56, invalid - { 0x00, diINV, 0x0000 }, // 57 Maj op = 57, invalid - { 0x40, 0, 365 }, // 58 Maj op = 58, jump to entry 365 - { 0x40, 0, 368 }, // 59 Maj op = 59, jump to entry 368 - { 0x00, diINV, 0x0000 }, // 60 Maj op = 60, invalid - { 0x00, diINV, 0x0000 }, // 61 Maj op = 61, invalid - { 0x40, 0, 378 }, // 62 Maj op = 62, jump to entry 378 - { 0x40, 0, 380 }, // 63 Maj op = 63, jump to entry 380 - { 0x09, diCMN, 0x0200 }, // 64 Maj op = 0, mask = 07FE, xop = 0x0200 ( 256) - attn - { 0x85, diCMN, 0x0020 }, // 65 Maj op = 4, mask = 003F, xop = 0x0020 ( 32) - vmhaddshs - { 0x85, diCMN, 0x0021 }, // 66 Maj op = 4, mask = 003F, xop = 0x0021 ( 33) - vmhraddshs - { 0x85, diCMN, 0x0022 }, // 67 Maj op = 4, mask = 003F, xop = 0x0022 ( 34) - vmladduhm - { 0x85, diCMN, 0x0024 }, // 68 Maj op = 4, mask = 003F, xop = 0x0024 ( 36) - vmsumubm - { 0x85, diCMN, 0x0025 }, // 69 Maj op = 4, mask = 003F, xop = 0x0025 ( 37) - vmsummbm - { 0x85, diCMN, 0x0026 }, // 70 Maj op = 4, mask = 003F, xop = 0x0026 ( 38) - vmsumuhm - { 0x85, diCMN, 0x0027 }, // 71 Maj op = 4, mask = 003F, xop = 0x0027 ( 39) - vmsumuhs - { 0x85, diCMN, 0x0028 }, // 72 Maj op = 4, mask = 003F, xop = 0x0028 ( 40) - vmsumshm - { 0x85, diCMN, 0x0029 }, // 73 Maj op = 4, mask = 003F, xop = 0x0029 ( 41) - vmsumshs - { 0x85, diCMN, 0x002A }, // 74 Maj op = 4, mask = 003F, xop = 0x002A ( 42) - vsel - { 0x85, diCMN, 0x002B }, // 75 Maj op = 4, mask = 003F, xop = 0x002B ( 43) - vperm - { 0x85, diCMN, 0x002C }, // 76 Maj op = 4, mask = 003F, xop = 0x002C ( 44) - vsldoi - { 0x85, diCMN, 0x002E }, // 77 Maj op = 4, mask = 003F, xop = 0x002E ( 46) - vmaddfp - { 0x85, diCMN, 0x002F }, // 78 Maj op = 4, mask = 003F, xop = 0x002F ( 47) - vnmsubfp - { 0x87, diCMN, 0x0006 }, // 79 Maj op = 4, mask = 03FF, xop = 0x0006 ( 6) - vcmpequb - { 
0x87, diCMN, 0x0046 }, // 80 Maj op = 4, mask = 03FF, xop = 0x0046 ( 70) - vcmpequh - { 0x87, diCMN, 0x0086 }, // 81 Maj op = 4, mask = 03FF, xop = 0x0086 ( 134) - vcmpequw - { 0x87, diCMN, 0x00C6 }, // 82 Maj op = 4, mask = 03FF, xop = 0x00C6 ( 198) - vcmpeqfp - { 0x87, diCMN, 0x01C6 }, // 83 Maj op = 4, mask = 03FF, xop = 0x01C6 ( 454) - vcmpgefp - { 0x87, diCMN, 0x0206 }, // 84 Maj op = 4, mask = 03FF, xop = 0x0206 ( 518) - vcmpgtub - { 0x87, diCMN, 0x0246 }, // 85 Maj op = 4, mask = 03FF, xop = 0x0246 ( 582) - vcmpgtuh - { 0x87, diCMN, 0x0286 }, // 86 Maj op = 4, mask = 03FF, xop = 0x0286 ( 646) - vcmpgtuw - { 0x87, diCMN, 0x02C6 }, // 87 Maj op = 4, mask = 03FF, xop = 0x02C6 ( 710) - vcmpgtfp - { 0x87, diCMN, 0x0306 }, // 88 Maj op = 4, mask = 03FF, xop = 0x0306 ( 774) - vcmpgtsb - { 0x87, diCMN, 0x0346 }, // 89 Maj op = 4, mask = 03FF, xop = 0x0346 ( 838) - vcmpgtsh - { 0x87, diCMN, 0x0386 }, // 90 Maj op = 4, mask = 03FF, xop = 0x0386 ( 902) - vcmpgtsw - { 0x87, diCMN, 0x03C6 }, // 91 Maj op = 4, mask = 03FF, xop = 0x03C6 ( 966) - vcmpbfp - { 0x8A, diCMN, 0x0000 }, // 92 Maj op = 4, mask = 07FF, xop = 0x0000 ( 0) - vaddubm - { 0x8A, diCMN, 0x0002 }, // 93 Maj op = 4, mask = 07FF, xop = 0x0002 ( 2) - vmaxub - { 0x8A, diCMN, 0x0004 }, // 94 Maj op = 4, mask = 07FF, xop = 0x0004 ( 4) - vrlb - { 0x8A, diCMN, 0x0008 }, // 95 Maj op = 4, mask = 07FF, xop = 0x0008 ( 8) - vmuloub - { 0x8A, diCMN, 0x000A }, // 96 Maj op = 4, mask = 07FF, xop = 0x000A ( 10) - vaddfp - { 0x8A, diCMN, 0x000C }, // 97 Maj op = 4, mask = 07FF, xop = 0x000C ( 12) - vmrghb - { 0x8A, diCMN, 0x000E }, // 98 Maj op = 4, mask = 07FF, xop = 0x000E ( 14) - vpkuhum - { 0x8A, diCMN, 0x0040 }, // 99 Maj op = 4, mask = 07FF, xop = 0x0040 ( 64) - vadduhm - { 0x8A, diCMN, 0x0042 }, // 100 Maj op = 4, mask = 07FF, xop = 0x0042 ( 66) - vmaxuh - { 0x8A, diCMN, 0x0044 }, // 101 Maj op = 4, mask = 07FF, xop = 0x0044 ( 68) - vrlh - { 0x8A, diCMN, 0x0048 }, // 102 Maj op = 4, mask = 07FF, xop = 0x0048 ( 72) - 
vmulouh - { 0x8A, diCMN, 0x004A }, // 103 Maj op = 4, mask = 07FF, xop = 0x004A ( 74) - vsubfp - { 0x8A, diCMN, 0x004C }, // 104 Maj op = 4, mask = 07FF, xop = 0x004C ( 76) - vmrghh - { 0x8A, diCMN, 0x004E }, // 105 Maj op = 4, mask = 07FF, xop = 0x004E ( 78) - vpkuwum - { 0x8A, diCMN, 0x0080 }, // 106 Maj op = 4, mask = 07FF, xop = 0x0080 ( 128) - vadduwm - { 0x8A, diCMN, 0x0082 }, // 107 Maj op = 4, mask = 07FF, xop = 0x0082 ( 130) - vmaxuw - { 0x8A, diCMN, 0x0084 }, // 108 Maj op = 4, mask = 07FF, xop = 0x0084 ( 132) - vrlw - { 0x8A, diCMN, 0x008C }, // 109 Maj op = 4, mask = 07FF, xop = 0x008C ( 140) - vmrghw - { 0x8A, diCMN, 0x008E }, // 110 Maj op = 4, mask = 07FF, xop = 0x008E ( 142) - vpkuhus - { 0x8A, diCMN, 0x00CE }, // 111 Maj op = 4, mask = 07FF, xop = 0x00CE ( 206) - vpkuwus - { 0x8A, diCMN, 0x0102 }, // 112 Maj op = 4, mask = 07FF, xop = 0x0102 ( 258) - vmaxsb - { 0x8A, diCMN, 0x0104 }, // 113 Maj op = 4, mask = 07FF, xop = 0x0104 ( 260) - vslb - { 0x8A, diCMN, 0x0108 }, // 114 Maj op = 4, mask = 07FF, xop = 0x0108 ( 264) - vmulosb - { 0x8A, diCMN, 0x010A }, // 115 Maj op = 4, mask = 07FF, xop = 0x010A ( 266) - vrefp - { 0x8A, diCMN, 0x010C }, // 116 Maj op = 4, mask = 07FF, xop = 0x010C ( 268) - vmrglb - { 0x8A, diCMN, 0x010E }, // 117 Maj op = 4, mask = 07FF, xop = 0x010E ( 270) - vpkshus - { 0x8A, diCMN, 0x0142 }, // 118 Maj op = 4, mask = 07FF, xop = 0x0142 ( 322) - vmaxsh - { 0x8A, diCMN, 0x0144 }, // 119 Maj op = 4, mask = 07FF, xop = 0x0144 ( 324) - vslh - { 0x8A, diCMN, 0x0148 }, // 120 Maj op = 4, mask = 07FF, xop = 0x0148 ( 328) - vmulosh - { 0x8A, diCMN, 0x014A }, // 121 Maj op = 4, mask = 07FF, xop = 0x014A ( 330) - vrsqrtefp - { 0x8A, diCMN, 0x014C }, // 122 Maj op = 4, mask = 07FF, xop = 0x014C ( 332) - vmrglh - { 0x8A, diCMN, 0x014E }, // 123 Maj op = 4, mask = 07FF, xop = 0x014E ( 334) - vpkswus - { 0x8A, diCMN, 0x0180 }, // 124 Maj op = 4, mask = 07FF, xop = 0x0180 ( 384) - vaddcuw - { 0x8A, diCMN, 0x0182 }, // 125 Maj op = 4, mask = 
07FF, xop = 0x0182 ( 386) - vmaxsw - { 0x8A, diCMN, 0x0184 }, // 126 Maj op = 4, mask = 07FF, xop = 0x0184 ( 388) - vslw - { 0x8A, diCMN, 0x018A }, // 127 Maj op = 4, mask = 07FF, xop = 0x018A ( 394) - vexptefp - { 0x8A, diCMN, 0x018C }, // 128 Maj op = 4, mask = 07FF, xop = 0x018C ( 396) - vmrglw - { 0x8A, diCMN, 0x018E }, // 129 Maj op = 4, mask = 07FF, xop = 0x018E ( 398) - vpkshss - { 0x8A, diCMN, 0x01C4 }, // 130 Maj op = 4, mask = 07FF, xop = 0x01C4 ( 452) - vsl - { 0x8A, diCMN, 0x01CA }, // 131 Maj op = 4, mask = 07FF, xop = 0x01CA ( 458) - vlogefp - { 0x8A, diCMN, 0x01CE }, // 132 Maj op = 4, mask = 07FF, xop = 0x01CE ( 462) - vpkswss - { 0x8A, diCMN, 0x0200 }, // 133 Maj op = 4, mask = 07FF, xop = 0x0200 ( 512) - vaddubs - { 0x8A, diCMN, 0x0202 }, // 134 Maj op = 4, mask = 07FF, xop = 0x0202 ( 514) - vminub - { 0x8A, diCMN, 0x0204 }, // 135 Maj op = 4, mask = 07FF, xop = 0x0204 ( 516) - vsrb - { 0x8A, diCMN, 0x0208 }, // 136 Maj op = 4, mask = 07FF, xop = 0x0208 ( 520) - vmuleub - { 0x8A, diCMN, 0x020A }, // 137 Maj op = 4, mask = 07FF, xop = 0x020A ( 522) - vrfin - { 0x8A, diCMN, 0x020C }, // 138 Maj op = 4, mask = 07FF, xop = 0x020C ( 524) - vspltb - { 0x8A, diCMN, 0x020E }, // 139 Maj op = 4, mask = 07FF, xop = 0x020E ( 526) - vupkhsb - { 0x8A, diCMN, 0x0240 }, // 140 Maj op = 4, mask = 07FF, xop = 0x0240 ( 576) - vadduhs - { 0x8A, diCMN, 0x0242 }, // 141 Maj op = 4, mask = 07FF, xop = 0x0242 ( 578) - vminuh - { 0x8A, diCMN, 0x0244 }, // 142 Maj op = 4, mask = 07FF, xop = 0x0244 ( 580) - vsrh - { 0x8A, diCMN, 0x0248 }, // 143 Maj op = 4, mask = 07FF, xop = 0x0248 ( 584) - vmuleuh - { 0x8A, diCMN, 0x024A }, // 144 Maj op = 4, mask = 07FF, xop = 0x024A ( 586) - vrfiz - { 0x8A, diCMN, 0x024C }, // 145 Maj op = 4, mask = 07FF, xop = 0x024C ( 588) - vsplth - { 0x8A, diCMN, 0x024E }, // 146 Maj op = 4, mask = 07FF, xop = 0x024E ( 590) - vupkhsh - { 0x8A, diCMN, 0x0280 }, // 147 Maj op = 4, mask = 07FF, xop = 0x0280 ( 640) - vadduws - { 0x8A, diCMN, 0x0282 }, 
// 148 Maj op = 4, mask = 07FF, xop = 0x0282 ( 642) - vminuw - { 0x8A, diCMN, 0x0284 }, // 149 Maj op = 4, mask = 07FF, xop = 0x0284 ( 644) - vsrw - { 0x8A, diCMN, 0x028A }, // 150 Maj op = 4, mask = 07FF, xop = 0x028A ( 650) - vrfip - { 0x8A, diCMN, 0x028C }, // 151 Maj op = 4, mask = 07FF, xop = 0x028C ( 652) - vspltw - { 0x8A, diCMN, 0x028E }, // 152 Maj op = 4, mask = 07FF, xop = 0x028E ( 654) - vupklsb - { 0x8A, diCMN, 0x02C4 }, // 153 Maj op = 4, mask = 07FF, xop = 0x02C4 ( 708) - vsr - { 0x8A, diCMN, 0x02CA }, // 154 Maj op = 4, mask = 07FF, xop = 0x02CA ( 714) - vrfim - { 0x8A, diCMN, 0x02CE }, // 155 Maj op = 4, mask = 07FF, xop = 0x02CE ( 718) - vupklsh - { 0x8A, diCMN, 0x0300 }, // 156 Maj op = 4, mask = 07FF, xop = 0x0300 ( 768) - vaddsbs - { 0x8A, diCMN, 0x0302 }, // 157 Maj op = 4, mask = 07FF, xop = 0x0302 ( 770) - vminsb - { 0x8A, diCMN, 0x0304 }, // 158 Maj op = 4, mask = 07FF, xop = 0x0304 ( 772) - vsrab - { 0x8A, diCMN, 0x0308 }, // 159 Maj op = 4, mask = 07FF, xop = 0x0308 ( 776) - vmulesb - { 0x8A, diCMN, 0x030A }, // 160 Maj op = 4, mask = 07FF, xop = 0x030A ( 778) - vcfux - { 0x8A, diCMN, 0x030C }, // 161 Maj op = 4, mask = 07FF, xop = 0x030C ( 780) - vspltisb - { 0x8A, diCMN, 0x030E }, // 162 Maj op = 4, mask = 07FF, xop = 0x030E ( 782) - vpkpx - { 0x8A, diCMN, 0x0340 }, // 163 Maj op = 4, mask = 07FF, xop = 0x0340 ( 832) - vaddshs - { 0x8A, diCMN, 0x0342 }, // 164 Maj op = 4, mask = 07FF, xop = 0x0342 ( 834) - vminsh - { 0x8A, diCMN, 0x0344 }, // 165 Maj op = 4, mask = 07FF, xop = 0x0344 ( 836) - vsrah - { 0x8A, diCMN, 0x0348 }, // 166 Maj op = 4, mask = 07FF, xop = 0x0348 ( 840) - vmulesh - { 0x8A, diCMN, 0x034A }, // 167 Maj op = 4, mask = 07FF, xop = 0x034A ( 842) - vcfsx - { 0x8A, diCMN, 0x034C }, // 168 Maj op = 4, mask = 07FF, xop = 0x034C ( 844) - vspltish - { 0x8A, diCMN, 0x034E }, // 169 Maj op = 4, mask = 07FF, xop = 0x034E ( 846) - vupkhpx - { 0x8A, diCMN, 0x0380 }, // 170 Maj op = 4, mask = 07FF, xop = 0x0380 ( 896) - vaddsws - 
{ 0x8A, diCMN, 0x0382 }, // 171 Maj op = 4, mask = 07FF, xop = 0x0382 ( 898) - vminsw - { 0x8A, diCMN, 0x0384 }, // 172 Maj op = 4, mask = 07FF, xop = 0x0384 ( 900) - vsraw - { 0x8A, diCMN, 0x038A }, // 173 Maj op = 4, mask = 07FF, xop = 0x038A ( 906) - vctuxs - { 0x8A, diCMN, 0x038C }, // 174 Maj op = 4, mask = 07FF, xop = 0x038C ( 908) - vspltisw - { 0x8A, diCMN, 0x03CA }, // 175 Maj op = 4, mask = 07FF, xop = 0x03CA ( 970) - vctsxs - { 0x8A, diCMN, 0x03CE }, // 176 Maj op = 4, mask = 07FF, xop = 0x03CE ( 974) - vupklpx - { 0x8A, diCMN, 0x0400 }, // 177 Maj op = 4, mask = 07FF, xop = 0x0400 (1024) - vsububm - { 0x8A, diCMN, 0x0402 }, // 178 Maj op = 4, mask = 07FF, xop = 0x0402 (1026) - vavgub - { 0x8A, diCMN, 0x0404 }, // 179 Maj op = 4, mask = 07FF, xop = 0x0404 (1028) - vand - { 0x8A, diCMN, 0x040A }, // 180 Maj op = 4, mask = 07FF, xop = 0x040A (1034) - vmaxfp - { 0x8A, diCMN, 0x040C }, // 181 Maj op = 4, mask = 07FF, xop = 0x040C (1036) - vslo - { 0x8A, diCMN, 0x0440 }, // 182 Maj op = 4, mask = 07FF, xop = 0x0440 (1088) - vsubuhm - { 0x8A, diCMN, 0x0442 }, // 183 Maj op = 4, mask = 07FF, xop = 0x0442 (1090) - vavguh - { 0x8A, diCMN, 0x0444 }, // 184 Maj op = 4, mask = 07FF, xop = 0x0444 (1092) - vandc - { 0x8A, diCMN, 0x044A }, // 185 Maj op = 4, mask = 07FF, xop = 0x044A (1098) - vminfp - { 0x8A, diCMN, 0x044C }, // 186 Maj op = 4, mask = 07FF, xop = 0x044C (1100) - vsro - { 0x8A, diCMN, 0x0480 }, // 187 Maj op = 4, mask = 07FF, xop = 0x0480 (1152) - vsubuwm - { 0x8A, diCMN, 0x0482 }, // 188 Maj op = 4, mask = 07FF, xop = 0x0482 (1154) - vavguw - { 0x8A, diCMN, 0x0484 }, // 189 Maj op = 4, mask = 07FF, xop = 0x0484 (1156) - vor - { 0x8A, diCMN, 0x04C4 }, // 190 Maj op = 4, mask = 07FF, xop = 0x04C4 (1220) - vxor - { 0x8A, diCMN, 0x0502 }, // 191 Maj op = 4, mask = 07FF, xop = 0x0502 (1282) - vavgsb - { 0x8A, diCMN, 0x0504 }, // 192 Maj op = 4, mask = 07FF, xop = 0x0504 (1284) - vnor - { 0x8A, diCMN, 0x0542 }, // 193 Maj op = 4, mask = 07FF, xop = 0x0542 
(1346) - vavgsh - { 0x8A, diCMN, 0x0580 }, // 194 Maj op = 4, mask = 07FF, xop = 0x0580 (1408) - vsubcuw - { 0x8A, diCMN, 0x0582 }, // 195 Maj op = 4, mask = 07FF, xop = 0x0582 (1410) - vavgsw - { 0x8A, diCMN, 0x0600 }, // 196 Maj op = 4, mask = 07FF, xop = 0x0600 (1536) - vsububs - { 0x8A, diCMN, 0x0604 }, // 197 Maj op = 4, mask = 07FF, xop = 0x0604 (1540) - mfvscr - { 0x8A, diCMN, 0x0608 }, // 198 Maj op = 4, mask = 07FF, xop = 0x0608 (1544) - vsum4ubs - { 0x8A, diCMN, 0x0640 }, // 199 Maj op = 4, mask = 07FF, xop = 0x0640 (1600) - vsubuhs - { 0x8A, diCMN, 0x0644 }, // 200 Maj op = 4, mask = 07FF, xop = 0x0644 (1604) - mtvscr - { 0x8A, diCMN, 0x0648 }, // 201 Maj op = 4, mask = 07FF, xop = 0x0648 (1608) - vsum4shs - { 0x8A, diCMN, 0x0680 }, // 202 Maj op = 4, mask = 07FF, xop = 0x0680 (1664) - vsubuws - { 0x8A, diCMN, 0x0688 }, // 203 Maj op = 4, mask = 07FF, xop = 0x0688 (1672) - vsum2sws - { 0x8A, diCMN, 0x0700 }, // 204 Maj op = 4, mask = 07FF, xop = 0x0700 (1792) - vsubsbs - { 0x8A, diCMN, 0x0708 }, // 205 Maj op = 4, mask = 07FF, xop = 0x0708 (1800) - vsum4sbs - { 0x8A, diCMN, 0x0740 }, // 206 Maj op = 4, mask = 07FF, xop = 0x0740 (1856) - vsubshs - { 0x8A, diCMN, 0x0780 }, // 207 Maj op = 4, mask = 07FF, xop = 0x0780 (1920) - vsubsws - { 0x0A, diCMN, 0x0788 }, // 208 Maj op = 4, mask = 07FF, xop = 0x0788 (1928) - vsumsws - { 0x89, diCMN, 0x0000 }, // 209 Maj op = 19, mask = 07FE, xop = 0x0000 ( 0) - mcrf - { 0x89, diBLR, 0x0020 }, // 210 Maj op = 19, mask = 07FE, xop = 0x0020 ( 16) - bclr - { 0x89, diPRV, 0x0024 }, // 211 Maj op = 19, mask = 07FE, xop = 0x0024 ( 18) - rfid - { 0x89, diCMN, 0x0042 }, // 212 Maj op = 19, mask = 07FE, xop = 0x0042 ( 33) - crnor - { 0x89, diPRV, 0x0064 }, // 213 Maj op = 19, mask = 07FE, xop = 0x0064 ( 50) - rfi - { 0x89, diCMN, 0x0102 }, // 214 Maj op = 19, mask = 07FE, xop = 0x0102 ( 129) - crandc - { 0x89, diCMN, 0x012C }, // 215 Maj op = 19, mask = 07FE, xop = 0x012C ( 150) - isync - { 0x89, diCMN, 0x0182 }, // 216 Maj op 
= 19, mask = 07FE, xop = 0x0182 ( 193) - crxor - { 0x89, diCMN, 0x01C2 }, // 217 Maj op = 19, mask = 07FE, xop = 0x01C2 ( 225) - crnand - { 0x89, diCMN, 0x0202 }, // 218 Maj op = 19, mask = 07FE, xop = 0x0202 ( 257) - crand - { 0x89, diPRV, 0x0224 }, // 219 Maj op = 19, mask = 07FE, xop = 0x0224 ( 274) - hrfid - { 0x89, diCMN, 0x0242 }, // 220 Maj op = 19, mask = 07FE, xop = 0x0242 ( 289) - creqv - { 0x89, diCMN, 0x0342 }, // 221 Maj op = 19, mask = 07FE, xop = 0x0342 ( 417) - crorc - { 0x89, diCMN, 0x0382 }, // 222 Maj op = 19, mask = 07FE, xop = 0x0382 ( 449) - cror - { 0x09, diBCTR, 0x0420 }, // 223 Maj op = 19, mask = 07FE, xop = 0x0420 ( 528) - bctr - { 0x82, diCMN, 0x0000 }, // 224 Maj op = 30, mask = 001C, xop = 0x0000 ( 0) - rldicl - { 0x82, diCMN, 0x0004 }, // 225 Maj op = 30, mask = 001C, xop = 0x0004 ( 1) - rldicr - { 0x82, diCMN, 0x0008 }, // 226 Maj op = 30, mask = 001C, xop = 0x0008 ( 2) - rldic - { 0x82, diCMN, 0x000C }, // 227 Maj op = 30, mask = 001C, xop = 0x000C ( 3) - rldimi - { 0x83, diCMN, 0x0010 }, // 228 Maj op = 30, mask = 001E, xop = 0x0010 ( 8) - rldcl - { 0x03, diCMN, 0x0012 }, // 229 Maj op = 30, mask = 001E, xop = 0x0012 ( 9) - rldcr - { 0x86, diCMN, 0x0010 }, // 230 Maj op = 31, mask = 03FE, xop = 0x0010 ( 8) - subfc - { 0x86, diCMN, 0x0012 }, // 231 Maj op = 31, mask = 03FE, xop = 0x0012 ( 9) - mulhdu - { 0x86, diCMN, 0x0014 }, // 232 Maj op = 31, mask = 03FE, xop = 0x0014 ( 10) - addc - { 0x86, diCMN, 0x0016 }, // 233 Maj op = 31, mask = 03FE, xop = 0x0016 ( 11) - mulhwu - { 0x86, diCMN, 0x0050 }, // 234 Maj op = 31, mask = 03FE, xop = 0x0050 ( 40) - subf - { 0x86, diCMN, 0x0092 }, // 235 Maj op = 31, mask = 03FE, xop = 0x0092 ( 73) - mulhd - { 0x86, diCMN, 0x0096 }, // 236 Maj op = 31, mask = 03FE, xop = 0x0096 ( 75) - mulhw - { 0x86, diCMN, 0x00D0 }, // 237 Maj op = 31, mask = 03FE, xop = 0x00D0 ( 104) - neg - { 0x86, diCMN, 0x0110 }, // 238 Maj op = 31, mask = 03FE, xop = 0x0110 ( 136) - subfe - { 0x86, diCMN, 0x0114 }, // 239 
Maj op = 31, mask = 03FE, xop = 0x0114 ( 138) - adde - { 0x86, diCMN, 0x0190 }, // 240 Maj op = 31, mask = 03FE, xop = 0x0190 ( 200) - subfze - { 0x86, diCMN, 0x0194 }, // 241 Maj op = 31, mask = 03FE, xop = 0x0194 ( 202) - addze - { 0x86, diCMN, 0x01D0 }, // 242 Maj op = 31, mask = 03FE, xop = 0x01D0 ( 232) - subfme - { 0x86, diCMN, 0x01D2 }, // 243 Maj op = 31, mask = 03FE, xop = 0x01D2 ( 233) - mulld - { 0x86, diCMN, 0x01D4 }, // 244 Maj op = 31, mask = 03FE, xop = 0x01D4 ( 234) - addme - { 0x86, diCMN, 0x01D6 }, // 245 Maj op = 31, mask = 03FE, xop = 0x01D6 ( 235) - mullw - { 0x86, diCMN, 0x0214 }, // 246 Maj op = 31, mask = 03FE, xop = 0x0214 ( 266) - add - { 0x86, diCMN, 0x0392 }, // 247 Maj op = 31, mask = 03FE, xop = 0x0392 ( 457) - divdu - { 0x86, diCMN, 0x0396 }, // 248 Maj op = 31, mask = 03FE, xop = 0x0396 ( 459) - divwu - { 0x86, diCMN, 0x03D2 }, // 249 Maj op = 31, mask = 03FE, xop = 0x03D2 ( 489) - divd - { 0x86, diCMN, 0x03D6 }, // 250 Maj op = 31, mask = 03FE, xop = 0x03D6 ( 491) - divw - { 0x88, diCMN, 0x0674 }, // 251 Maj op = 31, mask = 07FC, xop = 0x0674 ( 413) - sradi - { 0x89, diCMN, 0x0000 }, // 252 Maj op = 31, mask = 07FE, xop = 0x0000 ( 0) - cmp - { 0x89, diTRP, 0x0008 }, // 253 Maj op = 31, mask = 07FE, xop = 0x0008 ( 4) - tw - { 0x89, diCMN, 0x000C }, // 254 Maj op = 31, mask = 07FE, xop = 0x000C ( 6) - lvsl - { 0x89, diCMN, 0x000E }, // 255 Maj op = 31, mask = 07FE, xop = 0x000E ( 7) - lvebx - { 0x89, diCMN, 0x0026 }, // 256 Maj op = 31, mask = 07FE, xop = 0x0026 ( 19) - mfcr - { 0x89, diCMN, 0x0028 }, // 257 Maj op = 31, mask = 07FE, xop = 0x0028 ( 20) - lwarx - { 0x89, diCMN, 0x002A }, // 258 Maj op = 31, mask = 07FE, xop = 0x002A ( 21) - ldx - { 0x89, diCMN, 0x002E }, // 259 Maj op = 31, mask = 07FE, xop = 0x002E ( 23) - lwzx - { 0x89, diCMN, 0x0030 }, // 260 Maj op = 31, mask = 07FE, xop = 0x0030 ( 24) - slw - { 0x89, diCMN, 0x0034 }, // 261 Maj op = 31, mask = 07FE, xop = 0x0034 ( 26) - cntlzw - { 0x89, diCMN, 0x0036 }, // 262 Maj 
op = 31, mask = 07FE, xop = 0x0036 ( 27) - sld - { 0x89, diCMN, 0x0038 }, // 263 Maj op = 31, mask = 07FE, xop = 0x0038 ( 28) - and - { 0x89, diCMN, 0x0040 }, // 264 Maj op = 31, mask = 07FE, xop = 0x0040 ( 32) - cmpl - { 0x89, diCMN, 0x004C }, // 265 Maj op = 31, mask = 07FE, xop = 0x004C ( 38) - lvsr - { 0x89, diCMN, 0x004E }, // 266 Maj op = 31, mask = 07FE, xop = 0x004E ( 39) - lvehx - { 0x89, diCMN, 0x006A }, // 267 Maj op = 31, mask = 07FE, xop = 0x006A ( 53) - ldux - { 0x89, diCMN, 0x006C }, // 268 Maj op = 31, mask = 07FE, xop = 0x006C ( 54) - dcbst - { 0x89, diCMN, 0x006E }, // 269 Maj op = 31, mask = 07FE, xop = 0x006E ( 55) - lwzux - { 0x89, diCMN, 0x0074 }, // 270 Maj op = 31, mask = 07FE, xop = 0x0074 ( 58) - cntlzd - { 0x89, diCMN, 0x0078 }, // 271 Maj op = 31, mask = 07FE, xop = 0x0078 ( 60) - andc - { 0x89, diTRP, 0x0088 }, // 272 Maj op = 31, mask = 07FE, xop = 0x0088 ( 68) - td - { 0x89, diCMN, 0x008E }, // 273 Maj op = 31, mask = 07FE, xop = 0x008E ( 71) - lvewx - { 0x89, diPRV, 0x00A6 }, // 274 Maj op = 31, mask = 07FE, xop = 0x00A6 ( 83) - mfmsr - { 0x89, diCMN, 0x00A8 }, // 275 Maj op = 31, mask = 07FE, xop = 0x00A8 ( 84) - ldarx - { 0x89, diCMN, 0x00AC }, // 276 Maj op = 31, mask = 07FE, xop = 0x00AC ( 86) - dcbf - { 0x89, diCMN, 0x00AE }, // 277 Maj op = 31, mask = 07FE, xop = 0x00AE ( 87) - lbzx - { 0x89, diCMN, 0x00CE }, // 278 Maj op = 31, mask = 07FE, xop = 0x00CE ( 103) - lvx - { 0x89, diCMN, 0x00EE }, // 279 Maj op = 31, mask = 07FE, xop = 0x00EE ( 119) - lbzux - { 0x89, diCMN, 0x00F8 }, // 280 Maj op = 31, mask = 07FE, xop = 0x00F8 ( 124) - nor - { 0x89, diCMN, 0x010E }, // 281 Maj op = 31, mask = 07FE, xop = 0x010E ( 135) - stvebx - { 0x89, diCMN, 0x0120 }, // 282 Maj op = 31, mask = 07FE, xop = 0x0120 ( 144) - mtcrf - { 0x89, diPRV, 0x0124 }, // 283 Maj op = 31, mask = 07FE, xop = 0x0124 ( 146) - mtmsr - { 0x89, diCMN, 0x012A }, // 284 Maj op = 31, mask = 07FE, xop = 0x012A ( 149) - stdx - { 0x89, diCMN, 0x012C }, // 285 Maj op = 
31, mask = 07FE, xop = 0x012C ( 150) - stwcx - { 0x89, diCMN, 0x012E }, // 286 Maj op = 31, mask = 07FE, xop = 0x012E ( 151) - stwx - { 0x89, diCMN, 0x014E }, // 287 Maj op = 31, mask = 07FE, xop = 0x014E ( 167) - stvehx - { 0x89, diPRV, 0x0164 }, // 288 Maj op = 31, mask = 07FE, xop = 0x0164 ( 178) - mtmsrd - { 0x89, diCMN, 0x016A }, // 289 Maj op = 31, mask = 07FE, xop = 0x016A ( 181) - stdux - { 0x89, diCMN, 0x016E }, // 290 Maj op = 31, mask = 07FE, xop = 0x016E ( 183) - stwux - { 0x89, diCMN, 0x018E }, // 291 Maj op = 31, mask = 07FE, xop = 0x018E ( 199) - stvewx - { 0x89, diCMN, 0x01A4 }, // 292 Maj op = 31, mask = 07FE, xop = 0x01A4 ( 210) - mtsr - { 0x89, diCMN, 0x01AC }, // 293 Maj op = 31, mask = 07FE, xop = 0x01AC ( 214) - stdcx. - { 0x89, diCMN, 0x01AE }, // 294 Maj op = 31, mask = 07FE, xop = 0x01AE ( 215) - stbx - { 0x89, diCMN, 0x01CE }, // 295 Maj op = 31, mask = 07FE, xop = 0x01CE ( 231) - stvx - { 0x89, diPRV, 0x01E4 }, // 296 Maj op = 31, mask = 07FE, xop = 0x01E4 ( 242) - mtsrin - { 0x89, diCMN, 0x01EC }, // 297 Maj op = 31, mask = 07FE, xop = 0x01EC ( 246) - dcbtst - { 0x89, diCMN, 0x01EE }, // 298 Maj op = 31, mask = 07FE, xop = 0x01EE ( 247) - stbux - { 0x89, diPRV, 0x0224 }, // 299 Maj op = 31, mask = 07FE, xop = 0x0224 ( 274) - tlbiel - { 0x89, diCMN, 0x022C }, // 300 Maj op = 31, mask = 07FE, xop = 0x022C ( 278) - dcbt - { 0x89, diCMN, 0x022E }, // 301 Maj op = 31, mask = 07FE, xop = 0x022E ( 279) - lhzx - { 0x89, diCMN, 0x0238 }, // 302 Maj op = 31, mask = 07FE, xop = 0x0238 ( 284) - eqv - { 0x89, diPRV, 0x0264 }, // 303 Maj op = 31, mask = 07FE, xop = 0x0264 ( 306) - tlbie - { 0x89, diPRV, 0x026C }, // 304 Maj op = 31, mask = 07FE, xop = 0x026C ( 310) - eciwx - { 0x89, diCMN, 0x026E }, // 305 Maj op = 31, mask = 07FE, xop = 0x026E ( 311) - lhzux - { 0x89, diCMN, 0x0278 }, // 306 Maj op = 31, mask = 07FE, xop = 0x0278 ( 316) - xor - { 0x89, diSPR, 0x02A6 }, // 307 Maj op = 31, mask = 07FE, xop = 0x02A6 ( 339) - mfspr - { 0x89, diCMN, 
0x02AA }, // 308 Maj op = 31, mask = 07FE, xop = 0x02AA ( 341) - lwax - { 0x89, diCMN, 0x02AC }, // 309 Maj op = 31, mask = 07FE, xop = 0x02AC ( 342) - dst - { 0x89, diCMN, 0x02AE }, // 310 Maj op = 31, mask = 07FE, xop = 0x02AE ( 343) - lhax - { 0x89, diCMN, 0x02CE }, // 311 Maj op = 31, mask = 07FE, xop = 0x02CE ( 359) - lvxl - { 0x89, diPRV, 0x02E4 }, // 312 Maj op = 31, mask = 07FE, xop = 0x02E4 ( 370) - tlbia - { 0x89, diCMN, 0x02E6 }, // 313 Maj op = 31, mask = 07FE, xop = 0x02E6 ( 371) - mftb - { 0x89, diCMN, 0x02EA }, // 314 Maj op = 31, mask = 07FE, xop = 0x02EA ( 373) - lwaux - { 0x89, diCMN, 0x02EC }, // 315 Maj op = 31, mask = 07FE, xop = 0x02EC ( 374) - dstst - { 0x89, diCMN, 0x02EE }, // 316 Maj op = 31, mask = 07FE, xop = 0x02EE ( 375) - lhaux - { 0x89, diPRV, 0x0324 }, // 317 Maj op = 31, mask = 07FE, xop = 0x0324 ( 402) - slbmte - { 0x89, diCMN, 0x032E }, // 318 Maj op = 31, mask = 07FE, xop = 0x032E ( 407) - sthx - { 0x89, diCMN, 0x0338 }, // 319 Maj op = 31, mask = 07FE, xop = 0x0338 ( 412) - orc - { 0x89, diPRV, 0x0364 }, // 320 Maj op = 31, mask = 07FE, xop = 0x0364 ( 434) - slbie - { 0x89, diPRV, 0x036C }, // 321 Maj op = 31, mask = 07FE, xop = 0x036C ( 438) - ecowx - { 0x89, diCMN, 0x036E }, // 322 Maj op = 31, mask = 07FE, xop = 0x036E ( 439) - sthux - { 0x89, diOR, 0x0378 }, // 323 Maj op = 31, mask = 07FE, xop = 0x0378 ( 444) - or - { 0x89, diSPR, 0x03A6 }, // 324 Maj op = 31, mask = 07FE, xop = 0x03A6 ( 467) - mtspr - { 0x89, diCMN, 0x03B8 }, // 325 Maj op = 31, mask = 07FE, xop = 0x03B8 ( 476) - nand - { 0x89, diCMN, 0x03CE }, // 326 Maj op = 31, mask = 07FE, xop = 0x03CE ( 487) - stvxl - { 0x89, diPRV, 0x03E4 }, // 327 Maj op = 31, mask = 07FE, xop = 0x03E4 ( 498) - slbia - { 0x89, diCMN, 0x0400 }, // 328 Maj op = 31, mask = 07FE, xop = 0x0400 ( 512) - mcrxr - { 0x89, diCMN, 0x042A }, // 329 Maj op = 31, mask = 07FE, xop = 0x042A ( 533) - lswx - { 0x89, diCMN, 0x042C }, // 330 Maj op = 31, mask = 07FE, xop = 0x042C ( 534) - lwbrx - { 
0x89, diCMN, 0x042E }, // 331 Maj op = 31, mask = 07FE, xop = 0x042E ( 535) - lfsx - { 0x89, diCMN, 0x0430 }, // 332 Maj op = 31, mask = 07FE, xop = 0x0430 ( 536) - srw - { 0x89, diCMN, 0x0436 }, // 333 Maj op = 31, mask = 07FE, xop = 0x0436 ( 539) - srd - { 0x89, diPRV, 0x046C }, // 334 Maj op = 31, mask = 07FE, xop = 0x046C ( 566) - tlbsync - { 0x89, diCMN, 0x046E }, // 335 Maj op = 31, mask = 07FE, xop = 0x046E ( 567) - lfsux - { 0x89, diPRV, 0x04A6 }, // 336 Maj op = 31, mask = 07FE, xop = 0x04A6 ( 595) - mfsr - { 0x89, diCMN, 0x04AA }, // 337 Maj op = 31, mask = 07FE, xop = 0x04AA ( 597) - lswi - { 0x89, diCMN, 0x04AC }, // 338 Maj op = 31, mask = 07FE, xop = 0x04AC ( 598) - sync - { 0x89, diCMN, 0x04AE }, // 339 Maj op = 31, mask = 07FE, xop = 0x04AE ( 599) - lfdx - { 0x89, diCMN, 0x04EE }, // 340 Maj op = 31, mask = 07FE, xop = 0x04EE ( 631) - lfdux - { 0x89, diPRV, 0x0526 }, // 341 Maj op = 31, mask = 07FE, xop = 0x0526 ( 659) - mfsrin - { 0x89, diCMN, 0x052A }, // 342 Maj op = 31, mask = 07FE, xop = 0x052A ( 661) - stswx - { 0x89, diCMN, 0x052C }, // 343 Maj op = 31, mask = 07FE, xop = 0x052C ( 662) - stwbrx - { 0x89, diCMN, 0x052E }, // 344 Maj op = 31, mask = 07FE, xop = 0x052E ( 663) - stfsx - { 0x89, diCMN, 0x056E }, // 345 Maj op = 31, mask = 07FE, xop = 0x056E ( 695) - stfsux - { 0x89, diCMN, 0x05AA }, // 346 Maj op = 31, mask = 07FE, xop = 0x05AA ( 725) - stswi - { 0x89, diCMN, 0x05AE }, // 347 Maj op = 31, mask = 07FE, xop = 0x05AE ( 727) - stfdx - { 0x89, diCMN, 0x05EC }, // 348 Maj op = 31, mask = 07FE, xop = 0x05EC ( 758) - dcba - { 0x89, diCMN, 0x05EE }, // 349 Maj op = 31, mask = 07FE, xop = 0x05EE ( 759) - stfdux - { 0x89, diCMN, 0x062C }, // 350 Maj op = 31, mask = 07FE, xop = 0x062C ( 790) - lhbrx - { 0x89, diCMN, 0x0630 }, // 351 Maj op = 31, mask = 07FE, xop = 0x0630 ( 792) - sraw - { 0x89, diCMN, 0x0634 }, // 352 Maj op = 31, mask = 07FE, xop = 0x0634 ( 794) - srad - { 0x89, diCMN, 0x066C }, // 353 Maj op = 31, mask = 07FE, xop = 0x066C 
( 822) - dss - { 0x89, diCMN, 0x0670 }, // 354 Maj op = 31, mask = 07FE, xop = 0x0670 ( 824) - srawi - { 0x89, diPRV, 0x06A6 }, // 355 Maj op = 31, mask = 07FE, xop = 0x06A6 ( 851) - slbmfev - { 0x89, diCMN, 0x06AC }, // 356 Maj op = 31, mask = 07FE, xop = 0x06AC ( 854) - eieio - { 0x89, diPRV, 0x0726 }, // 357 Maj op = 31, mask = 07FE, xop = 0x0726 ( 915) - slbmfee - { 0x89, diCMN, 0x072C }, // 358 Maj op = 31, mask = 07FE, xop = 0x072C ( 918) - sthbrx - { 0x89, diCMN, 0x0734 }, // 359 Maj op = 31, mask = 07FE, xop = 0x0734 ( 922) - extsh - { 0x89, diCMN, 0x0774 }, // 360 Maj op = 31, mask = 07FE, xop = 0x0774 ( 954) - extsb - { 0x89, diCMN, 0x07AC }, // 361 Maj op = 31, mask = 07FE, xop = 0x07AC ( 982) - icbi - { 0x89, diCMN, 0x07AE }, // 362 Maj op = 31, mask = 07FE, xop = 0x07AE ( 983) - stfiwx - { 0x89, diCMN, 0x07B4 }, // 363 Maj op = 31, mask = 07FE, xop = 0x07B4 ( 986) - extsw - { 0x09, diCMN, 0x07EC }, // 364 Maj op = 31, mask = 07FE, xop = 0x07EC (1014) - dcbz - { 0x81, diCMN, 0x0000 }, // 365 Maj op = 58, mask = 0003, xop = 0x0000 ( 0) - ld - { 0x81, diCMN, 0x0001 }, // 366 Maj op = 58, mask = 0003, xop = 0x0001 ( 1) - ldu - { 0x01, diCMN, 0x0002 }, // 367 Maj op = 58, mask = 0003, xop = 0x0002 ( 2) - lwa - { 0x84, diCMN, 0x0024 }, // 368 Maj op = 59, mask = 003E, xop = 0x0024 ( 18) - fdivs - { 0x84, diCMN, 0x0028 }, // 369 Maj op = 59, mask = 003E, xop = 0x0028 ( 20) - fsubs - { 0x84, diCMN, 0x002A }, // 370 Maj op = 59, mask = 003E, xop = 0x002A ( 21) - fadds - { 0x84, diCMN, 0x002C }, // 371 Maj op = 59, mask = 003E, xop = 0x002C ( 22) - fsqrts - { 0x84, diCMN, 0x0030 }, // 372 Maj op = 59, mask = 003E, xop = 0x0030 ( 24) - fres - { 0x84, diCMN, 0x0032 }, // 373 Maj op = 59, mask = 003E, xop = 0x0032 ( 25) - fmuls - { 0x84, diCMN, 0x0038 }, // 374 Maj op = 59, mask = 003E, xop = 0x0038 ( 28) - fmsubs - { 0x84, diCMN, 0x003A }, // 375 Maj op = 59, mask = 003E, xop = 0x003A ( 29) - fmadds - { 0x84, diCMN, 0x003C }, // 376 Maj op = 59, mask = 003E, xop = 
0x003C ( 30) - fnmsubs - { 0x04, diCMN, 0x003E }, // 377 Maj op = 59, mask = 003E, xop = 0x003E ( 31) - fnmadds - { 0x81, diCMN, 0x0000 }, // 378 Maj op = 62, mask = 0003, xop = 0x0000 ( 0) - std - { 0x01, diCMN, 0x0001 }, // 379 Maj op = 62, mask = 0003, xop = 0x0001 ( 1) - stdu - { 0x84, diCMN, 0x0024 }, // 380 Maj op = 63, mask = 003E, xop = 0x0024 ( 18) - fdiv - { 0x84, diCMN, 0x0028 }, // 381 Maj op = 63, mask = 003E, xop = 0x0028 ( 20) - fsub - { 0x84, diCMN, 0x002A }, // 382 Maj op = 63, mask = 003E, xop = 0x002A ( 21) - fadd - { 0x84, diCMN, 0x002C }, // 383 Maj op = 63, mask = 003E, xop = 0x002C ( 22) - fsqrt - { 0x84, diCMN, 0x002E }, // 384 Maj op = 63, mask = 003E, xop = 0x002E ( 23) - fsel - { 0x84, diCMN, 0x0032 }, // 385 Maj op = 63, mask = 003E, xop = 0x0032 ( 25) - fmul - { 0x84, diCMN, 0x0034 }, // 386 Maj op = 63, mask = 003E, xop = 0x0034 ( 26) - frsqrte - { 0x84, diCMN, 0x0038 }, // 387 Maj op = 63, mask = 003E, xop = 0x0038 ( 28) - fmsub - { 0x84, diCMN, 0x003A }, // 388 Maj op = 63, mask = 003E, xop = 0x003A ( 29) - fmadd - { 0x84, diCMN, 0x003C }, // 389 Maj op = 63, mask = 003E, xop = 0x003C ( 30) - fnmsub - { 0x84, diCMN, 0x003E }, // 390 Maj op = 63, mask = 003E, xop = 0x003E ( 31) - fnmadd - { 0x89, diCMN, 0x0000 }, // 391 Maj op = 63, mask = 07FE, xop = 0x0000 ( 0) - fcmpu - { 0x89, diCMN, 0x0018 }, // 392 Maj op = 63, mask = 07FE, xop = 0x0018 ( 12) - frsp - { 0x89, diCMN, 0x001C }, // 393 Maj op = 63, mask = 07FE, xop = 0x001C ( 14) - fctiw - { 0x89, diCMN, 0x001E }, // 394 Maj op = 63, mask = 07FE, xop = 0x001E ( 15) - fctiwz - { 0x89, diCMN, 0x0040 }, // 395 Maj op = 63, mask = 07FE, xop = 0x0040 ( 32) - fcmpo - { 0x89, diCMN, 0x004C }, // 396 Maj op = 63, mask = 07FE, xop = 0x004C ( 38) - mtfsb1 - { 0x89, diCMN, 0x0050 }, // 397 Maj op = 63, mask = 07FE, xop = 0x0050 ( 40) - fneg - { 0x89, diCMN, 0x0080 }, // 398 Maj op = 63, mask = 07FE, xop = 0x0080 ( 64) - mcrfs - { 0x89, diCMN, 0x008C }, // 399 Maj op = 63, mask = 07FE, xop = 
0x008C ( 70) - mtfsb0 - { 0x89, diCMN, 0x0090 }, // 400 Maj op = 63, mask = 07FE, xop = 0x0090 ( 72) - fmr - { 0x89, diCMN, 0x010C }, // 401 Maj op = 63, mask = 07FE, xop = 0x010C ( 134) - mtfsfi - { 0x89, diCMN, 0x0110 }, // 402 Maj op = 63, mask = 07FE, xop = 0x0110 ( 136) - fnabs - { 0x89, diCMN, 0x0210 }, // 403 Maj op = 63, mask = 07FE, xop = 0x0210 ( 264) - fabs - { 0x89, diCMN, 0x048E }, // 404 Maj op = 63, mask = 07FE, xop = 0x048E ( 583) - mffs - { 0x89, diCMN, 0x058E }, // 405 Maj op = 63, mask = 07FE, xop = 0x058E ( 711) - mtfsf - { 0x89, diCMN, 0x065C }, // 406 Maj op = 63, mask = 07FE, xop = 0x065C ( 814) - fctid - { 0x89, diCMN, 0x065E }, // 407 Maj op = 63, mask = 07FE, xop = 0x065E ( 815) - fctidz - { 0x09, diCMN, 0x069C }, // 408 Maj op = 63, mask = 07FE, xop = 0x069C ( 846) - fcfid -}; - -#ifdef __decodePPC_debug__ -char *instname[] = { - "Jump entry...", - "Invalid", - "tdi", - "twi", - "Jump entry...", - "Invalid", - "Invalid", - "mulli", - "subfic", - "Invalid", - "cmpli", - "cmpi", - "addic", - "addic.", - "addi", - "addis", - "bc", - "sc", - "b", - "Jump entry...", - "rlwimi", - "rlwinm", - "Invalid", - "rlwnm", - "ori", - "oris", - "xori", - "xoris", - "andi.", - "andis.", - "Jump entry...", - "Jump entry...", - "lwz", - "lwzu", - "lbz", - "lbzu", - "stw", - "stwu", - "stb", - "stbu", - "lhz", - "lhzu", - "lha", - "lhau", - "sth", - "sthu", - "lmw", - "stmw", - "lfs", - "lfsu", - "lfd", - "lfdu", - "stfs", - "stfsu", - "stfd", - "stfdu", - "Invalid", - "Invalid", - "Jump entry...", - "Jump entry...", - "Invalid", - "Invalid", - "Jump entry...", - "Jump entry...", - "attn", - "vmhaddshs", - "vmhraddshs", - "vmladduhm", - "vmsumubm", - "vmsummbm", - "vmsumuhm", - "vmsumuhs", - "vmsumshm", - "vmsumshs", - "vsel", - "vperm", - "vsldoi", - "vmaddfp", - "vnmsubfp", - "vcmpequb", - "vcmpequh", - "vcmpequw", - "vcmpeqfp", - "vcmpgefp", - "vcmpgtub", - "vcmpgtuh", - "vcmpgtuw", - "vcmpgtfp", - "vcmpgtsb", - "vcmpgtsh", - "vcmpgtsw", - "vcmpbfp", - 
"vaddubm", - "vmaxub", - "vrlb", - "vmuloub", - "vaddfp", - "vmrghb", - "vpkuhum", - "vadduhm", - "vmaxuh", - "vrlh", - "vmulouh", - "vsubfp", - "vmrghh", - "vpkuwum", - "vadduwm", - "vmaxuw", - "vrlw", - "vmrghw", - "vpkuhus", - "vpkuwus", - "vmaxsb", - "vslb", - "vmulosb", - "vrefp", - "vmrglb", - "vpkshus", - "vmaxsh", - "vslh", - "vmulosh", - "vrsqrtefp", - "vmrglh", - "vpkswus", - "vaddcuw", - "vmaxsw", - "vslw", - "vexptefp", - "vmrglw", - "vpkshss", - "vsl", - "vlogefp", - "vpkswss", - "vaddubs", - "vminub", - "vsrb", - "vmuleub", - "vrfin", - "vspltb", - "vupkhsb", - "vadduhs", - "vminuh", - "vsrh", - "vmuleuh", - "vrfiz", - "vsplth", - "vupkhsh", - "vadduws", - "vminuw", - "vsrw", - "vrfip", - "vspltw", - "vupklsb", - "vsr", - "vrfim", - "vupklsh", - "vaddsbs", - "vminsb", - "vsrab", - "vmulesb", - "vcfux", - "vspltisb", - "vpkpx", - "vaddshs", - "vminsh", - "vsrah", - "vmulesh", - "vcfsx", - "vspltish", - "vupkhpx", - "vaddsws", - "vminsw", - "vsraw", - "vctuxs", - "vspltisw", - "vctsxs", - "vupklpx", - "vsububm", - "vavgub", - "vand", - "vmaxfp", - "vslo", - "vsubuhm", - "vavguh", - "vandc", - "vminfp", - "vsro", - "vsubuwm", - "vavguw", - "vor", - "vxor", - "vavgsb", - "vnor", - "vavgsh", - "vsubcuw", - "vavgsw", - "vsububs", - "mfvscr", - "vsum4ubs", - "vsubuhs", - "mtvscr", - "vsum4shs", - "vsubuws", - "vsum2sws", - "vsubsbs", - "vsum4sbs", - "vsubshs", - "vsubsws", - "vsumsws", - "mcrf", - "bclr", - "rfid", - "crnor", - "rfi", - "crandc", - "isync", - "crxor", - "crnand", - "crand", - "hrfid", - "creqv", - "crorc", - "cror", - "bctr", - "rldicl", - "rldicr", - "rldic", - "rldimi", - "rldcl", - "rldcr", - "subfc", - "mulhdu", - "addc", - "mulhwu", - "subf", - "mulhd", - "mulhw", - "neg", - "subfe", - "adde", - "subfze", - "addze", - "subfme", - "mulld", - "addme", - "mullw", - "add", - "divdu", - "divwu", - "divd", - "divw", - "sradi", - "cmp", - "tw", - "lvsl", - "lvebx", - "mfcr", - "lwarx", - "ldx", - "lwzx", - "slw", - "cntlzw", - "sld", - "and", 
- "cmpl", - "lvsr", - "lvehx", - "ldux", - "dcbst", - "lwzux", - "cntlzd", - "andc", - "td", - "lvewx", - "mfmsr", - "ldarx", - "dcbf", - "lbzx", - "lvx", - "lbzux", - "nor", - "stvebx", - "mtcrf", - "mtmsr", - "stdx", - "stwcx", - "stwx", - "stvehx", - "mtmsrd", - "stdux", - "stwux", - "stvewx", - "mtsr", - "stdcx.", - "stbx", - "stvx", - "mtsrin", - "dcbtst", - "stbux", - "tlbiel", - "dcbt", - "lhzx", - "eqv", - "tlbie", - "eciwx", - "lhzux", - "xor", - "mfspr", - "lwax", - "dst", - "lhax", - "lvxl", - "tlbia", - "mftb", - "lwaux", - "dstst", - "lhaux", - "slbmte", - "sthx", - "orc", - "slbie", - "ecowx", - "sthux", - "or", - "mtspr", - "nand", - "stvxl", - "slbia", - "mcrxr", - "lswx", - "lwbrx", - "lfsx", - "srw", - "srd", - "tlbsync", - "lfsux", - "mfsr", - "lswi", - "sync", - "lfdx", - "lfdux", - "mfsrin", - "stswx", - "stwbrx", - "stfsx", - "stfsux", - "stswi", - "stfdx", - "dcba", - "stfdux", - "lhbrx", - "sraw", - "srad", - "dss", - "srawi", - "slbmfev", - "eieio", - "slbmfee", - "sthbrx", - "extsh", - "extsb", - "icbi", - "stfiwx", - "extsw", - "dcbz", - "ld", - "ldu", - "lwa", - "fdivs", - "fsubs", - "fadds", - "fsqrts", - "fres", - "fmuls", - "fmsubs", - "fmadds", - "fnmsubs", - "fnmadds", - "std", - "stdu", - "fdiv", - "fsub", - "fadd", - "fsqrt", - "fsel", - "fmul", - "frsqrte", - "fmsub", - "fmadd", - "fnmsub", - "fnmadd", - "fcmpu", - "frsp", - "fctiw", - "fctiwz", - "fcmpo", - "mtfsb1", - "fneg", - "mcrfs", - "mtfsb0", - "fmr", - "mtfsfi", - "fnabs", - "fabs", - "mffs", - "mtfsf", - "fctid", - "fctidz", - "fcfid", -}; -#endif - -static dcdtab dcdfail = { 0x00, diINV, 0x0000 }; // Decode failed - -static uint32_t sprtbl[] = { - 0xCCC03274, // spr 0 to 31 - 0x00000000, // spr 32 to 63 - 0x00000000, // spr 64 to 95 - 0x00000000, // spr 96 to 127 - 0x00000080, // spr 128 to 159 - 0x00000000, // spr 160 to 191 - 0x00000000, // spr 192 to 223 - 0x00000000, // spr 224 to 255 - 0x9000FCAD, // spr 256 to 287 - 0x0000C3F3, // spr 288 to 319 - 0x00000000, // 
spr 320 to 351 - 0x00000000, // spr 352 to 383 - 0x00000000, // spr 384 to 415 - 0x00000000, // spr 416 to 447 - 0x00000000, // spr 448 to 479 - 0x00000000, // spr 480 to 511 - 0x0000FFFF, // spr 512 to 543 - 0x00000000, // spr 544 to 575 - 0x00000000, // spr 576 to 607 - 0x00000000, // spr 608 to 639 - 0x00000000, // spr 640 to 671 - 0x00000000, // spr 672 to 703 - 0x00000000, // spr 704 to 735 - 0x00000000, // spr 736 to 767 - 0x3FFF3FFF, // spr 768 to 799 - 0x00000000, // spr 800 to 831 - 0x00000000, // spr 832 to 863 - 0x00000000, // spr 864 to 895 - 0x00000000, // spr 896 to 927 - 0xE1FFE1FF, // spr 928 to 959 - 0x0000FE80, // spr 960 to 991 - 0x0000FFFF, // spr 992 to 1023 -}; diff --git a/bsd/ppc/endian.h b/bsd/ppc/endian.h deleted file mode 100644 index c6929f117..000000000 --- a/bsd/ppc/endian.h +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - * Copyright (c) 1995 NeXT Computer, Inc. All rights reserved. - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1987, 1991, 1993 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * @(#)endian.h 8.1 (Berkeley) 6/10/93 - */ - -#ifndef _PPC_ENDIAN_H_ -#define _PPC_ENDIAN_H_ - -#include - -/* - * Define the order of 32-bit words in 64-bit words. - */ -#define _QUAD_HIGHWORD 0 -#define _QUAD_LOWWORD 1 - -/* - * Definitions for byte order, according to byte significance from low - * address to high. - */ -#define __DARWIN_LITTLE_ENDIAN 1234 /* LSB first: i386, vax */ -#define __DARWIN_BIG_ENDIAN 4321 /* MSB first: 68000, ibm, net, ppc */ -#define __DARWIN_PDP_ENDIAN 3412 /* LSB first in word, MSW first in long */ - -#define __DARWIN_BYTE_ORDER __DARWIN_BIG_ENDIAN - -#if defined(KERNEL) || (!defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE)) - -#define LITTLE_ENDIAN __DARWIN_LITTLE_ENDIAN -#define BIG_ENDIAN __DARWIN_BIG_ENDIAN -#define PDP_ENDIAN __DARWIN_PDP_ENDIAN - -#define BYTE_ORDER __DARWIN_BYTE_ORDER - -#include - -#endif /* defined(KERNEL) || (!defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE)) */ -#endif /* !_PPC_ENDIAN_H_ */ diff --git a/bsd/ppc/exec.h b/bsd/ppc/exec.h deleted file mode 100644 index 471543a1d..000000000 --- a/bsd/ppc/exec.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1994, The University of Utah and - * the Center for Software Science at the University of Utah (CSS). - * All rights reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * THE UNIVERSITY OF UTAH AND CSS ALLOW FREE USE OF THIS SOFTWARE IN ITS "AS - * IS" CONDITION. THE UNIVERSITY OF UTAH AND CSS DISCLAIM ANY LIABILITY OF - * ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * CSS requests users of this software to return to css-dist@cs.utah.edu any - * improvements that they make and grant CSS redistribution rights. - * - */ - -#ifndef _BSD_PPC_EXEC_H_ -#define _BSD_PPC_EXEC_H_ - - -#include - -#ifdef BSD_KERNEL_PRIVATE -/* Size of a page in an object file. */ -#define __LDPGSZ 4096 - -/* Valid magic number check. */ -#define N_BADMAG(ex) \ - ((ex).a_magic != NMAGIC && (ex).a_magic != OMAGIC && \ - (ex).a_magic != ZMAGIC) - -/* Address of the bottom of the text segment. */ -#define N_TXTADDR(X) 0 - -/* Address of the bottom of the data segment. */ -#define N_DATADDR(ex) \ - (N_TXTADDR(ex) + ((ex).a_magic == OMAGIC ? 
(ex).a_text \ - : __LDPGSZ + ((ex).a_text - 1 & ~(__LDPGSZ - 1)))) - -/* Text segment offset. */ -#define N_TXTOFF(ex) \ - ((ex).a_magic == ZMAGIC ? __LDPGSZ : sizeof(struct exec)) - -/* Data segment offset. */ -#define N_DATOFF(ex) \ - (N_TXTOFF(ex) + ((ex).a_magic != ZMAGIC ? (ex).a_text : \ - __LDPGSZ + ((ex).a_text - 1 & ~(__LDPGSZ - 1)))) - -/* Symbol table offset. */ -#define N_SYMOFF(ex) \ - (N_TXTOFF(ex) + (ex).a_text + (ex).a_data + (ex).a_trsize + \ - (ex).a_drsize) - -/* String table offset. */ -#define N_STROFF(ex) (N_SYMOFF(ex) + (ex).a_syms) - -/* Description of the object file header (a.out format). */ -struct exec { -#define OMAGIC 0407 /* old impure format */ -#define NMAGIC 0410 /* read-only text */ -#define ZMAGIC 0413 /* demand load format */ -#define QMAGIC 0314 /* demand load format. Header in text. */ - unsigned int a_magic; /* magic number */ - - unsigned int a_text; /* text segment size */ - unsigned int a_data; /* initialized data size */ - unsigned int a_bss; /* uninitialized data size */ - unsigned int a_syms; /* symbol table size */ - unsigned int a_entry; /* entry point */ - unsigned int a_trsize; /* text relocation size */ - unsigned int a_drsize; /* data relocation size */ -}; - -#endif /* BSD_KERNEL_PRIVATE */ - -#endif /* _BSD_PPC_EXEC_H_ */ - diff --git a/bsd/ppc/fasttrap_isa.h b/bsd/ppc/fasttrap_isa.h deleted file mode 100644 index b4a2cb4c2..000000000 --- a/bsd/ppc/fasttrap_isa.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) 2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#ifndef _FASTTRAP_ISA_H -#define _FASTTRAP_ISA_H - -/* #pragma ident "@(#)fasttrap_isa.h 1.4 05/06/08 SMI" */ - -#include -#if defined(__APPLE__) -#include -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -typedef uint32_t fasttrap_instr_t; - -typedef struct fasttrap_machtp { - fasttrap_instr_t ftmt_instr; /* Original instruction */ - int32_t ftmt_trgt; /* Offset or absolute address */ - uint8_t ftmt_type; /* Emulation function type */ -#define ftmtNOP 0 -#define ftmtCommon 1 -#define ftmtB 2 -#define ftmtBC 3 -#define ftmtBLR 4 -#define ftmtBCTR 5 - uint8_t ftmt_bo; /* Branch options */ - uint8_t ftmt_bi; /* Condition bit */ - uint8_t ftmt_flgs; /* Flags */ -#define ftmtAbs 2 -#define ftmtLink 1 -} fasttrap_machtp_t; - -#define ftt_instr ftt_mtp.ftmt_instr -#define ftt_trgt ftt_mtp.ftmt_trgt -#define ftt_type ftt_mtp.ftmt_type -#define ftt_bo ftt_mtp.ftmt_bo -#define ftt_bi ftt_mtp.ftmt_bi -#define ftt_flgs ftt_mtp.ftmt_flgs - -#define FASTTRAP_INSTR 0x0FFFDDDD -#define T_DTRACE_RET (0x2E * 4) - -#define FASTTRAP_RETURN_AFRAMES 7 -#define FASTTRAP_ENTRY_AFRAMES 7 -#define FASTTRAP_OFFSET_AFRAMES 6 - -#ifdef __cplusplus -} -#endif - -#endif /* _FASTTRAP_ISA_H */ diff --git a/bsd/ppc/limits.h b/bsd/ppc/limits.h deleted file mode 100644 index 8f7decbec..000000000 --- a/bsd/ppc/limits.h +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (c) 1988, 1993 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)limits.h 8.3 (Berkeley) 1/4/94 - */ - -#ifndef _PPC_LIMITS_H_ -#define _PPC_LIMITS_H_ - -#include -#include - -#define CHAR_BIT 8 /* number of bits in a char */ -#define MB_LEN_MAX 6 /* Allow 31 bit UTF2 */ - -#if !defined(_ANSI_SOURCE) && (!defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE)) -#define CLK_TCK __DARWIN_CLK_TCK /* ticks per second */ -#endif /* !_ANSI_SOURCE && (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ - -/* - * According to ANSI (section 2.2.4.2), the values below must be usable by - * #if preprocessing directives. Additionally, the expression must have the - * same type as would an expression that is an object of the corresponding - * type converted according to the integral promotions. 
The subtraction for - * INT_MIN and LONG_MIN is so the value is not unsigned; 2147483648 is an - * unsigned int for 32-bit two's complement ANSI compilers (section 3.1.3.2). - * These numbers work for pcc as well. The UINT_MAX and ULONG_MAX values - * are written as hex so that GCC will be quiet about large integer constants. - */ -#define SCHAR_MAX 127 /* min value for a signed char */ -#define SCHAR_MIN (-128) /* max value for a signed char */ - -#define UCHAR_MAX 255 /* max value for an unsigned char */ -#define CHAR_MAX 127 /* max value for a char */ -#define CHAR_MIN (-128) /* min value for a char */ - -#define USHRT_MAX 65535 /* max value for an unsigned short */ -#define SHRT_MAX 32767 /* max value for a short */ -#define SHRT_MIN (-32768) /* min value for a short */ - -#define UINT_MAX 0xffffffff /* max value for an unsigned int */ -#define INT_MAX 2147483647 /* max value for an int */ -#define INT_MIN (-2147483647-1) /* min value for an int */ - -#ifdef __LP64__ -#define ULONG_MAX 0xffffffffffffffffUL /* max unsigned long */ -#define LONG_MAX 0x7fffffffffffffffL /* max signed long */ -#define LONG_MIN (-0x7fffffffffffffffL-1) /* min signed long */ -#else /* !__LP64__ */ -#define ULONG_MAX 0xffffffffUL /* max unsigned long */ -#define LONG_MAX 2147483647L /* max signed long */ -#define LONG_MIN (-2147483647L-1) /* min signed long */ -#endif /* __LP64__ */ - -#define ULLONG_MAX 0xffffffffffffffffULL /* max unsigned long long */ -#define LLONG_MAX 0x7fffffffffffffffLL /* max signed long long */ -#define LLONG_MIN (-0x7fffffffffffffffLL-1) /* min signed long long */ - -#if !defined(_ANSI_SOURCE) -#ifdef __LP64__ -#define LONG_BIT 64 -#else /* !__LP64__ */ -#define LONG_BIT 32 -#endif /* __LP64__ */ -#define SSIZE_MAX LONG_MAX /* max value for a ssize_t */ -#define WORD_BIT 32 - -#if (!defined(_POSIX_C_SOURCE) && !defined(_XOPEN_SOURCE)) || defined(_DARWIN_C_SOURCE) -#define SIZE_T_MAX ULONG_MAX /* max value for a size_t */ - -#define UQUAD_MAX ULLONG_MAX 
-#define QUAD_MAX LLONG_MAX -#define QUAD_MIN LLONG_MIN - -#endif /* (!_POSIX_C_SOURCE && !_XOPEN_SOURCE) || _DARWIN_C_SOURCE */ -#endif /* !_ANSI_SOURCE */ - -#endif /* _PPC_LIMITS_H_ */ diff --git a/bsd/ppc/param.h b/bsd/ppc/param.h deleted file mode 100644 index a434e3c4c..000000000 --- a/bsd/ppc/param.h +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1993,1995 NeXT Computer, Inc. All Rights Reserved */ - -#ifndef _PPC_PARAM_H_ -#define _PPC_PARAM_H_ - -#include - -/* - * Round p (pointer or byte index) up to a correctly-aligned value for all - * data types (int, long, ...). 
The result is unsigned int and must be - * cast to any desired pointer type. - */ -#define ALIGNBYTES __DARWIN_ALIGNBYTES -#define ALIGN(p) __DARWIN_ALIGN(p) - -#define NBPG 4096 /* bytes/page */ -#define PGOFSET (NBPG-1) /* byte offset into page */ -#define PGSHIFT 12 /* LOG2(NBPG) */ - -#define NBSEG 0x40000000 /* bytes/segment (quadrant) */ -#define SEGOFSET (NBSEG-1) /* byte offset into segment */ -#define SEGSHIFT 30 /* LOG2(NBSEG) */ - -#define DEV_BSIZE 512 -#define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */ -#define BLKDEV_IOSIZE 2048 -#define MAXPHYS (128 * 1024) /* max raw I/O transfer size */ - -#define STACK_GROWTH_UP 0 /* stack grows to lower addresses */ - -#define CLSIZE 1 -#define CLSIZELOG2 0 - -#define STACKSIZE 4 /* pages in kernel stack */ -#define UPAGES 0 /* total pages in u-area */ - /* red zone is beyond this */ - -/* - * Constants related to network buffer management. - * MCLBYTES must be no larger than CLBYTES (the software page size), and, - * on machines that exchange pages of input or output buffers with mbuf - * clusters (MAPPED_MBUFS), MCLBYTES must also be an integral multiple - * of the hardware page size. 
- */ -#define MSIZE 256 /* size of an mbuf */ -#define MCLBYTES 2048 /* large enough for ether MTU */ -#define MCLSHIFT 11 -#define MCLOFSET (MCLBYTES - 1) -#ifndef NMBCLUSTERS -#if GATEWAY -#define NMBCLUSTERS ((1024 * 1024) / MCLBYTES) /* cl map size: 1MB */ -#else -#define NMBCLUSTERS ((1024 * 1024) / MCLBYTES) - /* cl map size was 0.5MB when MSIZE was 128, now it's 1MB*/ -#endif -#endif - -/* pages ("clicks") (NBPG bytes) to disk blocks */ -#define ctod(x) ((x)<<(PGSHIFT-DEV_BSHIFT)) -#define dtoc(x) ((x)>>(PGSHIFT-DEV_BSHIFT)) -#define dtob(x) ((x)<>PGSHIFT) -#ifdef __APPLE__ -#define btodb(bytes, devBlockSize) \ - ((unsigned)(bytes) / devBlockSize) -#define dbtob(db, devBlockSize) \ - ((unsigned)(db) * devBlockSize) -#else -#define btodb(bytes) /* calculates (bytes / DEV_BSIZE) */ \ - ((unsigned)(bytes) >> DEV_BSHIFT) -#define dbtob(db) /* calculates (db * DEV_BSIZE) */ \ - ((unsigned)(db) << DEV_BSHIFT) -#endif - -/* - * Map a ``block device block'' to a file system block. - * This should be device dependent, and should use the bsize - * field from the disk label. - * For now though just use DEV_BSIZE. - */ -#define bdbtofsb(bn) ((bn) / (BLKDEV_IOSIZE/DEV_BSIZE)) - -/* from machdep/ppc/proc_reg.h */ -#ifdef __BIG_ENDIAN__ -#define ENDIAN_MASK(val,size) (1 << (size-1 - val)) -#else -#error code not ported to little endian targets yet -#endif /* __BIG_ENDIAN__ */ - -#ifndef MASK -#define MASK(PART) ENDIAN_MASK(PART ## _BIT, 32) -#endif - -#define MSR_EE_BIT 16 -#define MSR_PR_BIT 17 -#define USERMODE(msr) (msr & MASK(MSR_PR) ? TRUE : FALSE) -#define BASEPRI(msr) (msr & MASK(MSR_EE) ? 
TRUE : FALSE) -/* end of from proc_reg.h */ - -#if defined(KERNEL) || defined(STANDALONE) -#define DELAY(n) delay(n) -#else -#define DELAY(n) { register int N = (n); while (--N > 0); } -#endif /* defined(KERNEL) || defined(STANDALONE) */ - -#define NPIDS 16 /* maximum number of PIDs per process */ -#define NIOPIDS 8 /* maximum number of IO space PIDs */ - -#endif /* _PPC_PARAM_H_ */ diff --git a/bsd/ppc/profile.h b/bsd/ppc/profile.h deleted file mode 100644 index 7be38b3a9..000000000 --- a/bsd/ppc/profile.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1997, Apple Computer, Inc. All rights reserved. 
- * - */ - -#ifndef _BSD_PPC_PROFILE_H_ -#define _BSD_PPC_PROFILE_H_ - -#include - -#ifdef KERNEL -#ifdef __APPLE_API_UNSTABLE -/* - * Block interrupts during mcount so that those interrupts can also be - * counted (as soon as we get done with the current counting). On the - * PPC platfom, can't do splhigh/splx as those are C routines and can - * recursively invoke mcount. - */ -extern unsigned long disable_ee(void); -extern void restore_ee(unsigned long smsr); - -#define MCOUNT_INIT register unsigned long smsr; - -#define MCOUNT_ENTER smsr = disable_ee(); - -#define MCOUNT_EXIT restore_ee(smsr); - -#endif /* __APPLE_API_UNSTABLE */ -#endif /* KERNEL */ - -#endif /* _BSD_PPC_PROFILE_H_ */ diff --git a/bsd/ppc/reboot.h b/bsd/ppc/reboot.h deleted file mode 100644 index 75e3a7656..000000000 --- a/bsd/ppc/reboot.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef _BSD_PPC_REBOOT_H_ -#define _BSD_PPC_REBOOT_H_ - -#include - -/* - * Empty file (publicly) - */ -#ifdef BSD_KERNEL_PRIVATE -/* - * Use most significant 16 bits to avoid collisions with - * machine independent flags. - */ -#define RB_POWERDOWN 0x00010000 /* power down on halt */ -#define RB_NOBOOTRC 0x00020000 /* don't run '/etc/rc.boot' */ -#define RB_DEBUG 0x00040000 /* drop into mini monitor on panic */ -#define RB_EJECT 0x00080000 /* eject disks on halt */ -#define RB_COMMAND 0x00100000 /* new boot command specified */ -#define RB_NOFP 0x00200000 /* don't use floating point */ -#define RB_BOOTNEXT 0x00400000 /* reboot into NeXT */ -#define RB_BOOTDOS 0x00800000 /* reboot into DOS */ - - -#endif /* BSD_KERNEL_PRIVATE */ - -#endif /* _BSD_PPC_REBOOT_H_ */ - diff --git a/bsd/ppc/setjmp.h b/bsd/ppc/setjmp.h deleted file mode 100644 index 27eb59ab0..000000000 --- a/bsd/ppc/setjmp.h +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1998 Apple Computer, Inc. All rights reserved. - * - * File: ppc/setjmp.h - * - * Declaration of setjmp routines and data structures. 
- */ -#ifndef _BSD_PPC_SETJMP_H_ -#define _BSD_PPC_SETJMP_H_ - -#include - -#define __need_struct_sigcontext -#if defined(KERNEL) -#define __need_struct_sigcontext32 -#define __need_struct_sigcontext64 -#endif /* KERNEL */ -#include - -struct _jmp_buf { -#if __DARWIN_UNIX03 - _STRUCT_SIGCONTEXT __sigcontext; /* kernel state preserved by set/longjmp */ - unsigned int __vmask __attribute__((aligned(8))); /* vector mask register */ - unsigned int __vreg[32 * 4] __attribute__((aligned(16))); - /* 32 128-bit vector registers */ -#else /* !__DARWIN_UNIX03 */ - _STRUCT_SIGCONTEXT sigcontext; /* kernel state preserved by set/longjmp */ - unsigned int vmask __attribute__((aligned(8))); /* vector mask register */ - unsigned int vreg[32 * 4] __attribute__((aligned(16))); - /* 32 128-bit vector registers */ -#endif /* __DARWIN_UNIX03 */ -}; - -/* - * _JBLEN is number of ints required to save the following: - * r1, r2, r13-r31, lr, cr, ctr, xer, sig == 26 register_t sized - * fr14 - fr31 = 18 doubles - * vmask, 32 vector registers = 129 ints - * 2 ints to get all the elements aligned - * - * register_t is 2 ints for ppc64 threads - */ -#define _JBLEN64 (26*2 + 18*2 + 129 + 1) -#define _JBLEN32 (26 + 18*2 + 129 + 1) -#define _JBLEN_MAX _JBLEN64 - -/* - * Locally scoped sizes - */ -#if defined(__ppc64__) -#define _JBLEN _JBLEN64 -#else -#define _JBLEN _JBLEN32 -#endif - -#if defined(KERNEL) -typedef _STRUCT_SIGCONTEXT32 jmp_buf32[1]; -typedef struct __sigjmp_buf32 { - int __storage[_JBLEN32 + 1] __attribute__((aligned(8))); - } sigjmp_buf32[1]; - -typedef struct sigcontext64 jmp_buf64[1]; -typedef struct __sigjmp_buf64 { - int __storage[_JBLEN64 + 1] __attribute__((aligned(8))); - } sigjmp_buf64[1]; - -/* - * JMM - have to decide how the kernel will deal with this. - * For now, hard-code the 32-bit types. 
- */ -typedef _STRUCT_SIGCONTEXT32 jmp_buf[1]; -typedef struct __sigjmp_buf32 sigjmp_buf[1]; - -#else -typedef int jmp_buf[_JBLEN]; -typedef int sigjmp_buf[_JBLEN + 1]; -#endif - -__BEGIN_DECLS -int setjmp(jmp_buf); -void longjmp(jmp_buf, int); - -#ifndef _ANSI_SOURCE -int _setjmp(jmp_buf); -void _longjmp(jmp_buf, int); -int sigsetjmp(sigjmp_buf, int); -void siglongjmp(sigjmp_buf, int); -#endif /* _ANSI_SOURCE */ - -#if !defined(_ANSI_SOURCE) && (!defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE)) -void longjmperror(void); -#endif /* neither ANSI nor POSIX */ -__END_DECLS - -#endif /* !_BSD_PPC_SETJMP_H_ */ diff --git a/bsd/ppc/signal.h b/bsd/ppc/signal.h deleted file mode 100644 index 31af83a02..000000000 --- a/bsd/ppc/signal.h +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1992, 1993 NeXT Computer, Inc. - */ - -#ifndef _PPC_SIGNAL_H_ -#define _PPC_SIGNAL_H_ 1 - -#include - -#ifndef _ANSI_SOURCE - -typedef int sig_atomic_t; - -#include - -#ifdef __APPLE_API_OBSOLETE - -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) - -#define __need_struct_sigcontext -#define __need_struct_sigcontext32 -#define __need_struct_sigcontext64 -#include - -/* - * Machine-dependant flags used in sigvec call. - */ -#define SV_SAVE_REGS 0x1000 /* Save all regs in sigcontext */ - -/* - * regs_saved_t -- Describes which registers beyond what the kernel cares - * about are saved to and restored from this sigcontext. - * - * The default is REGS_SAVED_CALLER, only the caller saved registers - * are saved. If the SV_SAVE_REGS flag was set when the signal - * handler was registered with sigvec() then all the registers will be - * saved in the sigcontext, and REGS_SAVED_ALL will be set. The C - * library uses REGS_SAVED_NONE in order to quickly restore kernel - * state during a longjmp(). - */ -typedef enum { - REGS_SAVED_NONE, /* Only kernel managed regs restored */ - REGS_SAVED_CALLER, /* "Caller saved" regs: rpc, a0-a7, - t0-t4, at, lk0-lk1, xt1-xt20, - xr0-xr1 */ - REGS_SAVED_ALL /* All registers */ -} regs_saved_t; - -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ - -#endif /* __APPLE_API_OBSOLETE */ - -#endif /* _ANSI_SOURCE */ - -#endif /* _PPC_SIGNAL_H_ */ - diff --git a/bsd/ppc/types.h b/bsd/ppc/types.h deleted file mode 100644 index 21265f8e0..000000000 --- a/bsd/ppc/types.h +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright 1995 NeXT Computer, Inc. All rights reserved. - */ -/* - * Copyright (c) 1990, 1993 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)types.h 8.3 (Berkeley) 1/5/94 - */ - -#ifndef _MACHTYPES_H_ -#define _MACHTYPES_H_ - -#ifndef __ASSEMBLER__ -#include -#include -/* - * Basic integral types. Omit the typedef if - * not possible for a machine/compiler combination. 
- */ -#ifndef _INT8_T -#define _INT8_T -typedef __signed char int8_t; -#endif -typedef unsigned char u_int8_t; -#ifndef _INT16_T -#define _INT16_T -typedef short int16_t; -#endif -typedef unsigned short u_int16_t; -#ifndef _INT32_T -#define _INT32_T -typedef int int32_t; -#endif -typedef unsigned int u_int32_t; -#ifndef _INT64_T -#define _INT64_T -typedef long long int64_t; -#endif -typedef unsigned long long u_int64_t; - -#if __LP64__ -typedef int64_t register_t; -#else -typedef int32_t register_t; -#endif - -#ifndef _INTPTR_T -#define _INTPTR_T -typedef __darwin_intptr_t intptr_t; -#endif -#ifndef _UINTPTR_T -#define _UINTPTR_T -typedef unsigned long uintptr_t; -#endif - -#if !defined(_ANSI_SOURCE) && (!defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE)) -/* These types are used for reserving the largest possible size. */ -typedef u_int64_t user_addr_t; -typedef u_int64_t user_size_t; -typedef int64_t user_ssize_t; -typedef int64_t user_long_t; -typedef u_int64_t user_ulong_t; -typedef int64_t user_time_t; -typedef int64_t user_off_t; -#define USER_ADDR_NULL ((user_addr_t) 0) -#define CAST_USER_ADDR_T(a_ptr) ((user_addr_t)((uintptr_t)(a_ptr))) - -#ifdef KERNEL - -/* - * These types are used when you know the word size of the target - * user process. They can be used to create struct layouts independent - * of the types and alignment requirements of the current running - * kernel. - */ - -/* - * The default ABI for the 32-bit PowerPC userspace is called "Power" - * alignment, and aligns fundamental integral data types to their - * natural boundary, with a maximum alignment of 4, even for 8-byte - * quantites. Power alignment also pads a structure to 8-byte alignment - * if the first field is an 8-byte quantity, which is not handled by - * these typedefs. The default ABI for 64-bit PowerPC userspace is called - * "Natural" alignment, and aligns fundamental integral data types - * to their natural boundaries. 
- */ - -typedef __uint64_t user64_addr_t __attribute__((aligned(8))); -typedef __uint64_t user64_size_t __attribute__((aligned(8))); -typedef __int64_t user64_ssize_t __attribute__((aligned(8))); -typedef __int64_t user64_long_t __attribute__((aligned(8))); -typedef __uint64_t user64_ulong_t __attribute__((aligned(8))); -typedef __int64_t user64_time_t __attribute__((aligned(8))); -typedef __int64_t user64_off_t __attribute__((aligned(8))); - -typedef __uint32_t user32_addr_t; -typedef __uint32_t user32_size_t; -typedef __int32_t user32_ssize_t; -typedef __int32_t user32_long_t; -typedef __uint32_t user32_ulong_t; -typedef __int32_t user32_time_t; -typedef __int64_t user32_off_t __attribute__((aligned(4))); - -#endif /* KERNEL */ - -#endif /* !_ANSI_SOURCE && (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ - -/* This defines the size of syscall arguments after copying into the kernel: */ -typedef u_int64_t syscall_arg_t; - -#ifndef __offsetof -#define __offsetof(type, field) ((size_t)(&((type *)0)->field)) -#endif - -#endif /* __ASSEMBLER__ */ -#endif /* _MACHTYPES_H_ */ diff --git a/bsd/ppc/ucontext.h b/bsd/ppc/ucontext.h deleted file mode 100644 index 5c391c283..000000000 --- a/bsd/ppc/ucontext.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. 
- * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef _PPC_UCONTEXT_H_ -#define _PPC_UCONTEXT_H_ - - -#include - -#if !__DARWIN_UNIX03 -struct mcontext { - struct ppc_exception_state es; - struct ppc_thread_state ss; - struct ppc_float_state fs; - struct ppc_vector_state vs; -}; -#define PPC_MCONTEXT_SIZE (PPC_THREAD_STATE_COUNT + PPC_FLOAT_STATE_COUNT + PPC_EXCEPTION_STATE_COUNT + PPC_VECTOR_STATE_COUNT) * sizeof(int) -#else /* __DARWIN_UNIX03 */ -struct __darwin_mcontext { - struct __darwin_ppc_exception_state es; - struct __darwin_ppc_thread_state ss; - struct __darwin_ppc_float_state fs; - struct __darwin_ppc_vector_state vs; -}; -#endif /* __DARWIN_UNIX03 */ - -#ifndef _MCONTEXT_T -#define _MCONTEXT_T -typedef __darwin_mcontext_t mcontext_t; -#endif - -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -struct mcontext64 { - struct ppc_exception_state64 es; - struct ppc_thread_state64 ss; - struct ppc_float_state fs; - struct ppc_vector_state vs; -}; -#define PPC_MCONTEXT64_SIZE (PPC_THREAD_STATE64_COUNT + PPC_FLOAT_STATE_COUNT + PPC_EXCEPTION_STATE_COUNT + PPC_VECTOR_STATE_COUNT) * sizeof(int) - -#ifndef _MCONTEXT64_T -#define _MCONTEXT64_T -typedef struct mcontext64 * mcontext64_t; -#endif - -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ - -#endif /* _PPC_UCONTEXT_H_ */ diff --git a/bsd/ppc/vmparam.h b/bsd/ppc/vmparam.h deleted file mode 100644 
index 8e682fcdf..000000000 --- a/bsd/ppc/vmparam.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef _BSD_PPC_VMPARAM_H_ -#define _BSD_PPC_VMPARAM_H_ 1 - -#include - -#define USRSTACK (0xc0000000) - -/* - * put the default 64-bit stack at the max address - * (minus one 32-bit address space for other incidentals) - */ -#define USRSTACK64 (0x00007FFF5FC00000ULL) - -/* - * Virtual memory related constants, all in bytes - */ -#ifndef DFLDSIZ -#define DFLDSIZ (RLIM_INFINITY) /* initial data size limit */ -// XXX Not enforced -//#define DFLDSIZ (6*1024*1024) /* initial data size limit */ -#endif -#ifndef MAXDSIZ -#define MAXDSIZ (RLIM_INFINITY) /* max data size */ -#endif -#ifndef DFLSSIZ -#define DFLSSIZ (8*1024*1024) /* initial stack size limit */ -#endif -#ifndef MAXSSIZ -#define MAXSSIZ (64*1024*1024) /* max stack size */ -#endif -#ifndef DFLCSIZ -#define DFLCSIZ (0) /* initial core size limit */ -#endif -#ifndef MAXCSIZ -#define MAXCSIZ (RLIM_INFINITY) /* max core size */ -#endif - -#endif /* _BSD_PPC_VMPARAM_H_ */ diff --git a/bsd/security/Makefile b/bsd/security/Makefile index b574d2956..92974f6e2 100644 --- a/bsd/security/Makefile +++ b/bsd/security/Makefile @@ -10,8 +10,6 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ audit -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ INSTINC_SUBDIRS_X86_64 = \ @@ -21,8 +19,6 @@ INSTINC_SUBDIRS_ARM = \ EXPINC_SUBDIRS = \ audit -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS_X86_64 = \ diff --git a/bsd/security/audit/Makefile b/bsd/security/audit/Makefile index 660e7c155..ac552f60d 100644 --- a/bsd/security/audit/Makefile +++ b/bsd/security/audit/Makefile @@ -9,8 +9,6 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ INSTINC_SUBDIRS_X86_64 = \ @@ -19,8 +17,6 @@ INSTINC_SUBDIRS_ARM = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS_X86_64 = \ diff --git a/bsd/security/audit/audit.c b/bsd/security/audit/audit.c index c454867bf..1ee6c85cd 100644 --- 
a/bsd/security/audit/audit.c +++ b/bsd/security/audit/audit.c @@ -231,23 +231,25 @@ audit_record_ctor(proc_t p, struct kaudit_record *ar) ar->k_ar.ar_magic = AUDIT_RECORD_MAGIC; nanotime(&ar->k_ar.ar_starttime); - cred = kauth_cred_proc_ref(p); + if (PROC_NULL != p) { + cred = kauth_cred_proc_ref(p); - /* - * Export the subject credential. - */ - cru2x(cred, &ar->k_ar.ar_subj_cred); - ar->k_ar.ar_subj_ruid = cred->cr_ruid; - ar->k_ar.ar_subj_rgid = cred->cr_rgid; - ar->k_ar.ar_subj_egid = cred->cr_groups[0]; - ar->k_ar.ar_subj_pid = p->p_pid; - ar->k_ar.ar_subj_auid = cred->cr_audit.as_aia_p->ai_auid; - ar->k_ar.ar_subj_asid = cred->cr_audit.as_aia_p->ai_asid; - bcopy(&cred->cr_audit.as_mask, &ar->k_ar.ar_subj_amask, - sizeof(struct au_mask)); - bcopy(&cred->cr_audit.as_aia_p->ai_termid, &ar->k_ar.ar_subj_term_addr, - sizeof(struct au_tid_addr)); - kauth_cred_unref(&cred); + /* + * Export the subject credential. + */ + cru2x(cred, &ar->k_ar.ar_subj_cred); + ar->k_ar.ar_subj_ruid = kauth_cred_getruid(cred); + ar->k_ar.ar_subj_rgid = kauth_cred_getrgid(cred); + ar->k_ar.ar_subj_egid = kauth_cred_getgid(cred); + ar->k_ar.ar_subj_pid = p->p_pid; + ar->k_ar.ar_subj_auid = cred->cr_audit.as_aia_p->ai_auid; + ar->k_ar.ar_subj_asid = cred->cr_audit.as_aia_p->ai_asid; + bcopy(&cred->cr_audit.as_mask, &ar->k_ar.ar_subj_amask, + sizeof(struct au_mask)); + bcopy(&cred->cr_audit.as_aia_p->ai_termid, + &ar->k_ar.ar_subj_term_addr, sizeof(struct au_tid_addr)); + kauth_cred_unref(&cred); + } } static void @@ -311,6 +313,7 @@ audit_init(void) audit_kinfo.ai_termid.at_type = AU_IPv4; audit_kinfo.ai_termid.at_addr[0] = INADDR_ANY; + _audit_lck_grp_init(); mtx_init(&audit_mtx, "audit_mtx", NULL, MTX_DEF); KINFO_LOCK_INIT(); cv_init(&audit_worker_cv, "audit_worker_cv"); @@ -353,7 +356,7 @@ audit_shutdown(void) /* * Return the current thread's audit record, if any. 
*/ -__inline__ struct kaudit_record * +struct kaudit_record * currecord(void) { @@ -373,11 +376,24 @@ audit_new(int event, proc_t p, __unused struct uthread *uthread) { struct kaudit_record *ar; int no_record; + int audit_override; + /* + * Override the audit_suspended and audit_enabled if it always + * audits session events. + * + * XXXss - This really needs to be a generalized call to a filter + * interface so if other things that use the audit subsystem in the + * future can simply plugged in. + */ + audit_override = (AUE_SESSION_START == event || + AUE_SESSION_UPDATE == event || AUE_SESSION_END == event || + AUE_SESSION_CLOSE == event); + mtx_lock(&audit_mtx); no_record = (audit_suspended || !audit_enabled); mtx_unlock(&audit_mtx); - if (no_record) + if (!audit_override && no_record) return (NULL); /* @@ -395,10 +411,13 @@ audit_new(int event, proc_t p, __unused struct uthread *uthread) ar->k_ar.ar_event = event; #if CONFIG_MACF - if (audit_mac_new(p, ar) != 0) { - zfree(audit_record_zone, ar); - return (NULL); - } + if (PROC_NULL != p) { + if (audit_mac_new(p, ar) != 0) { + zfree(audit_record_zone, ar); + return (NULL); + } + } else + ar->k_ar.ar_mac_records = NULL; #endif mtx_lock(&audit_mtx); @@ -414,7 +433,8 @@ audit_free(struct kaudit_record *ar) audit_record_dtor(ar); #if CONFIG_MACF - audit_mac_free(ar); + if (NULL != ar->k_ar.ar_mac_records) + audit_mac_free(ar); #endif zfree(audit_record_zone, ar); } @@ -427,6 +447,7 @@ audit_commit(struct kaudit_record *ar, int error, int retval) au_id_t auid; int sorf; struct au_mask *aumask; + int audit_override; if (ar == NULL) return; @@ -487,6 +508,17 @@ audit_commit(struct kaudit_record *ar, int error, int retval) event = ar->k_ar.ar_event; class = au_event_class(event); + /* + * See if we need to override the audit_suspend and audit_enabled + * flags. + * + * XXXss - This check needs to be generalized so new filters can + * easily be added. 
+ */ + audit_override = (AUE_SESSION_START == event || + AUE_SESSION_UPDATE == event || AUE_SESSION_END == event || + AUE_SESSION_CLOSE == event); + ar->k_ar_commit |= AR_COMMIT_KERNEL; if (au_preselect(event, class, aumask, sorf) != 0) ar->k_ar_commit |= AR_PRESELECT_TRAIL; @@ -494,7 +526,8 @@ audit_commit(struct kaudit_record *ar, int error, int retval) ar->k_ar_commit & AR_PRESELECT_TRAIL) != 0) ar->k_ar_commit |= AR_PRESELECT_PIPE; if ((ar->k_ar_commit & (AR_PRESELECT_TRAIL | AR_PRESELECT_PIPE | - AR_PRESELECT_USER_TRAIL | AR_PRESELECT_USER_PIPE)) == 0) { + AR_PRESELECT_USER_TRAIL | AR_PRESELECT_USER_PIPE | + AR_PRESELECT_FILTER)) == 0) { mtx_lock(&audit_mtx); audit_pre_q_len--; mtx_unlock(&audit_mtx); @@ -511,7 +544,7 @@ audit_commit(struct kaudit_record *ar, int error, int retval) * enabled should still be committed? */ mtx_lock(&audit_mtx); - if (audit_suspended || !audit_enabled) { + if (!audit_override && (audit_suspended || !audit_enabled)) { audit_pre_q_len--; mtx_unlock(&audit_mtx); audit_free(ar); diff --git a/bsd/security/audit/audit.h b/bsd/security/audit/audit.h index 5af1da795..d85139b2b 100644 --- a/bsd/security/audit/audit.h +++ b/bsd/security/audit/audit.h @@ -174,7 +174,7 @@ void audit_syscall_exit(int error, struct proc *proc, void audit_mach_syscall_enter(unsigned short audit_event); void audit_mach_syscall_exit(int retval, struct uthread *uthread); -extern struct auditinfo_addr audit_default_aia; +extern struct auditinfo_addr *audit_default_aia_p; /* * The remaining kernel functions are conditionally compiled in as they are @@ -262,20 +262,23 @@ typedef struct ucred *kauth_cred_t; void audit_session_ref(kauth_cred_t cred); void audit_session_unref(kauth_cred_t cred); -void audit_session_procnew(kauth_cred_t cred); -void audit_session_procexit(kauth_cred_t cred); +void audit_session_procnew(proc_t p); +void audit_session_procexit(proc_t p); int audit_session_spawnjoin(proc_t p, ipc_port_t port); +void audit_sdev_submit(au_id_t auid, au_asid_t 
asid, void *record, + u_int record_len); + /* * Audit session macros. */ -#define IS_VALID_SESSION(a) ((a) != NULL && (a) != &audit_default_aia) +#define IS_VALID_SESSION(a) ((a) != NULL && (a) != audit_default_aia_p) #define AUDIT_SESSION_REF(cred) audit_session_ref(cred) #define AUDIT_SESSION_UNREF(cred) audit_session_unref(cred) -#define AUDIT_SESSION_PROCNEW(cred) audit_session_procnew(cred) -#define AUDIT_SESSION_PROCEXIT(cred) audit_session_procexit(cred) +#define AUDIT_SESSION_PROCNEW(p) audit_session_procnew(p) +#define AUDIT_SESSION_PROCEXIT(p) audit_session_procexit(p) #if CONFIG_MACF /* @@ -292,8 +295,8 @@ extern au_event_t sys_au_event[]; #define AUDIT_RECORD() \ ((struct uthread*)get_bsdthread_info(current_thread()))->uu_ar -#ifndef AUDIT_USE_BUILDIN_EXPECT -#define AUDIT_USE_BUILDIN_EXPECT +#ifndef AUDIT_USE_BUILTIN_EXPECT +#define AUDIT_USE_BUILTIN_EXPECT #endif #ifdef AUDIT_USE_BUILTIN_EXPECT diff --git a/bsd/security/audit/audit_arg.c b/bsd/security/audit/audit_arg.c index 66792758f..eb6d5d434 100644 --- a/bsd/security/audit/audit_arg.c +++ b/bsd/security/audit/audit_arg.c @@ -308,10 +308,10 @@ audit_arg_process(struct kaudit_record *ar, proc_t p) ar->k_ar.ar_arg_asid = my_cred->cr_audit.as_aia_p->ai_asid; bcopy(&my_cred->cr_audit.as_aia_p->ai_termid, &ar->k_ar.ar_arg_termid_addr, sizeof(au_tid_addr_t)); - ar->k_ar.ar_arg_euid = my_cred->cr_uid; - ar->k_ar.ar_arg_egid = my_cred->cr_groups[0]; - ar->k_ar.ar_arg_ruid = my_cred->cr_ruid; - ar->k_ar.ar_arg_rgid = my_cred->cr_rgid; + ar->k_ar.ar_arg_euid = kauth_cred_getuid(my_cred); + ar->k_ar.ar_arg_egid = kauth_cred_getgid(my_cred); + ar->k_ar.ar_arg_ruid = kauth_cred_getruid(my_cred); + ar->k_ar.ar_arg_rgid = kauth_cred_getrgid(my_cred); kauth_cred_unref(&my_cred); ar->k_ar.ar_arg_pid = p->p_pid; ARG_SET_VALID(ar, ARG_AUID | ARG_EUID | ARG_EGID | ARG_RUID | diff --git a/bsd/security/audit/audit_bsd.c b/bsd/security/audit/audit_bsd.c index fdae0d79d..6f4d416c9 100644 --- 
a/bsd/security/audit/audit_bsd.c +++ b/bsd/security/audit/audit_bsd.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2008-2009 Apple Inc. + * Copyright (c) 2008-2010 Apple Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -59,6 +59,11 @@ struct mhdr { char mh_data[0]; }; +/* + * The lock group for the audit subsystem. + */ +static lck_grp_t *audit_lck_grp = NULL; + #define AUDIT_MHMAGIC 0x4D656C53 #if AUDIT_MALLOC_DEBUG @@ -174,28 +179,25 @@ _audit_malloc(size_t size, au_malloc_type_t *type, int flags, const char *fn) _audit_malloc(size_t size, au_malloc_type_t *type, int flags) #endif { - union { - struct mhdr hdr; - char mem[size + sizeof (struct mhdr)]; - } *mem; - size_t memsize = sizeof (*mem); + struct mhdr *hdr; + size_t memsize = sizeof (*hdr) + size; if (size == 0) return (NULL); if (flags & M_NOWAIT) { - mem = (void *)kalloc_noblock(memsize); + hdr = (void *)kalloc_noblock(memsize); } else { - mem = (void *)kalloc(memsize); - if (mem == NULL) + hdr = (void *)kalloc(memsize); + if (hdr == NULL) panic("_audit_malloc: kernel memory exhausted"); } - if (mem == NULL) + if (hdr == NULL) return (NULL); - mem->hdr.mh_size = memsize; - mem->hdr.mh_type = type; - mem->hdr.mh_magic = AUDIT_MHMAGIC; + hdr->mh_size = memsize; + hdr->mh_type = type; + hdr->mh_magic = AUDIT_MHMAGIC; if (flags & M_ZERO) - memset(mem->hdr.mh_data, 0, size); + memset(hdr->mh_data, 0, size); #if AUDIT_MALLOC_DEBUG if (type != NULL && type->mt_type < NUM_MALLOC_TYPES) { OSAddAtomic64(memsize, &type->mt_size); @@ -206,7 +208,7 @@ _audit_malloc(size_t size, au_malloc_type_t *type, int flags) audit_malloc_types[type->mt_type] = type; } #endif /* AUDIT_MALLOC_DEBUG */ - return (mem->hdr.mh_data); + return (hdr->mh_data); } /* @@ -316,15 +318,99 @@ _audit_cv_wait_sig(struct cv *cvp, lck_mtx_t *mp, const char *desc) } /* - * Simple recursive lock. + * BSD Mutexes. 
+ */ +void +#if DIAGNOSTIC +_audit_mtx_init(struct mtx *mp, const char *lckname) +#else +_audit_mtx_init(struct mtx *mp, __unused const char *lckname) +#endif +{ + mp->mtx_lock = lck_mtx_alloc_init(audit_lck_grp, LCK_ATTR_NULL); + KASSERT(mp->mtx_lock != NULL, + ("_audit_mtx_init: Could not allocate a mutex.")); +#if DIAGNOSTIC + strlcpy(mp->mtx_name, lckname, AU_MAX_LCK_NAME); +#endif +} + +void +_audit_mtx_destroy(struct mtx *mp) +{ + + if (mp->mtx_lock) { + lck_mtx_free(mp->mtx_lock, audit_lck_grp); + mp->mtx_lock = NULL; + } +} + +/* + * BSD rw locks. */ void -_audit_rlck_init(struct rlck *lp, const char *grpname) +#if DIAGNOSTIC +_audit_rw_init(struct rwlock *lp, const char *lckname) +#else +_audit_rw_init(struct rwlock *lp, __unused const char *lckname) +#endif +{ + lp->rw_lock = lck_rw_alloc_init(audit_lck_grp, LCK_ATTR_NULL); + KASSERT(lp->rw_lock != NULL, + ("_audit_rw_init: Could not allocate a rw lock.")); +#if DIAGNOSTIC + strlcpy(lp->rw_name, lckname, AU_MAX_LCK_NAME); +#endif +} + +void +_audit_rw_destroy(struct rwlock *lp) +{ + + if (lp->rw_lock) { + lck_rw_free(lp->rw_lock, audit_lck_grp); + lp->rw_lock = NULL; + } +} +/* + * Wait on a condition variable in a continuation (i.e. yield kernel stack). + * A cv_signal or cv_broadcast on the same condition variable will cause + * the thread to be scheduled. + */ +int +_audit_cv_wait_continuation(struct cv *cvp, lck_mtx_t *mp, thread_continue_t function) { + int status = KERN_SUCCESS; + + cvp->cv_waiters++; + assert_wait(cvp, THREAD_UNINT); + lck_mtx_unlock(mp); + + status = thread_block(function); - lp->rl_grp = lck_grp_alloc_init(grpname, LCK_GRP_ATTR_NULL); - lp->rl_mtx = lck_mtx_alloc_init(lp->rl_grp, LCK_ATTR_NULL); + /* should not be reached, but just in case, re-lock */ + lck_mtx_lock(mp); + + return status; +} + +/* + * Simple recursive lock. 
+ */ +void +#if DIAGNOSTIC +_audit_rlck_init(struct rlck *lp, const char *lckname) +#else +_audit_rlck_init(struct rlck *lp, __unused const char *lckname) +#endif +{ + lp->rl_mtx = lck_mtx_alloc_init(audit_lck_grp, LCK_ATTR_NULL); + KASSERT(lp->rl_mtx != NULL, + ("_audit_rlck_init: Could not allocate a recursive lock.")); +#if DIAGNOSTIC + strlcpy(lp->rl_name, lckname, AU_MAX_LCK_NAME); +#endif lp->rl_thread = 0; lp->rl_recurse = 0; } @@ -368,12 +454,8 @@ _audit_rlck_destroy(struct rlck *lp) { if (lp->rl_mtx) { - lck_mtx_free(lp->rl_mtx, lp->rl_grp); - lp->rl_mtx = 0; - } - if (lp->rl_grp) { - lck_grp_free(lp->rl_grp); - lp->rl_grp = 0; + lck_mtx_free(lp->rl_mtx, audit_lck_grp); + lp->rl_mtx = NULL; } } @@ -397,12 +479,19 @@ _audit_rlck_assert(struct rlck *lp, u_int assert) * Simple sleep lock. */ void -_audit_slck_init(struct slck *lp, const char *grpname) +#if DIAGNOSTIC +_audit_slck_init(struct slck *lp, const char *lckname) +#else +_audit_slck_init(struct slck *lp, __unused const char *lckname) +#endif { - lp->sl_grp = lck_grp_alloc_init(grpname, LCK_GRP_ATTR_NULL); - lp->sl_mtx = lck_mtx_alloc_init(lp->sl_grp, LCK_ATTR_NULL); - + lp->sl_mtx = lck_mtx_alloc_init(audit_lck_grp, LCK_ATTR_NULL); + KASSERT(lp->sl_mtx != NULL, + ("_audit_slck_init: Could not allocate a sleep lock.")); +#if DIAGNOSTIC + strlcpy(lp->sl_name, lckname, AU_MAX_LCK_NAME); +#endif lp->sl_locked = 0; lp->sl_waiting = 0; } @@ -442,7 +531,7 @@ _audit_slck_unlock(struct slck *lp) lp->sl_waiting = 0; /* Wake up *all* sleeping threads. 
*/ - thread_wakeup_prim((event_t) lp, /*1 thr*/ 0, THREAD_AWAKENED); + wakeup((event_t) lp); } lck_mtx_unlock(lp->sl_mtx); } @@ -482,12 +571,8 @@ _audit_slck_destroy(struct slck *lp) { if (lp->sl_mtx) { - lck_mtx_free(lp->sl_mtx, lp->sl_grp); - lp->sl_mtx = 0; - } - if (lp->sl_grp) { - lck_grp_free(lp->sl_grp); - lp->sl_grp = 0; + lck_mtx_free(lp->sl_mtx, audit_lck_grp); + lp->sl_mtx = NULL; } } @@ -545,6 +630,18 @@ _audit_ppsratecheck(struct timeval *lasttime, int *curpps, int maxpps) return (rv); } +/* + * Initialize lock group for audit related locks/mutexes. + */ +void +_audit_lck_grp_init(void) +{ + audit_lck_grp = lck_grp_alloc_init("Audit", LCK_GRP_ATTR_NULL); + + KASSERT(audit_lck_grp != NULL, + ("audit_get_lck_grp: Could not allocate the audit lock group.")); +} + int audit_send_trigger(unsigned int trigger) { diff --git a/bsd/security/audit/audit_bsd.h b/bsd/security/audit/audit_bsd.h index 23b61a5df..72db99f35 100644 --- a/bsd/security/audit/audit_bsd.h +++ b/bsd/security/audit/audit_bsd.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2008, Apple Inc. + * Copyright (c) 2008-2009, Apple Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -55,6 +55,8 @@ #endif #endif /* DIAGNOSTIC */ +#define AU_MAX_LCK_NAME 32 + #if __DARWIN_BYTE_ORDER == __DARWIN_BIG_ENDIAN #define be16enc(p, d) *(p) = (d) #define be32enc(p, d) *(p) = (d) @@ -176,7 +178,9 @@ struct cv { */ struct mtx { lck_mtx_t *mtx_lock; - lck_grp_t *mtx_grp; +#if DIAGNOSTIC + char mtx_name[AU_MAX_LCK_NAME]; +#endif }; /* @@ -184,7 +188,9 @@ struct mtx { */ struct rwlock { lck_rw_t *rw_lock; - lck_grp_t *rw_grp; +#if DIAGNOSTIC + char rw_name[AU_MAX_LCK_NAME]; +#endif }; /* @@ -192,9 +198,11 @@ struct rwlock { */ struct slck { lck_mtx_t *sl_mtx; - lck_grp_t *sl_grp; int sl_locked; int sl_waiting; +#if DIAGNOSTIC + char sl_name[AU_MAX_LCK_NAME]; +#endif }; /* @@ -202,9 +210,11 @@ struct slck { */ struct rlck { lck_mtx_t *rl_mtx; - lck_grp_t *rl_grp; uint32_t rl_recurse; thread_t rl_thread; +#if DIAGNOSTIC + char rl_name[AU_MAX_LCK_NAME]; +#endif }; /* @@ -216,6 +226,8 @@ void _audit_cv_signal(struct cv *cvp); void _audit_cv_broadcast(struct cv *cvp); void _audit_cv_wait(struct cv *cvp, lck_mtx_t *mp, const char *desc); int _audit_cv_wait_sig(struct cv *cvp, lck_mtx_t *mp, const char *desc); +int _audit_cv_wait_continuation(struct cv *cvp, lck_mtx_t *mp, + thread_continue_t function); #define cv_init(cvp, desc) _audit_cv_init(cvp, desc) #define cv_destroy(cvp) _audit_cv_destroy(cvp) #define cv_signal(cvp) _audit_cv_signal(cvp) @@ -223,28 +235,20 @@ int _audit_cv_wait_sig(struct cv *cvp, lck_mtx_t *mp, const char *desc); #define cv_broadcastpri(cvp, pri) _audit_cv_broadcast(cvp) #define cv_wait(cvp, mp) _audit_cv_wait(cvp, (mp)->mtx_lock, #cvp) #define cv_wait_sig(cvp, mp) _audit_cv_wait_sig(cvp, (mp)->mtx_lock, #cvp) +#define cv_wait_continuation(cvp,mp,f) \ + _audit_cv_wait_continuation(cvp, (mp)->mtx_lock, f) /* * BSD Mutexes. 
*/ -#define LOCK_MAX_NAME 64 -#define mtx_init(mp, name, type, opts) do { \ - (mp)->mtx_grp = lck_grp_alloc_init(name, LCK_GRP_ATTR_NULL); \ - (mp)->mtx_lock = lck_mtx_alloc_init((mp)->mtx_grp, \ - LCK_ATTR_NULL); \ -} while(0) -#define mtx_lock(mp) lck_mtx_lock((mp)->mtx_lock) -#define mtx_unlock(mp) lck_mtx_unlock((mp)->mtx_lock) -#define mtx_destroy(mp) do { \ - if ((mp)->mtx_lock) { \ - lck_mtx_free((mp)->mtx_lock, (mp)->mtx_grp); \ - (mp)->mtx_lock = 0; \ - } \ - if ((mp)->mtx_grp) { \ - lck_grp_free((mp)->mtx_grp); \ - (mp)->mtx_grp = 0; \ - } \ -} while (0) +void _audit_mtx_init(struct mtx *mp, const char *name); +void _audit_mtx_destroy(struct mtx *mp); +#define mtx_init(mp, name, type, opts) \ + _audit_mtx_init(mp, name) +#define mtx_lock(mp) lck_mtx_lock((mp)->mtx_lock) +#define mtx_unlock(mp) lck_mtx_unlock((mp)->mtx_lock) +#define mtx_destroy(mp) _audit_mtx_destroy(mp) +#define mtx_yield(mp) lck_mtx_yield((mp)->mtx_lock) /* * Sleep lock functions. @@ -277,25 +281,14 @@ void _audit_rlck_destroy(struct rlck *lp); /* * BSD rw locks. 
*/ -#define rw_init(lp, name) do { \ - (lp)->rw_grp = lck_grp_alloc_init(name, LCK_GRP_ATTR_NULL); \ - (lp)->rw_lock = lck_rw_alloc_init((lp)->rw_grp, \ - LCK_ATTR_NULL); \ -} while(0) +void _audit_rw_init(struct rwlock *lp, const char *name); +void _audit_rw_destroy(struct rwlock *lp); +#define rw_init(lp, name) _audit_rw_init(lp, name) #define rw_rlock(lp) lck_rw_lock_shared((lp)->rw_lock) #define rw_runlock(lp) lck_rw_unlock_shared((lp)->rw_lock) #define rw_wlock(lp) lck_rw_lock_exclusive((lp)->rw_lock) #define rw_wunlock(lp) lck_rw_unlock_exclusive((lp)->rw_lock) -#define rw_destroy(lp) do { \ - if ((lp)->rw_lock) { \ - lck_rw_free((lp)->rw_lock, (lp)->rw_grp); \ - (lp)->rw_lock = 0; \ - } \ - if ((lp)->rw_grp) { \ - lck_grp_free((lp)->rw_grp); \ - (lp)->rw_grp = 0; \ - } \ -} while (0) +#define rw_destroy(lp) _audit_rw_destroy(lp) #define MA_OWNED LCK_MTX_ASSERT_OWNED #define RA_LOCKED LCK_RW_ASSERT_HELD @@ -319,6 +312,11 @@ void _audit_rlck_destroy(struct rlck *lp); #define slck_assert(lp, wht) #endif /* DIAGNOSTIC */ +/* + * Synchronization initialization. + */ +void _audit_lck_grp_init(void); + /* * BSD (IPv6) event rate limiter. 
*/ diff --git a/bsd/security/audit/audit_bsm.c b/bsd/security/audit/audit_bsm.c index 0ee35a074..6f665d890 100644 --- a/bsd/security/audit/audit_bsm.c +++ b/bsd/security/audit/audit_bsm.c @@ -1757,6 +1757,24 @@ kaudit_to_bsm(struct kaudit_record *kar, struct au_record **pau) } break; + case AUE_SESSION_START: + case AUE_SESSION_UPDATE: + case AUE_SESSION_END: + case AUE_SESSION_CLOSE: + if (ARG_IS_VALID(kar, ARG_VALUE64)) { + tok = au_to_arg64(1, "sflags", ar->ar_arg_value64); + kau_write(rec, tok); + } + if (ARG_IS_VALID(kar, ARG_AMASK)) { + tok = au_to_arg32(2, "am_success", + ar->ar_arg_amask.am_success); + kau_write(rec, tok); + tok = au_to_arg32(3, "am_failure", + ar->ar_arg_amask.am_failure); + kau_write(rec, tok); + } + break; + /************************ * Mach system calls * ************************/ @@ -1884,7 +1902,7 @@ kaudit_to_bsm(struct kaudit_record *kar, struct au_record **pau) } #if CONFIG_MACF - do { + if (NULL != ar->ar_mac_records) { /* Convert the audit data from the MAC policies */ struct mac_audit_record *mar; @@ -1913,7 +1931,7 @@ kaudit_to_bsm(struct kaudit_record *kar, struct au_record **pau) kau_write(rec, tok); } - } while (0); + } #endif kau_write(rec, subj_tok); diff --git a/bsd/security/audit/audit_ioctl.h b/bsd/security/audit/audit_ioctl.h index 806f8ae93..1059532b9 100644 --- a/bsd/security/audit/audit_ioctl.h +++ b/bsd/security/audit/audit_ioctl.h @@ -31,6 +31,7 @@ #define _SECURITY_AUDIT_AUDIT_IOCTL_H_ #define AUDITPIPE_IOBASE 'A' +#define AUDITSDEV_IOBASE 'S' /* * Data structures used for complex ioctl arguments. Do not change existing @@ -79,4 +80,28 @@ struct auditpipe_ioctl_preselect { #define AUDITPIPE_GET_DROPS _IOR(AUDITPIPE_IOBASE, 102, u_int64_t) #define AUDITPIPE_GET_TRUNCATES _IOR(AUDITPIPE_IOBASE, 103, u_int64_t) +/* + * Ioctls for the audit session device. 
+ */ +#define AUDITSDEV_GET_QLEN _IOR(AUDITSDEV_IOBASE, 1, u_int) +#define AUDITSDEV_GET_QLIMIT _IOR(AUDITSDEV_IOBASE, 2, u_int) +#define AUDITSDEV_SET_QLIMIT _IOW(AUDITSDEV_IOBASE, 3, u_int) +#define AUDITSDEV_GET_QLIMIT_MIN _IOR(AUDITSDEV_IOBASE, 4, u_int) +#define AUDITSDEV_GET_QLIMIT_MAX _IOR(AUDITSDEV_IOBASE, 5, u_int) +#define AUDITSDEV_FLUSH _IO(AUDITSDEV_IOBASE, 6) +#define AUDITSDEV_GET_MAXDATA _IOR(AUDITSDEV_IOBASE, 7, u_int) + +/* + * Ioctls to retrieve and set the ALLSESSIONS flag in the audit session device. + */ +#define AUDITSDEV_GET_ALLSESSIONS _IOR(AUDITSDEV_IOBASE, 100, u_int) +#define AUDITSDEV_SET_ALLSESSIONS _IOW(AUDITSDEV_IOBASE, 101, u_int) + +/* + * Ioctls to retrieve audit sessions device statistics. + */ +#define AUDITSDEV_GET_INSERTS _IOR(AUDITSDEV_IOBASE, 200, u_int64_t) +#define AUDITSDEV_GET_READS _IOR(AUDITSDEV_IOBASE, 201, u_int64_t) +#define AUDITSDEV_GET_DROPS _IOR(AUDITSDEV_IOBASE, 202, u_int64_t) + #endif /* _SECURITY_AUDIT_AUDIT_IOCTL_H_ */ diff --git a/bsd/security/audit/audit_private.h b/bsd/security/audit/audit_private.h index 803a2b936..aa26d7ede 100644 --- a/bsd/security/audit/audit_private.h +++ b/bsd/security/audit/audit_private.h @@ -113,6 +113,8 @@ extern au_class_t audit_kevent_mask; #define AR_PRESELECT_USER_TRAIL 0x00004000U #define AR_PRESELECT_USER_PIPE 0x00008000U +#define AR_PRESELECT_FILTER 0x00010000U + #define AR_DRAIN_QUEUE 0x80000000U /* @@ -171,6 +173,7 @@ union auditon_udata { int au_trigger; au_evclass_map_t au_evclass; au_mask_t au_mask; + au_asflgs_t au_flags; auditinfo_t au_auinfo; auditpinfo_t au_aupinfo; auditpinfo_addr_t au_aupinfo_addr; @@ -440,7 +443,7 @@ int audit_mac_syscall_exit(unsigned short code, struct uthread *uthread, * Audit Session. 
*/ void audit_session_init(void); -int audit_session_setaia(proc_t p, auditinfo_addr_t *aia_p, int newprocess); +int audit_session_setaia(proc_t p, auditinfo_addr_t *aia_p); auditinfo_addr_t *audit_session_update(auditinfo_addr_t *new_aia); int audit_session_lookup(au_asid_t asid, auditinfo_addr_t *ret_aia); diff --git a/bsd/security/audit/audit_session.c b/bsd/security/audit/audit_session.c index 8e05f9dcd..4b63e0082 100644 --- a/bsd/security/audit/audit_session.c +++ b/bsd/security/audit/audit_session.c @@ -27,46 +27,40 @@ * POSSIBILITY OF SUCH DAMAGE. */ +#include + #include -#include +#include #include +#include +#include #include +#include #include #include #include #include #include +#include + +#include #include #include +#include +#include + #include #include +#include #include #include +#include #include -kern_return_t ipc_object_copyin(ipc_space_t, mach_port_name_t, - mach_msg_type_name_t, ipc_port_t *); -void ipc_port_release_send(ipc_port_t); - -/* - * The default auditinfo_addr entry for ucred. - */ -struct auditinfo_addr audit_default_aia = { - .ai_auid = AU_DEFAUDITID, - .ai_asid = AU_DEFAUDITSID, - .ai_termid = { .at_type = AU_IPv4, }, -}; - -#if CONFIG_AUDIT - -/* - * Currently the hash table is a fixed size. - */ -#define HASH_TABLE_SIZE 97 -#define HASH_ASID(asid) (audit_session_hash(asid) % HASH_TABLE_SIZE) +#include /* * Audit Session Entry. This is treated as an object with public and private @@ -84,119 +78,397 @@ struct au_sentry { long se_refcnt; /* Reference count. */ long se_procnt; /* Processes in session. */ ipc_port_t se_port; /* Session port. */ - struct klist se_klist; /* Knotes for session */ - struct mtx se_klist_mtx; /* se_klist mutex */ LIST_ENTRY(au_sentry) se_link; /* Hash bucket link list (1) */ }; typedef struct au_sentry au_sentry_t; #define AU_SENTRY_PTR(aia_p) ((au_sentry_t *)(aia_p)) +/* + * The default au_sentry/auditinfo_addr entry for ucred. 
+ */ + +static au_sentry_t audit_default_se = { + .se_auinfo = { + .ai_auid = AU_DEFAUDITID, + .ai_asid = AU_DEFAUDITSID, + .ai_termid = { .at_type = AU_IPv4, }, + }, + .se_refcnt = 1, + .se_procnt = 1, +}; + +struct auditinfo_addr *audit_default_aia_p = &audit_default_se.se_auinfo; + +kern_return_t ipc_object_copyin(ipc_space_t, mach_port_name_t, + mach_msg_type_name_t, ipc_port_t *); +void ipc_port_release_send(ipc_port_t); + +#if CONFIG_AUDIT + + +/* + * Currently the hash table is a fixed size. + */ +#define HASH_TABLE_SIZE 97 +#define HASH_ASID(asid) (audit_session_hash(asid) % HASH_TABLE_SIZE) + static struct rwlock se_entry_lck; /* (1) lock for se_link above */ LIST_HEAD(au_sentry_head, au_sentry); static struct au_sentry_head *au_sentry_bucket = NULL; +#define AU_HISTORY_LOGGING 0 +#if AU_HISTORY_LOGGING +typedef enum au_history_event { + AU_HISTORY_EVENT_UNKNOWN = 0, + AU_HISTORY_EVENT_REF = 1, + AU_HISTORY_EVENT_UNREF = 2, + AU_HISTORY_EVENT_BIRTH = 3, + AU_HISTORY_EVENT_DEATH = 4, + AU_HISTORY_EVENT_FIND = 5 +} au_history_event_t; + +#define AU_HISTORY_MAX_STACK_DEPTH 8 + +struct au_history { + struct au_sentry *ptr; + struct au_sentry se; + void *stack[AU_HISTORY_MAX_STACK_DEPTH]; + unsigned int stack_depth; + au_history_event_t event; +}; + +static struct au_history *au_history; +static size_t au_history_size = 65536; +static unsigned int au_history_index; + +static inline unsigned int +au_history_entries(void) +{ + if (au_history_index >= au_history_size) + return au_history_size; + else + return au_history_index; +} + +static inline void +au_history_record(au_sentry_t *se, au_history_event_t event) +{ + struct au_history *p; + unsigned int i; + + i = OSAddAtomic(1, &au_history_index); + p = &au_history[i % au_history_size]; + + bzero(p, sizeof(*p)); + p->event = event; + bcopy(se, &p->se, sizeof(p->se)); + p->stack_depth = OSBacktrace(&p->stack[0], AU_HISTORY_MAX_STACK_DEPTH); + p->ptr = se; +} +#else +#define au_history_record(se, event) do {} while 
(0) +#endif + +MALLOC_DEFINE(M_AU_SESSION, "audit_session", "Audit session data"); + +static void audit_ref_session(au_sentry_t *se); +static void audit_unref_session(au_sentry_t *se); + +static void audit_session_event(int event, auditinfo_addr_t *aia_p); + +/* + * Audit session device. + */ + +static MALLOC_DEFINE(M_AUDIT_SDEV, "audit_sdev", "Audit sdevs"); +static MALLOC_DEFINE(M_AUDIT_SDEV_ENTRY, "audit_sdevent", + "Audit sdev entries and buffers"); + +/* + * Default audit sdev buffer parameters. + */ +#define AUDIT_SDEV_QLIMIT_DEFAULT 128 +#define AUDIT_SDEV_QLIMIT_MIN 1 +#define AUDIT_SDEV_QLIMIT_MAX 1024 + /* - * Audit Propagation Knote List is a list of kevent knotes that are assosiated - * with an any ASID knote. If the any ASID gets modified or deleted these are - * modified or deleted as well. + * Entry structure. */ -struct au_plist { - struct knote *pl_knote; /* ptr to per-session knote */ - LIST_ENTRY(au_plist) pl_link; /* list link (2) */ +struct audit_sdev_entry { + void *ase_record; + u_int ase_record_len; + TAILQ_ENTRY(audit_sdev_entry) ase_queue; }; -typedef struct au_plist au_plist_t; -struct au_plisthead { - struct rlck ph_rlck; /* (2) lock for pl_link list */ - LIST_HEAD(au_plhead, au_plist) ph_head; /* list head */ +/* + * Per audit sdev structure. + */ + +struct audit_sdev { + int asdev_open; + +#define AUDIT_SDEV_ASYNC 0x00000001 +#define AUDIT_SDEV_NBIO 0x00000002 + +#define AUDIT_SDEV_ALLSESSIONS 0x00010000 + u_int asdev_flags; + + struct selinfo asdev_selinfo; + pid_t asdev_sigio; + + au_id_t asdev_auid; + au_asid_t asdev_asid; + + /* Per-sdev mutex for most fields in this struct. */ + struct mtx asdev_mtx; + + /* + * Per-sdev sleep lock serializing user-generated reads and + * flushes. uiomove() is called to copy out the current head + * record's data while the record remains in the queue, so we + * prevent other threads from removing it using this lock. 
+ */ + struct slck asdev_sx; + + /* + * Condition variable to signal when data has been delivered to + * a sdev. + */ + struct cv asdev_cv; + + /* Count and bound of records in the queue. */ + u_int asdev_qlen; + u_int asdev_qlimit; + + /* The number of bytes of data across all records. */ + u_int asdev_qbyteslen; + + /* + * The amount read so far of the first record in the queue. + * (The number of bytes available for reading in the queue is + * qbyteslen - qoffset.) + */ + u_int asdev_qoffset; + + /* + * Per-sdev operation statistics. + */ + u_int64_t asdev_inserts; /* Records added. */ + u_int64_t asdev_reads; /* Records read. */ + u_int64_t asdev_drops; /* Records dropped. */ + + /* + * Current pending record list. This is protected by a + * combination of asdev_mtx and asdev_sx. Note that both + * locks are required to remove a record from the head of the + * queue, as an in-progress read may sleep while copying and, + * therefore, cannot hold asdev_mtx. + */ + TAILQ_HEAD(, audit_sdev_entry) asdev_queue; + + /* Global sdev list. 
*/ + TAILQ_ENTRY(audit_sdev) asdev_list; }; -typedef struct au_plisthead au_plisthead_t; -#define EV_ANY_ASID EV_FLAG0 +#define AUDIT_SDEV_LOCK(asdev) mtx_lock(&(asdev)->asdev_mtx) +#define AUDIT_SDEV_LOCK_ASSERT(asdev) mtx_assert(&(asdev)->asdev_mtx, \ + MA_OWNED) +#define AUDIT_SDEV_LOCK_DESTROY(asdev) mtx_destroy(&(asdev)->asdev_mtx) +#define AUDIT_SDEV_LOCK_INIT(asdev) mtx_init(&(asdev)->asdev_mtx, \ + "audit_sdev_mtx", NULL, MTX_DEF) +#define AUDIT_SDEV_UNLOCK(asdev) mtx_unlock(&(asdev)->asdev_mtx) +#define AUDIT_SDEV_MTX(asdev) (&(asdev)->asdev_mtx) + +#define AUDIT_SDEV_SX_LOCK_DESTROY(asd) slck_destroy(&(asd)->asdev_sx) +#define AUDIT_SDEV_SX_LOCK_INIT(asd) slck_init(&(asd)->asdev_sx, \ + "audit_sdev_sx") +#define AUDIT_SDEV_SX_XLOCK_ASSERT(asd) slck_assert(&(asd)->asdev_sx, \ + SA_XLOCKED) +#define AUDIT_SDEV_SX_XLOCK_SIG(asd) slck_lock_sig(&(asd)->asdev_sx) +#define AUDIT_SDEV_SX_XUNLOCK(asd) slck_unlock(&(asd)->asdev_sx) -MALLOC_DEFINE(M_AU_SESSION, "audit_session", "Audit session data"); -MALLOC_DEFINE(M_AU_EV_PLIST, "audit_ev_plist", "Audit session event plist"); +/* + * Cloning variables and constants. + */ +#define AUDIT_SDEV_NAME "auditsessions" +#define MAX_AUDIT_SDEVS 32 + +static int audit_sdev_major; +static void *devnode; + +/* + * Global list of audit sdevs. The list is protected by a rw lock. + * Individual record queues are protected by per-sdev locks. These + * locks synchronize between threads walking the list to deliver to + * individual sdevs and adds/removes of sdevs. 
+ */ +static TAILQ_HEAD(, audit_sdev) audit_sdev_list; +static struct rwlock audit_sdev_lock; + +#define AUDIT_SDEV_LIST_LOCK_INIT() rw_init(&audit_sdev_lock, \ + "audit_sdev_list_lock") +#define AUDIT_SDEV_LIST_RLOCK() rw_rlock(&audit_sdev_lock) +#define AUDIT_SDEV_LIST_RUNLOCK() rw_runlock(&audit_sdev_lock) +#define AUDIT_SDEV_LIST_WLOCK() rw_wlock(&audit_sdev_lock) +#define AUDIT_SDEV_LIST_WLOCK_ASSERT() rw_assert(&audit_sdev_lock, \ + RA_WLOCKED) +#define AUDIT_SDEV_LIST_WUNLOCK() rw_wunlock(&audit_sdev_lock) + +/* + * dev_t doesn't have a pointer for "softc" data so we have to keep track of + * it with the following global array (indexed by the minor number). + * + * XXX We may want to dynamically grow this as needed. + */ +static struct audit_sdev *audit_sdev_dtab[MAX_AUDIT_SDEVS]; /* - * Kevent filters. + * Special device methods and definition. */ -static int audit_filt_sessionattach(struct knote *kn); -static void audit_filt_sessiondetach(struct knote *kn); -static void audit_filt_sessiontouch(struct knote *kn, - struct kevent64_s *kev, long type); -static int audit_filt_session(struct knote *kn, long hint); - -static void audit_register_kevents(uint32_t asid, uint32_t auid); - -struct filterops audit_session_filtops = { - .f_attach = audit_filt_sessionattach, - .f_detach = audit_filt_sessiondetach, - .f_touch = audit_filt_sessiontouch, - .f_event = audit_filt_session, +static open_close_fcn_t audit_sdev_open; +static open_close_fcn_t audit_sdev_close; +static read_write_fcn_t audit_sdev_read; +static ioctl_fcn_t audit_sdev_ioctl; +static select_fcn_t audit_sdev_poll; + +static struct cdevsw audit_sdev_cdevsw = { + .d_open = audit_sdev_open, + .d_close = audit_sdev_close, + .d_read = audit_sdev_read, + .d_write = eno_rdwrt, + .d_ioctl = audit_sdev_ioctl, + .d_stop = eno_stop, + .d_reset = eno_reset, + .d_ttys = NULL, + .d_select = audit_sdev_poll, + .d_mmap = eno_mmap, + .d_strategy = eno_strat, + .d_type = 0 }; /* - * The klist for consumers that are 
interested in any session (ASID). This list - * is not associated with any data structure but is used for registering - * new kevents when sessions are created. This klist is lock by - * anyas_klist_mtx. - */ -static struct klist anyas_klist; -struct mtx anyas_klist_mtx; - -#define AUDIT_ANYAS_KLIST_LOCK_INIT() mtx_init(&anyas_klist_mtx, \ - "audit anyas_klist_mtx", NULL, MTX_DEF) -#define AUDIT_ANYAS_KLIST_LOCK() mtx_lock(&anyas_klist_mtx) -#define AUDIT_ANYAS_KLIST_UNLOCK() mtx_unlock(&anyas_klist_mtx) -#define AUDIT_ANYAS_KLIST_LOCK_ASSERT() mtx_assert(&anyas_klist_mtx, MA_OWNED) + * Global statistics on audit sdevs. + */ +static int audit_sdev_count; /* Current number of sdevs. */ +static u_int64_t audit_sdev_ever; /* Sdevs ever allocated. */ +static u_int64_t audit_sdev_records; /* Total records seen. */ +static u_int64_t audit_sdev_drops; /* Global record drop count. */ + +static int audit_sdev_init(void); #define AUDIT_SENTRY_RWLOCK_INIT() rw_init(&se_entry_lck, \ - "audit se_entry_lck") + "se_entry_lck") #define AUDIT_SENTRY_RLOCK() rw_rlock(&se_entry_lck) #define AUDIT_SENTRY_WLOCK() rw_wlock(&se_entry_lck) #define AUDIT_SENTRY_RWLOCK_ASSERT() rw_assert(&se_entry_lck, RA_LOCKED) #define AUDIT_SENTRY_RUNLOCK() rw_runlock(&se_entry_lck) #define AUDIT_SENTRY_WUNLOCK() rw_wunlock(&se_entry_lck) -#define AUDIT_SE_KLIST_LOCK_INIT(se, n) mtx_init(&(se)->se_klist_mtx, \ - n, NULL, MTX_DEF) -#define AUDIT_SE_KLIST_LOCK(se) mtx_lock(&(se)->se_klist_mtx) -#define AUDIT_SE_KLIST_UNLOCK(se) mtx_unlock(&(se)->se_klist_mtx) -#define AUDIT_SE_KLIST_LOCK_DESTROY(se) mtx_destroy(&(se)->se_klist_mtx) -#define AUDIT_SE_KLIST_LOCK_ASSERT(se) mtx_assert(&(se)->se_klist_mtx, \ - MA_OWNED) - -#define AUDIT_PLIST_LOCK_INIT(pl) rlck_init(&(pl)->ph_rlck, \ - "audit ph_rlck") -#define AUDIT_PLIST_LOCK(pl) rlck_lock(&(pl)->ph_rlck) -#define AUDIT_PLIST_UNLOCK(pl) rlck_unlock(&(pl)->ph_rlck) -#define AUDIT_PLIST_LOCK_DESTROY(pl) rlck_destroy(&(pl)->ph_rlck) - +/* Access control on the 
auditinfo_addr.ai_flags member. */ +static uint64_t audit_session_superuser_set_sflags_mask; +static uint64_t audit_session_superuser_clear_sflags_mask; +static uint64_t audit_session_member_set_sflags_mask; +static uint64_t audit_session_member_clear_sflags_mask; +SYSCTL_NODE(, OID_AUTO, audit, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Audit controls"); +SYSCTL_NODE(_audit, OID_AUTO, session, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Audit sessions"); +SYSCTL_QUAD(_audit_session, OID_AUTO, superuser_set_sflags_mask, CTLFLAG_RW | CTLFLAG_LOCKED, + &audit_session_superuser_set_sflags_mask, + "Audit session flags settable by superuser"); +SYSCTL_QUAD(_audit_session, OID_AUTO, superuser_clear_sflags_mask, CTLFLAG_RW | CTLFLAG_LOCKED, + &audit_session_superuser_clear_sflags_mask, + "Audit session flags clearable by superuser"); +SYSCTL_QUAD(_audit_session, OID_AUTO, member_set_sflags_mask, CTLFLAG_RW | CTLFLAG_LOCKED, + &audit_session_member_set_sflags_mask, + "Audit session flags settable by a session member"); +SYSCTL_QUAD(_audit_session, OID_AUTO, member_clear_sflags_mask, CTLFLAG_RW | CTLFLAG_LOCKED, + &audit_session_member_clear_sflags_mask, + "Audit session flags clearable by a session member"); + +#define AUDIT_SESSION_DEBUG 0 #if AUDIT_SESSION_DEBUG +/* + * The following is debugging code that can be used to get a snapshot of the + * session state. The audit session information is read out using sysctl: + * + * error = sysctlbyname("kern.audit_session_debug", buffer_ptr, &buffer_len, + * NULL, 0); + */ #include +/* + * The per session record structure for the snapshot data. 
+ */ struct au_sentry_debug { auditinfo_addr_t se_auinfo; - long se_refcnt; - long se_procnt; + int64_t se_refcnt; /* reference count */ + int64_t se_procnt; /* process count */ + int64_t se_ptcnt; /* process count from + proc table */ }; typedef struct au_sentry_debug au_sentry_debug_t; static int audit_sysctl_session_debug(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); -SYSCTL_PROC(_kern, OID_AUTO, audit_session_debug, CTLFLAG_RD, NULL, 0, - audit_sysctl_session_debug, "S,audit_session_debug", +SYSCTL_PROC(_kern, OID_AUTO, audit_session_debug, CTLFLAG_RD | CTLFLAG_LOCKED, + NULL, 0, audit_sysctl_session_debug, "S,audit_session_debug", "Current session debug info for auditing."); /* - * Copy out the session debug info via the sysctl interface. The userland code - * is something like the following: + * Callouts for proc_iterate() which is used to reconcile the audit session + * proc state information with the proc table. We get everything we need + * in the filterfn while the proc_lock() is held so we really don't need the + * callout() function. + */ +static int +audit_session_debug_callout(__unused proc_t p, __unused void *arg) +{ + + return (PROC_RETURNED_DONE); +} + +static int +audit_session_debug_filterfn(proc_t p, void *st) +{ + kauth_cred_t cred = p->p_ucred; + auditinfo_addr_t *aia_p = cred->cr_audit.as_aia_p; + au_sentry_debug_t *sed_tab = (au_sentry_debug_t *) st; + au_sentry_debug_t *sdtp; + au_sentry_t *se; + + if (IS_VALID_SESSION(aia_p)) { + sdtp = &sed_tab[0]; + do { + if (aia_p->ai_asid == sdtp->se_asid) { + sdtp->se_ptcnt++; + + /* Do some sanity checks. 
*/ + se = AU_SENTRY_PTR(aia_p); + if (se->se_refcnt != sdtp->se_refcnt) { + sdtp->se_refcnt = + (int64_t)se->se_refcnt; + } + if (se->se_procnt != sdtp->se_procnt) { + sdtp->se_procnt = + (int64_t)se->se_procnt; + } + break; + } + sdtp++; + } while (sdtp->se_asid != 0 && sdtp->se_auid != 0); + } else { + /* add it to the default session */ + sed_tab->se_ptcnt++; + } + + return (0); +} + +/* + * Copy out the session debug info via the sysctl interface. * - * error = sysctlbyname("kern.audit_session_debug", buffer_ptr, &buffer_len, - * NULL, 0); */ static int audit_sysctl_session_debug(__unused struct sysctl_oid *oidp, @@ -223,6 +495,7 @@ audit_sysctl_session_debug(__unused struct sysctl_oid *oidp, if (se != NULL) entry_cnt++; + entry_cnt++; /* add one for the default entry */ /* * If just querying then return the space required. There is an * obvious race condition here so we just fudge this by 3 in case @@ -258,10 +531,18 @@ audit_sysctl_session_debug(__unused struct sysctl_oid *oidp, */ sz = 0; next_sed = sed_tab; + /* add the first entry for processes not tracked in sessions. */ + bcopy(audit_default_aia_p, &next_sed->se_auinfo, sizeof (au_sentry_t)); + next_sed->se_refcnt = (int64_t)audit_default_se.se_refcnt; + next_sed->se_procnt = (int64_t)audit_default_se.se_procnt; + next_sed++; + sz += sizeof(au_sentry_debug_t); for(i = 0; i < HASH_TABLE_SIZE; i++) { LIST_FOREACH(se, &au_sentry_bucket[i], se_link) { if (se != NULL) { - bcopy(se, next_sed, sizeof(next_sed)); + next_sed->se_auinfo = se->se_auinfo; + next_sed->se_refcnt = (int64_t)se->se_refcnt; + next_sed->se_procnt = (int64_t)se->se_procnt; next_sed++; sz += sizeof(au_sentry_debug_t); } @@ -269,6 +550,12 @@ audit_sysctl_session_debug(__unused struct sysctl_oid *oidp, } AUDIT_SENTRY_RUNLOCK(); + /* Reconcile with the process table. 
*/ + (void) proc_iterate(PROC_ALLPROCLIST | PROC_ZOMBPROCLIST, + audit_session_debug_callout, NULL, + audit_session_debug_filterfn, (void *)&sed_tab[0]); + + req->oldlen = sz; err = SYSCTL_OUT(req, sed_tab, sz); kfree(sed_tab, entry_cnt * sizeof(au_sentry_debug_t)); @@ -278,6 +565,65 @@ audit_sysctl_session_debug(__unused struct sysctl_oid *oidp, #endif /* AUDIT_SESSION_DEBUG */ +/* + * Create and commit a session audit event. The proc and se arguments needs to + * be that of the subject and not necessarily the current process. + */ +static void +audit_session_event(int event, auditinfo_addr_t *aia_p) +{ + struct kaudit_record *ar; + + KASSERT(AUE_SESSION_START == event || AUE_SESSION_UPDATE == event || + AUE_SESSION_END == event || AUE_SESSION_CLOSE == event, + ("audit_session_event: invalid event: %d", event)); + + if (NULL == aia_p) + return; + + /* + * Create a new audit record. The record will contain the subject + * ruid, rgid, egid, pid, auid, asid, amask, and term_addr + * (implicitly added by audit_new). + */ + ar = audit_new(event, PROC_NULL, /* Not used */ NULL); + if (NULL == ar) + return; + + /* + * Audit session events are always generated because they are used + * by some userland consumers so just set the preselect flag. + */ + ar->k_ar_commit |= AR_PRESELECT_FILTER; + + /* + * Populate the subject information. Note that the ruid, rgid, + * egid, and pid values are incorrect. We only need the auditinfo_addr + * information. + */ + ar->k_ar.ar_subj_ruid = 0; + ar->k_ar.ar_subj_rgid = 0; + ar->k_ar.ar_subj_egid = 0; + ar->k_ar.ar_subj_pid = 0; + ar->k_ar.ar_subj_auid = aia_p->ai_auid; + ar->k_ar.ar_subj_asid = aia_p->ai_asid; + bcopy(&aia_p->ai_termid, &ar->k_ar.ar_subj_term_addr, + sizeof(struct au_tid_addr)); + + /* Add the audit masks to the record. 
*/ + ar->k_ar.ar_arg_amask.am_success = aia_p->ai_mask.am_success; + ar->k_ar.ar_arg_amask.am_failure = aia_p->ai_mask.am_failure; + ARG_SET_VALID(ar, ARG_AMASK); + + /* Add the audit session flags to the record. */ + ar->k_ar.ar_arg_value64 = aia_p->ai_flags; + ARG_SET_VALID(ar, ARG_VALUE64); + + + /* Commit the record to the queue. */ + audit_commit(ar, 0, 0); +} + /* * Hash the audit session ID using a simple 32-bit mix. */ @@ -296,7 +642,8 @@ audit_session_hash(au_asid_t asid) /* * Do an hash lookup and find the session entry for a given ASID. Return NULL - * if not found. + * if not found. If the session is found then audit_session_find takes a + * reference. */ static au_sentry_t * audit_session_find(au_asid_t asid) @@ -309,23 +656,14 @@ audit_session_find(au_asid_t asid) hkey = HASH_ASID(asid); LIST_FOREACH(found_se, &au_sentry_bucket[hkey], se_link) - if (found_se->se_asid == asid) + if (found_se->se_asid == asid) { + au_history_record(found_se, AU_HISTORY_EVENT_FIND); + audit_ref_session(found_se); return (found_se); + } return (NULL); } -/* - * Call kqueue knote while holding the session entry klist lock. - */ -static void -audit_session_knote(au_sentry_t *se, long hint) -{ - - AUDIT_SE_KLIST_LOCK(se); - KNOTE(&se->se_klist, hint); - AUDIT_SE_KLIST_UNLOCK(se); -} - /* * Remove the given audit_session entry from the hash table. */ @@ -335,20 +673,35 @@ audit_session_remove(au_sentry_t *se) uint32_t hkey; au_sentry_t *found_se, *tmp_se; + au_history_record(se, AU_HISTORY_EVENT_DEATH); KASSERT(se->se_refcnt == 0, ("audit_session_remove: ref count != 0")); + KASSERT(se != &audit_default_se, + ("audit_session_remove: removing default session")); hkey = HASH_ASID(se->se_asid); AUDIT_SENTRY_WLOCK(); + /* + * Check and see if someone got a reference before we got the lock. 
+ */ + if (se->se_refcnt != 0) { + AUDIT_SENTRY_WUNLOCK(); + return; + } + + audit_session_portdestroy(&se->se_port); LIST_FOREACH_SAFE(found_se, &au_sentry_bucket[hkey], se_link, tmp_se) { if (found_se == se) { - audit_session_knote(found_se, NOTE_AS_CLOSE); + /* + * Generate an audit event to notify userland of the + * session close. + */ + audit_session_event(AUE_SESSION_CLOSE, + &found_se->se_auinfo); LIST_REMOVE(found_se, se_link); AUDIT_SENTRY_WUNLOCK(); - AUDIT_SE_KLIST_LOCK_DESTROY(found_se); - found_se->se_refcnt = 0; free(found_se, M_AU_SESSION); return; @@ -365,6 +718,11 @@ audit_ref_session(au_sentry_t *se) { long old_val; + if (se == NULL || se == &audit_default_se) + return; + + au_history_record(se, AU_HISTORY_EVENT_REF); + old_val = OSAddAtomicLong(1, &se->se_refcnt); KASSERT(old_val < 100000, ("audit_ref_session: Too many references on session.")); @@ -378,6 +736,11 @@ audit_unref_session(au_sentry_t *se) { long old_val; + if (se == NULL || se == &audit_default_se) + return; + + au_history_record(se, AU_HISTORY_EVENT_UNREF); + old_val = OSAddAtomicLong(-1, &se->se_refcnt); if (old_val == 1) audit_session_remove(se); @@ -393,6 +756,9 @@ audit_inc_procount(au_sentry_t *se) { long old_val; + if (se == NULL || se == &audit_default_se) + return; + old_val = OSAddAtomicLong(1, &se->se_procnt); KASSERT(old_val <= PID_MAX, ("audit_inc_procount: proc count > PID_MAX")); @@ -407,9 +773,16 @@ audit_dec_procount(au_sentry_t *se) { long old_val; + if (se == NULL || se == &audit_default_se) + return; + old_val = OSAddAtomicLong(-1, &se->se_procnt); + /* + * If this was the last process generate an audit event to notify + * userland of the session ending. 
+ */ if (old_val == 1) - audit_session_knote(se, NOTE_AS_END); + audit_session_event(AUE_SESSION_END, &se->se_auinfo); KASSERT(old_val >= 1, ("audit_dec_procount: proc count < 0")); } @@ -426,7 +799,7 @@ audit_update_sentry(au_sentry_t *se, auditinfo_addr_t *new_aia) auditinfo_addr_t *aia = &se->se_auinfo; int update; - KASSERT(new_aia != &audit_default_aia, + KASSERT(new_aia != audit_default_aia_p, ("audit_update_sentry: Trying to update the default aia.")); update = (aia->ai_auid != new_aia->ai_auid || @@ -464,64 +837,66 @@ audit_session_nextid(void) * reference to the entry that must be unref'ed. */ static auditinfo_addr_t * -audit_session_new(auditinfo_addr_t *new_aia, int newprocess) +audit_session_new(auditinfo_addr_t *new_aia_p, auditinfo_addr_t *old_aia_p) { - au_asid_t asid; + au_asid_t new_asid; au_sentry_t *se = NULL; + au_sentry_t *found_se = NULL; auditinfo_addr_t *aia = NULL; - char nm[LOCK_MAX_NAME]; - KASSERT(new_aia != NULL, ("audit_session_new: new_aia == NULL")); + KASSERT(new_aia_p != NULL, ("audit_session_new: new_aia_p == NULL")); - asid = new_aia->ai_asid; + new_asid = new_aia_p->ai_asid; -#if 0 /* XXX this assertion is currently broken by securityd/LoginWindow */ - KASSERT((asid != AU_ASSIGN_ASID && asid <= PID_MAX), - ("audit_session_new: illegal ASID value: %d", asid)); -#endif - /* * Alloc a new session entry now so we don't wait holding the lock. */ se = malloc(sizeof(au_sentry_t), M_AU_SESSION, M_WAITOK | M_ZERO); - snprintf(nm, sizeof(nm), "audit se_klist_mtx %d", asid); - AUDIT_SE_KLIST_LOCK_INIT(se, nm); - /* * Find an unique session ID, if desired. */ AUDIT_SENTRY_WLOCK(); - if (asid == AU_ASSIGN_ASID) { + if (new_asid == AU_ASSIGN_ASID) { do { - asid = (au_asid_t)audit_session_nextid(); - } while(audit_session_find(asid) != NULL); + + new_asid = (au_asid_t)audit_session_nextid(); + found_se = audit_session_find(new_asid); + + /* + * If the session ID is currently active then drop the + * reference and try again. 
+ */ + if (found_se != NULL) + audit_unref_session(found_se); + else + break; + } while(1); } else { - au_sentry_t *found_se = NULL; /* * Check to see if the requested ASID is already in the * hash table. If so, update it with the new auditinfo. */ - if ((found_se = audit_session_find(asid)) != NULL) { + if ((found_se = audit_session_find(new_asid)) != NULL) { int updated; - updated = audit_update_sentry(found_se, new_aia); - audit_ref_session(found_se); + updated = audit_update_sentry(found_se, new_aia_p); AUDIT_SENTRY_WUNLOCK(); - AUDIT_SE_KLIST_LOCK_DESTROY(se); free(se, M_AU_SESSION); - if (updated) - audit_session_knote(found_se, NOTE_AS_UPDATE); + /* If a different session then add this process in. */ + if (new_aia_p != old_aia_p) + audit_inc_procount(found_se); /* - * If this is a new process joining this session then - * we need to update the proc count. + * If the session information was updated then + * generate an audit event to notify userland. */ - if (newprocess) - audit_inc_procount(found_se); + if (updated) + audit_session_event(AUE_SESSION_UPDATE, + &found_se->se_auinfo); return (&found_se->se_auinfo); } @@ -539,25 +914,23 @@ audit_session_new(auditinfo_addr_t *new_aia, int newprocess) */ se->se_port = IPC_PORT_NULL; aia = &se->se_auinfo; - aia->ai_asid = asid; - aia->ai_auid = new_aia->ai_auid; - bzero(&new_aia->ai_mask, sizeof(new_aia->ai_mask)); - bcopy(&new_aia->ai_termid, &aia->ai_termid, sizeof(aia->ai_termid)); - aia->ai_flags = new_aia->ai_flags; + aia->ai_asid = new_asid; + aia->ai_auid = new_aia_p->ai_auid; + bzero(&new_aia_p->ai_mask, sizeof(new_aia_p->ai_mask)); + bcopy(&new_aia_p->ai_termid, &aia->ai_termid, sizeof(aia->ai_termid)); + aia->ai_flags = new_aia_p->ai_flags; /* * Add it to the hash table. 
*/ - LIST_INSERT_HEAD(&au_sentry_bucket[HASH_ASID(asid)], se, se_link); + LIST_INSERT_HEAD(&au_sentry_bucket[HASH_ASID(new_asid)], se, se_link); AUDIT_SENTRY_WUNLOCK(); /* - * Register kevents for consumers wanting events for any ASID - * and knote the event. + * Generate an audit event to notify userland of the new session. */ - audit_register_kevents(se->se_asid, se->se_auid); - audit_session_knote(se, NOTE_AS_START); - + audit_session_event(AUE_SESSION_START, aia); + au_history_record(se, AU_HISTORY_EVENT_BIRTH); return (aia); } @@ -577,13 +950,22 @@ audit_session_lookup(au_asid_t asid, auditinfo_addr_t *ret_aia) AUDIT_SENTRY_RUNLOCK(); return (1); } + /* We have a reference on the session so it is safe to drop the lock. */ + AUDIT_SENTRY_RUNLOCK(); if (ret_aia != NULL) bcopy(&se->se_auinfo, ret_aia, sizeof(*ret_aia)); - AUDIT_SENTRY_RUNLOCK(); + audit_unref_session(se); return (0); } +void +audit_session_aiaref(auditinfo_addr_t *aia_p) +{ + + audit_ref_session(AU_SENTRY_PTR(aia_p)); +} + /* * Add a reference to the session entry. */ @@ -596,9 +978,13 @@ audit_session_ref(kauth_cred_t cred) ("audit_session_ref: Invalid kauth_cred.")); aia_p = cred->cr_audit.as_aia_p; + audit_session_aiaref(aia_p); +} + +void audit_session_aiaunref(auditinfo_addr_t *aia_p) +{ - if (IS_VALID_SESSION(aia_p)) - audit_ref_session(AU_SENTRY_PTR(aia_p)); + audit_unref_session(AU_SENTRY_PTR(aia_p)); } /* @@ -613,14 +999,17 @@ audit_session_unref(kauth_cred_t cred) ("audit_session_unref: Invalid kauth_cred.")); aia_p = cred->cr_audit.as_aia_p; - - if (IS_VALID_SESSION(aia_p)) - audit_unref_session(AU_SENTRY_PTR(aia_p)); + audit_session_aiaunref(aia_p); } +/* + * Increment the per audit session process count. Assumes that the caller has + * a reference on the process' cred. 
+ */ void -audit_session_procnew(kauth_cred_t cred) +audit_session_procnew(proc_t p) { + kauth_cred_t cred = p->p_ucred; auditinfo_addr_t *aia_p; KASSERT(IS_VALID_CRED(cred), @@ -628,13 +1017,17 @@ audit_session_procnew(kauth_cred_t cred) aia_p = cred->cr_audit.as_aia_p; - if (IS_VALID_SESSION(aia_p)) - audit_inc_procount(AU_SENTRY_PTR(aia_p)); + audit_inc_procount(AU_SENTRY_PTR(aia_p)); } +/* + * Decrement the per audit session process count. Assumes that the caller has + * a reference on the cred. + */ void -audit_session_procexit(kauth_cred_t cred) +audit_session_procexit(proc_t p) { + kauth_cred_t cred = p->p_ucred; auditinfo_addr_t *aia_p; KASSERT(IS_VALID_CRED(cred), @@ -642,8 +1035,7 @@ audit_session_procexit(kauth_cred_t cred) aia_p = cred->cr_audit.as_aia_p; - if (IS_VALID_SESSION(aia_p)) - audit_dec_procount(AU_SENTRY_PTR(aia_p)); + audit_dec_procount(AU_SENTRY_PTR(aia_p)); } /* @@ -658,450 +1050,109 @@ audit_session_init(void) ("audit_session_init: ASSIGNED_ASID_MAX is not large enough.")); AUDIT_SENTRY_RWLOCK_INIT(); - AUDIT_ANYAS_KLIST_LOCK_INIT(); au_sentry_bucket = malloc( sizeof(struct au_sentry) * HASH_TABLE_SIZE, M_AU_SESSION, M_WAITOK | M_ZERO); for (i = 0; i < HASH_TABLE_SIZE; i++) LIST_INIT(&au_sentry_bucket[i]); -} - -/* - * Allocate a new kevent propagation list (plist). - */ -static caddr_t -audit_new_plist(void) -{ - au_plisthead_t *plhead; - - plhead = malloc(sizeof(au_plisthead_t), M_AU_EV_PLIST, M_WAITOK | - M_ZERO); - - LIST_INIT(&plhead->ph_head); - AUDIT_PLIST_LOCK_INIT(plhead); - return ((caddr_t) plhead); + (void)audit_sdev_init(); +#if AU_HISTORY_LOGGING + au_history = malloc(sizeof(struct au_history) * au_history_size, + M_AU_SESSION, M_WAITOK|M_ZERO); +#endif } -/* - * Destroy a kevent propagation list (plist). The anyas_klist_mtx mutex must be - * held by the caller. 
- */ -static void -audit_destroy_plist(struct knote *anyas_kn) +static int +audit_session_update_check(kauth_cred_t cred, auditinfo_addr_t *old, + auditinfo_addr_t *new) { - au_plisthead_t *plhead; - au_plist_t *plentry, *ple_tmp; - struct kevent64_s kev; - - KASSERT(anyas_kn != NULL, ("audit_destroy_plist: anyas = NULL")); - plhead = (au_plisthead_t *)anyas_kn->kn_hook; - KASSERT(plhead != NULL, ("audit_destroy_plist: plhead = NULL")); - - /* - * Delete everything in the propagation list. + uint64_t n; + + /* If the current audit ID is not the default then it is immutable. */ + if (old->ai_auid != AU_DEFAUDITID && old->ai_auid != new->ai_auid) + return (EINVAL); + + /* If the current termid is not the default then it is immutable. */ + if ((old->ai_termid.at_type != AU_IPv4 || + old->ai_termid.at_port != 0 || + old->ai_termid.at_addr[0] != 0) && + (old->ai_termid.at_port != new->ai_termid.at_port || + old->ai_termid.at_type != new->ai_termid.at_type || + 0 != bcmp(&old->ai_termid.at_addr, &new->ai_termid.at_addr, + sizeof (old->ai_termid.at_addr)))) + return (EINVAL); + + /* The flags may be set only according to the + * audit_session_*_set_sflags_masks. */ - AUDIT_PLIST_LOCK(plhead); - LIST_FOREACH_SAFE(plentry, &plhead->ph_head, pl_link, ple_tmp) { - struct kqueue *kq = plentry->pl_knote->kn_kq; - - kev.ident = plentry->pl_knote->kn_id; - kev.filter = EVFILT_SESSION; - kev.flags = EV_DELETE; - - /* - * The plist entry gets removed in rm_from_plist() which is - * called indirectly by kevent_register(). - */ - kevent_register(kq, &kev, NULL); - } - AUDIT_PLIST_UNLOCK(plhead); - - /* - * Remove the head. + n = ~old->ai_flags & new->ai_flags; + if (0 != n && + !((n == (audit_session_superuser_set_sflags_mask & n) && + kauth_cred_issuser(cred)) || + (n == (audit_session_member_set_sflags_mask & n) && + old->ai_asid == new->ai_asid))) + return (EINVAL); + + /* The flags may be cleared only according to the + * audit_session_*_clear_sflags_masks. 
*/ - AUDIT_PLIST_LOCK_DESTROY(plhead); - free(plhead, M_AU_EV_PLIST); + n = ~new->ai_flags & old->ai_flags; + if (0 != n && + !((n == (audit_session_superuser_clear_sflags_mask & n) && + kauth_cred_issuser(cred)) || + (n == (audit_session_member_clear_sflags_mask & n) && + old->ai_asid == new->ai_asid))) + return (EINVAL); + + /* The audit masks are mutable. */ + return (0); } /* - * Add a knote pointer entry to the kevent propagation list. + * Safely update kauth cred of the given process with new the given audit info. */ -static void -audit_add_to_plist(struct knote *anyas_kn, struct knote *kn) +int +audit_session_setaia(proc_t p, auditinfo_addr_t *new_aia_p) { - au_plisthead_t *plhead; - au_plist_t *plentry; - - KASSERT(anyas_kn != NULL, ("audit_add_to_plist: anyas = NULL")); - plhead = (au_plisthead_t *)anyas_kn->kn_hook; - KASSERT(plhead != NULL, ("audit_add_to_plist: plhead = NULL")); + kauth_cred_t my_cred, my_new_cred; + struct au_session as; + struct au_session tmp_as; + auditinfo_addr_t caia, *old_aia_p; + int ret; - plentry = malloc(sizeof(au_plist_t), M_AU_EV_PLIST, M_WAITOK | M_ZERO); + /* + * If this is going to modify an existing session then do some + * immutable checks. + */ + if (audit_session_lookup(new_aia_p->ai_asid, &caia) == 0) { + my_cred = kauth_cred_proc_ref(p); + ret = audit_session_update_check(my_cred, &caia, new_aia_p); + kauth_cred_unref(&my_cred); + if (ret) + return (ret); + } - plentry->pl_knote = kn; - AUDIT_PLIST_LOCK(plhead); - LIST_INSERT_HEAD(&plhead->ph_head, plentry, pl_link); - AUDIT_PLIST_UNLOCK(plhead); -} + my_cred = kauth_cred_proc_ref(p); + bcopy(&new_aia_p->ai_mask, &as.as_mask, sizeof(as.as_mask)); + old_aia_p = my_cred->cr_audit.as_aia_p; + /* audit_session_new() adds a reference on the session */ + as.as_aia_p = audit_session_new(new_aia_p, old_aia_p); -/* - * Remote a knote pointer entry from the kevent propagation list. 
The lock - * on the plist may already be head (by audit_destroy_plist() above) so we use - * a recursive lock. - */ -static void -audit_rm_from_plist(struct knote *kn) -{ - struct knote *anyas_kn; - au_plisthead_t *plhd; - au_plist_t *plentry, *ple_tmp; - - KASSERT(kn != NULL, ("audit_rm_from_plist: kn = NULL")); - anyas_kn = (struct knote *)kn->kn_hook; - KASSERT(anyas_kn != NULL, ("audit_rm_to_plist: anyas = NULL")); - plhd = (au_plisthead_t *)anyas_kn->kn_hook; - - AUDIT_PLIST_LOCK(plhd); - LIST_FOREACH_SAFE(plentry, &plhd->ph_head, pl_link, ple_tmp) { - if (plentry->pl_knote == kn) { - LIST_REMOVE(plentry, pl_link); - free(plentry, M_AU_EV_PLIST); - AUDIT_PLIST_UNLOCK(plhd); - return; - } - } - AUDIT_PLIST_UNLOCK(plhd); -} + /* If the process left a session then update the process count. */ + if (old_aia_p != new_aia_p) + audit_dec_procount(AU_SENTRY_PTR(old_aia_p)); -/* - * The attach filter for EVFILT_SESSION. - */ -static int -audit_filt_sessionattach(struct knote *kn) -{ - au_sentry_t *se = NULL; /* - * Check flags for the events we currently support. + * We are modifying the audit info in a credential so we need a new + * credential (or take another reference on an existing credential that + * matches our new one). We must do this because the audit info in the + * credential is used as part of our hash key. Get current credential + * in the target process and take a reference while we muck with it. */ - if ((kn->kn_sfflags & (NOTE_AS_START | NOTE_AS_END | NOTE_AS_CLOSE - | NOTE_AS_UPDATE | NOTE_AS_ERR)) == 0) - return (ENOTSUP); - - /* - * If the interest is in any session then add to the any ASID knote - * list. Otherwise, add it to the knote list assosiated with the - * given session. - */ - if (kn->kn_id == AS_ANY_ASID) { - - kn->kn_flags |= EV_CLEAR; - kn->kn_ptr.p_se = NULL; - - /* - * Attach a kevent propagation list for any kevents that get - * added. 
- */ - kn->kn_hook = audit_new_plist(); - - AUDIT_ANYAS_KLIST_LOCK(); - KNOTE_ATTACH(&anyas_klist, kn); - AUDIT_ANYAS_KLIST_UNLOCK(); - - return (0); - } else { - - /* - * NOTE: The anyas klist lock will be held in this - * part of the code when indirectly called from - * audit_register_kevents() below. - */ - - /* - * Check to make sure it is a valid ASID. - */ - if (kn->kn_id > ASSIGNED_ASID_MAX) - return (EINVAL); - - AUDIT_SENTRY_RLOCK(); - se = audit_session_find(kn->kn_id); - AUDIT_SENTRY_RUNLOCK(); - if (se == NULL) - return (EINVAL); - - AUDIT_SE_KLIST_LOCK(se); - kn->kn_flags |= EV_CLEAR; - kn->kn_ptr.p_se = se; - - /* - * If this attach is the result of an "any ASID" (pseudo) - * kevent then attach the any session knote ptr to this knote. - * Also, add this knote to the its propagation list. - */ - if (kn->kn_flags & EV_ANY_ASID) { - struct knote *anyas_kn = - (struct knote *)((uintptr_t)kn->kn_kevent.ext[0]); - kn->kn_hook = (caddr_t) anyas_kn; - kn->kn_flags &= ~EV_ANY_ASID; - audit_add_to_plist(anyas_kn, kn); - } else - kn->kn_hook = NULL; - KNOTE_ATTACH(&se->se_klist, kn); - AUDIT_SE_KLIST_UNLOCK(se); - - return (0); - } -} - -/* - * The detach filter for EVFILT_SESSION. - */ -static void -audit_filt_sessiondetach(struct knote *kn) -{ - au_sentry_t *se = NULL; - - if (kn->kn_id == AS_ANY_ASID) { - - AUDIT_ANYAS_KLIST_LOCK(); - audit_destroy_plist(kn); - KNOTE_DETACH(&anyas_klist, kn); - AUDIT_ANYAS_KLIST_UNLOCK(); - - } else { - /* - * If this knote was created by any ASID kevent then remove - * from kevent propagation list. - */ - if (kn->kn_hook != NULL) { - audit_rm_from_plist(kn); - kn->kn_hook = NULL; - } - - /* - * Check to see if already detached. - */ - se = kn->kn_ptr.p_se; - if (se != NULL) { - AUDIT_SE_KLIST_LOCK(se); - kn->kn_ptr.p_se = NULL; - KNOTE_DETACH(&se->se_klist, kn); - AUDIT_SE_KLIST_UNLOCK(se); - } - } -} - -/* - * The touch filter for EVFILT_SESSION. Check for any ASID kevent updates and - * propagate the change. 
- */ -static void -audit_filt_sessiontouch(struct knote *kn, struct kevent64_s *kev, long type) -{ - struct knote *ple_kn; - struct kqueue *kq; - au_sentry_t *se; - au_plisthead_t *plhead; - au_plist_t *plentry; - struct kevent64_s newkev; - - switch (type) { - case EVENT_REGISTER: - kn->kn_sfflags = kev->fflags; - kn->kn_sdata = kev->data; - /* - * If an any ASID kevent was updated then we may need to - * propagate the update. - */ - if (kev->ident == AS_ANY_ASID && kn->kn_hook != NULL) { - - /* - * Propagate the change to each of the session kevents - * that were created by this any ASID kevent. - */ - plhead = (au_plisthead_t *)kn->kn_hook; - AUDIT_PLIST_LOCK(plhead); - LIST_FOREACH(plentry, &plhead->ph_head, pl_link) { - - if ((ple_kn = plentry->pl_knote) == NULL) - continue; - if ((se = ple_kn->kn_ptr.p_se) == NULL) - continue; - if ((kq = ple_kn->kn_kq) == NULL) - continue; - - newkev.ident = plentry->pl_knote->kn_id; - newkev.filter = EVFILT_SESSION; - newkev.flags = kev->flags; - newkev.fflags = kev->fflags; - newkev.data = kev->data; - newkev.udata = kev->udata; - kevent_register(kq, &newkev, NULL); - } - AUDIT_PLIST_UNLOCK(plhead); - } - break; - - case EVENT_PROCESS: - *kev = kn->kn_kevent; - if (kn->kn_flags & EV_CLEAR) { - kn->kn_data = 0; - kn->kn_fflags = 0; - } - break; - - default: - KASSERT((type == EVENT_REGISTER || type == EVENT_PROCESS), - ("filt_sessiontouch(): invalid type (%ld)", type)); - break; - } -} - -/* - * Event filter for EVFILT_SESSION. The AUDIT_SE_KLIST_LOCK should be held - * by audit_session_knote(). - */ -static int -audit_filt_session(struct knote *kn, long hint) -{ - int events = (int)hint; - au_sentry_t *se = kn->kn_ptr.p_se; - - if (hint != 0 && se != NULL) { - - if (kn->kn_sfflags & events) { - kn->kn_fflags |= events; - kn->kn_data = se->se_auid; - } - - /* - * If this is the last possible event for the knote, - * detach the knote from the audit session before the - * session goes away. 
- */ - if (events & NOTE_AS_CLOSE) { - - /* - * If created by any ASID kevent then remove from - * propagation list. - */ - if (kn->kn_hook != NULL) { - audit_rm_from_plist(kn); - kn->kn_hook = NULL; - } - kn->kn_flags |= (EV_EOF | EV_ONESHOT); - kn->kn_ptr.p_se = NULL; - AUDIT_SE_KLIST_LOCK_ASSERT(se); - KNOTE_DETACH(&se->se_klist, kn); - - return (1); - } - } - return (kn->kn_fflags != 0); -} - -/* - * For all the consumers wanting events for all sessions, register new - * kevents associated with the session for the given ASID. The actual - * attachment is done by the EVFILT_SESSION attach filter above. - */ -static void -audit_register_kevents(uint32_t asid, uint32_t auid) -{ - struct knote *kn; - - AUDIT_ANYAS_KLIST_LOCK(); - SLIST_FOREACH(kn, &anyas_klist, kn_selnext) { - struct kqueue *kq = kn->kn_kq; - struct kevent64_s kev; - int err; - - kev.ident = asid; - kev.filter = EVFILT_SESSION; - kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ANY_ASID; - kev.fflags = kn->kn_sfflags; - kev.data = auid; - kev.udata = kn->kn_kevent.udata; - - /* - * Save the knote ptr for this "any ASID" knote for the attach - * filter. - */ - kev.ext[0] = (uint64_t)((uintptr_t)kn); - - /* - * XXX kevent_register() may block here alloc'ing a new knote. - * We may want to think about using a lockless linked list or - * at least a sleep rwlock for the anyas_klist. - */ - err = kevent_register(kq, &kev, NULL); - if (err) - kn->kn_fflags |= NOTE_AS_ERR; - } - AUDIT_ANYAS_KLIST_UNLOCK(); -} - -/* - * Safely update kauth cred of the given process with new the given audit info. - * If the newprocess flag is set then we need to account for this process in - * the proc count. - */ -int -audit_session_setaia(proc_t p, auditinfo_addr_t *aia_p, int newprocess) -{ - kauth_cred_t my_cred, my_new_cred; - struct au_session as; - struct au_session tmp_as; - auditinfo_addr_t caia; - - /* - * If this is going to modify an existing session then do some - * immutable checks. 
- */ - if (audit_session_lookup(aia_p->ai_asid, &caia) == 0) { - - /* - * If the current audit ID is not the default then it is - * immutable. - */ - if (caia.ai_auid != AU_DEFAUDITID && - caia.ai_auid != aia_p->ai_auid) - return (EINVAL); - - /* - * If the current termid is not the default then it is - * immutable. - */ - if ((caia.ai_termid.at_type != AU_IPv4 || - caia.ai_termid.at_port != 0 || - caia.ai_termid.at_addr[0] != 0) && - (caia.ai_termid.at_port != aia_p->ai_termid.at_port || - caia.ai_termid.at_type != aia_p->ai_termid.at_type || - bcmp(&caia.ai_termid.at_addr, &aia_p->ai_termid.at_addr, - sizeof (caia.ai_termid.at_addr) )) ) - return (EINVAL); - - /* The audit flags are immutable. */ - if (caia.ai_flags != aia_p->ai_flags) - return (EINVAL); - - /* The audit masks are mutable. */ - } - - my_cred = kauth_cred_proc_ref(p); - bcopy(&aia_p->ai_mask, &as.as_mask, sizeof(as.as_mask)); - as.as_aia_p = audit_session_new(aia_p, newprocess); - - /* - * We are modifying the audit info in a credential so we need a new - * credential (or take another reference on an existing credential that - * matches our new one). We must do this because the audit info in the - * credential is used as part of our hash key. Get current credential - * in the target process and take a reference while we muck with it. - */ - for (;;) { + for (;;) { /* * Set the credential with new info. If there is no change, @@ -1129,6 +1180,8 @@ audit_session_setaia(proc_t p, auditinfo_addr_t *aia_p, int newprocess) continue; } p->p_ucred = my_new_cred; + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(p); proc_unlock(p); } /* @@ -1137,11 +1190,11 @@ audit_session_setaia(proc_t p, auditinfo_addr_t *aia_p, int newprocess) kauth_cred_unref(&my_cred); break; } - audit_session_unref(my_new_cred); - /* - * Propagate the change from the process to the Mach task. - */ + /* Drop the reference taken by audit_session_new() above. 
*/ + audit_unref_session(AU_SENTRY_PTR(as.as_aia_p)); + + /* Propagate the change from the process to the Mach task. */ set_security_token(p); return (0); @@ -1180,6 +1233,7 @@ audit_session_self(proc_t p, __unused struct audit_session_self_args *uap, aia_p = cred->cr_audit.as_aia_p; if (!IS_VALID_SESSION(aia_p)) { + /* Can't join the default session. */ err = EINVAL; goto done; } @@ -1194,91 +1248,190 @@ audit_session_self(proc_t p, __unused struct audit_session_self_args *uap, bcopy(&cred->cr_audit.as_mask, &se->se_mask, sizeof(se->se_mask)); - if ((sendport = audit_session_mksend(aia_p, &se->se_port)) == NULL) { - /* failed to alloc new port */ - err = ENOMEM; - goto done; - } - /* - * This reference on the session is unref'ed in - * audit_session_port_destory(). This reference is needed so the - * session doesn't get dropped until the session join is done. + * Get a send right to the session's Mach port and insert it in the + * process' mach port namespace. */ - audit_ref_session(se); - + sendport = audit_session_mksend(aia_p, &se->se_port); + *ret_port = ipc_port_copyout_send(sendport, get_task_ipcspace(p->task)); done: if (cred != NULL) kauth_cred_unref(&cred); - if (err == 0) - *ret_port = ipc_port_copyout_send(sendport, - get_task_ipcspace(p->task)); - else + if (err != 0) *ret_port = MACH_PORT_NULL; - return (err); } -void -audit_session_portaiadestroy(struct auditinfo_addr *port_aia_p) +/* + * audit_session_port (system call) + * + * Description: Obtain a Mach send right for the given session ID. + * + * Parameters: p Process calling audit_session_port(). + * uap->asid The target audit session ID. The special + * value -1 can be used to target the process's + * own session. + * uap->portnamep User address at which to place port name. + * + * Returns: 0 Success + * EINVAL The calling process' session has not be set. + * EINVAL The given session ID could not be found. + * EINVAL The Mach port right could not be copied out. 
+ * ESRCH Bad process, can't get valid cred for process. + * EPERM Only the superuser can reference sessions other + * than the process's own. + * ENOMEM Port allocation failed due to no free memory. + */ +int +audit_session_port(proc_t p, struct audit_session_port_args *uap, + __unused int *retval) { - au_sentry_t *se; + ipc_port_t sendport = IPC_PORT_NULL; + mach_port_name_t portname = MACH_PORT_NULL; + kauth_cred_t cred = NULL; + auditinfo_addr_t *aia_p = NULL; + au_sentry_t *se = NULL; + int err = 0; + + /* Note: Currently this test will never be true, because + * ASSIGNED_ASID_MAX is effectively (uint32_t)-2. + */ + if (uap->asid != -1 && (uint32_t)uap->asid > ASSIGNED_ASID_MAX) { + err = EINVAL; + goto done; + } + cred = kauth_cred_proc_ref(p); + if (!IS_VALID_CRED(cred)) { + err = ESRCH; + goto done; + } + aia_p = cred->cr_audit.as_aia_p; - KASSERT(port_aia_p != NULL, - ("audit_session_infodestroy: port_aia_p = NULL")); + /* Find the session corresponding to the requested audit + * session ID. If found, take a reference on it so that + * the session is not dropped until the join is later done. + */ + if (uap->asid == (au_asid_t)-1 || + uap->asid == aia_p->ai_asid) { - se = AU_SENTRY_PTR(port_aia_p); + if (!IS_VALID_SESSION(aia_p)) { + /* Can't join the default session. */ + err = EINVAL; + goto done; + } + + /* No privilege is required to obtain a port for our + * own session. + */ + se = AU_SENTRY_PTR(aia_p); + audit_ref_session(se); + } else if (kauth_cred_issuser(cred)) { + /* The superuser may obtain a port for any existing + * session. + */ + AUDIT_SENTRY_RLOCK(); + se = audit_session_find(uap->asid); + AUDIT_SENTRY_RUNLOCK(); + if (NULL == se) { + err = EINVAL; + goto done; + } + aia_p = &se->se_auinfo; + } else { + err = EPERM; + goto done; + } /* - * Drop the reference added in audit_session_self(). + * Processes that join using this mach port will inherit this process' + * pre-selection masks. 
*/ - if (se != NULL) { - se->se_port = IPC_PORT_NULL; - audit_unref_session(se); + if (se->se_port == IPC_PORT_NULL) + bcopy(&cred->cr_audit.as_mask, &se->se_mask, + sizeof(se->se_mask)); + + /* + * Use the session reference to create a mach port reference for the + * session (at which point we are free to drop the session reference) + * and then copy out the mach port to the process' mach port namespace. + */ + sendport = audit_session_mksend(aia_p, &se->se_port); + portname = ipc_port_copyout_send(sendport, get_task_ipcspace(p->task)); + if (!MACH_PORT_VALID(portname)) { + err = EINVAL; + goto done; } + err = copyout(&portname, uap->portnamep, sizeof(mach_port_name_t)); +done: + if (cred != NULL) + kauth_cred_unref(&cred); + if (NULL != se) + audit_unref_session(se); + if (MACH_PORT_VALID(portname) && 0 != err) + (void)mach_port_deallocate(get_task_ipcspace(p->task), + portname); + return (err); } static int audit_session_join_internal(proc_t p, ipc_port_t port, au_asid_t *new_asid) { - auditinfo_addr_t *port_aia_p, *old_aia_p; - kauth_cred_t cred = NULL; + auditinfo_addr_t *new_aia_p, *old_aia_p; + kauth_cred_t my_cred = NULL; au_asid_t old_asid; int err = 0; *new_asid = AU_DEFAUDITSID; - if ((port_aia_p = audit_session_porttoaia(port)) == NULL) { + if ((new_aia_p = audit_session_porttoaia(port)) == NULL) { err = EINVAL; goto done; } - *new_asid = port_aia_p->ai_asid; - cred = kauth_cred_proc_ref(p); - if (!IS_VALID_CRED(cred)) { - kauth_cred_unref(&cred); + proc_lock(p); + kauth_cred_ref(p->p_ucred); + my_cred = p->p_ucred; + if (!IS_VALID_CRED(my_cred)) { + kauth_cred_unref(&my_cred); + proc_unlock(p); err = ESRCH; goto done; } - old_aia_p = cred->cr_audit.as_aia_p; + old_aia_p = my_cred->cr_audit.as_aia_p; old_asid = old_aia_p->ai_asid; + *new_asid = new_aia_p->ai_asid; /* * Add process in if not already in the session. 
*/ if (*new_asid != old_asid) { - audit_session_setaia(p, port_aia_p, 1); - /* - * If this process was in a valid session before then we - * need to decrement the process count of the session it - * came from. - */ - if (IS_VALID_SESSION(old_aia_p)) - audit_dec_procount(AU_SENTRY_PTR(old_aia_p)); + kauth_cred_t my_new_cred; + struct au_session new_as; + + bcopy(&new_aia_p->ai_mask, &new_as.as_mask, + sizeof(new_as.as_mask)); + new_as.as_aia_p = new_aia_p; + + my_new_cred = kauth_cred_setauditinfo(my_cred, &new_as); + p->p_ucred = my_new_cred; + PROC_UPDATE_CREDS_ONPROC(p); + + /* Increment the proc count of new session */ + audit_inc_procount(AU_SENTRY_PTR(new_aia_p)); + + proc_unlock(p); + + /* Propagate the change from the process to the Mach task. */ + set_security_token(p); + + /* Decrement the process count of the former session. */ + audit_dec_procount(AU_SENTRY_PTR(old_aia_p)); + } else { + proc_unlock(p); } - kauth_cred_unref(&cred); + kauth_cred_unref(&my_cred); done: if (port != IPC_PORT_NULL) @@ -1312,8 +1465,10 @@ audit_session_spawnjoin(proc_t p, ipc_port_t port) * Parameters: p Process calling session join. * uap->port A Mach send right. * - * Returns: *ret_asid Audit session ID of new session, which may - * be AU_DEFAUDITSID in the failure case. + * Returns: *ret_asid Audit session ID of new session. + * In the failure case the return value will be -1 + * and 'errno' will be set to a non-zero value + * described below. * * Errno: 0 Success * EINVAL Invalid Mach port name. @@ -1338,6 +1493,540 @@ audit_session_join(proc_t p, struct audit_session_join_args *uap, return (err); } +/* + * Audit session device. + */ + +/* + * Free an audit sdev entry. + */ +static void +audit_sdev_entry_free(struct audit_sdev_entry *ase) +{ + + free(ase->ase_record, M_AUDIT_SDEV_ENTRY); + free(ase, M_AUDIT_SDEV_ENTRY); +} + +/* + * Append individual record to a queue. Allocate queue-local buffer and + * add to the queue. 
If the queue is full or we can't allocate memory, + * drop the newest record. + */ +static void +audit_sdev_append(struct audit_sdev *asdev, void *record, u_int record_len) +{ + struct audit_sdev_entry *ase; + + AUDIT_SDEV_LOCK_ASSERT(asdev); + + if (asdev->asdev_qlen >= asdev->asdev_qlimit) { + asdev->asdev_drops++; + audit_sdev_drops++; + return; + } + + ase = malloc(sizeof (*ase), M_AUDIT_SDEV_ENTRY, M_NOWAIT | M_ZERO); + if (NULL == ase) { + asdev->asdev_drops++; + audit_sdev_drops++; + return; + } + + ase->ase_record = malloc(record_len, M_AUDIT_SDEV_ENTRY, M_NOWAIT); + if (NULL == ase->ase_record) { + free(ase, M_AUDIT_SDEV_ENTRY); + asdev->asdev_drops++; + audit_sdev_drops++; + return; + } + + bcopy(record, ase->ase_record, record_len); + ase->ase_record_len = record_len; + + TAILQ_INSERT_TAIL(&asdev->asdev_queue, ase, ase_queue); + asdev->asdev_inserts++; + asdev->asdev_qlen++; + asdev->asdev_qbyteslen += ase->ase_record_len; + selwakeup(&asdev->asdev_selinfo); + if (asdev->asdev_flags & AUDIT_SDEV_ASYNC) + pgsigio(asdev->asdev_sigio, SIGIO); + + cv_broadcast(&asdev->asdev_cv); +} + +/* + * Submit an audit record to be queued in the audit session device. + */ +void +audit_sdev_submit(__unused au_id_t auid, __unused au_asid_t asid, void *record, + u_int record_len) +{ + struct audit_sdev *asdev; + + /* + * Lockless read to avoid lock overhead if sessio devices are not in + * use. + */ + if (NULL == TAILQ_FIRST(&audit_sdev_list)) + return; + + AUDIT_SDEV_LIST_RLOCK(); + TAILQ_FOREACH(asdev, &audit_sdev_list, asdev_list) { + AUDIT_SDEV_LOCK(asdev); + + /* + * Only append to the sdev queue if the AUID and ASID match that + * of the process that opened this session device or if the + * ALLSESSIONS flag is set. 
+ */ + if ((/* XXXss auid == asdev->asdev_auid && */ + asid == asdev->asdev_asid) || + (asdev->asdev_flags & AUDIT_SDEV_ALLSESSIONS) != 0) + audit_sdev_append(asdev, record, record_len); + AUDIT_SDEV_UNLOCK(asdev); + } + AUDIT_SDEV_LIST_RUNLOCK(); + + /* Unlocked increment. */ + audit_sdev_records++; +} + +/* + * Allocate a new audit sdev. Connects the sdev, on succes, to the global + * list and updates statistics. + */ +static struct audit_sdev * +audit_sdev_alloc(void) +{ + struct audit_sdev *asdev; + + AUDIT_SDEV_LIST_WLOCK_ASSERT(); + + asdev = malloc(sizeof (*asdev), M_AUDIT_SDEV, M_NOWAIT | M_ZERO); + if (NULL == asdev) + return (NULL); + + asdev->asdev_qlimit = AUDIT_SDEV_QLIMIT_DEFAULT; + TAILQ_INIT(&asdev->asdev_queue); + AUDIT_SDEV_LOCK_INIT(asdev); + AUDIT_SDEV_SX_LOCK_INIT(asdev); + cv_init(&asdev->asdev_cv, "audit_sdev_cv"); + + /* + * Add to global list and update global statistics. + */ + TAILQ_INSERT_HEAD(&audit_sdev_list, asdev, asdev_list); + audit_sdev_count++; + audit_sdev_ever++; + + return (asdev); +} + +/* + * Flush all records currently present in an audit sdev. + */ +static void +audit_sdev_flush(struct audit_sdev *asdev) +{ + struct audit_sdev_entry *ase; + + AUDIT_SDEV_LOCK_ASSERT(asdev); + + while ((ase = TAILQ_FIRST(&asdev->asdev_queue)) != NULL) { + TAILQ_REMOVE(&asdev->asdev_queue, ase, ase_queue); + asdev->asdev_qbyteslen -= ase->ase_record_len; + audit_sdev_entry_free(ase); + asdev->asdev_qlen--; + } + asdev->asdev_qoffset = 0; + + KASSERT(0 == asdev->asdev_qlen, ("audit_sdev_flush: asdev_qlen")); + KASSERT(0 == asdev->asdev_qbyteslen, + ("audit_sdev_flush: asdev_qbyteslen")); +} + +/* + * Free an audit sdev. 
+ */ +static void +audit_sdev_free(struct audit_sdev *asdev) +{ + + AUDIT_SDEV_LIST_WLOCK_ASSERT(); + AUDIT_SDEV_LOCK_ASSERT(asdev); + + /* XXXss - preselect hook here */ + audit_sdev_flush(asdev); + cv_destroy(&asdev->asdev_cv); + AUDIT_SDEV_SX_LOCK_DESTROY(asdev); + AUDIT_SDEV_LOCK_DESTROY(asdev); + + TAILQ_REMOVE(&audit_sdev_list, asdev, asdev_list); + free(asdev, M_AUDIT_SDEV); + audit_sdev_count--; +} + +/* + * Get the auditinfo_addr of the proc and check to see if suser. Will return + * non-zero if not suser. + */ +static int +audit_sdev_get_aia(proc_t p, struct auditinfo_addr *aia_p) +{ + int error; + kauth_cred_t scred; + + scred = kauth_cred_proc_ref(p); + error = suser(scred, &p->p_acflag); + + if (NULL != aia_p) + bcopy(scred->cr_audit.as_aia_p, aia_p, sizeof (*aia_p)); + kauth_cred_unref(&scred); + + return (error); +} + +/* + * Audit session dev open method. + */ +static int +audit_sdev_open(dev_t dev, __unused int flags, __unused int devtype, proc_t p) +{ + struct audit_sdev *asdev; + struct auditinfo_addr aia; + int u; + + u = minor(dev); + if (u < 0 || u > MAX_AUDIT_SDEVS) + return (ENXIO); + + (void) audit_sdev_get_aia(p, &aia); + + AUDIT_SDEV_LIST_WLOCK(); + asdev = audit_sdev_dtab[u]; + if (NULL == asdev) { + asdev = audit_sdev_alloc(); + if (NULL == asdev) { + AUDIT_SDEV_LIST_WUNLOCK(); + return (ENOMEM); + } + audit_sdev_dtab[u] = asdev; + } else { + KASSERT(asdev->asdev_open, ("audit_sdev_open: Already open")); + AUDIT_SDEV_LIST_WUNLOCK(); + return (EBUSY); + } + asdev->asdev_open = 1; + asdev->asdev_auid = aia.ai_auid; + asdev->asdev_asid = aia.ai_asid; + asdev->asdev_flags = 0; + + AUDIT_SDEV_LIST_WUNLOCK(); + + return (0); +} + +/* + * Audit session dev close method. 
+ */ +static int +audit_sdev_close(dev_t dev, __unused int flags, __unused int devtype, + __unused proc_t p) +{ + struct audit_sdev *asdev; + int u; + + u = minor(dev); + asdev = audit_sdev_dtab[u]; + + KASSERT(asdev != NULL, ("audit_sdev_close: asdev == NULL")); + KASSERT(asdev->asdev_open, ("audit_sdev_close: !asdev_open")); + + AUDIT_SDEV_LIST_WLOCK(); + AUDIT_SDEV_LOCK(asdev); + asdev->asdev_open = 0; + audit_sdev_free(asdev); /* sdev lock unlocked in audit_sdev_free() */ + audit_sdev_dtab[u] = NULL; + AUDIT_SDEV_LIST_WUNLOCK(); + + return (0); +} + +/* + * Audit session dev ioctl method. + */ +static int +audit_sdev_ioctl(dev_t dev, u_long cmd, caddr_t data, + __unused int flag, proc_t p) +{ + struct audit_sdev *asdev; + int error; + + asdev = audit_sdev_dtab[minor(dev)]; + KASSERT(asdev != NULL, ("audit_sdev_ioctl: asdev == NULL")); + + error = 0; + + switch (cmd) { + case FIONBIO: + AUDIT_SDEV_LOCK(asdev); + if (*(int *)data) + asdev->asdev_flags |= AUDIT_SDEV_NBIO; + else + asdev->asdev_flags &= ~AUDIT_SDEV_NBIO; + AUDIT_SDEV_UNLOCK(asdev); + break; + + case FIONREAD: + AUDIT_SDEV_LOCK(asdev); + *(int *)data = asdev->asdev_qbyteslen - asdev->asdev_qoffset; + AUDIT_SDEV_UNLOCK(asdev); + break; + + case AUDITSDEV_GET_QLEN: + *(u_int *)data = asdev->asdev_qlen; + break; + + case AUDITSDEV_GET_QLIMIT: + *(u_int *)data = asdev->asdev_qlimit; + break; + + case AUDITSDEV_SET_QLIMIT: + if (*(u_int *)data >= AUDIT_SDEV_QLIMIT_MIN || + *(u_int *)data <= AUDIT_SDEV_QLIMIT_MAX) { + asdev->asdev_qlimit = *(u_int *)data; + } else + error = EINVAL; + break; + + case AUDITSDEV_GET_QLIMIT_MIN: + *(u_int *)data = AUDIT_SDEV_QLIMIT_MIN; + break; + + case AUDITSDEV_GET_QLIMIT_MAX: + *(u_int *)data = AUDIT_SDEV_QLIMIT_MAX; + break; + + case AUDITSDEV_FLUSH: + if (AUDIT_SDEV_SX_XLOCK_SIG(asdev) != 0) + return (EINTR); + AUDIT_SDEV_LOCK(asdev); + audit_sdev_flush(asdev); + AUDIT_SDEV_UNLOCK(asdev); + AUDIT_SDEV_SX_XUNLOCK(asdev); + break; + + case AUDITSDEV_GET_MAXDATA: + *(u_int 
*)data = MAXAUDITDATA; + break; + + /* XXXss these should be 64 bit, maybe. */ + case AUDITSDEV_GET_INSERTS: + *(u_int *)data = asdev->asdev_inserts; + break; + + case AUDITSDEV_GET_READS: + *(u_int *)data = asdev->asdev_reads; + break; + + case AUDITSDEV_GET_DROPS: + *(u_int *)data = asdev->asdev_drops; + break; + + case AUDITSDEV_GET_ALLSESSIONS: + error = audit_sdev_get_aia(p, NULL); + if (error) + break; + *(u_int *)data = (asdev->asdev_flags & AUDIT_SDEV_ALLSESSIONS) ? + 1 : 0; + break; + + case AUDITSDEV_SET_ALLSESSIONS: + error = audit_sdev_get_aia(p, NULL); + if (error) + break; + + AUDIT_SDEV_LOCK(asdev); + if (*(int *)data) + asdev->asdev_flags |= AUDIT_SDEV_ALLSESSIONS; + else + asdev->asdev_flags &= ~AUDIT_SDEV_ALLSESSIONS; + AUDIT_SDEV_UNLOCK(asdev); + break; + + default: + error = ENOTTY; + } + + return (error); +} + +/* + * Audit session dev read method. + */ +static int +audit_sdev_read(dev_t dev, struct uio *uio, __unused int flag) +{ + struct audit_sdev_entry *ase; + struct audit_sdev *asdev; + u_int toread; + int error; + + asdev = audit_sdev_dtab[minor(dev)]; + KASSERT(NULL != asdev, ("audit_sdev_read: asdev == NULL")); + + /* + * We hold a sleep lock over read and flush because we rely on the + * stability of a record in the queue during uiomove. + */ + if (0 != AUDIT_SDEV_SX_XLOCK_SIG(asdev)) + return (EINTR); + AUDIT_SDEV_LOCK(asdev); + while (TAILQ_EMPTY(&asdev->asdev_queue)) { + if (asdev->asdev_flags & AUDIT_SDEV_NBIO) { + AUDIT_SDEV_UNLOCK(asdev); + AUDIT_SDEV_SX_XUNLOCK(asdev); + return (EAGAIN); + } + error = cv_wait_sig(&asdev->asdev_cv, AUDIT_SDEV_MTX(asdev)); + if (error) { + AUDIT_SDEV_UNLOCK(asdev); + AUDIT_SDEV_SX_XUNLOCK(asdev); + return (error); + } + } + + /* + * Copy as many remaining bytes from the current record to userspace + * as we can. Keep processing records until we run out of records in + * the queue or until the user buffer runs out of space. + * + * We rely on the sleep lock to maintain ase's stability here. 
+ */ + asdev->asdev_reads++; + while ((ase = TAILQ_FIRST(&asdev->asdev_queue)) != NULL && + uio_resid(uio) > 0) { + AUDIT_SDEV_LOCK_ASSERT(asdev); + + KASSERT(ase->ase_record_len > asdev->asdev_qoffset, + ("audit_sdev_read: record_len > qoffset (1)")); + toread = MIN(ase->ase_record_len - asdev->asdev_qoffset, + uio_resid(uio)); + AUDIT_SDEV_UNLOCK(asdev); + error = uiomove((char *) ase->ase_record + asdev->asdev_qoffset, + toread, uio); + if (error) { + AUDIT_SDEV_SX_XUNLOCK(asdev); + return (error); + } + + /* + * If the copy succeeded then update book-keeping, and if no + * bytes remain in the current record then free it. + */ + AUDIT_SDEV_LOCK(asdev); + KASSERT(TAILQ_FIRST(&asdev->asdev_queue) == ase, + ("audit_sdev_read: queue out of sync after uiomove")); + asdev->asdev_qoffset += toread; + KASSERT(ase->ase_record_len >= asdev->asdev_qoffset, + ("audit_sdev_read: record_len >= qoffset (2)")); + if (asdev->asdev_qoffset == ase->ase_record_len) { + TAILQ_REMOVE(&asdev->asdev_queue, ase, ase_queue); + asdev->asdev_qbyteslen -= ase->ase_record_len; + audit_sdev_entry_free(ase); + asdev->asdev_qlen--; + asdev->asdev_qoffset = 0; + } + } + AUDIT_SDEV_UNLOCK(asdev); + AUDIT_SDEV_SX_XUNLOCK(asdev); + return (0); +} + +/* + * Audit session device poll method. + */ +static int +audit_sdev_poll(dev_t dev, int events, void *wql, struct proc *p) +{ + struct audit_sdev *asdev; + int revents; + + revents = 0; + asdev = audit_sdev_dtab[minor(dev)]; + KASSERT(NULL != asdev, ("audit_sdev_poll: asdev == NULL")); + + if (events & (POLLIN | POLLRDNORM)) { + AUDIT_SDEV_LOCK(asdev); + if (NULL != TAILQ_FIRST(&asdev->asdev_queue)) + revents |= events & (POLLIN | POLLRDNORM); + else + selrecord(p, &asdev->asdev_selinfo, wql); + AUDIT_SDEV_UNLOCK(asdev); + } + return (revents); +} + +/* + * Audit sdev clone routine. Provides a new minor number or returns -1. + * This called with DEVFS_LOCK held. 
+ */ +static int +audit_sdev_clone(__unused dev_t dev, int action) +{ + int i; + + if (DEVFS_CLONE_ALLOC == action) { + for(i = 0; i < MAX_AUDIT_SDEVS; i++) + if (NULL == audit_sdev_dtab[i]) + return (i); + + /* + * This really should return -1 here but that seems to + * hang things in devfs. We instead return 0 and let + * audit_sdev_open tell userland the bad news. + */ + return (0); + } + + return (-1); +} + +static int +audit_sdev_init(void) +{ + dev_t dev; + + TAILQ_INIT(&audit_sdev_list); + AUDIT_SDEV_LIST_LOCK_INIT(); + + audit_sdev_major = cdevsw_add(-1, &audit_sdev_cdevsw); + if (audit_sdev_major < 0) + return (KERN_FAILURE); + + dev = makedev(audit_sdev_major, 0); + devnode = devfs_make_node_clone(dev, DEVFS_CHAR, UID_ROOT, GID_WHEEL, + 0644, audit_sdev_clone, AUDIT_SDEV_NAME, 0); + + if (NULL == devnode) + return (KERN_FAILURE); + + return (KERN_SUCCESS); +} + +/* XXXss +static int +audit_sdev_shutdown(void) +{ + + devfs_remove(devnode); + (void) cdevsw_remove(audit_sdev_major, &audit_sdev_cdevsw); + + return (KERN_SUCCESS); +} +*/ + #else int @@ -1358,4 +2047,12 @@ audit_session_join(proc_t p, struct audit_session_join_args *uap, return (ENOSYS); } +int +audit_session_port(proc_t p, struct audit_session_port_args *uap, int *retval) +{ +#pragma unused(p, uap, retval) + + return (ENOSYS); +} + #endif /* CONFIG_AUDIT */ diff --git a/bsd/security/audit/audit_syscalls.c b/bsd/security/audit/audit_syscalls.c index 0ad24367a..43d93bdda 100644 --- a/bsd/security/audit/audit_syscalls.c +++ b/bsd/security/audit/audit_syscalls.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 1999-2009, Apple Inc. + * Copyright (c) 1999-2010, Apple Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -246,7 +246,7 @@ int auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval) { kauth_cred_t scred; - int error; + int error = 0; union auditon_udata udata; proc_t tp = PROC_NULL; struct auditinfo_addr aia; @@ -288,6 +288,8 @@ auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval) case A_GETPINFO_ADDR: case A_SENDTRIGGER: case A_GETSINFO_ADDR: + case A_GETSFLAGS: + case A_SETSFLAGS: error = copyin(uap->data, (void *)&udata, uap->length); if (error) return (error); @@ -296,33 +298,45 @@ auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval) break; } + /* Check appropriate privilege. */ + switch (uap->cmd) { /* * A_GETSINFO doesn't require priviledge but only superuser * gets to see the audit masks. */ - error = suser(kauth_cred_get(), &p->p_acflag); - if (A_GETSINFO_ADDR == uap->cmd) { + case A_GETSINFO_ADDR: if ((sizeof(udata.au_kau_info) != uap->length) || (audit_session_lookup(udata.au_kau_info.ai_asid, &udata.au_kau_info) != 0)) - return (EINVAL); - if (error) { + error = EINVAL; + else if (!kauth_cred_issuser(kauth_cred_get())) { udata.au_kau_info.ai_mask.am_success = ~0; udata.au_kau_info.ai_mask.am_failure = ~0; } - } else - if (error) - return (error); + break; + case A_GETSFLAGS: + case A_SETSFLAGS: + /* Getting one's own audit session flags requires no + * privilege. Setting the flags is subject to access + * control implemented in audit_session_setaia(). + */ + break; + default: + error = suser(kauth_cred_get(), &p->p_acflag); + break; + } + if (error) + return (error); /* * XXX Need to implement these commands by accessing the global * values associated with the commands. 
*/ - mtx_lock(&audit_mtx); switch (uap->cmd) { case A_OLDGETPOLICY: case A_GETPOLICY: if (sizeof(udata.au_policy64) == uap->length) { + mtx_lock(&audit_mtx); if (!audit_fail_stop) udata.au_policy64 |= AUDIT_CNT; if (audit_panic_on_write_fail) @@ -331,12 +345,12 @@ auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval) udata.au_policy64 |= AUDIT_ARGV; if (audit_arge) udata.au_policy64 |= AUDIT_ARGE; + mtx_unlock(&audit_mtx); break; } - if (sizeof(udata.au_policy) != uap->length) { - mtx_unlock(&audit_mtx); + if (sizeof(udata.au_policy) != uap->length) return (EINVAL); - } + mtx_lock(&audit_mtx); if (!audit_fail_stop) udata.au_policy |= AUDIT_CNT; if (audit_panic_on_write_fail) @@ -345,60 +359,61 @@ auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval) udata.au_policy |= AUDIT_ARGV; if (audit_arge) udata.au_policy |= AUDIT_ARGE; + mtx_unlock(&audit_mtx); break; case A_OLDSETPOLICY: case A_SETPOLICY: if (sizeof(udata.au_policy64) == uap->length) { if (udata.au_policy64 & ~(AUDIT_CNT|AUDIT_AHLT| - AUDIT_ARGV|AUDIT_ARGE)) { - mtx_unlock(&audit_mtx); + AUDIT_ARGV|AUDIT_ARGE)) return (EINVAL); - } + mtx_lock(&audit_mtx); audit_fail_stop = ((udata.au_policy64 & AUDIT_CNT) == 0); audit_panic_on_write_fail = (udata.au_policy64 & AUDIT_AHLT); audit_argv = (udata.au_policy64 & AUDIT_ARGV); audit_arge = (udata.au_policy64 & AUDIT_ARGE); - + mtx_unlock(&audit_mtx); break; } if ((sizeof(udata.au_policy) != uap->length) || (udata.au_policy & ~(AUDIT_CNT|AUDIT_AHLT|AUDIT_ARGV| - AUDIT_ARGE))) { - mtx_unlock(&audit_mtx); + AUDIT_ARGE))) return (EINVAL); - } /* * XXX - Need to wake up waiters if the policy relaxes? 
*/ + mtx_lock(&audit_mtx); audit_fail_stop = ((udata.au_policy & AUDIT_CNT) == 0); audit_panic_on_write_fail = (udata.au_policy & AUDIT_AHLT); audit_argv = (udata.au_policy & AUDIT_ARGV); audit_arge = (udata.au_policy & AUDIT_ARGE); + mtx_unlock(&audit_mtx); break; case A_GETKMASK: - if (sizeof(udata.au_mask) != uap->length) { - mtx_unlock(&audit_mtx); + if (sizeof(udata.au_mask) != uap->length) return (EINVAL); - } + mtx_lock(&audit_mtx); udata.au_mask = audit_nae_mask; + mtx_unlock(&audit_mtx); break; case A_SETKMASK: - if (sizeof(udata.au_mask) != uap->length) { - mtx_unlock(&audit_mtx); + if (sizeof(udata.au_mask) != uap->length) return (EINVAL); - } + mtx_lock(&audit_mtx); audit_nae_mask = udata.au_mask; AUDIT_CHECK_IF_KEVENTS_MASK(audit_nae_mask); + mtx_unlock(&audit_mtx); break; case A_OLDGETQCTRL: case A_GETQCTRL: if (sizeof(udata.au_qctrl64) == uap->length) { + mtx_lock(&audit_mtx); udata.au_qctrl64.aq64_hiwater = (u_int64_t)audit_qctrl.aq_hiwater; udata.au_qctrl64.aq64_lowater = @@ -409,13 +424,14 @@ auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval) (u_int64_t)audit_qctrl.aq_delay; udata.au_qctrl64.aq64_minfree = (int64_t)audit_qctrl.aq_minfree; + mtx_unlock(&audit_mtx); break; } - if (sizeof(udata.au_qctrl) != uap->length) { - mtx_unlock(&audit_mtx); + if (sizeof(udata.au_qctrl) != uap->length) return (EINVAL); - } + mtx_lock(&audit_mtx); udata.au_qctrl = audit_qctrl; + mtx_unlock(&audit_mtx); break; case A_OLDSETQCTRL: @@ -426,10 +442,9 @@ auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval) udata.au_qctrl64.aq64_hiwater) || (udata.au_qctrl64.aq64_bufsz > AQ_MAXBUFSZ) || (udata.au_qctrl64.aq64_minfree < 0) || - (udata.au_qctrl64.aq64_minfree > 100)) { - mtx_unlock(&audit_mtx); + (udata.au_qctrl64.aq64_minfree > 100)) return (EINVAL); - } + mtx_lock(&audit_mtx); audit_qctrl.aq_hiwater = (int)udata.au_qctrl64.aq64_hiwater; audit_qctrl.aq_lowater = @@ -439,77 +454,67 @@ auditon(proc_t p, struct auditon_args *uap, 
__unused int32_t *retval) audit_qctrl.aq_minfree = (int)udata.au_qctrl64.aq64_minfree; audit_qctrl.aq_delay = -1; /* Not used. */ - - break; + mtx_unlock(&audit_mtx); + break; } if ((sizeof(udata.au_qctrl) != uap->length) || (udata.au_qctrl.aq_hiwater > AQ_MAXHIGH) || (udata.au_qctrl.aq_lowater >= udata.au_qctrl.aq_hiwater) || (udata.au_qctrl.aq_bufsz > AQ_MAXBUFSZ) || (udata.au_qctrl.aq_minfree < 0) || - (udata.au_qctrl.aq_minfree > 100)) { - mtx_unlock(&audit_mtx); + (udata.au_qctrl.aq_minfree > 100)) return (EINVAL); - } + mtx_lock(&audit_mtx); audit_qctrl = udata.au_qctrl; /* XXX The queue delay value isn't used with the kernel. */ audit_qctrl.aq_delay = -1; + mtx_unlock(&audit_mtx); break; case A_GETCWD: - mtx_unlock(&audit_mtx); return (ENOSYS); - break; case A_GETCAR: - mtx_unlock(&audit_mtx); return (ENOSYS); - break; case A_GETSTAT: - mtx_unlock(&audit_mtx); return (ENOSYS); - break; case A_SETSTAT: - mtx_unlock(&audit_mtx); return (ENOSYS); - break; case A_SETUMASK: - mtx_unlock(&audit_mtx); return (ENOSYS); - break; case A_SETSMASK: - mtx_unlock(&audit_mtx); return (ENOSYS); - break; case A_OLDGETCOND: case A_GETCOND: if (sizeof(udata.au_cond64) == uap->length) { + mtx_lock(&audit_mtx); if (audit_enabled && !audit_suspended) udata.au_cond64 = AUC_AUDITING; else udata.au_cond64 = AUC_NOAUDIT; - + mtx_unlock(&audit_mtx); break; } - if (sizeof(udata.au_cond) != uap->length) { - mtx_unlock(&audit_mtx); + if (sizeof(udata.au_cond) != uap->length) return (EINVAL); - } + mtx_lock(&audit_mtx); if (audit_enabled && !audit_suspended) udata.au_cond = AUC_AUDITING; else udata.au_cond = AUC_NOAUDIT; + mtx_unlock(&audit_mtx); break; case A_OLDSETCOND: case A_SETCOND: if (sizeof(udata.au_cond64) == uap->length) { + mtx_lock(&audit_mtx); if (udata.au_cond64 == AUC_NOAUDIT) audit_suspended = 1; if (udata.au_cond64 == AUC_AUDITING) @@ -518,14 +523,15 @@ auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval) audit_suspended = 1; mtx_unlock(&audit_mtx); 
audit_shutdown(); - mtx_lock(&audit_mtx); + break; } + mtx_unlock(&audit_mtx); break; } if (sizeof(udata.au_cond) != uap->length) { - mtx_unlock(&audit_mtx); return (EINVAL); } + mtx_lock(&audit_mtx); if (udata.au_cond == AUC_NOAUDIT) audit_suspended = 1; if (udata.au_cond == AUC_AUDITING) @@ -534,40 +540,32 @@ auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval) audit_suspended = 1; mtx_unlock(&audit_mtx); audit_shutdown(); - mtx_lock(&audit_mtx); + break; } + mtx_unlock(&audit_mtx); break; case A_GETCLASS: - if (sizeof(udata.au_evclass) != uap->length) { - mtx_unlock(&audit_mtx); + if (sizeof(udata.au_evclass) != uap->length) return (EINVAL); - } udata.au_evclass.ec_class = au_event_class( udata.au_evclass.ec_number); break; case A_SETCLASS: - if (sizeof(udata.au_evclass) != uap->length) { - mtx_unlock(&audit_mtx); + if (sizeof(udata.au_evclass) != uap->length) return (EINVAL); - } au_evclassmap_insert(udata.au_evclass.ec_number, udata.au_evclass.ec_class); break; case A_GETPINFO: if ((sizeof(udata.au_aupinfo) != uap->length) || - IS_NOT_VALID_PID(udata.au_aupinfo.ap_pid)) { - mtx_unlock(&audit_mtx); + IS_NOT_VALID_PID(udata.au_aupinfo.ap_pid)) return (EINVAL); - } - if ((tp = proc_find(udata.au_aupinfo.ap_pid)) == NULL) { - mtx_unlock(&audit_mtx); + if ((tp = proc_find(udata.au_aupinfo.ap_pid)) == NULL) return (ESRCH); - } - mtx_unlock(&audit_mtx); scred = kauth_cred_proc_ref(tp); if (scred->cr_audit.as_aia_p->ai_termid.at_type == AU_IPv6) { kauth_cred_unref(&scred); @@ -590,19 +588,14 @@ auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval) kauth_cred_unref(&scred); proc_rele(tp); tp = PROC_NULL; - mtx_lock(&audit_mtx); break; case A_SETPMASK: if ((sizeof(udata.au_aupinfo) != uap->length) || - IS_NOT_VALID_PID(udata.au_aupinfo.ap_pid)) { - mtx_unlock(&audit_mtx); + IS_NOT_VALID_PID(udata.au_aupinfo.ap_pid)) return (EINVAL); - } - if ((tp = proc_find(udata.au_aupinfo.ap_pid)) == NULL) { - mtx_unlock(&audit_mtx); + if ((tp = 
proc_find(udata.au_aupinfo.ap_pid)) == NULL) return (ESRCH); - } scred = kauth_cred_proc_ref(tp); bcopy(scred->cr_audit.as_aia_p, &aia, sizeof(aia)); kauth_cred_unref(&scred); @@ -611,44 +604,38 @@ auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval) aia.ai_mask.am_failure = udata.au_aupinfo.ap_mask.am_failure; AUDIT_CHECK_IF_KEVENTS_MASK(aia.ai_mask); - error = audit_session_setaia(tp, &aia, 0); - mtx_unlock(&audit_mtx); + error = audit_session_setaia(tp, &aia); proc_rele(tp); tp = PROC_NULL; if (error) return (error); - mtx_lock(&audit_mtx); break; case A_SETFSIZE: if ((sizeof(udata.au_fstat) != uap->length) || ((udata.au_fstat.af_filesz != 0) && - (udata.au_fstat.af_filesz < MIN_AUDIT_FILE_SIZE))) { - mtx_unlock(&audit_mtx); + (udata.au_fstat.af_filesz < MIN_AUDIT_FILE_SIZE))) return (EINVAL); - } + mtx_lock(&audit_mtx); audit_fstat.af_filesz = udata.au_fstat.af_filesz; + mtx_unlock(&audit_mtx); break; case A_GETFSIZE: - if (sizeof(udata.au_fstat) != uap->length) { - mtx_unlock(&audit_mtx); + if (sizeof(udata.au_fstat) != uap->length) return (EINVAL); - } + mtx_lock(&audit_mtx); udata.au_fstat.af_filesz = audit_fstat.af_filesz; udata.au_fstat.af_currsz = audit_fstat.af_currsz; + mtx_unlock(&audit_mtx); break; case A_GETPINFO_ADDR: if ((sizeof(udata.au_aupinfo_addr) != uap->length) || - IS_NOT_VALID_PID(udata.au_aupinfo_addr.ap_pid)) { - mtx_unlock(&audit_mtx); + IS_NOT_VALID_PID(udata.au_aupinfo_addr.ap_pid)) return (EINVAL); - } - if ((tp = proc_find(udata.au_aupinfo.ap_pid)) == NULL) { - mtx_unlock(&audit_mtx); + if ((tp = proc_find(udata.au_aupinfo.ap_pid)) == NULL) return (ESRCH); - } WARN_IF_AINFO_ADDR_CHANGED(uap->length, sizeof(auditpinfo_addr_t), "auditon(A_GETPINFO_ADDR,...)", "auditpinfo_addr_t"); @@ -672,41 +659,48 @@ auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval) break; case A_GETKAUDIT: - mtx_unlock(&audit_mtx); if (sizeof(udata.au_kau_info) != uap->length) return (EINVAL); audit_get_kinfo(&udata.au_kau_info); 
- mtx_lock(&audit_mtx); break; case A_SETKAUDIT: if ((sizeof(udata.au_kau_info) != uap->length) || (udata.au_kau_info.ai_termid.at_type != AU_IPv4 && - udata.au_kau_info.ai_termid.at_type != AU_IPv6)) { - mtx_unlock(&audit_mtx); + udata.au_kau_info.ai_termid.at_type != AU_IPv6)) return (EINVAL); - } - mtx_unlock(&audit_mtx); audit_set_kinfo(&udata.au_kau_info); - mtx_lock(&audit_mtx); break; case A_SENDTRIGGER: if ((sizeof(udata.au_trigger) != uap->length) || (udata.au_trigger < AUDIT_TRIGGER_MIN) || - (udata.au_trigger > AUDIT_TRIGGER_MAX)) { - mtx_unlock(&audit_mtx); + (udata.au_trigger > AUDIT_TRIGGER_MAX)) return (EINVAL); - } - mtx_unlock(&audit_mtx); return (audit_send_trigger(udata.au_trigger)); case A_GETSINFO_ADDR: /* Handled above before switch(). */ break; + case A_GETSFLAGS: + if (sizeof(udata.au_flags) != uap->length) + return (EINVAL); + bcopy(&(kauth_cred_get()->cr_audit.as_aia_p->ai_flags), + &udata.au_flags, sizeof(udata.au_flags)); + break; + + case A_SETSFLAGS: + if (sizeof(udata.au_flags) != uap->length) + return (EINVAL); + bcopy(kauth_cred_get()->cr_audit.as_aia_p, &aia, sizeof(aia)); + aia.ai_flags = udata.au_flags; + error = audit_session_setaia(p, &aia); + if (error) + return (error); + break; + default: - mtx_unlock(&audit_mtx); return (EINVAL); } @@ -730,15 +724,13 @@ auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval) case A_GETPINFO_ADDR: case A_GETKAUDIT: case A_GETSINFO_ADDR: + case A_GETSFLAGS: error = copyout((void *)&udata, uap->data, uap->length); - if (error) { - mtx_unlock(&audit_mtx); + if (error) return (ENOSYS); - } break; } - mtx_unlock(&audit_mtx); return (0); } @@ -803,7 +795,7 @@ setauid(proc_t p, struct setauid_args *uap, __unused int32_t *retval) bcopy(&scred->cr_audit.as_mask, &aia.ai_mask, sizeof(au_mask_t)); kauth_cred_unref(&scred); aia.ai_auid = id; - error = audit_session_setaia(p, &aia, 0); + error = audit_session_setaia(p, &aia); return (error); } @@ -917,7 +909,7 @@ setaudit(proc_t p, struct 
setaudit_args *uap, __unused int32_t *retval) newaia.ai_termid.at_port = ai.ai_termid.port; newaia.ai_termid.at_type = AU_IPv4; - error = audit_session_setaia(p, &newaia, 0); + error = audit_session_setaia(p, &newaia); if (error) return (error); @@ -1007,7 +999,7 @@ setaudit_addr(proc_t p, struct setaudit_addr_args *uap, if (aia.ai_asid == AU_DEFAUDITSID) aia.ai_asid = AU_ASSIGN_ASID; - error = audit_session_setaia(p, &aia, 0); + error = audit_session_setaia(p, &aia); if (error) return (error); @@ -1053,7 +1045,7 @@ auditctl(proc_t p, struct auditctl_args *uap, __unused int32_t *retval) if (uap->path == USER_ADDR_NULL) return (EINVAL); - NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | LOCKLEAF | AUDITVNPATH1, (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32), uap->path, vfs_context_current()); error = vn_open(&nd, AUDIT_OPEN_FLAGS, 0); diff --git a/bsd/security/audit/audit_worker.c b/bsd/security/audit/audit_worker.c index d307a7eb9..d9ef366a2 100644 --- a/bsd/security/audit/audit_worker.c +++ b/bsd/security/audit/audit_worker.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 1999-2008 Apple Inc. + * Copyright (c) 1999-2010 Apple Inc. * Copyright (c) 2006-2008 Robert N. M. Watson * All rights reserved. 
* @@ -203,16 +203,11 @@ audit_record_write(struct vnode *vp, struct vfs_context *ctx, void *data, */ if (audit_qctrl.aq_minfree != 0) { temp = mnt_stat->f_blocks / (100 / audit_qctrl.aq_minfree); - if (mnt_stat->f_bfree < temp) { - if (ppsratecheck(&last_lowspace_trigger, - &cur_lowspace_trigger, 1)) { + if (mnt_stat->f_bfree < temp && + ppsratecheck(&last_lowspace_trigger, + &cur_lowspace_trigger, 1)) (void)audit_send_trigger( AUDIT_TRIGGER_LOW_SPACE); - printf("Warning: audit space low (< %d%% free)" - "on audit log file-system\n", - audit_qctrl.aq_minfree); - } - } } /* @@ -358,7 +353,8 @@ audit_worker_process_record(struct kaudit_record *ar) if (!(ar->k_ar_commit & AR_COMMIT_KERNEL) || ((ar->k_ar_commit & AR_PRESELECT_PIPE) == 0 && - (ar->k_ar_commit & AR_PRESELECT_TRAIL) == 0)) + (ar->k_ar_commit & AR_PRESELECT_TRAIL) == 0 && + (ar->k_ar_commit & AR_PRESELECT_FILTER) == 0)) goto out; auid = ar->k_ar.ar_subj_auid; @@ -395,6 +391,16 @@ audit_worker_process_record(struct kaudit_record *ar) ar->k_ar_commit & AR_PRESELECT_TRAIL, bsm->data, bsm->len); + if (ar->k_ar_commit & AR_PRESELECT_FILTER) { + + /* + * XXXss - This needs to be generalized so new filters can + * be easily plugged in. + */ + audit_sdev_submit(auid, ar->k_ar.ar_subj_asid, bsm->data, + bsm->len); + } + kau_free(bsm); out: if (trail_locked) @@ -417,7 +423,9 @@ audit_worker(void) struct kaudit_record *ar; int lowater_signal; - audit_ctx.vc_thread = current_thread(); + if (audit_ctx.vc_thread == NULL) + audit_ctx.vc_thread = current_thread(); + TAILQ_INIT(&ar_worklist); mtx_lock(&audit_mtx); while (1) { @@ -427,7 +435,8 @@ audit_worker(void) * Wait for a record. 
*/ while (TAILQ_EMPTY(&audit_q)) - cv_wait(&audit_worker_cv, &audit_mtx); + cv_wait_continuation(&audit_worker_cv, &audit_mtx, + (thread_continue_t)audit_worker); /* * If there are records in the global audit record queue, diff --git a/bsd/sys/Makefile b/bsd/sys/Makefile index f74bfe8d0..53f457741 100644 --- a/bsd/sys/Makefile +++ b/bsd/sys/Makefile @@ -7,25 +7,9 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir include $(MakeInc_cmd) include $(MakeInc_def) -ALLPRODUCTS = AppleTV iPhone MacOSX -PRODUCT = $(shell tconf --product) -EXTRAUNIFDEF = $(foreach x,$(ALLPRODUCTS),$(if $(findstring $(PRODUCT),$(x)),-DPRODUCT_$(x),-UPRODUCT_$(x))) -SINCFRAME_UNIFDEF += $(EXTRAUNIFDEF) -SPINCFRAME_UNIFDEF += $(EXTRAUNIFDEF) -KINCFRAME_UNIFDEF += $(EXTRAUNIFDEF) -KPINCFRAME_UNIFDEF += $(EXTRAUNIFDEF) +INSTINC_SUBDIRS = -INSTINC_SUBDIRS = \ - -INSTINC_SUBDIRS_PPC = \ - -INSTINC_SUBDIRS_I386 = \ - -EXPINC_SUBDIRS = \ - -EXPINC_SUBDIRS_PPC = \ - -EXPINC_SUBDIRS_I386 = \ +EXPINC_SUBDIRS = # Installs header file for user level - # $(DSTROOT)/System/Library/Frameworks/System.framework/Headers @@ -55,17 +39,22 @@ DATAFILES = \ # $(DSTROOT)/System/Library/Frameworks/System.framework/PrivateHeaders PRIVATE_DATAFILES = \ codesign.h \ + content_protection.h \ disklabel.h \ + fileport.h \ fsctl.h \ fsgetpath.h \ fslog.h \ + imgsrc.h \ ipcs.h \ shm_internal.h \ spawn_internal.h \ tree.h \ ux_exception.h \ proc_info.h \ - vnioctl.h + process_policy.h \ + vnioctl.h \ + priv.h # Installs header file for kernel extensions - # $(DSTROOT)/System/Library/Frameworks/Kernel.framework/Headers @@ -97,10 +86,13 @@ KERNELFILES = \ # Installs header file for Apple internal use for kernel extensions - # $(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders PRIVATE_KERNELFILES = \ + codesign.h \ cprotect.h \ + content_protection.h \ decmpfs.h \ disktab.h \ fbt.h \ + fileport.h \ fsctl.h \ fslog.h \ mach_swapon.h \ @@ -115,19 +107,20 @@ PRIVATE_KERNELFILES = \ user.h \ vfs_context.h \ 
vmmeter.h \ - spawn_internal.h + spawn_internal.h \ + priv.h # /System/Library/Frameworks/System.framework/Headers and /usr/include INSTALL_MI_LIST = ${DATAFILES} -INSTALL_MI_GEN_LIST = syscall.h +INSTALL_MI_GEN_LIST = syscall.h _posix_availability.h _symbol_aliasing.h INSTALL_MI_DIR = sys -EXPORT_MI_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES} linker_set.h bsdtask_info.h filedesc.h pipe.h resourcevar.h semaphore.h \ +EXPORT_MI_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES} linker_set.h bsdtask_info.h pthread_internal.h filedesc.h pipe.h resourcevar.h semaphore.h \ vnode_internal.h proc_internal.h file_internal.h mount_internal.h \ - uio_internal.h + uio_internal.h tree.h EXPORT_MI_GEN_LIST = syscall.h sysproto.h @@ -156,6 +149,16 @@ sysproto.h: $(SRCROOT)/bsd/kern/syscalls.master $(MAKESYSCALLS) @echo "Generating bsd/sys/$@ from $<"; $(_v)$(MAKESYSCALLS) $< proto > /dev/null +MAKE_POSIX_AVAILABILITY = $(SRCROOT)/bsd/sys/make_posix_availability.sh +_posix_availability.h: $(MAKE_POSIX_AVAILABILITY) + @echo "Generating bsd/sys/$@" + $(_v)$(MAKE_POSIX_AVAILABILITY) $@ + +MAKE_SYMBOL_ALIASING = $(SRCROOT)/bsd/sys/make_symbol_aliasing.sh +_symbol_aliasing.h: $(MAKE_SYMBOL_ALIASING) + @echo "Generating bsd/sys/$@" + $(_v)$(MAKE_SYMBOL_ALIASING) $@ + include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/bsd/sys/attr.h b/bsd/sys/attr.h index 5f85717d7..42a8b7673 100644 --- a/bsd/sys/attr.h +++ b/bsd/sys/attr.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Computer, Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -49,6 +49,12 @@ /* The following option only valid when requesting ATTR_CMN_RETURNED_ATTRS */ #define FSOPT_PACK_INVAL_ATTRS 0x00000008 +#ifdef PRIVATE +#define FSOPT_EXCHANGE_DATA_ONLY 0x0000010 +#endif + + + /* we currently aren't anywhere near this amount for a valid * fssearchblock.sizeofsearchparams1 or fssearchblock.sizeofsearchparams2 * but we put a sanity check in to avoid abuse of the value passed in from @@ -208,6 +214,12 @@ typedef struct vol_capabilities_attr { * * VOL_CAP_FMT_DECMPFS_COMPRESSION: When set, the volume supports transparent * decompression of compressed files using decmpfs. + * + * VOL_CAP_FMT_64BIT_OBJECT_IDS: When set, the volume uses object IDs that + * are 64-bit. This means that ATTR_CMN_FILEID and ATTR_CMN_PARENTID are the + * only legitimate attributes for obtaining object IDs from this volume and the + * 32-bit fid_objno fields of the fsobj_id_t returned by ATTR_CMN_OBJID, + * ATTR_CMN_OBJPERMID, and ATTR_CMN_PAROBJID are undefined. */ #define VOL_CAP_FMT_PERSISTENTOBJECTIDS 0x00000001 #define VOL_CAP_FMT_SYMBOLICLINKS 0x00000002 @@ -225,7 +237,8 @@ typedef struct vol_capabilities_attr { #define VOL_CAP_FMT_HIDDEN_FILES 0x00002000 #define VOL_CAP_FMT_PATH_FROM_ID 0x00004000 #define VOL_CAP_FMT_NO_VOLUME_SIZES 0x00008000 -#define VOL_CAP_FMT_DECMPFS_COMPRESSION 0x00010000 +#define VOL_CAP_FMT_DECMPFS_COMPRESSION 0x00010000 +#define VOL_CAP_FMT_64BIT_OBJECT_IDS 0x00020000 /* @@ -338,13 +351,15 @@ typedef struct vol_attributes_attr { #define ATTR_CMN_FILEID 0x02000000 #define ATTR_CMN_PARENTID 0x04000000 #define ATTR_CMN_FULLPATH 0x08000000 +#define ATTR_CMN_ADDEDTIME 0x10000000 + /* * ATTR_CMN_RETURNED_ATTRS is only valid with getattrlist(2). * It is always the first attribute in the return buffer. 
*/ #define ATTR_CMN_RETURNED_ATTRS 0x80000000 -#define ATTR_CMN_VALIDMASK 0x8FE7FFFF +#define ATTR_CMN_VALIDMASK 0x9FE7FFFF #define ATTR_CMN_SETMASK 0x01C7FF00 #define ATTR_CMN_VOLSETMASK 0x00006700 @@ -378,7 +393,9 @@ typedef struct vol_attributes_attr { #define ATTR_DIR_LINKCOUNT 0x00000001 #define ATTR_DIR_ENTRYCOUNT 0x00000002 #define ATTR_DIR_MOUNTSTATUS 0x00000004 -#define DIR_MNTSTATUS_MNTPOINT 0x00000001 +/* ATTR_DIR_MOUNTSTATUS Flags: */ +#define DIR_MNTSTATUS_MNTPOINT 0x00000001 +#define DIR_MNTSTATUS_TRIGGER 0x00000002 #define ATTR_DIR_VALIDMASK 0x00000007 #define ATTR_DIR_SETMASK 0x00000000 @@ -394,11 +411,9 @@ typedef struct vol_attributes_attr { #define ATTR_FILE_DATAALLOCSIZE 0x00000400 #define ATTR_FILE_RSRCLENGTH 0x00001000 #define ATTR_FILE_RSRCALLOCSIZE 0x00002000 -/* Only used when CONFIG_PROTECT is ON */ -#define ATTR_FILE_PROTECTION_CLASS 0x00004000 -#define ATTR_FILE_VALIDMASK 0x000077FF -#define ATTR_FILE_SETMASK 0x00004020 +#define ATTR_FILE_VALIDMASK 0x000037FF +#define ATTR_FILE_SETMASK 0x00000020 #define ATTR_FORK_TOTALSIZE 0x00000001 #define ATTR_FORK_ALLOCSIZE 0x00000002 diff --git a/bsd/sys/buf.h b/bsd/sys/buf.h index 80bf3d384..f1d7f924b 100644 --- a/bsd/sys/buf.h +++ b/bsd/sys/buf.h @@ -90,6 +90,7 @@ #define B_PASSIVE 0x00000800 /* PASSIVE I/Os are ignored by THROTTLE I/O */ #define B_IOSTREAMING 0x00001000 /* sequential access pattern detected */ #define B_THROTTLED_IO 0x00002000 /* low priority I/O */ +#define B_ENCRYPTED_IO 0x00004000 /* Encrypted I/O */ /* * make sure to check when adding flags that * that the new flags don't overlap the definitions @@ -121,6 +122,8 @@ void buf_markinvalid(buf_t); */ void buf_markdelayed(buf_t); +void buf_markclean(buf_t); + /*! @function buf_markeintr @abstract Mark a buffer as having been interrupted during I/O. @@ -634,6 +637,32 @@ errno_t buf_setupl(buf_t, upl_t, uint32_t); */ buf_t buf_clone(buf_t, int, int, void (*)(buf_t, void *), void *); + +/*! 
+ @function buf_create_shadow + @abstract Create a shadow buffer with optional private storage and an optional callback. + @param bp Buffer to shadow. + @param force_copy If TRUE, do not link the shadow to 'bp' and if 'external_storage' == NULL, + force a copy of the data associated with 'bp'. + @param external_storage If non-NULL, associate it with the new buffer as its storage instead of the + storage currently associated with 'bp'. + @param iodone Callback to be called from buf_biodone() when I/O completes, in the sense of buf_setcallback(). + @param arg Argument to pass to iodone() callback. + @return NULL if the buffer to be shadowed is not B_META or a primary buffer (i.e. not a shadow buffer); otherwise, the new buffer. +*/ + +buf_t buf_create_shadow(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg); + + +/*! + @function buf_shadow + @abstract returns true if 'bp' is a shadow of another buffer. + @param bp Buffer to query. + @return 1 if 'bp' is a shadow, 0 otherwise. +*/ +int buf_shadow(buf_t bp); + + /*! @function buf_alloc @abstract Allocate an uninitialized buffer. @@ -659,6 +688,7 @@ void buf_free(buf_t); */ #define BUF_WRITE_DATA 0x0001 /* write data blocks first */ #define BUF_SKIP_META 0x0002 /* skip over metadata blocks */ +#define BUF_INVALIDATE_LOCKED 0x0004 /* force B_LOCKED blocks to be invalidated */ /*! @function buf_invalidateblks @@ -966,8 +996,38 @@ buf_t buf_getblk(vnode_t, daddr64_t, int, int, int, int); @return Always returns a new buffer. */ buf_t buf_geteblk(int); + +/*! + @function buf_clear_redundancy_flags + @abstract Clear flags on a buffer. + @discussion: buffer_redundancy_flags &= ~flags + @param bp Buffer whose flags to clear. + @param flags Flags to remove from buffer's mask + @return void. + */ +void buf_clear_redundancy_flags(buf_t, uint32_t); + +/*! + @function buf_redundancy_flags + @abstract Get redundancy flags set on a buffer.
+ @param bp Buffer whose redundancy flags to grab. + @return flags. + */ +uint32_t buf_redundancy_flags(buf_t); + +/*! + @function buf_set_redundancy_flags + @abstract Set redundancy flags on a buffer. + @discussion: buffer_redundancy_flags |= flags + @param bp Buffer whose flags to set. + @param flags Flags to add to buffer's redundancy flags + @return void. + */ +void buf_set_redundancy_flags(buf_t, uint32_t); + #ifdef KERNEL_PRIVATE -void buf_setfilter(buf_t, void (*)(buf_t, void *), void *, void **, void **); +void buf_setfilter(buf_t, void (*)(buf_t, void *), void *, void (**)(buf_t, void *), void **); + /*! @function buf_getcpaddr diff --git a/bsd/sys/buf_internal.h b/bsd/sys/buf_internal.h index 5718861f6..d80eb21c8 100644 --- a/bsd/sys/buf_internal.h +++ b/bsd/sys/buf_internal.h @@ -115,7 +115,16 @@ struct buf { int b_dirtyend; /* Offset of end of dirty region. */ int b_validoff; /* Offset in buffer of valid region. */ int b_validend; /* Offset of end of valid region. */ + + /* store extra information related to redundancy of data, such as + * which redundancy copy to use, etc + */ + uint32_t b_redundancy_flags; + proc_t b_proc; /* Associated proc; NULL if kernel. */ +#ifdef BUF_MAKE_PRIVATE + buf_t b_data_store; +#endif #if CONFIG_PROTECT struct cprotect *b_cpentry; /* address of cp_entry, to be passed further down */ #endif /* CONFIG_PROTECT */ @@ -131,6 +140,12 @@ struct buf { /* cluster_io definitions for use with io bufs */ #define b_uploffset b_bufsize +#define b_orig b_freelist.tqe_prev +#define b_shadow b_freelist.tqe_next +#define b_shadow_ref b_validoff +#ifdef BUF_MAKE_PRIVATE +#define b_data_ref b_validend +#endif #define b_trans_head b_freelist.tqe_prev #define b_trans_next b_freelist.tqe_next #define b_iostate b_rcred @@ -143,20 +158,25 @@ struct buf { #define BL_BUSY 0x00000001 /* I/O in progress. */ #define BL_WANTED 0x00000002 /* Process wants this buffer.
*/ #define BL_IOBUF 0x00000004 /* buffer allocated via 'buf_alloc' */ -#define BL_CALLDONE 0x00000008 /* callback routine on B_CALL bp has completed */ #define BL_WANTDEALLOC 0x00000010 /* buffer should be put on empty list when clean */ +#define BL_SHADOW 0x00000020 +#define BL_EXTERNAL 0x00000040 +#define BL_WAITSHADOW 0x00000080 +#define BL_IOBUF_ALLOC 0x00000100 /* * Parameters for buffer cache garbage collection */ #define BUF_STALE_THRESHHOLD 30 /* Collect if untouched in the last 30 seconds */ -#define BUF_MAX_GC_COUNT 1000 /* Generally 6-8 MB */ +#define BUF_MAX_GC_COUNT 1024 /* Generally 6-8 MB */ +#define BUF_MAX_GC_BATCH_SIZE 128 /* Under a single grab of the lock */ /* * mask used by buf_flags... these are the readable external flags */ #define BUF_X_RDFLAGS (B_PHYS | B_RAW | B_LOCKED | B_ASYNC | B_READ | B_WRITE | B_PAGEIO |\ - B_META | B_CLUSTER | B_DELWRI | B_FUA | B_PASSIVE | B_IOSTREAMING | B_THROTTLED_IO) + B_META | B_CLUSTER | B_DELWRI | B_FUA | B_PASSIVE | B_IOSTREAMING | B_THROTTLED_IO |\ + B_ENCRYPTED_IO) /* * mask used by buf_clearflags/buf_setflags... these are the writable external flags */ @@ -189,11 +209,10 @@ struct buf { /* * private flags used by by the cluster layer */ -#define B_NEED_IODONE 0x20000000 /* need biodone on the real_bp associated with a cluster_io */ +#define B_TWANTED 0x20000000 /* buf_t that is part of a cluster level transaction is wanted */ #define B_COMMIT_UPL 0x40000000 /* commit/abort the UPL on I/O success/failure */ #define B_TDONE 0x80000000 /* buf_t that is part of a cluster level transaction has completed */ - /* Flags to low-level allocation routines. */ #define B_CLRBUF 0x01 /* Request allocated buffer be cleared. */ #define B_SYNC 0x02 /* Do all allocations synchronously. */ @@ -222,6 +241,8 @@ extern struct buf *buf_headers; /* The buffer headers.
*/ __BEGIN_DECLS +buf_t buf_create_shadow_priv(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg); + buf_t alloc_io_buf(vnode_t, int); void free_io_buf(buf_t); @@ -239,8 +260,6 @@ void bufinit(void) __attribute__((section("__TEXT, initcode"))); void buf_list_lock(void); void buf_list_unlock(void); -void buf_biowait_callback(buf_t); - void cluster_init(void) __attribute__((section("__TEXT, initcode"))); void buf_drop(buf_t); errno_t buf_acquire(buf_t, int, int, int); @@ -248,6 +267,9 @@ errno_t buf_acquire(buf_t, int, int, int); int count_busy_buffers(void); int count_lock_queue(void); +#ifdef BUF_MAKE_PRIVATE +errno_t buf_make_private(buf_t bp); +#endif __END_DECLS diff --git a/bsd/sys/cdefs.h b/bsd/sys/cdefs.h index 59e922bea..7076ef572 100644 --- a/bsd/sys/cdefs.h +++ b/bsd/sys/cdefs.h @@ -164,6 +164,12 @@ #define __unused #endif +#if defined(__GNUC__) && __GNUC__ >= 4 +#define __used __attribute__((__used__)) +#else +#define __used +#endif + /* * GCC 2.95 provides `__restrict' as an extension to C90 to support the * C99-specific `restrict' type qualifier. We happen to use `__restrict' as @@ -196,7 +202,7 @@ #define __scanflike(fmtarg, firstvararg) #endif -#define __IDSTRING(name,string) static const char name[] __unused = string +#define __IDSTRING(name,string) static const char name[] __used = string #ifndef __COPYRIGHT #define __COPYRIGHT(s) __IDSTRING(copyright,s) @@ -215,7 +221,7 @@ #endif /* - * COMPILATION ENVIRONMENTS + * COMPILATION ENVIRONMENTS -- see compat(5) for additional detail * * DEFAULT By default newly complied code will get POSIX APIs plus * Apple API extensions in scope. 
@@ -259,24 +265,24 @@ #define __DARWIN_SUF_DARWIN10 "_darwin10" #define __DARWIN10_ALIAS(sym) __asm("_" __STRING(sym) __DARWIN_SUF_DARWIN10) #else /* !KERNEL */ -#ifdef PRODUCT_AppleTV -/* Product: AppleTV */ +#ifdef PLATFORM_iPhoneOS +/* Platform: iPhoneOS */ #define __DARWIN_ONLY_64_BIT_INO_T 1 #define __DARWIN_ONLY_UNIX_CONFORMANCE 1 #define __DARWIN_ONLY_VERS_1050 1 -#endif /* PRODUCT_AppleTV */ -#ifdef PRODUCT_iPhone -/* Product: iPhone */ +#endif /* PLATFORM_iPhoneOS */ +#ifdef PLATFORM_iPhoneSimulator +/* Platform: iPhoneSimulator */ #define __DARWIN_ONLY_64_BIT_INO_T 1 #define __DARWIN_ONLY_UNIX_CONFORMANCE 1 #define __DARWIN_ONLY_VERS_1050 1 -#endif /* PRODUCT_iPhone */ -#ifdef PRODUCT_MacOSX -/* Product: MacOSX */ +#endif /* PLATFORM_iPhoneSimulator */ +#ifdef PLATFORM_MacOSX +/* Platform: MacOSX */ #define __DARWIN_ONLY_64_BIT_INO_T 0 /* #undef __DARWIN_ONLY_UNIX_CONFORMANCE (automatically set for 64-bit) */ #define __DARWIN_ONLY_VERS_1050 0 -#endif /* PRODUCT_MacOSX */ +#endif /* PLATFORM_MacOSX */ #endif /* KERNEL */ /* @@ -313,6 +319,8 @@ # error "Can't define _NONSTD_SOURCE when only UNIX conformance is available." # endif /* _NONSTD_SOURCE */ # define __DARWIN_UNIX03 1 +# elif defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && ((__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__-0) < 1040) +# define __DARWIN_UNIX03 0 # elif defined(_DARWIN_C_SOURCE) || defined(_XOPEN_SOURCE) || defined(_POSIX_C_SOURCE) # if defined(_NONSTD_SOURCE) # error "Can't define both _NONSTD_SOURCE and any of _DARWIN_C_SOURCE, _XOPEN_SOURCE or _POSIX_C_SOURCE." 
@@ -438,13 +446,19 @@ /* * symbol release macros */ -#if defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && ((__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__-0) < 1060) -#undef __DARWIN_10_6_AND_LATER -#define __DARWIN_10_6_AND_LATER_ALIAS(x) /* nothing */ -#else /* 10.6 and beyond */ -#define __DARWIN_10_6_AND_LATER -#define __DARWIN_10_6_AND_LATER_ALIAS(x) x +#ifdef KERNEL +#define __DARWIN_ALIAS_STARTING(_mac, _iphone, x) +#else +#include + +#if defined(__IPHONE_OS_VERSION_MIN_REQUIRED) +#define __DARWIN_ALIAS_STARTING(_mac, _iphone, x) __DARWIN_ALIAS_STARTING_IPHONE_##_iphone(x) +#elif defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) +#define __DARWIN_ALIAS_STARTING(_mac, _iphone, x) __DARWIN_ALIAS_STARTING_MAC_##_mac(x) +#else +#define __DARWIN_ALIAS_STARTING(_mac, _iphone, x) #endif +#endif /* KERNEL */ /* @@ -460,6 +474,7 @@ * _POSIX_C_SOURCE == 199506L 1003.1c-1995, 1003.1i-1995, * and the omnibus ISO/IEC 9945-1: 1996 * _POSIX_C_SOURCE == 200112L 1003.1-2001 + * _POSIX_C_SOURCE == 200809L 1003.1-2008 * * In addition, the X/Open Portability Guide, which is now the Single UNIX * Specification, defines a feature-test macro which indicates the version of @@ -480,10 +495,13 @@ /* Deal with various X/Open Portability Guides and Single UNIX Spec. 
*/ #ifdef _XOPEN_SOURCE -#if _XOPEN_SOURCE - 0L >= 600L +#if _XOPEN_SOURCE - 0L >= 700L && (!defined(_POSIX_C_SOURCE) || _POSIX_C_SOURCE - 0L < 200809L) +#undef _POSIX_C_SOURCE +#define _POSIX_C_SOURCE 200809L +#elif _XOPEN_SOURCE - 0L >= 600L && (!defined(_POSIX_C_SOURCE) || _POSIX_C_SOURCE - 0L < 200112L) #undef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200112L -#elif _XOPEN_SOURCE - 0L >= 500L +#elif _XOPEN_SOURCE - 0L >= 500L && (!defined(_POSIX_C_SOURCE) || _POSIX_C_SOURCE - 0L < 199506L) #undef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 199506L #endif @@ -497,6 +515,44 @@ #define _POSIX_C_SOURCE 198808L #endif +/* + * Deprecation macro + */ +#if defined(__GNUC__) && ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))) +#define __deprecated __attribute__((deprecated)) +#define __unavailable __attribute__((unavailable)) +#else +#define __deprecated /* nothing */ +#define __unavailable /* nothing */ +#endif + +/* POSIX C deprecation macros */ +#ifdef KERNEL +#define __POSIX_C_DEPRECATED(ver) +#else +#include + +#define __POSIX_C_DEPRECATED(ver) ___POSIX_C_DEPRECATED_STARTING_##ver +#endif + +/* + * Set a single macro which will always be defined and can be used to determine + * the appropriate namespace. For POSIX, these values will correspond to + * _POSIX_C_SOURCE value. Currently there are two additional levels corresponding + * to ANSI (_ANSI_SOURCE) and Darwin extensions (_DARWIN_C_SOURCE) + */ +#define __DARWIN_C_ANSI 010000L +#define __DARWIN_C_FULL 900000L + +#if defined(_ANSI_SOURCE) +#define __DARWIN_C_LEVEL __DARWIN_C_ANSI +#elif defined(_POSIX_C_SOURCE) && !defined(_DARWIN_C_SOURCE) && !defined(_NONSTD_SOURCE) +#define __DARWIN_C_LEVEL _POSIX_C_SOURCE +#else +#define __DARWIN_C_LEVEL __DARWIN_C_FULL +#endif + + /* * long long is not supported in c89 (__STRICT_ANSI__), but g++ -ansi and * c99 still want long longs. While not perfect, we allow long longs for @@ -512,22 +568,7 @@ * long doubles. 
This applies only to ppc; i386 already has long double * support, while ppc64 doesn't have any backwards history. */ -#if defined(__ppc__) -# if defined(__LDBL_MANT_DIG__) && defined(__DBL_MANT_DIG__) && \ - __LDBL_MANT_DIG__ > __DBL_MANT_DIG__ -# if __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__-0 < 1040 -# define __DARWIN_LDBL_COMPAT(x) __asm("_" __STRING(x) "$LDBLStub") -# else -# define __DARWIN_LDBL_COMPAT(x) __asm("_" __STRING(x) "$LDBL128") -# endif -# define __DARWIN_LDBL_COMPAT2(x) __asm("_" __STRING(x) "$LDBL128") -# define __DARWIN_LONG_DOUBLE_IS_DOUBLE 0 -# else -# define __DARWIN_LDBL_COMPAT(x) /* nothing */ -# define __DARWIN_LDBL_COMPAT2(x) /* nothing */ -# define __DARWIN_LONG_DOUBLE_IS_DOUBLE 1 -# endif -#elif defined(__i386__) || defined(__ppc64__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) # define __DARWIN_LDBL_COMPAT(x) /* nothing */ # define __DARWIN_LDBL_COMPAT2(x) /* nothing */ # define __DARWIN_LONG_DOUBLE_IS_DOUBLE 0 @@ -535,15 +576,6 @@ # error Unknown architecture #endif -/* - * Deprecation macro - */ -#if __GNUC__ >= 3 -#define __deprecated __attribute__((deprecated)) -#else -#define __deprecated /* nothing */ -#endif - /***************************************** * Public darwin-specific feature macros *****************************************/ @@ -605,7 +637,7 @@ * catastrophic run-time failures. 
*/ #ifndef __CAST_AWAY_QUALIFIER -#define __CAST_AWAY_QUALIFIER(variable, qualifier, type) (type) ((char *)0 + ((qualifier char *)(variable) - (qualifier char *)0) ) +#define __CAST_AWAY_QUALIFIER(variable, qualifier, type) (type) (long)(variable) #endif #endif /* !_CDEFS_H_ */ diff --git a/bsd/sys/codesign.h b/bsd/sys/codesign.h index 56ae21668..e72c25044 100644 --- a/bsd/sys/codesign.h +++ b/bsd/sys/codesign.h @@ -38,6 +38,7 @@ #define CS_EXEC_SET_HARD 0x1000 /* set CS_HARD on any exec'ed process */ #define CS_EXEC_SET_KILL 0x2000 /* set CS_KILL on any exec'ed process */ #define CS_KILLED 0x10000 /* was killed by kernel for invalidity */ +#define CS_RESTRICT 0x20000 /* tell dyld to treat restricted */ /* csops operations */ #define CS_OPS_STATUS 0 /* return status */ @@ -47,6 +48,8 @@ #define CS_OPS_PIDPATH 4 /* get executable's pathname */ #define CS_OPS_CDHASH 5 /* get code directory hash */ #define CS_OPS_PIDOFFSET 6 /* get offset of active Mach-o slice */ +#define CS_OPS_ENTITLEMENTS_BLOB 7 /* get entitlements blob */ +#define CS_OPS_MARKRESTRICT 8 /* set RESTRICT flag (sticky) */ #ifndef KERNEL diff --git a/bsd/sys/conf.h b/bsd/sys/conf.h index 4cf53a914..39e4fef37 100644 --- a/bsd/sys/conf.h +++ b/bsd/sys/conf.h @@ -71,6 +71,8 @@ #include #include +#include +#include /* * Definitions of device driver entry switches @@ -194,10 +196,24 @@ struct cdevsw { int d_type; }; +#ifdef BSD_KERNEL_PRIVATE +void devsw_init(void); + +extern uint64_t cdevsw_flags[]; +#define CDEVSW_SELECT_KQUEUE 0x01 +#define CDEVSW_USE_OFFSET 0x02 + +struct thread; + +typedef struct devsw_lock { + TAILQ_ENTRY(devsw_lock) dl_list; + struct thread *dl_thread; + dev_t dl_dev; + int dl_mode; +} *devsw_lock_t; + +#endif /* BSD_KERNEL_PRIVATE */ -#ifdef KERNEL_PRIVATE -extern struct cdevsw cdevsw[]; -#endif /* KERNEL_PRIVATE */ /* * Contents of empty cdevsw slot. 
@@ -276,6 +292,16 @@ extern struct swdevt swdevt[]; * else -1 */ __BEGIN_DECLS +#ifdef KERNEL_PRIVATE +extern struct cdevsw cdevsw[]; +extern int cdevsw_setkqueueok(int, struct cdevsw*, int); +#endif /* KERNEL_PRIVATE */ + +#ifdef BSD_KERNEL_PRIVATE +extern void devsw_lock(dev_t, int); +extern void devsw_unlock(dev_t, int); +#endif /* BSD_KERNEL_PRIVATE */ + int bdevsw_isfree(int); int bdevsw_add(int, struct bdevsw *); int bdevsw_remove(int, struct bdevsw *); diff --git a/osfmk/ppc/Performance.h b/bsd/sys/content_protection.h similarity index 71% rename from osfmk/ppc/Performance.h rename to bsd/sys/content_protection.h index 4442d603e..a4066e184 100644 --- a/osfmk/ppc/Performance.h +++ b/bsd/sys/content_protection.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,21 +25,23 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - * Keep special performance related stuff in here - */ -#define PERF_HIST 0 -#define PMIHIST_SIZE 0x00400000 -#define perfClear 0 -#define perfStart 1 -#define perfStop 2 -#define perfMap 3 +#ifndef _SYS_CONTENT_PROTECTION_H_ +#define _SYS_CONTENT_PROTECTION_H_ -#ifndef ASSEMBLER +#ifdef PRIVATE + +/* + * Protection classes vary in their restrictions on read/writability. A is generally + * the strictest, and D is effectively no restriction. 
+ */ +#define PROTECTION_CLASS_A 1 +#define PROTECTION_CLASS_B 2 +#define PROTECTION_CLASS_C 3 +#define PROTECTION_CLASS_D 4 +#define PROTECTION_CLASS_E 5 +#define PROTECTION_CLASS_F 6 -extern unsigned int PMIhist; -extern unsigned int PMIhistV; -extern unsigned int PerfCtl(unsigned int cmd, unsigned int parm0); +#endif /* PRIVATE */ -#endif /* !ASSEMBLER */ +#endif /* _SYS_CONTENT_PROTECTION_H_ */ diff --git a/bsd/sys/cprotect.h b/bsd/sys/cprotect.h index 2edf9aed2..bebe3bb43 100644 --- a/bsd/sys/cprotect.h +++ b/bsd/sys/cprotect.h @@ -36,77 +36,114 @@ extern "C" { #if KERNEL_PRIVATE #include +#include #include -#define PROTECTION_CLASS_A 1 -#define PROTECTION_CLASS_B 2 -#define PROTECTION_CLASS_C 3 -#define PROTECTION_CLASS_D 4 -#define PROTECTION_CLASS_E 5 +#define CP_KEYSIZE 32 /* 8x4 = 32, 32x8 = 256 */ +#define CP_WRAPPEDKEYSIZE 40 /* 2x4 = 8, 8x8 = 64 */ -#define KEYSIZE 8 /* 8x4 = 32, 32x8 = 256 */ -#define INTEGRITYSIZE 2 /* 2x4 = 8, 8x8 = 64 */ +/* lock events from AppleKeyStore */ +#define CP_LOCKED_STATE 0 /* Device is locked */ +#define CP_UNLOCKED_STATE 1 /* Device is unlocked */ -#define LOCKED_STATE 0 -#define UNLOCKED_STATE 1 +#define CP_LOCKED_KEYCHAIN 0 +#define CP_UNLOCKED_KEYCHAIN 1 -#define LOCKED_KEYCHAIN 0 -#define UNLOCKED_KEYCHAIN 1 +/* For struct cprotect: cp_flags */ +#define CP_NEEDS_KEYS 0x1 /* File needs persistent keys */ +#define CP_KEY_FLUSHED 0x2 /* File's unwrapped key has been purged from memory */ +#define CP_NO_XATTR 0x4 /* Key info has not been saved as EA to the FS */ -#define CONTENT_PROTECTION_XATTR_NAME "com.apple.system.cprotect" +/* Content Protection VNOP Operation flags */ +#define CP_READ_ACCESS 0x1 +#define CP_WRITE_ACCESS 0x2 -#define kEMBCKeyHandleSpecial ~1 +#define CONTENT_PROTECTION_XATTR_NAME "com.apple.system.cprotect" +#define CP_CURRENT_MAJOR_VERS 2 +#define CP_CURRENT_MINOR_VERS 0 -/* SLIST_HEAD(cp_list, cp_entry) cp_head = LIST_HEAD_INITIALIZER(cp_head); */ -/* struct cp_list *cprotect_list_headp; /\* 
List head *\/ */ typedef struct cprotect *cprotect_t; typedef struct cp_wrap_func *cp_wrap_func_t; typedef struct cp_global_state *cp_global_state_t; typedef struct cp_xattr *cp_xattr_t; +typedef struct cnode * cnode_ptr_t; +//forward declare the struct. +struct hfsmount; -typedef int wrapper_t(uint32_t properties, void *key_bytes, size_t key_length, void **wrapped_data, uint32_t *wrapped_length); -typedef int unwrapper_t(uint32_t properties, void *wrapped_data, size_t wrapped_data_length, void **key_bytes, uint32_t *key_length); +/* The wrappers are invoked by the AKS kext */ +typedef int wrapper_t(uint32_t properties, void *key_bytes, size_t key_length, void *wrapped_data, size_t *wrapped_length); +typedef int unwrapper_t(uint32_t properties, void *wrapped_data, size_t wrapped_data_length, void *key_bytes, size_t *key_length); +/* + * Runtime-only structure containing the content protection status + * for the given file. This is contained within the cnode + */ struct cprotect { - uint32_t cache_key[KEYSIZE]; - uint32_t special_data; - uint32_t pclass; - uint8_t cache_key_flushed; - uint8_t lock_state; /* lock_state: 0 means unlocked. 1 means locked */ -}; - -struct cp_entry { - SLIST_ENTRY(cp_entry) cp_list; - struct cprotect *protected_entry; + uint8_t cp_cache_key[CP_KEYSIZE]; + uint8_t cp_persistent_key[CP_WRAPPEDKEYSIZE]; + uint32_t cp_flags; + uint32_t cp_pclass; }; struct cp_wrap_func { - wrapper_t *wrapper; - unwrapper_t *unwrapper; + wrapper_t *wrapper; + unwrapper_t *unwrapper; }; struct cp_global_state { + uint8_t wrap_functions_set; uint8_t lock_state; - uint8_t wrap_functions_set; }; +/* + * On-disk structure written as the per-file EA payload + * All on-disk multi-byte fields for the CP XATTR must be stored + * little-endian on-disk. This means they must be endian swapped to + * L.E on getxattr() and converted to LE on setxattr(). 
+ */ struct cp_xattr { - uint32_t persistent_class; - uint8_t persistent_key[32]; - uint8_t persistent_integrity[8]; - uint8_t xattr_version; + u_int16_t xattr_major_version; + u_int16_t xattr_minor_version; + u_int32_t flags; + u_int32_t persistent_class; + u_int32_t key_size; + uint8_t persistent_key[CP_WRAPPEDKEYSIZE]; }; -int cp_create_init(vnode_t, vfs_context_t); +/* Same is true for the root EA, all fields must be written little endian. */ +struct cp_root_xattr { + u_int16_t major_version; + u_int16_t minor_version; + u_int64_t flags; + u_int32_t reserved1; + u_int32_t reserved2; + u_int32_t reserved3; + u_int32_t reserved4; +}; + + +/* + * Functions to check the status of a CP and to query + * the containing filesystem to see if it is supported. + */ +int cp_vnode_getclass(vnode_t, int *); +int cp_vnode_setclass(vnode_t, uint32_t); + int cp_key_store_action(int); int cp_register_wraps(cp_wrap_func_t); -struct cprotect *cp_vnode_entry_alloc(void); -void cp_vnode_entry_init(vnode_t); -int cp_vnode_entry_init_needed(vnode_t); -struct cp_xattr * cp_vn_getxattr(vnode_t, vfs_context_t); -int cp_vn_setxattr(vnode_t, uint32_t, vfs_context_t); + +int cp_entry_init(cnode_ptr_t, struct mount *); +int cp_entry_create_keys(cnode_ptr_t); +void cp_entry_destroy(cnode_ptr_t); + +cnode_ptr_t cp_get_protected_cnode(vnode_t); +int cp_handle_vnop(cnode_ptr_t, int); +int cp_fs_protected (mount_t); +int cp_getrootxattr (struct hfsmount *hfsmp, struct cp_root_xattr *outxattr); +int cp_setrootxattr (struct hfsmount *hfsmp, struct cp_root_xattr *newxattr); +int cp_handle_relocate (cnode_ptr_t cp); #endif /* KERNEL_PRIVATE */ diff --git a/bsd/sys/decmpfs.h b/bsd/sys/decmpfs.h index 72e99ee18..f8a61d288 100644 --- a/bsd/sys/decmpfs.h +++ b/bsd/sys/decmpfs.h @@ -84,7 +84,7 @@ typedef struct decmpfs_cnode { uint32_t cmp_type; uint32_t lockcount; void *lockowner; /* cnode's lock owner (if a thread is currently holding an exclusive lock) */ - uint64_t uncompressed_size; + uint64_t 
uncompressed_size __attribute__((aligned(8))); lck_rw_t compressed_data_lock; #if !DECMPFS_SUPPORTS_SWAP64 /* we need a lock since we can't atomically fetch/set 64 bits */ diff --git a/bsd/sys/disk.h b/bsd/sys/disk.h index 0232617ca..745aa6710 100644 --- a/bsd/sys/disk.h +++ b/bsd/sys/disk.h @@ -161,12 +161,27 @@ typedef struct #ifdef KERNEL #define DK_FEATURE_FORCE_UNIT_ACCESS 0x00000001 + +typedef struct +{ + uint64_t offset; + uint64_t length; + + uint8_t reserved0128[12]; /* reserved, clear to zero */ + + dev_t dev; +} dk_physical_extent_t; + #define DKIOCGETBLOCKCOUNT32 _IOR('d', 25, uint32_t) #define DKIOCSETBLOCKSIZE _IOW('d', 24, uint32_t) #define DKIOCGETBSDUNIT _IOR('d', 27, uint32_t) -#define DKIOCISSOLIDSTATE _IOR('d', 79, uint32_t) +#define DKIOCISSOLIDSTATE _IOR('d', 79, uint32_t) #define DKIOCISVIRTUAL _IOR('d', 72, uint32_t) #define DKIOCGETBASE _IOR('d', 73, uint64_t) +#define DKIOCGETTHROTTLEMASK _IOR('d', 80, uint64_t) +#define DKIOCLOCKPHYSICALEXTENTS _IO('d', 81) +#define DKIOCGETPHYSICALEXTENT _IOWR('d', 82, dk_physical_extent_t) +#define DKIOCUNLOCKPHYSICALEXTENTS _IO('d', 83) #endif /* KERNEL */ #endif /* _SYS_DISK_H_ */ diff --git a/bsd/sys/dtrace.h b/bsd/sys/dtrace.h index 6d9c6976a..d81a48a4f 100644 --- a/bsd/sys/dtrace.h +++ b/bsd/sys/dtrace.h @@ -119,6 +119,8 @@ typedef int64_t hrtime_t; typedef enum { B_FALSE = 0, B_TRUE = 1 } _dtrace_boolean; +typedef uint8_t UUID[16]; /* For modctl use in dtrace.h */ + struct modctl; /* In lieu of Solaris */ /* NOTHING */ /* In lieu of Solaris */ #include /* In lieu of Solaris */ @@ -508,6 +510,15 @@ typedef struct dtrace_difv { #define DTRACEACT_RAISE (DTRACEACT_PROC_DESTRUCTIVE + 2) #define DTRACEACT_SYSTEM (DTRACEACT_PROC_DESTRUCTIVE + 3) #define DTRACEACT_FREOPEN (DTRACEACT_PROC_DESTRUCTIVE + 4) + +#if defined(__APPLE__) +/* + * Dtrace stop() will task_suspend the currently running process. + * Dtrace pidresume(pid) will task_resume it. 
+ */ + +#define DTRACEACT_PIDRESUME (DTRACEACT_PROC_DESTRUCTIVE + 50) +#endif /* __APPLE__ */ #define DTRACEACT_PROC_CONTROL 0x0300 @@ -1340,6 +1351,34 @@ typedef struct dtrace_providerdesc { #define DTRACEIOC_FORMAT (DTRACEIOC | 16) /* get format str */ #define DTRACEIOC_DOFGET (DTRACEIOC | 17) /* get DOF */ #define DTRACEIOC_REPLICATE (DTRACEIOC | 18) /* replicate enab */ +#define DTRACEIOC_MODUUIDSLIST (DTRACEIOC | 30) /* APPLE ONLY, query for modules with missing symbols */ +#define DTRACEIOC_PROVMODSYMS (DTRACEIOC | 31) /* APPLE ONLY, provide missing symbols for a given module */ + +/* + * The following structs are used to provide symbol information to the kernel from userspace. + */ + +typedef struct dtrace_symbol { + uint64_t dtsym_addr; /* address of the symbol */ + uint64_t dtsym_size; /* size of the symbol, must be uint64_t to maintain alignment when called by 64b uproc in i386 kernel */ + char dtsym_name[DTRACE_FUNCNAMELEN]; /* symbol name */ +} dtrace_symbol_t; + +typedef struct dtrace_module_symbols { + UUID dtmodsyms_uuid; + uint64_t dtmodsyms_count; + dtrace_symbol_t dtmodsyms_symbols[1]; +} dtrace_module_symbols_t; + +#define DTRACE_MODULE_SYMBOLS_SIZE(count) (sizeof(dtrace_module_symbols_t) + ((count - 1) * sizeof(dtrace_symbol_t))) + +typedef struct dtrace_module_uuids_list { + uint64_t dtmul_count; + UUID dtmul_uuid[1]; +} dtrace_module_uuids_list_t; + +#define DTRACE_MODULE_UUIDS_LIST_SIZE(count) (sizeof(dtrace_module_uuids_list_t) + ((count - 1) * sizeof(UUID))) + #endif /* __APPLE__ */ /* @@ -1566,7 +1605,7 @@ typedef struct dof_ioctl_data { * dtps_provide_module(); see "Arguments and Notes" for dtrace_register(), * below. * - * 1.4 void dtps_enable(void *arg, dtrace_id_t id, void *parg) + * 1.4 int dtps_enable(void *arg, dtrace_id_t id, void *parg) * * 1.4.1 Overview * @@ -1587,7 +1626,8 @@ typedef struct dof_ioctl_data { * * 1.4.3 Return value * - * None. + * On success, dtps_enable() should return 0. On failure, -1 should be + * returned. 
* * 1.4.4 Caller's context * @@ -2141,7 +2181,7 @@ typedef struct dof_ioctl_data { typedef struct dtrace_pops { void (*dtps_provide)(void *arg, const dtrace_probedesc_t *spec); void (*dtps_provide_module)(void *arg, struct modctl *mp); - void (*dtps_enable)(void *arg, dtrace_id_t id, void *parg); + int (*dtps_enable)(void *arg, dtrace_id_t id, void *parg); void (*dtps_disable)(void *arg, dtrace_id_t id, void *parg); void (*dtps_suspend)(void *arg, dtrace_id_t id, void *parg); void (*dtps_resume)(void *arg, dtrace_id_t id, void *parg); @@ -2357,10 +2397,7 @@ struct regs; extern int (*dtrace_pid_probe_ptr)(struct regs *); extern int (*dtrace_return_probe_ptr)(struct regs *); #else -#if defined (__ppc__) || defined (__ppc64__) -extern int (*dtrace_pid_probe_ptr)(ppc_saved_state_t *regs); -extern int (*dtrace_return_probe_ptr)(ppc_saved_state_t* regs); -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) extern int (*dtrace_pid_probe_ptr)(x86_saved_state_t *regs); extern int (*dtrace_return_probe_ptr)(x86_saved_state_t* regs); #else @@ -2382,8 +2419,13 @@ extern void dtrace_membar_producer(void); extern void dtrace_membar_consumer(void); extern void (*dtrace_cpu_init)(processorid_t); +#if !defined(__APPLE__) extern void (*dtrace_modload)(struct modctl *); extern void (*dtrace_modunload)(struct modctl *); +#else +extern int (*dtrace_modload)(struct kmod_info *); +extern int (*dtrace_modunload)(struct kmod_info *); +#endif /* __APPLE__ */ extern void (*dtrace_helpers_cleanup)(proc_t*); extern void (*dtrace_helpers_fork)(proc_t *parent, proc_t *child); extern void (*dtrace_cpustart_init)(void); @@ -2427,14 +2469,11 @@ extern int dtrace_instr_size(uchar_t *instr); extern int dtrace_instr_size_isa(uchar_t *, model_t, int *); extern void dtrace_invop_add(int (*)(uintptr_t, uintptr_t *, uintptr_t)); extern void dtrace_invop_remove(int (*)(uintptr_t, uintptr_t *, uintptr_t)); -extern void dtrace_invop_callsite(void); +extern void 
*dtrace_invop_callsite_pre; +extern void *dtrace_invop_callsite_post; #endif -#if defined (__ppc__) || defined (__ppc64__) -extern void dtrace_invop_add(int (*)(uintptr_t, uintptr_t *, uintptr_t)); -extern void dtrace_invop_remove(int (*)(uintptr_t, uintptr_t *, uintptr_t)); -#endif #undef proc_t #endif /* __APPLE__ */ @@ -2472,13 +2511,6 @@ extern void dtrace_invop_remove(int (*)(uintptr_t, uintptr_t *, uintptr_t)); #endif -#if defined (__ppc__) || defined (__ppc64__) -#define DTRACE_INVOP_NOP 4 -#define DTRACE_INVOP_RET 5 -#define DTRACE_INVOP_BCTR 6 -#define DTRACE_INVOP_TAILJUMP 7 -#endif - #endif /* __APPLE__ */ diff --git a/bsd/sys/dtrace_glue.h b/bsd/sys/dtrace_glue.h index 5612fe80c..6b3665b02 100644 --- a/bsd/sys/dtrace_glue.h +++ b/bsd/sys/dtrace_glue.h @@ -43,6 +43,10 @@ #include #include +#if defined(__i386__) || defined(__x86_64__) +#include +#endif + /* * cmn_err */ @@ -100,17 +104,17 @@ extern lck_mtx_t mod_lock; /* * Per-CPU data. */ -typedef struct cpu { +typedef struct dtrace_cpu { processorid_t cpu_id; /* CPU number */ - struct cpu *cpu_next; /* next existing CPU */ + struct dtrace_cpu *cpu_next; /* next existing CPU */ lck_rw_t cpu_ft_lock; /* DTrace: fasttrap lock */ uintptr_t cpu_dtrace_caller; /* DTrace: caller, if any */ hrtime_t cpu_dtrace_chillmark; /* DTrace: chill mark time */ hrtime_t cpu_dtrace_chilled; /* DTrace: total chill time */ boolean_t cpu_dtrace_invop_underway; /* DTrace gaurds against invalid op re-entrancy */ -} cpu_t; +} dtrace_cpu_t; -extern cpu_t *cpu_list; +extern dtrace_cpu_t *cpu_list; /* * The cpu_core structure consists of per-CPU state available in any context. @@ -130,7 +134,8 @@ typedef struct cpu_core { } cpu_core_t; extern cpu_core_t *cpu_core; -extern unsigned int real_ncpus; + + extern int cpu_number(void); /* From #include . Called from probe context, must blacklist. 
*/ #define CPU (&(cpu_list[cpu_number()])) /* Pointer to current CPU */ @@ -187,6 +192,55 @@ extern void unregister_cpu_setup_func(cpu_setup_func_t *, void *); CPU_DTRACE_BADSTACK) #define CPU_DTRACE_ERROR (CPU_DTRACE_FAULT | CPU_DTRACE_DROP) +/* + * Loadable Modules + */ + +/* Keep the compiler happy */ +struct dtrace_module_symbols; + +/* Solaris' modctl structure, greatly simplified, shadowing parts of xnu kmod structure. */ +typedef struct modctl { + struct modctl *mod_next; + struct modctl *mod_stale; // stale module chain + uint32_t mod_id; // the kext unique identifier + char mod_modname[KMOD_MAX_NAME]; + int mod_loadcnt; + char mod_loaded; + char mod_flags; // See flags below + int mod_nenabled; // # of enabled DTrace probes in module + vm_address_t mod_address; // starting address (of Mach-o header blob) + vm_size_t mod_size; // total size (of blob) + UUID mod_uuid; + struct dtrace_module_symbols* mod_user_symbols; +} modctl_t; + +/* Definitions for mod_flags */ +#define MODCTL_IS_MACH_KERNEL 0x01 // This module represents /mach_kernel +#define MODCTL_HAS_KERNEL_SYMBOLS 0x02 // Kernel symbols (nlist) are available +#define MODCTL_FBT_PROBES_PROVIDED 0x04 // fbt probes have been provided +#define MODCTL_FBT_INVALID 0x08 // Module is invalid for fbt probes +#define MODCTL_SDT_PROBES_PROVIDED 0x10 // sdt probes have been provided +#define MODCTL_SDT_INVALID 0x20 // Module is invalid for sdt probes +#define MODCTL_HAS_UUID 0x40 // Module has UUID + +/* Simple/singular mod_flags accessors */ +#define MOD_IS_MACH_KERNEL(mod) (mod->mod_flags & MODCTL_IS_MACH_KERNEL) +#define MOD_HAS_KERNEL_SYMBOLS(mod) (mod->mod_flags & MODCTL_HAS_KERNEL_SYMBOLS) +#define MOD_HAS_USERSPACE_SYMBOLS(mod) (mod->mod_user_symbols) /* No point in duplicating state in the flags bits */ +#define MOD_FBT_PROBES_PROVIDED(mod) (mod->mod_flags & MODCTL_FBT_PROBES_PROVIDED) +#define MOD_FBT_INVALID(mod) (mod->mod_flags & MODCTL_FBT_INVALID) +#define MOD_SDT_PROBES_PROVIDED(mod) 
(mod->mod_flags & MODCTL_SDT_PROBES_PROVIDED) +#define MOD_SDT_INVALID(mod) (mod->mod_flags & MODCTL_SDT_INVALID) +#define MOD_HAS_UUID(mod) (mod->mod_flags & MODCTL_HAS_UUID) + +/* Compound accessors */ +#define MOD_FBT_DONE(mod) (MOD_FBT_PROBES_PROVIDED(mod) || MOD_FBT_INVALID(mod)) +#define MOD_SDT_DONE(mod) (MOD_SDT_PROBES_PROVIDED(mod) || MOD_SDT_INVALID(mod)) +#define MOD_SYMBOLS_DONE(mod) (MOD_FBT_DONE(mod) && MOD_SDT_DONE(mod)) + +extern modctl_t *dtrace_modctl_list; + /* * cred_t */ @@ -244,8 +298,8 @@ typedef struct cyc_handler { } cyc_handler_t; typedef struct cyc_omni_handler { - void (*cyo_online)(void *, cpu_t *, cyc_handler_t *, cyc_time_t *); - void (*cyo_offline)(void *, cpu_t *, void *); + void (*cyo_online)(void *, dtrace_cpu_t *, cyc_handler_t *, cyc_time_t *); + void (*cyo_offline)(void *, dtrace_cpu_t *, void *); void *cyo_arg; } cyc_omni_handler_t; @@ -389,25 +443,6 @@ extern void kmem_cache_destroy(kmem_cache_t *); typedef struct _kthread kthread_t; /* For dtrace_vtime_switch(), dtrace_panicked and dtrace_errthread */ -/* - * Loadable Modules - */ - -#if 0 /* kmod_lock has been removed */ -decl_simple_lock_data(extern,kmod_lock) -#endif /* 0 */ - -/* Want to use Darwin's kmod_info in place of the Solaris modctl. - Can't typedef since the (many) usages in the code are "struct modctl *" */ -extern kmod_info_t *kmod; -#define modctl kmod_info - -#define mod_modname name -#define mod_loadcnt id -#define mod_next next -#define mod_loaded info_version /* XXX Is always > 0, hence TRUE */ -#define modules kmod - /* * proc */ @@ -472,15 +507,6 @@ static inline void atomic_add_64( uint64_t *theValue, int64_t theAmount ) { (void)OSAddAtomic64( theAmount, (SInt64 *)theValue ); } -#elif defined(__ppc__) -static inline void atomic_add_64( uint64_t *theValue, int64_t theAmount ) -{ - // FIXME - // atomic_add_64() is at present only called from fasttrap.c to increment - // or decrement a 64bit counter. 
Narrow to 32bits since ppc32 (G4) has - // no convenient 64bit atomic op. - (void)OSAddAtomic( (int32_t)theAmount, &(((SInt32 *)theValue)[1])); -} #endif /* diff --git a/bsd/sys/dtrace_impl.h b/bsd/sys/dtrace_impl.h index 4ef2ef655..7f42cff5e 100644 --- a/bsd/sys/dtrace_impl.h +++ b/bsd/sys/dtrace_impl.h @@ -1008,6 +1008,45 @@ typedef enum dtrace_activity { #define DTRACE_DOF_MODE_LAZY_ON 1 #define DTRACE_DOF_MODE_LAZY_OFF 2 #define DTRACE_DOF_MODE_NON_LAZY 3 + +/* + * dtrace kernel symbol modes are used to control when the kernel may dispose of + * symbol information used by the fbt/sdt provider. The kernel itself, as well as + * every kext, has symbol table/nlist info that has historically been preserved + * for dtrace's use. This allowed dtrace to be lazy about allocating fbt/sdt probes, + * at the expense of keeping the symbol info in the kernel permanently. + * + * Starting in 10.7+, fbt probes may be created from userspace, in the same + * fashion as pid probes. The kernel allows dtrace "first right of refusal" + * whenever symbol data becomes available (such as a kext load). If dtrace is + * active, it will immediately read/copy the needed data, and then the kernel + * may free it. If dtrace is not active, it returns immediately, having done + * no work or allocations, and the symbol data is freed. Should dtrace need + * this data later, it is expected that the userspace client will push the + * data into the kernel via ioctl calls. + * + * The kernel symbol modes are used to control what dtrace does with symbol data: + * + * DTRACE_KERNEL_SYMBOLS_NEVER Effectively disables fbt/sdt + * DTRACE_KERNEL_SYMBOLS_FROM_KERNEL Immediately read/copy symbol data + * DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE Wait for symbols from userspace + * DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL Immediately read/copy symbol data + * + * It is legal to transition between DTRACE_KERNEL_SYMBOLS_FROM_KERNEL and + * DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE. 
The DTRACE_KERNEL_SYMBOLS_NEVER and + * DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL are permanent modes, intended to + * disable fbt probes entirely, or prevent any symbols being loaded from + * userspace. +* + * The kernel symbol mode is kept in dtrace_kernel_symbol_mode, which is protected + * by the dtrace_lock. + */ + +#define DTRACE_KERNEL_SYMBOLS_NEVER 0 +#define DTRACE_KERNEL_SYMBOLS_FROM_KERNEL 1 +#define DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE 2 +#define DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL 3 + #endif /* __APPLE__ */ /* diff --git a/bsd/sys/errno.h b/bsd/sys/errno.h index 54d5d0371..231c68ead 100644 --- a/bsd/sys/errno.h +++ b/bsd/sys/errno.h @@ -69,8 +69,9 @@ #ifndef _SYS_ERRNO_H_ #define _SYS_ERRNO_H_ -#if !defined(KERNEL) && !defined(KERNEL_PRIVATE) #include + +#if !defined(KERNEL) && !defined(KERNEL_PRIVATE) __BEGIN_DECLS extern int * __error(void); #define errno (*__error()) @@ -96,7 +97,7 @@ __END_DECLS #define ENOMEM 12 /* Cannot allocate memory */ #define EACCES 13 /* Permission denied */ #define EFAULT 14 /* Bad address */ -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL #define ENOTBLK 15 /* Block device required */ #endif #define EBUSY 16 /* Device / Resource busy */ @@ -134,9 +135,9 @@ __END_DECLS #define EPROTOTYPE 41 /* Protocol wrong type for socket */ #define ENOPROTOOPT 42 /* Protocol not available */ #define EPROTONOSUPPORT 43 /* Protocol not supported */ -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL #define ESOCKTNOSUPPORT 44 /* Socket type not supported */ -#endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ +#endif #define ENOTSUP 45 /* Operation not supported */ #if !__DARWIN_UNIX03 && !defined(KERNEL) /* @@ -150,9 +151,9 @@ __END_DECLS #define EOPNOTSUPP ENOTSUP /* Operation not supported on socket */ #endif /* !__DARWIN_UNIX03 && !KERNEL */ -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#if __DARWIN_C_LEVEL >= 
__DARWIN_C_FULL #define EPFNOSUPPORT 46 /* Protocol family not supported */ -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ +#endif #define EAFNOSUPPORT 47 /* Address family not supported by protocol family */ #define EADDRINUSE 48 /* Address already in use */ #define EADDRNOTAVAIL 49 /* Can't assign requested address */ @@ -166,10 +167,10 @@ __END_DECLS #define ENOBUFS 55 /* No buffer space available */ #define EISCONN 56 /* Socket is already connected */ #define ENOTCONN 57 /* Socket is not connected */ -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL #define ESHUTDOWN 58 /* Can't send after socket shutdown */ #define ETOOMANYREFS 59 /* Too many references: can't splice */ -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ +#endif #define ETIMEDOUT 60 /* Operation timed out */ #define ECONNREFUSED 61 /* Connection refused */ @@ -177,34 +178,34 @@ __END_DECLS #define ENAMETOOLONG 63 /* File name too long */ /* should be rearranged */ -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL #define EHOSTDOWN 64 /* Host is down */ -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ +#endif #define EHOSTUNREACH 65 /* No route to host */ #define ENOTEMPTY 66 /* Directory not empty */ /* quotas & mush */ -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL #define EPROCLIM 67 /* Too many processes */ #define EUSERS 68 /* Too many users */ -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ +#endif #define EDQUOT 69 /* Disc quota exceeded */ /* Network File System */ #define ESTALE 70 /* Stale NFS file handle */ -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL #define EREMOTE 71 /* Too many levels of remote in path */ #define EBADRPC 72 /* RPC struct is bad */ #define ERPCMISMATCH 73 /* RPC version wrong */ #define EPROGUNAVAIL 74 /* RPC prog. 
not avail */ #define EPROGMISMATCH 75 /* Program version wrong */ #define EPROCUNAVAIL 76 /* Bad procedure for program */ -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ +#endif #define ENOLCK 77 /* No locks available */ #define ENOSYS 78 /* Function not implemented */ -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL #define EFTYPE 79 /* Inappropriate file type or format */ #define EAUTH 80 /* Authentication error */ #define ENEEDAUTH 81 /* Need authenticator */ @@ -212,26 +213,26 @@ __END_DECLS /* Intelligent device errors */ #define EPWROFF 82 /* Device power is off */ #define EDEVERR 83 /* Device error, e.g. paper out */ -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ +#endif #define EOVERFLOW 84 /* Value too large to be stored in data type */ /* Program loading errors */ -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL #define EBADEXEC 85 /* Bad executable */ #define EBADARCH 86 /* Bad CPU type in executable */ #define ESHLIBVERS 87 /* Shared library version mismatch */ #define EBADMACHO 88 /* Malformed Macho file */ -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ +#endif #define ECANCELED 89 /* Operation canceled */ #define EIDRM 90 /* Identifier removed */ #define ENOMSG 91 /* No message of desired type */ #define EILSEQ 92 /* Illegal byte sequence */ -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL #define ENOATTR 93 /* Attribute not found */ -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ +#endif #define EBADMSG 94 /* Bad message */ #define EMULTIHOP 95 /* Reserved */ @@ -249,9 +250,14 @@ __END_DECLS #define ENOPOLICY 103 /* No such policy registered */ -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -#define ELAST 103 /* Must be equal largest errno */ -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ +#if __DARWIN_C_LEVEL >= 200809L +#define ENOTRECOVERABLE 104 /* 
State not recoverable */ +#define EOWNERDEAD 105 /* Previous owner died */ +#endif + +#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL +#define ELAST 105 /* Must be equal largest errno */ +#endif #ifdef KERNEL /* pseudo-errors returned inside kernel to modify return to process */ @@ -261,6 +267,10 @@ __END_DECLS #ifdef BSD_KERNEL_PRIVATE #define ERECYCLE (-5) /* restart lookup under heavy vnode pressure/recycling */ #define EREDRIVEOPEN (-6) +#define EKEEPLOOKING (-7) +/* used for cvwait error returns to Libc */ +#define ECVCERORR 256 +#define ECVPERORR 512 #else /* BSD_KERNEL_PRIVATE */ /* -5 and -6 are reserved for kernel internal use */ #endif /* BSD_KERNEL_PRIVATE */ diff --git a/bsd/sys/event.h b/bsd/sys/event.h index abbd60045..05b31174a 100644 --- a/bsd/sys/event.h +++ b/bsd/sys/event.h @@ -70,9 +70,10 @@ #define EVFILT_MACHPORT (-8) /* Mach portsets */ #define EVFILT_FS (-9) /* Filesystem events */ #define EVFILT_USER (-10) /* User events */ -#define EVFILT_SESSION (-11) /* Audit session events */ + /* (-11) unused */ +#define EVFILT_VM (-12) /* Virtual memory events */ -#define EVFILT_SYSCOUNT 11 +#define EVFILT_SYSCOUNT 12 #define EVFILT_THREADMARKER EVFILT_SYSCOUNT /* Internal use only */ #pragma pack(4) @@ -191,7 +192,6 @@ struct kevent64_s { * On input, NOTE_TRIGGER causes the event to be triggered for output. */ #define NOTE_TRIGGER 0x01000000 -#define EV_TRIGGER 0x0100 /*deprecated--for backwards compatibility only*/ /* * On input, the top two bits of fflags specifies how the lower twenty four @@ -233,16 +233,26 @@ struct kevent64_s { * that hangs off the proc structure. They also both play games with the hint * passed to KNOTE(). If NOTE_SIGNAL is passed as a hint, then the lower bits * of the hint contain the signal. IF NOTE_FORK is passed, then the lower bits - * contain the PID of the child. + * contain the PID of the child. 
*/ #define NOTE_EXIT 0x80000000 /* process exited */ #define NOTE_FORK 0x40000000 /* process forked */ #define NOTE_EXEC 0x20000000 /* process exec'd */ #define NOTE_REAP 0x10000000 /* process reaped */ #define NOTE_SIGNAL 0x08000000 /* shared with EVFILT_SIGNAL */ +#define NOTE_EXITSTATUS 0x04000000 /* exit status to be returned, valid for child process only */ +#define NOTE_RESOURCEEND 0x02000000 /* resource limit reached, resource type returned */ #define NOTE_PDATAMASK 0x000fffff /* mask for pid/signal */ #define NOTE_PCTRLMASK (~NOTE_PDATAMASK) +/* + * data/hint fflags for EVFILT_VM, shared with userspace. + */ +#define NOTE_VM_PRESSURE 0x80000000 /* will react on memory pressure */ +#define NOTE_VM_PRESSURE_TERMINATE 0x40000000 /* will quit on memory pressure, possibly after cleaning up dirty state */ +#define NOTE_VM_PRESSURE_SUDDEN_TERMINATE 0x20000000 /* will quit immediately on memory pressure */ +#define NOTE_VM_ERROR 0x10000000 /* there was an error */ + /* * data/hint fflags for EVFILT_TIMER, shared with userspace. * The default is a (repeating) interval timer with the data @@ -258,7 +268,7 @@ struct kevent64_s { /* * data/hint fflags for EVFILT_MACHPORT, shared with userspace. * - * Only portsets are support at this time. + * Only portsets are supported at this time. * * The fflags field can optionally contain the MACH_RCV_MSG, MACH_RCV_LARGE, * and related trailer receive options as defined in . @@ -275,29 +285,6 @@ struct kevent64_s { * contains the name of the actual port detected with a message waiting. */ -/* - * data/hint fflags for EVFILT_SESSION, shared with userspace. - * - * The kevent ident field should be set to AU_SESSION_ANY_ASID if interested - * in events for any session. - * - * NOTE_AS_UPDATE may be going away since struct auditinfo_addr may become - * immutable once initially set. 
- */ -#define NOTE_AS_START 0x00000001 /* start of new session */ -#define NOTE_AS_END 0x00000002 /* start of new session */ -#define NOTE_AS_ERR 0x00000004 /* error tracking new session */ -#define NOTE_AS_CLOSE 0x00000008 /* currently unsupported */ -#define NOTE_AS_UPDATE 0x00000010 /* session data updated */ - -/* - * Kevent ident value for any session. - */ -#define AS_ANY_ASID 0xFFFFFFFF - -struct au_sentry; /* Audit session entry */ - - /* * DEPRECATED!!!!!!!!! * NOTE_TRACK, NOTE_TRACKERR, and NOTE_CHILD are no longer supported as of 10.5 @@ -338,7 +325,6 @@ struct knote { struct fileproc *p_fp; /* file data pointer */ struct proc *p_proc; /* proc pointer */ struct ipc_pset *p_pset; /* pset pointer */ - struct au_sentry *p_se; /* Audit session ptr */ } kn_ptr; struct filterops *kn_fop; int kn_status; /* status bits */ @@ -378,7 +364,7 @@ struct filterops { /* Optional f_touch operation, called only if !f_isfd && non-NULL */ void (*f_touch)(struct knote *kn, struct kevent64_s *kev, long type); /* Optional f_peek operation, called only if KN_STAYQUEUED is set */ - int (*f_peek)(struct knote *kn); + unsigned (*f_peek)(struct knote *kn); }; struct proc; @@ -399,6 +385,7 @@ extern int knote_detach(struct klist *list, struct knote *kn); extern int knote_link_wait_queue(struct knote *kn, struct wait_queue *wq); extern void knote_unlink_wait_queue(struct knote *kn, struct wait_queue *wq); extern void knote_fdclose(struct proc *p, int fd); +extern void knote_markstayqueued(struct knote *kn); #endif /* !KERNEL_PRIVATE */ diff --git a/bsd/sys/fasttrap_impl.h b/bsd/sys/fasttrap_impl.h index 259841c70..a4017cc41 100644 --- a/bsd/sys/fasttrap_impl.h +++ b/bsd/sys/fasttrap_impl.h @@ -201,10 +201,7 @@ extern int fasttrap_tracepoint_init(proc_t *, fasttrap_tracepoint_t *, extern int fasttrap_tracepoint_install(proc_t *, fasttrap_tracepoint_t *); extern int fasttrap_tracepoint_remove(proc_t *, fasttrap_tracepoint_t *); -#if defined (__ppc__) || defined (__ppc64__) -extern int 
fasttrap_pid_probe(ppc_saved_state_t *regs); -extern int fasttrap_return_probe(ppc_saved_state_t* regs); -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) extern int fasttrap_pid_probe(x86_saved_state_t *regs); extern int fasttrap_return_probe(x86_saved_state_t* regs); #else diff --git a/bsd/sys/fbt.h b/bsd/sys/fbt.h index c72208c99..3796443ea 100644 --- a/bsd/sys/fbt.h +++ b/bsd/sys/fbt.h @@ -29,9 +29,7 @@ #ifndef _FBT_H #define _FBT_H -#if defined (__ppc__) || defined (__ppc64__) -typedef uint32_t machine_inst_t; -#elif defined(__i386__) || defined (__x86_64__) +#if defined(__i386__) || defined (__x86_64__) typedef uint8_t machine_inst_t; #else #error Unknown Architecture @@ -45,18 +43,25 @@ typedef struct fbt_probe { int8_t fbtp_rval; machine_inst_t fbtp_patchval; machine_inst_t fbtp_savedval; + machine_inst_t fbtp_currentval; uintptr_t fbtp_roffset; dtrace_id_t fbtp_id; + /* FIXME! + * This field appears to only be used in error messages. + * It puts this structure into the next size bucket in kmem_alloc + * wasting 32 bytes per probe. (in i386 only) + */ char fbtp_name[MAX_FBTP_NAME_CHARS]; struct modctl *fbtp_ctl; int fbtp_loadcnt; +#if !defined(__APPLE__) int fbtp_symndx; - int fbtp_primary; +#endif struct fbt_probe *fbtp_next; } fbt_probe_t; extern int dtrace_invop(uintptr_t, uintptr_t *, uintptr_t); extern int fbt_invop(uintptr_t, uintptr_t *, uintptr_t); extern void fbt_provide_module(void *, struct modctl *); - +extern int fbt_enable (void *arg, dtrace_id_t id, void *parg); #endif /* _FBT_H */ diff --git a/bsd/sys/fcntl.h b/bsd/sys/fcntl.h index e9302b184..f6cbe9d5a 100644 --- a/bsd/sys/fcntl.h +++ b/bsd/sys/fcntl.h @@ -77,6 +77,9 @@ */ #include #include +#ifndef KERNEL +#include +#endif /* We should not be exporting size_t here. Temporary for gcc bootstrapping. 
*/ #ifndef _SIZE_T @@ -168,6 +171,14 @@ typedef __darwin_pid_t pid_t; #define O_DSYNC 0x400000 /* synch I/O data integrity */ #endif +#ifdef KERNEL +#define FNODIRECT 0x800000 /* fcntl(F_NODIRECT, 1) */ +#endif + +#if __DARWIN_C_LEVEL >= 200809L +#define O_CLOEXEC 0x1000000 /* implicitly set FD_CLOEXEC */ +#endif + #ifdef KERNEL /* convert from open() flags to/from fflags; convert O_RD/WR to FREAD/FWRITE */ #define FFLAGS(oflags) ((oflags) + 1) @@ -220,6 +231,7 @@ typedef __darwin_pid_t pid_t; #define F_SETLK 8 /* set record locking information */ #define F_SETLKW 9 /* F_SETLK; wait if blocked */ #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#define F_FLUSH_DATA 40 #define F_CHKCLEAN 41 /* Used for regression test */ #define F_PREALLOCATE 42 /* Preallocate storage */ #define F_SETSIZE 43 /* Truncate a file without zeroing space */ @@ -248,14 +260,35 @@ typedef __darwin_pid_t pid_t; #define F_ADDFILESIGS 61 /* add signature from same file (used by dyld for shared libs) */ -#define F_GETPROTECTIONCLASS 62 /* Get the protection class of a file from the EA, returns int */ -#define F_SETPROTECTIONCLASS 63 /* Set the protection class of a file for the EA, requires int */ +#define F_NODIRECT 62 /* used in conjunction with F_NOCACHE to indicate that DIRECT, synchronous writes */ + /* should not be used (i.e.
it's ok to temporarily create cached pages) */ + +#define F_GETPROTECTIONCLASS 63 /* Get the protection class of a file from the EA, returns int */ +#define F_SETPROTECTIONCLASS 64 /* Set the protection class of a file for the EA, requires int */ + +#define F_LOG2PHYS_EXT 65 /* file offset to device offset, extended */ + +#define F_GETLKPID 66 /* get record locking information, per-process */ + +#ifdef PRIVATE +#define F_MOVEDATAEXTENTS 69 /* Swap only the data associated with two files */ +#endif + +#define F_SETBACKINGSTORE 70 /* Mark the file as being the backing store for another filesystem */ +#define F_GETPATH_MTMINFO 71 /* return the full path of the FD, but error in specific mtmd circumstances */ + +#define F_SETNOSIGPIPE 73 /* No SIGPIPE generated on EPIPE */ +#define F_GETNOSIGPIPE 74 /* Status of SIGPIPE for this fd */ // FS-specific fcntl()'s numbers begin at 0x00010000 and go up #define FCNTL_FS_SPECIFIC_BASE 0x00010000 #endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ +#if __DARWIN_C_LEVEL >= 200809L +#define F_DUPFD_CLOEXEC 67 /* mark the dup with FD_CLOEXEC */ +#endif + /* file descriptor flags (F_GETFD, F_SETFD) */ #define FD_CLOEXEC 1 /* close-on-exec flag */ @@ -296,7 +329,7 @@ typedef __darwin_pid_t pid_t; #define S_IFLNK 0120000 /* [XSI] symbolic link */ #define S_IFSOCK 0140000 /* [XSI] socket */ #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -#define S_IFWHT 0160000 /* whiteout */ +#define S_IFWHT 0160000 /* OBSOLETE: whiteout */ #endif /* File mode */ @@ -464,13 +497,22 @@ typedef struct user_fbootstraptransfer { * For them the fcntl will nedd to switch from using BMAP to CMAP * and a per filesystem type flag will be needed to interpret the * contiguous bytes count result from CMAP. + * + * F_LOG2PHYS_EXT is a variant of F_LOG2PHYS that uses a passed in + * file offset and length instead of the current file offset. + * F_LOG2PHYS_EXT operates on the same structure as F_LOG2PHYS, but + * treats it as an in/out.
*/ #pragma pack(4) struct log2phys { - unsigned int l2p_flags; /* unused so far */ - off_t l2p_contigbytes; /* unused so far */ - off_t l2p_devoffset; /* bytes into device */ + unsigned int l2p_flags; /* unused so far */ + off_t l2p_contigbytes; /* F_LOG2PHYS: unused so far */ + /* F_LOG2PHYS_EXT: IN: number of bytes to be queried */ + /* OUT: number of contiguous bytes at this position */ + off_t l2p_devoffset; /* F_LOG2PHYS: OUT: bytes into device */ + /* F_LOG2PHYS_EXT: IN: bytes into file */ + /* OUT: bytes into device */ }; #pragma pack() @@ -544,6 +586,13 @@ int fcntl(int, int, ...) __DARWIN_ALIAS_C(fcntl); #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) #ifdef PRIVATE +/* + * These definitions are retained temporarily for compatibility. + * If you want to use fileports, please use + * #include + * or + * #include + */ #ifndef _FILEPORT_T #define _FILEPORT_T typedef __darwin_mach_port_t fileport_t; @@ -561,7 +610,7 @@ void filesec_free(filesec_t); int filesec_get_property(filesec_t, filesec_property_t, void *); int filesec_query_property(filesec_t, filesec_property_t, int *); int filesec_set_property(filesec_t, filesec_property_t, const void *); -int filesec_unset_property(filesec_t, filesec_property_t); +int filesec_unset_property(filesec_t, filesec_property_t) __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_3_2); #define _FILESEC_UNSET_PROPERTY ((void *)0) #define _FILESEC_REMOVE_ACL ((void *)1) #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ diff --git a/bsd/sys/file.h b/bsd/sys/file.h index b236f0840..bd3629144 100644 --- a/bsd/sys/file.h +++ b/bsd/sys/file.h @@ -81,6 +81,8 @@ #define _KAUTH_CRED_T struct ucred; typedef struct ucred *kauth_cred_t; +struct posix_cred; +typedef struct posix_cred *posix_cred_t; #endif /* !_KAUTH_CRED_T */ #pragma pack(4) diff --git a/bsd/sys/file_internal.h b/bsd/sys/file_internal.h index 7592c745d..9fcb4d1f3 100644 --- a/bsd/sys/file_internal.h +++ b/bsd/sys/file_internal.h @@ -105,14 +105,17 @@ struct fileproc 
{ #define FP_WAITCLOSE 0x0040 #define FP_AIOISSUED 0x0080 #define FP_WAITEVENT 0x0100 +#define FP_SELCONFLICT 0x0200 /* select conflict on an individual fp */ -#define FP_VALID_FLAGS (FP_INCREATE | FP_INCLOSE | FP_INSELECT | FP_INCHRREAD | FP_WRITTEN | FP_WRITTEN | FP_CLOSING | FP_WAITCLOSE | FP_AIOISSUED | FP_WAITEVENT) +#define FP_VALID_FLAGS (FP_INCREATE | FP_INCLOSE | FP_INSELECT | FP_INCHRREAD | FP_WRITTEN | FP_CLOSING | FP_WAITCLOSE | FP_AIOISSUED | FP_WAITEVENT | FP_SELCONFLICT) #ifndef _KAUTH_CRED_T #define _KAUTH_CRED_T struct ucred; typedef struct ucred *kauth_cred_t; +struct posix_cred; +typedef struct posix_cred *posix_cred_t; #endif /* !_KAUTH_CRED_T */ /* file types */ @@ -133,6 +136,7 @@ typedef enum { #define FG_RMMSGQ 0x08 /* the fileglob is being removed from msgqueue */ #define FG_WRMMSGQ 0x10 /* wait for the fileglob to be removed from msgqueue */ #define FG_PORTMADE 0x20 /* a port was at some point created for this fileglob */ +#define FG_NOSIGPIPE 0x40 /* don't deliver SIGPIPE with EPIPE return */ struct fileglob { LIST_ENTRY(fileglob) f_list;/* list of active files */ @@ -159,11 +163,9 @@ struct fileglob { int (*fo_drain) (struct fileproc *fp, vfs_context_t ctx); } *fg_ops; off_t fg_offset; - caddr_t fg_data; /* vnode or socket or SHM or semaphore */ + void *fg_data; /* vnode or socket or SHM or semaphore */ lck_mtx_t fg_lock; int32_t fg_lflags; /* file global flags */ - unsigned int fg_lockpc[4]; - unsigned int fg_unlockpc[4]; #if CONFIG_MACF struct label *fg_label; /* JMM - use the one in the cred? 
*/ #endif diff --git a/bsd/sys/filedesc.h b/bsd/sys/filedesc.h index 7ea50f5a9..740e2d6f4 100644 --- a/bsd/sys/filedesc.h +++ b/bsd/sys/filedesc.h @@ -121,7 +121,9 @@ struct filedesc { #ifdef KERNEL #define UF_RESVWAIT 0x10 /* close in progress */ -#define UF_VALID_FLAGS (UF_EXCLOSE| UF_RESERVED | UF_CLOSING | UF_RESVWAIT) +#define UF_INHERIT 0x20 /* "inherit-on-exec" */ +#define UF_VALID_FLAGS \ + (UF_EXCLOSE | UF_RESERVED | UF_CLOSING | UF_RESVWAIT | UF_INHERIT) #endif /* KERNEL */ /* @@ -148,7 +150,7 @@ extern void ffree(struct file *fp); #ifdef __APPLE_API_PRIVATE extern struct filedesc *fdcopy(proc_t p, struct vnode *uth_cdir); extern void fdfree(proc_t p); -extern void fdexec(proc_t p); +extern void fdexec(proc_t p, short flags); #endif /* __APPLE_API_PRIVATE */ #endif /* KERNEL */ diff --git a/osfmk/ppc/machine_cpu.h b/bsd/sys/fileport.h similarity index 68% rename from osfmk/ppc/machine_cpu.h rename to bsd/sys/fileport.h index 88fe14def..779179baf 100644 --- a/osfmk/ppc/machine_cpu.h +++ b/bsd/sys/fileport.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,27 +25,32 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#ifndef _PPC_MACHINE_CPU_H_ -#define _PPC_MACHINE_CPU_H_ -#include -#include -#include -#include +#ifndef _SYS_FILEPORT_H_ +#define _SYS_FILEPORT_H_ -extern void cpu_machine_init( - void); +#include +#include -extern void cpu_doshutdown( - void); +#ifndef KERNEL -extern void cpu_signal_handler( - void); +__BEGIN_DECLS -typedef void (*broadcastFunc) (uint32_t); +#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -int32_t cpu_broadcast(uint32_t *, broadcastFunc, uint32_t); +#ifndef _FILEPORT_T +#define _FILEPORT_T +typedef __darwin_mach_port_t fileport_t; +#define FILEPORT_NULL ((fileport_t)0) +#endif /* _FILEPORT_T */ -#define cpu_pause() /* Not for this architecture */ +int fileport_makeport(int, fileport_t *); +int fileport_makefd(fileport_t); -#endif /* _PPC_MACHINE_CPU_H_ */ +#endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ + +__END_DECLS + +#endif /* !KERNEL */ + +#endif /* !_SYS_FILEPORT_H_ */ diff --git a/bsd/sys/fsctl.h b/bsd/sys/fsctl.h index b70ba0651..40c6b10b1 100644 --- a/bsd/sys/fsctl.h +++ b/bsd/sys/fsctl.h @@ -70,26 +70,161 @@ #define _SYS_FSCTL_H_ #include +#include -#define FSIOC_SYNC_VOLUME _IOW('A', 1, uint32_t) -#define FSCTL_SYNC_VOLUME IOCBASECMD(FSIOC_SYNC_VOLUME) +#ifdef XNU_KERNEL_PRIVATE -#define FSCTL_SYNC_FULLSYNC (1<<0) /* Flush the data fully to disk, if supported by the filesystem */ -#define FSCTL_SYNC_WAIT (1<<1) /* Wait for the sync to complete */ +typedef struct user64_namespace_handler_info { + user64_addr_t token; + user64_addr_t flags; + user64_addr_t fdptr; +} user64_namespace_handler_info; + +typedef struct user32_namespace_handler_info { + user32_addr_t token; + user32_addr_t flags; + user32_addr_t fdptr; +} user32_namespace_handler_info; + +typedef struct namespace_handler_info { + user_addr_t token; + user_addr_t flags; + user_addr_t fdptr; +} namespace_handler_info; + +typedef struct 
user64_namespace_handler_info_ext { + user64_addr_t token; + user64_addr_t flags; + user64_addr_t fdptr; + user64_addr_t infoptr; +} user64_namespace_handler_info_ext; + +typedef struct user32_namespace_handler_info_ext { + user32_addr_t token; + user32_addr_t flags; + user32_addr_t fdptr; + user32_addr_t infoptr; +} user32_namespace_handler_info_ext; + +typedef struct namespace_handler_info_ext { + user_addr_t token; + user_addr_t flags; + user_addr_t fdptr; + user_addr_t infoptr; +} namespace_handler_info_ext; + +extern int resolve_nspace_item(struct vnode *vp, uint64_t op); +extern int resolve_nspace_item_ext(struct vnode *vp, uint64_t op, void *arg); +extern int get_nspace_item_status(struct vnode *vp, int32_t *status); + +#else + +typedef struct namespace_handler_info { + int32_t *token; + int64_t *flags; + int32_t *fdptr; +} namespace_handler_info; + +typedef struct namespace_handler_info_ext { + int32_t *token; + int64_t *flags; + int32_t *fdptr; + int64_t *infoptr; // for snapshot write events, the kernel puts an offset/length pair here +} namespace_handler_info_ext; + + +#endif /* XNU_KERNEL_PRIVATE */ + +#define NAMESPACE_HANDLER_READ_OP 0x0001 +#define NAMESPACE_HANDLER_WRITE_OP 0x0002 +#define NAMESPACE_HANDLER_DELETE_OP 0x0004 +#define NAMESPACE_HANDLER_TRUNCATE_OP 0x0008 +#define NAMESPACE_HANDLER_RENAME_OP 0x0010 +#define NAMESPACE_HANDLER_METADATA_WRITE_OP 0x0020 +#define NAMESPACE_HANDLER_METADATA_DELETE_OP 0x0040 +#define NAMESPACE_HANDLER_METADATA_MOD 0x0080 +#define NAMESPACE_HANDLER_LINK_CREATE 0x0200 + +#define NAMESPACE_HANDLER_NSPACE_EVENT 0x1000 +#define NAMESPACE_HANDLER_SNAPSHOT_EVENT 0x0100 +#define NAMESPACE_HANDLER_TRACK_EVENT 0x2000 + +#define NAMESPACE_HANDLER_EVENT_TYPE_MASK (NAMESPACE_HANDLER_NSPACE_EVENT | NAMESPACE_HANDLER_SNAPSHOT_EVENT | NAMESPACE_HANDLER_TRACK_EVENT) + +#define DATALESS_CMPFS_TYPE 0x80000001 +typedef int32_t nspace_handler_info[2]; +typedef char fstypename_t[MFSTYPENAMELEN]; + +#ifdef KERNEL + +typedef struct 
user64_package_ext_info { + user64_addr_t strings; + uint32_t num_entries; + uint32_t max_width; +} user64_package_ext_info; + +typedef struct user32_package_ext_info { + user32_addr_t strings; + uint32_t num_entries; + uint32_t max_width; +} user32_package_ext_info; + +#endif // KERNEL + typedef struct package_ext_info { const char *strings; uint32_t num_entries; uint32_t max_width; } package_ext_info; -#define FSIOC_SET_PACKAGE_EXTS _IOW('A', 2, struct package_ext_info) -#define FSCTL_SET_PACKAGE_EXTS IOCBASECMD(FSIOC_SET_PACKAGE_EXTS) +#define FSCTL_SYNC_FULLSYNC (1<<0) /* Flush the data fully to disk, if supported by the filesystem */ +#define FSCTL_SYNC_WAIT (1<<1) /* Wait for the sync to complete */ + + +#define FSIOC_SYNC_VOLUME _IOW('A', 1, uint32_t) +#define FSCTL_SYNC_VOLUME IOCBASECMD(FSIOC_SYNC_VOLUME) + +#define FSIOC_SET_PACKAGE_EXTS _IOW('A', 2, struct package_ext_info) +#define FSCTL_SET_PACKAGE_EXTS IOCBASECMD(FSIOC_SET_PACKAGE_EXTS) + +#define FSIOC_WAIT_FOR_SYNC _IOR('A', 3, int32_t) +#define FSCTL_WAIT_FOR_SYNC IOCBASECMD(FSIOC_WAIT_FOR_SYNC) + +#define FSIOC_NAMESPACE_HANDLER_GET _IOW('A', 4, struct namespace_handler_info) +#define FSCTL_NAMESPACE_HANDLER_GET IOCBASECMD(FSIOC_NAMESPACE_HANDLER_GET) + +#define FSIOC_NAMESPACE_HANDLER_UPDATE _IOW('A', 5, nspace_handler_info) +#define FSCTL_NAMESPACE_HANDLER_UPDATE IOCBASECMD(FSIOC_NAMESPACE_HANDLER_UPDATE) + +#define FSIOC_NAMESPACE_HANDLER_UNBLOCK _IOW('A', 6, nspace_handler_info) +#define FSCTL_NAMESPACE_HANDLER_UNBLOCK IOCBASECMD(FSIOC_NAMESPACE_HANDLER_UNBLOCK) -#define FSIOC_WAIT_FOR_SYNC _IOR('A', 3, int32_t) -#define FSCTL_WAIT_FOR_SYNC IOCBASECMD(FSIOC_WAIT_FOR_SYNC) +#define FSIOC_NAMESPACE_HANDLER_CANCEL _IOW('A', 7, nspace_handler_info) +#define FSCTL_NAMESPACE_HANDLER_CANCEL IOCBASECMD(FSIOC_NAMESPACE_HANDLER_CANCEL) +#define FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME _IOW('A', 8, int32_t) +#define FSCTL_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME 
IOCBASECMD(FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME) + +#define FSIOC_OLD_SNAPSHOT_HANDLER_GET _IOW('A', 9, struct namespace_handler_info) +#define FSCTL_OLD_SNAPSHOT_HANDLER_GET IOCBASECMD(FSIOC_OLD_SNAPSHOT_HANDLER_GET) + +#define FSIOC_SET_FSTYPENAME_OVERRIDE _IOW('A', 10, fstypename_t) +#define FSCTL_SET_FSTYPENAME_OVERRIDE IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE) + +#define FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS _IOW('A', 11, int32_t) +#define FSCTL_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS IOCBASECMD(FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS) + +#define FSIOC_TRACKED_HANDLER_GET _IOW('A', 12, struct namespace_handler_info) +#define FSCTL_TRACKED_HANDLER_GET IOCBASECMD(FSIOC_TRACKED_HANDLER_GET) + +#define FSIOC_SNAPSHOT_HANDLER_GET_EXT _IOW('A', 13, struct namespace_handler_info_ext) +#define FSCTL_SNAPSHOT_HANDLER_GET_EXT IOCBASECMD(FSIOC_SNAPSHOT_HANDLER_GET_EXT) + +// +// IO commands 14, 15, 16, and 17 are currently unused +// // // Spotlight and fseventsd use these fsctl()'s to find out @@ -104,27 +239,10 @@ typedef struct package_ext_info { // or else it will break binary compatibility with mds // and fseventsd. 
// -#define SPOTLIGHT_IOC_GET_MOUNT_TIME _IOR('h', 18, u_int32_t) -#define SPOTLIGHT_FSCTL_GET_MOUNT_TIME IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME) -#define SPOTLIGHT_IOC_GET_LAST_MTIME _IOR('h', 19, u_int32_t) -#define SPOTLIGHT_FSCTL_GET_LAST_MTIME IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME) - - -#ifdef KERNEL - -typedef struct user64_package_ext_info { - user64_addr_t strings; - uint32_t num_entries; - uint32_t max_width; -} user64_package_ext_info; - -typedef struct user32_package_ext_info { - user32_addr_t strings; - uint32_t num_entries; - uint32_t max_width; -} user32_package_ext_info; - -#endif // KERNEL +#define SPOTLIGHT_IOC_GET_MOUNT_TIME _IOR('h', 18, u_int32_t) +#define SPOTLIGHT_FSCTL_GET_MOUNT_TIME IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME) +#define SPOTLIGHT_IOC_GET_LAST_MTIME _IOR('h', 19, u_int32_t) +#define SPOTLIGHT_FSCTL_GET_LAST_MTIME IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME) #ifndef KERNEL diff --git a/bsd/sys/fsevents.h b/bsd/sys/fsevents.h index e5cb3ec3c..82c16ac48 100644 --- a/bsd/sys/fsevents.h +++ b/bsd/sys/fsevents.h @@ -161,6 +161,7 @@ typedef struct fse_info { } fse_info; int get_fse_info(struct vnode *vp, fse_info *fse, vfs_context_t ctx); +int vnode_get_fse_info_from_vap(vnode_t vp, fse_info *fse, struct vnode_attr *vap); char *get_pathbuff(void); void release_pathbuff(char *path); diff --git a/bsd/sys/fslog.h b/bsd/sys/fslog.h index c1bee8c64..1266f3075 100644 --- a/bsd/sys/fslog.h +++ b/bsd/sys/fslog.h @@ -87,6 +87,14 @@ void fslog_fs_corrupt(struct mount *mnt); void fslog_io_error(const buf_t bp); #endif /* BSD_KERNEL_PRIVATE */ + +#ifdef XNU_KERNEL_PRIVATE + +/* Log information about external modification of a target process */ +void fslog_extmod_msgtracer(proc_t caller, proc_t target); + +#endif /* XNU_KERNEL_PRIVATE */ + #endif /* KERNEL */ /* Keys used by FSLog */ diff --git a/bsd/sys/imageboot.h b/bsd/sys/imageboot.h index 9ab02b5ab..a77c9cca8 100644 --- a/bsd/sys/imageboot.h +++ b/bsd/sys/imageboot.h @@ -1,5 +1,5 @@ /* - * 
Copyright (c) 2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2006-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -28,7 +28,12 @@ #ifndef _IMAGEBOOT_H_ #define _IMAGEBOOT_H_ -int imageboot_needed(void); -int imageboot_setup(void); +int imageboot_needed(void); +void imageboot_setup(void); +int imageboot_format_is_valid(const char *root_path); +int imageboot_mount_image(const char *root_path, int height); + +#define IMAGEBOOT_CONTAINER_ARG "container-dmg" +#define IMAGEBOOT_ROOT_ARG "root-dmg" #endif diff --git a/bsd/sys/imgact.h b/bsd/sys/imgact.h index fa9be0460..0a194b779 100644 --- a/bsd/sys/imgact.h +++ b/bsd/sys/imgact.h @@ -85,15 +85,23 @@ struct image_params { char *ip_vdata; /* file data (up to one page) */ int ip_flags; /* image flags */ int ip_argc; /* argument count */ - char *ip_argv; /* argument vector beginning */ int ip_envc; /* environment count */ + int ip_applec; /* apple vector count */ + + char *ip_startargv; /* argument vector beginning */ + char *ip_endargv; /* end of argv/start of envv */ + char *ip_endenvv; /* end of envv/start of applev */ + char *ip_strings; /* base address for strings */ char *ip_strendp; /* current end pointer */ - char *ip_strendargvp; /* end of argv/start of envp */ - int ip_strspace; /* remaining space */ + + int ip_argspace; /* remaining space of NCARGS limit (argv+envv) */ + int ip_strspace; /* remaining total string space */ + user_size_t ip_arch_offset; /* subfile offset in ip_vp */ user_size_t ip_arch_size; /* subfile length in ip_vp */ - char ip_interp_name[IMG_SHSIZE]; /* interpreter name */ + char ip_interp_buffer[IMG_SHSIZE]; /* interpreter buffer space */ + int ip_interp_sugid_fd; /* fd for sugid script */ /* Next two fields are for support of architecture translation... 
*/ char *ip_p_comm; /* optional alt p->p_comm */ @@ -112,14 +120,16 @@ struct image_params { /* * Image flags */ -#define IMGPF_NONE 0x00000000 /* No flags */ -#define IMGPF_INTERPRET 0x00000001 /* Interpreter invoked */ -#define IMGPF_POWERPC 0x00000002 /* ppc mode for x86 */ +#define IMGPF_NONE 0x00000000 /* No flags */ +#define IMGPF_INTERPRET 0x00000001 /* Interpreter invoked */ +#define IMGPF_POWERPC 0x00000002 /* ppc mode for x86 */ #if CONFIG_EMBEDDED #undef IMGPF_POWERPC #endif -#define IMGPF_WAS_64BIT 0x00000004 /* exec from a 64Bit binary */ -#define IMGPF_IS_64BIT 0x00000008 /* exec to a 64Bit binary */ -#define IMGPF_SPAWN 0x00000010 /* spawn (without setexec) */ +#define IMGPF_WAS_64BIT 0x00000004 /* exec from a 64Bit binary */ +#define IMGPF_IS_64BIT 0x00000008 /* exec to a 64Bit binary */ +#define IMGPF_SPAWN 0x00000010 /* spawn (without setexec) */ +#define IMGPF_DISABLE_ASLR 0x00000020 /* disable ASLR */ +#define IMGPF_ALLOW_DATA_EXEC 0x00000040 /* forcibly disallow data execution */ #endif /* !_SYS_IMGACT */ diff --git a/bsd/ppc/ptrace.h b/bsd/sys/imgsrc.h similarity index 74% rename from bsd/ppc/ptrace.h rename to bsd/sys/imgsrc.h index be9af6886..aac577176 100644 --- a/bsd/ppc/ptrace.h +++ b/bsd/sys/imgsrc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,8 +25,9 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ /* - * Copyright (c) 1992, 1993 + * Copyright (c) 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -57,11 +58,42 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)ptrace.h 8.1 (Berkeley) 6/11/93 */ -/* - * Machine dependent trace commands. 
- * - * None for the PowerPC at this time. +#ifndef _SYS_IMGSRC_H_ +#define _SYS_IMGSRC_H_ + +#include +/* + * For mount(2), defined here for easy use with System.framework/PrivateHeaders. */ +#define MNT_IMGSRC_BY_INDEX 0x20000000 + +typedef struct imgsrc_info +{ + uint32_t ii_height; /* Nesting height: 0 is outermost */ + uint32_t ii_flags; /* Currently unused */ + dev_t ii_dev; /* dev_t for this volume */ + char ii_reserved[24];/* TBD */ +} *imgsrc_info_t; + +struct mnt_imgsrc_args { + uint32_t mi_height; /* As determined from an imgsrc_info structure */ + uint32_t mi_flags; /* TBD */ + const char* mi_devpath; /* Path to devnode */ +}; + +#ifdef BSD_KERNEL_PRIVATE +struct user64_mnt_imgsrc_args { + uint32_t mi_height; + uint32_t mi_flags; + user64_addr_t mi_devpath; +}; + +struct user32_mnt_imgsrc_args { + uint32_t mi_height; + uint32_t mi_flags; + user32_addr_t mi_devpath; +}; +#endif /* BSD_KERNEL_PRIVATE */ +#endif /* _SYS_IMGSRC_H_ */ diff --git a/bsd/sys/kauth.h b/bsd/sys/kauth.h index 33078a1f4..94f0b1e1e 100644 --- a/bsd/sys/kauth.h +++ b/bsd/sys/kauth.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2007 Apple Inc. All rights reserved. + * Copyright (c) 2004-2010 Apple Inc. All rights reserved.
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -106,11 +106,14 @@ struct kauth_identity_extlookup { #define KAUTH_EXTLOOKUP_WANT_MEMBERSHIP (1<<12) #define KAUTH_EXTLOOKUP_VALID_MEMBERSHIP (1<<13) #define KAUTH_EXTLOOKUP_ISMEMBER (1<<14) +#define KAUTH_EXTLOOKUP_VALID_PWNAM (1<<15) +#define KAUTH_EXTLOOKUP_WANT_PWNAM (1<<16) +#define KAUTH_EXTLOOKUP_VALID_GRNAM (1<<17) +#define KAUTH_EXTLOOKUP_WANT_GRNAM (1<<18) __darwin_pid_t el_info_pid; /* request on behalf of PID */ + u_int64_t el_extend; /* extension field */ u_int32_t el_info_reserved_1; /* reserved (APPLE) */ - u_int32_t el_info_reserved_2; /* reserved (APPLE) */ - u_int32_t el_info_reserved_3; /* reserved (APPLE) */ uid_t el_uid; /* user ID */ guid_t el_uguid; /* user GUID */ @@ -177,7 +180,6 @@ struct kauth_cred { int kc_nwhtgroups; /* whiteout group list */ gid_t *kc_whtgroups; - struct auditinfo cr_au; struct au_session cr_audit; /* user auditing data */ int kc_nsupplement; /* entry count in supplemental data pointer array */ @@ -192,6 +194,16 @@ struct kauth_cred { /* Kernel SPI for now */ __BEGIN_DECLS +/* + * Routines specific to credentials with POSIX credential labels attached + * + * XXX Should be in policy_posix.h, with struct posix_cred + */ +extern kauth_cred_t posix_cred_create(posix_cred_t pcred); +extern posix_cred_t posix_cred_get(kauth_cred_t cred); +extern void posix_cred_label(kauth_cred_t cred, posix_cred_t pcred); +extern int posix_cred_access(kauth_cred_t cred, id_t object_uid, id_t object_gid, mode_t object_mode, mode_t mode_req); + extern uid_t kauth_getuid(void); extern uid_t kauth_getruid(void); extern gid_t kauth_getgid(void); @@ -221,7 +233,15 @@ extern int kauth_proc_label_update(struct proc *p, void *label); extern kauth_cred_t kauth_cred_find(kauth_cred_t cred); extern uid_t kauth_cred_getuid(kauth_cred_t _cred); +extern uid_t kauth_cred_getruid(kauth_cred_t _cred); +extern uid_t kauth_cred_getsvuid(kauth_cred_t _cred); extern gid_t kauth_cred_getgid(kauth_cred_t _cred); 
+extern gid_t kauth_cred_getrgid(kauth_cred_t _cred); +extern gid_t kauth_cred_getsvgid(kauth_cred_t _cred); +extern int kauth_cred_pwnam2guid(char *pwnam, guid_t *guidp); +extern int kauth_cred_grnam2guid(char *grnam, guid_t *guidp); +extern int kauth_cred_guid2pwnam(guid_t *guidp, char *pwnam); +extern int kauth_cred_guid2grnam(guid_t *guidp, char *grnam); extern int kauth_cred_guid2uid(guid_t *_guid, uid_t *_uidp); extern int kauth_cred_guid2gid(guid_t *_guid, gid_t *_gidp); extern int kauth_cred_ntsid2uid(ntsid_t *_sid, uid_t *_uidp); @@ -273,7 +293,7 @@ extern void kauth_cred_uthread_update(struct uthread *, proc_t); #ifdef CONFIG_MACF extern int kauth_proc_label_update_execve(struct proc *p, struct vfs_context *ctx, struct vnode *vp, struct label *scriptlabel, struct label *execlabel); #endif -extern int kauth_cred_getgroups(gid_t *_groups, int *_groupcount); +extern int kauth_cred_getgroups(kauth_cred_t _cred, gid_t *_groups, int *_groupcount); extern int kauth_cred_assume(uid_t _uid); extern int kauth_cred_gid_subset(kauth_cred_t _cred1, kauth_cred_t _cred2, int *_resultp); struct auditinfo_addr; @@ -468,6 +488,7 @@ struct kauth_acl_eval { int ae_options; #define KAUTH_AEVAL_IS_OWNER (1<<0) /* authorizing operation for owner */ #define KAUTH_AEVAL_IN_GROUP (1<<1) /* authorizing operation for groupmember */ +#define KAUTH_AEVAL_IN_GROUP_UNKNOWN (1<<2) /* authorizing operation for unknown group membership */ /* expansions for 'generic' rights bits */ kauth_ace_rights_t ae_exp_gall; kauth_ace_rights_t ae_exp_gread; diff --git a/bsd/sys/kdebug.h b/bsd/sys/kdebug.h index 9f7b789c9..393c413df 100644 --- a/bsd/sys/kdebug.h +++ b/bsd/sys/kdebug.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -47,6 +47,11 @@ __BEGIN_DECLS #include #endif /* KERNEL_BUILD */ +#ifdef XNU_KERNEL_PRIVATE +#include +#include +#endif + /* * types of faults that vm_fault handles * and creates trace entries for @@ -77,21 +82,22 @@ __BEGIN_DECLS /* The Kernel Debug Classes */ -#define DBG_MACH 1 -#define DBG_NETWORK 2 -#define DBG_FSYSTEM 3 -#define DBG_BSD 4 -#define DBG_IOKIT 5 -#define DBG_DRIVERS 6 -#define DBG_TRACE 7 +#define DBG_MACH 1 +#define DBG_NETWORK 2 +#define DBG_FSYSTEM 3 +#define DBG_BSD 4 +#define DBG_IOKIT 5 +#define DBG_DRIVERS 6 +#define DBG_TRACE 7 #define DBG_DLIL 8 #define DBG_SECURITY 9 -#define DBG_MISC 20 -#define DBG_DYLD 31 -#define DBG_QT 32 -#define DBG_APPS 33 -#define DBG_LAUNCHD 34 -#define DBG_MIG 255 +#define DBG_CORESTORAGE 10 +#define DBG_MISC 20 +#define DBG_DYLD 31 +#define DBG_QT 32 +#define DBG_APPS 33 +#define DBG_LAUNCHD 34 +#define DBG_MIG 255 /* **** The Kernel Debug Sub Classes for Mach (DBG_MACH) **** */ #define DBG_MACH_EXCP_KTRAP_x86 0x02 /* Kernel Traps on x86 */ @@ -114,6 +120,7 @@ __BEGIN_DECLS #define DBG_MACH_MSGID_INVALID 0x50 /* Messages - invalid */ #define DBG_MACH_LOCKS 0x60 /* new lock APIs */ #define DBG_MACH_PMAP 0x70 /* pmap */ +#define DBG_MACH_MP 0x90 /* MP related */ /* Codes for Scheduler (DBG_MACH_SCHED) */ #define MACH_SCHED 0x0 /* Scheduler */ @@ -127,6 +134,16 @@ __BEGIN_DECLS #define MACH_DEMOTE 0x8 /* promotion undone */ #define MACH_IDLE 0x9 /* processor idling */ #define MACH_STACK_DEPTH 0xa /* stack depth at switch */ +#define MACH_MOVED 0xb /* did not use original scheduling decision */ +#define MACH_FAIRSHARE_ENTER 0xc /* move to fairshare band */ +#define MACH_FAIRSHARE_EXIT 0xd /* exit fairshare band */ +#define MACH_FAILSAFE 0xe /* tripped fixed-pri/RT failsafe */ +#define MACH_GET_URGENCY 0x14 /* Urgency queried by platform */ +#define MACH_URGENCY 0x15 /* Urgency (RT/BG/NORMAL) communicated + * to platform */ +#define MACH_REDISPATCH 0x16 /* "next 
thread" thread redispatched */ +#define MACH_REMOTE_AST 0x17 /* AST signal issued to remote processor */ +#define MACH_SCHED_LPA_BROKEN 0x18 /* last_processor affinity broken in choose_processor */ /* Codes for pmap (DBG_MACH_PMAP) */ #define PMAP__CREATE 0x0 @@ -176,31 +193,32 @@ __BEGIN_DECLS #define DBG_IOMCURS 5 /* Memory Cursor */ #define DBG_IOMDESC 6 /* Memory Descriptors */ #define DBG_IOPOWER 7 /* Power Managerment */ -#define DBG_IOSERVICE 8 /* Matching etc. */ +#define DBG_IOSERVICE 8 /* Matching etc. */ /* **** 9-32 reserved for internal IOKit usage **** */ #define DBG_IOSTORAGE 32 /* Storage layers */ #define DBG_IONETWORK 33 /* Network layers */ #define DBG_IOKEYBOARD 34 /* Keyboard */ -#define DBG_IOHID 35 /* HID Devices */ -#define DBG_IOAUDIO 36 /* Audio */ +#define DBG_IOHID 35 /* HID Devices */ +#define DBG_IOAUDIO 36 /* Audio */ #define DBG_IOSERIAL 37 /* Serial */ -#define DBG_IOTTY 38 /* TTY layers */ -#define DBG_IOSAM 39 /* SCSI Architecture Model layers */ -#define DBG_IOPARALLELATA 40 /* Parallel ATA */ +#define DBG_IOTTY 38 /* TTY layers */ +#define DBG_IOSAM 39 /* SCSI Architecture Model layers */ +#define DBG_IOPARALLELATA 40 /* Parallel ATA */ #define DBG_IOPARALLELSCSI 41 /* Parallel SCSI */ -#define DBG_IOSATA 42 /* Serial-ATA */ -#define DBG_IOSAS 43 /* SAS */ +#define DBG_IOSATA 42 /* Serial-ATA */ +#define DBG_IOSAS 43 /* SAS */ #define DBG_IOFIBRECHANNEL 44 /* FiberChannel */ -#define DBG_IOUSB 45 /* USB */ +#define DBG_IOUSB 45 /* USB */ #define DBG_IOBLUETOOTH 46 /* Bluetooth */ #define DBG_IOFIREWIRE 47 /* FireWire */ #define DBG_IOINFINIBAND 48 /* Infiniband */ -#define DBG_IOCPUPM 49 /* CPU Power Management */ +#define DBG_IOCPUPM 49 /* CPU Power Management */ #define DBG_IOGRAPHICS 50 /* Graphics */ #define DBG_HIBERNATE 51 /* hibernation related events */ + /* Backwards compatibility */ #define DBG_IOPOINTING DBG_IOHID /* OBSOLETE: Use DBG_IOHID instead */ #define DBG_IODISK DBG_IOSTORAGE /* OBSOLETE: Use DBG_IOSTORAGE 
instead */ @@ -223,7 +241,7 @@ __BEGIN_DECLS #define DBG_DRVFIREWIRE 16 /* FireWire */ #define DBG_DRVINFINIBAND 17 /* Infiniband */ #define DBG_DRVGRAPHICS 18 /* Graphics */ -#define DBG_DRVSD 19 /* Secure Digital */ +#define DBG_DRVSD 19 /* Secure Digital */ /* Backwards compatibility */ #define DBG_DRVPOINTING DBG_DRVHID /* OBSOLETE: Use DBG_DRVHID instead */ @@ -236,7 +254,7 @@ __BEGIN_DECLS #define DBG_DLIL_PR_FLT 4 /* DLIL Protocol Filter */ #define DBG_DLIL_IF_FLT 5 /* DLIL Interface FIlter */ -/* The Kernel Debug Sub Classes for File System */ +/* The Kernel Debug Sub Classes for File System (DBG_FSYSTEM) */ #define DBG_FSRW 1 /* reads and writes to the filesystem */ #define DBG_DKRW 2 /* reads and writes to the disk */ #define DBG_FSVN 3 /* vnode operations (inc. locking/unlocking) */ @@ -244,6 +262,7 @@ __BEGIN_DECLS #define DBG_JOURNAL 5 /* journaling operations */ #define DBG_IOCTL 6 /* ioctl to the disk */ #define DBG_BOOTCACHE 7 /* bootcache operations */ +#define DBG_HFS 8 /* HFS-specific events; see bsd/hfs/hfs_kdebug.h */ /* The Kernel Debug Sub Classes for BSD */ #define DBG_BSD_PROC 0x01 /* process/signals related */ @@ -256,11 +275,15 @@ __BEGIN_DECLS /* The Codes for BSD subcode class DBG_BSD_PROC */ #define BSD_PROC_EXIT 1 /* process exit */ #define BSD_PROC_FRCEXIT 2 /* Kernel force termination */ + /* The Kernel Debug Sub Classes for DBG_TRACE */ #define DBG_TRACE_DATA 0 #define DBG_TRACE_STRING 1 #define DBG_TRACE_INFO 2 +/* The Kernel Debug Sub Classes for DBG_CORESTORAGE */ +#define DBG_CS_IO 0 + /* The Kernel Debug Sub Classes for DBG_MISC */ #define DBG_EVENT 0x10 #define DBG_BUFFER 0x20 @@ -274,6 +297,8 @@ __BEGIN_DECLS #define DKIO_ASYNC 0x04 #define DKIO_META 0x08 #define DKIO_PAGING 0x10 +#define DKIO_THROTTLE 0x20 +#define DKIO_PASSIVE 0x40 /* Codes for Application Sub Classes */ #define DBG_APP_SAMBA 128 @@ -343,25 +368,38 @@ extern unsigned int kdebug_enable; #define KDEBUG_ENABLE_CHUD 0x4 #if (!defined(NO_KDEBUG)) - +#ifdef 
XNU_KERNEL_PRIVATE #define KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e) \ do { \ - if (kdebug_enable) \ + if (__improbable(kdebug_enable)) \ kernel_debug(x,(uintptr_t)a,(uintptr_t)b,(uintptr_t)c, \ (uintptr_t)d,(uintptr_t)e); \ } while(0) #define KERNEL_DEBUG_CONSTANT1(x,a,b,c,d,e) \ do { \ - if (kdebug_enable) \ + if (__improbable(kdebug_enable)) \ kernel_debug1(x,(uintptr_t)a,(uintptr_t)b,(uintptr_t)c, \ (uintptr_t)d,(uintptr_t)e); \ } while(0) +#else /* XNU_KERNEL_PRIVATE */ +#define KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e) \ +do { \ + if (kdebug_enable) \ + kernel_debug(x,(uintptr_t)a,(uintptr_t)b,(uintptr_t)c, \ + (uintptr_t)d,(uintptr_t)e); \ +} while(0) -#else - -#define KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e) -#define KERNEL_DEBUG_CONSTANT1(x,a,b,c,d,e) +#define KERNEL_DEBUG_CONSTANT1(x,a,b,c,d,e) \ +do { \ + if (kdebug_enable) \ + kernel_debug1(x,(uintptr_t)a,(uintptr_t)b,(uintptr_t)c, \ + (uintptr_t)d,(uintptr_t)e); \ +} while(0) +#endif /* XNU_KERNEL_PRIVATE */ +#else /*!NO_KDEBUG */ +#define KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e) do { } while(0) +#define KERNEL_DEBUG_CONSTANT1(x,a,b,c,d,e) do { } while(0) #define __kdebug_constant_only __unused #endif @@ -384,23 +422,37 @@ extern void kernel_debug1( #if (KDEBUG && (!defined(NO_KDEBUG))) - +#ifdef XNU_KERNEL_PRIVATE #define KERNEL_DEBUG(x,a,b,c,d,e) \ do { \ - if (kdebug_enable) \ + if (__improbable(kdebug_enable)) \ kernel_debug((uint32_t)x, (uintptr_t)a, (uintptr_t)b, \ (uintptr_t)c, (uintptr_t)d, (uintptr_t)e); \ } while(0) #define KERNEL_DEBUG1(x,a,b,c,d,e) \ do { \ - if (kdebug_enable) \ + if (__improbable(kdebug_enable)) \ kernel_debug1((uint32_t)x, (uintptr_t)a, (uintptr_t)b, \ (uintptr_t)c, (uintptr_t)d, (uintptr_t)e); \ } while(0) #define __kdebug_only +#else /* !XNU_KERNEL_PRIVATE */ +#define KERNEL_DEBUG(x,a,b,c,d,e) \ +do { \ + if (kdebug_enable) \ + kernel_debug((uint32_t)x, (uintptr_t)a, (uintptr_t)b, \ + (uintptr_t)c, (uintptr_t)d, (uintptr_t)e); \ +} while(0) +#define KERNEL_DEBUG1(x,a,b,c,d,e) \ +do { \ + if 
(kdebug_enable) \ + kernel_debug1((uint32_t)x, (uintptr_t)a, (uintptr_t)b, \ + (uintptr_t)c, (uintptr_t)d, (uintptr_t)e); \ +} while(0) +#endif /* XNU_KERNEL_PRIVATE */ #else #define KERNEL_DEBUG(x,a,b,c,d,e) do {} while (0) @@ -410,6 +462,7 @@ do { \ #endif #ifdef KERNEL_PRIVATE +#include struct proc; extern void kdbg_trace_data(struct proc *proc, long *arg_pid); @@ -417,6 +470,19 @@ extern void kdbg_trace_string(struct proc *proc, long *arg1, long *arg2, long *a extern void kdbg_dump_trace_to_file(const char *); void start_kern_tracing(unsigned int); +struct task; +extern void kdbg_get_task_name(char*, int, struct task *task); +void disable_wrap(uint32_t *old_slowcheck, uint32_t *old_flags); +void enable_wrap(uint32_t old_slowcheck, boolean_t lostevents); +void release_storage_unit(int cpu, uint32_t storage_unit); +int allocate_storage_unit(int cpu); + +void trace_handler_map_ctrl_page(uintptr_t addr, unsigned long ctrl_page_size, unsigned long storage_size, unsigned long kds_ptr_size); +void trace_handler_map_bufinfo(uintptr_t addr, unsigned long size); +void trace_handler_unmap_bufinfo(void); +void trace_handler_map_buffer(int index, uintptr_t addr, unsigned long size); +void trace_handler_unmap_buffer(int index); +void trace_set_timebases(uint64_t tsc, uint64_t ns); #endif /* KERNEL_PRIVATE */ @@ -446,7 +512,7 @@ typedef struct { #if !defined(__LP64__) #define KDBG_TIMESTAMP_MASK 0x00ffffffffffffffULL -#define KDBG_CPU_MASK 0x0f00000000000000ULL +#define KDBG_CPU_MASK 0xff00000000000000ULL #define KDBG_CPU_SHIFT 56 static inline void kdbg_set_cpu(kd_buf *kp, int cpu) @@ -460,9 +526,9 @@ kdbg_get_cpu(kd_buf *kp) return (int) (((kp)->timestamp & KDBG_CPU_MASK) >> KDBG_CPU_SHIFT); } static inline void -kdbg_set_timestamp(kd_buf *kp, uint64_t time) +kdbg_set_timestamp(kd_buf *kp, uint64_t thetime) { - kp->timestamp = time & KDBG_TIMESTAMP_MASK; + kp->timestamp = thetime & KDBG_TIMESTAMP_MASK; } static inline uint64_t kdbg_get_timestamp(kd_buf *kp) @@ -470,9 +536,9 
@@ kdbg_get_timestamp(kd_buf *kp) return kp->timestamp & KDBG_TIMESTAMP_MASK; } static inline void -kdbg_set_timestamp_and_cpu(kd_buf *kp, uint64_t time, int cpu) +kdbg_set_timestamp_and_cpu(kd_buf *kp, uint64_t thetime, int cpu) { - kp->timestamp = (time & KDBG_TIMESTAMP_MASK) | + kp->timestamp = (thetime & KDBG_TIMESTAMP_MASK) | (((uint64_t) cpu) << KDBG_CPU_SHIFT); } #else @@ -488,9 +554,9 @@ kdbg_get_cpu(kd_buf *kp) return kp->cpuid; } static inline void -kdbg_set_timestamp(kd_buf *kp, uint64_t time) +kdbg_set_timestamp(kd_buf *kp, uint64_t thetime) { - kp->timestamp = time; + kp->timestamp = thetime; } static inline uint64_t kdbg_get_timestamp(kd_buf *kp) @@ -498,9 +564,9 @@ kdbg_get_timestamp(kd_buf *kp) return kp->timestamp; } static inline void -kdbg_set_timestamp_and_cpu(kd_buf *kp, uint64_t time, int cpu) +kdbg_set_timestamp_and_cpu(kd_buf *kp, uint64_t thetime, int cpu) { - kdbg_set_timestamp(kp, time); + kdbg_set_timestamp(kp, thetime); kdbg_set_cpu(kp, cpu); } #endif @@ -541,6 +607,18 @@ typedef struct { char command[20]; } kd_threadmap; + +typedef struct { + int version_no; + int thread_count; + uint64_t TOD_secs; + uint32_t TOD_usecs; +} RAW_header; + +#define RAW_VERSION0 0x55aa0000 +#define RAW_VERSION1 0x55aa0101 + + #define KDBG_CLASSTYPE 0x10000 #define KDBG_SUBCLSTYPE 0x20000 #define KDBG_RANGETYPE 0x40000 diff --git a/bsd/sys/kern_control.h b/bsd/sys/kern_control.h index 0e83895b4..4a5a411d6 100644 --- a/bsd/sys/kern_control.h +++ b/bsd/sys/kern_control.h @@ -252,6 +252,7 @@ typedef errno_t (*ctl_disconnect_func)(kern_ctl_ref kctlref, u_int32_t unit, voi @param unitinfo The user-defined private data initialized by the ctl_connect_func callback. @param m The data sent by the client to the kernel control in an + mbuf chain. Your function is responsible for releasing the mbuf chain. @param flags The flags specified by the client when calling send/sendto/sendmsg (MSG_OOB/MSG_DONTROUTE). 
@@ -433,6 +434,11 @@ ctl_enqueuembuf(kern_ctl_ref kctlref, u_int32_t unit, mbuf_t m, u_int32_t flags) errno_t ctl_getenqueuespace(kern_ctl_ref kctlref, u_int32_t unit, size_t *space); +#ifdef KERNEL_PRIVATE +u_int32_t ctl_id_by_name(const char *name); +errno_t ctl_name_by_id(u_int32_t id, char *out_name, size_t maxsize); +#endif /* KERNEL_PRIVATE */ + __END_DECLS #endif /* KERNEL */ diff --git a/bsd/sys/kern_memorystatus.h b/bsd/sys/kern_memorystatus.h index b89337521..4a05a490f 100644 --- a/bsd/sys/kern_memorystatus.h +++ b/bsd/sys/kern_memorystatus.h @@ -52,7 +52,8 @@ enum { kMemoryStatusLevelNote = 1, - kMemoryStatusSnapshotNote = 2 + kMemoryStatusSnapshotNote = 2, + kMemoryStatusHibernationNote = 3 }; enum { @@ -109,19 +110,48 @@ typedef struct jetsam_snapshot { jetsam_snapshot_entry_t entries[1]; } jetsam_snapshot_t; +typedef struct jetsam_hibernation_entry { + uint32_t pid; + uint32_t flags; + uint32_t pages; +} jetsam_hibernation_entry_t; + +#endif /* !MACH_KERNEL_PRIVATE */ + enum { - kJetsamFlagsFrontmost = (1 << 0), - kJetsamFlagsKilled = (1 << 1), - kJetsamFlagsKilledHiwat = (1 << 2) + kJetsamFlagsFrontmost = (1 << 0), + kJetsamFlagsKilled = (1 << 1), + kJetsamFlagsKilledHiwat = (1 << 2), + kJetsamFlagsHibernated = (1 << 3), + kJetsamFlagsKilledVnodes = (1 << 4), + kJetsamFlagsKilledSwap = (1 << 5), + kJetsamFlagsThawed = (1 << 6), + kJetsamFlagsKilledVM = (1 << 7), + kJetsamFlagsSuspForDiagnosis = (1 << 8) }; -#endif /* !MACH_KERNEL_PRIVATE */ #ifdef KERNEL extern void kern_memorystatus_init(void) __attribute__((section("__TEXT, initcode"))); -extern int jetsam_kill_top_proc(void); +extern int jetsam_kill_top_proc(boolean_t any, uint32_t reason); extern int kern_memorystatus_wakeup; extern int kern_memorystatus_level; +extern unsigned int kern_memorystatus_delta; + +#ifdef CONFIG_FREEZE +extern void kern_hibernation_init(void) __attribute__((section("__TEXT, initcode"))); +extern int kern_hibernation_wakeup; + +void kern_hibernation_on_pid_suspend(int 
pid); +void kern_hibernation_on_pid_resume(int pid, task_t task); +void kern_hibernation_on_pid_hibernate(int pid); +#endif + +#if CONFIG_EMBEDDED +#define VM_CHECK_MEMORYSTATUS do { vm_check_memorystatus(); } while(0) +#else /*CONFIG_EMBEDDED*/ +#define VM_CHECK_MEMORYSTATUS do {} while(0) +#endif #endif /* KERNEL */ #endif /* SYS_KERN_MEMORYSTATUS_H */ diff --git a/bsd/sys/kpi_mbuf.h b/bsd/sys/kpi_mbuf.h index 00134b226..24239b9f4 100644 --- a/bsd/sys/kpi_mbuf.h +++ b/bsd/sys/kpi_mbuf.h @@ -55,6 +55,7 @@ @constant MBUF_EXT Indicates this mbuf has external data. @constant MBUF_PKTHDR Indicates this mbuf has a packet header. @constant MBUF_EOR Indicates this mbuf is the end of a record. + @constant MBUF_LOOP Indicates this packet is looped back. @constant MBUF_BCAST Indicates this packet will be sent or was received as a brodcast. @constant MBUF_MCAST Indicates this packet will be sent or was @@ -72,13 +73,15 @@ enum { MBUF_EXT = 0x0001, /* has associated external storage */ MBUF_PKTHDR = 0x0002, /* start of record */ MBUF_EOR = 0x0004, /* end of record */ + MBUF_LOOP = 0x0040, /* packet is looped back */ MBUF_BCAST = 0x0100, /* send/received as link-level broadcast */ MBUF_MCAST = 0x0200, /* send/received as link-level multicast */ MBUF_FRAG = 0x0400, /* packet is a fragment of a larger packet */ MBUF_FIRSTFRAG = 0x0800, /* packet is first fragment */ MBUF_LASTFRAG = 0x1000, /* packet is last fragment */ - MBUF_PROMISC = 0x2000 /* packet is promiscuous */ + MBUF_PROMISC = 0x2000, /* packet is promiscuous */ + MBUF_HASFCS = 0x4000 /* packet has FCS */ }; typedef u_int32_t mbuf_flags_t; @@ -145,6 +148,10 @@ typedef u_int32_t mbuf_type_t; calculated yet. @constant MBUF_CSUM_REQ_UDP Indicates the UDP checksum has not been calculated yet. + @constant MBUF_CSUM_REQ_TCPIPV6 Indicates the TCP checksum for IPv6 + has not been calculated yet. + @constant MBUF_CSUM_REQ_UDPIPV6 Indicates the UDP checksum for IPv6 + has not been calculated yet. 
*/ enum { MBUF_TSO_IPV4 = 0x100000, @@ -158,7 +165,9 @@ enum { #endif /* KERNEL_PRIVATE */ MBUF_CSUM_REQ_IP = 0x0001, MBUF_CSUM_REQ_TCP = 0x0002, - MBUF_CSUM_REQ_UDP = 0x0004 + MBUF_CSUM_REQ_UDP = 0x0004, + MBUF_CSUM_REQ_TCPIPV6 = 0x0020, + MBUF_CSUM_REQ_UDPIPV6 = 0x0040 }; typedef u_int32_t mbuf_csum_request_flags_t; @@ -178,7 +187,7 @@ typedef u_int32_t mbuf_csum_request_flags_t; hardware should be passed as the second parameter of mbuf_set_csum_performed. The hardware calculated checksum value can be retrieved using the second parameter passed to - mbuf_get_csum_performed. + mbuf_get_csum_performed. This should be done for IPv4 or IPv6. @constant MBUF_CSUM_PSEUDO_HDR If set, this indicates that the checksum value for MBUF_CSUM_DID_DATA includes the pseudo header value. If this is not set, the stack will calculate the pseudo @@ -1183,6 +1192,15 @@ extern u_int32_t mbuf_get_mlen(void); */ extern u_int32_t mbuf_get_mhlen(void); +/*! + @function mbuf_get_minclsize + @discussion This routine returns the minimum number of data bytes + before an external cluster is used. This is equivalent to the + legacy MINCLSIZE macro. + @result The minimum number of bytes before a cluster will be used. + */ +extern u_int32_t mbuf_get_minclsize(void); + /*! @function mbuf_clear_csum_performed @discussion Clears the hardware checksum flags and values. @@ -1330,32 +1348,8 @@ extern void mbuf_tag_free(mbuf_t mbuf, mbuf_tag_id_t module_id, */ extern void mbuf_stats(struct mbuf_stat *stats); -#ifdef KERNEL_PRIVATE -/* - @enum mbuf_priority_t - @abstract Priority of a packet. - @discussion Some mbufs represent packets containing application data. - The priority of the application data is represented by the - mbuf priority, as determined by the system. - @constant MBUF_PRIORITY_NORMAL Indicates the packet contains - normal priority data. - @constant MBUF_PRIORITY_BACKGROUND Indicates the packet contains - background priority data. 
- */ -typedef enum { - MBUF_PRIORITY_NORMAL = 0, - MBUF_PRIORITY_BACKGROUND = 1 -} mbuf_priority_t; - -/* - @function mbuf_get_priority - @discussion Get the priority value of the packet. - @param mbuf The mbuf to obtain the priority value from. - @result The priority value of the packet. - */ -extern mbuf_priority_t mbuf_get_priority(mbuf_t mbuf); -/* +/*! @enum mbuf_traffic_class_t @abstract Traffic class of a packet @discussion Property that represent the category of traffic of a packet. @@ -1367,15 +1361,19 @@ extern mbuf_priority_t mbuf_get_priority(mbuf_t mbuf); */ typedef enum { #ifdef XNU_KERNEL_PRIVATE - MBUF_TC_NONE = -1, + MBUF_TC_UNSPEC = -1, /* Internal: not specified */ #endif MBUF_TC_BE = 0, MBUF_TC_BK = 1, MBUF_TC_VI = 2, MBUF_TC_VO = 3 +#ifdef XNU_KERNEL_PRIVATE + , + MBUF_TC_MAX = 4 /* Internal: traffic class count */ +#endif } mbuf_traffic_class_t; -/* +/*! @function mbuf_get_traffic_class @discussion Get the traffic class of an mbuf packet @param mbuf The mbuf to get the traffic class of. @@ -1383,7 +1381,7 @@ typedef enum { */ extern mbuf_traffic_class_t mbuf_get_traffic_class(mbuf_t mbuf); -/* +/*! @function mbuf_set_traffic_class @discussion Set the traffic class of an mbuf packet. @param mbuf The mbuf to set the traffic class on. @@ -1391,7 +1389,6 @@ extern mbuf_traffic_class_t mbuf_get_traffic_class(mbuf_t mbuf); @result 0 on success, EINVAL if bad paramater is passed */ extern errno_t mbuf_set_traffic_class(mbuf_t mbuf, mbuf_traffic_class_t tc); -#endif /* KERNEL_PRIVATE */ /* IF_QUEUE interaction */ diff --git a/bsd/sys/kpi_socket.h b/bsd/sys/kpi_socket.h index 5e380f5a7..5f2093369 100644 --- a/bsd/sys/kpi_socket.h +++ b/bsd/sys/kpi_socket.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2008-2011 Apple Computer, Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -56,7 +56,12 @@ struct timeval; Calls to your upcall function are not serialized and may be called concurrently from multiple threads in the kernel. - Your upcall function will be called when: + Your upcall function will be called: + when there is more data than the low water mark for reading, + or when there is space for a write, + or when there is a connection to accept, + or when a socket is connected, + or when a socket is closed or disconnected @param so A reference to the socket that's ready. @param cookie The cookie passed in when the socket was created. @@ -227,11 +232,16 @@ extern errno_t sock_settclassopt(socket_t so, const void* optval, size_t optlen) */ extern errno_t sock_gettclassopt(socket_t so, void* optval, size_t* optlen); +#ifdef XNU_KERNEL_PRIVATE +extern void socket_set_traffic_mgt_flags_locked(socket_t so, u_int32_t flags); +extern void socket_clear_traffic_mgt_flags_locked(socket_t so, u_int32_t flags); +#endif /* XNU_KERNEL_PRIVATE */ #ifdef BSD_KERNEL_PRIVATE extern void socket_set_traffic_mgt_flags(socket_t so, u_int32_t flags); extern void socket_clear_traffic_mgt_flags(socket_t so, u_int32_t flags); +extern errno_t socket_defunct(struct proc *, socket_t so, int); #endif /* BSD_KERNEL_PRIVATE */ -#endif +#endif /* KERNEL_PRIVATE */ /*! @function sock_listen @@ -473,6 +483,22 @@ extern errno_t sock_getaddr(socket_t so, struct sockaddr **psockname, @param sockname The socket name to be freed. */ extern void sock_freeaddr(struct sockaddr *sockname); + +/* + @function sock_setupcall + @discussion Set the notifier function to be called when an event + occurs on the socket. This may be set to NULL to disable + further notifications. Setting the function does not + affect notifications currently about to be sent or being sent.
+ Note: When this function is used on a socket passed from userspace + it is crucial to call sock_retain() on the socket otherwise a callback + could be dispatched on a closed socket and cause a crash. + @param sock The socket. + @param callback The notifier function + @param context A cookie passed directly to the callback +*/ +extern errno_t sock_setupcall(socket_t sock, sock_upcall callback, void* context); + #endif /* KERNEL_PRIVATE */ __END_DECLS diff --git a/bsd/sys/make_posix_availability.sh b/bsd/sys/make_posix_availability.sh new file mode 100755 index 000000000..5aa58b364 --- /dev/null +++ b/bsd/sys/make_posix_availability.sh @@ -0,0 +1,71 @@ +#! /bin/sh - +# +# Copyright (c) 2010 Apple Inc. All rights reserved. +# +# @APPLE_OSREFERENCE_LICENSE_HEADER_START@ +# +# This file contains Original Code and/or Modifications of Original Code +# as defined in and that are subject to the Apple Public Source License +# Version 2.0 (the 'License'). You may not use this file except in +# compliance with the License. Please obtain a copy of the License at +# http://www.opensource.apple.com/apsl/ and read it before using this +# file. +# +# The Original Code and all software distributed under the License are +# distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER +# EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +# INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. +# Please see the License for the specific language governing rights and +# limitations under the License. +# +# @APPLE_OSREFERENCE_LICENSE_HEADER_END@ +# + +POSIX_VALUES="198808L 199009L 199209L 199309L 199506L 200112L 200809L" + +{ +cat < directly. Use instead." 
+#endif + +EOF + +for value in ${POSIX_VALUES} ; do + echo "#if !defined(_DARWIN_C_SOURCE) && defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= ${value}" + echo "#define ___POSIX_C_DEPRECATED_STARTING_${value} __deprecated" + echo "#else" + echo "#define ___POSIX_C_DEPRECATED_STARTING_${value}" + echo "#endif" + echo +done +} > $1 + diff --git a/bsd/sys/make_symbol_aliasing.sh b/bsd/sys/make_symbol_aliasing.sh new file mode 100755 index 000000000..fa5f0e33c --- /dev/null +++ b/bsd/sys/make_symbol_aliasing.sh @@ -0,0 +1,86 @@ +#! /bin/bash - +# +# Copyright (c) 2010 Apple Inc. All rights reserved. +# +# @APPLE_OSREFERENCE_LICENSE_HEADER_START@ +# +# This file contains Original Code and/or Modifications of Original Code +# as defined in and that are subject to the Apple Public Source License +# Version 2.0 (the 'License'). You may not use this file except in +# compliance with the License. Please obtain a copy of the License at +# http://www.opensource.apple.com/apsl/ and read it before using this +# file. +# +# The Original Code and all software distributed under the License are +# distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER +# EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +# INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. +# Please see the License for the specific language governing rights and +# limitations under the License. +# +# @APPLE_OSREFERENCE_LICENSE_HEADER_END@ +# + +{ +cat < directly. Use instead." 
+#endif + +EOF + +for ver in $(${SDKROOT}/usr/local/libexec/availability.pl --ios) ; do + ver_major=${ver%.*} + ver_minor=${ver#*.} + value=$(printf "%d%02d00" ${ver_major} ${ver_minor}) + str=$(printf "__IPHONE_%d_%d" ${ver_major} ${ver_minor}) + echo "#if defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ >= ${value}" + echo "#define __DARWIN_ALIAS_STARTING_IPHONE_${str}(x) x" + echo "#else" + echo "#define __DARWIN_ALIAS_STARTING_IPHONE_${str}(x)" + echo "#endif" + echo "" +done + +for ver in $(${SDKROOT}/usr/local/libexec/availability.pl --macosx) ; do + ver_major=${ver%.*} + ver_minor=${ver#*.} + value=$(printf "%d%d0" ${ver_major} ${ver_minor}) + str=$(printf "__MAC_%d_%d" ${ver_major} ${ver_minor}) + echo "#if defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ >= ${value}" + echo "#define __DARWIN_ALIAS_STARTING_MAC_${str}(x) x" + echo "#else" + echo "#define __DARWIN_ALIAS_STARTING_MAC_${str}(x)" + echo "#endif" + echo "" +done +} > $1 + diff --git a/bsd/sys/malloc.h b/bsd/sys/malloc.h index dcbaaded7..4e8688735 100644 --- a/bsd/sys/malloc.h +++ b/bsd/sys/malloc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -179,7 +179,7 @@ #define M_IP6NDP 86 /* IPv6 Neighbour Discovery*/ #define M_IP6OPT 87 /* IPv6 options management */ #define M_IP6MISC 88 /* IPv6 misc. 
memory */ -#define M_TSEGQ 89 /* TCP segment queue entry */ +#define M_TSEGQ 89 /* TCP segment queue entry, unused */ #define M_IGMP 90 #define M_JNL_JNL 91 /* Journaling: "struct journal" */ #define M_JNL_TR 92 /* Journaling: "struct transaction" */ @@ -204,8 +204,13 @@ #if HFS_COMPRESSION #define M_DECMPFS_CNODE 109 /* decmpfs cnode structures */ #endif /* HFS_COMPRESSION */ +#define M_INMFILTER 110 /* IPv4 multicast PCB-layer source filter */ +#define M_IPMSOURCE 111 /* IPv4 multicast IGMP-layer source filter */ +#define M_IN6MFILTER 112 /* IPv6 multicast PCB-layer source filter */ +#define M_IP6MOPTS 113 /* IPv6 multicast options */ +#define M_IP6MSOURCE 114 /* IPv6 multicast MLD-layer source filter */ -#define M_LAST 110 /* Must be last type + 1 */ +#define M_LAST 115 /* Must be last type + 1 */ #else /* BSD_KERNEL_PRIVATE */ @@ -253,6 +258,9 @@ extern struct kmemstats kmemstats[]; #define FREE(addr, type) \ _FREE((void *)addr, type) +#define REALLOC(space, cast, addr, size, type, flags) \ + (space) = (cast)_REALLOC(addr, size, type, flags) + #define MALLOC_ZONE(space, cast, size, type, flags) \ (space) = (cast)_MALLOC_ZONE(size, type, flags) @@ -268,6 +276,12 @@ extern void _FREE( void *addr, int type); +extern void *_REALLOC( + void *addr, + size_t size, + int type, + int flags); + extern void *_MALLOC_ZONE( size_t size, int type, diff --git a/bsd/sys/mbuf.h b/bsd/sys/mbuf.h index 247d7bb71..f0d45c565 100644 --- a/bsd/sys/mbuf.h +++ b/bsd/sys/mbuf.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 1999-2010 Apple Inc. All rights reserved. + * Copyright (c) 1999-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). 
You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,12 +22,12 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* Copyright (c) 1998, 1999 Apple Computer, Inc. All Rights Reserved */ /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ -/* +/* * Mach Operating System * Copyright (c) 1987 Carnegie-Mellon University * All rights reserved. The CMU software License Agreement specifies @@ -68,11 +68,6 @@ * SUCH DAMAGE. * * @(#)mbuf.h 8.3 (Berkeley) 1/21/94 - ********************************************************************** - * HISTORY - * 20-May-95 Mac Gillon (mgillon) at NeXT - * New version based on 4.4 - * Purged old history */ /* * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce @@ -87,7 +82,7 @@ #include #include -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #include #include @@ -99,18 +94,10 @@ /* * Mbufs are of a single size, MSIZE (machine/param.h), which * includes overhead. An mbuf may add a single "mbuf cluster" of size - * MCLBYTES (also in machine/param.h), which has no additional overhead - * and is used instead of the internal data area; this is done when - * at least MINCLSIZE of data must be stored. 
- */ - -/* - * These macros are mapped to the appropriate KPIs, so that private code - * can be simply recompiled in order to be forward-compatible with future - * changes toward the struture sizes. + * MCLBYTES/MBIGCLBYTES/M16KCLBYTES (also in machine/param.h), which has + * no additional overhead and is used instead of the internal data area; + * this is done when at least MINCLSIZE of data must be stored. */ -#define MLEN mbuf_get_mlen() /* normal data len */ -#define MHLEN mbuf_get_mhlen() /* data len w/pkthdr */ /* * The following _MLEN and _MHLEN macros are private to xnu. Private code @@ -120,24 +107,31 @@ #define _MLEN (MSIZE - sizeof(struct m_hdr)) /* normal data len */ #define _MHLEN (_MLEN - sizeof(struct pkthdr)) /* data len w/pkthdr */ -#define MINCLSIZE (MHLEN + MLEN) /* smallest amount to put in cluster */ -#define M_MAXCOMPRESS (MHLEN / 2) /* max amount to copy for compression */ +#define NMBPBGSHIFT (MBIGCLSHIFT - MSIZESHIFT) +#define NMBPBG (1 << NMBPBGSHIFT) /* # of mbufs per big cl */ -#define NMBPCL (sizeof(union mcluster) / sizeof(struct mbuf)) +#define NCLPBGSHIFT (MBIGCLSHIFT - MCLSHIFT) +#define NCLPBG (1 << NCLPBGSHIFT) /* # of cl per big cl */ + +#define NMBPCLSHIFT (NMBPBGSHIFT - NCLPBGSHIFT) +#define NMBPCL (1 << NMBPCLSHIFT) /* # of mbufs per cl */ + +#define NCLPJCLSHIFT ((M16KCLSHIFT - MBIGCLSHIFT) + NCLPBGSHIFT) +#define NCLPJCL (1 << NCLPJCLSHIFT) /* # of cl per jumbo cl */ /* * Macros for type conversion * mtod(m,t) - convert mbuf pointer to data pointer of correct type * dtom(x) - convert data pointer within mbuf to mbuf pointer (XXX) */ -#define mtod(m,t) ((t)m_mtod(m)) -#define dtom(x) m_dtom(x) +#define mtod(m, t) ((t)m_mtod(m)) +#define dtom(x) m_dtom(x) /* header at beginning of each mbuf: */ struct m_hdr { struct mbuf *mh_next; /* next buffer in chain */ struct mbuf *mh_nextpkt; /* next chain in queue/record */ - int32_t mh_len; /* amount of data in this mbuf */ + int32_t mh_len; /* amount of data in this mbuf */ caddr_t 
mh_data; /* location of data */ short mh_type; /* type of data in this mbuf */ short mh_flags; /* flags; see below */ @@ -147,10 +141,29 @@ struct m_hdr { * Packet tag structure (see below for details). */ struct m_tag { + u_int64_t m_tag_cookie; /* Error checking */ +#ifndef __LP64__ + u_int32_t pad; /* For structure alignment */ +#endif /* !__LP64__ */ SLIST_ENTRY(m_tag) m_tag_link; /* List of packet tags */ - u_int16_t m_tag_type; /* Module specific type */ - u_int16_t m_tag_len; /* Length of data */ - u_int32_t m_tag_id; /* Module ID */ + u_int16_t m_tag_type; /* Module specific type */ + u_int16_t m_tag_len; /* Length of data */ + u_int32_t m_tag_id; /* Module ID */ +}; + +#ifdef __LP64__ +#define M_TAG_ALIGN(len) \ + P2ROUNDUP(len, sizeof (u_int64_t)) + sizeof (struct m_tag) +#else +#define M_TAG_ALIGN(len) \ + P2ROUNDUP(len, sizeof (u_int32_t)) + sizeof (struct m_tag) +#endif /* !__LP64__ */ + +#define M_TAG_VALID_PATTERN 0xfeedfacefeedfaceULL +#define M_TAG_FREE_PATTERN 0xdeadbeefdeadbeefULL + +struct m_taghdr { + u_int64_t refcnt; /* Number of tags in this mbuf */ }; /* record/packet header in first mbuf of chain; valid if M_PKTHDR set */ @@ -160,14 +173,14 @@ struct pkthdr { /* variables for ip and tcp reassembly */ void *header; /* pointer to packet header */ - /* variables for hardware checksum */ - /* Note: csum_flags is used for hardware checksum and VLAN */ - int csum_flags; /* flags regarding checksum */ - int csum_data; /* data field used by csum routines */ + /* variables for hardware checksum */ + /* Note: csum_flags is used for hardware checksum and VLAN */ + int csum_flags; /* flags regarding checksum */ + int csum_data; /* data field used by csum routines */ u_int tso_segsz; /* TSO segment size (actual MSS) */ u_short vlan_tag; /* VLAN tag, host byte order */ u_short socket_id; /* socket id */ - SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */ + SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */ #if PF_PKTHDR /* * Be 
careful; {en,dis}abling PF_PKTHDR will require xnu recompile; @@ -176,16 +189,16 @@ struct pkthdr { */ struct pf_mtag pf_mtag; #endif /* PF_PKTHDR */ -#if PKT_PRIORITY u_int32_t prio; /* packet priority */ -#endif /* PKT_PRIORITY */ + u_short vt_nrecs; /* # of IGMPv3 records in this chain */ + u_short _pad; }; /* description of external storage mapped into mbuf, valid if M_EXT set */ struct m_ext { caddr_t ext_buf; /* start of buffer */ - void (*ext_free)(caddr_t , u_int, caddr_t); /* free routine if not the usual */ + void (*ext_free)(caddr_t, u_int, caddr_t); /* free routine if not the usual */ u_int ext_size; /* size of buffer, for ext_free */ caddr_t ext_arg; /* additional ext_free argument */ struct ext_refsq { /* references held */ @@ -226,58 +239,71 @@ struct mbuf { #define m_pktdat M_dat.MH.MH_dat.MH_databuf #define m_dat M_dat.M_databuf -/* mbuf flags */ +/* mbuf flags (private) */ #define M_EXT 0x0001 /* has associated external storage */ #define M_PKTHDR 0x0002 /* start of record */ #define M_EOR 0x0004 /* end of record */ #define M_PROTO1 0x0008 /* protocol-specific */ #define M_PROTO2 0x0010 /* protocol-specific */ #define M_PROTO3 0x0020 /* protocol-specific */ -#define M_PROTO4 0x0040 /* protocol-specific */ +#define M_LOOP 0x0040 /* packet is looped back */ #define M_PROTO5 0x0080 /* protocol-specific */ -/* mbuf pkthdr flags, also in m_flags */ +/* mbuf pkthdr flags, also in m_flags (private) */ #define M_BCAST 0x0100 /* send/received as link-level broadcast */ #define M_MCAST 0x0200 /* send/received as link-level multicast */ #define M_FRAG 0x0400 /* packet is a fragment of a larger packet */ #define M_FIRSTFRAG 0x0800 /* packet is first fragment */ #define M_LASTFRAG 0x1000 /* packet is last fragment */ #define M_PROMISC 0x2000 /* packet is promiscuous (shouldn't go to stack) */ +#define M_HASFCS 0x4000 /* packet has FCS */ +#define M_TAGHDR 0x8000 /* m_tag hdr structure at top of mbuf data */ + +/* + * Flags to purge when crossing layers. 
+ */ +#define M_PROTOFLAGS \ + (M_PROTO1|M_PROTO2|M_PROTO3|M_PROTO5) /* flags copied when copying m_pkthdr */ -#define M_COPYFLAGS (M_PKTHDR|M_EOR|M_PROTO1|M_PROTO2|M_PROTO3 | \ - M_PROTO4|M_PROTO5|M_BCAST|M_MCAST|M_FRAG | \ - M_FIRSTFRAG|M_LASTFRAG|M_PROMISC) - -/* flags indicating hw checksum support and sw checksum requirements [freebsd4.1]*/ -#define CSUM_IP 0x0001 /* will csum IP */ -#define CSUM_TCP 0x0002 /* will csum TCP */ -#define CSUM_UDP 0x0004 /* will csum UDP */ -#define CSUM_IP_FRAGS 0x0008 /* will csum IP fragments */ -#define CSUM_FRAGMENT 0x0010 /* will do IP fragmentation */ - -#define CSUM_IP_CHECKED 0x0100 /* did csum IP */ -#define CSUM_IP_VALID 0x0200 /* ... the csum is valid */ -#define CSUM_DATA_VALID 0x0400 /* csum_data field is valid */ -#define CSUM_PSEUDO_HDR 0x0800 /* csum_data has pseudo hdr */ -#define CSUM_TCP_SUM16 0x1000 /* simple TCP Sum16 computation */ - -#define CSUM_DELAY_DATA (CSUM_TCP | CSUM_UDP) -#define CSUM_DELAY_IP (CSUM_IP) /* XXX add ipv6 here too? */ +#define M_COPYFLAGS \ + (M_PKTHDR|M_EOR|M_PROTO1|M_PROTO2|M_PROTO3 | \ + M_LOOP|M_PROTO5|M_BCAST|M_MCAST|M_FRAG | \ + M_FIRSTFRAG|M_LASTFRAG|M_PROMISC|M_HASFCS) + +/* flags indicating hw checksum support and sw checksum requirements [freebsd4.1] */ +#define CSUM_IP 0x0001 /* will csum IP */ +#define CSUM_TCP 0x0002 /* will csum TCP */ +#define CSUM_UDP 0x0004 /* will csum UDP */ +#define CSUM_IP_FRAGS 0x0008 /* will csum IP fragments */ +#define CSUM_FRAGMENT 0x0010 /* will do IP fragmentation */ +#define CSUM_TCPIPV6 0x0020 /* will csum TCP for IPv6 */ +#define CSUM_UDPIPV6 0x0040 /* will csum UDP for IPv6 */ +#define CSUM_FRAGMENT_IPV6 0x0080 /* will do IPv6 fragmentation */ + +#define CSUM_IP_CHECKED 0x0100 /* did csum IP */ +#define CSUM_IP_VALID 0x0200 /* ... 
the csum is valid */ +#define CSUM_DATA_VALID 0x0400 /* csum_data field is valid */ +#define CSUM_PSEUDO_HDR 0x0800 /* csum_data has pseudo hdr */ +#define CSUM_TCP_SUM16 0x1000 /* simple TCP Sum16 computation */ + +#define CSUM_DELAY_DATA (CSUM_TCP | CSUM_UDP) +#define CSUM_DELAY_IP (CSUM_IP) /* IPv4 only: no IPv6 IP cksum */ +#define CSUM_DELAY_IPV6_DATA (CSUM_TCPIPV6 | CSUM_UDPIPV6) +#define CSUM_DATA_IPV6_VALID CSUM_DATA_VALID /* csum_data field is valid */ /* * Note: see also IF_HWASSIST_CSUM defined in */ /* bottom 16 bits reserved for hardware checksum */ -#define CSUM_CHECKSUM_MASK 0xffff +#define CSUM_CHECKSUM_MASK 0xffff /* VLAN tag present */ -#define CSUM_VLAN_TAG_VALID 0x10000 /* vlan_tag field is valid */ +#define CSUM_VLAN_TAG_VALID 0x10000 /* vlan_tag field is valid */ /* TCP Segment Offloading requested on this mbuf */ -#define CSUM_TSO_IPV4 0x100000 /* This mbuf needs to be segmented by the NIC */ -#define CSUM_TSO_IPV6 0x200000 /* This mbuf needs to be segmented by the NIC */ -#endif /* KERNEL_PRIVATE */ - +#define CSUM_TSO_IPV4 0x100000 /* This mbuf needs to be segmented by the NIC */ +#define CSUM_TSO_IPV6 0x200000 /* This mbuf needs to be segmented by the NIC */ +#endif /* XNU_KERNEL_PRIVATE */ /* mbuf types */ #define MT_FREE 0 /* should be on free list */ @@ -293,20 +319,12 @@ struct mbuf { #define MT_FTABLE 11 /* fragment reassembly header */ #define MT_RIGHTS 12 /* access rights */ #define MT_IFADDR 13 /* interface address */ -#define MT_CONTROL 14 /* extra-data protocol message */ -#define MT_OOBDATA 15 /* expedited data */ -#define MT_TAG 16 /* volatile metadata associated to pkts */ -#define MT_MAX 32 /* enough? 
*/ - -#ifdef KERNEL_PRIVATE - -/* flags to m_get/MGET */ -/* Need to include malloc.h to get right options for malloc */ -#include - -#define M_DONTWAIT M_NOWAIT -#define M_WAIT M_WAITOK +#define MT_CONTROL 14 /* extra-data protocol message */ +#define MT_OOBDATA 15 /* expedited data */ +#define MT_TAG 16 /* volatile metadata associated to pkts */ +#define MT_MAX 32 /* enough? */ +#ifdef XNU_KERNEL_PRIVATE /* * mbuf allocation/deallocation macros: * @@ -319,9 +337,9 @@ struct mbuf { */ #if 1 -#define MCHECK(m) m_mcheck(m) +#define MCHECK(m) m_mcheck(m) #else -#define MCHECK(m) +#define MCHECK(m) #endif #define MGET(m, how, type) ((m) = m_get((how), (type))) @@ -347,27 +365,27 @@ union mcluster { #define MCLALLOC(p, how) ((p) = m_mclalloc(how)) -#define MCLFREE(p) m_mclfree(p) +#define MCLFREE(p) m_mclfree(p) -#define MCLGET(m, how) ((m) = m_mclget(m, how)) +#define MCLGET(m, how) ((m) = m_mclget(m, how)) /* * Mbuf big cluster */ - union mbigcluster { union mbigcluster *mbc_next; - char mbc_buf[NBPG]; + char mbc_buf[MBIGCLBYTES]; }; -#define M16KCLBYTES (16 * 1024) - +/* + * Mbuf jumbo cluster + */ union m16kcluster { union m16kcluster *m16kcl_next; char m16kcl_buf[M16KCLBYTES]; }; -#define MCLHASREFERENCE(m) m_mclhasreference(m) +#define MCLHASREFERENCE(m) m_mclhasreference(m) /* * MFREE(struct mbuf *m, struct mbuf *n) @@ -388,14 +406,19 @@ union m16kcluster { * Set the m_data pointer of a newly-allocated mbuf (m_get/MGET) to place * an object of the specified size at the end of the mbuf, longword aligned. */ -#define M_ALIGN(m, len) \ - { (m)->m_data += (MLEN - (len)) &~ (sizeof(long) - 1); } +#define M_ALIGN(m, len) \ +do { \ + (m)->m_data += (MLEN - (len)) &~ (sizeof (long) - 1); \ +} while (0) + /* * As above, for mbufs allocated with m_gethdr/MGETHDR * or initialized by M_COPY_PKTHDR. 
*/ -#define MH_ALIGN(m, len) \ - { (m)->m_data += (MHLEN - (len)) &~ (sizeof(long) - 1); } +#define MH_ALIGN(m, len) \ +do { \ + (m)->m_data += (MHLEN - (len)) &~ (sizeof (long) - 1); \ +} while (0) /* * Compute the amount of space available @@ -417,21 +440,84 @@ union m16kcluster { * If how is M_DONTWAIT and allocation fails, the original mbuf chain * is freed and m is set to NULL. */ -#define M_PREPEND(m, plen, how) ((m) = m_prepend_2((m), (plen), (how))) +#define M_PREPEND(m, plen, how) ((m) = m_prepend_2((m), (plen), (how))) /* change mbuf to new type */ -#define MCHTYPE(m, t) m_mchtype(m, t) - -/* length to m_copy to copy all */ -#define M_COPYALL 1000000000 +#define MCHTYPE(m, t) m_mchtype(m, t) /* compatiblity with 4.3 */ -#define m_copy(m, o, l) m_copym((m), (o), (l), M_DONTWAIT) +#define m_copy(m, o, l) m_copym((m), (o), (l), M_DONTWAIT) #define MBSHIFT 20 /* 1MB */ +#define MBSIZE (1 << MBSHIFT) #define GBSHIFT 30 /* 1GB */ +#define GBSIZE (1 << GBSHIFT) -#endif /* KERNEL_PRIVATE */ +/* + * M_STRUCT_GET ensures that intermediate protocol header (from "off" to + * "len") is located in single mbuf, on contiguous memory region. + * The pointer to the region will be returned to pointer variable "val", + * with type "typ". + * + * M_STRUCT_GET0 does the same, except that it aligns the structure at + * very top of mbuf. GET0 is likely to make memory copy than GET. 
+ */ +#define M_STRUCT_GET(val, typ, m, off, len) \ +do { \ + struct mbuf *t; \ + int tmp; \ + \ + if ((m)->m_len >= (off) + (len)) { \ + (val) = (typ)(mtod((m), caddr_t) + (off)); \ + } else { \ + t = m_pulldown((m), (off), (len), &tmp); \ + if (t != NULL) { \ + if (t->m_len < tmp + (len)) \ + panic("m_pulldown malfunction"); \ + (val) = (typ)(mtod(t, caddr_t) + tmp); \ + } else { \ + (val) = (typ)NULL; \ + (m) = NULL; \ + } \ + } \ +} while (0) + +#define M_STRUCT_GET0(val, typ, m, off, len) \ +do { \ + struct mbuf *t; \ + \ + if ((off) == 0) { \ + (val) = (typ)mtod(m, caddr_t); \ + } else { \ + t = m_pulldown((m), (off), (len), NULL); \ + if (t != NULL) { \ + if (t->m_len < (len)) \ + panic("m_pulldown malfunction"); \ + (val) = (typ)mtod(t, caddr_t); \ + } else { \ + (val) = (typ)NULL; \ + (m) = NULL; \ + } \ + } \ +} while (0) + +#define MBUF_INPUT_CHECK(m, rcvif) \ +do { \ + if (!(m->m_flags & MBUF_PKTHDR) || \ + m->m_len < 0 || \ + m->m_len > ((njcl > 0) ? njclbytes : MBIGCLBYTES) || \ + m->m_type == MT_FREE || \ + ((m->m_flags & M_EXT) != 0 && m->m_ext.ext_buf == NULL)) { \ + panic("Failed mbuf validity check: mbuf %p len %d " \ + "type %d flags 0x%x data %p rcvif %s%d ifflags 0x%x", \ + m, m->m_len, m->m_type, m->m_flags, \ + ((m->m_flags & M_EXT) ? m->m_ext.ext_buf : m->m_data), \ + rcvif->if_name, rcvif->if_unit, \ + (rcvif->if_flags & 0xffff)); \ + } \ +} while (0) + +#endif /* XNU_KERNEL_PRIVATE */ /* * Mbuf statistics (legacy). 
@@ -481,7 +567,7 @@ struct ombstat { */ #define MAX_MBUF_CNAME 15 -#if defined(KERNEL_PRIVATE) +#if defined(XNU_KERNEL_PRIVATE) /* For backwards compatibility with 32-bit userland process */ struct omb_class_stat { char mbcl_cname[MAX_MBUF_CNAME + 1]; /* class name */ @@ -506,7 +592,7 @@ struct omb_class_stat { u_int32_t mbcl_mc_nwretry_cnt; /* # of no-wait retry attempts */ u_int64_t mbcl_reserved[4]; /* for future use */ } __attribute__((__packed__)); -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ typedef struct mb_class_stat { char mbcl_cname[MAX_MBUF_CNAME + 1]; /* class name */ @@ -540,13 +626,13 @@ typedef struct mb_class_stat { #define MCS_PURGING 2 /* cache is being purged */ #define MCS_OFFLINE 3 /* cache is offline (resizing) */ -#if defined(KERNEL_PRIVATE) +#if defined(XNU_KERNEL_PRIVATE) /* For backwards compatibility with 32-bit userland process */ struct omb_stat { u_int32_t mbs_cnt; /* number of classes */ struct omb_class_stat mbs_class[1]; /* class array */ } __attribute__((__packed__)); -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ typedef struct mb_stat { u_int32_t mbs_cnt; /* number of classes */ @@ -556,120 +642,199 @@ typedef struct mb_stat { mb_class_stat_t mbs_class[1]; /* class array */ } mb_stat_t; +#ifdef PRIVATE +#define MLEAK_STACK_DEPTH 16 /* Max PC stack depth */ + +typedef struct mleak_trace_stat { + u_int64_t mltr_collisions; + u_int64_t mltr_hitcount; + u_int64_t mltr_allocs; + u_int64_t mltr_depth; + u_int64_t mltr_addr[MLEAK_STACK_DEPTH]; +} mleak_trace_stat_t; + +typedef struct mleak_stat { + u_int32_t ml_isaddr64; /* 64-bit KVA? 
*/ + u_int32_t ml_cnt; /* number of traces */ + mleak_trace_stat_t ml_trace[1]; /* trace array */ +} mleak_stat_t; + +struct mleak_table { + u_int32_t mleak_capture; /* sampling capture counter */ + u_int32_t mleak_sample_factor; /* sample factor */ + + /* Times two active records want to occupy the same spot */ + u_int64_t alloc_collisions; + u_int64_t trace_collisions; + + /* Times new record lands on spot previously occupied by freed alloc */ + u_int64_t alloc_overwrites; + u_int64_t trace_overwrites; + + /* Times a new alloc or trace is put into the hash table */ + u_int64_t alloc_recorded; + u_int64_t trace_recorded; + + /* Total number of outstanding allocs */ + u_int64_t outstanding_allocs; + + /* Times mleak_log returned false because couldn't acquire the lock */ + u_int64_t total_conflicts; +}; +#endif /* PRIVATE */ + #ifdef KERNEL_PRIVATE +__BEGIN_DECLS -#ifdef KERNEL -extern union mcluster *mbutl; /* virtual address of mclusters */ -extern union mcluster *embutl; /* ending virtual address of mclusters */ -extern struct mbstat mbstat; /* statistics */ -extern unsigned int nmbclusters; /* number of mapped clusters */ -extern int njcl; /* # of clusters for jumbo sizes */ -extern int njclbytes; /* size of a jumbo cluster */ -extern int max_linkhdr; /* largest link-level header */ -extern int max_protohdr; /* largest protocol header */ -extern int max_hdr; /* largest link+protocol header */ -extern int max_datalen; /* MHLEN - max_hdr */ +/* + * Exported (private) + */ + +extern struct mbstat mbstat; /* statistics */ + +__END_DECLS +#endif /* KERNEL_PRIVATE */ +#ifdef XNU_KERNEL_PRIVATE __BEGIN_DECLS -/* Not exported */ -__private_extern__ unsigned int mbuf_default_ncl(int, uint64_t); + +/* + * Not exported (xnu private) + */ + +/* flags to m_get/MGET */ +/* Need to include malloc.h to get right options for malloc */ +#include <sys/malloc.h> + +struct mbuf; + +/* length to m_copy to copy all */ +#define M_COPYALL 1000000000 + +#define M_DONTWAIT M_NOWAIT +#define M_WAIT 
M_WAITOK + +/* + * These macros are mapped to the appropriate KPIs, so that private code + * can be simply recompiled in order to be forward-compatible with future + * changes toward the struture sizes. + */ +#define MLEN mbuf_get_mlen() /* normal data len */ +#define MHLEN mbuf_get_mhlen() /* data len w/pkthdr */ + +#define MINCLSIZE mbuf_get_minclsize() /* cluster usage threshold */ + +extern void m_freem(struct mbuf *); +extern char *mcl_to_paddr(char *); +extern void m_adj(struct mbuf *, int); +extern void m_cat(struct mbuf *, struct mbuf *); +extern void m_copydata(struct mbuf *, int, int, void *); +extern struct mbuf *m_copym(struct mbuf *, int, int, int); +extern struct mbuf *m_get(int, int); +extern struct mbuf *m_gethdr(int, int); +extern struct mbuf *m_getpacket(void); +extern struct mbuf *m_getpackets(int, int, int); +extern struct mbuf *m_mclget(struct mbuf *, int); +extern void *m_mtod(struct mbuf *); +extern struct mbuf *m_prepend_2(struct mbuf *, int, int); +extern struct mbuf *m_pullup(struct mbuf *, int); +extern struct mbuf *m_split(struct mbuf *, int, int); +extern void m_mclfree(caddr_t p); + +__private_extern__ union mbigcluster *mbutl; /* start VA of mbuf pool */ +__private_extern__ union mbigcluster *embutl; /* end VA of mbuf pool */ +__private_extern__ unsigned int nmbclusters; /* number of mapped clusters */ +__private_extern__ int njcl; /* # of jumbo clusters */ +__private_extern__ int njclbytes; /* size of a jumbo cluster */ +__private_extern__ int max_linkhdr; /* largest link-level header */ +__private_extern__ int max_protohdr; /* largest protocol header */ +__private_extern__ int max_hdr; /* largest link+protocol header */ +__private_extern__ int max_datalen; /* MHLEN - max_hdr */ + +__private_extern__ unsigned int mbuf_default_ncl(int, u_int64_t); __private_extern__ void mbinit(void); __private_extern__ struct mbuf *m_clattach(struct mbuf *, int, caddr_t, - void (*)(caddr_t , u_int, caddr_t), u_int, caddr_t, int); + void (*)(caddr_t, 
u_int, caddr_t), u_int, caddr_t, int); __private_extern__ caddr_t m_bigalloc(int); __private_extern__ void m_bigfree(caddr_t, u_int, caddr_t); __private_extern__ struct mbuf *m_mbigget(struct mbuf *, int); __private_extern__ caddr_t m_16kalloc(int); __private_extern__ void m_16kfree(caddr_t, u_int, caddr_t); __private_extern__ struct mbuf *m_m16kget(struct mbuf *, int); -__private_extern__ void mbuf_growth_aggressive(void); -__private_extern__ void mbuf_growth_normal(void); - -/* Exported */ -struct mbuf *m_copym(struct mbuf *, int, int, int); -struct mbuf *m_split(struct mbuf *, int, int); -struct mbuf *m_free(struct mbuf *); -struct mbuf *m_get(int, int); -struct mbuf *m_getpacket(void); -struct mbuf *m_getclr(int, int); -struct mbuf *m_gethdr(int, int); -struct mbuf *m_prepend(struct mbuf *, int, int); -struct mbuf *m_prepend_2(struct mbuf *, int, int); -struct mbuf *m_pullup(struct mbuf *, int); -struct mbuf *m_retry(int, int); -struct mbuf *m_retryhdr(int, int); -void m_adj(struct mbuf *, int); -void m_freem(struct mbuf *); -int m_freem_list(struct mbuf *); -struct mbuf *m_devget(char *, int, int, struct ifnet *, void (*)(const void *, void *, size_t)); -char *mcl_to_paddr(char *); -struct mbuf *m_pulldown(struct mbuf*, int, int, int*); - -extern struct mbuf *m_getcl(int, int, int); -struct mbuf *m_mclget(struct mbuf *, int); -caddr_t m_mclalloc(int); -void m_mclfree(caddr_t p); -int m_mclhasreference(struct mbuf *); -void m_copy_pkthdr(struct mbuf *, struct mbuf*); - -int m_mclref(struct mbuf *); -int m_mclunref(struct mbuf *); - -void * m_mtod(struct mbuf *); -struct mbuf * m_dtom(void *); -int m_mtocl(void *); -union mcluster *m_cltom(int ); - -int m_trailingspace(struct mbuf *); -int m_leadingspace(struct mbuf *); - -struct mbuf *m_normalize(struct mbuf *m); -void m_mchtype(struct mbuf *m, int t); -void m_mcheck(struct mbuf*); - -extern void m_copyback(struct mbuf *, int , int , const void *); -extern struct mbuf *m_copyback_cow(struct mbuf *, int, int, 
const void *, int); -extern int m_makewritable(struct mbuf **, int, int, int); -void m_copydata(struct mbuf *, int , int , void *); -struct mbuf* m_dup(struct mbuf *m, int how); -void m_cat(struct mbuf *, struct mbuf *); -struct mbuf *m_copym_with_hdrs(struct mbuf*, int, int, int, struct mbuf**, int*); -struct mbuf *m_getpackets(int, int, int); -struct mbuf * m_getpackethdrs(int , int ); -struct mbuf* m_getpacket_how(int ); -struct mbuf * m_getpackets_internal(unsigned int *, int , int , int , size_t); -struct mbuf * m_allocpacket_internal(unsigned int * , size_t , unsigned int *, int , int , size_t ); -__END_DECLS +__private_extern__ struct mbuf *m_free(struct mbuf *); +__private_extern__ struct mbuf *m_getclr(int, int); +__private_extern__ struct mbuf *m_getptr(struct mbuf *, int, int *); +__private_extern__ unsigned int m_length(struct mbuf *); +__private_extern__ struct mbuf *m_prepend(struct mbuf *, int, int); +__private_extern__ struct mbuf *m_copyup(struct mbuf *, int, int); +__private_extern__ struct mbuf *m_retry(int, int); +__private_extern__ struct mbuf *m_retryhdr(int, int); +__private_extern__ int m_freem_list(struct mbuf *); +__private_extern__ int m_append(struct mbuf *, int, caddr_t); +__private_extern__ struct mbuf *m_last(struct mbuf *); +__private_extern__ struct mbuf *m_devget(char *, int, int, struct ifnet *, + void (*)(const void *, void *, size_t)); +__private_extern__ struct mbuf *m_pulldown(struct mbuf *, int, int, int *); + +__private_extern__ struct mbuf *m_getcl(int, int, int); +__private_extern__ caddr_t m_mclalloc(int); +__private_extern__ int m_mclhasreference(struct mbuf *); +__private_extern__ void m_copy_pkthdr(struct mbuf *, struct mbuf *); + +__private_extern__ struct mbuf *m_dtom(void *); +__private_extern__ int m_mtocl(void *); +__private_extern__ union mcluster *m_cltom(int); + +__private_extern__ int m_trailingspace(struct mbuf *); +__private_extern__ int m_leadingspace(struct mbuf *); + +__private_extern__ struct mbuf 
*m_normalize(struct mbuf *m); +__private_extern__ void m_mchtype(struct mbuf *m, int t); +__private_extern__ void m_mcheck(struct mbuf *); + +__private_extern__ void m_copyback(struct mbuf *, int, int, const void *); +__private_extern__ struct mbuf *m_copyback_cow(struct mbuf *, int, int, + const void *, int); +__private_extern__ int m_makewritable(struct mbuf **, int, int, int); +__private_extern__ struct mbuf *m_dup(struct mbuf *m, int how); +__private_extern__ struct mbuf *m_copym_with_hdrs(struct mbuf *, int, int, int, + struct mbuf **, int *); +__private_extern__ struct mbuf *m_getpackethdrs(int, int); +__private_extern__ struct mbuf *m_getpacket_how(int); +__private_extern__ struct mbuf *m_getpackets_internal(unsigned int *, int, + int, int, size_t); +__private_extern__ struct mbuf *m_allocpacket_internal(unsigned int *, size_t, + unsigned int *, int, int, size_t); /* - Packets may have annotations attached by affixing a list of "packet - tags" to the pkthdr structure. Packet tags are dynamically allocated - semi-opaque data structures that have a fixed header (struct m_tag) - that specifies the size of the memory block and an <id, type> pair that - identifies it. The id identifies the module and the type identifies the - type of data for that module. The id of zero is reserved for the kernel. - - Note that the packet tag returned by m_tag_allocate has the default - memory alignment implemented by malloc. To reference private data one - can use a construct like: - - struct m_tag *mtag = m_tag_allocate(...); - struct foo *p = (struct foo *)(mtag+1); - - if the alignment of struct m_tag is sufficient for referencing members - of struct foo. Otherwise it is necessary to embed struct m_tag within - the private data structure to insure proper alignment; e.g. - - struct foo { - struct m_tag tag; - ... 
- }; - struct foo *p = (struct foo *) m_tag_allocate(...); - struct m_tag *mtag = &p->tag; + * Packets may have annotations attached by affixing a list of "packet + * tags" to the pkthdr structure. Packet tags are dynamically allocated + * semi-opaque data structures that have a fixed header (struct m_tag) + * that specifies the size of the memory block and an <id, type> pair that + * identifies it. The id identifies the module and the type identifies the + * type of data for that module. The id of zero is reserved for the kernel. + * + * Note that the packet tag returned by m_tag_allocate has the default + * memory alignment implemented by malloc. To reference private data one + * can use a construct like: + * + * struct m_tag *mtag = m_tag_allocate(...); + * struct foo *p = (struct foo *)(mtag+1); + * + * if the alignment of struct m_tag is sufficient for referencing members + * of struct foo. Otherwise it is necessary to embed struct m_tag within + * the private data structure to insure proper alignment; e.g. + * + * struct foo { + * struct m_tag tag; + * ... + * }; + * struct foo *p = (struct foo *) m_tag_allocate(...); + * struct m_tag *mtag = &p->tag; */ -#define KERNEL_MODULE_TAG_ID 0 +#define KERNEL_MODULE_TAG_ID 0 enum { KERNEL_TAG_TYPE_NONE = 0, @@ -685,45 +850,27 @@ enum { KERNEL_TAG_TYPE_PF = 11 }; -/* - * As a temporary and low impact solution to replace the even uglier - * approach used so far in some parts of the network stack (which relies - * on global variables), packet tag-like annotations are stored in MT_TAG - * mbufs (or lookalikes) prepended to the actual mbuf chain. - * - * m_type = MT_TAG - * m_flags = m_tag_id - * m_next = next buffer in chain. - * - * BE VERY CAREFUL not to pass these blocks to the mbuf handling routines. 
- */ -#define _m_tag_id m_hdr.mh_flags - -__BEGIN_DECLS - /* Packet tag routines */ -struct m_tag *m_tag_alloc(u_int32_t id, u_int16_t type, int len, int wait); -void m_tag_free(struct m_tag *); -void m_tag_prepend(struct mbuf *, struct m_tag *); -void m_tag_unlink(struct mbuf *, struct m_tag *); -void m_tag_delete(struct mbuf *, struct m_tag *); -void m_tag_delete_chain(struct mbuf *, struct m_tag *); -struct m_tag *m_tag_locate(struct mbuf *,u_int32_t id, u_int16_t type, - struct m_tag *); -struct m_tag *m_tag_copy(struct m_tag *, int wait); -int m_tag_copy_chain(struct mbuf *to, struct mbuf *from, int wait); -void m_tag_init(struct mbuf *); -struct m_tag *m_tag_first(struct mbuf *); -struct m_tag *m_tag_next(struct mbuf *, struct m_tag *); - -extern void m_prio_init(struct mbuf *); -extern void m_prio_background(struct mbuf *); +__private_extern__ struct m_tag *m_tag_alloc(u_int32_t, u_int16_t, int, int); +__private_extern__ struct m_tag *m_tag_create(u_int32_t, u_int16_t, int, int, + struct mbuf *); +__private_extern__ void m_tag_free(struct m_tag *); +__private_extern__ void m_tag_prepend(struct mbuf *, struct m_tag *); +__private_extern__ void m_tag_unlink(struct mbuf *, struct m_tag *); +__private_extern__ void m_tag_delete(struct mbuf *, struct m_tag *); +__private_extern__ void m_tag_delete_chain(struct mbuf *, struct m_tag *); +__private_extern__ struct m_tag *m_tag_locate(struct mbuf *, u_int32_t, + u_int16_t, struct m_tag *); +__private_extern__ struct m_tag *m_tag_copy(struct m_tag *, int); +__private_extern__ int m_tag_copy_chain(struct mbuf *, struct mbuf *, int); +__private_extern__ void m_tag_init(struct mbuf *); +__private_extern__ struct m_tag *m_tag_first(struct mbuf *); +__private_extern__ struct m_tag *m_tag_next(struct mbuf *, struct m_tag *); + +__private_extern__ void m_prio_init(struct mbuf *); __END_DECLS - -#endif /* KERNEL */ - -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #ifdef KERNEL #include #endif /* KERNEL */ diff 
--git a/bsd/sys/mcache.h b/bsd/sys/mcache.h index 21a169223..443e05b01 100644 --- a/bsd/sys/mcache.h +++ b/bsd/sys/mcache.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006-2007 Apple Inc. All rights reserved. + * Copyright (c) 2006-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -38,6 +38,7 @@ extern "C" { #include #include #include +#include #ifdef ASSERT #undef ASSERT @@ -57,11 +58,40 @@ extern "C" { #define ASSERT(EX) ((void)0) #endif -#if defined(__ppc__) -#define CPU_CACHE_SIZE 128 +#define atomic_add_16_ov(a, n) \ + ((u_int16_t) OSAddAtomic16(n, (volatile SInt16 *)a)) + +#define atomic_add_16(a, n) \ + ((void) atomic_add_16_ov(a, n)) + +#define atomic_add_32_ov(a, n) \ + ((u_int32_t) OSAddAtomic(n, (volatile SInt32 *)a)) + +#define atomic_add_32(a, n) \ + ((void) atomic_add_32_ov(a, n)) + +#define atomic_add_64_ov(a, n) \ + ((u_int64_t) OSAddAtomic64(n, (volatile SInt64 *)a)) + +#define atomic_add_64(a, n) \ + ((void) atomic_add_64_ov(a, n)) + +#define atomic_set_64(a, n) do { \ + while (!OSCompareAndSwap64(*a, n, (volatile UInt64 *)a)) \ + ; \ +} while (0) + +#if defined(__LP64__) +#define atomic_get_64(n, a) do { \ + (n) = *(a); \ +} while (0) #else +#define atomic_get_64(n, a) do { \ + (n) = atomic_add_64_ov(a, 0); \ +} while (0) +#endif /* __LP64__ */ + #define CPU_CACHE_SIZE 64 -#endif #ifndef IS_P2ALIGNED #define IS_P2ALIGNED(v, a) \ @@ -152,6 +182,7 @@ typedef unsigned int (*mcache_allocfn_t)(void *, mcache_obj_t ***, unsigned int, int); typedef void (*mcache_freefn_t)(void *, mcache_obj_t *, boolean_t); typedef void (*mcache_auditfn_t)(void *, mcache_obj_t *, boolean_t); +typedef void (*mcache_logfn_t)(u_int32_t, mcache_obj_t *, boolean_t); typedef void (*mcache_notifyfn_t)(void *, u_int32_t); typedef struct mcache { @@ -164,6 +195,7 @@ typedef struct mcache { mcache_allocfn_t mc_slab_alloc; /* slab layer allocate callback */ mcache_freefn_t mc_slab_free; /* slab layer free callback */ mcache_auditfn_t mc_slab_audit; 
/* slab layer audit callback */ + mcache_logfn_t mc_slab_log; /* slab layer log callback */ mcache_notifyfn_t mc_slab_notify; /* slab layer notify callback */ void *mc_private; /* opaque arg to callbacks */ size_t mc_bufsize; /* object size */ @@ -210,11 +242,12 @@ typedef struct mcache { /* Valid values for mc_flags */ #define MCF_VERIFY 0x00000001 /* enable verification */ -#define MCF_AUDIT 0x00000002 /* enable transaction auditing */ +#define MCF_TRACE 0x00000002 /* enable transaction auditing */ #define MCF_NOCPUCACHE 0x00000010 /* disable CPU layer caching */ +#define MCF_NOLEAKLOG 0x00000100 /* disable leak logging */ -#define MCF_DEBUG (MCF_VERIFY | MCF_AUDIT) -#define MCF_FLAGS_MASK (MCF_DEBUG | MCF_NOCPUCACHE) +#define MCF_DEBUG (MCF_VERIFY | MCF_TRACE) +#define MCF_FLAGS_MASK (MCF_DEBUG | MCF_NOCPUCACHE | MCF_NOLEAKLOG) /* Valid values for notify callback */ #define MCN_RETRYALLOC 0x00000001 /* Allocation should be retried */ @@ -245,8 +278,8 @@ __private_extern__ mcache_t *mcache_create(const char *, size_t, __private_extern__ void *mcache_alloc(mcache_t *, int); __private_extern__ void mcache_free(mcache_t *, void *); __private_extern__ mcache_t *mcache_create_ext(const char *, size_t, - mcache_allocfn_t, mcache_freefn_t, mcache_auditfn_t, mcache_notifyfn_t, - void *, u_int32_t, int); + mcache_allocfn_t, mcache_freefn_t, mcache_auditfn_t, mcache_logfn_t, + mcache_notifyfn_t, void *, u_int32_t, int); __private_extern__ void mcache_destroy(mcache_t *); __private_extern__ unsigned int mcache_alloc_ext(mcache_t *, mcache_obj_t **, unsigned int, int); diff --git a/bsd/sys/mman.h b/bsd/sys/mman.h index a82aec943..109c63634 100644 --- a/bsd/sys/mman.h +++ b/bsd/sys/mman.h @@ -130,6 +130,7 @@ typedef __darwin_size_t size_t; #define MAP_NOEXTEND 0x0100 /* for MAP_FILE, don't change file size */ #define MAP_HASSEMAPHORE 0x0200 /* region may contain semaphores */ #define MAP_NOCACHE 0x0400 /* don't cache pages for this mapping */ +#define MAP_JIT 0x0800 /* 
Allocate a region that will be used for JIT purposes */ #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ /* diff --git a/bsd/sys/mount.h b/bsd/sys/mount.h index 8633a1465..f6594c0ef 100644 --- a/bsd/sys/mount.h +++ b/bsd/sys/mount.h @@ -293,9 +293,6 @@ struct vfs_attr { * NFS export related mount flags. */ #define MNT_EXPORTED 0x00000100 /* file system is exported */ -#ifdef PRIVATE -#define MNT_IMGSRC 0x00000200 -#endif /* CONFIG_IMGSRC_ACCESS */ /* * MAC labeled / "quarantined" flag @@ -319,6 +316,9 @@ struct vfs_attr { #define MNT_DEFWRITE 0x02000000 /* filesystem should defer writes */ #define MNT_MULTILABEL 0x04000000 /* MAC support for individual labels */ #define MNT_NOATIME 0x10000000 /* disable update of file access time */ +#ifdef BSD_KERNEL_PRIVATE +/* #define MNT_IMGSRC_BY_INDEX 0x20000000 see sys/imgsrc.h */ +#endif /* BSD_KERNEL_PRIVATE */ /* backwards compatibility only */ #define MNT_UNKNOWNPERMISSIONS MNT_IGNORE_OWNERSHIP @@ -334,7 +334,8 @@ struct vfs_attr { MNT_LOCAL | MNT_QUOTA | \ MNT_ROOTFS | MNT_DOVOLFS | MNT_DONTBROWSE | \ MNT_IGNORE_OWNERSHIP | MNT_AUTOMOUNTED | MNT_JOURNALED | \ - MNT_NOUSERXATTR | MNT_DEFWRITE | MNT_MULTILABEL | MNT_NOATIME | MNT_CPROTECT ) + MNT_NOUSERXATTR | MNT_DEFWRITE | MNT_MULTILABEL | \ + MNT_NOATIME | MNT_CPROTECT) /* * External filesystem command modifier flags. * Unmount can use the MNT_FORCE flag. 
@@ -440,6 +441,7 @@ union union_vfsidctl { /* the fields vc_vers and vc_fsid are compatible */ #define VFS_CTL_NEWADDR 0x00010004 /* reconnect to new address */ #define VFS_CTL_TIMEO 0x00010005 /* set timeout for vfs notification */ #define VFS_CTL_NOLOCKS 0x00010006 /* disable file locking */ +#define VFS_CTL_SADDR 0x00010007 /* get server address */ struct vfsquery { u_int32_t vq_flags; @@ -684,6 +686,9 @@ struct vfsops { /* * flags passed into vfs_iterate */ +#ifdef PRIVATE +#define VFS_ITERATE_TAIL_FIRST (1 << 0) +#endif /* PRIVATE */ /* * return values from callback @@ -1164,14 +1169,88 @@ void vfs_event_signal(fsid_t *, u_int32_t, intptr_t); */ void vfs_event_init(void); /* XXX We should not export this */ #ifdef KERNEL_PRIVATE +int vfs_getbyid(fsid_t *fsid, ino64_t ino, vnode_t *vpp, vfs_context_t ctx); int vfs_getattr(mount_t mp, struct vfs_attr *vfa, vfs_context_t ctx); int vfs_setattr(mount_t mp, struct vfs_attr *vfa, vfs_context_t ctx); int vfs_extendedsecurity(mount_t); mount_t vfs_getvfs_by_mntonname(char *); void vfs_markdependency(mount_t); vnode_t vfs_vnodecovered(mount_t mp); /* Returns vnode with an iocount that must be released with vnode_put() */ -void * vfs_mntlabel(mount_t mp); /* Safe to cast to "struct label*"; returns "void*" to limit dependence of mount.h on security headers. */ +vnode_t vfs_devvp(mount_t mp); /* Please see block comment with implementation */ +void * vfs_mntlabel(mount_t mp); /* Safe to cast to "struct label*"; returns "void*" to limit dependence of mount.h on security headers. */ void vfs_setunmountpreflight(mount_t mp); +void vfs_setcompoundopen(mount_t mp); +uint64_t vfs_throttle_mask(mount_t mp); + +struct vnode_trigger_info; + +/*! + @function vfs_addtrigger + @abstract Create an "external" trigger vnode: look up a vnode and mark it as + a trigger. Can only safely be called in the context of a callback set by + vfs_settriggercallback(). May only be used on a file which is not already + marked as a trigger. 
+ @param relpath Path relative to root of mountpoint at which to mark trigger. + @param vtip Information about trigger; analogous to "vnode_trigger_param" + argument to vnode_create. + @param ctx Authorization context. + */ +int vfs_addtrigger(mount_t mp, const char *relpath, struct vnode_trigger_info *vtip, vfs_context_t ctx); + + +/*! + @enum vfs_trigger_callback_op_t + @abstract Operation to perform after an attempted unmount (successful or otherwise). + @constant VTC_REPLACE Unmount failed: attempt to replace triggers. Only valid + VFS operation to perform in this context is vfs_addtrigger(). + @constant VTC_RELEASE Unmount succeeded: release external triggering context. + */ +typedef enum { + VTC_REPLACE, + VTC_RELEASE +} vfs_trigger_callback_op_t; + +/*! + @typedef vfs_trigger_callback_t + @abstract Callback to be passed to vfs_settriggercallback() and invoked from + unmount context. + @param mp Mountpoint on which unmount is occurring. + @param op Operation (see vfs_trigger_callback_op_t) + @param data Context passed to vfs_settriggercallback() + @param ctx Authorization context in which unmount is occurring. + */ +typedef void vfs_trigger_callback_t(mount_t mp, vfs_trigger_callback_op_t op, void *data, vfs_context_t ctx); + +/*! + @function vfs_settriggercallback + @abstract Install a callback to be called after unmount attempts on a volume, + to restore triggers for failed unmounts and release state for successful ones. + @discussion Installs a callback which will be called in two situations: a + failed unmount where vnodes may have been reclaimed and a successful unmount. + Gives an external trigger-marking entity an opportunity to replace triggers + which may have been reclaimed. The callback can only be installed (not + cleared), and only one callback can be installed. 
The callback will be called + with a read-write lock held on the mount point; in the VTC_REPLACE case, the + only valid VFS operation to perform in the context of the callback is + vfs_addtrigger() on the mountpoint in question. This rwlock is held in order + to attempt to provide some modicum of coverage from lookups which might find + missing trigger vnodes and receive spurious ENOENTs. Note that this + protection is incomplete--current working directories, or traversals up into a + volume via ".." may still find missing triggers. As of this writing, no + serialization mechanism exists to do better than this. + When the "op" is VTC_RELEASE, the mountpoint is going away, and the only valid + VFS operation is to free the private data pointer if needed. The callback + will be called immediately, with VTC_REPLACE, from vfs_settriggercallback(), + if installation is successful. + @param fsid FSID for filesystem in question. + @param vtc Callback pointer. + @param data Context pointer to be passed to callback. + @param flags Currently unused. + @param ctx Authorization context. + @return 0 for success. EBUSY if a trigger has already been installed. 
+ */ +int vfs_settriggercallback(fsid_t *fsid, vfs_trigger_callback_t vtc, void *data, uint32_t flags, vfs_context_t ctx); + #endif /* KERNEL_PRIVATE */ __END_DECLS diff --git a/bsd/sys/mount_internal.h b/bsd/sys/mount_internal.h index b069b1a0f..141fb3eeb 100644 --- a/bsd/sys/mount_internal.h +++ b/bsd/sys/mount_internal.h @@ -115,6 +115,7 @@ struct mount { struct vnodelst mnt_newvnodes; /* list of vnodes this mount */ uint32_t mnt_flag; /* flags */ uint32_t mnt_kern_flag; /* kernel only flags */ + uint32_t mnt_compound_ops; /* Available compound operations */ uint32_t mnt_lflag; /* mount life cycle flags */ uint32_t mnt_maxsymlinklen; /* max size of short symlink */ struct vfsstatfs mnt_vfsstat; /* cache of filesystem stats */ @@ -131,17 +132,22 @@ struct mount { uint32_t mnt_ioqueue_depth; /* the maxiumum number of commands a device can accept */ uint32_t mnt_ioscale; /* scale the various throttles/limits imposed on the amount of I/O in flight */ uint32_t mnt_ioflags; /* flags for underlying device */ - pending_io_t mnt_pending_write_size; /* byte count of pending writes */ - pending_io_t mnt_pending_read_size; /* byte count of pending reads */ + pending_io_t mnt_pending_write_size __attribute__((aligned(sizeof(pending_io_t)))); /* byte count of pending writes */ + pending_io_t mnt_pending_read_size __attribute__((aligned(sizeof(pending_io_t)))); /* byte count of pending reads */ lck_rw_t mnt_rwlock; /* mutex readwrite lock */ lck_mtx_t mnt_renamelock; /* mutex that serializes renames that change shape of tree */ vnode_t mnt_devvp; /* the device mounted on for local file systems */ uint32_t mnt_devbsdunit; /* the BSD unit number of the device */ + uint64_t mnt_throttle_mask; /* the throttle mask of what devices will be affected by I/O from this mnt */ void *mnt_throttle_info; /* used by the throttle code */ int32_t mnt_crossref; /* refernces to cover lookups crossing into mp */ int32_t mnt_iterref; /* refernces to cover iterations; drained makes it -ve */ - +#if 
CONFIG_TRIGGERS + int32_t mnt_numtriggers; /* num of trigger vnodes for this mount */ + vfs_trigger_callback_t *mnt_triggercallback; + void *mnt_triggerdata; +#endif /* XXX 3762912 hack to support HFS filesystem 'owner' */ uid_t mnt_fsowner; gid_t mnt_fsgroup; @@ -190,6 +196,7 @@ struct mount { */ pid_t mnt_dependent_pid; void *mnt_dependent_process; + char fstypename_override[MFSTYPENAMELEN]; }; /* @@ -228,6 +235,12 @@ extern struct mount * dead_mountp; * because the bits here were broken out from the high bits * of the mount flags. */ +#define MNTK_DENY_READDIREXT 0x00000200 /* Deny Extended-style readdir's for this volume */ +#define MNTK_PERMIT_UNMOUNT 0x00000400 /* Allow (non-forced) unmounts by UIDs other than the one that mounted the volume */ +#ifdef NFSCLIENT +#define MNTK_TYPENAME_OVERRIDE 0x00000800 /* override the fstypename for statfs() */ +#endif /* NFSCLIENT */ +#define MNTK_KERNEL_MOUNT 0x00001000 /* mount came from kernel side */ #ifdef CONFIG_IMGSRC_ACCESS #define MNTK_HAS_MOVED 0x00002000 #define MNTK_BACKS_ROOT 0x00004000 @@ -392,13 +405,11 @@ struct user32_statfs { }; /* - * throttle I/Os are affected only by normal I/Os happening on the same bsd device node. For example, disk1s3 and - * disk1s5 are the same device node, while disk1s3 and disk2 are not (although disk2 might be a mounted disk image file - * and the disk image file resides on a partition in disk1). The following constant defines the maximum number of - * different bsd device nodes the algorithm can consider, and larger numbers are rounded by this maximum. Since - * throttled I/O is usually useful in non-server environment only, a small number 16 is enough in most cases + * throttle I/Os are affected only by normal I/Os happening on the same spindle. Currently we use a 64-bit integer to + * represent what devices are affected, so we can handle at most 64 different spindles. 
Since + * throttled I/O is usually useful in non-server environment only, this number is enough in most cases. */ -#define LOWPRI_MAX_NUM_DEV 16 +#define LOWPRI_MAX_NUM_DEV 64 __BEGIN_DECLS @@ -425,7 +436,7 @@ void vfs_unmountall(void); int safedounmount(struct mount *, int, vfs_context_t); int dounmount(struct mount *, int, int, vfs_context_t); -/* xnuy internal api */ +/* xnu internal api */ void mount_dropcrossref(mount_t, vnode_t, int); mount_t mount_lookupby_volfsid(int, int); mount_t mount_list_lookupby_fsid(fsid_t *, int, int); @@ -437,11 +448,31 @@ void mount_iterdrop(mount_t); void mount_iterdrain(mount_t); void mount_iterreset(mount_t); +/* tags a volume as not supporting extended readdir for NFS exports */ +#ifdef BSD_KERNEL_PRIVATE +void mount_set_noreaddirext (mount_t); +#endif + +/* Private NFS spi */ +#define KERNEL_MOUNT_NOAUTH 0x01 /* Don't check the UID of the directory we are mounting on */ +#define KERNEL_MOUNT_PERMIT_UNMOUNT 0x02 /* Allow (non-forced) unmounts by users other than the one who mounted the volume */ +#if NFSCLIENT +/* + * NOTE: kernel_mount() does not force MNT_NOSUID, MNT_NOEXEC, or MNT_NODEV for non-privileged + * mounting credentials, as the mount(2) system call does. + */ +int kernel_mount(char *, vnode_t, vnode_t, const char *, void *, size_t, int, uint32_t, vfs_context_t); +boolean_t vfs_iskernelmount(mount_t); +#endif + /* throttled I/O api */ int throttle_get_io_policy(struct uthread **ut); -extern void throttle_lowpri_io(boolean_t ok_to_sleep); int throttle_io_will_be_throttled(int lowpri_window_msecs, mount_t mp); +/* throttled I/O helper function */ +/* convert the lowest bit to a device index */ +extern int num_trailing_0(uint64_t n); + __END_DECLS #endif /* !_SYS_MOUNT_INTERNAL_H_ */ diff --git a/bsd/sys/msgbuf.h b/bsd/sys/msgbuf.h index e05b73e9e..5b8211cac 100644 --- a/bsd/sys/msgbuf.h +++ b/bsd/sys/msgbuf.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. 
+ * Copyright (c) 2000-2010 Apple, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -65,22 +65,24 @@ #include -#define MSG_BSIZE 4096 +#define MAX_MSG_BSIZE (1*1024*1024) struct msgbuf { #define MSG_MAGIC 0x063061 - long msg_magic; - long msg_size; - long msg_bufx; /* write pointer */ - long msg_bufr; /* read pointer */ - char *msg_bufc; /* buffer */ + int msg_magic; + int msg_size; + int msg_bufx; /* write pointer */ + int msg_bufr; /* read pointer */ + char *msg_bufc; /* buffer */ }; -#ifdef KERNEL + +#ifdef XNU_KERNEL_PRIVATE __BEGIN_DECLS extern struct msgbuf *msgbufp; extern void log_putc(char); extern void log_putc_locked(char); -extern void log_setsize(long size); +extern int log_setsize(int size); extern int log_dmesg(user_addr_t, uint32_t, int32_t *); __END_DECLS -#endif +#endif /* XNU_KERNEL_PRIVATE */ + #endif /* !_SYS_MSGBUF_H_ */ diff --git a/bsd/sys/namei.h b/bsd/sys/namei.h index 5aa2f701a..56d3ecf13 100644 --- a/bsd/sys/namei.h +++ b/bsd/sys/namei.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -90,12 +90,15 @@ /* * Encapsulation of namei parameters. */ -struct nameidata { +struct nameidata { /* * Arguments to namei/lookup. */ user_addr_t ni_dirp; /* pathname pointer */ enum uio_seg ni_segflg; /* location of pathname */ +#if CONFIG_TRIGGERS + enum path_operation ni_op; /* intended operation, see enum path_operation in vnode.h */ +#endif /* CONFIG_TRIGGERS */ /* * Arguments to lookup. 
*/ @@ -116,8 +119,25 @@ struct nameidata { u_long ni_loopcnt; /* count of symlinks encountered */ struct componentname ni_cnd; + int32_t ni_flag; + int ni_ncgeneration; /* For a batched vnop, grab generation beforehand */ }; +#define NAMEI_CONTLOOKUP 0x002 /* Continue processing a lookup which was partially processed in a compound VNOP */ +#define NAMEI_TRAILINGSLASH 0x004 /* There was at least one trailing slash after last component */ +#define NAMEI_UNFINISHED 0x008 /* We broke off a lookup to do a compound op */ +/* + * XXX Hack: we need to encode the intended VNOP in order to + * be able to include information about which operations a filesystem + * supports in the decision to break off a lookup early. + */ +#define NAMEI_COMPOUNDOPEN 0x010 +#define NAMEI_COMPOUNDREMOVE 0x020 +#define NAMEI_COMPOUNDMKDIR 0x040 +#define NAMEI_COMPOUNDRMDIR 0x080 +#define NAMEI_COMPOUNDRENAME 0x100 +#define NAMEI_COMPOUND_OP_MASK (NAMEI_COMPOUNDOPEN | NAMEI_COMPOUNDREMOVE | NAMEI_COMPOUNDMKDIR | NAMEI_COMPOUNDRMDIR | NAMEI_COMPOUNDRENAME) + #ifdef KERNEL /* * namei operational modifier flags, stored in ni_cnd.flags @@ -169,7 +189,27 @@ struct nameidata { /* * Initialization of an nameidata structure. */ -#define NDINIT(ndp, op, flags, segflg, namep, ctx) { \ + +#if CONFIG_TRIGGERS +/* Note: vnode triggers require more precise path operation (ni_op) */ + +#define NDINIT(ndp, op, pop, flags, segflg, namep, ctx) { \ + (ndp)->ni_cnd.cn_nameiop = op; \ + (ndp)->ni_op = pop; \ + (ndp)->ni_cnd.cn_flags = flags; \ + if ((segflg) == UIO_USERSPACE) { \ + (ndp)->ni_segflg = ((IS_64BIT_PROCESS(vfs_context_proc(ctx))) ? 
UIO_USERSPACE64 : UIO_USERSPACE32); \ + } \ + else { \ + (ndp)->ni_segflg = segflg; \ + } \ + (ndp)->ni_dirp = namep; \ + (ndp)->ni_cnd.cn_context = ctx; \ + (ndp)->ni_flag = 0; \ + (ndp)->ni_cnd.cn_ndp = (ndp); \ +} +#else +#define NDINIT(ndp, op, _unused_, flags, segflg, namep, ctx) { \ (ndp)->ni_cnd.cn_nameiop = op; \ (ndp)->ni_cnd.cn_flags = flags; \ if ((segflg) == UIO_USERSPACE) { \ @@ -180,7 +220,11 @@ struct nameidata { } \ (ndp)->ni_dirp = namep; \ (ndp)->ni_cnd.cn_context = ctx; \ + (ndp)->ni_flag = 0; \ + (ndp)->ni_cnd.cn_ndp = (ndp); \ } +#endif /* CONFIG_TRIGGERS */ + #endif /* KERNEL */ /* @@ -210,21 +254,25 @@ struct namecache { int namei(struct nameidata *ndp); void nameidone(struct nameidata *); +void namei_unlock_fsnode(struct nameidata *ndp); int lookup(struct nameidata *ndp); int relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp); +void lookup_compound_vnop_post_hook(int error, vnode_t dvp, vnode_t vp, struct nameidata *ndp, int did_create); /* * namecache function prototypes */ void cache_purgevfs(mount_t mp); int cache_lookup_path(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, - vfs_context_t context, int *trailing_slash, int *dp_authorized, vnode_t last_dp); + vfs_context_t context, int *dp_authorized, vnode_t last_dp); void vnode_cache_authorized_action(vnode_t vp, vfs_context_t context, kauth_action_t action); void vnode_uncache_authorized_action(vnode_t vp, kauth_action_t action); boolean_t vnode_cache_is_stale(vnode_t vp); boolean_t vnode_cache_is_authorized(vnode_t vp, vfs_context_t context, kauth_action_t action); +int lookup_validate_creation_path(struct nameidata *ndp); +int namei_compound_available(vnode_t dp, struct nameidata *ndp); #endif /* KERNEL */ diff --git a/bsd/dev/ppc/memmove.c b/bsd/sys/netboot.h similarity index 72% rename from bsd/dev/ppc/memmove.c rename to bsd/sys/netboot.h index e102c248a..717100d7f 100644 --- a/bsd/dev/ppc/memmove.c +++ b/bsd/sys/netboot.h @@ -1,5 +1,5 @@ /* - * 
Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,30 +25,26 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* Copyright (c) 1991,1993 NeXT Computer, Inc. All rights reserved. - * - */ -#include +/* + * netboot.h + * - definitions for network booting/rooting + */ -void ovbcopy(const void *src, void *dst, size_t ulen) -{ - bcopy(src, dst, ulen); -} +#ifndef _SYS_NETBOOT_H +#define _SYS_NETBOOT_H -#if 0 -void *memcpy(void *dst, const void *src, unsigned int ulen) -{ - bcopy(src, dst, ulen); - return dst; -} +#include +#include -void *memmove(void *dst, const void *src, unsigned int ulen) -{ - bcopy(src, dst, ulen); - return dst; -} +int netboot_setup(void); +int netboot_mountroot(void); +int netboot_root(void); -#endif /* 0 */ +boolean_t netboot_iaddr(struct in_addr * iaddr_p); +boolean_t netboot_rootpath(struct in_addr * server_ip, + char * name, int name_len, + char * path, int path_len); +#endif /* _SYS_NETBOOT_H */ diff --git a/bsd/sys/priv.h b/bsd/sys/priv.h new file mode 100644 index 000000000..1abb898bf --- /dev/null +++ b/bsd/sys/priv.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. 
+ * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/*- + * Copyright (c) 2006 nCircle Network Security, Inc. + * All rights reserved. + * + * This software was developed by Robert N. M. Watson for the TrustedBSD + * Project under contract to nCircle Network Security, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR, NCIRCLE NETWORK SECURITY, + * INC., OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD: src/sys/sys/priv.h,v 1.38.2.1.2.1 2009/10/25 01:10:29 kensmith Exp $ + */ + +/* + * Kernel privilege checking interface. + */ +#ifndef _SYS_PRIV_H_ +#define _SYS_PRIV_H_ + +/* + * Privilege list, sorted loosely by kernel subsystem. + * + * Think carefully before adding or reusing one of these privileges -- are + * there existing instances referring to the same privilege? Particular + * numeric privilege assignments are part of the kernel extension ABI. + */ + +/* + * The remaining privileges typically correspond to one or a small + * number of specific privilege checks, and have (relatively) precise + * meanings. They are loosely sorted into a set of base system + * privileges, such as the ability to reboot, and then loosely by + * subsystem, indicated by a subsystem name. + */ +#define PRIV_ADJTIME 1000 /* Set time adjustment. */ + +/* + * IPv4 and IPv6 privileges. + */ +#define PRIV_NETINET_RESERVEDPORT 11000 /* Bind low port number. */ + +#ifdef KERNEL +/* + * Privilege check interface. No flags are currently defined for the API. 
+ */ +#include +int priv_check_cred(kauth_cred_t cred, int priv, int flags); +#endif + +#endif /* !_SYS_PRIV_H_ */ diff --git a/bsd/sys/proc.h b/bsd/sys/proc.h index 92c86c0a1..8cec174a3 100644 --- a/bsd/sys/proc.h +++ b/bsd/sys/proc.h @@ -167,7 +167,7 @@ struct extern_proc { #define P_TIMEOUT 0x00000400 /* Timing out during sleep */ #define P_TRACED 0x00000800 /* Debugged process being traced */ -#define P_RESV3 0x00001000 /* (P_WAITED)Debugging prc has waited for child */ +#define P_DISABLE_ASLR 0x00001000 /* Disable address space layout randomization */ #define P_WEXIT 0x00002000 /* Working on exiting */ #define P_EXEC 0x00004000 /* Process called exec. */ @@ -252,7 +252,7 @@ extern int proc_pid(proc_t); extern int proc_ppid(proc_t); /* returns 1 if the process is marked for no remote hangs */ extern int proc_noremotehang(proc_t); -/* returns 1 is the process is marked for force quota */ +/* returns 1 if the process is marked for force quota */ extern int proc_forcequota(proc_t); /* this routine returns 1 if the process is running with 64bit address space, else 0 */ @@ -292,9 +292,41 @@ extern int msleep1(void *chan, lck_mtx_t *mtx, int pri, const char *wmesg, u_int extern int proc_pidversion(proc_t); extern int proc_getcdhash(proc_t, unsigned char *); #endif /* KERNEL_PRIVATE */ +#ifdef XNU_KERNEL_PRIVATE +/* + * This returns a unique 64bit id of a given process. + * Caller needs to hold proper reference on the + * passed in process structure. 
+ */ +extern uint64_t proc_uniqueid(proc_t); +extern uint64_t proc_selfuniqueid(void); +extern void proc_getexecutableuuid(proc_t, unsigned char *, unsigned long); +#endif /* XNU_KERNEL_PRIVATE*/ __END_DECLS #endif /* KERNEL */ +#ifdef PRIVATE + +/* Values for pid_shutdown_sockets */ +#ifdef KERNEL +#define SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL 0x0 +#endif /* KERNEL */ +#define SHUTDOWN_SOCKET_LEVEL_DISCONNECT_SVC 0x1 +#define SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL 0x2 + +#ifndef KERNEL + +__BEGIN_DECLS + +int pid_suspend(int pid); +int pid_resume(int pid); + + +__END_DECLS + +#endif /* !KERNEL */ +#endif /* PRIVATE */ + #endif /* !_SYS_PROC_H_ */ diff --git a/bsd/sys/proc_info.h b/bsd/sys/proc_info.h index e22cd3ab4..67842664d 100644 --- a/bsd/sys/proc_info.h +++ b/bsd/sys/proc_info.h @@ -50,6 +50,7 @@ __BEGIN_DECLS #define PROC_TTY_ONLY 3 #define PROC_UID_ONLY 4 #define PROC_RUID_ONLY 5 +#define PROC_PPID_ONLY 6 struct proc_bsdinfo { uint32_t pbi_flags; /* 64bit; emulated etc */ @@ -77,25 +78,47 @@ struct proc_bsdinfo { }; +struct proc_bsdshortinfo { + uint32_t pbsi_pid; /* process id */ + uint32_t pbsi_ppid; /* process parent id */ + uint32_t pbsi_pgid; /* process perp id */ + uint32_t pbsi_status; /* p_stat value, SZOMB, SRUN, etc */ + char pbsi_comm[MAXCOMLEN]; /* upto 16 characters of process name */ + uint32_t pbsi_flags; /* 64bit; emulated etc */ + uid_t pbsi_uid; /* current uid on process */ + gid_t pbsi_gid; /* current gid on process */ + uid_t pbsi_ruid; /* current ruid on process */ + gid_t pbsi_rgid; /* current tgid on process */ + uid_t pbsi_svuid; /* current svuid on process */ + gid_t pbsi_svgid; /* current svgid on process */ + uint32_t pbsi_rfu; /* reserved for future use*/ +}; + /* pbi_flags values */ -#define PROC_FLAG_SYSTEM 1 -#define PROC_FLAG_TRACED 2 -#define PROC_FLAG_INEXIT 4 +#define PROC_FLAG_SYSTEM 1 /* System process */ +#define PROC_FLAG_TRACED 2 /* process currently being traced, possibly by gdb */ +#define PROC_FLAG_INEXIT 4 /* 
process is working its way in exit() */ #define PROC_FLAG_PPWAIT 8 -#define PROC_FLAG_LP64 0x10 -#define PROC_FLAG_SLEADER 0x20 -#define PROC_FLAG_CTTY 0x40 -#define PROC_FLAG_CONTROLT 0x80 -#define PROC_FLAG_THCWD 0x100 +#define PROC_FLAG_LP64 0x10 /* 64bit process */ +#define PROC_FLAG_SLEADER 0x20 /* The process is the session leader */ +#define PROC_FLAG_CTTY 0x40 /* process has a control tty */ +#define PROC_FLAG_CONTROLT 0x80 /* Has a controlling terminal */ +#define PROC_FLAG_THCWD 0x100 /* process has a thread with cwd */ /* process control bits for resource starvation */ -#define PROC_FLAG_PC_THROTTLE 0x200 -#define PROC_FLAG_PC_SUSP 0x400 -#define PROC_FLAG_PC_KILL 0x600 +#define PROC_FLAG_PC_THROTTLE 0x200 /* In resource starvation situations, this process is to be throttled */ +#define PROC_FLAG_PC_SUSP 0x400 /* In resource starvation situations, this process is to be suspended */ +#define PROC_FLAG_PC_KILL 0x600 /* In resource starvation situations, this process is to be terminated */ #define PROC_FLAG_PC_MASK 0x600 /* process action bits for resource starvation */ -#define PROC_FLAG_PA_THROTTLE 0x800 -#define PROC_FLAG_PA_SUSP 0x1000 +#define PROC_FLAG_PA_THROTTLE 0x800 /* The process is currently throttled due to resource starvation */ +#define PROC_FLAG_PA_SUSP 0x1000 /* The process is currently suspended due to resource starvation */ +#define PROC_FLAG_PSUGID 0x2000 /* process has set privileges since last exec */ +#define PROC_FLAG_EXEC 0x4000 /* process has called exec */ +#ifdef PRIVATE +#define PROC_FLAG_DARWINBG 0x8000 /* process in darwin background */ +#define PROC_FLAG_EXT_DARWINBG 0x10000 /* process in darwin background - external enforcement */ +#endif struct proc_taskinfo { @@ -174,6 +197,7 @@ struct proc_regioninfo { #define SM_TRUESHARED 5 #define SM_PRIVATE_ALIASED 6 #define SM_SHARED_ALIASED 7 +#define SM_LARGE_PAGE 8 /* @@ -199,9 +223,16 @@ struct proc_workqueueinfo { uint32_t pwq_nthreads; /* total number of workqueue threads */ 
uint32_t pwq_runthreads; /* total number of running workqueue threads */ uint32_t pwq_blockedthreads; /* total number of blocked workqueue threads */ - uint32_t reserved[1]; /* reserved for future use */ + uint32_t pwq_state; }; +/* + * workqueue state (pwq_state field) + */ +#define WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT 0x1 +#define WQ_EXCEEDED_TOTAL_THREAD_LIMIT 0x2 + + struct proc_fileinfo { uint32_t fi_openflags; uint32_t fi_status; @@ -561,6 +592,11 @@ struct proc_fdinfo { uint32_t proc_fdtype; }; +struct proc_fileportinfo { + uint32_t proc_fileport; + uint32_t proc_fdtype; +}; + /* Flavors for proc_pidinfo() */ #define PROC_PIDLISTFDS 1 #define PROC_PIDLISTFD_SIZE (sizeof(struct proc_fdinfo)) @@ -600,6 +636,12 @@ struct proc_fdinfo { #define PROC_PIDWORKQUEUEINFO 12 #define PROC_PIDWORKQUEUEINFO_SIZE (sizeof(struct proc_workqueueinfo)) +#define PROC_PIDT_SHORTBSDINFO 13 +#define PROC_PIDT_SHORTBSDINFO_SIZE (sizeof(struct proc_bsdshortinfo)) + +#define PROC_PIDLISTFILEPORTS 14 +#define PROC_PIDLISTFILEPORTS_SIZE (sizeof(struct proc_fileportinfo)) + /* Flavors for proc_pidfdinfo */ #define PROC_PIDFDVNODEINFO 1 @@ -626,9 +668,29 @@ struct proc_fdinfo { #define PROC_PIDFDATALKINFO 8 #define PROC_PIDFDATALKINFO_SIZE (sizeof(struct appletalk_fdinfo)) +/* Flavors for proc_pidfileportinfo */ + +#define PROC_PIDFILEPORTVNODEPATHINFO 2 /* out: vnode_fdinfowithpath */ +#define PROC_PIDFILEPORTVNODEPATHINFO_SIZE \ + PROC_PIDFDVNODEPATHINFO_SIZE + +#define PROC_PIDFILEPORTSOCKETINFO 3 /* out: socket_fdinfo */ +#define PROC_PIDFILEPORTSOCKETINFO_SIZE PROC_PIDFDSOCKETINFO_SIZE + +#define PROC_PIDFILEPORTPSHMINFO 5 /* out: pshm_fdinfo */ +#define PROC_PIDFILEPORTPSHMINFO_SIZE PROC_PIDFDPSHMINFO_SIZE + +#define PROC_PIDFILEPORTPIPEINFO 6 /* out: pipe_fdinfo */ +#define PROC_PIDFILEPORTPIPEINFO_SIZE PROC_PIDFDPIPEINFO_SIZE + /* used for proc_setcontrol */ #define PROC_SELFSET_PCONTROL 1 +#define PROC_SELFSET_THREADNAME 2 +#define PROC_SELFSET_THREADNAME_SIZE (MAXTHREADNAMESIZE 
-1) + +#define PROC_SELFSET_VMRSRCOWNER 3 + #ifdef XNU_KERNEL_PRIVATE #ifndef pshmnode struct pshmnode; diff --git a/bsd/sys/proc_internal.h b/bsd/sys/proc_internal.h index 52e4197dd..26b91b3cd 100644 --- a/bsd/sys/proc_internal.h +++ b/bsd/sys/proc_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -184,6 +184,14 @@ struct proc; #define PROC_NULL (struct proc *)0 +#define PROC_UPDATE_CREDS_ONPROC(p) { \ + p->p_uid = kauth_cred_getuid(p->p_ucred); \ + p->p_gid = kauth_cred_getgid(p->p_ucred); \ + p->p_ruid = kauth_cred_getruid(p->p_ucred); \ + p->p_rgid = kauth_cred_getrgid(p->p_ucred); \ + p->p_svuid = kauth_cred_getsvuid(p->p_ucred); \ + p->p_svgid = kauth_cred_getsvgid(p->p_ucred); \ + } /* * Description of a process. * @@ -203,6 +211,13 @@ struct proc { struct proc * p_pptr; /* Pointer to parent process.(LL) */ pid_t p_ppid; /* process's parent pid number */ pid_t p_pgrpid; /* process group id of the process (LL)*/ + uid_t p_uid; + gid_t p_gid; + uid_t p_ruid; + gid_t p_rgid; + uid_t p_svuid; + gid_t p_svgid; + uint64_t p_uniqueid; /* process unique ID */ lck_mtx_t p_mlock; /* mutex lock for proc */ @@ -281,6 +296,7 @@ struct proc { lck_mtx_t p_dtrace_sprlock; /* sun proc lock emulation */ int p_dtrace_probes; /* (PL) are there probes for this proc? */ u_int p_dtrace_count; /* (sprlock) number of DTrace tracepoints */ + uint8_t p_dtrace_stop; /* indicates a DTrace-desired stop */ struct dtrace_ptss_page* p_dtrace_ptss_pages; /* (sprlock) list of user ptss pages */ struct dtrace_ptss_page_entry* p_dtrace_ptss_free_list; /* (atomic) list of individual ptss entries */ struct dtrace_helpers* p_dtrace_helpers; /* (dtrace_lock) DTrace per-proc private */ @@ -314,7 +330,9 @@ struct proc { char p_name[(2*MAXCOMLEN)+1]; /* PL */ struct pgrp *p_pgrp; /* Pointer to process group. 
(LL) */ +#if CONFIG_EMBEDDED int p_iopol_disk; /* disk I/O policy (PL) */ +#endif /* CONFIG_EMBEDDED */ uint32_t p_csflags; /* flags for codesign (PL) */ uint32_t p_pcaction; /* action for process control on starvation */ uint8_t p_uuid[16]; /* from LC_UUID load command */ @@ -330,6 +348,7 @@ struct proc { struct klist p_klist; /* knote list (PL ?)*/ struct rusage *p_ru; /* Exit information. (PL) */ + int p_sigwaitcnt; thread_t p_signalholder; thread_t p_transholder; @@ -408,10 +427,13 @@ struct proc { #define P_LLIMWAIT 0x00040000 #define P_LWAITED 0x00080000 #define P_LINSIGNAL 0x00100000 -#define P_LSIGNALWAIT 0x00200000 +#define P_UNUSED 0x00200000 /* Unused */ #define P_LRAGE_VNODES 0x00400000 #define P_LREGISTER 0x00800000 /* thread start fns registered */ +#if CONFIG_EMBEDDED #define P_LBACKGROUND 0x01000000 +#endif /* CONFIG_EMBEDDED */ +#define P_LVMRSRCOWNER 0x02000000 /* can handle the resource ownership of */ /* Process control state for resource starvation */ #define P_PCTHROTTLE 1 @@ -426,7 +448,7 @@ struct proc { #define PROC_SETACTION_STATE(p) (p->p_pcaction = (PROC_CONTROL_STATE(p) | (PROC_CONTROL_STATE(p) << 16))) #define PROC_RESETACTION_STATE(p) (p->p_pcaction = PROC_CONTROL_STATE(p)) -/* advisory flags in the proc */ +/* additional process flags */ #define P_LADVLOCK 0x01 /* defns for proc_iterate */ @@ -580,10 +602,10 @@ extern lck_mtx_t * proc_list_mlock; extern lck_mtx_t * proc_klist_mlock; #define BSD_SIMUL_EXECS 33 /* 32 , allow for rounding */ -#define BSD_PAGABLE_MAP_SIZE (BSD_SIMUL_EXECS * (NCARGS + PAGE_SIZE)) -__private_extern__ int execargs_cache_size; -__private_extern__ int execargs_free_count; -__private_extern__ vm_offset_t * execargs_cache; +#define BSD_PAGEABLE_SIZE_PER_EXEC (NCARGS + PAGE_SIZE + PAGE_SIZE) /* page for apple vars, page for executable header */ +extern int execargs_cache_size; +extern int execargs_free_count; +extern vm_offset_t * execargs_cache; #define SESS_LEADER(p, sessp) ((sessp)->s_leader == (p)) @@ -611,9 
+633,11 @@ extern LIST_HEAD(sesshashhead, session) *sesshashtbl; extern u_long sesshash; extern lck_grp_t * proc_lck_grp; +#if CONFIG_FINE_LOCK_GROUPS extern lck_grp_t * proc_mlock_grp; extern lck_grp_t * proc_fdmlock_grp; extern lck_grp_t * proc_slock_grp; +#endif extern lck_grp_attr_t * proc_lck_grp_attr; extern lck_attr_t * proc_lck_attr; @@ -638,6 +662,9 @@ __private_extern__ int proc_core_name(const char *name, uid_t uid, pid_t pid, char *cr_name, size_t cr_name_len); extern int isinferior(struct proc *, struct proc *); __private_extern__ struct proc *pzfind(pid_t); /* Find zombie by id. */ +__private_extern__ struct proc *proc_find_zombref(pid_t); /* Find zombie by id. */ +__private_extern__ void proc_drop_zombref(struct proc * p); /* Find zombie by id. */ + extern struct lctx *lcfind(pid_t); /* Find a login context by id */ extern struct lctx *lccreate(void); /* Create a new login context */ @@ -699,6 +726,7 @@ void proc_transcommit(struct proc *, int locked); void proc_transend(struct proc *, int locked); int proc_transwait(struct proc *, int locked); void proc_rele_locked(struct proc * p); +struct proc *proc_ref_locked(struct proc * p); void proc_knote(struct proc * p, long hint); void proc_knote_drain(struct proc *p); void workqueue_init_lock(proc_t p); diff --git a/bsd/sys/process_policy.h b/bsd/sys/process_policy.h new file mode 100644 index 000000000..19f3c2617 --- /dev/null +++ b/bsd/sys/process_policy.h @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2010 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _SYS_PROCESS_POLICY_H +#define _SYS_PROCESS_POLICY_H + +#include +#include +#include +#include + +__BEGIN_DECLS + +/* defns of scope */ +#define PROC_POLICY_SCOPE_PROCESS 1 /* the policy setting is for process wide effect */ +#define PROC_POLICY_SCOPE_THREAD 2 /* the policy setting is for thread inside a proc */ + +/* defns of actions with no attributes */ +#define PROC_POLICY_ACTION_APPLY 1 /* enforce the set policy */ +#define PROC_POLICY_ACTION_RESTORE 2 /* revert the applied action back */ +#define PROC_POLICY_ACTION_DENYINHERIT 3 /* set for no inheritance of the specified policy */ +#define PROC_POLICY_ACTION_DENYSELFSET 4 /* set for the process to set its own policy */ +#define PROC_POLICY_ACTION_ENABLE 5 /* enable policy and its actions */ +#define PROC_POLICY_ACTION_DISABLE 6 /* disable policy and its actions, also clears any actions that have already happened */ +/* defns of actions with attributes */ +#define PROC_POLICY_ACTION_SET 10 /* set the policy attributes */ +#define
PROC_POLICY_ACTION_GET 11 /* get the policy attributes */ +#define PROC_POLICY_ACTION_ADD 12 /* add a policy attribute */ +#define PROC_POLICY_ACTION_REMOVE 13 /* remove a policy attribute */ + +/* policies */ +#define PROC_POLICY_NONE 0 /* none */ +#define PROC_POLICY_BACKGROUND 1 /* darwin background policy */ +#define PROC_POLICY_HARDWARE_ACCESS 2 /* access to various hardware */ +#define PROC_POLICY_RESOURCE_STARVATION 3 /* behavior on resource starvation */ +#define PROC_POLICY_RESOURCE_USAGE 4 /* behavior on resource consumption */ +#define PROC_POLICY_RESERVED 5 /* reserved */ +#define PROC_POLICY_APPTYPE 6 /* application type (see sub policies for PROC_POLICY_APPTYPE) */ + +/* sub policies for background policy */ +#define PROC_POLICY_BG_NONE 0 /* none */ +#define PROC_POLICY_BG_LOWCPUPRI 1 /* Low cpu priority */ +#define PROC_POLICY_BG_DISKTHROTTLE 2 /* disk accesses throttled */ +#define PROC_POLICY_BG_NETTHROTTLE 4 /* network accesses throttled */ +#define PROC_POLICY_BG_GPUDENY 8 /* no access to GPU */ +#if CONFIG_EMBEDDED +#define PROC_POLICY_BG_ALL 0x0F +#else /* CONFIG_EMBEDDED */ +#define PROC_POLICY_BG_ALL 0x07 +#endif /* CONFIG_EMBEDDED */ +#define PROC_POLICY_BG_DEFAULT PROC_POLICY_BG_ALL + +/* sub policies for hardware */ +#define PROC_POLICY_HWACCESS_NONE 0 +#define PROC_POLICY_HWACCESS_DISK 1 /* disk access */ +#define PROC_POLICY_HWACCESS_GPU 2 /* GPU access */ +#define PROC_POLICY_HWACCESS_NETWORK 3 /* network access */ +#define PROC_POLICY_HWACCESS_CPU 4 /* cpu access */ + +/* attribute values for disk hardware access, bit different as it should reflect IOPOL_XXX */ +#define PROC_POLICY_DISKACC_NONE 0 +#define PROC_POLICY_DISKACC_NORMAL 1 /* normal access to the disk */ +#define PROC_POLICY_DISKACC_PASSIVE 2 /* treat the I/Os as passive */ +#define PROC_POLICY_DISKACC_THROTTLE 3 /* throttle the disk IOs */ +#define PROC_POLICY_DISKACC_DEFAULT 0 + +/* attribute values for GPU hardware access */ +#define PROC_POLICY_GPUACC_NONE 0 +#define
PROC_POLICY_GPUACC_FULLACCESS 0 /* complete access to the GPU */ +#define PROC_POLICY_GPUACC_DENYACCESS 1 /* deny any access to the GPU */ +#define PROC_POLICY_GPUACC_DEFAULT 0 /* default is complete access */ + +/* attribute values for network hardware access */ +#define PROC_POLICY_NETACC_NONE 0 +#define PROC_POLICY_NETACC_NORMAL 0 /* complete access to the network */ +#define PROC_POLICY_NETACC_THROTTLE 1 /* throttle access to network */ +#define PROC_POLICY_NETACC_DEFAULT 0 /* default is complete access */ + +/* attribute values for cpu hardware access */ +#define PROC_POLICY_CPUACC_NONE 0 +#define PROC_POLICY_CPUACC_ALL 0 /* access to all available cpus */ +#define PROC_POLICY_CPUACC_ONE 1 /* access to only one available cpu */ +#define PROC_POLICY_CPUACC_LLCACHE 2 /* access to only one last level cache */ +#define PROC_POLICY_CPUACC_DEFAULT 0 /* default is access to all cpus */ + + +/* System Resource management (ie usage and starvation related) definitions */ + +/* sub policies for resource starvation */ +#define PROC_POLICY_RS_NONE 0 +#define PROC_POLICY_RS_VIRTUALMEM 1 /* virtual memory starvation */ + +/* sub policies for resource usage */ +#define PROC_POLICY_RUSAGE_NONE 0 +#define PROC_POLICY_RUSAGE_WIREDMEM 1 /* wired memory usages */ +#define PROC_POLICY_RUSAGE_VIRTMEM 2 /* virtual memory usage */ +#define PROC_POLICY_RUSAGE_CPU 3 /* amount of cpu usage */ +#define PROC_POLICY_RUSAGE_DISK 4 /* amount of disk usage */ +#define PROC_POLICY_RUSAGE_NETWORK 5 /* amount of network usage */ +#define PROC_POLICY_RUSAGE_POWER 6 /* amount of power/battery consumption */ + +/* attribute values for the resource usage and low resource */ +#define PROC_POLICY_RSRCACT_NONE 0 +#define PROC_POLICY_RSRCACT_THROTTLE 1 /* throttle on resource condition */ +#define PROC_POLICY_RSRCACT_SUSPEND 2 /* suspend on resource condition */ +#define PROC_POLICY_RSRCACT_TERMINATE 3 /* kill on resource condition */ +#define PROC_POLICY_RSRCACT_NOTIFY 4 /* send kqueue notification
 */ + + +/* type of resource for kqueue notification */ +#define PROC_POLICY_RSRTYPE_CPU 1 +#define PROC_POLICY_RSRTYPE_WIREDMEM 2 +#define PROC_POLICY_RSRTYPE_VIRTUALMEM 4 +#define PROC_POLICY_RSRTYPE_DISK 8 +#define PROC_POLICY_RSRTYPE_NETWORK 0x010 +#define PROC_POLICY_RSRTYPE_POWER 0x20 + + +typedef struct proc_policy_attribute { + uint32_t ppattr_attribute; /* the policy attribute to be modified or returned */ + uint32_t ppattr_resv; /* pad field */ + uint64_t ppattr_value1; /* 64bit policy specific attribute */ + uint64_t ppattr_value2; /* 64bit policy specific attribute */ + uint64_t ppattr_value3; /* 64bit policy specific attribute */ + uint64_t ppattr_resv1[4]; /* reserved for future use */ +} proc_policy_attribute_t; + + +typedef struct proc_policy_cpuusage_attr { + uint32_t ppattr_cpu_attr ; /* specified action as in PROC_POLICY_RSRCACT_xx */ + uint32_t ppattr_cpu_percentage; /* percentage of interval */ + uint64_t ppattr_cpu_attr_interval; /* 64bit interval in nsecs */ + uint64_t ppattr_cpu_attr_deadline; /* 64bit deadline in nsecs */ +} proc_policy_cpuusage_attr_t; + + +/* sub policies for PROC_POLICY_APPTYPE */ +#define PROC_POLICY_OSX_APPTYPE_NONE 0 +#define PROC_POLICY_OSX_APPTYPE_TAL 1 /* TAL based launched */ +#define PROC_POLICY_OSX_APPTYPE_WIDGET 2 /* for dashboard client */ +#define PROC_POLICY_OSX_APPTYPE_DASHCLIENT 2 /* rename to move away from widget */ +#define PROC_POLICY_IOS_APPTYPE 3 /* ios specific handling */ +#define PROC_POLICY_IOS_NONUITYPE 4 /* ios non graphics type */ + +#ifndef KERNEL +int process_policy(int scope, int action, int policy, int policy_subtype, proc_policy_attribute_t * attrp, pid_t target_pid, uint64_t target_threadid); +#endif /* KERNEL */ + + +__END_DECLS + +#endif /*_SYS_PROCESS_POLICY_H */ diff --git a/bsd/sys/protosw.h b/bsd/sys/protosw.h index bdf36a317..75b6d6b28 100644 --- a/bsd/sys/protosw.h +++ b/bsd/sys/protosw.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -70,7 +70,12 @@ #include #define PR_SLOWHZ 2 /* 2 slow timeouts per second */ +#ifndef __APPLE__ +/* + * See rdar://7617868: pr_fasttimo was removed use your own timer or pr_slowtimo instead + */ #define PR_FASTHZ 5 /* 5 fast timeouts per second */ +#endif #ifdef PRIVATE @@ -105,7 +110,8 @@ struct socket_filter; * The userreq routine interfaces protocols to the system and is * described below. */ - + +#include #include #include #ifdef KERNEL @@ -132,8 +138,12 @@ struct protosw { void *pr_ousrreq; /* utility hooks */ void (*pr_init)(void); /* initialization hook */ +#if __APPLE__ + void (*pr_unused)(void); /* placeholder - fasttimo is removed */ +#else void (*pr_fasttimo)(void); /* fast timeout (200ms) */ +#endif void (*pr_slowtimo)(void); /* slow timeout (500ms) */ void (*pr_drain)(void); @@ -408,6 +418,7 @@ char *prcorequests[] = { __BEGIN_DECLS void domaininit(void) __attribute__((section("__TEXT, initcode"))); +void domainfin(void) __attribute__((section("__TEXT, fincode"))); void pfctlinput(int, struct sockaddr *); void pfctlinput2(int, struct sockaddr *, void *); @@ -418,6 +429,7 @@ struct protosw *pffindtype(int family, int type); extern int net_add_proto(struct protosw *, struct domain *); extern int net_del_proto(int, int, struct domain *); +extern u_int64_t net_uptime(void); __END_DECLS /* Temp hack to link static domains together */ diff --git a/bsd/sys/pthread_internal.h b/bsd/sys/pthread_internal.h index 7d0cfae29..6cc80f5f3 100644 --- a/bsd/sys/pthread_internal.h +++ b/bsd/sys/pthread_internal.h @@ -29,12 +29,34 @@ #ifndef _SYS_PTHREAD_INTERNAL_H_ #define _SYS_PTHREAD_INTERNAL_H_ -#undef pthread_mutexattr_t; - +#include #include +struct ksyn_waitq_element { + TAILQ_ENTRY(ksyn_waitq_element) kwe_list; /* link to other list members */ + void * kwe_kwqqueue; /* queue blocked on */ + uint32_t kwe_flags; /* flags */ + uint32_t kwe_lockseq; /* the 
sequence of the entry */ + uint32_t kwe_count; /* upper bound on number of matches still pending */ + uint32_t kwe_psynchretval; /* thread retval */ + void *kwe_uth; /* uthread */ +}; +typedef struct ksyn_waitq_element * ksyn_waitq_element_t; + +/* kew_flags defns */ +#define KWE_THREAD_INWAIT 1 +#define KWE_THREAD_PREPOST 2 +#define KWE_THREAD_BROADCAST 4 + + #define WORKITEM_SIZE 64 -#define WORKQUEUE_NUMPRIOS 3 + +#define WORKQUEUE_HIGH_PRIOQUEUE 0 /* high priority queue */ +#define WORKQUEUE_DEFAULT_PRIOQUEUE 1 /* default priority queue */ +#define WORKQUEUE_LOW_PRIOQUEUE 2 /* low priority queue */ +#define WORKQUEUE_BG_PRIOQUEUE 3 /* background priority queue */ + +#define WORKQUEUE_NUMPRIOS 4 #define WORKQUEUE_OVERCOMMIT 0x10000 @@ -57,6 +79,8 @@ struct threadlist { #define TH_LIST_SUSPENDED 0x08 #define TH_LIST_BUSY 0x10 #define TH_LIST_NEED_WAKEUP 0x20 +#define TH_LIST_CONSTRAINED 0x40 + struct workitem { TAILQ_ENTRY(workitem) wi_entry; @@ -83,6 +107,7 @@ struct workqueue { uint32_t wq_timer_interval; uint32_t wq_affinity_max; uint32_t wq_threads_scheduled; + uint32_t wq_constrained_threads_scheduled; uint32_t wq_nthreads; uint32_t wq_thidlecount; uint32_t wq_reqconc[WORKQUEUE_NUMPRIOS]; /* requested concurrency for each priority level */ @@ -100,6 +125,8 @@ struct workqueue { #define WQL_ATIMER_BUSY 0x01 #define WQL_ATIMER_WAITING 0x02 +#define WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT 0x04 +#define WQL_EXCEEDED_TOTAL_THREAD_LIMIT 0x08 #define WQ_VECT_SET_BIT(vector, bit) \ @@ -121,7 +148,7 @@ struct workqueue { /* workq_kernreturn commands */ #define WQOPS_QUEUE_ADD 1 -#define WQOPS_QUEUE_REMOVE 2 +#define WQOPS_QUEUE_REMOVE_OBSOLETE 2 #define WQOPS_THREAD_RETURN 4 #define WQOPS_THREAD_SETCONC 8 @@ -129,12 +156,12 @@ struct workqueue { #define PTH_DEFAULT_GUARDSIZE 4*1024 #define MAX_PTHREAD_SIZE 64*1024 -void workqueue_exit(struct proc *); - -void pthread_init(void); extern lck_grp_attr_t *pthread_lck_grp_attr; extern lck_grp_t *pthread_lck_grp; extern 
lck_attr_t *pthread_lck_attr; +void workqueue_exit(struct proc *); +void pthread_init(void); +void psynch_zoneinit(void); #endif /* _SYS_PTHREAD_INTERNAL_H_ */ diff --git a/bsd/sys/queue.h b/bsd/sys/queue.h index a8e96cd4c..9ccb63e74 100644 --- a/bsd/sys/queue.h +++ b/bsd/sys/queue.h @@ -133,8 +133,11 @@ * _INSERT_AFTER + + + + + * _INSERT_TAIL - - + + + * _CONCAT - - + + - + * _REMOVE_AFTER + - + - - * _REMOVE_HEAD + - + - - + * _REMOVE_HEAD_UNTIL - - + - - * _REMOVE + + + + + + * _SWAP - + + + - * */ #ifdef QUEUE_MACRO_DEBUG @@ -232,12 +235,16 @@ struct { \ struct type *curelm = SLIST_FIRST((head)); \ while (SLIST_NEXT(curelm, field) != (elm)) \ curelm = SLIST_NEXT(curelm, field); \ - SLIST_NEXT(curelm, field) = \ - SLIST_NEXT(SLIST_NEXT(curelm, field), field); \ + SLIST_REMOVE_AFTER(curelm, field); \ } \ TRASHIT((elm)->field.sle_next); \ } while (0) +#define SLIST_REMOVE_AFTER(elm, field) do { \ + SLIST_NEXT(elm, field) = \ + SLIST_NEXT(SLIST_NEXT(elm, field), field); \ +} while (0) + #define SLIST_REMOVE_HEAD(head, field) do { \ SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field); \ } while (0) @@ -324,9 +331,7 @@ struct { \ struct type *curelm = STAILQ_FIRST((head)); \ while (STAILQ_NEXT(curelm, field) != (elm)) \ curelm = STAILQ_NEXT(curelm, field); \ - if ((STAILQ_NEXT(curelm, field) = \ - STAILQ_NEXT(STAILQ_NEXT(curelm, field), field)) == NULL)\ - (head)->stqh_last = &STAILQ_NEXT((curelm), field);\ + STAILQ_REMOVE_AFTER(head, curelm, field); \ } \ TRASHIT((elm)->field.stqe_next); \ } while (0) @@ -337,11 +342,31 @@ struct { \ (head)->stqh_last = &STAILQ_FIRST((head)); \ } while (0) -#define STAILQ_REMOVE_HEAD_UNTIL(head, elm, field) do { \ - if ((STAILQ_FIRST((head)) = STAILQ_NEXT((elm), field)) == NULL) \ - (head)->stqh_last = &STAILQ_FIRST((head)); \ +#define STAILQ_REMOVE_HEAD_UNTIL(head, elm, field) do { \ + if ((STAILQ_FIRST((head)) = STAILQ_NEXT((elm), field)) == NULL) \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ } while (0) +#define 
STAILQ_REMOVE_AFTER(head, elm, field) do { \ + if ((STAILQ_NEXT(elm, field) = \ + STAILQ_NEXT(STAILQ_NEXT(elm, field), field)) == NULL) \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ +} while (0) + +#define STAILQ_SWAP(head1, head2, type) do { \ + struct type *swap_first = STAILQ_FIRST(head1); \ + struct type **swap_last = (head1)->stqh_last; \ + STAILQ_FIRST(head1) = STAILQ_FIRST(head2); \ + (head1)->stqh_last = (head2)->stqh_last; \ + STAILQ_FIRST(head2) = swap_first; \ + (head2)->stqh_last = swap_last; \ + if (STAILQ_EMPTY(head1)) \ + (head1)->stqh_last = &STAILQ_FIRST(head1); \ + if (STAILQ_EMPTY(head2)) \ + (head2)->stqh_last = &STAILQ_FIRST(head2); \ +} while (0) + + /* * List declarations. */ @@ -444,6 +469,16 @@ struct { \ TRASHIT((elm)->field.le_prev); \ } while (0) +#define LIST_SWAP(head1, head2, type, field) do { \ + struct type *swap_tmp = LIST_FIRST((head1)); \ + LIST_FIRST((head1)) = LIST_FIRST((head2)); \ + LIST_FIRST((head2)) = swap_tmp; \ + if ((swap_tmp = LIST_FIRST((head1))) != NULL) \ + swap_tmp->field.le_prev = &LIST_FIRST((head1)); \ + if ((swap_tmp = LIST_FIRST((head2))) != NULL) \ + swap_tmp->field.le_prev = &LIST_FIRST((head2)); \ +} while (0) + /* * Tail queue declarations. */ @@ -574,6 +609,23 @@ struct { \ QMD_TRACE_ELEM(&(elm)->field); \ } while (0) +#define TAILQ_SWAP(head1, head2, type, field) do { \ + struct type *swap_first = (head1)->tqh_first; \ + struct type **swap_last = (head1)->tqh_last; \ + (head1)->tqh_first = (head2)->tqh_first; \ + (head1)->tqh_last = (head2)->tqh_last; \ + (head2)->tqh_first = swap_first; \ + (head2)->tqh_last = swap_last; \ + if ((swap_first = (head1)->tqh_first) != NULL) \ + swap_first->field.tqe_prev = &(head1)->tqh_first; \ + else \ + (head1)->tqh_last = &(head1)->tqh_first; \ + if ((swap_first = (head2)->tqh_first) != NULL) \ + swap_first->field.tqe_prev = &(head2)->tqh_first; \ + else \ + (head2)->tqh_last = &(head2)->tqh_first; \ +} while (0) + /* * Circular queue definitions. 
*/ diff --git a/bsd/sys/reboot.h b/bsd/sys/reboot.h index f79f2a9e2..6c64e53b8 100644 --- a/bsd/sys/reboot.h +++ b/bsd/sys/reboot.h @@ -135,7 +135,7 @@ #include __BEGIN_DECLS -void boot(int, int, char *); +int boot(int, int, char *); __END_DECLS #define PROC_SHUTDOWN_LOG "/var/log/kernel-shutdown.log" diff --git a/bsd/sys/resource.h b/bsd/sys/resource.h index 72c969c12..fbe8e6266 100644 --- a/bsd/sys/resource.h +++ b/bsd/sys/resource.h @@ -68,6 +68,9 @@ #include #include +#ifndef KERNEL +#include +#endif /* [XSI] The timeval structure shall be defined as described in * @@ -121,6 +124,12 @@ typedef __uint64_t rlim_t; */ #define PRIO_DARWIN_BG 0x1000 +/* + * use PRIO_DARWIN_NONUI to restrict a process's ability to make calls to + * the GPU. + */ +#define PRIO_DARWIN_NONUI 0x1001 + #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ @@ -305,13 +314,13 @@ struct _iopol_param_t { __BEGIN_DECLS int getpriority(int, id_t); #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -int getiopolicy_np(int, int); +int getiopolicy_np(int, int) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); #endif /* !_POSIX_C_SOURCE || _DARWIN_C_SOURCE */ int getrlimit(int, struct rlimit *) __DARWIN_ALIAS(getrlimit); int getrusage(int, struct rusage *); int setpriority(int, id_t, int); #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -int setiopolicy_np(int, int, int); +int setiopolicy_np(int, int, int) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); #endif /* !_POSIX_C_SOURCE || _DARWIN_C_SOURCE */ int setrlimit(int, const struct rlimit *) __DARWIN_ALIAS(setrlimit); __END_DECLS diff --git a/bsd/sys/sdt_impl.h b/bsd/sys/sdt_impl.h index cbd117b61..e9531067c 100644 --- a/bsd/sys/sdt_impl.h +++ b/bsd/sys/sdt_impl.h @@ -74,6 +74,9 @@ struct module { }; extern int sdt_invop(uintptr_t, uintptr_t *, uintptr_t); +#if defined (__APPLE__) +extern uint64_t sdt_getarg(void *, dtrace_id_t, void *, int, int); +#endif /* __APPLE__ */ void sdt_provide_module(void *, struct modctl *); 
void sdt_init(void); @@ -85,8 +88,6 @@ extern int sdt_probetab_mask; #if defined(__i386__) || defined(__x86_64__) typedef uint8_t sdt_instr_t; -#elif defined(__ppc__) || defined(__ppc64__) -typedef uint32_t sdt_instr_t; #else #error Unknown implementation #endif diff --git a/bsd/sys/signal.h b/bsd/sys/signal.h index faa6fcc1d..d2e40fd76 100644 --- a/bsd/sys/signal.h +++ b/bsd/sys/signal.h @@ -145,12 +145,6 @@ #define __need_mcontext_t #define __need_stack_t #define __need_ucontext_t -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -#if defined(__ppc__) || defined(__ppc64__) -#define __need_mcontext64_t -#define __need_ucontext64_t -#endif /* __ppc__ || __ppc64__ */ -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ #include #ifndef _PID_T diff --git a/bsd/sys/socket.h b/bsd/sys/socket.h index 6b37dfced..3fc35997c 100644 --- a/bsd/sys/socket.h +++ b/bsd/sys/socket.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -76,6 +76,14 @@ #include #include +#ifdef PRIVATE +#include +#endif /* PRIVATE */ + +#ifndef KERNEL +#include +#endif + /* * Definitions related to sockets: types, address families, options. 
*/ @@ -130,6 +138,23 @@ struct iovec { size_t iov_len; /* [XSI] Size of region iov_base points to */ }; #endif + +#ifdef PRIVATE +#define SO_TCDBG_PID 0x01 /* Set/get traffic class for PID */ +#define SO_TCDBG_PNAME 0x02 /* Set/get traffic class for processes of that name */ +#define SO_TCDBG_PURGE 0x04 /* Purge entries for unused PIDs */ +#define SO_TCDBG_FLUSH 0x08 /* Flush all entries */ +#define SO_TCDBG_COUNT 0x10 /* Get count of entries */ +#define SO_TCDBG_LIST 0x20 /* List entries */ + +struct so_tcdbg { + u_int32_t so_tcdbg_cmd; + int32_t so_tcdbg_tclass; + u_int32_t so_tcdbg_count; + pid_t so_tcdbg_pid; + char so_tcdbg_pname[MAXCOMLEN + 1]; +}; +#endif /* PRIVATE */ /* * Types @@ -161,6 +186,7 @@ struct iovec { #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) #define SO_REUSEPORT 0x0200 /* allow local address & port reuse */ #define SO_TIMESTAMP 0x0400 /* timestamp received dgram traffic */ +#define SO_TIMESTAMP_MONOTONIC 0x0800 /* Monotonically increasing timestamp on rcvd dgram */ #ifndef __APPLE__ #define SO_ACCEPTFILTER 0x1000 /* there is an accept filter */ #else @@ -184,6 +210,8 @@ struct iovec { #define SO_TYPE 0x1008 /* get socket type */ #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) /*efine SO_PRIVSTATE 0x1009 get/deny privileged state */ +#define SO_LABEL 0x1010 /* socket's MAC label */ +#define SO_PEERLABEL 0x1011 /* socket's peer MAC label */ #ifdef __APPLE__ #define SO_NREAD 0x1020 /* APPLE: get 1st-packet byte count */ #define SO_NKE 0x1021 /* APPLE: Install socket-level NKE */ @@ -203,16 +231,26 @@ struct iovec { #define SO_RANDOMPORT 0x1082 /* APPLE: request local port randomization */ #define SO_NP_EXTENSIONS 0x1083 /* To turn off some POSIX behavior */ #endif + #ifdef PRIVATE #define SO_EXECPATH 0x1085 /* Application Firewall Socket option */ -#define SO_TRAFFIC_CLASS 0x1086 /* Traffic class */ +#define SO_TRAFFIC_CLASS 0x1086 /* Traffic class (int)*/ #define SO_TC_BE 0 /* Best effort, normal */ #define SO_TC_BK 1 
/* Background, low priority or bulk traffic */ #define SO_TC_VI 2 /* Interactive video, constant bit rate, low latency */ #define SO_TC_VO 3 /* Interactive voice, constant bit rate, lowest latency */ -#endif -#define SO_LABEL 0x1010 /* socket's MAC label */ -#define SO_PEERLABEL 0x1011 /* socket's peer MAC label */ +#define SO_TC_MAX 4 /* Max traffic class value */ + +/* Background socket configuration flags */ +#define TRAFFIC_MGT_SO_BACKGROUND 0x0001 /* background socket */ +#define TRAFFIC_MGT_TCP_RECVBG 0x0002 /* Only TCP sockets, receiver throttling */ + +#define SO_RECV_TRAFFIC_CLASS 0x1087 /* Receive traffic class (bool)*/ +#define SO_TRAFFIC_CLASS_DBG 0x1088 /* Debug traffic class (struct so_tcdbg) */ +#define SO_TRAFFIC_CLASS_STATS 0x1089 /* Traffic class statistics */ +#define SO_DEFUNCTOK 0x1100 /* can be defunct'd */ +#define SO_ISDEFUNCT 0x1101 /* get defunct status */ +#endif /* PRIVATE */ #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ /* @@ -429,6 +467,9 @@ struct sockaddr_storage { */ #define PF_VLAN ((uint32_t)0x766c616e) /* 'vlan' */ #define PF_BOND ((uint32_t)0x626f6e64) /* 'bond' */ +#ifdef KERNEL_PRIVATE +#define PF_BRIDGE ((uint32_t)0x62726467) /* 'brdg' */ +#endif /* KERNEL_PRIVATE */ /* * Definitions for network related sysctl, CTL_NET. @@ -492,14 +533,18 @@ struct sockaddr_storage { * Fifth: type of info, defined below * Sixth: flag(s) to mask with for NET_RT_FLAGS */ -#define NET_RT_DUMP 1 /* dump; may limit to a.f. */ -#define NET_RT_FLAGS 2 /* by flags, e.g. RESOLVING */ -#define NET_RT_IFLIST 3 /* survey interface list */ -#define NET_RT_STAT 4 /* routing statistics */ -#define NET_RT_TRASH 5 /* routes not in table but not freed */ -#define NET_RT_IFLIST2 6 /* interface list with addresses */ -#define NET_RT_DUMP2 7 /* dump; may limit to a.f. */ -#define NET_RT_MAXID 8 +#define NET_RT_DUMP 1 /* dump; may limit to a.f. */ +#define NET_RT_FLAGS 2 /* by flags, e.g. 
RESOLVING */ +#define NET_RT_IFLIST 3 /* survey interface list */ +#define NET_RT_STAT 4 /* routing statistics */ +#define NET_RT_TRASH 5 /* routes not in table but not freed */ +#define NET_RT_IFLIST2 6 /* interface list with addresses */ +#define NET_RT_DUMP2 7 /* dump; may limit to a.f. */ +#ifdef PRIVATE +#define NET_RT_DUMPX 8 /* private */ +#define NET_RT_DUMPX_FLAGS 9 /* private */ +#endif /* PRIVATE */ +#define NET_RT_MAXID 10 #endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ #ifdef KERNEL_PRIVATE @@ -512,6 +557,8 @@ struct sockaddr_storage { { "trash", CTLTYPE_INT }, \ { "iflist2", CTLTYPE_STRUCT }, \ { "dump2", CTLTYPE_STRUCT }, \ + { "dumpx", CTLTYPE_STRUCT }, \ + { "dumpx_flags", CTLTYPE_STRUCT }, \ } #endif /* KERNEL_PRIVATE */ @@ -595,7 +642,13 @@ struct user32_msghdr { #define MSG_DONTWAIT 0x80 /* this message should be nonblocking */ #define MSG_EOF 0x100 /* data completes connection */ #ifdef __APPLE__ +#ifndef PRIVATE +#ifdef __APPLE_API_OBSOLETE +#define MSG_WAITSTREAM 0x200 /* wait up to full request.. may return partial */ +#endif +#else #define MSG_WAITSTREAM 0x200 /* wait up to full request.. may return partial */ +#endif #define MSG_FLUSH 0x400 /* Start of 'hold' seq; dump so_temp */ #define MSG_HOLD 0x800 /* Hold frag in so_temp */ #define MSG_SEND 0x1000 /* Send the packet in so_temp */ @@ -680,7 +733,7 @@ struct cmsgcred { ((unsigned char *)(mhdr)->msg_control + \ (mhdr)->msg_controllen)) ? 
\ (struct cmsghdr *)0L /* NULL */ : \ - (struct cmsghdr *)((unsigned char *)(cmsg) + \ + (struct cmsghdr *)(void *)((unsigned char *)(cmsg) + \ __DARWIN_ALIGN32((__uint32_t)(cmsg)->cmsg_len)))) #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) @@ -694,10 +747,11 @@ struct cmsgcred { #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ /* "Socket"-level control message types: */ -#define SCM_RIGHTS 0x01 /* access rights (array of int) */ +#define SCM_RIGHTS 0x01 /* access rights (array of int) */ #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -#define SCM_TIMESTAMP 0x02 /* timestamp (struct timeval) */ -#define SCM_CREDS 0x03 /* process creds (struct cmsgcred) */ +#define SCM_TIMESTAMP 0x02 /* timestamp (struct timeval) */ +#define SCM_CREDS 0x03 /* process creds (struct cmsgcred) */ +#define SCM_TIMESTAMP_MONOTONIC 0x04 /* timestamp (uint64_t) */ #ifdef KERNEL_PRIVATE /* @@ -792,7 +846,7 @@ ssize_t sendto(int, const void *, size_t, int, const struct sockaddr *, socklen_t) __DARWIN_ALIAS_C(sendto); int setsockopt(int, int, int, const void *, socklen_t); int shutdown(int, int); -int sockatmark(int); +int sockatmark(int) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); int socket(int, int, int); int socketpair(int, int, int, int *) __DARWIN_ALIAS(socketpair); @@ -804,7 +858,6 @@ int sendfile(int, int, off_t, off_t *, struct sf_hdtr *, int); void pfctlinput(int, struct sockaddr *); #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ __END_DECLS - #endif /* !KERNEL */ #ifdef KERNEL diff --git a/bsd/sys/socketvar.h b/bsd/sys/socketvar.h index 35560c65b..3c81716fe 100644 --- a/bsd/sys/socketvar.h +++ b/bsd/sys/socketvar.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -108,6 +108,17 @@ extern char netio[], netcon[], netcls[]; typedef u_quad_t so_gen_t; +#ifdef PRIVATE +#define SO_TC_STATS_MAX 4 + +struct data_stats { + u_int64_t rxpackets; + u_int64_t rxbytes; + u_int64_t txpackets; + u_int64_t txbytes; +}; +#endif /* PRIVATE */ + #ifdef KERNEL_PRIVATE #ifndef __APPLE__ /* We don't support BSD style socket filters */ @@ -196,6 +207,7 @@ struct socket { void (*so_upcall)(struct socket *so, caddr_t arg, int waitf); caddr_t so_upcallarg; /* Arg for above */ uid_t so_uid; /* who opened the socket */ + gid_t so_gid; /* gid of whoever opened the socket */ /* NB: generation count must not be first; easiest to make it last. */ so_gen_t so_gencnt; /* generation count */ #ifndef __APPLE__ @@ -220,7 +232,7 @@ struct socket { #define SOF_NOSIGPIPE 0x1 #define SOF_NOADDRAVAIL 0x2 /* EADDRNOTAVAIL if src addr is gone */ #define SOF_PCBCLEARING 0x4 /* pru_disconnect done; don't call pru_detach */ -#define SOF_DEFUNCT 0x8 /* accepted socket marked as inactive */ +#define SOF_DEFUNCT 0x8 /* socket marked as inactive */ #define SOF_CLOSEWAIT 0x10 /* blocked in close awaiting some events */ #define SOF_UPCALLINUSE 0x20 /* socket upcall is currently in progress */ #define SOF_REUSESHAREUID 0x40 /* Allows SO_REUSEADDR/SO_REUSEPORT for multiple so_uid */ @@ -233,6 +245,9 @@ struct socket { #define SOF_UPCALLCLOSEWAIT 0x800 /* block on close until an upcall returns */ #define SOF_BINDRANDOMPORT 0x1000 /* Request a randomized port number for the bind */ #define SOF_NPX_SETOPTSHUT 0x2000 /* Non POSIX extension to allow setsockopt(2) after shut down */ +#define SOF_RECV_TRAFFIC_CLASS 0x4000 /* Receive traffic class as ancillary data */ +#define SOF_NODEFUNCT 0x8000 /* socket cannot be defunct'd */ +#define SOF_INCOMP_INPROGRESS 0x10000 /* incomp socket still being processed */ int so_usecount; /* refcounting of socket use */; int so_retaincnt; u_int32_t so_filteruse; /* usecount for the socket filters */ 
@@ -252,10 +267,36 @@ struct socket { struct label *so_label; /* MAC label for socket */ struct label *so_peerlabel; /* cached MAC label for socket peer */ thread_t so_background_thread; /* thread that marked this socket background */ -#if PKT_PRIORITY int so_traffic_class; -#endif /* PKT_PRIORITY */ + + // last process to interact with this socket + u_int64_t last_upid; + pid_t last_pid; + + struct data_stats so_tc_stats[SO_TC_STATS_MAX]; }; + +/* Control message accessor in mbufs */ + +#define _MIN_NXT_CMSGHDR_PTR(cmsg) \ + ((char *)(cmsg) + \ + __DARWIN_ALIGN32((__uint32_t)(cmsg)->cmsg_len) + \ + __DARWIN_ALIGN32(sizeof(struct cmsghdr))) + +#define M_FIRST_CMSGHDR(m) \ + ((char *)(m) != (char *)0L && (size_t)(m)->m_len >= sizeof(struct cmsghdr) && \ + (socklen_t)(m)->m_len >= __DARWIN_ALIGN32(((struct cmsghdr *)(m)->m_data)->cmsg_len) ?\ + (struct cmsghdr *)(m)->m_data : \ + (struct cmsghdr *)0L) + +#define M_NXT_CMSGHDR(m, cmsg) \ + ((char *)(cmsg) == (char *)0L ? M_FIRST_CMSGHDR(m) : \ + _MIN_NXT_CMSGHDR_PTR(cmsg) > ((char *)(m)->m_data) + (m)->m_len || \ + _MIN_NXT_CMSGHDR_PTR(cmsg) < (char *)(m)->m_data ? 
\ + (struct cmsghdr *)0L /* NULL */ : \ + (struct cmsghdr *)((unsigned char *)(cmsg) + \ + __DARWIN_ALIGN32((__uint32_t)(cmsg)->cmsg_len))) + #endif /* KERNEL_PRIVATE */ /* @@ -278,6 +319,7 @@ struct socket { #define SS_ISDISCONNECTED 0x2000 /* socket disconnected from peer */ #define SS_DRAINING 0x4000 /* close waiting for blocked system calls to drain */ +#define SS_DEFUNCT 0x8000 /* has been fully defunct'd */ #if defined(__LP64__) #define _XSOCKET_PTR(x) u_int32_t @@ -288,13 +330,13 @@ struct socket { #pragma pack(4) struct xsockbuf { - u_int32_t sb_cc; - u_int32_t sb_hiwat; - u_int32_t sb_mbcnt; - u_int32_t sb_mbmax; - int32_t sb_lowat; - short sb_flags; - short sb_timeo; + u_int32_t sb_cc; + u_int32_t sb_hiwat; + u_int32_t sb_mbcnt; + u_int32_t sb_mbmax; + int32_t sb_lowat; + short sb_flags; + short sb_timeo; }; /* @@ -348,6 +390,56 @@ struct xsocket64 { #endif /* !CONFIG_EMBEDDED */ +#ifdef PRIVATE + +#define XSO_SOCKET 0x001 +#define XSO_RCVBUF 0x002 +#define XSO_SNDBUF 0x004 +#define XSO_STATS 0x008 +#define XSO_INPCB 0x010 +#define XSO_TCPCB 0x020 + +struct xsocket_n { + u_int32_t xso_len; /* length of this structure */ + u_int32_t xso_kind; /* XSO_SOCKET */ + u_int64_t xso_so; /* makes a convenient handle */ + short so_type; + short so_options; + short so_linger; + short so_state; + u_int64_t so_pcb; /* another convenient handle */ + int xso_protocol; + int xso_family; + short so_qlen; + short so_incqlen; + short so_qlimit; + short so_timeo; + u_short so_error; + pid_t so_pgid; + u_int32_t so_oobmark; + uid_t so_uid; /* XXX */ +}; + +struct xsockbuf_n { + u_int32_t xsb_len; /* length of this structure */ + u_int32_t xsb_kind; /* XSO_RCVBUF or XSO_SNDBUF */ + u_int32_t sb_cc; + u_int32_t sb_hiwat; + u_int32_t sb_mbcnt; + u_int32_t sb_mbmax; + int32_t sb_lowat; + short sb_flags; + short sb_timeo; +}; + +struct xsockstat_n { + u_int32_t xst_len; /* length of this structure */ + u_int32_t xst_kind; /* XSO_STATS */ + struct data_stats 
xst_tc_stats[SO_TC_STATS_MAX]; +}; + +#endif /* PRIVATE */ + #pragma pack() #ifdef KERNEL_PRIVATE @@ -434,6 +526,7 @@ extern so_gen_t so_gencnt; extern int socket_debug; extern int sosendjcl; extern int sosendjcl_ignore_capab; +extern int sodefunctlog; extern int somaxconn; struct file; @@ -444,6 +537,7 @@ struct stat; struct ucred; struct uio; struct knote; +struct so_tcdbg; #define SBLASTRECORDCHK(sb, s) \ if (socket_debug) sblastrecordchk(sb, s); @@ -458,6 +552,20 @@ struct knote; } \ } +#define SODEFUNCTLOG(x) do { if (sodefunctlog) printf x; } while (0) + +/* + * For debugging traffic class behaviors + */ +#define SOTCDB_NO_DSCP 0x01 /* Do not set DSCP code in IP header */ +#define SOTCDB_NO_MTC 0x02 /* Do not set the mbuf traffic class */ +#define SOTCDB_NO_SENDTCPBG 0x04 /* Do not use background TCP CC algorithm for sender */ +#define SOTCDB_NO_LCLTST 0x08 /* Do not test for local destination for setting DSCP */ +#define SOTCDB_NO_DSCPTST 0x10 /* Overwrite any existing DSCP code */ +#define SOTCDB_NO_RECVTCPBG 0x20 /* Do not use throttling on receiver-side of TCP */ + +extern u_int32_t sotcdb; + /* * From uipc_socket and friends */ @@ -481,6 +589,7 @@ extern void sbcheck(struct sockbuf *sb); extern void sblastmbufchk(struct sockbuf *, const char *); extern void sblastrecordchk(struct sockbuf *, const char *); extern struct mbuf *sbcreatecontrol(caddr_t p, int size, int type, int level); +extern struct mbuf **sbcreatecontrol_mbuf(caddr_t p, int size, int type, int level, struct mbuf** m); extern void sbdrop(struct sockbuf *sb, int len); extern void sbdroprecord(struct sockbuf *sb); extern void sbflush(struct sockbuf *sb); @@ -512,11 +621,14 @@ extern void sofree(struct socket *so); extern void soreference(struct socket *so); extern void sodereference(struct socket *so); extern void somultipages(struct socket *, boolean_t); +extern int sosetdefunct(struct proc *, struct socket *, int level, boolean_t); +extern int sodefunct(struct proc *, struct socket *, int 
level); extern int sogetopt(struct socket *so, struct sockopt *sopt); extern void sohasoutofband(struct socket *so); extern void soisconnected(struct socket *so); extern void soisconnecting(struct socket *so); extern void soisdisconnected(struct socket *so); +extern void sodisconnectwakeup(struct socket *so); extern void soisdisconnecting(struct socket *so); extern int soisbackground(struct socket *so); extern int solisten(struct socket *so, int backlog); @@ -531,8 +643,15 @@ extern int socket_unlock(struct socket *so, int refcount); extern void sofreelastref(struct socket *, int); extern int sogetaddr_locked(struct socket *, struct sockaddr **, int); extern const char *solockhistory_nr(struct socket *); -extern void set_traffic_class(struct mbuf *, struct socket *, int); +extern void set_packet_tclass(struct mbuf *, struct socket *, int, int); extern int mbuf_traffic_class_from_control(struct mbuf *); +extern void set_tcp_stream_priority(struct socket *so); +extern int so_set_traffic_class(struct socket *, int); +extern void so_set_default_traffic_class(struct socket *); +extern void socket_tclass_init(void); +extern int so_set_tcdbg(struct socket *, struct so_tcdbg *); +extern int sogetopt_tcdbg(struct socket *, struct sockopt *); +extern void so_recv_data_stat(struct socket *, struct mbuf *, size_t); /* * XXX; prepare mbuf for (__FreeBSD__ < 3) routines. 
@@ -557,6 +676,7 @@ extern void sotoxsocket(struct socket *so, struct xsocket *xso); #if !CONFIG_EMBEDDED extern void sotoxsocket64(struct socket *so, struct xsocket64 *xso); #endif +extern void sbwakeup(struct sockbuf *sb); extern void sowakeup(struct socket *so, struct sockbuf *sb); extern int soioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p); diff --git a/bsd/sys/sockio.h b/bsd/sys/sockio.h index 8d415e6e4..3a6b1371b 100644 --- a/bsd/sys/sockio.h +++ b/bsd/sys/sockio.h @@ -158,6 +158,18 @@ #define SIOCGIFALTMTU _IOWR('i', 72, struct ifreq) /* get if alternate mtu */ #define SIOCSIFBOND _IOW('i', 70, struct ifreq) /* set bond if config */ #define SIOCGIFBOND _IOWR('i', 71, struct ifreq) /* get bond if config */ + +#ifdef PRIVATE +/* + * temporary control calls to attach/detach IP to/from an ethernet interface + */ +#define SIOCPROTOATTACH _IOWR('i', 80, struct ifreq) /* attach proto to interface */ +#define SIOCPROTODETACH _IOWR('i', 81, struct ifreq) /* detach proto from interface */ +#endif /* PRIVATE */ + +#define SIOCSIFCAP _IOW('i', 90, struct ifreq) /* set IF features */ +#define SIOCGIFCAP _IOWR('i', 91, struct ifreq) /* get IF features */ + #define SIOCIFCREATE _IOWR('i', 120, struct ifreq) /* create clone if */ #define SIOCIFDESTROY _IOW('i', 121, struct ifreq) /* destroy clone if */ #define SIOCIFCREATE2 _IOWR('i', 122, struct ifreq) /* create clone if with data */ @@ -192,11 +204,6 @@ #define SIOCIFGCLONERS64 _IOWR('i', 129, struct if_clonereq64) /* get cloners */ #endif /* KERNEL */ -/* - * temporary control calls to attach/detach IP to/from an ethernet interface - */ -#define SIOCPROTOATTACH _IOWR('i', 80, struct ifreq) /* attach proto to interface */ -#define SIOCPROTODETACH _IOWR('i', 81, struct ifreq) /* detach proto from interface */ #endif /* PRIVATE */ #define SIOCGIFASYNCMAP _IOWR('i', 124, struct ifreq) /* get ppp asyncmap */ diff --git a/bsd/sys/spawn.h b/bsd/sys/spawn.h index f54fcc396..4947902dd 100644 --- 
a/bsd/sys/spawn.h +++ b/bsd/sys/spawn.h @@ -58,6 +58,15 @@ */ #define POSIX_SPAWN_SETEXEC 0x0040 #define POSIX_SPAWN_START_SUSPENDED 0x0080 +#ifdef PRIVATE +#define _POSIX_SPAWN_DISABLE_ASLR 0x0100 +#define _POSIX_SPAWN_ALLOW_DATA_EXEC 0x2000 +#define POSIX_SPAWN_OSX_TALAPP_START 0x0400 +#define POSIX_SPAWN_OSX_WIDGET_START 0x0800 +#define POSIX_SPAWN_OSX_DBCLIENT_START 0x0800 /* not a bug, same as widget just rename */ +#define POSIX_SPAWN_IOS_APP_START 0x1000 +#endif /* PRIVATE */ +#define POSIX_SPAWN_CLOEXEC_DEFAULT 0x4000 /* * Possible values to be set for the process control actions on resource starvation. diff --git a/bsd/sys/spawn_internal.h b/bsd/sys/spawn_internal.h index 0e8943947..d29526095 100644 --- a/bsd/sys/spawn_internal.h +++ b/bsd/sys/spawn_internal.h @@ -30,7 +30,7 @@ /* * [SPN] Support for _POSIX_SPAWN * - * This file contains intern datastructures which are externally represented + * This file contains internal data structures which are externally represented * as opaque void pointers to prevent introspection. 
This permits us to * change the underlying implementation of the code to maintain it or to * support new features, as needed, without the consumer needing to recompile @@ -110,7 +110,8 @@ typedef struct _posix_spawnattr { typedef enum { PSFA_OPEN = 0, PSFA_CLOSE = 1, - PSFA_DUP2 = 2 + PSFA_DUP2 = 2, + PSFA_INHERIT = 3 } psfa_t; diff --git a/bsd/sys/stat.h b/bsd/sys/stat.h index bcc8b79b4..d5daf6120 100644 --- a/bsd/sys/stat.h +++ b/bsd/sys/stat.h @@ -443,7 +443,7 @@ extern void munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp); #define S_IFLNK 0120000 /* [XSI] symbolic link */ #define S_IFSOCK 0140000 /* [XSI] socket */ #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -#define S_IFWHT 0160000 /* whiteout */ +#define S_IFWHT 0160000 /* OBSOLETE: whiteout */ #endif /* File mode */ @@ -489,7 +489,7 @@ extern void munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp); #define S_ISLNK(m) (((m) & S_IFMT) == S_IFLNK) /* symbolic link */ #define S_ISSOCK(m) (((m) & S_IFMT) == S_IFSOCK) /* socket */ #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -#define S_ISWHT(m) (((m) & S_IFMT) == S_IFWHT) /* whiteout */ +#define S_ISWHT(m) (((m) & S_IFMT) == S_IFWHT) /* OBSOLETE: whiteout */ #endif /* @@ -553,7 +553,8 @@ extern void munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp); */ /* #define UF_NOUNLINK 0x00000010 */ /* file may not be removed or renamed */ #define UF_COMPRESSED 0x00000020 /* file is hfs-compressed */ -/* Bits 0x0040 through 0x4000 are currently undefined. */ +#define UF_TRACKED 0x00000040 /* file renames and deletes are tracked */ +/* Bits 0x0080 through 0x4000 are currently undefined. 
*/ #define UF_HIDDEN 0x00008000 /* hint that this item should not be */ /* displayed in a GUI */ /* @@ -607,13 +608,13 @@ int chmodx_np(const char *, filesec_t); int fchflags(int, __uint32_t); int fchmodx_np(int, filesec_t); int fstatx_np(int, struct stat *, filesec_t) __DARWIN_INODE64(fstatx_np); -int lchflags(const char *, __uint32_t); -int lchmod(const char *, mode_t); +int lchflags(const char *, __uint32_t) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); +int lchmod(const char *, mode_t) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); int lstatx_np(const char *, struct stat *, filesec_t) __DARWIN_INODE64(lstatx_np); int mkdirx_np(const char *, filesec_t); int mkfifox_np(const char *, filesec_t); int statx_np(const char *, struct stat *, filesec_t) __DARWIN_INODE64(statx_np); -int umaskx_np(filesec_t); +int umaskx_np(filesec_t) __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_4,__MAC_10_6,__IPHONE_NA,__IPHONE_NA); #if !__DARWIN_ONLY_64_BIT_INO_T /* The following deprecated routines are simillar to stat and friends except provide struct stat64 instead of struct stat */ diff --git a/bsd/sys/sys_domain.h b/bsd/sys/sys_domain.h index 013959c8a..981d9f107 100644 --- a/bsd/sys/sys_domain.h +++ b/bsd/sys/sys_domain.h @@ -96,8 +96,9 @@ struct ctl_cb { lck_mtx_t *mtx; struct socket *so; /* controlling socket */ struct kctl *kctl; /* back pointer to controller */ - u_int32_t unit; void *userdata; + u_int32_t unit; + u_int32_t usecount; }; diff --git a/bsd/sys/sysctl.h b/bsd/sys/sysctl.h index 083432071..1da032f48 100644 --- a/bsd/sys/sysctl.h +++ b/bsd/sys/sysctl.h @@ -110,12 +110,34 @@ * type given below. Each sysctl level defines a set of name/type * pairs to be used by sysctl(1) in manipulating the subsystem. * - * When declaring new sysctl names, please use the CTLFLAG_LOCKED - * flag in the type to indicate that all necessary locking will - * be handled within the sysctl. 
Any sysctl defined without - * CTLFLAG_LOCKED is considered legacy and will be protected by - * both the kernel funnel and the sysctl memlock. This is not - * optimal, so it is best to handle locking yourself. + * When declaring new sysctl names, unless your sysctl is callable + * from the paging path, please use the CTLFLAG_LOCKED flag in the + * type to indicate that all necessary locking will be handled + * within the sysctl. + * + * Any sysctl defined without CTLFLAG_LOCKED is considered legacy + * and will be protected by both wiring the user process pages and, + * if it is a 32 bit legacy KEXT, by the obsolete kernel funnel. + * + * Note: This is not optimal, so it is best to handle locking + * yourself, if you are able to do so. A simple design + * pattern for use to avoid in a single function known + * to potentially be in the paging path or doing a DMA + * to physical memory in a user space process is: + * + * lock + * perform operation vs. local buffer + * unlock + * SYSCTL_OUT(req, local buffer, length) + * + * ...this assumes you are not using a deep call graph + * or are unable to pass a local buffer address as a + * parameter into your deep call graph. + * + * Note that very large user buffers can fail the wire + * if to do so would require more physical pages than + * are available (the caller will get an ENOMEM error, + * see sysctl_mem_hold() for details). 
*/ struct ctlname { char *ctl_name; /* subsystem name */ @@ -139,7 +161,8 @@ struct ctlname { #define CTLFLAG_MASKED 0x04000000 /* deprecated variable, do not display */ #define CTLFLAG_NOAUTO 0x02000000 /* do not auto-register */ #define CTLFLAG_KERN 0x01000000 /* valid inside the kernel */ -#define CTLFLAG_LOCKED 0x00800000 /* node will handle locking itself (highly encouraged) */ +#define CTLFLAG_LOCKED 0x00800000 /* node will handle locking itself */ +#define CTLFLAG_OID2 0x00400000 /* struct sysctl_oid has version info */ /* * USE THIS instead of a hardwired number from the categories below @@ -161,33 +184,6 @@ struct ctlname { #define SYSCTL_HANDLER_ARGS (struct sysctl_oid *oidp, void *arg1, int arg2, \ struct sysctl_req *req) -/* - * Locking and stats - */ -struct sysctl_lock { - int sl_lock; - int sl_want; - int sl_locked; -}; - -#define MEMLOCK_LOCK() \ - do { \ - while (memlock.sl_lock) { \ - memlock.sl_want = 1; \ - (void) tsleep((caddr_t)&memlock, PRIBIO+1, "sysctl", 0); \ - memlock.sl_locked++; \ - } \ - memlock.sl_lock = 1; \ - } while(0) - -#define MEMLOCK_UNLOCK() \ - do { \ - memlock.sl_lock = 0; \ - if (memlock.sl_want) { \ - memlock.sl_want = 0; \ - wakeup((caddr_t)&memlock); \ - } \ - }while(0) /* * This describes the access space for a sysctl request. 
This is needed @@ -195,22 +191,55 @@ struct sysctl_lock { */ struct sysctl_req { struct proc *p; - int lock; - user_addr_t oldptr; - size_t oldlen; - size_t oldidx; - int (*oldfunc)(struct sysctl_req *, const void *, size_t); - user_addr_t newptr; - size_t newlen; - size_t newidx; - int (*newfunc)(struct sysctl_req *, void *, size_t); + int lock; + user_addr_t oldptr; /* pointer to user supplied buffer */ + size_t oldlen; /* user buffer length (also returned) */ + size_t oldidx; /* total data iteratively copied out */ + int (*oldfunc)(struct sysctl_req *, const void *, size_t); + user_addr_t newptr; /* buffer containing new value */ + size_t newlen; /* length of new value */ + size_t newidx; /* total data iteratively copied in */ + int (*newfunc)(struct sysctl_req *, void *, size_t); }; SLIST_HEAD(sysctl_oid_list, sysctl_oid); +#define SYSCTL_OID_VERSION 1 /* current OID structure version */ + /* * This describes one "oid" in the MIB tree. Potentially more nodes can * be hidden behind it, expanded by the handler. + * + * NOTES: We implement binary comparibility between CTLFLAG_OID2 and + * pre-CTLFLAG_OID2 structure in sysctl_register_oid() and in + * sysctl_unregister_oid() using the fact that the fields up + * to oid_fmt are unchanged, and that the field immediately + * following is on an alignment boundary following a pointer + * type and is also a pointer. This lets us get the previous + * size of the structure, and the copy-cut-off point, using + * the offsetof() language primitive, and these values are + * used in conjunction with the fact that earlier and future + * statically compiled sysctl_oid structures are declared via + * macros. This lets us overload the macros so that the addition + * of the CTLFLAG_OID2 in newly compiled code containing sysctl + * node declarations, subsequently allowing us to to avoid + * changing the KPI used for non-static (un)registration in + * KEXTs. 
+ * + * This depends on the fact that people declare SYSCTLs, + * rather than declaring sysctl_oid structures. All new code + * should avoid declaring struct sysctl_oid's directly without + * the macros; the current risk for this is limited to losing + * your description field and ending up with a malloc'ed copy, + * as if it were a legacy binary static declaration via SYSCTL; + * in the future, we may deprecate access to a named structure + * type in third party code. Use the macros, or our code will + * end up with compile errors when that happens. + * + * Please try to include a long description of the field in any + * new sysctl declarations (all the macros support this). This + * field may be the only human readable documentation your users + * get for your sysctl. */ struct sysctl_oid { struct sysctl_oid_list *oid_parent; @@ -222,6 +251,9 @@ struct sysctl_oid { const char *oid_name; int (*oid_handler) SYSCTL_HANDLER_ARGS; const char *oid_fmt; + const char *oid_descr; /* offsetof() field / long description */ + int oid_version; + int oid_refcnt; }; #define SYSCTL_IN(r, p, l) (r->newfunc)(r, p, l) @@ -267,7 +299,7 @@ __END_DECLS #define SYSCTL_OID(parent, nbr, name, kind, a1, a2, handler, fmt, descr) \ struct sysctl_oid sysctl_##parent##_##name = { \ &sysctl_##parent##_children, { 0 }, \ - nbr, kind, a1, a2, #name, handler, fmt }; \ + nbr, kind|CTLFLAG_OID2, a1, a2, #name, handler, fmt, descr, SYSCTL_OID_VERSION, 0 }; \ SYSCTL_LINKER_SET_ENTRY(__sysctl_set, sysctl_##parent##_##name) /* This constructs a node from which other oids can hang. 
*/ @@ -510,6 +542,9 @@ SYSCTL_DECL(_user); #define KERN_KDPIDEX 14 #define KERN_KDSETRTCDEC 15 #define KERN_KDGETENTROPY 16 +#define KERN_KDWRITETR 17 +#define KERN_KDWRITEMAP 18 + /* KERN_PANICINFO types */ #define KERN_PANICINFO_MAXSIZE 1 /* quad: panic UI image size limit */ diff --git a/bsd/sys/sysent.h b/bsd/sys/sysent.h index b83a67c44..df71d010f 100644 --- a/bsd/sys/sysent.h +++ b/bsd/sys/sysent.h @@ -31,9 +31,6 @@ #include #include -#ifdef __ppc__ -#include -#endif #ifdef KERNEL_PRIVATE #ifdef __APPLE_API_PRIVATE @@ -59,7 +56,7 @@ extern struct sysent sysent[]; #endif /* __INIT_SYSENT_C__ */ extern int nsysent; -#define NUM_SYSENT 434 /* Current number of defined syscalls */ +#define NUM_SYSENT 439 /* Current number of defined syscalls */ /* sy_funnel flags bits */ #define FUNNEL_MASK 0x07f diff --git a/bsd/sys/syslog.h b/bsd/sys/syslog.h index e85a4a817..71004cf2a 100644 --- a/bsd/sys/syslog.h +++ b/bsd/sys/syslog.h @@ -26,7 +26,7 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ -/* +/*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * @@ -38,10 +38,6 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. @@ -59,9 +55,10 @@ * SUCH DAMAGE. 
* * @(#)syslog.h 8.1 (Berkeley) 6/2/93 + * $FreeBSD: src/sys/sys/syslog.h,v 1.27.2.1.4.1 2010/06/14 02:09:06 kensmith Exp $ */ -#ifndef _SYS_SYSLOG_H_ +#ifndef _SYS_SYSLOG_H_ #define _SYS_SYSLOG_H_ #include @@ -88,54 +85,61 @@ #define LOG_DEBUG 7 /* debug-level messages */ #define LOG_PRIMASK 0x07 /* mask to extract priority part (internal) */ - /* extract priority */ +/* extract priority */ #define LOG_PRI(p) ((p) & LOG_PRIMASK) -#define LOG_MAKEPRI(fac, pri) (((fac) << 3) | (pri)) +#define LOG_MAKEPRI(fac, pri) ((fac) | (pri)) #ifdef SYSLOG_NAMES #define INTERNAL_NOPRI 0x10 /* the "no priority" priority */ - /* mark "facility" */ -#define INTERNAL_MARK LOG_MAKEPRI(LOG_NFACILITIES, 0) +/* mark "facility" */ +#define INTERNAL_MARK LOG_MAKEPRI((LOG_NFACILITIES<<3), 0) typedef struct _code { - char *c_name; - int c_val; + const char *c_name; + int c_val; } CODE; CODE prioritynames[] = { - { "alert", LOG_ALERT }, - { "crit", LOG_CRIT }, - { "debug", LOG_DEBUG }, - { "emerg", LOG_EMERG }, - { "err", LOG_ERR }, - { "error", LOG_ERR }, /* DEPRECATED */ - { "info", LOG_INFO }, - { "none", INTERNAL_NOPRI }, /* INTERNAL */ - { "notice", LOG_NOTICE }, - { "panic", LOG_EMERG }, /* DEPRECATED */ - { "warn", LOG_WARNING }, /* DEPRECATED */ - { "warning", LOG_WARNING }, - { 0, -1 } + { "alert", LOG_ALERT, }, + { "crit", LOG_CRIT, }, + { "debug", LOG_DEBUG, }, + { "emerg", LOG_EMERG, }, + { "err", LOG_ERR, }, + { "error", LOG_ERR, }, /* DEPRECATED */ + { "info", LOG_INFO, }, + { "none", INTERNAL_NOPRI, }, /* INTERNAL */ + { "notice", LOG_NOTICE, }, + { "panic", LOG_EMERG, }, /* DEPRECATED */ + { "warn", LOG_WARNING, }, /* DEPRECATED */ + { "warning", LOG_WARNING, }, + { NULL, -1, } }; #endif /* facility codes */ -#define LOG_KERN (0<<3) /* kernel messages */ -#define LOG_USER (1<<3) /* random user-level messages */ -#define LOG_MAIL (2<<3) /* mail system */ -#define LOG_DAEMON (3<<3) /* system daemons */ -#define LOG_AUTH (4<<3) /* security/authorization messages */ -#define 
LOG_SYSLOG (5<<3) /* messages generated internally by syslogd */ -#define LOG_LPR (6<<3) /* line printer subsystem */ -#define LOG_NEWS (7<<3) /* network news subsystem */ -#define LOG_UUCP (8<<3) /* UUCP subsystem */ -#define LOG_CRON (9<<3) /* clock daemon */ -#define LOG_AUTHPRIV (10<<3) /* security/authorization messages (private) */ -#define LOG_FTP (11<<3) /* ftp daemon */ -#define LOG_NETINFO (12<<3) /* NetInfo */ +#define LOG_KERN (0<<3) /* kernel messages */ +#define LOG_USER (1<<3) /* random user-level messages */ +#define LOG_MAIL (2<<3) /* mail system */ +#define LOG_DAEMON (3<<3) /* system daemons */ +#define LOG_AUTH (4<<3) /* authorization messages */ +#define LOG_SYSLOG (5<<3) /* messages generated internally by syslogd */ +#define LOG_LPR (6<<3) /* line printer subsystem */ +#define LOG_NEWS (7<<3) /* network news subsystem */ +#define LOG_UUCP (8<<3) /* UUCP subsystem */ +#define LOG_CRON (9<<3) /* clock daemon */ +#define LOG_AUTHPRIV (10<<3) /* authorization messages (private) */ +/* Facility #10 clashes in DEC UNIX, where */ +/* it's defined as LOG_MEGASAFE for AdvFS */ +/* event logging. */ +#define LOG_FTP (11<<3) /* ftp daemon */ +//#define LOG_NTP (12<<3) /* NTP subsystem */ +//#define LOG_SECURITY (13<<3) /* security subsystems (firewalling, etc.) 
*/ +//#define LOG_CONSOLE (14<<3) /* /dev/console output */ +#define LOG_NETINFO (12<<3) /* NetInfo */ #define LOG_REMOTEAUTH (13<<3) /* remote authentication/authorization */ -#define LOG_INSTALL (14<<3) /* installer subsystem */ -#define LOG_RAS (15<<3) /* Remote Access Service (VPN / PPP) */ +#define LOG_INSTALL (14<<3) /* installer subsystem */ +#define LOG_RAS (15<<3) /* Remote Access Service (VPN / PPP) */ +/* other codes through 15 reserved for system use */ #define LOG_LOCAL0 (16<<3) /* reserved for local use */ #define LOG_LOCAL1 (17<<3) /* reserved for local use */ #define LOG_LOCAL2 (18<<3) /* reserved for local use */ @@ -145,43 +149,43 @@ CODE prioritynames[] = { #define LOG_LOCAL6 (22<<3) /* reserved for local use */ #define LOG_LOCAL7 (23<<3) /* reserved for local use */ -#define LOG_LAUNCHD (24<<3) /* launchd - general bootstrap daemon */ +#define LOG_LAUNCHD (24<<3) /* launchd - general bootstrap daemon */ #define LOG_NFACILITIES 25 /* current number of facilities */ #define LOG_FACMASK 0x03f8 /* mask to extract facility part */ - /* facility of pri */ +/* facility of pri */ #define LOG_FAC(p) (((p) & LOG_FACMASK) >> 3) #ifdef SYSLOG_NAMES CODE facilitynames[] = { - { "auth", LOG_AUTH }, - { "authpriv", LOG_AUTHPRIV }, - { "cron", LOG_CRON }, - { "daemon", LOG_DAEMON }, - { "ftp", LOG_FTP }, - { "install", LOG_INSTALL }, - { "kern", LOG_KERN }, - { "lpr", LOG_LPR }, - { "mail", LOG_MAIL }, - { "mark", INTERNAL_MARK }, /* INTERNAL */ - { "netinfo", LOG_NETINFO }, - { "ras", LOG_RAS }, - { "remoteauth", LOG_REMOTEAUTH }, - { "news", LOG_NEWS }, - { "security", LOG_AUTH }, /* DEPRECATED */ - { "syslog", LOG_SYSLOG }, - { "user", LOG_USER }, - { "uucp", LOG_UUCP }, - { "local0", LOG_LOCAL0 }, - { "local1", LOG_LOCAL1 }, - { "local2", LOG_LOCAL2 }, - { "local3", LOG_LOCAL3 }, - { "local4", LOG_LOCAL4 }, - { "local5", LOG_LOCAL5 }, - { "local6", LOG_LOCAL6 }, - { "local7", LOG_LOCAL7 }, - { "launchd", LOG_LAUNCHD }, - { 0, -1 } + { "auth", LOG_AUTH, }, + 
{ "authpriv", LOG_AUTHPRIV, }, + { "cron", LOG_CRON, }, + { "daemon", LOG_DAEMON, }, + { "ftp", LOG_FTP, }, + { "install", LOG_INSTALL }, + { "kern", LOG_KERN, }, + { "lpr", LOG_LPR, }, + { "mail", LOG_MAIL, }, + { "mark", INTERNAL_MARK, }, /* INTERNAL */ + { "netinfo", LOG_NETINFO, }, + { "ras", LOG_RAS }, + { "remoteauth", LOG_REMOTEAUTH }, + { "news", LOG_NEWS, }, + { "security", LOG_AUTH }, /* DEPRECATED */ + { "syslog", LOG_SYSLOG, }, + { "user", LOG_USER, }, + { "uucp", LOG_UUCP, }, + { "local0", LOG_LOCAL0, }, + { "local1", LOG_LOCAL1, }, + { "local2", LOG_LOCAL2, }, + { "local3", LOG_LOCAL3, }, + { "local4", LOG_LOCAL4, }, + { "local5", LOG_LOCAL5, }, + { "local6", LOG_LOCAL6, }, + { "local7", LOG_LOCAL7, }, + { "launchd", LOG_LAUNCHD }, + { NULL, -1, } }; #endif @@ -211,18 +215,24 @@ CODE facilitynames[] = { #define LOG_PERROR 0x20 /* log to stderr as well */ #ifndef KERNEL -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -#include /* for __darwin_va_list */ -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ + +/* + * Don't use va_list in the vsyslog() prototype. Va_list is typedef'd in two + * places ( and ), so if we include one + * of them here we may collide with the utility's includes. It's unreasonable + * for utilities to have to include one of them to include syslog.h, so we get + * __va_list from and use it. + */ +#include __BEGIN_DECLS void closelog(void); void openlog(const char *, int, int); int setlogmask(int); -void syslog(int, const char *, ...) __DARWIN_LDBL_COMPAT(syslog); -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -void vsyslog(int, const char *, __darwin_va_list) __DARWIN_LDBL_COMPAT(vsyslog); -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ +void syslog(int, const char *, ...) 
__printflike(2, 3) __DARWIN_LDBL_COMPAT(syslog); +#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL +void vsyslog(int, const char *, __darwin_va_list) __printflike(2, 0) __DARWIN_LDBL_COMPAT(vsyslog); +#endif __END_DECLS #else /* !KERNEL */ diff --git a/bsd/sys/systm.h b/bsd/sys/systm.h index d5fdbe392..f08bc477c 100644 --- a/bsd/sys/systm.h +++ b/bsd/sys/systm.h @@ -223,10 +223,17 @@ void bsd_untimeout(void (*)(void *), void *arg); void set_fsblocksize(struct vnode *); uint64_t tvtoabstime(struct timeval *); void *throttle_info_create(void); -void throttle_info_mount_ref(mount_t mp, void * throttle_info); -void throttle_info_mount_rel(mount_t mp); +void throttle_info_mount_ref(mount_t mp, void * throttle_info); +void throttle_info_mount_rel(mount_t mp); void throttle_info_release(void *throttle_info); void throttle_info_update(void *throttle_info, int flags); +uint32_t throttle_lowpri_io(int sleep_amount); +void throttle_set_thread_io_policy(int policy); +typedef struct __throttle_info_handle *throttle_info_handle_t; +int throttle_info_ref_by_mask( + uint64_t throttle_mask, throttle_info_handle_t *throttle_info_handle); +void throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle); +void throttle_info_update_by_mask(void *throttle_info_handle, int flags); __END_DECLS #endif /* !_SYS_SYSTM_H_ */ diff --git a/bsd/sys/time.h b/bsd/sys/time.h index 732d1ae76..a32ed62d6 100644 --- a/bsd/sys/time.h +++ b/bsd/sys/time.h @@ -68,6 +68,8 @@ #include #ifdef KERNEL #include /* user_time_t */ +#else /* !KERNEL */ +#include #endif /* KERNEL */ /* @@ -240,7 +242,7 @@ __BEGIN_DECLS #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) int adjtime(const struct timeval *, struct timeval *); int futimes(int, const struct timeval *); -int lutimes(const char *, const struct timeval *); +int lutimes(const char *, const struct timeval *) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); int settimeofday(const struct timeval *, const struct timezone *); #endif /* 
(!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ diff --git a/bsd/sys/tree.h b/bsd/sys/tree.h index f4bf40c73..42427ca31 100644 --- a/bsd/sys/tree.h +++ b/bsd/sys/tree.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -26,693 +26,4 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* $NetBSD: tree.h,v 1.13 2006/08/27 22:32:38 christos Exp $ */ -/* $OpenBSD: tree.h,v 1.7 2002/10/17 21:51:54 art Exp $ */ -/* - * Copyright 2002 Niels Provos - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef _SYS_TREE_H_ -#define _SYS_TREE_H_ - -/* - * This file defines data structures for different types of trees: - * splay trees and red-black trees. - * - * A splay tree is a self-organizing data structure. Every operation - * on the tree causes a splay to happen. The splay moves the requested - * node to the root of the tree and partly rebalances it. - * - * This has the benefit that request locality causes faster lookups as - * the requested nodes move to the top of the tree. On the other hand, - * every lookup causes memory writes. - * - * The Balance Theorem bounds the total access time for m operations - * and n inserts on an initially empty tree as O((m + n)lg n). The - * amortized cost for a sequence of m accesses to a splay tree is O(lg n); - * - * A red-black tree is a binary search tree with the node color as an - * extra attribute. It fulfills a set of conditions: - * - every search path from the root to a leaf consists of the - * same number of black nodes, - * - each red node (except for the root) has a black parent, - * - each leaf node is black. - * - * Every operation on a red-black tree is bounded as O(lg n). - * The maximum height of a red-black tree is 2lg (n+1). 
- */ - -#define SPLAY_HEAD(name, type) \ -struct name { \ - struct type *sph_root; /* root of the tree */ \ -} - -#define SPLAY_INITIALIZER(root) \ - { NULL } - -#define SPLAY_INIT(root) do { \ - (root)->sph_root = NULL; \ -} while (/*CONSTCOND*/ 0) - -#define SPLAY_ENTRY(type) \ -struct { \ - struct type *spe_left; /* left element */ \ - struct type *spe_right; /* right element */ \ -} - -#define SPLAY_LEFT(elm, field) (elm)->field.spe_left -#define SPLAY_RIGHT(elm, field) (elm)->field.spe_right -#define SPLAY_ROOT(head) (head)->sph_root -#define SPLAY_EMPTY(head) (SPLAY_ROOT(head) == NULL) - -/* SPLAY_ROTATE_{LEFT,RIGHT} expect that tmp hold SPLAY_{RIGHT,LEFT} */ -#define SPLAY_ROTATE_RIGHT(head, tmp, field) do { \ - SPLAY_LEFT((head)->sph_root, field) = SPLAY_RIGHT(tmp, field); \ - SPLAY_RIGHT(tmp, field) = (head)->sph_root; \ - (head)->sph_root = tmp; \ -} while (/*CONSTCOND*/ 0) - -#define SPLAY_ROTATE_LEFT(head, tmp, field) do { \ - SPLAY_RIGHT((head)->sph_root, field) = SPLAY_LEFT(tmp, field); \ - SPLAY_LEFT(tmp, field) = (head)->sph_root; \ - (head)->sph_root = tmp; \ -} while (/*CONSTCOND*/ 0) - -#define SPLAY_LINKLEFT(head, tmp, field) do { \ - SPLAY_LEFT(tmp, field) = (head)->sph_root; \ - tmp = (head)->sph_root; \ - (head)->sph_root = SPLAY_LEFT((head)->sph_root, field); \ -} while (/*CONSTCOND*/ 0) - -#define SPLAY_LINKRIGHT(head, tmp, field) do { \ - SPLAY_RIGHT(tmp, field) = (head)->sph_root; \ - tmp = (head)->sph_root; \ - (head)->sph_root = SPLAY_RIGHT((head)->sph_root, field); \ -} while (/*CONSTCOND*/ 0) - -#define SPLAY_ASSEMBLE(head, node, left, right, field) do { \ - SPLAY_RIGHT(left, field) = SPLAY_LEFT((head)->sph_root, field); \ - SPLAY_LEFT(right, field) = SPLAY_RIGHT((head)->sph_root, field);\ - SPLAY_LEFT((head)->sph_root, field) = SPLAY_RIGHT(node, field); \ - SPLAY_RIGHT((head)->sph_root, field) = SPLAY_LEFT(node, field); \ -} while (/*CONSTCOND*/ 0) - -/* Generates prototypes and inline functions */ - -#define SPLAY_PROTOTYPE(name, 
type, field, cmp) \ -void name##_SPLAY(struct name *, struct type *); \ -void name##_SPLAY_MINMAX(struct name *, int); \ -struct type *name##_SPLAY_INSERT(struct name *, struct type *); \ -struct type *name##_SPLAY_REMOVE(struct name *, struct type *); \ - \ -/* Finds the node with the same key as elm */ \ -static __inline struct type * \ -name##_SPLAY_FIND(struct name *head, struct type *elm) \ -{ \ - if (SPLAY_EMPTY(head)) \ - return(NULL); \ - name##_SPLAY(head, elm); \ - if ((cmp)(elm, (head)->sph_root) == 0) \ - return (head->sph_root); \ - return (NULL); \ -} \ - \ -static __inline struct type * \ -name##_SPLAY_NEXT(struct name *head, struct type *elm) \ -{ \ - name##_SPLAY(head, elm); \ - if (SPLAY_RIGHT(elm, field) != NULL) { \ - elm = SPLAY_RIGHT(elm, field); \ - while (SPLAY_LEFT(elm, field) != NULL) { \ - elm = SPLAY_LEFT(elm, field); \ - } \ - } else \ - elm = NULL; \ - return (elm); \ -} \ - \ -static __inline struct type * \ -name##_SPLAY_MIN_MAX(struct name *head, int val) \ -{ \ - name##_SPLAY_MINMAX(head, val); \ - return (SPLAY_ROOT(head)); \ -} - -/* Main splay operation. 
- * Moves node close to the key of elm to top - */ -#define SPLAY_GENERATE(name, type, field, cmp) \ -struct type * \ -name##_SPLAY_INSERT(struct name *head, struct type *elm) \ -{ \ - if (SPLAY_EMPTY(head)) { \ - SPLAY_LEFT(elm, field) = SPLAY_RIGHT(elm, field) = NULL; \ - } else { \ - int __comp; \ - name##_SPLAY(head, elm); \ - __comp = (cmp)(elm, (head)->sph_root); \ - if(__comp < 0) { \ - SPLAY_LEFT(elm, field) = SPLAY_LEFT((head)->sph_root, field);\ - SPLAY_RIGHT(elm, field) = (head)->sph_root; \ - SPLAY_LEFT((head)->sph_root, field) = NULL; \ - } else if (__comp > 0) { \ - SPLAY_RIGHT(elm, field) = SPLAY_RIGHT((head)->sph_root, field);\ - SPLAY_LEFT(elm, field) = (head)->sph_root; \ - SPLAY_RIGHT((head)->sph_root, field) = NULL; \ - } else \ - return ((head)->sph_root); \ - } \ - (head)->sph_root = (elm); \ - return (NULL); \ -} \ - \ -struct type * \ -name##_SPLAY_REMOVE(struct name *head, struct type *elm) \ -{ \ - struct type *__tmp; \ - if (SPLAY_EMPTY(head)) \ - return (NULL); \ - name##_SPLAY(head, elm); \ - if ((cmp)(elm, (head)->sph_root) == 0) { \ - if (SPLAY_LEFT((head)->sph_root, field) == NULL) { \ - (head)->sph_root = SPLAY_RIGHT((head)->sph_root, field);\ - } else { \ - __tmp = SPLAY_RIGHT((head)->sph_root, field); \ - (head)->sph_root = SPLAY_LEFT((head)->sph_root, field);\ - name##_SPLAY(head, elm); \ - SPLAY_RIGHT((head)->sph_root, field) = __tmp; \ - } \ - return (elm); \ - } \ - return (NULL); \ -} \ - \ -void \ -name##_SPLAY(struct name *head, struct type *elm) \ -{ \ - struct type __node, *__left, *__right, *__tmp; \ - int __comp; \ -\ - SPLAY_LEFT(&__node, field) = SPLAY_RIGHT(&__node, field) = NULL;\ - __left = __right = &__node; \ -\ - while ((__comp = (cmp)(elm, (head)->sph_root)) != 0) { \ - if (__comp < 0) { \ - __tmp = SPLAY_LEFT((head)->sph_root, field); \ - if (__tmp == NULL) \ - break; \ - if ((cmp)(elm, __tmp) < 0){ \ - SPLAY_ROTATE_RIGHT(head, __tmp, field); \ - if (SPLAY_LEFT((head)->sph_root, field) == NULL)\ - break; \ - } 
\ - SPLAY_LINKLEFT(head, __right, field); \ - } else if (__comp > 0) { \ - __tmp = SPLAY_RIGHT((head)->sph_root, field); \ - if (__tmp == NULL) \ - break; \ - if ((cmp)(elm, __tmp) > 0){ \ - SPLAY_ROTATE_LEFT(head, __tmp, field); \ - if (SPLAY_RIGHT((head)->sph_root, field) == NULL)\ - break; \ - } \ - SPLAY_LINKRIGHT(head, __left, field); \ - } \ - } \ - SPLAY_ASSEMBLE(head, &__node, __left, __right, field); \ -} \ - \ -/* Splay with either the minimum or the maximum element \ - * Used to find minimum or maximum element in tree. \ - */ \ -void name##_SPLAY_MINMAX(struct name *head, int __comp) \ -{ \ - struct type __node, *__left, *__right, *__tmp; \ -\ - SPLAY_LEFT(&__node, field) = SPLAY_RIGHT(&__node, field) = NULL;\ - __left = __right = &__node; \ -\ - while (1) { \ - if (__comp < 0) { \ - __tmp = SPLAY_LEFT((head)->sph_root, field); \ - if (__tmp == NULL) \ - break; \ - if (__comp < 0){ \ - SPLAY_ROTATE_RIGHT(head, __tmp, field); \ - if (SPLAY_LEFT((head)->sph_root, field) == NULL)\ - break; \ - } \ - SPLAY_LINKLEFT(head, __right, field); \ - } else if (__comp > 0) { \ - __tmp = SPLAY_RIGHT((head)->sph_root, field); \ - if (__tmp == NULL) \ - break; \ - if (__comp > 0) { \ - SPLAY_ROTATE_LEFT(head, __tmp, field); \ - if (SPLAY_RIGHT((head)->sph_root, field) == NULL)\ - break; \ - } \ - SPLAY_LINKRIGHT(head, __left, field); \ - } \ - } \ - SPLAY_ASSEMBLE(head, &__node, __left, __right, field); \ -} - -#define SPLAY_NEGINF -1 -#define SPLAY_INF 1 - -#define SPLAY_INSERT(name, x, y) name##_SPLAY_INSERT(x, y) -#define SPLAY_REMOVE(name, x, y) name##_SPLAY_REMOVE(x, y) -#define SPLAY_FIND(name, x, y) name##_SPLAY_FIND(x, y) -#define SPLAY_NEXT(name, x, y) name##_SPLAY_NEXT(x, y) -#define SPLAY_MIN(name, x) (SPLAY_EMPTY(x) ? NULL \ - : name##_SPLAY_MIN_MAX(x, SPLAY_NEGINF)) -#define SPLAY_MAX(name, x) (SPLAY_EMPTY(x) ? 
NULL \ - : name##_SPLAY_MIN_MAX(x, SPLAY_INF)) - -#define SPLAY_FOREACH(x, name, head) \ - for ((x) = SPLAY_MIN(name, head); \ - (x) != NULL; \ - (x) = SPLAY_NEXT(name, head, x)) - -/* Macros that define a red-black tree */ -#define RB_HEAD(name, type) \ -struct name { \ - struct type *rbh_root; /* root of the tree */ \ -} - -#define RB_INITIALIZER(root) \ - { NULL } - -#define RB_INIT(root) do { \ - (root)->rbh_root = NULL; \ -} while (/*CONSTCOND*/ 0) - -#define RB_BLACK 0 -#define RB_RED 1 -#define RB_ENTRY(type) \ -struct { \ - struct type *rbe_left; /* left element */ \ - struct type *rbe_right; /* right element */ \ - struct type *rbe_parent; /* parent element */ \ - int rbe_color; /* node color */ \ -} - -#define RB_LEFT(elm, field) (elm)->field.rbe_left -#define RB_RIGHT(elm, field) (elm)->field.rbe_right -#define RB_PARENT(elm, field) (elm)->field.rbe_parent -#define RB_COLOR(elm, field) (elm)->field.rbe_color -#define RB_ROOT(head) (head)->rbh_root -#define RB_EMPTY(head) (RB_ROOT(head) == NULL) - -#define RB_SET(elm, parent, field) do { \ - RB_PARENT(elm, field) = parent; \ - RB_LEFT(elm, field) = RB_RIGHT(elm, field) = NULL; \ - RB_COLOR(elm, field) = RB_RED; \ -} while (/*CONSTCOND*/ 0) - -#define RB_SET_BLACKRED(black, red, field) do { \ - RB_COLOR(black, field) = RB_BLACK; \ - RB_COLOR(red, field) = RB_RED; \ -} while (/*CONSTCOND*/ 0) - -#ifndef RB_AUGMENT -#define RB_AUGMENT(x) (void)(x) -#endif - -#define RB_ROTATE_LEFT(head, elm, tmp, field) do { \ - (tmp) = RB_RIGHT(elm, field); \ - if ((RB_RIGHT(elm, field) = RB_LEFT(tmp, field)) != NULL) { \ - RB_PARENT(RB_LEFT(tmp, field), field) = (elm); \ - } \ - RB_AUGMENT(elm); \ - if ((RB_PARENT(tmp, field) = RB_PARENT(elm, field)) != NULL) { \ - if ((elm) == RB_LEFT(RB_PARENT(elm, field), field)) \ - RB_LEFT(RB_PARENT(elm, field), field) = (tmp); \ - else \ - RB_RIGHT(RB_PARENT(elm, field), field) = (tmp); \ - } else \ - (head)->rbh_root = (tmp); \ - RB_LEFT(tmp, field) = (elm); \ - RB_PARENT(elm, 
field) = (tmp); \ - RB_AUGMENT(tmp); \ - if ((RB_PARENT(tmp, field))) \ - RB_AUGMENT(RB_PARENT(tmp, field)); \ -} while (/*CONSTCOND*/ 0) - -#define RB_ROTATE_RIGHT(head, elm, tmp, field) do { \ - (tmp) = RB_LEFT(elm, field); \ - if ((RB_LEFT(elm, field) = RB_RIGHT(tmp, field)) != NULL) { \ - RB_PARENT(RB_RIGHT(tmp, field), field) = (elm); \ - } \ - RB_AUGMENT(elm); \ - if ((RB_PARENT(tmp, field) = RB_PARENT(elm, field)) != NULL) { \ - if ((elm) == RB_LEFT(RB_PARENT(elm, field), field)) \ - RB_LEFT(RB_PARENT(elm, field), field) = (tmp); \ - else \ - RB_RIGHT(RB_PARENT(elm, field), field) = (tmp); \ - } else \ - (head)->rbh_root = (tmp); \ - RB_RIGHT(tmp, field) = (elm); \ - RB_PARENT(elm, field) = (tmp); \ - RB_AUGMENT(tmp); \ - if ((RB_PARENT(tmp, field))) \ - RB_AUGMENT(RB_PARENT(tmp, field)); \ -} while (/*CONSTCOND*/ 0) - -/* Generates prototypes and inline functions */ -#define RB_PROTOTYPE(name, type, field, cmp) \ -void name##_RB_INSERT_COLOR(struct name *, struct type *); \ -void name##_RB_REMOVE_COLOR(struct name *, struct type *, struct type *);\ -struct type *name##_RB_REMOVE(struct name *, struct type *); \ -struct type *name##_RB_INSERT(struct name *, struct type *); \ -struct type *name##_RB_FIND(struct name *, struct type *); \ -struct type *name##_RB_NEXT(struct type *); \ -struct type *name##_RB_MINMAX(struct name *, int); - -/* Generates prototypes (with storage class) and inline functions */ -#define RB_PROTOTYPE_SC(_sc_, name, type, field, cmp) \ -_sc_ void name##_RB_INSERT_COLOR(struct name *, struct type *); \ -_sc_ void name##_RB_REMOVE_COLOR(struct name *, struct type *, struct type *); \ -_sc_ struct type *name##_RB_REMOVE(struct name *, struct type *); \ -_sc_ struct type *name##_RB_INSERT(struct name *, struct type *); \ -_sc_ struct type *name##_RB_FIND(struct name *, struct type *); \ -_sc_ struct type *name##_RB_NEXT(struct type *); \ -_sc_ struct type *name##_RB_MINMAX(struct name *, int); - -/* Main rb operation. 
- * Moves node close to the key of elm to top - */ -#define RB_GENERATE(name, type, field, cmp) \ -void \ -name##_RB_INSERT_COLOR(struct name *head, struct type *elm) \ -{ \ - struct type *parent, *gparent, *tmp; \ - while ((parent = RB_PARENT(elm, field)) != NULL && \ - RB_COLOR(parent, field) == RB_RED) { \ - gparent = RB_PARENT(parent, field); \ - if (parent == RB_LEFT(gparent, field)) { \ - tmp = RB_RIGHT(gparent, field); \ - if (tmp && RB_COLOR(tmp, field) == RB_RED) { \ - RB_COLOR(tmp, field) = RB_BLACK; \ - RB_SET_BLACKRED(parent, gparent, field);\ - elm = gparent; \ - continue; \ - } \ - if (RB_RIGHT(parent, field) == elm) { \ - RB_ROTATE_LEFT(head, parent, tmp, field);\ - tmp = parent; \ - parent = elm; \ - elm = tmp; \ - } \ - RB_SET_BLACKRED(parent, gparent, field); \ - RB_ROTATE_RIGHT(head, gparent, tmp, field); \ - } else { \ - tmp = RB_LEFT(gparent, field); \ - if (tmp && RB_COLOR(tmp, field) == RB_RED) { \ - RB_COLOR(tmp, field) = RB_BLACK; \ - RB_SET_BLACKRED(parent, gparent, field);\ - elm = gparent; \ - continue; \ - } \ - if (RB_LEFT(parent, field) == elm) { \ - RB_ROTATE_RIGHT(head, parent, tmp, field);\ - tmp = parent; \ - parent = elm; \ - elm = tmp; \ - } \ - RB_SET_BLACKRED(parent, gparent, field); \ - RB_ROTATE_LEFT(head, gparent, tmp, field); \ - } \ - } \ - RB_COLOR(head->rbh_root, field) = RB_BLACK; \ -} \ - \ -void \ -name##_RB_REMOVE_COLOR(struct name *head, struct type *parent, struct type *elm) \ -{ \ - struct type *tmp; \ - while ((elm == NULL || RB_COLOR(elm, field) == RB_BLACK) && \ - elm != RB_ROOT(head)) { \ - if (RB_LEFT(parent, field) == elm) { \ - tmp = RB_RIGHT(parent, field); \ - if (RB_COLOR(tmp, field) == RB_RED) { \ - RB_SET_BLACKRED(tmp, parent, field); \ - RB_ROTATE_LEFT(head, parent, tmp, field);\ - tmp = RB_RIGHT(parent, field); \ - } \ - if ((RB_LEFT(tmp, field) == NULL || \ - RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) &&\ - (RB_RIGHT(tmp, field) == NULL || \ - RB_COLOR(RB_RIGHT(tmp, field), field) == 
RB_BLACK)) {\ - RB_COLOR(tmp, field) = RB_RED; \ - elm = parent; \ - parent = RB_PARENT(elm, field); \ - } else { \ - if (RB_RIGHT(tmp, field) == NULL || \ - RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK) {\ - struct type *oleft; \ - if ((oleft = RB_LEFT(tmp, field)) \ - != NULL) \ - RB_COLOR(oleft, field) = RB_BLACK;\ - RB_COLOR(tmp, field) = RB_RED; \ - RB_ROTATE_RIGHT(head, tmp, oleft, field);\ - tmp = RB_RIGHT(parent, field); \ - } \ - RB_COLOR(tmp, field) = RB_COLOR(parent, field);\ - RB_COLOR(parent, field) = RB_BLACK; \ - if (RB_RIGHT(tmp, field)) \ - RB_COLOR(RB_RIGHT(tmp, field), field) = RB_BLACK;\ - RB_ROTATE_LEFT(head, parent, tmp, field);\ - elm = RB_ROOT(head); \ - break; \ - } \ - } else { \ - tmp = RB_LEFT(parent, field); \ - if (RB_COLOR(tmp, field) == RB_RED) { \ - RB_SET_BLACKRED(tmp, parent, field); \ - RB_ROTATE_RIGHT(head, parent, tmp, field);\ - tmp = RB_LEFT(parent, field); \ - } \ - if ((RB_LEFT(tmp, field) == NULL || \ - RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) &&\ - (RB_RIGHT(tmp, field) == NULL || \ - RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK)) {\ - RB_COLOR(tmp, field) = RB_RED; \ - elm = parent; \ - parent = RB_PARENT(elm, field); \ - } else { \ - if (RB_LEFT(tmp, field) == NULL || \ - RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) {\ - struct type *oright; \ - if ((oright = RB_RIGHT(tmp, field)) \ - != NULL) \ - RB_COLOR(oright, field) = RB_BLACK;\ - RB_COLOR(tmp, field) = RB_RED; \ - RB_ROTATE_LEFT(head, tmp, oright, field);\ - tmp = RB_LEFT(parent, field); \ - } \ - RB_COLOR(tmp, field) = RB_COLOR(parent, field);\ - RB_COLOR(parent, field) = RB_BLACK; \ - if (RB_LEFT(tmp, field)) \ - RB_COLOR(RB_LEFT(tmp, field), field) = RB_BLACK;\ - RB_ROTATE_RIGHT(head, parent, tmp, field);\ - elm = RB_ROOT(head); \ - break; \ - } \ - } \ - } \ - if (elm) \ - RB_COLOR(elm, field) = RB_BLACK; \ -} \ - \ -struct type * \ -name##_RB_REMOVE(struct name *head, struct type *elm) \ -{ \ - struct type *child, *parent, *old = elm; 
\ - int color; \ - if (RB_LEFT(elm, field) == NULL) \ - child = RB_RIGHT(elm, field); \ - else if (RB_RIGHT(elm, field) == NULL) \ - child = RB_LEFT(elm, field); \ - else { \ - struct type *left; \ - elm = RB_RIGHT(elm, field); \ - while ((left = RB_LEFT(elm, field)) != NULL) \ - elm = left; \ - child = RB_RIGHT(elm, field); \ - parent = RB_PARENT(elm, field); \ - color = RB_COLOR(elm, field); \ - if (child) \ - RB_PARENT(child, field) = parent; \ - if (parent) { \ - if (RB_LEFT(parent, field) == elm) \ - RB_LEFT(parent, field) = child; \ - else \ - RB_RIGHT(parent, field) = child; \ - RB_AUGMENT(parent); \ - } else \ - RB_ROOT(head) = child; \ - if (RB_PARENT(elm, field) == old) \ - parent = elm; \ - (elm)->field = (old)->field; \ - if (RB_PARENT(old, field)) { \ - if (RB_LEFT(RB_PARENT(old, field), field) == old)\ - RB_LEFT(RB_PARENT(old, field), field) = elm;\ - else \ - RB_RIGHT(RB_PARENT(old, field), field) = elm;\ - RB_AUGMENT(RB_PARENT(old, field)); \ - } else \ - RB_ROOT(head) = elm; \ - RB_PARENT(RB_LEFT(old, field), field) = elm; \ - if (RB_RIGHT(old, field)) \ - RB_PARENT(RB_RIGHT(old, field), field) = elm; \ - if (parent) { \ - left = parent; \ - do { \ - RB_AUGMENT(left); \ - } while ((left = RB_PARENT(left, field)) != NULL); \ - } \ - goto color; \ - } \ - parent = RB_PARENT(elm, field); \ - color = RB_COLOR(elm, field); \ - if (child) \ - RB_PARENT(child, field) = parent; \ - if (parent) { \ - if (RB_LEFT(parent, field) == elm) \ - RB_LEFT(parent, field) = child; \ - else \ - RB_RIGHT(parent, field) = child; \ - RB_AUGMENT(parent); \ - } else \ - RB_ROOT(head) = child; \ -color: \ - if (color == RB_BLACK) \ - name##_RB_REMOVE_COLOR(head, parent, child); \ - return (old); \ -} \ - \ -/* Inserts a node into the RB tree */ \ -struct type * \ -name##_RB_INSERT(struct name *head, struct type *elm) \ -{ \ - struct type *tmp; \ - struct type *parent = NULL; \ - int comp = 0; \ - tmp = RB_ROOT(head); \ - while (tmp) { \ - parent = tmp; \ - comp = (cmp)(elm, 
parent); \ - if (comp < 0) \ - tmp = RB_LEFT(tmp, field); \ - else if (comp > 0) \ - tmp = RB_RIGHT(tmp, field); \ - else \ - return (tmp); \ - } \ - RB_SET(elm, parent, field); \ - if (parent != NULL) { \ - if (comp < 0) \ - RB_LEFT(parent, field) = elm; \ - else \ - RB_RIGHT(parent, field) = elm; \ - RB_AUGMENT(parent); \ - } else \ - RB_ROOT(head) = elm; \ - name##_RB_INSERT_COLOR(head, elm); \ - return (NULL); \ -} \ - \ -/* Finds the node with the same key as elm */ \ -struct type * \ -name##_RB_FIND(struct name *head, struct type *elm) \ -{ \ - struct type *tmp = RB_ROOT(head); \ - int comp; \ - while (tmp) { \ - comp = cmp(elm, tmp); \ - if (comp < 0) \ - tmp = RB_LEFT(tmp, field); \ - else if (comp > 0) \ - tmp = RB_RIGHT(tmp, field); \ - else \ - return (tmp); \ - } \ - return (NULL); \ -} \ - \ -/* ARGSUSED */ \ -struct type * \ -name##_RB_NEXT(struct type *elm) \ -{ \ - if (RB_RIGHT(elm, field)) { \ - elm = RB_RIGHT(elm, field); \ - while (RB_LEFT(elm, field)) \ - elm = RB_LEFT(elm, field); \ - } else { \ - if (RB_PARENT(elm, field) && \ - (elm == RB_LEFT(RB_PARENT(elm, field), field))) \ - elm = RB_PARENT(elm, field); \ - else { \ - while (RB_PARENT(elm, field) && \ - (elm == RB_RIGHT(RB_PARENT(elm, field), field)))\ - elm = RB_PARENT(elm, field); \ - elm = RB_PARENT(elm, field); \ - } \ - } \ - return (elm); \ -} \ - \ -struct type * \ -name##_RB_MINMAX(struct name *head, int val) \ -{ \ - struct type *tmp = RB_ROOT(head); \ - struct type *parent = NULL; \ - while (tmp) { \ - parent = tmp; \ - if (val < 0) \ - tmp = RB_LEFT(tmp, field); \ - else \ - tmp = RB_RIGHT(tmp, field); \ - } \ - return (parent); \ -} - -#define RB_NEGINF -1 -#define RB_INF 1 - -#define RB_INSERT(name, x, y) name##_RB_INSERT(x, y) -#define RB_REMOVE(name, x, y) name##_RB_REMOVE(x, y) -#define RB_FIND(name, x, y) name##_RB_FIND(x, y) -#define RB_NEXT(name, x, y) name##_RB_NEXT(y) -#define RB_MIN(name, x) name##_RB_MINMAX(x, RB_NEGINF) -#define RB_MAX(name, x) name##_RB_MINMAX(x, 
RB_INF) - -#define RB_FOREACH(x, name, head) \ - for ((x) = RB_MIN(name, head); \ - (x) != NULL; \ - (x) = name##_RB_NEXT(x)) - -#endif /* _SYS_TREE_H_ */ +#include <libkern/tree.h> diff --git a/bsd/sys/tty.h b/bsd/sys/tty.h index f0f546c48..ecfb234d5 100644 --- a/bsd/sys/tty.h +++ b/bsd/sys/tty.h @@ -220,6 +220,8 @@ struct clist; #define TS_DSR_OFLOW 0x800000 /* For CDSR_OFLOW. */ #endif +#define TS_IOCTL_NOT_OK 0x1000000 /* Workaround */ + /* Character type information. */ #define ORDINARY 0 diff --git a/bsd/sys/ubc.h b/bsd/sys/ubc.h index a26ba1caa..4ee9e86cf 100644 --- a/bsd/sys/ubc.h +++ b/bsd/sys/ubc.h @@ -70,6 +70,7 @@ int ubc_setcred(struct vnode *, struct proc *) __deprecated; /* code signing */ struct cs_blob; struct cs_blob *ubc_cs_blob_get(vnode_t, cpu_type_t, off_t); +int cs_entitlements_blob_get(proc_t p, void **, size_t *); #endif /* cluster IO routines */ diff --git a/bsd/sys/ubc_internal.h b/bsd/sys/ubc_internal.h index 775a8457b..d7197f089 100644 --- a/bsd/sys/ubc_internal.h +++ b/bsd/sys/ubc_internal.h @@ -87,6 +87,8 @@ struct cl_readahead { struct cl_writebehind { lck_mtx_t cl_lockw; void * cl_scmap; /* pointer to sparse cluster map */ + off_t cl_last_write; /* offset of the end of the last write */ + off_t cl_seq_written; /* sequentially written bytes */ int cl_sparse_pushes; /* number of pushes outside of the cl_lockw in progress */ int cl_sparse_wait; /* synchronous push is in progress */ int cl_number; /* number of packed write behind clusters currently valid */ @@ -124,6 +126,13 @@ struct ubc_info { struct cl_writebehind *cl_wbehind; /* cluster write behind context */ struct cs_blob *cs_blobs; /* for CODE SIGNING */ +#if CHECK_CS_VALIDATION_BITMAP + void *cs_valid_bitmap; /* right now: used only for signed files on the read-only root volume */ + uint64_t cs_valid_bitmap_size; /* Save original bitmap size in case the file size changes. + * In the future, we may want to reconsider changing the + * underlying bitmap to reflect the new file size changes. 
+ */ +#endif /* CHECK_CS_VALIDATION_BITMAP */ }; /* Defines for ui_flags */ @@ -159,6 +168,7 @@ __private_extern__ uint32_t cluster_hard_throttle_limit(vnode_t, uint32_t *, uin #define UBC_FOR_PAGEOUT 0x0002 memory_object_control_t ubc_getobject(vnode_t, int); +boolean_t ubc_strict_uncached_IO(vnode_t); int ubc_info_init(vnode_t); int ubc_info_init_withsize(vnode_t, off_t); @@ -181,6 +191,8 @@ int ubc_cs_getcdhash(vnode_t, off_t, unsigned char *); kern_return_t ubc_cs_blob_allocate(vm_offset_t *, vm_size_t *); void ubc_cs_blob_deallocate(vm_offset_t, vm_size_t); +kern_return_t ubc_cs_validation_bitmap_allocate( vnode_t ); +void ubc_cs_validation_bitmap_deallocate( vnode_t ); __END_DECLS diff --git a/bsd/sys/ucontext.h b/bsd/sys/ucontext.h index b31d50ed3..249cf5e23 100644 --- a/bsd/sys/ucontext.h +++ b/bsd/sys/ucontext.h @@ -35,12 +35,6 @@ #define __need_mcontext_t #define __need_stack_t #define __need_ucontext_t -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -#if defined(__ppc__) || defined(__ppc64__) -#define __need_mcontext64_t -#define __need_ucontext64_t -#endif /* __ppc__|| __ppc64__ */ -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ #include #ifndef _SIGSET_T diff --git a/bsd/sys/ucred.h b/bsd/sys/ucred.h index 0d8b0f2a4..6d914a4df 100644 --- a/bsd/sys/ucred.h +++ b/bsd/sys/ucred.h @@ -90,6 +90,7 @@ struct ucred { TAILQ_ENTRY(ucred) cr_link; /* never modify this without KAUTH_CRED_HASH_LOCK */ u_long cr_ref; /* reference count */ +struct posix_cred { /* * The credential hash depends on everything from this point on * (see kauth_cred_get_hashkey) @@ -102,15 +103,9 @@ struct ucred { gid_t cr_rgid; /* real group id */ gid_t cr_svgid; /* saved group id */ uid_t cr_gmuid; /* UID for group membership purposes */ - /* - * XXX - cr_au will be replaced with cr_audit below. - * cr_au is here to keep kexts from breaking. It seems to - * be currently used by the ucred hashing as well. - */ - struct auditinfo cr_au; /* XXX This needs to go away. 
*/ - struct label *cr_label; /* MAC label */ - int cr_flags; /* flags on credential */ +} cr_posix; + struct label *cr_label; /* MAC label */ /* * NOTE: If anything else (besides the flags) * added after the label, you must change @@ -121,6 +116,7 @@ struct ucred { #ifndef _KAUTH_CRED_T #define _KAUTH_CRED_T typedef struct ucred *kauth_cred_t; +typedef struct posix_cred *posix_cred_t; #endif /* !_KAUTH_CRED_T */ /* diff --git a/bsd/sys/un.h b/bsd/sys/un.h index 479058ff2..400620396 100644 --- a/bsd/sys/un.h +++ b/bsd/sys/un.h @@ -83,8 +83,13 @@ struct sockaddr_un { }; #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) + +/* Level number of get/setsockopt for local domain sockets */ +#define SOL_LOCAL 0 + /* Socket options. */ #define LOCAL_PEERCRED 0x001 /* retrieve peer credentails */ + #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ diff --git a/bsd/sys/unistd.h b/bsd/sys/unistd.h index d80b3bbd3..c778c66f3 100644 --- a/bsd/sys/unistd.h +++ b/bsd/sys/unistd.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -174,6 +174,7 @@ struct accessx_descriptor { #define _PC_REC_XFER_ALIGN 23 /* Recommended buffer alignment */ #define _PC_SYMLINK_MAX 24 /* Max # of bytes in symlink name */ #define _PC_SYNC_IO 25 /* Sync I/O [SIO] supported? */ +#define _PC_XATTR_SIZE_BITS 26 /* # of bits to represent maximum xattr size */ /* configurable system strings */ #define _CS_PATH 1 diff --git a/bsd/sys/unpcb.h b/bsd/sys/unpcb.h index 2376c11f8..a50aebe36 100644 --- a/bsd/sys/unpcb.h +++ b/bsd/sys/unpcb.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2008-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -134,7 +134,7 @@ struct unpcb { unp_gen_t unp_gencnt; /* generation count of this instance */ int unp_flags; /* flags */ struct xucred unp_peercred; /* peer credentials, if applicable */ - lck_mtx_t *unp_mtx; /* per unpcb lock */ + decl_lck_mtx_data( ,unp_mtx); /* per unpcb lock */ int rw_thrcount; /* disconnect should wait for this count to become zero */ }; #endif /* KERNEL */ @@ -155,6 +155,7 @@ struct unpcb { #define UNP_HAVEPC 0x0001 #define UNP_HAVEPCCACHED 0x0002 #define UNP_DONTDISCONNECT 0x0004 +#define UNP_TRACE_MDNS 0x1000 #ifdef KERNEL struct unpcb_compat { diff --git a/bsd/sys/user.h b/bsd/sys/user.h index 66f355110..4a59aa866 100644 --- a/bsd/sys/user.h +++ b/bsd/sys/user.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -79,12 +79,15 @@ #endif #include /* XXX */ #include - + + #ifdef KERNEL +#ifdef BSD_KERNEL_PRIVATE +#include /* for uu_kwe entry */ +#endif /* BSD_KERNEL_PRIVATE */ #ifdef __APPLE_API_PRIVATE #include - #if !defined(__LP64__) || defined(XNU_KERNEL_PRIVATE) /* * VFS context structure (part of uthread) @@ -124,7 +127,7 @@ struct uthread { int poll; int error; int count; - int kfcount; + int _reserved1; // UNUSED: avoid changing size for now char * wql; } uu_select; /* saved state for select() */ /* to support kevent continuations */ @@ -156,7 +159,9 @@ struct uthread { caddr_t uu_wchan; /* sleeping thread wait channel */ const char *uu_wmesg; /* ... 
wait message */ int uu_flag; +#if CONFIG_EMBEDDED int uu_iopol_disk; /* disk I/O policy */ +#endif /* CONFIG_EMBEDDED */ struct proc * uu_proc; void * uu_userstate; wait_queue_set_t uu_wqset; /* cached across select calls */ @@ -172,12 +177,12 @@ struct uthread { struct kaudit_record *uu_ar; /* audit record */ struct task* uu_aio_task; /* target task for async io */ - - /* network support for dlil layer locking */ - u_int32_t dlil_incremented_read; + lck_mtx_t *uu_mtx; int uu_lowpri_window; + boolean_t uu_throttle_isssd; + boolean_t uu_throttle_bc; void * uu_throttle_info; /* pointer to throttled I/Os info */ struct kern_sigaltstack uu_sigstk; @@ -191,12 +196,14 @@ struct uthread { int uu_iocount; int uu_vpindex; void * uu_vps[32]; + void * uu_pcs[32][10]; #endif #if CONFIG_DTRACE siginfo_t t_dtrace_siginfo; uint32_t t_dtrace_errno; /* Most recent errno */ - uint8_t t_dtrace_stop; /* indicates a DTrace-desired stop */ + uint8_t t_dtrace_stop; /* indicates a DTrace desired stop */ uint8_t t_dtrace_sig; /* signal sent via DTrace's raise() */ + uint64_t t_dtrace_resumepid; /* DTrace's pidresume() pid */ union __tdu { struct __tds { @@ -232,10 +239,7 @@ struct uthread { #endif /* CONFIG_DTRACE */ void * uu_threadlist; char * pth_name; - TAILQ_ENTRY(uthread) uu_mtxlist; /* psynch waiters list*/ - uint32_t uu_lockseq; /* seq on arrival */ - uint32_t uu_psynchretval; /* pmtx retval */ - void * uu_kwqqueue; /* queue blocked on */ + struct ksyn_waitq_element uu_kwe; /* used for pthread synch */ }; typedef struct uthread * uthread_t; @@ -252,7 +256,9 @@ typedef struct uthread * uthread_t; #define UT_PASSIVE_IO 0x00000100 /* this thread issues passive I/O */ #define UT_PROCEXIT 0x00000200 /* this thread completed the proc exit */ #define UT_RAGE_VNODES 0x00000400 /* rapid age any vnodes created by this thread */ +#if CONFIG_EMBEDDED #define UT_BACKGROUND 0x00000800 /* this thread is in background state */ +#endif /* CONFIG_EMBEDDED */ #define UT_BACKGROUND_TRAFFIC_MGT 
0x00001000 /* background traffic is regulated */ #define UT_VFORK 0x02000000 /* thread has vfork children */ diff --git a/bsd/sys/vfs_context.h b/bsd/sys/vfs_context.h index 16453bb7a..fd31f99e3 100644 --- a/bsd/sys/vfs_context.h +++ b/bsd/sys/vfs_context.h @@ -5,7 +5,9 @@ #include #include #include +#ifdef BSD_KERNEL_PRIVATE #include +#endif #include /* diff --git a/bsd/sys/vnode.h b/bsd/sys/vnode.h index 65620f277..965518cb0 100644 --- a/bsd/sys/vnode.h +++ b/bsd/sys/vnode.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -145,6 +145,7 @@ enum vtagtype { #define IO_BACKGROUND IO_PASSIVE /* used for backward compatibility. to be removed after IO_BACKGROUND is no longer * used by DiskImages in-kernel mode */ #define IO_NOAUTH 0x8000 /* No authorization checks. */ +#define IO_NODIRECT 0x10000 /* don't use direct synchronous writes if IO_NOCACHE is specified */ /* @@ -159,15 +160,15 @@ struct componentname { uint32_t cn_flags; /* flags (see below) */ #ifdef BSD_KERNEL_PRIVATE vfs_context_t cn_context; - void * pad_obsolete2; + struct nameidata *cn_ndp; /* pointer back to nameidata */ /* XXX use of these defines are deprecated */ #define cn_proc (cn_context->vc_proc + 0) /* non-lvalue */ #define cn_cred (cn_context->vc_ucred + 0) /* non-lvalue */ #else - void * obsolete1; /* use vfs_context_t */ - void * obsolete2; /* use vfs_context_t */ + void * cn_reserved1; /* use vfs_context_t */ + void * cn_reserved2; /* use vfs_context_t */ #endif /* * Shared between lookup and commit routines. @@ -201,8 +202,8 @@ struct componentname { #define ISDOTDOT 0x00002000 /* current component name is .. 
*/ #define MAKEENTRY 0x00004000 /* entry is to be added to name cache */ #define ISLASTCN 0x00008000 /* this is last component of pathname */ -#define ISWHITEOUT 0x00020000 /* found whiteout */ -#define DOWHITEOUT 0x00040000 /* do whiteouts */ +#define ISWHITEOUT 0x00020000 /* OBSOLETE: found whiteout */ +#define DOWHITEOUT 0x00040000 /* OBSOLETE: do whiteouts */ /* The following structure specifies a vnode for creation */ @@ -228,6 +229,234 @@ struct vnode_fsparam { #define VNCREATE_FLAVOR 0 #define VCREATESIZE sizeof(struct vnode_fsparam) + +#ifdef KERNEL_PRIVATE +/* + * Resolver callback SPI for trigger vnodes + * + * Only available from kernels built with CONFIG_TRIGGERS option + */ + +/*! + @enum Pathname Lookup Operations + @abstract Constants defining pathname operations (passed to resolver callbacks) + */ +enum path_operation { + OP_LOOKUP, + OP_MOUNT, + OP_UNMOUNT, + OP_STATFS, + OP_OPEN, + OP_LINK, + OP_UNLINK, + OP_RENAME, + OP_CHDIR, + OP_CHROOT, + OP_MKNOD, + OP_MKFIFO, + OP_SYMLINK, + OP_ACCESS, + OP_PATHCONF, + OP_READLINK, + OP_GETATTR, + OP_SETATTR, + OP_TRUNCATE, + OP_COPYFILE, + OP_MKDIR, + OP_RMDIR, + OP_REVOKE, + OP_EXCHANGEDATA, + OP_SEARCHFS, + OP_FSCTL, + OP_GETXATTR, + OP_SETXATTR, + OP_REMOVEXATTR, + OP_LISTXATTR, + OP_MAXOP /* anything beyond previous entry is invalid */ +}; + +/* + * is operation a traditional trigger (autofs)? + * 1 if trigger, 0 if no trigger + */ +extern int vfs_istraditionaltrigger(enum path_operation op, const struct componentname *cnp); + +/*! 
+ @enum resolver status + @abstract Constants defining resolver status + @constant RESOLVER_RESOLVED the resolver has finished (typically means a successful mount) + @constant RESOLVER_NOCHANGE the resolver status didn't change + @constant RESOLVER_UNRESOLVED the resolver has finished (typically means a successful unmount) + @constant RESOLVER_ERROR the resolver encountered an error (errno passed in aux value) + @constant RESOLVER_STOP a request to destroy trigger XXX do we need this??? + */ +enum resolver_status { + RESOLVER_RESOLVED, + RESOLVER_NOCHANGE, + RESOLVER_UNRESOLVED, + RESOLVER_ERROR, + RESOLVER_STOP +}; + +typedef uint64_t resolver_result_t; + +/* + * Compound resolver result + * + * The trigger vnode callbacks use a compound result value. In addition + * to the resolver status, it contains a sequence number and an auxiliary + * value. + * + * The sequence value is used by VFS to sequence-stamp trigger vnode + * state transitions. It is expected to be incremented each time a + * resolver changes state (ie resolved or unresolved). A result + * containing a stale sequence (older than a trigger vnode's current + * value) will be ignored by VFS. + * + * The auxiliary value is currently only used to deliver the errno + * value for RESOLVER_ERROR status conditions. When a RESOLVER_ERROR + * occurs, VFS will propagate this error back to the syscall that + * encountered the trigger vnode. + */ +extern resolver_result_t vfs_resolver_result(uint32_t seq, enum resolver_status stat, int aux); + +/* + * Extract values from a compound resolver result + */ +extern enum resolver_status vfs_resolver_status(resolver_result_t); +extern uint32_t vfs_resolver_sequence(resolver_result_t); +extern int vfs_resolver_auxiliary(resolver_result_t); + + +/*! + @typedef trigger_vnode_resolve_callback_t + @abstract function prototype for a trigger vnode resolve callback + @discussion This function is associated with a trigger vnode during a vnode create. 
It is + typically called when a lookup operation occurs for a trigger vnode + @param vp The trigger vnode which needs resolving + @param cnp Various data about lookup, e.g. filename and state flags + @param pop The pathname operation that initiated the lookup (see enum path_operation). + @param flags + @param data Arbitrary data supplied by vnode trigger creator + @param ctx Context for authentication. + @return RESOLVER_RESOLVED, RESOLVER_NOCHANGE, RESOLVER_UNRESOLVED or RESOLVER_ERROR +*/ +typedef resolver_result_t (* trigger_vnode_resolve_callback_t)( + vnode_t vp, + const struct componentname * cnp, + enum path_operation pop, + int flags, + void * data, + vfs_context_t ctx); + +/*! + @typedef trigger_vnode_unresolve_callback_t + @abstract function prototype for a trigger vnode unresolve callback + @discussion This function is associated with a trigger vnode during a vnode create. It is + called to unresolve a trigger vnode (typically this means unmount). + @param vp The trigger vnode which needs unresolving + @param flags Unmount flags + @param data Arbitrary data supplied by vnode trigger creator + @param ctx Context for authentication. + @return RESOLVER_NOCHANGE, RESOLVER_UNRESOLVED or RESOLVER_ERROR +*/ +typedef resolver_result_t (* trigger_vnode_unresolve_callback_t)( + vnode_t vp, + int flags, + void * data, + vfs_context_t ctx); + +/*! + @typedef trigger_vnode_rearm_callback_t + @abstract function prototype for a trigger vnode rearm callback + @discussion This function is associated with a trigger vnode during a vnode create. It is + called to verify a rearm from VFS (i.e. should VFS rearm the trigger?). + @param vp The trigger vnode which needs rearming + @param flags + @param data Arbitrary data supplied by vnode trigger creator + @param ctx Context for authentication. + @return RESOLVER_NOCHANGE or RESOLVER_ERROR +*/ +typedef resolver_result_t (* trigger_vnode_rearm_callback_t)( + vnode_t vp, + int flags, + void * data, + vfs_context_t ctx); + +/*! 
+ @typedef trigger_vnode_reclaim_callback_t + @abstract function prototype for a trigger vnode reclaim callback + @discussion This function is associated with a trigger vnode during a vnode create. It is + called to deallocate private callback argument data + @param vp The trigger vnode associated with the data + @param data The arbitrary data supplied by vnode trigger creator +*/ +typedef void (* trigger_vnode_reclaim_callback_t)( + vnode_t vp, + void * data); + +/*! + @function vnode_trigger_update + @abstract Update a trigger vnode's state. + @discussion This allows a resolver to notify VFS of a state change in a trigger vnode. + @param vp The trigger vnode whose information to update. + @param result A compound resolver result value + @return EINVAL if result value is invalid or vp isn't a trigger vnode + */ +extern int vnode_trigger_update(vnode_t vp, resolver_result_t result); + +struct vnode_trigger_info { + trigger_vnode_resolve_callback_t vti_resolve_func; + trigger_vnode_unresolve_callback_t vti_unresolve_func; + trigger_vnode_rearm_callback_t vti_rearm_func; + trigger_vnode_reclaim_callback_t vti_reclaim_func; + void * vti_data; /* auxiliary data (optional) */ + uint32_t vti_flags; /* optional flags (see below) */ +}; + +/* + * SPI for creating a trigger vnode + * + * Uses the VNCREATE_TRIGGER flavor with existing vnode_create() KPI + * + * Only one resolver per vnode. 
+ * + * ERRORS (in addition to vnode_create errors): + * EINVAL (invalid resolver info, like invalid flags) + * ENOTDIR (only directories can have a resolver) + * EPERM (vnode cannot be a trigger - eg root dir of a file system) + * ENOMEM + */ +struct vnode_trigger_param { + struct vnode_fsparam vnt_params; /* same as for VNCREATE_FLAVOR */ + trigger_vnode_resolve_callback_t vnt_resolve_func; + trigger_vnode_unresolve_callback_t vnt_unresolve_func; + trigger_vnode_rearm_callback_t vnt_rearm_func; + trigger_vnode_reclaim_callback_t vnt_reclaim_func; + void * vnt_data; /* auxiliary data (optional) */ + uint32_t vnt_flags; /* optional flags (see below) */ +}; + +#define VNCREATE_TRIGGER (('T' << 8) + ('V')) +#define VNCREATE_TRIGGER_SIZE sizeof(struct vnode_trigger_param) + +/* + * vnode trigger flags (vnt_flags) + * + * VNT_AUTO_REARM: + * On unmounts of a trigger mount, automatically re-arm the trigger. + * + * VNT_NO_DIRECT_MOUNT: + * A trigger vnode instance that doesn't directly trigger a mount, + * instead it triggers the mounting of sub-trigger nodes. + */ +#define VNT_AUTO_REARM (1 << 0) +#define VNT_NO_DIRECT_MOUNT (1 << 1) +#define VNT_VALID_MASK (VNT_AUTO_REARM | VNT_NO_DIRECT_MOUNT) + +#endif /* KERNEL_PRIVATE */ + + /* * Vnode attributes, new-style. * @@ -287,6 +516,7 @@ struct vnode_fsparam { #define VNODE_ATTR_va_guuid (1LL<<27) /* 08000000 */ #define VNODE_ATTR_va_nchildren (1LL<<28) /* 10000000 */ #define VNODE_ATTR_va_dirlinkcount (1LL<<29) /* 20000000 */ +#define VNODE_ATTR_va_addedtime (1LL<<30) /* 40000000 */ #define VNODE_ATTR_BIT(n) (VNODE_ATTR_ ## n) /* @@ -307,7 +537,8 @@ struct vnode_fsparam { VNODE_ATTR_BIT(va_name) | \ VNODE_ATTR_BIT(va_type) | \ VNODE_ATTR_BIT(va_nchildren) | \ - VNODE_ATTR_BIT(va_dirlinkcount)) + VNODE_ATTR_BIT(va_dirlinkcount)| \ + VNODE_ATTR_BIT(va_addedtime)) /* * Attributes that can be applied to a new file object. */ @@ -381,14 +612,23 @@ struct vnode_attr { uint64_t va_dirlinkcount; /* Real references to dir (i.e. 
excluding "." and ".." refs) */ /* add new fields here only */ +#ifdef BSD_KERNEL_PRIVATE + struct kauth_acl *va_base_acl; +#else + void * va_reserved1; +#endif /* BSD_KERNEL_PRIVATE */ + struct timespec va_addedtime; /* timestamp when item was added to parent directory */ + }; /* * Flags for va_vaflags. */ -#define VA_UTIMES_NULL 0x010000 /* utimes argument was NULL */ -#define VA_EXCLUSIVE 0x020000 /* exclusive create request */ +#define VA_UTIMES_NULL 0x010000 /* utimes argument was NULL */ +#define VA_EXCLUSIVE 0x020000 /* exclusive create request */ +#define VA_NOINHERIT 0x040000 /* Don't inherit ACLs from parent */ +#define VA_NOAUTH 0x080000 /* * Modes. Some values same as Ixxx entries from inode.h for now. @@ -761,6 +1001,14 @@ int vnode_isnocache(vnode_t); */ int vnode_israge(vnode_t); +/*! + @function vnode_needssnapshots + @abstract Check if a vnode needs snapshots events (regardless of its ctime status) + @param vp The vnode to test. + @return Nonzero if vnode needs snapshot events, 0 otherwise + */ +int vnode_needssnapshots(vnode_t); + /*! @function vnode_setnocache @abstract Set a vnode to not have its data cached in memory (i.e. we write-through to disk and always read from disk). @@ -992,6 +1240,20 @@ int vfs_context_rele(vfs_context_t); vfs_context_t vfs_context_current(void); #ifdef KERNEL_PRIVATE int vfs_context_bind(vfs_context_t); + +/*! + @function vfs_ctx_skipatime + @abstract Check to see if this context should skip updating a vnode's access times. + @discussion This is currently tied to the vnode rapid aging process. If the process is marked for rapid aging, + then the kernel should not update vnodes it touches for access time purposes. This will check to see if the + specified process and/or thread is marked for rapid aging when it manipulates vnodes. + @param ctx The context being investigated. + @return 1 if we should skip access time updates. + @return 0 if we should NOT skip access time updates. 
+ */ + +int vfs_ctx_skipatime(vfs_context_t ctx); + #endif /*! @@ -1048,6 +1310,10 @@ int vnode_get(vnode_t); */ int vnode_getwithvid(vnode_t, uint32_t); +#ifdef BSD_KERNEL_PRIVATE +int vnode_getwithvid_drainok(vnode_t, uint32_t); +#endif /* BSD_KERNEL_PRIVATE */ + /*! @function vnode_getwithref @abstract Increase the iocount on a vnode on which a usecount (persistent reference) is held. @@ -1172,6 +1438,17 @@ int vnode_notify(vnode_t, uint32_t, struct vnode_attr*); */ int vnode_ismonitored(vnode_t); + +/*! + @function vnode_isdyldsharedcache + @abstract Check whether a file is a dyld shared cache file. + @param vp Vnode to examine. + @discussion Will not reenter the filesystem. + @return nonzero if a dyld shared cache file, zero otherwise. + */ +int vnode_isdyldsharedcache(vnode_t); + + /*! @function vfs_get_notify_attributes @abstract Determine what attributes are required to send up a notification with vnode_notify(). @@ -1298,7 +1575,7 @@ int vn_getpath(struct vnode *vp, char *pathbuf, int *len); */ #define VNODE_LOOKUP_NOFOLLOW 0x01 #define VNODE_LOOKUP_NOCROSSMOUNT 0x02 -#define VNODE_LOOKUP_DOWHITEOUT 0x04 +#define VNODE_LOOKUP_DOWHITEOUT 0x04 /* OBSOLETE */ /*! @function vnode_lookup @abstract Convert a path into a vnode. @@ -1368,6 +1645,7 @@ int vnode_iterate(struct mount *, int, int (*)(struct vnode *, void *), void *); #define VNODE_ITERATE_INACTIVE 0x200 #ifdef BSD_KERNEL_PRIVATE #define VNODE_ALWAYS 0x400 +#define VNODE_DRAINO 0x800 #endif /* BSD_KERNEL_PRIVATE */ /* @@ -1545,6 +1823,20 @@ void vnode_putname(const char *name); */ vnode_t vnode_getparent(vnode_t vp); +#ifdef KERNEL_PRIVATE +/*! + @function vnode_lookup_continue_needed + @abstract Determine whether vnode needs additional processing in VFS before being opened. + @discussion If result is zero, filesystem can open this vnode. If result is nonzero, + additional processing is needed in VFS (e.g. symlink, mountpoint). Nonzero results should + be passed up to VFS. 
+ @param vp Vnode to consider opening (found by filesystem). + @param cnp Componentname as passed to filesystem from VFS. + @result 0 to indicate that a vnode can be opened, or an error that should be passed up to VFS. + */ +int vnode_lookup_continue_needed(vnode_t vp, struct componentname *cnp); +#endif /* KERNEL_PRIVATE */ + #ifdef BSD_KERNEL_PRIVATE /* Not in export list so can be private */ struct stat; diff --git a/bsd/sys/vnode_if.h b/bsd/sys/vnode_if.h index 66812b08d..33ae10047 100644 --- a/bsd/sys/vnode_if.h +++ b/bsd/sys/vnode_if.h @@ -92,6 +92,13 @@ extern struct vnodeop_desc vnop_default_desc; extern struct vnodeop_desc vnop_lookup_desc; +#ifdef KERNEL_PRIVATE +extern struct vnodeop_desc vnop_compound_open_desc; +extern struct vnodeop_desc vnop_compound_remove_desc; +extern struct vnodeop_desc vnop_compound_rename_desc; +extern struct vnodeop_desc vnop_compound_mkdir_desc; +extern struct vnodeop_desc vnop_compound_rmdir_desc; +#endif /* KERNEL_PRIVATE */ extern struct vnodeop_desc vnop_create_desc; extern struct vnodeop_desc vnop_whiteout_desc; extern struct vnodeop_desc vnop_mknod_desc; @@ -257,6 +264,44 @@ struct vnop_open_args { vfs_context_t a_context; }; +#ifdef KERNEL_PRIVATE +struct vnop_compound_open_args { + struct vnodeop_desc *a_desc; + + vnode_t a_dvp; /* Directory in which to open/create */ + vnode_t *a_vpp; /* Resulting vnode */ + int a_fmode; /* Open mode */ + struct componentname *a_cnp; /* Path to look up */ + struct vnode_attr *a_vap; /* Attributes with which to create, if appropriate */ + uint32_t a_flags; /* VNOP-control flags */ + uint32_t *a_status; /* Information about results */ + + vfs_context_t a_context; /* Authorization context */ + + int (*a_open_create_authorizer)( /* Authorizer for create case */ + vnode_t dvp, /* Directory in which to create */ + struct componentname *cnp, /* As passed to VNOP */ + struct vnode_attr *vap, /* As passed to VNOP */ + vfs_context_t ctx, /* Context */ + void *reserved); /* Who knows */ + + int 
(*a_open_existing_authorizer)( /* Authorizer for preexisting case */ + vnode_t vp, /* vp to open */ + struct componentname *cnp, /* Lookup state */ + int fmode, /* As passed to VNOP */ + vfs_context_t ctx, /* Context */ + void *reserved); /* Who knows */ + + void *a_reserved; +}; + +/* Control flags */ +#define VNOP_COMPOUND_OPEN_DO_CREATE 0x00000001 + +/* Results */ +#define COMPOUND_OPEN_STATUS_DID_CREATE 0x00000001 +#endif /* KERNEL_PRIVATE */ + /*! @function VNOP_OPEN @abstract Call down to a filesystem to open a file. @@ -272,6 +317,11 @@ struct vnop_open_args { extern errno_t VNOP_OPEN(vnode_t, int, vfs_context_t); #endif /* XNU_KERNEL_PRIVATE */ +#ifdef BSD_KERNEL_PRIVATE +struct nameidata; +extern int VNOP_COMPOUND_OPEN(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, int32_t flags, int32_t fmode, uint32_t *status, struct vnode_attr *vap, vfs_context_t ctx); +#endif + struct vnop_close_args { struct vnodeop_desc *a_desc; vnode_t a_vp; @@ -381,8 +431,7 @@ struct vnop_read_args { @discussion VNOP_READ() is where the hard work of of the read() system call happens. The filesystem may use the buffer cache, the cluster layer, or an alternative method to get its data; uio routines will be used to see that data is copied to the correct virtual address in the correct address space and will update its uio argument - to indicate how much data has been moved. Filesystems will not receive a read request on a file without having - first received a VNOP_OPEN(). + to indicate how much data has been moved. @param vp The vnode to read from. @param uio Description of request, including file offset, amount of data requested, destination address for data, and whether that destination is in kernel or user space. @@ -406,8 +455,7 @@ struct vnop_write_args { @discussion VNOP_WRITE() is to write() as VNOP_READ() is to read(). 
The filesystem may use the buffer cache, the cluster layer, or an alternative method to write its data; uio routines will be used to see that data is copied to the correct virtual address in the correct address space and will update its uio argument - to indicate how much data has been moved. Filesystems will not receive a write request on a file without having - first received a VNOP_OPEN(). + to indicate how much data has been moved. @param vp The vnode to write to. @param uio Description of request, including file offset, amount of data to write, source address for data, and whether that destination is in kernel or user space. @@ -600,6 +648,28 @@ struct vnop_remove_args { extern errno_t VNOP_REMOVE(vnode_t, vnode_t, struct componentname *, int, vfs_context_t); #endif /* XNU_KERNEL_PRIVATE */ +#ifdef KERNEL_PRIVATE +struct vnop_compound_remove_args { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; /* Directory in which to lookup and remove */ + vnode_t *a_vpp; /* File to remove; may or may not point to NULL pointer */ + struct componentname *a_cnp; /* Name of file to remove */ + struct vnode_attr *a_vap; /* Destination for file attributes on successful delete */ + uint32_t a_flags; /* Control flags (unused) */ + vfs_context_t a_context; /* Authorization context */ + int (*a_remove_authorizer)( /* Authorizer callback */ + vnode_t dvp, /* Directory in which to delete */ + vnode_t vp, /* File to delete */ + struct componentname *cnp, /* As passed to VNOP */ + vfs_context_t ctx, /* As passed to VNOP */ + void *reserved); /* Always NULL */ + void *a_reserved; /* Unused */ +}; +#endif /* KERNEL_PRIVATE */ + +#ifdef BSD_KERNEL_PRIVATE +extern errno_t VNOP_COMPOUND_REMOVE(vnode_t, vnode_t*, struct nameidata *, int32_t flags, struct vnode_attr *vap, vfs_context_t); +#endif struct vnop_link_args { struct vnodeop_desc *a_desc; vnode_t a_vp; @@ -650,6 +720,43 @@ struct vnop_rename_args { extern errno_t VNOP_RENAME(vnode_t, vnode_t, struct componentname *, vnode_t, vnode_t, 
struct componentname *, vfs_context_t); #endif /* XNU_KERNEL_PRIVATE */ +#ifdef KERNEL_PRIVATE +struct vnop_compound_rename_args { + struct vnodeop_desc *a_desc; + + vnode_t a_fdvp; /* Directory from which to rename */ + vnode_t *a_fvpp; /* Vnode to rename (can point to a NULL pointer) */ + struct componentname *a_fcnp; /* Source name */ + struct vnode_attr *a_fvap; + + vnode_t a_tdvp; /* Directory to which to rename */ + vnode_t *a_tvpp; /* Vnode to rename over (can point to a NULL pointer) */ + struct componentname *a_tcnp; /* Destination name */ + struct vnode_attr *a_tvap; + + uint32_t a_flags; /* Control flags: currently unused */ + vfs_context_t a_context; /* Authorization context */ + int (*a_rename_authorizer)( /* Authorization callback */ + vnode_t fdvp, /* As passed to VNOP */ + vnode_t fvp, /* Vnode to rename */ + struct componentname *fcnp, /* As passed to VNOP */ + vnode_t tdvp, /* As passed to VNOP */ + vnode_t tvp, /* Vnode to rename over (can be NULL) */ + struct componentname *tcnp, /* As passed to VNOP */ + vfs_context_t ctx, /* As passed to VNOP */ + void *reserved); /* Always NULL */ + void *a_reserved; /* Currently unused */ +}; +#endif /* KERNEL_PRIVATE */ + +#ifdef XNU_KERNEL_PRIVATE +errno_t +VNOP_COMPOUND_RENAME( + struct vnode *fdvp, struct vnode **fvpp, struct componentname *fcnp, struct vnode_attr *fvap, + struct vnode *tdvp, struct vnode **tvpp, struct componentname *tcnp, struct vnode_attr *tvap, + uint32_t flags,vfs_context_t ctx); +#endif /* XNU_KERNEL_PRIVATE */ + struct vnop_mkdir_args { struct vnodeop_desc *a_desc; vnode_t a_dvp; @@ -674,6 +781,27 @@ struct vnop_mkdir_args { extern errno_t VNOP_MKDIR(vnode_t, vnode_t *, struct componentname *, struct vnode_attr *, vfs_context_t); #endif /* XNU_KERNEL_PRIVATE */ + +#ifdef KERNEL_PRIVATE +struct vnop_compound_mkdir_args { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; /* Directory in which to create */ + vnode_t *a_vpp; /* Destination for found or created vnode */ + struct 
componentname *a_cnp; /* Name of directory to create */ + struct vnode_attr *a_vap; /* Creation attributes */ + uint32_t a_flags; /* Control flags (unused) */ + vfs_context_t a_context; /* Authorization context */ +#if 0 + int (*a_mkdir_authorizer)(vnode_t dvp, struct componentname *cnp, struct vnode_attr *vap, vfs_context_t ctx, void *reserved); +#endif /* 0 */ + void *a_reserved; /* Unused */ +}; +#endif /* KERNEL_PRIVATE */ + +#ifdef XNU_KERNEL_PRIVATE +extern errno_t VNOP_COMPOUND_MKDIR(vnode_t, vnode_t *, struct nameidata *, struct vnode_attr *, vfs_context_t); +#endif /* XNU_KERNEL_PRIVATE */ + struct vnop_rmdir_args { struct vnodeop_desc *a_desc; vnode_t a_dvp; @@ -695,6 +823,30 @@ struct vnop_rmdir_args { extern errno_t VNOP_RMDIR(vnode_t, vnode_t, struct componentname *, vfs_context_t); #endif /* XNU_KERNEL_PRIVATE */ +#ifdef KERNEL_PRIVATE +struct vnop_compound_rmdir_args { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; /* Directory in which to look up and delete */ + vnode_t *a_vpp; /* Destination for found vnode */ + struct componentname *a_cnp; /* Name to delete */ + struct vnode_attr *a_vap; /* Location in which to store attributes if delete succeeds (can be NULL) */ + uint32_t a_flags; /* Control flags (currently unused) */ + vfs_context_t a_context; /* Context for authorization */ + int (*a_rmdir_authorizer)( /* Authorization callback */ + vnode_t dvp, /* As passed to VNOP */ + vnode_t vp, /* Directory to delete */ + struct componentname *cnp, /* As passed to VNOP */ + vfs_context_t ctx, /* As passed to VNOP */ + void *reserved); /* Always NULL */ + void *a_reserved; /* Unused */ +}; +#endif /* KERNEL_PRIVATE */ + +#ifdef XNU_KERNEL_PRIVATE +extern errno_t VNOP_COMPOUND_RMDIR(vnode_t, vnode_t*, struct nameidata *, struct vnode_attr *vap, vfs_context_t); +#endif /* XNU_KERNEL_PRIVATE */ + + struct vnop_symlink_args { struct vnodeop_desc *a_desc; vnode_t a_dvp; @@ -723,7 +875,6 @@ struct vnop_symlink_args { extern errno_t VNOP_SYMLINK(vnode_t, 
vnode_t *, struct componentname *, struct vnode_attr *, char *, vfs_context_t); #endif /* XNU_KERNEL_PRIVATE */ - /* * * When VNOP_READDIR is called from the NFS Server, the nfs_data @@ -941,7 +1092,7 @@ struct vnop_allocate_args { a file. It can be used to either shrink or grow a file. If the file shrinks, its ubc size will be modified accordingly, but if it grows, then the ubc size is unchanged; space is set aside without being actively used by the file. VNOP_ALLOCATE() is currently only - called as part of the F_PREALLOCATE fcntl, and is supported only by AFP and HFS. + called as part of the F_PREALLOCATE fcntl. @param vp The vnode for which to preallocate space. @param length Desired preallocated file length. @param flags @@ -1009,12 +1160,20 @@ struct vnop_pageout_args { @abstract Write data from a mapped file back to disk. @discussion VNOP_PAGEOUT() is called when data from a mapped file needs to be flushed to disk, either because of an msync() call or due to memory pressure. Filesystems are for the most part expected to - just call cluster_pageout(). + just call cluster_pageout(). However, if they opt into the VFC_VFSVNOP_PAGEOUTV2 flag, then + they will be responsible for creating their own UPLs. @param vp The vnode for which to page out data. - @param pl UPL describing pages needing to be paged out. - @param pl_offset Offset in UPL from which to start paging out data. - @param f_offset Offset in file of data needing to be paged out. - @param size Amount of data to page out (in bytes). + @param pl UPL describing pages needed to be paged out. If UPL is NULL, then it means the filesystem + has opted into VFC_VFSVNOP_PAGEOUTV2 semantics, which means that it will create and operate on its own UPLs + as opposed to relying on the one passed down into the filesystem. This means that the filesystem must be + responsible for N cluster_pageout calls for N dirty ranges in the UPL. + @param pl_offset Offset in UPL from which to start paging out data. 
Under the new VFC_VFSVNOP_PAGEOUTV2 + semantics, this is the offset in the range specified that must be paged out if the associated page is dirty. + @param f_offset Offset in file of data needing to be paged out. Under the new VFC_VFSVNOP_PAGEOUTV2 + semantics, this represents the offset in the file where we should start looking for dirty pages. + @param size Amount of data to page out (in bytes). Under VFC_VFSVNOP_PAGEOUTV2, this represents + the size of the range to be considered. The filesystem is free to extend or shrink the specified range + to better fit its blocking model as long as the page at 'pl_offset' is included. @param flags UPL-style flags: UPL_IOSYNC, UPL_NOCOMMIT, UPL_NORDAHEAD, UPL_VNODE_PAGER, UPL_MSYNC. Filesystems should generally leave it to the cluster layer to handle these flags. See the memory_object_types.h header in the kernel framework if interested. @@ -1042,6 +1201,36 @@ struct vnop_searchfs_args { vfs_context_t a_context; }; +/* + @function VNOP_SEARCHFS + @abstract Search a filesystem quickly for files or directories that match the passed-in search criteria. + @discussion VNOP_SEARCHFS is a getattrlist-based system call which is implemented almost entirely inside + supported filesystems. Callers provide a set of criteria to match against, and the filesystem is responsible + for finding all files or directories that match the criteria. Once these files or directories are found, + the user-requested attributes of these files are provided as output. The set of searchable attributes is a + subset of the getattrlist attributes. For example, ATTR_CMN_UUID is not a valid searchable attribute as of + 10.6. A common usage scenario could be to request all files whose mod date is greater than time X, less than + time Y, and provide the inode ID and filename of the matching objects as output. + @param vp The vnode representing the mountpoint of the filesystem to be searched. 
+ @param a_searchparams1 If one-argument search criteria is requested, the search criteria would go here. However, + some search criteria, like ATTR_CMN_MODTIME, can be bounded. The user could request files modified between time X + and time Y. In this case, the lower bound goes in a_searchparams1. + @param a_searchparams2 If two-argument search criteria is requested, the upper bound goes in here. + @param a_searchattrs Contains the getattrlist-style attribute bits which are requested by the current search. + @param a_maxmatches The maximum number of matches to return in a single system call. + @param a_timelimit The suggested maximum amount of time we can spend in the kernel to service this system call. + Filesystems should use this as a guide only, and set their own internal maximum time to avoid denial of service. + @param a_returnattrs The getattrlist-style attributes to return for items in the filesystem that match the search + criteria above. + @param a_scriptcode Currently ignored. + @param a_uio The uio in which to write out the search matches. + @param a_searchstate Sometimes searches cannot be completed in a single system call. In this case, we provide + an identifier back to the user which indicates where to resume a previously-started search. This is an opaque structure + used by the filesystem to identify where to resume said search. + @param a_context The context in which to perform the filesystem search. + @return 0 on success, EAGAIN for searches which could not be completed in 1 call, and other ERRNOS as needed. 
+ */ + #ifdef XNU_KERNEL_PRIVATE extern errno_t VNOP_SEARCHFS(vnode_t, void *, void *, struct attrlist *, uint32_t, struct timeval *, struct attrlist *, uint32_t *, uint32_t, uint32_t, struct uio *, struct searchstate *, vfs_context_t); #endif /* XNU_KERNEL_PRIVATE */ diff --git a/bsd/sys/vnode_internal.h b/bsd/sys/vnode_internal.h index dbff3a50d..7d17be99e 100644 --- a/bsd/sys/vnode_internal.h +++ b/bsd/sys/vnode_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -100,6 +100,29 @@ struct unsafe_fsnode { void * fsnodeowner; }; +#if CONFIG_TRIGGERS +/* + * VFS Internal (private) trigger vnode resolver info. + */ +struct vnode_resolve { + lck_mtx_t vr_lock; /* protects vnode_resolve_t fields */ + trigger_vnode_resolve_callback_t vr_resolve_func; + trigger_vnode_unresolve_callback_t vr_unresolve_func; + trigger_vnode_rearm_callback_t vr_rearm_func; + trigger_vnode_reclaim_callback_t vr_reclaim_func; + void * vr_data; /* private data for resolver */ + uint32_t vr_flags; + uint32_t vr_lastseq; +}; +typedef struct vnode_resolve *vnode_resolve_t; + +/* private vr_flags */ +#define VNT_RESOLVED (1UL << 31) +#define VNT_VFS_UNMOUNTED (1UL << 30) +#define VNT_EXTERNAL (1UL << 29) + +#endif /* CONFIG_TRIGGERS */ + /* * Reading or writing any of these items requires holding the appropriate lock. 
* v_freelist is locked by the global vnode_list_lock @@ -166,6 +189,9 @@ struct vnode { #if CONFIG_MACF struct label *v_label; /* MAC security label */ #endif +#if CONFIG_TRIGGERS + vnode_resolve_t v_resolve; /* trigger vnode resolve info (VDIR only) */ +#endif /* CONFIG_TRIGGERS */ }; #define v_mountedhere v_un.vu_mountedhere @@ -199,7 +225,6 @@ struct vnode { #define VL_TERMWANT 0x0008 /* there's a waiter for recycle finish (vnode_getiocount)*/ #define VL_DEAD 0x0010 /* vnode is dead, cleaned of filesystem-specific info */ #define VL_MARKTERM 0x0020 /* vnode should be recycled when no longer referenced */ -#define VL_MOUNTDEAD 0x0040 /* v_moutnedhere is dead */ #define VL_NEEDINACTIVE 0x0080 /* delay VNOP_INACTIVE until iocount goes to 0 */ #define VL_LABEL 0x0100 /* vnode is marked for labeling */ @@ -224,7 +249,7 @@ struct vnode { #define VDEVFLUSH 0x000040 /* device vnode after vflush */ #define VMOUNT 0x000080 /* mount operation in progress */ #define VBWAIT 0x000100 /* waiting for output to complete */ - /* Free slot here after removing VALIASED for radar #5971707 */ +#define VSHARED_DYLD 0x000200 /* vnode is a dyld shared cache file */ #define VNOCACHE_DATA 0x000400 /* don't keep data cached once it's been consumed */ #define VSTANDARD 0x000800 /* vnode obtained from common pool */ #define VAGE 0x001000 /* Insert vnode at head of free list */ @@ -244,6 +269,7 @@ struct vnode { #define VISNAMEDSTREAM 0x400000 /* vnode is a named stream (eg HFS resource fork) */ #endif #define VOPENEVT 0x800000 /* if process is P_CHECKOPENEVT, then or in the O_EVTONLY flag on open */ +#define VNEEDSSNAPSHOT 0x1000000 /* * Global vnode data. @@ -251,7 +277,8 @@ struct vnode { extern struct vnode *rootvnode; /* root (i.e. 
"/") vnode */ #ifdef CONFIG_IMGSRC_ACCESS -extern struct vnode *imgsrc_rootvnode; +#define MAX_IMAGEBOOT_NESTING 2 +extern struct vnode *imgsrc_rootvnodes[]; #endif /* CONFIG_IMGSRC_ACCESS */ @@ -367,6 +394,10 @@ int vn_open(struct nameidata *ndp, int fmode, int cmode); int vn_open_modflags(struct nameidata *ndp, int *fmode, int cmode); int vn_open_auth(struct nameidata *ndp, int *fmode, struct vnode_attr *); int vn_close(vnode_t, int flags, vfs_context_t ctx); +errno_t vn_remove(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, int32_t flags, struct vnode_attr *vap, vfs_context_t ctx); +errno_t vn_rename(struct vnode *fdvp, struct vnode **fvpp, struct componentname *fcnp, struct vnode_attr *fvap, + struct vnode *tdvp, struct vnode **tvpp, struct componentname *tcnp, struct vnode_attr *tvap, + uint32_t flags, vfs_context_t ctx); void lock_vnode_and_post(vnode_t, int); @@ -377,14 +408,30 @@ void lock_vnode_and_post(vnode_t, int); } \ } while (0) - +/* Authorization subroutines */ +int vn_authorize_open_existing(vnode_t vp, struct componentname *cnp, int fmode, vfs_context_t ctx, void *reserved); +int vn_authorize_create(vnode_t, struct componentname *, struct vnode_attr *, vfs_context_t, void*); +int vn_attribute_prepare(vnode_t dvp, struct vnode_attr *vap, uint32_t *defaulted_fieldsp, vfs_context_t ctx); +void vn_attribute_cleanup(struct vnode_attr *vap, uint32_t defaulted_fields); +int vn_authorize_unlink(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, void *reserved); +int vn_authorize_rename(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, + struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, + vfs_context_t ctx, void *reserved); +int vn_authorize_rmdir(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, void *reserved); + +typedef int (*vn_create_authorizer_t)(vnode_t, struct componentname *, struct vnode_attr *, vfs_context_t, void*); +int vn_authorize_mkdir(vnode_t, struct 
componentname *, struct vnode_attr *, vfs_context_t, void*); +int vn_authorize_null(vnode_t, struct componentname *, struct vnode_attr *, vfs_context_t, void*); +/* End of authorization subroutines */ #define VN_CREATE_NOAUTH (1<<0) #define VN_CREATE_NOINHERIT (1<<1) #define VN_CREATE_UNION (1<<2) #define VN_CREATE_NOLABEL (1<<3) -errno_t vn_create(vnode_t, vnode_t *, struct componentname *, struct vnode_attr *, int flags, vfs_context_t); - +#define VN_CREATE_DOOPEN (1<<4) /* Open file if a batched operation is available */ +errno_t vn_create(vnode_t, vnode_t *, struct nameidata *, struct vnode_attr *, uint32_t, int, uint32_t*, vfs_context_t); +int vn_mkdir(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *vap, vfs_context_t ctx); +int vn_rmdir(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *vap, vfs_context_t ctx); int vn_getxattr(vnode_t, const char *, uio_t, size_t *, int, vfs_context_t); int vn_setxattr(vnode_t, const char *, uio_t, int, vfs_context_t); @@ -415,6 +462,7 @@ void cache_enter_with_gen(vnode_t dvp, vnode_t vp, struct componentname *cnp, in const char *cache_enter_create(vnode_t dvp, vnode_t vp, struct componentname *cnp); int vn_pathconf(vnode_t, int, int32_t *, vfs_context_t); +extern int nc_disabled; #define vnode_lock_convert(v) lck_mtx_convert_spin(&(v)->v_lock) @@ -423,12 +471,16 @@ void vnode_lock_spin(vnode_t); void vnode_list_lock(void); void vnode_list_unlock(void); -int vnode_ref_ext(vnode_t, int); + +#define VNODE_REF_FORCE 0x1 +int vnode_ref_ext(vnode_t, int, int); + void vnode_rele_ext(vnode_t, int, int); void vnode_rele_internal(vnode_t, int, int, int); #ifdef BSD_KERNEL_PRIVATE int vnode_getalways(vnode_t); int vget_internal(vnode_t, int, int); +errno_t vnode_getiocount(vnode_t, unsigned int, int); #endif /* BSD_KERNEL_PRIVATE */ int vnode_get_locked(vnode_t); int vnode_put_locked(vnode_t); @@ -448,6 +500,24 @@ errno_t vnode_setsize(vnode_t, off_t, int ioflag, vfs_context_t); int 
vnode_setattr_fallback(vnode_t vp, struct vnode_attr *vap, vfs_context_t ctx); int vnode_isspec(vnode_t vp); + +#ifdef BSD_KERNEL_PRIVATE + +typedef uint32_t compound_vnop_id_t; +#define COMPOUND_VNOP_OPEN 0x01 +#define COMPOUND_VNOP_MKDIR 0x02 +#define COMPOUND_VNOP_RENAME 0x04 +#define COMPOUND_VNOP_REMOVE 0x08 +#define COMPOUND_VNOP_RMDIR 0x10 + +int vnode_compound_rename_available(vnode_t vp); +int vnode_compound_rmdir_available(vnode_t vp); +int vnode_compound_mkdir_available(vnode_t vp); +int vnode_compound_remove_available(vnode_t vp); +int vnode_compound_open_available(vnode_t vp); +int vnode_compound_op_available(vnode_t, compound_vnop_id_t); +#endif /* BSD_KERNEL_PRIVATE */ + void vn_setunionwait(vnode_t); void vn_checkunionwait(vnode_t); void vn_clearunionwait(vnode_t, int); @@ -471,9 +541,18 @@ int vfs_sysctl(int *name, uint32_t namelen, user_addr_t oldp, size_t *oldlenp, int sysctl_vnode(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); #ifdef BSD_KERNEL_PRIVATE -void vnode_knoteupdate(struct knote *kn); void vnode_setneedinactive(vnode_t); int vnode_hasnamedstreams(vnode_t); /* Does this vnode have associated named streams? */ -#endif + +void nspace_proc_exit(struct proc *p); + +#if CONFIG_TRIGGERS +/* VFS Internal Vnode Trigger Interfaces (Private) */ +int vnode_trigger_resolve(vnode_t, struct nameidata *, vfs_context_t); +void vnode_trigger_rearm(vnode_t, vfs_context_t); +void vfs_nested_trigger_unmounts(mount_t, int, vfs_context_t); +#endif /* CONFIG_TRIGGERS */ + +#endif /* BSD_KERNEL_PRIVATE */ #endif /* !_SYS_VNODE_INTERNAL_H_ */ diff --git a/bsd/sys/xattr.h b/bsd/sys/xattr.h index c9ecf4275..bd91c3c31 100644 --- a/bsd/sys/xattr.h +++ b/bsd/sys/xattr.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2004-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -59,7 +59,19 @@ __BEGIN_DECLS int xattr_protected(const char *); int xattr_validatename(const char *); -#define XATTR_MAXSIZE (64 * 1024 * 1024) +/* Maximum extended attribute size supported by VFS */ +#define XATTR_MAXSIZE (64 * 1024 * 1024) + +#ifdef PRIVATE +/* Maximum extended attribute size in an Apple Double file */ +#define AD_XATTR_MAXSIZE (128 * 1024) + +/* Number of bits used to represent the maximum size of + * extended attribute stored in an Apple Double file. + */ +#define AD_XATTR_SIZE_BITS 18 +#endif /* PRIVATE */ + __END_DECLS #endif /* KERNEL */ diff --git a/bsd/uuid/Makefile b/bsd/uuid/Makefile index 8d5af9310..1f7f17bfc 100644 --- a/bsd/uuid/Makefile +++ b/bsd/uuid/Makefile @@ -9,14 +9,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ # In both the framework PrivateHeader area and /usr/include/uuid diff --git a/bsd/vfs/Makefile b/bsd/vfs/Makefile index 3d578ffd7..b9ddbedcc 100644 --- a/bsd/vfs/Makefile +++ b/bsd/vfs/Makefile @@ -9,14 +9,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ DATAFILES = \ diff --git a/bsd/vfs/kpi_vfs.c b/bsd/vfs/kpi_vfs.c index 50338b255..a18760397 100644 --- a/bsd/vfs/kpi_vfs.c +++ b/bsd/vfs/kpi_vfs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -605,6 +605,13 @@ VFS_VPTOFH(struct vnode * vp, int *fhlenp, unsigned char * fhp, vfs_context_t ct } +/* returns the cached throttle mask for the mount_t */ +uint64_t +vfs_throttle_mask(mount_t mp) +{ + return(mp->mnt_throttle_mask); +} + /* returns a copy of vfs type name for the mount_t */ void vfs_name(mount_t mp, char * buffer) @@ -943,6 +950,27 @@ vfs_vnodecovered(mount_t mp) } } +/* + * Returns device vnode backing a mountpoint with an iocount (if valid vnode exists). + * The iocount must be released with vnode_put(). Note that this KPI is subtle + * with respect to the validity of using this device vnode for anything substantial + * (which is discouraged). If commands are sent to the device driver without + * taking proper steps to ensure that the device is still open, chaos may ensue. + * Similarly, this routine should only be called if there is some guarantee that + * the mount itself is still valid. + */ +vnode_t +vfs_devvp(mount_t mp) +{ + vnode_t vp = mp->mnt_devvp; + + if ((vp != NULLVP) && (vnode_get(vp) == 0)) { + return vp; + } + + return NULLVP; +} + /* * return the io attributes associated with mount_t */ @@ -1002,7 +1030,6 @@ extern int vfs_opv_numops; errno_t vfs_fsadd(struct vfs_fsentry *vfe, vfstable_t * handle) { -#pragma unused(data) struct vfstable *newvfstbl = NULL; int i,j; int (***opv_desc_vector_p)(void *); @@ -1686,6 +1713,40 @@ vnode_israge(vnode_t vp) return ((vp->v_flag & VRAGE)? 1 : 0); } +int +vnode_needssnapshots(vnode_t vp) +{ + return ((vp->v_flag & VNEEDSSNAPSHOT)? 
1 : 0); +} + + +/* Check the process/thread to see if we should skip atime updates */ +int +vfs_ctx_skipatime (vfs_context_t ctx) { + struct uthread *ut; + proc_t proc; + thread_t thr; + + proc = vfs_context_proc(ctx); + thr = vfs_context_thread (ctx); + + /* Validate pointers in case we were invoked via a kernel context */ + if (thr && proc) { + ut = get_bsdthread_info (thr); + + if (proc->p_lflag & P_LRAGE_VNODES) { + return 1; + } + + if (ut) { + if (ut->uu_flag & UT_RAGE_VNODES) { + return 1; + } + } + } + return 0; +} + /* is vnode_t marked to not keep data cached once it's been consumed */ int vnode_isnocache(vnode_t vp) @@ -1743,6 +1804,46 @@ vnode_islnk(vnode_t vp) return ((vp->v_type == VLNK)? 1 : 0); } +int +vnode_lookup_continue_needed(vnode_t vp, struct componentname *cnp) +{ + struct nameidata *ndp = cnp->cn_ndp; + + if (ndp == NULL) { + panic("vnode_lookup_continue_needed(): cnp->cn_ndp is NULL\n"); + } + + if (vnode_isdir(vp)) { + if (vp->v_mountedhere != NULL) { + goto yes; + } + +#if CONFIG_TRIGGERS + if (vp->v_resolve) { + goto yes; + } +#endif /* CONFIG_TRIGGERS */ + + } + + + if (vnode_islnk(vp)) { + /* From lookup(): || *ndp->ni_next == '/') No need for this, we know we're NULL-terminated here */ + if (cnp->cn_flags & FOLLOW) { + goto yes; + } + if (ndp->ni_flag & NAMEI_TRAILINGSLASH) { + goto yes; + } + } + + return 0; + +yes: + ndp->ni_flag |= NAMEI_CONTLOOKUP; + return EKEEPLOOKING; +} + /* is vnode_t a fifo ? */ int vnode_isfifo(vnode_t vp) @@ -2041,6 +2142,37 @@ vnode_vfsisrdonly(vnode_t vp) return ((vp->v_mount->mnt_flag & MNT_RDONLY)? 
1 : 0); } +int +vnode_compound_rename_available(vnode_t vp) +{ + return vnode_compound_op_available(vp, COMPOUND_VNOP_RENAME); +} +int +vnode_compound_rmdir_available(vnode_t vp) +{ + return vnode_compound_op_available(vp, COMPOUND_VNOP_RMDIR); +} +int +vnode_compound_mkdir_available(vnode_t vp) +{ + return vnode_compound_op_available(vp, COMPOUND_VNOP_MKDIR); +} +int +vnode_compound_remove_available(vnode_t vp) +{ + return vnode_compound_op_available(vp, COMPOUND_VNOP_REMOVE); +} +int +vnode_compound_open_available(vnode_t vp) +{ + return vnode_compound_op_available(vp, COMPOUND_VNOP_OPEN); +} + +int +vnode_compound_op_available(vnode_t vp, compound_vnop_id_t opid) +{ + return ((vp->v_mount->mnt_compound_ops & opid) != 0); +} /* * Returns vnode ref to current working directory; if a per-thread current @@ -2769,6 +2901,15 @@ vnode_notify(vnode_t vp, uint32_t events, struct vnode_attr *vap) return 0; } + + +int +vnode_isdyldsharedcache(vnode_t vp) +{ + return ((vp->v_flag & VSHARED_DYLD) ? 1 : 0); +} + + /* * For a filesystem that isn't tracking its own vnode watchers: * check whether a vnode is being monitored. @@ -2778,27 +2919,6 @@ vnode_ismonitored(vnode_t vp) { return (vp->v_knotes.slh_first != NULL); } -/* - * Conceived as a function available only in BSD kernel so that if kevent_register - * changes what a knote of type EVFILT_VNODE is watching, it can push - * that updated information down to a networked filesystem that may - * need to update server-side monitoring. - * - * Blunted to do nothing--because we want to get both kqueue and fsevents support - * from the VNOP_MONITOR design, we always want all the events a filesystem can provide us. 
- */ -void -vnode_knoteupdate(__unused struct knote *kn) -{ -#if 0 - vnode_t vp = (vnode_t)kn->kn_hook; - if (vnode_getwithvid(vp, kn->kn_hookid) == 0) { - VNOP_MONITOR(vp, kn->kn_sfflags, VNODE_MONITOR_UPDATE, (void*)kn, NULL); - vnode_put(vp); - } -#endif -} - /* * Initialize a struct vnode_attr and activate the attributes required * by the vnode_notify() call. @@ -2811,6 +2931,44 @@ vfs_get_notify_attributes(struct vnode_attr *vap) return 0; } +#if CONFIG_TRIGGERS +int +vfs_settriggercallback(fsid_t *fsid, vfs_trigger_callback_t vtc, void *data, uint32_t flags __unused, vfs_context_t ctx) +{ + int error; + mount_t mp; + + mp = mount_list_lookupby_fsid(fsid, 0 /* locked */, 1 /* withref */); + if (mp == NULL) { + return ENOENT; + } + + error = vfs_busy(mp, LK_NOWAIT); + mount_iterdrop(mp); + + if (error != 0) { + return ENOENT; + } + + mount_lock(mp); + if (mp->mnt_triggercallback != NULL) { + error = EBUSY; + mount_unlock(mp); + goto out; + } + + mp->mnt_triggercallback = vtc; + mp->mnt_triggerdata = data; + mount_unlock(mp); + + mp->mnt_triggercallback(mp, VTC_REPLACE, data, ctx); + +out: + vfs_unbusy(mp); + return 0; +} +#endif /* CONFIG_TRIGGERS */ + /* * Definition of vnode operations. 
*/ @@ -2909,13 +3067,87 @@ VNOP_LOOKUP(vnode_t dvp, vnode_t *vpp, struct componentname *cnp, vfs_context_t } #if 0 -/* - *# - *#% create dvp L L L - *#% create vpp - L - - *# - */ - +struct vnop_compound_open_args { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; + vnode_t *a_vpp; + struct componentname *a_cnp; + int32_t a_flags; + int32_t a_fmode; + struct vnode_attr *a_vap; + vfs_context_t a_context; + void *a_reserved; +}; +#endif /* 0 */ + +int +VNOP_COMPOUND_OPEN(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, int32_t flags, int32_t fmode, uint32_t *statusp, struct vnode_attr *vap, vfs_context_t ctx) +{ + int _err; + struct vnop_compound_open_args a; + int did_create = 0; + int want_create; + uint32_t tmp_status = 0; + struct componentname *cnp = &ndp->ni_cnd; + + want_create = (flags & VNOP_COMPOUND_OPEN_DO_CREATE); + + a.a_desc = &vnop_compound_open_desc; + a.a_dvp = dvp; + a.a_vpp = vpp; /* Could be NULL */ + a.a_cnp = cnp; + a.a_flags = flags; + a.a_fmode = fmode; + a.a_status = (statusp != NULL) ? statusp : &tmp_status; + a.a_vap = vap; + a.a_context = ctx; + a.a_open_create_authorizer = vn_authorize_create; + a.a_open_existing_authorizer = vn_authorize_open_existing; + a.a_reserved = NULL; + + if (dvp == NULLVP) { + panic("No dvp?"); + } + if (want_create && !vap) { + panic("Want create, but no vap?"); + } + if (!want_create && vap) { + panic("Don't want create, but have a vap?"); + } + + _err = (*dvp->v_op[vnop_compound_open_desc.vdesc_offset])(&a); + + did_create = (*a.a_status & COMPOUND_OPEN_STATUS_DID_CREATE); + + if (did_create && !want_create) { + panic("Filesystem did a create, even though none was requested?"); + } + + if (did_create) { + if (!NATIVE_XATTR(dvp)) { + /* + * Remove stale Apple Double file (if any). 
+ */ + xattrfile_remove(dvp, cnp->cn_nameptr, ctx, 0); + } + + /* On create, provide kqueue notification */ + post_event_if_success(dvp, _err, NOTE_WRITE); + } + + lookup_compound_vnop_post_hook(_err, dvp, *vpp, ndp, did_create); +#if 0 /* FSEvents... */ + if (*vpp && _err && _err != EKEEPLOOKING) { + vnode_put(*vpp); + *vpp = NULLVP; + } +#endif /* 0 */ + + return (_err); + +} + +#if 0 struct vnop_create_args { struct vnodeop_desc *a_desc; vnode_t a_dvp; @@ -3094,34 +3326,34 @@ struct vnop_open_args { }; #endif /* 0*/ errno_t -VNOP_OPEN(vnode_t vp, int mode, vfs_context_t ctx) +VNOP_OPEN(vnode_t vp, int mode, vfs_context_t ctx) { int _err; struct vnop_open_args a; #ifndef __LP64__ int thread_safe; - int funnel_state = 0; + int funnel_state = 0; #endif /* __LP64__ */ if (ctx == NULL) { ctx = vfs_context_current(); - } + } a.a_desc = &vnop_open_desc; a.a_vp = vp; a.a_mode = mode; - a.a_context = ctx; + a.a_context = ctx; #ifndef __LP64__ thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { funnel_state = thread_funnel_set(kernel_flock, TRUE); if (vp->v_type != VCHR && vp->v_type != VFIFO && vp->v_type != VSOCK) { - if ( (_err = lock_fsnode(vp, NULL)) ) { - (void) thread_funnel_set(kernel_flock, funnel_state); - return (_err); - } - } - } + if ( (_err = lock_fsnode(vp, NULL)) ) { + (void) thread_funnel_set(kernel_flock, funnel_state); + return (_err); + } + } + } #endif /* __LP64__ */ _err = (*vp->v_op[vnop_open_desc.vdesc_offset])(&a); @@ -3130,9 +3362,9 @@ VNOP_OPEN(vnode_t vp, int mode, vfs_context_t ctx) if (!thread_safe) { if (vp->v_type != VCHR && vp->v_type != VFIFO && vp->v_type != VSOCK) { unlock_fsnode(vp, NULL); - } + } (void) thread_funnel_set(kernel_flock, funnel_state); - } + } #endif /* __LP64__ */ return (_err); @@ -4012,6 +4244,49 @@ VNOP_REMOVE(vnode_t dvp, vnode_t vp, struct componentname * cnp, int flags, vfs_ return (_err); } +int +VNOP_COMPOUND_REMOVE(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, int32_t flags, struct vnode_attr *vap, 
vfs_context_t ctx) +{ + int _err; + struct vnop_compound_remove_args a; + int no_vp = (*vpp == NULLVP); + + a.a_desc = &vnop_compound_remove_desc; + a.a_dvp = dvp; + a.a_vpp = vpp; + a.a_cnp = &ndp->ni_cnd; + a.a_flags = flags; + a.a_vap = vap; + a.a_context = ctx; + a.a_remove_authorizer = vn_authorize_unlink; + + _err = (*dvp->v_op[vnop_compound_remove_desc.vdesc_offset])(&a); + if (_err == 0) { + vnode_setneedinactive(*vpp); + + if ( !(NATIVE_XATTR(dvp)) ) { + /* + * Remove any associated extended attribute file (._ AppleDouble file). + */ + xattrfile_remove(dvp, ndp->ni_cnd.cn_nameptr, ctx, 1); + } + } + + post_event_if_success(*vpp, _err, NOTE_DELETE | NOTE_LINK); + post_event_if_success(dvp, _err, NOTE_WRITE); + + if (no_vp) { + lookup_compound_vnop_post_hook(_err, dvp, *vpp, ndp, 0); + if (*vpp && _err && _err != EKEEPLOOKING) { + vnode_put(*vpp); + *vpp = NULLVP; + } + } + + //printf("VNOP_COMPOUND_REMOVE() returning %d\n", _err); + + return (_err); +} #if 0 /* @@ -4085,114 +4360,33 @@ VNOP_LINK(vnode_t vp, vnode_t tdvp, struct componentname * cnp, vfs_context_t ct return (_err); } - -#if 0 -/* - *# - *#% rename fdvp U U U - *#% rename fvp U U U - *#% rename tdvp L U U - *#% rename tvp X U U - *# - */ -struct vnop_rename_args { - struct vnodeop_desc *a_desc; - vnode_t a_fdvp; - vnode_t a_fvp; - struct componentname *a_fcnp; - vnode_t a_tdvp; - vnode_t a_tvp; - struct componentname *a_tcnp; - vfs_context_t a_context; -}; -#endif /* 0*/ errno_t -VNOP_RENAME(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, - struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, - vfs_context_t ctx) +vn_rename(struct vnode *fdvp, struct vnode **fvpp, struct componentname *fcnp, struct vnode_attr *fvap, + struct vnode *tdvp, struct vnode **tvpp, struct componentname *tcnp, struct vnode_attr *tvap, + uint32_t flags, vfs_context_t ctx) { - int _err = 0; - int events; - struct vnop_rename_args a; - char smallname1[48]; - char smallname2[48]; - char 
*xfromname = NULL; - char *xtoname = NULL; -#ifndef __LP64__ - int funnel_state = 0; - vnode_t lock_first = NULL, lock_second = NULL; - vnode_t fdvp_unsafe = NULLVP; - vnode_t tdvp_unsafe = NULLVP; -#endif /* __LP64__ */ + int _err; vnode_t src_attr_vp = NULLVP; vnode_t dst_attr_vp = NULLVP; struct nameidata fromnd; struct nameidata tond; + char smallname1[48]; + char smallname2[48]; + char *xfromname = NULL; + char *xtoname = NULL; + int batched; - a.a_desc = &vnop_rename_desc; - a.a_fdvp = fdvp; - a.a_fvp = fvp; - a.a_fcnp = fcnp; - a.a_tdvp = tdvp; - a.a_tvp = tvp; - a.a_tcnp = tcnp; - a.a_context = ctx; + batched = vnode_compound_rename_available(fdvp); #ifndef __LP64__ - if (!THREAD_SAFE_FS(fdvp)) - fdvp_unsafe = fdvp; - if (!THREAD_SAFE_FS(tdvp)) - tdvp_unsafe = tdvp; + vnode_t fdvp_unsafe = (THREAD_SAFE_FS(fdvp) ? NULLVP : fdvp); +#endif /* __LP64__ */ - if (fdvp_unsafe != NULLVP) { - /* - * Lock parents in vnode address order to avoid deadlocks - * note that it's possible for the fdvp to be unsafe, - * but the tdvp to be safe because tvp could be a directory - * in the root of a filesystem... 
in that case, tdvp is the - * in the filesystem that this root is mounted on - */ - if (tdvp_unsafe == NULL || fdvp_unsafe == tdvp_unsafe) { - lock_first = fdvp_unsafe; - lock_second = NULL; - } else if (fdvp_unsafe < tdvp_unsafe) { - lock_first = fdvp_unsafe; - lock_second = tdvp_unsafe; - } else { - lock_first = tdvp_unsafe; - lock_second = fdvp_unsafe; - } - if ( (_err = lock_fsnode(lock_first, &funnel_state)) ) - return (_err); - - if (lock_second != NULL && (_err = lock_fsnode(lock_second, NULL))) { - unlock_fsnode(lock_first, &funnel_state); - return (_err); - } - - /* - * Lock both children in vnode address order to avoid deadlocks - */ - if (tvp == NULL || tvp == fvp) { - lock_first = fvp; - lock_second = NULL; - } else if (fvp < tvp) { - lock_first = fvp; - lock_second = tvp; - } else { - lock_first = tvp; - lock_second = fvp; - } - if ( (_err = lock_fsnode(lock_first, NULL)) ) - goto out1; - - if (lock_second != NULL && (_err = lock_fsnode(lock_second, NULL))) { - unlock_fsnode(lock_first, NULL); - goto out1; - } + if (!batched) { + if (*fvpp == NULLVP) + panic("Not batched, and no fvp?"); } -#endif /* __LP64__ */ - + /* * We need to preflight any potential AppleDouble file for the source file * before doing the rename operation, since we could potentially be doing @@ -4235,8 +4429,8 @@ VNOP_RENAME(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, * is only for AppleDouble files. 
*/ if (xfromname != NULL) { - NDINIT(&fromnd, RENAME, NOFOLLOW | USEDVP | CN_NBMOUNTLOOK, UIO_SYSSPACE, - CAST_USER_ADDR_T(xfromname), ctx); + NDINIT(&fromnd, RENAME, OP_RENAME, NOFOLLOW | USEDVP | CN_NBMOUNTLOOK, + UIO_SYSSPACE, CAST_USER_ADDR_T(xfromname), ctx); fromnd.ni_dvp = fdvp; error = namei(&fromnd); @@ -4267,21 +4461,18 @@ VNOP_RENAME(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, } } + if (batched) { + _err = VNOP_COMPOUND_RENAME(fdvp, fvpp, fcnp, fvap, tdvp, tvpp, tcnp, tvap, flags, ctx); + if (_err != 0) { + printf("VNOP_COMPOUND_RENAME() returned %d\n", _err); + } - /* do the rename of the main file. */ - _err = (*fdvp->v_op[vnop_rename_desc.vdesc_offset])(&a); - -#ifndef __LP64__ - if (fdvp_unsafe != NULLVP) { - if (lock_second != NULL) - unlock_fsnode(lock_second, NULL); - unlock_fsnode(lock_first, NULL); + } else { + _err = VNOP_RENAME(fdvp, *fvpp, fcnp, tdvp, *tvpp, tcnp, ctx); } -#endif /* __LP64__ */ if (_err == 0) { - if (tvp && tvp != fvp) - vnode_setneedinactive(tvp); + mac_vnode_notify_rename(ctx, *fvpp, tdvp, tcnp); } /* @@ -4295,7 +4486,7 @@ VNOP_RENAME(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, * Note that tdvp already has an iocount reference. Make sure to check that we * get a valid vnode from namei. 
*/ - NDINIT(&tond, RENAME, + NDINIT(&tond, RENAME, OP_RENAME, NOCACHE | NOFOLLOW | USEDVP | CN_NBMOUNTLOOK, UIO_SYSSPACE, CAST_USER_ADDR_T(xtoname), ctx); tond.ni_dvp = tdvp; @@ -4309,81 +4500,15 @@ VNOP_RENAME(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, } if (src_attr_vp) { - /* attempt to rename src -> dst */ - - a.a_desc = &vnop_rename_desc; - a.a_fdvp = fdvp; - a.a_fvp = src_attr_vp; - a.a_fcnp = &fromnd.ni_cnd; - a.a_tdvp = tdvp; - a.a_tvp = dst_attr_vp; - a.a_tcnp = &tond.ni_cnd; - a.a_context = ctx; - -#ifndef __LP64__ - if (fdvp_unsafe != NULLVP) { - /* - * Lock in vnode address order to avoid deadlocks - */ - if (dst_attr_vp == NULL || dst_attr_vp == src_attr_vp) { - lock_first = src_attr_vp; - lock_second = NULL; - } else if (src_attr_vp < dst_attr_vp) { - lock_first = src_attr_vp; - lock_second = dst_attr_vp; - } else { - lock_first = dst_attr_vp; - lock_second = src_attr_vp; - } - if ( (error = lock_fsnode(lock_first, NULL)) == 0) { - if (lock_second != NULL && (error = lock_fsnode(lock_second, NULL)) ) - unlock_fsnode(lock_first, NULL); - } + if (batched) { + error = VNOP_COMPOUND_RENAME(fdvp, &src_attr_vp, &fromnd.ni_cnd, NULL, + tdvp, &dst_attr_vp, &tond.ni_cnd, NULL, + 0, ctx); + } else { + error = VNOP_RENAME(fdvp, src_attr_vp, &fromnd.ni_cnd, + tdvp, dst_attr_vp, &tond.ni_cnd, ctx); } -#endif /* __LP64__ */ - if (error == 0) { - const char *oname; - vnode_t oparent; - /* Save these off so we can later verify them (fix up below) */ - oname = src_attr_vp->v_name; - oparent = src_attr_vp->v_parent; - - error = (*fdvp->v_op[vnop_rename_desc.vdesc_offset])(&a); - -#ifndef __LP64__ - if (fdvp_unsafe != NULLVP) { - if (lock_second != NULL) - unlock_fsnode(lock_second, NULL); - unlock_fsnode(lock_first, NULL); - } -#endif /* __LP64__ */ - - if (error == 0) { - vnode_setneedinactive(src_attr_vp); - - if (dst_attr_vp && dst_attr_vp != src_attr_vp) - vnode_setneedinactive(dst_attr_vp); - /* - * Fix up name & parent pointers on ._ file 
- */ - if (oname == src_attr_vp->v_name && - oparent == src_attr_vp->v_parent) { - int update_flags; - - update_flags = VNODE_UPDATE_NAME; - - if (fdvp != tdvp) - update_flags |= VNODE_UPDATE_PARENT; - - vnode_update_identity(src_attr_vp, tdvp, - tond.ni_cnd.cn_nameptr, - tond.ni_cnd.cn_namelen, - tond.ni_cnd.cn_hash, - update_flags); - } - } - } /* kevent notifications for moving resource files * _err is zero if we're here, so no need to notify directories, code * below will do that. only need to post the rename on the source and @@ -4449,6 +4574,125 @@ VNOP_RENAME(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, FREE(xtoname, M_TEMP); } + return _err; +} + + +#if 0 +/* + *# + *#% rename fdvp U U U + *#% rename fvp U U U + *#% rename tdvp L U U + *#% rename tvp X U U + *# + */ +struct vnop_rename_args { + struct vnodeop_desc *a_desc; + vnode_t a_fdvp; + vnode_t a_fvp; + struct componentname *a_fcnp; + vnode_t a_tdvp; + vnode_t a_tvp; + struct componentname *a_tcnp; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_RENAME(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, + struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, + vfs_context_t ctx) +{ + int _err = 0; + int events; + struct vnop_rename_args a; +#ifndef __LP64__ + int funnel_state = 0; + vnode_t lock_first = NULL, lock_second = NULL; + vnode_t fdvp_unsafe = NULLVP; + vnode_t tdvp_unsafe = NULLVP; +#endif /* __LP64__ */ + + a.a_desc = &vnop_rename_desc; + a.a_fdvp = fdvp; + a.a_fvp = fvp; + a.a_fcnp = fcnp; + a.a_tdvp = tdvp; + a.a_tvp = tvp; + a.a_tcnp = tcnp; + a.a_context = ctx; + +#ifndef __LP64__ + if (!THREAD_SAFE_FS(fdvp)) + fdvp_unsafe = fdvp; + if (!THREAD_SAFE_FS(tdvp)) + tdvp_unsafe = tdvp; + + if (fdvp_unsafe != NULLVP) { + /* + * Lock parents in vnode address order to avoid deadlocks + * note that it's possible for the fdvp to be unsafe, + * but the tdvp to be safe because tvp could be a directory + * in the root of a filesystem... 
in that case, tdvp is the + * in the filesystem that this root is mounted on + */ + if (tdvp_unsafe == NULL || fdvp_unsafe == tdvp_unsafe) { + lock_first = fdvp_unsafe; + lock_second = NULL; + } else if (fdvp_unsafe < tdvp_unsafe) { + lock_first = fdvp_unsafe; + lock_second = tdvp_unsafe; + } else { + lock_first = tdvp_unsafe; + lock_second = fdvp_unsafe; + } + if ( (_err = lock_fsnode(lock_first, &funnel_state)) ) + return (_err); + + if (lock_second != NULL && (_err = lock_fsnode(lock_second, NULL))) { + unlock_fsnode(lock_first, &funnel_state); + return (_err); + } + + /* + * Lock both children in vnode address order to avoid deadlocks + */ + if (tvp == NULL || tvp == fvp) { + lock_first = fvp; + lock_second = NULL; + } else if (fvp < tvp) { + lock_first = fvp; + lock_second = tvp; + } else { + lock_first = tvp; + lock_second = fvp; + } + if ( (_err = lock_fsnode(lock_first, NULL)) ) + goto out1; + + if (lock_second != NULL && (_err = lock_fsnode(lock_second, NULL))) { + unlock_fsnode(lock_first, NULL); + goto out1; + } + } +#endif /* __LP64__ */ + + /* do the rename of the main file. 
*/ + _err = (*fdvp->v_op[vnop_rename_desc.vdesc_offset])(&a); + +#ifndef __LP64__ + if (fdvp_unsafe != NULLVP) { + if (lock_second != NULL) + unlock_fsnode(lock_second, NULL); + unlock_fsnode(lock_first, NULL); + } +#endif /* __LP64__ */ + + if (_err == 0) { + if (tvp && tvp != fvp) + vnode_setneedinactive(tvp); + } + #ifndef __LP64__ out1: if (fdvp_unsafe != NULLVP) { @@ -4488,6 +4732,112 @@ VNOP_RENAME(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, return (_err); } +int +VNOP_COMPOUND_RENAME( + struct vnode *fdvp, struct vnode **fvpp, struct componentname *fcnp, struct vnode_attr *fvap, + struct vnode *tdvp, struct vnode **tvpp, struct componentname *tcnp, struct vnode_attr *tvap, + uint32_t flags, vfs_context_t ctx) +{ + int _err = 0; + int events; + struct vnop_compound_rename_args a; + int no_fvp, no_tvp; + + no_fvp = (*fvpp) == NULLVP; + no_tvp = (*tvpp) == NULLVP; + + a.a_desc = &vnop_compound_rename_desc; + + a.a_fdvp = fdvp; + a.a_fvpp = fvpp; + a.a_fcnp = fcnp; + a.a_fvap = fvap; + + a.a_tdvp = tdvp; + a.a_tvpp = tvpp; + a.a_tcnp = tcnp; + a.a_tvap = tvap; + + a.a_flags = flags; + a.a_context = ctx; + a.a_rename_authorizer = vn_authorize_rename; + a.a_reserved = NULL; + + /* do the rename of the main file. */ + _err = (*fdvp->v_op[vnop_compound_rename_desc.vdesc_offset])(&a); + + if (_err == 0) { + if (*tvpp && *tvpp != *fvpp) + vnode_setneedinactive(*tvpp); + } + + /* Wrote at least one directory. If transplanted a dir, also changed link counts */ + if (0 == _err && *fvpp != *tvpp) { + if (!*fvpp) { + panic("No fvpp after compound rename?"); + } + + events = NOTE_WRITE; + if (vnode_isdir(*fvpp)) { + /* Link count on dir changed only if we are moving a dir and... 
+ * --Moved to new dir, not overwriting there + * --Kept in same dir and DID overwrite + */ + if (((fdvp != tdvp) && (!*tvpp)) || ((fdvp == tdvp) && (*tvpp))) { + events |= NOTE_LINK; + } + } + + lock_vnode_and_post(fdvp, events); + if (fdvp != tdvp) { + lock_vnode_and_post(tdvp, events); + } + + /* If you're replacing the target, post a deletion for it */ + if (*tvpp) + { + lock_vnode_and_post(*tvpp, NOTE_DELETE); + } + + lock_vnode_and_post(*fvpp, NOTE_RENAME); + } + + if (no_fvp) { + lookup_compound_vnop_post_hook(_err, fdvp, *fvpp, fcnp->cn_ndp, 0); + } + if (no_tvp && *tvpp != NULLVP) { + lookup_compound_vnop_post_hook(_err, tdvp, *tvpp, tcnp->cn_ndp, 0); + } + + if (_err && _err != EKEEPLOOKING) { + if (*fvpp) { + vnode_put(*fvpp); + *fvpp = NULLVP; + } + if (*tvpp) { + vnode_put(*tvpp); + *tvpp = NULLVP; + } + } + + return (_err); +} + +int +vn_mkdir(struct vnode *dvp, struct vnode **vpp, struct nameidata *ndp, + struct vnode_attr *vap, vfs_context_t ctx) +{ + if (ndp->ni_cnd.cn_nameiop != CREATE) { + panic("Non-CREATE nameiop in vn_mkdir()?"); + } + + if (vnode_compound_mkdir_available(dvp)) { + return VNOP_COMPOUND_MKDIR(dvp, vpp, ndp, vap, ctx); + } else { + return VNOP_MKDIR(dvp, vpp, &ndp->ni_cnd, vap, ctx); + } +} + #if 0 /* *# @@ -4550,6 +4900,59 @@ VNOP_MKDIR(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, return (_err); } +int +VNOP_COMPOUND_MKDIR(struct vnode *dvp, struct vnode **vpp, struct nameidata *ndp, + struct vnode_attr *vap, vfs_context_t ctx) +{ + int _err; + struct vnop_compound_mkdir_args a; + + a.a_desc = &vnop_compound_mkdir_desc; + a.a_dvp = dvp; + a.a_vpp = vpp; + a.a_cnp = &ndp->ni_cnd; + a.a_vap = vap; + a.a_flags = 0; + a.a_context = ctx; +#if 0 + a.a_mkdir_authorizer = vn_authorize_mkdir; +#endif /* 0 */ + a.a_reserved = NULL; + + _err = (*dvp->v_op[vnop_compound_mkdir_desc.vdesc_offset])(&a); + if (_err == 0 && !NATIVE_XATTR(dvp)) { + /* + * Remove stale Apple Double file (if any). 
+ */ + xattrfile_remove(dvp, ndp->ni_cnd.cn_nameptr, ctx, 0); + } + + post_event_if_success(dvp, _err, NOTE_LINK | NOTE_WRITE); + + lookup_compound_vnop_post_hook(_err, dvp, *vpp, ndp, (_err == 0)); + if (*vpp && _err && _err != EKEEPLOOKING) { + vnode_put(*vpp); + *vpp = NULLVP; + } + + return (_err); +} + +int +vn_rmdir(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *vap, vfs_context_t ctx) +{ + if (vnode_compound_rmdir_available(dvp)) { + return VNOP_COMPOUND_RMDIR(dvp, vpp, ndp, vap, ctx); + } else { + if (*vpp == NULLVP) { + panic("NULL vp, but not a compound VNOP?"); + } + if (vap != NULL) { + panic("Non-NULL vap, but not a compound VNOP?"); + } + return VNOP_RMDIR(dvp, *vpp, &ndp->ni_cnd, ctx); + } +} #if 0 /* @@ -4618,6 +5021,53 @@ VNOP_RMDIR(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, vfs_c return (_err); } +int +VNOP_COMPOUND_RMDIR(struct vnode *dvp, struct vnode **vpp, struct nameidata *ndp, + struct vnode_attr *vap, vfs_context_t ctx) +{ + int _err; + struct vnop_compound_rmdir_args a; + int no_vp; + + a.a_desc = &vnop_mkdir_desc; + a.a_dvp = dvp; + a.a_vpp = vpp; + a.a_cnp = &ndp->ni_cnd; + a.a_vap = vap; + a.a_flags = 0; + a.a_context = ctx; + a.a_rmdir_authorizer = vn_authorize_rmdir; + a.a_reserved = NULL; + + no_vp = (*vpp == NULLVP); + + _err = (*dvp->v_op[vnop_compound_rmdir_desc.vdesc_offset])(&a); + if (_err == 0 && !NATIVE_XATTR(dvp)) { + /* + * Remove stale Apple Double file (if any). + */ + xattrfile_remove(dvp, ndp->ni_cnd.cn_nameptr, ctx, 0); + } + + if (*vpp) { + post_event_if_success(*vpp, _err, NOTE_DELETE | NOTE_LINK); + } + post_event_if_success(dvp, _err, NOTE_LINK | NOTE_WRITE); + + if (no_vp) { + lookup_compound_vnop_post_hook(_err, dvp, *vpp, ndp, 0); + +#if 0 /* Removing orphaned ._ files requires a vp.... 
*/ + if (*vpp && _err && _err != EKEEPLOOKING) { + vnode_put(*vpp); + *vpp = NULLVP; + } +#endif /* 0 */ + } + + return (_err); +} + /* * Remove a ._ AppleDouble file */ @@ -4642,7 +5092,7 @@ xattrfile_remove(vnode_t dvp, const char * basename, vfs_context_t ctx, int forc MALLOC(filename, char *, len, M_TEMP, M_WAITOK); len = snprintf(filename, len, "._%s", basename); } - NDINIT(&nd, DELETE, WANTPARENT | LOCKLEAF | NOFOLLOW | USEDVP, UIO_SYSSPACE, + NDINIT(&nd, DELETE, OP_UNLINK, WANTPARENT | LOCKLEAF | NOFOLLOW | USEDVP, UIO_SYSSPACE, CAST_USER_ADDR_T(filename), ctx); nd.ni_dvp = dvp; if (namei(&nd) != 0) @@ -4678,32 +5128,9 @@ xattrfile_remove(vnode_t dvp, const char * basename, vfs_context_t ctx, int forc } } if (force) { - struct vnop_remove_args a; int error; -#ifndef __LP64__ - int thread_safe = THREAD_SAFE_FS(dvp); -#endif /* __LP64__ */ - a.a_desc = &vnop_remove_desc; - a.a_dvp = nd.ni_dvp; - a.a_vp = xvp; - a.a_cnp = &nd.ni_cnd; - a.a_context = ctx; - -#ifndef __LP64__ - if (!thread_safe) { - if ( (lock_fsnode(xvp, NULL)) ) - goto out1; - } -#endif /* __LP64__ */ - - error = (*dvp->v_op[vnop_remove_desc.vdesc_offset])(&a); - -#ifndef __LP64__ - if (!thread_safe) - unlock_fsnode(xvp, NULL); -#endif /* __LP64__ */ - + error = VNOP_REMOVE(dvp, xvp, &nd.ni_cnd, 0, ctx); if (error == 0) vnode_setneedinactive(xvp); @@ -4745,7 +5172,7 @@ xattrfile_setattr(vnode_t dvp, const char * basename, struct vnode_attr * vap, MALLOC(filename, char *, len, M_TEMP, M_WAITOK); len = snprintf(filename, len, "._%s", basename); } - NDINIT(&nd, LOOKUP, NOFOLLOW | USEDVP, UIO_SYSSPACE, + NDINIT(&nd, LOOKUP, OP_SETATTR, NOFOLLOW | USEDVP, UIO_SYSSPACE, CAST_USER_ADDR_T(filename), ctx); nd.ni_dvp = dvp; if (namei(&nd) != 0) @@ -4847,7 +5274,6 @@ VNOP_SYMLINK(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, xattrfile_remove(dvp, cnp->cn_nameptr, ctx, 0); } - #ifndef __LP64__ if (!thread_safe) { unlock_fsnode(dvp, &funnel_state); @@ -5454,6 +5880,16 @@ 
VNOP_PAGEOUT(struct vnode *vp, upl_t pl, upl_offset_t pl_offset, off_t f_offset, return (_err); } +int +vn_remove(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, int32_t flags, struct vnode_attr *vap, vfs_context_t ctx) +{ + if (vnode_compound_remove_available(dvp)) { + return VNOP_COMPOUND_REMOVE(dvp, vpp, ndp, flags, vap, ctx); + } else { + return VNOP_REMOVE(dvp, *vpp, &ndp->ni_cnd, flags, ctx); + } +} + #if 0 /* diff --git a/bsd/vfs/vfs_attrlist.c b/bsd/vfs/vfs_attrlist.c index b94375efd..091ee16ab 100644 --- a/bsd/vfs/vfs_attrlist.c +++ b/bsd/vfs/vfs_attrlist.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1995-2008 Apple Inc. All rights reserved. + * Copyright (c) 1995-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -395,6 +395,7 @@ static struct getattrlist_attrtab getattrlist_common_tab[] = { {ATTR_CMN_FILEID, VATTR_BIT(va_fileid), sizeof(uint64_t), KAUTH_VNODE_READ_ATTRIBUTES}, {ATTR_CMN_PARENTID, VATTR_BIT(va_parentid), sizeof(uint64_t), KAUTH_VNODE_READ_ATTRIBUTES}, {ATTR_CMN_FULLPATH, 0, sizeof(struct attrreference), KAUTH_VNODE_READ_ATTRIBUTES }, + {ATTR_CMN_ADDEDTIME, VATTR_BIT(va_addedtime), ATTR_TIME_SIZE, KAUTH_VNODE_READ_ATTRIBUTES}, {ATTR_CMN_RETURNED_ATTRS, 0, sizeof(attribute_set_t), 0}, {0, 0, 0, 0} }; @@ -523,6 +524,27 @@ getattrlist_fixupattrs(attribute_set_t *asp, struct vnode_attr *vap) if (asp->commonattr) { tab = getattrlist_common_tab; do { + /* + * This if() statement is slightly confusing. We're trying to + * iterate through all of the bits listed in the array + * getattr_common_tab, and see if the filesystem was expected + * to support it, and whether or not we need to do anything about this. + * + * This array is full of structs that have 4 fields (attr, bits, size, action). + * The first is used to store the ATTR_CMN_* bit that was being requested + * from userland. The second stores the VATTR_BIT corresponding to the field + * filled in vnode_attr struct. 
If it is 0, then we don't typically expect + * the filesystem to fill in this field. The third is the size of the field, + * and the fourth is the type of kauth actions needed. + * + * So, for all of the ATTR_CMN bits listed in this array, we iterate through + * them, and check to see if it was both passed down to the filesystem via the + * va_active bitfield, and whether or not we expect it to be emitted from + * the filesystem. If it wasn't supported, then we un-twiddle the bit and move + * on. This is done so that we can uncheck those bits and re-request + * a vnode_getattr from the filesystem again. + */ + if ((tab->attr & asp->commonattr) && (tab->bits & vap->va_active) && (tab->bits & vap->va_supported) == 0) { @@ -1108,6 +1130,7 @@ getvolattrlist(vnode_t vp, struct getattrlist_args *uap, struct attrlist *alp, } if (alp->volattr & ATTR_VOL_UUID) { ATTR_PACK(&ab, vs.f_uuid); + ab.actual.volattr |= ATTR_VOL_UUID; } if (alp->volattr & ATTR_VOL_ATTRIBUTES) { /* fix up volume attribute information */ @@ -1188,6 +1211,7 @@ getattrlist_internal(vnode_t vp, struct getattrlist_args *uap, proc_t p, vfs_con int return_valid; int pack_invalid; int vtype = 0; + uint32_t perms = 0; proc_is64 = proc_is64bit(p); VATTR_INIT(&va); @@ -1604,6 +1628,30 @@ getattrlist_internal(vnode_t vp, struct getattrlist_args *uap, proc_t p, vfs_con ATTR_PACK_TIME(ab, va.va_backup_time, proc_is64); ab.actual.commonattr |= ATTR_CMN_BKUPTIME; } + /* + * They are requesting user access, we should obtain this before getting + * the finder info. For some network file systems this is a performance + * improvement. 
+ */ + if (al.commonattr & ATTR_CMN_USERACCESS) { /* this is expensive */ + if (vtype == VDIR) { + if (vnode_authorize(vp, NULL, + KAUTH_VNODE_ACCESS | KAUTH_VNODE_ADD_FILE | KAUTH_VNODE_ADD_SUBDIRECTORY | KAUTH_VNODE_DELETE_CHILD, ctx) == 0) + perms |= W_OK; + if (vnode_authorize(vp, NULL, KAUTH_VNODE_ACCESS | KAUTH_VNODE_LIST_DIRECTORY, ctx) == 0) + perms |= R_OK; + if (vnode_authorize(vp, NULL, KAUTH_VNODE_ACCESS | KAUTH_VNODE_SEARCH, ctx) == 0) + perms |= X_OK; + } else { + if (vnode_authorize(vp, NULL, KAUTH_VNODE_ACCESS | KAUTH_VNODE_WRITE_DATA, ctx) == 0) + perms |= W_OK; + if (vnode_authorize(vp, NULL, KAUTH_VNODE_ACCESS | KAUTH_VNODE_READ_DATA, ctx) == 0) + perms |= R_OK; + if (vnode_authorize(vp, NULL, KAUTH_VNODE_ACCESS | KAUTH_VNODE_EXECUTE, ctx) == 0) + perms |= X_OK; + } + } + if (al.commonattr & ATTR_CMN_FNDRINFO) { uio_t auio; size_t fisize = 32; @@ -1654,25 +1702,8 @@ getattrlist_internal(vnode_t vp, struct getattrlist_args *uap, proc_t p, vfs_con ATTR_PACK4(ab, va.va_flags); ab.actual.commonattr |= ATTR_CMN_FLAGS; } - if (al.commonattr & ATTR_CMN_USERACCESS) { /* this is expensive */ - uint32_t perms = 0; - if (vtype == VDIR) { - if (vnode_authorize(vp, NULL, - KAUTH_VNODE_ACCESS | KAUTH_VNODE_ADD_FILE | KAUTH_VNODE_ADD_SUBDIRECTORY | KAUTH_VNODE_DELETE_CHILD, ctx) == 0) - perms |= W_OK; - if (vnode_authorize(vp, NULL, KAUTH_VNODE_ACCESS | KAUTH_VNODE_LIST_DIRECTORY, ctx) == 0) - perms |= R_OK; - if (vnode_authorize(vp, NULL, KAUTH_VNODE_ACCESS | KAUTH_VNODE_SEARCH, ctx) == 0) - perms |= X_OK; - } else { - if (vnode_authorize(vp, NULL, KAUTH_VNODE_ACCESS | KAUTH_VNODE_WRITE_DATA, ctx) == 0) - perms |= W_OK; - if (vnode_authorize(vp, NULL, KAUTH_VNODE_ACCESS | KAUTH_VNODE_READ_DATA, ctx) == 0) - perms |= R_OK; - if (vnode_authorize(vp, NULL, KAUTH_VNODE_ACCESS | KAUTH_VNODE_EXECUTE, ctx) == 0) - perms |= X_OK; - } - + /* We already obtain the user access, so just fill in the buffer here */ + if (al.commonattr & ATTR_CMN_USERACCESS) { #if 
CONFIG_MACF /* * Rather than MAC preceding DAC, in this case we want @@ -1737,6 +1768,12 @@ getattrlist_internal(vnode_t vp, struct getattrlist_args *uap, proc_t p, vfs_con attrlist_pack_string (&ab, fullpathptr, fullpathlen); ab.actual.commonattr |= ATTR_CMN_FULLPATH; } + + if (al.commonattr & ATTR_CMN_ADDEDTIME) { + ATTR_PACK_TIME(ab, va.va_addedtime, proc_is64); + ab.actual.commonattr |= ATTR_CMN_ADDEDTIME; + } + /* directory attributes *********************************************/ if (al.dirattr && (vtype == VDIR)) { @@ -1749,24 +1786,100 @@ getattrlist_internal(vnode_t vp, struct getattrlist_args *uap, proc_t p, vfs_con ab.actual.dirattr |= ATTR_DIR_ENTRYCOUNT; } if (al.dirattr & ATTR_DIR_MOUNTSTATUS) { - ATTR_PACK_CAST(&ab, uint32_t, (vp->v_flag & VROOT) ? - DIR_MNTSTATUS_MNTPOINT : 0); + uint32_t mntstat; + + mntstat = (vp->v_flag & VROOT) ? DIR_MNTSTATUS_MNTPOINT : 0; +#if CONFIG_TRIGGERS + /* + * Report back on active vnode triggers + * that can directly trigger a mount + */ + if (vp->v_resolve && + !(vp->v_resolve->vr_flags & VNT_NO_DIRECT_MOUNT)) { + mntstat |= DIR_MNTSTATUS_TRIGGER; + } +#endif + ATTR_PACK4(ab, mntstat); ab.actual.dirattr |= ATTR_DIR_MOUNTSTATUS; } } /* file attributes **************************************************/ if (al.fileattr && (vtype != VDIR)) { + + size_t rsize = 0; + uint64_t rlength = 0; + uint64_t ralloc = 0; + /* + * Pre-fetch the rsrc attributes now so we only get them once. 
+ * Fetch the resource fork size/allocation via xattr interface + */ + if (al.fileattr & (ATTR_FILE_TOTALSIZE | ATTR_FILE_ALLOCSIZE | ATTR_FILE_RSRCLENGTH | ATTR_FILE_RSRCALLOCSIZE)) { + if ((error = vn_getxattr(vp, XATTR_RESOURCEFORK_NAME, NULL, &rsize, XATTR_NOSECURITY, ctx)) != 0) { + if ((error == ENOENT) || (error == ENOATTR) || (error == ENOTSUP) || (error == EPERM)|| (error == EACCES)) { + rsize = 0; + error = 0; + } else { + goto out; + } + } + rlength = rsize; + + if (al.fileattr & (ATTR_FILE_RSRCALLOCSIZE | ATTR_FILE_ALLOCSIZE)) { + uint32_t blksize = vp->v_mount->mnt_vfsstat.f_bsize; + if (blksize == 0) { + blksize = 512; + } + ralloc = roundup(rsize, blksize); + } + } + if (al.fileattr & ATTR_FILE_LINKCOUNT) { ATTR_PACK4(ab, (uint32_t)va.va_nlink); ab.actual.fileattr |= ATTR_FILE_LINKCOUNT; } + /* + * Note the following caveats for the TOTALSIZE and ALLOCSIZE attributes: + * We infer that if the filesystem does not support va_data_size or va_data_alloc + * it must not know about alternate forks. So when we need to gather + * the total size or total alloc, it's OK to substitute the total size for + * the data size below. This is because it is likely a flat filesystem and we must + * be using AD files to store the rsrc fork and EAs. + * + * Additionally, note that getattrlist is barred from being called on + * resource fork paths. (Search for CN_ALLOWRSRCFORK). So if the filesystem does + * support va_data_size, it is guaranteed to represent the data fork's size. This + * is an important distinction to make because when we call vnode_getattr on + * an HFS resource fork vnode, to get the size, it will vend out the resource + * fork's size (it only gets the size of the passed-in vnode). 
+ */ if (al.fileattr & ATTR_FILE_TOTALSIZE) { - ATTR_PACK8(ab, va.va_total_size); + uint64_t totalsize = rlength; + + if (VATTR_IS_SUPPORTED(&va, va_data_size)) { + totalsize += va.va_data_size; + } else { + totalsize += va.va_total_size; + } + + ATTR_PACK8(ab, totalsize); ab.actual.fileattr |= ATTR_FILE_TOTALSIZE; } if (al.fileattr & ATTR_FILE_ALLOCSIZE) { - ATTR_PACK8(ab, va.va_total_alloc); + uint64_t totalalloc = ralloc; + + /* + * If data_alloc is supported, then it must represent the + * data fork size. + */ + if (VATTR_IS_SUPPORTED(&va, va_data_alloc)) { + totalalloc += va.va_data_alloc; + } + else { + totalalloc += va.va_total_alloc; + } + + ATTR_PACK8(ab, totalalloc); ab.actual.fileattr |= ATTR_FILE_ALLOCSIZE; } if (al.fileattr & ATTR_FILE_IOBLOCKSIZE) { @@ -1793,6 +1906,12 @@ getattrlist_internal(vnode_t vp, struct getattrlist_args *uap, proc_t p, vfs_con ATTR_PACK4(ab, dev); ab.actual.fileattr |= ATTR_FILE_DEVTYPE; } + + /* + * If the filesystem does not support datalength + * or dataallocsize, then we infer that totalsize and + * totalalloc are substitutes. 
+ */ if (al.fileattr & ATTR_FILE_DATALENGTH) { if (VATTR_IS_SUPPORTED(&va, va_data_size)) { ATTR_PACK8(ab, va.va_data_size); @@ -1809,37 +1928,17 @@ getattrlist_internal(vnode_t vp, struct getattrlist_args *uap, proc_t p, vfs_con } ab.actual.fileattr |= ATTR_FILE_DATAALLOCSIZE; } - /* fetch resource fork size/allocation via xattr interface */ - if (al.fileattr & (ATTR_FILE_RSRCLENGTH | ATTR_FILE_RSRCALLOCSIZE)) { - size_t rsize; - uint64_t rlength; - - if ((error = vn_getxattr(vp, XATTR_RESOURCEFORK_NAME, NULL, &rsize, XATTR_NOSECURITY, ctx)) != 0) { - if ((error == ENOENT) || (error == ENOATTR) || (error == ENOTSUP) || (error == EPERM)) { - rsize = 0; - error = 0; - } else { - goto out; - } - } - if (al.fileattr & ATTR_FILE_RSRCLENGTH) { - rlength = rsize; - ATTR_PACK8(ab, rlength); - ab.actual.fileattr |= ATTR_FILE_RSRCLENGTH; - } - if (al.fileattr & ATTR_FILE_RSRCALLOCSIZE) { - uint32_t blksize = vp->v_mount->mnt_vfsstat.f_bsize; - if (blksize == 0) - blksize = 512; - rlength = roundup(rsize, blksize); - ATTR_PACK8(ab, rlength); - ab.actual.fileattr |= ATTR_FILE_RSRCALLOCSIZE; - } + /* already got the resource fork size/allocation above */ + if (al.fileattr & ATTR_FILE_RSRCLENGTH) { + ATTR_PACK8(ab, rlength); + ab.actual.fileattr |= ATTR_FILE_RSRCLENGTH; } - if (al.fileattr & ATTR_FILE_PROTECTION_CLASS) { + if (al.fileattr & ATTR_FILE_RSRCALLOCSIZE) { + ATTR_PACK8(ab, ralloc); + ab.actual.fileattr |= ATTR_FILE_RSRCALLOCSIZE; } } - + /* diagnostic */ if (!return_valid && (ab.fixedcursor - ab.base) != fixedsize) panic("packed field size mismatch; allocated %ld but packed %ld for common %08x vol %08x", @@ -1938,7 +2037,7 @@ getattrlist(proc_t p, struct getattrlist_args *uap, __unused int32_t *retval) nameiflags = NOTRIGGER | AUDITVNPATH1; if (!(uap->options & FSOPT_NOFOLLOW)) nameiflags |= FOLLOW; - NDINIT(&nd, LOOKUP, nameiflags, UIO_USERSPACE, uap->path, ctx); + NDINIT(&nd, LOOKUP, OP_GETATTR, nameiflags, UIO_USERSPACE, uap->path, ctx); if ((error = namei(&nd)) 
!= 0) goto out; @@ -2198,8 +2297,6 @@ setattrlist_internal(vnode_t vp, struct setattrlist_args *uap, proc_t p, vfs_con VFS_DEBUG(ctx, vp, "ATTRLIST - XXX device type change not implemented"); goto out; } - if (al.fileattr & ATTR_FILE_PROTECTION_CLASS) { - } /* * Validate and authorize. @@ -2325,10 +2422,10 @@ setattrlist(proc_t p, struct setattrlist_args *uap, __unused int32_t *retval) /* * Look up the file. */ - nameiflags = 0; + nameiflags = AUDITVNPATH1; if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW; - NDINIT(&nd, LOOKUP, nameiflags | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); + NDINIT(&nd, LOOKUP, OP_SETATTR, nameiflags, UIO_USERSPACE, uap->path, ctx); if ((error = namei(&nd)) != 0) goto out; vp = nd.ni_vp; diff --git a/bsd/vfs/vfs_bio.c b/bsd/vfs/vfs_bio.c index 69b9e8520..0d474ed28 100644 --- a/bsd/vfs/vfs_bio.c +++ b/bsd/vfs/vfs_bio.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -110,12 +110,13 @@ #include + #if BALANCE_QUEUES static __inline__ void bufqinc(int q); static __inline__ void bufqdec(int q); #endif -static int bcleanbuf(buf_t bp, boolean_t discard); +int bcleanbuf(buf_t bp, boolean_t discard); static int brecover_data(buf_t bp); static boolean_t incore(vnode_t vp, daddr64_t blkno); /* timeout is in msecs */ @@ -125,7 +126,13 @@ static void buf_reassign(buf_t bp, vnode_t newvp); static errno_t buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo); static int buf_iterprepare(vnode_t vp, struct buflists *, int flags); static void buf_itercomplete(vnode_t vp, struct buflists *, int flags); -boolean_t buffer_cache_gc(int); +static boolean_t buffer_cache_gc(int); +static buf_t buf_brelse_shadow(buf_t bp); +static void buf_free_meta_store(buf_t bp); + +static buf_t buf_create_shadow_internal(buf_t bp, boolean_t force_copy, + uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv); + __private_extern__ int bdwrite_internal(buf_t, int); @@ -156,6 +163,7 @@ long nbdwrite = 0; int blaundrycnt = 0; static int boot_nbuf_headers = 0; +static TAILQ_HEAD(delayqueue, buf) delaybufqueue; static TAILQ_HEAD(ioqueue, buf) iobufqueue; static TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES]; @@ -231,7 +239,7 @@ int lru_is_stale = LRU_IS_STALE; int age_is_stale = AGE_IS_STALE; int meta_is_stale = META_IS_STALE; - +#define MAXLAUNDRY 10 /* LIST_INSERT_HEAD() with assertions */ static __inline__ void @@ -278,7 +286,28 @@ bremhash(buf_t bp) *bp->b_hash.le_prev = (bp)->b_hash.le_next; } +/* + * buf_mtxp held. 
+ */ +static __inline__ void +bmovelaundry(buf_t bp) +{ + bp->b_whichq = BQ_LAUNDRY; + bp->b_timestamp = buf_timestamp(); + binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY); + blaundrycnt++; +} +static __inline__ void +buf_release_credentials(buf_t bp) +{ + if (IS_VALID_CRED(bp->b_rcred)) { + kauth_cred_unref(&bp->b_rcred); + } + if (IS_VALID_CRED(bp->b_wcred)) { + kauth_cred_unref(&bp->b_wcred); + } +} int @@ -315,6 +344,17 @@ buf_markdelayed(buf_t bp) { SET(bp->b_flags, B_DONE); } +void +buf_markclean(buf_t bp) { + + if (ISSET(bp->b_flags, B_DELWRI)) { + CLR(bp->b_flags, B_DELWRI); + + OSAddAtomicLong(-1, &nbdwrite); + buf_reassign(bp, bp->b_vp); + } +} + void buf_markeintr(buf_t bp) { @@ -571,15 +611,179 @@ buf_clone(buf_t bp, int io_offset, int io_size, void (*iodone)(buf_t, void *), v } +int +buf_shadow(buf_t bp) +{ + if (bp->b_lflags & BL_SHADOW) + return 1; + return 0; +} + + +buf_t +buf_create_shadow_priv(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg) +{ + return (buf_create_shadow_internal(bp, force_copy, external_storage, iodone, arg, 1)); +} + +buf_t +buf_create_shadow(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg) +{ + return (buf_create_shadow_internal(bp, force_copy, external_storage, iodone, arg, 0)); +} + + +static buf_t +buf_create_shadow_internal(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv) +{ + buf_t io_bp; + + KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_START, bp, 0, 0, 0, 0); + + if ( !(bp->b_flags & B_META) || (bp->b_lflags & BL_IOBUF)) { + + KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_END, bp, 0, 0, 0, 0); + return (NULL); + } +#ifdef BUF_MAKE_PRIVATE + if (bp->b_shadow_ref && bp->b_data_ref == 0 && external_storage == 0) + panic("buf_create_shadow: %p is in the private state (%d, %d)", bp, bp->b_shadow_ref, bp->b_data_ref); +#endif + io_bp = alloc_io_buf(bp->b_vp, priv); + + 
io_bp->b_flags = bp->b_flags & (B_META | B_ZALLOC | B_ASYNC | B_READ | B_FUA); + io_bp->b_blkno = bp->b_blkno; + io_bp->b_lblkno = bp->b_lblkno; + + if (iodone) { + io_bp->b_transaction = arg; + io_bp->b_iodone = iodone; + io_bp->b_flags |= B_CALL; + } + if (force_copy == FALSE) { + io_bp->b_bcount = bp->b_bcount; + io_bp->b_bufsize = bp->b_bufsize; + + if (external_storage) { + io_bp->b_datap = external_storage; +#ifdef BUF_MAKE_PRIVATE + io_bp->b_data_store = NULL; +#endif + } else { + io_bp->b_datap = bp->b_datap; +#ifdef BUF_MAKE_PRIVATE + io_bp->b_data_store = bp; +#endif + } + *(buf_t *)(&io_bp->b_orig) = bp; + + lck_mtx_lock_spin(buf_mtxp); + + io_bp->b_lflags |= BL_SHADOW; + io_bp->b_shadow = bp->b_shadow; + bp->b_shadow = io_bp; + bp->b_shadow_ref++; + +#ifdef BUF_MAKE_PRIVATE + if (external_storage) + io_bp->b_lflags |= BL_EXTERNAL; + else + bp->b_data_ref++; +#endif + lck_mtx_unlock(buf_mtxp); + } else { + if (external_storage) { +#ifdef BUF_MAKE_PRIVATE + io_bp->b_lflags |= BL_EXTERNAL; +#endif + io_bp->b_bcount = bp->b_bcount; + io_bp->b_bufsize = bp->b_bufsize; + io_bp->b_datap = external_storage; + } else { + allocbuf(io_bp, bp->b_bcount); + + io_bp->b_lflags |= BL_IOBUF_ALLOC; + } + bcopy((caddr_t)bp->b_datap, (caddr_t)io_bp->b_datap, bp->b_bcount); + +#ifdef BUF_MAKE_PRIVATE + io_bp->b_data_store = NULL; +#endif + } + KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, io_bp, 0); + + return (io_bp); +} + + +#ifdef BUF_MAKE_PRIVATE +errno_t +buf_make_private(buf_t bp) +{ + buf_t ds_bp; + buf_t t_bp; + struct buf my_buf; + + KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_START, bp, bp->b_shadow_ref, 0, 0, 0); + + if (bp->b_shadow_ref == 0 || bp->b_data_ref == 0 || ISSET(bp->b_lflags, BL_SHADOW)) { + + KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, EINVAL, 0); + return (EINVAL); + } + my_buf.b_flags = B_META; + my_buf.b_datap = (uintptr_t)NULL; + allocbuf(&my_buf, bp->b_bcount); + + bcopy((caddr_t)bp->b_datap, 
(caddr_t)my_buf.b_datap, bp->b_bcount); + + lck_mtx_lock_spin(buf_mtxp); + + for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) { + if ( !ISSET(bp->b_lflags, BL_EXTERNAL)) + break; + } + ds_bp = t_bp; + + if (ds_bp == NULL && bp->b_data_ref) + panic("buf_make_private: b_data_ref != 0 && ds_bp == NULL"); + + if (ds_bp && (bp->b_data_ref == 0 || bp->b_shadow_ref == 0)) + panic("buf_make_private: ref_count == 0 && ds_bp != NULL"); + + if (ds_bp == NULL) { + lck_mtx_unlock(buf_mtxp); + + buf_free_meta_store(&my_buf); + + KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, EINVAL, 0); + return (EINVAL); + } + for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) { + if ( !ISSET(t_bp->b_lflags, BL_EXTERNAL)) + t_bp->b_data_store = ds_bp; + } + ds_bp->b_data_ref = bp->b_data_ref; + + bp->b_data_ref = 0; + bp->b_datap = my_buf.b_datap; + + lck_mtx_unlock(buf_mtxp); + + KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, 0, 0); + return (0); +} +#endif + void buf_setfilter(buf_t bp, void (*filter)(buf_t, void *), void *transaction, - void **old_iodone, void **old_transaction) + void (**old_iodone)(buf_t, void *), void **old_transaction) { - if (old_iodone) - *old_iodone = (void *)(bp->b_iodone); + if (old_iodone) + *old_iodone = bp->b_iodone; if (old_transaction) - *old_transaction = (void *)(bp->b_transaction); + *old_transaction = bp->b_transaction; bp->b_transaction = transaction; bp->b_iodone = filter; @@ -884,6 +1088,13 @@ buf_strategy(vnode_t devvp, void *ap) vnode_t vp = bp->b_vp; int bmap_flags; errno_t error; +#if CONFIG_DTRACE + int dtrace_io_start_flag = 0; /* We only want to trip the io:::start + * probe once, with the true phisical + * block in place (b_blkno) + */ + +#endif if (vp == NULL || vp->v_type == VCHR || vp->v_type == VBLK) panic("buf_strategy: b_vp == NULL || vtype == VCHR | VBLK\n"); @@ -893,7 +1104,6 @@ buf_strategy(vnode_t devvp, void *ap) * end up issuing the I/O... 
*/ bp->b_dev = devvp->v_rdev; - DTRACE_IO1(start, buf_t, bp); if (bp->b_flags & B_READ) bmap_flags = VNODE_READ; @@ -909,6 +1119,7 @@ buf_strategy(vnode_t devvp, void *ap) * to deal with filesystem block sizes * that aren't equal to the page size */ + DTRACE_IO1(start, buf_t, bp); return (cluster_bp(bp)); } if (bp->b_blkno == bp->b_lblkno) { @@ -916,30 +1127,53 @@ buf_strategy(vnode_t devvp, void *ap) size_t contig_bytes; if ((error = VNOP_BLKTOOFF(vp, bp->b_lblkno, &f_offset))) { + DTRACE_IO1(start, buf_t, bp); buf_seterror(bp, error); buf_biodone(bp); return (error); } if ((error = VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))) { + DTRACE_IO1(start, buf_t, bp); buf_seterror(bp, error); buf_biodone(bp); return (error); } + + DTRACE_IO1(start, buf_t, bp); +#if CONFIG_DTRACE + dtrace_io_start_flag = 1; +#endif /* CONFIG_DTRACE */ + if ((bp->b_blkno == -1) || (contig_bytes == 0)) { /* Set block number to force biodone later */ bp->b_blkno = -1; buf_clear(bp); } - else if ((long)contig_bytes < bp->b_bcount) + else if ((long)contig_bytes < bp->b_bcount) { return (buf_strategy_fragmented(devvp, bp, f_offset, contig_bytes)); + } } + +#if CONFIG_DTRACE + if (dtrace_io_start_flag == 0) { + DTRACE_IO1(start, buf_t, bp); + dtrace_io_start_flag = 1; + } +#endif /* CONFIG_DTRACE */ + if (bp->b_blkno == -1) { buf_biodone(bp); return (0); } } + +#if CONFIG_DTRACE + if (dtrace_io_start_flag == 0) + DTRACE_IO1(start, buf_t, bp); +#endif /* CONFIG_DTRACE */ + /* * we can issue the I/O because... 
* either B_CLUSTER is set which @@ -1067,6 +1301,7 @@ int buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo) { buf_t bp; + int aflags; int error = 0; int must_rescan = 1; struct buflists local_iterblkhd; @@ -1097,6 +1332,7 @@ buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo) goto try_dirty_list; } while (!LIST_EMPTY(&local_iterblkhd)) { + bp = LIST_FIRST(&local_iterblkhd); LIST_REMOVE(bp, b_vnbufs); @@ -1108,7 +1344,12 @@ buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo) if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META))) continue; - if ( (error = (int)buf_acquire_locked(bp, BAC_REMOVE | BAC_SKIP_LOCKED, slpflag, slptimeo)) ) { + aflags = BAC_REMOVE; + + if ( !(flags & BUF_INVALIDATE_LOCKED) ) + aflags |= BAC_SKIP_LOCKED; + + if ( (error = (int)buf_acquire_locked(bp, aflags, slpflag, slptimeo)) ) { if (error == EDEADLK) /* * this buffer was marked B_LOCKED... @@ -1136,6 +1377,10 @@ buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo) } lck_mtx_unlock(buf_mtxp); + if (bp->b_flags & B_LOCKED) + KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 0, 0); + + CLR(bp->b_flags, B_LOCKED); SET(bp->b_flags, B_INVAL); buf_brelse(bp); @@ -1170,7 +1415,12 @@ buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo) if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META))) continue; - if ( (error = (int)buf_acquire_locked(bp, BAC_REMOVE | BAC_SKIP_LOCKED, slpflag, slptimeo)) ) { + aflags = BAC_REMOVE; + + if ( !(flags & BUF_INVALIDATE_LOCKED) ) + aflags |= BAC_SKIP_LOCKED; + + if ( (error = (int)buf_acquire_locked(bp, aflags, slpflag, slptimeo)) ) { if (error == EDEADLK) /* * this buffer was marked B_LOCKED... 
@@ -1198,6 +1448,10 @@ buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo) } lck_mtx_unlock(buf_mtxp); + if (bp->b_flags & B_LOCKED) + KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 1, 0); + + CLR(bp->b_flags, B_LOCKED); SET(bp->b_flags, B_INVAL); if (ISSET(bp->b_flags, B_DELWRI) && (flags & BUF_WRITE_DATA)) @@ -1360,6 +1614,19 @@ bremfree_locked(buf_t bp) { struct bqueues *dp = NULL; int whichq; + + whichq = bp->b_whichq; + + if (whichq == -1) { + if (bp->b_shadow_ref == 0) + panic("bremfree_locked: %p not on freelist", bp); + /* + * there are clones pointing to 'bp'... + * therefore, it was not put on a freelist + * when buf_brelse was last called on 'bp' + */ + return; + } /* * We only calculate the head of the freelist when removing * the last element of the list as that is the only time that @@ -1367,8 +1634,6 @@ bremfree_locked(buf_t bp) * * NB: This makes an assumption about how tailq's are implemented. */ - whichq = bp->b_whichq; - if (bp->b_freelist.tqe_next == NULL) { dp = &bufqueues[whichq]; @@ -1385,6 +1650,7 @@ bremfree_locked(buf_t bp) bp->b_whichq = -1; bp->b_timestamp = 0; + bp->b_shadow = 0; } /* @@ -1432,7 +1698,7 @@ brelvp_locked(buf_t bp) static void buf_reassign(buf_t bp, vnode_t newvp) { - register struct buflists *listheadp; + struct buflists *listheadp; if (newvp == NULL) { printf("buf_reassign: NULL"); @@ -1502,8 +1768,11 @@ bufinit(void) binsheadfree(bp, dp, BQ_EMPTY); binshash(bp, &invalhash); } - boot_nbuf_headers = nbuf_headers; + + TAILQ_INIT(&iobufqueue); + TAILQ_INIT(&delaybufqueue); + for (; i < nbuf_headers + niobuf_headers; i++) { bp = &buf_headers[i]; bufhdrinit(bp); @@ -1601,8 +1870,10 @@ bufzoneinit(void) meta_zones[i].mz_max, PAGE_SIZE, meta_zones[i].mz_name); + zone_change(meta_zones[i].mz_zone, Z_CALLERACCT, FALSE); } buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers"); + zone_change(buf_hdr_zone, Z_CALLERACCT, FALSE); } static __inline__ zone_t @@ -1853,7 +2124,7 @@ vn_bwrite(struct vnop_bwrite_args 
*ap) * headers, we can get in to the situation where "too" many * buf_bdwrite()s can create situation where the kernel can create * buffers faster than the disks can service. Doing a buf_bawrite() in - * cases were we have "too many" outstanding buf_bdwrite()s avoids that. + * cases where we have "too many" outstanding buf_bdwrite()s avoids that. */ __private_extern__ int bdwrite_internal(buf_t bp, int return_error) @@ -1955,6 +2226,116 @@ buf_bawrite(buf_t bp) } + +static void +buf_free_meta_store(buf_t bp) +{ + if (bp->b_bufsize) { + if (ISSET(bp->b_flags, B_ZALLOC)) { + zone_t z; + + z = getbufzone(bp->b_bufsize); + zfree(z, (void *)bp->b_datap); + } else + kmem_free(kernel_map, bp->b_datap, bp->b_bufsize); + + bp->b_datap = (uintptr_t)NULL; + bp->b_bufsize = 0; + } +} + + +static buf_t +buf_brelse_shadow(buf_t bp) +{ + buf_t bp_head; + buf_t bp_temp; + buf_t bp_return = NULL; +#ifdef BUF_MAKE_PRIVATE + buf_t bp_data; + int data_ref = 0; +#endif + lck_mtx_lock_spin(buf_mtxp); + + bp_head = (buf_t)bp->b_orig; + + if (bp_head->b_whichq != -1) + panic("buf_brelse_shadow: bp_head on freelist %d\n", bp_head->b_whichq); + +#ifdef BUF_MAKE_PRIVATE + if (bp_data = bp->b_data_store) { + bp_data->b_data_ref--; + /* + * snapshot the ref count so that we can check it + * outside of the lock... 
we only want the guy going + * from 1 -> 0 to try and release the storage + */ + data_ref = bp_data->b_data_ref; + } +#endif + KERNEL_DEBUG(0xbbbbc008 | DBG_FUNC_START, bp, bp_head, bp_head->b_shadow_ref, 0, 0); + + bp_head->b_shadow_ref--; + + for (bp_temp = bp_head; bp_temp && bp != bp_temp->b_shadow; bp_temp = bp_temp->b_shadow); + + if (bp_temp == NULL) + panic("buf_brelse_shadow: bp not on list %p", bp_head); + + bp_temp->b_shadow = bp_temp->b_shadow->b_shadow; + +#ifdef BUF_MAKE_PRIVATE + /* + * we're about to free the current 'owner' of the data buffer and + * there is at least one other shadow buf_t still pointing at it + * so transfer it to the first shadow buf left in the chain + */ + if (bp == bp_data && data_ref) { + if ((bp_data = bp_head->b_shadow) == NULL) + panic("buf_brelse_shadow: data_ref mismatch bp(%p)", bp); + + for (bp_temp = bp_data; bp_temp; bp_temp = bp_temp->b_shadow) + bp_temp->b_data_store = bp_data; + bp_data->b_data_ref = data_ref; + } +#endif + if (bp_head->b_shadow_ref == 0 && bp_head->b_shadow) + panic("buf_relse_shadow: b_shadow != NULL && b_shadow_ref == 0 bp(%p)", bp); + if (bp_head->b_shadow_ref && bp_head->b_shadow == 0) + panic("buf_relse_shadow: b_shadow == NULL && b_shadow_ref != 0 bp(%p)", bp); + + if (bp_head->b_shadow_ref == 0) { + if (!ISSET(bp_head->b_lflags, BL_BUSY)) { + + CLR(bp_head->b_flags, B_AGE); + bp_head->b_timestamp = buf_timestamp(); + + if (ISSET(bp_head->b_flags, B_LOCKED)) { + bp_head->b_whichq = BQ_LOCKED; + binstailfree(bp_head, &bufqueues[BQ_LOCKED], BQ_LOCKED); + } else { + bp_head->b_whichq = BQ_META; + binstailfree(bp_head, &bufqueues[BQ_META], BQ_META); + } + } else if (ISSET(bp_head->b_lflags, BL_WAITSHADOW)) { + CLR(bp_head->b_lflags, BL_WAITSHADOW); + + bp_return = bp_head; + } + } + lck_mtx_unlock(buf_mtxp); +#ifdef BUF_MAKE_PRIVATE + if (bp == bp_data && data_ref == 0) + buf_free_meta_store(bp); + + bp->b_data_store = NULL; +#endif + KERNEL_DEBUG(0xbbbbc008 | DBG_FUNC_END, bp, 0, 0, 0, 0); + 
+ return (bp_return); +} + + /* * Release a buffer on to the free lists. * Described in Bach (p. 46). @@ -1979,7 +2360,18 @@ buf_brelse(buf_t bp) bp->b_tag = 0; #endif if (bp->b_lflags & BL_IOBUF) { + buf_t shadow_master_bp = NULL; + + if (ISSET(bp->b_lflags, BL_SHADOW)) + shadow_master_bp = buf_brelse_shadow(bp); + else if (ISSET(bp->b_lflags, BL_IOBUF_ALLOC)) + buf_free_meta_store(bp); free_io_buf(bp); + + if (shadow_master_bp) { + bp = shadow_master_bp; + goto finish_shadow_master; + } return; } @@ -1999,7 +2391,7 @@ buf_brelse(buf_t bp) if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) { if (ISSET(bp->b_flags, B_FILTER)) { /* if necessary, call out */ void (*iodone_func)(struct buf *, void *) = bp->b_iodone; - void *arg = (void *)bp->b_transaction; + void *arg = bp->b_transaction; CLR(bp->b_flags, B_FILTER); /* but note callout done */ bp->b_iodone = NULL; @@ -2020,7 +2412,7 @@ buf_brelse(buf_t bp) kern_return_t kret; int upl_flags; - if ( (upl == NULL) ) { + if (upl == NULL) { if ( !ISSET(bp->b_flags, B_INVAL)) { kret = ubc_create_upl(bp->b_vp, ubc_blktooff(bp->b_vp, bp->b_lblkno), @@ -2082,6 +2474,9 @@ buf_brelse(buf_t bp) if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL) || (ISSET(bp->b_lflags, BL_WANTDEALLOC) && !ISSET(bp->b_flags, B_DELWRI))) { + + boolean_t delayed_buf_free_meta_store = FALSE; + /* * If it's invalid or empty, dissociate it from its vnode, * release its storage if B_META, and @@ -2091,34 +2486,34 @@ buf_brelse(buf_t bp) OSAddAtomicLong(-1, &nbdwrite); if (ISSET(bp->b_flags, B_META)) { - if (bp->b_bufsize) { - if (ISSET(bp->b_flags, B_ZALLOC)) { - zone_t z; - - z = getbufzone(bp->b_bufsize); - zfree(z, (void *)bp->b_datap); - } else - kmem_free(kernel_map, bp->b_datap, bp->b_bufsize); - - bp->b_datap = (uintptr_t)NULL; - bp->b_bufsize = 0; - } + if (bp->b_shadow_ref) + delayed_buf_free_meta_store = TRUE; + else + buf_free_meta_store(bp); } /* * nuke any credentials we were holding */ - if (IS_VALID_CRED(bp->b_rcred)) { - 
kauth_cred_unref(&bp->b_rcred); - } - if (IS_VALID_CRED(bp->b_wcred)) { - kauth_cred_unref(&bp->b_wcred); + buf_release_credentials(bp); + + lck_mtx_lock_spin(buf_mtxp); + + if (bp->b_shadow_ref) { + SET(bp->b_lflags, BL_WAITSHADOW); + + lck_mtx_unlock(buf_mtxp); + + return; } - CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA)); + if (delayed_buf_free_meta_store == TRUE) { - bufq = &bufqueues[BQ_EMPTY]; - bp->b_whichq = BQ_EMPTY; + lck_mtx_unlock(buf_mtxp); +finish_shadow_master: + buf_free_meta_store(bp); - lck_mtx_lock_spin(buf_mtxp); + lck_mtx_lock_spin(buf_mtxp); + } + CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA)); if (bp->b_vp) brelvp_locked(bp); @@ -2127,8 +2522,10 @@ buf_brelse(buf_t bp) BLISTNONE(bp); binshash(bp, &invalhash); - binsheadfree(bp, bufq, BQ_EMPTY); + bp->b_whichq = BQ_EMPTY; + binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY); } else { + /* * It has valid data. Put it on the end of the appropriate * queue, so that it'll stick around for as long as possible. @@ -2143,13 +2540,32 @@ buf_brelse(buf_t bp) whichq = BQ_LRU; /* valid data */ bufq = &bufqueues[whichq]; - CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE)); - bp->b_whichq = whichq; bp->b_timestamp = buf_timestamp(); - lck_mtx_lock_spin(buf_mtxp); - - binstailfree(bp, bufq, whichq); + lck_mtx_lock_spin(buf_mtxp); + + /* + * the buf_brelse_shadow routine doesn't take 'ownership' + * of the parent buf_t... it updates state that is protected by + * the buf_mtxp, and checks for BL_BUSY to determine whether to + * put the buf_t back on a free list. 
b_shadow_ref is protected + * by the lock, and since we have not yet cleared B_BUSY, we need + * to check it while holding the lock to insure that one of us + * puts this buf_t back on a free list when it is safe to do so + */ + if (bp->b_shadow_ref == 0) { + CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE)); + bp->b_whichq = whichq; + binstailfree(bp, bufq, whichq); + } else { + /* + * there are still cloned buf_t's pointing + * at this guy... need to keep it off the + * freelists until a buf_brelse is done on + * the last clone + */ + CLR(bp->b_flags, (B_ASYNC | B_NOCACHE)); + } } if (needbuffer) { /* @@ -2581,6 +2997,23 @@ buf_geteblk(int size) return (bp); } +uint32_t +buf_redundancy_flags(buf_t bp) +{ + return bp->b_redundancy_flags; +} + +void +buf_set_redundancy_flags(buf_t bp, uint32_t flags) +{ + SET(bp->b_redundancy_flags, flags); +} + +void +buf_clear_redundancy_flags(buf_t bp, uint32_t flags) +{ + CLR(bp->b_redundancy_flags, flags); +} /* * With UBC, there is no need to expand / shrink the file data @@ -2861,14 +3294,14 @@ getnewbuf(int slpflag, int slptimeo, int * queue) /* * Clean a buffer. - * Returns 0 is buffer is ready to use, + * Returns 0 if buffer is ready to use, * Returns 1 if issued a buf_bawrite() to indicate * that the buffer is not ready. 
* * buf_mtxp is held upon entry * returns with buf_mtxp locked */ -static int +int bcleanbuf(buf_t bp, boolean_t discard) { /* Remove from the queue */ @@ -2887,10 +3320,7 @@ bcleanbuf(buf_t bp, boolean_t discard) SET(bp->b_lflags, BL_WANTDEALLOC); } - bp->b_whichq = BQ_LAUNDRY; - bp->b_timestamp = buf_timestamp(); - binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY); - blaundrycnt++; + bmovelaundry(bp); lck_mtx_unlock(buf_mtxp); @@ -2926,30 +3356,12 @@ bcleanbuf(buf_t bp, boolean_t discard) BLISTNONE(bp); - if (ISSET(bp->b_flags, B_META)) { - vm_offset_t elem; - - elem = (vm_offset_t)bp->b_datap; - bp->b_datap = (uintptr_t)0xdeadbeef; - - if (ISSET(bp->b_flags, B_ZALLOC)) { - zone_t z; - - z = getbufzone(bp->b_bufsize); - zfree(z, (void *)elem); - } else - kmem_free(kernel_map, elem, bp->b_bufsize); - } + if (ISSET(bp->b_flags, B_META)) + buf_free_meta_store(bp); trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno); - /* nuke any credentials we were holding */ - if (IS_VALID_CRED(bp->b_rcred)) { - kauth_cred_unref(&bp->b_rcred); - } - if (IS_VALID_CRED(bp->b_wcred)) { - kauth_cred_unref(&bp->b_wcred); - } + buf_release_credentials(bp); /* If discarding, just move to the empty queue */ if (discard) { @@ -3163,24 +3575,6 @@ buf_biowait(buf_t bp) return (0); } -/* - * Wait for the callback operation on a B_CALL buffer to complete. - */ -void -buf_biowait_callback(buf_t bp) -{ - while (!ISSET(bp->b_lflags, BL_CALLDONE)) { - - lck_mtx_lock_spin(buf_mtxp); - - if (!ISSET(bp->b_lflags, BL_CALLDONE)) { - DTRACE_IO1(wait__start, buf_t, bp); - (void) msleep(bp, buf_mtxp, PDROP | (PRIBIO+1), "buf_biowait", NULL); - DTRACE_IO1(wait__done, buf_t, bp); - } else - lck_mtx_unlock(buf_mtxp); - } -} /* * Mark I/O complete on a buffer. 
@@ -3242,6 +3636,11 @@ buf_biodone(buf_t bp) else if (bp->b_flags & B_PAGEIO) code |= DKIO_PAGING; + if (bp->b_flags & B_THROTTLED_IO) + code |= DKIO_THROTTLE; + else if (bp->b_flags & B_PASSIVE) + code |= DKIO_PASSIVE; + KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE, bp, (uintptr_t)bp->b_vp, bp->b_resid, bp->b_error, 0); @@ -3252,11 +3651,14 @@ buf_biodone(buf_t bp) microuptime(&priority_IO_timestamp_for_root); hard_throttle_on_root = 0; } + /* * I/O was done, so don't believe - * the DIRTY state from VM anymore + * the DIRTY state from VM anymore... + * and we need to reset the THROTTLED/PASSIVE + * indicators */ - CLR(bp->b_flags, B_WASDIRTY); + CLR(bp->b_flags, (B_WASDIRTY | B_THROTTLED_IO | B_PASSIVE)); DTRACE_IO1(done, buf_t, bp); if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW)) @@ -3269,46 +3671,26 @@ buf_biodone(buf_t bp) if (ISSET(bp->b_flags, (B_CALL | B_FILTER))) { /* if necessary, call out */ void (*iodone_func)(struct buf *, void *) = bp->b_iodone; - void *arg = (void *)bp->b_transaction; + void *arg = bp->b_transaction; int callout = ISSET(bp->b_flags, B_CALL); + if (iodone_func == NULL) + panic("biodone: bp @ %p has NULL b_iodone!\n", bp); + CLR(bp->b_flags, (B_CALL | B_FILTER)); /* filters and callouts are one-shot */ bp->b_iodone = NULL; bp->b_transaction = NULL; - if (iodone_func == NULL) { - panic("biodone: bp @ %p has NULL b_iodone!\n", bp); - } else { - if (callout) - SET(bp->b_flags, B_DONE); /* note that it's done */ - (*iodone_func)(bp, arg); - } - if (callout) { - int need_wakeup = 0; + if (callout) + SET(bp->b_flags, B_DONE); /* note that it's done */ - /* + (*iodone_func)(bp, arg); + + if (callout) { + /* * assumes that the callback function takes * ownership of the bp and deals with releasing it if necessary - * BL_WANTED indicates that we've decided to wait on the - * completion of this I/O in a synchronous manner... we - * still call the callback function, but in addition we - * will do a wakeup... 
BL_CALLDONE indicates that the callback - * routine has completed and its ok for the waiter to take - * 'ownership' of this bp back */ - lck_mtx_lock_spin(buf_mtxp); - - if (bp->b_lflags & BL_WANTED) { - CLR(bp->b_lflags, BL_WANTED); - need_wakeup = 1; - } - SET(bp->b_lflags, BL_CALLDONE); - - lck_mtx_unlock(buf_mtxp); - - if (need_wakeup) - wakeup(bp); - goto biodone_done; } /* @@ -3390,8 +3772,8 @@ void vfs_bufstats() { int i, j, count; - register struct buf *bp; - register struct bqueues *dp; + struct buf *bp; + struct bqueues *dp; int counts[MAXBSIZE/CLBYTES+1]; static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" }; @@ -3418,7 +3800,7 @@ vfs_bufstats() } #endif /* DIAGNOSTIC */ -#define NRESERVEDIOBUFS 64 +#define NRESERVEDIOBUFS 128 buf_t @@ -3433,9 +3815,7 @@ alloc_io_buf(vnode_t vp, int priv) bufstats.bufs_iobufsleeps++; need_iobuffer = 1; - (void) msleep(&need_iobuffer, iobuffer_mtxp, PDROP | (PRIBIO+1), (const char *)"alloc_io_buf", NULL); - - lck_mtx_lock_spin(iobuffer_mtxp); + (void) msleep(&need_iobuffer, iobuffer_mtxp, PSPIN | (PRIBIO+1), (const char *)"alloc_io_buf", NULL); } TAILQ_REMOVE(&iobufqueue, bp, b_freelist); @@ -3457,6 +3837,7 @@ alloc_io_buf(vnode_t vp, int priv) bp->b_datap = 0; bp->b_flags = 0; bp->b_lflags = BL_BUSY | BL_IOBUF; + bp->b_redundancy_flags = 0; bp->b_blkno = bp->b_lblkno = 0; #ifdef JOE_DEBUG bp->b_owner = current_thread(); @@ -3551,6 +3932,8 @@ bcleanbuf_thread_init(void) thread_deallocate(thread); } +typedef int (*bcleanbufcontinuation)(int); + static void bcleanbuf_thread(void) { @@ -3562,10 +3945,9 @@ bcleanbuf_thread(void) lck_mtx_lock_spin(buf_mtxp); while ( (bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY])) == NULL) { - (void)msleep((void *)&bufqueues[BQ_LAUNDRY], buf_mtxp, PDROP | PRIBIO, "blaundry", NULL); - - lck_mtx_lock_spin(buf_mtxp); + (void)msleep0(&bufqueues[BQ_LAUNDRY], buf_mtxp, PRIBIO|PDROP, "blaundry", 0, (bcleanbufcontinuation)bcleanbuf_thread); } + /* * Remove from the queue */ @@ 
-3597,7 +3979,7 @@ bcleanbuf_thread(void) binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY); blaundrycnt++; - /* we never leave a busy page on the laundary queue */ + /* we never leave a busy page on the laundry queue */ CLR(bp->b_lflags, BL_BUSY); buf_busycount--; #ifdef JOE_DEBUG @@ -3606,12 +3988,18 @@ bcleanbuf_thread(void) #endif lck_mtx_unlock(buf_mtxp); - - if (loopcnt > 10) { - (void)tsleep((void *)&bufqueues[BQ_LAUNDRY], PRIBIO, "blaundry", 1); + + if (loopcnt > MAXLAUNDRY) { + /* + * bawrite_internal() can return errors if we're throttled. If we've + * done several I/Os and failed, give the system some time to unthrottle + * the vnode + */ + (void)tsleep((void *)&bufqueues[BQ_LAUNDRY], PRIBIO, "blaundry", 1); loopcnt = 0; } else { - (void)thread_block(THREAD_CONTINUE_NULL); + /* give other threads a chance to run */ + (void)thread_block(THREAD_CONTINUE_NULL); loopcnt++; } } @@ -3680,34 +4068,125 @@ buffer_cache_gc(int all) { buf_t bp; boolean_t did_large_zfree = FALSE; + boolean_t need_wakeup = FALSE; int now = buf_timestamp(); - uint32_t count = 0; + uint32_t found = 0, total_found = 0; + struct bqueues privq; int thresh_hold = BUF_STALE_THRESHHOLD; if (all) thresh_hold = 0; + /* + * We only care about metadata (incore storage comes from zalloc()). + * No more than 1024 buffers total, and only those not accessed within the + * last 30s. We will also only examine 128 buffers during a single grab + * of the lock in order to limit lock hold time. 
+ */ + lck_mtx_lock(buf_mtxp); + do { + found = 0; + TAILQ_INIT(&privq); + need_wakeup = FALSE; - lck_mtx_lock_spin(buf_mtxp); + while (((bp = TAILQ_FIRST(&bufqueues[BQ_META]))) && + (now > bp->b_timestamp) && + (now - bp->b_timestamp > thresh_hold) && + (found < BUF_MAX_GC_BATCH_SIZE)) { + + /* Remove from free list */ + bremfree_locked(bp); + found++; + +#ifdef JOE_DEBUG + bp->b_owner = current_thread(); + bp->b_tag = 12; +#endif + + /* If dirty, move to laundry queue and remember to do wakeup */ + if (ISSET(bp->b_flags, B_DELWRI)) { + SET(bp->b_lflags, BL_WANTDEALLOC); + + bmovelaundry(bp); + need_wakeup = TRUE; + + continue; + } + + /* + * Mark busy and put on private list. We could technically get + * away without setting BL_BUSY here. + */ + SET(bp->b_lflags, BL_BUSY); + buf_busycount++; - /* We only care about metadata (incore storage comes from zalloc()) */ - bp = TAILQ_FIRST(&bufqueues[BQ_META]); + /* + * Remove from hash and dissociate from vp. + */ + bremhash(bp); + if (bp->b_vp) { + brelvp_locked(bp); + } - /* Only collect buffers unused in the last N seconds. Note: ordered by timestamp. 
*/ - while ((bp != NULL) && ((now - bp->b_timestamp) > thresh_hold) && (all || (count < BUF_MAX_GC_COUNT))) { - int result, size; - boolean_t is_zalloc; + TAILQ_INSERT_TAIL(&privq, bp, b_freelist); + } - size = buf_size(bp); - is_zalloc = ISSET(bp->b_flags, B_ZALLOC); + if (found == 0) { + break; + } - result = bcleanbuf(bp, TRUE); - if ((result == 0) && is_zalloc && (size >= PAGE_SIZE)) { - /* We've definitely freed at least a page to a zone */ - did_large_zfree = TRUE; + /* Drop lock for batch processing */ + lck_mtx_unlock(buf_mtxp); + + /* Wakeup and yield for laundry if need be */ + if (need_wakeup) { + wakeup(&bufqueues[BQ_LAUNDRY]); + (void)thread_block(THREAD_CONTINUE_NULL); } - bp = TAILQ_FIRST(&bufqueues[BQ_META]); - count++; - } + + /* Clean up every buffer on private list */ + TAILQ_FOREACH(bp, &privq, b_freelist) { + /* Take note if we've definitely freed at least a page to a zone */ + if ((ISSET(bp->b_flags, B_ZALLOC)) && (buf_size(bp) >= PAGE_SIZE)) { + did_large_zfree = TRUE; + } + + trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno); + + /* Free Storage */ + buf_free_meta_store(bp); + + /* Release credentials */ + buf_release_credentials(bp); + + /* Prepare for moving to empty queue */ + CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED + | B_AGE | B_ASYNC | B_NOCACHE | B_FUA)); + bp->b_whichq = BQ_EMPTY; + BLISTNONE(bp); + } + + lck_mtx_lock(buf_mtxp); + + /* Back under lock, move them all to invalid hash and clear busy */ + TAILQ_FOREACH(bp, &privq, b_freelist) { + binshash(bp, &invalhash); + CLR(bp->b_lflags, BL_BUSY); + buf_busycount--; + +#ifdef JOE_DEBUG + if (bp->b_owner != current_thread()) { + panic("Buffer stolen from buffer_cache_gc()"); + } + bp->b_owner = current_thread(); + bp->b_tag = 13; +#endif + } + + /* And do a big bulk move to the empty queue */ + TAILQ_CONCAT(&bufqueues[BQ_EMPTY], &privq, b_freelist); + total_found += found; + + } while ((all || (total_found < BUF_MAX_GC_COUNT)) && (found == 
BUF_MAX_GC_BATCH_SIZE)); lck_mtx_unlock(buf_mtxp); diff --git a/bsd/vfs/vfs_cache.c b/bsd/vfs/vfs_cache.c index ba73d95a4..3096d1294 100644 --- a/bsd/vfs/vfs_cache.c +++ b/bsd/vfs/vfs_cache.c @@ -114,6 +114,7 @@ long numcache; /* number of cache entries allocated */ int desiredNodes; int desiredNegNodes; int ncs_negtotal; +int nc_disabled = 0; TAILQ_HEAD(, namecache) nchead; /* chain of all name cache entries */ TAILQ_HEAD(, namecache) neghead; /* chain of only negative cache entries */ @@ -309,8 +310,22 @@ build_path(vnode_t first_vp, char *buff, int buflen, int *outlen, int flags, vfs */ if (((vp->v_parent != NULLVP) && !fixhardlink) || (flags & BUILDPATH_NO_FS_ENTER)) { - vp = vp->v_parent; + /* + * In this if () block we are not allowed to enter the filesystem + * to conclusively get the most accurate parent identifier. + * As a result, if 'vp' does not identify '/' and it + * does not have a valid v_parent, then error out + * and disallow further path construction + */ + if ((vp->v_parent == NULLVP) && (rootvnode != vp)) { + /* Only '/' is allowed to have a NULL parent pointer */ + ret = EINVAL; + + /* The code below will exit early if 'tvp = vp' == NULL */ + } + vp = vp->v_parent; + /* * if the vnode we have in hand isn't a directory and it * has a v_parent, then we started with the resource fork @@ -808,11 +823,18 @@ void vnode_uncache_authorized_action(vnode_t vp, kauth_action_t action) } -boolean_t vnode_cache_is_authorized(vnode_t vp, vfs_context_t ctx, kauth_action_t action) +extern int bootarg_vnode_cache_defeat; /* default = 0, from bsd_init.c */ + +boolean_t +vnode_cache_is_authorized(vnode_t vp, vfs_context_t ctx, kauth_action_t action) { kauth_cred_t ucred; boolean_t retval = FALSE; + /* Boot argument to defeat rights caching */ + if (bootarg_vnode_cache_defeat) + return FALSE; + if ( (vp->v_mount->mnt_kern_flag & (MNTK_AUTH_OPAQUE | MNTK_AUTH_CACHE_TTL)) ) { /* * a TTL is enabled on the rights cache... 
handle it here @@ -937,7 +959,7 @@ boolean_t vnode_cache_is_stale(vnode_t vp) */ int cache_lookup_path(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, - vfs_context_t ctx, int *trailing_slash, int *dp_authorized, vnode_t last_dp) + vfs_context_t ctx, int *dp_authorized, vnode_t last_dp) { char *cp; /* pointer into pathname argument */ int vid; @@ -951,8 +973,12 @@ cache_lookup_path(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, unsigned int hash; int error = 0; +#if CONFIG_TRIGGERS + vnode_t trigger_vp; +#endif /* CONFIG_TRIGGERS */ + ucred = vfs_context_ucred(ctx); - *trailing_slash = 0; + ndp->ni_flag &= ~(NAMEI_TRAILINGSLASH); NAME_CACHE_LOCK_SHARED(); @@ -999,7 +1025,7 @@ cache_lookup_path(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, ndp->ni_pathlen--; if (*cp == '\0') { - *trailing_slash = 1; + ndp->ni_flag |= NAMEI_TRAILINGSLASH; *ndp->ni_next = '\0'; } } @@ -1073,10 +1099,12 @@ cache_lookup_path(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, *dp_authorized = 1; if ( (cnp->cn_flags & (ISLASTCN | ISDOTDOT)) ) { - if (cnp->cn_nameiop != LOOKUP) - break; - if (cnp->cn_flags & (LOCKPARENT | NOCACHE)) - break; + if (cnp->cn_nameiop != LOOKUP) + break; + if (cnp->cn_flags & LOCKPARENT) + break; + if (cnp->cn_flags & NOCACHE) + break; if (cnp->cn_flags & ISDOTDOT) { /* * Force directory hardlinks to go to @@ -1126,6 +1154,7 @@ cache_lookup_path(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, vp = NULL; break; } + if ( (mp = vp->v_mountedhere) && ((cnp->cn_flags & NOCROSSMOUNT) == 0)) { if (mp->mnt_realrootvp == NULLVP || mp->mnt_generation != mount_generation || @@ -1133,6 +1162,20 @@ cache_lookup_path(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, break; vp = mp->mnt_realrootvp; } + +#if CONFIG_TRIGGERS + /* + * After traversing all mountpoints stacked here, if we have a + * trigger in hand, resolve it. 
Note that we don't need to + * leave the fast path if the mount has already happened. + */ + if ((vp->v_resolve != NULL) && + (vp->v_resolve->vr_resolve_func != NULL)) { + break; + } +#endif /* CONFIG_TRIGGERS */ + + dp = vp; vp = NULLVP; @@ -1184,7 +1227,7 @@ cache_lookup_path(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, * immediately w/o waiting... it always succeeds */ vnode_get(dp); - } else if ( (vnode_getwithvid(dp, vid)) ) { + } else if ( (vnode_getwithvid_drainok(dp, vid)) ) { /* * failure indicates the vnode * changed identity or is being @@ -1202,7 +1245,7 @@ cache_lookup_path(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, } } if (vp != NULLVP) { - if ( (vnode_getwithvid(vp, vvid)) ) { + if ( (vnode_getwithvid_drainok(vp, vvid)) ) { vp = NULLVP; /* @@ -1219,9 +1262,24 @@ cache_lookup_path(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, } } } + ndp->ni_dvp = dp; ndp->ni_vp = vp; +#if CONFIG_TRIGGERS + trigger_vp = vp ? vp : dp; + if ((error == 0) && (trigger_vp != NULLVP) && vnode_isdir(trigger_vp)) { + error = vnode_trigger_resolve(trigger_vp, ndp, ctx); + if (error) { + if (vp) + vnode_put(vp); + if (dp) + vnode_put(dp); + goto errorout; + } + } +#endif /* CONFIG_TRIGGERS */ + errorout: /* * If we came into cache_lookup_path after an iteration of the lookup loop that @@ -1249,6 +1307,10 @@ cache_lookup_locked(vnode_t dvp, struct componentname *cnp) long namelen = cnp->cn_namelen; unsigned int hashval = (cnp->cn_hash & NCHASHMASK); + if (nc_disabled) { + return NULL; + } + ncpp = NCHHASH(dvp, cnp->cn_hash); LIST_FOREACH(ncp, ncpp, nc_hash) { if ((ncp->nc_dvp == dvp) && (ncp->nc_hashval == hashval)) { @@ -1328,6 +1390,10 @@ cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp) cnp->cn_hash = hash_string(cnp->cn_nameptr, cnp->cn_namelen); hashval = (cnp->cn_hash & NCHASHMASK); + if (nc_disabled) { + return 0; + } + NAME_CACHE_LOCK_SHARED(); relook: @@ -1485,6 +1551,9 @@ 
cache_enter_locked(struct vnode *dvp, struct vnode *vp, struct componentname *cn struct namecache *ncp, *negp; struct nchashhead *ncpp; + if (nc_disabled) + return; + /* * if the entry is for -ve caching vp is null */ @@ -1799,7 +1868,10 @@ cache_purge(vnode_t vp) struct namecache *ncp; kauth_cred_t tcred = NULL; - if ((LIST_FIRST(&vp->v_nclinks) == NULL) && (LIST_FIRST(&vp->v_ncchildren) == NULL) && (vp->v_cred == NOCRED)) + if ((LIST_FIRST(&vp->v_nclinks) == NULL) && + (LIST_FIRST(&vp->v_ncchildren) == NULL) && + (vp->v_cred == NOCRED) && + (vp->v_parent == NULLVP)) return; NAME_CACHE_LOCK(); @@ -1973,9 +2045,6 @@ add_name_internal(const char *name, uint32_t len, u_int hashval, boolean_t need_ uint32_t lock_index; char *ptr; - if (hashval == 0) { - hashval = hash_string(name, 0); - } /* * if the length already accounts for the null-byte, then * subtract one so later on we don't index past the end @@ -1984,6 +2053,10 @@ add_name_internal(const char *name, uint32_t len, u_int hashval, boolean_t need_ if (len > 0 && name[len-1] == '\0') { len--; } + if (hashval == 0) { + hashval = hash_string(name, len); + } + /* * take this lock 'shared' to keep the hash stable * if someone else decides to grow the pool they diff --git a/bsd/vfs/vfs_cluster.c b/bsd/vfs/vfs_cluster.c index 499056a3b..0e8bd67dd 100644 --- a/bsd/vfs/vfs_cluster.c +++ b/bsd/vfs/vfs_cluster.c @@ -82,6 +82,7 @@ #include #include #include +#include #include #include @@ -90,6 +91,8 @@ #include #include +#include + #if 0 #undef KERNEL_DEBUG #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT @@ -111,6 +114,8 @@ #define CL_DIRECT_IO 0x1000 #define CL_PASSIVE 0x2000 #define CL_IOSTREAMING 0x4000 +#define CL_CLOSE 0x8000 +#define CL_ENCRYPTED 0x10000 #define MAX_VECTOR_UPL_ELEMENTS 8 #define MAX_VECTOR_UPL_SIZE (2 * MAX_UPL_SIZE) * PAGE_SIZE @@ -122,6 +127,7 @@ extern void vector_upl_set_pagelist(upl_t); extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, u_int32_t); struct clios { + lck_mtx_t io_mtxp; u_int 
io_completed; /* amount of io that has currently completed */ u_int io_issued; /* amount of io that was successfully issued */ int io_error; /* error code of first error encountered */ @@ -131,7 +137,6 @@ struct clios { static lck_grp_t *cl_mtx_grp; static lck_attr_t *cl_mtx_attr; static lck_grp_attr_t *cl_mtx_grp_attr; -static lck_mtx_t *cl_mtxp; static lck_mtx_t *cl_transaction_mtxp; @@ -157,6 +162,8 @@ static int cluster_iodone(buf_t bp, void *callback_arg); static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags); static int cluster_hard_throttle_on(vnode_t vp, uint32_t); +static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name); + static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg); static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference); @@ -183,10 +190,10 @@ static void cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t files static int cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg); -static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int (*)(buf_t, void *), void *callback_arg); +static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *), void *callback_arg); static void sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg); -static void sparse_cluster_push(void **cmapp, vnode_t vp, off_t EOF, int push_flag, int (*)(buf_t, void *), void *callback_arg); +static void sparse_cluster_push(void **cmapp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*)(buf_t, void *), void *callback_arg); static void sparse_cluster_add(void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF, int (*)(buf_t, void *), void *callback_arg); static kern_return_t 
vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp); @@ -203,12 +210,20 @@ static kern_return_t vfs_drt_control(void **cmapp, int op_type); #define MAX_VECTS 16 #define MIN_DIRECT_WRITE_SIZE (4 * PAGE_SIZE) +#define WRITE_THROTTLE 6 +#define WRITE_THROTTLE_SSD 2 +#define WRITE_BEHIND 1 +#define WRITE_BEHIND_SSD 1 +#define PREFETCH 3 +#define PREFETCH_SSD 2 + #define IO_SCALE(vp, base) (vp->v_mount->mnt_ioscale * base) #define MAX_CLUSTER_SIZE(vp) (cluster_max_io_size(vp->v_mount, CL_WRITE)) -#define MAX_PREFETCH(vp, io_size) (io_size * IO_SCALE(vp, 3)) +#define MAX_PREFETCH(vp, size, is_ssd) (size * IO_SCALE(vp, (is_ssd && !ignore_is_ssd) ? PREFETCH_SSD : PREFETCH)) - -int speculative_reads_disabled = 0; +int ignore_is_ssd = 0; +int speculative_reads_disabled = 0; +uint32_t speculative_prefetch_max = (MAX_UPL_SIZE * 3); /* * throttle the number of async writes that @@ -235,15 +250,6 @@ cluster_init(void) { */ cl_mtx_attr = lck_attr_alloc_init(); - /* - * allocate and initialize mutex's used to protect updates and waits - * on the cluster_io context - */ - cl_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr); - - if (cl_mtxp == NULL) - panic("cluster_init: failed to allocate cl_mtxp"); - cl_transaction_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr); if (cl_transaction_mtxp == NULL) @@ -412,7 +418,7 @@ cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *c if (wbp->cl_number) { lck_mtx_lock(&wbp->cl_lockw); - cluster_try_push(wbp, vp, newEOF, PUSH_ALL | PUSH_SYNC, callback, callback_arg); + cluster_try_push(wbp, vp, newEOF, PUSH_ALL | PUSH_SYNC, 0, callback, callback_arg); lck_mtx_unlock(&wbp->cl_lockw); } @@ -450,6 +456,27 @@ cluster_hard_throttle_on(vnode_t vp, uint32_t hard_throttle) } +static void +cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name) +{ + + lck_mtx_lock(&iostate->io_mtxp); + + while ((iostate->io_issued - iostate->io_completed) > target) { + + 
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, + iostate->io_issued, iostate->io_completed, target, 0, 0); + + iostate->io_wanted = 1; + msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp, PRIBIO + 1, wait_name, NULL); + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, + iostate->io_issued, iostate->io_completed, target, 0, 0); + } + lck_mtx_unlock(&iostate->io_mtxp); +} + + static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags) { @@ -457,7 +484,7 @@ cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_fla int page_in = 0; int page_out = 0; - if (io_flags & B_PHYS) + if ((io_flags & (B_PHYS | B_CACHE)) == (B_PHYS | B_CACHE)) /* * direct write of any flavor, or a direct read that wasn't aligned */ @@ -517,33 +544,44 @@ cluster_iodone(buf_t bp, void *callback_arg) cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0); if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) { + boolean_t need_wakeup = FALSE; lck_mtx_lock_spin(cl_transaction_mtxp); bp->b_flags |= B_TDONE; + if (bp->b_flags & B_TWANTED) { + CLR(bp->b_flags, B_TWANTED); + need_wakeup = TRUE; + } for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) { - /* + /* * all I/O requests that are part of this transaction * have to complete before we can process it */ - if ( !(cbp->b_flags & B_TDONE)) { + if ( !(cbp->b_flags & B_TDONE)) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0); lck_mtx_unlock(cl_transaction_mtxp); + + if (need_wakeup == TRUE) + wakeup(bp); + return 0; } if (cbp->b_flags & B_EOT) - transaction_complete = TRUE; + transaction_complete = TRUE; } lck_mtx_unlock(cl_transaction_mtxp); + if (need_wakeup == TRUE) + wakeup(bp); + if (transaction_complete == FALSE) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, cbp_head, 0, 0, 0, 
0); - return 0; } } @@ -609,7 +647,7 @@ cluster_iodone(buf_t bp, void *callback_arg) * someone has issued multiple I/Os asynchrounsly * and is waiting for them to complete (streaming) */ - lck_mtx_lock_spin(cl_mtxp); + lck_mtx_lock_spin(&iostate->io_mtxp); if (error && iostate->io_error == 0) iostate->io_error = error; @@ -624,7 +662,7 @@ cluster_iodone(buf_t bp, void *callback_arg) iostate->io_wanted = 0; need_wakeup = 1; } - lck_mtx_unlock(cl_mtxp); + lck_mtx_unlock(&iostate->io_mtxp); if (need_wakeup) wakeup((caddr_t)&iostate->io_wanted); @@ -649,7 +687,7 @@ cluster_iodone(buf_t bp, void *callback_arg) ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags); } } - if ((b_flags & B_NEED_IODONE) && real_bp) { + if (real_bp) { if (error) { real_bp->b_flags |= B_ERROR; real_bp->b_error = error; @@ -735,27 +773,36 @@ cluster_wait_IO(buf_t cbp_head, int async) /* * async callback completion will not normally * generate a wakeup upon I/O completion... - * by setting BL_WANTED, we will force a wakeup + * by setting B_TWANTED, we will force a wakeup * to occur as any outstanding I/Os complete... - * I/Os already completed will have BL_CALLDONE already - * set and we won't block in buf_biowait_callback.. + * I/Os already completed will have B_TDONE already + * set and we won't cause us to block * note that we're actually waiting for the bp to have * completed the callback function... 
only then * can we safely take back ownership of the bp - * need the main buf mutex in order to safely - * update b_lflags */ - buf_list_lock(); + lck_mtx_lock_spin(cl_transaction_mtxp); for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) - cbp->b_lflags |= BL_WANTED; + cbp->b_flags |= B_TWANTED; - buf_list_unlock(); + lck_mtx_unlock(cl_transaction_mtxp); } for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) { - if (async) - buf_biowait_callback(cbp); - else + + if (async) { + while (!ISSET(cbp->b_flags, B_TDONE)) { + + lck_mtx_lock_spin(cl_transaction_mtxp); + + if (!ISSET(cbp->b_flags, B_TDONE)) { + DTRACE_IO1(wait__start, buf_t, cbp); + (void) msleep(cbp, cl_transaction_mtxp, PDROP | (PRIBIO+1), "cluster_wait_IO", NULL); + DTRACE_IO1(wait__done, buf_t, cbp); + } else + lck_mtx_unlock(cl_transaction_mtxp); + } + } else buf_biowait(cbp); } } @@ -781,7 +828,7 @@ cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, i * so that cluster_iodone sees the transaction as completed */ for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) - cbp->b_flags |= B_TDONE; + cbp->b_flags |= B_TDONE; error = cluster_iodone(*cbp_head, callback_arg); @@ -910,10 +957,9 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no else { u_int max_cluster; u_int max_cluster_size; - u_int max_prefetch; - + u_int scale; + max_cluster_size = MAX_CLUSTER_SIZE(vp); - max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ)); if (max_iosize > max_cluster_size) max_cluster = max_cluster_size; @@ -922,8 +968,16 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no if (size < max_cluster) max_cluster = size; + + if ((vp->v_mount->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd) + scale = WRITE_THROTTLE_SSD; + else + scale = WRITE_THROTTLE; - async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), (max_prefetch / max_cluster) - 1); + if (flags & CL_CLOSE) + scale += MAX_CLUSTERS; + + async_throttle = 
min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), ((scale * max_cluster_size) / max_cluster) - 1); } } } @@ -935,12 +989,14 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no io_flags |= B_IOSTREAMING; if (flags & CL_COMMIT) io_flags |= B_COMMIT_UPL; - if (flags & CL_PRESERVE) + if (flags & CL_DIRECT_IO) io_flags |= B_PHYS; - if (flags & CL_KEEPCACHED) - io_flags |= B_CACHE; + if (flags & (CL_PRESERVE | CL_KEEPCACHED)) + io_flags |= B_CACHE; if (flags & CL_PASSIVE) io_flags |= B_PASSIVE; + if (flags & CL_ENCRYPTED) + io_flags |= B_ENCRYPTED_IO; if (vp->v_flag & VSYSTEM) io_flags |= B_META; @@ -997,7 +1053,7 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no off_t e_offset; int pageout_flags; - if(upl_get_internal_vectorupl(upl)) + if (upl_get_internal_vectorupl(upl)) panic("Vector UPLs should not take this code-path\n"); /* * we're writing into a 'hole' @@ -1104,7 +1160,6 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no } if (vnode_pageout(vp, upl, trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) { error = EINVAL; - break; } e_offset = round_page_64(f_offset + 1); io_size = e_offset - f_offset; @@ -1133,6 +1188,11 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no */ size = 0; } + if (error) { + if (size == 0) + flags &= ~CL_COMMIT; + break; + } continue; } lblkno = (daddr64_t)(f_offset / PAGE_SIZE_64); @@ -1370,10 +1430,8 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no cbp_head = cbp; cbp_tail = cbp; - if ( (cbp_head->b_real_bp = real_bp) ) { - cbp_head->b_flags |= B_NEED_IODONE; + if ( (cbp_head->b_real_bp = real_bp) ) real_bp = (buf_t)NULL; - } } *(buf_t *)(&cbp->b_trans_head) = cbp_head; @@ -1479,7 +1537,7 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no * since we never really issued the io * just go ahead and adjust it 
back */ - lck_mtx_lock_spin(cl_mtxp); + lck_mtx_lock_spin(&iostate->io_mtxp); if (iostate->io_error == 0) iostate->io_error = error; @@ -1493,7 +1551,7 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no iostate->io_wanted = 0; need_wakeup = 1; } - lck_mtx_unlock(cl_mtxp); + lck_mtx_unlock(&iostate->io_mtxp); if (need_wakeup) wakeup((caddr_t)&iostate->io_wanted); @@ -1604,8 +1662,16 @@ cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct return; } - max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ)); + max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ), (vp->v_mount->mnt_kern_flag & MNTK_SSD)); + if ((max_prefetch / PAGE_SIZE) > speculative_prefetch_max) + max_prefetch = (speculative_prefetch_max * PAGE_SIZE); + + if (max_prefetch <= PAGE_SIZE) { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, + rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0); + return; + } if (extent->e_addr < rap->cl_maxra) { if ((rap->cl_maxra - extent->e_addr) > ((max_prefetch / PAGE_SIZE) / 4)) { @@ -1667,18 +1733,7 @@ cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offs off_t max_size; int local_flags; - if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) - /* - * if we know we're issuing this I/O to a virtual device (i.e. disk image) - * then we don't want to enforce this throttle... if we do, we can - * potentially deadlock since we're stalling the pageout thread at a time - * when the disk image might need additional memory (which won't be available - * if the pageout thread can't run)... 
instead we'll just depend on the throttle - * that the pageout thread now has in place to deal with external files - */ - local_flags = CL_PAGEOUT; - else - local_flags = CL_PAGEOUT | CL_THROTTLE; + local_flags = CL_PAGEOUT | CL_THROTTLE; if ((flags & UPL_IOSYNC) == 0) local_flags |= CL_ASYNC; @@ -1686,6 +1741,8 @@ cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offs local_flags |= CL_COMMIT; if ((flags & UPL_KEEPCACHED)) local_flags |= CL_KEEPCACHED; + if (flags & UPL_PAGING_ENCRYPTED) + local_flags |= CL_ENCRYPTED; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE, @@ -1762,6 +1819,8 @@ cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offse local_flags |= CL_COMMIT; if (flags & UPL_IOSTREAMING) local_flags |= CL_IOSTREAMING; + if (flags & UPL_PAGING_ENCRYPTED) + local_flags |= CL_ENCRYPTED; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE, @@ -1869,12 +1928,12 @@ cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t } /* * do a write through the cache if one of the following is true.... 
- * NOCACHE is not true and + * NOCACHE is not true or NODIRECT is true * the uio request doesn't target USERSPACE * otherwise, find out if we want the direct or contig variant for * the first vector in the uio request */ - if ( (flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ) + if ( ((flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ) retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE); if ( (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT) @@ -2027,6 +2086,8 @@ cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, in iostate.io_error = 0; iostate.io_wanted = 0; + lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); + mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; @@ -2207,23 +2268,9 @@ cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, in * if there are already too many outstanding writes * wait until some complete before issuing the next */ - if (iostate.io_issued > iostate.io_completed) { - - lck_mtx_lock(cl_mtxp); - - while ((iostate.io_issued - iostate.io_completed) > (max_upl_size * IO_SCALE(vp, 2))) { - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, max_upl_size * IO_SCALE(vp, 2), 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_direct", NULL); + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, max_upl_size * IO_SCALE(vp, 2), "cluster_write_direct"); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, max_upl_size * IO_SCALE(vp, 2), 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) { /* * one of the earlier writes we issued ran into a hard error @@ -2303,7 +2350,7 @@ cluster_write_direct(vnode_t vp, struct uio *uio, 
off_t oldEOF, off_t newEOF, in wait_for_dwrites: - if(retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) { + if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) { retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); reset_vector_run_state(); } @@ -2313,23 +2360,13 @@ cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, in * make sure all async writes issued as part of this stream * have completed before we return */ - lck_mtx_lock(cl_mtxp); - - while (iostate.io_issued != iostate.io_completed) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_direct", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - } - lck_mtx_unlock(cl_mtxp); + cluster_iostate_wait(&iostate, 0, "cluster_write_direct"); } if (iostate.io_error) retval = iostate.io_error; + lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); + if (io_req_size && retval == 0) { /* * we couldn't handle the tail of this request in DIRECT mode @@ -2392,6 +2429,8 @@ cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, iostate.io_error = 0; iostate.io_wanted = 0; + lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); + next_cwrite: io_size = *write_length; @@ -2480,22 +2519,9 @@ cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, * if there are already too many outstanding writes * wait until some have completed before issuing the next */ - if (iostate.io_issued > iostate.io_completed) { - lck_mtx_lock(cl_mtxp); - - while ((iostate.io_issued - iostate.io_completed) > (MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2))) { + if (iostate.io_issued > iostate.io_completed) + 
cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig"); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_contig", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) { /* * one of the earlier writes we issued ran into a hard error @@ -2539,25 +2565,14 @@ cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, * make sure all async writes that are part of this stream * have completed before we proceed */ - if (iostate.io_issued > iostate.io_completed) { - - lck_mtx_lock(cl_mtxp); - - while (iostate.io_issued != iostate.io_completed) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_contig", NULL); + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, 0, "cluster_write_contig"); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) error = iostate.io_error; + lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); + if (error == 0 && tail_size) error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg); @@ -2632,6 +2647,9 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old off_t zero_off; long long zero_cnt1; off_t zero_off1; + off_t write_off = 0; + int write_cnt = 0; + boolean_t first_pass = FALSE; struct cl_extent cl; struct cl_writebehind *wbp; int bflag; @@ -2713,7 +2731,16 @@ 
cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old retval, 0, 0, 0, 0); return (0); } - + if (uio) { + write_off = uio->uio_offset; + write_cnt = uio_resid(uio); + /* + * delay updating the sequential write info + * in the control block until we've obtained + * the lock for it + */ + first_pass = TRUE; + } while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) { /* * for this iteration of the loop, figure out where our starting point is @@ -3008,7 +3035,7 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old */ wbp->cl_number = 0; - sparse_cluster_push(&(wbp->cl_scmap), vp, newEOF, PUSH_ALL, callback, callback_arg); + sparse_cluster_push(&(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg); /* * no clusters of either type present at this point * so just go directly to start_new_cluster since @@ -3017,7 +3044,17 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old * to avoid the deadlock with sparse_cluster_push */ goto start_new_cluster; - } + } + if (first_pass) { + if (write_off == wbp->cl_last_write) + wbp->cl_seq_written += write_cnt; + else + wbp->cl_seq_written = write_cnt; + + wbp->cl_last_write = write_off + write_cnt; + + first_pass = FALSE; + } if (wbp->cl_number == 0) /* * no clusters currently present @@ -3132,14 +3169,27 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old */ goto delay_io; - if (wbp->cl_number < MAX_CLUSTERS) + if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && + wbp->cl_number == MAX_CLUSTERS && + wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) { + uint32_t n; + + if (vp->v_mount->mnt_kern_flag & MNTK_SSD) + n = WRITE_BEHIND_SSD; + else + n = WRITE_BEHIND; + + while (n--) + cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg); + } + if (wbp->cl_number < MAX_CLUSTERS) { /* * we didn't find an existing cluster to * merge into, but 
there's room to start * a new one */ goto start_new_cluster; - + } /* * no exisitng cluster to merge with and no * room to start a new one... we'll try @@ -3157,7 +3207,7 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old */ if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) { - ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, callback, callback_arg); + ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg); } /* @@ -3176,18 +3226,6 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old continue; } - /* - * we pushed one cluster successfully, so we must be sequentially writing this file - * otherwise, we would have failed and fallen into the sparse cluster support - * so let's take the opportunity to push out additional clusters... - * this will give us better I/O locality if we're in a copy loop - * (i.e. 
we won't jump back and forth between the read and write points - */ - if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) { - while (wbp->cl_number) - cluster_try_push(wbp, vp, newEOF, 0, callback, callback_arg); - } - start_new_cluster: wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr; wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr; @@ -3342,19 +3380,25 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file struct cl_extent extent; int bflag; int take_reference = 1; +#if CONFIG_EMBEDDED struct uthread *ut; +#endif /* CONFIG_EMBEDDED */ int policy = IOPOL_DEFAULT; - + boolean_t iolock_inited = FALSE; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START, (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0); +#if !CONFIG_EMBEDDED + policy = proc_get_task_selfdiskacc(); +#else /* !CONFIG_EMBEDDED */ policy = current_proc()->p_iopol_disk; ut = get_bsdthread_info(current_thread()); if (ut->uu_iopol_disk != IOPOL_DEFAULT) policy = ut->uu_iopol_disk; +#endif /* !CONFIG_EMBEDDED */ if (policy == IOPOL_THROTTLE || (flags & IO_NOCACHE)) take_reference = 0; @@ -3365,7 +3409,7 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file bflag = 0; max_io_size = cluster_max_io_size(vp->v_mount, CL_READ); - max_prefetch = MAX_PREFETCH(vp, max_io_size); + max_prefetch = MAX_PREFETCH(vp, max_io_size, (vp->v_mount->mnt_kern_flag & MNTK_SSD)); max_rd_size = max_prefetch; last_request_offset = uio->uio_offset + io_req_size; @@ -3464,7 +3508,7 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file io_requested = io_resid; - retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, last_ioread_offset == 0 ? 
take_reference : 0); + retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, take_reference); xsize = io_requested - io_resid; @@ -3576,6 +3620,11 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file * we may have to clip the size of it to keep from reading past * the end of the last physical block associated with the file */ + if (iolock_inited == FALSE) { + lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); + + iolock_inited = TRUE; + } upl_offset = start_pg * PAGE_SIZE; io_size = (last_pg - start_pg) * PAGE_SIZE; @@ -3588,6 +3637,18 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg); + + if (rap) { + if (extent.e_addr < rap->cl_maxra) { + /* + * we've just issued a read for a block that should have been + * in the cache courtesy of the read-ahead engine... 
something + * has gone wrong with the pipeline, so reset the read-ahead + * logic which will cause us to restart from scratch + */ + rap->cl_maxra = 0; + } + } } if (error == 0) { /* @@ -3666,22 +3727,9 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file rap->cl_lastr = extent.e_addr; } } - if (iostate.io_issued > iostate.io_completed) { + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); - lck_mtx_lock(cl_mtxp); - - while (iostate.io_issued != iostate.io_completed) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_copy", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) error = iostate.io_error; else { @@ -3693,6 +3741,9 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file io_req_size -= (val_size - io_requested); } + } else { + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); } if (start_pg < last_pg) { /* @@ -3773,6 +3824,20 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file } } } + if (iolock_inited == TRUE) { + if (iostate.io_issued > iostate.io_completed) { + /* + * cluster_io returned an error after it + * had already issued some I/O. we need + * to wait for that I/O to complete before + * we can destroy the iostate mutex... 
+ * 'retval' already contains the early error + * so no need to pick it up from iostate.io_error + */ + cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); + } + lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); + } if (rap != NULL) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END, (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0); @@ -3819,6 +3884,7 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t max_upl_size; u_int32_t max_rd_size; u_int32_t max_rd_ahead; + boolean_t strict_uncached_IO = FALSE; u_int32_t vector_upl_iosize = 0; int issueVectorUPL = 0,useVectorUPL = (uio->uio_iovcnt > 1); @@ -3835,6 +3901,7 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, max_rd_ahead = max_rd_size * IO_SCALE(vp, 2); io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO; + if (flags & IO_PASSIVE) io_flag |= CL_PASSIVE; @@ -3843,6 +3910,8 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, iostate.io_error = 0; iostate.io_wanted = 0; + lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); + devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; @@ -3862,6 +3931,9 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, */ devblocksize = PAGE_SIZE; } + + strict_uncached_IO = ubc_strict_uncached_IO(vp); + next_dread: io_req_size = *read_length; iov_base = uio_curriovbase(uio); @@ -3913,8 +3985,9 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, * cluster_copy_ubc_data returns the resid * in io_size */ - retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0); - + if (strict_uncached_IO == FALSE) { + retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0); + } /* * calculate the number of bytes actually copied * starting size - residual @@ -3991,21 +4064,26 @@ 
cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, */ goto wait_for_dreads; } - if ((xsize = io_size) > max_rd_size) - xsize = max_rd_size; - io_size = 0; + if (strict_uncached_IO == FALSE) { - ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size); + if ((xsize = io_size) > max_rd_size) + xsize = max_rd_size; - if (io_size == 0) { - /* - * a page must have just come into the cache - * since the first page in this range is no - * longer absent, go back and re-evaluate - */ - continue; + io_size = 0; + + ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size); + + if (io_size == 0) { + /* + * a page must have just come into the cache + * since the first page in this range is no + * longer absent, go back and re-evaluate + */ + continue; + } } + iov_base = uio_curriovbase(uio); upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK); @@ -4097,22 +4175,9 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, * if there are already too many outstanding reads * wait until some have completed before issuing the next read */ - if (iostate.io_issued > iostate.io_completed) { + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct"); - lck_mtx_lock(cl_mtxp); - - while ((iostate.io_issued - iostate.io_completed) > max_rd_ahead) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, max_rd_ahead, 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_direct", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, max_rd_ahead, 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) { /* * one of the earlier reads we issued ran into a hard error @@ -4191,25 +4256,14 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t 
filesize, int *read_type, * make sure all async reads that are part of this stream * have completed before we return */ - if (iostate.io_issued > iostate.io_completed) { + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, 0, "cluster_read_direct"); - lck_mtx_lock(cl_mtxp); - - while (iostate.io_issued != iostate.io_completed) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_direct", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) retval = iostate.io_error; + lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); + if (io_req_size && retval == 0) { /* * we couldn't handle the tail of this request in DIRECT mode @@ -4273,6 +4327,8 @@ cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, iostate.io_error = 0; iostate.io_wanted = 0; + lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); + next_cread: io_size = *read_length; @@ -4370,21 +4426,9 @@ cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, * if there are already too many outstanding reads * wait until some have completed before issuing the next */ - if (iostate.io_issued > iostate.io_completed) { - lck_mtx_lock(cl_mtxp); - - while ((iostate.io_issued - iostate.io_completed) > (MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2))) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_contig", NULL); + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig"); - 
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) { /* * one of the earlier reads we issued ran into a hard error @@ -4425,25 +4469,14 @@ cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, * make sure all async reads that are part of this stream * have completed before we proceed */ - if (iostate.io_issued > iostate.io_completed) { - - lck_mtx_lock(cl_mtxp); - - while (iostate.io_issued != iostate.io_completed) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_contig", NULL); + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, 0, "cluster_read_contig"); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) error = iostate.io_error; + lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); + if (error == 0 && tail_size) error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ, callback, callback_arg); @@ -4787,7 +4820,7 @@ cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *ca lck_mtx_unlock(&wbp->cl_lockw); - sparse_cluster_push(&scmap, vp, ubc_getsize(vp), PUSH_ALL | IO_PASSIVE, callback, callback_arg); + sparse_cluster_push(&scmap, vp, ubc_getsize(vp), PUSH_ALL, flags | IO_PASSIVE, callback, callback_arg); lck_mtx_lock(&wbp->cl_lockw); @@ -4796,11 +4829,11 @@ cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *ca if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0) wakeup((caddr_t)&wbp->cl_sparse_pushes); } else { - sparse_cluster_push(&(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL | IO_PASSIVE, callback, callback_arg); + 
sparse_cluster_push(&(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags | IO_PASSIVE, callback, callback_arg); } retval = 1; } else { - retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL | IO_PASSIVE, callback, callback_arg); + retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags | IO_PASSIVE, callback, callback_arg); } lck_mtx_unlock(&wbp->cl_lockw); @@ -4861,7 +4894,7 @@ cluster_release(struct ubc_info *ubc) static int -cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int (*callback)(buf_t, void *), void *callback_arg) +cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg) { int cl_index; int cl_index1; @@ -4944,15 +4977,15 @@ cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_fla int flags; struct cl_extent cl; + flags = io_flags & (IO_PASSIVE|IO_CLOSE); + /* * try to push each cluster in turn... */ if (l_clusters[cl_index].io_flags & CLW_IONOCACHE) - flags = IO_NOCACHE; - else - flags = 0; + flags |= IO_NOCACHE; - if ((l_clusters[cl_index].io_flags & CLW_IOPASSIVE) || (push_flag & IO_PASSIVE)) + if (l_clusters[cl_index].io_flags & CLW_IOPASSIVE) flags |= IO_PASSIVE; if (push_flag & PUSH_SYNC) @@ -5057,9 +5090,9 @@ cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*c kern_return_t kret; if (flags & IO_PASSIVE) - bflag = CL_PASSIVE; + bflag = CL_PASSIVE; else - bflag = 0; + bflag = 0; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START, (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0); @@ -5186,6 +5219,9 @@ cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*c if ( !(flags & IO_SYNC)) io_flags |= CL_ASYNC; + if (flags & IO_CLOSE) + io_flags |= CL_CLOSE; + retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, io_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); @@ -5237,7 +5273,7 
@@ sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*c * from the write-behind context (the cluster_push case), the wb lock is not held */ static void -sparse_cluster_push(void **scmap, vnode_t vp, off_t EOF, int push_flag, int (*callback)(buf_t, void *), void *callback_arg) +sparse_cluster_push(void **scmap, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg) { struct cl_extent cl; off_t offset; @@ -5255,7 +5291,7 @@ sparse_cluster_push(void **scmap, vnode_t vp, off_t EOF, int push_flag, int (*ca cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64); cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64); - cluster_push_now(vp, &cl, EOF, push_flag & IO_PASSIVE, callback, callback_arg); + cluster_push_now(vp, &cl, EOF, io_flags & (IO_PASSIVE|IO_CLOSE), callback, callback_arg); if ( !(push_flag & PUSH_ALL) ) break; @@ -5285,7 +5321,7 @@ sparse_cluster_add(void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF, in * only a partial update was done * push out some pages and try again */ - sparse_cluster_push(scmap, vp, EOF, 0, callback, callback_arg); + sparse_cluster_push(scmap, vp, EOF, 0, 0, callback, callback_arg); offset += (new_dirty * PAGE_SIZE_64); length -= (new_dirty * PAGE_SIZE); @@ -5308,9 +5344,9 @@ cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t int bflag; if (flags & IO_PASSIVE) - bflag = CL_PASSIVE; + bflag = CL_PASSIVE; else - bflag = 0; + bflag = 0; upl_flags = UPL_SET_LITE; @@ -5479,7 +5515,7 @@ cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int m io_size = *io_resid; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START, - (int)uio->uio_offset, 0, io_size, 0, 0); + (int)uio->uio_offset, io_size, mark_dirty, take_reference, 0); control = ubc_getobject(vp, UBC_FLAGS_NONE); diff --git a/bsd/vfs/vfs_conf.c b/bsd/vfs/vfs_conf.c index 529129d9c..a4a962b66 100644 --- a/bsd/vfs/vfs_conf.c +++ b/bsd/vfs/vfs_conf.c @@ 
-86,8 +86,8 @@ struct mount *rootfs; struct vnode *rootvnode; #ifdef CONFIG_IMGSRC_ACCESS -struct vnode *imgsrc_rootvnode; -#endif /* IMGSRC_ACESS */ +struct vnode *imgsrc_rootvnodes[MAX_IMAGEBOOT_NESTING]; /* [0] -> source volume, [1] -> first disk image */ +#endif /* CONFIG_IMGSRC_ACCESS */ int (*mountroot)(void) = NULL; @@ -102,7 +102,6 @@ extern struct vfsops nfs_vfsops; extern int nfs_mountroot(void); extern struct vfsops afs_vfsops; extern struct vfsops null_vfsops; -extern struct vfsops union_vfsops; extern struct vfsops devfs_vfsops; /* @@ -117,7 +116,7 @@ typedef int (*mountroot_t)(mount_t, vnode_t, vfs_context_t); static struct vfstable vfstbllist[] = { /* HFS/HFS+ Filesystem */ #if HFS - { &hfs_vfsops, "hfs", 17, 0, (MNT_LOCAL | MNT_DOVOLFS), hfs_mountroot, NULL, 0, 0, VFC_VFSLOCALARGS | VFC_VFSREADDIR_EXTENDED | VFS_THREAD_SAFE_FLAG | VFC_VFS64BITREADY | VFC_VFSVNOP_PAGEOUTV2, NULL, 0}, + { &hfs_vfsops, "hfs", 17, 0, (MNT_LOCAL | MNT_DOVOLFS), hfs_mountroot, NULL, 0, 0, VFC_VFSLOCALARGS | VFC_VFSREADDIR_EXTENDED | VFS_THREAD_SAFE_FLAG | VFC_VFS64BITREADY | VFC_VFSVNOP_PAGEOUTV2 | VFC_VFSVNOP_PAGEINV2, NULL, 0}, #endif /* Memory-based Filesystem */ @@ -140,18 +139,6 @@ static struct vfstable vfstbllist[] = { #endif #endif /* __LP64__ */ - /* Loopback (Minimal) Filesystem Layer */ -#ifndef __LP64__ -#if NULLFS - { &null_vfsops, "loopback", 9, 0, 0, NULL, NULL, 0, 0, VFC_VFSGENERICARGS , NULL, 0}, -#endif -#endif /* __LP64__ */ - - /* Union (translucent) Filesystem */ -#if UNION - { &union_vfsops, "unionfs", 15, 0, 0, NULL, NULL, 0, 0, VFC_VFSGENERICARGS | VFS_THREAD_SAFE_FLAG | VFC_VFS64BITREADY, NULL, 0}, -#endif - /* Device Filesystem */ #if DEVFS #if CONFIG_MACF @@ -214,7 +201,6 @@ extern struct vnodeopv_desc hfs_vnodeop_opv_desc; extern struct vnodeopv_desc hfs_std_vnodeop_opv_desc; extern struct vnodeopv_desc hfs_specop_opv_desc; extern struct vnodeopv_desc hfs_fifoop_opv_desc; -extern struct vnodeopv_desc union_vnodeop_opv_desc; extern struct 
vnodeopv_desc devfs_vnodeop_opv_desc; extern struct vnodeopv_desc devfs_spec_vnodeop_opv_desc; #if FDESC @@ -241,9 +227,6 @@ struct vnodeopv_desc *vfs_opv_descs[] = { &fifo_nfsv4nodeop_opv_desc, #endif #endif -#if NULLFS - &null_vnodeop_opv_desc, -#endif #if HFS &hfs_vnodeop_opv_desc, &hfs_std_vnodeop_opv_desc, @@ -252,9 +235,6 @@ struct vnodeopv_desc *vfs_opv_descs[] = { &hfs_fifoop_opv_desc, #endif #endif -#if UNION - &union_vnodeop_opv_desc, -#endif #if DEVFS &devfs_vnodeop_opv_desc, &devfs_spec_vnodeop_opv_desc, diff --git a/bsd/vfs/vfs_fsevents.c b/bsd/vfs/vfs_fsevents.c index e09f990dc..0132a60dd 100644 --- a/bsd/vfs/vfs_fsevents.c +++ b/bsd/vfs/vfs_fsevents.c @@ -206,6 +206,7 @@ fsevents_internal_init(void) // ever grow beyond what we initially filled it with zone_change(event_zone, Z_EXHAUST, TRUE); zone_change(event_zone, Z_COLLECT, FALSE); + zone_change(event_zone, Z_CALLERACCT, FALSE); } static void @@ -1821,6 +1822,11 @@ fmod_watch(fs_event_watcher *watcher, struct uio *uio) if (watcher->event_list[kfse->type] == FSE_REPORT && watcher_cares_about_dev(watcher, kfse->dev)) { + if (last_event_ptr == kfse) { + last_event_ptr = NULL; + last_event_type = -1; + last_coalesced_time = 0; + } error = copy_out_kfse(watcher, kfse, uio); if (error != 0) { // if an event won't fit or encountered an error while @@ -2667,18 +2673,24 @@ get_fse_info(struct vnode *vp, fse_info *fse, __unused vfs_context_t ctx) memset(fse, 0, sizeof(fse_info)); return -1; } - - fse->ino = (ino64_t)va.va_fileid; - fse->dev = (dev_t)va.va_fsid; - fse->mode = (int32_t)vnode_vttoif(vnode_vtype(vp)) | va.va_mode; - fse->uid = (uid_t)va.va_uid; - fse->gid = (gid_t)va.va_gid; + + return vnode_get_fse_info_from_vap(vp, fse, &va); +} + +int +vnode_get_fse_info_from_vap(vnode_t vp, fse_info *fse, struct vnode_attr *vap) +{ + fse->ino = (ino64_t)vap->va_fileid; + fse->dev = (dev_t)vap->va_fsid; + fse->mode = (int32_t)vnode_vttoif(vnode_vtype(vp)) | vap->va_mode; + fse->uid = (uid_t)vap->va_uid; + 
fse->gid = (gid_t)vap->va_gid; if (vp->v_flag & VISHARDLINK) { fse->mode |= FSE_MODE_HLINK; if (vp->v_type == VDIR) { - fse->nlink = (uint64_t)va.va_dirlinkcount; + fse->nlink = (uint64_t)vap->va_dirlinkcount; } else { - fse->nlink = (uint64_t)va.va_nlink; + fse->nlink = (uint64_t)vap->va_nlink; } } diff --git a/bsd/vfs/vfs_fslog.c b/bsd/vfs/vfs_fslog.c index 618e4546c..580ea60b4 100644 --- a/bsd/vfs/vfs_fslog.c +++ b/bsd/vfs/vfs_fslog.c @@ -42,6 +42,8 @@ #include #include +#include + /* String to append as format modifier for each key-value pair */ #define FSLOG_KEYVAL_FMT "[%s %s] " #define FSLOG_KEYVAL_FMT_LEN (sizeof(FSLOG_KEYVAL_FMT) - 1) @@ -341,12 +343,10 @@ static int escape_str(char *str, int len, int buflen) void fslog_fs_corrupt(struct mount *mnt) { if (mnt != NULL) { - if (mnt->mnt_vfsstat.f_mntonname != NULL) { - fslog_err(FSLOG_MSG_SINGLE, - FSLOG_KEY_ERR_TYPE, FSLOG_VAL_ERR_TYPE_FS, - FSLOG_KEY_MNTPT, mnt->mnt_vfsstat.f_mntonname, - NULL); - } + fslog_err(FSLOG_MSG_SINGLE, + FSLOG_KEY_ERR_TYPE, FSLOG_VAL_ERR_TYPE_FS, + FSLOG_KEY_MNTPT, mnt->mnt_vfsstat.f_mntonname, + NULL); } return; @@ -458,3 +458,73 @@ void fslog_io_error(const buf_t bp) return; } + +static void +_fslog_extmod_msgtracer_internal(int level, const char *facility, int num_pairs, ...) +{ + va_list ap; + + va_start(ap, num_pairs); + (void) fslog_asl_msg(level, facility, + num_pairs, ap, NULL); + va_end(ap); +} + +/* Log information about external modification of a process, + * using MessageTracer formatting. Assumes that both the caller + * and target are appropriately locked. + * Currently prints following information - + * 1. Caller process name (truncated to 16 characters) + * 2. Caller process Mach-O UUID + * 3. Target process name (truncated to 16 characters) + * 4. 
Target process Mach-O UUID + */ +void +fslog_extmod_msgtracer(proc_t caller, proc_t target) +{ + if ((caller != PROC_NULL) && (target != PROC_NULL)) { + + /* + * Print into buffer large enough for "ThisIsAnApplicat(BC223DD7-B314-42E0-B6B0-C5D2E6638337)", + * including space for escaping, and NUL byte included in sizeof(uuid_string_t). + */ + + uuid_string_t uuidstr; + char c_name[2*MAXCOMLEN + 2 /* () */ + sizeof(uuid_string_t)]; + char t_name[2*MAXCOMLEN + 2 /* () */ + sizeof(uuid_string_t)]; + + strlcpy(c_name, caller->p_comm, sizeof(c_name)); + uuid_unparse_upper(caller->p_uuid, uuidstr); + strlcat(c_name, "(", sizeof(c_name)); + strlcat(c_name, uuidstr, sizeof(c_name)); + strlcat(c_name, ")", sizeof(c_name)); + if (0 != escape_str(c_name, strlen(c_name), sizeof(c_name))) { + return; + } + + strlcpy(t_name, target->p_comm, sizeof(t_name)); + uuid_unparse_upper(target->p_uuid, uuidstr); + strlcat(t_name, "(", sizeof(t_name)); + strlcat(t_name, uuidstr, sizeof(t_name)); + strlcat(t_name, ")", sizeof(t_name)); + if (0 != escape_str(t_name, strlen(t_name), sizeof(t_name))) { + return; + } + +#if DEBUG + printf("EXTMOD: %s(%d) -> %s(%d)\n", + c_name, + proc_pid(caller), + t_name, + proc_pid(target)); +#endif + + _fslog_extmod_msgtracer_internal(LOG_DEBUG, "messagetracer", + 4, + "com.apple.message.domain", "com.apple.kernel.external_modification", /* 0 */ + "com.apple.message.signature", c_name, /* 1 */ + "com.apple.message.signature2", t_name, /* 2 */ + "com.apple.message.result", "noop", /* 3 */ + NULL); + } +} diff --git a/bsd/vfs/vfs_init.c b/bsd/vfs/vfs_init.c index 253bbcd77..2c83c4725 100644 --- a/bsd/vfs/vfs_init.c +++ b/bsd/vfs/vfs_init.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -261,6 +261,12 @@ lck_grp_t * vnode_lck_grp; lck_grp_attr_t * vnode_lck_grp_attr; lck_attr_t * vnode_lck_attr; +#if CONFIG_TRIGGERS +/* vars for vnode trigger resolver */ +lck_grp_t * trigger_vnode_lck_grp; +lck_grp_attr_t * trigger_vnode_lck_grp_attr; +lck_attr_t * trigger_vnode_lck_attr; +#endif /* vars for vnode list lock */ lck_grp_t * vnode_list_lck_grp; @@ -289,6 +295,9 @@ lck_mtx_t * mnt_list_mtx_lock; lck_mtx_t *pkg_extensions_lck; struct mount * dead_mountp; + +extern void nspace_handler_init(void); + /* * Initialize the vnode structures and initialize each file system type. */ @@ -324,6 +333,12 @@ vfsinit(void) /* Allocate vnode lock attribute */ vnode_lck_attr = lck_attr_alloc_init(); +#if CONFIG_TRIGGERS + trigger_vnode_lck_grp_attr = lck_grp_attr_alloc_init(); + trigger_vnode_lck_grp = lck_grp_alloc_init("trigger_vnode", trigger_vnode_lck_grp_attr); + trigger_vnode_lck_attr = lck_attr_alloc_init(); +#endif + /* Allocate fs config lock group attribute and group */ fsconf_lck_grp_attr= lck_grp_attr_alloc_init(); @@ -373,6 +388,7 @@ vfsinit(void) */ journal_init(); #endif + nspace_handler_init(); /* * Build vnode operation vectors. diff --git a/bsd/vfs/vfs_journal.c b/bsd/vfs/vfs_journal.c index 0a967aba9..4999f814b 100644 --- a/bsd/vfs/vfs_journal.c +++ b/bsd/vfs/vfs_journal.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1995-2010 Apple Inc. All rights reserved. + * Copyright (c) 2002-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include @@ -58,9 +59,36 @@ #include #include /* OSAddAtomic */ -extern task_t kernel_task; +kern_return_t thread_terminate(thread_t); -#define DBG_JOURNAL_FLUSH 1 +/* + * Set sysctl vfs.generic.jnl.kdebug.trim=1 to enable KERNEL_DEBUG_CONSTANT + * logging of trim-related calls within the journal. 
(They're + * disabled by default because there can be a lot of these events, + * and we don't want to overwhelm the kernel debug buffer. If you + * want to watch these events in particular, just set the sysctl.) + */ +static int jnl_kdebug = 0; +SYSCTL_DECL(_vfs_generic); +SYSCTL_NODE(_vfs_generic, OID_AUTO, jnl, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Journal"); +SYSCTL_NODE(_vfs_generic_jnl, OID_AUTO, kdebug, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Journal kdebug"); +SYSCTL_INT(_vfs_generic_jnl_kdebug, OID_AUTO, trim, CTLFLAG_RW|CTLFLAG_LOCKED, &jnl_kdebug, 0, "Enable kdebug logging for journal TRIM"); + +#define DBG_JOURNAL_FLUSH FSDBG_CODE(DBG_JOURNAL, 1) +#define DBG_JOURNAL_TRIM_ADD FSDBG_CODE(DBG_JOURNAL, 2) +#define DBG_JOURNAL_TRIM_REMOVE FSDBG_CODE(DBG_JOURNAL, 3) +#define DBG_JOURNAL_TRIM_REMOVE_PENDING FSDBG_CODE(DBG_JOURNAL, 4) +#define DBG_JOURNAL_TRIM_REALLOC FSDBG_CODE(DBG_JOURNAL, 5) +#define DBG_JOURNAL_TRIM_FLUSH FSDBG_CODE(DBG_JOURNAL, 6) +#define DBG_JOURNAL_TRIM_UNMAP FSDBG_CODE(DBG_JOURNAL, 7) + +/* + * Cap the journal max size to 2GB. On HFS, it will attempt to occupy + * a full allocation block if the current size is smaller than the allocation + * block on which it resides. Once we hit the exabyte filesystem range, then + * it will use 2GB allocation blocks. As a result, make the cap 2GB. 
+ */ +#define MAX_JOURNAL_SIZE 0x80000000U #include /* DTRACE_IO1 */ #else @@ -80,6 +108,13 @@ extern task_t kernel_task; #include "vfs_journal.h" +#include + +#if 0 +#undef KERNEL_DEBUG +#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT +#endif + #ifndef CONFIG_HFS_TRIM #define CONFIG_HFS_TRIM 0 #endif @@ -104,10 +139,10 @@ SYSCTL_UINT (_kern, OID_AUTO, jnl_trim_flush, CTLFLAG_RW, &jnl_trim_flush_limit, /* XXX next prototytype should be from libsa/stdlib.h> but conflicts libkern */ __private_extern__ void qsort( - void * array, - size_t nmembers, - size_t member_size, - int (*)(const void *, const void *)); + void * array, + size_t nmembers, + size_t member_size, + int (*)(const void *, const void *)); @@ -116,8 +151,13 @@ __private_extern__ void qsort( // fields as well as the first entry of binfo[] #define BLHDR_CHECKSUM_SIZE 32 - -static int end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg); +static void lock_condition(journal *jnl, boolean_t *condition, const char *condition_name); +static void wait_condition(journal *jnl, boolean_t *condition, const char *condition_name); +static void unlock_condition(journal *jnl, boolean_t *condition); +static void finish_end_thread(transaction *tr); +static void write_header_thread(journal *jnl); +static int finish_end_transaction(transaction *tr, errno_t (*callback)(void*), void *callback_arg); +static int end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg, boolean_t drop_lock, boolean_t must_wait); static void abort_transaction(journal *jnl, transaction *tr); static void dump_journal(journal *jnl); @@ -125,8 +165,8 @@ static __inline__ void lock_journal(journal *jnl); static __inline__ void unlock_journal(journal *jnl); static __inline__ void lock_oldstart(journal *jnl); static __inline__ void unlock_oldstart(journal *jnl); - - +static __inline__ void lock_flush(journal *jnl); +static __inline__ void unlock_flush(journal *jnl); // @@ -134,10 
+174,10 @@ static __inline__ void unlock_oldstart(journal *jnl); // typedef struct bucket { - off_t block_num; - uint32_t jnl_offset; - uint32_t block_size; - int32_t cksum; + off_t block_num; + uint32_t jnl_offset; + uint32_t block_size; + int32_t cksum; } bucket; #define STARTING_BUCKETS 256 @@ -149,56 +189,56 @@ static int do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_ static int insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr, int overwriting); #define CHECK_JOURNAL(jnl) \ - do { \ - if (jnl == NULL) {\ - panic("%s:%d: null journal ptr?\n", __FILE__, __LINE__);\ - }\ - if (jnl->jdev == NULL) { \ - panic("%s:%d: jdev is null!\n", __FILE__, __LINE__);\ - } \ - if (jnl->fsdev == NULL) { \ - panic("%s:%d: fsdev is null!\n", __FILE__, __LINE__);\ - } \ - if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC) {\ - panic("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n",\ - __FILE__, __LINE__, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC);\ - }\ - if ( jnl->jhdr->start <= 0 \ - || jnl->jhdr->start > jnl->jhdr->size) {\ - panic("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \ - __FILE__, __LINE__, jnl->jhdr->start, jnl->jhdr->size);\ - }\ - if ( jnl->jhdr->end <= 0 \ - || jnl->jhdr->end > jnl->jhdr->size) {\ - panic("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \ - __FILE__, __LINE__, jnl->jhdr->end, jnl->jhdr->size);\ - }\ - } while(0) + do { \ + if (jnl == NULL) { \ + panic("%s:%d: null journal ptr?\n", __FILE__, __LINE__); \ + } \ + if (jnl->jdev == NULL) { \ + panic("%s:%d: jdev is null!\n", __FILE__, __LINE__); \ + } \ + if (jnl->fsdev == NULL) { \ + panic("%s:%d: fsdev is null!\n", __FILE__, __LINE__); \ + } \ + if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC) { \ + panic("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n", \ + __FILE__, __LINE__, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC); \ + } \ + if ( jnl->jhdr->start <= 0 \ + || 
jnl->jhdr->start > jnl->jhdr->size) { \ + panic("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \ + __FILE__, __LINE__, jnl->jhdr->start, jnl->jhdr->size); \ + } \ + if ( jnl->jhdr->end <= 0 \ + || jnl->jhdr->end > jnl->jhdr->size) { \ + panic("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \ + __FILE__, __LINE__, jnl->jhdr->end, jnl->jhdr->size); \ + } \ + } while(0) #define CHECK_TRANSACTION(tr) \ - do {\ - if (tr == NULL) {\ - panic("%s:%d: null transaction ptr?\n", __FILE__, __LINE__);\ - }\ - if (tr->jnl == NULL) {\ - panic("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__);\ - }\ - if (tr->blhdr != (block_list_header *)tr->tbuffer) {\ - panic("%s:%d: blhdr (%p) != tbuffer (%p)\n", __FILE__, __LINE__, tr->blhdr, tr->tbuffer);\ - }\ - if (tr->total_bytes < 0) {\ - panic("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, tr->total_bytes);\ - }\ - if (tr->journal_start < 0) {\ - panic("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_start);\ - }\ - if (tr->journal_end < 0) {\ - panic("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_end);\ - }\ - if (tr->blhdr && (tr->blhdr->max_blocks <= 0 || tr->blhdr->max_blocks > (tr->jnl->jhdr->size/tr->jnl->jhdr->jhdr_size))) {\ - panic("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, tr->blhdr->max_blocks);\ - }\ - } while(0) + do { \ + if (tr == NULL) { \ + panic("%s:%d: null transaction ptr?\n", __FILE__, __LINE__); \ + } \ + if (tr->jnl == NULL) { \ + panic("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__); \ + } \ + if (tr->blhdr != (block_list_header *)tr->tbuffer) { \ + panic("%s:%d: blhdr (%p) != tbuffer (%p)\n", __FILE__, __LINE__, tr->blhdr, tr->tbuffer); \ + } \ + if (tr->total_bytes < 0) { \ + panic("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, tr->total_bytes); \ + } \ + if (tr->journal_start < 0) { \ + panic("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_start); \ 
+ } \ + if (tr->journal_end < 0) { \ + panic("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_end); \ + } \ + if (tr->blhdr && (tr->blhdr->max_blocks <= 0 || tr->blhdr->max_blocks > (tr->jnl->jhdr->size/tr->jnl->jhdr->jhdr_size))) { \ + panic("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, tr->blhdr->max_blocks); \ + } \ + } while(0) @@ -210,14 +250,14 @@ static int insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, of static int calc_checksum(char *ptr, int len) { - int i, cksum=0; + int i, cksum=0; - // this is a lame checksum but for now it'll do - for(i=0; i < len; i++, ptr++) { + // this is a lame checksum but for now it'll do + for(i = 0; i < len; i++, ptr++) { cksum = (cksum << 8) ^ (cksum + *(unsigned char *)ptr); - } + } - return (~cksum); + return (~cksum); } // @@ -247,6 +287,18 @@ unlock_journal(journal *jnl) lck_mtx_unlock(&jnl->jlock); } +static __inline__ void +lock_flush(journal *jnl) +{ + lck_mtx_lock(&jnl->flock); +} + +static __inline__ void +unlock_flush(journal *jnl) +{ + lck_mtx_unlock(&jnl->flock); +} + static __inline__ void lock_oldstart(journal *jnl) { @@ -277,78 +329,80 @@ unlock_oldstart(journal *jnl) static size_t do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction) { - int err, curlen=len; - size_t io_sz = 0; - buf_t bp; - off_t max_iosize; + int err, curlen=len; + size_t io_sz = 0; + buf_t bp; + off_t max_iosize; - if (*offset < 0 || *offset > jnl->jhdr->size) { + if (*offset < 0 || *offset > jnl->jhdr->size) { panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset, jnl->jhdr->size); - } - - if (direction & JNL_WRITE) - max_iosize = jnl->max_write_size; - else if (direction & JNL_READ) - max_iosize = jnl->max_read_size; - else - max_iosize = 128 * 1024; + } + + if (direction & JNL_WRITE) + max_iosize = jnl->max_write_size; + else if (direction & JNL_READ) + max_iosize = jnl->max_read_size; + else + max_iosize = 128 * 1024; - again: - 
bp = alloc_io_buf(jnl->jdev, 1); +again: + bp = alloc_io_buf(jnl->jdev, 1); - if (*offset + (off_t)curlen > jnl->jhdr->size && *offset != 0 && jnl->jhdr->size != 0) { + if (*offset + (off_t)curlen > jnl->jhdr->size && *offset != 0 && jnl->jhdr->size != 0) { if (*offset == jnl->jhdr->size) { *offset = jnl->jhdr->jhdr_size; } else { curlen = (off_t)jnl->jhdr->size - *offset; } - } + } if (curlen > max_iosize) { curlen = max_iosize; } - if (curlen <= 0) { + if (curlen <= 0) { panic("jnl: do_jnl_io: curlen == %d, offset 0x%llx len %zd\n", curlen, *offset, len); - } + } if (*offset == 0 && (direction & JNL_HEADER) == 0) { panic("jnl: request for i/o to jnl-header without JNL_HEADER flag set! (len %d, data %p)\n", curlen, data); } - if (direction & JNL_READ) - buf_setflags(bp, B_READ); - else { - /* - * don't have to set any flags - */ - vnode_startwrite(jnl->jdev); - } - buf_setsize(bp, curlen); - buf_setcount(bp, curlen); - buf_setdataptr(bp, (uintptr_t)data); - buf_setblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size)); - buf_setlblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size)); - if ((direction & JNL_WRITE) && (jnl->flags & JOURNAL_DO_FUA_WRITES)) { - buf_markfua(bp); - } + if (direction & JNL_READ) + buf_setflags(bp, B_READ); + else { + /* + * don't have to set any flags + */ + vnode_startwrite(jnl->jdev); + } + buf_setsize(bp, curlen); + buf_setcount(bp, curlen); + buf_setdataptr(bp, (uintptr_t)data); + buf_setblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size)); + buf_setlblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size)); + + if ((direction & JNL_WRITE) && (jnl->flags & JOURNAL_DO_FUA_WRITES)) { + buf_markfua(bp); + } - DTRACE_IO1(journal__start, buf_t, bp); - err = VNOP_STRATEGY(bp); - if (!err) { + DTRACE_IO1(journal__start, buf_t, bp); + err = VNOP_STRATEGY(bp); + if (!err) { err = (int)buf_biowait(bp); - } - 
DTRACE_IO1(journal__done, buf_t, bp); - free_io_buf(bp); + } + DTRACE_IO1(journal__done, buf_t, bp); + free_io_buf(bp); - if (err) { - printf("jnl: %s: do_jnl_io: strategy err 0x%x\n", jnl->jdev_name, err); - return 0; - } + if (err) { + printf("jnl: %s: do_jnl_io: strategy err 0x%x\n", jnl->jdev_name, err); + return 0; + } + + *offset += curlen; + io_sz += curlen; - *offset += curlen; - io_sz += curlen; - if (io_sz != len) { + if (io_sz != len) { // handle wrap-around data = (char *)data + curlen; curlen = len - io_sz; @@ -356,21 +410,21 @@ do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction *offset = jnl->jhdr->jhdr_size; } goto again; - } + } - return io_sz; + return io_sz; } static size_t read_journal_data(journal *jnl, off_t *offset, void *data, size_t len) { - return do_journal_io(jnl, offset, data, len, JNL_READ); + return do_journal_io(jnl, offset, data, len, JNL_READ); } static size_t write_journal_data(journal *jnl, off_t *offset, void *data, size_t len) { - return do_journal_io(jnl, offset, data, len, JNL_WRITE); + return do_journal_io(jnl, offset, data, len, JNL_WRITE); } @@ -383,64 +437,66 @@ read_journal_header(journal *jnl, void *data, size_t len) } static int -write_journal_header(journal *jnl, int updating_start) -{ - static int num_err_prints = 0; - int ret=0; - off_t jhdr_offset = 0; - struct vfs_context context; - - context.vc_thread = current_thread(); - context.vc_ucred = NOCRED; - // - // Flush the track cache if we're not doing force-unit-access - // writes. - // - if (!updating_start && (jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) { - ret = VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, &context); - } - if (ret != 0) { - // - // Only print this error if it's a different error than the - // previous one, or if it's the first time for this device - // or if the total number of printfs is less than 25. We - // allow for up to 25 printfs to insure that some make it - // into the on-disk syslog. 
Otherwise if we only printed - // one, it's possible it would never make it to the syslog - // for the root volume and that makes debugging hard. +write_journal_header(journal *jnl, int updating_start, uint32_t sequence_num) +{ + static int num_err_prints = 0; + int ret=0; + off_t jhdr_offset = 0; + struct vfs_context context; + + context.vc_thread = current_thread(); + context.vc_ucred = NOCRED; + // + // Flush the track cache if we're not doing force-unit-access + // writes. // - if ( ret != jnl->last_flush_err - || (jnl->flags & JOURNAL_FLUSHCACHE_ERR) == 0 - || num_err_prints++ < 25) { + if (!updating_start && (jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) { + ret = VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, &context); + } + if (ret != 0) { + // + // Only print this error if it's a different error than the + // previous one, or if it's the first time for this device + // or if the total number of printfs is less than 25. We + // allow for up to 25 printfs to insure that some make it + // into the on-disk syslog. Otherwise if we only printed + // one, it's possible it would never make it to the syslog + // for the root volume and that makes debugging hard. 
+ // + if ( ret != jnl->last_flush_err + || (jnl->flags & JOURNAL_FLUSHCACHE_ERR) == 0 + || num_err_prints++ < 25) { - printf("jnl: %s: flushing fs disk buffer returned 0x%x\n", jnl->jdev_name, ret); + printf("jnl: %s: flushing fs disk buffer returned 0x%x\n", jnl->jdev_name, ret); - jnl->flags |= JOURNAL_FLUSHCACHE_ERR; - jnl->last_flush_err = ret; + jnl->flags |= JOURNAL_FLUSHCACHE_ERR; + jnl->last_flush_err = ret; + } } - } - jnl->jhdr->checksum = 0; - jnl->jhdr->checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE); - if (do_journal_io(jnl, &jhdr_offset, jnl->header_buf, jnl->jhdr->jhdr_size, JNL_WRITE|JNL_HEADER) != (size_t)jnl->jhdr->jhdr_size) { - printf("jnl: %s: write_journal_header: error writing the journal header!\n", jnl->jdev_name); - jnl->flags |= JOURNAL_INVALID; - return -1; - } - - // If we're not doing force-unit-access writes, then we - // have to flush after writing the journal header so that - // a future transaction doesn't sneak out to disk before - // the header does and thus overwrite data that the old - // journal header refers to. Saw this exact case happen - // on an IDE bus analyzer with Larry Barras so while it - // may seem obscure, it's not. 
- // - if (updating_start && (jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) { - VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, &context); - } + jnl->jhdr->sequence_num = sequence_num; + jnl->jhdr->checksum = 0; + jnl->jhdr->checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE); - return 0; + if (do_journal_io(jnl, &jhdr_offset, jnl->header_buf, jnl->jhdr->jhdr_size, JNL_WRITE|JNL_HEADER) != (size_t)jnl->jhdr->jhdr_size) { + printf("jnl: %s: write_journal_header: error writing the journal header!\n", jnl->jdev_name); + jnl->flags |= JOURNAL_INVALID; + return -1; + } + + // If we're not doing force-unit-access writes, then we + // have to flush after writing the journal header so that + // a future transaction doesn't sneak out to disk before + // the header does and thus overwrite data that the old + // journal header refers to. Saw this exact case happen + // on an IDE bus analyzer with Larry Barras so while it + // may seem obscure, it's not. + // + if (updating_start && (jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) { + VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, &context); + } + + return 0; } @@ -456,18 +512,29 @@ write_journal_header(journal *jnl, int updating_start) static void free_old_stuff(journal *jnl) { - transaction *tr, *next; + transaction *tr, *next; + block_list_header *blhdr=NULL, *next_blhdr=NULL; - lock_oldstart(jnl); - tr = jnl->tr_freeme; - jnl->tr_freeme = NULL; - unlock_oldstart(jnl); + if (jnl->tr_freeme == NULL) + return; - for(; tr; tr=next) { - next = tr->next; - FREE_ZONE(tr, sizeof(transaction), M_JNL_TR); - } + lock_oldstart(jnl); + tr = jnl->tr_freeme; + jnl->tr_freeme = NULL; + unlock_oldstart(jnl); + + for(; tr; tr=next) { + for (blhdr = tr->blhdr; blhdr; blhdr = next_blhdr) { + next_blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum); + blhdr->binfo[0].bnum = 0xdeadc0de; + + kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size); + KERNEL_DEBUG(0xbbbbc01c, jnl, tr, tr->tbuffer_size, 0, 
0); + } + next = tr->next; + FREE_ZONE(tr, sizeof(transaction), M_JNL_TR); + } } @@ -481,167 +548,169 @@ free_old_stuff(journal *jnl) static void buffer_flushed_callback(struct buf *bp, void *arg) { - transaction *tr; - journal *jnl; - transaction *ctr, *prev=NULL, *next; - size_t i; - int bufsize, amt_flushed, total_bytes; + transaction *tr; + journal *jnl; + transaction *ctr, *prev=NULL, *next; + size_t i; + int bufsize, amt_flushed, total_bytes; - //printf("jnl: buf flush: bp @ 0x%x l/blkno %qd/%qd vp 0x%x tr @ 0x%x\n", - // bp, buf_lblkno(bp), buf_blkno(bp), buf_vnode(bp), arg); + //printf("jnl: buf flush: bp @ 0x%x l/blkno %qd/%qd vp 0x%x tr @ 0x%x\n", + // bp, buf_lblkno(bp), buf_blkno(bp), buf_vnode(bp), arg); - // snarf out the bits we want - bufsize = buf_size(bp); - tr = (transaction *)arg; + // snarf out the bits we want + bufsize = buf_size(bp); + tr = (transaction *)arg; - // then we've already seen it - if (tr == NULL) { + // then we've already seen it + if (tr == NULL) { return; - } + } - CHECK_TRANSACTION(tr); + CHECK_TRANSACTION(tr); - jnl = tr->jnl; - if (jnl->flags & JOURNAL_INVALID) { + jnl = tr->jnl; + if (jnl->flags & JOURNAL_INVALID) { return; - } + } - CHECK_JOURNAL(jnl); + CHECK_JOURNAL(jnl); - amt_flushed = tr->num_killed; - total_bytes = tr->total_bytes; + amt_flushed = tr->num_killed; + total_bytes = tr->total_bytes; - // update the number of blocks that have been flushed. - // this buf may represent more than one block so take - // that into account. - // - // OSAddAtomic() returns the value of tr->num_flushed before the add - // - amt_flushed += OSAddAtomic(bufsize, &tr->num_flushed); + // update the number of blocks that have been flushed. + // this buf may represent more than one block so take + // that into account. 
+ // + // OSAddAtomic() returns the value of tr->num_flushed before the add + // + amt_flushed += OSAddAtomic(bufsize, &tr->num_flushed); - // if this transaction isn't done yet, just return as - // there is nothing to do. - // - // NOTE: we are careful to not reference anything through - // the tr pointer after doing the OSAddAtomic(). if - // this if statement fails then we are the last one - // and then it's ok to dereference "tr". - // - if ((amt_flushed + bufsize) < total_bytes) { + // if this transaction isn't done yet, just return as + // there is nothing to do. + // + // NOTE: we are careful to not reference anything through + // the tr pointer after doing the OSAddAtomic(). if + // this if statement fails then we are the last one + // and then it's ok to dereference "tr". + // + if ((amt_flushed + bufsize) < total_bytes) { return; - } + } - // this will single thread checking the transaction - lock_oldstart(jnl); + // this will single thread checking the transaction + lock_oldstart(jnl); - if (tr->total_bytes == (int)0xfbadc0de) { - // then someone beat us to it... - unlock_oldstart(jnl); - return; - } + if (tr->total_bytes == (int)0xfbadc0de) { + // then someone beat us to it... 
+ unlock_oldstart(jnl); + return; + } - // mark this so that we're the owner of dealing with the - // cleanup for this transaction - tr->total_bytes = 0xfbadc0de; + // mark this so that we're the owner of dealing with the + // cleanup for this transaction + tr->total_bytes = 0xfbadc0de; - //printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n", - // tr, tr->journal_start, tr->journal_end, jnl); + //printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n", + // tr, tr->journal_start, tr->journal_end, jnl); - // find this entry in the old_start[] index and mark it completed - for(i=0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) { + // find this entry in the old_start[] index and mark it completed + for(i = 0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) { - if ((off_t)(jnl->old_start[i] & ~(0x8000000000000000ULL)) == tr->journal_start) { - jnl->old_start[i] &= ~(0x8000000000000000ULL); - break; + if ((off_t)(jnl->old_start[i] & ~(0x8000000000000000ULL)) == tr->journal_start) { + jnl->old_start[i] &= ~(0x8000000000000000ULL); + break; + } } - } - if (i >= sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) { - panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr %p, jnl %p)\n", - tr->journal_start, tr, jnl); - } + if (i >= sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) { + panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr %p, jnl %p)\n", + tr->journal_start, tr, jnl); + } - // if we are here then we need to update the journal header - // to reflect that this transaction is complete - if (tr->journal_start == jnl->active_start) { - jnl->active_start = tr->journal_end; - tr->journal_start = tr->journal_end = (off_t)0; - } + // if we are here then we need to update the journal header + // to reflect that this transaction is complete + if (tr->journal_start == jnl->active_start) { + jnl->active_start = tr->journal_end; + tr->journal_start = tr->journal_end = (off_t)0; + } - // go through the completed_trs 
list and try to coalesce - // entries, restarting back at the beginning if we have to. - for(ctr=jnl->completed_trs; ctr; prev=ctr, ctr=next) { - if (ctr->journal_start == jnl->active_start) { - jnl->active_start = ctr->journal_end; - if (prev) { - prev->next = ctr->next; - } - if (ctr == jnl->completed_trs) { - jnl->completed_trs = ctr->next; - } + // go through the completed_trs list and try to coalesce + // entries, restarting back at the beginning if we have to. + for (ctr = jnl->completed_trs; ctr; prev=ctr, ctr=next) { + if (ctr->journal_start == jnl->active_start) { + jnl->active_start = ctr->journal_end; + if (prev) { + prev->next = ctr->next; + } + if (ctr == jnl->completed_trs) { + jnl->completed_trs = ctr->next; + } - next = jnl->completed_trs; // this starts us over again - ctr->next = jnl->tr_freeme; - jnl->tr_freeme = ctr; - ctr = NULL; - } else if (tr->journal_end == ctr->journal_start) { - ctr->journal_start = tr->journal_start; - next = jnl->completed_trs; // this starts us over again - ctr = NULL; - tr->journal_start = tr->journal_end = (off_t)0; - } else if (tr->journal_start == ctr->journal_end) { - ctr->journal_end = tr->journal_end; - next = ctr->next; - tr->journal_start = tr->journal_end = (off_t)0; - } else if (ctr->next && ctr->journal_end == ctr->next->journal_start) { - // coalesce the next entry with this one and link the next - // entry in at the head of the tr_freeme list - next = ctr->next; // temporarily use the "next" variable - ctr->journal_end = next->journal_end; - ctr->next = next->next; - next->next = jnl->tr_freeme; // link in the next guy at the head of the tr_freeme list - jnl->tr_freeme = next; - - next = jnl->completed_trs; // this starts us over again - ctr = NULL; - } else { - next = ctr->next; + next = jnl->completed_trs; // this starts us over again + ctr->next = jnl->tr_freeme; + jnl->tr_freeme = ctr; + ctr = NULL; + } else if (tr->journal_end == ctr->journal_start) { + ctr->journal_start = tr->journal_start; + next 
= jnl->completed_trs; // this starts us over again + ctr = NULL; + tr->journal_start = tr->journal_end = (off_t)0; + } else if (tr->journal_start == ctr->journal_end) { + ctr->journal_end = tr->journal_end; + next = ctr->next; + tr->journal_start = tr->journal_end = (off_t)0; + } else if (ctr->next && ctr->journal_end == ctr->next->journal_start) { + // coalesce the next entry with this one and link the next + // entry in at the head of the tr_freeme list + next = ctr->next; // temporarily use the "next" variable + ctr->journal_end = next->journal_end; + ctr->next = next->next; + next->next = jnl->tr_freeme; // link in the next guy at the head of the tr_freeme list + jnl->tr_freeme = next; + + next = jnl->completed_trs; // this starts us over again + ctr = NULL; + } else { + next = ctr->next; + } } - } - // if this is true then we didn't merge with anyone - // so link ourselves in at the head of the completed - // transaction list. - if (tr->journal_start != 0) { - // put this entry into the correct sorted place - // in the list instead of just at the head. - // + // if this is true then we didn't merge with anyone + // so link ourselves in at the head of the completed + // transaction list. + if (tr->journal_start != 0) { + // put this entry into the correct sorted place + // in the list instead of just at the head. 
+ // - prev = NULL; - for(ctr=jnl->completed_trs; ctr && tr->journal_start > ctr->journal_start; prev=ctr, ctr=ctr->next) { - // just keep looping - } + prev = NULL; + for (ctr = jnl->completed_trs; ctr && tr->journal_start > ctr->journal_start; prev=ctr, ctr=ctr->next) { + // just keep looping + } - if (ctr == NULL && prev == NULL) { - jnl->completed_trs = tr; - tr->next = NULL; - } else if (ctr == jnl->completed_trs) { - tr->next = jnl->completed_trs; - jnl->completed_trs = tr; + if (ctr == NULL && prev == NULL) { + jnl->completed_trs = tr; + tr->next = NULL; + } else if (ctr == jnl->completed_trs) { + tr->next = jnl->completed_trs; + jnl->completed_trs = tr; + } else { + tr->next = prev->next; + prev->next = tr; + } } else { - tr->next = prev->next; - prev->next = tr; - } - } else { - // if we're here this tr got merged with someone else so - // put it on the list to be free'd - tr->next = jnl->tr_freeme; - jnl->tr_freeme = tr; - } - unlock_oldstart(jnl); + // if we're here this tr got merged with someone else so + // put it on the list to be free'd + tr->next = jnl->tr_freeme; + jnl->tr_freeme = tr; + } + unlock_oldstart(jnl); + + unlock_condition(jnl, &jnl->asyncIO); } @@ -655,51 +724,51 @@ buffer_flushed_callback(struct buf *bp, void *arg) static void swap_journal_header(journal *jnl) { - jnl->jhdr->magic = SWAP32(jnl->jhdr->magic); - jnl->jhdr->endian = SWAP32(jnl->jhdr->endian); - jnl->jhdr->start = SWAP64(jnl->jhdr->start); - jnl->jhdr->end = SWAP64(jnl->jhdr->end); - jnl->jhdr->size = SWAP64(jnl->jhdr->size); - jnl->jhdr->blhdr_size = SWAP32(jnl->jhdr->blhdr_size); - jnl->jhdr->checksum = SWAP32(jnl->jhdr->checksum); - jnl->jhdr->jhdr_size = SWAP32(jnl->jhdr->jhdr_size); - jnl->jhdr->sequence_num = SWAP32(jnl->jhdr->sequence_num); + jnl->jhdr->magic = SWAP32(jnl->jhdr->magic); + jnl->jhdr->endian = SWAP32(jnl->jhdr->endian); + jnl->jhdr->start = SWAP64(jnl->jhdr->start); + jnl->jhdr->end = SWAP64(jnl->jhdr->end); + jnl->jhdr->size = 
SWAP64(jnl->jhdr->size); + jnl->jhdr->blhdr_size = SWAP32(jnl->jhdr->blhdr_size); + jnl->jhdr->checksum = SWAP32(jnl->jhdr->checksum); + jnl->jhdr->jhdr_size = SWAP32(jnl->jhdr->jhdr_size); + jnl->jhdr->sequence_num = SWAP32(jnl->jhdr->sequence_num); } static void swap_block_list_header(journal *jnl, block_list_header *blhdr) { - int i; + int i; - blhdr->max_blocks = SWAP16(blhdr->max_blocks); - blhdr->num_blocks = SWAP16(blhdr->num_blocks); - blhdr->bytes_used = SWAP32(blhdr->bytes_used); - blhdr->checksum = SWAP32(blhdr->checksum); - blhdr->flags = SWAP32(blhdr->flags); - - if (blhdr->num_blocks >= ((jnl->jhdr->blhdr_size / sizeof(block_info)) - 1)) { - printf("jnl: %s: blhdr num blocks looks suspicious (%d / blhdr size %d). not swapping.\n", jnl->jdev_name, blhdr->num_blocks, jnl->jhdr->blhdr_size); - return; - } + blhdr->max_blocks = SWAP16(blhdr->max_blocks); + blhdr->num_blocks = SWAP16(blhdr->num_blocks); + blhdr->bytes_used = SWAP32(blhdr->bytes_used); + blhdr->checksum = SWAP32(blhdr->checksum); + blhdr->flags = SWAP32(blhdr->flags); + + if (blhdr->num_blocks >= ((jnl->jhdr->blhdr_size / sizeof(block_info)) - 1)) { + printf("jnl: %s: blhdr num blocks looks suspicious (%d / blhdr size %d). not swapping.\n", jnl->jdev_name, blhdr->num_blocks, jnl->jhdr->blhdr_size); + return; + } - for(i=0; i < blhdr->num_blocks; i++) { + for(i = 0; i < blhdr->num_blocks; i++) { blhdr->binfo[i].bnum = SWAP64(blhdr->binfo[i].bnum); blhdr->binfo[i].u.bi.bsize = SWAP32(blhdr->binfo[i].u.bi.bsize); blhdr->binfo[i].u.bi.b.cksum = SWAP32(blhdr->binfo[i].u.bi.b.cksum); - } + } } static int update_fs_block(journal *jnl, void *block_ptr, off_t fs_block, size_t bsize) { - int ret; - struct buf *oblock_bp=NULL; + int ret; + struct buf *oblock_bp=NULL; - // first read the block we want. - ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp); - if (ret != 0) { - printf("jnl: %s: update_fs_block: error reading fs block # %lld! 
(ret %d)\n", jnl->jdev_name, fs_block, ret); + // first read the block we want. + ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp); + if (ret != 0) { + printf("jnl: %s: update_fs_block: error reading fs block # %lld! (ret %d)\n", jnl->jdev_name, fs_block, ret); if (oblock_bp) { buf_brelse(oblock_bp); @@ -709,277 +778,277 @@ update_fs_block(journal *jnl, void *block_ptr, off_t fs_block, size_t bsize) // let's try to be aggressive here and just re-write the block oblock_bp = buf_getblk(jnl->fsdev, (daddr64_t)fs_block, bsize, 0, 0, BLK_META); if (oblock_bp == NULL) { - printf("jnl: %s: update_fs_block: buf_getblk() for %lld failed! failing update.\n", jnl->jdev_name, fs_block); - return -1; + printf("jnl: %s: update_fs_block: buf_getblk() for %lld failed! failing update.\n", jnl->jdev_name, fs_block); + return -1; } - } + } - // make sure it's the correct size. - if (buf_size(oblock_bp) != bsize) { + // make sure it's the correct size. + if (buf_size(oblock_bp) != bsize) { buf_brelse(oblock_bp); return -1; - } + } - // copy the journal data over top of it - memcpy((char *)0 + buf_dataptr(oblock_bp), block_ptr, bsize); + // copy the journal data over top of it + memcpy((char *)buf_dataptr(oblock_bp), block_ptr, bsize); - if ((ret = VNOP_BWRITE(oblock_bp)) != 0) { - printf("jnl: %s: update_fs_block: failed to update block %lld (ret %d)\n", jnl->jdev_name, fs_block,ret); - return ret; - } + if ((ret = VNOP_BWRITE(oblock_bp)) != 0) { + printf("jnl: %s: update_fs_block: failed to update block %lld (ret %d)\n", jnl->jdev_name, fs_block,ret); + return ret; + } - // and now invalidate it so that if someone else wants to read - // it in a different size they'll be able to do it. - ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp); - if (oblock_bp) { + // and now invalidate it so that if someone else wants to read + // it in a different size they'll be able to do it. 
+ ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp); + if (oblock_bp) { buf_markinvalid(oblock_bp); buf_brelse(oblock_bp); - } + } - return 0; + return 0; } static int grow_table(struct bucket **buf_ptr, int num_buckets, int new_size) { - struct bucket *newBuf; - int current_size = num_buckets, i; + struct bucket *newBuf; + int current_size = num_buckets, i; - // return if newsize is less than the current size - if (new_size < num_buckets) { - return current_size; - } + // return if newsize is less than the current size + if (new_size < num_buckets) { + return current_size; + } - if ((MALLOC(newBuf, struct bucket *, new_size*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) { - printf("jnl: grow_table: no memory to expand coalesce buffer!\n"); - return -1; - } + if ((MALLOC(newBuf, struct bucket *, new_size*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) { + printf("jnl: grow_table: no memory to expand coalesce buffer!\n"); + return -1; + } - // printf("jnl: lookup_bucket: expanded co_buf to %d elems\n", new_size); + // printf("jnl: lookup_bucket: expanded co_buf to %d elems\n", new_size); - // copy existing elements - bcopy(*buf_ptr, newBuf, num_buckets*sizeof(struct bucket)); + // copy existing elements + bcopy(*buf_ptr, newBuf, num_buckets*sizeof(struct bucket)); - // initialize the new ones - for(i=num_buckets; i < new_size; i++) { - newBuf[i].block_num = (off_t)-1; - } + // initialize the new ones + for(i = num_buckets; i < new_size; i++) { + newBuf[i].block_num = (off_t)-1; + } - // free the old container - FREE(*buf_ptr, M_TEMP); + // free the old container + FREE(*buf_ptr, M_TEMP); - // reset the buf_ptr - *buf_ptr = newBuf; + // reset the buf_ptr + *buf_ptr = newBuf; - return new_size; + return new_size; } static int lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full) { - int lo, hi, index, matches, i; + int lo, hi, index, matches, i; - if (num_full == 0) { - return 0; // table is empty, so insert at 
index=0 - } + if (num_full == 0) { + return 0; // table is empty, so insert at index=0 + } - lo = 0; - hi = num_full - 1; - index = -1; + lo = 0; + hi = num_full - 1; + index = -1; - // perform binary search for block_num - do { - int mid = (hi - lo)/2 + lo; - off_t this_num = (*buf_ptr)[mid].block_num; + // perform binary search for block_num + do { + int mid = (hi - lo)/2 + lo; + off_t this_num = (*buf_ptr)[mid].block_num; - if (block_num == this_num) { - index = mid; - break; - } + if (block_num == this_num) { + index = mid; + break; + } - if (block_num < this_num) { - hi = mid; - continue; - } + if (block_num < this_num) { + hi = mid; + continue; + } - if (block_num > this_num) { - lo = mid + 1; - continue; - } - } while(lo < hi); + if (block_num > this_num) { + lo = mid + 1; + continue; + } + } while (lo < hi); - // check if lo and hi converged on the match - if (block_num == (*buf_ptr)[hi].block_num) { - index = hi; - } + // check if lo and hi converged on the match + if (block_num == (*buf_ptr)[hi].block_num) { + index = hi; + } - // if no existing entry found, find index for new one - if (index == -1) { - index = (block_num < (*buf_ptr)[hi].block_num) ? hi : hi + 1; - } else { - // make sure that we return the right-most index in the case of multiple matches - matches = 0; - i = index + 1; - while(i < num_full && block_num == (*buf_ptr)[i].block_num) { - matches++; - i++; - } - - index += matches; - } + // if no existing entry found, find index for new one + if (index == -1) { + index = (block_num < (*buf_ptr)[hi].block_num) ? 
hi : hi + 1; + } else { + // make sure that we return the right-most index in the case of multiple matches + matches = 0; + i = index + 1; + while (i < num_full && block_num == (*buf_ptr)[i].block_num) { + matches++; + i++; + } + + index += matches; + } - return index; + return index; } static int insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr, int overwriting) { - if (!overwriting) { - // grow the table if we're out of space - if (*num_full_ptr >= *num_buckets_ptr) { - int new_size = *num_buckets_ptr * 2; - int grow_size = grow_table(buf_ptr, *num_buckets_ptr, new_size); + if (!overwriting) { + // grow the table if we're out of space + if (*num_full_ptr >= *num_buckets_ptr) { + int new_size = *num_buckets_ptr * 2; + int grow_size = grow_table(buf_ptr, *num_buckets_ptr, new_size); - if (grow_size < new_size) { - printf("jnl: %s: add_block: grow_table returned an error!\n", jnl->jdev_name); - return -1; - } + if (grow_size < new_size) { + printf("jnl: %s: add_block: grow_table returned an error!\n", jnl->jdev_name); + return -1; + } - *num_buckets_ptr = grow_size; //update num_buckets to reflect the new size - } + *num_buckets_ptr = grow_size; //update num_buckets to reflect the new size + } - // if we're not inserting at the end, we need to bcopy - if (blk_index != *num_full_ptr) { - bcopy( (*buf_ptr)+(blk_index), (*buf_ptr)+(blk_index+1), (*num_full_ptr-blk_index)*sizeof(struct bucket) ); - } + // if we're not inserting at the end, we need to bcopy + if (blk_index != *num_full_ptr) { + bcopy( (*buf_ptr)+(blk_index), (*buf_ptr)+(blk_index+1), (*num_full_ptr-blk_index)*sizeof(struct bucket) ); + } - (*num_full_ptr)++; // increment only if we're not overwriting - } + (*num_full_ptr)++; // increment only if we're not overwriting + } - // sanity check the values we're about to add - if ((off_t)offset >= jnl->jhdr->size) { - offset = jnl->jhdr->jhdr_size + 
(offset - jnl->jhdr->size); - } - if (size <= 0) { - panic("jnl: insert_block: bad size in insert_block (%zd)\n", size); - } - - (*buf_ptr)[blk_index].block_num = num; - (*buf_ptr)[blk_index].block_size = size; - (*buf_ptr)[blk_index].jnl_offset = offset; - (*buf_ptr)[blk_index].cksum = cksum; + // sanity check the values we're about to add + if ((off_t)offset >= jnl->jhdr->size) { + offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size); + } + if (size <= 0) { + panic("jnl: insert_block: bad size in insert_block (%zd)\n", size); + } + + (*buf_ptr)[blk_index].block_num = num; + (*buf_ptr)[blk_index].block_size = size; + (*buf_ptr)[blk_index].jnl_offset = offset; + (*buf_ptr)[blk_index].cksum = cksum; - return blk_index; + return blk_index; } static int do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num, size_t size, __unused size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr) { - int num_to_remove, index, i, overwrite, err; - size_t jhdr_size = jnl->jhdr->jhdr_size, new_offset; - off_t overlap, block_start, block_end; - - block_start = block_num*jhdr_size; - block_end = block_start + size; - overwrite = (block_num == (*buf_ptr)[blk_index].block_num && size >= (*buf_ptr)[blk_index].block_size); - - // first, eliminate any overlap with the previous entry - if (blk_index != 0 && !overwrite) { - off_t prev_block_start = (*buf_ptr)[blk_index-1].block_num*jhdr_size; - off_t prev_block_end = prev_block_start + (*buf_ptr)[blk_index-1].block_size; - overlap = prev_block_end - block_start; - if (overlap > 0) { - if (overlap % jhdr_size != 0) { - panic("jnl: do_overlap: overlap with previous entry not a multiple of %zd\n", jhdr_size); - } - - // if the previous entry completely overlaps this one, we need to break it into two pieces. 
- if (prev_block_end > block_end) { - off_t new_num = block_end / jhdr_size; - size_t new_size = prev_block_end - block_end; - - new_offset = (*buf_ptr)[blk_index-1].jnl_offset + (block_end - prev_block_start); + int num_to_remove, index, i, overwrite, err; + size_t jhdr_size = jnl->jhdr->jhdr_size, new_offset; + off_t overlap, block_start, block_end; + + block_start = block_num*jhdr_size; + block_end = block_start + size; + overwrite = (block_num == (*buf_ptr)[blk_index].block_num && size >= (*buf_ptr)[blk_index].block_size); + + // first, eliminate any overlap with the previous entry + if (blk_index != 0 && !overwrite) { + off_t prev_block_start = (*buf_ptr)[blk_index-1].block_num*jhdr_size; + off_t prev_block_end = prev_block_start + (*buf_ptr)[blk_index-1].block_size; + overlap = prev_block_end - block_start; + if (overlap > 0) { + if (overlap % jhdr_size != 0) { + panic("jnl: do_overlap: overlap with previous entry not a multiple of %zd\n", jhdr_size); + } + + // if the previous entry completely overlaps this one, we need to break it into two pieces. 
+ if (prev_block_end > block_end) { + off_t new_num = block_end / jhdr_size; + size_t new_size = prev_block_end - block_end; + + new_offset = (*buf_ptr)[blk_index-1].jnl_offset + (block_end - prev_block_start); - err = insert_block(jnl, buf_ptr, blk_index, new_num, new_size, new_offset, cksum, num_buckets_ptr, num_full_ptr, 0); - if (err < 0) { - panic("jnl: do_overlap: error inserting during pre-overlap\n"); - } - } + err = insert_block(jnl, buf_ptr, blk_index, new_num, new_size, new_offset, cksum, num_buckets_ptr, num_full_ptr, 0); + if (err < 0) { + panic("jnl: do_overlap: error inserting during pre-overlap\n"); + } + } - // Regardless, we need to truncate the previous entry to the beginning of the overlap - (*buf_ptr)[blk_index-1].block_size = block_start - prev_block_start; - (*buf_ptr)[blk_index-1].cksum = 0; // have to blow it away because there's no way to check it + // Regardless, we need to truncate the previous entry to the beginning of the overlap + (*buf_ptr)[blk_index-1].block_size = block_start - prev_block_start; + (*buf_ptr)[blk_index-1].cksum = 0; // have to blow it away because there's no way to check it + } } - } - // then, bail out fast if there's no overlap with the entries that follow - if (!overwrite && block_end <= (off_t)((*buf_ptr)[blk_index].block_num*jhdr_size)) { - return 0; // no overlap, no overwrite - } else if (overwrite && (blk_index + 1 >= *num_full_ptr || block_end <= (off_t)((*buf_ptr)[blk_index+1].block_num*jhdr_size))) { + // then, bail out fast if there's no overlap with the entries that follow + if (!overwrite && block_end <= (off_t)((*buf_ptr)[blk_index].block_num*jhdr_size)) { + return 0; // no overlap, no overwrite + } else if (overwrite && (blk_index + 1 >= *num_full_ptr || block_end <= (off_t)((*buf_ptr)[blk_index+1].block_num*jhdr_size))) { - (*buf_ptr)[blk_index].cksum = cksum; // update this - return 1; // simple overwrite - } + (*buf_ptr)[blk_index].cksum = cksum; // update this + return 1; // simple overwrite + } 
- // Otherwise, find all cases of total and partial overlap. We use the special - // block_num of -2 to designate entries that are completely overlapped and must - // be eliminated. The block_num, size, and jnl_offset of partially overlapped - // entries must be adjusted to keep the array consistent. - index = blk_index; - num_to_remove = 0; - while(index < *num_full_ptr && block_end > (off_t)((*buf_ptr)[index].block_num*jhdr_size)) { - if (block_end >= (off_t)(((*buf_ptr)[index].block_num*jhdr_size + (*buf_ptr)[index].block_size))) { - (*buf_ptr)[index].block_num = -2; // mark this for deletion - num_to_remove++; - } else { - overlap = block_end - (*buf_ptr)[index].block_num*jhdr_size; - if (overlap > 0) { - if (overlap % jhdr_size != 0) { - panic("jnl: do_overlap: overlap of %lld is not multiple of %zd\n", overlap, jhdr_size); - } - - // if we partially overlap this entry, adjust its block number, jnl offset, and size - (*buf_ptr)[index].block_num += (overlap / jhdr_size); // make sure overlap is multiple of jhdr_size, or round up - (*buf_ptr)[index].cksum = 0; + // Otherwise, find all cases of total and partial overlap. We use the special + // block_num of -2 to designate entries that are completely overlapped and must + // be eliminated. The block_num, size, and jnl_offset of partially overlapped + // entries must be adjusted to keep the array consistent. 
+ index = blk_index; + num_to_remove = 0; + while (index < *num_full_ptr && block_end > (off_t)((*buf_ptr)[index].block_num*jhdr_size)) { + if (block_end >= (off_t)(((*buf_ptr)[index].block_num*jhdr_size + (*buf_ptr)[index].block_size))) { + (*buf_ptr)[index].block_num = -2; // mark this for deletion + num_to_remove++; + } else { + overlap = block_end - (*buf_ptr)[index].block_num*jhdr_size; + if (overlap > 0) { + if (overlap % jhdr_size != 0) { + panic("jnl: do_overlap: overlap of %lld is not multiple of %zd\n", overlap, jhdr_size); + } + + // if we partially overlap this entry, adjust its block number, jnl offset, and size + (*buf_ptr)[index].block_num += (overlap / jhdr_size); // make sure overlap is multiple of jhdr_size, or round up + (*buf_ptr)[index].cksum = 0; - new_offset = (*buf_ptr)[index].jnl_offset + overlap; // check for wrap-around - if ((off_t)new_offset >= jnl->jhdr->size) { - new_offset = jhdr_size + (new_offset - jnl->jhdr->size); - } - (*buf_ptr)[index].jnl_offset = new_offset; + new_offset = (*buf_ptr)[index].jnl_offset + overlap; // check for wrap-around + if ((off_t)new_offset >= jnl->jhdr->size) { + new_offset = jhdr_size + (new_offset - jnl->jhdr->size); + } + (*buf_ptr)[index].jnl_offset = new_offset; - (*buf_ptr)[index].block_size -= overlap; // sanity check for negative value - if ((*buf_ptr)[index].block_size <= 0) { - panic("jnl: do_overlap: after overlap, new block size is invalid (%u)\n", (*buf_ptr)[index].block_size); - // return -1; // if above panic is removed, return -1 for error + (*buf_ptr)[index].block_size -= overlap; // sanity check for negative value + if ((*buf_ptr)[index].block_size <= 0) { + panic("jnl: do_overlap: after overlap, new block size is invalid (%u)\n", (*buf_ptr)[index].block_size); + // return -1; // if above panic is removed, return -1 for error + } + } + } - } - - } - index++; - } + index++; + } - // bcopy over any completely overlapped entries, starting at the right (where the above loop broke out) - 
index--; // start with the last index used within the above loop - while(index >= blk_index) { - if ((*buf_ptr)[index].block_num == -2) { - if (index == *num_full_ptr-1) { - (*buf_ptr)[index].block_num = -1; // it's the last item in the table... just mark as free - } else { - bcopy( (*buf_ptr)+(index+1), (*buf_ptr)+(index), (*num_full_ptr - (index + 1)) * sizeof(struct bucket) ); - } - (*num_full_ptr)--; - } - index--; - } + // bcopy over any completely overlapped entries, starting at the right (where the above loop broke out) + index--; // start with the last index used within the above loop + while (index >= blk_index) { + if ((*buf_ptr)[index].block_num == -2) { + if (index == *num_full_ptr-1) { + (*buf_ptr)[index].block_num = -1; // it's the last item in the table... just mark as free + } else { + bcopy( (*buf_ptr)+(index+1), (*buf_ptr)+(index), (*num_full_ptr - (index + 1)) * sizeof(struct bucket) ); + } + (*num_full_ptr)--; + } + index--; + } - // eliminate any stale entries at the end of the table - for(i=*num_full_ptr; i < (*num_full_ptr + num_to_remove); i++) { - (*buf_ptr)[i].block_num = -1; - } + // eliminate any stale entries at the end of the table + for(i = *num_full_ptr; i < (*num_full_ptr + num_to_remove); i++) { + (*buf_ptr)[i].block_num = -1; + } - return 0; // if we got this far, we need to insert the entry into the table (rather than overwrite) + return 0; // if we got this far, we need to insert the entry into the table (rather than overwrite) } // PR-3105942: Coalesce writes to the same block in journal replay @@ -993,90 +1062,90 @@ do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num static int add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, __unused size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr) { - int blk_index, overwriting; + int blk_index, overwriting; - // on return from lookup_bucket(), blk_index is the index into the table where block_num should be - // 
inserted (or the index of the elem to overwrite). - blk_index = lookup_bucket( buf_ptr, block_num, *num_full_ptr); + // on return from lookup_bucket(), blk_index is the index into the table where block_num should be + // inserted (or the index of the elem to overwrite). + blk_index = lookup_bucket( buf_ptr, block_num, *num_full_ptr); - // check if the index is within bounds (if we're adding this block to the end of - // the table, blk_index will be equal to num_full) - if (blk_index < 0 || blk_index > *num_full_ptr) { - //printf("jnl: add_block: trouble adding block to co_buf\n"); - return -1; - } // else printf("jnl: add_block: adding block 0x%llx at i=%d\n", block_num, blk_index); + // check if the index is within bounds (if we're adding this block to the end of + // the table, blk_index will be equal to num_full) + if (blk_index < 0 || blk_index > *num_full_ptr) { + //printf("jnl: add_block: trouble adding block to co_buf\n"); + return -1; + } // else printf("jnl: add_block: adding block 0x%llx at i=%d\n", block_num, blk_index); - // Determine whether we're overwriting an existing entry by checking for overlap - overwriting = do_overlap(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr); - if (overwriting < 0) { - return -1; // if we got an error, pass it along - } + // Determine whether we're overwriting an existing entry by checking for overlap + overwriting = do_overlap(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr); + if (overwriting < 0) { + return -1; // if we got an error, pass it along + } - // returns the index, or -1 on error - blk_index = insert_block(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr, overwriting); + // returns the index, or -1 on error + blk_index = insert_block(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr, overwriting); - return blk_index; + return blk_index; } static int 
replay_journal(journal *jnl) { - int i, orig_checksum, checksum, check_block_checksums=0, bad_blocks=0; - size_t ret; - size_t max_bsize = 0; /* protected by block_ptr */ - block_list_header *blhdr; - off_t offset, txn_start_offset=0, blhdr_offset, orig_jnl_start; - char *buff, *block_ptr=NULL; - struct bucket *co_buf; - int num_buckets = STARTING_BUCKETS, num_full, check_past_jnl_end = 1, in_uncharted_territory=0; - uint32_t last_sequence_num = 0; + int i, orig_checksum, checksum, check_block_checksums=0, bad_blocks=0; + size_t ret; + size_t max_bsize = 0; /* protected by block_ptr */ + block_list_header *blhdr; + off_t offset, txn_start_offset=0, blhdr_offset, orig_jnl_start; + char *buff, *block_ptr=NULL; + struct bucket *co_buf; + int num_buckets = STARTING_BUCKETS, num_full, check_past_jnl_end = 1, in_uncharted_territory=0; + uint32_t last_sequence_num = 0; - // wrap the start ptr if it points to the very end of the journal - if (jnl->jhdr->start == jnl->jhdr->size) { + // wrap the start ptr if it points to the very end of the journal + if (jnl->jhdr->start == jnl->jhdr->size) { jnl->jhdr->start = jnl->jhdr->jhdr_size; - } - if (jnl->jhdr->end == jnl->jhdr->size) { + } + if (jnl->jhdr->end == jnl->jhdr->size) { jnl->jhdr->end = jnl->jhdr->jhdr_size; - } + } - if (jnl->jhdr->start == jnl->jhdr->end) { + if (jnl->jhdr->start == jnl->jhdr->end) { return 0; - } + } - orig_jnl_start = jnl->jhdr->start; + orig_jnl_start = jnl->jhdr->start; - // allocate memory for the header_block. we'll read each blhdr into this - if (kmem_alloc(kernel_map, (vm_offset_t *)&buff, jnl->jhdr->blhdr_size)) { + // allocate memory for the header_block. we'll read each blhdr into this + if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&buff, jnl->jhdr->blhdr_size)) { printf("jnl: %s: replay_journal: no memory for block buffer! 
(%d bytes)\n", - jnl->jdev_name, jnl->jhdr->blhdr_size); + jnl->jdev_name, jnl->jhdr->blhdr_size); return -1; - } + } - // allocate memory for the coalesce buffer - if ((MALLOC(co_buf, struct bucket *, num_buckets*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) { - printf("jnl: %s: replay_journal: no memory for coalesce buffer!\n", jnl->jdev_name); - return -1; - } + // allocate memory for the coalesce buffer + if ((MALLOC(co_buf, struct bucket *, num_buckets*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) { + printf("jnl: %s: replay_journal: no memory for coalesce buffer!\n", jnl->jdev_name); + return -1; + } - restart_replay: +restart_replay: - // initialize entries - for(i=0; i < num_buckets; i++) { - co_buf[i].block_num = -1; - } - num_full = 0; // empty at first + // initialize entries + for(i = 0; i < num_buckets; i++) { + co_buf[i].block_num = -1; + } + num_full = 0; // empty at first - printf("jnl: %s: replay_journal: from: %lld to: %lld (joffset 0x%llx)\n", - jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end, jnl->jdev_offset); + printf("jnl: %s: replay_journal: from: %lld to: %lld (joffset 0x%llx)\n", + jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end, jnl->jdev_offset); - while(check_past_jnl_end || jnl->jhdr->start != jnl->jhdr->end) { + while (check_past_jnl_end || jnl->jhdr->start != jnl->jhdr->end) { offset = blhdr_offset = jnl->jhdr->start; ret = read_journal_data(jnl, &offset, buff, jnl->jhdr->blhdr_size); if (ret != (size_t)jnl->jhdr->blhdr_size) { - printf("jnl: %s: replay_journal: Could not read block list header block @ 0x%llx!\n", jnl->jdev_name, offset); - bad_blocks = 1; - goto bad_txn_handling; + printf("jnl: %s: replay_journal: Could not read block list header block @ 0x%llx!\n", jnl->jdev_name, offset); + bad_blocks = 1; + goto bad_txn_handling; } blhdr = (block_list_header *)buff; @@ -1101,101 +1170,101 @@ replay_journal(journal *jnl) // anything // if (checksum != orig_checksum) { - if (check_past_jnl_end && in_uncharted_territory) 
{ + if (check_past_jnl_end && in_uncharted_territory) { - if (blhdr_offset != jnl->jhdr->end) { - printf("jnl: %s: Extra txn replay stopped @ %lld / 0x%llx\n", jnl->jdev_name, blhdr_offset, blhdr_offset); - } + if (blhdr_offset != jnl->jhdr->end) { + printf("jnl: %s: Extra txn replay stopped @ %lld / 0x%llx\n", jnl->jdev_name, blhdr_offset, blhdr_offset); + } - check_past_jnl_end = 0; - jnl->jhdr->end = blhdr_offset; - continue; - } + check_past_jnl_end = 0; + jnl->jhdr->end = blhdr_offset; + continue; + } - printf("jnl: %s: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n", + printf("jnl: %s: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n", jnl->jdev_name, blhdr_offset, orig_checksum, checksum); - if (blhdr_offset == orig_jnl_start) { - // if there's nothing in the journal at all, just bail out altogether. - goto bad_replay; - } + if (blhdr_offset == orig_jnl_start) { + // if there's nothing in the journal at all, just bail out altogether. 
+ goto bad_replay; + } - bad_blocks = 1; - goto bad_txn_handling; + bad_blocks = 1; + goto bad_txn_handling; } if ( (last_sequence_num != 0) - && (blhdr->binfo[0].u.bi.b.sequence_num != 0) - && (blhdr->binfo[0].u.bi.b.sequence_num != last_sequence_num) - && (blhdr->binfo[0].u.bi.b.sequence_num != last_sequence_num+1)) { + && (blhdr->binfo[0].u.bi.b.sequence_num != 0) + && (blhdr->binfo[0].u.bi.b.sequence_num != last_sequence_num) + && (blhdr->binfo[0].u.bi.b.sequence_num != last_sequence_num+1)) { - txn_start_offset = jnl->jhdr->end = blhdr_offset; + txn_start_offset = jnl->jhdr->end = blhdr_offset; - if (check_past_jnl_end) { - check_past_jnl_end = 0; - printf("jnl: %s: 2: extra replay stopped @ %lld / 0x%llx (seq %d < %d)\n", - jnl->jdev_name, blhdr_offset, blhdr_offset, blhdr->binfo[0].u.bi.b.sequence_num, last_sequence_num); - continue; - } + if (check_past_jnl_end) { + check_past_jnl_end = 0; + printf("jnl: %s: 2: extra replay stopped @ %lld / 0x%llx (seq %d < %d)\n", + jnl->jdev_name, blhdr_offset, blhdr_offset, blhdr->binfo[0].u.bi.b.sequence_num, last_sequence_num); + continue; + } - printf("jnl: %s: txn sequence numbers out of order in txn @ %lld / %llx! (%d < %d)\n", - jnl->jdev_name, blhdr_offset, blhdr_offset, blhdr->binfo[0].u.bi.b.sequence_num, last_sequence_num); - bad_blocks = 1; - goto bad_txn_handling; + printf("jnl: %s: txn sequence numbers out of order in txn @ %lld / %llx! 
(%d < %d)\n", + jnl->jdev_name, blhdr_offset, blhdr_offset, blhdr->binfo[0].u.bi.b.sequence_num, last_sequence_num); + bad_blocks = 1; + goto bad_txn_handling; } last_sequence_num = blhdr->binfo[0].u.bi.b.sequence_num; if (blhdr_offset >= jnl->jhdr->end && jnl->jhdr->start <= jnl->jhdr->end) { - if (last_sequence_num == 0) { - check_past_jnl_end = 0; - printf("jnl: %s: pre-sequence-num-enabled txn's - can not go further than end (%lld %lld).\n", - jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end); - if (jnl->jhdr->start != jnl->jhdr->end) { - jnl->jhdr->start = jnl->jhdr->end; + if (last_sequence_num == 0) { + check_past_jnl_end = 0; + printf("jnl: %s: pre-sequence-num-enabled txn's - can not go further than end (%lld %lld).\n", + jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end); + if (jnl->jhdr->start != jnl->jhdr->end) { + jnl->jhdr->start = jnl->jhdr->end; + } + continue; } - continue; - } - printf("jnl: %s: examining extra transactions starting @ %lld / 0x%llx\n", jnl->jdev_name, blhdr_offset, blhdr_offset); + printf("jnl: %s: examining extra transactions starting @ %lld / 0x%llx\n", jnl->jdev_name, blhdr_offset, blhdr_offset); } if ( blhdr->max_blocks <= 0 || blhdr->max_blocks > (jnl->jhdr->size/jnl->jhdr->jhdr_size) - || blhdr->num_blocks <= 0 || blhdr->num_blocks > blhdr->max_blocks) { - printf("jnl: %s: replay_journal: bad looking journal entry: max: %d num: %d\n", - jnl->jdev_name, blhdr->max_blocks, blhdr->num_blocks); - bad_blocks = 1; - goto bad_txn_handling; + || blhdr->num_blocks <= 0 || blhdr->num_blocks > blhdr->max_blocks) { + printf("jnl: %s: replay_journal: bad looking journal entry: max: %d num: %d\n", + jnl->jdev_name, blhdr->max_blocks, blhdr->num_blocks); + bad_blocks = 1; + goto bad_txn_handling; } max_bsize = 0; - for(i=1; i < blhdr->num_blocks; i++) { + for (i = 1; i < blhdr->num_blocks; i++) { if (blhdr->binfo[i].bnum < 0 && blhdr->binfo[i].bnum != (off_t)-1) { - printf("jnl: %s: replay_journal: bogus block number 0x%llx\n", 
jnl->jdev_name, blhdr->binfo[i].bnum); - bad_blocks = 1; - goto bad_txn_handling; + printf("jnl: %s: replay_journal: bogus block number 0x%llx\n", jnl->jdev_name, blhdr->binfo[i].bnum); + bad_blocks = 1; + goto bad_txn_handling; } if ((size_t)blhdr->binfo[i].u.bi.bsize > max_bsize) { - max_bsize = blhdr->binfo[i].u.bi.bsize; + max_bsize = blhdr->binfo[i].u.bi.bsize; } } if (blhdr->flags & BLHDR_CHECK_CHECKSUMS) { - check_block_checksums = 1; - if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize)) { - goto bad_replay; - } + check_block_checksums = 1; + if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize)) { + goto bad_replay; + } } else { - block_ptr = NULL; + block_ptr = NULL; } if (blhdr->flags & BLHDR_FIRST_HEADER) { - txn_start_offset = blhdr_offset; + txn_start_offset = blhdr_offset; } //printf("jnl: replay_journal: adding %d blocks in journal entry @ 0x%llx to co_buf\n", // blhdr->num_blocks-1, jnl->jhdr->start); bad_blocks = 0; - for(i=1; i < blhdr->num_blocks; i++) { + for (i = 1; i < blhdr->num_blocks; i++) { int size, ret_val; off_t number; @@ -1204,48 +1273,48 @@ replay_journal(journal *jnl) // don't add "killed" blocks if (number == (off_t)-1) { - //printf("jnl: replay_journal: skipping killed fs block (index %d)\n", i); + //printf("jnl: replay_journal: skipping killed fs block (index %d)\n", i); } else { - if (check_block_checksums) { - int32_t disk_cksum; - off_t block_offset; + if (check_block_checksums) { + int32_t disk_cksum; + off_t block_offset; - block_offset = offset; + block_offset = offset; - // read the block so we can check the checksum - ret = read_journal_data(jnl, &block_offset, block_ptr, size); - if (ret != (size_t)size) { - printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl->jdev_name, offset); - bad_blocks = 1; - goto bad_txn_handling; - } + // read the block so we can check the checksum + ret = read_journal_data(jnl, &block_offset, block_ptr, size); + if (ret != 
(size_t)size) { + printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl->jdev_name, offset); + bad_blocks = 1; + goto bad_txn_handling; + } - disk_cksum = calc_checksum(block_ptr, size); - - // there is no need to swap the checksum from disk because - // it got swapped when the blhdr was read in. - if (blhdr->binfo[i].u.bi.b.cksum != 0 && disk_cksum != blhdr->binfo[i].u.bi.b.cksum) { - printf("jnl: %s: txn starting at %lld (%lld) @ index %3d bnum %lld (%d) with disk cksum != blhdr cksum (0x%.8x 0x%.8x)\n", - jnl->jdev_name, txn_start_offset, blhdr_offset, i, number, size, disk_cksum, blhdr->binfo[i].u.bi.b.cksum); - printf("jnl: 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x\n", - *(int *)&block_ptr[0*sizeof(int)], *(int *)&block_ptr[1*sizeof(int)], *(int *)&block_ptr[2*sizeof(int)], *(int *)&block_ptr[3*sizeof(int)], - *(int *)&block_ptr[4*sizeof(int)], *(int *)&block_ptr[5*sizeof(int)], *(int *)&block_ptr[6*sizeof(int)], *(int *)&block_ptr[7*sizeof(int)]); - - bad_blocks = 1; - goto bad_txn_handling; + disk_cksum = calc_checksum(block_ptr, size); + + // there is no need to swap the checksum from disk because + // it got swapped when the blhdr was read in. 
+ if (blhdr->binfo[i].u.bi.b.cksum != 0 && disk_cksum != blhdr->binfo[i].u.bi.b.cksum) { + printf("jnl: %s: txn starting at %lld (%lld) @ index %3d bnum %lld (%d) with disk cksum != blhdr cksum (0x%.8x 0x%.8x)\n", + jnl->jdev_name, txn_start_offset, blhdr_offset, i, number, size, disk_cksum, blhdr->binfo[i].u.bi.b.cksum); + printf("jnl: 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x\n", + *(int *)&block_ptr[0*sizeof(int)], *(int *)&block_ptr[1*sizeof(int)], *(int *)&block_ptr[2*sizeof(int)], *(int *)&block_ptr[3*sizeof(int)], + *(int *)&block_ptr[4*sizeof(int)], *(int *)&block_ptr[5*sizeof(int)], *(int *)&block_ptr[6*sizeof(int)], *(int *)&block_ptr[7*sizeof(int)]); + + bad_blocks = 1; + goto bad_txn_handling; + } } - } - // add this bucket to co_buf, coalescing where possible - // printf("jnl: replay_journal: adding block 0x%llx\n", number); - ret_val = add_block(jnl, &co_buf, number, size, (size_t) offset, blhdr->binfo[i].u.bi.b.cksum, &num_buckets, &num_full); + // add this bucket to co_buf, coalescing where possible + // printf("jnl: replay_journal: adding block 0x%llx\n", number); + ret_val = add_block(jnl, &co_buf, number, size, (size_t) offset, blhdr->binfo[i].u.bi.b.cksum, &num_buckets, &num_full); - if (ret_val == -1) { - printf("jnl: %s: replay_journal: trouble adding block to co_buf\n", jnl->jdev_name); - goto bad_replay; - } // else printf("jnl: replay_journal: added block 0x%llx at i=%d\n", number); + if (ret_val == -1) { + printf("jnl: %s: replay_journal: trouble adding block to co_buf\n", jnl->jdev_name); + goto bad_replay; + } // else printf("jnl: replay_journal: added block 0x%llx at i=%d\n", number); } // increment offset @@ -1256,28 +1325,28 @@ replay_journal(journal *jnl) // into account // if (offset >= jnl->jhdr->size) { - offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size); + offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size); } } if (block_ptr) { - kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize); - block_ptr 
= NULL; + kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize); + block_ptr = NULL; } - bad_txn_handling: +bad_txn_handling: if (bad_blocks) { - if (txn_start_offset == 0) { - printf("jnl: %s: no known good txn start offset! aborting journal replay.\n", jnl->jdev_name); - goto bad_replay; - } + if (txn_start_offset == 0) { + printf("jnl: %s: no known good txn start offset! aborting journal replay.\n", jnl->jdev_name); + goto bad_replay; + } - jnl->jhdr->start = orig_jnl_start; - jnl->jhdr->end = txn_start_offset; - check_past_jnl_end = 0; - last_sequence_num = 0; - printf("jnl: %s: restarting journal replay (%lld - %lld)!\n", jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end); - goto restart_replay; + jnl->jhdr->start = orig_jnl_start; + jnl->jhdr->end = txn_start_offset; + check_past_jnl_end = 0; + last_sequence_num = 0; + printf("jnl: %s: restarting journal replay (%lld - %lld)!\n", jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end); + goto restart_replay; } jnl->jhdr->start += blhdr->bytes_used; @@ -1287,98 +1356,98 @@ replay_journal(journal *jnl) } if (jnl->jhdr->start == jnl->jhdr->end) { - in_uncharted_territory = 1; + in_uncharted_territory = 1; } - } + } - if (jnl->jhdr->start != jnl->jhdr->end) { - printf("jnl: %s: start %lld != end %lld. resetting end.\n", jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end); - jnl->jhdr->end = jnl->jhdr->start; - } + if (jnl->jhdr->start != jnl->jhdr->end) { + printf("jnl: %s: start %lld != end %lld. 
resetting end.\n", jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end); + jnl->jhdr->end = jnl->jhdr->start; + } - //printf("jnl: replay_journal: replaying %d blocks\n", num_full); + //printf("jnl: replay_journal: replaying %d blocks\n", num_full); - /* - * make sure it's at least one page in size, so - * start max_bsize at PAGE_SIZE - */ - for (i = 0, max_bsize = PAGE_SIZE; i < num_full; i++) { - - if (co_buf[i].block_num == (off_t)-1) - continue; + /* + * make sure it's at least one page in size, so + * start max_bsize at PAGE_SIZE + */ + for (i = 0, max_bsize = PAGE_SIZE; i < num_full; i++) { - if (co_buf[i].block_size > max_bsize) - max_bsize = co_buf[i].block_size; - } - /* - * round max_bsize up to the nearest PAGE_SIZE multiple - */ - if (max_bsize & (PAGE_SIZE - 1)) { - max_bsize = (max_bsize + PAGE_SIZE) & ~(PAGE_SIZE - 1); - } + if (co_buf[i].block_num == (off_t)-1) + continue; - if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize)) { - goto bad_replay; - } + if (co_buf[i].block_size > max_bsize) + max_bsize = co_buf[i].block_size; + } + /* + * round max_bsize up to the nearest PAGE_SIZE multiple + */ + if (max_bsize & (PAGE_SIZE - 1)) { + max_bsize = (max_bsize + PAGE_SIZE) & ~(PAGE_SIZE - 1); + } + + if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize)) { + goto bad_replay; + } - // Replay the coalesced entries in the co-buf - for(i=0; i < num_full; i++) { - size_t size = co_buf[i].block_size; - off_t jnl_offset = (off_t) co_buf[i].jnl_offset; - off_t number = co_buf[i].block_num; + // Replay the coalesced entries in the co-buf + for(i = 0; i < num_full; i++) { + size_t size = co_buf[i].block_size; + off_t jnl_offset = (off_t) co_buf[i].jnl_offset; + off_t number = co_buf[i].block_num; - // printf("replaying co_buf[%d]: block 0x%llx, size 0x%x, jnl_offset 0x%llx\n", i, co_buf[i].block_num, - // co_buf[i].block_size, co_buf[i].jnl_offset); + // printf("replaying co_buf[%d]: block 0x%llx, size 0x%x, jnl_offset 0x%llx\n", i, 
co_buf[i].block_num, + // co_buf[i].block_size, co_buf[i].jnl_offset); - if (number == (off_t)-1) { - // printf("jnl: replay_journal: skipping killed fs block\n"); - } else { + if (number == (off_t)-1) { + // printf("jnl: replay_journal: skipping killed fs block\n"); + } else { - // do journal read, and set the phys. block - ret = read_journal_data(jnl, &jnl_offset, block_ptr, size); - if (ret != size) { - printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl->jdev_name, offset); - goto bad_replay; - } + // do journal read, and set the phys. block + ret = read_journal_data(jnl, &jnl_offset, block_ptr, size); + if (ret != size) { + printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl->jdev_name, offset); + goto bad_replay; + } - if (update_fs_block(jnl, block_ptr, number, size) != 0) { - goto bad_replay; - } + if (update_fs_block(jnl, block_ptr, number, size) != 0) { + goto bad_replay; + } + } } - } + + // done replaying; update jnl header + if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num) != 0) { + goto bad_replay; + } - // done replaying; update jnl header - if (write_journal_header(jnl, 1) != 0) { - goto bad_replay; - } - - printf("jnl: %s: journal replay done.\n", jnl->jdev_name); + printf("jnl: %s: journal replay done.\n", jnl->jdev_name); - // free block_ptr - if (block_ptr) { - kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize); - block_ptr = NULL; - } + // free block_ptr + if (block_ptr) { + kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize); + block_ptr = NULL; + } - // free the coalesce buffer - FREE(co_buf, M_TEMP); - co_buf = NULL; + // free the coalesce buffer + FREE(co_buf, M_TEMP); + co_buf = NULL; - kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size); - return 0; + kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size); + return 0; - bad_replay: - if (block_ptr) { +bad_replay: + if (block_ptr) { kmem_free(kernel_map, 
(vm_offset_t)block_ptr, max_bsize); - } - if (co_buf) { - FREE(co_buf, M_TEMP); - } - kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size); + } + if (co_buf) { + FREE(co_buf, M_TEMP); + } + kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size); - return -1; + return -1; } @@ -1413,11 +1482,11 @@ size_up_tbuffer(journal *jnl, int tbuffer_size, int phys_blksz) } } - // size up the transaction buffer... can't be larger than the number - // of blocks that can fit in a block_list_header block. - if (tbuffer_size == 0) { + // size up the transaction buffer... can't be larger than the number + // of blocks that can fit in a block_list_header block. + if (tbuffer_size == 0) { jnl->tbuffer_size = def_tbuffer_size; - } else { + } else { // make sure that the specified tbuffer_size isn't too small if (tbuffer_size < jnl->jhdr->blhdr_size * 2) { tbuffer_size = jnl->jhdr->blhdr_size * 2; @@ -1428,23 +1497,23 @@ size_up_tbuffer(journal *jnl, int tbuffer_size, int phys_blksz) } jnl->tbuffer_size = tbuffer_size; - } + } - if (jnl->tbuffer_size > (jnl->jhdr->size / 2)) { + if (jnl->tbuffer_size > (jnl->jhdr->size / 2)) { jnl->tbuffer_size = (jnl->jhdr->size / 2); - } + } - if (jnl->tbuffer_size > MAX_TRANSACTION_BUFFER_SIZE) { + if (jnl->tbuffer_size > MAX_TRANSACTION_BUFFER_SIZE) { jnl->tbuffer_size = MAX_TRANSACTION_BUFFER_SIZE; - } + } - jnl->jhdr->blhdr_size = (jnl->tbuffer_size / jnl->jhdr->jhdr_size) * sizeof(block_info); - if (jnl->jhdr->blhdr_size < phys_blksz) { - jnl->jhdr->blhdr_size = phys_blksz; - } else if ((jnl->jhdr->blhdr_size % phys_blksz) != 0) { + jnl->jhdr->blhdr_size = (jnl->tbuffer_size / jnl->jhdr->jhdr_size) * sizeof(block_info); + if (jnl->jhdr->blhdr_size < phys_blksz) { + jnl->jhdr->blhdr_size = phys_blksz; + } else if ((jnl->jhdr->blhdr_size % phys_blksz) != 0) { // have to round up so we're an even multiple of the physical block size jnl->jhdr->blhdr_size = (jnl->jhdr->blhdr_size + (phys_blksz - 1)) & ~(phys_blksz - 1); - } + } } 
@@ -1452,96 +1521,99 @@ size_up_tbuffer(journal *jnl, int tbuffer_size, int phys_blksz) static void get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl, struct vfs_context *context) { - off_t readblockcnt; - off_t writeblockcnt; - off_t readmaxcnt=0, tmp_readmaxcnt; - off_t writemaxcnt=0, tmp_writemaxcnt; - off_t readsegcnt, writesegcnt; - int32_t features; - - if (VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&features, 0, context) == 0) { - if (features & DK_FEATURE_FORCE_UNIT_ACCESS) { - const char *name = vnode_name(devvp); - jnl->flags |= JOURNAL_DO_FUA_WRITES; - printf("jnl: %s: enabling FUA writes (features 0x%x)\n", name ? name : "no-name-dev", features); + off_t readblockcnt; + off_t writeblockcnt; + off_t readmaxcnt=0, tmp_readmaxcnt; + off_t writemaxcnt=0, tmp_writemaxcnt; + off_t readsegcnt, writesegcnt; + int32_t features; + + if (VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&features, 0, context) == 0) { + if (features & DK_FEATURE_FORCE_UNIT_ACCESS) { + const char *name = vnode_name(devvp); + jnl->flags |= JOURNAL_DO_FUA_WRITES; + printf("jnl: %s: enabling FUA writes (features 0x%x)\n", name ? name : "no-name-dev", features); + } + if (features & DK_FEATURE_UNMAP) { + jnl->flags |= JOURNAL_USE_UNMAP; + } } - } - // - // First check the max read size via several different mechanisms... - // - VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD, (caddr_t)&readmaxcnt, 0, context); + // + // First check the max read size via several different mechanisms... 
+ // + VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD, (caddr_t)&readmaxcnt, 0, context); - if (VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD, (caddr_t)&readblockcnt, 0, context) == 0) { - tmp_readmaxcnt = readblockcnt * phys_blksz; - if (readmaxcnt == 0 || (readblockcnt > 0 && tmp_readmaxcnt < readmaxcnt)) { - readmaxcnt = tmp_readmaxcnt; - } - } + if (VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD, (caddr_t)&readblockcnt, 0, context) == 0) { + tmp_readmaxcnt = readblockcnt * phys_blksz; + if (readmaxcnt == 0 || (readblockcnt > 0 && tmp_readmaxcnt < readmaxcnt)) { + readmaxcnt = tmp_readmaxcnt; + } + } - if (VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD, (caddr_t)&readsegcnt, 0, context)) { - readsegcnt = 0; - } + if (VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD, (caddr_t)&readsegcnt, 0, context)) { + readsegcnt = 0; + } - if (readsegcnt > 0 && (readsegcnt * PAGE_SIZE) < readmaxcnt) { - readmaxcnt = readsegcnt * PAGE_SIZE; - } + if (readsegcnt > 0 && (readsegcnt * PAGE_SIZE) < readmaxcnt) { + readmaxcnt = readsegcnt * PAGE_SIZE; + } - if (readmaxcnt == 0) { - readmaxcnt = 128 * 1024; - } else if (readmaxcnt > UINT32_MAX) { - readmaxcnt = UINT32_MAX; - } + if (readmaxcnt == 0) { + readmaxcnt = 128 * 1024; + } else if (readmaxcnt > UINT32_MAX) { + readmaxcnt = UINT32_MAX; + } - // - // Now check the max writes size via several different mechanisms... - // - VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE, (caddr_t)&writemaxcnt, 0, context); + // + // Now check the max writes size via several different mechanisms... 
+ // + VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE, (caddr_t)&writemaxcnt, 0, context); - if (VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE, (caddr_t)&writeblockcnt, 0, context) == 0) { - tmp_writemaxcnt = writeblockcnt * phys_blksz; - if (writemaxcnt == 0 || (writeblockcnt > 0 && tmp_writemaxcnt < writemaxcnt)) { - writemaxcnt = tmp_writemaxcnt; - } - } + if (VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE, (caddr_t)&writeblockcnt, 0, context) == 0) { + tmp_writemaxcnt = writeblockcnt * phys_blksz; + if (writemaxcnt == 0 || (writeblockcnt > 0 && tmp_writemaxcnt < writemaxcnt)) { + writemaxcnt = tmp_writemaxcnt; + } + } - if (VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE, (caddr_t)&writesegcnt, 0, context)) { - writesegcnt = 0; - } + if (VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE, (caddr_t)&writesegcnt, 0, context)) { + writesegcnt = 0; + } - if (writesegcnt > 0 && (writesegcnt * PAGE_SIZE) < writemaxcnt) { - writemaxcnt = writesegcnt * PAGE_SIZE; - } + if (writesegcnt > 0 && (writesegcnt * PAGE_SIZE) < writemaxcnt) { + writemaxcnt = writesegcnt * PAGE_SIZE; + } - if (writemaxcnt == 0) { - writemaxcnt = 128 * 1024; - } else if (writemaxcnt > UINT32_MAX) { - writemaxcnt = UINT32_MAX; - } + if (writemaxcnt == 0) { + writemaxcnt = 128 * 1024; + } else if (writemaxcnt > UINT32_MAX) { + writemaxcnt = UINT32_MAX; + } - jnl->max_read_size = readmaxcnt; - jnl->max_write_size = writemaxcnt; - // printf("jnl: %s: max read/write: %lld k / %lld k\n", - // jnl->jdev_name ? jnl->jdev_name : "unknown", - // jnl->max_read_size/1024, jnl->max_write_size/1024); + jnl->max_read_size = readmaxcnt; + jnl->max_write_size = writemaxcnt; + // printf("jnl: %s: max read/write: %lld k / %lld k\n", + // jnl->jdev_name ? 
jnl->jdev_name : "unknown", + // jnl->max_read_size/1024, jnl->max_write_size/1024); } static const char * get_jdev_name(struct vnode *jvp) { - const char *jdev_name; + const char *jdev_name; - jdev_name = vnode_name(jvp); - if (jdev_name == NULL) { - jdev_name = vfs_addname("unknown-dev", strlen("unknown-dev"), 0, 0); - } else { - // this just bumps the refcount on the name so we have our own copy - jdev_name = vfs_addname(jdev_name, strlen(jdev_name), 0, 0); - } + jdev_name = vnode_name(jvp); + if (jdev_name == NULL) { + jdev_name = vfs_addname("unknown-dev", strlen("unknown-dev"), 0, 0); + } else { + // this just bumps the refcount on the name so we have our own copy + jdev_name = vfs_addname(jdev_name, strlen(jdev_name), 0, 0); + } - return jdev_name; + return jdev_name; } @@ -1556,143 +1628,167 @@ journal_create(struct vnode *jvp, void (*flush)(void *arg), void *arg) { - journal *jnl; - uint32_t phys_blksz, new_txn_base; - struct vfs_context context; - const char *jdev_name; + journal *jnl; + uint32_t phys_blksz, new_txn_base; + u_int32_t min_size; + struct vfs_context context; + const char *jdev_name; + /* + * Cap the journal max size to 2GB. On HFS, it will attempt to occupy + * a full allocation block if the current size is smaller than the allocation + * block on which it resides. Once we hit the exabyte filesystem range, then + * it will use 2GB allocation blocks. As a result, make the cap 2GB. + */ + context.vc_thread = current_thread(); + context.vc_ucred = FSCRED; - context.vc_thread = current_thread(); - context.vc_ucred = FSCRED; + jdev_name = get_jdev_name(jvp); - jdev_name = get_jdev_name(jvp); + /* Get the real physical block size. */ + if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) { + return NULL; + } - /* Get the real physical block size. 
*/ - if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) { - return NULL; - } + if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) { + printf("jnl: create: journal size %lld looks bogus.\n", journal_size); + return NULL; + } - if (journal_size < (256*1024) || journal_size > (1024*1024*1024)) { - printf("jnl: create: journal size %lld looks bogus.\n", journal_size); - return NULL; - } + min_size = phys_blksz * (phys_blksz / sizeof(block_info)); + /* Reject journals that are too small given the sector size of the device */ + if (journal_size < min_size) { + printf("jnl: create: journal size (%lld) too small given sector size of (%u)\n", + journal_size, phys_blksz); + return NULL; + } - if (phys_blksz > min_fs_blksz) { + if (phys_blksz > min_fs_blksz) { printf("jnl: %s: create: error: phys blksize %u bigger than min fs blksize %zd\n", - jdev_name, phys_blksz, min_fs_blksz); + jdev_name, phys_blksz, min_fs_blksz); return NULL; - } + } - if ((journal_size % phys_blksz) != 0) { + if ((journal_size % phys_blksz) != 0) { printf("jnl: %s: create: journal size 0x%llx is not an even multiple of block size 0x%ux\n", - jdev_name, journal_size, phys_blksz); + jdev_name, journal_size, phys_blksz); return NULL; - } + } - MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK); - memset(jnl, 0, sizeof(*jnl)); + MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK); + memset(jnl, 0, sizeof(*jnl)); - jnl->jdev = jvp; - jnl->jdev_offset = offset; - jnl->fsdev = fsvp; - jnl->flush = flush; - jnl->flush_arg = arg; - jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK); - jnl->jdev_name = jdev_name; - lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr); + jnl->jdev = jvp; + jnl->jdev_offset = offset; + jnl->fsdev = fsvp; + jnl->flush = flush; + jnl->flush_arg = arg; + jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK); + jnl->jdev_name = jdev_name; + lck_mtx_init(&jnl->old_start_lock, 
jnl_mutex_group, jnl_lock_attr); - get_io_info(jvp, phys_blksz, jnl, &context); + get_io_info(jvp, phys_blksz, jnl, &context); - if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) { - printf("jnl: %s: create: could not allocate space for header buffer (%u bytes)\n", jdev_name, phys_blksz); - goto bad_kmem_alloc; - } - jnl->header_buf_size = phys_blksz; + if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) { + printf("jnl: %s: create: could not allocate space for header buffer (%u bytes)\n", jdev_name, phys_blksz); + goto bad_kmem_alloc; + } + jnl->header_buf_size = phys_blksz; - jnl->jhdr = (journal_header *)jnl->header_buf; - memset(jnl->jhdr, 0, sizeof(journal_header)); + jnl->jhdr = (journal_header *)jnl->header_buf; + memset(jnl->jhdr, 0, sizeof(journal_header)); - // we have to set this up here so that do_journal_io() will work - jnl->jhdr->jhdr_size = phys_blksz; + // we have to set this up here so that do_journal_io() will work + jnl->jhdr->jhdr_size = phys_blksz; - // - // We try and read the journal header to see if there is already one - // out there. If there is, it's possible that it has transactions - // in it that we might replay if we happen to pick a sequence number - // that is a little less than the old one, there is a crash and the - // last txn written ends right at the start of a txn from the previous - // incarnation of this file system. If all that happens we would - // replay the transactions from the old file system and that would - // destroy your disk. Although it is extremely unlikely for all those - // conditions to happen, the probability is non-zero and the result is - // severe - you lose your file system. Therefore if we find a valid - // journal header and the sequence number is non-zero we write junk - // over the entire journal so that there is no way we will encounter - // any old transactions. This is slow but should be a rare event - // since most tools erase the journal. 
- // - if ( read_journal_header(jnl, jnl->jhdr, phys_blksz) == phys_blksz - && jnl->jhdr->magic == JOURNAL_HEADER_MAGIC - && jnl->jhdr->sequence_num != 0) { + // + // We try and read the journal header to see if there is already one + // out there. If there is, it's possible that it has transactions + // in it that we might replay if we happen to pick a sequence number + // that is a little less than the old one, there is a crash and the + // last txn written ends right at the start of a txn from the previous + // incarnation of this file system. If all that happens we would + // replay the transactions from the old file system and that would + // destroy your disk. Although it is extremely unlikely for all those + // conditions to happen, the probability is non-zero and the result is + // severe - you lose your file system. Therefore if we find a valid + // journal header and the sequence number is non-zero we write junk + // over the entire journal so that there is no way we will encounter + // any old transactions. This is slow but should be a rare event + // since most tools erase the journal. 
+ // + if ( read_journal_header(jnl, jnl->jhdr, phys_blksz) == phys_blksz + && jnl->jhdr->magic == JOURNAL_HEADER_MAGIC + && jnl->jhdr->sequence_num != 0) { - new_txn_base = (jnl->jhdr->sequence_num + (journal_size / phys_blksz) + (random() % 16384)) & 0x00ffffff; - printf("jnl: create: avoiding old sequence number 0x%x (0x%x)\n", jnl->jhdr->sequence_num, new_txn_base); + new_txn_base = (jnl->jhdr->sequence_num + (journal_size / phys_blksz) + (random() % 16384)) & 0x00ffffff; + printf("jnl: create: avoiding old sequence number 0x%x (0x%x)\n", jnl->jhdr->sequence_num, new_txn_base); #if 0 - int i; - off_t pos=0; + int i; + off_t pos=0; - for(i=1; i < journal_size / phys_blksz; i++) { - pos = i*phys_blksz; + for(i = 1; i < journal_size / phys_blksz; i++) { + pos = i*phys_blksz; - // we don't really care what data we write just so long - // as it's not a valid transaction header. since we have - // the header_buf sitting around we'll use that. - write_journal_data(jnl, &pos, jnl->header_buf, phys_blksz); - } - printf("jnl: create: done clearing journal (i=%d)\n", i); + // we don't really care what data we write just so long + // as it's not a valid transaction header. since we have + // the header_buf sitting around we'll use that. 
+ write_journal_data(jnl, &pos, jnl->header_buf, phys_blksz); + } + printf("jnl: create: done clearing journal (i=%d)\n", i); #endif - } else { - new_txn_base = random() & 0x00ffffff; - } + } else { + new_txn_base = random() & 0x00ffffff; + } - memset(jnl->header_buf, 0, phys_blksz); + memset(jnl->header_buf, 0, phys_blksz); - jnl->jhdr->magic = JOURNAL_HEADER_MAGIC; - jnl->jhdr->endian = ENDIAN_MAGIC; - jnl->jhdr->start = phys_blksz; // start at block #1, block #0 is for the jhdr itself - jnl->jhdr->end = phys_blksz; - jnl->jhdr->size = journal_size; - jnl->jhdr->jhdr_size = phys_blksz; - size_up_tbuffer(jnl, tbuffer_size, phys_blksz); - - jnl->active_start = jnl->jhdr->start; - - // XXXdbg - for testing you can force the journal to wrap around - // jnl->jhdr->start = jnl->jhdr->size - (phys_blksz*3); - // jnl->jhdr->end = jnl->jhdr->size - (phys_blksz*3); + jnl->jhdr->magic = JOURNAL_HEADER_MAGIC; + jnl->jhdr->endian = ENDIAN_MAGIC; + jnl->jhdr->start = phys_blksz; // start at block #1, block #0 is for the jhdr itself + jnl->jhdr->end = phys_blksz; + jnl->jhdr->size = journal_size; + jnl->jhdr->jhdr_size = phys_blksz; + size_up_tbuffer(jnl, tbuffer_size, phys_blksz); + + jnl->active_start = jnl->jhdr->start; + + // XXXdbg - for testing you can force the journal to wrap around + // jnl->jhdr->start = jnl->jhdr->size - (phys_blksz*3); + // jnl->jhdr->end = jnl->jhdr->size - (phys_blksz*3); - jnl->jhdr->sequence_num = new_txn_base; + jnl->jhdr->sequence_num = new_txn_base; - lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr); + lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr); + lck_mtx_init(&jnl->flock, jnl_mutex_group, jnl_lock_attr); + lck_rw_init(&jnl->trim_lock, jnl_mutex_group, jnl_lock_attr); + + jnl->flushing = FALSE; + jnl->asyncIO = FALSE; + jnl->flush_aborted = FALSE; + jnl->writing_header = FALSE; + jnl->async_trim = NULL; + jnl->sequence_num = jnl->jhdr->sequence_num; + + if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num) != 0) { 
+ printf("jnl: %s: journal_create: failed to write journal header.\n", jdev_name); + goto bad_write; + } - if (write_journal_header(jnl, 1) != 0) { - printf("jnl: %s: journal_create: failed to write journal header.\n", jdev_name); - goto bad_write; - } + return jnl; - return jnl; +bad_write: + kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz); +bad_kmem_alloc: + if (jdev_name) { + vfs_removename(jdev_name); + } + jnl->jhdr = NULL; + FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL); - bad_write: - kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz); - bad_kmem_alloc: - if (jdev_name) { - vfs_removename(jdev_name); - } - jnl->jhdr = NULL; - FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL); - return NULL; + return NULL; } @@ -1707,69 +1803,78 @@ journal_open(struct vnode *jvp, void (*flush)(void *arg), void *arg) { - journal *jnl; - uint32_t orig_blksz=0; - uint32_t phys_blksz; - int orig_checksum, checksum; - struct vfs_context context; - const char *jdev_name = get_jdev_name(jvp); - - context.vc_thread = current_thread(); - context.vc_ucred = FSCRED; - - /* Get the real physical block size. */ - if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) { + journal *jnl; + uint32_t orig_blksz=0; + uint32_t phys_blksz; + u_int32_t min_size = 0; + int orig_checksum, checksum; + struct vfs_context context; + const char *jdev_name = get_jdev_name(jvp); + + context.vc_thread = current_thread(); + context.vc_ucred = FSCRED; + + /* Get the real physical block size. 
*/ + if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) { return NULL; - } + } - if (phys_blksz > min_fs_blksz) { + if (phys_blksz > min_fs_blksz) { printf("jnl: %s: open: error: phys blksize %u bigger than min fs blksize %zd\n", - jdev_name, phys_blksz, min_fs_blksz); + jdev_name, phys_blksz, min_fs_blksz); return NULL; - } + } - if (journal_size < (256*1024) || journal_size > (1024*1024*1024)) { - printf("jnl: open: journal size %lld looks bogus.\n", journal_size); - return NULL; - } + if (journal_size < (256*1024) || journal_size > (1024*1024*1024)) { + printf("jnl: open: journal size %lld looks bogus.\n", journal_size); + return NULL; + } + + min_size = phys_blksz * (phys_blksz / sizeof(block_info)); + /* Reject journals that are too small given the sector size of the device */ + if (journal_size < min_size) { + printf("jnl: open: journal size (%lld) too small given sector size of (%u)\n", + journal_size, phys_blksz); + return NULL; + } - if ((journal_size % phys_blksz) != 0) { + if ((journal_size % phys_blksz) != 0) { printf("jnl: %s: open: journal size 0x%llx is not an even multiple of block size 0x%x\n", - jdev_name, journal_size, phys_blksz); + jdev_name, journal_size, phys_blksz); return NULL; - } + } - MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK); - memset(jnl, 0, sizeof(*jnl)); + MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK); + memset(jnl, 0, sizeof(*jnl)); - jnl->jdev = jvp; - jnl->jdev_offset = offset; - jnl->fsdev = fsvp; - jnl->flush = flush; - jnl->flush_arg = arg; - jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK); - jnl->jdev_name = jdev_name; - lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr); + jnl->jdev = jvp; + jnl->jdev_offset = offset; + jnl->fsdev = fsvp; + jnl->flush = flush; + jnl->flush_arg = arg; + jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK); + jnl->jdev_name = jdev_name; + lck_mtx_init(&jnl->old_start_lock, 
jnl_mutex_group, jnl_lock_attr); - get_io_info(jvp, phys_blksz, jnl, &context); + get_io_info(jvp, phys_blksz, jnl, &context); - if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) { - printf("jnl: %s: create: could not allocate space for header buffer (%u bytes)\n", jdev_name, phys_blksz); - goto bad_kmem_alloc; - } - jnl->header_buf_size = phys_blksz; + if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) { + printf("jnl: %s: create: could not allocate space for header buffer (%u bytes)\n", jdev_name, phys_blksz); + goto bad_kmem_alloc; + } + jnl->header_buf_size = phys_blksz; - jnl->jhdr = (journal_header *)jnl->header_buf; - memset(jnl->jhdr, 0, sizeof(journal_header)); + jnl->jhdr = (journal_header *)jnl->header_buf; + memset(jnl->jhdr, 0, sizeof(journal_header)); - // we have to set this up here so that do_journal_io() will work - jnl->jhdr->jhdr_size = phys_blksz; + // we have to set this up here so that do_journal_io() will work + jnl->jhdr->jhdr_size = phys_blksz; - if (read_journal_header(jnl, jnl->jhdr, phys_blksz) != phys_blksz) { + if (read_journal_header(jnl, jnl->jhdr, phys_blksz) != phys_blksz) { printf("jnl: %s: open: could not read %u bytes for the journal header.\n", - jdev_name, phys_blksz); + jdev_name, phys_blksz); goto bad_journal; - } + } orig_checksum = jnl->jhdr->checksum; jnl->jhdr->checksum = 0; @@ -1784,18 +1889,18 @@ journal_open(struct vnode *jvp, checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE); } - if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC && jnl->jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) { + if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC && jnl->jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) { printf("jnl: %s: open: journal magic is bad (0x%x != 0x%x)\n", - jnl->jdev_name, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC); + jnl->jdev_name, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC); goto bad_journal; - } + } // only check if we're the current journal header magic value if 
(jnl->jhdr->magic == JOURNAL_HEADER_MAGIC) { if (orig_checksum != checksum) { printf("jnl: %s: open: journal checksum is bad (0x%x != 0x%x)\n", - jdev_name, orig_checksum, checksum); + jdev_name, orig_checksum, checksum); //goto bad_journal; } @@ -1807,16 +1912,16 @@ journal_open(struct vnode *jvp, } if (phys_blksz != (size_t)jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) { - /* - * The volume has probably been resized (such that we had to adjust the - * logical sector size), or copied to media with a different logical - * sector size. + /* + * The volume has probably been resized (such that we had to adjust the + * logical sector size), or copied to media with a different logical + * sector size. * * Temporarily change the device's logical block size to match the * journal's header size. This will allow us to replay the journal * safely. If the replay succeeds, we will update the journal's header * size (later in this function). - */ + */ orig_blksz = phys_blksz; phys_blksz = jnl->jhdr->jhdr_size; @@ -1825,27 +1930,27 @@ journal_open(struct vnode *jvp, printf("jnl: %s: open: temporarily switched block size from %u to %u\n", jdev_name, orig_blksz, phys_blksz); } - - if ( jnl->jhdr->start <= 0 - || jnl->jhdr->start > jnl->jhdr->size - || jnl->jhdr->start > 1024*1024*1024) { + + if ( jnl->jhdr->start <= 0 + || jnl->jhdr->start > jnl->jhdr->size + || jnl->jhdr->start > 1024*1024*1024) { printf("jnl: %s: open: jhdr start looks bad (0x%llx max size 0x%llx)\n", - jdev_name, jnl->jhdr->start, jnl->jhdr->size); + jdev_name, jnl->jhdr->start, jnl->jhdr->size); goto bad_journal; - } + } - if ( jnl->jhdr->end <= 0 - || jnl->jhdr->end > jnl->jhdr->size - || jnl->jhdr->end > 1024*1024*1024) { + if ( jnl->jhdr->end <= 0 + || jnl->jhdr->end > jnl->jhdr->size + || jnl->jhdr->end > 1024*1024*1024) { printf("jnl: %s: open: jhdr end looks bad (0x%llx max size 0x%llx)\n", - jdev_name, jnl->jhdr->end, jnl->jhdr->size); + jdev_name, jnl->jhdr->end, jnl->jhdr->size); goto 
bad_journal; - } + } - if (jnl->jhdr->size < (256*1024) || jnl->jhdr->size > 1024*1024*1024) { - printf("jnl: %s: open: jhdr size looks bad (0x%llx)\n", jdev_name, jnl->jhdr->size); - goto bad_journal; - } + if (jnl->jhdr->size < (256*1024) || jnl->jhdr->size > 1024*1024*1024) { + printf("jnl: %s: open: jhdr size looks bad (0x%llx)\n", jdev_name, jnl->jhdr->size); + goto bad_journal; + } // XXXdbg - can't do these checks because hfs writes all kinds of // non-uniform sized blocks even on devices that have a block size @@ -1853,28 +1958,28 @@ journal_open(struct vnode *jvp, // therefore these checks will fail and so we just have to punt and // do more relaxed checking... // XXXdbg if ((jnl->jhdr->start % jnl->jhdr->jhdr_size) != 0) { - if ((jnl->jhdr->start % 512) != 0) { + if ((jnl->jhdr->start % 512) != 0) { printf("jnl: %s: open: journal start (0x%llx) not a multiple of 512?\n", - jdev_name, jnl->jhdr->start); + jdev_name, jnl->jhdr->start); goto bad_journal; - } + } //XXXdbg if ((jnl->jhdr->end % jnl->jhdr->jhdr_size) != 0) { - if ((jnl->jhdr->end % 512) != 0) { + if ((jnl->jhdr->end % 512) != 0) { printf("jnl: %s: open: journal end (0x%llx) not a multiple of block size (0x%x)?\n", - jdev_name, jnl->jhdr->end, jnl->jhdr->jhdr_size); + jdev_name, jnl->jhdr->end, jnl->jhdr->jhdr_size); goto bad_journal; - } + } - // take care of replaying the journal if necessary - if (flags & JOURNAL_RESET) { - printf("jnl: %s: journal start/end pointers reset! (jnl %p; s 0x%llx e 0x%llx)\n", - jdev_name, jnl, jnl->jhdr->start, jnl->jhdr->end); - jnl->jhdr->start = jnl->jhdr->end; - } else if (replay_journal(jnl) != 0) { - printf("jnl: %s: journal_open: Error replaying the journal!\n", jdev_name); - goto bad_journal; - } + // take care of replaying the journal if necessary + if (flags & JOURNAL_RESET) { + printf("jnl: %s: journal start/end pointers reset! 
(jnl %p; s 0x%llx e 0x%llx)\n", + jdev_name, jnl, jnl->jhdr->start, jnl->jhdr->end); + jnl->jhdr->start = jnl->jhdr->end; + } else if (replay_journal(jnl) != 0) { + printf("jnl: %s: journal_open: Error replaying the journal!\n", jdev_name); + goto bad_journal; + } /* * When we get here, we know that the journal is empty (jnl->jhdr->start == @@ -1891,6 +1996,7 @@ journal_open(struct vnode *jvp, VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context); phys_blksz = orig_blksz; orig_blksz = 0; + printf("jnl: %s: open: restored block size to %u\n", jdev_name, phys_blksz); jnl->jhdr->jhdr_size = phys_blksz; jnl->jhdr->start = phys_blksz; @@ -1899,23 +2005,24 @@ journal_open(struct vnode *jvp, (journal_size / phys_blksz) + (random() % 16384)) & 0x00ffffff; - if (write_journal_header(jnl, 1)) { + if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num)) { printf("jnl: %s: open: failed to update journal header size\n", jdev_name); goto bad_journal; } } - + // make sure this is in sync! jnl->active_start = jnl->jhdr->start; + jnl->sequence_num = jnl->jhdr->sequence_num; // set this now, after we've replayed the journal size_up_tbuffer(jnl, tbuffer_size, phys_blksz); // TODO: Does this need to change if the device's logical block size changed? 
if ((off_t)(jnl->jhdr->blhdr_size/sizeof(block_info)-1) > (jnl->jhdr->size/jnl->jhdr->jhdr_size)) { - printf("jnl: %s: open: jhdr size and blhdr size are not compatible (0x%llx, %d, %d)\n", jdev_name, jnl->jhdr->size, - jnl->jhdr->blhdr_size, jnl->jhdr->jhdr_size); - goto bad_journal; + printf("jnl: %s: open: jhdr size and blhdr size are not compatible (0x%llx, %d, %d)\n", jdev_name, jnl->jhdr->size, + jnl->jhdr->blhdr_size, jnl->jhdr->jhdr_size); + goto bad_journal; } lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr); @@ -1926,7 +2033,7 @@ journal_open(struct vnode *jvp, if (orig_blksz != 0) { phys_blksz = orig_blksz; VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context); - printf("jnl: %s: open: restored block size after error\n", jdev_name); + printf("jnl: %s: open: restored block size to %u after error\n", jdev_name, orig_blksz); } kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz); bad_kmem_alloc: @@ -1945,110 +2052,109 @@ journal_is_clean(struct vnode *jvp, struct vnode *fsvp, size_t min_fs_block_size) { - journal jnl; - uint32_t phys_blksz; - int ret; - int orig_checksum, checksum; - struct vfs_context context; - const char *jdev_name = get_jdev_name(jvp); - - context.vc_thread = current_thread(); - context.vc_ucred = FSCRED; - - /* Get the real physical block size. */ - if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) { - printf("jnl: %s: is_clean: failed to get device block size.\n", jdev_name); - return EINVAL; - } + journal jnl; + uint32_t phys_blksz; + int ret; + int orig_checksum, checksum; + struct vfs_context context; + const char *jdev_name = get_jdev_name(jvp); + + context.vc_thread = current_thread(); + context.vc_ucred = FSCRED; + + /* Get the real physical block size. 
*/ + if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) { + printf("jnl: %s: is_clean: failed to get device block size.\n", jdev_name); + return EINVAL; + } - if (phys_blksz > (uint32_t)min_fs_block_size) { - printf("jnl: %s: is_clean: error: phys blksize %d bigger than min fs blksize %zd\n", - jdev_name, phys_blksz, min_fs_block_size); - return EINVAL; - } + if (phys_blksz > (uint32_t)min_fs_block_size) { + printf("jnl: %s: is_clean: error: phys blksize %d bigger than min fs blksize %zd\n", + jdev_name, phys_blksz, min_fs_block_size); + return EINVAL; + } - if (journal_size < (256*1024) || journal_size > (1024*1024*1024)) { - printf("jnl: is_clean: journal size %lld looks bogus.\n", journal_size); - return EINVAL; - } + if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) { + printf("jnl: is_clean: journal size %lld looks bogus.\n", journal_size); + return EINVAL; + } - if ((journal_size % phys_blksz) != 0) { - printf("jnl: %s: is_clean: journal size 0x%llx is not an even multiple of block size 0x%x\n", - jdev_name, journal_size, phys_blksz); - return EINVAL; - } + if ((journal_size % phys_blksz) != 0) { + printf("jnl: %s: is_clean: journal size 0x%llx is not an even multiple of block size 0x%x\n", + jdev_name, journal_size, phys_blksz); + return EINVAL; + } - memset(&jnl, 0, sizeof(jnl)); + memset(&jnl, 0, sizeof(jnl)); - if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl.header_buf, phys_blksz)) { - printf("jnl: %s: is_clean: could not allocate space for header buffer (%d bytes)\n", jdev_name, phys_blksz); - return ENOMEM; - } - jnl.header_buf_size = phys_blksz; + if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl.header_buf, phys_blksz)) { + printf("jnl: %s: is_clean: could not allocate space for header buffer (%d bytes)\n", jdev_name, phys_blksz); + return ENOMEM; + } + jnl.header_buf_size = phys_blksz; - get_io_info(jvp, phys_blksz, &jnl, &context); + get_io_info(jvp, phys_blksz, &jnl, &context); - jnl.jhdr = 
(journal_header *)jnl.header_buf; - memset(jnl.jhdr, 0, sizeof(journal_header)); + jnl.jhdr = (journal_header *)jnl.header_buf; + memset(jnl.jhdr, 0, sizeof(journal_header)); - jnl.jdev = jvp; - jnl.jdev_offset = offset; - jnl.fsdev = fsvp; + jnl.jdev = jvp; + jnl.jdev_offset = offset; + jnl.fsdev = fsvp; - // we have to set this up here so that do_journal_io() will work - jnl.jhdr->jhdr_size = phys_blksz; + // we have to set this up here so that do_journal_io() will work + jnl.jhdr->jhdr_size = phys_blksz; - if (read_journal_header(&jnl, jnl.jhdr, phys_blksz) != (unsigned)phys_blksz) { - printf("jnl: %s: is_clean: could not read %d bytes for the journal header.\n", - jdev_name, phys_blksz); - ret = EINVAL; - goto get_out; - } + if (read_journal_header(&jnl, jnl.jhdr, phys_blksz) != (unsigned)phys_blksz) { + printf("jnl: %s: is_clean: could not read %d bytes for the journal header.\n", + jdev_name, phys_blksz); + ret = EINVAL; + goto get_out; + } - orig_checksum = jnl.jhdr->checksum; - jnl.jhdr->checksum = 0; - - if (jnl.jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) { - // do this before the swap since it's done byte-at-a-time - orig_checksum = SWAP32(orig_checksum); - checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE); - swap_journal_header(&jnl); - jnl.flags |= JOURNAL_NEED_SWAP; - } else { - checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE); - } + orig_checksum = jnl.jhdr->checksum; + jnl.jhdr->checksum = 0; - if (jnl.jhdr->magic != JOURNAL_HEADER_MAGIC && jnl.jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) { - printf("jnl: %s: is_clean: journal magic is bad (0x%x != 0x%x)\n", - jdev_name, jnl.jhdr->magic, JOURNAL_HEADER_MAGIC); - ret = EINVAL; - goto get_out; - } + if (jnl.jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) { + // do this before the swap since it's done byte-at-a-time + orig_checksum = SWAP32(orig_checksum); + checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE); + swap_journal_header(&jnl); + jnl.flags 
|= JOURNAL_NEED_SWAP; + } else { + checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE); + } - if (orig_checksum != checksum) { - printf("jnl: %s: is_clean: journal checksum is bad (0x%x != 0x%x)\n", jdev_name, orig_checksum, checksum); - ret = EINVAL; - goto get_out; - } + if (jnl.jhdr->magic != JOURNAL_HEADER_MAGIC && jnl.jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) { + printf("jnl: %s: is_clean: journal magic is bad (0x%x != 0x%x)\n", + jdev_name, jnl.jhdr->magic, JOURNAL_HEADER_MAGIC); + ret = EINVAL; + goto get_out; + } - // - // if the start and end are equal then the journal is clean. - // otherwise it's not clean and therefore an error. - // - if (jnl.jhdr->start == jnl.jhdr->end) { - ret = 0; - } else { - ret = EBUSY; // so the caller can differentiate an invalid journal from a "busy" one - } + if (orig_checksum != checksum) { + printf("jnl: %s: is_clean: journal checksum is bad (0x%x != 0x%x)\n", jdev_name, orig_checksum, checksum); + ret = EINVAL; + goto get_out; + } - get_out: - kmem_free(kernel_map, (vm_offset_t)jnl.header_buf, phys_blksz); - if (jdev_name) { - vfs_removename(jdev_name); - } - - return ret; + // + // if the start and end are equal then the journal is clean. + // otherwise it's not clean and therefore an error. + // + if (jnl.jhdr->start == jnl.jhdr->end) { + ret = 0; + } else { + ret = EBUSY; // so the caller can differentiate an invalid journal from a "busy" one + } +get_out: + kmem_free(kernel_map, (vm_offset_t)jnl.header_buf, phys_blksz); + if (jdev_name) { + vfs_removename(jdev_name); + } + + return ret; } @@ -2056,26 +2162,31 @@ journal_is_clean(struct vnode *jvp, void journal_close(journal *jnl) { - volatile off_t *start, *end; - int counter=0; + volatile off_t *start, *end; + int counter=0; - CHECK_JOURNAL(jnl); + CHECK_JOURNAL(jnl); // set this before doing anything that would block so that // we start tearing things down properly. 
// jnl->flags |= JOURNAL_CLOSE_PENDING; - if (jnl->owner != current_thread()) { + if (jnl->owner != current_thread()) { lock_journal(jnl); - } + } - // - // only write stuff to disk if the journal is still valid - // - if ((jnl->flags & JOURNAL_INVALID) == 0) { + wait_condition(jnl, &jnl->flushing, "journal_close"); + + // + // only write stuff to disk if the journal is still valid + // + if ((jnl->flags & JOURNAL_INVALID) == 0) { if (jnl->active_tr) { + /* + * "journal_end_transaction" will fire the flush asynchronously + */ journal_end_transaction(jnl); } @@ -2084,8 +2195,17 @@ journal_close(journal *jnl) transaction *tr = jnl->cur_tr; jnl->cur_tr = NULL; - end_transaction(tr, 1, NULL, NULL); // force it to get flushed + /* + * "end_transaction" will wait for any in-progress flush to complete + * before flushing "cur_tr" synchronously("must_wait" == TRUE) + */ + end_transaction(tr, 1, NULL, NULL, FALSE, TRUE); } + /* + * if there was an "active_tr", make sure we wait for + * it to flush if there was no "cur_tr" to process + */ + wait_condition(jnl, &jnl->flushing, "journal_close"); //start = &jnl->jhdr->start; start = &jnl->active_start; @@ -2101,20 +2221,22 @@ journal_close(journal *jnl) if (*start != *end) { printf("jnl: %s: close: buffer flushing didn't seem to flush out all the transactions! (0x%llx - 0x%llx)\n", - jnl->jdev_name, *start, *end); + jnl->jdev_name, *start, *end); } // make sure this is in sync when we close the journal jnl->jhdr->start = jnl->active_start; // if this fails there's not much we can do at this point... - write_journal_header(jnl, 1); - } else { + write_journal_header(jnl, 1, jnl->sequence_num); + } else { // if we're here the journal isn't valid any more. // so make sure we don't leave any locked blocks lying around printf("jnl: %s: close: journal %p, is invalid. 
aborting outstanding transactions\n", jnl->jdev_name, jnl); + if (jnl->active_tr || jnl->cur_tr) { transaction *tr; + if (jnl->active_tr) { tr = jnl->active_tr; jnl->active_tr = NULL; @@ -2122,45 +2244,45 @@ journal_close(journal *jnl) tr = jnl->cur_tr; jnl->cur_tr = NULL; } - abort_transaction(jnl, tr); + if (jnl->active_tr || jnl->cur_tr) { - panic("jnl: %s: close: jnl @ %p had both an active and cur tr\n", jnl->jdev_name, jnl); + panic("jnl: %s: close: jnl @ %p had both an active and cur tr\n", jnl->jdev_name, jnl); } } - } + } - free_old_stuff(jnl); + free_old_stuff(jnl); - kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->header_buf_size); - jnl->jhdr = (void *)0xbeefbabe; + kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->header_buf_size); + jnl->jhdr = (void *)0xbeefbabe; - if (jnl->jdev_name) { - vfs_removename(jnl->jdev_name); - } + if (jnl->jdev_name) { + vfs_removename(jnl->jdev_name); + } - FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL); + FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL); } static void dump_journal(journal *jnl) { - transaction *ctr; - - printf("journal for dev %s:", jnl->jdev_name); - printf(" jdev_offset %.8llx\n", jnl->jdev_offset); - printf(" magic: 0x%.8x\n", jnl->jhdr->magic); - printf(" start: 0x%.8llx\n", jnl->jhdr->start); - printf(" end: 0x%.8llx\n", jnl->jhdr->end); - printf(" size: 0x%.8llx\n", jnl->jhdr->size); - printf(" blhdr size: %d\n", jnl->jhdr->blhdr_size); - printf(" jhdr size: %d\n", jnl->jhdr->jhdr_size); - printf(" chksum: 0x%.8x\n", jnl->jhdr->checksum); + transaction *ctr; + + printf("journal for dev %s:", jnl->jdev_name); + printf(" jdev_offset %.8llx\n", jnl->jdev_offset); + printf(" magic: 0x%.8x\n", jnl->jhdr->magic); + printf(" start: 0x%.8llx\n", jnl->jhdr->start); + printf(" end: 0x%.8llx\n", jnl->jhdr->end); + printf(" size: 0x%.8llx\n", jnl->jhdr->size); + printf(" blhdr size: %d\n", jnl->jhdr->blhdr_size); + printf(" jhdr size: %d\n", jnl->jhdr->jhdr_size); + printf(" chksum: 
0x%.8x\n", jnl->jhdr->checksum); - printf(" completed transactions:\n"); - for(ctr=jnl->completed_trs; ctr; ctr=ctr->next) { + printf(" completed transactions:\n"); + for (ctr = jnl->completed_trs; ctr; ctr = ctr->next) { printf(" 0x%.8llx - 0x%.8llx\n", ctr->journal_start, ctr->journal_end); - } + } } @@ -2168,18 +2290,18 @@ dump_journal(journal *jnl) static off_t free_space(journal *jnl) { - off_t free_space_offset; + off_t free_space_offset; - if (jnl->jhdr->start < jnl->jhdr->end) { + if (jnl->jhdr->start < jnl->jhdr->end) { free_space_offset = jnl->jhdr->size - (jnl->jhdr->end - jnl->jhdr->start) - jnl->jhdr->jhdr_size; - } else if (jnl->jhdr->start > jnl->jhdr->end) { + } else if (jnl->jhdr->start > jnl->jhdr->end) { free_space_offset = jnl->jhdr->start - jnl->jhdr->end; - } else { + } else { // journal is completely empty free_space_offset = jnl->jhdr->size - jnl->jhdr->jhdr_size; - } + } - return free_space_offset; + return free_space_offset; } @@ -2188,46 +2310,50 @@ free_space(journal *jnl) // The "desired_size" is in bytes. 
// static int -check_free_space(journal *jnl, int desired_size) +check_free_space(journal *jnl, int desired_size, boolean_t *delayed_header_write, uint32_t sequence_num) { - size_t i; - int counter=0; + size_t i; + int counter=0; + + //printf("jnl: check free space (desired 0x%x, avail 0x%Lx)\n", + // desired_size, free_space(jnl)); - //printf("jnl: check free space (desired 0x%x, avail 0x%Lx)\n", -// desired_size, free_space(jnl)); + if (delayed_header_write) + *delayed_header_write = FALSE; - while (1) { + while (1) { int old_start_empty; + // make sure there's space in the journal to hold this transaction + if (free_space(jnl) > desired_size && jnl->old_start[0] == 0) { + break; + } if (counter++ == 5000) { dump_journal(jnl); panic("jnl: check_free_space: buffer flushing isn't working " - "(jnl @ %p s %lld e %lld f %lld [active start %lld]).\n", jnl, - jnl->jhdr->start, jnl->jhdr->end, free_space(jnl), jnl->active_start); + "(jnl @ %p s %lld e %lld f %lld [active start %lld]).\n", jnl, + jnl->jhdr->start, jnl->jhdr->end, free_space(jnl), jnl->active_start); } if (counter > 7500) { - printf("jnl: %s: check_free_space: giving up waiting for free space.\n", jnl->jdev_name); - return ENOSPC; + printf("jnl: %s: check_free_space: giving up waiting for free space.\n", jnl->jdev_name); + return ENOSPC; } - // make sure there's space in the journal to hold this transaction - if (free_space(jnl) > desired_size && jnl->old_start[0] == 0) { - break; - } // // here's where we lazily bump up jnl->jhdr->start. we'll consume // entries until there is enough space for the next transaction. 
// old_start_empty = 1; lock_oldstart(jnl); - for(i=0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) { + + for (i = 0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) { int lcl_counter; lcl_counter = 0; while (jnl->old_start[i] & 0x8000000000000000LL) { if (lcl_counter++ > 1000) { panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl %p).\n", - jnl->old_start[i], jnl); + jnl->old_start[i], jnl); } unlock_oldstart(jnl); @@ -2245,10 +2371,16 @@ check_free_space(journal *jnl, int desired_size) old_start_empty = 0; jnl->jhdr->start = jnl->old_start[i]; jnl->old_start[i] = 0; + if (free_space(jnl) > desired_size) { - unlock_oldstart(jnl); - write_journal_header(jnl, 1); - lock_oldstart(jnl); + + if (delayed_header_write) + *delayed_header_write = TRUE; + else { + unlock_oldstart(jnl); + write_journal_header(jnl, 1, sequence_num); + lock_oldstart(jnl); + } break; } } @@ -2268,7 +2400,11 @@ check_free_space(journal *jnl, int desired_size) // start of the loop. 
// jnl->jhdr->start = jnl->active_start; - write_journal_header(jnl, 1); + + if (delayed_header_write) + *delayed_header_write = TRUE; + else + write_journal_header(jnl, 1, sequence_num); continue; } @@ -2283,9 +2419,9 @@ check_free_space(journal *jnl, int desired_size) // wait for a while to avoid being cpu-bound (this will // put us to sleep for 10 milliseconds) tsleep((caddr_t)jnl, PRIBIO, "check_free_space2", 1); - } + } - return 0; + return 0; } /* @@ -2297,31 +2433,31 @@ journal_allocate_transaction(journal *jnl) transaction *tr; MALLOC_ZONE(tr, transaction *, sizeof(transaction), M_JNL_TR, M_WAITOK); - memset(tr, 0, sizeof(transaction)); + memset(tr, 0, sizeof(transaction)); - tr->tbuffer_size = jnl->tbuffer_size; + tr->tbuffer_size = jnl->tbuffer_size; - if (kmem_alloc(kernel_map, (vm_offset_t *)&tr->tbuffer, tr->tbuffer_size)) { + if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&tr->tbuffer, tr->tbuffer_size)) { FREE_ZONE(tr, sizeof(transaction), M_JNL_TR); jnl->active_tr = NULL; return ENOMEM; - } + } - // journal replay code checksum check depends on this. - memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE); - // Fill up the rest of the block with unimportant bytes (0x5a 'Z' chosen for visibility) - memset(tr->tbuffer + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE); + // journal replay code checksum check depends on this. 
+ memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE); + // Fill up the rest of the block with unimportant bytes (0x5a 'Z' chosen for visibility) + memset(tr->tbuffer + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE); - tr->blhdr = (block_list_header *)tr->tbuffer; - tr->blhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1; - tr->blhdr->num_blocks = 1; // accounts for this header block - tr->blhdr->bytes_used = jnl->jhdr->blhdr_size; - tr->blhdr->flags = BLHDR_CHECK_CHECKSUMS | BLHDR_FIRST_HEADER; + tr->blhdr = (block_list_header *)tr->tbuffer; + tr->blhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1; + tr->blhdr->num_blocks = 1; // accounts for this header block + tr->blhdr->bytes_used = jnl->jhdr->blhdr_size; + tr->blhdr->flags = BLHDR_CHECK_CHECKSUMS | BLHDR_FIRST_HEADER; - tr->sequence_num = ++jnl->jhdr->sequence_num; - tr->num_blhdrs = 1; - tr->total_bytes = jnl->jhdr->blhdr_size; - tr->jnl = jnl; + tr->sequence_num = ++jnl->sequence_num; + tr->num_blhdrs = 1; + tr->total_bytes = jnl->jhdr->blhdr_size; + tr->jnl = jnl; jnl->active_tr = tr; @@ -2331,67 +2467,72 @@ journal_allocate_transaction(journal *jnl) int journal_start_transaction(journal *jnl) { - int ret; + int ret; - CHECK_JOURNAL(jnl); + CHECK_JOURNAL(jnl); - if (jnl->flags & JOURNAL_INVALID) { - return EINVAL; - } + free_old_stuff(jnl); - if (jnl->owner == current_thread()) { + if (jnl->flags & JOURNAL_INVALID) { + return EINVAL; + } + if (jnl->owner == current_thread()) { if (jnl->active_tr == NULL) { panic("jnl: start_tr: active_tr is NULL (jnl @ %p, owner %p, current_thread %p\n", - jnl, jnl->owner, current_thread()); + jnl, jnl->owner, current_thread()); } jnl->nested_count++; return 0; - } - - lock_journal(jnl); + } + lock_journal(jnl); - if (jnl->owner != NULL || jnl->nested_count != 0 || jnl->active_tr != NULL) { + if (jnl->owner != NULL || jnl->nested_count != 0 || jnl->active_tr != NULL) { panic("jnl: start_tr: owner %p, nested count %d, 
active_tr %p jnl @ %p\n", - jnl->owner, jnl->nested_count, jnl->active_tr, jnl); - } + jnl->owner, jnl->nested_count, jnl->active_tr, jnl); + } - jnl->owner = current_thread(); - jnl->nested_count = 1; + jnl->owner = current_thread(); + jnl->nested_count = 1; - free_old_stuff(jnl); +#if JOE + // make sure there's room in the journal + if (free_space(jnl) < jnl->tbuffer_size) { - // make sure there's room in the journal - if (free_space(jnl) < jnl->tbuffer_size) { - // this is the call that really waits for space to free up - // as well as updating jnl->jhdr->start - if (check_free_space(jnl, jnl->tbuffer_size) != 0) { - printf("jnl: %s: start transaction failed: no space\n", jnl->jdev_name); - ret = ENOSPC; - goto bad_start; + KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_START, jnl, 0, 0, 0, 0); + + // this is the call that really waits for space to free up + // as well as updating jnl->jhdr->start + if (check_free_space(jnl, jnl->tbuffer_size, NULL, jnl->sequence_num) != 0) { + printf("jnl: %s: start transaction failed: no space\n", jnl->jdev_name); + ret = ENOSPC; + goto bad_start; + } + KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_END, jnl, 0, 0, 0, 0); } - } +#endif - // if there's a buffered transaction, use it. - if (jnl->cur_tr) { + // if there's a buffered transaction, use it. 
+ if (jnl->cur_tr) { jnl->active_tr = jnl->cur_tr; jnl->cur_tr = NULL; return 0; - } + } ret = journal_allocate_transaction(jnl); if (ret) { goto bad_start; } - // printf("jnl: start_tr: owner 0x%x new tr @ 0x%x\n", jnl->owner, jnl->active_tr); + // printf("jnl: start_tr: owner 0x%x new tr @ 0x%x\n", jnl->owner, jnl->active_tr); - return 0; + return 0; - bad_start: +bad_start: jnl->owner = NULL; jnl->nested_count = 0; unlock_journal(jnl); + return ret; } @@ -2399,118 +2540,123 @@ journal_start_transaction(journal *jnl) int journal_modify_block_start(journal *jnl, struct buf *bp) { - transaction *tr; + transaction *tr; - CHECK_JOURNAL(jnl); + CHECK_JOURNAL(jnl); + - if (jnl->flags & JOURNAL_INVALID) { + free_old_stuff(jnl); + + if (jnl->flags & JOURNAL_INVALID) { return EINVAL; - } + } - // XXXdbg - for debugging I want this to be true. later it may - // not be necessary. - if ((buf_flags(bp) & B_META) == 0) { + // XXXdbg - for debugging I want this to be true. later it may + // not be necessary. + if ((buf_flags(bp) & B_META) == 0) { panic("jnl: modify_block_start: bp @ %p is not a meta-data block! (jnl %p)\n", bp, jnl); - } + } - tr = jnl->active_tr; - CHECK_TRANSACTION(tr); + tr = jnl->active_tr; + CHECK_TRANSACTION(tr); - if (jnl->owner != current_thread()) { + if (jnl->owner != current_thread()) { panic("jnl: modify_block_start: called w/out a transaction! jnl %p, owner %p, curact %p\n", - jnl, jnl->owner, current_thread()); - } - - free_old_stuff(jnl); + jnl, jnl->owner, current_thread()); + } - //printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d; total bytes %d)\n", - // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes); + //printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d; total bytes %d)\n", + // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes); - // can't allow blocks that aren't an even multiple of the - // underlying block size. 
- if ((buf_size(bp) % jnl->jhdr->jhdr_size) != 0) { - uint32_t phys_blksz, bad=0; + // can't allow blocks that aren't an even multiple of the + // underlying block size. + if ((buf_size(bp) % jnl->jhdr->jhdr_size) != 0) { + uint32_t phys_blksz, bad=0; - if (VNOP_IOCTL(jnl->jdev, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, vfs_context_kernel())) { - bad = 1; - } else if (phys_blksz != (uint32_t)jnl->jhdr->jhdr_size) { - if (phys_blksz < 512) { - panic("jnl: mod block start: phys blksz %d is too small (%d, %d)\n", - phys_blksz, buf_size(bp), jnl->jhdr->jhdr_size); - } - - if ((buf_size(bp) % phys_blksz) != 0) { - bad = 1; - } else if (phys_blksz < (uint32_t)jnl->jhdr->jhdr_size) { - jnl->jhdr->jhdr_size = phys_blksz; - } else { - // the phys_blksz is now larger... need to realloc the jhdr - char *new_header_buf; - - printf("jnl: %s: phys blksz got bigger (was: %d/%d now %d)\n", - jnl->jdev_name, jnl->header_buf_size, jnl->jhdr->jhdr_size, phys_blksz); - if (kmem_alloc(kernel_map, (vm_offset_t *)&new_header_buf, phys_blksz)) { - printf("jnl: modify_block_start: %s: create: phys blksz change (was %d, now %d) but could not allocate space for new header\n", - jnl->jdev_name, jnl->jhdr->jhdr_size, phys_blksz); - bad = 1; - } else { - memcpy(new_header_buf, jnl->header_buf, jnl->header_buf_size); - memset(&new_header_buf[jnl->header_buf_size], 0x18, (phys_blksz - jnl->header_buf_size)); - kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->header_buf_size); - jnl->header_buf = new_header_buf; - jnl->header_buf_size = phys_blksz; - - jnl->jhdr = (journal_header *)jnl->header_buf; - jnl->jhdr->jhdr_size = phys_blksz; - } - } - } else { - bad = 1; - } + if (VNOP_IOCTL(jnl->jdev, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, vfs_context_kernel())) { + bad = 1; + } else if (phys_blksz != (uint32_t)jnl->jhdr->jhdr_size) { + if (phys_blksz < 512) { + panic("jnl: mod block start: phys blksz %d is too small (%d, %d)\n", + phys_blksz, buf_size(bp), jnl->jhdr->jhdr_size); + } + + 
if ((buf_size(bp) % phys_blksz) != 0) { + bad = 1; + } else if (phys_blksz < (uint32_t)jnl->jhdr->jhdr_size) { + jnl->jhdr->jhdr_size = phys_blksz; + } else { + // the phys_blksz is now larger... need to realloc the jhdr + char *new_header_buf; + + printf("jnl: %s: phys blksz got bigger (was: %d/%d now %d)\n", + jnl->jdev_name, jnl->header_buf_size, jnl->jhdr->jhdr_size, phys_blksz); + if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&new_header_buf, phys_blksz)) { + printf("jnl: modify_block_start: %s: create: phys blksz change (was %d, now %d) but could not allocate space for new header\n", + jnl->jdev_name, jnl->jhdr->jhdr_size, phys_blksz); + bad = 1; + } else { + memcpy(new_header_buf, jnl->header_buf, jnl->header_buf_size); + memset(&new_header_buf[jnl->header_buf_size], 0x18, (phys_blksz - jnl->header_buf_size)); + kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->header_buf_size); + jnl->header_buf = new_header_buf; + jnl->header_buf_size = phys_blksz; + + jnl->jhdr = (journal_header *)jnl->header_buf; + jnl->jhdr->jhdr_size = phys_blksz; + } + } + } else { + bad = 1; + } - if (bad) { - panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n", - buf_size(bp), jnl->jhdr->jhdr_size); - return -1; - } - } + if (bad) { + panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n", + buf_size(bp), jnl->jhdr->jhdr_size); + return -1; + } + } - // make sure that this transaction isn't bigger than the whole journal - if (tr->total_bytes+buf_size(bp) >= (jnl->jhdr->size - jnl->jhdr->jhdr_size)) { + // make sure that this transaction isn't bigger than the whole journal + if (tr->total_bytes+buf_size(bp) >= (jnl->jhdr->size - jnl->jhdr->jhdr_size)) { panic("jnl: transaction too big (%d >= %lld bytes, bufsize %d, tr %p bp %p)\n", - tr->total_bytes, (tr->jnl->jhdr->size - jnl->jhdr->jhdr_size), buf_size(bp), tr, bp); + tr->total_bytes, (tr->jnl->jhdr->size - jnl->jhdr->jhdr_size), buf_size(bp), tr, bp); return -1; - } + } - // if 
the block is dirty and not already locked we have to write - // it out before we muck with it because it has data that belongs - // (presumably) to another transaction. - // - if ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI) { + // if the block is dirty and not already locked we have to write + // it out before we muck with it because it has data that belongs + // (presumably) to another transaction. + // + if ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI) { if (buf_flags(bp) & B_ASYNC) { panic("modify_block_start: bp @ %p has async flag set!\n", bp); } + if (bp->b_shadow_ref) + panic("modify_block_start: dirty bp @ %p has shadows!\n", bp); // this will cause it to not be buf_brelse()'d buf_setflags(bp, B_NORELSE); VNOP_BWRITE(bp); - } - buf_setflags(bp, B_LOCKED); - - return 0; + } + buf_setflags(bp, B_LOCKED); + + return 0; } int journal_modify_block_abort(journal *jnl, struct buf *bp) { - transaction *tr; + transaction *tr; block_list_header *blhdr; - int i; + int i; - CHECK_JOURNAL(jnl); + CHECK_JOURNAL(jnl); - tr = jnl->active_tr; + free_old_stuff(jnl); + + tr = jnl->active_tr; // // if there's no active transaction then we just want to @@ -2522,26 +2668,24 @@ journal_modify_block_abort(journal *jnl, struct buf *bp) return 0; } - if (jnl->flags & JOURNAL_INVALID) { + if (jnl->flags & JOURNAL_INVALID) { /* Still need to buf_brelse(). Callers assume we consume the bp. */ buf_brelse(bp); return EINVAL; - } + } - CHECK_TRANSACTION(tr); + CHECK_TRANSACTION(tr); - if (jnl->owner != current_thread()) { + if (jnl->owner != current_thread()) { panic("jnl: modify_block_abort: called w/out a transaction! 
jnl %p, owner %p, curact %p\n", - jnl, jnl->owner, current_thread()); - } - - free_old_stuff(jnl); + jnl, jnl->owner, current_thread()); + } - // printf("jnl: modify_block_abort: tr 0x%x bp 0x%x\n", jnl->active_tr, bp); + // printf("jnl: modify_block_abort: tr 0x%x bp 0x%x\n", jnl->active_tr, bp); - // first check if it's already part of this transaction - for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) { - for(i=1; i < blhdr->num_blocks; i++) { + // first check if it's already part of this transaction + for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) { + for (i = 1; i < blhdr->num_blocks; i++) { if (bp == blhdr->binfo[i].u.bp) { break; } @@ -2550,7 +2694,7 @@ journal_modify_block_abort(journal *jnl, struct buf *bp) if (i < blhdr->num_blocks) { break; } - } + } // // if blhdr is null, then this block has only had modify_block_start @@ -2560,76 +2704,75 @@ journal_modify_block_abort(journal *jnl, struct buf *bp) // on it and so we need to keep it locked in memory. // if (blhdr == NULL) { - buf_clearflags(bp, B_LOCKED); + buf_clearflags(bp, B_LOCKED); } - buf_brelse(bp); - return 0; + buf_brelse(bp); + return 0; } int -journal_modify_block_end(journal *jnl, struct buf *bp, void (*func)(struct buf *bp, void *arg), void *arg) +journal_modify_block_end(journal *jnl, struct buf *bp, void (*func)(buf_t bp, void *arg), void *arg) { - int i = 1; - int tbuffer_offset=0; - char *blkptr; - block_list_header *blhdr, *prev=NULL; - transaction *tr; + int i = 1; + int tbuffer_offset=0; + block_list_header *blhdr, *prev=NULL; + transaction *tr; + + CHECK_JOURNAL(jnl); - CHECK_JOURNAL(jnl); + free_old_stuff(jnl); - if (jnl->flags & JOURNAL_INVALID) { + if (jnl->flags & JOURNAL_INVALID) { /* Still need to buf_brelse(). Callers assume we consume the bp. 
*/ buf_brelse(bp); return EINVAL; - } + } - tr = jnl->active_tr; - CHECK_TRANSACTION(tr); + tr = jnl->active_tr; + CHECK_TRANSACTION(tr); - if (jnl->owner != current_thread()) { + if (jnl->owner != current_thread()) { panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n", - jnl, jnl->owner, current_thread()); - } - - free_old_stuff(jnl); + jnl, jnl->owner, current_thread()); + } - //printf("jnl: mod block end: (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d, total bytes %d)\n", - // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes); + //printf("jnl: mod block end: (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d, total bytes %d)\n", + // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes); - if ((buf_flags(bp) & B_LOCKED) == 0) { + if ((buf_flags(bp) & B_LOCKED) == 0) { panic("jnl: modify_block_end: bp %p not locked! jnl @ %p\n", bp, jnl); - } + } - // first check if it's already part of this transaction - for(blhdr=tr->blhdr; blhdr; prev=blhdr,blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) { + // first check if it's already part of this transaction + for (blhdr = tr->blhdr; blhdr; prev = blhdr, blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) { tbuffer_offset = jnl->jhdr->blhdr_size; - for(i=1; i < blhdr->num_blocks; i++) { + for (i = 1; i < blhdr->num_blocks; i++) { if (bp == blhdr->binfo[i].u.bp) { break; } if (blhdr->binfo[i].bnum != (off_t)-1) { - tbuffer_offset += buf_size(blhdr->binfo[i].u.bp); + tbuffer_offset += buf_size(blhdr->binfo[i].u.bp); } else { - tbuffer_offset += blhdr->binfo[i].u.bi.bsize; + tbuffer_offset += blhdr->binfo[i].u.bi.bsize; } } if (i < blhdr->num_blocks) { break; } - } + } - if (blhdr == NULL - && prev - && (prev->num_blocks+1) <= prev->max_blocks - && (prev->bytes_used+buf_size(bp)) <= (uint32_t)tr->tbuffer_size) { + if (blhdr == NULL + && prev + && (prev->num_blocks+1) <= prev->max_blocks + && (prev->bytes_used+buf_size(bp)) <= 
(uint32_t)tr->tbuffer_size) { blhdr = prev; - } else if (blhdr == NULL) { - block_list_header *nblhdr; + } else if (blhdr == NULL) { + block_list_header *nblhdr; if (prev == NULL) { panic("jnl: modify block end: no way man, prev == NULL?!?, jnl %p, bp %p\n", jnl, bp); } @@ -2641,9 +2784,9 @@ journal_modify_block_end(journal *jnl, struct buf *bp, void (*func)(struct buf * // through prev->binfo[0].bnum. that's a skanky way to do things but // avoids having yet another linked list of small data structures to manage. - if (kmem_alloc(kernel_map, (vm_offset_t *)&nblhdr, tr->tbuffer_size)) { + if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&nblhdr, tr->tbuffer_size)) { panic("jnl: end_tr: no space for new block tr @ %p (total bytes: %d)!\n", - tr, tr->total_bytes); + tr, tr->total_bytes); } // journal replay code checksum check depends on this. @@ -2667,25 +2810,15 @@ journal_modify_block_end(journal *jnl, struct buf *bp, void (*func)(struct buf * blhdr = nblhdr; tbuffer_offset = jnl->jhdr->blhdr_size; i = 1; - } + } - if ((i+1) > blhdr->max_blocks) { + if ((i+1) > blhdr->max_blocks) { panic("jnl: modify_block_end: i = %d, max_blocks %d\n", i, blhdr->max_blocks); - } - - // if the function pointer is not set then copy the - // block of data now. if the function pointer is set - // the copy will happen after calling the callback in - // end_transaction() just before it goes to disk. 
- // - if (func == NULL) { - blkptr = (char *)&((char *)blhdr)[tbuffer_offset]; - memcpy(blkptr, (char *)0 + buf_dataptr(bp), buf_size(bp)); } - // if this is true then this is a new block we haven't seen - if (i >= blhdr->num_blocks) { + // if this is true then this is a new block we haven't seen + if (i >= blhdr->num_blocks) { int bsize; vnode_t vp; @@ -2695,8 +2828,9 @@ journal_modify_block_end(journal *jnl, struct buf *bp, void (*func)(struct buf * blhdr->binfo[i].bnum = (off_t)(buf_blkno(bp)); blhdr->binfo[i].u.bp = bp; + if (func) { - void *old_func=NULL, *old_arg=NULL; + void (*old_func)(buf_t, void *)=NULL, *old_arg=NULL; buf_setfilter(bp, func, arg, &old_func, &old_arg); if (old_func != NULL && old_func != func) { @@ -2708,48 +2842,48 @@ journal_modify_block_end(journal *jnl, struct buf *bp, void (*func)(struct buf * tr->total_bytes += bsize; blhdr->num_blocks++; - } - buf_bdwrite(bp); + } + buf_bdwrite(bp); - return 0; + return 0; } int journal_kill_block(journal *jnl, struct buf *bp) { - int i; - int bflags; - block_list_header *blhdr; - transaction *tr; + int i; + int bflags; + block_list_header *blhdr; + transaction *tr; - CHECK_JOURNAL(jnl); + CHECK_JOURNAL(jnl); - if (jnl->flags & JOURNAL_INVALID) { + free_old_stuff(jnl); + + if (jnl->flags & JOURNAL_INVALID) { return EINVAL; - } + } - tr = jnl->active_tr; - CHECK_TRANSACTION(tr); + tr = jnl->active_tr; + CHECK_TRANSACTION(tr); - if (jnl->owner != current_thread()) { + if (jnl->owner != current_thread()) { panic("jnl: modify_block_end: called w/out a transaction! 
jnl %p, owner %p, curact %p\n", - jnl, jnl->owner, current_thread()); - } - - free_old_stuff(jnl); + jnl, jnl->owner, current_thread()); + } - bflags = buf_flags(bp); + bflags = buf_flags(bp); - if ( !(bflags & B_LOCKED)) - panic("jnl: modify_block_end: called with bp not B_LOCKED"); + if ( !(bflags & B_LOCKED)) + panic("jnl: modify_block_end: called with bp not B_LOCKED"); - /* - * bp must be BL_BUSY and B_LOCKED - */ - // first check if it's already part of this transaction - for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) { + /* + * bp must be BL_BUSY and B_LOCKED + * first check if it's already part of this transaction + */ + for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) { - for(i=1; i < blhdr->num_blocks; i++) { + for (i = 1; i < blhdr->num_blocks; i++) { if (bp == blhdr->binfo[i].u.bp) { vnode_t vp; @@ -2783,9 +2917,38 @@ journal_kill_block(journal *jnl, struct buf *bp) if (i < blhdr->num_blocks) { break; } - } + } - return 0; + return 0; +} + + +/* +;________________________________________________________________________________ +; +; Routine: journal_trim_set_callback +; +; Function: Provide the journal with a routine to be called back when a +; TRIM has (or would have) been issued to the device. That +; is, the transaction has been flushed to the device, and the +; blocks freed by the transaction are now safe for reuse. +; +; CAUTION: If the journal becomes invalid (eg., due to an I/O +; error when trying to write to the journal), this callback +; will stop getting called, even if extents got freed before +; the journal became invalid! +; +; Input Arguments: +; jnl - The journal structure for the filesystem. +; callback - The function to call when the TRIM is complete. +; arg - An argument to be passed to callback. 
+;________________________________________________________________________________ +*/ +__private_extern__ void +journal_trim_set_callback(journal *jnl, jnl_trim_callback_t callback, void *arg) +{ + jnl->trim_callback = callback; + jnl->trim_callback_arg = arg; } @@ -2802,7 +2965,7 @@ journal_kill_block(journal *jnl, struct buf *bp) ; grown successfully. ; ; Input Arguments: -; tr - The transaction containing the extent list. +; trim - The trim list to be resized. ; ; Output: ; (result) - ENOMEM or 0. @@ -2813,53 +2976,107 @@ journal_kill_block(journal *jnl, struct buf *bp) ;________________________________________________________________________________ */ static int -journal_trim_realloc(transaction *tr) +trim_realloc(struct jnl_trim_list *trim) { - if (CONFIG_HFS_TRIM) { - void *new_extents; - uint32_t new_allocated_count; - - new_allocated_count = tr->trim.allocated_count + JOURNAL_DEFAULT_TRIM_EXTENTS; - new_extents = kalloc(new_allocated_count * sizeof(dk_extent_t)); - if (new_extents == NULL) { - printf("journal_trim_realloc: unable to grow extent list!\n"); - /* - * Since we could be called when allocating space previously marked - * to be trimmed, we need to empty out the list to be safe. - */ - tr->trim.extent_count = 0; - return ENOMEM; - } - - /* Copy the old extent list to the newly allocated list. 
*/ - if (tr->trim.extents != NULL) { - memmove(new_extents, - tr->trim.extents, - tr->trim.allocated_count * sizeof(dk_extent_t)); - kfree(tr->trim.extents, - tr->trim.allocated_count * sizeof(dk_extent_t)); - } - - tr->trim.allocated_count = new_allocated_count; - tr->trim.extents = new_extents; + void *new_extents; + uint32_t new_allocated_count; + + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_START, trim, 0, trim->allocated_count, trim->extent_count, 0); + + new_allocated_count = trim->allocated_count + JOURNAL_DEFAULT_TRIM_EXTENTS; + new_extents = kalloc(new_allocated_count * sizeof(dk_extent_t)); + if (new_extents == NULL) { + printf("jnl: trim_realloc: unable to grow extent list!\n"); + /* + * Since we could be called when allocating space previously marked + * to be trimmed, we need to empty out the list to be safe. + */ + trim->extent_count = 0; + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_END, ENOMEM, 0, trim->allocated_count, 0, 0); + return ENOMEM; } + + /* Copy the old extent list to the newly allocated list. */ + if (trim->extents != NULL) { + memmove(new_extents, + trim->extents, + trim->allocated_count * sizeof(dk_extent_t)); + kfree(trim->extents, + trim->allocated_count * sizeof(dk_extent_t)); + } + + trim->allocated_count = new_allocated_count; + trim->extents = new_extents; + + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_END, 0, 0, new_allocated_count, trim->extent_count, 0); + return 0; } +/* +;________________________________________________________________________________ +; +; Routine: trim_search_extent +; +; Function: Search the given extent list to see if any of its extents +; overlap the given extent. +; +; Input Arguments: +; trim - The trim list to be searched. +; offset - The first byte of the range to be searched for. +; length - The number of bytes of the extent being searched for. 
+; +; Output: +; (result) - TRUE if one or more extents overlap, FALSE otherwise. +;________________________________________________________________________________ +*/ +static int +trim_search_extent(struct jnl_trim_list *trim, uint64_t offset, uint64_t length) +{ + uint64_t end = offset + length; + uint32_t lower = 0; /* Lowest index to search */ + uint32_t upper = trim->extent_count; /* Highest index to search + 1 */ + uint32_t middle; + + /* A binary search over the extent list. */ + while (lower < upper) { + middle = (lower + upper) / 2; + + if (trim->extents[middle].offset >= end) + upper = middle; + else if (trim->extents[middle].offset + trim->extents[middle].length <= offset) + lower = middle + 1; + else + return TRUE; + } + + return FALSE; +} + + /* ;________________________________________________________________________________ ; ; Routine: journal_trim_add_extent ; -; Function: Make note of a range of bytes that should be unmapped -; (trimmed). That is, the given range of bytes no longer have -; useful content, and the device can unmap the previous -; contents. For example, a solid state disk may reuse the -; underlying storage for other blocks. +; Function: Keep track of extents that have been freed as part of this +; transaction. If the underlying device supports TRIM (UNMAP), +; then those extents will be trimmed/unmapped once the +; transaction has been written to the journal. (For example, +; SSDs can support trim/unmap and avoid having to recopy those +; blocks when doing wear leveling, and may reuse the same +; phsyical blocks for different logical blocks.) ; -; The extent will be unmapped after the transaction is written -; to the journal. +; HFS also uses this, in combination with journal_trim_set_callback, +; to add recently freed extents to its free extent cache, but +; only after the transaction that freed them is committed to +; disk. 
(This reduces the chance of overwriting live data in +; a way that causes data loss if a transaction never gets +; written to the journal.) ; ; Input Arguments: ; jnl - The journal for the volume containing the byte range. @@ -2870,113 +3087,114 @@ journal_trim_realloc(transaction *tr) __private_extern__ int journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length) { - if (CONFIG_HFS_TRIM) { - uint64_t end; - transaction *tr; - dk_extent_t *extent; - uint32_t insert_index; - uint32_t replace_count; - - CHECK_JOURNAL(jnl); + uint64_t end; + transaction *tr; + dk_extent_t *extent; + uint32_t insert_index; + uint32_t replace_count; - if (jnl->flags & JOURNAL_TRIM_ERR) { - /* - * A previous trim failed, so we have disabled trim for this volume - * for as long as it remains mounted. - */ - return 0; - } - - if (jnl->flags & JOURNAL_INVALID) { - return EINVAL; - } + CHECK_JOURNAL(jnl); + + /* TODO: Is it OK to manipulate the trim list even if JOURNAL_INVALID is set? I think so... */ + if (jnl->flags & JOURNAL_INVALID) { + return EINVAL; + } + + tr = jnl->active_tr; + CHECK_TRANSACTION(tr); + + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_START, jnl, offset, length, tr->trim.extent_count, 0); + + if (jnl->owner != current_thread()) { + panic("jnl: trim_add_extent: called w/out a transaction! jnl %p, owner %p, curact %p\n", + jnl, jnl->owner, current_thread()); + } + + free_old_stuff(jnl); - tr = jnl->active_tr; - CHECK_TRANSACTION(tr); + end = offset + length; - if (jnl->owner != current_thread()) { - panic("jnl: trim_add_extent: called w/out a transaction! jnl %p, owner %p, curact %p\n", - jnl, jnl->owner, current_thread()); - } + /* + * Find the range of existing extents that can be combined with the + * input extent. We start by counting the number of extents that end + * strictly before the input extent, then count the number of extents + * that overlap or are contiguous with the input extent. 
+ */ + extent = tr->trim.extents; + insert_index = 0; + while (insert_index < tr->trim.extent_count && extent->offset + extent->length < offset) { + ++insert_index; + ++extent; + } + replace_count = 0; + while (insert_index + replace_count < tr->trim.extent_count && extent->offset <= end) { + ++replace_count; + ++extent; + } - free_old_stuff(jnl); - - end = offset + length; - - /* - * Find the range of existing extents that can be combined with the - * input extent. We start by counting the number of extents that end - * strictly before the input extent, then count the number of extents - * that overlap or are contiguous with the input extent. - */ - extent = tr->trim.extents; - insert_index = 0; - while (insert_index < tr->trim.extent_count && extent->offset + extent->length < offset) { - ++insert_index; - ++extent; - } - replace_count = 0; - while (insert_index + replace_count < tr->trim.extent_count && extent->offset <= end) { - ++replace_count; - ++extent; + /* + * If none of the existing extents can be combined with the input extent, + * then just insert it in the list (before item number insert_index). + */ + if (replace_count == 0) { + /* If the list was already full, we need to grow it. */ + if (tr->trim.extent_count == tr->trim.allocated_count) { + if (trim_realloc(&tr->trim) != 0) { + printf("jnl: trim_add_extent: out of memory!"); + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_END, ENOMEM, 0, 0, tr->trim.extent_count, 0); + return ENOMEM; + } } - /* - * If none of the existing extents can be combined with the input extent, - * then just insert it in the list (before item number insert_index). - */ - if (replace_count == 0) { - /* If the list was already full, we need to grow it. */ - if (tr->trim.extent_count == tr->trim.allocated_count) { - if (journal_trim_realloc(tr) != 0) { - printf("jnl: trim_add_extent: out of memory!"); - return ENOMEM; - } - } - - /* Shift any existing extents with larger offsets. 
*/ - if (insert_index < tr->trim.extent_count) { - memmove(&tr->trim.extents[insert_index+1], - &tr->trim.extents[insert_index], - (tr->trim.extent_count - insert_index) * sizeof(dk_extent_t)); - } - tr->trim.extent_count++; - - /* Store the new extent in the list. */ - tr->trim.extents[insert_index].offset = offset; - tr->trim.extents[insert_index].length = length; - - /* We're done. */ - return 0; + /* Shift any existing extents with larger offsets. */ + if (insert_index < tr->trim.extent_count) { + memmove(&tr->trim.extents[insert_index+1], + &tr->trim.extents[insert_index], + (tr->trim.extent_count - insert_index) * sizeof(dk_extent_t)); } + tr->trim.extent_count++; - /* - * Update extent number insert_index to be the union of the input extent - * and all of the replaced extents. - */ - if (tr->trim.extents[insert_index].offset < offset) - offset = tr->trim.extents[insert_index].offset; - extent = &tr->trim.extents[insert_index + replace_count - 1]; - if (extent->offset + extent->length > end) - end = extent->offset + extent->length; + /* Store the new extent in the list. */ tr->trim.extents[insert_index].offset = offset; - tr->trim.extents[insert_index].length = end - offset; + tr->trim.extents[insert_index].length = length; - /* - * If we were replacing more than one existing extent, then shift any - * extents with larger offsets, and update the count of extents. - * - * We're going to leave extent #insert_index alone since it was just updated, above. - * We need to move extents from index (insert_index + replace_count) through the end of - * the list by (replace_count - 1) positions so that they overwrite extent #(insert_index + 1). 
- */ - if (replace_count > 1 && (insert_index + replace_count) < tr->trim.extent_count) { - memmove(&tr->trim.extents[insert_index + 1], - &tr->trim.extents[insert_index + replace_count], - (tr->trim.extent_count - insert_index - replace_count) * sizeof(dk_extent_t)); - } - tr->trim.extent_count -= replace_count - 1; - } + /* We're done. */ + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_END, 0, 0, 0, tr->trim.extent_count, 0); + return 0; + } + + /* + * Update extent number insert_index to be the union of the input extent + * and all of the replaced extents. + */ + if (tr->trim.extents[insert_index].offset < offset) + offset = tr->trim.extents[insert_index].offset; + extent = &tr->trim.extents[insert_index + replace_count - 1]; + if (extent->offset + extent->length > end) + end = extent->offset + extent->length; + tr->trim.extents[insert_index].offset = offset; + tr->trim.extents[insert_index].length = end - offset; + + /* + * If we were replacing more than one existing extent, then shift any + * extents with larger offsets, and update the count of extents. + * + * We're going to leave extent #insert_index alone since it was just updated, above. + * We need to move extents from index (insert_index + replace_count) through the end of + * the list by (replace_count - 1) positions so that they overwrite extent #(insert_index + 1). 
+ */ + if (replace_count > 1 && (insert_index + replace_count) < tr->trim.extent_count) { + memmove(&tr->trim.extents[insert_index + 1], + &tr->trim.extents[insert_index + replace_count], + (tr->trim.extent_count - insert_index - replace_count) * sizeof(dk_extent_t)); + } + tr->trim.extent_count -= replace_count - 1; + + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_END, 0, 0, 0, tr->trim.extent_count, 0); return 0; } @@ -2984,153 +3202,217 @@ journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length) /* ;________________________________________________________________________________ ; -; Routine: journal_trim_remove_extent +; Routine: trim_remove_extent ; -; Function: Make note of a range of bytes, some of which may have previously -; been passed to journal_trim_add_extent, is now in use on the -; volume. The given bytes will be not be trimmed as part of -; this transaction. +; Function: Indicate that a range of bytes, some of which may have previously +; been passed to journal_trim_add_extent, is now allocated. +; Any overlapping ranges currently in the journal's trim list will +; be removed. If the underlying device supports TRIM (UNMAP), then +; these extents will not be trimmed/unmapped when the transaction +; is written to the journal. +; +; HFS also uses this to prevent newly allocated space from being +; added to its free extent cache (if some portion of the newly +; allocated space was recently freed). ; ; Input Arguments: -; jnl - The journal for the volume containing the byte range. +; trim - The trim list to update. ; offset - The first byte of the range to be trimmed. ; length - The number of bytes of the extent being trimmed. 
;________________________________________________________________________________ */ -__private_extern__ int -journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length) +static int +trim_remove_extent(struct jnl_trim_list *trim, uint64_t offset, uint64_t length) { - if (CONFIG_HFS_TRIM) { - u_int64_t end; - dk_extent_t *extent; - transaction *tr; - u_int32_t keep_before; - u_int32_t keep_after; - - CHECK_JOURNAL(jnl); - - if (jnl->flags & JOURNAL_TRIM_ERR) { - /* - * A previous trim failed, so we have disabled trim for this volume - * for as long as it remains mounted. - */ - return 0; - } - - if (jnl->flags & JOURNAL_INVALID) { - return EINVAL; - } - - tr = jnl->active_tr; - CHECK_TRANSACTION(tr); + u_int64_t end; + dk_extent_t *extent; + u_int32_t keep_before; + u_int32_t keep_after; - if (jnl->owner != current_thread()) { - panic("jnl: trim_remove_extent: called w/out a transaction! jnl %p, owner %p, curact %p\n", - jnl, jnl->owner, current_thread()); - } + end = offset + length; - free_old_stuff(jnl); + /* + * Find any existing extents that start before or end after the input + * extent. These extents will be modified if they overlap the input + * extent. Other extents between them will be deleted. + */ + extent = trim->extents; + keep_before = 0; + while (keep_before < trim->extent_count && extent->offset < offset) { + ++keep_before; + ++extent; + } + keep_after = keep_before; + if (keep_after > 0) { + /* See if previous extent extends beyond both ends of input extent. */ + --keep_after; + --extent; + } + while (keep_after < trim->extent_count && (extent->offset + extent->length) <= end) { + ++keep_after; + ++extent; + } - end = offset + length; + /* + * When we get here, the first keep_before extents (0 .. keep_before-1) + * start before the input extent, and extents (keep_after .. extent_count-1) + * end after the input extent. 
We'll need to keep, all of those extents, + * but possibly modify #(keep_before-1) and #keep_after to remove the portion + * that overlaps with the input extent. + */ - /* - * Find any existing extents that start before or end after the input - * extent. These extents will be modified if they overlap the input - * extent. Other extents between them will be deleted. - */ - extent = tr->trim.extents; - keep_before = 0; - while (keep_before < tr->trim.extent_count && extent->offset < offset) { - ++keep_before; - ++extent; - } - keep_after = keep_before; - if (keep_after > 0) { - /* See if previous extent extends beyond both ends of input extent. */ - --keep_after; - --extent; - } - while (keep_after < tr->trim.extent_count && (extent->offset + extent->length) <= end) { - ++keep_after; - ++extent; + /* + * Does the input extent start after and end before the same existing + * extent? If so, we have to "punch a hole" in that extent and convert + * it to two separate extents. + */ + if (keep_before > keep_after) { + /* If the list was already full, we need to grow it. */ + if (trim->extent_count == trim->allocated_count) { + if (trim_realloc(trim) != 0) { + printf("jnl: trim_remove_extent: out of memory!"); + return ENOMEM; + } } /* - * When we get here, the first keep_before extents (0 .. keep_before-1) - * start before the input extent, and extents (keep_after .. extent_count-1) - * end after the input extent. We'll need to keep, all of those extents, - * but possibly modify #(keep_before-1) and #keep_after to remove the portion - * that overlaps with the input extent. + * Make room for a new extent by shifting extents #keep_after and later + * down by one extent. When we're done, extents #keep_before and + * #keep_after will be identical, and we can fall through to removing + * the portion that overlaps the input extent. 
*/ + memmove(&trim->extents[keep_before], + &trim->extents[keep_after], + (trim->extent_count - keep_after) * sizeof(dk_extent_t)); + ++trim->extent_count; + ++keep_after; /* - * Does the input extent start after and end before the same existing - * extent? If so, we have to "punch a hole" in that extent and convert - * it to two separate extents. + * Fall through. We now have the case where the length of extent + * #(keep_before - 1) needs to be updated, and the start of extent + * #(keep_after) needs to be updated. */ - if (keep_before > keep_after) { - /* If the list was already full, we need to grow it. */ - if (tr->trim.extent_count == tr->trim.allocated_count) { - if (journal_trim_realloc(tr) != 0) { - printf("jnl: trim_remove_extent: out of memory!"); - return ENOMEM; - } - } - - /* - * Make room for a new extent by shifting extents #keep_after and later - * down by one extent. When we're done, extents #keep_before and - * #keep_after will be identical, and we can fall through to removing - * the portion that overlaps the input extent. - */ - memmove(&tr->trim.extents[keep_before], - &tr->trim.extents[keep_after], - (tr->trim.extent_count - keep_after) * sizeof(dk_extent_t)); - ++tr->trim.extent_count; - ++keep_after; - - /* - * Fall through. We now have the case where the length of extent - * #(keep_before - 1) needs to be updated, and the start of extent - * #(keep_after) needs to be updated. - */ + } + + /* + * May need to truncate the end of extent #(keep_before - 1) if it overlaps + * the input extent. + */ + if (keep_before > 0) { + extent = &trim->extents[keep_before - 1]; + if (extent->offset + extent->length > offset) { + extent->length = offset - extent->offset; } - - /* - * May need to truncate the end of extent #(keep_before - 1) if it overlaps - * the input extent. 
- */ - if (keep_before > 0) { - extent = &tr->trim.extents[keep_before - 1]; - if (extent->offset + extent->length > offset) { - extent->length = offset - extent->offset; - } + } + + /* + * May need to update the start of extent #(keep_after) if it overlaps the + * input extent. + */ + if (keep_after < trim->extent_count) { + extent = &trim->extents[keep_after]; + if (extent->offset < end) { + extent->length = extent->offset + extent->length - end; + extent->offset = end; } + } + + /* + * If there were whole extents that overlapped the input extent, get rid + * of them by shifting any following extents, and updating the count. + */ + if (keep_after > keep_before && keep_after < trim->extent_count) { + memmove(&trim->extents[keep_before], + &trim->extents[keep_after], + (trim->extent_count - keep_after) * sizeof(dk_extent_t)); + } + trim->extent_count -= keep_after - keep_before; + + return 0; +} + + +/* +;________________________________________________________________________________ +; +; Routine: journal_trim_remove_extent +; +; Function: Make note of a range of bytes, some of which may have previously +; been passed to journal_trim_add_extent, is now in use on the +; volume. The given bytes will be not be trimmed as part of +; this transaction, or a pending trim of a transaction being +; asynchronously flushed. +; +; Input Arguments: +; jnl - The journal for the volume containing the byte range. +; offset - The first byte of the range to be trimmed. +; length - The number of bytes of the extent being trimmed. +;________________________________________________________________________________ +*/ +__private_extern__ int +journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length) +{ + int error = 0; + transaction *tr; + + CHECK_JOURNAL(jnl); + + /* TODO: Is it OK to manipulate the trim list even if JOURNAL_INVALID is set? I think so... 
*/ + if (jnl->flags & JOURNAL_INVALID) { + return EINVAL; + } + + tr = jnl->active_tr; + CHECK_TRANSACTION(tr); + + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE | DBG_FUNC_START, jnl, offset, length, tr->trim.extent_count, 0); + + if (jnl->owner != current_thread()) { + panic("jnl: trim_remove_extent: called w/out a transaction! jnl %p, owner %p, curact %p\n", + jnl, jnl->owner, current_thread()); + } + + free_old_stuff(jnl); + + error = trim_remove_extent(&tr->trim, offset, length); + if (error == 0) { + int found = FALSE; /* - * May need to update the start of extent #(keep_after) if it overlaps the - * input extent. + * See if a pending trim has any extents that overlap with the + * one we were given. */ - if (keep_after < tr->trim.extent_count) { - extent = &tr->trim.extents[keep_after]; - if (extent->offset < end) { - extent->length = extent->offset + extent->length - end; - extent->offset = end; - } - } + lck_rw_lock_shared(&jnl->trim_lock); + if (jnl->async_trim != NULL) + found = trim_search_extent(jnl->async_trim, offset, length); + lck_rw_unlock_shared(&jnl->trim_lock); - /* - * If there were whole extents that overlapped the input extent, get rid - * of them by shifting any following extents, and updating the count. - */ - if (keep_after > keep_before && keep_after < tr->trim.extent_count) { - memmove(&tr->trim.extents[keep_before], - &tr->trim.extents[keep_after], - (tr->trim.extent_count - keep_after) * sizeof(dk_extent_t)); + if (found) { + /* + * There was an overlap, so avoid trimming the extent we + * just allocated. (Otherwise, it might get trimmed after + * we've written to it, which will cause that data to be + * corrupted.) 
+ */ + uint32_t async_extent_count = 0; + + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE_PENDING | DBG_FUNC_START, jnl, offset, length, 0, 0); + lck_rw_lock_exclusive(&jnl->trim_lock); + if (jnl->async_trim != NULL) { + error = trim_remove_extent(jnl->async_trim, offset, length); + async_extent_count = jnl->async_trim->extent_count; + } + lck_rw_unlock_exclusive(&jnl->trim_lock); + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE_PENDING | DBG_FUNC_END, error, 0, 0, async_extent_count, 0); } - tr->trim.extent_count -= keep_after - keep_before; } - return 0; + + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE | DBG_FUNC_END, error, 0, 0, tr->trim.extent_count, 0); + return error; } @@ -3139,29 +3421,70 @@ journal_trim_flush(journal *jnl, transaction *tr) { int errno = 0; - if (CONFIG_HFS_TRIM) { - if ((jnl->flags & JOURNAL_TRIM_ERR) == 0 && tr->trim.extent_count > 0) { - dk_unmap_t unmap; - - bzero(&unmap, sizeof(unmap)); + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_FLUSH | DBG_FUNC_START, jnl, tr, 0, tr->trim.extent_count, 0); + + if (tr->trim.extent_count > 0) { + dk_unmap_t unmap; + + bzero(&unmap, sizeof(unmap)); + lck_rw_lock_shared(&jnl->trim_lock); + if (CONFIG_HFS_TRIM && (jnl->flags & JOURNAL_USE_UNMAP)) { unmap.extents = tr->trim.extents; unmap.extentsCount = tr->trim.extent_count; + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP | DBG_FUNC_START, jnl, tr, 0, tr->trim.extent_count, 0); errno = VNOP_IOCTL(jnl->fsdev, DKIOCUNMAP, (caddr_t)&unmap, FWRITE, vfs_context_kernel()); + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP | DBG_FUNC_END, errno, 0, 0, 0, 0); if (errno) { printf("jnl: error %d from DKIOCUNMAP (extents=%lx, count=%u); disabling trim for %s\n", - errno, (unsigned long) (tr->trim.extents), tr->trim.extent_count, + errno, (unsigned long) (unmap.extents), unmap.extentsCount, jnl->jdev_name); - jnl->flags |= JOURNAL_TRIM_ERR; + jnl->flags &= 
~JOURNAL_USE_UNMAP; } } - if (tr->trim.extents) { - kfree(tr->trim.extents, tr->trim.allocated_count * sizeof(dk_extent_t)); - tr->trim.allocated_count = 0; - tr->trim.extent_count = 0; - tr->trim.extents = NULL; - } + + /* + * Call back into the file system to tell them that we have + * trimmed some extents and that they can now be reused. + * + * CAUTION: If the journal becomes invalid (eg., due to an I/O + * error when trying to write to the journal), this callback + * will stop getting called, even if extents got freed before + * the journal became invalid! + */ + if (jnl->trim_callback) + jnl->trim_callback(jnl->trim_callback_arg, tr->trim.extent_count, tr->trim.extents); + + lck_rw_unlock_shared(&jnl->trim_lock); + } + + /* + * If the transaction we're flushing was the async transaction, then + * tell the current transaction that there is no pending trim + * any more. + * + * NOTE: Since we released the lock, another thread could have + * removed one or more extents from our list. That's not a + * problem since any writes to the re-allocated blocks + * would get sent to the device after the DKIOCUNMAP. 
+ */ + lck_rw_lock_exclusive(&jnl->trim_lock); + if (jnl->async_trim == &tr->trim) + jnl->async_trim = NULL; + lck_rw_unlock_exclusive(&jnl->trim_lock); + + if (tr->trim.extents) { + kfree(tr->trim.extents, tr->trim.allocated_count * sizeof(dk_extent_t)); + tr->trim.allocated_count = 0; + tr->trim.extent_count = 0; + tr->trim.extents = NULL; } + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_FLUSH | DBG_FUNC_END, errno, 0, 0, 0, 0); + return errno; } @@ -3169,23 +3492,23 @@ journal_trim_flush(journal *jnl, transaction *tr) static int journal_binfo_cmp(const void *a, const void *b) { - const block_info *bi_a = (const struct block_info *)a; - const block_info *bi_b = (const struct block_info *)b; - daddr64_t res; + const block_info *bi_a = (const struct block_info *)a; + const block_info *bi_b = (const struct block_info *)b; + daddr64_t res; - if (bi_a->bnum == (off_t)-1) { + if (bi_a->bnum == (off_t)-1) { return 1; - } - if (bi_b->bnum == (off_t)-1) { + } + if (bi_b->bnum == (off_t)-1) { return -1; - } + } - // don't have to worry about negative block - // numbers so this is ok to do. - // - res = (buf_blkno(bi_a->u.bp) - buf_blkno(bi_b->u.bp)); + // don't have to worry about negative block + // numbers so this is ok to do. + // + res = (buf_blkno(bi_a->u.bp) - buf_blkno(bi_b->u.bp)); - return (int)res; + return (int)res; } @@ -3220,27 +3543,27 @@ journal_binfo_cmp(const void *a, const void *b) * -1 An error occurred. The journal is marked invalid. 
*/ static int -end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg) +end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg, boolean_t drop_lock, boolean_t must_wait) { - int i, ret, amt; - errno_t errno; - off_t end; - journal *jnl = tr->jnl; - struct buf *bp, **bparray; - block_list_header *blhdr=NULL, *next=NULL; - size_t tbuffer_offset; + block_list_header *blhdr=NULL, *next=NULL; + int i, ret_val = 0; + errno_t errno; + journal *jnl = tr->jnl; + struct buf *bp; + size_t tbuffer_offset; + boolean_t drop_lock_early; if (jnl->cur_tr) { panic("jnl: jnl @ %p already has cur_tr %p, new tr: %p\n", jnl, jnl->cur_tr, tr); } - // if there weren't any modified blocks in the transaction - // just save off the transaction pointer and return. - if (tr->total_bytes == jnl->jhdr->blhdr_size) { + // if there weren't any modified blocks in the transaction + // just save off the transaction pointer and return. + if (tr->total_bytes == jnl->jhdr->blhdr_size) { jnl->cur_tr = tr; - return 0; - } + goto done; + } // if our transaction buffer isn't very full, just hang // on to it and don't actually flush anything. this is @@ -3248,174 +3571,314 @@ end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void // transaction buffer if it's full or if we have more than // one of them so we don't start hogging too much memory. // - // We also check the number of extents waiting to be trimmed. - // If it is small enough, then keep accumulating more (so we - // can reduce the overhead of trimming). If there was a - // prior trim error, then we stop issuing trims for this + // We also check the device supports UNMAP/TRIM, and if so, + // the number of extents waiting to be trimmed. If it is + // small enough, then keep accumulating more (so we can + // reduce the overhead of trimming). 
If there was a prior + // trim error, then we stop issuing trims for this // volume, so we can also coalesce transactions. - // + // if ( force_it == 0 && (jnl->flags & JOURNAL_NO_GROUP_COMMIT) == 0 && tr->num_blhdrs < 3 && (tr->total_bytes <= ((tr->tbuffer_size*tr->num_blhdrs) - tr->tbuffer_size/8)) - && ((jnl->flags & JOURNAL_TRIM_ERR) || (tr->trim.extent_count < jnl_trim_flush_limit))) { + && (!(jnl->flags & JOURNAL_USE_UNMAP) || (tr->trim.extent_count < jnl_trim_flush_limit))) { jnl->cur_tr = tr; - return 0; - } + goto done; + } + KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_START, jnl, tr, drop_lock, must_wait, 0); - // if we're here we're going to flush the transaction buffer to disk. - // make sure there is room in the journal first. - check_free_space(jnl, tr->total_bytes); + lock_condition(jnl, &jnl->flushing, "end_transaction"); - // range check the end index - if (jnl->jhdr->end <= 0 || jnl->jhdr->end > jnl->jhdr->size) { - panic("jnl: end_transaction: end is bogus 0x%llx (sz 0x%llx)\n", - jnl->jhdr->end, jnl->jhdr->size); - } + /* + * if the previous 'finish_end_transaction' was being run + * asynchronously, it could have encountered a condition + * that caused it to mark the journal invalid... if that + * occurred while we were waiting for it to finish, we + * need to notice and abort the current transaction + */ + if ((jnl->flags & JOURNAL_INVALID) || jnl->flush_aborted == TRUE) { + unlock_condition(jnl, &jnl->flushing); - // this transaction starts where the current journal ends - tr->journal_start = jnl->jhdr->end; - end = jnl->jhdr->end; + abort_transaction(jnl, tr); + ret_val = -1; + KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_END, jnl, tr, ret_val, 0, 0); + goto done; + } - // - // if the first entry in old_start[] isn't free yet, loop calling the - // file system flush routine until it is (or we panic). 
- // - i = 0; - lock_oldstart(jnl); - while ((jnl->old_start[0] & 0x8000000000000000LL) != 0) { - if (jnl->flush) { - unlock_oldstart(jnl); + /* + * Store a pointer to this transaction's trim list so that + * future transactions can find it. + * + * Note: if there are no extents in the trim list, then don't + * bother saving the pointer since nothing can add new extents + * to the list (and other threads/transactions only care if + * there is a trim pending). + */ + lck_rw_lock_exclusive(&jnl->trim_lock); + if (jnl->async_trim != NULL) + panic("jnl: end_transaction: async_trim already non-NULL!"); + if (tr->trim.extent_count > 0) + jnl->async_trim = &tr->trim; + lck_rw_unlock_exclusive(&jnl->trim_lock); - if (jnl->flush) { - jnl->flush(jnl->flush_arg); - } + /* + * snapshot the transaction sequence number while we are still behind + * the journal lock since it will be bumped upon the start of the + * next transaction group which may overlap the current journal flush... + * we pass the snapshot into write_journal_header during the journal + * flush so that it can write the correct version in the header... + * because we hold the 'flushing' condition variable for the duration + * of the journal flush, 'saved_sequence_num' remains stable + */ + jnl->saved_sequence_num = jnl->sequence_num; - // yield the cpu so others can get in to clear the lock bit - (void)tsleep((void *)jnl, PRIBIO, "jnl-old-start-sleep", 1); + /* + * if we're here we're going to flush the transaction buffer to disk. + * 'check_free_space' will not return untl there is enough free + * space for this transaction in the journal and jnl->old_start[0] + * is avaiable for use + */ + KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_START, jnl, 0, 0, 0, 0); - lock_oldstart(jnl); - } - if (i++ >= 500) { - panic("jnl: transaction that started at 0x%llx is not completing! 
jnl %p\n", - jnl->old_start[0] & (~0x8000000000000000LL), jnl); - } + check_free_space(jnl, tr->total_bytes, &tr->delayed_header_write, jnl->saved_sequence_num); + + KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_END, jnl, tr->delayed_header_write, 0, 0, 0); + + // range check the end index + if (jnl->jhdr->end <= 0 || jnl->jhdr->end > jnl->jhdr->size) { + panic("jnl: end_transaction: end is bogus 0x%llx (sz 0x%llx)\n", + jnl->jhdr->end, jnl->jhdr->size); } + if (tr->delayed_header_write == TRUE) { + thread_t thread = THREAD_NULL; - // - // slide everyone else down and put our latest guy in the last - // entry in the old_start array - // - - /* Because old_start is locked above, we can cast away the volatile qualifier before passing it to memcpy. */ + lock_condition(jnl, &jnl->writing_header, "end_transaction"); + /* + * fire up a thread to write the journal header + * asynchronously... when it finishes, it will call + * unlock_condition... we can overlap the preparation of + * the log and buffers during this time + */ + kernel_thread_start((thread_continue_t)write_header_thread, jnl, &thread); + } else + jnl->write_header_failed = FALSE; + + + // this transaction starts where the current journal ends + tr->journal_start = jnl->jhdr->end; + + lock_oldstart(jnl); + /* + * Because old_start is locked above, we can cast away the volatile qualifier before passing it to memcpy. 
+ * slide everyone else down and put our latest guy in the last + * entry in the old_start array + */ memcpy(__CAST_AWAY_QUALIFIER(&jnl->old_start[0], volatile, void *), __CAST_AWAY_QUALIFIER(&jnl->old_start[1], volatile, void *), sizeof(jnl->old_start)-sizeof(jnl->old_start[0])); jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] = tr->journal_start | 0x8000000000000000LL; unlock_oldstart(jnl); - // for each block, make sure that the physical block # is set - for(blhdr=tr->blhdr; blhdr; blhdr=next) { - char *blkptr; - + for (blhdr = tr->blhdr; blhdr; blhdr = next) { + char *blkptr; + buf_t sbp; + int32_t bsize; + tbuffer_offset = jnl->jhdr->blhdr_size; - for(i=1; i < blhdr->num_blocks; i++) { - daddr64_t blkno; - daddr64_t lblkno; - struct vnode *vp; - bp = blhdr->binfo[i].u.bp; + for (i = 1; i < blhdr->num_blocks; i++) { - // if this block has a callback function set, call - // it now and then copy the data from the bp into - // the journal. if (blhdr->binfo[i].bnum != (off_t)-1) { - void (*func)(struct buf *, void *); + void (*func)(buf_t, void *); void *arg; + bp = blhdr->binfo[i].u.bp; + if (bp == NULL) { panic("jnl: inconsistent binfo (NULL bp w/bnum %lld; jnl @ %p, tr %p)\n", blhdr->binfo[i].bnum, jnl, tr); } - - buf_setfilter(bp, NULL, NULL, (void **)&func, &arg); - - if (func) { - // acquire the bp here so that we can safely - // mess around with its data. buf_acquire() - // will return EAGAIN if the buffer was busy, - // so loop trying again. - do { - errno = buf_acquire(bp, 0, 0, 0); - } while (errno == EAGAIN); - - if (errno == 0) { + /* + * acquire the bp here so that we can safely + * mess around with its data. buf_acquire() + * will return EAGAIN if the buffer was busy, + * so loop trying again. + */ + do { + errno = buf_acquire(bp, BAC_REMOVE, 0, 0); + } while (errno == EAGAIN); - // call the hook function and then copy the - // data into the transaction buffer... 
- func(bp, arg); + if (errno) + panic("could not acquire bp %p (err %d)\n", bp, errno); - blkptr = (char *)&((char *)blhdr)[tbuffer_offset]; - memcpy(blkptr, (char *)buf_dataptr(bp), buf_size(bp)); - - buf_drop(bp); + if ((buf_flags(bp) & (B_LOCKED|B_DELWRI)) != (B_LOCKED|B_DELWRI)) { + if (jnl->flags & JOURNAL_CLOSE_PENDING) { + buf_clearflags(bp, B_LOCKED); + buf_brelse(bp); + + /* + * this is an odd case that appears to happen occasionally + * make sure we mark this block as no longer valid + * so that we don't process it in "finish_end_transaction" since + * the bp that is recorded in our array no longer belongs + * to us (normally we substitute a shadow bp to be processed + * issuing a 'buf_bawrite' on a stale buf_t pointer leads + * to all kinds of problems. + */ + blhdr->binfo[i].bnum = (off_t)-1; + continue; } else { - panic("could not acquire bp %p (err %d)\n", bp, errno); + panic("jnl: end_tr: !!!DANGER!!! bp %p flags (0x%x) not LOCKED & DELWRI\n", bp, buf_flags(bp)); } } + bsize = buf_size(bp); - } else { // bnum == -1, only true if a block was "killed" + buf_setfilter(bp, NULL, NULL, &func, &arg); + + blkptr = (char *)&((char *)blhdr)[tbuffer_offset]; - tbuffer_offset += blhdr->binfo[i].u.bi.bsize; - continue; - } + sbp = buf_create_shadow_priv(bp, FALSE, (uintptr_t)blkptr, 0, 0); - tbuffer_offset += buf_size(bp); + if (sbp == NULL) + panic("jnl: buf_create_shadow returned NULL"); - vp = buf_vnode(bp); - blkno = buf_blkno(bp); - lblkno = buf_lblkno(bp); + /* + * copy the data into the transaction buffer... + */ + memcpy(blkptr, (char *)buf_dataptr(bp), bsize); - if (vp == NULL && lblkno == blkno) { - printf("jnl: %s: end_tr: bad news! bp @ %p w/null vp and l/blkno = %qd/%qd. 
aborting the transaction (tr %p jnl %p).\n", - jnl->jdev_name, bp, lblkno, blkno, tr, jnl); - goto bad_journal; - } - - // if the lblkno is the same as blkno and this bp isn't - // associated with the underlying file system device then - // we need to call bmap() to get the actual physical block. - // - if ((lblkno == blkno) && (vp != jnl->fsdev)) { - off_t f_offset; - size_t contig_bytes; + buf_clearflags(bp, B_LOCKED); + buf_markclean(bp); + buf_drop(bp); - if (VNOP_BLKTOOFF(vp, lblkno, &f_offset)) { - printf("jnl: %s: end_tr: vnop_blktooff failed @ %p, jnl %p\n", jnl->jdev_name, bp, jnl); - goto bad_journal; - } - if (VNOP_BLOCKMAP(vp, f_offset, buf_count(bp), &blkno, &contig_bytes, NULL, 0, NULL)) { - printf("jnl: %s: end_tr: can't blockmap the bp @ %p, jnl %p\n", jnl->jdev_name, bp, jnl); - goto bad_journal; - } - if ((uint32_t)contig_bytes < buf_count(bp)) { - printf("jnl: %s: end_tr: blk not physically contiguous on disk@ %p, jnl %p\n", jnl->jdev_name, bp, jnl); - goto bad_journal; + /* + * adopt the shadow buffer for this block + */ + if (func) { + /* + * transfer FS hook function to the + * shadow buffer... it will get called + * in finish_end_transaction + */ + buf_setfilter(sbp, func, arg, NULL, NULL); } - buf_setblkno(bp, blkno); + blhdr->binfo[i].u.bp = sbp; + + } else { + // bnum == -1, only true if a block was "killed" + bsize = blhdr->binfo[i].u.bi.bsize; } - // update this so we write out the correct physical block number! 
- blhdr->binfo[i].bnum = (off_t)(blkno); + tbuffer_offset += bsize; } - next = (block_list_header *)((long)blhdr->binfo[0].bnum); - } - + } + /* + * if callback != NULL, we don't want to drop the journal + * lock, or complete end_transaction asynchronously, since + * the caller is expecting the callback to run in the calling + * context + * + * if drop_lock == FALSE, we can't complete end_transaction + * asynchronously + */ + if (callback) + drop_lock_early = FALSE; + else + drop_lock_early = drop_lock; + + if (drop_lock_early == FALSE) + must_wait = TRUE; + + if (drop_lock_early == TRUE) { + jnl->owner = NULL; + unlock_journal(jnl); + drop_lock = FALSE; + } + if (must_wait == TRUE) + ret_val = finish_end_transaction(tr, callback, callback_arg); + else { + thread_t thread = THREAD_NULL; + + /* + * fire up a thread to complete processing this transaction + * asynchronously... when it finishes, it will call + * unlock_condition + */ + kernel_thread_start((thread_continue_t)finish_end_thread, tr, &thread); + } + KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_END, jnl, tr, ret_val, 0, 0); +done: + if (drop_lock == TRUE) { + jnl->owner = NULL; + unlock_journal(jnl); + } + return (ret_val); +} + + +static void +finish_end_thread(transaction *tr) +{ +#if !CONFIG_EMBEDDED + proc_apply_thread_selfdiskacc(IOPOL_PASSIVE); +#else /* !CONFIG_EMBEDDED */ + struct uthread *ut; + + ut = get_bsdthread_info(current_thread()); + ut->uu_iopol_disk = IOPOL_PASSIVE; +#endif /* !CONFIG_EMBEDDED */ + + finish_end_transaction(tr, NULL, NULL); + + thread_deallocate(current_thread()); + thread_terminate(current_thread()); +} +static void +write_header_thread(journal *jnl) +{ +#if !CONFIG_EMBEDDED + proc_apply_thread_selfdiskacc(IOPOL_PASSIVE); +#else /* !CONFIG_EMBEDDED */ + struct uthread *ut; + + ut = get_bsdthread_info(current_thread()); + ut->uu_iopol_disk = IOPOL_PASSIVE; +#endif /* !CONFIG_EMBEDDED */ + + if (write_journal_header(jnl, 1, jnl->saved_sequence_num)) + jnl->write_header_failed = TRUE; + 
else + jnl->write_header_failed = FALSE; + unlock_condition(jnl, &jnl->writing_header); + + thread_deallocate(current_thread()); + thread_terminate(current_thread()); +} + +static int +finish_end_transaction(transaction *tr, errno_t (*callback)(void*), void *callback_arg) +{ + int i, amt; + int ret = 0; + off_t end; + journal *jnl = tr->jnl; + buf_t bp, *bparray; + vnode_t vp; + block_list_header *blhdr=NULL, *next=NULL; + size_t tbuffer_offset; + int bufs_written = 0; + int ret_val = 0; + + KERNEL_DEBUG(0xbbbbc028|DBG_FUNC_START, jnl, tr, 0, 0, 0); + + end = jnl->jhdr->end; + + for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) { - for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) { amt = blhdr->bytes_used; blhdr->binfo[0].u.bi.b.sequence_num = tr->sequence_num; @@ -3424,64 +3887,139 @@ end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void blhdr->checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE); if (kmem_alloc(kernel_map, (vm_offset_t *)&bparray, blhdr->num_blocks * sizeof(struct buf *))) { - panic("can't allocate %zd bytes for bparray\n", blhdr->num_blocks * sizeof(struct buf *)); + panic("can't allocate %zd bytes for bparray\n", blhdr->num_blocks * sizeof(struct buf *)); } - - // calculate individual block checksums tbuffer_offset = jnl->jhdr->blhdr_size; - for(i=1; i < blhdr->num_blocks; i++) { - int32_t bsize; + + for (i = 1; i < blhdr->num_blocks; i++) { + void (*func)(buf_t, void *); + void *arg; + int32_t bsize; - if (blhdr->binfo[i].bnum != (off_t)-1) { - bparray[i] = blhdr->binfo[i].u.bp; - bsize = buf_size(bparray[i]); - blhdr->binfo[i].u.bi.bsize = bsize; - blhdr->binfo[i].u.bi.b.cksum = calc_checksum(&((char *)blhdr)[tbuffer_offset], bsize); - } else { - bparray[i] = NULL; - bsize = blhdr->binfo[i].u.bi.bsize; - blhdr->binfo[i].u.bi.b.cksum = 0; - } + /* + * finish preparing the shadow buf_t before + * calculating the individual block 
checksums + */ + if (blhdr->binfo[i].bnum != (off_t)-1) { + daddr64_t blkno; + daddr64_t lblkno; - tbuffer_offset += bsize; - } + bp = blhdr->binfo[i].u.bp; + + vp = buf_vnode(bp); + blkno = buf_blkno(bp); + lblkno = buf_lblkno(bp); - ret = write_journal_data(jnl, &end, blhdr, amt); + if (vp == NULL && lblkno == blkno) { + printf("jnl: %s: end_tr: bad news! bp @ %p w/null vp and l/blkno = %qd/%qd. aborting the transaction (tr %p jnl %p).\n", + jnl->jdev_name, bp, lblkno, blkno, tr, jnl); + ret_val = -1; + goto bad_journal; + } + + // if the lblkno is the same as blkno and this bp isn't + // associated with the underlying file system device then + // we need to call bmap() to get the actual physical block. + // + if ((lblkno == blkno) && (vp != jnl->fsdev)) { + off_t f_offset; + size_t contig_bytes; + + if (VNOP_BLKTOOFF(vp, lblkno, &f_offset)) { + printf("jnl: %s: end_tr: vnop_blktooff failed @ %p, jnl %p\n", jnl->jdev_name, bp, jnl); + ret_val = -1; + goto bad_journal; + } + if (VNOP_BLOCKMAP(vp, f_offset, buf_count(bp), &blkno, &contig_bytes, NULL, 0, NULL)) { + printf("jnl: %s: end_tr: can't blockmap the bp @ %p, jnl %p\n", jnl->jdev_name, bp, jnl); + ret_val = -1; + goto bad_journal; + } + if ((uint32_t)contig_bytes < buf_count(bp)) { + printf("jnl: %s: end_tr: blk not physically contiguous on disk@ %p, jnl %p\n", jnl->jdev_name, bp, jnl); + ret_val = -1; + goto bad_journal; + } + buf_setblkno(bp, blkno); + } + // update this so we write out the correct physical block number! + blhdr->binfo[i].bnum = (off_t)(blkno); - // always put the bp pointers back - for(i=1; i < blhdr->num_blocks; i++) { - if (blhdr->binfo[i].bnum != (off_t)-1) { - blhdr->binfo[i].u.bp = bparray[i]; - } + /* + * pick up the FS hook function (if any) and prepare + * to fire this buffer off in the next pass + */ + buf_setfilter(bp, buffer_flushed_callback, tr, &func, &arg); + + if (func) { + /* + * call the hook function supplied by the filesystem... 
+ * this needs to happen BEFORE cacl_checksum in case + * the FS morphs the data in the buffer + */ + func(bp, arg); + } + bparray[i] = bp; + bsize = buf_size(bp); + blhdr->binfo[i].u.bi.bsize = bsize; + blhdr->binfo[i].u.bi.b.cksum = calc_checksum(&((char *)blhdr)[tbuffer_offset], bsize); + } else { + bparray[i] = NULL; + bsize = blhdr->binfo[i].u.bi.bsize; + blhdr->binfo[i].u.bi.b.cksum = 0; + } + tbuffer_offset += bsize; } + /* + * if we fired off the journal_write_header asynchronously in + * 'end_transaction', we need to wait for its completion + * before writing the actual journal data + */ + wait_condition(jnl, &jnl->writing_header, "finish_end_transaction"); + + if (jnl->write_header_failed == FALSE) + ret = write_journal_data(jnl, &end, blhdr, amt); + else + ret_val = -1; + /* + * put the bp pointers back so that we can + * make the final pass on them + */ + for (i = 1; i < blhdr->num_blocks; i++) + blhdr->binfo[i].u.bp = bparray[i]; kmem_free(kernel_map, (vm_offset_t)bparray, blhdr->num_blocks * sizeof(struct buf *)); + if (ret_val == -1) + goto bad_journal; + if (ret != amt) { printf("jnl: %s: end_transaction: only wrote %d of %d bytes to the journal!\n", - jnl->jdev_name, ret, amt); + jnl->jdev_name, ret, amt); + ret_val = -1; goto bad_journal; } - } + } + jnl->jhdr->end = end; // update where the journal now ends + tr->journal_end = end; // the transaction ends here too - jnl->jhdr->end = end; // update where the journal now ends - tr->journal_end = end; // the transaction ends here too - if (tr->journal_start == 0 || tr->journal_end == 0) { + if (tr->journal_start == 0 || tr->journal_end == 0) { panic("jnl: end_transaction: bad tr journal start/end: 0x%llx 0x%llx\n", - tr->journal_start, tr->journal_end); - } + tr->journal_start, tr->journal_end); + } - if (write_journal_header(jnl, 0) != 0) { + if (write_journal_header(jnl, 0, jnl->saved_sequence_num) != 0) { + ret_val = -1; goto bad_journal; - } - + } /* * If the caller supplied a callback, call it 
now that the blocks have been * written to the journal. This is used by journal_relocate so, for example, * the file system can change its pointer to the new journal. */ if (callback != NULL && callback(callback_arg) != 0) { + ret_val = -1; goto bad_journal; } @@ -3489,284 +4027,429 @@ end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void // Send a DKIOCUNMAP for the extents trimmed by this transaction, and // free up the extent list. // - errno = journal_trim_flush(jnl, tr); + journal_trim_flush(jnl, tr); - // - // setup for looping through all the blhdr's. we null out the - // tbuffer and blhdr fields so that they're not used any more. - // - blhdr = tr->blhdr; - tr->tbuffer = NULL; - tr->blhdr = NULL; - - // the buffer_flushed_callback will only be called for the - // real blocks that get flushed so we have to account for - // the block_list_headers here. - // - tr->num_flushed = tr->num_blhdrs * jnl->jhdr->blhdr_size; - - // for each block, set the iodone callback and unlock it - for(; blhdr; blhdr=next) { - - // we can re-order the buf ptrs because everything is written out already - qsort(&blhdr->binfo[1], blhdr->num_blocks-1, sizeof(block_info), journal_binfo_cmp); - - for(i=1; i < blhdr->num_blocks; i++) { - if (blhdr->binfo[i].bnum == (off_t)-1) { - continue; - } + // the buffer_flushed_callback will only be called for the + // real blocks that get flushed so we have to account for + // the block_list_headers here. + // + tr->num_flushed = tr->num_blhdrs * jnl->jhdr->blhdr_size; - bp = blhdr->binfo[i].u.bp; + lock_condition(jnl, &jnl->asyncIO, "finish_end_transaction"); - // have to pass BAC_REMOVE here because we're going to bawrite() - // the buffer when we're done - do { - errno = buf_acquire(bp, BAC_REMOVE, 0, 0); - } while (errno == EAGAIN); - - if (errno == 0) { - struct vnode *save_vp; - void *cur_filter; + // + // setup for looping through all the blhdr's. 
+ // + for (blhdr = tr->blhdr; blhdr; blhdr = next) { + uint16_t num_blocks; - if ((buf_flags(bp) & (B_LOCKED|B_DELWRI)) != (B_LOCKED|B_DELWRI)) { - if (jnl->flags & JOURNAL_CLOSE_PENDING) { - buf_clearflags(bp, B_LOCKED); - buf_brelse(bp); - continue; - } else { - panic("jnl: end_tr: !!!DANGER!!! bp %p flags (0x%x) not LOCKED & DELWRI\n", bp, buf_flags(bp)); - } - } - save_vp = buf_vnode(bp); + /* + * grab this info ahead of issuing the buf_bawrites... + * once the last one goes out, its possible for blhdr + * to be freed (especially if we get preempted) before + * we do the last check of num_blocks or + * grab the next blhdr pointer... + */ + next = (block_list_header *)((long)blhdr->binfo[0].bnum); + num_blocks = blhdr->num_blocks; - buf_setfilter(bp, buffer_flushed_callback, tr, &cur_filter, NULL); + /* + * we can re-order the buf ptrs because everything is written out already + */ + qsort(&blhdr->binfo[1], num_blocks-1, sizeof(block_info), journal_binfo_cmp); - if (cur_filter) { - panic("jnl: bp @ %p (blkno %qd, vp %p) has non-null iodone (%p) buffflushcb %p\n", - bp, buf_blkno(bp), save_vp, cur_filter, buffer_flushed_callback); - } - buf_clearflags(bp, B_LOCKED); + /* + * need to make sure that the loop issuing the buf_bawrite's + * does not touch blhdr once the last buf_bawrite has been + * issued... 
at that point, we no longer have a legitmate + * reference on the associated storage since it will be + * released upon the completion of that last buf_bawrite + */ + for (i = num_blocks-1; i >= 1; i--) { + if (blhdr->binfo[i].bnum != (off_t)-1) + break; + num_blocks--; + } + for (i = 1; i < num_blocks; i++) { - // kicking off the write here helps performance + if ((bp = blhdr->binfo[i].u.bp)) { + vp = buf_vnode(bp); + buf_bawrite(bp); - // XXXdbg this is good for testing: buf_bdwrite(bp); - //buf_bdwrite(bp); // this undoes the vnode_ref() in journal_modify_block_end() - vnode_rele_ext(save_vp, 0, 1); - } else { - printf("jnl: %s: end_transaction: could not acquire block %p (errno %d)!\n", - jnl->jdev_name,bp, errno); + vnode_rele_ext(vp, 0, 1); + + bufs_written++; } } + } + if (bufs_written == 0) { + /* + * since we didn't issue any buf_bawrite's, there is no + * async trigger to cause the memory associated with this + * transaction to be freed... so, move it to the garbage + * list now + */ + lock_oldstart(jnl); - next = (block_list_header *)((long)blhdr->binfo[0].bnum); + tr->next = jnl->tr_freeme; + jnl->tr_freeme = tr; - // we can free blhdr here since we won't need it any more - blhdr->binfo[0].bnum = 0xdeadc0de; - kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size); - } + unlock_oldstart(jnl); - //printf("jnl: end_tr: tr @ 0x%x, jnl-blocks: 0x%llx - 0x%llx. exit!\n", - // tr, tr->journal_start, tr->journal_end); - return 0; + unlock_condition(jnl, &jnl->asyncIO); + } + //printf("jnl: end_tr: tr @ 0x%x, jnl-blocks: 0x%llx - 0x%llx. exit!\n", + // tr, tr->journal_start, tr->journal_end); - bad_journal: - jnl->flags |= JOURNAL_INVALID; - jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] &= ~0x8000000000000000LL; - abort_transaction(jnl, tr); // cleans up list of extents to be trimmed - return -1; +bad_journal: + if (ret_val == -1) { + /* + * 'flush_aborted' is protected by the flushing condition... 
we need to + * set it before dropping the condition so that it will be + * noticed in 'end_transaction'... we add this additional + * aborted condition so that we can drop the 'flushing' condition + * before grabbing the journal lock... this avoids a deadlock + * in 'end_transaction' which is holding the journal lock while + * waiting for the 'flushing' condition to clear... + * everyone else will notice the JOURNAL_INVALID flag + */ + jnl->flush_aborted = TRUE; + + unlock_condition(jnl, &jnl->flushing); + lock_journal(jnl); + + jnl->flags |= JOURNAL_INVALID; + jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] &= ~0x8000000000000000LL; + abort_transaction(jnl, tr); // cleans up list of extents to be trimmed + + unlock_journal(jnl); + } else + unlock_condition(jnl, &jnl->flushing); + + KERNEL_DEBUG(0xbbbbc028|DBG_FUNC_END, jnl, tr, bufs_written, ret_val, 0); + + return (ret_val); +} + + +static void +lock_condition(journal *jnl, boolean_t *condition, const char *condition_name) +{ + + KERNEL_DEBUG(0xbbbbc020|DBG_FUNC_START, jnl, condition, 0, 0, 0); + + lock_flush(jnl); + + while (*condition == TRUE) + msleep(condition, &jnl->flock, PRIBIO, condition_name, NULL); + + *condition = TRUE; + unlock_flush(jnl); + + KERNEL_DEBUG(0xbbbbc020|DBG_FUNC_END, jnl, condition, 0, 0, 0); +} + +static void +wait_condition(journal *jnl, boolean_t *condition, const char *condition_name) +{ + + if (*condition == FALSE) + return; + + KERNEL_DEBUG(0xbbbbc02c|DBG_FUNC_START, jnl, condition, 0, 0, 0); + + lock_flush(jnl); + + while (*condition == TRUE) + msleep(condition, &jnl->flock, PRIBIO, condition_name, NULL); + + unlock_flush(jnl); + + KERNEL_DEBUG(0xbbbbc02c|DBG_FUNC_END, jnl, condition, 0, 0, 0); +} + +static void +unlock_condition(journal *jnl, boolean_t *condition) +{ + lock_flush(jnl); + + *condition = FALSE; + wakeup(condition); + + unlock_flush(jnl); } static void abort_transaction(journal *jnl, transaction *tr) { - int i; - errno_t errno; - 
block_list_header *blhdr, *next; - struct buf *bp; - struct vnode *save_vp; + block_list_header *blhdr, *next; - // for each block list header, iterate over the blocks then - // free up the memory associated with the block list. - // - // for each block, clear the lock bit and release it. - // - for(blhdr=tr->blhdr; blhdr; blhdr=next) { + // for each block list header, iterate over the blocks then + // free up the memory associated with the block list. + // + // find each of the primary blocks (i.e. the list could + // contain a mix of shadowed and real buf_t's depending + // on when the abort condition was detected) and mark them + // clean and locked in the cache... this at least allows + // the FS a consistent view between it's incore data structures + // and the meta-data held in the cache + // + KERNEL_DEBUG(0xbbbbc034|DBG_FUNC_START, jnl, tr, 0, 0, 0); + + for (blhdr = tr->blhdr; blhdr; blhdr = next) { + int i; + + for (i = 1; i < blhdr->num_blocks; i++) { + buf_t bp, tbp, sbp; + vnode_t bp_vp; + errno_t errno; - for(i=1; i < blhdr->num_blocks; i++) { - if (blhdr->binfo[i].bnum == (off_t)-1) { + if (blhdr->binfo[i].bnum == (off_t)-1) continue; - } - if ( (buf_vnode(blhdr->binfo[i].u.bp) == NULL) || - !(buf_flags(blhdr->binfo[i].u.bp) & B_LOCKED) ) { - continue; - } - errno = buf_meta_bread(buf_vnode(blhdr->binfo[i].u.bp), - buf_lblkno(blhdr->binfo[i].u.bp), - buf_size(blhdr->binfo[i].u.bp), - NOCRED, - &bp); - if (errno == 0) { - if (bp != blhdr->binfo[i].u.bp) { - panic("jnl: abort_tr: got back a different bp! 
(bp %p should be %p, jnl %p\n", - bp, blhdr->binfo[i].u.bp, jnl); - } + tbp = blhdr->binfo[i].u.bp; - // releasing a bp marked invalid - // also clears the locked and delayed state - buf_markinvalid(bp); - save_vp = buf_vnode(bp); + bp_vp = buf_vnode(tbp); - buf_brelse(bp); + buf_setfilter(tbp, NULL, NULL, NULL, NULL); - vnode_rele_ext(save_vp, 0, 1); - } else { - printf("jnl: %s: abort_tr: could not find block %Ld vp %p!\n", - jnl->jdev_name, blhdr->binfo[i].bnum, blhdr->binfo[i].u.bp); - if (bp) { + if (buf_shadow(tbp)) + sbp = tbp; + else + sbp = NULL; + + if (bp_vp) { + errno = buf_meta_bread(bp_vp, + buf_lblkno(tbp), + buf_size(tbp), + NOCRED, + &bp); + if (errno == 0) { + if (sbp == NULL && bp != tbp && (buf_flags(tbp) & B_LOCKED)) { + panic("jnl: abort_tr: got back a different bp! (bp %p should be %p, jnl %p\n", + bp, tbp, jnl); + } + /* + * once the journal has been marked INVALID and aborted, + * NO meta data can be written back to the disk, so + * mark the buf_t clean and make sure it's locked in the cache + * note: if we found a shadow, the real buf_t needs to be relocked + */ + buf_setflags(bp, B_LOCKED); + buf_markclean(bp); buf_brelse(bp); + + KERNEL_DEBUG(0xbbbbc034|DBG_FUNC_NONE, jnl, tr, bp, 0, 0); + + /* + * this undoes the vnode_ref() in journal_modify_block_end() + */ + vnode_rele_ext(bp_vp, 0, 1); + } else { + printf("jnl: %s: abort_tr: could not find block %Ld vp %p!\n", + jnl->jdev_name, blhdr->binfo[i].bnum, tbp); + if (bp) { + buf_brelse(bp); + } } } + if (sbp) + buf_brelse(sbp); } - next = (block_list_header *)((long)blhdr->binfo[0].bnum); // we can free blhdr here since we won't need it any more blhdr->binfo[0].bnum = 0xdeadc0de; kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size); - } + } + /* + * If the transaction we're aborting was the async transaction, then + * tell the current transaction that there is no pending trim + * any more. 
+ */ + lck_rw_lock_exclusive(&jnl->trim_lock); + if (jnl->async_trim == &tr->trim) + jnl->async_trim = NULL; + lck_rw_unlock_exclusive(&jnl->trim_lock); + if (tr->trim.extents) { kfree(tr->trim.extents, tr->trim.allocated_count * sizeof(dk_extent_t)); } tr->trim.allocated_count = 0; tr->trim.extent_count = 0; tr->trim.extents = NULL; - tr->tbuffer = NULL; - tr->blhdr = NULL; - tr->total_bytes = 0xdbadc0de; + tr->tbuffer = NULL; + tr->blhdr = NULL; + tr->total_bytes = 0xdbadc0de; FREE_ZONE(tr, sizeof(transaction), M_JNL_TR); + + KERNEL_DEBUG(0xbbbbc034|DBG_FUNC_END, jnl, tr, 0, 0, 0); } int journal_end_transaction(journal *jnl) { - int ret; + int ret; transaction *tr; - CHECK_JOURNAL(jnl); + CHECK_JOURNAL(jnl); + + free_old_stuff(jnl); if ((jnl->flags & JOURNAL_INVALID) && jnl->owner == NULL) { return 0; } - if (jnl->owner != current_thread()) { + if (jnl->owner != current_thread()) { panic("jnl: end_tr: I'm not the owner! jnl %p, owner %p, curact %p\n", - jnl, jnl->owner, current_thread()); - } - - free_old_stuff(jnl); + jnl, jnl->owner, current_thread()); + } + jnl->nested_count--; - jnl->nested_count--; - if (jnl->nested_count > 0) { + if (jnl->nested_count > 0) { return 0; - } else if (jnl->nested_count < 0) { + } else if (jnl->nested_count < 0) { panic("jnl: jnl @ %p has negative nested count (%d). bad boy.\n", jnl, jnl->nested_count); - } + } - if (jnl->flags & JOURNAL_INVALID) { + if (jnl->flags & JOURNAL_INVALID) { if (jnl->active_tr) { if (jnl->cur_tr != NULL) { panic("jnl: journal @ %p has active tr (%p) and cur tr (%p)\n", - jnl, jnl->active_tr, jnl->cur_tr); + jnl, jnl->active_tr, jnl->cur_tr); } - tr = jnl->active_tr; jnl->active_tr = NULL; + abort_transaction(jnl, tr); } - jnl->owner = NULL; unlock_journal(jnl); return EINVAL; - } - - tr = jnl->active_tr; - CHECK_TRANSACTION(tr); + } - // clear this out here so that when check_free_space() calls - // the FS flush function, we don't panic in journal_flush() - // if the FS were to call that. 
note: check_free_space() is - // called from end_transaction(). - // - jnl->active_tr = NULL; - ret = end_transaction(tr, 0, NULL, NULL); + tr = jnl->active_tr; + CHECK_TRANSACTION(tr); - jnl->owner = NULL; - unlock_journal(jnl); + // clear this out here so that when check_free_space() calls + // the FS flush function, we don't panic in journal_flush() + // if the FS were to call that. note: check_free_space() is + // called from end_transaction(). + // + jnl->active_tr = NULL; + ret = end_transaction(tr, 0, NULL, NULL, TRUE, FALSE); - return ret; + return ret; } +/* + * Flush the contents of the journal to the disk. + * + * Input: + * wait_for_IO - + * If TRUE, wait to write in-memory journal to the disk + * consistently, and also wait to write all asynchronous + * metadata blocks to its corresponding locations + * consistently on the disk. This means that the journal + * is empty at this point and does not contain any + * transactions. This is overkill in normal scenarios + * but is useful whenever the metadata blocks are required + * to be consistent on-disk instead of just the journal + * being consistent; like before live verification + * and live volume resizing. + * + * If FALSE, only wait to write in-memory journal to the + * disk consistently. This means that the journal still + * contains uncommitted transactions and the file system + * metadata blocks in the journal transactions might be + * written asynchronously to the disk. But there is no + * guarantee that they are written to the disk before + * returning to the caller. Note that this option is + * sufficient for file system data integrity as it + * guarantees consistent journal content on the disk. 
+ */ int -journal_flush(journal *jnl) +journal_flush(journal *jnl, boolean_t wait_for_IO) { - int need_signal = 0; + boolean_t drop_lock = FALSE; - CHECK_JOURNAL(jnl); + CHECK_JOURNAL(jnl); - if (jnl->flags & JOURNAL_INVALID) { + free_old_stuff(jnl); + + if (jnl->flags & JOURNAL_INVALID) { return -1; - } + } - KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_JOURNAL, DBG_JOURNAL_FLUSH)) - | DBG_FUNC_START, 0, 0, 0, 0, 0); + KERNEL_DEBUG(DBG_JOURNAL_FLUSH | DBG_FUNC_START, jnl, 0, 0, 0, 0); - if (jnl->owner != current_thread()) { + if (jnl->owner != current_thread()) { lock_journal(jnl); - need_signal = 1; - } - - free_old_stuff(jnl); + drop_lock = TRUE; + } - // if we're not active, flush any buffered transactions - if (jnl->active_tr == NULL && jnl->cur_tr) { + // if we're not active, flush any buffered transactions + if (jnl->active_tr == NULL && jnl->cur_tr) { transaction *tr = jnl->cur_tr; jnl->cur_tr = NULL; - end_transaction(tr, 1, NULL, NULL); // force it to get flushed - } - if (need_signal) { - unlock_journal(jnl); - } + if (wait_for_IO) { + wait_condition(jnl, &jnl->flushing, "journal_flush"); + wait_condition(jnl, &jnl->asyncIO, "journal_flush"); + } + /* + * "end_transction" will wait for any current async flush + * to complete, before flushing "cur_tr"... because we've + * specified the 'must_wait' arg as TRUE, it will then + * synchronously flush the "cur_tr" + */ + end_transaction(tr, 1, NULL, NULL, drop_lock, TRUE); // force it to get flushed - KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_JOURNAL, DBG_JOURNAL_FLUSH)) - | DBG_FUNC_END, 0, 0, 0, 0, 0); + } else { + if (drop_lock == TRUE) { + unlock_journal(jnl); + } - return 0; + /* Because of pipelined journal, the journal transactions + * might be in process of being flushed on another thread. + * If there is nothing to flush currently, we should + * synchronize ourselves with the pipelined journal thread + * to ensure that all inflight transactions, if any, are + * flushed before we return success to caller. 
+ */ + wait_condition(jnl, &jnl->flushing, "journal_flush"); + } + if (wait_for_IO) { + wait_condition(jnl, &jnl->asyncIO, "journal_flush"); + } + + KERNEL_DEBUG(DBG_JOURNAL_FLUSH | DBG_FUNC_END, jnl, 0, 0, 0, 0); + + return 0; } int journal_active(journal *jnl) { - if (jnl->flags & JOURNAL_INVALID) { + if (jnl->flags & JOURNAL_INVALID) { return -1; - } + } - return (jnl->active_tr == NULL) ? 0 : 1; + return (jnl->active_tr == NULL) ? 0 : 1; } void * journal_owner(journal *jnl) { - return jnl->owner; + return jnl->owner; } int journal_uses_fua(journal *jnl) @@ -3835,37 +4518,37 @@ int journal_uses_fua(journal *jnl) int journal_relocate(journal *jnl, off_t offset, off_t journal_size, int32_t tbuffer_size, errno_t (*callback)(void *), void *callback_arg) { - int ret; - transaction *tr; + int ret; + transaction *tr; /* * Sanity check inputs, and adjust the size of the transaction buffer. */ - if ((offset % jnl->jhdr->jhdr_size) != 0) { + if ((offset % jnl->jhdr->jhdr_size) != 0) { printf("jnl: %s: relocate: offset 0x%llx is not an even multiple of block size 0x%x\n", - jnl->jdev_name, offset, jnl->jhdr->jhdr_size); + jnl->jdev_name, offset, jnl->jhdr->jhdr_size); return EINVAL; - } - if ((journal_size % jnl->jhdr->jhdr_size) != 0) { + } + if ((journal_size % jnl->jhdr->jhdr_size) != 0) { printf("jnl: %s: relocate: journal size 0x%llx is not an even multiple of block size 0x%x\n", - jnl->jdev_name, journal_size, jnl->jhdr->jhdr_size); + jnl->jdev_name, journal_size, jnl->jhdr->jhdr_size); return EINVAL; - } + } - CHECK_JOURNAL(jnl); + CHECK_JOURNAL(jnl); /* Guarantee we own the active transaction. */ - if (jnl->flags & JOURNAL_INVALID) { + if (jnl->flags & JOURNAL_INVALID) { return EINVAL; - } - if (jnl->owner != current_thread()) { - panic("jnl: relocate: Not the owner! jnl %p, owner %p, curact %p\n", - jnl, jnl->owner, current_thread()); + } + if (jnl->owner != current_thread()) { + panic("jnl: relocate: Not the owner! 
jnl %p, owner %p, curact %p\n", + jnl, jnl->owner, current_thread()); } - if (tbuffer_size == 0) - tbuffer_size = jnl->tbuffer_size; - size_up_tbuffer(jnl, tbuffer_size, jnl->jhdr->jhdr_size); + if (tbuffer_size == 0) + tbuffer_size = jnl->tbuffer_size; + size_up_tbuffer(jnl, tbuffer_size, jnl->jhdr->jhdr_size); /* * Flush any non-active transactions. We have to temporarily hide the @@ -3875,11 +4558,13 @@ int journal_relocate(journal *jnl, off_t offset, off_t journal_size, int32_t tbu tr = jnl->active_tr; CHECK_TRANSACTION(tr); jnl->active_tr = NULL; - ret = journal_flush(jnl); + ret = journal_flush(jnl, TRUE); jnl->active_tr = tr; + if (ret) { return ret; } + wait_condition(jnl, &jnl->flushing, "end_transaction"); /* Update the journal's offset and size in memory. */ jnl->jdev_offset = offset; @@ -3893,7 +4578,7 @@ int journal_relocate(journal *jnl, off_t offset, off_t journal_size, int32_t tbu * before they get written to their normal on-disk locations. */ jnl->active_tr = NULL; - ret = end_transaction(tr, 1, callback, callback_arg); + ret = end_transaction(tr, 1, callback, callback_arg, FALSE, TRUE); if (ret) { printf("jnl: %s: relocate: end_transaction failed (%d)\n", jnl->jdev_name, ret); goto bad_journal; @@ -3912,9 +4597,9 @@ int journal_relocate(journal *jnl, off_t offset, off_t journal_size, int32_t tbu return 0; bad_journal: - jnl->flags |= JOURNAL_INVALID; - abort_transaction(jnl, tr); - return ret; + jnl->flags |= JOURNAL_INVALID; + abort_transaction(jnl, tr); + return ret; } @@ -3927,62 +4612,62 @@ int journal_uses_fua(__unused journal *jnl) journal * journal_create(__unused struct vnode *jvp, - __unused off_t offset, - __unused off_t journal_size, - __unused struct vnode *fsvp, - __unused size_t min_fs_blksz, - __unused int32_t flags, - __unused int32_t tbuffer_size, - __unused void (*flush)(void *arg), - __unused void *arg) + __unused off_t offset, + __unused off_t journal_size, + __unused struct vnode *fsvp, + __unused size_t min_fs_blksz, + 
__unused int32_t flags, + __unused int32_t tbuffer_size, + __unused void (*flush)(void *arg), + __unused void *arg) { return NULL; } journal * journal_open(__unused struct vnode *jvp, - __unused off_t offset, - __unused off_t journal_size, - __unused struct vnode *fsvp, - __unused size_t min_fs_blksz, - __unused int32_t flags, - __unused int32_t tbuffer_size, - __unused void (*flush)(void *arg), - __unused void *arg) + __unused off_t offset, + __unused off_t journal_size, + __unused struct vnode *fsvp, + __unused size_t min_fs_blksz, + __unused int32_t flags, + __unused int32_t tbuffer_size, + __unused void (*flush)(void *arg), + __unused void *arg) { - return NULL; + return NULL; } int journal_modify_block_start(__unused journal *jnl, __unused struct buf *bp) { - return EINVAL; + return EINVAL; } int journal_modify_block_end(__unused journal *jnl, - __unused struct buf *bp, - __unused void (*func)(struct buf *bp, void *arg), - __unused void *arg) + __unused struct buf *bp, + __unused void (*func)(struct buf *bp, void *arg), + __unused void *arg) { - return EINVAL; + return EINVAL; } int journal_kill_block(__unused journal *jnl, __unused struct buf *bp) { - return EINVAL; + return EINVAL; } int journal_relocate(__unused journal *jnl, - __unused off_t offset, - __unused off_t journal_size, - __unused int32_t tbuffer_size, - __unused errno_t (*callback)(void *), - __unused void *callback_arg) + __unused off_t offset, + __unused off_t journal_size, + __unused int32_t tbuffer_size, + __unused errno_t (*callback)(void *), + __unused void *callback_arg) { - return EINVAL; + return EINVAL; } void @@ -3993,19 +4678,19 @@ journal_close(__unused journal *jnl) int journal_start_transaction(__unused journal *jnl) { - return EINVAL; + return EINVAL; } int journal_end_transaction(__unused journal *jnl) { - return EINVAL; + return EINVAL; } int -journal_flush(__unused journal *jnl) +journal_flush(__unused journal *jnl, __unused boolean_t wait_for_IO) { - return EINVAL; + return 
EINVAL; } int @@ -4015,13 +4700,13 @@ journal_is_clean(__unused struct vnode *jvp, __unused struct vnode *fsvp, __unused size_t min_fs_block_size) { - return 0; + return 0; } void * journal_owner(__unused journal *jnl) { - return NULL; + return NULL; } #endif // !JOURNALING diff --git a/bsd/vfs/vfs_journal.h b/bsd/vfs/vfs_journal.h index 310445395..11b24c3ee 100644 --- a/bsd/vfs/vfs_journal.h +++ b/bsd/vfs/vfs_journal.h @@ -1,4 +1,3 @@ - /* * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * @@ -81,20 +80,23 @@ struct jnl_trim_list { dk_extent_t *extents; }; +typedef void (*jnl_trim_callback_t)(void *arg, uint32_t extent_count, const dk_extent_t *extents); + typedef struct transaction { - int tbuffer_size; // in bytes - char *tbuffer; // memory copy of the transaction - block_list_header *blhdr; // points to the first byte of tbuffer - int num_blhdrs; // how many buffers we've allocated - int total_bytes; // total # of bytes in transaction - int num_flushed; // how many bytes have been flushed - int num_killed; // how many bytes were "killed" - off_t journal_start; // where in the journal this transaction starts - off_t journal_end; // where in the journal this transaction ends - struct journal *jnl; // ptr back to the journal structure - struct transaction *next; // list of tr's (either completed or to be free'd) - uint32_t sequence_num; - struct jnl_trim_list trim; + int tbuffer_size; // in bytes + char *tbuffer; // memory copy of the transaction + block_list_header *blhdr; // points to the first byte of tbuffer + int num_blhdrs; // how many buffers we've allocated + int total_bytes; // total # of bytes in transaction + int num_flushed; // how many bytes have been flushed + int num_killed; // how many bytes were "killed" + off_t journal_start; // where in the journal this transaction starts + off_t journal_end; // where in the journal this transaction ends + struct journal *jnl; // ptr back to the journal structure + struct transaction *next; // list of 
tr's (either completed or to be free'd) + uint32_t sequence_num; + struct jnl_trim_list trim; + boolean_t delayed_header_write; } transaction; @@ -133,6 +135,8 @@ typedef struct journal_header { */ typedef struct journal { lck_mtx_t jlock; // protects the struct journal data + lck_mtx_t flock; // serializes flushing of journal + lck_rw_t trim_lock; // protects the async_trim field, below struct vnode *jdev; // vnode of the device where the journal lives off_t jdev_offset; // byte offset to the start of the journal @@ -145,11 +149,23 @@ typedef struct journal { int32_t flags; int32_t tbuffer_size; // default transaction buffer size - + boolean_t flush_aborted; + boolean_t flushing; + boolean_t asyncIO; + boolean_t writing_header; + boolean_t write_header_failed; + + struct jnl_trim_list *async_trim; // extents to be trimmed by transaction being asynchronously flushed + jnl_trim_callback_t trim_callback; + void *trim_callback_arg; + char *header_buf; // in-memory copy of the journal header int32_t header_buf_size; journal_header *jhdr; // points to the first byte of header_buf + uint32_t saved_sequence_num; + uint32_t sequence_num; + off_t max_read_size; off_t max_write_size; @@ -174,7 +190,7 @@ typedef struct journal { #define JOURNAL_FLUSHCACHE_ERR 0x00040000 // means we already printed this err #define JOURNAL_NEED_SWAP 0x00080000 // swap any data read from disk #define JOURNAL_DO_FUA_WRITES 0x00100000 // do force-unit-access writes -#define JOURNAL_TRIM_ERR 0x00200000 // a previous trim failed +#define JOURNAL_USE_UNMAP 0x00200000 // device supports UNMAP (TRIM) /* journal_open/create options are always in the low-16 bits */ #define JOURNAL_OPTION_FLAGS_MASK 0x0000ffff @@ -306,11 +322,12 @@ int journal_kill_block(journal *jnl, struct buf *bp); #ifdef BSD_KERNEL_PRIVATE int journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length); int journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length); +void journal_trim_set_callback(journal 
*jnl, jnl_trim_callback_t callback, void *arg); #endif int journal_end_transaction(journal *jnl); int journal_active(journal *jnl); -int journal_flush(journal *jnl); +int journal_flush(journal *jnl, boolean_t wait_for_IO); void *journal_owner(journal *jnl); // compare against current_thread() int journal_uses_fua(journal *jnl); diff --git a/bsd/vfs/vfs_lookup.c b/bsd/vfs/vfs_lookup.c index 553bb41f9..10b885d51 100644 --- a/bsd/vfs/vfs_lookup.c +++ b/bsd/vfs/vfs_lookup.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -111,6 +111,17 @@ static void kdebug_lookup(struct vnode *dp, struct componentname *cnp); static int vfs_getrealpath(const char * path, char * realpath, size_t bufsize, vfs_context_t ctx); #endif +boolean_t lookup_continue_ok(struct nameidata *ndp); +int lookup_traverse_mountpoints(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, int vbusyflags, vfs_context_t ctx); +int lookup_handle_symlink(struct nameidata *ndp, vnode_t *new_dp, vfs_context_t ctx); +int lookup_authorize_search(vnode_t dp, struct componentname *cnp, int dp_authorized_in_cache, vfs_context_t ctx); +void lookup_consider_update_cache(vnode_t dvp, vnode_t vp, struct componentname *cnp, int nc_generation); +int lookup_handle_rsrc_fork(vnode_t dp, struct nameidata *ndp, struct componentname *cnp, int wantparent, vfs_context_t ctx); +int lookup_handle_found_vnode(struct nameidata *ndp, struct componentname *cnp, int rdonly, + int vbusyflags, int *keep_going, int nc_generation, + int wantparent, int atroot, vfs_context_t ctx); +int lookup_handle_emptyname(struct nameidata *ndp, struct componentname *cnp, int wantparent); + /* * Convert a pathname into a pointer to a locked inode. 
* @@ -150,12 +161,10 @@ int namei(struct nameidata *ndp) { struct filedesc *fdp; /* pointer to file descriptor state */ - char *cp; /* pointer into pathname argument */ struct vnode *dp; /* the directory we are searching */ struct vnode *usedvp = ndp->ni_dvp; /* store pointer to vp in case we must loop due to heavy vnode pressure */ u_long cnpflags = ndp->ni_cnd.cn_flags; /* store in case we have to restore after loop */ - uio_t auio; int error; struct componentname *cnp = &ndp->ni_cnd; vfs_context_t ctx = cnp->cn_context; @@ -164,8 +173,8 @@ namei(struct nameidata *ndp) /* XXX ut should be from context */ uthread_t ut = (struct uthread *)get_bsdthread_info(current_thread()); #endif - char *tmppn; - char uio_buf[ UIO_SIZEOF(1) ]; + + fdp = p->p_fd; #if DIAGNOSTIC if (!vfs_context_ucred(ctx) || !p) @@ -175,7 +184,35 @@ namei(struct nameidata *ndp) if (cnp->cn_flags & OPMASK) panic ("namei: flags contaminated with nameiops"); #endif - fdp = p->p_fd; + + /* + * A compound VNOP found something that needs further processing: + * either a trigger vnode, a covered directory, or a symlink. + */ + if (ndp->ni_flag & NAMEI_CONTLOOKUP) { + int rdonly, vbusyflags, keep_going, wantparent; + + rdonly = cnp->cn_flags & RDONLY; + vbusyflags = ((cnp->cn_flags & CN_NBMOUNTLOOK) != 0) ? 
LK_NOWAIT : 0; + keep_going = 0; + wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT); + + ndp->ni_flag &= ~(NAMEI_CONTLOOKUP); + + error = lookup_handle_found_vnode(ndp, &ndp->ni_cnd, rdonly, vbusyflags, + &keep_going, ndp->ni_ncgeneration, wantparent, 0, ctx); + if (error) + goto out_drop; + if (keep_going) { + if ((cnp->cn_flags & ISSYMLINK) == 0) { + panic("We need to keep going on a continued lookup, but for vp type %d (tag %d)\n", ndp->ni_vp->v_type, ndp->ni_vp->v_tag); + } + goto continue_symlink; + } + + return 0; + + } vnode_recycled: @@ -310,9 +347,6 @@ namei(struct nameidata *ndp) ndp->ni_vp = NULLVP; for (;;) { - int need_newpathbuf; - u_int linklen; - ndp->ni_startdir = dp; if ( (error = lookup(ndp)) ) { @@ -324,104 +358,13 @@ namei(struct nameidata *ndp) if ((cnp->cn_flags & ISSYMLINK) == 0) { return (0); } -#ifndef __LP64__ - if ((cnp->cn_flags & FSNODELOCKHELD)) { - cnp->cn_flags &= ~FSNODELOCKHELD; - unlock_fsnode(ndp->ni_dvp, NULL); - } -#endif /* __LP64__ */ - - if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { - error = ELOOP; - break; - } -#if CONFIG_MACF - if ((error = mac_vnode_check_readlink(ctx, ndp->ni_vp)) != 0) - break; -#endif /* MAC */ - if (ndp->ni_pathlen > 1 || !(cnp->cn_flags & HASBUF)) - need_newpathbuf = 1; - else - need_newpathbuf = 0; - - if (need_newpathbuf) { - MALLOC_ZONE(cp, char *, MAXPATHLEN, M_NAMEI, M_WAITOK); - if (cp == NULL) { - error = ENOMEM; - break; - } - } else { - cp = cnp->cn_pnbuf; - } - auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf)); - - uio_addiov(auio, CAST_USER_ADDR_T(cp), MAXPATHLEN); - - error = VNOP_READLINK(ndp->ni_vp, auio, ctx); - if (error) { - if (need_newpathbuf) - FREE_ZONE(cp, MAXPATHLEN, M_NAMEI); - break; - } - - /* - * Safe to set unsigned with a [larger] signed type here - * because 0 <= uio_resid <= MAXPATHLEN and MAXPATHLEN - * is only 1024. 
- */ - linklen = MAXPATHLEN - (u_int)uio_resid(auio); - if (linklen + ndp->ni_pathlen > MAXPATHLEN) { - if (need_newpathbuf) - FREE_ZONE(cp, MAXPATHLEN, M_NAMEI); - error = ENAMETOOLONG; +continue_symlink: + /* Gives us a new path to process, and a starting dir */ + error = lookup_handle_symlink(ndp, &dp, ctx); + if (error != 0) { break; } - if (need_newpathbuf) { - long len = cnp->cn_pnlen; - - tmppn = cnp->cn_pnbuf; - bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen); - cnp->cn_pnbuf = cp; - cnp->cn_pnlen = MAXPATHLEN; - - if ( (cnp->cn_flags & HASBUF) ) - FREE_ZONE(tmppn, len, M_NAMEI); - else - cnp->cn_flags |= HASBUF; - } else - cnp->cn_pnbuf[linklen] = '\0'; - - ndp->ni_pathlen += linklen; - cnp->cn_nameptr = cnp->cn_pnbuf; - - /* - * starting point for 'relative' - * symbolic link path - */ - dp = ndp->ni_dvp; - /* - * get rid of references returned via 'lookup' - */ - vnode_put(ndp->ni_vp); - vnode_put(ndp->ni_dvp); - - ndp->ni_vp = NULLVP; - ndp->ni_dvp = NULLVP; - - /* - * Check if symbolic link restarts us at the root - */ - if (*(cnp->cn_nameptr) == '/') { - while (*(cnp->cn_nameptr) == '/') { - cnp->cn_nameptr++; - ndp->ni_pathlen--; - } - if ((dp = ndp->ni_rootdir) == NULLVP) { - error = ENOENT; - goto error_out; - } - } } /* * only come here if we fail to handle a SYMLINK... @@ -429,6 +372,7 @@ namei(struct nameidata *ndp) * we need to drop the iocount that was picked * up in the lookup routine */ +out_drop: if (ndp->ni_dvp) vnode_put(ndp->ni_dvp); if (ndp->ni_vp) @@ -440,6 +384,7 @@ namei(struct nameidata *ndp) } cnp->cn_pnbuf = NULL; ndp->ni_vp = NULLVP; + ndp->ni_dvp = NULLVP; if (error == ERECYCLE){ /* vnode was recycled underneath us. 
re-drive lookup to start at the beginning again, since recycling invalidated last lookup*/ @@ -452,143 +397,534 @@ namei(struct nameidata *ndp) return (error); } +int +namei_compound_available(vnode_t dp, struct nameidata *ndp) +{ + if ((ndp->ni_flag & NAMEI_COMPOUNDOPEN) != 0) { + return vnode_compound_open_available(dp); + } -/* - * Search a pathname. - * This is a very central and rather complicated routine. - * - * The pathname is pointed to by ni_ptr and is of length ni_pathlen. - * The starting directory is taken from ni_startdir. The pathname is - * descended until done, or a symbolic link is encountered. The variable - * ni_more is clear if the path is completed; it is set to one if a - * symbolic link needing interpretation is encountered. - * - * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on - * whether the name is to be looked up, created, renamed, or deleted. - * When CREATE, RENAME, or DELETE is specified, information usable in - * creating, renaming, or deleting a directory entry may be calculated. - * If flag has LOCKPARENT or'ed into it, the parent directory is returned - * locked. If flag has WANTPARENT or'ed into it, the parent directory is - * returned unlocked. Otherwise the parent directory is not returned. If - * the target of the pathname exists and LOCKLEAF is or'ed into the flag - * the target is returned locked, otherwise it is returned unlocked. - * When creating or renaming and LOCKPARENT is specified, the target may not - * be ".". When deleting and LOCKPARENT is specified, the target may be ".". - * - * Overall outline of lookup: - * - * dirloop: - * identify next component of name at ndp->ni_ptr - * handle degenerate case where name is null string - * if .. and crossing mount points and on mounted filesys, find parent - * call VNOP_LOOKUP routine for next component name - * directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set - * component vnode returned in ni_vp (if it exists), locked. 
- * if result vnode is mounted on and crossing mount points, - * find mounted on vnode - * if more components of name, do next level at dirloop - * return the answer in ni_vp, locked if LOCKLEAF set - * if LOCKPARENT set, return locked parent in ni_dvp - * if WANTPARENT set, return unlocked parent in ni_dvp - * - * Returns: 0 Success - * ENOENT No such file or directory - * EBADF Bad file descriptor - * ENOTDIR Not a directory - * EROFS Read-only file system [CREATE] - * EISDIR Is a directory [CREATE] - * cache_lookup_path:ERECYCLE (vnode was recycled from underneath us, redrive lookup again) - * vnode_authorize:EROFS - * vnode_authorize:EACCES - * vnode_authorize:EPERM - * vnode_authorize:??? - * VNOP_LOOKUP:ENOENT No such file or directory - * VNOP_LOOKUP:EJUSTRETURN Restart system call (INTERNAL) - * VNOP_LOOKUP:??? - * VFS_ROOT:ENOTSUP - * VFS_ROOT:ENOENT - * VFS_ROOT:??? - */ + return 0; +} int -lookup(struct nameidata *ndp) +lookup_authorize_search(vnode_t dp, struct componentname *cnp, int dp_authorized_in_cache, vfs_context_t ctx) { - char *cp; /* pointer into pathname argument */ - vnode_t tdp; /* saved dp */ - vnode_t dp; /* the directory we are searching */ - mount_t mp; /* mount table entry */ - int docache = 1; /* == 0 do not cache last component */ - int wantparent; /* 1 => wantparent or lockparent flag */ - int rdonly; /* lookup read-only flag bit */ - int trailing_slash = 0; - int dp_authorized = 0; - int error = 0; - struct componentname *cnp = &ndp->ni_cnd; - vfs_context_t ctx = cnp->cn_context; - int mounted_on_depth = 0; - int dont_cache_mp = 0; - vnode_t mounted_on_dp = NULLVP; - int current_mount_generation = 0; - int vbusyflags = 0; - int nc_generation = 0; - vnode_t last_dp = NULLVP; + int error; - /* - * Setup: break out flag bits into variables. 
- */ - if (cnp->cn_flags & (NOCACHE | DOWHITEOUT)) { - if ((cnp->cn_flags & NOCACHE) || (cnp->cn_nameiop == DELETE)) - docache = 0; + if (!dp_authorized_in_cache) { + error = vnode_authorize(dp, NULL, KAUTH_VNODE_SEARCH, ctx); + if (error) + return error; } - wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT); - rdonly = cnp->cn_flags & RDONLY; - cnp->cn_flags &= ~ISSYMLINK; - cnp->cn_consume = 0; +#if CONFIG_MACF + error = mac_vnode_check_lookup(ctx, dp, cnp); + if (error) + return error; +#endif /* CONFIG_MACF */ - dp = ndp->ni_startdir; - ndp->ni_startdir = NULLVP; + return 0; +} - if ((cnp->cn_flags & CN_NBMOUNTLOOK) != 0) - vbusyflags = LK_NOWAIT; - cp = cnp->cn_nameptr; +void +lookup_consider_update_cache(vnode_t dvp, vnode_t vp, struct componentname *cnp, int nc_generation) +{ + int isdot_or_dotdot; + isdot_or_dotdot = (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') || (cnp->cn_flags & ISDOTDOT); - if (*cp == '\0') { - if ( (vnode_getwithref(dp)) ) { - dp = NULLVP; - error = ENOENT; - goto bad; + if (vp->v_name == NULL || vp->v_parent == NULLVP) { + int update_flags = 0; + + if (isdot_or_dotdot == 0) { + if (vp->v_name == NULL) + update_flags |= VNODE_UPDATE_NAME; + if (dvp != NULLVP && vp->v_parent == NULLVP) + update_flags |= VNODE_UPDATE_PARENT; + + if (update_flags) + vnode_update_identity(vp, dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, update_flags); } - goto emptyname; } -dirloop: - ndp->ni_vp = NULLVP; + if ( (cnp->cn_flags & MAKEENTRY) && (vp->v_flag & VNCACHEABLE) && LIST_FIRST(&vp->v_nclinks) == NULL) { + /* + * missing from name cache, but should + * be in it... this can happen if volfs + * causes the vnode to be created or the + * name cache entry got recycled but the + * vnode didn't... + * check to make sure that ni_dvp is valid + * cache_lookup_path may return a NULL + * do a quick check to see if the generation of the + * directory matches our snapshot... 
this will get + * rechecked behind the name cache lock, but if it + * already fails to match, no need to go any further + */ + if (dvp != NULLVP && (nc_generation == dvp->v_nc_generation) && (!isdot_or_dotdot)) + cache_enter_with_gen(dvp, vp, cnp, nc_generation); + } - if ( (error = cache_lookup_path(ndp, cnp, dp, ctx, &trailing_slash, &dp_authorized, last_dp)) ) { - dp = NULLVP; - goto bad; +} + +#if NAMEDRSRCFORK +/* + * Can change ni_dvp and ni_vp. On success, returns with iocounts on stream vnode (always) and + * data fork if requested. On failure, returns with iocount data fork (always) and its parent directory + * (if one was provided). + */ +int +lookup_handle_rsrc_fork(vnode_t dp, struct nameidata *ndp, struct componentname *cnp, int wantparent, vfs_context_t ctx) +{ + vnode_t svp = NULLVP; + enum nsoperation nsop; + int error; + + if (dp->v_type != VREG) { + error = ENOENT; + goto out; } - if ((cnp->cn_flags & ISLASTCN)) { - if (docache) - cnp->cn_flags |= MAKEENTRY; - } else - cnp->cn_flags |= MAKEENTRY; + switch (cnp->cn_nameiop) { + case DELETE: + if (cnp->cn_flags & CN_ALLOWRSRCFORK) { + nsop = NS_DELETE; + } else { + error = EPERM; + goto out; + } + break; + case CREATE: + if (cnp->cn_flags & CN_ALLOWRSRCFORK) { + nsop = NS_CREATE; + } else { + error = EPERM; + goto out; + } + break; + case LOOKUP: + /* Make sure our lookup of "/..namedfork/rsrc" is allowed. */ + if (cnp->cn_flags & CN_ALLOWRSRCFORK) { + nsop = NS_OPEN; + } else { + error = EPERM; + goto out; + } + break; + default: + error = EPERM; + goto out; + } + /* Ask the file system for the resource fork. */ + error = vnode_getnamedstream(dp, &svp, XATTR_RESOURCEFORK_NAME, nsop, 0, ctx); - dp = ndp->ni_dvp; + /* During a create, it OK for stream vnode to be missing. */ + if (error == ENOATTR || error == ENOENT) { + error = (nsop == NS_CREATE) ? 0 : ENOENT; + } + if (error) { + goto out; + } + /* The "parent" of the stream is the file. 
*/ + if (wantparent) { + if (ndp->ni_dvp) { +#ifndef __LP64__ + if (ndp->ni_cnd.cn_flags & FSNODELOCKHELD) { + ndp->ni_cnd.cn_flags &= ~FSNODELOCKHELD; + unlock_fsnode(ndp->ni_dvp, NULL); + } +#endif /* __LP64__ */ + vnode_put(ndp->ni_dvp); + } + ndp->ni_dvp = dp; + } else { + vnode_put(dp); + } + ndp->ni_vp = svp; /* on create this may be null */ - if (ndp->ni_vp != NULLVP) { - /* - * cache_lookup_path returned a non-NULL ni_vp then, - * we're guaranteed that the dp is a VDIR, it's - * been authorized, and vp is not ".." - * - * make sure we don't try to enter the name back into - * the cache if this vp is purged before we get to that - * check since we won't have serialized behind whatever - * activity is occurring in the FS that caused the purge - */ - if (dp != NULLVP) - nc_generation = dp->v_nc_generation - 1; + /* Restore the truncated pathname buffer (for audits). */ + if (ndp->ni_pathlen == 1 && ndp->ni_next[0] == '\0') { + ndp->ni_next[0] = '/'; + } + cnp->cn_flags &= ~MAKEENTRY; - goto returned_from_lookup_path; + return 0; +out: + return error; +} +#endif /* NAMEDRSRCFORK */ + +/* + * iocounts in: + * --One on ni_vp. One on ni_dvp if there is more path, or we didn't come through the + * cache, or we came through the cache and the caller doesn't want the parent. + * + * iocounts out: + * --Leaves us in the correct state for the next step, whatever that might be. + * --If we find a symlink, returns with iocounts on both ni_vp and ni_dvp. + * --If we are to look up another component, then we have an iocount on ni_vp and + * nothing else. + * --If we are done, returns an iocount on ni_vp, and possibly on ni_dvp depending on nameidata flags. + * --In the event of an error, may return with ni_dvp NULL'ed out (in which case, iocount + * was dropped). 
+ */ +int +lookup_handle_found_vnode(struct nameidata *ndp, struct componentname *cnp, int rdonly, + int vbusyflags, int *keep_going, int nc_generation, + int wantparent, int atroot, vfs_context_t ctx) +{ + vnode_t dp; + int error; + char *cp; + + dp = ndp->ni_vp; + *keep_going = 0; + + if (ndp->ni_vp == NULLVP) { + panic("NULL ni_vp in %s\n", __FUNCTION__); + } + + if (atroot) { + goto nextname; + } + +#if CONFIG_TRIGGERS + if (dp->v_resolve) { + error = vnode_trigger_resolve(dp, ndp, ctx); + if (error) { + goto out; + } + } +#endif /* CONFIG_TRIGGERS */ + + /* + * Take into account any additional components consumed by + * the underlying filesystem. + */ + if (cnp->cn_consume > 0) { + cnp->cn_nameptr += cnp->cn_consume; + ndp->ni_next += cnp->cn_consume; + ndp->ni_pathlen -= cnp->cn_consume; + cnp->cn_consume = 0; + } else { + lookup_consider_update_cache(ndp->ni_dvp, dp, cnp, nc_generation); + } + + /* + * Check to see if the vnode has been mounted on... + * if so find the root of the mounted file system. + * Updates ndp->ni_vp. + */ + error = lookup_traverse_mountpoints(ndp, cnp, dp, vbusyflags, ctx); + dp = ndp->ni_vp; + if (error) { + goto out; + } + +#if CONFIG_MACF + if (vfs_flags(vnode_mount(dp)) & MNT_MULTILABEL) { + error = vnode_label(vnode_mount(dp), NULL, dp, NULL, 0, ctx); + if (error) + goto out; + } +#endif + + /* + * Check for symbolic link + */ + if ((dp->v_type == VLNK) && + ((cnp->cn_flags & FOLLOW) || (ndp->ni_flag & NAMEI_TRAILINGSLASH) || *ndp->ni_next == '/')) { + cnp->cn_flags |= ISSYMLINK; + *keep_going = 1; + return (0); + } + + /* + * Check for bogus trailing slashes. + */ + if ((ndp->ni_flag & NAMEI_TRAILINGSLASH)) { + if (dp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + ndp->ni_flag &= ~(NAMEI_TRAILINGSLASH); + } + +nextname: + /* + * Not a symbolic link. If more pathname, + * continue at next component, else return. 
+ * + * Definitely have a dvp if there's another slash + */ + if (*ndp->ni_next == '/') { + cnp->cn_nameptr = ndp->ni_next + 1; + ndp->ni_pathlen--; + while (*cnp->cn_nameptr == '/') { + cnp->cn_nameptr++; + ndp->ni_pathlen--; + } + + cp = cnp->cn_nameptr; + vnode_put(ndp->ni_dvp); + ndp->ni_dvp = NULLVP; + + if (*cp == '\0') { + goto emptyname; + } + + *keep_going = 1; + return 0; + } + + /* + * Disallow directory write attempts on read-only file systems. + */ + if (rdonly && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { + error = EROFS; + goto out; + } + + /* If SAVESTART is set, we should have a dvp */ + if (cnp->cn_flags & SAVESTART) { + /* + * note that we already hold a reference + * on both dp and ni_dvp, but for some reason + * can't get another one... in this case we + * need to do vnode_put on dp in 'bad2' + */ + if ( (vnode_get(ndp->ni_dvp)) ) { + error = ENOENT; + goto out; + } + ndp->ni_startdir = ndp->ni_dvp; + } + if (!wantparent && ndp->ni_dvp) { + vnode_put(ndp->ni_dvp); + ndp->ni_dvp = NULLVP; + } + + if (cnp->cn_flags & AUDITVNPATH1) + AUDIT_ARG(vnpath, dp, ARG_VNODE1); + else if (cnp->cn_flags & AUDITVNPATH2) + AUDIT_ARG(vnpath, dp, ARG_VNODE2); + +#if NAMEDRSRCFORK + /* + * Caller wants the resource fork. + */ + if ((cnp->cn_flags & CN_WANTSRSRCFORK) && (dp != NULLVP)) { + error = lookup_handle_rsrc_fork(dp, ndp, cnp, wantparent, ctx); + if (error != 0) + goto out; + + dp = ndp->ni_vp; + } +#endif + if (kdebug_enable) + kdebug_lookup(dp, cnp); + + return 0; + +emptyname: + error = lookup_handle_emptyname(ndp, cnp, wantparent); + if (error != 0) + goto out; + + return 0; +out: + return error; + +} + +/* + * Comes in iocount on ni_vp. May overwrite ni_dvp, but doesn't interpret incoming value. + */ +int +lookup_handle_emptyname(struct nameidata *ndp, struct componentname *cnp, int wantparent) +{ + vnode_t dp; + int error = 0; + + dp = ndp->ni_vp; + cnp->cn_namelen = 0; + /* + * A degenerate name (e.g. 
/ or "") which is a way of + * talking about a directory, e.g. like "/." or ".". + */ + if (dp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + if (cnp->cn_nameiop != LOOKUP) { + error = EISDIR; + goto out; + } + if (wantparent) { + /* + * note that we already hold a reference + * on dp, but for some reason can't + * get another one... in this case we + * need to do vnode_put on dp in 'bad' + */ + if ( (vnode_get(dp)) ) { + error = ENOENT; + goto out; + } + ndp->ni_dvp = dp; + } + cnp->cn_flags &= ~ISDOTDOT; + cnp->cn_flags |= ISLASTCN; + ndp->ni_next = cnp->cn_nameptr; + ndp->ni_vp = dp; + + if (cnp->cn_flags & AUDITVNPATH1) + AUDIT_ARG(vnpath, dp, ARG_VNODE1); + else if (cnp->cn_flags & AUDITVNPATH2) + AUDIT_ARG(vnpath, dp, ARG_VNODE2); + if (cnp->cn_flags & SAVESTART) + panic("lookup: SAVESTART"); + + return 0; +out: + return error; +} +/* + * Search a pathname. + * This is a very central and rather complicated routine. + * + * The pathname is pointed to by ni_ptr and is of length ni_pathlen. + * The starting directory is taken from ni_startdir. The pathname is + * descended until done, or a symbolic link is encountered. The variable + * ni_more is clear if the path is completed; it is set to one if a + * symbolic link needing interpretation is encountered. + * + * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on + * whether the name is to be looked up, created, renamed, or deleted. + * When CREATE, RENAME, or DELETE is specified, information usable in + * creating, renaming, or deleting a directory entry may be calculated. + * If flag has LOCKPARENT or'ed into it, the parent directory is returned + * locked. If flag has WANTPARENT or'ed into it, the parent directory is + * returned unlocked. Otherwise the parent directory is not returned. If + * the target of the pathname exists and LOCKLEAF is or'ed into the flag + * the target is returned locked, otherwise it is returned unlocked. 
+ * When creating or renaming and LOCKPARENT is specified, the target may not + * be ".". When deleting and LOCKPARENT is specified, the target may be ".". + * + * Overall outline of lookup: + * + * dirloop: + * identify next component of name at ndp->ni_ptr + * handle degenerate case where name is null string + * if .. and crossing mount points and on mounted filesys, find parent + * call VNOP_LOOKUP routine for next component name + * directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set + * component vnode returned in ni_vp (if it exists), locked. + * if result vnode is mounted on and crossing mount points, + * find mounted on vnode + * if more components of name, do next level at dirloop + * return the answer in ni_vp, locked if LOCKLEAF set + * if LOCKPARENT set, return locked parent in ni_dvp + * if WANTPARENT set, return unlocked parent in ni_dvp + * + * Returns: 0 Success + * ENOENT No such file or directory + * EBADF Bad file descriptor + * ENOTDIR Not a directory + * EROFS Read-only file system [CREATE] + * EISDIR Is a directory [CREATE] + * cache_lookup_path:ERECYCLE (vnode was recycled from underneath us, redrive lookup again) + * vnode_authorize:EROFS + * vnode_authorize:EACCES + * vnode_authorize:EPERM + * vnode_authorize:??? + * VNOP_LOOKUP:ENOENT No such file or directory + * VNOP_LOOKUP:EJUSTRETURN Restart system call (INTERNAL) + * VNOP_LOOKUP:??? + * VFS_ROOT:ENOTSUP + * VFS_ROOT:ENOENT + * VFS_ROOT:??? 
+ */ +int +lookup(struct nameidata *ndp) +{ + char *cp; /* pointer into pathname argument */ + vnode_t tdp; /* saved dp */ + vnode_t dp; /* the directory we are searching */ + int docache = 1; /* == 0 do not cache last component */ + int wantparent; /* 1 => wantparent or lockparent flag */ + int rdonly; /* lookup read-only flag bit */ + int dp_authorized = 0; + int error = 0; + struct componentname *cnp = &ndp->ni_cnd; + vfs_context_t ctx = cnp->cn_context; + int vbusyflags = 0; + int nc_generation = 0; + vnode_t last_dp = NULLVP; + int keep_going; + int atroot; + + /* + * Setup: break out flag bits into variables. + */ + if (cnp->cn_flags & (NOCACHE | DOWHITEOUT)) { + if ((cnp->cn_flags & NOCACHE) || (cnp->cn_nameiop == DELETE)) + docache = 0; + } + wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT); + rdonly = cnp->cn_flags & RDONLY; + cnp->cn_flags &= ~ISSYMLINK; + cnp->cn_consume = 0; + + dp = ndp->ni_startdir; + ndp->ni_startdir = NULLVP; + + if ((cnp->cn_flags & CN_NBMOUNTLOOK) != 0) + vbusyflags = LK_NOWAIT; + cp = cnp->cn_nameptr; + + if (*cp == '\0') { + if ( (vnode_getwithref(dp)) ) { + dp = NULLVP; + error = ENOENT; + goto bad; + } + ndp->ni_vp = dp; + error = lookup_handle_emptyname(ndp, cnp, wantparent); + if (error) { + goto bad; + } + + return 0; + } +dirloop: + atroot = 0; + ndp->ni_vp = NULLVP; + + if ( (error = cache_lookup_path(ndp, cnp, dp, ctx, &dp_authorized, last_dp)) ) { + dp = NULLVP; + goto bad; + } + if ((cnp->cn_flags & ISLASTCN)) { + if (docache) + cnp->cn_flags |= MAKEENTRY; + } else + cnp->cn_flags |= MAKEENTRY; + + dp = ndp->ni_dvp; + + if (ndp->ni_vp != NULLVP) { + /* + * cache_lookup_path returned a non-NULL ni_vp then, + * we're guaranteed that the dp is a VDIR, it's + * been authorized, and vp is not ".." 
+ * + * make sure we don't try to enter the name back into + * the cache if this vp is purged before we get to that + * check since we won't have serialized behind whatever + * activity is occurring in the FS that caused the purge + */ + if (dp != NULLVP) + nc_generation = dp->v_nc_generation - 1; + + goto returned_from_lookup_path; } /* @@ -618,7 +954,8 @@ lookup(struct nameidata *ndp) error = ENOENT; goto bad; } - goto nextname; + atroot = 1; + goto returned_from_lookup_path; } if ((dp->v_flag & VROOT) == 0 || (cnp->cn_flags & NOCROSSMOUNT)) @@ -653,21 +990,32 @@ lookup(struct nameidata *ndp) goto lookup_error; } if ( (cnp->cn_flags & DONOTAUTH) != DONOTAUTH ) { - if (!dp_authorized) { - error = vnode_authorize(dp, NULL, KAUTH_VNODE_SEARCH, ctx); - if (error) - goto lookup_error; - } -#if CONFIG_MACF - error = mac_vnode_check_lookup(ctx, dp, cnp); - if (error) + error = lookup_authorize_search(dp, cnp, dp_authorized, ctx); + if (error) { goto lookup_error; -#endif /* CONFIG_MACF */ + } + } + + /* + * Now that we've authorized a lookup, can bail out if the filesystem + * will be doing a batched operation. Return an iocount on dvp. + */ +#if NAMEDRSRCFORK + if ((cnp->cn_flags & ISLASTCN) && namei_compound_available(dp, ndp) && !(cnp->cn_flags & CN_WANTSRSRCFORK)) { +#else + if ((cnp->cn_flags & ISLASTCN) && namei_compound_available(dp, ndp)) { +#endif /* NAMEDRSRCFORK */ + ndp->ni_flag |= NAMEI_UNFINISHED; + ndp->ni_ncgeneration = dp->v_nc_generation; + return 0; } nc_generation = dp->v_nc_generation; - if ( (error = VNOP_LOOKUP(dp, &ndp->ni_vp, cnp, ctx)) ) { + error = VNOP_LOOKUP(dp, &ndp->ni_vp, cnp, ctx); + + + if ( error ) { lookup_error: if ((error == ENOENT) && (dp->v_flag & VROOT) && (dp->v_mount != NULL) && @@ -699,18 +1047,9 @@ lookup(struct nameidata *ndp) if (ndp->ni_vp != NULLVP) panic("leaf should be empty"); - /* - * If creating and at end of pathname, then can consider - * allowing file to be created. 
- */ - if (rdonly) { - error = EROFS; - goto bad; - } - if ((cnp->cn_flags & ISLASTCN) && trailing_slash && !(cnp->cn_flags & WILLBEDIR)) { - error = ENOENT; + error = lookup_validate_creation_path(ndp); + if (error) goto bad; - } /* * We return with ni_vp NULL to indicate that the entry * doesn't currently exist, leaving a pointer to the @@ -731,337 +1070,33 @@ lookup(struct nameidata *ndp) return (0); } returned_from_lookup_path: - dp = ndp->ni_vp; - - /* - * Take into account any additional components consumed by - * the underlying filesystem. - */ - if (cnp->cn_consume > 0) { - cnp->cn_nameptr += cnp->cn_consume; - ndp->ni_next += cnp->cn_consume; - ndp->ni_pathlen -= cnp->cn_consume; - cnp->cn_consume = 0; - } else { - int isdot_or_dotdot; - isdot_or_dotdot = (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') || (cnp->cn_flags & ISDOTDOT); - - if (dp->v_name == NULL || dp->v_parent == NULLVP) { - int update_flags = 0; - - if (isdot_or_dotdot == 0) { - if (dp->v_name == NULL) - update_flags |= VNODE_UPDATE_NAME; - if (ndp->ni_dvp != NULLVP && dp->v_parent == NULLVP) - update_flags |= VNODE_UPDATE_PARENT; - - if (update_flags) - vnode_update_identity(dp, ndp->ni_dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, update_flags); - } - } - if ( (cnp->cn_flags & MAKEENTRY) && (dp->v_flag & VNCACHEABLE) && LIST_FIRST(&dp->v_nclinks) == NULL) { - /* - * missing from name cache, but should - * be in it... this can happen if volfs - * causes the vnode to be created or the - * name cache entry got recycled but the - * vnode didn't... - * check to make sure that ni_dvp is valid - * cache_lookup_path may return a NULL - * do a quick check to see if the generation of the - * directory matches our snapshot... 
this will get - * rechecked behind the name cache lock, but if it - * already fails to match, no need to go any further - */ - if (ndp->ni_dvp != NULLVP && (nc_generation == ndp->ni_dvp->v_nc_generation) && (!isdot_or_dotdot)) - cache_enter_with_gen(ndp->ni_dvp, dp, cnp, nc_generation); - } - } - - mounted_on_dp = dp; - mounted_on_depth = 0; - dont_cache_mp = 0; - current_mount_generation = mount_generation; - /* - * Check to see if the vnode has been mounted on... - * if so find the root of the mounted file system. - */ -check_mounted_on: - if ((dp->v_type == VDIR) && dp->v_mountedhere && - ((cnp->cn_flags & NOCROSSMOUNT) == 0)) { - - vnode_lock(dp); - - if ((dp->v_type == VDIR) && (mp = dp->v_mountedhere)) { - struct uthread *uth = (struct uthread *)get_bsdthread_info(current_thread()); - - mp->mnt_crossref++; - vnode_unlock(dp); - - - if (vfs_busy(mp, vbusyflags)) { - mount_dropcrossref(mp, dp, 0); - if (vbusyflags == LK_NOWAIT) { - error = ENOENT; - goto bad2; - } - goto check_mounted_on; - } - - /* - * XXX - if this is the last component of the - * pathname, and it's either not a lookup operation - * or the NOTRIGGER flag is set for the operation, - * set a uthread flag to let VFS_ROOT() for autofs - * know it shouldn't trigger a mount. - */ - if ((cnp->cn_flags & ISLASTCN) && - (cnp->cn_nameiop != LOOKUP || - (cnp->cn_flags & NOTRIGGER))) { - uth->uu_notrigger = 1; - dont_cache_mp = 1; - } - error = VFS_ROOT(mp, &tdp, ctx); - /* XXX - clear the uthread flag */ - uth->uu_notrigger = 0; - /* - * mount_dropcrossref does a vnode_put - * on dp if the 3rd arg is non-zero - */ - mount_dropcrossref(mp, dp, 1); - dp = NULL; - vfs_unbusy(mp); - - if (error) { - goto bad2; - } - ndp->ni_vp = dp = tdp; - mounted_on_depth++; - - goto check_mounted_on; - } - vnode_unlock(dp); + /* We'll always have an iocount on ni_vp when this finishes. 
*/ + error = lookup_handle_found_vnode(ndp, cnp, rdonly, vbusyflags, &keep_going, nc_generation, wantparent, atroot, ctx); + if (error != 0) { + goto bad2; } -#if CONFIG_MACF - if (vfs_flags(vnode_mount(dp)) & MNT_MULTILABEL) { - error = vnode_label(vnode_mount(dp), NULL, dp, NULL, 0, ctx); - if (error) - goto bad2; - } -#endif - - if (mounted_on_depth && !dont_cache_mp) { - mp = mounted_on_dp->v_mountedhere; - - if (mp) { - mount_lock_spin(mp); - mp->mnt_realrootvp_vid = dp->v_id; - mp->mnt_realrootvp = dp; - mp->mnt_generation = current_mount_generation; - mount_unlock(mp); - } - } - - /* - * Check for symbolic link - */ - if ((dp->v_type == VLNK) && - ((cnp->cn_flags & FOLLOW) || trailing_slash || *ndp->ni_next == '/')) { - cnp->cn_flags |= ISSYMLINK; - return (0); - } - - /* - * Check for bogus trailing slashes. - */ - if (trailing_slash) { - if (dp->v_type != VDIR) { - error = ENOTDIR; - goto bad2; - } - trailing_slash = 0; - } - -nextname: - /* - * Not a symbolic link. If more pathname, - * continue at next component, else return. - */ - if (*ndp->ni_next == '/') { - cnp->cn_nameptr = ndp->ni_next + 1; - ndp->ni_pathlen--; - while (*cnp->cn_nameptr == '/') { - cnp->cn_nameptr++; - ndp->ni_pathlen--; - } - vnode_put(ndp->ni_dvp); - - cp = cnp->cn_nameptr; - - if (*cp == '\0') - goto emptyname; - - /* - * cache_lookup_path is now responsible for dropping io ref on dp - * when it is called again in the dirloop. This ensures we hold - * a ref on dp until we complete the next round of lookup. - */ - last_dp = dp; - goto dirloop; - } - - /* - * Disallow directory write attempts on read-only file systems. - */ - if (rdonly && - (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { - error = EROFS; - goto bad2; - } - if (cnp->cn_flags & SAVESTART) { - /* - * note that we already hold a reference - * on both dp and ni_dvp, but for some reason - * can't get another one... 
in this case we - * need to do vnode_put on dp in 'bad2' - */ - if ( (vnode_get(ndp->ni_dvp)) ) { - error = ENOENT; - goto bad2; - } - ndp->ni_startdir = ndp->ni_dvp; - } - if (!wantparent && ndp->ni_dvp) { - vnode_put(ndp->ni_dvp); - ndp->ni_dvp = NULLVP; - } - - if (cnp->cn_flags & AUDITVNPATH1) - AUDIT_ARG(vnpath, dp, ARG_VNODE1); - else if (cnp->cn_flags & AUDITVNPATH2) - AUDIT_ARG(vnpath, dp, ARG_VNODE2); - -#if NAMEDRSRCFORK - /* - * Caller wants the resource fork. - */ - if ((cnp->cn_flags & CN_WANTSRSRCFORK) && (dp != NULLVP)) { - vnode_t svp = NULLVP; - enum nsoperation nsop; - - if (dp->v_type != VREG) { - error = ENOENT; - goto bad2; - } - switch (cnp->cn_nameiop) { - case DELETE: - if (cnp->cn_flags & CN_ALLOWRSRCFORK) { - nsop = NS_DELETE; - } else { - error = EPERM; - goto bad2; - } - break; - case CREATE: - if (cnp->cn_flags & CN_ALLOWRSRCFORK) { - nsop = NS_CREATE; - } else { - error = EPERM; - goto bad2; - } - break; - case LOOKUP: - /* Make sure our lookup of "/..namedfork/rsrc" is allowed. */ - if (cnp->cn_flags & CN_ALLOWRSRCFORK) { - nsop = NS_OPEN; - } else { - error = EPERM; - goto bad2; - } - break; - default: - error = EPERM; - goto bad2; - } - /* Ask the file system for the resource fork. */ - error = vnode_getnamedstream(dp, &svp, XATTR_RESOURCEFORK_NAME, nsop, 0, ctx); - - /* During a create, it OK for stream vnode to be missing. */ - if (error == ENOATTR || error == ENOENT) { - error = (nsop == NS_CREATE) ? 0 : ENOENT; - } - if (error) { - goto bad2; - } - /* The "parent" of the stream is the file. 
*/ - if (wantparent) { - if (ndp->ni_dvp) { -#ifndef __LP64__ - if (ndp->ni_cnd.cn_flags & FSNODELOCKHELD) { - ndp->ni_cnd.cn_flags &= ~FSNODELOCKHELD; - unlock_fsnode(ndp->ni_dvp, NULL); - } -#endif /* __LP64__ */ - vnode_put(ndp->ni_dvp); - } - ndp->ni_dvp = dp; - } else { - vnode_put(dp); - } - ndp->ni_vp = dp = svp; /* on create this may be null */ + if (keep_going) { + dp = ndp->ni_vp; - /* Restore the truncated pathname buffer (for audits). */ - if (ndp->ni_pathlen == 1 && ndp->ni_next[0] == '\0') { - ndp->ni_next[0] = '/'; + /* namei() will handle symlinks */ + if ((dp->v_type == VLNK) && + ((cnp->cn_flags & FOLLOW) || (ndp->ni_flag & NAMEI_TRAILINGSLASH) || *ndp->ni_next == '/')) { + return 0; } - cnp->cn_flags &= ~MAKEENTRY; - } -#endif - if (kdebug_enable) - kdebug_lookup(dp, cnp); - return (0); -emptyname: - cnp->cn_namelen = 0; - /* - * A degenerate name (e.g. / or "") which is a way of - * talking about a directory, e.g. like "/." or ".". - */ - if (dp->v_type != VDIR) { - error = ENOTDIR; - goto bad; - } - if (cnp->cn_nameiop != LOOKUP) { - error = EISDIR; - goto bad; - } - if (wantparent) { - /* - * note that we already hold a reference - * on dp, but for some reason can't - * get another one... in this case we - * need to do vnode_put on dp in 'bad' + /* + * Otherwise, there's more path to process. + * cache_lookup_path is now responsible for dropping io ref on dp + * when it is called again in the dirloop. This ensures we hold + * a ref on dp until we complete the next round of lookup. 
*/ - if ( (vnode_get(dp)) ) { - error = ENOENT; - goto bad; - } - ndp->ni_dvp = dp; + last_dp = dp; + + goto dirloop; } - cnp->cn_flags &= ~ISDOTDOT; - cnp->cn_flags |= ISLASTCN; - ndp->ni_next = cp; - ndp->ni_vp = dp; - if (cnp->cn_flags & AUDITVNPATH1) - AUDIT_ARG(vnpath, dp, ARG_VNODE1); - else if (cnp->cn_flags & AUDITVNPATH2) - AUDIT_ARG(vnpath, dp, ARG_VNODE2); - if (cnp->cn_flags & SAVESTART) - panic("lookup: SAVESTART"); return (0); - bad2: #ifndef __LP64__ if ((cnp->cn_flags & FSNODELOCKHELD)) { @@ -1070,9 +1105,9 @@ lookup(struct nameidata *ndp) } #endif /* __LP64__ */ if (ndp->ni_dvp) - vnode_put(ndp->ni_dvp); - if (dp) - vnode_put(dp); + vnode_put(ndp->ni_dvp); + + vnode_put(ndp->ni_vp); ndp->ni_vp = NULLVP; if (kdebug_enable) @@ -1095,6 +1130,257 @@ lookup(struct nameidata *ndp) return (error); } +int +lookup_validate_creation_path(struct nameidata *ndp) +{ + struct componentname *cnp = &ndp->ni_cnd; + + /* + * If creating and at end of pathname, then can consider + * allowing file to be created. + */ + if (cnp->cn_flags & RDONLY) { + return EROFS; + } + if ((cnp->cn_flags & ISLASTCN) && (ndp->ni_flag & NAMEI_TRAILINGSLASH) && !(cnp->cn_flags & WILLBEDIR)) { + return ENOENT; + } + + return 0; +} + +/* + * Modifies only ni_vp. Always returns with ni_vp still valid (iocount held). 
+ */ +int +lookup_traverse_mountpoints(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, + int vbusyflags, vfs_context_t ctx) +{ + mount_t mp; + vnode_t tdp; + int error = 0; + uthread_t uth; + uint32_t depth = 0; + int dont_cache_mp = 0; + vnode_t mounted_on_dp; + int current_mount_generation = 0; + + mounted_on_dp = dp; + current_mount_generation = mount_generation; + + while ((dp->v_type == VDIR) && dp->v_mountedhere && + ((cnp->cn_flags & NOCROSSMOUNT) == 0)) { +#if CONFIG_TRIGGERS + /* + * For a trigger vnode, call its resolver when crossing its mount (if requested) + */ + if (dp->v_resolve) { + (void) vnode_trigger_resolve(dp, ndp, ctx); + } +#endif + vnode_lock(dp); + + if ((dp->v_type == VDIR) && (mp = dp->v_mountedhere)) { + + mp->mnt_crossref++; + vnode_unlock(dp); + + + if (vfs_busy(mp, vbusyflags)) { + mount_dropcrossref(mp, dp, 0); + if (vbusyflags == LK_NOWAIT) { + error = ENOENT; + goto out; + } + + continue; + } + + + /* + * XXX - if this is the last component of the + * pathname, and it's either not a lookup operation + * or the NOTRIGGER flag is set for the operation, + * set a uthread flag to let VFS_ROOT() for autofs + * know it shouldn't trigger a mount. 
+ */ + uth = (struct uthread *)get_bsdthread_info(current_thread()); + if ((cnp->cn_flags & ISLASTCN) && + (cnp->cn_nameiop != LOOKUP || + (cnp->cn_flags & NOTRIGGER))) { + uth->uu_notrigger = 1; + dont_cache_mp = 1; + } + + error = VFS_ROOT(mp, &tdp, ctx); + /* XXX - clear the uthread flag */ + uth->uu_notrigger = 0; + + mount_dropcrossref(mp, dp, 0); + vfs_unbusy(mp); + + if (error) { + goto out; + } + + vnode_put(dp); + ndp->ni_vp = dp = tdp; + depth++; + +#if CONFIG_TRIGGERS + /* + * Check if root dir is a trigger vnode + */ + if (dp->v_resolve) { + error = vnode_trigger_resolve(dp, ndp, ctx); + if (error) { + goto out; + } + } +#endif + + } else { + vnode_unlock(dp); + break; + } + } + + if (depth && !dont_cache_mp) { + mp = mounted_on_dp->v_mountedhere; + + if (mp) { + mount_lock_spin(mp); + mp->mnt_realrootvp_vid = dp->v_id; + mp->mnt_realrootvp = dp; + mp->mnt_generation = current_mount_generation; + mount_unlock(mp); + } + } + + return 0; + +out: + return error; +} + +/* + * Takes ni_vp and ni_dvp non-NULL. Returns with *new_dp set to the location + * at which to start a lookup with a resolved path, and all other iocounts dropped. 
+ */ +int +lookup_handle_symlink(struct nameidata *ndp, vnode_t *new_dp, vfs_context_t ctx) +{ + int error; + char *cp; /* pointer into pathname argument */ + uio_t auio; + char uio_buf[ UIO_SIZEOF(1) ]; + int need_newpathbuf; + u_int linklen; + struct componentname *cnp = &ndp->ni_cnd; + vnode_t dp; + char *tmppn; + +#ifndef __LP64__ + if ((cnp->cn_flags & FSNODELOCKHELD)) { + cnp->cn_flags &= ~FSNODELOCKHELD; + unlock_fsnode(ndp->ni_dvp, NULL); + } +#endif /* __LP64__ */ + + if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { + return ELOOP; + } +#if CONFIG_MACF + if ((error = mac_vnode_check_readlink(ctx, ndp->ni_vp)) != 0) + return error; +#endif /* MAC */ + if (ndp->ni_pathlen > 1 || !(cnp->cn_flags & HASBUF)) + need_newpathbuf = 1; + else + need_newpathbuf = 0; + + if (need_newpathbuf) { + MALLOC_ZONE(cp, char *, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (cp == NULL) { + return ENOMEM; + } + } else { + cp = cnp->cn_pnbuf; + } + auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf)); + + uio_addiov(auio, CAST_USER_ADDR_T(cp), MAXPATHLEN); + + error = VNOP_READLINK(ndp->ni_vp, auio, ctx); + if (error) { + if (need_newpathbuf) + FREE_ZONE(cp, MAXPATHLEN, M_NAMEI); + return error; + } + + /* + * Safe to set unsigned with a [larger] signed type here + * because 0 <= uio_resid <= MAXPATHLEN and MAXPATHLEN + * is only 1024. 
+ */ + linklen = MAXPATHLEN - (u_int)uio_resid(auio); + if (linklen + ndp->ni_pathlen > MAXPATHLEN) { + if (need_newpathbuf) + FREE_ZONE(cp, MAXPATHLEN, M_NAMEI); + + return ENAMETOOLONG; + } + if (need_newpathbuf) { + long len = cnp->cn_pnlen; + + tmppn = cnp->cn_pnbuf; + bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen); + cnp->cn_pnbuf = cp; + cnp->cn_pnlen = MAXPATHLEN; + + if ( (cnp->cn_flags & HASBUF) ) + FREE_ZONE(tmppn, len, M_NAMEI); + else + cnp->cn_flags |= HASBUF; + } else + cnp->cn_pnbuf[linklen] = '\0'; + + ndp->ni_pathlen += linklen; + cnp->cn_nameptr = cnp->cn_pnbuf; + + /* + * starting point for 'relative' + * symbolic link path + */ + dp = ndp->ni_dvp; + + /* + * get rid of references returned via 'lookup' + */ + vnode_put(ndp->ni_vp); + vnode_put(ndp->ni_dvp); /* ALWAYS have a dvp for a symlink */ + + ndp->ni_vp = NULLVP; + ndp->ni_dvp = NULLVP; + + /* + * Check if symbolic link restarts us at the root + */ + if (*(cnp->cn_nameptr) == '/') { + while (*(cnp->cn_nameptr) == '/') { + cnp->cn_nameptr++; + ndp->ni_pathlen--; + } + if ((dp = ndp->ni_rootdir) == NULLVP) { + return ENOENT; + } + } + + *new_dp = dp; + + return 0; +} + /* * relookup - lookup a path name component * Used by lookup to re-aquire things. @@ -1205,18 +1491,27 @@ relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp) return (error); } -/* - * Free pathname buffer - */ void -nameidone(struct nameidata *ndp) +namei_unlock_fsnode(struct nameidata *ndp) { #ifndef __LP64__ if ((ndp->ni_cnd.cn_flags & FSNODELOCKHELD)) { ndp->ni_cnd.cn_flags &= ~FSNODELOCKHELD; unlock_fsnode(ndp->ni_dvp, NULL); } +#else + (void)ndp; #endif /* __LP64__ */ +} + +/* + * Free pathname buffer + */ +void +nameidone(struct nameidata *ndp) +{ + namei_unlock_fsnode(ndp); + if (ndp->ni_cnd.cn_flags & HASBUF) { char *tmp = ndp->ni_cnd.cn_pnbuf; @@ -1258,6 +1553,7 @@ nameidone(struct nameidata *ndp) * fails because /foo_bar_baz is not found will only log "/foo_bar_baz", with * no '>' padding. 
But /foo_bar/spam would log "/foo_bar>>>>". */ +#if !defined(NO_KDEBUG) static void kdebug_lookup(struct vnode *dp, struct componentname *cnp) { @@ -1305,7 +1601,34 @@ kdebug_lookup(struct vnode *dp, struct componentname *cnp) KERNEL_DEBUG_CONSTANT(code, dbg_parms[i], dbg_parms[i+1], dbg_parms[i+2], dbg_parms[i+3], 0); } } +#else /* NO_KDEBUG */ +static void +kdebug_lookup(struct vnode *dp __unused, struct componentname *cnp __unused) +{ +} +#endif /* NO_KDEBUG */ + +int +vfs_getbyid(fsid_t *fsid, ino64_t ino, vnode_t *vpp, vfs_context_t ctx) +{ + mount_t mp; + int error; + + mp = mount_lookupby_volfsid(fsid->val[0], 1); + if (mp == NULL) { + return EINVAL; + } + + /* Get the target vnode. */ + if (ino == 2) { + error = VFS_ROOT(mp, vpp, ctx); + } else { + error = VFS_VGET(mp, ino, vpp, ctx); + } + vfs_unbusy(mp); + return error; +} /* * Obtain the real path from a legacy volfs style path. * @@ -1384,3 +1707,59 @@ vfs_getrealpath(const char * path, char * realpath, size_t bufsize, vfs_context_ return (error); } #endif + +void +lookup_compound_vnop_post_hook(int error, vnode_t dvp, vnode_t vp, struct nameidata *ndp, int did_create) +{ + if (error == 0 && vp == NULLVP) { + panic("NULL vp with error == 0.\n"); + } + + /* + * We don't want to do any of this if we didn't use the compound vnop + * to perform the lookup... i.e. if we're allowing and using the legacy pattern, + * where we did a full lookup. + */ + if ((ndp->ni_flag & NAMEI_COMPOUND_OP_MASK) == 0) { + return; + } + + /* + * If we're going to continue the lookup, we'll handle + * all lookup-related updates at that time. + */ + if (error == EKEEPLOOKING) { + return; + } + + /* + * Only audit or update cache for *found* vnodes. For creation + * neither would happen in the non-compound-vnop case. + */ + if ((vp != NULLVP) && !did_create) { + /* + * If MAKEENTRY isn't set, and we've done a successful compound VNOP, + * then we certainly don't want to update cache or identity. 
+ */ + if ((error != 0) || (ndp->ni_cnd.cn_flags & MAKEENTRY)) { + lookup_consider_update_cache(dvp, vp, &ndp->ni_cnd, ndp->ni_ncgeneration); + } + if (ndp->ni_cnd.cn_flags & AUDITVNPATH1) + AUDIT_ARG(vnpath, vp, ARG_VNODE1); + else if (ndp->ni_cnd.cn_flags & AUDITVNPATH2) + AUDIT_ARG(vnpath, vp, ARG_VNODE2); + } + + /* + * If you created (whether you opened or not), cut a lookup tracepoint + * for the parent dir (as would happen without a compound vnop). Note: we may need + * a vnode despite failure in this case! + * + * If you did not create: + * Found child (succeeded or not): cut a tracepoint for the child. + * Did not find child: cut a tracepoint with the parent. + */ + if (kdebug_enable) { + kdebug_lookup(vp ? vp : dvp, &ndp->ni_cnd); + } +} diff --git a/bsd/vfs/vfs_subr.c b/bsd/vfs/vfs_subr.c index 3b10114cb..462fbef79 100644 --- a/bsd/vfs/vfs_subr.c +++ b/bsd/vfs/vfs_subr.c @@ -118,12 +118,17 @@ #include #include +#include #include /* kalloc()/kfree() */ #include /* delay_for_interval() */ #include /* OSAddAtomic() */ +#ifdef JOE_DEBUG +#include +#endif + #include /* vnode_pager_vrele() */ #if CONFIG_MACF @@ -133,6 +138,10 @@ extern lck_grp_t *vnode_lck_grp; extern lck_attr_t *vnode_lck_attr; +#if CONFIG_TRIGGERS +extern lck_grp_t *trigger_vnode_lck_grp; +extern lck_attr_t *trigger_vnode_lck_attr; +#endif extern lck_mtx_t * mnt_list_mtx_lock; @@ -145,6 +154,16 @@ int vttoif_tab[9] = { S_IFSOCK, S_IFIFO, S_IFMT, }; + +/* XXX These should be in a BSD accessible Mach header, but aren't. 
*/ +extern void memory_object_mark_used( + memory_object_control_t control); + +extern void memory_object_mark_unused( + memory_object_control_t control, + boolean_t rage); + + /* XXX next protptype should be from */ extern int nfs_vinvalbuf(vnode_t, int, vfs_context_t, int); @@ -173,7 +192,6 @@ static void vclean(vnode_t vp, int flag); static void vnode_reclaim_internal(vnode_t, int, int, int); static void vnode_dropiocount (vnode_t); -static errno_t vnode_getiocount(vnode_t vp, unsigned int vid, int vflags); static vnode_t checkalias(vnode_t vp, dev_t nvp_rdev); static int vnode_reload(vnode_t); @@ -188,6 +206,9 @@ static int vnode_iterate_prepare(mount_t); static int vnode_iterate_reloadq(mount_t); static void vnode_iterate_clear(mount_t); static mount_t vfs_getvfs_locked(fsid_t *); +static int vn_create_reg(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, + struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx); +static int vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uint32_t *defaulted_fieldsp, vfs_context_t ctx); errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *); @@ -195,6 +216,11 @@ errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *); static void record_vp(vnode_t vp, int count); #endif +#if CONFIG_TRIGGERS +static int vnode_resolver_create(mount_t, vnode_t, struct vnode_trigger_param *, boolean_t external); +static void vnode_resolver_detach(vnode_t); +#endif + TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */ TAILQ_HEAD(deadlst, vnode) vnode_dead_list; /* vnode dead list */ @@ -370,29 +396,27 @@ void vnode_writedone(vnode_t vp) { if (vp) { - OSAddAtomic(-1, &vp->v_numoutput); + int need_wakeup = 0; - if (vp->v_numoutput <= 1) { - int need_wakeup = 0; + OSAddAtomic(-1, &vp->v_numoutput); - vnode_lock_spin(vp); + vnode_lock_spin(vp); - if (vp->v_numoutput < 0) - panic("vnode_writedone: numoutput < 0"); + if (vp->v_numoutput < 0) + 
panic("vnode_writedone: numoutput < 0"); - if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= 1)) { - vp->v_flag &= ~VTHROTTLED; - need_wakeup = 1; - } - if ((vp->v_flag & VBWAIT) && (vp->v_numoutput == 0)) { - vp->v_flag &= ~VBWAIT; - need_wakeup = 1; - } - vnode_unlock(vp); - - if (need_wakeup) - wakeup((caddr_t)&vp->v_numoutput); + if ((vp->v_flag & VTHROTTLED)) { + vp->v_flag &= ~VTHROTTLED; + need_wakeup = 1; + } + if ((vp->v_flag & VBWAIT) && (vp->v_numoutput == 0)) { + vp->v_flag &= ~VBWAIT; + need_wakeup = 1; } + vnode_unlock(vp); + + if (need_wakeup) + wakeup((caddr_t)&vp->v_numoutput); } } @@ -781,6 +805,13 @@ mount_refdrain(mount_t mp) return(0); } +/* Tags the mount point as not supporting extended readdir for NFS exports */ +void +mount_set_noreaddirext(mount_t mp) { + mount_lock (mp); + mp->mnt_kern_flag |= MNTK_DENY_READDIREXT; + mount_unlock (mp); +} /* * Mark a mount point as busy. Used to synchronize access and to delay @@ -887,6 +918,8 @@ vfs_rootmountalloc_internal(struct vfstable *vfsp, const char *devname) mp->mnt_ioflags = 0; mp->mnt_realrootvp = NULLVP; mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL; + mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1; + mp->mnt_devbsdunit = 0; mount_lock_init(mp); (void)vfs_busy(mp, LK_NOWAIT); @@ -1120,6 +1153,8 @@ vfs_getvfs_by_mntonname(char *path) if (!strncmp(mp->mnt_vfsstat.f_mntonname, path, sizeof(mp->mnt_vfsstat.f_mntonname))) { retmp = mp; + if (mount_iterref(retmp, 1)) + retmp = NULL; goto out; } } @@ -1358,6 +1393,7 @@ checkalias(struct vnode *nvp, dev_t nvp_rdev) nvp->v_rdev = nvp_rdev; nvp->v_specflags = 0; nvp->v_speclastr = -1; + nvp->v_specinfo->si_opencount = 0; SPECHASH_LOCK(); @@ -1416,22 +1452,16 @@ int vget_internal(vnode_t vp, int vid, int vflags) { int error = 0; - int vpid; vnode_lock_spin(vp); - if (vflags & VNODE_WITHID) - vpid = vid; - else - vpid = vp->v_id; // save off the original v_id - if ((vflags & VNODE_WRITEABLE) && (vp->v_writecount == 0)) /* * vnode to be returned only 
if it has writers opened */ error = EINVAL; else - error = vnode_getiocount(vp, vpid, vflags); + error = vnode_getiocount(vp, vid, vflags); vnode_unlock(vp); @@ -1446,7 +1476,7 @@ int vnode_ref(vnode_t vp) { - return (vnode_ref_ext(vp, 0)); + return (vnode_ref_ext(vp, 0, 0)); } /* @@ -1454,7 +1484,7 @@ vnode_ref(vnode_t vp) * ENOENT No such file or directory [terminating] */ int -vnode_ref_ext(vnode_t vp, int fmode) +vnode_ref_ext(vnode_t vp, int fmode, int flags) { int error = 0; @@ -1471,10 +1501,12 @@ vnode_ref_ext(vnode_t vp, int fmode) /* * if you are the owner of drain/termination, can acquire usecount */ - if ((vp->v_lflag & (VL_DRAIN | VL_TERMINATE | VL_DEAD))) { - if (vp->v_owner != current_thread()) { - error = ENOENT; - goto out; + if ((flags & VNODE_REF_FORCE) == 0) { + if ((vp->v_lflag & (VL_DRAIN | VL_TERMINATE | VL_DEAD))) { + if (vp->v_owner != current_thread()) { + error = ENOENT; + goto out; + } } } vp->v_usecount++; @@ -1507,6 +1539,13 @@ vnode_ref_ext(vnode_t vp, int fmode) vnode_list_remove(vp); } } + if (vp->v_usecount == 1 && vp->v_type == VREG && !(vp->v_flag & VSYSTEM)) { + + if (vp->v_ubcinfo) { + vnode_lock_convert(vp); + memory_object_mark_used(vp->v_ubcinfo->ui_control); + } + } out: vnode_unlock(vp); @@ -1659,6 +1698,7 @@ vnode_rele_ext(vnode_t vp, int fmode, int dont_reenter) void vnode_rele_internal(vnode_t vp, int fmode, int dont_reenter, int locked) { + if ( !locked) vnode_lock_spin(vp); #if DIAGNOSTIC @@ -1689,9 +1729,7 @@ vnode_rele_internal(vnode_t vp, int fmode, int dont_reenter, int locked) vp->v_lflag |= VL_NEEDINACTIVE; vp->v_flag &= ~(VNOCACHE_DATA | VRAOFF | VOPENEVT); } - if ( !locked) - vnode_unlock(vp); - return; + goto done; } vp->v_flag &= ~(VNOCACHE_DATA | VRAOFF | VOPENEVT); @@ -1709,9 +1747,8 @@ vnode_rele_internal(vnode_t vp, int fmode, int dont_reenter, int locked) vp->v_flag |= VAGE; } vnode_list_add(vp); - if ( !locked) - vnode_unlock(vp); - return; + + goto done; } /* * at this point both the iocount and 
usecount @@ -1746,15 +1783,22 @@ vnode_rele_internal(vnode_t vp, int fmode, int dont_reenter, int locked) if (ut->uu_defer_reclaims) { vp->v_defer_reclaimlist = ut->uu_vreclaims; - ut->uu_vreclaims = vp; - goto defer_reclaim; + ut->uu_vreclaims = vp; + goto done; } vnode_lock_convert(vp); vnode_reclaim_internal(vp, 1, 1, 0); } vnode_dropiocount(vp); vnode_list_add(vp); -defer_reclaim: +done: + if (vp->v_usecount == 0 && vp->v_type == VREG && !(vp->v_flag & VSYSTEM)) { + + if (vp->v_ubcinfo) { + vnode_lock_convert(vp); + memory_object_mark_unused(vp->v_ubcinfo->ui_control, (vp->v_flag & VRAGE) == VRAGE); + } + } if ( !locked) vnode_unlock(vp); return; @@ -2020,13 +2064,13 @@ vclean(vnode_t vp, int flags) #endif { VNOP_FSYNC(vp, MNT_WAIT, ctx); - buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0); + buf_invalidateblks(vp, BUF_WRITE_DATA | BUF_INVALIDATE_LOCKED, 0, 0); } if (UBCINFOEXISTS(vp)) /* * Clean the pages in VM. */ - (void)ubc_sync_range(vp, (off_t)0, ubc_getsize(vp), UBC_PUSHALL); + (void)ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL, UBC_PUSHALL | UBC_INVALIDATE | UBC_SYNC); } if (active || need_inactive) VNOP_INACTIVE(vp, ctx); @@ -2039,18 +2083,7 @@ vclean(vnode_t vp, int flags) if (vnode_isshadow(vp)) { vnode_relenamedstream(pvp, vp, ctx); } - - /* - * Because vclean calls VNOP_INACTIVE prior to calling vnode_relenamedstream, we may not have - * torn down and/or deleted the shadow file yet. On HFS, if the shadow file is sufficiently large - * and occupies a large number of extents, the deletion will be deferred until VNOP_INACTIVE - * and the file treated like an open-unlinked. To rectify this, call VNOP_INACTIVE again - * explicitly to force its removal. - */ - if (vnode_isshadow(vp)) { - VNOP_INACTIVE(vp, ctx); - } - + /* * No more streams associated with the parent. We * have a ref on it, so its identity is stable. 
@@ -2072,6 +2105,14 @@ vclean(vnode_t vp, int flags) */ ubc_destroy_named(vp); +#if CONFIG_TRIGGERS + /* + * cleanup trigger info from vnode (if any) + */ + if (vp->v_resolve) + vnode_resolver_detach(vp); +#endif + /* * Reclaim the vnode. */ @@ -2301,7 +2342,7 @@ vcount(vnode_t vp) loop: if (!vnode_isaliased(vp)) - return (vp->v_usecount - vp->v_kusecount); + return (vp->v_specinfo->si_opencount); count = 0; SPECHASH_LOCK(); @@ -2332,7 +2373,7 @@ vcount(vnode_t vp) vnode_unlock(vq); goto loop; } - count += (vq->v_usecount - vq->v_kusecount); + count += vq->v_specinfo->si_opencount; } vnode_unlock(vq); @@ -2710,7 +2751,7 @@ sysctl_vnode } SYSCTL_PROC(_kern, KERN_VNODE, vnode, - CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MASKED, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, 0, 0, sysctl_vnode, "S,", ""); @@ -2804,6 +2845,8 @@ vnode_pager_vrele(vnode_t vp) #include +u_int32_t rootunit = (u_int32_t)-1; + errno_t vfs_init_io_attributes(vnode_t devvp, mount_t mp) { @@ -2824,24 +2867,25 @@ vfs_init_io_attributes(vnode_t devvp, mount_t mp) vfs_context_t ctx = vfs_context_current(); int isssd = 0; int isvirtual = 0; + + + VNOP_IOCTL(devvp, DKIOCGETTHROTTLEMASK, (caddr_t)&mp->mnt_throttle_mask, 0, NULL); /* - * determine if this mount point exists on the same device as the root - * partition... 
if so, then it comes under the hard throttle control + * as a reasonable approximation, only use the lowest bit of the mask + * to generate a disk unit number */ - int thisunit = -1; - static int rootunit = -1; + mp->mnt_devbsdunit = num_trailing_0(mp->mnt_throttle_mask); - if (rootunit == -1) { - if (VNOP_IOCTL(rootvp, DKIOCGETBSDUNIT, (caddr_t)&rootunit, 0, ctx)) - rootunit = -1; - else if (rootvp == devvp) - mp->mnt_kern_flag |= MNTK_ROOTDEV; - } - if (devvp != rootvp && rootunit != -1) { - if (VNOP_IOCTL(devvp, DKIOCGETBSDUNIT, (caddr_t)&thisunit, 0, ctx) == 0) { - if (thisunit == rootunit) - mp->mnt_kern_flag |= MNTK_ROOTDEV; - } + if (devvp == rootvp) + rootunit = mp->mnt_devbsdunit; + + if (mp->mnt_devbsdunit == rootunit) { + /* + * this mount point exists on the same device as the root + * partition, so it comes under the hard throttle control... + * this is true even for the root mount point itself + */ + mp->mnt_kern_flag |= MNTK_ROOTDEV; } /* * force the spec device to re-cache @@ -2875,7 +2919,6 @@ vfs_init_io_attributes(vnode_t devvp, mount_t mp) if (isssd) mp->mnt_kern_flag |= MNTK_SSD; } - if ((error = VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&features, 0, ctx))) return (error); @@ -3253,7 +3296,11 @@ sysctl_vfs_ctlbyfsid(__unused struct sysctl_oid *oidp, void *arg1, int arg2, sfs.f_fsid = sp->f_fsid; sfs.f_owner = sp->f_owner; - strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN); + if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) { + strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN); + } else { + strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN); + } strlcpy(sfs.f_mntonname, sp->f_mntonname, MNAMELEN); strlcpy(sfs.f_mntfromname, sp->f_mntfromname, MNAMELEN); @@ -3307,7 +3354,11 @@ sysctl_vfs_ctlbyfsid(__unused struct sysctl_oid *oidp, void *arg1, int arg2, sfs.f_fsid = sp->f_fsid; sfs.f_owner = sp->f_owner; - strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN); + if (mp->mnt_kern_flag & 
MNTK_TYPENAME_OVERRIDE) { + strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN); + } else { + strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN); + } strlcpy(sfs.f_mntonname, sp->f_mntonname, MNAMELEN); strlcpy(sfs.f_mntfromname, sp->f_mntfromname, MNAMELEN); @@ -3414,14 +3465,14 @@ sysctl_vfs_noremotehang(__unused struct sysctl_oid *oidp, } /* the vfs.generic. branch. */ -SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RW|CTLFLAG_LOCKED, NULL, "vfs generic hinge"); +SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs generic hinge"); /* retreive a list of mounted filesystem fsid_t */ -SYSCTL_PROC(_vfs_generic, OID_AUTO, vfsidlist, CTLFLAG_RD, +SYSCTL_PROC(_vfs_generic, OID_AUTO, vfsidlist, CTLFLAG_RD | CTLFLAG_LOCKED, NULL, 0, sysctl_vfs_vfslist, "S,fsid", "List of mounted filesystem ids"); /* perform operations on filesystem via fsid_t */ -SYSCTL_NODE(_vfs_generic, OID_AUTO, ctlbyfsid, CTLFLAG_RW|CTLFLAG_LOCKED, +SYSCTL_NODE(_vfs_generic, OID_AUTO, ctlbyfsid, CTLFLAG_RW | CTLFLAG_LOCKED, sysctl_vfs_ctlbyfsid, "ctlbyfsid"); -SYSCTL_PROC(_vfs_generic, OID_AUTO, noremotehang, CTLFLAG_RW|CTLFLAG_ANYBODY, +SYSCTL_PROC(_vfs_generic, OID_AUTO, noremotehang, CTLFLAG_RW | CTLFLAG_ANYBODY, NULL, 0, sysctl_vfs_noremotehang, "I", "noremotehang"); @@ -3448,18 +3499,18 @@ new_vnode(vnode_t *vpp) vnode_list_lock(); - if ( !TAILQ_EMPTY(&vnode_dead_list)) { - /* - * Can always reuse a dead one + if ((numvnodes - deadvnodes) < desiredvnodes || force_alloc) { + if ( !TAILQ_EMPTY(&vnode_dead_list)) { + /* + * Can always reuse a dead one + */ + vp = TAILQ_FIRST(&vnode_dead_list); + goto steal_this_vp; + } + /* + * no dead vnodes available... if we're under + * the limit, we'll create a new vnode */ - vp = TAILQ_FIRST(&vnode_dead_list); - goto steal_this_vp; - } - /* - * no dead vnodes available... 
if we're under - * the limit, we'll create a new vnode - */ - if (numvnodes < desiredvnodes || force_alloc) { numvnodes++; vnode_list_unlock(); @@ -3493,17 +3544,22 @@ new_vnode(vnode_t *vpp) panic("new_vnode: vp (%p) on RAGE list not marked VLIST_RAGE", vp); // if we're a dependency-capable process, skip vnodes that can - // cause recycling deadlocks. (i.e. this process is diskimages - // helper and the vnode is in a disk image). - // - if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL || vp->v_mount->mnt_dependent_process == NULL) { - break; + // cause recycling deadlocks. (i.e. this process is diskimages + // helper and the vnode is in a disk image). Querying the + // mnt_kern_flag for the mount's virtual device status + // is safer than checking the mnt_dependent_process, which + // may not be updated if there are multiple devnode layers + // in between the disk image and the final consumer. + + if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL || + (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == 0) { + break; } // don't iterate more than MAX_WALK_COUNT vnodes to // avoid keeping the vnode list lock held for too long. if (walk_count++ > MAX_WALK_COUNT) { - vp = NULL; + vp = NULL; break; } } @@ -3516,12 +3572,18 @@ new_vnode(vnode_t *vpp) */ walk_count = 0; TAILQ_FOREACH(vp, &vnode_free_list, v_freelist) { - // if we're a dependency-capable process, skip vnodes that can - // cause recycling deadlocks. (i.e. this process is diskimages - // helper and the vnode is in a disk image) - // - if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL || vp->v_mount->mnt_dependent_process == NULL) { - break; + + // if we're a dependency-capable process, skip vnodes that can + // cause recycling deadlocks. (i.e. this process is diskimages + // helper and the vnode is in a disk image). 
Querying the + // mnt_kern_flag for the mount's virtual device status + // is safer than checking the mnt_dependent_process, which + // may not be updated if there are multiple devnode layers + // in between the disk image and the final consumer. + + if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL || + (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == 0) { + break; } // don't iterate more than MAX_WALK_COUNT vnodes to @@ -3572,7 +3634,7 @@ new_vnode(vnode_t *vpp) * Running out of vnodes tends to make a system unusable. Start killing * processes that jetsam knows are killable. */ - if (jetsam_kill_top_proc() < 0) { + if (jetsam_kill_top_proc(TRUE, kJetsamFlagsKilledVnodes) < 0) { /* * If jetsam can't find any more processes to kill and there * still aren't any free vnodes, panic. Hopefully we'll get a @@ -3754,10 +3816,27 @@ vnode_get_locked(struct vnode *vp) return (0); } +/* + * vnode_getwithvid() cuts in line in front of a vnode drain (that is, + * while the vnode is draining, but at no point after that) to prevent + * deadlocks when getting vnodes from filesystem hashes while holding + * resources that may prevent other iocounts from being released. + */ int vnode_getwithvid(vnode_t vp, uint32_t vid) { - return(vget_internal(vp, vid, ( VNODE_NODEAD| VNODE_WITHID))); + return(vget_internal(vp, vid, ( VNODE_NODEAD | VNODE_WITHID | VNODE_DRAINO ))); +} + +/* + * vnode_getwithvid_drainok() is like vnode_getwithvid(), but *does* block behind a vnode + * drain; it exists for use in the VFS name cache, where we really do want to block behind + * vnode drain to prevent holding off an unmount. 
+ */ +int +vnode_getwithvid_drainok(vnode_t vp, uint32_t vid) +{ + return(vget_internal(vp, vid, ( VNODE_NODEAD | VNODE_WITHID ))); } int @@ -3801,7 +3880,7 @@ vnode_put_locked(vnode_t vp) vnode_dropiocount(vp); return(0); } - if ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD | VL_NEEDINACTIVE)) == VL_NEEDINACTIVE) { + if ((vp->v_lflag & (VL_DEAD | VL_NEEDINACTIVE)) == VL_NEEDINACTIVE) { vp->v_lflag &= ~VL_NEEDINACTIVE; vnode_unlock(vp); @@ -3914,7 +3993,7 @@ vnode_drain(vnode_t vp) { if (vp->v_lflag & VL_DRAIN) { - panic("vnode_drain: recursuve drain"); + panic("vnode_drain: recursive drain"); return(ENOENT); } vp->v_lflag |= VL_DRAIN; @@ -3922,13 +4001,16 @@ vnode_drain(vnode_t vp) while (vp->v_iocount > 1) msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_drain", NULL); + + vp->v_lflag &= ~VL_DRAIN; + return(0); } /* * if the number of recent references via vnode_getwithvid or vnode_getwithref - * exceeds this threshhold, than 'UN-AGE' the vnode by removing it from + * exceeds this threshold, than 'UN-AGE' the vnode by removing it from * the LRU list if it's currently on it... once the iocount and usecount both drop * to 0, it will get put back on the end of the list, effectively making it younger * this allows us to keep actively referenced vnodes in the list without having @@ -3937,12 +4019,13 @@ vnode_drain(vnode_t vp) */ #define UNAGE_THRESHHOLD 25 -static errno_t +errno_t vnode_getiocount(vnode_t vp, unsigned int vid, int vflags) { int nodead = vflags & VNODE_NODEAD; int nosusp = vflags & VNODE_NOSUSPEND; int always = vflags & VNODE_ALWAYS; + int beatdrain = vflags & VNODE_DRAINO; for (;;) { /* @@ -3974,6 +4057,18 @@ vnode_getiocount(vnode_t vp, unsigned int vid, int vflags) if (always != 0) break; + + /* + * In some situations, we want to get an iocount + * even if the vnode is draining to prevent deadlock, + * e.g. if we're in the filesystem, potentially holding + * resources that could prevent other iocounts from + * being released. 
+ */ + if (beatdrain && (vp->v_lflag & VL_DRAIN)) { + break; + } + vnode_lock_convert(vp); if (vp->v_lflag & VL_TERMINATE) { @@ -3983,7 +4078,7 @@ vnode_getiocount(vnode_t vp, unsigned int vid, int vflags) } else msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_getiocount", NULL); } - if (vid != vp->v_id) { + if (((vflags & VNODE_WITHID) != 0) && vid != vp->v_id) { return(ENOENT); } if (++vp->v_references >= UNAGE_THRESHHOLD) { @@ -4087,7 +4182,6 @@ vnode_reclaim_internal(struct vnode * vp, int locked, int reuse, int flags) vp->v_socket = NULL; vp->v_lflag &= ~VL_TERMINATE; - vp->v_lflag &= ~VL_DRAIN; vp->v_owner = NULL; KNOTE(&vp->v_knotes, NOTE_REVOKE); @@ -4114,7 +4208,6 @@ vnode_reclaim_internal(struct vnode * vp, int locked, int reuse, int flags) * The following api creates a vnode and associates all the parameter specified in vnode_fsparam * structure and returns a vnode handle with a reference. device aliasing is handled here so checkalias * is obsoleted by this. - * vnode_create(int flavor, size_t size, void * param, vnode_t *vp) */ int vnode_create(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp) @@ -4127,159 +4220,210 @@ vnode_create(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp) struct uthread *ut; struct componentname *cnp; struct vnode_fsparam *param = (struct vnode_fsparam *)data; - - if (flavor == VNCREATE_FLAVOR && (size == VCREATESIZE) && param) { - if ( (error = new_vnode(&vp)) ) { - return(error); - } else { - dvp = param->vnfs_dvp; - cnp = param->vnfs_cnp; +#if CONFIG_TRIGGERS + struct vnode_trigger_param *tinfo = NULL; +#endif + if (param == NULL) + return (EINVAL); - vp->v_op = param->vnfs_vops; - vp->v_type = param->vnfs_vtype; - vp->v_data = param->vnfs_fsnode; +#if CONFIG_TRIGGERS + if ((flavor == VNCREATE_TRIGGER) && (size == VNCREATE_TRIGGER_SIZE)) { + tinfo = (struct vnode_trigger_param *)data; + + /* Validate trigger vnode input */ + if ((param->vnfs_vtype != VDIR) || + (tinfo->vnt_resolve_func == NULL) || + 
(tinfo->vnt_flags & ~VNT_VALID_MASK)) { + return (EINVAL); + } + /* Fall through a normal create (params will be the same) */ + flavor = VNCREATE_FLAVOR; + size = VCREATESIZE; + } +#endif + if ((flavor != VNCREATE_FLAVOR) || (size != VCREATESIZE)) + return (EINVAL); + + if ( (error = new_vnode(&vp)) ) + return(error); - if (param->vnfs_markroot) - vp->v_flag |= VROOT; - if (param->vnfs_marksystem) - vp->v_flag |= VSYSTEM; - if (vp->v_type == VREG) { - error = ubc_info_init_withsize(vp, param->vnfs_filesize); - if (error) { + dvp = param->vnfs_dvp; + cnp = param->vnfs_cnp; + + vp->v_op = param->vnfs_vops; + vp->v_type = param->vnfs_vtype; + vp->v_data = param->vnfs_fsnode; + + if (param->vnfs_markroot) + vp->v_flag |= VROOT; + if (param->vnfs_marksystem) + vp->v_flag |= VSYSTEM; + if (vp->v_type == VREG) { + error = ubc_info_init_withsize(vp, param->vnfs_filesize); + if (error) { #ifdef JOE_DEBUG - record_vp(vp, 1); + record_vp(vp, 1); #endif - vp->v_mount = NULL; - vp->v_op = dead_vnodeop_p; - vp->v_tag = VT_NON; - vp->v_data = NULL; - vp->v_type = VBAD; - vp->v_lflag |= VL_DEAD; - - vnode_put(vp); - return(error); - } - } + vp->v_mount = NULL; + vp->v_op = dead_vnodeop_p; + vp->v_tag = VT_NON; + vp->v_data = NULL; + vp->v_type = VBAD; + vp->v_lflag |= VL_DEAD; + + vnode_put(vp); + return(error); + } + } +#ifdef JOE_DEBUG + record_vp(vp, 1); +#endif + +#if CONFIG_TRIGGERS + /* + * For trigger vnodes, attach trigger info to vnode + */ + if ((vp->v_type == VDIR) && (tinfo != NULL)) { + /* + * Note: has a side effect of incrementing trigger count on the + * mount if successful, which we would need to undo on a + * subsequent failure. 
+ */ +#ifdef JOE_DEBUG + record_vp(vp, -1); +#endif + error = vnode_resolver_create(param->vnfs_mp, vp, tinfo, FALSE); + if (error) { + printf("vnode_create: vnode_resolver_create() err %d\n", error); + vp->v_mount = NULL; + vp->v_op = dead_vnodeop_p; + vp->v_tag = VT_NON; + vp->v_data = NULL; + vp->v_type = VBAD; + vp->v_lflag |= VL_DEAD; #ifdef JOE_DEBUG record_vp(vp, 1); #endif - if (vp->v_type == VCHR || vp->v_type == VBLK) { - - vp->v_tag = VT_DEVFS; /* callers will reset if needed (bdevvp) */ - - if ( (nvp = checkalias(vp, param->vnfs_rdev)) ) { - /* - * if checkalias returns a vnode, it will be locked - * - * first get rid of the unneeded vnode we acquired - */ - vp->v_data = NULL; - vp->v_op = spec_vnodeop_p; - vp->v_type = VBAD; - vp->v_lflag = VL_DEAD; - vp->v_data = NULL; - vp->v_tag = VT_NON; - vnode_put(vp); + vnode_put(vp); + return (error); + } + } +#endif + if (vp->v_type == VCHR || vp->v_type == VBLK) { - /* - * switch to aliased vnode and finish - * preparing it - */ - vp = nvp; - - vclean(vp, 0); - vp->v_op = param->vnfs_vops; - vp->v_type = param->vnfs_vtype; - vp->v_data = param->vnfs_fsnode; - vp->v_lflag = 0; - vp->v_mount = NULL; - insmntque(vp, param->vnfs_mp); - insert = 0; - vnode_unlock(vp); - } - } + vp->v_tag = VT_DEVFS; /* callers will reset if needed (bdevvp) */ - if (vp->v_type == VFIFO) { - struct fifoinfo *fip; + if ( (nvp = checkalias(vp, param->vnfs_rdev)) ) { + /* + * if checkalias returns a vnode, it will be locked + * + * first get rid of the unneeded vnode we acquired + */ + vp->v_data = NULL; + vp->v_op = spec_vnodeop_p; + vp->v_type = VBAD; + vp->v_lflag = VL_DEAD; + vp->v_data = NULL; + vp->v_tag = VT_NON; + vnode_put(vp); - MALLOC(fip, struct fifoinfo *, - sizeof(*fip), M_TEMP, M_WAITOK); - bzero(fip, sizeof(struct fifoinfo )); - vp->v_fifoinfo = fip; - } - /* The file systems must pass the address of the location where - * they store the vnode pointer. 
When we add the vnode into the mount - * list and name cache they become discoverable. So the file system node - * must have the connection to vnode setup by then + /* + * switch to aliased vnode and finish + * preparing it */ - *vpp = vp; + vp = nvp; - /* Add fs named reference. */ - if (param->vnfs_flags & VNFS_ADDFSREF) { - vp->v_lflag |= VNAMED_FSHASH; - } - if (param->vnfs_mp) { - if (param->vnfs_mp->mnt_kern_flag & MNTK_LOCK_LOCAL) - vp->v_flag |= VLOCKLOCAL; - if (insert) { - if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb)) - panic("insmntque: vp on the free list\n"); - /* - * enter in mount vnode list - */ - insmntque(vp, param->vnfs_mp); - } + vclean(vp, 0); + vp->v_op = param->vnfs_vops; + vp->v_type = param->vnfs_vtype; + vp->v_data = param->vnfs_fsnode; + vp->v_lflag = 0; + vp->v_mount = NULL; + insmntque(vp, param->vnfs_mp); + insert = 0; + vnode_unlock(vp); + } + } + + if (vp->v_type == VFIFO) { + struct fifoinfo *fip; + + MALLOC(fip, struct fifoinfo *, + sizeof(*fip), M_TEMP, M_WAITOK); + bzero(fip, sizeof(struct fifoinfo )); + vp->v_fifoinfo = fip; + } + /* The file systems must pass the address of the location where + * they store the vnode pointer. When we add the vnode into the mount + * list and name cache they become discoverable. So the file system node + * must have the connection to vnode setup by then + */ + *vpp = vp; + + /* Add fs named reference. 
*/ + if (param->vnfs_flags & VNFS_ADDFSREF) { + vp->v_lflag |= VNAMED_FSHASH; + } + if (param->vnfs_mp) { + if (param->vnfs_mp->mnt_kern_flag & MNTK_LOCK_LOCAL) + vp->v_flag |= VLOCKLOCAL; + if (insert) { + if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb)) + panic("insmntque: vp on the free list\n"); + + /* + * enter in mount vnode list + */ + insmntque(vp, param->vnfs_mp); + } #ifndef __LP64__ - if ((param->vnfs_mp->mnt_vtable->vfc_vfsflags & VFC_VFSTHREADSAFE) == 0) { - MALLOC_ZONE(vp->v_unsafefs, struct unsafe_fsnode *, - sizeof(struct unsafe_fsnode), M_UNSAFEFS, M_WAITOK); - vp->v_unsafefs->fsnode_count = 0; - vp->v_unsafefs->fsnodeowner = (void *)NULL; - lck_mtx_init(&vp->v_unsafefs->fsnodelock, vnode_lck_grp, vnode_lck_attr); - } + if ((param->vnfs_mp->mnt_vtable->vfc_vfsflags & VFC_VFSTHREADSAFE) == 0) { + MALLOC_ZONE(vp->v_unsafefs, struct unsafe_fsnode *, + sizeof(struct unsafe_fsnode), M_UNSAFEFS, M_WAITOK); + vp->v_unsafefs->fsnode_count = 0; + vp->v_unsafefs->fsnodeowner = (void *)NULL; + lck_mtx_init(&vp->v_unsafefs->fsnodelock, vnode_lck_grp, vnode_lck_attr); + } #endif /* __LP64__ */ - } - if (dvp && vnode_ref(dvp) == 0) { - vp->v_parent = dvp; - } - if (cnp) { - if (dvp && ((param->vnfs_flags & (VNFS_NOCACHE | VNFS_CANTCACHE)) == 0)) { - /* - * enter into name cache - * we've got the info to enter it into the name cache now - * cache_enter_create will pick up an extra reference on - * the name entered into the string cache - */ - vp->v_name = cache_enter_create(dvp, vp, cnp); - } else - vp->v_name = vfs_addname(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, 0); + } + if (dvp && vnode_ref(dvp) == 0) { + vp->v_parent = dvp; + } + if (cnp) { + if (dvp && ((param->vnfs_flags & (VNFS_NOCACHE | VNFS_CANTCACHE)) == 0)) { + /* + * enter into name cache + * we've got the info to enter it into the name cache now + * cache_enter_create will pick up an extra reference on + * the name entered into the string cache + */ + vp->v_name = 
cache_enter_create(dvp, vp, cnp); + } else + vp->v_name = vfs_addname(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, 0); - if ((cnp->cn_flags & UNIONCREATED) == UNIONCREATED) - vp->v_flag |= VISUNION; - } - if ((param->vnfs_flags & VNFS_CANTCACHE) == 0) { - /* - * this vnode is being created as cacheable in the name cache - * this allows us to re-enter it in the cache - */ - vp->v_flag |= VNCACHEABLE; - } - ut = get_bsdthread_info(current_thread()); - - if ((current_proc()->p_lflag & P_LRAGE_VNODES) || - (ut->uu_flag & UT_RAGE_VNODES)) { - /* - * process has indicated that it wants any - * vnodes created on its behalf to be rapidly - * aged to reduce the impact on the cached set - * of vnodes - */ - vp->v_flag |= VRAGE; - } - return(0); - } + if ((cnp->cn_flags & UNIONCREATED) == UNIONCREATED) + vp->v_flag |= VISUNION; + } + if ((param->vnfs_flags & VNFS_CANTCACHE) == 0) { + /* + * this vnode is being created as cacheable in the name cache + * this allows us to re-enter it in the cache + */ + vp->v_flag |= VNCACHEABLE; + } + ut = get_bsdthread_info(current_thread()); + + if ((current_proc()->p_lflag & P_LRAGE_VNODES) || + (ut->uu_flag & UT_RAGE_VNODES)) { + /* + * process has indicated that it wants any + * vnodes created on its behalf to be rapidly + * aged to reduce the impact on the cached set + * of vnodes + */ + vp->v_flag |= VRAGE; } - return (EINVAL); + return (0); } int @@ -4309,13 +4453,14 @@ vnode_removefsref(vnode_t vp) int -vfs_iterate(__unused int flags, int (*callout)(mount_t, void *), void *arg) +vfs_iterate(int flags, int (*callout)(mount_t, void *), void *arg) { mount_t mp; int ret = 0; fsid_t * fsid_list; int count, actualcount, i; void * allocmem; + int indx_start, indx_stop, indx_incr; count = mount_getvfscnt(); count += 10; @@ -4325,7 +4470,21 @@ vfs_iterate(__unused int flags, int (*callout)(mount_t, void *), void *arg) actualcount = mount_fillfsids(fsid_list, count); - for (i=0; i< actualcount; i++) { + /* + * Establish the iteration 
direction + * VFS_ITERATE_TAIL_FIRST overrides default head first order (oldest first) + */ + if (flags & VFS_ITERATE_TAIL_FIRST) { + indx_start = actualcount - 1; + indx_stop = -1; + indx_incr = -1; + } else /* Head first by default */ { + indx_start = 0; + indx_stop = actualcount; + indx_incr = 1; + } + + for (i=indx_start; i != indx_stop; i += indx_incr) { /* obtain the mount point with iteration reference */ mp = mount_list_lookupby_fsid(&fsid_list[i], 0, 1); @@ -4567,7 +4726,8 @@ vnode_lookup(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx) ndflags |= DOWHITEOUT; /* XXX AUDITVNPATH1 needed ? */ - NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx); + NDINIT(&nd, LOOKUP, OP_LOOKUP, ndflags, UIO_SYSSPACE, + CAST_USER_ADDR_T(path), ctx); if ((error = namei(&nd))) return (error); @@ -4603,7 +4763,8 @@ vnode_open(const char *path, int fmode, int cmode, int flags, vnode_t *vpp, vfs_ ndflags |= DOWHITEOUT; /* XXX AUDITVNPATH1 needed ? */ - NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx); + NDINIT(&nd, LOOKUP, OP_OPEN, ndflags, UIO_SYSSPACE, + CAST_USER_ADDR_T(path), ctx); if ((error = vn_open(&nd, fmode, cmode))) *vpp = NULL; @@ -4656,6 +4817,18 @@ vnode_setsize(vnode_t vp, off_t size, int ioflag, vfs_context_t ctx) return(vnode_setattr(vp, &va, ctx)); } +static int +vn_create_reg(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx) +{ + /* Only use compound VNOP for compound operation */ + if (vnode_compound_open_available(dvp) && ((flags & VN_CREATE_DOOPEN) != 0)) { + *vpp = NULLVP; + return VNOP_COMPOUND_OPEN(dvp, vpp, ndp, VNOP_COMPOUND_OPEN_DO_CREATE, fmode, statusp, vap, ctx); + } else { + return VNOP_CREATE(dvp, vpp, &ndp->ni_cnd, vap, ctx); + } +} + /* * Create a filesystem object of arbitrary type with arbitrary attributes in * the spevied directory with the specified name. 
@@ -4698,70 +4871,48 @@ vnode_setsize(vnode_t vp, off_t size, int ioflag, vfs_context_t ctx) * in the code they originated. */ errno_t -vn_create(vnode_t dvp, vnode_t *vpp, struct componentname *cnp, struct vnode_attr *vap, int flags, vfs_context_t ctx) +vn_create(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx) { - kauth_acl_t oacl, nacl; - int initial_acl; - errno_t error; + errno_t error, old_error; vnode_t vp = (vnode_t)0; + boolean_t batched; + struct componentname *cnp; + uint32_t defaulted; + cnp = &ndp->ni_cnd; error = 0; - oacl = nacl = NULL; - initial_acl = 0; + batched = namei_compound_available(dvp, ndp) ? TRUE : FALSE; KAUTH_DEBUG("%p CREATE - '%s'", dvp, cnp->cn_nameptr); + if (flags & VN_CREATE_NOINHERIT) + vap->va_vaflags |= VA_NOINHERIT; + if (flags & VN_CREATE_NOAUTH) + vap->va_vaflags |= VA_NOAUTH; /* - * Handle ACL inheritance. + * Handle ACL inheritance, initialize vap. */ - if (!(flags & VN_CREATE_NOINHERIT) && vfs_extendedsecurity(dvp->v_mount)) { - /* save the original filesec */ - if (VATTR_IS_ACTIVE(vap, va_acl)) { - initial_acl = 1; - oacl = vap->va_acl; - } - - vap->va_acl = NULL; - if ((error = kauth_acl_inherit(dvp, - oacl, - &nacl, - vap->va_type == VDIR, - ctx)) != 0) { - KAUTH_DEBUG("%p CREATE - error %d processing inheritance", dvp, error); - return(error); - } + error = vn_attribute_prepare(dvp, vap, &defaulted, ctx); + if (error) { + return error; + } - /* - * If the generated ACL is NULL, then we can save ourselves some effort - * by clearing the active bit. - */ - if (nacl == NULL) { - VATTR_CLEAR_ACTIVE(vap, va_acl); - } else { - VATTR_SET(vap, va_acl, nacl); - } + if (vap->va_type != VREG && (fmode != 0 || (flags & VN_CREATE_DOOPEN) || statusp)) { + panic("Open parameters, but not a regular file."); } - - /* - * Check and default new attributes. 
- * This will set va_uid, va_gid, va_mode and va_create_time at least, if the caller - * hasn't supplied them. - */ - if ((error = vnode_authattr_new(dvp, vap, flags & VN_CREATE_NOAUTH, ctx)) != 0) { - KAUTH_DEBUG("%p CREATE - error %d handing/defaulting attributes", dvp, error); - goto out; + if ((fmode != 0) && ((flags & VN_CREATE_DOOPEN) == 0)) { + panic("Mode for open, but not trying to open..."); } - /* * Create the requested node. */ switch(vap->va_type) { case VREG: - error = VNOP_CREATE(dvp, vpp, cnp, vap, ctx); + error = vn_create_reg(dvp, vpp, ndp, vap, flags, fmode, statusp, ctx); break; case VDIR: - error = VNOP_MKDIR(dvp, vpp, cnp, vap, ctx); + error = vn_mkdir(dvp, vpp, ndp, vap, ctx); break; case VSOCK: case VFIFO: @@ -4778,6 +4929,8 @@ vn_create(vnode_t dvp, vnode_t *vpp, struct componentname *cnp, struct vnode_att } vp = *vpp; + old_error = error; + #if CONFIG_MACF if (!(flags & VN_CREATE_NOLABEL)) { error = vnode_label(vnode_mount(vp), dvp, vp, cnp, VNODE_LABEL_CREATE, ctx); @@ -4797,24 +4950,22 @@ vn_create(vnode_t dvp, vnode_t *vpp, struct componentname *cnp, struct vnode_att #if CONFIG_MACF error: #endif - if ((error != 0 ) && (vp != (vnode_t)0)) { - *vpp = (vnode_t) 0; - vnode_put(vp); + if ((error != 0) && (vp != (vnode_t)0)) { + + /* If we've done a compound open, close */ + if (batched && (old_error == 0) && (vap->va_type == VREG)) { + VNOP_CLOSE(vp, fmode, ctx); + } + + /* Need to provide notifications if a create succeeded */ + if (!batched) { + *vpp = (vnode_t) 0; + vnode_put(vp); + } } out: - /* - * If the caller supplied a filesec in vap, it has been replaced - * now by the post-inheritance copy. We need to put the original back - * and free the inherited product. 
- */ - if (initial_acl) { - VATTR_SET(vap, va_acl, oacl); - } else { - VATTR_CLEAR_ACTIVE(vap, va_acl); - } - if (nacl != NULL) - kauth_acl_free(nacl); + vn_attribute_cleanup(vap, defaulted); return(error); } @@ -4845,6 +4996,433 @@ vnode_authorize_init(void) vnode_scope = kauth_register_scope(KAUTH_SCOPE_VNODE, vnode_authorize_callback, NULL); } +#define VATTR_PREPARE_DEFAULTED_UID 0x1 +#define VATTR_PREPARE_DEFAULTED_GID 0x2 +#define VATTR_PREPARE_DEFAULTED_MODE 0x4 + +int +vn_attribute_prepare(vnode_t dvp, struct vnode_attr *vap, uint32_t *defaulted_fieldsp, vfs_context_t ctx) +{ + kauth_acl_t nacl = NULL, oacl = NULL; + int error; + + /* + * Handle ACL inheritance. + */ + if (!(vap->va_vaflags & VA_NOINHERIT) && vfs_extendedsecurity(dvp->v_mount)) { + /* save the original filesec */ + if (VATTR_IS_ACTIVE(vap, va_acl)) { + oacl = vap->va_acl; + } + + vap->va_acl = NULL; + if ((error = kauth_acl_inherit(dvp, + oacl, + &nacl, + vap->va_type == VDIR, + ctx)) != 0) { + KAUTH_DEBUG("%p CREATE - error %d processing inheritance", dvp, error); + return(error); + } + + /* + * If the generated ACL is NULL, then we can save ourselves some effort + * by clearing the active bit. + */ + if (nacl == NULL) { + VATTR_CLEAR_ACTIVE(vap, va_acl); + } else { + vap->va_base_acl = oacl; + VATTR_SET(vap, va_acl, nacl); + } + } + + error = vnode_authattr_new_internal(dvp, vap, (vap->va_vaflags & VA_NOAUTH), defaulted_fieldsp, ctx); + if (error) { + vn_attribute_cleanup(vap, *defaulted_fieldsp); + } + + return error; +} + +void +vn_attribute_cleanup(struct vnode_attr *vap, uint32_t defaulted_fields) +{ + /* + * If the caller supplied a filesec in vap, it has been replaced + * now by the post-inheritance copy. We need to put the original back + * and free the inherited product. 
+ */ + kauth_acl_t nacl, oacl; + + if (VATTR_IS_ACTIVE(vap, va_acl)) { + nacl = vap->va_acl; + oacl = vap->va_base_acl; + + if (oacl) { + VATTR_SET(vap, va_acl, oacl); + vap->va_base_acl = NULL; + } else { + VATTR_CLEAR_ACTIVE(vap, va_acl); + } + + if (nacl != NULL) { + kauth_acl_free(nacl); + } + } + + if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_MODE) != 0) { + VATTR_CLEAR_ACTIVE(vap, va_mode); + } + if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_GID) != 0) { + VATTR_CLEAR_ACTIVE(vap, va_gid); + } + if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_UID) != 0) { + VATTR_CLEAR_ACTIVE(vap, va_uid); + } + + return; +} + +int +vn_authorize_unlink(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, __unused void *reserved) +{ + int error = 0; + + /* + * Normally, unlinking of directories is not supported. + * However, some file systems may have limited support. + */ + if ((vp->v_type == VDIR) && + !(vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSDIRLINKS)) { + return (EPERM); /* POSIX */ + } + + /* authorize the delete operation */ +#if CONFIG_MACF + if (!error) + error = mac_vnode_check_unlink(ctx, dvp, vp, cnp); +#endif /* MAC */ + if (!error) + error = vnode_authorize(vp, dvp, KAUTH_VNODE_DELETE, ctx); + + return error; +} + +int +vn_authorize_open_existing(vnode_t vp, struct componentname *cnp, int fmode, vfs_context_t ctx, void *reserved) +{ + /* Open of existing case */ + kauth_action_t action; + int error = 0; + + if (cnp->cn_ndp == NULL) { + panic("NULL ndp"); + } + if (reserved != NULL) { + panic("reserved not NULL."); + } + +#if CONFIG_MACF + /* XXX may do duplicate work here, but ignore that for now (idempotent) */ + if (vfs_flags(vnode_mount(vp)) & MNT_MULTILABEL) { + error = vnode_label(vnode_mount(vp), NULL, vp, NULL, 0, ctx); + if (error) + return (error); + } +#endif + + if ( (fmode & O_DIRECTORY) && vp->v_type != VDIR ) { + return (ENOTDIR); + } + + if (vp->v_type == VSOCK && vp->v_tag != VT_FDESC) { + return (EOPNOTSUPP); /* 
Operation not supported on socket */ + } + + if (vp->v_type == VLNK && (fmode & O_NOFOLLOW) != 0) { + return (ELOOP); /* O_NOFOLLOW was specified and the target is a symbolic link */ + } + + /* disallow write operations on directories */ + if (vnode_isdir(vp) && (fmode & (FWRITE | O_TRUNC))) { + return (EISDIR); + } + + if ((cnp->cn_ndp->ni_flag & NAMEI_TRAILINGSLASH)) { + if (vp->v_type != VDIR) { + return (ENOTDIR); + } + } + +#if CONFIG_MACF + /* If a file being opened is a shadow file containing + * namedstream data, ignore the macf checks because it + * is a kernel internal file and access should always + * be allowed. + */ + if (!(vnode_isshadow(vp) && vnode_isnamedstream(vp))) { + error = mac_vnode_check_open(ctx, vp, fmode); + if (error) { + return (error); + } + } +#endif + + /* compute action to be authorized */ + action = 0; + if (fmode & FREAD) { + action |= KAUTH_VNODE_READ_DATA; + } + if (fmode & (FWRITE | O_TRUNC)) { + /* + * If we are writing, appending, and not truncating, + * indicate that we are appending so that if the + * UF_APPEND or SF_APPEND bits are set, we do not deny + * the open. 
+ */ + if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) { + action |= KAUTH_VNODE_APPEND_DATA; + } else { + action |= KAUTH_VNODE_WRITE_DATA; + } + } + return (vnode_authorize(vp, NULL, action, ctx)); +} + +int +vn_authorize_create(vnode_t dvp, struct componentname *cnp, struct vnode_attr *vap, vfs_context_t ctx, void *reserved) +{ + /* Creation case */ + int error; + + if (cnp->cn_ndp == NULL) { + panic("NULL cn_ndp"); + } + if (reserved != NULL) { + panic("reserved not NULL."); + } + + /* Only validate path for creation if we didn't do a complete lookup */ + if (cnp->cn_ndp->ni_flag & NAMEI_UNFINISHED) { + error = lookup_validate_creation_path(cnp->cn_ndp); + if (error) + return (error); + } + +#if CONFIG_MACF + error = mac_vnode_check_create(ctx, dvp, cnp, vap); + if (error) + return (error); +#endif /* CONFIG_MACF */ + + return (vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)); +} + +int +vn_authorize_rename(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, + struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, + vfs_context_t ctx, void *reserved) +{ + int error = 0; + int moving = 0; + + if (reserved != NULL) { + panic("Passed something other than NULL as reserved field!"); + } + + /* + * Avoid renaming "." and "..". + * + * XXX No need to check for this in the FS. We should always have the leaves + * in VFS in this case. 
+ */ + if (fvp->v_type == VDIR && + ((fdvp == fvp) || + (fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || + ((fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT)) ) { + error = EINVAL; + goto out; + } + + if (tvp == NULLVP && vnode_compound_rename_available(tdvp)) { + error = lookup_validate_creation_path(tcnp->cn_ndp); + if (error) + goto out; + } + + /***** *****/ +#if CONFIG_MACF + error = mac_vnode_check_rename_from(ctx, fdvp, fvp, fcnp); + if (error) + goto out; +#endif + +#if CONFIG_MACF + error = mac_vnode_check_rename_to(ctx, + tdvp, tvp, fdvp == tdvp, tcnp); + if (error) + goto out; +#endif + /***** *****/ + + /***** *****/ + if (tvp != NULL) { + if (fvp->v_type == VDIR && tvp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { + error = EISDIR; + goto out; + } + } + + if (fvp == tdvp) { + error = EINVAL; + goto out; + } + + /* + * The following edge case is caught here: + * (to cannot be a descendent of from) + * + * o fdvp + * / + * / + * o fvp + * \ + * \ + * o tdvp + * / + * / + * o tvp + */ + if (tdvp->v_parent == fvp) { + error = EINVAL; + goto out; + } + /***** *****/ + + /***** *****/ + + error = 0; + if ((tvp != NULL) && vnode_isdir(tvp)) { + if (tvp != fdvp) + moving = 1; + } else if (tdvp != fdvp) { + moving = 1; + } + + + /* + * must have delete rights to remove the old name even in + * the simple case of fdvp == tdvp. + * + * If fvp is a directory, and we are changing it's parent, + * then we also need rights to rewrite its ".." entry as well. + */ + if (vnode_isdir(fvp)) { + if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0) + goto out; + } else { + if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE, ctx)) != 0) + goto out; + } + if (moving) { + /* moving into tdvp or tvp, must have rights to add */ + if ((error = vnode_authorize(((tvp != NULL) && vnode_isdir(tvp)) ? tvp : tdvp, + NULL, + vnode_isdir(fvp) ? 
KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE, + ctx)) != 0) { + goto out; + } + } else { + /* node staying in same directory, must be allowed to add new name */ + if ((error = vnode_authorize(fdvp, NULL, + vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE, ctx)) != 0) + goto out; + } + /* overwriting tvp */ + if ((tvp != NULL) && !vnode_isdir(tvp) && + ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0)) { + goto out; + } + + /***** *****/ + + /* XXX more checks? */ +out: + return error; +} + +int +vn_authorize_mkdir(vnode_t dvp, struct componentname *cnp, struct vnode_attr *vap, vfs_context_t ctx, void *reserved) +{ + int error; + + if (reserved != NULL) { + panic("reserved not NULL in vn_authorize_mkdir()"); + } + + /* XXX A hack for now, to make shadow files work */ + if (cnp->cn_ndp == NULL) { + return 0; + } + + if (vnode_compound_mkdir_available(dvp)) { + error = lookup_validate_creation_path(cnp->cn_ndp); + if (error) + goto out; + } + +#if CONFIG_MACF + error = mac_vnode_check_create(ctx, + dvp, cnp, vap); + if (error) + goto out; +#endif + + /* authorize addition of a directory to the parent */ + if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0) + goto out; + +out: + return error; +} + +int +vn_authorize_rmdir(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, void *reserved) +{ + int error; + + if (reserved != NULL) { + panic("Non-NULL reserved argument to vn_authorize_rmdir()"); + } + + if (vp->v_type != VDIR) { + /* + * rmdir only deals with directories + */ + return ENOTDIR; + } + + if (dvp == vp) { + /* + * No rmdir "." please. + */ + return EINVAL; + } + +#if CONFIG_MACF + error = mac_vnode_check_unlink(ctx, dvp, + vp, cnp); + if (error) + return error; +#endif + + return vnode_authorize(vp, dvp, KAUTH_VNODE_DELETE, ctx); +} + /* * Authorize an operation on a vnode. 
* @@ -5254,8 +5832,11 @@ vnode_authorize_posix(vauth_ctx vcp, int action, int on_dir) } else { error = vauth_file_ingroup(vcp, &ismember, (!group_ok ? EACCES : 0)); } - if (error) - goto out; + if (error) { + if (!group_ok) + ismember = 1; + error = 0; + } if (ismember) { _SETWHERE("group"); if (!group_ok) @@ -5324,8 +5905,6 @@ vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) /* check the ACL on the directory */ delete_child_denied = 0; if (!cached_delete_child && VATTR_IS_NOT(dvap, va_acl, NULL)) { - errno_t posix_error; - eval.ae_requested = KAUTH_VNODE_DELETE_CHILD; eval.ae_acl = &dvap->va_acl->acl_ace[0]; eval.ae_count = dvap->va_acl->acl_entrycount; @@ -5338,9 +5917,11 @@ vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) * have the ACL evaluation answer. Previously, we would * always deny the operation at this point. */ - if ((posix_error = vauth_dir_ingroup(vcp, &ismember, ENOENT)) != 0 && posix_error != ENOENT) - return(posix_error); - if (ismember) + if ((error = vauth_dir_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) + return(error); + if (error == ENOENT) + eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN; + else if (ismember) eval.ae_options |= KAUTH_AEVAL_IN_GROUP; eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS; eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS; @@ -5361,18 +5942,11 @@ vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) case KAUTH_RESULT_DENY: delete_child_denied = 1; break; - case KAUTH_RESULT_ALLOW: - KAUTH_DEBUG("%p ALLOWED - granted by directory ACL", vcp->vp); - return(0); + /* FALLSTHROUGH */ + case KAUTH_RESULT_ALLOW: + KAUTH_DEBUG("%p ALLOWED - granted by directory ACL", vcp->vp); + return(0); case KAUTH_RESULT_DEFER: - /* - * If we don't have a POSIX answer of "yes", and we - * can't get an ACL answer, then we deny it now. 
- */ - if (posix_error == ENOENT) { - delete_child_denied = 1; - break; - } default: /* Effectively the same as !delete_child_denied */ KAUTH_DEBUG("%p DEFERRED - directory ACL", vcp->vp); @@ -5383,8 +5957,6 @@ vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) /* check the ACL on the node */ delete_denied = 0; if (VATTR_IS_NOT(vap, va_acl, NULL)) { - errno_t posix_error; - eval.ae_requested = KAUTH_VNODE_DELETE; eval.ae_acl = &vap->va_acl->acl_ace[0]; eval.ae_count = vap->va_acl->acl_entrycount; @@ -5397,9 +5969,11 @@ vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) * have the ACL evaluation answer. Previously, we would * always deny the operation at this point. */ - if ((posix_error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && posix_error != ENOENT) - return(posix_error); - if (ismember) + if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) + return(error); + if (error == ENOENT) + eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN; + else if (ismember) eval.ae_options |= KAUTH_AEVAL_IN_GROUP; eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS; eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS; @@ -5419,13 +5993,6 @@ vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) KAUTH_DEBUG("%p ALLOWED - granted by file ACL", vcp->vp); return(0); case KAUTH_RESULT_DEFER: - /* - * If we don't have a POSIX answer of "yes", and we - * can't get an ACL answer, then we deny it now. - */ - if (posix_error == ENOENT) { - delete_denied = 1; - } default: /* Effectively the same as !delete_child_denied */ KAUTH_DEBUG("%p DEFERRED%s - by file ACL", vcp->vp, delete_denied ? 
"(DENY)" : ""); @@ -5447,13 +6014,13 @@ vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) */ if (!cached_delete_child && (dvap->va_mode & S_ISTXT) && !vauth_file_owner(vcp) && !vauth_dir_owner(vcp)) { KAUTH_DEBUG("%p DENIED - sticky bit rules (user %d file %d dir %d)", - vcp->vp, cred->cr_uid, vap->va_uid, dvap->va_uid); + vcp->vp, cred->cr_posix.cr_uid, vap->va_uid, dvap->va_uid); return(EACCES); } /* check the directory */ if (!cached_delete_child && (error = vnode_authorize_posix(vcp, VWRITE, 1 /* on_dir */)) != 0) { - KAUTH_DEBUG("%p ALLOWED - granted by posix permisssions", vcp->vp); + KAUTH_DEBUG("%p DENIED - denied by posix permisssions", vcp->vp); return(error); } @@ -5501,8 +6068,6 @@ vnode_authorize_simple(vauth_ctx vcp, kauth_ace_rights_t acl_rights, kauth_ace_r /* if we have an ACL, evaluate it */ if (VATTR_IS_NOT(vap, va_acl, NULL)) { - errno_t posix_error; - eval.ae_requested = acl_rights; eval.ae_acl = &vap->va_acl->acl_ace[0]; eval.ae_count = vap->va_acl->acl_entrycount; @@ -5515,9 +6080,11 @@ vnode_authorize_simple(vauth_ctx vcp, kauth_ace_rights_t acl_rights, kauth_ace_r * have the ACL evaluation answer. Previously, we would * always deny the operation at this point. 
*/ - if ((posix_error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && posix_error != ENOENT) - return(posix_error); - if (ismember) + if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) + return(error); + if (error == ENOENT) + eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN; + else if (ismember) eval.ae_options |= KAUTH_AEVAL_IN_GROUP; eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS; eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS; @@ -5537,14 +6104,6 @@ vnode_authorize_simple(vauth_ctx vcp, kauth_ace_rights_t acl_rights, kauth_ace_r KAUTH_DEBUG("%p ALLOWED - all rights granted by ACL", vcp->vp); return(0); case KAUTH_RESULT_DEFER: - /* - * If we don't have a POSIX answer of "yes", and we - * can't get an ACL answer, then we deny it now. - */ - if (posix_error == ENOENT) { - KAUTH_DEBUG("%p DENIED(DEFERRED) - by ACL", vcp->vp); - return(EACCES); /* deny, deny, counter-allege */ - } default: /* Effectively the same as !delete_child_denied */ KAUTH_DEBUG("%p DEFERRED - directory ACL", vcp->vp); @@ -5866,8 +6425,8 @@ vnode_authorize_callback(kauth_cred_t cred, void *idata, kauth_action_t action, * find the stream and flush its cache. 
*/ if (vnode_isnamedstream(vp) && (!vfs_authopaque(vp->v_mount))) { - cvp = vp->v_parent; - if ((cvp != NULLVP) && (vnode_getwithref(cvp) == 0)) { + cvp = vnode_getparent(vp); + if (cvp != NULLVP) { parent_iocount = 1; } else { cvp = NULL; @@ -5897,8 +6456,10 @@ vnode_authorize_callback(kauth_cred_t cred, void *idata, kauth_action_t action, defer: result = vnode_authorize_callback_int(cred, idata, action, arg0, arg1, arg2, arg3); - if (result == KAUTH_RESULT_ALLOW && cvp != NULLVP) + if (result == KAUTH_RESULT_ALLOW && cvp != NULLVP) { + KAUTH_DEBUG("%p - caching action = %x", cvp, action); vnode_cache_authorized_action(cvp, ctx, action); + } out: if (parent_iocount) { @@ -6068,7 +6629,7 @@ vnode_authorize_callback_int(__unused kauth_cred_t unused_cred, __unused void *i */ if (vnode_isnamedstream(vp) && (vp->v_parent != NULL) && - (vget_internal(vp->v_parent, 0, VNODE_NODEAD) == 0)) { + (vget_internal(vp->v_parent, 0, VNODE_NODEAD | VNODE_DRAINO) == 0)) { parent_ref = TRUE; vcp->vp = vp = vp->v_parent; if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL)) @@ -6175,6 +6736,7 @@ vnode_authorize_callback_int(__unused kauth_cred_t unused_cred, __unused void *i if (VATTR_IS_SUPPORTED(&dva, va_mode) && !(dva.va_mode & (S_ISVTX))) { /* OK to cache delete rights */ + KAUTH_DEBUG("%p - caching DELETE_CHILD rights", dvp); vnode_cache_authorized_action(dvp, ctx, KAUTH_VNODE_DELETE_CHILD); } } @@ -6188,12 +6750,18 @@ vnode_authorize_callback_int(__unused kauth_cred_t unused_cred, __unused void *i return(KAUTH_RESULT_ALLOW); } +int +vnode_authattr_new(vnode_t dvp, struct vnode_attr *vap, int noauth, vfs_context_t ctx) +{ + return vnode_authattr_new_internal(dvp, vap, noauth, NULL, ctx); +} + /* * Check that the attribute information in vattr can be legally applied to * a new file by the context. 
*/ -int -vnode_authattr_new(vnode_t dvp, struct vnode_attr *vap, int noauth, vfs_context_t ctx) +static int +vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uint32_t *defaulted_fieldsp, vfs_context_t ctx) { int error; int has_priv_suser, ismember, defaulted_owner, defaulted_group, defaulted_mode; @@ -6202,6 +6770,11 @@ vnode_authattr_new(vnode_t dvp, struct vnode_attr *vap, int noauth, vfs_context_ mount_t dmp; error = 0; + + if (defaulted_fieldsp) { + *defaulted_fieldsp = 0; + } + defaulted_owner = defaulted_group = defaulted_mode = 0; /* @@ -6384,6 +6957,17 @@ vnode_authattr_new(vnode_t dvp, struct vnode_attr *vap, int noauth, vfs_context_ } } out: + if (defaulted_fieldsp) { + if (defaulted_mode) { + *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_MODE; + } + if (defaulted_group) { + *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_GID; + } + if (defaulted_owner) { + *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_UID; + } + } return(error); } @@ -6481,6 +7065,14 @@ vnode_authattr(vnode_t vp, struct vnode_attr *vap, kauth_action_t *actionp, vfs_ VATTR_WANTED(&ova, va_flags); } + /* + * If ACLs are being changed, we need the old ACLs. + */ + if (VATTR_IS_ACTIVE(vap, va_acl)) { + KAUTH_DEBUG("ATTR - acl changing, fetching old flags"); + VATTR_WANTED(&ova, va_acl); + } + /* * If the size is being set, make sure it's not a directory. 
*/ @@ -6886,7 +7478,7 @@ vnode_authattr(vnode_t vp, struct vnode_attr *vap, kauth_action_t *actionp, vfs_ KAUTH_DEBUG("CHMOD - adding/removing ACL entries"); } else if (vap->va_acl->acl_entrycount > 0) { /* both ACLs have the same ACE count, said count is 1 or more, bitwise compare ACLs */ - if (!memcmp(&vap->va_acl->acl_ace[0], &ova.va_acl->acl_ace[0], + if (memcmp(&vap->va_acl->acl_ace[0], &ova.va_acl->acl_ace[0], sizeof(struct kauth_ace) * vap->va_acl->acl_entrycount)) { required_action |= KAUTH_VNODE_WRITE_SECURITY; KAUTH_DEBUG("CHMOD - changing ACL entries"); @@ -6909,29 +7501,28 @@ vnode_authattr(vnode_t vp, struct vnode_attr *vap, kauth_action_t *actionp, vfs_ return(error); } +static int +setlocklocal_callback(struct vnode *vp, __unused void *cargs) +{ + vnode_lock_spin(vp); + vp->v_flag |= VLOCKLOCAL; + vnode_unlock(vp); + + return (VNODE_RETURNED); +} void vfs_setlocklocal(mount_t mp) { - vnode_t vp; - - mount_lock(mp); + mount_lock_spin(mp); mp->mnt_kern_flag |= MNTK_LOCK_LOCAL; + mount_unlock(mp); /* - * We do not expect anyone to be using any vnodes at the - * time this routine is called. So no need for vnode locking + * The number of active vnodes is expected to be + * very small when vfs_setlocklocal is invoked. 
*/ - TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { - vp->v_flag |= VLOCKLOCAL; - } - TAILQ_FOREACH(vp, &mp->mnt_workerqueue, v_mntvnodes) { - vp->v_flag |= VLOCKLOCAL; - } - TAILQ_FOREACH(vp, &mp->mnt_newvnodes, v_mntvnodes) { - vp->v_flag |= VLOCKLOCAL; - } - mount_unlock(mp); + vnode_iterate(mp, 0, setlocklocal_callback, NULL); } void @@ -6942,6 +7533,14 @@ vfs_setunmountpreflight(mount_t mp) mount_unlock(mp); } +void +vfs_setcompoundopen(mount_t mp) +{ + mount_lock_spin(mp); + mp->mnt_compound_ops |= COMPOUND_VNOP_OPEN; + mount_unlock(mp); +} + void vn_setunionwait(vnode_t vp) { @@ -7146,13 +7745,17 @@ errno_t rmdir_remove_orphaned_appleDouble(vnode_t vp , vfs_context_t ctx, int * !((dp->d_namlen == 1 && dp->d_name[0] == '.') || (dp->d_namlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.')) ) { - - NDINIT(&nd_temp, DELETE, USEDVP, UIO_SYSSPACE, CAST_USER_ADDR_T(dp->d_name), ctx); + + NDINIT(&nd_temp, DELETE, OP_UNLINK, USEDVP, + UIO_SYSSPACE, CAST_USER_ADDR_T(dp->d_name), + ctx); nd_temp.ni_dvp = vp; error = unlink1(ctx, &nd_temp, 0); - if (error && error != ENOENT) { + + if (error && error != ENOENT) { goto outsc; } + } cpos += dp->d_reclen; dp = (struct dirent*)cpos; @@ -7208,21 +7811,645 @@ lock_vnode_and_post(vnode_t vp, int kevent_num) #ifdef JOE_DEBUG static void record_vp(vnode_t vp, int count) { struct uthread *ut; - int i; +#if CONFIG_TRIGGERS + if (vp->v_resolve) + return; +#endif if ((vp->v_flag & VSYSTEM)) return; ut = get_bsdthread_info(current_thread()); ut->uu_iocount += count; - if (ut->uu_vpindex < 32) { - for (i = 0; i < ut->uu_vpindex; i++) { - if (ut->uu_vps[i] == vp) - return; + if (count == 1) { + if (ut->uu_vpindex < 32) { + OSBacktrace((void **)&ut->uu_pcs[ut->uu_vpindex][0], 10); + + ut->uu_vps[ut->uu_vpindex] = vp; + ut->uu_vpindex++; } - ut->uu_vps[ut->uu_vpindex] = vp; - ut->uu_vpindex++; } } #endif + + +#if CONFIG_TRIGGERS + +#define TRIG_DEBUG 0 + +#if TRIG_DEBUG +#define TRIG_LOG(...) 
do { printf("%s: ", __FUNCTION__); printf(__VA_ARGS__); } while (0) +#else +#define TRIG_LOG(...) +#endif + +/* + * Resolver result functions + */ + +resolver_result_t +vfs_resolver_result(uint32_t seq, enum resolver_status stat, int aux) +{ + /* + * |<--- 32 --->|<--- 28 --->|<- 4 ->| + * sequence auxiliary status + */ + return (((uint64_t)seq) << 32) | + (((uint64_t)(aux & 0x0fffffff)) << 4) | + (uint64_t)(stat & 0x0000000F); +} + +enum resolver_status +vfs_resolver_status(resolver_result_t result) +{ + /* lower 4 bits is status */ + return (result & 0x0000000F); +} + +uint32_t +vfs_resolver_sequence(resolver_result_t result) +{ + /* upper 32 bits is sequence */ + return (uint32_t)(result >> 32); +} + +int +vfs_resolver_auxiliary(resolver_result_t result) +{ + /* 28 bits of auxiliary */ + return (int)(((uint32_t)(result & 0xFFFFFFF0)) >> 4); +} + +/* + * SPI + * Call in for resolvers to update vnode trigger state + */ +int +vnode_trigger_update(vnode_t vp, resolver_result_t result) +{ + vnode_resolve_t rp; + uint32_t seq; + enum resolver_status stat; + + if (vp->v_resolve == NULL) { + return (EINVAL); + } + + stat = vfs_resolver_status(result); + seq = vfs_resolver_sequence(result); + + if ((stat != RESOLVER_RESOLVED) && (stat != RESOLVER_UNRESOLVED)) { + return (EINVAL); + } + + rp = vp->v_resolve; + lck_mtx_lock(&rp->vr_lock); + + if (seq > rp->vr_lastseq) { + if (stat == RESOLVER_RESOLVED) + rp->vr_flags |= VNT_RESOLVED; + else + rp->vr_flags &= ~VNT_RESOLVED; + + rp->vr_lastseq = seq; + } + + lck_mtx_unlock(&rp->vr_lock); + + return (0); +} + +static int +vnode_resolver_attach(vnode_t vp, vnode_resolve_t rp, boolean_t ref) +{ + int error; + + vnode_lock_spin(vp); + if (vp->v_resolve != NULL) { + vnode_unlock(vp); + return EINVAL; + } else { + vp->v_resolve = rp; + } + vnode_unlock(vp); + + if (ref) { + error = vnode_ref_ext(vp, O_EVTONLY, VNODE_REF_FORCE); + if (error != 0) { + panic("VNODE_REF_FORCE didn't help..."); + } + } + + return 0; +} + +/* + * VFS 
internal interfaces for vnode triggers + * + * vnode must already have an io count on entry + * v_resolve is stable when io count is non-zero + */ +static int +vnode_resolver_create(mount_t mp, vnode_t vp, struct vnode_trigger_param *tinfo, boolean_t external) +{ + vnode_resolve_t rp; + int result; + char byte; + +#if 1 + /* minimum pointer test (debugging) */ + if (tinfo->vnt_data) + byte = *((char *)tinfo->vnt_data); +#endif + MALLOC(rp, vnode_resolve_t, sizeof(*rp), M_TEMP, M_WAITOK); + if (rp == NULL) + return (ENOMEM); + + lck_mtx_init(&rp->vr_lock, trigger_vnode_lck_grp, trigger_vnode_lck_attr); + + rp->vr_resolve_func = tinfo->vnt_resolve_func; + rp->vr_unresolve_func = tinfo->vnt_unresolve_func; + rp->vr_rearm_func = tinfo->vnt_rearm_func; + rp->vr_reclaim_func = tinfo->vnt_reclaim_func; + rp->vr_data = tinfo->vnt_data; + rp->vr_lastseq = 0; + rp->vr_flags = tinfo->vnt_flags & VNT_VALID_MASK; + if (external) { + rp->vr_flags |= VNT_EXTERNAL; + } + + result = vnode_resolver_attach(vp, rp, external); + if (result != 0) { + goto out; + } + + if (mp) { + OSAddAtomic(1, &mp->mnt_numtriggers); + } + + return (result); + +out: + FREE(rp, M_TEMP); + return result; +} + +static void +vnode_resolver_release(vnode_resolve_t rp) +{ + /* + * Give them a chance to free any private data + */ + if (rp->vr_data && rp->vr_reclaim_func) { + rp->vr_reclaim_func(NULLVP, rp->vr_data); + } + + lck_mtx_destroy(&rp->vr_lock, trigger_vnode_lck_grp); + FREE(rp, M_TEMP); + +} + +/* Called after the vnode has been drained */ +static void +vnode_resolver_detach(vnode_t vp) +{ + vnode_resolve_t rp; + mount_t mp; + + mp = vnode_mount(vp); + + vnode_lock(vp); + rp = vp->v_resolve; + vp->v_resolve = NULL; + vnode_unlock(vp); + + if ((rp->vr_flags & VNT_EXTERNAL) != 0) { + vnode_rele_ext(vp, O_EVTONLY, 1); + } + + vnode_resolver_release(rp); + + /* Keep count of active trigger vnodes per mount */ + OSAddAtomic(-1, &mp->mnt_numtriggers); +} + +/* + * Pathname operations that don't trigger a 
mount for trigger vnodes + */ +static const u_int64_t ignorable_pathops_mask = + 1LL << OP_MOUNT | + 1LL << OP_UNMOUNT | + 1LL << OP_STATFS | + 1LL << OP_ACCESS | + 1LL << OP_GETATTR | + 1LL << OP_LISTXATTR; + +int +vfs_istraditionaltrigger(enum path_operation op, const struct componentname *cnp) +{ + if (cnp->cn_flags & ISLASTCN) + return ((1LL << op) & ignorable_pathops_mask) == 0; + else + return (1); +} + +__private_extern__ +void +vnode_trigger_rearm(vnode_t vp, vfs_context_t ctx) +{ + vnode_resolve_t rp; + resolver_result_t result; + enum resolver_status status; + uint32_t seq; + + if ((vp->v_resolve == NULL) || + (vp->v_resolve->vr_rearm_func == NULL) || + (vp->v_resolve->vr_flags & VNT_AUTO_REARM) == 0) { + return; + } + + rp = vp->v_resolve; + lck_mtx_lock(&rp->vr_lock); + + /* + * Check if VFS initiated this unmount. If so, we'll catch it after the unresolve completes. + */ + if (rp->vr_flags & VNT_VFS_UNMOUNTED) { + lck_mtx_unlock(&rp->vr_lock); + return; + } + + /* Check if this vnode is already armed */ + if ((rp->vr_flags & VNT_RESOLVED) == 0) { + lck_mtx_unlock(&rp->vr_lock); + return; + } + + lck_mtx_unlock(&rp->vr_lock); + + result = rp->vr_rearm_func(vp, 0, rp->vr_data, ctx); + status = vfs_resolver_status(result); + seq = vfs_resolver_sequence(result); + + lck_mtx_lock(&rp->vr_lock); + if (seq > rp->vr_lastseq) { + if (status == RESOLVER_UNRESOLVED) + rp->vr_flags &= ~VNT_RESOLVED; + rp->vr_lastseq = seq; + } + lck_mtx_unlock(&rp->vr_lock); +} + +__private_extern__ +int +vnode_trigger_resolve(vnode_t vp, struct nameidata *ndp, vfs_context_t ctx) +{ + vnode_resolve_t rp; + enum path_operation op; + resolver_result_t result; + enum resolver_status status; + uint32_t seq; + + /* Only trigger on topmost vnodes */ + if ((vp->v_resolve == NULL) || + (vp->v_resolve->vr_resolve_func == NULL) || + (vp->v_mountedhere != NULL)) { + return (0); + } + + rp = vp->v_resolve; + lck_mtx_lock(&rp->vr_lock); + + /* Check if this vnode is already resolved */ + if 
(rp->vr_flags & VNT_RESOLVED) { + lck_mtx_unlock(&rp->vr_lock); + return (0); + } + + lck_mtx_unlock(&rp->vr_lock); + + /* + * XXX + * assumes that resolver will not access this trigger vnode (otherwise the kernel will deadlock) + * is there anyway to know this??? + * there can also be other legitimate lookups in parallel + * + * XXX - should we call this on a separate thread with a timeout? + * + * XXX - should we use ISLASTCN to pick the op value??? Perhaps only leafs should + * get the richer set and non-leafs should get generic OP_LOOKUP? TBD + */ + op = (ndp->ni_op < OP_MAXOP) ? ndp->ni_op: OP_LOOKUP; + + result = rp->vr_resolve_func(vp, &ndp->ni_cnd, op, 0, rp->vr_data, ctx); + status = vfs_resolver_status(result); + seq = vfs_resolver_sequence(result); + + lck_mtx_lock(&rp->vr_lock); + if (seq > rp->vr_lastseq) { + if (status == RESOLVER_RESOLVED) + rp->vr_flags |= VNT_RESOLVED; + rp->vr_lastseq = seq; + } + lck_mtx_unlock(&rp->vr_lock); + + /* On resolver errors, propagate the error back up */ + return (status == RESOLVER_ERROR ? vfs_resolver_auxiliary(result) : 0); +} + +static int +vnode_trigger_unresolve(vnode_t vp, int flags, vfs_context_t ctx) +{ + vnode_resolve_t rp; + resolver_result_t result; + enum resolver_status status; + uint32_t seq; + + if ((vp->v_resolve == NULL) || (vp->v_resolve->vr_unresolve_func == NULL)) { + return (0); + } + + rp = vp->v_resolve; + lck_mtx_lock(&rp->vr_lock); + + /* Check if this vnode is already resolved */ + if ((rp->vr_flags & VNT_RESOLVED) == 0) { + printf("vnode_trigger_unresolve: not currently resolved\n"); + lck_mtx_unlock(&rp->vr_lock); + return (0); + } + + rp->vr_flags |= VNT_VFS_UNMOUNTED; + + lck_mtx_unlock(&rp->vr_lock); + + /* + * XXX + * assumes that resolver will not access this trigger vnode (otherwise the kernel will deadlock) + * there can also be other legitimate lookups in parallel + * + * XXX - should we call this on a separate thread with a timeout? 
+ */ + + result = rp->vr_unresolve_func(vp, flags, rp->vr_data, ctx); + status = vfs_resolver_status(result); + seq = vfs_resolver_sequence(result); + + lck_mtx_lock(&rp->vr_lock); + if (seq > rp->vr_lastseq) { + if (status == RESOLVER_UNRESOLVED) + rp->vr_flags &= ~VNT_RESOLVED; + rp->vr_lastseq = seq; + } + rp->vr_flags &= ~VNT_VFS_UNMOUNTED; + lck_mtx_unlock(&rp->vr_lock); + + /* On resolver errors, propagate the error back up */ + return (status == RESOLVER_ERROR ? vfs_resolver_auxiliary(result) : 0); +} + +static int +triggerisdescendant(mount_t mp, mount_t rmp) +{ + int match = FALSE; + + /* + * walk up vnode covered chain looking for a match + */ + name_cache_lock_shared(); + + while (1) { + vnode_t vp; + + /* did we encounter "/" ? */ + if (mp->mnt_flag & MNT_ROOTFS) + break; + + vp = mp->mnt_vnodecovered; + if (vp == NULLVP) + break; + + mp = vp->v_mount; + if (mp == rmp) { + match = TRUE; + break; + } + } + + name_cache_unlock(); + + return (match); +} + +struct trigger_unmount_info { + vfs_context_t ctx; + mount_t top_mp; + vnode_t trigger_vp; + mount_t trigger_mp; + uint32_t trigger_vid; + int flags; +}; + +static int +trigger_unmount_callback(mount_t mp, void * arg) +{ + struct trigger_unmount_info * infop = (struct trigger_unmount_info *)arg; + boolean_t mountedtrigger = FALSE; + + /* + * When we encounter the top level mount we're done + */ + if (mp == infop->top_mp) + return (VFS_RETURNED_DONE); + + if ((mp->mnt_vnodecovered == NULL) || + (vnode_getwithref(mp->mnt_vnodecovered) != 0)) { + return (VFS_RETURNED); + } + + if ((mp->mnt_vnodecovered->v_mountedhere == mp) && + (mp->mnt_vnodecovered->v_resolve != NULL) && + (mp->mnt_vnodecovered->v_resolve->vr_flags & VNT_RESOLVED)) { + mountedtrigger = TRUE; + } + vnode_put(mp->mnt_vnodecovered); + + /* + * When we encounter a mounted trigger, check if its under the top level mount + */ + if ( !mountedtrigger || !triggerisdescendant(mp, infop->top_mp) ) + return (VFS_RETURNED); + + /* + * Process any 
pending nested mount (now that its not referenced) + */ + if ((infop->trigger_vp != NULLVP) && + (vnode_getwithvid(infop->trigger_vp, infop->trigger_vid) == 0)) { + vnode_t vp = infop->trigger_vp; + int error; + + infop->trigger_vp = NULLVP; + + if (mp == vp->v_mountedhere) { + vnode_put(vp); + printf("trigger_unmount_callback: unexpected match '%s'\n", + mp->mnt_vfsstat.f_mntonname); + return (VFS_RETURNED); + } + if (infop->trigger_mp != vp->v_mountedhere) { + vnode_put(vp); + printf("trigger_unmount_callback: trigger mnt changed! (%p != %p)\n", + infop->trigger_mp, vp->v_mountedhere); + goto savenext; + } + + error = vnode_trigger_unresolve(vp, infop->flags, infop->ctx); + vnode_put(vp); + if (error) { + printf("unresolving: '%s', err %d\n", + vp->v_mountedhere ? vp->v_mountedhere->mnt_vfsstat.f_mntonname : + "???", error); + return (VFS_RETURNED_DONE); /* stop iteration on errors */ + } + } +savenext: + /* + * We can't call resolver here since we hold a mount iter + * ref on mp so save its covered vp for later processing + */ + infop->trigger_vp = mp->mnt_vnodecovered; + if ((infop->trigger_vp != NULLVP) && + (vnode_getwithref(infop->trigger_vp) == 0)) { + if (infop->trigger_vp->v_mountedhere == mp) { + infop->trigger_vid = infop->trigger_vp->v_id; + infop->trigger_mp = mp; + } + vnode_put(infop->trigger_vp); + } + + return (VFS_RETURNED); +} + +/* + * Attempt to unmount any trigger mounts nested underneath a mount. + * This is a best effort attempt and no retries are performed here. 
+ * + * Note: mp->mnt_rwlock is held exclusively on entry (so be carefull) + */ +__private_extern__ +void +vfs_nested_trigger_unmounts(mount_t mp, int flags, vfs_context_t ctx) +{ + struct trigger_unmount_info info; + + /* Must have trigger vnodes */ + if (mp->mnt_numtriggers == 0) { + return; + } + /* Avoid recursive requests (by checking covered vnode) */ + if ((mp->mnt_vnodecovered != NULL) && + (vnode_getwithref(mp->mnt_vnodecovered) == 0)) { + boolean_t recursive = FALSE; + + if ((mp->mnt_vnodecovered->v_mountedhere == mp) && + (mp->mnt_vnodecovered->v_resolve != NULL) && + (mp->mnt_vnodecovered->v_resolve->vr_flags & VNT_VFS_UNMOUNTED)) { + recursive = TRUE; + } + vnode_put(mp->mnt_vnodecovered); + if (recursive) + return; + } + + /* + * Attempt to unmount any nested trigger mounts (best effort) + */ + info.ctx = ctx; + info.top_mp = mp; + info.trigger_vp = NULLVP; + info.trigger_vid = 0; + info.trigger_mp = NULL; + info.flags = flags; + + (void) vfs_iterate(VFS_ITERATE_TAIL_FIRST, trigger_unmount_callback, &info); + + /* + * Process remaining nested mount (now that its not referenced) + */ + if ((info.trigger_vp != NULLVP) && + (vnode_getwithvid(info.trigger_vp, info.trigger_vid) == 0)) { + vnode_t vp = info.trigger_vp; + + if (info.trigger_mp == vp->v_mountedhere) { + (void) vnode_trigger_unresolve(vp, flags, ctx); + } + vnode_put(vp); + } +} + +int +vfs_addtrigger(mount_t mp, const char *relpath, struct vnode_trigger_info *vtip, vfs_context_t ctx) +{ + struct nameidata nd; + int res; + vnode_t rvp, vp; + struct vnode_trigger_param vtp; + + /* + * Must be called for trigger callback, wherein rwlock is held + */ + lck_rw_assert(&mp->mnt_rwlock, LCK_RW_ASSERT_HELD); + + TRIG_LOG("Adding trigger at %s\n", relpath); + TRIG_LOG("Trying VFS_ROOT\n"); + + /* + * We do a lookup starting at the root of the mountpoint, unwilling + * to cross into other mountpoints. 
+ */ + res = VFS_ROOT(mp, &rvp, ctx); + if (res != 0) { + goto out; + } + + TRIG_LOG("Trying namei\n"); + + NDINIT(&nd, LOOKUP, OP_LOOKUP, USEDVP | NOCROSSMOUNT | FOLLOW, UIO_SYSSPACE, + CAST_USER_ADDR_T(relpath), ctx); + nd.ni_dvp = rvp; + res = namei(&nd); + if (res != 0) { + vnode_put(rvp); + goto out; + } + + vp = nd.ni_vp; + nameidone(&nd); + vnode_put(rvp); + + TRIG_LOG("Trying vnode_resolver_create()\n"); + + /* + * Set up blob. vnode_create() takes a larger structure + * with creation info, and we needed something different + * for this case. One needs to win, or we need to munge both; + * vnode_create() wins. + */ + bzero(&vtp, sizeof(vtp)); + vtp.vnt_resolve_func = vtip->vti_resolve_func; + vtp.vnt_unresolve_func = vtip->vti_unresolve_func; + vtp.vnt_rearm_func = vtip->vti_rearm_func; + vtp.vnt_reclaim_func = vtip->vti_reclaim_func; + vtp.vnt_reclaim_func = vtip->vti_reclaim_func; + vtp.vnt_data = vtip->vti_data; + vtp.vnt_flags = vtip->vti_flags; + + res = vnode_resolver_create(mp, vp, &vtp, TRUE); + vnode_put(vp); +out: + TRIG_LOG("Returning %d\n", res); + return res; +} + +#endif /* CONFIG_TRIGGERS */ diff --git a/bsd/vfs/vfs_syscalls.c b/bsd/vfs/vfs_syscalls.c index 02c3c39af..3d9b4591b 100644 --- a/bsd/vfs/vfs_syscalls.c +++ b/bsd/vfs/vfs_syscalls.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1995-2008 Apple Inc. All rights reserved. + * Copyright (c) 1995-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -92,6 +92,7 @@ #include #include #include +#include #include #include #include @@ -101,7 +102,6 @@ #include #include #include -#include #include #include @@ -109,6 +109,7 @@ #include #include #include +#include #include @@ -153,15 +154,21 @@ static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp, static int statfs64_common(struct mount *mp, struct vfsstatfs *sfsp, user_addr_t bufp); static int fsync_common(proc_t p, struct fsync_args *uap, int flags); +static int mount_common(char *fstypename, vnode_t pvp, vnode_t vp, + struct componentname *cnp, user_addr_t fsmountargs, + int flags, uint32_t internal_flags, char *labelstr, boolean_t kernelmount, + vfs_context_t ctx); +void vfs_notify_mount(vnode_t pdvp); + +int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth); #ifdef CONFIG_IMGSRC_ACCESS -static int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname); static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx); static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx); static void undo_place_on_covered_vp(mount_t mp, vnode_t vp); static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags); static void mount_end_update(mount_t mp); -static int relocate_imageboot_source(vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs); +static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index); #endif /* CONFIG_IMGSRC_ACCESS */ int (*union_dircheckp)(struct vnode **, struct fileproc *, vfs_context_t); @@ -220,6 +227,60 @@ extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *); * Virtual File 
System System Calls */ +#if NFSCLIENT +/* + * Private in-kernel mounting spi (NFS only, not exported) + */ + __private_extern__ +boolean_t +vfs_iskernelmount(mount_t mp) +{ + return ((mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE); +} + + __private_extern__ +int +kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path, + void *data, __unused size_t datalen, int syscall_flags, __unused uint32_t kern_flags, vfs_context_t ctx) +{ + struct nameidata nd; + boolean_t did_namei; + int error; + + NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT, + UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx); + + /* + * Get the vnode to be covered if it's not supplied + */ + if (vp == NULLVP) { + error = namei(&nd); + if (error) + return (error); + vp = nd.ni_vp; + pvp = nd.ni_dvp; + did_namei = TRUE; + } else { + char *pnbuf = CAST_DOWN(char *, path); + + nd.ni_cnd.cn_pnbuf = pnbuf; + nd.ni_cnd.cn_pnlen = strlen(pnbuf) + 1; + did_namei = FALSE; + } + + error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data), + syscall_flags, kern_flags, NULL, TRUE, ctx); + + if (did_namei) { + vnode_put(vp); + vnode_put(pvp); + nameidone(&nd); + } + + return (error); +} +#endif /* NFSCLIENT */ + /* * Mount a file system. */ @@ -237,6 +298,13 @@ mount(proc_t p, struct mount_args *uap, __unused int32_t *retval) return (__mac_mount(p, &muap, retval)); } +void +vfs_notify_mount(vnode_t pdvp) +{ + vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL); + lock_vnode_and_post(pdvp, NOTE_WRITE); +} + /* * __mac_mount: * Mount a file system taking into account MAC label behavior. 
@@ -256,10 +324,135 @@ mount(proc_t p, struct mount_args *uap, __unused int32_t *retval) * Returns: 0 Success * !0 Not success */ +boolean_t root_fs_upgrade_try = FALSE; + int __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval) { - struct vnode *vp, *pvp; + vnode_t pvp, vp; + vfs_context_t ctx = vfs_context_current(); + char fstypename[MFSNAMELEN]; + struct nameidata nd; + size_t dummy=0; + char *labelstr = NULL; + int flags = uap->flags; + int error; + boolean_t is_64bit = IS_64BIT_PROCESS(p); + + /* + * Get the fs type name from user space + */ + error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy); + if (error) + return (error); + + /* + * Get the vnode to be covered + */ + NDINIT(&nd, LOOKUP, OP_MOUNT, NOTRIGGER | FOLLOW | AUDITVNPATH1 | WANTPARENT, + UIO_USERSPACE, uap->path, ctx); + error = namei(&nd); + if (error) + return (error); + vp = nd.ni_vp; + pvp = nd.ni_dvp; + +#ifdef CONFIG_IMGSRC_ACCESS + /* Mounting image source cannot be batched with other operations */ + if (flags == MNT_IMGSRC_BY_INDEX) { + error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename, + ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX)); + goto out; + } +#endif /* CONFIG_IMGSRC_ACCESS */ + +#if CONFIG_MACF + /* + * Get the label string (if any) from user space + */ + if (uap->mac_p != USER_ADDR_NULL) { + struct user_mac mac; + size_t ulen = 0; + + if (is_64bit) { + struct user64_mac mac64; + error = copyin(uap->mac_p, &mac64, sizeof(mac64)); + mac.m_buflen = mac64.m_buflen; + mac.m_string = mac64.m_string; + } else { + struct user32_mac mac32; + error = copyin(uap->mac_p, &mac32, sizeof(mac32)); + mac.m_buflen = mac32.m_buflen; + mac.m_string = mac32.m_string; + } + if (error) + goto out; + if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) || + (mac.m_buflen < 2)) { + error = EINVAL; + goto out; + } + MALLOC(labelstr, char *, mac.m_buflen, M_MACTEMP, M_WAITOK); + error = copyinstr(mac.m_string, labelstr, mac.m_buflen, 
&ulen); + if (error) { + goto out; + } + AUDIT_ARG(mac_string, labelstr); + } +#endif /* CONFIG_MACF */ + + AUDIT_ARG(fflags, flags); + + if ((vp->v_flag & VROOT) && + (vp->v_mount->mnt_flag & MNT_ROOTFS)) { + flags |= MNT_UPDATE; + /* + * See 7392553 for more details on why this check exists. + * Suffice to say: If this check is ON and something tries + * to mount the rootFS RW, we'll turn off the codesign + * bitmap optimization. + */ +#if CHECK_CS_VALIDATION_BITMAP + if ( !(flags & MNT_RDONLY) ) { + root_fs_upgrade_try = TRUE; + } +#endif + } + + error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0, + labelstr, FALSE, ctx); +out: +#if CONFIG_MACF + if (labelstr) + FREE(labelstr, M_MACTEMP); +#endif /* CONFIG_MACF */ + + vnode_put(vp); + vnode_put(pvp); + nameidone(&nd); + + return (error); +} + +/* + * common mount implementation (final stage of mounting) + + * Arguments: + * fstypename file system type (i.e. its vfs name) + * pvp parent of covered vnode + * vp covered vnode + * cnp component name (i.e. path) of covered vnode + * flags generic mount flags + * fsmountargs file system specific data + * labelstr optional MAC label + * kernelmount TRUE for mounts initiated from inside the kernel + * ctx caller's context + */ +static int +mount_common(char *fstypename, vnode_t pvp, vnode_t vp, + struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags, + char *labelstr, boolean_t kernelmount, vfs_context_t ctx) +{ struct vnode *devvp = NULLVP; struct vnode *device_vnode = NULLVP; #if CONFIG_MACF @@ -267,57 +460,20 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 #endif struct mount *mp; struct vfstable *vfsp = (struct vfstable *)0; + struct proc *p = vfs_context_proc(ctx); int error, flag = 0; - struct vnode_attr va; - vfs_context_t ctx = vfs_context_current(); - struct nameidata nd; - struct nameidata nd1; - char fstypename[MFSNAMELEN]; - size_t dummy=0; user_addr_t devpath = 
USER_ADDR_NULL; - user_addr_t fsmountargs = uap->data; int ronly = 0; int mntalloc = 0; boolean_t vfsp_ref = FALSE; - mode_t accessmode; - boolean_t is_64bit; boolean_t is_rwlock_locked = FALSE; boolean_t did_rele = FALSE; boolean_t have_usecount = FALSE; - AUDIT_ARG(fflags, uap->flags); - - is_64bit = proc_is64bit(p); - /* - * Get vnode to be covered + * Process an update for an existing mount */ - NDINIT(&nd, LOOKUP, NOTRIGGER | FOLLOW | AUDITVNPATH1 | WANTPARENT, - UIO_USERSPACE, uap->path, ctx); - error = namei(&nd); - if (error) - return (error); - vp = nd.ni_vp; - pvp = nd.ni_dvp; - - if ((vp->v_flag & VROOT) && - (vp->v_mount->mnt_flag & MNT_ROOTFS)) - uap->flags |= MNT_UPDATE; - - error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy); - if (error) - goto out1; - -#ifdef CONFIG_IMGSRC_ACCESS - if (uap->flags == MNT_IMGSRC) { - error = relocate_imageboot_source(vp, &nd.ni_cnd, fstypename, ctx, is_64bit, fsmountargs); - vnode_put(pvp); - vnode_put(vp); - return error; - } -#endif /* CONFIG_IMGSRC_ACCESS */ - - if (uap->flags & MNT_UPDATE) { + if (flags & MNT_UPDATE) { if ((vp->v_flag & VROOT) == 0) { error = EINVAL; goto out1; @@ -338,7 +494,7 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 * We only allow the filesystem to be reloaded if it * is currently mounted read-only. 
*/ - if ((uap->flags & MNT_RELOAD) && + if ((flags & MNT_RELOAD) && ((mp->mnt_flag & MNT_RDONLY) == 0)) { error = ENOTSUP; goto out1; @@ -347,8 +503,7 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 #ifdef CONFIG_IMGSRC_ACCESS /* Can't downgrade the backer of the root FS */ if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) && - (!vfs_isrdonly(mp)) && (uap->flags & MNT_RDONLY)) - { + (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) { error = ENOTSUP; goto out1; } @@ -365,7 +520,6 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 #if CONFIG_MACF error = mac_mount_check_remount(ctx, mp); if (error != 0) { - lck_rw_done(&mp->mnt_rwlock); goto out1; } #endif @@ -373,48 +527,26 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, * and MNT_NOEXEC if mount point is already MNT_NOEXEC. */ - if (suser(vfs_context_ucred(ctx), NULL)) { - uap->flags |= MNT_NOSUID | MNT_NODEV; + if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) { + flags |= MNT_NOSUID | MNT_NODEV; if (mp->mnt_flag & MNT_NOEXEC) - uap->flags |= MNT_NOEXEC; + flags |= MNT_NOEXEC; } flag = mp->mnt_flag; - mp->mnt_flag |= - uap->flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE); + mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE); vfsp = mp->mnt_vtable; goto update; } - /* - * If the user is not root, ensure that they own the directory - * onto which we are attempting to mount. - */ - VATTR_INIT(&va); - VATTR_WANTED(&va, va_uid); - if ((error = vnode_getattr(vp, &va, ctx)) || - (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) && - (error = suser(vfs_context_ucred(ctx), &p->p_acflag)))) { - goto out1; - } /* * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and * MNT_NOEXEC if mount point is already MNT_NOEXEC. 
*/ - if (suser(vfs_context_ucred(ctx), NULL)) { - uap->flags |= MNT_NOSUID | MNT_NODEV; + if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) { + flags |= MNT_NOSUID | MNT_NODEV; if (vp->v_mount->mnt_flag & MNT_NOEXEC) - uap->flags |= MNT_NOEXEC; - } - if ( (error = VNOP_FSYNC(vp, MNT_WAIT, ctx)) ) - goto out1; - - if ( (error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0)) ) - goto out1; - - if (vp->v_type != VDIR) { - error = ENOTDIR; - goto out1; + flags |= MNT_NOEXEC; } /* XXXAUDIT: Should we capture the type on the error path as well? */ @@ -431,22 +563,22 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 error = ENODEV; goto out1; } -#if CONFIG_MACF - error = mac_mount_check_mount(ctx, vp, - &nd.ni_cnd, vfsp->vfc_name); - if (error != 0) + + /* + * VFC_VFSLOCALARGS is not currently supported for kernel mounts + */ + if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS)) { + error = EINVAL; /* unsupported request */ goto out1; -#endif - if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) { - error = EBUSY; + } + + error = prepare_coveredvp(vp, ctx, cnp, fstypename, ((internal_flags & KERNEL_MOUNT_NOAUTH) != 0)); + if (error != 0) { goto out1; } - vnode_lock_spin(vp); - SET(vp->v_flag, VMOUNT); - vnode_unlock(vp); /* - * Allocate and initialize the filesystem. 
+ * Allocate and initialize the filesystem (mount_t) */ MALLOC_ZONE(mp, struct mount *, (u_int32_t)sizeof(struct mount), M_MOUNT, M_WAITOK); @@ -477,35 +609,50 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 //mp->mnt_stat.f_type = vfsp->vfc_typenum; mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; strncpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN); - strncpy(mp->mnt_vfsstat.f_mntonname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN); + strncpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN); mp->mnt_vnodecovered = vp; mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx)); - mp->mnt_devbsdunit = LOWPRI_MAX_NUM_DEV - 1; + mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1; + mp->mnt_devbsdunit = 0; /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */ vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE); - + +#if NFSCLIENT + if (kernelmount) + mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT; + if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) + mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT; +#endif /* NFSCLIENT */ + update: /* * Set the mount level flags. */ - if (uap->flags & MNT_RDONLY) + if (flags & MNT_RDONLY) mp->mnt_flag |= MNT_RDONLY; - else if (mp->mnt_flag & MNT_RDONLY) + else if (mp->mnt_flag & MNT_RDONLY) { + // disallow read/write upgrades of file systems that + // had the TYPENAME_OVERRIDE feature set. 
+ if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) { + error = EPERM; + goto out1; + } mp->mnt_kern_flag |= MNTK_WANTRDWR; - + } mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | - MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE | MNT_AUTOMOUNTED | - MNT_DEFWRITE | MNT_NOATIME | MNT_QUARANTINE | MNT_CPROTECT ); - - mp->mnt_flag |= uap->flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV | - MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | - MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE | MNT_AUTOMOUNTED | - MNT_DEFWRITE | MNT_NOATIME | MNT_QUARANTINE | MNT_CPROTECT ); + MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE | + MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | + MNT_QUARANTINE | MNT_CPROTECT); + mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV | + MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | + MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE | + MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | + MNT_QUARANTINE | MNT_CPROTECT); #if CONFIG_MACF - if (uap->flags & MNT_MULTILABEL) { + if (flags & MNT_MULTILABEL) { if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) { error = EINVAL; goto out1; @@ -513,9 +660,11 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 mp->mnt_flag |= MNT_MULTILABEL; } #endif - + /* + * Process device path for local file systems if requested + */ if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) { - if (is_64bit) { + if (vfs_context_is64bit(ctx)) { if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) ) goto out1; fsmountargs += sizeof(devpath); @@ -528,16 +677,18 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 fsmountargs += sizeof(tmp); } - /* if it is not update and device name needs to be parsed */ + /* Lookup device and authorize access to it */ if ((devpath)) { - NDINIT(&nd1, LOOKUP, FOLLOW, UIO_USERSPACE, devpath, ctx); - if ( (error = namei(&nd1)) ) + struct nameidata nd; + + NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, UIO_USERSPACE, devpath, ctx); + if ( 
(error = namei(&nd)) ) goto out1; - strncpy(mp->mnt_vfsstat.f_mntfromname, nd1.ni_cnd.cn_pnbuf, MAXPATHLEN); - devvp = nd1.ni_vp; + strncpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN); + devvp = nd.ni_vp; - nameidone(&nd1); + nameidone(&nd); if (devvp->v_type != VBLK) { error = ENOTBLK; @@ -552,14 +703,16 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 * permissions on the device. */ if (suser(vfs_context_ucred(ctx), NULL) != 0) { - accessmode = KAUTH_VNODE_READ_DATA; + mode_t accessmode = KAUTH_VNODE_READ_DATA; + if ((mp->mnt_flag & MNT_RDONLY) == 0) accessmode |= KAUTH_VNODE_WRITE_DATA; if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) goto out2; } } - if (devpath && ((uap->flags & MNT_UPDATE) == 0)) { + /* On first mount, preflight and open device */ + if (devpath && ((flags & MNT_UPDATE) == 0)) { if ( (error = vnode_ref(devvp)) ) goto out2; /* @@ -595,114 +748,75 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 mp->mnt_devvp = devvp; device_vnode = devvp; - } else { - if ((mp->mnt_flag & MNT_RDONLY) && (mp->mnt_kern_flag & MNTK_WANTRDWR)) { - dev_t dev; - int maj; - /* - * If upgrade to read-write by non-root, then verify - * that user has necessary permissions on the device. - */ - device_vnode = mp->mnt_devvp; - - if (device_vnode) { - vnode_getalways(device_vnode); - if (suser(vfs_context_ucred(ctx), NULL)) { - if ((error = vnode_authorize(device_vnode, NULL, - KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) { - vnode_put(device_vnode); - goto out2; - } - } + } else if ((mp->mnt_flag & MNT_RDONLY) && + (mp->mnt_kern_flag & MNTK_WANTRDWR) && + (device_vnode = mp->mnt_devvp)) { + dev_t dev; + int maj; + /* + * If upgrade to read-write by non-root, then verify + * that user has necessary permissions on the device. 
+ */ + vnode_getalways(device_vnode); - /* Tell the device that we're upgrading */ - dev = (dev_t)device_vnode->v_rdev; - maj = major(dev); + if (suser(vfs_context_ucred(ctx), NULL) && + (error = vnode_authorize(device_vnode, NULL, + KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, + ctx)) != 0) { + vnode_put(device_vnode); + goto out2; + } - if ((u_int)maj >= (u_int)nblkdev) - panic("Volume mounted on a device with invalid major number.\n"); + /* Tell the device that we're upgrading */ + dev = (dev_t)device_vnode->v_rdev; + maj = major(dev); - error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p); + if ((u_int)maj >= (u_int)nblkdev) + panic("Volume mounted on a device with invalid major number."); - vnode_put(device_vnode); - if (error != 0) { - goto out2; - } - } - } + error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p); + vnode_put(device_vnode); device_vnode = NULLVP; + if (error != 0) { + goto out2; + } } } #if CONFIG_MACF - if ((uap->flags & MNT_UPDATE) == 0) { + if ((flags & MNT_UPDATE) == 0) { mac_mount_label_init(mp); mac_mount_label_associate(ctx, mp); } - if (uap->mac_p != USER_ADDR_NULL) { - struct user_mac mac; - char *labelstr = NULL; - size_t ulen = 0; - - if ((uap->flags & MNT_UPDATE) != 0) { - error = mac_mount_check_label_update( - ctx, mp); + if (labelstr) { + if ((flags & MNT_UPDATE) != 0) { + error = mac_mount_check_label_update(ctx, mp); if (error != 0) goto out3; } - if (is_64bit) { - error = copyin(uap->mac_p, &mac, sizeof(mac)); - } else { - struct mac mac32; - error = copyin(uap->mac_p, &mac32, sizeof(mac32)); - mac.m_buflen = mac32.m_buflen; - mac.m_string = CAST_USER_ADDR_T(mac32.m_string); - } - if (error != 0) - goto out3; - if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) || - (mac.m_buflen < 2)) { - error = EINVAL; - goto out3; - } - MALLOC(labelstr, char *, mac.m_buflen, M_MACTEMP, M_WAITOK); - error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen); - if (error != 0) { - FREE(labelstr, M_MACTEMP); - goto out3; - } - 
AUDIT_ARG(mac_string, labelstr); - error = mac_mount_label_internalize(mp->mnt_mntlabel, labelstr); - FREE(labelstr, M_MACTEMP); - if (error != 0) - goto out3; } #endif - if (device_vnode != NULL) { - VNOP_IOCTL(device_vnode, DKIOCGETBSDUNIT, (caddr_t)&mp->mnt_devbsdunit, 0, NULL); - mp->mnt_devbsdunit %= LOWPRI_MAX_NUM_DEV; - } - /* * Mount the filesystem. */ error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx); - if (uap->flags & MNT_UPDATE) { + if (flags & MNT_UPDATE) { if (mp->mnt_kern_flag & MNTK_WANTRDWR) mp->mnt_flag &= ~MNT_RDONLY; mp->mnt_flag &=~ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE); mp->mnt_kern_flag &=~ MNTK_WANTRDWR; if (error) - mp->mnt_flag = flag; + mp->mnt_flag = flag; /* restore flag value */ vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL); lck_rw_done(&mp->mnt_rwlock); is_rwlock_locked = FALSE; if (!error) enablequotas(mp, ctx); - goto out2; + goto exit; } + /* * Put the new filesystem on the mount list after root. */ @@ -761,11 +875,14 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 */ (void)VFS_START(mp, 0, ctx); - error = mount_list_add(mp); - if (error != 0) { + if (mount_list_add(mp) != 0) { + /* + * The system is shutting down trying to umount + * everything, so fail with a plausible errno. + */ + error = EBUSY; goto out4; } - lck_rw_done(&mp->mnt_rwlock); is_rwlock_locked = FALSE; @@ -818,8 +935,14 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 } /* Now that mount is setup, notify the listeners */ - vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL); + vfs_notify_mount(pvp); } else { + /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. 
*/ + if (mp->mnt_vnodelist.tqh_first != NULL) { + panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.", + mp->mnt_vtable->vfc_name, error); + } + vnode_lock_spin(vp); CLR(vp->v_flag, VMOUNT); vnode_unlock(vp); @@ -833,45 +956,60 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 } lck_rw_done(&mp->mnt_rwlock); is_rwlock_locked = FALSE; + + /* + * if we get here, we have a mount structure that needs to be freed, + * but since the coveredvp hasn't yet been updated to point at it, + * no need to worry about other threads holding a crossref on this mp + * so it's ok to just free it + */ mount_lock_destroy(mp); #if CONFIG_MACF mac_mount_label_destroy(mp); #endif FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT); } - nameidone(&nd); - +exit: /* - * drop I/O count on covered 'vp' and - * on the device vp if there was one + * drop I/O count on the device vp if there was one */ if (devpath && devvp) vnode_put(devvp); - vnode_put(vp); - - /* Note that we've changed something in the parent directory */ - post_event_if_success(pvp, error, NOTE_WRITE); - vnode_put(pvp); return(error); +/* Error condition exits */ out4: (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx); + + /* + * If the mount has been placed on the covered vp, + * it may have been discovered by now, so we have + * to treat this just like an unmount + */ + mount_lock_spin(mp); + mp->mnt_lflag |= MNT_LDEAD; + mount_unlock(mp); + if (device_vnode != NULLVP) { vnode_rele(device_vnode); VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? 
FREAD : FREAD|FWRITE, ctx); did_rele = TRUE; } + vnode_lock_spin(vp); + + mp->mnt_crossref++; vp->v_mountedhere = (mount_t) 0; + vnode_unlock(vp); - + if (have_usecount) { vnode_rele(vp); } out3: - if (devpath && ((uap->flags & MNT_UPDATE) == 0) && (!did_rele)) + if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) vnode_rele(devvp); out2: if (devpath && devvp) @@ -881,47 +1019,50 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 if (is_rwlock_locked == TRUE) { lck_rw_done(&mp->mnt_rwlock); } + if (mntalloc) { + if (mp->mnt_crossref) + mount_dropcrossref(mp, vp, 0); + else { + mount_lock_destroy(mp); #if CONFIG_MACF - mac_mount_label_destroy(mp); + mac_mount_label_destroy(mp); #endif - FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT); + FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT); + } } - if (vfsp_ref) { mount_list_lock(); vfsp->vfc_refcount--; mount_list_unlock(); } - vnode_put(vp); - vnode_put(pvp); - nameidone(&nd); return(error); } -#ifdef CONFIG_IMGSRC_ACCESS /* * Flush in-core data, check for competing mount attempts, * and set VMOUNT */ -static int -prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname) +int +prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth) { struct vnode_attr va; int error; - /* - * If the user is not root, ensure that they own the directory - * onto which we are attempting to mount. - */ - VATTR_INIT(&va); - VATTR_WANTED(&va, va_uid); - if ((error = vnode_getattr(vp, &va, ctx)) || - (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) && - (!vfs_context_issuser(ctx)))) { - error = EPERM; - goto out; + if (!skip_auth) { + /* + * If the user is not root, ensure that they own the directory + * onto which we are attempting to mount. 
+ */ + VATTR_INIT(&va); + VATTR_WANTED(&va, va_uid); + if ((error = vnode_getattr(vp, &va, ctx)) || + (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) && + (!vfs_context_issuser(ctx)))) { + error = EPERM; + goto out; + } } if ( (error = VNOP_FSYNC(vp, MNT_WAIT, ctx)) ) @@ -955,30 +1096,57 @@ prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, cons return error; } +#if CONFIG_IMGSRC_ACCESS + +#if DEBUG +#define IMGSRC_DEBUG(args...) printf(args) +#else +#define IMGSRC_DEBUG(args...) do { } while(0) +#endif + static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx) { struct nameidata nd; - vnode_t vp; + vnode_t vp, realdevvp; mode_t accessmode; int error; - NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, devpath, ctx); - if ( (error = namei(&nd)) ) + NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, UIO_USERSPACE, devpath, ctx); + if ( (error = namei(&nd)) ) { + IMGSRC_DEBUG("namei() failed with %d\n", error); return error; + } - strncpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN); vp = nd.ni_vp; - nameidone(&nd); - if (vp->v_type != VBLK) { + if (!vnode_isblk(vp)) { + IMGSRC_DEBUG("Not block device.\n"); error = ENOTBLK; goto out; } - if (major(vp->v_rdev) >= nblkdev) { + + realdevvp = mp->mnt_devvp; + if (realdevvp == NULLVP) { + IMGSRC_DEBUG("No device backs the mount.\n"); error = ENXIO; goto out; } + + error = vnode_getwithref(realdevvp); + if (error != 0) { + IMGSRC_DEBUG("Coudn't get iocount on device.\n"); + goto out; + } + + if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) { + IMGSRC_DEBUG("Wrong dev_t.\n"); + error = ENXIO; + goto out1; + } + + strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN); + /* * If mount by non-root, then verify that user has necessary * permissions on the device. 
@@ -987,12 +1155,18 @@ authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_ accessmode = KAUTH_VNODE_READ_DATA; if ((mp->mnt_flag & MNT_RDONLY) == 0) accessmode |= KAUTH_VNODE_WRITE_DATA; - if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) - goto out; + if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) { + IMGSRC_DEBUG("Access denied.\n"); + goto out1; + } } *devvpp = vp; + +out1: + vnode_put(realdevvp); out: + nameidone(&nd); if (error) { vnode_put(vp); } @@ -1113,20 +1287,41 @@ mount_end_update(mount_t mp) } static int -relocate_imageboot_source(vnode_t vp, struct componentname *cnp, +get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp) +{ + vnode_t vp; + + if (height >= MAX_IMAGEBOOT_NESTING) { + return EINVAL; + } + + vp = imgsrc_rootvnodes[height]; + if ((vp != NULLVP) && (vnode_get(vp) == 0)) { + *rvpp = vp; + return 0; + } else { + return ENOENT; + } +} + +static int +relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, - boolean_t is64bit, user_addr_t fsmountargs) + boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index) { int error; mount_t mp; boolean_t placed = FALSE; - vnode_t devvp; + vnode_t devvp = NULLVP; struct vfstable *vfsp; user_addr_t devpath; char *old_mntonname; + vnode_t rvp; + uint32_t height; + uint32_t flags; /* If we didn't imageboot, nothing to move */ - if (imgsrc_rootvnode == NULLVP) { + if (imgsrc_rootvnodes[0] == NULLVP) { return EINVAL; } @@ -1135,23 +1330,84 @@ relocate_imageboot_source(vnode_t vp, struct componentname *cnp, return EPERM; } - error = vnode_get(imgsrc_rootvnode); + IMGSRC_DEBUG("looking for root vnode.\n"); + + /* + * Get root vnode of filesystem we're moving. 
+ */ + if (by_index) { + if (is64bit) { + struct user64_mnt_imgsrc_args mia64; + error = copyin(fsmountargs, &mia64, sizeof(mia64)); + if (error != 0) { + IMGSRC_DEBUG("Failed to copy in arguments.\n"); + return error; + } + + height = mia64.mi_height; + flags = mia64.mi_flags; + devpath = mia64.mi_devpath; + } else { + struct user32_mnt_imgsrc_args mia32; + error = copyin(fsmountargs, &mia32, sizeof(mia32)); + if (error != 0) { + IMGSRC_DEBUG("Failed to copy in arguments.\n"); + return error; + } + + height = mia32.mi_height; + flags = mia32.mi_flags; + devpath = mia32.mi_devpath; + } + } else { + /* + * For binary compatibility--assumes one level of nesting. + */ + if (is64bit) { + if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) ) + return error; + } else { + user32_addr_t tmp; + if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) ) + return error; + + /* munge into LP64 addr */ + devpath = CAST_USER_ADDR_T(tmp); + } + + height = 0; + flags = 0; + } + + if (flags != 0) { + IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__); + return EINVAL; + } + + error = get_imgsrc_rootvnode(height, &rvp); if (error != 0) { + IMGSRC_DEBUG("getting root vnode failed with %d\n", error); return error; } + IMGSRC_DEBUG("got root vnode.\n"); + MALLOC(old_mntonname, char*, MAXPATHLEN, M_TEMP, M_WAITOK); /* Can only move once */ - mp = vnode_mount(imgsrc_rootvnode); + mp = vnode_mount(rvp); if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) { + IMGSRC_DEBUG("Already moved.\n"); error = EBUSY; goto out0; } + IMGSRC_DEBUG("Starting updated.\n"); + /* Get exclusive rwlock on mount, authorize update on mp */ error = mount_begin_update(mp , ctx, 0); if (error != 0) { + IMGSRC_DEBUG("Starting updated failed with %d\n", error); goto out0; } @@ -1160,40 +1416,38 @@ relocate_imageboot_source(vnode_t vp, struct componentname *cnp, * so we're now safe to proceed. 
*/ if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) { + IMGSRC_DEBUG("Already moved [2]\n"); goto out1; } + + + IMGSRC_DEBUG("Preparing coveredvp.\n"); /* Mark covered vnode as mount in progress, authorize placing mount on top */ - error = prepare_coveredvp(vp, ctx, cnp, fsname); + error = prepare_coveredvp(vp, ctx, cnp, fsname, FALSE); if (error != 0) { + IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error); goto out1; } + IMGSRC_DEBUG("Covered vp OK.\n"); + /* Sanity check the name caller has provided */ vfsp = mp->mnt_vtable; if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) { + IMGSRC_DEBUG("Wrong fs name.\n"); error = EINVAL; goto out2; } /* Check the device vnode and update mount-from name, for local filesystems */ if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) { - if (is64bit) { - if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) ) - goto out2; - fsmountargs += sizeof(devpath); - } else { - user32_addr_t tmp; - if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) ) - goto out2; - /* munge into LP64 addr */ - devpath = CAST_USER_ADDR_T(tmp); - fsmountargs += sizeof(tmp); - } + IMGSRC_DEBUG("Local, doing device validation.\n"); if (devpath != USER_ADDR_NULL) { error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx); if (error) { + IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n"); goto out2; } @@ -1205,6 +1459,8 @@ relocate_imageboot_source(vnode_t vp, struct componentname *cnp, * Place mp on top of vnode, ref the vnode, call checkdirs(), * and increment the name cache's mount generation */ + + IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n"); error = place_mount_and_checkdirs(mp, vp, ctx); if (error != 0) { goto out2; @@ -1221,15 +1477,21 @@ relocate_imageboot_source(vnode_t vp, struct componentname *cnp, mount_unlock(mp); /* Finally, add to mount list, completely ready to go */ - error = mount_list_add(mp); - if (error != 0) { + if (mount_list_add(mp) != 0) { 
+ /* + * The system is shutting down trying to umount + * everything, so fail with a plausible errno. + */ + error = EBUSY; goto out3; } mount_end_update(mp); - vnode_put(imgsrc_rootvnode); + vnode_put(rvp); FREE(old_mntonname, M_TEMP); + vfs_notify_mount(pvp); + return 0; out3: strncpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN); @@ -1255,7 +1517,7 @@ relocate_imageboot_source(vnode_t vp, struct componentname *cnp, mount_end_update(mp); out0: - vnode_put(imgsrc_rootvnode); + vnode_put(rvp); FREE(old_mntonname, M_TEMP); return error; } @@ -1282,7 +1544,8 @@ enablequotas(struct mount *mp, vfs_context_t ctx) */ for (type=0; type < MAXQUOTAS; type++) { snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]); - NDINIT(&qnd, LOOKUP, FOLLOW, UIO_SYSSPACE, CAST_USER_ADDR_T(qfpath), ctx); + NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE, + CAST_USER_ADDR_T(qfpath), ctx); if (namei(&qnd) != 0) continue; /* option file to trigger quotas is not present */ vnode_put(qnd.ni_vp); @@ -1410,7 +1673,7 @@ unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval) struct nameidata nd; vfs_context_t ctx = vfs_context_current(); - NDINIT(&nd, LOOKUP, NOTRIGGER | FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_UNMOUNT, NOTRIGGER | FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) @@ -1466,13 +1729,18 @@ safedounmount(struct mount *mp, int flags, vfs_context_t ctx) proc_t p = vfs_context_proc(ctx); /* - * Only root, or the user that did the original mount is - * permitted to unmount this filesystem. + * Skip authorization if the mount is tagged as permissive and + * this is not a forced-unmount attempt. 
*/ - if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) && - (error = suser(kauth_cred_get(), &p->p_acflag))) - goto out; - + if (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0))) { + /* + * Only root, or the user that did the original mount is + * permitted to unmount this filesystem. + */ + if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) && + (error = suser(kauth_cred_get(), &p->p_acflag))) + goto out; + } /* * Don't allow unmounting the root file system. */ @@ -1507,9 +1775,13 @@ dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx) int forcedunmount = 0; int lflags = 0; struct vnode *devvp = NULLVP; +#if CONFIG_TRIGGERS + int did_vflush = 0; +#endif /* CONFIG_TRIGGERS */ if (flags & MNT_FORCE) forcedunmount = 1; + mount_lock(mp); /* XXX post jaguar fix LK_DRAIN - then clean this up */ if ((flags & MNT_FORCE)) { @@ -1572,7 +1844,11 @@ dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx) } } } - + +#if CONFIG_TRIGGERS + vfs_nested_trigger_unmounts(mp, flags, ctx); + did_vflush = 1; +#endif if (forcedunmount) lflags |= FORCECLOSE; error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags); @@ -1614,14 +1890,17 @@ dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx) lck_rw_done(&mp->mnt_rwlock); mount_list_remove(mp); lck_rw_lock_exclusive(&mp->mnt_rwlock); - + /* mark the mount point hook in the vp but not drop the ref yet */ if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) { - vnode_getwithref(coveredvp); - vnode_lock_spin(coveredvp); - coveredvp->v_mountedhere = (struct mount *)0; - vnode_unlock(coveredvp); - vnode_put(coveredvp); + vnode_getwithref(coveredvp); + vnode_lock_spin(coveredvp); + + mp->mnt_crossref++; + coveredvp->v_mountedhere = (struct mount *)0; + + vnode_unlock(coveredvp); + vnode_put(coveredvp); } mount_list_lock(); @@ -1650,11 +1929,33 @@ dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx) 
mp->mnt_lflag &= ~MNT_LWAIT; needwakeup = 1; } + + +#if CONFIG_TRIGGERS + /* + * Callback and context are set together under the mount lock, and + * never cleared, so we're safe to examine them here, drop the lock, + * and call out. + */ + if (mp->mnt_triggercallback != NULL) { + mount_unlock(mp); + if (error == 0) { + mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx); + } else if (did_vflush) { + mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx); + } + } else { + mount_unlock(mp); + } +#else mount_unlock(mp); +#endif /* CONFIG_TRIGGERS */ + lck_rw_done(&mp->mnt_rwlock); if (needwakeup) wakeup((caddr_t)mp); + if (!error) { if ((coveredvp != NULLVP)) { vnode_t pvp; @@ -1662,18 +1963,12 @@ dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx) vnode_getwithref(coveredvp); pvp = vnode_getparent(coveredvp); vnode_rele(coveredvp); - vnode_lock_spin(coveredvp); - if(mp->mnt_crossref == 0) { - vnode_unlock(coveredvp); - mount_lock_destroy(mp); -#if CONFIG_MACF - mac_mount_label_destroy(mp); -#endif - FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT); - } else { - coveredvp->v_lflag |= VL_MOUNTDEAD; - vnode_unlock(coveredvp); - } + + mount_dropcrossref(mp, coveredvp, 0); +#if CONFIG_TRIGGERS + if (coveredvp->v_resolve) + vnode_trigger_rearm(coveredvp, ctx); +#endif vnode_put(coveredvp); if (pvp) { @@ -1695,25 +1990,28 @@ dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx) void mount_dropcrossref(mount_t mp, vnode_t dp, int need_put) { - vnode_lock(dp); - mp->mnt_crossref--; - if (mp->mnt_crossref < 0) - panic("mount cross refs -ve"); - if (((dp->v_lflag & VL_MOUNTDEAD) == VL_MOUNTDEAD) && (mp->mnt_crossref == 0)) { - dp->v_lflag &= ~VL_MOUNTDEAD; - if (need_put) - vnode_put_locked(dp); - vnode_unlock(dp); - mount_lock_destroy(mp); -#if CONFIG_MACF - mac_mount_label_destroy(mp); -#endif - FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT); - return; - } + vnode_lock(dp); + mp->mnt_crossref--; + + 
if (mp->mnt_crossref < 0) + panic("mount cross refs -ve"); + + if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) { + if (need_put) - vnode_put_locked(dp); + vnode_put_locked(dp); vnode_unlock(dp); + + mount_lock_destroy(mp); +#if CONFIG_MACF + mac_mount_label_destroy(mp); +#endif + FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT); + return; + } + if (need_put) + vnode_put_locked(dp); + vnode_unlock(dp); } @@ -1806,8 +2104,8 @@ quotactl_funneled(proc_t p, struct quotactl_args *uap, __unused int32_t *retval) AUDIT_ARG(uid, uap->uid); AUDIT_ARG(cmd, uap->cmd); - NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1, - UIO_USERSPACE, uap->path, ctx); + NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, + uap->path, ctx); error = namei(&nd); if (error) return (error); @@ -1914,7 +2212,7 @@ statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval) vfs_context_t ctx = vfs_context_current(); vnode_t vp; - NDINIT(&nd, LOOKUP, NOTRIGGER | FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_STATFS, NOTRIGGER | FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) @@ -2000,7 +2298,11 @@ statfs64_common(struct mount *mp, struct vfsstatfs *sfsp, user_addr_t bufp) sfs.f_type = mp->mnt_vtable->vfc_typenum; sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK; sfs.f_fssubtype = sfsp->f_fssubtype; - strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSTYPENAMELEN); + if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) { + strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN); + } else { + strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSTYPENAMELEN); + } strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MAXPATHLEN); strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MAXPATHLEN); @@ -2022,7 +2324,7 @@ statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *r vfs_context_t ctxp = vfs_context_current(); vnode_t vp; - NDINIT(&nd, LOOKUP, NOTRIGGER | FOLLOW | 
AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_STATFS, NOTRIGGER | FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctxp); error = namei(&nd); if (error) @@ -2068,7 +2370,7 @@ fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t mp = vp->v_mount; if (!mp) { - error = EBADF; + error = EBADF;; goto out; } sp = &mp->mnt_vfsstat; @@ -2470,7 +2772,7 @@ common_chdir(proc_t p, struct chdir_args *uap, int per_thread) vnode_t tvp; vfs_context_t ctx = vfs_context_current(); - NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = change_dir(&nd, ctx); if (error) @@ -2572,7 +2874,7 @@ chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval) if ((error = suser(kauth_cred_get(), &p->p_acflag))) return (error); - NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = change_dir(&nd, ctx); if (error) @@ -2795,7 +3097,15 @@ open1(vfs_context_t ctx, struct nameidata *ndp, int uflags, struct vnode_attr *v */ if (no_controlling_tty && (p->p_flag & P_CONTROLT)) { vnode_t ttyvp; - vnode_ref(vp); + + /* + * We already have a ref from vn_open_auth(), so we can demand another reference. 
+ */ + error = vnode_ref_ext(vp, 0, VNODE_REF_FORCE); + if (error != 0) { + panic("vnode_ref_ext() with VNODE_REF_FORCE failed?!"); + } + session_lock(sessp); ttyvp = sessp->s_ttyvp; sessp->s_ttyvp = vp; @@ -2808,6 +3118,8 @@ open1(vfs_context_t ctx, struct nameidata *ndp, int uflags, struct vnode_attr *v vnode_put(vp); proc_fdlock(p); + if (flags & O_CLOEXEC) + *fdflags(p, indx) |= UF_EXCLOSE; procfdtbl_releasefd(p, indx, NULL); fp_drop(p, indx, fp, 1); proc_fdunlock(p); @@ -2887,7 +3199,8 @@ open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval) if (xsecdst != NULL) VATTR_SET(&va, va_acl, &xsecdst->fsec_acl); - NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, vfs_context_current()); + NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, + uap->path, vfs_context_current()); ciferror = open1(vfs_context_current(), &nd, uap->flags, &va, retval); if (xsecdst != NULL) @@ -2916,7 +3229,8 @@ open_nocancel(proc_t p, struct open_nocancel_args *uap, int32_t *retval) cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT; VATTR_SET(&va, va_mode, cmode & ACCESSPERMS); - NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, vfs_context_current()); + NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, + uap->path, vfs_context_current()); return(open1(vfs_context_current(), &nd, uap->flags, &va, retval)); } @@ -2933,7 +3247,6 @@ mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval) struct vnode_attr va; vfs_context_t ctx = vfs_context_current(); int error; - int whiteout = 0; struct nameidata nd; vnode_t vp, dvp; @@ -2950,7 +3263,7 @@ mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval) if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) return (error); - NDINIT(&nd, CREATE, LOCKPARENT | AUDITVNPATH1, + NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) @@ -2973,32 +3286,22 @@ mknod(proc_t p, 
struct mknod_args *uap, __unused int32_t *retval) case S_IFBLK: VATTR_SET(&va, va_type, VBLK); break; - case S_IFWHT: - whiteout = 1; - break; default: error = EINVAL; goto out; } #if CONFIG_MACF - if (!whiteout) { - error = mac_vnode_check_create(ctx, - nd.ni_dvp, &nd.ni_cnd, &va); - if (error) - goto out; - } + error = mac_vnode_check_create(ctx, + nd.ni_dvp, &nd.ni_cnd, &va); + if (error) + goto out; #endif if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) goto out; - if (whiteout) { - error = VNOP_WHITEOUT(dvp, &nd.ni_cnd, CREATE, ctx); - } else { - error = vn_create(dvp, &vp, &nd.ni_cnd, &va, 0, ctx); - } - if (error) + if ((error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx)) != 0) goto out; if (vp) { @@ -3050,7 +3353,7 @@ mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap) int error; struct nameidata nd; - NDINIT(&nd, CREATE, LOCKPARENT | AUDITVNPATH1, + NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1, UIO_USERSPACE, upath, ctx); error = namei(&nd); if (error) @@ -3065,19 +3368,10 @@ mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap) } VATTR_SET(vap, va_type, VFIFO); -#if CONFIG_MACF - error = mac_vnode_check_create(ctx, nd.ni_dvp, - &nd.ni_cnd, vap); - if (error) + if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) goto out; -#endif - - - if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) - goto out; - - error = vn_create(dvp, &vp, &nd.ni_cnd, vap, 0, ctx); + error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx); out: /* * nameidone has to happen before we vnode_put(dvp) @@ -3263,7 +3557,7 @@ link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval) vp = dvp = lvp = NULLVP; /* look up the object we are linking to */ - NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) @@ -3297,6 +3591,9 @@ link(__unused proc_t p, 
struct link_args *uap, __unused int32_t *retval) } /* lookup the target node */ +#if CONFIG_TRIGGERS + nd.ni_op = OP_LINK; +#endif nd.ni_cnd.cn_nameiop = CREATE; nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK; nd.ni_dirp = uap->link; @@ -3439,7 +3736,7 @@ symlink(proc_t p, struct symlink_args *uap, __unused int32_t *retval) goto out; AUDIT_ARG(text, path); /* This is the link string */ - NDINIT(&nd, CREATE, LOCKPARENT | AUDITVNPATH1, + NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1, UIO_USERSPACE, uap->link, ctx); error = namei(&nd); if (error) @@ -3481,6 +3778,9 @@ symlink(proc_t p, struct symlink_args *uap, __unused int32_t *retval) if (vp == NULL) { nd.ni_cnd.cn_nameiop = LOOKUP; +#if CONFIG_TRIGGERS + nd.ni_op = OP_LOOKUP; +#endif nd.ni_cnd.cn_flags = 0; error = namei(&nd); vp = nd.ni_vp; @@ -3557,7 +3857,7 @@ undelete(__unused proc_t p, struct undelete_args *uap, __unused int32_t *retval) vfs_context_t ctx = vfs_context_current(); vnode_t vp, dvp; - NDINIT(&nd, DELETE, LOCKPARENT|DOWHITEOUT|AUDITVNPATH1, + NDINIT(&nd, DELETE, OP_UNLINK, LOCKPARENT | DOWHITEOUT | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) @@ -3598,19 +3898,25 @@ unlink1(vfs_context_t ctx, struct nameidata *ndp, int nodelbusy) int len=0; #if CONFIG_FSE fse_info finfo; + struct vnode_attr va; #endif int flags = 0; int need_event = 0; int has_listeners = 0; int truncated_path=0; + int batched; + struct vnode_attr *vap = NULL; + #if NAMEDRSRCFORK /* unlink or delete is allowed on rsrc forks and named streams */ ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK; #endif ndp->ni_cnd.cn_flags |= LOCKPARENT; + ndp->ni_flag |= NAMEI_COMPOUNDREMOVE; cnp = &ndp->ni_cnd; +lookup_continue: error = namei(ndp); if (error) return (error); @@ -3618,57 +3924,62 @@ unlink1(vfs_context_t ctx, struct nameidata *ndp, int nodelbusy) dvp = ndp->ni_dvp; vp = ndp->ni_vp; + /* With Carbon delete semantics, busy files cannot be deleted */ if (nodelbusy) { flags |= 
VNODE_REMOVE_NODELETEBUSY; } - /* - * Normally, unlinking of directories is not supported. - * However, some file systems may have limited support. - */ - if ((vp->v_type == VDIR) && - !(vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSDIRLINKS)) { - error = EPERM; /* POSIX */ - } + if (vp) { + batched = vnode_compound_remove_available(vp); + /* + * The root of a mounted filesystem cannot be deleted. + */ + if (vp->v_flag & VROOT) { + error = EBUSY; + } - /* - * The root of a mounted filesystem cannot be deleted. - */ - if (vp->v_flag & VROOT) { - error = EBUSY; - } - if (error) - goto out; + if (!batched) { + error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL); + if (error) { + goto out; + } + } + } else { + batched = 1; + if (!vnode_compound_remove_available(dvp)) { + panic("No vp, but no compound remove?"); + } + } - /* authorize the delete operation */ -#if CONFIG_MACF - if (!error) - error = mac_vnode_check_unlink(ctx, - dvp, vp, cnp); -#endif /* MAC */ - if (!error) - error = vnode_authorize(vp, ndp->ni_dvp, KAUTH_VNODE_DELETE, ctx); - if (error) - goto out; - #if CONFIG_FSE need_event = need_fsevent(FSE_DELETE, dvp); if (need_event) { - if ((vp->v_flag & VISHARDLINK) == 0) { - get_fse_info(vp, &finfo, ctx); + if (!batched) { + if ((vp->v_flag & VISHARDLINK) == 0) { + /* XXX need to get these data in batched VNOP */ + get_fse_info(vp, &finfo, ctx); + } + } else { + error = vfs_get_notify_attributes(&va); + if (error) { + goto out; + } + + vap = &va; } } #endif has_listeners = kauth_authorize_fileop_has_listeners(); if (need_event || has_listeners) { - GET_PATH(path); if (path == NULL) { - error = ENOMEM; - goto out; + GET_PATH(path); + if (path == NULL) { + error = ENOMEM; + goto out; + } } - len = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path); } @@ -3677,7 +3988,25 @@ unlink1(vfs_context_t ctx, struct nameidata *ndp, int nodelbusy) error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx); else #endif - error = 
VNOP_REMOVE(dvp, vp, &ndp->ni_cnd, flags, ctx); + { + error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx); + vp = ndp->ni_vp; + if (error == EKEEPLOOKING) { + if (!batched) { + panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?"); + } + + if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) { + panic("EKEEPLOOKING, but continue flag not set?"); + } + + if (vnode_isdir(vp)) { + error = EISDIR; + goto out; + } + goto lookup_continue; + } + } /* * Call out to allow 3rd party notification of delete. @@ -3706,6 +4035,8 @@ unlink1(vfs_context_t ctx, struct nameidata *ndp, int nodelbusy) if (need_event) { if (vp->v_flag & VISHARDLINK) { get_fse_info(vp, &finfo, ctx); + } else if (vap) { + vnode_get_fse_info_from_vap(vp, &finfo, vap); } if (truncated_path) { finfo.mode |= FSE_TRUNCATED_PATH; @@ -3717,27 +4048,30 @@ unlink1(vfs_context_t ctx, struct nameidata *ndp, int nodelbusy) } #endif } + +out: if (path != NULL) RELEASE_PATH(path); - /* - * nameidone has to happen before we vnode_put(dvp) - * since it may need to release the fs_nodelock on the dvp - */ -out: #if NAMEDRSRCFORK /* recycle the deleted rsrc fork vnode to force a reclaim, which * will cause its shadow file to go away if necessary. 
*/ - if ((vnode_isnamedstream(ndp->ni_vp)) && - (ndp->ni_vp->v_parent != NULLVP) && - vnode_isshadow(ndp->ni_vp)) { - vnode_recycle(ndp->ni_vp); + if (vp && (vnode_isnamedstream(vp)) && + (vp->v_parent != NULLVP) && + vnode_isshadow(vp)) { + vnode_recycle(vp); } #endif + /* + * nameidone has to happen before we vnode_put(dvp) + * since it may need to release the fs_nodelock on the dvp + */ nameidone(ndp); vnode_put(dvp); - vnode_put(vp); + if (vp) { + vnode_put(vp); + } return (error); } @@ -3750,7 +4084,8 @@ unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval) struct nameidata nd; vfs_context_t ctx = vfs_context_current(); - NDINIT(&nd, DELETE, AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); + NDINIT(&nd, DELETE, OP_UNLINK, AUDITVNPATH1, UIO_USERSPACE, + uap->path, ctx); return unlink1(ctx, &nd, 0); } @@ -3763,7 +4098,8 @@ delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval) struct nameidata nd; vfs_context_t ctx = vfs_context_current(); - NDINIT(&nd, DELETE, AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); + NDINIT(&nd, DELETE, OP_UNLINK, AUDITVNPATH1, UIO_USERSPACE, + uap->path, ctx); return unlink1(ctx, &nd, 1); } @@ -4132,7 +4468,9 @@ access_extended(__unused proc_t p, struct access_extended_args *uap, __unused in niopts |= WANTPARENT; /* do the lookup */ - NDINIT(&nd, LOOKUP, niopts, UIO_SYSSPACE, CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset), &context); + NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE, + CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset), + &context); error = namei(&nd); if (!error) { vp = nd.ni_vp; @@ -4218,7 +4556,8 @@ access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval) /* need parent for vnode_authorize for deletion test */ if (uap->flags & _DELETE_OK) niopts |= WANTPARENT; - NDINIT(&nd, LOOKUP, niopts, UIO_USERSPACE, uap->path, &context); + NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_USERSPACE, + uap->path, &context); #if 
NAMEDRSRCFORK /* access(F_OK) calls are allowed for resource forks. */ @@ -4410,7 +4749,7 @@ stat1(user_addr_t path, user_addr_t ub, user_addr_t xsecurity, user_addr_t xsecu struct nameidata nd; vfs_context_t ctx = vfs_context_current(); - NDINIT(&nd, LOOKUP, NOTRIGGER | FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_GETATTR, NOTRIGGER | FOLLOW | AUDITVNPATH1, UIO_USERSPACE, path, ctx); return(stat2(ctx, &nd, ub, xsecurity, xsecurity_size, isstat64)); } @@ -4483,7 +4822,7 @@ lstat1(user_addr_t path, user_addr_t ub, user_addr_t xsecurity, user_addr_t xsec struct nameidata nd; vfs_context_t ctx = vfs_context_current(); - NDINIT(&nd, LOOKUP, NOTRIGGER | NOFOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_GETATTR, NOTRIGGER | NOFOLLOW | AUDITVNPATH1, UIO_USERSPACE, path, ctx); return(stat2(ctx, &nd, ub, xsecurity, xsecurity_size, isstat64)); @@ -4569,7 +4908,7 @@ pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval) struct nameidata nd; vfs_context_t ctx = vfs_context_current(); - NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) @@ -4597,7 +4936,7 @@ readlink(proc_t p, struct readlink_args *uap, int32_t *retval) vfs_context_t ctx = vfs_context_current(); char uio_buf[ UIO_SIZEOF(1) ]; - NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) @@ -4680,7 +5019,7 @@ chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval) struct nameidata nd; AUDIT_ARG(fflags, uap->flags); - NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) @@ -4782,7 +5121,7 @@ chmod1(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap) struct nameidata nd; int error; - NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1, + 
NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, path, ctx); if ((error = namei(&nd))) return (error); @@ -4984,7 +5323,8 @@ chown1(vfs_context_t ctx, struct chown_args *uap, __unused int32_t *retval, int AUDIT_ARG(owner, uap->uid, uap->gid); - NDINIT(&nd, LOOKUP, (follow ? FOLLOW : 0) | NOTRIGGER | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_SETATTR, + (follow ? FOLLOW : 0) | NOTRIGGER | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) @@ -5193,7 +5533,7 @@ utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval) * AUDIT: Needed to change the order of operations to do the * name lookup first because auditing wants the path. */ - NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) @@ -5260,7 +5600,7 @@ truncate(__unused proc_t p, struct truncate_args *uap, __unused int32_t *retval) if (uap->length < 0) return(EINVAL); - NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); if ((error = namei(&nd))) return (error); @@ -5472,14 +5812,15 @@ copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval) return(EINVAL); } - NDINIT(&fromnd, LOOKUP, SAVESTART | AUDITVNPATH1, + NDINIT(&fromnd, LOOKUP, OP_COPYFILE, SAVESTART | AUDITVNPATH1, UIO_USERSPACE, uap->from, ctx); if ((error = namei(&fromnd))) return (error); fvp = fromnd.ni_vp; - NDINIT(&tond, CREATE, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK, - UIO_USERSPACE, uap->to, ctx); + NDINIT(&tond, CREATE, OP_LINK, + LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK, + UIO_USERSPACE, uap->to, ctx); if ((error = namei(&tond))) { goto out1; } @@ -5552,76 +5893,101 @@ rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval) int do_retry; int mntrename; int need_event; - const char 
*oname; + const char *oname = NULL; char *from_name = NULL, *to_name = NULL; int from_len=0, to_len=0; int holding_mntlock; mount_t locked_mp = NULL; - vnode_t oparent; + vnode_t oparent = NULLVP; #if CONFIG_FSE fse_info from_finfo, to_finfo; + struct vnode_attr fva, tva; #endif int from_truncated=0, to_truncated; + int batched = 0; + struct vnode_attr *fvap, *tvap; + int continuing = 0; holding_mntlock = 0; do_retry = 0; retry: fvp = tvp = NULL; fdvp = tdvp = NULL; + fvap = tvap = NULL; mntrename = FALSE; - NDINIT(&fromnd, DELETE, WANTPARENT | AUDITVNPATH1, UIO_USERSPACE, uap->from, ctx); + NDINIT(&fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1, + UIO_USERSPACE, uap->from, ctx); + fromnd.ni_flag = NAMEI_COMPOUNDRENAME; + + NDINIT(&tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK, + UIO_USERSPACE, uap->to, ctx); + tond.ni_flag = NAMEI_COMPOUNDRENAME; - if ( (error = namei(&fromnd)) ) - goto out1; - fdvp = fromnd.ni_dvp; - fvp = fromnd.ni_vp; +continue_lookup: + if ((fromnd.ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) { + if ( (error = namei(&fromnd)) ) + goto out1; + fdvp = fromnd.ni_dvp; + fvp = fromnd.ni_vp; -#if CONFIG_MACF - error = mac_vnode_check_rename_from(ctx, fdvp, fvp, &fromnd.ni_cnd); - if (error) - goto out1; -#endif + if (fvp && fvp->v_type == VDIR) + tond.ni_cnd.cn_flags |= WILLBEDIR; + } - NDINIT(&tond, RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK , UIO_USERSPACE, uap->to, ctx); - if (fvp->v_type == VDIR) - tond.ni_cnd.cn_flags |= WILLBEDIR; + if ((tond.ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) { + if ( (error = namei(&tond)) ) { + /* + * Translate error code for rename("dir1", "dir2/."). + */ + if (error == EISDIR && fvp->v_type == VDIR) + error = EINVAL; + goto out1; + } + tdvp = tond.ni_dvp; + tvp = tond.ni_vp; + } - if ( (error = namei(&tond)) ) { - /* - * Translate error code for rename("dir1", "dir2/."). 
+ batched = vnode_compound_rename_available(fdvp); + if (!fvp) { + /* + * Claim: this check will never reject a valid rename. + * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp. + * Suppose fdvp and tdvp are not on the same mount. + * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root, + * then you can't move it to within another dir on the same mountpoint. + * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction. + * + * If this check passes, then we are safe to pass these vnodes to the same FS. */ - if (error == EISDIR && fvp->v_type == VDIR) - error = EINVAL; - goto out1; + if (fdvp->v_mount != tdvp->v_mount) { + error = EXDEV; + goto out1; + } + goto skipped_lookup; } - tdvp = tond.ni_dvp; - tvp = tond.ni_vp; - -#if CONFIG_MACF - error = mac_vnode_check_rename_to(ctx, - tdvp, tvp, fdvp == tdvp, &tond.ni_cnd); - if (error) - goto out1; -#endif - if (tvp != NULL) { - if (fvp->v_type == VDIR && tvp->v_type != VDIR) { - error = ENOTDIR; - goto out1; - } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { - error = EISDIR; + if (!batched) { + error = vn_authorize_rename(fdvp, fvp, &fromnd.ni_cnd, tdvp, tvp, &tond.ni_cnd, ctx, NULL); + if (error) { + if (error == ENOENT) { + /* + * We encountered a race where after doing the namei, tvp stops + * being valid. If so, simply re-drive the rename call from the + * top. + */ + do_retry = 1; + } goto out1; } } - if (fvp == tdvp) { - error = EINVAL; - goto out1; - } + /* * If the source and destination are the same (i.e. they're * links to the same vnode) and the target file system is * case sensitive, then there is nothing to do. + * + * XXX Come back to this. 
*/ if (fvp == tvp) { int pathconf_val; @@ -5636,93 +6002,15 @@ rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval) } } - /* - * Authorization. - * - * If tvp is a directory and not the same as fdvp, or tdvp is not - * the same as fdvp, the node is moving between directories and we - * need rights to remove from the old and add to the new. - * - * If tvp already exists and is not a directory, we need to be - * allowed to delete it. - * - * Note that we do not inherit when renaming. - * - * XXX This needs to be revisited to implement the deferred-inherit bit - */ - { - int moving = 0; - - error = 0; - if ((tvp != NULL) && vnode_isdir(tvp)) { - if (tvp != fdvp) - moving = 1; - } else if (tdvp != fdvp) { - moving = 1; - } - /* - * must have delete rights to remove the old name even in - * the simple case of fdvp == tdvp. - * - * If fvp is a directory, and we are changing it's parent, - * then we also need rights to rewrite its ".." entry as well. - */ - if (vnode_isdir(fvp)) { - if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0) - goto auth_exit; - } else { - if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE, ctx)) != 0) - goto auth_exit; - } - if (moving) { - /* moving into tdvp or tvp, must have rights to add */ - if ((error = vnode_authorize(((tvp != NULL) && vnode_isdir(tvp)) ? tvp : tdvp, - NULL, - vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE, - ctx)) != 0) { - /* - * We could encounter a race where after doing the namei, tvp stops - * being valid. If so, simply re-drive the rename call from the - * top. - */ - if (error == ENOENT) { - do_retry = 1; - } - goto auth_exit; - } - } else { - /* node staying in same directory, must be allowed to add new name */ - if ((error = vnode_authorize(fdvp, NULL, - vnode_isdir(fvp) ? 
KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE, ctx)) != 0) - goto auth_exit; - } - /* overwriting tvp */ - if ((tvp != NULL) && !vnode_isdir(tvp) && - ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0)) { - /* - * We could encounter a race where after doing the namei, tvp stops - * being valid. If so, simply re-drive the rename call from the - * top. - */ - if (error == ENOENT) { - do_retry = 1; - } - goto auth_exit; - } - - /* XXX more checks? */ - -auth_exit: - /* authorization denied */ - if (error != 0) - goto out1; - } /* * Allow the renaming of mount points. * - target must not exist * - target must reside in the same directory as source * - union mounts cannot be renamed * - "/" cannot be renamed + * + * XXX Handle this in VFS after a continued lookup (if we missed + * in the cache to start off) */ if ((fvp->v_flag & VROOT) && (fvp->v_type == VDIR) && @@ -5752,35 +6040,6 @@ rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval) error = EXDEV; goto out1; } - /* - * Avoid renaming "." and "..". - */ - if (fvp->v_type == VDIR && - ((fdvp == fvp) || - (fromnd.ni_cnd.cn_namelen == 1 && fromnd.ni_cnd.cn_nameptr[0] == '.') || - ((fromnd.ni_cnd.cn_flags | tond.ni_cnd.cn_flags) & ISDOTDOT)) ) { - error = EINVAL; - goto out1; - } - /* - * The following edge case is caught here: - * (to cannot be a descendent of from) - * - * o fdvp - * / - * / - * o fvp - * \ - * \ - * o tdvp - * / - * / - * o tvp - */ - if (tdvp->v_parent == fvp) { - error = EINVAL; - goto out1; - } /* * If source is the same as the destination (that is the @@ -5799,6 +6058,8 @@ rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval) * NOTE - that fvp == tvp also occurs if they are hard linked and * that correct behaviour then is just to return success without doing * anything. + * + * XXX filesystem should take care of this itself, perhaps... 
*/ if (fvp == tvp && fdvp == tdvp) { if (fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen && @@ -5882,17 +6143,35 @@ rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval) holding_mntlock = 0; } } + // save these off so we can later verify that fvp is the same oname = fvp->v_name; oparent = fvp->v_parent; +skipped_lookup: #if CONFIG_FSE - need_event = need_fsevent(FSE_RENAME, fvp); + need_event = need_fsevent(FSE_RENAME, fdvp); if (need_event) { - get_fse_info(fvp, &from_finfo, ctx); + if (fvp) { + get_fse_info(fvp, &from_finfo, ctx); + } else { + error = vfs_get_notify_attributes(&fva); + if (error) { + goto out1; + } + + fvap = &fva; + } if (tvp) { get_fse_info(tvp, &to_finfo, ctx); + } else if (batched) { + error = vfs_get_notify_attributes(&tva); + if (error) { + goto out1; + } + + tvap = &tva; } } #else @@ -5900,26 +6179,30 @@ rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval) #endif /* CONFIG_FSE */ if (need_event || kauth_authorize_fileop_has_listeners()) { - GET_PATH(from_name); if (from_name == NULL) { - error = ENOMEM; - goto out1; + GET_PATH(from_name); + if (from_name == NULL) { + error = ENOMEM; + goto out1; + } } from_len = safe_getpath(fdvp, fromnd.ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated); - GET_PATH(to_name); if (to_name == NULL) { - error = ENOMEM; - goto out1; + GET_PATH(to_name); + if (to_name == NULL) { + error = ENOMEM; + goto out1; + } } to_len = safe_getpath(tdvp, tond.ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated); } - error = VNOP_RENAME(fdvp, fvp, &fromnd.ni_cnd, - tdvp, tvp, &tond.ni_cnd, - ctx); + error = vn_rename(fdvp, &fvp, &fromnd.ni_cnd, fvap, + tdvp, &tvp, &tond.ni_cnd, tvap, + 0, ctx); if (holding_mntlock) { /* @@ -5931,16 +6214,29 @@ rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval) holding_mntlock = 0; } if (error) { - /* - * We may encounter a race in the VNOP where the destination didn't - * exist when we did the namei, but 
it does by the time we go and - * try to create the entry. In this case, we should re-drive this rename - * call from the top again. Currently, only HFS bubbles out ERECYCLE, + if (error == EKEEPLOOKING) { + if ((fromnd.ni_flag & NAMEI_CONTLOOKUP) == 0) { + if ((tond.ni_flag & NAMEI_CONTLOOKUP) == 0) { + panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?"); + } + } + + fromnd.ni_vp = fvp; + tond.ni_vp = tvp; + + goto continue_lookup; + } + + /* + * We may encounter a race in the VNOP where the destination didn't + * exist when we did the namei, but it does by the time we go and + * try to create the entry. In this case, we should re-drive this rename + * call from the top again. Currently, only HFS bubbles out ERECYCLE, * but other filesystems susceptible to this race could return it, too. - */ - if (error == ERECYCLE) { - do_retry = 1; - } + */ + if (error == ERECYCLE) { + do_retry = 1; + } goto out1; } @@ -5958,6 +6254,14 @@ rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval) // set it here since only the from_finfo gets reported up to user space from_finfo.mode |= FSE_TRUNCATED_PATH; } + + if (tvap && tvp) { + vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap); + } + if (fvap) { + vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap); + } + if (tvp) { add_fsevent(FSE_RENAME, ctx, FSE_ARG_STRING, from_len, from_name, @@ -6020,8 +6324,10 @@ rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval) * check that fvp has the same name/parent pointers it * had before the rename call... this is a 'weak' check * at best... 
+ * + * XXX oparent and oname may not be set in the compound vnop case */ - if (oname == fvp->v_name && oparent == fvp->v_parent) { + if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) { int update_flags; update_flags = VNODE_UPDATE_NAME; @@ -6068,12 +6374,12 @@ rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval) vnode_put(fdvp); } - /* - * If things changed after we did the namei, then we will re-drive - * this rename call from the top. - */ + /* + * If things changed after we did the namei, then we will re-drive + * this rename call from the top. + */ if(do_retry) { - do_retry = 0; + do_retry = 0; goto retry; } @@ -6096,12 +6402,16 @@ mkdir1(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap) vnode_t vp, dvp; int error; int update_flags = 0; + int batched; struct nameidata nd; AUDIT_ARG(mode, vap->va_mode); - NDINIT(&nd, CREATE, LOCKPARENT | AUDITVNPATH1, - UIO_USERSPACE, path, ctx); + NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, UIO_USERSPACE, + path, ctx); nd.ni_cnd.cn_flags |= WILLBEDIR; + nd.ni_flag = NAMEI_COMPOUNDMKDIR; + +continue_lookup: error = namei(&nd); if (error) return (error); @@ -6112,24 +6422,56 @@ mkdir1(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap) error = EEXIST; goto out; } + + batched = vnode_compound_mkdir_available(dvp); VATTR_SET(vap, va_type, VDIR); - -#if CONFIG_MACF - error = mac_vnode_check_create(ctx, - nd.ni_dvp, &nd.ni_cnd, vap); - if (error) + + /* + * XXX + * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will + * only get EXISTS or EISDIR for existing path components, and not that it could see + * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz" + * it will fail in a spurious manner. Need to figure out if this is valid behavior. 
+ */ + if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) { + if (error == EACCES || error == EPERM) { + int error2; + + nameidone(&nd); + vnode_put(dvp); + dvp = NULLVP; + + /* + * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST + * rather than EACCESS if the target exists. + */ + NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, UIO_USERSPACE, + path, ctx); + error2 = namei(&nd); + if (error2) { + goto out; + } else { + vp = nd.ni_vp; + error = EEXIST; + goto out; + } + } + goto out; -#endif + } + + /* + * make the directory + */ + if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) { + if (error == EKEEPLOOKING) { + nd.ni_vp = vp; + goto continue_lookup; + } - /* authorize addition of a directory to the parent */ - if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0) - goto out; - - - /* make the directory */ - if ((error = vn_create(dvp, &vp, &nd.ni_cnd, vap, 0, ctx)) != 0) goto out; + } // Make sure the name & parent pointers are hooked up if (vp->v_name == NULL) @@ -6152,8 +6494,9 @@ mkdir1(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap) nameidone(&nd); if (vp) - vnode_put(vp); - vnode_put(dvp); + vnode_put(vp); + if (dvp) + vnode_put(dvp); return (error); } @@ -6219,10 +6562,19 @@ rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval) vnode_t vp, dvp; int error; struct nameidata nd; + char *path = NULL; + int len=0; + int has_listeners = 0; + int need_event = 0; + int truncated = 0; vfs_context_t ctx = vfs_context_current(); +#if CONFIG_FSE + struct vnode_attr va; +#endif /* CONFIG_FSE */ + struct vnode_attr *vap = NULL; + int batched; int restart_flag; - uint32_t oldvp_id = UINT32_MAX; /* * This loop exists to restart rmdir in the unlikely case that two @@ -6230,10 +6582,13 @@ rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval) * containing orphaned appleDouble files. 
*/ do { + NDINIT(&nd, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1, + UIO_USERSPACE, uap->path, ctx); + nd.ni_flag = NAMEI_COMPOUNDRMDIR; +continue_lookup: restart_flag = 0; + vap = NULL; - NDINIT(&nd, DELETE, LOCKPARENT | AUDITVNPATH1, - UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) return (error); @@ -6241,132 +6596,153 @@ rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval) dvp = nd.ni_dvp; vp = nd.ni_vp; + if (vp) { + batched = vnode_compound_rmdir_available(vp); - /* - * If being restarted check if the new vp - * still has the same v_id. - */ - if (oldvp_id != UINT32_MAX && oldvp_id != vp->v_id) { - error = ENOENT; - goto out; - } + if (vp->v_flag & VROOT) { + /* + * The root of a mounted filesystem cannot be deleted. + */ + error = EBUSY; + goto out; + } - if (vp->v_type != VDIR) { - /* - * rmdir only deals with directories - */ - error = ENOTDIR; - } else if (dvp == vp) { /* - * No rmdir "." please. + * Removed a check here; we used to abort if vp's vid + * was not the same as what we'd seen the last time around. + * I do not think that check was valid, because if we retry + * and all dirents are gone, the directory could legitimately + * be recycled but still be present in a situation where we would + * have had permission to delete. Therefore, we won't make + * an effort to preserve that check now that we may not have a + * vp here. */ - error = EINVAL; - } else if (vp->v_flag & VROOT) { - /* - * The root of a mounted filesystem cannot be deleted. 
- */ - error = EBUSY; + + if (!batched) { + error = vn_authorize_rmdir(dvp, vp, &nd.ni_cnd, ctx, NULL); + if (error) { + goto out; + } + } } else { -#if CONFIG_MACF - error = mac_vnode_check_unlink(ctx, dvp, - vp, &nd.ni_cnd); - if (!error) -#endif - error = vnode_authorize(vp, nd.ni_dvp, KAUTH_VNODE_DELETE, ctx); + batched = 1; + + if (!vnode_compound_rmdir_available(dvp)) { + panic("No error, but no compound rmdir?"); + } } - if (!error) { - char *path = NULL; - int len=0; - int has_listeners = 0; - int need_event = 0; - int truncated = 0; + #if CONFIG_FSE - fse_info finfo; + fse_info finfo; - need_event = need_fsevent(FSE_DELETE, dvp); - if (need_event) { + need_event = need_fsevent(FSE_DELETE, dvp); + if (need_event) { + if (!batched) { get_fse_info(vp, &finfo, ctx); + } else { + error = vfs_get_notify_attributes(&va); + if (error) { + goto out; + } + + vap = &va; } + } #endif - has_listeners = kauth_authorize_fileop_has_listeners(); - if (need_event || has_listeners) { + has_listeners = kauth_authorize_fileop_has_listeners(); + if (need_event || has_listeners) { + if (path == NULL) { GET_PATH(path); if (path == NULL) { error = ENOMEM; goto out; } + } - len = safe_getpath(vp, NULL, path, MAXPATHLEN, &truncated); + len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated); #if CONFIG_FSE - if (truncated) { - finfo.mode |= FSE_TRUNCATED_PATH; - } -#endif + if (truncated) { + finfo.mode |= FSE_TRUNCATED_PATH; } +#endif + } - error = VNOP_RMDIR(dvp, vp, &nd.ni_cnd, ctx); - - /* - * Special case to remove orphaned AppleDouble - * files. I don't like putting this in the kernel, - * but carbon does not like putting this in carbon either, - * so here we are. 
- */ - if (error == ENOTEMPTY) { - error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag); - if (error == EBUSY) { - oldvp_id = vp->v_id; - goto out; - } + error = vn_rmdir(dvp, &vp, &nd, vap, ctx); + nd.ni_vp = vp; + if (vp == NULLVP) { + /* Couldn't find a vnode */ + goto out; + } + if (error == EKEEPLOOKING) { + goto continue_lookup; + } - /* - * Assuming everything went well, we will try the RMDIR again - */ - if (!error) - error = VNOP_RMDIR(dvp, vp, &nd.ni_cnd, ctx); + /* + * Special case to remove orphaned AppleDouble + * files. I don't like putting this in the kernel, + * but carbon does not like putting this in carbon either, + * so here we are. + */ + if (error == ENOTEMPTY) { + error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag); + if (error == EBUSY) { + goto out; } + /* - * Call out to allow 3rd party notification of delete. - * Ignore result of kauth_authorize_fileop call. + * Assuming everything went well, we will try the RMDIR again */ - if (!error) { - if (has_listeners) { - kauth_authorize_fileop(vfs_context_ucred(ctx), - KAUTH_FILEOP_DELETE, - (uintptr_t)vp, - (uintptr_t)path); - } + if (!error) + error = vn_rmdir(dvp, &vp, &nd, vap, ctx); + } - if (vp->v_flag & VISHARDLINK) { - // see the comment in unlink1() about why we update - // the parent of a hard link when it is removed - vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT); - } + /* + * Call out to allow 3rd party notification of delete. + * Ignore result of kauth_authorize_fileop call. 
+ */ + if (!error) { + if (has_listeners) { + kauth_authorize_fileop(vfs_context_ucred(ctx), + KAUTH_FILEOP_DELETE, + (uintptr_t)vp, + (uintptr_t)path); + } + + if (vp->v_flag & VISHARDLINK) { + // see the comment in unlink1() about why we update + // the parent of a hard link when it is removed + vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT); + } #if CONFIG_FSE - if (need_event) { - add_fsevent(FSE_DELETE, ctx, - FSE_ARG_STRING, len, path, - FSE_ARG_FINFO, &finfo, - FSE_ARG_DONE); + if (need_event) { + if (vap) { + vnode_get_fse_info_from_vap(vp, &finfo, vap); } -#endif + add_fsevent(FSE_DELETE, ctx, + FSE_ARG_STRING, len, path, + FSE_ARG_FINFO, &finfo, + FSE_ARG_DONE); } - if (path != NULL) - RELEASE_PATH(path); +#endif } out: + if (path != NULL) { + RELEASE_PATH(path); + path = NULL; + } /* * nameidone has to happen before we vnode_put(dvp) * since it may need to release the fs_nodelock on the dvp */ nameidone(&nd); - vnode_put(dvp); - vnode_put(vp); + + if (vp) + vnode_put(vp); if (restart_flag == 0) { wakeup_one((caddr_t)vp); @@ -6389,7 +6765,8 @@ vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag, int *numdirent, vfs_context_t ctxp) { /* Check if fs natively supports VNODE_READDIR_EXTENDED */ - if (vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) { + if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) && + ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) { return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp); } else { size_t bufsize; @@ -6673,8 +7050,8 @@ revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval) int error; struct nameidata nd; - NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1, - UIO_USERSPACE, uap->path, ctx); + NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, + uap->path, ctx); error = namei(&nd); if (error) return (error); @@ -6922,24 +7299,24 @@ exchangedata (__unused proc_t p, struct exchangedata_args *uap, __unused 
int32_t nameiflags = 0; if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW; - NDINIT(&fnd, LOOKUP, nameiflags | AUDITVNPATH1, - UIO_USERSPACE, uap->path1, ctx); + NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1, + UIO_USERSPACE, uap->path1, ctx); - error = namei(&fnd); - if (error) - goto out2; + error = namei(&fnd); + if (error) + goto out2; nameidone(&fnd); fvp = fnd.ni_vp; - NDINIT(&snd, LOOKUP | CN_NBMOUNTLOOK, nameiflags | AUDITVNPATH2, - UIO_USERSPACE, uap->path2, ctx); + NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2, + UIO_USERSPACE, uap->path2, ctx); - error = namei(&snd); - if (error) { + error = namei(&snd); + if (error) { vnode_put(fvp); goto out2; - } + } nameidone(&snd); svp = snd.ni_vp; @@ -7187,8 +7564,8 @@ searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval) nameiflags = 0; if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW; - NDINIT(&nd, LOOKUP, nameiflags | AUDITVNPATH1, - UIO_USERSPACE, uap->path, ctx); + NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1, + UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) @@ -7197,6 +7574,14 @@ searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval) nameidone(&nd); vp = nd.ni_vp; +#if CONFIG_MACF + error = mac_vnode_check_searchfs(ctx, vp, &searchblock.searchattrs); + if (error) { + vnode_put(vp); + goto freeandexit; + } +#endif + /* * If searchblock.maxmatches == 0, then skip the search. This has happened @@ -7215,44 +7600,716 @@ searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval) from copying out any results... 
*/ - fserror = VNOP_SEARCHFS(vp, - searchparams1, - searchparams2, - &searchblock.searchattrs, - (u_long)searchblock.maxmatches, - &timelimit, - returnattrs, - &nummatches, - (u_long)uap->scriptcode, - (u_long)uap->options, - auio, - state, - ctx); - -saveandexit: + fserror = VNOP_SEARCHFS(vp, + searchparams1, + searchparams2, + &searchblock.searchattrs, + (u_long)searchblock.maxmatches, + &timelimit, + returnattrs, + &nummatches, + (u_long)uap->scriptcode, + (u_long)uap->options, + auio, + state, + ctx); + +saveandexit: + + vnode_put(vp); + + /* Now copy out the stuff that needs copying out. That means the number of matches, the + search state. Everything was already put into he return buffer by the vop call. */ + + if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) + goto freeandexit; + + if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) + goto freeandexit; + + error = fserror; + +freeandexit: + + FREE(searchparams1,M_TEMP); + + return(error); + + +} /* end of searchfs system call */ + + + +lck_grp_attr_t * nspace_group_attr; +lck_attr_t * nspace_lock_attr; +lck_grp_t * nspace_mutex_group; + +lck_mtx_t nspace_handler_lock; +lck_mtx_t nspace_handler_exclusion_lock; + +time_t snapshot_timestamp=0; +int nspace_allow_virtual_devs=0; + +void nspace_handler_init(void); + +typedef struct nspace_item_info { + struct vnode *vp; + void *arg; + uint64_t op; + uint32_t vid; + uint32_t flags; + uint32_t token; + uint32_t refcount; +} nspace_item_info; + +#define MAX_NSPACE_ITEMS 128 +nspace_item_info nspace_items[MAX_NSPACE_ITEMS]; +uint32_t nspace_item_idx=0; // also used as the sleep/wakeup rendezvous address +uint32_t nspace_token_id=0; +uint32_t nspace_handler_timeout = 15; // seconds + +#define NSPACE_ITEM_NEW 0x0001 +#define NSPACE_ITEM_PROCESSING 0x0002 +#define NSPACE_ITEM_DEAD 0x0004 +#define NSPACE_ITEM_CANCELLED 0x0008 +#define NSPACE_ITEM_DONE 0x0010 +#define NSPACE_ITEM_RESET_TIMER 0x0020 + +#define 
NSPACE_ITEM_NSPACE_EVENT 0x0040 +#define NSPACE_ITEM_SNAPSHOT_EVENT 0x0080 +#define NSPACE_ITEM_TRACK_EVENT 0x0100 + +#define NSPACE_ITEM_ALL_EVENT_TYPES (NSPACE_ITEM_NSPACE_EVENT | NSPACE_ITEM_SNAPSHOT_EVENT | NSPACE_ITEM_TRACK_EVENT) + +//#pragma optimization_level 0 + +typedef enum { + NSPACE_HANDLER_NSPACE = 0, + NSPACE_HANDLER_SNAPSHOT = 1, + NSPACE_HANDLER_TRACK = 2, + + NSPACE_HANDLER_COUNT, +} nspace_type_t; + +typedef struct { + uint64_t handler_tid; + struct proc *handler_proc; + int handler_busy; +} nspace_handler_t; + +nspace_handler_t nspace_handlers[NSPACE_HANDLER_COUNT]; + +static inline int nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type) +{ + switch(nspace_type) { + case NSPACE_HANDLER_NSPACE: + return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_NSPACE_EVENT; + case NSPACE_HANDLER_SNAPSHOT: + return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_SNAPSHOT_EVENT; + case NSPACE_HANDLER_TRACK: + return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_TRACK_EVENT; + default: + printf("nspace_flags_matches_handler: invalid type %u\n", (int)nspace_type); + return 0; + } +} + +static inline int nspace_item_flags_for_type(nspace_type_t nspace_type) +{ + switch(nspace_type) { + case NSPACE_HANDLER_NSPACE: + return NSPACE_ITEM_NSPACE_EVENT; + case NSPACE_HANDLER_SNAPSHOT: + return NSPACE_ITEM_SNAPSHOT_EVENT; + case NSPACE_HANDLER_TRACK: + return NSPACE_ITEM_TRACK_EVENT; + default: + printf("nspace_item_flags_for_type: invalid type %u\n", (int)nspace_type); + return 0; + } +} + +static inline int nspace_open_flags_for_type(nspace_type_t nspace_type) +{ + switch(nspace_type) { + case NSPACE_HANDLER_NSPACE: + return FREAD | FWRITE | O_EVTONLY; + case NSPACE_HANDLER_SNAPSHOT: + case NSPACE_HANDLER_TRACK: + return FREAD | O_EVTONLY; + default: + printf("nspace_open_flags_for_type: invalid type %u\n", (int)nspace_type); + return 0; + } +} + +static inline nspace_type_t nspace_type_for_op(uint64_t op) 
+{ + switch(op & NAMESPACE_HANDLER_EVENT_TYPE_MASK) { + case NAMESPACE_HANDLER_NSPACE_EVENT: + return NSPACE_HANDLER_NSPACE; + case NAMESPACE_HANDLER_SNAPSHOT_EVENT: + return NSPACE_HANDLER_SNAPSHOT; + case NAMESPACE_HANDLER_TRACK_EVENT: + return NSPACE_HANDLER_TRACK; + default: + printf("nspace_type_for_op: invalid op mask %llx\n", op & NAMESPACE_HANDLER_EVENT_TYPE_MASK); + return NSPACE_HANDLER_NSPACE; + } +} + +static inline int nspace_is_special_process(struct proc *proc) +{ + int i; + for (i = 0; i < NSPACE_HANDLER_COUNT; i++) { + if (proc == nspace_handlers[i].handler_proc) + return 1; + } + return 0; +} + +void +nspace_handler_init(void) +{ + nspace_lock_attr = lck_attr_alloc_init(); + nspace_group_attr = lck_grp_attr_alloc_init(); + nspace_mutex_group = lck_grp_alloc_init("nspace-mutex", nspace_group_attr); + lck_mtx_init(&nspace_handler_lock, nspace_mutex_group, nspace_lock_attr); + lck_mtx_init(&nspace_handler_exclusion_lock, nspace_mutex_group, nspace_lock_attr); + memset(&nspace_items[0], 0, sizeof(nspace_items)); +} + +void +nspace_proc_exit(struct proc *p) +{ + int i, event_mask = 0; + + for (i = 0; i < NSPACE_HANDLER_COUNT; i++) { + if (p == nspace_handlers[i].handler_proc) { + event_mask |= nspace_item_flags_for_type(i); + nspace_handlers[i].handler_tid = 0; + nspace_handlers[i].handler_proc = NULL; + } + } + + if (event_mask == 0) { + return; + } + + if (event_mask & NSPACE_ITEM_SNAPSHOT_EVENT) { + // if this process was the snapshot handler, zero snapshot_timeout + snapshot_timestamp = 0; + } + + // + // unblock anyone that's waiting for the handler that died + // + lck_mtx_lock(&nspace_handler_lock); + for(i=0; i < MAX_NSPACE_ITEMS; i++) { + if (nspace_items[i].flags & (NSPACE_ITEM_NEW | NSPACE_ITEM_PROCESSING)) { + + if ( nspace_items[i].flags & event_mask ) { + + if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) { + vnode_lock_spin(nspace_items[i].vp); + nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT; + 
vnode_unlock(nspace_items[i].vp); + } + nspace_items[i].vp = NULL; + nspace_items[i].vid = 0; + nspace_items[i].flags = NSPACE_ITEM_DONE; + nspace_items[i].token = 0; + + wakeup((caddr_t)&(nspace_items[i].vp)); + } + } + } + + wakeup((caddr_t)&nspace_item_idx); + lck_mtx_unlock(&nspace_handler_lock); +} + + +int +resolve_nspace_item(struct vnode *vp, uint64_t op) +{ + return resolve_nspace_item_ext(vp, op, NULL); +} + +int +resolve_nspace_item_ext(struct vnode *vp, uint64_t op, void *arg) +{ + int i, error, keep_waiting; + struct timespec ts; + nspace_type_t nspace_type = nspace_type_for_op(op); + + // only allow namespace events on regular files, directories and symlinks. + if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) { + return 0; + } + + // + // if this is a snapshot event and the vnode is on a + // disk image just pretend nothing happened since any + // change to the disk image will cause the disk image + // itself to get backed up and this avoids multi-way + // deadlocks between the snapshot handler and the ever + // popular diskimages-helper process. 
the variable + // nspace_allow_virtual_devs allows this behavior to + // be overridden (for use by the Mobile TimeMachine + // testing infrastructure which uses disk images) + // + if ( (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) + && (vp->v_mount != NULL) + && (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) + && !nspace_allow_virtual_devs) { + + return 0; + } + + // if (thread_tid(current_thread()) == namespace_handler_tid) { + if (nspace_handlers[nspace_type].handler_proc == NULL) { + return 0; + } + + if (nspace_is_special_process(current_proc())) { + return EDEADLK; + } + + lck_mtx_lock(&nspace_handler_lock); + +retry: + for(i=0; i < MAX_NSPACE_ITEMS; i++) { + if (vp == nspace_items[i].vp && op == nspace_items[i].op) { + break; + } + } + + if (i >= MAX_NSPACE_ITEMS) { + for(i=0; i < MAX_NSPACE_ITEMS; i++) { + if (nspace_items[i].flags == 0) { + break; + } + } + } else { + nspace_items[i].refcount++; + } + + if (i >= MAX_NSPACE_ITEMS) { + ts.tv_sec = nspace_handler_timeout; + ts.tv_nsec = 0; + + error = msleep((caddr_t)&nspace_token_id, &nspace_handler_lock, PVFS|PCATCH, "nspace-no-space", &ts); + if (error == 0) { + // an entry got free'd up, go see if we can get a slot + goto retry; + } else { + lck_mtx_unlock(&nspace_handler_lock); + return error; + } + } + + // + // if it didn't already exist, add it. if it did exist + // we'll get woken up when someone does a wakeup() on + // the slot in the nspace_items table. 
+ // + if (vp != nspace_items[i].vp) { + nspace_items[i].vp = vp; + nspace_items[i].arg = arg; + nspace_items[i].op = op; + nspace_items[i].vid = vnode_vid(vp); + nspace_items[i].flags = NSPACE_ITEM_NEW; + nspace_items[i].flags |= nspace_item_flags_for_type(nspace_type); + if (nspace_items[i].flags & NSPACE_ITEM_SNAPSHOT_EVENT) { + if (arg) { + vnode_lock_spin(vp); + vp->v_flag |= VNEEDSSNAPSHOT; + vnode_unlock(vp); + } + } + + nspace_items[i].token = 0; + nspace_items[i].refcount = 1; + + wakeup((caddr_t)&nspace_item_idx); + } + + // + // Now go to sleep until the handler does a wakeup on this + // slot in the nspace_items table (or we timeout). + // + keep_waiting = 1; + while(keep_waiting) { + ts.tv_sec = nspace_handler_timeout; + ts.tv_nsec = 0; + error = msleep((caddr_t)&(nspace_items[i].vp), &nspace_handler_lock, PVFS|PCATCH, "namespace-done", &ts); + + if (nspace_items[i].flags & NSPACE_ITEM_DONE) { + error = 0; + } else if (nspace_items[i].flags & NSPACE_ITEM_CANCELLED) { + error = nspace_items[i].token; + } else if (error == EWOULDBLOCK || error == ETIMEDOUT) { + if (nspace_items[i].flags & NSPACE_ITEM_RESET_TIMER) { + nspace_items[i].flags &= ~NSPACE_ITEM_RESET_TIMER; + continue; + } else { + error = ETIMEDOUT; + } + } else if (error == 0) { + // hmmm, why did we get woken up? 
+ printf("woken up for token %d but it's not done, cancelled or timedout and error == 0.\n", + nspace_items[i].token); + } + + if (--nspace_items[i].refcount == 0) { + nspace_items[i].vp = NULL; // clear this so that no one will match on it again + nspace_items[i].arg = NULL; + nspace_items[i].token = 0; // clear this so that the handler will not find it anymore + nspace_items[i].flags = 0; // this clears it for re-use + } + wakeup(&nspace_token_id); + keep_waiting = 0; + } + + lck_mtx_unlock(&nspace_handler_lock); + + return error; +} + + +int +get_nspace_item_status(struct vnode *vp, int32_t *status) +{ + int i; + + lck_mtx_lock(&nspace_handler_lock); + for(i=0; i < MAX_NSPACE_ITEMS; i++) { + if (nspace_items[i].vp == vp) { + break; + } + } + + if (i >= MAX_NSPACE_ITEMS) { + lck_mtx_unlock(&nspace_handler_lock); + return ENOENT; + } + + *status = nspace_items[i].flags; + lck_mtx_unlock(&nspace_handler_lock); + return 0; +} + + +#if 0 +static int +build_volfs_path(struct vnode *vp, char *path, int *len) +{ + struct vnode_attr va; + int ret; + + VATTR_INIT(&va); + VATTR_WANTED(&va, va_fsid); + VATTR_WANTED(&va, va_fileid); + + if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) { + *len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1; + ret = -1; + } else { + *len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1; + ret = 0; + } + + return ret; +} +#endif + +// +// Note: this function does NOT check permissions on all of the +// parent directories leading to this vnode. It should only be +// called on behalf of a root process. Otherwise a process may +// get access to a file because the file itself is readable even +// though its parent directories would prevent access. 
+// +static int +vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx) +{ + int error, action; + + if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) { + return error; + } + +#if CONFIG_MACF + error = mac_vnode_check_open(ctx, vp, fmode); + if (error) + return error; +#endif - vnode_put(vp); + /* compute action to be authorized */ + action = 0; + if (fmode & FREAD) { + action |= KAUTH_VNODE_READ_DATA; + } + if (fmode & (FWRITE | O_TRUNC)) { + /* + * If we are writing, appending, and not truncating, + * indicate that we are appending so that if the + * UF_APPEND or SF_APPEND bits are set, we do not deny + * the open. + */ + if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) { + action |= KAUTH_VNODE_APPEND_DATA; + } else { + action |= KAUTH_VNODE_WRITE_DATA; + } + } - /* Now copy out the stuff that needs copying out. That means the number of matches, the - search state. Everything was already put into he return buffer by the vop call. */ + if ((error = vnode_authorize(vp, NULL, action, ctx)) != 0) + return error; + - if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) - goto freeandexit; + // + // if the vnode is tagged VOPENEVT and the current process + // has the P_CHECKOPENEVT flag set, then we or in the O_EVTONLY + // flag to the open mode so that this open won't count against + // the vnode when carbon delete() does a vnode_isinuse() to see + // if a file is currently in use. this allows spotlight + // importers to not interfere with carbon apps that depend on + // the no-delete-if-busy semantics of carbon delete(). 
+ // + if ((vp->v_flag & VOPENEVT) && (current_proc()->p_flag & P_CHECKOPENEVT)) { + fmode |= O_EVTONLY; + } - if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) - goto freeandexit; - - error = fserror; + if ( (error = VNOP_OPEN(vp, fmode, ctx)) ) { + return error; + } + if ( (error = vnode_ref_ext(vp, fmode, 0)) ) { + VNOP_CLOSE(vp, fmode, ctx); + return error; + } -freeandexit: + /* call out to allow 3rd party notification of open. + * Ignore result of kauth_authorize_fileop call. + */ + kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_OPEN, + (uintptr_t)vp, 0); - FREE(searchparams1,M_TEMP); - return(error); + return 0; +} +static int +wait_for_namespace_event(namespace_handler_info_ext *nhi, nspace_type_t nspace_type) +{ + int i, error=0, unblock=0; + task_t curtask; + + lck_mtx_lock(&nspace_handler_exclusion_lock); + if (nspace_handlers[nspace_type].handler_busy) { + lck_mtx_unlock(&nspace_handler_exclusion_lock); + return EBUSY; + } + nspace_handlers[nspace_type].handler_busy = 1; + lck_mtx_unlock(&nspace_handler_exclusion_lock); + + /* + * Any process that gets here will be one of the namespace handlers. + * As such, they should be prevented from acquiring DMG vnodes during vnode reclamation + * as we can cause deadlocks to occur, because the namespace handler may prevent + * VNOP_INACTIVE from proceeding. Mark the current task as a P_DEPENDENCY_CAPABLE + * process. 
+ */ + curtask = current_task(); + bsd_set_dependency_capable (curtask); + + lck_mtx_lock(&nspace_handler_lock); + if (nspace_handlers[nspace_type].handler_proc == NULL) { + nspace_handlers[nspace_type].handler_tid = thread_tid(current_thread()); + nspace_handlers[nspace_type].handler_proc = current_proc(); + } + + while (error == 0) { + + for(i=0; i < MAX_NSPACE_ITEMS; i++) { + if (nspace_items[i].flags & NSPACE_ITEM_NEW) { + if (!nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) { + continue; + } + break; + } + } + + if (i < MAX_NSPACE_ITEMS) { + nspace_items[i].flags &= ~NSPACE_ITEM_NEW; + nspace_items[i].flags |= NSPACE_ITEM_PROCESSING; + nspace_items[i].token = ++nspace_token_id; + + if (nspace_items[i].vp) { + struct fileproc *fp; + int32_t indx, fmode; + struct proc *p = current_proc(); + vfs_context_t ctx = vfs_context_current(); + + fmode = nspace_open_flags_for_type(nspace_type); + + error = vnode_getwithvid(nspace_items[i].vp, nspace_items[i].vid); + if (error) { + unblock = 1; + break; + } + error = vn_open_with_vp(nspace_items[i].vp, fmode, ctx); + if (error) { + unblock = 1; + vnode_put(nspace_items[i].vp); + break; + } + + if ((error = falloc(p, &fp, &indx, ctx))) { + vn_close(nspace_items[i].vp, fmode, ctx); + vnode_put(nspace_items[i].vp); + unblock = 1; + break; + } + + fp->f_fglob->fg_flag = fmode; + fp->f_fglob->fg_type = DTYPE_VNODE; + fp->f_fglob->fg_ops = &vnops; + fp->f_fglob->fg_data = (caddr_t)nspace_items[i].vp; + + proc_fdlock(p); + procfdtbl_releasefd(p, indx, NULL); + fp_drop(p, indx, fp, 1); + proc_fdunlock(p); + + error = copyout(&nspace_items[i].token, nhi->token, sizeof(uint32_t)); + error = copyout(&nspace_items[i].op, nhi->flags, sizeof(uint64_t)); + error = copyout(&indx, nhi->fdptr, sizeof(uint32_t)); + if (nhi->infoptr) { + uio_t uio = (uio_t)nspace_items[i].arg; + uint64_t u_offset, u_length; + + if (uio) { + u_offset = uio_offset(uio); + u_length = uio_resid(uio); + } else { + u_offset = 0; + u_length = 0; + 
} + error = copyout(&u_offset, nhi->infoptr, sizeof(uint64_t)); + error = copyout(&u_length, nhi->infoptr+sizeof(uint64_t), sizeof(uint64_t)); + } + if (error) { + vn_close(nspace_items[i].vp, fmode, ctx); + fp_free(p, indx, fp); + unblock = 1; + } + + vnode_put(nspace_items[i].vp); + + break; + } else { + printf("wait_for_nspace_event: failed (nspace_items[%d] == %p error %d, name %s)\n", + i, nspace_items[i].vp, error, nspace_items[i].vp->v_name); + } + + } else { + error = msleep((caddr_t)&nspace_item_idx, &nspace_handler_lock, PVFS|PCATCH, "namespace-items", 0); + if ((nspace_type == NSPACE_HANDLER_SNAPSHOT) && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) { + error = EINVAL; + break; + } + + } + } + + if (unblock) { + if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) { + vnode_lock_spin(nspace_items[i].vp); + nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT; + vnode_unlock(nspace_items[i].vp); + } + nspace_items[i].vp = NULL; + nspace_items[i].vid = 0; + nspace_items[i].flags = NSPACE_ITEM_DONE; + nspace_items[i].token = 0; + + wakeup((caddr_t)&(nspace_items[i].vp)); + } + + if (nspace_type == NSPACE_HANDLER_SNAPSHOT) { + // just go through every snapshot event and unblock it immediately. 
+ if (error && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) { + for(i=0; i < MAX_NSPACE_ITEMS; i++) { + if (nspace_items[i].flags & NSPACE_ITEM_NEW) { + if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) { + nspace_items[i].vp = NULL; + nspace_items[i].vid = 0; + nspace_items[i].flags = NSPACE_ITEM_DONE; + nspace_items[i].token = 0; + + wakeup((caddr_t)&(nspace_items[i].vp)); + } + } + } + } + } + + lck_mtx_unlock(&nspace_handler_lock); + + lck_mtx_lock(&nspace_handler_exclusion_lock); + nspace_handlers[nspace_type].handler_busy = 0; + lck_mtx_unlock(&nspace_handler_exclusion_lock); + + return error; +} -} /* end of searchfs system call */ +static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data) +{ + int error = 0; + namespace_handler_info_ext nhi; + + if (nspace_type == NSPACE_HANDLER_SNAPSHOT && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) { + return EINVAL; + } + + if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) { + return error; + } + + if ( (is64bit && size != sizeof(user64_namespace_handler_info) && size != sizeof(user64_namespace_handler_info_ext)) + || (is64bit == 0 && size != sizeof(user32_namespace_handler_info) && size != sizeof(user32_namespace_handler_info_ext))) { + + // either you're 64-bit and passed a 64-bit struct or + // you're 32-bit and passed a 32-bit struct. otherwise + // it's not ok. 
+ return EINVAL; + } + + if (is64bit) { + nhi.token = (user_addr_t)((user64_namespace_handler_info *)data)->token; + nhi.flags = (user_addr_t)((user64_namespace_handler_info *)data)->flags; + nhi.fdptr = (user_addr_t)((user64_namespace_handler_info *)data)->fdptr; + if (size == sizeof(user64_namespace_handler_info_ext)) { + nhi.infoptr = (user_addr_t)((user64_namespace_handler_info_ext *)data)->infoptr; + } else { + nhi.infoptr = 0; + } + } else { + nhi.token = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->token); + nhi.flags = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->flags); + nhi.fdptr = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->fdptr); + if (size == sizeof(user32_namespace_handler_info_ext)) { + nhi.infoptr = CAST_USER_ADDR_T(((user32_namespace_handler_info_ext *)data)->infoptr); + } else { + nhi.infoptr = 0; + } + } + + return wait_for_namespace_event(&nhi, nspace_type); +} /* * Make a filesystem-specific control call: @@ -7272,7 +8329,7 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long size = IOCPARM_LEN(cmd); if (size > IOCPARM_MAX) return (EINVAL); - is64bit = proc_is64bit(p); + is64bit = proc_is64bit(p); memp = NULL; if (size > sizeof (stkbuf)) { @@ -7287,12 +8344,12 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long error = copyin(udata, data, size); if (error) goto FSCtl_Exit; } else { - if (is64bit) { - *(user_addr_t *)data = udata; - } - else { - *(uint32_t *)data = (uint32_t)udata; - } + if (is64bit) { + *(user_addr_t *)data = udata; + } + else { + *(uint32_t *)data = (uint32_t)udata; + } }; } else if ((cmd & IOC_OUT) && size) { /* @@ -7302,10 +8359,10 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long bzero(data, size); } else if (cmd & IOC_VOID) { if (is64bit) { - *(user_addr_t *)data = udata; + *(user_addr_t *)data = udata; } else { - *(uint32_t *)data = (uint32_t)udata; + *(uint32_t *)data = (uint32_t)udata; 
} } @@ -7349,31 +8406,31 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long *arg_vp = NULL; } else if (IOCBASECMD(cmd) == FSCTL_SET_PACKAGE_EXTS) { - user_addr_t ext_strings; - uint32_t num_entries; - uint32_t max_width; + user_addr_t ext_strings; + uint32_t num_entries; + uint32_t max_width; - if ( (is64bit && size != sizeof(user64_package_ext_info)) - || (is64bit == 0 && size != sizeof(user32_package_ext_info))) { + if ( (is64bit && size != sizeof(user64_package_ext_info)) + || (is64bit == 0 && size != sizeof(user32_package_ext_info))) { - // either you're 64-bit and passed a 64-bit struct or - // you're 32-bit and passed a 32-bit struct. otherwise - // it's not ok. - error = EINVAL; - goto FSCtl_Exit; - } + // either you're 64-bit and passed a 64-bit struct or + // you're 32-bit and passed a 32-bit struct. otherwise + // it's not ok. + error = EINVAL; + goto FSCtl_Exit; + } - if (is64bit) { - ext_strings = ((user64_package_ext_info *)data)->strings; - num_entries = ((user64_package_ext_info *)data)->num_entries; - max_width = ((user64_package_ext_info *)data)->max_width; - } else { - ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings); - num_entries = ((user32_package_ext_info *)data)->num_entries; - max_width = ((user32_package_ext_info *)data)->max_width; - } + if (is64bit) { + ext_strings = ((user64_package_ext_info *)data)->strings; + num_entries = ((user64_package_ext_info *)data)->num_entries; + max_width = ((user64_package_ext_info *)data)->max_width; + } else { + ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings); + num_entries = ((user32_package_ext_info *)data)->num_entries; + max_width = ((user32_package_ext_info *)data)->max_width; + } - error = set_package_extensions_table(ext_strings, num_entries, max_width); + error = set_package_extensions_table(ext_strings, num_entries, max_width); } else if (IOCBASECMD(cmd) == FSCTL_WAIT_FOR_SYNC) { error = 
tsleep((caddr_t)&sync_wait_time, PVFS|PCATCH, "sync-wait", 0); @@ -7384,6 +8441,192 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long error *= -1; } + } else if (IOCBASECMD(cmd) == FSCTL_NAMESPACE_HANDLER_GET) { + error = process_namespace_fsctl(NSPACE_HANDLER_NSPACE, is64bit, size, data); + } else if (IOCBASECMD(cmd) == FSCTL_OLD_SNAPSHOT_HANDLER_GET) { + error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data); + } else if (IOCBASECMD(cmd) == FSCTL_SNAPSHOT_HANDLER_GET_EXT) { + error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data); + } else if (IOCBASECMD(cmd) == FSCTL_TRACKED_HANDLER_GET) { + error = process_namespace_fsctl(NSPACE_HANDLER_TRACK, is64bit, size, data); + } else if (IOCBASECMD(cmd) == FSCTL_NAMESPACE_HANDLER_UPDATE) { + uint32_t token, val; + int i; + + if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) { + goto FSCtl_Exit; + } + + if (!nspace_is_special_process(p)) { + error = EINVAL; + goto FSCtl_Exit; + } + + token = ((uint32_t *)data)[0]; + val = ((uint32_t *)data)[1]; + + lck_mtx_lock(&nspace_handler_lock); + + for(i=0; i < MAX_NSPACE_ITEMS; i++) { + if (nspace_items[i].token == token) { + break; + } + } + + if (i >= MAX_NSPACE_ITEMS) { + error = ENOENT; + } else { + // + // if this bit is set, when resolve_nspace_item() times out + // it will loop and go back to sleep. 
+ // + nspace_items[i].flags |= NSPACE_ITEM_RESET_TIMER; + } + + lck_mtx_unlock(&nspace_handler_lock); + + if (error) { + printf("nspace-handler-update: did not find token %u\n", token); + } + + } else if (IOCBASECMD(cmd) == FSCTL_NAMESPACE_HANDLER_UNBLOCK) { + uint32_t token, val; + int i; + + if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) { + goto FSCtl_Exit; + } + + if (!nspace_is_special_process(p)) { + error = EINVAL; + goto FSCtl_Exit; + } + + token = ((uint32_t *)data)[0]; + val = ((uint32_t *)data)[1]; + + lck_mtx_lock(&nspace_handler_lock); + + for(i=0; i < MAX_NSPACE_ITEMS; i++) { + if (nspace_items[i].token == token) { + break; + } + } + + if (i >= MAX_NSPACE_ITEMS) { + printf("nspace-handler-unblock: did not find token %u\n", token); + error = ENOENT; + } else { + if (val == 0 && nspace_items[i].vp) { + vnode_lock_spin(nspace_items[i].vp); + nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT; + vnode_unlock(nspace_items[i].vp); + } + + nspace_items[i].vp = NULL; + nspace_items[i].arg = NULL; + nspace_items[i].op = 0; + nspace_items[i].vid = 0; + nspace_items[i].flags = NSPACE_ITEM_DONE; + nspace_items[i].token = 0; + + wakeup((caddr_t)&(nspace_items[i].vp)); + } + + lck_mtx_unlock(&nspace_handler_lock); + + } else if (IOCBASECMD(cmd) == FSCTL_NAMESPACE_HANDLER_CANCEL) { + uint32_t token, val; + int i; + + if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) { + goto FSCtl_Exit; + } + + if (!nspace_is_special_process(p)) { + error = EINVAL; + goto FSCtl_Exit; + } + + token = ((uint32_t *)data)[0]; + val = ((uint32_t *)data)[1]; + + lck_mtx_lock(&nspace_handler_lock); + + for(i=0; i < MAX_NSPACE_ITEMS; i++) { + if (nspace_items[i].token == token) { + break; + } + } + + if (i >= MAX_NSPACE_ITEMS) { + printf("nspace-handler-cancel: did not find token %u\n", token); + error = ENOENT; + } else { + if (nspace_items[i].vp) { + vnode_lock_spin(nspace_items[i].vp); + nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT; + vnode_unlock(nspace_items[i].vp); + } + + 
nspace_items[i].vp = NULL; + nspace_items[i].arg = NULL; + nspace_items[i].vid = 0; + nspace_items[i].token = val; + nspace_items[i].flags &= ~NSPACE_ITEM_PROCESSING; + nspace_items[i].flags |= NSPACE_ITEM_CANCELLED; + + wakeup((caddr_t)&(nspace_items[i].vp)); + } + + lck_mtx_unlock(&nspace_handler_lock); + } else if (IOCBASECMD(cmd) == FSCTL_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME) { + if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) { + goto FSCtl_Exit; + } + + // we explicitly do not do the namespace_handler_proc check here + + lck_mtx_lock(&nspace_handler_lock); + snapshot_timestamp = ((uint32_t *)data)[0]; + wakeup(&nspace_item_idx); + lck_mtx_unlock(&nspace_handler_lock); + printf("nspace-handler-set-snapshot-time: %d\n", (int)snapshot_timestamp); + + } else if (IOCBASECMD(cmd) == FSCTL_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS) { + if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) { + goto FSCtl_Exit; + } + + lck_mtx_lock(&nspace_handler_lock); + nspace_allow_virtual_devs = ((uint32_t *)data)[0]; + lck_mtx_unlock(&nspace_handler_lock); + printf("nspace-snapshot-handler will%s allow events on disk-images\n", + nspace_allow_virtual_devs ? 
"" : " NOT"); + error = 0; + + } else if (IOCBASECMD(cmd) == FSCTL_SET_FSTYPENAME_OVERRIDE) { + if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) { + goto FSCtl_Exit; + } + if (vp->v_mount) { + mount_lock(vp->v_mount); + if (data[0] != 0) { + strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN); + vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE; + if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) { + vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY; + vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE; + } + } else { + if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) { + vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY; + } + vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE; + vp->v_mount->fstypename_override[0] = '\0'; + } + mount_unlock(vp->v_mount); + } } else { /* Invoke the filesystem-specific code */ error = VNOP_IOCTL(vp, IOCBASECMD(cmd), data, options, ctx); @@ -7418,8 +8661,8 @@ fsctl (proc_t p, struct fsctl_args *uap, __unused int32_t *retval) /* Get the vnode for the file we are getting info on: */ nameiflags = 0; if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW; - NDINIT(&nd, LOOKUP, nameiflags | AUDITVNPATH1, UIO_USERSPACE, - uap->path, ctx); + NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1, + UIO_USERSPACE, uap->path, ctx); if ((error = namei(&nd))) goto done; vp = nd.ni_vp; nameidone(&nd); @@ -7520,7 +8763,7 @@ getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval) return (EINVAL); nameiflags = (uap->options & XATTR_NOFOLLOW) ? 
0 : FOLLOW; - NDINIT(&nd, LOOKUP, nameiflags, spacetype, uap->path, ctx); + NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx); if ((error = namei(&nd))) { return (error); } @@ -7531,8 +8774,10 @@ getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval) goto out; } if (xattr_protected(attrname)) { - error = EPERM; - goto out; + if (!vfs_context_issuser(ctx) || strcmp(attrname, "com.apple.system.Security") != 0) { + error = EPERM; + goto out; + } } /* * the specific check for 0xffffffff is a hack to preserve @@ -7558,10 +8803,10 @@ getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval) if (uap->size == 0xffffffff || uap->size == (size_t)-1) goto no_uio; - if (uap->size > (size_t)XATTR_MAXSIZE) - uap->size = XATTR_MAXSIZE; - if (uap->value) { + if (uap->size > (size_t)XATTR_MAXSIZE) + uap->size = XATTR_MAXSIZE; + auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf)); uio_addiov(auio, uap->value, uap->size); @@ -7652,7 +8897,12 @@ setxattr(proc_t p, struct setxattr_args *uap, int *retval) return (EINVAL); if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) { - return (error); + if (error == EPERM) { + /* if the string won't fit in attrname, copyinstr emits EPERM */ + return (ENAMETOOLONG); + } + /* Otherwise return the default error from copyinstr to detect ERANGE, etc */ + return error; } if (xattr_protected(attrname)) return(EPERM); @@ -7661,7 +8911,7 @@ setxattr(proc_t p, struct setxattr_args *uap, int *retval) } nameiflags = (uap->options & XATTR_NOFOLLOW) ? 
0 : FOLLOW; - NDINIT(&nd, LOOKUP, nameiflags, spacetype, uap->path, ctx); + NDINIT(&nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx); if ((error = namei(&nd))) { return (error); } @@ -7698,7 +8948,9 @@ fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval) size_t namelen; int error; char uio_buf[ UIO_SIZEOF(1) ]; +#if CONFIG_FSE vfs_context_t ctx = vfs_context_current(); +#endif if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) return (EINVAL); @@ -7762,7 +9014,7 @@ removexattr(proc_t p, struct removexattr_args *uap, int *retval) if (xattr_protected(attrname)) return(EPERM); nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW; - NDINIT(&nd, LOOKUP, nameiflags, spacetype, uap->path, ctx); + NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx); if ((error = namei(&nd))) { return (error); } @@ -7793,7 +9045,9 @@ fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval) char attrname[XATTR_MAXNAMELEN+1]; size_t namelen; int error; +#if CONFIG_FSE vfs_context_t ctx = vfs_context_current(); +#endif if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) return (EINVAL); @@ -7847,15 +9101,15 @@ listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval) return (EINVAL); nameiflags = ((uap->options & XATTR_NOFOLLOW) ? 
0 : FOLLOW) | NOTRIGGER; - NDINIT(&nd, LOOKUP, nameiflags, spacetype, uap->path, ctx); + NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx); if ((error = namei(&nd))) { return (error); } vp = nd.ni_vp; nameidone(&nd); if (uap->namebuf != 0 && uap->bufsize > 0) { - auio = uio_createwithbuffer(1, 0, spacetype, - UIO_READ, &uio_buf[0], sizeof(uio_buf)); + auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, + &uio_buf[0], sizeof(uio_buf)); uio_addiov(auio, uap->namebuf, uap->bufsize); } @@ -7958,6 +9212,13 @@ fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval) if (error) { goto out; } +#if CONFIG_MACF + error = mac_vnode_check_fsgetpath(ctx, vp); + if (error) { + vnode_put(vp); + goto out; + } +#endif /* Obtain the absolute path to this vnode. */ bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0; error = build_path(vp, realpath, uap->bufsize, &length, bpflags, ctx); @@ -8007,7 +9268,11 @@ munge_statfs(struct mount *mp, struct vfsstatfs *sfsp, sfs.f_ffree = (user64_long_t)sfsp->f_ffree; sfs.f_fsid = sfsp->f_fsid; sfs.f_owner = sfsp->f_owner; - strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN); + if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) { + strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN); + } else { + strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN); + } strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN); strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN); @@ -8080,7 +9345,11 @@ munge_statfs(struct mount *mp, struct vfsstatfs *sfsp, sfs.f_ffree = (user32_long_t)sfsp->f_ffree; sfs.f_fsid = sfsp->f_fsid; sfs.f_owner = sfsp->f_owner; - strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN); + if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) { + strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN); + } else { + strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN); + } 
strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN); strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN); diff --git a/bsd/vfs/vfs_utfconv.c b/bsd/vfs/vfs_utfconv.c index adf92df30..f785b0d8c 100644 --- a/bsd/vfs/vfs_utfconv.c +++ b/bsd/vfs/vfs_utfconv.c @@ -80,7 +80,7 @@ * Similar to __CFUniCharIsNonBaseCharacter except that * unicode_combinable also includes Hangul Jamo characters. */ -inline int +int unicode_combinable(u_int16_t character) { const u_int8_t *bitmap = __CFUniCharCombiningBitmap; @@ -105,7 +105,7 @@ unicode_combinable(u_int16_t character) * * Similar to __CFUniCharIsDecomposableCharacter. */ -inline int +int unicode_decomposeable(u_int16_t character) { const u_int8_t *bitmap = __CFUniCharDecomposableBitmap; u_int8_t value; @@ -1024,7 +1024,7 @@ priortysort(u_int16_t* characters, int count) u_int32_t p1, p2; u_int16_t *ch1, *ch2; u_int16_t *end; - int changes = 1; + int changes = 0; end = characters + count; do { @@ -1035,13 +1035,22 @@ priortysort(u_int16_t* characters, int count) while (ch2 < end) { p1 = p2; p2 = get_combining_class(*ch2); - if (p1 > p2) { + if (p1 > p2 && p2 != 0) { u_int32_t tmp; tmp = *ch1; *ch1 = *ch2; *ch2 = tmp; changes = 1; + + /* + * Make sure that p2 contains the combining class for the + * character now stored at *ch2. This isn't required for + * correctness, but it will be more efficient if a character + * with a large combining class has to "bubble past" several + * characters with lower combining classes. + */ + p2 = p1; } ++ch1; ++ch2; diff --git a/bsd/vfs/vfs_vnops.c b/bsd/vfs/vfs_vnops.c index c7b110fd6..d7e2b5f14 100644 --- a/bsd/vfs/vfs_vnops.c +++ b/bsd/vfs/vfs_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -125,6 +125,7 @@ static int vn_kqfilt_add(struct fileproc *fp, struct knote *kn, vfs_context_t ctx); static void filt_vndetach(struct knote *kn); static int filt_vnode(struct knote *kn, long hint); +static int vn_open_auth_finish(vnode_t vp, int fmode, vfs_context_t ctx); #if 0 static int vn_kqfilt_remove(struct vnode *vp, uintptr_t ident, vfs_context_t ctx); @@ -163,6 +164,138 @@ vn_open_modflags(struct nameidata *ndp, int *fmodep, int cmode) return(vn_open_auth(ndp, fmodep, &va)); } +static int +vn_open_auth_finish(vnode_t vp, int fmode, vfs_context_t ctx) +{ + int error; + + if ((error = vnode_ref_ext(vp, fmode, 0)) != 0) { + goto bad; + } + + /* call out to allow 3rd party notification of open. + * Ignore result of kauth_authorize_fileop call. + */ + kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_OPEN, + (uintptr_t)vp, 0); + + return 0; + +bad: + return error; + +} + +/* + * May do nameidone() to allow safely adding an FSEvent. Cue off of ni_dvp to + * determine whether that has happened. 
+ */ +static int +vn_open_auth_do_create(struct nameidata *ndp, struct vnode_attr *vap, int fmode, boolean_t *did_create, boolean_t *did_open, vfs_context_t ctx) +{ + uint32_t status = 0; + vnode_t dvp = ndp->ni_dvp; + int batched; + int error; + vnode_t vp; + + batched = vnode_compound_open_available(ndp->ni_dvp); + *did_open = FALSE; + + VATTR_SET(vap, va_type, VREG); + if (fmode & O_EXCL) + vap->va_vaflags |= VA_EXCLUSIVE; + +#if NAMEDRSRCFORK + if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) { + if ((error = vn_authorize_create(dvp, &ndp->ni_cnd, vap, ctx, NULL)) != 0) + goto out; + if ((error = vnode_makenamedstream(dvp, &ndp->ni_vp, XATTR_RESOURCEFORK_NAME, 0, ctx)) != 0) + goto out; + *did_create = TRUE; + } else { +#endif + if (!batched) { + if ((error = vn_authorize_create(dvp, &ndp->ni_cnd, vap, ctx, NULL)) != 0) + goto out; + } + + error = vn_create(dvp, &ndp->ni_vp, ndp, vap, VN_CREATE_DOOPEN, fmode, &status, ctx); + if (error != 0) { + if (batched) { + *did_create = (status & COMPOUND_OPEN_STATUS_DID_CREATE) ? TRUE : FALSE; + } else { + *did_create = FALSE; + } + + if (error == EKEEPLOOKING) { + if (*did_create) { + panic("EKEEPLOOKING, but we did a create?"); + } + if (!batched) { + panic("EKEEPLOOKING from filesystem that doesn't support compound vnops?"); + } + if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) { + panic("EKEEPLOOKING, but continue flag not set?"); + } + + /* + * Do NOT drop the dvp: we need everything to continue the lookup. + */ + return error; + } + } else { + if (batched) { + *did_create = (status & COMPOUND_OPEN_STATUS_DID_CREATE) ? 1 : 0; + *did_open = TRUE; + } else { + *did_create = TRUE; + } + } +#if NAMEDRSRCFORK + } +#endif + + /* + * Unlock the fsnode (if locked) here so that we are free + * to drop the dvp iocount and prevent deadlock in build_path(). + * nameidone() will still do the right thing later. 
+ */ + vp = ndp->ni_vp; + namei_unlock_fsnode(ndp); + + if (*did_create) { + int update_flags = 0; + + // Make sure the name & parent pointers are hooked up + if (vp->v_name == NULL) + update_flags |= VNODE_UPDATE_NAME; + if (vp->v_parent == NULLVP) + update_flags |= VNODE_UPDATE_PARENT; + + if (update_flags) + vnode_update_identity(vp, dvp, ndp->ni_cnd.cn_nameptr, ndp->ni_cnd.cn_namelen, ndp->ni_cnd.cn_hash, update_flags); + + vnode_put(dvp); + ndp->ni_dvp = NULLVP; + +#if CONFIG_FSE + if (need_fsevent(FSE_CREATE_FILE, vp)) { + add_fsevent(FSE_CREATE_FILE, ctx, + FSE_ARG_VNODE, vp, + FSE_ARG_DONE); + } +#endif + } +out: + if (ndp->ni_dvp != NULLVP) { + vnode_put(dvp); + ndp->ni_dvp = NULLVP; + } + + return error; +} + /* * Open a file with authorization, updating the contents of the structures * pointed to by ndp, fmodep, and vap as necessary to perform the requested @@ -217,100 +350,85 @@ vn_open_auth(struct nameidata *ndp, int *fmodep, struct vnode_attr *vap) int error; int fmode; uint32_t origcnflags; - kauth_action_t action; + boolean_t did_create; + boolean_t did_open; + boolean_t need_vnop_open; + boolean_t batched; + boolean_t ref_failed; again: vp = NULL; dvp = NULL; + batched = FALSE; + did_create = FALSE; + need_vnop_open = TRUE; + ref_failed = FALSE; fmode = *fmodep; origcnflags = ndp->ni_cnd.cn_flags; + + /* + * O_CREAT + */ if (fmode & O_CREAT) { if ( (fmode & O_DIRECTORY) ) { error = EINVAL; goto out; } ndp->ni_cnd.cn_nameiop = CREATE; +#if CONFIG_TRIGGERS + ndp->ni_op = OP_LINK; +#endif /* Inherit USEDVP, vnode_open() supported flags only */ ndp->ni_cnd.cn_flags &= (USEDVP | NOCROSSMOUNT | DOWHITEOUT); ndp->ni_cnd.cn_flags |= LOCKPARENT | LOCKLEAF | AUDITVNPATH1; + ndp->ni_flag = NAMEI_COMPOUNDOPEN; #if NAMEDRSRCFORK /* open calls are allowed for resource forks. 
*/ ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK; #endif if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0 && (origcnflags & FOLLOW) != 0) ndp->ni_cnd.cn_flags |= FOLLOW; + +continue_create_lookup: if ( (error = namei(ndp)) ) goto out; + dvp = ndp->ni_dvp; vp = ndp->ni_vp; - /* not found, create */ - if (vp == NULL) { - /* must have attributes for a new file */ - if (vap == NULL) { - error = EINVAL; - goto badcreate; - } - - VATTR_SET(vap, va_type, VREG); -#if CONFIG_MACF - error = mac_vnode_check_create(ctx, - dvp, &ndp->ni_cnd, vap); - if (error) - goto badcreate; -#endif /* MAC */ + batched = vnode_compound_open_available(dvp); - /* authorize before creating */ - if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) - goto badcreate; + /* not found, create */ + if (vp == NULL) { + /* must have attributes for a new file */ + if (vap == NULL) { + error = EINVAL; + goto out; + } + /* + * Attempt a create. For a system supporting compound VNOPs, we may + * find an existing file or create one; in either case, we will already + * have the file open and no VNOP_OPEN() will be needed. 
+ */ + error = vn_open_auth_do_create(ndp, vap, fmode, &did_create, &did_open, ctx); - if (fmode & O_EXCL) - vap->va_vaflags |= VA_EXCLUSIVE; -#if NAMEDRSRCFORK - if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) { - if ((error = vnode_makenamedstream(dvp, &ndp->ni_vp, XATTR_RESOURCEFORK_NAME, 0, ctx)) != 0) - goto badcreate; - } else -#endif - if ((error = vn_create(dvp, &ndp->ni_vp, &ndp->ni_cnd, vap, 0, ctx)) != 0) - goto badcreate; - + dvp = ndp->ni_dvp; vp = ndp->ni_vp; - if (vp) { - int update_flags = 0; - - // Make sure the name & parent pointers are hooked up - if (vp->v_name == NULL) - update_flags |= VNODE_UPDATE_NAME; - if (vp->v_parent == NULLVP) - update_flags |= VNODE_UPDATE_PARENT; - - if (update_flags) - vnode_update_identity(vp, dvp, ndp->ni_cnd.cn_nameptr, ndp->ni_cnd.cn_namelen, ndp->ni_cnd.cn_hash, update_flags); - -#if CONFIG_FSE - if (need_fsevent(FSE_CREATE_FILE, vp)) { - vnode_put(dvp); - dvp = NULL; - add_fsevent(FSE_CREATE_FILE, ctx, - FSE_ARG_VNODE, vp, - FSE_ARG_DONE); + /* + * Detected a node that the filesystem couldn't handle. Don't call + * nameidone() yet, because we need that path buffer. + */ + if (error == EKEEPLOOKING) { + if (!batched) { + panic("EKEEPLOOKING from a filesystem that doesn't support compound VNOPs?"); } -#endif - + goto continue_create_lookup; } - /* - * nameidone has to happen before we vnode_put(dvp) - * and clear the ni_dvp field, since it may need - * to release the fs_nodelock on the dvp - */ -badcreate: - nameidone(ndp); - ndp->ni_dvp = NULL; + nameidone(ndp); if (dvp) { - vnode_put(dvp); + panic("Shouldn't have a dvp here."); } if (error) { @@ -318,129 +436,166 @@ vn_open_auth(struct nameidata *ndp, int *fmodep, struct vnode_attr *vap) * Check for a creation or unlink race. 
*/ if (((error == EEXIST) && !(fmode & O_EXCL)) || - ((error == ENOENT) && (fmode & O_CREAT))){ + ((error == ENOENT) && (fmode & O_CREAT))){ + if (vp) + vnode_put(vp); goto again; } goto bad; } - fmode &= ~O_TRUNC; + + need_vnop_open = !did_open; } else { + if (fmode & O_EXCL) + error = EEXIST; + + /* + * We have a vnode. Use compound open if available + * or else fall through to "traditional" path. Note: can't + * do a compound open for root, because the parent belongs + * to a different FS. + */ + if (error == 0 && batched && (vnode_mount(dvp) == vnode_mount(vp))) { + error = VNOP_COMPOUND_OPEN(dvp, &ndp->ni_vp, ndp, 0, fmode, NULL, NULL, ctx); + + if (error == 0) { + vp = ndp->ni_vp; + need_vnop_open = FALSE; + } else if (error == EKEEPLOOKING) { + if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) { + panic("EKEEPLOOKING, but continue flag not set?"); + } + goto continue_create_lookup; + } + } nameidone(ndp); - ndp->ni_dvp = NULL; vnode_put(dvp); + ndp->ni_dvp = NULLVP; - if (fmode & O_EXCL) { - error = EEXIST; + if (error) { goto bad; } + fmode &= ~O_CREAT; + + /* Fall through */ } } else { + /* + * Not O_CREAT + */ ndp->ni_cnd.cn_nameiop = LOOKUP; /* Inherit USEDVP, vnode_open() supported flags only */ ndp->ni_cnd.cn_flags &= (USEDVP | NOCROSSMOUNT | DOWHITEOUT); - ndp->ni_cnd.cn_flags |= FOLLOW | LOCKLEAF | AUDITVNPATH1; + ndp->ni_cnd.cn_flags |= FOLLOW | LOCKLEAF | AUDITVNPATH1 | WANTPARENT; #if NAMEDRSRCFORK /* open calls are allowed for resource forks. 
*/ ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK; #endif + ndp->ni_flag = NAMEI_COMPOUNDOPEN; + /* preserve NOFOLLOW from vnode_open() */ if (fmode & O_NOFOLLOW || fmode & O_SYMLINK || (origcnflags & FOLLOW) == 0) { - ndp->ni_cnd.cn_flags &= ~FOLLOW; + ndp->ni_cnd.cn_flags &= ~FOLLOW; } - if ( (error = namei(ndp)) ) - goto out; - vp = ndp->ni_vp; + /* Do a lookup, possibly going directly to filesystem for compound operation */ + do { + if ( (error = namei(ndp)) ) + goto out; + vp = ndp->ni_vp; + dvp = ndp->ni_dvp; + + /* Check for batched lookup-open */ + batched = vnode_compound_open_available(dvp); + if (batched && ((vp == NULLVP) || (vnode_mount(dvp) == vnode_mount(vp)))) { + error = VNOP_COMPOUND_OPEN(dvp, &ndp->ni_vp, ndp, 0, fmode, NULL, NULL, ctx); + vp = ndp->ni_vp; + if (error == 0) { + need_vnop_open = FALSE; + } else if (error == EKEEPLOOKING) { + if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) { + panic("EKEEPLOOKING, but continue flag not set?"); + } + } + } + } while (error == EKEEPLOOKING); + nameidone(ndp); - ndp->ni_dvp = NULL; + vnode_put(dvp); + ndp->ni_dvp = NULLVP; - if ( (fmode & O_DIRECTORY) && vp->v_type != VDIR ) { - error = ENOTDIR; + if (error) { goto bad; } } - if (vp->v_type == VSOCK && vp->v_tag != VT_FDESC) { - error = EOPNOTSUPP; /* Operation not supported on socket */ - goto bad; - } - - if (vp->v_type == VLNK && (fmode & O_NOFOLLOW) != 0) { - error = ELOOP; /* O_NOFOLLOW was specified and the target is a symbolic link */ - goto bad; + /* + * By this point, nameidone() is called, dvp iocount is dropped, + * and dvp pointer is cleared. 
+ */ + if (ndp->ni_dvp != NULLVP) { + panic("Haven't cleaned up adequately in vn_open_auth()"); } - /* authorize open of an existing file */ - if ((fmode & O_CREAT) == 0) { - - /* disallow write operations on directories */ - if (vnode_isdir(vp) && (fmode & (FWRITE | O_TRUNC))) { - error = EISDIR; - goto bad; + /* + * Expect to use this code for filesystems without compound VNOPs, for the root + * of a filesystem, which can't be "looked up" in the sense of VNOP_LOOKUP(), + * and for shadow files, which do not live on the same filesystems as their "parents." + */ + if (need_vnop_open) { + if (batched && !vnode_isvroot(vp) && !vnode_isnamedstream(vp)) { + panic("Why am I trying to use VNOP_OPEN() on anything other than the root or a named stream?"); } -#if CONFIG_MACF - error = mac_vnode_check_open(ctx, vp, fmode); - if (error) - goto bad; -#endif - - /* compute action to be authorized */ - action = 0; - if (fmode & FREAD) { - action |= KAUTH_VNODE_READ_DATA; - } - if (fmode & (FWRITE | O_TRUNC)) { - /* - * If we are writing, appending, and not truncating, - * indicate that we are appending so that if the - * UF_APPEND or SF_APPEND bits are set, we do not deny - * the open. - */ - if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) { - action |= KAUTH_VNODE_APPEND_DATA; - } else { - action |= KAUTH_VNODE_WRITE_DATA; + if (!did_create) { + error = vn_authorize_open_existing(vp, &ndp->ni_cnd, fmode, ctx, NULL); + if (error) { + goto bad; } } - if ((error = vnode_authorize(vp, NULL, action, ctx)) != 0) - goto bad; - - // - // if the vnode is tagged VOPENEVT and the current process - // has the P_CHECKOPENEVT flag set, then we or in the O_EVTONLY - // flag to the open mode so that this open won't count against - // the vnode when carbon delete() does a vnode_isinuse() to see - // if a file is currently in use. this allows spotlight - // importers to not interfere with carbon apps that depend on - // the no-delete-if-busy semantics of carbon delete(). 
- // - if ((vp->v_flag & VOPENEVT) && (current_proc()->p_flag & P_CHECKOPENEVT)) { - fmode |= O_EVTONLY; + error = VNOP_OPEN(vp, fmode, ctx); + if (error) { + goto bad; } + need_vnop_open = FALSE; + } + // if the vnode is tagged VOPENEVT and the current process + // has the P_CHECKOPENEVT flag set, then we or in the O_EVTONLY + // flag to the open mode so that this open won't count against + // the vnode when carbon delete() does a vnode_isinuse() to see + // if a file is currently in use. this allows spotlight + // importers to not interfere with carbon apps that depend on + // the no-delete-if-busy semantics of carbon delete(). + // + if (!did_create && (vp->v_flag & VOPENEVT) && (current_proc()->p_flag & P_CHECKOPENEVT)) { + fmode |= O_EVTONLY; } - if ( (error = VNOP_OPEN(vp, fmode, ctx)) ) { + /* + * Grab reference, etc. + */ + error = vn_open_auth_finish(vp, fmode, ctx); + if (error) { + ref_failed = TRUE; goto bad; } - if ( (error = vnode_ref_ext(vp, fmode)) ) { - goto bad2; - } - /* call out to allow 3rd party notification of open. - * Ignore result of kauth_authorize_fileop call. - */ - kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_OPEN, - (uintptr_t)vp, 0); + /* Compound VNOP open is responsible for doing the truncate */ + if (batched || did_create) + fmode &= ~O_TRUNC; *fmodep = fmode; return (0); -bad2: - VNOP_CLOSE(vp, fmode, ctx); + bad: + /* Opened either explicitly or by a batched create */ + if (!need_vnop_open) { + VNOP_CLOSE(vp, fmode, ctx); + } + ndp->ni_vp = NULL; if (vp) { #if NAMEDRSRCFORK @@ -459,10 +614,11 @@ vn_open_auth(struct nameidata *ndp, int *fmodep, struct vnode_attr *vap) * * EREDRIVEOPEN: means that we were hit by the tty allocation race. 
*/ - if (((error == ENOENT) && (*fmodep & O_CREAT)) || (error == EREDRIVEOPEN)) { + if (((error == ENOENT) && (*fmodep & O_CREAT)) || (error == EREDRIVEOPEN) || ref_failed) { goto again; } } + out: return (error); } @@ -502,16 +658,6 @@ vn_close(struct vnode *vp, int flags, vfs_context_t ctx) { int error; -#if CONFIG_FSE - if (flags & FWASWRITTEN) { - if (need_fsevent(FSE_CONTENT_MODIFIED, vp)) { - add_fsevent(FSE_CONTENT_MODIFIED, ctx, - FSE_ARG_VNODE, vp, - FSE_ARG_DONE); - } - } -#endif - #if NAMEDRSRCFORK /* Sync data from resource fork shadow file if needed. */ if ((vp->v_flag & VISNAMEDSTREAM) && @@ -529,6 +675,16 @@ vn_close(struct vnode *vp, int flags, vfs_context_t ctx) error = VNOP_CLOSE(vp, flags, ctx); +#if CONFIG_FSE + if (flags & FWASWRITTEN) { + if (need_fsevent(FSE_CONTENT_MODIFIED, vp)) { + add_fsevent(FSE_CONTENT_MODIFIED, ctx, + FSE_ARG_VNODE, vp, + FSE_ARG_DONE); + } + } +#endif + if (!vnode_isspec(vp)) (void)vnode_rele_ext(vp, flags, 0); @@ -782,6 +938,9 @@ vn_write(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx) ioflag |= IO_NDELAY; if ((fp->f_fglob->fg_flag & FNOCACHE) || vnode_isnocache(vp)) ioflag |= IO_NOCACHE; + if (fp->f_fglob->fg_flag & FNODIRECT) + ioflag |= IO_NODIRECT; + /* * Treat synchronous mounts and O_FSYNC on the fd as equivalent. 
* @@ -996,7 +1155,7 @@ vn_stat_noauth(struct vnode *vp, void *sbptr, kauth_filesec_t *xsec, int isstat6 sb->st_blocks = roundup(va.va_total_alloc, 512) / 512; } - /* if we're interested in exended security data and we got an ACL */ + /* if we're interested in extended security data and we got an ACL */ if (xsec != NULL) { if (!VATTR_IS_SUPPORTED(&va, va_acl) && !VATTR_IS_SUPPORTED(&va, va_uuuid) && @@ -1147,7 +1306,10 @@ vn_ioctl(struct fileproc *fp, u_long com, caddr_t data, vfs_context_t ctx) error = VNOP_IOCTL(vp, com, data, fp->f_fglob->fg_flag, ctx); if (error == 0 && com == TIOCSCTTY) { - vnode_ref(vp); + error = vnode_ref_ext(vp, 0, VNODE_REF_FORCE); + if (error != 0) { + panic("vnode_ref_ext() failed despite VNODE_REF_FORCE?!"); + } funnel_state = thread_funnel_set(kernel_flock, TRUE); sessp = proc_session(vfs_context_proc(ctx)); @@ -1235,6 +1397,7 @@ int vn_pathconf(vnode_t vp, int name, int32_t *retval, vfs_context_t ctx) { int error = 0; + struct vfs_attr vfa; switch(name) { case _PC_EXTENDED_SECURITY_NP: @@ -1273,6 +1436,33 @@ vn_pathconf(vnode_t vp, int name, int32_t *retval, vfs_context_t ctx) case _PC_SYNC_IO: /* unistd.h: _POSIX_SYNCHRONIZED_IO */ *retval = 0; /* [SIO] option is not supported */ break; + case _PC_XATTR_SIZE_BITS: + /* The number of bits used to store maximum extended + * attribute size in bytes. For example, if the maximum + * attribute size supported by a file system is 128K, the + * value returned will be 18. However a value 18 can mean + * that the maximum attribute size can be anywhere from + * (256KB - 1) to 128KB. As a special case, the resource + * fork can have much larger size, and some file system + * specific extended attributes can have smaller and preset + * size; for example, Finder Info is always 32 bytes. 
+ */ + memset(&vfa, 0, sizeof(vfa)); + VFSATTR_INIT(&vfa); + VFSATTR_WANTED(&vfa, f_capabilities); + if (vfs_getattr(vnode_mount(vp), &vfa, ctx) == 0 && + (VFSATTR_IS_SUPPORTED(&vfa, f_capabilities)) && + (vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) && + (vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) { + /* Supports native extended attributes */ + error = VNOP_PATHCONF(vp, name, retval, ctx); + } else { + /* Number of bits used to represent the maximum size of + * extended attribute stored in an Apple Double file. + */ + *retval = AD_XATTR_SIZE_BITS; + } + break; default: error = VNOP_PATHCONF(vp, name, retval, ctx); break; @@ -1303,7 +1493,7 @@ vn_kqfilt_add(struct fileproc *fp, struct knote *kn, vfs_context_t ctx) } } else if (!vnode_isreg(vp)) { - if (vnode_isspec(vp) && + if (vnode_ischr(vp) && (error = spec_kqfilter(vp, kn)) == 0) { /* claimed by a special device */ vnode_put(vp); @@ -1447,18 +1637,22 @@ vnode_writable_space_count(vnode_t vp) static int filt_vnode(struct knote *kn, long hint) { - struct vnode *vp = (struct vnode *)kn->kn_hook; + vnode_t vp = (struct vnode *)kn->kn_hook; int activate = 0; + long orig_hint = hint; if (0 == hint) { - if ((vnode_getwithvid(vp, kn->kn_hookid) != 0)) { + vnode_lock(vp); + + if (vnode_getiocount(vp, kn->kn_hookid, VNODE_NODEAD | VNODE_WITHID) != 0) { + /* Is recycled */ hint = NOTE_REVOKE; - } else { - vnode_put(vp); - } - } + } + } else { + lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED); + } - /* NOTE_REVOKE is special, as it is only sent during vnode reclaim */ + /* Special handling for vnodes that are in recycle or already gone */ if (NOTE_REVOKE == hint) { kn->kn_flags |= (EV_EOF | EV_ONESHOT); activate = 1; @@ -1496,5 +1690,15 @@ filt_vnode(struct knote *kn, long hint) } } + if (orig_hint == 0) { + /* + * Definitely need to unlock, may need to put + */ + if (hint == 0) { + vnode_put_locked(vp); + } + vnode_unlock(vp); + } + 
return (activate); } diff --git a/bsd/vfs/vfs_xattr.c b/bsd/vfs/vfs_xattr.c index d15711685..a37ba0f74 100644 --- a/bsd/vfs/vfs_xattr.c +++ b/bsd/vfs/vfs_xattr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2008 Apple Inc. All rights reserved. + * Copyright (c) 2004-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -65,8 +65,6 @@ #define MAKE_SHADOW_NAME(VP, NAME) \ snprintf((NAME), sizeof((NAME)), ".vfs_rsrc_stream_%p%08x%p", (void*)(VP), (VP)->v_id, (VP)->v_data); -static vnode_t shadow_dvp; /* tmp directory to hold stream shadow files */ -static int shadow_vid; static int shadow_sequence; @@ -556,7 +554,7 @@ vnode_flushnamedstream(vnode_t vp, vnode_t svp, vfs_context_t context) return (0); } datasize = va.va_data_size; - if ((datasize == 0)) { + if (datasize == 0) { (void) default_removexattr(vp, XATTR_RESOURCEFORK_NAME, 0, context); return (0); } @@ -623,9 +621,10 @@ getshadowfile(vnode_t vp, vnode_t *svpp, int makestream, size_t *rsrcsize, char tmpname[80]; size_t datasize = 0; int error = 0; + int retries = 0; +retry_create: *creator = 0; - /* Establish a unique file name. */ MAKE_SHADOW_NAME(vp, tmpname); bzero(&cn, sizeof(cn)); @@ -705,9 +704,32 @@ getshadowfile(vnode_t vp, vnode_t *svpp, int makestream, size_t *rsrcsize, if (error == 0) { vnode_recycle(svp); *creator = 1; - } else if ((error == EEXIST) && !makestream) { + } + else if ((error == EEXIST) && !makestream) { error = VNOP_LOOKUP(dvp, &svp, &cn, context); } + else if ((error == ENOENT) && !makestream) { + /* + * We could have raced with a rmdir on the shadow directory + * post-lookup. Retry from the beginning, 1x only, to + * try and see if we need to re-create the shadow directory + * in get_shadow_dir. 
+ */ + if (retries == 0) { + retries++; + if (dvp) { + vnode_put (dvp); + dvp = NULLVP; + } + if (svp) { + vnode_put (svp); + svp = NULLVP; + } + goto retry_create; + } + /* Otherwise, just error out normally below */ + } + out: if (dvp) { vnode_put(dvp); @@ -936,15 +958,27 @@ get_shadow_dir(vnode_t *sdvpp, vfs_context_t context) uint32_t tmp_fsid; int error; - /* Check if we've already created it. */ - if (shadow_dvp != NULLVP) { - if ((error = vnode_getwithvid(shadow_dvp, shadow_vid))) { - shadow_dvp = NULLVP; - } else { - *sdvpp = shadow_dvp; - return (0); - } + + bzero(tmpname, sizeof(tmpname)); + snprintf(tmpname, sizeof(tmpname), "/var/run/.vfs_rsrc_streams_%p%x", + (void*)rootvnode, shadow_sequence); + /* + * Look up the shadow directory to ensure that it still exists. + * By looking it up, we get an iocounted dvp to use, and avoid some coherency issues + * in caching it when multiple threads may be trying to manipulate the pointers. + */ + error = vnode_lookup(tmpname, 0, &sdvp, context); + if (error == 0) { + /* + * If we get here, then we have successfully looked up the shadow dir, + * and it has an iocount from the lookup. Return the vp in the output argument. + */ + *sdvpp = sdvp; + return (0); } + /* In the failure case, no iocount is acquired */ + sdvp = NULLVP; + bzero (tmpname, sizeof(tmpname)); /* Obtain the vnode for "/var/run" directory. */ if (vnode_lookup("/var/run", 0, &dvp, context) != 0) { @@ -980,14 +1014,7 @@ get_shadow_dir(vnode_t *sdvpp, vfs_context_t context) /* * There can be only one winner for an exclusive create. */ - if (error == 0) { - /* Take a long term ref to keep this dir around. 
*/ - error = vnode_ref(sdvp); - if (error == 0) { - shadow_dvp = sdvp; - shadow_vid = sdvp->v_id; - } - } else if (error == EEXIST) { + if (error == EEXIST) { /* loser has to look up directory */ error = VNOP_LOOKUP(dvp, &sdvp, &cn, context); if (error == 0) { @@ -995,7 +1022,7 @@ get_shadow_dir(vnode_t *sdvpp, vfs_context_t context) if (sdvp->v_type != VDIR) { goto baddir; } - /* Obtain the fsid for /var/run directory */ + /* Obtain the fsid for /tmp directory */ VATTR_INIT(&va); VATTR_WANTED(&va, va_fsid); if (VNOP_GETATTR(dvp, &va, context) != 0 || @@ -1156,7 +1183,7 @@ get_shadow_dir(vnode_t *sdvpp, vfs_context_t context) #define ATTR_BUF_SIZE 4096 /* default size of the attr file and how much we'll grow by */ /* Implementation Limits */ -#define ATTR_MAX_SIZE (128*1024) /* 128K maximum attribute data size */ +#define ATTR_MAX_SIZE AD_XATTR_MAXSIZE #define ATTR_MAX_HDR_SIZE 65536 /* * Note: ATTR_MAX_HDR_SIZE is the largest attribute header @@ -2347,12 +2374,15 @@ open_xattrfile(vnode_t vp, int fileflags, vnode_t *xvpp, vfs_context_t context) * file security from the EA must always get access */ lookup: - NDINIT(&nd, LOOKUP, LOCKLEAF | NOFOLLOW | USEDVP | DONOTAUTH, UIO_SYSSPACE, - CAST_USER_ADDR_T(filename), context); + NDINIT(&nd, LOOKUP, OP_OPEN, LOCKLEAF | NOFOLLOW | USEDVP | DONOTAUTH, + UIO_SYSSPACE, CAST_USER_ADDR_T(filename), context); nd.ni_dvp = dvp; if (fileflags & O_CREAT) { nd.ni_cnd.cn_nameiop = CREATE; +#if CONFIG_TRIGGERS + nd.ni_op = OP_LINK; +#endif if (dvp != vp) { nd.ni_cnd.cn_flags |= LOCKPARENT; } @@ -2394,8 +2424,9 @@ open_xattrfile(vnode_t vp, int fileflags, vnode_t *xvpp, vfs_context_t context) if (gid != KAUTH_GID_NONE) VATTR_SET(&va, va_gid, gid); - error = vn_create(dvp, &nd.ni_vp, &nd.ni_cnd, &va, + error = vn_create(dvp, &nd.ni_vp, &nd, &va, VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT | VN_CREATE_NOLABEL, + 0, NULL, context); if (error) error = ENOATTR; @@ -2544,7 +2575,7 @@ remove_xattrfile(vnode_t xvp, vfs_context_t context) return 
(error); } - NDINIT(&nd, DELETE, LOCKPARENT | NOFOLLOW | DONOTAUTH, + NDINIT(&nd, DELETE, OP_UNLINK, LOCKPARENT | NOFOLLOW | DONOTAUTH, UIO_SYSSPACE, CAST_USER_ADDR_T(path), context); error = namei(&nd); FREE_ZONE(path, MAXPATHLEN, M_NAMEI); diff --git a/bsd/vfs/vnode_if.c b/bsd/vfs/vnode_if.c index 1a77414e2..6dd63bfde 100644 --- a/bsd/vfs/vnode_if.c +++ b/bsd/vfs/vnode_if.c @@ -106,6 +106,24 @@ struct vnodeop_desc vnop_lookup_desc = { NULL }; +int vnop_compound_open_vp_offsets[] = { + VOPARG_OFFSETOF(struct vnop_compound_open_args, a_dvp), + VDESC_NO_OFFSET +}; + +struct vnodeop_desc vnop_compound_open_desc = { + 0, + "vnop_compound_open", + 0 | VDESC_VP0_WILLRELE, + vnop_compound_open_vp_offsets, + VOPARG_OFFSETOF(struct vnop_compound_open_args, a_vpp), + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VOPARG_OFFSETOF(struct vnop_compound_open_args, a_cnp), + VOPARG_OFFSETOF(struct vnop_compound_open_args, a_context), + NULL +}; + int vnop_create_vp_offsets[] = { VOPARG_OFFSETOF(struct vnop_create_args,a_dvp), VDESC_NO_OFFSET @@ -485,6 +503,23 @@ struct vnodeop_desc vnop_remove_desc = { NULL }; +int vnop_remove_extended_vp_offsets[] = { + VOPARG_OFFSETOF(struct vnop_remove_args,a_dvp), + VDESC_NO_OFFSET +}; +struct vnodeop_desc vnop_compound_remove_desc = { + 0, + "vnop_compound_remove", + 0, + vnop_remove_vp_offsets, + VOPARG_OFFSETOF(struct vnop_compound_remove_args, a_vpp), + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VOPARG_OFFSETOF(struct vnop_remove_args, a_cnp), + VOPARG_OFFSETOF(struct vnop_remove_args, a_context), + NULL +}; + int vnop_link_vp_offsets[] = { VOPARG_OFFSETOF(struct vnop_link_args,a_vp), VOPARG_OFFSETOF(struct vnop_link_args,a_tdvp), @@ -523,6 +558,26 @@ struct vnodeop_desc vnop_rename_desc = { NULL }; +int vnop_compound_rename_vp_offsets[] = { + VOPARG_OFFSETOF(struct vnop_compound_rename_args,a_fdvp), + VOPARG_OFFSETOF(struct vnop_compound_rename_args,a_fvpp), + VOPARG_OFFSETOF(struct vnop_compound_rename_args,a_tdvp), + VOPARG_OFFSETOF(struct 
vnop_compound_rename_args,a_tvpp), + VDESC_NO_OFFSET +}; +struct vnodeop_desc vnop_compound_rename_desc = { + 0, + "vnop_compound_rename", + 0 | VDESC_VP0_WILLRELE | VDESC_VP1_WILLRELE | VDESC_VP2_WILLRELE | VDESC_VP3_WILLRELE, + vnop_compound_rename_vp_offsets, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VOPARG_OFFSETOF(struct vnop_compound_rename_args, a_fcnp), + VOPARG_OFFSETOF(struct vnop_compound_rename_args, a_context), + NULL +}; + int vnop_mkdir_vp_offsets[] = { VOPARG_OFFSETOF(struct vnop_mkdir_args,a_dvp), VDESC_NO_OFFSET @@ -540,6 +595,24 @@ struct vnodeop_desc vnop_mkdir_desc = { NULL }; +int vnop_compound_mkdir_vp_offsets[] = { + VOPARG_OFFSETOF(struct vnop_compound_mkdir_args,a_dvp), + VDESC_NO_OFFSET +}; +struct vnodeop_desc vnop_compound_mkdir_desc = { + 0, + "vnop_compound_mkdir", + 0 | VDESC_VP0_WILLRELE, + vnop_compound_mkdir_vp_offsets, + VOPARG_OFFSETOF(struct vnop_compound_mkdir_args, a_vpp), + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VOPARG_OFFSETOF(struct vnop_compound_mkdir_args, a_cnp), + VOPARG_OFFSETOF(struct vnop_compound_mkdir_args, a_context), + NULL +}; + + int vnop_rmdir_vp_offsets[] = { VOPARG_OFFSETOF(struct vnop_rmdir_args,a_dvp), VOPARG_OFFSETOF(struct vnop_rmdir_args,a_vp), @@ -558,6 +631,23 @@ struct vnodeop_desc vnop_rmdir_desc = { NULL }; +int vnop_compound_rmdir_vp_offsets[] = { + VOPARG_OFFSETOF(struct vnop_compound_rmdir_args,a_dvp), + VDESC_NO_OFFSET +}; +struct vnodeop_desc vnop_compound_rmdir_desc = { + 0, + "vnop_compound_rmdir", + 0 | VDESC_VP0_WILLRELE | VDESC_VP1_WILLRELE, + vnop_rmdir_vp_offsets, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VOPARG_OFFSETOF(struct vnop_compound_rmdir_args, a_cnp), + VOPARG_OFFSETOF(struct vnop_compound_rmdir_args, a_context), + NULL +}; + int vnop_symlink_vp_offsets[] = { VOPARG_OFFSETOF(struct vnop_symlink_args,a_dvp), VDESC_NO_OFFSET @@ -1004,6 +1094,7 @@ struct vnodeop_desc *vfs_op_descs[] = { &vnop_mknod_desc, &vnop_whiteout_desc, &vnop_open_desc, + 
&vnop_compound_open_desc, &vnop_close_desc, &vnop_access_desc, &vnop_getattr_desc, @@ -1021,10 +1112,14 @@ struct vnodeop_desc *vfs_op_descs[] = { &vnop_mnomap_desc, &vnop_fsync_desc, &vnop_remove_desc, + &vnop_compound_remove_desc, &vnop_link_desc, &vnop_rename_desc, + &vnop_compound_rename_desc, &vnop_mkdir_desc, + &vnop_compound_mkdir_desc, &vnop_rmdir_desc, + &vnop_compound_rmdir_desc, &vnop_symlink_desc, &vnop_readdir_desc, &vnop_readdirattr_desc, diff --git a/bsd/vm/Makefile b/bsd/vm/Makefile index f0ce21745..608304077 100644 --- a/bsd/vm/Makefile +++ b/bsd/vm/Makefile @@ -10,14 +10,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ DATAFILES = \ diff --git a/bsd/vm/dp_backing_file.c b/bsd/vm/dp_backing_file.c index 420238db9..bb2808ecf 100644 --- a/bsd/vm/dp_backing_file.c +++ b/bsd/vm/dp_backing_file.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -42,8 +42,11 @@ #include #include #include -#include #include +#include +#if CONFIG_PROTECT +#include +#endif #include #include @@ -245,7 +248,7 @@ macx_swapon( /* * Get a vnode for the paging area. */ - NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, + NDINIT(ndp, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, ((IS_64BIT_PROCESS(p)) ? 
UIO_USERSPACE64 : UIO_USERSPACE32), (user_addr_t) args->filename, ctx); @@ -274,6 +277,18 @@ macx_swapon( if ((file_size < (off_t)size) && ((error = vnode_setsize(vp, (off_t)size, 0, ctx)) != 0)) goto swapon_bailout; +#if CONFIG_PROTECT + { + void *cnode = NULL; + /* initialize content protection keys manually */ + if ((cnode = cp_get_protected_cnode(vp)) != 0) { + if ((error = cp_handle_vnop(cnode, CP_WRITE_ACCESS)) != 0) + goto swapon_bailout; + } + } +#endif + + if (default_pager_init_flag == 0) { start_def_pager(NULL); default_pager_init_flag = 1; @@ -306,21 +321,23 @@ macx_swapon( goto swapon_bailout; } - if (vp->v_mount->mnt_kern_flag & MNTK_SSD) { +#if CONFIG_EMBEDDED + dp_cluster_size = 1 * PAGE_SIZE; +#else + if ((dp_isssd = vnode_pager_isSSD(vp)) == TRUE) { /* * keep the cluster size small since the * seek cost is effectively 0 which means * we don't care much about fragmentation */ - dp_isssd = TRUE; dp_cluster_size = 2 * PAGE_SIZE; } else { /* * use the default cluster size */ - dp_isssd = FALSE; dp_cluster_size = 0; } +#endif kr = default_pager_backing_store_create(default_pager, -1, /* default priority */ dp_cluster_size, @@ -379,6 +396,12 @@ macx_swapon( } (void) thread_funnel_set(kernel_flock, FALSE); AUDIT_MACH_SYSCALL_EXIT(error); + + if (error) + printf("macx_swapon FAILED - %d\n", error); + else + printf("macx_swapon SUCCESS\n"); + return(error); } @@ -402,6 +425,8 @@ macx_swapoff( int error; boolean_t funnel_state; vfs_context_t ctx = vfs_context_current(); + struct uthread *ut; + int orig_iopol_disk; AUDIT_MACH_SYSCALL_ENTER(AUE_SWAPOFF); @@ -415,7 +440,7 @@ macx_swapoff( /* * Get the vnode for the paging area. */ - NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, + NDINIT(ndp, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, ((IS_64BIT_PROCESS(p)) ? 
UIO_USERSPACE64 : UIO_USERSPACE32), (user_addr_t) args->filename, ctx); @@ -447,7 +472,24 @@ macx_swapoff( } backing_store = (mach_port_t)bs_port_table[i].bs; + ut = get_bsdthread_info(current_thread()); + +#if !CONFIG_EMBEDDED + orig_iopol_disk = proc_get_thread_selfdiskacc(); + proc_apply_thread_selfdiskacc(IOPOL_THROTTLE); +#else /* !CONFIG_EMBEDDED */ + orig_iopol_disk = ut->uu_iopol_disk; + ut->uu_iopol_disk = IOPOL_THROTTLE; +#endif /* !CONFIG_EMBEDDED */ + kr = default_pager_backing_store_delete(backing_store); + +#if !CONFIG_EMBEDDED + proc_apply_thread_selfdiskacc(orig_iopol_disk); +#else /* !CONFIG_EMBEDDED */ + ut->uu_iopol_disk = orig_iopol_disk; +#endif /* !CONFIG_EMBEDDED */ + switch (kr) { case KERN_SUCCESS: error = 0; @@ -476,6 +518,12 @@ macx_swapoff( (void) thread_funnel_set(kernel_flock, FALSE); AUDIT_MACH_SYSCALL_EXIT(error); + + if (error) + printf("macx_swapoff FAILED - %d\n", error); + else + printf("macx_swapoff SUCCESS\n"); + return(error); } diff --git a/bsd/vm/vm_unix.c b/bsd/vm/vm_unix.c index 369c91350..0190e70f7 100644 --- a/bsd/vm/vm_unix.c +++ b/bsd/vm/vm_unix.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -74,8 +75,11 @@ #include #include #include +#include +#include #include +#include #include #include @@ -90,6 +94,18 @@ #include +#if CONFIG_FREEZE +#include +#endif + + +int _shared_region_map( struct proc*, int, unsigned int, struct shared_file_mapping_np*, memory_object_control_t*, struct shared_file_mapping_np*); +int _shared_region_slide(uint32_t, mach_vm_offset_t, mach_vm_size_t, mach_vm_offset_t, mach_vm_size_t, memory_object_control_t); +int shared_region_copyin_mappings(struct proc*, user_addr_t, unsigned int, struct shared_file_mapping_np *); + +SYSCTL_INT(_vm, OID_AUTO, vm_debug_events, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_debug_events, 0, ""); + + /* * Sysctl's related to data/stack execution. See osfmk/vm/vm_map.c */ @@ -97,8 +113,8 @@ #ifndef SECURE_KERNEL extern int allow_stack_exec, allow_data_exec; -SYSCTL_INT(_vm, OID_AUTO, allow_stack_exec, CTLFLAG_RW, &allow_stack_exec, 0, ""); -SYSCTL_INT(_vm, OID_AUTO, allow_data_exec, CTLFLAG_RW, &allow_data_exec, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, allow_stack_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_stack_exec, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, allow_data_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_data_exec, 0, ""); #endif /* !SECURE_KERNEL */ static const char *prot_values[] = { @@ -121,7 +137,7 @@ log_stack_execution_failure(addr64_t vaddr, vm_prot_t prot) int shared_region_unnest_logging = 1; -SYSCTL_INT(_vm, OID_AUTO, shared_region_unnest_logging, CTLFLAG_RW, +SYSCTL_INT(_vm, OID_AUTO, shared_region_unnest_logging, CTLFLAG_RW | CTLFLAG_LOCKED, &shared_region_unnest_logging, 0, ""); int vm_shared_region_unnest_log_interval = 10; @@ -486,8 +502,8 @@ task_for_pid_posix_check(proc_t target) /* Do target's ruid, euid, and saved uid match my euid? 
*/ if ((kauth_cred_getuid(targetcred) != myuid) || - (targetcred->cr_ruid != myuid) || - (targetcred->cr_svuid != myuid)) { + (kauth_cred_getruid(targetcred) != myuid) || + (kauth_cred_getsvuid(targetcred) != myuid)) { allowed = FALSE; goto out; } @@ -600,6 +616,8 @@ task_for_pid( /* Grant task port access */ task_reference(p->task); + extmod_statistics_incr_task_for_pid(p->task); + sright = (void *) convert_task_to_port(p->task); tret = ipc_port_copyout_send( sright, @@ -664,7 +682,7 @@ task_name_for_pid( && ((current_proc() == p) || kauth_cred_issuser(kauth_cred_get()) || ((kauth_cred_getuid(target_cred) == kauth_cred_getuid(kauth_cred_get())) && - ((target_cred->cr_ruid == kauth_cred_get()->cr_ruid))))) { + ((kauth_cred_getruid(target_cred) == kauth_getruid()))))) { if (p->task != TASK_NULL) { task_reference(p->task); @@ -714,21 +732,21 @@ pid_suspend(struct proc *p __unused, struct pid_suspend_args *args, int *ret) int error = 0; #if CONFIG_MACF - error = mac_proc_check_suspend_resume(p, 0); /* 0 for suspend */ + error = mac_proc_check_suspend_resume(p, MAC_PROC_CHECK_SUSPEND); if (error) { - error = KERN_FAILURE; + error = EPERM; goto out; } #endif if (pid == 0) { - error = KERN_FAILURE; + error = EPERM; goto out; } targetproc = proc_find(pid); if (!task_for_pid_posix_check(targetproc)) { - error = KERN_FAILURE; + error = EPERM; goto out; } @@ -744,7 +762,7 @@ pid_suspend(struct proc *p __unused, struct pid_suspend_args *args, int *ret) (tfpport != IPC_PORT_NULL)) { if (tfpport == IPC_PORT_DEAD) { - error = KERN_PROTECTION_FAILURE; + error = EACCES; goto out; } @@ -753,9 +771,9 @@ pid_suspend(struct proc *p __unused, struct pid_suspend_args *args, int *ret) if (error != MACH_MSG_SUCCESS) { if (error == MACH_RCV_INTERRUPTED) - error = KERN_ABORTED; + error = EINTR; else - error = KERN_FAILURE; + error = EPERM; goto out; } } @@ -764,8 +782,19 @@ pid_suspend(struct proc *p __unused, struct pid_suspend_args *args, int *ret) task_reference(target); error = 
task_suspend(target); + if (error) { + if (error == KERN_INVALID_ARGUMENT) { + error = EINVAL; + } else { + error = EPERM; + } + } task_deallocate(target); +#if CONFIG_FREEZE + kern_hibernation_on_pid_suspend(pid); +#endif + out: if (targetproc != PROC_NULL) proc_rele(targetproc); @@ -782,21 +811,21 @@ pid_resume(struct proc *p __unused, struct pid_resume_args *args, int *ret) int error = 0; #if CONFIG_MACF - error = mac_proc_check_suspend_resume(p, 1); /* 1 for resume */ + error = mac_proc_check_suspend_resume(p, MAC_PROC_CHECK_RESUME); if (error) { - error = KERN_FAILURE; + error = EPERM; goto out; } #endif if (pid == 0) { - error = KERN_FAILURE; + error = EPERM; goto out; } targetproc = proc_find(pid); if (!task_for_pid_posix_check(targetproc)) { - error = KERN_FAILURE; + error = EPERM; goto out; } @@ -812,7 +841,7 @@ pid_resume(struct proc *p __unused, struct pid_resume_args *args, int *ret) (tfpport != IPC_PORT_NULL)) { if (tfpport == IPC_PORT_DEAD) { - error = KERN_PROTECTION_FAILURE; + error = EACCES; goto out; } @@ -821,9 +850,9 @@ pid_resume(struct proc *p __unused, struct pid_resume_args *args, int *ret) if (error != MACH_MSG_SUCCESS) { if (error == MACH_RCV_INTERRUPTED) - error = KERN_ABORTED; + error = EINTR; else - error = KERN_FAILURE; + error = EPERM; goto out; } } @@ -831,7 +860,19 @@ pid_resume(struct proc *p __unused, struct pid_resume_args *args, int *ret) #endif task_reference(target); + +#if CONFIG_FREEZE + kern_hibernation_on_pid_resume(pid, target); +#endif + error = task_resume(target); + if (error) { + if (error == KERN_INVALID_ARGUMENT) { + error = EINVAL; + } else { + error = EPERM; + } + } task_deallocate(target); out: @@ -843,6 +884,118 @@ pid_resume(struct proc *p __unused, struct pid_resume_args *args, int *ret) return 0; } +#if CONFIG_EMBEDDED +kern_return_t +pid_hibernate(struct proc *p __unused, struct pid_hibernate_args *args, int *ret) +{ + int error = 0; + proc_t targetproc = PROC_NULL; + int pid = args->pid; + +#ifndef 
CONFIG_FREEZE + #pragma unused(pid) +#else + +#if CONFIG_MACF + error = mac_proc_check_suspend_resume(p, MAC_PROC_CHECK_HIBERNATE); + if (error) { + error = EPERM; + goto out; + } +#endif + + /* + * The only accepted pid value here is currently -1, since we just kick off the hibernation thread + * here - individual ids aren't required. However, it's intended that that this call is to change + * in the future to initiate hibernation of individual processes. In anticipation, we'll obtain the + * process handle for potentially valid values and call task_for_pid_posix_check(); this way, everything + * is validated correctly and set for further refactoring. See for more details. + */ + if (pid >= 0) { + targetproc = proc_find(pid); + if (!task_for_pid_posix_check(targetproc)) { + error = EPERM; + goto out; + } + } + + if (pid == -1) { + kern_hibernation_on_pid_hibernate(pid); + } else { + error = EPERM; + } + +out: + +#endif /* CONFIG_FREEZE */ + + if (targetproc != PROC_NULL) + proc_rele(targetproc); + *ret = error; + return error; +} + +int +pid_shutdown_sockets(struct proc *p __unused, struct pid_shutdown_sockets_args *args, int *ret) +{ + int error = 0; + proc_t targetproc = PROC_NULL; + struct filedesc *fdp; + struct fileproc *fp; + int pid = args->pid; + int level = args->level; + int i; + + if (level != SHUTDOWN_SOCKET_LEVEL_DISCONNECT_SVC && + level != SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL) + { + error = EINVAL; + goto out; + } + +#if CONFIG_MACF + error = mac_proc_check_suspend_resume(p, MAC_PROC_CHECK_SHUTDOWN_SOCKETS); + if (error) { + error = EPERM; + goto out; + } +#endif + + targetproc = proc_find(pid); + if (!task_for_pid_posix_check(targetproc)) { + error = EPERM; + goto out; + } + + proc_fdlock(targetproc); + fdp = targetproc->p_fd; + + for (i = 0; i < fdp->fd_nfiles; i++) { + struct socket *sockp; + + fp = fdp->fd_ofiles[i]; + if (fp == NULL || (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 || + fp->f_fglob->fg_type != DTYPE_SOCKET) + { + continue; + } + + 
sockp = (struct socket *)fp->f_fglob->fg_data; + + /* Call networking stack with socket and level */ + (void) socket_defunct(targetproc, sockp, level); + } + + proc_fdunlock(targetproc); + +out: + if (targetproc != PROC_NULL) + proc_rele(targetproc); + *ret = error; + return error; +} +#endif /* CONFIG_EMBEDDED */ + static int sysctl_settfp_policy(__unused struct sysctl_oid *oidp, void *arg1, __unused int arg2, struct sysctl_req *req) @@ -876,17 +1029,17 @@ static int kern_secure_kernel = 1; static int kern_secure_kernel = 0; #endif -SYSCTL_INT(_kern, OID_AUTO, secure_kernel, CTLFLAG_RD, &kern_secure_kernel, 0, ""); +SYSCTL_INT(_kern, OID_AUTO, secure_kernel, CTLFLAG_RD | CTLFLAG_LOCKED, &kern_secure_kernel, 0, ""); -SYSCTL_NODE(_kern, KERN_TFP, tfp, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "tfp"); -SYSCTL_PROC(_kern_tfp, KERN_TFP_POLICY, policy, CTLTYPE_INT | CTLFLAG_RW, +SYSCTL_NODE(_kern, KERN_TFP, tfp, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "tfp"); +SYSCTL_PROC(_kern_tfp, KERN_TFP_POLICY, policy, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tfp_policy, sizeof(uint32_t), &sysctl_settfp_policy ,"I","policy"); -SYSCTL_INT(_vm, OID_AUTO, shared_region_trace_level, CTLFLAG_RW, +SYSCTL_INT(_vm, OID_AUTO, shared_region_trace_level, CTLFLAG_RW | CTLFLAG_LOCKED, &shared_region_trace_level, 0, ""); -SYSCTL_INT(_vm, OID_AUTO, shared_region_version, CTLFLAG_RD, +SYSCTL_INT(_vm, OID_AUTO, shared_region_version, CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_version, 0, ""); -SYSCTL_INT(_vm, OID_AUTO, shared_region_persistence, CTLFLAG_RW, +SYSCTL_INT(_vm, OID_AUTO, shared_region_persistence, CTLFLAG_RW | CTLFLAG_LOCKED, &shared_region_persistence, 0, ""); /* @@ -968,6 +1121,31 @@ shared_region_check_np( return error; } + +int +shared_region_copyin_mappings( + struct proc *p, + user_addr_t user_mappings, + unsigned int mappings_count, + struct shared_file_mapping_np *mappings) +{ + int error = 0; + vm_size_t mappings_size = 0; + + /* get the list of mappings the caller wants us to establish */ + 
mappings_size = (vm_size_t) (mappings_count * sizeof (mappings[0])); + error = copyin(user_mappings, + mappings, + mappings_size); + if (error) { + SHARED_REGION_TRACE_ERROR( + ("shared_region: %p [%d(%s)] map(): " + "copyin(0x%llx, %d) failed (error=%d)\n", + current_thread(), p->p_pid, p->p_comm, + (uint64_t)user_mappings, mappings_count, error)); + } + return error; +} /* * shared_region_map_np() * @@ -979,25 +1157,22 @@ shared_region_check_np( * requiring any further setup. */ int -shared_region_map_np( +_shared_region_map( struct proc *p, - struct shared_region_map_np_args *uap, - __unused int *retvalp) + int fd, + uint32_t mappings_count, + struct shared_file_mapping_np *mappings, + memory_object_control_t *sr_file_control, + struct shared_file_mapping_np *mapping_to_slide) { int error; kern_return_t kr; - int fd; struct fileproc *fp; struct vnode *vp, *root_vp; struct vnode_attr va; off_t fs; memory_object_size_t file_size; - user_addr_t user_mappings; - struct shared_file_mapping_np *mappings; -#define SFM_MAX_STACK 8 - struct shared_file_mapping_np stack_mappings[SFM_MAX_STACK]; - unsigned int mappings_count; - vm_size_t mappings_size; + vm_prot_t maxprot = VM_PROT_ALL; memory_object_control_t file_control; struct vm_shared_region *shared_region; @@ -1006,15 +1181,9 @@ shared_region_map_np( current_thread(), p->p_pid, p->p_comm)); shared_region = NULL; - mappings_count = 0; - mappings_size = 0; - mappings = NULL; fp = NULL; vp = NULL; - /* get file descriptor for shared region cache file */ - fd = uap->fd; - /* get file structure from file descriptor */ error = fp_lookup(p, fd, &fp, 0); if (error) { @@ -1068,11 +1237,38 @@ shared_region_map_np( goto done; } +#if CONFIG_MACF + error = mac_file_check_mmap(vfs_context_ucred(vfs_context_current()), + fp->f_fglob, VM_PROT_ALL, MAP_FILE, &maxprot); + if (error) { + goto done; + } +#endif /* MAC */ + +#if CONFIG_PROTECT + /* check for content protection access */ + { + void *cnode; + if ((cnode = 
cp_get_protected_cnode(vp)) != NULL) { + error = cp_handle_vnop(cnode, CP_READ_ACCESS | CP_WRITE_ACCESS); + if (error) + goto done; + } + } +#endif /* CONFIG_PROTECT */ + /* make sure vnode is on the process's root volume */ root_vp = p->p_fd->fd_rdir; if (root_vp == NULL) { root_vp = rootvnode; + } else { + /* + * Chroot-ed processes can't use the shared_region. + */ + error = EINVAL; + goto done; } + if (vp->v_mount != root_vp->v_mount) { SHARED_REGION_TRACE_ERROR( ("shared_region: %p [%d(%s)] map(%p:'%s'): " @@ -1128,42 +1324,12 @@ shared_region_map_np( error = EINVAL; goto done; } - - /* get the list of mappings the caller wants us to establish */ - mappings_count = uap->count; /* number of mappings */ - mappings_size = (vm_size_t) (mappings_count * sizeof (mappings[0])); - if (mappings_count == 0) { - SHARED_REGION_TRACE_INFO( - ("shared_region: %p [%d(%s)] map(%p:'%s'): " - "no mappings\n", - current_thread(), p->p_pid, p->p_comm, - vp, vp->v_name)); - error = 0; /* no mappings: we're done ! 
*/ - goto done; - } else if (mappings_count <= SFM_MAX_STACK) { - mappings = &stack_mappings[0]; - } else { - SHARED_REGION_TRACE_ERROR( - ("shared_region: %p [%d(%s)] map(%p:'%s'): " - "too many mappings (%d)\n", - current_thread(), p->p_pid, p->p_comm, - vp, vp->v_name, mappings_count)); - error = EINVAL; - goto done; - } - user_mappings = uap->mappings; /* the mappings, in user space */ - error = copyin(user_mappings, - mappings, - mappings_size); - if (error) { - SHARED_REGION_TRACE_ERROR( - ("shared_region: %p [%d(%s)] map(%p:'%s'): " - "copyin(0x%llx, %d) failed (error=%d)\n", - current_thread(), p->p_pid, p->p_comm, - vp, vp->v_name, (uint64_t)user_mappings, mappings_count, error)); - goto done; + if (sr_file_control != NULL) { + *sr_file_control = file_control; } + + /* get the process's shared region (setup in vm_map_exec()) */ shared_region = vm_shared_region_get(current_task()); @@ -1182,7 +1348,8 @@ shared_region_map_np( mappings, file_control, file_size, - (void *) p->p_fd->fd_rdir); + (void *) p->p_fd->fd_rdir, + mapping_to_slide); if (kr != KERN_SUCCESS) { SHARED_REGION_TRACE_ERROR( ("shared_region: %p [%d(%s)] map(%p:'%s'): " @@ -1210,6 +1377,12 @@ shared_region_map_np( error = 0; + vnode_lock_spin(vp); + + vp->v_flag |= VSHARED_DYLD; + + vnode_unlock(vp); + /* update the vnode's access time */ if (! 
(vnode_vfsvisflags(vp) & MNT_NOATIME)) { VATTR_INIT(&va); @@ -1249,6 +1422,126 @@ shared_region_map_np( return error; } +int +_shared_region_slide(uint32_t slide, + mach_vm_offset_t entry_start_address, + mach_vm_size_t entry_size, + mach_vm_offset_t slide_start, + mach_vm_size_t slide_size, + memory_object_control_t sr_file_control) +{ + void *slide_info_entry = NULL; + int error; + + if((error = vm_shared_region_slide_init(slide_size, entry_start_address, entry_size, slide, sr_file_control))) { + printf("slide_info initialization failed with kr=%d\n", error); + goto done; + } + + slide_info_entry = vm_shared_region_get_slide_info_entry(); + if (slide_info_entry == NULL){ + error = EFAULT; + } else { + error = copyin(slide_start, + slide_info_entry, + (vm_size_t)slide_size); + } + if (error) { + goto done; + } + + if (vm_shared_region_slide_sanity_check() != KERN_SUCCESS) { + error = EFAULT; + printf("Sanity Check failed for slide_info\n"); + } else { +#if DEBUG + printf("Succesfully init slide_info with start_address: %p region_size: %ld slide_header_size: %ld\n", + (void*)(uintptr_t)entry_start_address, + (unsigned long)entry_size, + (unsigned long)slide_size); +#endif + } +done: + return error; +} + +int +shared_region_map_and_slide_np( + struct proc *p, + struct shared_region_map_and_slide_np_args *uap, + __unused int *retvalp) +{ + struct shared_file_mapping_np mapping_to_slide; + struct shared_file_mapping_np *mappings; + unsigned int mappings_count = uap->count; + + memory_object_control_t sr_file_control; + kern_return_t kr = KERN_SUCCESS; + uint32_t slide = uap->slide; + +#define SFM_MAX_STACK 8 + struct shared_file_mapping_np stack_mappings[SFM_MAX_STACK]; + + if ((kr = vm_shared_region_sliding_valid(slide)) != KERN_SUCCESS) { + if (kr == KERN_INVALID_ARGUMENT) { + /* + * This will happen if we request sliding again + * with the same slide value that was used earlier + * for the very first sliding. We continue through + * to the mapping layer. 
This is so that we can be + * absolutely certain that the same mappings have + * been requested. + */ + kr = KERN_SUCCESS; + } else { + goto done; + } + } + + if (mappings_count == 0) { + SHARED_REGION_TRACE_INFO( + ("shared_region: %p [%d(%s)] map(): " + "no mappings\n", + current_thread(), p->p_pid, p->p_comm)); + kr = 0; /* no mappings: we're done ! */ + goto done; + } else if (mappings_count <= SFM_MAX_STACK) { + mappings = &stack_mappings[0]; + } else { + SHARED_REGION_TRACE_ERROR( + ("shared_region: %p [%d(%s)] map(): " + "too many mappings (%d)\n", + current_thread(), p->p_pid, p->p_comm, + mappings_count)); + kr = KERN_FAILURE; + goto done; + } + + if ( (kr = shared_region_copyin_mappings(p, uap->mappings, uap->count, mappings))) { + goto done; + } + + + kr = _shared_region_map(p, uap->fd, mappings_count, mappings, &sr_file_control, &mapping_to_slide); + if (kr != KERN_SUCCESS) { + return kr; + } + + if (slide) { + kr = _shared_region_slide(slide, + mapping_to_slide.sfm_file_offset, + mapping_to_slide.sfm_size, + uap->slide_start, + uap->slide_size, + sr_file_control); + if (kr != KERN_SUCCESS) { + vm_shared_region_undo_mappings(NULL, 0, mappings, mappings_count); + return kr; + } + } +done: + return kr; +} /* sysctl overflow room */ @@ -1256,11 +1549,11 @@ shared_region_map_np( allocate buffer space, possibly purgeable memory, but not cause inactive pages to be reclaimed. It allows the app to calculate how much memory is free outside the free target. 
*/ extern unsigned int vm_page_free_target; -SYSCTL_INT(_vm, OID_AUTO, vm_page_free_target, CTLFLAG_RD, +SYSCTL_INT(_vm, OID_AUTO, vm_page_free_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_free_target, 0, "Pageout daemon free target"); extern unsigned int vm_memory_pressure; -SYSCTL_INT(_vm, OID_AUTO, memory_pressure, CTLFLAG_RD, +SYSCTL_INT(_vm, OID_AUTO, memory_pressure, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_memory_pressure, 0, "Memory pressure indicator"); static int @@ -1277,36 +1570,36 @@ SYSCTL_PROC(_vm, OID_AUTO, page_free_wanted, 0, 0, vm_ctl_page_free_wanted, "I", ""); extern unsigned int vm_page_purgeable_count; -SYSCTL_INT(_vm, OID_AUTO, page_purgeable_count, CTLFLAG_RD, +SYSCTL_INT(_vm, OID_AUTO, page_purgeable_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_purgeable_count, 0, "Purgeable page count"); extern unsigned int vm_page_purgeable_wired_count; -SYSCTL_INT(_vm, OID_AUTO, page_purgeable_wired_count, CTLFLAG_RD, +SYSCTL_INT(_vm, OID_AUTO, page_purgeable_wired_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_purgeable_wired_count, 0, "Wired purgeable page count"); -SYSCTL_INT(_vm, OID_AUTO, page_reusable_count, CTLFLAG_RD, +SYSCTL_INT(_vm, OID_AUTO, page_reusable_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stats_reusable.reusable_count, 0, "Reusable page count"); -SYSCTL_QUAD(_vm, OID_AUTO, reusable_success, CTLFLAG_RD, +SYSCTL_QUAD(_vm, OID_AUTO, reusable_success, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stats_reusable.reusable_pages_success, ""); -SYSCTL_QUAD(_vm, OID_AUTO, reusable_failure, CTLFLAG_RD, +SYSCTL_QUAD(_vm, OID_AUTO, reusable_failure, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stats_reusable.reusable_pages_failure, ""); -SYSCTL_QUAD(_vm, OID_AUTO, reusable_shared, CTLFLAG_RD, +SYSCTL_QUAD(_vm, OID_AUTO, reusable_shared, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stats_reusable.reusable_pages_shared, ""); -SYSCTL_QUAD(_vm, OID_AUTO, all_reusable_calls, CTLFLAG_RD, +SYSCTL_QUAD(_vm, OID_AUTO, all_reusable_calls, CTLFLAG_RD | CTLFLAG_LOCKED, 
&vm_page_stats_reusable.all_reusable_calls, ""); -SYSCTL_QUAD(_vm, OID_AUTO, partial_reusable_calls, CTLFLAG_RD, +SYSCTL_QUAD(_vm, OID_AUTO, partial_reusable_calls, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stats_reusable.partial_reusable_calls, ""); -SYSCTL_QUAD(_vm, OID_AUTO, reuse_success, CTLFLAG_RD, +SYSCTL_QUAD(_vm, OID_AUTO, reuse_success, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stats_reusable.reuse_pages_success, ""); -SYSCTL_QUAD(_vm, OID_AUTO, reuse_failure, CTLFLAG_RD, +SYSCTL_QUAD(_vm, OID_AUTO, reuse_failure, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stats_reusable.reuse_pages_failure, ""); -SYSCTL_QUAD(_vm, OID_AUTO, all_reuse_calls, CTLFLAG_RD, +SYSCTL_QUAD(_vm, OID_AUTO, all_reuse_calls, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stats_reusable.all_reuse_calls, ""); -SYSCTL_QUAD(_vm, OID_AUTO, partial_reuse_calls, CTLFLAG_RD, +SYSCTL_QUAD(_vm, OID_AUTO, partial_reuse_calls, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stats_reusable.partial_reuse_calls, ""); -SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_success, CTLFLAG_RD, +SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_success, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stats_reusable.can_reuse_success, ""); -SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_failure, CTLFLAG_RD, +SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_failure, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stats_reusable.can_reuse_failure, ""); diff --git a/bsd/vm/vnode_pager.c b/bsd/vm/vnode_pager.c index a15b6dcc4..d12a65652 100644 --- a/bsd/vm/vnode_pager.c +++ b/bsd/vm/vnode_pager.c @@ -52,6 +52,7 @@ #include /* needs internal due to fhandle_t */ #include #include +#include /* For DKIOC calls */ #include #include @@ -81,6 +82,27 @@ #include +void +vnode_pager_throttle() +{ + struct uthread *ut; + + ut = get_bsdthread_info(current_thread()); + + if (ut->uu_lowpri_window) + throttle_lowpri_io(TRUE); +} + + +boolean_t +vnode_pager_isSSD(vnode_t vp) +{ + if (vp->v_mount->mnt_kern_flag & MNTK_SSD) + return (TRUE); + return (FALSE); +} + + uint32_t vnode_pager_isinuse(struct vnode *vp) { @@ -137,6 
+159,85 @@ vnode_pager_get_cs_blobs( return KERN_SUCCESS; } +/* + * vnode_trim: + * Used to call the DKIOCUNMAP ioctl on the underlying disk device for the specified vnode. + * Trims the region at offset bytes into the file, for length bytes. + * + * Care must be taken to ensure that the vnode is sufficiently reference counted at the time this + * function is called; no iocounts or usecounts are taken on the vnode. + * This function is non-idempotent in error cases; We cannot un-discard the blocks if only some of them + * are successfully discarded. + */ +u_int32_t vnode_trim ( + struct vnode *vp, + off_t offset, + size_t length) +{ + daddr64_t io_blockno; /* Block number corresponding to the start of the extent */ + size_t io_bytecount; /* Number of bytes in current extent for the specified range */ + size_t trimmed = 0; + off_t current_offset = offset; + size_t remaining_length = length; + int error = 0; + u_int32_t blocksize = 0; + struct vnode *devvp; + dk_extent_t extent; + dk_unmap_t unmap; + + + /* Get the underlying device vnode */ + devvp = vp->v_mount->mnt_devvp; + + /* Figure out the underlying device block size */ + error = VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&blocksize, 0, vfs_context_kernel()); + if (error) { + goto trim_exit; + } + + /* + * We may not get the entire range from offset -> offset+length in a single + * extent from the blockmap call. Keep looping/going until we are sure we've hit + * the whole range or if we encounter an error. + */ + while (trimmed < length) { + /* + * VNOP_BLOCKMAP will tell us the logical to physical block number mapping for the + * specified offset. It returns blocks in contiguous chunks, so if the logical range is + * broken into multiple extents, it must be called multiple times, increasing the offset + * in each call to ensure that the entire range is covered. 
+ */ + error = VNOP_BLOCKMAP (vp, current_offset, remaining_length, + &io_blockno, &io_bytecount, NULL, VNODE_READ, NULL); + + if (error) { + goto trim_exit; + } + /* + * We have a contiguous run. Prepare & issue the ioctl for the device. + * the DKIOCUNMAP ioctl takes offset in bytes from the start of the device. + */ + memset (&extent, 0, sizeof(dk_extent_t)); + memset (&unmap, 0, sizeof(dk_unmap_t)); + extent.offset = (uint64_t) io_blockno * (u_int64_t) blocksize; + extent.length = io_bytecount; + unmap.extents = &extent; + unmap.extentsCount = 1; + error = VNOP_IOCTL(devvp, DKIOCUNMAP, (caddr_t)&unmap, 0, vfs_context_kernel()); + + if (error) { + goto trim_exit; + } + remaining_length = remaining_length - io_bytecount; + trimmed = trimmed + io_bytecount; + current_offset = current_offset + io_bytecount; + } +trim_exit: + + return error; + +} + pager_return_t vnode_pageout(struct vnode *vp, upl_t upl, @@ -219,9 +320,7 @@ vnode_pageout(struct vnode *vp, else request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY; - ubc_create_upl(vp, f_offset, size, &upl, &pl, request_flags); - - if (upl == (upl_t)NULL) { + if (ubc_create_upl(vp, f_offset, size, &upl, &pl, request_flags) != KERN_SUCCESS) { result = PAGER_ERROR; error_ret = EINVAL; goto out; @@ -555,14 +654,23 @@ vnode_pagein( xsize, flags, vfs_context_current())) ) { /* * Usually this UPL will be aborted/committed by the lower cluster layer. - * In the case of decmpfs, however, we may return an error (EAGAIN) to avoid - * a deadlock with another thread already inflating the file. In that case, - * we must take care of our UPL at this layer itself. + * + * a) In the case of decmpfs, however, we may return an error (EAGAIN) to avoid + * a deadlock with another thread already inflating the file. + * + * b) In the case of content protection, EPERM is a valid error and we should respect it. + * + * In those cases, we must take care of our UPL at this layer itself. 
*/ if (must_commit) { if(error == EAGAIN) { ubc_upl_abort_range(upl, (upl_offset_t) xoff, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_RESTART); } +#if CONFIG_PROTECT + if(error == EPERM) { + ubc_upl_abort_range(upl, (upl_offset_t) xoff, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR); + } +#endif } result = PAGER_ERROR; error = PAGER_ERROR; diff --git a/config/BSDKernel.exports b/config/BSDKernel.exports index f0322f1ac..ebb5af5db 100644 --- a/config/BSDKernel.exports +++ b/config/BSDKernel.exports @@ -33,6 +33,7 @@ _buf_bwrite _buf_callback _buf_clear _buf_clearflags +_buf_clear_redundancy_flags _buf_clone _buf_count _buf_dataptr @@ -55,6 +56,7 @@ _buf_iterate _buf_lblkno _buf_map _buf_markaged +_buf_markclean _buf_markdelayed _buf_markeintr _buf_markfua @@ -63,6 +65,7 @@ _buf_meta_bread _buf_meta_breadn _buf_proc _buf_rcred +_buf_redundancy_flags _buf_reset _buf_resid _buf_setblkno @@ -77,6 +80,7 @@ _buf_seterror _buf_setflags _buf_setfsprivate _buf_setlblkno +_buf_set_redundancy_flags _buf_setresid _buf_setsize _buf_setupl @@ -222,6 +226,8 @@ _ifnet_allocate _ifnet_attach _ifnet_attach_protocol _ifnet_baudrate +_ifnet_capabilities_enabled +_ifnet_capabilities_supported _ifnet_detach _ifnet_detach_protocol _ifnet_eflags @@ -259,6 +265,8 @@ _ifnet_remove_multicast _ifnet_resolve_multicast:_dlil_resolve_multi _ifnet_set_addrlen _ifnet_set_baudrate +_ifnet_set_capabilities_supported +_ifnet_set_capabilities_enabled _ifnet_set_eflags _ifnet_set_flags _ifnet_set_hdrlen @@ -305,6 +313,10 @@ _kauth_cred_get_with_ref _kauth_cred_getgid _kauth_cred_getguid _kauth_cred_getntsid +_kauth_cred_getrgid +_kauth_cred_getruid +_kauth_cred_getsvgid +_kauth_cred_getsvuid _kauth_cred_getuid _kauth_cred_gid2guid _kauth_cred_gid2ntsid @@ -351,6 +363,7 @@ _mbuf_clear_csum_performed _mbuf_clear_csum_requested _mbuf_get_mlen _mbuf_get_mhlen +_mbuf_get_minclsize _mbuf_clear_vlan_tag _mbuf_concatenate _mbuf_copy_pkthdr @@ -368,6 +381,7 @@ _mbuf_freem_list _mbuf_get 
_mbuf_get_csum_performed _mbuf_get_csum_requested +_mbuf_get_traffic_class _mbuf_get_tso_requested _mbuf_get_vlan_tag _mbuf_getcluster @@ -388,6 +402,7 @@ _mbuf_pulldown _mbuf_pullup _mbuf_set_csum_performed _mbuf_set_csum_requested +_mbuf_set_traffic_class _mbuf_set_vlan_tag _mbuf_setdata _mbuf_setflags @@ -670,6 +685,7 @@ _vfs_setextendedsecurity _vfs_setflags _vfs_setfsprivate _vfs_setioattr +_vfs_setlocklocal _vfs_setmaxsymlen _vfs_statfs _vfs_sysctl diff --git a/config/BSDKernel.ppc.exports b/config/BSDKernel.ppc.exports deleted file mode 100644 index 83559e0b0..000000000 --- a/config/BSDKernel.ppc.exports +++ /dev/null @@ -1,37 +0,0 @@ -_file_vnode -_in6_cksum:_inet6_cksum -_is_suser -_is_suser1 -_mbuf_data -_mbuf_inet6_cksum -_mbuf_len -_mbuf_next -_mbuf_nextpkt -_mbuf_pkthdr_header -_mbuf_pkthdr_len -_mbuf_pkthdr_rcvif -_mbuf_pkthdr_setheader -_mbuf_setlen -_mbuf_setnextpkt -_mbuf_type -_nd6_lookup_ipv6 -_proc_ucred -_rootvnode -_spl0 -_splbio -_splclock -_splhigh -_splimp -_spllo -_spln -_sploff -_splon -_splpower -_splsched -_splsoftclock -_spltty -_splvm -_splx -_suser -_ubc_setcred -_ubc_sync_range diff --git a/config/Dummy.exports b/config/Dummy.exports new file mode 100644 index 000000000..fe7149c32 --- /dev/null +++ b/config/Dummy.exports @@ -0,0 +1 @@ +# Dummy exports, exists for stub architectures like PPC diff --git a/config/IOKit.exports b/config/IOKit.exports index 8f1cb8e73..2ad8e78c9 100644 --- a/config/IOKit.exports +++ b/config/IOKit.exports @@ -365,7 +365,6 @@ __ZN13IOCommandGate10runCommandEPvS0_S0_S0_ __ZN13IOCommandGate10superClassE __ZN13IOCommandGate11commandGateEP8OSObjectPFiS1_PvS2_S2_S2_E __ZN13IOCommandGate11setWorkLoopEP10IOWorkLoop -__ZN13IOCommandGate12checkForWorkEv __ZN13IOCommandGate13attemptActionEPFiP8OSObjectPvS2_S2_S2_ES2_S2_S2_S2_ __ZN13IOCommandGate13commandWakeupEPvb __ZN13IOCommandGate14attemptCommandEPvS0_S0_S0_ @@ -430,6 +429,7 @@ __ZN13IOEventSource23_RESERVEDIOEventSource4Ev 
__ZN13IOEventSource23_RESERVEDIOEventSource5Ev __ZN13IOEventSource23_RESERVEDIOEventSource6Ev __ZN13IOEventSource23_RESERVEDIOEventSource7Ev +__ZN13IOEventSource4freeEv __ZN13IOEventSource4initEP8OSObjectPFvS1_zE __ZN13IOEventSource6enableEv __ZN13IOEventSource7disableEv @@ -440,6 +440,7 @@ __ZN13IOEventSource9MetaClassC2Ev __ZN13IOEventSource9closeGateEv __ZN13IOEventSource9metaClassE __ZN13IOEventSource9setActionEPFvP8OSObjectzE +__ZN13IOEventSource12checkForWorkEv __ZN13IOEventSourceC1EPK11OSMetaClass __ZN13IOEventSourceC2EPK11OSMetaClass __ZN13IOEventSourceD0Ev @@ -483,12 +484,12 @@ __ZN14IOPMrootDomain14tellChangeDownEm __ZN14IOPMrootDomain15powerChangeDoneEm __ZN14IOPMrootDomain16tellNoChangeDownEm __ZN14IOPMrootDomain17createPMAssertionEyjP9IOServicePKc -__ZN14IOPMrootDomain17getSleepSupportedEv -__ZN14IOPMrootDomain17setAggressivenessEmm -__ZN14IOPMrootDomain18changePowerStateToEm __ZN14IOPMrootDomain18releasePMAssertionEy __ZN14IOPMrootDomain19getPMAssertionLevelEy __ZN14IOPMrootDomain19setPMAssertionLevelEyj +__ZN14IOPMrootDomain17getSleepSupportedEv +__ZN14IOPMrootDomain17setAggressivenessEmm +__ZN14IOPMrootDomain18changePowerStateToEm __ZN14IOPMrootDomain22changePowerStateToPrivEm __ZN14IOPMrootDomain22removePublishedFeatureEj __ZN14IOPMrootDomain23requestPowerDomainStateEmP17IOPowerConnectionm @@ -860,7 +861,6 @@ __ZN18IORegistryIteratorD2Ev __ZN18IOTimerEventSource10gMetaClassE __ZN18IOTimerEventSource10superClassE __ZN18IOTimerEventSource11setWorkLoopEP10IOWorkLoop -__ZN18IOTimerEventSource12checkForWorkEv __ZN18IOTimerEventSource13cancelTimeoutEv __ZN18IOTimerEventSource14setTimeoutFuncEv __ZN18IOTimerEventSource16timerEventSourceEP8OSObjectPFvS1_PS_E diff --git a/config/IOKit.i386.exports b/config/IOKit.i386.exports index 068770db6..d83bbdde6 100644 --- a/config/IOKit.i386.exports +++ b/config/IOKit.i386.exports @@ -280,7 +280,6 @@ __ZN9IOService13newUserClientEP4taskPvmPP12IOUserClient __ZN9IOService13startMatchingEm 
__ZN9IOService13waitMatchIdleEm __ZN9IOService13willTerminateEPS_m -__ZN9IOService14actionFinalizeEPS_m __ZN9IOService14doServiceMatchEm __ZN9IOService14messageClientsEmPvj __ZN9IOService14newTemperatureElPS_ @@ -299,13 +298,11 @@ __ZN9IOService16didYouWakeSystemEv __ZN9IOService16registerInterestEPK8OSSymbolPFiPvS3_mPS_S3_jES3_S3_ __ZN9IOService16requestTerminateEPS_m __ZN9IOService16setCPUSnoopDelayEm -__ZN9IOService18actionDidTerminateEPS_m __ZN9IOService18doServiceTerminateEm __ZN9IOService18matchPropertyTableEP12OSDictionaryPl __ZN9IOService18requireMaxBusStallEm __ZN9IOService18settleTimerExpiredEv __ZN9IOService18systemWillShutdownEm -__ZN9IOService19actionWillTerminateEPS_mP7OSArray __ZN9IOService19deliverNotificationEPK8OSSymbolmm __ZN9IOService19installNotificationEPK8OSSymbolP12OSDictionaryPFbPvS5_PS_ES5_S5_lPP10OSIterator __ZN9IOService22PM_Clamp_Timer_ExpiredEv diff --git a/config/IOKit.ppc.exports b/config/IOKit.ppc.exports deleted file mode 100644 index 26b5a9209..000000000 --- a/config/IOKit.ppc.exports +++ /dev/null @@ -1,383 +0,0 @@ -_IOPanic -_PE_parse_boot_arg -__Z11IODBDMAStopPV23IODBDMAChannelRegisters -__Z12IODBDMAFlushPV23IODBDMAChannelRegisters -__Z12IODBDMAPausePV23IODBDMAChannelRegisters -__Z12IODBDMAResetPV23IODBDMAChannelRegisters -__Z12IODBDMAStartPV23IODBDMAChannelRegistersPV17IODBDMADescriptor -__Z15IODBDMAContinuePV23IODBDMAChannelRegisters -__Z16IODTFindSlotNameP15IORegistryEntrym -__Z16IODTSetResolvingP15IORegistryEntryPFlmPmS1_EPFvS0_PhS4_S4_E -__Z17IODTGetCellCountsP15IORegistryEntryPmS1_ -__Z22IODTResolveAddressCellP15IORegistryEntryPmS1_S1_ -__Z23IODTFindMatchingEntriesP15IORegistryEntrymPKc -__ZN10AppleMacIO9metaClassE -__ZN10IOWorkLoop19workLoopWithOptionsEm -__ZN10IOWorkLoop9sleepGateEPv12UnsignedWidem -__ZN10IOWorkLoop9sleepGateEPvm -__ZN11IOCatalogue11findDriversEP12OSDictionaryPl -__ZN11IOCatalogue11findDriversEP9IOServicePl -__ZN11IODataQueue11withEntriesEmm -__ZN11IODataQueue12withCapacityEm 
-__ZN11IODataQueue15initWithEntriesEmm -__ZN11IODataQueue16initWithCapacityEm -__ZN11IODataQueue7enqueueEPvm -__ZN11IOMemoryMap10getAddressEv -__ZN11IOMemoryMap18getPhysicalSegmentEmPm -__ZN11IOMemoryMap19setMemoryDescriptorEP18IOMemoryDescriptory -__ZN11IOMemoryMap7getSizeEv -__ZN11IOMemoryMap8redirectEP18IOMemoryDescriptormm -__ZN11IOMemoryMap8redirectEP18IOMemoryDescriptormy -__ZN12IODMACommand11OutputBig32EPS_NS_9Segment64EPvm -__ZN12IODMACommand11OutputBig64EPS_NS_9Segment64EPvm -__ZN12IODMACommand11synchronizeEm -__ZN12IODMACommand12OutputHost32EPS_NS_9Segment64EPvm -__ZN12IODMACommand12OutputHost64EPS_NS_9Segment64EPvm -__ZN12IODMACommand14OutputLittle32EPS_NS_9Segment64EPvm -__ZN12IODMACommand14OutputLittle64EPS_NS_9Segment64EPvm -__ZN12IODMACommand15genIOVMSegmentsEPyPvPm -__ZN12IODMACommand17withSpecificationEPFbPS_NS_9Segment64EPvmEhyNS_14MappingOptionsEymP8IOMapperS2_ -__ZN12IODMACommand21initWithSpecificationEPFbPS_NS_9Segment64EPvmEhyNS_14MappingOptionsEymP8IOMapperS2_ -__ZN12IODMACommand24prepareWithSpecificationEPFbPS_NS_9Segment64EPvmEhyNS_14MappingOptionsEymP8IOMapperyybb -__ZN12IODMACommand8transferEmyPvy -__ZN12IOUserClient12initWithTaskEP4taskPvm -__ZN12IOUserClient12initWithTaskEP4taskPvmP12OSDictionary -__ZN12IOUserClient15mapClientMemoryEmP4taskmj -__ZN12IOUserClient15sendAsyncResultEPjiPPvm -__ZN12IOUserClient17mapClientMemory64EmP4taskmy -__ZN12IOUserClient17sendAsyncResult64EPyiS0_m -__ZN12IOUserClient19clientMemoryForTypeEmPmPP18IOMemoryDescriptor -__ZN12IOUserClient19setAsyncReference64EPyP8ipc_portyy -__ZN12IOUserClient23getExternalTrapForIndexEm -__ZN12IOUserClient24getNotificationSemaphoreEmPP9semaphore -__ZN12IOUserClient24getTargetAndTrapForIndexEPP9IOServicem -__ZN12IOUserClient24registerNotificationPortEP8ipc_portmm -__ZN12IOUserClient24registerNotificationPortEP8ipc_portmy -__ZN12IOUserClient25getExternalMethodForIndexEm -__ZN12IOUserClient26getTargetAndMethodForIndexEPP9IOServicem 
-__ZN12IOUserClient30getExternalAsyncMethodForIndexEm -__ZN12IOUserClient31getAsyncTargetAndMethodForIndexEPP9IOServicem -__ZN13IOCommandGate12commandSleepEPv12UnsignedWidem -__ZN13IOCommandGate12commandSleepEPvm -__ZN13IOCommandPool11commandPoolEP9IOServiceP10IOWorkLoopm -__ZN13IOCommandPool4initEP9IOServiceP10IOWorkLoopm -__ZN13IOEventSource9sleepGateEPv12UnsignedWidem -__ZN13IOEventSource9sleepGateEPvm -__ZN13_IOServiceJob8startJobEP9IOServiceim -__ZN14IOCommandQueue10gMetaClassE -__ZN14IOCommandQueue10superClassE -__ZN14IOCommandQueue12checkForWorkEv -__ZN14IOCommandQueue12commandQueueEP8OSObjectPFvS1_PvS2_S2_S2_Ei -__ZN14IOCommandQueue14enqueueCommandEbPvS0_S0_S0_ -__ZN14IOCommandQueue15performAndFlushEP8OSObjectPFvS1_PvS2_S2_S2_E -__ZN14IOCommandQueue4freeEv -__ZN14IOCommandQueue4initEP8OSObjectPFvS1_PvS2_S2_S2_Ei -__ZN14IOCommandQueue9MetaClassC1Ev -__ZN14IOCommandQueue9MetaClassC2Ev -__ZN14IOCommandQueue9metaClassE -__ZN14IOCommandQueueC1EPK11OSMetaClass -__ZN14IOCommandQueueC1Ev -__ZN14IOCommandQueueC2EPK11OSMetaClass -__ZN14IOCommandQueueC2Ev -__ZN14IOCommandQueueD0Ev -__ZN14IOCommandQueueD2Ev -__ZN14IODeviceMemory12withSubRangeEPS_mm -__ZN14IODeviceMemory13arrayFromListEPNS_11InitElementEm -__ZN14IODeviceMemory9withRangeEmm -__ZN14IOMemoryCursor17withSpecificationEPFvNS_15PhysicalSegmentEPvmEmmm -__ZN14IOMemoryCursor19genPhysicalSegmentsEP18IOMemoryDescriptormPvmmPm -__ZN14IOMemoryCursor21initWithSpecificationEPFvNS_15PhysicalSegmentEPvmEmmm -__ZN14IOPMrootDomain17setSleepSupportedEm -__ZN14IOPMrootDomain19sysPowerDownHandlerEPvS0_mP9IOServiceS0_j -__ZN14IOPMrootDomain24receivePowerNotificationEm -__ZN14IOPMrootDomain27displayWranglerNotificationEPvS0_mP9IOServiceS0_j -__ZN15IODMAController13getControllerEP9IOServicem -__ZN15IODMAController16notifyDMACommandEP16IODMAEventSourceP12IODMACommandim -__ZN15IODMAController20createControllerNameEm -__ZN15IODMAController21registerDMAControllerEm -__ZN16AppleMacIODevice9metaClassE 
-__ZN16IODMAEventSource14dmaEventSourceEP8OSObjectP9IOServicePFvS1_PS_P12IODMACommandimES8_m -__ZN16IODMAEventSource15startDMACommandEP12IODMACommand11IODirectionmm -__ZN16IODMAEventSource16notifyDMACommandEP12IODMACommandim -__ZN16IODMAEventSource4initEP8OSObjectP9IOServicePFvS1_PS_P12IODMACommandimES8_m -__ZN16IOKitDiagnostics12updateOffsetEP12OSDictionarymPKc -__ZN16IORangeAllocator10deallocateEmm -__ZN16IORangeAllocator12allocElementEm -__ZN16IORangeAllocator13allocateRangeEmm -__ZN16IORangeAllocator14deallocElementEm -__ZN16IORangeAllocator28setFragmentCapacityIncrementEm -__ZN16IORangeAllocator4initEmmmm -__ZN16IORangeAllocator8allocateEmPmm -__ZN16IORangeAllocator9withRangeEmmmm -__ZN17IOBigMemoryCursor13outputSegmentEN14IOMemoryCursor15PhysicalSegmentEPvm -__ZN17IOBigMemoryCursor17withSpecificationEmmm -__ZN17IOBigMemoryCursor21initWithSpecificationEmmm -__ZN17IOSharedDataQueue11withEntriesEmm -__ZN17IOSharedDataQueue12withCapacityEm -__ZN17IOSharedDataQueue16initWithCapacityEm -__ZN17IOSharedDataQueue7dequeueEPvPm -__ZN18IOMemoryDescriptor10setMappingEP4taskjm -__ZN18IOMemoryDescriptor10withRangesEP14IOVirtualRangem11IODirectionP4taskb -__ZN18IOMemoryDescriptor10writeBytesEmPKvm -__ZN18IOMemoryDescriptor11makeMappingEPS_P4taskjmmm -__ZN18IOMemoryDescriptor11withAddressEPvm11IODirection -__ZN18IOMemoryDescriptor11withAddressEjm11IODirectionP4task -__ZN18IOMemoryDescriptor11withOptionsEPvmmP4taskmP8IOMapper -__ZN18IOMemoryDescriptor12setPurgeableEmPm -__ZN18IOMemoryDescriptor12withSubRangeEPS_mm11IODirection -__ZN18IOMemoryDescriptor14initWithRangesEP14IOVirtualRangem11IODirectionP4taskb -__ZN18IOMemoryDescriptor15initWithAddressEPvm11IODirection -__ZN18IOMemoryDescriptor15initWithAddressEjm11IODirectionP4task -__ZN18IOMemoryDescriptor15initWithOptionsEPvmmP4taskmP8IOMapper -__ZN18IOMemoryDescriptor16getSourceSegmentEmPm -__ZN18IOMemoryDescriptor16performOperationEmmm -__ZN18IOMemoryDescriptor16withAddressRangeEyymP4task 
-__ZN18IOMemoryDescriptor17getVirtualSegmentEmPm -__ZN18IOMemoryDescriptor17withAddressRangesEP14IOAddressRangemmP4task -__ZN18IOMemoryDescriptor18getPhysicalSegmentEmPm -__ZN18IOMemoryDescriptor18getPhysicalSegmentEmPmm -__ZN18IOMemoryDescriptor18withPhysicalRangesEP15IOPhysicalRangem11IODirectionb -__ZN18IOMemoryDescriptor19createMappingInTaskEP4taskymyy -__ZN18IOMemoryDescriptor19withPhysicalAddressEmm11IODirection -__ZN18IOMemoryDescriptor20getPhysicalSegment64EmPm -__ZN18IOMemoryDescriptor22initWithPhysicalRangesEP15IOPhysicalRangem11IODirectionb -__ZN18IOMemoryDescriptor23initWithPhysicalAddressEmm11IODirection -__ZN18IOMemoryDescriptor28_RESERVEDIOMemoryDescriptor8Ev -__ZN18IOMemoryDescriptor28_RESERVEDIOMemoryDescriptor9Ev -__ZN18IOMemoryDescriptor29_RESERVEDIOMemoryDescriptor10Ev -__ZN18IOMemoryDescriptor29_RESERVEDIOMemoryDescriptor11Ev -__ZN18IOMemoryDescriptor29_RESERVEDIOMemoryDescriptor12Ev -__ZN18IOMemoryDescriptor29_RESERVEDIOMemoryDescriptor13Ev -__ZN18IOMemoryDescriptor29_RESERVEDIOMemoryDescriptor14Ev -__ZN18IOMemoryDescriptor29_RESERVEDIOMemoryDescriptor15Ev -__ZN18IOMemoryDescriptor3mapEP4taskjmmm -__ZN18IOMemoryDescriptor3mapEm -__ZN18IOMemoryDescriptor5doMapEP7_vm_mapPjmmm -__ZN18IOMemoryDescriptor6setTagEm -__ZN18IOMemoryDescriptor7doUnmapEP7_vm_mapjm -__ZN18IOMemoryDescriptor9readBytesEmPvm -__ZN18IORegistryIterator11iterateOverEP15IORegistryEntryPK15IORegistryPlanem -__ZN18IORegistryIterator11iterateOverEPK15IORegistryPlanem -__ZN18IOTimerEventSource10setTimeoutE12UnsignedWide -__ZN18IOTimerEventSource10setTimeoutE13mach_timespec -__ZN18IOTimerEventSource10setTimeoutEmm -__ZN18IOTimerEventSource10wakeAtTimeE12UnsignedWide -__ZN18IOTimerEventSource10wakeAtTimeE13mach_timespec -__ZN18IOTimerEventSource10wakeAtTimeEmm -__ZN18IOTimerEventSource12setTimeoutMSEm -__ZN18IOTimerEventSource12setTimeoutUSEm -__ZN18IOTimerEventSource12wakeAtTimeMSEm -__ZN18IOTimerEventSource12wakeAtTimeUSEm -__ZN18IOTimerEventSource15setTimeoutTicksEm 
-__ZN18IOTimerEventSource15wakeAtTimeTicksEm -__ZN19IODBDMAMemoryCursor10gMetaClassE -__ZN19IODBDMAMemoryCursor10superClassE -__ZN19IODBDMAMemoryCursor17withSpecificationEmmm -__ZN19IODBDMAMemoryCursor21initWithSpecificationEmmm -__ZN19IODBDMAMemoryCursor9MetaClassC1Ev -__ZN19IODBDMAMemoryCursor9MetaClassC2Ev -__ZN19IODBDMAMemoryCursor9metaClassE -__ZN19IODBDMAMemoryCursorC1EPK11OSMetaClass -__ZN19IODBDMAMemoryCursorC1Ev -__ZN19IODBDMAMemoryCursorC2EPK11OSMetaClass -__ZN19IODBDMAMemoryCursorC2Ev -__ZN19IODBDMAMemoryCursorD0Ev -__ZN19IODBDMAMemoryCursorD2Ev -__ZN20IOLittleMemoryCursor13outputSegmentEN14IOMemoryCursor15PhysicalSegmentEPvm -__ZN20IOLittleMemoryCursor17withSpecificationEmmm -__ZN20IOLittleMemoryCursor21initWithSpecificationEmmm -__ZN20RootDomainUserClient15setPreventativeEmm -__ZN20RootDomainUserClient26getTargetAndMethodForIndexEPP9IOServicem -__ZN21IOInterruptController10initVectorElP17IOInterruptVector -__ZN21IOInterruptController11causeVectorElP17IOInterruptVector -__ZN21IOInterruptController12enableVectorElP17IOInterruptVector -__ZN21IOInterruptController13getVectorTypeElP17IOInterruptVector -__ZN21IOInterruptController17disableVectorHardElP17IOInterruptVector -__ZN21IOInterruptController17vectorCanBeSharedElP17IOInterruptVector -__ZN21IONaturalMemoryCursor13outputSegmentEN14IOMemoryCursor15PhysicalSegmentEPvm -__ZN21IONaturalMemoryCursor17withSpecificationEmmm -__ZN21IONaturalMemoryCursor21initWithSpecificationEmmm -__ZN21IOSubMemoryDescriptor11makeMappingEP18IOMemoryDescriptorP4taskjmmm -__ZN21IOSubMemoryDescriptor12initSubRangeEP18IOMemoryDescriptormm11IODirection -__ZN21IOSubMemoryDescriptor12setPurgeableEmPm -__ZN21IOSubMemoryDescriptor12withSubRangeEP18IOMemoryDescriptormmm -__ZN21IOSubMemoryDescriptor18getPhysicalSegmentEmPmm -__ZN21IOSubMemoryDescriptor7prepareE11IODirection -__ZN21IOSubMemoryDescriptor8completeE11IODirection -__ZN23IOMultiMemoryDescriptor15withDescriptorsEPP18IOMemoryDescriptorm11IODirectionb 
-__ZN23IOMultiMemoryDescriptor18getPhysicalSegmentEmPmm -__ZN23IOMultiMemoryDescriptor19initWithDescriptorsEPP18IOMemoryDescriptorm11IODirectionb -__ZN23IOMultiMemoryDescriptor7prepareE11IODirection -__ZN23IOMultiMemoryDescriptor8completeE11IODirection -__ZN24IOBufferMemoryDescriptor11appendBytesEPKvj -__ZN24IOBufferMemoryDescriptor11withOptionsEmjj -__ZN24IOBufferMemoryDescriptor12setDirectionE11IODirection -__ZN24IOBufferMemoryDescriptor12withCapacityEj11IODirectionb -__ZN24IOBufferMemoryDescriptor13initWithBytesEPKvj11IODirectionb -__ZN24IOBufferMemoryDescriptor14getBytesNoCopyEjj -__ZN24IOBufferMemoryDescriptor15initWithOptionsEmjj -__ZN24IOBufferMemoryDescriptor15initWithOptionsEmjjP4task -__ZN24IOBufferMemoryDescriptor17inTaskWithOptionsEP4taskmjj -__ZN24IOBufferMemoryDescriptor20initWithPhysicalMaskEP4taskmyyy -__ZN24IOBufferMemoryDescriptor22inTaskWithPhysicalMaskEP4taskmyy -__ZN24IOBufferMemoryDescriptor34_RESERVEDIOBufferMemoryDescriptor2Ev -__ZN24IOBufferMemoryDescriptor34_RESERVEDIOBufferMemoryDescriptor3Ev -__ZN24IOBufferMemoryDescriptor34_RESERVEDIOBufferMemoryDescriptor4Ev -__ZN24IOBufferMemoryDescriptor34_RESERVEDIOBufferMemoryDescriptor5Ev -__ZN24IOBufferMemoryDescriptor34_RESERVEDIOBufferMemoryDescriptor6Ev -__ZN24IOBufferMemoryDescriptor34_RESERVEDIOBufferMemoryDescriptor7Ev -__ZN24IOBufferMemoryDescriptor34_RESERVEDIOBufferMemoryDescriptor8Ev -__ZN24IOBufferMemoryDescriptor34_RESERVEDIOBufferMemoryDescriptor9Ev -__ZN24IOBufferMemoryDescriptor35_RESERVEDIOBufferMemoryDescriptor10Ev -__ZN24IOBufferMemoryDescriptor35_RESERVEDIOBufferMemoryDescriptor11Ev -__ZN24IOBufferMemoryDescriptor35_RESERVEDIOBufferMemoryDescriptor12Ev -__ZN24IOBufferMemoryDescriptor35_RESERVEDIOBufferMemoryDescriptor13Ev -__ZN24IOBufferMemoryDescriptor35_RESERVEDIOBufferMemoryDescriptor14Ev -__ZN24IOBufferMemoryDescriptor35_RESERVEDIOBufferMemoryDescriptor15Ev -__ZN24IOBufferMemoryDescriptor9setLengthEj -__ZN24IOBufferMemoryDescriptor9withBytesEPKvj11IODirectionb 
-__ZN25IOGeneralMemoryDescriptor11setPositionEm -__ZN25IOGeneralMemoryDescriptor11wireVirtualE11IODirection -__ZN25IOGeneralMemoryDescriptor12setPurgeableEmPm -__ZN25IOGeneralMemoryDescriptor13mapIntoKernelEj -__ZN25IOGeneralMemoryDescriptor14initWithRangesEP14IOVirtualRangem11IODirectionP4taskb -__ZN25IOGeneralMemoryDescriptor15initWithAddressEPvm11IODirection -__ZN25IOGeneralMemoryDescriptor15initWithAddressEjm11IODirectionP4task -__ZN25IOGeneralMemoryDescriptor15initWithOptionsEPvmmP4taskmP8IOMapper -__ZN25IOGeneralMemoryDescriptor15unmapFromKernelEv -__ZN25IOGeneralMemoryDescriptor16getSourceSegmentEmPm -__ZN25IOGeneralMemoryDescriptor17getVirtualSegmentEmPm -__ZN25IOGeneralMemoryDescriptor18getPhysicalSegmentEmPm -__ZN25IOGeneralMemoryDescriptor18getPhysicalSegmentEmPmm -__ZN25IOGeneralMemoryDescriptor20getPhysicalSegment64EmPm -__ZN25IOGeneralMemoryDescriptor22initWithPhysicalRangesEP15IOPhysicalRangem11IODirectionb -__ZN25IOGeneralMemoryDescriptor23initWithPhysicalAddressEmm11IODirection -__ZN25IOGeneralMemoryDescriptor5doMapEP7_vm_mapPjmmm -__ZN25IOGeneralMemoryDescriptor7doUnmapEP7_vm_mapjm -__ZN25IOGeneralMemoryDescriptor7prepareE11IODirection -__ZN25IOGeneralMemoryDescriptor8completeE11IODirection -__ZN29IOInterleavedMemoryDescriptor12withCapacityEm11IODirection -__ZN29IOInterleavedMemoryDescriptor16initWithCapacityEm11IODirection -__ZN29IOInterleavedMemoryDescriptor18getPhysicalSegmentEmPmm -__ZN29IOInterleavedMemoryDescriptor19setMemoryDescriptorEP18IOMemoryDescriptormm -__ZN29IOInterleavedMemoryDescriptor22clearMemoryDescriptorsE11IODirection -__ZN29IOInterleavedMemoryDescriptor7prepareE11IODirection -__ZN29IOInterleavedMemoryDescriptor8completeE11IODirection -__ZN8IOMapper10allocTableEm -__ZN8IOMapper10iovmInsertEjmP13upl_page_infom -__ZN8IOMapper10iovmInsertEjmPjm -__ZN8IOMapper11NewARTTableEmPPvPj -__ZN8IOMapper12FreeARTTableEP6OSDatam -__ZN8IOPMprot10gMetaClassE -__ZN8IOPMprot10superClassE -__ZN8IOPMprot9MetaClassC1Ev -__ZN8IOPMprot9MetaClassC2Ev 
-__ZN8IOPMprot9metaClassE -__ZN8IOPMprotC1EPK11OSMetaClass -__ZN8IOPMprotC1Ev -__ZN8IOPMprotC2EPK11OSMetaClass -__ZN8IOPMprotC2Ev -__ZN8IOPMprotD0Ev -__ZN8IOPMprotD2Ev -__ZN9IOService10adjustBusyEl -__ZN9IOService10handleOpenEPS_mPv -__ZN9IOService10systemWakeEv -__ZN9IOService10youAreRootEv -__ZN9IOService11_adjustBusyEl -__ZN9IOService11handleCloseEPS_m -__ZN9IOService11tellClientsEi -__ZN9IOService12clampPowerOnEm -__ZN9IOService12didTerminateEPS_mPb -__ZN9IOService12requestProbeEm -__ZN9IOService12waitForStateEmmP13mach_timespec -__ZN9IOService13getPMworkloopEv -__ZN9IOService13messageClientEmP8OSObjectPvj -__ZN9IOService13newUserClientEP4taskPvmP12OSDictionaryPP12IOUserClient -__ZN9IOService13newUserClientEP4taskPvmPP12IOUserClient -__ZN9IOService13startMatchingEm -__ZN9IOService13waitMatchIdleEm -__ZN9IOService13willTerminateEPS_m -__ZN9IOService14actionFinalizeEPS_m -__ZN9IOService14doServiceMatchEm -__ZN9IOService14messageClientsEmPvj -__ZN9IOService14newTemperatureElPS_ -__ZN9IOService14setPowerParentEP17IOPowerConnectionbm -__ZN9IOService15addNotificationEPK8OSSymbolP12OSDictionaryPFbPvS5_PS_ES5_S5_l -__ZN9IOService15nextIdleTimeoutE12UnsignedWideS0_j -__ZN9IOService15registerServiceEm -__ZN9IOService15tellChangeDown1Em -__ZN9IOService15tellChangeDown2Em -__ZN9IOService15terminateClientEPS_m -__ZN9IOService15terminatePhase1Em -__ZN9IOService15terminateWorkerEm -__ZN9IOService16ack_timer_tickedEv -__ZN9IOService16command_receivedEPvS0_S0_S0_ -__ZN9IOService16didYouWakeSystemEv -__ZN9IOService16registerInterestEPK8OSSymbolPFiPvS3_mPS_S3_jES3_S3_ -__ZN9IOService16requestTerminateEPS_m -__ZN9IOService16setCPUSnoopDelayEm -__ZN9IOService18actionDidTerminateEPS_m -__ZN9IOService18doServiceTerminateEm -__ZN9IOService18matchPropertyTableEP12OSDictionaryPl -__ZN9IOService18requireMaxBusStallEm -__ZN9IOService18settleTimerExpiredEv -__ZN9IOService18systemWillShutdownEm -__ZN9IOService19actionWillTerminateEPS_mP7OSArray 
-__ZN9IOService19deliverNotificationEPK8OSSymbolmm -__ZN9IOService19installNotificationEPK8OSSymbolP12OSDictionaryPFbPvS5_PS_ES5_S5_lPP10OSIterator -__ZN9IOService20_RESERVEDIOService48Ev -__ZN9IOService20_RESERVEDIOService49Ev -__ZN9IOService20_RESERVEDIOService50Ev -__ZN9IOService20_RESERVEDIOService51Ev -__ZN9IOService20_RESERVEDIOService52Ev -__ZN9IOService20_RESERVEDIOService53Ev -__ZN9IOService20_RESERVEDIOService54Ev -__ZN9IOService20_RESERVEDIOService55Ev -__ZN9IOService20_RESERVEDIOService56Ev -__ZN9IOService20_RESERVEDIOService57Ev -__ZN9IOService20_RESERVEDIOService58Ev -__ZN9IOService20_RESERVEDIOService59Ev -__ZN9IOService20_RESERVEDIOService60Ev -__ZN9IOService20_RESERVEDIOService61Ev -__ZN9IOService20_RESERVEDIOService62Ev -__ZN9IOService20_RESERVEDIOService63Ev -__ZN9IOService22PM_Clamp_Timer_ExpiredEv -__ZN9IOService22powerDomainDidChangeToEmP17IOPowerConnection -__ZN9IOService23acknowledgeNotificationEPvm -__ZN9IOService23addMatchingNotificationEPK8OSSymbolP12OSDictionaryPFbPvS5_PS_P10IONotifierES5_S5_l -__ZN9IOService23powerDomainWillChangeToEmP17IOPowerConnection -__ZN9IOService23scheduleTerminatePhase2Em -__ZN9IOService23tellClientsWithResponseEi -__ZN9IOService24PM_idle_timer_expirationEv -__ZN9IOService24mapDeviceMemoryWithIndexEjm -__ZN9IOService26temperatureCriticalForZoneEPS_ -__ZN9IOService27serializedAllowPowerChange2Em -__ZN9IOService28serializedCancelPowerChange2Em -__ZN9IOService4openEPS_mPv -__ZN9IOService5closeEPS_m -__ZN9IOService5probeEPS_Pl -__ZN9IOService6PMfreeEv -__ZN9IOService7messageEmPS_Pv -__ZN9IOService8finalizeEm -__ZN9IOService9terminateEm -__ZNK11IOCatalogue13serializeDataEmP11OSSerialize -__ZNK14IOCommandQueue12getMetaClassEv -__ZNK14IOCommandQueue9MetaClass5allocEv -__ZNK15IORegistryEntry11getPropertyEPK8OSStringPK15IORegistryPlanem -__ZNK15IORegistryEntry11getPropertyEPK8OSSymbolPK15IORegistryPlanem -__ZNK15IORegistryEntry11getPropertyEPKcPK15IORegistryPlanem 
-__ZNK15IORegistryEntry12copyPropertyEPK8OSStringPK15IORegistryPlanem -__ZNK15IORegistryEntry12copyPropertyEPK8OSSymbolPK15IORegistryPlanem -__ZNK15IORegistryEntry12copyPropertyEPKcPK15IORegistryPlanem -__ZNK18IOMemoryDescriptor19dmaCommandOperationEmPvj -__ZNK19IODBDMAMemoryCursor12getMetaClassEv -__ZNK19IODBDMAMemoryCursor9MetaClass5allocEv -__ZNK25IOGeneralMemoryDescriptor19dmaCommandOperationEmPvj -__ZNK8IOPMprot12getMetaClassEv -__ZNK8IOPMprot9MetaClass5allocEv -__ZTV14IOCommandQueue -__ZTV19IODBDMAMemoryCursor -__ZTV8IOPMprot -__ZTVN14IOCommandQueue9MetaClassE -__ZTVN19IODBDMAMemoryCursor9MetaClassE -__ZTVN8IOPMprot9MetaClassE diff --git a/config/IOKit.x86_64.exports b/config/IOKit.x86_64.exports index d3067b6e0..6f986aea6 100644 --- a/config/IOKit.x86_64.exports +++ b/config/IOKit.x86_64.exports @@ -226,7 +226,6 @@ __ZN9IOService13newUserClientEP4taskPvjPP12IOUserClient __ZN9IOService13startMatchingEj __ZN9IOService13waitMatchIdleEj __ZN9IOService13willTerminateEPS_j -__ZN9IOService14actionFinalizeEPS_j __ZN9IOService14doServiceMatchEj __ZN9IOService14messageClientsEjPvm __ZN9IOService15addNotificationEPK8OSSymbolP12OSDictionaryPFbPvS5_PS_ES5_S5_i @@ -238,7 +237,6 @@ __ZN9IOService15terminateWorkerEj __ZN9IOService16registerInterestEPK8OSSymbolPFiPvS3_jPS_S3_mES3_S3_ __ZN9IOService16requestTerminateEPS_j __ZN9IOService16setCPUSnoopDelayEj -__ZN9IOService18actionDidTerminateEPS_j __ZN9IOService18doServiceTerminateEj __ZN9IOService18matchPropertyTableEP12OSDictionaryPi __ZN9IOService18requireMaxBusStallEj @@ -249,7 +247,6 @@ __ZN9IOService19_RESERVEDIOService2Ev __ZN9IOService19_RESERVEDIOService3Ev __ZN9IOService19_RESERVEDIOService4Ev __ZN9IOService19_RESERVEDIOService5Ev -__ZN9IOService19actionWillTerminateEPS_jP7OSArray __ZN9IOService19deliverNotificationEPK8OSSymboljj __ZN9IOService23acknowledgeNotificationEPvj __ZN9IOService23addMatchingNotificationEPK8OSSymbolP12OSDictionaryPFbPvS5_PS_P10IONotifierES5_S5_i diff --git a/config/Libkern.exports 
b/config/Libkern.exports index 2e7ff44dd..4bd05a193 100644 --- a/config/Libkern.exports +++ b/config/Libkern.exports @@ -1,4 +1,4 @@ -___bzero:_bzero +___bzero _Assert _MD5Final _MD5Init @@ -744,6 +744,7 @@ _deflateSetDictionary _ffs _flush_dcache _flush_dcache64 +_gOSKextUnresolved _inet_ntop _inflate _inflateEnd diff --git a/config/Libkern.i386.exports b/config/Libkern.i386.exports index 31d172284..d1a97b9ee 100644 --- a/config/Libkern.i386.exports +++ b/config/Libkern.i386.exports @@ -1,4 +1,7 @@ _lck_mtx_unlock_darwin10 +_lck_mtx_lock_spin +_lck_mtx_try_lock_spin +_lck_mtx_convert_spin _OSAddAtomic64 _OSCompareAndSwap64 _OSRuntimeFinalizeCPP diff --git a/config/Libkern.ppc.exports b/config/Libkern.ppc.exports deleted file mode 100644 index ebf87f219..000000000 --- a/config/Libkern.ppc.exports +++ /dev/null @@ -1,29 +0,0 @@ -_OSDequeueAtomic -_OSEnqueueAtomic -_OSRuntimeFinalizeCPP -_OSRuntimeInitializeCPP -_OSRuntimeUnloadCPP -_OSRuntimeUnloadCPPForSegment -__ZN12OSOrderedSet12withCapacityEjPFlPK15OSMetaClassBaseS2_PvES3_ -__ZN12OSOrderedSet16initWithCapacityEjPFlPK15OSMetaClassBaseS2_PvES3_ -__ZN8OSObject19_RESERVEDOSObject16Ev -__ZN8OSObject19_RESERVEDOSObject17Ev -__ZN8OSObject19_RESERVEDOSObject18Ev -__ZN8OSObject19_RESERVEDOSObject19Ev -__ZN8OSObject19_RESERVEDOSObject20Ev -__ZN8OSObject19_RESERVEDOSObject21Ev -__ZN8OSObject19_RESERVEDOSObject22Ev -__ZN8OSObject19_RESERVEDOSObject23Ev -__ZN8OSObject19_RESERVEDOSObject24Ev -__ZN8OSObject19_RESERVEDOSObject25Ev -__ZN8OSObject19_RESERVEDOSObject26Ev -__ZN8OSObject19_RESERVEDOSObject27Ev -__ZN8OSObject19_RESERVEDOSObject28Ev -__ZN8OSObject19_RESERVEDOSObject29Ev -__ZN8OSObject19_RESERVEDOSObject30Ev -__ZN8OSObject19_RESERVEDOSObject31Ev -_bcopy_nc -_bzero_nc -_sprintf -_strcat -_strcpy diff --git a/config/Libkern.x86_64.exports b/config/Libkern.x86_64.exports index 639d10368..c42f577d8 100644 --- a/config/Libkern.x86_64.exports +++ b/config/Libkern.x86_64.exports @@ -1,8 +1,10 @@ +_lck_mtx_lock_spin 
+_lck_mtx_try_lock_spin +_lck_mtx_convert_spin _OSAddAtomic64 _OSCompareAndSwap64 __ZN12OSOrderedSet12withCapacityEjPFiPK15OSMetaClassBaseS2_PvES3_ __ZN12OSOrderedSet16initWithCapacityEjPFiPK15OSMetaClassBaseS2_PvES3_ -_gOSKextUnresolved _sprintf _strcat _strcpy diff --git a/config/MACFramework.exports b/config/MACFramework.exports index cba6d7dae..839eadc4f 100644 --- a/config/MACFramework.exports +++ b/config/MACFramework.exports @@ -8,6 +8,8 @@ _mac_label_set _mac_audit_text +_mac_iokit_check_hid_control + _sbuf_cat _sbuf_data _sbuf_delete diff --git a/config/MACFramework.ppc.exports b/config/MACFramework.ppc.exports deleted file mode 100644 index 6006136b4..000000000 --- a/config/MACFramework.ppc.exports +++ /dev/null @@ -1,9 +0,0 @@ -_kau_will_audit -_mac_kalloc -_mac_kalloc_noblock -_mac_kfree -_mac_mbuf_alloc -_mac_mbuf_free -_mac_unwire -_mac_wire -_sysctl__security_mac_children diff --git a/config/Mach.ppc.exports b/config/Mach.ppc.exports deleted file mode 100644 index cc31a814e..000000000 --- a/config/Mach.ppc.exports +++ /dev/null @@ -1 +0,0 @@ -_semaphore_timedwait diff --git a/config/Makefile b/config/Makefile index 9a00f1027..ff2d46ddb 100644 --- a/config/Makefile +++ b/config/Makefile @@ -1,5 +1,3 @@ -MAC = defined - export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule @@ -12,31 +10,22 @@ include $(MakeInc_def) ALL_SUBDIRS = INSTINC_SUBDIRS = - -INSTINC_SUBDIRS_PPC = - INSTINC_SUBDIRS_I386 = - INSTINC_SUBDIRS_X86_64 = - INSTINC_SUBDIRS_ARM = EXPINC_SUBDIRS = - -EXPINC_SUBDIRS_PPC = - EXPINC_SUBDIRS_I386 = - EXPINC_SUBDIRS_X86_64 = - EXPINC_SUBDIRS_ARM = + COMP_SUBDIRS = INST_SUBDIRS = -INSTALL_DATA_LIST= \ +INSTALL_KEXT_PLIST_LIST= \ System.kext/Info.plist \ System.kext/PlugIns/Libkern.kext/Info.plist \ System.kext/PlugIns/Mach.kext/Info.plist \ @@ -48,18 +37,10 @@ INSTALL_DATA_LIST= \ System.kext/PlugIns/IONVRAMFamily.kext/Info.plist \ 
System.kext/PlugIns/IOSystemManagement.kext/Info.plist \ System.kext/PlugIns/Unsupported.kext/Info.plist \ - System.kext/PlugIns/Private.kext/Info.plist \ - \ - System.kext/PlugIns/System6.0.kext/Info.plist \ - System.kext/PlugIns/Libkern6.0.kext/Info.plist \ - System.kext/PlugIns/Mach6.0.kext/Info.plist \ - System.kext/PlugIns/BSDKernel6.0.kext/Info.plist \ - System.kext/PlugIns/IOKit6.0.kext/Info.plist \ + System.kext/PlugIns/Private.kext/Info.plist -INSTALL_DATA_DIR= \ - /System/Library/Extensions/ +INSTALL_KEXT_DIR = /System/Library/Extensions/ -INSTMAN_SUBDIRS = MD_SUPPORTED_KPI_FILENAME="SupportedKPIs-${ARCH_CONFIG_LC}.txt" MI_SUPPORTED_KPI_FILENAME="SupportedKPIs-all-archs.txt" @@ -72,32 +53,39 @@ endif ifeq ($(ARCH_CONFIG),I386) SUPPORT_SYSTEM60_KEXT = 1 -else ifeq ($(ARCH_CONFIG),ARM) -SUPPORT_SYSTEM60_KEXT = 1 else SUPPORT_SYSTEM60_KEXT = 0 endif +ifeq ($(SUPPORT_SYSTEM60_KEXT),1) +INSTALL_KEXT_PLIST_LIST += \ + System.kext/PlugIns/System6.0.kext/Info.plist \ + System.kext/PlugIns/Libkern6.0.kext/Info.plist \ + System.kext/PlugIns/Mach6.0.kext/Info.plist \ + System.kext/PlugIns/BSDKernel6.0.kext/Info.plist \ + System.kext/PlugIns/IOKit6.0.kext/Info.plist +endif + SYMBOL_COMPONENT_LIST = \ System6.0 \ BSDKernel \ IOKit \ - Libkern \ - Mach \ - Unsupported \ - Private - -ifdef MAC -SYMBOL_COMPONENT_LIST += MACFramework -MACFRAMEWORKEXPORTS = \ - -export $(SRCROOT)/$(COMPONENT)/MACFramework.exports \ - -export $(SRCROOT)/$(COMPONENT)/MACFramework.$(ARCH_CONFIG_LC).exports -endif + Libkern \ + Mach \ + MACFramework \ + Unsupported \ + Private SYMBOL_SET_BUILD = $(foreach set, $(SYMBOL_COMPONENT_LIST), $(OBJPATH)/$(set).symbolset) SYMBOL_SET_FAT = $(foreach set, $(SYMBOL_COMPONENT_LIST), $(OBJROOT)/$(set).symbolset) -## .SUFFIXES: .symbolset .symbollist +INSTALL_KEXT_PLISTS = $(addprefix $(DSTROOT)$(INSTALL_KEXT_DIR), $(INSTALL_KEXT_PLIST_LIST)) + +$(INSTALL_KEXT_PLISTS): $(DSTROOT)$(INSTALL_KEXT_DIR)% : $(SOURCE)/% + @echo Install $< in $@ + $(_v)$(MKDIR) 
$(dir $@); \ + $(RM) $(RMFLAGS) $@; \ + $(INSTALL) $(DATA_INSTALL_FLAGS) $< $(dir $@) $(OBJPATH)/allsymbols: $(OBJPATH)/mach_kernel $(_v)$(NM) -gj $< > $@ @@ -143,36 +131,20 @@ $(SYMBOL_SET_FAT): $(OBJROOT)/%.symbolset : printf "" > $@; \ fi -build_symbol_sets: $(SYMBOL_SET_BUILD) +build_symbol_sets: $(SYMBOL_SET_BUILD) $(OBJPATH)/allsymbols $(_v)$(KEXT_CREATE_SYMBOL_SET) \ $($(addsuffix $(ARCH_CONFIG),ARCH_FLAGS_ALL_)) \ + $(foreach comp,$(filter-out System6.0 Private,$(SYMBOL_COMPONENT_LIST)), \ + -export $(SRCROOT)/$(COMPONENT)/$(comp).exports \ + -export $(SRCROOT)/$(COMPONENT)/$(comp).$(ARCH_CONFIG_LC).exports) \ -import $(OBJPATH)/allsymbols \ - -export $(SRCROOT)/$(COMPONENT)/Libkern.exports \ - -export $(SRCROOT)/$(COMPONENT)/Libkern.$(ARCH_CONFIG_LC).exports \ - -export $(SRCROOT)/$(COMPONENT)/Mach.exports \ - -export $(SRCROOT)/$(COMPONENT)/Mach.$(ARCH_CONFIG_LC).exports \ - -export $(SRCROOT)/$(COMPONENT)/IOKit.exports \ - -export $(SRCROOT)/$(COMPONENT)/IOKit.$(ARCH_CONFIG_LC).exports \ - -export $(SRCROOT)/$(COMPONENT)/BSDKernel.exports \ - -export $(SRCROOT)/$(COMPONENT)/BSDKernel.$(ARCH_CONFIG_LC).exports \ - $(MACFRAMEWORKEXPORTS) \ - -export $(SRCROOT)/$(COMPONENT)/Unsupported.exports \ - -export $(SRCROOT)/$(COMPONENT)/Unsupported.$(ARCH_CONFIG_LC).exports \ -output /dev/null $(_vstdout); $(_v)$(KEXT_CREATE_SYMBOL_SET) \ - $($(addsuffix $(ARCH_CONFIG),ARCH_FLAGS_)) \ + $($(addsuffix $(ARCH_CONFIG),ARCH_FLAGS_ALL_)) \ + $(foreach comp,$(filter-out System6.0 Unsupported,$(SYMBOL_COMPONENT_LIST)), \ + -export $(SRCROOT)/$(COMPONENT)/$(comp).exports \ + -export $(SRCROOT)/$(COMPONENT)/$(comp).$(ARCH_CONFIG_LC).exports) \ -import $(OBJPATH)/allsymbols \ - -export $(SRCROOT)/$(COMPONENT)/Libkern.exports \ - -export $(SRCROOT)/$(COMPONENT)/Libkern.$(ARCH_CONFIG_LC).exports \ - -export $(SRCROOT)/$(COMPONENT)/Mach.exports \ - -export $(SRCROOT)/$(COMPONENT)/Mach.$(ARCH_CONFIG_LC).exports \ - -export $(SRCROOT)/$(COMPONENT)/IOKit.exports \ - -export 
$(SRCROOT)/$(COMPONENT)/IOKit.$(ARCH_CONFIG_LC).exports \ - -export $(SRCROOT)/$(COMPONENT)/BSDKernel.exports \ - -export $(SRCROOT)/$(COMPONENT)/BSDKernel.$(ARCH_CONFIG_LC).exports \ - $(MACFRAMEWORKEXPORTS) \ - -export $(SRCROOT)/$(COMPONENT)/Private.exports \ - -export $(SRCROOT)/$(COMPONENT)/Private.$(ARCH_CONFIG_LC).exports \ -output /dev/null $(_vstdout); $(_v) $(SRCROOT)/$(COMPONENT)/list_supported.sh $(SRCROOT)/$(COMPONENT) $(ARCH_CONFIG_LC) $(OBJPATH)/${MD_SUPPORTED_KPI_FILENAME}; $(_v)if [ -n `echo $${ARCH_CONFIGS%%\ *} | grep -i $(ARCH_CONFIG)` ]; \ @@ -181,41 +153,42 @@ build_symbol_sets: $(SYMBOL_SET_BUILD) fi -install_symbol_sets: $(SYMBOL_SET_FAT) $(SRCROOT)/config/MasterVersion - $(_v)if [ -s "$(OBJROOT)/System6.0.symbolset" ]; then \ - install $(INSTALL_FLAGS) $(OBJROOT)/System6.0.symbolset $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/System6.0.kext/kernel.6.0; \ - install $(INSTALL_FLAGS) $(OBJROOT)/System6.0.symbolset $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/Mach6.0.kext/Mach6.0; \ - install $(INSTALL_FLAGS) $(OBJROOT)/System6.0.symbolset $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/BSDKernel6.0.kext/BSDKernel6.0; \ - install $(INSTALL_FLAGS) $(OBJROOT)/System6.0.symbolset $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/Libkern6.0.kext/Libkern6.0; \ - install $(INSTALL_FLAGS) $(OBJROOT)/System6.0.symbolset $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/IOKit6.0.kext/IOKit6.0; \ +install_symbol_sets: $(SYMBOL_SET_FAT) $(SRCROOT)/config/MasterVersion $(INSTALL_KEXT_PLISTS) + $(_v)if [ -s "$(OBJROOT)/System6.0.symbolset" -a $(SUPPORT_SYSTEM60_KEXT) -eq 1 ]; then \ + install $(INSTALL_FLAGS) $(OBJROOT)/System6.0.symbolset $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/System6.0.kext/kernel.6.0; \ + install $(INSTALL_FLAGS) $(OBJROOT)/System6.0.symbolset $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/Mach6.0.kext/Mach6.0; \ + install $(INSTALL_FLAGS) $(OBJROOT)/System6.0.symbolset 
$(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/BSDKernel6.0.kext/BSDKernel6.0; \ + install $(INSTALL_FLAGS) $(OBJROOT)/System6.0.symbolset $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/Libkern6.0.kext/Libkern6.0; \ + install $(INSTALL_FLAGS) $(OBJROOT)/System6.0.symbolset $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/IOKit6.0.kext/IOKit6.0; \ + fi + $(_v)if [ -s "$(OBJROOT)/BSDKernel.symbolset" ]; then \ + install $(INSTALL_FLAGS) $(OBJROOT)/BSDKernel.symbolset $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/BSDKernel.kext/BSDKernel; \ + install $(INSTALL_FLAGS) $(OBJROOT)/IOKit.symbolset $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/IOKit.kext/IOKit; \ + install $(INSTALL_FLAGS) $(OBJROOT)/Libkern.symbolset $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/Libkern.kext/Libkern; \ + install $(INSTALL_FLAGS) $(OBJROOT)/Mach.symbolset $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/Mach.kext/Mach; \ + install $(INSTALL_FLAGS) $(OBJROOT)/MACFramework.symbolset $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/MACFramework.kext/MACFramework; \ + install $(INSTALL_FLAGS) $(OBJROOT)/Unsupported.symbolset $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/Unsupported.kext/Unsupported; \ + install $(INSTALL_FLAGS) $(OBJROOT)/Private.symbolset $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/Private.kext/Private; \ fi - $(_v)install $(INSTALL_FLAGS) $(OBJROOT)/BSDKernel.symbolset $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/BSDKernel.kext/BSDKernel; - $(_v)install $(INSTALL_FLAGS) $(OBJROOT)/IOKit.symbolset $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/IOKit.kext/IOKit; - $(_v)install $(INSTALL_FLAGS) $(OBJROOT)/Libkern.symbolset $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/Libkern.kext/Libkern; - $(_v)install $(INSTALL_FLAGS) $(OBJROOT)/Mach.symbolset $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/Mach.kext/Mach; - $(_v)install $(INSTALL_FLAGS) $(OBJROOT)/Unsupported.symbolset 
$(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/Unsupported.kext/Unsupported; - $(_v)install $(INSTALL_FLAGS) $(OBJROOT)/Private.symbolset $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/Private.kext/Private; - $(_v)$(NEWVERS) $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/Info.plist \ - $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/AppleNMI.kext/Info.plist \ - $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/ApplePlatformFamily.kext/Info.plist \ - $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/BSDKernel.kext/Info.plist \ - $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/IOKit.kext/Info.plist \ - $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/IONVRAMFamily.kext/Info.plist \ - $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/IOSystemManagement.kext/Info.plist \ - $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/Libkern.kext/Info.plist \ - $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/Mach.kext/Info.plist \ - $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/Unsupported.kext/Info.plist \ - $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/Private.kext/Info.plist; + $(_v)$(NEWVERS) $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/Info.plist \ + $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/AppleNMI.kext/Info.plist \ + $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/ApplePlatformFamily.kext/Info.plist \ + $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/BSDKernel.kext/Info.plist \ + $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/IOKit.kext/Info.plist \ + $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/IONVRAMFamily.kext/Info.plist \ + $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/IOSystemManagement.kext/Info.plist \ + $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/Libkern.kext/Info.plist \ + $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/Mach.kext/Info.plist \ + $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/MACFramework.kext/Info.plist \ + 
$(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/Unsupported.kext/Info.plist \ + $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/Private.kext/Info.plist; $(_v)$(MKDIR) $(DSTROOT)/$(KRESDIR); $(_v)install $(INSTALL_FLAGS) $(OBJPATH)/$(MD_SUPPORTED_KPI_FILENAME) $(DSTROOT)/$(KRESDIR); $(_v)if [ -n `echo $${ARCH_CONFIGS%%\ *} | grep -i $(ARCH_CONFIG)` ]; then \ install $(INSTALL_FLAGS) $(OBJROOT)/$(MI_SUPPORTED_KPI_FILENAME) $(DSTROOT)/$(KRESDIR); \ fi -ifdef MAC - $(_v)install $(INSTALL_FLAGS) $(OBJROOT)/MACFramework.symbolset $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/MACFramework.kext/MACFramework; - $(_v)$(NEWVERS) $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/MACFramework.kext/Info.plist -endif - $(_v)$(CP) -rf $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext $(SYMROOT) + $(_v)$(MKDIR) $(SYMROOT) + $(_v)$(CP) -rf $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext $(SYMROOT) do_build_all: build_symbol_sets diff --git a/config/MasterVersion b/config/MasterVersion index 237d3331d..8f5b9dd34 100644 --- a/config/MasterVersion +++ b/config/MasterVersion @@ -1,4 +1,4 @@ -10.8.0 +11.0.0 # The first line of this file contains the master version number for the kernel. # All other instances of the kernel version in xnu are derived from this file. 
diff --git a/config/Private.exports b/config/Private.exports index fb730cba2..299cabf8e 100644 --- a/config/Private.exports +++ b/config/Private.exports @@ -15,15 +15,21 @@ _bdevsw _boot _bsd_hostname _bsd_set_dependency_capable +_buf_create_shadow _buf_getcpaddr _buf_setcpaddr _buf_setfilter +_buf_shadow _cdevsw +_cdevsw_setkqueueok _clalloc _clfree _cons_cinput _cp_key_store_action _cp_register_wraps +_cs_entitlements_blob_get +_ctl_id_by_name +_ctl_name_by_id _fd_rdwr _get_aiotask _hz @@ -38,7 +44,14 @@ _ip_mutex _ip_output _ip_protox _ipc_port_release_send +_kauth_cred_getgroups +_kauth_cred_guid2grnam +_kauth_cred_guid2pwnam +_kauth_cred_grnam2guid +_kauth_cred_pwnam2guid +_kdp_register_link _kdp_set_interface +_kdp_unregister_link _kdp_unregister_send_receive _kmem_alloc_kobject _linesw @@ -55,9 +68,7 @@ _m_pullup _m_split _m_trailingspace:_mbuf_trailingspace _mac_proc_set_enforce -_mbuf_get_priority -_mbuf_get_traffic_class -_mbuf_set_traffic_class +_mbuf_get_priority:_mbuf_get_traffic_class _mcl_to_paddr _mountroot_post_hook _net_add_domain @@ -65,7 +76,7 @@ _net_add_proto _net_del_domain _net_del_proto _netboot_root -_perf_monitor_register +_perf_monitor_register_* _perf_monitor_unregister _pffinddomain _pffindproto @@ -111,6 +122,7 @@ _q_to_b _register_decmpfs_decompressor _rootdev _rootvp +_rtfree _sbappendaddr _sbappendrecord _sbflush @@ -121,6 +133,7 @@ _socantsendmore _sock_getlistener _sock_release _sock_retain +_sock_setupcall _sodisconnect _sofree _sofreelastref @@ -137,13 +150,20 @@ _soreserve _sorwakeup _sosend _termioschars -_thread_tid +_thread_clear_eager_preempt _thread_dispatchqaddr +_thread_set_eager_preempt +_thread_tid _throttle_info_create _throttle_info_mount_ref _throttle_info_mount_rel _throttle_info_release _throttle_info_update +_throttle_info_ref_by_mask +_throttle_info_rel_by_mask +_throttle_info_update_by_mask +_throttle_lowpri_io +_throttle_set_thread_io_policy _timeout _tk_nin _tk_rawcc @@ -167,15 +187,26 @@ 
_unmountroot_pre_hook _unputc _unregister_decmpfs_decompressor _untimeout +_vnode_isdyldsharedcache _vnode_ismonitored _vnode_notify +_vnop_compound_open_desc +_vnop_compound_mkdir_desc +_vnop_compound_remove_desc +_vnop_compound_rename_desc +_vnop_compound_rmdir_desc _vnop_monitor_desc _vfs_context_bind _vfs_context_get_special_port _vfs_context_set_special_port +_vfs_devvp +_vfs_getattr +_vfs_getbyid _vfs_get_notify_attributes _vfs_mntlabel +_vfs_setcompoundopen _vfs_setunmountpreflight +_vfs_throttle_mask _vfs_vnodecovered _vm_map_copy_copy _vm_map_copy_discard @@ -184,5 +215,6 @@ _vm_map_copyin_common _vm_map_copyout _vn_getpath_fsenter _vn_searchfs_inappropriate_name +_vnode_lookup_continue_needed _sock_settclassopt _sock_gettclassopt diff --git a/config/Private.i386.exports b/config/Private.i386.exports index 5ff0653e9..b6b05d103 100644 --- a/config/Private.i386.exports +++ b/config/Private.i386.exports @@ -1,14 +1,35 @@ +_IOGetBootKeyStoreData +_SHA256_Final +_SHA256_Init +_SHA256_Update +__ZN22IOInterruptEventSource7warmCPUEy _acpi_install_wake_handler _acpi_sleep_kernel _add_fsevent _apic_table +_apply_func_phys _cpu_to_lapic _cpuid_features _cpuid_info -_gOSKextUnresolved _lapic_end_of_interrupt _lapic_unmask_perfcnt_interrupt _mp_broadcast _mp_cpus_call +_mp_cpus_call1 _need_fsevent +_pal_efi_call_in_32bit_mode +_pal_efi_call_in_64bit_mode +_pal_machine_sleep _smp_initialized +_vfs_addtrigger +_vfs_istraditionaltrigger +_vfs_resolver_auxiliary +_vfs_resolver_result +_vfs_resolver_sequence +_vfs_resolver_status +_vfs_settriggercallback +_vnode_trigger_update +_xts_decrypt +_xts_done +_xts_encrypt +_xts_start diff --git a/config/Private.ppc.exports b/config/Private.ppc.exports deleted file mode 100644 index 0f0b58c19..000000000 --- a/config/Private.ppc.exports +++ /dev/null @@ -1,2 +0,0 @@ -_add_fsevent -_need_fsevent diff --git a/config/Private.x86_64.exports b/config/Private.x86_64.exports index 9748fcbe7..a19ab484b 100644 --- 
a/config/Private.x86_64.exports +++ b/config/Private.x86_64.exports @@ -1,7 +1,13 @@ +_IOGetBootKeyStoreData +_SHA256_Final +_SHA256_Init +_SHA256_Update +__ZN22IOInterruptEventSource7warmCPUEy _acpi_install_wake_handler _acpi_sleep_kernel _add_fsevent _apic_table +_apply_func_phys _cpu_to_lapic _cpuid_features _cpuid_info @@ -9,7 +15,23 @@ _lapic_end_of_interrupt _lapic_unmask_perfcnt_interrupt _mp_broadcast _mp_cpus_call +_mp_cpus_call1 _need_fsevent +_pal_efi_call_in_32bit_mode +_pal_efi_call_in_64bit_mode _semaphore_timedwait _smp_initialized _kext_get_vm_map +_pal_machine_sleep +_vfs_addtrigger +_vfs_istraditionaltrigger +_vfs_resolver_auxiliary +_vfs_resolver_result +_vfs_resolver_sequence +_vfs_resolver_status +_vfs_settriggercallback +_vnode_trigger_update +_xts_decrypt +_xts_done +_xts_encrypt +_xts_start diff --git a/config/System6.0.exports b/config/System6.0.exports index 75146568c..c3d167834 100644 --- a/config/System6.0.exports +++ b/config/System6.0.exports @@ -620,7 +620,6 @@ __ZN13IOCommandGate10runCommandEPvS0_S0_S0_ __ZN13IOCommandGate10superClassE __ZN13IOCommandGate11commandGateEP8OSObjectPFiS1_PvS2_S2_S2_E __ZN13IOCommandGate11setWorkLoopEP10IOWorkLoop -__ZN13IOCommandGate12checkForWorkEv __ZN13IOCommandGate12commandSleepEPv12UnsignedWidem __ZN13IOCommandGate12commandSleepEPvm __ZN13IOCommandGate13attemptActionEPFiP8OSObjectPvS2_S2_S2_ES2_S2_S2_S2_ @@ -689,6 +688,7 @@ __ZN13IOEventSource23_RESERVEDIOEventSource4Ev __ZN13IOEventSource23_RESERVEDIOEventSource5Ev __ZN13IOEventSource23_RESERVEDIOEventSource6Ev __ZN13IOEventSource23_RESERVEDIOEventSource7Ev +__ZN13IOEventSource4freeEv __ZN13IOEventSource4initEP8OSObjectPFvS1_zE __ZN13IOEventSource6enableEv __ZN13IOEventSource7disableEv @@ -699,6 +699,7 @@ __ZN13IOEventSource9MetaClassC2Ev __ZN13IOEventSource9closeGateEv __ZN13IOEventSource9metaClassE __ZN13IOEventSource9setActionEPFvP8OSObjectzE +__ZN13IOEventSource12checkForWorkEv __ZN13IOEventSource9sleepGateEPv12UnsignedWidem 
__ZN13IOEventSource9sleepGateEPvm __ZN13IOEventSourceC1EPK11OSMetaClass @@ -1241,7 +1242,6 @@ __ZN18IOTimerEventSource10wakeAtTimeE12UnsignedWide __ZN18IOTimerEventSource10wakeAtTimeE13mach_timespec __ZN18IOTimerEventSource10wakeAtTimeEmm __ZN18IOTimerEventSource11setWorkLoopEP10IOWorkLoop -__ZN18IOTimerEventSource12checkForWorkEv __ZN18IOTimerEventSource12setTimeoutMSEm __ZN18IOTimerEventSource12setTimeoutUSEm __ZN18IOTimerEventSource12wakeAtTimeMSEm @@ -2816,7 +2816,6 @@ __start _absolutetime_to_nanoseconds _acknowledgeSleepWakeNotification _appleClut8 -_argstrcpy _assert_wait _assert_wait_timeout _atoi @@ -2952,7 +2951,6 @@ _get_inpcb_str_size _get_kernel_symfile _get_procrustime _get_task_map -_getval _invalidate_icache _invalidate_icache64 _iokit_add_reference @@ -2970,7 +2968,6 @@ _iokit_version_variant:_version_variant _ipc_port_release_send _is_suser _is_suser1 -_isargsep _kOSBooleanFalse _kOSBooleanTrue _kalloc @@ -3150,7 +3147,6 @@ _thread_call_is_delayed _thread_cancel_timer _thread_deallocate _thread_flavor_array -_thread_funnel_set _thread_policy_set _thread_reference _thread_set_timer diff --git a/config/System6.0.i386.exports b/config/System6.0.i386.exports index 5cb3b501c..f3955791d 100644 --- a/config/System6.0.i386.exports +++ b/config/System6.0.i386.exports @@ -18,12 +18,12 @@ _lapic_end_of_interrupt _ml_get_max_cpus _mp_broadcast _mp_cpus_call +_mp_cpus_call1 _mp_rendezvous_no_intrs -_mtrr_range_add -_mtrr_range_remove _rtc_clock_stepped _rtc_clock_stepping _smp_initialized _sprintf _strcat _strcpy +_thread_funnel_set diff --git a/config/System6.0.ppc.exports b/config/System6.0.ppc.exports deleted file mode 100644 index 6b9d3ed8c..000000000 --- a/config/System6.0.ppc.exports +++ /dev/null @@ -1,256 +0,0 @@ -_CallTVector -_OSDequeueAtomic -_OSEnqueueAtomic -_PE_Determine_Clock_Speeds -_PE_find_scc -_PE_init_taproot -_PE_parse_boot_arg -_PE_read_write_time_of_day -_PE_write_IIC -_PPCcalls -_ResetHandler -__Z11IODBDMAStopPV23IODBDMAChannelRegisters 
-__Z12IODBDMAFlushPV23IODBDMAChannelRegisters -__Z12IODBDMAPausePV23IODBDMAChannelRegisters -__Z12IODBDMAResetPV23IODBDMAChannelRegisters -__Z12IODBDMAStartPV23IODBDMAChannelRegistersPV17IODBDMADescriptor -__Z15IODBDMAContinuePV23IODBDMAChannelRegisters -__Z32IOFreePhysicallyContiguousMemoryPjj -__Z36IOAllocatePhysicallyContiguousMemoryjjPjPm -__ZN10AppleMacIO10deleteListEv -__ZN10AppleMacIO10gMetaClassE -__ZN10AppleMacIO10processNubEP9IOService -__ZN10AppleMacIO10superClassE -__ZN10AppleMacIO11excludeListEv -__ZN10AppleMacIO12publishBelowEP15IORegistryEntry -__ZN10AppleMacIO15getNubResourcesEP9IOService -__ZN10AppleMacIO20_RESERVEDAppleMacIO0Ev -__ZN10AppleMacIO20_RESERVEDAppleMacIO1Ev -__ZN10AppleMacIO20_RESERVEDAppleMacIO2Ev -__ZN10AppleMacIO20_RESERVEDAppleMacIO3Ev -__ZN10AppleMacIO5startEP9IOService -__ZN10AppleMacIO8selfTestEv -__ZN10AppleMacIO9MetaClassC1Ev -__ZN10AppleMacIO9MetaClassC2Ev -__ZN10AppleMacIO9createNubEP15IORegistryEntry -__ZN10AppleMacIO9metaClassE -__ZN10AppleMacIOC1EPK11OSMetaClass -__ZN10AppleMacIOC2EPK11OSMetaClass -__ZN10AppleMacIOD0Ev -__ZN10AppleMacIOD2Ev -__ZN10AppleNVRAM10gMetaClassE -__ZN10AppleNVRAM10superClassE -__ZN10AppleNVRAM4readEmPhm -__ZN10AppleNVRAM5startEP9IOService -__ZN10AppleNVRAM5writeEmPhm -__ZN10AppleNVRAM9MetaClassC1Ev -__ZN10AppleNVRAM9MetaClassC2Ev -__ZN10AppleNVRAM9metaClassE -__ZN10AppleNVRAMC1EPK11OSMetaClass -__ZN10AppleNVRAMC1Ev -__ZN10AppleNVRAMC2EPK11OSMetaClass -__ZN10AppleNVRAMC2Ev -__ZN10AppleNVRAMD0Ev -__ZN10AppleNVRAMD2Ev -__ZN11IOMemoryMap19setMemoryDescriptorEP18IOMemoryDescriptory -__ZN11IOMemoryMap8redirectEP18IOMemoryDescriptormm -__ZN11IOMemoryMap8redirectEP18IOMemoryDescriptormy -__ZN16AppleMacIODevice10gMetaClassE -__ZN16AppleMacIODevice10superClassE -__ZN16AppleMacIODevice12getResourcesEv -__ZN16AppleMacIODevice13matchLocationEP9IOService -__ZN16AppleMacIODevice26_RESERVEDAppleMacIODevice0Ev -__ZN16AppleMacIODevice26_RESERVEDAppleMacIODevice1Ev 
-__ZN16AppleMacIODevice26_RESERVEDAppleMacIODevice2Ev -__ZN16AppleMacIODevice26_RESERVEDAppleMacIODevice3Ev -__ZN16AppleMacIODevice9MetaClassC1Ev -__ZN16AppleMacIODevice9MetaClassC2Ev -__ZN16AppleMacIODevice9metaClassE -__ZN16AppleMacIODeviceC1EPK11OSMetaClass -__ZN16AppleMacIODeviceC1Ev -__ZN16AppleMacIODeviceC2EPK11OSMetaClass -__ZN16AppleMacIODeviceC2Ev -__ZN16AppleMacIODeviceD0Ev -__ZN16AppleMacIODeviceD2Ev -__ZN17IONVRAMController10gMetaClassE -__ZN17IONVRAMController10superClassE -__ZN17IONVRAMController4syncEv -__ZN17IONVRAMController5startEP9IOService -__ZN17IONVRAMController9MetaClassC1Ev -__ZN17IONVRAMController9MetaClassC2Ev -__ZN17IONVRAMController9metaClassE -__ZN17IONVRAMControllerC1EPK11OSMetaClass -__ZN17IONVRAMControllerC2EPK11OSMetaClass -__ZN17IONVRAMControllerD0Ev -__ZN17IONVRAMControllerD2Ev -__ZN19ApplePlatformExpert10deleteListEv -__ZN19ApplePlatformExpert10gMetaClassE -__ZN19ApplePlatformExpert10superClassE -__ZN19ApplePlatformExpert11excludeListEv -__ZN19ApplePlatformExpert14getMachineNameEPci -__ZN19ApplePlatformExpert15getGMTTimeOfDayEv -__ZN19ApplePlatformExpert15setGMTTimeOfDayEl -__ZN19ApplePlatformExpert23registerNVRAMControllerEP17IONVRAMController -__ZN19ApplePlatformExpert29_RESERVEDApplePlatformExpert0Ev -__ZN19ApplePlatformExpert29_RESERVEDApplePlatformExpert1Ev -__ZN19ApplePlatformExpert29_RESERVEDApplePlatformExpert2Ev -__ZN19ApplePlatformExpert29_RESERVEDApplePlatformExpert3Ev -__ZN19ApplePlatformExpert5startEP9IOService -__ZN19ApplePlatformExpert9MetaClassC1Ev -__ZN19ApplePlatformExpert9MetaClassC2Ev -__ZN19ApplePlatformExpert9configureEP9IOService -__ZN19ApplePlatformExpert9metaClassE -__ZN19ApplePlatformExpertC1EPK11OSMetaClass -__ZN19ApplePlatformExpertC2EPK11OSMetaClass -__ZN19ApplePlatformExpertD0Ev -__ZN19ApplePlatformExpertD2Ev -__ZN19IODBDMAMemoryCursor10gMetaClassE -__ZN19IODBDMAMemoryCursor10superClassE -__ZN19IODBDMAMemoryCursor13outputSegmentEN14IOMemoryCursor15PhysicalSegmentEPvm 
-__ZN19IODBDMAMemoryCursor17withSpecificationEmmm -__ZN19IODBDMAMemoryCursor21initWithSpecificationEmmm -__ZN19IODBDMAMemoryCursor9MetaClassC1Ev -__ZN19IODBDMAMemoryCursor9MetaClassC2Ev -__ZN19IODBDMAMemoryCursor9metaClassE -__ZN19IODBDMAMemoryCursorC1EPK11OSMetaClass -__ZN19IODBDMAMemoryCursorC1Ev -__ZN19IODBDMAMemoryCursorC2EPK11OSMetaClass -__ZN19IODBDMAMemoryCursorC2Ev -__ZN19IODBDMAMemoryCursorD0Ev -__ZN19IODBDMAMemoryCursorD2Ev -__ZN24IOBufferMemoryDescriptor22inTaskWithPhysicalMaskEP4taskmyy -__ZN8AppleCPU10gMetaClassE -__ZN8AppleCPU10getCPUNameEv -__ZN8AppleCPU10quiesceCPUEv -__ZN8AppleCPU10superClassE -__ZN8AppleCPU5startEP9IOService -__ZN8AppleCPU7haltCPUEv -__ZN8AppleCPU7initCPUEb -__ZN8AppleCPU8startCPUEjj -__ZN8AppleCPU9MetaClassC1Ev -__ZN8AppleCPU9MetaClassC2Ev -__ZN8AppleCPU9metaClassE -__ZN8AppleCPUC1EPK11OSMetaClass -__ZN8AppleCPUC1Ev -__ZN8AppleCPUC2EPK11OSMetaClass -__ZN8AppleCPUC2Ev -__ZN8AppleCPUD0Ev -__ZN8AppleCPUD2Ev -__ZN8AppleNMI10gMetaClassE -__ZN8AppleNMI10superClassE -__ZN8AppleNMI15handleInterruptEPvP9IOServicei -__ZN8AppleNMI18_RESERVEDAppleNMI0Ev -__ZN8AppleNMI18_RESERVEDAppleNMI1Ev -__ZN8AppleNMI18_RESERVEDAppleNMI2Ev -__ZN8AppleNMI18_RESERVEDAppleNMI3Ev -__ZN8AppleNMI22powerStateWillChangeToEmmP9IOService -__ZN8AppleNMI5startEP9IOService -__ZN8AppleNMI7initNMIEP21IOInterruptControllerP6OSData -__ZN8AppleNMI9MetaClassC1Ev -__ZN8AppleNMI9MetaClassC2Ev -__ZN8AppleNMI9metaClassE -__ZN8AppleNMIC1EPK11OSMetaClass -__ZN8AppleNMIC1Ev -__ZN8AppleNMIC2EPK11OSMetaClass -__ZN8AppleNMIC2Ev -__ZN8AppleNMID0Ev -__ZN8AppleNMID2Ev -__ZN8OSObject19_RESERVEDOSObject16Ev -__ZN8OSObject19_RESERVEDOSObject17Ev -__ZN8OSObject19_RESERVEDOSObject18Ev -__ZN8OSObject19_RESERVEDOSObject19Ev -__ZN8OSObject19_RESERVEDOSObject20Ev -__ZN8OSObject19_RESERVEDOSObject21Ev -__ZN8OSObject19_RESERVEDOSObject22Ev -__ZN8OSObject19_RESERVEDOSObject23Ev -__ZN8OSObject19_RESERVEDOSObject24Ev -__ZN8OSObject19_RESERVEDOSObject25Ev -__ZN8OSObject19_RESERVEDOSObject26Ev 
-__ZN8OSObject19_RESERVEDOSObject27Ev -__ZN8OSObject19_RESERVEDOSObject28Ev -__ZN8OSObject19_RESERVEDOSObject29Ev -__ZN8OSObject19_RESERVEDOSObject30Ev -__ZN8OSObject19_RESERVEDOSObject31Ev -__ZN9IOService20_RESERVEDIOService48Ev -__ZN9IOService20_RESERVEDIOService49Ev -__ZN9IOService20_RESERVEDIOService50Ev -__ZN9IOService20_RESERVEDIOService51Ev -__ZN9IOService20_RESERVEDIOService52Ev -__ZN9IOService20_RESERVEDIOService53Ev -__ZN9IOService20_RESERVEDIOService54Ev -__ZN9IOService20_RESERVEDIOService55Ev -__ZN9IOService20_RESERVEDIOService56Ev -__ZN9IOService20_RESERVEDIOService57Ev -__ZN9IOService20_RESERVEDIOService58Ev -__ZN9IOService20_RESERVEDIOService59Ev -__ZN9IOService20_RESERVEDIOService60Ev -__ZN9IOService20_RESERVEDIOService61Ev -__ZN9IOService20_RESERVEDIOService62Ev -__ZN9IOService20_RESERVEDIOService63Ev -__ZNK10AppleMacIO12getMetaClassEv -__ZNK10AppleMacIO14compareNubNameEPK9IOServiceP8OSStringPS4_ -__ZNK10AppleMacIO9MetaClass5allocEv -__ZNK10AppleNVRAM12getMetaClassEv -__ZNK10AppleNVRAM9MetaClass5allocEv -__ZNK16AppleMacIODevice11compareNameEP8OSStringPS1_ -__ZNK16AppleMacIODevice12getMetaClassEv -__ZNK16AppleMacIODevice9MetaClass5allocEv -__ZNK17IONVRAMController12getMetaClassEv -__ZNK17IONVRAMController9MetaClass5allocEv -__ZNK19ApplePlatformExpert12getMetaClassEv -__ZNK19ApplePlatformExpert9MetaClass5allocEv -__ZNK19IODBDMAMemoryCursor12getMetaClassEv -__ZNK19IODBDMAMemoryCursor9MetaClass5allocEv -__ZNK8AppleCPU12getMetaClassEv -__ZNK8AppleCPU9MetaClass5allocEv -__ZNK8AppleNMI12getMetaClassEv -__ZNK8AppleNMI9MetaClass5allocEv -__ZTV10AppleMacIO -__ZTV10AppleNVRAM -__ZTV16AppleMacIODevice -__ZTV17IONVRAMController -__ZTV19ApplePlatformExpert -__ZTV19IODBDMAMemoryCursor -__ZTV8AppleCPU -__ZTV8AppleNMI -__ZTVN10AppleMacIO9MetaClassE -__ZTVN10AppleNVRAM9MetaClassE -__ZTVN16AppleMacIODevice9MetaClassE -__ZTVN17IONVRAMController9MetaClassE -__ZTVN19ApplePlatformExpert9MetaClassE -__ZTVN19IODBDMAMemoryCursor9MetaClassE -__ZTVN8AppleCPU9MetaClassE 
-__ZTVN8AppleNMI9MetaClassE -__eSynchronizeIO -_abs -_bcopy_nc -_bzero_nc -_cacheDisable -_cacheInit -_delay_for_interval -_gGetDefaultBusSpeedsKey -_get_io_base_addr -_get_preemption_level -_hfs_addconverter -_hfs_remconverter -_ignore_zero_fault -_killprint -_kprintf_lock -_mapping_prealloc -_mapping_relpre -_ml_enable_cache_level -_ml_enable_nap -_ml_mem_backoff -_ml_ppc_sleep -_ml_set_processor_speed -_ml_set_processor_voltage -_ml_throttle -_pe_do_clock_test -_pe_run_clock_test -_pmsRunLocal -_rc4_crypt -_rc4_init -_scc -_sprintf -_strcat -_strcpy diff --git a/config/Unsupported.exports b/config/Unsupported.exports index 8886533d8..374517b7e 100644 --- a/config/Unsupported.exports +++ b/config/Unsupported.exports @@ -6,6 +6,8 @@ _KUNCUserNotificationDisplayAlert _KUNCUserNotificationDisplayFromBundle _KUNCUserNotificationDisplayNotice _NDR_record +_OSSpinLockTry +_OSSpinLockUnlock _PE_kputc __Z22OSFlushObjectTrackListv __ZN15IOWatchDogTimer10gMetaClassE @@ -49,6 +51,7 @@ __ZN9IODTNVRAM26calculatePartitionChecksumEPh __ZN9IODTNVRAM9metaClassE __ZN9IODTNVRAMC2EPK11OSMetaClass __ZN9IODTNVRAMD2Ev +__ZN9IODTNVRAM10safeToSyncEv __ZNK15IOWatchDogTimer12getMetaClassEv __ZNK15IOWatchDogTimer9MetaClass5allocEv __ZNK9IODTNVRAM17getOFVariablePermEPK8OSSymbol @@ -64,6 +67,7 @@ _aes_decrypt_key _aes_decrypt_key128 _aes_decrypt_key256 _aes_encrypt_cbc +_aes_encrypt_key _aes_encrypt_key128 _aes_encrypt_key256 _appleClut8 @@ -93,7 +97,6 @@ _host_get_special_port _host_priv_self _hz _ipc_kernel_map -_ipflow_fastforward _kalloc _kauth_cred_issuser _kauth_cred_label_update @@ -115,7 +118,11 @@ _ldisc_deregister _ldisc_register _log _mach_gss_accept_sec_context +_mach_gss_accept_sec_context_v2 +_mach_gss_hold_cred _mach_gss_init_sec_context +_mach_gss_init_sec_context_v2 +_mach_gss_unhold_cred _mach_make_memory_entry_64 _mach_memory_entry_page_op _mach_memory_entry_range_op @@ -159,7 +166,6 @@ _thread_notrigger _thread_tid _tsleep _vfs_context_current -_vfs_setlocklocal 
_vfs_update_vfsstat _vm_allocate _vm_deallocate diff --git a/config/Unsupported.i386.exports b/config/Unsupported.i386.exports index 66029e241..38b70f0ff 100644 --- a/config/Unsupported.i386.exports +++ b/config/Unsupported.i386.exports @@ -23,7 +23,7 @@ _kernel_thread _lapic_set_perfcnt_interrupt_mask _lapic_set_pmi_func _lo_ifp -_m_adj +_m_adj:_mbuf_adj _m_cat _m_copydata _m_copym @@ -41,7 +41,7 @@ _m_split _m_trailingspace:_mbuf_trailingspace _mach_msg_rpc_from_kernel _mach_msg_send_from_kernel_with_options -_mcl_to_paddr +_mcl_to_paddr:_mbuf_data_to_physical _ml_get_apicid _ml_get_maxbusdelay _ml_get_maxsnoop diff --git a/config/Unsupported.ppc.exports b/config/Unsupported.ppc.exports deleted file mode 100644 index fbc85ede8..000000000 --- a/config/Unsupported.ppc.exports +++ /dev/null @@ -1,118 +0,0 @@ -_CallTVector -_PPCcalls -_PE_write_IIC -__ZN19IODBDMAMemoryCursor13outputSegmentEN14IOMemoryCursor15PhysicalSegmentEPvm -__ZN9IODTNVRAM17getOWVariableInfoEmPPK8OSSymbolPmS4_ -__ZN9IODTNVRAM19convertObjectToPropEPhPmPK8OSSymbolP8OSObject -__ZN9IODTNVRAM19convertPropToObjectEPhmS0_mPPK8OSSymbolPP8OSObject -__ZN9IODTNVRAM19searchNVRAMPropertyEP17IONVRAMDescriptorPm -__ZN9IODTNVRAM19unescapeBytesToDataEPKhm -_domains -_get_preemption_level -_ignore_zero_fault -_ifunit -_in6addr_local -_in_broadcast -_inaddr_local -_inet_domain_mutex -_ip_mutex -_ip_output -_ip_protox -_killprint -_kernel_flock -_kernel_thread -_lo_ifp -_mapping_prealloc -_mapping_relpre -_m_adj -_m_cat -_m_copydata -_m_copym -_m_free:_mbuf_free -_m_freem:_mbuf_freem -_m_get -_m_gethdr -_m_getpacket -_m_getpackets -_m_mclget -_m_mtod -_m_prepend_2 -_m_pullup -_m_split -_m_trailingspace:_mbuf_trailingspace -_mcl_to_paddr -_ml_enable_cache_level -_ml_enable_nap -_ml_ppc_sleep -_ml_set_processor_speed -_ml_set_processor_voltage -_ml_throttle -_nd6_storelladdr -_net_add_domain -_net_add_proto -_net_del_domain -_net_del_proto -_pffinddomain -_pffindproto -_pmsStart -_pmsPark -_pmsRun -_pmsRunLocal 
-_pmsBuild -_pru_abort_notsupp -_pru_accept_notsupp -_pru_bind_notsupp -_pru_connect2_notsupp -_pru_connect_notsupp -_pru_disconnect_notsupp -_pru_listen_notsupp -_pru_peeraddr_notsupp -_pru_rcvd_notsupp -_pru_rcvoob_notsupp -_pru_send_notsupp -_pru_sense_null -_pru_shutdown_notsupp -_pru_sockaddr_notsupp -_pru_sopoll_notsupp -_ml_mem_backoff -_sbappendaddr -_sbappendrecord -_sbflush -_sbspace -_soabort -_sobind -_socantrcvmore -_socantsendmore -_sock_getlistener -_sock_release -_sock_retain -_soclose -_soconnect -_socreate -_sodisconnect -_sofree -_sofreelastref -_soisconnected -_soisconnecting -_soisdisconnected -_soisdisconnecting -_sonewconn -_sooptcopyin -_sooptcopyout -_sopoll -_soreceive -_soreserve -_sorwakeup -_sosend -_sosetopt -_tcbinfo -_thread_call_func -_thread_call_func_cancel -_thread_call_func_delayed -_thread_call_is_delayed -_thread_cancel_timer -_thread_funnel_set -_thread_set_timer -_thread_set_timer_deadline -_udbinfo -_clock_get_system_value diff --git a/config/Unsupported.x86_64.exports b/config/Unsupported.x86_64.exports index 79dce8fdc..9413c7dec 100644 --- a/config/Unsupported.x86_64.exports +++ b/config/Unsupported.x86_64.exports @@ -32,3 +32,4 @@ _tmrCvt _tsc_get_info _hibernate_vm_lock _hibernate_vm_unlock + diff --git a/config/version.c b/config/version.c index d916cbee8..4870d134c 100644 --- a/config/version.c +++ b/config/version.c @@ -46,3 +46,5 @@ const char osbuilder[] = "###KERNEL_BUILDER###"; const char osrelease[] = OSRELEASE; const char ostype[] = OSTYPE; char osversion[OSVERSIZE]; + +__private_extern__ const char compiler_version[] = __VERSION__; diff --git a/osfmk/ppc/cpu_affinity.h b/iokit/IOKit/AppleKeyStoreInterface.h similarity index 62% rename from osfmk/ppc/cpu_affinity.h rename to iokit/IOKit/AppleKeyStoreInterface.h index 2e0ae7ce4..02cb776c1 100644 --- a/osfmk/ppc/cpu_affinity.h +++ b/iokit/IOKit/AppleKeyStoreInterface.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007 Apple Inc. All rights reserved. 
+ * Copyright (c) 2010 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,28 +25,36 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#ifdef KERNEL_PRIVATE -#ifndef _PPC_CPU_AFFINITY_H_ -#define _PPC_CPU_AFFINITY_H_ -/* - * Just one hardware affinity set - the whole machine. - * This allows us to give the pretense that PPC supports the affinity policy - * SPI. The kernel will accept affinity hints but effectively ignore them. - * Hence Universal Apps can use platform-independent code. - */ -static inline int ml_get_max_affinity_sets(void) +#ifndef _IOKIT_APPLEKEYSTOREINTERFACE_H +#define _IOKIT_APPLEKEYSTOREINTERFACE_H + +// These are currently duplicate defs with different names +// from AppleKeyStore & CoreStorage + +// aka MAX_KEY_SIZE +#define AKS_MAX_KEY_SIZE 128 + +// aka rawKey +struct aks_raw_key_t { - return 1; -} + uint32_t keybytecount; + uint8_t keybytes[AKS_MAX_KEY_SIZE]; +}; -/* - * Return the single processor set. - */ -static inline processor_set_t ml_affinity_to_pset(__unused int affinity_num) +// aka volumeKey +struct aks_volume_key_t { - return processor_pset(master_processor); -} + uint32_t algorithm; + aks_raw_key_t key; +}; + +// aka AKS_GETKEY +#define AKS_PLATFORM_FUNCTION_GETKEY "getKey" + +// aka kCSFDETargetVEKID +#define PLATFORM_FUNCTION_GET_MEDIA_ENCRYPTION_KEY_UUID "CSFDETargetVEKID" + +#define AKS_SERVICE_PATH "/IOResources/AppleKeyStore" -#endif /* _I386_CPU_AFFINITY_H_ */ -#endif /* KERNEL_PRIVATE */ +#endif /* _IOKIT_APPLEKEYSTOREINTERFACE_H */ diff --git a/iokit/IOKit/IOBufferMemoryDescriptor.h b/iokit/IOKit/IOBufferMemoryDescriptor.h index 391a0460e..f5d504061 100644 --- a/iokit/IOKit/IOBufferMemoryDescriptor.h +++ b/iokit/IOKit/IOBufferMemoryDescriptor.h @@ -86,6 +86,7 @@ class IOBufferMemoryDescriptor : public IOGeneralMemoryDescriptor task_t inTask) APPLE_KEXT_DEPRECATED; /* use withOptions() instead */ #endif /* !__LP64__ */ +public: virtual bool initWithPhysicalMask( task_t 
inTask, IOOptionBits options, @@ -146,7 +147,11 @@ class IOBufferMemoryDescriptor : public IOGeneralMemoryDescriptor kIOMemoryPhysicallyContiguous - pass to request memory be physically contiguous. This option is heavily discouraged. The request may fail if memory is fragmented, may cause large amounts of paging activity, and may take a very long time to execute.
kIOMemoryPageable - pass to request memory be non-wired - the default for kernel allocated memory is wired.
kIOMemoryPurgeable - pass to request memory that may later have its purgeable state set with IOMemoryDescriptor::setPurgeable. Only supported for kIOMemoryPageable allocations.
- kIOMemoryKernelUserShared - pass to request memory that will be mapped into both the kernel and client applications. + kIOMemoryKernelUserShared - pass to request memory that will be mapped into both the kernel and client applications.
+ kIOMapInhibitCache - allocate memory with inhibited cache setting.
+ kIOMapWriteThruCache - allocate memory with writethru cache setting.
+ kIOMapCopybackCache - allocate memory with copyback cache setting.
+ kIOMapWriteCombineCache - allocate memory with writecombined cache setting. @param capacity The number of bytes to allocate. @param alignment The minimum required alignment of the buffer in bytes - 1 is the default for no required alignment. For example, pass 256 to get memory allocated at an address with bits 0-7 zero. @result Returns an instance of class IOBufferMemoryDescriptor to be released by the caller, which will free the memory desriptor and associated buffer. */ @@ -164,7 +169,11 @@ class IOBufferMemoryDescriptor : public IOGeneralMemoryDescriptor @param options Options for the allocation:
kIODirectionOut, kIODirectionIn - set the direction of the I/O transfer.
kIOMemoryPhysicallyContiguous - pass to request memory be physically contiguous. This option is heavily discouraged. The request may fail if memory is fragmented, may cause large amounts of paging activity, and may take a very long time to execute.
- kIOMemoryKernelUserShared - pass to request memory that will be mapped into both the kernel and client applications. + kIOMemoryKernelUserShared - pass to request memory that will be mapped into both the kernel and client applications.
+ kIOMapInhibitCache - allocate memory with inhibited cache setting.
+ kIOMapWriteThruCache - allocate memory with writethru cache setting.
+ kIOMapCopybackCache - allocate memory with copyback cache setting.
+ kIOMapWriteCombineCache - allocate memory with writecombined cache setting. @param capacity The number of bytes to allocate. @param mask The buffer will be allocated with pages such that physical addresses will only have bits set present in physicalMask. For example, pass 0x00000000FFFFFFFFULL for a buffer to be accessed by hardware that has 32 address bits. @result Returns an instance of class IOBufferMemoryDescriptor to be released by the caller, which will free the memory desriptor and associated buffer. */ diff --git a/iokit/IOKit/IOCatalogue.h b/iokit/IOKit/IOCatalogue.h index f087396fd..49f8fb84c 100644 --- a/iokit/IOKit/IOCatalogue.h +++ b/iokit/IOKit/IOCatalogue.h @@ -63,11 +63,11 @@ class IOCatalogue : public OSObject SInt32 generation; /* This stuff is no longer used at all but was exported in prior - * releases, so we keep it around for PPC/i386 only. + * releases, so we keep it around for i386 only. */ -#if __ppc__ || __i386__ +#if __i386__ IOLock * kld_lock; -#endif /* __ppc__ || __i386__ */ +#endif /* __i386__ */ public: /*! @@ -202,9 +202,19 @@ class IOCatalogue : public OSObject /*! @function reset @abstract Return the Catalogue to its initial state. + @discussion + Should only be used by kextd just before it sends all kext personalities down during a rescan. */ void reset(void); + /*! + @function resetAndAddDrivers + @abstract Replace personalities in IOCatalog with those provided. + @discussion + Resets the catalogue with a new set of drivers, preserving matching originals to keep wired memory usage down. + */ + bool resetAndAddDrivers(OSArray * drivers, bool doNubMatching = true); + /*! @function serialize @abstract Serializes the catalog for transport to the user. 
@@ -215,10 +225,10 @@ class IOCatalogue : public OSObject bool serializeData(IOOptionBits kind, OSSerialize * s) const; -/* This stuff is no longer used at all we keep it around for PPC/i386 +/* This stuff is no longer used at all we keep it around for i386 * binary compatibility only. Symbols are no longer exported. */ -#if __ppc__ || __i386__ +#if __i386__ /*! @function recordStartupExtensions @abstract Records extensions made available by the primary booter. @@ -253,7 +263,7 @@ class IOCatalogue : public OSObject removed or wasn't present, KERN_FAILURE otherwise. */ virtual kern_return_t removeKernelLinker(void); -#endif /* __ppc__ || __i386__ */ +#endif /* __i386__ */ private: diff --git a/iokit/IOKit/IOCommandGate.h b/iokit/IOKit/IOCommandGate.h index 1b17b791d..d38c88670 100644 --- a/iokit/IOKit/IOCommandGate.h +++ b/iokit/IOKit/IOCommandGate.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2009 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -83,10 +83,6 @@ member function's parameter list. void *arg2, void *arg3); protected: -/*! - @function checkForWork - @abstract Not used, $link IOEventSource::checkForWork(). */ - virtual bool checkForWork(); /*! @struct ExpansionData @discussion This structure will be used to expand the capablilties of the IOWorkLoop in the future. diff --git a/iokit/IOKit/IODMACommand.h b/iokit/IOKit/IODMACommand.h index a2a2852f3..c8bd8b37a 100644 --- a/iokit/IOKit/IODMACommand.h +++ b/iokit/IOKit/IODMACommand.h @@ -258,7 +258,7 @@ friend class IODMAEventSource; /*! @function setMemoryDescriptor @abstract Sets and resets the DMACommand's current memory descriptor - @discussion The DMA command will configure itself based on the information that it finds in the memory descriptor. It looks for things like the direction of the memory descriptor and whether the current memory descriptor is already mapped into some IOMMU. 
As a programmer convenience it can also prepare the memory descriptor immediately. See prepare(). Note the IODMACommand is designed to used multiple times with a succession of memory descriptors, making the pooling of commands possible. It is an error though to attempt to reset a currently prepared() DMA command. Warning: This routine may block so never try to autoprepare an IODMACommand while in a gated context, i.e. one of the WorkLoops action call outs. + @discussion The DMA command will configure itself based on the information that it finds in the memory descriptor. It looks for things like the direction of the memory descriptor and whether the current memory descriptor is already mapped into some IOMMU. As a programmer convenience it can also prepare the DMA command immediately. See prepare(). Note the IODMACommand is designed to used multiple times with a succession of memory descriptors, making the pooling of commands possible. It is an error though to attempt to reset a currently prepared() DMA command. Warning: This routine may block so never try to autoprepare an IODMACommand while in a gated context, i.e. one of the WorkLoops action call outs. @param mem A pointer to the current I/Os memory descriptor. @param autoPrepare An optional boolean variable that will call the prepare() function automatically after the memory descriptor is processed. Defaults to true. @result Returns kIOReturnSuccess, kIOReturnBusy if currently prepared, kIOReturnNoSpace if the length(mem) >= Maximum Transfer Size or the error codes returned by prepare() (qv). diff --git a/iokit/IOKit/IODataQueueShared.h b/iokit/IOKit/IODataQueueShared.h index 2fa0e9a45..dc4532486 100644 --- a/iokit/IOKit/IODataQueueShared.h +++ b/iokit/IOKit/IODataQueueShared.h @@ -66,7 +66,7 @@ typedef struct _IODataQueueMemory { * @abstract A struct mapping to the appendix region of a data queue. * @discussion This struct is variable sized dependent on the version. 
The struct represents the data queue appendix information. * @field version The version of the queue appendix. - * @field port The notification port associated with this queue. + * @field msgh Mach message header containing the notification mach port associated with this queue. */ typedef struct _IODataQueueAppendix { UInt32 version; diff --git a/iokit/IOKit/IOEventSource.h b/iokit/IOKit/IOEventSource.h index 4afc5aa99..10a392afc 100644 --- a/iokit/IOKit/IOEventSource.h +++ b/iokit/IOKit/IOEventSource.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2000, 2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -44,6 +44,9 @@ HISTORY #include #include +#if IOKITSTATS +#include +#endif __BEGIN_DECLS #include @@ -69,12 +72,12 @@ source may only be a member of 1 linked list chain. If you need to move it between chains than make sure it is removed from the original chain before attempting to move it.

- The IOEventSource makes no attempt to maintain the consitency of it's internal data across multi-threading. It is assumed that the user of these basic tools will protect the data that these objects represent in some sort of device wide instance lock. For example the IOWorkLoop maintains the event chain by handing off change request to its own thread and thus single threading access to its state. + The IOEventSource makes no attempt to maintain the consistency of its internal data across multi-threading. It is assumed that the user of these basic tools will protect the data that these objects represent in some sort of device wide instance lock. For example the IOWorkLoop maintains the event chain by using an IOCommandGate and thus single threading access to its state.

- All subclasses of the IOEventSource are expected to implement the checkForWork() member function. + All subclasses of IOEventSource that wish to perform work on the work-loop thread are expected to implement the checkForWork() member function. As of Mac OS X, 10.7 (Darwin 11), checkForWork is no longer pure virtual, and should not be overridden if there is no work to be done.

- checkForWork() is the key method in this class. It is called by some work-loop when convienient and is expected to evaluate it's internal state and determine if an event has occurred since the last call. In the case of an event having occurred then the instance defined target(owner)/action will be called. The action is stored as an ordinary C function pointer but the first parameter is always the owner. This means that a C++ member function can be used as an action function though this depends on the ABI. + checkForWork() is the key method in this class. It is called by some work-loop when convienient and is expected to evaluate its internal state and determine if an event has occurred since the last call. In the case of an event having occurred then the instance defined target(owner)/action will be called. The action is stored as an ordinary C function pointer but the first parameter is always the owner. This means that a C++ member function can be used as an action function though this depends on the ABI.

Although the eventChainNext variable contains a reference to the next event source in the chain this reference is not retained. The list 'owner' i.e. the client that creates the event, not the work-loop, is expected to retain the source. */ @@ -82,6 +85,9 @@ class IOEventSource : public OSObject { OSDeclareAbstractStructors(IOEventSource) friend class IOWorkLoop; +#if IOKITSTATS + friend class IOStatistics; +#endif public: /*! @@ -125,7 +131,13 @@ is implicitly the first paramter in the target member function's parameter list. /*! @struct ExpansionData @discussion This structure will be used to expand the capablilties of the IOEventSource in the future. */ - struct ExpansionData { }; + struct ExpansionData { +#if IOKITSTATS + struct IOEventSourceCounter *counter; +#else + void *iokitstatsReserved; +#endif + }; /*! @var reserved Reserved for future use. (Internal use only) */ @@ -149,14 +161,19 @@ successfully. */ virtual bool init(OSObject *owner, IOEventSource::Action action = 0); + virtual void free( void ); + /*! @function checkForWork - @abstract Pure Virtual member function used by IOWorkLoop for work + @abstract Virtual member function used by IOWorkLoop for work scheduling. @discussion This function will be called to request a subclass to check -it's internal state for any work to do and then to call out the owner/action. +its internal state for any work to do and then to call out the owner/action. +If this event source never performs any work (e.g. IOCommandGate), this +method should not be overridden. NOTE: This method is no longer declared pure +virtual. A default implementation is provided in IOEventSource. @result Return true if this function needs to be called again before all its outstanding events have been processed. */ - virtual bool checkForWork() = 0; + virtual bool checkForWork(); /*! @function setWorkLoop @abstract Set'ter for $link workLoop variable. 
diff --git a/iokit/IOKit/IOHibernatePrivate.h b/iokit/IOKit/IOHibernatePrivate.h index 30b307816..0fb3c53f3 100644 --- a/iokit/IOKit/IOHibernatePrivate.h +++ b/iokit/IOKit/IOHibernatePrivate.h @@ -34,6 +34,7 @@ extern "C" { #ifdef KERNEL #include +#include #endif struct IOPolledFileExtent @@ -48,7 +49,9 @@ struct IOHibernateImageHeader uint64_t imageSize; uint64_t image1Size; - uint32_t restore1CodePage; + uint32_t restore1CodePhysPage; + uint32_t reserved1; + uint64_t restore1CodeVirt; uint32_t restore1PageCount; uint32_t restore1CodeOffset; uint32_t restore1StackOffset; @@ -86,16 +89,15 @@ struct IOHibernateImageHeader uint32_t diag[4]; - int32_t graphicsInfoOffset; - int32_t cryptVarsOffset; - int32_t memoryMapOffset; - uint32_t memoryMapSize; + uint32_t handoffPages; + uint32_t handoffPageCount; + uint32_t systemTableOffset; uint32_t debugFlags; uint32_t options; - uint32_t reserved[71]; // make sizeof == 512 + uint32_t reserved[70]; // make sizeof == 512 uint64_t encryptEnd __attribute__ ((packed)); uint64_t deviceBase __attribute__ ((packed)); @@ -154,6 +156,25 @@ typedef struct hibernate_cryptvars_t hibernate_cryptvars_t; #endif /* defined(_AES_H) */ +enum +{ + kIOHibernateHandoffType = 0x686f0000, + kIOHibernateHandoffTypeEnd = kIOHibernateHandoffType + 0, + kIOHibernateHandoffTypeGraphicsInfo = kIOHibernateHandoffType + 1, + kIOHibernateHandoffTypeCryptVars = kIOHibernateHandoffType + 2, + kIOHibernateHandoffTypeMemoryMap = kIOHibernateHandoffType + 3, + kIOHibernateHandoffTypeDeviceTree = kIOHibernateHandoffType + 4, + kIOHibernateHandoffTypeDeviceProperties = kIOHibernateHandoffType + 5, + kIOHibernateHandoffTypeKeyStore = kIOHibernateHandoffType + 6, +}; + +struct IOHibernateHandoff +{ + uint32_t type; + uint32_t bytecount; + uint8_t data[]; +}; +typedef struct IOHibernateHandoff IOHibernateHandoff; enum { @@ -233,15 +254,20 @@ typedef void (*kern_get_file_extents_callback_t)(void * ref, uint64_t start, uin struct kern_direct_file_io_ref_t * 
kern_open_file_for_direct_io(const char * name, kern_get_file_extents_callback_t callback, - void * callback_ref, - dev_t * device_result, - uint64_t * partitionbase_result, - uint64_t * maxiocount_result, - boolean_t * solid_state); + void * callback_ref, + dev_t * partition_device_result, + dev_t * image_device_result, + uint64_t * partitionbase_result, + uint64_t * maxiocount_result, + uint32_t * oflags, + off_t offset, + caddr_t addr, + vm_size_t len); void -kern_close_file_for_direct_io(struct kern_direct_file_io_ref_t * ref); +kern_close_file_for_direct_io(struct kern_direct_file_io_ref_t * ref, + off_t offset, caddr_t addr, vm_size_t len); int kern_write_file(struct kern_direct_file_io_ref_t * ref, off_t offset, caddr_t addr, vm_size_t len); int get_kernel_symfile(struct proc *p, char const **symfile); @@ -257,6 +283,7 @@ hibernate_setup(IOHibernateImageHeader * header, boolean_t vmflush, hibernate_page_list_t ** page_list_ret, hibernate_page_list_t ** page_list_wired_ret, + hibernate_page_list_t ** page_list_pal_ret, boolean_t * encryptedswap); kern_return_t hibernate_teardown(hibernate_page_list_t * page_list, @@ -279,6 +306,7 @@ hibernate_vm_unlock(void); void hibernate_page_list_setall(hibernate_page_list_t * page_list, hibernate_page_list_t * page_list_wired, + hibernate_page_list_t * page_list_pal, uint32_t * pagesOut); // mark pages to be saved, or pages not to be saved but available @@ -316,7 +344,7 @@ hibernate_page_bitmap_pin(hibernate_page_list_t * list, uint32_t * page); uint32_t hibernate_page_bitmap_count(hibernate_bitmap_t * bitmap, uint32_t set, uint32_t page); -void +uintptr_t hibernate_restore_phys_page(uint64_t src, uint64_t dst, uint32_t len, uint32_t procFlags); void @@ -341,8 +369,6 @@ extern uint32_t gIOHibernateFreeTime; // max time to spend freeing pages (ms) extern uint8_t gIOHibernateRestoreStack[]; extern uint8_t gIOHibernateRestoreStackEnd[]; extern IOHibernateImageHeader * gIOHibernateCurrentHeader; -extern hibernate_graphics_t 
* gIOHibernateGraphicsInfo; -extern hibernate_cryptwakevars_t * gIOHibernateCryptWakeVars; #define HIBLOG(fmt, args...) \ { kprintf(fmt, ## args); printf(fmt, ## args); } @@ -419,9 +445,11 @@ enum { #define kIOHibernateMachineSignatureKey "machine-signature" #define kIOHibernateRTCVariablesKey "IOHibernateRTCVariables" +#define kIOHibernateSMCVariablesKey "IOHibernateSMCVariables" #define kIOHibernateBootSwitchVarsKey "boot-switch-vars" +#define kIOHibernateUseKernelInterpreter 0x80000000 #ifdef __cplusplus } diff --git a/iokit/IOKit/IOInterruptEventSource.h b/iokit/IOKit/IOInterruptEventSource.h index fe5d4ae12..2e1a82765 100644 --- a/iokit/IOKit/IOInterruptEventSource.h +++ b/iokit/IOKit/IOInterruptEventSource.h @@ -189,6 +189,17 @@ state when checkForWork is called. */ @param nub Where did the interrupt originate from @param ind What is this interrupts index within 'nub'. */ virtual void disableInterruptOccurred(void *, IOService *nub, int ind); + +/*! @function warmCPU + @abstract Tries to reduce latency for an interrupt which will be received near a specified time. + @discussion Warms up a CPU in advance of an interrupt so that the interrupt may be serviced with predictable latency. + The warm-up is not periodic; callers should call warmCPU once in advance of each interrupt. It is recommended that + requests be issued in serial (i.e. each after the target for the previous call has elapsed), as there is a systemwide + cap on the number of outstanding requests. This routine may be disruptive to the system if used with very small intervals + between requests; it should be used only in cases where interrupt latency is absolutely critical, and tens or hundreds of + milliseconds between targets is the expected time scale. NOTE: it is not safe to call this method with interrupts disabled. + @param abstime Time at which interrupt is expected.
*/ + IOReturn warmCPU(uint64_t abstime); private: IOReturn registerInterruptHandler(IOService *inProvider, int inIntIndex); diff --git a/iokit/IOKit/IOKitDebug.h b/iokit/IOKit/IOKitDebug.h index 96fb7c5a0..de2850d4e 100644 --- a/iokit/IOKit/IOKitDebug.h +++ b/iokit/IOKit/IOKitDebug.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,13 +25,6 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - * Copyright (c) 1998 Apple Computer, Inc. All rights reserved. - * - * HISTORY - * - */ - #ifndef _IOKIT_IOKITDEBUG_H #define _IOKIT_IOKITDEBUG_H @@ -82,6 +75,9 @@ enum { kOSRegistryModsMode = 0x00040000ULL, // Change default registry modification handling - panic vs. log // kIOTraceIOService = 0x00080000ULL, // Obsolete: Use iotrace=0x00080000ULL to enable now kIOLogHibernate = 0x00100000ULL, + kIOLogDriverPower1 = 0x01000000ULL, + kIOLogDriverPower2 = 0x02000000ULL, + kIOStatistics = 0x04000000ULL, // debug aids - change behaviour kIONoFreeObjects = 0x00100000ULL, @@ -97,7 +93,7 @@ enum { kIOTraceEventSources = 0x00000004ULL, // Trace non-passive event sources kIOTraceIntEventSource = 0x00000008ULL, // Trace IOIES and IOFIES sources kIOTraceCommandGates = 0x00000010ULL, // Trace command gate activity - kIOTraceTimers = 0x00000020ULL, // Trace timer event source activity + kIOTraceTimers = 0x00000008ULL, // Trace timer event source activity kIOTracePowerMgmt = 0x00000400ULL, // Trace power management changes @@ -108,15 +104,22 @@ enum { extern SInt64 gIOKitDebug; extern SInt64 gIOKitTrace; -extern UInt64 gIOInterruptThresholdNS; - #ifdef __cplusplus extern "C" { #endif -struct IORegistryPlane; -extern void IOPrintPlane( const struct IORegistryPlane * plane ); +#ifdef __cplusplus +class IORegistryPlane; +#endif + +extern void IOPrintPlane( +#ifdef __cplusplus + const IORegistryPlane * plane +#else + const struct 
IORegistryPlane * plane +#endif + ); #ifndef _OSCPPDEBUG_H extern void OSPrintMemory( void ); #endif diff --git a/iokit/IOKit/IOKitKeys.h b/iokit/IOKit/IOKitKeys.h index 62395d54d..a6d7c8bf5 100644 --- a/iokit/IOKit/IOKitKeys.h +++ b/iokit/IOKit/IOKitKeys.h @@ -151,6 +151,7 @@ // IODTNVRAM property keys #define kIONVRAMDeletePropertyKey "IONVRAM-DELETE-PROPERTY" +#define kIONVRAMSyncNowPropertyKey "IONVRAM-SYNCNOW-PROPERTY" #define kIODTNVRAMPanicInfoKey "aapl,panic-info" // keys for complex boot information diff --git a/iokit/IOKit/IOKitKeysPrivate.h b/iokit/IOKit/IOKitKeysPrivate.h index 73e93db0f..06794304e 100644 --- a/iokit/IOKit/IOKitKeysPrivate.h +++ b/iokit/IOKit/IOKitKeysPrivate.h @@ -32,25 +32,31 @@ #include // properties found in the registry root -#define kIOConsoleUsersKey "IOConsoleUsers" /* value is OSArray */ +#define kIOConsoleLockedKey "IOConsoleLocked" /* value is OSBoolean */ +#define kIOConsoleUsersKey "IOConsoleUsers" /* value is OSArray */ #define kIOMaximumMappedIOByteCountKey "IOMaximumMappedIOByteCount" /* value is OSNumber */ // properties found in the console user dict +#define kIOConsoleSessionAuditIDKey "kCGSSessionAuditIDKey" /* value is OSNumber */ -#define kIOConsoleSessionIDKey "kCGSSessionIDKey" /* value is OSNumber */ - -#define kIOConsoleSessionUserNameKey "kCGSSessionUserNameKey" /* value is OSString */ -#define kIOConsoleSessionUIDKey "kCGSSessionUserIDKey" /* value is OSNumber */ -#define kIOConsoleSessionConsoleSetKey "kCGSSessionConsoleSetKey" /* value is OSNumber */ -#define kIOConsoleSessionOnConsoleKey "kCGSSessionOnConsoleKey" /* value is OSBoolean */ +#define kIOConsoleSessionUserNameKey "kCGSSessionUserNameKey" /* value is OSString */ +#define kIOConsoleSessionUIDKey "kCGSSessionUserIDKey" /* value is OSNumber */ +#define kIOConsoleSessionConsoleSetKey "kCGSSessionConsoleSetKey" /* value is OSNumber */ +#define kIOConsoleSessionOnConsoleKey "kCGSSessionOnConsoleKey" /* value is OSBoolean */ #define 
kIOConsoleSessionSecureInputPIDKey "kCGSSessionSecureInputPID" /* value is OSNumber */ +#define kIOConsoleSessionScreenLockedTimeKey "CGSSessionScreenLockedTime" /* value is OSNumber, secs - 1970 */ // IOResources property -#define kIOConsoleUsersSeedKey "IOConsoleUsersSeed" /* value is OSNumber */ +#define kIOConsoleUsersSeedKey "IOConsoleUsersSeed" /* value is OSNumber */ + +// interest type +#define kIOConsoleSecurityInterest "IOConsoleSecurityInterest" + // private keys for clientHasPrivilege #define kIOClientPrivilegeConsoleUser "console" #define kIOClientPrivilegeSecureConsoleProcess "secureprocess" +#define kIOClientPrivilegeConsoleSession "consolesession" // clientHasPrivilege security token for kIOClientPrivilegeSecureConsoleProcess typedef struct _IOUCProcessToken { diff --git a/iokit/IOKit/IOKitServer.h b/iokit/IOKit/IOKitServer.h index a68c99243..26787a25c 100644 --- a/iokit/IOKit/IOKitServer.h +++ b/iokit/IOKit/IOKitServer.h @@ -73,6 +73,11 @@ enum { @constant kIOCatalogRemoveDrivers Signals a call to the removeDrivers function in IOCatalogue. @constant kIOCatalogRemoveDriversNoMatch Signals a call to the removedrivers function in IOCatalogue but does not start a matching thread. @constant kIOCatalogStartMatching Signals the IOCatalogue to start an IOService matching thread. + @constant kIOCatalogRemoveKernelLinker Deprecated; does nothing. + @constant kIOCatalogKextdActive Signals the kernel that kextd is running. + @constant kIOCatalogKextdFinishedLaunching Signals the IOCatalogue that kextd has finished sending it information at startup. + @constant kIOCatalogResetDrivers Resets the IOCatalogue with a new set of personalities. + @constant kIOCatalogResetDriversNoMatch Resets the IOCatalogue with a new set of personalities but does not start a matching thread. 
*/ enum { kIOCatalogAddDrivers = 1, @@ -82,7 +87,9 @@ enum { kIOCatalogStartMatching, kIOCatalogRemoveKernelLinker, kIOCatalogKextdActive, - kIOCatalogKextdFinishedLaunching + kIOCatalogKextdFinishedLaunching, + kIOCatalogResetDrivers, + kIOCatalogResetDriversNoMatch }; // IOCatalogueGetData diff --git a/iokit/IOKit/IOLib.h b/iokit/IOKit/IOLib.h index 6183a3358..5e91b4725 100644 --- a/iokit/IOKit/IOLib.h +++ b/iokit/IOKit/IOLib.h @@ -222,19 +222,10 @@ void IOMappedWrite32(IOPhysicalAddress address, UInt32 value); void IOMappedWrite64(IOPhysicalAddress address, UInt64 value); -/*! @function IOSetProcessorCacheMode - @abstract Sets the processor cache mode for mapped memory. - @discussion This function sets the cache mode of an already mapped & wired memory range. Note this may not be supported on I/O mappings or shared memory - it is far preferable to set the cache mode as mappings are created with the IOMemoryDescriptor::map method. - @param task Task the memory is mapped into. - @param address Virtual address of the memory. - @param length Length of the range to set. - @param cacheMode A constant from IOTypes.h,
- kIOMapDefaultCache to inhibit the cache in I/O areas, kIOMapCopybackCache in general purpose RAM.
- kIOMapInhibitCache, kIOMapWriteThruCache, kIOMapCopybackCache to set the appropriate caching.
- @result An IOReturn code.*/ +/* This function is deprecated. Cache settings may be set for allocated memory with the IOBufferMemoryDescriptor api. */ IOReturn IOSetProcessorCacheMode( task_t task, IOVirtualAddress address, - IOByteCount length, IOOptionBits cacheMode ); + IOByteCount length, IOOptionBits cacheMode ) __attribute__((deprecated)); /*! @function IOFlushProcessorCache @abstract Flushes the processor cache for mapped memory. @@ -341,8 +332,23 @@ void Debugger(const char * reason); void IOPanic(const char *reason) __attribute__((deprecated)); #endif -struct OSDictionary * IOBSDNameMatching( const char * name ); -struct OSDictionary * IOOFPathMatching( const char * path, char * buf, int maxLen ); +#ifdef __cplusplus +class OSDictionary; +#endif + +#ifdef __cplusplus +OSDictionary * +#else +struct OSDictionary * +#endif +IOBSDNameMatching( const char * name ); + +#ifdef __cplusplus +OSDictionary * +#else +struct OSDictionary * +#endif +IOOFPathMatching( const char * path, char * buf, int maxLen ); /* * Convert between size and a power-of-two alignment. diff --git a/iokit/IOKit/IOMemoryCursor.h b/iokit/IOKit/IOMemoryCursor.h index dfe9eed8c..048cdf584 100644 --- a/iokit/IOKit/IOMemoryCursor.h +++ b/iokit/IOKit/IOMemoryCursor.h @@ -378,85 +378,5 @@ class IOLittleMemoryCursor : public IOMemoryCursor } }; -/************************* class IODBDMAMemoryCursor *************************/ - -#if defined(__ppc__) - -struct IODBDMADescriptor; - -/*! - @class IODBDMAMemoryCursor - @abstract An IOMemoryCursor subclass that outputs a vector of DBDMA descriptors where the address and length are filled in. - @discussion The IODBDMAMemoryCursor would be used when the DBDMA hardware is available for the device for that will use an instance of this cursor. -*/ -class IODBDMAMemoryCursor : public IOMemoryCursor -{ - OSDeclareDefaultStructors(IODBDMAMemoryCursor) - -public: -/*! 
@function outputSegment - @abstract Outpust the given segment into the output segments array in address and length fields of an DBDMA descriptor. - @param segment The physical address and length that is next to be output. - @param segments Base of the output vector of DMA address length pairs. - @param segmentIndex Index to output 'segment' in the 'segments' array. -*/ - static void outputSegment(PhysicalSegment segment, - void * segments, - UInt32 segmentIndex); - -/*! @defined dbdmaOutputSegment - @discussion Backward compatibility define for the old global function definition. See IODBDMAMemoryCursor::outputSegment. */ -#define dbdmaOutputSegment IODBDMAMemoryCursor::outputSegment - -/*! @function withSpecification - @abstract Creates and initializes an IODBDMAMemoryCursor in one operation. - @discussion Factory function to create and initialize an IODBDMAMemoryCursor in one operation. See also IODBDMAMemoryCursor::initWithSpecification. - @param maxSegmentSize Maximum allowable size for one segment. Defaults to 0. - @param maxTransferSize Maximum size of an entire transfer. Defaults to 0 indicating no maximum. - @param alignment Alignment restrictions on output physical addresses. Not currently implemented. Defaults to single byte alignment. - @result Returns a new memory cursor if successfully created and initialized, 0 otherwise. -*/ - static IODBDMAMemoryCursor * - withSpecification(IOPhysicalLength maxSegmentSize, - IOPhysicalLength maxTransferSize, - IOPhysicalLength alignment = 1); - -/*! @function initWithSpecification - @abstract Primary initializer for the IODBDMAMemoryCursor class. - @param maxSegmentSize Maximum allowable size for one segment. Defaults to 0. - @param maxTransferSize Maximum size of an entire transfer. Defaults to 0 indicating no maximum. - @param alignment Alignment restrictions on output physical addresses. Not currently implemented. Defaults to single byte alignment. 
- @result Returns true if the inherited classes and this instance initialize successfully. -*/ - virtual bool initWithSpecification(IOPhysicalLength maxSegmentSize, - IOPhysicalLength maxTransferSize, - IOPhysicalLength alignment = 1); - - -/*! @function getPhysicalSegments - @abstract Generates a DBDMA physical scatter/gather list given a memory descriptor. - @discussion Generates a list of DBDMA descriptors where the address and length fields are filled in appropriately. But the client is expected to fill in the rest of the DBDMA descriptor as is appropriate for their particular hardware. Wraps IOMemoryCursor::genPhysicalSegments. - @param descriptor IOMemoryDescriptor that describes the data associated with an I/O request. - @param fromPosition Starting location of the I/O within a memory descriptor. - @param segments Pointer to an array of DBDMA descriptors for the output physical scatter/gather list. Be warned no room is left for a preamble in the output array. 'segments' should point to the first memory description slot in a DBDMA command. - @param maxSegments Maximum number of segments that can be written to the DBDMA descriptor table. - @param inMaxTransferSize Maximum transfer size is limited to that many bytes, otherwise it defaults to the maximum transfer size specified when the memory cursor was initialized. - @param transferSize Pointer to an IOByteCount variable that can contain the total size of the transfer being described. Defaults to 0 indicating that no transfer size need be returned. - @result If the descriptor is exhausted of memory, a zero is returned, otherwise the number of segments that were filled in is returned. 
-*/ - virtual UInt32 getPhysicalSegments(IOMemoryDescriptor * descriptor, - IOByteCount fromPosition, - IODBDMADescriptor * segments, - UInt32 maxSegments, - UInt32 inMaxTransferSize = 0, - IOByteCount * transferSize = 0) - { - return genPhysicalSegments(descriptor, fromPosition, segments, - maxSegments, inMaxTransferSize, transferSize); - } -}; - -#endif /* defined(__ppc__) */ - #endif /* !_IOMEMORYCURSOR_H */ diff --git a/iokit/IOKit/IOMemoryDescriptor.h b/iokit/IOKit/IOMemoryDescriptor.h index 866da4703..6e6961136 100644 --- a/iokit/IOKit/IOMemoryDescriptor.h +++ b/iokit/IOKit/IOMemoryDescriptor.h @@ -320,7 +320,7 @@ typedef IOOptionBits DMACommandOps; @param withLength The length of memory. @param options kIOMemoryDirectionMask (options:direction) This nibble indicates the I/O direction to be associated with the descriptor, which may affect the operation of the prepare and complete methods on some architectures. - @param task The task the virtual ranges are mapped into. Note that unlike IOMemoryDescriptor::withAddress(), kernel_task memory must be explicitly prepared when passed to this api. + @param task The task the virtual ranges are mapped into. Note that unlike IOMemoryDescriptor::withAddress(), kernel_task memory must be explicitly prepared when passed to this api. The task argument may be NULL to specify memory by physical address. @result The created IOMemoryDescriptor on success, to be released by the caller, or zero on failure. */ static IOMemoryDescriptor * withAddressRange( @@ -337,7 +337,7 @@ typedef IOOptionBits DMACommandOps; @param options kIOMemoryDirectionMask (options:direction) This nibble indicates the I/O direction to be associated with the descriptor, which may affect the operation of the prepare and complete methods on some architectures. kIOMemoryAsReference For options:type = Virtual or Physical this indicate that the memory descriptor need not copy the ranges array into local memory. 
This is an optimisation to try to minimise unnecessary allocations. - @param task The task each of the virtual ranges are mapped into. Note that unlike IOMemoryDescriptor::withAddress(), kernel_task memory must be explicitly prepared when passed to this api. + @param task The task each of the virtual ranges are mapped into. Note that unlike IOMemoryDescriptor::withAddress(), kernel_task memory must be explicitly prepared when passed to this api. The task argument may be NULL to specify memory by physical address. @result The created IOMemoryDescriptor on success, to be released by the caller, or zero on failure. */ static IOMemoryDescriptor * withAddressRanges( @@ -640,7 +640,7 @@ class IOMemoryMap : public OSObject public: /*! @function getVirtualAddress @abstract Accessor to the virtual address of the first byte in the mapping. - @discussion This method returns the virtual address of the first byte in the mapping. + @discussion This method returns the virtual address of the first byte in the mapping. Since the IOVirtualAddress is only 32bit in 32bit kernels, the getAddress() method should be used for compatibility with 64bit task mappings. @result A virtual address. */ virtual IOVirtualAddress getVirtualAddress(); @@ -725,9 +725,25 @@ class IOMemoryMap : public OSObject mach_vm_size_t offset = 0); #ifdef __LP64__ +/*! @function getAddress + @abstract Accessor to the virtual address of the first byte in the mapping. + @discussion This method returns the virtual address of the first byte in the mapping. + @result A virtual address. */ +/*! @function getSize + @abstract Accessor to the length of the mapping. + @discussion This method returns the length of the mapping. + @result A byte count. */ inline mach_vm_address_t getAddress() __attribute__((always_inline)); inline mach_vm_size_t getSize() __attribute__((always_inline)); #else /* !__LP64__ */ +/*! @function getAddress + @abstract Accessor to the virtual address of the first byte in the mapping. 
+ @discussion This method returns the virtual address of the first byte in the mapping. + @result A virtual address. */ +/*! @function getSize + @abstract Accessor to the length of the mapping. + @discussion This method returns the length of the mapping. + @result A byte count. */ virtual mach_vm_address_t getAddress(); virtual mach_vm_size_t getSize(); #endif /* !__LP64__ */ @@ -770,8 +786,6 @@ enum { }; #endif /* XNU_KERNEL_PRIVATE */ -#if !defined(__LP64) || defined(_IOMEMORYDESCRIPTOR_INTERNAL_) - // The following classes are private implementation of IOMemoryDescriptor - they // should not be referenced directly, just through the public API's in the // IOMemoryDescriptor class. For example, an IOGeneralMemoryDescriptor instance @@ -929,8 +943,6 @@ class IOGeneralMemoryDescriptor : public IOMemoryDescriptor }; -#endif /* !defined(__LP64) || defined(_IOMEMORYDESCRIPTOR_INTERNAL_) */ - /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ #ifdef __LP64__ diff --git a/iokit/IOKit/IOMessage.h b/iokit/IOKit/IOMessage.h index 77a1001aa..4a571b9d4 100644 --- a/iokit/IOKit/IOMessage.h +++ b/iokit/IOKit/IOMessage.h @@ -7,7 +7,7 @@ * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, + * may not be used to create, or enable the creation or redistribution of, * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. @@ -32,16 +32,24 @@ #include #include +/*! + * @header IOMessage.h + * + * Defines message type constants for several IOKit messaging API's.
+ * + */ + typedef UInt32 IOMessage; #define iokit_common_msg(message) (UInt32)(sys_iokit|sub_iokit_common|message) #define iokit_family_msg(sub,message) (UInt32)(sys_iokit|sub|message) -/*! @defined iokit_vendor_specific_msg - @discussion iokit_vendor_specific_msg passes messages in the sub_iokit_vendor_specific - subsystem. It can be used to be generate messages that are used for private - communication between vendor specific code with the IOService::message() etc. APIs. -*/ +/*! + * @defined iokit_vendor_specific_msg + * @discussion iokit_vendor_specific_msg passes messages in the sub_iokit_vendor_specific + * subsystem. It can be used to generate messages that are used for private + * communication between vendor specific code with the IOService::message() etc. APIs. + */ #define iokit_vendor_specific_msg(message) (UInt32)(sys_iokit|sub_iokit_vendor_specific|message) #define kIOMessageServiceIsTerminated iokit_common_msg(0x010) @@ -54,29 +62,159 @@ typedef UInt32 IOMessage; #define kIOMessageServiceBusyStateChange iokit_common_msg(0x120) +#define kIOMessageConsoleSecurityChange iokit_common_msg(0x128) + #define kIOMessageServicePropertyChange iokit_common_msg(0x130) -#define kIOMessageCanDevicePowerOff iokit_common_msg(0x200) +#define kIOMessageCopyClientID iokit_common_msg(0x330) + +#define kIOMessageSystemCapabilityChange iokit_common_msg(0x340) +#define kIOMessageDeviceSignaledWakeup iokit_common_msg(0x350) + + +/*! + * @defined kIOMessageDeviceWillPowerOff + * @discussion Indicates the device is about to move to a lower power state. + * Sent to IOKit interest notification clients of type kIOAppPowerStateInterest + * and kIOGeneralInterest. + */ #define kIOMessageDeviceWillPowerOff iokit_common_msg(0x210) -#define kIOMessageDeviceWillNotPowerOff iokit_common_msg(0x220) + +/*! + * @defined kIOMessageDeviceHasPoweredOn + * @discussion Indicates the device has just moved to a higher power state. 
+ * Sent to IOKit interest notification clients of type kIOAppPowerStateInterest + * and kIOGeneralInterest. + */ #define kIOMessageDeviceHasPoweredOn iokit_common_msg(0x230) -// IOService power mgt does not send -// kIOMessageDeviceWillPowerOn -// kIOMessageDeviceHasPoweredOff -#define kIOMessageDeviceWillPowerOn iokit_common_msg(0x215) -#define kIOMessageDeviceHasPoweredOff iokit_common_msg(0x225) +/*! @group In-kernel system shutdown and restart notifications + */ -#define kIOMessageCanSystemPowerOff iokit_common_msg(0x240) +/*! + * @defined kIOMessageSystemWillPowerOff + * @discussion Indicates an imminent system shutdown. Recipients have a limited + * amount of time to respond, otherwise the system will timeout and + * shutdown even without a response. + * Delivered to in-kernel IOKit drivers via IOService::systemWillShutdown(), + * and to clients of registerPrioritySleepWakeInterest(). + * Never delivered to user space notification clients. + */ #define kIOMessageSystemWillPowerOff iokit_common_msg(0x250) -#define kIOMessageSystemWillNotPowerOff iokit_common_msg(0x260) + +/*! + * @defined kIOMessageSystemWillRestart + * @discussion Indicates an imminent system restart. Recipients have a limited + * amount of time to respond, otherwise the system will timeout and + * restart even without a response. + * Delivered to in-kernel IOKit drivers via IOService::systemWillShutdown(), + * and to clients of registerPrioritySleepWakeInterest(). + * Never delivered to user space notification clients. + */ +#define kIOMessageSystemWillRestart iokit_common_msg(0x310) + +/*! + * @defined kIOMessageSystemPagingOff + * @discussion Indicates an imminent system shutdown, paging device now unavailable. + * Recipients have a limited amount of time to respond, otherwise the + * system will timeout and shutdown even without a response. + * Delivered to clients of registerPrioritySleepWakeInterest(). + * Never delivered to user space notification clients. 
+ */ +#define kIOMessageSystemPagingOff iokit_common_msg(0x255) + + +/*! @group System sleep and wake notifications + */ + +/*! + * @defined kIOMessageCanSystemSleep + * @discussion Announces/Requests permission to proceed to system sleep. + * Delivered to in-kernel IOKit drivers via kIOGeneralInterest + * and kIOPriorityPowerStateInterest. + * Delivered to user clients of IORegisterForSystemPower. + */ #define kIOMessageCanSystemSleep iokit_common_msg(0x270) -#define kIOMessageSystemWillSleep iokit_common_msg(0x280) + +/*! + * @defined kIOMessageSystemWillNotSleep + * @discussion Announces that the system has retracted a previous attempt to sleep; + * it follows kIOMessageCanSystemSleep. + * Delivered to in-kernel IOKit drivers via kIOGeneralInterest + * and kIOPriorityPowerStateInterest. + * Delivered to user clients of IORegisterForSystemPower. + */ #define kIOMessageSystemWillNotSleep iokit_common_msg(0x290) -#define kIOMessageSystemHasPoweredOn iokit_common_msg(0x300) -#define kIOMessageSystemWillRestart iokit_common_msg(0x310) + +/*! + * @defined kIOMessageSystemWillSleep + * @discussion Announces that sleep is beginning. + * Delivered to in-kernel IOKit drivers via kIOGeneralInterest + * and kIOPriorityPowerStateInterest. + * Delivered to user clients of IORegisterForSystemPower. + */ +#define kIOMessageSystemWillSleep iokit_common_msg(0x280) + +/*! + * @defined kIOMessageSystemWillPowerOn + * @discussion Announces that the system is beginning to power the device tree; most + * devices are unavailable at this point. + * Delivered to in-kernel IOKit drivers via kIOGeneralInterest + * and kIOPriorityPowerStateInterest. + * Delivered to user clients of IORegisterForSystemPower. + */ #define kIOMessageSystemWillPowerOn iokit_common_msg(0x320) -#define kIOMessageCopyClientID iokit_common_msg(0x330) +/*! + * @defined kIOMessageSystemHasPoweredOn + * @discussion Announces that the system and its devices have woken up.
+ * Delivered to in-kernel IOKit drivers via kIOGeneralInterest + * and kIOPriorityPowerStateInterest. + * Delivered to user clients of IORegisterForSystemPower. + */ +#define kIOMessageSystemHasPoweredOn iokit_common_msg(0x300) + +/*! @group Unused and deprecated notifications + */ + +/*! + * @defined kIOMessageCanDevicePowerOff + * @discussion Delivered to kIOAppPowerStateInterest clients of + * devices that implement their own idle timeouts. + * This message type is almost never used. + */ +#define kIOMessageCanDevicePowerOff iokit_common_msg(0x200) + +/*! + * @defined kIOMessageDeviceWillNotPowerOff + * @discussion This IOKit interest notification is largely unused; + * it's not very interesting. + */ +#define kIOMessageDeviceWillNotPowerOff iokit_common_msg(0x220) + +/*! + * @defined kIOMessageSystemWillNotPowerOff + * @deprecated This IOKit message is unused. + */ +#define kIOMessageSystemWillNotPowerOff iokit_common_msg(0x260) + +/*! + * @defined kIOMessageCanSystemPowerOff + * @deprecated This IOKit message is unused. + */ +#define kIOMessageCanSystemPowerOff iokit_common_msg(0x240) + +/*! + * @defined kIOMessageDeviceWillPowerOn + * @discussion IOService power mgt does not send kIOMessageDeviceWillPowerOn. + */ +#define kIOMessageDeviceWillPowerOn iokit_common_msg(0x215) + +/*! + * @defined kIOMessageDeviceHasPoweredOff + * @discussion IOService power mgt does not send kIOMessageDeviceHasPoweredOff. + */ +#define kIOMessageDeviceHasPoweredOff iokit_common_msg(0x225) + #endif /* ! 
__IOKIT_IOMESSAGE_H */ diff --git a/iokit/IOKit/IONVRAM.h b/iokit/IOKit/IONVRAM.h index a9337bd1d..15bf709f6 100644 --- a/iokit/IOKit/IONVRAM.h +++ b/iokit/IOKit/IONVRAM.h @@ -29,17 +29,20 @@ #ifndef _IOKIT_IONVRAM_H #define _IOKIT_IONVRAM_H +#ifdef __cplusplus #include #include #include #include - +#endif /* __cplusplus */ #define kIODTNVRAMOFPartitionName "common" #define kIODTNVRAMXPRAMPartitionName "APL,MacOS75" #define kIODTNVRAMPanicInfoPartitonName "APL,OSXPanic" #define kIODTNVRAMFreePartitionName "wwwwwwwwwwww" +#define MIN_SYNC_NOW_INTERVAL 15*60 /* Minimum 15 Minutes interval mandated */ + enum { kIODTNVRAMImageSize = 0x2000, kIODTNVRAMXPRAMSize = 0x0100, @@ -60,6 +63,8 @@ enum { kOFVariablePermKernelOnly }; +#ifdef __cplusplus + class IODTNVRAM : public IOService { OSDeclareDefaultStructors(IODTNVRAM); @@ -86,6 +91,8 @@ class IODTNVRAM : public IOService UInt32 _piPartitionSize; UInt8 *_piImage; bool _systemPaniced; + SInt32 _lastDeviceSync; + bool _freshInterval; virtual UInt8 calculatePartitionChecksum(UInt8 *partitionHeader); virtual IOReturn initOFVariables(void); @@ -162,6 +169,9 @@ class IODTNVRAM : public IOService IOByteCount length); virtual IOByteCount savePanicInfo(UInt8 *buffer, IOByteCount length); + virtual bool safeToSync(void); }; +#endif /* __cplusplus */ + #endif /* !_IOKIT_IONVRAM_H */ diff --git a/iokit/IOKit/IOPlatformExpert.h b/iokit/IOKit/IOPlatformExpert.h index f75a3e3ab..a27cf64ad 100644 --- a/iokit/IOKit/IOPlatformExpert.h +++ b/iokit/IOKit/IOPlatformExpert.h @@ -57,7 +57,8 @@ enum { kPEHangCPU, kPEUPSDelayHaltCPU, kPEPanicRestartCPU, - kPEPanicSync + kPEPanicSync, + kPEPagingOff }; extern int (*PE_halt_restart)(unsigned int type); extern int PEHaltRestart(unsigned int type); @@ -68,6 +69,12 @@ extern UInt32 PESavePanicInfo(UInt8 *buffer, UInt32 length); extern long PEGetGMTTimeOfDay( void ); extern void PESetGMTTimeOfDay( long secs ); +/* unless it's a "well-known" property, these will read/write out the value as raw data */ 
+ +extern boolean_t PEWriteNVRAMProperty(const char *symbol, const void *value, const unsigned int len); + +extern boolean_t PEReadNVRAMProperty(const char *symbol, void *value, unsigned int *len); + #ifdef __cplusplus } /* extern "C" */ diff --git a/iokit/IOKit/IOService.h b/iokit/IOKit/IOService.h index f2f513929..c3282f8ea 100644 --- a/iokit/IOKit/IOService.h +++ b/iokit/IOKit/IOService.h @@ -134,6 +134,7 @@ extern const OSSymbol * gIOBusyInterest; extern const OSSymbol * gIOOpenInterest; extern const OSSymbol * gIOAppPowerStateInterest; extern const OSSymbol * gIOPriorityPowerStateInterest; +extern const OSSymbol * gIOConsoleSecurityInterest; extern const OSSymbol * gIODeviceMemoryKey; extern const OSSymbol * gIOInterruptControllersKey; @@ -434,25 +435,6 @@ class IOService : public IORegistryEntry OSMetaClassDeclareReservedUnused(IOService, 46); OSMetaClassDeclareReservedUnused(IOService, 47); -#ifdef __ppc__ - OSMetaClassDeclareReservedUnused(IOService, 48); - OSMetaClassDeclareReservedUnused(IOService, 49); - OSMetaClassDeclareReservedUnused(IOService, 50); - OSMetaClassDeclareReservedUnused(IOService, 51); - OSMetaClassDeclareReservedUnused(IOService, 52); - OSMetaClassDeclareReservedUnused(IOService, 53); - OSMetaClassDeclareReservedUnused(IOService, 54); - OSMetaClassDeclareReservedUnused(IOService, 55); - OSMetaClassDeclareReservedUnused(IOService, 56); - OSMetaClassDeclareReservedUnused(IOService, 57); - OSMetaClassDeclareReservedUnused(IOService, 58); - OSMetaClassDeclareReservedUnused(IOService, 59); - OSMetaClassDeclareReservedUnused(IOService, 60); - OSMetaClassDeclareReservedUnused(IOService, 61); - OSMetaClassDeclareReservedUnused(IOService, 62); - OSMetaClassDeclareReservedUnused(IOService, 63); -#endif - public: /*! @function getState @abstract Accessor for IOService state bits, not normally needed or used outside IOService. 
@@ -1220,6 +1202,8 @@ class IOService : public IORegistryEntry static void setPMRootDomain( class IOPMrootDomain * rootDomain ); static IOReturn catalogNewDrivers( OSOrderedSet * newTables ); uint64_t getAccumulatedBusyTime( void ); + static void updateConsoleUsers(OSArray * consoleUsers, IOMessage systemMessage); + static void consoleLockTimer(thread_call_param_t p0, thread_call_param_t p1); private: static IOReturn waitMatchIdle( UInt32 ms ); @@ -1312,10 +1296,13 @@ class IOService : public IORegistryEntry static void terminateThread( void * arg, wait_result_t unused ); static void terminateWorker( IOOptionBits options ); static void actionWillTerminate( IOService * victim, IOOptionBits options, - OSArray * doPhase2List ); - static void actionDidTerminate( IOService * victim, IOOptionBits options ); - static void actionFinalize( IOService * victim, IOOptionBits options ); - static void actionStop( IOService * client, IOService * provider ); + OSArray * doPhase2List, void*, void * ); + static void actionDidTerminate( IOService * victim, IOOptionBits options, + void *, void *, void *); + static void actionFinalize( IOService * victim, IOOptionBits options, + void *, void *, void *); + static void actionStop( IOService * client, IOService * provider, + void *, void *, void *); APPLE_KEXT_COMPATIBILITY_VIRTUAL IOReturn resolveInterrupt(IOService *nub, int source); @@ -1337,8 +1324,8 @@ class IOService : public IORegistryEntry virtual void PMinit( void ); /*! @function PMstop - @abstract Frees and removes the driver from power management. - @discussion The power managment variables don't exist after this call and the power managment methods in the caller shouldn't be called. + @abstract Stop power managing the driver. + @discussion Removes the driver from the power plane and stops its power management. This method is synchronous against any power management method invocations (e.g.
setPowerState or setAggressiveness), so when this method returns it is guaranteed those power management methods will not be entered. Driver should not call any power management methods after this call. Calling PMstop cleans up for the three power management initialization calls: @link PMinit PMinit@/link, @link joinPMtree joinPMtree@/link, and @link registerPowerDriver registerPowerDriver@/link. */ virtual void PMstop( void ); @@ -1368,6 +1355,7 @@ class IOService : public IORegistryEntry /*! @function registerInterestedDriver @abstract Allows an IOService object to register interest in the changing power state of a power-managed IOService object. @discussion Call registerInterestedDriver on the IOService object you are interested in receiving power state messages from, and pass a pointer to the interested driver (this) as an argument. + The interested driver is retained until the power interest is removed by calling deRegisterInterestedDriver. The interested driver should override @link powerStateWillChangeTo powerStateWillChangeTo@/link and @link powerStateDidChangeTo powerStateDidChangeTo@/link to receive these power change messages. Interested drivers must acknowledge power changes in powerStateWillChangeTo or powerStateDidChangeTo, either via return value or later calls to @link acknowledgePowerChange acknowledgePowerChange@/link. @param theDriver The driver of interest adds this pointer to the list of interested drivers. It informs drivers on this list before and after the power change. @@ -1378,7 +1366,8 @@ class IOService : public IORegistryEntry /*! @function deRegisterInterestedDriver @abstract De-registers power state interest from a previous call to registerInterestedDriver. - @discussion Most drivers do not need to override deRegisterInterestedDriver. + @discussion The retain from registerInterestedDriver is released. 
This method is synchronous against any powerStateWillChangeTo or powerStateDidChangeTo call targeting the interested driver, so when this method returns it is guaranteed those interest handlers will not be entered. + Most drivers do not need to override deRegisterInterestedDriver. @param theDriver The interested driver previously passed into @link registerInterestedDriver registerInterestedDriver@/link. @result A return code that can be ignored by the caller. */ @@ -1725,10 +1714,13 @@ class IOService : public IORegistryEntry #ifdef XNU_KERNEL_PRIVATE /* Power management internals */ public: + void idleTimerExpired( void ); void settleTimerExpired( void ); - IOReturn synchronizePowerTree( void ); - bool assertPMThreadCall( void ); - void deassertPMThreadCall( void ); + IOReturn synchronizePowerTree( IOOptionBits options = 0, IOService * notifyRoot = 0 ); + bool assertPMDriverCall( IOPMDriverCallEntry * callEntry, IOOptionBits options = 0, IOPMinformee * inform = 0 ); + void deassertPMDriverCall( IOPMDriverCallEntry * callEntry ); + IOReturn changePowerStateWithOverrideTo( unsigned long ordinal ); + static const char * getIOMessageString( uint32_t msg ); #ifdef __LP64__ static IOWorkLoop * getPMworkloop( void ); @@ -1736,10 +1728,7 @@ class IOService : public IORegistryEntry protected: bool tellClientsWithResponse( int messageType ); - bool tellClientsWithResponse( int messageType, bool (*)(OSObject *, void *) ); void tellClients( int messageType ); - void tellClients( int messageType, bool (*)(OSObject *, void *) ); - IOReturn changePowerStateWithOverrideTo( unsigned long ordinal ); private: #ifndef __LP64__ @@ -1752,44 +1741,44 @@ class IOService : public IORegistryEntry void PMfree( void ); bool tellChangeDown1 ( unsigned long ); bool tellChangeDown2 ( unsigned long ); - IOReturn startPowerChange ( unsigned long, unsigned long, unsigned long, IOPowerConnection *, unsigned long ); + IOReturn startPowerChange( IOPMPowerChangeFlags, IOPMPowerStateIndex, 
IOPMPowerFlags, IOPowerConnection *, IOPMPowerFlags ); void setParentInfo ( IOPMPowerFlags, IOPowerConnection *, bool ); - IOReturn notifyAll ( int nextMachineState, bool is_prechange ); - bool notifyChild ( IOPowerConnection * nextObject, bool is_prechange ); + IOReturn notifyAll ( uint32_t nextMS ); + bool notifyChild ( IOPowerConnection * child ); // power change initiated by driver void OurChangeStart( void ); + void OurSyncStart ( void ); void OurChangeTellClientsPowerDown ( void ); void OurChangeTellPriorityClientsPowerDown ( void ); + void OurChangeTellCapabilityWillChange ( void ); void OurChangeNotifyInterestedDriversWillChange ( void ); void OurChangeSetPowerState ( void ); void OurChangeWaitForPowerSettle ( void ); void OurChangeNotifyInterestedDriversDidChange ( void ); + void OurChangeTellCapabilityDidChange ( void ); void OurChangeFinish ( void ); - void OurSyncStart ( void ); // downward power change initiated by a power parent IOReturn ParentChangeStart( void ); - void ParentDownTellPriorityClientsPowerDown ( void ); - void ParentDownNotifyInterestedDriversWillChange ( void ); - void ParentDownNotifyDidChangeAndAcknowledgeChange ( void ); - void ParentDownSetPowerState ( void ); - void ParentDownWaitForPowerSettle ( void ); - void ParentAcknowledgePowerChange ( void ); - - // upward power change initiated by a power parent - void ParentUpSetPowerState ( void ); - void ParentUpWaitForSettleTime ( void ); - void ParentUpNotifyInterestedDriversDidChange ( void ); + void ParentChangeTellPriorityClientsPowerDown ( void ); + void ParentChangeTellCapabilityWillChange ( void ); + void ParentChangeNotifyInterestedDriversWillChange ( void ); + void ParentChangeSetPowerState ( void ); + void ParentChangeWaitForPowerSettle ( void ); + void ParentChangeNotifyInterestedDriversDidChange ( void ); + void ParentChangeTellCapabilityDidChange ( void ); + void ParentChangeAcknowledgePowerChange ( void ); void all_done ( void ); void start_ack_timer ( void ); void 
stop_ack_timer ( void ); void startSettleTimer( void ); bool checkForDone ( void ); - bool responseValid ( unsigned long x, int pid ); + bool responseValid ( uint32_t x, int pid ); void computeDesiredState ( unsigned long tempDesire = 0 ); void rebuildChildClampBits ( void ); + void tellSystemCapabilityChange( uint32_t nextMS ); static void ack_timer_expired( thread_call_param_t, thread_call_param_t ); static IOReturn actionAckTimerExpired(OSObject *, void *, void *, void *, void * ); @@ -1797,8 +1786,10 @@ class IOService : public IORegistryEntry static IOPMRequest * acquirePMRequest( IOService * target, IOOptionBits type, IOPMRequest * active = 0 ); static void releasePMRequest( IOPMRequest * request ); static void pmDriverCallout( IOService * from ); - static void pmTellClientWithResponse( OSObject * object, void * context ); static void pmTellAppWithResponse( OSObject * object, void * context ); + static void pmTellClientWithResponse( OSObject * object, void * context ); + static void pmTellCapabilityAppWithResponse ( OSObject * object, void * arg ); + static void pmTellCapabilityClientWithResponse( OSObject * object, void * arg ); bool ackTimerTick( void ); void addPowerChild1( IOPMRequest * request ); void addPowerChild2( IOPMRequest * request ); @@ -1831,14 +1822,15 @@ class IOService : public IORegistryEntry void driverInformPowerChange( void ); bool isPMBlocked( IOPMRequest * request, int count ); void notifyChildren( void ); - void notifyChildrenDone( void ); + void notifyChildrenOrdered( void ); + void notifyChildrenDelayed( void ); void cleanClientResponses ( bool logErrors ); - void idleTimerExpired( IOTimerEventSource * ); void updatePowerClient( const OSSymbol * client, uint32_t powerState ); void removePowerClient( const OSSymbol * client ); uint32_t getPowerStateForClient( const OSSymbol * client ); IOReturn requestPowerState( const OSSymbol * client, uint32_t state ); - IOReturn requestDomainPower( unsigned long ourPowerState, IOOptionBits options 
= 0 ); + IOReturn requestDomainPower( IOPMPowerStateIndex ourPowerState, IOOptionBits options = 0 ); + void waitForPMDriverCall( IOService * target = 0 ); #endif /* XNU_KERNEL_PRIVATE */ }; diff --git a/iokit/IOKit/IOServicePM.h b/iokit/IOKit/IOServicePM.h index 96edc11c0..2a2c4c400 100644 --- a/iokit/IOKit/IOServicePM.h +++ b/iokit/IOKit/IOServicePM.h @@ -47,6 +47,15 @@ class IOPMRequest; class IOPMRequestQueue; class IOPMCompletionQueue; +typedef unsigned long IOPMPowerStateIndex; +typedef uint32_t IOPMPowerChangeFlags; + +struct IOPMDriverCallEntry { + queue_chain_t link; + thread_t thread; + IOService * target; +}; + /* Binary compatibility with drivers that access pm_vars */ #ifdef __LP64__ #define PM_VARS_SUPPORT 0 diff --git a/iokit/IOKit/IOSharedLock.h b/iokit/IOKit/IOSharedLock.h index eadfc407d..795007451 100644 --- a/iokit/IOKit/IOSharedLock.h +++ b/iokit/IOKit/IOSharedLock.h @@ -1,19 +1,14 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2010 Apple Computer, Inc. All rights reserved. * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * @APPLE_LICENSE_HEADER_START@ * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. 
* * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER @@ -23,70 +18,20 @@ * Please see the License for the specific language governing rights and * limitations under the License. * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1998 Apple Computer, Inc. All rights reserved. - * - * HISTORY - * - */ - -/* - * Multiprocessor locks used within the shared memory area between the - * kernel and event system. These must work in both user and kernel mode. - * - * These routines are public, for the purpose of writing frame buffer device - * drivers which handle their own cursors. Certain architectures define a - * generic display class which handles cursor drawing and is subclassed by - * driver writers. These drivers need not be concerned with the following - * types and definitions. - * - * The ev_lock(), ev_unlock(), and ev_try_lock() functions are available only - * to drivers built in or dynamically loaded into the kernel, and to DPS - * drivers built in or dynamically loaded into the Window Server. They do not - * exist in any shared library. - * - * --> They're now in IOKit user lib. + * @APPLE_LICENSE_HEADER_END@ */ #ifndef _IOKIT_IOSHAREDLOCK_H #define _IOKIT_IOSHAREDLOCK_H -#ifdef __cplusplus -extern "C" { -#endif - -// should be 32 bytes on PPC -typedef volatile int IOSharedLockData; -typedef IOSharedLockData * IOSharedLock; - -#define IOSpinLockInit(l) (*(l) = (IOSharedLockData)0) - -#ifndef KERNEL -extern void IOSpinLock(IOSharedLock l); -#endif +#include -extern void IOSpinUnlock(IOSharedLock l); -extern boolean_t IOTrySpinLock(IOSharedLock l); +#define IOSharedLockData OSSpinLock +#define ev_lock_data_t OSSpinLock -/* exact same stuff & implementation */ - -typedef IOSharedLockData ev_lock_data_t; -typedef ev_lock_data_t * ev_lock_t; - -#define ev_init_lock(l) (*(l) = (ev_lock_data_t)0) -// needs isync? 
-//#define ev_is_locked(l) (*(l) != (ev_lock_data_t)0) - -#ifndef KERNEL -extern void ev_lock(ev_lock_t l); // Spin lock! +#ifdef KERNEL +#define ev_unlock(l) OSSpinLockUnlock(l) +#define ev_try_lock(l) OSSpinLockTry(l) #endif -extern void ev_unlock(ev_lock_t l); -extern boolean_t ev_try_lock(ev_lock_t l); - -#ifdef __cplusplus -} -#endif #endif /* ! _IOKIT_IOSHAREDLOCK_H */ diff --git a/iokit/IOKit/IOStatistics.h b/iokit/IOKit/IOStatistics.h new file mode 100644 index 000000000..0c4a1abb2 --- /dev/null +++ b/iokit/IOKit/IOStatistics.h @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2010 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _IOKIT_STATISTICS_H +#define _IOKIT_STATISTICS_H + +#define IOSTATISTICS_SIG 'IOST' +#define IOSTATISTICS_SIG_USERCLIENT 'IOSU' +#define IOSTATISTICS_SIG_WORKLOOP 'IOSW' + +/* Update when the binary format changes */ +#define IOSTATISTICS_VER 0x2 + +enum { + kIOStatisticsDriverNameLength = 64, + kIOStatisticsClassNameLength = 64, + kIOStatisticsProcessNameLength = 20 +}; + +enum { + kIOStatisticsDerivedEventSourceCounter = 0, + kIOStatisticsTimerEventSourceCounter, + kIOStatisticsCommandGateCounter, + kIOStatisticsCommandQueueCounter, + kIOStatisticsInterruptEventSourceCounter, + kIOStatisticsFilterInterruptEventSourceCounter +}; + +typedef uint32_t IOStatisticsCounterType; + +enum { + kIOStatisticsGeneral = 0, + kIOStatisticsWorkLoop, + kIOStatisticsUserClient +}; + +/* Keep our alignments as intended */ + +#pragma pack(4) + +/* Event Counters */ + +typedef struct IOStatisticsInterruptEventSources { + uint32_t created; + uint32_t produced; + uint32_t checksForWork; +} IOStatisticsInterruptEventSources; + +typedef struct IOStatisticsTimerEventSources { + uint32_t created; + uint32_t openGateCalls; + uint32_t closeGateCalls; + uint64_t timeOnGate; + uint32_t timeouts; + uint32_t checksForWork; +} IOStatisticsTimerEventSources; + +typedef struct IOStatisticsDerivedEventSources { + uint32_t created; + uint32_t openGateCalls; + uint32_t closeGateCalls; + uint64_t timeOnGate; +} IOStatisticsDerivedEventSources; + +typedef struct IOStatisticsCommandGates { + uint32_t created; + uint32_t openGateCalls; + uint32_t closeGateCalls; + uint64_t timeOnGate; + uint32_t actionCalls; +} IOStatisticsCommandGates; + +typedef struct IOStatisticsCommandQueues { + uint32_t created; + uint32_t actionCalls; +} IOStatisticsCommandQueues; + +typedef struct IOStatisticsUserClients { + uint32_t created; + uint32_t clientCalls; +} IOStatisticsUserClients; + +/* General mode */ + +typedef struct IOStatisticsHeader { + uint32_t sig; 
/* 'IOST' */ + uint32_t ver; /* incremented with every data revision */ + + uint32_t seq; /* sequence ID */ + + uint32_t globalStatsOffset; + uint32_t kextStatsOffset; + uint32_t memoryStatsOffset; + uint32_t classStatsOffset; + uint32_t counterStatsOffset; + uint32_t kextIdentifiersOffset; + uint32_t classNamesOffset; + + /* struct IOStatisticsGlobal */ + /* struct IOStatisticsKext */ + /* struct IOStatisticsMemory */ + /* struct IOStatisticsClass */ + /* struct IOStatisticsCounter */ + /* struct IOStatisticsKextIdentifier */ + /* struct IOStatisticsClassName */ +} IOStatisticsHeader; + +typedef struct IOStatisticsGlobal { + uint32_t kextCount; + uint32_t classCount; + uint32_t workloops; +} IOStatisticsGlobal; + +typedef struct IOStatisticsKext { + uint32_t loadTag; + uint32_t loadSize; + uint32_t wiredSize; + uint32_t classes; /* Number of classes owned */ + uint32_t classIndexes[]; /* Variable length array of owned class indexes */ +} IOStatisticsKext; + +typedef struct IOStatisticsMemory { + uint32_t allocatedSize; + uint32_t freedSize; + uint32_t allocatedAlignedSize; + uint32_t freedAlignedSize; + uint32_t allocatedContiguousSize; + uint32_t freedContiguousSize; + uint32_t allocatedPageableSize; + uint32_t freedPageableSize; +} IOStatisticsMemory; + +typedef struct IOStatisticsClass { + uint32_t classID; + uint32_t superClassID; + uint32_t classSize; +} IOStatisticsClass; + +typedef struct IOStatisticsCounter { + uint32_t classID; + uint32_t classInstanceCount; + struct IOStatisticsUserClients userClientStatistics; + struct IOStatisticsInterruptEventSources interruptEventSourceStatistics; + struct IOStatisticsInterruptEventSources filterInterruptEventSourceStatistics; + struct IOStatisticsTimerEventSources timerEventSourceStatistics; + struct IOStatisticsCommandGates commandGateStatistics; + struct IOStatisticsCommandQueues commandQueueStatistics; + struct IOStatisticsDerivedEventSources derivedEventSourceStatistics; +} IOStatisticsCounter; + +typedef struct 
IOStatisticsKextIdentifier { + char identifier[kIOStatisticsDriverNameLength]; +} IOStatisticsKextIdentifier; + +typedef struct IOStatisticsClassName { + char name[kIOStatisticsClassNameLength]; +} IOStatisticsClassName; + +/* WorkLoop mode */ + +typedef struct IOStatisticsWorkLoop { + uint32_t attachedEventSources; + uint64_t timeOnGate; + uint32_t kextLoadTag; + uint32_t dependentKexts; + uint32_t dependentKextLoadTags[]; +} IOStatisticsWorkLoop; + +typedef struct IOStatisticsWorkLoopHeader { + uint32_t sig; /* 'IOSW' */ + uint32_t ver; /* incremented with every data revision */ + uint32_t seq; /* sequence ID */ + uint32_t workloopCount; + struct IOStatisticsWorkLoop workLoopStats; +} IOStatisticsWorkLoopHeader; + +/* UserClient mode */ + +typedef struct IOStatisticsUserClientCall { + char processName[kIOStatisticsProcessNameLength]; + int32_t pid; + uint32_t calls; +} IOStatisticsUserClientCall; + +typedef struct IOStatisticsUserClientHeader { + uint32_t sig; /* 'IOSU' */ + uint32_t ver; /* incremented with every data revision */ + uint32_t seq; /* sequence ID */ + uint32_t processes; + struct IOStatisticsUserClientCall userClientCalls[]; +} IOStatisticsUserClientHeader; + +#pragma pack() + +#endif /* _IOKIT_STATISTICS_H */ diff --git a/iokit/IOKit/IOStatisticsPrivate.h b/iokit/IOKit/IOStatisticsPrivate.h new file mode 100644 index 000000000..a41230c3d --- /dev/null +++ b/iokit/IOKit/IOStatisticsPrivate.h @@ -0,0 +1,359 @@ +/* + * Copyright (c) 2010 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License.
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef __IOKIT_STATISTICS_PRIVATE_H +#define __IOKIT_STATISTICS_PRIVATE_H + +#if IOKITSTATS + +#include +#include + +#include +#include + +#include +#include + +#ifndef KERNEL +#error IOStatisticsPrivate.h is for kernel use only +#endif + +/* Defines */ +#define IOKIT_STATISTICS_RECORDED_USERCLIENT_PROCS 20 + +#ifndef __probable +#define __probable(x) x +#endif + +/* Forward declarations */ +class IOWorkLoop; +class IOUserClient; +class IOEventSource; + +struct IOEventSourceCounter; +struct IOUserClientCounter; +struct IOWorkLoopCounter; +struct IOUserClientProcessEntry; + +struct KextNode; + +/* Allocation tracking */ + +enum { + kIOStatisticsMalloc = 0, + kIOStatisticsFree, + kIOStatisticsMallocAligned, + kIOStatisticsFreeAligned, + kIOStatisticsMallocContiguous, + kIOStatisticsFreeContiguous, + kIOStatisticsMallocPageable, + kIOStatisticsFreePageable, + kIOStatisticsAllocCount +}; + +TAILQ_HEAD(ProcessEntryList, IOUserClientProcessEntry); + +/* Tree and list structs */ + +typedef struct 
ClassNode { + RB_ENTRY(ClassNode) tLink; + SLIST_ENTRY(ClassNode) lLink; + struct KextNode *parentKext; + uint32_t classID; + uint32_t superClassID; + const OSMetaClass *metaClass; + SLIST_HEAD(, IOEventSourceCounter) counterList; + SLIST_HEAD(, IOUserClientCounter) userClientList; +} ClassNode; + +typedef struct KextNode { + RB_ENTRY(KextNode) link; + RB_ENTRY(KextNode) addressLink; + OSKext *kext; + OSKextLoadTag loadTag; + vm_offset_t address; + vm_offset_t address_end; + uint32_t memoryCounters[kIOStatisticsAllocCount]; + uint32_t classes; + SLIST_HEAD(, ClassNode) classList; + SLIST_HEAD(, IOWorkLoopCounter) workLoopList; + ProcessEntryList userClientCallList; +} KextNode; + +/* User client tracing */ + +typedef struct IOUserClientProcessEntry { + TAILQ_ENTRY(IOUserClientProcessEntry) link; + char processName[kIOStatisticsProcessNameLength]; + int32_t pid; + uint32_t calls; +} IOUserClientProcessEntry; + +/* Counters */ + +typedef struct IOInterruptEventSourceCounter { + uint32_t produced; + uint32_t checksForWork; +} IOInterruptEventSourceCounter; + +typedef struct IOTimerEventSourceCounter { + uint32_t timeouts; + uint32_t checksForWork; +} IOTimerEventSourceCounter; + +typedef struct IOCommandGateCounter { + uint32_t actionCalls; +} IOCommandGateCounter; + +typedef struct IOCommandQueueCounter { + uint32_t actionCalls; +} IOCommandQueueCounter; + +typedef struct IOEventSourceCounter { + SLIST_ENTRY(IOEventSourceCounter) link; + ClassNode *parentClass; + IOStatisticsCounterType type; + uint64_t startTimeStamp; + uint64_t timeOnGate; + uint32_t closeGateCalls; + uint32_t openGateCalls; + union { + IOInterruptEventSourceCounter interrupt; + IOInterruptEventSourceCounter filter; + IOTimerEventSourceCounter timer; + IOCommandGateCounter commandGate; + IOCommandQueueCounter commandQueue; + } u; +} IOEventSourceCounter; + +typedef struct IOWorkLoopDependency { + RB_ENTRY(IOWorkLoopDependency) link; + OSKextLoadTag loadTag; +} IOWorkLoopDependency; + +typedef 
struct IOWorkLoopCounter { + SLIST_ENTRY(IOWorkLoopCounter) link; + KextNode *parentKext; + int attachedEventSources; + IOWorkLoop *workLoop; + uint64_t startTimeStamp; + uint64_t timeOnGate; + uint32_t closeGateCalls; + uint32_t openGateCalls; + typedef RB_HEAD(DependencyTree, IOWorkLoopDependency) DependencyTreeHead; + DependencyTreeHead dependencyHead; + static int loadTagCompare(IOWorkLoopDependency *e1, IOWorkLoopDependency *e2); + RB_PROTOTYPE_SC(static, DependencyTree, IOWorkLoopDependency, dependencyLink, KextTagCompare); +} IOWorkLoopCounter; + +typedef struct IOUserClientCounter { + SLIST_ENTRY(IOUserClientCounter) link; + ClassNode *parentClass; + uint32_t clientCalls; +} IOUserClientCounter; + +class IOStatistics { + static bool enabled; + + static IORWLock *lock; + + static uint32_t sequenceID; + + static uint32_t lastKextIndex; + static uint32_t lastClassIndex; + + static uint32_t loadedKexts; + static uint32_t registeredClasses; + static uint32_t registeredCounters; + static uint32_t registeredWorkloops; + + static uint32_t attachedEventSources; + + static KextNode *kextHint; + + static IOWorkLoopDependency *nextWorkLoopDependency; + + typedef RB_HEAD(KextTree, KextNode) KextTreeHead; + static KextTreeHead kextHead; + static int kextNodeCompare(KextNode *e1, KextNode *e2); + RB_PROTOTYPE_SC(static, KextTree, KextNode, link, kextNodeCompare); + + typedef RB_HEAD(KextAddressTree, KextNode) KextAddressTreeHead; + static KextAddressTreeHead kextAddressHead; + static int kextAddressNodeCompare(KextNode *e1, KextNode *e2); + RB_PROTOTYPE_SC(static, KextAddressTree, KextNode, addressLink, kextAddressNodeCompare); + + typedef RB_HEAD(ClassTree, ClassNode) ClassTreeHead; + static ClassTreeHead classHead; + static int classNodeCompare(ClassNode *e1, ClassNode *e2); + RB_PROTOTYPE_SC(static, ClassTree, ClassNode, tLink, classNodeCompare); + + static int oid_sysctl(__unused struct sysctl_oid *oidp, __unused void *arg1, int arg2, struct sysctl_req *req); + + 
static uint32_t copyGlobalStatistics(IOStatisticsGlobal *stats); + static uint32_t copyKextStatistics(IOStatisticsKext *stats); + static uint32_t copyMemoryStatistics(IOStatisticsMemory *stats); + static uint32_t copyClassStatistics(IOStatisticsClass *stats); + static uint32_t copyCounterStatistics(IOStatisticsCounter *stats); + static uint32_t copyKextIdentifiers(IOStatisticsKextIdentifier *kextIDs); + static uint32_t copyClassNames(IOStatisticsClassName *classNames); + + static uint32_t copyWorkLoopStatistics(IOStatisticsWorkLoop *workLoopStats); + + static uint32_t copyUserClientStatistics(IOStatisticsUserClientHeader *stats, uint32_t loadTag); + + static void updateAllocationCounter(vm_offset_t address, uint32_t index, vm_size_t size); + + static void storeUserClientCallInfo(IOUserClient *userClient, IOUserClientCounter *counter); + + static KextNode *getKextNodeFromBacktrace(boolean_t write); + static void releaseKextNode(KextNode *node); + +public: + + static void initialize(); + + static void onKextLoad(OSKext *kext, kmod_info_t *kmod_info); + static void onKextUnload(OSKext *kext); + static void onClassAdded(OSKext *parentKext, OSMetaClass *metaClass); + static void onClassRemoved(OSKext *parentKext, OSMetaClass *metaClass); + + static IOEventSourceCounter *registerEventSource(OSObject *inOwner); + static void unregisterEventSource(IOEventSourceCounter *counter); + + static IOWorkLoopCounter *registerWorkLoop(IOWorkLoop *workLoop); + static void unregisterWorkLoop(IOWorkLoopCounter *counter); + + static IOUserClientCounter *registerUserClient(IOUserClient *userClient); + static void unregisterUserClient(IOUserClientCounter *counter); + + static int getStatistics(sysctl_req *req); + static int getWorkLoopStatistics(sysctl_req *req); + static int getUserClientStatistics(sysctl_req *req); + + /* Inlines for counter manipulation. 
+ * + * NOTE: counter access is not expressly guarded here so as not to incur performance penalties + * in the instrumented parent objects. Writes are arranged so as to be protected by pre-existing + * locks in the parent where appropriate, but reads have no such guarantee. Counters should + * therefore be regarded as providing an indication of current state, rather than precisely + * accurate statistics. + */ + + static inline void setCounterType(IOEventSourceCounter *counter, IOStatisticsCounterType type) { + if (counter) { + counter->type = type; + } + } + + static inline void countOpenGate(IOEventSourceCounter *counter) { + if (counter) { + counter->timeOnGate += mach_absolute_time() - counter->startTimeStamp; + counter->openGateCalls++; + } + } + + static inline void countCloseGate(IOEventSourceCounter *counter) { + if (counter) { + counter->startTimeStamp = mach_absolute_time(); + counter->closeGateCalls++; + } + } + + /* Interrupt */ + static inline void countInterruptCheckForWork(IOEventSourceCounter *counter) { + if (counter) { + counter->u.interrupt.checksForWork++; + } + } + + static inline void countInterrupt(IOEventSourceCounter *counter) { + if (counter) { + counter->u.interrupt.produced++; + } + } + + /* CommandQueue */ + static inline void countCommandQueueActionCall(IOEventSourceCounter *counter) { + if (counter) { + counter->u.commandQueue.actionCalls++; + } + } + + /* CommandGate */ + static inline void countCommandGateActionCall(IOEventSourceCounter *counter) { + if (counter) { + counter->u.commandGate.actionCalls++; + } + } + + /* Timer */ + static inline void countTimerTimeout(IOEventSourceCounter *counter) { + if (counter) { + counter->u.timer.timeouts++; + } + } + + /* WorkLoop */ + static void attachWorkLoopEventSource(IOWorkLoopCounter *wlc, IOEventSourceCounter *esc); + static void detachWorkLoopEventSource(IOWorkLoopCounter *wlc, IOEventSourceCounter *esc); + + static inline void countWorkLoopOpenGate(IOWorkLoopCounter *counter) { + if 
(counter) { + counter->timeOnGate += mach_absolute_time() - counter->startTimeStamp; + counter->openGateCalls++; + } + } + + static inline void countWorkLoopCloseGate(IOWorkLoopCounter *counter) { + if (counter) { + counter->startTimeStamp = mach_absolute_time(); + counter->closeGateCalls++; + } + } + + /* IOLib allocations */ + static void countAlloc(uint32_t index, vm_size_t size); + + /* UserClient */ + static void countUserClientCall(IOUserClient *client); +}; + +#else + +/* Statistics disabled */ + +class IOStatistics { +public: + static void initialize() {} +}; + +#endif /* IOKITSTATS */ + +#endif /* __IOKIT_STATISTICS_PRIVATE_H */ diff --git a/iokit/IOKit/IOTimeStamp.h b/iokit/IOKit/IOTimeStamp.h index a1d22f4d3..b551fd723 100644 --- a/iokit/IOKit/IOTimeStamp.h +++ b/iokit/IOKit/IOTimeStamp.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -190,4 +190,8 @@ IOTimeStamp(uintptr_t csc, #define IOSERVICE_TERMINATE_STOP_DEFER 16 /* 0x05080040 */ #define IOSERVICE_TERMINATE_DONE 17 /* 0x05080044 */ +#define IOSERVICE_KEXTD_ALIVE 18 /* 0x05080048 */ +#define IOSERVICE_KEXTD_READY 19 /* 0x0508004C */ +#define IOSERVICE_REGISTRY_QUIET 20 /* 0x05080050 */ + #endif /* ! IOKIT_IOTIMESTAMP_H */ diff --git a/iokit/IOKit/IOTimerEventSource.h b/iokit/IOKit/IOTimerEventSource.h index 7cc0d38c3..bbbeaf964 100644 --- a/iokit/IOKit/IOTimerEventSource.h +++ b/iokit/IOKit/IOTimerEventSource.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2000, 2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -95,10 +95,6 @@ class IOTimerEventSource : public IOEventSource @abstract Sub-class implementation of free method, frees calloutEntry */ virtual void free(); -/*! 
@function checkForWork - @abstract Have to implement it is mandatory in $link IOEventSource, but IOTimerEventSources don't actually use this work-loop mechanism. */ - virtual bool checkForWork(); - virtual void setWorkLoop(IOWorkLoop *workLoop); public: diff --git a/iokit/IOKit/IOTypes.h b/iokit/IOKit/IOTypes.h index 9f5d5a3f7..3c41ab070 100644 --- a/iokit/IOKit/IOTypes.h +++ b/iokit/IOKit/IOTypes.h @@ -164,7 +164,12 @@ typedef unsigned int IOAlignment; #ifndef __IOKIT_PORTS_DEFINED__ #define __IOKIT_PORTS_DEFINED__ #ifdef KERNEL +#ifdef __cplusplus +class OSObject; +typedef OSObject * io_object_t; +#else typedef struct OSObject * io_object_t; +#endif #else /* KERNEL */ typedef mach_port_t io_object_t; #endif /* KERNEL */ diff --git a/iokit/IOKit/IOUserClient.h b/iokit/IOKit/IOUserClient.h index 7283ebd41..c3c40c57a 100644 --- a/iokit/IOKit/IOUserClient.h +++ b/iokit/IOKit/IOUserClient.h @@ -37,6 +37,9 @@ #include #include +#if IOKITSTATS +#include +#endif enum { kIOUCTypeMask = 0x0000000f, @@ -164,18 +167,29 @@ enum { class IOUserClient : public IOService { OSDeclareAbstractStructors(IOUserClient) +#if IOKITSTATS + friend class IOStatistics; +#endif protected: /*! @struct ExpansionData @discussion This structure will be used to expand the capablilties of this class in the future. */ - struct ExpansionData { }; + struct ExpansionData { +#if IOKITSTATS + IOUserClientCounter *counter; +#else + void *iokitstatsReserved; +#endif + }; /*! @var reserved Reserved for future use. (Internal use only) */ ExpansionData * reserved; + bool reserve(); + #ifdef XNU_KERNEL_PRIVATE public: #else diff --git a/iokit/IOKit/IOWorkLoop.h b/iokit/IOKit/IOWorkLoop.h index 808329ada..e248a9b3b 100644 --- a/iokit/IOKit/IOWorkLoop.h +++ b/iokit/IOKit/IOWorkLoop.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2007 Apple Inc. All rights reserved. + * Copyright (c) 1998-2009 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,14 +25,6 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* -Copyright (c) 1998 Apple Computer, Inc. All rights reserved. -HISTORY - 1998-7-13 Godfrey van der Linden(gvdl) - Created. - 1998-10-30 Godfrey van der Linden(gvdl) - Converted to C++ -*/ #ifndef __IOKIT_IOWORKLOOP_H #define __IOKIT_IOWORKLOOP_H @@ -44,6 +36,10 @@ HISTORY #include +#if IOKITSTATS +#include +#endif + class IOEventSource; class IOTimerEventSource; class IOCommandGate; @@ -87,7 +83,14 @@ member function's parameter list. @abstract Static function that calls the threadMain function. */ static void threadMainContinuation(IOWorkLoop *self); - + +/*! @function eventSourcePerformsWork + @abstract Checks if the event source passed in overrides checkForWork() to perform any work. +IOWorkLoop uses this to determine if the event source should be polled in runEventSources() or not. + @param inEventSource The event source to check. +*/ + bool eventSourcePerformsWork(IOEventSource *inEventSource); + protected: /*! @typedef maintCommandEnum @@ -138,6 +141,15 @@ member function's parameter list. */ struct ExpansionData { IOOptionBits options; + IOEventSource *passiveEventChain; +#if DEBUG + void * allocationBacktrace[16]; +#endif /* DEBUG */ +#if IOKITSTATS + struct IOWorkLoopCounter *counter; +#else + void *iokitstatsReserved; +#endif }; /*! @var reserved @@ -237,13 +249,13 @@ member function's parameter list. /*! @function enableAllInterrupts @abstract Calls enable() in all interrupt event sources. - @discussion For all event sources (ES) for which IODynamicCast(IOInterruptEventSource, ES) is valid, in eventChain call enable() function. See IOEventSource::enable(). + @discussion For all event sources (ES) for which OSDynamicCast(IOInterruptEventSource, ES) is valid, in eventChain call enable() function. See IOEventSource::enable(). */ virtual void enableAllInterrupts() const; /*! 
@function disableAllInterrupts @abstract Calls disable() in all interrupt event sources. - @discussion For all event sources (ES) for which IODynamicCast(IOInterruptEventSource, ES) is valid, in eventChain call disable() function. See IOEventSource::disable(). + @discussion For all event sources (ES) for which OSDynamicCast(IOInterruptEventSource, ES) is valid, in eventChain call disable() function. See IOEventSource::disable(). */ virtual void disableAllInterrupts() const; @@ -252,6 +264,9 @@ member function's parameter list. // Internal APIs used by event sources to control the thread friend class IOEventSource; friend class IOTimerEventSource; +#if IOKITSTATS + friend class IOStatistics; +#endif virtual void signalWorkAvailable(); virtual void openGate(); virtual void closeGate(); diff --git a/iokit/IOKit/Makefile b/iokit/IOKit/Makefile index 23d52274b..7b3c8df3e 100644 --- a/iokit/IOKit/Makefile +++ b/iokit/IOKit/Makefile @@ -3,9 +3,13 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir -IOKIT_FRAMEDIR = $(FRAMEDIR)/IOKit.framework/Versions/A -export INCDIR = $(IOKIT_FRAMEDIR)/Headers -export LCLDIR = $(IOKIT_FRAMEDIR)/PrivateHeaders +IOKIT_INCVERS = A +IOKIT_INCFRAME = $(FRAMEDIR)/IOKit.framework +IOKIT_INCDIR = $(IOKIT_INCFRAME)/Versions/$(IOKIT_INCVERS)/Headers +IOKIT_PINCDIR = $(IOKIT_INCFRAME)/Versions/$(IOKIT_INCVERS)/PrivateHeaders + +export INCDIR = $(IOKIT_INCDIR) +export LCLDIR = $(IOKIT_PINCDIR) include $(MakeInc_cmd) include $(MakeInc_def) @@ -18,20 +22,13 @@ INSTINC_SUBDIRS = \ rtc \ system_management -INSTINC_SUBDIRS_PPC = \ - ppc - -INSTINC_SUBDIRS_I386 = \ - i386 +INSTINC_SUBDIRS_I386 = -INSTINC_SUBDIRS_X86_64 = \ - i386 +INSTINC_SUBDIRS_X86_64 = -INSTINC_SUBDIRS_ARM = \ - arm +INSTINC_SUBDIRS_ARM = EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} -EXPINC_SUBDIRS_PPC = ${INSTINC_SUBDIRS_PPC} EXPINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS_I386} 
EXPINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS_X86_64} EXPINC_SUBDIRS_ARM = ${INSTINC_SUBDIRS_ARM} @@ -41,7 +38,9 @@ NOT_EXPORT_HEADERS = NOT_KF_MI_HEADERS = $(NOT_EXPORT_HEADERS) \ IOKitKeysPrivate.h IOCPU.h \ IOHibernatePrivate.h IOPolledInterface.h \ - IOCommandQueue.h IOLocksPrivate.h + IOCommandQueue.h IOLocksPrivate.h \ + AppleKeyStoreInterface.h \ + IOStatistics.h IOStatisticsPrivate.h NOT_LOCAL_HEADERS = @@ -51,7 +50,7 @@ INSTALL_MI_LIST = IOBSD.h IOKitKeys.h IOKitServer.h IOReturn.h\ IOSharedLock.h IOTypes.h OSMessageNotification.h\ IODataQueueShared.h IOMessage.h -INSTALL_MI_LCL_LIST = IOKitKeysPrivate.h IOHibernatePrivate.h IOLocksPrivate.h +INSTALL_MI_LCL_LIST = IOKitKeysPrivate.h IOHibernatePrivate.h IOLocksPrivate.h IOStatistics.h AppleKeyStoreInterface.h INSTALL_MI_DIR = . diff --git a/iokit/IOKit/i386/IOSharedLockImp.h b/iokit/IOKit/i386/IOSharedLockImp.h deleted file mode 100644 index cb15fb1d8..000000000 --- a/iokit/IOKit/i386/IOSharedLockImp.h +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1998 Apple Computer, Inc. All rights reserved. - * - * HISTORY - * - */ - -/* Copyright (c) 1992 NeXT Computer, Inc. All rights reserved. - * - * EventShmemLock.h - Shared memory area locks for use between the - * WindowServer and the Event Driver. - * - * - * HISTORY - * 29 April 1992 Mike Paquette at NeXT - * Created. - * - * Multiprocessor locks used within the shared memory area between the - * kernel and event system. These must work in both user and kernel mode. - * The locks are defined in an include file so they get exported to the local - * include file area. - * - * This is basically a ripoff of the spin locks under the cthreads packages. - */ - -#ifndef _IOKIT_IOSHAREDLOCKIMP_H -#define _IOKIT_IOSHAREDLOCKIMP_H - -#include - -#ifndef KERNEL -#error this file for kernel only; comm page has user versions -#endif - - TEXT - -/* - * void - * ev_unlock(p) - * int *p; - * - * Unlock the lock pointed to by p. - */ -LEAF(_ev_unlock, 0) -LEAF(_IOSpinUnlock, 0) -#if __x86_64__ - movl $0, (%rdi) -#else - movl 4(%esp), %ecx - movl $0, (%ecx) -#endif -END(_ev_unlock) - - -/* - * int - * ev_try_lock(p) - * int *p; - * - * Try to lock p. Return zero if not successful. 
- */ - -LEAF(_ev_try_lock, 0) -LEAF(_IOTrySpinLock, 0) -#if __x86_64__ - xorl %eax, %eax - orl $-1, %edx - lock - cmpxchgl %edx, (%rdi) - setz %dl - movzbl %dl, %eax -#else - movl 4(%esp), %ecx - xorl %eax, %eax - lock - cmpxchgl %ecx, (%ecx) - jne 1f - movl $1, %eax /* yes */ - ret -1: - xorl %eax, %eax /* no */ -#endif -END(_ev_try_lock) - - -#endif /* ! _IOKIT_IOSHAREDLOCKIMP_H */ diff --git a/iokit/IOKit/machine/Makefile b/iokit/IOKit/machine/Makefile index 4a77745b4..14dd46d76 100644 --- a/iokit/IOKit/machine/Makefile +++ b/iokit/IOKit/machine/Makefile @@ -14,12 +14,10 @@ MI_DIR = machine EXCLUDE_HEADERS = INSTINC_SUBDIRS = -INSTINC_SUBDIRS_PPC = INSTINC_SUBDIRS_I386 = INSTINC_SUBDIRS_X86_64 = EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} -EXPINC_SUBDIRS_PPC = ${INSTINC_SUBDIRS_PPC} EXPINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS_I386} EXPINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS_X86_64} diff --git a/iokit/IOKit/nvram/Makefile b/iokit/IOKit/nvram/Makefile index 3235dd242..2a3da6d3c 100644 --- a/iokit/IOKit/nvram/Makefile +++ b/iokit/IOKit/nvram/Makefile @@ -14,13 +14,11 @@ MI_DIR = nvram NOT_EXPORT_HEADERS = INSTINC_SUBDIRS = -INSTINC_SUBDIRS_PPC = INSTINC_SUBDIRS_I386 = INSTINC_SUBDIRS_X86_64 = INSTINC_SUBDIRS_ARM = EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} -EXPINC_SUBDIRS_PPC = ${INSTINC_SUBDIRS_PPC} EXPINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS_I386} EXPINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS_X86_64} EXPINC_SUBDIRS_ARM = ${INSTINC_SUBDIRS_ARM} diff --git a/iokit/IOKit/platform/Makefile b/iokit/IOKit/platform/Makefile index 644b0b114..7d5079f87 100644 --- a/iokit/IOKit/platform/Makefile +++ b/iokit/IOKit/platform/Makefile @@ -15,13 +15,11 @@ NOT_EXPORT_HEADERS = NOT_KF_MI_HEADERS = INSTINC_SUBDIRS = -INSTINC_SUBDIRS_PPC = INSTINC_SUBDIRS_I386 = INSTINC_SUBDIRS_X86_64 = INSTINC_SUBDIRS_ARM = EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} -EXPINC_SUBDIRS_PPC = ${INSTINC_SUBDIRS_PPC} EXPINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS_I386} EXPINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS_X86_64} EXPINC_SUBDIRS_ARM = 
${INSTINC_SUBDIRS_ARM} diff --git a/iokit/IOKit/power/Makefile b/iokit/IOKit/power/Makefile index dcebcdb9b..fd1518bd7 100644 --- a/iokit/IOKit/power/Makefile +++ b/iokit/IOKit/power/Makefile @@ -14,13 +14,11 @@ MI_DIR = power NOT_EXPORT_HEADERS = INSTINC_SUBDIRS = -INSTINC_SUBDIRS_PPC = INSTINC_SUBDIRS_I386 = INSTINC_SUBDIRS_X86_64 = INSTINC_SUBDIRS_ARM = EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} -EXPINC_SUBDIRS_PPC = ${INSTINC_SUBDIRS_PPC} EXPINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS_I386} EXPINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS_X86_64} EXPINC_SUBDIRS_ARM = ${INSTINC_SUBDIRS_ARM} diff --git a/iokit/IOKit/ppc/IODBDMA.h b/iokit/IOKit/ppc/IODBDMA.h deleted file mode 100644 index afe1337bb..000000000 --- a/iokit/IOKit/ppc/IODBDMA.h +++ /dev/null @@ -1,367 +0,0 @@ -/* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1997 Apple Computer, Inc. - * - * - * HISTORY - * - * Simon Douglas 10 Nov 97 - * - first checked in, mostly from MacOS DBDMA.i, machdep/ppc/dbdma.h - * but use byte reverse ops. - */ - -#ifndef _IODBDMA_H_ -#define _IODBDMA_H_ - -#include -#include - - -/* DBDMA definitions */ - -struct IODBDMAChannelRegisters { - volatile unsigned long channelControl; - volatile unsigned long channelStatus; - volatile unsigned long commandPtrHi; /* implementation optional*/ - volatile unsigned long commandPtrLo; - volatile unsigned long interruptSelect; /* implementation optional*/ - volatile unsigned long branchSelect; /* implementation optional*/ - volatile unsigned long waitSelect; /* implementation optional*/ - volatile unsigned long transferModes; /* implementation optional*/ - volatile unsigned long data2PtrHi; /* implementation optional*/ - volatile unsigned long data2PtrLo; /* implementation optional*/ - - volatile unsigned long reserved1; - volatile unsigned long addressHi; /* implementation optional*/ - volatile unsigned long reserved2[4]; - volatile unsigned long unimplemented[16]; - -/* This structure must remain fully padded to 256 bytes.*/ - volatile unsigned long undefined[32]; -}; -typedef struct IODBDMAChannelRegisters IODBDMAChannelRegisters; - -/* These constants define the DB-DMA channel control words and status flags.*/ - -enum { - kdbdmaRun = 0x00008000, - kdbdmaPause = 0x00004000, - kdbdmaFlush = 0x00002000, - kdbdmaWake = 0x00001000, - kdbdmaDead = 0x00000800, - kdbdmaActive = 0x00000400, - kdbdmaBt = 0x00000100, - kdbdmaS7 = 0x00000080, - kdbdmaS6 = 0x00000040, - kdbdmaS5 = 0x00000020, - kdbdmaS4 = 0x00000010, - kdbdmaS3 = 0x00000008, - kdbdmaS2 = 0x00000004, - kdbdmaS1 = 0x00000002, - kdbdmaS0 = 0x00000001 -}; - - -#define IOSetDBDMAChannelControlBits(mask) ( ((mask) | 
(mask) << 16) ) -#define IOClearDBDMAChannelControlBits(mask) ( (mask) << 16) - - -/* This structure defines the DB-DMA channel command descriptor.*/ - -/* - *** WARNING: Endian-ness issues must be considered when performing load/store! *** -*/ - -struct IODBDMADescriptor { - unsigned long operation; /* cmd || key || i || b || w || reqCount*/ - unsigned long address; - volatile unsigned long cmdDep; - volatile unsigned long result; /* xferStatus || resCount*/ -}; -typedef struct IODBDMADescriptor IODBDMADescriptor; - -/* These constants define the DB-DMA channel command operations and modifiers.*/ - - -enum { -/* Command.cmd operations*/ - kdbdmaOutputMore = 0, - kdbdmaOutputLast = 1, - kdbdmaInputMore = 2, - kdbdmaInputLast = 3, - kdbdmaStoreQuad = 4, - kdbdmaLoadQuad = 5, - kdbdmaNop = 6, - kdbdmaStop = 7 -}; - - -enum { -/* Command.key modifiers (choose one for INPUT, OUTPUT, LOAD, and STORE)*/ - kdbdmaKeyStream0 = 0, /* default modifier*/ - kdbdmaKeyStream1 = 1, - kdbdmaKeyStream2 = 2, - kdbdmaKeyStream3 = 3, - kdbdmaKeyRegs = 5, - kdbdmaKeySystem = 6, - kdbdmaKeyDevice = 7, - - kdbdmaIntNever = 0, /* default modifier*/ - kdbdmaIntIfTrue = 1, - kdbdmaIntIfFalse = 2, - kdbdmaIntAlways = 3, - - kdbdmaBranchNever = 0, /* default modifier*/ - kdbdmaBranchIfTrue = 1, - kdbdmaBranchIfFalse = 2, - kdbdmaBranchAlways = 3, - - kdbdmaWaitNever = 0, /* default modifier*/ - kdbdmaWaitIfTrue = 1, - kdbdmaWaitIfFalse = 2, - kdbdmaWaitAlways = 3, - - kdbdmaCommandMask = (long)0xFFFF0000, - kdbdmaReqCountMask = 0x0000FFFF -}; - - -/* These constants define the DB-DMA channel command results.*/ - -enum { - /* result masks*/ - kdbdmaStatusRun = kdbdmaRun << 16, - kdbdmaStatusPause = kdbdmaPause << 16, - kdbdmaStatusFlush = kdbdmaFlush << 16, - kdbdmaStatusWake = kdbdmaWake << 16, - kdbdmaStatusDead = kdbdmaDead << 16, - kdbdmaStatusActive = kdbdmaActive << 16, - kdbdmaStatusBt = kdbdmaBt << 16, - kdbdmaStatusS7 = kdbdmaS7 << 16, - kdbdmaStatusS6 = kdbdmaS6 << 16, - 
kdbdmaStatusS5 = kdbdmaS5 << 16, - kdbdmaStatusS4 = kdbdmaS4 << 16, - kdbdmaStatusS3 = kdbdmaS3 << 16, - kdbdmaStatusS2 = kdbdmaS2 << 16, - kdbdmaStatusS1 = kdbdmaS1 << 16, - kdbdmaStatusS0 = kdbdmaS0 << 16, - kdbdmaResCountMask = 0x0000FFFF, - kdbdmaXferStatusMask = 0xFFFF0000 -}; - - -/* These macros are are IODBDMAChannelRegisters accessor functions. */ - -#define IOSetDBDMAChannelRegister(registerSetPtr,field,value) \ -OSWriteSwapInt32(registerSetPtr,offsetof(IODBDMAChannelRegisters,field),value) - -#define IOGetDBDMAChannelRegister(registerSetPtr, field) \ -OSReadSwapInt32(registerSetPtr,offsetof(IODBDMAChannelRegisters, field)) - - -/* void IOSetDBDMAChannelControl (IODBDMAChannelRegisters *registerSetPtr, unsigned long ctlValue); */ - -#define IOSetDBDMAChannelControl(registerSetPtr,ctlValue) \ -do { \ - eieio(); \ - IOSetDBDMAChannelRegister(registerSetPtr,channelControl,ctlValue); \ - eieio(); \ -} while(0) - -/* unsigned long IOGetDBDMAChannelStatus (IODBDMAChannelRegisters *registerSetPtr); */ - -#define IOGetDBDMAChannelStatus(registerSetPtr) \ - IOGetDBDMAChannelRegister(registerSetPtr,channelStatus) - -/* unsigned long IOGetDBDMACommandPtr (IODBDMAChannelRegisters *registerSetPtr); */ - -#define IOGetDBDMACommandPtr(registerSetPtr) \ - IOGetDBDMAChannelRegister(registerSetPtr,commandPtrLo) - -/* void IOSetDBDMACommandPtr (IODBDMAChannelRegisters *registerSetPtr, unsigned long cclPtr); */ - -#define IOSetDBDMACommandPtr(registerSetPtr,cclPtr) \ -do { \ - IOSetDBDMAChannelRegister(registerSetPtr,commandPtrHi,0); \ - eieio(); \ - IOSetDBDMAChannelRegister(registerSetPtr,commandPtrLo,cclPtr); \ - eieio(); \ -} while(0) - - -/* unsigned long IOGetDBDMAInterruptSelect (IODBDMAChannelRegisters *registerSetPtr); */ - -#define IOGetDBDMAInterruptSelect(registerSetPtr) \ - IOGetDBDMAChannelRegister(registerSetPtr,interruptSelect) - -/* void IOSetDBDMAInterruptSelect (IODBDMAChannelRegisters *registerSetPtr, unsigned long intSelValue); */ - -#define 
IOSetDBDMAInterruptSelect(registerSetPtr,intSelValue) \ -do { \ - IOSetDBDMAChannelRegister(registerSetPtr,interruptSelect,intSelValue); \ - eieio(); \ -} while(0) - -/* unsigned long IOGetDBDMABranchSelect (IODBDMAChannelRegisters *registerSetPtr); */ - -#define IOGetDBDMABranchSelect(registerSetPtr) \ - IOGetDBDMAChannelRegister(registerSetPtr,branchSelect) - -/* void IOSetDBDMABranchSelect (IODBDMAChannelRegisters *registerSetPtr, unsigned long braSelValue); */ - -#define IOSetDBDMABranchSelect(registerSetPtr,braSelValue) \ -do { \ - IOSetDBDMAChannelRegister(registerSetPtr,branchSelect,braSelValue); \ - eieio(); \ -} while(0) - -/* unsigned long IOGetDBDMAWaitSelect (IODBDMAChannelRegisters *registerSetPtr); */ - -#define IOGetDBDMAWaitSelect(registerSetPtr) \ - IOGetDBDMAChannelRegister(registerSetPtr,waitSelect) - -/* void IOSetDBDMAWaitSelect (IODBDMAChannelRegisters *registerSetPtr, unsigned long waitSelValue); */ - -#define IOSetDBDMAWaitSelect(registerSetPtr,waitSelValue) \ -do { \ - IOSetDBDMAChannelRegister(registerSetPtr,waitSelect,waitSelValue); \ - eieio(); \ -} while(0) - - -/* These macros are IODBDMADescriptor accessor functions. 
*/ - -#define IOSetDBDMADescriptor(descPtr,field,value) \ -OSWriteSwapInt32( descPtr, offsetof( IODBDMADescriptor, field), value) - -#define IOGetDBDMADescriptor(descPtr,field) \ -OSReadSwapInt32( descPtr, offsetof( IODBDMADescriptor, field)) - -#define IOMakeDBDMAOperation(cmd,key,interrupt,branch,wait,count) \ - ( ((cmd) << 28) | ((key) << 24) | ((interrupt) << 20) \ - | ((branch) << 18) | ( (wait) << 16) | (count) ) - -/* void IOMakeDBDMADescriptor (IODBDMADescriptor *descPtr, - unsigned long cmd, - unsigned long key, - unsigned long interrupt, - unsigned long branch, - unsigned long wait, - unsigned long count, - unsigned long addr); */ - -#define IOMakeDBDMADescriptor(descPtr,cmd,key,interrupt,branch,wait,count,addr)\ -do { \ - IOSetDBDMADescriptor(descPtr, address, addr); \ - IOSetDBDMADescriptor(descPtr, cmdDep, 0); \ - IOSetDBDMADescriptor(descPtr, result, 0); \ - eieio(); \ - IOSetDBDMADescriptor(descPtr, operation, \ - IOMakeDBDMAOperation(cmd,key,interrupt,branch,wait,count)); \ - eieio(); \ -} while(0) - -/* void IOMakeDBDMADescriptorDep (IODBDMADescriptor *descPtr, - unsigned long cmd, - unsigned long key, - unsigned long interrupt, - unsigned long branch, - unsigned long wait, - unsigned long count, - unsigned long addr, - unsigned long dep); */ - -#define IOMakeDBDMADescriptorDep(descPtr,cmd,key,interrupt,branch,wait,count,addr,dep) \ -do { \ - IOSetDBDMADescriptor(descPtr, address, addr); \ - IOSetDBDMADescriptor(descPtr, cmdDep, dep); \ - IOSetDBDMADescriptor(descPtr, result, 0); \ - eieio(); \ - IOSetDBDMADescriptor(descPtr, operation, \ - IOMakeDBDMAOperation(cmd, key, interrupt, branch, wait, count)); \ - eieio(); \ -} while(0) - -/* Field accessors - NOTE: unsynchronized */ - -/* unsigned long IOGetDBDMAOperation (IODBDMADescriptor *descPtr) */ - -#define IOGetCCOperation(descPtr) \ - IOGetDBDMADescriptor(descPtr,operation) - -/* void IOSetCCOperation (IODBDMADescriptor *descPtr, unsigned long operationValue) */ - -#define 
IOSetCCOperation(descPtr,operationValue) \ - IOSetDBDMADescriptor(descPtr,operation,operationValue) - -/* unsigned long IOGetCCAddress (IODBDMADescriptor *descPtr) */ - -#define IOGetCCAddress(descPtr) \ - IOGetDBDMADescriptor(descPtr,address) - -/* void IOSetCCAddress (IODBDMADescriptor *descPtr, unsigned long addressValue) */ - -#define IOSetCCAddress(descPtr,addressValue) \ - IOSetDBDMADescriptor(descPtr,address, addressValue) - -/* unsigned long IOGetCCCmdDep (IODBDMADescriptor *descPtr) */ - -#define IOGetCCCmdDep(descPtr) \ - IOGetDBDMADescriptor(descPtr,cmdDep) - -/* void IOSetCCCmdDep (IODBDMADescriptor *descPtr, unsigned long cmdDepValue) */ - -#define IOSetCCCmdDep(descPtr,cmdDepValue) \ - IOSetDBDMADescriptor(descPtr,cmdDep,cmdDepValue) - -/* unsigned long IOGetCCResult (IODBDMADescriptor *descPtr) */ - -#define IOGetCCResult(descPtr) \ - IOGetDBDMADescriptor(descPtr,result) - -/* void IOSetCCResult (IODBDMADescriptor *descPtr, unsigned long resultValue) */ - -#define IOSetCCResult(descPtr,resultValue) \ - IOSetDBDMADescriptor(descPtr,result,resultValue) - - -/* DBDMA routines */ - -extern void IODBDMAStart( volatile IODBDMAChannelRegisters *registerSetPtr, volatile IODBDMADescriptor *physicalDescPtr); -extern void IODBDMAStop( volatile IODBDMAChannelRegisters *registerSetPtr); -extern void IODBDMAFlush( volatile IODBDMAChannelRegisters *registerSetPtr); -extern void IODBDMAReset( volatile IODBDMAChannelRegisters *registerSetPtr); -extern void IODBDMAContinue( volatile IODBDMAChannelRegisters *registerSetPtr); -extern void IODBDMAPause( volatile IODBDMAChannelRegisters *registerSetPtr); - -extern IOReturn IOAllocatePhysicallyContiguousMemory( unsigned int size, unsigned int options, - IOVirtualAddress * logical, IOPhysicalAddress * physical ); -extern IOReturn IOFreePhysicallyContiguousMemory( IOVirtualAddress * logical, unsigned int size); - -#endif /* !defined(_IODBDMA_H_) */ diff --git a/iokit/IOKit/ppc/IOSharedLockImp.h 
b/iokit/IOKit/ppc/IOSharedLockImp.h deleted file mode 100644 index 8c685b223..000000000 --- a/iokit/IOKit/ppc/IOSharedLockImp.h +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1998 Apple Computer, Inc. All rights reserved. - * - * HISTORY - * - */ - -/* Copyright (c) 1992 NeXT Computer, Inc. All rights reserved. - * - * EventShmemLock.h - Shared memory area locks for use between the - * WindowServer and the Event Driver. - * - * HISTORY - * 30 Nov 1992 Ben Fathi (benf@next.com) - * Ported to m98k. - * - * 29 April 1992 Mike Paquette at NeXT - * Created. 
- * - * Multiprocessor locks used within the shared memory area between the - * kernel and event system. These must work in both user and kernel mode. - * The locks are defined in an include file so they get exported to the local - * include file area. - */ - - -#ifndef _IOKIT_IOSHAREDLOCKIMP_H -#define _IOKIT_IOSHAREDLOCKIMP_H - -#include -#ifdef KERNEL -#undef END -#include -#endif - -/* - * void - * ev_lock(p) - * register int *p; - * - * Lock the lock pointed to by p. Spin (possibly forever) until - * the lock is available. Test and test and set logic used. - */ - TEXT - -#ifndef KERNEL -LEAF(_ev_lock) - - li a6,1 // lock value - -8: lwz a7,0(a0) // Get lock word - mr. a7,a7 // Is it held? - bne-- 8b // Yup... - -9: lwarx a7,0,a0 // read the lock - mr. a7,a7 // Is it held? - bne-- 7f // yes, kill reservation - stwcx. a6,0,a0 // try to get the lock - bne-- 9b // failed, try again - isync - blr // got it, return - -7: li a7,-4 // Point to a spot in the red zone - stwcx. a7,a7,r1 // Kill reservation - b 8b // Go wait some more... - - -END(_ev_lock) - -LEAF(_IOSpinLock) - - li a6,1 // lock value - -8: lwz a7,0(a0) // Get lock word - mr. a7,a7 // Is it held? - bne-- 8b // Yup... - -9: lwarx a7,0,a0 // read the lock - mr. a7,a7 // Is it held? - bne-- 7f // yes, kill reservation - stwcx. a6,0,a0 // try to get the lock - bne-- 9b // failed, try again - isync - blr // got it, return - -7: li a7,-4 // Point to a spot in the red zone - stwcx. a7,a7,r1 // Kill reservation - b 8b // Go wait some more... -END(_IOSpinLock) -#endif - -/* - * void - * spin_unlock(p) - * int *p; - * - * Unlock the lock pointed to by p. - */ - -LEAF(_ev_unlock) - sync - li a7,0 - stw a7,0(a0) - blr -END(_ev_unlock) - -LEAF(_IOSpinUnlock) - sync - li a7,0 - stw a7,0(a0) - blr -END(_IOSpinUnlock) - - -/* - * ev_try_lock(p) - * int *p; - * - * Try to lock p. Return TRUE if successful in obtaining lock. - */ - -LEAF(_ev_try_lock) - li a6,1 // lock value - - lwz a7,0(a0) // Get lock word - mr. 
a7,a7 // Is it held? - bne-- 6f // Yup... - -9: lwarx a7,0,a0 // read the lock - mr. a7,a7 // Is it held? - bne-- 7f // yes, kill reservation - stwcx. a6,0,a0 // try to get the lock - bne-- 9b // failed, try again - li a0,1 // return TRUE - isync - blr // got it, return - -7: li a7,-4 // Point to a spot in the red zone - stwcx. a7,a7,r1 // Kill reservation - -6: - li a0,0 // return FALSE - blr - -END(_ev_try_lock) - -LEAF(_IOTrySpinLock) - li a6,1 // lock value - - lwz a7,0(a0) // Get lock word - mr. a7,a7 // Is it held? - bne-- 6f // Yup... - -9: lwarx a7,0,a0 // read the lock - mr. a7,a7 // Is it held? - bne-- 7f // yes, kill reservation - stwcx. a6,0,a0 // try to get the lock - bne-- 9b // failed, try again - li a0,1 // return TRUE - isync - blr // got it, return - -7: li a7,-4 // Point to a spot in the red zone - stwcx. a7,a7,r1 // Kill reservation - -6: - li a0,0 // return FALSE - blr - -END(_IOTrySpinLock) - -#endif /* ! _IOKIT_IOSHAREDLOCKIMP_H */ diff --git a/iokit/IOKit/ppc/Makefile b/iokit/IOKit/ppc/Makefile deleted file mode 100644 index 21ff86cad..000000000 --- a/iokit/IOKit/ppc/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - -IOKIT_FRAMEDIR = $(FRAMEDIR)/IOKit.framework/Versions/A -export INCDIR = $(IOKIT_FRAMEDIR)/Headers -export LCLDIR = $(IOKIT_FRAMEDIR)/PrivateHeaders - -include $(MakeInc_cmd) -include $(MakeInc_def) - -MD_DIR = ppc -NOT_EXPORT_HEADERS = IOSharedLockImp.h - -INSTINC_SUBDIRS = -INSTINC_SUBDIRS_PPC = - -EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} -EXPINC_SUBDIRS_PPC = ${INSTINC_SUBDIRS_PPC} - -ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h)) - -INSTALL_MD_LIST = IOSharedLockImp.h -INSTALL_MD_LCL_LIST = "" -INSTALL_MD_DIR = $(MD_DIR) - -EXPORT_MD_LIST = $(filter-out $(NOT_EXPORT_HEADERS), $(ALL_HEADERS)) -EXPORT_MD_DIR = IOKit/$(MD_DIR) - 
-include $(MakeInc_rule) -include $(MakeInc_dir) diff --git a/iokit/IOKit/pwr_mgt/IOPM.h b/iokit/IOKit/pwr_mgt/IOPM.h index 804b9bbfd..f0002d5d6 100644 --- a/iokit/IOKit/pwr_mgt/IOPM.h +++ b/iokit/IOKit/pwr_mgt/IOPM.h @@ -32,10 +32,6 @@ #include #include -#ifdef __ppc__ -#include -#endif - /*! @header IOPM.h @abstract Defines power management constants and keys used by both in-kernel and user space power management. @discussion IOPM.h defines a range of power management constants used in several in-kernel and user space APIs. Most significantly, the IOPMPowerFlags used to specify the fields of an IOPMPowerState struct are defined here. @@ -80,7 +76,7 @@ enum { Useful only as a Capability. - @constant kIOPMSleepCapability + @constant kIOPMSleepCapability Used only by certain IOKit Families (USB). Not defined or used by generic Power Management. Read your family documentation to see if you should define a powerstate using these capabilities. @constant kIOPMRestartCapability @@ -91,6 +87,9 @@ enum { @constant kIOPMRestart Used only by certain IOKit Families (USB). Not defined or used by generic Power Management. Read your family documentation to see if you should define a powerstate using these capabilities. + + @constant kIOPMInitialDeviceState + Indicates the initial power state for the device. If initialPowerStateForDomainState() returns a power state with this flag set in the capability field, then the initial power change is performed without calling the driver's setPowerState(). 
*/ typedef unsigned long IOPMPowerFlags; enum { @@ -101,7 +100,8 @@ enum { kIOPMSleepCapability = 0x00000004, kIOPMRestartCapability = 0x00000080, kIOPMSleep = 0x00000001, - kIOPMRestart = 0x00000080 + kIOPMRestart = 0x00000080, + kIOPMInitialDeviceState = 0x00000100 }; /* @@ -121,7 +121,6 @@ enum { kIOPMNotPowerManaged = 0x0800 }; - /* * Deprecated IOPMPowerFlags * Their behavior is undefined when used in IOPMPowerState @@ -221,7 +220,7 @@ enum { * * See IOPMrootDomain notification kIOPMMessageSleepWakeUUIDChange */ -#define kIOPMSleepWakeUUIDKey "SleepWakeUUID" + #define kIOPMSleepWakeUUIDKey "SleepWakeUUID" /* kIOPMDeepSleepEnabledKey * Indicates the Deep Sleep enable state. @@ -239,11 +238,14 @@ enum { */ #define kIOPMDeepSleepDelayKey "Standby Delay" -/* kIOPMLowBatteryWakeThresholdKey - * Key refers to a CFNumberRef that represents the percentage of battery - * remaining charge that will trigger a system wake followed by Deep Sleep. +/* kIOPMDestroyFVKeyOnStandbyKey + * Specifies if FileVault key can be stored when going to standby mode + * It has a boolean value, + * true == Destroy FV key when going to standby mode + * false == Retain FV key when going to standby mode + * not present == Retain FV key when going to standby mode */ -#define kIOPMLowBatteryWakeThresholdKey "LowBatteryWakeThreshold" +#define kIOPMDestroyFVKeyOnStandbyKey "DestroyFVKeyOnStandby" /******************************************************************************* * @@ -276,8 +278,16 @@ enum { */ kIOPMDriverAssertionExternalMediaMountedBit = 0x10, + /*! kIOPMDriverAssertionReservedBit5 + * Reserved for Thunderbolt. + */ kIOPMDriverAssertionReservedBit5 = 0x20, - kIOPMDriverAssertionReservedBit6 = 0x40, + + /*! kIOPMDriverAssertionPreventDisplaySleepBit + * When set, the display should remain powered on while the system's awake. 
+ */ + kIOPMDriverAssertionPreventDisplaySleepBit = 0x40, + kIOPMDriverAssertionReservedBit7 = 0x80 }; @@ -406,6 +416,7 @@ enum { * These commands are issued from system drivers only: * ApplePMU, AppleSMU, IOGraphics, AppleACPIFamily * + * TODO: deprecate kIOPMAllowSleep and kIOPMPreventSleep ******************************************************************************/ enum { kIOPMSleepNow = (1<<0), // put machine to sleep now @@ -500,6 +511,8 @@ enum { #define kIOPMPSCapacityEstimatedKey "CapacityEstimated" #define kIOPMPSBatteryChargeStatusKey "ChargeStatus" #define kIOPMPSBatteryTemperatureKey "Temperature" +#define kIOPMPSAdapterDetailsKey "AdapterDetails" +#define kIOPMPSChargerConfigurationKey "ChargerConfiguration" // kIOPMPSBatteryChargeStatusKey may have one of the following values, or may have // no value. If kIOPMBatteryChargeStatusKey has a NULL value (or no value) associated with it @@ -507,6 +520,7 @@ enum { // then the charge may have been interrupted. #define kIOPMBatteryChargeStatusTooHot "HighTemperature" #define kIOPMBatteryChargeStatusTooCold "LowTemperature" +#define kIOPMBatteryChargeStatusTooHotOrCold "HighOrLowTemperature" #define kIOPMBatteryChargeStatusGradient "BatteryTemperatureGradient" // Definitions for battery location, in case of multiple batteries. 
@@ -526,6 +540,16 @@ enum { kIOPMGoodValue = 3 }; +// Keys for kIOPMPSAdapterDetailsKey dictionary +#define kIOPMPSAdapterDetailsIDKey "AdapterID" +#define kIOPMPSAdapterDetailsWattsKey "Watts" +#define kIOPMPSAdapterDetailsRevisionKey "AdapterRevision" +#define kIOPMPSAdapterDetailsSerialNumberKey "SerialNumber" +#define kIOPMPSAdapterDetailsFamilyKey "FamilyCode" +#define kIOPMPSAdapterDetailsAmperageKey "Amperage" +#define kIOPMPSAdapterDetailsDescriptionKey "Description" +#define kIOPMPSAdapterDetailsPMUConfigurationKey "PMUConfiguration" + // Battery's time remaining estimate is invalid this long (seconds) after a wake #define kIOPMPSInvalidWakeSecondsKey "BatteryInvalidWakeSeconds" @@ -688,7 +712,6 @@ enum { kIOBatteryChargerConnect = (1 << 0) }; - // Private power management message indicating battery data has changed // Indicates new data resides in the IORegistry #define kIOPMMessageBatteryStatusHasChanged iokit_family_msg(sub_iokit_pmu, 0x100) @@ -714,7 +737,6 @@ enum { kIOPMClamshellStateOnWake = (1<<10) // used only by Platform Expert }; - // ********************************************** // Internal power management data structures // ********************************************** @@ -731,7 +753,7 @@ enum { kIOPMSuperclassPolicy1 }; -struct stateChangeNote{ +struct stateChangeNote { IOPMPowerFlags stateFlags; unsigned long stateNum; void * powerRef; @@ -748,5 +770,54 @@ typedef struct IOPowerStateChangeNotification IOPowerStateChangeNotification; typedef IOPowerStateChangeNotification sleepWakeNote; #endif /* KERNEL && __cplusplus */ -#endif /* ! _IOKIT_IOPM_H */ +/*! @struct IOPMSystemCapabilityChangeParameters + @abstract A structure describing a system capability change. + @discussion A system capability change is a system level transition from a set + of system capabilities to a new set of system capabilities. 
Power management + sends a kIOMessageSystemCapabilityChange message and provides + this structure as the message data (by reference) to + gIOPriorityPowerStateInterest clients when system capability + changes. + @field notifyRef An identifier for this message notification. Clients with pending + I/O can signal completion by calling allowPowerChange() with this + value as the argument. Clients that are able to process the notification + synchronously should ignore this field. + @field maxWaitForReply A return value to the caller indicating the maximum time in + microseconds to wait for the allowPowerChange() call. The default + value is zero, which indicates the client processing has finished, and power + management should not wait for an allowPowerChange() call. + @field changeFlags Flags will be set to indicate whether the notification precedes + the capability change (kIOPMSystemCapabilityWillChange), or after + the capability change has occurred (kIOPMSystemCapabilityDidChange). + @field __reserved1 Set to zero. + @field fromCapabilities The system capabilities at the start of the transition. + @field toCapabilities The system capabilities at the end of the transition. + @field __reserved2 Set to zero. + */ +struct IOPMSystemCapabilityChangeParameters { + uint32_t notifyRef; + uint32_t maxWaitForReply; + uint32_t changeFlags; + uint32_t __reserved1; + uint32_t fromCapabilities; + uint32_t toCapabilities; + uint32_t __reserved2[4]; +}; + +/*! @enum IOPMSystemCapabilityChangeFlags + @constant kIOPMSystemCapabilityWillChange Indicates the system capability will change. + @constant kIOPMSystemCapabilityDidChange Indicates the system capability has changed. +*/ +enum { + kIOPMSystemCapabilityWillChange = 0x01, + kIOPMSystemCapabilityDidChange = 0x02 +}; +enum { + kIOPMSystemCapabilityCPU = 0x01, + kIOPMSystemCapabilityGraphics = 0x02, + kIOPMSystemCapabilityAudio = 0x04, + kIOPMSystemCapabilityNetwork = 0x08 +}; + +#endif /* ! 
_IOKIT_IOPM_H */ diff --git a/iokit/IOKit/pwr_mgt/IOPMDeprecated.h b/iokit/IOKit/pwr_mgt/IOPMDeprecated.h deleted file mode 100644 index 3bee01a3b..000000000 --- a/iokit/IOKit/pwr_mgt/IOPMDeprecated.h +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright (c) 1998-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef _IOPMDeprecated_h_ -#define _IOPMDeprecated_h_ - -#ifdef __ppc__ - -// Power events -enum { - kClamshellClosedEventMask = (1<<0), // User closed lid - kDockingBarEventMask = (1<<1), // OBSOLETE - kACPlugEventMask = (1<<2), // User plugged or unplugged adapter - kFrontPanelButtonEventMask = (1<<3), // User hit the front panel button - kBatteryStatusEventMask = (1<<4) // Battery status has changed -}; - -// PUBLIC power management features -// NOTE: this is a direct port from classic, some of these bits -// are obsolete but are included for completeness -enum { - kPMHasWakeupTimerMask = (1<<0), // 1=wake timer is supported - kPMHasSharedModemPortMask = (1<<1), // Not used - kPMHasProcessorCyclingMask = (1<<2), // 1=processor cycling supported - kPMMustProcessorCycleMask = (1<<3), // Not used - kPMHasReducedSpeedMask = (1<<4), // 1=supports reduced processor speed - kPMDynamicSpeedChangeMask = (1<<5), // 1=supports changing processor speed on the fly - kPMHasSCSIDiskModeMask = (1<<6), // 1=supports using machine as SCSI drive - kPMCanGetBatteryTimeMask = (1<<7), // 1=battery time can be calculated - kPMCanWakeupOnRingMask = (1<<8), // 1=machine can wake on modem ring - kPMHasDimmingSupportMask = (1<<9), // 1=has monitor dimming support - kPMHasStartupTimerMask = (1<<10), // 1=can program startup timer - kPMHasChargeNotificationMask = (1<<11), // 1=client can determine charger status/get notifications - kPMHasDimSuspendSupportMask = (1<<12), // 1=can dim diplay to DPMS ('off') state - kPMHasWakeOnNetActivityMask = (1<<13), // 1=supports waking upon receipt of net packet - kPMHasWakeOnLidMask = (1<<14), // 1=can wake upon lid/case opening - kPMCanPowerOffPCIBusMask = (1<<15), // 1=can remove power from PCI bus on sleep - kPMHasDeepSleepMask = (1<<16), // 1=supports deep (hibernation) sleep - kPMHasSleepMask = (1<<17), // 1=machine support low power sleep (ala powerbooks) - kPMSupportsServerModeAPIMask = 
(1<<18), // 1=supports reboot on AC resume for unexpected power loss - kPMHasUPSIntegrationMask = (1<<19) // 1=supports incorporating UPS devices into power source calcs -}; - -// PRIVATE power management features -// NOTE: this is a direct port from classic, some of these bits -// are obsolete but are included for completeness. -enum { - kPMHasExtdBattInfoMask = (1<<0), // Not used - kPMHasBatteryIDMask = (1<<1), // Not used - kPMCanSwitchPowerMask = (1<<2), // Not used - kPMHasCelsiusCyclingMask = (1<<3), // Not used - kPMHasBatteryPredictionMask = (1<<4), // Not used - kPMHasPowerLevelsMask = (1<<5), // Not used - kPMHasSleepCPUSpeedMask = (1<<6), // Not used - kPMHasBtnIntHandlersMask = (1<<7), // 1=supports individual button interrupt handlers - kPMHasSCSITermPowerMask = (1<<8), // 1=supports SCSI termination power switch - kPMHasADBButtonHandlersMask = (1<<9), // 1=supports button handlers via ADB - kPMHasICTControlMask = (1<<10), // 1=supports ICT control - kPMHasLegacyDesktopSleepMask = (1<<11), // 1=supports 'doze' style sleep - kPMHasDeepIdleMask = (1<<12), // 1=supports Idle2 in hardware - kPMOpenLidPreventsSleepMask = (1<<13), // 1=open case prevent machine from sleeping - kPMClosedLidCausesSleepMask = (1<<14), // 1=case closed (clamshell closed) causes sleep - kPMHasFanControlMask = (1<<15), // 1=machine has software-programmable fan/thermostat controls - kPMHasThermalControlMask = (1<<16), // 1=machine supports thermal monitoring - kPMHasVStepSpeedChangeMask = (1<<17), // 1=machine supports processor voltage/clock change - kPMEnvironEventsPolledMask = (1<<18) // 1=machine doesn't generate pmu env ints, we must poll instead -}; - -// DEFAULT public and private features for machines whose device tree -// does NOT contain this information (pre-Core99). 
- -// For Cuda-based Desktops - -#define kStdDesktopPMFeatures kPMHasWakeupTimerMask |\ - kPMHasProcessorCyclingMask |\ - kPMHasDimmingSupportMask |\ - kPMHasStartupTimerMask |\ - kPMSupportsServerModeAPIMask |\ - kPMHasUPSIntegrationMask - -#define kStdDesktopPrivPMFeatures kPMHasExtdBattInfoMask |\ - kPMHasICTControlMask |\ - kPMHasLegacyDesktopSleepMask - -#define kStdDesktopNumBatteries 0 - -// For Wallstreet (PowerBook G3 Series 1998) - -#define kWallstreetPMFeatures kPMHasWakeupTimerMask |\ - kPMHasProcessorCyclingMask |\ - kPMHasReducedSpeedMask |\ - kPMDynamicSpeedChangeMask |\ - kPMHasSCSIDiskModeMask |\ - kPMCanGetBatteryTimeMask |\ - kPMHasDimmingSupportMask |\ - kPMHasChargeNotificationMask |\ - kPMHasDimSuspendSupportMask |\ - kPMHasSleepMask - -#define kWallstreetPrivPMFeatures kPMHasExtdBattInfoMask |\ - kPMHasBatteryIDMask |\ - kPMCanSwitchPowerMask |\ - kPMHasADBButtonHandlersMask |\ - kPMHasSCSITermPowerMask |\ - kPMHasICTControlMask |\ - kPMClosedLidCausesSleepMask |\ - kPMEnvironEventsPolledMask - -#define kStdPowerBookPMFeatures kWallstreetPMFeatures -#define kStdPowerBookPrivPMFeatures kWallstreetPrivPMFeatures - -#define kStdPowerBookNumBatteries 2 - -// For 101 (PowerBook G3 Series 1999) - -#define k101PMFeatures kPMHasWakeupTimerMask |\ - kPMHasProcessorCyclingMask |\ - kPMHasReducedSpeedMask |\ - kPMDynamicSpeedChangeMask |\ - kPMHasSCSIDiskModeMask |\ - kPMCanGetBatteryTimeMask |\ - kPMHasDimmingSupportMask |\ - kPMHasChargeNotificationMask |\ - kPMHasDimSuspendSupportMask |\ - kPMHasSleepMask |\ - kPMHasUPSIntegrationMask - -#define k101PrivPMFeatures kPMHasExtdBattInfoMask |\ - kPMHasBatteryIDMask |\ - kPMCanSwitchPowerMask |\ - kPMHasADBButtonHandlersMask |\ - kPMHasSCSITermPowerMask |\ - kPMHasICTControlMask |\ - kPMClosedLidCausesSleepMask |\ - kPMEnvironEventsPolledMask - - -// These flags are deprecated. 
Use the version with the kIOPM prefix in IOPM.h -enum { - kACInstalled = (1<<0), - kBatteryCharging = (1<<1), - kBatteryInstalled = (1<<2), - kUPSInstalled = (1<<3), - kBatteryAtWarn = (1<<4), - kBatteryDepleted = (1<<5), - kACnoChargeCapability = (1<<6), // AC adapter cannot charge battery - kRawLowBattery = (1<<7), // used only by Platform Expert - kForceLowSpeed = (1<<8) // set by Platfm Expert, chk'd by Pwr Plugin}; -}; - -#endif /* __ppc32 */ -#endif /* _IOPMDeprecated_h_ */ diff --git a/iokit/IOKit/pwr_mgt/IOPMPrivate.h b/iokit/IOKit/pwr_mgt/IOPMPrivate.h index 88ff6c788..3e61d81e0 100644 --- a/iokit/IOKit/pwr_mgt/IOPMPrivate.h +++ b/iokit/IOKit/pwr_mgt/IOPMPrivate.h @@ -30,8 +30,197 @@ #include -/*****************************************************************************/ +#pragma mark PM Timeline Logging +/************************************************** +* +* Timeline API Keys - Reports timing details for +* applications, drivers, and system during PM activity +* +* For kernel-internal use only +**************************************************/ + +// Keys for interfacing with IOPMrootDomain Timeline +/* @constant kIOPMTimelineDictionaryKey + * @abstract RootDomain key for dictionary describing Timeline's info + */ +#define kIOPMTimelineDictionaryKey "PMTimelineLogging" + +/* @constant kIOPMTimelineEnabledKey + * @abstract Boolean value indicating whether the system is recording PM events. + * @discussion Key may be found in the dictionary at IOPMrootDomain's property + * kIOPMTimelineDictionaryKey. uint32_t value; may be 0. + */ +#define kIOPMTimelineEnabledKey "TimelineEnabled" + +/* @constant kIOPMTimelineSystemNumberTrackedKey + * @abstract The maximum number of system power events the system may record. + * @discussion Key may be found in the dictionary at IOPMrootDomain's property + * kIOPMTimelineDictionaryKey. uint32_t value; may be 0. 
+ */ +#define kIOPMTimelineSystemNumberTrackedKey "TimelineSystemEventsTracked" + +/* @constant kIOPMTimelineSystemBufferSizeKey + * @abstract Size in bytes of buffer recording system PM events + * @discussion Key may be found in the dictionary at IOPMrootDomain's property + * kIOPMTimelineDictionaryKey. uint32_t value; may be 0. + */ +#define kIOPMTimelineSystemBufferSizeKey "TimelineSystemBufferSize" + + + +/* @constant kIOPMEventTypeIntermediateFlag + * @abstract This bit indicates the event is an intermediate event + * which must occur within a major system power event. + */ +#define kIOPMEventTypeIntermediateFlag 0x10000000 + +/* @enum SystemEventTypes + * @abstract Potential system events logged in the system event record. + */ +enum { + kIOPMEventTypeUndefined = 0, + + /* Event types mark driver events + */ + kIOPMEventTypeSetPowerStateImmediate = 1001, + kIOPMEventTypeSetPowerStateDelayed = 1002, + kIOPMEventTypePSWillChangeTo = 1003, + kIOPMEventTypePSDidChangeTo = 1004, + kIOPMEventTypeAppResponse = 1005, + + + /* Start and stop event types bracket major + * system power management events. + */ + kIOPMEventTypeSleep = 2001, + kIOPMEventTypeSleepDone = 2002, + kIOPMEventTypeWake = 3001, + kIOPMEventTypeWakeDone = 3002, + kIOPMEventTypeDoze = 4001, + kIOPMEventTypeDozeDone = 4002, + kIOPMEventTypeLiteWakeUp = 5001, + kIOPMEventTypeLiteWakeUpDone = 5002, + kIOPMEventTypeLiteWakeDown = 5003, + kIOPMEventTypeLiteWakeDownDone = 5004, + kIOPMEventTypeUUIDSet = 6001, + kIOPMEventTypeUUIDClear = 6002, + + /* Intermediate events that may only occur within the bounds + * of a major system event (between the event's initiation and its "done event".) + * e.g. chronologically kIOPMEventTypeSleep may be followed by one or more + * intermediate events, which then must be followed by kIOPMEventTypeSleepDone. + * + * The intermediate events below will always occur in a Sleep or Wake event, and may + * or may not occur for any of the other events. 
+ */ + kIOPMEventTypeAppNotificationsFinished = 501 | kIOPMEventTypeIntermediateFlag, + kIOPMEventTypeDriverNotificationsFinished = 502 | kIOPMEventTypeIntermediateFlag, + kIOPMEventTypeCalTimeChange = 503 | kIOPMEventTypeIntermediateFlag +}; + + +/* @enum SystemSleepReasons + * @abstract The potential causes for system sleep as logged in the system event record. + */ +enum { + kIOPMSleepReasonClamshell = 101, + kIOPMSleepReasonPowerButton = 102, + kIOPMSleepReasonSoftware = 103, + kIOPMSleepReasonOSSwitchHibernate = 104, + kIOPMSleepReasonIdle = 105, + kIOPMSleepReasonLowPower = 106, + kIOPMSleepReasonThermalEmergency = 107, + kIOPMSleepReasonMaintenance = 108 +}; + +/* + * Possible C-string sleep reasons found under kRootDomainSleepReasonsKey + */ +#define kIOPMClamshellSleepKey "Clamshell Sleep" +#define kIOPMPowerButtonSleepKey "Power Button Sleep" +#define kIOPMSoftwareSleepKey "Software Sleep" +#define kIOPMOSSwitchHibernationKey "OS Switch Sleep" +#define kIOPMIdleSleepKey "Idle Sleep" +#define kIOPMLowPowerSleepKey "Low Power Sleep" +#define kIOPMThermalEmergencySleepKey "Thermal Emergency Sleep" + + +enum { + kIOPMMaxSystemEventsTracked = 25000, + kIOPMDefaultSystemEventsTracked = 1000, + kMaxPMStringLength = 40, +}; + +/* @struct IOPMSystemEventRecord + * @abstract Records a single power event to a particular PM entity. + * This includes changes to a driver's power state, application responses + * to PM notifications, or system power management milestones. + */ +typedef struct { + union { + // For DRIVER events + char ownerName[kMaxPMStringLength]; + // For SYSTEM events, uuid contains the string describing the active UUID + char uuid[kMaxPMStringLength]; + }; + + // For DRIVER events - records the name of the driver who generated the notifications. 
+ char interestName[kMaxPMStringLength]; + + // DRIVER & SYSTEM - Times are stored as uint64_t + // The high 32 bits are the seconds returned from clock_get_calendar_microtime, + // and the low 32 bits are the accompanying microseconds. + uint64_t timestamp; + + union { + // For DRIVER events - ownerDisambiguateID is a unique descriptor of the driver, to disambiguate + // several similarly named drivers. + uint64_t ownerDisambiguateID; + // For SYSTEM events - eventReason is a value in SystemSleepReasons + uint64_t eventReason; + }; + + // DRIVER & SYSTEM - eventType is one of 'SystemEventTypes' + // The value of eventType determines, among other things, whether this is a SYSTEM or + // DRIVER event type. + uint32_t eventType; + + // DRIVER & SYSTEM - eventResult is an IOReturn value + uint32_t eventResult; + + // DRIVER - If defined, elapsedTimeUS records the entire time a transaction took to complete + uint32_t elapsedTimeUS; + + // DRIVER - in power state changes, oldState & newState are PM power state indices. + uint8_t oldState; + uint8_t newState; +} IOPMSystemEventRecord; + +/* @struct IOPMTraceBufferHeader + * Occupies the first bytes in the buffer allocated by IOPMrootDomain + * Describes the size and current index of the trace buffer + */ +typedef struct { + uint32_t sizeBytes; + uint32_t sizeEntries; + uint32_t index; +} IOPMTraceBufferHeader; + +/* Argument to IOPMrootDomain::clientMemoryForType to acquire + * memory mapping. + */ +enum { + kPMRootDomainMapTraceBuffer = 1 +}; +/************************************************** +* +* Accountability API Ends here +* +**************************************************/ + + +#pragma mark Stray Bitfields // Private power commands issued to root domain // bits 0-7 in IOPM.h @@ -143,10 +332,10 @@ typedef struct { /* PM RootDomain tracePoints * * In the sleep/wake process, we expect the sleep trace points to proceed - * in increasing order. 
Once sleep begins with code kIOPMTracePointSleepStarted = 0x11, + * in increasing order. Once sleep begins with code kIOPMTracePointSleepStarted, * we expect sleep to continue in a monotonically increasing order of tracepoints - * to kIOPMTracePointSystemLoginwindowPhase = 0x30. After trace point SystemLoginWindowPhase, - * the system will return to kIOPMTracePointSystemUp = 0x00. + * to kIOPMTracePointSystemLoginwindowPhase. After trace point SystemLoginWindowPhase, + * the system will return to kIOPMTracePointSystemUp. * * If the trace point decreases (instead of increasing) before reaching kIOPMTracePointSystemUp, * that indicates that the sleep process was cancelled. The cancel reason shall be indicated @@ -155,94 +344,215 @@ typedef struct { enum { /* When kTracePointSystemUp is the latest tracePoint, - the system is awake. It is not asleep, sleeping, or waking. - - * Phase begins: At boot, at completion of wake from sleep, - immediately following kIOPMTracePointSystemLoginwindowPhase. - * Phase ends: When a sleep attempt is initiated. + * the system is awake. It is not asleep, sleeping, or waking. + * + * Phase begins: At boot, at completion of wake from sleep, + * immediately following kIOPMTracePointSystemLoginwindowPhase. + * Phase ends: When a sleep attempt is initiated. */ kIOPMTracePointSystemUp = 0, -/* When kIOPMTracePointSleepStarted we have just initiated sleep. +/* When kIOPMTracePointSleepStarted is the latest tracePoint, + * sleep has been initiated. + * + * Phase begins: At initiation of system sleep (idle or forced). + * Phase ends: PM starts to notify applications of system sleep. + */ + kIOPMTracePointSleepStarted = 0x10, - Note: The state prior to kIOPMTracePointSleepStarted may be only one of: - * kIOPMTracePointSystemUp - * kIOPMTracePointSystemLoginwindowPhase or +/* When kIOPMTracePointSleepApplications is the latest tracePoint, + * a system sleep has been initiated and PM waits for responses + * from notified applications. 
+ * + * Phase begins: Begin to asynchronously fire kIOMessageSystemWillSleep + * notifications, and also kIOMessageCanSystemSleep for the idle sleep case. + * Phase ends: When PM has received all application responses. + */ + kIOPMTracePointSleepApplications = 0x11, - * Phase begins: At initiation of system sleep (idle or forced). - * Phase ends: As we start to notify applications of system sleep. +/* When kIOPMTracePointSleepPriorityClients is the latest tracePoint, + * PM is notifying priority clients and in-kernel system capability + * clients, and waiting for any asynchronous completions. + * + * Phase begins: Synchronous delivery of kIOMessageSystemWillSleep notifications. + * Phase ends: All notified clients have acknowledged. + */ + kIOPMTracePointSleepPriorityClients = 0x12, + +/* When kIOPMTracePointSleepWillChangeInterests is the latest tracePoint, + * PM is calling powerStateWillChangeTo() on interested drivers of root domain. + * + * Phase begins: Dispatch a callout thread to call interested drivers. + * Phase ends: Callout thread work done, and acknowledgePowerChange() called + * by drivers that indicated asynchronous completion. */ - kIOPMTracePointSleepStarted = 0x11, + kIOPMTracePointSleepWillChangeInterests = 0x13, -/* When kTracePointSystemSleepAppsPhase is the latest tracePoint, - a system sleep has been irrevocably inititated and PM waits - for responses from notified applications. +/* When kIOPMTracePointSleepPowerPlaneDrivers is the latest tracePoint, + * PM is directing power plane drivers to power off in leaf-to-root order. + * + * Phase begins: Root domain informs its power children that it will drop to + * sleep state. This has a cascade effect and triggers all drivers in + * the power plane to transition to a lower power state if necessary. + * Phase ends: All power transitions in response to the root domain power + * change have completed. 
+ */ + kIOPMTracePointSleepPowerPlaneDrivers = 0x14, + +/* When kIOPMTracePointSleepDidChangeInterests is the latest tracePoint, + * PM is calling powerStateDidChangeTo() on interested drivers of root domain. + * + * Phase begins: Dispatch a callout thread to call interested drivers. + * Phase ends: Callout thread work done, and acknowledgePowerChange() called + * by drivers that indicated asynchronous completion. + */ + kIOPMTracePointSleepDidChangeInterests = 0x15, - * Phase begins: Begin to asynchronously fire kIOMessageSystemWillSleep notifications, - * and in the case of an idle sleep kIOMessageCanSystemSleep as well. - * Phase ends: When we have received all user & interested kernel acknowledgements. +/* When kIOPMTracePointSleepCapabilityClients is the latest tracePoint, + * PM is notifying system capability clients about system sleep. + * + * Phase begins: Send kIOMessageSystemCapabilityChange notifications to inform + * capability clients that system has lost all capabilities. + * Phase ends: Finished sending notifications. */ - kIOPMTracePointSystemSleepAppsPhase = 0x12, + kIOPMTracePointSleepCapabilityClients = 0x16, +/* When kIOPMTracePointSleepPlatformActions is the latest tracePoint, + * PM is calling drivers that have registered a platform sleep action. + */ + kIOPMTracePointSleepPlatformActions = 0x17, -/* When kIOPMTracePointSystemHibernatePhase is the latest tracePoint, - PM is writing the hiernate image to disk. +/* When kIOPMTracePointSleepCPUs is the latest tracePoint, + * PM is shutting down all non-boot processors. + * + * Phase begins: Shutdown all non-boot processors. + * Phase ends: Reduced to only the boot processor running. */ - kIOPMTracePointSystemHibernatePhase = 0x13, + kIOPMTracePointSleepCPUs = 0x18, -/* When kTracePointSystemSleepDriversPhase is the latest tracePoint, - PM is iterating the driver tree powering off devices individually. 
+/* When kIOPMTracePointSleepPlatformDriver is the latest tracePoint, + * PM is executing platform dependent code to prepare for system sleep. + */ + kIOPMTracePointSleepPlatformDriver = 0x19, - * Phase begins: When IOPMrootDomain has received all of its power acknowledgements and begins - * executing IOService::powerDomainWillChangeTo() - * Phase ends: When IOPMrootDomain::powerChangeDone begins executing CPU shutoff code. +/* When kIOPMTracePointHibernate is the latest tracePoint, + * PM is writing the hibernate image to disk. */ - kIOPMTracePointSystemSleepDriversPhase = 0x14, + kIOPMTracePointHibernate = 0x1a, -/* When kTracePointSystemSleepPlatformPhase is the latest tracePoint, - all apps and drivers have notified of sleep. Plotfarm is powering - off CPU; or system is asleep; or low level wakeup is underway. +/* When kIOPMTracePointSystemSleep is the latest tracePoint, + * PM has recorded the final trace point before the hardware platform + * enters sleep state, or low level wakeup is underway - such as restoring + * the hibernate image from disk. + * + * Note: If a system is asleep and then loses power, and it does not have a + * hibernate image to restore from (e.g. hibernatemode = 0), then OS X will + * interpret this power loss as a failure in kIOPMTracePointSystemSleep. + * + * Phase begins: Before the OS directs the hardware to enter sleep state. + * Phase ends: Control returns to the OS on wake, but before recording the first + * wake trace point. + */ + kIOPMTracePointSystemSleep = 0x1f, - Note: If a system is asleep and then loses power, and it does not have a hibernate - image to restore from (e.g. hibernatemode = 0), then OS X may interpret this power - loss as a system crash in the kTracePointSystemSleepPlatformPhase, since the - power loss resembles a hang or crash, and the power being removed by the user. +/* When kIOPMTracePointWakePlatformDriver is the latest tracePoint, + * PM is executing platform dependent code to prepare for system wake. 
+ */ + kIOPMTracePointWakePlatformDriver = 0x21, + +/* When kIOPMTracePointWakePlatformActions is the latest tracePoint, + * PM is calling drivers that have registered a platform wake action. + */ + kIOPMTracePointWakePlatformActions = 0x22, - * Phase begins: IOPMrootDomain has already shut off drivers, and is now powering off CPU. - * Phase ends: Immediately after CPU's are powered back on during wakeup. +/* When kIOPMTracePointWakeCPUs is the latest tracePoint, + * PM is bringing all non-boot processors online. + */ + kIOPMTracePointWakeCPUs = 0x23, + +/* When kIOPMTracePointWakeWillPowerOnClients is the latest tracePoint, + * PM is sending kIOMessageSystemWillPowerOn to both kernel clients and + * applications. PM also notifies system capability clients about the + * proposed capability change. + * + * Phase begins: Send kIOMessageSystemWillPowerOn and + * kIOMessageSystemCapabilityChange notifications. + * Phase ends: Finished sending notifications. */ - kIOPMTracePointSystemSleepPlatformPhase = 0x15, + kIOPMTracePointWakeWillPowerOnClients = 0x24, -/* When kTracePointSystemWakeDriversPhase is the latest tracePoint, - System CPU is powered, PM is notifying drivers of system wake. +/* When kIOPMTracePointWakeWillChangeInterests is the latest tracePoint, + * PM is calling powerStateWillChangeTo() on interested drivers of root domain. + * + * Phase begins: Dispatch a callout thread to call interested drivers. + * Phase ends: Callout thread work done, and acknowledgePowerChange() called + * by drivers that indicated asynchronous completion. + */ + kIOPMTracePointWakeWillChangeInterests = 0x25, + +/* When kIOPMTracePointWakeDidChangeInterests is the latest tracePoint, + * PM is calling powerStateDidChangeTo() on interested drivers of root domain. + * + * Phase begins: Dispatch a callout thread to call interested drivers. + * Phase ends: Callout thread work done, and acknowledgePowerChange() called + * by drivers that indicated asynchronous completion. 
+ */ + kIOPMTracePointWakeDidChangeInterests = 0x26, - * Phase begins: CPU's have successfully powered up and OS is executing. - * Phase ends: All drivers have handled power events & acknowledged completion. - IOPMrootDomain is about to deliver kIOMessageSystemHasPoweredOn. +/* When kIOPMTracePointWakePowerPlaneDrivers is the latest tracePoint, + * PM is directing power plane drivers to power up in root-to-leaf order. + * + * Phase begins: Root domain informs its power children that it transitioned + * to ON state. This has a cascade effect and triggers all drivers in + * the power plane to re-evaluate and potentially change power state. + * Phase ends: All power transitions in response to the root domain power + * change have completed. */ - kIOPMTracePointSystemWakeDriversPhase = 0x21, + kIOPMTracePointWakePowerPlaneDrivers = 0x27, -/* When kTracePointSystemWakeAppsPhase is the latest tracePoint, - System CPU is powered, PM has powered on each driver. +/* When kIOPMTracePointWakeCapabilityClients is the latest tracePoint, + * PM is notifying system capability clients about system wake, and waiting + * for any asynchronous completions. + * + * Phase begins: Inform capability clients that system has gained capabilities. + * Phase ends: All notified clients have acknowledged. + */ + kIOPMTracePointWakeCapabilityClients = 0x28, - * Phase begins: IOPMrootDomain::tellChangeUp before sending asynchronous - kIOMessageSystemHasPoweredOn notifications - * Phase ends: IOPMrootDomain::tellChangeUp after sending asynchronous notifications +/* When kIOPMTracePointWakeApplications is the latest tracePoint, + * System CPU is powered, PM has powered on each driver. + * + * Phase begins: Send asynchronous kIOMessageSystemHasPoweredOn notifications. + * Phase ends: Finished sending asynchronous notifications. 
*/ - kIOPMTracePointSystemWakeAppsPhase = 0x22, + kIOPMTracePointWakeApplications = 0x29, /* kIOPMTracePointSystemLoginwindowPhase - This phase represents a several minute window after the system has powered on. - Higher levels of system diagnostics are in a heightened state of alert in this phase, - in case any user errors occurred that we could not detect in software. - - This several minute window + * This phase represents a several minute window after the system has powered on. + * Higher levels of system diagnostics are in a heightened state of alert in this phase, + * in case any user errors occurred that we could not detect in software. + * + * Phase begins: After IOPMrootDomain sends kIOMessageSystemHasPoweredOn message. + * Phase ends: When loginwindow calls IOPMSleepWakeSetUUID(NULL) the system shall + * be considered awake and usable. The next phase shall be kIOPMTracePointSystemUp. + */ + kIOPMTracePointSystemLoginwindowPhase = 0x30, - * Phase begins: After IOPMrootDomain sends kIOMessageSystemHasPoweredOn message. - * Phase ends: When loginwindow calls IOPMSleepWakeSetUUID(NULL) the system shall - be considered awake and usable. The next phase shall be kIOPMTracePointSystemUp. +/* When kIOPMTracePointDarkWakeEntry is the latest tracePoint, + * PM has started a transition from full wake to dark wake. + * + * Phase begins: Start transition to dark wake. + * Phase ends: System in dark wake. Before recording kIOPMTracePointSystemUp. + */ + kIOPMTracePointDarkWakeEntry = 0x31, + +/* When kIOPMTracePointDarkWakeExit is the latest tracePoint, + * PM has started a transition from dark wake to full wake. + * + * Phase begins: Start transition to full wake. + * Phase ends: System in full wake. Before recording kIOPMTracePointSystemUp. 
*/ - kIOPMTracePointSystemLoginwindowPhase = 0x30 + kIOPMTracePointDarkWakeExit = 0x32 }; /*****************************************************************************/ diff --git a/iokit/IOKit/pwr_mgt/IOPowerConnection.h b/iokit/IOKit/pwr_mgt/IOPowerConnection.h index 179035b2f..a7ece0ad5 100644 --- a/iokit/IOKit/pwr_mgt/IOPowerConnection.h +++ b/iokit/IOKit/pwr_mgt/IOPowerConnection.h @@ -46,16 +46,18 @@ class IOPowerConnection : public IOService protected: /*! @field parentKnowsState true: parent knows state of its domain used by child */ - bool stateKnown; + bool stateKnown; + /*! @field currentPowerFlags power flags which describe the current state of the power domain used by child */ IOPMPowerFlags currentPowerFlags; + /*! @field desiredDomainState state number which corresponds to the child's desire used by parent */ unsigned long desiredDomainState; /*! @field requestFlag set to true when desiredDomainState is set */ - bool requestFlag; + bool requestFlag; /*! @field preventIdleSleepFlag true if child has this bit set in its desired state used by parent */ @@ -67,16 +69,21 @@ class IOPowerConnection : public IOService /*! @field awaitingAck true if child has not yet acked our notification used by parent */ - bool awaitingAck; + bool awaitingAck; /*! @field readyFlag true if the child has been added as a power child used by parent */ - bool readyFlag; + bool readyFlag; +#ifdef XNU_KERNEL_PRIVATE public: - /*! @function setParentKnowsState - @abstract Sets the stateKnown variable. - @discussion Called by the parent when the object is created and called by the child when it discovers that the parent now knows its state. */ + bool delayChildNotification; +#endif + +public: + /*! @function setParentKnowsState + @abstract Sets the stateKnown variable. + @discussion Called by the parent when the object is created and called by the child when it discovers that the parent now knows its state. */ void setParentKnowsState (bool ); /*! 
@function setParentCurrentPowerFlags @@ -107,7 +114,6 @@ class IOPowerConnection : public IOService @discussion Called by the parent. */ void setChildHasRequestedPower ( void ); - /*! @function childHasRequestedPower @abstract Return the flag that says whether the child has called requestPowerDomainState. @discussion Called by the PCI Aux Power Supply Driver to see if a device driver diff --git a/iokit/IOKit/pwr_mgt/Makefile b/iokit/IOKit/pwr_mgt/Makefile index 14165762a..b82357fe9 100644 --- a/iokit/IOKit/pwr_mgt/Makefile +++ b/iokit/IOKit/pwr_mgt/Makefile @@ -18,20 +18,18 @@ NOT_EXPORT_HEADERS = \ IOPMPagingPlexus.h INSTINC_SUBDIRS = -INSTINC_SUBDIRS_PPC = INSTINC_SUBDIRS_I386 = INSTINC_SUBDIRS_X86_64 = INSTINC_SUBDIRS_ARM = EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} -EXPINC_SUBDIRS_PPC = ${INSTINC_SUBDIRS_PPC} EXPINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS_I386} EXPINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS_X86_64} EXPINC_SUBDIRS_ARM = ${INSTINC_SUBDIRS_ARM} ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h)) -INSTALL_MI_LIST = IOPMLibDefs.h IOPM.h IOPMDeprecated.h +INSTALL_MI_LIST = IOPMLibDefs.h IOPM.h INSTALL_MI_LCL_LIST = IOPMPrivate.h INSTALL_MI_DIR = $(MI_DIR) diff --git a/iokit/IOKit/pwr_mgt/RootDomain.h b/iokit/IOKit/pwr_mgt/RootDomain.h index 2de4d289c..760e7d674 100644 --- a/iokit/IOKit/pwr_mgt/RootDomain.h +++ b/iokit/IOKit/pwr_mgt/RootDomain.h @@ -31,18 +31,20 @@ #include #include #include "IOKit/pwr_mgt/IOPMPrivate.h" +#include #ifdef XNU_KERNEL_PRIVATE -#if defined(__i386__) || defined(__x86_64__) -#define ROOT_DOMAIN_RUN_STATES 1 -#endif struct AggressivesRecord; -class PMAssertionsTracker; -#endif /* XNU_KERNEL_PRIVATE */ - +struct IOPMMessageFilterContext; +struct IOPMActions; +class PMSettingObject; +class IOPMTimeline; +class PMEventDetails; +class PMTraceWorker; class IOPMPowerStateQueue; class RootDomainUserClient; -class PMTraceWorker; +class PMAssertionsTracker; +#endif /*! 
* Types for PM Assertions @@ -105,19 +107,6 @@ enum { #define kIOPMThermalEmergencySleepKey "Thermal Emergency Sleep" #define kIOPMMaintenanceSleepKey "Maintenance Sleep" -enum -{ - kIOPMSleepReasonClamshell = 1, - kIOPMSleepReasonPowerButton = 2, - kIOPMSleepReasonSoftware = 3, - kIOPMSleepReasonOSSwitchHibernation = 4, - kIOPMSleepReasonIdle = 5, - kIOPMSleepReasonLowPower = 6, - kIOPMSleepReasonThermalEmergency = 7, - kIOPMSleepReasonMaintenance = 8, - kIOPMSleepReasonMax -}; - /* * String constants for communication with PM CPU */ @@ -268,17 +257,6 @@ class IOPMrootDomain: public IOService const OSSymbol * typeOfInterest, IOServiceInterestHandler handler, void * target, void * ref = 0 ); - - void pmStatsRecordEvent( - int eventIndex, - AbsoluteTime timestamp); - - void pmStatsRecordApplicationResponse( - const OSSymbol *response, - const char *name, - int messageType, - uint32_t delay_ms, - int app_pid); virtual IOReturn callPlatformFunction( const OSSymbol *functionName, @@ -346,57 +324,76 @@ class IOPMrootDomain: public IOService #ifdef XNU_KERNEL_PRIVATE /* Root Domain internals */ public: + void tagPowerPlaneService( + IOService * service, + IOPMActions * actions ); -#if HIBERNATION - bool getHibernateSettings( - uint32_t * hibernateMode, - uint32_t * hibernateFreeRatio, - uint32_t * hibernateFreeTime ); -#endif + void overrideOurPowerChange( + IOService * service, + IOPMActions * actions, + unsigned long * inOutPowerState, + uint32_t * inOutChangeFlags ); -#if ROOT_DOMAIN_RUN_STATES - void tagPowerPlaneService( - IOService * service, - uint32_t * rdFlags ); - - void handleActivityTickleForService( IOService * service, - unsigned long type, - unsigned long currentPowerState, - uint32_t activityTickleCount ); - - void handlePowerChangeStartForService( - IOService * service, - uint32_t * rootDomainFlags, - uint32_t newPowerState, - uint32_t changeFlags ); - - void handlePowerChangeDoneForService( - IOService * service, - uint32_t * rootDomainFlags, - uint32_t 
newPowerState, - uint32_t changeFlags ); - - void overridePowerStateForService( + void handleOurPowerChangeStart( IOService * service, - uint32_t * rdFlags, - unsigned long * powerState, + IOPMActions * actions, + uint32_t powerState, + uint32_t * inOutChangeFlags ); + + void handleOurPowerChangeDone( + IOService * service, + IOPMActions * actions, + uint32_t powerState, + uint32_t changeFlags ); + + void overridePowerChangeForUIService( + IOService * service, + IOPMActions * actions, + unsigned long * inOutPowerState, + uint32_t * inOutChangeFlags ); + + void handleActivityTickleForDisplayWrangler( + IOService * service, + IOPMActions * actions ); + + bool shouldDelayChildNotification( + IOService * service ); + + void handlePowerChangeStartForPCIDevice( + IOService * service, + IOPMActions * actions, + uint32_t powerState, + uint32_t * inOutChangeFlags ); + + void handlePowerChangeDoneForPCIDevice( + IOService * service, + IOPMActions * actions, + uint32_t powerState, uint32_t changeFlags ); + void askChangeDownDone( + IOPMPowerChangeFlags * inOutChangeFlags, + bool * cancel ); + + void handlePublishSleepWakeUUID( + bool shouldPublish); + + void handleQueueSleepWakeUUID( + OSObject *obj); + IOReturn setMaintenanceWakeCalendar( const IOPMCalendarStruct * calendar ); -#endif /* ROOT_DOMAIN_RUN_STATES */ // Handle callbacks from IOService::systemWillShutdown() - void acknowledgeSystemWillShutdown( IOService * from ); + void acknowledgeSystemWillShutdown( IOService * from ); // Handle platform halt and restart notifications - void handlePlatformHaltRestart( UInt32 pe_type ); + void handlePlatformHaltRestart( UInt32 pe_type ); + + IOReturn shutdownSystem( void ); + IOReturn restartSystem( void ); + void handleSleepTimerExpiration( void ); - IOReturn shutdownSystem( void ); - IOReturn restartSystem( void ); - void handleSleepTimerExpiration( void ); - void handleForcedSleepTimerExpiration( void ); - void stopIgnoringClamshellEventsDuringWakeup( void ); bool 
activitySinceSleep(void); bool abortHibernation(void); @@ -404,15 +401,67 @@ class IOPMrootDomain: public IOService void handleAggressivesRequests( void ); void tracePoint( uint8_t point ); + void tracePoint( uint8_t point, uint8_t data ); + void traceDetail( uint32_t data32 ); + + bool systemMessageFilter( + void * object, void * arg1, void * arg2, void * arg3 ); + +/*! @function recordPMEvent + @abstract Logs IOService PM event timing. + @discussion Should only be called from IOServicePM. Should not be exported. + @result kIOReturn on success. +*/ + IOReturn recordPMEvent( PMEventDetails *details ); + IOReturn recordAndReleasePMEvent( PMEventDetails *details ); + IOReturn recordPMEventGated( PMEventDetails *details ); + IOReturn recordAndReleasePMEventGated( PMEventDetails *details ); + + void pmStatsRecordEvent( + int eventIndex, + AbsoluteTime timestamp); + + void pmStatsRecordApplicationResponse( + const OSSymbol *response, + const char *name, + int messageType, + uint32_t delay_ms, + int app_pid); + +#if HIBERNATION + bool getHibernateSettings( + uint32_t * hibernateMode, + uint32_t * hibernateFreeRatio, + uint32_t * hibernateFreeTime ); +#endif private: friend class PMSettingObject; - friend class PMAssertionsTracker; friend class RootDomainUserClient; + friend class PMAssertionsTracker; + + static IOReturn sysPowerDownHandler( void * target, void * refCon, + UInt32 messageType, IOService * service, + void * messageArgument, vm_size_t argSize ); + + static IOReturn displayWranglerNotification( void * target, void * refCon, + UInt32 messageType, IOService * service, + void * messageArgument, vm_size_t argSize ); + + static IOReturn rootBusyStateChangeHandler( void * target, void * refCon, + UInt32 messageType, IOService * service, + void * messageArgument, vm_size_t argSize ); + + static bool displayWranglerMatchPublished( void * target, void * refCon, + IOService * newService, + IONotifier * notifier); + + static bool batteryPublished( void * target, void * 
refCon, + IOService * resourceService, + IONotifier * notifier); - // Points to our parent IOService * wrangler; - class IORootParent * patriarch; + IOService * wranglerConnection; IOLock *featuresDictLock; // guards supportedFeatures IOPMPowerStateQueue *pmPowerStateQueue; @@ -422,7 +471,7 @@ class IOPMrootDomain: public IOService PMAssertionsTracker *pmAssertions; // Settings controller info - IORecursiveLock *settingsCtrlLock; + IOLock *settingsCtrlLock; OSDictionary *settingsCallbacks; OSDictionary *fPMSettingsDict; @@ -430,16 +479,16 @@ class IOPMrootDomain: public IOService IONotifier *_displayWranglerNotifier; // Statistics - const OSSymbol *_statsNameKey; - const OSSymbol *_statsPIDKey; - const OSSymbol *_statsTimeMSKey; - const OSSymbol *_statsResponseTypeKey; - const OSSymbol *_statsMessageTypeKey; + const OSSymbol *_statsNameKey; + const OSSymbol *_statsPIDKey; + const OSSymbol *_statsTimeMSKey; + const OSSymbol *_statsResponseTypeKey; + const OSSymbol *_statsMessageTypeKey; OSString *queuedSleepWakeUUIDString; - OSArray *pmStatsAppResponses; + bool uuidPublished; PMStatsStruct pmStats; // Pref: idle time before idle sleep @@ -452,41 +501,78 @@ class IOPMrootDomain: public IOService unsigned long extraSleepDelay; // Used to wait between say display idle and system idle - thread_call_t extraSleepTimer; - - // Used to ignore clamshell close events while we're waking from sleep - thread_call_t clamshellWakeupIgnore; - + thread_call_t extraSleepTimer; thread_call_t diskSyncCalloutEntry; - uint32_t runStateIndex; - uint32_t runStateFlags; - uint32_t nextRunStateIndex; - uint32_t wranglerTickled; + // IOPMActions parameter encoding + enum { + kPMActionsFlagIsDisplayWrangler = 0x00000001, + kPMActionsFlagIsGraphicsDevice = 0x00000002, + kPMActionsFlagIsAudioDevice = 0x00000004, + kPMActionsFlagLimitPower = 0x00000008, + kPMActionsPCIBitNumberMask = 0x000000ff + }; + + // Track system capabilities. 
+ uint32_t _desiredCapability; + uint32_t _currentCapability; + uint32_t _pendingCapability; + uint32_t _highestCapability; + OSSet * _joinedCapabilityClients; + uint32_t _systemStateGeneration; + + // Type of clients that can receive system messages. + enum { + kSystemMessageClientConfigd = 0x01, + kSystemMessageClientApp = 0x02, + kSystemMessageClientUser = 0x03, + kSystemMessageClientKernel = 0x04, + kSystemMessageClientAll = 0x07 + }; + uint32_t _systemMessageClientMask; + + // Power state and capability change transitions. + enum { + kSystemTransitionNone = 0, + kSystemTransitionSleep = 1, + kSystemTransitionWake = 2, + kSystemTransitionCapability = 3, + kSystemTransitionNewCapClient = 4 + } _systemTransitionType; unsigned int systemBooting :1; unsigned int systemShutdown :1; + unsigned int systemDarkWake :1; unsigned int clamshellExists :1; - unsigned int clamshellIsClosed :1; - unsigned int ignoringClamshell :1; - unsigned int ignoringClamshellOnWake :1; + unsigned int clamshellClosed :1; + unsigned int clamshellDisabled :1; unsigned int desktopMode :1; - unsigned int acAdaptorConnected :1; + unsigned int acAdaptorConnected :1; - unsigned int allowSleep :1; - unsigned int sleepIsSupported :1; - unsigned int canSleep :1; - unsigned int sleepASAP :1; unsigned int idleSleepTimerPending :1; unsigned int userDisabledAllSleep :1; - unsigned int ignoreChangeDown :1; + unsigned int childPreventSystemSleep :1; + unsigned int ignoreTellChangeDown :1; unsigned int wranglerAsleep :1; + unsigned int wranglerTickled :1; + unsigned int wranglerSleepIgnored :1; + unsigned int graphicsSuppressed :1; + + unsigned int capabilityLoss :1; + unsigned int pciCantSleepFlag :1; + unsigned int pciCantSleepValid :1; + unsigned int logWranglerTickle :1; + unsigned int logGraphicsClamp :1; + unsigned int darkWakeToSleepASAP :1; + unsigned int darkWakeMaintenance :1; + unsigned int darkWakePostTickle :1; unsigned int sleepTimerMaintenance :1; unsigned int lowBatteryCondition :1; unsigned 
int hibernateDisabled :1; unsigned int hibernateNoDefeat :1; unsigned int hibernateAborted :1; + unsigned int rejectWranglerTickle :1; uint32_t hibernateMode; uint32_t userActivityCount; @@ -498,54 +584,45 @@ class IOPMrootDomain: public IOService int32_t idxPMCPULimitedPower; IOOptionBits platformSleepSupport; + uint32_t _debugWakeSeconds; queue_head_t aggressivesQueue; thread_call_t aggressivesThreadCall; OSData * aggressivesData; AbsoluteTime wranglerSleepTime; - + AbsoluteTime systemWakeTime; + // PCI top-level PM trace IOService * pciHostBridgeDevice; + IOService * pciHostBridgeDriver; - // IOPMrootDomain internal sleep call - IOReturn privateSleepSystem( uint32_t sleepReason ); - void announcePowerSourceChange( void ); + IONotifier * systemCapabilityNotifier; - void reportUserInput( void ); - static IOReturn sysPowerDownHandler( void * target, void * refCon, - UInt32 messageType, IOService * service, - void * messageArgument, vm_size_t argSize ); + IOPMTimeline *timeline; - static IOReturn displayWranglerNotification( void * target, void * refCon, - UInt32 messageType, IOService * service, - void * messageArgument, vm_size_t argSize ); + // IOPMrootDomain internal sleep call + IOReturn privateSleepSystem( uint32_t sleepReason ); + void reportUserInput( void ); + bool checkSystemCanSleep( IOOptionBits options = 0 ); - static bool displayWranglerPublished( void * target, void * refCon, - IOService * newService); + void adjustPowerState( bool sleepASAP = false ); + void setQuickSpinDownTimeout( void ); + void restoreUserSpinDownTimeout( void ); - static bool batteryPublished( void * target, void * refCon, - IOService * resourceService ); + bool shouldSleepOnClamshellClosed(void ); + void sendClientClamshellNotification( void ); - void adjustPowerState( void ); - void setQuickSpinDownTimeout( void ); - void restoreUserSpinDownTimeout( void ); - - bool shouldSleepOnClamshellClosed(void ); - void sendClientClamshellNotification( void ); - // Inform PMCPU of changes 
to state like lid, AC vs. battery - void informCPUStateChange( uint32_t type, uint32_t value ); + void informCPUStateChange( uint32_t type, uint32_t value ); - void dispatchPowerEvent( uint32_t event, void * arg0, uint64_t arg1 ); - void handlePowerNotification( UInt32 msg ); + void dispatchPowerEvent( uint32_t event, void * arg0, uint64_t arg1 ); + void handlePowerNotification( UInt32 msg ); - IOReturn setPMSetting(const OSSymbol *, OSObject *); + IOReturn setPMSetting(const OSSymbol *, OSObject *); - void startIdleSleepTimer( uint32_t inSeconds ); - void cancelIdleSleepTimer( void ); - - void updateRunState( uint32_t inRunState ); + void startIdleSleepTimer( uint32_t inSeconds ); + void cancelIdleSleepTimer( void ); IOReturn setAggressiveness( unsigned long type, @@ -561,19 +638,23 @@ class IOPMrootDomain: public IOService const AggressivesRecord * array, int count ); - void aggressivenessChanged( void ); + // getPMTraceMemoryDescriptor should only be called by our friend RootDomainUserClient + IOMemoryDescriptor *getPMTraceMemoryDescriptor(void); IOReturn setPMAssertionUserLevels(IOPMDriverAssertionType); - + void publishSleepWakeUUID( bool shouldPublish ); + void evaluatePolicy( int stimulus, uint32_t arg = 0 ); + + void deregisterPMSettingObject( PMSettingObject * pmso ); + #if HIBERNATION bool getSleepOption( const char * key, uint32_t * option ); bool evaluateSystemSleepPolicy( IOPMSystemSleepParameters * p ); void evaluateSystemSleepPolicyEarly( void ); void evaluateSystemSleepPolicyFinal( void ); #endif /* HIBERNATION */ - #endif /* XNU_KERNEL_PRIVATE */ }; @@ -582,10 +663,9 @@ class IORootParent: public IOService { OSDeclareFinalStructors(IORootParent) -private: - unsigned long mostRecentChange; - public: + static void initialize( void ); + virtual OSObject * copyProperty( const char * aKey ) const; bool start( IOService * nub ); void shutDownSystem( void ); void restartSystem( void ); diff --git a/iokit/IOKit/rtc/Makefile b/iokit/IOKit/rtc/Makefile index 
ace4cfb12..e16d5b83a 100644 --- a/iokit/IOKit/rtc/Makefile +++ b/iokit/IOKit/rtc/Makefile @@ -14,13 +14,11 @@ MI_DIR = rtc NOT_EXPORT_HEADERS = INSTINC_SUBDIRS = -INSTINC_SUBDIRS_PPC = INSTINC_SUBDIRS_I386 = INSTINC_SUBDIRS_X86_64 = INSTINC_SUBDIRS_ARM = EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} -EXPINC_SUBDIRS_PPC = ${INSTINC_SUBDIRS_PPC} EXPINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS_I386} EXPINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS_X86_64} EXPINC_SUBDIRS_ARM = ${INSTINC_SUBDIRS_ARM} diff --git a/iokit/IOKit/system_management/Makefile b/iokit/IOKit/system_management/Makefile index 1f168421f..c887db562 100644 --- a/iokit/IOKit/system_management/Makefile +++ b/iokit/IOKit/system_management/Makefile @@ -14,13 +14,11 @@ MI_DIR = system_management NOT_EXPORT_HEADERS = INSTINC_SUBDIRS = -INSTINC_SUBDIRS_PPC = INSTINC_SUBDIRS_I386 = INSTINC_SUBDIRS_X86_64 = INSTINC_SUBDIRS_ARM = EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} -EXPINC_SUBDIRS_PPC = ${INSTINC_SUBDIRS_PPC} EXPINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS_I386} EXPINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS_X86_64} EXPINC_SUBDIRS_ARM = ${INSTINC_SUBDIRS_ARM} diff --git a/iokit/Kernel/IOBufferMemoryDescriptor.cpp b/iokit/Kernel/IOBufferMemoryDescriptor.cpp index 004d2ec89..563059600 100644 --- a/iokit/Kernel/IOBufferMemoryDescriptor.cpp +++ b/iokit/Kernel/IOBufferMemoryDescriptor.cpp @@ -103,16 +103,6 @@ bool IOBufferMemoryDescriptor::initWithPhysicalMask( // Grab IOMD bits from the Buffer MD options iomdOptions |= (options & kIOBufferDescriptorMemoryFlags); -#if 0 - // workarounds- - if ((options & kIOMemoryPhysicallyContiguous) || ((capacity == 0x1000) && (inTask == kernel_task)) - && !physicalMask) - { - highestMask = physicalMask = 0xFFFFF000; - } - //- -#endif - if (physicalMask && (alignment <= 1)) { alignment = ((physicalMask ^ (-1ULL)) & (physicalMask - 1)); diff --git a/iokit/Kernel/IOCPU.cpp b/iokit/Kernel/IOCPU.cpp index 7646d2a97..5dd9ea416 100644 --- a/iokit/Kernel/IOCPU.cpp +++ b/iokit/Kernel/IOCPU.cpp @@ -302,12 +302,15 @@ void 
IOCPUSleepKernel(void) long cnt, numCPUs; IOCPU *target; IOCPU *bootCPU = NULL; - + IOPMrootDomain *rootDomain = IOService::getPMRootDomain(); + kprintf("IOCPUSleepKernel\n"); OSIterator * iter; IOService * service; + rootDomain->tracePoint( kIOPMTracePointSleepPlatformActions ); + queue_init(&gIOSleepActionQueue); queue_init(&gIOWakeActionQueue); @@ -333,6 +336,8 @@ void IOCPUSleepKernel(void) iocpu_run_platform_actions(&gIOSleepActionQueue, 0, 0U-1, NULL, NULL, NULL); + rootDomain->tracePoint( kIOPMTracePointSleepCPUs ); + numCPUs = gIOCPUs->getCount(); // Sleep the CPUs. cnt = numCPUs; @@ -352,10 +357,14 @@ void IOCPUSleepKernel(void) } } + rootDomain->tracePoint( kIOPMTracePointSleepPlatformDriver ); + // Now sleep the boot CPU. if (bootCPU) bootCPU->haltCPU(); + rootDomain->tracePoint( kIOPMTracePointWakePlatformActions ); + iocpu_run_platform_actions(&gIOWakeActionQueue, 0, 0U-1, NULL, NULL, NULL); @@ -372,6 +381,8 @@ void IOCPUSleepKernel(void) if (!queue_empty(&gIOWakeActionQueue)) panic("gIOWakeActionQueue"); + rootDomain->tracePoint( kIOPMTracePointWakeCPUs ); + // Wake the other CPUs. 
for (cnt = 0; cnt < numCPUs; cnt++) { diff --git a/iokit/Kernel/IOCatalogue.cpp b/iokit/Kernel/IOCatalogue.cpp index 8c51eed84..c6de3b56c 100644 --- a/iokit/Kernel/IOCatalogue.cpp +++ b/iokit/Kernel/IOCatalogue.cpp @@ -74,31 +74,6 @@ IOLock * gIOCatalogLock; #if PRAGMA_MARK #pragma mark Utility functions #endif -/********************************************************************* -*********************************************************************/ -static void -UniqueProperties(OSDictionary * dict) -{ - OSString * data; - - data = OSDynamicCast(OSString, dict->getObject(gIOClassKey)); - if (data) { - const OSSymbol *classSymbol = OSSymbol::withString(data); - - dict->setObject( gIOClassKey, (OSSymbol *) classSymbol); - classSymbol->release(); - } - - data = OSDynamicCast(OSString, dict->getObject(gIOMatchCategoryKey)); - if (data) { - const OSSymbol *classSymbol = OSSymbol::withString(data); - - dict->setObject(gIOMatchCategoryKey, (OSSymbol *) classSymbol); - classSymbol->release(); - } - return; -} - /********************************************************************* * Add a new personality to the set if it has a unique IOResourceMatchKey value. * XXX -- svail: This should be optimized. 
@@ -170,13 +145,13 @@ bool IOCatalogue::init(OSArray * initArray) gIOCatalogLock = IOLockAlloc(); lock = gIOCatalogLock; -#if __ppc__ || __i386__ +#if __i386__ kld_lock = NULL; -#endif /* __ppc__ || __i386__ */ +#endif /* __i386__ */ kernelTables->reset(); while( (dict = (OSDictionary *) kernelTables->getNextObject())) { - UniqueProperties(dict); + OSKext::uniquePersonalityProperties(dict); if( 0 == dict->getObject( gIOClassKey )) IOLog("Missing or bad \"%s\" key\n", gIOClassKey->getCStringNoCopy()); @@ -306,7 +281,7 @@ IOCatalogue::findDrivers( OSDictionary * dict; OSOrderedSet * set; - UniqueProperties(matching); + OSKext::uniquePersonalityProperties(matching); set = OSOrderedSet::withCapacity( 1, IOServiceOrdering, (void *)gIOProbeScoreKey ); @@ -345,7 +320,7 @@ bool IOCatalogue::addDrivers( bool result = false; OSCollectionIterator * iter = NULL; // must release OSOrderedSet * set = NULL; // must release - OSDictionary * dict = NULL; // do not release + OSObject * object = NULL; // do not release OSArray * persons = NULL; // do not release persons = OSDynamicCast(OSArray, drivers); @@ -364,16 +339,26 @@ bool IOCatalogue::addDrivers( goto finish; } + /* Start with success; clear it on an error. + */ result = true; IOLockLock(lock); - while ( (dict = (OSDictionary *) iter->getNextObject()) ) { + while ( (object = iter->getNextObject()) ) { // xxx Deleted OSBundleModuleDemand check; will handle in other ways for SL + OSDictionary * personality = OSDynamicCast(OSDictionary, object); + SInt count; - - UniqueProperties(dict); + + if (!personality) { + IOLog("IOCatalogue::addDrivers() encountered non-dictionary; bailing.\n"); + result = false; + break; + } + + OSKext::uniquePersonalityProperties(personality); // Add driver personality to catalogue. count = array->getCount(); @@ -389,7 +374,7 @@ bool IOCatalogue::addDrivers( * Do not compare just the properties present in one driver * pesonality or the other. 
*/ - if (dict->isEqualTo(driver)) { + if (personality->isEqualTo(driver)) { break; } } @@ -398,15 +383,15 @@ bool IOCatalogue::addDrivers( continue; } - result = array->setObject(dict); + result = array->setObject(personality); if (!result) { break; } - AddNewImports(set, dict); + AddNewImports(set, personality); } // Start device matching. - if (doNubMatching && (set->getCount() > 0)) { + if (result && doNubMatching && (set->getCount() > 0)) { IOService::catalogNewDrivers(set); generation++; } @@ -455,7 +440,7 @@ IOCatalogue::removeDrivers( return false; } - UniqueProperties( matching ); + OSKext::uniquePersonalityProperties( matching ); IOLockLock(lock); kernelTables->reset(); @@ -553,8 +538,7 @@ void IOCatalogue::moduleHasLoaded(OSString * moduleName) startMatching(dict); dict->release(); - (void) OSKext::setDeferredLoadSucceeded(); - (void) OSKext::considerRebuildOfPrelinkedKernel(); + (void) OSKext::considerRebuildOfPrelinkedKernel(moduleName); } void IOCatalogue::moduleHasLoaded(const char * moduleName) @@ -589,7 +573,7 @@ static IOReturn _terminateDrivers(OSDictionary * matching) if ( !iter ) return kIOReturnNoMemory; - UniqueProperties( matching ); + OSKext::uniquePersonalityProperties( matching ); // terminate instances. 
do { @@ -785,7 +769,183 @@ bool IOCatalogue::startMatching( OSDictionary * matching ) void IOCatalogue::reset(void) { + IOCatalogue::resetAndAddDrivers(/* no drivers; true reset */ NULL, + /* doMatching */ false); + return; +} + +bool IOCatalogue::resetAndAddDrivers(OSArray * drivers, bool doNubMatching) +{ + bool result = false; + OSArray * newPersonalities = NULL; // do not release + OSCollectionIterator * newPIterator = NULL; // must release + OSOrderedSet * matchSet = NULL; // must release + OSArray * oldPersonalities = NULL; // must release + OSArray * kernelPersonalities = NULL; // must release + OSString * errorString = NULL; // must release + OSObject * object = NULL; // do not release + OSDictionary * thisNewPersonality = NULL; // do not release + signed int count, i; + + extern const char * gIOKernelConfigTables; + + if (drivers) { + newPersonalities = OSDynamicCast(OSArray, drivers); + if (!newPersonalities) { + goto finish; + } + + newPIterator = OSCollectionIterator::withCollection(newPersonalities); + if (!newPIterator) { + goto finish; + } + + matchSet = OSOrderedSet::withCapacity(10, IOServiceOrdering, + (void *)gIOProbeScoreKey); + if (!matchSet) { + goto finish; + } + } + + /* Read personalities for the built-in kernel driver classes. + * We don't have many any more. + */ + kernelPersonalities = OSDynamicCast(OSArray, + OSUnserialize(gIOKernelConfigTables, &errorString)); + if (!kernelPersonalities && errorString) { + IOLog("KernelConfigTables syntax error: %s\n", + errorString->getCStringNoCopy()); + goto finish; + } + + /* Now copy the current array of personalities so we can reuse them + * if the new list contains any duplicates. This saves on memory + * consumption. + */ + oldPersonalities = OSDynamicCast(OSArray, array->copyCollection()); + if (!oldPersonalities) { + goto finish; + } + + result = true; + IOLog("Resetting IOCatalogue.\n"); + + /* No goto finish from here to unlock. 
+ */ + IOLockLock(lock); + + array->flushCollection(); + + /* Add back the kernel personalities and remove them from the old + * array so we don't try to match on them again. Go forward through + * the arrays as this causes the least iteration since kernel personalities + * should always be first. + */ + count = kernelPersonalities->getCount(); + for (i = 0; i < count; i++) { + + /* Static cast here, as the data is coming from within the kernel image. + */ + OSDictionary * thisNewPersonality = (OSDictionary *) + kernelPersonalities->getObject(i); + array->setObject(thisNewPersonality); + + signed int oldPCount = oldPersonalities->getCount(); + for (signed int oldPIndex = 0; oldPIndex < oldPCount; oldPIndex++) { + if (thisNewPersonality->isEqualTo(oldPersonalities->getObject(oldPIndex))) { + oldPersonalities->removeObject(oldPIndex); + break; + } + } + } + + /* Now add the new set of personalities passed in, using existing + * copies if we had them in kernel memory already. + */ + if (newPIterator) { + OSDictionary * thisOldPersonality = NULL; // do not release + + while ( (object = newPIterator->getNextObject()) ) { + + thisNewPersonality = OSDynamicCast(OSDictionary, object); + if (!thisNewPersonality) { + IOLog("IOCatalogue::resetAndAddDrivers() encountered non-dictionary; bailing.\n"); + result = false; + break; + } + + /* Convert common OSString property values to OSSymbols. + */ + OSKext::uniquePersonalityProperties(thisNewPersonality); + + /* Add driver personality to catalogue, but if we had a copy already + * use that instead so we don't have multiple copies from OSKext instances. + */ + count = oldPersonalities->getCount(); + thisOldPersonality = NULL; + while (count--) { + + thisOldPersonality = (OSDictionary *)oldPersonalities->getObject(count); + + /* Unlike in other functions, this comparison must be exact! + * The catalogue must be able to contain personalities that + * are proper supersets of others. 
+ * Do not compare just the properties present in one driver + * pesonality or the other. + */ + if (thisNewPersonality->isEqualTo(thisOldPersonality)) { + break; + } + } + + /* If we found a dup, add the *original* back to the catalogue, + * remove it from our bookkeeping list, and continue. + * Don't worry about matching on personalities we already had. + */ + if (count >= 0) { + array->setObject(thisOldPersonality); + oldPersonalities->removeObject(count); + continue; + } + + /* Otherwise add the new personality and mark it for matching. + */ + array->setObject(thisNewPersonality); + AddNewImports(matchSet, thisNewPersonality); + } + + /***** + * Now, go through remaining old personalities, which have effectively + * been removed, and add them to the match set as necessary. + */ + count = oldPersonalities->getCount(); + while (count--) { + + /* Static cast here is ok as these dictionaries were already in the catalogue. + */ + thisOldPersonality = (OSDictionary *)oldPersonalities->getObject(count); + AddNewImports(matchSet, thisOldPersonality); + } + + /* Finally, start device matching on all new & removed personalities. + */ + if (result && doNubMatching && (matchSet->getCount() > 0)) { + IOService::catalogNewDrivers(matchSet); + generation++; + } + } + + IOLockUnlock(lock); + +finish: + if (newPIterator) newPIterator->release(); + if (matchSet) matchSet->release(); + if (oldPersonalities) oldPersonalities->release(); + if (kernelPersonalities) kernelPersonalities->release(); + if (errorString) errorString->release(); + + return result; } bool IOCatalogue::serialize(OSSerialize * s) const @@ -837,9 +997,9 @@ bool IOCatalogue::serializeData(IOOptionBits kind, OSSerialize * s) const ********************************************************************** ********************************************************************** * These functions are no longer used are necessary for C++ binary -* compatibility on ppc/i386. +* compatibility on i386. 
**********************************************************************/ -#if __ppc__ || __i386__ +#if __i386__ bool IOCatalogue::recordStartupExtensions(void) { return false; } @@ -850,4 +1010,4 @@ bool IOCatalogue::addExtensionsFromArchive(OSData * mkext) kern_return_t IOCatalogue::removeKernelLinker(void) { return KERN_NOT_SUPPORTED; } -#endif /* __ppc__ || __i386__ */ +#endif /* __i386__ */ diff --git a/iokit/Kernel/IOCommandGate.cpp b/iokit/Kernel/IOCommandGate.cpp index 0b823d2b6..29ecd859e 100644 --- a/iokit/Kernel/IOCommandGate.cpp +++ b/iokit/Kernel/IOCommandGate.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2000, 2009-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -49,11 +49,33 @@ OSMetaClassDefineReservedUnused(IOCommandGate, 5); OSMetaClassDefineReservedUnused(IOCommandGate, 6); OSMetaClassDefineReservedUnused(IOCommandGate, 7); -bool IOCommandGate::checkForWork() { return false; } +#if IOKITSTATS + +#define IOStatisticsInitializeCounter() \ +do { \ + IOStatistics::setCounterType(IOEventSource::reserved->counter, kIOStatisticsCommandGateCounter); \ +} while (0) + +#define IOStatisticsActionCall() \ +do { \ + IOStatistics::countCommandGateActionCall(IOEventSource::reserved->counter); \ +} while (0) + +#else + +#define IOStatisticsInitializeCounter() +#define IOStatisticsActionCall() + +#endif /* IOKITSTATS */ bool IOCommandGate::init(OSObject *inOwner, Action inAction) { - return super::init(inOwner, (IOEventSource::Action) inAction); + bool res = super::init(inOwner, (IOEventSource::Action) inAction); + if (res) { + IOStatisticsInitializeCounter(); + } + + return res; } IOCommandGate * @@ -162,6 +184,8 @@ IOReturn IOCommandGate::runAction(Action inAction, IOTimeStampStartConstant(IODBG_CMDQ(IOCMDQ_ACTION), (uintptr_t) inAction, (uintptr_t) owner); + IOStatisticsActionCall(); + // Must be gated and on the work loop or enabled res = 
(*inAction)(owner, arg0, arg1, arg2, arg3); @@ -170,7 +194,7 @@ IOReturn IOCommandGate::runAction(Action inAction, (uintptr_t) inAction, (uintptr_t) owner); openGate(); - + return res; } @@ -196,9 +220,11 @@ IOReturn IOCommandGate::attemptAction(Action inAction, if (trace) IOTimeStampStartConstant(IODBG_CMDQ(IOCMDQ_ACTION), - (uintptr_t) inAction, (uintptr_t) owner); - - res = (*inAction)(owner, arg0, arg1, arg2, arg3); + (uintptr_t) inAction, (uintptr_t) owner); + + IOStatisticsActionCall(); + + res = (*inAction)(owner, arg0, arg1, arg2, arg3); if (trace) IOTimeStampEndConstant(IODBG_CMDQ(IOCMDQ_ACTION), diff --git a/iokit/Kernel/IOCommandQueue.cpp b/iokit/Kernel/IOCommandQueue.cpp index 7d7249dee..3a184bf94 100644 --- a/iokit/Kernel/IOCommandQueue.cpp +++ b/iokit/Kernel/IOCommandQueue.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -35,6 +35,20 @@ #include +#if IOKITSTATS + +#define IOStatisticsInitializeCounter() \ + IOStatistics::setCounterType(reserved->counter, kIOStatisticsCommandQueueCounter) + +#define IOStatisticsActionCall() \ + IOStatistics::countCommandQueueActionCall(reserved->counter) + +#else + +#define IOStatisticsInitializeCounter() +#define IOStatisticsActionCall() + +#endif /* IOKITSTATS */ #define NUM_FIELDS_IN_COMMAND 4 typedef struct commandEntryTag { @@ -87,6 +101,8 @@ bool IOCommandQueue::init(OSObject *inOwner, producerIndex = consumerIndex = 0; + IOStatisticsInitializeCounter(); + return true; } @@ -130,7 +146,7 @@ void IOCommandQueue::free() bool IOCommandQueue::checkForWork() { - void *field0, *field1, *field2, *field3; + void *field0, *field1, *field2, *field3; bool trace = ( gIOKitTrace & kIOTraceCommandGates ) ? 
true : false; if (!enabled || consumerIndex == producerIndex) @@ -150,10 +166,11 @@ bool IOCommandQueue::checkForWork() if (trace) IOTimeStampStartConstant(IODBG_CMDQ(IOCMDQ_ACTION), - (uintptr_t) action, (uintptr_t) owner); - + (uintptr_t) action, (uintptr_t) owner); + + IOStatisticsActionCall(); (*(IOCommandQueueAction) action)(owner, field0, field1, field2, field3); - + if (trace) IOTimeStampEndConstant(IODBG_CMDQ(IOCMDQ_ACTION), (uintptr_t) action, (uintptr_t) owner); diff --git a/iokit/Kernel/IODMACommand.cpp b/iokit/Kernel/IODMACommand.cpp index 444abf720..b95ee921d 100644 --- a/iokit/Kernel/IODMACommand.cpp +++ b/iokit/Kernel/IODMACommand.cpp @@ -84,7 +84,7 @@ enum /**************************** class IODMACommand ***************************/ #undef super -#define super OSObject +#define super IOCommand OSDefineMetaClassAndStructors(IODMACommand, IOCommand); OSMetaClassDefineReservedUsed(IODMACommand, 0); @@ -227,6 +227,8 @@ IODMACommand::free() IOReturn IODMACommand::setMemoryDescriptor(const IOMemoryDescriptor *mem, bool autoPrepare) { + IOReturn err = kIOReturnSuccess; + if (mem == fMemory) { if (!autoPrepare) @@ -244,15 +246,15 @@ IODMACommand::setMemoryDescriptor(const IOMemoryDescriptor *mem, bool autoPrepar if (fActive) return kIOReturnBusy; clearMemoryDescriptor(); - }; + } if (mem) { bzero(&fMDSummary, sizeof(fMDSummary)); - IOReturn rtn = mem->dmaCommandOperation( + err = mem->dmaCommandOperation( kIOMDGetCharacteristics, &fMDSummary, sizeof(fMDSummary)); - if (rtn) - return rtn; + if (err) + return err; ppnum_t highPage = fMDSummary.fHighestPage ? 
fMDSummary.fHighestPage : gIOLastPage; @@ -269,11 +271,15 @@ IODMACommand::setMemoryDescriptor(const IOMemoryDescriptor *mem, bool autoPrepar fMemory = mem; mem->dmaCommandOperation(kIOMDSetDMAActive, this, 0); - if (autoPrepare) - return prepare(); - }; - - return kIOReturnSuccess; + if (autoPrepare) { + err = prepare(); + if (err) { + clearMemoryDescriptor(); + } + } + } + + return err; } IOReturn diff --git a/iokit/Kernel/IODMAController.cpp b/iokit/Kernel/IODMAController.cpp index 33a54dc76..603998035 100644 --- a/iokit/Kernel/IODMAController.cpp +++ b/iokit/Kernel/IODMAController.cpp @@ -50,9 +50,9 @@ IODMAController *IODMAController::getController(IOService *provider, UInt32 dmaI // Find the name of the parent dma controller dmaParentData = OSDynamicCast(OSData, provider->getProperty("dma-parent")); - if (dmaParentData == 0) return false; + if (dmaParentData == 0) return NULL; dmaParentName = createControllerName(*(UInt32 *)dmaParentData->getBytesNoCopy()); - if (dmaParentName == 0) return false; + if (dmaParentName == 0) return NULL; // Wait for the parent dma controller dmaController = OSDynamicCast(IODMAController, IOService::waitForService(IOService::nameMatching(dmaParentName))); diff --git a/iokit/Kernel/IODeviceTreeSupport.cpp b/iokit/Kernel/IODeviceTreeSupport.cpp index afb221cf4..8de463efd 100644 --- a/iokit/Kernel/IODeviceTreeSupport.cpp +++ b/iokit/Kernel/IODeviceTreeSupport.cpp @@ -37,12 +37,14 @@ #include +#include + extern "C" { - #include - void DTInit( void * data ); - int IODTGetLoaderInfo( char *key, void **infoAddr, int *infosize ); - void IODTFreeLoaderInfo( char *key, void *infoAddr, int infoSize ); +int IODTGetLoaderInfo( const char *key, void **infoAddr, int *infosize ); +void IODTFreeLoaderInfo( const char *key, void *infoAddr, int infoSize ); +int IODTGetDefault(const char *key, void *infoAddr, unsigned int infoSize ); + } #include @@ -209,26 +211,6 @@ IODeviceTreeAlloc( void * dtTop ) if( !intMap && child->getProperty( 
gIODTInterruptParentKey)) intMap = true; -#if __ppc__ - OSObject * obj; - - // Look for a "driver,AAPL,MacOSX,PowerPC" property. - if( (obj = child->getProperty( "driver,AAPL,MacOSX,PowerPC"))) { - gIOCatalogue->addExtensionsFromArchive((OSData *)obj); - child->removeProperty( "driver,AAPL,MacOSX,PowerPC"); - } - - // some gross pruning - child->removeProperty( "lanLib,AAPL,MacOS,PowerPC"); - - if( (obj = child->getProperty( "driver,AAPL,MacOS,PowerPC"))) { - - if( (0 == (prop = (OSData *)child->getProperty( gIODTTypeKey ))) - || (strncmp("display", (char *)prop->getBytesNoCopy(), sizeof("display"))) ) { - child->removeProperty( "driver,AAPL,MacOS,PowerPC"); - } - } -#endif /* __ppc__ */ } regIter->release(); } @@ -265,7 +247,7 @@ IODeviceTreeAlloc( void * dtTop ) return( parent); } -int IODTGetLoaderInfo( char *key, void **infoAddr, int *infoSize ) +int IODTGetLoaderInfo( const char *key, void **infoAddr, int *infoSize ) { IORegistryEntry *chosen; OSData *propObj; @@ -290,7 +272,7 @@ int IODTGetLoaderInfo( char *key, void **infoAddr, int *infoSize ) return 0; } -void IODTFreeLoaderInfo( char *key, void *infoAddr, int infoSize ) +void IODTFreeLoaderInfo( const char *key, void *infoAddr, int infoSize ) { vm_offset_t range[2]; IORegistryEntry *chosen; @@ -307,6 +289,26 @@ void IODTFreeLoaderInfo( char *key, void *infoAddr, int infoSize ) } } +int IODTGetDefault(const char *key, void *infoAddr, unsigned int infoSize ) +{ + IORegistryEntry *defaults; + OSData *defaultObj; + unsigned int defaultSize; + + defaults = IORegistryEntry::fromPath( "/defaults", gIODTPlane ); + if ( defaults == 0 ) return -1; + + defaultObj = OSDynamicCast( OSData, defaults->getProperty(key) ); + if ( defaultObj == 0 ) return -1; + + defaultSize = defaultObj->getLength(); + if ( defaultSize > infoSize) return -1; + + memcpy( infoAddr, defaultObj->getBytesNoCopy(), defaultSize ); + + return 0; +} + static void FreePhysicalMemory( vm_offset_t * range ) { vm_offset_t virt; diff --git 
a/iokit/Kernel/IOEventSource.cpp b/iokit/Kernel/IOEventSource.cpp index a20232d91..95046dacd 100644 --- a/iokit/Kernel/IOEventSource.cpp +++ b/iokit/Kernel/IOEventSource.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2000, 2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -40,6 +40,7 @@ HISTORY #define super OSObject OSDefineMetaClassAndAbstractStructors(IOEventSource, OSObject) + OSMetaClassDefineReservedUnused(IOEventSource, 0); OSMetaClassDefineReservedUnused(IOEventSource, 1); OSMetaClassDefineReservedUnused(IOEventSource, 2); @@ -49,17 +50,88 @@ OSMetaClassDefineReservedUnused(IOEventSource, 5); OSMetaClassDefineReservedUnused(IOEventSource, 6); OSMetaClassDefineReservedUnused(IOEventSource, 7); +bool IOEventSource::checkForWork() { return false; } + /* inline function implementations */ -void IOEventSource::signalWorkAvailable() { workLoop->signalWorkAvailable(); } -void IOEventSource::openGate() { workLoop->openGate(); } -void IOEventSource::closeGate() { workLoop->closeGate(); } -bool IOEventSource::tryCloseGate() { return workLoop->tryCloseGate(); } + +#if IOKITSTATS + +#define IOStatisticsRegisterCounter() \ +do { \ + reserved->counter = IOStatistics::registerEventSource(inOwner); \ +} while (0) + +#define IOStatisticsUnregisterCounter() \ +do { \ + if (reserved) \ + IOStatistics::unregisterEventSource(reserved->counter); \ +} while (0) + +#define IOStatisticsOpenGate() \ +do { \ + IOStatistics::countOpenGate(reserved->counter); \ +} while (0) + +#define IOStatisticsCloseGate() \ +do { \ + IOStatistics::countCloseGate(reserved->counter); \ +} while (0) + +#else + +#define IOStatisticsRegisterCounter() +#define IOStatisticsUnregisterCounter() +#define IOStatisticsOpenGate() +#define IOStatisticsCloseGate() + +#endif /* IOKITSTATS */ + +void IOEventSource::signalWorkAvailable() +{ + workLoop->signalWorkAvailable(); +} + +void IOEventSource::openGate() 
+{ + IOStatisticsOpenGate(); + workLoop->openGate(); +} + +void IOEventSource::closeGate() +{ + workLoop->closeGate(); + IOStatisticsCloseGate(); +} + +bool IOEventSource::tryCloseGate() +{ + bool res; + if ((res = workLoop->tryCloseGate())) { + IOStatisticsCloseGate(); + } + return res; +} + int IOEventSource::sleepGate(void *event, UInt32 type) - { return workLoop->sleepGate(event, type); } +{ /* Sleeps on the work loop gate; counts as open while asleep and closed again on wakeup. */ + int res; /* int, not bool: the function returns int, and a bool would collapse any wait result > 1 to 1 */ + IOStatisticsOpenGate(); + res = workLoop->sleepGate(event, type); + IOStatisticsCloseGate(); + return res; +} + int IOEventSource::sleepGate(void *event, AbsoluteTime deadline, UInt32 type) - { return workLoop->sleepGate(event, deadline, type); } -void IOEventSource::wakeupGate(void *event, bool oneThread) - { workLoop->wakeupGate(event, oneThread); } +{ /* Deadline variant of sleepGate(); same statistics bookkeeping as above. */ + int res; /* int, not bool: pass the work loop's wait result through unmodified */ + IOStatisticsOpenGate(); + res = workLoop->sleepGate(event, deadline, type); + IOStatisticsCloseGate(); + return res; +} + +void IOEventSource::wakeupGate(void *event, bool oneThread) { workLoop->wakeupGate(event, oneThread); } + bool IOEventSource::init(OSObject *inOwner, Action inAction) @@ -75,9 +147,28 @@ bool IOEventSource::init(OSObject *inOwner, (void) setAction(inAction); enabled = true; + if(!reserved) { + reserved = IONew(ExpansionData, 1); + if (!reserved) { + return false; + } + } + + IOStatisticsRegisterCounter(); + return true; } +void IOEventSource::free( void ) +{ + IOStatisticsUnregisterCounter(); + + if (reserved) + IODelete(reserved, ExpansionData, 1); + + super::free(); +} + IOEventSource::Action IOEventSource::getAction () const { return action; }; void IOEventSource::setAction(Action inAction) diff --git a/iokit/Kernel/IOFilterInterruptEventSource.cpp b/iokit/Kernel/IOFilterInterruptEventSource.cpp index f4f73e2b4..944e84ced 100644 --- a/iokit/Kernel/IOFilterInterruptEventSource.cpp +++ b/iokit/Kernel/IOFilterInterruptEventSource.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2010 Apple Inc. 
All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -32,6 +32,25 @@ #include #include +#if IOKITSTATS + +#define IOStatisticsInitializeCounter() \ +do { \ + IOStatistics::setCounterType(IOEventSource::reserved->counter, kIOStatisticsFilterInterruptEventSourceCounter); \ +} while (0) + +#define IOStatisticsInterrupt() \ +do { \ + IOStatistics::countInterrupt(IOEventSource::reserved->counter); \ +} while (0) + +#else + +#define IOStatisticsInitializeCounter() +#define IOStatisticsInterrupt() + +#endif /* IOKITSTATS */ + #define super IOInterruptEventSource OSDefineMetaClassAndStructors @@ -79,6 +98,9 @@ IOFilterInterruptEventSource::init(OSObject *inOwner, return false; filterAction = inFilterAction; + + IOStatisticsInitializeCounter(); + return true; } @@ -103,9 +125,10 @@ ::filterInterruptEventSource(OSObject *inOwner, void IOFilterInterruptEventSource::signalInterrupt() { bool trace = (gIOKitTrace & kIOTraceIntEventSource) ? true : false; - + + IOStatisticsInterrupt(); producerCount++; - + if (trace) IOTimeStampStartConstant(IODBG_INTES(IOINTES_SEMA), (uintptr_t) this, (uintptr_t) owner); @@ -129,20 +152,20 @@ IOFilterInterruptEventSource::getFilterAction() const void IOFilterInterruptEventSource::normalInterruptOccurred (void */*refcon*/, IOService */*prov*/, int /*source*/) { - bool filterRes; + bool filterRes; bool trace = (gIOKitTrace & kIOTraceIntEventSource) ? true : false; - + if (trace) IOTimeStampStartConstant(IODBG_INTES(IOINTES_FILTER), (uintptr_t) filterAction, (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop); - + // Call the filter. 
filterRes = (*filterAction)(owner, this); if (trace) IOTimeStampEndConstant(IODBG_INTES(IOINTES_FILTER), (uintptr_t) filterAction, (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop); - + if (filterRes) signalInterrupt(); } @@ -150,20 +173,20 @@ void IOFilterInterruptEventSource::normalInterruptOccurred void IOFilterInterruptEventSource::disableInterruptOccurred (void */*refcon*/, IOService *prov, int source) { - bool filterRes; + bool filterRes; bool trace = (gIOKitTrace & kIOTraceIntEventSource) ? true : false; - + if (trace) IOTimeStampStartConstant(IODBG_INTES(IOINTES_FILTER), (uintptr_t) filterAction, (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop); - + // Call the filter. filterRes = (*filterAction)(owner, this); if (trace) IOTimeStampEndConstant(IODBG_INTES(IOINTES_FILTER), (uintptr_t) filterAction, (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop); - + if (filterRes) { prov->disableInterrupt(source); /* disable the interrupt */ signalInterrupt(); diff --git a/iokit/Kernel/IOHibernateIO.cpp b/iokit/Kernel/IOHibernateIO.cpp index bc180fb5b..a4d7dbb4d 100644 --- a/iokit/Kernel/IOHibernateIO.cpp +++ b/iokit/Kernel/IOHibernateIO.cpp @@ -151,6 +151,7 @@ to restrict I/O ops. #include #include "IOPMPowerStateQueue.h" #include +#include #include #include @@ -158,13 +159,20 @@ to restrict I/O ops. #include #include // (FWRITE, ...) 
#include +#include #include #include #include "IOHibernateInternal.h" -#include "WKdm.h" +#include #include "IOKitKernelInternal.h" +#include + +#include +#include + +extern "C" addr64_t kvtophys(vm_offset_t va); /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -192,6 +200,8 @@ static IOPolledFileIOVars gFileVars; static IOHibernateVars gIOHibernateVars; static struct kern_direct_file_io_ref_t * gIOHibernateFileRef; static hibernate_cryptvars_t gIOHibernateCryptWakeContext; +static hibernate_graphics_t _hibernateGraphics; +static hibernate_graphics_t * gIOHibernateGraphicsInfo = &_hibernateGraphics; /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -540,19 +550,59 @@ file_extent_callback(void * ref, uint64_t start, uint64_t length) ctx->size += length; } +static IOService * +IOCopyMediaForDev(dev_t device) +{ /* Looks up an "IOMedia" service by the BSD major/minor of 'device'; returns it retained, or 0 if none was found. */ + OSDictionary * matching; + OSNumber * num; + OSIterator * iter; + IOService * result = 0; + + matching = IOService::serviceMatching("IOMedia"); + if (!matching) + return (0); + do + { + num = OSNumber::withNumber(major(device), 32); + if (!num) + break; + matching->setObject(kIOBSDMajorKey, num); + num->release(); + num = OSNumber::withNumber(minor(device), 32); + if (!num) + break; + matching->setObject(kIOBSDMinorKey, num); + num->release(); + if (!num) + break; + iter = IOService::getMatchingServices(matching); + if (iter) + { + result = (IOService *) iter->getNextObject(); + if (result) result->retain(); /* guard: do not dereference a NULL result from an empty iterator */ + iter->release(); + } + } + while (false); + matching->release(); + + return (result); +} + IOReturn IOPolledFileOpen( const char * filename, IOBufferMemoryDescriptor * ioBuffer, IOPolledFileIOVars ** fileVars, OSData ** fileExtents, - OSData ** imagePath) + OSData ** imagePath, uint8_t * volumeCryptKey) { IOReturn err = kIOReturnError; IOPolledFileIOVars * vars; _OpenFileContext ctx; OSData * extentsData; OSNumber * num; - IORegistryEntry * part = 0; - OSDictionary * matching; - OSIterator * iter; 
+ IOService * part = 0; + OSString * keyUUID = 0; + OSString * keyStoreUUID = 0; + dev_t block_dev; dev_t hibernate_image_dev; uint64_t maxiobytes; @@ -575,10 +625,13 @@ IOPolledFileOpen( const char * filename, IOBufferMemoryDescriptor * ioBuffer, ctx.size = 0; vars->fileRef = kern_open_file_for_direct_io(filename, &file_extent_callback, &ctx, + &block_dev, &hibernate_image_dev, &vars->block0, &maxiobytes, - &vars->solid_state); + &vars->flags, + 0, (caddr_t) gIOHibernateCurrentHeader, + sizeof(IOHibernateImageHeader)); if (!vars->fileRef) { err = kIOReturnNoSpace; @@ -587,10 +640,10 @@ IOPolledFileOpen( const char * filename, IOBufferMemoryDescriptor * ioBuffer, gIOHibernateFileRef = vars->fileRef; if (kIOHibernateModeSSDInvert & gIOHibernateMode) - vars->solid_state = vars->solid_state ? false : true; + vars->flags ^= kIOHibernateOptionSSD; HIBLOG("Opened file %s, size %qd, partition base 0x%qx, maxio %qx ssd %d\n", filename, ctx.size, - vars->block0, maxiobytes, vars->solid_state); + vars->block0, maxiobytes, kIOHibernateOptionSSD & vars->flags); if (ctx.size < 1*1024*1024) // check against image size estimate! 
{ err = kIOReturnNoSpace; @@ -601,41 +654,52 @@ IOPolledFileOpen( const char * filename, IOBufferMemoryDescriptor * ioBuffer, vars->bufferSize = maxiobytes; vars->extentMap = (IOPolledFileExtent *) extentsData->getBytesNoCopy(); - - matching = IOService::serviceMatching("IOMedia"); - num = OSNumber::withNumber(major(hibernate_image_dev), 32); - matching->setObject(kIOBSDMajorKey, num); - num->release(); - num = OSNumber::withNumber(minor(hibernate_image_dev), 32); - matching->setObject(kIOBSDMinorKey, num); - num->release(); - iter = IOService::getMatchingServices(matching); - matching->release(); - if (iter) - { - part = (IORegistryEntry *) iter->getNextObject(); - part->retain(); - iter->release(); - } - if (!part) - break; - int minor, major; + part = IOCopyMediaForDev(block_dev); + if (!part) + break; + + err = part->callPlatformFunction(PLATFORM_FUNCTION_GET_MEDIA_ENCRYPTION_KEY_UUID, false, + (void *) &keyUUID, (void *) &keyStoreUUID, NULL, NULL); + if ((kIOReturnSuccess == err) && keyUUID && keyStoreUUID) + { +// IOLog("got volume key %s\n", keyStoreUUID->getCStringNoCopy()); + uuid_t volumeKeyUUID; + aks_volume_key_t vek; + static IOService * sKeyStore; + static const OSSymbol * sAKSGetKey; + + if (!sAKSGetKey) + sAKSGetKey = OSSymbol::withCStringNoCopy(AKS_PLATFORM_FUNCTION_GETKEY); + if (!sKeyStore) + sKeyStore = (IOService *) IORegistryEntry::fromPath(AKS_SERVICE_PATH, gIOServicePlane); + if (sKeyStore) + err = uuid_parse(keyStoreUUID->getCStringNoCopy(), volumeKeyUUID); + else + err = kIOReturnNoResources; + if (kIOReturnSuccess == err) + err = sKeyStore->callPlatformFunction(sAKSGetKey, true, volumeKeyUUID, &vek, NULL, NULL); + if (kIOReturnSuccess != err) + IOLog("volume key err 0x%x\n", err); + else + { + size_t bytes = (kIOHibernateAESKeySize / 8); + if (vek.key.keybytecount < bytes) + bytes = vek.key.keybytecount; + bcopy(&vek.key.keybytes[0], volumeCryptKey, bytes); + } + bzero(&vek, sizeof(vek)); + } + part->release(); + + part = 
IOCopyMediaForDev(hibernate_image_dev); + if (!part) + break; + IORegistryEntry * next; IORegistryEntry * child; OSData * data; - num = (OSNumber *) part->getProperty(kIOBSDMajorKey); - if (!num) - break; - major = num->unsigned32BitValue(); - num = (OSNumber *) part->getProperty(kIOBSDMinorKey); - if (!num) - break; - minor = num->unsigned32BitValue(); - - hibernate_image_dev = makedev(major, minor); - vars->pollers = OSArray::withCapacity(4); if (!vars->pollers) break; @@ -663,7 +727,7 @@ IOPolledFileOpen( const char * filename, IOBufferMemoryDescriptor * ioBuffer, && child->isParent(next, gIOServicePlane, true)); HIBLOG("hibernate image major %d, minor %d, blocksize %ld, pollers %d\n", - major, minor, (long)vars->blockSize, vars->pollers->getCount()); + major(hibernate_image_dev), minor(hibernate_image_dev), (long)vars->blockSize, vars->pollers->getCount()); if (vars->pollers->getCount() < kIOHibernateMinPollersNeeded) continue; @@ -682,18 +746,22 @@ IOPolledFileOpen( const char * filename, IOBufferMemoryDescriptor * ioBuffer, if ((extentsData->getLength() >= sizeof(IOPolledFileExtent))) { - char str2[24]; + char str2[24 + sizeof(uuid_string_t) + 2]; #if defined(__i386__) || defined(__x86_64__) if (!gIOCreateEFIDevicePathSymbol) gIOCreateEFIDevicePathSymbol = OSSymbol::withCString("CreateEFIDevicePath"); - snprintf(str2, sizeof(str2), "%qx", vars->extentMap[0].start); + if (keyUUID) + snprintf(str2, sizeof(str2), "%qx:%s", + vars->extentMap[0].start, keyUUID->getCStringNoCopy()); + else + snprintf(str2, sizeof(str2), "%qx", vars->extentMap[0].start); err = IOService::getPlatform()->callPlatformFunction( gIOCreateEFIDevicePathSymbol, false, - (void *) part, (void *) str2, (void *) true, - (void *) &data); + (void *) part, (void *) str2, + (void *) (uintptr_t) true, (void *) &data); #else char str1[256]; int len = sizeof(str1); @@ -724,7 +792,7 @@ IOPolledFileOpen( const char * filename, IOBufferMemoryDescriptor * ioBuffer, HIBLOG("error 0x%x opening hibernation 
file\n", err); if (vars->fileRef) { - kern_close_file_for_direct_io(vars->fileRef); + kern_close_file_for_direct_io(vars->fileRef, 0, 0, 0); gIOHibernateFileRef = vars->fileRef = NULL; } } @@ -827,6 +895,8 @@ IOPolledFileWrite(IOPolledFileIOVars * vars, && (vars->position > vars->encryptStart) && ((vars->position - length) < vars->encryptEnd)) { + AbsoluteTime startTime, endTime; + uint32_t encryptLen, encryptStart; encryptLen = vars->position - vars->encryptStart; if (encryptLen > length) @@ -835,12 +905,20 @@ IOPolledFileWrite(IOPolledFileIOVars * vars, if (vars->position > vars->encryptEnd) encryptLen -= (vars->position - vars->encryptEnd); + clock_get_uptime(&startTime); + // encrypt the buffer aes_encrypt_cbc(vars->buffer + vars->bufferHalf + encryptStart, &cryptvars->aes_iv[0], encryptLen / AES_BLOCK_SIZE, vars->buffer + vars->bufferHalf + encryptStart, &cryptvars->ctx.encrypt); + + clock_get_uptime(&endTime); + ADD_ABSOLUTETIME(&vars->cryptTime, &endTime); + SUB_ABSOLUTETIME(&vars->cryptTime, &startTime); + vars->cryptBytes += encryptLen; + // save initial vector for following encrypts bcopy(vars->buffer + vars->bufferHalf + encryptStart + encryptLen - AES_BLOCK_SIZE, &cryptvars->aes_iv[0], @@ -916,7 +994,7 @@ IOPolledFileRead(IOPolledFileIOVars * vars, vars->bufferOffset += copy; // vars->position += copy; - if (vars->bufferOffset == vars->bufferLimit) + if ((vars->bufferOffset == vars->bufferLimit) && (vars->position < vars->readEnd)) { if (vars->io) { @@ -929,9 +1007,9 @@ IOPolledFileRead(IOPolledFileIOVars * vars, if (vars->position & (vars->blockSize - 1)) HIBLOG("misaligned file pos %qx\n", vars->position); - vars->position += vars->lastRead; + vars->position += vars->lastRead; vars->extentRemaining -= vars->lastRead; - vars->bufferLimit = vars->lastRead; + vars->bufferLimit = vars->lastRead; if (!vars->extentRemaining) { @@ -953,14 +1031,18 @@ if (vars->position & (vars->blockSize - 1)) HIBLOG("misaligned file pos %qx\n", length = 
vars->extentRemaining; else length = vars->bufferSize; - vars->lastRead = length; + if ((length + vars->position) > vars->readEnd) + length = vars->readEnd - vars->position; + vars->lastRead = length; + if (length) + { //if (length != vars->bufferSize) HIBLOG("short read of %qx ends@ %qx\n", length, offset + length); - - err = IOHibernatePollerIO(vars, kIOPolledRead, vars->bufferHalf, offset, length); - if (kIOReturnSuccess != err) - break; - vars->io = true; + err = IOHibernatePollerIO(vars, kIOPolledRead, vars->bufferHalf, offset, length); + if (kIOReturnSuccess != err) + break; + vars->io = true; + } vars->bufferHalf = vars->bufferHalf ? 0 : vars->bufferSize; vars->bufferOffset = 0; @@ -969,16 +1051,26 @@ if (vars->position & (vars->blockSize - 1)) HIBLOG("misaligned file pos %qx\n", if (cryptvars) { uint8_t thisVector[AES_BLOCK_SIZE]; + AbsoluteTime startTime, endTime; + // save initial vector for following decrypts bcopy(&cryptvars->aes_iv[0], &thisVector[0], AES_BLOCK_SIZE); bcopy(vars->buffer + vars->bufferHalf + lastReadLength - AES_BLOCK_SIZE, &cryptvars->aes_iv[0], AES_BLOCK_SIZE); + // decrypt the buffer + clock_get_uptime(&startTime); + aes_decrypt_cbc(vars->buffer + vars->bufferHalf, &thisVector[0], lastReadLength / AES_BLOCK_SIZE, vars->buffer + vars->bufferHalf, &cryptvars->ctx.decrypt); + + clock_get_uptime(&endTime); + ADD_ABSOLUTETIME(&vars->cryptTime, &endTime); + SUB_ABSOLUTETIME(&vars->cryptTime, &startTime); + vars->cryptBytes += lastReadLength; } #endif /* CRYPTO */ } @@ -1013,10 +1105,12 @@ IOHibernateSystemSleep(void) if (IOService::getPMRootDomain()->getHibernateSettings( &gIOHibernateMode, &gIOHibernateFreeRatio, &gIOHibernateFreeTime)) + { if (kIOHibernateModeSleep & gIOHibernateMode) // default to discard clean for safe sleep gIOHibernateMode ^= (kIOHibernateModeDiscardCleanInactive | kIOHibernateModeDiscardCleanActive); + } if ((obj = IOService::getPMRootDomain()->copyProperty(kIOHibernateFileKey))) { @@ -1039,40 +1133,48 @@ 
IOHibernateSystemSleep(void) vars->ioBuffer = IOBufferMemoryDescriptor::withOptions(kIODirectionOutIn, 2 * kDefaultIOSize, page_size); - if (!vars->srcBuffer || !vars->ioBuffer) + vars->handoffBuffer = IOBufferMemoryDescriptor::withOptions(kIODirectionOutIn, + ptoa_64(gIOHibernateHandoffPageCount), page_size); + + if (!vars->srcBuffer || !vars->ioBuffer || !vars->handoffBuffer) { err = kIOReturnNoMemory; break; } + // open & invalidate the image file + gIOHibernateCurrentHeader->signature = kIOHibernateHeaderInvalidSignature; err = IOPolledFileOpen(gIOHibernateFilename, vars->ioBuffer, - &vars->fileVars, &vars->fileExtents, &data); + &vars->fileVars, &vars->fileExtents, &data, + &vars->volumeCryptKey[0]); if (KERN_SUCCESS != err) { HIBLOG("IOPolledFileOpen(%x)\n", err); break; } - if (vars->fileVars->fileRef) - { - // invalidate the image file - gIOHibernateCurrentHeader->signature = kIOHibernateHeaderInvalidSignature; - int err = kern_write_file(vars->fileVars->fileRef, 0, - (caddr_t) gIOHibernateCurrentHeader, sizeof(IOHibernateImageHeader)); - if (KERN_SUCCESS != err) - HIBLOG("kern_write_file(%d)\n", err); - } bzero(gIOHibernateCurrentHeader, sizeof(IOHibernateImageHeader)); gIOHibernateCurrentHeader->debugFlags = gIOHibernateDebugFlags; - - dsSSD = (vars->fileVars->solid_state + dsSSD = ((0 != (kIOHibernateOptionSSD & vars->fileVars->flags)) && (kOSBooleanTrue == IOService::getPMRootDomain()->getProperty(kIOPMDeepSleepEnabledKey))); - if (dsSSD) { gIOHibernateCurrentHeader->options |= kIOHibernateOptionSSD | kIOHibernateOptionColor; + +#if defined(__i386__) || defined(__x86_64__) + if (!uuid_is_null(vars->volumeCryptKey) && + (kOSBooleanTrue != IOService::getPMRootDomain()->getProperty(kIOPMDestroyFVKeyOnStandbyKey))) + { + uintptr_t smcVars[2]; + smcVars[0] = sizeof(vars->volumeCryptKey); + smcVars[1] = (uintptr_t)(void *) &vars->volumeCryptKey[0]; + + IOService::getPMRootDomain()->setProperty(kIOHibernateSMCVariablesKey, smcVars, sizeof(smcVars)); + 
bzero(smcVars, sizeof(smcVars)); + } +#endif } else { @@ -1087,7 +1189,7 @@ IOHibernateSystemSleep(void) err = hibernate_setup(gIOHibernateCurrentHeader, gIOHibernateFreeRatio, gIOHibernateFreeTime, dsSSD, - &vars->page_list, &vars->page_list_wired, &encryptedswap); + &vars->page_list, &vars->page_list_wired, &vars->page_list_pal, &encryptedswap); clock_get_uptime(&endTime); SUB_ABSOLUTETIME(&endTime, &startTime); absolutetime_to_nanoseconds(endTime, &nsec); @@ -1096,7 +1198,7 @@ IOHibernateSystemSleep(void) if (KERN_SUCCESS != err) break; - if (encryptedswap) + if (encryptedswap || !uuid_is_null(vars->volumeCryptKey)) gIOHibernateMode ^= kIOHibernateModeEncrypt; if (kIOHibernateOptionProgress & gIOHibernateCurrentHeader->options) @@ -1137,45 +1239,6 @@ IOHibernateSystemSleep(void) } data->release(); -#if defined(__ppc__) - size_t len; - char valueString[16]; - - vars->saveBootDevice = gIOOptionsEntry->copyProperty(kIOSelectedBootDeviceKey); - if (gIOChosenEntry) - { - OSData * bootDevice = OSDynamicCast(OSData, gIOChosenEntry->getProperty(kIOBootPathKey)); - if (bootDevice) - { - sym = OSSymbol::withCStringNoCopy(kIOSelectedBootDeviceKey); - OSString * str2 = OSString::withCStringNoCopy((const char *) bootDevice->getBytesNoCopy()); - if (sym && str2) - gIOOptionsEntry->setProperty(sym, str2); - if (sym) - sym->release(); - if (str2) - str2->release(); - } - data = OSDynamicCast(OSData, gIOChosenEntry->getProperty(kIOHibernateMemorySignatureKey)); - if (data) - { - vars->haveFastBoot = true; - - len = snprintf(valueString, sizeof(valueString), "0x%lx", *((UInt32 *)data->getBytesNoCopy())); - data = OSData::withBytes(valueString, len + 1); - sym = OSSymbol::withCStringNoCopy(kIOHibernateMemorySignatureEnvKey); - if (sym && data) - gIOOptionsEntry->setProperty(sym, data); - if (sym) - sym->release(); - if (data) - data->release(); - } - data = OSDynamicCast(OSData, gIOChosenEntry->getProperty(kIOHibernateMachineSignatureKey)); - if (data) - 
gIOHibernateCurrentHeader->machineSignature = *((UInt32 *)data->getBytesNoCopy()); - } -#endif /* __ppc__ */ #if defined(__i386__) || defined(__x86_64__) struct AppleRTCHibernateVars { @@ -1529,6 +1592,38 @@ IOHibernateSystemHasSlept(void) /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ +static DeviceTreeNode * +MergeDeviceTree(DeviceTreeNode * entry, IORegistryEntry * regEntry) +{ + DeviceTreeNodeProperty * prop; + DeviceTreeNode * child; + IORegistryEntry * childRegEntry; + const char * nameProp; + unsigned int propLen, idx; + + prop = (DeviceTreeNodeProperty *) (entry + 1); + for (idx = 0; idx < entry->nProperties; idx++) + { + if (regEntry && (0 != strcmp("name", prop->name))) + { + regEntry->setProperty((const char *) prop->name, (void *) (prop + 1), prop->length); +// HIBPRINT("%s: %s, %d\n", regEntry->getName(), prop->name, prop->length); + } + prop = (DeviceTreeNodeProperty *) (((uintptr_t)(prop + 1)) + ((prop->length + 3) & ~3)); + } + + child = (DeviceTreeNode *) prop; + for (idx = 0; idx < entry->nChildren; idx++) + { + if (kSuccess != DTGetProperty(child, "name", (void **) &nameProp, &propLen)) + panic("no name"); + childRegEntry = regEntry ? 
regEntry->childFromPath(nameProp, gIODTPlane) : NULL; +// HIBPRINT("%s == %p\n", nameProp, childRegEntry); + child = MergeDeviceTree(child, childRegEntry); + } + return (child); +} + IOReturn IOHibernateSystemWake(void) { @@ -1582,64 +1677,9 @@ IOHibernateSystemWake(void) // invalidate nvram properties - (gIOOptionsEntry != 0) => nvram was touched -#ifdef __ppc__ - OSData * data = OSData::withCapacity(4); - if (gIOOptionsEntry && data) - { - const OSSymbol * sym = OSSymbol::withCStringNoCopy(kIOHibernateBootImageKey); - if (sym) - { - gIOOptionsEntry->setProperty(sym, data); - sym->release(); - } - sym = OSSymbol::withCStringNoCopy(kIOSelectedBootDeviceKey); - if (sym) - { - if (vars->saveBootDevice) - { - gIOOptionsEntry->setProperty(sym, vars->saveBootDevice); - vars->saveBootDevice->release(); - } - sym->release(); - } - sym = OSSymbol::withCStringNoCopy(kIOHibernateBootImageKeyKey); - if (sym) - { - gIOOptionsEntry->setProperty(sym, data); - sym->release(); - } - sym = OSSymbol::withCStringNoCopy(kIOHibernateMemorySignatureEnvKey); - if (sym) - { - gIOOptionsEntry->removeProperty(sym); - sym->release(); - } - } - if (data) - data->release(); - - if (gIOOptionsEntry) - { - if (!vars->haveFastBoot) - { - // reset boot audio volume - IODTPlatformExpert * platform = OSDynamicCast(IODTPlatformExpert, IOService::getPlatform()); - if (platform) - platform->writeXPRAM(kXPRamAudioVolume, - &vars->saveBootAudioVolume, sizeof(vars->saveBootAudioVolume)); - } - - // sync now to hardware if the booter has not - if (kIOHibernateStateInactive == gIOHibernateState) - gIOOptionsEntry->sync(); - else - // just sync the variables in case a later panic syncs nvram (it won't sync variables) - gIOOptionsEntry->syncOFVariables(); - } -#endif - #if defined(__i386__) || defined(__x86_64__) IOService::getPMRootDomain()->removeProperty(gIOHibernateRTCVariablesKey); + IOService::getPMRootDomain()->removeProperty(kIOHibernateSMCVariablesKey); /* * Hibernate variable is written to NVRAM on 
platforms in which RtcRam @@ -1672,6 +1712,47 @@ IOHibernateSystemWake(void) vars->srcBuffer->release(); if (vars->ioBuffer) vars->ioBuffer->release(); + bzero(&gIOHibernateHandoffPages[0], gIOHibernateHandoffPageCount * sizeof(gIOHibernateHandoffPages[0])); + if (vars->handoffBuffer) + { + IOHibernateHandoff * handoff; + bool done = false; + for (handoff = (IOHibernateHandoff *) vars->handoffBuffer->getBytesNoCopy(); + !done; + handoff = (IOHibernateHandoff *) &handoff->data[handoff->bytecount]) + { +// HIBPRINT("handoff %p, %x, %x\n", handoff, handoff->type, handoff->bytecount); + uint8_t * data = &handoff->data[0]; + switch (handoff->type) + { + case kIOHibernateHandoffTypeEnd: + done = true; + break; + + case kIOHibernateHandoffTypeDeviceTree: + MergeDeviceTree((DeviceTreeNode *) data, IOService::getServiceRoot()); + break; + + case kIOHibernateHandoffTypeKeyStore: +#if defined(__i386__) || defined(__x86_64__) + { + IOBufferMemoryDescriptor * + md = IOBufferMemoryDescriptor::withBytes(data, handoff->bytecount, kIODirectionOutIn); + if (md) + { + IOSetKeyStoreData(md); + } + } +#endif + break; + + default: + done = (kIOHibernateHandoffType != (handoff->type & 0xFFFF0000)); + break; + } + } + vars->handoffBuffer->release(); + } if (vars->fileExtents) vars->fileExtents->release(); @@ -1687,14 +1768,11 @@ IOHibernateSystemPostWake(void) { if (gIOHibernateFileRef) { - // invalidate the image file + // invalidate & close the image file gIOHibernateCurrentHeader->signature = kIOHibernateHeaderInvalidSignature; - int err = kern_write_file(gIOHibernateFileRef, 0, - (caddr_t) gIOHibernateCurrentHeader, sizeof(IOHibernateImageHeader)); - if (KERN_SUCCESS != err) - HIBLOG("kern_write_file(%d)\n", err); - - kern_close_file_for_direct_io(gIOHibernateFileRef); + kern_close_file_for_direct_io(gIOHibernateFileRef, + 0, (caddr_t) gIOHibernateCurrentHeader, + sizeof(IOHibernateImageHeader)); gIOHibernateFileRef = 0; } return (kIOReturnSuccess); @@ -1703,13 +1781,13 @@ 
IOHibernateSystemPostWake(void) /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ SYSCTL_STRING(_kern, OID_AUTO, hibernatefile, - CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN, + CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, gIOHibernateFilename, sizeof(gIOHibernateFilename), ""); SYSCTL_STRING(_kern, OID_AUTO, bootsignature, - CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN, + CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, gIOHibernateBootSignature, sizeof(gIOHibernateBootSignature), ""); SYSCTL_UINT(_kern, OID_AUTO, hibernatemode, - CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN, + CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, &gIOHibernateMode, 0, ""); void @@ -1738,10 +1816,6 @@ IOHibernateSystemInit(IOPMrootDomain * rootDomain) static void hibernate_setup_for_wake(void) { -#if __ppc__ - // go slow (state needed for wake) - ml_set_processor_speed(1); -#endif } /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -1762,6 +1836,34 @@ uint32_t wired_pages_encrypted = 0; uint32_t dirty_pages_encrypted = 0; uint32_t wired_pages_clear = 0; +static void +hibernate_pal_callback(void *vars_arg, vm_offset_t addr) +{ + IOHibernateVars *vars = (IOHibernateVars *)vars_arg; + /* Make sure it's not in either of the save lists */ + hibernate_set_page_state(vars->page_list, vars->page_list_wired, atop_64(addr), 1, kIOHibernatePageStateFree); + + /* Set it in the bitmap of pages owned by the PAL */ + hibernate_page_bitset(vars->page_list_pal, TRUE, atop_64(addr)); +} + +static struct hibernate_cryptvars_t *local_cryptvars; + +extern "C" int +hibernate_pal_write(void *buffer, size_t size) +{ + IOHibernateVars * vars = &gIOHibernateVars; + + IOReturn err = IOPolledFileWrite(vars->fileVars, (const uint8_t *)buffer, size, local_cryptvars); + if (kIOReturnSuccess != err) { + kprintf("epic hibernate fail! 
%d\n", err); + return err; + } + + return 0; +} + + extern "C" uint32_t hibernate_write_image(void) { @@ -1786,9 +1888,11 @@ hibernate_write_image(void) uint32_t tag; uint32_t pageType; uint32_t pageAndCount[2]; + addr64_t phys64; + IOByteCount segLen; AbsoluteTime startTime, endTime; - AbsoluteTime allTime, compTime, decoTime; + AbsoluteTime allTime, compTime; uint64_t compBytes; uint64_t nsec; uint32_t lastProgressStamp = 0; @@ -1809,9 +1913,12 @@ hibernate_write_image(void) kdebug_enable = save_kdebug_enable; KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 1) | DBG_FUNC_START, 0, 0, 0, 0, 0); + IOService::getPMRootDomain()->tracePoint(kIOPMTracePointHibernate); restore1Sum = sum1 = sum2 = 0; + hibernate_pal_prepare(); + #if CRYPTO // encryption data. "iv" is the "initial vector". if (kIOHibernateModeEncrypt & gIOHibernateMode) @@ -1831,6 +1938,9 @@ hibernate_write_image(void) cryptvars = &_cryptvars; bzero(cryptvars, sizeof(hibernate_cryptvars_t)); + for (pageCount = 0; pageCount < sizeof(vars->wiredCryptKey); pageCount++) + vars->wiredCryptKey[pageCount] ^= vars->volumeCryptKey[pageCount]; + bzero(&vars->volumeCryptKey[0], sizeof(vars->volumeCryptKey)); aes_encrypt_key(vars->wiredCryptKey, kIOHibernateAESKeySize, &cryptvars->ctx.encrypt); @@ -1838,7 +1948,8 @@ hibernate_write_image(void) bcopy(&first_iv[0], &cryptvars->aes_iv[0], AES_BLOCK_SIZE); bzero(&vars->wiredCryptKey[0], sizeof(vars->wiredCryptKey)); bzero(&vars->cryptKey[0], sizeof(vars->cryptKey)); - bzero(gIOHibernateCryptWakeVars, sizeof(hibernate_cryptwakevars_t)); + + local_cryptvars = cryptvars; } #endif /* CRYPTO */ @@ -1846,6 +1957,7 @@ hibernate_write_image(void) hibernate_page_list_setall(vars->page_list, vars->page_list_wired, + vars->page_list_pal, &pageCount); HIBLOG("hibernate_page_list_setall found pageCount %d\n", pageCount); @@ -1863,9 +1975,8 @@ hibernate_write_image(void) #endif needEncrypt = (0 != (kIOHibernateModeEncrypt & gIOHibernateMode)); - AbsoluteTime_to_scalar(&compTime) = 0; 
- AbsoluteTime_to_scalar(&decoTime) = 0; + compBytes = 0; clock_get_uptime(&allTime); IOService::getPMRootDomain()->pmStatsRecordEvent( @@ -1901,18 +2012,26 @@ hibernate_write_image(void) uintptr_t hibernateBase; uintptr_t hibernateEnd; -#if defined(__i386__) || defined(__x86_64__) - hibernateBase = sectINITPTB; -#else - hibernateBase = sectHIBB; -#endif + hibernateBase = HIB_BASE; /* Defined in PAL headers */ hibernateEnd = (sectHIBB + sectSizeHIB); + // copy out restore1 code - - page = atop_32(hibernateBase); - count = atop_32(round_page(hibernateEnd)) - page; - header->restore1CodePage = page; + + for (count = 0; + (phys64 = vars->handoffBuffer->getPhysicalSegment(count, &segLen, kIOMemoryMapperNone)); + count += segLen) + { + for (pagesDone = 0; pagesDone < atop_32(segLen); pagesDone++) + { + gIOHibernateHandoffPages[atop_32(count) + pagesDone] = atop_64(phys64) + pagesDone; + } + } + + page = atop_32(kvtophys(hibernateBase)); + count = atop_32(round_page(hibernateEnd) - hibernateBase); + header->restore1CodePhysPage = page; + header->restore1CodeVirt = hibernateBase; header->restore1PageCount = count; header->restore1CodeOffset = ((uintptr_t) &hibernate_machine_entrypoint) - hibernateBase; header->restore1StackOffset = ((uintptr_t) &gIOHibernateRestoreStackEnd[0]) - 64 - hibernateBase; @@ -1922,7 +2041,7 @@ hibernate_write_image(void) for (page = 0; page < count; page++) { if ((src < &gIOHibernateRestoreStack[0]) || (src >= &gIOHibernateRestoreStackEnd[0])) - restore1Sum += hibernate_sum_page(src, header->restore1CodePage + page); + restore1Sum += hibernate_sum_page(src, header->restore1CodeVirt + page); else restore1Sum += 0x00000000; src += page_size; @@ -1956,9 +2075,6 @@ hibernate_write_image(void) // write the preview buffer - addr64_t phys64; - IOByteCount segLen; - if (vars->previewBuffer) { ppnum = 0; @@ -2031,8 +2147,9 @@ hibernate_write_image(void) hibernate_page_list_set_volatile(vars->page_list, vars->page_list_wired, &pageCount); - page = 
atop_32(hibernateBase); - count = atop_32(round_page(hibernateEnd)) - page; + + page = atop_32(KERNEL_IMAGE_TO_PHYS(hibernateBase)); + count = atop_32(round_page(KERNEL_IMAGE_TO_PHYS(hibernateEnd))) - page; hibernate_set_page_state(vars->page_list, vars->page_list_wired, page, count, kIOHibernatePageStateFree); @@ -2048,10 +2165,22 @@ hibernate_write_image(void) pageCount -= atop_32(segLen); } + for (count = 0; + (phys64 = vars->handoffBuffer->getPhysicalSegment(count, &segLen, kIOMemoryMapperNone)); + count += segLen) + { + hibernate_set_page_state(vars->page_list, vars->page_list_wired, + atop_64(phys64), atop_32(segLen), + kIOHibernatePageStateFree); + pageCount -= atop_32(segLen); + } + + (void)hibernate_pal_callback; + src = (uint8_t *) vars->srcBuffer->getBytesNoCopy(); - pagesDone = 0; - lastBlob = 0; + pagesDone = 0; + lastBlob = 0; HIBLOG("writing %d pages\n", pageCount); @@ -2254,9 +2383,9 @@ hibernate_write_image(void) else header->fileExtentMapSize = sizeof(header->fileExtentMap); bcopy(&fileExtents[0], &header->fileExtentMap[0], count); - - header->deviceBase = vars->fileVars->block0; + header->deviceBase = vars->fileVars->block0; + IOPolledFileSeek(vars->fileVars, 0); err = IOPolledFileWrite(vars->fileVars, (uint8_t *) header, sizeof(IOHibernateImageHeader), @@ -2283,12 +2412,16 @@ hibernate_write_image(void) nsec / 1000000ULL); absolutetime_to_nanoseconds(compTime, &nsec); - HIBLOG("comp time: %qd ms, ", - nsec / 1000000ULL); + HIBLOG("comp bytes: %qd time: %qd ms %qd Mb/s, ", + compBytes, + nsec / 1000000ULL, + nsec ? (((compBytes * 1000000000ULL) / 1024 / 1024) / nsec) : 0); - absolutetime_to_nanoseconds(decoTime, &nsec); - HIBLOG("deco time: %qd ms, ", - nsec / 1000000ULL); + absolutetime_to_nanoseconds(vars->fileVars->cryptTime, &nsec); + HIBLOG("crypt bytes: %qd time: %qd ms %qd Mb/s, ", + vars->fileVars->cryptBytes, + nsec / 1000000ULL, + nsec ? 
(((vars->fileVars->cryptBytes * 1000000000ULL) / 1024 / 1024) / nsec) : 0); HIBLOG("\nimage %qd, uncompressed %qd (%d), compressed %qd (%d%%), sum1 %x, sum2 %x\n", header->imageSize, @@ -2353,7 +2486,9 @@ hibernate_machine_init(void) uint32_t sum; uint32_t pagesDone; uint32_t pagesRead = 0; + AbsoluteTime startTime, compTime; AbsoluteTime allTime, endTime; + uint64_t compBytes; uint64_t nsec; uint32_t lastProgressStamp = 0; uint32_t progressStamp; @@ -2381,7 +2516,7 @@ hibernate_machine_init(void) HIBPRINT("diag %x %x %x %x\n", gIOHibernateCurrentHeader->diag[0], gIOHibernateCurrentHeader->diag[1], - gIOHibernateCurrentHeader->diag[2], gIOHibernateCurrentHeader->diag[3]); + gIOHibernateCurrentHeader->diag[2], gIOHibernateCurrentHeader->diag[3]); HIBPRINT("video %x %d %d %d status %x\n", gIOHibernateGraphicsInfo->physicalAddress, gIOHibernateGraphicsInfo->depth, @@ -2392,6 +2527,62 @@ hibernate_machine_init(void) boot_args *args = (boot_args *) PE_state.bootArgs; + cryptvars = (kIOHibernateModeEncrypt & gIOHibernateMode) ? 
&gIOHibernateCryptWakeContext : 0; + + if (gIOHibernateCurrentHeader->handoffPageCount > gIOHibernateHandoffPageCount) + panic("handoff overflow"); + + IOHibernateHandoff * handoff; + bool done = false; + bool foundCryptData = false; + + for (handoff = (IOHibernateHandoff *) vars->handoffBuffer->getBytesNoCopy(); + !done; + handoff = (IOHibernateHandoff *) &handoff->data[handoff->bytecount]) + { +// HIBPRINT("handoff %p, %x, %x\n", handoff, handoff->type, handoff->bytecount); + uint8_t * data = &handoff->data[0]; + switch (handoff->type) + { + case kIOHibernateHandoffTypeEnd: + done = true; + break; + + case kIOHibernateHandoffTypeGraphicsInfo: + bcopy(data, gIOHibernateGraphicsInfo, sizeof(*gIOHibernateGraphicsInfo)); + break; + + case kIOHibernateHandoffTypeCryptVars: + if (cryptvars) + { + hibernate_cryptwakevars_t * + wakevars = (hibernate_cryptwakevars_t *) &handoff->data[0]; + bcopy(&wakevars->aes_iv[0], &cryptvars->aes_iv[0], sizeof(cryptvars->aes_iv)); + } + foundCryptData = true; + bzero(data, handoff->bytecount); + break; + + case kIOHibernateHandoffTypeMemoryMap: + hibernate_newruntime_map(data, handoff->bytecount, + gIOHibernateCurrentHeader->systemTableOffset); + break; + + case kIOHibernateHandoffTypeDeviceTree: + { +// DTEntry chosen = NULL; +// HIBPRINT("DTLookupEntry %d\n", DTLookupEntry((const DTEntry) data, "/chosen", &chosen)); + } + break; + + default: + done = (kIOHibernateHandoffType != (handoff->type & 0xFFFF0000)); + break; + } + } + if (cryptvars && !foundCryptData) + panic("hibernate handoff"); + if (vars->videoMapping && gIOHibernateGraphicsInfo->physicalAddress && (args->Video.v_baseAddr == gIOHibernateGraphicsInfo->physicalAddress)) @@ -2404,21 +2595,11 @@ hibernate_machine_init(void) } uint8_t * src = (uint8_t *) vars->srcBuffer->getBytesNoCopy(); - - if (gIOHibernateWakeMapSize) - { - err = IOMemoryDescriptorWriteFromPhysical(vars->srcBuffer, 0, ptoa_64(gIOHibernateWakeMap), - gIOHibernateWakeMapSize); - if (kIOReturnSuccess == err) 
- hibernate_newruntime_map(src, gIOHibernateWakeMapSize, - gIOHibernateCurrentHeader->systemTableOffset); - gIOHibernateWakeMap = 0; - gIOHibernateWakeMapSize = 0; - } - uint32_t decoOffset; clock_get_uptime(&allTime); + AbsoluteTime_to_scalar(&compTime) = 0; + compBytes = 0; HIBLOG("IOHibernatePollerOpen(), ml_get_interrupts_enabled %d\n", ml_get_interrupts_enabled()); err = IOHibernatePollerOpen(vars->fileVars, kIOPolledAfterSleepState, 0); @@ -2439,23 +2620,17 @@ hibernate_machine_init(void) ProgressUpdate(gIOHibernateGraphicsInfo, (uint8_t *) vars->videoMapping, 0, lastBlob); } - cryptvars = (kIOHibernateModeEncrypt & gIOHibernateMode) ? &gIOHibernateCryptWakeContext : 0; - if (kIOHibernateModeEncrypt & gIOHibernateMode) - { - cryptvars = &gIOHibernateCryptWakeContext; - bcopy(&gIOHibernateCryptWakeVars->aes_iv[0], - &cryptvars->aes_iv[0], - sizeof(cryptvars->aes_iv)); - } - // kick off the read ahead vars->fileVars->io = false; vars->fileVars->bufferHalf = 0; vars->fileVars->bufferLimit = 0; vars->fileVars->lastRead = 0; + vars->fileVars->readEnd = gIOHibernateCurrentHeader->imageSize; vars->fileVars->bufferOffset = vars->fileVars->bufferLimit; + vars->fileVars->cryptBytes = 0; + AbsoluteTime_to_scalar(&vars->fileVars->cryptTime) = 0; - IOPolledFileRead(vars->fileVars, 0, 0, cryptvars); + err = IOPolledFileRead(vars->fileVars, 0, 0, cryptvars); vars->fileVars->bufferOffset = vars->fileVars->bufferLimit; // -- @@ -2464,7 +2639,7 @@ hibernate_machine_init(void) uint32_t * header = (uint32_t *) src; sum = 0; - do + while (kIOReturnSuccess == err) { unsigned int count; unsigned int page; @@ -2510,7 +2685,14 @@ hibernate_machine_init(void) if (compressedSize < page_size) { decoOffset = page_size; + + clock_get_uptime(&startTime); WKdm_decompress((WK_word*) src, (WK_word*) (src + decoOffset), PAGE_SIZE_IN_WORDS); + clock_get_uptime(&endTime); + ADD_ABSOLUTETIME(&compTime, &endTime); + SUB_ABSOLUTETIME(&compTime, &startTime); + + compBytes += page_size; } else 
decoOffset = 0; @@ -2554,7 +2736,8 @@ hibernate_machine_init(void) } } } - while (true); + if (pagesDone == gIOHibernateCurrentHeader->actualUncompressedPages) + err = kIOReturnLockedRead; if (kIOReturnSuccess != err) panic("Hibernate restore error %x", err); @@ -2580,10 +2763,22 @@ hibernate_machine_init(void) SUB_ABSOLUTETIME(&endTime, &allTime); absolutetime_to_nanoseconds(endTime, &nsec); - HIBLOG("hibernate_machine_init pagesDone %d sum2 %x, time: %qd ms\n", + HIBLOG("hibernate_machine_init pagesDone %d sum2 %x, time: %qd ms, ", pagesDone, sum, nsec / 1000000ULL); - - KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 2) | DBG_FUNC_NONE, pagesRead, pagesDone, 0, 0, 0); + + absolutetime_to_nanoseconds(compTime, &nsec); + HIBLOG("comp bytes: %qd time: %qd ms %qd Mb/s, ", + compBytes, + nsec / 1000000ULL, + nsec ? (((compBytes * 1000000000ULL) / 1024 / 1024) / nsec) : 0); + + absolutetime_to_nanoseconds(vars->fileVars->cryptTime, &nsec); + HIBLOG("crypt bytes: %qd time: %qd ms %qd Mb/s\n", + vars->fileVars->cryptBytes, + nsec / 1000000ULL, + nsec ? 
(((vars->fileVars->cryptBytes * 1000000000ULL) / 1024 / 1024) / nsec) : 0); + + KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 2) | DBG_FUNC_NONE, pagesRead, pagesDone, 0, 0, 0); } /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ diff --git a/iokit/Kernel/IOHibernateInternal.h b/iokit/Kernel/IOHibernateInternal.h index 2b2e5802e..7e7e95fe6 100644 --- a/iokit/Kernel/IOHibernateInternal.h +++ b/iokit/Kernel/IOHibernateInternal.h @@ -36,8 +36,10 @@ struct IOHibernateVars { hibernate_page_list_t * page_list; hibernate_page_list_t * page_list_wired; + hibernate_page_list_t * page_list_pal; class IOBufferMemoryDescriptor * ioBuffer; class IOBufferMemoryDescriptor * srcBuffer; + class IOBufferMemoryDescriptor * handoffBuffer; class IOMemoryDescriptor * previewBuffer; OSData * previewData; OSData * fileExtents; @@ -52,6 +54,7 @@ struct IOHibernateVars uint8_t saveBootAudioVolume; uint8_t wiredCryptKey[kIOHibernateAESKeySize / 8]; uint8_t cryptKey[kIOHibernateAESKeySize / 8]; + uint8_t volumeCryptKey[kIOHibernateAESKeySize / 8]; }; typedef struct IOHibernateVars IOHibernateVars; @@ -68,12 +71,15 @@ struct IOPolledFileIOVars IOByteCount bufferHalf; IOByteCount extentRemaining; IOByteCount lastRead; - boolean_t solid_state; + IOByteCount readEnd; + uint32_t flags; uint64_t block0; uint64_t position; uint64_t extentPosition; uint64_t encryptStart; uint64_t encryptEnd; + uint64_t cryptBytes; + AbsoluteTime cryptTime; IOPolledFileExtent * extentMap; IOPolledFileExtent * currentExtent; bool io; @@ -103,6 +109,5 @@ extern unsigned long sectSizeDATA; extern vm_offset_t sectINITPTB; #endif -extern vm_offset_t gIOHibernateWakeMap; // ppnum -extern vm_size_t gIOHibernateWakeMapSize; - +extern ppnum_t gIOHibernateHandoffPages[]; +extern uint32_t gIOHibernateHandoffPageCount; diff --git a/iokit/Kernel/IOHibernateRestoreKernel.c b/iokit/Kernel/IOHibernateRestoreKernel.c index 280b8c430..7259ab3ec 100644 --- a/iokit/Kernel/IOHibernateRestoreKernel.c +++ 
b/iokit/Kernel/IOHibernateRestoreKernel.c @@ -35,9 +35,13 @@ #include #include -#include "WKdm.h" +#include #include "IOHibernateInternal.h" +#if defined(__i386__) || defined(__x86_64__) +#include +#endif + /* This code is linked into the kernel but part of the "__HIB" section, which means its used by code running in the special context of restoring the kernel text and data @@ -52,14 +56,15 @@ uint32_t gIOHibernateDebugFlags; static IOHibernateImageHeader _hibernateHeader; IOHibernateImageHeader * gIOHibernateCurrentHeader = &_hibernateHeader; -static hibernate_graphics_t _hibernateGraphics; -hibernate_graphics_t * gIOHibernateGraphicsInfo = &_hibernateGraphics; - -static hibernate_cryptwakevars_t _cryptWakeVars; -hibernate_cryptwakevars_t * gIOHibernateCryptWakeVars = &_cryptWakeVars; +ppnum_t gIOHibernateHandoffPages[64]; +uint32_t gIOHibernateHandoffPageCount = sizeof(gIOHibernateHandoffPages) + / sizeof(gIOHibernateHandoffPages[0]); -vm_offset_t gIOHibernateWakeMap; // ppnum -vm_size_t gIOHibernateWakeMapSize; +#if CONFIG_DEBUG +void hibprintf(const char *fmt, ...); +#else +#define hibprintf(x...) 
+#endif #if CONFIG_SLEEP @@ -148,7 +153,7 @@ static void uart_puthex(uint64_t num) c = 0xf & (num >> bit); if (c) leading = false; - else if (leading) + else if (leading && bit) continue; if (c <= 9) c += '0'; @@ -333,7 +338,7 @@ hibernate_page_bitmap_count(hibernate_bitmap_t * bitmap, uint32_t set, uint32_t return (count); } -static vm_offset_t +static ppnum_t hibernate_page_list_grab(hibernate_page_list_t * list, uint32_t * pNextFree) { uint32_t nextFree = *pNextFree; @@ -365,27 +370,19 @@ static uint32_t store_one_page(uint32_t procFlags, uint32_t * src, uint32_t compressedSize, uint32_t * buffer, uint32_t ppnum) { - uint64_t dst; - uint32_t sum; - - dst = ptoa_64(ppnum); - if (ppnum < 0x00100000) - buffer = (uint32_t *) (uintptr_t) dst; + uint64_t dst = ptoa_64(ppnum); - if (compressedSize != PAGE_SIZE) - { - WKdm_decompress((WK_word*) src, (WK_word*) buffer, PAGE_SIZE >> 2); - src = buffer; - } - - sum = hibernate_sum_page((uint8_t *) src, ppnum); - - if (((uint64_t) (uintptr_t) src) == dst) - src = 0; - - hibernate_restore_phys_page((uint64_t) (uintptr_t) src, dst, PAGE_SIZE, procFlags); + if (compressedSize != PAGE_SIZE) + { + dst = pal_hib_map(DEST_COPY_AREA, dst); + WKdm_decompress((WK_word*) src, (WK_word*)(uintptr_t)dst, PAGE_SIZE >> 2); + } + else + { + dst = hibernate_restore_phys_page((uint64_t) (uintptr_t) src, dst, PAGE_SIZE, procFlags); + } - return (sum); + return hibernate_sum_page((uint8_t *)(uintptr_t)dst, ppnum); } // used only for small struct copies @@ -411,9 +408,10 @@ hibernate_kernel_entrypoint(IOHibernateImageHeader * header, { uint32_t idx; uint32_t * src; - uint32_t * buffer; + uint32_t * imageReadPos; uint32_t * pageIndexSource; hibernate_page_list_t * map; + uint32_t stage; uint32_t count; uint32_t ppnum; uint32_t page; @@ -424,10 +422,13 @@ hibernate_kernel_entrypoint(IOHibernateImageHeader * header, uint32_t * copyPageList; uint32_t copyPageIndex; uint32_t sum; + uint32_t pageSum; uint32_t nextFree; uint32_t lastImagePage; uint32_t 
lastMapPage; uint32_t lastPageIndexPage; + uint32_t handoffPages; + uint32_t handoffPageCount; C_ASSERT(sizeof(IOHibernateImageHeader) == 512); @@ -440,84 +441,43 @@ hibernate_kernel_entrypoint(IOHibernateImageHeader * header, gIOHibernateCurrentHeader, sizeof(IOHibernateImageHeader)); - if (!p2) - { - count = header->graphicsInfoOffset; - if (count) - p2 = (void *)(((uintptr_t) header) - count); - } - if (p2) - bcopy_internal(p2, - gIOHibernateGraphicsInfo, - sizeof(hibernate_graphics_t)); - else - gIOHibernateGraphicsInfo->physicalAddress = gIOHibernateGraphicsInfo->depth = 0; - - if (!p3) - { - count = header->cryptVarsOffset; - if (count) - p3 = (void *)(((uintptr_t) header) - count); - } - if (p3) - bcopy_internal(p3, - gIOHibernateCryptWakeVars, - sizeof(hibernate_cryptwakevars_t)); - - src = (uint32_t *) + map = (hibernate_page_list_t *) (((uintptr_t) &header->fileExtentMap[0]) + header->fileExtentMapSize - + ptoa_32(header->restore1PageCount)); - - if (header->previewSize) - { - pageIndexSource = src; - map = (hibernate_page_list_t *)(((uintptr_t) pageIndexSource) + header->previewSize); - src = (uint32_t *) (((uintptr_t) pageIndexSource) + header->previewPageListSize); - } - else - { - pageIndexSource = 0; - map = (hibernate_page_list_t *) src; - src = (uint32_t *) (((uintptr_t) map) + header->bitmapSize); - } - - lastPageIndexPage = atop_32((uintptr_t) src); + + ptoa_32(header->restore1PageCount) + + header->previewSize); lastImagePage = atop_32(((uintptr_t) header) + header->image1Size); lastMapPage = atop_32(((uintptr_t) map) + header->bitmapSize); + handoffPages = header->handoffPages; + handoffPageCount = header->handoffPageCount; + debug_code(kIOHibernateRestoreCodeImageEnd, ptoa_64(lastImagePage)); - debug_code(kIOHibernateRestoreCodePageIndexStart, (uintptr_t) pageIndexSource); - debug_code(kIOHibernateRestoreCodePageIndexEnd, ptoa_64(lastPageIndexPage)); debug_code(kIOHibernateRestoreCodeMapStart, (uintptr_t) map); 
debug_code(kIOHibernateRestoreCodeMapEnd, ptoa_64(lastMapPage)); + debug_code('hand', ptoa_64(handoffPages)); + debug_code('hnde', ptoa_64(handoffPageCount)); + // knock all the image pages to be used out of free map for (ppnum = atop_32((uintptr_t) header); ppnum <= lastImagePage; ppnum++) { hibernate_page_bitset(map, FALSE, ppnum); } + // knock all the handoff pages to be used out of free map + for (ppnum = handoffPages; ppnum < (handoffPages + handoffPageCount); ppnum++) + { + hibernate_page_bitset(map, FALSE, ppnum); + } nextFree = 0; hibernate_page_list_grab(map, &nextFree); - buffer = (uint32_t *) (uintptr_t) ptoa_32(hibernate_page_list_grab(map, &nextFree)); - if (header->memoryMapSize && (count = header->memoryMapOffset)) - { - p4 = (void *)(((uintptr_t) header) - count); - gIOHibernateWakeMap = hibernate_page_list_grab(map, &nextFree); - gIOHibernateWakeMapSize = header->memoryMapSize; - debug_code(kIOHibernateRestoreCodeWakeMapSize, gIOHibernateWakeMapSize); - if (gIOHibernateWakeMapSize > PAGE_SIZE) - fatal(); - bcopy_internal(p4, (void *) (uintptr_t) ptoa_32(gIOHibernateWakeMap), gIOHibernateWakeMapSize); - } - else - gIOHibernateWakeMapSize = 0; + pal_hib_window_setup(hibernate_page_list_grab(map, &nextFree)); - sum = gIOHibernateCurrentHeader->actualRestore1Sum; + sum = header->actualRestore1Sum; gIOHibernateCurrentHeader->diag[0] = (uint32_t)(uintptr_t) header; gIOHibernateCurrentHeader->diag[1] = sum; @@ -528,54 +488,110 @@ hibernate_kernel_entrypoint(IOHibernateImageHeader * header, copyPageIndex = PAGE_SIZE >> 2; compressedSize = PAGE_SIZE; + stage = 2; + count = 0; + src = NULL; + + if (gIOHibernateCurrentHeader->previewSize) + { + pageIndexSource = (uint32_t *) + (((uintptr_t) &header->fileExtentMap[0]) + + gIOHibernateCurrentHeader->fileExtentMapSize + + ptoa_32(gIOHibernateCurrentHeader->restore1PageCount)); + imageReadPos = (uint32_t *) (((uintptr_t) pageIndexSource) + gIOHibernateCurrentHeader->previewPageListSize); + lastPageIndexPage = 
atop_32((uintptr_t) imageReadPos); + } + else + { + pageIndexSource = NULL; + lastPageIndexPage = 0; + imageReadPos = (uint32_t *) (((uintptr_t) map) + gIOHibernateCurrentHeader->bitmapSize); + } + + debug_code(kIOHibernateRestoreCodePageIndexStart, (uintptr_t) pageIndexSource); + debug_code(kIOHibernateRestoreCodePageIndexEnd, ptoa_64(lastPageIndexPage)); while (1) { - if (pageIndexSource) - { - ppnum = pageIndexSource[0]; - count = pageIndexSource[1]; - pageIndexSource += 2; - if (!count) - { - pageIndexSource = 0; - src = (uint32_t *) (((uintptr_t) map) + gIOHibernateCurrentHeader->bitmapSize); - ppnum = src[0]; - count = src[1]; - src += 2; - } - } - else - { - ppnum = src[0]; - count = src[1]; - if (!count) - break; - src += 2; + switch (stage) + { + case 2: + // copy handoff data + count = src ? 0 : handoffPageCount; + if (!count) + break; + if (count > gIOHibernateHandoffPageCount) + count = gIOHibernateHandoffPageCount; + src = (uint32_t *) (uintptr_t) ptoa_64(handoffPages); + break; + + case 1: + // copy pageIndexSource pages == preview image data + if (!src) + { + if (!pageIndexSource) + break; + src = imageReadPos; + } + ppnum = pageIndexSource[0]; + count = pageIndexSource[1]; + pageIndexSource += 2; + imageReadPos = src; + break; + + case 0: + // copy pages + if (!src) + { + src = (uint32_t *) (((uintptr_t) map) + gIOHibernateCurrentHeader->bitmapSize); + } + ppnum = src[0]; + count = src[1]; + src += 2; + imageReadPos = src; + break; + } + + + if (!count) + { + if (!stage) + break; + stage--; + src = NULL; + continue; } for (page = 0; page < count; page++, ppnum++) { - uint32_t tag; + uint32_t tag; int conflicts; - if (!pageIndexSource) - { - tag = *src++; - compressedSize = kIOHibernateTagLength & tag; - } + if (2 == stage) + ppnum = gIOHibernateHandoffPages[page]; + else if (!stage) + { + tag = *src++; + compressedSize = kIOHibernateTagLength & tag; + } + + conflicts = (ppnum >= atop_32((uintptr_t) map)) && (ppnum <= lastMapPage); - conflicts = 
(((ppnum >= atop_32((uintptr_t) map)) && (ppnum <= lastMapPage)) - || ((ppnum >= atop_32((uintptr_t) src)) && (ppnum <= lastImagePage))); + conflicts |= ((ppnum >= atop_32((uintptr_t) imageReadPos)) && (ppnum <= lastImagePage)); - if (pageIndexSource) - conflicts |= ((ppnum >= atop_32((uintptr_t) pageIndexSource)) && (ppnum <= lastPageIndexPage)); + if (stage >= 2) + conflicts |= ((ppnum >= atop_32((uintptr_t) src)) && (ppnum <= (handoffPages + handoffPageCount - 1))); + + if (stage >= 1) + conflicts |= ((ppnum >= atop_32((uintptr_t) pageIndexSource)) && (ppnum <= lastPageIndexPage)); if (!conflicts) { - if (compressedSize) - sum += store_one_page(gIOHibernateCurrentHeader->processorFlags, - src, compressedSize, buffer, ppnum); +// if (compressedSize) + pageSum = store_one_page(gIOHibernateCurrentHeader->processorFlags, + src, compressedSize, 0, ppnum); + if (stage != 2) + sum += pageSum; uncompressedPages++; } else @@ -596,46 +612,59 @@ hibernate_kernel_entrypoint(IOHibernateImageHeader * header, // alloc new copy list page uint32_t pageListPage = hibernate_page_list_grab(map, &nextFree); // link to current - if (copyPageList) - copyPageList[1] = pageListPage; - else - copyPageListHead = pageListPage; - copyPageList = (uint32_t *) (uintptr_t) ptoa_32(pageListPage); + if (copyPageList) { + copyPageList[1] = pageListPage; + } else { + copyPageListHead = pageListPage; + } + copyPageList = (uint32_t *)pal_hib_map(SRC_COPY_AREA, + ptoa_32(pageListPage)); copyPageList[1] = 0; copyPageIndex = 2; } copyPageList[copyPageIndex++] = ppnum; copyPageList[copyPageIndex++] = bufferPage; - copyPageList[copyPageIndex++] = compressedSize; + copyPageList[copyPageIndex++] = (compressedSize | (stage << 24)); copyPageList[0] = copyPageIndex; - dst = (uint32_t *) (uintptr_t) ptoa_32(bufferPage); + dst = (uint32_t *)pal_hib_map(DEST_COPY_AREA, ptoa_32(bufferPage)); for (idx = 0; idx < ((compressedSize + 3) >> 2); idx++) - dst[idx] = src[idx]; + dst[idx] = src[idx]; } src += 
((compressedSize + 3) >> 2); } } + /* src points to the last page restored, so we need to skip over that */ + hibernateRestorePALState(src); + // -- copy back conflicts copyPageList = (uint32_t *)(uintptr_t) ptoa_32(copyPageListHead); + while (copyPageList) { + copyPageList = (uint32_t *)pal_hib_map(COPY_PAGE_AREA, (uintptr_t)copyPageList); for (copyPageIndex = 2; copyPageIndex < copyPageList[0]; copyPageIndex += 3) { - ppnum = copyPageList[copyPageIndex + 0]; - src = (uint32_t *) (uintptr_t) ptoa_32(copyPageList[copyPageIndex + 1]); - compressedSize = copyPageList[copyPageIndex + 2]; - - sum += store_one_page(gIOHibernateCurrentHeader->processorFlags, - src, compressedSize, buffer, ppnum); + ppnum = copyPageList[copyPageIndex + 0]; + src = (uint32_t *) (uintptr_t) ptoa_32(copyPageList[copyPageIndex + 1]); + src = (uint32_t *)pal_hib_map(SRC_COPY_AREA, (uintptr_t)src); + compressedSize = copyPageList[copyPageIndex + 2]; + stage = compressedSize >> 24; + compressedSize &= 0x1FFF; + pageSum = store_one_page(gIOHibernateCurrentHeader->processorFlags, + src, compressedSize, 0, ppnum); + if (stage != 2) + sum += pageSum; uncompressedPages++; } copyPageList = (uint32_t *) (uintptr_t) ptoa_32(copyPageList[1]); } + pal_hib_patchup(); + // -- image has been destroyed... 
gIOHibernateCurrentHeader->actualImage1Sum = sum; @@ -646,16 +675,10 @@ hibernate_kernel_entrypoint(IOHibernateImageHeader * header, gIOHibernateState = kIOHibernateStateWakingFromHibernate; #if CONFIG_SLEEP -#if defined(__ppc__) - typedef void (*ResetProc)(void); - ResetProc proc; - proc = (ResetProc) 0x100; - __asm__ volatile("ori 0, 0, 0" : : ); - proc(); -#elif defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) typedef void (*ResetProc)(void); ResetProc proc; - proc = (ResetProc) acpi_wake_prot_entry; + proc = HIB_ENTRYPOINT; // flush caches __asm__("wbinvd"); proc(); @@ -666,3 +689,445 @@ hibernate_kernel_entrypoint(IOHibernateImageHeader * header, return -1; } + +#if CONFIG_DEBUG +/* standalone printf implementation */ +/*- + * Copyright (c) 1986, 1988, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)subr_prf.c 8.3 (Berkeley) 1/21/94 + */ + +typedef long ptrdiff_t; +char const hibhex2ascii_data[] = "0123456789abcdefghijklmnopqrstuvwxyz"; +#define hibhex2ascii(hex) (hibhex2ascii_data[hex]) +#define toupper(c) ((c) - 0x20 * (((c) >= 'a') && ((c) <= 'z'))) +static size_t +hibstrlen(const char *s) +{ + size_t l = 0; + while (*s++) + l++; + return l; +} + +/* Max number conversion buffer length: a u_quad_t in base 2, plus NUL byte. */ +#define MAXNBUF (sizeof(intmax_t) * NBBY + 1) + +/* + * Put a NUL-terminated ASCII number (base <= 36) in a buffer in reverse + * order; return an optional length and a pointer to the last character + * written in the buffer (i.e., the first character of the string). + * The buffer pointed to by `nbuf' must have length >= MAXNBUF. + */ +static char * +ksprintn(char *nbuf, uintmax_t num, int base, int *lenp, int upper) +{ + char *p, c; + + /* Truncate so we don't call umoddi3, which isn't in __HIB */ +#if !defined(__LP64__) + uint32_t num2 = (uint32_t) num; +#else + uintmax_t num2 = num; +#endif + + p = nbuf; + *p = '\0'; + do { + c = hibhex2ascii(num2 % base); + *++p = upper ? 
toupper(c) : c; + } while (num2 /= base); + if (lenp) + *lenp = (int)(p - nbuf); + return (p); +} + +/* + * Scaled down version of printf(3). + * + * Two additional formats: + * + * The format %b is supported to decode error registers. + * Its usage is: + * + * printf("reg=%b\n", regval, "<base><arg>*"); + * + * where <base> is the output base expressed as a control character, e.g. + * \10 gives octal; \20 gives hex. Each arg is a sequence of characters, + * the first of which gives the bit number to be inspected (origin 1), and + * the next characters (up to a control character, i.e. a character <= 32), + * give the name of the register. Thus: + * + * kvprintf("reg=%b\n", 3, "\10\2BITTWO\1BITONE\n"); + * + * would produce output: + * + * reg=3<BITTWO,BITONE> + * + * XXX: %D -- Hexdump, takes pointer and separator string: + * ("%6D", ptr, ":") -> XX:XX:XX:XX:XX:XX + * ("%*D", len, ptr, " ") -> XX XX XX XX ... + */ +static int +hibkvprintf(char const *fmt, void (*func)(int, void*), void *arg, int radix, va_list ap) +{ +#define PCHAR(c) {int cc=(c); if (func) (*func)(cc,arg); else *d++ = cc; retval++; } + char nbuf[MAXNBUF]; + char *d; + const char *p, *percent, *q; + u_char *up; + int ch, n; + uintmax_t num; + int base, lflag, qflag, tmp, width, ladjust, sharpflag, neg, sign, dot; + int cflag, hflag, jflag, tflag, zflag; + int dwidth, upper; + char padc; + int stop = 0, retval = 0; + + num = 0; + if (!func) + d = (char *) arg; + else + d = NULL; + + if (fmt == NULL) + fmt = "(fmt null)\n"; + + if (radix < 2 || radix > 36) + radix = 10; + + for (;;) { + padc = ' '; + width = 0; + while ((ch = (u_char)*fmt++) != '%' || stop) { + if (ch == '\0') + return (retval); + PCHAR(ch); + } + percent = fmt - 1; + qflag = 0; lflag = 0; ladjust = 0; sharpflag = 0; neg = 0; + sign = 0; dot = 0; dwidth = 0; upper = 0; + cflag = 0; hflag = 0; jflag = 0; tflag = 0; zflag = 0; +reswitch: switch (ch = (u_char)*fmt++) { + case '.': + dot = 1; + goto reswitch; + case '#': + sharpflag = 1; + goto reswitch; + case '+': + 
sign = 1; + goto reswitch; + case '-': + ladjust = 1; + goto reswitch; + case '%': + PCHAR(ch); + break; + case '*': + if (!dot) { + width = va_arg(ap, int); + if (width < 0) { + ladjust = !ladjust; + width = -width; + } + } else { + dwidth = va_arg(ap, int); + } + goto reswitch; + case '0': + if (!dot) { + padc = '0'; + goto reswitch; + } /* FALLTHROUGH: after '.', a '0' is the first digit of the precision */ + case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + for (n = 0;; ++fmt) { + n = n * 10 + ch - '0'; + ch = *fmt; + if (ch < '0' || ch > '9') + break; + } + if (dot) + dwidth = n; + else + width = n; + goto reswitch; + case 'b': + num = (u_int)va_arg(ap, int); + p = va_arg(ap, char *); + for (q = ksprintn(nbuf, num, *p++, NULL, 0); *q;) + PCHAR(*q--); + + if (num == 0) + break; + + for (tmp = 0; *p;) { + n = *p++; + if (num & (1 << (n - 1))) { + PCHAR(tmp ? ',' : '<'); + for (; (n = *p) > ' '; ++p) + PCHAR(n); + tmp = 1; + } else + for (; *p > ' '; ++p) + continue; + } + if (tmp) + PCHAR('>'); + break; + case 'c': + PCHAR(va_arg(ap, int)); + break; + case 'D': + up = va_arg(ap, u_char *); + p = va_arg(ap, char *); + if (!width) + width = 16; + while(width--) { + PCHAR(hibhex2ascii(*up >> 4)); + PCHAR(hibhex2ascii(*up & 0x0f)); + up++; + if (width) + for (q=p;*q;q++) + PCHAR(*q); + } + break; + case 'd': + case 'i': + base = 10; + sign = 1; + goto handle_sign; + case 'h': + if (hflag) { + hflag = 0; + cflag = 1; + } else + hflag = 1; + goto reswitch; + case 'j': + jflag = 1; + goto reswitch; + case 'l': + if (lflag) { + lflag = 0; + qflag = 1; + } else + lflag = 1; + goto reswitch; + case 'n': + if (jflag) + *(va_arg(ap, intmax_t *)) = retval; + else if (qflag) + *(va_arg(ap, quad_t *)) = retval; + else if (lflag) + *(va_arg(ap, long *)) = retval; + else if (zflag) + *(va_arg(ap, size_t *)) = retval; + else if (hflag) + *(va_arg(ap, short *)) = retval; + else if (cflag) + *(va_arg(ap, char *)) = retval; + else + *(va_arg(ap, int *)) = retval; + break; + case 'o': + base = 8; + goto 
handle_nosign; + case 'p': + base = 16; + sharpflag = (width == 0); + sign = 0; + num = (uintptr_t)va_arg(ap, void *); + goto number; + case 'q': + qflag = 1; + goto reswitch; + case 'r': + base = radix; + if (sign) + goto handle_sign; + goto handle_nosign; + case 's': + p = va_arg(ap, char *); + if (p == NULL) + p = "(null)"; + if (!dot) + n = (typeof(n))hibstrlen (p); + else + for (n = 0; n < dwidth && p[n]; n++) + continue; + + width -= n; + + if (!ladjust && width > 0) + while (width--) + PCHAR(padc); + while (n--) + PCHAR(*p++); + if (ladjust && width > 0) + while (width--) + PCHAR(padc); + break; + case 't': + tflag = 1; + goto reswitch; + case 'u': + base = 10; + goto handle_nosign; + case 'X': + upper = 1; /* FALLTHROUGH: 'X' is 'x' with upper-case digits */ + case 'x': + base = 16; + goto handle_nosign; + case 'y': + base = 16; + sign = 1; + goto handle_sign; + case 'z': + zflag = 1; + goto reswitch; +handle_nosign: + sign = 0; + if (jflag) + num = va_arg(ap, uintmax_t); + else if (qflag) + num = va_arg(ap, u_quad_t); + else if (tflag) + num = va_arg(ap, ptrdiff_t); + else if (lflag) + num = va_arg(ap, u_long); + else if (zflag) + num = va_arg(ap, size_t); + else if (hflag) + num = (u_short)va_arg(ap, int); + else if (cflag) + num = (u_char)va_arg(ap, int); + else + num = va_arg(ap, u_int); + goto number; +handle_sign: + if (jflag) + num = va_arg(ap, intmax_t); + else if (qflag) + num = va_arg(ap, quad_t); + else if (tflag) + num = va_arg(ap, ptrdiff_t); + else if (lflag) + num = va_arg(ap, long); + else if (zflag) + num = va_arg(ap, ssize_t); + else if (hflag) + num = (short)va_arg(ap, int); + else if (cflag) + num = (char)va_arg(ap, int); + else + num = va_arg(ap, int); +number: + if (sign && (intmax_t)num < 0) { + neg = 1; + num = -(intmax_t)num; + } + p = ksprintn(nbuf, num, base, &tmp, upper); + if (sharpflag && num != 0) { + if (base == 8) + tmp++; + else if (base == 16) + tmp += 2; + } + if (neg) + tmp++; + + if (!ladjust && padc != '0' && width + && (width -= tmp) > 0) + while (width--) + 
PCHAR(padc); + if (neg) + PCHAR('-'); + if (sharpflag && num != 0) { + if (base == 8) { + PCHAR('0'); + } else if (base == 16) { + PCHAR('0'); + PCHAR('x'); + } + } + if (!ladjust && width && (width -= tmp) > 0) + while (width--) + PCHAR(padc); + + while (*p) + PCHAR(*p--); + + if (ladjust && width && (width -= tmp) > 0) + while (width--) + PCHAR(padc); + + break; + default: + while (percent < fmt) + PCHAR(*percent++); + /* + * Since we ignore a formatting argument it is no + * longer safe to obey the remaining formatting + * arguments as the arguments will no longer match + * the format specs. + */ + stop = 1; + break; + } + } +#undef PCHAR +} + + +static void +putchar(int c, void *arg) +{ + (void)arg; + uart_putc(c); +} + +void +hibprintf(const char *fmt, ...) +{ + /* http://www.pagetable.com/?p=298 */ + va_list ap; + + va_start(ap, fmt); + hibkvprintf(fmt, putchar, NULL, 10, ap); + va_end(ap); +} +#endif /* CONFIG_DEBUG */ + diff --git a/iokit/Kernel/IOInterruptController.cpp b/iokit/Kernel/IOInterruptController.cpp index a8e04bddd..1000178ad 100644 --- a/iokit/Kernel/IOInterruptController.cpp +++ b/iokit/Kernel/IOInterruptController.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2011 Apple Inc. All rights reserved. + * Copyright (c) 1998-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -26,11 +26,6 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ - -#if __ppc__ -#include -#endif - #include #include #include @@ -295,17 +290,10 @@ IOReturn IOInterruptController::enableInterrupt(IOService *nub, int source) if (vector->interruptDisabledSoft) { vector->interruptDisabledSoft = 0; -#if __ppc__ - sync(); - isync(); -#endif if (!getPlatform()->atInterruptLevel()) { while (vector->interruptActive) {} -#if __ppc__ - isync(); -#endif } if (vector->interruptDisabledHard) { vector->interruptDisabledHard = 0; @@ -330,17 +318,10 @@ IOReturn IOInterruptController::disableInterrupt(IOService *nub, int source) vector = &vectors[vectorNumber]; vector->interruptDisabledSoft = 1; -#if __ppc__ - sync(); - isync(); -#endif if (!getPlatform()->atInterruptLevel()) { while (vector->interruptActive) {} -#if __ppc__ - isync(); -#endif } return kIOReturnSuccess; @@ -663,10 +644,6 @@ IOReturn IOSharedInterruptController::disableInterrupt(IOService *nub, interruptState = IOSimpleLockLockDisableInterrupt(controllerLock); if (!vector->interruptDisabledSoft) { vector->interruptDisabledSoft = 1; -#if __ppc__ - sync(); - isync(); -#endif vectorsEnabled--; } IOSimpleLockUnlockEnableInterrupt(controllerLock, interruptState); @@ -674,9 +651,6 @@ IOReturn IOSharedInterruptController::disableInterrupt(IOService *nub, if (!getPlatform()->atInterruptLevel()) { while (vector->interruptActive) {} -#if __ppc__ - isync(); -#endif } return kIOReturnSuccess; @@ -699,48 +673,26 @@ IOReturn IOSharedInterruptController::handleInterrupt(void * /*refCon*/, vector = &vectors[vectorNumber]; vector->interruptActive = 1; -#if __ppc__ - sync(); - isync(); -#endif - if (!vector->interruptDisabledSoft) { -#if __ppc__ - isync(); -#endif - - // Call the handler if it exists. - if (vector->interruptRegistered) { - - bool trace = (gIOKitTrace & kIOTraceInterrupts) ? true : false; - bool timeHandler = gIOInterruptThresholdNS ? 
true : false; - uint64_t startTime = 0; - uint64_t endTime = 0; + if (!vector->interruptDisabledSoft) { + + // Call the handler if it exists. + if (vector->interruptRegistered) { + + bool trace = (gIOKitTrace & kIOTraceInterrupts) ? true : false; if (trace) IOTimeStampStartConstant(IODBG_INTC(IOINTC_HANDLER), (uintptr_t) vectorNumber, (uintptr_t) vector->handler, (uintptr_t)vector->target); - if (timeHandler) - startTime = mach_absolute_time(); - // Call handler. vector->handler(vector->target, vector->refCon, vector->nub, vector->source); - - if (timeHandler) - { - endTime = mach_absolute_time(); - if ((endTime - startTime) > gIOInterruptThresholdNS) - panic("IOSIC::handleInterrupt: interrupt exceeded threshold, handlerTime = %qd, vectorNumber = %d, handler = %p, target = %p\n", - endTime - startTime, (int)vectorNumber, vector->handler, vector->target); - } if (trace) IOTimeStampEndConstant(IODBG_INTC(IOINTC_HANDLER), (uintptr_t) vectorNumber, (uintptr_t) vector->handler, (uintptr_t)vector->target); - } - - } + } + } vector->interruptActive = 0; } diff --git a/iokit/Kernel/IOInterruptEventSource.cpp b/iokit/Kernel/IOInterruptEventSource.cpp index 97d4c5957..8b49024a1 100644 --- a/iokit/Kernel/IOInterruptEventSource.cpp +++ b/iokit/Kernel/IOInterruptEventSource.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,13 +25,7 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* -Copyright (c) 1998 Apple Computer, Inc. All rights reserved. -HISTORY - 1998-7-13 Godfrey van der Linden(gvdl) - Created. 
-*/ #include #include #include @@ -40,6 +34,31 @@ HISTORY #include #include +#if IOKITSTATS + +#define IOStatisticsInitializeCounter() \ +do { \ + IOStatistics::setCounterType(IOEventSource::reserved->counter, kIOStatisticsInterruptEventSourceCounter); \ +} while (0) + +#define IOStatisticsCheckForWork() \ +do { \ + IOStatistics::countInterruptCheckForWork(IOEventSource::reserved->counter); \ +} while (0) + +#define IOStatisticsInterrupt() \ +do { \ + IOStatistics::countInterrupt(IOEventSource::reserved->counter); \ +} while (0) + +#else + +#define IOStatisticsInitializeCounter() +#define IOStatisticsCheckForWork() +#define IOStatisticsInterrupt() + +#endif // IOKITSTATS + #define super IOEventSource OSDefineMetaClassAndStructors(IOInterruptEventSource, IOEventSource) @@ -74,6 +93,8 @@ bool IOInterruptEventSource::init(OSObject *inOwner, intIndex = inIntIndex; } + IOStatisticsInitializeCounter(); + return res; } @@ -182,24 +203,26 @@ bool IOInterruptEventSource::checkForWork() int numInts = cacheProdCount - consumerCount; IOInterruptEventAction intAction = (IOInterruptEventAction) action; bool trace = (gIOKitTrace & kIOTraceIntEventSource) ? 
true : false; - + + IOStatisticsCheckForWork(); + if ( numInts > 0 ) { if (trace) IOTimeStampStartConstant(IODBG_INTES(IOINTES_ACTION), (uintptr_t) intAction, (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop); - + // Call the handler - (*intAction)(owner, this, numInts); + (*intAction)(owner, this, numInts); if (trace) IOTimeStampEndConstant(IODBG_INTES(IOINTES_ACTION), (uintptr_t) intAction, (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop); - - consumerCount = cacheProdCount; - if (autoDisable && !explicitDisable) - enable(); - } + + consumerCount = cacheProdCount; + if (autoDisable && !explicitDisable) + enable(); + } else if ( numInts < 0 ) { @@ -208,17 +231,17 @@ bool IOInterruptEventSource::checkForWork() (uintptr_t) intAction, (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop); // Call the handler - (*intAction)(owner, this, -numInts); + (*intAction)(owner, this, -numInts); if (trace) IOTimeStampEndConstant(IODBG_INTES(IOINTES_ACTION), (uintptr_t) intAction, (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop); - - consumerCount = cacheProdCount; - if (autoDisable && !explicitDisable) - enable(); - } - + + consumerCount = cacheProdCount; + if (autoDisable && !explicitDisable) + enable(); + } + return false; } @@ -226,14 +249,15 @@ void IOInterruptEventSource::normalInterruptOccurred (void */*refcon*/, IOService */*prov*/, int /*source*/) { bool trace = (gIOKitTrace & kIOTraceIntEventSource) ? true : false; - + + IOStatisticsInterrupt(); producerCount++; - + if (trace) IOTimeStampStartConstant(IODBG_INTES(IOINTES_SEMA), (uintptr_t) this, (uintptr_t) owner); signalWorkAvailable(); - + if (trace) IOTimeStampEndConstant(IODBG_INTES(IOINTES_SEMA), (uintptr_t) this, (uintptr_t) owner); } @@ -242,16 +266,17 @@ void IOInterruptEventSource::disableInterruptOccurred (void */*refcon*/, IOService *prov, int source) { bool trace = (gIOKitTrace & kIOTraceIntEventSource) ? 
true : false; - + prov->disableInterrupt(source); /* disable the interrupt */ - + + IOStatisticsInterrupt(); producerCount++; - + if (trace) IOTimeStampStartConstant(IODBG_INTES(IOINTES_SEMA), (uintptr_t) this, (uintptr_t) owner); signalWorkAvailable(); - + if (trace) IOTimeStampEndConstant(IODBG_INTES(IOINTES_SEMA), (uintptr_t) this, (uintptr_t) owner); } @@ -264,3 +289,10 @@ void IOInterruptEventSource::interruptOccurred else normalInterruptOccurred(refcon, prov, source); } + +IOReturn IOInterruptEventSource::warmCPU + (uint64_t abstime) +{ + + return ml_interrupt_prewarm(abstime); +} diff --git a/iokit/Kernel/IOKitDebug.cpp b/iokit/Kernel/IOKitDebug.cpp index 31d681664..21048d88c 100644 --- a/iokit/Kernel/IOKitDebug.cpp +++ b/iokit/Kernel/IOKitDebug.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2011 Apple Inc. All rights reserved. + * Copyright (c) 1998-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -28,15 +28,15 @@ #include +#include +#include + #include #include #include #include #include -#include -#include - #ifdef IOKITDEBUG #define DEBUG_INIT_VALUE IOKITDEBUG #else @@ -44,12 +44,10 @@ #endif SInt64 gIOKitDebug = DEBUG_INIT_VALUE; -SInt64 gIOKitTrace = 0x3B; -UInt64 gIOInterruptThresholdNS = 0; +SInt64 gIOKitTrace = 0; SYSCTL_QUAD(_debug, OID_AUTO, iokit, CTLFLAG_RW | CTLFLAG_LOCKED, &gIOKitDebug, "boot_arg io"); SYSCTL_QUAD(_debug, OID_AUTO, iotrace, CTLFLAG_RW | CTLFLAG_LOCKED, &gIOKitTrace, "trace io"); -SYSCTL_QUAD(_debug, OID_AUTO, iointthreshold, CTLFLAG_RW | CTLFLAG_LOCKED, &gIOInterruptThresholdNS, "io interrupt threshold"); int debug_malloc_size; @@ -100,7 +98,7 @@ void IOPrintPlane( const IORegistryPlane * plane ) iter->release(); } -void dbugprintf(char *fmt, ...); +void dbugprintf(const char *fmt, ...); void db_dumpiojunk( const IORegistryPlane * plane ); void db_piokjunk(void) { diff --git a/iokit/Kernel/IOKitKernelInternal.h b/iokit/Kernel/IOKitKernelInternal.h index 804c57f24..5a74159a4 100644 --- 
a/iokit/Kernel/IOKitKernelInternal.h +++ b/iokit/Kernel/IOKitKernelInternal.h @@ -38,6 +38,29 @@ __BEGIN_DECLS #include #include +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#if !defined(NO_KDEBUG) + +#define IOServiceTrace(csc, a, b, c, d) do { \ + if(kIOTraceIOService & gIOKitDebug) { \ + KERNEL_DEBUG_CONSTANT(IODBG_IOSERVICE(csc), a, b, c, d, 0); \ + } \ +} while(0) + +#else /* NO_KDEBUG */ + +#define IOServiceTrace(csc, a, b, c, d) do { \ + (void)a; \ + (void)b; \ + (void)c; \ + (void)d; \ +} while (0) + +#endif /* NO_KDEBUG */ + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + typedef kern_return_t (*IOIteratePageableMapsCallback)(vm_map_t map, void * ref); void IOLibInit(void); @@ -149,4 +172,8 @@ extern "C" void IOKitInitializeTime( void ); extern "C" OSString * IOCopyLogNameForPID(int pid); +#if defined(__i386__) || defined(__x86_64__) +extern "C" void IOSetKeyStoreData(IOMemoryDescriptor * data); +#endif + #endif /* ! 
_IOKIT_KERNELINTERNAL_H */ diff --git a/iokit/Kernel/IOLib.cpp b/iokit/Kernel/IOLib.cpp index a5415e71c..50000299d 100644 --- a/iokit/Kernel/IOLib.cpp +++ b/iokit/Kernel/IOLib.cpp @@ -55,6 +55,24 @@ #include #endif +#include "libkern/OSAtomic.h" +#include +#include +#include + +#if IOKITSTATS + +#define IOStatisticsAlloc(type, size) \ +do { \ + IOStatistics::countAlloc(type, size); \ +} while (0) + +#else + +#define IOStatisticsAlloc(type, size) + +#endif /* IOKITSTATS */ + extern "C" { @@ -63,7 +81,7 @@ mach_timespec_t IOZeroTvalspec = { 0, 0 }; extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va); -int +extern int __doprnt( const char *fmt, va_list argp, @@ -71,7 +89,9 @@ __doprnt( void *arg, int radix); -extern void conslog_putc(char); +extern void cons_putc_locked(char); +extern void bsd_log_lock(void); +extern void bsd_log_unlock(void); /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -178,11 +198,13 @@ void * IOMalloc(vm_size_t size) void * address; address = (void *)kalloc(size); + if ( address ) { #if IOALLOCDEBUG - if (address) { debug_iomalloc_size += size; - } #endif + IOStatisticsAlloc(kIOStatisticsMalloc, size); + } + return address; } @@ -193,6 +215,7 @@ void IOFree(void * address, vm_size_t size) #if IOALLOCDEBUG debug_iomalloc_size -= size; #endif + IOStatisticsAlloc(kIOStatisticsFree, size); } } @@ -250,11 +273,12 @@ void * IOMallocAligned(vm_size_t size, vm_size_t alignment) assert(0 == (address & alignMask)); -#if IOALLOCDEBUG if( address) { +#if IOALLOCDEBUG debug_iomalloc_size += size; - } #endif + IOStatisticsAlloc(kIOStatisticsMallocAligned, size); + } return (void *) address; } @@ -289,6 +313,8 @@ void IOFreeAligned(void * address, vm_size_t size) #if IOALLOCDEBUG debug_iomalloc_size -= size; #endif + + IOStatisticsAlloc(kIOStatisticsFreeAligned, size); } /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -325,7 +351,7 @@ IOKernelFreePhysical(mach_vm_address_t address, 
mach_vm_size_t size) mach_vm_address_t IOKernelAllocateWithPhysicalRestrict(mach_vm_size_t size, mach_vm_address_t maxPhys, - mach_vm_size_t alignment, bool contiguous) + mach_vm_size_t alignment, bool contiguous) { kern_return_t kr; mach_vm_address_t address; @@ -405,6 +431,7 @@ IOKernelAllocateWithPhysicalRestrict(mach_vm_size_t size, mach_vm_address_t maxP return (address); } + /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ struct _IOMallocContiguousEntry @@ -463,6 +490,10 @@ void * IOMallocContiguous(vm_size_t size, vm_size_t alignment, } while (false); + if (address) { + IOStatisticsAlloc(kIOStatisticsMallocContiguous, size); + } + return (void *) address; } @@ -500,6 +531,8 @@ void IOFreeContiguous(void * _address, vm_size_t size) { IOKernelFreePhysical((mach_vm_address_t) address, size); } + + IOStatisticsAlloc(kIOStatisticsFreeContiguous, size); } /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -603,10 +636,12 @@ void * IOMallocPageable(vm_size_t size, vm_size_t alignment) if( kIOReturnSuccess != kr) ref.address = 0; + if( ref.address) { #if IOALLOCDEBUG - if( ref.address) debug_iomallocpageable_size += round_page(size); #endif + IOStatisticsAlloc(kIOStatisticsMallocPageable, size); + } return( (void *) ref.address ); } @@ -640,6 +675,8 @@ void IOFreePageable(void * address, vm_size_t size) #if IOALLOCDEBUG debug_iomallocpageable_size -= round_page(size); #endif + + IOStatisticsAlloc(kIOStatisticsFreePageable, size); } /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -727,23 +764,37 @@ void IOPause(unsigned nanoseconds) /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ -static void _iolog_putc(int ch, void *arg __unused) +static void _iolog_consputc(int ch, void *arg __unused) { - conslog_putc(ch); + cons_putc_locked(ch); +} + +static void _iolog_logputc(int ch, void *arg __unused) +{ + log_putc_locked(ch); } void IOLog(const char *format, 
...) { - va_list ap; + va_list ap; - va_start(ap, format); - __doprnt(format, ap, _iolog_putc, NULL, 16); - va_end(ap); + va_start(ap, format); + IOLogv(format, ap); + va_end(ap); } void IOLogv(const char *format, va_list ap) { - __doprnt(format, ap, _iolog_putc, NULL, 16); + va_list ap2; /* private argument cursor for the second formatting pass */ + + va_copy(ap2, ap); /* the message is formatted twice; each __doprnt pass gets its own va_list */ + + bsd_log_lock(); + __doprnt(format, ap, _iolog_logputc, NULL, 16); /* pass 1: via log_putc_locked, under bsd_log_lock */ + bsd_log_unlock(); + + __doprnt(format, ap2, _iolog_consputc, NULL, 16); /* pass 2: via cons_putc_locked */ + va_end(ap2); /* C requires a va_end for every va_copy, in the same function */ } #if !__LP64__ diff --git a/iokit/Kernel/IOMemoryCursor.cpp b/iokit/Kernel/IOMemoryCursor.cpp index 36a15009d..99999991d 100644 --- a/iokit/Kernel/IOMemoryCursor.cpp +++ b/iokit/Kernel/IOMemoryCursor.cpp @@ -325,66 +325,3 @@ IOLittleMemoryCursor::initWithSpecification(IOPhysicalLength inMaxSegmentSize, inMaxTransferSize, inAlignment); } - -/************************* class IODBDMAMemoryCursor *************************/ - -#if defined(__ppc__) - -#include - -#undef super -#define super IOMemoryCursor -OSDefineMetaClassAndStructors(IODBDMAMemoryCursor, IOMemoryCursor) - -void -IODBDMAMemoryCursor::outputSegment(PhysicalSegment inSegment, - void * inSegments, - UInt32 inSegmentIndex) -{ - IODBDMADescriptor *segment; - - segment = &((IODBDMADescriptor *) inSegments)[inSegmentIndex]; - - // Write location into address field - OSWriteSwapInt32((UInt32 *) segment, 4, inSegment.location); - - // Write count into 1st two bytes of operation field. - // DO NOT touch rest of operation field as it should contain a STOP command. 
- OSWriteSwapInt16((UInt16 *) segment, 0, inSegment.length); -} - -/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ - -IODBDMAMemoryCursor * -IODBDMAMemoryCursor::withSpecification(IOPhysicalLength inMaxSegmentSize, - IOPhysicalLength inMaxTransferSize, - IOPhysicalLength inAlignment) -{ - IODBDMAMemoryCursor *me = new IODBDMAMemoryCursor; - - if (me && !me->initWithSpecification(inMaxSegmentSize, - inMaxTransferSize, - inAlignment)) - { - me->release(); - return 0; - } - - return me; -} - -/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ - -bool -IODBDMAMemoryCursor::initWithSpecification(IOPhysicalLength inMaxSegmentSize, - IOPhysicalLength inMaxTransferSize, - IOPhysicalLength inAlignment) -{ - return super::initWithSpecification(&IODBDMAMemoryCursor::outputSegment, - inMaxSegmentSize, - inMaxTransferSize, - inAlignment); -} - -#endif /* defined(__ppc__) */ - diff --git a/iokit/Kernel/IOMemoryDescriptor.cpp b/iokit/Kernel/IOMemoryDescriptor.cpp index a46021ede..0a11064a1 100644 --- a/iokit/Kernel/IOMemoryDescriptor.cpp +++ b/iokit/Kernel/IOMemoryDescriptor.cpp @@ -71,26 +71,8 @@ __BEGIN_DECLS #include extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va); -void ipc_port_release_send(ipc_port_t port); - -/* Copy between a physical page and a virtual address in the given vm_map */ -kern_return_t copypv(addr64_t source, addr64_t sink, unsigned int size, int which); - -memory_object_t -device_pager_setup( - memory_object_t pager, - uintptr_t device_handle, - vm_size_t size, - int flags); -void -device_pager_deallocate( - memory_object_t); -kern_return_t -device_pager_populate_object( - memory_object_t pager, - vm_object_offset_t offset, - ppnum_t phys_addr, - vm_size_t size); +extern void ipc_port_release_send(ipc_port_t port); + kern_return_t memory_object_iopl_request( ipc_port_t port, @@ -172,8 +154,8 @@ struct ioGMDData { // align arrays to 8 bytes so following macros work unsigned int fPad; #endif 
- upl_page_info_t fPageList[]; - ioPLBlock fBlocks[]; + upl_page_info_t fPageList[1]; /* variable length */ + ioPLBlock fBlocks[1]; /* variable length */ }; #define getDataP(osd) ((ioGMDData *) (osd)->getBytesNoCopy()) @@ -182,7 +164,7 @@ struct ioGMDData { (((osd)->getLength() - ((char *) getIOPLList(d) - (char *) d)) / sizeof(ioPLBlock)) #define getPageList(d) (&(d->fPageList[0])) #define computeDataSize(p, u) \ - (sizeof(ioGMDData) + p * sizeof(upl_page_info_t) + u * sizeof(ioPLBlock)) + (offsetof(ioGMDData, fPageList) + p * sizeof(upl_page_info_t) + u * sizeof(ioPLBlock)) /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -306,18 +288,7 @@ IOMemoryDescriptor::withPhysicalAddress( IOByteCount length, IODirection direction ) { -#ifdef __LP64__ return (IOMemoryDescriptor::withAddressRange(address, length, direction, TASK_NULL)); -#else /* !__LP64__ */ - IOGeneralMemoryDescriptor *self = new IOGeneralMemoryDescriptor; - if (self - && !self->initWithPhysicalAddress(address, length, direction)) { - self->release(); - return 0; - } - - return self; -#endif /* !__LP64__ */ } #ifndef __LP64__ @@ -500,9 +471,7 @@ void *IOGeneralMemoryDescriptor::createNamedEntry() memory_object_size_t actualSize = size; vm_prot_t prot = VM_PROT_READ; -#if CONFIG_EMBEDDED if (kIODirectionOut != (kIODirectionOutIn & _flags)) -#endif prot |= VM_PROT_WRITE; if (_memEntry) @@ -630,6 +599,17 @@ IOGeneralMemoryDescriptor::initWithOptions(void * buffers, { IOOptionBits type = options & kIOMemoryTypeMask; +#ifndef __LP64__ + if (task + && (kIOMemoryTypeVirtual == type) + && vm_map_is_64bit(get_task_map(task)) + && ((IOVirtualRange *) buffers)->address) + { + OSReportWithBacktrace("IOMemoryDescriptor: attempt to create 32b virtual in 64b task, use ::withAddressRange()"); + return false; + } +#endif /* !__LP64__ */ + // Grab the original MD's configuation data to initialse the // arguments to this function. 
if (kIOMemoryTypePersistentMD == type) { @@ -644,10 +624,10 @@ IOGeneralMemoryDescriptor::initWithOptions(void * buffers, return false; _memEntry = initData->fMemEntry; // Grab the new named entry - options = orig->_flags | kIOMemoryAsReference; - _singleRange = orig->_singleRange; // Initialise our range - buffers = &_singleRange; - count = 1; + options = orig->_flags & ~kIOMemoryAsReference; + type = options & kIOMemoryTypeMask; + buffers = orig->_ranges.v; + count = orig->_rangesCount; // Now grab the original task and whatever mapper was previously used task = orig->_task; @@ -665,16 +645,6 @@ IOGeneralMemoryDescriptor::initWithOptions(void * buffers, assert(task); if (!task) return false; - -#ifndef __LP64__ - if (vm_map_is_64bit(get_task_map(task)) - && (kIOMemoryTypeVirtual == type) - && ((IOVirtualRange *) buffers)->address) - { - OSReportWithBacktrace("IOMemoryDescriptor: attempt to create 32b virtual in 64b task, use ::withAddressRange()"); - return false; - } -#endif /* !__LP64__ */ break; case kIOMemoryTypePhysical: // Neither Physical nor UPL should have a task @@ -721,7 +691,10 @@ IOGeneralMemoryDescriptor::initWithOptions(void * buffers, } if (_memEntry) - { ipc_port_release_send((ipc_port_t) _memEntry); _memEntry = 0; } + { + ipc_port_release_send((ipc_port_t) _memEntry); + _memEntry = 0; + } if (_mappings) _mappings->flushCollection(); } @@ -782,7 +755,7 @@ IOGeneralMemoryDescriptor::initWithOptions(void * buffers, else if (!_memoryEntries->initWithCapacity(dataSize)) return false; - _memoryEntries->appendBytes(0, sizeof(ioGMDData)); + _memoryEntries->appendBytes(0, computeDataSize(0, 0)); dataP = getDataP(_memoryEntries); dataP->fMapper = mapper; dataP->fPageCnt = 0; @@ -794,6 +767,7 @@ IOGeneralMemoryDescriptor::initWithOptions(void * buffers, ioPLBlock iopl; iopl.fIOPL = (upl_t) buffers; + upl_set_referenced(iopl.fIOPL, true); upl_page_info_t *pageList = UPL_GET_INTERNAL_PAGE_LIST(iopl.fIOPL); if (upl_get_size(iopl.fIOPL) < (count + offset)) @@ 
-853,7 +827,8 @@ IOGeneralMemoryDescriptor::initWithOptions(void * buffers, case kIOMemoryTypeVirtual64: case kIOMemoryTypePhysical64: if (count == 1 - && (((IOAddressRange *) buffers)->address + ((IOAddressRange *) buffers)->length) <= 0x100000000ULL) { + && (((IOAddressRange *) buffers)->address + ((IOAddressRange *) buffers)->length) <= 0x100000000ULL + ) { if (kIOMemoryTypeVirtual64 == type) type = kIOMemoryTypeVirtual; else @@ -931,7 +906,7 @@ IOGeneralMemoryDescriptor::initWithOptions(void * buffers, else if (!_memoryEntries->initWithCapacity(dataSize)) return false; - _memoryEntries->appendBytes(0, sizeof(ioGMDData)); + _memoryEntries->appendBytes(0, computeDataSize(0, 0)); dataP = getDataP(_memoryEntries); dataP->fMapper = mapper; dataP->fPageCnt = _pages; @@ -1183,11 +1158,7 @@ IOGeneralMemoryDescriptor::getPreparationID( void ) if (kIOPreparationIDUnprepared == dataP->fPreparationID) { -#if defined(__ppc__ ) - dataP->fPreparationID = gIOMDPreparationID++; -#else dataP->fPreparationID = OSIncrementAtomic64(&gIOMDPreparationID); -#endif } return (dataP->fPreparationID); } @@ -1397,7 +1368,7 @@ IOReturn IOGeneralMemoryDescriptor::dmaCommandOperation(DMACommandOps op, void * IOPhysicalAddress pageAddr = pageList[pageInd].phys_addr; if (!pageAddr) { panic("!pageList phys_addr"); - } + } address = ptoa_64(pageAddr) + offset; @@ -1911,13 +1882,8 @@ IOReturn IOMemoryDescriptor::performOperation( IOOptionBits options, return (remaining ? 
kIOReturnUnderrun : kIOReturnSuccess); } -#if defined(__ppc__) || defined(__arm__) -extern vm_offset_t static_memory_end; -#define io_kernel_static_end static_memory_end -#else extern vm_offset_t first_avail; #define io_kernel_static_end first_avail -#endif static kern_return_t io_get_kernel_static_upl( @@ -2118,12 +2084,16 @@ IOReturn IOGeneralMemoryDescriptor::wireVirtual(IODirection forDirection) iopl.fIOMDOffset = mdOffset; iopl.fPageInfo = pageIndex; +#if 0 + // used to remove the upl for auto prepares here, for some errant code + // that freed memory before the descriptor pointing at it if ((_flags & kIOMemoryAutoPrepare) && iopl.fIOPL) { upl_commit(iopl.fIOPL, 0, 0); upl_deallocate(iopl.fIOPL); iopl.fIOPL = 0; } +#endif if (!_memoryEntries->appendBytes(&iopl, sizeof(iopl))) { // Clean up partial created and unsaved iopl @@ -2169,7 +2139,7 @@ IOReturn IOGeneralMemoryDescriptor::wireVirtual(IODirection forDirection) upl_deallocate(ioplList[range].fIOPL); } } - (void) _memoryEntries->initWithBytes(dataP, sizeof(ioGMDData)); // == setLength() + (void) _memoryEntries->initWithBytes(dataP, computeDataSize(0, 0)); // == setLength() if (mapper && mapBase) mapper->iovmFree(mapBase, _pages); @@ -2231,7 +2201,7 @@ IOReturn IOGeneralMemoryDescriptor::prepare(IODirection forDirection) * issued; the prepare() and complete() must occur in pairs, before * before and after an I/O transfer involving pageable memory. 
*/ - + IOReturn IOGeneralMemoryDescriptor::complete(IODirection /* forDirection */) { IOOptionBits type = _flags & kIOMemoryTypeMask; @@ -2273,8 +2243,11 @@ IOReturn IOGeneralMemoryDescriptor::complete(IODirection /* forDirection */) upl_commit(ioplList[ind].fIOPL, 0, 0); upl_deallocate(ioplList[ind].fIOPL); } + } else if (kIOMemoryTypeUPL == type) { + upl_set_referenced(ioplList[0].fIOPL, false); } - (void) _memoryEntries->initWithBytes(dataP, sizeof(ioGMDData)); // == setLength() + + (void) _memoryEntries->initWithBytes(dataP, computeDataSize(0, 0)); // == setLength() dataP->fPreparationID = kIOPreparationIDUnprepared; } @@ -3328,13 +3301,12 @@ IOMemoryMap * IOMemoryMap::copyCompatible( retain(); if( (fLength == _length) && (!_offset)) { - newMapping->release(); newMapping = this; } else { newMapping->fSuperMap = this; - newMapping->fOffset = _offset; + newMapping->fOffset = fOffset + _offset; newMapping->fAddress = fAddress + _offset; } @@ -3608,7 +3580,14 @@ IOMemoryMap * IOMemoryDescriptor::makeMapping( iter->release(); } if (result || (options & kIOMapReference)) + { + if (result != mapping) + { + mapping->release(); + mapping = NULL; + } continue; + } } if (!mapDesc) diff --git a/iokit/Kernel/IONVRAM.cpp b/iokit/Kernel/IONVRAM.cpp index 4c51e4457..85ac1a2ec 100644 --- a/iokit/Kernel/IONVRAM.cpp +++ b/iokit/Kernel/IONVRAM.cpp @@ -39,7 +39,6 @@ #define kIONVRAMPrivilege kIOClientPrivilegeAdministrator //#define kIONVRAMPrivilege kIOClientPrivilegeLocalUser - OSDefineMetaClassAndStructors(IODTNVRAM, IOService); bool IODTNVRAM::init(IORegistryEntry *old, const IORegistryPlane *plane) @@ -205,6 +204,9 @@ void IODTNVRAM::registerNVRAMController(IONVRAMController *nvram) _piImage = _nvramImage + _piPartitionOffset; } + _lastDeviceSync = 0; + _freshInterval = TRUE; // we will allow sync() even before the first 15 minutes have passed. 
+ initOFVariables(); } @@ -229,27 +231,31 @@ bool IODTNVRAM::serializeProperties(OSSerialize *s) const OSDictionary *dict = 0, *tmpDict = 0; OSCollectionIterator *iter = 0; - if (_ofDict == 0) return false; - // Verify permissions. hasPrivilege = (kIOReturnSuccess == IOUserClient::clientHasPrivilege(current_task(), kIONVRAMPrivilege)); tmpDict = OSDictionary::withCapacity(1); if (tmpDict == 0) return false; + + if (_ofDict == 0) { + /* No nvram. Return an empty dictionary. */ + dict = tmpDict; + } else { + /* Copy properties with client privilege. */ + iter = OSCollectionIterator::withCollection(_ofDict); + if (iter == 0) return false; - iter = OSCollectionIterator::withCollection(_ofDict); - if (iter == 0) return false; - - while (1) { - key = OSDynamicCast(OSSymbol, iter->getNextObject()); - if (key == 0) break; + while (1) { + key = OSDynamicCast(OSSymbol, iter->getNextObject()); + if (key == 0) break; - variablePerm = getOFVariablePerm(key); - if ((hasPrivilege || (variablePerm != kOFVariablePermRootOnly)) && - ( ! (variablePerm == kOFVariablePermKernelOnly && current_task() != kernel_task) )) { - tmpDict->setObject(key, _ofDict->getObject(key)); + variablePerm = getOFVariablePerm(key); + if ((hasPrivilege || (variablePerm != kOFVariablePermRootOnly)) && + ( ! 
(variablePerm == kOFVariablePermKernelOnly && current_task() != kernel_task) )) { + tmpDict->setObject(key, _ofDict->getObject(key)); + } + dict = tmpDict; } - dict = tmpDict; } result = dict->serialize(s); @@ -412,18 +418,32 @@ IOReturn IODTNVRAM::setProperties(OSObject *properties) if (object == 0) continue; if (key->isEqualTo(kIONVRAMDeletePropertyKey)) { - tmpStr = OSDynamicCast(OSString, object); - if (tmpStr != 0) { - key = OSSymbol::withString(tmpStr); - removeProperty(key); - key->release(); - result = true; - } else { - result = false; - } - } else { - result = setProperty(key, object); + tmpStr = OSDynamicCast(OSString, object); + if (tmpStr != 0) { + key = OSSymbol::withString(tmpStr); + removeProperty(key); + key->release(); + result = true; + } else { + result = false; + } + } else if(key->isEqualTo(kIONVRAMSyncNowPropertyKey)) { + tmpStr = OSDynamicCast(OSString, object); + if (tmpStr != 0) { + + result = true; // We are not going to guarantee sync, this is best effort + + if(safeToSync()) + sync(); + + } else { + result = false; + } + } + else { + result = setProperty(key, object); } + } iter->release(); @@ -1656,3 +1676,26 @@ IOReturn IODTNVRAM::writeNVRAMPropertyType1(IORegistryEntry *entry, return ok ? kIOReturnSuccess : kIOReturnNoMemory; } + +bool IODTNVRAM::safeToSync(void) +{ + AbsoluteTime delta; + UInt64 delta_ns; + SInt32 delta_secs; + + // delta interval went by + clock_get_uptime(&delta); + + // Figure it in seconds. 
+ absolutetime_to_nanoseconds(delta, &delta_ns); + delta_secs = (SInt32)(delta_ns / NSEC_PER_SEC); + + if ((delta_secs > (_lastDeviceSync + MIN_SYNC_NOW_INTERVAL)) || _freshInterval) + { + _lastDeviceSync = delta_secs; + _freshInterval = FALSE; + return TRUE; + } + + return FALSE; +} diff --git a/iokit/Kernel/IOPMPowerSource.cpp b/iokit/Kernel/IOPMPowerSource.cpp index e6a11fc07..614f4caa3 100644 --- a/iokit/Kernel/IOPMPowerSource.cpp +++ b/iokit/Kernel/IOPMPowerSource.cpp @@ -165,7 +165,6 @@ void IOPMPowerSource::updateStatus (void) void IOPMPowerSource::setPSProperty(const OSSymbol *key, OSObject *val) { OSObject *lastVal; - OSNumber *newNumVal; if(!key || !val) return; @@ -175,19 +174,12 @@ void IOPMPowerSource::setPSProperty(const OSSymbol *key, OSObject *val) // Otherwise, just compare pointers. if( (lastVal = properties->getObject(key)) ) { - newNumVal = OSDynamicCast(OSNumber, val); - if(newNumVal) { - if(newNumVal->isEqualTo(lastVal)) { - // settings didn't change - } else { - // num val is not equal to last val - settingsChangedSinceUpdate = true; - } - } else { - // pointer compare as last resort - if(lastVal != val) - settingsChangedSinceUpdate = true; - } + if(val->isEqualTo(lastVal)) { + // settings didn't change + } else { + // num val is not equal to last val + settingsChangedSinceUpdate = true; + } } else { // new setting; no last value settingsChangedSinceUpdate = true; diff --git a/iokit/Kernel/IOPMrootDomain.cpp b/iokit/Kernel/IOPMrootDomain.cpp index 3ccda1a1b..e6146bb24 100644 --- a/iokit/Kernel/IOPMrootDomain.cpp +++ b/iokit/Kernel/IOPMrootDomain.cpp @@ -27,6 +27,7 @@ */ #include #include +#include #include #include #include @@ -43,7 +44,6 @@ #include "IOKit/pwr_mgt/IOPowerConnection.h" #include "IOPMPowerStateQueue.h" #include -#include // IOServicePMPrivate #if HIBERNATION #include #endif @@ -64,64 +64,98 @@ __END_DECLS #endif #define kIOPMrootDomainClass "IOPMrootDomain" +#define LOG_PREFIX "PMRD: " -#define LOG_PREFIX "PMRD: " +#define 
MSG(x...) \ + do { kprintf(LOG_PREFIX x); IOLog(x); } while (false) -#define LOG(x...) do { \ - kprintf(LOG_PREFIX x); IOLog(x); } while (false) - -#define KLOG(x...) do { \ - kprintf(LOG_PREFIX x); } while (false) +#define LOG(x...) \ + do { kprintf(LOG_PREFIX x); } while (false) #define DLOG(x...) do { \ if (kIOLogPMRootDomain & gIOKitDebug) \ kprintf(LOG_PREFIX x); } while (false) +#define _LOG(x...) + #define CHECK_THREAD_CONTEXT #ifdef CHECK_THREAD_CONTEXT static IOWorkLoop * gIOPMWorkLoop = 0; -#define ASSERT_GATED(x) \ +#define ASSERT_GATED() \ do { \ if (gIOPMWorkLoop && gIOPMWorkLoop->inGate() != true) { \ - panic("RootDomain: not inside PM gate"); \ + panic("RootDomain: not inside PM gate"); \ } \ } while(false) #else -#define ASSERT_GATED(x) +#define ASSERT_GATED() #endif /* CHECK_THREAD_CONTEXT */ +#define CAP_LOSS(c) \ + (((_pendingCapability & (c)) == 0) && \ + ((_currentCapability & (c)) != 0)) + +#define CAP_GAIN(c) \ + (((_currentCapability & (c)) == 0) && \ + ((_pendingCapability & (c)) != 0)) + +#define CAP_CHANGE(c) \ + (((_currentCapability ^ _pendingCapability) & (c)) != 0) + +#define CAP_CURRENT(c) \ + ((_currentCapability & (c)) != 0) + +#define CAP_HIGHEST(c) \ + ((_highestCapability & (c)) != 0) + +#define DARK_TO_FULL_EVALUATE_CLAMSHELL 0 + // Event types for IOPMPowerStateQueue::submitPowerEvent() enum { - kPowerEventFeatureChanged = 1, - kPowerEventReceivedPowerNotification, - kPowerEventSystemBootCompleted, - kPowerEventSystemShutdown, - kPowerEventUserDisabledSleep, - kPowerEventConfigdRegisteredInterest, - kPowerEventAggressivenessChanged, - kPowerEventAssertionCreate, // 8 - kPowerEventAssertionRelease, // 9 - kPowerEventAssertionSetLevel // 10 + kPowerEventFeatureChanged = 1, // 1 + kPowerEventReceivedPowerNotification, // 2 + kPowerEventSystemBootCompleted, // 3 + kPowerEventSystemShutdown, // 4 + kPowerEventUserDisabledSleep, // 5 + kPowerEventRegisterSystemCapabilityClient, // 6 + kPowerEventRegisterKernelCapabilityClient, // 7 
+ kPowerEventPolicyStimulus, // 8 + kPowerEventAssertionCreate, // 9 + kPowerEventAssertionRelease, // 10 + kPowerEventAssertionSetLevel, // 11 + kPowerEventQueueSleepWakeUUID, // 12 + kPowerEventPublishSleepWakeUUID // 13 +}; + +// For evaluatePolicy() +// List of stimuli that affects the root domain policy. +enum { + kStimulusDisplayWranglerSleep, // 0 + kStimulusDisplayWranglerWake, // 1 + kStimulusAggressivenessChanged, // 2 + kStimulusDemandSystemSleep, // 3 + kStimulusAllowSystemSleepChanged, // 4 + kStimulusDarkWakeActivityTickle, // 5 + kStimulusDarkWakeEntry, // 6 + kStimulusDarkWakeReentry, // 7 + kStimulusDarkWakeEvaluate // 8 }; extern "C" { IOReturn OSKextSystemSleepOrWake( UInt32 ); } -extern const IORegistryPlane * gIOPowerPlane; - static void idleSleepTimerExpired( thread_call_param_t, thread_call_param_t ); -static void wakeupClamshellTimerExpired( thread_call_param_t us, thread_call_param_t ); static void notifySystemShutdown( IOService * root, unsigned long event ); -static bool clientMessageFilter( OSObject * object, void * context ); -static void handleAggressivesFunction( thread_call_param_t param1, thread_call_param_t param2 ); +static void handleAggressivesFunction( thread_call_param_t, thread_call_param_t ); static void pmEventTimeStamp(uint64_t *recordTS); // "IOPMSetSleepSupported" callPlatformFunction name static const OSSymbol *sleepSupportedPEFunction = NULL; static const OSSymbol *sleepMessagePEFunction = NULL; -#define kIOSleepSupportedKey "IOSleepSupported" +#define kIOSleepSupportedKey "IOSleepSupported" +#define kIOPMSystemCapabilitiesKey "System Capabilities" #define kRD_AllPowerSources (kIOPMSupportedOnAC \ | kIOPMSupportedOnBatt \ @@ -137,91 +171,36 @@ enum #define kLocalEvalClamshellCommand (1 << 15) enum { - OFF_STATE = 0, - RESTART_STATE = 1, - SLEEP_STATE = 2, - DOZE_STATE = 3, - ON_STATE = 4, + OFF_STATE = 0, + RESTART_STATE = 1, + SLEEP_STATE = 2, + ON_STATE = 3, NUM_POWER_STATES }; #define ON_POWER kIOPMPowerOn #define 
RESTART_POWER kIOPMRestart #define SLEEP_POWER kIOPMAuxPowerOn -#define DOZE_POWER kIOPMDoze static IOPMPowerState ourPowerStates[NUM_POWER_STATES] = { {1, 0, 0, 0, 0,0,0,0,0,0,0,0}, {1, kIOPMRestartCapability, kIOPMRestart, RESTART_POWER, 0,0,0,0,0,0,0,0}, {1, kIOPMSleepCapability, kIOPMSleep, SLEEP_POWER, 0,0,0,0,0,0,0,0}, - {1, kIOPMDoze, kIOPMDoze, DOZE_POWER, 0,0,0,0,0,0,0,0}, {1, kIOPMPowerOn, kIOPMPowerOn, ON_POWER, 0,0,0,0,0,0,0,0} }; -// Clients eligible to receive system power messages. -enum { - kMessageClientNone = 0, - kMessageClientAll, - kMessageClientConfigd -}; - -// Run states (R-state) defined within the ON power state. -enum { - kRStateNormal = 0, - kRStateDark, - kRStateMaintenance, - kRStateCount -}; - -// IOService in power plane can be tagged with following flags. -enum { - kServiceFlagGraphics = 0x01, - kServiceFlagNoPowerUp = 0x02, - kServiceFlagTopLevelPCI = 0x04 -}; - -// Flags describing R-state features and capabilities. -enum { - kRStateFlagNone = 0x00000000, - kRStateFlagSuppressGraphics = 0x00000001, - kRStateFlagSuppressMessages = 0x00000002, - kRStateFlagSuppressPCICheck = 0x00000004, - kRStateFlagDisableIdleSleep = 0x00000008 -}; - -#if ROOT_DOMAIN_RUN_STATES - -// Table of flags for each R-state. 
-static uint32_t gRStateFlags[ kRStateCount ] = -{ - kRStateFlagNone, - - /* Dark wake */ - kRStateFlagSuppressGraphics, - - /* Maintenance wake */ - kRStateFlagSuppressGraphics | - kRStateFlagSuppressMessages | - kRStateFlagSuppressPCICheck | - kRStateFlagDisableIdleSleep -}; - -static IONotifier * gConfigdNotifier = 0; - -#define kIOPMRootDomainRunStateKey "Run State" #define kIOPMRootDomainWakeTypeMaintenance "Maintenance" #define kIOPMRootDomainWakeTypeSleepTimer "SleepTimer" #define kIOPMrootDomainWakeTypeLowBattery "LowBattery" - -#endif /* ROOT_DOMAIN_RUN_STATES */ +#define kIOPMRootDomainWakeTypeUser "User" +#define kIOPMRootDomainWakeTypeAlarm "Alarm" +#define kIOPMRootDomainWakeTypeNetwork "Network" // Special interest that entitles the interested client from receiving -// all system messages. Used by pmconfigd to support maintenance wake. +// all system messages. Only used by powerd. // -#define kIOPMPrivilegedPowerInterest "IOPMPrivilegedPowerInterest" - -static IONotifier * gSysPowerDownNotifier = 0; +#define kIOPMSystemCapabilityInterest "IOPMSystemCapabilityInterest" /* * Aggressiveness @@ -231,8 +210,6 @@ static IONotifier * gSysPowerDownNotifier = 0; #define kAggressivesMinValue 1 -static uint32_t gAggressivesState = 0; - enum { kAggressivesStateBusy = 0x01, kAggressivesStateQuickSpindown = 0x02 @@ -269,14 +246,28 @@ enum { enum { kAggressivesRecordFlagModified = 0x00000001, kAggressivesRecordFlagMinValue = 0x00000002 - +}; + +// gDarkWakeFlags +enum { + kDarkWakeFlagHIDTickleEarly = 0x01, // hid tickle before gfx suppression + kDarkWakeFlagHIDTickleLate = 0x02, // hid tickle after gfx suppression + kDarkWakeFlagHIDTickleNone = 0x03, // hid tickle is not posted + kDarkWakeFlagHIDTickleMask = 0x03, + kDarkWakeFlagIgnoreDiskIOInDark = 0x04, // ignore disk idle in DW + kDarkWakeFlagIgnoreDiskIOAlways = 0x08, // always ignore disk idle + kDarkWakeFlagIgnoreDiskIOMask = 0x0C, + kDarkWakeFlagAlarmIsDark = 0x0100 }; static IOPMrootDomain * gRootDomain; 
+static IONotifier * gSysPowerDownNotifier = 0; static UInt32 gSleepOrShutdownPending = 0; static UInt32 gWillShutdown = 0; -static uint32_t gMessageClientType = kMessageClientNone; +static UInt32 gPagingOff = 0; static UInt32 gSleepWakeUUIDIsSet = false; +static uint32_t gAggressivesState = 0; +static uint32_t gDarkWakeFlags = kDarkWakeFlagHIDTickleNone; struct timeval gIOLastSleepTime; struct timeval gIOLastWakeTime; @@ -293,29 +284,173 @@ const OSSymbol *gIOPMStatsApplicationResponseTimedOut; const OSSymbol *gIOPMStatsApplicationResponseCancel; const OSSymbol *gIOPMStatsApplicationResponseSlow; +/* + * PMSettingHandle + * Opaque handle passed to clients of registerPMSettingController() + */ +class PMSettingHandle : public OSObject +{ + OSDeclareFinalStructors( PMSettingHandle ) + friend class PMSettingObject; + +private: + PMSettingObject *pmso; + void free(void); +}; + +/* + * PMSettingObject + * Internal object to track each PM setting controller + */ class PMSettingObject : public OSObject { - OSDeclareFinalStructors(PMSettingObject) + OSDeclareFinalStructors( PMSettingObject ) + friend class IOPMrootDomain; + private: + queue_head_t calloutQueue; + thread_t waitThread; IOPMrootDomain *parent; + PMSettingHandle *pmsh; IOPMSettingControllerCallback func; OSObject *target; uintptr_t refcon; uint32_t *publishedFeatureID; - int releaseAtCount; + uint32_t settingCount; + bool disabled; + + void free(void); + public: static PMSettingObject *pmSettingObject( - IOPMrootDomain *parent_arg, + IOPMrootDomain *parent_arg, IOPMSettingControllerCallback handler_arg, - OSObject *target_arg, - uintptr_t refcon_arg, - uint32_t supportedPowerSources, - const OSSymbol *settings[]); + OSObject *target_arg, + uintptr_t refcon_arg, + uint32_t supportedPowerSources, + const OSSymbol *settings[], + OSObject **handle_obj); + + void dispatchPMSetting(const OSSymbol *type, OSObject *object); + void clientHandleFreed(void); +}; + +struct PMSettingCallEntry { + queue_chain_t link; + 
thread_t thread; +}; - void setPMSetting(const OSSymbol *type, OSObject *obj); +#define PMSETTING_LOCK() IOLockLock(settingsCtrlLock) +#define PMSETTING_UNLOCK() IOLockUnlock(settingsCtrlLock) +#define PMSETTING_WAIT(p) IOLockSleep(settingsCtrlLock, p, THREAD_UNINT) +#define PMSETTING_WAKEUP(p) IOLockWakeup(settingsCtrlLock, p, true) - void taggedRelease(const void *tag, const int when) const; - void free(void); +//********************************************************************************* +//********************************************************************************* +//********************************************************************************* + +/* @class IOPMTimeline + * @abstract Tracks & records PM activity. + * @discussion Intended for use only as a helper-class to IOPMrootDomain. + * Do not subclass or directly invoke IOPMTimeline + */ +class IOPMTimeline : public OSObject +{ + OSDeclareDefaultStructors( IOPMTimeline ); + +public: + static IOPMTimeline* timeline(IOPMrootDomain *root_domain); + + bool setProperties(OSDictionary *d); + OSDictionary *copyInfoDictionary(void); + + IOReturn recordSystemPowerEvent( PMEventDetails *details ); + + IOReturn recordDetailedPowerEvent( PMEventDetails *details ); + + IOMemoryDescriptor *getPMTraceMemoryDescriptor(); + + uint32_t getNumEventsLoggedThisPeriod(); + void setNumEventsLoggedThisPeriod(uint32_t newCount); + bool isSleepCycleInProgress(); + void setSleepCycleInProgressFlag(bool flag); +private: + bool init(void); + void free(void); + + void setEventsTrackedCount(uint32_t newTracked); + void setEventsRecordingLevel(uint32_t eventsTrackedBits); + static uint32_t _atomicIndexIncrement(uint32_t *index, uint32_t limit); + + enum { + kPMTimelineRecordTardyDrivers = 1 << 0, + kPMTmielineRecordSystemEvents = 1 << 1, + kPMTimelineRecordAllDrivers = 1 << 2, + kPMTimelineRecordOff = 0, + kPMTimelineRecordDefault = 3, + kPMTimelineRecordDebug = 7 + }; + + // eventsRecordingLevel is a bitfield defining which 
PM driver events will get logged + // into the PM buffer. + uint32_t eventsRecordingLevel; + + // pmTraceMemoryDescriptor represents the memory block that IOPMTimeLine records PM trace points into. + IOBufferMemoryDescriptor *pmTraceMemoryDescriptor; + + // Pointer to starting address in pmTraceMemoryDescriptor + IOPMSystemEventRecord *traceBuffer; + IOPMTraceBufferHeader *hdr; + + uint16_t systemState; + + IOLock *logLock; + IOPMrootDomain *owner; + + uint32_t numEventsLoggedThisPeriod; + bool sleepCycleInProgress; +}; + +OSDefineMetaClassAndStructors( IOPMTimeline, OSObject ) + +/* + * PMTraceWorker + * Internal helper object for logging trace points to RTC + * IOPMrootDomain and only IOPMrootDomain should instantiate + * exactly one of these. + */ + +typedef void (*IOPMTracePointHandler)( + void * target, uint32_t code, uint32_t data ); + +class PMTraceWorker : public OSObject +{ + OSDeclareDefaultStructors(PMTraceWorker) +public: + typedef enum { kPowerChangeStart, kPowerChangeCompleted } change_t; + + static PMTraceWorker *tracer( IOPMrootDomain * ); + void tracePCIPowerChange(change_t, IOService *, uint32_t, uint32_t); + void tracePoint(uint8_t phase); + void tracePoint(uint8_t phase, uint8_t data8); + void traceDetail(uint32_t detail); + void traceLoginWindowPhase(uint8_t phase); + int recordTopLevelPCIDevice(IOService *); + void RTC_TRACE(void); + virtual bool serialize(OSSerialize *s) const; + + IOPMTracePointHandler tracePointHandler; + void * tracePointTarget; +private: + IOPMrootDomain *owner; + IOLock *pciMappingLock; + OSArray *pciDeviceBitMappings; + + uint8_t addedToRegistry; + uint8_t tracePhase; + uint8_t loginWindowPhase; + uint8_t traceData8; + uint32_t traceData32; }; /* @@ -327,7 +462,7 @@ class PMAssertionsTracker : public OSObject OSDeclareFinalStructors(PMAssertionsTracker) public: static PMAssertionsTracker *pmAssertionsTracker( IOPMrootDomain * ); - + IOReturn createAssertion(IOPMDriverAssertionType, IOPMDriverAssertionLevel, IOService *, 
const char *, IOPMDriverAssertionID *); IOReturn releaseAssertion(IOPMDriverAssertionID); IOReturn setAssertionLevel(IOPMDriverAssertionID, IOPMDriverAssertionLevel); @@ -353,7 +488,7 @@ class PMAssertionsTracker : public OSObject IOService *ownerService; IOPMDriverAssertionLevel level; } PMAssertStruct; - + uint32_t tabulateProducerCount; uint32_t tabulateConsumerCount; @@ -363,52 +498,14 @@ class PMAssertionsTracker : public OSObject IOPMrootDomain *owner; OSArray *assertionsArray; IOLock *assertionsArrayLock; - IOPMDriverAssertionID issuingUniqueID; + IOPMDriverAssertionID issuingUniqueID __attribute__((aligned(8))); /* aligned for atomic access */ IOPMDriverAssertionType assertionsKernel; IOPMDriverAssertionType assertionsUser; IOPMDriverAssertionType assertionsCombined; }; - + OSDefineMetaClassAndFinalStructors(PMAssertionsTracker, OSObject); - -/* - * PMTraceWorker - * Internal helper object for logging trace points to RTC - * IOPMrootDomain and only IOPMrootDomain should instantiate - * exactly one of these. - */ - -typedef void (*IOPMTracePointHandler)( - void * target, uint32_t code, uint32_t data ); - -class PMTraceWorker : public OSObject -{ - OSDeclareDefaultStructors(PMTraceWorker) -public: - typedef enum { kPowerChangeStart, kPowerChangeCompleted } change_t; - - static PMTraceWorker *tracer( IOPMrootDomain * ); - void tracePCIPowerChange(change_t, IOService *, uint32_t, uint32_t); - void tracePoint(uint8_t phase); - void traceLoginWindowPhase(uint8_t phase); - int recordTopLevelPCIDevice(IOService *); - void RTC_TRACE(void); - virtual bool serialize(OSSerialize *s) const; - - IOPMTracePointHandler tracePointHandler; - void * tracePointTarget; -private: - IOPMrootDomain *owner; - IOLock *pciMappingLock; - OSArray *pciDeviceBitMappings; - - uint8_t tracePhase; - uint8_t loginWindowPhase; - uint8_t addedToRegistry; - uint8_t unused0; - uint32_t pciBusyBitMask; -}; - + /* * PMHaltWorker * Internal helper object for Shutdown/Restart notifications. 
@@ -441,6 +538,19 @@ OSDefineMetaClassAndFinalStructors( PMHaltWorker, OSObject ) #define super IOService OSDefineMetaClassAndFinalStructors(IOPMrootDomain, IOService) +static void IOPMRootDomainWillShutdown(void) +{ + if (OSCompareAndSwap(0, 1, &gWillShutdown)) + { + OSKext::willShutdown(); + for (int i = 0; i < 100; i++) + { + if (OSCompareAndSwap(0, 1, &gSleepOrShutdownPending)) break; + IOSleep( 100 ); + } + } +} + extern "C" { IONotifier * registerSleepWakeInterest(IOServiceInterestHandler handler, void * self, void * ref) @@ -473,61 +583,60 @@ extern "C" return gRootDomain->shutdownSystem(); } - void IOSystemShutdownNotification ( void ) + void IOSystemShutdownNotification(void) { - if (OSCompareAndSwap(0, 1, &gWillShutdown)) - { - OSKext::willShutdown(); - for (int i = 0; i < 100; i++) - { - if (OSCompareAndSwap(0, 1, &gSleepOrShutdownPending)) break; - IOSleep( 100 ); - } - } + IOPMRootDomainWillShutdown(); + if (OSCompareAndSwap(0, 1, &gPagingOff)) + { +#if !CONFIG_EMBEDDED + gRootDomain->handlePlatformHaltRestart(kPEPagingOff); +#endif + } } int sync_internal(void); } /* -A device is always in the highest power state which satisfies its driver, its policy-maker, and any power domain -children it has, but within the constraint of the power state provided by its parent. The driver expresses its desire by -calling changePowerStateTo(), the policy-maker expresses its desire by calling changePowerStateToPriv(), and the children -express their desires by calling requestPowerDomainState(). - -The Root Power Domain owns the policy for idle and demand sleep and doze for the system. It is a power-managed IOService just -like the others in the system. It implements several power states which correspond to what we see as Sleep, Doze, etc. - -The sleep/doze policy is as follows: -Sleep and Doze are prevented if the case is open so that nobody will think the machine is off and plug/unplug cards. 
-Sleep and Doze are prevented if the sleep timeout slider in the preferences panel is at zero. -The system cannot Sleep, but can Doze if some object in the tree is in a power state marked kIOPMPreventSystemSleep. - -These three conditions are enforced using the "driver clamp" by calling changePowerStateTo(). For example, if the case is -opened, changePowerStateTo(ON_STATE) is called to hold the system on regardless of the desires of the children of the root or -the state of the other clamp. - -Demand Sleep/Doze is initiated by pressing the front panel power button, closing the clamshell, or selecting the menu item. -In this case the root's parent actually initiates the power state change so that the root has no choice and does not give -applications the opportunity to veto the change. - -Idle Sleep/Doze occurs if no objects in the tree are in a state marked kIOPMPreventIdleSleep. When this is true, the root's -children are not holding the root on, so it sets the "policy-maker clamp" by calling changePowerStateToPriv(ON_STATE) -to hold itself on until the sleep timer expires. This timer is set for the difference between the sleep timeout slider and -the larger of the display dim timeout slider and the disk spindown timeout slider in the Preferences panel. For example, if -the system is set to sleep after thirty idle minutes, and the display and disk are set to sleep after five idle minutes, -when there is no longer an object in the tree holding the system out of Idle Sleep (via kIOPMPreventIdleSleep), the root -sets its timer for 25 minutes (30 - 5). When the timer expires, it releases its clamp and now nothing is holding it awake, -so it falls asleep. - -Demand sleep is prevented when the system is booting. When preferences are transmitted by the loginwindow at the end of -boot, a flag is cleared, and this allows subsequent Demand Sleep. 
- -The system will not Sleep, but will Doze if some object calls setSleepSupported(kPCICantSleep) during a power change to the sleep state (this can be done by the PCI Aux Power Supply drivers, Slots99, MacRISC299, etc.). This is not enforced with -a clamp, but sets a flag which is noticed before actually sleeping the kernel. If the flag is set, the root steps up -one power state from Sleep to Doze, and any objects in the tree for which this is relevent will act appropriately (USB and -ADB will turn on again so that they can wake the system out of Doze (keyboard/mouse activity will cause the Display Wrangler -to be tickled)). +A device is always in the highest power state which satisfies its driver, +its policy-maker, and any power children it has, but within the constraint +of the power state provided by its parent. The driver expresses its desire by +calling changePowerStateTo(), the policy-maker expresses its desire by calling +changePowerStateToPriv(), and the children express their desires by calling +requestPowerDomainState(). + +The Root Power Domain owns the policy for idle and demand sleep for the system. +It is a power-managed IOService just like the others in the system. +It implements several power states which map to what we see as Sleep and On. + +The sleep policy is as follows: +1. Sleep is prevented if the case is open so that nobody will think the machine + is off and plug/unplug cards. +2. Sleep is prevented if the sleep timeout slider in the prefs panel is zero. +3. System cannot Sleep if some object in the tree is in a power state marked + kIOPMPreventSystemSleep. + +These three conditions are enforced using the "driver clamp" by calling +changePowerStateTo(). For example, if the case is opened, +changePowerStateTo(ON_STATE) is called to hold the system on regardless +of the desires of the children of the root or the state of the other clamp. 
+ +Demand Sleep is initiated by pressing the front panel power button, closing +the clamshell, or selecting the menu item. In this case the root's parent +actually initiates the power state change so that the root domain has no +choice and does not give applications the opportunity to veto the change. + +Idle Sleep occurs if no objects in the tree are in a state marked +kIOPMPreventIdleSleep. When this is true, the root's children are not holding +the root on, so it sets the "policy-maker clamp" by calling +changePowerStateToPriv(ON_STATE) to hold itself on until the sleep timer expires. +This timer is set for the difference between the sleep timeout slider and the +display dim timeout slider. When the timer expires, it releases its clamp and +now nothing is holding it awake, so it falls asleep. + +Demand sleep is prevented when the system is booting. When preferences are +transmitted by the loginwindow at the end of boot, a flag is cleared, +and this allows subsequent Demand Sleep. */ //****************************************************************************** @@ -547,16 +656,27 @@ IOPMrootDomain * IOPMrootDomain::construct( void ) static void disk_sync_callout( thread_call_param_t p0, thread_call_param_t p1 ) { - IOService *rootDomain = (IOService *) p0; - unsigned long pmRef = (unsigned long) p1; + IOService * rootDomain = (IOService *) p0; + uint32_t notifyRef = (uint32_t)(uintptr_t) p1; + uint32_t powerState = rootDomain->getPowerState(); - DLOG("disk_sync_callout start\n"); + DLOG("disk_sync_callout ps=%u\n", powerState); + if (ON_STATE == powerState) + { #if HIBERNATION - IOHibernateSystemSleep(); + IOHibernateSystemSleep(); #endif - sync_internal(); - rootDomain->allowPowerChange(pmRef); + sync_internal(); + } +#if HIBERNATION + else + { + IOHibernateSystemPostWake(); + } +#endif + + rootDomain->allowPowerChange(notifyRef); DLOG("disk_sync_callout finish\n"); } @@ -601,11 +721,11 @@ sysctl_sleepwaketime SYSCTL_HANDLER_ARGS } static SYSCTL_PROC(_kern, 
OID_AUTO, sleeptime, - CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, &gIOLastSleepTime, 0, sysctl_sleepwaketime, "S,timeval", ""); static SYSCTL_PROC(_kern, OID_AUTO, waketime, - CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, &gIOLastWakeTime, 0, sysctl_sleepwaketime, "S,timeval", ""); @@ -617,7 +737,7 @@ sysctl_willshutdown int error = sysctl_io_number(req, gWillShutdown, sizeof(int), &new_value, &changed); if (changed) { if (!gWillShutdown && (new_value == 1)) { - IOSystemShutdownNotification(); + IOPMRootDomainWillShutdown(); } else error = EINVAL; } @@ -625,7 +745,7 @@ sysctl_willshutdown } static SYSCTL_PROC(_kern, OID_AUTO, willshutdown, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, 0, sysctl_willshutdown, "I", ""); #if !CONFIG_EMBEDDED @@ -661,16 +781,19 @@ sysctl_progressmeter } static SYSCTL_PROC(_kern, OID_AUTO, progressmeterenable, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, 0, sysctl_progressmeterenable, "I", ""); static SYSCTL_PROC(_kern, OID_AUTO, progressmeter, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, 0, sysctl_progressmeter, "I", ""); #endif +static SYSCTL_INT(_debug, OID_AUTO, darkwake, CTLFLAG_RW, &gDarkWakeFlags, 0, ""); + static const OSSymbol * gIOPMSettingAutoWakeSecondsKey; +static const OSSymbol * gIOPMSettingDebugWakeRelativeKey; static const OSSymbol * gIOPMSettingMaintenanceWakeCalendarKey; //****************************************************************************** @@ -684,11 +807,13 @@ bool IOPMrootDomain::start( IOService * nub ) { OSIterator *psIterator; 
OSDictionary *tmpDict; + IORootParent * patriarch; super::start(nub); gRootDomain = this; gIOPMSettingAutoWakeSecondsKey = OSSymbol::withCString(kIOPMSettingAutoWakeSecondsKey); + gIOPMSettingDebugWakeRelativeKey = OSSymbol::withCString(kIOPMSettingDebugWakeRelativeKey); gIOPMSettingMaintenanceWakeCalendarKey = OSSymbol::withCString(kIOPMSettingMaintenanceWakeCalendarKey); @@ -706,7 +831,7 @@ bool IOPMrootDomain::start( IOService * nub ) OSSymbol::withCString(kIOPMSettingAutoPowerSecondsKey), OSSymbol::withCString(kIOPMSettingAutoWakeCalendarKey), OSSymbol::withCString(kIOPMSettingAutoPowerCalendarKey), - OSSymbol::withCString(kIOPMSettingDebugWakeRelativeKey), + gIOPMSettingDebugWakeRelativeKey, OSSymbol::withCString(kIOPMSettingDebugPowerRelativeKey), OSSymbol::withCString(kIOPMSettingWakeOnRingKey), OSSymbol::withCString(kIOPMSettingRestartOnPowerLossKey), @@ -719,28 +844,25 @@ bool IOPMrootDomain::start( IOService * nub ) OSSymbol::withCString(kIOPMStateConsoleShutdown) }; + PE_parse_boot_argn("darkwake", &gDarkWakeFlags, sizeof(gDarkWakeFlags)); + queue_init(&aggressivesQueue); aggressivesThreadCall = thread_call_allocate(handleAggressivesFunction, this); aggressivesData = OSData::withCapacity( sizeof(AggressivesRecord) * (kPMLastAggressivenessType + 4)); featuresDictLock = IOLockAlloc(); - settingsCtrlLock = IORecursiveLockAlloc(); + settingsCtrlLock = IOLockAlloc(); setPMRootDomain(this); extraSleepTimer = thread_call_allocate( idleSleepTimerExpired, (thread_call_param_t) this); - clamshellWakeupIgnore = thread_call_allocate( - wakeupClamshellTimerExpired, - (thread_call_param_t) this); - diskSyncCalloutEntry = thread_call_allocate( &disk_sync_callout, (thread_call_param_t) this); - canSleep = true; setProperty(kIOSleepSupportedKey, true); bzero(&pmStats, sizeof(pmStats)); @@ -749,21 +871,27 @@ bool IOPMrootDomain::start( IOService * nub ) pmAssertions = PMAssertionsTracker::pmAssertionsTracker(this); - updateRunState(kRStateNormal); userDisabledAllSleep = 
false; - allowSleep = true; - sleepIsSupported = true; systemBooting = true; sleepSlider = 0; idleSleepTimerPending = false; wrangler = NULL; - sleepASAP = false; - clamshellIsClosed = false; - clamshellExists = false; - ignoringClamshell = true; - ignoringClamshellOnWake = false; + clamshellClosed = false; + clamshellExists = false; + clamshellDisabled = true; acAdaptorConnected = true; + // Set the default system capabilities at boot. + _currentCapability = kIOPMSystemCapabilityCPU | + kIOPMSystemCapabilityGraphics | + kIOPMSystemCapabilityAudio | + kIOPMSystemCapabilityNetwork; + + _pendingCapability = _currentCapability; + _desiredCapability = _currentCapability; + _highestCapability = _currentCapability; + setProperty(kIOPMSystemCapabilitiesKey, _currentCapability, 64); + queuedSleepWakeUUIDString = NULL; pmStatsAppResponses = OSArray::withCapacity(5); _statsNameKey = OSSymbol::withCString(kIOPMStatsNameKey); @@ -810,10 +938,23 @@ bool IOPMrootDomain::start( IOService * nub ) patriarch->addPowerChild(this); registerPowerDriver(this, ourPowerStates, NUM_POWER_STATES); - - // set a clamp until we sleep changePowerStateToPriv(ON_STATE); + if (gIOKitDebug & (kIOLogDriverPower1 | kIOLogDriverPower2)) + { + // Setup our PM logging & recording code + timeline = IOPMTimeline::timeline(this); + if (timeline) { + OSDictionary *tlInfo = timeline->copyInfoDictionary(); + + if (tlInfo) + { + setProperty(kIOPMTimelineDictionaryKey, tlInfo); + tlInfo->release(); + } + } + } + // install power change handler gSysPowerDownNotifier = registerPrioritySleepWakeInterest( &sysPowerDownHandler, this, 0); @@ -823,22 +964,12 @@ bool IOPMrootDomain::start( IOService * nub ) { _displayWranglerNotifier = addMatchingNotification( gIOPublishNotification, tmpDict, - (IOServiceMatchingNotificationHandler) &displayWranglerPublished, + (IOServiceMatchingNotificationHandler) &displayWranglerMatchPublished, this, 0); tmpDict->release(); } #endif - // Battery location published - ApplePMU support 
only - if ((tmpDict = serviceMatching("IOPMPowerSource"))) - { - _batteryPublishNotifier = addMatchingNotification( - gIOPublishNotification, tmpDict, - (IOServiceMatchingNotificationHandler) &batteryPublished, - this, this); - tmpDict->release(); - } - const OSSymbol *ucClassName = OSSymbol::withCStringNoCopy("RootDomainUserClient"); setProperty(gIOUserClientClassKey, (OSObject *) ucClassName); ucClassName->release(); @@ -874,7 +1005,6 @@ bool IOPMrootDomain::start( IOService * nub ) return true; } - //****************************************************************************** // setProperties // @@ -888,44 +1018,39 @@ IOReturn IOPMrootDomain::setProperties( OSObject * props_obj ) OSDictionary *dict = OSDynamicCast(OSDictionary, props_obj); OSBoolean *b; OSNumber *n; - OSString *str; + OSDictionary *d; OSSymbol *type; OSObject *obj; unsigned int i; - const OSSymbol *boot_complete_string = - OSSymbol::withCString("System Boot Complete"); - const OSSymbol *sys_shutdown_string = - OSSymbol::withCString("System Shutdown"); - const OSSymbol *stall_halt_string = - OSSymbol::withCString("StallSystemAtHalt"); - const OSSymbol *battery_warning_disabled_string = - OSSymbol::withCString("BatteryWarningsDisabled"); - const OSSymbol *idle_seconds_string = - OSSymbol::withCString("System Idle Seconds"); + const OSSymbol *publish_simulated_battery_string = OSSymbol::withCString("SoftwareSimulatedBatteries"); + const OSSymbol *boot_complete_string = OSSymbol::withCString("System Boot Complete"); + const OSSymbol *sys_shutdown_string = OSSymbol::withCString("System Shutdown"); + const OSSymbol *stall_halt_string = OSSymbol::withCString("StallSystemAtHalt"); + const OSSymbol *battery_warning_disabled_string = OSSymbol::withCString("BatteryWarningsDisabled"); + const OSSymbol *idle_seconds_string = OSSymbol::withCString("System Idle Seconds"); + const OSSymbol *sleepdisabled_string = OSSymbol::withCString("SleepDisabled"); + const OSSymbol *ondeck_sleepwake_uuid_string = 
OSSymbol::withCString(kIOPMSleepWakeUUIDKey); + const OSSymbol *loginwindow_tracepoint_string = OSSymbol::withCString(kIOPMLoginWindowSecurityDebugKey); + const OSSymbol *pmTimelineLogging_string = OSSymbol::withCString(kIOPMTimelineDictionaryKey); #if HIBERNATION - const OSSymbol *hibernatemode_string = - OSSymbol::withCString(kIOHibernateModeKey); - const OSSymbol *hibernatefile_string = - OSSymbol::withCString(kIOHibernateFileKey); - const OSSymbol *hibernatefreeratio_string = - OSSymbol::withCString(kIOHibernateFreeRatioKey); - const OSSymbol *hibernatefreetime_string = - OSSymbol::withCString(kIOHibernateFreeTimeKey); + const OSSymbol *hibernatemode_string = OSSymbol::withCString(kIOHibernateModeKey); + const OSSymbol *hibernatefile_string = OSSymbol::withCString(kIOHibernateFileKey); + const OSSymbol *hibernatefreeratio_string = OSSymbol::withCString(kIOHibernateFreeRatioKey); + const OSSymbol *hibernatefreetime_string = OSSymbol::withCString(kIOHibernateFreeTimeKey); #endif - const OSSymbol *sleepdisabled_string = - OSSymbol::withCString("SleepDisabled"); - const OSSymbol *ondeck_sleepwake_uuid_string = - OSSymbol::withCString(kIOPMSleepWakeUUIDKey); - const OSSymbol *loginwindow_tracepoint_string = - OSSymbol::withCString(kIOPMLoginWindowSecurityDebugKey); - - if(!dict) + + if (!dict) { return_value = kIOReturnBadArgument; goto exit; } + if ((b = OSDynamicCast(OSBoolean, dict->getObject(publish_simulated_battery_string)))) + { + publishResource(publish_simulated_battery_string, kOSBooleanTrue); + } + if ((n = OSDynamicCast(OSNumber, dict->getObject(idle_seconds_string)))) { setProperty(idle_seconds_string, n); @@ -936,44 +1061,53 @@ IOReturn IOPMrootDomain::setProperties( OSObject * props_obj ) { pmPowerStateQueue->submitPowerEvent( kPowerEventSystemBootCompleted ); } - - if( battery_warning_disabled_string - && dict->getObject(battery_warning_disabled_string)) + + if( battery_warning_disabled_string && dict->getObject(battery_warning_disabled_string)) { - 
setProperty( battery_warning_disabled_string, - dict->getObject(battery_warning_disabled_string)); + setProperty( battery_warning_disabled_string, dict->getObject(battery_warning_disabled_string)); } - if( sys_shutdown_string - && (b = OSDynamicCast(OSBoolean, dict->getObject(sys_shutdown_string)))) + if (pmTimelineLogging_string && (d = OSDynamicCast(OSDictionary, dict->getObject(pmTimelineLogging_string)))) + { + if (timeline && timeline->setProperties(d)) + { + OSDictionary *tlInfo = timeline->copyInfoDictionary(); + if (tlInfo) { + setProperty(kIOPMTimelineDictionaryKey, tlInfo); + tlInfo->release(); + } + } + } + + if( sys_shutdown_string && (b = OSDynamicCast(OSBoolean, dict->getObject(sys_shutdown_string)))) { pmPowerStateQueue->submitPowerEvent(kPowerEventSystemShutdown, (void *) b); } - if( stall_halt_string - && (b = OSDynamicCast(OSBoolean, dict->getObject(stall_halt_string))) ) + if( stall_halt_string && (b = OSDynamicCast(OSBoolean, dict->getObject(stall_halt_string))) ) { setProperty(stall_halt_string, b); } #if HIBERNATION if ( hibernatemode_string - && (n = OSDynamicCast(OSNumber, dict->getObject(hibernatemode_string)))) + && (n = OSDynamicCast(OSNumber, dict->getObject(hibernatemode_string)))) { setProperty(hibernatemode_string, n); } if ( hibernatefreeratio_string - && (n = OSDynamicCast(OSNumber, dict->getObject(hibernatefreeratio_string)))) + && (n = OSDynamicCast(OSNumber, dict->getObject(hibernatefreeratio_string)))) { setProperty(hibernatefreeratio_string, n); } if ( hibernatefreetime_string - && (n = OSDynamicCast(OSNumber, dict->getObject(hibernatefreetime_string)))) + && (n = OSDynamicCast(OSNumber, dict->getObject(hibernatefreetime_string)))) { setProperty(hibernatefreetime_string, n); - } + } + OSString *str; if ( hibernatefile_string - && (str = OSDynamicCast(OSString, dict->getObject(hibernatefile_string)))) + && (str = OSDynamicCast(OSString, dict->getObject(hibernatefile_string)))) { setProperty(hibernatefile_string, str); } @@ 
-985,28 +1119,14 @@ IOReturn IOPMrootDomain::setProperties( OSObject * props_obj ) setProperty(sleepdisabled_string, b); pmPowerStateQueue->submitPowerEvent(kPowerEventUserDisabledSleep, (void *) b); } - if (ondeck_sleepwake_uuid_string && (obj = dict->getObject(ondeck_sleepwake_uuid_string))) { - // Clear the currently published UUID - if (kOSBooleanFalse == obj) - { - publishSleepWakeUUID(NULL); + if(pmPowerStateQueue) { + obj->retain(); + pmPowerStateQueue->submitPowerEvent(kPowerEventQueueSleepWakeUUID, (void *)obj); } - // Cache UUID for an upcoming sleep/wake - if ((str = OSDynamicCast(OSString, obj))) - { - if (queuedSleepWakeUUIDString) { - queuedSleepWakeUUIDString->release(); - queuedSleepWakeUUIDString = NULL; - } - queuedSleepWakeUUIDString = str; - queuedSleepWakeUUIDString->retain(); - DLOG("SleepWake UUID queued: %s\n", - queuedSleepWakeUUIDString->getCStringNoCopy()); - } } if (loginwindow_tracepoint_string @@ -1024,6 +1144,10 @@ IOReturn IOPMrootDomain::setProperties( OSObject * props_obj ) { setProperty(kIOPMDeepSleepDelayKey, n); } + if ((b = OSDynamicCast(OSBoolean, dict->getObject(kIOPMDestroyFVKeyOnStandbyKey)))) + { + setProperty(kIOPMDestroyFVKeyOnStandbyKey, b); + } // Relay our allowed PM settings onto our registered PM clients for(i = 0; i < allowedPMSettings->getCount(); i++) { @@ -1034,24 +1158,31 @@ IOReturn IOPMrootDomain::setProperties( OSObject * props_obj ) obj = dict->getObject(type); if(!obj) continue; - if ((gIOPMSettingAutoWakeSecondsKey == type) && ((n = OSDynamicCast(OSNumber, obj)))) - { - UInt32 rsecs = n->unsigned32BitValue(); - if (!rsecs) - autoWakeStart = autoWakeEnd = 0; - else - { - AbsoluteTime deadline; - clock_interval_to_deadline(rsecs + kAutoWakePostWindow, kSecondScale, &deadline); - autoWakeEnd = AbsoluteTime_to_scalar(&deadline); - if (rsecs > kAutoWakePreWindow) - rsecs -= kAutoWakePreWindow; - else - rsecs = 0; - clock_interval_to_deadline(rsecs, kSecondScale, &deadline); - autoWakeStart = 
AbsoluteTime_to_scalar(&deadline); - } - } + if ((gIOPMSettingAutoWakeSecondsKey == type) && ((n = OSDynamicCast(OSNumber, obj)))) + { + UInt32 rsecs = n->unsigned32BitValue(); + if (!rsecs) + autoWakeStart = autoWakeEnd = 0; + else + { + AbsoluteTime deadline; + clock_interval_to_deadline(rsecs + kAutoWakePostWindow, kSecondScale, &deadline); + autoWakeEnd = AbsoluteTime_to_scalar(&deadline); + if (rsecs > kAutoWakePreWindow) + rsecs -= kAutoWakePreWindow; + else + rsecs = 0; + clock_interval_to_deadline(rsecs, kSecondScale, &deadline); + autoWakeStart = AbsoluteTime_to_scalar(&deadline); + } + } + if (gIOPMSettingDebugWakeRelativeKey == type) + { + if ((n = OSDynamicCast(OSNumber, obj))) + _debugWakeSeconds = n->unsigned32BitValue(); + else + _debugWakeSeconds = 0; + } return_value = setPMSetting(type, obj); @@ -1059,14 +1190,16 @@ IOReturn IOPMrootDomain::setProperties( OSObject * props_obj ) } exit: + if(publish_simulated_battery_string) publish_simulated_battery_string->release(); if(boot_complete_string) boot_complete_string->release(); if(sys_shutdown_string) sys_shutdown_string->release(); if(stall_halt_string) stall_halt_string->release(); - if (battery_warning_disabled_string) battery_warning_disabled_string->release(); + if(battery_warning_disabled_string) battery_warning_disabled_string->release(); if(idle_seconds_string) idle_seconds_string->release(); if(sleepdisabled_string) sleepdisabled_string->release(); if(ondeck_sleepwake_uuid_string) ondeck_sleepwake_uuid_string->release(); if(loginwindow_tracepoint_string) loginwindow_tracepoint_string->release(); + if(pmTimelineLogging_string) pmTimelineLogging_string->release(); #if HIBERNATION if(hibernatemode_string) hibernatemode_string->release(); if(hibernatefile_string) hibernatefile_string->release(); @@ -1076,114 +1209,21 @@ IOReturn IOPMrootDomain::setProperties( OSObject * props_obj ) return return_value; } +// MARK: - +// MARK: Aggressiveness 
//****************************************************************************** -// aggressivenessChanged +// setAggressiveness // -// We are behind the command gate to examine changes to aggressives. +// Override IOService::setAggressiveness() //****************************************************************************** -void IOPMrootDomain::aggressivenessChanged( void ) +IOReturn IOPMrootDomain::setAggressiveness( + unsigned long type, + unsigned long value ) { - unsigned long minutesToSleep = 0; - unsigned long minutesToDisplayDim = 0; - - ASSERT_GATED(); - - // Fetch latest display and system sleep slider values. - getAggressiveness(kPMMinutesToSleep, &minutesToSleep); - getAggressiveness(kPMMinutesToDim, &minutesToDisplayDim); - DLOG("aggressiveness changed system %u, display %u\n", - (uint32_t) minutesToSleep, (uint32_t) minutesToDisplayDim); - - DLOG("idle time -> %ld secs (ena %d)\n", - idleSeconds, (minutesToSleep != 0)); - - if (0x7fffffff == minutesToSleep) - minutesToSleep = idleSeconds; - - // How long to wait before sleeping the system once the displays turns - // off is indicated by 'extraSleepDelay'. - - if ( minutesToSleep > minutesToDisplayDim ) { - extraSleepDelay = minutesToSleep - minutesToDisplayDim; - } - else { - extraSleepDelay = 0; - } - - // system sleep timer was disabled, but not anymore. - if ( (sleepSlider == 0) && (minutesToSleep != 0) ) { - if (!wrangler) - { - sleepASAP = false; - changePowerStateToPriv(ON_STATE); - if (idleSeconds) - { - startIdleSleepTimer( idleSeconds ); - } - } - else - { - // Start idle sleep timer if wrangler went to sleep - // while system sleep was disabled. 
- - sleepASAP = false; - if (wranglerAsleep) - { - AbsoluteTime now; - uint64_t nanos; - uint32_t minutesSinceDisplaySleep = 0; - uint32_t sleepDelay; - - clock_get_uptime(&now); - if (CMP_ABSOLUTETIME(&now, &wranglerSleepTime) > 0) - { - SUB_ABSOLUTETIME(&now, &wranglerSleepTime); - absolutetime_to_nanoseconds(now, &nanos); - minutesSinceDisplaySleep = nanos / (60000000000ULL); - } - - if (extraSleepDelay > minutesSinceDisplaySleep) - { - sleepDelay = extraSleepDelay - minutesSinceDisplaySleep; - } - else - { - // 1 min idle sleep. - sleepDelay = 1; - } - - startIdleSleepTimer(sleepDelay * 60); - DLOG("display slept %u min, set idle timer to %u min\n", - minutesSinceDisplaySleep, sleepDelay); - } - } - } - - sleepSlider = minutesToSleep; - if ( sleepSlider == 0 ) { - cancelIdleSleepTimer(); - // idle sleep is now disabled - adjustPowerState(); - // make sure we're powered - patriarch->wakeSystem(); - } -} - - -//****************************************************************************** -// setAggressiveness -// -// Override IOService::setAggressiveness() -//****************************************************************************** - -IOReturn IOPMrootDomain::setAggressiveness( - unsigned long type, - unsigned long value ) -{ - return setAggressiveness( type, value, 0 ); -} + return setAggressiveness( type, value, 0 ); +} /* * Private setAggressiveness() with an internal options argument. 
@@ -1197,8 +1237,8 @@ IOReturn IOPMrootDomain::setAggressiveness( AggressivesRequest * request; bool found = false; - DLOG("setAggressiveness 0x%x = %u, options 0x%x\n", - (uint32_t) type, (uint32_t) value, (uint32_t) options); + DLOG("setAggressiveness(%x) 0x%x = %u\n", + (uint32_t) options, (uint32_t) type, (uint32_t) value); request = IONew(AggressivesRequest, 1); if (!request) @@ -1255,7 +1295,6 @@ IOReturn IOPMrootDomain::setAggressiveness( return kIOReturnSuccess; } - //****************************************************************************** // getAggressiveness // @@ -1328,8 +1367,8 @@ IOReturn IOPMrootDomain::getAggressiveness ( if (source) { - DLOG("getAggressiveness 0x%x = %u, source %d\n", - (uint32_t) type, value, source); + DLOG("getAggressiveness(%d) 0x%x = %u\n", + source, (uint32_t) type, value); *outLevel = (unsigned long) value; return kIOReturnSuccess; } @@ -1341,7 +1380,6 @@ IOReturn IOPMrootDomain::getAggressiveness ( } } - //****************************************************************************** // joinAggressiveness // @@ -1356,7 +1394,7 @@ IOReturn IOPMrootDomain::joinAggressiveness( if (!service || (service == this)) return kIOReturnBadArgument; - DLOG("joinAggressiveness %s (%p)\n", service->getName(), service); + DLOG("joinAggressiveness %s %p\n", service->getName(), service); request = IONew(AggressivesRequest, 1); if (!request) @@ -1377,7 +1415,6 @@ IOReturn IOPMrootDomain::joinAggressiveness( return kIOReturnSuccess; } - //****************************************************************************** // handleAggressivesRequests // @@ -1443,7 +1480,7 @@ void IOPMrootDomain::handleAggressivesRequests( void ) broadcast = true; record->flags |= (kAggressivesRecordFlagMinValue | kAggressivesRecordFlagModified); - DLOG("quick spindown accelerated, was %u min\n", + DLOG("disk spindown accelerated, was %u min\n", record->value); } } @@ -1545,11 +1582,12 @@ void IOPMrootDomain::handleAggressivesRequests( void ) // Submit a power 
event to handle those changes on the PM work loop. if (pingSelf && pmPowerStateQueue) { - pmPowerStateQueue->submitPowerEvent( kPowerEventAggressivenessChanged ); + pmPowerStateQueue->submitPowerEvent( + kPowerEventPolicyStimulus, + (void *) kStimulusAggressivenessChanged ); } } - //****************************************************************************** // synchronizeAggressives // @@ -1564,6 +1602,7 @@ void IOPMrootDomain::synchronizeAggressives( IOService * service; AggressivesRequest * request; const AggressivesRecord * record; + IOPMDriverCallEntry callEntry; uint32_t value; int i; @@ -1580,7 +1619,7 @@ void IOPMrootDomain::synchronizeAggressives( if (service) { - if (service->assertPMThreadCall()) + if (service->assertPMDriverCall(&callEntry)) { for (i = 0, record = array; i < count; i++, record++) { @@ -1588,18 +1627,17 @@ void IOPMrootDomain::synchronizeAggressives( if (record->flags & kAggressivesRecordFlagMinValue) value = kAggressivesMinValue; - DLOG("synchronizeAggressives 0x%x = %u to %s\n", + _LOG("synchronizeAggressives 0x%x = %u to %s\n", record->type, value, service->getName()); service->setAggressiveness(record->type, value); } - service->deassertPMThreadCall(); + service->deassertPMDriverCall(&callEntry); } service->release(); // retained by joinAggressiveness() } } } - //****************************************************************************** // broadcastAggressives // @@ -1610,18 +1648,19 @@ void IOPMrootDomain::broadcastAggressives( const AggressivesRecord * array, int count ) { - IORegistryIterator * iter; - IORegistryEntry * entry; - IOPowerConnection * connect; + IORegistryIterator * iter; + IORegistryEntry * entry; + IOPowerConnection * connect; IOService * service; const AggressivesRecord * record; + IOPMDriverCallEntry callEntry; uint32_t value; int i; - iter = IORegistryIterator::iterateOver( - this, gIOPowerPlane, kIORegistryIterateRecursively); + iter = IORegistryIterator::iterateOver( + this, gIOPowerPlane, 
kIORegistryIterateRecursively); if (iter) - { + { do { iter->reset(); @@ -1633,7 +1672,7 @@ void IOPMrootDomain::broadcastAggressives( if ((service = (IOService *) connect->copyChildEntry(gIOPowerPlane))) { - if (service->assertPMThreadCall()) + if (service->assertPMDriverCall(&callEntry)) { for (i = 0, record = array; i < count; i++, record++) { @@ -1642,12 +1681,12 @@ void IOPMrootDomain::broadcastAggressives( value = record->value; if (record->flags & kAggressivesRecordFlagMinValue) value = kAggressivesMinValue; - DLOG("broadcastAggressives %x = %u to %s\n", + _LOG("broadcastAggressives %x = %u to %s\n", record->type, value, service->getName()); service->setAggressiveness(record->type, value); } } - service->deassertPMThreadCall(); + service->deassertPMDriverCall(&callEntry); } service->release(); } @@ -1658,6 +1697,8 @@ void IOPMrootDomain::broadcastAggressives( } } +// MARK: - +// MARK: System Sleep //****************************************************************************** // startIdleSleepTimer @@ -1678,7 +1719,6 @@ void IOPMrootDomain::startIdleSleepTimer( uint32_t inSeconds ) } } - //****************************************************************************** // cancelIdleSleepTimer // @@ -1687,7 +1727,7 @@ void IOPMrootDomain::startIdleSleepTimer( uint32_t inSeconds ) void IOPMrootDomain::cancelIdleSleepTimer( void ) { ASSERT_GATED(); - if (idleSleepTimerPending) + if (idleSleepTimerPending) { DLOG("idle timer cancelled\n"); thread_call_cancel(extraSleepTimer); @@ -1695,7 +1735,6 @@ void IOPMrootDomain::cancelIdleSleepTimer( void ) } } - //****************************************************************************** // idleSleepTimerExpired // @@ -1707,13 +1746,6 @@ static void idleSleepTimerExpired( ((IOPMrootDomain *)us)->handleSleepTimerExpiration(); } -static void wakeupClamshellTimerExpired( - thread_call_param_t us, thread_call_param_t ) -{ - ((IOPMrootDomain *)us)->stopIgnoringClamshellEventsDuringWakeup(); -} - - 
//****************************************************************************** // handleSleepTimerExpiration // @@ -1747,41 +1779,34 @@ void IOPMrootDomain::handleSleepTimerExpiration( void ) return; } - // accelerate disk spin down if spin down timer is non-zero setQuickSpinDownTimeout(); - - sleepASAP = true; - adjustPowerState(); + adjustPowerState(true); } - //****************************************************************************** -// stopIgnoringClamshellEventsDuringWakeup +// setQuickSpinDownTimeout // //****************************************************************************** -void IOPMrootDomain::stopIgnoringClamshellEventsDuringWakeup( void ) +void IOPMrootDomain::setQuickSpinDownTimeout( void ) { - if (!getPMworkloop()->inGate()) - { - getPMworkloop()->runAction( - OSMemberFunctionCast(IOWorkLoop::Action, this, - &IOPMrootDomain::stopIgnoringClamshellEventsDuringWakeup), - this); - return; - } - ASSERT_GATED(); + setAggressiveness( + kPMMinutesToSpinDown, 0, kAggressivesOptionQuickSpindownEnable ); +} - // Allow clamshell-induced sleep now - ignoringClamshellOnWake = false; +//****************************************************************************** +// restoreUserSpinDownTimeout +// +//****************************************************************************** - // Re-send clamshell event, in case it causes a sleep - if (clamshellIsClosed) - handlePowerNotification( kLocalEvalClamshellCommand ); +void IOPMrootDomain::restoreUserSpinDownTimeout( void ) +{ + ASSERT_GATED(); + setAggressiveness( + kPMMinutesToSpinDown, 0, kAggressivesOptionQuickSpindownDisable ); } - //****************************************************************************** // sleepSystem // @@ -1806,21 +1831,17 @@ IOReturn IOPMrootDomain::sleepSystemOptions( OSDictionary *options ) if (options && options->getObject("OSSwitch")) { - // Log specific sleep cause for OS Switch hibernation - return privateSleepSystem( kIOPMSleepReasonOSSwitchHibernation); - + return 
privateSleepSystem( kIOPMSleepReasonOSSwitchHibernate); } else { - return privateSleepSystem( kIOPMSleepReasonSoftware); - } } /* private */ IOReturn IOPMrootDomain::privateSleepSystem( uint32_t sleepReason ) { - static const char * IOPMSleepReasons[kIOPMSleepReasonMax] = { + static const char * IOPMSleepReasons[] = { "", kIOPMClamshellSleepKey, kIOPMPowerButtonSleepKey, @@ -1829,711 +1850,869 @@ IOReturn IOPMrootDomain::privateSleepSystem( uint32_t sleepReason ) kIOPMIdleSleepKey, kIOPMLowPowerSleepKey, kIOPMClamshellSleepKey, - kIOPMThermalEmergencySleepKey + kIOPMThermalEmergencySleepKey, + kIOPMMaintenanceSleepKey }; - if ( userDisabledAllSleep ) - { - LOG("Sleep prevented by user disable\n"); - /* Prevent sleep of all kinds if directed to by user space */ - return kIOReturnNotPermitted; - } + PMEventDetails *details; - if ( systemBooting || systemShutdown || !allowSleep ) + if (!checkSystemCanSleep()) { - LOG("Sleep prevented by SB %d, SS %d, AS %d\n", - systemBooting, systemShutdown, allowSleep); + // Record why the system couldn't sleep + details = PMEventDetails::eventDetails(kIOPMEventTypeSleep, NULL, + sleepReason, kIOReturnNotPermitted); + + recordAndReleasePMEvent( details ); + return kIOReturnNotPermitted; + } - // Unable to sleep because system is in the process of booting or - // shutting down, or sleep has otherwise been disallowed. - return kIOReturnError; + if (timeline) + timeline->setSleepCycleInProgressFlag(true); + + // Time to publish a UUID for the Sleep --> Wake cycle + if(pmPowerStateQueue) { + pmPowerStateQueue->submitPowerEvent(kPowerEventPublishSleepWakeUUID, (void *)true); } + + // Log the beginning of system sleep. 
+ details = PMEventDetails::eventDetails(kIOPMEventTypeSleep, NULL, + sleepReason, kIOReturnSuccess); + + recordAndReleasePMEvent( details ); + // Record sleep cause in IORegistry lastSleepReason = sleepReason; - if (sleepReason && (sleepReason < kIOPMSleepReasonMax)) { + sleepReason -= (kIOPMSleepReasonClamshell - 1); + if (sleepReason && (sleepReason < sizeof(IOPMSleepReasons)/sizeof(IOPMSleepReasons[0]))) { setProperty(kRootDomainSleepReasonKey, IOPMSleepReasons[sleepReason]); } - patriarch->sleepSystem(); + if (pmPowerStateQueue) + pmPowerStateQueue->submitPowerEvent( + kPowerEventPolicyStimulus, + (void *) kStimulusDemandSystemSleep ); + return kIOReturnSuccess; } +IOReturn IOPMrootDomain::recordPMEventGated(PMEventDetails *record) +{ + // If we don't have a place to log to, we can't actually + // log anything. Chances are, the person who is asking us to do + // the PM logging has forgotten to set the right bootflags + if(!timeline) + return kIOReturnSuccess; -//****************************************************************************** -// shutdownSystem -// -//****************************************************************************** + if(gIOPMWorkLoop->inGate() == false) { + + IOReturn ret = gIOPMWorkLoop->runAction( + OSMemberFunctionCast(IOWorkLoop::Action, this, &IOPMrootDomain::recordPMEventGated), + (OSObject *)this, + (void *)record); + + return ret; + } + else { + // Now that we're guaranteed to be running in gate ... 
-IOReturn IOPMrootDomain::shutdownSystem( void ) -{ - //patriarch->shutDownSystem(); - return kIOReturnUnsupported; + // Check the validity of the argument we are given + if(!record) + return kIOReturnBadArgument; + + // Record a driver event, or a system event + if(record->eventClassifier == kIOPMEventClassDriverEvent + || record->eventClassifier == kIOPMEventClassSystemEvent) + return this->recordPMEvent(record); + + else + return kIOReturnBadArgument; + } } - -//****************************************************************************** -// restartSystem -// -//****************************************************************************** - -IOReturn IOPMrootDomain::restartSystem( void ) +IOReturn IOPMrootDomain::recordAndReleasePMEventGated(PMEventDetails *record) { - //patriarch->restartSystem(); - return kIOReturnUnsupported; -} + IOReturn ret = kIOReturnBadArgument; + if (record) + { + ret = recordPMEventGated(record); + record->release(); + } + + return ret; +} //****************************************************************************** // powerChangeDone // // This overrides powerChangeDone in IOService. -// -// Menu sleep and idle sleep move us from the ON state to the SLEEP_STATE. -// In this case: -// If we finished going to the SLEEP_STATE, and the platform is capable of -// true sleep, then sleep the kernel. Otherwise switch up to the DOZE_STATE -// which will keep almost everything as off as it can get. 
//****************************************************************************** -void IOPMrootDomain::powerChangeDone( unsigned long previousState ) +void IOPMrootDomain::powerChangeDone( unsigned long previousPowerState ) { + PMEventDetails *details; + ASSERT_GATED(); DLOG("PowerChangeDone: %u->%u\n", - (uint32_t) previousState, (uint32_t) getPowerState()); - - switch ( getPowerState() ) { - case SLEEP_STATE: - if ( previousState != ON_STATE ) - break; + (uint32_t) previousPowerState, (uint32_t) getPowerState()); + + switch ( getPowerState() ) + { + case SLEEP_STATE: { + if (previousPowerState != ON_STATE) + break; + + details = PMEventDetails::eventDetails( + kIOPMEventTypeSleepDone, + NULL, + NULL, + kIOReturnSuccess); + + recordAndReleasePMEvent( details ); - if ( canSleep ) - { - // re-enable this timer for next sleep - cancelIdleSleepTimer(); - wranglerTickled = true; + // re-enable this timer for next sleep + cancelIdleSleepTimer(); - clock_sec_t secs; - clock_usec_t microsecs; - clock_get_calendar_microtime(&secs, &microsecs); - logtime(secs); - gIOLastSleepTime.tv_sec = secs; - gIOLastSleepTime.tv_usec = microsecs; - gIOLastWakeTime.tv_sec = 0; - gIOLastWakeTime.tv_usec = 0; + clock_sec_t secs; + clock_usec_t microsecs; + clock_get_calendar_microtime(&secs, &microsecs); + logtime(secs); + gIOLastSleepTime.tv_sec = secs; + gIOLastSleepTime.tv_usec = microsecs; + gIOLastWakeTime.tv_sec = 0; + gIOLastWakeTime.tv_usec = 0; #if HIBERNATION - LOG("System %sSleep\n", gIOHibernateState ? "Safe" : ""); - - tracePoint(kIOPMTracePointSystemHibernatePhase); + LOG("System %sSleep\n", gIOHibernateState ? 
"Safe" : ""); - IOHibernateSystemHasSlept(); + IOHibernateSystemHasSlept(); - evaluateSystemSleepPolicyFinal(); + evaluateSystemSleepPolicyFinal(); #else - LOG("System Sleep\n"); + LOG("System Sleep\n"); #endif - tracePoint(kIOPMTracePointSystemSleepPlatformPhase); + getPlatform()->sleepKernel(); - getPlatform()->sleepKernel(); + // The CPU(s) are off at this point, + // Code will resume execution here upon wake. - // The CPU(s) are off at this point. When they're awakened by CPU interrupt, - // code will resume execution here. + clock_get_uptime(&systemWakeTime); - // Now we're waking... - tracePoint(kIOPMTracePointSystemWakeDriversPhase); - #if HIBERNATION - IOHibernateSystemWake(); + IOHibernateSystemWake(); #endif - // sleep transition complete - gSleepOrShutdownPending = 0; - - // trip the reset of the calendar clock - clock_wakeup_calendar(); - - // get us some power - patriarch->wakeSystem(); - - // Set indicator if UUID was set - allow it to be cleared. - if (getProperty(kIOPMSleepWakeUUIDKey)) - gSleepWakeUUIDIsSet = true; + // sleep transition complete + gSleepOrShutdownPending = 0; -#if !ROOT_DOMAIN_RUN_STATES - tellClients(kIOMessageSystemWillPowerOn, clientMessageFilter); -#endif + // trip the reset of the calendar clock + clock_wakeup_calendar(); #if HIBERNATION - LOG("System %sWake\n", gIOHibernateState ? "SafeSleep " : ""); + LOG("System %sWake\n", gIOHibernateState ? 
"SafeSleep " : ""); #endif - // log system wake - getPlatform()->PMLog(kIOPMrootDomainClass, kPMLogSystemWake, 0, 0); - lowBatteryCondition = false; + // log system wake + getPlatform()->PMLog(kIOPMrootDomainClass, kPMLogSystemWake, 0, 0); + lowBatteryCondition = false; + lastSleepReason = 0; + + // And start logging the wake event here + // TODO: Publish the wakeReason string as an integer + details = PMEventDetails::eventDetails( + kIOPMEventTypeWake, + NULL, + 0, + kIOReturnSuccess); + + recordAndReleasePMEvent( details ); + #ifndef __LP64__ - // tell the tree we're waking - systemWake(); + systemWake(); #endif - #if defined(__i386__) || defined(__x86_64__) - sleepTimerMaintenance = false; -#if ROOT_DOMAIN_RUN_STATES - OSString * wakeType = OSDynamicCast( - OSString, getProperty(kIOPMRootDomainWakeTypeKey)); - if (wakeType && wakeType->isEqualTo(kIOPMrootDomainWakeTypeLowBattery)) + wranglerTickled = false; + graphicsSuppressed = false; + darkWakePostTickle = false; + logGraphicsClamp = true; + logWranglerTickle = true; + sleepTimerMaintenance = false; + + OSString * wakeType = OSDynamicCast( + OSString, getProperty(kIOPMRootDomainWakeTypeKey)); + OSString * wakeReason = OSDynamicCast( + OSString, getProperty(kIOPMRootDomainWakeReasonKey)); + + if (wakeType && wakeType->isEqualTo(kIOPMrootDomainWakeTypeLowBattery)) + { + lowBatteryCondition = true; + darkWakeMaintenance = true; + darkWakeToSleepASAP = true; + } + else if ((gDarkWakeFlags & kDarkWakeFlagHIDTickleMask) != 0) + { + OSNumber * hibOptions = OSDynamicCast( + OSNumber, getProperty(kIOHibernateOptionsKey)); + + if (hibernateAborted || + ((hibOptions && + !(hibOptions->unsigned32BitValue() & kIOHibernateOptionDarkWake))) || + ((_debugWakeSeconds != 0) && + ((gDarkWakeFlags & kDarkWakeFlagAlarmIsDark) == 0)) || + (wakeType && ( + wakeType->isEqualTo(kIOPMRootDomainWakeTypeUser) || + wakeType->isEqualTo(kIOPMRootDomainWakeTypeAlarm)))) { - lowBatteryCondition = true; - updateRunState(kRStateMaintenance); - 
wranglerTickled = false; + wranglerTickled = true; } - else if (wakeType && !hibernateAborted && wakeType->isEqualTo(kIOPMRootDomainWakeTypeSleepTimer)) + else + if (wakeType && + wakeType->isEqualTo(kIOPMRootDomainWakeTypeMaintenance)) { - sleepTimerMaintenance = true; - updateRunState(kRStateMaintenance); - wranglerTickled = false; + darkWakeMaintenance = true; + darkWakeToSleepASAP = true; } - else if (wakeType && !hibernateAborted && wakeType->isEqualTo(kIOPMRootDomainWakeTypeMaintenance)) + else + if (wakeType && + wakeType->isEqualTo(kIOPMRootDomainWakeTypeSleepTimer)) { - updateRunState(kRStateMaintenance); - wranglerTickled = false; + darkWakeMaintenance = true; + darkWakeToSleepASAP = true; + sleepTimerMaintenance = true; } else -#endif /* ROOT_DOMAIN_RUN_STATES */ { - updateRunState(kRStateNormal); - reportUserInput(); - } -#else /* !__i386__ && !__x86_64__ */ - // stay awake for at least 30 seconds - startIdleSleepTimer(30); - reportUserInput(); -#endif - - changePowerStateToPriv(ON_STATE); - } else { - updateRunState(kRStateNormal); + // Unidentified wake source, resume to full wake if debug + // alarm is pending. - // allow us to step up a power state - patriarch->sleepToDoze(); - - // ignore children's request for higher power during doze. - changePowerStateWithOverrideTo(DOZE_STATE); + if (_debugWakeSeconds && (!wakeReason || wakeReason->isEqualTo(""))) + wranglerTickled = true; + else + darkWakeToSleepASAP = true; + } } - break; - - case DOZE_STATE: - if ( previousState != DOZE_STATE ) + else { - LOG("System Doze\n"); - } - // re-enable this timer for next sleep - cancelIdleSleepTimer(); - gSleepOrShutdownPending = 0; - - // Invalidate prior activity tickles to allow wake from doze. - if (wrangler) wrangler->changePowerStateTo(0); - break; - -#if ROOT_DOMAIN_RUN_STATES - case ON_STATE: - // SLEEP -> ON (Maintenance) - // Go back to sleep, unless cancelled by a HID event. + // Post a HID tickle immediately - except for maintenance wake. 
- if ((previousState == SLEEP_STATE) && - (runStateIndex == kRStateMaintenance) && - !wranglerTickled) - { - if (lowBatteryCondition) + if (hibernateAborted || !wakeType || + !wakeType->isEqualTo(kIOPMRootDomainWakeTypeMaintenance)) { - lastSleepReason = kIOPMSleepReasonLowPower; - setProperty(kRootDomainSleepReasonKey, kIOPMLowPowerSleepKey); + wranglerTickled = true; } else { - lastSleepReason = kIOPMSleepReasonMaintenance; - setProperty(kRootDomainSleepReasonKey, kIOPMMaintenanceSleepKey); + darkWakeMaintenance = true; + darkWakeToSleepASAP = true; } - changePowerStateWithOverrideTo(SLEEP_STATE); } - // ON -> ON triggered by R-state changes. - - if ((previousState == ON_STATE) && - (runStateIndex != nextRunStateIndex) && - (nextRunStateIndex < kRStateCount)) + if (wranglerTickled) + reportUserInput(); + else if (!darkWakeMaintenance) { - LOG("R-state changed %u->%u\n", - runStateIndex, nextRunStateIndex); - updateRunState(nextRunStateIndex); + // Early/late tickle for non-maintenance wake. + if (((gDarkWakeFlags & kDarkWakeFlagHIDTickleMask) == + kDarkWakeFlagHIDTickleEarly) || + ((gDarkWakeFlags & kDarkWakeFlagHIDTickleMask) == + kDarkWakeFlagHIDTickleLate)) + { + darkWakePostTickle = true; + } + } +#else /* !__i386__ && !__x86_64__ */ + // stay awake for at least 30 seconds + wranglerTickled = true; + startIdleSleepTimer(30); +#endif + + changePowerStateToPriv(ON_STATE); + } break; + + case ON_STATE: { + bool wasPrevented = childPreventSystemSleep; + + details = PMEventDetails::eventDetails( + kIOPMEventTypeWakeDone, + NULL, + 0, + kIOReturnSuccess); + + recordAndReleasePMEvent( details ); + + if (previousPowerState != ON_STATE) + _debugWakeSeconds = 0; + + // Update childPreventSystemSleep flag using the capability computed + // by IOSevice::rebuildChildClampBits(). 
+ + childPreventSystemSleep = + ((currentCapability() & kIOPMChildClamp2) != 0); - DLOG("kIOMessageSystemHasPoweredOn (%u)\n", - gMessageClientType); - tellClients(kIOMessageSystemHasPoweredOn, clientMessageFilter); + if (wasPrevented && !childPreventSystemSleep) + { + evaluatePolicy( kStimulusDarkWakeEvaluate ); } - - break; -#endif /* ROOT_DOMAIN_RUN_STATES */ + } break; } } - //****************************************************************************** -// wakeFromDoze +// requestPowerDomainState // -// The Display Wrangler calls here when it switches to its highest state. -// If the system is currently dozing, allow it to wake by making sure the -// parent is providing power. +// Extend implementation in IOService. Running on PM work loop thread. +// +// Examine children desires and initiate idle-sleep if all children are idle, +// prevent idle and system sleep flags are not set. //****************************************************************************** -void IOPMrootDomain::wakeFromDoze( void ) +IOReturn IOPMrootDomain::requestPowerDomainState ( + IOPMPowerFlags childDesire, + IOPowerConnection * childConnection, + unsigned long specification ) { - if ( getPowerState() == DOZE_STATE ) + OSIterator *iter; + OSObject *next; + IOPowerConnection *connection; + IOPMPowerFlags mergedChildDesire = 0; + IOPMPowerFlags editedChildDesire; + IOPMPowerFlags thisDesire; + bool sleepASAP = false; + + ASSERT_GATED(); + + // Disregard disk I/O (anything besides the display wrangler) as a + // factor in preventing idle sleep - based on a runtime setting. + + if ((gDarkWakeFlags & kDarkWakeFlagIgnoreDiskIOAlways) && + (kIOPMPreventIdleSleep & childDesire) && + (childConnection != wranglerConnection)) { - tracePoint(kIOPMTracePointSystemWakeDriversPhase); - changePowerStateToPriv(ON_STATE); - patriarch->wakeSystem(); + childDesire &= ~kIOPMPreventIdleSleep; } -} + // Force the child's input power requirement to 0 unless the prevent + // idle-sleep flag is set. 
Nil input power flags maps to our state 0. + // Our power clamp (deviceDesire) clamps the lowest power state at 2. -//****************************************************************************** -// publishFeature -// -// Adds a new feature to the supported features dictionary -//****************************************************************************** + editedChildDesire = 0; + if (childDesire & kIOPMPreventIdleSleep) + editedChildDesire |= (kIOPMPowerOn | kIOPMPreventIdleSleep); + if (childDesire & kIOPMPreventSystemSleep) + editedChildDesire |= (kIOPMPowerOn | kIOPMPreventSystemSleep); -void IOPMrootDomain::publishFeature( const char * feature ) -{ - publishFeature(feature, kRD_AllPowerSources, NULL); -} + iter = getChildIterator(gIOPowerPlane); + if ( iter ) + { + while ( (next = iter->getNextObject()) ) + { + if ( (connection = OSDynamicCast(IOPowerConnection, next)) ) + { + // Ignore child that are in the process of joining. + if (connection->getReadyFlag() == false) + continue; + // OR in the child's input power requirements. + // Is this connection attached to the child that called + // requestPowerDomainState()? -//****************************************************************************** -// publishFeature (with supported power source specified) -// -// Adds a new feature to the supported features dictionary -//****************************************************************************** - -void IOPMrootDomain::publishFeature( - const char *feature, - uint32_t supportedWhere, - uint32_t *uniqueFeatureID) -{ - static uint16_t next_feature_id = 500; - - OSNumber *new_feature_data = NULL; - OSNumber *existing_feature = NULL; - OSArray *existing_feature_arr = NULL; - OSObject *osObj = NULL; - uint32_t feature_value = 0; - - supportedWhere &= kRD_AllPowerSources; // mask off any craziness! - - if(!supportedWhere) { - // Feature isn't supported anywhere! - return; - } - - if(next_feature_id > 5000) { - // Far, far too many features! 
- return; - } - - if(featuresDictLock) IOLockLock(featuresDictLock); + if (connection == childConnection) + { + thisDesire = editedChildDesire; + } + else + { + thisDesire = 0; + if (connection->getPreventIdleSleepFlag()) + thisDesire |= (kIOPMPowerOn | kIOPMPreventIdleSleep); + if (connection->getPreventSystemSleepFlag()) + thisDesire |= (kIOPMPowerOn | kIOPMPreventSystemSleep); + } - OSDictionary *features = - (OSDictionary *) getProperty(kRootDomainSupportedFeatures); - - // Create new features dict if necessary - if ( features && OSDynamicCast(OSDictionary, features)) { - features = OSDictionary::withDictionary(features); - } else { - features = OSDictionary::withCapacity(1); - } - - // Create OSNumber to track new feature - - next_feature_id += 1; - if( uniqueFeatureID ) { - // We don't really mind if the calling kext didn't give us a place - // to stash their unique id. Many kexts don't plan to unload, and thus - // have no need to remove themselves later. - *uniqueFeatureID = next_feature_id; + mergedChildDesire |= thisDesire; + if (thisDesire && (kIOLogPMRootDomain & gIOKitDebug)) + { + IOService * child = + (IOService *) connection->getChildEntry(gIOPowerPlane); + LOG("child %p, noIdle %d, noSleep %d - %s\n", + child, + ((thisDesire & kIOPMPreventIdleSleep) != 0), + ((thisDesire & kIOPMPreventSystemSleep) != 0), + child ? child->getName() : "?"); + } + } + } + iter->release(); } - feature_value = (uint32_t)next_feature_id; - feature_value <<= 16; - feature_value += supportedWhere; - - new_feature_data = OSNumber::withNumber( - (unsigned long long)feature_value, 32); + DLOG("mergedChildDesire 0x%lx, extraSleepDelay %ld\n", + mergedChildDesire, extraSleepDelay); - // Does features object already exist? - if( (osObj = features->getObject(feature)) ) + if ( !mergedChildDesire && !systemBooting ) { - if(( existing_feature = OSDynamicCast(OSNumber, osObj) )) - { - // We need to create an OSArray to hold the now 2 elements. 
- existing_feature_arr = OSArray::withObjects( - (const OSObject **)&existing_feature, 1, 2); - } else if(( existing_feature_arr = OSDynamicCast(OSArray, osObj) )) + if (!wrangler) { - // Add object to existing array - existing_feature_arr = OSArray::withArray( - existing_feature_arr, - existing_feature_arr->getCount() + 1); + changePowerStateToPriv(ON_STATE); + if (idleSeconds) + { + // stay awake for at least idleSeconds + startIdleSleepTimer(idleSeconds); + } } - - if (existing_feature_arr) + else if (!extraSleepDelay && !idleSleepTimerPending && !systemDarkWake) { - existing_feature_arr->setObject(new_feature_data); - features->setObject(feature, existing_feature_arr); - existing_feature_arr->release(); - existing_feature_arr = 0; + sleepASAP = true; } - } else { - // The easy case: no previously existing features listed. We simply - // set the OSNumber at key 'feature' and we're on our way. - features->setObject(feature, new_feature_data); } - - new_feature_data->release(); - setProperty(kRootDomainSupportedFeatures, features); + // Drop our power clamp to SLEEP_STATE when all children became idle, + // and system sleep and display sleep slider values are equal. - features->release(); + adjustPowerState(sleepASAP); - if(featuresDictLock) IOLockUnlock(featuresDictLock); + // If our power clamp has already dropped to SLEEP_STATE, and no child + // is keeping us at ON_STATE, then the following will trigger idle sleep. - // Notify EnergySaver and all those in user space so they might - // re-populate their feature specific UI - if(pmPowerStateQueue) { - pmPowerStateQueue->submitPowerEvent( kPowerEventFeatureChanged ); - } + return super::requestPowerDomainState( + editedChildDesire, childConnection, specification); } - //****************************************************************************** -// removePublishedFeature +// tellChangeDown // -// Removes previously published feature +// Override the superclass implementation to send a different message type. 
//****************************************************************************** -IOReturn IOPMrootDomain::removePublishedFeature( uint32_t removeFeatureID ) +bool IOPMrootDomain::tellChangeDown( unsigned long stateNum ) { - IOReturn ret = kIOReturnError; - uint32_t feature_value = 0; - uint16_t feature_id = 0; - bool madeAChange = false; - - OSSymbol *dictKey = NULL; - OSCollectionIterator *dictIterator = NULL; - OSArray *arrayMember = NULL; - OSNumber *numberMember = NULL; - OSObject *osObj = NULL; - OSNumber *osNum = NULL; - OSArray *arrayMemberCopy; + DLOG("tellChangeDown %u->%u\n", + (uint32_t) getPowerState(), (uint32_t) stateNum); - if(featuresDictLock) IOLockLock(featuresDictLock); - - OSDictionary *features = - (OSDictionary *) getProperty(kRootDomainSupportedFeatures); - - if ( features && OSDynamicCast(OSDictionary, features) ) + if (SLEEP_STATE == stateNum) { - // Any modifications to the dictionary are made to the copy to prevent - // races & crashes with userland clients. Dictionary updated - // automically later. - features = OSDictionary::withDictionary(features); - } else { - features = NULL; - ret = kIOReturnNotFound; - goto exit; - } - - // We iterate 'features' dictionary looking for an entry tagged - // with 'removeFeatureID'. If found, we remove it from our tracking - // structures and notify the OS via a general interest message. 
- - dictIterator = OSCollectionIterator::withCollection(features); - if(!dictIterator) { - goto exit; + if (!ignoreTellChangeDown) + tracePoint( kIOPMTracePointSleepApplications ); + else + tracePoint( kIOPMTracePointSleepPriorityClients ); } - - while( (dictKey = OSDynamicCast(OSSymbol, dictIterator->getNextObject())) ) + + if ((SLEEP_STATE == stateNum) && !ignoreTellChangeDown) { - osObj = features->getObject(dictKey); - - // Each Feature is either tracked by an OSNumber - if( osObj && (numberMember = OSDynamicCast(OSNumber, osObj)) ) - { - feature_value = numberMember->unsigned32BitValue(); - feature_id = (uint16_t)(feature_value >> 16); + userActivityAtSleep = userActivityCount; + hibernateAborted = false; + DLOG("tellChangeDown::userActivityAtSleep %d\n", userActivityAtSleep); - if( feature_id == (uint16_t)removeFeatureID ) - { - // Remove this node - features->removeObject(dictKey); - madeAChange = true; - break; - } - - // Or tracked by an OSArray of OSNumbers - } else if( osObj && (arrayMember = OSDynamicCast(OSArray, osObj)) ) - { - unsigned int arrayCount = arrayMember->getCount(); - - for(unsigned int i=0; i<arrayCount; i++) - { - osNum = OSDynamicCast(OSNumber, arrayMember->getObject(i)); - if(!osNum) { - continue; - } - - feature_value = osNum->unsigned32BitValue(); - feature_id = (uint16_t)(feature_value >> 16); + // Direct callout into OSKext so it can disable kext unloads + // during sleep/wake to prevent deadlocks. + OSKextSystemSleepOrWake( kIOMessageSystemWillSleep ); - if( feature_id == (uint16_t)removeFeatureID ) - { - // Remove this node - if( 1 == arrayCount ) { - // If the array only contains one element, remove - // the whole thing. - features->removeObject(dictKey); - } else { - // Otherwise remove the element from a copy of the array. 
- arrayMemberCopy = OSArray::withArray(arrayMember); - if (arrayMemberCopy) - { - arrayMemberCopy->removeObject(i); - features->setObject(dictKey, arrayMemberCopy); - arrayMemberCopy->release(); - } - } + IOService::updateConsoleUsers(NULL, kIOMessageSystemWillSleep); - madeAChange = true; - break; - } - } - } - } - - dictIterator->release(); - - if( madeAChange ) - { - ret = kIOReturnSuccess; + // Notify platform that sleep has begun + getPlatform()->callPlatformFunction( + sleepMessagePEFunction, false, + (void *)(uintptr_t) kIOMessageSystemWillSleep, + NULL, NULL, NULL); - setProperty(kRootDomainSupportedFeatures, features); - - // Notify EnergySaver and all those in user space so they might - // re-populate their feature specific UI - if(pmPowerStateQueue) { - pmPowerStateQueue->submitPowerEvent( kPowerEventFeatureChanged ); - } - } else { - ret = kIOReturnNotFound; + // Two change downs are sent by IOServicePM. Ignore the 2nd. + // But tellClientsWithResponse() must be called for both. + ignoreTellChangeDown = true; } - -exit: - if(features) features->release(); - if(featuresDictLock) IOLockUnlock(featuresDictLock); - return ret; -} + return super::tellClientsWithResponse( kIOMessageSystemWillSleep ); +} //****************************************************************************** -// announcePowerSourceChange +// askChangeDown // -// Notifies "interested parties" that the battery state has changed +// Override the superclass implementation to send a different message type. +// This must be idle sleep since we don't ask during any other power change. 
//****************************************************************************** -void IOPMrootDomain::announcePowerSourceChange( void ) +bool IOPMrootDomain::askChangeDown( unsigned long stateNum ) { -#ifdef __ppc__ - IORegistryEntry *_batteryRegEntry = (IORegistryEntry *) getProperty("BatteryEntry"); + DLOG("askChangeDown %u->%u\n", + (uint32_t) getPowerState(), (uint32_t) stateNum); - // (if possible) re-publish power source state under IOPMrootDomain; - // only do so if the battery controller publishes an IOResource - // defining battery location. Called from ApplePMU battery driver. + // Don't log for dark wake entry + if (kSystemTransitionSleep == _systemTransitionType) + tracePoint( kIOPMTracePointSleepApplications ); - if(_batteryRegEntry) - { - OSArray *batt_info; - batt_info = (OSArray *) _batteryRegEntry->getProperty(kIOBatteryInfoKey); - if(batt_info) - setProperty(kIOBatteryInfoKey, batt_info); - } -#endif + return super::tellClientsWithResponse( kIOMessageCanSystemSleep ); } - //****************************************************************************** -// setPMSetting (private) +// askChangeDownDone // -// Internal helper to relay PM settings changes from user space to individual -// drivers. Should be called only by IOPMrootDomain::setProperties. +// Called by PM after all apps have responded to kIOMessageCanSystemSleep. +// pmconfigd may create a deny sleep assertion before ack'ing. 
//****************************************************************************** -IOReturn IOPMrootDomain::setPMSetting( - const OSSymbol *type, - OSObject *obj) +void IOPMrootDomain::askChangeDownDone( + IOPMPowerChangeFlags * inOutChangeFlags, bool * cancel ) { - OSArray *arr = NULL; - PMSettingObject *p_obj = NULL; - int count; - int i; + DLOG("askChangeDownDone(0x%x, %u) type %x, cap %x->%x\n", + *inOutChangeFlags, *cancel, + _systemTransitionType, + _currentCapability, _pendingCapability); - if(NULL == type) return kIOReturnBadArgument; + if ((false == *cancel) && (kSystemTransitionSleep == _systemTransitionType)) + { + // Dark->Sleep transition. + // Check if there are any deny sleep assertions. + // Full->Dark transition is never cancelled. - IORecursiveLockLock(settingsCtrlLock); - - fPMSettingsDict->setObject(type, obj); + if (!checkSystemCanSleep(true)) + { + // Cancel dark wake to sleep transition. + // Must re-scan assertions upon entering dark wake. - arr = (OSArray *)settingsCallbacks->getObject(type); - if(NULL == arr) goto exit; - count = arr->getCount(); - for(i=0; i<count; i++) { - p_obj = (PMSettingObject *)OSDynamicCast(PMSettingObject, arr->getObject(i)); - if(p_obj) p_obj->setPMSetting(type, obj); + *cancel = true; + DLOG("cancel dark->sleep\n"); + } } - -exit: - IORecursiveLockUnlock(settingsCtrlLock); - return kIOReturnSuccess; } - //****************************************************************************** -// copyPMSetting (public) +// tellNoChangeDown // -// Allows kexts to safely read setting values, without being subscribed to -// notifications. -//****************************************************************************** - -OSObject * IOPMrootDomain::copyPMSetting( - OSSymbol *whichSetting) -{ - OSObject *obj = NULL; +// Notify registered applications and kernel clients that we are not dropping +// power. +// +// We override the superclass implementation so we can send a different message +// type to the client or application being notified. 
+// +// This must be a vetoed idle sleep, since no other power change can be vetoed. +//****************************************************************************** - if(!whichSetting) return NULL; +void IOPMrootDomain::tellNoChangeDown( unsigned long stateNum ) +{ + DLOG("tellNoChangeDown %u->%u\n", + (uint32_t) getPowerState(), (uint32_t) stateNum); - IORecursiveLockLock(settingsCtrlLock); - obj = fPMSettingsDict->getObject(whichSetting); - if(obj) { - obj->retain(); + if (idleSeconds && !wrangler) + { + // stay awake for at least idleSeconds + startIdleSleepTimer(idleSeconds); } - IORecursiveLockUnlock(settingsCtrlLock); - - return obj; + return tellClients( kIOMessageSystemWillNotSleep ); } - //****************************************************************************** -// registerPMSettingController (public) +// tellChangeUp // -// direct wrapper to registerPMSettingController with uint32_t power source arg +// Notify registered applications and kernel clients that we are raising power. +// +// We override the superclass implementation so we can send a different message +// type to the client or application being notified. //****************************************************************************** -IOReturn IOPMrootDomain::registerPMSettingController( - const OSSymbol * settings[], - IOPMSettingControllerCallback func, - OSObject *target, - uintptr_t refcon, - OSObject **handle) +void IOPMrootDomain::tellChangeUp( unsigned long stateNum ) { - return registerPMSettingController( - settings, - (kIOPMSupportedOnAC | kIOPMSupportedOnBatt | kIOPMSupportedOnUPS), - func, target, refcon, handle); -} + OSData *publishPMStats = NULL; + + DLOG("tellChangeUp %u->%u\n", + (uint32_t) getPowerState(), (uint32_t) stateNum); + + ignoreTellChangeDown = false; + + if ( stateNum == ON_STATE ) + { + // Direct callout into OSKext so it can disable kext unloads + // during sleep/wake to prevent deadlocks. 
+ OSKextSystemSleepOrWake( kIOMessageSystemHasPoweredOn ); + + // Notify platform that sleep was cancelled or resumed. + getPlatform()->callPlatformFunction( + sleepMessagePEFunction, false, + (void *)(uintptr_t) kIOMessageSystemHasPoweredOn, + NULL, NULL, NULL); + + if (getPowerState() == ON_STATE) + { + // this is a quick wake from aborted sleep + if (idleSeconds && !wrangler) + { + // stay awake for at least idleSeconds + startIdleSleepTimer(idleSeconds); + } + tellClients( kIOMessageSystemWillPowerOn ); + } + + tracePoint( kIOPMTracePointWakeApplications ); + publishPMStats = OSData::withBytes(&pmStats, sizeof(pmStats)); + setProperty(kIOPMSleepStatisticsKey, publishPMStats); + publishPMStats->release(); + bzero(&pmStats, sizeof(pmStats)); + + if (pmStatsAppResponses) + { + setProperty(kIOPMSleepStatisticsAppsKey, pmStatsAppResponses); + pmStatsAppResponses->release(); + pmStatsAppResponses = OSArray::withCapacity(5); + } + tellClients( kIOMessageSystemHasPoweredOn ); + } +} //****************************************************************************** -// registerPMSettingController (public) +// sysPowerDownHandler // -// Kexts may register for notifications when a particular setting is changed. -// A list of settings is available in IOPM.h. -// Arguments: -// * settings - An OSArray containing OSSymbols. Caller should populate this -// array with a list of settings caller wants notifications from. -// * func - A C function callback of the type IOPMSettingControllerCallback -// * target - caller may provide an OSObject *, which PM will pass as an -// target to calls to "func" -// * refcon - caller may provide an void *, which PM will pass as an -// argument to calls to "func" -// * handle - This is a return argument. We will populate this pointer upon -// call success. 
Hold onto this and pass this argument to -// IOPMrootDomain::deRegisterPMSettingCallback when unloading your kext -// Returns: -// kIOReturnSuccess on success +// Perform a vfs sync before system sleep. //****************************************************************************** -IOReturn IOPMrootDomain::registerPMSettingController( - const OSSymbol * settings[], - uint32_t supportedPowerSources, - IOPMSettingControllerCallback func, - OSObject *target, - uintptr_t refcon, - OSObject **handle) +IOReturn IOPMrootDomain::sysPowerDownHandler( + void * target, void * refCon, + UInt32 messageType, IOService * service, + void * messageArgs, vm_size_t argSize ) { - PMSettingObject *pmso = NULL; - OSArray *list = NULL; - IOReturn ret = kIOReturnSuccess; - int i; + IOReturn ret; - if( NULL == settings || - NULL == func || - NULL == handle) + DLOG("sysPowerDownHandler message %s\n", getIOMessageString(messageType)); + + if (!gRootDomain) + return kIOReturnUnsupported; + + if (messageType == kIOMessageSystemCapabilityChange) { - return kIOReturnBadArgument; - } + IOPMSystemCapabilityChangeParameters * params = + (IOPMSystemCapabilityChangeParameters *) messageArgs; + + // Interested applications have been notified of an impending power + // change and have acked (when applicable). + // This is our chance to save whatever state we can before powering + // down. 
+ // We call sync_internal defined in xnu/bsd/vfs/vfs_syscalls.c, + // via callout + + DLOG("sysPowerDownHandler cap %x -> %x (flags %x)\n", + params->fromCapabilities, params->toCapabilities, + params->changeFlags); + + if ((params->changeFlags & kIOPMSystemCapabilityWillChange) && + (params->fromCapabilities & kIOPMSystemCapabilityCPU) && + (params->toCapabilities & kIOPMSystemCapabilityCPU) == 0) + { + // We will ack within 20 seconds + params->maxWaitForReply = 20 * 1000 * 1000; +#if HIBERNATION + gRootDomain->evaluateSystemSleepPolicyEarly(); - pmso = PMSettingObject::pmSettingObject( - (IOPMrootDomain *)this, func, target, - refcon, supportedPowerSources, settings); + // add in time we could spend freeing pages + if (gRootDomain->hibernateMode && !gRootDomain->hibernateDisabled) + { + params->maxWaitForReply = kCapabilityClientMaxWait; + } + DLOG("sysPowerDownHandler timeout %d s\n", (int) (params->maxWaitForReply / 1000 / 1000)); +#endif + + if ( !OSCompareAndSwap( 0, 1, &gSleepOrShutdownPending ) ) + { + // Purposely delay the ack and hope that shutdown occurs quickly. + // Another option is not to schedule the thread and wait for + // ack timeout... 
+ AbsoluteTime deadline; + clock_interval_to_deadline( 30, kSecondScale, &deadline ); + thread_call_enter1_delayed( + gRootDomain->diskSyncCalloutEntry, + (thread_call_param_t) params->notifyRef, + deadline ); + } + else + thread_call_enter1( + gRootDomain->diskSyncCalloutEntry, + (thread_call_param_t) params->notifyRef); + } +#if HIBERNATION + else + if ((params->changeFlags & kIOPMSystemCapabilityDidChange) && + (params->toCapabilities & kIOPMSystemCapabilityCPU) && + (params->fromCapabilities & kIOPMSystemCapabilityCPU) == 0) + { + // We will ack within 110 seconds + params->maxWaitForReply = 110 * 1000 * 1000; - if(!pmso) { - ret = kIOReturnInternalError; - goto bail_no_unlock; + thread_call_enter1( + gRootDomain->diskSyncCalloutEntry, + (thread_call_param_t) params->notifyRef); + } +#endif + ret = kIOReturnSuccess; } - IORecursiveLockLock(settingsCtrlLock); - for(i=0; settings[i]; i++) + return ret; +} + +//****************************************************************************** +// handleQueueSleepWakeUUID +// +// Called from IOPMrootDomain when we're initiating a sleep, +// or indirectly from PM configd when PM decides to clear the UUID. +// PM clears the UUID several minutes after successful wake from sleep, +// so that we might associate App spindumps with the immediately previous +// sleep/wake. +// +// @param obj has a retain on it. We're responsible for releasing that retain. 
+//****************************************************************************** + +void IOPMrootDomain::handleQueueSleepWakeUUID(OSObject *obj) +{ + OSString *str = NULL; + + if (kOSBooleanFalse == obj) { - list = (OSArray *)settingsCallbacks->getObject(settings[i]); - if(!list) { - // New array of callbacks for this setting - list = OSArray::withCapacity(1); - settingsCallbacks->setObject(settings[i], list); - list->release(); + handlePublishSleepWakeUUID(NULL); + } + else if ((str = OSDynamicCast(OSString, obj))) + { + // This branch caches the UUID for an upcoming sleep/wake + if (queuedSleepWakeUUIDString) { + queuedSleepWakeUUIDString->release(); + queuedSleepWakeUUIDString = NULL; } + queuedSleepWakeUUIDString = str; + queuedSleepWakeUUIDString->retain(); - // Add caller to the callback list - list->setObject(pmso); + DLOG("SleepWake UUID queued: %s\n", queuedSleepWakeUUIDString->getCStringNoCopy()); } - IORecursiveLockUnlock(settingsCtrlLock); - - ret = kIOReturnSuccess; + if (obj) { + obj->release(); + } + return; + +} +//****************************************************************************** +// handlePublishSleepWakeUUID +// +// Called from IOPMrootDomain when we're initiating a sleep, +// or indirectly from PM configd when PM decides to clear the UUID. +// PM clears the UUID several minutes after successful wake from sleep, +// so that we might associate App spindumps with the immediately previous +// sleep/wake. 
+//****************************************************************************** + +void IOPMrootDomain::handlePublishSleepWakeUUID( bool shouldPublish ) +{ + ASSERT_GATED(); + + /* + * Clear the current UUID + */ + if (gSleepWakeUUIDIsSet) + { + DLOG("SleepWake UUID cleared\n"); + + OSString *UUIDstring = NULL; + + if (timeline && + (UUIDstring = OSDynamicCast(OSString, getProperty(kIOPMSleepWakeUUIDKey)))) + { + PMEventDetails *details = PMEventDetails::eventDetails(kIOPMEventTypeUUIDClear, + UUIDstring->getCStringNoCopy(), NULL, 0); + if (details) { + timeline->recordSystemPowerEvent( details ); + details->release(); + } + timeline->setNumEventsLoggedThisPeriod(0); + } + + gSleepWakeUUIDIsSet = false; + + removeProperty(kIOPMSleepWakeUUIDKey); + messageClients(kIOPMMessageSleepWakeUUIDChange, kIOPMMessageSleepWakeUUIDCleared); + } + + /* + * Optionally, publish a new UUID + */ + if (queuedSleepWakeUUIDString && shouldPublish) { + + OSString *publishThisUUID = NULL; + + publishThisUUID = queuedSleepWakeUUIDString; + publishThisUUID->retain(); + + if (timeline) { + PMEventDetails *details; + details = PMEventDetails::eventDetails(kIOPMEventTypeUUIDSet, + publishThisUUID->getCStringNoCopy(), NULL, 0); + if (details) { + timeline->recordSystemPowerEvent( details ); + details->release(); + } + } + + if (publishThisUUID) + { + setProperty(kIOPMSleepWakeUUIDKey, publishThisUUID); + publishThisUUID->release(); + } + + gSleepWakeUUIDIsSet = true; + messageClients(kIOPMMessageSleepWakeUUIDChange, kIOPMMessageSleepWakeUUIDSet); + + queuedSleepWakeUUIDString->release(); + queuedSleepWakeUUIDString = NULL; + } +} + +//****************************************************************************** +// changePowerStateTo & changePowerStateToPriv +// +// Override of these methods for logging purposes. 
+//****************************************************************************** + +IOReturn IOPMrootDomain::changePowerStateTo( unsigned long ordinal ) +{ + return kIOReturnUnsupported; // ignored +} + +IOReturn IOPMrootDomain::changePowerStateToPriv( unsigned long ordinal ) +{ + DLOG("changePowerStateToPriv(%lu)\n", ordinal); + + if ((ordinal != ON_STATE) && (ordinal != SLEEP_STATE)) + return kIOReturnUnsupported; - // Track this instance by its OSData ptr from now on - *handle = pmso; + return super::changePowerStateToPriv(ordinal); +} + +//****************************************************************************** +// activity detect +// +//****************************************************************************** + +bool IOPMrootDomain::activitySinceSleep(void) +{ + return (userActivityCount != userActivityAtSleep); +} + +bool IOPMrootDomain::abortHibernation(void) +{ + bool ret = activitySinceSleep(); -bail_no_unlock: - if(kIOReturnSuccess != ret) + if (ret && !hibernateAborted) { - // Error return case - if(pmso) pmso->release(); - if(handle) *handle = NULL; + DLOG("activitySinceSleep ABORT [%d, %d]\n", userActivityCount, userActivityAtSleep); + hibernateAborted = true; } - return ret; + return (ret); } +extern "C" int +hibernate_should_abort(void) +{ + if (gRootDomain) + return (gRootDomain->abortHibernation()); + else + return (0); +} //****************************************************************************** // sleepOnClamshellClosed @@ -2544,13 +2723,13 @@ IOReturn IOPMrootDomain::registerPMSettingController( bool IOPMrootDomain::shouldSleepOnClamshellClosed( void ) { - DLOG("clamshell state %d, EX %d, IG %d, IW %d, DT %d, AC %d\n", - clamshellIsClosed, clamshellExists, ignoringClamshell, - ignoringClamshellOnWake, desktopMode, acAdaptorConnected); + if (!clamshellExists) + return false; + + DLOG("clamshell closed %d, disabled %d, desktopMode %d, ac %d\n", + clamshellClosed, clamshellDisabled, desktopMode, acAdaptorConnected); - return ( 
!ignoringClamshell - && !ignoringClamshellOnWake - && !(desktopMode && acAdaptorConnected) ); + return ( !clamshellDisabled && !(desktopMode && acAdaptorConnected) ); } void IOPMrootDomain::sendClientClamshellNotification( void ) @@ -2560,7 +2739,7 @@ void IOPMrootDomain::sendClientClamshellNotification( void ) return; setProperty(kAppleClamshellStateKey, - clamshellIsClosed ? kOSBooleanTrue : kOSBooleanFalse); + clamshellClosed ? kOSBooleanTrue : kOSBooleanFalse); setProperty(kAppleClamshellCausesSleepKey, shouldSleepOnClamshellClosed() ? kOSBooleanTrue : kOSBooleanFalse); @@ -2569,28 +2748,566 @@ void IOPMrootDomain::sendClientClamshellNotification( void ) * ( kClamshellStateBit | kClamshellSleepBit ) */ messageClients(kIOPMMessageClamshellStateChange, - (void *) ( (clamshellIsClosed ? kClamshellStateBit : 0) + (void *) ( (clamshellClosed ? kClamshellStateBit : 0) | ( shouldSleepOnClamshellClosed() ? kClamshellSleepBit : 0)) ); } - //****************************************************************************** -// informCPUStateChange -// -// Call into PM CPU code so that CPU power savings may dynamically adjust for -// running on battery, with the lid closed, etc. 
+// getSleepSupported // -// informCPUStateChange is a no-op on non x86 systems -// only x86 has explicit support in the IntelCPUPowerManagement kext +// Deprecated //****************************************************************************** -void IOPMrootDomain::informCPUStateChange( - uint32_t type, - uint32_t value ) +IOOptionBits IOPMrootDomain::getSleepSupported( void ) { -#if defined(__i386__) || defined(__x86_64__) + return( platformSleepSupport ); +} - pmioctlVariableInfo_t varInfoStruct; +//****************************************************************************** +// setSleepSupported +// +// Deprecated +//****************************************************************************** + +void IOPMrootDomain::setSleepSupported( IOOptionBits flags ) +{ + DLOG("setSleepSupported(%x)\n", (uint32_t) flags); + OSBitOrAtomic(flags, &platformSleepSupport); +} + +//****************************************************************************** +// wakeFromDoze +// +// Deprecated. 
+//****************************************************************************** + +void IOPMrootDomain::wakeFromDoze( void ) +{ + // Preserve symbol for familes (IOUSBFamily and IOGraphics) +} + +// MARK: - +// MARK: Features + +//****************************************************************************** +// publishFeature +// +// Adds a new feature to the supported features dictionary +//****************************************************************************** + +void IOPMrootDomain::publishFeature( const char * feature ) +{ + publishFeature(feature, kRD_AllPowerSources, NULL); +} + +//****************************************************************************** +// publishFeature (with supported power source specified) +// +// Adds a new feature to the supported features dictionary +//****************************************************************************** + +void IOPMrootDomain::publishFeature( + const char *feature, + uint32_t supportedWhere, + uint32_t *uniqueFeatureID) +{ + static uint16_t next_feature_id = 500; + + OSNumber *new_feature_data = NULL; + OSNumber *existing_feature = NULL; + OSArray *existing_feature_arr = NULL; + OSObject *osObj = NULL; + uint32_t feature_value = 0; + + supportedWhere &= kRD_AllPowerSources; // mask off any craziness! + + if(!supportedWhere) { + // Feature isn't supported anywhere! + return; + } + + if(next_feature_id > 5000) { + // Far, far too many features! 
+ return; + } + + if(featuresDictLock) IOLockLock(featuresDictLock); + + OSDictionary *features = + (OSDictionary *) getProperty(kRootDomainSupportedFeatures); + + // Create new features dict if necessary + if ( features && OSDynamicCast(OSDictionary, features)) { + features = OSDictionary::withDictionary(features); + } else { + features = OSDictionary::withCapacity(1); + } + + // Create OSNumber to track new feature + + next_feature_id += 1; + if( uniqueFeatureID ) { + // We don't really mind if the calling kext didn't give us a place + // to stash their unique id. Many kexts don't plan to unload, and thus + // have no need to remove themselves later. + *uniqueFeatureID = next_feature_id; + } + + feature_value = (uint32_t)next_feature_id; + feature_value <<= 16; + feature_value += supportedWhere; + + new_feature_data = OSNumber::withNumber( + (unsigned long long)feature_value, 32); + + // Does features object already exist? + if( (osObj = features->getObject(feature)) ) + { + if(( existing_feature = OSDynamicCast(OSNumber, osObj) )) + { + // We need to create an OSArray to hold the now 2 elements. + existing_feature_arr = OSArray::withObjects( + (const OSObject **)&existing_feature, 1, 2); + } else if(( existing_feature_arr = OSDynamicCast(OSArray, osObj) )) + { + // Add object to existing array + existing_feature_arr = OSArray::withArray( + existing_feature_arr, + existing_feature_arr->getCount() + 1); + } + + if (existing_feature_arr) + { + existing_feature_arr->setObject(new_feature_data); + features->setObject(feature, existing_feature_arr); + existing_feature_arr->release(); + existing_feature_arr = 0; + } + } else { + // The easy case: no previously existing features listed. We simply + // set the OSNumber at key 'feature' and we're on our way. 
+ features->setObject(feature, new_feature_data); + } + + new_feature_data->release(); + + setProperty(kRootDomainSupportedFeatures, features); + + features->release(); + + if(featuresDictLock) IOLockUnlock(featuresDictLock); + + // Notify EnergySaver and all those in user space so they might + // re-populate their feature specific UI + if(pmPowerStateQueue) { + pmPowerStateQueue->submitPowerEvent( kPowerEventFeatureChanged ); + } +} + +//****************************************************************************** +// removePublishedFeature +// +// Removes previously published feature +//****************************************************************************** + +IOReturn IOPMrootDomain::removePublishedFeature( uint32_t removeFeatureID ) +{ + IOReturn ret = kIOReturnError; + uint32_t feature_value = 0; + uint16_t feature_id = 0; + bool madeAChange = false; + + OSSymbol *dictKey = NULL; + OSCollectionIterator *dictIterator = NULL; + OSArray *arrayMember = NULL; + OSNumber *numberMember = NULL; + OSObject *osObj = NULL; + OSNumber *osNum = NULL; + OSArray *arrayMemberCopy; + + if(featuresDictLock) IOLockLock(featuresDictLock); + + OSDictionary *features = + (OSDictionary *) getProperty(kRootDomainSupportedFeatures); + + if ( features && OSDynamicCast(OSDictionary, features) ) + { + // Any modifications to the dictionary are made to the copy to prevent + // races & crashes with userland clients. Dictionary updated + // automically later. + features = OSDictionary::withDictionary(features); + } else { + features = NULL; + ret = kIOReturnNotFound; + goto exit; + } + + // We iterate 'features' dictionary looking for an entry tagged + // with 'removeFeatureID'. If found, we remove it from our tracking + // structures and notify the OS via a general interest message. 
+ + dictIterator = OSCollectionIterator::withCollection(features); + if(!dictIterator) { + goto exit; + } + + while( (dictKey = OSDynamicCast(OSSymbol, dictIterator->getNextObject())) ) + { + osObj = features->getObject(dictKey); + + // Each Feature is either tracked by an OSNumber + if( osObj && (numberMember = OSDynamicCast(OSNumber, osObj)) ) + { + feature_value = numberMember->unsigned32BitValue(); + feature_id = (uint16_t)(feature_value >> 16); + + if( feature_id == (uint16_t)removeFeatureID ) + { + // Remove this node + features->removeObject(dictKey); + madeAChange = true; + break; + } + + // Or tracked by an OSArray of OSNumbers + } else if( osObj && (arrayMember = OSDynamicCast(OSArray, osObj)) ) + { + unsigned int arrayCount = arrayMember->getCount(); + + for(unsigned int i=0; igetObject(i)); + if(!osNum) { + continue; + } + + feature_value = osNum->unsigned32BitValue(); + feature_id = (uint16_t)(feature_value >> 16); + + if( feature_id == (uint16_t)removeFeatureID ) + { + // Remove this node + if( 1 == arrayCount ) { + // If the array only contains one element, remove + // the whole thing. + features->removeObject(dictKey); + } else { + // Otherwise remove the element from a copy of the array. 
+ arrayMemberCopy = OSArray::withArray(arrayMember); + if (arrayMemberCopy) + { + arrayMemberCopy->removeObject(i); + features->setObject(dictKey, arrayMemberCopy); + arrayMemberCopy->release(); + } + } + + madeAChange = true; + break; + } + } + } + } + + dictIterator->release(); + + if( madeAChange ) + { + ret = kIOReturnSuccess; + + setProperty(kRootDomainSupportedFeatures, features); + + // Notify EnergySaver and all those in user space so they might + // re-populate their feature specific UI + if(pmPowerStateQueue) { + pmPowerStateQueue->submitPowerEvent( kPowerEventFeatureChanged ); + } + } else { + ret = kIOReturnNotFound; + } + +exit: + if(features) features->release(); + if(featuresDictLock) IOLockUnlock(featuresDictLock); + return ret; +} + +//****************************************************************************** +// setPMSetting (private) +// +// Internal helper to relay PM settings changes from user space to individual +// drivers. Should be called only by IOPMrootDomain::setProperties. +//****************************************************************************** + +IOReturn IOPMrootDomain::setPMSetting( + const OSSymbol *type, + OSObject *object ) +{ + PMSettingCallEntry *entries = 0; + OSArray *chosen = 0; + const OSArray *array; + PMSettingObject *pmso; + thread_t thisThread; + int i, j, count, capacity; + + if (NULL == type) + return kIOReturnBadArgument; + + PMSETTING_LOCK(); + + // Update settings dict so changes are visible from copyPMSetting(). + fPMSettingsDict->setObject(type, object); + + // Prep all PMSetting objects with the given 'type' for callout. + array = (const OSArray *) settingsCallbacks->getObject(type); + if (!array || ((capacity = array->getCount()) == 0)) + goto unlock_exit; + + // Array to retain PMSetting objects targeted for callout. 
+ chosen = OSArray::withCapacity(capacity); + if (!chosen) + goto unlock_exit; // error + + entries = IONew(PMSettingCallEntry, capacity); + if (!entries) + goto unlock_exit; // error + memset(entries, 0, sizeof(PMSettingCallEntry) * capacity); + + thisThread = current_thread(); + + for (i = 0, j = 0; igetObject(i); + if (pmso->disabled) + continue; + entries[j].thread = thisThread; + queue_enter(&pmso->calloutQueue, &entries[j], PMSettingCallEntry *, link); + chosen->setObject(pmso); + j++; + } + count = j; + if (!count) + goto unlock_exit; + + PMSETTING_UNLOCK(); + + // Call each pmso in the chosen array. + for (i=0; igetObject(i); + pmso->dispatchPMSetting(type, object); + } + + PMSETTING_LOCK(); + for (i=0; igetObject(i); + queue_remove(&pmso->calloutQueue, &entries[i], PMSettingCallEntry *, link); + if (pmso->waitThread) + { + PMSETTING_WAKEUP(pmso); + } + } +unlock_exit: + PMSETTING_UNLOCK(); + + if (chosen) chosen->release(); + if (entries) IODelete(entries, PMSettingCallEntry, capacity); + + return kIOReturnSuccess; +} + +//****************************************************************************** +// copyPMSetting (public) +// +// Allows kexts to safely read setting values, without being subscribed to +// notifications. 
+//****************************************************************************** + +OSObject * IOPMrootDomain::copyPMSetting( + OSSymbol *whichSetting) +{ + OSObject *obj = NULL; + + if(!whichSetting) return NULL; + + PMSETTING_LOCK(); + obj = fPMSettingsDict->getObject(whichSetting); + if(obj) { + obj->retain(); + } + PMSETTING_UNLOCK(); + + return obj; +} + +//****************************************************************************** +// registerPMSettingController (public) +// +// direct wrapper to registerPMSettingController with uint32_t power source arg +//****************************************************************************** + +IOReturn IOPMrootDomain::registerPMSettingController( + const OSSymbol * settings[], + IOPMSettingControllerCallback func, + OSObject *target, + uintptr_t refcon, + OSObject **handle) +{ + return registerPMSettingController( + settings, + (kIOPMSupportedOnAC | kIOPMSupportedOnBatt | kIOPMSupportedOnUPS), + func, target, refcon, handle); +} + +//****************************************************************************** +// registerPMSettingController (public) +// +// Kexts may register for notifications when a particular setting is changed. +// A list of settings is available in IOPM.h. +// Arguments: +// * settings - An OSArray containing OSSymbols. Caller should populate this +// array with a list of settings caller wants notifications from. +// * func - A C function callback of the type IOPMSettingControllerCallback +// * target - caller may provide an OSObject *, which PM will pass as an +// target to calls to "func" +// * refcon - caller may provide an void *, which PM will pass as an +// argument to calls to "func" +// * handle - This is a return argument. We will populate this pointer upon +// call success. 
Hold onto this and pass this argument to +// IOPMrootDomain::deRegisterPMSettingCallback when unloading your kext +// Returns: +// kIOReturnSuccess on success +//****************************************************************************** + +IOReturn IOPMrootDomain::registerPMSettingController( + const OSSymbol * settings[], + uint32_t supportedPowerSources, + IOPMSettingControllerCallback func, + OSObject *target, + uintptr_t refcon, + OSObject **handle) +{ + PMSettingObject *pmso = NULL; + OSObject *pmsh = NULL; + OSArray *list = NULL; + int i; + + if (NULL == settings || + NULL == func || + NULL == handle) + { + return kIOReturnBadArgument; + } + + pmso = PMSettingObject::pmSettingObject( + (IOPMrootDomain *) this, func, target, + refcon, supportedPowerSources, settings, &pmsh); + + if (!pmso) { + *handle = NULL; + return kIOReturnInternalError; + } + + PMSETTING_LOCK(); + for (i=0; settings[i]; i++) + { + list = (OSArray *) settingsCallbacks->getObject(settings[i]); + if (!list) { + // New array of callbacks for this setting + list = OSArray::withCapacity(1); + settingsCallbacks->setObject(settings[i], list); + list->release(); + } + + // Add caller to the callback list + list->setObject(pmso); + } + PMSETTING_UNLOCK(); + + // Return handle to the caller, the setting object is private. + *handle = pmsh; + + return kIOReturnSuccess; +} + +//****************************************************************************** +// deregisterPMSettingObject (private) +// +// Only called from PMSettingObject. +//****************************************************************************** + +void IOPMrootDomain::deregisterPMSettingObject( PMSettingObject * pmso ) +{ + thread_t thisThread = current_thread(); + PMSettingCallEntry *callEntry; + OSCollectionIterator *iter; + OSSymbol *sym; + OSArray *array; + int index; + bool wait; + + PMSETTING_LOCK(); + + pmso->disabled = true; + + // Wait for all callout threads to finish. 
+ do { + wait = false; + queue_iterate(&pmso->calloutQueue, callEntry, PMSettingCallEntry *, link) + { + if (callEntry->thread != thisThread) + { + wait = true; + break; + } + } + if (wait) + { + assert(0 == pmso->waitThread); + pmso->waitThread = thisThread; + PMSETTING_WAIT(pmso); + pmso->waitThread = 0; + } + } while (wait); + + // Search each PM settings array in the kernel. + iter = OSCollectionIterator::withCollection(settingsCallbacks); + if (iter) + { + while ((sym = OSDynamicCast(OSSymbol, iter->getNextObject()))) + { + array = (OSArray *) settingsCallbacks->getObject(sym); + index = array->getNextIndexOfObject(pmso, 0); + if (-1 != index) { + array->removeObject(index); + } + } + iter->release(); + } + + PMSETTING_UNLOCK(); + + pmso->release(); +} + +//****************************************************************************** +// informCPUStateChange +// +// Call into PM CPU code so that CPU power savings may dynamically adjust for +// running on battery, with the lid closed, etc. 
+// +// informCPUStateChange is a no-op on non x86 systems +// only x86 has explicit support in the IntelCPUPowerManagement kext +//****************************************************************************** + +void IOPMrootDomain::informCPUStateChange( + uint32_t type, + uint32_t value ) +{ +#if defined(__i386__) || defined(__x86_64__) + + pmioctlVariableInfo_t varInfoStruct; int pmCPUret = 0; const char *varNameStr = NULL; int32_t *varIndex = NULL; @@ -2639,6 +3356,8 @@ void IOPMrootDomain::informCPUStateChange( #endif /* __i386__ || __x86_64__ */ } +// MARK: - +// MARK: Deep Sleep Policy #if HIBERNATION @@ -2673,7 +3392,7 @@ enum { kIOPMSleepFactorUSBExternalDevice = 0x00000080, kIOPMSleepFactorBluetoothHIDDevice = 0x00000100, kIOPMSleepFactorExternalMediaMounted = 0x00000200, - kIOPMSleepFactorDriverAssertBit5 = 0x00000400, + kIOPMSleepFactorDriverAssertBit5 = 0x00000400, /* Reserved for ThunderBolt */ kIOPMSleepFactorDriverAssertBit6 = 0x00000800, kIOPMSleepFactorDriverAssertBit7 = 0x00001000 }; @@ -2730,18 +3449,15 @@ bool IOPMrootDomain::evaluateSystemSleepPolicy( IOPMSystemSleepParameters * p ) if (getPMAssertionLevel(kIOPMDriverAssertionExternalMediaMountedBit) != kIOPMDriverAssertionLevelOff) currentFactors |= kIOPMSleepFactorExternalMediaMounted; - if (getPMAssertionLevel(kIOPMDriverAssertionReservedBit5) != + if (getPMAssertionLevel(kIOPMDriverAssertionReservedBit5) != /* AssertionBit5 = Thunderbolt */ kIOPMDriverAssertionLevelOff) currentFactors |= kIOPMSleepFactorDriverAssertBit5; - if (getPMAssertionLevel(kIOPMDriverAssertionReservedBit6) != - kIOPMDriverAssertionLevelOff) - currentFactors |= kIOPMSleepFactorDriverAssertBit6; if (getPMAssertionLevel(kIOPMDriverAssertionReservedBit7) != kIOPMDriverAssertionLevelOff) currentFactors |= kIOPMSleepFactorDriverAssertBit7; if (0 == deepSleepDelay) currentFactors |= kIOPMSleepFactorDeepSleepNoDelay; - if (!clamshellIsClosed) + if (!clamshellClosed) currentFactors |= kIOPMSleepFactorLidOpen; if 
(acAdaptorConnected) currentFactors |= kIOPMSleepFactorACPower; @@ -2897,1768 +3613,2139 @@ bool IOPMrootDomain::getSleepOption( const char * key, uint32_t * option ) ok = true; } - if (obj) - obj->release(); - if (optionsProp) - optionsProp->release(); + if (obj) + obj->release(); + if (optionsProp) + optionsProp->release(); + + return true; +} +#endif /* HIBERNATION */ + +// MARK: - +// MARK: Shutdown and Restart + +//****************************************************************************** +// handlePlatformHaltRestart +// +//****************************************************************************** + +struct HaltRestartApplierContext { + IOPMrootDomain * RootDomain; + unsigned long PowerState; + IOPMPowerFlags PowerFlags; + UInt32 MessageType; + UInt32 Counter; +}; + +static void +platformHaltRestartApplier( OSObject * object, void * context ) +{ + IOPowerStateChangeNotification notify; + HaltRestartApplierContext * ctx; + AbsoluteTime startTime; + UInt32 deltaTime; + + ctx = (HaltRestartApplierContext *) context; + + memset(¬ify, 0, sizeof(notify)); + notify.powerRef = (void *)ctx->Counter; + notify.returnValue = 0; + notify.stateNumber = ctx->PowerState; + notify.stateFlags = ctx->PowerFlags; + + clock_get_uptime(&startTime); + ctx->RootDomain->messageClient( ctx->MessageType, object, (void *)¬ify ); + deltaTime = computeDeltaTimeMS(&startTime); + + if ((deltaTime > kPMHaltTimeoutMS) || + (gIOKitDebug & kIOLogPMRootDomain)) + { + _IOServiceInterestNotifier * notifier; + notifier = OSDynamicCast(_IOServiceInterestNotifier, object); + + // IOService children of IOPMrootDomain are not instrumented. + // Only IORootParent currently falls under that group. + + if (notifier) + { + LOG("%s handler %p took %u ms\n", + (ctx->MessageType == kIOMessageSystemWillPowerOff) ? "PowerOff" : + (ctx->MessageType == kIOMessageSystemPagingOff) ? 
"PagingOff" : "Restart", + notifier->handler, (uint32_t) deltaTime ); + } + } + + ctx->Counter++; +} + +void IOPMrootDomain::handlePlatformHaltRestart( UInt32 pe_type ) +{ + HaltRestartApplierContext ctx; + AbsoluteTime startTime; + UInt32 deltaTime; + + memset(&ctx, 0, sizeof(ctx)); + ctx.RootDomain = this; + + clock_get_uptime(&startTime); + switch (pe_type) + { + case kPEHaltCPU: + case kPEUPSDelayHaltCPU: + ctx.PowerState = OFF_STATE; + ctx.MessageType = kIOMessageSystemWillPowerOff; + break; + + case kPERestartCPU: + ctx.PowerState = RESTART_STATE; + ctx.MessageType = kIOMessageSystemWillRestart; + break; + + case kPEPagingOff: + ctx.PowerState = ON_STATE; + ctx.MessageType = kIOMessageSystemPagingOff; + break; + + default: + return; + } + + // Notify legacy clients + applyToInterested(gIOPriorityPowerStateInterest, platformHaltRestartApplier, &ctx); + + // For normal shutdown, turn off File Server Mode. + if (kPEHaltCPU == pe_type) + { + const OSSymbol * setting = OSSymbol::withCString(kIOPMSettingRestartOnPowerLossKey); + OSNumber * num = OSNumber::withNumber((unsigned long long) 0, 32); + if (setting && num) + { + setPMSetting(setting, num); + setting->release(); + num->release(); + } + } + + if (kPEPagingOff != pe_type) + { + // Notify in power tree order + notifySystemShutdown(this, ctx.MessageType); + } - return true; + deltaTime = computeDeltaTimeMS(&startTime); + LOG("%s all drivers took %u ms\n", + (ctx.MessageType == kIOMessageSystemWillPowerOff) ? "PowerOff" : + (ctx.MessageType == kIOMessageSystemPagingOff) ? "PagingOff" : "Restart", + (uint32_t) deltaTime ); } -#endif /* HIBERNATION */ - //****************************************************************************** -// dispatchPowerEvent +// shutdownSystem // -// IOPMPowerStateQueue callback function. Running on PM work loop thread. 
//****************************************************************************** -void IOPMrootDomain::dispatchPowerEvent( - uint32_t event, void * arg0, uint64_t arg1 ) +IOReturn IOPMrootDomain::shutdownSystem( void ) { - DLOG("power event %u args %p 0x%llx\n", event, arg0, arg1); - ASSERT_GATED(); - - switch (event) - { - case kPowerEventFeatureChanged: - messageClients(kIOPMMessageFeatureChange, this); - break; - - case kPowerEventReceivedPowerNotification: - handlePowerNotification( (UInt32)(uintptr_t) arg0 ); - break; - - case kPowerEventSystemBootCompleted: - if (systemBooting) - { - systemBooting = false; - adjustPowerState(); - - // If lid is closed, re-send lid closed notification - // now that booting is complete. - if( clamshellIsClosed ) - { - handlePowerNotification(kLocalEvalClamshellCommand); - } - } - break; - - case kPowerEventSystemShutdown: - if (kOSBooleanTrue == (OSBoolean *) arg0) - { - /* We set systemShutdown = true during shutdown - to prevent sleep at unexpected times while loginwindow is trying - to shutdown apps and while the OS is trying to transition to - complete power of. - - Set to true during shutdown, as soon as loginwindow shows - the "shutdown countdown dialog", through individual app - termination, and through black screen kernel shutdown. - */ - LOG("systemShutdown true\n"); - systemShutdown = true; - } else { - /* - A shutdown was initiated, but then the shutdown - was cancelled, clearing systemShutdown to false here. 
- */ - LOG("systemShutdown false\n"); - systemShutdown = false; - } - break; - - case kPowerEventUserDisabledSleep: - userDisabledAllSleep = (kOSBooleanTrue == (OSBoolean *) arg0); - break; - -#if ROOT_DOMAIN_RUN_STATES - case kPowerEventConfigdRegisteredInterest: - if (gConfigdNotifier) - { - gConfigdNotifier->release(); - gConfigdNotifier = 0; - } - if (arg0) - { - gConfigdNotifier = (IONotifier *) arg0; - } - break; -#endif - - case kPowerEventAggressivenessChanged: - aggressivenessChanged(); - break; - - case kPowerEventAssertionCreate: - if (pmAssertions) { - pmAssertions->handleCreateAssertion((OSData *)arg0); - } - break; + return kIOReturnUnsupported; +} - case kPowerEventAssertionRelease: - if (pmAssertions) { - pmAssertions->handleReleaseAssertion(arg1); - } - break; +//****************************************************************************** +// restartSystem +// +//****************************************************************************** - case kPowerEventAssertionSetLevel: - if (pmAssertions) { - pmAssertions->handleSetAssertionLevel(arg1, (IOPMDriverAssertionLevel)(uintptr_t)arg0); - } - break; - } +IOReturn IOPMrootDomain::restartSystem( void ) +{ + return kIOReturnUnsupported; } +// MARK: - +// MARK: System Capability //****************************************************************************** -// systemPowerEventOccurred -// -// The power controller is notifying us of a hardware-related power management -// event that we must handle. +// tagPowerPlaneService // -// systemPowerEventOccurred covers the same functionality that -// receivePowerNotification does; it simply provides a richer API for conveying -// more information. +// Running on PM work loop thread. 
//****************************************************************************** -IOReturn IOPMrootDomain::systemPowerEventOccurred( - const OSSymbol *event, - uint32_t intValue) +void IOPMrootDomain::tagPowerPlaneService( + IOService * service, + IOPMActions * actions ) { - IOReturn attempt = kIOReturnSuccess; - OSNumber *newNumber = NULL; + uint32_t flags = 0; + bool isDisplayWrangler; - if (!event) - return kIOReturnBadArgument; - - newNumber = OSNumber::withNumber(intValue, 8*sizeof(intValue)); - if (!newNumber) - return kIOReturnInternalError; + memset(actions, 0, sizeof(*actions)); + actions->target = this; - attempt = systemPowerEventOccurred(event, (OSObject *)newNumber); + if (service == this) + { + actions->actionPowerChangeStart = + OSMemberFunctionCast( + IOPMActionPowerChangeStart, this, + &IOPMrootDomain::handleOurPowerChangeStart); - newNumber->release(); + actions->actionPowerChangeDone = + OSMemberFunctionCast( + IOPMActionPowerChangeDone, this, + &IOPMrootDomain::handleOurPowerChangeDone); - return attempt; -} + actions->actionPowerChangeOverride = + OSMemberFunctionCast( + IOPMActionPowerChangeOverride, this, + &IOPMrootDomain::overrideOurPowerChange); + return; + } -IOReturn IOPMrootDomain::systemPowerEventOccurred( - const OSSymbol *event, - OSObject *value) -{ - OSDictionary *thermalsDict = NULL; - bool shouldUpdate = true; - - if (!event || !value) - return kIOReturnBadArgument; +#if !NO_KERNEL_HID + isDisplayWrangler = (0 != service->metaCast("IODisplayWrangler")); + if (isDisplayWrangler) + { + wrangler = service; + wranglerConnection = (IOService *) service->getParentEntry(gIOPowerPlane); + } +#else + isDisplayWrangler = false; +#endif - // LOCK - // We reuse featuresDict Lock because it already exists and guards - // the very infrequently used publish/remove feature mechanism; so there's zero rsk - // of stepping on that lock. 
- if (featuresDictLock) IOLockLock(featuresDictLock); +#if defined(__i386__) || defined(__x86_64__) + if (isDisplayWrangler) + flags |= kPMActionsFlagIsDisplayWrangler; + if (service->getProperty("IOPMStrictTreeOrder")) + flags |= kPMActionsFlagIsGraphicsDevice; + if (service->getProperty("IOPMUnattendedWakePowerState")) + flags |= kPMActionsFlagIsAudioDevice; +#endif - thermalsDict = (OSDictionary *)getProperty(kIOPMRootDomainPowerStatusKey); - - if (thermalsDict && OSDynamicCast(OSDictionary, thermalsDict)) { - thermalsDict = OSDictionary::withDictionary(thermalsDict); - } else { - thermalsDict = OSDictionary::withCapacity(1); - } + // Find the power connection object that is a child of the PCI host + // bridge, and has a graphics/audio device attached below. Mark the + // power branch for delayed child notifications. - if (!thermalsDict) { - shouldUpdate = false; - goto exit; + if (flags) + { + IORegistryEntry * child = service; + IORegistryEntry * parent = child->getParentEntry(gIOPowerPlane); + + while (child != this) + { + if ((parent == pciHostBridgeDriver) || + (parent == this)) + { + if (OSDynamicCast(IOPowerConnection, child)) + { + IOPowerConnection * conn = (IOPowerConnection *) child; + conn->delayChildNotification = true; + } + break; + } + child = parent; + parent = child->getParentEntry(gIOPowerPlane); + } } - thermalsDict->setObject (event, value); + if (flags) + { + DLOG("%s tag flags %x\n", service->getName(), flags); + actions->parameter |= flags; + actions->actionPowerChangeOverride = + OSMemberFunctionCast( + IOPMActionPowerChangeOverride, this, + &IOPMrootDomain::overridePowerChangeForUIService); - setProperty (kIOPMRootDomainPowerStatusKey, thermalsDict); + if (flags & kPMActionsFlagIsDisplayWrangler) + { + actions->actionActivityTickle = + OSMemberFunctionCast( + IOPMActionActivityTickle, this, + &IOPMrootDomain::handleActivityTickleForDisplayWrangler); + } + return; + } - thermalsDict->release(); + // Locate the first PCI host bridge for 
PMTrace. + if (!pciHostBridgeDevice && service->metaCast("IOPCIBridge")) + { + IOService * provider = service->getProvider(); + if (OSDynamicCast(IOPlatformDevice, provider) && + provider->inPlane(gIODTPlane)) + { + pciHostBridgeDevice = provider; + pciHostBridgeDriver = service; + DLOG("PMTrace found PCI host bridge %s->%s\n", + provider->getName(), service->getName()); + } + } -exit: - // UNLOCK - if (featuresDictLock) IOLockUnlock(featuresDictLock); + // Tag top-level PCI devices. The order of PMinit() call does not + // change across boots and is used as the PCI bit number. + if (pciHostBridgeDevice && service->metaCast("IOPCIDevice")) + { + // Would prefer to check built-in property, but tagPowerPlaneService() + // is called before pciDevice->registerService(). + IORegistryEntry * parent = service->getParentEntry(gIODTPlane); + if ((parent == pciHostBridgeDevice) && service->getProperty("acpi-device")) + { + int bit = pmTracer->recordTopLevelPCIDevice( service ); + if (bit >= 0) + { + // Save the assigned bit for fast lookup. + actions->parameter |= (bit & kPMActionsPCIBitNumberMask); - if (shouldUpdate) - messageClients (kIOPMMessageSystemPowerEventOccurred, (void *)NULL); + actions->actionPowerChangeStart = + OSMemberFunctionCast( + IOPMActionPowerChangeStart, this, + &IOPMrootDomain::handlePowerChangeStartForPCIDevice); - return kIOReturnSuccess; + actions->actionPowerChangeDone = + OSMemberFunctionCast( + IOPMActionPowerChangeDone, this, + &IOPMrootDomain::handlePowerChangeDoneForPCIDevice); + } + } + } } - //****************************************************************************** -// receivePowerNotification -// -// The power controller is notifying us of a hardware-related power management -// event that we must handle. This may be a result of an 'environment' interrupt -// from the power mgt micro. 
+// PM actions for root domain //****************************************************************************** -IOReturn IOPMrootDomain::receivePowerNotification( UInt32 msg ) +void IOPMrootDomain::overrideOurPowerChange( + IOService * service, + IOPMActions * actions, + unsigned long * inOutPowerState, + uint32_t * inOutChangeFlags ) { - pmPowerStateQueue->submitPowerEvent( - kPowerEventReceivedPowerNotification, (void *) msg ); - return kIOReturnSuccess; + uint32_t powerState = (uint32_t) *inOutPowerState; + uint32_t changeFlags = *inOutChangeFlags; + uint32_t currentPowerState = (uint32_t) getPowerState(); + + if ((currentPowerState == powerState) || + (changeFlags & kIOPMParentInitiated)) + { + // FIXME: cancel any parent change (unexpected) + // Root parent is permanently pegged at max power, + // kIOPMParentInitiated is unexpected. + return; + } + + if (powerState < currentPowerState) + { + if ((changeFlags & kIOPMSkipAskPowerDown) == 0) + { + /* Convenient place to run any code at idle sleep time + * IOPMrootDomain initiates an idle sleep here + * + * Set last sleep cause accordingly. + */ + pmPowerStateQueue->submitPowerEvent(kPowerEventPublishSleepWakeUUID, (void *)true); + + lastSleepReason = kIOPMSleepReasonIdle; + setProperty(kRootDomainSleepReasonKey, kIOPMIdleSleepKey); + } + if (CAP_CURRENT(kIOPMSystemCapabilityGraphics)) + { + // Root domain is dropping power state ON->SLEEP. + // If system is in full wake, first drop to dark wake. + + darkWakeToSleepASAP = true; + + // Drop graphics capability. + // No transition if system is already in dark wake. + + _desiredCapability &= ~( + kIOPMSystemCapabilityGraphics | + kIOPMSystemCapabilityAudio ); + + *inOutPowerState = ON_STATE; + *inOutChangeFlags |= kIOPMSynchronize; + + // Revert device desire from SLEEP->ON. 
+ changePowerStateToPriv(ON_STATE); + } + } } -void IOPMrootDomain::handlePowerNotification( UInt32 msg ) +void IOPMrootDomain::handleOurPowerChangeStart( + IOService * service, + IOPMActions * actions, + uint32_t powerState, + uint32_t * inOutChangeFlags ) { - bool eval_clamshell = false; + uint32_t changeFlags = *inOutChangeFlags; + uint32_t currentPowerState = (uint32_t) getPowerState(); - ASSERT_GATED(); + _systemTransitionType = kSystemTransitionNone; + _systemMessageClientMask = 0; + capabilityLoss = false; - /* - * Local (IOPMrootDomain only) eval clamshell command - */ - if (msg & kLocalEvalClamshellCommand) + // 1. Explicit capability change. + + if (changeFlags & kIOPMSynchronize) { - eval_clamshell = true; + if (powerState == ON_STATE) + { + if (changeFlags & kIOPMSyncNoChildNotify) + _systemTransitionType = kSystemTransitionNewCapClient; + else + _systemTransitionType = kSystemTransitionCapability; + } } - /* - * Overtemp - */ - if (msg & kIOPMOverTemp) + // 2. Going to sleep (cancellation still possible). + + else if (powerState < currentPowerState) + _systemTransitionType = kSystemTransitionSleep; + + // 3. Woke from (idle or demand) sleep. + + else if (!systemBooting && + (changeFlags & kIOPMSelfInitiated) && + (powerState > currentPowerState)) { - LOG("PowerManagement emergency overtemp signal. Going to sleep!"); - privateSleepSystem (kIOPMSleepReasonThermalEmergency); + _systemTransitionType = kSystemTransitionWake; + _desiredCapability = kIOPMSystemCapabilityCPU | + kIOPMSystemCapabilityNetwork; + + // Check for early HID events (e.g. LID open) + if (wranglerTickled) + { + _desiredCapability |= ( + kIOPMSystemCapabilityGraphics | + kIOPMSystemCapabilityAudio ); + } } -#ifdef __ppc__ - /* - * PMU Processor Speed Change - */ - if (msg & kIOPMProcessorSpeedChange) + // Update pending wake capability at the beginning of every + // state transition (including synchronize). This will become + // the current capability at the end of the transition. 
+ + if (kSystemTransitionSleep == _systemTransitionType) { - IOService *pmu = waitForService(serviceMatching("ApplePMU")); - pmu->callPlatformFunction("prepareForSleep", false, 0, 0, 0, 0); - getPlatform()->sleepKernel(); - pmu->callPlatformFunction("recoverFromSleep", false, 0, 0, 0, 0); + _pendingCapability = 0; + capabilityLoss = true; } -#endif - - /* - * Sleep Now! - */ - if (msg & kIOPMSleepNow) + else if (kSystemTransitionNewCapClient != _systemTransitionType) { - privateSleepSystem (kIOPMSleepReasonSoftware); + _pendingCapability = _desiredCapability | + kIOPMSystemCapabilityCPU | + kIOPMSystemCapabilityNetwork; + + if (_pendingCapability & kIOPMSystemCapabilityGraphics) + _pendingCapability |= kIOPMSystemCapabilityAudio; + + if ((kSystemTransitionCapability == _systemTransitionType) && + (_pendingCapability == _currentCapability)) + { + // Cancel the PM state change. + _systemTransitionType = kSystemTransitionNone; + *inOutChangeFlags |= kIOPMNotDone; + } + if (__builtin_popcount(_pendingCapability) < + __builtin_popcount(_currentCapability)) + capabilityLoss = true; + if (CAP_LOSS(kIOPMSystemCapabilityGraphics)) + rejectWranglerTickle = true; } - - /* - * Power Emergency - */ - if (msg & kIOPMPowerEmergency) + + // 1. Capability change. + + if (kSystemTransitionCapability == _systemTransitionType) { - lowBatteryCondition = true; - privateSleepSystem (kIOPMSleepReasonLowPower); + // Dark to Full transition. + if (CAP_GAIN(kIOPMSystemCapabilityGraphics)) + { + tracePoint( kIOPMTracePointDarkWakeExit ); + wranglerSleepIgnored = false; + sleepTimerMaintenance = false; + hibernateNoDefeat = false; + _systemMessageClientMask = kSystemMessageClientUser; + if ((_highestCapability & kIOPMSystemCapabilityGraphics) == 0) + _systemMessageClientMask |= kSystemMessageClientKernel; + + tellClients(kIOMessageSystemWillPowerOn); + } + + // Full to Dark transition. 
+ if (CAP_LOSS(kIOPMSystemCapabilityGraphics)) + { + tracePoint( kIOPMTracePointDarkWakeEntry ); + *inOutChangeFlags |= kIOPMSyncTellPowerDown; + _systemMessageClientMask = kSystemMessageClientUser; + } } - /* - * Clamshell OPEN - */ - if (msg & kIOPMClamshellOpened) + // 2. System sleep. + + else if (kSystemTransitionSleep == _systemTransitionType) { - // Received clamshel open message from clamshell controlling driver - // Update our internal state and tell general interest clients - clamshellIsClosed = false; - clamshellExists = true; + // Beginning of a system sleep transition. + // Cancellation is still possible. + tracePoint( kIOPMTracePointSleepStarted, lastSleepReason ); - if (msg & kIOPMSetValue) + _systemMessageClientMask = kSystemMessageClientAll; + if ((_currentCapability & kIOPMSystemCapabilityGraphics) == 0) + _systemMessageClientMask &= ~kSystemMessageClientApp; + if ((_highestCapability & kIOPMSystemCapabilityGraphics) == 0) + _systemMessageClientMask &= ~kSystemMessageClientKernel; + + // Optimization to ignore wrangler power down thus skipping + // the disk spindown and arming the idle timer for demand sleep. + + if (changeFlags & kIOPMIgnoreChildren) { - reportUserInput(); + wranglerSleepIgnored = true; } - // Tell PMCPU - informCPUStateChange(kInformLid, 0); + logWranglerTickle = false; + } - // Tell general interest clients - sendClientClamshellNotification(); + // 3. System wake. 
- bool aborting = ((lastSleepReason == kIOPMSleepReasonClamshell) - || (lastSleepReason == kIOPMSleepReasonIdle) - || (lastSleepReason == kIOPMSleepReasonMaintenance)); - if (aborting) userActivityCount++; - DLOG("clamshell tickled %d lastSleepReason %d\n", userActivityCount, lastSleepReason); + else if (kSystemTransitionWake == _systemTransitionType) + { + wranglerSleepIgnored = false; + + if (_pendingCapability & kIOPMSystemCapabilityGraphics) + { + _systemMessageClientMask = kSystemMessageClientAll; + } + else + { + _systemMessageClientMask = kSystemMessageClientConfigd; + } + + tracePoint( kIOPMTracePointWakeWillPowerOnClients ); + tellClients(kIOMessageSystemWillPowerOn); } - /* - * Clamshell CLOSED - * Send the clamshell interest notification since the lid is closing. - */ - if (msg & kIOPMClamshellClosed) + if ((kSystemTransitionNone != _systemTransitionType) && + (kSystemTransitionNewCapClient != _systemTransitionType)) { - // Received clamshel open message from clamshell controlling driver - // Update our internal state and tell general interest clients - clamshellIsClosed = true; - clamshellExists = true; + _systemStateGeneration++; + systemDarkWake = false; - // Tell PMCPU - informCPUStateChange(kInformLid, 1); + DLOG("=== START (%u->%u, 0x%x) type %u, gen %u, msg %x, " + "dcp %x:%x:%x\n", + currentPowerState, powerState, *inOutChangeFlags, + _systemTransitionType, _systemStateGeneration, + _systemMessageClientMask, + _desiredCapability, _currentCapability, _pendingCapability); + } +} - // Tell general interest clients - sendClientClamshellNotification(); - - // And set eval_clamshell = so we can attempt - eval_clamshell = true; +void IOPMrootDomain::handleOurPowerChangeDone( + IOService * service, + IOPMActions * actions, + uint32_t powerState, + uint32_t changeFlags ) +{ + if (kSystemTransitionNewCapClient == _systemTransitionType) + { + _systemTransitionType = kSystemTransitionNone; + return; } - /* - * Set Desktop mode (sent from graphics) - * - * -> 
reevaluate lid state - */ - if (msg & kIOPMSetDesktopMode) + if (_systemTransitionType != kSystemTransitionNone) { - desktopMode = (0 != (msg & kIOPMSetValue)); - msg &= ~(kIOPMSetDesktopMode | kIOPMSetValue); + uint32_t currentPowerState = (uint32_t) getPowerState(); - sendClientClamshellNotification(); + if (changeFlags & kIOPMNotDone) + { + // Power down was cancelled or vetoed. + _pendingCapability = _currentCapability; + lastSleepReason = 0; - // Re-evaluate the lid state - if( clamshellIsClosed ) + if (((_currentCapability & kIOPMSystemCapabilityGraphics) == 0) && + (_currentCapability & kIOPMSystemCapabilityCPU)) + { + pmPowerStateQueue->submitPowerEvent( + kPowerEventPolicyStimulus, + (void *) kStimulusDarkWakeReentry, + _systemStateGeneration ); + } + + // Revert device desire to max. + changePowerStateToPriv(ON_STATE); + } + else { - eval_clamshell = true; + // Send message on dark wake to full wake promotion. + // tellChangeUp() handles the normal SLEEP->ON case. + + if (kSystemTransitionCapability == _systemTransitionType) + { + if (CAP_GAIN(kIOPMSystemCapabilityGraphics)) + { + tellClients(kIOMessageSystemHasPoweredOn); +#if DARK_TO_FULL_EVALUATE_CLAMSHELL + // Re-evaluate clamshell state ourselves when graphics + // will not get kIOMessageSystemHasPoweredOn. + + if (clamshellClosed && + ((_systemMessageClientMask & kSystemMessageClientKernel) == 0)) + { + receivePowerNotification( kLocalEvalClamshellCommand ); + } +#endif + } + if (CAP_LOSS(kIOPMSystemCapabilityGraphics)) + wranglerTickled = false; + } + + // Reset state after exiting from dark wake. + + if (CAP_GAIN(kIOPMSystemCapabilityGraphics) || + CAP_LOSS(kIOPMSystemCapabilityCPU)) + { + darkWakeMaintenance = false; + darkWakeToSleepASAP = false; + pciCantSleepValid = false; + rejectWranglerTickle = false; + } + + // Entered dark mode. 
+ + if (((_pendingCapability & kIOPMSystemCapabilityGraphics) == 0) && + (_pendingCapability & kIOPMSystemCapabilityCPU)) + { + if (((gDarkWakeFlags & kDarkWakeFlagIgnoreDiskIOInDark) == 0) && + (kSystemTransitionWake == _systemTransitionType) && + (_debugWakeSeconds == 0)) + { + OSObject * prop = copyProperty(kIOPMRootDomainWakeTypeKey); + if (prop) + { + OSString * wakeType = OSDynamicCast(OSString, prop); + if (wakeType && + wakeType->isEqualTo(kIOPMRootDomainWakeTypeNetwork)) + { + // Woke from network and entered dark wake. + if (darkWakeToSleepASAP) + { + DLOG("cleared darkWakeToSleepASAP\n"); + darkWakeToSleepASAP = false; + } + } + prop->release(); + } + } + + // Queue an evaluation of whether to remain in dark wake, + // and for how long. This serves the purpose of draining + // any assertions from the queue. + + pmPowerStateQueue->submitPowerEvent( + kPowerEventPolicyStimulus, + (void *) kStimulusDarkWakeEntry, + _systemStateGeneration ); + } } - } - - /* - * AC Adaptor connected - * - * -> reevaluate lid state - */ - if (msg & kIOPMSetACAdaptorConnected) - { - acAdaptorConnected = (0 != (msg & kIOPMSetValue)); - msg &= ~(kIOPMSetACAdaptorConnected | kIOPMSetValue); - // Tell CPU PM - informCPUStateChange(kInformAC, !acAdaptorConnected); + DLOG("=== FINISH (%u->%u, 0x%x) type %u, gen %u, msg %x, " + "dcp %x:%x:%x, dbgtimer %u\n", + currentPowerState, powerState, changeFlags, + _systemTransitionType, _systemStateGeneration, + _systemMessageClientMask, + _desiredCapability, _currentCapability, _pendingCapability, + _debugWakeSeconds); - // Tell BSD if AC is connected - // 0 == external power source; 1 == on battery - post_sys_powersource(acAdaptorConnected ? 0:1); + // Update current system capability. - sendClientClamshellNotification(); + if (_currentCapability != _pendingCapability) + _currentCapability = _pendingCapability; - // Re-evaluate the lid state - if( clamshellIsClosed ) + // Update highest system capability. 
+ + if (!CAP_CURRENT(kIOPMSystemCapabilityCPU)) + _highestCapability = 0; // reset at sleep state + else + _highestCapability |= _currentCapability; + + if (darkWakePostTickle && + (kSystemTransitionWake == _systemTransitionType) && + (gDarkWakeFlags & kDarkWakeFlagHIDTickleMask) == + kDarkWakeFlagHIDTickleLate) { - eval_clamshell = true; + darkWakePostTickle = false; + reportUserInput(); } - } - - /* - * Enable Clamshell (external display disappear) - * - * -> reevaluate lid state - */ - if (msg & kIOPMEnableClamshell) - { - // Re-evaluate the lid state - // System should sleep on external display disappearance - // in lid closed operation. - if( clamshellIsClosed && (true == ignoringClamshell) ) + + // Reset tracepoint at completion of capability change, + // completion of wake transition, and aborted sleep transition. + + if ((_systemTransitionType == kSystemTransitionCapability) || + (_systemTransitionType == kSystemTransitionWake) || + ((_systemTransitionType == kSystemTransitionSleep) && + (changeFlags & kIOPMNotDone))) { - eval_clamshell = true; + setProperty(kIOPMSystemCapabilitiesKey, _currentCapability, 64); + tracePoint( kIOPMTracePointSystemUp, 0 ); } - ignoringClamshell = false; + _systemTransitionType = kSystemTransitionNone; + _systemMessageClientMask = 0; - sendClientClamshellNotification(); + logGraphicsClamp = false; } - - /* - * Disable Clamshell (external display appeared) - * We don't bother re-evaluating clamshell state. If the system is awake, - * the lid is probably open. - */ - if (msg & kIOPMDisableClamshell) - { - ignoringClamshell = true; +} - sendClientClamshellNotification(); +//****************************************************************************** +// PM actions for graphics and audio. 
+//****************************************************************************** + +void IOPMrootDomain::overridePowerChangeForUIService( + IOService * service, + IOPMActions * actions, + unsigned long * inOutPowerState, + uint32_t * inOutChangeFlags ) +{ + uint32_t powerState = (uint32_t) *inOutPowerState; + uint32_t changeFlags = (uint32_t) *inOutChangeFlags; + + if (kSystemTransitionNone == _systemTransitionType) + { + // Not in midst of a system transition. + // Do not modify power limit enable state. } + else if ((actions->parameter & kPMActionsFlagLimitPower) == 0) + { + // Activate power limiter. - /* - * Evaluate clamshell and SLEEP if appropiate - */ - if ( eval_clamshell && shouldSleepOnClamshellClosed() ) + if ((actions->parameter & kPMActionsFlagIsDisplayWrangler) && + ((_pendingCapability & kIOPMSystemCapabilityGraphics) == 0)) + { + actions->parameter |= kPMActionsFlagLimitPower; + } + else if ((actions->parameter & kPMActionsFlagIsAudioDevice) && + ((_pendingCapability & kIOPMSystemCapabilityAudio) == 0)) + { + actions->parameter |= kPMActionsFlagLimitPower; + } + else if ((actions->parameter & kPMActionsFlagIsGraphicsDevice) && + (_systemTransitionType == kSystemTransitionSleep)) + { + // For graphics devices, arm the limiter when entering + // system sleep. Not when dropping to dark wake. + actions->parameter |= kPMActionsFlagLimitPower; + } + + if (actions->parameter & kPMActionsFlagLimitPower) + { + DLOG("+ plimit %s %p\n", + service->getName(), service); + } + } + else { + // Remove power limit. + if ((actions->parameter & ( + kPMActionsFlagIsDisplayWrangler | + kPMActionsFlagIsGraphicsDevice )) && + (_pendingCapability & kIOPMSystemCapabilityGraphics)) + { + actions->parameter &= ~kPMActionsFlagLimitPower; + } + else if ((actions->parameter & kPMActionsFlagIsAudioDevice) && + (_pendingCapability & kIOPMSystemCapabilityAudio)) + { + actions->parameter &= ~kPMActionsFlagLimitPower; + } - // SLEEP! 
- privateSleepSystem (kIOPMSleepReasonClamshell); + if ((actions->parameter & kPMActionsFlagLimitPower) == 0) + { + DLOG("- plimit %s %p\n", + service->getName(), service); + } } - /* - * Power Button - */ - if (msg & kIOPMPowerButton) + if (actions->parameter & kPMActionsFlagLimitPower) { - // toggle state of sleep/wake - // are we dozing? - if ( getPowerState() == DOZE_STATE ) + uint32_t maxPowerState = (uint32_t)(-1); + + if (changeFlags & (kIOPMDomainDidChange | kIOPMDomainWillChange)) { -#ifndef __LP64__ - // yes, tell the tree we're waking - systemWake(); -#endif - // wake the Display Wrangler - reportUserInput(); + // Enforce limit for system power/cap transitions. + + maxPowerState = 0; + if (actions->parameter & kPMActionsFlagIsDisplayWrangler) + { + // Forces a 3->1 transition sequence + if (changeFlags & kIOPMDomainWillChange) + maxPowerState = 3; + else + maxPowerState = 1; + } + } + else + { + // Deny all self-initiated changes when power is limited. + // Wrangler tickle should never defeat the limiter. 
+ + maxPowerState = service->getPowerState(); + } + + if (powerState > maxPowerState) + { + DLOG("> plimit %s %p (%u->%u, 0x%x)\n", + service->getName(), service, powerState, maxPowerState, + changeFlags); + *inOutPowerState = maxPowerState; + + if (darkWakePostTickle && + (actions->parameter & kPMActionsFlagIsDisplayWrangler) && + (changeFlags & kIOPMDomainWillChange) && + ((gDarkWakeFlags & kDarkWakeFlagHIDTickleMask) == + kDarkWakeFlagHIDTickleEarly)) + { + darkWakePostTickle = false; + reportUserInput(); + } } - else { - OSString *pbs = OSString::withCString("DisablePowerButtonSleep"); - // Check that power button sleep is enabled - if( pbs ) { - if( kOSBooleanTrue != getProperty(pbs)) - privateSleepSystem (kIOPMSleepReasonPowerButton); + + if (!graphicsSuppressed && (changeFlags & kIOPMDomainDidChange)) + { + if (logGraphicsClamp) + { + AbsoluteTime now; + uint64_t nsec; + + clock_get_uptime(&now); + SUB_ABSOLUTETIME(&now, &systemWakeTime); + absolutetime_to_nanoseconds(now, &nsec); + MSG("Graphics suppressed %u ms\n", + ((int)((nsec) / 1000000ULL))); } + graphicsSuppressed = true; } } +} - /* - * Allow Sleep - * - */ - if ( (msg & kIOPMAllowSleep) && !allowSleep ) +void IOPMrootDomain::handleActivityTickleForDisplayWrangler( + IOService * service, + IOPMActions * actions ) +{ + // Warning: Not running in PM work loop context - don't modify state !!! + // Trap tickle directed to IODisplayWrangler while running with graphics + // capability suppressed. + + assert(service == wrangler); + + if (service == wrangler) { - allowSleep = true; - adjustPowerState(); + bool aborting = ((lastSleepReason == kIOPMSleepReasonIdle) + || (lastSleepReason == kIOPMSleepReasonMaintenance)); + if (aborting) { + userActivityCount++; + DLOG("display wrangler tickled1 %d lastSleepReason %d\n", userActivityCount, lastSleepReason); + } } - /* - * Prevent Sleep - * - */ - if (msg & kIOPMPreventSleep) { - allowSleep = false; - // are we dozing? 
- if ( getPowerState() == DOZE_STATE ) { -#ifndef __LP64__ - // yes, tell the tree we're waking - systemWake(); -#endif - adjustPowerState(); - // wake the Display Wrangler - reportUserInput(); - } else { - adjustPowerState(); - // make sure we have power to clamp - patriarch->wakeSystem(); + if (!wranglerTickled && !lowBatteryCondition && + ((_pendingCapability & kIOPMSystemCapabilityGraphics) == 0)) + { + DLOG("display wrangler tickled\n"); + if (kIOLogPMRootDomain & gIOKitDebug) + OSReportWithBacktrace("Dark wake display tickle"); + if (pmPowerStateQueue) + { + pmPowerStateQueue->submitPowerEvent( + kPowerEventPolicyStimulus, + (void *) kStimulusDarkWakeActivityTickle ); } } } - //****************************************************************************** -// getSleepSupported -// +// Approve usage of delayed child notification by PM. //****************************************************************************** -IOOptionBits IOPMrootDomain::getSleepSupported( void ) +bool IOPMrootDomain::shouldDelayChildNotification( + IOService * service ) { - return( platformSleepSupport ); + if (((gDarkWakeFlags & kDarkWakeFlagHIDTickleMask) != 0) && + !wranglerTickled && + (kSystemTransitionWake == _systemTransitionType)) + { + DLOG("%s: delay child notify\n", service->getName()); + return true; + } + return false; } - //****************************************************************************** -// setSleepSupported -// +// PM actions for PCI device. 
//****************************************************************************** -void IOPMrootDomain::setSleepSupported( IOOptionBits flags ) +void IOPMrootDomain::handlePowerChangeStartForPCIDevice( + IOService * service, + IOPMActions * actions, + uint32_t powerState, + uint32_t * inOutChangeFlags ) { - DLOG("setSleepSupported(%x)\n", (uint32_t) flags); - OSBitOrAtomic(flags, &platformSleepSupport); + pmTracer->tracePCIPowerChange( + PMTraceWorker::kPowerChangeStart, + service, *inOutChangeFlags, + (actions->parameter & kPMActionsPCIBitNumberMask)); } +void IOPMrootDomain::handlePowerChangeDoneForPCIDevice( + IOService * service, + IOPMActions * actions, + uint32_t powerState, + uint32_t changeFlags ) +{ + pmTracer->tracePCIPowerChange( + PMTraceWorker::kPowerChangeCompleted, + service, changeFlags, + (actions->parameter & kPMActionsPCIBitNumberMask)); +} //****************************************************************************** -// requestPowerDomainState -// -// The root domain intercepts this call to the superclass. -// Called on the PM work loop thread. +// registerInterest // -// If the clamp bit is not set in the desire, then the child doesn't need the power -// state it's requesting; it just wants it. The root ignores desires but not needs. -// If the clamp bit is not set, the root takes it that the child can tolerate no -// power and interprets the request accordingly. If all children can thus tolerate -// no power, we are on our way to idle sleep. +// Override IOService::registerInterest() to intercept special clients. 
//****************************************************************************** -IOReturn IOPMrootDomain::requestPowerDomainState ( - IOPMPowerFlags desiredFlags, - IOPowerConnection * whichChild, - unsigned long specification ) +IONotifier * IOPMrootDomain::registerInterest( + const OSSymbol * typeOfInterest, + IOServiceInterestHandler handler, + void * target, void * ref ) { - OSIterator *iter; - OSObject *next; - IOPowerConnection *connection; - IOPMPowerFlags powerRequestFlag = 0; - IOPMPowerFlags editedDesire; + IONotifier * notifier; + bool isSystemCapabilityClient; + bool isKernelCapabilityClient; - ASSERT_GATED(); + isSystemCapabilityClient = + typeOfInterest && + typeOfInterest->isEqualTo(kIOPMSystemCapabilityInterest); + + isKernelCapabilityClient = + typeOfInterest && + typeOfInterest->isEqualTo(gIOPriorityPowerStateInterest); - if (kIOLogPMRootDomain & gIOKitDebug) + if (isSystemCapabilityClient) + typeOfInterest = gIOAppPowerStateInterest; + + notifier = super::registerInterest(typeOfInterest, handler, target, ref); + if (notifier && pmPowerStateQueue) { - IOService * powerChild = - (IOService *) whichChild->getChildEntry(gIOPowerPlane); - DLOG("child %p, flags %lx, spec %lx - %s\n", - powerChild, desiredFlags, specification, - powerChild ? powerChild->getName() : "?"); + if (isSystemCapabilityClient) + { + notifier->retain(); + if (pmPowerStateQueue->submitPowerEvent( + kPowerEventRegisterSystemCapabilityClient, notifier) == false) + notifier->release(); + } + + if (isKernelCapabilityClient) + { + notifier->retain(); + if (pmPowerStateQueue->submitPowerEvent( + kPowerEventRegisterKernelCapabilityClient, notifier) == false) + notifier->release(); + } } - // Force the child's input power requirements to 0 unless the prevent - // idle-sleep flag is set. No input power flags map to our state 0. - // Our power clamp (deviceDesire) keeps the minimum power state at 2. 
+ return notifier; +} - if (desiredFlags & kIOPMPreventIdleSleep) - editedDesire = kIOPMPreventIdleSleep | kIOPMPowerOn; - else - editedDesire = 0; +//****************************************************************************** +// systemMessageFilter +// +//****************************************************************************** - // Recompute sleep supported flag (doze if not supported) - sleepIsSupported = true; +bool IOPMrootDomain::systemMessageFilter( + void * object, void * arg1, void * arg2, void * arg3 ) +{ + const IOPMInterestContext * context = (const IOPMInterestContext *) arg1; + bool isCapMsg = (context->messageType == kIOMessageSystemCapabilityChange); + bool isCapClient = false; + bool allow = false; - iter = getChildIterator(gIOPowerPlane); - if ( iter ) - { - while ( (next = iter->getNextObject()) ) + do { + if ((kSystemTransitionNewCapClient == _systemTransitionType) && + (!isCapMsg || !_joinedCapabilityClients || + !_joinedCapabilityClients->containsObject((OSObject *) object))) + break; + + // Capability change message for app and kernel clients. + + if (isCapMsg) { - if ( (connection = OSDynamicCast(IOPowerConnection, next)) ) - { - // Ignore child that are in the process of joining. - if (connection->getReadyFlag() == false) - continue; + if ((context->notifyType == kNotifyPriority) || + (context->notifyType == kNotifyCapabilityChangePriority)) + isCapClient = true; - // Is this connection attached to the child that called - // requestPowerDomainState()? + if ((context->notifyType == kNotifyCapabilityChangeApps) && + (object == (void *) systemCapabilityNotifier)) + isCapClient = true; + } - if (connection == whichChild) - { - // OR in the child's input power requirements. 
- powerRequestFlag |= editedDesire; + if (isCapClient) + { + IOPMSystemCapabilityChangeParameters * capArgs = + (IOPMSystemCapabilityChangeParameters *) arg2; - if ( desiredFlags & kIOPMPreventSystemSleep ) - sleepIsSupported = false; - } + if (kSystemTransitionNewCapClient == _systemTransitionType) + { + capArgs->fromCapabilities = 0; + capArgs->toCapabilities = _currentCapability; + capArgs->changeFlags = 0; + } + else + { + capArgs->fromCapabilities = _currentCapability; + capArgs->toCapabilities = _pendingCapability; + + if (context->isPreChange) + capArgs->changeFlags = kIOPMSystemCapabilityWillChange; else - { - if (kIOLogPMRootDomain & gIOKitDebug) - { - IOService * powerChild = - (IOService *) connection->getChildEntry(gIOPowerPlane); - DLOG("child %p, state %ld, noIdle %d, noSleep %d - %s\n", - powerChild, - connection->getDesiredDomainState(), - connection->getPreventIdleSleepFlag(), - connection->getPreventSystemSleepFlag(), - powerChild ? powerChild->getName() : "?"); - } + capArgs->changeFlags = kIOPMSystemCapabilityDidChange; + } - // OR in the child's desired power state (0 or ON_STATE). - powerRequestFlag |= connection->getDesiredDomainState(); + // Capability change messages only go to the PM configd plugin. + // Wait for response post-change if capabilitiy is increasing. + // Wait for response pre-change if capability is decreasing. - if ( connection->getPreventSystemSleepFlag() ) - sleepIsSupported = false; - } + if ((context->notifyType == kNotifyCapabilityChangeApps) && arg3 && + ( (capabilityLoss && context->isPreChange) || + (!capabilityLoss && !context->isPreChange) ) ) + { + // app has not replied yet, wait for it + *((OSObject **) arg3) = kOSBooleanFalse; } + + allow = true; + break; } - iter->release(); - } - DLOG("childPowerFlags 0x%lx, extraSleepDelay %ld\n", - powerRequestFlag, extraSleepDelay); + // Capability client will always see kIOMessageCanSystemSleep, + // even for demand sleep. 
- if ( !powerRequestFlag && !systemBooting ) - { - if (!wrangler) + if ((kIOMessageCanSystemSleep == context->messageType) || + (kIOMessageSystemWillNotSleep == context->messageType)) { - sleepASAP = false; - changePowerStateToPriv(ON_STATE); - if (idleSeconds) + if (object == (OSObject *) systemCapabilityNotifier) { - // stay awake for at least idleSeconds - startIdleSleepTimer(idleSeconds); + allow = true; + break; + } + + // Not idle sleep, don't ask apps. + if (context->changeFlags & kIOPMSkipAskPowerDown) + { + break; } } - else if (!extraSleepDelay && !idleSleepTimerPending) + + // Reject capability change messages for legacy clients. + // Reject legacy system sleep messages for capability client. + + if (isCapMsg || (object == (OSObject *) systemCapabilityNotifier)) { - sleepASAP = true; + break; } - } - - // Drop our power clamp to SLEEP_STATE when all children became idle, - // and the system sleep and display sleep values are equal. - adjustPowerState(); + // Filter system sleep messages. - // If our power clamp has already dropped to SLEEP_STATE, and no child - // is keeping us at ON_STATE, then this will trigger idle sleep. 
+ if ((context->notifyType == kNotifyApps) && + (_systemMessageClientMask & kSystemMessageClientApp)) + { + allow = true; + } + else if ((context->notifyType == kNotifyPriority) && + (_systemMessageClientMask & kSystemMessageClientKernel)) + { + allow = true; + } + } + while (false); - editedDesire |= (desiredFlags & kIOPMPreventSystemSleep); + if (allow && isCapMsg && _joinedCapabilityClients) + { + _joinedCapabilityClients->removeObject((OSObject *) object); + if (_joinedCapabilityClients->getCount() == 0) + { + DLOG("destroyed capability client set %p\n", + _joinedCapabilityClients); + _joinedCapabilityClients->release(); + _joinedCapabilityClients = 0; + } + } - return super::requestPowerDomainState( - editedDesire, whichChild, specification); + return allow; } - //****************************************************************************** -// handlePlatformHaltRestart +// setMaintenanceWakeCalendar // //****************************************************************************** -struct HaltRestartApplierContext { - IOPMrootDomain * RootDomain; - unsigned long PowerState; - IOPMPowerFlags PowerFlags; - UInt32 MessageType; - UInt32 Counter; -}; - -static void -platformHaltRestartApplier( OSObject * object, void * context ) +IOReturn IOPMrootDomain::setMaintenanceWakeCalendar( + const IOPMCalendarStruct * calendar ) { - IOPowerStateChangeNotification notify; - HaltRestartApplierContext * ctx; - AbsoluteTime startTime; - UInt32 deltaTime; + OSData * data; + IOReturn ret; - ctx = (HaltRestartApplierContext *) context; - - memset(&notify, 0, sizeof(notify)); - notify.powerRef = (void *)ctx->Counter; - notify.returnValue = 0; - notify.stateNumber = ctx->PowerState; - notify.stateFlags = ctx->PowerFlags; + if (!calendar) + return kIOReturnBadArgument; + + data = OSData::withBytesNoCopy((void *) calendar, sizeof(*calendar)); + if (!data) + return kIOReturnNoMemory; + + ret = setPMSetting(gIOPMSettingMaintenanceWakeCalendarKey, data); - clock_get_uptime(&startTime);
- ctx->RootDomain->messageClient( ctx->MessageType, object, (void *)&notify ); - deltaTime = computeDeltaTimeMS(&startTime); + data->release(); + return ret; +} - if ((deltaTime > kPMHaltTimeoutMS) || (gIOKitDebug & kIOLogDebugPower)) - { - _IOServiceInterestNotifier * notifier; - notifier = OSDynamicCast(_IOServiceInterestNotifier, object); +// MARK: - +// MARK: Display Wrangler - // IOService children of IOPMrootDomain are not instrumented. - // Only IORootParent currently falls under that group. +//****************************************************************************** +// displayWranglerNotification +// +// Handle the notification when the IODisplayWrangler changes power state. +//****************************************************************************** - if (notifier) - { - KLOG("%s handler %p took %u ms\n", - (ctx->MessageType == kIOMessageSystemWillPowerOff) ? - "PowerOff" : "Restart", - notifier->handler, (uint32_t) deltaTime ); - } - } +IOReturn IOPMrootDomain::displayWranglerNotification( + void * target, void * refCon, + UInt32 messageType, IOService * service, + void * messageArgument, vm_size_t argSize ) +{ +#if !NO_KERNEL_HID + int displayPowerState; + IOPowerStateChangeNotification * params = + (IOPowerStateChangeNotification *) messageArgument; - ctx->Counter++; -} + if ((messageType != kIOMessageDeviceWillPowerOff) && + (messageType != kIOMessageDeviceHasPoweredOn)) + return kIOReturnUnsupported; -void IOPMrootDomain::handlePlatformHaltRestart( UInt32 pe_type ) -{ - HaltRestartApplierContext ctx; - AbsoluteTime startTime; - UInt32 deltaTime; + ASSERT_GATED(); + if (!gRootDomain) + return kIOReturnUnsupported; - memset(&ctx, 0, sizeof(ctx)); - ctx.RootDomain = this; + displayPowerState = params->stateNumber; + DLOG("DisplayWrangler message 0x%x, power state %d\n", + (uint32_t) messageType, displayPowerState); - clock_get_uptime(&startTime); - switch (pe_type) - { - case kPEHaltCPU: - case kPEUPSDelayHaltCPU: - ctx.PowerState = OFF_STATE; -
ctx.MessageType = kIOMessageSystemWillPowerOff; - break; + switch (messageType) { + case kIOMessageDeviceWillPowerOff: - case kPERestartCPU: - ctx.PowerState = RESTART_STATE; - ctx.MessageType = kIOMessageSystemWillRestart; - break; + // Display wrangler has dropped power due to display idle + // or force system sleep. + // + // 4 Display ON + // 3 Display Dim + // 2 Display Sleep + // 1 Not visible to user + // 0 Not visible to user - default: - return; - } + if (displayPowerState > 2) + break; + + gRootDomain->evaluatePolicy( kStimulusDisplayWranglerSleep ); + break; - // Notify legacy clients - applyToInterested(gIOPriorityPowerStateInterest, platformHaltRestartApplier, &ctx); + case kIOMessageDeviceHasPoweredOn: - // For normal shutdown, turn off File Server Mode. - if (kPEHaltCPU == pe_type) - { - const OSSymbol * setting = OSSymbol::withCString(kIOPMSettingRestartOnPowerLossKey); - OSNumber * num = OSNumber::withNumber((unsigned long long) 0, 32); - if (setting && num) - { - setPMSetting(setting, num); - setting->release(); - num->release(); - } - } + // Display wrangler has powered on due to user activity + // or wake from sleep. - // Notify in power tree order - notifySystemShutdown(this, ctx.MessageType); + if ( 4 != displayPowerState ) + break; - deltaTime = computeDeltaTimeMS(&startTime); - KLOG("%s all drivers took %u ms\n", - (ctx.MessageType == kIOMessageSystemWillPowerOff) ? - "PowerOff" : "Restart", - (uint32_t) deltaTime ); + gRootDomain->evaluatePolicy( kStimulusDisplayWranglerWake ); + break; + } +#endif + return kIOReturnUnsupported; } - -//****************************************************************************** -// registerInterest +//********************************************************************************* +// displayWranglerMatchPublished // +// Receives a notification when the IODisplayWrangler is published. +// When it's published we install a power state change handler. 
//****************************************************************************** -IONotifier * IOPMrootDomain::registerInterest( - const OSSymbol * typeOfInterest, - IOServiceInterestHandler handler, - void * target, void * ref ) +bool IOPMrootDomain::displayWranglerMatchPublished( + void * target, + void * refCon, + IOService * newService, + IONotifier * notifier __unused) { - IONotifier * notifier; - bool isConfigd; - - isConfigd = typeOfInterest && - typeOfInterest->isEqualTo(kIOPMPrivilegedPowerInterest); - - if (isConfigd) - typeOfInterest = gIOAppPowerStateInterest; - - notifier = super::registerInterest(typeOfInterest, handler, target, ref); - -#if ROOT_DOMAIN_RUN_STATES - if (isConfigd && notifier && pmPowerStateQueue) +#if !NO_KERNEL_HID + // found the display wrangler, now install a handler + if( !newService->registerInterest( gIOGeneralInterest, + &displayWranglerNotification, target, 0) ) { - notifier->retain(); - if (pmPowerStateQueue->submitPowerEvent( - kPowerEventConfigdRegisteredInterest, notifier) == false) - notifier->release(); + return false; } #endif - - return notifier; + return true; } -static bool clientMessageFilter( OSObject * object, void * arg ) +//****************************************************************************** +// reportUserInput +// +//****************************************************************************** + +void IOPMrootDomain::reportUserInput( void ) { -#if ROOT_DOMAIN_RUN_STATES -#if LOG_INTEREST_CLIENTS - IOPMInterestContext * context = (IOPMInterestContext *) arg; -#endif - bool allow = false; +#if !NO_KERNEL_HID + OSIterator * iter; - switch (gMessageClientType) + if(!wrangler) { - case kMessageClientNone: - allow = false; - break; - - case kMessageClientAll: - allow = true; - break; - - case kMessageClientConfigd: - allow = ((object == (OSObject *) gConfigdNotifier) || - (object == (OSObject *) gSysPowerDownNotifier)); - break; + iter = getMatchingServices(serviceMatching("IODisplayWrangler")); + if(iter) 
+ { + wrangler = (IOService *) iter->getNextObject(); + iter->release(); + } } -#if LOG_INTEREST_CLIENTS - if (allow) - DLOG("system message %x to %p\n", - context->msgType, object); -#endif - - return allow; -#else - return true; + if(wrangler) + wrangler->activityTickle(0,0); #endif } +// MARK: - +// MARK: Battery //****************************************************************************** -// tellChangeDown +// batteryPublished // -// We override the superclass implementation so we can send a different message -// type to the client or application being notified. +// Notification on battery class IOPowerSource appearance //****************************************************************************** -bool IOPMrootDomain::tellChangeDown( unsigned long stateNum ) -{ - bool done; +bool IOPMrootDomain::batteryPublished( + void * target, + void * root_domain, + IOService * resourceService, + IONotifier * notifier __unused ) +{ + // rdar://2936060&4435589 + // All laptops have dimmable LCD displays + // All laptops have batteries + // So if this machine has a battery, publish the fact that the backlight + // supports dimming. + ((IOPMrootDomain *)root_domain)->publishFeature("DisplayDims"); - DLOG("tellChangeDown %u->%u, R-state %u\n", - (uint32_t) getPowerState(), (uint32_t) stateNum, runStateIndex); + return (true); +} - switch ( stateNum ) { - case DOZE_STATE: - case SLEEP_STATE: +// MARK: - +// MARK: System PM Policy - if (!ignoreChangeDown) - { - userActivityAtSleep = userActivityCount; - hibernateAborted = false; - DLOG("tellChangeDown::userActivityAtSleep %d\n", userActivityAtSleep); +//****************************************************************************** +// checkSystemCanSleep +// +//****************************************************************************** - // Direct callout into OSKext so it can disable kext unloads - // during sleep/wake to prevent deadlocks. 
- OSKextSystemSleepOrWake( kIOMessageSystemWillSleep ); +bool IOPMrootDomain::checkSystemCanSleep( IOOptionBits options ) +{ + int err = 0; - if ( (SLEEP_STATE == stateNum) && sleepSupportedPEFunction ) - { - // Reset PCI prevent sleep flag before calling platform driver. - OSBitAndAtomic(~kPCICantSleep, &platformSleepSupport); + // Conditions that prevent idle and demand system sleep. - // Skip PCI check for maintenance sleep. - if ((runStateFlags & kRStateFlagSuppressPCICheck) == 0) - { - // Determine if the machine supports sleep, or must doze. - getPlatform()->callPlatformFunction( - sleepSupportedPEFunction, false, - NULL, NULL, NULL, NULL); - } + do { + if (userDisabledAllSleep) + { + err = 1; // 1. user-space sleep kill switch + break; + } - // If the machine only supports doze, the callPlatformFunction call - // boils down to IOPMrootDomain::setSleepSupported(kPCICantSleep), - // otherwise nothing. - } + if (systemBooting || systemShutdown) + { + err = 2; // 2. restart or shutdown in progress + break; + } - // Notify platform that sleep has begun - getPlatform()->callPlatformFunction( - sleepMessagePEFunction, false, - (void *)(uintptr_t) kIOMessageSystemWillSleep, - NULL, NULL, NULL); + if (options == 0) + break; - // Update canSleep and kIOSleepSupportedKey property so drivers - // can tell if platform is going to sleep versus doze. + // Conditions above pegs the system at full wake. + // Conditions below prevent system sleep but does not prevent + // dark wake, and must be called from gated context. -#if CONFIG_SLEEP - canSleep = true; -#else - canSleep = false; +#if !CONFIG_SLEEP + err = 3; // 3. config does not support sleep + break; #endif - if (!sleepIsSupported) - canSleep = false; - if (platformSleepSupport & kPCICantSleep) - canSleep = false; - setProperty(kIOSleepSupportedKey, canSleep); - DLOG("canSleep %d\n", canSleep); - - // Publish the new sleep-wake UUID - publishSleepWakeUUID(true); - - // Two change downs are sent by IOServicePM. 
Ignore the 2nd. - ignoreChangeDown = true; - - tracePoint( kIOPMTracePointSystemSleepAppsPhase); - } - DLOG("kIOMessageSystemWillSleep (%d)\n", gMessageClientType); - done = super::tellClientsWithResponse( - kIOMessageSystemWillSleep, clientMessageFilter); - break; + if (lowBatteryCondition) + { + break; // always sleep on low battery + } - default: - done = super::tellChangeDown(stateNum); + if (childPreventSystemSleep) + { + err = 4; // 4. child prevent system sleep clamp break; - } - return done; -} - + } -//****************************************************************************** -// askChangeDown -// -// We override the superclass implementation so we can send a different message -// type to the client or application being notified. -// -// This must be idle sleep since we don't ask during any other power change. -//****************************************************************************** + if (getPMAssertionLevel( kIOPMDriverAssertionCPUBit ) == + kIOPMDriverAssertionLevelOn) + { + err = 5; // 5. CPU assertion + break; + } -bool IOPMrootDomain::askChangeDown( unsigned long stateNum ) -{ - DLOG("askChangeDown %u->%u, R-state %u\n", - (uint32_t) getPowerState(), (uint32_t) stateNum, runStateIndex); - DLOG("kIOMessageCanSystemSleep (%d)\n", gMessageClientType); + if (pciCantSleepValid) + { + if (pciCantSleepFlag) + err = 6; // 6. PCI card does not support PM (cached) + break; + } + else if (sleepSupportedPEFunction && + CAP_HIGHEST(kIOPMSystemCapabilityGraphics)) + { + IOReturn ret; + OSBitAndAtomic(~kPCICantSleep, &platformSleepSupport); + ret = getPlatform()->callPlatformFunction( + sleepSupportedPEFunction, false, + NULL, NULL, NULL, NULL); + pciCantSleepValid = true; + pciCantSleepFlag = false; + if ((platformSleepSupport & kPCICantSleep) || + ((ret != kIOReturnSuccess) && (ret != kIOReturnUnsupported))) + { + err = 6; // 6. 
PCI card does not support PM + pciCantSleepFlag = true; + break; + } + } + } + while (false); - return super::tellClientsWithResponse( - kIOMessageCanSystemSleep, - clientMessageFilter); + if (err) + { + DLOG("System sleep prevented by %d\n", err); + return false; + } + return true; } - //****************************************************************************** -// tellNoChangeDown -// -// Notify registered applications and kernel clients that we are not dropping -// power. -// -// We override the superclass implementation so we can send a different message -// type to the client or application being notified. +// adjustPowerState // -// This must be a vetoed idle sleep, since no other power change can be vetoed. +// Conditions that affect our wake/sleep decision has changed. +// If conditions dictate that the system must remain awake, clamp power +// state to max with changePowerStateToPriv(ON). Otherwise if sleepASAP +// is TRUE, then remove the power clamp and allow the power state to drop +// to SLEEP_STATE. //****************************************************************************** -void IOPMrootDomain::tellNoChangeDown( unsigned long stateNum ) +void IOPMrootDomain::adjustPowerState( bool sleepASAP ) { - DLOG("tellNoChangeDown %u->%u, R-state %u\n", - (uint32_t) getPowerState(), (uint32_t) stateNum, runStateIndex); + DLOG("adjustPowerState ps %u, asap %d, slider %ld\n", + (uint32_t) getPowerState(), sleepASAP, sleepSlider); - // Sleep canceled, clear the sleep trace point. 
- tracePoint(kIOPMTracePointSystemUp); + ASSERT_GATED(); - if (idleSeconds && !wrangler) + if ((sleepSlider == 0) || !checkSystemCanSleep()) { - // stay awake for at least idleSeconds - sleepASAP = false; - startIdleSleepTimer(idleSeconds); + changePowerStateToPriv(ON_STATE); + } + else if ( sleepASAP ) + { + changePowerStateToPriv(SLEEP_STATE); } - DLOG("kIOMessageSystemWillNotSleep (%d)\n", gMessageClientType); - return tellClients(kIOMessageSystemWillNotSleep, clientMessageFilter); } - //****************************************************************************** -// tellChangeUp -// -// Notify registered applications and kernel clients that we are raising power. +// dispatchPowerEvent // -// We override the superclass implementation so we can send a different message -// type to the client or application being notified. +// IOPMPowerStateQueue callback function. Running on PM work loop thread. //****************************************************************************** -void IOPMrootDomain::tellChangeUp( unsigned long stateNum ) +void IOPMrootDomain::dispatchPowerEvent( + uint32_t event, void * arg0, uint64_t arg1 ) { - OSData *publishPMStats = NULL; - - DLOG("tellChangeUp %u->%u, R-state %u\n", - (uint32_t) getPowerState(), (uint32_t) stateNum, runStateIndex); - - ignoreChangeDown = false; + DLOG("power event %u args %p 0x%llx\n", event, arg0, arg1); + ASSERT_GATED(); - if ( stateNum == ON_STATE ) + switch (event) { - // Direct callout into OSKext so it can disable kext unloads - // during sleep/wake to prevent deadlocks. - OSKextSystemSleepOrWake( kIOMessageSystemHasPoweredOn ); - - // Notify platform that sleep was cancelled or resumed. 
- getPlatform()->callPlatformFunction( - sleepMessagePEFunction, false, - (void *)(uintptr_t) kIOMessageSystemHasPoweredOn, - NULL, NULL, NULL); + case kPowerEventFeatureChanged: + messageClients(kIOPMMessageFeatureChange, this); + break; - if (getPowerState() == ON_STATE) - { - // this is a quick wake from aborted sleep - if (idleSeconds && !wrangler) + case kPowerEventReceivedPowerNotification: + handlePowerNotification( (UInt32)(uintptr_t) arg0 ); + break; + + case kPowerEventSystemBootCompleted: + if (systemBooting) { - // stay awake for at least idleSeconds - sleepASAP = false; - startIdleSleepTimer(idleSeconds); - } - DLOG("kIOMessageSystemWillPowerOn (%d)\n", gMessageClientType); - tellClients(kIOMessageSystemWillPowerOn, clientMessageFilter); - } -#if HIBERNATION - else - { - IOHibernateSystemPostWake(); - } -#endif - - tracePoint(kIOPMTracePointSystemWakeAppsPhase); - publishPMStats = OSData::withBytes(&pmStats, sizeof(pmStats)); - setProperty(kIOPMSleepStatisticsKey, publishPMStats); - publishPMStats->release(); - bzero(&pmStats, sizeof(pmStats)); + systemBooting = false; - if (pmStatsAppResponses) - { - setProperty(kIOPMSleepStatisticsAppsKey, pmStatsAppResponses); - pmStatsAppResponses->release(); - pmStatsAppResponses = OSArray::withCapacity(5); - } - - DLOG("kIOMessageSystemHasPoweredOn (%d)\n", gMessageClientType); - tellClients(kIOMessageSystemHasPoweredOn, clientMessageFilter); + // If lid is closed, re-send lid closed notification + // now that booting is complete. + if ( clamshellClosed ) + { + handlePowerNotification(kLocalEvalClamshellCommand); + } + evaluatePolicy( kStimulusAllowSystemSleepChanged ); + } + break; - tracePoint(kIOPMTracePointSystemUp); - } -} + case kPowerEventSystemShutdown: + if (kOSBooleanTrue == (OSBoolean *) arg0) + { + /* We set systemShutdown = true during shutdown + to prevent sleep at unexpected times while loginwindow is trying + to shutdown apps and while the OS is trying to transition to + complete power of. 
+ + Set to true during shutdown, as soon as loginwindow shows + the "shutdown countdown dialog", through individual app + termination, and through black screen kernel shutdown. + */ + systemShutdown = true; + } else { + /* + A shutdown was initiated, but then the shutdown + was cancelled, clearing systemShutdown to false here. + */ + systemShutdown = false; + } + break; + case kPowerEventUserDisabledSleep: + userDisabledAllSleep = (kOSBooleanTrue == (OSBoolean *) arg0); + break; -//****************************************************************************** -// reportUserInput -// -//****************************************************************************** + case kPowerEventRegisterSystemCapabilityClient: + if (systemCapabilityNotifier) + { + systemCapabilityNotifier->release(); + systemCapabilityNotifier = 0; + } + if (arg0) + { + systemCapabilityNotifier = (IONotifier *) arg0; + systemCapabilityNotifier->retain(); + } + /* intentional fall-through */ -void IOPMrootDomain::reportUserInput( void ) -{ -#if !NO_KERNEL_HID - OSIterator * iter; + case kPowerEventRegisterKernelCapabilityClient: + if (!_joinedCapabilityClients) + _joinedCapabilityClients = OSSet::withCapacity(8); + if (arg0) + { + IONotifier * notify = (IONotifier *) arg0; + if (_joinedCapabilityClients) + { + _joinedCapabilityClients->setObject(notify); + synchronizePowerTree( kIOPMSyncNoChildNotify ); + } + notify->release(); + } + break; - if(!wrangler) - { - iter = getMatchingServices(serviceMatching("IODisplayWrangler")); - if(iter) - { - wrangler = (IOService *) iter->getNextObject(); - iter->release(); - } - } + case kPowerEventPolicyStimulus: + if (arg0) + { + int stimulus = (uintptr_t) arg0; + evaluatePolicy( stimulus, (uint32_t) arg1 ); + } + break; - if(wrangler) - wrangler->activityTickle(0,0); -#endif -} + case kPowerEventAssertionCreate: + if (pmAssertions) { + pmAssertions->handleCreateAssertion((OSData *)arg0); + } + break; 
-//****************************************************************************** -// setQuickSpinDownTimeout -// -//****************************************************************************** + case kPowerEventAssertionRelease: + if (pmAssertions) { + pmAssertions->handleReleaseAssertion(arg1); + } + break; -void IOPMrootDomain::setQuickSpinDownTimeout( void ) -{ - ASSERT_GATED(); - setAggressiveness( - kPMMinutesToSpinDown, 0, kAggressivesOptionQuickSpindownEnable ); + case kPowerEventAssertionSetLevel: + if (pmAssertions) { + pmAssertions->handleSetAssertionLevel(arg1, (IOPMDriverAssertionLevel)(uintptr_t)arg0); + } + break; + + case kPowerEventQueueSleepWakeUUID: + handleQueueSleepWakeUUID((OSObject *)arg0); + break; + case kPowerEventPublishSleepWakeUUID: + handlePublishSleepWakeUUID((bool)arg0); + break; + } } - //****************************************************************************** -// restoreUserSpinDownTimeout +// systemPowerEventOccurred +// +// The power controller is notifying us of a hardware-related power management +// event that we must handle. // +// systemPowerEventOccurred covers the same functionality that +// receivePowerNotification does; it simply provides a richer API for conveying +// more information. 
//****************************************************************************** -void IOPMrootDomain::restoreUserSpinDownTimeout( void ) +IOReturn IOPMrootDomain::systemPowerEventOccurred( + const OSSymbol *event, + uint32_t intValue) { - ASSERT_GATED(); - setAggressiveness( - kPMMinutesToSpinDown, 0, kAggressivesOptionQuickSpindownDisable ); -} + IOReturn attempt = kIOReturnSuccess; + OSNumber *newNumber = NULL; + + if (!event) + return kIOReturnBadArgument; + + newNumber = OSNumber::withNumber(intValue, 8*sizeof(intValue)); + if (!newNumber) + return kIOReturnInternalError; + attempt = systemPowerEventOccurred(event, (OSObject *)newNumber); -//****************************************************************************** -// changePowerStateTo & changePowerStateToPriv -// -// Override of these methods for logging purposes. -//****************************************************************************** + newNumber->release(); -IOReturn IOPMrootDomain::changePowerStateTo( unsigned long ordinal ) -{ - return kIOReturnUnsupported; // ignored + return attempt; } -IOReturn IOPMrootDomain::changePowerStateToPriv( unsigned long ordinal ) +IOReturn IOPMrootDomain::systemPowerEventOccurred( + const OSSymbol *event, + OSObject *value) { - DLOG("changePowerStateToPriv(%lu)\n", ordinal); + OSDictionary *thermalsDict = NULL; + bool shouldUpdate = true; + + if (!event || !value) + return kIOReturnBadArgument; - if ( (getPowerState() == DOZE_STATE) && (ordinal != ON_STATE) ) - { - return kIOReturnSuccess; - } + // LOCK + // We reuse featuresDict Lock because it already exists and guards + // the very infrequently used publish/remove feature mechanism; so there's zero rsk + // of stepping on that lock. 
+ if (featuresDictLock) IOLockLock(featuresDictLock); - if ( (userDisabledAllSleep || systemBooting || systemShutdown) && - (ordinal == SLEEP_STATE) ) - { - DLOG("SLEEP rejected, forced to ON state (UD %d, SB %d, SS %d)\n", - userDisabledAllSleep, systemBooting, systemShutdown); + thermalsDict = (OSDictionary *)getProperty(kIOPMRootDomainPowerStatusKey); + + if (thermalsDict && OSDynamicCast(OSDictionary, thermalsDict)) { + thermalsDict = OSDictionary::withDictionary(thermalsDict); + } else { + thermalsDict = OSDictionary::withCapacity(1); + } - super::changePowerStateToPriv(ON_STATE); + if (!thermalsDict) { + shouldUpdate = false; + goto exit; } - return super::changePowerStateToPriv(ordinal); -} + thermalsDict->setObject (event, value); -//****************************************************************************** -// activity detect -// -//****************************************************************************** + setProperty (kIOPMRootDomainPowerStatusKey, thermalsDict); -bool IOPMrootDomain::activitySinceSleep(void) -{ - return (userActivityCount != userActivityAtSleep); -} + thermalsDict->release(); -bool IOPMrootDomain::abortHibernation(void) -{ - bool ret = activitySinceSleep(); +exit: + // UNLOCK + if (featuresDictLock) IOLockUnlock(featuresDictLock); - if (ret && !hibernateAborted) - { - DLOG("activitySinceSleep ABORT [%d, %d]\n", userActivityCount, userActivityAtSleep); - hibernateAborted = true; - } - return (ret); -} + if (shouldUpdate) + messageClients (kIOPMMessageSystemPowerEventOccurred, (void *)NULL); -extern "C" int -hibernate_should_abort(void) -{ - if (gRootDomain) - return (gRootDomain->abortHibernation()); - else - return (0); + return kIOReturnSuccess; } //****************************************************************************** -// updateRunState +// receivePowerNotification // +// The power controller is notifying us of a hardware-related power management +// event that we must handle. 
This may be a result of an 'environment' interrupt +// from the power mgt micro. //****************************************************************************** -void IOPMrootDomain::updateRunState( uint32_t inRunState ) +IOReturn IOPMrootDomain::receivePowerNotification( UInt32 msg ) { -#if ROOT_DOMAIN_RUN_STATES - if (inRunState < kRStateCount) - { - runStateIndex = nextRunStateIndex = inRunState; - runStateFlags = gRStateFlags[inRunState]; - - setProperty( - kIOPMRootDomainRunStateKey, - (unsigned long long) inRunState, 32); - } -#endif + pmPowerStateQueue->submitPowerEvent( + kPowerEventReceivedPowerNotification, (void *) msg ); + return kIOReturnSuccess; } +void IOPMrootDomain::handlePowerNotification( UInt32 msg ) +{ + bool eval_clamshell = false; -#if ROOT_DOMAIN_RUN_STATES -//****************************************************************************** -// tagPowerPlaneService -// -// Running on PM work loop thread. -//****************************************************************************** + ASSERT_GATED(); -void IOPMrootDomain::tagPowerPlaneService( - IOService * service, - uint32_t * rdFlags ) -{ - *rdFlags = 0; + /* + * Local (IOPMrootDomain only) eval clamshell command + */ + if (msg & kLocalEvalClamshellCommand) + { + eval_clamshell = true; + } - if (service->getProperty("IOPMStrictTreeOrder") || - service->metaCast("IODisplayWrangler") || - OSDynamicCast(OSNumber, - service->getProperty("IOPMUnattendedWakePowerState"))) + /* + * Overtemp + */ + if (msg & kIOPMOverTemp) { - *rdFlags |= kServiceFlagGraphics; - DLOG("tagged device %s %x\n", service->getName(), *rdFlags); + MSG("PowerManagement emergency overtemp signal. Going to sleep!"); + privateSleepSystem (kIOPMSleepReasonThermalEmergency); } - // Locate the first PCI host bridge. - if (!pciHostBridgeDevice && service->metaCast("IOPCIBridge")) + /* + * Sleep Now! 
+ */ + if (msg & kIOPMSleepNow) { - IOService * provider = service->getProvider(); - if (OSDynamicCast(IOPlatformDevice, provider) && - provider->inPlane(gIODTPlane)) - { - pciHostBridgeDevice = provider; - DLOG("PMTrace found PCI host bridge %s->%s\n", - provider->getName(), service->getName()); - } + privateSleepSystem (kIOPMSleepReasonSoftware); + } + + /* + * Power Emergency + */ + if (msg & kIOPMPowerEmergency) + { + lowBatteryCondition = true; + privateSleepSystem (kIOPMSleepReasonLowPower); } - // Tag top-level PCI devices. The order of PMinit() call does not - // change across boots and is used as the PCI bit number. - if (pciHostBridgeDevice && service->metaCast("IOPCIDevice")) + /* + * Clamshell OPEN + */ + if (msg & kIOPMClamshellOpened) { - // Would prefer to check built-in property, but tagPowerPlaneService() - // is called before pciDevice->registerService(). - IORegistryEntry * parent = service->getParentEntry(gIODTPlane); - if ((parent == pciHostBridgeDevice) && service->getProperty("acpi-device")) + // Received clamshel open message from clamshell controlling driver + // Update our internal state and tell general interest clients + clamshellClosed = false; + clamshellExists = true; + + if (msg & kIOPMSetValue) { - int bit = pmTracer->recordTopLevelPCIDevice( service ); - if (bit >= 0) - { - // Save the assigned bit for fast lookup. - bit &= 0xff; - *rdFlags |= (kServiceFlagTopLevelPCI | (bit << 8)); - } - } - } -} + reportUserInput(); + } + // Tell PMCPU + informCPUStateChange(kInformLid, 0); -//****************************************************************************** -// handleActivityTickleForService -// -// Called by IOService::activityTickle() for a tickle that is requesting the -// service to raise power state. Called from driver thread. 
-//****************************************************************************** + // Tell general interest clients + sendClientClamshellNotification(); -void IOPMrootDomain::handleActivityTickleForService( IOService * service, - unsigned long type, - unsigned long currentPowerState, - uint32_t activityTickleCount ) -{ - if ((service == wrangler) -) - { - bool aborting = ((lastSleepReason == kIOPMSleepReasonIdle) + bool aborting = ((lastSleepReason == kIOPMSleepReasonClamshell) + || (lastSleepReason == kIOPMSleepReasonIdle) || (lastSleepReason == kIOPMSleepReasonMaintenance)); if (aborting) userActivityCount++; - DLOG("display wrangler tickled1 %d lastSleepReason %d\n", userActivityCount, lastSleepReason); - } - - // Tickle directed to IODisplayWrangler while graphics is disabled. - // Bring graphics online. + DLOG("clamshell tickled %d lastSleepReason %d\n", userActivityCount, lastSleepReason); + } - if ((!currentPowerState) && - (service == wrangler) && - (runStateIndex > kRStateNormal) && - (false == wranglerTickled) && - (false == lowBatteryCondition)) + /* + * Clamshell CLOSED + * Send the clamshell interest notification since the lid is closing. + */ + if (msg & kIOPMClamshellClosed) { - DLOG("display wrangler tickled\n"); - if (kIOLogPMRootDomain & gIOKitDebug) - OSReportWithBacktrace("Display Tickle"); - wranglerTickled = true; - synchronizePowerTree(); - } -} - -//****************************************************************************** -// handlePowerChangeStartForService -// -// Running on PM work loop thread. 
-//****************************************************************************** + // Received clamshel open message from clamshell controlling driver + // Update our internal state and tell general interest clients + clamshellClosed = true; + clamshellExists = true; -void IOPMrootDomain::handlePowerChangeStartForService( - IOService * service, - uint32_t * rdFlags, - uint32_t newPowerState, - uint32_t changeFlags ) -{ - if (service == this) - { - uint32_t currentPowerState = (uint32_t) getPowerState(); - uint32_t nextRunStateFlags; + // Tell PMCPU + informCPUStateChange(kInformLid, 1); - assert(nextRunStateIndex < kRStateCount); - nextRunStateFlags = gRStateFlags[nextRunStateIndex]; + // Tell general interest clients + sendClientClamshellNotification(); + + // And set eval_clamshell = so we can attempt + eval_clamshell = true; + } - gMessageClientType = kMessageClientNone; + /* + * Set Desktop mode (sent from graphics) + * + * -> reevaluate lid state + */ + if (msg & kIOPMSetDesktopMode) + { + desktopMode = (0 != (msg & kIOPMSetValue)); + msg &= ~(kIOPMSetDesktopMode | kIOPMSetValue); - // Transition towards or away from ON power state. + sendClientClamshellNotification(); - if ((currentPowerState != newPowerState) && - ((ON_STATE == newPowerState) || (ON_STATE == currentPowerState))) + // Re-evaluate the lid state + if( clamshellClosed ) { - if ((runStateFlags & kRStateFlagSuppressMessages) == 0) - gMessageClientType = kMessageClientAll; - else - gMessageClientType = kMessageClientConfigd; + eval_clamshell = true; } + } + + /* + * AC Adaptor connected + * + * -> reevaluate lid state + */ + if (msg & kIOPMSetACAdaptorConnected) + { + acAdaptorConnected = (0 != (msg & kIOPMSetValue)); + msg &= ~(kIOPMSetACAdaptorConnected | kIOPMSetValue); - // Transition caused by deassertion of system notification suppression. 
+ // Tell CPU PM + informCPUStateChange(kInformAC, !acAdaptorConnected); - if ((ON_STATE == newPowerState) && - (ON_STATE == currentPowerState) && - ((runStateFlags ^ nextRunStateFlags) & kRStateFlagSuppressMessages)) - { - gMessageClientType = kMessageClientAll; - } + // Tell BSD if AC is connected + // 0 == external power source; 1 == on battery + post_sys_powersource(acAdaptorConnected ? 0:1); - if (ON_STATE == newPowerState) - { - DLOG("kIOMessageSystemWillPowerOn (%d)\n", - gMessageClientType); - tellClients(kIOMessageSystemWillPowerOn, clientMessageFilter); - } - - if (SLEEP_STATE == newPowerState) + sendClientClamshellNotification(); + + // Re-evaluate the lid state + if( clamshellClosed ) { - tracePoint(kIOPMTracePointSleepStarted); + eval_clamshell = true; } } - if (*rdFlags & kServiceFlagTopLevelPCI) + /* + * Enable Clamshell (external display disappear) + * + * -> reevaluate lid state + */ + if (msg & kIOPMEnableClamshell) { - pmTracer->tracePCIPowerChange( - PMTraceWorker::kPowerChangeStart, - service, changeFlags, - (*rdFlags >> 8) & 0xff); - } -} - + // Re-evaluate the lid state + // System should sleep on external display disappearance + // in lid closed operation. + if( clamshellClosed && (true == clamshellDisabled) ) + { + eval_clamshell = true; + } -//****************************************************************************** -// handlePowerChangeDoneForService -// -// Running on PM work loop thread. 
-//****************************************************************************** + clamshellDisabled = false; -void IOPMrootDomain::handlePowerChangeDoneForService( - IOService * service, - uint32_t * rdFlags, - uint32_t newPowerState, - uint32_t changeFlags ) -{ - if (*rdFlags & kServiceFlagTopLevelPCI) - { - pmTracer->tracePCIPowerChange( - PMTraceWorker::kPowerChangeCompleted, - service, changeFlags, - (*rdFlags >> 8) & 0xff); + sendClientClamshellNotification(); } -} - - -//****************************************************************************** -// overridePowerStateForService -// -// Runs on PM work loop thread. -//****************************************************************************** + + /* + * Disable Clamshell (external display appeared) + * We don't bother re-evaluating clamshell state. If the system is awake, + * the lid is probably open. + */ + if (msg & kIOPMDisableClamshell) + { + clamshellDisabled = true; -void IOPMrootDomain::overridePowerStateForService( - IOService * service, - uint32_t * rdFlags, - unsigned long * powerState, - uint32_t changeFlags ) -{ - uint32_t inPowerState = (uint32_t) *powerState; + sendClientClamshellNotification(); + } - if ((service == this) && (inPowerState == ON_STATE) && - (changeFlags & kIOPMSynchronize)) + /* + * Evaluate clamshell and SLEEP if appropiate + */ + if ( eval_clamshell && shouldSleepOnClamshellClosed() ) { - DLOG("sync root domain %u->%u\n", - (uint32_t) getPowerState(), inPowerState); - // Root Domain is in a reduced R-state, and a HID tickle has - // requested a PM tree sync. Begin R-state transition. - if (runStateIndex != kRStateNormal) - { - sleepTimerMaintenance = false; - hibernateNoDefeat = false; - nextRunStateIndex = kRStateNormal; - setProperty( - kIOPMRootDomainRunStateKey, - (unsigned long long) kRStateNormal, 32); - } + // SLEEP! 
+ privateSleepSystem (kIOPMSleepReasonClamshell); } - - if (*rdFlags & kServiceFlagGraphics) + else if ( eval_clamshell ) { - DLOG("graphics device %s %u->%u (flags 0x%x)\n", - service->getName(), (uint32_t) service->getPowerState(), - inPowerState, changeFlags); + evaluatePolicy(kStimulusDarkWakeEvaluate); + } - if (inPowerState == 0) + /* + * Power Button + */ + if (msg & kIOPMPowerButton) + { + if (!wranglerAsleep) { - // Graphics device is powering down, apply limit preventing - // device from powering back up later unless we consent. - - if ((*rdFlags & kServiceFlagNoPowerUp) == 0) - { - *rdFlags |= kServiceFlagNoPowerUp; - DLOG("asserted power limit for %s\n", - service->getName()); + OSString *pbs = OSString::withCString("DisablePowerButtonSleep"); + // Check that power button sleep is enabled + if( pbs ) { + if( kOSBooleanTrue != getProperty(pbs)) + privateSleepSystem (kIOPMSleepReasonPowerButton); } } else - { - uint32_t nextRunStateFlags; - - assert(nextRunStateIndex < kRStateCount); - nextRunStateFlags = gRStateFlags[nextRunStateIndex]; - - // Graphics device is powering up. Release power limit at the - // did-change machine state. - - if (changeFlags & kIOPMSynchronize) - { - if ((runStateFlags & kRStateFlagSuppressGraphics) && - ((nextRunStateFlags & kRStateFlagSuppressGraphics) == 0) && - (changeFlags & kIOPMDomainDidChange)) - { - // Woke up without graphics power, but - // HID event has tickled display wrangler. 
- *rdFlags &= ~kServiceFlagNoPowerUp; - DLOG("removed power limit for %s\n", - service->getName()); - } - } - else if ((runStateFlags & kRStateFlagSuppressGraphics) == 0) - { - *rdFlags &= ~kServiceFlagNoPowerUp; - } - - if (*rdFlags & kServiceFlagNoPowerUp) - { - DLOG("limited %s to power state 0\n", - service->getName()); - *powerState = 0; - } - } + reportUserInput(); } } - -//****************************************************************************** -// setMaintenanceWakeCalendar -// -//****************************************************************************** - -IOReturn IOPMrootDomain::setMaintenanceWakeCalendar( - const IOPMCalendarStruct * calendar ) -{ - OSData * data; - IOReturn ret; - - if (!calendar) - return kIOReturnBadArgument; - - data = OSData::withBytesNoCopy((void *) calendar, sizeof(*calendar)); - if (!data) - return kIOReturnNoMemory; - - ret = setPMSetting(gIOPMSettingMaintenanceWakeCalendarKey, data); - - data->release(); - return ret; -} -#endif /* ROOT_DOMAIN_RUN_STATES */ - - //****************************************************************************** -// sysPowerDownHandler -// -// Receives a notification when the RootDomain changes state. +// evaluatePolicy // -// Allows us to take action on system sleep, power down, and restart after -// applications have received their power change notifications and replied, -// but before drivers have powered down. We perform a vfs sync on power down. +// Evaluate root-domain policy in response to external changes. 
//****************************************************************************** -IOReturn IOPMrootDomain::sysPowerDownHandler( void * target, void * refCon, - UInt32 messageType, IOService * service, - void * messageArgument, vm_size_t argSize ) +void IOPMrootDomain::evaluatePolicy( int stimulus, uint32_t arg ) { - IOReturn ret; - IOPowerStateChangeNotification *params = (IOPowerStateChangeNotification *) messageArgument; - IOPMrootDomain *rootDomain = OSDynamicCast(IOPMrootDomain, service); - - DLOG("sysPowerDownHandler message %x\n", (uint32_t) messageType); - - if(!rootDomain) - return kIOReturnUnsupported; - - switch (messageType) { - case kIOMessageSystemWillSleep: - // Interested applications have been notified of an impending power - // change and have acked (when applicable). - // This is our chance to save whatever state we can before powering - // down. - // We call sync_internal defined in xnu/bsd/vfs/vfs_syscalls.c, - // via callout -#if HIBERNATION - rootDomain->evaluateSystemSleepPolicyEarly(); - if (rootDomain->hibernateMode && !rootDomain->hibernateDisabled) - { - // We will ack within 240 seconds - params->returnValue = 240 * 1000 * 1000; - } - else -#endif - // We will ack within 20 seconds - params->returnValue = 20 * 1000 * 1000; - DLOG("sysPowerDownHandler timeout %d s\n", (int) (params->returnValue / 1000 / 1000)); - if ( ! OSCompareAndSwap( 0, 1, &gSleepOrShutdownPending ) ) - { - // Purposely delay the ack and hope that shutdown occurs quickly. - // Another option is not to schedule the thread and wait for - // ack timeout... 
- AbsoluteTime deadline; - clock_interval_to_deadline( 30, kSecondScale, &deadline ); - thread_call_enter1_delayed( rootDomain->diskSyncCalloutEntry, - (thread_call_param_t)params->powerRef, - deadline ); - } - else - thread_call_enter1(rootDomain->diskSyncCalloutEntry, (thread_call_param_t)params->powerRef); - ret = kIOReturnSuccess; - break; - - case kIOMessageSystemWillPowerOff: - case kIOMessageSystemWillRestart: - ret = kIOReturnUnsupported; - break; + union { + struct { + int idleSleepEnabled : 1; + int idleSleepDisabled : 1; + int displaySleep : 1; + int sleepDelayChanged : 1; + int evaluateDarkWake : 1; + } bit; + uint32_t u32; + } flags; + + DLOG("evaluatePolicy( %d, 0x%x )\n", stimulus, arg); - default: - ret = kIOReturnUnsupported; - break; - } - return ret; -} + ASSERT_GATED(); + flags.u32 = 0; -//****************************************************************************** -// publishSleepWakeUUID -// -// -//****************************************************************************** -void IOPMrootDomain::publishSleepWakeUUID( bool shouldPublish ) -{ - if (shouldPublish) + switch (stimulus) { - if (queuedSleepWakeUUIDString) - { - if (OSCompareAndSwap(/*old*/ true, /*new*/ false, &gSleepWakeUUIDIsSet)) + case kStimulusDisplayWranglerSleep: + if (!wranglerAsleep) { - // Upon wake, it takes some time for userland to invalidate the - // UUID. If another sleep is initiated during that period, force - // a CLEAR message to balance the upcoming SET message. 
- - messageClients( kIOPMMessageSleepWakeUUIDChange, - kIOPMMessageSleepWakeUUIDCleared ); - - DLOG("SleepWake UUID forced clear\n"); + wranglerAsleep = true; + clock_get_uptime(&wranglerSleepTime); + flags.bit.displaySleep = true; } + break; - setProperty(kIOPMSleepWakeUUIDKey, queuedSleepWakeUUIDString); - DLOG("SleepWake UUID published: %s\n", queuedSleepWakeUUIDString->getCStringNoCopy()); - queuedSleepWakeUUIDString->release(); - queuedSleepWakeUUIDString = NULL; - messageClients(kIOPMMessageSleepWakeUUIDChange, - kIOPMMessageSleepWakeUUIDSet); - } - } else { - if (OSCompareAndSwap(/*old*/ true, /*new*/ false, &gSleepWakeUUIDIsSet)) - { - DLOG("SleepWake UUID cleared\n"); - removeProperty(kIOPMSleepWakeUUIDKey); - messageClients(kIOPMMessageSleepWakeUUIDChange, - kIOPMMessageSleepWakeUUIDCleared); - } - } -} - - -//****************************************************************************** -// displayWranglerNotification -// -// Receives a notification when the IODisplayWrangler changes state. -// -// Allows us to take action on display dim/undim. 
-// -// When the display sleeps we: -// - Start the idle sleep timer -// - set the quick spin down timeout -// -// On wake from display sleep: -// - Cancel the idle sleep timer -// - restore the user's chosen spindown timer from the "quick" spin down value -//****************************************************************************** - -IOReturn IOPMrootDomain::displayWranglerNotification( - void * target, void * refCon, - UInt32 messageType, IOService * service, - void * messageArgument, vm_size_t argSize ) -{ -#if !NO_KERNEL_HID - int displayPowerState; - IOPowerStateChangeNotification * params = - (IOPowerStateChangeNotification *) messageArgument; - - if ((messageType != kIOMessageDeviceWillPowerOff) && - (messageType != kIOMessageDeviceHasPoweredOn)) - return kIOReturnUnsupported; + case kStimulusDisplayWranglerWake: + wranglerAsleep = false; + flags.bit.idleSleepDisabled = true; + break; - ASSERT_GATED(); - if (!gRootDomain) - return kIOReturnUnsupported; + case kStimulusAggressivenessChanged: + { + unsigned long minutesToIdleSleep = 0; + unsigned long minutesToDisplayDim = 0; + unsigned long minutesDelta = 0; - displayPowerState = params->stateNumber; - DLOG("DisplayWrangler message 0x%x, new power state %d\n", - (uint32_t) messageType, displayPowerState); + // Fetch latest display and system sleep slider values. + getAggressiveness(kPMMinutesToSleep, &minutesToIdleSleep); + getAggressiveness(kPMMinutesToDim, &minutesToDisplayDim); + DLOG("aggressiveness changed: system %u->%u, display %u\n", + (uint32_t) sleepSlider, + (uint32_t) minutesToIdleSleep, + (uint32_t) minutesToDisplayDim); - switch (messageType) { - case kIOMessageDeviceWillPowerOff: + DLOG("idle time -> %ld secs (ena %d)\n", + idleSeconds, (minutesToIdleSleep != 0)); - // The display wrangler has dropped power because of idle display sleep - // or force system sleep. 
- // - // 4 Display ON - // 3 Display Dim - // 2 Display Sleep - // 1 Not visible to user - // 0 Not visible to user + if (0x7fffffff == minutesToIdleSleep) + minutesToIdleSleep = idleSeconds; - if (gRootDomain->wranglerAsleep || (displayPowerState > 2)) - break; + // How long to wait before sleeping the system once + // the displays turns off is indicated by 'extraSleepDelay'. - // Record the time the display wrangler went to sleep. + if ( minutesToIdleSleep > minutesToDisplayDim ) + minutesDelta = minutesToIdleSleep - minutesToDisplayDim; - gRootDomain->wranglerAsleep = true; - clock_get_uptime(&gRootDomain->wranglerSleepTime); + if ((sleepSlider == 0) && (minutesToIdleSleep != 0)) + flags.bit.idleSleepEnabled = true; - // We start a timer here if the System Sleep timer is greater than the - // Display Sleep timer. We kick off this timer when the display sleeps. - // - // Note that, although Display Dim timings may change adaptively accordingly - // to the user's activity patterns, Display Sleep _always_ occurs at the - // specified interval since last user activity. + if ((sleepSlider != 0) && (minutesToIdleSleep == 0)) + flags.bit.idleSleepDisabled = true; + + if ((minutesDelta != extraSleepDelay) && + !flags.bit.idleSleepEnabled && !flags.bit.idleSleepDisabled) + flags.bit.sleepDelayChanged = true; - if ( gRootDomain->extraSleepDelay ) + if (systemDarkWake && !darkWakeToSleepASAP && + (flags.bit.idleSleepEnabled || flags.bit.idleSleepDisabled)) { - gRootDomain->startIdleSleepTimer(gRootDomain->extraSleepDelay * 60); + // Reconsider decision to remain in dark wake + flags.bit.evaluateDarkWake = true; } - else if ( gRootDomain->sleepSlider ) - { - // Accelerate disk spindown if system sleep and display sleep - // sliders are set to the same value (e.g. both set to 5 min), - // and display is about to go dark. Check that spin down timer - // is non-zero (zero = never spin down) and system sleep is - // not set to never sleep. 
- gRootDomain->setQuickSpinDownTimeout(); - } + sleepSlider = minutesToIdleSleep; + extraSleepDelay = minutesDelta; + } break; + case kStimulusDemandSystemSleep: + changePowerStateWithOverrideTo( SLEEP_STATE ); break; - case kIOMessageDeviceHasPoweredOn: + case kStimulusAllowSystemSleepChanged: + // FIXME: de-compose to change flags. + adjustPowerState(); + break; - // The display wrangler has powered on either because of user activity - // or wake from sleep/doze. + case kStimulusDarkWakeActivityTickle: + if (false == wranglerTickled) + { + uint32_t options = 0; + IOService * pciRoot = 0; - if ( 4 != displayPowerState ) - break; + if (rejectWranglerTickle) + { + DLOG("rejected tickle, type %u capability %x:%x\n", + _systemTransitionType, + _currentCapability, _pendingCapability); + break; + } + + _desiredCapability |= + (kIOPMSystemCapabilityGraphics | + kIOPMSystemCapabilityAudio); + + if ((kSystemTransitionWake == _systemTransitionType) && + !(_pendingCapability & kIOPMSystemCapabilityGraphics) && + !graphicsSuppressed) + { + DLOG("Promoting to full wake\n"); + + // Elevate to full wake while waking up to dark wake. + // PM will hold off notifying the graphics subsystem about + // system wake as late as possible, so if a HID event does + // arrive, we can turn on graphics on this wake cycle, and + // not have to wait till the following cycle. That latency + // can be huge on some systems. However, once any graphics + // suppression has taken effect, it is too late. All other + // graphics devices must be similarly suppressed. But the + // delay till the following cycle should be very short. + + _pendingCapability |= + (kIOPMSystemCapabilityGraphics | + kIOPMSystemCapabilityAudio); + + // Immediately bring up audio and graphics. + pciRoot = pciHostBridgeDriver; + + // Notify clients about full wake. 
+ _systemMessageClientMask = kSystemMessageClientAll; + tellClients(kIOMessageSystemWillPowerOn); + } - gRootDomain->wranglerAsleep = false; - gRootDomain->adjustPowerState(); - gRootDomain->cancelIdleSleepTimer(); + // Unsafe to cancel once graphics was powered. + // If system woke from dark wake, the return to sleep can + // be cancelled. But "awake -> dark -> sleep" transition + // cannot be cancelled. + + if (!CAP_HIGHEST(kIOPMSystemCapabilityGraphics)) { + options |= kIOPMSyncCancelPowerDown; + } - // Change the spindown value back to the user's selection from our - // accelerated setting. - gRootDomain->restoreUserSpinDownTimeout(); + synchronizePowerTree( options, pciRoot ); + wranglerTickled = true; + // IOGraphics doesn't lit the display even though graphics + // is enanbled in kIOMessageSystemCapabilityChange message(radar 9502104) + // So, do an explicit activity tickle + if(wrangler) + wrangler->activityTickle(0,0); + if (logWranglerTickle) + { + AbsoluteTime now; + uint64_t nsec; + + clock_get_uptime(&now); + SUB_ABSOLUTETIME(&now, &systemWakeTime); + absolutetime_to_nanoseconds(now, &nsec); + MSG("HID tickle %u ms\n", + ((int)((nsec) / 1000000ULL))); + logWranglerTickle = false; + } + } break; - default: - break; - } -#endif - return kIOReturnUnsupported; -} - + case kStimulusDarkWakeEntry: + case kStimulusDarkWakeReentry: + // Any system transitions since the last dark wake transition + // will invalid the stimulus. -//****************************************************************************** -// displayWranglerPublished -// -// Receives a notification when the IODisplayWrangler is published. -// When it's published we install a power state change handler. 
-//****************************************************************************** + if (arg == _systemStateGeneration) + { + DLOG("dark wake entry\n"); + systemDarkWake = true; + wranglerAsleep = true; + clock_get_uptime(&wranglerSleepTime); -bool IOPMrootDomain::displayWranglerPublished( - void * target, - void * refCon, - IOService * newService) -{ -#if !NO_KERNEL_HID - if(!gRootDomain) - return false; + // Always accelerate disk spindown while in dark wake, + // even if system does not support/allow sleep. - gRootDomain->wrangler = newService; + cancelIdleSleepTimer(); + setQuickSpinDownTimeout(); + flags.bit.evaluateDarkWake = true; + } + break; - // we found the display wrangler, now install a handler - if( !gRootDomain->wrangler->registerInterest( gIOGeneralInterest, - &displayWranglerNotification, target, 0) ) - { - return false; - } + case kStimulusDarkWakeEvaluate: + if (systemDarkWake) + { + flags.bit.evaluateDarkWake = true; + } +#if !DARK_TO_FULL_EVALUATE_CLAMSHELL + else + { + // Not through kLocalEvalClamshellCommand to avoid loop. + if (clamshellClosed && shouldSleepOnClamshellClosed() && + checkSystemCanSleep(true)) + { + privateSleepSystem( kIOPMSleepReasonClamshell ); + } + } #endif - return true; -} + break; + } /* switch(stimulus) */ -//****************************************************************************** -// batteryPublished -// -// Notification on battery class IOPowerSource appearance -//****************************************************************************** + if (flags.bit.evaluateDarkWake && !wranglerTickled) + { + if (darkWakeToSleepASAP || + (clamshellClosed && !(desktopMode && acAdaptorConnected))) + { + // System currently in dark wake, and no children and + // assertion prevent system sleep. 
-bool IOPMrootDomain::batteryPublished( - void * target, - void * root_domain, - IOService * resourceService ) -{ - // rdar://2936060&4435589 - // All laptops have dimmable LCD displays - // All laptops have batteries - // So if this machine has a battery, publish the fact that the backlight - // supports dimming. - ((IOPMrootDomain *)root_domain)->publishFeature("DisplayDims"); + if (checkSystemCanSleep(true)) + { + if (lowBatteryCondition) + { + lastSleepReason = kIOPMSleepReasonLowPower; + setProperty(kRootDomainSleepReasonKey, kIOPMLowPowerSleepKey); + } + else if (darkWakeMaintenance) + { + lastSleepReason = kIOPMSleepReasonMaintenance; + setProperty(kRootDomainSleepReasonKey, kIOPMMaintenanceSleepKey); + } + changePowerStateWithOverrideTo( SLEEP_STATE ); + } + else + { + // Parked in dark wake, a tickle will return to full wake + rejectWranglerTickle = false; + } + } else // non-maintenance (network) dark wake + { + if (checkSystemCanSleep(true)) + { + // Release power clamp, and wait for children idle. + adjustPowerState(true); + } + else + { + changePowerStateToPriv(ON_STATE); + } + rejectWranglerTickle = false; + } + } - return (true); -} + if (systemDarkWake) + { + // The rest are irrelevant while system is in dark wake. + flags.u32 = 0; + } + if (flags.bit.displaySleep || flags.bit.sleepDelayChanged) + { + bool cancelQuickSpindown = false; -//****************************************************************************** -// adjustPowerState -// -// Some condition that affects our wake/sleep/doze decision has changed. -// -// If the sleep slider is in the off position, we cannot sleep or doze. -// If the enclosure is open, we cannot sleep or doze. -// If the system is still booting, we cannot sleep or doze. -// -// In those circumstances, we prevent sleep and doze by holding power on with -// changePowerStateToPriv(ON). 
-// -// If the above conditions do not exist, and also the sleep timer has expired, -// we allow sleep or doze to occur with either changePowerStateToPriv(SLEEP) or -// changePowerStateToPriv(DOZE) depending on whether or not we already know the -// platform cannot sleep. -// -// In this case, sleep or doze will either occur immediately or at the next time -// that no children are holding the system out of idle sleep via the -// kIOPMPreventIdleSleep flag in their power state arrays. -//****************************************************************************** + if (flags.bit.sleepDelayChanged) + { + DLOG("extra sleep timer changed\n"); + cancelIdleSleepTimer(); + cancelQuickSpindown = true; + } + else + { + DLOG("display sleep\n"); + } -void IOPMrootDomain::adjustPowerState( void ) -{ - DLOG("adjustPowerState " - "PS %u, ASAP %d, SL %ld, AS %d, SB %d, SS %d, UD %d\n", - (uint32_t) getPowerState(), sleepASAP, sleepSlider, - allowSleep, systemBooting, systemShutdown, userDisabledAllSleep); + if (wranglerAsleep && !wranglerSleepIgnored) + { + if ( extraSleepDelay ) + { + // Start a timer here if the System Sleep timer is greater + // than the Display Sleep timer. - ASSERT_GATED(); + startIdleSleepTimer(gRootDomain->extraSleepDelay * 60); + } + else if ( sleepSlider ) + { + // Accelerate disk spindown if system sleep and display sleep + // sliders are set to the same value (e.g. both set to 5 min), + // and display is about to go dark. Check the system sleep is + // not set to never sleep. Disk sleep setting is ignored. 
+ + setQuickSpinDownTimeout(); + cancelQuickSpindown = false; + } + } + + if (cancelQuickSpindown) + restoreUserSpinDownTimeout(); + } - if ( (sleepSlider == 0) - || !allowSleep - || systemBooting - || systemShutdown - || userDisabledAllSleep - || (runStateFlags & kRStateFlagDisableIdleSleep) ) + if (flags.bit.idleSleepEnabled) { - changePowerStateToPriv(ON_STATE); - } else { - if ( sleepASAP ) + DLOG("idle sleep timer enabled\n"); + if (!wrangler) { - /* Convenient place to run any code at idle sleep time - * IOPMrootDomain initiates an idle sleep here - * - * Set last sleep cause accordingly. - */ - lastSleepReason = kIOPMSleepReasonIdle; - setProperty(kRootDomainSleepReasonKey, kIOPMIdleSleepKey); + changePowerStateToPriv(ON_STATE); + if (idleSeconds) + { + startIdleSleepTimer( idleSeconds ); + } + } + else + { + // Start idle sleep timer if wrangler went to sleep + // while system sleep was disabled. Disk spindown is + // accelerated upon timer expiration. + + if (wranglerAsleep) + { + AbsoluteTime now; + uint64_t nanos; + uint32_t minutesSinceDisplaySleep = 0; + uint32_t sleepDelay; + + clock_get_uptime(&now); + if (CMP_ABSOLUTETIME(&now, &wranglerSleepTime) > 0) + { + SUB_ABSOLUTETIME(&now, &wranglerSleepTime); + absolutetime_to_nanoseconds(now, &nanos); + minutesSinceDisplaySleep = nanos / (60000000000ULL); + } + + if (extraSleepDelay > minutesSinceDisplaySleep) + { + sleepDelay = extraSleepDelay - minutesSinceDisplaySleep; + } + else + { + sleepDelay = 1; // 1 min + } - sleepASAP = false; - changePowerStateToPriv(SLEEP_STATE); + startIdleSleepTimer(sleepDelay * 60); + DLOG("display slept %u min, set idle timer to %u min\n", + minutesSinceDisplaySleep, sleepDelay); + } } } + + if (flags.bit.idleSleepDisabled) + { + DLOG("idle sleep timer disabled\n"); + cancelIdleSleepTimer(); + restoreUserSpinDownTimeout(); + adjustPowerState(); + } } +// MARK: - +// MARK: Statistics + +//****************************************************************************** +// 
pmStats +// +//****************************************************************************** + void IOPMrootDomain::pmStatsRecordEvent( int eventIndex, AbsoluteTime timestamp) @@ -4783,6 +5870,8 @@ void IOPMrootDomain::pmStatsRecordApplicationResponse( return; } +// MARK: - +// MARK: PMTraceWorker //****************************************************************************** // TracePoint support @@ -4812,7 +5901,7 @@ IOReturn IOPMrootDomain::callPlatformFunction( statusCode = (((uint64_t)tracePointPCI) << 32) | tracePointPhases; if ((tracePointPhases >> 24) != kIOPMTracePointSystemUp) { - LOG("Sleep failure code 0x%08x 0x%08x\n", + MSG("Sleep failure code 0x%08x 0x%08x\n", tracePointPCI, tracePointPhases); } setProperty(kIOPMSleepWakeFailureCodeKey, statusCode, 64); @@ -4827,7 +5916,20 @@ IOReturn IOPMrootDomain::callPlatformFunction( void IOPMrootDomain::tracePoint( uint8_t point ) { - pmTracer->tracePoint(point); + if (!systemBooting) + pmTracer->tracePoint(point); +} + +void IOPMrootDomain::tracePoint( uint8_t point, uint8_t data ) +{ + if (!systemBooting) + pmTracer->tracePoint(point, data); +} + +void IOPMrootDomain::traceDetail( uint32_t detail ) +{ + if (!systemBooting) + pmTracer->traceDetail( detail ); } //****************************************************************************** @@ -4862,7 +5964,7 @@ PMTraceWorker *PMTraceWorker::tracer(IOPMrootDomain *owner) me->pciMappingLock = IOLockAlloc(); me->tracePhase = kIOPMTracePointSystemUp; me->loginWindowPhase = 0; - me->pciBusyBitMask = 0; + me->traceData32 = 0; return me; } @@ -4872,13 +5974,11 @@ void PMTraceWorker::RTC_TRACE(void) { uint32_t wordA; - wordA = tracePhase; // destined for bits 24-31 - wordA <<= 8; - wordA |= loginWindowPhase; // destined for bits 16-23 - wordA <<= 16; + wordA = (tracePhase << 24) | (loginWindowPhase << 16) | + (traceData8 << 8); - tracePointHandler( tracePointTarget, pciBusyBitMask, wordA ); - DLOG("RTC_TRACE wrote 0x%08x 0x%08x\n", pciBusyBitMask, wordA); + 
tracePointHandler( tracePointTarget, traceData32, wordA ); + _LOG("RTC_TRACE wrote 0x%08x 0x%08x\n", traceData32, wordA); } } @@ -4905,7 +6005,7 @@ int PMTraceWorker::recordTopLevelPCIDevice(IOService * pciDevice) pciDeviceBitMappings->setObject(deviceName)) { index = pciDeviceBitMappings->getCount() - 1; - DLOG("PMTrace PCI array: set object %s => %d\n", + _LOG("PMTrace PCI array: set object %s => %d\n", deviceName->getCStringNoCopy(), index); } if (deviceName) @@ -4932,9 +6032,37 @@ bool PMTraceWorker::serialize(OSSerialize *s) const void PMTraceWorker::tracePoint(uint8_t phase) { + // clear trace detail when phase begins + if (tracePhase != phase) + traceData32 = 0; + + tracePhase = phase; + + DLOG("trace point 0x%02x\n", tracePhase); + RTC_TRACE(); +} + +void PMTraceWorker::tracePoint(uint8_t phase, uint8_t data8) +{ + // clear trace detail when phase begins + if (tracePhase != phase) + traceData32 = 0; + tracePhase = phase; + traceData8 = data8; + + DLOG("trace point 0x%02x 0x%02x\n", tracePhase, traceData8); + RTC_TRACE(); +} + +void PMTraceWorker::traceDetail(uint32_t detail) +{ + if (kIOPMTracePointSleepPriorityClients != tracePhase) + return; + + traceData32 = detail; + DLOG("trace point 0x%02x detail 0x%08x\n", tracePhase, traceData32); - DLOG("IOPMrootDomain: trace point 0x%02x\n", tracePhase); RTC_TRACE(); } @@ -4942,7 +6070,7 @@ void PMTraceWorker::traceLoginWindowPhase(uint8_t phase) { loginWindowPhase = phase; - DLOG("IOPMrootDomain: loginwindow tracepoint 0x%02x\n", loginWindowPhase); + DLOG("loginwindow tracepoint 0x%02x\n", loginWindowPhase); RTC_TRACE(); } @@ -4953,14 +6081,14 @@ void PMTraceWorker::tracePCIPowerChange( uint32_t expectedFlag; // Ignore PCI changes outside of system sleep/wake. 
- if ((kIOPMTracePointSystemSleepDriversPhase != tracePhase) && - (kIOPMTracePointSystemWakeDriversPhase != tracePhase)) + if ((kIOPMTracePointSleepPowerPlaneDrivers != tracePhase) && + (kIOPMTracePointWakePowerPlaneDrivers != tracePhase)) return; // Only record the WillChange transition when going to sleep, // and the DidChange on the way up. changeFlags &= (kIOPMDomainWillChange | kIOPMDomainDidChange); - expectedFlag = (kIOPMTracePointSystemSleepDriversPhase == tracePhase) ? + expectedFlag = (kIOPMTracePointSleepPowerPlaneDrivers == tracePhase) ? kIOPMDomainWillChange : kIOPMDomainDidChange; if (changeFlags != expectedFlag) return; @@ -4972,21 +6100,23 @@ void PMTraceWorker::tracePCIPowerChange( if (kPowerChangeStart == type) { - pciBusyBitMask |= bitMask; - DLOG("PMTrace: Device %s started - bit %2d mask 0x%08x => 0x%08x\n", - service->getName(), bitNum, bitMask, pciBusyBitMask); + traceData32 |= bitMask; + _LOG("PMTrace: Device %s started - bit %2d mask 0x%08x => 0x%08x\n", + service->getName(), bitNum, bitMask, traceData32); } else { - pciBusyBitMask &= ~bitMask; - DLOG("PMTrace: Device %s finished - bit %2d mask 0x%08x => 0x%08x\n", - service->getName(), bitNum, bitMask, pciBusyBitMask); + traceData32 &= ~bitMask; + _LOG("PMTrace: Device %s finished - bit %2d mask 0x%08x => 0x%08x\n", + service->getName(), bitNum, bitMask, traceData32); } - RTC_TRACE(); + RTC_TRACE(); } } +// MARK: - +// MARK: PMHaltWorker //****************************************************************************** // PMHaltWorker Class @@ -5138,9 +6268,9 @@ void PMHaltWorker::work( PMHaltWorker * me ) deltaTime = computeDeltaTimeMS(&startTime); if ((deltaTime > kPMHaltTimeoutMS) || timeout || - (gIOKitDebug & (kIOLogDebugPower | kIOLogPMRootDomain))) + (gIOKitDebug & kIOLogPMRootDomain)) { - KLOG("%s driver %s (%p) took %u ms\n", + LOG("%s driver %s (%p) took %u ms\n", (gPMHaltEvent == kIOMessageSystemWillPowerOff) ? 
"PowerOff" : "Restart", service->getName(), service, @@ -5173,7 +6303,7 @@ void PMHaltWorker::checkTimeout( PMHaltWorker * me, AbsoluteTime * now ) if (nano > 3000000000ULL) { me->timeout = true; - LOG("%s still waiting on %s\n", + MSG("%s still waiting on %s\n", (gPMHaltEvent == kIOMessageSystemWillPowerOff) ? "PowerOff" : "Restart", me->service->getName()); @@ -5406,18 +6536,90 @@ notifySystemShutdown( IOService * root, unsigned long event ) } IOLockUnlock(gPMHaltLock); - // Release all workers + // Release all workers + + for (unsigned int i = 0; i < numWorkers; i++) + { + if (workers[i]) + workers[i]->release(); + // worker also retained by it's own thread + } + +done: + DLOG("%s done\n", __FUNCTION__); + return; +} + +//********************************************************************************* +// Sleep/Wake logging +// +//********************************************************************************* + +IOMemoryDescriptor *IOPMrootDomain::getPMTraceMemoryDescriptor(void) +{ + if (timeline) + return timeline->getPMTraceMemoryDescriptor(); + else + return NULL; +} + +// Forwards external reports of detailed events to IOPMTimeline +IOReturn IOPMrootDomain::recordPMEvent(PMEventDetails *details) +{ + if (timeline && details) { + + IOReturn rc; + + // Record a detailed driver power change event, or... 
+ if(details->eventClassifier == kIOPMEventClassDriverEvent) { + rc = timeline->recordDetailedPowerEvent( details ); + } + + // Record a system power management event + else if(details->eventClassifier == kIOPMEventClassSystemEvent) { + rc = timeline->recordSystemPowerEvent( details ); + } + else { + return kIOReturnBadArgument; + } + + // If we get to record this message, then we've reached the + // end of another successful Sleep --> Wake cycle + // At this point, we pat ourselves in the back and allow + // our Sleep --> Wake UUID to be published + if(details->eventType == kIOPMEventTypeWakeDone) { + timeline->setSleepCycleInProgressFlag(false); + } + +/* + // Check if its time to clear the timeline buffer + if(getProperty(kIOPMSleepWakeUUIDKey) + && timeline->isSleepCycleInProgress() == false + && timeline->getNumEventsLoggedThisPeriod() > 500) { + + // Clear the old UUID + if(pmPowerStateQueue) { + pmPowerStateQueue->submitPowerEvent(kPowerEventPublishSleepWakeUUID, (void *)false ); + } + } +*/ + return rc; + } + else + return kIOReturnNotReady; +} + +IOReturn IOPMrootDomain::recordAndReleasePMEvent(PMEventDetails *details) +{ + IOReturn ret = kIOReturnBadArgument; - for (unsigned int i = 0; i < numWorkers; i++) - { - if (workers[i]) - workers[i]->release(); - // worker also retained by it's own thread - } + if (details) + { + ret = recordPMEvent(details); + details->release(); + } -done: - DLOG("%s done\n", __FUNCTION__); - return; + return ret; } /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -5493,16 +6695,30 @@ bool IOPMrootDomain::serializeProperties( OSSerialize * s ) const /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ +// MARK: - +// MARK: PMSettingHandle -#undef super -#define super OSObject -OSDefineMetaClassAndFinalStructors(PMSettingObject, OSObject) +OSDefineMetaClassAndStructors( PMSettingHandle, OSObject ) -void PMSettingObject::setPMSetting(const OSSymbol *type, OSObject *obj) +void 
PMSettingHandle::free( void ) { - (*func)(target, type, obj, refcon); + if (pmso) + { + pmso->clientHandleFreed(); + pmso->release(); + pmso = 0; + } + + OSObject::free(); } +// MARK: - +// MARK: PMSettingObject + +#undef super +#define super OSObject +OSDefineMetaClassAndFinalStructors( PMSettingObject, OSObject ) + /* * Static constructor/initializer for PMSettingObject */ @@ -5512,92 +6728,445 @@ PMSettingObject *PMSettingObject::pmSettingObject( OSObject *target_arg, uintptr_t refcon_arg, uint32_t supportedPowerSources, - const OSSymbol * settings[]) + const OSSymbol * settings[], + OSObject **handle_obj) { - uint32_t objCount = 0; - PMSettingObject *pmso; + uint32_t settingCount = 0; + PMSettingObject *pmso = 0; + PMSettingHandle *pmsh = 0; - if( !parent_arg || !handler_arg || !settings ) return NULL; + if ( !parent_arg || !handler_arg || !settings || !handle_obj ) + return NULL; - // count OSSymbol entries in NULL terminated settings array - while( settings[objCount] ) { - objCount++; + // count OSSymbol entries in NULL terminated settings array + while (settings[settingCount]) { + settingCount++; } - if(0 == objCount) return NULL; + if (0 == settingCount) + return NULL; pmso = new PMSettingObject; - if(!pmso || !pmso->init()) return NULL; - - pmso->parent = parent_arg; - pmso->func = handler_arg; - pmso->target = target_arg; - pmso->refcon = refcon_arg; - pmso->releaseAtCount = objCount + 1; // release when it has count+1 retains - - pmso->publishedFeatureID = (uint32_t *)IOMalloc(sizeof(uint32_t)*objCount); - if(pmso->publishedFeatureID) { - for(unsigned int i=0; iinit()) + goto fail; + + pmsh = new PMSettingHandle; + if (!pmsh || !pmsh->init()) + goto fail; + + queue_init(&pmso->calloutQueue); + pmso->parent = parent_arg; + pmso->func = handler_arg; + pmso->target = target_arg; + pmso->refcon = refcon_arg; + pmso->settingCount = settingCount; + + pmso->retain(); // handle holds a retain on pmso + pmsh->pmso = pmso; + pmso->pmsh = pmsh; + + 
pmso->publishedFeatureID = (uint32_t *)IOMalloc(sizeof(uint32_t)*settingCount); + if (pmso->publishedFeatureID) { + for (unsigned int i=0; ipublishFeature( settings[i]->getCStringNoCopy(), + parent_arg->publishFeature( settings[i]->getCStringNoCopy(), supportedPowerSources, &pmso->publishedFeatureID[i] ); } } - + + *handle_obj = pmsh; return pmso; + +fail: + if (pmso) pmso->release(); + if (pmsh) pmsh->release(); + return NULL; } -void PMSettingObject::free(void) +void PMSettingObject::free( void ) { - OSCollectionIterator *settings_iter; - OSSymbol *sym; - OSArray *arr; - int arr_idx; - int i; - int objCount = releaseAtCount - 1; - - if(publishedFeatureID) { - for(i=0; iremovePublishedFeature( publishedFeatureID[i] ); } } + + IOFree(publishedFeatureID, sizeof(uint32_t) * settingCount); + } + + super::free(); +} + +void PMSettingObject::dispatchPMSetting( const OSSymbol * type, OSObject * object ) +{ + (*func)(target, type, object, refcon); +} + +void PMSettingObject::clientHandleFreed( void ) +{ + parent->deregisterPMSettingObject(this); +} + +// MARK: - +// MARK: IOPMTimeline + +#undef super +#define super OSObject + +//********************************************************************************* +//********************************************************************************* +//********************************************************************************* + +IOPMTimeline *IOPMTimeline::timeline(IOPMrootDomain *root_domain) +{ + IOPMTimeline *myself; + + if (!root_domain) + return NULL; - IOFree(publishedFeatureID, sizeof(uint32_t) * objCount); + myself = new IOPMTimeline; + + if (myself) { + myself->owner = root_domain; + myself->init(); } - - IORecursiveLockLock(parent->settingsCtrlLock); + + return myself; +} + +bool IOPMTimeline::init(void) +{ + if (!super::init()) { + return false; + } + + logLock = IOLockAlloc(); - // Search each PM settings array in the kernel. 
- settings_iter = OSCollectionIterator::withCollection(parent->settingsCallbacks); - if(settings_iter) + // Fresh timeline, no events logged yet + this->numEventsLoggedThisPeriod = 0; + this->sleepCycleInProgress = false; + + //this->setEventsRecordingLevel(1); // TODO + this->setEventsTrackedCount(kIOPMDefaultSystemEventsTracked); + + return true; +} + +void IOPMTimeline::free(void) +{ + if (pmTraceMemoryDescriptor) { + pmTraceMemoryDescriptor->release(); + pmTraceMemoryDescriptor = NULL; + } + + IOLockFree(logLock); + + super::free(); +} + +IOMemoryDescriptor *IOPMTimeline::getPMTraceMemoryDescriptor() +{ + return pmTraceMemoryDescriptor; +} + +//********************************************************************************* +//********************************************************************************* +//********************************************************************************* + +bool IOPMTimeline::setProperties(OSDictionary *d) +{ + OSNumber *n = NULL; + OSBoolean *b = NULL; + bool changed = false; + + /* Changes size of detailed events buffer */ + n = (OSNumber *)d->getObject(kIOPMTimelineSystemNumberTrackedKey); + if (OSDynamicCast(OSNumber, n)) { - while(( sym = OSDynamicCast(OSSymbol, settings_iter->getNextObject()) )) - { - arr = (OSArray *)parent->settingsCallbacks->getObject(sym); - arr_idx = arr->getNextIndexOfObject(this, 0); - if(-1 != arr_idx) { - // 'this' was found in the array; remove it - arr->removeObject(arr_idx); - } - } + changed = true; + this->setEventsTrackedCount(n->unsigned32BitValue()); + } + + + /* enables or disables system events */ + b = (OSBoolean *)d->getObject(kIOPMTimelineEnabledKey); + if (b) + { + changed = true; + this->setEventsRecordingLevel((int)(kOSBooleanTrue == b)); + } + + return changed; +} + +//********************************************************************************* +//********************************************************************************* 
+//********************************************************************************* + +OSDictionary *IOPMTimeline::copyInfoDictionary(void) +{ + OSDictionary *out = OSDictionary::withCapacity(3); + OSNumber *n = NULL; + + if (!out || !hdr) + return NULL; + + n = OSNumber::withNumber(hdr->sizeEntries, 32); + out->setObject(kIOPMTimelineSystemNumberTrackedKey, n); + n->release(); + + n = OSNumber::withNumber(hdr->sizeBytes, 32); + out->setObject(kIOPMTimelineSystemBufferSizeKey, n); + n->release(); + + // bool + out->setObject(kIOPMTimelineEnabledKey, eventsRecordingLevel ? kOSBooleanTrue : kOSBooleanFalse); + + return out; +} + +//********************************************************************************* +//********************************************************************************* +//********************************************************************************* + +/* IOPMTimeline::recordSystemPowerEvent() + * + * Expected "type" arguments are listed in IOPMPrivate.h under enum "SystemEventTypes" + * Type arguments include "system events", and "Intermediate events" + * + * - System Events have paired "start" and "stop" events. + * - A start event shall be followed by a stop event. + * - Any number of Intermediate Events may fall between the + * start and stop events. + * - Intermediate events are meaningless outside the bounds of a system event's + * start & stop routines. + * - It's invalid to record a Start event without a following Stop event; e.g. two + * Start events without an intervening Stop event is invalid. + * + * Buffer invariants + * - The first recorded system event shall be preceded by an entry with type == 0 + * - IOPMTimeline may choose not to record intermediate events while there's not + * a system event in process. 
+ */ +IOReturn IOPMTimeline::recordSystemPowerEvent( PMEventDetails *details ) +{ + static bool wakeDonePending = true; + IOPMSystemEventRecord *record_to = NULL; + OSString *swUUIDKey = NULL; + uint32_t useIndex = 0; + + if (!details) + return kIOReturnBadArgument; + + if (!traceBuffer) + return kIOReturnNotReady; + + if (details->eventType == kIOPMEventTypeWakeDone) + { + if(!wakeDonePending) + return kIOReturnBadArgument; + } + + IOLockLock(logLock); - settings_iter->release(); + if (details->eventType == kIOPMEventTypeWake) { + wakeDonePending = true; + } else if (details->eventType == kIOPMEventTypeWakeDone) { + wakeDonePending = false; } + + systemState = details->eventType; + + useIndex = _atomicIndexIncrement(&hdr->index, hdr->sizeEntries); - IORecursiveLockUnlock(parent->settingsCtrlLock); + // The entry immediately after the latest entry (and thus + // immediately before the first entry) shall have a type 0. + if (useIndex + 1 >= hdr->sizeEntries) { + traceBuffer[useIndex + 1].eventType = 0; + } else { + traceBuffer[0].eventType = 0; + } - super::free(); + record_to = &traceBuffer[useIndex]; + bzero(record_to, sizeof(IOPMSystemEventRecord)); + + /*****/ + record_to->eventType = details->eventType; + record_to->eventReason = details->reason; + record_to->eventResult = details->result; + pmEventTimeStamp(&record_to->timestamp); + + // If caller doesn't provide a UUID, we'll use the UUID that's posted + // on IOPMrootDomain under key kIOPMSleepWakeUUIDKey + if (!details->uuid) { + swUUIDKey = OSDynamicCast(OSString, owner->copyProperty(kIOPMSleepWakeUUIDKey)); + + if (swUUIDKey) + details->uuid = swUUIDKey->getCStringNoCopy(); + } + + if (details->uuid) + strncpy(record_to->uuid, details->uuid, kMaxPMStringLength); + + if (swUUIDKey) + swUUIDKey->release(); + + numEventsLoggedThisPeriod++; + /*****/ + + IOLockUnlock(logLock); + + return kIOReturnSuccess; + +} + +//********************************************************************************* 
+//********************************************************************************* +//********************************************************************************* + +IOReturn IOPMTimeline::recordDetailedPowerEvent( PMEventDetails *details ) +{ + IOPMSystemEventRecord *record_to = NULL; + uint32_t useIndex; + + if (!details->eventType || !details->ownerName) + return kIOReturnBadArgument; + + IOLockLock(logLock); + + useIndex = _atomicIndexIncrement(&hdr->index, hdr->sizeEntries); + + record_to = (IOPMSystemEventRecord *)&traceBuffer[useIndex]; + bzero(record_to, sizeof(IOPMSystemEventRecord)); + + /*****/ + record_to->eventType = details->eventType; + if (details->ownerName && (strlen(details->ownerName) > 1)) { + strlcpy( record_to->ownerName, + details->ownerName, + sizeof(record_to->ownerName)); + } + + record_to->ownerDisambiguateID = details->ownerUnique; + + if (details->interestName && (strlen(details->interestName) > 1)) { + strlcpy(record_to->interestName, + details->interestName, + sizeof(record_to->interestName)); + } + + record_to->oldState = details->oldState; + record_to->newState = details->newState; + record_to->eventResult = details->result; + record_to->elapsedTimeUS = details->elapsedTimeUS; + pmEventTimeStamp(&record_to->timestamp); + + numEventsLoggedThisPeriod++; + /*****/ + + IOLockUnlock(logLock); + return kIOReturnSuccess; +} + +uint32_t IOPMTimeline::getNumEventsLoggedThisPeriod() { + return this->numEventsLoggedThisPeriod; +} + +void IOPMTimeline::setNumEventsLoggedThisPeriod(uint32_t newCount) { + this->numEventsLoggedThisPeriod = newCount; +} + +bool IOPMTimeline::isSleepCycleInProgress() { + return this->sleepCycleInProgress; +} + +void IOPMTimeline::setSleepCycleInProgressFlag(bool flag) { + this->sleepCycleInProgress = flag; +} +//********************************************************************************* +//********************************************************************************* 
+//********************************************************************************* + +void IOPMTimeline::setEventsTrackedCount(uint32_t newTracked) +{ + size_t make_buf_size = 0; + + make_buf_size = sizeof(IOPMTraceBufferHeader) + (newTracked * sizeof(IOPMSystemEventRecord)); + + IOLockLock(logLock); + + if (pmTraceMemoryDescriptor) { + pmTraceMemoryDescriptor->release(); + pmTraceMemoryDescriptor = NULL; + } + + hdr = NULL; + traceBuffer = NULL; + + if (0 == newTracked) + { + IOLog("IOPMrootDomain -> erased buffer.\n"); + goto exit; + } + + pmTraceMemoryDescriptor = IOBufferMemoryDescriptor::withOptions( + kIOMemoryKernelUserShared | kIODirectionIn, make_buf_size); + + if (!pmTraceMemoryDescriptor) + { + IOLog("IOPMRootDomain -> IOBufferMemoryDescriptor(%d) returns NULL\n", (int)make_buf_size); + goto exit; + } + + pmTraceMemoryDescriptor->prepare(kIODirectionIn); + + // Header occupies the first sizeof(IOPMTraceBufferHeader) bytes + hdr = (IOPMTraceBufferHeader *)pmTraceMemoryDescriptor->getBytesNoCopy(); + + // Recorded events occupy the remaining bulk of the buffer + traceBuffer = (IOPMSystemEventRecord *)((uint8_t *)hdr + sizeof(IOPMTraceBufferHeader)); + + bzero(hdr, make_buf_size); + + hdr->sizeBytes = make_buf_size; + hdr->sizeEntries = newTracked; + + IOLog("IOPMRootDomain -> IOBufferMemoryDescriptor(%d) returns bufferMB with address 0x%08x\n", (int)make_buf_size, (unsigned int)(uintptr_t)traceBuffer); + +exit: + IOLockUnlock(logLock); +} + +//********************************************************************************* +//********************************************************************************* +//********************************************************************************* + +void IOPMTimeline::setEventsRecordingLevel(uint32_t eventsTrackedBits) +{ + + // TODO + + return; + } -void PMSettingObject::taggedRelease(const void *tag, const int when) const -{ - // We have n+1 retains - 1 per array that this PMSettingObject is a member - // 
of, and 1 retain to ourself. When we get a release with n+1 retains - // remaining, we go ahead and free ourselves, cleaning up array pointers - // in free(); +/* static helper to IOPMTimeline + */ +uint32_t IOPMTimeline::_atomicIndexIncrement(uint32_t *index, uint32_t limit) +{ + uint32_t was_index; + uint32_t inc_index; + + if(!index) + return NULL; + + do { + was_index = *index; + inc_index = (was_index+1)%limit; + } while (!OSCompareAndSwap(was_index, inc_index, index)); - super::taggedRelease(tag, releaseAtCount); + return inc_index; } // MARK: - @@ -5676,8 +7245,19 @@ void PMAssertionsTracker::tabulate(void) if ((assertionsKernel != oldKernel) || (assertionsCombined != oldCombined)) - { - owner->messageClients(kIOPMMessageDriverAssertionsChanged); + { + owner->messageClients(kIOPMMessageDriverAssertionsChanged); + + if (((assertionsCombined & kIOPMDriverAssertionPreventDisplaySleepBit) != 0) + && ((oldCombined & kIOPMDriverAssertionPreventDisplaySleepBit) == 0)) + { + /* We react to a new PreventDisplaySleep assertion by waking the display + * with an activityTickle + */ + owner->evaluatePolicy(kStimulusDarkWakeActivityTickle); + } else { + owner->evaluatePolicy(kStimulusDarkWakeEvaluate); + } } } @@ -5780,18 +7360,14 @@ IOReturn PMAssertionsTracker::createAssertion( PMAssertStruct track; // Warning: trillions and trillions of created assertions may overflow the unique ID. -#ifdef __ppc__ - track.id = issuingUniqueID++; // FIXME: need OSIncrementAtomic64() for ppc -#else track.id = OSIncrementAtomic64((SInt64*) &issuingUniqueID); -#endif track.level = level; track.assertionBits = which; track.ownerString = whoItIs ? 
OSSymbol::withCString(whoItIs) : 0; track.ownerService = serviceID; track.modifiedTime = 0; pmEventTimeStamp(&track.createdTime); - + dataStore = OSData::withBytes(&track, sizeof(PMAssertStruct)); if (!dataStore) { @@ -6010,6 +7586,7 @@ IOPMDriverAssertionLevel PMAssertionsTracker::getAssertionLevel( //********************************************************************************* //********************************************************************************* + static void pmEventTimeStamp(uint64_t *recordTS) { clock_sec_t tsec; @@ -6031,37 +7608,38 @@ static void pmEventTimeStamp(uint64_t *recordTS) return; } -/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ +// MARK: - +// MARK: IORootParent -#undef super -#define super IOService +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ OSDefineMetaClassAndFinalStructors(IORootParent, IOService) -// This array exactly parallels the state array for the root domain. -// Power state changes initiated by a device can be vetoed by a client of the device, and -// power state changes initiated by the parent of a device cannot be vetoed by a client of the device, -// so when the root domain wants a power state change that cannot be vetoed (e.g. demand sleep), it asks -// its parent to make the change. That is the reason for this complexity. +// The reason that root domain needs a root parent is to facilitate demand +// sleep, since a power change from the root parent cannot be vetoed. +// +// The above statement is no longer true since root domain now performs +// demand sleep using overrides. But root parent remains to avoid changing +// the power tree stacking. Root parent is parked at the max power state. 
+ -static IOPMPowerState patriarchPowerStates[NUM_POWER_STATES] = +static IOPMPowerState patriarchPowerStates[2] = { - {1,0,0,0,0,0,0,0,0,0,0,0}, // off (not used) - {1,0,RESTART_POWER,0,0,0,0,0,0,0,0,0}, // reset (not used) - {1,0,SLEEP_POWER,0,0,0,0,0,0,0,0,0}, // sleep - {1,0,DOZE_POWER,0,0,0,0,0,0,0,0,0}, // doze - {1,0,ON_POWER,0,0,0,0,0,0,0,0,0}, // running + {1,0,ON_POWER,0,0,0,0,0,0,0,0,0}, + {1,0,ON_POWER,0,0,0,0,0,0,0,0,0}, }; +void IORootParent::initialize( void ) +{ +} + bool IORootParent::start( IOService * nub ) { - mostRecentChange = ON_STATE; - super::start(nub); + IOService::start(nub); attachToParent( getRegistryRoot(), gIOPowerPlane ); PMinit(); - registerPowerDriver(this, patriarchPowerStates, NUM_POWER_STATES); - wakeSystem(); - powerOverrideOnPriv(); + registerPowerDriver(this, patriarchPowerStates, 2); + makeUsable(); return true; } @@ -6075,30 +7653,22 @@ void IORootParent::restartSystem( void ) void IORootParent::sleepSystem( void ) { - mostRecentChange = SLEEP_STATE; - changePowerStateToPriv(SLEEP_STATE); } void IORootParent::dozeSystem( void ) { - mostRecentChange = DOZE_STATE; - changePowerStateToPriv(DOZE_STATE); } -// Called in demand sleep when sleep discovered to be impossible after actually attaining that state. -// This brings the parent to doze, which allows the root to step up from sleep to doze. - -// In idle sleep, do nothing because the parent is still on and the root can freely change state. 
- void IORootParent::sleepToDoze( void ) { - if ( mostRecentChange == SLEEP_STATE ) { - changePowerStateToPriv(DOZE_STATE); - } } void IORootParent::wakeSystem( void ) { - mostRecentChange = ON_STATE; - changePowerStateToPriv(ON_STATE); } + +OSObject * IORootParent::copyProperty( const char * aKey) const +{ + return (IOService::copyProperty(aKey)); +} + diff --git a/iokit/Kernel/IOPlatformExpert.cpp b/iokit/Kernel/IOPlatformExpert.cpp index f00ffd725..7800babda 100644 --- a/iokit/Kernel/IOPlatformExpert.cpp +++ b/iokit/Kernel/IOPlatformExpert.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2008 Apple Inc. All rights reserved. + * Copyright (c) 1998-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -44,6 +44,7 @@ #include #include +#include extern "C" { #include @@ -77,6 +78,7 @@ OSMetaClassDefineReservedUnused(IOPlatformExpert, 11); static IOPlatformExpert * gIOPlatform; static OSDictionary * gIOInterruptControllers; static IOLock * gIOInterruptControllersLock; +static IODTNVRAM *gIOOptionsEntry; OSSymbol * gPlatformInterruptControllerName; @@ -258,7 +260,7 @@ int IOPlatformExpert::haltRestart(unsigned int type) // On ARM kPEPanicRestartCPU is supported in the drivers if (type == kPEPanicRestartCPU) type = kPERestartCPU; - + if (PE_halt_restart) return (*PE_halt_restart)(type); else return -1; } @@ -371,6 +373,8 @@ PMLog(const char *who, unsigned long event, { UInt32 debugFlags = gIOKitDebug; UInt32 traceFlags = gIOKitTrace; + uintptr_t name = 0; + UInt32 i = 0; if (debugFlags & kIOLogPower) { @@ -402,8 +406,11 @@ PMLog(const char *who, unsigned long event, code |= DBG_FUNC_START - sgnevent; } - // Record the timestamp, wish I had a this pointer - IOTimeStampConstant(code, (uintptr_t) who, event, param1, param2); + // Get first 8 characters of the name + while ( i < sizeof(uintptr_t) && who[i] != 0) + { ((char *)&name)[sizeof(uintptr_t)-i-1]=who[i]; i++; } + // Record the timestamp. 
+ IOTimeStampConstant(code, name, event, param1, param2); } } } @@ -779,12 +786,13 @@ int PEGetPlatformEpoch(void) int PEHaltRestart(unsigned int type) { - IOPMrootDomain *pmRootDomain = IOService::getPMRootDomain(); + IOPMrootDomain *pmRootDomain; AbsoluteTime deadline; thread_call_t shutdown_hang; if(type == kPEHaltCPU || type == kPERestartCPU || type == kPEUPSDelayHaltCPU) { + pmRootDomain = IOService::getPMRootDomain(); /* Notify IOKit PM clients of shutdown/restart Clients subscribe to this message with a call to IOService::registerInterest() @@ -820,6 +828,115 @@ UInt32 PESavePanicInfo(UInt8 *buffer, UInt32 length) else return 0; } + + +inline static int init_gIOOptionsEntry(void) +{ + IORegistryEntry *entry; + void *nvram_entry; + volatile void **options; + int ret = -1; + + if (gIOOptionsEntry) + return 0; + + entry = IORegistryEntry::fromPath( "/options", gIODTPlane ); + if (!entry) + return -1; + + nvram_entry = (void *) OSDynamicCast(IODTNVRAM, entry); + if (!nvram_entry) + goto release; + + options = (volatile void **) &gIOOptionsEntry; + if (!OSCompareAndSwapPtr(NULL, nvram_entry, options)) { + ret = 0; + goto release; + } + + return 0; + +release: + entry->release(); + return ret; + +} + +/* pass in a NULL value if you just want to figure out the len */ +boolean_t PEReadNVRAMProperty(const char *symbol, void *value, + unsigned int *len) +{ + OSObject *obj; + OSData *data; + unsigned int vlen; + + if (!symbol || !len) + goto err; + + if (init_gIOOptionsEntry() < 0) + goto err; + + vlen = *len; + *len = 0; + + obj = gIOOptionsEntry->getProperty(symbol); + if (!obj) + goto err; + + /* convert to data */ + data = OSDynamicCast(OSData, obj); + if (!data) + goto err; + + *len = data->getLength(); + vlen = min(vlen, *len); + if (vlen) + memcpy((void *) value, data->getBytesNoCopy(), vlen); + + return TRUE; + +err: + return FALSE; +} + + +boolean_t PEWriteNVRAMProperty(const char *symbol, const void *value, + const unsigned int len) +{ + const OSSymbol *sym; 
+ OSData *data; + bool ret = false; + + if (!symbol || !value || !len) + goto err; + + if (init_gIOOptionsEntry() < 0) + goto err; + + sym = OSSymbol::withCStringNoCopy(symbol); + if (!sym) + goto err; + + data = OSData::withBytes((void *) value, len); + if (!data) + goto sym_done; + + ret = gIOOptionsEntry->setProperty(sym, data); + data->release(); + +sym_done: + sym->release(); + + if (ret == true) { + gIOOptionsEntry->sync(); + return TRUE; + } + +err: + return FALSE; +} + + long PEGetGMTTimeOfDay(void) { long result = 0; diff --git a/iokit/Kernel/IORegistryEntry.cpp b/iokit/Kernel/IORegistryEntry.cpp index e41a94bdc..a299d3fa1 100644 --- a/iokit/Kernel/IORegistryEntry.cpp +++ b/iokit/Kernel/IORegistryEntry.cpp @@ -278,7 +278,12 @@ bool IORegistryEntry::init( OSDictionary * dict ) bzero(reserved, sizeof(ExpansionData)); } if( dict) { - dict->retain(); + if (OSCollection::kImmutable & dict->setOptions(0, 0)) { + dict = (OSDictionary *) dict->copyCollection(); + if (!dict) + return (false); + } else + dict->retain(); if( fPropertyTable) fPropertyTable->release(); fPropertyTable = dict; diff --git a/iokit/Kernel/IOService.cpp b/iokit/Kernel/IOService.cpp index 1a28626cf..6ef0b3413 100644 --- a/iokit/Kernel/IOService.cpp +++ b/iokit/Kernel/IOService.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2008 Apple Inc. All rights reserved. + * Copyright (c) 1998-2009 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -50,10 +50,13 @@ #include #include +#include + #define LOG kprintf //#define LOG IOLog #include "IOServicePrivate.h" +#include "IOKitKernelInternal.h" // take lockForArbitration before LOCKNOTIFY @@ -106,11 +109,16 @@ const OSSymbol * gIOKitDebugKey; const OSSymbol * gIOCommandPoolSizeKey; +const OSSymbol * gIOConsoleLockedKey; const OSSymbol * gIOConsoleUsersKey; const OSSymbol * gIOConsoleSessionUIDKey; +const OSSymbol * gIOConsoleSessionAuditIDKey; const OSSymbol * gIOConsoleUsersSeedKey; -const OSSymbol * gIOConsoleSessionOnConsoleKey; -const OSSymbol * gIOConsoleSessionSecureInputPIDKey; +const OSSymbol * gIOConsoleSessionOnConsoleKey; +const OSSymbol * gIOConsoleSessionSecureInputPIDKey; +const OSSymbol * gIOConsoleSessionScreenLockedTimeKey; + +static clock_sec_t gIOConsoleLockTime; static int gIOResourceGenerationCount; @@ -125,6 +133,7 @@ const OSSymbol * gIOGeneralInterest; const OSSymbol * gIOBusyInterest; const OSSymbol * gIOAppPowerStateInterest; const OSSymbol * gIOPriorityPowerStateInterest; +const OSSymbol * gIOConsoleSecurityInterest; static OSDictionary * gNotifications; static IORecursiveLock * gNotificationLock; @@ -159,6 +168,9 @@ const OSSymbol * gIOPlatformActiveActionKey; const OSSymbol * gIOPlatformFunctionHandlerSet; +static IOLock * gIOConsoleUsersLock; +static thread_call_t gIOConsoleLockCallout; + /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ #define LOCKREADNOTIFY() \ @@ -213,14 +225,6 @@ bool IOService::isInactive( void ) const { return( 0 != (kIOServiceInactiveState & getState())); } -/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ - -#define IOServiceTrace(csc, a, b, c, d) { \ - if(kIOTraceIOService & gIOKitTrace) { \ - KERNEL_DEBUG_CONSTANT(IODBG_IOSERVICE(csc), a, b, c, d, 0); \ - } \ -} - /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ #if defined(__i386__) || defined(__x86_64__) @@ -296,6 +300,7 
@@ void IOService::initialize( void ) gIOBusyInterest = OSSymbol::withCStringNoCopy( kIOBusyInterest ); gIOAppPowerStateInterest = OSSymbol::withCStringNoCopy( kIOAppPowerStateInterest ); gIOPriorityPowerStateInterest = OSSymbol::withCStringNoCopy( kIOPriorityPowerStateInterest ); + gIOConsoleSecurityInterest = OSSymbol::withCStringNoCopy( kIOConsoleSecurityInterest ); gNotifications = OSDictionary::withCapacity( 1 ); gIOPublishNotification = OSSymbol::withCStringNoCopy( @@ -310,13 +315,18 @@ void IOService::initialize( void ) kIOTerminatedNotification ); gIOServiceKey = OSSymbol::withCStringNoCopy( kIOServiceClass); + gIOConsoleLockedKey = OSSymbol::withCStringNoCopy( kIOConsoleLockedKey); gIOConsoleUsersKey = OSSymbol::withCStringNoCopy( kIOConsoleUsersKey); gIOConsoleSessionUIDKey = OSSymbol::withCStringNoCopy( kIOConsoleSessionUIDKey); - gIOConsoleUsersSeedKey = OSSymbol::withCStringNoCopy( kIOConsoleUsersSeedKey); - gIOConsoleSessionOnConsoleKey = OSSymbol::withCStringNoCopy( kIOConsoleSessionOnConsoleKey); - gIOConsoleSessionSecureInputPIDKey = OSSymbol::withCStringNoCopy( kIOConsoleSessionSecureInputPIDKey); - gIOConsoleUsersSeedValue = OSData::withBytesNoCopy(&gIOConsoleUsersSeed, sizeof(gIOConsoleUsersSeed)); + gIOConsoleSessionAuditIDKey = OSSymbol::withCStringNoCopy( kIOConsoleSessionAuditIDKey); + + gIOConsoleUsersSeedKey = OSSymbol::withCStringNoCopy(kIOConsoleUsersSeedKey); + gIOConsoleSessionOnConsoleKey = OSSymbol::withCStringNoCopy(kIOConsoleSessionOnConsoleKey); + gIOConsoleSessionSecureInputPIDKey = OSSymbol::withCStringNoCopy(kIOConsoleSessionSecureInputPIDKey); + gIOConsoleSessionScreenLockedTimeKey = OSSymbol::withCStringNoCopy(kIOConsoleSessionScreenLockedTimeKey); + gIOConsoleUsersSeedValue = OSData::withBytesNoCopy(&gIOConsoleUsersSeed, sizeof(gIOConsoleUsersSeed)); + gIOPlatformSleepActionKey = OSSymbol::withCStringNoCopy(kIOPlatformSleepActionKey); gIOPlatformWakeActionKey = OSSymbol::withCStringNoCopy(kIOPlatformWakeActionKey); 
gIOPlatformQuiesceActionKey = OSSymbol::withCStringNoCopy(kIOPlatformQuiesceActionKey); @@ -345,9 +355,14 @@ void IOService::initialize( void ) gIOServiceBusyLock = IOLockAlloc(); + gIOConsoleUsersLock = IOLockAlloc(); + err = semaphore_create(kernel_task, &gJobsSemaphore, SYNC_POLICY_FIFO, 0); - assert( gIOServiceBusyLock && gJobs && gJobsLock && (err == KERN_SUCCESS) ); + gIOConsoleLockCallout = thread_call_allocate(&IOService::consoleLockTimer, NULL); + + assert( gIOServiceBusyLock && gJobs && gJobsLock && gIOConsoleUsersLock + && gIOConsoleLockCallout && (err == KERN_SUCCESS) ); gIOResources = IOResources::resources(); assert( gIOResources ); @@ -578,7 +593,6 @@ void IOService::startMatching( IOOptionBits options ) // OSKernelStackRemaining(), getName()); if( needConfig) { - prevBusy = _adjustBusy( 1 ); needWake = (0 != (kIOServiceSyncPubState & __state[1])); } @@ -591,6 +605,8 @@ void IOService::startMatching( IOOptionBits options ) if( needConfig) { + prevBusy = _adjustBusy( 1 ); + if( needWake) { IOLockLock( gIOServiceBusyLock ); thread_wakeup( (event_t) this/*&__state[1]*/ ); @@ -1470,6 +1486,7 @@ IONotifier * IOService::registerInterest( const OSSymbol * typeOfInterest, if( (typeOfInterest != gIOGeneralInterest) && (typeOfInterest != gIOBusyInterest) && (typeOfInterest != gIOAppPowerStateInterest) + && (typeOfInterest != gIOConsoleSecurityInterest) && (typeOfInterest != gIOPriorityPowerStateInterest)) return( 0 ); @@ -1541,6 +1558,7 @@ void IOService::unregisterAllInterest( void ) cleanInterestList( getProperty( gIOBusyInterest )); cleanInterestList( getProperty( gIOAppPowerStateInterest )); cleanInterestList( getProperty( gIOPriorityPowerStateInterest )); + cleanInterestList( getProperty( gIOConsoleSecurityInterest )); } /* @@ -1583,7 +1601,7 @@ void _IOServiceInterestNotifier::remove() LOCKWRITENOTIFY(); if( queue_next( &chain )) { - remqueue( 0, &chain); + remqueue(&chain); queue_next( &chain) = queue_prev( &chain) = 0; release(); } @@ -1631,7 +1649,7 
@@ void _IOServiceInterestNotifier::enable( bool was ) #define tailQ(o) setObject(o) #define headQ(o) setObject(0, o) -#define TLOG(fmt, args...) { if(kIOLogYield & gIOKitDebug) IOLog(fmt, ## args); } +#define TLOG(fmt, args...) { if(kIOLogYield & gIOKitDebug) { IOLog("[%llx] ", thread_tid(current_thread())); IOLog(fmt, ## args); }} static void _workLoopAction( IOWorkLoop::Action action, IOService * service, @@ -1667,13 +1685,15 @@ bool IOService::requestTerminate( IOService * provider, IOOptionBits options ) bool IOService::terminatePhase1( IOOptionBits options ) { - IOService * victim; - IOService * client; - OSIterator * iter; - OSArray * makeInactive; - bool ok; - bool didInactive; - bool startPhase2 = false; + IOService * victim; + IOService * client; + OSIterator * iter; + OSArray * makeInactive; + int waitResult = THREAD_AWAKENED; + bool wait; + bool ok; + bool didInactive; + bool startPhase2 = false; TLOG("%s::terminatePhase1(%08llx)\n", getName(), (long long)options); @@ -1701,16 +1721,38 @@ bool IOService::terminatePhase1( IOOptionBits options ) while( victim ) { - didInactive = victim->lockForArbitration( true ); + didInactive = victim->lockForArbitration( true ); if( didInactive) { didInactive = (0 == (victim->__state[0] & kIOServiceInactiveState)); if( didInactive) { victim->__state[0] |= kIOServiceInactiveState; victim->__state[0] &= ~(kIOServiceRegisteredState | kIOServiceMatchedState | kIOServiceFirstPublishState | kIOServiceFirstMatchState); + + if (victim == this) + victim->__state[1] |= kIOServiceTermPhase1State; + victim->_adjustBusy( 1 ); - } - victim->unlockForArbitration(); + + } else if (victim != this) do { + + IOLockLock(gIOServiceBusyLock); + wait = (victim->__state[1] & kIOServiceTermPhase1State); + if( wait) { + TLOG("%s::waitPhase1(%s)\n", getName(), victim->getName()); + victim->__state[1] |= kIOServiceTerm1WaiterState; + victim->unlockForArbitration(); + assert_wait((event_t)&victim->__state[1], THREAD_UNINT); + } + 
IOLockUnlock(gIOServiceBusyLock); + if( wait) { + waitResult = thread_block(THREAD_CONTINUE_NULL); + TLOG("%s::did waitPhase1(%s)\n", getName(), victim->getName()); + victim->lockForArbitration(); + } + } while( wait && (waitResult != THREAD_TIMED_OUT)); + + victim->unlockForArbitration(); } if( victim == this) startPhase2 = didInactive; @@ -1755,8 +1797,21 @@ bool IOService::terminatePhase1( IOOptionBits options ) makeInactive->release(); if( startPhase2) - scheduleTerminatePhase2( options ); + { + lockForArbitration(); + __state[1] &= ~kIOServiceTermPhase1State; + if (kIOServiceTerm1WaiterState & __state[1]) + { + __state[1] &= ~kIOServiceTerm1WaiterState; + TLOG("%s::wakePhase1\n", getName()); + IOLockLock( gIOServiceBusyLock ); + thread_wakeup( (event_t) &__state[1]); + IOLockUnlock( gIOServiceBusyLock ); + } + unlockForArbitration(); + scheduleTerminatePhase2( options ); + } return( true ); } @@ -1917,7 +1972,9 @@ bool IOService::didTerminate( IOService * provider, IOOptionBits options, bool * } void IOService::actionWillTerminate( IOService * victim, IOOptionBits options, - OSArray * doPhase2List ) + OSArray * doPhase2List, + void *unused2 __unused, + void *unused3 __unused ) { OSIterator * iter; IOService * client; @@ -1945,7 +2002,9 @@ void IOService::actionWillTerminate( IOService * victim, IOOptionBits options, } } -void IOService::actionDidTerminate( IOService * victim, IOOptionBits options ) +void IOService::actionDidTerminate( IOService * victim, IOOptionBits options, + void *unused1 __unused, void *unused2 __unused, + void *unused3 __unused ) { OSIterator * iter; IOService * client; @@ -1977,7 +2036,9 @@ void IOService::actionDidTerminate( IOService * victim, IOOptionBits options ) } } -void IOService::actionFinalize( IOService * victim, IOOptionBits options ) +void IOService::actionFinalize( IOService * victim, IOOptionBits options, + void *unused1 __unused, void *unused2 __unused, + void *unused3 __unused ) { TLOG("%s::finalize(%08llx)\n", 
victim->getName(), (long long)options); @@ -1991,7 +2052,9 @@ void IOService::actionFinalize( IOService * victim, IOOptionBits options ) victim->finalize( options ); } -void IOService::actionStop( IOService * provider, IOService * client ) +void IOService::actionStop( IOService * provider, IOService * client, + void *unused1 __unused, void *unused2 __unused, + void *unused3 __unused ) { TLOG("%s::stop(%s)\n", client->getName(), provider->getName()); @@ -3181,8 +3244,10 @@ UInt32 IOService::_adjustBusy( SInt32 delta ) &messageClientsApplier, &context ); #if !NO_KEXTD - if( nowQuiet && (next == gIOServiceRoot)) + if( nowQuiet && (next == gIOServiceRoot)) { OSKext::considerUnloads(); + IOServiceTrace(IOSERVICE_REGISTRY_QUIET, 0, 0, 0, 0); + } #endif } @@ -3386,7 +3451,7 @@ IOReturn IOService::waitMatchIdle( UInt32 msToWait ) bool wait; int waitResult = THREAD_AWAKENED; bool computeDeadline = true; - AbsoluteTime abstime; + AbsoluteTime deadline; IOLockLock( gJobsLock ); do { @@ -3394,14 +3459,12 @@ IOReturn IOService::waitMatchIdle( UInt32 msToWait ) if( wait) { if( msToWait) { if( computeDeadline ) { - clock_interval_to_absolutetime_interval( - msToWait, kMillisecondScale, &abstime ); - clock_absolutetime_interval_to_deadline( - abstime, &abstime ); + clock_interval_to_deadline( + msToWait, kMillisecondScale, &deadline ); computeDeadline = false; } waitResult = IOLockSleepDeadline( gJobsLock, &gNumConfigThreads, - abstime, THREAD_UNINT ); + deadline, THREAD_UNINT ); } else { waitResult = IOLockSleep( gJobsLock, &gNumConfigThreads, THREAD_UNINT ); @@ -4096,6 +4159,34 @@ IOService * IOResources::resources( void ) return( inst ); } +bool IOResources::init( OSDictionary * dictionary ) +{ + // Do super init first + if ( !super::init() ) + return false; + + // Allow PAL layer to publish a value + const char *property_name; + int property_value; + + pal_get_resource_property( &property_name, &property_value ); + + if( property_name ) { + OSNumber *num; + const OSSymbol * 
sym; + + if( (num = OSNumber::withNumber(property_value, 32)) != 0 ) { + if( (sym = OSSymbol::withCString( property_name)) != 0 ) { + this->setProperty( sym, num ); + sym->release(); + } + num->release(); + } + } + + return true; +} + IOWorkLoop * IOResources::getWorkLoop() const { // If we are the resource root @@ -4133,6 +4224,92 @@ bool IOResources::matchPropertyTable( OSDictionary * table ) return( ok ); } +void IOService::consoleLockTimer(thread_call_param_t p0, thread_call_param_t p1) +{ + IOService::updateConsoleUsers(NULL, 0); +} + +void IOService::updateConsoleUsers(OSArray * consoleUsers, IOMessage systemMessage) +{ + IORegistryEntry * regEntry; + OSObject * locked = kOSBooleanFalse; + uint32_t idx; + bool publish; + OSDictionary * user; + static IOMessage sSystemPower; + + regEntry = IORegistryEntry::getRegistryRoot(); + + IOLockLock(gIOConsoleUsersLock); + + if (systemMessage) + { + sSystemPower = systemMessage; + } + if (consoleUsers) + { + OSNumber * num = 0; + for (idx = 0; + (!num) && (user = OSDynamicCast(OSDictionary, consoleUsers->getObject(idx))); + idx++) + { + num = OSDynamicCast(OSNumber, user->getObject(gIOConsoleSessionScreenLockedTimeKey)); + } + gIOConsoleLockTime = num ? 
num->unsigned32BitValue() : 0; + } + + if (gIOConsoleLockTime) + { + if (kIOMessageSystemWillSleep == sSystemPower) + locked = kOSBooleanTrue; + else + { + clock_sec_t now; + clock_usec_t microsecs; + + clock_get_calendar_microtime(&now, &microsecs); + if (gIOConsoleLockTime > now) + { + AbsoluteTime deadline; + clock_interval_to_deadline(gIOConsoleLockTime - now, kSecondScale, &deadline); + thread_call_enter_delayed(gIOConsoleLockCallout, deadline); + } + else + { + locked = kOSBooleanTrue; + } + } + } + + publish = (consoleUsers || (locked != regEntry->getProperty(gIOConsoleLockedKey))); + if (publish) + { + regEntry->setProperty(gIOConsoleLockedKey, locked); + if (consoleUsers) + { + regEntry->setProperty(gIOConsoleUsersKey, consoleUsers); + } + OSIncrementAtomic( &gIOConsoleUsersSeed ); + } + + IOLockUnlock(gIOConsoleUsersLock); + + if (publish) + { + publishResource( gIOConsoleUsersSeedKey, gIOConsoleUsersSeedValue ); + + MessageClientsContext context; + + context.service = getServiceRoot(); + context.type = kIOMessageConsoleSecurityChange; + context.argument = (void *) regEntry; + context.argSize = 0; + + applyToInterestNotifiers(getServiceRoot(), gIOConsoleSecurityInterest, + &messageClientsApplier, &context ); + } +} + IOReturn IOResources::setProperties( OSObject * properties ) { IOReturn err; @@ -4152,15 +4329,17 @@ IOReturn IOResources::setProperties( OSObject * properties ) if( 0 == iter) return( kIOReturnBadArgument); - while( (key = OSDynamicCast(OSSymbol, iter->getNextObject()))) { - - if (gIOConsoleUsersKey == key) + while( (key = OSDynamicCast(OSSymbol, iter->getNextObject()))) + { + if (gIOConsoleUsersKey == key) do { - IORegistryEntry::getRegistryRoot()->setProperty(key, dict->getObject(key)); - OSIncrementAtomic( &gIOConsoleUsersSeed ); - publishResource( gIOConsoleUsersSeedKey, gIOConsoleUsersSeedValue ); - continue; + OSArray * consoleUsers; + consoleUsers = OSDynamicCast(OSArray, dict->getObject(key)); + if (!consoleUsers) + continue; + 
IOService::updateConsoleUsers(consoleUsers, 0); } + while (false); publishResource( key, dict->getObject(key) ); } @@ -4461,7 +4640,7 @@ bool IOService::passiveMatch( OSDictionary * table, bool changesOK ) } while( matchParent && (where = where->getProvider()) ); if( kIOLogMatch & gIOKitDebug) - if( where != this) + if( where && (where != this) ) LOG("match parent @ %s = %d\n", where->getName(), match ); @@ -5174,22 +5353,3 @@ OSMetaClassDefineReservedUnused(IOService, 44); OSMetaClassDefineReservedUnused(IOService, 45); OSMetaClassDefineReservedUnused(IOService, 46); OSMetaClassDefineReservedUnused(IOService, 47); - -#ifdef __ppc__ -OSMetaClassDefineReservedUnused(IOService, 48); -OSMetaClassDefineReservedUnused(IOService, 49); -OSMetaClassDefineReservedUnused(IOService, 50); -OSMetaClassDefineReservedUnused(IOService, 51); -OSMetaClassDefineReservedUnused(IOService, 52); -OSMetaClassDefineReservedUnused(IOService, 53); -OSMetaClassDefineReservedUnused(IOService, 54); -OSMetaClassDefineReservedUnused(IOService, 55); -OSMetaClassDefineReservedUnused(IOService, 56); -OSMetaClassDefineReservedUnused(IOService, 57); -OSMetaClassDefineReservedUnused(IOService, 58); -OSMetaClassDefineReservedUnused(IOService, 59); -OSMetaClassDefineReservedUnused(IOService, 60); -OSMetaClassDefineReservedUnused(IOService, 61); -OSMetaClassDefineReservedUnused(IOService, 62); -OSMetaClassDefineReservedUnused(IOService, 63); -#endif diff --git a/iokit/Kernel/IOServicePM.cpp b/iokit/Kernel/IOServicePM.cpp index fcecfbf00..bd7bcd002 100644 --- a/iokit/Kernel/IOServicePM.cpp +++ b/iokit/Kernel/IOServicePM.cpp @@ -26,14 +26,16 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -//#define IOASSERT 1 +//#undef IOASSERT +//#define IOASSERT 1 + #include #include #include #include #include #include -#include +#include #include #include @@ -42,8 +44,10 @@ #include #include #include +#include #include +#include // Required for notification instrumentation #include "IOServicePrivate.h" @@ -51,6 +55,7 @@ 
#include "IOKitKernelInternal.h" static void settle_timer_expired(thread_call_param_t, thread_call_param_t); +static void idle_timer_expired(thread_call_param_t, thread_call_param_t); static void tellKernelClientApplier(OSObject * object, void * arg); static void tellAppClientApplier(OSObject * object, void * arg); @@ -69,15 +74,20 @@ static uint64_t computeTimeDeltaNS( const AbsoluteTime * start ) OSDefineMetaClassAndStructors(IOPMprot, OSObject) #endif -//********************************************************************************* +// Container class for recording system power events +OSDefineMetaClassAndStructors( PMEventDetails, OSObject ); + +//****************************************************************************** // Globals -//********************************************************************************* +//****************************************************************************** static bool gIOPMInitialized = false; static uint32_t gIOPMBusyCount = 0; +static uint32_t gIOPMWorkCount = 0; static IOWorkLoop * gIOPMWorkLoop = 0; static IOPMRequestQueue * gIOPMRequestQueue = 0; static IOPMRequestQueue * gIOPMReplyQueue = 0; +static IOPMWorkQueue * gIOPMWorkQueue = 0; static IOPMCompletionQueue * gIOPMFreeQueue = 0; static IOPMRequest * gIOPMRequest = 0; static IOPlatformExpert * gPlatform = 0; @@ -96,16 +106,31 @@ static uint32_t getPMRequestType( void ) return type; } -//********************************************************************************* +//****************************************************************************** // Macros -//********************************************************************************* +//****************************************************************************** #define PM_ERROR(x...) do { kprintf(x); IOLog(x); } while (false) -#define PM_DEBUG(x...) do { kprintf(x); } while (false) -#define PM_TRACE(x...) 
do { \ - if (kIOLogDebugPower & gIOKitDebug) kprintf(x); } while (false) +#define PM_LOG(x...) do { kprintf(x); } while (false) + +#define PM_LOG1(x...) do { \ + if (kIOLogDebugPower & gIOKitDebug) \ + kprintf(x); } while (false) + +#define PM_LOG2(x...) do { \ + if (kIOLogDebugPower & gIOKitDebug) \ + kprintf(x); } while (false) -#define PM_CONNECT(x...) +#if 0 +#define PM_LOG3(x...) do { kprintf(x); } while (false) +#else +#define PM_LOG3(x...) +#endif + +#define RD_LOG(x...) do { \ + if ((kIOLogPMRootDomain & gIOKitDebug) && \ + (getPMRootDomain() == this)) \ + kprintf("PMRD: " x); } while (false) #define PM_ASSERT_IN_GATE(x) \ do { \ @@ -114,7 +139,7 @@ do { \ #define PM_LOCK() IOLockLock(fPMLock) #define PM_UNLOCK() IOLockUnlock(fPMLock) -#define PM_LOCK_SLEEP(event) IOLockSleep(fPMLock, event, THREAD_UNINT) +#define PM_LOCK_SLEEP(event, dl) IOLockSleepDeadline(fPMLock, event, dl, THREAD_UNINT) #define PM_LOCK_WAKEUP(event) IOLockWakeup(fPMLock, event, false) #define ns_per_us 1000 @@ -128,17 +153,16 @@ do { \ do { gPlatform->PMLog( fName, t, a, b); } while(0) #define NS_TO_MS(nsec) ((int)((nsec) / 1000000ULL)) +#define NS_TO_US(nsec) ((int)((nsec) / 1000ULL)) #if CONFIG_EMBEDDED #define SUPPORT_IDLE_CANCEL 1 #endif -#define kNotifyWillChange (true) -#define kNotifyDidChange (false) - #define kIOPMPowerStateMax 0xFFFFFFFF -#define IS_PM_ROOT() (this == gIOPMRootNode) +#define IS_PM_ROOT (this == gIOPMRootNode) +#define IS_ROOT_DOMAIN (getPMRootDomain() == this) #define IS_POWER_DROP (fHeadNotePowerState < fCurrentPowerState) #define IS_POWER_RISE (fHeadNotePowerState > fCurrentPowerState) @@ -149,41 +173,69 @@ do { \ // use message tracer to log messages longer than (ns): #define LOG_APP_RESPONSE_MSG_TRACER (3 * 1000ULL * 1000ULL * 1000ULL) -#define RESERVE_DOMAIN_POWER 1 - enum { kReserveDomainPower = 1 }; +#define MS_PUSH(n) \ + do { assert(kIOPM_BadMachineState == fSavedMachineState); \ + assert(kIOPM_BadMachineState != n); \ + fSavedMachineState = n; } 
while (false) + +#define MS_POP() \ + do { assert(kIOPM_BadMachineState != fSavedMachineState); \ + fMachineState = fSavedMachineState; \ + fSavedMachineState = kIOPM_BadMachineState; } while (false) + +#define PM_ACTION_0(a) \ + do { if (fPMActions.a) { \ + (fPMActions.a)(fPMActions.target, this, &fPMActions); } \ + } while (false) + +#define PM_ACTION_2(a, x, y) \ + do { if (fPMActions.a) { \ + (fPMActions.a)(fPMActions.target, this, &fPMActions, x, y); } \ + } while (false) + //********************************************************************************* // PM machine states +// +// Check kgmacros after modifying machine states. //********************************************************************************* enum { + kIOPM_Finished = 0, + kIOPM_OurChangeTellClientsPowerDown = 1, kIOPM_OurChangeTellPriorityClientsPowerDown = 2, kIOPM_OurChangeNotifyInterestedDriversWillChange = 3, kIOPM_OurChangeSetPowerState = 4, kIOPM_OurChangeWaitForPowerSettle = 5, kIOPM_OurChangeNotifyInterestedDriversDidChange = 6, - kIOPM_OurChangeFinish = 7, - kIOPM_ParentDownTellPriorityClientsPowerDown = 8, - kIOPM_ParentDownNotifyInterestedDriversWillChange = 9, - /* 10 not used */ - kIOPM_ParentDownNotifyDidChangeAndAcknowledgeChange = 11, - kIOPM_ParentDownSetPowerState = 12, - kIOPM_ParentDownWaitForPowerSettle = 13, - kIOPM_ParentAcknowledgePowerChange = 14, - kIOPM_ParentUpSetPowerState = 15, - /* 16 not used */ - kIOPM_ParentUpWaitForSettleTime = 17, - kIOPM_ParentUpNotifyInterestedDriversDidChange = 18, - /* 19 not used */ - kIOPM_Finished = 20, - kIOPM_DriverThreadCallDone = 21, - kIOPM_NotifyChildrenDone = 22, + kIOPM_OurChangeTellCapabilityDidChange = 7, + kIOPM_OurChangeFinish = 8, + + kIOPM_ParentChangeTellPriorityClientsPowerDown = 10, + kIOPM_ParentChangeNotifyInterestedDriversWillChange = 11, + kIOPM_ParentChangeSetPowerState = 12, + kIOPM_ParentChangeWaitForPowerSettle = 13, + kIOPM_ParentChangeNotifyInterestedDriversDidChange = 14, + 
kIOPM_ParentChangeTellCapabilityDidChange = 15, + kIOPM_ParentChangeAcknowledgePowerChange = 16, + + kIOPM_NotifyChildrenStart = 17, + kIOPM_NotifyChildrenOrdered = 18, + kIOPM_NotifyChildrenDelayed = 19, + kIOPM_SyncTellClientsPowerDown = 20, + kIOPM_SyncTellPriorityClientsPowerDown = 21, + kIOPM_SyncNotifyWillChange = 22, kIOPM_SyncNotifyDidChange = 23, - kIOPM_SyncFinish = 24 + kIOPM_SyncTellCapabilityDidChange = 24, + kIOPM_SyncFinish = 25, + kIOPM_TellCapabilityChangeDone = 26, + kIOPM_DriverThreadCallDone = 27, + + kIOPM_BadMachineState = 0xFFFFFFFF }; @@ -366,68 +418,95 @@ void IOService::PMinit ( void ) if ( !gIOPMInitialized ) { gPlatform = getPlatform(); - gIOPMWorkLoop = IOWorkLoop::workLoop(); - if (gIOPMWorkLoop) - { - gIOPMRequestQueue = IOPMRequestQueue::create( - this, OSMemberFunctionCast(IOPMRequestQueue::Action, - this, &IOService::servicePMRequestQueue)); + gIOPMWorkLoop = IOWorkLoop::workLoop(); + if (gIOPMWorkLoop) + { + gIOPMRequestQueue = IOPMRequestQueue::create( + this, OSMemberFunctionCast(IOPMRequestQueue::Action, + this, &IOService::servicePMRequestQueue)); + + gIOPMReplyQueue = IOPMRequestQueue::create( + this, OSMemberFunctionCast(IOPMRequestQueue::Action, + this, &IOService::servicePMReplyQueue)); + + gIOPMWorkQueue = IOPMWorkQueue::create( + this, + OSMemberFunctionCast(IOPMWorkQueue::Action, this, + &IOService::servicePMRequest), + OSMemberFunctionCast(IOPMWorkQueue::Action, this, + &IOService::retirePMRequest)); + + gIOPMFreeQueue = IOPMCompletionQueue::create( + this, OSMemberFunctionCast(IOPMCompletionQueue::Action, + this, &IOService::servicePMFreeQueue)); + + if (gIOPMWorkLoop->addEventSource(gIOPMRequestQueue) != + kIOReturnSuccess) + { + gIOPMRequestQueue->release(); + gIOPMRequestQueue = 0; + } - gIOPMReplyQueue = IOPMRequestQueue::create( - this, OSMemberFunctionCast(IOPMRequestQueue::Action, - this, &IOService::servicePMReplyQueue)); + if (gIOPMWorkLoop->addEventSource(gIOPMReplyQueue) != + kIOReturnSuccess) + { + 
gIOPMReplyQueue->release(); + gIOPMReplyQueue = 0; + } + + if (gIOPMWorkLoop->addEventSource(gIOPMWorkQueue) != + kIOReturnSuccess) + { + gIOPMWorkQueue->release(); + gIOPMWorkQueue = 0; + } - gIOPMFreeQueue = IOPMCompletionQueue::create( - this, OSMemberFunctionCast(IOPMCompletionQueue::Action, - this, &IOService::servicePMFreeQueue)); + if (gIOPMWorkLoop->addEventSource(gIOPMFreeQueue) != + kIOReturnSuccess) + { + gIOPMFreeQueue->release(); + gIOPMFreeQueue = 0; + } - if (gIOPMWorkLoop->addEventSource(gIOPMRequestQueue) != - kIOReturnSuccess) - { - gIOPMRequestQueue->release(); - gIOPMRequestQueue = 0; - } + gIOPMPowerClientDevice = + OSSymbol::withCStringNoCopy( "DevicePowerState" ); - if (gIOPMWorkLoop->addEventSource(gIOPMReplyQueue) != - kIOReturnSuccess) - { - gIOPMReplyQueue->release(); - gIOPMReplyQueue = 0; - } + gIOPMPowerClientDriver = + OSSymbol::withCStringNoCopy( "DriverPowerState" ); - if (gIOPMWorkLoop->addEventSource(gIOPMFreeQueue) != - kIOReturnSuccess) - { - gIOPMFreeQueue->release(); - gIOPMFreeQueue = 0; - } + gIOPMPowerClientChildProxy = + OSSymbol::withCStringNoCopy( "ChildProxyPowerState" ); - gIOPMPowerClientDevice = OSSymbol::withCStringNoCopy( "DevicePowerState" ); - gIOPMPowerClientDriver = OSSymbol::withCStringNoCopy( "DriverPowerState" ); - gIOPMPowerClientChildProxy = OSSymbol::withCStringNoCopy( "ChildProxyPowerState" ); - gIOPMPowerClientChildren = OSSymbol::withCStringNoCopy( "ChildrenPowerState" ); - } + gIOPMPowerClientChildren = + OSSymbol::withCStringNoCopy( "ChildrenPowerState" ); + } - if (gIOPMRequestQueue && gIOPMReplyQueue && gIOPMFreeQueue) - gIOPMInitialized = true; - } - if (!gIOPMInitialized) - return; + if (gIOPMRequestQueue && gIOPMReplyQueue && gIOPMFreeQueue) + gIOPMInitialized = true; + } + if (!gIOPMInitialized) + return; pwrMgt = new IOServicePM; pwrMgt->init(); setProperty(kPwrMgtKey, pwrMgt); + queue_init(&pwrMgt->WorkChain); + queue_init(&pwrMgt->RequestHead); + queue_init(&pwrMgt->PMDriverCallQueue); + + 
fOwner = this; fPMLock = IOLockAlloc(); fInterestedDrivers = new IOPMinformeeList; fInterestedDrivers->initialize(); fDesiredPowerState = 0; fDeviceDesire = 0; - fInitialChange = true; - fPreviousRequest = 0; - fDeviceOverrides = false; + fInitialPowerChange = true; + fInitialSetPowerState = true; + fPreviousRequestPowerFlags = 0; + fDeviceOverrideEnabled = false; fMachineState = kIOPM_Finished; - fIdleTimerEventSource = NULL; + fSavedMachineState = kIOPM_BadMachineState; fIdleTimerMinPowerState = 0; fActivityLock = IOLockAlloc(); fStrictTreeOrder = false; @@ -437,13 +516,12 @@ void IOService::PMinit ( void ) fNumberOfPowerStates = 0; fCurrentPowerState = 0; fParentsCurrentPowerFlags = 0; - fMaxCapability = 0; + fMaxPowerState = 0; fName = getName(); fParentsKnowState = false; fSerialNumber = 0; fResponseArray = NULL; fNotifyClientArray = NULL; - fDoNotPowerDown = true; fCurrentPowerConsumption = kIOPMUnknown; fOverrideMaxPowerState = kIOPMPowerStateMax; @@ -457,9 +535,20 @@ void IOService::PMinit ( void ) &IOService::ack_timer_expired, (thread_call_param_t)this); fSettleTimer = thread_call_allocate( &settle_timer_expired, (thread_call_param_t)this); - fDriverCallEntry = thread_call_allocate( + fIdleTimer = thread_call_allocate( + &idle_timer_expired, (thread_call_param_t)this); + fDriverCallEntry = thread_call_allocate( (thread_call_func_t) &IOService::pmDriverCallout, this); - assert(fDriverCallEntry); + assert(fDriverCallEntry); + + // Check for powerChangeDone override. 
+ if (OSMemberFunctionCast(void (*)(void), + getResourceService(), &IOService::powerChangeDone) != + OSMemberFunctionCast(void (*)(void), + this, &IOService::powerChangeDone)) + { + fPCDFunctionOverride = true; + } #if PM_VARS_SUPPORT IOPMprot * prot = new IOPMprot; @@ -472,7 +561,7 @@ void IOService::PMinit ( void ) pm_vars = prot; } #else - pm_vars = (void *) true; + pm_vars = (void *) (uintptr_t) true; #endif initialized = true; @@ -487,22 +576,18 @@ void IOService::PMinit ( void ) void IOService::PMfree ( void ) { - initialized = false; + initialized = false; pm_vars = 0; if ( pwrMgt ) - { - assert(fMachineState == kIOPM_Finished); - assert(fInsertInterestSet == NULL); - assert(fRemoveInterestSet == NULL); + { + assert(fMachineState == kIOPM_Finished); + assert(fInsertInterestSet == NULL); + assert(fRemoveInterestSet == NULL); assert(fNotifyChildArray == NULL); + assert(queue_empty(&pwrMgt->RequestHead)); + assert(queue_empty(&fPMDriverCallQueue)); - if ( fIdleTimerEventSource != NULL ) { - fIdleTimerEventSource->disable(); - gIOPMWorkLoop->removeEventSource(fIdleTimerEventSource); - fIdleTimerEventSource->release(); - fIdleTimerEventSource = NULL; - } if ( fSettleTimer ) { thread_call_cancel(fSettleTimer); thread_call_free(fSettleTimer); @@ -513,6 +598,11 @@ void IOService::PMfree ( void ) thread_call_free(fAckTimer); fAckTimer = NULL; } + if ( fIdleTimer ) { + thread_call_cancel(fIdleTimer); + thread_call_free(fIdleTimer); + fIdleTimer = NULL; + } if ( fDriverCallEntry ) { thread_call_free(fDriverCallEntry); fDriverCallEntry = NULL; @@ -525,20 +615,15 @@ void IOService::PMfree ( void ) IOLockFree(fActivityLock); fActivityLock = NULL; } - if ( fInterestedDrivers ) { - fInterestedDrivers->release(); - fInterestedDrivers = NULL; - } - if ( fPMWorkQueue ) { - gIOPMWorkLoop->removeEventSource(fPMWorkQueue); - fPMWorkQueue->release(); - fPMWorkQueue = 0; - } - if (fDriverCallParamSlots && fDriverCallParamPtr) { - IODelete(fDriverCallParamPtr, DriverCallParam, 
fDriverCallParamSlots); - fDriverCallParamPtr = 0; - fDriverCallParamSlots = 0; - } + if ( fInterestedDrivers ) { + fInterestedDrivers->release(); + fInterestedDrivers = NULL; + } + if (fDriverCallParamSlots && fDriverCallParamPtr) { + IODelete(fDriverCallParamPtr, DriverCallParam, fDriverCallParamSlots); + fDriverCallParamPtr = 0; + fDriverCallParamSlots = 0; + } if ( fResponseArray ) { fResponseArray->release(); fResponseArray = NULL; @@ -548,25 +633,25 @@ void IOService::PMfree ( void ) fNotifyClientArray = NULL; } if (fPowerStates && fNumberOfPowerStates) { - IODelete(fPowerStates, IOPMPowerState, fNumberOfPowerStates); + IODelete(fPowerStates, IOPMPSEntry, fNumberOfPowerStates); fNumberOfPowerStates = 0; fPowerStates = NULL; } - if (fPowerClients) { - fPowerClients->release(); - fPowerClients = 0; - } + if (fPowerClients) { + fPowerClients->release(); + fPowerClients = 0; + } #if PM_VARS_SUPPORT - if (fPMVars) - { - fPMVars->release(); - fPMVars = 0; - } + if (fPMVars) + { + fPMVars->release(); + fPMVars = 0; + } #endif pwrMgt->release(); - pwrMgt = 0; + pwrMgt = 0; } } @@ -614,42 +699,35 @@ IOReturn IOService::youAreRoot ( void ) void IOService::PMstop ( void ) { - IOPMRequest * request; + IOPMRequest * request; - if (!initialized) - return; + if (!initialized) + return; - // Schedule an async PMstop request, but immediately stop any further - // calls to the controlling or interested drivers. This device will - // continue to exist in the power plane and participate in power state - // changes until the PMstop async request is processed. + PM_LOCK(); - PM_LOCK(); - fLockedFlags.PMStop = true; - if (fLockedFlags.DriverCallBusy) - { - PM_DEBUG("%s: PMstop() driver call busy\n", getName()); - } - while (fThreadAssertionCount != 0) + if (fLockedFlags.PMStop) { - if (current_thread() == fThreadAssertionThread) - { - PM_ERROR("%s: PMstop() called from PM thread call\n", getName()); - break; - } - // Wait for thread assertions to drop to zero. 
- PM_DEBUG("%s: PMstop() wait for %u thread assertion(s)\n", - getName(), fThreadAssertionCount); - PM_LOCK_SLEEP(&fThreadAssertionCount); + PM_LOG2("%s: PMstop() already stopped\n", fName); + PM_UNLOCK(); + return; } + + // Inhibit future driver calls. + fLockedFlags.PMStop = true; + + // Wait for all prior driver calls to finish. + waitForPMDriverCall(); + PM_UNLOCK(); - request = acquirePMRequest( this, kIOPMRequestTypePMStop ); - if (request) - { - PM_TRACE("%s: %p PMstop\n", getName(), this); - submitPMRequest( request ); - } + // The rest of the work is performed async. + request = acquirePMRequest( this, kIOPMRequestTypePMStop ); + if (request) + { + PM_LOG2("%s: %p PMstop\n", getName(), this); + submitPMRequest( request ); + } } //********************************************************************************* @@ -660,14 +738,14 @@ void IOService::PMstop ( void ) void IOService::handlePMstop ( IOPMRequest * request ) { - OSIterator * iter; + OSIterator * iter; OSObject * next; IOPowerConnection * connection; IOService * theChild; IOService * theParent; PM_ASSERT_IN_GATE(); - PM_TRACE("%s: %p %s start\n", getName(), this, __FUNCTION__); + PM_LOG2("%s: %p %s start\n", getName(), this, __FUNCTION__); // remove the property removeProperty(kPwrMgtKey); @@ -729,23 +807,23 @@ void IOService::handlePMstop ( IOPMRequest * request ) if ( fInterestedDrivers ) { - IOPMinformeeList * list = fInterestedDrivers; + IOPMinformeeList * list = fInterestedDrivers; IOPMinformee * item; - PM_LOCK(); - while ((item = list->firstInList())) - { - list->removeFromList(item->whatObject); - } - PM_UNLOCK(); - } + PM_LOCK(); + while ((item = list->firstInList())) + { + list->removeFromList(item->whatObject); + } + PM_UNLOCK(); + } - // Tell idleTimerExpired() to ignore idle timer. - fIdleTimerPeriod = 0; - if (fIdleTimerEventSource) - fIdleTimerEventSource->disable(); + // Tell idleTimerExpired() to ignore idle timer. 
+ fIdleTimerPeriod = 0; + if (fIdleTimer && thread_call_cancel(fIdleTimer)) + release(); - PM_TRACE("%s: %p %s done\n", getName(), this, __FUNCTION__); + PM_LOG2("%s: %p %s done\n", getName(), this, __FUNCTION__); } //********************************************************************************* @@ -791,7 +869,7 @@ IOReturn IOService::addPowerChild ( IOService * child ) } if (!ok) { - PM_DEBUG("%s: %s (%p) is already a child\n", + PM_LOG("%s: %s (%p) is already a child\n", getName(), child->getName(), child); break; } @@ -876,7 +954,7 @@ void IOService::addPowerChild1 ( IOPMRequest * request ) tempDesire = fNumberOfPowerStates - 1; } - if (tempDesire && (IS_PM_ROOT() || (fMaxCapability >= tempDesire))) + if (tempDesire && (IS_PM_ROOT || (fMaxPowerState >= tempDesire))) { adjustPowerState(tempDesire); } @@ -903,7 +981,7 @@ void IOService::addPowerChild2 ( IOPMRequest * request ) if (!parent || !inPlane(gIOPowerPlane)) { - PM_DEBUG("%s: addPowerChild2 not in power plane\n", getName()); + PM_LOG("%s: addPowerChild2 not in power plane\n", getName()); return; } @@ -914,7 +992,7 @@ void IOService::addPowerChild2 ( IOPMRequest * request ) powerState = parent->fCurrentPowerState; if (knowsState) - powerFlags = parent->fPowerStates[powerState].outputPowerCharacter; + powerFlags = parent->fPowerStates[powerState].outputPowerFlags; else powerFlags = 0; @@ -928,16 +1006,14 @@ void IOService::addPowerChild2 ( IOPMRequest * request ) if ( fControllingDriver && fParentsKnowState ) { - fMaxCapability = fControllingDriver->maxCapabilityForDomainState(fParentsCurrentPowerFlags); + fMaxPowerState = fControllingDriver->maxCapabilityForDomainState(fParentsCurrentPowerFlags); // initially change into the state we are already in tempDesire = fControllingDriver->initialPowerStateForDomainState(fParentsCurrentPowerFlags); - fPreviousRequest = 0xffffffff; + fPreviousRequestPowerFlags = (IOPMPowerFlags)(-1); adjustPowerState(tempDesire); } -#if ROOT_DOMAIN_RUN_STATES - 
getPMRootDomain()->tagPowerPlaneService(this, &fRootDomainState); -#endif + getPMRootDomain()->tagPowerPlaneService(this, &fPMActions); } //********************************************************************************* @@ -960,7 +1036,7 @@ void IOService::addPowerChild3 ( IOPMRequest * request ) { if (child->getProperty("IOPMStrictTreeOrder")) { - PM_DEBUG("%s: strict PM order enforced\n", getName()); + PM_LOG1("%s: strict PM order enforced\n", getName()); fStrictTreeOrder = true; } @@ -969,7 +1045,7 @@ void IOService::addPowerChild3 ( IOPMRequest * request ) } else { - PM_DEBUG("%s: addPowerChild3 not in power plane\n", getName()); + PM_LOG("%s: addPowerChild3 not in power plane\n", getName()); } connection->release(); @@ -1031,6 +1107,10 @@ IOReturn IOService::removePowerChild ( IOPowerConnection * theNub ) if ( fHeadNotePendingAcks == 0 ) { stop_ack_timer(); + + // Request unblocked, work queue + // should re-scan all busy requests. + gIOPMWorkQueue->incrementProducerCount(); } } } @@ -1065,8 +1145,8 @@ IOReturn IOService::registerPowerDriver ( IOPMPowerState * powerStates, unsigned long numberOfStates ) { - IOPMRequest * request; - IOPMPowerState * powerStatesCopy = 0; + IOPMRequest * request; + IOPMPSEntry * powerStatesCopy = 0; if (!initialized) return IOPMNotYetInitialized; @@ -1092,12 +1172,19 @@ IOReturn IOService::registerPowerDriver ( do { // Make a copy of the supplied power state array. 
- powerStatesCopy = IONew(IOPMPowerState, numberOfStates); + powerStatesCopy = IONew(IOPMPSEntry, numberOfStates); if (!powerStatesCopy) break; - bcopy( powerStates, powerStatesCopy, - sizeof(IOPMPowerState) * numberOfStates ); + for (uint32_t i = 0; i < numberOfStates; i++) + { + powerStatesCopy[i].capabilityFlags = powerStates[i].capabilityFlags; + powerStatesCopy[i].outputPowerFlags = powerStates[i].outputPowerCharacter; + powerStatesCopy[i].inputPowerFlags = powerStates[i].inputPowerRequirement; + powerStatesCopy[i].staticPower = powerStates[i].staticPower; + powerStatesCopy[i].settleUpTime = powerStates[i].settleUpTime; + powerStatesCopy[i].settleDownTime = powerStates[i].settleDownTime; + } request = acquirePMRequest( this, kIOPMRequestTypeRegisterPowerDriver ); if (!request) @@ -1114,7 +1201,7 @@ IOReturn IOService::registerPowerDriver ( while (false); if (powerStatesCopy) - IODelete(powerStatesCopy, IOPMPowerState, numberOfStates); + IODelete(powerStatesCopy, IOPMPSEntry, numberOfStates); return kIOReturnNoMemory; } @@ -1124,12 +1211,12 @@ IOReturn IOService::registerPowerDriver ( void IOService::handleRegisterPowerDriver ( IOPMRequest * request ) { - IOService * powerDriver = (IOService *) request->fArg0; - IOPMPowerState * powerStates = (IOPMPowerState *) request->fArg1; - unsigned long numberOfStates = (unsigned long) request->fArg2; - unsigned long i; - IOService * root; - OSIterator * iter; + IOService * powerDriver = (IOService *) request->fArg0; + IOPMPSEntry * powerStates = (IOPMPSEntry *) request->fArg1; + unsigned long numberOfStates = (unsigned long) request->fArg2; + unsigned long i; + IOService * root; + OSIterator * iter; PM_ASSERT_IN_GATE(); assert(powerStates); @@ -1140,7 +1227,7 @@ void IOService::handleRegisterPowerDriver ( IOPMRequest * request ) { OUR_PMLog(kPMLogControllingDriver, (unsigned long) numberOfStates, - (unsigned long) powerStates[0].version); + (unsigned long) kIOPMPowerStateVersion1); fPowerStates = powerStates; 
fNumberOfPowerStates = numberOfStates; @@ -1150,7 +1237,7 @@ void IOService::handleRegisterPowerDriver ( IOPMRequest * request ) // make a mask of all the character bits we know about fOutputPowerCharacterFlags = 0; for ( i = 0; i < numberOfStates; i++ ) { - fOutputPowerCharacterFlags |= fPowerStates[i].outputPowerCharacter; + fOutputPowerCharacterFlags |= fPowerStates[i].outputPowerFlags; } // Register powerDriver as interested, unless already done. @@ -1201,7 +1288,7 @@ void IOService::handleRegisterPowerDriver ( IOPMRequest * request ) if ( inPlane(gIOPowerPlane) && fParentsKnowState ) { unsigned long tempDesire; - fMaxCapability = fControllingDriver->maxCapabilityForDomainState(fParentsCurrentPowerFlags); + fMaxPowerState = fControllingDriver->maxCapabilityForDomainState(fParentsCurrentPowerFlags); // initially change into the state we are already in tempDesire = fControllingDriver->initialPowerStateForDomainState(fParentsCurrentPowerFlags); adjustPowerState(tempDesire); @@ -1210,7 +1297,7 @@ void IOService::handleRegisterPowerDriver ( IOPMRequest * request ) else { OUR_PMLog(kPMLogControllingDriverErr2, numberOfStates, 0); - IODelete(powerStates, IOPMPowerState, numberOfStates); + IODelete(powerStates, IOPMPSEntry, numberOfStates); } powerDriver->release(); @@ -1227,29 +1314,33 @@ void IOService::handleRegisterPowerDriver ( IOPMRequest * request ) IOPMPowerFlags IOService::registerInterestedDriver ( IOService * driver ) { - IOPMRequest * request; - bool signal; + IOPMRequest * request; + bool signal; - if (!initialized || !fInterestedDrivers) - return IOPMNotPowerManaged; + if (!driver || !initialized || !fInterestedDrivers) + return 0; - PM_LOCK(); - signal = (!fInsertInterestSet && !fRemoveInterestSet); - if (fInsertInterestSet == NULL) - fInsertInterestSet = OSSet::withCapacity(4); - if (fInsertInterestSet) - fInsertInterestSet->setObject(driver); - PM_UNLOCK(); + PM_LOCK(); + signal = (!fInsertInterestSet && !fRemoveInterestSet); + if (fInsertInterestSet 
== NULL) + fInsertInterestSet = OSSet::withCapacity(4); + if (fInsertInterestSet) + { + fInsertInterestSet->setObject(driver); + if (fRemoveInterestSet) + fRemoveInterestSet->removeObject(driver); + } + PM_UNLOCK(); - if (signal) - { - request = acquirePMRequest( this, kIOPMRequestTypeInterestChanged ); - if (request) - submitPMRequest( request ); - } + if (signal) + { + request = acquirePMRequest( this, kIOPMRequestTypeInterestChanged ); + if (request) + submitPMRequest( request ); + } - // This return value cannot be trusted, but return a value - // for those clients that care. + // This return value cannot be trusted, but return a value + // for those clients that care. OUR_PMLog(kPMLogInterestedDriver, kIOPMDeviceUsable, 2); return kIOPMDeviceUsable; @@ -1261,41 +1352,44 @@ IOPMPowerFlags IOService::registerInterestedDriver ( IOService * driver ) IOReturn IOService::deRegisterInterestedDriver ( IOService * driver ) { - IOPMinformeeList * list; + IOPMinformeeList * list; IOPMinformee * item; - IOPMRequest * request; - bool signal; + IOPMRequest * request; + bool signal; - if (!initialized || !fInterestedDrivers) - return IOPMNotPowerManaged; + if (!driver) + return kIOReturnBadArgument; + if (!initialized || !fInterestedDrivers) + return IOPMNotPowerManaged; - PM_LOCK(); - signal = (!fRemoveInterestSet && !fInsertInterestSet); - if (fRemoveInterestSet == NULL) - fRemoveInterestSet = OSSet::withCapacity(4); - if (fRemoveInterestSet) - { - fRemoveInterestSet->setObject(driver); + PM_LOCK(); + signal = (!fRemoveInterestSet && !fInsertInterestSet); + if (fRemoveInterestSet == NULL) + fRemoveInterestSet = OSSet::withCapacity(4); + if (fRemoveInterestSet) + { + fRemoveInterestSet->setObject(driver); + if (fInsertInterestSet) + fInsertInterestSet->removeObject(driver); - list = fInterestedDrivers; - item = list->findItem(driver); - if (item && item->active) - { - item->active = false; - } - if (fLockedFlags.DriverCallBusy) - PM_DEBUG("%s::deRegisterInterestedDriver() 
driver call busy\n", getName()); - } - PM_UNLOCK(); + list = fInterestedDrivers; + item = list->findItem(driver); + if (item && item->active) + { + item->active = false; + waitForPMDriverCall( driver ); + } + } + PM_UNLOCK(); - if (signal) - { - request = acquirePMRequest( this, kIOPMRequestTypeInterestChanged ); - if (request) - submitPMRequest( request ); - } + if (signal) + { + request = acquirePMRequest( this, kIOPMRequestTypeInterestChanged ); + if (request) + submitPMRequest( request ); + } - return IOPMNoErr; + return IOPMNoErr; } //********************************************************************************* @@ -1306,49 +1400,48 @@ IOReturn IOService::deRegisterInterestedDriver ( IOService * driver ) void IOService::handleInterestChanged( IOPMRequest * request ) { - IOService * driver; + IOService * driver; IOPMinformee * informee; - IOPMinformeeList * list = fInterestedDrivers; + IOPMinformeeList * list = fInterestedDrivers; - PM_LOCK(); + PM_LOCK(); - if (fInsertInterestSet) - { - while ((driver = (IOService *) fInsertInterestSet->getAnyObject())) - { - if ((list->findItem(driver) == NULL) && - (!fRemoveInterestSet || - !fRemoveInterestSet->containsObject(driver))) - { - informee = list->appendNewInformee(driver); - } - fInsertInterestSet->removeObject(driver); - } - fInsertInterestSet->release(); - fInsertInterestSet = 0; - } + if (fInsertInterestSet) + { + while ((driver = (IOService *) fInsertInterestSet->getAnyObject())) + { + if (list->findItem(driver) == NULL) + { + informee = list->appendNewInformee(driver); + } + fInsertInterestSet->removeObject(driver); + } + fInsertInterestSet->release(); + fInsertInterestSet = 0; + } - if (fRemoveInterestSet) - { - while ((driver = (IOService *) fRemoveInterestSet->getAnyObject())) - { - informee = list->findItem(driver); - if (informee) - { - if (fHeadNotePendingAcks && informee->timer) - { - informee->timer = 0; - fHeadNotePendingAcks--; - } - list->removeFromList(driver); - } - 
fRemoveInterestSet->removeObject(driver); - } - fRemoveInterestSet->release(); - fRemoveInterestSet = 0; - } + if (fRemoveInterestSet) + { + while ((driver = (IOService *) fRemoveInterestSet->getAnyObject())) + { + informee = list->findItem(driver); + if (informee) + { + // Clean-up async interest acknowledgement + if (fHeadNotePendingAcks && informee->timer) + { + informee->timer = 0; + fHeadNotePendingAcks--; + } + list->removeFromList(driver); + } + fRemoveInterestSet->removeObject(driver); + } + fRemoveInterestSet->release(); + fRemoveInterestSet = 0; + } - PM_UNLOCK(); + PM_UNLOCK(); } //********************************************************************************* @@ -1432,11 +1525,25 @@ bool IOService::handleAcknowledgePowerChange ( IOPMRequest * request ) { uint64_t nsec = computeTimeDeltaNS(&informee->startTime); if (nsec > LOG_SETPOWER_TIMES) - PM_DEBUG("%s::powerState%sChangeTo(%p, %s, %lu -> %lu) async took %d ms\n", + PM_LOG("%s::powerState%sChangeTo(%p, %s, %lu -> %lu) async took %d ms\n", informee->whatObject->getName(), (fDriverCallReason == kDriverCallInformPreChange) ? "Will" : "Did", informee->whatObject, - fName, fCurrentPowerState, fHeadNotePowerState, NS_TO_MS(nsec)); + fName, fCurrentPowerState, fHeadNotePowerState, NS_TO_US(nsec)); + + uint16_t logType = (fDriverCallReason == kDriverCallInformPreChange) + ? 
kIOPMEventTypePSWillChangeTo + : kIOPMEventTypePSDidChangeTo; + + PMEventDetails *details = PMEventDetails::eventDetails( + logType, + fName, + (uintptr_t)this, + informee->whatObject->getName(), + 0, 0, 0, + NS_TO_MS(nsec)); + + getPMRootDomain()->recordAndReleasePMEventGated( details ); } #endif // mark it acked @@ -1523,8 +1630,17 @@ void IOService::adjustPowerState ( uint32_t clamp ) computeDesiredState(clamp); if (fControllingDriver && fParentsKnowState && inPlane(gIOPowerPlane)) { + IOPMPowerChangeFlags changeFlags = kIOPMSelfInitiated; + + // Indicate that children desires were ignored, and do not ask + // apps for permission to drop power. This is used by root domain + // for demand sleep. + + if (getPMRequestType() == kIOPMRequestTypeRequestPowerStateOverride) + changeFlags |= (kIOPMIgnoreChildren | kIOPMSkipAskPowerDown); + startPowerChange( - /* flags */ kIOPMWeInitiated, + /* flags */ changeFlags, /* power state */ fDesiredPowerState, /* domain flags */ 0, /* connection */ 0, @@ -1536,9 +1652,11 @@ void IOService::adjustPowerState ( uint32_t clamp ) // [public] synchronizePowerTree //********************************************************************************* -IOReturn IOService::synchronizePowerTree ( void ) +IOReturn IOService::synchronizePowerTree ( + IOOptionBits options, + IOService * notifyRoot ) { - IOPMRequest * request_c; + IOPMRequest * request_c = 0; IOPMRequest * request_s; if (this != getPMRootDomain()) @@ -1546,15 +1664,30 @@ IOReturn IOService::synchronizePowerTree ( void ) if (!initialized) return kIOPMNotYetInitialized; - request_c = acquirePMRequest( this, kIOPMRequestTypeIdleCancel ); - request_s = acquirePMRequest( this, kIOPMRequestTypeSynchronizePowerTree ); + if (notifyRoot) + { + IOPMRequest * nr; - if (!request_c || !request_s) - goto error_no_memory; + // Cancels don't need to be synchronized. 
+ nr = acquirePMRequest(notifyRoot, kIOPMRequestTypeChildNotifyDelayCancel); + if (nr) submitPMRequest(nr); + nr = acquirePMRequest(getPMRootDomain(), kIOPMRequestTypeChildNotifyDelayCancel); + if (nr) submitPMRequest(nr); + } - request_c->attachNextRequest( request_s ); + request_s = acquirePMRequest( this, kIOPMRequestTypeSynchronizePowerTree ); + if (!request_s) + goto error_no_memory; - submitPMRequest(request_c); + if (options & kIOPMSyncCancelPowerDown) + request_c = acquirePMRequest( this, kIOPMRequestTypeIdleCancel ); + if (request_c) + { + request_c->attachNextRequest( request_s ); + submitPMRequest(request_c); + } + + request_s->fArg0 = (void *)(uintptr_t) options; submitPMRequest(request_s); return kIOReturnSuccess; @@ -1569,14 +1702,17 @@ IOReturn IOService::synchronizePowerTree ( void ) // [private] handleSynchronizePowerTree //********************************************************************************* -void IOService::handleSynchronizePowerTree ( IOPMRequest * /*request*/ ) +void IOService::handleSynchronizePowerTree ( IOPMRequest * request ) { PM_ASSERT_IN_GATE(); if (fControllingDriver && fParentsKnowState && inPlane(gIOPowerPlane) && (fCurrentPowerState == fNumberOfPowerStates - 1)) { + IOOptionBits options = (uintptr_t) request->fArg0; + startPowerChange( - /* flags */ kIOPMWeInitiated | kIOPMSynchronize, + /* flags */ kIOPMSelfInitiated | kIOPMSynchronize | + (options & kIOPMSyncNoChildNotify), /* power state */ fCurrentPowerState, /* domain flags */ 0, /* connection */ 0, @@ -1610,24 +1746,24 @@ IOReturn IOService::powerDomainWillChangeTo ( void IOService::handlePowerDomainWillChangeTo ( IOPMRequest * request ) { - IOPMPowerFlags parentPowerFlags = (IOPMPowerFlags) request->fArg0; - IOPowerConnection * whichParent = (IOPowerConnection *) request->fArg1; - unsigned long parentChangeFlags = (unsigned long) request->fArg2; - OSIterator * iter; - OSObject * next; - IOPowerConnection * connection; - unsigned long newPowerState; - unsigned long 
myChangeFlags; - IOPMPowerFlags combinedPowerFlags; - bool savedParentsKnowState; - IOReturn result = IOPMAckImplied; + IOPMPowerFlags parentPowerFlags = (IOPMPowerFlags) request->fArg0; + IOPowerConnection * whichParent = (IOPowerConnection *) request->fArg1; + IOPMPowerChangeFlags parentChangeFlags = (IOPMPowerChangeFlags)(uintptr_t) request->fArg2; + IOPMPowerChangeFlags myChangeFlags; + OSIterator * iter; + OSObject * next; + IOPowerConnection * connection; + IOPMPowerStateIndex newPowerState; + IOPMPowerFlags combinedPowerFlags; + bool savedParentsKnowState; + IOReturn result = IOPMAckImplied; PM_ASSERT_IN_GATE(); OUR_PMLog(kPMLogWillChange, parentPowerFlags, 0); if (!inPlane(gIOPowerPlane) || !whichParent || !whichParent->getAwaitingAck()) { - PM_DEBUG("%s::%s not in power tree\n", getName(), __FUNCTION__); + PM_LOG("%s::%s not in power tree\n", getName(), __FUNCTION__); goto exit_no_ack; } @@ -1656,7 +1792,7 @@ void IOService::handlePowerDomainWillChangeTo ( IOPMRequest * request ) // If our initial change has yet to occur, then defer the power change // until after the power domain has completed its power transition. 
- if ( fControllingDriver && !fInitialChange ) + if ( fControllingDriver && !fInitialPowerChange ) { newPowerState = fControllingDriver->maxCapabilityForDomainState( combinedPowerFlags); @@ -1729,21 +1865,21 @@ IOReturn IOService::powerDomainDidChangeTo ( void IOService::handlePowerDomainDidChangeTo ( IOPMRequest * request ) { - IOPMPowerFlags parentPowerFlags = (IOPMPowerFlags) request->fArg0; - IOPowerConnection * whichParent = (IOPowerConnection *) request->fArg1; - unsigned long parentChangeFlags = (unsigned long) request->fArg2; - unsigned long newPowerState; - unsigned long myChangeFlags; - unsigned long initialDesire; - bool savedParentsKnowState; - IOReturn result = IOPMAckImplied; + IOPMPowerFlags parentPowerFlags = (IOPMPowerFlags) request->fArg0; + IOPowerConnection * whichParent = (IOPowerConnection *) request->fArg1; + IOPMPowerChangeFlags parentChangeFlags = (IOPMPowerChangeFlags)(uintptr_t) request->fArg2; + IOPMPowerChangeFlags myChangeFlags; + IOPMPowerStateIndex newPowerState; + IOPMPowerStateIndex initialDesire; + bool savedParentsKnowState; + IOReturn result = IOPMAckImplied; PM_ASSERT_IN_GATE(); OUR_PMLog(kPMLogDidChange, parentPowerFlags, 0); if (!inPlane(gIOPowerPlane) || !whichParent || !whichParent->getAwaitingAck()) { - PM_DEBUG("%s::%s not in power tree\n", getName(), __FUNCTION__); + PM_LOG("%s::%s not in power tree\n", getName(), __FUNCTION__); goto exit_no_ack; } @@ -1756,7 +1892,7 @@ void IOService::handlePowerDomainDidChangeTo ( IOPMRequest * request ) newPowerState = fControllingDriver->maxCapabilityForDomainState( fParentsCurrentPowerFlags); - if (fInitialChange) + if (fInitialPowerChange) { initialDesire = fControllingDriver->initialPowerStateForDomainState( fParentsCurrentPowerFlags); @@ -1796,7 +1932,7 @@ void IOService::handlePowerDomainDidChangeTo ( IOPMRequest * request ) if (!savedParentsKnowState && fParentsKnowState) { - PM_TRACE("%s::powerDomainDidChangeTo parentsKnowState = true\n", + PM_LOG1("%s::powerDomainDidChangeTo 
parentsKnowState = true\n", getName()); requestDomainPower( fDesiredPowerState ); } @@ -1889,7 +2025,7 @@ void IOService::rebuildChildClampBits ( void ) { if (connection->getReadyFlag() == false) { - PM_CONNECT("[%s] %s: connection not ready\n", + PM_LOG3("[%s] %s: connection not ready\n", getName(), __FUNCTION__); continue; } @@ -1919,7 +2055,7 @@ IOReturn IOService::requestPowerDomainState( IOPowerConnection * childConnection, unsigned long specification ) { - unsigned long ps; + IOPMPowerStateIndex ps; IOPMPowerFlags outputPowerFlags; IOService * child; IOPMRequest * subRequest; @@ -1931,7 +2067,7 @@ IOReturn IOService::requestPowerDomainState( if (gIOPMWorkLoop->onThread() == false) { - PM_DEBUG("%s::requestPowerDomainState\n", getName()); + PM_LOG("%s::requestPowerDomainState\n", getName()); return kIOReturnSuccess; } @@ -1941,7 +2077,7 @@ IOReturn IOService::requestPowerDomainState( return kIOReturnNotAttached; if (!fControllingDriver || !fNumberOfPowerStates) - return IOPMNotYetInitialized; + return kIOReturnNotReady; child = (IOService *) childConnection->getChildEntry(gIOPowerPlane); assert(child); @@ -1953,10 +2089,10 @@ IOReturn IOService::requestPowerDomainState( // Merge in the power flags contributed by this power parent // at its current or impending power state. - outputPowerFlags = fPowerStates[fCurrentPowerState].outputPowerCharacter; + outputPowerFlags = fPowerStates[fCurrentPowerState].outputPowerFlags; if (fMachineState != kIOPM_Finished) { - if (IS_POWER_DROP && (getPMRootDomain() != this)) + if (IS_POWER_DROP && !IS_ROOT_DOMAIN) { // Use the lower power state when dropping power. // Must be careful since a power drop can be canceled @@ -1967,7 +2103,7 @@ IOReturn IOService::requestPowerDomainState( // The child must not wait for this parent to raise power // if the power drop was cancelled. The solution is to cancel // the power drop if possible, then schedule an adjustment to - // re-evaluate our correct power state. 
+ // re-evaluate the parent's power state. // // Root domain is excluded to avoid idle sleep issues. And permit // root domain children to pop up when system is going to sleep. @@ -1977,14 +2113,14 @@ IOReturn IOService::requestPowerDomainState( { fDoNotPowerDown = true; // cancel power drop adjustPower = true; // schedule an adjustment - PM_TRACE("%s: power drop cancelled in state %u by %s\n", + PM_LOG1("%s: power drop cancelled in state %u by %s\n", getName(), fMachineState, child->getName()); } else { // Beyond cancellation point, report the impending state. outputPowerFlags = - fPowerStates[fHeadNotePowerState].outputPowerCharacter; + fPowerStates[fHeadNotePowerState].outputPowerFlags; } } else if (IS_POWER_RISE) @@ -2006,7 +2142,7 @@ IOReturn IOService::requestPowerDomainState( for (ps = 0; ps < fNumberOfPowerStates; ps++) { - if ((fPowerStates[ps].outputPowerCharacter & childRequestPowerFlags) == + if ((fPowerStates[ps].outputPowerFlags & childRequestPowerFlags) == (fOutputPowerCharacterFlags & childRequestPowerFlags)) break; } @@ -2028,7 +2164,7 @@ IOReturn IOService::requestPowerDomainState( #if ENABLE_DEBUG_LOGS if (adjustPower) { - PM_DEBUG("requestPowerDomainState[%s]: %s, init %d, %u->%u\n", + PM_LOG("requestPowerDomainState[%s]: %s, init %d, %u->%u\n", getName(), child->getName(), !childConnection->childHasRequestedPower(), (uint32_t) childConnection->getDesiredDomainState(), @@ -2049,7 +2185,7 @@ IOReturn IOService::requestPowerDomainState( // adjust power state. Submit a request if one wasn't pending, // or if the current request is part of a call tree. 
- if (adjustPower && !fDeviceOverrides && + if (adjustPower && !fDeviceOverrideEnabled && (!fAdjustPowerScheduled || gIOPMRequest->getRootRequest())) { subRequest = acquirePMRequest( @@ -2185,8 +2321,8 @@ IOReturn IOService::changePowerStateWithOverrideTo ( unsigned long ordinal ) { fTempClampPowerState = max(fTempClampPowerState, ordinal); fTempClampCount++; - fOverrideMaxPowerState = ordinal; - request->fArg2 = (void *) true; + fOverrideMaxPowerState = ordinal; + request->fArg2 = (void *) (uintptr_t) true; } submitPMRequest( request ); @@ -2228,7 +2364,7 @@ IOReturn IOService::requestPowerState ( { fTempClampPowerState = max(fTempClampPowerState, state); fTempClampCount++; - request->fArg2 = (void *) true; + request->fArg2 = (void *) (uintptr_t) true; } submitPMRequest( request ); @@ -2255,8 +2391,8 @@ void IOService::handleRequestPowerState ( IOPMRequest * request ) if (fNumberOfPowerStates && (state >= fNumberOfPowerStates)) state = fNumberOfPowerStates - 1; - // Override from changePowerStateWithOverrideTo() persists until - // the next "device" power request, such as changePowerStateToPriv(). + // The power suppression due to changePowerStateWithOverrideTo() expires + // upon the next "device" power request - changePowerStateToPriv(). 
if ((getPMRequestType() != kIOPMRequestTypeRequestPowerStateOverride) && (client == gIOPMPowerClientDevice)) @@ -2317,6 +2453,77 @@ uint32_t IOService::getPowerStateForClient( const OSSymbol * client ) return powerState; } +//********************************************************************************* +// [protected] powerOverrideOnPriv +//********************************************************************************* + +IOReturn IOService::powerOverrideOnPriv ( void ) +{ + IOPMRequest * request; + + if (!initialized) + return IOPMNotYetInitialized; + + if (gIOPMWorkLoop->inGate()) + { + fDeviceOverrideEnabled = true; + return IOPMNoErr; + } + + request = acquirePMRequest( this, kIOPMRequestTypePowerOverrideOnPriv ); + if (!request) + return kIOReturnNoMemory; + + submitPMRequest( request ); + return IOPMNoErr; +} + +//********************************************************************************* +// [protected] powerOverrideOffPriv +//********************************************************************************* + +IOReturn IOService::powerOverrideOffPriv ( void ) +{ + IOPMRequest * request; + + if (!initialized) + return IOPMNotYetInitialized; + + if (gIOPMWorkLoop->inGate()) + { + fDeviceOverrideEnabled = false; + return IOPMNoErr; + } + + request = acquirePMRequest( this, kIOPMRequestTypePowerOverrideOffPriv ); + if (!request) + return kIOReturnNoMemory; + + submitPMRequest( request ); + return IOPMNoErr; +} + +//********************************************************************************* +// [private] handlePowerOverrideChanged +//********************************************************************************* + +void IOService::handlePowerOverrideChanged ( IOPMRequest * request ) +{ + PM_ASSERT_IN_GATE(); + if (request->getType() == kIOPMRequestTypePowerOverrideOnPriv) + { + OUR_PMLog(kPMLogOverrideOn, 0, 0); + fDeviceOverrideEnabled = true; + } + else + { + OUR_PMLog(kPMLogOverrideOff, 0, 0); + fDeviceOverrideEnabled = false; + } + + 
adjustPowerState(); +} + //********************************************************************************* // [private] computeDesiredState //********************************************************************************* @@ -2335,7 +2542,7 @@ void IOService::computeDesiredState ( unsigned long localClamp ) if (!fNumberOfPowerStates) { fDesiredPowerState = 0; - //PM_DEBUG("%s::%s no controlling driver\n", getName(), __FUNCTION__); + //PM_LOG("%s::%s no controlling driver\n", getName(), __FUNCTION__); return; } @@ -2350,7 +2557,7 @@ void IOService::computeDesiredState ( unsigned long localClamp ) { if (connection->getReadyFlag() == false) { - PM_CONNECT("[%s] %s: connection not ready\n", + PM_LOG3("[%s] %s: connection not ready\n", getName(), __FUNCTION__); continue; } @@ -2376,7 +2583,7 @@ void IOService::computeDesiredState ( unsigned long localClamp ) while ((client = (const OSSymbol *) iter->getNextObject())) { // Ignore child and driver when override is in effect. - if ((fDeviceOverrides || + if ((fDeviceOverrideEnabled || (getPMRequestType() == kIOPMRequestTypeRequestPowerStateOverride)) && ((client == gIOPMPowerClientChildren) || (client == gIOPMPowerClientDriver))) @@ -2388,7 +2595,7 @@ void IOService::computeDesiredState ( unsigned long localClamp ) desiredState = getPowerStateForClient(client); assert(desiredState < fNumberOfPowerStates); - PM_TRACE(" %u %s\n", + PM_LOG1(" %u %s\n", desiredState, client->getCStringNoCopy()); newPowerState = max(newPowerState, desiredState); @@ -2415,7 +2622,7 @@ void IOService::computeDesiredState ( unsigned long localClamp ) fDesiredPowerState = newPowerState; - PM_TRACE(" temp %u, clamp %u, current %u, new %u\n", + PM_LOG1(" temp %u, clamp %u, current %u, new %u\n", (uint32_t) localClamp, (uint32_t) fTempClampPowerState, (uint32_t) fCurrentPowerState, newPowerState); @@ -2466,12 +2673,92 @@ IOWorkLoop * IOService::getPMworkloop ( void ) return gIOPMWorkLoop; } +#if NOT_YET + 
//********************************************************************************* -// [public] activityTickle -// -// The tickle with parameter kIOPMSuperclassPolicy1 causes the activity -// flag to be set, and the device state checked. If the device has been -// powered down, it is powered up again. +// Power Parent/Children Applier +//********************************************************************************* + +static void +applyToPowerChildren( + IOService * service, + IOServiceApplierFunction applier, + void * context, + IOOptionBits options ) +{ + PM_ASSERT_IN_GATE(); + + IORegistryEntry * entry; + IORegistryIterator * iter; + IOPowerConnection * connection; + IOService * child; + + iter = IORegistryIterator::iterateOver(service, gIOPowerPlane, options); + if (iter) + { + while ((entry = iter->getNextObject())) + { + // Get child of IOPowerConnection objects + if ((connection = OSDynamicCast(IOPowerConnection, entry))) + { + child = (IOService *) connection->copyChildEntry(gIOPowerPlane); + if (child) + { + (*applier)(child, context); + child->release(); + } + } + } + iter->release(); + } +} + +static void +applyToPowerParent( + IOService * service, + IOServiceApplierFunction applier, + void * context, + IOOptionBits options ) +{ + PM_ASSERT_IN_GATE(); + + IORegistryEntry * entry; + IORegistryIterator * iter; + IOPowerConnection * connection; + IOService * parent; + + iter = IORegistryIterator::iterateOver(service, gIOPowerPlane, + options | kIORegistryIterateParents); + if (iter) + { + while ((entry = iter->getNextObject())) + { + // Get child of IOPowerConnection objects + if ((connection = OSDynamicCast(IOPowerConnection, entry))) + { + parent = (IOService *) connection->copyParentEntry(gIOPowerPlane); + if (parent) + { + (*applier)(parent, context); + parent->release(); + } + } + } + iter->release(); + } +} + +#endif /* NOT_YET */ + +// MARK: - +// MARK: Activity Tickle & Idle Timer + 
+//********************************************************************************* +// [public] activityTickle +// +// The tickle with parameter kIOPMSuperclassPolicy1 causes the activity +// flag to be set, and the device state checked. If the device has been +// powered down, it is powered up again. // The tickle with parameter kIOPMSubclassPolicy is ignored here and // should be intercepted by a subclass. //********************************************************************************* @@ -2487,14 +2774,11 @@ bool IOService::activityTickle ( unsigned long type, unsigned long stateNumber ) // Record device activity for the idle timer handler. - fDeviceActive = true; + fDeviceWasActive = true; fActivityTickleCount++; clock_get_uptime(&fDeviceActiveTimestamp); -#if ROOT_DOMAIN_RUN_STATES - getPMRootDomain()->handleActivityTickleForService(this, type, - fCurrentPowerState, fActivityTickleCount); -#endif + PM_ACTION_0(actionActivityTickle); // Record the last tickle power state. // This helps to filter out redundant tickles as @@ -2509,7 +2793,7 @@ bool IOService::activityTickle ( unsigned long type, unsigned long stateNumber ) if (request) { request->fArg0 = (void *) stateNumber; // power state - request->fArg1 = (void *) true; // power rise + request->fArg1 = (void *) (uintptr_t) true; // power rise submitPMRequest(request); } } @@ -2558,21 +2842,19 @@ void IOService::handleActivityTickle ( IOPMRequest * request ) } } -//********************************************************************************* +//****************************************************************************** // [public] setIdleTimerPeriod // -// A subclass policy-maker is going to use our standard idleness -// detection service. Make a command queue and an idle timer and -// connect them to the power management workloop. Finally, -// start the timer. 
-//********************************************************************************* +// A subclass policy-maker is using our standard idleness detection service. +// Start the idle timer. Period is in seconds. +//****************************************************************************** IOReturn IOService::setIdleTimerPeriod ( unsigned long period ) { if (!initialized) return IOPMNotYetInitialized; - OUR_PMLog(kPMLogSetIdleTimerPeriod, period, 0); + OUR_PMLog(kPMLogSetIdleTimerPeriod, period, fIdleTimerPeriod); IOPMRequest * request = acquirePMRequest( this, kIOPMRequestTypeSetIdleTimerPeriod ); @@ -2582,7 +2864,7 @@ IOReturn IOService::setIdleTimerPeriod ( unsigned long period ) request->fArg0 = (void *) period; submitPMRequest( request ); - return IOPMNoErr; + return kIOReturnSuccess; } //****************************************************************************** @@ -2597,10 +2879,10 @@ SInt32 IOService::nextIdleTimeout( AbsoluteTime lastActivity, unsigned int powerState) { - AbsoluteTime delta; - UInt64 delta_ns; - SInt32 delta_secs; - SInt32 delay_secs; + AbsoluteTime delta; + UInt64 delta_ns; + SInt32 delta_secs; + SInt32 delay_secs; // Calculate time difference using funky macro from clock.h. delta = currentTime; @@ -2619,26 +2901,25 @@ SInt32 IOService::nextIdleTimeout( return (SInt32)delay_secs; } -//****************************************************************************** +//********************************************************************************* // [public] start_PM_idle_timer -// -// The parameter is a pointer to us. Use it to call our timeout method. 
-//****************************************************************************** +//********************************************************************************* void IOService::start_PM_idle_timer ( void ) { - static const int maxTimeout = 100000; - static const int minTimeout = 1; - AbsoluteTime uptime; - SInt32 idle_in = 0; + static const int maxTimeout = 100000; + static const int minTimeout = 1; + AbsoluteTime uptime, deadline; + SInt32 idle_in = 0; + boolean_t pending; - if (!initialized || !fIdleTimerPeriod || !fIdleTimerEventSource) + if (!initialized || !fIdleTimerPeriod) return; IOLockLock(fActivityLock); clock_get_uptime(&uptime); - + // Subclasses may modify idle sleep algorithm idle_in = nextIdleTimeout(uptime, fDeviceActiveTimestamp, fCurrentPowerState); @@ -2655,18 +2936,41 @@ void IOService::start_PM_idle_timer ( void ) IOLockUnlock(fActivityLock); - fIdleTimerEventSource->setTimeout(idle_in, NSEC_PER_SEC); + retain(); + clock_interval_to_absolutetime_interval(idle_in, kSecondScale, &deadline); + ADD_ABSOLUTETIME(&deadline, &uptime); + pending = thread_call_enter_delayed(fIdleTimer, deadline); + if (pending) release(); +} + +//********************************************************************************* +// idle_timer_expired +//********************************************************************************* + +static void +idle_timer_expired ( + thread_call_param_t arg0, thread_call_param_t arg1 ) +{ + IOService * me = (IOService *) arg0; + + if (gIOPMWorkLoop) + gIOPMWorkLoop->runAction( + OSMemberFunctionCast(IOWorkLoop::Action, me, + &IOService::idleTimerExpired), + me); + + me->release(); } //********************************************************************************* // [private] idleTimerExpired // -// The idle timer has expired. If there has been activity since the last +// The idle timer has expired. If there has been activity since the last // expiration, just restart the timer and return. 
If there has not been // activity, switch to the next lower power state and restart the timer. //********************************************************************************* -void IOService::idleTimerExpired( IOTimerEventSource * ) +void IOService::idleTimerExpired( void ) { IOPMRequest * request; bool restartTimer = true; @@ -2678,10 +2982,10 @@ void IOService::idleTimerExpired( IOTimerEventSource * ) // Check for device activity (tickles) over last timer period. - if (fDeviceActive) + if (fDeviceWasActive) { // Device was active - do not drop power, restart timer. - fDeviceActive = false; + fDeviceWasActive = false; } else { @@ -2699,7 +3003,7 @@ void IOService::idleTimerExpired( IOTimerEventSource * ) if (request) { request->fArg0 = (void *) 0; // power state (irrelevant) - request->fArg1 = (void *) false; // power drop + request->fArg1 = (void *) (uintptr_t) false; // power drop submitPMRequest( request ); // Do not restart timer until after the tickle request has been @@ -2798,7 +3102,7 @@ IOReturn IOService::systemWake ( void ) { if (connection->getReadyFlag() == false) { - PM_CONNECT("[%s] %s: connection not ready\n", + PM_LOG3("[%s] %s: connection not ready\n", getName(), __FUNCTION__); continue; } @@ -2836,7 +3140,7 @@ IOReturn IOService::temperatureCriticalForZone ( IOService * whichZone ) OUR_PMLog(kPMLogCriticalTemp, 0, 0); - if ( inPlane(gIOPowerPlane) && !IS_PM_ROOT() ) + if ( inPlane(gIOPowerPlane) && !IS_PM_ROOT ) { theNub = (IOService *)copyParentEntry(gIOPowerPlane); if ( theNub ) @@ -2854,87 +3158,21 @@ IOReturn IOService::temperatureCriticalForZone ( IOService * whichZone ) } #endif /* !__LP64__ */ -//********************************************************************************* -// [protected] powerOverrideOnPriv -//********************************************************************************* - -IOReturn IOService::powerOverrideOnPriv ( void ) -{ - IOPMRequest * request; - - if (!initialized) - return IOPMNotYetInitialized; - - if 
(gIOPMWorkLoop->inGate()) - { - fDeviceOverrides = true; - return IOPMNoErr; - } - - request = acquirePMRequest( this, kIOPMRequestTypePowerOverrideOnPriv ); - if (!request) - return kIOReturnNoMemory; - - submitPMRequest( request ); - return IOPMNoErr; -} - -//********************************************************************************* -// [protected] powerOverrideOffPriv -//********************************************************************************* - -IOReturn IOService::powerOverrideOffPriv ( void ) -{ - IOPMRequest * request; - - if (!initialized) - return IOPMNotYetInitialized; - - if (gIOPMWorkLoop->inGate()) - { - fDeviceOverrides = false; - return IOPMNoErr; - } - - request = acquirePMRequest( this, kIOPMRequestTypePowerOverrideOffPriv ); - if (!request) - return kIOReturnNoMemory; - - submitPMRequest( request ); - return IOPMNoErr; -} - -//********************************************************************************* -// [private] handlePowerOverrideChanged -//********************************************************************************* - -void IOService::handlePowerOverrideChanged ( IOPMRequest * request ) -{ - PM_ASSERT_IN_GATE(); - if (request->getType() == kIOPMRequestTypePowerOverrideOnPriv) - { - OUR_PMLog(kPMLogOverrideOn, 0, 0); - fDeviceOverrides = true; - } - else - { - OUR_PMLog(kPMLogOverrideOff, 0, 0); - fDeviceOverrides = false; - } - - adjustPowerState(); -} +// MARK: - +// MARK: Power Change (Common) //********************************************************************************* // [private] startPowerChange +// +// All power state changes starts here. 
//********************************************************************************* -IOReturn IOService::startPowerChange ( - unsigned long changeFlags, - unsigned long powerState, - unsigned long domainFlags, - IOPowerConnection * parentConnection, - unsigned long parentFlags ) +IOReturn IOService::startPowerChange( + IOPMPowerChangeFlags changeFlags, + IOPMPowerStateIndex powerState, + IOPMPowerFlags domainFlags, + IOPowerConnection * parentConnection, + IOPMPowerFlags parentFlags ) { PM_ASSERT_IN_GATE(); assert( fMachineState == kIOPM_Finished ); @@ -2943,32 +3181,17 @@ IOReturn IOService::startPowerChange ( if (powerState >= fNumberOfPowerStates) return IOPMAckImplied; -#if ROOT_DOMAIN_RUN_STATES - // Root domain can override chosen power state to a lower state. - getPMRootDomain()->overridePowerStateForService( - this, &fRootDomainState, - &powerState, changeFlags); -#endif - - // Invalidate the last recorded tickle power state when a power transition - // is about to occur, and not as a result of a tickle request. + fIsPreChange = true; + PM_ACTION_2(actionPowerChangeOverride, &powerState, &changeFlags); - if ((getPMRequestType() != kIOPMRequestTypeActivityTickle) && - (fActivityTicklePowerState != -1)) - { - IOLockLock(fActivityLock); - fActivityTicklePowerState = -1; - IOLockUnlock(fActivityLock); - } - - // Initialize the change note. + // Forks to either Driver or Parent initiated power change paths. 
- fHeadNoteFlags = changeFlags; + fHeadNoteChangeFlags = changeFlags; fHeadNotePowerState = powerState; fHeadNotePowerArrayEntry = &fPowerStates[ powerState ]; fHeadNoteParentConnection = NULL; - if (changeFlags & kIOPMWeInitiated) + if (changeFlags & kIOPMSelfInitiated) { if (changeFlags & kIOPMSynchronize) OurSyncStart(); @@ -2992,70 +3215,68 @@ IOReturn IOService::startPowerChange ( bool IOService::notifyInterestedDrivers ( void ) { - IOPMinformee * informee; - IOPMinformeeList * list = fInterestedDrivers; - DriverCallParam * param; - IOItemCount count; + IOPMinformee * informee; + IOPMinformeeList * list = fInterestedDrivers; + DriverCallParam * param; + IOItemCount count; - PM_ASSERT_IN_GATE(); - assert( fDriverCallParamCount == 0 ); - assert( fHeadNotePendingAcks == 0 ); + PM_ASSERT_IN_GATE(); + assert( fDriverCallParamCount == 0 ); + assert( fHeadNotePendingAcks == 0 ); fHeadNotePendingAcks = 0; - count = list->numberOfItems(); - if (!count) - goto done; // no interested drivers + count = list->numberOfItems(); + if (!count) + goto done; // no interested drivers - // Allocate an array of interested drivers and their return values - // for the callout thread. Everything else is still "owned" by the - // PM work loop, which can run to process acknowledgePowerChange() - // responses. + // Allocate an array of interested drivers and their return values + // for the callout thread. Everything else is still "owned" by the + // PM work loop, which can run to process acknowledgePowerChange() + // responses. 
- param = (DriverCallParam *) fDriverCallParamPtr; - if (count > fDriverCallParamSlots) - { - if (fDriverCallParamSlots) - { - assert(fDriverCallParamPtr); - IODelete(fDriverCallParamPtr, DriverCallParam, fDriverCallParamSlots); - fDriverCallParamPtr = 0; - fDriverCallParamSlots = 0; - } + param = (DriverCallParam *) fDriverCallParamPtr; + if (count > fDriverCallParamSlots) + { + if (fDriverCallParamSlots) + { + assert(fDriverCallParamPtr); + IODelete(fDriverCallParamPtr, DriverCallParam, fDriverCallParamSlots); + fDriverCallParamPtr = 0; + fDriverCallParamSlots = 0; + } - param = IONew(DriverCallParam, count); - if (!param) - goto done; // no memory + param = IONew(DriverCallParam, count); + if (!param) + goto done; // no memory - fDriverCallParamPtr = (void *) param; - fDriverCallParamSlots = count; - } + fDriverCallParamPtr = (void *) param; + fDriverCallParamSlots = count; + } - informee = list->firstInList(); - assert(informee); - for (IOItemCount i = 0; i < count; i++) - { - informee->timer = -1; - param[i].Target = informee; - informee->retain(); + informee = list->firstInList(); + assert(informee); + for (IOItemCount i = 0; i < count; i++) + { + informee->timer = -1; + param[i].Target = informee; + informee->retain(); informee = list->nextInList( informee ); - } - - fDriverCallParamCount = count; - fHeadNotePendingAcks = count; + } - // Machine state will be blocked pending callout thread completion. + fDriverCallParamCount = count; + fHeadNotePendingAcks = count; - PM_LOCK(); - assert( fLockedFlags.DriverCallBusy == false ); - fLockedFlags.DriverCallBusy = true; - PM_UNLOCK(); - thread_call_enter( fDriverCallEntry ); - return true; + // Block state machine and wait for callout completion. + assert(!fDriverCallBusy); + fDriverCallBusy = true; + thread_call_enter( fDriverCallEntry ); + return true; done: - // no interested drivers or did not schedule callout thread due to error. 
- return false; + // Return false if there are no interested drivers or could not schedule + // callout thread due to error. + return false; } //********************************************************************************* @@ -3064,18 +3285,18 @@ bool IOService::notifyInterestedDrivers ( void ) void IOService::notifyInterestedDriversDone ( void ) { - IOPMinformee * informee; - IOItemCount count; - DriverCallParam * param; - IOReturn result; + IOPMinformee * informee; + IOItemCount count; + DriverCallParam * param; + IOReturn result; PM_ASSERT_IN_GATE(); + assert( fDriverCallBusy == false ); + assert( fMachineState == kIOPM_DriverThreadCallDone ); + param = (DriverCallParam *) fDriverCallParamPtr; count = fDriverCallParamCount; - assert( fLockedFlags.DriverCallBusy == false ); - assert( fMachineState == kIOPM_DriverThreadCallDone ); - if (param && count) { for (IOItemCount i = 0; i < count; i++, param++) @@ -3128,10 +3349,23 @@ void IOService::notifyInterestedDriversDone ( void ) } } - // Hop back to original machine state path (from notifyAll) - fMachineState = fNextMachineState; + MS_POP(); // pushed by notifyAll() + + // If interest acks are outstanding, wait for fHeadNotePendingAcks to become + // zero before notifying children. This enforces the children after interest + // ordering even for async interest clients. 
- notifyChildren(); + if (!fHeadNotePendingAcks) + { + notifyChildren(); + } + else + { + MS_PUSH(fMachineState); + fMachineState = kIOPM_NotifyChildrenStart; + PM_LOG2("%s: %u outstanding async interest\n", + getName(), fHeadNotePendingAcks); + } } //********************************************************************************* @@ -3144,6 +3378,17 @@ void IOService::notifyChildren ( void ) OSObject * next; IOPowerConnection * connection; OSArray * children = 0; + IOPMrootDomain * rootDomain; + bool delayNotify = false; + + if ((fHeadNotePowerState != fCurrentPowerState) && + (IS_POWER_DROP == fIsPreChange) && + ((rootDomain = getPMRootDomain()) == this)) + { + rootDomain->tracePoint( IS_POWER_DROP ? + kIOPMTracePointSleepPowerPlaneDrivers : + kIOPMTracePointWakePowerPlaneDrivers ); + } if (fStrictTreeOrder) children = OSArray::withCapacity(8); @@ -3160,49 +3405,78 @@ void IOService::notifyChildren ( void ) { if (connection->getReadyFlag() == false) { - PM_CONNECT("[%s] %s: connection not ready\n", + PM_LOG3("[%s] %s: connection not ready\n", getName(), __FUNCTION__); continue; } - if (children) + // Mechanism to postpone the did-change notification to + // certain power children to order those children last. + // Cannot be used together with strict tree ordering. 
+ + if (!fIsPreChange && + (connection->delayChildNotification) && + getPMRootDomain()->shouldDelayChildNotification(this)) + { + if (!children) + { + children = OSArray::withCapacity(8); + if (children) + delayNotify = true; + } + if (delayNotify) + { + children->setObject( connection ); + continue; + } + } + + if (!delayNotify && children) children->setObject( connection ); else - notifyChild( connection, - fDriverCallReason == kDriverCallInformPreChange ); + notifyChild( connection ); } } iter->release(); } + if (children && (children->getCount() == 0)) + { + children->release(); + children = 0; + } if (children) { - if (children->getCount() == 0) - { - children->release(); - children = 0; - } - else - { - assert(fNotifyChildArray == 0); - fNotifyChildArray = children; - fNextMachineState = fMachineState; - fMachineState = kIOPM_NotifyChildrenDone; - } + assert(fNotifyChildArray == 0); + fNotifyChildArray = children; + MS_PUSH(fMachineState); + + if (delayNotify) + { + // Wait for exiting child notifications to complete, + // before notifying the children in the array. + fMachineState = kIOPM_NotifyChildrenDelayed; + PM_LOG2("%s: %d children in delayed array\n", + getName(), children->getCount()); + } + else + { + // Notify children in the array one at a time. + fMachineState = kIOPM_NotifyChildrenOrdered; + } } } //********************************************************************************* -// [private] notifyChildrenDone +// [private] notifyChildrenOrdered //********************************************************************************* -void IOService::notifyChildrenDone ( void ) +void IOService::notifyChildrenOrdered ( void ) { PM_ASSERT_IN_GATE(); assert(fNotifyChildArray); - assert(fMachineState == kIOPM_NotifyChildrenDone); + assert(fMachineState == kIOPM_NotifyChildrenOrdered); - // Interested drivers have all acked (if any), ack timer stopped. // Notify one child, wait for it to ack, then repeat for next child. 
// This is a workaround for some drivers with multiple instances at // the same branch in the power tree, but the driver is slow to power @@ -3217,28 +3491,61 @@ void IOService::notifyChildrenDone ( void ) IOPowerConnection * connection; connection = (IOPowerConnection *) fNotifyChildArray->getObject(0); fNotifyChildArray->removeObject(0); - notifyChild( connection, fDriverCallReason == kDriverCallInformPreChange ); + notifyChild( connection ); } else { fNotifyChildArray->release(); fNotifyChildArray = 0; - fMachineState = fNextMachineState; + + MS_POP(); // pushed by notifyChildren() } } +//********************************************************************************* +// [private] notifyChildrenDelayed +//********************************************************************************* + +void IOService::notifyChildrenDelayed ( void ) +{ + IOPowerConnection * connection; + + PM_ASSERT_IN_GATE(); + assert(fNotifyChildArray); + assert(fMachineState == kIOPM_NotifyChildrenDelayed); + + // Wait after all non-delayed children and interested drivers have ack'ed, + // then notify all delayed children. When explicitly cancelled, interest + // acks (and ack timer) may still be outstanding. 
+ + for (int i = 0; ; i++) + { + connection = (IOPowerConnection *) fNotifyChildArray->getObject(i); + if (!connection) + break; + + notifyChild( connection ); + } + + PM_LOG2("%s: notified delayed children\n", getName()); + fNotifyChildArray->release(); + fNotifyChildArray = 0; + + MS_POP(); // pushed by notifyChildren() +} + //********************************************************************************* // [private] notifyAll //********************************************************************************* -IOReturn IOService::notifyAll ( int nextMachineState, bool is_prechange ) +IOReturn IOService::notifyAll ( uint32_t nextMS ) { // Save the next machine_state to be restored by notifyInterestedDriversDone() PM_ASSERT_IN_GATE(); - fNextMachineState = nextMachineState; + MS_PUSH(nextMS); fMachineState = kIOPM_DriverThreadCallDone; - fDriverCallReason = is_prechange ? + fDriverCallReason = fIsPreChange ? kDriverCallInformPreChange : kDriverCallInformPostChange; if (!notifyInterestedDrivers()) @@ -3258,16 +3565,15 @@ IOReturn IOService::actionDriverCalloutDone ( void * arg0, void * arg1, void * arg2, void * arg3 ) { - IOServicePM * pwrMgt = (IOServicePM *) arg0; + IOServicePM * pwrMgt = (IOServicePM *) arg0; - PM_LOCK(); - fLockedFlags.DriverCallBusy = false; - PM_UNLOCK(); + assert( fDriverCallBusy ); + fDriverCallBusy = false; - if (gIOPMReplyQueue) - gIOPMReplyQueue->signalWorkAvailable(); + assert(gIOPMWorkQueue); + gIOPMWorkQueue->signalWorkAvailable(); - return kIOReturnSuccess; + return kIOReturnSuccess; } void IOService::pmDriverCallout ( IOService * from ) @@ -3302,27 +3608,35 @@ void IOService::pmDriverCallout ( IOService * from ) void IOService::driverSetPowerState ( void ) { - IOService * driver; - unsigned long powerState; - DriverCallParam * param; - IOReturn result; + IOPMPowerStateIndex powerState; + DriverCallParam * param; + IOPMDriverCallEntry callEntry; AbsoluteTime end; + IOReturn result; + uint32_t oldPowerState = getPowerState(); - 
assert( fLockedFlags.DriverCallBusy == true ); - param = (DriverCallParam *) fDriverCallParamPtr; - assert( param ); - assert( fDriverCallParamCount == 1 ); + assert( fDriverCallBusy ); + assert( fDriverCallParamPtr ); + assert( fDriverCallParamCount == 1 ); - driver = fControllingDriver; - powerState = fHeadNotePowerState; + param = (DriverCallParam *) fDriverCallParamPtr; + powerState = fHeadNotePowerState; - if (fLockedFlags.PMStop == false) - { - OUR_PMLog( kPMLogProgramHardware, (uintptr_t) this, powerState); + if (assertPMDriverCall(&callEntry)) + { + OUR_PMLog( kPMLogProgramHardware, (uintptr_t) this, powerState); clock_get_uptime(&fDriverCallStartTime); - result = driver->setPowerState( powerState, this ); + result = fControllingDriver->setPowerState( powerState, this ); clock_get_uptime(&end); - OUR_PMLog((UInt32) -kPMLogProgramHardware, (uintptr_t) this, (UInt32) result); + OUR_PMLog((UInt32) -kPMLogProgramHardware, (uintptr_t) this, (UInt32) result); + + deassertPMDriverCall(&callEntry); + + if (result < 0) + { + PM_LOG("%s::setPowerState(%p, %lu -> %lu) returned 0x%x\n", + fName, this, fCurrentPowerState, powerState, result); + } #if LOG_SETPOWER_TIMES if ((result == IOPMAckImplied) || (result < 0)) @@ -3332,15 +3646,27 @@ void IOService::driverSetPowerState ( void ) SUB_ABSOLUTETIME(&end, &fDriverCallStartTime); absolutetime_to_nanoseconds(end, &nsec); if (nsec > LOG_SETPOWER_TIMES) - PM_DEBUG("%s::setPowerState(%p, %lu -> %lu) took %d ms\n", + PM_LOG("%s::setPowerState(%p, %lu -> %lu) took %d ms\n", fName, this, fCurrentPowerState, powerState, NS_TO_MS(nsec)); + + PMEventDetails *details = PMEventDetails::eventDetails( + kIOPMEventTypeSetPowerStateImmediate, // type + fName, // who + (uintptr_t)this, // owner unique + NULL, // interest name + (uint8_t)oldPowerState, // old + (uint8_t)powerState, // new + 0, // result + NS_TO_US(nsec)); // usec completion time + + getPMRootDomain()->recordAndReleasePMEventGated( details ); } #endif - } - else - result 
= kIOPMAckImplied; + } + else + result = kIOPMAckImplied; - param->Result = result; + param->Result = result; } //********************************************************************************* @@ -3351,46 +3677,51 @@ void IOService::driverSetPowerState ( void ) void IOService::driverInformPowerChange ( void ) { - IOItemCount count; - IOPMinformee * informee; - IOService * driver; - IOReturn result; - IOPMPowerFlags powerFlags; - unsigned long powerState; - DriverCallParam * param; + IOPMinformee * informee; + IOService * driver; + DriverCallParam * param; + IOPMDriverCallEntry callEntry; + IOPMPowerFlags powerFlags; + IOPMPowerStateIndex powerState; AbsoluteTime end; + IOReturn result; + IOItemCount count; - assert( fLockedFlags.DriverCallBusy == true ); - param = (DriverCallParam *) fDriverCallParamPtr; - count = fDriverCallParamCount; - assert( count && param ); + assert( fDriverCallBusy ); + assert( fDriverCallParamPtr ); + assert( fDriverCallParamCount ); - powerFlags = fHeadNotePowerArrayEntry->capabilityFlags; - powerState = fHeadNotePowerState; + param = (DriverCallParam *) fDriverCallParamPtr; + count = fDriverCallParamCount; - for (IOItemCount i = 0; i < count; i++) - { - informee = (IOPMinformee *) param->Target; - driver = informee->whatObject; + powerFlags = fHeadNotePowerArrayEntry->capabilityFlags; + powerState = fHeadNotePowerState; - if ((fLockedFlags.PMStop == false) && informee->active) - { - if (fDriverCallReason == kDriverCallInformPreChange) - { - OUR_PMLog(kPMLogInformDriverPreChange, (uintptr_t) this, powerState); - clock_get_uptime(&informee->startTime); - result = driver->powerStateWillChangeTo(powerFlags, powerState, this); - clock_get_uptime(&end); - OUR_PMLog((UInt32)-kPMLogInformDriverPreChange, (uintptr_t) this, result); - } - else - { - OUR_PMLog(kPMLogInformDriverPostChange, (uintptr_t) this, powerState); + for (IOItemCount i = 0; i < count; i++) + { + informee = (IOPMinformee *) param->Target; + driver = informee->whatObject; + + 
if (assertPMDriverCall(&callEntry, 0, informee)) + { + if (fDriverCallReason == kDriverCallInformPreChange) + { + OUR_PMLog(kPMLogInformDriverPreChange, (uintptr_t) this, powerState); clock_get_uptime(&informee->startTime); - result = driver->powerStateDidChangeTo(powerFlags, powerState, this); + result = driver->powerStateWillChangeTo(powerFlags, powerState, this); clock_get_uptime(&end); - OUR_PMLog((UInt32)-kPMLogInformDriverPostChange, (uintptr_t) this, result); - } + OUR_PMLog((UInt32)-kPMLogInformDriverPreChange, (uintptr_t) this, result); + } + else + { + OUR_PMLog(kPMLogInformDriverPostChange, (uintptr_t) this, powerState); + clock_get_uptime(&informee->startTime); + result = driver->powerStateDidChangeTo(powerFlags, powerState, this); + clock_get_uptime(&end); + OUR_PMLog((UInt32)-kPMLogInformDriverPostChange, (uintptr_t) this, result); + } + + deassertPMDriverCall(&callEntry); #if LOG_SETPOWER_TIMES if ((result == IOPMAckImplied) || (result < 0)) @@ -3400,19 +3731,35 @@ void IOService::driverInformPowerChange ( void ) SUB_ABSOLUTETIME(&end, &informee->startTime); absolutetime_to_nanoseconds(end, &nsec); if (nsec > LOG_SETPOWER_TIMES) - PM_DEBUG("%s::powerState%sChangeTo(%p, %s, %lu -> %lu) took %d ms\n", + PM_LOG("%s::powerState%sChangeTo(%p, %s, %lu -> %lu) took %d ms\n", driver->getName(), (fDriverCallReason == kDriverCallInformPreChange) ? "Will" : "Did", driver, fName, fCurrentPowerState, powerState, NS_TO_MS(nsec)); + + uint16_t logType = (fDriverCallReason == kDriverCallInformPreChange) + ? 
kIOPMEventTypePSWillChangeTo + : kIOPMEventTypePSDidChangeTo; + + PMEventDetails *details = PMEventDetails::eventDetails( + logType, // type + fName, // who + (uintptr_t)this, // owner unique + driver->getName(), // interest name + (uint8_t)fCurrentPowerState, // old + (uint8_t)fHeadNotePowerState, // new + 0, // result + NS_TO_US(nsec)); // usec completion time + + getPMRootDomain()->recordAndReleasePMEventGated( details ); } #endif - } - else - result = kIOPMAckImplied; + } + else + result = kIOPMAckImplied; - param->Result = result; - param++; - } + param->Result = result; + param++; + } } //********************************************************************************* @@ -3422,14 +3769,14 @@ void IOService::driverInformPowerChange ( void ) // If the object acknowledges the current change, we return TRUE. //********************************************************************************* -bool IOService::notifyChild ( IOPowerConnection * theNub, bool is_prechange ) +bool IOService::notifyChild ( IOPowerConnection * theNub ) { - IOReturn ret = IOPMAckImplied; - unsigned long childPower; - IOService * theChild; - IOPMRequest * childRequest; - uint32_t requestArg2; - int requestType; + IOReturn ret = IOPMAckImplied; + unsigned long childPower; + IOService * theChild; + IOPMRequest * childRequest; + IOPMPowerChangeFlags requestArg2; + int requestType; PM_ASSERT_IN_GATE(); theChild = (IOService *)(theNub->copyChildEntry(gIOPowerPlane)); @@ -3444,11 +3791,11 @@ bool IOService::notifyChild ( IOPowerConnection * theNub, bool is_prechange ) fHeadNotePendingAcks++; theNub->setAwaitingAck(true); - requestArg2 = fHeadNoteFlags; + requestArg2 = fHeadNoteChangeFlags; if (fHeadNotePowerState < fCurrentPowerState) requestArg2 |= kIOPMDomainPowerDrop; - requestType = is_prechange ? + requestType = fIsPreChange ? 
kIOPMRequestTypePowerDomainWillChange : kIOPMRequestTypePowerDomainDidChange; @@ -3456,7 +3803,7 @@ bool IOService::notifyChild ( IOPowerConnection * theNub, bool is_prechange ) if (childRequest) { theNub->retain(); - childRequest->fArg0 = (void *) fHeadNotePowerArrayEntry->outputPowerCharacter; + childRequest->fArg0 = (void *) fHeadNotePowerArrayEntry->outputPowerFlags; childRequest->fArg1 = (void *) theNub; childRequest->fArg2 = (void *) requestArg2; theChild->submitPMRequest( childRequest ); @@ -3481,6 +3828,246 @@ bool IOService::notifyChild ( IOPowerConnection * theNub, bool is_prechange ) return (IOPMAckImplied == ret); } +//********************************************************************************* +// [private] notifyControllingDriver +//********************************************************************************* + +bool IOService::notifyControllingDriver ( void ) +{ + DriverCallParam * param; + + PM_ASSERT_IN_GATE(); + assert( fDriverCallParamCount == 0 ); + assert( fControllingDriver ); + + if (fInitialSetPowerState) + { + // Driver specified flag to skip the inital setPowerState() + if (fHeadNotePowerArrayEntry->capabilityFlags & kIOPMInitialDeviceState) + { + return false; + } + fInitialSetPowerState = false; + } + + param = (DriverCallParam *) fDriverCallParamPtr; + if (!param) + { + param = IONew(DriverCallParam, 1); + if (!param) + return false; // no memory + + fDriverCallParamPtr = (void *) param; + fDriverCallParamSlots = 1; + } + + param->Target = fControllingDriver; + fDriverCallParamCount = 1; + fDriverTimer = -1; + + // Block state machine and wait for callout completion. 
+ assert(!fDriverCallBusy); + fDriverCallBusy = true; + thread_call_enter( fDriverCallEntry ); + + return true; +} + +//********************************************************************************* +// [private] notifyControllingDriverDone +//********************************************************************************* + +void IOService::notifyControllingDriverDone( void ) +{ + DriverCallParam * param; + IOReturn result; + + PM_ASSERT_IN_GATE(); + param = (DriverCallParam *) fDriverCallParamPtr; + + assert( fDriverCallBusy == false ); + assert( fMachineState == kIOPM_DriverThreadCallDone ); + + if (param && fDriverCallParamCount) + { + assert(fDriverCallParamCount == 1); + + // the return value from setPowerState() + result = param->Result; + + if ((result == IOPMAckImplied) || (result < 0)) + { + fDriverTimer = 0; + } + else if (fDriverTimer) + { + assert(fDriverTimer == -1); + + // Driver has not acked, and has returned a positive result. + // Enforce a minimum permissible timeout value. + // Make the min value large enough so timeout is less likely + // to occur if a driver misinterpreted that the return value + // should be in microsecond units. And make it large enough + // to be noticeable if a driver neglects to ack. + + if (result < kMinAckTimeoutTicks) + result = kMinAckTimeoutTicks; + + fDriverTimer = (result / (ACK_TIMER_PERIOD / ns_per_us)) + 1; + } + // else, child has already acked and driver_timer reset to 0. + + fDriverCallParamCount = 0; + + if ( fDriverTimer ) + { + OUR_PMLog(kPMLogStartAckTimer, 0, 0); + start_ack_timer(); + } + } + + MS_POP(); // pushed by OurChangeSetPowerState() + fIsPreChange = false; +} + +//********************************************************************************* +// [private] all_done +// +// A power change is done. 
+//********************************************************************************* + +void IOService::all_done ( void ) +{ + IOPMPowerStateIndex prevPowerState; + const IOPMPSEntry * powerStatePtr; + IOPMDriverCallEntry callEntry; + uint32_t prevMachineState = fMachineState; + bool callAction = false; + + fMachineState = kIOPM_Finished; + + if ((fHeadNoteChangeFlags & kIOPMSynchronize) && + ((prevMachineState == kIOPM_Finished) || + (prevMachineState == kIOPM_SyncFinish))) + { + // Sync operation and no power change occurred. + // Do not inform driver and clients about this request completion, + // except for the originator (root domain). + + PM_ACTION_2(actionPowerChangeDone, + fHeadNotePowerState, fHeadNoteChangeFlags); + + if (getPMRequestType() == kIOPMRequestTypeSynchronizePowerTree) + { + powerChangeDone(fCurrentPowerState); + } + + return; + } + + // our power change + if ( fHeadNoteChangeFlags & kIOPMSelfInitiated ) + { + // could our driver switch to the new state? + if ( !( fHeadNoteChangeFlags & kIOPMNotDone) ) + { + // we changed, tell our parent + requestDomainPower(fHeadNotePowerState); + + // yes, did power raise? 
+ if ( fCurrentPowerState < fHeadNotePowerState ) + { + // yes, inform clients and apps + tellChangeUp (fHeadNotePowerState); + } + prevPowerState = fCurrentPowerState; + // either way + fCurrentPowerState = fHeadNotePowerState; +#if PM_VARS_SUPPORT + fPMVars->myCurrentState = fCurrentPowerState; +#endif + OUR_PMLog(kPMLogChangeDone, fCurrentPowerState, 0); + PM_ACTION_2(actionPowerChangeDone, + fHeadNotePowerState, fHeadNoteChangeFlags); + callAction = true; + + powerStatePtr = &fPowerStates[fCurrentPowerState]; + fCurrentCapabilityFlags = powerStatePtr->capabilityFlags; + if (fCurrentCapabilityFlags & kIOPMStaticPowerValid) + fCurrentPowerConsumption = powerStatePtr->staticPower; + + // inform subclass policy-maker + if (fPCDFunctionOverride && fParentsKnowState && + assertPMDriverCall(&callEntry, kIOPMADC_NoInactiveCheck)) + { + powerChangeDone(prevPowerState); + deassertPMDriverCall(&callEntry); + } + } + else if (getPMRequestType() == kIOPMRequestTypeRequestPowerStateOverride) + { + // changePowerStateWithOverrideTo() was cancelled + fOverrideMaxPowerState = kIOPMPowerStateMax; + } + } + + // parent's power change + if ( fHeadNoteChangeFlags & kIOPMParentInitiated) + { + if (((fHeadNoteChangeFlags & kIOPMDomainWillChange) && + (fCurrentPowerState >= fHeadNotePowerState)) || + ((fHeadNoteChangeFlags & kIOPMDomainDidChange) && + (fCurrentPowerState < fHeadNotePowerState))) + { + // did power raise? 
+ if ( fCurrentPowerState < fHeadNotePowerState ) + { + // yes, inform clients and apps + tellChangeUp (fHeadNotePowerState); + } + // either way + prevPowerState = fCurrentPowerState; + fCurrentPowerState = fHeadNotePowerState; +#if PM_VARS_SUPPORT + fPMVars->myCurrentState = fCurrentPowerState; +#endif + fMaxPowerState = fControllingDriver->maxCapabilityForDomainState(fHeadNoteDomainFlags); + + OUR_PMLog(kPMLogChangeDone, fCurrentPowerState, 0); + PM_ACTION_2(actionPowerChangeDone, + fHeadNotePowerState, fHeadNoteChangeFlags); + callAction = true; + + powerStatePtr = &fPowerStates[fCurrentPowerState]; + fCurrentCapabilityFlags = powerStatePtr->capabilityFlags; + if (fCurrentCapabilityFlags & kIOPMStaticPowerValid) + fCurrentPowerConsumption = powerStatePtr->staticPower; + + // inform subclass policy-maker + if (fPCDFunctionOverride && fParentsKnowState && + assertPMDriverCall(&callEntry, kIOPMADC_NoInactiveCheck)) + { + powerChangeDone(prevPowerState); + deassertPMDriverCall(&callEntry); + } + } + } + + // When power rises enough to satisfy the tickle's desire for more power, + // the condition preventing idle-timer from dropping power is removed. + + if (fCurrentPowerState >= fIdleTimerMinPowerState) + { + fIdleTimerMinPowerState = 0; + } + + if (!callAction) + { + PM_ACTION_2(actionPowerChangeDone, + fHeadNotePowerState, fHeadNoteChangeFlags); + } +} + // MARK: - // MARK: Power Change Initiated by Driver @@ -3495,13 +4082,13 @@ void IOService::OurChangeStart ( void ) PM_ASSERT_IN_GATE(); OUR_PMLog( kPMLogStartDeviceChange, fHeadNotePowerState, fCurrentPowerState ); - // fMaxCapability is our maximum possible power state based on the current + // fMaxPowerState is our maximum possible power state based on the current // power state of our parents. If we are trying to raise power beyond the // maximum, send an async request for more power to all parents. 
- if (!IS_PM_ROOT() && (fMaxCapability < fHeadNotePowerState)) + if (!IS_PM_ROOT && (fMaxPowerState < fHeadNotePowerState)) { - fHeadNoteFlags |= kIOPMNotDone; + fHeadNoteChangeFlags |= kIOPMNotDone; requestDomainPower(fHeadNotePowerState); OurChangeFinish(); return; @@ -3509,36 +4096,29 @@ void IOService::OurChangeStart ( void ) // Redundant power changes skips to the end of the state machine. - if (!fInitialChange && (fHeadNotePowerState == fCurrentPowerState)) + if (!fInitialPowerChange && (fHeadNotePowerState == fCurrentPowerState)) { OurChangeFinish(); return; } - fInitialChange = false; + fInitialPowerChange = false; -#if ROOT_DOMAIN_RUN_STATES // Change started, but may not complete... // Can be canceled (power drop) or deferred (power rise). - getPMRootDomain()->handlePowerChangeStartForService( - /* service */ this, - /* RD flags */ &fRootDomainState, - /* new pwr state */ fHeadNotePowerState, - /* change flags */ fHeadNoteFlags ); -#endif + PM_ACTION_2(actionPowerChangeStart, fHeadNotePowerState, &fHeadNoteChangeFlags); // Two separate paths, depending if power is being raised or lowered. // Lowering power is subject to approval by clients of this service. if (IS_POWER_DROP) { - // Next machine state for a power drop. - fMachineState = kIOPM_OurChangeTellClientsPowerDown; fDoNotPowerDown = false; - // Ask apps and kernel clients permission to lower power. + // Ask for persmission to drop power state + fMachineState = kIOPM_OurChangeTellClientsPowerDown; fOutOfBandParameter = kNotifyApps; - askChangeDown(fHeadNotePowerState); + askChangeDown(fHeadNotePowerState); } else { @@ -3554,7 +4134,6 @@ void IOService::OurChangeStart ( void ) // then the child will signal the parent to adjust power, and the child // will defer its power change. -#if RESERVE_DOMAIN_POWER IOReturn ret; // Reserve parent power necessary to achieve fHeadNotePowerState. 
@@ -3562,13 +4141,12 @@ void IOService::OurChangeStart ( void ) if (ret != kIOReturnSuccess) { // Reservation failed, defer power rise. - fHeadNoteFlags |= kIOPMNotDone; + fHeadNoteChangeFlags |= kIOPMNotDone; OurChangeFinish(); return; } -#endif - // Notify interested drivers and children. - notifyAll( kIOPM_OurChangeSetPowerState, kNotifyWillChange ); + + OurChangeTellCapabilityWillChange(); } } @@ -3613,26 +4191,26 @@ requestDomainPowerApplier( //********************************************************************************* IOReturn IOService::requestDomainPower( - unsigned long ourPowerState, - IOOptionBits options ) + IOPMPowerStateIndex ourPowerState, + IOOptionBits options ) { - const IOPMPowerState * powerStateEntry; + const IOPMPSEntry * powerStateEntry; IOPMPowerFlags requestPowerFlags; - unsigned long maxPowerState; + IOPMPowerStateIndex maxPowerState; IOPMRequestDomainPowerContext context; PM_ASSERT_IN_GATE(); assert(ourPowerState < fNumberOfPowerStates); if (ourPowerState >= fNumberOfPowerStates) return kIOReturnBadArgument; - if (IS_PM_ROOT()) + if (IS_PM_ROOT) return kIOReturnSuccess; // Fetch the input power flags for the requested power state. // Parent request is stated in terms of required power flags. powerStateEntry = &fPowerStates[ourPowerState]; - requestPowerFlags = powerStateEntry->inputPowerRequirement; + requestPowerFlags = powerStateEntry->inputPowerFlags; if (powerStateEntry->capabilityFlags & (kIOPMChildClamp | kIOPMPreventIdleSleep)) requestPowerFlags |= kIOPMPreventIdleSleep; @@ -3642,12 +4220,12 @@ IOReturn IOService::requestDomainPower( // Disregard the "previous request" for power reservation. 
if (((options & kReserveDomainPower) == 0) && - (fPreviousRequest == requestPowerFlags)) + (fPreviousRequestPowerFlags == requestPowerFlags)) { // skip if domain already knows our requirements goto done; } - fPreviousRequest = requestPowerFlags; + fPreviousRequestPowerFlags = requestPowerFlags; context.child = this; context.requestPowerFlags = requestPowerFlags; @@ -3661,7 +4239,7 @@ IOReturn IOService::requestDomainPower( if (maxPowerState < fHeadNotePowerState) { - PM_TRACE("%s: power desired %u:0x%x got %u:0x%x\n", + PM_LOG1("%s: power desired %u:0x%x got %u:0x%x\n", getName(), (uint32_t) ourPowerState, (uint32_t) requestPowerFlags, (uint32_t) maxPowerState, (uint32_t) fHeadNoteDomainTargetFlags); @@ -3681,30 +4259,38 @@ void IOService::OurSyncStart ( void ) { PM_ASSERT_IN_GATE(); - if (fInitialChange) + if (fInitialPowerChange) return; -#if ROOT_DOMAIN_RUN_STATES - getPMRootDomain()->handlePowerChangeStartForService( - /* service */ this, - /* RD flags */ &fRootDomainState, - /* new pwr state */ fHeadNotePowerState, - /* change flags */ fHeadNoteFlags ); -#endif + PM_ACTION_2(actionPowerChangeStart, fHeadNotePowerState, &fHeadNoteChangeFlags); + + if (fHeadNoteChangeFlags & kIOPMNotDone) + { + OurChangeFinish(); + return; + } - fMachineState = kIOPM_SyncNotifyDidChange; - fDriverCallReason = kDriverCallInformPreChange; + if (fHeadNoteChangeFlags & kIOPMSyncTellPowerDown) + { + fDoNotPowerDown = false; - notifyChildren(); + // Ask for permission to drop power state + fMachineState = kIOPM_SyncTellClientsPowerDown; + fOutOfBandParameter = kNotifyApps; + askChangeDown(fHeadNotePowerState); + } + else + { + // Only inform capability app and clients. + tellSystemCapabilityChange( kIOPM_SyncNotifyWillChange ); + } } //********************************************************************************* // [private] OurChangeTellClientsPowerDown // -// All registered applications and kernel clients have positively acknowledged our -// intention of lowering power. 
Here we notify them all that we will definitely -// lower the power. If we don't have to wait for any of them to acknowledge, we -// carry on by notifying interested drivers. Otherwise, we do wait. +// All applications and kernel clients have acknowledged our permission to drop +// power. Here we notify them that we will lower the power and wait for acks. //********************************************************************************* void IOService::OurChangeTellClientsPowerDown ( void ) @@ -3716,10 +4302,8 @@ void IOService::OurChangeTellClientsPowerDown ( void ) //********************************************************************************* // [private] OurChangeTellPriorityClientsPowerDown // -// All registered applications and kernel clients have positively acknowledged our -// intention of lowering power. Here we notify "priority" clients that we are -// lowering power. If we don't have to wait for any of them to acknowledge, we -// carry on by notifying interested drivers. Otherwise, we do wait. +// All applications and kernel clients have acknowledged our intention to drop +// power. Here we notify "priority" clients that we are lowering power. //********************************************************************************* void IOService::OurChangeTellPriorityClientsPowerDown ( void ) @@ -3728,80 +4312,123 @@ void IOService::OurChangeTellPriorityClientsPowerDown ( void ) tellChangeDown2(fHeadNotePowerState); } +//********************************************************************************* +// [private] OurChangeTellCapabilityWillChange +// +// Extra stage for root domain to notify apps and drivers about the +// system capability change when raising power state. 
+//********************************************************************************* + +void IOService::OurChangeTellCapabilityWillChange ( void ) +{ + if (!IS_ROOT_DOMAIN) + return OurChangeNotifyInterestedDriversWillChange(); + + tellSystemCapabilityChange( kIOPM_OurChangeNotifyInterestedDriversWillChange ); +} + //********************************************************************************* // [private] OurChangeNotifyInterestedDriversWillChange // -// All registered applications and kernel clients have acknowledged our notification -// that we are lowering power. Here we notify interested drivers. If we don't have -// to wait for any of them to acknowledge, we instruct our power driver to make the -// change. Otherwise, we do wait. +// All applications and kernel clients have acknowledged our power state change. +// Here we notify interested drivers pre-change. //********************************************************************************* void IOService::OurChangeNotifyInterestedDriversWillChange ( void ) { - IOPMrootDomain *rootDomain; + IOPMrootDomain * rootDomain; if ((rootDomain = getPMRootDomain()) == this) { - rootDomain->tracePoint(kIOPMTracePointSystemSleepDriversPhase); + if (IS_POWER_DROP) + { + rootDomain->tracePoint( kIOPMTracePointSleepWillChangeInterests ); + + PMEventDetails *details = PMEventDetails::eventDetails( + kIOPMEventTypeAppNotificationsFinished, + NULL, + 100, + kIOReturnSuccess); + rootDomain->recordAndReleasePMEventGated( details ); + } + else + rootDomain->tracePoint( kIOPMTracePointWakeWillChangeInterests ); } - notifyAll( kIOPM_OurChangeSetPowerState, kNotifyWillChange ); + notifyAll( kIOPM_OurChangeSetPowerState ); } //********************************************************************************* // [private] OurChangeSetPowerState // -// All interested drivers have acknowledged our pre-change notification of a power -// change we initiated. 
Here we instruct our controlling driver to make -// the change to the hardware. If it does so, we continue processing -// (waiting for settle and notifying interested parties post-change.) -// If it doesn't, we have to wait for it to acknowledge and then continue. +// Instruct our controlling driver to program the hardware for the power state +// change. Wait for async completions. //********************************************************************************* void IOService::OurChangeSetPowerState ( void ) { - fNextMachineState = kIOPM_OurChangeWaitForPowerSettle; - fMachineState = kIOPM_DriverThreadCallDone; - fDriverCallReason = kDriverCallSetPowerState; + MS_PUSH( kIOPM_OurChangeWaitForPowerSettle ); + fMachineState = kIOPM_DriverThreadCallDone; + fDriverCallReason = kDriverCallSetPowerState; - if (notifyControllingDriver() == false) - notifyControllingDriverDone(); + if (notifyControllingDriver() == false) + notifyControllingDriverDone(); } //********************************************************************************* // [private] OurChangeWaitForPowerSettle // -// Our controlling driver has changed power state on the hardware -// during a power change we initiated. Wait for the driver specified -// settle time to expire, before notifying interested parties post-change. +// Our controlling driver has completed the power state change we initiated. +// Wait for the driver specified settle time to expire. //********************************************************************************* -void IOService::OurChangeWaitForPowerSettle( void ) +void IOService::OurChangeWaitForPowerSettle ( void ) { - fMachineState = kIOPM_OurChangeNotifyInterestedDriversDidChange; + fMachineState = kIOPM_OurChangeNotifyInterestedDriversDidChange; startSettleTimer(); } //********************************************************************************* // [private] OurChangeNotifyInterestedDriversDidChange // -// Power has settled on a power change we initiated. 
Here we notify -// all our interested parties post-change. If they all acknowledge, we're -// done with this change note, and we can start on the next one. -// Otherwise we have to wait for acknowledgements and finish up later. +// Power has settled on a power change we initiated. Here we notify +// all our interested drivers post-change. //********************************************************************************* void IOService::OurChangeNotifyInterestedDriversDidChange ( void ) { - notifyAll( kIOPM_OurChangeFinish, kNotifyDidChange ); + IOPMrootDomain * rootDomain; + if ((rootDomain = getPMRootDomain()) == this) + { + rootDomain->tracePoint( IS_POWER_DROP ? + kIOPMTracePointSleepDidChangeInterests : + kIOPMTracePointWakeDidChangeInterests ); + } + + notifyAll( kIOPM_OurChangeTellCapabilityDidChange ); +} + +//********************************************************************************* +// [private] OurChangeTellCapabilityDidChange +// +// For root domain to notify capability power-change. +//********************************************************************************* + +void IOService::OurChangeTellCapabilityDidChange ( void ) +{ + if (!IS_ROOT_DOMAIN) + return OurChangeFinish(); + + getPMRootDomain()->tracePoint( IS_POWER_DROP ? + kIOPMTracePointSleepCapabilityClients : + kIOPMTracePointWakeCapabilityClients ); + + tellSystemCapabilityChange( kIOPM_OurChangeFinish ); } //********************************************************************************* // [private] OurChangeFinish // -// Power has settled on a power change we initiated, and -// all our interested parties have acknowledged. We're -// done with this change note, and we can start on the next one. +// Done with this self-induced power state change. //********************************************************************************* void IOService::OurChangeFinish ( void ) @@ -3829,17 +4456,11 @@ IOReturn IOService::ParentChangeStart ( void ) // TODO: redundant? 
See handlePowerDomainWillChangeTo() setParentInfo( fHeadNoteParentFlags, fHeadNoteParentConnection, true ); -#if ROOT_DOMAIN_RUN_STATES - getPMRootDomain()->handlePowerChangeStartForService( - /* service */ this, - /* RD flags */ &fRootDomainState, - /* new pwr state */ fHeadNotePowerState, - /* change flags */ fHeadNoteFlags ); -#endif + PM_ACTION_2(actionPowerChangeStart, fHeadNotePowerState, &fHeadNoteChangeFlags); - // tell apps and kernel clients - fInitialChange = false; - fMachineState = kIOPM_ParentDownTellPriorityClientsPowerDown; + // Tell apps and kernel clients + fInitialPowerChange = false; + fMachineState = kIOPM_ParentChangeTellPriorityClientsPowerDown; tellChangeDown1(fHeadNotePowerState); return IOPMWillAckLater; } @@ -3864,24 +4485,19 @@ IOReturn IOService::ParentChangeStart ( void ) } } - if ( fHeadNoteFlags & kIOPMDomainDidChange ) + if ( fHeadNoteChangeFlags & kIOPMDomainDidChange ) { if ( fHeadNotePowerState > fCurrentPowerState ) { -#if ROOT_DOMAIN_RUN_STATES - getPMRootDomain()->handlePowerChangeStartForService( - /* service */ this, - /* RD flags */ &fRootDomainState, - /* new pwr state */ fHeadNotePowerState, - /* change flags */ fHeadNoteFlags ); -#endif + PM_ACTION_2(actionPowerChangeStart, + fHeadNotePowerState, &fHeadNoteChangeFlags); // Parent did change up - start our change up - fInitialChange = false; - notifyAll( kIOPM_ParentUpSetPowerState, kNotifyWillChange ); + fInitialPowerChange = false; + ParentChangeTellCapabilityWillChange(); return IOPMWillAckLater; } - else if (fHeadNoteFlags & kIOPMSynchronize) + else if (fHeadNoteChangeFlags & kIOPMSynchronize) { // We do not need to change power state, but notify // children to propagate tree synchronization. 
@@ -3897,102 +4513,108 @@ IOReturn IOService::ParentChangeStart ( void ) } //********************************************************************************* -// [private] ParentDownTellPriorityClientsPowerDown +// [private] ParentChangeTellPriorityClientsPowerDown // -// All applications and kernel clients have been notified of a power lowering -// initiated by the parent and we had to wait for responses. Here -// we notify any priority clients. If they all ack, we continue with the power change. -// If at least one doesn't, we have to wait for it to acknowledge and then continue. +// All applications and kernel clients have acknowledged our intention to drop +// power. Here we notify "priority" clients that we are lowering power. //********************************************************************************* -void IOService::ParentDownTellPriorityClientsPowerDown ( void ) +void IOService::ParentChangeTellPriorityClientsPowerDown ( void ) { - fMachineState = kIOPM_ParentDownNotifyInterestedDriversWillChange; + fMachineState = kIOPM_ParentChangeNotifyInterestedDriversWillChange; tellChangeDown2(fHeadNotePowerState); } //********************************************************************************* -// [private] ParentDownNotifyInterestedDriversWillChange +// [private] ParentChangeTellCapabilityWillChange // -// All applications and kernel clients have been notified of a power lowering -// initiated by the parent and we had to wait for their responses. Here we notify -// any interested drivers and power domain children. If they all ack, we continue -// with the power change. -// If at least one doesn't, we have to wait for it to acknowledge and then continue. +// All (legacy) applications and kernel clients have acknowledged, extra stage for +// root domain to notify apps and drivers about the system capability change. 
//********************************************************************************* -void IOService::ParentDownNotifyInterestedDriversWillChange ( void ) +void IOService::ParentChangeTellCapabilityWillChange ( void ) { - IOPMrootDomain *rootDomain; - if ((rootDomain = getPMRootDomain()) == this) - { - rootDomain->tracePoint(kIOPMTracePointSystemSleepDriversPhase); - } + if (!IS_ROOT_DOMAIN) + return ParentChangeNotifyInterestedDriversWillChange(); - notifyAll( kIOPM_ParentDownSetPowerState, kNotifyWillChange ); + tellSystemCapabilityChange( kIOPM_ParentChangeNotifyInterestedDriversWillChange ); } //********************************************************************************* -// [private] ParentDownSetPowerState +// [private] ParentChangeNotifyInterestedDriversWillChange // -// We had to wait for it, but all parties have acknowledged our pre-change -// notification of a power lowering initiated by the parent. -// Here we instruct our controlling driver -// to put the hardware in the state it needs to be in when the domain is -// lowered. If it does so, we continue processing -// (waiting for settle and acknowledging the parent.) -// If it doesn't, we have to wait for it to acknowledge and then continue. +// All applications and kernel clients have acknowledged our power state change. +// Here we notify interested drivers pre-change. 
//********************************************************************************* -void IOService::ParentDownSetPowerState ( void ) +void IOService::ParentChangeNotifyInterestedDriversWillChange ( void ) { - fNextMachineState = kIOPM_ParentDownWaitForPowerSettle; - fMachineState = kIOPM_DriverThreadCallDone; - fDriverCallReason = kDriverCallSetPowerState; + notifyAll( kIOPM_ParentChangeSetPowerState ); +} - if (notifyControllingDriver() == false) - notifyControllingDriverDone(); +//********************************************************************************* +// [private] ParentChangeSetPowerState +// +// Instruct our controlling driver to program the hardware for the power state +// change. Wait for async completions. +//********************************************************************************* + +void IOService::ParentChangeSetPowerState ( void ) +{ + MS_PUSH( kIOPM_ParentChangeWaitForPowerSettle ); + fMachineState = kIOPM_DriverThreadCallDone; + fDriverCallReason = kDriverCallSetPowerState; + + if (notifyControllingDriver() == false) + notifyControllingDriverDone(); } //********************************************************************************* -// [private] ParentDownWaitForPowerSettle +// [private] ParentChangeWaitForPowerSettle // -// Our controlling driver has changed power state on the hardware -// during a power change initiated by our parent. We have had to wait -// for acknowledgement from interested parties, or we have had to wait -// for the controlling driver to change the state. Here we see if we need -// to wait for power to settle before continuing. If not, we continue -// processing (acknowledging our preparedness to the parent). -// If so, we wait and continue later. +// Our controlling driver has completed the power state change initiated by our +// parent. Wait for the driver specified settle time to expire. 
//********************************************************************************* -void IOService::ParentDownWaitForPowerSettle ( void ) +void IOService::ParentChangeWaitForPowerSettle ( void ) { - fMachineState = kIOPM_ParentDownNotifyDidChangeAndAcknowledgeChange; + fMachineState = kIOPM_ParentChangeNotifyInterestedDriversDidChange; startSettleTimer(); } //********************************************************************************* -// [private] ParentDownNotifyDidChangeAndAcknowledgeChange +// [private] ParentChangeNotifyInterestedDriversDidChange +// +// Power has settled on a power change initiated by our parent. Here we notify +// all our interested drivers post-change. +//********************************************************************************* + +void IOService::ParentChangeNotifyInterestedDriversDidChange ( void ) +{ + notifyAll( kIOPM_ParentChangeTellCapabilityDidChange ); +} + +//********************************************************************************* +// [private] ParentChangeTellCapabilityDidChange // -// Power has settled on a power change initiated by our parent. Here we -// notify interested parties. +// For root domain to notify capability power-change. //********************************************************************************* -void IOService::ParentDownNotifyDidChangeAndAcknowledgeChange ( void ) +void IOService::ParentChangeTellCapabilityDidChange ( void ) { - notifyAll( kIOPM_ParentAcknowledgePowerChange, kNotifyDidChange ); + if (!IS_ROOT_DOMAIN) + return ParentChangeAcknowledgePowerChange(); + + tellSystemCapabilityChange( kIOPM_ParentChangeAcknowledgePowerChange ); } //********************************************************************************* // [private] ParentAcknowledgePowerChange // -// We had to wait for it, but all parties have acknowledged our post-change -// notification of a power change (either Up or Down) initiated by the parent. -// Here we acknowledge the parent. 
+// Acknowledge our power parent that our power change is done. //********************************************************************************* -void IOService::ParentAcknowledgePowerChange ( void ) +void IOService::ParentChangeAcknowledgePowerChange ( void ) { IORegistryEntry * nub; IOService * parent; @@ -4009,192 +4631,26 @@ void IOService::ParentAcknowledgePowerChange ( void ) nub->release(); } +// MARK: - +// MARK: Ack and Settle timers + //********************************************************************************* -// [private] ParentUpSetPowerState +// [private] settleTimerExpired // -// Our parent has informed us via powerStateDidChange that it has -// raised the power in our power domain, and we have had to wait -// for some interested party to acknowledge our notification. -// Here we instruct our controlling -// driver to program the hardware to take advantage of the higher domain -// power. If it does so, we continue processing -// (waiting for settle and notifying interested parties post-change.) -// If it doesn't, we have to wait for it to acknowledge and then continue. +// Power has settled after our last change. Notify interested parties that +// there is a new power state. //********************************************************************************* -void IOService::ParentUpSetPowerState ( void ) +void IOService::settleTimerExpired( void ) { - fNextMachineState = kIOPM_ParentUpWaitForSettleTime; - fMachineState = kIOPM_DriverThreadCallDone; - fDriverCallReason = kDriverCallSetPowerState; - - if (notifyControllingDriver() == false) - notifyControllingDriverDone(); + fSettleTimeUS = 0; + gIOPMWorkQueue->signalWorkAvailable(); } //********************************************************************************* -// [private] ParentUpWaitForSettleTime +// settle_timer_expired // -// Our controlling driver has changed power state on the hardware -// during a power raise initiated by the parent, but we had to wait for it. 
-// Here we see if we need to wait for power to settle before continuing. -// If not, we continue processing (notifying interested parties post-change). -// If so, we wait and continue later. -//********************************************************************************* - -void IOService::ParentUpWaitForSettleTime ( void ) -{ - fMachineState = kIOPM_ParentUpNotifyInterestedDriversDidChange; - startSettleTimer(); -} - -//********************************************************************************* -// [private] ParentUpNotifyInterestedDriversDidChange -// -// Power has settled on a power raise initiated by the parent. -// Here we notify all our interested parties post-change. If they all acknowledge, -// we're done with this change note, and we can start on the next one. -// Otherwise we have to wait for acknowledgements and finish up later. -//********************************************************************************* - -void IOService::ParentUpNotifyInterestedDriversDidChange ( void ) -{ - notifyAll( kIOPM_ParentAcknowledgePowerChange, kNotifyDidChange ); -} - -//********************************************************************************* -// [private] all_done -// -// A power change is complete, and the used post-change note is at -// the head of the queue. Remove it and set myCurrentState to the result -// of the change. Start up the next change in queue. -//********************************************************************************* - -void IOService::all_done ( void ) -{ - unsigned long previous_state; - -#if ROOT_DOMAIN_RUN_STATES - getPMRootDomain()->handlePowerChangeDoneForService( - /* service */ this, - /* RD flags */ &fRootDomainState, - /* new pwr state */ fHeadNotePowerState, - /* change flags */ fHeadNoteFlags ); -#endif - - if ((fHeadNoteFlags & kIOPMSynchronize) && - ((fMachineState == kIOPM_Finished) || (fMachineState == kIOPM_SyncFinish))) - { - // Sync operation and no power change occurred. 
- // Do not inform driver and clients about this request completion, - // except for the originator (root domain). - - if (getPMRequestType() == kIOPMRequestTypeSynchronizePowerTree) - { - powerChangeDone(fCurrentPowerState); - } - - fMachineState = kIOPM_Finished; - return; - } - - fMachineState = kIOPM_Finished; - - // our power change - if ( fHeadNoteFlags & kIOPMWeInitiated ) - { - // could our driver switch to the new state? - if ( !( fHeadNoteFlags & kIOPMNotDone) ) - { - // we changed, tell our parent - requestDomainPower(fHeadNotePowerState); - - // yes, did power raise? - if ( fCurrentPowerState < fHeadNotePowerState ) - { - // yes, inform clients and apps - tellChangeUp (fHeadNotePowerState); - } - previous_state = fCurrentPowerState; - // either way - fCurrentPowerState = fHeadNotePowerState; -#if PM_VARS_SUPPORT - fPMVars->myCurrentState = fCurrentPowerState; -#endif - OUR_PMLog(kPMLogChangeDone, fCurrentPowerState, 0); - - // inform subclass policy-maker - if ((fLockedFlags.PMStop == false) && fParentsKnowState) - powerChangeDone(previous_state); - else - PM_DEBUG("%s::powerChangeDone() skipped\n", getName()); - } - } - - // parent's power change - if ( fHeadNoteFlags & kIOPMParentInitiated) - { - if (((fHeadNoteFlags & kIOPMDomainWillChange) && (fCurrentPowerState >= fHeadNotePowerState)) || - ((fHeadNoteFlags & kIOPMDomainDidChange) && (fCurrentPowerState < fHeadNotePowerState))) - { - // did power raise? 
- if ( fCurrentPowerState < fHeadNotePowerState ) - { - // yes, inform clients and apps - tellChangeUp (fHeadNotePowerState); - } - // either way - previous_state = fCurrentPowerState; - fCurrentPowerState = fHeadNotePowerState; -#if PM_VARS_SUPPORT - fPMVars->myCurrentState = fCurrentPowerState; -#endif - fMaxCapability = fControllingDriver->maxCapabilityForDomainState(fHeadNoteDomainFlags); - - OUR_PMLog(kPMLogChangeDone, fCurrentPowerState, 0); - - // inform subclass policy-maker - if ((fLockedFlags.PMStop == false) && fParentsKnowState) - powerChangeDone(previous_state); - else - PM_DEBUG("%s::powerChangeDone() skipped\n", getName()); - } - } - - if (fCurrentPowerState < fNumberOfPowerStates) - { - const IOPMPowerState * powerStatePtr = &fPowerStates[fCurrentPowerState]; - - fCurrentCapabilityFlags = powerStatePtr->capabilityFlags; - if (fCurrentCapabilityFlags & kIOPMStaticPowerValid) - fCurrentPowerConsumption = powerStatePtr->staticPower; - } - - // When power rises enough to satisfy the tickle's desire for more power, - // the condition preventing idle-timer from dropping power is removed. - - if (fCurrentPowerState >= fIdleTimerMinPowerState) - { - fIdleTimerMinPowerState = 0; - } -} - -//********************************************************************************* -// [public] settleTimerExpired -// -// Power has settled after our last change. Notify interested parties that -// there is a new power state. -//********************************************************************************* - -void IOService::settleTimerExpired ( void ) -{ - fSettleTimeUS = 0; -} - -//********************************************************************************* -// settle_timer_expired -// -// Holds a retain while the settle timer callout is in flight. +// Holds a retain while the settle timer callout is in flight. 
//********************************************************************************* static void @@ -4202,12 +4658,11 @@ settle_timer_expired( thread_call_param_t arg0, thread_call_param_t arg1 ) { IOService * me = (IOService *) arg0; - if (gIOPMWorkLoop && gIOPMReplyQueue) + if (gIOPMWorkLoop && gIOPMWorkQueue) { gIOPMWorkLoop->runAction( OSMemberFunctionCast(IOWorkLoop::Action, me, &IOService::settleTimerExpired), me); - gIOPMReplyQueue->signalWorkAvailable(); } me->release(); } @@ -4221,7 +4676,7 @@ settle_timer_expired( thread_call_param_t arg0, thread_call_param_t arg1 ) void IOService::startSettleTimer( void ) { AbsoluteTime deadline; - unsigned long i; + IOPMPowerStateIndex i; uint32_t settleTime = 0; boolean_t pending; @@ -4288,8 +4743,7 @@ bool IOService::ackTimerTick( void ) PM_ASSERT_IN_GATE(); switch (fMachineState) { case kIOPM_OurChangeWaitForPowerSettle: - case kIOPM_ParentDownWaitForPowerSettle: - case kIOPM_ParentUpWaitForSettleTime: + case kIOPM_ParentChangeWaitForPowerSettle: // are we waiting for controlling driver to acknowledge? 
if ( fDriverTimer > 0 ) { @@ -4304,6 +4758,20 @@ bool IOService::ackTimerTick( void ) PM_ERROR("%s::setPowerState(%p, %lu -> %lu) timed out after %d ms\n", fName, this, fCurrentPowerState, fHeadNotePowerState, NS_TO_MS(nsec)); +#if LOG_SETPOWER_TIMES + PMEventDetails *details = PMEventDetails::eventDetails( + kIOPMEventTypeSetPowerStateDelayed, // type + fName, // who + (uintptr_t)this, // owner unique + NULL, // interest name + (uint8_t)getPowerState(), // old + 0, // new + kIOReturnTimeout, // result + NS_TO_US(nsec)); // usec completion time + + getPMRootDomain()->recordAndReleasePMEventGated( details ); +#endif + if (gIOKitDebug & kIOLogDebugPower) { panic("%s::setPowerState(%p, %lu -> %lu) timed out after %d ms", @@ -4321,12 +4789,7 @@ bool IOService::ackTimerTick( void ) } break; - case kIOPM_OurChangeSetPowerState: - case kIOPM_OurChangeFinish: - case kIOPM_ParentDownSetPowerState: - case kIOPM_ParentAcknowledgePowerChange: - case kIOPM_ParentUpSetPowerState: - case kIOPM_NotifyChildrenDone: + case kIOPM_NotifyChildrenStart: // are we waiting for interested parties to acknowledge? if ( fHeadNotePendingAcks != 0 ) { @@ -4350,6 +4813,24 @@ bool IOService::ackTimerTick( void ) nextObject->whatObject, fName, fCurrentPowerState, fHeadNotePowerState, NS_TO_MS(nsec)); +#if LOG_SETPOWER_TIMES + uint16_t logType = (fDriverCallReason == kDriverCallInformPreChange) + ? kIOPMEventTypePSWillChangeTo + : kIOPMEventTypePSDidChangeTo; + + PMEventDetails *details = PMEventDetails::eventDetails( + logType, // type + fName, // who + (uintptr_t)this, // owner unique + nextObject->whatObject->getName(), // interest name + (uint8_t)fCurrentPowerState, // old + (uint8_t)fHeadNotePowerState, // new + kIOReturnTimeout, // result + NS_TO_US(nsec)); // usec completion time + + getPMRootDomain()->recordAndReleasePMEventGated( details ); +#endif + // Pretend driver has acked. 
fHeadNotePendingAcks--; } @@ -4369,11 +4850,16 @@ bool IOService::ackTimerTick( void ) } break; - case kIOPM_ParentDownTellPriorityClientsPowerDown: - case kIOPM_ParentDownNotifyInterestedDriversWillChange: + // TODO: aggregate this case kIOPM_OurChangeTellClientsPowerDown: case kIOPM_OurChangeTellPriorityClientsPowerDown: case kIOPM_OurChangeNotifyInterestedDriversWillChange: + case kIOPM_ParentChangeTellPriorityClientsPowerDown: + case kIOPM_ParentChangeNotifyInterestedDriversWillChange: + case kIOPM_SyncTellClientsPowerDown: + case kIOPM_SyncTellPriorityClientsPowerDown: + case kIOPM_SyncNotifyWillChange: + case kIOPM_TellCapabilityChangeDone: // apps didn't respond in time cleanClientResponses(true); OUR_PMLog(kPMLogClientTardy, 0, 1); @@ -4382,7 +4868,7 @@ bool IOService::ackTimerTick( void ) break; default: - PM_TRACE("%s: unexpected ack timer tick (state = %d)\n", + PM_LOG1("%s: unexpected ack timer tick (state = %d)\n", getName(), fMachineState); break; } @@ -4441,8 +4927,8 @@ IOService::actionAckTimerExpired ( // otherwise no need to signal the work loop.
done = me->ackTimerTick(); - if (done && gIOPMReplyQueue) - gIOPMReplyQueue->signalWorkAvailable(); + if (done && gIOPMWorkQueue) + gIOPMWorkQueue->signalWorkAvailable(); return kIOReturnSuccess; } @@ -4465,104 +4951,31 @@ IOService::ack_timer_expired ( thread_call_param_t arg0, thread_call_param_t arg me->release(); } -//********************************************************************************* -// [private] notifyControllingDriver -//********************************************************************************* - -bool IOService::notifyControllingDriver ( void ) -{ - DriverCallParam * param; - unsigned long powerState; - - PM_ASSERT_IN_GATE(); - assert( fDriverCallParamCount == 0 ); - assert( fControllingDriver ); - - powerState = fHeadNotePowerState; - - param = (DriverCallParam *) fDriverCallParamPtr; - if (!param) - { - param = IONew(DriverCallParam, 1); - if (!param) - return false; // no memory - - fDriverCallParamPtr = (void *) param; - fDriverCallParamSlots = 1; - } - - param->Target = fControllingDriver; - fDriverCallParamCount = 1; - - fDriverTimer = -1; - - // Machine state for this object will stall waiting for a reply - // from the callout thread. 
- - PM_LOCK(); - assert( fLockedFlags.DriverCallBusy == false ); - fLockedFlags.DriverCallBusy = true; - PM_UNLOCK(); - thread_call_enter( fDriverCallEntry ); - return true; -} +// MARK: - +// MARK: Client Messaging //********************************************************************************* -// [private] notifyControllingDriverDone +// [private] tellSystemCapabilityChange //********************************************************************************* -void IOService::notifyControllingDriverDone( void ) +void IOService::tellSystemCapabilityChange( uint32_t nextMS ) { - DriverCallParam * param; - IOReturn result; - - PM_ASSERT_IN_GATE(); - param = (DriverCallParam *) fDriverCallParamPtr; - - assert( fLockedFlags.DriverCallBusy == false ); - assert( fMachineState == kIOPM_DriverThreadCallDone ); + MS_PUSH( nextMS ); + fMachineState = kIOPM_TellCapabilityChangeDone; + fOutOfBandMessage = kIOMessageSystemCapabilityChange; - if (param) - { - assert(fDriverCallParamCount == 1); - - // the return value from setPowerState() - result = param->Result; - - if ((result == IOPMAckImplied) || (result < 0)) - { - // child return IOPMAckImplied - fDriverTimer = 0; - } - else if (fDriverTimer) - { - assert(fDriverTimer == -1); - - // Driver has not acked, and has returned a positive result. - // Enforce a minimum permissible timeout value. - // Make the min value large enough so timeout is less likely - // to occur if a driver misinterpreted that the return value - // should be in microsecond units. And make it large enough - // to be noticeable if a driver neglects to ack. - - if (result < kMinAckTimeoutTicks) - result = kMinAckTimeoutTicks; - - fDriverTimer = (result / (ACK_TIMER_PERIOD / ns_per_us)) + 1; - } - // else, child has already acked and driver_timer reset to 0. - - fDriverCallParamCount = 0; - - if ( fDriverTimer ) - { - OUR_PMLog(kPMLogStartAckTimer, 0, 0); - start_ack_timer(); - } - } + if (fIsPreChange) + { + // Notify app first on pre-change. 
+ fOutOfBandParameter = kNotifyCapabilityChangeApps; + } + else + { + // Notify kernel clients first on post-change. + fOutOfBandParameter = kNotifyCapabilityChangePriority; + } - // Hop back to original machine state path. - fMachineState = fNextMachineState; + tellClientsWithResponse( fOutOfBandMessage ); } //********************************************************************************* @@ -4648,11 +5061,11 @@ static void logAppTimeouts ( OSObject * object, void * arg ) clientIndex = context->notifyClients->getNextIndexOfObject(object, 0); if ((clientIndex != (unsigned int) -1) && - (flag = context->responseFlags->getObject(clientIndex)) && + (flag = context->responseArray->getObject(clientIndex)) && (flag != kOSBooleanTrue)) { OSString * clientID = 0; - context->us->messageClient(context->msgType, object, &clientID); + context->us->messageClient(context->messageType, object, &clientID); PM_ERROR(context->errorLog, clientID ? clientID->getCStringNoCopy() : ""); // TODO: record message type if possible @@ -4669,32 +5082,47 @@ static void logAppTimeouts ( OSObject * object, void * arg ) void IOService::cleanClientResponses ( bool logErrors ) { - IOPMInterestContext context; - - if (logErrors && fResponseArray && fNotifyClientArray) { - context.responseFlags = fResponseArray; - context.notifyClients = fNotifyClientArray; - context.serialNumber = fSerialNumber; - context.counter = 0; - context.msgType = kIOMessageCopyClientID; - context.us = this; - context.maxTimeRequested = 0; - context.stateNumber = fHeadNotePowerState; - context.stateFlags = fHeadNotePowerArrayEntry->capabilityFlags; - context.errorLog = "PM notification timeout (%s)\n"; - + if (logErrors && fResponseArray) + { switch ( fOutOfBandParameter ) { case kNotifyApps: - applyToInterested(gIOAppPowerStateInterest, logAppTimeouts, (void *) &context); - case kNotifyPriority: + case kNotifyCapabilityChangeApps: + if (fNotifyClientArray) + { + IOPMInterestContext context; + + context.responseArray = 
fResponseArray; + context.notifyClients = fNotifyClientArray; + context.serialNumber = fSerialNumber; + context.messageType = kIOMessageCopyClientID; + context.notifyType = kNotifyApps; + context.isPreChange = fIsPreChange; + context.enableTracing = false; + context.us = this; + context.maxTimeRequested = 0; + context.stateNumber = fHeadNotePowerState; + context.stateFlags = fHeadNotePowerArrayEntry->capabilityFlags; + context.changeFlags = fHeadNoteChangeFlags; + context.errorLog = "PM notification timeout (%s)\n"; + + applyToInterested(gIOAppPowerStateInterest, logAppTimeouts, (void *) &context); + } + break; + default: + // kNotifyPriority, kNotifyCapabilityChangePriority + // TODO: identify the priority client that has not acked + PM_ERROR("PM priority notification timeout\n"); + if (gIOKitDebug & kIOLogDebugPower) + { + panic("PM priority notification timeout"); + } break; } } if (fResponseArray) { - // get rid of this stuff fResponseArray->release(); fResponseArray = NULL; } @@ -4703,8 +5131,6 @@ void IOService::cleanClientResponses ( bool logErrors ) fNotifyClientArray->release(); fNotifyClientArray = NULL; } - - return; } //********************************************************************************* @@ -4716,53 +5142,95 @@ void IOService::cleanClientResponses ( bool logErrors ) // Return true if we don't have to wait for acknowledgements //********************************************************************************* -bool IOService::tellClientsWithResponse ( - int messageType ) -{ - return tellClientsWithResponse( messageType, 0 ); -} - -bool IOService::tellClientsWithResponse ( - int messageType, - IOPMMessageFilter filter ) +bool IOService::tellClientsWithResponse ( int messageType ) { IOPMInterestContext context; + bool isRootDomain = IS_ROOT_DOMAIN; PM_ASSERT_IN_GATE(); assert( fResponseArray == NULL ); assert( fNotifyClientArray == NULL ); + RD_LOG("tellClientsWithResponse( %s, %d )\n", + getIOMessageString(messageType), fOutOfBandParameter); 
+ fResponseArray = OSArray::withCapacity( 1 ); if (!fResponseArray) goto exit; fResponseArray->setCapacityIncrement(8); - fSerialNumber += 1; + if (++fSerialNumber == 0) + fSerialNumber++; - context.responseFlags = fResponseArray; + context.responseArray = fResponseArray; context.notifyClients = 0; context.serialNumber = fSerialNumber; - context.counter = 0; - context.msgType = messageType; + context.messageType = messageType; + context.notifyType = fOutOfBandParameter; + context.isPreChange = fIsPreChange; + context.enableTracing = false; context.us = this; context.maxTimeRequested = 0; context.stateNumber = fHeadNotePowerState; context.stateFlags = fHeadNotePowerArrayEntry->capabilityFlags; - context.filterFunc = filter; + context.changeFlags = fHeadNoteChangeFlags; + context.messageFilter = (isRootDomain) ? + OSMemberFunctionCast( + IOPMMessageFilter, + this, + &IOPMrootDomain::systemMessageFilter) : 0; switch ( fOutOfBandParameter ) { case kNotifyApps: applyToInterested( gIOAppPowerStateInterest, pmTellAppWithResponse, (void *) &context ); - fNotifyClientArray = context.notifyClients; + + if (isRootDomain && + (fMachineState != kIOPM_OurChangeTellClientsPowerDown) && + (fMachineState != kIOPM_SyncTellClientsPowerDown)) + { + // Notify capability app for tellChangeDown1() + // but not for askChangeDown(). 
+ context.notifyType = kNotifyCapabilityChangeApps; + context.messageType = kIOMessageSystemCapabilityChange; + applyToInterested( gIOAppPowerStateInterest, + pmTellCapabilityAppWithResponse, (void *) &context ); + context.notifyType = fOutOfBandParameter; + context.messageType = messageType; + } + context.maxTimeRequested = k30seconds; applyToInterested( gIOGeneralInterest, pmTellClientWithResponse, (void *) &context ); + + fNotifyClientArray = context.notifyClients; break; case kNotifyPriority: + context.enableTracing = isRootDomain; applyToInterested( gIOPriorityPowerStateInterest, pmTellClientWithResponse, (void *) &context ); + + if (isRootDomain) + { + // Notify capability clients for tellChangeDown2(). + context.notifyType = kNotifyCapabilityChangePriority; + context.messageType = kIOMessageSystemCapabilityChange; + applyToInterested( gIOPriorityPowerStateInterest, + pmTellCapabilityClientWithResponse, (void *) &context ); + } + break; + + case kNotifyCapabilityChangeApps: + applyToInterested( gIOAppPowerStateInterest, + pmTellCapabilityAppWithResponse, (void *) &context ); + fNotifyClientArray = context.notifyClients; + context.maxTimeRequested = k30seconds; + break; + + case kNotifyCapabilityChangePriority: + applyToInterested( gIOPriorityPowerStateInterest, + pmTellCapabilityClientWithResponse, (void *) &context ); break; } @@ -4770,6 +5238,8 @@ bool IOService::tellClientsWithResponse ( if ( !checkForDone() ) { OUR_PMLog(kPMLogStartAckTimer, context.maxTimeRequested, 0); + if (context.enableTracing) + getPMRootDomain()->traceDetail( context.maxTimeRequested / 1000 ); start_ack_timer( context.maxTimeRequested / 1000, kMillisecondScale ); return false; } @@ -4799,143 +5269,361 @@ bool IOService::tellClientsWithResponse ( void IOService::pmTellAppWithResponse ( OSObject * object, void * arg ) { - IOPMInterestContext * context = (IOPMInterestContext *) arg; + IOPMInterestContext * context = (IOPMInterestContext *) arg; IOServicePM * pwrMgt = 
context->us->pwrMgt; + uint32_t msgIndex, msgRef, msgType; +#if LOG_APP_RESPONSE_TIMES AbsoluteTime now; - UInt32 refcon; +#endif if (!OSDynamicCast(_IOServiceInterestNotifier, object)) + return; + + if (context->messageFilter && + !context->messageFilter(context->us, object, context, 0, 0)) { - // object must be an _IOServiceInterestNotifier. + if (kIOLogDebugPower & gIOKitDebug) + { + // Log client pid/name and client array index. + OSString * clientID = 0; + context->us->messageClient(kIOMessageCopyClientID, object, &clientID); + PM_LOG("%s DROP App %s, %s\n", + context->us->getName(), + getIOMessageString(context->messageType), + clientID ? clientID->getCStringNoCopy() : ""); + if (clientID) clientID->release(); + } return; } - // Lazily create app clients array. + // Create client array (for tracking purposes) only if the service + // has app clients. Usually only root domain does. if (0 == context->notifyClients) - { context->notifyClients = OSArray::withCapacity( 32 ); + + msgType = context->messageType; + msgIndex = context->responseArray->getCount(); + msgRef = ((context->serialNumber & 0xFFFF) << 16) + (msgIndex & 0xFFFF); + + OUR_PMLog(kPMLogAppNotify, msgType, msgRef); + if (kIOLogDebugPower & gIOKitDebug) + { + // Log client pid/name and client array index. + OSString * clientID = 0; + context->us->messageClient(kIOMessageCopyClientID, object, &clientID); + PM_LOG("%s MESG App(%u) %s, %s\n", + context->us->getName(), + msgIndex, getIOMessageString(msgType), + clientID ? clientID->getCStringNoCopy() : ""); + if (clientID) clientID->release(); } - if (context->filterFunc && !context->filterFunc(object, arg)) +#if LOG_APP_RESPONSE_TIMES + OSNumber * num; + clock_get_uptime(&now); + num = OSNumber::withNumber(AbsoluteTime_to_scalar(&now), sizeof(uint64_t) * 8); + if (num) { - // ack - needed to match the counter index at logAppTimeouts(). 
- context->responseFlags->setObject(context->counter, kOSBooleanTrue); - if (context->notifyClients) - context->notifyClients->setObject(context->counter, kOSBooleanTrue); + context->responseArray->setObject(msgIndex, num); + num->release(); } else +#endif + context->responseArray->setObject(msgIndex, kOSBooleanFalse); + + if (context->notifyClients) + context->notifyClients->setObject(msgIndex, object); + + context->us->messageClient(msgType, object, (void *) msgRef); +} + +//********************************************************************************* +// [static private] pmTellClientWithResponse +// +// We send a message to an in-kernel client, and we expect a response, +// so we compute a cookie we can identify the response with. +//********************************************************************************* + +void IOService::pmTellClientWithResponse ( OSObject * object, void * arg ) +{ + IOPowerStateChangeNotification notify; + IOPMInterestContext * context = (IOPMInterestContext *) arg; + OSObject * replied = kOSBooleanTrue; + _IOServiceInterestNotifier * notifier; + uint32_t msgIndex, msgRef, msgType; + IOReturn retCode; + + if (context->messageFilter && + !context->messageFilter(context->us, object, context, 0, 0)) { - refcon = ((context->serialNumber & 0xFFFF)<<16) - + (context->counter & 0xFFFF); - OUR_PMLog(kPMLogAppNotify, context->msgType, refcon); + if ((kIOLogDebugPower & gIOKitDebug) && + (OSDynamicCast(_IOServiceInterestNotifier, object))) + { + _IOServiceInterestNotifier *n = (_IOServiceInterestNotifier *) object; + PM_LOG("%s DROP Client %s, notifier %p, handler %p\n", + context->us->getName(), + getIOMessageString(context->messageType), + object, n->handler); + } + return; + } - if (gIOKitDebug & kIOLogDebugPower) + notifier = OSDynamicCast(_IOServiceInterestNotifier, object); + msgType = context->messageType; + msgIndex = context->responseArray->getCount(); + msgRef = ((context->serialNumber & 0xFFFF) << 16) + (msgIndex & 0xFFFF); + + 
IOServicePM * pwrMgt = context->us->pwrMgt; + if (gIOKitDebug & kIOLogPower) { + OUR_PMLog(kPMLogClientNotify, msgRef, msgType); + if (OSDynamicCast(IOService, object)) { + const char *who = ((IOService *) object)->getName(); + gPlatform->PMLog(who, kPMLogClientNotify, (uintptr_t) object, 0); + } + else if (notifier) { + OUR_PMLog(kPMLogClientNotify, (uintptr_t) notifier->handler, 0); + } + } + if ((kIOLogDebugPower & gIOKitDebug) && notifier) + { + PM_LOG("%s MESG Client %s, notifier %p, handler %p\n", + context->us->getName(), + getIOMessageString(msgType), + object, notifier->handler); + } + + notify.powerRef = (void *)(uintptr_t) msgRef; + notify.returnValue = 0; + notify.stateNumber = context->stateNumber; + notify.stateFlags = context->stateFlags; + + if (context->enableTracing && (notifier != 0)) + { + uint32_t detail = ((msgIndex & 0xff) << 24) | + ((msgType & 0xfff) << 12) | + (((uintptr_t) notifier->handler) & 0xfff); + getPMRootDomain()->traceDetail( detail ); + } + + retCode = context->us->messageClient(msgType, object, (void *) &notify); + if ( kIOReturnSuccess == retCode ) + { + if ( 0 == notify.returnValue ) { - // Log client pid/name and associated index. - OSString * clientID = 0; - context->us->messageClient(kIOMessageCopyClientID, object, &clientID); - PM_DEBUG("[Notify %u] message 0x%x to %s\n", - (uint32_t) context->counter, - context->msgType, - clientID ? clientID->getCStringNoCopy() : ""); - if (clientID) clientID->release(); + // client doesn't want time to respond + OUR_PMLog(kPMLogClientAcknowledge, msgRef, (uintptr_t) object); + } + else + { + replied = kOSBooleanFalse; + if ( notify.returnValue > context->maxTimeRequested ) + { + if (notify.returnValue > kPriorityClientMaxWait) + { + context->maxTimeRequested = kPriorityClientMaxWait; + PM_ERROR("%s: client %p returned %llu for %s\n", + context->us->getName(), + notifier ? 
(void *) notifier->handler : object, + (uint64_t) notify.returnValue, + getIOMessageString(msgType)); + } + else + context->maxTimeRequested = notify.returnValue; + } } + } + else + { + // not a client of ours + // so we won't be waiting for response + OUR_PMLog(kPMLogClientAcknowledge, msgRef, 0); + } + + context->responseArray->setObject(msgIndex, replied); +} + +//********************************************************************************* +// [static private] pmTellCapabilityAppWithResponse +//********************************************************************************* +void IOService::pmTellCapabilityAppWithResponse ( OSObject * object, void * arg ) +{ + IOPMSystemCapabilityChangeParameters msgArg; + IOPMInterestContext * context = (IOPMInterestContext *) arg; + OSObject * replied = kOSBooleanTrue; + IOServicePM * pwrMgt = context->us->pwrMgt; + uint32_t msgIndex, msgRef, msgType; +#if LOG_APP_RESPONSE_TIMES + AbsoluteTime now; +#endif + + if (!OSDynamicCast(_IOServiceInterestNotifier, object)) + return; + + memset(&msgArg, 0, sizeof(msgArg)); + if (context->messageFilter && + !context->messageFilter(context->us, object, context, &msgArg, &replied)) + { + return; + } + + // Create client array (for tracking purposes) only if the service + // has app clients. Usually only root domain does. + if (0 == context->notifyClients) + context->notifyClients = OSArray::withCapacity( 32 ); + + msgType = context->messageType; + msgIndex = context->responseArray->getCount(); + msgRef = ((context->serialNumber & 0xFFFF) << 16) + (msgIndex & 0xFFFF); + + OUR_PMLog(kPMLogAppNotify, msgType, msgRef); + if (kIOLogDebugPower & gIOKitDebug) + { + // Log client pid/name and client array index. + OSString * clientID = 0; + context->us->messageClient(kIOMessageCopyClientID, object, &clientID); + PM_LOG("%s MESG App(%u) %s, wait %u, %s\n", + context->us->getName(), + msgIndex, getIOMessageString(msgType), + (replied != kOSBooleanTrue), + clientID ? 
clientID->getCStringNoCopy() : ""); + if (clientID) clientID->release(); + } + + msgArg.notifyRef = msgRef; + msgArg.maxWaitForReply = 0; + + if (replied == kOSBooleanTrue) + { + msgArg.notifyRef = 0; + context->responseArray->setObject(msgIndex, kOSBooleanTrue); + if (context->notifyClients) + context->notifyClients->setObject(msgIndex, kOSBooleanTrue); + } + else + { #if LOG_APP_RESPONSE_TIMES OSNumber * num; clock_get_uptime(&now); num = OSNumber::withNumber(AbsoluteTime_to_scalar(&now), sizeof(uint64_t) * 8); if (num) { - context->responseFlags->setObject(context->counter, num); + context->responseArray->setObject(msgIndex, num); num->release(); } else #endif - context->responseFlags->setObject(context->counter, kOSBooleanFalse); - - if (context->notifyClients) - context->notifyClients->setObject(context->counter, object); + context->responseArray->setObject(msgIndex, kOSBooleanFalse); - context->us->messageClient(context->msgType, object, (void *)refcon); - if ( context->maxTimeRequested < k30seconds ) - { - context->maxTimeRequested = k30seconds; - } + if (context->notifyClients) + context->notifyClients->setObject(msgIndex, object); } - context->counter++; + context->us->messageClient(msgType, object, (void *) &msgArg, sizeof(msgArg)); } //********************************************************************************* -// [static private] pmTellClientWithResponse -// -// We send a message to an in-kernel client, and we expect a response, so we compute a -// cookie we can identify the response with. -// If it doesn't understand the notification (it is not power-management savvy) -// we won't wait for it to prepare for sleep. If it tells us via a return code -// in the passed struct that it is currently ready, we won't wait for it to prepare. -// If it tells us via the return code in the struct that it does need time, we will chill. 
+// [static private] pmTellCapabilityClientWithResponse //********************************************************************************* -void IOService::pmTellClientWithResponse ( OSObject * object, void * arg ) +void IOService::pmTellCapabilityClientWithResponse( + OSObject * object, void * arg ) { + IOPMSystemCapabilityChangeParameters msgArg; IOPMInterestContext * context = (IOPMInterestContext *) arg; - IOPowerStateChangeNotification notify; - UInt32 refcon; + OSObject * replied = kOSBooleanTrue; + _IOServiceInterestNotifier * notifier; + uint32_t msgIndex, msgRef, msgType; IOReturn retCode; - OSObject * theFlag; - if (context->filterFunc && !context->filterFunc(object, arg)) + memset(&msgArg, 0, sizeof(msgArg)); + if (context->messageFilter && + !context->messageFilter(context->us, object, context, &msgArg, 0)) + { + if ((kIOLogDebugPower & gIOKitDebug) && + (OSDynamicCast(_IOServiceInterestNotifier, object))) + { + _IOServiceInterestNotifier *n = (_IOServiceInterestNotifier *) object; + PM_LOG("%s DROP Client %s, notifier %p, handler %p\n", + context->us->getName(), + getIOMessageString(context->messageType), + object, n->handler); + } return; + } - refcon = ((context->serialNumber & 0xFFFF)<<16) + (context->counter & 0xFFFF); - context->responseFlags->setObject(context->counter, kOSBooleanFalse); + notifier = OSDynamicCast(_IOServiceInterestNotifier, object); + msgType = context->messageType; + msgIndex = context->responseArray->getCount(); + msgRef = ((context->serialNumber & 0xFFFF) << 16) + (msgIndex & 0xFFFF); IOServicePM * pwrMgt = context->us->pwrMgt; if (gIOKitDebug & kIOLogPower) { - OUR_PMLog(kPMLogClientNotify, refcon, (UInt32) context->msgType); + OUR_PMLog(kPMLogClientNotify, msgRef, msgType); if (OSDynamicCast(IOService, object)) { const char *who = ((IOService *) object)->getName(); - gPlatform->PMLog(who, - kPMLogClientNotify, * (UInt32 *) object, (UInt64) object); - } else if (OSDynamicCast(_IOServiceInterestNotifier, object)) { - 
_IOServiceInterestNotifier *n = (_IOServiceInterestNotifier *) object; - OUR_PMLog(kPMLogClientNotify, (UInt64) n->handler, 0); + gPlatform->PMLog(who, kPMLogClientNotify, (uintptr_t) object, 0); + } + else if (notifier) { + OUR_PMLog(kPMLogClientNotify, (uintptr_t) notifier->handler, 0); } } + if ((kIOLogDebugPower & gIOKitDebug) && notifier) + { + PM_LOG("%s MESG Client %s, notifier %p, handler %p\n", + context->us->getName(), + getIOMessageString(msgType), + object, notifier->handler); + } - notify.powerRef = (void *)refcon; - notify.returnValue = 0; - notify.stateNumber = context->stateNumber; - notify.stateFlags = context->stateFlags; - retCode = context->us->messageClient(context->msgType,object,(void *)&notify); - if ( retCode == kIOReturnSuccess ) + msgArg.notifyRef = msgRef; + msgArg.maxWaitForReply = 0; + + if (context->enableTracing && (notifier != 0)) + { + uint32_t detail = ((msgIndex & 0xff) << 24) | + ((msgType & 0xfff) << 12) | + (((uintptr_t) notifier->handler) & 0xfff); + getPMRootDomain()->traceDetail( detail ); + } + + retCode = context->us->messageClient( + msgType, object, (void *) &msgArg, sizeof(msgArg)); + + if ( kIOReturnSuccess == retCode ) { - if ( notify.returnValue == 0 ) + if ( 0 == msgArg.maxWaitForReply ) { // client doesn't want time to respond - context->responseFlags->replaceObject(context->counter, kOSBooleanTrue); - OUR_PMLog(kPMLogClientAcknowledge, refcon, (UInt64) object); - } else { - // it does want time, and it hasn't responded yet - theFlag = context->responseFlags->getObject(context->counter); - if ( kOSBooleanTrue != theFlag ) + OUR_PMLog(kPMLogClientAcknowledge, msgRef, (uintptr_t) object); + } + else + { + replied = kOSBooleanFalse; + if ( msgArg.maxWaitForReply > context->maxTimeRequested ) { - // so note its time requirement - if ( context->maxTimeRequested < notify.returnValue ) + if (msgArg.maxWaitForReply > kCapabilityClientMaxWait) { - context->maxTimeRequested = notify.returnValue; + context->maxTimeRequested = 
kCapabilityClientMaxWait; + PM_ERROR("%s: client %p returned %u for %s\n", + context->us->getName(), + notifier ? (void *) notifier->handler : object, + msgArg.maxWaitForReply, + getIOMessageString(msgType)); } + else + context->maxTimeRequested = msgArg.maxWaitForReply; } } - } else { - OUR_PMLog(kPMLogClientAcknowledge, refcon, 0); + } + else + { // not a client of ours // so we won't be waiting for response - context->responseFlags->replaceObject(context->counter, kOSBooleanTrue); + OUR_PMLog(kPMLogClientAcknowledge, msgRef, 0); } - context->counter++; + + context->responseArray->setObject(msgIndex, replied); } //********************************************************************************* @@ -4974,23 +5662,29 @@ void IOService::tellChangeUp ( unsigned long ) //********************************************************************************* void IOService::tellClients ( int messageType ) -{ - tellClients( messageType, 0 ); -} - -void IOService::tellClients ( int messageType, IOPMMessageFilter filter ) { IOPMInterestContext context; - context.msgType = messageType; - context.us = this; - context.stateNumber = fHeadNotePowerState; - context.stateFlags = fHeadNotePowerArrayEntry->capabilityFlags; - context.filterFunc = filter; - + RD_LOG("tellClients( %s )\n", getIOMessageString(messageType)); + + memset(&context, 0, sizeof(context)); + context.messageType = messageType; + context.isPreChange = fIsPreChange; + context.us = this; + context.stateNumber = fHeadNotePowerState; + context.stateFlags = fHeadNotePowerArrayEntry->capabilityFlags; + context.changeFlags = fHeadNoteChangeFlags; + context.messageFilter = (IS_ROOT_DOMAIN) ? 
+ OSMemberFunctionCast( + IOPMMessageFilter, + this, + &IOPMrootDomain::systemMessageFilter) : 0; + + context.notifyType = kNotifyPriority; applyToInterested( gIOPriorityPowerStateInterest, tellKernelClientApplier, (void *) &context ); - + + context.notifyType = kNotifyApps; applyToInterested( gIOAppPowerStateInterest, tellAppClientApplier, (void *) &context ); @@ -5006,18 +5700,40 @@ void IOService::tellClients ( int messageType, IOPMMessageFilter filter ) static void tellKernelClientApplier ( OSObject * object, void * arg ) { - IOPMInterestContext * context = (IOPMInterestContext *) arg; IOPowerStateChangeNotification notify; + IOPMInterestContext * context = (IOPMInterestContext *) arg; - if (context->filterFunc && !context->filterFunc(object, arg)) + if (context->messageFilter && + !context->messageFilter(context->us, object, context, 0, 0)) + { + if ((kIOLogDebugPower & gIOKitDebug) && + (OSDynamicCast(_IOServiceInterestNotifier, object))) + { + _IOServiceInterestNotifier *n = (_IOServiceInterestNotifier *) object; + PM_LOG("%s DROP Client %s, notifier %p, handler %p\n", + context->us->getName(), + IOService::getIOMessageString(context->messageType), + object, n->handler); + } return; + } notify.powerRef = (void *) 0; notify.returnValue = 0; notify.stateNumber = context->stateNumber; notify.stateFlags = context->stateFlags; - context->us->messageClient(context->msgType, object, &notify); + context->us->messageClient(context->messageType, object, &notify); + + if ((kIOLogDebugPower & gIOKitDebug) && + (OSDynamicCast(_IOServiceInterestNotifier, object))) + { + _IOServiceInterestNotifier *n = (_IOServiceInterestNotifier *) object; + PM_LOG("%s MESG Client %s, notifier %p, handler %p\n", + context->us->getName(), + IOService::getIOMessageString(context->messageType), + object, n->handler); + } } //********************************************************************************* @@ -5028,12 +5744,38 @@ static void tellKernelClientApplier ( OSObject * object, void * arg ) 
static void tellAppClientApplier ( OSObject * object, void * arg ) { - IOPMInterestContext * context = (IOPMInterestContext *) arg; + IOPMInterestContext * context = (IOPMInterestContext *) arg; - if (context->filterFunc && !context->filterFunc(object, arg)) + if (context->messageFilter && + !context->messageFilter(context->us, object, context, 0, 0)) + { + if (kIOLogDebugPower & gIOKitDebug) + { + // Log client pid/name and client array index. + OSString * clientID = 0; + context->us->messageClient(kIOMessageCopyClientID, object, &clientID); + PM_LOG("%s DROP App %s, %s\n", + context->us->getName(), + IOService::getIOMessageString(context->messageType), + clientID ? clientID->getCStringNoCopy() : ""); + if (clientID) clientID->release(); + } return; + } + + if (kIOLogDebugPower & gIOKitDebug) + { + // Log client pid/name and client array index. + OSString * clientID = 0; + context->us->messageClient(kIOMessageCopyClientID, object, &clientID); + PM_LOG("%s MESG App %s, %s\n", + context->us->getName(), + IOService::getIOMessageString(context->messageType), + clientID ? 
clientID->getCStringNoCopy() : ""); + if (clientID) clientID->release(); + } - context->us->messageClient(context->msgType, object, 0); + context->us->messageClient(context->messageType, object, 0); } //********************************************************************************* @@ -5069,12 +5811,11 @@ bool IOService::checkForDone ( void ) // [public] responseValid //********************************************************************************* -bool IOService::responseValid ( unsigned long x, int pid ) +bool IOService::responseValid ( uint32_t refcon, int pid ) { UInt16 serialComponent; UInt16 ordinalComponent; OSObject * theFlag; - unsigned long refcon = (unsigned long) x; serialComponent = (refcon >> 16) & 0xFFFF; ordinalComponent = (refcon & 0xFFFF); @@ -5102,18 +5843,37 @@ bool IOService::responseValid ( unsigned long x, int pid ) #if LOG_APP_RESPONSE_TIMES AbsoluteTime now; AbsoluteTime start; - uint64_t nsec; + uint64_t nsec; + OSString *name = IOCopyLogNameForPID(pid); clock_get_uptime(&now); AbsoluteTime_to_scalar(&start) = num->unsigned64BitValue(); SUB_ABSOLUTETIME(&now, &start); absolutetime_to_nanoseconds(now, &nsec); + + PMEventDetails *details = PMEventDetails::eventDetails( + kIOPMEventTypeAppResponse, // type + name ? name->getCStringNoCopy() : "", // who + (uintptr_t)pid, // owner unique + NULL, // interest name + 0, // old + 0, // new + 0, // result + NS_TO_US(nsec)); // usec completion time + + getPMRootDomain()->recordAndReleasePMEventGated( details ); + + if (kIOLogDebugPower & gIOKitDebug) + { + PM_LOG("Ack(%u) %u ms\n", + (uint32_t) ordinalComponent, + NS_TO_MS(nsec)); + } // > 100 ms if (nsec > LOG_APP_RESPONSE_TIMES) { - OSString * name = IOCopyLogNameForPID(pid); - PM_DEBUG("PM response took %d ms (%s)\n", NS_TO_MS(nsec), + PM_LOG("PM response took %d ms (%s)\n", NS_TO_MS(nsec), name ? 
name->getCStringNoCopy() : ""); if (nsec > LOG_APP_RESPONSE_MSG_TRACER) @@ -5123,22 +5883,17 @@ bool IOService::responseValid ( unsigned long x, int pid ) gIOPMStatsApplicationResponseSlow, name ? name->getCStringNoCopy() : "", 0, NS_TO_MS(nsec), pid); - } - - if (name) - name->release(); + } } + + if (name) + name->release(); #endif theFlag = kOSBooleanFalse; } if ( kOSBooleanFalse == theFlag ) { - if ((gIOKitDebug & kIOLogDebugPower) && - (fOutOfBandParameter == kNotifyApps)) - { - PM_DEBUG("[Notify %u] acked\n", (uint32_t) ordinalComponent); - } fResponseArray->replaceObject(ordinalComponent, kOSBooleanTrue); } @@ -5151,9 +5906,6 @@ bool IOService::responseValid ( unsigned long x, int pid ) // Our power state is about to lower, and we have notified applications // and kernel clients, and one of them has acknowledged. If this is the last to do // so, and all acknowledgements are positive, we continue with the power change. -// -// We serialize this processing with timer expiration with a command gate on the -// power management workloop, which the timer expiration is command gated to as well. //********************************************************************************* IOReturn IOService::allowPowerChange ( unsigned long refcon ) @@ -5172,7 +5924,7 @@ IOReturn IOService::allowPowerChange ( unsigned long refcon ) request->fArg0 = (void *) refcon; request->fArg1 = (void *) proc_selfpid(); - request->fArg2 = (void *) 0; + request->fArg2 = (void *) 0; submitPMRequest( request ); return kIOReturnSuccess; @@ -5192,9 +5944,6 @@ IOReturn IOService::serializedAllowPowerChange2 ( unsigned long refcon ) // Our power state is about to lower, and we have notified applications // and kernel clients, and one of them has vetoed the change. If this is the last // client to respond, we abandon the power change. 
-// -// We serialize this processing with timer expiration with a command gate on the -// power management workloop, which the timer expiration is command gated to as well. //********************************************************************************* IOReturn IOService::cancelPowerChange ( unsigned long refcon ) @@ -5255,6 +6004,9 @@ void IOService::clampPowerOn ( unsigned long duration ) } #endif /* !__LP64__ */ +// MARK: - +// MARK: Driver Overrides + //********************************************************************************* // [public] setPowerState // @@ -5285,8 +6037,8 @@ unsigned long IOService::maxCapabilityForDomainState ( IOPMPowerFlags domainStat } for ( i = fNumberOfPowerStates - 1; i >= 0; i-- ) { - if ( (domainState & fPowerStates[i].inputPowerRequirement) == - fPowerStates[i].inputPowerRequirement ) + if ( (domainState & fPowerStates[i].inputPowerFlags) == + fPowerStates[i].inputPowerFlags ) { return i; } @@ -5312,8 +6064,8 @@ unsigned long IOService::initialPowerStateForDomainState ( IOPMPowerFlags domain } for ( i = fNumberOfPowerStates - 1; i >= 0; i-- ) { - if ( (domainState & fPowerStates[i].inputPowerRequirement) == - fPowerStates[i].inputPowerRequirement ) + if ( (domainState & fPowerStates[i].inputPowerFlags) == + fPowerStates[i].inputPowerFlags ) { return i; } @@ -5339,8 +6091,8 @@ unsigned long IOService::powerStateForDomainState ( IOPMPowerFlags domainState ) } for ( i = fNumberOfPowerStates - 1; i >= 0; i-- ) { - if ( (domainState & fPowerStates[i].inputPowerRequirement) == - fPowerStates[i].inputPowerRequirement ) + if ( (domainState & fPowerStates[i].inputPowerFlags) == + fPowerStates[i].inputPowerFlags ) { return i; } @@ -5420,6 +6172,9 @@ void IOService::systemWillShutdown( IOOptionBits specifier ) rootDomain->acknowledgeSystemWillShutdown( this ); } +// MARK: - +// MARK: PM State Machine + //********************************************************************************* // [private static] acquirePMRequest 
//********************************************************************************* @@ -5473,7 +6228,7 @@ void IOService::submitPMRequest( IOPMRequest * request ) assert( gIOPMReplyQueue ); assert( gIOPMRequestQueue ); - PM_TRACE("[+ %02lx] %p [%p %s] %p %p %p\n", + PM_LOG1("[+ %02lx] %p [%p %s] %p %p %p\n", (long)request->getType(), request, request->getTarget(), request->getTarget()->getName(), request->fArg0, request->fArg1, request->fArg2); @@ -5493,7 +6248,7 @@ void IOService::submitPMRequest( IOPMRequest ** requests, IOItemCount count ) for (IOItemCount i = 0; i < count; i++) { IOPMRequest * req = requests[i]; - PM_TRACE("[+ %02lx] %p [%p %s] %p %p %p\n", + PM_LOG1("[+ %02lx] %p [%p %s] %p %p %p\n", (long)req->getType(), req, req->getTarget(), req->getTarget()->getName(), req->fArg0, req->fArg1, req->fArg2); @@ -5504,66 +6259,39 @@ void IOService::submitPMRequest( IOPMRequest ** requests, IOItemCount count ) //********************************************************************************* // [private] servicePMRequestQueue +// +// Called from IOPMRequestQueue::checkForWork(). //********************************************************************************* bool IOService::servicePMRequestQueue( IOPMRequest * request, IOPMRequestQueue * queue ) { - // Calling PM methods without PMinit() is not allowed, fail the requests. - - if (!initialized) - { - PM_DEBUG("%s: PM not initialized\n", getName()); - goto done; - } - - // Create an IOPMWorkQueue on demand, when the initial PM request is - // received. + bool more; - if (!fPMWorkQueue) - { - // Allocate and attach an IOPMWorkQueue on demand to avoid taking - // the work loop lock in PMinit(), which may deadlock with certain - // drivers / families. 
- - fPMWorkQueue = IOPMWorkQueue::create( - /* target */ this, - /* Work */ OSMemberFunctionCast(IOPMWorkQueue::Action, this, - &IOService::servicePMRequest), - /* Done */ OSMemberFunctionCast(IOPMWorkQueue::Action, this, - &IOService::retirePMRequest) - ); - - if (fPMWorkQueue && - (gIOPMWorkLoop->addEventSource(fPMWorkQueue) != kIOReturnSuccess)) - { - PM_ERROR("%s: add PM work queue failed\n", getName()); - fPMWorkQueue->release(); - fPMWorkQueue = 0; - } + if (initialized) + { + // Work queue will immediately execute the queue'd request if possible. + // If execution blocks, the work queue will wait for a producer signal. + // Only need to signal more when completing attached requests. - if (!fPMWorkQueue) - { - PM_ERROR("%s: no PM work queue (type %02lx)\n", - getName(), (long)request->getType()); - goto done; - } - } + more = gIOPMWorkQueue->queuePMRequest(request, pwrMgt); + return more; + } - fPMWorkQueue->queuePMRequest(request); - return false; // do not signal more + // Calling PM without PMinit() is not allowed, fail the request. -done: + PM_LOG("%s: PM not initialized\n", getName()); fAdjustPowerScheduled = false; - gIOPMFreeQueue->queuePMRequest(request); - return false; // do not signal more + more = gIOPMFreeQueue->queuePMRequest(request); + if (more) gIOPMWorkQueue->incrementProducerCount(); + return more; } //********************************************************************************* // [private] servicePMFreeQueue // -// Called by the request completion to recycle a completed request. +// Called from IOPMCompletionQueue::checkForWork(). //********************************************************************************* bool IOService::servicePMFreeQueue( @@ -5575,26 +6303,8 @@ bool IOService::servicePMFreeQueue( if (root && (root != request)) more = true; - - if (fLockedFlags.PMStop && fPMWorkQueue && fPMWorkQueue->isEmpty()) - { - // Driver PMstop'ed and the work queue is empty. 
- // Detach and destroy the work queue to avoid the similar cleanup by - // PMfree(), which is deadlock prone. After PMstop() if driver calls PM, - // or a request from power parent or child arrives, it is possible to - // create/cleanup work queue more than once. Should be rare. - - gIOPMWorkLoop->removeEventSource(fPMWorkQueue); - fPMWorkQueue->release(); - fPMWorkQueue = 0; - - if ( fIdleTimerEventSource != NULL ) { - fIdleTimerEventSource->disable(); - gIOPMWorkLoop->removeEventSource(fIdleTimerEventSource); - fIdleTimerEventSource->release(); - fIdleTimerEventSource = NULL; - } - } + if (more) + gIOPMWorkQueue->incrementProducerCount(); releasePMRequest( request ); return more; @@ -5610,14 +6320,14 @@ bool IOService::retirePMRequest( IOPMRequest * request, IOPMWorkQueue * queue ) { assert(request && queue); - PM_TRACE("[- %02x] %p [%p %s] State %d, Busy %d\n", + PM_LOG1("[- %02x] %p [%p %s] state %d, busy %d\n", request->getType(), request, this, getName(), fMachineState, gIOPMBusyCount); // Catch requests created by idleTimerExpired(). if ((request->getType() == kIOPMRequestTypeActivityTickle) && - (request->fArg1 == (void *) false)) + (request->fArg1 == (void *) (uintptr_t) false)) { // Idle timer power drop request completed. // Restart the idle timer if deviceDesire can go lower, otherwise set @@ -5633,8 +6343,10 @@ bool IOService::retirePMRequest( IOPMRequest * request, IOPMWorkQueue * queue ) fIdleTimerStopped = true; } - gIOPMFreeQueue->queuePMRequest( request ); - return true; + // If the request is linked, then Work queue has already incremented its + // producer count. 
+ + return (gIOPMFreeQueue->queuePMRequest( request )); } //********************************************************************************* @@ -5656,7 +6368,8 @@ bool IOService::isPMBlocked ( IOPMRequest * request, int count ) // 5 = kDriverCallInformPreChange // 6 = kDriverCallInformPostChange // 7 = kDriverCallSetPowerState - if (fLockedFlags.DriverCallBusy) reason = 5 + fDriverCallReason; + if (fDriverCallBusy) + reason = 5 + fDriverCallReason; break; } @@ -5691,7 +6404,7 @@ bool IOService::isPMBlocked ( IOPMRequest * request, int count ) { if (count) { - PM_TRACE("[B %02x] %p [%p %s] State %d, Reason %d\n", + PM_LOG1("[B %02x] %p [%p %s] state %d, reason %d\n", request->getType(), request, this, getName(), fMachineState, reason); } @@ -5717,10 +6430,11 @@ bool IOService::servicePMRequest( IOPMRequest * request, IOPMWorkQueue * queue ) while (isPMBlocked(request, loop++) == false) { - PM_TRACE("[W %02x] %p [%p %s] State %d\n", + PM_LOG1("[W %02x] %p [%p %s] state %d\n", request->getType(), request, this, getName(), fMachineState); gIOPMRequest = request; + gIOPMWorkCount++; // Every PM machine states must be handled in one of the cases below. @@ -5731,40 +6445,87 @@ bool IOService::servicePMRequest( IOPMRequest * request, IOPMWorkQueue * queue ) break; case kIOPM_OurChangeTellClientsPowerDown: - // our change, was it vetoed? + // Root domain might self cancel due to assertions. + if (IS_ROOT_DOMAIN) + { + bool cancel = (bool) fDoNotPowerDown; + getPMRootDomain()->askChangeDownDone( + &fHeadNoteChangeFlags, &cancel); + fDoNotPowerDown = cancel; + } + + // askChangeDown() done, was it vetoed? 
if (!fDoNotPowerDown) { + if (IS_ROOT_DOMAIN) { + PMEventDetails *details = PMEventDetails::eventDetails( + kIOPMEventTypeAppNotificationsFinished, + NULL, + 0, + 0); + + getPMRootDomain()->recordAndReleasePMEventGated( details ); + } + // no, we can continue OurChangeTellClientsPowerDown(); } else { + if (IS_ROOT_DOMAIN) { + PMEventDetails *details = PMEventDetails::eventDetails( + kIOPMEventTypeSleepDone, + NULL, + 1, /* reason: 1 == Ask clients succeeded */ + kIOReturnAborted); /* result */ + + getPMRootDomain()->recordAndReleasePMEventGated( details ); + } + OUR_PMLog(kPMLogIdleCancel, (uintptr_t) this, fMachineState); PM_ERROR("%s: idle cancel\n", fName); // yes, rescind the warning tellNoChangeDown(fHeadNotePowerState); // mark the change note un-actioned - fHeadNoteFlags |= kIOPMNotDone; + fHeadNoteChangeFlags |= kIOPMNotDone; // and we're done - all_done(); + OurChangeFinish(); } break; case kIOPM_OurChangeTellPriorityClientsPowerDown: - // our change, should it be acted on still? + // tellChangeDown(kNotifyApps) done, was it cancelled? 
if (fDoNotPowerDown) { + if (IS_ROOT_DOMAIN) { + PMEventDetails *details = PMEventDetails::eventDetails( + kIOPMEventTypeSleepDone, + NULL, + 2, /* reason: 2 == Client cancelled wake */ + kIOReturnAborted); /* result */ + + getPMRootDomain()->recordAndReleasePMEventGated( details ); + } OUR_PMLog(kPMLogIdleCancel, (uintptr_t) this, fMachineState); PM_ERROR("%s: idle revert\n", fName); // no, tell clients we're back in the old state tellChangeUp(fCurrentPowerState); // mark the change note un-actioned - fHeadNoteFlags |= kIOPMNotDone; + fHeadNoteChangeFlags |= kIOPMNotDone; // and we're done - all_done(); + OurChangeFinish(); } else { + if (IS_ROOT_DOMAIN) { + PMEventDetails *details = PMEventDetails::eventDetails( + kIOPMEventTypeAppNotificationsFinished, + NULL, + 2, /* reason: 2 == TellPriorityClientsDone */ + kIOReturnSuccess); /* result */ + + getPMRootDomain()->recordAndReleasePMEventGated( details ); + } // yes, we can continue OurChangeTellPriorityClientsPowerDown(); } @@ -5786,44 +6547,40 @@ bool IOService::servicePMRequest( IOPMRequest * request, IOPMWorkQueue * queue ) OurChangeNotifyInterestedDriversDidChange(); break; + case kIOPM_OurChangeTellCapabilityDidChange: + OurChangeTellCapabilityDidChange(); + break; + case kIOPM_OurChangeFinish: OurChangeFinish(); break; - case kIOPM_ParentDownTellPriorityClientsPowerDown: - ParentDownTellPriorityClientsPowerDown(); + case kIOPM_ParentChangeTellPriorityClientsPowerDown: + ParentChangeTellPriorityClientsPowerDown(); break; - case kIOPM_ParentDownNotifyInterestedDriversWillChange: - ParentDownNotifyInterestedDriversWillChange(); + case kIOPM_ParentChangeNotifyInterestedDriversWillChange: + ParentChangeNotifyInterestedDriversWillChange(); break; - case kIOPM_ParentDownNotifyDidChangeAndAcknowledgeChange: - ParentDownNotifyDidChangeAndAcknowledgeChange(); + case kIOPM_ParentChangeSetPowerState: + ParentChangeSetPowerState(); break; - case kIOPM_ParentDownSetPowerState: - ParentDownSetPowerState(); + case 
kIOPM_ParentChangeWaitForPowerSettle: + ParentChangeWaitForPowerSettle(); break; - case kIOPM_ParentDownWaitForPowerSettle: - ParentDownWaitForPowerSettle(); + case kIOPM_ParentChangeNotifyInterestedDriversDidChange: + ParentChangeNotifyInterestedDriversDidChange(); break; - case kIOPM_ParentAcknowledgePowerChange: - ParentAcknowledgePowerChange(); - break; - - case kIOPM_ParentUpSetPowerState: - ParentUpSetPowerState(); - break; - - case kIOPM_ParentUpWaitForSettleTime: - ParentUpWaitForSettleTime(); - break; + case kIOPM_ParentChangeTellCapabilityDidChange: + ParentChangeTellCapabilityDidChange(); + break; - case kIOPM_ParentUpNotifyInterestedDriversDidChange: - ParentUpNotifyInterestedDriversDidChange(); + case kIOPM_ParentChangeAcknowledgePowerChange: + ParentChangeAcknowledgePowerChange(); break; case kIOPM_DriverThreadCallDone: @@ -5833,21 +6590,116 @@ bool IOService::servicePMRequest( IOPMRequest * request, IOPMWorkQueue * queue ) notifyInterestedDriversDone(); break; - case kIOPM_NotifyChildrenDone: - notifyChildrenDone(); + case kIOPM_NotifyChildrenOrdered: + notifyChildrenOrdered(); + break; + + case kIOPM_NotifyChildrenDelayed: + notifyChildrenDelayed(); + break; + + case kIOPM_NotifyChildrenStart: + PM_LOG2("%s: kIOPM_NotifyChildrenStart done\n", getName()); + MS_POP(); // from notifyInterestedDriversDone() + notifyChildren(); + break; + + case kIOPM_SyncTellClientsPowerDown: + // Root domain might self cancel due to assertions. 
+ if (IS_ROOT_DOMAIN) + { + bool cancel = (bool) fDoNotPowerDown; + getPMRootDomain()->askChangeDownDone( + &fHeadNoteChangeFlags, &cancel); + fDoNotPowerDown = cancel; + } + if (!fDoNotPowerDown) + { + fMachineState = kIOPM_SyncTellPriorityClientsPowerDown; + fOutOfBandParameter = kNotifyApps; + tellChangeDown(fHeadNotePowerState); + } + else + { + OUR_PMLog(kPMLogIdleCancel, (uintptr_t) this, fMachineState); + PM_ERROR("%s: idle cancel\n", fName); + tellNoChangeDown(fHeadNotePowerState); + fHeadNoteChangeFlags |= kIOPMNotDone; + OurChangeFinish(); + } + break; + + case kIOPM_SyncTellPriorityClientsPowerDown: + if (!fDoNotPowerDown) + { + fMachineState = kIOPM_SyncNotifyWillChange; + fOutOfBandParameter = kNotifyPriority; + tellChangeDown(fHeadNotePowerState); + } + else + { + OUR_PMLog(kPMLogIdleCancel, (uintptr_t) this, fMachineState); + PM_ERROR("%s: idle revert\n", fName); + tellChangeUp(fCurrentPowerState); + fHeadNoteChangeFlags |= kIOPMNotDone; + OurChangeFinish(); + } break; + case kIOPM_SyncNotifyWillChange: + if (kIOPMSyncNoChildNotify & fHeadNoteChangeFlags) + { + fMachineState = kIOPM_SyncFinish; + continue; + } + fMachineState = kIOPM_SyncNotifyDidChange; + fDriverCallReason = kDriverCallInformPreChange; + notifyChildren(); + break; + case kIOPM_SyncNotifyDidChange: - fMachineState = kIOPM_SyncFinish; + fIsPreChange = false; + + if (fHeadNoteChangeFlags & kIOPMParentInitiated) + fMachineState = kIOPM_SyncFinish; + else + fMachineState = kIOPM_SyncTellCapabilityDidChange; + fDriverCallReason = kDriverCallInformPostChange; notifyChildren(); break; + case kIOPM_SyncTellCapabilityDidChange: + tellSystemCapabilityChange( kIOPM_SyncFinish ); + break; + case kIOPM_SyncFinish: - if (fHeadNoteFlags & kIOPMParentInitiated) - ParentAcknowledgePowerChange(); + if (fHeadNoteChangeFlags & kIOPMParentInitiated) + ParentChangeAcknowledgePowerChange(); else OurChangeFinish(); + break; + + case kIOPM_TellCapabilityChangeDone: + if (fIsPreChange) + { + if 
(fOutOfBandParameter == kNotifyCapabilityChangePriority) + { + MS_POP(); // tellSystemCapabilityChange() + continue; + } + fOutOfBandParameter = kNotifyCapabilityChangePriority; + } + else + { + if (fOutOfBandParameter == kNotifyCapabilityChangeApps) + { + MS_POP(); // tellSystemCapabilityChange() + continue; + } + fOutOfBandParameter = kNotifyCapabilityChangeApps; + } + tellClientsWithResponse( fOutOfBandMessage ); break; default: @@ -5859,8 +6711,6 @@ bool IOService::servicePMRequest( IOPMRequest * request, IOPMWorkQueue * queue ) if (fMachineState == kIOPM_Finished) { - //PM_TRACE("[%s] PM End: Request %p (type %02lx)\n", - // getName(), request, request->getType()); done = true; break; } @@ -5910,6 +6760,7 @@ void IOService::executePMRequest( IOPMRequest * request ) break; case kIOPMRequestTypePowerDomainDidChange: + handlePowerDomainDidChangeTo( request ); break; @@ -5933,29 +6784,10 @@ void IOService::executePMRequest( IOPMRequest * request ) case kIOPMRequestTypeSetIdleTimerPeriod: { - IOWorkLoop * wl = gIOPMWorkLoop; fIdleTimerPeriod = (uintptr_t) request->fArg0; - if (wl && (false == fLockedFlags.PMStop) && (fIdleTimerPeriod > 0)) + if ((false == fLockedFlags.PMStop) && (fIdleTimerPeriod > 0)) { - if ( NULL == fIdleTimerEventSource ) - { - IOTimerEventSource * timerSrc; - - timerSrc = IOTimerEventSource::timerEventSource( - this, - OSMemberFunctionCast(IOTimerEventSource::Action, - this, &IOService::idleTimerExpired)); - - if (timerSrc && (wl->addEventSource(timerSrc) != kIOReturnSuccess)) - { - timerSrc->release(); - timerSrc = 0; - } - - fIdleTimerEventSource = timerSrc; - } - fActivityTickleCount = 0; clock_get_uptime(&fIdleTimerStartTime); start_PM_idle_timer(); @@ -5979,7 +6811,7 @@ bool IOService::servicePMReplyQueue( IOPMRequest * request, IOPMRequestQueue * q assert( request && queue ); assert( request->isReplyType() ); - PM_TRACE("[A %02x] %p [%p %s] State %d\n", + PM_LOG1("[A %02x] %p [%p %s] state %d\n", request->getType(), request, this, 
getName(), fMachineState); switch ( request->getType() ) @@ -5987,32 +6819,30 @@ bool IOService::servicePMReplyQueue( IOPMRequest * request, IOPMRequestQueue * q case kIOPMRequestTypeAllowPowerChange: case kIOPMRequestTypeCancelPowerChange: // Check if we are expecting this response. - if (responseValid((unsigned long) request->fArg0, (int)(long) request->fArg1)) + if (responseValid((uint32_t)(uintptr_t) request->fArg0, + (int)(uintptr_t) request->fArg1)) { if (kIOPMRequestTypeCancelPowerChange == request->getType()) { - OSString * name = (OSString *) request->fArg2; - getPMRootDomain()->pmStatsRecordApplicationResponse( - gIOPMStatsApplicationResponseCancel, - name ? name->getCStringNoCopy() : "", 0, - 0, (int)(uintptr_t) request->fArg1); + // Clients are not allowed to cancel when kIOPMSkipAskPowerDown + // flag is set. Only root domain will set this flag. + + if ((fHeadNoteChangeFlags & kIOPMSkipAskPowerDown) == 0) + { + fDoNotPowerDown = true; - fDoNotPowerDown = true; + OSString * name = (OSString *) request->fArg2; + getPMRootDomain()->pmStatsRecordApplicationResponse( + gIOPMStatsApplicationResponseCancel, + name ? 
name->getCStringNoCopy() : "", 0, + 0, (int)(uintptr_t) request->fArg1); + } } if (checkForDone()) { stop_ack_timer(); - if ( fResponseArray ) - { - fResponseArray->release(); - fResponseArray = NULL; - } - if ( fNotifyClientArray ) - { - fNotifyClientArray->release(); - fNotifyClientArray = NULL; - } + cleanClientResponses(false); more = true; } } @@ -6045,8 +6875,20 @@ bool IOService::servicePMReplyQueue( IOPMRequest * request, IOPMRequestQueue * q #if LOG_SETPOWER_TIMES uint64_t nsec = computeTimeDeltaNS(&fDriverCallStartTime); if (nsec > LOG_SETPOWER_TIMES) - PM_DEBUG("%s::setPowerState(%p, %lu -> %lu) async took %d ms\n", + PM_LOG("%s::setPowerState(%p, %lu -> %lu) async took %d ms\n", fName, this, fCurrentPowerState, fHeadNotePowerState, NS_TO_MS(nsec)); + + PMEventDetails *details = PMEventDetails::eventDetails( + kIOPMEventTypeSetPowerStateDelayed, // type + fName, // who + (uintptr_t)this, // owner unique + NULL, // interest name + (uint8_t)getPowerState(), // old + (uint8_t)fHeadNotePowerState, // new + 0, // result + NS_TO_US(nsec)); // usec completion time + + getPMRootDomain()->recordAndReleasePMEventGated( details ); #endif OUR_PMLog(kPMLogDriverAcknowledgeSet, (uintptr_t) this, fDriverTimer); fDriverTimer = 0; @@ -6066,68 +6908,187 @@ bool IOService::servicePMReplyQueue( IOPMRequest * request, IOPMRequestQueue * q case kIOPMRequestTypeIdleCancel: if ((fMachineState == kIOPM_OurChangeTellClientsPowerDown) - || (fMachineState == kIOPM_OurChangeTellPriorityClientsPowerDown)) + || (fMachineState == kIOPM_OurChangeTellPriorityClientsPowerDown) + || (fMachineState == kIOPM_SyncTellClientsPowerDown) + || (fMachineState == kIOPM_SyncTellPriorityClientsPowerDown)) { - OUR_PMLog(kPMLogIdleCancel, (uintptr_t) this, 0); + OUR_PMLog(kPMLogIdleCancel, (uintptr_t) this, fMachineState); + PM_LOG2("%s: cancel from machine state %d\n", + getName(), fMachineState); fDoNotPowerDown = true; - if (fMachineState == kIOPM_OurChangeTellPriorityClientsPowerDown) + // Stop 
waiting for app replys. + if ((fMachineState == kIOPM_OurChangeTellPriorityClientsPowerDown) || + (fMachineState == kIOPM_SyncTellPriorityClientsPowerDown)) cleanClientResponses(false); more = true; } break; + case kIOPMRequestTypeChildNotifyDelayCancel: + if (fMachineState == kIOPM_NotifyChildrenDelayed) + { + PM_LOG2("%s: delay notify cancelled\n", getName()); + notifyChildrenDelayed(); + } + break; + default: panic("servicePMReplyQueue: unknown reply type %x", request->getType()); } - releasePMRequest( request ); + more |= gIOPMFreeQueue->queuePMRequest(request); + if (more) + gIOPMWorkQueue->incrementProducerCount(); + return more; } //********************************************************************************* -// [private] assertPMThreadCall / deassertPMThreadCall +// [private] assertPMDriverCall / deassertPMDriverCall //********************************************************************************* -bool IOService::assertPMThreadCall( void ) +bool IOService::assertPMDriverCall( + IOPMDriverCallEntry * entry, + IOOptionBits options, + IOPMinformee * inform ) { + IOService * target = 0; + bool ok = false; + if (!initialized) return false; - // PMfree() should only be called from IOService::free(). - // That makes it safe to touch IOServicePM state here. - // Caller holds a retain and has checked target is on PM plane. - PM_LOCK(); + if (fLockedFlags.PMStop) { - // PMstop() already issued - fail the assertion. - PM_UNLOCK(); - return false; + goto fail; + } + + if (((options & kIOPMADC_NoInactiveCheck) == 0) && isInactive()) + { + goto fail; + } + + if (inform) + { + if (!inform->active) + { + goto fail; + } + target = inform->whatObject; + if (target->isInactive()) + { + goto fail; + } } - // Increment assertion count to block PMstop(), and return true. 
- fThreadAssertionCount++; - fThreadAssertionThread = current_thread(); // only 1 caller + entry->thread = current_thread(); + entry->target = target; + queue_enter(&fPMDriverCallQueue, entry, IOPMDriverCallEntry *, link); + ok = true; + +fail: PM_UNLOCK(); - return true; + return ok; } -void IOService::deassertPMThreadCall( void ) +void IOService::deassertPMDriverCall( IOPMDriverCallEntry * entry ) { + bool wakeup = false; + PM_LOCK(); - assert(fThreadAssertionCount > 0); - if (fThreadAssertionCount) - fThreadAssertionCount--; - if (current_thread() == fThreadAssertionThread) - fThreadAssertionThread = 0; - if ((fThreadAssertionCount == 0) && fLockedFlags.PMStop) + + assert( !queue_empty(&fPMDriverCallQueue) ); + queue_remove(&fPMDriverCallQueue, entry, IOPMDriverCallEntry *, link); + if (fLockedFlags.PMDriverCallWait) { - // PMstop() is blocked waiting for assertion count to drop to zero. - PM_LOCK_WAKEUP(&fThreadAssertionCount); + wakeup = true; } + PM_UNLOCK(); + + if (wakeup) + PM_LOCK_WAKEUP(&fPMDriverCallQueue); +} + +void IOService::waitForPMDriverCall( IOService * target ) +{ + const IOPMDriverCallEntry * entry; + thread_t thread = current_thread(); + AbsoluteTime deadline; + int waitResult; + bool log = true; + bool wait; + + do { + wait = false; + queue_iterate(&fPMDriverCallQueue, entry, const IOPMDriverCallEntry *, link) + { + // Target of interested driver call + if (target && (target != entry->target)) + continue; + + if (entry->thread == thread) + { + if (log) + { + PM_LOG("%s: %s(%s) on PM thread\n", + fName, __FUNCTION__, target ? target->getName() : ""); + OSReportWithBacktrace("%s: %s(%s) on PM thread\n", + fName, __FUNCTION__, target ? 
target->getName() : ""); + log = false; + } + continue; + } + + wait = true; + break; + } + + if (wait) + { + fLockedFlags.PMDriverCallWait = true; + clock_interval_to_deadline(15, kSecondScale, &deadline); + waitResult = PM_LOCK_SLEEP(&fPMDriverCallQueue, deadline); + fLockedFlags.PMDriverCallWait = false; + if (THREAD_TIMED_OUT == waitResult) + { + PM_ERROR("%s: waitForPMDriverCall timeout\n", fName); + wait = false; + } + } + } while (wait); +} + +//********************************************************************************* +// [private] Debug helpers +//********************************************************************************* + +const char * IOService::getIOMessageString( uint32_t msg ) +{ +#define MSG_ENTRY(x) {x, #x} + + static const IONamedValue msgNames[] = { + MSG_ENTRY( kIOMessageCanDevicePowerOff ), + MSG_ENTRY( kIOMessageDeviceWillPowerOff ), + MSG_ENTRY( kIOMessageDeviceWillNotPowerOff ), + MSG_ENTRY( kIOMessageDeviceHasPoweredOn ), + MSG_ENTRY( kIOMessageCanSystemPowerOff ), + MSG_ENTRY( kIOMessageSystemWillPowerOff ), + MSG_ENTRY( kIOMessageSystemWillNotPowerOff ), + MSG_ENTRY( kIOMessageCanSystemSleep ), + MSG_ENTRY( kIOMessageSystemWillSleep ), + MSG_ENTRY( kIOMessageSystemWillNotSleep ), + MSG_ENTRY( kIOMessageSystemHasPoweredOn ), + MSG_ENTRY( kIOMessageSystemWillRestart ), + MSG_ENTRY( kIOMessageSystemWillPowerOn ), + MSG_ENTRY( kIOMessageSystemCapabilityChange ) + }; + + return IOFindNameForValue(msg, msgNames); } // MARK: - @@ -6189,8 +7150,10 @@ void IOPMRequest::reset( void ) } } -void IOPMRequest::attachNextRequest( IOPMRequest * next ) +bool IOPMRequest::attachNextRequest( IOPMRequest * next ) { + bool ok = false; + if (!fRequestNext) { // Postpone the execution of the next request after @@ -6204,11 +7167,15 @@ void IOPMRequest::attachNextRequest( IOPMRequest * next ) (uint32_t) fRequestNext->fWorkWaitCount, fTarget->getName()); #endif + ok = true; } + return ok; } -void IOPMRequest::detachNextRequest( void ) +bool 
IOPMRequest::detachNextRequest( void ) { + bool ok = false; + if (fRequestNext) { assert(fRequestNext->fWorkWaitCount); @@ -6222,11 +7189,15 @@ void IOPMRequest::detachNextRequest( void ) fTarget->getName()); #endif fRequestNext = 0; + ok = true; } + return ok; } -void IOPMRequest::attachRootRequest( IOPMRequest * root ) +bool IOPMRequest::attachRootRequest( IOPMRequest * root ) { + bool ok = false; + if (!fRequestRoot) { // Delay the completion of the root request after @@ -6240,11 +7211,15 @@ void IOPMRequest::attachRootRequest( IOPMRequest * root ) (uint32_t) fRequestRoot->fFreeWaitCount, fTarget->getName()); #endif + ok = true; } + return ok; } -void IOPMRequest::detachRootRequest( void ) +bool IOPMRequest::detachRootRequest( void ) { + bool ok = false; + if (fRequestRoot) { assert(fRequestRoot->fFreeWaitCount); @@ -6258,7 +7233,9 @@ void IOPMRequest::detachRootRequest( void ) fTarget->getName()); #endif fRequestRoot = 0; + ok = true; } + return ok; } // MARK: - @@ -6267,8 +7244,7 @@ void IOPMRequest::detachRootRequest( void ) //********************************************************************************* // IOPMRequestQueue Class // -// Global queues. As PM-aware drivers load and unload, their IOPMWorkQueue's are -// created and deallocated. IOPMRequestQueue are created once and never released. +// Global queues. Queues are created once and never released. //********************************************************************************* OSDefineMetaClassAndStructors( IOPMRequestQueue, IOEventSource ); @@ -6353,19 +7329,13 @@ bool IOPMRequestQueue::checkForWork( void ) return more; } -void IOPMRequestQueue::signalWorkAvailable( void ) -{ - IOEventSource::signalWorkAvailable(); -} - // MARK: - // MARK: IOPMWorkQueue //********************************************************************************* // IOPMWorkQueue Class // -// Every object in the power plane that has handled a PM request, will have an -// instance of IOPMWorkQueue allocated for it. 
+// Queue of IOServicePM objects with busy IOPMRequest(s). //********************************************************************************* OSDefineMetaClassAndStructors( IOPMWorkQueue, IOEventSource ); @@ -6390,43 +7360,160 @@ bool IOPMWorkQueue::init( IOService * inOwner, Action work, Action retire ) queue_init(&fWorkQueue); - fWorkAction = work; - fRetireAction = retire; + fWorkAction = work; + fRetireAction = retire; + fConsumerCount = fProducerCount = 0; return true; } -void IOPMWorkQueue::queuePMRequest( IOPMRequest * request ) +bool IOPMWorkQueue::queuePMRequest( IOPMRequest * request, IOServicePM * pwrMgt ) { + bool more = false; + bool empty; + assert( request ); + assert( pwrMgt ); assert( onThread() ); + assert( queue_next(&request->fCommandChain) == + queue_prev(&request->fCommandChain) ); gIOPMBusyCount++; - queue_enter(&fWorkQueue, request, IOPMRequest *, fCommandChain); - checkForWork(); + + // Add new request to the tail of the per-service request queue. + // Then immediately check the request queue to minimize latency + // if the queue was empty. + + empty = queue_empty(&pwrMgt->RequestHead); + queue_enter(&pwrMgt->RequestHead, request, IOPMRequest *, fCommandChain); + if (empty) + { + more = checkRequestQueue(&pwrMgt->RequestHead, &empty); + if (!empty) + { + // New Request is blocked, add IOServicePM to work queue. 
+ assert( queue_next(&pwrMgt->WorkChain) == + queue_prev(&pwrMgt->WorkChain) ); + + queue_enter(&fWorkQueue, pwrMgt, IOServicePM *, WorkChain); + fQueueLength++; + PM_LOG3("IOPMWorkQueue: [%u] added %s@%p to queue\n", + fQueueLength, pwrMgt->Name, pwrMgt); + } + } + + return more; } -bool IOPMWorkQueue::checkForWork( void ) +bool IOPMWorkQueue::checkRequestQueue( queue_head_t * queue, bool * empty ) { IOPMRequest * request; - IOService * target = (IOService *) owner; - bool done; + IOService * target; + bool more = false; + bool done = false; - while (!queue_empty(&fWorkQueue)) - { - request = (IOPMRequest *) queue_first(&fWorkQueue); - assert(request->getTarget() == target); - if (request->isWorkBlocked()) break; - done = (*fWorkAction)( target, request, this ); - if (!done) break; - - assert(gIOPMBusyCount > 0); - if (gIOPMBusyCount) gIOPMBusyCount--; - queue_remove_first(&fWorkQueue, request, IOPMRequest *, fCommandChain); - (*fRetireAction)( target, request, this ); - } + assert(!queue_empty(queue)); + do { + request = (IOPMRequest *) queue_first(queue); + if (request->isWorkBlocked()) + break; // cannot start, blocked on attached request - return false; + target = request->getTarget(); + done = (*fWorkAction)( target, request, this ); + if (!done) + break; // work started, blocked on PM state machine + + assert(gIOPMBusyCount > 0); + if (gIOPMBusyCount) + gIOPMBusyCount--; + + queue_remove_first(queue, request, IOPMRequest *, fCommandChain); + more |= (*fRetireAction)( target, request, this ); + done = queue_empty(queue); + } while (!done); + + *empty = done; + + if (more) + { + // Retired request blocks another request, since the + // blocked request may reside in the work queue, we + // must bump the producer count to avoid work stall. 
+ fProducerCount++; + } + + return more; +} + +bool IOPMWorkQueue::checkForWork( void ) +{ + IOServicePM * entry; + IOServicePM * next; + bool more = false; + bool empty; + +#if WORK_QUEUE_STATS + fStatCheckForWork++; +#endif + + // Each producer signal triggers a full iteration over + // all IOServicePM entries in the work queue. + + while (fConsumerCount != fProducerCount) + { + PM_LOG3("IOPMWorkQueue: checkForWork %u %u\n", + fProducerCount, fConsumerCount); + + fConsumerCount = fProducerCount; + +#if WORK_QUEUE_STATS + if (queue_empty(&fWorkQueue)) + { + fStatQueueEmpty++; + break; + } + fStatScanEntries++; + uint32_t cachedWorkCount = gIOPMWorkCount; +#endif + + entry = (IOServicePM *) queue_first(&fWorkQueue); + while (!queue_end(&fWorkQueue, (queue_entry_t) entry)) + { + more |= checkRequestQueue(&entry->RequestHead, &empty); + + // Get next entry, points to head if current entry is last. + next = (IOServicePM *) queue_next(&entry->WorkChain); + + // if request queue is empty, remove IOServicePM from queue. 
+ if (empty) + { + assert(fQueueLength); + if (fQueueLength) fQueueLength--; + PM_LOG3("IOPMWorkQueue: [%u] removed %s@%p from queue\n", + fQueueLength, entry->Name, entry); + queue_remove(&fWorkQueue, entry, IOServicePM *, WorkChain); + } + entry = next; + } + +#if WORK_QUEUE_STATS + if (cachedWorkCount == gIOPMWorkCount) + fStatNoWorkDone++; +#endif + } + + return more; +} + +void IOPMWorkQueue::signalWorkAvailable( void ) +{ + fProducerCount++; + IOEventSource::signalWorkAvailable(); +} + +void IOPMWorkQueue::incrementProducerCount( void ) +{ + fProducerCount++; } // MARK: - @@ -6438,7 +7525,8 @@ bool IOPMWorkQueue::checkForWork( void ) OSDefineMetaClassAndStructors( IOPMCompletionQueue, IOEventSource ); -IOPMCompletionQueue * IOPMCompletionQueue::create( IOService * inOwner, Action inAction ) +IOPMCompletionQueue * +IOPMCompletionQueue::create( IOService * inOwner, Action inAction ) { IOPMCompletionQueue * me = OSTypeAlloc(IOPMCompletionQueue); if (me && !me->init(inOwner, inAction)) @@ -6458,39 +7546,40 @@ bool IOPMCompletionQueue::init( IOService * inOwner, Action inAction ) return true; } -void IOPMCompletionQueue::queuePMRequest( IOPMRequest * request ) +bool IOPMCompletionQueue::queuePMRequest( IOPMRequest * request ) { + bool more; + assert(request); - request->detachNextRequest(); // unblocks next request + // unblock dependent request + more = request->detachNextRequest(); queue_enter(&fQueue, request, IOPMRequest *, fCommandChain); - if (workLoop) signalWorkAvailable(); + return more; } bool IOPMCompletionQueue::checkForWork( void ) { Action dqAction = (Action) action; IOPMRequest * request; + IOPMRequest * next; IOService * target; bool more = false; - queue_head_t tmpQueue; - - queue_init(&tmpQueue); - while (!queue_empty(&fQueue)) - { - queue_remove_first( &fQueue, request, IOPMRequest *, fCommandChain ); - if (request->isFreeBlocked()) - { - queue_enter(&tmpQueue, request, IOPMRequest *, fCommandChain); - continue; - } - target = 
request->getTarget(); - assert(target); - more |= (*dqAction)( target, request, this ); - } + request = (IOPMRequest *) queue_first(&fQueue); + while (!queue_end(&fQueue, (queue_entry_t) request)) + { + next = (IOPMRequest *) queue_next(&request->fCommandChain); + if (!request->isFreeBlocked()) + { + queue_remove(&fQueue, request, IOPMRequest *, fCommandChain); + target = request->getTarget(); + assert(target); + more |= (*dqAction)( target, request, this ); + } + request = next; + } - queue_new_head(&tmpQueue, &fQueue, IOPMRequest *, fCommandChain); - return more; + return more; } // MARK: - @@ -6519,11 +7608,16 @@ IOReturn IOServicePM::gatedSerialize( OSSerialize * s ) { OSDictionary * dict; bool ok = false; - int dictSize = 4; + int dictSize = 5; if (IdleTimerPeriod) dictSize += 4; +#if WORK_QUEUE_STATS + if (gIOPMRootNode == ControllingDriver) + dictSize += 4; +#endif + if (PowerClients) dict = OSDictionary::withDictionary( PowerClients, PowerClients->getCount() + dictSize); @@ -6533,11 +7627,13 @@ IOReturn IOServicePM::gatedSerialize( OSSerialize * s ) if (dict) { setPMProperty(dict, "CurrentPowerState", CurrentPowerState); + if (NumberOfPowerStates) + setPMProperty(dict, "MaxPowerState", NumberOfPowerStates-1); if (DesiredPowerState != CurrentPowerState) setPMProperty(dict, "DesiredPowerState", DesiredPowerState); if (kIOPM_Finished != MachineState) setPMProperty(dict, "MachineState", MachineState); - if (DeviceOverrides) + if (DeviceOverrideEnabled) dict->setObject("PowerOverrideOn", kOSBooleanTrue); if (IdleTimerPeriod) @@ -6560,7 +7656,7 @@ IOReturn IOServicePM::gatedSerialize( OSSerialize * s ) delta = now; SUB_ABSOLUTETIME(&delta, &DeviceActiveTimestamp); absolutetime_to_nanoseconds(delta, &nsecs); - setPMProperty(dict, "TimeSinceActivityTickle", NS_TO_MS(nsecs)); + setPMProperty(dict, "TimeSinceLastTickle", NS_TO_MS(nsecs)); } if (AbsoluteTime_to_scalar(&IdleTimerStartTime)) @@ -6573,6 +7669,20 @@ IOReturn IOServicePM::gatedSerialize( OSSerialize * s ) 
} } +#if WORK_QUEUE_STATS + if (gIOPMRootNode == Owner) + { + setPMProperty(dict, "WQ-CheckForWork", + gIOPMWorkQueue->fStatCheckForWork); + setPMProperty(dict, "WQ-ScanEntries", + gIOPMWorkQueue->fStatScanEntries); + setPMProperty(dict, "WQ-QueueEmpty", + gIOPMWorkQueue->fStatQueueEmpty); + setPMProperty(dict, "WQ-NoWorkDone", + gIOPMWorkQueue->fStatNoWorkDone); + } +#endif + ok = dict->serialize(s); dict->release(); } @@ -6593,3 +7703,53 @@ bool IOServicePM::serialize( OSSerialize * s ) const return (kIOReturnSuccess == ret); } + +PMEventDetails* PMEventDetails::eventDetails(uint32_t type, + const char *ownerName, + uintptr_t ownerUnique, + const char *interestName, + uint8_t oldState, + uint8_t newState, + uint32_t result, + uint32_t elapsedTimeUS) { + + PMEventDetails *myself; + myself = new PMEventDetails; + + if(myself) { + myself->eventType = type; + myself->ownerName = ownerName; + myself->ownerUnique = ownerUnique; + myself->interestName = interestName; + myself->oldState = oldState; + myself->newState = newState; + myself->result = result; + myself->elapsedTimeUS = elapsedTimeUS; + + myself->eventClassifier = kIOPMEventClassDriverEvent; + } + + return myself; +} + + +PMEventDetails* PMEventDetails::eventDetails(uint32_t type, + const char *uuid, + uint32_t reason, + uint32_t result) { + + PMEventDetails *myself; + myself = new PMEventDetails; + + if(myself) { + myself->eventType = type; + myself->uuid = uuid; + myself->reason = reason; + myself->result = result; + + myself->eventClassifier = kIOPMEventClassSystemEvent; + } + + return myself; +} + diff --git a/iokit/Kernel/IOServicePMPrivate.h b/iokit/Kernel/IOServicePMPrivate.h index 818285f8e..96e5bfacc 100644 --- a/iokit/Kernel/IOServicePMPrivate.h +++ b/iokit/Kernel/IOServicePMPrivate.h @@ -29,17 +29,163 @@ #ifndef _IOKIT_IOSERVICEPMPRIVATE_H #define _IOKIT_IOSERVICEPMPRIVATE_H -/*! @class IOServicePM - @abstract Power management class. 
-*/ +#include +#include + +//****************************************************************************** +// PM command types +//****************************************************************************** + +enum { + /* Command Types */ + kIOPMRequestTypeInvalid = 0x00, + kIOPMRequestTypePMStop = 0x01, + kIOPMRequestTypeAddPowerChild1 = 0x02, + kIOPMRequestTypeAddPowerChild2 = 0x03, + kIOPMRequestTypeAddPowerChild3 = 0x04, + kIOPMRequestTypeRegisterPowerDriver = 0x05, + kIOPMRequestTypeAdjustPowerState = 0x06, + kIOPMRequestTypePowerDomainWillChange = 0x07, + kIOPMRequestTypePowerDomainDidChange = 0x08, + kIOPMRequestTypePowerOverrideOnPriv = 0x09, + kIOPMRequestTypePowerOverrideOffPriv = 0x0A, + kIOPMRequestTypeActivityTickle = 0x0B, + kIOPMRequestTypeRequestPowerState = 0x0C, + kIOPMRequestTypeSynchronizePowerTree = 0x0D, + kIOPMRequestTypeRequestPowerStateOverride = 0x0E, + kIOPMRequestTypeSetIdleTimerPeriod = 0x0F, + + /* Reply Types */ + kIOPMRequestTypeReplyStart = 0x80, + kIOPMRequestTypeAckPowerChange = 0x81, + kIOPMRequestTypeAckSetPowerState = 0x82, + kIOPMRequestTypeAllowPowerChange = 0x83, + kIOPMRequestTypeCancelPowerChange = 0x84, + kIOPMRequestTypeInterestChanged = 0x85, + kIOPMRequestTypeIdleCancel = 0x86, + kIOPMRequestTypeChildNotifyDelayCancel = 0x87 +}; + +//****************************************************************************** +// PM actions - For root domain only +//****************************************************************************** + +struct IOPMActions; + +typedef void +(*IOPMActionPowerChangeStart)( + void * target, + IOService * service, + IOPMActions * actions, + uint32_t powerState, + uint32_t * changeFlags ); + +typedef void +(*IOPMActionPowerChangeDone)( + void * target, + IOService * service, + IOPMActions * actions, + uint32_t powerState, + uint32_t changeFlags ); + +typedef void +(*IOPMActionPowerChangeOverride)( + void * target, + IOService * service, + IOPMActions * actions, + unsigned long * powerState, + 
uint32_t * changeFlags ); + +typedef void +(*IOPMActionActivityTickle)( + void * target, + IOService * service, + IOPMActions * actions ); + +struct IOPMActions { + void * target; + uint32_t parameter; + IOPMActionPowerChangeStart actionPowerChangeStart; + IOPMActionPowerChangeDone actionPowerChangeDone; + IOPMActionPowerChangeOverride actionPowerChangeOverride; + IOPMActionActivityTickle actionActivityTickle; +}; + +//****************************************************************************** + +enum { + kIOPMEventClassSystemEvent = 0x00, + kIOPMEventClassDriverEvent = 0x1 +}; + +class PMEventDetails : public OSObject +{ + OSDeclareDefaultStructors( PMEventDetails ); + friend class IOServicePM; + friend class IOPMrootDomain; + friend class IOPMTimeline; +public: + static PMEventDetails *eventDetails(uint32_t type, + const char *ownerName, + uintptr_t ownerUnique, + const char *interestName, + uint8_t oldState, + uint8_t newState, + uint32_t result, + uint32_t elapsedTimeUS); + + static PMEventDetails *eventDetails(uint32_t type, + const char *uuid, + uint32_t reason, + uint32_t result); +private: + uint8_t eventClassifier; + uint32_t eventType; + const char *ownerName; + uintptr_t ownerUnique; + const char *interestName; + uint8_t oldState; + uint8_t newState; + uint32_t result; + uint32_t elapsedTimeUS; + + const char *uuid; + uint32_t reason; +}; + +// Internal concise representation of IOPMPowerState +struct IOPMPSEntry +{ + IOPMPowerFlags capabilityFlags; + IOPMPowerFlags outputPowerFlags; + IOPMPowerFlags inputPowerFlags; + uint32_t staticPower; + uint32_t settleUpTime; + uint32_t settleDownTime; +}; + +//****************************************************************************** +// IOServicePM +//****************************************************************************** + class IOServicePM : public OSObject { friend class IOService; + friend class IOPMWorkQueue; OSDeclareDefaultStructors( IOServicePM ) private: - // List of interested drivers. 
+ // Link IOServicePM objects on IOPMWorkQueue. + queue_chain_t WorkChain; + + // Queue of IOPMRequest objects. + queue_head_t RequestHead; + + // IOService creator and owner. + IOService * Owner; + + // List of interested drivers (protected by PMLock). IOPMinformeeList * InterestedDrivers; // How long to wait for controlling driver to acknowledge. @@ -50,21 +196,22 @@ class IOServicePM : public OSObject thread_call_t AckTimer; thread_call_t SettleTimer; + thread_call_t IdleTimer; // Settle time after changing power state. - unsigned long SettleTimeUS; + uint32_t SettleTimeUS; // The flags describing current change note. - unsigned long HeadNoteFlags; + IOPMPowerChangeFlags HeadNoteChangeFlags; // The new power state number being changed to. - unsigned long HeadNotePowerState; + IOPMPowerStateIndex HeadNotePowerState; // Points to the entry in the power state array. - IOPMPowerState * HeadNotePowerArrayEntry; + IOPMPSEntry * HeadNotePowerArrayEntry; // Power flags supplied by all parents (domain). - unsigned long HeadNoteDomainFlags; + IOPMPowerFlags HeadNoteDomainFlags; // Power flags supplied by domain accounting for parent changes. IOPMPowerFlags HeadNoteDomainTargetFlags; @@ -73,32 +220,26 @@ class IOServicePM : public OSObject IOPowerConnection * HeadNoteParentConnection; // Power flags supplied by the changing parent. - unsigned long HeadNoteParentFlags; + IOPMPowerFlags HeadNoteParentFlags; // Number of acks still outstanding. - unsigned long HeadNotePendingAcks; + uint32_t HeadNotePendingAcks; // PM state lock. - IOLock * PMLock; - - // Initialized to true, then set to false after the initial power change. - bool InitialChange; - - // Ignore children and driver desires if true. - bool DeviceOverrides; - - // True if device was active since last idle timer expiration. - bool DeviceActive; - - // Keeps track of any negative responses from notified apps and clients. - bool DoNotPowerDown; - - // True if all our parents know the state of their power domain. 
- bool ParentsKnowState; - - bool StrictTreeOrder; - bool IdleTimerStopped; - bool AdjustPowerScheduled; + IOLock * PMLock; + + unsigned int InitialPowerChange:1; + unsigned int InitialSetPowerState:1; + unsigned int DeviceOverrideEnabled:1; + unsigned int DeviceWasActive:1; + unsigned int DoNotPowerDown:1; + unsigned int ParentsKnowState:1; + unsigned int StrictTreeOrder:1; + unsigned int IdleTimerStopped:1; + unsigned int AdjustPowerScheduled:1; + unsigned int IsPreChange:1; + unsigned int DriverCallBusy:1; + unsigned int PCDFunctionOverride:1; // Time of last device activity. AbsoluteTime DeviceActiveTimestamp; @@ -106,105 +247,106 @@ class IOServicePM : public OSObject // Used to protect activity flag. IOLock * ActivityLock; - // Idle timer event source. - IOTimerEventSource * IdleTimerEventSource; - // Idle timer's period in seconds. unsigned long IdleTimerPeriod; unsigned long IdleTimerMinPowerState; AbsoluteTime IdleTimerStartTime; // Power state desired by a subclassed device object. - unsigned long DeviceDesire; + IOPMPowerStateIndex DeviceDesire; // This is the power state we desire currently. - unsigned long DesiredPowerState; + IOPMPowerStateIndex DesiredPowerState; // This is what our parent thinks our need is. - unsigned long PreviousRequest; + IOPMPowerFlags PreviousRequestPowerFlags; // Cache result from getName(), used in logging. const char * Name; // Number of power states in the power array. - unsigned long NumberOfPowerStates; + IOPMPowerStateIndex NumberOfPowerStates; // Power state array. - IOPMPowerState * PowerStates; + IOPMPSEntry * PowerStates; // The controlling driver. - IOService * ControllingDriver; + IOService * ControllingDriver; // Our current power state. - unsigned long CurrentPowerState; + IOPMPowerStateIndex CurrentPowerState; // Logical OR of power flags for each power domain parent. 
- IOPMPowerFlags ParentsCurrentPowerFlags; + IOPMPowerFlags ParentsCurrentPowerFlags; // The highest power state we can achieve in current power domain. - unsigned long MaxCapability; + IOPMPowerStateIndex MaxPowerState; // Logical OR of all output power character flags in the array. - IOPMPowerFlags OutputPowerCharacterFlags; + IOPMPowerFlags OutputPowerCharacterFlags; // OSArray which manages responses from notified apps and clients. - OSArray * ResponseArray; + OSArray * ResponseArray; OSArray * NotifyClientArray; // Used to uniquely identify power management notification to apps and clients. - UInt16 SerialNumber; + UInt16 SerialNumber; // Used to communicate desired function to tellClientsWithResponse(). // This is used because it avoids changing the signatures of the affected virtual methods. - int OutOfBandParameter; + int OutOfBandParameter; AbsoluteTime DriverCallStartTime; IOPMPowerFlags CurrentCapabilityFlags; long ActivityTicklePowerState; unsigned long CurrentPowerConsumption; - unsigned long TempClampPowerState; - IOPMWorkQueue * PMWorkQueue; - OSSet * InsertInterestSet; - OSSet * RemoveInterestSet; - OSArray * NotifyChildArray; + IOPMPowerStateIndex TempClampPowerState; + OSArray * NotifyChildArray; OSDictionary * PowerClients; - thread_call_t DriverCallEntry; - void * DriverCallParamPtr; - IOItemCount DriverCallParamCount; - IOItemCount DriverCallParamSlots; + thread_call_t DriverCallEntry; + void * DriverCallParamPtr; + IOItemCount DriverCallParamCount; + IOItemCount DriverCallParamSlots; uint32_t DriverCallReason; + uint32_t OutOfBandMessage; uint32_t TempClampCount; uint32_t OverrideMaxPowerState; uint32_t ActivityTickleCount; uint32_t WaitReason; - uint32_t NextMachineState; + uint32_t SavedMachineState; uint32_t RootDomainState; - uint32_t ThreadAssertionCount; - // Protected by PMLock + // Protected by PMLock - BEGIN struct { - uint32_t DriverCallBusy : 1; - uint32_t PMStop : 1; + uint32_t PMStop : 1; + uint32_t PMDriverCallWait : 1; } 
LockedFlags; - thread_t ThreadAssertionThread; + queue_head_t PMDriverCallQueue; + OSSet * InsertInterestSet; + OSSet * RemoveInterestSet; + // Protected by PMLock - END #if PM_VARS_SUPPORT - IOPMprot * PMVars; + IOPMprot * PMVars; #endif + IOPMActions PMActions; + // Serialize IOServicePM state for debug output. IOReturn gatedSerialize( OSSerialize * s ); virtual bool serialize( OSSerialize * s ) const; }; +#define fOwner pwrMgt->Owner #define fInterestedDrivers pwrMgt->InterestedDrivers #define fDriverTimer pwrMgt->DriverTimer +#define fMachineState pwrMgt->MachineState #define fAckTimer pwrMgt->AckTimer #define fSettleTimer pwrMgt->SettleTimer -#define fMachineState pwrMgt->MachineState +#define fIdleTimer pwrMgt->IdleTimer #define fSettleTimeUS pwrMgt->SettleTimeUS -#define fHeadNoteFlags pwrMgt->HeadNoteFlags +#define fHeadNoteChangeFlags pwrMgt->HeadNoteChangeFlags #define fHeadNotePowerState pwrMgt->HeadNotePowerState #define fHeadNotePowerArrayEntry pwrMgt->HeadNotePowerArrayEntry #define fHeadNoteDomainFlags pwrMgt->HeadNoteDomainFlags @@ -213,63 +355,63 @@ class IOServicePM : public OSObject #define fHeadNoteParentFlags pwrMgt->HeadNoteParentFlags #define fHeadNotePendingAcks pwrMgt->HeadNotePendingAcks #define fPMLock pwrMgt->PMLock -#define fInitialChange pwrMgt->InitialChange -#define fDeviceOverrides pwrMgt->DeviceOverrides +#define fInitialPowerChange pwrMgt->InitialPowerChange +#define fInitialSetPowerState pwrMgt->InitialSetPowerState +#define fDeviceOverrideEnabled pwrMgt->DeviceOverrideEnabled +#define fDeviceWasActive pwrMgt->DeviceWasActive +#define fDoNotPowerDown pwrMgt->DoNotPowerDown +#define fParentsKnowState pwrMgt->ParentsKnowState +#define fStrictTreeOrder pwrMgt->StrictTreeOrder +#define fIdleTimerStopped pwrMgt->IdleTimerStopped +#define fAdjustPowerScheduled pwrMgt->AdjustPowerScheduled +#define fIsPreChange pwrMgt->IsPreChange +#define fDriverCallBusy pwrMgt->DriverCallBusy +#define fPCDFunctionOverride pwrMgt->PCDFunctionOverride 
+#define fDeviceActiveTimestamp pwrMgt->DeviceActiveTimestamp #define fActivityLock pwrMgt->ActivityLock -#define fIdleTimerEventSource pwrMgt->IdleTimerEventSource #define fIdleTimerPeriod pwrMgt->IdleTimerPeriod #define fIdleTimerMinPowerState pwrMgt->IdleTimerMinPowerState -#define fDeviceActive pwrMgt->DeviceActive #define fIdleTimerStartTime pwrMgt->IdleTimerStartTime -#define fDeviceActiveTimestamp pwrMgt->DeviceActiveTimestamp -#define fActivityTickleCount pwrMgt->ActivityTickleCount #define fDeviceDesire pwrMgt->DeviceDesire #define fDesiredPowerState pwrMgt->DesiredPowerState -#define fPreviousRequest pwrMgt->PreviousRequest +#define fPreviousRequestPowerFlags pwrMgt->PreviousRequestPowerFlags #define fName pwrMgt->Name #define fNumberOfPowerStates pwrMgt->NumberOfPowerStates #define fPowerStates pwrMgt->PowerStates #define fControllingDriver pwrMgt->ControllingDriver -#define fAggressivenessValue pwrMgt->AggressivenessValue -#define fAggressivenessValid pwrMgt->AggressivenessValid #define fCurrentPowerState pwrMgt->CurrentPowerState -#define fParentsKnowState pwrMgt->ParentsKnowState #define fParentsCurrentPowerFlags pwrMgt->ParentsCurrentPowerFlags -#define fMaxCapability pwrMgt->MaxCapability +#define fMaxPowerState pwrMgt->MaxPowerState #define fOutputPowerCharacterFlags pwrMgt->OutputPowerCharacterFlags -#define fSerialNumber pwrMgt->SerialNumber #define fResponseArray pwrMgt->ResponseArray #define fNotifyClientArray pwrMgt->NotifyClientArray -#define fDoNotPowerDown pwrMgt->DoNotPowerDown +#define fSerialNumber pwrMgt->SerialNumber #define fOutOfBandParameter pwrMgt->OutOfBandParameter #define fDriverCallStartTime pwrMgt->DriverCallStartTime #define fCurrentCapabilityFlags pwrMgt->CurrentCapabilityFlags +#define fActivityTicklePowerState pwrMgt->ActivityTicklePowerState #define fCurrentPowerConsumption pwrMgt->CurrentPowerConsumption #define fTempClampPowerState pwrMgt->TempClampPowerState -#define fTempClampCount pwrMgt->TempClampCount -#define 
fOverrideMaxPowerState pwrMgt->OverrideMaxPowerState -#define fPMWorkQueue pwrMgt->PMWorkQueue -#define fWaitReason pwrMgt->WaitReason -#define fNextMachineState pwrMgt->NextMachineState -#define fDriverCallReason pwrMgt->DriverCallReason +#define fNotifyChildArray pwrMgt->NotifyChildArray +#define fPowerClients pwrMgt->PowerClients #define fDriverCallEntry pwrMgt->DriverCallEntry #define fDriverCallParamPtr pwrMgt->DriverCallParamPtr #define fDriverCallParamCount pwrMgt->DriverCallParamCount #define fDriverCallParamSlots pwrMgt->DriverCallParamSlots -#define fActivityTickled pwrMgt->ActivityTickled +#define fDriverCallReason pwrMgt->DriverCallReason +#define fOutOfBandMessage pwrMgt->OutOfBandMessage +#define fTempClampCount pwrMgt->TempClampCount +#define fOverrideMaxPowerState pwrMgt->OverrideMaxPowerState +#define fActivityTickleCount pwrMgt->ActivityTickleCount +#define fWaitReason pwrMgt->WaitReason +#define fSavedMachineState pwrMgt->SavedMachineState +#define fRootDomainState pwrMgt->RootDomainState +#define fLockedFlags pwrMgt->LockedFlags +#define fPMDriverCallQueue pwrMgt->PMDriverCallQueue #define fInsertInterestSet pwrMgt->InsertInterestSet #define fRemoveInterestSet pwrMgt->RemoveInterestSet -#define fStrictTreeOrder pwrMgt->StrictTreeOrder -#define fNotifyChildArray pwrMgt->NotifyChildArray -#define fIdleTimerStopped pwrMgt->IdleTimerStopped -#define fAdjustPowerScheduled pwrMgt->AdjustPowerScheduled -#define fActivityTicklePowerState pwrMgt->ActivityTicklePowerState #define fPMVars pwrMgt->PMVars -#define fPowerClients pwrMgt->PowerClients -#define fRootDomainState pwrMgt->RootDomainState -#define fThreadAssertionCount pwrMgt->ThreadAssertionCount -#define fThreadAssertionThread pwrMgt->ThreadAssertionThread -#define fLockedFlags pwrMgt->LockedFlags +#define fPMActions pwrMgt->PMActions /* When an IOService is waiting for acknowledgement to a power change @@ -279,13 +421,27 @@ the ack timer is ticking every tenth of a second. 
*/ #define ACK_TIMER_PERIOD 100000000 -#define kIOPMParentInitiated 0x01 // this power change initiated by our parent -#define kIOPMWeInitiated 0x02 // this power change initiated by this device -#define kIOPMNotDone 0x04 // we couldn't make this change -#define kIOPMDomainWillChange 0x08 // change started by PowerDomainWillChangeTo -#define kIOPMDomainDidChange 0x10 // change started by PowerDomainDidChangeTo -#define kIOPMDomainPowerDrop 0x20 // Domain is lowering power -#define kIOPMSynchronize 0x40 // change triggered by power tree re-sync +// Max wait time in microseconds for kernel priority and capability clients +// with async message handlers to acknowledge. +// +#define kPriorityClientMaxWait (90 * 1000 * 1000) +#define kCapabilityClientMaxWait (240 * 1000 * 1000) + +// Attributes describing a power state change. +// See IOPMPowerChangeFlags data type. +// +#define kIOPMParentInitiated 0x0001 // this power change initiated by our parent +#define kIOPMSelfInitiated 0x0002 // this power change initiated by this device +#define kIOPMNotDone 0x0004 // we couldn't make this change +#define kIOPMDomainWillChange 0x0008 // change started by PowerDomainWillChangeTo +#define kIOPMDomainDidChange 0x0010 // change started by PowerDomainDidChangeTo +#define kIOPMDomainPowerDrop 0x0020 // Domain is lowering power +#define kIOPMIgnoreChildren 0x0040 // Ignore children and driver power desires +#define kIOPMSkipAskPowerDown 0x0080 // skip the ask app phase +#define kIOPMSynchronize 0x0100 // change triggered by power tree re-sync +#define kIOPMSyncNoChildNotify 0x0200 // sync root domain only, not entire tree +#define kIOPMSyncTellPowerDown 0x0400 // send the ask/will power off messages +#define kIOPMSyncCancelPowerDown 0x0800 // sleep cancel for maintenance wake enum { kDriverCallInformPreChange, @@ -298,73 +454,51 @@ struct DriverCallParam { IOReturn Result; }; -// values of outofbandparameter +// values of OutOfBandParameter enum { kNotifyApps, - kNotifyPriority + 
kNotifyPriority, + kNotifyCapabilityChangeApps, + kNotifyCapabilityChangePriority }; -typedef bool (*IOPMMessageFilter)(OSObject * object, void * context); +typedef bool (*IOPMMessageFilter)( + void * target, void * object, void * arg1, void * arg2, void * arg3 ); // used for applyToInterested struct IOPMInterestContext { - OSArray * responseFlags; - OSArray * notifyClients; - UInt16 serialNumber; - UInt16 counter; - UInt32 maxTimeRequested; - int msgType; - IOService * us; - unsigned long stateNumber; - IOPMPowerFlags stateFlags; - const char * errorLog; - IOPMMessageFilter filterFunc; + OSArray * responseArray; + OSArray * notifyClients; + uint16_t serialNumber; + uint8_t isPreChange; + uint8_t enableTracing; + uint32_t maxTimeRequested; + uint32_t messageType; + uint32_t notifyType; + IOService * us; + IOPMPowerStateIndex stateNumber; + IOPMPowerFlags stateFlags; + IOPMPowerChangeFlags changeFlags; + const char * errorLog; + IOPMMessageFilter messageFilter; }; -//********************************************************************************* +// assertPMDriverCall() options +enum { + kIOPMADC_NoInactiveCheck = 1 +}; + +//****************************************************************************** // PM Statistics & Diagnostics -//********************************************************************************* +//****************************************************************************** extern const OSSymbol *gIOPMStatsApplicationResponseTimedOut; extern const OSSymbol *gIOPMStatsApplicationResponseCancel; extern const OSSymbol *gIOPMStatsApplicationResponseSlow; -//********************************************************************************* -// PM command types -//********************************************************************************* - -enum { - /* Command Types */ - kIOPMRequestTypeInvalid = 0x00, - kIOPMRequestTypePMStop = 0x01, - kIOPMRequestTypeAddPowerChild1 = 0x02, - kIOPMRequestTypeAddPowerChild2 = 0x03, - 
kIOPMRequestTypeAddPowerChild3 = 0x04, - kIOPMRequestTypeRegisterPowerDriver = 0x05, - kIOPMRequestTypeAdjustPowerState = 0x06, - kIOPMRequestTypePowerDomainWillChange = 0x07, - kIOPMRequestTypePowerDomainDidChange = 0x08, - kIOPMRequestTypePowerOverrideOnPriv = 0x09, - kIOPMRequestTypePowerOverrideOffPriv = 0x0A, - kIOPMRequestTypeActivityTickle = 0x0B, - kIOPMRequestTypeRequestPowerState = 0x0C, - kIOPMRequestTypeSynchronizePowerTree = 0x0D, - kIOPMRequestTypeRequestPowerStateOverride = 0x0E, - kIOPMRequestTypeSetIdleTimerPeriod = 0x0F, - - /* Reply Types */ - kIOPMRequestTypeReplyStart = 0x80, - kIOPMRequestTypeAckPowerChange = 0x81, - kIOPMRequestTypeAckSetPowerState = 0x82, - kIOPMRequestTypeAllowPowerChange = 0x83, - kIOPMRequestTypeCancelPowerChange = 0x84, - kIOPMRequestTypeInterestChanged = 0x85, - kIOPMRequestTypeIdleCancel = 0x86 -}; - -//********************************************************************************* -// IOServicePM internal helper classes -//********************************************************************************* +//****************************************************************************** +// IOPMRequest +//****************************************************************************** typedef void (*IOPMCompletionAction)(void * target, void * param, IOReturn status); @@ -445,12 +579,16 @@ class IOPMRequest : public IOCommand static IOPMRequest * create( void ); bool init( IOService * owner, IOOptionBits type ); void reset( void ); - void attachNextRequest( IOPMRequest * next ); - void detachNextRequest( void ); - void attachRootRequest( IOPMRequest * root ); - void detachRootRequest( void ); + bool attachNextRequest( IOPMRequest * next ); + bool detachNextRequest( void ); + bool attachRootRequest( IOPMRequest * root ); + bool detachRootRequest( void ); }; +//****************************************************************************** +// IOPMRequestQueue 
+//****************************************************************************** + class IOPMRequestQueue : public IOEventSource { OSDeclareDefaultStructors( IOPMRequestQueue ) @@ -470,9 +608,14 @@ class IOPMRequestQueue : public IOEventSource static IOPMRequestQueue * create( IOService * inOwner, Action inAction ); void queuePMRequest( IOPMRequest * request ); void queuePMRequestChain( IOPMRequest ** requests, IOItemCount count ); - void signalWorkAvailable( void ); }; +//****************************************************************************** +// IOPMWorkQueue +//****************************************************************************** + +#define WORK_QUEUE_STATS 1 + class IOPMWorkQueue : public IOEventSource { OSDeclareDefaultStructors( IOPMWorkQueue ) @@ -480,24 +623,36 @@ class IOPMWorkQueue : public IOEventSource public: typedef bool (*Action)( IOService *, IOPMRequest *, IOPMWorkQueue * ); +#if WORK_QUEUE_STATS + uint64_t fStatCheckForWork; + uint64_t fStatScanEntries; + uint64_t fStatQueueEmpty; + uint64_t fStatNoWorkDone; +#endif + protected: - queue_head_t fWorkQueue; - Action fWorkAction; - Action fRetireAction; + queue_head_t fWorkQueue; + Action fWorkAction; + Action fRetireAction; + uint32_t fQueueLength; + uint32_t fConsumerCount; + volatile uint32_t fProducerCount; virtual bool checkForWork( void ); virtual bool init( IOService * inOwner, Action work, Action retire ); + bool checkRequestQueue( queue_head_t * queue, bool * empty ); public: static IOPMWorkQueue * create( IOService * inOwner, Action work, Action retire ); - void queuePMRequest( IOPMRequest * request ); - - inline boolean_t isEmpty( void ) - { - return queue_empty(&fWorkQueue); - } + bool queuePMRequest( IOPMRequest * request, IOServicePM * pwrMgt ); + void signalWorkAvailable( void ); + void incrementProducerCount( void ); }; +//****************************************************************************** +// IOPMCompletionQueue 
+//****************************************************************************** + class IOPMCompletionQueue : public IOEventSource { OSDeclareDefaultStructors( IOPMCompletionQueue ) @@ -513,7 +668,7 @@ class IOPMCompletionQueue : public IOEventSource public: static IOPMCompletionQueue * create( IOService * inOwner, Action inAction ); - void queuePMRequest( IOPMRequest * request ); + bool queuePMRequest( IOPMRequest * request ); }; #endif /* !_IOKIT_IOSERVICEPMPRIVATE_H */ diff --git a/iokit/Kernel/IOServicePrivate.h b/iokit/Kernel/IOServicePrivate.h index 1d455fbee..873d47660 100644 --- a/iokit/Kernel/IOServicePrivate.h +++ b/iokit/Kernel/IOServicePrivate.h @@ -55,6 +55,8 @@ enum { kIOServiceConfigState = 0x04000000, kIOServiceTermPhase2State = 0x01000000, kIOServiceTermPhase3State = 0x00800000, + kIOServiceTermPhase1State = 0x00400000, + kIOServiceTerm1WaiterState = 0x00200000 }; // options for terminate() @@ -167,6 +169,7 @@ class IOResources : public IOService public: static IOService * resources( void ); + virtual bool init( OSDictionary * dictionary = 0 ); virtual IOWorkLoop * getWorkLoop( ) const; virtual bool matchPropertyTable( OSDictionary * table ); virtual IOReturn setProperties( OSObject * properties ); @@ -195,6 +198,7 @@ class _IOOpenServiceIterator : public OSIterator extern const OSSymbol * gIOConsoleUsersKey; extern const OSSymbol * gIOConsoleSessionUIDKey; +extern const OSSymbol * gIOConsoleSessionAuditIDKey; extern const OSSymbol * gIOConsoleSessionOnConsoleKey; extern const OSSymbol * gIOConsoleSessionSecureInputPIDKey; diff --git a/iokit/Kernel/IOStartIOKit.cpp b/iokit/Kernel/IOStartIOKit.cpp index 7b70541d6..49397d4cb 100644 --- a/iokit/Kernel/IOStartIOKit.cpp +++ b/iokit/Kernel/IOStartIOKit.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2011 Apple Inc. All rights reserved. + * Copyright (c) 1998-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -38,7 +38,10 @@ #include #include #include +#include #include +#include +#include #include @@ -61,7 +64,7 @@ void IOKitInitializeTime( void ) t.tv_nsec = 0; IOService::waitForService( IOService::resourceMatching("IORTC"), &t ); -#ifdef ppc +#if defined(__i386__) || defined(__x86_64__) IOService::waitForService( IOService::resourceMatching("IONVRAM"), &t ); #endif @@ -79,6 +82,8 @@ void IOKitResetTime( void ) clock_get_calendar_microtime(&secs, &microsecs); gIOLastWakeTime.tv_sec = secs; gIOLastWakeTime.tv_usec = microsecs; + + IOService::updateConsoleUsers(NULL, kIOMessageSystemHasPoweredOn); } void iokit_post_constructor_init(void) @@ -90,8 +95,11 @@ void iokit_post_constructor_init(void) assert( root ); IOService::initialize(); IOCatalogue::initialize(); + IOStatistics::initialize(); + OSKext::initialize(); IOUserClient::initialize(); IOMemoryDescriptor::initialize(); + IORootParent::initialize(); // Initializes IOPMinformeeList class-wide shared lock IOPMinformeeList::getSharedRecursiveLock(); @@ -107,7 +115,6 @@ void iokit_post_constructor_init(void) root->setProperty( kIOKitDiagnosticsKey, obj ); obj->release(); } - } // From @@ -122,24 +129,20 @@ void StartIOKit( void * p1, void * p2, void * p3, void * p4 ) { IOPlatformExpertDevice * rootNub; int debugFlags; - uint32_t intThreshold; if( PE_parse_boot_argn( "io", &debugFlags, sizeof (debugFlags) )) - gIOKitDebug = debugFlags; - + gIOKitDebug = debugFlags; + if( PE_parse_boot_argn( "iotrace", &debugFlags, sizeof (debugFlags) )) gIOKitTrace = debugFlags; // Compat for boot-args gIOKitTrace |= (gIOKitDebug & kIOTraceCompatBootArgs); - - if( PE_parse_boot_argn( "iointthreshold", &intThreshold, sizeof (intThreshold) )) - gIOInterruptThresholdNS = intThreshold * 1000; // Check for the log synchronous bit set in io if (gIOKitDebug & kIOLogSynchronous) debug_mode = true; - + // // Have to start IOKit environment before we attempt to start // the C++ runtime environment. 
At some stage we have to clean up diff --git a/iokit/Kernel/IOStatistics.cpp b/iokit/Kernel/IOStatistics.cpp new file mode 100644 index 000000000..9235b293d --- /dev/null +++ b/iokit/Kernel/IOStatistics.cpp @@ -0,0 +1,1279 @@ +/* + * Copyright (c) 2010 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#if IOKITSTATS + +bool IOStatistics::enabled = false; + +uint32_t IOStatistics::sequenceID = 0; + +uint32_t IOStatistics::lastClassIndex = 0; +uint32_t IOStatistics::lastKextIndex = 0; + +uint32_t IOStatistics::loadedKexts = 0; +uint32_t IOStatistics::registeredClasses = 0; +uint32_t IOStatistics::registeredCounters = 0; +uint32_t IOStatistics::registeredWorkloops = 0; + +uint32_t IOStatistics::attachedEventSources = 0; + +IOWorkLoopDependency *IOStatistics::nextWorkLoopDependency = NULL; + +/* Logging */ + +#define LOG_LEVEL 0 + +#define LOG(level, format, ...) \ +do { \ + if (level <= LOG_LEVEL) \ + printf(format, ##__VA_ARGS__); \ +} while (0) + +/* Locks */ + +IORWLock *IOStatistics::lock = NULL; + +/* Kext tree */ + +KextNode *IOStatistics::kextHint = NULL; + +IOStatistics::KextTreeHead IOStatistics::kextHead = RB_INITIALIZER(&IOStatistics::kextHead); + +int IOStatistics::kextNodeCompare(KextNode *e1, KextNode *e2) +{ + if (e1->kext < e2->kext) + return -1; + else if (e1->kext > e2->kext) + return 1; + else + return 0; +} + +RB_GENERATE(IOStatistics::KextTree, KextNode, link, kextNodeCompare); + +/* Kext tree ordered by address */ + +IOStatistics::KextAddressTreeHead IOStatistics::kextAddressHead = RB_INITIALIZER(&IOStatistics::kextAddressHead); + +int IOStatistics::kextAddressNodeCompare(KextNode *e1, KextNode *e2) +{ + if (e1->address < e2->address) + return -1; + else if (e1->address > e2->address) + return 1; + else + return 0; +} + +RB_GENERATE(IOStatistics::KextAddressTree, KextNode, addressLink, kextAddressNodeCompare); + +/* Class tree */ + +IOStatistics::ClassTreeHead IOStatistics::classHead = RB_INITIALIZER(&IOStatistics::classHead); + +int IOStatistics::classNodeCompare(ClassNode *e1, ClassNode *e2) { + if (e1->metaClass < e2->metaClass) + return -1; + else if (e1->metaClass > e2->metaClass) + return 
1; + else + return 0; +} + +RB_GENERATE(IOStatistics::ClassTree, ClassNode, tLink, classNodeCompare); + +/* Workloop dependencies */ + +int IOWorkLoopCounter::loadTagCompare(IOWorkLoopDependency *e1, IOWorkLoopDependency *e2) { + if (e1->loadTag < e2->loadTag) + return -1; + else if (e1->loadTag > e2->loadTag) + return 1; + else + return 0; +} + +RB_GENERATE(IOWorkLoopCounter::DependencyTree, IOWorkLoopDependency, link, IOWorkLoopCounter::loadTagCompare); + +/* sysctl stuff */ + +static int +oid_sysctl(__unused struct sysctl_oid *oidp, __unused void *arg1, int arg2, struct sysctl_req *req) +{ + int error = EINVAL; + uint32_t request = arg2; + + switch (request) + { + case kIOStatisticsGeneral: + error = IOStatistics::getStatistics(req); + break; + case kIOStatisticsWorkLoop: + error = IOStatistics::getWorkLoopStatistics(req); + break; + case kIOStatisticsUserClient: + error = IOStatistics::getUserClientStatistics(req); + break; + default: + break; + } + + return error; +} + +SYSCTL_NODE(_debug, OID_AUTO, iokit_statistics, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "IOStatistics"); + +static SYSCTL_PROC(_debug_iokit_statistics, OID_AUTO, general, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, + 0, kIOStatisticsGeneral, oid_sysctl, "S", ""); + +static SYSCTL_PROC(_debug_iokit_statistics, OID_AUTO, workloop, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, + 0, kIOStatisticsWorkLoop, oid_sysctl, "S", ""); + +static SYSCTL_PROC(_debug_iokit_statistics, OID_AUTO, userclient, + CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, + 0, kIOStatisticsUserClient, oid_sysctl, "S", ""); + +void IOStatistics::initialize() +{ + if (enabled) { + return; + } + +#if DEVELOPMENT || DEBUG + /* Always enabled in development and debug builds. */ +#else + /* Only enabled in release builds if the boot argument is set. 
*/ + if (!(kIOStatistics & gIOKitDebug)) { + return; + } +#endif + + sysctl_register_oid(&sysctl__debug_iokit_statistics_general); + sysctl_register_oid(&sysctl__debug_iokit_statistics_workloop); + sysctl_register_oid(&sysctl__debug_iokit_statistics_userclient); + + lock = IORWLockAlloc(); + if (!lock) { + return; + } + + nextWorkLoopDependency = (IOWorkLoopDependency*)kalloc(sizeof(IOWorkLoopDependency)); + if (!nextWorkLoopDependency) { + return; + } + + enabled = true; +} + +void IOStatistics::onKextLoad(OSKext *kext, kmod_info_t *kmod_info) +{ + KextNode *ke; + + assert(kext && kmod_info); + + if (!enabled) { + return; + } + + LOG(1, "IOStatistics::onKextLoad: %s, tag %d, address 0x%llx, address end 0x%llx\n", + kext->getIdentifierCString(), kmod_info->id, (uint64_t)kmod_info->address, (uint64_t)(kmod_info->address + kmod_info->size)); + + ke = (KextNode *)kalloc(sizeof(KextNode)); + if (!ke) { + return; + } + + memset(ke, 0, sizeof(KextNode)); + + ke->kext = kext; + ke->loadTag = kmod_info->id; + ke->address = kmod_info->address; + ke->address_end = kmod_info->address + kmod_info->size; + + SLIST_INIT(&ke->classList); + TAILQ_INIT(&ke->userClientCallList); + + IORWLockWrite(lock); + + RB_INSERT(KextTree, &kextHead, ke); + RB_INSERT(KextAddressTree, &kextAddressHead, ke); + + sequenceID++; + loadedKexts++; + lastKextIndex++; + + IORWLockUnlock(lock); +} + +void IOStatistics::onKextUnload(OSKext *kext) +{ + KextNode sought, *found; + + assert(kext); + + if (!enabled) { + return; + } + + LOG(1, "IOStatistics::onKextUnload: %s\n", kext->getIdentifierCString()); + + IORWLockWrite(lock); + + sought.kext = kext; + found = RB_FIND(KextTree, &kextHead, &sought); + if (found) { + IOWorkLoopCounter *wlc; + IOUserClientProcessEntry *uce; + + /* Free up the list of counters */ + while ((wlc = SLIST_FIRST(&found->workLoopList))) { + SLIST_REMOVE_HEAD(&found->workLoopList, link); + kfree(wlc, sizeof(IOWorkLoopCounter)); + } + + /* Free up the user client list */ + while 
((uce = TAILQ_FIRST(&found->userClientCallList))) { + TAILQ_REMOVE(&found->userClientCallList, uce, link); + kfree(uce, sizeof(IOUserClientProcessEntry)); + } + + /* Remove from kext trees */ + RB_REMOVE(KextTree, &kextHead, found); + RB_REMOVE(KextAddressTree, &kextAddressHead, found); + + /* + * Clear a matching kextHint to avoid use after free in + * onClassAdded() for a class add after a KEXT unload. + */ + if (found == kextHint) { + kextHint = NULL; + } + + /* Finally, free the kext node */ + kfree(found, sizeof(KextNode)); + + sequenceID++; + loadedKexts--; + } + else { + panic("IOStatistics::onKextUnload: cannot find kext: %s", kext->getIdentifierCString()); + } + + IORWLockUnlock(lock); +} + +void IOStatistics::onClassAdded(OSKext *parentKext, OSMetaClass *metaClass) +{ + ClassNode *ce; + KextNode soughtKext, *foundKext = NULL; + + assert(parentKext && metaClass); + + if (!enabled) { + return; + } + + LOG(1, "IOStatistics::onClassAdded: %s\n", metaClass->getClassName()); + + ce = (ClassNode *)kalloc(sizeof(ClassNode)); + if (!ce) { + return; + } + + memset(ce, 0, sizeof(ClassNode)); + + IORWLockWrite(lock); + + /* Hinted? */ + if (kextHint && kextHint->kext == parentKext) { + foundKext = kextHint; + } + else { + soughtKext.kext = parentKext; + foundKext = RB_FIND(KextTree, &kextHead, &soughtKext); + } + + if (foundKext) { + ClassNode soughtClass, *foundClass = NULL; + const OSMetaClass *superClass; + + ce->metaClass = metaClass; + ce->classID = lastClassIndex++; + ce->parentKext = foundKext; + + /* Has superclass? */ + superClass = ce->metaClass->getSuperClass(); + if (superClass) { + soughtClass.metaClass = superClass; + foundClass = RB_FIND(ClassTree, &classHead, &soughtClass); + } + ce->superClassID = foundClass ? 
foundClass->classID : (uint32_t)(-1); + + SLIST_INIT(&ce->counterList); + SLIST_INIT(&ce->userClientList); + + RB_INSERT(ClassTree, &classHead, ce); + SLIST_INSERT_HEAD(&foundKext->classList, ce, lLink); + + foundKext->classes++; + + kextHint = foundKext; + + sequenceID++; + registeredClasses++; + } + else { + panic("IOStatistics::onClassAdded: cannot find parent kext: %s", parentKext->getIdentifierCString()); + } + + IORWLockUnlock(lock); +} + +void IOStatistics::onClassRemoved(OSKext *parentKext, OSMetaClass *metaClass) +{ + ClassNode sought, *found; + + assert(parentKext && metaClass); + + if (!enabled) { + return; + } + + LOG(1, "IOStatistics::onClassRemoved: %s\n", metaClass->getClassName()); + + IORWLockWrite(lock); + + sought.metaClass = metaClass; + found = RB_FIND(ClassTree, &classHead, &sought); + if (found) { + IOEventSourceCounter *esc; + IOUserClientCounter *ucc; + + /* Free up the list of counters */ + while ((esc = SLIST_FIRST(&found->counterList))) { + SLIST_REMOVE_HEAD(&found->counterList, link); + kfree(esc, sizeof(IOEventSourceCounter)); + } + + /* Free up the user client list */ + while ((ucc = SLIST_FIRST(&found->userClientList))) { + SLIST_REMOVE_HEAD(&found->userClientList, link); + kfree(ucc, sizeof(IOUserClientCounter)); + } + + /* Remove from class tree */ + RB_REMOVE(ClassTree, &classHead, found); + + /* Remove from parent */ + SLIST_REMOVE(&found->parentKext->classList, found, ClassNode, lLink); + + /* Finally, free the class node */ + kfree(found, sizeof(ClassNode)); + + sequenceID++; + registeredClasses--; + } + else { + panic("IOStatistics::onClassRemoved: cannot find class: %s", metaClass->getClassName()); + } + + IORWLockUnlock(lock); +} + +IOEventSourceCounter *IOStatistics::registerEventSource(OSObject *inOwner) +{ + IOEventSourceCounter *counter = NULL; + ClassNode sought, *found = NULL; + boolean_t createDummyCounter = FALSE; + + assert(inOwner); + + if (!enabled) { + return NULL; + } + + counter = 
(IOEventSourceCounter*)kalloc(sizeof(IOEventSourceCounter)); + if (!counter) { + return NULL; + } + + memset(counter, 0, sizeof(IOEventSourceCounter)); + + IORWLockWrite(lock); + + /* Workaround for - create a dummy counter when inOwner is bad. + * We use retainCount here as our best indication that the pointer is awry. + */ + if (inOwner->retainCount > 0xFFFFFF) { + kprintf("IOStatistics::registerEventSource - bad metaclass %p\n", inOwner); + createDummyCounter = TRUE; + } + else { + sought.metaClass = inOwner->getMetaClass(); + found = RB_FIND(ClassTree, &classHead, &sought); + } + + if (found) { + counter->parentClass = found; + SLIST_INSERT_HEAD(&found->counterList, counter, link); + registeredCounters++; + } + + if (!(createDummyCounter || found)) { + panic("IOStatistics::registerEventSource: cannot find parent class: %s", inOwner->getMetaClass()->getClassName()); + } + + IORWLockUnlock(lock); + + return counter; +} + +void IOStatistics::unregisterEventSource(IOEventSourceCounter *counter) +{ + if (!counter) { + return; + } + + IORWLockWrite(lock); + + if (counter->parentClass) { + SLIST_REMOVE(&counter->parentClass->counterList, counter, IOEventSourceCounter, link); + registeredCounters--; + } + kfree(counter, sizeof(IOEventSourceCounter)); + + IORWLockUnlock(lock); +} + +IOWorkLoopCounter* IOStatistics::registerWorkLoop(IOWorkLoop *workLoop) +{ + IOWorkLoopCounter *counter = NULL; + KextNode *found; + + assert(workLoop); + + if (!enabled) { + return NULL; + } + + counter = (IOWorkLoopCounter*)kalloc(sizeof(IOWorkLoopCounter)); + if (!counter) { + return NULL; + } + + memset(counter, 0, sizeof(IOWorkLoopCounter)); + + found = getKextNodeFromBacktrace(TRUE); + if (!found) { + panic("IOStatistics::registerWorkLoop: cannot find parent kext"); + } + + counter->parentKext = found; + counter->workLoop = workLoop; + RB_INIT(&counter->dependencyHead); + SLIST_INSERT_HEAD(&found->workLoopList, counter, link); + registeredWorkloops++; + + releaseKextNode(found); + + 
return counter;
+}
+
+/*
+ * Unlink a work loop counter from its parent kext's list and free it.
+ * A NULL counter is ignored.
+ */
+void IOStatistics::unregisterWorkLoop(IOWorkLoopCounter *counter)
+{
+	if (!counter) {
+		return;
+	}
+
+	IORWLockWrite(lock);
+
+	SLIST_REMOVE(&counter->parentKext->workLoopList, counter, IOWorkLoopCounter, link);
+	kfree(counter, sizeof(IOWorkLoopCounter));
+	registeredWorkloops--;
+
+	IORWLockUnlock(lock);
+}
+
+/*
+ * Allocate a zeroed counter for a user client and link it to the ClassNode
+ * that matches the client's metaclass.
+ *
+ * Returns NULL when statistics are disabled or the allocation fails.
+ * Panics when the metaclass is not present in the class tree.
+ */
+IOUserClientCounter *IOStatistics::registerUserClient(IOUserClient *userClient)
+{
+	ClassNode sought, *found;
+	IOUserClientCounter *counter = NULL;
+
+	assert(userClient);
+
+	if (!enabled) {
+		return NULL;
+	}
+
+	counter = (IOUserClientCounter*)kalloc(sizeof(IOUserClientCounter));
+	if (!counter) {
+		return NULL;
+	}
+
+	memset(counter, 0, sizeof(IOUserClientCounter));
+
+	IORWLockWrite(lock);
+
+	sought.metaClass = userClient->getMetaClass();
+
+	found = RB_FIND(ClassTree, &classHead, &sought);
+	if (found) {
+		counter->parentClass = found;
+		SLIST_INSERT_HEAD(&found->userClientList, counter, link);
+	}
+	else {
+		panic("IOStatistics::registerUserClient: cannot find parent class: %s", sought.metaClass->getClassName());
+	}
+
+	IORWLockUnlock(lock);
+
+	return counter;
+}
+
+/*
+ * Unlink a user client counter from its parent class and free it.
+ * A NULL counter is ignored.
+ */
+void IOStatistics::unregisterUserClient(IOUserClientCounter *counter)
+{
+	if (!counter) {
+		return;
+	}
+
+	IORWLockWrite(lock);
+
+	SLIST_REMOVE(&counter->parentClass->userClientList, counter, IOUserClientCounter, link);
+	kfree(counter, sizeof(IOUserClientCounter));
+
+	IORWLockUnlock(lock);
+}
+
+/*
+ * Account for an event source being attached to a work loop and record the
+ * load tag of the event source's kext as a dependency of that work loop.
+ *
+ * wlc - counter of the work loop; NULL is ignored.
+ * esc - counter of the event source; NULL is ignored because it is
+ *       dereferenced below to find the owning kext.
+ */
+void IOStatistics::attachWorkLoopEventSource(IOWorkLoopCounter *wlc, IOEventSourceCounter *esc)
+{
+	if (!wlc || !esc) {
+		return;
+	}
+
+	IORWLockWrite(lock);
+
+	/*
+	 * detachWorkLoopEventSource() decrements these unconditionally, so the
+	 * attach must be counted even when the dependency cannot be tracked.
+	 */
+	attachedEventSources++;
+	wlc->attachedEventSources++;
+
+	/* Replenish the spare node if an earlier allocation failed */
+	if (!nextWorkLoopDependency) {
+		nextWorkLoopDependency = (IOWorkLoopDependency*)kalloc(sizeof(IOWorkLoopDependency));
+	}
+
+	if (nextWorkLoopDependency) {
+		/* Track the kext dependency */
+		nextWorkLoopDependency->loadTag = esc->parentClass->parentKext->loadTag;
+		if (NULL == RB_INSERT(IOWorkLoopCounter::DependencyTree, &wlc->dependencyHead, nextWorkLoopDependency)) {
+			/* The spare node now belongs to the tree; line up the next one */
+			nextWorkLoopDependency = (IOWorkLoopDependency*)kalloc(sizeof(IOWorkLoopDependency));
+		}
+	}
+
+	/* Every path taken after IORWLockWrite() must reach this unlock */
+	IORWLockUnlock(lock);
+}
+
+/*
+ * Account for an event source being detached from a work loop and drop the
+ * dependency node recorded for the event source's kext, if one exists.
+ *
+ * wlc - counter of the work loop; NULL is ignored.
+ * esc - counter of the event source; NULL is ignored, mirroring
+ *       attachWorkLoopEventSource() so the counters stay balanced.
+ */
+void IOStatistics::detachWorkLoopEventSource(IOWorkLoopCounter *wlc, IOEventSourceCounter *esc)
+{
+	IOWorkLoopDependency sought, *found;
+
+	if (!wlc || !esc) {
+		return;
+	}
+
+	IORWLockWrite(lock);
+
+	attachedEventSources--;
+	wlc->attachedEventSources--;
+
+	sought.loadTag = esc->parentClass->parentKext->loadTag;
+
+	found = RB_FIND(IOWorkLoopCounter::DependencyTree, &wlc->dependencyHead, &sought);
+	if (found) {
+		RB_REMOVE(IOWorkLoopCounter::DependencyTree, &wlc->dependencyHead, found);
+		kfree(found, sizeof(IOWorkLoopDependency));
+	}
+
+	IORWLockUnlock(lock);
+}
+
+int IOStatistics::getStatistics(sysctl_req *req)
+{
+	int error;
+	uint32_t calculatedSize, size;
+	char *buffer, *ptr;
+	IOStatisticsHeader *header;
+
+	assert(IOStatistics::enabled && req);
+
+	IORWLockRead(IOStatistics::lock);
+
+	/* Work out how much we need to allocate. IOStatisticsKext is of variable size. */
+	calculatedSize = sizeof(IOStatisticsHeader) +
+					 sizeof(IOStatisticsGlobal) +
+					(sizeof(IOStatisticsKext) * loadedKexts) + (sizeof(uint32_t) * registeredClasses) +
+					(sizeof(IOStatisticsMemory) * loadedKexts) +
+					(sizeof(IOStatisticsClass) * registeredClasses) +
+					(sizeof(IOStatisticsCounter) * registeredClasses) +
+					(sizeof(IOStatisticsKextIdentifier) * loadedKexts) +
+					(sizeof(IOStatisticsClassName) * registeredClasses);
+
+	/* Size request? 
*/ + if (req->oldptr == USER_ADDR_NULL) { + error = SYSCTL_OUT(req, NULL, calculatedSize); + goto exit; + } + + /* Read only */ + if (req->newptr != USER_ADDR_NULL) { + error = EPERM; + goto exit; + } + + buffer = (char*)kalloc(calculatedSize); + if (!buffer) { + error = ENOMEM; + goto exit; + } + + memset(buffer, 0, calculatedSize); + + ptr = buffer; + + header = (IOStatisticsHeader*)((void*)ptr); + + header->sig = IOSTATISTICS_SIG; + header->ver = IOSTATISTICS_VER; + + header->seq = sequenceID; + + ptr += sizeof(IOStatisticsHeader); + + /* Global data - seq, timers, interrupts, etc) */ + header->globalStatsOffset = sizeof(IOStatisticsHeader); + size = copyGlobalStatistics((IOStatisticsGlobal*)((void*)ptr)); + ptr += size; + + /* Kext statistics */ + header->kextStatsOffset = header->globalStatsOffset + size; + size = copyKextStatistics((IOStatisticsKext*)((void*)ptr)); + ptr += size; + + /* Memory allocation info */ + header->memoryStatsOffset = header->kextStatsOffset + size; + size = copyMemoryStatistics((IOStatisticsMemory*)((void*)ptr)); + ptr += size; + + /* Class statistics */ + header->classStatsOffset = header->memoryStatsOffset + size; + size = copyClassStatistics((IOStatisticsClass*)((void*)ptr)); + ptr += size; + + /* Dynamic class counter data */ + header->counterStatsOffset = header->classStatsOffset + size; + size = copyCounterStatistics((IOStatisticsCounter*)((void*)ptr)); + ptr += size; + + /* Kext identifiers */ + header->kextIdentifiersOffset = header->counterStatsOffset + size; + size = copyKextIdentifiers((IOStatisticsKextIdentifier*)((void*)ptr)); + ptr += size; + + /* Class names */ + header->classNamesOffset = header->kextIdentifiersOffset + size; + size = copyClassNames((IOStatisticsClassName*)ptr); + ptr += size; + + LOG(2, "IOStatistics::getStatistics - calculatedSize 0x%x, kexts 0x%x, classes 0x%x.\n", + calculatedSize, loadedKexts, registeredClasses); + + assert( (uint32_t)(ptr - buffer) == calculatedSize ); + + error = SYSCTL_OUT(req, 
buffer, calculatedSize); + + kfree(buffer, calculatedSize); + +exit: + IORWLockUnlock(IOStatistics::lock); + return error; +} + +int IOStatistics::getWorkLoopStatistics(sysctl_req *req) +{ + int error; + uint32_t calculatedSize, size; + char *buffer; + IOStatisticsWorkLoopHeader *header; + + assert(IOStatistics::enabled && req); + + IORWLockRead(IOStatistics::lock); + + /* Approximate how much we need to allocate (worse case estimate) */ + calculatedSize = sizeof(IOStatisticsWorkLoop) * registeredWorkloops + + sizeof(uint32_t) * attachedEventSources; + + /* Size request? */ + if (req->oldptr == USER_ADDR_NULL) { + error = SYSCTL_OUT(req, NULL, calculatedSize); + goto exit; + } + + /* Read only */ + if (req->newptr != USER_ADDR_NULL) { + error = EPERM; + goto exit; + } + + buffer = (char*)kalloc(calculatedSize); + if (!buffer) { + error = ENOMEM; + goto exit; + } + + header = (IOStatisticsWorkLoopHeader*)((void*)buffer); + + header->sig = IOSTATISTICS_SIG_WORKLOOP; + header->ver = IOSTATISTICS_VER; + + header->seq = sequenceID; + + header->workloopCount = registeredWorkloops; + + size = copyWorkLoopStatistics(&header->workLoopStats); + + LOG(2, "IOStatistics::getWorkLoopStatistics: calculatedSize %d, size %d\n", calculatedSize, size); + + assert( size <= calculatedSize ); + + error = SYSCTL_OUT(req, buffer, size); + + kfree(buffer, calculatedSize); + +exit: + IORWLockUnlock(IOStatistics::lock); + return error; +} + +int IOStatistics::getUserClientStatistics(sysctl_req *req) +{ + int error; + uint32_t calculatedSize, size; + char *buffer; + uint32_t requestedLoadTag = 0; + IOStatisticsUserClientHeader *header; + + assert(IOStatistics::enabled && req); + + IORWLockRead(IOStatistics::lock); + + /* Work out how much we need to allocate */ + calculatedSize = sizeof(IOStatisticsUserClientHeader) + + sizeof(IOStatisticsUserClientCall) * IOKIT_STATISTICS_RECORDED_USERCLIENT_PROCS * loadedKexts; + + /* Size request? 
*/ + if (req->oldptr == USER_ADDR_NULL) { + error = SYSCTL_OUT(req, NULL, calculatedSize); + goto exit; + } + + /* Kext request (potentially) valid? */ + if (!req->newptr || req->newlen < sizeof(requestedLoadTag)) { + error = EINVAL; + goto exit; + } + + SYSCTL_IN(req, &requestedLoadTag, sizeof(requestedLoadTag)); + + LOG(2, "IOStatistics::getUserClientStatistics - requesting kext w/load tag: %d\n", requestedLoadTag); + + buffer = (char*)kalloc(calculatedSize); + if (!buffer) { + error = ENOMEM; + goto exit; + } + + header = (IOStatisticsUserClientHeader*)((void*)buffer); + + header->sig = IOSTATISTICS_SIG_USERCLIENT; + header->ver = IOSTATISTICS_VER; + + header->seq = sequenceID; + + header->processes = 0; + + size = copyUserClientStatistics(header, requestedLoadTag); + + assert((sizeof(IOStatisticsUserClientHeader) + size) <= calculatedSize); + + if (size) { + error = SYSCTL_OUT(req, buffer, sizeof(IOStatisticsUserClientHeader) + size); + } + else { + error = EINVAL; + } + + kfree(buffer, calculatedSize); + +exit: + IORWLockUnlock(IOStatistics::lock); + return error; +} + +uint32_t IOStatistics::copyGlobalStatistics(IOStatisticsGlobal *stats) +{ + stats->kextCount = loadedKexts; + stats->classCount = registeredClasses; + stats->workloops = registeredWorkloops; + + return sizeof(IOStatisticsGlobal); +} + +uint32_t IOStatistics::copyKextStatistics(IOStatisticsKext *stats) +{ + KextNode *ke; + ClassNode *ce; + uint32_t index = 0; + + RB_FOREACH(ke, KextTree, &kextHead) { + stats->loadTag = ke->loadTag; + ke->kext->getSizeInfo(&stats->loadSize, &stats->wiredSize); + + stats->classes = ke->classes; + + /* Append indices of owned classes */ + SLIST_FOREACH(ce, &ke->classList, lLink) { + stats->classIndexes[index++] = ce->classID; + } + + stats = (IOStatisticsKext *)((void*)((char*)stats + sizeof(IOStatisticsKext) + (ke->classes * sizeof(uint32_t)))); + } + + return (sizeof(IOStatisticsKext) * loadedKexts + sizeof(uint32_t) * registeredClasses); +} + +uint32_t 
IOStatistics::copyMemoryStatistics(IOStatisticsMemory *stats)
+{
+	KextNode *ke;
+
+	/* One fixed-size record per kext, in kext tree order */
+	RB_FOREACH(ke, KextTree, &kextHead) {
+		stats->allocatedSize = ke->memoryCounters[kIOStatisticsMalloc];
+		stats->freedSize = ke->memoryCounters[kIOStatisticsFree];
+		stats->allocatedAlignedSize = ke->memoryCounters[kIOStatisticsMallocAligned];
+		stats->freedAlignedSize = ke->memoryCounters[kIOStatisticsFreeAligned];
+		stats->allocatedContiguousSize = ke->memoryCounters[kIOStatisticsMallocContiguous];
+		stats->freedContiguousSize = ke->memoryCounters[kIOStatisticsFreeContiguous];
+		stats->allocatedPageableSize = ke->memoryCounters[kIOStatisticsMallocPageable];
+		stats->freedPageableSize = ke->memoryCounters[kIOStatisticsFreePageable];
+		stats++;
+	}
+
+	return (sizeof(IOStatisticsMemory) * loadedKexts);
+}
+
+/*
+ * Write one IOStatisticsClass record per registered class, grouped by kext.
+ * Returns the number of bytes the records are expected to occupy.
+ */
+uint32_t IOStatistics::copyClassStatistics(IOStatisticsClass *stats)
+{
+	KextNode *ke;
+	ClassNode *ce;
+
+	RB_FOREACH(ke, KextTree, &kextHead) {
+		SLIST_FOREACH(ce, &ke->classList, lLink) {
+			stats->classID = ce->classID;
+			stats->superClassID = ce->superClassID;
+			stats->classSize = ce->metaClass->getClassSize();
+
+			stats++;
+		}
+	}
+
+	return sizeof(IOStatisticsClass) * registeredClasses;
+}
+
+/*
+ * Write one IOStatisticsCounter record per registered class, summing the
+ * class's user client and event source counters into it. The caller is
+ * expected to pass zeroed records because the totals are accumulated
+ * with +=.
+ */
+uint32_t IOStatistics::copyCounterStatistics(IOStatisticsCounter *stats)
+{
+	KextNode *ke;
+	ClassNode *ce;
+
+	RB_FOREACH(ke, KextTree, &kextHead) {
+		SLIST_FOREACH(ce, &ke->classList, lLink) {
+			IOUserClientCounter *userClientCounter;
+			IOEventSourceCounter *counter;
+
+			stats->classID = ce->classID;
+			stats->classInstanceCount = ce->metaClass->getInstanceCount();
+
+			IOStatisticsUserClients *uc = &stats->userClientStatistics;
+
+			/* User client counters */
+			SLIST_FOREACH(userClientCounter, &ce->userClientList, link) {
+				uc->clientCalls += userClientCounter->clientCalls;
+				uc->created++;
+			}
+
+			IOStatisticsInterruptEventSources *iec = &stats->interruptEventSourceStatistics;
+			IOStatisticsInterruptEventSources *fiec = &stats->filterInterruptEventSourceStatistics;
+			IOStatisticsTimerEventSources *tec = &stats->timerEventSourceStatistics;
+			IOStatisticsCommandGates *cgc = &stats->commandGateStatistics;
+			IOStatisticsCommandQueues *cqc = &stats->commandQueueStatistics;
+			IOStatisticsDerivedEventSources *dec = &stats->derivedEventSourceStatistics;
+
+			/* Event source counters */
+			SLIST_FOREACH(counter, &ce->counterList, link) {
+				switch (counter->type) {
+					case kIOStatisticsInterruptEventSourceCounter:
+						iec->created++;
+						iec->produced += counter->u.interrupt.produced;
+						iec->checksForWork += counter->u.interrupt.checksForWork;
+						break;
+					case kIOStatisticsFilterInterruptEventSourceCounter:
+						fiec->created++;
+						fiec->produced += counter->u.filter.produced;
+						fiec->checksForWork += counter->u.filter.checksForWork;
+						break;
+					case kIOStatisticsTimerEventSourceCounter:
+						tec->created++;
+						tec->timeouts += counter->u.timer.timeouts;
+						tec->checksForWork += counter->u.timer.checksForWork;
+						tec->timeOnGate += counter->timeOnGate;
+						tec->closeGateCalls += counter->closeGateCalls;
+						tec->openGateCalls += counter->openGateCalls;
+						break;
+					case kIOStatisticsCommandGateCounter:
+						cgc->created++;
+						cgc->timeOnGate += counter->timeOnGate;
+						cgc->actionCalls += counter->u.commandGate.actionCalls;
+						break;
+					case kIOStatisticsCommandQueueCounter:
+						cqc->created++;
+						cqc->actionCalls += counter->u.commandQueue.actionCalls;
+						break;
+					case kIOStatisticsDerivedEventSourceCounter:
+						dec->created++;
+						dec->timeOnGate += counter->timeOnGate;
+						dec->closeGateCalls += counter->closeGateCalls;
+						dec->openGateCalls += counter->openGateCalls;
+						break;
+					default:
+						break;
+				}
+			}
+
+			stats++;
+		}
+	}
+
+	return sizeof(IOStatisticsCounter) * registeredClasses;
+}
+
+/*
+ * Write the bundle identifier of every loaded kext, one fixed-size record
+ * each. Identifiers that do not fit are truncated.
+ */
+uint32_t IOStatistics::copyKextIdentifiers(IOStatisticsKextIdentifier *kextIDs)
+{
+	KextNode *ke;
+
+	RB_FOREACH(ke, KextTree, &kextHead) {
+		strncpy(kextIDs->identifier, ke->kext->getIdentifierCString(), kIOStatisticsDriverNameLength);
+		/* strncpy leaves the field unterminated when the source fills it */
+		kextIDs->identifier[kIOStatisticsDriverNameLength - 1] = '\0';
+		kextIDs++;
+	}
+
+	return (sizeof(IOStatisticsKextIdentifier) * loadedKexts);
+}
+
+/*
+ * Write the name of every registered class, one fixed-size record each,
+ * grouped by kext. Names that do not fit are truncated.
+ */
+uint32_t IOStatistics::copyClassNames(IOStatisticsClassName *classNames)
+{
+	KextNode *ke;
+	ClassNode *ce;
+
+	RB_FOREACH(ke, KextTree, &kextHead) {
+		SLIST_FOREACH(ce, &ke->classList, lLink) {
+			strncpy(classNames->name, ce->metaClass->getClassName(), kIOStatisticsClassNameLength);
+			/* strncpy leaves the field unterminated when the source fills it */
+			classNames->name[kIOStatisticsClassNameLength - 1] = '\0';
+			classNames++;
+		}
+	}
+
+	return (sizeof(IOStatisticsClassName) * registeredClasses);
+}
+
+/*
+ * Write one variable-length IOStatisticsWorkLoop record per registered work
+ * loop, each followed by the load tags of the kexts it depends on. Returns
+ * the number of bytes actually written.
+ */
+uint32_t IOStatistics::copyWorkLoopStatistics(IOStatisticsWorkLoop *stats)
+{
+	KextNode *ke;
+	IOWorkLoopCounter *wlc;
+	IOWorkLoopDependency *dependentNode;
+	uint32_t size, accumulatedSize = 0;
+
+	RB_FOREACH(ke, KextTree, &kextHead) {
+		SLIST_FOREACH(wlc, &ke->workLoopList, link) {
+			stats->kextLoadTag = ke->loadTag;
+			stats->attachedEventSources = wlc->attachedEventSources;
+			stats->timeOnGate = wlc->timeOnGate;
+			stats->dependentKexts = 0;
+			RB_FOREACH(dependentNode, IOWorkLoopCounter::DependencyTree, &wlc->dependencyHead) {
+				stats->dependentKextLoadTags[stats->dependentKexts] = dependentNode->loadTag;
+				stats->dependentKexts++;
+			}
+
+			/* Record size depends on how many dependencies were appended */
+			size = sizeof(IOStatisticsWorkLoop) + (sizeof(uint32_t) * stats->dependentKexts);
+
+			accumulatedSize += size;
+			stats = (IOStatisticsWorkLoop*)((void*)((char*)stats + size));
+		}
+	}
+
+	return accumulatedSize;
+}
+
+/*
+ * Copy the recorded user client callers of the kext identified by loadTag
+ * into stats->userClientCalls[] and count them in stats->processes.
+ * Returns the number of bytes of call records written, or 0 when no kext
+ * carries that load tag.
+ */
+uint32_t IOStatistics::copyUserClientStatistics(IOStatisticsUserClientHeader *stats, uint32_t loadTag)
+{
+	KextNode *sought, *found = NULL;
+	uint32_t procs = 0;
+	IOUserClientProcessEntry *processEntry;
+
+	RB_FOREACH(sought, KextTree, &kextHead) {
+		if (sought->loadTag == loadTag) {
+			found = sought;
+			break;
+		}
+	}
+
+	if (!found) {
+		return 0;
+	}
+
+	TAILQ_FOREACH(processEntry, &found->userClientCallList, link) {
+		strncpy(stats->userClientCalls[procs].processName, processEntry->processName, kIOStatisticsProcessNameLength);
+		/* strncpy leaves the field unterminated when the source fills it */
+		stats->userClientCalls[procs].processName[kIOStatisticsProcessNameLength - 1] = '\0';
+		stats->userClientCalls[procs].pid = processEntry->pid;
+		stats->userClientCalls[procs].calls = processEntry->calls;
+		stats->processes++;
+		procs++;
+	}
+ + return sizeof(IOStatisticsUserClientCall) * stats->processes; +} + +void IOStatistics::storeUserClientCallInfo(IOUserClient *userClient, IOUserClientCounter *counter) +{ + OSString *ossUserClientCreator = NULL; + int32_t pid = -1; + KextNode *parentKext; + IOUserClientProcessEntry *entry, *nextEntry, *prevEntry = NULL; + uint32_t count = 0; + const char *ptr = NULL; + OSObject *obj; + + /* TODO: see if this can be more efficient */ + obj = userClient->copyProperty("IOUserClientCreator", + gIOServicePlane, + kIORegistryIterateRecursively | kIORegistryIterateParents); + + if (!obj) + goto err_nounlock; + + ossUserClientCreator = OSDynamicCast(OSString, obj); + + if (ossUserClientCreator) { + uint32_t len, lenIter = 0; + + ptr = ossUserClientCreator->getCStringNoCopy(); + len = ossUserClientCreator->getLength(); + + while ((*ptr != ' ') && (lenIter < len)) { + ptr++; + lenIter++; + } + + if (lenIter < len) { + ptr++; // Skip the space + lenIter++; + pid = 0; + while ( (*ptr != ',') && (lenIter < len)) { + pid = pid*10 + (*ptr - '0'); + ptr++; + lenIter++; + } + + if(lenIter == len) { + pid = -1; + } else { + ptr += 2; + } + } + } + + if (-1 == pid) + goto err_nounlock; + + IORWLockWrite(lock); + + parentKext = counter->parentClass->parentKext; + + TAILQ_FOREACH(entry, &parentKext->userClientCallList, link) { + if (entry->pid == pid) { + /* Found, so increment count and move to the head */ + entry->calls++; + if (count) { + TAILQ_REMOVE(&parentKext->userClientCallList, entry, link); + break; + } + else { + /* At the head already, so increment and return */ + goto err_unlock; + } + } + + count++; + } + + if (!entry) { + if (count == IOKIT_STATISTICS_RECORDED_USERCLIENT_PROCS) { + /* Max elements hit, so reuse the last */ + entry = TAILQ_LAST(&parentKext->userClientCallList, ProcessEntryList); + TAILQ_REMOVE(&parentKext->userClientCallList, entry, link); + } + else { + /* Otherwise, allocate a new entry */ + entry = 
(IOUserClientProcessEntry*)kalloc(sizeof(IOUserClientProcessEntry)); + if (!entry) { + IORWLockUnlock(lock); + return; + } + } + + strncpy(entry->processName, ptr, kIOStatisticsProcessNameLength); + entry->pid = pid; + entry->calls = 1; + } + + TAILQ_FOREACH(nextEntry, &parentKext->userClientCallList, link) { + if (nextEntry->calls <= entry->calls) + break; + + prevEntry = nextEntry; + } + + if (!prevEntry) + TAILQ_INSERT_HEAD(&parentKext->userClientCallList, entry, link); + else + TAILQ_INSERT_AFTER(&parentKext->userClientCallList, prevEntry, entry, link); + +err_unlock: + IORWLockUnlock(lock); + +err_nounlock: + if (obj) + obj->release(); +} + +void IOStatistics::countUserClientCall(IOUserClient *client) { + IOUserClient::ExpansionData *data; + IOUserClientCounter *counter; + + /* Guard against an uninitialized client object - */ + if (!(data = client->reserved)) { + return; + } + + if ((counter = data->counter)) { + storeUserClientCallInfo(client, counter); + OSIncrementAtomic(&counter->clientCalls); + } +} + +KextNode *IOStatistics::getKextNodeFromBacktrace(boolean_t write) { + const uint32_t btMin = 3; + + void *bt[16]; + unsigned btCount = sizeof(bt) / sizeof(bt[0]); + vm_offset_t *scanAddr = NULL; + uint32_t i; + KextNode *found = NULL, *ke = NULL; + + btCount = OSBacktrace(bt, btCount); + + if (write) { + IORWLockWrite(lock); + } else { + IORWLockRead(lock); + } + + /* Ignore first levels */ + scanAddr = (vm_offset_t *)&bt[btMin - 1]; + + for (i = 0; i < btCount; i++, scanAddr++) { + ke = RB_ROOT(&kextAddressHead); + while (ke) { + if (*scanAddr < ke->address) { + ke = RB_LEFT(ke, addressLink); + } + else { + if ((*scanAddr < ke->address_end) && (*scanAddr >= ke->address)) { + if (!ke->kext->isKernelComponent()) { + return ke; + } else { + found = ke; + } + } + ke = RB_RIGHT(ke, addressLink); + } + } + } + + if (!found) { + IORWLockUnlock(lock); + } + + return found; +} + +void IOStatistics::releaseKextNode(KextNode *node) { +#pragma unused(node) + 
IORWLockUnlock(lock); +} + +/* IOLib allocations */ +void IOStatistics::countAlloc(uint32_t index, vm_size_t size) { + KextNode *ke; + + if (!enabled) { + return; + } + + ke = getKextNodeFromBacktrace(FALSE); + if (ke) { + OSAddAtomic(size, &ke->memoryCounters[index]); + releaseKextNode(ke); + } +} + +#endif /* IOKITSTATS */ diff --git a/iokit/Kernel/IOTimerEventSource.cpp b/iokit/Kernel/IOTimerEventSource.cpp index 112deeee7..c71feccf0 100644 --- a/iokit/Kernel/IOTimerEventSource.cpp +++ b/iokit/Kernel/IOTimerEventSource.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2000, 2009-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -53,13 +53,42 @@ OSMetaClassDefineReservedUnused(IOTimerEventSource, 5); OSMetaClassDefineReservedUnused(IOTimerEventSource, 6); OSMetaClassDefineReservedUnused(IOTimerEventSource, 7); +#if IOKITSTATS + +#define IOStatisticsInitializeCounter() \ +do { \ + IOStatistics::setCounterType(IOEventSource::reserved->counter, kIOStatisticsTimerEventSourceCounter); \ +} while (0) + +#define IOStatisticsOpenGate() \ +do { \ + IOStatistics::countOpenGate(me->IOEventSource::reserved->counter); \ +} while (0) + +#define IOStatisticsCloseGate() \ +do { \ + IOStatistics::countCloseGate(me->IOEventSource::reserved->counter); \ +} while (0) + +#define IOStatisticsTimeout() \ +do { \ + IOStatistics::countTimerTimeout(me->IOEventSource::reserved->counter); \ +} while (0) + +#else + +#define IOStatisticsInitializeCounter() +#define IOStatisticsOpenGate() +#define IOStatisticsCloseGate() +#define IOStatisticsTimeout() + +#endif /* IOKITSTATS */ + // // reserved != 0 means IOTimerEventSource::timeoutAndRelease is being used, // not a subclassed implementation. // -bool IOTimerEventSource::checkForWork() { return false; } - // Timeout handler function. This function is called by the kernel when // the timeout interval expires. 
// @@ -67,6 +96,8 @@ void IOTimerEventSource::timeout(void *self) { IOTimerEventSource *me = (IOTimerEventSource *) self; + IOStatisticsTimeout(); + if (me->enabled && me->action) { IOWorkLoop * @@ -75,6 +106,7 @@ void IOTimerEventSource::timeout(void *self) { Action doit; wl->closeGate(); + IOStatisticsCloseGate(); doit = (Action) me->action; if (doit && me->enabled && AbsoluteTime_to_scalar(&me->abstime)) { @@ -82,7 +114,7 @@ void IOTimerEventSource::timeout(void *self) if (trace) IOTimeStampStartConstant(IODBG_TIMES(IOTIMES_ACTION), - (uintptr_t) doit, (uintptr_t) me->owner); + (uintptr_t) doit, (uintptr_t) me->owner); (*doit)(me->owner, me); @@ -90,6 +122,7 @@ void IOTimerEventSource::timeout(void *self) IOTimeStampEndConstant(IODBG_TIMES(IOTIMES_ACTION), (uintptr_t) doit, (uintptr_t) me->owner); } + IOStatisticsOpenGate(); wl->openGate(); } } @@ -102,6 +135,8 @@ void IOTimerEventSource::timeoutAndRelease(void * self, void * c) must be cast to "long" before, in order to tell GCC we're not truncating a pointer. 
*/ SInt32 count = (SInt32) (long) c; + IOStatisticsTimeout(); + if (me->enabled && me->action) { IOWorkLoop * @@ -110,6 +145,7 @@ void IOTimerEventSource::timeoutAndRelease(void * self, void * c) { Action doit; wl->closeGate(); + IOStatisticsCloseGate(); doit = (Action) me->action; if (doit && (me->reserved->calloutGeneration == count)) { @@ -117,7 +153,7 @@ void IOTimerEventSource::timeoutAndRelease(void * self, void * c) if (trace) IOTimeStampStartConstant(IODBG_TIMES(IOTIMES_ACTION), - (uintptr_t) doit, (uintptr_t) me->owner); + (uintptr_t) doit, (uintptr_t) me->owner); (*doit)(me->owner, me); @@ -125,6 +161,7 @@ void IOTimerEventSource::timeoutAndRelease(void * self, void * c) IOTimeStampEndConstant(IODBG_TIMES(IOTIMES_ACTION), (uintptr_t) doit, (uintptr_t) me->owner); } + IOStatisticsOpenGate(); wl->openGate(); } } @@ -151,6 +188,8 @@ bool IOTimerEventSource::init(OSObject *inOwner, Action inAction) if (!calloutEntry) return false; + IOStatisticsInitializeCounter(); + return true; } diff --git a/iokit/Kernel/IOUserClient.cpp b/iokit/Kernel/IOUserClient.cpp index 084471c3c..f031afd66 100644 --- a/iokit/Kernel/IOUserClient.cpp +++ b/iokit/Kernel/IOUserClient.cpp @@ -37,8 +37,22 @@ #include #include #include +#include +#include #include #include +#include + +#if CONFIG_MACF + +extern "C" { +#include +}; +#include + +#define IOMACF_LOG 0 + +#endif /* CONFIG_MACF */ #include @@ -57,6 +71,32 @@ enum kIOUCAsync64Flag = 1ULL }; +#if IOKITSTATS + +#define IOStatisticsRegisterCounter() \ +do { \ + reserved->counter = IOStatistics::registerUserClient(this); \ +} while (0) + +#define IOStatisticsUnregisterCounter() \ +do { \ + if (reserved) \ + IOStatistics::unregisterUserClient(reserved->counter); \ +} while (0) + +#define IOStatisticsClientCall() \ +do { \ + IOStatistics::countUserClientCall(client); \ +} while (0) + +#else + +#define IOStatisticsRegisterCounter() +#define IOStatisticsUnregisterCounter() +#define IOStatisticsClientCall() + +#endif /* IOKITSTATS */ + /* 
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ // definitions we should get from osfmk @@ -383,9 +423,11 @@ iokit_client_died( io_object_t obj, ipc_port_t /* port */, if( IKOT_IOKIT_CONNECT == type) { - if( (client = OSDynamicCast( IOUserClient, obj ))) + if( (client = OSDynamicCast( IOUserClient, obj ))) { + IOStatisticsClientCall(); client->clientDied(); } + } else if( IKOT_IOKIT_OBJECT == type) { if( (map = OSDynamicCast( IOMemoryMap, obj ))) @@ -851,21 +893,27 @@ static OSDictionary * CopyConsoleUser(UInt32 uid) return user; } -static bool IOUCIsBackgroundTask(task_t task, bool * isBg) +static OSDictionary * CopyUserOnConsole(void) { - kern_return_t kr; - task_category_policy_data_t info; - mach_msg_type_number_t count = TASK_CATEGORY_POLICY_COUNT; - boolean_t get_default = false; - - kr = task_policy_get(current_task(), - TASK_CATEGORY_POLICY, - (task_policy_t) &info, - &count, - &get_default); - - *isBg = ((KERN_SUCCESS == kr) && (info.role == TASK_THROTTLE_APPLICATION)); - return (kr); + OSArray * array; + OSDictionary * user = 0; + + if ((array = OSDynamicCast(OSArray, + IORegistryEntry::getRegistryRoot()->copyProperty(gIOConsoleUsersKey)))) + { + for (unsigned int idx = 0; + (user = OSDynamicCast(OSDictionary, array->getObject(idx))); + idx++) + { + if (kOSBooleanTrue == user->getObject(gIOConsoleSessionOnConsoleKey)) + { + user->retain(); + break; + } + } + array->release(); + } + return (user); } IOReturn IOUserClient::clientHasPrivilege( void * securityToken, @@ -882,12 +930,41 @@ IOReturn IOUserClient::clientHasPrivilege( void * securityToken, if (!strncmp(privilegeName, kIOClientPrivilegeForeground, sizeof(kIOClientPrivilegeForeground))) { - bool isBg; - kern_return_t kr = IOUCIsBackgroundTask(current_task(), &isBg); + /* is graphics access denied for current task? 
*/ + if (proc_get_task_selfgpuacc_deny() != 0) + return (kIOReturnNotPrivileged); + else + return (kIOReturnSuccess); + } - if (KERN_SUCCESS != kr) - return (kr); - return (isBg ? kIOReturnNotPrivileged : kIOReturnSuccess); + if (!strncmp(privilegeName, kIOClientPrivilegeConsoleSession, + sizeof(kIOClientPrivilegeConsoleSession))) + { + kauth_cred_t cred; + proc_t p; + + task = (task_t) securityToken; + if (!task) + task = current_task(); + p = (proc_t) get_bsdtask_info(task); + kr = kIOReturnNotPrivileged; + + if (p && (cred = kauth_cred_proc_ref(p))) + { + user = CopyUserOnConsole(); + if (user) + { + OSNumber * num; + if ((num = OSDynamicCast(OSNumber, user->getObject(gIOConsoleSessionAuditIDKey))) + && (cred->cr_audit.as_aia_p->ai_asid == (au_asid_t) num->unsigned32BitValue())) + { + kr = kIOReturnSuccess; + } + user->release(); + } + kauth_cred_unref(&cred); + } + return (kr); } if ((secureConsole = !strncmp(privilegeName, kIOClientPrivilegeSecureConsoleProcess, @@ -895,7 +972,7 @@ IOReturn IOUserClient::clientHasPrivilege( void * securityToken, task = (task_t)((IOUCProcessToken *)securityToken)->token; else task = (task_t)securityToken; - + count = TASK_SECURITY_TOKEN_COUNT; kr = task_info( task, TASK_SECURITY_TOKEN, (task_info_t) &token, &count ); @@ -935,28 +1012,28 @@ IOReturn IOUserClient::clientHasPrivilege( void * securityToken, bool IOUserClient::init() { - if( getPropertyTable()) - return true; - else - return super::init(); + if (getPropertyTable() || super::init()) + return reserve(); + + return false; } bool IOUserClient::init(OSDictionary * dictionary) { - if( getPropertyTable()) - return true; - else - return super::init(dictionary); + if (getPropertyTable() || super::init(dictionary)) + return reserve(); + + return false; } bool IOUserClient::initWithTask(task_t owningTask, void * securityID, UInt32 type ) -{ - if( getPropertyTable()) - return true; - else - return super::init(); +{ + if (getPropertyTable() || super::init()) + return reserve(); 
+ + return false; } bool IOUserClient::initWithTask(task_t owningTask, @@ -972,11 +1049,30 @@ bool IOUserClient::initWithTask(task_t owningTask, return( ok ); } +bool IOUserClient::reserve() +{ + if(!reserved) { + reserved = IONew(ExpansionData, 1); + if (!reserved) { + return false; + } + } + + IOStatisticsRegisterCounter(); + + return true; +} + void IOUserClient::free() { if( mappings) mappings->release(); + + IOStatisticsUnregisterCounter(); + if (reserved) + IODelete(reserved, ExpansionData, 1); + super::free(); } @@ -1802,6 +1898,7 @@ kern_return_t is_io_connect_get_notification_semaphore( { CHECK( IOUserClient, connection, client ); + IOStatisticsClientCall(); return( client->getNotificationSemaphore( (UInt32) notification_type, semaphore )); } @@ -2211,11 +2308,17 @@ kern_return_t is_io_registry_entry_set_properties obj = OSUnserializeXML( (const char *) data ); vm_deallocate( kernel_map, data, propertiesCnt ); - if( obj) { + if (!obj) + res = kIOReturnBadArgument; +#if CONFIG_MACF + else if (0 != mac_iokit_check_set_properties(kauth_cred_get(), + registry_entry, obj)) + res = kIOReturnNotPermitted; +#endif + else res = entry->setProperties( obj ); - obj->release(); - } else - res = kIOReturnBadArgument; + if (obj) + obj->release(); } else res = err; @@ -2305,30 +2408,6 @@ kern_return_t is_io_service_request_probe( return( service->requestProbe( options )); } - -/* Routine io_service_open */ -kern_return_t is_io_service_open( - io_object_t _service, - task_t owningTask, - uint32_t connect_type, - io_object_t *connection ) -{ - IOUserClient * client; - IOReturn err; - - CHECK( IOService, _service, service ); - - err = service->newUserClient( owningTask, (void *) owningTask, - connect_type, 0, &client ); - - if( err == kIOReturnSuccess) { - assert( OSDynamicCast(IOUserClient, client) ); - *connection = client; - } - - return( err); -} - /* Routine io_service_open_ndr */ kern_return_t is_io_service_open_extended( io_object_t _service, @@ -2404,13 +2483,17 @@ 
kern_return_t is_io_service_open_extended( disallowAccess = (crossEndian && (kOSBooleanTrue != service->getProperty(kIOUserClientCrossEndianCompatibleKey)) && (kOSBooleanTrue != client->getProperty(kIOUserClientCrossEndianCompatibleKey))); - - if (disallowAccess) + if (disallowAccess) res = kIOReturnUnsupported; +#if CONFIG_MACF + else if (0 != mac_iokit_check_open(kauth_cred_get(), client, connect_type)) + res = kIOReturnNotPermitted; +#endif + if (kIOReturnSuccess != res) { + IOStatisticsClientCall(); client->clientClose(); client->release(); client = 0; - res = kIOReturnUnsupported; break; } client->sharedInstance = (0 != client->getProperty(kIOUserClientSharedInstanceKey)); @@ -2440,6 +2523,7 @@ kern_return_t is_io_service_close( CHECK( IOUserClient, connection, client ); + IOStatisticsClientCall(); client->clientClose(); return( kIOReturnSuccess ); @@ -2472,6 +2556,7 @@ kern_return_t is_io_connect_set_notification_port( { CHECK( IOUserClient, connection, client ); + IOStatisticsClientCall(); return( client->registerNotificationPort( port, notification_type, (io_user_reference_t) reference )); } @@ -2485,6 +2570,7 @@ kern_return_t is_io_connect_set_notification_port_64( { CHECK( IOUserClient, connection, client ); + IOStatisticsClientCall(); return( client->registerNotificationPort( port, notification_type, reference )); } @@ -2505,6 +2591,7 @@ kern_return_t is_io_connect_map_memory_into_task CHECK( IOUserClient, connection, client ); + IOStatisticsClientCall(); map = client->mapClientMemory64( memory_type, into_task, flags, *address ); if( map) { @@ -2563,6 +2650,8 @@ kern_return_t is_io_connect_map_memory( return (err); } +} /* extern "C" */ + IOMemoryMap * IOUserClient::removeMappingForDescriptor(IOMemoryDescriptor * mem) { OSIterator * iter; @@ -2590,6 +2679,8 @@ IOMemoryMap * IOUserClient::removeMappingForDescriptor(IOMemoryDescriptor * mem) return (map); } +extern "C" { + /* Routine io_connect_unmap_memory_from_task */ kern_return_t 
is_io_connect_unmap_memory_from_task ( @@ -2605,6 +2696,7 @@ kern_return_t is_io_connect_unmap_memory_from_task CHECK( IOUserClient, connection, client ); + IOStatisticsClientCall(); err = client->clientMemoryForType( (UInt32) memory_type, &options, &memory ); if( memory && (kIOReturnSuccess == err)) { @@ -2667,6 +2759,7 @@ kern_return_t is_io_connect_add_client( CHECK( IOUserClient, connection, client ); CHECK( IOUserClient, connect_to, to ); + IOStatisticsClientCall(); return( client->connectClient( to ) ); } @@ -2693,10 +2786,10 @@ kern_return_t is_io_connect_method mach_msg_type_number_t inband_inputCnt, mach_vm_address_t ool_input, mach_vm_size_t ool_input_size, - io_scalar_inband64_t scalar_output, - mach_msg_type_number_t *scalar_outputCnt, io_struct_inband_t inband_output, mach_msg_type_number_t *inband_outputCnt, + io_scalar_inband64_t scalar_output, + mach_msg_type_number_t *scalar_outputCnt, mach_vm_address_t ool_output, mach_vm_size_t * ool_output_size ) @@ -2742,6 +2835,7 @@ kern_return_t is_io_connect_method args.structureOutputDescriptor = outputMD; args.structureOutputDescriptorSize = *ool_output_size; + IOStatisticsClientCall(); ret = client->externalMethod( selector, &args ); *scalar_outputCnt = args.scalarOutputCount; @@ -2770,10 +2864,10 @@ kern_return_t is_io_connect_async_method mach_msg_type_number_t inband_inputCnt, mach_vm_address_t ool_input, mach_vm_size_t ool_input_size, - io_scalar_inband64_t scalar_output, - mach_msg_type_number_t *scalar_outputCnt, io_struct_inband_t inband_output, mach_msg_type_number_t *inband_outputCnt, + io_scalar_inband64_t scalar_output, + mach_msg_type_number_t *scalar_outputCnt, mach_vm_address_t ool_output, mach_vm_size_t * ool_output_size ) @@ -2823,6 +2917,7 @@ kern_return_t is_io_connect_async_method args.structureOutputDescriptor = outputMD; args.structureOutputDescriptorSize = *ool_output_size; + IOStatisticsClientCall(); ret = client->externalMethod( selector, &args ); *inband_outputCnt = 
args.structureOutputSize; @@ -2860,8 +2955,8 @@ kern_return_t is_io_connect_method_scalarI_scalarO( _input, inputCount, NULL, 0, 0, 0, - _output, outputCount, NULL, &struct_outputCnt, + _output, outputCount, 0, &ool_output_size); for (i = 0; i < *outputCount; i++) @@ -2979,8 +3074,8 @@ kern_return_t is_io_async_method_scalarI_scalarO( _input, inputCount, NULL, 0, 0, 0, - _output, outputCount, NULL, &struct_outputCnt, + _output, outputCount, 0, &ool_output_size); for (i = 0; i < *outputCount; i++) @@ -3019,8 +3114,8 @@ kern_return_t is_io_async_method_scalarI_structureO( _input, inputCount, NULL, 0, 0, 0, - NULL, &scalar_outputCnt, output, outputCount, + NULL, &scalar_outputCnt, 0, &ool_output_size)); } @@ -3056,8 +3151,8 @@ kern_return_t is_io_async_method_scalarI_structureI( _input, inputCount, inputStruct, inputStructCount, 0, 0, - NULL, &scalar_outputCnt, NULL, &inband_outputCnt, + NULL, &scalar_outputCnt, 0, &ool_output_size)); } @@ -3087,8 +3182,8 @@ kern_return_t is_io_async_method_structureI_structureO( NULL, 0, input, inputCount, 0, 0, - NULL, &scalar_outputCnt, output, outputCount, + NULL, &scalar_outputCnt, 0, &ool_output_size)); } @@ -3207,8 +3302,8 @@ kern_return_t is_io_connect_method_scalarI_structureO( _input, inputCount, NULL, 0, 0, 0, - NULL, &scalar_outputCnt, output, outputCount, + NULL, &scalar_outputCnt, 0, &ool_output_size)); } @@ -3380,8 +3475,8 @@ kern_return_t is_io_connect_method_scalarI_structureI( _input, inputCount, inputStruct, inputStructCount, 0, 0, - NULL, &scalar_outputCnt, NULL, &inband_outputCnt, + NULL, &scalar_outputCnt, 0, &ool_output_size)); } @@ -3553,8 +3648,8 @@ kern_return_t is_io_connect_method_structureI_structureO( NULL, 0, input, inputCount, 0, 0, - NULL, &scalar_outputCnt, output, outputCount, + NULL, &scalar_outputCnt, 0, &ool_output_size)); } @@ -3772,6 +3867,23 @@ kern_return_t is_io_catalog_send_data( } switch ( flag ) { + case kIOCatalogResetDrivers: + case kIOCatalogResetDriversNoMatch: { + OSArray * array; + + 
array = OSDynamicCast(OSArray, obj); + if (array) { + if ( !gIOCatalogue->resetAndAddDrivers(array, + flag == kIOCatalogResetDrivers) ) { + + kr = kIOReturnError; + } + } else { + kr = kIOReturnBadArgument; + } + } + break; + case kIOCatalogAddDrivers: case kIOCatalogAddDriversNoMatch: { OSArray * array; @@ -3827,6 +3939,7 @@ kern_return_t is_io_catalog_send_data( case kIOCatalogKextdActive: #if !NO_KEXTD + IOServiceTrace(IOSERVICE_KEXTD_ALIVE, 0, 0, 0, 0); OSKext::setKextdActive(); /* Dump all nonloaded startup extensions; kextd will now send them @@ -3844,6 +3957,7 @@ kern_return_t is_io_catalog_send_data( if (!clearedBusy) { IOService * serviceRoot = IOService::getServiceRoot(); if (serviceRoot) { + IOServiceTrace(IOSERVICE_KEXTD_READY, 0, 0, 0, 0); serviceRoot->adjustBusy(-1); clearedBusy = true; } @@ -3881,6 +3995,7 @@ kern_return_t is_io_catalog_terminate( return( kr ); switch ( flag ) { +#if !defined(SECURE_KERNEL) case kIOCatalogServiceTerminate: OSIterator * iter; IOService * service; @@ -3910,6 +4025,7 @@ kern_return_t is_io_catalog_terminate( kr = gIOCatalogue->terminateDriversForModule(name, flag == kIOCatalogModuleUnload); break; +#endif default: kr = kIOReturnBadArgument; @@ -4048,6 +4164,8 @@ kern_return_t iokit_user_client_trap(struct iokit_user_client_trap_args *args) return result; } +} /* extern "C" */ + IOReturn IOUserClient::externalMethod( uint32_t selector, IOExternalMethodArguments * args, IOExternalMethodDispatch * dispatch, OSObject * target, void * reference ) { @@ -4113,10 +4231,8 @@ IOReturn IOUserClient::externalMethod( uint32_t selector, IOExternalMethodArgume if (kIOUCForegroundOnly & method->flags) { - bool isBg; - kern_return_t kr = IOUCIsBackgroundTask(current_task(), &isBg); - - if ((KERN_SUCCESS == kr) && isBg) + /* is graphics access denied for current task? 
*/ + if (proc_get_task_selfgpuacc_deny() != 0) return (kIOReturnNotPermitted); } @@ -4165,11 +4281,10 @@ IOReturn IOUserClient::externalMethod( uint32_t selector, IOExternalMethodArgume if (kIOUCForegroundOnly & method->flags) { - bool isBg; - kern_return_t kr = IOUCIsBackgroundTask(current_task(), &isBg); - - if ((KERN_SUCCESS == kr) && isBg) + /* is graphics access denied for current task? */ + if (proc_get_task_selfgpuacc_deny() != 0) return (kIOReturnNotPermitted); + } switch (method->flags & kIOUCTypeMask) @@ -4211,8 +4326,6 @@ IOReturn IOUserClient::externalMethod( uint32_t selector, IOExternalMethodArgume } -}; /* extern "C" */ - #if __LP64__ OSMetaClassDefineReservedUnused(IOUserClient, 0); OSMetaClassDefineReservedUnused(IOUserClient, 1); diff --git a/iokit/Kernel/IOWorkLoop.cpp b/iokit/Kernel/IOWorkLoop.cpp index c32a565f6..51045a234 100644 --- a/iokit/Kernel/IOWorkLoop.cpp +++ b/iokit/Kernel/IOWorkLoop.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2007 Apple Inc. All rights reserved. + * Copyright (c) 1998-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -56,31 +56,81 @@ OSMetaClassDefineReservedUnused(IOWorkLoop, 6); OSMetaClassDefineReservedUnused(IOWorkLoop, 7); enum IOWorkLoopState { kLoopRestart = 0x1, kLoopTerminate = 0x2 }; -#ifdef __ppc__ -static inline void SETP(void *addr, unsigned int flag) - { unsigned int *num = (unsigned int *) addr; *num |= flag; } -static inline void CLRP(void *addr, unsigned int flag) - { unsigned int *num = (unsigned int *) addr; *num &= ~flag; } -static inline bool ISSETP(void *addr, unsigned int flag) - { unsigned int *num = (unsigned int *) addr; return (*num & flag) != 0; } -#else static inline void SETP(void *addr, unsigned int flag) { unsigned char *num = (unsigned char *) addr; *num |= flag; } static inline void CLRP(void *addr, unsigned int flag) { unsigned char *num = (unsigned char *) addr; *num &= ~flag; } static inline bool ISSETP(void *addr, unsigned int flag) { unsigned char *num = (unsigned char *) addr; return (*num & flag) != 0; } -#endif #define fFlags loopRestart +#define passiveEventChain reserved->passiveEventChain + +#if IOKITSTATS + +#define IOStatisticsRegisterCounter() \ +do { \ + reserved->counter = IOStatistics::registerWorkLoop(this); \ +} while(0) + +#define IOStatisticsUnregisterCounter() \ +do { \ + if (reserved) \ + IOStatistics::unregisterWorkLoop(reserved->counter); \ +} while(0) + +#define IOStatisticsOpenGate() \ +do { \ + IOStatistics::countWorkLoopOpenGate(reserved->counter); \ +} while(0) + +#define IOStatisticsCloseGate() \ +do { \ + IOStatistics::countWorkLoopCloseGate(reserved->counter); \ +} while(0) + +#define IOStatisticsAttachEventSource() \ +do { \ + IOStatistics::attachWorkLoopEventSource(reserved->counter, inEvent->reserved->counter); \ +} while(0) + +#define IOStatisticsDetachEventSource() \ +do { \ + IOStatistics::detachWorkLoopEventSource(reserved->counter, inEvent->reserved->counter); \ +} while(0) + +#else + +#define IOStatisticsRegisterCounter() +#define 
IOStatisticsUnregisterCounter() +#define IOStatisticsOpenGate() +#define IOStatisticsCloseGate() +#define IOStatisticsAttachEventSource() +#define IOStatisticsDetachEventSource() + +#endif /* IOKITSTATS */ bool IOWorkLoop::init() { - // The super init and gateLock allocation MUST be done first + // The super init and gateLock allocation MUST be done first. if ( !super::init() ) return false; + // Allocate our ExpansionData if it hasn't been allocated already. + if ( !reserved ) + { + reserved = IONew(ExpansionData,1); + if ( !reserved ) + return false; + + bzero(reserved,sizeof(ExpansionData)); + } + +#if DEBUG + OSBacktrace ( reserved->allocationBacktrace, sizeof ( reserved->allocationBacktrace ) / sizeof ( reserved->allocationBacktrace[0] ) ); +#endif + if ( gateLock == NULL ) { if ( !( gateLock = IORecursiveLockAlloc()) ) return false; @@ -93,6 +143,13 @@ bool IOWorkLoop::init() workToDo = false; } + if (!reserved) { + reserved = IONew(ExpansionData, 1); + reserved->options = 0; + } + + IOStatisticsRegisterCounter(); + if ( controlG == NULL ) { controlG = IOCommandGate::commandGate( this, @@ -132,23 +189,24 @@ IOWorkLoop::workLoop() IOWorkLoop * IOWorkLoop::workLoopWithOptions(IOOptionBits options) { - IOWorkLoop *me = new IOWorkLoop; - - if (me && options) { - me->reserved = IONew(ExpansionData, 1); - if (!me->reserved) { - me->release(); - return 0; + IOWorkLoop *me = new IOWorkLoop; + + if (me && options) { + me->reserved = IONew(ExpansionData,1); + if (!me->reserved) { + me->release(); + return 0; + } + bzero(me->reserved,sizeof(ExpansionData)); + me->reserved->options = options; } - me->reserved->options = options; - } - - if (me && !me->init()) { - me->release(); - return 0; - } - - return me; + + if (me && !me->init()) { + me->release(); + return 0; + } + + return me; } // Free is called twice: @@ -187,6 +245,14 @@ void IOWorkLoop::free() } eventChain = 0; + for (event = passiveEventChain; event; event = next) { + next = event->getNext(); + 
event->setWorkLoop(0); + event->setNext(0); + event->release(); + } + passiveEventChain = 0; + // Either we have a partial initialization to clean up // or the workThread itself is performing hari-kari. // Either way clean up all of our resources and return. @@ -205,6 +271,9 @@ void IOWorkLoop::free() IORecursiveLockFree(gateLock); gateLock = 0; } + + IOStatisticsUnregisterCounter(); + if (reserved) { IODelete(reserved, ExpansionData, 1); reserved = 0; @@ -230,6 +299,9 @@ void IOWorkLoop::enableAllEventSources() const for (event = eventChain; event; event = event->getNext()) event->enable(); + + for (event = passiveEventChain; event; event = event->getNext()) + event->enable(); } void IOWorkLoop::disableAllEventSources() const @@ -237,6 +309,10 @@ void IOWorkLoop::disableAllEventSources() const IOEventSource *event; for (event = eventChain; event; event = event->getNext()) + event->disable(); + + /* NOTE: controlG is in passiveEventChain since it's an IOCommandGate */ + for (event = passiveEventChain; event; event = event->getNext()) if (event != controlG) // Don't disable the control gate event->disable(); } @@ -244,7 +320,7 @@ void IOWorkLoop::disableAllEventSources() const void IOWorkLoop::enableAllInterrupts() const { IOEventSource *event; - + for (event = eventChain; event; event = event->getNext()) if (OSDynamicCast(IOInterruptEventSource, event)) event->enable(); @@ -253,43 +329,12 @@ void IOWorkLoop::enableAllInterrupts() const void IOWorkLoop::disableAllInterrupts() const { IOEventSource *event; - + for (event = eventChain; event; event = event->getNext()) if (OSDynamicCast(IOInterruptEventSource, event)) event->disable(); } -#if KDEBUG -#define IOTimeClientS() \ -do { \ - IOTimeStampStart(IODBG_WORKLOOP(IOWL_CLIENT), \ - (unsigned int) this, (unsigned int) event); \ -} while(0) - -#define IOTimeClientE() \ -do { \ - IOTimeStampEnd(IODBG_WORKLOOP(IOWL_CLIENT), \ - (unsigned int) this, (unsigned int) event); \ -} while(0) - -#define IOTimeWorkS() \ -do { \ 
- IOTimeStampStart(IODBG_WORKLOOP(IOWL_WORK), (unsigned int) this); \ -} while(0) - -#define IOTimeWorkE() \ -do { \ - IOTimeStampEnd(IODBG_WORKLOOP(IOWL_WORK),(unsigned int) this); \ -} while(0) - -#else /* !KDEBUG */ - -#define IOTimeClientS() -#define IOTimeClientE() -#define IOTimeWorkS() -#define IOTimeWorkE() - -#endif /* KDEBUG */ /* virtual */ bool IOWorkLoop::runEventSources() { @@ -299,42 +344,43 @@ do { \ closeGate(); if (ISSETP(&fFlags, kLoopTerminate)) - goto abort; - + goto abort; + if (traceWL) IOTimeStampStartConstant(IODBG_WORKLOOP(IOWL_WORK), (uintptr_t) this); bool more; do { - CLRP(&fFlags, kLoopRestart); - more = false; - IOInterruptState is = IOSimpleLockLockDisableInterrupt(workToDoLock); - workToDo = false; - IOSimpleLockUnlockEnableInterrupt(workToDoLock, is); - for (IOEventSource *evnt = eventChain; evnt; evnt = evnt->getNext()) { - - if (traceES) - IOTimeStampStartConstant(IODBG_WORKLOOP(IOWL_CLIENT), (uintptr_t) this, (uintptr_t) evnt); + CLRP(&fFlags, kLoopRestart); + more = false; + IOInterruptState is = IOSimpleLockLockDisableInterrupt(workToDoLock); + workToDo = false; + IOSimpleLockUnlockEnableInterrupt(workToDoLock, is); + /* NOTE: only loop over event sources in eventChain. 
Bypass "passive" event sources for performance */ + for (IOEventSource *evnt = eventChain; evnt; evnt = evnt->getNext()) { - more |= evnt->checkForWork(); + if (traceES) + IOTimeStampStartConstant(IODBG_WORKLOOP(IOWL_CLIENT), (uintptr_t) this, (uintptr_t) evnt); - if (traceES) - IOTimeStampEndConstant(IODBG_WORKLOOP(IOWL_CLIENT), (uintptr_t) this, (uintptr_t) evnt); - - if (ISSETP(&fFlags, kLoopTerminate)) - goto abort; - else if (fFlags & kLoopRestart) { - more = true; - break; - } - } + more |= evnt->checkForWork(); + + if (traceES) + IOTimeStampEndConstant(IODBG_WORKLOOP(IOWL_CLIENT), (uintptr_t) this, (uintptr_t) evnt); + + if (ISSETP(&fFlags, kLoopTerminate)) + goto abort; + else if (fFlags & kLoopRestart) { + more = true; + break; + } + } } while (more); - + res = true; if (traceWL) IOTimeStampEndConstant(IODBG_WORKLOOP(IOWL_WORK), (uintptr_t) this); - + abort: openGate(); return res; @@ -403,27 +449,41 @@ void IOWorkLoop::signalWorkAvailable() void IOWorkLoop::openGate() { + IOStatisticsOpenGate(); IORecursiveLockUnlock(gateLock); } void IOWorkLoop::closeGate() { IORecursiveLockLock(gateLock); + IOStatisticsCloseGate(); } bool IOWorkLoop::tryCloseGate() { - return IORecursiveLockTryLock(gateLock) != 0; + bool res = (IORecursiveLockTryLock(gateLock) != 0); + if (res) { + IOStatisticsCloseGate(); + } + return res; } int IOWorkLoop::sleepGate(void *event, UInt32 interuptibleType) { - return IORecursiveLockSleep(gateLock, event, interuptibleType); + int res; + IOStatisticsOpenGate(); + res = IORecursiveLockSleep(gateLock, event, interuptibleType); + IOStatisticsCloseGate(); + return res; } int IOWorkLoop::sleepGate(void *event, AbsoluteTime deadline, UInt32 interuptibleType) { - return IORecursiveLockSleepDeadline(gateLock, event, deadline, interuptibleType); + int res; + IOStatisticsOpenGate(); + res = IORecursiveLockSleepDeadline(gateLock, event, deadline, interuptibleType); + IOStatisticsCloseGate(); + return res; } void IOWorkLoop::wakeupGate(void *event, 
bool oneThread) @@ -460,41 +520,82 @@ IOReturn IOWorkLoop::_maintRequest(void *inC, void *inD, void *, void *) inEvent->retain(); inEvent->setWorkLoop(this); inEvent->setNext(0); + + /* Check if this is a passive or active event source being added */ + if (eventSourcePerformsWork(inEvent)) { + + if (!eventChain) + eventChain = inEvent; + else { + IOEventSource *event, *next; - if (!eventChain) - eventChain = inEvent; + for (event = eventChain; (next = event->getNext()); event = next) + ; + event->setNext(inEvent); + + } + + } else { - IOEventSource *event, *next; + + if (!passiveEventChain) + passiveEventChain = inEvent; + else { + IOEventSource *event, *next; - for (event = eventChain; (next = event->getNext()); event = next) - ; - event->setNext(inEvent); + for (event = passiveEventChain; (next = event->getNext()); event = next) + ; + event->setNext(inEvent); + + } + } + IOStatisticsAttachEventSource(); } break; case mRemoveEvent: if (inEvent->getWorkLoop()) { - if (eventChain == inEvent) - eventChain = inEvent->getNext(); - else { - IOEventSource *event, *next; - - event = eventChain; - while ((next = event->getNext()) && next != inEvent) - event = next; - - if (!next) { - res = kIOReturnBadArgument; - break; - } - event->setNext(inEvent->getNext()); - } - + if (eventSourcePerformsWork(inEvent)) { + if (eventChain == inEvent) + eventChain = inEvent->getNext(); + else { + IOEventSource *event, *next; + + event = eventChain; + while ((next = event->getNext()) && next != inEvent) + event = next; + + if (!next) { + res = kIOReturnBadArgument; + break; + } + event->setNext(inEvent->getNext()); + } + } + else { + if (passiveEventChain == inEvent) + passiveEventChain = inEvent->getNext(); + else { + IOEventSource *event, *next; + + event = passiveEventChain; + while ((next = event->getNext()) && next != inEvent) + event = next; + + if (!next) { + res = kIOReturnBadArgument; + break; + } + event->setNext(inEvent->getNext()); + } + } + inEvent->setWorkLoop(0); 
inEvent->setNext(0); inEvent->release(); SETP(&fFlags, kLoopRestart); + IOStatisticsDetachEventSource(); } break; @@ -504,3 +605,40 @@ IOReturn IOWorkLoop::_maintRequest(void *inC, void *inD, void *, void *) return res; } + +bool +IOWorkLoop::eventSourcePerformsWork(IOEventSource *inEventSource) +{ + bool result = true; + + /* + * The idea here is to see if the subclass of IOEventSource has overridden checkForWork(). + * The assumption is that if you override checkForWork(), you need to be + * active and not passive. + * + * We picked a known quantity controlG that does not override + * IOEventSource::checkForWork(), namely the IOCommandGate associated with + * the workloop to which this event source is getting attached. + * + * We do a pointer comparison on the offset in the vtable for inNewEvent against + * the offset in the vtable for inReferenceEvent. This works because + * IOCommandGate's slot for checkForWork() has the address of + * IOEventSource::checkForWork() in it. + * + * Think of OSMemberFunctionCast yielding the value at the vtable offset for + * checkForWork() here. We're just testing to see if it's the same or not. 
+ * + */ + if (controlG) { + void * ptr1; + void * ptr2; + + ptr1 = OSMemberFunctionCast(void*, inEventSource, &IOEventSource::checkForWork); + ptr2 = OSMemberFunctionCast(void*, controlG, &IOEventSource::checkForWork); + + if (ptr1 == ptr2) + result = false; + } + + return result; +} diff --git a/iokit/Kernel/RootDomainUserClient.cpp b/iokit/Kernel/RootDomainUserClient.cpp index 69c0dfa1a..29c90deef 100644 --- a/iokit/Kernel/RootDomainUserClient.cpp +++ b/iokit/Kernel/RootDomainUserClient.cpp @@ -73,23 +73,14 @@ bool RootDomainUserClient::start( IOService * provider ) IOReturn RootDomainUserClient::secureSleepSystem( uint32_t *return_code ) { - IOByteCount return_code_size = 1; - - return secureSleepSystemOptions( NULL, // inOptions - (void *)return_code, // returnCode - (void *)0, // inSize - (void *)&return_code_size, // returnSize - NULL, NULL); + return secureSleepSystemOptions(NULL, 0, return_code); } IOReturn RootDomainUserClient::secureSleepSystemOptions( - void * p1, void * p2, void * p3, - void * p4, void * p5, void * p6 ) + const void *inOptions, + IOByteCount inOptionsSize __unused, + uint32_t *returnCode) { - void *inOptions = (void *)p1; - uint32_t *returnCode = (uint32_t *)p2; -// IOByteCount inOptionsSize = (uintptr_t)p3; - IOByteCount *returnCodeSize = (IOByteCount *)p4; int local_priv = 0; int admin_priv = 0; @@ -103,7 +94,6 @@ IOReturn RootDomainUserClient::secureSleepSystemOptions( ret = clientHasPrivilege(fOwningTask, kIOClientPrivilegeAdministrator); admin_priv = (kIOReturnSuccess == ret); - *returnCodeSize = sizeof(uint32_t); if (inOptions) { @@ -159,43 +149,32 @@ IOReturn RootDomainUserClient::secureSetAggressiveness( if((local_priv || admin_priv) && fOwner) { *return_code = fOwner->setAggressiveness(type, newLevel); - return kIOReturnSuccess; } else { *return_code = kIOReturnNotPrivileged; - return kIOReturnSuccess; } + return kIOReturnSuccess; } -IOReturn RootDomainUserClient::secureSetMaintenanceWakeCalendar( - void * p1, void * p2, void 
* p3, - void * p4, void * p5, void * p6 ) +IOReturn RootDomainUserClient::secureSetMaintenanceWakeCalendar( + IOPMCalendarStruct *inCalendar, + uint32_t *returnCode) { -#if ROOT_DOMAIN_RUN_STATES - IOPMCalendarStruct * inCalendar = (IOPMCalendarStruct *) p1; - uint32_t * returnCode = (uint32_t *) p2; - IOByteCount * returnCodeSize = (IOByteCount *) p4; int admin_priv = 0; IOReturn ret = kIOReturnNotPrivileged; ret = clientHasPrivilege(fOwningTask, kIOClientPrivilegeAdministrator); admin_priv = (kIOReturnSuccess == ret); - *returnCodeSize = sizeof(uint32_t); - if (admin_priv && fOwner) { *returnCode = fOwner->setMaintenanceWakeCalendar(inCalendar); - return kIOReturnSuccess; } else { *returnCode = kIOReturnNotPrivileged; - return kIOReturnSuccess; } -#else - return kIOReturnUnsupported; -#endif + return kIOReturnSuccess; } IOReturn RootDomainUserClient::secureSetUserAssertionLevels( - uint32_t assertBits ) + uint32_t assertionBitfield) { int admin_priv = 0; IOReturn ret = kIOReturnNotPrivileged; @@ -204,7 +183,7 @@ IOReturn RootDomainUserClient::secureSetUserAssertionLevels( admin_priv = (kIOReturnSuccess == ret); if (admin_priv && fOwner) { - ret = fOwner->setPMAssertionUserLevels(assertBits); + ret = fOwner->setPMAssertionUserLevels(assertionBitfield); } else { ret = kIOReturnNotPrivileged; } @@ -223,103 +202,185 @@ IOReturn RootDomainUserClient::clientClose( void ) return kIOReturnSuccess; } -IOExternalMethod * -RootDomainUserClient::getTargetAndMethodForIndex( IOService ** targetP, UInt32 index ) +IOReturn RootDomainUserClient::clientMemoryForType( + UInt32 type, + IOOptionBits *options, + IOMemoryDescriptor ** memory) { - static const IOExternalMethod sMethods[] = { - { // kPMSetAggressiveness, 0 - (IOService *)1, (IOMethod)&RootDomainUserClient::secureSetAggressiveness, kIOUCScalarIScalarO, 2, 1 - }, - { // kPMGetAggressiveness, 1 - 0, (IOMethod)&IOPMrootDomain::getAggressiveness, kIOUCScalarIScalarO, 1, 1 - }, - { // kPMSleepSystem, 2 - (IOService *)1, 
(IOMethod)&RootDomainUserClient::secureSleepSystem, kIOUCScalarIScalarO, 0, 1 - }, - { // kPMAllowPowerChange, 3 - 0, (IOMethod)&IOPMrootDomain::allowPowerChange, kIOUCScalarIScalarO, 1, 0 - }, - { // kPMCancelPowerChange, 4 - 0, (IOMethod)&IOPMrootDomain::cancelPowerChange, kIOUCScalarIScalarO, 1, 0 - }, - { // kPMShutdownSystem, 5 - 0, (IOMethod)&IOPMrootDomain::shutdownSystem, kIOUCScalarIScalarO, 0, 0 - }, - { // kPMRestartSystem, 6 - 0, (IOMethod)&IOPMrootDomain::restartSystem, kIOUCScalarIScalarO, 0, 0 - }, - { // kPMSleepSystemOptions, 7 - (IOService *)1, (IOMethod)&RootDomainUserClient::secureSleepSystemOptions, - kIOUCStructIStructO, kIOUCVariableStructureSize, sizeof(uint32_t) - }, - { // kPMSetMaintenanceWakeCalendar, 8 - (IOService *)1, (IOMethod)&RootDomainUserClient::secureSetMaintenanceWakeCalendar, - kIOUCStructIStructO, sizeof(IOPMCalendarStruct), sizeof(uint32_t) - }, - { // kPMSetUserAssertionLevels, 9 - (IOService *)1, (IOMethod)&RootDomainUserClient::secureSetUserAssertionLevels, - kIOUCScalarIScalarO, 1, 0 + if (!fOwner) + return kIOReturnNotReady; + + if (kPMRootDomainMapTraceBuffer == type) + { + *memory = fOwner->getPMTraceMemoryDescriptor(); + if (*memory) { + (*memory)->retain(); + *options = 0; + return kIOReturnSuccess; + } else { + return kIOReturnNotFound; } - }; - - if(index >= kNumPMMethods) - return NULL; - else { - if (sMethods[index].object) - *targetP = this; - else - *targetP = fOwner; - - return (IOExternalMethod *)&sMethods[index]; + } + return kIOReturnUnsupported; } -#if 0 -IOReturn RootDomainUserClient::externalMethod( uint32_t selector, IOExternalMethodArguments * args, - IOExternalMethodDispatch * dispatch, OSObject * target, void * reference ) +IOReturn RootDomainUserClient::externalMethod( + uint32_t selector, + IOExternalMethodArguments * arguments, + IOExternalMethodDispatch * dispatch __unused, + OSObject * target __unused, + void * reference __unused ) { - static const IOExternalMethodDispatch sMethods[] = { - { // 
kPMSetAggressiveness, 0 - (IOService *)1, (IOMethod)&RootDomainUserClient::secureSetAggressiveness, kIOUCScalarIScalarO, 2, 1 - }, - { // kPMGetAggressiveness, 1 - 0, (IOMethod)&IOPMrootDomain::getAggressiveness, kIOUCScalarIScalarO, 1, 1 - }, - { // kPMSleepSystem, 2 - (IOService *)1, (IOMethod)&RootDomainUserClient::secureSleepSystem, kIOUCScalarIScalarO, 0, 1 - }, - { // kPMAllowPowerChange, 3 - 0, (IOMethod)&IOPMrootDomain::allowPowerChange, kIOUCScalarIScalarO, 1, 0 - }, - { // kPMCancelPowerChange, 4 - 0, (IOMethod)&IOPMrootDomain::cancelPowerChange, kIOUCScalarIScalarO, 1, 0 - }, - { // kPMShutdownSystem, 5 - 0, (IOMethod)&IOPMrootDomain::shutdownSystem, kIOUCScalarIScalarO, 0, 0 - }, - { // kPMRestartSystem, 6 - 0, (IOMethod)&IOPMrootDomain::restartSystem, kIOUCScalarIScalarO, 0, 0 - }, - { // kPMSetPreventative, 7 - (IOService *)1, (IOMethod)&RootDomainUserClient::setPreventative, kIOUCScalarIScalarO, 2, 0 - }, - }; - - if (selector > (sizeof(sMethods) / sizeof(sMethods[0]))) - return (kIOReturnBadArgument); - - if ((1 << selector) & ((1 << 0) | (1 << 7)) - target = this; - else - target = fOwner; - - return (super::externalMethod(selector, args, &sMethods[selector], target, 0)); + IOReturn ret = kIOReturnBadArgument; + + switch (selector) + { + case kPMSetAggressiveness: + if ((2 == arguments->scalarInputCount) + && (1 == arguments->scalarOutputCount)) + { + ret = this->secureSetAggressiveness( + (unsigned long)arguments->scalarInput[0], + (unsigned long)arguments->scalarInput[1], + (int *)&arguments->scalarOutput[0]); + } + break; + + case kPMGetAggressiveness: + if ((1 == arguments->scalarInputCount) + && (1 == arguments->scalarOutputCount)) + { + ret = fOwner->getAggressiveness( + (unsigned long)arguments->scalarInput[0], + (unsigned long *)&arguments->scalarOutput[0]); + } + break; + + case kPMSleepSystem: + if (1 == arguments->scalarOutputCount) + { + ret = this->secureSleepSystem( + (uint32_t *)&arguments->scalarOutput[0]); + } + break; + + case 
kPMAllowPowerChange: + if (1 == arguments->scalarInputCount) + { + ret = fOwner->allowPowerChange( + arguments->scalarInput[0]); + } + break; + + case kPMCancelPowerChange: + if (1 == arguments->scalarInputCount) + { + ret = fOwner->cancelPowerChange( + arguments->scalarInput[0]); + } + break; + + case kPMShutdownSystem: + // deperecated interface + ret = kIOReturnUnsupported; + break; + + case kPMRestartSystem: + // deperecated interface + ret = kIOReturnUnsupported; + break; + + case kPMSleepSystemOptions: + ret = this->secureSleepSystemOptions( + arguments->structureInput, + arguments->structureInputSize, + (uint32_t *)&arguments->scalarOutput[0]); + break; + case kPMSetMaintenanceWakeCalendar: + ret = this->secureSetMaintenanceWakeCalendar( + (IOPMCalendarStruct *)arguments->structureInput, + (uint32_t *)&arguments->structureOutput); + arguments->structureOutputSize = sizeof(uint32_t); + break; + + case kPMSetUserAssertionLevels: + ret = this->secureSetUserAssertionLevels( + (uint32_t)arguments->scalarInput[0]); + break; + +/* + case kPMMethodCopySystemTimeline: + // intentional fallthrough + case kPMMethodCopyDetailedTimeline: + + if (!arguments->structureOutputDescriptor) + { + // TODO: Force IOKit.framework to always send this data out + // of line; so I don't have to create a MemoryDescriptor here. 
+ mem_size = arguments->structureOutputSize; + mem = IOMemoryDescriptor::withAddressRange( + (mach_vm_address_t)arguments->structureOutput, + (mach_vm_size_t)mem_size, + kIODirectionIn, current_task()); + } else { + mem_size = arguments->structureOutputDescriptorSize; + if (( mem = arguments->structureOutputDescriptor )) + mem->retain(); + } + + if (mem) + { + mem->prepare(kIODirectionNone); + + if (kPMMethodCopySystemTimeline == selector) { + arguments->scalarOutput[0] = fOwner->copySystemTimeline( + mem, &mem_size); + } + else + if (kPMMethodCopyDetailedTimeline == selector) { + arguments->scalarOutput[0] = fOwner->copyDetailedTimeline( + mem, &mem_size); + } + + if (arguments->structureOutputDescriptor) { + arguments->structureOutputDescriptorSize = mem_size; + } else { + arguments->structureOutputSize = mem_size; + } + + mem->release(); + + ret = kIOReturnSuccess; + } else { + ret = kIOReturnCannotWire; + } + + break; +*/ + default: + // bad selector + return kIOReturnBadArgument; + } + + return ret; } -#endif -void -RootDomainUserClient::setPreventative(UInt32 on_off, UInt32 types_of_sleep) +/* getTargetAndMethodForIndex + * Not used. We prefer to use externalMethod() for user client invocations. + * We maintain getTargetAndExternalMethod since it's an exported symbol, + * and only for that reason. + */ +IOExternalMethod * RootDomainUserClient::getTargetAndMethodForIndex( + IOService ** targetP, UInt32 index ) { - return; + // DO NOT EDIT + return super::getTargetAndMethodForIndex(targetP, index); } +/* setPreventative + * Does nothing. Exists only for exported symbol compatibility. 
+ */ +void +RootDomainUserClient::setPreventative(UInt32 on_off, UInt32 types_of_sleep) +{ return; } // DO NOT EDIT diff --git a/iokit/Kernel/RootDomainUserClient.h b/iokit/Kernel/RootDomainUserClient.h index 4a277749c..9e6be4003 100644 --- a/iokit/Kernel/RootDomainUserClient.h +++ b/iokit/Kernel/RootDomainUserClient.h @@ -46,39 +46,50 @@ class RootDomainUserClient : public IOUserClient { OSDeclareDefaultStructors(RootDomainUserClient) + friend class IOPMrootDomain; private: IOPMrootDomain * fOwner; task_t fOwningTask; IOReturn secureSleepSystem( uint32_t *return_code ); - IOReturn secureSleepSystemOptions( void *p1, void *p2, void *p3, - void *p4, void *p5, void *p6 ); + IOReturn secureSleepSystemOptions( const void *inOptions, + IOByteCount inOptionsSize, + uint32_t *returnCode); IOReturn secureSetAggressiveness( unsigned long type, unsigned long newLevel, int *return_code ); IOReturn secureSetMaintenanceWakeCalendar( - void * p1, void * p2, void * p3, - void * p4, void * p5, void * p6 ); - - IOReturn secureSetUserAssertionLevels( - uint32_t assertBits ); + IOPMCalendarStruct *inCalendar, + uint32_t *returnCode); + + IOReturn secureSetUserAssertionLevels(uint32_t assertionBitfield); public: virtual IOReturn clientClose( void ); - virtual IOExternalMethod * getTargetAndMethodForIndex( IOService ** targetP, UInt32 index ); + virtual IOReturn clientMemoryForType( UInt32 type, IOOptionBits *options, IOMemoryDescriptor **memory); + + virtual IOReturn externalMethod( uint32_t selector, + IOExternalMethodArguments * arguments, + IOExternalMethodDispatch * dispatch, + OSObject * target, + void * reference ); virtual bool start( IOService * provider ); virtual bool initWithTask(task_t owningTask, void *security_id, UInt32 type, OSDictionary * properties); + // Unused - retained for symbol compatibility void setPreventative(UInt32 on_off, UInt32 types_of_sleep); + // Unused - retained for symbol compatibility + virtual IOExternalMethod * getTargetAndMethodForIndex( 
IOService ** targetP, UInt32 index ); + }; #endif /* ! _IOKIT_ROOTDOMAINUSERCLIENT_H */ diff --git a/iokit/Kernel/i386/IOKeyStoreHelper.cpp b/iokit/Kernel/i386/IOKeyStoreHelper.cpp new file mode 100644 index 000000000..fb09d9c2e --- /dev/null +++ b/iokit/Kernel/i386/IOKeyStoreHelper.cpp @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include + +#include +#include +#include +#include +#include + +__BEGIN_DECLS + +#include + +static volatile UInt32 alreadyFetched = 0; +static IOMemoryDescriptor * newData; + +IOMemoryDescriptor * +IOGetBootKeyStoreData(void); +void +IOSetKeyStoreData(IOMemoryDescriptor * data); + +__END_DECLS + +#if 1 +#define DEBG(fmt, args...) 
{ kprintf(fmt, ## args); } +#else +#define DEBG(fmt, args...) {} +#endif + +void +IOSetKeyStoreData(IOMemoryDescriptor * data) +{ + newData = data; + alreadyFetched = 0; +} + +IOMemoryDescriptor * +IOGetBootKeyStoreData(void) +{ + IOMemoryDescriptor *memoryDescriptor; + boot_args *args = (boot_args *)PE_state.bootArgs; + IOOptionBits options; + IOAddressRange ranges; + + if (!OSCompareAndSwap(0, 1, &alreadyFetched)) + return (NULL); + + if (newData) + { + IOMemoryDescriptor * data = newData; + newData = NULL; + return (data); + } + + DEBG("%s: data at address %u size %u\n", __func__, + args->keyStoreDataStart, + args->keyStoreDataSize); + + if (args->keyStoreDataStart == 0) + return (NULL); + + ranges.address = args->keyStoreDataStart; + ranges.length = args->keyStoreDataSize; + + options = kIODirectionInOut | kIOMemoryTypePhysical64; + + memoryDescriptor = IOMemoryDescriptor::withOptions(&ranges, + 1, + 0, + NULL, + options); + + DEBG("%s: memory descriptor %p\n", __func__, memoryDescriptor); + + return memoryDescriptor; +} diff --git a/iokit/Kernel/i386/IOSharedLock.s b/iokit/Kernel/i386/IOSharedLock.s index 69183e016..9360dce09 100644 --- a/iokit/Kernel/i386/IOSharedLock.s +++ b/iokit/Kernel/i386/IOSharedLock.s @@ -1,19 +1,14 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2010 Apple Computer, Inc. All rights reserved. * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * @APPLE_LICENSE_HEADER_START@ * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER @@ -23,7 +18,47 @@ * Please see the License for the specific language governing rights and * limitations under the License. * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + * @APPLE_LICENSE_HEADER_END@ + */ + +#include + + TEXT + +/* + * void + * OSSpinLockUnlock(p) + * int *p; + * + * Unlock the lock pointed to by p. + */ + +LEAF(_OSSpinLockUnlock, 0) +LEAF(_IOSpinUnlock, 0) +LEAF(_ev_unlock, 0) + movl 4(%esp), %ecx + movl $0, (%ecx) +END(_OSSpinLockUnlock) + + +/* + * int + * OSSpinLockTry(p) + * int *p; + * + * Try to lock p. Return zero if not successful. */ -#include +LEAF(_OSSpinLockTry, 0) +LEAF(_IOTrySpinLock, 0) +LEAF(_ev_try_lock, 0) + movl 4(%esp), %ecx + xorl %eax, %eax + lock + cmpxchgl %ecx, (%ecx) + jne 1f + movl $1, %eax /* yes */ + ret +1: + xorl %eax, %eax /* no */ +END(_OSSpinLockTry) diff --git a/iokit/Kernel/ppc/IOAsmSupport.s b/iokit/Kernel/ppc/IOAsmSupport.s deleted file mode 100644 index 56e068cc2..000000000 --- a/iokit/Kernel/ppc/IOAsmSupport.s +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - /* - * Copyright (c) 1997-1998 Apple Computer, Inc. - * - * - * HISTORY - * - * sdouglas 22 Oct 97 - first checked in from DriverServices - * sdouglas 28 Jul 98 - start IOKit - */ - -#include - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; -; ENTRY functionName -; -; Assembly directives to begin an exported function. 
-; -; Takes: functionName - name of the exported function -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -.macro ENTRY - .text - .align 2 - .globl $0 -$0: -.endmacro - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -/* - -OSStatus SynchronizeIO( void ) - -*/ - - ENTRY __eSynchronizeIO - - li r0, 0 - eieio - li r3, 0 - blr - -/* - -OSStatus CallTVector_NoRecover( - void * p1, void * p2, void * p3, void * p4, void * p5, void * p6, // r3-8 - LogicalAddress entry ) // r9 - -*/ - -#define PARAM_SIZE 24 -#define FM_SIZE 64 -#define FM_LR_SAVE 8 -#define FM_TOC_SAVE 20 - - ENTRY _CallTVector - -#if 1 - stw r2, FM_TOC_SAVE(r1) - lwz r0, 0(r9) - lwz r2, 4(r9) - mtspr ctr, r0 - bctr - -#else - mflr r0 - stw r0, FM_LR_SAVE(r1) - stw r2, FM_TOC_SAVE(r1) - - stwu r1, -(PARAM_SIZE+FM_SIZE)(r1) - - lwz r2, 4(r9) - lwz r0, 0(r9) - mtspr lr, r0 - mfspr r12, lr - blrl - - addi r1, r1,(PARAM_SIZE+FM_SIZE) - lwz r2, FM_TOC_SAVE(r1) - lwz r0, FM_LR_SAVE(r1) - mtlr r0 - blr -#endif - -/* - * Seemingly unused references from cpp statically initialized objects. - */ - -.globl .constructors_used -.constructors_used = 0 -.globl .destructors_used -.destructors_used = 0 diff --git a/iokit/Kernel/ppc/IODBDMA.cpp b/iokit/Kernel/ppc/IODBDMA.cpp deleted file mode 100644 index f706e2809..000000000 --- a/iokit/Kernel/ppc/IODBDMA.cpp +++ /dev/null @@ -1,161 +0,0 @@ -/* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1997 Apple Computer, Inc. 
- * - * - * HISTORY - * - * Simon Douglas 10 Nov 97 - * - first checked in, mostly from machdep/ppc/dbdma.c - * - */ - - -#include -#include - -void -IODBDMAStart( volatile IODBDMAChannelRegisters *registers, volatile IODBDMADescriptor *physicalDescPtr) -{ - - if( ((int) physicalDescPtr) & 0xf) - panic("IODBDMAStart: unaligned IODBDMADescriptor"); - - eieio(); - IOSetDBDMAInterruptSelect(registers, 0xff000000); // clear out interrupts - - IOSetDBDMAChannelControl( registers, - IOClearDBDMAChannelControlBits( kdbdmaRun | kdbdmaPause | kdbdmaFlush | kdbdmaWake | kdbdmaDead | kdbdmaActive )); - - while( IOGetDBDMAChannelStatus( registers) & kdbdmaActive) - eieio(); - - IOSetDBDMACommandPtr( registers, (unsigned int) physicalDescPtr); - - IOSetDBDMAChannelControl( registers, - IOSetDBDMAChannelControlBits( kdbdmaRun | kdbdmaWake )); - -} - -void -IODBDMAStop( volatile IODBDMAChannelRegisters *registers) -{ - - IOSetDBDMAChannelControl( registers, - IOClearDBDMAChannelControlBits( kdbdmaRun ) - | IOSetDBDMAChannelControlBits( kdbdmaFlush )); - - while( IOGetDBDMAChannelStatus( registers) & ( kdbdmaActive | kdbdmaFlush)) - eieio(); - -} - -void -IODBDMAFlush( volatile IODBDMAChannelRegisters *registers) -{ - - IOSetDBDMAChannelControl( registers, - IOSetDBDMAChannelControlBits( kdbdmaFlush )); - - while( IOGetDBDMAChannelStatus( registers) & kdbdmaFlush) - eieio(); - -} - -void -IODBDMAReset( volatile IODBDMAChannelRegisters *registers) -{ - - IOSetDBDMAChannelControl( registers, - IOClearDBDMAChannelControlBits( kdbdmaRun | kdbdmaPause | kdbdmaFlush | kdbdmaWake | kdbdmaDead | kdbdmaActive )); - - while( IOGetDBDMAChannelStatus( registers) & kdbdmaActive) - eieio(); - -} - -void -IODBDMAContinue( volatile IODBDMAChannelRegisters *registers) -{ - - IOSetDBDMAChannelControl( registers, - IOClearDBDMAChannelControlBits( kdbdmaPause | kdbdmaDead ) - | IOSetDBDMAChannelControlBits( kdbdmaRun | kdbdmaWake )); - -} - -void -IODBDMAPause( volatile IODBDMAChannelRegisters 
*registers) -{ - - IOSetDBDMAChannelControl( registers, - IOSetDBDMAChannelControlBits( kdbdmaPause )); - - while( IOGetDBDMAChannelStatus( registers) & kdbdmaActive) - eieio(); - -} - -IOReturn -IOAllocatePhysicallyContiguousMemory( - unsigned int /* size */, unsigned int /* options */, - IOVirtualAddress * /* logical */, - IOPhysicalAddress * /* physical */ ) -{ -#if 0 - IOReturn err; - vm_offset_t mem; - - if( (size > 4096) || (options)) - return( kIOReturnUnsupported); - - mem = (vm_offset_t) IOMalloc( size); - *logical = (IOVirtualAddress) mem; - - if( mem) { - err = IOPhysicalFromVirtual( IOVmTaskSelf(), mem, (vm_offset_t *) physical); - if( err) - IOFree( (char *)mem, size); - - } else { - err = kIOReturnNoMemory; - *physical = 0; - } - - return( err); -#endif /* 0 */ - return (kIOReturnUnsupported); -} - -IOReturn -IOFreePhysicallyContiguousMemory( IOVirtualAddress * logical, unsigned int size) -{ - IOFree( logical, size); - return( kIOReturnSuccess); -} diff --git a/iokit/Kernel/x86_64/IOSharedLock.s b/iokit/Kernel/x86_64/IOSharedLock.s index 69183e016..d5e5ecabc 100644 --- a/iokit/Kernel/x86_64/IOSharedLock.s +++ b/iokit/Kernel/x86_64/IOSharedLock.s @@ -1,19 +1,14 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2010 Apple Computer, Inc. All rights reserved. * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * @APPLE_LICENSE_HEADER_START@ * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. 
- * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER @@ -23,7 +18,43 @@ * Please see the License for the specific language governing rights and * limitations under the License. * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + * @APPLE_LICENSE_HEADER_END@ + */ + +#include + + TEXT + +/* + * void + * OSSpinLockUnlock(p) + * int *p; + * + * Unlock the lock pointed to by p. + */ + +LEAF(_OSSpinLockUnlock, 0) +LEAF(_IOSpinUnlock, 0) +LEAF(_ev_unlock, 0) + movl $0, (%rdi) +END(_OSSpinLockUnlock) + + +/* + * int + * OSSpinLockTry(p) + * int *p; + * + * Try to lock p. Return zero if not successful. */ -#include +LEAF(_OSSpinLockTry, 0) +LEAF(_IOTrySpinLock, 0) +LEAF(_ev_try_lock, 0) + xorl %eax, %eax + orl $-1, %edx + lock + cmpxchgl %edx, (%rdi) + setz %dl + movzbl %dl, %eax +END(_OSSpinLockTry) diff --git a/iokit/KernelConfigTables.cpp b/iokit/KernelConfigTables.cpp index b8c590d06..ee06e47e5 100644 --- a/iokit/KernelConfigTables.cpp +++ b/iokit/KernelConfigTables.cpp @@ -37,35 +37,15 @@ const char * gIOKernelConfigTables = " {" " 'IOClass' = IOPanicPlatform;" " 'IOProviderClass' = IOPlatformExpertDevice;" -" 'IOProbeScore' = '-1';" +" 'IOProbeScore' = 0:32;" " }" -#ifdef PPC -" ," -" {" -" 'IOClass' = AppleCPU;" -" 'IOProviderClass' = IOPlatformDevice;" -" 'IONameMatch' = 'cpu';" -" 'IOProbeScore' = 100:32;" -" }," -" {" -" 'IOClass' = AppleNMI;" -" 'IOProviderClass' = AppleMacIODevice;" -" 'IONameMatch' = 'programmer-switch';" -" }," -" {" -" 'IOClass' = AppleNVRAM;" -" 'IOProviderClass' = AppleMacIODevice;" -" 'IONameMatch' = nvram;" -" }" -#endif /* PPC */ ")"; - /* This stuff is no longer used at all but 
was exported in prior * releases, so we'll keep them around for PPC/i386 only. * See libkern's OSKext.cpp for other symbols, which have been moved * there for sanity. */ -#if __ppc__ || __i386__ +#if __i386__ const char * gIOKernelKmods = ""; -#endif /* __ppc__ || __i386__ */ +#endif /* __i386__ */ diff --git a/iokit/Makefile b/iokit/Makefile index fee3c6fe1..498a2540a 100644 --- a/iokit/Makefile +++ b/iokit/Makefile @@ -8,20 +8,17 @@ include $(MakeInc_cmd) include $(MakeInc_def) INSTINC_SUBDIRS = IOKit -INSTINC_SUBDIRS_PPC = ${INSTINC_SUBDIRS} INSTINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS} INSTINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS} INSTINC_SUBDIRS_ARM = ${INSTINC_SUBDIRS} - EXPINC_SUBDIRS = IOKit -EXPINC_SUBDIRS_PPC = ${EXPINC_SUBDIRS} EXPINC_SUBDIRS_I386 = ${EXPINC_SUBDIRS} EXPINC_SUBDIRS_X86_64 = ${EXPINC_SUBDIRS} EXPINC_SUBDIRS_ARM = ${EXPINC_SUBDIRS} -SETUP_SUBDIRS = conf +SETUP_SUBDIRS = COMP_SUBDIRS = conf diff --git a/iokit/bsddev/DINetBootHook.cpp b/iokit/bsddev/DINetBootHook.cpp index ebf591eb0..8e83da5c1 100644 --- a/iokit/bsddev/DINetBootHook.cpp +++ b/iokit/bsddev/DINetBootHook.cpp @@ -95,6 +95,40 @@ #define kDIRootImageResultKey "di-root-image-result" #define kDIRootImageDevNameKey "di-root-image-devname" #define kDIRootImageDevTKey "di-root-image-devt" +#define kDIRootRamFileKey "di-root-ram-file" + +static IOService * +di_load_controller( void ) +{ + OSIterator * controllerIterator = 0; + OSDictionary * matchDictionary = 0; + IOService * controller = 0; + + do { + IOService::getResourceService()->publishResource("com.apple.AppleDiskImageController.load", kOSBooleanTrue); + IOService::getResourceService()->waitQuiet(); + + // first find IOHDIXController + matchDictionary = IOService::serviceMatching(kIOHDIXControllerClassName); + if (!matchDictionary) + break; + + controllerIterator = IOService::getMatchingServices(matchDictionary); + if (!controllerIterator) + break; + + controller = OSDynamicCast(IOService, controllerIterator->getNextObject()); + if 
(!controller) + break; + + controller->retain(); + } while (false); + + if (matchDictionary) matchDictionary->release(); + if (controllerIterator) controllerIterator->release(); + + return controller; +} extern "C" { /* @@ -108,8 +142,6 @@ extern "C" { int di_root_image(const char *path, char devname[], dev_t *dev_p) { IOReturn res = 0; - OSIterator * controllerIterator = 0; - OSDictionary * matchDictionary = 0; IOService * controller = 0; OSString * pathString = 0; OSNumber * myResult = 0; @@ -124,24 +156,7 @@ int di_root_image(const char *path, char devname[], dev_t *dev_p) if (!devname) return kIOReturnBadArgument; if (!dev_p) return kIOReturnBadArgument; - (void)IOService::getResourceService()->publishResource("com.apple.AppleDiskImageController.load", kOSBooleanTrue); - IOService::getResourceService()->waitQuiet(); - - // first find IOHDIXController - matchDictionary = IOService::serviceMatching(kIOHDIXControllerClassName); - if (!matchDictionary) { - res = kIOReturnNoMemory; - goto serviceMatching_FAILED; - } - - controllerIterator = IOService::getMatchingServices(matchDictionary); - if (!controllerIterator) { - res = kIOReturnNoMemory; - goto getMatchingServices_FAILED; - } - - // use the "setProperty" method of IOHDIXController to trigger the desired behaviour - controller = OSDynamicCast(IOService, controllerIterator->getNextObject()); + controller = di_load_controller(); if (!controller) { res = kIOReturnNotFound; goto NoIOHDIXController; @@ -191,16 +206,85 @@ int di_root_image(const char *path, char devname[], dev_t *dev_p) di_root_image_FAILED: CannotCreatePathOSString: -serviceMatching_FAILED: NoIOHDIXController: -getMatchingServices_FAILED: // clean up memory allocations if (pathString) pathString->release(); - if (matchDictionary) matchDictionary->release(); - if (controllerIterator) controllerIterator->release(); + if (controller) controller->release(); return res; } +void di_root_ramfile( IORegistryEntry * entry ) +{ + OSData * data; + 
IOMemoryDescriptor * mem; + uint64_t dmgSize; + uint64_t remain, length; + OSData * extentData = 0; + IOAddressRange * extentList; + uint64_t extentSize; + uint32_t extentCount; + + do { + data = OSDynamicCast(OSData, entry->getProperty("boot-ramdmg-size")); + if (!data || (data->getLength() != sizeof(uint64_t))) + break; // bad disk image size + + dmgSize = *(uint64_t *) data->getBytesNoCopy(); + if (!dmgSize) + break; + + data = OSDynamicCast(OSData, entry->getProperty("boot-ramdmg-extents")); + if (!data || (data->getLength() == 0) || + ((data->getLength() & (sizeof(IOAddressRange)-1)) != 0)) + break; // bad extents + + // make modifications to local copy + extentData = OSData::withData(data); + assert(extentData); + + extentList = (IOAddressRange *) extentData->getBytesNoCopy(); + extentCount = extentData->getLength() / sizeof(IOAddressRange); + extentSize = 0; + remain = dmgSize; + + // truncate extent length to enclosing disk image + for (uint32_t i = 0; i < extentCount; i++) + { + length = extentList[i].length; + if (!length) break; + + extentSize += length; + if (length >= remain) + { + extentList[i].length = remain; + extentCount = i + 1; + break; + } + remain -= length; + } + if (extentSize < dmgSize) + break; // not enough extent bytes for enclosing disk image + + mem = IOMemoryDescriptor::withAddressRanges( + extentList, extentCount, + kIODirectionOut | kIOMemoryMapperNone, NULL); + + if (mem) + { + IOService * controller = di_load_controller(); + if (controller) + { + controller->setProperty(kDIRootRamFileKey, mem); + controller->release(); + } + mem->release(); + } + } while (false); + + if (extentData) + extentData->release(); +} + }; diff --git a/iokit/bsddev/IOKitBSDInit.cpp b/iokit/bsddev/IOKitBSDInit.cpp index 6568afd57..feffd1a9e 100644 --- a/iokit/bsddev/IOKitBSDInit.cpp +++ b/iokit/bsddev/IOKitBSDInit.cpp @@ -46,9 +46,10 @@ extern "C" { #define ROOTDEVICETIMEOUT 60 #endif -extern dev_t mdevadd(int devid, ppnum_t base, unsigned int size, int 
phys); +extern dev_t mdevadd(int devid, uint64_t base, unsigned int size, int phys); extern dev_t mdevlookup(int devid); extern void mdevremoveall(void); +extern void di_root_ramfile(IORegistryEntry * entry); kern_return_t IOKitBSDInit( void ) @@ -542,7 +543,7 @@ kern_return_t IOFindBSDRoot( char * rootName, unsigned int rootNameSize, UInt32 flags = 0; int mnr, mjr; bool findHFSChild = false; - char * mediaProperty = 0; + const char * mediaProperty = 0; char * rdBootVar; enum { kMaxPathBuf = 512, kMaxBootVar = 128 }; char * str; @@ -571,6 +572,7 @@ kern_return_t IOFindBSDRoot( char * rootName, unsigned int rootNameSize, do { if( (regEntry = IORegistryEntry::fromPath( "/chosen", gIODTPlane ))) { + di_root_ramfile(regEntry); data = OSDynamicCast(OSData, regEntry->getProperty( "root-matching" )); if (data) { matching = OSDynamicCast(OSDictionary, OSUnserializeXML((char *)data->getBytesNoCopy())); @@ -917,7 +919,7 @@ kern_return_t IOBSDGetPlatformUUID( uuid_t uuid, mach_timespec_t timeout ) IOService * resources; OSString * string; - resources = IOService::waitForService( IOService::resourceMatching( kIOPlatformUUIDKey ), &timeout ); + resources = IOService::waitForService( IOService::resourceMatching( kIOPlatformUUIDKey ), ( timeout.tv_sec || timeout.tv_nsec ) ? 
&timeout : 0 ); if ( resources == 0 ) return KERN_OPERATION_TIMED_OUT; string = ( OSString * ) IOService::getPlatform( )->getProvider( )->getProperty( kIOPlatformUUIDKey ); diff --git a/iokit/conf/MASTER b/iokit/conf/MASTER index 0cb4dbb61..f1d0f0648 100644 --- a/iokit/conf/MASTER +++ b/iokit/conf/MASTER @@ -60,6 +60,7 @@ ident IOKIT options HIBERNATION # system hibernation # options KERNOBJC # Objective-C implementation # options IOKITCPP # C++ implementation # +options IOKITSTATS # IOKit statistics # options KDEBUG # kernel tracing # options NETWORKING # kernel networking # options CRYPTO # want crypto code # @@ -67,7 +68,6 @@ options CONFIG_DTRACE # enable dtrace # options CONFIG_SLEEP # # - #makeoptions LIBDRIVER = "libDriver_kern.o" # #makeoptions LIBOBJC = "libkobjc.o" # @@ -93,4 +93,15 @@ options CONFIG_NO_KPRINTF_STRINGS # # options CONFIG_EMBEDDED # +# secure_kernel - secure kernel from user programs +options SECURE_KERNEL # + options MACH_ASSERT # + +# +# Note: MAC/AUDIT options must be set in all the bsd/conf, osfmk/conf, and +# security/conf MASTER files. 
+# +options CONFIG_MACF # Mandatory Access Control Framework + +options DEVELOPMENT # diff --git a/iokit/conf/MASTER.i386 b/iokit/conf/MASTER.i386 index 3574359ef..ab7ff3360 100644 --- a/iokit/conf/MASTER.i386 +++ b/iokit/conf/MASTER.i386 @@ -3,13 +3,12 @@ # Standard Apple Mac OS Configurations: # -------- ----- ------ --------------- # -# RELEASE = [ intel mach iokitcpp hibernation medium crypto config_dtrace config_sleep ] +# RELEASE = [ intel mach iokitcpp hibernation medium crypto config_dtrace config_sleep iokitstats ] # PROFILE = [ RELEASE profile ] # DEBUG = [ RELEASE debug ] # -# # EMBEDDED = [ intel mach iokitcpp hibernation no_kextd bsmall crypto ] -# DEVELOPMENT = [ EMBEDDED config_dtrace ] +# DEVELOPMENT = [ EMBEDDED config_dtrace development] # ###################################################################### diff --git a/iokit/conf/MASTER.ppc b/iokit/conf/MASTER.ppc deleted file mode 100644 index 2318d5561..000000000 --- a/iokit/conf/MASTER.ppc +++ /dev/null @@ -1,18 +0,0 @@ -# -###################################################################### -# -# Standard Apple MacOS X Configurations: -# -------- ---- -------- --------------- -# -# RELEASE = [ppc mach iokitcpp hibernation medium crypto config_dtrace] -# DEVELOPMENT = [ RELEASE ] -# PROFILE = [ RELEASE profile ] -# DEBUG = [ RELEASE debug] -# RELEASE_TRACE = [ RELEASE kdebug ] -# DEBUG_TRACE = [ DEBUG kdebug ] -# -###################################################################### - -machine "ppc" # -cpu "ppc" # - diff --git a/iokit/conf/MASTER.x86_64 b/iokit/conf/MASTER.x86_64 index 857357c71..781ce8c7c 100644 --- a/iokit/conf/MASTER.x86_64 +++ b/iokit/conf/MASTER.x86_64 @@ -3,13 +3,12 @@ # Standard Apple Mac OS Configurations: # -------- ----- ------ --------------- # -# RELEASE = [ intel mach iokitcpp hibernation medium crypto config_dtrace config_sleep ] +# RELEASE = [ intel mach iokitcpp hibernation medium crypto config_dtrace config_sleep iokitstats ] # PROFILE = [ RELEASE profile 
] # DEBUG = [ RELEASE debug ] # -# # EMBEDDED = [ intel mach iokitcpp hibernation no_kextd bsmall crypto ] -# DEVELOPMENT = [ EMBEDDED ] +# DEVELOPMENT = [ EMBEDDED development ] # ###################################################################### diff --git a/iokit/conf/Makefile b/iokit/conf/Makefile index 750aadb65..7b37a4736 100644 --- a/iokit/conf/Makefile +++ b/iokit/conf/Makefile @@ -7,8 +7,7 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir include $(MakeInc_cmd) include $(MakeInc_def) -SETUP_SUBDIRS = \ - tools +SETUP_SUBDIRS = COMP_SUBDIRS = @@ -24,30 +23,24 @@ else export COMPOBJROOT=$(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)/$(COMPONENT) endif -$(COMPOBJROOT)/doconf: - @make build_setup +MASTER_CPU_PER_SOC = $(SOURCE)/MASTER.$(ARCH_CONFIG_LC).$(MACHINE_CONFIG_LC) $(COMPOBJROOT)/$(IOKIT_KERNEL_CONFIG)/Makefile : $(SOURCE)/MASTER \ $(SOURCE)/MASTER.$(ARCH_CONFIG_LC) \ $(SOURCE)/Makefile.template \ $(SOURCE)/Makefile.$(ARCH_CONFIG_LC) \ $(SOURCE)/files \ - $(SOURCE)/files.$(ARCH_CONFIG_LC) \ - $(COMPOBJROOT)/doconf + $(SOURCE)/files.$(ARCH_CONFIG_LC) $(_v)(doconf_target=$(addsuffix /conf, $(TARGET)); \ $(MKDIR) $${doconf_target}; \ cd $${doconf_target}; \ rm -f $(notdir $?); \ cp $? 
$${doconf_target}; \ - $(COMPOBJROOT)/doconf -c -cpu $(ARCH_CONFIG_LC) -d $(TARGET)/$(IOKIT_KERNEL_CONFIG) $(IOKIT_KERNEL_CONFIG); \ + if [ -f $(MASTER_CPU_PER_SOC) ]; then cp $(MASTER_CPU_PER_SOC) $${doconf_target}; fi; \ + $(SRCROOT)/SETUP/config/doconf -c -cpu $(ARCH_CONFIG_LC) -soc $(MACHINE_CONFIG_LC) -d $(TARGET)/$(IOKIT_KERNEL_CONFIG) $(IOKIT_KERNEL_CONFIG); \ ); -.ORDER: $(COMPOBJROOT)/$(IOKIT_KERNEL_CONFIG)/Makefile - -do_setup_conf: $(COMPOBJROOT)/doconf \ - $(COMPOBJROOT)/$(IOKIT_KERNEL_CONFIG)/Makefile - -do_all: do_setup_conf +do_all: $(COMPOBJROOT)/$(IOKIT_KERNEL_CONFIG)/Makefile $(_v)next_source=$(subst conf/,,$(SOURCE)); \ ${MAKE} -C $(COMPOBJROOT)/$(IOKIT_KERNEL_CONFIG) \ MAKEFILES=$(TARGET)/$(IOKIT_KERNEL_CONFIG)/Makefile \ diff --git a/iokit/conf/Makefile.i386 b/iokit/conf/Makefile.i386 index 43a6e5b4f..8842b32d7 100644 --- a/iokit/conf/Makefile.i386 +++ b/iokit/conf/Makefile.i386 @@ -2,21 +2,9 @@ #BEGIN Machine dependent Makefile fragment for i386 ###################################################################### -# Enable -Werror for i386 builds -CFLAGS+= $(WERROR) -CWARNFLAGS= $(filter-out -Wbad-function-cast, $(CWARNFLAGS_STD)) - -# Objects that don't compile cleanly: -#OBJS_NO_WERROR= \ - -OBJS_WERROR=$(filter-out $(OBJS_NO_WERROR),$(OBJS)) - -$(OBJS_WERROR): WERROR=-Werror - # Files that must go in the __HIB segment: UNCONFIGURED_HIB_FILES= \ - IOHibernateRestoreKernel.o \ - WKdmDecompress.o + IOHibernateRestoreKernel.o HIB_FILES=$(filter $(UNCONFIGURED_HIB_FILES),$(OBJS)) diff --git a/iokit/conf/Makefile.ppc b/iokit/conf/Makefile.ppc deleted file mode 100644 index c794da174..000000000 --- a/iokit/conf/Makefile.ppc +++ /dev/null @@ -1,27 +0,0 @@ -###################################################################### -#BEGIN Machine dependent Makefile fragment for ppc -###################################################################### - -# Enable -Werror for ppc builds -CFLAGS+= $(WERROR) -CWARNFLAGS= $(filter-out 
-Wbad-function-cast, $(CWARNFLAGS_STD)) - -# Objects that don't compile cleanly: -OBJS_NO_WERROR= \ - AppleMacIO.cpo - -OBJS_WERROR=$(filter-out $(OBJS_NO_WERROR),$(OBJS)) - -$(OBJS_WERROR): WERROR=-Werror - -# Files that must go in the __HIB segment: -UNCONFIGURED_HIB_FILES= \ - IOHibernateRestoreKernel.o \ - WKdmDecompress.o - -HIB_FILES=$(filter $(UNCONFIGURED_HIB_FILES),$(OBJS)) - -###################################################################### -#END Machine dependent Makefile fragment for ppc -###################################################################### - diff --git a/iokit/conf/Makefile.template b/iokit/conf/Makefile.template index 55d99d413..96fe217a1 100644 --- a/iokit/conf/Makefile.template +++ b/iokit/conf/Makefile.template @@ -26,15 +26,13 @@ include $(MakeInc_def) # # XXX: CFLAGS # -CFLAGS+= -imacros meta_features.h -DKERNEL -DDRIVER_PRIVATE \ - -Wall -fno-common \ +CFLAGS+= -include meta_features.h -DDRIVER_PRIVATE \ -DIOMATCHDEBUG=1 -DIOALLOCDEBUG=1 \ - -imacros meta_features.h $(CFLAGS_INLINE_CONFIG) + -include meta_features.h $(CFLAGS_INLINE_CONFIG) #-DIOKITDEBUG=-1 -CWARNFLAGS += -Wno-unused-parameter -Wno-redundant-decls -Wno-nested-externs -Wno-write-strings -MWARNFLAGS += -Wno-unused-parameter -Wno-redundant-decls -Wno-nested-externs -Wno-write-strings -CXXWARNFLAGS += -Wno-unused-parameter -Wno-redundant-decls -Wno-write-strings -Wno-cast-qual -Wno-shadow +CWARNFLAGS = $(CWARNFLAGS_STD) -Wno-unused-parameter +CXXWARNFLAGS = $(CXXWARNFLAGS_STD) -Wno-unused-parameter -Wno-cast-qual -Wno-shadow CFLAGS_RELEASE += -DIOASSERT=0 CFLAGS_DEBUG += -DIOASSERT=1 @@ -85,22 +83,24 @@ ${OBJS}: ${OBJSDEPS} LDOBJS = $(OBJS) -$(COMPONENT).o: $(LDOBJS) - $(_v)for hib_file in ${HIB_FILES}; \ +$(COMPONENT).filelist: $(LDOBJS) + $(_v)if [ $(BUILD_MACHO_OBJ) -eq 1 ]; then \ + for hib_file in ${HIB_FILES}; \ do \ $(SEG_HACK) __HIB $${hib_file} -o $${hib_file}__; \ mv $${hib_file}__ $${hib_file} ; \ - done; + done; \ + fi @echo LDFILELIST $(COMPONENT) 
$(_v)( for obj in ${LDOBJS}; do \ echo $(TARGET)$(COMP_OBJ_DIR)/$(KERNEL_CONFIG)/$${obj}; \ - done; ) > $(COMPONENT).o + done; ) > $(COMPONENT).filelist do_depend: do_all $(_v)${MD} -u Makedep -f -d `ls *.d` -do_all: $(COMPONENT).o +do_all: $(COMPONENT).filelist do_build_all: do_depend diff --git a/iokit/conf/Makefile.x86_64 b/iokit/conf/Makefile.x86_64 index 09b0c0b71..463de5a20 100644 --- a/iokit/conf/Makefile.x86_64 +++ b/iokit/conf/Makefile.x86_64 @@ -1,22 +1,10 @@ ###################################################################### #BEGIN Machine dependent Makefile fragment for x86_64 ###################################################################### - -# Enable -Werror for x86_64 builds -CFLAGS+= $(WERROR) -CWARNFLAGS= $(filter-out -Wbad-function-cast, $(CWARNFLAGS_STD)) - -# Objects that don't compile cleanly: -#OBJS_NO_WERROR= \ - -OBJS_WERROR=$(filter-out $(OBJS_NO_WERROR),$(OBJS)) - -$(OBJS_WERROR): WERROR=-Werror # Files that must go in the __HIB segment: UNCONFIGURED_HIB_FILES= \ - IOHibernateRestoreKernel.o \ - WKdmDecompress.o + IOHibernateRestoreKernel.o HIB_FILES=$(filter $(UNCONFIGURED_HIB_FILES),$(OBJS)) diff --git a/iokit/conf/files b/iokit/conf/files index 18d44275a..532732d3b 100644 --- a/iokit/conf/files +++ b/iokit/conf/files @@ -10,8 +10,6 @@ OPTIONS/mach_assert optional mach_assert # libIOKit -iokit/Kernel/WKdmCompress.c optional hibernation -iokit/Kernel/WKdmDecompress.c optional hibernation iokit/Kernel/IOHibernateIO.cpp optional hibernation iokit/Kernel/IOHibernateRestoreKernel.c optional hibernation @@ -78,6 +76,8 @@ iokit/Kernel/IOSharedDataQueue.cpp optional iokitcpp # iokit/Tests/TestContainers.cpp optional iokitcpp # iokit/Tests/TestCollections.cpp optional iokitcpp +iokit/Kernel/IOStatistics.cpp optional iokitcpp + iokit/Kernel/IOStringFuncs.c standard # Property tables for kernel-linked objects diff --git a/iokit/conf/files.i386 b/iokit/conf/files.i386 index 2193ae37a..17c544f86 100644 --- a/iokit/conf/files.i386 +++ 
b/iokit/conf/files.i386 @@ -5,4 +5,7 @@ iokit/Kernel/i386/IOSharedLock.s standard iokit/Kernel/i386/IOAsmSupport.s standard # Power Domains -iokit/Kernel/IOPMrootDomain.cpp optional iokitcpp +iokit/Kernel/IOPMrootDomain.cpp optional iokitcpp + +# Key Store helper +iokit/Kernel/i386/IOKeyStoreHelper.cpp standard diff --git a/iokit/conf/files.ppc b/iokit/conf/files.ppc deleted file mode 100644 index 8d60fc863..000000000 --- a/iokit/conf/files.ppc +++ /dev/null @@ -1,20 +0,0 @@ - -iokit/Kernel/ppc/IOAsmSupport.s standard -iokit/Kernel/ppc/IODBDMA.cpp optional iokitcpp -iokit/Kernel/ppc/IOSharedLock.s standard - -iokit/Families/IONVRAM/IONVRAMController.cpp optional iokitcpp -iokit/Drivers/platform/drvAppleNVRAM/AppleNVRAM.cpp optional iokitcpp - -# Apple Platform Expert -iokit/Drivers/platform/drvApplePlatformExpert/ApplePlatformExpert.cpp optional iokitcpp -iokit/Drivers/platform/drvApplePlatformExpert/AppleCPU.cpp optional iokitcpp - -# Power Domains -iokit/Kernel/IOPMrootDomain.cpp optional iokitcpp - -# Apple Mac-IO driver -iokit/Drivers/platform/drvAppleMacIO/AppleMacIO.cpp optional iokitcpp - -# Apple NMI driver -iokit/Drivers/platform/drvAppleNMI/AppleNMI.cpp optional iokitcpp diff --git a/iokit/conf/files.x86_64 b/iokit/conf/files.x86_64 index c81cf1178..9d6ca13ee 100644 --- a/iokit/conf/files.x86_64 +++ b/iokit/conf/files.x86_64 @@ -5,4 +5,7 @@ iokit/Kernel/x86_64/IOSharedLock.s standard iokit/Kernel/x86_64/IOAsmSupport.s standard # Power Domains -iokit/Kernel/IOPMrootDomain.cpp optional iokitcpp +iokit/Kernel/IOPMrootDomain.cpp optional iokitcpp + +# Key Store helper +iokit/Kernel/i386/IOKeyStoreHelper.cpp standard diff --git a/iokit/conf/tools/Makefile b/iokit/conf/tools/Makefile deleted file mode 100644 index 4f9ccd553..000000000 --- a/iokit/conf/tools/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export 
MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -SETUP_SUBDIRS = doconf - -COMP_SUBDIRS = doconf - -INST_SUBDIRS = \ - - -setup_build_all: - @echo "[ $(SOURCE) ] make setup_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - -do_build_all: - @echo "[ $(SOURCE) ] make do_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - -setup_build_install: - @echo "[ $(SOURCE) ] make setup_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - -do_build_install: - @echo "[ $(SOURCE) ] make do_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - -include $(MakeInc_rule) -include $(MakeInc_dir) - - diff --git a/iokit/conf/tools/doconf/Makefile b/iokit/conf/tools/doconf/Makefile deleted file mode 100644 index aa55a9419..000000000 --- a/iokit/conf/tools/doconf/Makefile +++ /dev/null @@ -1,47 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -COMP_SUBDIRS = \ - -INST_SUBDIRS = \ - - -# -# Who and where -# -BINDIR= -ifneq ($(MACHINE_CONFIG), DEFAULT) -DSTDIR= $(strip $(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)_$(MACHINE_CONFIG)/$(COMPONENT)/) -else -DSTDIR= $(strip $(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)/$(COMPONENT)/) -endif -PROGRAM= $(DSTDIR)doconf - -# -# How to install it -# -IFLAGS= -c -m 555 - -$(PROGRAM): $(DSTDIR)% : $(SOURCE)%.csh - @-$(RM) $(RMFLAGS) $(notdir $(PROGRAM)).VERS - @sed -e "s/#PROGRAM.*/#`vers_string $(notdir $(PROGRAM))`/" \ - < $< >$(notdir $(PROGRAM)).VERS; - @install $(IFLAGS) $(notdir $(PROGRAM)).VERS $(PROGRAM); - @-$(RM) $(RMFLAGS) $(notdir $(PROGRAM)).VERS; - -do_build_setup: $(PROGRAM) - -do_build_all: - -setup_build_install: - -do_build_install: - -include $(MakeInc_rule) -include $(MakeInc_dir) diff --git a/iokit/conf/tools/doconf/doconf.csh 
b/iokit/conf/tools/doconf/doconf.csh deleted file mode 100755 index 6fedb4786..000000000 --- a/iokit/conf/tools/doconf/doconf.csh +++ /dev/null @@ -1,321 +0,0 @@ -#!/bin/csh -f -set path = ($path .) -###################################################################### -# HISTORY -# 1-Dec-87 Michael Young (mwyoung) at Carnegie-Mellon University -# Added "-verbose" switch, so this script produces no output -# in the normal case. -# -# 10-Oct-87 Mike Accetta (mja) at Carnegie-Mellon University -# Flushed cmu_*.h and spin_locks.h -# [ V5.1(XF18) ] -# -# 6-Apr-87 Avadis Tevanian (avie) at Carnegie-Mellon University -# Use MASTER.local and MASTER..local for generation of -# configuration files in addition to MASTER and MASTER.. -# -# 25-Mar-87 Mike Accetta (mja) at Carnegie-Mellon University -# Removed use of obsolete wb_*.h files when building the feature -# list; modified to save the previous configuration file and -# display the differences between it and the new file. -# [ V5.1(F8) ] -# -# 25-Mar-87 Avadis Tevanian (avie) at Carnegie-Mellon University -# If there is no /etc/machine just print out a message telling -# user to use the -cpu option. I thought this script was supposed -# to work even without a /etc/machine, but it doesn't... and this -# is the easiest way out. -# -# 13-Mar-87 Mike Accetta (mja) at Carnegie-Mellon University -# Added "romp_fpa.h" file to extra features for the RT. -# [ V5.1(F7) ] -# -# 11-Mar-87 Mike Accetta (mja) at Carnegie-Mellon University -# Updated to maintain the appropriate configuration features file -# in the "machine" directory whenever the corresponding -# configuration is generated. This replaces the old mechanism of -# storing this directly in the file since it was -# machine dependent and also precluded building programs for more -# than one configuration from the same set of sources. 
-# [ V5.1(F6) ] -# -# 21-Feb-87 Mike Accetta (mja) at Carnegie-Mellon University -# Fixed to require wired-in cpu type names for only those -# machines where the kernel name differs from that provided by -# /etc/machine (i.e. IBMRT => ca and SUN => sun3); updated to -# permit configuration descriptions in both machine indepedent -# and dependent master configuration files so that attributes can -# be grouped accordingly. -# [ V5.1(F3) ] -# -# 17-Jan-87 Mike Accetta (mja) at Carnegie-Mellon University -# Updated to work from any directory at the same level as -# "conf"; generate configuration from both MASTER and -# MASTER. files; added -cpu switch. -# [ V5.1(F1) ] -# -# 18-Aug-86 Mike Accetta (mja) at Carnegie-Mellon University -# Added -make switch and changed meaning of -config; upgraded to -# allow multiple attributes per configuration and to define -# configurations in terms of these attributes within MASTER. -# -# 14-Apr-83 Mike Accetta (mja) at Carnegie-Mellon University -# Added -config switch to only run /etc/config without -# "make depend" and "make". 
-# -###################################################################### - -set prog=$0 -set prog=$prog:t -set nonomatch -set OBJDIR=../BUILD -if ("`/usr/bin/uname`" == "Rhapsody" ) then -set CONFIG_DIR=/usr/local/bin -else -set CONFIG_DIR=/usr/bin -endif - -unset domake -unset doconfig -unset beverbose -unset MACHINE -unset profile - -while ($#argv >= 1) - if ("$argv[1]" =~ -*) then - switch ("$argv[1]") - case "-c": - case "-config": - set doconfig - breaksw - case "-m": - case "-make": - set domake - breaksw - case "-cpu": - if ($#argv < 2) then - echo "${prog}: missing argument to ${argv[1]}" - exit 1 - endif - set MACHINE="$argv[2]" - shift - breaksw - case "-d": - if ($#argv < 2) then - echo "${prog}: missing argument to ${argv[1]}" - exit 1 - endif - set OBJDIR="$argv[2]" - shift - breaksw - case "-verbose": - set beverbose - breaksw - case "-p": - case "-profile": - set profile - breaksw - default: - echo "${prog}: ${argv[1]}: unknown switch" - exit 1 - breaksw - endsw - shift - else - break - endif -end - -if ($#argv == 0) set argv=(GENERIC) - -if (! $?MACHINE) then - if (-d /NextApps) then - set MACHINE=`hostinfo | awk '/MC680x0/ { printf("m68k") } /MC880x0/ { printf("m88k") }'` - endif -endif - -if (! $?MACHINE) then - if (-f /etc/machine) then - set MACHINE="`/etc/machine`" - else - echo "${prog}: no /etc/machine, specify machine type with -cpu" - echo "${prog}: e.g. 
${prog} -cpu VAX CONFIGURATION" - exit 1 - endif -endif - -set FEATURES_EXTRA= - -switch ("$MACHINE") - case IBMRT: - set cpu=ca - set ID=RT - set FEATURES_EXTRA="romp_dualcall.h romp_fpa.h" - breaksw - case SUN: - set cpu=sun3 - set ID=SUN3 - breaksw - default: - set cpu=`echo $MACHINE | tr A-Z a-z` - set ID=`echo $MACHINE | tr a-z A-Z` - breaksw -endsw -set FEATURES=../h/features.h -set FEATURES_H=(cs_*.h mach_*.h net_*.h\ - cputypes.h cpus.h vice.h\ - $FEATURES_EXTRA) -set MASTER_DIR=../conf -set MASTER = ${MASTER_DIR}/MASTER -set MASTER_CPU=${MASTER}.${cpu} - -set MASTER_LOCAL = ${MASTER}.local -set MASTER_CPU_LOCAL = ${MASTER_CPU}.local -if (! -f $MASTER_LOCAL) set MASTER_LOCAL = "" -if (! -f $MASTER_CPU_LOCAL) set MASTER_CPU_LOCAL = "" - -if (! -d $OBJDIR) then - if ($?beverbose) then - echo "[ creating $OBJDIR ]" - endif - mkdir -p $OBJDIR -endif - -foreach SYS ($argv) - set SYSID=${SYS}_${ID} - set SYSCONF=$OBJDIR/config.$SYSID - set BLDDIR=$OBJDIR - if ($?beverbose) then - echo "[ generating $SYSID from $MASTER_DIR/MASTER{,.$cpu}{,.local} ]" - endif - echo +$SYS \ - | \ - cat $MASTER $MASTER_LOCAL $MASTER_CPU $MASTER_CPU_LOCAL - \ - $MASTER $MASTER_LOCAL $MASTER_CPU $MASTER_CPU_LOCAL \ - | \ - sed -n \ - -e "/^+/{" \ - -e "s;[-+];#&;gp" \ - -e 't loop' \ - -e ': loop' \ - -e 'n' \ - -e '/^#/b loop' \ - -e '/^$/b loop' \ - -e 's;^\([^#]*\).*#[ ]*<\(.*\)>[ ]*$;\2#\1;' \ - -e 't not' \ - -e 's;\([^#]*\).*;#\1;' \ - -e 't not' \ - -e ': not' \ - -e 's;[ ]*$;;' \ - -e 's;^\!\(.*\);\1#\!;' \ - -e 'p' \ - -e 't loop' \ - -e 'b loop' \ - -e '}' \ - -e "/^[^#]/d" \ - -e 's; ; ;g' \ - -e "s;^# *\([^ ]*\)[ ]*=[ ]*\[\(.*\)\].*;\1#\2;p" \ - | \ - awk '-F#' '\ -part == 0 && $1 != "" {\ - m[$1]=m[$1] " " $2;\ - next;\ -}\ -part == 0 && $1 == "" {\ - for (i=NF;i>1;i--){\ - s=substr($i,2);\ - c[++na]=substr($i,1,1);\ - a[na]=s;\ - }\ - while (na > 0){\ - s=a[na];\ - d=c[na--];\ - if (m[s] == "") {\ - f[s]=d;\ - } else {\ - nx=split(m[s],x," ");\ - for (j=nx;j>0;j--) {\ - 
z=x[j];\ - a[++na]=z;\ - c[na]=d;\ - }\ - }\ - }\ - part=1;\ - next;\ -}\ -part != 0 {\ - if ($1 != "") {\ - n=split($1,x,",");\ - ok=0;\ - for (i=1;i<=n;i++) {\ - if (f[x[i]] == "+") {\ - ok=1;\ - }\ - }\ - if (NF > 2 && ok == 0 || NF <= 2 && ok != 0) {\ - print $2; \ - }\ - } else { \ - print $2; \ - }\ -}\ -' >$SYSCONF.new - if (-z $SYSCONF.new) then - echo "${prog}: ${$SYSID}: no such configuration in $MASTER_DIR/MASTER{,.$cpu}" - rm -f $SYSCONF.new - endif - if (! -d $BLDDIR) then - if ($?beverbose) then - echo "[ creating $BLDDIR ]" - endif - mkdir -p $BLDDIR - endif -# -# These paths are used by config. -# -# "builddir" is the name of the directory where kernel binaries -# are put. It is a single path element, never absolute, and is -# always relative to "objectdir". "builddir" is used by config -# solely to determine where to put files created by "config" (e.g. -# the created Makefile and *.h's.) -# -# "objectdir" is the name of the directory which will hold "builddir". -# It is a path; if relative, it is relative to the current directory -# where config is run. It's sole use is to be prepended to "builddir" -# to indicate where config-created files are to be placed (see above). -# -# "sourcedir" is the location of the sources used to build the kernel. -# It is a path; if relative, it is relative to the directory specified -# by the concatenation of "objectdir" and "builddir" (i.e. where the -# kernel binaries are put). 
-# - echo 'builddir "."' >> $SYSCONF.new - set OBJRELDIR=`$RELPATH $OBJROOT $OBJDIR` - echo 'objectdir "'$OBJROOT'/'$OBJRELDIR'"' >> $SYSCONF.new - set SRCDIR=`dirname $SOURCE` - echo 'sourcedir "'$SRCROOT'"' >> $SYSCONF.new - if (-f $SYSCONF) then - diff $SYSCONF $SYSCONF.new - rm -f $SYSCONF.old - mv $SYSCONF $SYSCONF.old - endif - rm -f $SYSCONF - mv $SYSCONF.new $SYSCONF - if ($?doconfig) then - if ($?beverbose) then - echo "[ configuring $SYSID ]" - endif - if ($?profile) then - $CONFIG_DIR/config -c $MASTER_DIR -p $SYSCONF - else - $CONFIG_DIR/config -c $MASTER_DIR $SYSCONF - endif - endif - if ($?domake) then - if ($?beverbose) then - echo "[ making $SYSID ]" - endif - (cd $BLDDIR; make) - endif -end diff --git a/kgmacros b/kgmacros index 5c2205e24..edb1e35db 100644 --- a/kgmacros +++ b/kgmacros @@ -1,3 +1,4 @@ + # Kernel gdb macros # # These gdb macros should be useful during kernel development in @@ -38,9 +39,12 @@ document kgm | showallvm Display a summary listing of all the vm maps | showallvme Display a summary listing of all the vm map entries | showallipc Display a summary listing of all the ipc spaces +| showipcsummary Display a summary listing of the ipc spaces of all tasks | showallrights Display a summary listing of all the ipc rights -| showallkmods Display a summary listing of all the kernel modules -| showallbusyports Display a listing of all ports with unread messages +| showallkexts Display a summary listing of all loaded kexts (alias: showallkmods) +| showallknownkexts Display a summary listing of all kexts, loaded or not +| showallbusyports Display a listing of all ports with unread messages +| showallprocessors Display a listing of all psets and processors | | showallclasses Display info about all OSObject subclasses in the system | showobject Show info about an OSObject - its vtable ptr and retain count, & more info for simple container classes. 
@@ -82,13 +86,14 @@ document kgm | showprocfiles Given a proc_t pointer, display the list of open file descriptors | showproclocks Given a proc_t pointer, display the list of advisory file locks | zombproc Print out all procs in the zombie list +| showproctree Show all the processes in a hierarchical tree form | allproc Print out all process in the system not in the zombie list | zombstacks Print out all stacks of tasks that are exiting | | showinitchild Print out all processes in the system which are children of init process | -| showkmod Display info about a kernel module -| showkmodaddr Given an address, display the kernel module and offset +| showkext Display info about a kext (alias: showkmod) +| showkextaddr Given an address, display the kext and offset (alias: showkmodaddr) | | dumpcallqueue Dump out all the entries given a queue head | @@ -103,6 +108,9 @@ document kgm | switchtoctx Switch to different context | showuserstack Display numeric backtrace of the user stack for an | activation +| showtaskuserstacks Display user stacks for a specified task +| showuserregisters Display user registers for the specified thread +| showtaskuserregisters Display user registers for the specified task | | switchtouserthread Switch to the user context of the specified thread | resetstacks Return to the original kernel context @@ -145,6 +153,21 @@ document kgm | | inifa_showdbg Print the debug information of an IPv4 interface address | in6ifa_showdbg Print the debug information of an IPv6 interface address +| inm_showdbg Print the debug information of an IPv4 multicast address +| ifma_showdbg Print the debug information of a link multicast address +| ifpref_showdbg Print the debug information of an interface ref count +| +| ndpr_showdbg Print the debug information of a nd_prefix structure +| nddr_showdbg Print the debug information of a nd_defrouter structure +| +| imo_showdbg Print the debug information of a ip_moptions structure +| im6o_showdbg Print the debug information 
of a ip6_moptions structure +| +| inifa_trash Walk the list of trash in_ifaddr entries +| in6ifa_trash Walk the list of trash in6_ifaddr entries +| inm_trash Walk the list of trash in_multi entries +| in6m_trash Walk the list of trash in6_multi entries +| ifma_trash Walk the list of trash ifmultiaddr entries | | mbuf_walkpkt Walk the mbuf packet chain (m_nextpkt) | mbuf_walk Walk the mbuf chain (m_next) @@ -157,6 +180,9 @@ document kgm | mbuf_slabs Print all slabs in the group | mbuf_slabstbl Print slabs table | mbuf_stat Print extended mbuf allocator statistics +| mbuf_countchain Count the length of an mbuf chain +| mbuf_topleak Print the top suspected mbuf leakers +| mbuf_traceleak Print the leak information for a given leak address | | mcache_walkobj Walk the mcache object chain (obj_next) | mcache_stat Print all mcaches in the system @@ -188,8 +214,11 @@ document kgm | shownewvnodes Print the new vnode list | | ifconfig display ifconfig-like output +| showifnets show the list of attached and detached interfaces | showifaddrs show the list of addresses for the given ifp | showifmultiaddrs show the list of multicast addresses for the given ifp +| showinmultiaddrs show the list of IPv4 multicast addresses records +| showin6multiaddrs show the list of IPv6 multicast addresses records | | showsocket Display information about a socket | showprocsockets Given a proc_t pointer, display information about its sockets @@ -202,7 +231,7 @@ document kgm | show_rt_inet Display the IPv4 routing table | show_rt_inet6 Display the IPv6 routing table | -| showallpmworkqueues Display info about all IOPMWorkQueue objects +| showpmworkqueue Display the IOPMWorkQueue object | showregistrypmstate Display power management state for all IOPower registry entries | showioservicepm Display the IOServicePM object | showstacksaftertask showallstacks starting after a given task @@ -214,19 +243,39 @@ document kgm | showallgdbcorestacks Corefile equivalent of "showallgdbstacks" | kdp-reenter 
Schedule reentry into the debugger and continue. | kdp-reboot Restart remote target -| kdp-version Get KDP version number -| kdp-connect "shorthand" connection macro +| kdp-version Get KDP version number | | zstack Print zalloc caller stack (zone leak debugging) | findoldest Find oldest zone leak debugging record | countpcs Print how often a pc occurs in the zone leak log | +| showtopztrace Print the ztrace with the most outstanding allocated memory +| showztrace Print a backtrace record given its index +| showzalloc Print an allocation record + stacktrace at index +| showztraceaddr Print a backtrace record given its address +| showztracesabove Print all the backtrace records with a size bigger than X +| showzstacktrace Symbolicate and print a stored OSBacktrace +| +| showztraces Finds all in-use traces in the ztraces table +| showzallocs Finds all in-use allocations in the zallocs table +| showzstats Shows the statistics gathered about the hash tables +| +| showzallocsfortrace Print all the allocations that refer to a trace +| showztracehistogram Prints a histogram of the ztraces table +| showzallochistogram Prints a histogram of the zallocs table +| | pmap_walk Perform a page-table walk | pmap_vtop Translate a virtual address to physical address | -| showuserlibraries Show binary images known by dyld in the target task +| showuserdyldinfo Show dyld information and error messages +| in the target task +| showuserlibraries Show binary images known by dyld in the +| target task +| showallvmstats Prints a summary of vm statistics in a table format +| memstats Displays memory statistics in a table format | -| showthreadfortid Displays the address of the thread structure for a given thread_id value. +| showthreadfortid Displays the address of the thread structure +| for a given thread_id value. 
| | strcmp_nomalloc A version of strcmp that avoids the use of malloc | through the use of encoded strings created via @@ -252,6 +301,12 @@ document kgm | ioapic_write32 Write IOAPIC entry | ioapic_dump Dump IOAPIC entries | +| showallproviders Display summary listing of all dtrace_providers +| showallmodctls Display summary listing of all dtrace modctls +| showmodctl Display info about a dtrace modctl +| showfbtprobe Display info about an fbt probe given an id (traverses fbt_probetab) +| processortimers Display all processor timers, noting any inconsistencies +| | Type "help " for more specific help on a particular macro. | Type "show user " to see what the macro is really doing. end @@ -304,6 +359,10 @@ set $kgm_kdp_pkt_input_off = $kgm_kdp_pkt_data_len + 4 set $kgm_kdp_pkt_hostreboot = 0x13 set $kgm_kdp_pkt_hdr_size = 8 + +set $kgm_readphys_force_kdp = 0 +set $kgm_readphys_force_physmap = 0 + set $kgm_lcpu_self = 0xFFFE set $kgm_reg_depth = 0 @@ -366,8 +425,18 @@ define showptrhdrpad end end +# Print a userspace pointer, using $kgm_tasp +define showuserptr + set $kgm_userptr_task_64 = ( $kgm_taskp->taskFeatures[0] & 0x80000000) + if $kgm_userptr_task_64 + printf "0x%016llx", $arg0 + else + printf "0x%08x", $arg0 + end +end + define showkmodheader - printf "kmod " + printf "kmod_info " showptrhdrpad printf " address " showptrhdrpad @@ -424,7 +493,6 @@ end define showkmodaddr showkmodaddrint $arg0 end - document showkmodaddr Syntax: (gdb) showkmodaddr | Given an address, print the offset and name for the kmod containing it @@ -436,7 +504,15 @@ define showkmod end document showkmod Syntax: (gdb) showkmod -| Routine to print info about a kernel module +| Routine to print info about a kext +end + +define showkext + showkmod $arg0 +end +document showkext +Syntax: (gdb) showkext +| Routine to print info about a kext end define showallkmods @@ -449,7 +525,108 @@ define showallkmods end document showallkmods Syntax: (gdb) showallkmods -| Routine to print a summary 
listing of all the kernel modules +| Routine to print a summary listing of all loaded kexts +end + +define showallkexts + showallkmods +end +document showallkexts +Syntax: (gdb) showallkexts +| Routine to print a summary listing of all loaded kexts +end + +# See OSKextVersion.c for the C code this is based on +# +set $KGM_OSKEXT_VERS_MAJ_MULT = 100000000 +set $KGM_OSKEXT_VERS_MIN_MULT = 1000000 +set $KGM_OSKEXT_VERS_REV_MULT = 10000 +set $KGM_OSKEXT_VERS_STAGE_MULT = 1000 + +define printoskextversion + set $vers_scratch = $arg0 + + if ($vers_scratch == -1) + printf "(invalid)" + else + + set $vers_major = $vers_scratch / $KGM_OSKEXT_VERS_MAJ_MULT + + set $vers_scratch = $vers_scratch - ($vers_major * $KGM_OSKEXT_VERS_MAJ_MULT) + set $vers_minor = $vers_scratch / $KGM_OSKEXT_VERS_MIN_MULT + + set $vers_scratch = $vers_scratch - ( $vers_minor * $KGM_OSKEXT_VERS_MIN_MULT) + set $vers_revision = $vers_scratch / $KGM_OSKEXT_VERS_REV_MULT + + set $vers_scratch = $vers_scratch - ( $vers_revision * $KGM_OSKEXT_VERS_REV_MULT) + set $vers_stage = $vers_scratch / $KGM_OSKEXT_VERS_STAGE_MULT + + set $vers_scratch = $vers_scratch - ( $vers_stage * $KGM_OSKEXT_VERS_STAGE_MULT) + set $vers_stagelevel = $vers_scratch + + printf "%d.%d", $vers_major, $vers_minor + if ($vers_revision > 0) + printf ".%d", $vers_revision + end + + if ($vers_stage == 1) + printf "d" + end + if ($vers_stage == 3) + printf "a" + end + if ($vers_stage == 5) + printf "b" + end + if ($vers_stage == 7) + printf "fc" + end + if ($vers_stage == 1 || $vers_stage == 3 || $vers_stage == 5 || $vers_stage == 7) + printf "%d", $vers_stagelevel + end + end +end + +define showallknownkexts + set $kext_count = sKextsByID->count + set $kext_index = 0 + printf "%d kexts in sKextsByID:\n", $kext_count + + printf "OSKext * " + showptrhdrpad + printf "load_addr " + showptrhdrpad + + printf " id name (version)\n" + + while $kext_index < $kext_count + set $kext_id = sKextsByID->dictionary[$kext_index].key->string + set 
$oskext = (OSKext *)sKextsByID->dictionary[$kext_index].value + + showptr $oskext + printf " " + + if ($oskext->flags.loaded) + showptr $oskext->kmod_info + printf " " + printf "%3d", $oskext->loadTag + else + showptrhdrpad + printf " -------- " + printf " " + printf " --" + end + printf " " + + printf "%.64s (", $kext_id + printoskextversion (uint64_t)$oskext->version + printf ")\n" + set $kext_index = $kext_index + 1 + end +end +document showallknownkexts +Syntax: (gdb) showallknownkexts +| Routine to print a summary listing of all kexts, loaded or not end define showactheader @@ -477,7 +654,7 @@ define showactint else printf " " end - printf " %7ld ", $kgm_thread.thread_id + printf " 0x%llx ", $kgm_thread.thread_id showptr $kgm_thread.last_processor printf " %3d ", $kgm_thread.sched_pri if ($kgm_thread.uthread != 0) @@ -488,18 +665,36 @@ define showactint else printf " " end - if ($kgm_uthread->uu_iopol_disk == 1) - printf "NORM " - set $kgm_printed = 1 + set $diskpolicy = 0 + if ($kgm_thread->ext_actionstate.hw_disk != 0) + set $diskpolicy = $kgm_thread->ext_actionstate.hw_disk + else + if ($kgm_thread->actionstate.hw_disk != 0) + set $diskpolicy = $kgm_thread->actionstate.hw_disk + end + end + if ($kgm_thread->ext_actionstate.hw_bg != 0) + set $diskpolicy = 5 end - if ($kgm_uthread->uu_iopol_disk == 2) + if ($kgm_thread->actionstate.hw_bg != 0) + set $diskpolicy = 4 + end + if ($diskpolicy == 2) printf "PASS " set $kgm_printed = 1 end - if ($kgm_uthread->uu_iopol_disk == 3) + if ($diskpolicy == 3) printf "THROT " set $kgm_printed = 1 end + if ($diskpolicy == 4) + printf "BG_THRT " + set $kgm_printed = 1 + end + if ($diskpolicy == 5) + printf "EBG_THRT" + set $kgm_printed = 1 + end if ($kgm_printed == 0) printf " " end @@ -544,8 +739,25 @@ define showactint end end end + if ($kgm_thread.uthread != 0) + set $kgm_uthread = (struct uthread *)$kgm_thread.uthread + if ($kgm_uthread->pth_name && $kgm_uthread->pth_name[0]) + printf "\n\t\tThread Name: %s", 
$kgm_uthread->pth_name + end + end if $arg1 != 0 if ($kgm_thread.kernel_stack != 0) + if ($kgm_thread.uthread != 0) + printf "\n " + set $kgm_uthread = (struct uthread *)$kgm_thread.uthread + if ($kgm_uthread->uu_kwe.kwe_kwqqueue != 0) + set $kwq = (ksyn_wait_queue_t)$kgm_uthread->uu_kwe.kwe_kwqqueue + printf " kwq_lockcount:0x%x; kwq_retval:0x%x", $kgm_uthread->uu_kwe.kwe_lockseq, $kgm_uthread->uu_kwe.kwe_psynchretval + printf "\n " + show_kwq $kwq + printf " " + end + end if ($kgm_thread.reserved_stack != 0) printf "\n " showptrhdrpad @@ -683,10 +895,17 @@ Syntax: (gdb) showallthreads | Routine to print out info about all threads in the system. end +define showprocessorint + set $kgm_processor_int = (struct processor *)$arg0 + printf "Processor " + showptr $kgm_processor_int + printf " State %d (cpu_id 0x%x)\n", ($kgm_processor_int)->state, ($kgm_processor_int)->cpu_id +end + define showcurrentthreads -set $kgm_prp = (struct processor *)processor_list + set $kgm_prp = (struct processor *)processor_list while $kgm_prp != 0 - printf "Processor 0x%08x State %d (cpu_id %x)\n", $kgm_prp, ($kgm_prp)->state, ($kgm_prp)->cpu_id + showprocessorint $kgm_prp if ($kgm_prp)->active_thread != 0 set $kgm_actp = ($kgm_prp)->active_thread showtaskheader @@ -703,15 +922,192 @@ Syntax: (gdb) showcurrentthreads | Routine to print out info about the thread running on each cpu. 
end + +define _showrunqint + set $kgm_runq = (struct run_queue *)$arg0 + + printf " Priority Run Queue Info: Count %d\n", $kgm_runq->count + set $kgm_runq_queue_i = 0 + set $kgm_runq_queue_count = sizeof($kgm_runq->queues)/sizeof($kgm_runq->queues[0]) + while $kgm_runq->count && $kgm_runq_queue_i < $kgm_runq_queue_count + set $kgm_runq_queue_head = &$kgm_runq->queues[$kgm_runq_queue_i] + set $kgm_runq_queue_p = $kgm_runq_queue_head->next + if $kgm_runq_queue_p != $kgm_runq_queue_head + set $kgm_runq_queue_this_count = 0 + while $kgm_runq_queue_p != $kgm_runq_queue_head + set $kgm_runq_queue_this_count = $kgm_runq_queue_this_count + 1 + showtask ((thread_t)$kgm_runq_queue_p)->task + showactstack $kgm_runq_queue_p + set $kgm_runq_queue_p = $kgm_runq_queue_p->next + end + printf " Queue Priority %3d [", $kgm_runq_queue_i + showptr $kgm_runq_queue_head + printf "] Count %d\n", $kgm_runq_queue_this_count + end + set $kgm_runq_queue_i = $kgm_runq_queue_i + 1 + end + +end + +define _showgrrrint + set $kgm_grrr_runq = $arg0 + + printf " GRRR Info: Count %d Weight %d Current Group ", $kgm_grrr_runq->count, $kgm_grrr_runq->weight + showptr $kgm_grrr_runq->current_group + printf "\n" + set $kgm_grrr_group_i = 0 + set $kgm_grrr_group_count = sizeof($kgm_grrr_runq->groups)/sizeof($kgm_grrr_runq->groups[0]) + while $kgm_grrr_runq->count && $kgm_grrr_group_i < $kgm_grrr_group_count + set $kgm_grrr_group = &$kgm_grrr_runq->groups[$kgm_grrr_group_i] + if $kgm_grrr_group->count > 0 + printf " Group %3d [", $kgm_grrr_group->index + showptr $kgm_grrr_group + printf "] Count %d Weight %d\n", $kgm_grrr_group->count, $kgm_grrr_group->weight + set $kgm_grrr_group_client_head = &$kgm_grrr_group->clients + set $kgm_grrr_group_client = $kgm_grrr_group_client_head->next + while $kgm_grrr_group_client != $kgm_grrr_group_client_head + # showtask ((thread_t)$kgm_grrr_group_client)->task + # showactstack $kgm_grrr_group_client + set $kgm_grrr_group_client = $kgm_grrr_group_client->next + end + 
end + set $kgm_grrr_group_i = $kgm_grrr_group_i + 1 + end +end + +define showallprocessors + set $kgm_pset = &pset0 + + set $kgm_show_grrr = 0 + set $kgm_show_priority_runq = 0 + set $kgm_show_priority_pset_runq = 0 + set $kgm_show_fairshare_grrr = 0 + set $kgm_show_fairshare_list = 0 + + if _sched_enum == 1 + set $kgm_show_priority_runq = 1 + set $kgm_show_fairshare_list = 1 + end + if _sched_enum == 2 + set $kgm_show_priority_pset_runq = 1 + set $kgm_show_fairshare_list = 1 + end + if _sched_enum == 4 + set $kgm_show_grrr = 1 + set $kgm_show_fairshare_grrr = 1 + end + if _sched_enum == 5 + set $kgm_show_priority_runq = 1 + set $kgm_show_fairshare_list = 1 + end + if _sched_enum == 6 + set $kgm_show_priority_pset_runq = 1 + set $kgm_show_fairshare_list = 1 + end + + while $kgm_pset != 0 + printf "Processor Set " + showptr $kgm_pset + printf " Count %d (cpu_id 0x%x-0x%x)\n", ($kgm_pset)->cpu_set_count, ($kgm_pset)->cpu_set_low, ($kgm_pset)->cpu_set_hi + printf " Active Processors:\n" + set $kgm_active_queue_head = &($kgm_pset)->active_queue + set $kgm_active_elt = $kgm_active_queue_head->next + while $kgm_active_elt != $kgm_active_queue_head + set $kgm_processor = (processor_t)$kgm_active_elt + printf " " + showprocessorint $kgm_processor + + if $kgm_show_priority_runq + set $kgm_runq = &$kgm_processor->runq + _showrunqint $kgm_runq + end + if $kgm_show_grrr + set $kgm_grrr_runq = &$kgm_processor->grrr_runq + _showgrrrint $kgm_grrr_runq + end + + if $kgm_processor->processor_meta != 0 && $kgm_processor->processor_meta->primary == $kgm_processor + set $kgm_processor_meta_idle_head = &$kgm_processor->processor_meta->idle_queue + set $kgm_processor_meta_idle = $kgm_processor_meta_idle_head->next + while $kgm_processor_meta_idle != $kgm_processor_meta_idle_head + printf " Idle Meta Processor: " + showprocessorint $kgm_processor_meta_idle + set $kgm_processor_meta_idle = $kgm_processor_meta_idle->next + end + end + + set $kgm_active_elt = $kgm_active_elt->next + end + 
printf " Idle Processors:\n" + set $kgm_idle_queue_head = &($kgm_pset)->idle_queue + set $kgm_idle_elt = $kgm_idle_queue_head->next + while $kgm_idle_elt != $kgm_idle_queue_head + set $kgm_processor = (processor_t)$kgm_idle_elt + printf " " + showprocessorint $kgm_processor + + if $kgm_processor->processor_meta != 0 && $kgm_processor->processor_meta->primary == $kgm_processor + set $kgm_processor_meta_idle_head = &$kgm_processor->processor_meta->idle_queue + set $kgm_processor_meta_idle = $kgm_processor_meta_idle_head->next + while $kgm_processor_meta_idle != $kgm_processor_meta_idle_head + printf " Idle Meta Processor: " + showprocessorint $kgm_processor_meta_idle + set $kgm_processor_meta_idle = $kgm_processor_meta_idle->next + end + end + + set $kgm_idle_elt = $kgm_idle_elt->next + end + + if $kgm_show_priority_pset_runq + set $kgm_runq = &$kgm_pset->pset_runq + printf "\n" + _showrunqint $kgm_runq + end + set $kgm_pset = ($kgm_pset)->pset_list + end + + printf "\n" + printf "Realtime Queue Count %d\n", rt_runq.count + set $kgm_rt_runq_head = &rt_runq.queue + set $kgm_rt_runq = $kgm_rt_runq_head->next + while $kgm_rt_runq != $kgm_rt_runq_head + showtask ((thread_t)$kgm_rt_runq)->task + showact $kgm_rt_runq + set $kgm_rt_runq = $kgm_rt_runq->next + end + + printf "\n" + if $kgm_show_fairshare_list + printf "Fair Share Queue Count %d\n", fs_runq.count + set $kgm_fs_runq_head = &fs_runq.queue + set $kgm_fs_runq = $kgm_fs_runq_head->next + while $kgm_fs_runq != $kgm_fs_runq_head + showtask ((thread_t)$kgm_fs_runq)->task + showact $kgm_fs_runq + set $kgm_fs_runq = $kgm_fs_runq->next + end + end + if $kgm_show_fairshare_grrr + printf "Fair Share Queue Count %d\n", fs_grrr_runq.count + set $kgm_fs_grrr = &fs_grrr_runq + _showgrrrint $kgm_fs_grrr + end +end +document showallprocessors +Syntax: (gdb) showallprocessors +| Routine to print out info about all psets and processors +end + set $decode_wait_events = 0 define showallstacks set $kgm_head_taskp = &tasks set 
$kgm_taskp = (struct task *)($kgm_head_taskp->next) while $kgm_taskp != $kgm_head_taskp - showtaskheader + showtaskheader showtaskint $kgm_taskp set $kgm_head_actp = &($kgm_taskp->threads) - set $kgm_actp = (struct thread *)($kgm_taskp->threads.next) + set $kgm_actp = (struct thread *)($kgm_taskp->threads.next) while $kgm_actp != $kgm_head_actp showactheader if ($decode_wait_events > 0) @@ -719,11 +1115,14 @@ define showallstacks else showactint $kgm_actp 2 end - set $kgm_actp = (struct thread *)($kgm_actp->task_threads.next) - end + set $kgm_actp = (struct thread *)($kgm_actp->task_threads.next) + end printf "\n" - set $kgm_taskp = (struct task *)($kgm_taskp->tasks.next) + set $kgm_taskp = (struct task *)($kgm_taskp->tasks.next) end + + printf "\nZombie Processes:\n" + zombstacks end document showallstacks @@ -735,9 +1134,9 @@ Syntax: (gdb) showallstacks end define showcurrentstacks -set $kgm_prp = processor_list + set $kgm_prp = processor_list while $kgm_prp != 0 - printf "Processor 0x%08x State %d (cpu_id %x)\n", $kgm_prp, ($kgm_prp)->state, ($kgm_prp)->cpu_id + showprocessorint $kgm_prp if ($kgm_prp)->active_thread != 0 set $kgm_actp = ($kgm_prp)->active_thread showtaskheader @@ -784,7 +1183,7 @@ define showwaitqwaitercount set $kgm_wc_wqe = (WaitQueueElement *)$kgm_wc_linksp->next set $kgm_wc_count = 0 while ( (queue_entry_t)$kgm_wc_wqe != (queue_entry_t)$kgm_wc_linksp) - if ($kgm_wc_wqe->wqe_type != &_wait_queue_link) && ($kgm_wc_wqe->wqe_type != &_wait_queue_link_noalloc) + if ($kgm_wc_wqe->wqe_type != &_wait_queue_link) set $kgm_wc_count = $kgm_wc_count + 1 end set $kgm_wc_wqe = (WaitQueueElement *)$kgm_wc_wqe->wqe_links.next @@ -793,7 +1192,7 @@ define showwaitqwaitercount end define showwaitqmembercount - set $kgm_mc_waitqsetp = (struct wait_queue_set *)$arg0 + set $kgm_mc_waitqsetp = (WaitQueueSet*)$arg0 set $kgm_mc_setlinksp = &($kgm_mc_waitqsetp->wqs_setlinks) set $kgm_mc_wql = (WaitQueueLink *)$kgm_mc_setlinksp->next set $kgm_mc_count = 0 @@ -855,7 
+1254,7 @@ define showwaitqmemberof end define showwaitqmembers - set $kgm_ms_waitqsetp = (struct wait_queue_set *)$arg0 + set $kgm_ms_waitqsetp = (WaitQueueSet*)$arg0 set $kgm_ms_setlinksp = &($kgm_ms_waitqsetp->wqs_setlinks) set $kgm_ms_wql = (WaitQueueLink *)$kgm_ms_setlinksp->next set $kgm_ms_found = 0 @@ -871,15 +1270,15 @@ define showwaitqmembers end define showwaitqheader - printf "wait_queue prepostq interlock " + printf "wait_queue ref_count interlock " printf "pol type member_cnt waiter_cnt\n" end define showwaitqint - set $kgm_waitqp = (WaitQueue *)$arg0 + set $kgm_waitqp = (WaitQueue*)$arg0 printf "0x%08x ", $kgm_waitqp if ($kgm_waitqp->wq_type == 0xf1d1) - printf "0x%08x ", &((struct wait_queue_set *)$kgm_waitqp)->wqs_preposts + printf "0x%08x ", ((WaitQueueSet*)$kgm_waitqp)->wqs_refcount else printf "0x00000000 " end @@ -1138,20 +1537,29 @@ define showipcheader showptrhdrpad printf " table_next" showptrhdrpad - printf " flags tsize splaytree splaybase\n" + printf " flags ports splaysize " + showptrhdrpad + printf "splaybase\n" end define showipceheader - printf " name object " + printf " " showptrhdrpad - printf " rite urefs destname destination\n" + printf "object " + showptrhdrpad + showptrhdrpad + printf "name rite urefs destname " + showptrhdrpad + printf "destination\n" end define showipceint set $kgm_ie = *(ipc_entry_t)$arg0 - printf " 0x%08x ", $arg1 + printf " " + showptrhdrpad showptr $kgm_ie.ie_object - printf " " + showptrhdrpad + printf " 0x%08x ", $arg1 if $kgm_ie.ie_bits & 0x00100000 printf "Dead " printf "%5d\n", $kgm_ie.ie_bits & 0xffff @@ -1175,10 +1583,29 @@ define showipceint printf " O" end if $kgm_ie.index.request - printf "n" + set $kgm_port = (ipc_port_t)$kgm_ie.ie_object + set $kgm_requests = $kgm_port->ip_requests + set $kgm_req_soright = $kgm_requests[$kgm_ie.index.request].notify.port + if $kgm_req_soright +# Armed send-possible notification? 
+ if (uintptr_t)$kgm_req_soright & 0x1 + printf "s" + else +# Delayed send-possible notification? + if (uintptr_t)$kgm_req_soright & 0x2 + printf "d" + else +# Dead-name notification + printf "n" + end + end + else + printf " " + end else printf " " end +# Collision (with tree)? if $kgm_ie.ie_bits & 0x00800000 printf "c" else @@ -1214,8 +1641,9 @@ define showipcint else printf " " end - printf "%5d ", $kgm_is.is_table_size - printf "0x%08x ", $kgm_is.is_tree_total + printf "%5d ", $kgm_is.is_table_size + $kgm_is.is_tree_total + showptr $kgm_is.is_tree_total + printf " " showptr &$kgm_isp->is_tree printf "\n" if $arg1 != 0 @@ -1228,12 +1656,12 @@ define showipcint if $kgm_ie.ie_bits & 0x001f0000 set $kgm_name = (($kgm_iindex << 8)|($kgm_ie.ie_bits >> 24)) showipceint $kgm_iep $kgm_name - if $arg2 != 0 && ipc_portbt != 0 - if $kgm_ie.ie_object != 0 && ($kgm_ie.ie_bits & 0x00070000) && ((ipc_port_t) $kgm_ie.ie_object)->ip_callstack[0] != 0 - printf " user bt: " - showportbt $kgm_ie.ie_object $kgm_is.is_task - end - end + if $arg2 != 0 + if $kgm_ie.ie_object != 0 && ($kgm_ie.ie_bits & 0x00070000) && ((ipc_port_t) $kgm_ie.ie_object)->ip_callstack[0] != 0 + printf " user bt: " + showportbt $kgm_ie.ie_object $kgm_is.is_task + end + end end set $kgm_iindex = $kgm_iindex + 1 set $kgm_iep = &($kgm_is.is_table[$kgm_iindex]) @@ -1270,8 +1698,8 @@ end define showtaskipc set $kgm_taskp = (task_t)$arg0 showtaskheader - showipcheader showtaskint $kgm_taskp + showipcheader showipcint $kgm_taskp->itk_space 0 0 end document showtaskipc @@ -1283,8 +1711,8 @@ end define showtaskrights set $kgm_taskp = (task_t)$arg0 showtaskheader - showipcheader showtaskint $kgm_taskp + showipcheader showipcint $kgm_taskp->itk_space 1 0 end document showtaskrights @@ -1295,8 +1723,8 @@ end define showtaskrightsbt set $kgm_taskp = (task_t)$arg0 showtaskheader - showipcheader showtaskint $kgm_taskp + showipcheader showipcint $kgm_taskp->itk_space 1 1 end document showtaskrightsbt @@ -1309,8 +1737,8 @@ 
define showallipc set $kgm_cur_taskp = (struct task *)($kgm_head_taskp->next) while $kgm_cur_taskp != $kgm_head_taskp showtaskheader - showipcheader showtaskint $kgm_cur_taskp + showipcheader showipcint $kgm_cur_taskp->itk_space 0 0 set $kgm_cur_taskp = (struct task *)($kgm_cur_taskp->tasks.next) end @@ -1320,14 +1748,49 @@ Syntax: (gdb) showallipc | Routine to print a summary listing of all the ipc spaces end +define showipcsumheader + printf "task " + showptrhdrpad + printf " pid " + printf " #acts " + printf " tsize " + printf "command\n" +end + +define showipcsummaryint + set $kgm_taskp = (struct task *)$arg0 + showptr $arg0 + printf "%7d", ((struct proc *)$kgm_taskp->bsd_info)->p_pid + printf "%15d", $kgm_taskp->thread_count + printf "%15d", $kgm_cur_taskp->itk_space.is_table_size + printf " %s\n", ((struct proc *)$kgm_taskp->bsd_info)->p_comm +end + +define showipcsummary + showipcsumheader + set $kgm_head_taskp = &tasks + set $kgm_cur_taskp = (struct task *)($kgm_head_taskp->next) + while $kgm_cur_taskp != $kgm_head_taskp + showipcsummaryint $kgm_cur_taskp + set $kgm_cur_taskp = (struct task *)($kgm_cur_taskp->tasks.next) + end +end + +document showipcsummary +Syntax: (gdb) showipcsummary +| Summarizes the IPC state of all tasks. This is a convenient way to dump +| some basic clues about IPC messaging. You can use the output to determine +| tasks that are candidates for further investigation. 
+end + define showallrights set $kgm_head_taskp = &tasks set $kgm_cur_taskp = (struct task *)($kgm_head_taskp->next) while $kgm_cur_taskp != $kgm_head_taskp showtaskheader - showipcheader showtaskint $kgm_cur_taskp + showipcheader showipcint $kgm_cur_taskp->itk_space 1 0 set $kgm_cur_taskp = (struct task *)($kgm_cur_taskp->tasks.next) end @@ -1353,8 +1816,8 @@ end define showtaskvme set $kgm_taskp = (task_t)$arg0 showtaskheader - showmapheader showtaskint $kgm_taskp + showmapheader showvmint $kgm_taskp->map 1 end document showtaskvme @@ -1431,6 +1894,28 @@ Syntax: (gdb) showtaskstacks | Routine to print out the stack for each thread in a task. end +define showqueue_elems + set $queue_head = (struct queue_entry *)($arg0) + set $queue = (struct queue_entry *)($queue_head->next) + while $queue != $queue_head + showptr $queue + printf " " + set $thread = (struct thread *)$queue + set $task = (struct task *)$thread->task + set $bsd = (struct proc *)$task->bsd_info + set $guy = (char *)$bsd->p_comm + showptr $thread + printf " " + showptr $task + printf " " + showptr $bsd + printf " " + showptr $guy + #printf " %s\n", $kgm_procp->p_comm + printf "\n" + set $queue = (struct queue_entry *)($queue->next) + end +end define showalltasks showtaskheader @@ -1453,9 +1938,9 @@ Syntax: (gdb) showalltasks end define showprocheader - printf " pid process io_policy wq_state" + printf " pid process " showptrhdrpad - printf " command\n" + printf "io_policy wq_state command\n" end define showprocint @@ -1469,17 +1954,36 @@ define showprocint else printf " " end - if ($kgm_procp->p_iopol_disk == 1) - printf "NORM " - set $kgm_printed = 1 + set $ptask = (struct task *)$kgm_procp->task + set $diskpolicy = 0 + if ($ptask->ext_actionstate.hw_disk != 0) + set $diskpolicy = $ptask->ext_actionstate.hw_disk + else + if ($ptask->actionstate.hw_disk != 0) + set $diskpolicy = $ptask->actionstate.hw_disk + end + end + if ($ptask->ext_actionstate.hw_bg != 0) + set $diskpolicy = 5 end - if 
($kgm_procp->p_iopol_disk == 2) - printf "PASS " - set $kgm_printed = 1 - end - if ($kgm_procp->p_iopol_disk == 3) - printf "THROT " - set $kgm_printed = 1 + if ($ptask->actionstate.hw_bg != 0) + set $diskpolicy = 4 + end + if ($diskpolicy == 2) + printf "PASS " + set $kgm_printed = 1 + end + if ($diskpolicy == 3) + printf "THROT " + set $kgm_printed = 1 + end + if ($diskpolicy == 4) + printf "BG_THRT " + set $kgm_printed = 1 + end + if ($diskpolicy == 5) + printf "EBG_THRT" + set $kgm_printed = 1 end if ($kgm_printed == 0) printf " " @@ -1537,29 +2041,95 @@ document kdb end define showpsetheader - printf "portset waitqueue recvname " - printf "flags refs recvname process\n" + printf "portset " + showptrhdrpad + printf "waitqueue " + showptrhdrpad + showptrhdrpad + printf "recvname flags refs recvname " + showptrhdrpad + printf "process\n" end define showportheader - printf "port mqueue recvname " - printf "flags refs recvname process\n" + printf "port " + showptrhdrpad + printf "mqueue " + showptrhdrpad + showptrhdrpad + printf "recvname flags refs recvname " + showptrhdrpad + printf "dest\n" end define showportmemberheader - printf "members port recvname " - printf "flags refs mqueue msgcount\n" + printf "members " + showptrhdrpad + printf "port " + showptrhdrpad + showptrhdrpad + printf "recvname " + printf "flags refs mqueue " + showptrhdrpad + printf "msgcount\n" end define showkmsgheader - printf "messages kmsg size " - printf "disp msgid remote-port local-port\n" + printf "dest-port " + showptrhdrpad + printf "kmsg " + showptrhdrpad + showptrhdrpad + printf "msgid " + printf "disp size " + printf "reply-port " + showptrhdrpad + printf "source\n" +end + +define showkmsgsrcint + set $kgm_kmsgsrchp = ((ipc_kmsg_t)$arg0)->ikm_header +# set $kgm_kmsgsrctp = (mach_msg_audit_trailer_t *)((uintptr_t)$kgm_kmsgsrchp + $kgm_kmsgsrchp->msgh_size) +# set $kgm_kmsgpid = $kgm_kmsgsrctp->msgh_audit.val[5] + set $kgm_kmsgpid = (pid_t)((uint *)((uintptr_t)$kgm_kmsgsrchp + 
$kgm_kmsgsrchp->msgh_size))[10] +# compare against a well-known or cached value as this may be slow + if ($kgm_kmsgpid == 0) + set $kgm_kmsgsrcpid = (pid_t)0 + set $kgm_kmsgsrcprocp = (struct proc *)kernel_task->bsd_info + else + if ($kgm_kmsgpid != $kgm_kmsgsrcpid) + set $kgm_kmsgsrchead_taskp = &tasks + set $kgm_kmsgsrctaskp = (struct task *)($kgm_kmsgsrchead_taskp->next) + while $kgm_kmsgsrctaskp != $kgm_kmsgsrchead_taskp + set $kgm_kmsgsrcprocp = (struct proc *)$kgm_kmsgsrctaskp->bsd_info + set $kgm_kmsgsrcpid = $kgm_kmsgsrcprocp->p_pid + if (($kgm_kmsgsrcprocp != 0) && ($kgm_kmsgsrcprocp->p_pid == $kgm_kmsgpid)) + set $kgm_kmsgsrctaskp = $kgm_kmsgsrchead_taskp + else + set $kgm_kmsgsrctaskp = (struct task *)($kgm_kmsgsrctaskp->tasks.next) + end + end + end + end + if ($kgm_kmsgsrcprocp->p_pid == $kgm_kmsgpid) + printf "%s(%d)\n", $kgm_kmsgsrcprocp->p_comm, $kgm_kmsgpid + else + printf "unknown(%d)\n", $kgm_kmsgpid + end end define showkmsgint - printf " 0x%08x ", $arg0 - set $kgm_kmsgh = ((ipc_kmsg_t)$arg0)->ikm_header - printf "0x%08x ", $kgm_kmsgh.msgh_size + set $kgm_kmsghp = ((ipc_kmsg_t)$arg0)->ikm_header + set $kgm_kmsgh = *$kgm_kmsghp + if ($arg1 != 0) + printf " " + showptrhdrpad + else + showptr $kgm_kmsgh.msgh_remote_port + end + showptr $arg0 + showptrhdrpad + printf " 0x%08x ", $kgm_kmsgh.msgh_id if (($kgm_kmsgh.msgh_bits & 0xff) == 19) printf "rC" else @@ -1575,12 +2145,16 @@ define showkmsgint else printf "s" end - printf "%5d ", $kgm_kmsgh.msgh_id - printf "0x%08x ", $kgm_kmsgh.msgh_remote_port - printf "0x%08x\n", $kgm_kmsgh.msgh_local_port + printf "%5d ", $kgm_kmsgh.msgh_size + showptr $kgm_kmsgh.msgh_local_port + printf " " + set $kgm_kmsgsrcpid = (pid_t)0 + showkmsgsrcint $arg0 end - +define showkmsg + showkmsgint $arg0 0 +end define showkobject set $kgm_portp = (struct ipc_port *)$arg0 @@ -1715,7 +2289,7 @@ end define showportdest set $kgm_portp = (struct ipc_port *)$arg0 set $kgm_spacep = $kgm_portp->data.receiver - if ($kgm_spacep == 
ipc_space_kernel) + if ((uintptr_t)$kgm_spacep == (uintptr_t)ipc_space_kernel) showkobject $kgm_portp else if ($kgm_portp->ip_object.io_bits & 0x80000000) @@ -1730,9 +2304,12 @@ define showportdest end define showportmember - printf " 0x%08x ", $arg0 + printf " " + showptrhdrpad + showptr $arg0 + showptrhdrpad set $kgm_portp = (struct ipc_port *)$arg0 - printf "0x%08x ", $kgm_portp->ip_messages.data.port.receiver_name + printf " 0x%08x ", $kgm_portp->ip_messages.data.port.receiver_name if ($kgm_portp->ip_object.io_bits & 0x80000000) printf "A" else @@ -1740,8 +2317,8 @@ define showportmember end printf "Port" printf "%5d ", $kgm_portp->ip_object.io_references - printf "0x%08x ", &($kgm_portp->ip_messages) - printf "0x%08x\n", $kgm_portp->ip_messages.data.port.msgcount + showptr &($kgm_portp->ip_messages) + printf " 0x%08x\n", $kgm_portp->ip_messages.data.port.msgcount end define showportbt @@ -1764,10 +2341,12 @@ define showportbt end define showportint - printf "0x%08x ", $arg0 + showptr $arg0 + printf " " set $kgm_portp = (struct ipc_port *)$arg0 - printf "0x%08x ", &($kgm_portp->ip_messages) - printf "0x%08x ", $kgm_portp->ip_messages.data.port.receiver_name + showptr &($kgm_portp->ip_messages) + showptrhdrpad + printf " 0x%08x ", $kgm_portp->ip_messages.data.port.receiver_name if ($kgm_portp->ip_object.io_bits & 0x80000000) printf "A" else @@ -1780,21 +2359,23 @@ define showportint set $kgm_kmsgp = (ipc_kmsg_t)$kgm_portp->ip_messages.data.port.messages.ikmq_base if $arg1 && $kgm_kmsgp showkmsgheader - showkmsgint $kgm_kmsgp + showkmsgint $kgm_kmsgp 1 set $kgm_kmsgheadp = $kgm_kmsgp set $kgm_kmsgp = $kgm_kmsgp->ikm_next while $kgm_kmsgp != $kgm_kmsgheadp - showkmsgint $kgm_kmsgp + showkmsgint $kgm_kmsgp 1 set $kgm_kmsgp = $kgm_kmsgp->ikm_next end end end define showpsetint - printf "0x%08x ", $arg0 + showptr $arg0 + printf " " set $kgm_psetp = (struct ipc_pset *)$arg0 - printf "0x%08x ", &($kgm_psetp->ips_messages) - printf "0x%08x ", 
$kgm_psetp->ips_messages.data.pset.local_name + showptr &($kgm_psetp->ips_messages) + showptrhdrpad + printf " 0x%08x ", $kgm_psetp->ips_messages.data.pset.local_name if ($kgm_psetp->ips_object.io_bits & 0x80000000) printf "A" else @@ -1802,12 +2383,13 @@ define showpsetint end printf "Set " printf "%5d ", $kgm_psetp->ips_object.io_references - printf "0x%08x ", $kgm_psetp->ips_messages.data.pset.local_name + showptr $kgm_psetp->ips_messages.data.pset.local_name + printf " " set $kgm_setlinksp = &($kgm_psetp->ips_messages.data.pset.set_queue.wqs_setlinks) set $kgm_wql = (WaitQueueLink *)$kgm_setlinksp->next set $kgm_found = 0 while ( (queue_entry_t)$kgm_wql != (queue_entry_t)$kgm_setlinksp) - set $kgm_portp = (struct ipc_port *)((uintptr_t)$kgm_wql->wql_element.wqe_queue - $kgm_portoff) + set $kgm_portp = (struct ipc_port *)((uintptr_t)($kgm_wql->wql_element->wqe_queue) - (uintptr_t)$kgm_portoff) if !$kgm_found set $kgm_destspacep = (struct ipc_space *)0 showportdestproc $kgm_portp @@ -1823,6 +2405,7 @@ define showpsetint end define showpset + set $kgm_portoff = &(((struct ipc_port *)0)->ip_messages) showpsetheader showpsetint $arg0 1 end @@ -1833,8 +2416,9 @@ define showport end define showipcobject - set $kgm_object = (ipc_object_t)$arg0 + set $kgm_objectp = (ipc_object_t)$arg0 if ($kgm_objectp->io_bits & 0x7fff0000) + set $kgm_portoff = &(((struct ipc_port *)0)->ip_messages) showpset $kgm_objectp else showport $kgm_objectp @@ -1843,17 +2427,17 @@ end define showmqueue set $kgm_mqueue = *(struct ipc_mqueue *)$arg0 - set $kgm_psetoff = (uintptr_t)&(((struct ipc_pset *)0)->ips_messages) - set $kgm_portoff = (uintptr_t)&(((struct ipc_port *)0)->ip_messages) if ($kgm_mqueue.data.pset.set_queue.wqs_wait_queue.wq_type == 0xf1d1) - set $kgm_psetp = (struct ipc_pset *)(((uintptr_t)$arg0) - $kgm_psetoff) + set $kgm_psetoff = &(((struct ipc_pset *)0)->ips_messages) + set $kgm_pset = (((long)$arg0) - ((long)$kgm_psetoff)) showpsetheader - showpsetint $kgm_psetp 1 + 
showpsetint $kgm_pset 1 end if ($kgm_mqueue.data.pset.set_queue.wqs_wait_queue.wq_type == 0xf1d0) - set $kgm_portp = (struct ipc_port *)(((uintptr_t)$arg0) - $kgm_portoff) + set $kgm_portoff = &(((struct ipc_port *)0)->ip_messages) + set $kgm_port = (((long)$arg0) - ((long)$kgm_portoff)) showportheader - showportint $kgm_portp 1 + showportint $kgm_port 1 end end @@ -1866,6 +2450,8 @@ define zprint_one printf "%8x ",$kgm_zone->max_size printf "%6d ",$kgm_zone->elem_size printf "%8x ",$kgm_zone->alloc_size + printf " %8d ",$kgm_zone->num_allocs + printf "%8d ",$kgm_zone->num_frees printf "%s ",$kgm_zone->zone_name if ($kgm_zone->exhaustible) @@ -1878,7 +2464,7 @@ define zprint_one printf "X" end if ($kgm_zone->noencrypt) - printf "$" + printf "$" end printf "\n" end @@ -1887,7 +2473,7 @@ end define zprint printf "ZONE " showptrhdrpad - printf " COUNT TOT_SZ MAX_SZ ELT_SZ ALLOC_SZ NAME\n" + printf " COUNT TOT_SZ MAX_SZ ELT_SZ ALLOC_SZ TOT_ALLOC TOT_FREE NAME\n" set $kgm_zone_ptr = (struct zone *)first_zone while ($kgm_zone_ptr != 0) zprint_one $kgm_zone_ptr @@ -1965,6 +2551,7 @@ Syntax: (gdb) showallrwlck end set $kdp_act_counter = 0 +set $kdp_arm_act_counter = 0 set $r0_save = 0 set $r1_save = 0 @@ -1998,115 +2585,118 @@ define switchtoact output/a (unsigned) $newact.continuation echo \n else + if ($kgm_mtype == $kgm_mtype_ppc) + if ($kdp_act_counter == 0) + set $kdpstate = (struct savearea *) kdp.saved_state + end + set $kdp_act_counter = $kdp_act_counter + 1 + set (struct savearea *) kdp.saved_state=$newact->machine->pcb + flushregs + flushstack + set $pc=$newact->machine->pcb.save_srr0 + update + end + if ($kgm_mtype == $kgm_mtype_i386) + set $kdpstatep = (struct x86_saved_state32 *) kdp.saved_state + if ($kdp_act_counter == 0) + set $kdpstate = *($kdpstatep) + end + set $kdp_act_counter = $kdp_act_counter + 1 + + set $kgm_statep = (struct x86_kernel_state *) \ + ($newact->kernel_stack + kernel_stack_size \ + - sizeof(struct x86_kernel_state)) + set 
$kdpstatep->ebx = $kgm_statep->k_ebx + set $kdpstatep->ebp = $kgm_statep->k_ebp + set $kdpstatep->edi = $kgm_statep->k_edi + set $kdpstatep->esi = $kgm_statep->k_esi + set $kdpstatep->eip = $kgm_statep->k_eip + flushregs + flushstack + set $pc = $kgm_statep->k_eip + update + end + if ($kgm_mtype == $kgm_mtype_x86_64) + set $kdpstatep = (struct x86_saved_state64 *) kdp.saved_state + if ($kdp_act_counter == 0) + set $kdpstate = *($kdpstatep) + end + set $kdp_act_counter = $kdp_act_counter + 1 + + set $kgm_statep = (struct x86_kernel_state *) \ + ($newact->kernel_stack + kernel_stack_size \ + - sizeof(struct x86_kernel_state)) + set $kdpstatep->rbx = $kgm_statep->k_rbx + set $kdpstatep->rbp = $kgm_statep->k_rbp + set $kdpstatep->r12 = $kgm_statep->k_r12 + set $kdpstatep->r13 = $kgm_statep->k_r13 + set $kdpstatep->r14 = $kgm_statep->k_r14 + set $kdpstatep->r15 = $kgm_statep->k_r15 + set $kdpstatep->isf.rsp = $kgm_statep->k_rsp + flushregs + flushstack + set $pc = $kgm_statep->k_rip + update + end + if ($kgm_mtype == $kgm_mtype_arm) + set $kdp_arm_act_counter = $kdp_arm_act_counter + 1 + if ($kdp_arm_act_counter == 1) + set $r0_save = $r0 + set $r1_save = $r1 + set $r2_save = $r2 + set $r3_save = $r3 + set $r4_save = $r4 + set $r5_save = $r5 + set $r6_save = $r6 + set $r7_save = $r7 + set $r8_save = $r8 + set $r9_save = $r9 + set $r10_save = $r10 + set $r11_save = $r11 + set $r12_save = $r12 + set $sp_save = $sp + set $lr_save = $lr + set $pc_save = $pc + end + set $pc_ctx = load_reg+8 + set $kgm_statep = (struct arm_saved_state *)((struct thread*)$arg0)->machine.kstackptr + set $r0 = $kgm_statep->r[0] + set $r1 = $kgm_statep->r[1] + set $r2 = $kgm_statep->r[2] + set $r3 = $kgm_statep->r[3] + set $r4 = $kgm_statep->r[4] + set $r5 = $kgm_statep->r[5] + set $r6 = $kgm_statep->r[6] + set $r8 = $kgm_statep->r[8] + set $r9 = $kgm_statep->r[9] + set $r10 = $kgm_statep->r[10] + set $r11 = $kgm_statep->r[11] + set $r12 = $kgm_statep->r[12] + set $sp = $kgm_statep->sp + set $lr 
= $kgm_statep->lr + set $pc = $pc_ctx + set $r7 = $kgm_statep->r[7] + flushregs + flushstack + end + end + showcontext_int +end + +document switchtoact +Syntax: switchtoact
+| This command allows gdb to examine the execution context and call +| stack for the specified activation. For example, to view the backtrace +| for an activation issue "switchtoact
", followed by "bt". +| Before resuming execution, issue a "resetctx" command, to +| return to the original execution context. +end + +define switchtoctx + select 0 if ($kgm_mtype == $kgm_mtype_ppc) if ($kdp_act_counter == 0) - set $kdpstate = (struct savearea *) kdp.saved_state - end - set $kdp_act_counter = $kdp_act_counter + 1 - set (struct savearea *) kdp.saved_state=$newact->machine->pcb - flushregs - flushstack - set $pc=$newact->machine->pcb.save_srr0 - update - end - if ($kgm_mtype == $kgm_mtype_i386) - set $kdpstatep = (struct x86_saved_state32 *) kdp.saved_state - if ($kdp_act_counter == 0) - set $kdpstate = *($kdpstatep) - end - set $kdp_act_counter = $kdp_act_counter + 1 - - set $kgm_statep = (struct x86_kernel_state *) \ - ($newact->kernel_stack + kernel_stack_size \ - - sizeof(struct x86_kernel_state)) - set $kdpstatep->ebx = $kgm_statep->k_ebx - set $kdpstatep->ebp = $kgm_statep->k_ebp - set $kdpstatep->edi = $kgm_statep->k_edi - set $kdpstatep->esi = $kgm_statep->k_esi - set $kdpstatep->eip = $kgm_statep->k_eip - flushregs - flushstack - set $pc = $kgm_statep->k_eip - update - end - if ($kgm_mtype == $kgm_mtype_x86_64) - set $kdpstatep = (struct x86_saved_state64 *) kdp.saved_state - if ($kdp_act_counter == 0) - set $kdpstate = *($kdpstatep) - end - set $kdp_act_counter = $kdp_act_counter + 1 - - set $kgm_statep = (struct x86_kernel_state *) \ - ($newact->kernel_stack + kernel_stack_size \ - - sizeof(struct x86_kernel_state)) - set $kdpstatep->rbx = $kgm_statep->k_rbx - set $kdpstatep->rbp = $kgm_statep->k_rbp - set $kdpstatep->r12 = $kgm_statep->k_r12 - set $kdpstatep->r13 = $kgm_statep->k_r13 - set $kdpstatep->r14 = $kgm_statep->k_r14 - set $kdpstatep->r15 = $kgm_statep->k_r15 - set $kdpstatep->isf.rsp = $kgm_statep->k_rsp - flushregs - flushstack - set $pc = $kgm_statep->k_rip - update - end - if ($kgm_mtype == $kgm_mtype_arm) - set $r0_save = $r0 - set $r1_save = $r1 - set $r2_save = $r2 - set $r3_save = $r3 - set $r4_save = $r4 - set $r5_save = 
$r5 - set $r6_save = $r6 - set $r7_save = $r7 - set $r8_save = $r8 - set $r9_save = $r9 - set $r10_save = $r10 - set $r11_save = $r11 - set $r12_save = $r12 - set $sp_save = $sp - set $lr_save = $lr - set $pc_save = $pc - set $pc_ctx = load_reg+8 - set $kgm_statep = (struct arm_saved_state *)((struct thread*)$arg0)->machine.kstackptr - set $r0 = $kgm_statep->r[0] - set $r1 = $kgm_statep->r[1] - set $r2 = $kgm_statep->r[2] - set $r3 = $kgm_statep->r[3] - set $r4 = $kgm_statep->r[4] - set $r5 = $kgm_statep->r[5] - set $r6 = $kgm_statep->r[6] - set $r8 = $kgm_statep->r[8] - set $r9 = $kgm_statep->r[9] - set $r10 = $kgm_statep->r[10] - set $r11 = $kgm_statep->r[11] - set $r12 = $kgm_statep->r[12] - set $sp = $kgm_statep->sp - set $lr = $kgm_statep->lr - set $pc = $pc_ctx - set $r7 = $kgm_statep->r[7] - flushregs - flushstack - end - end - showcontext_int -end - -document switchtoact -Syntax: switchtoact
-| This command allows gdb to examine the execution context and call -| stack for the specified activation. For example, to view the backtrace -| for an activation issue "switchtoact
", followed by "bt". -| Before resuming execution, issue a "resetctx" command, to -| return to the original execution context. -end - -define switchtoctx - select 0 - if ($kgm_mtype == $kgm_mtype_ppc) - if ($kdp_act_counter == 0) - set $kdpstate = (struct savearea *) kdp.saved_state + set $kdpstate = (struct savearea *) kdp.saved_state end set $kdp_act_counter = $kdp_act_counter + 1 set (struct savearea *) kdp.saved_state=(struct savearea *) $arg0 @@ -2116,24 +2706,26 @@ define switchtoctx update else if ($kgm_mtype == $kgm_mtype_arm) - set arm disassembler std - select-frame 0 - set $r0_save = $r0 - set $r1_save = $r1 - set $r2_save = $r2 - set $r3_save = $r3 - set $r4_save = $r4 - set $r5_save = $r5 - set $r6_save = $r6 - set $r7_save = $r7 - set $r8_save = $r8 - set $r9_save = $r9 - set $r10_save = $r10 - set $r11_save = $r11 - set $r12_save = $r12 - set $sp_save = $sp - set $lr_save = $lr - set $pc_save = $pc + select 0 + set $kdp_arm_act_counter = $kdp_arm_act_counter + 1 + if ($kdp_arm_act_counter == 1) + set $r0_save = $r0 + set $r1_save = $r1 + set $r2_save = $r2 + set $r3_save = $r3 + set $r4_save = $r4 + set $r5_save = $r5 + set $r6_save = $r6 + set $r7_save = $r7 + set $r8_save = $r8 + set $r9_save = $r9 + set $r10_save = $r10 + set $r11_save = $r11 + set $r12_save = $r12 + set $sp_save = $sp + set $lr_save = $lr + set $pc_save = $pc + end set $kgm_statep = (struct arm_saved_state *)$arg0 set $r0 = $kgm_statep->r[0] set $r1 = $kgm_statep->r[1] @@ -2170,33 +2762,36 @@ end define resetctx select 0 if ($kdp_act_counter != 0) - if ($kgm_mtype == $kgm_mtype_ppc) - set (struct savearea *)kdp.saved_state=$kdpstate - flushregs - flushstack - set $pc=((struct savearea *) kdp.saved_state)->save_srr0 - update - set $kdp_act_counter = 0 - end - if ($kgm_mtype == $kgm_mtype_i386) - set $kdpstatep = (struct x86_saved_state32 *) kdp.saved_state - set *($kdpstatep)=$kdpstate - flushregs - flushstack - set $pc=$kdpstatep->eip - update - set $kdp_act_counter = 0 - end - 
if ($kgm_mtype == $kgm_mtype_x86_64) - set $kdpstatep = (struct x86_saved_state64 *) kdp.saved_state - set *($kdpstatep)=$kdpstate - flushregs - flushstack - set $pc=$kdpstatep->isf.rip - update - set $kdp_act_counter = 0 - end - if ($kgm_mtype == $kgm_mtype_arm) + if ($kgm_mtype == $kgm_mtype_ppc) + set (struct savearea *)kdp.saved_state=$kdpstate + flushregs + flushstack + set $pc=((struct savearea *) kdp.saved_state)->save_srr0 + update + set $kdp_act_counter = 0 + end + if ($kgm_mtype == $kgm_mtype_i386) + set $kdpstatep = (struct x86_saved_state32 *) kdp.saved_state + set *($kdpstatep)=$kdpstate + flushregs + flushstack + set $pc=$kdpstatep->eip + update + set $kdp_act_counter = 0 + end + if ($kgm_mtype == $kgm_mtype_x86_64) + set $kdpstatep = (struct x86_saved_state64 *) kdp.saved_state + set *($kdpstatep)=$kdpstate + flushregs + flushstack + set $pc=$kdpstatep->isf.rip + update + set $kdp_act_counter = 0 + end + showcontext_int + end + if ($kgm_mtype == $kgm_mtype_arm && $kdp_arm_act_counter != 0) + echo Restoring context\n set $r0 = $r0_save flushregs set $r1 = $r1_save @@ -2229,8 +2824,9 @@ define resetctx flushregs set $r7 = $r7_save flushregs - end - showcontext_int + flushstack + update + set $kdp_arm_act_counter = 0 end end @@ -2462,7 +3058,7 @@ define showx86backtrace _loadfrom ($kgm_tmp_frame+$kgm_ret_off) set $kgm_prev_pc = $kgm_loadval set $kgm_frameno = 1 - while $kgm_prev_frame != 0 + while ($kgm_prev_frame != 0) && ($kgm_prev_frame != 0x0000000800000008) printf "%d: Saved frame: 0x%016llx Saved PC: 0x%016llx\n", $kgm_frameno, $kgm_prev_frame, $kgm_prev_pc if (!(($kgm_x86_abi == 0xf) && ($kgm_mtype == $kgm_mtype_i386))) x/i $kgm_prev_pc @@ -2517,7 +3113,7 @@ define showuserstack else if (($kgm_mtype & $kgm_mtype_x86_mask) == $kgm_mtype_x86_any) set $newact = (struct thread *) $arg0 - set $newiss = (x86_saved_state_t *) ($newact->machine.pcb->iss) + set $newiss = (x86_saved_state_t *) ($newact->machine->iss) set $kgm_x86_abi = $newiss.flavor if 
($newiss.flavor == 0xf) set $checkpc = $newiss.uss.ss_64.isf.rip @@ -2534,19 +3130,47 @@ define showuserstack else set $kgm_cur_frame = $checkframe set $kgm_cur_pc = $checkpc - printf "You may now issue the showx86backtrace command to see the user space backtrace for this thread (" - showptr $arg0 - printf "); you can also examine memory locations in this address space (pmap " - showptr $newact->task->map->pmap - printf ") before issuing the backtrace. This two-step process is necessary to work around various bugs in x86 gdb, which cause it to stop memory evaluation on spurious memory read errors. Additionally, you may need to issue a set kdp_pmap = 0 command after the showx86backtrace completes, to resume reading from the kernel address space.\n" +# When have more than one argument is present, don't print usage + if ( $argc == 1 ) + printf "You may now issue the showx86backtrace command to see the user space backtrace for this thread (" + showptr $arg0 + printf "); you can also examine memory locations in this address space (pmap " + showptr $newact->task->map->pmap + printf ") before issuing the backtrace. This two-step process is necessary to work around various bugs in x86 gdb, which cause it to stop memory evaluation on spurious memory read errors. 
Additionally, you may need to issue a set kdp_pmap = 0 command after the showx86backtrace completes, to resume reading from the kernel address space.\n" + end set kdp_pmap = $newact->task->map->pmap _kgm_flush_loop _kgm_update_loop end + else + if ($kgm_mtype == $kgm_mtype_arm) + if (kdp->is_conn > 0) + set $kgm_threadp = (struct thread *)$arg0 + set $kgm_saved_pmap = kdp_pmap + showactheader + showactint $kgm_threadp 0 + set $kgm_thread_pmap = $kgm_threadp->task->map->pmap + set $kgm_thread_sp = $kgm_threadp.machine->PcbData.r[7] + set kdp_pmap = $kgm_thread_pmap + while ($kgm_thread_sp != 0) + set $link_register = *($kgm_thread_sp + 4) + showptrhdrpad + printf " " + showptr $kgm_thread_sp + printf " " + showptr $link_register + printf "\n" + set $kgm_thread_sp = *$kgm_thread_sp + end + set kdp_pmap = $kgm_saved_pmap + else + echo You must be connected via nanokdp to use this macro\n + end else echo showuserstack not supported on this architecture\n end end + end end document showuserstack Syntax: showuserstack
@@ -2560,6 +3184,216 @@ Syntax: showuserstack
|macro in some cases. end +define showtaskuserstacks + set $kgm_taskp = (struct task *)$arg0 + set $kgm_head_actp = &($kgm_taskp->threads) + set $kgm_actp = (struct thread *)($kgm_taskp->threads.next) + while $kgm_actp != $kgm_head_actp + printf "For thread " + showptr $kgm_actp + printf "\n" + showuserstack $kgm_actp quiet + if (($kgm_mtype & $kgm_mtype_x86_mask) == $kgm_mtype_x86_any) + showx86backtrace + end + set kdp_pmap=0 + set $kgm_actp = (struct thread *)($kgm_actp->task_threads.next) + printf "\n" + end + showuserlibraries $kgm_taskp +end +document showtaskuserstacks +Syntax: (gdb) showtaskuserstacks +| Print out the user stack for each thread in a task, followed by the user libraries. +end + + +define showuserregisters + set $kgm_threadp = (struct thread *)$arg0 + set $kgm_taskp = $kgm_threadp->task + if (($kgm_mtype & $kgm_mtype_x86_mask) == $kgm_mtype_x86_any) + set $newiss = (x86_saved_state_t *) ($kgm_threadp->machine.iss) + set $kgm_x86_abi = $newiss.flavor + if ($newiss.flavor == 0xf) + printf "X86 Thread State (64-bit):\n" + set $kgm_ss64 = $newiss.uss.ss_64 + + printf " rax: " + showuserptr $kgm_ss64.rax + printf " rbx: " + showuserptr $kgm_ss64.rbx + printf " rcx: " + showuserptr $kgm_ss64.rcx + printf " rdx: " + showuserptr $kgm_ss64.rdx + printf "\n" + + printf " rdi: " + showuserptr $kgm_ss64.rdi + printf " rsi: " + showuserptr $kgm_ss64.rsi + printf " rbp: " + showuserptr $kgm_ss64.rbp + printf " rsp: " + showuserptr $kgm_ss64.isf.rsp + printf "\n" + + printf " r8: " + showuserptr $kgm_ss64.r8 + printf " r9: " + showuserptr $kgm_ss64.r9 + printf " r10: " + showuserptr $kgm_ss64.r10 + printf " r11: " + showuserptr $kgm_ss64.r11 + printf "\n" + + printf " r12: " + showuserptr $kgm_ss64.r12 + printf " r13: " + showuserptr $kgm_ss64.r13 + printf " r14: " + showuserptr $kgm_ss64.r14 + printf " r15: " + showuserptr $kgm_ss64.r15 + printf "\n" + + printf " rip: " + showuserptr $kgm_ss64.isf.rip + printf " rfl: " + showuserptr $kgm_ss64.isf.rflags + 
printf " cr2: " + showuserptr $kgm_ss64.cr2 + printf "\n" + else + printf "X86 Thread State (32-bit):\n" + set $kgm_ss32 = $newiss.uss.ss_32 + + printf " eax: " + showuserptr $kgm_ss32.eax + printf " ebx: " + showuserptr $kgm_ss32.ebx + printf " ecx: " + showuserptr $kgm_ss32.ecx + printf " edx: " + showuserptr $kgm_ss32.edx + printf "\n" + + printf " edi: " + showuserptr $kgm_ss32.edi + printf " esi: " + showuserptr $kgm_ss32.esi + printf " ebp: " + showuserptr $kgm_ss32.ebp + printf " esp: " + showuserptr $kgm_ss32.uesp + printf "\n" + + printf " ss: " + showuserptr $kgm_ss32.ss + printf " efl: " + showuserptr $kgm_ss32.efl + printf " eip: " + showuserptr $kgm_ss32.eip + printf " cs: " + showuserptr $kgm_ss32.cs + printf "\n" + + printf " ds: " + showuserptr $kgm_ss32.ds + printf " es: " + showuserptr $kgm_ss32.es + printf " fs: " + showuserptr $kgm_ss32.fs + printf " gs: " + showuserptr $kgm_ss32.gs + printf "\n" + + printf " cr2: " + showuserptr $kgm_ss32.cr2 + printf "\n" + end + else + if ($kgm_mtype == $kgm_mtype_arm) + printf "ARM Thread State:\n" + set $kgm_pcb = (arm_saved_state_t *) ($kgm_threadp->machine.upcb) + + printf " r0: " + showuserptr $kgm_pcb.r[0] + printf " r1: " + showuserptr $kgm_pcb.r[1] + printf " r2: " + showuserptr $kgm_pcb.r[2] + printf " r3: " + showuserptr $kgm_pcb.r[3] + printf "\n" + + printf " r4: " + showuserptr $kgm_pcb.r[4] + printf " r5: " + showuserptr $kgm_pcb.r[5] + printf " r6: " + showuserptr $kgm_pcb.r[6] + printf " r7: " + showuserptr $kgm_pcb.r[7] + printf "\n" + + printf " r8: " + showuserptr $kgm_pcb.r[8] + printf " r9: " + showuserptr $kgm_pcb.r[9] + printf " r10: " + showuserptr $kgm_pcb.r[10] + printf " r11: " + showuserptr $kgm_pcb.r[11] + printf "\n" + + printf " ip: " + showuserptr $kgm_pcb.r[12] + printf " sp: " + showuserptr $kgm_pcb.sp + printf " lr: " + showuserptr $kgm_pcb.lr + printf " pc: " + showuserptr $kgm_pcb.pc + printf "\n" + + printf " cpsr: " + showuserptr $kgm_pcb.cpsr + printf "\n" + else + echo 
showuserregisters not supported on this architecture\n + end + end +end +document showuserregisters +Syntax: showuserregisters
+|This command displays the last known user register state +|for the thread. This may not be correct for cases where +|the thread is currently executing in userspace. However +|for threads that have entered the kernel (either explicitly +|with a system call or implicitly with a fault), it should +|be accurate +end + +define showtaskuserregisters + set $kgm_taskp = (struct task *)$arg0 + set $kgm_head_actp = &($kgm_taskp->threads) + set $kgm_actp = (struct thread *)($kgm_taskp->threads.next) + while $kgm_actp != $kgm_head_actp + printf "For thread " + showptr $kgm_actp + printf "\n" + showuserregisters $kgm_actp + set $kgm_actp = (struct thread *)($kgm_actp->task_threads.next) + printf "\n" + end +end +document showtaskuserregisters +Syntax: (gdb) showtaskuserregisters +| Print out the user registers for each thread in a task +end + define kdp-reboot # Alternatively, set *(*(unsigned **) 0x2498) = 1 # (or 0x5498 on PPC, 0xffffff8000002928 on x86_64, 0xffff049c on arm) @@ -2605,7 +3439,7 @@ define dumpinfoint set manual_pkt.input = 0 set manual_pkt.len = sizeof(kdp_dumpinfo_req_t) - set $kgm_pkt = (kdp_dumpinfo_req_t *)manual_pkt.data + set $kgm_pkt = (kdp_dumpinfo_req_t *)&manual_pkt.data set $kgm_pkt->hdr.request = KDP_DUMPINFO set $kgm_pkt->hdr.len = sizeof(kdp_dumpinfo_req_t) set $kgm_pkt->hdr.is_reply = 0 @@ -2697,7 +3531,7 @@ define getdumpinfo dumpinfoint KDP_DUMPINFO_GETINFO set $kgm_dumpinfo = (kdp_dumpinfo_reply_t *) manual_pkt.data if $kgm_dumpinfo->type & KDP_DUMPINFO_REBOOT - printf "System will reboot after kernel info gets dumped.\n" + printf "Sysem will reboot after kernel info gets dumped.\n" else printf "Sysem will not reboot after kernel info gets dumped.\n" end @@ -3257,6 +4091,28 @@ define showregdictionary end +define showorderedsetarrayint + set $kgm$arg0_array = (_Element *)$arg1 + set $kgm$arg0_count = $arg2 + + set $kgm$arg0_idx = 0 + while ($kgm$arg0_idx < $kgm$arg0_count) + set $kgm_obj = $kgm$arg0_array[$kgm$arg0_idx++] + showobjectint
_$arg0 $kgm_obj + if ($kgm$arg0_idx < $kgm$arg0_count) + printf "," + end + end +end + +define showorderedsetint + set $kgm_array = ((OSOrderedSet *)$arg1)->array + set $count = ((OSOrderedSet *)$arg1)->count + printf "[" + showorderedsetarrayint $arg0 $kgm_array $count + printf "]" +end + define showarraysetint set $kgm$arg0_array = (OSArray *)$arg1 @@ -3341,6 +4197,10 @@ define showobjectint showsetint _$arg0 $arg1 set $kgm_shown = 1 end + if ($kgm_vt == &_ZTV12OSOrderedSet) + showorderedsetint _$arg0 $arg1 + set $kgm_shown = 1 + end if ($kgm_shown != 1) if ($kgm_show_object_addrs == 0) @@ -3435,7 +4295,7 @@ define findregistryentryrecurse print $kgm_re end - # if we want to show everything, then don't populate $kgm_registry_entry + # don't populate $kgm_registry_entry if we want to show everything if !$kgm_findregistry_continue set $kgm_registry_entry = $kgm_re end @@ -3845,38 +4705,84 @@ Syntax: (gdb) showosobjecttracking | Set gOSObjectTrackThread to 1 or a thread_t to capture new OSObjects allocated by a thread or all threads. end +# $kgm_readphys_force_kdp and $kgm_readphys_force_physmap +# can respectively cause physical memory access to use +# a KDP manual packet or the physical memory mapping +# even if the default behavior would be otherwise. 
define readphysint - set $kgm_readphysint_result = 0xBAD10AD - # set up the manual KDP packet - set manual_pkt.input = 0 - set manual_pkt.len = sizeof(kdp_readphysmem64_req_t) - set $kgm_pkt = (kdp_readphysmem64_req_t *)&manual_pkt.data - set $kgm_pkt->hdr.request = KDP_READPHYSMEM64 - set $kgm_pkt->hdr.len = sizeof(kdp_readphysmem64_req_t) - set $kgm_pkt->hdr.is_reply = 0 - set $kgm_pkt->hdr.seq = 0 - set $kgm_pkt->hdr.key = 0 - set $kgm_pkt->address = (uint64_t)$arg0 - set $kgm_pkt->nbytes = $arg1 >> 3 - set $kgm_pkt->lcpu = $arg2 - set manual_pkt.input = 1 - # dummy to make sure manual packet is executed - set $kgm_dummy = &_mh_execute_header - set $kgm_pkt = (kdp_readphysmem64_reply_t *)&manual_pkt.data - if ($kgm_pkt->error == 0) - if $arg1 == 8 - set $kgm_readphysint_result = *((uint8_t *)$kgm_pkt->data) - end - if $arg1 == 16 - set $kgm_readphysint_result = *((uint16_t *)$kgm_pkt->data) - end - if $arg1 == 32 - set $kgm_readphysint_result = *((uint32_t *)$kgm_pkt->data) - end - if $arg1 == 64 - set $kgm_readphysint_result = *((uint64_t *)$kgm_pkt->data) - end - end + set $kgm_readphysint_result = 0xBAD10AD + + if ($kgm_readphys_force_kdp != 0) + set $kgm_readphys_use_kdp = 1 + else + if ($kgm_readphys_force_physmap) + set $kgm_readphys_use_kdp = 0 + else + set $kgm_readphys_use_kdp = ( kdp->is_conn > 0 ) + end + end + + if ($kgm_readphys_use_kdp) + + # set up the manual KDP packet + set manual_pkt.input = 0 + set manual_pkt.len = sizeof(kdp_readphysmem64_req_t) + set $kgm_pkt = (kdp_readphysmem64_req_t *)&manual_pkt.data + set $kgm_pkt->hdr.request = KDP_READPHYSMEM64 + set $kgm_pkt->hdr.len = sizeof(kdp_readphysmem64_req_t) + set $kgm_pkt->hdr.is_reply = 0 + set $kgm_pkt->hdr.seq = 0 + set $kgm_pkt->hdr.key = 0 + set $kgm_pkt->address = (uint64_t)$arg0 + set $kgm_pkt->nbytes = $arg1 >> 3 + set $kgm_pkt->lcpu = $arg2 + set manual_pkt.input = 1 + # dummy to make sure manual packet is executed + set $kgm_dummy = &_mh_execute_header + set $kgm_pkt = 
(kdp_readphysmem64_reply_t *)&manual_pkt.data + if ($kgm_pkt->error == 0) + if $arg1 == 8 + set $kgm_readphysint_result = *((uint8_t *)$kgm_pkt->data) + end + if $arg1 == 16 + set $kgm_readphysint_result = *((uint16_t *)$kgm_pkt->data) + end + if $arg1 == 32 + set $kgm_readphysint_result = *((uint32_t *)$kgm_pkt->data) + end + if $arg1 == 64 + set $kgm_readphysint_result = *((uint64_t *)$kgm_pkt->data) + end + end + + else + # No KDP. Attempt to use physical memory mapping + + if ($kgm_mtype == $kgm_mtype_x86_64) + set $kgm_readphys_paddr_in_kva = (unsigned long long)$arg0 + (((unsigned long long)-1 << 47) | ((unsigned long long)509 << 39)) + else + if ($kgm_mtype == $kgm_mtype_arm) + set $kgm_readphys_paddr_in_kva = (unsigned long long)$arg0 - gPhysBase + gVirtBase + else + printf "readphys not available for current architecture.\n" + set $kgm_readphys_paddr_in_kva = 0 + end + end + if $kgm_readphys_paddr_in_kva + if $arg1 == 8 + set $kgm_readphysint_result = *((uint8_t *)$kgm_readphys_paddr_in_kva) + end + if $arg1 == 16 + set $kgm_readphysint_result = *((uint16_t *)$kgm_readphys_paddr_in_kva) + end + if $arg1 == 32 + set $kgm_readphysint_result = *((uint32_t *)$kgm_readphys_paddr_in_kva) + end + if $arg1 == 64 + set $kgm_readphysint_result = *((uint64_t *)$kgm_readphys_paddr_in_kva) + end + end + end end define readphys8 @@ -3994,8 +4900,10 @@ document writephys64 end define addkextsyms - shell ls $arg0/* | xargs -n 1 echo add-symbol-file > /tmp/gdb-syms - source /tmp/gdb-syms + shell echo cd `pwd` > /tmp/gdb-cd + cd $arg0 + source kcbmacros + source /tmp/gdb-cd set $kgm_show_kmod_syms = 1 end @@ -4176,7 +5084,7 @@ define showprocinfo # decode part of credential set $kgm_spi_cred = $kgm_spi_proc->p_ucred if ($kgm_spi_cred != 0) - printf "Cred: euid %d ruid %d svuid %d\n", $kgm_spi_cred->cr_uid, $kgm_spi_cred->cr_ruid, $kgm_spi_cred->cr_svuid + printf "Cred: euid %d ruid %d svuid %d\n", $kgm_spi_cred->cr_posix.cr_uid, $kgm_spi_cred->cr_posix.cr_ruid, 
$kgm_spi_cred->cr_posix.cr_svuid else printf "Cred: (null)\n" end @@ -4366,7 +5274,138 @@ Syntax: (gdb) allproc | Routine to print out all process in the system | which are not in the zombie list end +define showprocsiblingint + set $kgm_sibling_ptr = (struct proc *)$arg0 + set $kgm_lx = $arg1 + while $kgm_lx + printf "| " + set $kgm_lx = $kgm_lx-3 + end + printf "|--%d %s [ 0x%llx ]\n", $kgm_sibling_ptr->p_pid, $kgm_sibling_ptr->p_comm, $kgm_sibling_ptr +end +define showproctreeint +#Initialize all the set variables used in this macro + set $kgm_basep1 = 0 + set $kgm_sibling_ptr = 0 + set $kgm_lx = 0 + set $kgm_tmp_base = 0 + set $kgm_head_ptr = 0 + set $kgm_search_pid = 0 + set $kgm_rev = 0 + set $kgm_x = 0 + + set $kgm_basep1 = (struct proc *)allproc->lh_first + if ($arg0 == 0) + set $kgm_head_ptr = (struct proc *)initproc + end + if ($arg0 > 0) + set $kgm_tmp_base = (struct proc *)allproc->lh_first + set $kgm_search_pid = $arg0 + while $kgm_tmp_base + if ( $kgm_tmp_base->p_pid == $kgm_search_pid) + if ($kgm_tmp_base->p_childrencnt > 0) + set $kgm_head_ptr = $kgm_tmp_base->p_children.lh_first + else + set $kgm_head_ptr = 0 + printf "No children present for PID=%d", $kgm_search_pid + end + loop_break + end + set $kgm_tmp_base = $kgm_tmp_base->p_list.le_next + end + end + set $kgm_rev = 0 + set $kgm_x = 0 + if ($kgm_head_ptr) + printf "PID PROCESS POINTER]\n" + printf "=== ======= =======\n" + printf "%d %s [ 0x%llx ]\n", $kgm_head_ptr->p_ppid, $kgm_head_ptr->p_pptr->p_comm, $kgm_head_ptr->p_pptr + printf "|--%d %s [ 0x%llx ]\n", $kgm_head_ptr->p_pid, $kgm_head_ptr->p_comm, $kgm_head_ptr + end + while ($kgm_head_ptr) + #Is childrencnt = 0? YES {=> no children} + if ($kgm_head_ptr->p_childrencnt == 0) + # Does it have sibling?
+ if($kgm_head_ptr->p_sibling.le_next == 0) + #No, it does not have sibling, so go back to its parent which will go to its sibling + if($kgm_head_ptr == $kgm_head_ptr->p_pptr) + loop_break + end + set $kgm_head_ptr = $kgm_head_ptr->p_pptr + if ($kgm_head_ptr == $kgm_tmp_base) + loop_break + end + if ($kgm_x > 3) + set $kgm_x = $kgm_x - 3 + end + set $kgm_rev = 1 + end + if($kgm_head_ptr->p_sibling.le_next != 0) + # Yes, it has sibling. So print sibling + set $kgm_rev = 0 + showprocsiblingint $kgm_head_ptr->p_sibling.le_next $kgm_x + set $kgm_head_ptr = $kgm_head_ptr->p_sibling.le_next + end + # childrencnt != 0 {=> it has children} + else + if ($kgm_rev == 1) + if($kgm_head_ptr->p_sibling.le_next == 0) + #No, it does not have sibling, so go back to its parent which will go to its sibling + if($kgm_head_ptr == $kgm_head_ptr->p_pptr) + loop_break + end + set $kgm_head_ptr = $kgm_head_ptr->p_pptr + if ($kgm_head_ptr == $kgm_tmp_base) + loop_break + end + if ($kgm_x > 3) + set $kgm_x = $kgm_x - 3 + end + set $kgm_rev = 1 + end + if($kgm_head_ptr->p_sibling.le_next != 0) + set $kgm_rev = 0 + # Yes, it has sibling. 
So print sibling + showprocsiblingint $kgm_head_ptr->p_sibling.le_next $kgm_x + set $kgm_head_ptr = $kgm_head_ptr->p_sibling.le_next + end + else + set $kgm_head_ptr = $kgm_head_ptr->p_children.lh_first + set $kgm_x = $kgm_x + 3 + set $kgm_lx = $kgm_x + while $kgm_lx + printf "| " + set $kgm_lx = $kgm_lx-3 + end + printf "|--%d %s [ 0x%llx ] \n", $kgm_head_ptr->p_pid, $kgm_head_ptr->p_comm, $kgm_head_ptr + end + end + end + printf "\n" +#Unset all the set variables used in this macro + set $kgm_basep1 = 0 + set $kgm_sibling_ptr = 0 + set $kgm_lx = 0 + set $kgm_tmp_base = 0 + set $kgm_head_ptr = 0 + set $kgm_search_pid = 0 + set $kgm_rev = 0 + set $kgm_x = 0 +end +define showproctree + if ($argc > 0) + showproctreeint $arg0 + else + showproctreeint 0 + end +end +document showproctree +Syntax: (gdb) showproctree +| Routine to print the processes in the system in a hierarchical tree form. This routine does not print zombie processes. +| If no argument is given, showproctree will print all the processes in the system. 
+| If pid is specified, showproctree prints all the descendants of the indicated process +end define print_vnode @@ -4644,9 +5683,13 @@ end define mbuf_buf2slab set $addr = $arg0 set $gix = ((char *)$addr - (char *)mbutl) >> 20 - set $ix = ((char *)$addr - (char *)mbutl) >> 11 + set $ix = ((char *)$addr - (char *)slabstbl[$gix].slg_slab[0].sl_base) >> 12 set $slab = &slabstbl[$gix].slg_slab[$ix] - printf "%p", $slab + if $kgm_lp64 + printf "0x%-16llx", $slab + else + printf "0x%-8x", $slab + end end document mbuf_buf2slab @@ -4655,11 +5698,15 @@ end define mbuf_buf2mca set $addr = $arg0 - set $ix = ((char *)$addr - (char *)mbutl) >> 11 - set $clbase = ((union mcluster *)(mbutl + $ix)) + set $ix = ((char *)$addr - (char *)mbutl) >> 12 + set $clbase = ((union mbigcluster *)mbutl) + $ix set $mclidx = (((char *)$addr - (char *)$clbase) >> 8) set $mca = mclaudit[$ix].cl_audit[$mclidx] - printf "mca: %p", $mca + if $kgm_lp64 + printf "mca: 0x%-16llx", $mca + else + printf "mca: 0x%-8x", $mca + end end document mbuf_buf2mca @@ -4677,11 +5724,11 @@ define mbuf_showmca mbuf_mca_ctype $mca 1 printf "\ncontrolling mcache:\t%p (%s)\n", $mca->mca_cache, $cp->mc_name if $mca->mca_uflags & $MB_SCVALID - set $ix = ((char *)$mca->mca_addr - (char *)mbutl) >> 11 - set $clbase = ((union mcluster *)(mbutl + $ix)) + set $ix = ((char *)$mca->mca_addr - (char *)mbutl) >> 12 + set $clbase = ((union mbigcluster *)mbutl) + $ix set $mclidx = (((char *)$mca->mca_addr - (char *)$clbase) >> 8) printf "mbuf obj:\t\t%p\n", $mca->mca_addr - printf "mbuf index:\t\t%d (out of 8) in cluster base %p\n", \ + printf "mbuf index:\t\t%d (out of 16) in cluster base %p\n", \ $mclidx + 1, $clbase if $mca->mca_uptr != 0 set $peer_mca = (mcache_audit_t *)$mca->mca_uptr @@ -4729,10 +5776,56 @@ Syntax: (gdb) mbuf_showmca | records including the stack trace of the last buffer transaction. 
end -set $MCF_NOCPUCACHE = 0x10 - -define mcache_stat - set $head = (mcache_t *)mcache_head +define mbuf_topleak + set language c + set $topcnt = 0 + if $arg0 < 5 + set $maxcnt = $arg0 + else + set $maxcnt = 5 + end + while $topcnt < $maxcnt + mbuf_traceleak mleak_top_trace[$topcnt] + set $topcnt = $topcnt + 1 + end + set language auto +end + +document mbuf_topleak +Syntax: (gdb) mbuf_topleak +| Prints information about the top suspected mbuf leakers +| where is a value <= 5 +end + +define mbuf_traceleak + set language c + set $trace = (struct mtrace *) $arg0 + if $trace->allocs != 0 + printf "%p:%d outstanding allocs\n", $trace, $trace->allocs + printf "backtrace saved %d deep:\n", $trace->depth + if $trace->depth != 0 + set $cnt = 0 + while $cnt < $trace->depth + printf "%4d: ", $cnt + 1 + pcprint $trace->addr[$cnt] + printf "\n" + set $cnt = $cnt + 1 + end + end + end + set language auto +end + +document mbuf_traceleak +Syntax: (gdb) mbuf_traceleak +| Given an mbuf leak trace (mtrace) structure address, print out the +| stored information with that trace +end + +set $MCF_NOCPUCACHE = 0x10 + +define mcache_stat + set $head = (mcache_t *)mcache_head set $mc = $head if $kgm_lp64 @@ -4868,16 +5961,26 @@ set $NSLABSPMB = sizeof(mcl_slabg_t)/sizeof(mcl_slab_t) define mbuf_slabstbl set $x = 0 - printf "slot addr slabs range\n" - printf "---- ---------- -----------------------\n" + if $kgm_lp64 + printf "slot slabg slabs range\n" + printf "---- ------------------ -------------------------------------------\n" + else + printf "slot slabg slabs range\n" + printf "---- ---------- ---------------------------\n" + end while $x < maxslabgrp set $slg = slabstbl[$x] printf "%3d: ", $x if $slg == 0 printf "-\n" else - printf "%p [%p-%p]\n", $slg, &$slg->slg_slab[0], \ - &$slg->slg_slab[$NSLABSPMB-1] + if $kgm_lp64 + printf "0x%-16llx [ 0x%-16llx - 0x%-16llx ]\n", $slg, &$slg->slg_slab[0], \ + &$slg->slg_slab[$NSLABSPMB-1] + else + printf "0x%-8x [ 0x%-8x - 0x%-8x ]\n", $slg, 
&$slg->slg_slab[0], \ + &$slg->slg_slab[$NSLABSPMB-1] + end end set $x += 1 end @@ -4895,19 +5998,36 @@ define mbuf_slabs set $slg = (mcl_slabg_t *)$arg0 set $x = 0 - if $kgm_lp64 - printf "slot addr next base C R N size flags\n" - printf "---- ------------------ ------------------ ------------------ -- -- -- ------ -----\n" + if $kgm_lp64 + printf "slot slab next obj mca C R N size flags\n" + printf "---- ------------------ ------------------ ------------------ ------------------ -- -- -- ------ -----\n" else - printf "slot addr next base C R N size flags\n" - printf "---- ---------- ---------- ---------- -- -- -- ------ -----\n" + printf "slot slab next obj mca C R N size flags\n" + printf "---- ---------- ---------- ---------- ---------- -- -- -- ------ -----\n" end while $x < $NSLABSPMB set $sl = &$slg->slg_slab[$x] - printf "%3d: %p %p %p %2d %2d %2d %6d 0x%04x ", \ - $x + 1, $sl, $sl->sl_next, $sl->sl_base, $sl->sl_class, \ - $sl->sl_refcnt, $sl->sl_chunks, $sl->sl_len, \ - $sl->sl_flags + set $mca = 0 + set $obj = $sl->sl_base + + if mclaudit != 0 + set $ix = ((char *)$obj - (char *)mbutl) >> 12 + set $clbase = ((union mbigcluster *)mbutl) + $ix + set $mclidx = (((char *)$obj - (char *)$clbase) >> 8) + set $mca = mclaudit[$ix].cl_audit[$mclidx] + end + + if $kgm_lp64 + printf "%3d: 0x%-16llx 0x%-16llx 0x%-16llx 0x%-16llx %2d %2d %2d %6d 0x%04x ", \ + $x + 1, $sl, $sl->sl_next, $obj, $mca, $sl->sl_class, \ + $sl->sl_refcnt, $sl->sl_chunks, $sl->sl_len, \ + $sl->sl_flags + else + printf "%3d: 0x%-8x 0x%-8x 0x%-8x 0x%-8x %2d %2d %2d %6d 0x%04x ", \ + $x + 1, $sl, $sl->sl_next, $obj, $mca, $sl->sl_class, \ + $sl->sl_refcnt, $sl->sl_chunks, $sl->sl_len, \ + $sl->sl_flags + end if $sl->sl_flags != 0 printf "<" if $sl->sl_flags & $SLF_MAPPED @@ -4922,6 +6042,31 @@ define mbuf_slabs printf ">" end printf "\n" + + if $sl->sl_chunks > 1 + set $z = 1 + set $c = $sl->sl_len / $sl->sl_chunks + + while $z < $sl->sl_chunks + set $obj = $sl->sl_base + ($c * $z) + set $mca = 
0 + + if mclaudit != 0 + set $ix = ((char *)$obj - (char *)mbutl) >> 12 + set $clbase = ((union mbigcluster *)mbutl) + $ix + set $mclidx = (((char *)$obj - (char *)$clbase) >> 8) + set $mca = mclaudit[$ix].cl_audit[$mclidx] + end + + if $kgm_lp64 + printf " 0x%-16llx 0x%-16llx\n", $obj, $mca + else + printf " 0x%-8x 0x%-8x\n", $obj, $mca + end + set $z += 1 + end + end + set $x += 1 end end @@ -5183,7 +6328,7 @@ define mbuf_walkallslabs end printf "objects; this may take a while ...)\n\n" - if $kgm_lp64 + if $kgm_lp64 printf " slab mca obj allocation\n" printf "slot idx address address address type state\n" printf "---- ---- ------------------ ------------------ ------------------ ----- -----------\n" @@ -5200,8 +6345,8 @@ define mbuf_walkallslabs while $y < $NSLABSPMB && $stop == 0 set $sl = &$slg->slg_slab[$y] set $base = (char *)$sl->sl_base - set $ix = ($base - (char *)mbutl) >> 11 - set $clbase = ((union mcluster *)(mbutl + $ix)) + set $ix = ($base - (char *)mbutl) >> 12 + set $clbase = ((union mbigcluster *)mbutl) + $ix set $mclidx = ($base - (char *)$clbase) >> 8 set $mca = mclaudit[$ix].cl_audit[$mclidx] set $first = 1 @@ -5218,7 +6363,11 @@ define mbuf_walkallslabs if $printmca != 0 if $first == 1 - printf "%4d %4d %p ", $x, $y, $sl + if $kgm_lp64 + printf "%4d %4d 0x%-16llx ", $x, $y, $sl + else + printf "%4d %4d 0x%-8x ", $x, $y, $sl + end else if $kgm_lp64 printf " " @@ -5227,7 +6376,12 @@ define mbuf_walkallslabs end end - printf "%p %p ", $mca, $mca->mca_addr + if $kgm_lp64 + printf "0x%-16llx 0x%-16llx ", $mca, $mca->mca_addr + else + printf "0x%-8x 0x%-8x ", $mca, $mca->mca_addr + end + mbuf_mca_ctype $mca 0 if $mca->mca_uflags & ($MB_INUSE|$MB_COMP_INUSE) printf "active " @@ -5276,6 +6430,38 @@ document mbuf_walkallslabs | parameter. This is a backend routine for mbuf_show{active,inactive,all}. 
end +define mbuf_countchain + set $mp = (struct mbuf *)$arg0 + + set $pkt = 0 + set $nxt = 0 + + while $mp != 0 + set $pkt = $pkt + 1 + + set $mn = (struct mbuf *)$mp->m_hdr.mh_next + while $mn != 0 + set $nxt = $nxt + 1 + + set $mn = (struct mbuf *)$mn->m_hdr.mh_next + end + + set $mp = $mp->m_hdr.mh_nextpkt + + if (($pkt + $nxt) % 50) == 0 + printf "... %d\n", $pkt + $nxt + end + end + + printf "\ntotal: %d (via m_next: %d)\n", $pkt + $nxt, $nxt +end + +document mbuf_countchain +Syntax: mbuf_countchain +| Count the total number of mbufs chained from the given address of an mbuf. +| The routine follows both the m_next pointers and m_nextpkt pointers. +end + set $RTF_UP = 0x1 set $RTF_GATEWAY = 0x2 set $RTF_HOST = 0x4 @@ -5659,6 +6845,8 @@ Syntax: (gdb) rtentry_showdbg | parameter. end +set $INIFA_TRACE_HIST_SIZE = inifa_trace_hist_size + define inifa_showdbg set $inifa = (struct in_ifaddr_dbg *)$arg0 set $cnt = 0 @@ -5694,7 +6882,7 @@ define inifa_showdbg end set $ix = $ix + 1 end - while $cnt < $CTRACE_HIST_SIZE + while $cnt < $INIFA_TRACE_HIST_SIZE set $ix = 0 while $ix < $CTRACE_STACK_SIZE set $kgm_pc = $inifa->inifa_refhold[$cnt].pc[$ix] @@ -5712,7 +6900,7 @@ define inifa_showdbg set $cnt = $cnt + 1 end set $cnt = 0 - while $cnt < $CTRACE_HIST_SIZE + while $cnt < $INIFA_TRACE_HIST_SIZE set $ix = 0 while $ix < $CTRACE_STACK_SIZE set $kgm_pc = $inifa->inifa_refrele[$cnt].pc[$ix] @@ -5739,6 +6927,8 @@ Syntax: (gdb) inifa_showdbg | parameter.
end +set $IN6IFA_TRACE_HIST_SIZE = in6ifa_trace_hist_size + define in6ifa_showdbg set $in6ifa = (struct in6_ifaddr_dbg *)$arg0 set $cnt = 0 @@ -5774,7 +6964,7 @@ define in6ifa_showdbg end set $ix = $ix + 1 end - while $cnt < $CTRACE_HIST_SIZE + while $cnt < $IN6IFA_TRACE_HIST_SIZE set $ix = 0 while $ix < $CTRACE_STACK_SIZE set $kgm_pc = $in6ifa->in6ifa_refhold[$cnt].pc[$ix] @@ -5792,31 +6982,570 @@ define in6ifa_showdbg set $cnt = $cnt + 1 end set $cnt = 0 - while $cnt < $CTRACE_HIST_SIZE - set $ix = 0 - while $ix < $CTRACE_STACK_SIZE - set $kgm_pc = $in6ifa->in6ifa_refrele[$cnt].pc[$ix] - if $kgm_pc != 0 - if $ix == 0 - printf "\nRelease [%d] (thread %p):\n",\ - $cnt, $in6ifa->in6ifa_refrele[$cnt].th - end - printf "%4d: ", $ix + 1 - pcprint $kgm_pc - printf "\n" + while $cnt < $IN6IFA_TRACE_HIST_SIZE + set $ix = 0 + while $ix < $CTRACE_STACK_SIZE + set $kgm_pc = $in6ifa->in6ifa_refrele[$cnt].pc[$ix] + if $kgm_pc != 0 + if $ix == 0 + printf "\nRelease [%d] (thread %p):\n",\ + $cnt, $in6ifa->in6ifa_refrele[$cnt].th + end + printf "%4d: ", $ix + 1 + pcprint $kgm_pc + printf "\n" + end + set $ix = $ix + 1 + end + set $cnt = $cnt + 1 + end +end + +document in6ifa_showdbg +Syntax: (gdb) in6ifa_showdbg +| Given an IPv6 interface structure address, print the debug information +| related to it. This requires interface address debugging to be turned +| on, by setting the appropriate flags to the "ifa_debug" boot-args +| parameter. 
+end + +set $IFMA_TRACE_HIST_SIZE = ifma_trace_hist_size + +define ifma_showdbg + set $ifma = (struct ifmultiaddr_dbg *)$arg0 + set $cnt = 0 + + printf "Total holds:\t%d\n", $ifma->ifma_refhold_cnt + printf "Total releases:\t%d\n", $ifma->ifma_refrele_cnt + + while $cnt < $IFMA_TRACE_HIST_SIZE + set $ix = 0 + while $ix < $CTRACE_STACK_SIZE + set $kgm_pc = $ifma->ifma_refhold[$cnt].pc[$ix] + if $kgm_pc != 0 + if $ix == 0 + printf "\nHold [%d] (thread %p):\n", \ + $cnt, $ifma->ifma_refhold[$cnt].th + end + printf "%4d: ", $ix + 1 + pcprint $kgm_pc + printf "\n" + end + set $ix = $ix + 1 + end + set $cnt = $cnt + 1 + end + set $cnt = 0 + while $cnt < $IFMA_TRACE_HIST_SIZE + set $ix = 0 + while $ix < $CTRACE_STACK_SIZE + set $kgm_pc = $ifma->ifma_refrele[$cnt].pc[$ix] + if $kgm_pc != 0 + if $ix == 0 + printf "\nRelease [%d] (thread %p):\n",\ + $cnt, $ifma->ifma_refrele[$cnt].th + end + printf "%4d: ", $ix + 1 + pcprint $kgm_pc + printf "\n" + end + set $ix = $ix + 1 + end + set $cnt = $cnt + 1 + end +end + +document ifma_showdbg +Syntax: (gdb) ifma_showdbg +| Given a link multicast structure address, print the debug information +| related to it. This requires interface address debugging to be turned +| on, by setting the appropriate flags to the "ifa_debug" boot-args +| parameter. 
+end + +set $INM_TRACE_HIST_SIZE = inm_trace_hist_size + +define inm_showdbg + set $inm = (struct in_multi_dbg *)$arg0 + set $cnt = 0 + + printf "Total holds:\t%d\n", $inm->inm_refhold_cnt + printf "Total releases:\t%d\n", $inm->inm_refrele_cnt + + while $cnt < $INM_TRACE_HIST_SIZE + set $ix = 0 + while $ix < $CTRACE_STACK_SIZE + set $kgm_pc = $inm->inm_refhold[$cnt].pc[$ix] + if $kgm_pc != 0 + if $ix == 0 + printf "\nHold [%d] (thread %p):\n", \ + $cnt, $inm->inm_refhold[$cnt].th + end + printf "%4d: ", $ix + 1 + pcprint $kgm_pc + printf "\n" + end + set $ix = $ix + 1 + end + set $cnt = $cnt + 1 + end + set $cnt = 0 + while $cnt < $INM_TRACE_HIST_SIZE + set $ix = 0 + while $ix < $CTRACE_STACK_SIZE + set $kgm_pc = $inm->inm_refrele[$cnt].pc[$ix] + if $kgm_pc != 0 + if $ix == 0 + printf "\nRelease [%d] (thread %p):\n",\ + $cnt, $inm->inm_refrele[$cnt].th + end + printf "%4d: ", $ix + 1 + pcprint $kgm_pc + printf "\n" + end + set $ix = $ix + 1 + end + set $cnt = $cnt + 1 + end +end + +document inm_showdbg +Syntax: (gdb) inm_showdbg +| Given an IPv4 multicast structure address, print the debug information +| related to it. This requires interface address debugging to be turned +| on, by setting the appropriate flags to the "ifa_debug" boot-args +| parameter. 
+end + +set $IF_REF_TRACE_HIST_SIZE = if_ref_trace_hist_size + +define ifpref_showdbg + set $dl_if = (struct dlil_ifnet_dbg *)$arg0 + set $cnt = 0 + + printf "Total references:\t%d\n", $dl_if->dldbg_if_refhold_cnt + printf "Total releases:\t\t%d\n", $dl_if->dldbg_if_refrele_cnt + + while $cnt < $IF_REF_TRACE_HIST_SIZE + set $ix = 0 + while $ix < $CTRACE_STACK_SIZE + set $kgm_pc = $dl_if->dldbg_if_refhold[$cnt].pc[$ix] + if $kgm_pc != 0 + if $ix == 0 + printf "\nHold [%d] (thread %p):\n", \ + $cnt, \ + $dl_if->dldbg_if_refhold[$cnt].th + end + printf "%4d: ", $ix + 1 + pcprint $kgm_pc + printf "\n" + end + set $ix = $ix + 1 + end + set $cnt = $cnt + 1 + end + set $cnt = 0 + while $cnt < $IF_REF_TRACE_HIST_SIZE + set $ix = 0 + while $ix < $CTRACE_STACK_SIZE + set $kgm_pc = $dl_if->dldbg_if_refrele[$cnt].pc[$ix] + if $kgm_pc != 0 + if $ix == 0 + printf "\nRelease [%d] (thread %p):\n",\ + $cnt, \ + $dl_if->dldbg_if_refrele[$cnt].th + end + printf "%4d: ", $ix + 1 + pcprint $kgm_pc + printf "\n" + end + set $ix = $ix + 1 + end + set $cnt = $cnt + 1 + end +end + +document ifpref_showdbg +Syntax: (gdb) ifpref_showdbg +| Given an ifnet structure address, print the debug information +| related to its refcnt. This requires ifnet debugging to be turned +| on, by setting the appropriate flags to the "ifnet_debug" boot-args +| parameter. 
+end + +define in6ifa_trash + set $ifa = (struct in6_ifaddr_dbg *)in6ifa_trash_head.tqh_first + set $cnt = 0 + while $ifa != 0 + if $cnt == 0 + if $kgm_lp64 + printf " in6_ifa ref hold rele\n" + printf " ----------------- --- ------ ------\n" + else + printf " in6_ifa ref hold rele\n" + printf " --------- --- ------ ------\n" + end + end + printf "%4d: %p %3d %6d %6d ", $cnt + 1, $ifa, \ + $ifa->in6ifa_refhold_cnt - $ifa->in6ifa_refrele_cnt, \ + $ifa->in6ifa_refhold_cnt, $ifa->in6ifa_refrele_cnt + showsockaddr_in6 $ifa->in6ifa.ia_ifa.ifa_addr + printf "\n" + set $ifa = $ifa->in6ifa_trash_link.tqe_next + set $cnt = $cnt + 1 + end +end + +set $NDPR_TRACE_HIST_SIZE = ndpr_trace_hist_size + +define ndpr_showdbg + set $ndpr = (struct nd_prefix_dbg *)$arg0 + set $cnt = 0 + + printf "Total references:\t%d\n", $ndpr->ndpr_refhold_cnt + printf "Total releases:\t\t%d\n", $ndpr->ndpr_refrele_cnt + + while $cnt < $NDPR_TRACE_HIST_SIZE + set $ix = 0 + while $ix < $CTRACE_STACK_SIZE + set $kgm_pc = $ndpr->ndpr_refhold[$cnt].pc[$ix] + if $kgm_pc != 0 + if $ix == 0 + printf "\nHold [%d] (thread %p):\n", \ + $cnt, \ + $ndpr->ndpr_refhold[$cnt].th + end + printf "%4d: ", $ix + 1 + pcprint $kgm_pc + printf "\n" + end + set $ix = $ix + 1 + end + set $cnt = $cnt + 1 + end + set $cnt = 0 + while $cnt < $NDPR_TRACE_HIST_SIZE + set $ix = 0 + while $ix < $CTRACE_STACK_SIZE + set $kgm_pc = $ndpr->ndpr_refrele[$cnt].pc[$ix] + if $kgm_pc != 0 + if $ix == 0 + printf "\nRelease [%d] (thread %p):\n",\ + $cnt, \ + $ndpr->ndpr_refrele[$cnt].th + end + printf "%4d: ", $ix + 1 + pcprint $kgm_pc + printf "\n" + end + set $ix = $ix + 1 + end + set $cnt = $cnt + 1 + end +end + +document ndpr_showdbg +Syntax: (gdb) ndpr_showdbg +| Given a nd_prefix structure address, print the debug information +| related to its refcnt. This requires the interface address debugging +| to be turned on, by setting the appropriate flags to the "ifa_debug" +| boot-args parameter. 
+end + +set $NDDR_TRACE_HIST_SIZE = nddr_trace_hist_size + +define nddr_showdbg + set $nddr = (struct nd_defrouter_dbg *)$arg0 + set $cnt = 0 + + printf "Total references:\t%d\n", $nddr->nddr_refhold_cnt + printf "Total releases:\t\t%d\n", $nddr->nddr_refrele_cnt + + while $cnt < $NDDR_TRACE_HIST_SIZE + set $ix = 0 + while $ix < $CTRACE_STACK_SIZE + set $kgm_pc = $nddr->nddr_refhold[$cnt].pc[$ix] + if $kgm_pc != 0 + if $ix == 0 + printf "\nHold [%d] (thread %p):\n", \ + $cnt, \ + $nddr->nddr_refhold[$cnt].th + end + printf "%4d: ", $ix + 1 + pcprint $kgm_pc + printf "\n" + end + set $ix = $ix + 1 + end + set $cnt = $cnt + 1 + end + set $cnt = 0 + while $cnt < $NDDR_TRACE_HIST_SIZE + set $ix = 0 + while $ix < $CTRACE_STACK_SIZE + set $kgm_pc = $nddr->nddr_refrele[$cnt].pc[$ix] + if $kgm_pc != 0 + if $ix == 0 + printf "\nRelease [%d] (thread %p):\n",\ + $cnt, \ + $nddr->nddr_refrele[$cnt].th + end + printf "%4d: ", $ix + 1 + pcprint $kgm_pc + printf "\n" + end + set $ix = $ix + 1 + end + set $cnt = $cnt + 1 + end +end + +document nddr_showdbg +Syntax: (gdb) nddr_showdbg +| Given a nd_defrouter structure address, print the debug information +| related to its refcnt. This requires the interface address debugging +| to be turned on, by setting the appropriate flags to the "ifa_debug" +| boot-args parameter. 
+end +set $IMO_TRACE_HIST_SIZE = imo_trace_hist_size + +define imo_showdbg + set $imo = (struct ip_moptions_dbg *)$arg0 + set $cnt = 0 + + printf "Total references:\t%d\n", $imo->imo_refhold_cnt + printf "Total releases:\t\t%d\n", $imo->imo_refrele_cnt + + while $cnt < $IMO_TRACE_HIST_SIZE + set $ix = 0 + while $ix < $CTRACE_STACK_SIZE + set $kgm_pc = $imo->imo_refhold[$cnt].pc[$ix] + if $kgm_pc != 0 + if $ix == 0 + printf "\nHold [%d] (thread %p):\n", \ + $cnt, \ + $imo->imo_refhold[$cnt].th + end + printf "%4d: ", $ix + 1 + pcprint $kgm_pc + printf "\n" + end + set $ix = $ix + 1 + end + set $cnt = $cnt + 1 + end + set $cnt = 0 + while $cnt < $IMO_TRACE_HIST_SIZE + set $ix = 0 + while $ix < $CTRACE_STACK_SIZE + set $kgm_pc = $imo->imo_refrele[$cnt].pc[$ix] + if $kgm_pc != 0 + if $ix == 0 + printf "\nRelease [%d] (thread %p):\n",\ + $cnt, \ + $imo->imo_refrele[$cnt].th + end + printf "%4d: ", $ix + 1 + pcprint $kgm_pc + printf "\n" + end + set $ix = $ix + 1 + end + set $cnt = $cnt + 1 + end +end + +document imo_showdbg +Syntax: (gdb) imo_showdbg +| Given a ip_moptions structure address, print the debug information +| related to its refcnt. This requires the interface address debugging +| to be turned on, by setting the appropriate flags to the "ifa_debug" +| boot-args parameter. 
+end + +set $IM6O_TRACE_HIST_SIZE = im6o_trace_hist_size + +define im6o_showdbg + set $im6o = (struct ip6_moptions_dbg *)$arg0 + set $cnt = 0 + + printf "Total references:\t%d\n", $im6o->im6o_refhold_cnt + printf "Total releases:\t\t%d\n", $im6o->im6o_refrele_cnt + + while $cnt < $IM6O_TRACE_HIST_SIZE + set $ix = 0 + while $ix < $CTRACE_STACK_SIZE + set $kgm_pc = $im6o->im6o_refhold[$cnt].pc[$ix] + if $kgm_pc != 0 + if $ix == 0 + printf "\nHold [%d] (thread %p):\n", \ + $cnt, \ + $im6o->im6o_refhold[$cnt].th + end + printf "%4d: ", $ix + 1 + pcprint $kgm_pc + printf "\n" + end + set $ix = $ix + 1 + end + set $cnt = $cnt + 1 + end + set $cnt = 0 + while $cnt < $IM6O_TRACE_HIST_SIZE + set $ix = 0 + while $ix < $CTRACE_STACK_SIZE + set $kgm_pc = $im6o->im6o_refrele[$cnt].pc[$ix] + if $kgm_pc != 0 + if $ix == 0 + printf "\nRelease [%d] (thread %p):\n",\ + $cnt, \ + $im6o->im6o_refrele[$cnt].th + end + printf "%4d: ", $ix + 1 + pcprint $kgm_pc + printf "\n" + end + set $ix = $ix + 1 + end + set $cnt = $cnt + 1 + end +end + +document im6o_showdbg +Syntax: (gdb) im6o_showdbg +| Given a ip6_moptions structure address, print the debug information +| related to its refcnt. This requires the interface address debugging +| to be turned on, by setting the appropriate flags to the "ifa_debug" +| boot-args parameter. +end + +document in6ifa_trash +Syntax: (gdb) in6ifa_trash +| Walk the list of trash in6_ifaddr entries; this requires interface +| address debugging to be turned on, by setting the appropriate flags +| to the "ifa_debug" boot-args parameter. 
+end + +define inifa_trash + set $ifa = (struct in_ifaddr_dbg *)inifa_trash_head.tqh_first + set $cnt = 0 + while $ifa != 0 + if $cnt == 0 + if $kgm_lp64 + printf " in_ifa ref hold rele\n" + printf " ----------------- --- ------ ------\n" + else + printf " in_ifa ref hold rele\n" + printf " --------- --- ------ ------\n" + end + end + printf "%4d: %p %3d %6d %6d ", $cnt + 1, $ifa, \ + $ifa->inifa_refhold_cnt - $ifa->inifa_refrele_cnt, \ + $ifa->inifa_refhold_cnt, $ifa->inifa_refrele_cnt + showsockaddr_in $ifa->inifa.ia_ifa.ifa_addr + printf "\n" + set $ifa = $ifa->inifa_trash_link.tqe_next + set $cnt = $cnt + 1 + end +end + +document inifa_trash +Syntax: (gdb) inifa_trash +| Walk the list of trash in_ifaddr entries; this requires interface +| address debugging to be turned on, by setting the appropriate flags +| to the "ifa_debug" boot-args parameter. +end + +define ifma_trash + set $ifma = (struct ifmultiaddr_dbg *)ifma_trash_head.tqh_first + set $cnt = 0 + while $ifma != 0 + if $cnt == 0 + if $kgm_lp64 + printf " ifma ref hold rele\n" + printf " ----------------- --- ------ ------\n" + else + printf " ifma ref hold rele\n" + printf " --------- --- ------ ------\n" + end + end + printf "%4d: %p %3d %6d %6d ", $cnt + 1, $ifma, \ + $ifma->ifma_refhold_cnt - $ifma->ifma_refrele_cnt, \ + $ifma->ifma_refhold_cnt, $ifma->ifma_refrele_cnt + showsockaddr $ifma->ifma.ifma_addr + printf " @ %s%d", $ifma->ifma.ifma_ifp->if_name, \ + $ifma->ifma.ifma_ifp->if_unit + printf "\n" + set $ifma = $ifma->ifma_trash_link.tqe_next + set $cnt = $cnt + 1 + end +end + +document ifma_trash +Syntax: (gdb) ifma_trash +| Walk the list of trash ifmultiaddr entries; this requires interface +| address debugging to be turned on, by setting the appropriate flags +| to the "ifa_debug" boot-args parameter. 
+end + +define inm_trash + set $inm = (struct in_multi_dbg *)inm_trash_head.tqh_first + set $cnt = 0 + while $inm != 0 + if $cnt == 0 + if $kgm_lp64 + printf " inm ref hold rele\n" + printf " ----------------- --- ------ ------\n" + else + printf " inm ref hold rele\n" + printf " --------- --- ------ ------\n" + end + end + printf "%4d: %p %3d %6d %6d ", $cnt + 1, $inm, \ + $inm->inm_refhold_cnt - $inm->inm_refrele_cnt, \ + $inm->inm_refhold_cnt, $inm->inm_refrele_cnt + show_in_addr &($inm->inm.inm_addr) + printf "\n" + set $inm = $inm->inm_trash_link.tqe_next + set $cnt = $cnt + 1 + end +end + +document inm_trash +Syntax: (gdb) inm_trash +| Walk the list of trash in_multi entries; this requires interface +| address debugging to be turned on, by setting the appropriate flags +| to the "ifa_debug" boot-args parameter. +end + +define in6m_trash + set $in6m = (struct in6_multi_dbg *)in6m_trash_head.tqh_first + set $cnt = 0 + while $in6m != 0 + if $cnt == 0 + if $kgm_lp64 + printf " in6m ref hold rele\n" + printf " ----------------- --- ------ ------\n" + else + printf " in6m ref hold rele\n" + printf " --------- --- ------ ------\n" end - set $ix = $ix + 1 end + printf "%4d: %p %3d %6d %6d ", $cnt + 1, $in6m, \ + $in6m->in6m_refhold_cnt - $in6m->in6m_refrele_cnt, \ + $in6m->in6m_refhold_cnt, $in6m->in6m_refrele_cnt + show_in6_addr &($in6m->in6m.in6m_addr) + printf "\n" + set $in6m = $in6m->in6m_trash_link.tqe_next set $cnt = $cnt + 1 end end -document in6ifa_showdbg -Syntax: (gdb) in6ifa_showdbg -| Given an IPv6 interface structure address, print the debug information -| related to it. This requires interface address debugging to be turned -| on, by setting the appropriate flags to the "ifa_debug" boot-args -| parameter. +document in6m_trash +Syntax: (gdb) in6m_trash +| Walk the list of trash in6_multi entries; this requires interface +| address debugging to be turned on, by setting the appropriate flags +| to the "ifa_debug" boot-args parameter.
end # @@ -5835,11 +7564,11 @@ end define showosmalloc printf "TAG COUNT STATE ATTR NAME\n" -set $kgm_tagheadp = (OSMallocTag)&OSMalloc_tag_list - set $kgm_tagptr = (OSMallocTag )($kgm_tagheadp->OSMT_link.next) +set $kgm_tagheadp = (struct _OSMallocTag_ *)&OSMalloc_tag_list + set $kgm_tagptr = (struct _OSMallocTag_ * )($kgm_tagheadp->OSMT_link.next) while $kgm_tagptr != $kgm_tagheadp ostag_print $kgm_tagptr - set $kgm_tagptr = (OSMallocTag)$kgm_tagptr->OSMT_link.next + set $kgm_tagptr = (struct _OSMallocTag_ *)$kgm_tagptr->OSMT_link.next end printf "\n" end @@ -5850,7 +7579,8 @@ end define systemlog - if msgbufp->msg_bufc[msgbufp->msg_bufx] == 0 + if msgbufp->msg_bufc[msgbufp->msg_bufx] == 0 \ + && msgbufp->msg_bufc[0] != 0 # The buffer hasn't wrapped, so take the easy (and fast!) path printf "%s", msgbufp->msg_bufc else @@ -5878,7 +7608,9 @@ define systemlog set $kgm_i = 0 while $kgm_i < $kgm_syslog_bufend set $kgm_syslog_char = $kgm_msgbuf.msg_bufc[$kgm_i] - printf "%c", $kgm_syslog_char + if $kgm_syslog_char != 0 + printf "%c", $kgm_syslog_char + end set $kgm_i = $kgm_i + 1 end end @@ -5970,17 +7702,27 @@ define showsockaddr_at printcolonhex $addr $count end +define show_in_addr + set $ia = (unsigned char *)$arg0 + printf "%3u.%03u.%03u.%03u", $ia[0], $ia[1], $ia[2], $ia[3] +end + define showsockaddr_in set $sin = (struct sockaddr_in *)$arg0 set $sa_bytes = (unsigned char *)&($sin->sin_addr) - printf "%3u.%03u.%03u.%03u", $sa_bytes[0], $sa_bytes[1], $sa_bytes[2], $sa_bytes[3] + show_in_addr $sa_bytes +end + +define show_in6_addr + set $ia = (unsigned char *)$arg0 + printf "%2x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x", \ + $ia[0], $ia[1], $ia[2], $ia[3], $ia[4], $ia[5], $ia[6], $ia[7], $ia[8], $ia[9], $ia[10], $ia[11], $ia[12], $ia[13], $ia[14], $ia[15] end define showsockaddr_in6 set $sin6 = (struct sockaddr_in6 *)$arg0 set $sa_bytes = $sin6->sin6_addr.__u6_addr.__u6_addr8 - printf 
"%2x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x", \ - $sa_bytes[0], $sa_bytes[1], $sa_bytes[2], $sa_bytes[3], $sa_bytes[4], $sa_bytes[5], $sa_bytes[6], $sa_bytes[7], $sa_bytes[8], $sa_bytes[9], $sa_bytes[10], $sa_bytes[11], $sa_bytes[12], $sa_bytes[13], $sa_bytes[14], $sa_bytes[15] + show_in6_addr $sa_bytes end define showsockaddr_un @@ -6002,7 +7744,7 @@ define showifmultiaddrs set $mymulti = $if_multi set $myi = 0 while ($mymulti != 0) - printf "%2d. ", $myi + printf "%2d. %p ", $myi, $mymulti set $sa_family = $mymulti->ifma_addr.sa_family if ($sa_family == 2) if ($mymulti->ifma_ll != 0) @@ -6036,6 +7778,48 @@ Syntax showifmultiaddrs | show the (struct ifnet).if_multiaddrs list of multicast addresses for the given ifp end +define showinmultiaddrs + set $in_multi = (struct in_multi *)(in_multihead->lh_first) + set $mymulti = $in_multi + set $myi = 0 + while ($mymulti != 0) + set $ifp = (struct ifnet *)$mymulti->inm_ifp + printf "%2d. %p ", $myi, $mymulti + show_in_addr &($mymulti->inm_addr) + printf " (ifp %p [%s%d] ifma %p) ", $ifp, $ifp->if_name, \ + $ifp->if_unit, $mymulti->inm_ifma + printf "\n" + set $mymulti = $mymulti->inm_link.le_next + set $myi = $myi + 1 + end +end + +document showinmultiaddrs +Syntax showinmultiaddrs +| show the contents of IPv4 multicast address records +end + +define showin6multiaddrs + set $in6_multi = (struct in6_multi *)(in6_multihead->lh_first) + set $mymulti = $in6_multi + set $myi = 0 + while ($mymulti != 0) + set $ifp = (struct ifnet *)$mymulti->in6m_ifp + printf "%2d. 
%p ", $myi, $mymulti + show_in6_addr &($mymulti->in6m_addr) + printf " (ifp %p [%s%d] ifma %p) ", $ifp, $ifp->if_name, \ + $ifp->if_unit, $mymulti->in6m_ifma + printf "\n" + set $mymulti = $mymulti->in6m_entry.le_next + set $myi = $myi + 1 + end +end + +document showin6multiaddrs +Syntax showin6multiaddrs +| show the contents of IPv6 multicast address records +end + define showsockaddr set $mysock = (struct sockaddr *)$arg0 set $showsockaddr_handled = 0 @@ -6116,10 +7900,10 @@ define showifflags end printf "POINTTOPOINT" end -# if ($flags & 0x20) -# if ($first == 1) +## if ($flags & 0x20) +## if ($first == 1) # set $first = 0 -# else +## else # printf "," # end # printf "NOTRAILERS" @@ -6212,7 +7996,7 @@ define showifaddrs set $myifaddr = (struct ifaddr *)$ifp->if_addrhead->tqh_first set $myi = 0 while ($myifaddr != 0) - printf "\t%d. ", $myi + printf "\t%d. %p ", $myi, $myifaddr showsockaddr $myifaddr->ifa_addr printf " [%d]\n", $myifaddr->ifa_refcnt set $myifaddr = $myifaddr->ifa_link->tqe_next @@ -6230,7 +8014,7 @@ define ifconfig if ($argc == 1) set $ifconfig_all = 1 end - set $ifp = (struct ifnet *)(ifnet->tqh_first) + set $ifp = (struct ifnet *)(ifnet_head->tqh_first) while ($ifp != 0) printf "%s%d: flags=%hx", $ifp->if_name, $ifp->if_unit, (u_short)$ifp->if_flags showifflags $ifp->if_flags @@ -6250,6 +8034,44 @@ Syntax: (gdb) ifconfig | display ifconfig-like output, and print the (struct ifnet *) pointers for further inspection end +set $DLIF_INUSE = 0x1 +set $DLIF_REUSE = 0x2 + +define showifnets + set $all = 0 + if ($argc == 1) + set $all = 1 + end + set $dlifp = (struct dlil_ifnet *)(dlil_ifnet_head->tqh_first) + while ($dlifp != 0) + set $ifp = (struct ifnet *)$dlifp + if ($dlifp->dl_if_flags & $DLIF_REUSE) + printf "*" + end + if ($dlifp->dl_if_flags & $DLIF_INUSE) + printf "%s%d: ", $ifp->if_name, $ifp->if_unit + else + printf "[%s%d]: ", $ifp->if_name, $ifp->if_unit + end + printf "flags=%hx", (u_short)$ifp->if_flags + showifflags $ifp->if_flags + 
printf " index %d", $ifp->if_index + printf " mtu %d\n", $ifp->if_data.ifi_mtu + printf "\t(struct ifnet *)" + showptr $ifp + printf "\n" + if ($all == 1) + showifaddrs $ifp + end + set $dlifp = $dlifp->dl_if_link->tqe_next + end +end + +document showifnets +Syntax: (gdb) showifnets +| Display ifconfig-like output for all attached and detached interfaces +end + define _show_unix_domain_socket set $so = (struct socket *)$arg0 set $pcb = (struct unpcb *)$so->so_pcb @@ -6663,7 +8485,9 @@ set $UDBHASHSIZE=16 define _dump_pcbinfo set $snd_cc = 0 + set $snd_buf = (unsigned int)0 set $rcv_cc = 0 + set $rcv_buf = (unsigned int)0 set $pcbseen = 0 set $pcbi = (struct inpcbinfo *)$arg0 printf "lastport %d lastlow %d lasthi %d\n", \ @@ -6691,7 +8515,23 @@ define _dump_pcbinfo set $so = (struct socket *)$pcb->inp_socket if $so != 0 set $snd_cc += $so->so_snd.sb_cc - set $rcv_cc += $so-> so_rcv.sb_cc + set $mp = $so->so_snd.sb_mb + while $mp + set $snd_buf += 256 + if ($mp->m_hdr.mh_flags & 0x01) + set $snd_buf += $mp->M_dat.MH.MH_dat.MH_ext.ext_size + end + set $mp = $mp->m_hdr.mh_next + end + set $rcv_cc += $so->so_rcv.sb_cc + set $mp = $so->so_rcv.sb_mb + while $mp + set $rcv_buf += 256 + if ($mp->m_hdr.mh_flags & 0x01) + set $rcv_buf += $mp->M_dat.MH.MH_dat.MH_ext.ext_size + end + set $mp = $mp->m_hdr.mh_next + end end set $pcb0 = $pcb0->inp_hash.le_next printf "\n" @@ -6702,6 +8542,7 @@ define _dump_pcbinfo set $head = *(uintptr_t *)$hashbase end printf "total seen %ld snd_cc %ld rcv_cc %ld\n", $pcbseen, $snd_cc, $rcv_cc + printf "total snd_buf %u rcv_buf %u \n", (unsigned int)$snd_buf, (unsigned int)$rcv_buf printf "port hash base is %p\n", $pcbi->porthashbase set $i = 0 set $hashbase = $pcbi->porthashbase @@ -6922,10 +8763,16 @@ define showvnodeint showptr $kgm_vnode->v_parent printf " " - if $kgm_vnode->v_name != 0 + if ($kgm_vnode->v_name != 0) printf "%s\n", $kgm_vnode->v_name - else - printf "\n" + else + # If this is HFS vnode, get name from the cnode + if 
($kgm_vnode->v_tag == 16) + set $kgm_cnode = (struct cnode *)$kgm_vnode->v_data + printf "hfs: %s\n", (char *)$kgm_cnode->c_desc->cd_nameptr + else + printf "\n" + end end end @@ -7195,24 +9042,25 @@ end define showstacksaftertask - set $kgm_head_taskp = &default_pset.tasks + set $kgm_head_taskp = &tasks set $kgm_taskp = (struct task *)$arg0 + set $kgm_taskp = (struct task *)$kgm_taskp->tasks.next while $kgm_taskp != $kgm_head_taskp - showtaskheader - showtaskint $kgm_taskp - set $kgm_head_actp = &($kgm_taskp->threads) - set $kgm_actp = (struct thread *)($kgm_taskp->threads.next) - while $kgm_actp != $kgm_head_actp - showactheader - if ($decode_wait_events > 0) - showactint $kgm_actp 1 - else - showactint $kgm_actp 2 - end - set $kgm_actp = (struct thread *)($kgm_actp->task_threads.next) - end - printf "\n" - set $kgm_taskp = (struct task *)($kgm_taskp->pset_tasks.next) + showtaskheader + showtaskint $kgm_taskp + set $kgm_head_actp = &($kgm_taskp->threads) + set $kgm_actp = (struct thread *)($kgm_taskp->threads.next) + while $kgm_actp != $kgm_head_actp + showactheader + if ($decode_wait_events > 0) + showactint $kgm_actp 1 + else + showactint $kgm_actp 2 + end + set $kgm_actp = (struct thread *)($kgm_actp->task_threads.next) + end + printf "\n" + set $kgm_taskp = (struct task *)($kgm_taskp->tasks.next) end end document showstacksaftertask @@ -7222,20 +9070,19 @@ Syntax: (gdb) showstacksaftertask end define showpmworkqueueint - set $kgm_pm_wq = (IOPMWorkQueue *)$arg0 - set $kgm_pm_node = (IOService *)$kgm_pm_wq->owner - showptr $kgm_pm_wq - printf " " - showptr $kgm_pm_node - printf " " - printf "%02d ", $kgm_pm_node->pwrMgt->CurrentPowerState - printf "%02d ", $kgm_pm_node->pwrMgt->MachineState - printf "%02d ", $kgm_pm_node->pwrMgt->WaitReason - printf "%s\n", $kgm_pm_node->pwrMgt->Name - set $kgm_pm_queue = &($kgm_pm_wq->fWorkQueue) - set $kgm_pm_req = (IOPMRequest *)$kgm_pm_queue->next - if ((queue_entry_t) $kgm_pm_req != (queue_entry_t) $kgm_pm_queue) - printf 
"\n" + set $kgm_pm_workqueue = (IOPMWorkQueue *)$arg0 + set $kgm_pm_wq = &($kgm_pm_workqueue->fWorkQueue) + set $kgm_pm_wqe = (IOServicePM *)$kgm_pm_wq->next + while ((queue_entry_t) $kgm_pm_wqe != (queue_entry_t) $kgm_pm_wq) + printf "service " + showptrhdrpad + printf " ps ms wr name\n" + showptr $kgm_pm_wqe->Owner + printf " " + printf "%02d ", $kgm_pm_wqe->CurrentPowerState + printf "%02d ", $kgm_pm_wqe->MachineState + printf "%02d ", $kgm_pm_wqe->WaitReason + printf "%s\n", $kgm_pm_wqe->Name printf "request " showptrhdrpad printf " type next " @@ -7243,158 +9090,179 @@ define showpmworkqueueint printf " root " showptrhdrpad printf " work_wait free_wait\n" - while ((queue_entry_t) $kgm_pm_req != (queue_entry_t) $kgm_pm_queue) - showptr $kgm_pm_req - printf " 0x%02x ", $kgm_pm_req->fType - showptr $kgm_pm_req->fRequestNext + set $kgm_pm_rq = &($kgm_pm_wqe->RequestHead) + set $kgm_pm_rqe = (IOPMRequest *)$kgm_pm_rq->next + while ((queue_entry_t) $kgm_pm_rqe != (queue_entry_t) $kgm_pm_rq) + showptr $kgm_pm_rqe + printf " 0x%02x ", $kgm_pm_rqe->fType + showptr $kgm_pm_rqe->fRequestNext printf " " - showptr $kgm_pm_req->fRequestRoot - printf " 0x%08x 0x%08x\n", $kgm_pm_req->fWorkWaitCount, $kgm_pm_req->fFreeWaitCount + showptr $kgm_pm_rqe->fRequestRoot + printf " 0x%08x 0x%08x\n", $kgm_pm_rqe->fWorkWaitCount, $kgm_pm_rqe->fFreeWaitCount showptrhdrpad printf " args " - showptr $kgm_pm_req->fArg0 + showptr $kgm_pm_rqe->fArg0 printf " " - showptr $kgm_pm_req->fArg1 + showptr $kgm_pm_rqe->fArg1 printf " " - showptr $kgm_pm_req->fArg2 + showptr $kgm_pm_rqe->fArg2 printf "\n" - set $kgm_pm_req = (IOPMRequest *)$kgm_pm_req->fCommandChain.next + set $kgm_pm_rqe = (IOPMRequest *)$kgm_pm_rqe->fCommandChain.next end printf "\n" + set $kgm_pm_wqe = (IOServicePM *)$kgm_pm_wqe->WorkChain.next end end -define showallpmworkqueues - set $kgm_pm_next = gIOPMWorkLoop->eventChain - printf "queue " - showptrhdrpad - printf " owner " - showptrhdrpad - printf " ps ms wr name\n" - while ( 
$kgm_pm_next ) - set $kgm_vt = *((void **) $kgm_pm_next) - if ($kgm_lp64 || $kgm_mtype == $kgm_mtype_arm) - set $kgm_vt = $kgm_vt - 2 * sizeof(void *) - end - if ($kgm_vt == &_ZTV13IOPMWorkQueue) - showpmworkqueueint $kgm_pm_next - end - set $kgm_pm_next = $kgm_pm_next->eventChainNext +define showpmworkqueue + printf "IOPMWorkQueue " + showptr gIOPMWorkQueue + printf " length " + printf "%u", gIOPMWorkQueue->fQueueLength + printf "\n" + if (gIOPMWorkQueue->fQueueLength > 0) + showpmworkqueueint gIOPMWorkQueue end end -document showallpmworkqueues -Syntax: (gdb) showallpmworkqueues -| Display info about all IOPMWorkQueue objects +document showpmworkqueue +Syntax: (gdb) showpmworkqueue +| Display the IOPMWorkQueue object end define showioservicepm - set $kgm_iopmpriv = (IOServicePM *)$arg0 - printf "{ " - printf "MachineState = %d (", $kgm_iopmpriv->MachineState - if ( $kgm_iopmpriv->MachineState == 1 ) - printf "kIOPM_OurChangeTellClientsPowerDown" - else - if ( $kgm_iopmpriv->MachineState == 2 ) - printf "kIOPM_OurChangeTellPriorityClientsPowerDown" - else - if ( $kgm_iopmpriv->MachineState == 3 ) - printf "kIOPM_OurChangeNotifyInterestedDriversWillChange" - else - if ( $kgm_iopmpriv->MachineState == 4 ) - printf "kIOPM_OurChangeSetPowerState" - else - if ( $kgm_iopmpriv->MachineState == 5 ) - printf "kIOPM_OurChangeWaitForPowerSettle" - else - if ( $kgm_iopmpriv->MachineState == 6 ) - printf "kIOPM_OurChangeNotifyInterestedDriversDidChange" - else - if ( $kgm_iopmpriv->MachineState == 7 ) - printf "kIOPM_OurChangeFinish" - else - if ( $kgm_iopmpriv->MachineState == 8 ) - printf "kIOPM_ParentDownTellPriorityClientsPowerDown" - else - if ( $kgm_iopmpriv->MachineState == 9 ) - printf "kIOPM_ParentDownNotifyInterestedDriversWillChange" - else - if ( $kgm_iopmpriv->MachineState == 10 ) - printf "Unused_MachineState_10" - else - if ( $kgm_iopmpriv->MachineState == 11 ) - printf "kIOPM_ParentDownNotifyDidChangeAndAcknowledgeChange" - else - if ( 
$kgm_iopmpriv->MachineState == 12 ) - printf "kIOPM_ParentDownSetPowerState" - else - if ( $kgm_iopmpriv->MachineState == 13 ) - printf "kIOPM_ParentDownWaitForPowerSettle" - else - if ( $kgm_iopmpriv->MachineState == 14 ) - printf "kIOPM_ParentDownAcknowledgeChange" - else - if ( $kgm_iopmpriv->MachineState == 15) - printf "kIOPM_ParentUpSetPowerState" - else - if ( $kgm_iopmpriv->MachineState == 16) - printf "Unused_MachineState_16" - else - if ( $kgm_iopmpriv->MachineState == 17) - printf "kIOPM_ParentUpWaitForSettleTime" - else - if ( $kgm_iopmpriv->MachineState == 18) - printf "kIOPM_ParentUpNotifyInterestedDriversDidChange" - else - if ( $kgm_iopmpriv->MachineState == 19) - printf "kIOPM_ParentUpAcknowledgePowerChange" - else - if ( $kgm_iopmpriv->MachineState == 20) - printf "kIOPM_Finished" - else - if ( $kgm_iopmpriv->MachineState == 21) - printf "kIOPM_DriverThreadCallDone" - else - if ( $kgm_iopmpriv->MachineState == 22) - printf "kIOPM_NotifyChildrenDone" - end - end - end - end - end - end - end - end - end - end - end - end - end - end - end - end - end - end - end - end - end + set $kgm_iopmpriv = (IOServicePM *)$arg0 + printf "{ " + printf "MachineState = %d (", $kgm_iopmpriv->MachineState + if ( $kgm_iopmpriv->MachineState == 0 ) + printf "kIOPM_Finished" + else + if ( $kgm_iopmpriv->MachineState == 1 ) + printf "kIOPM_OurChangeTellClientsPowerDown" + else + if ( $kgm_iopmpriv->MachineState == 2 ) + printf "kIOPM_OurChangeTellPriorityClientsPowerDown" + else + if ( $kgm_iopmpriv->MachineState == 3 ) + printf "kIOPM_OurChangeNotifyInterestedDriversWillChange" + else + if ( $kgm_iopmpriv->MachineState == 4 ) + printf "kIOPM_OurChangeSetPowerState" + else + if ( $kgm_iopmpriv->MachineState == 5 ) + printf "kIOPM_OurChangeWaitForPowerSettle" + else + if ( $kgm_iopmpriv->MachineState == 6 ) + printf "kIOPM_OurChangeNotifyInterestedDriversDidChange" + else + if ( $kgm_iopmpriv->MachineState == 7 ) + printf "kIOPM_OurChangeTellCapabilityDidChange" + else 
+ if ( $kgm_iopmpriv->MachineState == 8 ) + printf "kIOPM_OurChangeFinish" + else + if ( $kgm_iopmpriv->MachineState == 9 ) + printf "Unused_MachineState_9" + else + if ( $kgm_iopmpriv->MachineState == 10 ) + printf "kIOPM_ParentChangeTellPriorityClientsPowerDown" + else + if ( $kgm_iopmpriv->MachineState == 11 ) + printf "kIOPM_ParentChangeNotifyInterestedDriversWillChange" + else + if ( $kgm_iopmpriv->MachineState == 12 ) + printf "kIOPM_ParentChangeSetPowerState" + else + if ( $kgm_iopmpriv->MachineState == 13 ) + printf "kIOPM_ParentChangeWaitForPowerSettle" + else + if ( $kgm_iopmpriv->MachineState == 14) + printf "kIOPM_ParentChangeNotifyInterestedDriversDidChange" + else + if ( $kgm_iopmpriv->MachineState == 15) + printf "kIOPM_ParentChangeTellCapabilityDidChange" + else + if ( $kgm_iopmpriv->MachineState == 16) + printf "kIOPM_ParentChangeAcknowledgePowerChange" + else + if ( $kgm_iopmpriv->MachineState == 17) + printf "kIOPM_NotifyChildrenStart" + else + if ( $kgm_iopmpriv->MachineState == 18) + printf "kIOPM_NotifyChildrenOrdered" + else + if ( $kgm_iopmpriv->MachineState == 19) + printf "kIOPM_NotifyChildrenDelayed" + else + if ( $kgm_iopmpriv->MachineState == 20) + printf "kIOPM_SyncTellClientsPowerDown" + else + if ( $kgm_iopmpriv->MachineState == 21) + printf "kIOPM_SyncTellPriorityClientsPowerDown" + else + if ( $kgm_iopmpriv->MachineState == 22) + printf "kIOPM_SyncNotifyWillChange" + else + if ( $kgm_iopmpriv->MachineState == 23) + printf "kIOPM_SyncNotifyDidChange" + else + if ( $kgm_iopmpriv->MachineState == 24) + printf "kIOPM_SyncTellCapabilityDidChange" + else + if ( $kgm_iopmpriv->MachineState == 25) + printf "kIOPM_SyncFinish" + else + if ( $kgm_iopmpriv->MachineState == 26) + printf "kIOPM_TellCapabilityChangeDone" + else + if ( $kgm_iopmpriv->MachineState == 27) + printf "kIOPM_DriverThreadCallDone" + else + printf "Unknown_MachineState" + end + end + end + end + end + end + end + end + end + end + end + end + end + end + end + end + end + 
end + end + end + end + end + end + end + end + end + end end printf "), " - if ( $kgm_iopmpriv->MachineState != 20 ) + if ( $kgm_iopmpriv->MachineState != 0 ) printf "DriverTimer = %d, ",(unsigned int)$kgm_iopmpriv->DriverTimer printf "SettleTime = %d, ",(unsigned int)$kgm_iopmpriv->SettleTimeUS - printf "HeadNoteFlags = %08x, ",(unsigned int)$kgm_iopmpriv->HeadNoteFlags + printf "HeadNoteFlags = %08x, ",(unsigned int)$kgm_iopmpriv->HeadNoteChangeFlags printf "HeadNotePendingAcks = %x, ",(unsigned int)$kgm_iopmpriv->HeadNotePendingAcks end - if ( $kgm_iopmpriv->DeviceOverrides != 0 ) + if ( $kgm_iopmpriv->DeviceOverrideEnabled != 0 ) printf"DeviceOverrides, " end printf "DeviceDesire = %d, ",(unsigned int)$kgm_iopmpriv->DeviceDesire printf "DesiredPowerState = %d, ",(unsigned int)$kgm_iopmpriv->DesiredPowerState - printf "PreviousRequest = %d }\n",(unsigned int)$kgm_iopmpriv->PreviousRequest + printf "PreviousRequest = %d }\n",(unsigned int)$kgm_iopmpriv->PreviousRequestPowerFlags end document showioservicepm @@ -7506,25 +9374,29 @@ Syntax: (gdb) showregistrypmstate end define showstacksafterthread - set $kgm_head_taskp = &default_pset.tasks + set $kgm_head_taskp = &tasks set $kgm_actp = (struct thread *)$arg0 set $kgm_actp = (struct thread *)($kgm_actp->task_threads.next) set $kgm_taskp = (struct task *)$kgm_actp->task while $kgm_taskp != $kgm_head_taskp - showtaskheader - showtaskint $kgm_taskp - set $kgm_head_actp = &($kgm_taskp->threads) - while $kgm_actp != $kgm_head_actp - showactheader - if ($decode_wait_events > 0) - showactint $kgm_actp 1 - else - showactint $kgm_actp 2 - end - set $kgm_actp = (struct thread *)($kgm_actp->task_threads.next) - end - printf "\n" - set $kgm_taskp = (struct task *)($kgm_taskp->pset_tasks.next) + showtaskheader + showtaskint $kgm_taskp + set $kgm_head_actp = &($kgm_taskp->threads) + if $kgm_actp == 0 + set $kgm_actp = (struct thread *)($kgm_taskp->threads.next) + end + while $kgm_actp != $kgm_head_actp + showactheader + if ($decode_wait_events > 0) + showactint
$kgm_actp 1 + else + showactint $kgm_actp 2 + end + set $kgm_actp = (struct thread *)($kgm_actp->task_threads.next) + end + printf "\n" + set $kgm_taskp = (struct task *)($kgm_taskp->tasks.next) + set $kgm_actp = 0 end end @@ -7617,17 +9489,17 @@ define _pt_step set $kgm_entryp = $kgm_pt_paddr + 8*$kgm_pt_index readphysint $kgm_entryp 64 $kgm_lcpu_self set $entry = $kgm_readphysint_result - if $kgm_pt_verbose == 2 + if $kgm_pt_verbose >= 3 set $kgm_pte_loop = 0 - while $kgm_pte_loop < 512 - set $kgm_pt_paddr_tmp = $kgm_pt_paddr + $kgm_pte_loop*8 - readphys64 $kgm_pt_paddr_tmp - set $kgm_pte_loop = $kgm_pte_loop + 1 - end + while $kgm_pte_loop < 512 + set $kgm_pt_paddr_tmp = $kgm_pt_paddr + $kgm_pte_loop*8 + readphys64 $kgm_pt_paddr_tmp + set $kgm_pte_loop = $kgm_pte_loop + 1 + end end set $kgm_paddr_mask = ~((0xfffULL<<52) | 0xfffULL) set $kgm_paddr_largemask = ~((0xfffULL<<52) | 0x1fffffULL) - if $kgm_pt_verbose == 0 + if $kgm_pt_verbose < 2 if $entry & (0x1 << 0) set $kgm_pt_valid = 1 if $entry & (0x1 << 7) @@ -7636,7 +9508,7 @@ define _pt_step else set $kgm_pt_large = 0 set $kgm_pt_paddr = $entry & $kgm_paddr_mask - end + end else set $kgm_pt_valid = 0 set $kgm_pt_large = 0 @@ -7645,7 +9517,7 @@ define _pt_step else printf "0x%016llx:\n\t0x%016llx\n\t", $kgm_entryp, $entry if $entry & (0x1 << 0) - printf "valid" + printf "valid" set $kgm_pt_paddr = $entry & $kgm_paddr_mask set $kgm_pt_valid = 1 else @@ -7689,17 +9561,16 @@ define _pt_step if $entry & (0x3 << 9) printf " avail:0x%x", ($entry >> 9) & 0x3 end - if $entry & (0x1 << 63) + if $entry & (0x1ULL << 63) printf " noexec" end printf "\n" end end -define _pmap_walk - set $kgm_pmap = (pmap_t) $arg0 +define _pml4_walk + set $kgm_pt_paddr = $arg0 set $kgm_vaddr = $arg1 - set $kgm_pt_paddr = $kgm_pmap->pm_cr3 set $kgm_pt_valid = $kgm_pt_paddr != 0 set $kgm_pt_large = 0 set $kgm_pframe_offset = 0 @@ -7707,7 +9578,7 @@ define _pmap_walk # Look up bits 47:39 of the linear address in PML4T set $kgm_pt_index = 
($kgm_vaddr >> 39) & 0x1ffULL set $kgm_pframe_offset = $kgm_vaddr & 0x7fffffffffULL - if $kgm_pt_verbose + if $kgm_pt_verbose >= 2 printf "pml4 (index %d):\n", $kgm_pt_index end _pt_step @@ -7716,7 +9587,7 @@ define _pmap_walk # Look up bits 38:30 of the linear address in PDPT set $kgm_pt_index = ($kgm_vaddr >> 30) & 0x1ffULL set $kgm_pframe_offset = $kgm_vaddr & 0x3fffffffULL - if $kgm_pt_verbose + if $kgm_pt_verbose >= 2 printf "pdpt (index %d):\n", $kgm_pt_index end _pt_step @@ -7725,7 +9596,7 @@ define _pmap_walk # Look up bits 29:21 of the linear address in PDT set $kgm_pt_index = ($kgm_vaddr >> 21) & 0x1ffULL set $kgm_pframe_offset = $kgm_vaddr & 0x1fffffULL - if $kgm_pt_verbose + if $kgm_pt_verbose >= 2 printf "pdt (index %d):\n", $kgm_pt_index end _pt_step @@ -7734,37 +9605,60 @@ define _pmap_walk - # Look up bits 20:21 of the linear address in PT + # Look up bits 20:12 of the linear address in PT set $kgm_pt_index = ($kgm_vaddr >> 12) & 0x1ffULL set $kgm_pframe_offset = $kgm_vaddr & 0xfffULL - if $kgm_pt_verbose + if $kgm_pt_verbose >= 2 printf "pt (index %d):\n", $kgm_pt_index end _pt_step end + if $kgm_pt_valid set $kgm_paddr = $kgm_pt_paddr + $kgm_pframe_offset - readphysint $kgm_paddr 32 $kgm_lcpu_self - set $kgm_value = $kgm_readphysint_result - printf "phys 0x%016llx: 0x%08x\n", $kgm_paddr, $kgm_value + set $kgm_paddr_isvalid = 1 else set $kgm_paddr = 0 - printf "(no translation)\n" + set $kgm_paddr_isvalid = 0 + end + + if $kgm_pt_verbose >= 1 + if $kgm_paddr_isvalid + readphysint $kgm_paddr 32 $kgm_lcpu_self + set $kgm_value = $kgm_readphysint_result + printf "phys 0x%016llx: 0x%08x\n", $kgm_paddr, $kgm_value + else + printf "(no translation)\n" + end end end +define _pmap_walk_x86 + set $kgm_pmap = (pmap_t) $arg0 + _pml4_walk $kgm_pmap->pm_cr3 $arg1 +end + +define _pmap_walk_arm + set $kgm_paddr = 0 + set $kgm_paddr_isvalid = 0 +end + define pmap_walk - if (($kgm_mtype & $kgm_mtype_x86_mask) != $kgm_mtype_x86_any) - printf "Not available for current architecture.\n" + if $argc != 2 + printf
"pmap_walk \n" else - if $argc != 2 - printf "pmap_walk \n" + if !$kgm_pt_verbose + set $kgm_pt_verbose = 2 + else + if $kgm_pt_verbose > 3 + set $kgm_pt_verbose = 2 + end + end + if (($kgm_mtype & $kgm_mtype_x86_mask) == $kgm_mtype_x86_any) + _pmap_walk_x86 $arg0 $arg1 else - if !$kgm_pt_verbose - set $kgm_pt_verbose = 1 + if ($kgm_mtype == $kgm_mtype_arm) + _pmap_walk_arm $arg0 $arg1 else - if $kgm_pt_verbose != 2 - set $kgm_pt_verbose = 1 - end + printf "Not available for current architecture.\n" end - _pmap_walk $arg0 $arg1 end end end @@ -7772,18 +9666,27 @@ end document pmap_walk Syntax: (gdb) pmap_walk | Perform a page-table walk in for . -| Set $kgm_pt_verbose=2 for full hex dump of page tables. +| Set: +| $kgm_pt_verbose=0 for no output, $kgm_paddr will be set +| if $kgm_paddr_isvalid is 1 +| $kgm_pt_verbose=1 for final physical address +| $kgm_pt_verbose=2 for dump of page table entry. +| $kgm_pt_verbose=3 for full hex dump of page tables. end define pmap_vtop - if (($kgm_mtype & $kgm_mtype_x86_mask) != $kgm_mtype_x86_any) - printf "Not available for current architecture.\n" + if $argc != 2 + printf "pmap_vtop \n" else - if $argc != 2 - printf "pmap_vtop \n" + set $kgm_pt_verbose = 1 + if (($kgm_mtype & $kgm_mtype_x86_mask) == $kgm_mtype_x86_any) + _pmap_walk_x86 $arg0 $arg1 else - set $kgm_pt_verbose = 0 - _pmap_walk $arg0 $arg1 + if ($kgm_mtype == $kgm_mtype_arm) + _pmap_walk_arm $arg0 $arg1 + else + printf "Not available for current architecture.\n" + end end end end @@ -7971,7 +9874,12 @@ end # in the kernel's address space and use that instead. Don't rely on # kdp_pmap between invocations of map/unmap. Since the shadow # codepath uses a manual KDP packet, request no more than 128 bytes. 
-# Uses $kgm_lp64 for kernel address space size +# Uses $kgm_lp64 for kernel address space size, and +# $kgm_readphys_force_kdp/$kgm_readphys_force_physmap to override +# how the user pages are accessed ($kgm_readphys_force_physmap +# implies walking the user task's pagetables to get a physical +# address and then shadowing data from there using the +# physical mapping of memory). define _map_user_data_from_task set $kgm_map_user_taskp = (task_t)$arg0 set $kgm_map_user_map = $kgm_map_user_taskp->map @@ -7980,47 +9888,117 @@ define _map_user_data_from_task set $kgm_map_user_window = 0 set $kgm_map_switch_map = 0 - if $kgm_lp64 - set $kgm_map_switch_map = 1 + if ($kgm_readphys_force_kdp != 0) + set $kgm_readphys_use_kdp = 1 else - if !$kgm_map_user_task_64 - set $kgm_map_switch_map = 1 + if ($kgm_readphys_force_physmap) + set $kgm_readphys_use_kdp = 0 + else + set $kgm_readphys_use_kdp = ( kdp->is_conn > 0 ) end end - - if ($kgm_map_switch_map) - # switch the map safely - set $kgm_map_user_window = $arg1 - set kdp_pmap = $kgm_map_user_pmap - else - # requires shadowing/copying - # set up the manual KDP packet - set manual_pkt.input = 0 - set manual_pkt.len = sizeof(kdp_readmem64_req_t) - set $kgm_pkt = (kdp_readmem64_req_t *)&manual_pkt.data - set $kgm_pkt->hdr.request = KDP_READMEM64 - set $kgm_pkt->hdr.len = sizeof(kdp_readmem64_req_t) - set $kgm_pkt->hdr.is_reply = 0 - set $kgm_pkt->hdr.seq = 0 - set $kgm_pkt->hdr.key = 0 - set $kgm_pkt->address = (uint64_t)$arg1 - set $kgm_pkt->nbytes = (uint32_t)$arg2 + if ($kgm_readphys_use_kdp) - set kdp_pmap = $kgm_map_user_pmap - set manual_pkt.input = 1 - # dummy to make sure manual packet is executed - set $kgm_dummy = &_mh_execute_header - # Go back to kernel map so that we can access buffer directly - set kdp_pmap = 0 + if $kgm_lp64 + set $kgm_map_switch_map = 1 + else + if !$kgm_map_user_task_64 + set $kgm_map_switch_map = 1 + end + end + + if ($kgm_map_switch_map) + # switch the map safely + set $kgm_map_user_window =
$arg1 + set kdp_pmap = $kgm_map_user_pmap + else + # requires shadowing/copying + + # set up the manual KDP packet + set manual_pkt.input = 0 + set manual_pkt.len = sizeof(kdp_readmem64_req_t) + set $kgm_pkt = (kdp_readmem64_req_t *)&manual_pkt.data + set $kgm_pkt->hdr.request = KDP_READMEM64 + set $kgm_pkt->hdr.len = sizeof(kdp_readmem64_req_t) + set $kgm_pkt->hdr.is_reply = 0 + set $kgm_pkt->hdr.seq = 0 + set $kgm_pkt->hdr.key = 0 + set $kgm_pkt->address = (uint64_t)$arg1 + set $kgm_pkt->nbytes = (uint32_t)$arg2 + + set kdp_pmap = $kgm_map_user_pmap + set manual_pkt.input = 1 + # dummy to make sure manual packet is executed + set $kgm_dummy = &_mh_execute_header + # Go back to kernel map so that we can access buffer directly + set kdp_pmap = 0 + + set $kgm_pkt = (kdp_readmem64_reply_t *)&manual_pkt.data + if ($kgm_pkt->error == 0) + set $kgm_map_user_window = $kgm_pkt->data + else + set $kgm_map_user_window = 0 + end + end - set $kgm_pkt = (kdp_readmem64_reply_t *)&manual_pkt.data - if ($kgm_pkt->error == 0) - set $kgm_map_user_window = $kgm_pkt->data + else + # without the benefit of a KDP stub on the target, try to + # find the user task's physical mapping and memcpy the data. 
+ # If it straddles a page boundary, copy in two passes + set $kgm_vaddr_range1_start = (unsigned long long)$arg1 + set $kgm_vaddr_range1_count = (unsigned long long)$arg2 + if (($kgm_vaddr_range1_start + $kgm_vaddr_range1_count) & 0xFFF) < $kgm_vaddr_range1_count + set $kgm_vaddr_range2_start = ($kgm_vaddr_range1_start + $kgm_vaddr_range1_count) & ~((unsigned long long)0xFFF) + set $kgm_vaddr_range2_count = $kgm_vaddr_range1_start + $kgm_vaddr_range1_count - $kgm_vaddr_range2_start + set $kgm_vaddr_range1_count = $kgm_vaddr_range2_start - $kgm_vaddr_range1_start else - set $kgm_map_user_window = 0 + set $kgm_vaddr_range2_start = 0 + set $kgm_vaddr_range2_count = 0 end + set $kgm_paddr_range1_in_kva = 0 + set $kgm_paddr_range2_in_kva = 0 + if ($kgm_mtype == $kgm_mtype_x86_64) + set $kgm_pt_verbose = 0 + _pmap_walk_x86 $kgm_map_user_pmap $kgm_vaddr_range1_start + if $kgm_paddr_isvalid + set $kgm_paddr_range1_in_kva = $kgm_paddr + (((unsigned long long)-1 << 47) | ((unsigned long long)509 << 39)) + end + if $kgm_vaddr_range2_start + _pmap_walk_x86 $kgm_map_user_pmap $kgm_vaddr_range2_start + if $kgm_paddr_isvalid + set $kgm_paddr_range2_in_kva = $kgm_paddr + (((unsigned long long)-1 << 47) | ((unsigned long long)509 << 39)) + end + end + else + if ($kgm_mtype == $kgm_mtype_arm) + set $kgm_pt_verbose = 0 + _pmap_walk_arm $kgm_map_user_pmap $kgm_vaddr_range1_start + if $kgm_paddr_isvalid + set $kgm_paddr_range1_in_kva = $kgm_paddr - gPhysBase + gVirtBase + end + if $kgm_vaddr_range2_start + _pmap_walk_arm $kgm_map_user_pmap $kgm_vaddr_range2_start + if $kgm_paddr_isvalid + set $kgm_paddr_range2_in_kva = $kgm_paddr - gPhysBase + gVirtBase + end + end + else + printf "Not available for current architecture.\n" + set $kgm_paddr_isvalid = 0 + end + end + if $kgm_paddr_range1_in_kva + set $kgm_pkt = (kdp_readmem64_reply_t *)&manual_pkt.data + memcpy $kgm_pkt->data $kgm_paddr_range1_in_kva $kgm_vaddr_range1_count + if $kgm_paddr_range2_in_kva + memcpy 
&$kgm_pkt->data[$kgm_vaddr_range1_count] $kgm_paddr_range2_in_kva $kgm_vaddr_range2_count + end + set $kgm_map_user_window = $kgm_pkt->data + else + set $kgm_map_user_window = 0 + end end end @@ -8032,6 +10010,10 @@ end define _print_path_for_image set $kgm_print_path_address = (unsigned long long)$arg0 set $kgm_path_str_notdone = 1 + + if ($kgm_print_path_address == 0) + set $kgm_path_str_notdone = 0 + end while $kgm_path_str_notdone _map_user_data_from_task $kgm_taskp $kgm_print_path_address 32 @@ -8045,7 +10027,7 @@ define _print_path_for_image _unmap_user_data_from_task $kgm_taskp - # if we terminated on NUL, break out + # break out if we terminated on NUL if $kgm_path_i < 32 set $kgm_path_str_notdone = 0 else @@ -8054,7 +10036,7 @@ define _print_path_for_image end end -# uses $kgm_taskp and $kgm_task_64 +# uses $kgm_taskp and $kgm_task_64. May modify $kgm_dyld_load_path define _print_image_info set $kgm_mh_image_address = (unsigned long long)$arg0 set $kgm_mh_path_address = (unsigned long long)$arg1 @@ -8135,6 +10117,10 @@ define _print_image_info loop_break else + if $kgm_lc_cmd == 0xe + set $kgm_load_dylinker_data = $kgm_lc_data + set $kgm_dyld_load_path = $kgm_lc_address + *((unsigned int *)$kgm_load_dylinker_data) + end _unmap_user_data_from_task $kgm_taskp end @@ -8184,20 +10170,24 @@ define _print_images_for_dyld_image_info set $kgm_task_64 = $arg1 set $kgm_dyld_all_image_infos_address = (unsigned long long)$arg2 - _map_user_data_from_task $kgm_taskp $kgm_dyld_all_image_infos_address 16 + _map_user_data_from_task $kgm_taskp $kgm_dyld_all_image_infos_address 112 set $kgm_dyld_all_image_infos = (unsigned int *)$kgm_map_user_window - if ($kgm_dyld_all_image_infos[0] != 6) - printf "Invalid version number %d\n", $kgm_dyld_all_image_infos[0] + set $kgm_dyld_all_image_infos_version = $kgm_dyld_all_image_infos[0] + if ($kgm_dyld_all_image_infos_version > 12) + printf "Unknown dyld all_image_infos version number %d\n", $kgm_dyld_all_image_infos_version end set 
$kgm_image_info_count = $kgm_dyld_all_image_infos[1] - + + set $kgm_dyld_load_path = 0 if $kgm_task_64 set $kgm_image_info_size = 24 set $kgm_image_info_array_address = ((unsigned long long *)$kgm_dyld_all_image_infos)[1] + set $kgm_dyld_load_address = ((unsigned long long *)$kgm_dyld_all_image_infos)[4] else set $kgm_image_info_size = 12 set $kgm_image_info_array_address = ((unsigned int *)$kgm_dyld_all_image_infos)[2] + set $kgm_dyld_load_address = ((unsigned int *)$kgm_dyld_all_image_infos)[5] end _unmap_user_data_from_task $kgm_taskp @@ -8222,28 +10212,33 @@ define _print_images_for_dyld_image_info set $kgm_image_info_i = $kgm_image_info_i + 1 end + + # $kgm_dyld_load_path may get set when the main executable is processed + # printf "[dyld] = image address %llx path address %llx\n", $kgm_dyld_load_address, $kgm_dyld_load_path + _print_image_info $kgm_dyld_load_address $kgm_dyld_load_path + end define showuserlibraries - set $kgm_taskp = (task_t)$arg0 - set $kgm_dyld_image_info = $kgm_taskp->all_image_info_addr + set $kgm_taskp = (task_t)$arg0 + set $kgm_dyld_image_info = $kgm_taskp->all_image_info_addr - set $kgm_map = $kgm_taskp->map - set $kgm_task_64 = ( $kgm_taskp->taskFeatures[0] & 0x80000000) + set $kgm_map = $kgm_taskp->map + set $kgm_task_64 = ( $kgm_taskp->taskFeatures[0] & 0x80000000) - if ($kgm_dyld_image_info != 0) - printf "address " - if $kgm_task_64 - printf " " - end - printf " type " - printf " uuid " - printf "path\n" + if ($kgm_dyld_image_info != 0) + printf "address " + if $kgm_task_64 + printf " " + end + printf " type " + printf " uuid " + printf "path\n" - _print_images_for_dyld_image_info $kgm_taskp $kgm_task_64 $kgm_dyld_image_info - else - printf "No dyld shared library information available for task\n" - end + _print_images_for_dyld_image_info $kgm_taskp $kgm_task_64 $kgm_dyld_image_info + else + printf "No dyld shared library information available for task\n" + end end document showuserlibraries Syntax: (gdb) showuserlibraries @@ 
-8251,6 +10246,191 @@ Syntax: (gdb) showuserlibraries | information about all Mach-O images. end +define showuserdyldinfo + set $kgm_taskp = (task_t)$arg0 + set $kgm_dyld_all_image_infos_address = (unsigned long long)$kgm_taskp->all_image_info_addr + + set $kgm_map = $kgm_taskp->map + set $kgm_task_64 = ( $kgm_taskp->taskFeatures[0] & 0x80000000) + + if ($kgm_dyld_all_image_infos_address != 0) + + _map_user_data_from_task $kgm_taskp $kgm_dyld_all_image_infos_address 112 + + set $kgm_dyld_all_image_infos = (unsigned char *)$kgm_map_user_window + set $kgm_dyld_all_image_infos_version = ((unsigned int *)$kgm_dyld_all_image_infos)[0] + if ($kgm_dyld_all_image_infos_version > 12) + printf "Unknown dyld all_image_infos version number %d\n", $kgm_dyld_all_image_infos_version + end + + # Find fields by byte offset. We assume at least version 9 is supported + if $kgm_task_64 + set $kgm_dyld_all_image_infos_infoArrayCount = *(unsigned int *)(&$kgm_dyld_all_image_infos[4]) + set $kgm_dyld_all_image_infos_infoArray = *(unsigned long long *)(&$kgm_dyld_all_image_infos[8]) + set $kgm_dyld_all_image_infos_notification = *(unsigned long long *)(&$kgm_dyld_all_image_infos[16]) + set $kgm_dyld_all_image_infos_processDetachedFromSharedRegion = *(unsigned char *)(&$kgm_dyld_all_image_infos[24]) + set $kgm_dyld_all_image_infos_libSystemInitialized = *(unsigned char *)(&$kgm_dyld_all_image_infos[25]) + set $kgm_dyld_all_image_infos_dyldImageLoadAddress = *(unsigned long long *)(&$kgm_dyld_all_image_infos[32]) + set $kgm_dyld_all_image_infos_jitInfo = *(unsigned long long *)(&$kgm_dyld_all_image_infos[40]) + set $kgm_dyld_all_image_infos_dyldVersion = *(unsigned long long *)(&$kgm_dyld_all_image_infos[48]) + set $kgm_dyld_all_image_infos_errorMessage = *(unsigned long long *)(&$kgm_dyld_all_image_infos[56]) + set $kgm_dyld_all_image_infos_terminationFlags = *(unsigned long long *)(&$kgm_dyld_all_image_infos[64]) + set $kgm_dyld_all_image_infos_coreSymbolicationShmPage = *(unsigned long 
long *)(&$kgm_dyld_all_image_infos[72]) + set $kgm_dyld_all_image_infos_systemOrderFlag = *(unsigned long long *)(&$kgm_dyld_all_image_infos[80]) + set $kgm_dyld_all_image_infos_uuidArrayCount = *(unsigned long long *)(&$kgm_dyld_all_image_infos[88]) + set $kgm_dyld_all_image_infos_uuidArray = *(unsigned long long *)(&$kgm_dyld_all_image_infos[96]) + set $kgm_dyld_all_image_infos_dyldAllImageInfosAddress = *(unsigned long long *)(&$kgm_dyld_all_image_infos[104]) + else + set $kgm_dyld_all_image_infos_infoArrayCount = *(unsigned int *)(&$kgm_dyld_all_image_infos[4]) + set $kgm_dyld_all_image_infos_infoArray = *(unsigned int *)(&$kgm_dyld_all_image_infos[8]) + set $kgm_dyld_all_image_infos_notification = *(unsigned int *)(&$kgm_dyld_all_image_infos[12]) + set $kgm_dyld_all_image_infos_processDetachedFromSharedRegion = *(unsigned char *)(&$kgm_dyld_all_image_infos[16]) + set $kgm_dyld_all_image_infos_libSystemInitialized = *(unsigned char *)(&$kgm_dyld_all_image_infos[17]) + set $kgm_dyld_all_image_infos_dyldImageLoadAddress = *(unsigned int *)(&$kgm_dyld_all_image_infos[20]) + set $kgm_dyld_all_image_infos_jitInfo = *(unsigned int *)(&$kgm_dyld_all_image_infos[24]) + set $kgm_dyld_all_image_infos_dyldVersion = *(unsigned int *)(&$kgm_dyld_all_image_infos[28]) + set $kgm_dyld_all_image_infos_errorMessage = *(unsigned int *)(&$kgm_dyld_all_image_infos[32]) + set $kgm_dyld_all_image_infos_terminationFlags = *(unsigned int *)(&$kgm_dyld_all_image_infos[36]) + set $kgm_dyld_all_image_infos_coreSymbolicationShmPage = *(unsigned int *)(&$kgm_dyld_all_image_infos[40]) + set $kgm_dyld_all_image_infos_systemOrderFlag = *(unsigned int *)(&$kgm_dyld_all_image_infos[44]) + set $kgm_dyld_all_image_infos_uuidArrayCount = *(unsigned int *)(&$kgm_dyld_all_image_infos[48]) + set $kgm_dyld_all_image_infos_uuidArray = *(unsigned int *)(&$kgm_dyld_all_image_infos[52]) + set $kgm_dyld_all_image_infos_dyldAllImageInfosAddress = *(unsigned int *)(&$kgm_dyld_all_image_infos[56]) + end + + 
_unmap_user_data_from_task $kgm_taskp + + printf " version %u\n", $kgm_dyld_all_image_infos_version + printf " infoArrayCount %u\n", $kgm_dyld_all_image_infos_infoArrayCount + printf " infoArray " + showuserptr $kgm_dyld_all_image_infos_infoArray + printf "\n" + printf " notification " + showuserptr $kgm_dyld_all_image_infos_notification + printf "\n" + printf "processDetachedFromSharedRegion %d\n", $kgm_dyld_all_image_infos_processDetachedFromSharedRegion + printf " libSystemInitialized %d\n", $kgm_dyld_all_image_infos_libSystemInitialized + printf " dyldImageLoadAddress " + showuserptr $kgm_dyld_all_image_infos_dyldImageLoadAddress + printf "\n" + printf " jitInfo " + showuserptr $kgm_dyld_all_image_infos_jitInfo + printf "\n" + printf " dyldVersion " + showuserptr $kgm_dyld_all_image_infos_dyldVersion + printf "\n" + printf " " + _print_path_for_image $kgm_dyld_all_image_infos_dyldVersion + printf "\n" + + printf " errorMessage " + showuserptr $kgm_dyld_all_image_infos_errorMessage + printf "\n" + if $kgm_dyld_all_image_infos_errorMessage != 0 + printf " " + _print_path_for_image $kgm_dyld_all_image_infos_errorMessage + printf "\n" + end + + printf " terminationFlags " + showuserptr $kgm_dyld_all_image_infos_terminationFlags + printf "\n" + printf " coreSymbolicationShmPage " + showuserptr $kgm_dyld_all_image_infos_coreSymbolicationShmPage + printf "\n" + printf " systemOrderFlag " + showuserptr $kgm_dyld_all_image_infos_systemOrderFlag + printf "\n" + printf " uuidArrayCount " + showuserptr $kgm_dyld_all_image_infos_uuidArrayCount + printf "\n" + printf " uuidArray " + showuserptr $kgm_dyld_all_image_infos_uuidArray + printf "\n" + printf " dyldAllImageInfosAddress " + showuserptr $kgm_dyld_all_image_infos_dyldAllImageInfosAddress + printf "\n" + printf " (currently " + showuserptr $kgm_dyld_all_image_infos_address + printf ")\n" + + if $kgm_task_64 + set $kgm_dyld_all_image_infos_address = $kgm_dyld_all_image_infos_address + 112 + _map_user_data_from_task 
$kgm_taskp $kgm_dyld_all_image_infos_address 64 + set $kgm_dyld_all_image_infos_v10 = (unsigned char *)$kgm_map_user_window + set $kgm_dyld_all_image_infos_initialImageCount = *(unsigned long long *)(&$kgm_dyld_all_image_infos_v10[112-112]) + set $kgm_dyld_all_image_infos_errorKind = *(unsigned long long *)(&$kgm_dyld_all_image_infos_v10[120-112]) + set $kgm_dyld_all_image_infos_errorClientOfDylibPath = *(unsigned long long *)(&$kgm_dyld_all_image_infos_v10[128-112]) + set $kgm_dyld_all_image_infos_errorTargetDylibPath = *(unsigned long long *)(&$kgm_dyld_all_image_infos_v10[136-112]) + set $kgm_dyld_all_image_infos_errorSymbol = *(unsigned long long *)(&$kgm_dyld_all_image_infos_v10[144-112]) + set $kgm_dyld_all_image_infos_sharedCacheSlide = *(unsigned long long *)(&$kgm_dyld_all_image_infos_v10[152-112]) + + _unmap_user_data_from_task $kgm_taskp + else + set $kgm_dyld_all_image_infos_address = $kgm_dyld_all_image_infos_address + 60 + _map_user_data_from_task $kgm_taskp $kgm_dyld_all_image_infos_address 64 + set $kgm_dyld_all_image_infos_v10 = (unsigned char *)$kgm_map_user_window + set $kgm_dyld_all_image_infos_initialImageCount = *(unsigned int *)(&$kgm_dyld_all_image_infos_v10[60-60]) + set $kgm_dyld_all_image_infos_errorKind = *(unsigned int *)(&$kgm_dyld_all_image_infos_v10[64-60]) + set $kgm_dyld_all_image_infos_errorClientOfDylibPath = *(unsigned int *)(&$kgm_dyld_all_image_infos_v10[68-60]) + set $kgm_dyld_all_image_infos_errorTargetDylibPath = *(unsigned int *)(&$kgm_dyld_all_image_infos_v10[72-60]) + set $kgm_dyld_all_image_infos_errorSymbol = *(unsigned int *)(&$kgm_dyld_all_image_infos_v10[76-60]) + set $kgm_dyld_all_image_infos_sharedCacheSlide = *(unsigned int *)(&$kgm_dyld_all_image_infos_v10[80-60]) + _unmap_user_data_from_task $kgm_taskp + end + + if $kgm_dyld_all_image_infos_version >= 10 + printf " initialImageCount " + showuserptr $kgm_dyld_all_image_infos_initialImageCount + printf "\n" + end + + if $kgm_dyld_all_image_infos_version >= 11 + 
printf " errorKind " + showuserptr $kgm_dyld_all_image_infos_errorKind + printf "\n" + printf " errorClientOfDylibPath " + showuserptr $kgm_dyld_all_image_infos_errorClientOfDylibPath + printf "\n" + if $kgm_dyld_all_image_infos_errorClientOfDylibPath != 0 + printf " " + _print_path_for_image $kgm_dyld_all_image_infos_errorClientOfDylibPath + printf "\n" + end + printf " errorTargetDylibPath " + showuserptr $kgm_dyld_all_image_infos_errorTargetDylibPath + printf "\n" + if $kgm_dyld_all_image_infos_errorTargetDylibPath != 0 + printf " " + _print_path_for_image $kgm_dyld_all_image_infos_errorTargetDylibPath + printf "\n" + end + printf " errorSymbol " + showuserptr $kgm_dyld_all_image_infos_errorSymbol + printf "\n" + if $kgm_dyld_all_image_infos_errorSymbol != 0 + printf " " + _print_path_for_image $kgm_dyld_all_image_infos_errorSymbol + printf "\n" + end + end + + if $kgm_dyld_all_image_infos_version >= 12 + printf " sharedCacheSlide " + showuserptr $kgm_dyld_all_image_infos_sharedCacheSlide + printf "\n" + end + + else + printf "No dyld information available for task\n" + end +end +document showuserdyldinfo +Syntax: (gdb) showuserdyldinfo +| For a given user task, inspect the dyld global info and print +| out all fields, including error messages. 
+end + define showkerneldebugheader printf "kd_buf " showptrhdrpad @@ -8546,8 +10726,7 @@ define showkerneldebugbuffercpu set $kgm_entry_count = (int) $arg1 set $kgm_debugentriesfound = 0 - #if kdebug_flags & KDBG_BFINIT - if (kdebug_flags & 0x80000000) + if (kdebug_flags & 0x80000000) # 0x80000000 == KDBG_BFINIT showkerneldebugheader if $kgm_entry_count == 0 @@ -8584,8 +10763,7 @@ end define showkerneldebugbuffer - #if kdebug_flags & KDBG_BFINIT - if (kdebug_flags & 0x80000000) + if (kdebug_flags & 0x80000000) # 0x80000000 == KDBG_BFINIT set $kgm_entrycount = (int) $arg0 @@ -8629,19 +10807,30 @@ Syntax: showallvmstats | prints a summary of vm statistics in a table format end +define memstats + if ($kgm_mtype == $kgm_mtype_arm) + printf "kern_memorystatus_level: %8d\n", kern_memorystatus_level + end + printf "vm_page_throttled_count: %8d\n", vm_page_throttled_count + printf "vm_page_active_count: %8d\n", vm_page_active_count + printf "vm_page_inactive_count: %8d\n", vm_page_inactive_count + printf "vm_page_wire_count: %8d\n", vm_page_wire_count + printf "vm_page_free_count: %8d\n", vm_page_free_count + printf "vm_page_purgeable_count: %8d\n", vm_page_purgeable_count + printf "vm_page_inactive_target: %8d\n", vm_page_inactive_target + printf "vm_page_free_target: %8d\n", vm_page_free_target + printf "inuse_ptepages_count: %8d\n", inuse_ptepages_count + printf "vm_page_free_reserved: %8d\n", vm_page_free_reserved +end + +document memstats +Syntax: (gdb) memstats +| Prints out a summary of various memory statistics. In particular vm_page_wire_count should +| be greater than 2K or you are under memory pressure. 
+end + define show_user_registers - if (($kgm_mtype & $kgm_mtype_x86_mask) == $kgm_mtype_x86_any) - set $kgm_thread = (thread_t)$arg0 - if ((*(thread_t)$kgm_thread)->machine.xxx_pcb.iss.flavor == 15) - p/x ($kgm_thread)->machine.xxx_pcb.iss->uss.ss_64 - else - p/x ($kgm_thread)->machine.xxx_pcb.iss->uss.ss_32 - end - end - if ($kgm_mtype == $kgm_mtype_ppc) - set $kgm_thread = (thread_t)$arg0 - p/x *($kgm_thread)->machine.pcb - end + showuserregisters $arg0 end document show_user_registers @@ -8786,6 +10975,35 @@ Syntax: strcmp_nomalloc [b] [c] [d] [e] [f] [g] [h] [i] | strcmp_nomalloc version $kgm_strcmp_arg end +define memcpy + set $kgm_dst = (unsigned char *)$arg0 + set $kgm_src = (unsigned char *)$arg1 + set $kgm_count = $arg2 + + # printf "src %p dst %p len %d\n", $kgm_src, $kgm_dst, $kgm_count + + while ($kgm_count >= 8) + set *(unsigned long long *)$kgm_dst = *(unsigned long long *)$kgm_src + + set $kgm_dst = $kgm_dst + 8 + set $kgm_src = $kgm_src + 8 + set $kgm_count = $kgm_count - 8 + end + while ($kgm_count > 0) + set *$kgm_dst = *$kgm_src + + set $kgm_dst = $kgm_dst + 1 + set $kgm_src = $kgm_src + 1 + set $kgm_count = $kgm_count - 1 + end +end + +document memcpy +Syntax: memcpy +| Given two addresses that are accessible by the debugger, perform +| a memory copy of bytes from to +end + # _pci_cfg_addr_value $addr $size define _pci_cfg_addr_value readphysint $arg0 $arg1 $kgm_lcpu_self @@ -8825,7 +11043,7 @@ define _pci_cfg_init end end - # if the above fails, search for 0:0:0 in likely places. + # search for 0:0:0 in likely places if the above fails if $kgm_pci_cfg_init == 0 set $kgm_pci_cfg_base = 0xF0000000 while $kgm_pci_cfg_init == 0 && $kgm_pci_cfg_base > 0xA0000000 @@ -10125,6 +12343,31 @@ Syntax: (gdb) showeventsourceobject | Routine to display information about an IOEventSource subclass. 
end +define showworkloopallocator + set $kgm_workloop = (struct IOWorkLoop*)$arg0 + set $kgm_bt = (void**)$kgm_workloop->reserved->allocationBacktrace + set $kgm_bt_count = 0 + while $kgm_bt_count != (sizeof(IOWorkLoop::ExpansionData.allocationBacktrace) / sizeof(IOWorkLoop::ExpansionData.allocationBacktrace[0])) + set $kgm_frame_address = (void*)$kgm_bt[$kgm_bt_count] + if $kgm_frame_address != 0 + if (((unsigned long) $kgm_frame_address < (unsigned long) &_mh_execute_header || \ + (unsigned long) $kgm_frame_address >= (unsigned long) &last_kernel_symbol ) \ + && ($kgm_show_kmod_syms == 0)) + showkmodaddr $kgm_frame_address + else + output /a $kgm_frame_address + end + printf "\n" + end + set $kgm_bt_count = $kgm_bt_count + 1 + end +end +document showworkloopallocator +Syntax: (gdb) showworkloopallocator +| Routine to display the backtrace of the thread which allocated the workloop in question. Only +| valid on DEBUG kernels. +end + define showworkloopeventsources set $kgm_eventsource = (struct IOEventSource*)$arg0 while $kgm_eventsource != 0 @@ -10204,10 +12447,27 @@ define showworkloop end printf "\t\t" set $kgm_gateLock = ( struct _IORecursiveLock *)$kgm_workloop->gateLock - set $kgm_lockGroup = (struct _lck_grp_*)($kgm_gateLock->group) - printf "%s", $kgm_lockGroup->lck_grp_name - printf "\n" - showworkloopeventsources $kgm_workloop->eventChain + if $kgm_gateLock != 0 + set $kgm_lockGroup = (struct _lck_grp_*)($kgm_gateLock->group) + printf "%s", $kgm_lockGroup->lck_grp_name + else + printf "No WorkLoop Lock found" + end + printf "\n\n" + + #Allocation backtrace is only valid on DEBUG kernels. 
+ #printf "Allocation path:\n\n" + #showworkloopallocator $kgm_workloop + #printf "\n\n" + + if $kgm_workloop->eventChain != 0 + printf "Active event sources:\n\n" + showworkloopeventsources $kgm_workloop->eventChain + end + if $kgm_workloop->reserved->passiveEventChain != 0 + printf "Passive event sources:\n" + showworkloopeventsources $kgm_workloop->reserved->passiveEventChain + end end document showworkloop Syntax: (gdb) showworkloop @@ -10293,7 +12553,7 @@ Syntax: showthreadfortid |corresponding to a given thread_id. end -define showtaskbusyports +define showtaskbusyportsint set $kgm_isp = ((task_t)$arg0)->itk_space set $kgm_iindex = 0 while ( $kgm_iindex < $kgm_isp->is_table_size ) @@ -10308,6 +12568,10 @@ define showtaskbusyports end end +define showtaskbusyports + showtaskbusyportsint $arg0 +end + document showtaskbusyports Syntax: showtaskbusyports |Routine to print information about receive rights belonging to this task that @@ -10318,7 +12582,7 @@ define showallbusyports set $kgm_head_taskp = &tasks set $kgm_cur_taskp = (struct task *)($kgm_head_taskp->next) while $kgm_cur_taskp != $kgm_head_taskp - showtaskbusyports $kgm_cur_taskp + showtaskbusyportsint $kgm_cur_taskp set $kgm_cur_taskp = (struct task *)($kgm_cur_taskp->tasks.next) end end @@ -10329,16 +12593,689 @@ Syntax: showallbusyports |have enqueued messages. 
end -define kdp-connect - if $argc > 0 - kdp-reattach $arg0 +define showallproviders + set $kgm_providerp = dtrace_provider + while $kgm_providerp + p *(dtrace_provider_t *)$kgm_providerp + printf "\n" + set $kgm_providerp = (dtrace_provider_t *)($kgm_providerp->dtpv_next) + end +end + +document showallproviders +Syntax: showallproviders +| Display summary listing of all dtrace_providers +end + +define showmodctlheader + printf "modctl " + showptrhdrpad + printf " stale " + showptrhdrpad + printf " symbols " + showptrhdrpad + printf " address " + showptrhdrpad + printf " size " + showptrhdrpad + printf " loadid loaded nenabled flags name\n" +end + +define showmodctlint + set $kgm_modctlp = (struct modctl *)$arg0 + showptr $kgm_modctlp + printf " " + showptr $kgm_modctlp->mod_stale + printf " " + showptr $kgm_modctlp->mod_user_symbols + printf " " + showptr $kgm_modctlp->mod_address + printf " " + showptr $kgm_modctlp->mod_size + printf " " + printf "%6d ", $kgm_modctlp->mod_loadcnt + printf "%6d ", $kgm_modctlp->mod_loaded + printf "%6d ", $kgm_modctlp->mod_nenabled + printf " 0x%x ", $kgm_modctlp->mod_flags + printf "%s\n", $kgm_modctlp->mod_modname +end + +define showmodctl + showmodctlheader + showmodctlint $arg0 +end +document showmodctl +Syntax: (gdb) showmodctl +| Display info about a dtrace modctl +end + +define showallmodctls + showmodctlheader + set $kgm_modctlp = (struct modctl *)dtrace_modctl_list + while $kgm_modctlp + showmodctlint $kgm_modctlp + set $kgm_modctlp = $kgm_modctlp->mod_next + end +end +document showallmodctls +Syntax: (gdb) showallmodctls +| Display summary listing of all dtrace modctls +end + +define showfbtprobe + printf "Be very patient, this traverses a large list \n" + set $kgm_indx = 0 + set $kgm_found = 0 + set $kgm_depth = 0 + while $kgm_indx < fbt_probetab_size && !$kgm_found + set $kgm_fbt_probep = (struct fbt_probe *)fbt_probetab[$kgm_indx] + set $kgm_depth = 0 + if $kgm_fbt_probep + set $kgm_probeid = (struct fbt_probe 
*)$kgm_fbt_probep->fbtp_id + if $kgm_probeid == $arg0 + set $kgm_found = 1 + loop_break + else + set $kgm_fbt_probep = $kgm_fbt_probep->fbtp_hashnext + while $kgm_fbt_probep + set $kgm_depth++ + set $kgm_probeid = (struct fbt_probe *)$kgm_fbt_probep->fbtp_id + if $kgm_probeid == $arg0 + set $kgm_found = 1 + loop_break + else + set $kgm_fbt_probep = $kgm_fbt_probep->fbtp_hashnext + end + end + end + end + if !$kgm_found + set $kgm_indx++ else - printf "Attempting to attach to localhost...\n" - kdp-reattach localhost + printf "fbt_probetab[index=%d], depth=%d, 0x%x\n", $kgm_indx, $kgm_depth, $kgm_fbt_probep + printf "(gdb) p *(struct fbt_probe *)0x%x\n", $kgm_fbt_probep + p *(struct fbt_probe *)$kgm_fbt_probep + set $kgm_fbtp_ctl = (struct fbt_probe *)$kgm_fbt_probep->fbtp_ctl + showmodctl $kgm_fbtp_ctl + loop_break + end + end +end +document showfbtprobe +Syntax: (gdb) showfbtprobe +| Display info about an fbt probe given an id. +| Traverses fbt_probetab and matches with fbtp_id. +| The is found using dtrace -l +end + +define showzstacktrace + set $kgm_trace = (void*)$arg0 + if ($argc == 1) + set $kgm_trace_size = 15 + end + if ($argc == 2) + set $kgm_trace_size = $arg1 + end + set $kgm_trace_current = 0 + while ($kgm_trace_current < $kgm_trace_size) + set $kgm_trace_addr = (void**)$kgm_trace + $kgm_trace_current + set $kgm_trace_value = *((void**)$kgm_trace_addr) + #printf "\t\t" + output /a $kgm_trace_value + set $kgm_trace_current = $kgm_trace_current + 1 + printf "\n" + end +end + +document showzstacktrace +Syntax: showzstacktrace [size] +| Routine to print a stacktrace stored by OSBacktrace. +| size is optional, defaults to 15. +end + +define showzalloc + set $kgm_zallocation = zallocations[$arg0] + print $kgm_zallocation + showztrace $kgm_zallocation->za_trace_index +end + +document showzalloc +Syntax: showzalloc +| Prints a zallocation from the zallocations array based off its index, +| and prints the associated symbolicated backtrace. 
+end + +define showztrace + set $kgm_ztrace = &ztraces[$arg0] + showztraceaddr $kgm_ztrace +end + +document showztrace +Syntax: showztrace +| Prints the backtrace from the ztraces array at index +end + +define showztraceaddr + print *$arg0 + showzstacktrace $arg0->zt_stack ($arg0)->zt_depth +end + +document showztraceaddr +Syntax: showztraceaddr +| Prints the struct ztrace passed in +end + +#TODO: Iterate through the hash table, or make top_ztrace accurate in the face of deallocations (better idea). +define showtopztrace + set $kgm_top_ztrace = top_ztrace + printf "Index: %d\n", (top_ztrace - ztraces) + showztraceaddr $kgm_top_ztrace +end + +document showtopztrace +Syntax: showtopztrace +| Shows the ztrace with the biggest size. (according to top_ztrace, not by iterating through the hash table) +end + +define showzallocs + set $kgm_zallocation_current_index = 0 + set $kgm_zallocations_count = 0 + set $kgm_max_zallocation = zleak_alloc_buckets + printf "INDEX ADDRESS " + if $kgm_lp64 + printf " " + end + printf "TRACE SIZE\n" + while ($kgm_zallocation_current_index < $kgm_max_zallocation) + set $kgm_zallocation_current = zallocations[$kgm_zallocation_current_index] + if ($kgm_zallocation_current->element != 0) + printf "%5d %p ", $kgm_zallocation_current_index, $kgm_zallocation_current->za_element + printf "%5d %6lu\n", $kgm_zallocation_current->za_trace_index, $kgm_zallocation_current->za_size + set $kgm_zallocations_count = $kgm_zallocations_count + 1 + end + set $kgm_zallocation_current_index = $kgm_zallocation_current_index + 1 + end + printf "Total allocations: %d\n", $kgm_zallocations_count +end + +document showzallocs +Syntax: showzallocs +| Prints all allocations in the zallocations table +end + +define showzallocsfortrace + set $kgm_zallocation_current_index = 0 + set $kgm_zallocations_count = 0 + set $kgm_max_zallocation = zleak_alloc_buckets + printf "INDEX ADDRESS " + if $kgm_lp64 + printf " " + end + printf "SIZE\n" + while 
($kgm_zallocation_current_index < $kgm_max_zallocation) + set $kgm_zallocation_current = zallocations[$kgm_zallocation_current_index] + if ($kgm_zallocation_current->element != 0 && $kgm_zallocation_current->za_trace_index == $arg0) + printf "%5d %p ", $kgm_zallocation_current_index, $kgm_zallocation_current->za_element + printf "%6lu\n", $kgm_zallocation_current->size + set $kgm_zallocations_count = $kgm_zallocations_count + 1 + end + set $kgm_zallocation_current_index = $kgm_zallocation_current_index + 1 + end + printf "Total allocations: %d\n", $kgm_zallocations_count +end + +document showzallocsfortrace +Syntax: showzallocsfortrace +| Prints all allocations pointing to the passed in trace's index into ztraces by looking through zallocations table +end + +define showztraces + showztracesabove 0 +end + +document showztraces +Syntax: showztraces +| Prints all traces with size > 0 +end + +define showztracesabove + set $kgm_ztrace_current_index = 0 + set $kgm_ztrace_count = 0 + set $kgm_max_ztrace = zleak_trace_buckets + printf "INDEX SIZE\n" + while ($kgm_ztrace_current_index < $kgm_max_ztrace) + set $kgm_ztrace_current = ztraces[$kgm_ztrace_current_index] + if ($kgm_ztrace_current->zt_size > $arg0) + printf "%5d %6lu\n", $kgm_ztrace_current_index, $kgm_ztrace_current->zt_size + set $kgm_ztrace_count = $kgm_ztrace_count + 1 + end + set $kgm_ztrace_current_index = $kgm_ztrace_current_index + 1 + end + printf "Total traces: %d\n", $kgm_ztrace_count +end + +document showztracesabove +Syntax: showztracesabove +| Prints all traces with size greater than X +end + +define showztracehistogram + set $kgm_ztrace_current_index = 0 + set $kgm_ztrace_count = 0 + set $kgm_max_ztrace = zleak_trace_buckets + printf "INDEX HIT_COUNT COLLISIONS\n" + while ($kgm_ztrace_current_index < $kgm_max_ztrace) + set $kgm_ztrace_current = ztraces[$kgm_ztrace_current_index] + if ($kgm_ztrace_current->zt_hit_count != 0) + printf "%5d %5d %5d\n", $kgm_ztrace_current_index, 
$kgm_ztrace_current->zt_hit_count, $kgm_ztrace_current->zt_collisions + set $kgm_ztrace_count = $kgm_ztrace_count + 1 + end + set $kgm_ztrace_current_index = $kgm_ztrace_current_index + 1 + end + printf "Total traces: %d\n", $kgm_ztrace_count +end + +document showztracehistogram +Syntax: showztracehistogram +| Prints the histogram of the ztrace table +end + +define showzallochistogram + set $kgm_zallocation_current_index = 0 + set $kgm_zallocations_count = 0 + set $kgm_max_zallocation = zleak_alloc_buckets + printf "INDEX HIT_COUNT\n" + while ($kgm_zallocation_current_index < $kgm_max_zallocation) + set $kgm_zallocation_current = zallocations[$kgm_zallocation_current_index] + if ($kgm_zallocation_current->za_hit_count != 0) + printf "%5d %5d\n", $kgm_zallocation_current_index, $kgm_zallocation_current->za_hit_count + set $kgm_zallocations_count = $kgm_zallocations_count + 1 + end + set $kgm_zallocation_current_index = $kgm_zallocation_current_index + 1 + end + printf "Total allocations: %d\n", $kgm_zallocations_count +end + +document showzallochistogram +Syntax: showzallochistogram +| Prints the histogram for the zalloc table +end + +define showzstats + printf "z_alloc_collisions: %u, z_trace_collisions: %u\n", z_alloc_collisions, z_trace_collisions + printf "z_alloc_overwrites: %u, z_trace_overwrites: %u\n", z_alloc_overwrites, z_trace_overwrites + printf "z_alloc_recorded: %u, z_trace_recorded: %u\n", z_alloc_recorded, z_trace_recorded +end + +document showzstats +Syntax: showzstats +| Prints the zone leak detection stats +end + + +set $kgm_au_sentry_hash_table_size = 97 + +define showsession1 + set $p = (struct au_sentry *)$arg0 + showptr $p + printf " 0x%08x 0x%08x 0x%016x", $p->se_auinfo.ai_asid, $p->se_auinfo.ai_auid, $p->se_auinfo.ai_flags + printf " %3ld %3ld", $p->se_refcnt, $p->se_procnt + printf "\n" +end + +define showsessionhdr + printf "au_sentry " + showptrhdrpad + printf " ASID AUID FLAGS C P\n" +end + +define showsession + showsessionhdr + 
showsession1 $arg0 +end + +document showsession +Syntax: showsession +| Display info about a specified audit session +end + +define showallsessions + showsessionhdr + set $kgm_au_sentry_hash_table = au_sentry_bucket + set $i = $kgm_au_sentry_hash_table_size - 1 + while $i >= 0 + set $p = $kgm_au_sentry_hash_table[$i].lh_first + while $p != 0 + showsession1 $p + set $p = $p->se_link.le_next + end + set $i = $i - 1 + end +end + +document showallsessions +Syntax: showallsessions +| Prints the audit sessions in the global hash table +end + +define showauhistorystack + set $ii = $arg0 + set $pp = (void **)$arg1 + while $ii > 0 + printf " " + x/i $pp[$ii-1] + set $ii = $ii - 1 + end +end + +define showauhistory1 + set $p = (struct au_history *)$arg0 + set $stack_depth = $p->stack_depth + set $stack = $p->stack + showptr $p->ptr + if $p->event == 1 + printf " REF" + end + if $p->event == 2 + printf " UNREF" + end + if $p->event == 3 + printf " BIRTH" + end + if $p->event == 4 + printf " DEATH" + end + if $p->event == 5 + printf " FIND" + end + set $p = &$p->se + printf " 0x%08x 0x%08x 0x%016x", $p->se_auinfo.ai_asid, $p->se_auinfo.ai_auid, $p->se_auinfo.ai_flags + printf " %3ld %3ld", $p->se_refcnt, $p->se_procnt + printf "\n" + showauhistorystack $stack_depth $stack +end + +define showauhistory + set $i = (au_history_index-1) % au_history_size + if au_history_index >= au_history_size + set $n = au_history_size + else + set $n = au_history_index + end + while $n > 0 + if au_history[$i].ptr != 0 && (0 == $arg0 || au_history[$i].ptr == $arg0) + printf "[% 4d] ", $i + showauhistory1 &au_history[$i] + end + set $n = $n - 1 + set $i = ($i - 1) % au_history_size + end +end + +define showallauhistory + showauhistory 0 +end + +define showkwqheader + printf " kwq " + showptrhdrpad + printf " kwqaddr " + showptrhdrpad + printf " inqueue fakecount highseq lowseq flags lastunlock p_rwwc" + printf "\n " +end + +define showkwqint + printf " " + set $kgm_kwq = (ksyn_wait_queue_t)$arg0 + 
showptr $kgm_kwq + printf " " + showptr $kgm_kwq->kw_addr + printf " " + printf " %d ", $kgm_kwq->kw_inqueue + printf " %d ", $kgm_kwq->kw_fakecount + printf " 0x%x ", $kgm_kwq->kw_highseq + printf " 0x%x ", $kgm_kwq->kw_lowseq + printf " 0x%x ", $kgm_kwq->kw_flags + printf " 0x%x ", $kgm_kwq->kw_lastunlockseq + printf " 0x%x ", $kgm_kwq->kw_pre_rwwc + printf "\n" +end + +define show_kwq + showkwqheader + showkwqint $arg0 +end + +document show_kwq +Syntax: (gdb) show_kwq +| Display info about one ksyn_wait_queue +end + +# Internal routine used by "showpthread_mutex" to abstract possible loads from +# user space +define _loadfrommutex + if (kdp_pmap == 0) + set $kgm_loadval = *(uintptr_t *)$arg0 + else + if ($kgm_x86_abi == 0xe) + set $kgm_loadval = *(uint32_t *)$arg0 + else + if ($kgm_x86_abi == 0xf) + if ($kgm_mtype == $kgm_mtype_i386) + _loadk32m64 $arg0 + set $kgm_loadval = $kgm_k32read64 + else + set $kgm_loadval = *(uint32_t *)$arg0 + end + end + end +end +end + +define show_pthreadmutex + set $newact = (struct thread *) $arg0 + set $ourtask = (struct task *)($newact->task) + set $our_user_is64 = ($ourtask->taskFeatures[0] & 0x80000000) + _kgm_flush_loop + set $mutex = (void *)$arg1 + set kdp_pmap = $newact->task->map->pmap + _kgm_flush_loop + _kgm_update_loop + set $newiss = (x86_saved_state_t *) ($newact->machine.pcb->iss) + set $kgm_x86_abi = $newiss.flavor + if ($our_user_is64 != 0) + printf "\tUser 64Bit\n " + printf "\tSignature: " + set $nextval = $mutex + _loadfrommutex $nextval + printf "0x%x\n",$kgm_loadval + printf "\tflags: " + set $nextval = $mutex + 12 + _loadfrommutex $nextval + printf "0x%x\n",$kgm_loadval + printf "\tSeqs: " + set $nextval = $mutex + 20 + _loadfrommutex $nextval + printf "0x%x ",$kgm_loadval + set $nextval = $mutex + 24 + _loadfrommutex $nextval + printf "0x%x ",$kgm_loadval + set $nextval = $mutex + 28 + _loadfrommutex $nextval + printf "0x%x\n",$kgm_loadval + printf "\ttid[0]: " + set $nextval = $mutex + 32 + _loadfrommutex 
$nextval + printf "0x%x\n",$kgm_loadval + printf "\ttid[1]: " + set $nextval = $mutex + 36 + _loadfrommutex $nextval + printf "0x%x\n",$kgm_loadval + else + printf "\tUser 32Bit\n " + printf "\tSignature: " + set $nextval = $mutex + _loadfrommutex $nextval + printf "0x%x\n",$kgm_loadval + printf "\tflags: " + set $nextval = $mutex + 8 + _loadfrommutex $nextval + printf "0x%x\n",$kgm_loadval + printf "\tSeqs: " + set $nextval = $mutex + 16 + _loadfrommutex $nextval + printf "0x%x ",$kgm_loadval + set $nextval = $mutex + 20 + _loadfrommutex $nextval + printf "0x%x ",$kgm_loadval + set $nextval = $mutex + 24 + _loadfrommutex $nextval + printf "0x%x\n",$kgm_loadval + printf "\ttid[0]: " + set $nextval = $mutex + 32 + _loadfrommutex $nextval + printf "0x%x\n",$kgm_loadval + printf "\ttid[1]: " + set $nextval = $mutex + 36 + _loadfrommutex $nextval + printf "0x%x\n",$kgm_loadval + end + printf "\n" + resetstacks +end + + +document show_pthreadmutex +Syntax: (gdb) show_pthreadmutex +| Display the mutex contents from userspace. 
+end + + +define show_pthreadcondition + set $newact = (struct thread *) $arg0 + set $ourtask = (struct task *)($newact->task) + set $our_user_is64 = ($ourtask->taskFeatures[0] & 0x80000000) + _kgm_flush_loop + set $cond = (void *)$arg1 + set kdp_pmap = $newact->task->map->pmap + _kgm_flush_loop + _kgm_update_loop + set $newiss = (x86_saved_state_t *) ($newact->machine.pcb->iss) + set $kgm_x86_abi = $newiss.flavor + if ($our_user_is64 != 0) + printf "\tUser 64Bit\n " + printf "\tSignature: " + set $nextval = $cond + _loadfrommutex $nextval + printf "0x%x\n",$kgm_loadval + printf "\tflags: " + set $nextval = $cond + 12 + _loadfrommutex $nextval + printf "0x%x\n",$kgm_loadval + printf "\tSeqs: " + set $nextval = $cond + 24 + _loadfrommutex $nextval + printf "0x%x ",$kgm_loadval + set $nextval = $cond + 28 + _loadfrommutex $nextval + printf "0x%x ",$kgm_loadval + set $nextval = $cond + 32 + _loadfrommutex $nextval + printf "0x%x\n",$kgm_loadval + printf "\tMutex lowaddr: " + set $nextval = $cond + 16 + _loadfrommutex $nextval + printf "0x%08x\n",$kgm_loadval + printf "\tMutex highaddr: " + set $nextval = $cond + 20 + _loadfrommutex $nextval + printf "0x%x\n",$kgm_loadval + else + printf "\tUser 32Bit\n " + printf "\tSignature: " + set $nextval = $cond + _loadfrommutex $nextval + printf "0x%x\n",$kgm_loadval + printf "\tflags: " + set $nextval = $cond + 8 + _loadfrommutex $nextval + printf "0x%x\n",$kgm_loadval + printf "\tSeqs: " + set $nextval = $cond + 16 + _loadfrommutex $nextval + printf "0x%x ",$kgm_loadval + set $nextval = $cond + 20 + _loadfrommutex $nextval + printf "0x%x ",$kgm_loadval + set $nextval = $cond + 24 + _loadfrommutex $nextval + printf "0x%x\n",$kgm_loadval + printf "\tMutex addr: " + set $nextval = $cond + 12 + _loadfrommutex $nextval + printf "0x%x\n",$kgm_loadval + end + printf "\n" + resetstacks +end + + +document show_pthreadcondition +Syntax: (gdb) show_pthreadcondition +| Display the condition variable contents from userspace. 
+end + +define processortimers + set $kgm_p = processor_list + printf "Processor\t\t\t Last dispatch\t\t Next deadline\t\t difference\n" + while $kgm_p + printf "Processor %d: %p\t", $kgm_p->cpu_id, $kgm_p + printf " 0x%016llx\t", $kgm_p->last_dispatch + set $kgm_rt_timer = &(cpu_data_ptr[$kgm_p->cpu_id].rtclock_timer) + printf " 0x%016llx \t", $kgm_rt_timer->deadline + set $kgm_rt_diff = ((long long)$kgm_p->last_dispatch) - ((long long)$kgm_rt_timer->deadline) + printf " 0x%016llx ", $kgm_rt_diff +# normally the $kgm_rt_diff will be close to the last dispatch time, or negative +# When it isn't, mark the result as bad. This is a suggestion, not an absolute + if ( ($kgm_rt_diff > 0) && ((long long)$kgm_p->last_dispatch) - ($kgm_rt_diff + 1) > 0 ) + printf "probably BAD\n" + else + printf "(ok)\n" + end + # dump the call entries (Intel only) + if (($kgm_mtype & $kgm_mtype_x86_mask) == $kgm_mtype_x86_any) + printf "Next deadline set at: 0x%016llx. Timer call list:", $kgm_rt_timer->when_set + set $kgm_entry = (queue_t *)$kgm_rt_timer->queue + if ($kgm_entry == $kgm_rt_timer) + printf " (empty)\n" + else + printf "\n entry: " + showptrhdrpad + printf "deadline soft_deadline delta (*func)(param0,param1)\n" + while $kgm_entry != $kgm_rt_timer + set $kgm_timer_call = (timer_call_t) $kgm_entry + set $kgm_call_entry = (struct call_entry *) $kgm_entry + printf " " + showptr $kgm_entry + printf ": 0x%016llx 0x%016llx 0x%08x (%p)(%p,%p)\n", \ + $kgm_call_entry->deadline, \ + $kgm_timer_call->soft_deadline, \ + ($kgm_call_entry->deadline - $kgm_timer_call->soft_deadline), \ + $kgm_call_entry->func, \ + $kgm_call_entry->param0, $kgm_call_entry->param1 + set $kgm_entry = $kgm_entry->next + end + end + end + set $kgm_p = $kgm_p->processor_list end + printf "\n" end -document kdp-connect -Syntax: (gdb) kdpconnect -| Attach to the machine with given hostname or IP address, or 'localhost' if blank +document processortimers +Syntax: (gdb) processortimers +| Print details of processor 
timers, noting any timer which might be suspicious end + + diff --git a/libkern/Makefile b/libkern/Makefile index 583dcb221..ff3bbec5f 100644 --- a/libkern/Makefile +++ b/libkern/Makefile @@ -9,37 +9,24 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ libkern \ uuid - -INSTINC_SUBDIRS_PPC = ${INSTINC_SUBDIRS} - INSTINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS} - INSTINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS} - INSTINC_SUBDIRS_ARM = ${INSTINC_SUBDIRS} EXPINC_SUBDIRS = \ libkern \ uuid - -EXPINC_SUBDIRS_PPC = ${EXPINC_SUBDIRS} - EXPINC_SUBDIRS_I386 = ${EXPINC_SUBDIRS} - EXPINC_SUBDIRS_X86_64 = ${EXPINC_SUBDIRS} - EXPINC_SUBDIRS_ARM = ${EXPINC_SUBDIRS} -SETUP_SUBDIRS = conf - -COMP_SUBDIRS_PPC = conf kmod +SETUP_SUBDIRS = COMP_SUBDIRS_I386 = conf kmod - COMP_SUBDIRS_X86_64 = conf kmod - COMP_SUBDIRS_ARM = conf kmod + INST_SUBDIRS = kmod include $(MakeInc_rule) diff --git a/libkern/OSKextLib.cpp b/libkern/OSKextLib.cpp index 4876839af..c782a830f 100644 --- a/libkern/OSKextLib.cpp +++ b/libkern/OSKextLib.cpp @@ -29,10 +29,6 @@ extern "C" { #include #include - -#include -#include -#include }; #include @@ -185,45 +181,6 @@ OSReturn OSKextCancelRequest( #if PRAGMA_MARK #pragma mark MIG Functions & Wrappers #endif -/********************************************************************* -* This function is for use only by OSKextLib.cpp and OSKext.cpp. -* -* xxx - can we cache the kextd port or do we have to get it each time -* xxx - in case it relaunches? 
-*********************************************************************/ -extern void ipc_port_release_send(ipc_port_t); - -kern_return_t OSKextPingKextd(void) -{ - kern_return_t result = KERN_FAILURE; - mach_port_t kextd_port = IPC_PORT_NULL; - - result = host_get_kextd_port(host_priv_self(), &kextd_port); - if (result != KERN_SUCCESS || !IPC_PORT_VALID(kextd_port)) { - OSKextLog(/* kext */ NULL, - kOSKextLogErrorLevel | - kOSKextLogIPCFlag, - "Can't get kextd port."); - goto finish; - } - - result = kextd_ping(kextd_port); - if (result != KERN_SUCCESS) { - OSKextLog(/* kext */ NULL, - kOSKextLogErrorLevel | - kOSKextLogIPCFlag, - "kextd ping failed (0x%x).", (int)result); - goto finish; - } - -finish: - if (IPC_PORT_VALID(kextd_port)) { - ipc_port_release_send(kextd_port); - } - - return result; -} - /********************************************************************* * IMPORTANT: Once we have done the vm_map_copyout(), we *must* return * KERN_SUCCESS or the kernel map gets messed up (reason as yet @@ -442,6 +399,16 @@ void OSKextRemoveKextBootstrap(void) return; } +#if CONFIG_DTRACE +/********************************************************************* +*********************************************************************/ +void OSKextRegisterKextsWithDTrace(void) +{ + OSKext::registerKextsWithDTrace(); + return; +} +#endif /* CONFIG_DTRACE */ + /********************************************************************* *********************************************************************/ void kext_dump_panic_lists(int (*printf_func)(const char * fmt, ...)) @@ -491,7 +458,7 @@ kmod_dump_log( * Compatibility implementation for kmod_get_info() host_priv routine. * Only supported on old 32-bit architectures. 
*********************************************************************/ -#if __ppc__ || __i386__ +#if __i386__ kern_return_t kext_get_kmod_info( kmod_info_array_t * kmod_list, @@ -499,6 +466,16 @@ kext_get_kmod_info( { return OSKext::getKmodInfo(kmod_list, kmodCount); } -#endif /* __ppc__ || __i386__ */ +#endif /* __i386__ */ + +#if PRAGMA_MARK +#pragma mark Loaded Kext Summary +#endif + +void +OSKextLoadedKextSummariesUpdated(void) +{ + // Do nothing. +} }; diff --git a/libkern/OSKextVersion.c b/libkern/OSKextVersion.c index bc1cc253c..f9013853c 100644 --- a/libkern/OSKextVersion.c +++ b/libkern/OSKextVersion.c @@ -388,6 +388,7 @@ OSKextVersion OSKextParseVersionString(const char * versionString) } /********************************************************************* +* This function must be safe to call in panic context. *********************************************************************/ Boolean OSKextVersionGetString( OSKextVersion aVersion, diff --git a/libkern/c++/OSKext.cpp b/libkern/c++/OSKext.cpp index acc3e3e98..14f0643c2 100644 --- a/libkern/c++/OSKext.cpp +++ b/libkern/c++/OSKext.cpp @@ -38,8 +38,13 @@ extern "C" { #include #include #include +#include #include +#include #include +#include +// 04/18/11 - gab: +#include }; #include @@ -51,15 +56,14 @@ extern "C" { #include #include +#include + #if PRAGMA_MARK #pragma mark External & Internal Function Protos #endif /********************************************************************* *********************************************************************/ extern "C" { -// in libkern/OSKextLib.cpp, not in header for a reason. 
-extern kern_return_t OSKextPingKextd(void); - extern int IODTGetLoaderInfo(const char * key, void ** infoAddr, int * infoSize); extern void IODTFreeLoaderInfo(const char * key, void * infoAddr, int infoSize); extern void OSRuntimeUnloadCPPForSegment(kernel_segment_command_t * segment); @@ -84,6 +88,11 @@ static OSReturn _OSDictionarySetCStringValue( OSDictionary * dict, const char * key, const char * value); + +// We really should add containsObject() & containsCString to OSCollection & subclasses. +// So few pad slots, though.... +static bool _OSArrayContainsCString(OSArray * array, const char * cString); + #if CONFIG_MACF_KEXT static void * MACFCopyModuleDataForKext( OSKext * theKext, @@ -177,18 +186,32 @@ typedef struct MkextEntryRef { static bool sPrelinkBoot = false; static bool sSafeBoot = false; +static bool sKeepSymbols = false; -/****** -* sKextLock is the principal lock for OSKext. Below, there is also an -* sKextInnerLock used to guard access to data accessed on in-calls from -* IOService. This 2nd lock is required to prevent a deadlock -* with IOService calling back into OSKext::considerUnloads() -* on a separate thread during a kext load operation. +/********************************************************************* +* sKextLock is the principal lock for OSKext, and guards all static +* and global variables not owned by other locks (declared further +* below). It must be taken by any entry-point method or function, +* including internal functions called on scheduled threads. +* +* sKextLock and sKextInnerLock are recursive due to multiple functions +* that are called both externally and internally. The other locks are +* nonrecursive. +* +* Which locks are taken depends on what they protect, but if more than +* one must be taken, they must always be locked in this order +* (and unlocked in reverse order) to prevent deadlocks: +* +* 1. sKextLock +* 2. sKextInnerLock +* 3. sKextSummariesLock +* 4. 
sKextLoggingLock */ static IORecursiveLock * sKextLock = NULL; static OSDictionary * sKextsByID = NULL; static OSArray * sLoadedKexts = NULL; +static OSArray * sUnloadedPrelinkedKexts = NULL; // Requests to kextd waiting to be picked up. static OSArray * sKernelRequests = NULL; @@ -207,7 +230,11 @@ static bool sKextdActive = false; static bool sDeferredLoadSucceeded = false; static bool sConsiderUnloadsExecuted = false; +#if NO_KEXTD +static bool sKernelRequestsEnabled = false; +#else static bool sKernelRequestsEnabled = true; +#endif static bool sLoadEnabled = true; static bool sUnloadEnabled = true; @@ -252,32 +279,26 @@ kmod_info_t * kmod = NULL; #define KEXT_PANICLIST_SIZE (2 * PAGE_SIZE) -static char * unloaded_kext_paniclist = NULL; -static uint32_t unloaded_kext_paniclist_size = 0; -static uint32_t unloaded_kext_paniclist_length = 0; + +static char * loaded_kext_paniclist = NULL; +static uint32_t loaded_kext_paniclist_size = 0; +static uint32_t loaded_kext_paniclist_length = 0; + AbsoluteTime last_loaded_timestamp; +static char last_loaded_str[2*KMOD_MAX_NAME]; +static u_long last_loaded_strlen = 0; +static void * last_loaded_address = NULL; +static u_long last_loaded_size = 0; -static char * loaded_kext_paniclist = NULL; -static uint32_t loaded_kext_paniclist_size = 0; -static uint32_t loaded_kext_paniclist_length = 0; AbsoluteTime last_unloaded_timestamp; -static void * last_unloaded_address = NULL; -#if __LP64__ -static uint64_t last_unloaded_size = 0; -#else -static uint32_t last_unloaded_size = 0; -#endif /* __LP64__ */ - -}; +static char last_unloaded_str[2*KMOD_MAX_NAME]; +static u_long last_unloaded_strlen = 0; +static void * last_unloaded_address = NULL; +static u_long last_unloaded_size = 0; /********************************************************************* -* Because we can start IOService matching from OSKext (via IOCatalogue) -* and IOService can call into OSKext, there is potential for cross-lock -* contention, so OSKext needs two locks. 
The regular sKextLock above -* guards most OSKext class/static variables, and sKextInnerLock guards -* variables that can be accessed on in-calls from IOService, currently: -* -* * OSKext::considerUnloads() +* sKextInnerLock protects against cross-calls with IOService and +* IOCatalogue, and owns the variables declared immediately below. * * Note that sConsiderUnloadsExecuted above belongs to sKextLock! * @@ -286,9 +307,6 @@ static uint32_t last_unloaded_size = 0; * locks in an entry point to OSKext; if you need to do so, you must * spawn an independent thread to avoid potential deadlocks for threads * calling into OSKext. -* -* All static variables from here to the closing comment block fall -* under sKextInnerLock. **********/ static IORecursiveLock * sKextInnerLock = NULL; @@ -301,11 +319,33 @@ static thread_call_t sUnloadCallout = 0; static thread_call_t sDestroyLinkContextThread = 0; // one-shot, one-at-a-time thread static bool sSystemSleep = false; // true when system going to sleep +/********************************************************************* +* Backtraces can be printed at various times so we need a tight lock +* on data used for that. sKextSummariesLock protects the variables +* declared immediately below. +* +* gLoadedKextSummaries is accessed by other modules, but only during +* a panic so the lock isn't needed then. +**********/ +static IOLock * sKextSummariesLock = NULL; + +void (*sLoadedKextSummariesUpdated)(void) = OSKextLoadedKextSummariesUpdated; +OSKextLoadedKextSummaryHeader * gLoadedKextSummaries = NULL; +static size_t sLoadedKextSummariesAllocSize = 0; +OSKextLoadedKextSummaryHeader * sPrevLoadedKextSummaries = NULL; +static size_t sPrevLoadedKextSummariesAllocSize = 0; +}; + +/********************************************************************* +* sKextLoggingLock protects the logging variables declared immediately below. 
+**********/ +static IOLock * sKextLoggingLock = NULL; + static const OSKextLogSpec kDefaultKernelLogFilter = kOSKextLogBasicLevel | kOSKextLogVerboseFlagsMask; static OSKextLogSpec sKernelLogFilter = kDefaultKernelLogFilter; static bool sBootArgLogFilterFound = false; -SYSCTL_INT(_debug, OID_AUTO, kextlog, CTLFLAG_RW, &sKernelLogFilter, +SYSCTL_INT(_debug, OID_AUTO, kextlog, CTLFLAG_RW | CTLFLAG_LOCKED, &sKernelLogFilter, sKernelLogFilter, "kernel kext logging"); static OSKextLogSpec sUserSpaceKextLogFilter = kOSKextLogSilentFilter; @@ -338,6 +378,12 @@ void osdata_vm_deallocate(void * ptr, unsigned int length) (void)vm_deallocate(kernel_map, (vm_offset_t)ptr, length); return; } + +void osdata_kext_free(void * ptr, unsigned int length) +{ + (void)kext_free((vm_offset_t)ptr, length); +} + }; #if PRAGMA_MARK @@ -370,9 +416,6 @@ kern_allocate( } /* Create an OSData wrapper for the allocated buffer. - * Note that we do not set a dealloc function on it here. - * We have to call vm_map_unwire() on it in OSKext::unload() - * and an OSData dealloc function can't take all those parameters. 
*/ linkBuffer = OSData::withBytesNoCopy((void *)result, roundSize); if (!linkBuffer) { @@ -383,6 +426,7 @@ kern_allocate( theKext->getIdentifierCString()); goto finish; } + linkBuffer->setDeallocFunction(osdata_kext_free); OSKextLog(theKext, kOSKextLogProgressLevel | @@ -453,6 +497,41 @@ kxld_log_callback( OSKextVLog(theKext, logSpec, format, argList); } +#if PRAGMA_MARK +#pragma mark IOStatistics defines +#endif + +#if IOKITSTATS + +#define notifyKextLoadObservers(kext, kmod_info) \ +do { \ + IOStatistics::onKextLoad(kext, kmod_info); \ +} while (0) + +#define notifyKextUnloadObservers(kext) \ +do { \ + IOStatistics::onKextUnload(kext); \ +} while (0) + +#define notifyAddClassObservers(kext, addedClass, flags) \ +do { \ + IOStatistics::onClassAdded(kext, addedClass); \ +} while (0) + +#define notifyRemoveClassObservers(kext, removedClass, flags) \ +do { \ + IOStatistics::onClassRemoved(kext, removedClass); \ +} while (0) + +#else + +#define notifyKextLoadObservers(kext, kmod_info) +#define notifyKextUnloadObservers(kext) +#define notifyAddClassObservers(kext, addedClass, flags) +#define notifyRemoveClassObservers(kext, removedClass, flags) + +#endif /* IOKITSTATS */ + #if PRAGMA_MARK #pragma mark Module Config (Startup & Shutdown) #endif @@ -484,18 +563,23 @@ OSKext::initialize(void) */ sKextLock = IORecursiveLockAlloc(); sKextInnerLock = IORecursiveLockAlloc(); + sKextSummariesLock = IOLockAlloc(); + sKextLoggingLock = IOLockAlloc(); assert(sKextLock); assert(sKextInnerLock); + assert(sKextSummariesLock); + assert(sKextLoggingLock); sKextsByID = OSDictionary::withCapacity(kOSKextTypicalLoadCount); sLoadedKexts = OSArray::withCapacity(kOSKextTypicalLoadCount); + sUnloadedPrelinkedKexts = OSArray::withCapacity(kOSKextTypicalLoadCount / 10); sKernelRequests = OSArray::withCapacity(0); sPostedKextLoadIdentifiers = OSSet::withCapacity(0); sAllKextLoadIdentifiers = OSSet::withCapacity(kOSKextTypicalLoadCount); sRequestCallbackRecords = OSArray::withCapacity(0); 
assert(sKextsByID && sLoadedKexts && sKernelRequests && sPostedKextLoadIdentifiers && sAllKextLoadIdentifiers && - sRequestCallbackRecords); + sRequestCallbackRecords && sUnloadedPrelinkedKexts); /* Read the log flag boot-args and set the log flags. */ @@ -521,6 +605,8 @@ OSKext::initialize(void) "only valid OSBundleRequired kexts will be loaded."); } + PE_parse_boot_argn("keepsyms", &sKeepSymbols, sizeof(sKeepSymbols)); + /* Set up an OSKext instance to represent the kernel itself. */ sKernelKext = new OSKext; @@ -538,7 +624,6 @@ OSKext::initialize(void) sKernelKext->version = OSKextParseVersionString(osrelease); sKernelKext->compatibleVersion = sKernelKext->version; sKernelKext->linkedExecutable = kernelExecutable; - // linkState will be set first time we do a link sKernelKext->flags.hasAllDependencies = 1; sKernelKext->flags.kernelComponent = 1; @@ -614,6 +699,8 @@ OSKext::initialize(void) kOSKextLogGeneralFlag, "Kext system initialized."); + notifyKextLoadObservers(sKernelKext, sKernelKext->kmod_info); + return; } @@ -628,7 +715,6 @@ OSKext::removeKextBootstrap(void) OSReturn result = kOSReturnError; static bool alreadyDone = false; - boolean_t keepsyms = FALSE; const char * dt_kernel_header_name = "Kernel-__HEADER"; const char * dt_kernel_symtab_name = "Kernel-__SYMTAB"; @@ -639,11 +725,6 @@ OSKext::removeKextBootstrap(void) int dt_result = 0; kernel_segment_command_t * seg_to_remove = NULL; -#if __ppc__ || __arm__ - const char * dt_segment_name = NULL; - void * segment_paddress = NULL; - int segment_size = 0; -#endif /* This must be the very first thing done by this function. */ @@ -661,8 +742,6 @@ OSKext::removeKextBootstrap(void) kOSKextLogGeneralFlag, "Jettisoning kext bootstrap segments."); - PE_parse_boot_argn("keepsyms", &keepsyms, sizeof(keepsyms)); - /***** * Dispose of unnecessary stuff that the booter didn't need to load. 
*/ @@ -688,21 +767,17 @@ OSKext::removeKextBootstrap(void) OSRuntimeUnloadCPPForSegment(seg_to_remove); } -#if __ppc__ || __arm__ - /* Free the memory that was set up by bootx. - */ - dt_segment_name = "Kernel-__KLD"; - if (0 == IODTGetLoaderInfo(dt_segment_name, &segment_paddress, &segment_size)) { - IODTFreeLoaderInfo(dt_segment_name, (void *)segment_paddress, - (int)segment_size); - } -#elif __i386__ || __x86_64__ +#if __i386__ || __x86_64__ /* On x86, use the mapping data from the segment load command to * unload KLD directly. * This may invalidate any assumptions about "avail_start" * defining the lower bound for valid physical addresses. */ if (seg_to_remove && seg_to_remove->vmaddr && seg_to_remove->vmsize) { + // 04/18/11 - gab: + // overwrite memory occupied by KLD segment with random data before + // releasing it. + read_random((void *) seg_to_remove->vmaddr, seg_to_remove->vmsize); ml_static_mfree(seg_to_remove->vmaddr, seg_to_remove->vmsize); } #else @@ -711,7 +786,7 @@ OSKext::removeKextBootstrap(void) seg_to_remove = NULL; - /***** + /***** * Prelinked kernel's symtab (if there is one). */ kernel_section_t * sect; @@ -720,36 +795,101 @@ OSKext::removeKextBootstrap(void) ml_static_mfree(sect->addr, sect->size); } - /***** - * Dump the LINKEDIT segment, unless keepsyms is set. - */ - if (!keepsyms) { - seg_to_remove = (kernel_segment_command_t *)getsegbyname("__LINKEDIT"); - if (seg_to_remove) { - OSRuntimeUnloadCPPForSegment(seg_to_remove); + seg_to_remove = (kernel_segment_command_t *)getsegbyname("__LINKEDIT"); + + /* kxld always needs the kernel's __LINKEDIT segment, but we can make it + * pageable, unless keepsyms is set. To do that, we have to copy it from + * its booter-allocated memory, free the booter memory, reallocate proper + * managed memory, then copy the segment back in. 
+ */ +#if CONFIG_KXLD + if (!sKeepSymbols) { + kern_return_t mem_result; + void *seg_copy = NULL; + void *seg_data = NULL; + vm_map_offset_t seg_offset = 0; + vm_map_offset_t seg_copy_offset = 0; + vm_map_size_t seg_length = 0; + + seg_data = (void *) seg_to_remove->vmaddr; + seg_offset = (vm_map_offset_t) seg_to_remove->vmaddr; + seg_length = (vm_map_size_t) seg_to_remove->vmsize; + + /* Allocate space for the LINKEDIT copy. + */ + mem_result = kmem_alloc(kernel_map, (vm_offset_t *) &seg_copy, + seg_length); + if (mem_result != KERN_SUCCESS) { + OSKextLog(/* kext */ NULL, + kOSKextLogErrorLevel | + kOSKextLogGeneralFlag | kOSKextLogArchiveFlag, + "Can't copy __LINKEDIT segment for VM reassign."); + goto finish; } + seg_copy_offset = (vm_map_offset_t) seg_copy; -#if __ppc__ || __arm__ - dt_segment_name = "Kernel-__LINKEDIT"; - if (0 == IODTGetLoaderInfo(dt_segment_name, - &segment_paddress, &segment_size)) { + /* Copy it out. + */ + memcpy(seg_copy, seg_data, seg_length); + + /* Dump the booter memory. + */ + ml_static_mfree(seg_offset, seg_length); - IODTFreeLoaderInfo(dt_segment_name, (void *)segment_paddress, - (int)segment_size); + /* Set up the VM region. + */ + mem_result = vm_map_enter_mem_object( + kernel_map, + &seg_offset, + seg_length, /* mask */ 0, + VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE, + (ipc_port_t)NULL, + (vm_object_offset_t) 0, + /* copy */ FALSE, + /* cur_protection */ VM_PROT_ALL, + /* max_protection */ VM_PROT_ALL, + /* inheritance */ VM_INHERIT_DEFAULT); + if ((mem_result != KERN_SUCCESS) || + (seg_offset != (vm_map_offset_t) seg_data)) + { + OSKextLog(/* kext */ NULL, + kOSKextLogErrorLevel | + kOSKextLogGeneralFlag | kOSKextLogArchiveFlag, + "Can't create __LINKEDIT VM entry at %p, length 0x%llx (error 0x%x).", + seg_data, seg_length, mem_result); + goto finish; } -#elif __i386__ || __x86_64__ + + /* And copy it back. + */ + memcpy(seg_data, seg_copy, seg_length); + + /* Free the copy. 
+ */ + kmem_free(kernel_map, seg_copy_offset, seg_length); + } +#else /* we are not CONFIG_KXLD */ + + /***** + * Dump the LINKEDIT segment, unless keepsyms is set. + */ + if (!sKeepSymbols) { +#if __i386__ || __x86_64__ if (seg_to_remove && seg_to_remove->vmaddr && seg_to_remove->vmsize) { ml_static_mfree(seg_to_remove->vmaddr, seg_to_remove->vmsize); } -#else +#else /* from if __arm__ */ + #error arch -#endif +#endif /* from if __arm__ */ + } else { OSKextLog(/* kext */ NULL, - kOSKextLogBasicLevel | - kOSKextLogGeneralFlag, - "keepsyms boot arg specified; keeping linkedit segment for symbols."); + kOSKextLogBasicLevel | + kOSKextLogGeneralFlag, + "keepsyms boot arg specified; keeping linkedit segment for symbols."); } +#endif /* CONFIG_KXLD */ seg_to_remove = NULL; @@ -866,13 +1006,61 @@ OSKext::setKextdActive(Boolean active) IORecursiveLockLock(sKextLock); sKextdActive = active; if (sKernelRequests->getCount()) { - OSKextPingKextd(); + OSKext::pingKextd(); } IORecursiveLockUnlock(sKextLock); return; } +/********************************************************************* +* OSKextLib.cpp might need access to this someday but for now it's +* private. 
+*********************************************************************/ +extern "C" { +extern void ipc_port_release_send(ipc_port_t); +}; + +/* static */ +OSReturn +OSKext::pingKextd(void) +{ + OSReturn result = kOSReturnError; +#if !NO_KEXTD + mach_port_t kextd_port = IPC_PORT_NULL; + + if (!sKextdActive) { + result = kOSKextReturnDisabled; // basically unavailable + goto finish; + } + + result = host_get_kextd_port(host_priv_self(), &kextd_port); + if (result != KERN_SUCCESS || !IPC_PORT_VALID(kextd_port)) { + OSKextLog(/* kext */ NULL, + kOSKextLogErrorLevel | + kOSKextLogIPCFlag, + "Can't get kextd port."); + goto finish; + } + + result = kextd_ping(kextd_port); + if (result != KERN_SUCCESS) { + OSKextLog(/* kext */ NULL, + kOSKextLogErrorLevel | + kOSKextLogIPCFlag, + "kextd ping failed (0x%x).", (int)result); + goto finish; + } + +finish: + if (IPC_PORT_VALID(kextd_port)) { + ipc_port_release_send(kextd_port); + } +#endif + + return result; +} + /********************************************************************* *********************************************************************/ /* static */ @@ -893,7 +1081,9 @@ OSKext::setDeferredLoadSucceeded(Boolean succeeded) void OSKext::willShutdown(void) { +#if !NO_KEXTD OSReturn checkResult = kOSReturnError; +#endif OSDictionary * exitRequest = NULL; // must release IORecursiveLockLock(sKextLock); @@ -903,6 +1093,7 @@ OSKext::willShutdown(void) OSKext::setAutounloadsEnabled(false); OSKext::setKernelRequestsEnabled(false); +#if !NO_KEXTD OSKextLog(/* kext */ NULL, kOSKextLogProgressLevel | kOSKextLogGeneralFlag, @@ -917,9 +1108,11 @@ OSKext::willShutdown(void) goto finish; } - OSKextPingKextd(); + OSKext::pingKextd(); finish: +#endif + IORecursiveLockUnlock(sKextLock); OSSafeRelease(exitRequest); @@ -1129,15 +1322,12 @@ OSKext::initWithPrelinkedInfoDict( OSDictionary * anInfoDict) { bool result = false; - kern_return_t alloc_result = KERN_SUCCESS; OSString * kextPath = NULL; // do not release OSNumber * 
addressNum = NULL; // reused; do not release OSNumber * lengthNum = NULL; // reused; do not release void * data = NULL; // do not free void * srcData = NULL; // do not free OSData * prelinkedExecutable = NULL; // must release - void * linkStateCopy = NULL; // kmem_free on error - uint32_t linkStateLength = 0; uint32_t length = 0; // reused if (!super::init()) { @@ -1153,62 +1343,19 @@ OSKext::initWithPrelinkedInfoDict( goto finish; } - /* Don't need the path to be in the info dictionary any more. + /* Also get the executable's bundle-relative path if present. + * Don't look for an arch-specific path property. */ - anInfoDict->removeObject(kPrelinkBundlePathKey); + executableRelPath = OSDynamicCast(OSString, + anInfoDict->getObject(kPrelinkExecutableRelativePathKey)); + if (executableRelPath) { + executableRelPath->retain(); + } - /* If we have a link state, create an OSData wrapper for it. + /* Don't need the paths to be in the info dictionary any more. */ - addressNum = OSDynamicCast(OSNumber, - anInfoDict->getObject(kPrelinkLinkStateKey)); - if (addressNum) { - lengthNum = OSDynamicCast(OSNumber, - anInfoDict->getObject(kPrelinkLinkStateSizeKey)); - if (!lengthNum) { - OSKextLog(this, - kOSKextLogErrorLevel | - kOSKextLogArchiveFlag, - "Kext %s can't find prelinked kext link state size.", - getIdentifierCString()); - goto finish; - } - - data = (void *) (intptr_t) (addressNum->unsigned64BitValue()); - linkStateLength = (uint32_t) (lengthNum->unsigned32BitValue()); - - anInfoDict->removeObject(kPrelinkLinkStateKey); - anInfoDict->removeObject(kPrelinkLinkStateSizeKey); - - /* Copy the link state out of the booter-provided memory so it is in - * the VM system and we can page it out. 
- */ - alloc_result = kmem_alloc_pageable(kernel_map, - (vm_offset_t *)&linkStateCopy, linkStateLength); - if (alloc_result != KERN_SUCCESS) { - OSKextLog(this, - kOSKextLogErrorLevel | - kOSKextLogArchiveFlag, - "Kext %s failed to copy prelinked link state.", - getIdentifierCString()); - goto finish; - } - memcpy(linkStateCopy, data, linkStateLength); - - linkState = OSData::withBytesNoCopy(linkStateCopy, linkStateLength); - if (!linkState) { - OSKextLog(this, - kOSKextLogErrorLevel | - kOSKextLogArchiveFlag, - "Kext %s failed to create link state wrapper.", - getIdentifierCString()); - goto finish; - } - linkState->setDeallocFunction(osdata_kmem_free); - - /* Clear linkStateCopy; the OSData owns it now so we mustn't free it. - */ - linkStateCopy = NULL; - } + anInfoDict->removeObject(kPrelinkBundlePathKey); + anInfoDict->removeObject(kPrelinkExecutableRelativePathKey); /* Create an OSData wrapper around the linked executable. */ @@ -1241,6 +1388,8 @@ OSKext::initWithPrelinkedInfoDict( if (data != srcData) { #if __LP64__ + kern_return_t alloc_result; + alloc_result = kext_alloc((vm_offset_t *)&data, length, /* fixed */ TRUE); if (alloc_result != KERN_SUCCESS) { OSKextLog(this, @@ -1263,11 +1412,6 @@ OSKext::initWithPrelinkedInfoDict( anInfoDict->removeObject(kPrelinkExecutableSourceKey); } - /* We don't need to set a dealloc function for the linked executable - * because it is freed separately in OSKext::unload(), which must unwire - * part of the memory. - * xxx - do we *have* to do it that way? 
- */ prelinkedExecutable = OSData::withBytesNoCopy(data, length); if (!prelinkedExecutable) { OSKextLog(this, @@ -1277,6 +1421,7 @@ OSKext::initWithPrelinkedInfoDict( getIdentifierCString()); goto finish; } + prelinkedExecutable->setDeallocFunction(osdata_kext_free); setLinkedExecutable(prelinkedExecutable); addressNum = OSDynamicCast(OSNumber, @@ -1316,13 +1461,6 @@ OSKext::initWithPrelinkedInfoDict( result = registerIdentifier(); finish: - - /* If we didn't hand linkStateCopy off to an OSData, free it. - */ - if (linkStateCopy) { - kmem_free(kernel_map, (vm_offset_t)linkStateCopy, linkStateLength); - } - OSSafeRelease(prelinkedExecutable); return result; @@ -1367,7 +1505,7 @@ OSKext::initWithBooterData( void * executableAddr = NULL; // do not free char * bundlePathAddr = NULL; // do not free - OSObject * parsedXML = NULL; // must release + OSObject * parsedXML = NULL; // must release OSDictionary * theInfoDict = NULL; // do not release OSString * kextPath = NULL; // must release OSString * errorString = NULL; // must release @@ -1527,6 +1665,8 @@ OSKext::registerIdentifier(void) OSData * newUUID = NULL; // must release OSData * existingUUID = NULL; // must release + IORecursiveLockLock(sKextLock); + /* Get the new kext's version for checks & log messages. */ newVersion = getVersion(); @@ -1691,6 +1831,8 @@ OSKext::registerIdentifier(void) finish: + IORecursiveLockUnlock(sKextLock); + if (result) { OSKextLog(this, kOSKextLogStepLevel | @@ -1708,21 +1850,20 @@ OSKext::registerIdentifier(void) /********************************************************************* * Does the bare minimum validation to look up a kext. * All other validation is done on the spot as needed. 
-* -* No need for lock, only called from init **********************************************************************/ bool OSKext::setInfoDictionaryAndPath( OSDictionary * aDictionary, OSString * aPath) { - bool result = false; - OSString * bundleIDString = NULL; // do not release - OSString * versionString = NULL; // do not release - OSString * compatibleVersionString = NULL; // do not release - const char * versionCString = NULL; // do not free - const char * compatibleVersionCString = NULL; // do not free - OSBoolean * scratchBool = NULL; // do not release + bool result = false; + OSString * bundleIDString = NULL; // do not release + OSString * versionString = NULL; // do not release + OSString * compatibleVersionString = NULL; // do not release + const char * versionCString = NULL; // do not free + const char * compatibleVersionCString = NULL; // do not free + OSBoolean * scratchBool = NULL; // do not release + OSDictionary * scratchDict = NULL; // do not release if (infoDict) { panic("Attempt to set info dictionary on a kext " @@ -1845,13 +1986,13 @@ OSKext::setInfoDictionaryAndPath( */ scratchBool = OSDynamicCast(OSBoolean, getPropertyForHostArch(kOSBundleIsInterfaceKey)); - if (scratchBool && scratchBool->isTrue()) { + if (scratchBool == kOSBooleanTrue) { flags.interface = 1; } scratchBool = OSDynamicCast(OSBoolean, getPropertyForHostArch(kOSKernelResourceKey)); - if (scratchBool && scratchBool->isTrue()) { + if (scratchBool == kOSBooleanTrue) { flags.kernelComponent = 1; flags.interface = 1; // xxx - hm. the kernel itself isn't an interface... flags.started = 1; @@ -1861,6 +2002,14 @@ OSKext::setInfoDictionaryAndPath( flags.hasAllDependencies = 1; } + /* Make sure common string values in personalities are uniqued to OSSymbols. 
+ */ + scratchDict = OSDynamicCast(OSDictionary, + getPropertyForHostArch(kIOKitPersonalitiesKey)); + if (scratchDict) { + uniquePersonalityProperties(scratchDict); + } + result = true; finish: @@ -1917,6 +2066,95 @@ OSKext::setExecutable( return result; } +/********************************************************************* +*********************************************************************/ +static void +uniqueStringPlistProperty(OSDictionary * dict, const char * key) +{ + OSString * stringValue = NULL; // do not release + const OSSymbol * symbolValue = NULL; // must release + + stringValue = OSDynamicCast(OSString, dict->getObject(key)); + if (!stringValue) { + goto finish; + } + + symbolValue = OSSymbol::withString(stringValue); + if (!symbolValue) { + goto finish; + } + + dict->setObject(key, symbolValue); + +finish: + if (symbolValue) symbolValue->release(); + + return; +} + +/********************************************************************* +*********************************************************************/ +static void +uniqueStringPlistProperty(OSDictionary * dict, const OSString * key) +{ + OSString * stringValue = NULL; // do not release + const OSSymbol * symbolValue = NULL; // must release + + stringValue = OSDynamicCast(OSString, dict->getObject(key)); + if (!stringValue) { + goto finish; + } + + symbolValue = OSSymbol::withString(stringValue); + if (!symbolValue) { + goto finish; + } + + dict->setObject(key, symbolValue); + +finish: + if (symbolValue) symbolValue->release(); + + return; +} + +/********************************************************************* +* Replace common personality property values with uniqued instances +* to save on wired memory. +*********************************************************************/ +/* static */ +void +OSKext::uniquePersonalityProperties(OSDictionary * personalityDict) +{ + /* Properties every personality has. 
+ */ + uniqueStringPlistProperty(personalityDict, kCFBundleIdentifierKey); + uniqueStringPlistProperty(personalityDict, kIOProviderClassKey); + uniqueStringPlistProperty(personalityDict, gIOClassKey); + + /* Other commonly used properties. + */ + uniqueStringPlistProperty(personalityDict, gIOMatchCategoryKey); + uniqueStringPlistProperty(personalityDict, gIOResourceMatchKey); + uniqueStringPlistProperty(personalityDict, gIOUserClientClassKey); + + uniqueStringPlistProperty(personalityDict, "HIDDefaultBehavior"); + uniqueStringPlistProperty(personalityDict, "HIDPointerAccelerationType"); + uniqueStringPlistProperty(personalityDict, "HIDRemoteControlType"); + uniqueStringPlistProperty(personalityDict, "HIDScrollAccelerationType"); + uniqueStringPlistProperty(personalityDict, "IOPersonalityPublisher"); + uniqueStringPlistProperty(personalityDict, "Physical Interconnect"); + uniqueStringPlistProperty(personalityDict, "Physical Interconnect Location"); + uniqueStringPlistProperty(personalityDict, "Vendor"); + uniqueStringPlistProperty(personalityDict, "Vendor Identification"); + uniqueStringPlistProperty(personalityDict, "Vendor Name"); + uniqueStringPlistProperty(personalityDict, "bConfigurationValue"); + uniqueStringPlistProperty(personalityDict, "bInterfaceNumber"); + uniqueStringPlistProperty(personalityDict, "idProduct"); + + return; +} + /********************************************************************* *********************************************************************/ void @@ -1929,8 +2167,8 @@ OSKext::free(void) OSSafeRelease(infoDict); OSSafeRelease(bundleID); OSSafeRelease(path); + OSSafeRelease(executableRelPath); OSSafeRelease(dependencies); - OSSafeRelease(linkState); OSSafeRelease(linkedExecutable); OSSafeRelease(metaClasses); OSSafeRelease(interfaceUUID); @@ -2032,7 +2270,7 @@ OSKext::readMkext1Archive( uint32_t numKexts = 0; OSData * infoDictDataObject = NULL; // must release - OSObject * parsedXML = NULL; // must release + OSObject * parsedXML = 
NULL; // must release OSDictionary * infoDict = NULL; // do not release OSString * errorString = NULL; // must release OSData * mkextExecutableInfo = NULL; // must release @@ -2361,7 +2599,7 @@ OSKext::readMkext2Archive( OSString * errorString = NULL; // must release OSData * mkextPlistUncompressedData = NULL; // must release const char * mkextPlistDataBuffer = NULL; // do not free - OSObject * parsedXML = NULL; // must release + OSObject * parsedXML = NULL; // must release OSDictionary * mkextPlist = NULL; // do not release OSArray * mkextInfoDictArray = NULL; // do not release uint32_t count, i; @@ -2550,9 +2788,18 @@ OSKext::initWithMkext2Info( goto finish; } - /* Don't need the path to be in the info dictionary any more. + /* If we have a path to the executable, save it. + */ + executableRelPath = OSDynamicCast(OSString, + anInfoDict->getObject(kMKEXTExecutableRelativePathKey)); + if (executableRelPath) { + executableRelPath->retain(); + } + + /* Don't need the paths to be in the info dictionary any more. */ anInfoDict->removeObject(kMKEXTBundlePathKey); + anInfoDict->removeObject(kMKEXTExecutableRelativePathKey); executableOffsetNum = OSDynamicCast(OSNumber, infoDict->getObject(kMKEXTExecutableKey)); @@ -2688,7 +2935,7 @@ OSKext::extractMkext2FileData( /* How's this for cheesy? The kernel is only asked to extract * kext plists so we tailor the log messages. 
*/ - if (this == sKernelKext) { + if (isKernel()) { OSKextLog(this, kOSKextLogErrorLevel | kOSKextLogArchiveFlag, @@ -2705,7 +2952,7 @@ OSKext::extractMkext2FileData( } uncompressedData = OSData::withBytesNoCopy(uncompressedDataBuffer, fullSize); if (!uncompressedData) { - if (this == sKernelKext) { + if (isKernel()) { OSKextLog(this, kOSKextLogErrorLevel | kOSKextLogArchiveFlag, @@ -2721,7 +2968,7 @@ OSKext::extractMkext2FileData( } uncompressedData->setDeallocFunction(&osdata_kmem_free); - if (this == sKernelKext) { + if (isKernel()) { OSKextLog(this, kOSKextLogDetailLevel | kOSKextLogArchiveFlag, @@ -2747,7 +2994,7 @@ OSKext::extractMkext2FileData( zlib_result = inflateInit(&zstream); if (Z_OK != zlib_result) { - if (this == sKernelKext) { + if (isKernel()) { OSKextLog(this, kOSKextLogErrorLevel | kOSKextLogArchiveFlag, @@ -2770,7 +3017,7 @@ OSKext::extractMkext2FileData( if (zlib_result == Z_STREAM_END || zlib_result == Z_OK) { uncompressedSize = zstream.total_out; } else { - if (this == sKernelKext) { + if (isKernel()) { OSKextLog(this, kOSKextLogErrorLevel | kOSKextLogArchiveFlag, @@ -2793,7 +3040,7 @@ OSKext::extractMkext2FileData( } if (uncompressedSize != fullSize) { - if (this == sKernelKext) { + if (isKernel()) { OSKextLog(this, kOSKextLogErrorLevel | kOSKextLogArchiveFlag, @@ -2954,7 +3201,7 @@ OSKext::loadFromMkext( } startKextExcludeNum = OSDynamicCast(OSNumber, - requestArgs->getObject(kKextKextRequestArgumentStartExcludeKey)); + requestArgs->getObject(kKextRequestArgumentStartExcludeKey)); startMatchingExcludeNum = OSDynamicCast(OSNumber, requestArgs->getObject(kKextRequestArgumentStartMatchingExcludeKey)); delayAutounloadBool = OSDynamicCast(OSBoolean, @@ -3243,6 +3490,7 @@ OSKext::removeKext( } if (aKext->isLoaded()) { + /* If we are terminating, send the request to the IOCatalogue * (which will actually call us right back but that's ok we have * a recursive lock don't you know) but do not ask the IOCatalogue @@ -3253,7 +3501,7 @@ 
OSKext::removeKext( aKext->getIdentifierCString(), /* unload */ false); if (result != kOSReturnSuccess) { OSKextLog(aKext, - kOSKextLogProgressLevel | + kOSKextLogErrorLevel | kOSKextLogKextBookkeepingFlag, "Can't remove kext %s; services failed to terminate - 0x%x.", aKext->getIdentifierCString(), result); @@ -3417,6 +3665,14 @@ OSKext::getCompatibleVersion(void) return compatibleVersion; } +/********************************************************************* +*********************************************************************/ +bool +OSKext::isLibrary(void) +{ + return (getCompatibleVersion() > 0); +} + /********************************************************************* *********************************************************************/ bool @@ -3434,10 +3690,7 @@ OSKext::isCompatibleWithVersion(OSKextVersion aVersion) bool OSKext::declaresExecutable(void) { - if (getPropertyForHostArch(kCFBundleExecutableKey)) { - return true; - } - return false; + return (getPropertyForHostArch(kCFBundleExecutableKey) != NULL); } /********************************************************************* @@ -3512,6 +3765,14 @@ OSKext::isInterface(void) return flags.interface; } +/********************************************************************* +*********************************************************************/ +bool +OSKext::isKernel(void) +{ + return (this == sKernelKext); +} + /********************************************************************* *********************************************************************/ bool @@ -3520,6 +3781,14 @@ OSKext::isKernelComponent(void) return flags.kernelComponent ? 
true : false; } +/********************************************************************* +*********************************************************************/ +bool +OSKext::isExecutable(void) +{ + return (!isKernel() && !isInterface() && declaresExecutable()); +} + /********************************************************************* * We might want to check this recursively for all dependencies, * since a subtree of dependencies could get loaded before we hit @@ -3538,6 +3807,10 @@ OSKext::isLoadableInSafeBoot(void) bool result = false; OSString * required = NULL; // do not release + if (isKernel()) { + result = true; + goto finish; + } required = OSDynamicCast(OSString, getPropertyForHostArch(kOSBundleRequiredKey)); @@ -3604,6 +3877,28 @@ OSKext::getLoadTag(void) return loadTag; } +/********************************************************************* + *********************************************************************/ +void OSKext::getSizeInfo(uint32_t *loadSize, uint32_t *wiredSize) +{ + if (linkedExecutable) { + *loadSize = linkedExecutable->getLength(); + + /* If we have a kmod_info struct, calculated the wired size + * from that. Otherwise it's the full load size. 
+ */ + if (kmod_info) { + *wiredSize = *loadSize - kmod_info->hdr_size; + } else { + *wiredSize = *loadSize; + } + } + else { + *wiredSize = 0; + *loadSize = 0; + } +} + /********************************************************************* *********************************************************************/ OSData * @@ -3654,9 +3949,7 @@ OSKext::copyUUID(void) /********************************************************************* *********************************************************************/ -#if defined (__ppc__) -#define ARCHNAME "ppc" -#elif defined (__i386__) +#if defined (__i386__) #define ARCHNAME "i386" #elif defined (__x86_64__) #define ARCHNAME "x86_64" @@ -3734,6 +4027,7 @@ OSKext::getPropertyForHostArch(const char * key) #endif /********************************************************************* *********************************************************************/ +/* static */ OSReturn OSKext::loadKextWithIdentifier( const char * kextIdentifierCString, @@ -3760,7 +4054,6 @@ OSKext::loadKextWithIdentifier( return result; } - /********************************************************************* *********************************************************************/ OSReturn @@ -3773,6 +4066,7 @@ OSKext::loadKextWithIdentifier( OSArray * personalityNames) { OSReturn result = kOSReturnError; + OSReturn pingResult = kOSReturnError; OSKext * theKext = NULL; // do not release OSDictionary * loadRequest = NULL; // must release const OSSymbol * kextIdentifierSymbol = NULL; // must release @@ -3840,13 +4134,12 @@ OSKext::loadKextWithIdentifier( kextIdentifier->getCStringNoCopy()); } - if (sKextdActive) { - OSKextPingKextd(); - } else { + pingResult = OSKext::pingKextd(); + if (pingResult == kOSKextReturnDisabled) { OSKextLog(/* kext */ NULL, ((sPrelinkBoot) ? 
kOSKextLogDebugLevel : kOSKextLogErrorLevel) | kOSKextLogLoadFlag, - "Not loading kext %s - not found and kextd not available in early boot.", + "Kext %s might not load - kextd is currently unavailable.", kextIdentifier->getCStringNoCopy()); } @@ -4106,7 +4399,7 @@ OSKext::load( /* Keep the kernel itself out of the kmod list. */ - if (lastLoadedKext == sKernelKext) { + if (lastLoadedKext->isKernel()) { lastLoadedKext = NULL; } @@ -4114,6 +4407,8 @@ OSKext::load( kmod_info->next = lastLoadedKext->kmod_info; } + notifyKextLoadObservers(this, kmod_info); + /* Make the global kmod list point at the just-loaded kext. Note that the * __kernel__ kext isn't in this list, as it wasn't before SnowLeopard, * although we do report it in kextstat these days by using the newer @@ -4127,19 +4422,30 @@ OSKext::load( /* Save the list of loaded kexts in case we panic. */ - clock_get_uptime(&last_loaded_timestamp); OSKext::saveLoadedKextPanicList(); -loaded: + if (isExecutable()) { + OSKext::updateLoadedKextSummaries(); + savePanicString(/* isLoading */ true); - if (declaresExecutable() && (startOpt == kOSKextExcludeNone)) { - result = start(); - if (result != kOSReturnSuccess) { - OSKextLog(this, - kOSKextLogErrorLevel | kOSKextLogLoadFlag, - "Kext %s start failed (result 0x%x).", - getIdentifierCString(), result); - result = kOSKextReturnStartStopError; +#if CONFIG_DTRACE + registerWithDTrace(); +#else + jettisonLinkeditSegment(); +#endif /* CONFIG_DTRACE */ + } + +loaded: + if (isExecutable() && !flags.started) { + if (startOpt == kOSKextExcludeNone) { + result = start(); + if (result != kOSReturnSuccess) { + OSKextLog(this, + kOSKextLogErrorLevel | kOSKextLogLoadFlag, + "Kext %s start failed (result 0x%x).", + getIdentifierCString(), result); + result = kOSKextReturnStartStopError; + } } } @@ -4151,6 +4457,7 @@ OSKext::load( if (result == kOSReturnSuccess && startMatchingOpt == kOSKextExcludeNone) { result = sendPersonalitiesToCatalog(true, personalityNames); } + finish: /* More 
hack! If the kext doesn't declare an executable, even if we @@ -4183,10 +4490,37 @@ OSKext::load( kOSKextLogLoadFlag, "Kext %s loaded.", getIdentifierCString()); + + queueKextNotification(kKextRequestPredicateLoadNotification, + OSDynamicCast(OSString, bundleID)); } return result; } +/********************************************************************* +* +*********************************************************************/ +static char * strdup(const char * string) +{ + char * result = NULL; + size_t size; + + if (!string) { + goto finish; + } + + size = 1 + strlen(string); + result = (char *)kalloc(size); + if (!result) { + goto finish; + } + + memcpy(result, string, size); + +finish: + return result; +} + /********************************************************************* * called only by load() *********************************************************************/ @@ -4195,13 +4529,11 @@ OSKext::loadExecutable() { OSReturn result = kOSReturnError; kern_return_t kxldResult; - u_char ** kxlddeps = NULL; // must kfree + KXLDDependency * kxlddeps = NULL; // must kfree uint32_t num_kxlddeps = 0; + OSArray * linkDependencies = NULL; // must release + uint32_t numDirectDependencies = 0; uint32_t num_kmod_refs = 0; - u_char * linkStateBytes = NULL; // do not free - u_long linkStateLength = 0; - u_char ** linkStateBytesPtr = NULL; // do not free - u_long * linkStateLengthPtr = NULL; // do not free struct mach_header ** kxldHeaderPtr = NULL; // do not free struct mach_header * kxld_header = NULL; // xxx - need to free here? 
OSData * theExecutable = NULL; // do not release @@ -4221,6 +4553,7 @@ OSKext::loadExecutable() if (isKernelComponent()) { if (STRING_HAS_PREFIX(versCString, KERNEL_LIB_PREFIX)) { + if (strncmp(versCString, KERNEL6_VERSION, strlen(KERNEL6_VERSION))) { OSKextLog(this, kOSKextLogErrorLevel | @@ -4263,11 +4596,37 @@ OSKext::loadExecutable() goto register_kmod; } - if (isKernelComponent()) { - num_kxlddeps = 1; // the kernel itself - } else { - num_kxlddeps = getNumDependencies(); + if (isInterface()) { + OSData *executableCopy = OSData::withData(theExecutable); + setLinkedExecutable(executableCopy); + executableCopy->release(); + goto register_kmod; } + + numDirectDependencies = getNumDependencies(); + + if (flags.hasBleedthrough) { + linkDependencies = dependencies; + linkDependencies->retain(); + } else { + linkDependencies = OSArray::withArray(dependencies); + if (!linkDependencies) { + OSKextLog(this, + kOSKextLogErrorLevel | + kOSKextLogLoadFlag | kOSKextLogLinkFlag, + "Can't allocate link dependencies to load kext %s.", + getIdentifierCString()); + goto finish; + } + + for (i = 0; i < numDirectDependencies; ++i) { + OSKext * dependencyKext = OSDynamicCast(OSKext, + dependencies->getObject(i)); + dependencyKext->addBleedthroughDependencies(linkDependencies); + } + } + + num_kxlddeps = linkDependencies->getCount(); if (!num_kxlddeps) { OSKextLog(this, kOSKextLogErrorLevel | @@ -4276,7 +4635,8 @@ OSKext::loadExecutable() getIdentifierCString()); goto finish; } - kxlddeps = (u_char **)kalloc(num_kxlddeps * sizeof(*kxlddeps)); + + kxlddeps = (KXLDDependency *)kalloc(num_kxlddeps * sizeof(*kxlddeps)); if (!kxlddeps) { OSKextLog(this, kOSKextLogErrorLevel | @@ -4285,37 +4645,51 @@ OSKext::loadExecutable() getIdentifierCString()); goto finish; } - - if (isKernelComponent()) { - OSData * kernelLinkState = OSKext::getKernelLinkState(); - kxlddeps[0] = (u_char *)kernelLinkState->getBytesNoCopy(); - } else for (i = 0; i < num_kxlddeps; i++) { - OSKext * dependency = 
OSDynamicCast(OSKext, dependencies->getObject(i)); - if (!dependency->linkState) { - // xxx - maybe we should panic here - OSKextLog(this, - kOSKextLogErrorLevel | - kOSKextLogLoadFlag | kOSKextLogLinkFlag, - "Can't load kext %s - link state missing.", - getIdentifierCString()); - goto finish; + bzero(kxlddeps, num_kxlddeps * sizeof(*kxlddeps)); + + for (i = 0; i < num_kxlddeps; ++i ) { + OSKext * dependency = OSDynamicCast(OSKext, linkDependencies->getObject(i)); + + if (dependency->isInterface()) { + OSKext *interfaceTargetKext = NULL; + OSData * interfaceTarget = NULL; + + if (dependency->isKernelComponent()) { + interfaceTargetKext = sKernelKext; + interfaceTarget = sKernelKext->linkedExecutable; + } else { + interfaceTargetKext = OSDynamicCast(OSKext, + dependency->dependencies->getObject(0)); + + interfaceTarget = interfaceTargetKext->linkedExecutable; + } + + if (!interfaceTarget) { + // panic? + goto finish; + } + + /* The names set here aren't actually logged yet , + * it will be useful to have them in the debugger. + * strdup() failing isn't critical right here so we don't check that. + */ + kxlddeps[i].kext = (u_char *) interfaceTarget->getBytesNoCopy(); + kxlddeps[i].kext_size = interfaceTarget->getLength(); + kxlddeps[i].kext_name = strdup(interfaceTargetKext->getIdentifierCString()); + + kxlddeps[i].interface = (u_char *) dependency->linkedExecutable->getBytesNoCopy(); + kxlddeps[i].interface_size = dependency->linkedExecutable->getLength(); + kxlddeps[i].interface_name = strdup(dependency->getIdentifierCString()); + } else { + kxlddeps[i].kext = (u_char *) dependency->linkedExecutable->getBytesNoCopy(); + kxlddeps[i].kext_size = dependency->linkedExecutable->getLength(); + kxlddeps[i].kext_name = strdup(dependency->getIdentifierCString()); } - kxlddeps[i] = (u_char *)dependency->linkState->getBytesNoCopy(); - assert(kxlddeps[i]); - } - /* We only need link state for a library kext. 
- */ - if (compatibleVersion > -1 && (declaresExecutable() || isKernelComponent())) { - linkStateBytesPtr = &linkStateBytes; - linkStateLengthPtr = &linkStateLength; + kxlddeps[i].is_direct_dependency = (i < numDirectDependencies); } - /* We only need the linked executable for a real kext. - */ - if (!isInterface()) { - kxldHeaderPtr = &kxld_header; - } + kxldHeaderPtr = &kxld_header; #if DEBUG OSKextLog(this, @@ -4326,13 +4700,11 @@ OSKext::loadExecutable() " executable: %p executable_length: %d\n" " user_data: %p\n" " kxld_dependencies: %p num_dependencies: %d\n" - " kxld_header_ptr: %p kmod_info_ptr: %p\n" - " link_state_ptr: %p link_state_length_ptr: %p", - getIdentifierCString(), kxldContext, + " kxld_header_ptr: %p kmod_info_ptr: %p\n", + getIdentifierCString(), sKxldContext, theExecutable->getBytesNoCopy(), theExecutable->getLength(), this, kxlddeps, num_kxlddeps, - kxldHeaderPtr, kernelKmodInfoPtr, - linkStateBytesPtr, linkStateLengthPtr); + kxldHeaderPtr, &kmod_info); #endif /* After this call, the linkedExecutable instance variable @@ -4342,9 +4714,7 @@ OSKext::loadExecutable() (u_char *)theExecutable->getBytesNoCopy(), theExecutable->getLength(), getIdentifierCString(), this, kxlddeps, num_kxlddeps, - (u_char **)kxldHeaderPtr, (kxld_addr_t *)&kmod_info, - linkStateBytesPtr, linkStateLengthPtr, - /* symbolFile */ NULL, /* symbolFileSize */ NULL); + (u_char **)kxldHeaderPtr, (kxld_addr_t *)&kmod_info); if (kxldResult != KERN_SUCCESS) { // xxx - add kxldResult here? @@ -4356,24 +4726,15 @@ OSKext::loadExecutable() result = kOSKextReturnLinkError; goto finish; } - - /* If we got a link state, wrap it in an OSData and keep it - * around for later use linking other kexts that depend on this kext. 
- */ - if (linkStateBytes && linkStateLength > 0) { - linkState = OSData::withBytesNoCopy(linkStateBytes, linkStateLength); - assert(linkState); - linkState->setDeallocFunction(&osdata_kmem_free); - } - /* If this isn't an interface, We've written data & instructions into kernel - * memory, so flush the data cache and invalidate the instruction cache. + /* We've written data & instructions into kernel memory, so flush the data + * cache and invalidate the instruction cache. + * I/D caches are coherent on x86 */ - if (!isInterface()) { - flush_dcache(kmod_info->address, kmod_info->size, false); - invalidate_icache(kmod_info->address, kmod_info->size, false); - } - +#if !defined(__i386__) && !defined(__x86_64__) + flush_dcache(kmod_info->address, kmod_info->size, false); + invalidate_icache(kmod_info->address, kmod_info->size, false); +#endif register_kmod: if (isInterface()) { @@ -4454,7 +4815,23 @@ OSKext::loadExecutable() result = kOSReturnSuccess; finish: - if (kxlddeps) kfree(kxlddeps, (num_kxlddeps * sizeof(void *))); + OSSafeRelease(linkDependencies); + + /* Clear up locally allocated dependency info. + */ + for (i = 0; i < num_kxlddeps; ++i ) { + size_t size; + + if (kxlddeps[i].kext_name) { + size = 1 + strlen(kxlddeps[i].kext_name); + kfree(kxlddeps[i].kext_name, size); + } + if (kxlddeps[i].interface_name) { + size = 1 + strlen(kxlddeps[i].interface_name); + kfree(kxlddeps[i].interface_name, size); + } + } + if (kxlddeps) kfree(kxlddeps, (num_kxlddeps * sizeof(*kxlddeps))); /* We no longer need the unrelocated executable (which the linker * has altered anyhow). @@ -4486,7 +4863,131 @@ OSKext::loadExecutable() } /********************************************************************* -* xxx - initWithPrelinkedInfoDict doesn't use this +* The linkedit segment is used by the kext linker for dependency +* resolution, and by dtrace for probe initialization. 
We can free it +* for non-library kexts, since no kexts depend on non-library kexts +* by definition, once dtrace has been initialized. +*********************************************************************/ +void +OSKext::jettisonLinkeditSegment(void) +{ + kernel_mach_header_t * machhdr = (kernel_mach_header_t *)kmod_info->address; + kernel_segment_command_t * linkedit = NULL; + vm_size_t linkeditsize, kextsize; + OSData * data = NULL; + + if (sKeepSymbols || isLibrary() || !isExecutable() || !linkedExecutable) { + goto finish; + } + + /* Find the linkedit segment. If it's not the last segment, then freeing + * it will fragment the kext into multiple VM regions, which OSKext is not + * designed to handle, so we'll have to skip it. + */ + linkedit = getsegbynamefromheader(machhdr, SEG_LINKEDIT); + if (!linkedit) { + goto finish; + } + + if (round_page(kmod_info->address + kmod_info->size) != + round_page(linkedit->vmaddr + linkedit->vmsize)) + { + goto finish; + } + + /* Create a new OSData for the smaller kext object. + */ + linkeditsize = round_page(linkedit->vmsize); + kextsize = kmod_info->size - linkeditsize; + + data = OSData::withBytesNoCopy((void *)kmod_info->address, kextsize); + if (!data) { + goto finish; + } + data->setDeallocFunction(osdata_kext_free); + + /* Rewrite the Mach-O headers. + */ + if (KERN_SUCCESS != removeLinkeditHeaders(linkedit)) { + goto finish; + } + + /* Fix the kmod info and linkedExecutable. + */ + kmod_info->size = kextsize; + linkedExecutable->setDeallocFunction(NULL); + linkedExecutable->release(); + linkedExecutable = data; + + /* Free the linkedit segment. 
+ */ + kext_free(linkedit->vmaddr, linkeditsize); + +finish: + return; +} + +/********************************************************************* +*********************************************************************/ +OSReturn +OSKext::removeLinkeditHeaders(kernel_segment_command_t *linkedit) +{ + OSReturn result = KERN_FAILURE; + kernel_mach_header_t * machhdr = (kernel_mach_header_t *)kmod_info->address; + vm_map_t kext_map; + u_char * src, * dst; + uint32_t cmdsize, ncmds; + u_int i = 0; + + kext_map = kext_get_vm_map(kmod_info); + if (!kext_map) { + result = KERN_MEMORY_ERROR; + goto finish; + } + + result = vm_map_protect(kext_map, kmod_info->address, + kmod_info->address + kmod_info->hdr_size, VM_PROT_DEFAULT, TRUE); + if (result != KERN_SUCCESS) { + goto finish; + } + + ncmds = machhdr->ncmds; + src = dst = (u_char *)(kmod_info->address + sizeof(*machhdr)); + + for (i = 0; i < ncmds; ++i, src += cmdsize) { + struct load_command * lc = (struct load_command *) src; + cmdsize = lc->cmdsize; + + switch (lc->cmd) { + case LC_SEGMENT: + case LC_SEGMENT_64: + if (src != (u_char *)linkedit) break; + /* FALLTHROUGH */ + case LC_SYMTAB: + case LC_DYSYMTAB: + bzero(src, cmdsize); + machhdr->ncmds--; + machhdr->sizeofcmds -= cmdsize; + continue; + } + + memmove(dst, src, cmdsize); + dst += cmdsize; + } + + result = vm_map_protect(kext_map, kmod_info->address, + kmod_info->address + kmod_info->hdr_size, VM_PROT_READ, TRUE); + if (result != KERN_SUCCESS) { + goto finish; + } + + result = KERN_SUCCESS; + +finish: + return result; +} + +/********************************************************************* *********************************************************************/ void OSKext::setLinkedExecutable(OSData * anExecutable) @@ -4501,6 +5002,73 @@ OSKext::setLinkedExecutable(OSData * anExecutable) return; } +#if CONFIG_DTRACE +/********************************************************************* +* Go through all loaded kexts and tell them to register with dtrace. 
+* The instance method only registers if necessary. +*********************************************************************/ +/* static */ +void +OSKext::registerKextsWithDTrace(void) +{ + uint32_t count = sLoadedKexts->getCount(); + uint32_t i; + + IORecursiveLockLock(sKextLock); + + for (i = 0; i < count; i++) { + OSKext * thisKext = NULL; // do not release + + thisKext = OSDynamicCast(OSKext, sLoadedKexts->getObject(i)); + if (!thisKext || !thisKext->isExecutable()) { + continue; + } + + thisKext->registerWithDTrace(); + } + + IORecursiveLockUnlock(sKextLock); + + return; +} + +extern "C" { + extern int (*dtrace_modload)(struct kmod_info *); + extern int (*dtrace_modunload)(struct kmod_info *); +}; + +/********************************************************************* +*********************************************************************/ +void +OSKext::registerWithDTrace(void) +{ + /* Register kext with dtrace. A dtrace_modload failure should not + * prevent a kext from loading, so we ignore the return code. + */ + if (!flags.dtraceInitialized && (dtrace_modload != NULL)) { + (void)(*dtrace_modload)(kmod_info); + flags.dtraceInitialized = true; + jettisonLinkeditSegment(); + } + return; +} +/********************************************************************* +*********************************************************************/ +void +OSKext::unregisterWithDTrace(void) +{ + /* Unregister kext with dtrace. A dtrace_modunload failure should not + * prevent a kext from loading, so we ignore the return code. 
+ */ + if (flags.dtraceInitialized && (dtrace_modunload != NULL)) { + (void)(*dtrace_modunload)(kmod_info); + flags.dtraceInitialized = false; + } + return; +} +#endif /* CONFIG_DTRACE */ + + /********************************************************************* * called only by loadExecutable() *********************************************************************/ @@ -4566,9 +5134,11 @@ OSKext::setVMProtections(void) goto finish; } - result = vm_map_wire(kext_map, start, end, seg->initprot, FALSE); - if (result != KERN_SUCCESS) { - goto finish; + if (segmentShouldBeWired(seg)) { + result = vm_map_wire(kext_map, start, end, seg->initprot, FALSE); + if (result != KERN_SUCCESS) { + goto finish; + } } seg = nextsegfromheader((kernel_mach_header_t *) kmod_info->address, seg); @@ -4578,6 +5148,14 @@ OSKext::setVMProtections(void) return result; } +/********************************************************************* +*********************************************************************/ +boolean_t +OSKext::segmentShouldBeWired(kernel_segment_command_t *seg) +{ + return (sKeepSymbols || strncmp(seg->segname, SEG_LINKEDIT, sizeof(seg->segname))); +} + /********************************************************************* *********************************************************************/ OSReturn @@ -4587,6 +5165,7 @@ OSKext::validateKextMapping(bool startFlag) const char * whichOp = startFlag ? "start" : "stop"; kern_return_t kern_result = 0; vm_map_t kext_map = NULL; + kernel_segment_command_t * seg = NULL; mach_vm_address_t address = 0; mach_vm_size_t size = 0; uint32_t depth = 0; @@ -4682,23 +5261,18 @@ OSKext::validateKextMapping(bool startFlag) goto finish; } - /* Verify that the kext is backed by physical memory. + /* Verify that the kext's segments are backed by physical memory. 
*/ - for (address = kmod_info->address; - address < round_page(kmod_info->address + kmod_info->size); - address += PAGE_SIZE) - { - if (!pmap_find_phys(kernel_pmap, (vm_offset_t)address)) { - OSKextLog(this, - kOSKextLogErrorLevel | - kOSKextLogLoadFlag, - "Kext %s - page %p is not backed by physical memory.", - getIdentifierCString(), - (void *)address); + seg = firstsegfromheader((kernel_mach_header_t *)kmod_info->address); + while (seg) { + if (!verifySegmentMapping(seg)) { result = kOSKextReturnBadData; goto finish; } + + seg = nextsegfromheader((kernel_mach_header_t *) kmod_info->address, seg); } + } result = kOSReturnSuccess; @@ -4706,6 +5280,33 @@ OSKext::validateKextMapping(bool startFlag) return result; } +/********************************************************************* +*********************************************************************/ +boolean_t +OSKext::verifySegmentMapping(kernel_segment_command_t *seg) +{ + mach_vm_address_t address = 0; + + if (!segmentShouldBeWired(seg)) return true; + + for (address = seg->vmaddr; + address < round_page(seg->vmaddr + seg->vmsize); + address += PAGE_SIZE) + { + if (!pmap_find_phys(kernel_pmap, (vm_offset_t)address)) { + OSKextLog(this, + kOSKextLogErrorLevel | + kOSKextLogLoadFlag, + "Kext %s - page %p is not backed by physical memory.", + getIdentifierCString(), + (void *)address); + return false; + } + } + + return true; +} + /********************************************************************* *********************************************************************/ OSReturn @@ -4805,14 +5406,14 @@ OSKext::start(bool startDependenciesFlag) flags.starting = 1; -#if !__i386__ && !__ppc__ +#if !CONFIG_STATIC_CPPINIT result = OSRuntimeInitializeCPP(kmod_info, NULL); if (result == KERN_SUCCESS) { #endif result = startfunc(kmod_info, kmodStartData); -#if !__i386__ && !__ppc__ +#if !CONFIG_STATIC_CPPINIT if (result != KERN_SUCCESS) { (void) OSRuntimeFinalizeCPP(kmod_info, NULL); } @@ -4951,10 +5552,6 @@ 
OSKext::stop(void) goto finish; } - /* Save the list of loaded kexts in case we panic. - */ - OSKext::saveUnloadedKextPanicList(this); - stopfunc = kmod_info->stop; if (stopfunc) { OSKextLog(this, @@ -4966,7 +5563,7 @@ OSKext::stop(void) flags.stopping = 1; result = stopfunc(kmod_info, /* userData */ NULL); -#if !__i386__ && !__ppc__ +#if !CONFIG_STATIC_CPPINIT if (result == KERN_SUCCESS) { result = OSRuntimeFinalizeCPP(kmod_info, NULL); } @@ -5030,7 +5627,6 @@ OSKext::unload(void) goto finish; } - if (hasOSMetaClassInstances()) { OSKextLog(this, kOSKextLogErrorLevel | @@ -5059,6 +5655,10 @@ OSKext::unload(void) * of unloading. */ flags.unloading = 1; + + /* Update the string describing the last kext to unload in case we panic. + */ + savePanicString(/* isLoading */ false); if (isStarted()) { result = stop(); @@ -5113,7 +5713,7 @@ OSKext::unload(void) } OSKext * lastKext = OSDynamicCast(OSKext, sLoadedKexts->getLastObject()); - if (lastKext && lastKext != sKernelKext) { + if (lastKext && !lastKext->isKernel()) { kmod = lastKext->kmod_info; } else { kmod = NULL; // clear the global kmod variable @@ -5134,35 +5734,56 @@ OSKext::unload(void) num_kmod_refs * sizeof(kmod_reference_t)); } - /* If we have a linked executable, release & clear it, and then - * unwire & deallocate the buffer the OSData wrapped. - */ +#if CONFIG_DTRACE + unregisterWithDTrace(); +#endif /* CONFIG_DTRACE */ + + notifyKextUnloadObservers(this); + + /* Unwire and free the linked executable. + */ if (linkedExecutable) { - vm_map_t kext_map; + if (!isInterface()) { + kernel_segment_command_t *seg = NULL; + vm_map_t kext_map = kext_get_vm_map(kmod_info); - /* linkedExecutable is just a wrapper for the executable and doesn't - * free it. 
- */ - linkedExecutable->release(); - linkedExecutable = NULL; + if (!kext_map) { + OSKextLog(this, + kOSKextLogErrorLevel | + kOSKextLogLoadFlag, + "Failed to free kext %s; couldn't find the kext map.", + getIdentifierCString()); + result = kOSKextReturnInternalError; + goto finish; + } - OSKextLog(this, - kOSKextLogProgressLevel | - kOSKextLogLoadFlag, - "Kext %s unwiring and unmapping linked executable.", - getIdentifierCString()); + OSKextLog(this, + kOSKextLogProgressLevel | + kOSKextLogLoadFlag, + "Kext %s unwiring and unmapping linked executable.", + getIdentifierCString()); + + seg = firstsegfromheader((kernel_mach_header_t *)kmod_info->address); + while (seg) { + if (segmentShouldBeWired(seg)) { + result = vm_map_unwire(kext_map, seg->vmaddr, + seg->vmaddr + seg->vmsize, FALSE); + if (result != KERN_SUCCESS) { + OSKextLog(this, + kOSKextLogErrorLevel | + kOSKextLogLoadFlag, + "Failed to unwire kext %s.", + getIdentifierCString()); + result = kOSKextReturnInternalError; + goto finish; + } + } - kext_map = kext_get_vm_map(kmod_info); - if (kext_map) { - // xxx - do we have to do this before freeing? Why can't we just free it? - // xxx - we should be able to set a dealloc func on the linkedExecutable - result = vm_map_unwire(kext_map, - kmod_info->address + kmod_info->hdr_size, - kmod_info->address + kmod_info->size, FALSE); - if (result == KERN_SUCCESS) { - kext_free(kmod_info->address, kmod_info->size); + seg = nextsegfromheader((kernel_mach_header_t *) kmod_info->address, seg); } } + + OSSafeReleaseNULL(linkedExecutable); } /* An interface kext has a fake kmod_info that was allocated, @@ -5177,17 +5798,80 @@ OSKext::unload(void) flags.loaded = false; flushDependencies(); + /* save a copy of the bundle ID for us to check when deciding to + * rebuild the kernel cache file. If a kext was already in the kernel + * cache and unloaded then later loaded we do not need to rebuild the + * kernel cache. 
9055303 + */ + if (isPrelinked()) { + sUnloadedPrelinkedKexts->setObject(bundleID); + } + OSKextLog(this, kOSKextLogProgressLevel | kOSKextLogLoadFlag, "Kext %s unloaded.", getIdentifierCString()); + queueKextNotification(kKextRequestPredicateUnloadNotification, + OSDynamicCast(OSString, bundleID)); + finish: OSKext::saveLoadedKextPanicList(); + OSKext::updateLoadedKextSummaries(); flags.unloading = 0; return result; } +/********************************************************************* +* Assumes sKextLock is held. +*********************************************************************/ +/* static */ +OSReturn +OSKext::queueKextNotification( + const char * notificationName, + OSString * kextIdentifier) +{ + OSReturn result = kOSReturnError; + OSDictionary * loadRequest = NULL; // must release + + if (!kextIdentifier) { + result = kOSKextReturnInvalidArgument; + goto finish; + } + + /* Create a new request unless one is already sitting + * in sKernelRequests for this bundle identifier + */ + result = _OSKextCreateRequest(notificationName, &loadRequest); + if (result != kOSReturnSuccess) { + goto finish; + } + if (!_OSKextSetRequestArgument(loadRequest, + kKextRequestArgumentBundleIdentifierKey, kextIdentifier)) { + + result = kOSKextReturnNoMemory; + goto finish; + } + if (!sKernelRequests->setObject(loadRequest)) { + result = kOSKextReturnNoMemory; + goto finish; + } + + /* We might want to only queue the notification if kextd is active, + * but that wouldn't work for embedded. Note that we don't care if + * the ping immediately succeeds here so don't do anything with the + * result of this call. 
+ */ + OSKext::pingKextd(); + + result = kOSReturnSuccess; + +finish: + OSSafeRelease(loadRequest); + + return result; +} + /********************************************************************* *********************************************************************/ static void @@ -5195,8 +5879,7 @@ _OSKextConsiderDestroyingLinkContext( __unused thread_call_param_t p0, __unused thread_call_param_t p1) { - /* Once both recursive locks are taken in correct order, we shouldn't - * have to worry about further recursive lock takes. + /* Take multiple locks in the correct order. */ IORecursiveLockLock(sKextLock); IORecursiveLockLock(sKextInnerLock); @@ -5235,9 +5918,8 @@ _OSKextConsiderDestroyingLinkContext( * to avoid deadlocks with IOService, with which OSKext has a reciprocal * call relationship. * -* Do not call any function that takes sKextLock here! This function -* can be invoked with sKextInnerLock, and the two must always -* be taken in the order: sKextLock -> sKextInnerLock. +* This function must be invoked with sKextInnerLock held. +* Do not call any function that takes sKextLock here! 
*********************************************************************/ /* static */ void @@ -5264,56 +5946,11 @@ OSKext::considerDestroyingLinkContext(void) goto finish; } - thread_call_enter(sDestroyLinkContextThread); - -finish: - IORecursiveLockUnlock(sKextInnerLock); - return; -} - -/********************************************************************* -*********************************************************************/ -OSData * -OSKext::getKernelLinkState() -{ - kern_return_t kxldResult; - u_char * kernel = NULL; - size_t kernelLength; - u_char * linkStateBytes = NULL; - u_long linkStateLength; - OSData * linkState = NULL; - - if (sKernelKext && sKernelKext->linkState) { - goto finish; - } - - kernel = (u_char *)&_mh_execute_header; - kernelLength = getlastaddr() - (vm_offset_t)kernel; - - kxldResult = kxld_link_file(sKxldContext, - kernel, - kernelLength, - kOSKextKernelIdentifier, - /* callbackData */ NULL, - /* dependencies */ NULL, - /* numDependencies */ 0, - /* linkedObjectOut */ NULL, - /* kmod_info_kern out */ NULL, - &linkStateBytes, - &linkStateLength, - /* symbolFile */ NULL, - /* symbolFileSize */ NULL); - if (kxldResult) { - panic("Can't generate kernel link state; no kexts can be loaded."); - goto finish; - } - - linkState = OSData::withBytesNoCopy(linkStateBytes, linkStateLength); - linkState->setDeallocFunction(&osdata_kmem_free); - sKernelKext->linkState = linkState; + thread_call_enter(sDestroyLinkContextThread); finish: - return sKernelKext->linkState; + IORecursiveLockUnlock(sKextInnerLock); + return; } #if PRAGMA_MARK @@ -5323,6 +5960,7 @@ OSKext::getKernelLinkState() * This is a static method because the kext will be deallocated if it * does unload! 
*********************************************************************/ +/* static */ OSReturn OSKext::autounloadKext(OSKext * aKext) { @@ -5373,14 +6011,27 @@ _OSKextConsiderUnloads( bool didUnload = false; unsigned int count, i; - /* Once both recursive locks are taken in correct order, we shouldn't - * have to worry about further recursive lock takes. + /* Take multiple locks in the correct order + * (note also sKextSummaries lock further down). */ IORecursiveLockLock(sKextLock); IORecursiveLockLock(sKextInnerLock); OSKext::flushNonloadedKexts(/* flushPrelinkedKexts */ true); + IOLockLock(sKextSummariesLock); + + /* If there is an old kext summary, free that now. + */ + if (sPrevLoadedKextSummaries) { + kmem_free(kernel_map, (vm_offset_t)sPrevLoadedKextSummaries, + sPrevLoadedKextSummariesAllocSize); + sPrevLoadedKextSummaries = NULL; + sPrevLoadedKextSummariesAllocSize = 0; + } + + IOLockUnlock(sKextSummariesLock); + /* If the system is powering down, don't try to unload anything. */ if (sSystemSleep) { @@ -5405,7 +6056,7 @@ _OSKextConsiderUnloads( OSBoolean * stale = OSDynamicCast(OSBoolean, callbackRecord->getObject(kKextRequestStaleKey)); - if (stale && stale->isTrue()) { + if (stale == kOSBooleanTrue) { OSKext::invokeRequestCallback(callbackRecord, kOSKextReturnTimeout); } else { @@ -5442,8 +6093,8 @@ _OSKextConsiderUnloads( sConsiderUnloadsPending = false; sConsiderUnloadsExecuted = true; - (void) OSKext::considerRebuildOfPrelinkedKernel(); - + (void) OSKext::considerRebuildOfPrelinkedKernel(NULL); + IORecursiveLockUnlock(sKextInnerLock); IORecursiveLockUnlock(sKextLock); @@ -5463,6 +6114,9 @@ void OSKext::considerUnloads(Boolean rescheduleOnlyFlag) sUnloadCallout = thread_call_allocate(&_OSKextConsiderUnloads, 0); } + /* we only reset delay value for unloading if we already have something + * pending. rescheduleOnlyFlag should not start the count down. 
+ */ if (rescheduleOnlyFlag && !sConsiderUnloadsPending) { goto finish; } @@ -5540,13 +6194,40 @@ IOReturn OSKextSystemSleepOrWake(UInt32 messageType) *********************************************************************/ /* static */ void -OSKext::considerRebuildOfPrelinkedKernel(void) +OSKext::considerRebuildOfPrelinkedKernel(OSString * moduleName) { OSReturn checkResult = kOSReturnError; static bool requestedPrelink = false; OSDictionary * prelinkRequest = NULL; // must release IORecursiveLockLock(sKextLock); + + /* moduleName is only passed when we see a load come in. We are only + * interested in rebuilding the kernel cache if the kext we are loading + * is not already in the original kernel cache. 9055303 + */ + if (moduleName) { + int count = sUnloadedPrelinkedKexts->getCount(); + int i; + + for (i = 0; i < count; i++) { + const OSSymbol * myBundleID; // do not release + + myBundleID = OSDynamicCast(OSSymbol, sUnloadedPrelinkedKexts->getObject(i)); + if (!myBundleID) continue; + if (moduleName->isEqualTo(myBundleID->getCStringNoCopy())) { + OSKextLog(/* kext */ NULL, + kOSKextLogDetailLevel | + kOSKextLogArchiveFlag, + "bundleID %s already in cache skipping rebuild.", + myBundleID->getCStringNoCopy()); + + /* no need to rebuild, already in kernel cache */ + goto finish; + } + } + (void) OSKext::setDeferredLoadSucceeded(); + } if (!sDeferredLoadSucceeded || !sConsiderUnloadsExecuted || sSafeBoot || requestedPrelink) @@ -5569,7 +6250,7 @@ OSKext::considerRebuildOfPrelinkedKernel(void) goto finish; } - OSKextPingKextd(); + OSKext::pingKextd(); requestedPrelink = true; finish: @@ -5745,7 +6426,24 @@ OSKext::resolveDependencies( libraryVersion->getCStringNoCopy()); goto finish; } - + + /* If a nonprelinked library somehow got into the mix for a + * prelinked kext, at any point in the chain, we must fail + * because the prelinked relocs for the library will be all wrong. 
+ */ + if (this->isPrelinked() && + libraryKext->declaresExecutable() && + !libraryKext->isPrelinked()) { + + OSKextLog(this, + kOSKextLogErrorLevel | + kOSKextLogDependenciesFlag, + "Kext %s (prelinked) - library kext %s (v%s) not prelinked.", + getIdentifierCString(), library_id, + libraryVersion->getCStringNoCopy()); + goto finish; + } + if (!libraryKext->resolveDependencies(loopStack)) { goto finish; } @@ -5815,8 +6513,16 @@ OSKext::resolveDependencies( } } + if (hasRawKernelDependency) { + OSKextLog(this, + kOSKextLogErrorLevel | + kOSKextLogValidationFlag | kOSKextLogDependenciesFlag, + "Error - kext %s declares a dependency on %s, which is not permitted.", + getIdentifierCString(), KERNEL_LIB); + goto finish; + } #if __LP64__ - if (hasRawKernelDependency || hasKernelDependency) { + if (hasKernelDependency) { OSKextLog(this, kOSKextLogErrorLevel | kOSKextLogValidationFlag | kOSKextLogDependenciesFlag, @@ -5838,17 +6544,7 @@ OSKext::resolveDependencies( // xxx - is it invalid to do both "com.apple.kernel" and any // xxx - "com.apple.kernel.*"? - if (hasRawKernelDependency && hasKernelDependency) { - OSKextLog(this, - kOSKextLogErrorLevel | - kOSKextLogValidationFlag | kOSKextLogDependenciesFlag, - "Error - kext %s declares dependencies on both " - "%s and %s.", - getIdentifierCString(), KERNEL_LIB, KERNEL6_LIB); - goto finish; - } - - if ((hasRawKernelDependency || hasKernelDependency) && hasKPIDependency) { + if (hasKernelDependency && hasKPIDependency) { OSKextLog(this, kOSKextLogWarningLevel | kOSKextLogDependenciesFlag, @@ -5857,7 +6553,7 @@ OSKext::resolveDependencies( getIdentifierCString(), KERNEL_LIB, KPI_LIB_PREFIX); } - if (!hasRawKernelDependency && !hasKernelDependency && !hasKPIDependency) { + if (!hasKernelDependency && !hasKPIDependency) { // xxx - do we want to use validation flag for these too? OSKextLog(this, kOSKextLogWarningLevel | @@ -5881,9 +6577,11 @@ OSKext::resolveDependencies( * its indirect dependencies to simulate old-style linking. 
XXX - Should * check for duplicates. */ - if (!hasRawKernelDependency && !hasKPIDependency) { + if (!hasKPIDependency) { unsigned int i; + flags.hasBleedthrough = true; + count = getNumDependencies(); /* We add to the dependencies array in this loop, but do not iterate @@ -6107,6 +6805,8 @@ OSKext::addClass( } } + notifyAddClassObservers(this, aClass, flags); + result = kOSReturnSuccess; finish: @@ -6154,6 +6854,8 @@ OSKext::removeClass( metaClasses->removeObject(aClass); + notifyRemoveClassObservers(this, aClass, flags); + result = kOSReturnSuccess; finish: @@ -6285,12 +6987,12 @@ OSKext::handleRequest( char * response = NULL; // returned by reference uint32_t responseLength = 0; - OSObject * parsedXML = NULL; // must release + OSObject * parsedXML = NULL; // must release OSDictionary * requestDict = NULL; // do not release OSString * errorString = NULL; // must release OSData * responseData = NULL; // must release - OSObject * responseObject = NULL; // must release + OSObject * responseObject = NULL; // must release OSSerialize * serializer = NULL; // must release @@ -6365,7 +7067,7 @@ OSKext::handleRequest( result = kOSKextReturnNotPrivileged; if (hostPriv == HOST_PRIV_NULL) { if (!predicate->isEqualTo(kKextRequestPredicateGetLoaded) && - !predicate->isEqualTo(kKextRequestPredicateGetKernelLinkState) && + !predicate->isEqualTo(kKextRequestPredicateGetKernelImage) && !predicate->isEqualTo(kKextRequestPredicateGetKernelLoadAddress)) { goto finish; @@ -6444,7 +7146,10 @@ OSKext::handleRequest( result = OSKext::dispatchResource(requestDict); } else if (predicate->isEqualTo(kKextRequestPredicateGetLoaded)) { - OSBoolean * delayAutounloadBool = NULL; + OSBoolean * delayAutounloadBool = NULL; + OSObject * infoKeysRaw = NULL; + OSArray * infoKeys = NULL; + uint32_t infoKeysCount = 0; delayAutounloadBool = OSDynamicCast(OSBoolean, _OSKextGetRequestArgument(requestDict, @@ -6457,8 +7162,31 @@ OSKext::handleRequest( OSKext::considerUnloads(/* rescheduleOnly? 
*/ true); } - responseObject = OSDynamicCast(OSObject, - OSKext::copyLoadedKextInfo(kextIdentifiers)); + infoKeysRaw = _OSKextGetRequestArgument(requestDict, + kKextRequestArgumentInfoKeysKey); + infoKeys = OSDynamicCast(OSArray, infoKeysRaw); + if (infoKeysRaw && !infoKeys) { + OSKextLog(/* kext */ NULL, + kOSKextLogErrorLevel | + kOSKextLogIPCFlag, + "Invalid arguments to kext info request."); + goto finish; + } + + if (infoKeys) { + infoKeysCount = infoKeys->getCount(); + for (uint32_t i = 0; i < infoKeysCount; i++) { + if (!OSDynamicCast(OSString, infoKeys->getObject(i))) { + OSKextLog(/* kext */ NULL, + kOSKextLogErrorLevel | + kOSKextLogIPCFlag, + "Invalid arguments to kext info request."); + goto finish; + } + } + } + + responseObject = OSKext::copyLoadedKextInfo(kextIdentifiers, infoKeys); if (!responseObject) { result = kOSKextReturnInternalError; } else { @@ -6489,16 +7217,15 @@ OSKext::handleRequest( (unsigned long long)textseg->vmaddr); addressNum = OSNumber::withNumber((long long unsigned int)textseg->vmaddr, 8 * sizeof(long long unsigned int)); - responseObject = OSDynamicCast(OSObject, addressNum); + responseObject = addressNum; result = kOSReturnSuccess; - } else if (predicate->isEqualTo(kKextRequestPredicateGetKernelLinkState)) { + } else if (predicate->isEqualTo(kKextRequestPredicateGetKernelImage)) { OSKextLog(/* kext */ NULL, kOSKextLogDebugLevel | kOSKextLogIPCFlag, - "Returning kernel link state."); - responseData = sKernelKext->linkState; - responseData->retain(); + "Returning kernel image."); + responseData = OSKext::copySanitizedKernelImage(); result = kOSReturnSuccess; } else if (predicate->isEqualTo(kKextRequestPredicateGetKernelRequests)) { @@ -6506,7 +7233,7 @@ OSKext::handleRequest( /* Hand the current sKernelRequests array to the caller * (who must release it), and make a new one. 
*/ - responseObject = OSDynamicCast(OSObject, sKernelRequests); + responseObject = sKernelRequests; sKernelRequests = OSArray::withCapacity(0); sPostedKextLoadIdentifiers->flushCollection(); OSKextLog(/* kext */ NULL, @@ -6518,7 +7245,7 @@ OSKext::handleRequest( } else if (predicate->isEqualTo(kKextRequestPredicateGetAllLoadRequests)) { /* Return the set of all requested bundle identifiers */ - responseObject = OSDynamicCast(OSObject, sAllKextLoadIdentifiers); + responseObject = sAllKextLoadIdentifiers; responseObject->retain(); OSKextLog(/* kext */ NULL, kOSKextLogDebugLevel | @@ -6613,7 +7340,7 @@ OSKext::handleRequest( IORecursiveLockUnlock(sKextLock); - OSSafeRelease(requestDict); + OSSafeRelease(parsedXML); OSSafeRelease(errorString); OSSafeRelease(responseData); OSSafeRelease(responseObject); @@ -6626,10 +7353,12 @@ OSKext::handleRequest( /********************************************************************* *********************************************************************/ /* static */ -OSArray * -OSKext::copyLoadedKextInfo(OSArray * kextIdentifiers) +OSDictionary * +OSKext::copyLoadedKextInfo( + OSArray * kextIdentifiers, + OSArray * infoKeys) { - OSArray * result = NULL; + OSDictionary * result = NULL; OSDictionary * kextInfo = NULL; // must release uint32_t count, i; uint32_t idCount = 0; @@ -6645,8 +7374,14 @@ OSKext::copyLoadedKextInfo(OSArray * kextIdentifiers) idCount = kextIdentifiers->getCount(); } + /* Same for keys. 
+ */ + if (infoKeys && !infoKeys->getCount()) { + infoKeys = NULL; + } + count = sLoadedKexts->getCount(); - result = OSArray::withCapacity(count); + result = OSDictionary::withCapacity(count); if (!result) { goto finish; } @@ -6685,8 +7420,10 @@ OSKext::copyLoadedKextInfo(OSArray * kextIdentifiers) continue; } - kextInfo = thisKext->copyInfo(); - result->setObject(kextInfo); + kextInfo = thisKext->copyInfo(infoKeys); + if (kextInfo) { + result->setObject(thisKext->getIdentifier(), kextInfo); + } } finish: @@ -6698,38 +7435,33 @@ OSKext::copyLoadedKextInfo(OSArray * kextIdentifiers) } /********************************************************************* -Load Tag -Bundle ID -Bundle Version -Path -Load Address -Load Size -Wired Size -Version -Dependency Load Tags -# Dependent References -UUID -RetainCount +* Any info that needs to do allocations must goto finish on alloc +* failure. Info that is just a lookup should just not set the object +* if the info does not exist. *********************************************************************/ #define _OSKextLoadInfoDictCapacity (12) OSDictionary * -OSKext::copyInfo(void) -{ - OSDictionary * result = NULL; - bool success = false; - OSNumber * cpuTypeNumber = NULL; // must release - OSNumber * cpuSubtypeNumber = NULL; // must release - OSString * versionString = NULL; // do not release - OSData * uuid = NULL; // must release - OSNumber * scratchNumber = NULL; // must release - OSArray * dependencyLoadTags = NULL; // must release - OSCollectionIterator * metaClassIterator = NULL; // must release - OSArray * metaClassInfo = NULL; // must release - OSDictionary * metaClassDict = NULL; // must release - OSMetaClass * thisMetaClass = NULL; // do not release - OSString * metaClassName = NULL; // must release - OSString * superclassName = NULL; // must release +OSKext::copyInfo(OSArray * infoKeys) +{ + OSDictionary * result = NULL; + bool success = false; + OSData * headerData = NULL; // must release + OSNumber * cpuTypeNumber 
= NULL; // must release + OSNumber * cpuSubtypeNumber = NULL; // must release + OSString * versionString = NULL; // do not release + uint32_t executablePathCStringSize = 0; + char * executablePathCString = NULL; // must release + OSString * executablePathString = NULL; // must release + OSData * uuid = NULL; // must release + OSNumber * scratchNumber = NULL; // must release + OSArray * dependencyLoadTags = NULL; // must release + OSCollectionIterator * metaClassIterator = NULL; // must release + OSArray * metaClassInfo = NULL; // must release + OSDictionary * metaClassDict = NULL; // must release + OSMetaClass * thisMetaClass = NULL; // do not release + OSString * metaClassName = NULL; // must release + OSString * superclassName = NULL; // must release uint32_t count, i; result = OSDictionary::withCapacity(_OSKextLoadInfoDictCapacity); @@ -6737,232 +7469,340 @@ OSKext::copyInfo(void) goto finish; } - /* CPU Type & Subtype. - * Use the CPU type of the kernel for all (loaded) kexts. - * xxx - should we not include this for the kernel components, - * xxx - or for any interface? they have mach-o files, they're just weird. + + /* Empty keys means no keys, but NULL is quicker to check. + */ + if (infoKeys && !infoKeys->getCount()) { + infoKeys = NULL; + } + + /* Headers, CPU type, and CPU subtype. 
*/ - if (linkedExecutable || (this == sKernelKext)) { + if (!infoKeys || + _OSArrayContainsCString(infoKeys, kOSBundleMachOHeadersKey) || + _OSArrayContainsCString(infoKeys, kOSBundleCPUTypeKey) || + _OSArrayContainsCString(infoKeys, kOSBundleCPUSubtypeKey)) + { + + if (linkedExecutable && !isInterface()) { + + kernel_mach_header_t *kext_mach_hdr = (kernel_mach_header_t *) + linkedExecutable->getBytesNoCopy(); + + if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleMachOHeadersKey)) { + headerData = OSData::withBytes(kext_mach_hdr, + (u_int) (sizeof(*kext_mach_hdr) + kext_mach_hdr->sizeofcmds)); + if (!headerData) { + goto finish; + } + result->setObject(kOSBundleMachOHeadersKey, headerData); + } + + if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleCPUTypeKey)) { + cpuTypeNumber = OSNumber::withNumber( + (uint64_t) kext_mach_hdr->cputype, + 8 * sizeof(kext_mach_hdr->cputype)); + if (!cpuTypeNumber) { + goto finish; + } + result->setObject(kOSBundleCPUTypeKey, cpuTypeNumber); + } - cpuTypeNumber = OSNumber::withNumber( - (long long unsigned int)_mh_execute_header.cputype, - 8 * sizeof(_mh_execute_header.cputype)); - if (cpuTypeNumber) { - result->setObject(kOSBundleCPUTypeKey, cpuTypeNumber); + if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleCPUSubtypeKey)) { + cpuSubtypeNumber = OSNumber::withNumber( + (uint64_t) kext_mach_hdr->cpusubtype, + 8 * sizeof(kext_mach_hdr->cpusubtype)); + if (!cpuSubtypeNumber) { + goto finish; + } + result->setObject(kOSBundleCPUSubtypeKey, cpuSubtypeNumber); + } } } - // I don't want to rely on a mach header for nonkernel kexts, yet - if (this == sKernelKext) { - cpuSubtypeNumber = OSNumber::withNumber( - (long long unsigned int)_mh_execute_header.cputype, - 8 * sizeof(_mh_execute_header.cputype)); - if (cpuSubtypeNumber) { - result->setObject(kOSBundleCPUSubtypeKey, cpuSubtypeNumber); - } - } - - /* CFBundleIdentifier. + /* CFBundleIdentifier. We set this regardless because it's just stupid not to. 
*/ result->setObject(kCFBundleIdentifierKey, bundleID); /* CFBundleVersion. */ - versionString = OSDynamicCast(OSString, - getPropertyForHostArch(kCFBundleVersionKey)); - if (versionString) { - result->setObject(kCFBundleVersionKey, versionString); + if (!infoKeys || _OSArrayContainsCString(infoKeys, kCFBundleVersionKey)) { + versionString = OSDynamicCast(OSString, + getPropertyForHostArch(kCFBundleVersionKey)); + if (versionString) { + result->setObject(kCFBundleVersionKey, versionString); + } } /* OSBundleCompatibleVersion. */ - versionString = OSDynamicCast(OSString, - getPropertyForHostArch(kOSBundleCompatibleVersionKey)); - if (versionString) { - result->setObject(kOSBundleCompatibleVersionKey, versionString); + if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleCompatibleVersionKey)) { + versionString = OSDynamicCast(OSString, + getPropertyForHostArch(kOSBundleCompatibleVersionKey)); + if (versionString) { + result->setObject(kOSBundleCompatibleVersionKey, versionString); + } } /* Path. */ - if (path) { - result->setObject(kOSBundlePathKey, path); + if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundlePathKey)) { + if (path) { + result->setObject(kOSBundlePathKey, path); + } } - /* UUID. + + /* OSBundleExecutablePath. 
*/ - uuid = copyUUID(); - if (uuid) { - result->setObject(kOSBundleUUIDKey, uuid); + if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleExecutablePathKey)) { + if (path && executableRelPath) { + + uint32_t pathLength = path->getLength(); // gets incremented below + + // +1 for slash, +1 for \0 + executablePathCStringSize = pathLength + executableRelPath->getLength() + 2; + + executablePathCString = (char *)kalloc((executablePathCStringSize) * + sizeof(char)); // +1 for \0 + if (!executablePathCString) { + goto finish; + } + strlcpy(executablePathCString, path->getCStringNoCopy(), + executablePathCStringSize); + executablePathCString[pathLength++] = '/'; + executablePathCString[pathLength++] = '\0'; /* re-terminate so strlcat appends right after the slash */ + strlcat(executablePathCString, executableRelPath->getCStringNoCopy(), + executablePathCStringSize); + + executablePathString = OSString::withCString(executablePathCString); + + /* Check the OSString just created; the C buffer itself was + * already verified right after kalloc(). + */ + if (!executablePathString) { + goto finish; + } + + result->setObject(kOSBundleExecutablePathKey, executablePathString); + } + } + + /* UUID, if the kext has one. + */ + if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleUUIDKey)) { + uuid = copyUUID(); + if (uuid) { + result->setObject(kOSBundleUUIDKey, uuid); + } } /***** * OSKernelResource, OSBundleIsInterface, OSBundlePrelinked, OSBundleStarted. + * Each flag is gated on its own key. */ - result->setObject(kOSKernelResourceKey, - isKernelComponent() ? kOSBooleanTrue : kOSBooleanFalse); + if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSKernelResourceKey)) { + result->setObject(kOSKernelResourceKey, + isKernelComponent() ? kOSBooleanTrue : kOSBooleanFalse); + } - result->setObject(kOSBundleIsInterfaceKey, - isInterface() ? kOSBooleanTrue : kOSBooleanFalse); + if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleIsInterfaceKey)) { + result->setObject(kOSBundleIsInterfaceKey, + isInterface() ? kOSBooleanTrue : kOSBooleanFalse); + } - result->setObject(kOSBundlePrelinkedKey, - isPrelinked() ?
kOSBooleanTrue : kOSBooleanFalse); + if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundlePrelinkedKey)) { + result->setObject(kOSBundlePrelinkedKey, + isPrelinked() ? kOSBooleanTrue : kOSBooleanFalse); + } - result->setObject(kOSBundleStartedKey, - isStarted() ? kOSBooleanTrue : kOSBooleanFalse); + if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleStartedKey)) { + result->setObject(kOSBundleStartedKey, + isStarted() ? kOSBooleanTrue : kOSBooleanFalse); + } /* LoadTag (Index). */ - scratchNumber = OSNumber::withNumber((unsigned long long)loadTag, - /* numBits */ 8 * sizeof(loadTag)); - if (scratchNumber) { + if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleLoadTagKey)) { + scratchNumber = OSNumber::withNumber((unsigned long long)loadTag, + /* numBits */ 8 * sizeof(loadTag)); + if (!scratchNumber) { + goto finish; + } result->setObject(kOSBundleLoadTagKey, scratchNumber); OSSafeReleaseNULL(scratchNumber); } /* LoadAddress, LoadSize. */ - if (isInterface() || linkedExecutable) { - /* These go to userspace via serialization, so we don't want any doubts - * about their size. - */ - uint64_t loadAddress = 0; - uint32_t loadSize = 0; - uint32_t wiredSize = 0; - - /* Interfaces always report 0 load address & size. - * Just the way they roll. - * - * xxx - leaving in # when we have a linkedExecutable...a kernelcomp - * xxx - shouldn't have one! - */ - if (linkedExecutable /* && !isInterface() */) { - loadAddress = (uint64_t)linkedExecutable->getBytesNoCopy(); - loadSize = linkedExecutable->getLength(); - - /* If we have a kmod_info struct, calculated the wired size - * from that. Otherwise it's the full load size. 
+ if (!infoKeys || + _OSArrayContainsCString(infoKeys, kOSBundleLoadAddressKey) || + _OSArrayContainsCString(infoKeys, kOSBundleLoadSizeKey) || + _OSArrayContainsCString(infoKeys, kOSBundleWiredSizeKey)) + { + if (isInterface() || linkedExecutable) { + /* These go to userspace via serialization, so we don't want any doubts + * about their size. */ - if (kmod_info) { - wiredSize = loadSize - kmod_info->hdr_size; - } else { - wiredSize = loadSize; + uint64_t loadAddress = 0; + uint32_t loadSize = 0; + uint32_t wiredSize = 0; + + /* Interfaces always report 0 load address & size. + * Just the way they roll. + * + * xxx - leaving in # when we have a linkedExecutable...a kernelcomp + * xxx - shouldn't have one! + */ + if (linkedExecutable /* && !isInterface() */) { + loadAddress = (uint64_t)linkedExecutable->getBytesNoCopy(); + loadSize = linkedExecutable->getLength(); + + /* If we have a kmod_info struct, calculated the wired size + * from that. Otherwise it's the full load size. + */ + if (kmod_info) { + wiredSize = loadSize - kmod_info->hdr_size; + } else { + wiredSize = loadSize; + } } - } - scratchNumber = OSNumber::withNumber( - (unsigned long long)(loadAddress), - /* numBits */ 8 * sizeof(loadAddress)); - if (scratchNumber) { - result->setObject(kOSBundleLoadAddressKey, scratchNumber); - OSSafeReleaseNULL(scratchNumber); - } - scratchNumber = OSNumber::withNumber( - (unsigned long long)(loadSize), - /* numBits */ 8 * sizeof(loadSize)); - if (scratchNumber) { - result->setObject(kOSBundleLoadSizeKey, scratchNumber); - OSSafeReleaseNULL(scratchNumber); - } - scratchNumber = OSNumber::withNumber( - (unsigned long long)(wiredSize), - /* numBits */ 8 * sizeof(wiredSize)); - if (scratchNumber) { - result->setObject(kOSBundleWiredSizeKey, scratchNumber); - OSSafeReleaseNULL(scratchNumber); + if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleLoadAddressKey)) { + scratchNumber = OSNumber::withNumber( + (unsigned long long)(loadAddress), + /* numBits */ 8 * 
sizeof(loadAddress)); + if (!scratchNumber) { + goto finish; + } + result->setObject(kOSBundleLoadAddressKey, scratchNumber); + OSSafeReleaseNULL(scratchNumber); + } + if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleLoadSizeKey)) { + scratchNumber = OSNumber::withNumber( + (unsigned long long)(loadSize), + /* numBits */ 8 * sizeof(loadSize)); + if (!scratchNumber) { + goto finish; + } + result->setObject(kOSBundleLoadSizeKey, scratchNumber); + OSSafeReleaseNULL(scratchNumber); + } + if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleWiredSizeKey)) { + scratchNumber = OSNumber::withNumber( + (unsigned long long)(wiredSize), + /* numBits */ 8 * sizeof(wiredSize)); + if (!scratchNumber) { + goto finish; + } + result->setObject(kOSBundleWiredSizeKey, scratchNumber); + OSSafeReleaseNULL(scratchNumber); + } } } - + /* OSBundleDependencies. In descending order for * easy compatibility with kextstat(8). */ - if ((count = getNumDependencies())) { - dependencyLoadTags = OSArray::withCapacity(count); - result->setObject(kOSBundleDependenciesKey, dependencyLoadTags); + if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleDependenciesKey)) { + if ((count = getNumDependencies())) { + dependencyLoadTags = OSArray::withCapacity(count); + result->setObject(kOSBundleDependenciesKey, dependencyLoadTags); - i = count - 1; - do { - OSKext * dependency = OSDynamicCast(OSKext, - dependencies->getObject(i)); + i = count - 1; + do { + OSKext * dependency = OSDynamicCast(OSKext, + dependencies->getObject(i)); - OSSafeReleaseNULL(scratchNumber); - - if (!dependency) { - continue; - } - scratchNumber = OSNumber::withNumber( - (unsigned long long)dependency->getLoadTag(), - /* numBits*/ 8 * sizeof(loadTag)); - if (scratchNumber) { + OSSafeReleaseNULL(scratchNumber); + + if (!dependency) { + continue; + } + scratchNumber = OSNumber::withNumber( + (unsigned long long)dependency->getLoadTag(), + /* numBits*/ 8 * sizeof(loadTag)); + if (!scratchNumber) { + goto finish; 
+ } dependencyLoadTags->setObject(scratchNumber); - } - } while (i--); + } while (i--); + } } OSSafeReleaseNULL(scratchNumber); /* OSBundleMetaClasses. */ - if (metaClasses && metaClasses->getCount()) { - metaClassIterator = OSCollectionIterator::withCollection(metaClasses); - metaClassInfo = OSArray::withCapacity(metaClasses->getCount()); - if (!metaClassIterator || !metaClassInfo) { - goto finish; - } - result->setObject(kOSBundleClassesKey, metaClassInfo); + if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleClassesKey)) { + if (metaClasses && metaClasses->getCount()) { + metaClassIterator = OSCollectionIterator::withCollection(metaClasses); + metaClassInfo = OSArray::withCapacity(metaClasses->getCount()); + if (!metaClassIterator || !metaClassInfo) { + goto finish; + } + result->setObject(kOSBundleClassesKey, metaClassInfo); - while ( (thisMetaClass = OSDynamicCast(OSMetaClass, - metaClassIterator->getNextObject())) ) { + while ( (thisMetaClass = OSDynamicCast(OSMetaClass, + metaClassIterator->getNextObject())) ) { - OSSafeReleaseNULL(metaClassDict); - OSSafeReleaseNULL(metaClassName); - OSSafeReleaseNULL(superclassName); - OSSafeReleaseNULL(scratchNumber); + OSSafeReleaseNULL(metaClassDict); + OSSafeReleaseNULL(scratchNumber); + OSSafeReleaseNULL(metaClassName); + OSSafeReleaseNULL(superclassName); - metaClassDict = OSDictionary::withCapacity(3); - if (!metaClassDict) { - goto finish; - } + metaClassDict = OSDictionary::withCapacity(3); + if (!metaClassDict) { + goto finish; + } - metaClassName = OSString::withCString(thisMetaClass->getClassName()); - if (thisMetaClass->getSuperClass()) { - superclassName = OSString::withCString( - thisMetaClass->getSuperClass()->getClassName()); - } - scratchNumber = OSNumber::withNumber(thisMetaClass->getInstanceCount(), - 8 * sizeof(unsigned int)); - if (!metaClassDict || !metaClassName || !superclassName || - !scratchNumber) { + metaClassName = OSString::withCString(thisMetaClass->getClassName()); + if 
(thisMetaClass->getSuperClass()) { + superclassName = OSString::withCString( + thisMetaClass->getSuperClass()->getClassName()); + } + scratchNumber = OSNumber::withNumber(thisMetaClass->getInstanceCount(), + 8 * sizeof(unsigned int)); + + /* Bail if any of the essentials is missing. The root class lacks a superclass, + * of course. + */ + if (!metaClassDict || !metaClassName || !scratchNumber) { + goto finish; + } - goto finish; + metaClassInfo->setObject(metaClassDict); + metaClassDict->setObject(kOSMetaClassNameKey, metaClassName); + if (superclassName) { + metaClassDict->setObject(kOSMetaClassSuperclassNameKey, superclassName); + } + metaClassDict->setObject(kOSMetaClassTrackingCountKey, scratchNumber); } - - metaClassInfo->setObject(metaClassDict); - metaClassDict->setObject(kOSMetaClassNameKey, metaClassName); - metaClassDict->setObject(kOSMetaClassSuperclassNameKey, superclassName); - metaClassDict->setObject(kOSMetaClassTrackingCountKey, scratchNumber); } } /* OSBundleRetainCount. 
*/ - OSSafeReleaseNULL(scratchNumber); - { - int extRetainCount = getRetainCount() - 1; - if (isLoaded()) { - extRetainCount--; - } - scratchNumber = OSNumber::withNumber( - (int)extRetainCount, - /* numBits*/ 8 * sizeof(int)); - if (scratchNumber) { - result->setObject(kOSBundleRetainCountKey, scratchNumber); + if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleRetainCountKey)) { + OSSafeReleaseNULL(scratchNumber); + { + int kextRetainCount = getRetainCount() - 1; + if (isLoaded()) { + kextRetainCount--; + } + scratchNumber = OSNumber::withNumber( + (int)kextRetainCount, + /* numBits*/ 8 * sizeof(int)); + if (scratchNumber) { + result->setObject(kOSBundleRetainCountKey, scratchNumber); + } } } success = true; + finish: + OSSafeRelease(headerData); OSSafeRelease(cpuTypeNumber); OSSafeRelease(cpuSubtypeNumber); + OSSafeRelease(executablePathString); + if (executablePathString) kfree(executablePathCString, executablePathCStringSize); OSSafeRelease(uuid); OSSafeRelease(scratchNumber); OSSafeRelease(dependencyLoadTags); @@ -6977,6 +7817,216 @@ OSKext::copyInfo(void) return result; } +/********************************************************************/ +static struct symtab_command * getKernelSymtab(void) +{ + struct symtab_command * result = NULL; + struct load_command * load_cmd = NULL; + unsigned long i; + + load_cmd = (struct load_command *) + ((uintptr_t)&_mh_execute_header + sizeof(_mh_execute_header)); + for(i = 0; i < _mh_execute_header.ncmds; i++){ + if (load_cmd->cmd == LC_SYMTAB) { + result = (struct symtab_command *)load_cmd; + goto finish; + } + load_cmd = (struct load_command *) + ((uintptr_t)load_cmd + load_cmd->cmdsize); + } + +finish: + return result; +} + +/********************************************************************* +*********************************************************************/ +/* static */ +OSData * +OSKext::copySanitizedKernelImage(void) +{ + OSData * result = NULL; + + kernel_mach_header_t * kernelHeader = NULL; + 
uint32_t sizeofcmds = 0; + + /* These start out pointing to running kernel but + * after copying point to the copied info. + */ + kernel_segment_command_t * text_seg = NULL; + kernel_segment_command_t * data_seg = NULL; + kernel_segment_command_t * linkedit_seg = NULL; + struct symtab_command * symtab_cmd = NULL; + kernel_section_t * text_const_sect = NULL; + kernel_section_t * data_const_sect = NULL; + + kern_return_t kern_result = 0; + u_long kernelCopyLength = 0; + vm_offset_t kernelCopyAddr = 0; + u_char * kernelCopy = NULL; + + vm_offset_t contentOffset = 0; + struct load_command * scan_cmd = NULL; + kernel_section_t * scan_sect = NULL; + int64_t stroff_shift = 0; + + uint32_t i; + + text_seg = getsegbyname("__TEXT"); + data_seg = getsegbyname("__DATA"); + linkedit_seg = getsegbyname("__LINKEDIT"); + symtab_cmd = getKernelSymtab(); + + text_const_sect = getsectbyname("__TEXT", "__const"); + data_const_sect = getsectbyname("__DATA", "__const"); + + if (!text_seg || !data_seg || !linkedit_seg || !symtab_cmd || + !text_const_sect || ! data_const_sect) { + + OSKextLog(/* kext */ NULL, + kOSKextLogErrorLevel | kOSKextLogIPCFlag, + "Can't provide kernel image for linking; missing component."); + goto finish; + } + + /* Figure the size of the kernel image to build. We don't use the sizes of + * the __TEXT & __DATA segments overall as we only use the __const sections, + * so add those in manually. We're going to round each part to page size + * multiples too, just to be extra cautious. 
+ */ + sizeofcmds = text_seg->cmdsize + data_seg->cmdsize + + linkedit_seg->cmdsize + symtab_cmd->cmdsize; + kernelCopyLength = round_page(sizeof(_mh_execute_header) + sizeofcmds) + + round_page(text_const_sect->size) + + round_page(data_const_sect->size) + + round_page(linkedit_seg->filesize); + + kern_result = kmem_alloc(kernel_map, &kernelCopyAddr, kernelCopyLength); + if (kern_result != KERN_SUCCESS) { + goto finish; + } + + kernelCopy = (u_char *)kernelCopyAddr; + bzero(kernelCopy, kernelCopyLength); // ??? - is this really necessary? + + /***** + * Copy the kernel Mach header and the load commands we want. + */ + memcpy(kernelCopy, &_mh_execute_header, sizeof(_mh_execute_header)); + kernelHeader = (kernel_mach_header_t *)kernelCopy; + kernelHeader->ncmds = 0; + kernelHeader->sizeofcmds = sizeofcmds; + contentOffset = round_page(sizeof(_mh_execute_header) + sizeofcmds); + + /* __TEXT segment load command and sections. + * Note that the __TEXT segment's 'offset' and 'filesize' include + * the data from the beginning of the mach header. + * + * Don't muck with the __TEXT segment's vmsize here; + * user-space linking requires it to match what is in the running kernel. + * We'll just have to live with it not being accurate + * (not like we can run the sanitized image after all). 
+ */ + scan_cmd = (struct load_command *)&kernelHeader[1]; // just past mach header + memcpy(scan_cmd, text_seg, text_seg->cmdsize); + kernelHeader->ncmds++; + text_seg = (kernel_segment_command_t *)scan_cmd; // retarget to constructed segment + text_seg->fileoff = 0; + text_seg->filesize = round_page(sizeof(_mh_execute_header) + sizeofcmds); + + scan_sect = (kernel_section_t *)(text_seg + 1); + for (i = 0; i < text_seg->nsects; i++, scan_sect++) { + if (0 == strncmp("__const", scan_sect->sectname, sizeof("__const"))) { + text_const_sect = scan_sect; // retarget to constructed section + + text_seg->filesize += scan_sect->size; + + scan_sect->offset = contentOffset; + contentOffset += scan_sect->size; + + memcpy(kernelCopy + scan_sect->offset, (void *)(uintptr_t)scan_sect->addr, + scan_sect->size); + } else { + scan_sect->addr = 0; + scan_sect->size = 0; + scan_sect->offset = contentOffset; + scan_sect->nreloc = 0; + } + } + + contentOffset = round_page(contentOffset); + + /* __DATA segment load command and sections. + * Leave the vmsize as in the running kernel here, too. 
+ */ + scan_cmd = (struct load_command *)((uintptr_t)scan_cmd + scan_cmd->cmdsize); + memcpy(scan_cmd, data_seg, data_seg->cmdsize); + kernelHeader->ncmds++; + data_seg = (kernel_segment_command_t *)scan_cmd; // retarget to constructed segment + data_seg->fileoff = contentOffset; + data_seg->filesize = 0; + + scan_sect = (kernel_section_t *)(data_seg + 1); + for (i = 0; i < data_seg->nsects; i++, scan_sect++) { + if (0 == strncmp("__const", scan_sect->sectname, sizeof("__const"))) { + data_const_sect = scan_sect; // retarget to constructed section + + data_seg->filesize += scan_sect->size; + + scan_sect->offset = contentOffset; + contentOffset += scan_sect->size; + + memcpy(kernelCopy + scan_sect->offset, (void *)(uintptr_t)scan_sect->addr, + scan_sect->size); + } else { + scan_sect->addr = 0; + scan_sect->size = 0; + scan_sect->offset = contentOffset; + scan_sect->nreloc = 0; + } + } + + contentOffset = round_page(contentOffset); + + /* __LINKEDIT segment load command. + * Leave the vmsize as in the running kernel here, too. + */ + scan_cmd = (struct load_command *)((uintptr_t)scan_cmd + scan_cmd->cmdsize); + memcpy(scan_cmd, linkedit_seg, linkedit_seg->cmdsize); + kernelHeader->ncmds++; + linkedit_seg = (kernel_segment_command_t *)scan_cmd; // retarget to constructed segment + linkedit_seg->fileoff = contentOffset; + linkedit_seg->filesize = linkedit_seg->vmsize; + + contentOffset += round_page(linkedit_seg->vmsize); + + memcpy(kernelCopy + linkedit_seg->fileoff, (void *)(uintptr_t)linkedit_seg->vmaddr, + linkedit_seg->vmsize); + + /* __SYMTAB load command (contents shared with __LINKEDIT). 
+ */ + scan_cmd = (struct load_command *)((uintptr_t)scan_cmd + scan_cmd->cmdsize); + memcpy(scan_cmd, symtab_cmd, symtab_cmd->cmdsize); + kernelHeader->ncmds++; + symtab_cmd = (struct symtab_command *)scan_cmd; // retarget to constructed cmd + stroff_shift = symtab_cmd->stroff - symtab_cmd->symoff; + symtab_cmd->symoff = linkedit_seg->fileoff; + symtab_cmd->stroff = symtab_cmd->symoff + stroff_shift; + + /* Wrap the thing up in an OSData. + */ + result = OSData::withBytesNoCopy(kernelCopy, kernelCopyLength); + if (result) { + result->setDeallocFunction(osdata_kmem_free); + kernelCopy = NULL; + } + +finish: + if (kernelCopy) kmem_free(kernel_map, kernelCopyAddr, kernelCopyLength); + + return result; +} + /********************************************************************* *********************************************************************/ /* static */ @@ -7009,6 +8059,17 @@ OSKext::requestResource( *requestTagOut = kOSKextRequestTagInvalid; } + /* If requests to user space are disabled, don't go any further */ + if (!sKernelRequestsEnabled) { + OSKextLog(/* kext */ NULL, + kOSKextLogErrorLevel | kOSKextLogIPCFlag, + "Can't request resource %s for %s - requests to user space are disabled.", + resourceNameCString, + kextIdentifierCString); + result = kOSKextReturnDisabled; + goto finish; + } + if (!kextIdentifierCString || !resourceNameCString || !callback) { result = kOSKextReturnInvalidArgument; goto finish; @@ -7109,7 +8170,7 @@ OSKext::requestResource( goto finish; } - OSKextPingKextd(); + OSKext::pingKextd(); result = kOSReturnSuccess; if (requestTagOut) { @@ -7153,6 +8214,7 @@ OSKext::requestResource( } /********************************************************************* +* Assumes sKextLock is held. *********************************************************************/ /* static */ OSReturn @@ -7179,6 +8241,7 @@ OSKext::dequeueCallbackForRequestTag( } /********************************************************************* +* Assumes sKextLock is held. 
*********************************************************************/ /* static */ OSReturn @@ -7191,8 +8254,6 @@ OSKext::dequeueCallbackForRequestTag( OSNumber * callbackTagNum = NULL; // do not release unsigned int count, i; - IORecursiveLockLock(sKextLock); - result = kOSReturnError; count = sRequestCallbackRecords->getCount(); for (i = 0; i < count; i++) { @@ -7227,11 +8288,11 @@ OSKext::dequeueCallbackForRequestTag( result = kOSKextReturnNotFound; finish: - IORecursiveLockUnlock(sKextLock); return result; } /********************************************************************* +* Assumes sKextLock is held. *********************************************************************/ /* static */ OSReturn @@ -7250,8 +8311,6 @@ OSKext::dispatchResource(OSDictionary * requestDict) void * context = NULL; // do not free OSKext * callbackKext = NULL; // must release (looked up) - IORecursiveLockLock(sKextLock); - /* Get the args from the request. Right now we need the tag * to look up the callback record, and the result for invoking the callback. */ @@ -7329,7 +8388,6 @@ OSKext::dispatchResource(OSDictionary * requestDict) if (callbackKext) callbackKext->release(); if (callbackRecord) callbackRecord->release(); - IORecursiveLockUnlock(sKextLock); return result; } @@ -7372,6 +8430,7 @@ OSKext::invokeRequestCallback( } /********************************************************************* +* Assumes sKextLock is held. 
*********************************************************************/ /* static */ OSReturn @@ -7383,9 +8442,11 @@ OSKext::cancelRequest( OSDictionary * callbackRecord = NULL; // must release OSData * contextWrapper = NULL; // do not release + IORecursiveLockLock(sKextLock); result = OSKext::dequeueCallbackForRequestTag(requestTag, &callbackRecord); - + IORecursiveLockUnlock(sKextLock); + if (result == kOSReturnSuccess && contextOut) { contextWrapper = OSDynamicCast(OSData, _OSKextGetRequestArgument(callbackRecord, @@ -7399,6 +8460,7 @@ OSKext::cancelRequest( } /********************************************************************* +* Assumes sKextLock is held. *********************************************************************/ void OSKext::invokeOrCancelRequestCallbacks( @@ -7407,8 +8469,6 @@ OSKext::invokeOrCancelRequestCallbacks( { unsigned int count, i; - IORecursiveLockLock(sKextLock); - count = sRequestCallbackRecords->getCount(); if (!count) { goto finish; @@ -7448,11 +8508,11 @@ OSKext::invokeOrCancelRequestCallbacks( } while (i--); finish: - IORecursiveLockUnlock(sKextLock); return; } /********************************************************************* +* Assumes sKextLock is held. 
*********************************************************************/ uint32_t OSKext::countRequestCallbacks(void) @@ -7460,8 +8520,6 @@ OSKext::countRequestCallbacks(void) uint32_t result = 0; unsigned int count, i; - IORecursiveLockLock(sKextLock); - count = sRequestCallbackRecords->getCount(); if (!count) { goto finish; @@ -7494,7 +8552,6 @@ OSKext::countRequestCallbacks(void) } while (i--); finish: - IORecursiveLockUnlock(sKextLock); return result; } @@ -7619,6 +8676,39 @@ static OSReturn _OSDictionarySetCStringValue( return result; } +/********************************************************************* +*********************************************************************/ +static bool _OSArrayContainsCString( + OSArray * array, + const char * cString) +{ + bool result = false; + const OSSymbol * symbol = NULL; + uint32_t count, i; + + if (!array || !cString) { + goto finish; + } + + symbol = OSSymbol::withCStringNoCopy(cString); + if (!symbol) { + goto finish; + } + + count = array->getCount(); + for (i = 0; i < count; i++) { + OSObject * thisObject = array->getObject(i); + if (symbol->isEqualTo(thisObject)) { + result = true; + goto finish; + } + } + +finish: + if (symbol) symbol->release(); + return result; +} + #if PRAGMA_MARK #pragma mark Personalities (IOKit Drivers) #endif @@ -7906,29 +8996,21 @@ OSKext::removePersonalitiesFromCatalog(void) /* static */ OSKextLogSpec OSKext::setUserSpaceLogFilter( - OSKextLogSpec userLogFilter, + OSKextLogSpec newUserLogFilter, bool captureFlag) { OSKextLogSpec result; + bool allocError = false; - IORecursiveLockLock(sKextInnerLock); + /* Do not call any function that takes sKextLoggingLock during + * this critical block. That means do logging after. 
+ */ + IOLockLock(sKextLoggingLock); result = sUserSpaceKextLogFilter; - sUserSpaceKextLogFilter = userLogFilter; - - /* If the config flag itself is changing, log the state change - * going both ways, before setting up the user-space log arrays, - * so that this is only logged in the kernel. - */ - if (sUserSpaceKextLogFilter != result) { - OSKextLog(/* kext */ NULL, - kOSKextLogDebugLevel | - kOSKextLogGeneralFlag, - "User-space log flags changed from 0x%x to 0x%x.", - result, sUserSpaceKextLogFilter); - } + sUserSpaceKextLogFilter = newUserLogFilter; - if (userLogFilter && captureFlag && + if (newUserLogFilter && captureFlag && !sUserSpaceLogSpecArray && !sUserSpaceLogMessageArray) { // xxx - do some measurements for a good initial capacity? @@ -7936,16 +9018,31 @@ OSKext::setUserSpaceLogFilter( sUserSpaceLogMessageArray = OSArray::withCapacity(0); if (!sUserSpaceLogSpecArray || !sUserSpaceLogMessageArray) { - OSKextLog(/* kext */ NULL, - kOSKextLogErrorLevel | - kOSKextLogGeneralFlag, - "Failed to allocate user-space log message arrays."); OSSafeReleaseNULL(sUserSpaceLogSpecArray); OSSafeReleaseNULL(sUserSpaceLogMessageArray); + allocError = true; } } - IORecursiveLockUnlock(sKextInnerLock); + IOLockUnlock(sKextLoggingLock); + + /* If the config flag itself is changing, log the state change + * going both ways, before setting up the user-space log arrays, + * so that this is only logged in the kernel. 
+ */ + if (result != newUserLogFilter) { + OSKextLog(/* kext */ NULL, + kOSKextLogDebugLevel | + kOSKextLogGeneralFlag, + "User-space log flags changed from 0x%x to 0x%x.", + result, newUserLogFilter); + } + if (allocError) { + OSKextLog(/* kext */ NULL, + kOSKextLogErrorLevel | + kOSKextLogGeneralFlag, + "Failed to allocate user-space log message arrays."); + } return result; } @@ -7957,10 +9054,14 @@ OSKext::setUserSpaceLogFilter( OSArray * OSKext::clearUserSpaceLogFilter(void) { - OSArray * result = NULL; + OSArray * result = NULL; OSKextLogSpec oldLogFilter; + OSKextLogSpec newLogFilter = kOSKextLogSilentFilter; - IORecursiveLockLock(sKextInnerLock); + /* Do not call any function that takes sKextLoggingLock during + * this critical block. That means do logging after. + */ + IOLockLock(sKextLoggingLock); result = OSArray::withCapacity(2); if (result) { @@ -7971,25 +9072,26 @@ OSKext::clearUserSpaceLogFilter(void) OSSafeReleaseNULL(sUserSpaceLogMessageArray); oldLogFilter = sUserSpaceKextLogFilter; - sUserSpaceKextLogFilter = kOSKextLogSilentFilter; + sUserSpaceKextLogFilter = newLogFilter; + + IOLockUnlock(sKextLoggingLock); /* If the config flag itself is changing, log the state change * going both ways, after tearing down the user-space log * arrays, so this is only logged within the kernel. */ - if (oldLogFilter != sUserSpaceKextLogFilter) { + if (oldLogFilter != newLogFilter) { OSKextLog(/* kext */ NULL, kOSKextLogDebugLevel | kOSKextLogGeneralFlag, "User-space log flags changed from 0x%x to 0x%x.", - oldLogFilter, sUserSpaceKextLogFilter); + oldLogFilter, newLogFilter); } - IORecursiveLockUnlock(sKextInnerLock); - return result; } + /********************************************************************* * Do not call any function that takes sKextLock here! 
*********************************************************************/ @@ -7999,9 +9101,9 @@ OSKext::getUserSpaceLogFilter(void) { OSKextLogSpec result; - IORecursiveLockLock(sKextInnerLock); + IOLockLock(sKextLoggingLock); result = sUserSpaceKextLogFilter; - IORecursiveLockUnlock(sKextInnerLock); + IOLockUnlock(sKextLoggingLock); return result; } @@ -8120,7 +9222,7 @@ OSKextVLog( OSKext * aKext, OSKextLogSpec msgLogSpec, const char * format, - va_list srcArgList) + va_list srcArgList) { extern int disableConsoleOutput; @@ -8134,7 +9236,7 @@ OSKextVLog( OSString * logString = NULL; // must release char * buffer = stackBuffer; // do not free - IORecursiveLockLock(sKextInnerLock); + IOLockLock(sKextLoggingLock); /* Set the kext/global bit in the message spec if we have no * kext or if the kext requests logging. @@ -8203,12 +9305,13 @@ OSKextVLog( } finish: + IOLockUnlock(sKextLoggingLock); + if (allocBuffer) { kfree(allocBuffer, (length + 1) * sizeof(char)); } OSSafeRelease(logString); OSSafeRelease(logSpecNum); - IORecursiveLockUnlock(sKextInnerLock); return; } @@ -8218,6 +9321,7 @@ OSKextVLog( #pragma mark Backtrace Dump & kmod_get_info() support #endif /********************************************************************* +* This function must be safe to call in panic context. 
*********************************************************************/ /* static */ void @@ -8227,74 +9331,164 @@ OSKext::printKextsInBacktrace( int (* printf_func)(const char *fmt, ...), bool lockFlag) { - vm_offset_t * kscan_addr = NULL; - kmod_info_t * k = NULL; - kmod_reference_t * r = NULL; - unsigned int i; - int found_kmod = 0; + addr64_t summary_page = 0; + addr64_t last_summary_page = 0; + bool found_kmod = false; + u_int i = 0; if (lockFlag) { - IORecursiveLockLock(sKextLock); + IOLockLock(sKextSummariesLock); } - for (k = kmod; k; k = k->next) { - if (pmap_find_phys(kernel_pmap, (addr64_t)((uintptr_t)k)) == 0) { - (*printf_func)(" kmod scan stopped due to missing " - "kmod page: %p\n", k); - break; + if (!gLoadedKextSummaries) { + (*printf_func)(" can't perform kext scan: no kext summary"); + goto finish; + } + + summary_page = trunc_page((addr64_t)(uintptr_t)gLoadedKextSummaries); + last_summary_page = round_page(summary_page + sLoadedKextSummariesAllocSize); + for (; summary_page < last_summary_page; summary_page += PAGE_SIZE) { + if (pmap_find_phys(kernel_pmap, summary_page) == 0) { + (*printf_func)(" can't perform kext scan: " + "missing kext summary page %p", summary_page); + goto finish; + } + } + + for (i = 0; i < gLoadedKextSummaries->numSummaries; ++i) { + OSKextLoadedKextSummary * summary; + + summary = gLoadedKextSummaries->summaries + i; + if (!summary->address) { + continue; } - if (!k->address) { - continue; // skip fake entries for built-in kernel components + + if (!summaryIsInBacktrace(summary, addr, cnt)) { + continue; } - for (i = 0, kscan_addr = addr; i < cnt; i++, kscan_addr++) { - if ((*kscan_addr >= k->address) && - (*kscan_addr < (k->address + k->size))) { - if (!found_kmod) { - (*printf_func)(" Kernel Extensions in backtrace " - "(with dependencies):\n"); - } - found_kmod = 1; - (*printf_func)(" %s(%s)@%p->%p\n", - k->name, k->version, k->address, k->address + k->size - 1); + if (!found_kmod) { + (*printf_func)(" Kernel 
Extensions in backtrace:\n"); + found_kmod = true; + } - for (r = k->reference_list; r; r = r->next) { - kmod_info_t * rinfo; + printSummary(summary, printf_func); + } - if (pmap_find_phys(kernel_pmap, (addr64_t)((uintptr_t)r)) == 0) { - (*printf_func)(" kmod dependency scan stopped " - "due to missing dependency page: %p\n", r); - break; - } +finish: + if (lockFlag) { + IOLockUnlock(sKextSummariesLock); + } - rinfo = r->info; + return; +} - if (pmap_find_phys(kernel_pmap, (addr64_t)((uintptr_t)rinfo)) == 0) { - (*printf_func)(" kmod dependency scan stopped " - "due to missing kmod page: %p\n", rinfo); - break; - } +/********************************************************************* +* This function must be safe to call in panic context. +*********************************************************************/ +/* static */ +boolean_t +OSKext::summaryIsInBacktrace( + OSKextLoadedKextSummary * summary, + vm_offset_t * addr, + unsigned int cnt) +{ + u_int i = 0; + + for (i = 0; i < cnt; i++) { + vm_offset_t kscan_addr = addr[i]; + if ((kscan_addr >= summary->address) && + (kscan_addr < (summary->address + summary->size))) + { + return TRUE; + } + } - if (!rinfo->address) { - continue; // skip fake entries for built-ins - } + return FALSE; +} - (*printf_func)(" dependency: %s(%s)@%p\n", - rinfo->name, rinfo->version, rinfo->address); - } +/********************************************************************* + * scan list of loaded kext summaries looking for a load address match and if + * found return the UUID C string. If not found then set empty string. 
+ *********************************************************************/ +static void findSummaryUUID( + uint32_t tag_ID, + uuid_string_t uuid); - break; // only report this kmod for one backtrace address - } +static void findSummaryUUID( + uint32_t tag_ID, + uuid_string_t uuid) +{ + u_int i; + + uuid[0] = 0x00; // default to no UUID + + for (i = 0; i < gLoadedKextSummaries->numSummaries; ++i) { + OSKextLoadedKextSummary * summary; + + summary = gLoadedKextSummaries->summaries + i; + + if (summary->loadTag == tag_ID) { + (void) uuid_unparse(summary->uuid, uuid); + break; } } + return; +} - if (lockFlag) { - IORecursiveLockUnlock(sKextLock); +/********************************************************************* +* This function must be safe to call in panic context. +*********************************************************************/ +void OSKext::printSummary( + OSKextLoadedKextSummary * summary, + int (* printf_func)(const char *fmt, ...)) +{ + kmod_reference_t * kmod_ref = NULL; + uuid_string_t uuid; + char version[kOSKextVersionMaxLength]; + + if (!OSKextVersionGetString(summary->version, version, sizeof(version))) { + strlcpy(version, "unknown version", sizeof(version)); } + (void) uuid_unparse(summary->uuid, uuid); + (*printf_func)(" %s(%s)[%s]@0x%llx->0x%llx\n", + summary->name, version, uuid, + summary->address, summary->address + summary->size - 1); + + /* print dependency info */ + for (kmod_ref = (kmod_reference_t *) summary->reference_list; + kmod_ref; + kmod_ref = kmod_ref->next) { + kmod_info_t * rinfo; + + if (pmap_find_phys(kernel_pmap, (addr64_t)((uintptr_t)kmod_ref)) == 0) { + (*printf_func)(" kmod dependency scan stopped " + "due to missing dependency page: %p\n", kmod_ref); + break; + } + rinfo = kmod_ref->info; + + if (pmap_find_phys(kernel_pmap, (addr64_t)((uintptr_t)rinfo)) == 0) { + (*printf_func)(" kmod dependency scan stopped " + "due to missing kmod page: %p\n", rinfo); + break; + } + + if (!rinfo->address) { + continue; // skip fake 
entries for built-ins + } + + /* locate UUID in gLoadedKextSummaries */ + findSummaryUUID(rinfo->id, uuid); + + (*printf_func)(" dependency: %s(%s)[%s]@%p\n", + rinfo->name, rinfo->version, uuid, rinfo->address); + } return; } + /******************************************************************************* * substitute() looks at an input string (a pointer within a larger buffer) * for a match to a substring, and on match it writes the marker & substitution @@ -8442,10 +9636,8 @@ assemble_identifier_and_version( } /******************************************************************************* +* Assumes sKextLock is held. *******************************************************************************/ -#define LAST_LOADED " - last loaded " -#define LAST_LOADED_TS_WIDTH (16) - /* static */ uint32_t OSKext::saveLoadedKextPanicListTyped( @@ -8467,12 +9659,23 @@ OSKext::saveLoadedKextPanicListTyped( i = count - 1; do { - OSKext * theKext = OSDynamicCast(OSKext, sLoadedKexts->getObject(i)); - kmod_info_t * kmod_info = theKext->kmod_info; + OSObject * rawKext = sLoadedKexts->getObject(i); + OSKext * theKext = OSDynamicCast(OSKext, rawKext); int match; char identPlusVers[2*KMOD_MAX_NAME]; uint32_t identPlusVersLength; - char timestampBuffer[17]; // enough for a uint64_t + + if (!rawKext) { + printf("OSKext::saveLoadedKextPanicListTyped - " + "NULL kext in loaded kext list; continuing\n"); + continue; + } + + if (!theKext) { + printf("OSKext::saveLoadedKextPanicListTyped - " + "Kext type cast failed in loaded kext list; continuing\n"); + continue; + } /* Skip all built-in kexts. */ @@ -8480,6 +9683,8 @@ OSKext::saveLoadedKextPanicListTyped( continue; } + kmod_info_t * kmod_info = theKext->kmod_info; + /* Filter for kmod name (bundle identifier). */ match = !strncmp(kmod_info->name, prefix, strnlen(prefix, KMOD_MAX_NAME)); @@ -8511,15 +9716,6 @@ OSKext::saveLoadedKextPanicListTyped( goto finish; } - /* We're going to note the last-loaded kext in the list. 
- */ - if (i + 1 == count) { - snprintf(timestampBuffer, sizeof(timestampBuffer), "%llu", - AbsoluteTime_to_scalar(&last_loaded_timestamp)); - identPlusVersLength += sizeof(LAST_LOADED) - 1 + - strnlen(timestampBuffer, sizeof(timestampBuffer)); - } - /* Adding 1 for the newline. */ if (*list_length_ptr + identPlusVersLength + 1 >= list_size) { @@ -8527,10 +9723,6 @@ OSKext::saveLoadedKextPanicListTyped( } *list_length_ptr = strlcat(paniclist, identPlusVers, list_size); - if (i + 1 == count) { - *list_length_ptr = strlcat(paniclist, LAST_LOADED, list_size); - *list_length_ptr = strlcat(paniclist, timestampBuffer, list_size); - } *list_length_ptr = strlcat(paniclist, "\n", list_size); } while (i--); @@ -8555,8 +9747,6 @@ OSKext::saveLoadedKextPanicList(void) uint32_t newlist_size = 0; uint32_t newlist_length = 0; - IORecursiveLockLock(sKextLock); - newlist_length = 0; newlist_size = KEXT_PANICLIST_SIZE; newlist = (char *)kalloc(newlist_size); @@ -8597,90 +9787,64 @@ OSKext::saveLoadedKextPanicList(void) loaded_kext_paniclist_length = newlist_length; finish: - IORecursiveLockUnlock(sKextLock); return; } /********************************************************************* +* Assumes sKextLock is held. 
*********************************************************************/ -/* static */ void -OSKext::saveUnloadedKextPanicList(OSKext * aKext) +OSKext::savePanicString(bool isLoading) { - char * newlist = NULL; - uint32_t newlist_size = 0; - uint32_t newlist_length = 0; - char identPlusVers[2*KMOD_MAX_NAME]; - uint32_t identPlusVersLength; + u_long len; - if (!aKext->kmod_info) { + if (!kmod_info) { return; // do not goto finish here b/c of lock } - IORecursiveLockLock(sKextLock); - - clock_get_uptime(&last_unloaded_timestamp); - last_unloaded_address = (void *)aKext->kmod_info->address; - last_unloaded_size = aKext->kmod_info->size; - - - identPlusVersLength = assemble_identifier_and_version(aKext->kmod_info, - identPlusVers); - if (!identPlusVersLength) { + len = assemble_identifier_and_version(kmod_info, + (isLoading) ? last_loaded_str : last_unloaded_str); + if (!len) { printf("error saving unloaded kext info\n"); goto finish; } - newlist_length = identPlusVersLength; - newlist_size = newlist_length + 1; - newlist = (char *)kalloc(newlist_size); - - if (!newlist) { - printf("couldn't allocate kext panic log buffer\n"); - goto finish; - } - - newlist[0] = '\0'; - - strlcpy(newlist, identPlusVers, newlist_size); - - if (unloaded_kext_paniclist) { - kfree(unloaded_kext_paniclist, unloaded_kext_paniclist_size); + if (isLoading) { + last_loaded_strlen = len; + last_loaded_address = (void *)kmod_info->address; + last_loaded_size = kmod_info->size; + clock_get_uptime(&last_loaded_timestamp); + } else { + last_unloaded_strlen = len; + last_unloaded_address = (void *)kmod_info->address; + last_unloaded_size = kmod_info->size; + clock_get_uptime(&last_unloaded_timestamp); } - unloaded_kext_paniclist = newlist; - unloaded_kext_paniclist_size = newlist_size; - unloaded_kext_paniclist_length = newlist_length; finish: - IORecursiveLockUnlock(sKextLock); return; } /********************************************************************* 
*********************************************************************/ -#if __LP64__ -#define __kLoadSizeEscape "0x%lld" -#else -#define __kLoadSizeEscape "0x%ld" -#endif /* __LP64__ */ - /* static */ void OSKext::printKextPanicLists(int (*printf_func)(const char *fmt, ...)) { - printf_func("unloaded kexts:\n"); - if (unloaded_kext_paniclist && - pmap_find_phys(kernel_pmap, (addr64_t) (uintptr_t) unloaded_kext_paniclist) && - unloaded_kext_paniclist[0]) { + if (last_loaded_strlen) { + printf_func("last loaded kext at %llu: %.*s (addr %p, size %lu)\n", + AbsoluteTime_to_scalar(&last_loaded_timestamp), + last_loaded_strlen, last_loaded_str, + last_loaded_address, last_loaded_size); + } - printf_func( - "%.*s (addr %p, size " __kLoadSizeEscape ") - last unloaded %llu\n", - unloaded_kext_paniclist_length, unloaded_kext_paniclist, - last_unloaded_address, last_unloaded_size, - AbsoluteTime_to_scalar(&last_unloaded_timestamp)); - } else { - printf_func("(none)\n"); + if (last_unloaded_strlen) { + printf_func("last unloaded kext at %llu: %.*s (addr %p, size %lu)\n", + AbsoluteTime_to_scalar(&last_unloaded_timestamp), + last_unloaded_strlen, last_unloaded_str, + last_unloaded_address, last_unloaded_size); } + printf_func("loaded kexts:\n"); if (loaded_kext_paniclist && pmap_find_phys(kernel_pmap, (addr64_t) (uintptr_t) loaded_kext_paniclist) && @@ -8693,21 +9857,163 @@ OSKext::printKextPanicLists(int (*printf_func)(const char *fmt, ...)) return; } +/********************************************************************* +* Assumes sKextLock is held. 
+*********************************************************************/ +/* static */ +void +OSKext::updateLoadedKextSummaries(void) +{ + kern_return_t result = KERN_FAILURE; + OSKextLoadedKextSummaryHeader *summaryHeader = NULL; + OSKextLoadedKextSummaryHeader *summaryHeaderAlloc = NULL; + OSKext *aKext; + vm_map_offset_t start, end; + size_t summarySize = 0; + size_t size; + u_int count; + u_int numKexts; + u_int i, j; + + IOLockLock(sKextSummariesLock); + + count = sLoadedKexts->getCount(); + for (i = 0, numKexts = 0; i < count; ++i) { + aKext = OSDynamicCast(OSKext, sLoadedKexts->getObject(i)); + numKexts += (aKext && aKext->isExecutable()); + } + + if (!numKexts) goto finish; + + /* Calculate the size needed for the new summary headers. + */ + + size = sizeof(*gLoadedKextSummaries); + size += numKexts * sizeof(*gLoadedKextSummaries->summaries); + size = round_page(size); + + /* If the previous summary is large enough, use it (and be sure to make + * it writable). If it's too small, free it and allocate a new buffer. + */ + + if (sPrevLoadedKextSummariesAllocSize < size) { + if (sPrevLoadedKextSummaries) { + kmem_free(kernel_map, (vm_offset_t)sPrevLoadedKextSummaries, + sPrevLoadedKextSummariesAllocSize); + sPrevLoadedKextSummaries = NULL; + sPrevLoadedKextSummariesAllocSize = 0; + } + + result = kmem_alloc(kernel_map, + (vm_offset_t*)&summaryHeaderAlloc, size); + if (result != KERN_SUCCESS) goto finish; + + summaryHeader = summaryHeaderAlloc; + summarySize = size; + } else { + summaryHeader = sPrevLoadedKextSummaries; + summarySize = sPrevLoadedKextSummariesAllocSize; + + start = (vm_map_offset_t) summaryHeader; + end = start + summarySize; + result = vm_map_protect(kernel_map, start, end, VM_PROT_DEFAULT, FALSE); + if (result != KERN_SUCCESS) goto finish; + } + + /* Populate the summary header. 
+ */ + + bzero(summaryHeader, summarySize); + summaryHeader->version = kOSKextLoadedKextSummaryVersion; + summaryHeader->entry_size = sizeof(OSKextLoadedKextSummary); + summaryHeader->numSummaries = numKexts; + + /* Populate each kext summary. + */ + + count = sLoadedKexts->getCount(); + for (i = 0, j = 0; i < count; ++i) { + aKext = OSDynamicCast(OSKext, sLoadedKexts->getObject(i)); + if (!aKext || !aKext->isExecutable()) continue; + + aKext->updateLoadedKextSummary(&summaryHeader->summaries[j++]); + } + + /* Write protect the buffer and move it into place. + */ + + start = (vm_map_offset_t) summaryHeader; + end = start + summarySize; + result = vm_map_protect(kernel_map, start, end, VM_PROT_READ, FALSE); + if (result != KERN_SUCCESS) goto finish; + + sPrevLoadedKextSummaries = gLoadedKextSummaries; + sPrevLoadedKextSummariesAllocSize = sLoadedKextSummariesAllocSize; + + gLoadedKextSummaries = summaryHeader; + sLoadedKextSummariesAllocSize = summarySize; + + summaryHeaderAlloc = NULL; + + /* Call the magic breakpoint function through a static function pointer so + * the compiler can't optimize the function away. + */ + if (sLoadedKextSummariesUpdated) (*sLoadedKextSummariesUpdated)(); + +finish: + IOLockUnlock(sKextSummariesLock); + + /* If we had to allocate a new buffer but failed to generate the summaries, + * free that now. 
+ */ + if (summaryHeaderAlloc) { + kmem_free(kernel_map, (vm_offset_t)summaryHeaderAlloc, summarySize); + } + + return; +} + +/********************************************************************* +*********************************************************************/ +void +OSKext::updateLoadedKextSummary(OSKextLoadedKextSummary *summary) +{ + OSData *uuid; + + strlcpy(summary->name, getIdentifierCString(), + sizeof(summary->name)); + + uuid = copyUUID(); + if (uuid) { + memcpy(summary->uuid, uuid->getBytesNoCopy(), sizeof(summary->uuid)); + OSSafeRelease(uuid); + } + + summary->address = kmod_info->address; + summary->size = kmod_info->size; + summary->version = getVersion(); + summary->loadTag = kmod_info->id; + summary->flags = 0; + summary->reference_list = (uint64_t) kmod_info->reference_list; + + return; +} + /********************************************************************* *********************************************************************/ -#if __ppc__ || __i386__ +#if __i386__ /* static */ kern_return_t OSKext::getKmodInfo( kmod_info_array_t * kmodList, mach_msg_type_number_t * kmodCount) { - kern_return_t result = KERN_FAILURE; - vm_offset_t data; - kmod_info_t * k, * kmod_info_scan_ptr; + kern_return_t result = KERN_FAILURE; + vm_offset_t data = 0; + kmod_info_t * k, * kmod_info_scan_ptr; kmod_reference_t * r, * ref_scan_ptr; - int ref_count; - unsigned size = 0; + int ref_count; + unsigned size = 0; *kmodList = (kmod_info_t *)0; *kmodCount = 0; @@ -8796,7 +10102,7 @@ OSKext::getKmodInfo( } return result; } -#endif /* __ppc__ || __i386__ */ +#endif /* __i386__ */ #if PRAGMA_MARK #pragma mark MAC Framework Support #endif @@ -8849,7 +10155,7 @@ MACFLengthForObject(OSObject * obj) len = sizeof("4294967295"); /* UINT32_MAX */ } else if (typeID == OSTypeID(OSBoolean)) { OSBoolean * boolObj = OSDynamicCast(OSBoolean, obj); - len = boolObj->isTrue() ? sizeof("true") : sizeof("false"); + len = (boolObj == kOSBooleanTrue) ? 
sizeof("true") : sizeof("false"); } else if (typeID == OSTypeID(OSData)) { OSData * dataObj = OSDynamicCast(OSData, obj); len = dataObj->getLength(); @@ -8885,7 +10191,7 @@ MACFInitElementFromObject( } else if (typeID == OSTypeID(OSBoolean)) { OSBoolean * boolObj = OSDynamicCast(OSBoolean, value); element->value_type = MAC_DATA_TYPE_PRIMITIVE; - if (boolObj->isTrue()) { + if (boolObj == kOSBooleanTrue) { strcpy(element->value, "true"); element->value_size = 5; } else { diff --git a/libkern/c++/OSMetaClass.cpp b/libkern/c++/OSMetaClass.cpp index bc67307d2..0696e2b02 100644 --- a/libkern/c++/OSMetaClass.cpp +++ b/libkern/c++/OSMetaClass.cpp @@ -491,6 +491,7 @@ int OSMetaClass::getRetainCount() const { return 0; } const char * OSMetaClass::getClassName() const { + if (!className) return NULL; return className->getCStringNoCopy(); } diff --git a/libkern/c++/OSObject.cpp b/libkern/c++/OSObject.cpp index 814fbbdf6..7da83e9a1 100644 --- a/libkern/c++/OSObject.cpp +++ b/libkern/c++/OSObject.cpp @@ -87,25 +87,6 @@ OSMetaClassDefineReservedUnused(OSObject, 13); OSMetaClassDefineReservedUnused(OSObject, 14); OSMetaClassDefineReservedUnused(OSObject, 15); -#ifdef __ppc__ -OSMetaClassDefineReservedUnused(OSObject, 16); -OSMetaClassDefineReservedUnused(OSObject, 17); -OSMetaClassDefineReservedUnused(OSObject, 18); -OSMetaClassDefineReservedUnused(OSObject, 19); -OSMetaClassDefineReservedUnused(OSObject, 20); -OSMetaClassDefineReservedUnused(OSObject, 21); -OSMetaClassDefineReservedUnused(OSObject, 22); -OSMetaClassDefineReservedUnused(OSObject, 23); -OSMetaClassDefineReservedUnused(OSObject, 24); -OSMetaClassDefineReservedUnused(OSObject, 25); -OSMetaClassDefineReservedUnused(OSObject, 26); -OSMetaClassDefineReservedUnused(OSObject, 27); -OSMetaClassDefineReservedUnused(OSObject, 28); -OSMetaClassDefineReservedUnused(OSObject, 29); -OSMetaClassDefineReservedUnused(OSObject, 30); -OSMetaClassDefineReservedUnused(OSObject, 31); -#endif - static const char *getClassName(const 
OSObject *obj) { const OSMetaClass *meta = obj->getMetaClass(); @@ -115,12 +96,6 @@ static const char *getClassName(const OSObject *obj) bool OSObject::init() { return true; } -#if (!__ppc__) || (__GNUC__ < 3) - -// Implemented in assembler in post gcc 3.x systems as we have a problem -// where the destructor in gcc2.95 gets 2 arguments. The second argument -// appears to be a flag argument. I have copied the assembler from Puma xnu -// to OSRuntimeSupport.c So for 2.95 builds use the C void OSObject::free() { const OSMetaClass *meta = getMetaClass(); @@ -129,7 +104,6 @@ void OSObject::free() meta->instanceDestructed(); delete this; } -#endif /* (!__ppc__) || (__GNUC__ < 3) */ int OSObject::getRetainCount() const { diff --git a/libkern/c++/OSObjectAsm.s b/libkern/c++/OSObjectAsm.s deleted file mode 100644 index eba1bc781..000000000 --- a/libkern/c++/OSObjectAsm.s +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#if __GNUC__ >= 3 - -; -; This function was generated by disassembling the 'OSObject::free(void)' -; function of the Panther7B7 kernel in gdb. -; -; Then add the 'li r4,3' flag taken fropm the Puma kernel 'OSObject::free' -; - .text - - .align 5 - .globl __ZN8OSObject4freeEv - -__ZN8OSObject4freeEv: - ; function prologue - stw r31,-4(r1) - mflr r0 - stw r0,8(r1) - mr r31,r3 - stwu r1,-80(r1) - - ; const OSMetaClass *meta = getMetaClass(); - lwz r9,0(r3) - lwz r12,32(r9) - mtctr r12 - bctrl - - ; if (meta) - ; meta->instanceDestructed(); - cmpwi r3,0 - beq delete_this - bl __ZNK11OSMetaClass18instanceDestructedEv - -delete_this: - ; delete this; - lwz r9,0(r31) - mr r3,r31 - li r4,0 ; Load up some sort of flags, for 2.95 destructors? 
- lwz r0,88(r1) - addi r1,r1,80 - lwz r12,8(r9) - mtlr r0 - lwz r31,-4(r1) - mtctr r12 - bctr - -#endif /* __GNUC__ >= 3 */ diff --git a/libkern/c++/OSOrderedSet.cpp b/libkern/c++/OSOrderedSet.cpp index 47d5f4970..5a5fb83e4 100644 --- a/libkern/c++/OSOrderedSet.cpp +++ b/libkern/c++/OSOrderedSet.cpp @@ -226,9 +226,9 @@ void OSOrderedSet::removeObject(const OSMetaClassBase *anObject) for (i = 0; i < count; i++) { - if( deleted) + if (deleted) array[i-1] = array[i]; - else if( (array[i].obj == anObject)) { + else if (array[i].obj == anObject) { deleted = true; haveUpdated(); // Pity we can't flush the log array[i].obj->taggedRelease(OSTypeID(OSCollection)); diff --git a/libkern/c++/OSRuntime.cpp b/libkern/c++/OSRuntime.cpp index 88df070d0..ae8faf0ef 100644 --- a/libkern/c++/OSRuntime.cpp +++ b/libkern/c++/OSRuntime.cpp @@ -224,6 +224,41 @@ void __pure_virtual( void ) { panic("%s", __FUNCTION__); } typedef void (*structor_t)(void); +/********************************************************************* +*********************************************************************/ +static boolean_t +sectionIsDestructor(kernel_section_t * section) +{ + boolean_t result; + + result = !strncmp(section->sectname, SECT_MODTERMFUNC, + sizeof(SECT_MODTERMFUNC) - 1); +#if !__LP64__ + result = result || !strncmp(section->sectname, SECT_DESTRUCTOR, + sizeof(SECT_DESTRUCTOR) - 1); +#endif + + return result; +} + +/********************************************************************* +*********************************************************************/ +static boolean_t +sectionIsConstructor(kernel_section_t * section) +{ + boolean_t result; + + result = !strncmp(section->sectname, SECT_MODINITFUNC, + sizeof(SECT_MODINITFUNC) - 1); +#if !__LP64__ + result = result || !strncmp(section->sectname, SECT_CONSTRUCTOR, + sizeof(SECT_CONSTRUCTOR) - 1); +#endif + + return result; +} + + /********************************************************************* * OSRuntimeUnloadCPPForSegment() * @@ 
-249,9 +284,7 @@ OSRuntimeUnloadCPPForSegmentInKmod( section != 0; section = nextsect(segment, section)) { - if (strncmp(section->sectname, SECT_DESTRUCTOR, - sizeof(SECT_DESTRUCTOR)) == 0) { - + if (sectionIsDestructor(section)) { structor_t * destructors = (structor_t *)section->addr; if (destructors) { @@ -422,9 +455,7 @@ OSRuntimeInitializeCPP( section != NULL; section = nextsect(segment, section)) { - if (strncmp(section->sectname, SECT_CONSTRUCTOR, - sizeof(SECT_CONSTRUCTOR)) == 0) { - + if (sectionIsConstructor(section)) { structor_t * constructors = (structor_t *)section->addr; if (constructors) { diff --git a/libkern/c++/OSSet.cpp b/libkern/c++/OSSet.cpp index a97158413..f2d5c3e8c 100644 --- a/libkern/c++/OSSet.cpp +++ b/libkern/c++/OSSet.cpp @@ -196,29 +196,36 @@ void OSSet::flushCollection() bool OSSet::setObject(const OSMetaClassBase *anObject) { - if (containsObject(anObject)) + if (containsObject(anObject)) { return false; - else { + } else { haveUpdated(); return members->setObject(anObject); } } -bool OSSet::merge(const OSArray *array) +bool OSSet::merge(const OSArray * array) { - const OSMetaClassBase *anObject; - bool retVal = false; + const OSMetaClassBase * anObject = 0; + bool result = true; -// xx-review: if any setObject fails due to memory allocation failure, -// xx-review: this function should return false - for (int i = 0; (anObject = array->getObject(i)); i++) - if (setObject(anObject)) - retVal = true; + for (int i = 0; (anObject = array->getObject(i)); i++) { - return retVal; + /* setObject() returns false if the object is already in the set, + * so we have to check beforehand here with containsObject(). 
+ */ + if (containsObject(anObject)) { + continue; + } + if (!setObject(anObject)) { + result = false; + } + } + + return result; } -bool OSSet::merge(const OSSet *set) +bool OSSet::merge(const OSSet * set) { return merge(set->members); } diff --git a/libkern/c++/OSSymbol.cpp b/libkern/c++/OSSymbol.cpp index 1d6e6c2f0..5f9fad84e 100644 --- a/libkern/c++/OSSymbol.cpp +++ b/libkern/c++/OSSymbol.cpp @@ -361,8 +361,11 @@ void OSSymbolPool::removeSymbol(OSSymbol *sym) j = thisBucket->count; list = thisBucket->symbolP; - if (!j) + if (!j) { + // couldn't find the symbol; probably means string hash changed + panic("removeSymbol"); return; + } if (j == 1) { probeSymbol = (OSSymbol *) list; @@ -374,6 +377,8 @@ void OSSymbolPool::removeSymbol(OSSymbol *sym) SHRINK_POOL(); return; } + // couldn't find the symbol; probably means string hash changed + panic("removeSymbol"); return; } @@ -399,6 +404,8 @@ void OSSymbolPool::removeSymbol(OSSymbol *sym) SHRINK_POOL(); return; } + // couldn't find the symbol; probably means string hash changed + panic("removeSymbol"); return; } @@ -424,6 +431,8 @@ void OSSymbolPool::removeSymbol(OSSymbol *sym) return; } } + // couldn't find the symbol; probably means string hash changed + panic("removeSymbol"); } /* diff --git a/libkern/c++/Tests/TestSerialization/test1/test1_main.cpp b/libkern/c++/Tests/TestSerialization/test1/test1_main.cpp old mode 100644 new mode 100755 diff --git a/libkern/conf/MASTER b/libkern/conf/MASTER index b303c3603..783f1af08 100644 --- a/libkern/conf/MASTER +++ b/libkern/conf/MASTER @@ -51,6 +51,7 @@ # ident LIBKERN +options HIBERNATION # system hibernation # options KDEBUG # kernel tracing # options GPROF # kernel profiling # options LIBKERNCPP # C++ implementation # @@ -58,6 +59,7 @@ options NETWORKING # kernel networking # options CONFIG_DTRACE # dtrace support # options CRYPTO # cryptographic routines # options ZLIB # zlib support # +options IOKITSTATS # IOKit statistics # options CONFIG_NO_PANIC_STRINGS # options 
CONFIG_NO_PRINTF_STRINGS # @@ -67,6 +69,11 @@ options IPSEC # IP security # options CONFIG_KXLD # kxld/runtime linking of kexts # +# Note that when adding this config option to an architecture, one MUST +# add the architecture to the preprocessor test at the beginning of +# libkern/kmod/cplus_{start.c,stop.c}. +options CONFIG_STATIC_CPPINIT # Static library initializes kext cpp runtime # + # secure_kernel - secure kernel from user programs options SECURE_KERNEL # diff --git a/libkern/conf/MASTER.i386 b/libkern/conf/MASTER.i386 index 2bf2e22da..46f20d9ec 100644 --- a/libkern/conf/MASTER.i386 +++ b/libkern/conf/MASTER.i386 @@ -1,11 +1,10 @@ ###################################################################### # -# RELEASE = [ intel mach libkerncpp networking config_dtrace crypto zlib config_kxld ] +# RELEASE = [ intel mach libkerncpp hibernation networking config_dtrace crypto zlib config_kxld config_static_cppinit iokitstats ] # PROFILE = [ RELEASE profile ] # DEBUG = [ RELEASE debug mach_kdb ] # -# -# EMBEDDED = [ intel mach libkerncpp networking crypto zlib ] +# EMBEDDED = [ intel mach libkerncpp hibernation networking crypto zlib ] # DEVELOPMENT = [ EMBEDDED config_dtrace ] # ###################################################################### diff --git a/libkern/conf/MASTER.ppc b/libkern/conf/MASTER.ppc deleted file mode 100644 index 21e317660..000000000 --- a/libkern/conf/MASTER.ppc +++ /dev/null @@ -1,19 +0,0 @@ -# -###################################################################### -# -# Standard Apple MacOS X Configurations: -# -------- ---- -------- --------------- -# -# RELEASE = [ppc mach libkerncpp networking config_dtrace crypto zlib config_kxld ] -# DEVELOPMENT = [ RELEASE ] -# PROFILE = [RELEASE profile] -# DEBUG = [RELEASE debug mach_kdb ] -# RELEASE_TRACE = [ RELEASE kdebug ] -# DEBUG_TRACE = [ DEBUG kdebug ] -# -###################################################################### - -machine "ppc" # -cpu "ppc" # - -options MACH_KDB # # diff 
--git a/libkern/conf/MASTER.x86_64 b/libkern/conf/MASTER.x86_64 index da71fbe23..a9fd68364 100644 --- a/libkern/conf/MASTER.x86_64 +++ b/libkern/conf/MASTER.x86_64 @@ -1,11 +1,10 @@ ###################################################################### # -# RELEASE = [ intel mach libkerncpp networking config_dtrace crypto zlib config_kxld ] +# RELEASE = [ intel mach libkerncpp hibernation networking config_dtrace crypto zlib config_kxld iokitstats ] # PROFILE = [ RELEASE profile ] # DEBUG = [ RELEASE debug mach_kdb ] # -# -# EMBEDDED = [ intel mach libkerncpp networking crypto zlib ] +# EMBEDDED = [ intel mach libkerncpp hibernation networking crypto zlib ] # DEVELOPMENT = [ EMBEDDED ] # ###################################################################### diff --git a/libkern/conf/Makefile b/libkern/conf/Makefile index 8f54b1af4..f0cf53e3d 100644 --- a/libkern/conf/Makefile +++ b/libkern/conf/Makefile @@ -7,8 +7,7 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir include $(MakeInc_cmd) include $(MakeInc_def) -SETUP_SUBDIRS = \ - tools +SETUP_SUBDIRS = COMP_SUBDIRS = @@ -24,30 +23,24 @@ else export COMPOBJROOT=$(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)/$(COMPONENT) endif -$(COMPOBJROOT)/doconf: - @make build_setup +MASTER_CPU_PER_SOC = $(SOURCE)/MASTER.$(ARCH_CONFIG_LC).$(MACHINE_CONFIG_LC) $(COMPOBJROOT)/$(LIBKERN_KERNEL_CONFIG)/Makefile : $(SOURCE)/MASTER \ $(SOURCE)/MASTER.$(ARCH_CONFIG_LC) \ $(SOURCE)/Makefile.template \ $(SOURCE)/Makefile.$(ARCH_CONFIG_LC) \ $(SOURCE)/files \ - $(SOURCE)/files.$(ARCH_CONFIG_LC) \ - $(COMPOBJROOT)/doconf + $(SOURCE)/files.$(ARCH_CONFIG_LC) $(_v)(doconf_target=$(addsuffix /conf, $(TARGET)); \ $(MKDIR) $${doconf_target}; \ cd $${doconf_target}; \ rm -f $(notdir $?); \ cp $? 
$${doconf_target}; \ - $(COMPOBJROOT)/doconf -c -cpu $(ARCH_CONFIG_LC) -d $(TARGET)/$(LIBKERN_KERNEL_CONFIG) $(LIBKERN_KERNEL_CONFIG); \ + if [ -f $(MASTER_CPU_PER_SOC) ]; then cp $(MASTER_CPU_PER_SOC) $${doconf_target}; fi; \ + $(SRCROOT)/SETUP/config/doconf -c -cpu $(ARCH_CONFIG_LC) -soc $(MACHINE_CONFIG_LC) -d $(TARGET)/$(LIBKERN_KERNEL_CONFIG) $(LIBKERN_KERNEL_CONFIG); \ ); -.ORDER: $(COMPOBJROOT)/$(LIBKERN_KERNEL_CONFIG)/Makefile - -do_setup_conf: $(COMPOBJROOT)/doconf \ - $(COMPOBJROOT)/$(LIBKERN_KERNEL_CONFIG)/Makefile - -do_all: do_setup_conf +do_all: $(COMPOBJROOT)/$(LIBKERN_KERNEL_CONFIG)/Makefile $(_v)next_source=$(subst conf/,,$(SOURCE)); \ ${MAKE} -C $(COMPOBJROOT)/$(LIBKERN_KERNEL_CONFIG) \ MAKEFILES=$(TARGET)/$(LIBKERN_KERNEL_CONFIG)/Makefile \ diff --git a/libkern/conf/Makefile.i386 b/libkern/conf/Makefile.i386 index 3695a666c..f28e7a459 100644 --- a/libkern/conf/Makefile.i386 +++ b/libkern/conf/Makefile.i386 @@ -2,6 +2,12 @@ #BEGIN Machine dependent Makefile fragment for i386 ###################################################################### +# Files that must go in the __HIB segment: +UNCONFIGURED_HIB_FILES= \ + WKdmDecompress.o + +HIB_FILES=$(filter $(UNCONFIGURED_HIB_FILES),$(OBJS)) + ###################################################################### #END Machine dependent Makefile fragment for i386 ###################################################################### diff --git a/libkern/conf/Makefile.ppc b/libkern/conf/Makefile.ppc deleted file mode 100644 index cd79f229a..000000000 --- a/libkern/conf/Makefile.ppc +++ /dev/null @@ -1,7 +0,0 @@ -###################################################################### -#BEGIN Machine dependent Makefile fragment for ppc -###################################################################### - -###################################################################### -#END Machine dependent Makefile fragment for ppc -###################################################################### diff 
--git a/libkern/conf/Makefile.template b/libkern/conf/Makefile.template index 005aa9ca3..9dad4c816 100644 --- a/libkern/conf/Makefile.template +++ b/libkern/conf/Makefile.template @@ -26,8 +26,8 @@ include $(MakeInc_def) # # CFLAGS # -CFLAGS+= -imacros meta_features.h -DLIBKERN_KERNEL_PRIVATE -DOSALLOCDEBUG=1 \ - -Werror $(CFLAGS_INLINE_CONFIG) +CFLAGS+= -include meta_features.h -DLIBKERN_KERNEL_PRIVATE -DOSALLOCDEBUG=1 \ + $(CFLAGS_INLINE_CONFIG) # zlib is 3rd party source compress.o_CWARNFLAGS_ADD = -Wno-cast-qual @@ -85,13 +85,20 @@ ${OBJS}: ${OBJSDEPS} LDOBJS = $(OBJS) -$(COMPONENT).o: $(LDOBJS) +$(COMPONENT).filelist: $(LDOBJS) + $(_v)if [ $(BUILD_MACHO_OBJ) -eq 1 ]; then \ + for hib_file in ${HIB_FILES}; \ + do \ + $(SEG_HACK) __HIB $${hib_file} -o $${hib_file}__; \ + mv $${hib_file}__ $${hib_file} ; \ + done; \ + fi @echo LDFILELIST $(COMPONENT) $(_v)( for obj in ${LDOBJS}; do \ echo $(TARGET)$(COMP_OBJ_DIR)/$(KERNEL_CONFIG)/$${obj}; \ - done; ) > $(COMPONENT).o + done; ) > $(COMPONENT).filelist -do_all: $(COMPONENT).o +do_all: $(COMPONENT).filelist do_depend: do_all $(_v)${MD} -u Makedep -f -d `ls *.d` diff --git a/libkern/conf/Makefile.x86_64 b/libkern/conf/Makefile.x86_64 index 7b0de925d..a7fda56ca 100644 --- a/libkern/conf/Makefile.x86_64 +++ b/libkern/conf/Makefile.x86_64 @@ -2,6 +2,12 @@ #BEGIN Machine dependent Makefile fragment for x86_64 ###################################################################### +# Files that must go in the __HIB segment: +UNCONFIGURED_HIB_FILES= \ + WKdmDecompress.o + +HIB_FILES=$(filter $(UNCONFIGURED_HIB_FILES),$(OBJS)) + ###################################################################### #END Machine dependent Makefile fragment for x86_64 ###################################################################### diff --git a/libkern/conf/files b/libkern/conf/files index 15f992d67..6f3d432ac 100644 --- a/libkern/conf/files +++ b/libkern/conf/files @@ -4,6 +4,7 @@ OPTIONS/libkerncpp optional libkerncpp OPTIONS/kdebug 
optional kdebug OPTIONS/gprof optional gprof OPTIONS/config_dtrace optional config_dtrace +OPTIONS/hibernation optional hibernation OPTIONS/networking optional networking OPTIONS/crypto optional crypto OPTIONS/zlib optional zlib @@ -72,9 +73,9 @@ libkern/kxld/kxld_demangle.c optional config_kxld libkern/kxld/kxld_dict.c optional config_kxld libkern/kxld/kxld_kext.c optional config_kxld libkern/kxld/kxld_reloc.c optional config_kxld +libkern/kxld/kxld_object.c optional config_kxld libkern/kxld/kxld_sect.c optional config_kxld libkern/kxld/kxld_seg.c optional config_kxld -libkern/kxld/kxld_state.c optional config_kxld libkern/kxld/kxld_sym.c optional config_kxld libkern/kxld/kxld_symtab.c optional config_kxld libkern/kxld/kxld_util.c optional config_kxld diff --git a/libkern/conf/files.i386 b/libkern/conf/files.i386 index 70f37ed51..18edb6e7d 100644 --- a/libkern/conf/files.i386 +++ b/libkern/conf/files.i386 @@ -1 +1,8 @@ libkern/i386/OSAtomic.s standard +libkern/zlib/intel/inffastS.s optional zlib +libkern/zlib/intel/adler32vec.s optional zlib +libkern/crypto/intel/sha1edp.s optional crypto + +# Optimized WKdm compressor +libkern/kxld/i386/WKdmCompress.s optional hibernation +libkern/kxld/i386/WKdmDecompress.s optional hibernation diff --git a/libkern/conf/files.ppc b/libkern/conf/files.ppc deleted file mode 100644 index 0e495aa18..000000000 --- a/libkern/conf/files.ppc +++ /dev/null @@ -1,6 +0,0 @@ -libkern/ppc/OSAtomic.s standard -libkern/ppc/bcmp.s standard -libkern/ppc/memcmp.s standard -libkern/ppc/strlen.s standard -libkern/c++/OSObjectAsm.s optional libkerncpp - diff --git a/libkern/conf/files.x86_64 b/libkern/conf/files.x86_64 index bcf047445..bc32a4846 100644 --- a/libkern/conf/files.x86_64 +++ b/libkern/conf/files.x86_64 @@ -1 +1,8 @@ libkern/x86_64/OSAtomic.s standard +libkern/zlib/intel/inffastS.s optional zlib +libkern/zlib/intel/adler32vec.s optional zlib +libkern/crypto/intel/sha1edp.s optional crypto + +# Optimized WKdm compressor 
+libkern/kxld/i386/WKdmCompress.s optional hibernation +libkern/kxld/i386/WKdmDecompress.s optional hibernation diff --git a/libkern/conf/tools/Makefile b/libkern/conf/tools/Makefile deleted file mode 100644 index 4f9ccd553..000000000 --- a/libkern/conf/tools/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -SETUP_SUBDIRS = doconf - -COMP_SUBDIRS = doconf - -INST_SUBDIRS = \ - - -setup_build_all: - @echo "[ $(SOURCE) ] make setup_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - -do_build_all: - @echo "[ $(SOURCE) ] make do_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - -setup_build_install: - @echo "[ $(SOURCE) ] make setup_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - -do_build_install: - @echo "[ $(SOURCE) ] make do_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - -include $(MakeInc_rule) -include $(MakeInc_dir) - - diff --git a/libkern/conf/tools/doconf/Makefile b/libkern/conf/tools/doconf/Makefile deleted file mode 100644 index aa55a9419..000000000 --- a/libkern/conf/tools/doconf/Makefile +++ /dev/null @@ -1,47 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -COMP_SUBDIRS = \ - -INST_SUBDIRS = \ - - -# -# Who and where -# -BINDIR= -ifneq ($(MACHINE_CONFIG), DEFAULT) -DSTDIR= $(strip $(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)_$(MACHINE_CONFIG)/$(COMPONENT)/) -else -DSTDIR= $(strip $(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)/$(COMPONENT)/) -endif -PROGRAM= $(DSTDIR)doconf - -# -# How to install it -# -IFLAGS= -c -m 555 - -$(PROGRAM): $(DSTDIR)% : $(SOURCE)%.csh - 
@-$(RM) $(RMFLAGS) $(notdir $(PROGRAM)).VERS - @sed -e "s/#PROGRAM.*/#`vers_string $(notdir $(PROGRAM))`/" \ - < $< >$(notdir $(PROGRAM)).VERS; - @install $(IFLAGS) $(notdir $(PROGRAM)).VERS $(PROGRAM); - @-$(RM) $(RMFLAGS) $(notdir $(PROGRAM)).VERS; - -do_build_setup: $(PROGRAM) - -do_build_all: - -setup_build_install: - -do_build_install: - -include $(MakeInc_rule) -include $(MakeInc_dir) diff --git a/libkern/conf/tools/doconf/doconf.csh b/libkern/conf/tools/doconf/doconf.csh deleted file mode 100755 index 6fedb4786..000000000 --- a/libkern/conf/tools/doconf/doconf.csh +++ /dev/null @@ -1,321 +0,0 @@ -#!/bin/csh -f -set path = ($path .) -###################################################################### -# HISTORY -# 1-Dec-87 Michael Young (mwyoung) at Carnegie-Mellon University -# Added "-verbose" switch, so this script produces no output -# in the normal case. -# -# 10-Oct-87 Mike Accetta (mja) at Carnegie-Mellon University -# Flushed cmu_*.h and spin_locks.h -# [ V5.1(XF18) ] -# -# 6-Apr-87 Avadis Tevanian (avie) at Carnegie-Mellon University -# Use MASTER.local and MASTER..local for generation of -# configuration files in addition to MASTER and MASTER.. -# -# 25-Mar-87 Mike Accetta (mja) at Carnegie-Mellon University -# Removed use of obsolete wb_*.h files when building the feature -# list; modified to save the previous configuration file and -# display the differences between it and the new file. -# [ V5.1(F8) ] -# -# 25-Mar-87 Avadis Tevanian (avie) at Carnegie-Mellon University -# If there is no /etc/machine just print out a message telling -# user to use the -cpu option. I thought this script was supposed -# to work even without a /etc/machine, but it doesn't... and this -# is the easiest way out. -# -# 13-Mar-87 Mike Accetta (mja) at Carnegie-Mellon University -# Added "romp_fpa.h" file to extra features for the RT. 
-# [ V5.1(F7) ] -# -# 11-Mar-87 Mike Accetta (mja) at Carnegie-Mellon University -# Updated to maintain the appropriate configuration features file -# in the "machine" directory whenever the corresponding -# configuration is generated. This replaces the old mechanism of -# storing this directly in the file since it was -# machine dependent and also precluded building programs for more -# than one configuration from the same set of sources. -# [ V5.1(F6) ] -# -# 21-Feb-87 Mike Accetta (mja) at Carnegie-Mellon University -# Fixed to require wired-in cpu type names for only those -# machines where the kernel name differs from that provided by -# /etc/machine (i.e. IBMRT => ca and SUN => sun3); updated to -# permit configuration descriptions in both machine indepedent -# and dependent master configuration files so that attributes can -# be grouped accordingly. -# [ V5.1(F3) ] -# -# 17-Jan-87 Mike Accetta (mja) at Carnegie-Mellon University -# Updated to work from any directory at the same level as -# "conf"; generate configuration from both MASTER and -# MASTER. files; added -cpu switch. -# [ V5.1(F1) ] -# -# 18-Aug-86 Mike Accetta (mja) at Carnegie-Mellon University -# Added -make switch and changed meaning of -config; upgraded to -# allow multiple attributes per configuration and to define -# configurations in terms of these attributes within MASTER. -# -# 14-Apr-83 Mike Accetta (mja) at Carnegie-Mellon University -# Added -config switch to only run /etc/config without -# "make depend" and "make". 
-# -###################################################################### - -set prog=$0 -set prog=$prog:t -set nonomatch -set OBJDIR=../BUILD -if ("`/usr/bin/uname`" == "Rhapsody" ) then -set CONFIG_DIR=/usr/local/bin -else -set CONFIG_DIR=/usr/bin -endif - -unset domake -unset doconfig -unset beverbose -unset MACHINE -unset profile - -while ($#argv >= 1) - if ("$argv[1]" =~ -*) then - switch ("$argv[1]") - case "-c": - case "-config": - set doconfig - breaksw - case "-m": - case "-make": - set domake - breaksw - case "-cpu": - if ($#argv < 2) then - echo "${prog}: missing argument to ${argv[1]}" - exit 1 - endif - set MACHINE="$argv[2]" - shift - breaksw - case "-d": - if ($#argv < 2) then - echo "${prog}: missing argument to ${argv[1]}" - exit 1 - endif - set OBJDIR="$argv[2]" - shift - breaksw - case "-verbose": - set beverbose - breaksw - case "-p": - case "-profile": - set profile - breaksw - default: - echo "${prog}: ${argv[1]}: unknown switch" - exit 1 - breaksw - endsw - shift - else - break - endif -end - -if ($#argv == 0) set argv=(GENERIC) - -if (! $?MACHINE) then - if (-d /NextApps) then - set MACHINE=`hostinfo | awk '/MC680x0/ { printf("m68k") } /MC880x0/ { printf("m88k") }'` - endif -endif - -if (! $?MACHINE) then - if (-f /etc/machine) then - set MACHINE="`/etc/machine`" - else - echo "${prog}: no /etc/machine, specify machine type with -cpu" - echo "${prog}: e.g. 
${prog} -cpu VAX CONFIGURATION" - exit 1 - endif -endif - -set FEATURES_EXTRA= - -switch ("$MACHINE") - case IBMRT: - set cpu=ca - set ID=RT - set FEATURES_EXTRA="romp_dualcall.h romp_fpa.h" - breaksw - case SUN: - set cpu=sun3 - set ID=SUN3 - breaksw - default: - set cpu=`echo $MACHINE | tr A-Z a-z` - set ID=`echo $MACHINE | tr a-z A-Z` - breaksw -endsw -set FEATURES=../h/features.h -set FEATURES_H=(cs_*.h mach_*.h net_*.h\ - cputypes.h cpus.h vice.h\ - $FEATURES_EXTRA) -set MASTER_DIR=../conf -set MASTER = ${MASTER_DIR}/MASTER -set MASTER_CPU=${MASTER}.${cpu} - -set MASTER_LOCAL = ${MASTER}.local -set MASTER_CPU_LOCAL = ${MASTER_CPU}.local -if (! -f $MASTER_LOCAL) set MASTER_LOCAL = "" -if (! -f $MASTER_CPU_LOCAL) set MASTER_CPU_LOCAL = "" - -if (! -d $OBJDIR) then - if ($?beverbose) then - echo "[ creating $OBJDIR ]" - endif - mkdir -p $OBJDIR -endif - -foreach SYS ($argv) - set SYSID=${SYS}_${ID} - set SYSCONF=$OBJDIR/config.$SYSID - set BLDDIR=$OBJDIR - if ($?beverbose) then - echo "[ generating $SYSID from $MASTER_DIR/MASTER{,.$cpu}{,.local} ]" - endif - echo +$SYS \ - | \ - cat $MASTER $MASTER_LOCAL $MASTER_CPU $MASTER_CPU_LOCAL - \ - $MASTER $MASTER_LOCAL $MASTER_CPU $MASTER_CPU_LOCAL \ - | \ - sed -n \ - -e "/^+/{" \ - -e "s;[-+];#&;gp" \ - -e 't loop' \ - -e ': loop' \ - -e 'n' \ - -e '/^#/b loop' \ - -e '/^$/b loop' \ - -e 's;^\([^#]*\).*#[ ]*<\(.*\)>[ ]*$;\2#\1;' \ - -e 't not' \ - -e 's;\([^#]*\).*;#\1;' \ - -e 't not' \ - -e ': not' \ - -e 's;[ ]*$;;' \ - -e 's;^\!\(.*\);\1#\!;' \ - -e 'p' \ - -e 't loop' \ - -e 'b loop' \ - -e '}' \ - -e "/^[^#]/d" \ - -e 's; ; ;g' \ - -e "s;^# *\([^ ]*\)[ ]*=[ ]*\[\(.*\)\].*;\1#\2;p" \ - | \ - awk '-F#' '\ -part == 0 && $1 != "" {\ - m[$1]=m[$1] " " $2;\ - next;\ -}\ -part == 0 && $1 == "" {\ - for (i=NF;i>1;i--){\ - s=substr($i,2);\ - c[++na]=substr($i,1,1);\ - a[na]=s;\ - }\ - while (na > 0){\ - s=a[na];\ - d=c[na--];\ - if (m[s] == "") {\ - f[s]=d;\ - } else {\ - nx=split(m[s],x," ");\ - for (j=nx;j>0;j--) {\ - 
z=x[j];\ - a[++na]=z;\ - c[na]=d;\ - }\ - }\ - }\ - part=1;\ - next;\ -}\ -part != 0 {\ - if ($1 != "") {\ - n=split($1,x,",");\ - ok=0;\ - for (i=1;i<=n;i++) {\ - if (f[x[i]] == "+") {\ - ok=1;\ - }\ - }\ - if (NF > 2 && ok == 0 || NF <= 2 && ok != 0) {\ - print $2; \ - }\ - } else { \ - print $2; \ - }\ -}\ -' >$SYSCONF.new - if (-z $SYSCONF.new) then - echo "${prog}: ${$SYSID}: no such configuration in $MASTER_DIR/MASTER{,.$cpu}" - rm -f $SYSCONF.new - endif - if (! -d $BLDDIR) then - if ($?beverbose) then - echo "[ creating $BLDDIR ]" - endif - mkdir -p $BLDDIR - endif -# -# These paths are used by config. -# -# "builddir" is the name of the directory where kernel binaries -# are put. It is a single path element, never absolute, and is -# always relative to "objectdir". "builddir" is used by config -# solely to determine where to put files created by "config" (e.g. -# the created Makefile and *.h's.) -# -# "objectdir" is the name of the directory which will hold "builddir". -# It is a path; if relative, it is relative to the current directory -# where config is run. It's sole use is to be prepended to "builddir" -# to indicate where config-created files are to be placed (see above). -# -# "sourcedir" is the location of the sources used to build the kernel. -# It is a path; if relative, it is relative to the directory specified -# by the concatenation of "objectdir" and "builddir" (i.e. where the -# kernel binaries are put). 
-# - echo 'builddir "."' >> $SYSCONF.new - set OBJRELDIR=`$RELPATH $OBJROOT $OBJDIR` - echo 'objectdir "'$OBJROOT'/'$OBJRELDIR'"' >> $SYSCONF.new - set SRCDIR=`dirname $SOURCE` - echo 'sourcedir "'$SRCROOT'"' >> $SYSCONF.new - if (-f $SYSCONF) then - diff $SYSCONF $SYSCONF.new - rm -f $SYSCONF.old - mv $SYSCONF $SYSCONF.old - endif - rm -f $SYSCONF - mv $SYSCONF.new $SYSCONF - if ($?doconfig) then - if ($?beverbose) then - echo "[ configuring $SYSID ]" - endif - if ($?profile) then - $CONFIG_DIR/config -c $MASTER_DIR -p $SYSCONF - else - $CONFIG_DIR/config -c $MASTER_DIR $SYSCONF - endif - endif - if ($?domake) then - if ($?beverbose) then - echo "[ making $SYSID ]" - endif - (cd $BLDDIR; make) - endif -end diff --git a/libkern/crypto/intel/sha1edp.h b/libkern/crypto/intel/sha1edp.h new file mode 100644 index 000000000..ba90122fd --- /dev/null +++ b/libkern/crypto/intel/sha1edp.h @@ -0,0 +1,51 @@ +#if !defined sha1edp_h +#define sha1edp_h + + +/* This file is included in sha1edpLittleEndian.s and sha1edpBigEndian.s to + define the symbols below for use in assembly code. + + It is also included in sha1_locl.h and compiled in C to test that the + hard-coded values here match the values used in C. CC_SHA1_BLOCK_BYTES + is defined in another header, so an error will be generated if its + definition here conflicts. The other symbols are tested below, with + the CheckAssertion definition. +*/ + + +// Number of bytes in a SHA-1 block. +#define CC_SHA1_BLOCK_BYTES 64 + +// Offset of h0 to h4 members in SHA-1 context structure. +#define Contexth0 (0*4) +#define Contexth1 (1*4) +#define Contexth2 (2*4) +#define Contexth3 (3*4) +#define Contexth4 (4*4) + + +#if !defined __ASSEMBLER__ + + #include // Get offsetof macro. + + /* Declare CheckAssertion so that if any of the declarations below + differ from it, the compiler will report an error. 
+ */ + extern char CheckAssertion[1]; + + /* Ensure that Contexth0 through Contexth4 are the byte offsets of the + h0 through h4 members of the SHA-1 context structure. + */ + extern char CheckAssertion[Contexth0 == offsetof(SHA_CTX, h0)]; + extern char CheckAssertion[Contexth1 == offsetof(SHA_CTX, h1)]; + extern char CheckAssertion[Contexth2 == offsetof(SHA_CTX, h2)]; + extern char CheckAssertion[Contexth3 == offsetof(SHA_CTX, h3)]; + extern char CheckAssertion[Contexth4 == offsetof(SHA_CTX, h4)]; + /* If these assertions fail, change the definitions of Contexth0 to + Contexth4 to match the offsets of the members. + */ + +#endif // !defined __ASSEMBLER__ + + +#endif // !defined sha1edp_h diff --git a/libkern/crypto/intel/sha1edp.s b/libkern/crypto/intel/sha1edp.s new file mode 100644 index 000000000..80da81a62 --- /dev/null +++ b/libkern/crypto/intel/sha1edp.s @@ -0,0 +1,1481 @@ +/* sha1edp.s : this file provides optimized x86_64 and i386 implementation of the sha1 function + CoreOS - vector and numerics group + cclee 6-21-10 + + The implementation is based on the principle described in an Intel online article + "Improving the Performance of the Secure Hash Algorithm (SHA-1)" + http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/ + + + Update HASH[] by processing a one 64-byte block in MESSAGE[] can be represented by the following C function + +void SHA1( int HASH[], int MESSAGE[] ) +{ + int A[81], B[81], C[81], D[81], E[81]; + int W[80]; + + int i, FN; + + A[0] = HASH[0]; + B[0] = HASH[1]; + C[0] = HASH[2]; + D[0] = HASH[3]; + E[0] = HASH[4]; + + for ( i=0; i<80; ++i ) + { + if ( i < 16 ) + W[i] = BIG_ENDIAN_LOAD( MESSAGE[i] ); + else + W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 ); + + FN = F( i, B[i], C[i], D[i] ); + + A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + W[i] + K(i); + B[i+1] = A[i]; + C[i+1] = ROTATE_LEFT( B[i], 30 ); + D[i+1] = C[i]; + E[i+1] = D[i]; + } + + HASH[0] += A[80]; + HASH[1] += 
B[80]; + HASH[2] += C[80]; + HASH[3] += D[80]; + HASH[4] += E[80]; +} + + For i=0:15, W[i] is simply big-endian loading of MESSAGE[i]. For i=16:79, W[i] is updated according to W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 ); + + The approach (by Dean Gaudet) can be used to vectorize the computation of W[i] for i=16:79, + + 1. done on 4 consecutive W[i] values in a single XMM register + W[i ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1 + W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1 + W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1 + W[i+3] = ( 0 ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1 + + 2. this additional calculation unfortunately requires many additional operations + W[i+3] ^= W[i] rol 1 + + 3. once we have 4 W[i] values in XMM we can also add four K values with one instruction + W[i:i+3] += {K,K,K,K} + + Let W0 = {W[i] W[i+1] W[i+2] W[i+3]} be the current W-vector to be computed, W4 = {W[i-4] W[i-3] W[i-2] W[i-1]} be the previous vector, and so on + The Dean Gaudet approach can be expressed as + + 1. W0 = rotate_left(left_shift(W4,32) ^ W8 ^ left_shift(concatenate(W16,W12),64) ^ W16,1); + 2. W[i+3] ^= W[i] rol 1 + 3. W0 += {K,K,K,K} + + For i>=32, the Intel online article suggests that (using a basic identity (X rol 1) rol 1 = X rol 2) the update equation is equivalent to + + 1. W0 = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2); + + Note: + 1. In total, we need 8 16-byte registers or memory for W0,W4,...,W28. W0 and W32 can be the same register or memory. + 2. The registers are used in a circular buffering mode. For example, we start with W28,W24,...,W0 (with W0 indicating the most recent 16-byte) + i=0, W28,W24,...,W0 + i=4, W24,W20,...,W28 + i=8, W20,W16,...,W24 + . + . + and so forth. + 3. 2 ssse3 instructions are used in the Intel article, pshufb and palignr. + a. pshufb is used to simplify the BIG_ENDIAN_LOAD operation + b. palignr is used to simplify the computation of left_shift(concatenate(W12,W8),64) + 4.
we probe __cpu_capabilities to detect ssse3 support and dispatch code with ssse3 support when available. + If ssse3 is not supported, a suboptimal code (pshufb and palignr workaround) is dispatched. + +*/ + +/* the code can be compiled into single block (64 bytes) per call mode by setting Multiple_Blocks to 0 */ +#define Multiple_Blocks 1 + +#if defined (__x86_64__) || defined(__i386__) // x86_64 or i386 architectures + +#if defined(__x86_64__) + + // set up for x86_64 +#define stack_size (8+16*11+16*4) // 8 (alignment) + x0-x10 + 4 128-bits for intermediate WK(t) storage +#define sp %rsp // unifying architectural stack pointer representation +#define ctx %rdi // 1st input argument, will move to HASH_PTR (%r9) +#define buf %rsi // 2nd input argument, will move to BUFFER_PTR (%r10) +#define cnt %r11 // will copy from the 3rd input argument (%rdx) +#define K_BASE %r8 // an aligned pointer to point to shufb reference numbers of table of K values +#define HASH_PTR %r9 // pointer to Hash values (A,B,C,D,E) +#define BUFFER_PTR %r10 // pointer to input blocks + +#else // !__x86_64__ + + // set up for i386 +#define stack_size (12+16*2+16*11+16*4) // 12-bytes (alignment) + extra 2 + 3 (W24/W28/XMM_SHUFB_BSWAP) + 8 (xmm0-xmm7) + 4 (WK(t)) +#define sp %esp // unifying architectural stack pointer representation +#define HASH_PTR stack_size+16+4(sp) // use 1st input argument from caller function, 16 for (esi/edi/ebx/ebp) +#define BUFFER_PTR stack_size+16+8(sp) // use 2nd input argument from caller function +#define cnt stack_size+16+12(sp) // use 3rd input argument from caller function +#define K_BASE stack_size-4(sp) // use for K_BASE + +#endif // __x86_64__ + +// symbolizing registers or stack memory with algorithmic variables W0,W4,...,W28 + W_TMP, W_TMP2, and XMM_SHUFB_BSWAP for code with ssse3 support + +#define W_TMP %xmm0 +#define W_TMP2 %xmm1 +#define W0 %xmm2 +#define W4 %xmm3 +#define W8 %xmm4 +#define W12 %xmm5 +#define W16 %xmm6 +#define W20 %xmm7 +#if
defined(__x86_64__) +#define W24 %xmm8 +#define W28 %xmm9 +#define XMM_SHUFB_BSWAP %xmm10 // used only when ssse3 is supported +#else // defined (__i386__) +#define W24 12*16(sp) +#define W28 13*16(sp) +#define XMM_SHUFB_BSWAP 14*16(sp) // used only when ssse3 is supported +#endif + +#define xmov movaps // aligned 16-byte move +#define xmovu movups // unaligned 16-byte move + +// intermediate hash variables +#define A %ecx +#define B %esi +#define C %edi +#define D %ebp +#define E %edx + +// temp variables +#define T1 %eax +#define T2 %ebx + +#define WK(t) (t&15)*4(sp) + + // int F1(int B, int C, int D) { return (D ^ (B & (C ^ D))); } + // result in T1 + .macro F1 + mov $1, T1 + xor $2, T1 + and $0, T1 + xor $2, T1 + .endm + + // int F2(int B, int C, int D) { return (D ^ B ^ C); } + // result in T1 + .macro F2 + mov $2, T1 + xor $1, T1 + xor $0, T1 + .endm + + // int F3(int B, int C, int D) { return (B & C) | (D & (B ^ C)); } + // result in T1 + .macro F3 + mov $1, T1 + mov $0, T2 + or $0, T1 + and $1, T2 + and $2, T1 + or T2, T1 + .endm + + // for i=60:79, F4 is identical to F2 + #define F4 F2 + + + /* + i=0:15, W[i] = BIG_ENDIAN_LOAD(MESSAGE[i]); + + with ssse3 support, this is achieved via + for (i=0;i<16;i+=4) { + 1. W_TMP = new 16 bytes from MESSAGE[] + 2. W_TMP = pshufb(W_TMP, XMM_SHUFB_BSWAP); save to W circular buffer for updating W + 3. W_TMP += {K,K,K,K}; + 4.
save quadruple W[i]+K[i] = W_TMP in the stack memory; + } + + each step is represented in one of the following 4 macro definitions + + */ + + .macro W_PRECALC_00_15_0_ssse3 // input argument $0 : 0/4/8/12 +#if defined (__x86_64__) // BUFFER_PTR is already an address register in x86_64 + xmovu $0*4(BUFFER_PTR), W_TMP // read 16-bytes into W_TMP, BUFFER_PTR possibly not 16-byte aligned +#else // BUFFER_PTR is from the argument set up in the caller + mov BUFFER_PTR, T1 // T1 = BUFFER_PTR + xmovu $0*4(T1), W_TMP // read 16-bytes into W_TMP, BUFFER_PTR possibly not 16-byte aligned +#endif + .endm + + .macro W_PRECALC_00_15_1_ssse3 // input argument $0 : current 16-bytes in the circular buffer, one of W0,W4,W8,...,W28 + pshufb XMM_SHUFB_BSWAP, W_TMP // convert W_TMP from little-endian into big-endian + xmov W_TMP, $0 // save W_TMP in the circular buffer + .endm + + .macro W_PRECALC_00_15_2 // K_BASE points to the current K quadruple. +#if defined (__x86_64__) // K_BASE is already an address register in x86_64 + paddd (K_BASE), W_TMP // W_TMP += {K,K,K,K}; +#else // K_BASE is previously set up in the stack memory + mov K_BASE, T1 // T1 = K_BASE + paddd (T1), W_TMP // W_TMP += {K,K,K,K}; +#endif + .endm + + .macro W_PRECALC_00_15_3 + xmov W_TMP, WK($0&~3) // save quadruple W[i]+K in the stack memory, which would be used later for updating the hashes A/B/C/D/E + .endm + + /* + without ssse3 support, steps 1 and 2 need to be modified + 1. sequentially load 4 words into T1, bswap T1, and save it to 4-bytes in the stack space + 2. 
load the 16-bytes from the aligned stack memory into W_TMP + */ + + .macro W_PRECALC_00_15_0_nossse3 // input argument $0 : 0/4/8/12 + +#if defined (__x86_64__) + #define BUFFERP BUFFER_PTR +#else + mov BUFFER_PTR, T2 // copy BUFFER_PTR (from caller 2nd argument) to T2 + #define BUFFERP T2 +#endif + + // load 1st word, bswap it, save it to stack + mov $0*4(BUFFERP), T1 + bswap T1 + mov T1, 14*16(sp) + + // load 2nd word, bswap it, save it to stack + mov 4+$0*4(BUFFERP), T1 + bswap T1 + mov T1, 4+14*16(sp) + + // load 3rd word, bswap it, save it to stack + mov 8+$0*4(BUFFERP), T1 + bswap T1 + mov T1, 8+14*16(sp) + + // load 4th word, bswap it, save it to stack + mov 12+$0*4(BUFFERP), T1 + bswap T1 + mov T1, 12+14*16(sp) + .endm + + .macro W_PRECALC_00_15_1_nossse3 // input argument $0 : current 16-bytes in the circular buffer, one of W0,W4,W8,...,W28 + xmov 14*16(sp), W_TMP // load the bswapped 16-bytes from the aligned stack memory + xmov W_TMP, $0 // save W = W_TMP in the circular buffer + .endm + + // rounds 16-31 compute W[0] using the vectorization approach by Dean Gaudet + /* + W[i ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1 + W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1 + W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1 + W[i+3] = ( 0 ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1 + + W[i+3] ^= W[i] rol 1; // this W[i] is already rol by 1, if we are taking from the initial W before rol 1, we should rol this by 2 + + The operation (updating W and W+K) is scheduled and divided into 4 steps + + 0. W_tmp = W3; W = W14 ^ W8 + 1. W = W3 ^ W8 ^ W14 ^ W16; W_TMP = W; W_TMP2 = (W[i] 0 0 0); + 2. W_TMP = (W3 ^ W8 ^ W14 ^ W16) rol 1; split (W[i] 0 0 0) rol 2 in W_TMP2 and W + 3.
W = W_TMP = W_TMP ^ W_TMP2 ^ W = (W3 ^ W8 ^ W14 ^ W16) rol 1 ^ (W[i] 0 0 0) rol 2; WK = W_TMP+K; + + */ + + .macro W_PRECALC_16_31_0_ssse3 // input arguments : W16,W12,W8,W4,W + xmov $1, $4 // W = W12 + palignr $$8, $0, $4 // W = W14 + xmov $3, W_TMP // W_TMP = W4 + psrldq $$4, W_TMP // W_TMP = W3 + pxor $2, $4 // W = W8 ^ W14 + .endm + + .macro W_PRECALC_16_31_1 // input arguments : W16,W + pxor $0, W_TMP // W_TMP = W3 ^ W16 + pxor W_TMP, $1 // W = W3 ^ W16 ^ W8 ^ W14 + xmov $1, W_TMP2 // W_TMP2 = W3 ^ W16 ^ W8 ^ W14 + xmov $1, W_TMP // W_TMP = W3 ^ W16 ^ W8 ^ W14 + pslldq $$12, W_TMP2 // W_TMP2 = (W[i] 0 0 0) + .endm + + .macro W_PRECALC_16_31_2 // input argument : W + psrld $$31, $0 // (W3 ^ W16 ^ W8 ^ W14)>>31 + pslld $$1, W_TMP // (W3 ^ W16 ^ W8 ^ W14)<<1 + por $0, W_TMP // W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1 + xmov W_TMP2, $0 // copy W[i] at location of W[i+3] + psrld $$30, W_TMP2 // W_TMP2 = W[i] lower 2 bits after rol 2 + pslld $$2, $0 // W = W[i] higher 30 bits after rol 2 + .endm + + .macro W_PRECALC_16_31_3 // input arguments: W, i, K_XMM +#if defined (__i386__) + mov K_BASE, T1 // K_BASE is stored in the stack memory for i386 +#endif + pxor $0, W_TMP + pxor W_TMP2, W_TMP // W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1 ^ (W[i] 0 0 0) rol 2 + xmov W_TMP, $0 // save W = W_TMP in the W circular buffer +#if defined (__x86_64__) + paddd $2(K_BASE), W_TMP // W+K +#else + paddd $2(T1), W_TMP // W+K +#endif + xmov W_TMP, WK($1&~3) // save WK = W+K for later update of the hashes A/B/C/D/E + .endm + + // the following is a variant of W_PRECALC_16_31_0_ssse3 to be used for systems without ssse3, palignr is replaced with 4 instructions + + .macro W_PRECALC_16_31_0_nossse3 // input arguments : W16,W12,W8,W4,W + xmov $1, $4 // W = W12 = (w9 w10 w11 w12) + + // the following is a workaround for palignr + xmov $0, W_TMP // W16 = (w13 w14 w15 w16) + pslldq $$8, $4 // shift left to make (w11 w12 0 0) + psrldq $$8, W_TMP // shift right to make (0 0 w13 w14) + por W_TMP, $4 // W =
W14 = (w11 w12 w13 w14) + + xmov $3, W_TMP // W_TMP = W4 = (w1 w2 w3 w4) + psrldq $$4, W_TMP // W_TMP = W3 = (0 w1 w2 w3) + pxor $2, $4 // W = W8 ^ W14 + .endm + + /* rounds 32-79 compute W and W+K using the vectorization approach from the Intel article + + W = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2); + + where left_shift(concatenate(W8,W4),64) is equivalent to W6. Note also that W32 and W use the same register. + + + 0. W_tmp = W6; W = W28 ^ W32; + 1. W = W_tmp = W6 ^ W16 ^ W28 ^ W32; + 2. W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2; + 3. W = W_tmp; WK = W_tmp + K; + + */ + + + .macro W_PRECALC_32_79_0_ssse3 // input arguments : W28,W8,W4,W + xmov $2, W_TMP // (w1 w2 w3 w4) + pxor $0, $3 // W = W28 ^ W32; + palignr $$8, $1, W_TMP // W_tmp = (w3 w4 w5 w6) = W6; + .endm + + // the following is a variant and will be used for systems without ssse3 support + .macro W_PRECALC_32_79_0_nossse3 // input arguments : W28,W8,W4,W + xmov $2, W_TMP // (w1 w2 w3 w4) + xmov $1, W_TMP2 // (w5 w6 w7 w8) + pxor $0, $3 // W = W28 ^ W32 + pslldq $$8, W_TMP // (w3 w4 0 0) + psrldq $$8, W_TMP2 // (0 0 w5 w6) + por W_TMP2, W_TMP // W_tmp = (w3 w4 w5 w6) = W6 + .endm + + // this is a variant of W_PRECALC_32_79_0_ssse3 for i386 (as W24/W28 are stored in memory, not in registers) + .macro W_PRECALC_32_79_0_i386_ssse3 // input arguments : W28,W8,W4,W + xmov $3, W_TMP // W32 + pxor $0, W_TMP // W28 ^ W32 + xmov W_TMP, $3 // W = W28 ^ W32; + xmov $2, W_TMP // W4 + palignr $$8, $1, W_TMP // W_tmp = (w3 w4 w5 w6) = W6; + .endm + + // this is a variant of W_PRECALC_32_79_0_nossse3 for i386 (as W24/W28 are stored in memory, not in registers) + .macro W_PRECALC_32_79_0_i386_nossse3 // input arguments : W28,W8,W4,W + xmov $3, W_TMP // W32 + pxor $0, W_TMP // W28 ^ W32 + xmov W_TMP, $3 // W = W28 ^ W32 + xmov $2, W_TMP // W4 = (w1 w2 w3 w4) + xmov $1, W_TMP2 // W8 = (w5 w6 w7 w8) + pslldq $$8, W_TMP // (w3 w4 0 0) + psrldq $$8, W_TMP2 // (0 0 w5 w6) + por W_TMP2, W_TMP // W_tmp
= (w3 w4 w5 w6) = W6 + .endm + + .macro W_PRECALC_32_79_1 // input arguments : W16,W + pxor $0, W_TMP // W_tmp = W6 ^ W16 + pxor $1, W_TMP // W_tmp = W6 ^ W16 ^ W28 ^ W32 + xmov W_TMP, $1 // W = W_tmp = W6 ^ W16 ^ W28 ^ W32 + .endm + + .macro W_PRECALC_32_79_2 // input argument : W + psrld $$30, $0 // W >> 30 + pslld $$2, W_TMP // W << 2 + por $0, W_TMP // W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2 + .endm + + // this is a variant of W_PRECALC_32_79_2 for i386 (as W24/W28 are stored in memory, not in registers) + // this should be used when the input is either W24 or W28 on i386 architecture + .macro W_PRECALC_32_79_2_i386 // input argument : W + xmov $0, W_TMP2 // W + psrld $$30, W_TMP2 // W >> 30 + xmov W_TMP2, $0 // save (W >> 30) at W + pslld $$2, W_TMP // W_tmp << 2 + por $0, W_TMP // W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2 + .endm + + .macro W_PRECALC_32_79_3 // input argument W, i, K_XMM +#if defined (__x86_64__) + xmov W_TMP, $0 // W = (W6 ^ W16 ^ W28 ^ W32) rol 2 + paddd $2(K_BASE), W_TMP // W + K + xmov W_TMP, WK($1&~3) // write W+K +#else + mov K_BASE, T1 // T1 = K_BASE (which is in the caller argument) + xmov W_TMP, $0 // W = (W6 ^ W16 ^ W28 ^ W32) rol 2 + paddd $2(T1), W_TMP // W_tmp = W + K + xmov W_TMP, WK($1&~3) // write WK +#endif + .endm + + + /* The hash update operation is completed by the following statements. + + A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + WK(i); + B[i+1] = A[i]; + C[i+1] = ROTATE_LEFT( B[i], 30 ); + D[i+1] = C[i]; + E[i+1] = D[i]; + + Suppose we start with A0,B0,C0,D0,E0. The 1st iteration can be expressed as follows: + + A1 = FN + E0 + rol(A0,5) + WK; + B1 = A0; + C1 = rol(B0, 30); + D1 = C0; + E1 = D0; + + to avoid excessive memory movement between registers, + 1. A1 = FN + E0 + rol(A0,5) + WK; can be temporarily saved in E0, + 2. C1 = rol(B0,30) can be temporarily saved in B0. + + Therefore, ignoring the time index, the update operation is equivalent to + 1. E = FN(B,C,D) + E + rol(A,5) + WK(i) + 2. B = rol(B,30) + 3. 
the hashes are now stored in the order of E,A,B,C,D + + + To pack 2 hash update operations in 1 iteration, starting with A,B,C,D,E + 1. E = FN(B,C,D) + E + rol(A,5) + WK(i) + 2. B = rol(B,30) + // now the hashes are in the order of E,A,B,C,D + 3. D = FN(A,B,C) + D + rol(E,5) + WK(i+1) + 4. A = rol(A,30) + // now the hashes are in the order of D,E,A,B,C + + These operations are distributed into the following 2 macro definitions RR0 and RR1. + + */ + + .macro RR0 // input arguments : FN, A, B, C, D, E, i + $0 $2, $3, $4 // T1 = FN(B,C,D) + add WK($6), $5 // E + WK(i) + rol $$30, $2 // B = rol(B,30) + mov $1, T2 // T2 = A + add WK($6+1), $4 // D + WK(i+1) + rol $$5, T2 // rol(A,5) + add T1, $5 // E = FN(B,C,D) + E + WK(i) + .endm + + .macro RR1 + add $5, T2 // T2 = FN(B,C,D) + E + rol(A,5) + WK(i) + mov T2, $5 // E = FN(B,C,D) + E + rol(A,5) + WK(i) + rol $$5, T2 // rol(E,5) + add T2, $4 // D + WK(i+1) + rol(E,5) + $0 $1, $2, $3 // FN(A,B,C) + add T1, $4 // D = FN(A,B,C) + D + rol(E,5) + WK(i+1) + rol $$30, $1 // A = rol(A,30) + .endm + + + + /* + + The following macro definitions are used to expand code for the per-block sha1 operation. + + INITIAL_W_PRECALC_ssse3 : BIG_ENDIAN_LOAD(64 bytes block) into W (i=0:15) and store W+K into the stack memory + INTERNAL_ssse3 : updating W (16:79) and update the digests A/B/C/D/E (i=0:63, based on W+K stored in the stack memory) + ENDING : finishing up the update of the digests A/B/C/D/E (i=64:79) + + For multiple-block sha1 operation (Multiple_Blocks = 1), INITIAL_W_PRECALC_ssse3 and ENDING are combined + into 1 macro definition for software pipelining.
+ + SOFTWARE_PIPELINING_ssse3 : BIG_ENDIAN_LOAD(64 bytes block) into W (i=0:15) and store W+K into the stack, and finishing up update the digests A/B/C/D/E (i=64:79) + + assume cnt (the number of blocks) >= 1, the main code body should look like + + INITIAL_W_PRECALC_ssse3 // W = big_endian_load and pre-compute W+K (i=0:15) + do { + INTERNAL_ssse3 // update W(i=16:79), and update hash digests A/B/C/D/E (i=0:63) + cnt--; + if (cnt==0) break; + BUFFER_PTR += 64; + SOFTWARE_PIPELINING_ssse3; // update hash digests A/B/C/D/E (i=64:79) + W = big_endian_load and pre-compute W+K (i=0:15) + } + ENDING // update hash digests A/B/C/D/E (i=64:79) + + */ + + #define W_PRECALC_00_15_0 W_PRECALC_00_15_0_ssse3 + #define W_PRECALC_00_15_1 W_PRECALC_00_15_1_ssse3 + #define W_PRECALC_16_31_0 W_PRECALC_16_31_0_ssse3 + #define W_PRECALC_32_79_0 W_PRECALC_32_79_0_ssse3 + #define W_PRECALC_32_79_0_i386 W_PRECALC_32_79_0_i386_ssse3 + + + .macro INITIAL_W_PRECALC_ssse3 // BIG_ENDIAN_LOAD(64 bytes block) into W (i=0:15) and store W+K into the stack memory + + // i=0 : W28,W24,W20,W16,W12,W8,W4,W0 + W_PRECALC_00_15_0 0 // W_TMP = (BUFFER_PTR) + W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP + W_PRECALC_00_15_2 // W_TMP = W0 + K + W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K + + // i=4 : W24,W20,W16,W12,W8,W4,W0,W28 + W_PRECALC_00_15_0 4 // W_TMP = 16(BUFFER_PTR) + W_PRECALC_00_15_1 W28 // convert W_TMP to big-endian, and save W28 = W_TMP + W_PRECALC_00_15_2 // W_TMP = W28 + K + W_PRECALC_00_15_3 7 // 16(sp) = W_TMP = W28 + K + + // i=8 : W20,W16,W12,W8,W4,W0,W28,W24 + W_PRECALC_00_15_0 8 // W_TMP = 32(BUFFER_PTR) + W_PRECALC_00_15_1 W24 // convert W_TMP to big-endian, and save W24 = W_TMP + W_PRECALC_00_15_2 // W_TMP = W24 + K + W_PRECALC_00_15_3 11 // 32(sp) = W_TMP = W24 + K + + // i=12 : W16,W12,W8,W4,W0,W28,W24,W20 + W_PRECALC_00_15_0 12 // W_TMP = 48(BUFFER_PTR) + W_PRECALC_00_15_1 W20 // convert W_TMP to big-endian, and save W20 = W_TMP + W_PRECALC_00_15_2 // 
W_TMP = W20 + K + W_PRECALC_00_15_3 15 // 48(sp) = W_TMP = W20 + K + + .endm + + + .macro INTERNAL_ssse3 // updating W (16:79) and update the digests A/B/C/D/E (i=0:63, based on W+K stored in the stack memory) + + // i=16 : W12,W8,W4,W0,W28,W24,W20,W16 + W_PRECALC_16_31_0 W0,W28,W24,W20,W16 + RR0 F1,A,B,C,D,E,0 + W_PRECALC_16_31_1 W0,W16 + RR1 F1,A,B,C,D,E,0 + W_PRECALC_16_31_2 W16 + RR0 F1,D,E,A,B,C,2 + W_PRECALC_16_31_3 W16, 2, 0 + RR1 F1,D,E,A,B,C,2 + + // i=20 : W8,W4,W0,W28,W24,W20,W16,W12 + W_PRECALC_16_31_0 W28,W24,W20,W16,W12 + RR0 F1,B,C,D,E,A,4 + W_PRECALC_16_31_1 W28,W12 + RR1 F1,B,C,D,E,A,4 + W_PRECALC_16_31_2 W12 + RR0 F1,E,A,B,C,D,6 + W_PRECALC_16_31_3 W12, 6, 16 + RR1 F1,E,A,B,C,D,6 + + // i=24 : W4,W0,W28,W24,W20,W16,W12,W8 + W_PRECALC_16_31_0 W24,W20,W16,W12,W8 + RR0 F1,C,D,E,A,B,8 + W_PRECALC_16_31_1 W24,W8 + RR1 F1,C,D,E,A,B,8 + W_PRECALC_16_31_2 W8 + RR0 F1,A,B,C,D,E,10 + W_PRECALC_16_31_3 W8,10,16 + RR1 F1,A,B,C,D,E,10 + + // i=28 : W0,W28,W24,W20,W16,W12,W8,W4 + W_PRECALC_16_31_0 W20,W16,W12,W8,W4 + RR0 F1,D,E,A,B,C,12 + W_PRECALC_16_31_1 W20,W4 + RR1 F1,D,E,A,B,C,12 + W_PRECALC_16_31_2 W4 + RR0 F1,B,C,D,E,A,14 + W_PRECALC_16_31_3 W4,14,16 + RR1 F1,B,C,D,E,A,14 + + // i=32 : W28,W24,W20,W16,W12,W8,W4,W0 + W_PRECALC_32_79_0 W28,W8,W4,W0 + RR0 F1,E,A,B,C,D,16 + W_PRECALC_32_79_1 W16,W0 + RR1 F1,E,A,B,C,D,16 + W_PRECALC_32_79_2 W0 + RR0 F1,C,D,E,A,B,18 + W_PRECALC_32_79_3 W0,18,16 + RR1 F1,C,D,E,A,B,18 + + // starting using F2 + + // i=36 : W24,W20,W16,W12,W8,W4,W0,W28 +#if defined (__x86_64__) + W_PRECALC_32_79_0 W24,W4,W0,W28 +#else + W_PRECALC_32_79_0_i386 W24,W4,W0,W28 +#endif + RR0 F2,A,B,C,D,E,20 + W_PRECALC_32_79_1 W12,W28 + RR1 F2,A,B,C,D,E,20 +#if defined (__x86_64__) + W_PRECALC_32_79_2 W28 +#else + W_PRECALC_32_79_2_i386 W28 +#endif + RR0 F2,D,E,A,B,C,22 + W_PRECALC_32_79_3 W28,22,16 + RR1 F2,D,E,A,B,C,22 + + // i=40 : W20,W16,W12,W8,W4,W0,W28,W24 + #undef K_XMM + #define K_XMM 32 +#if defined (__x86_64__) + W_PRECALC_32_79_0 
W20,W0,W28,W24 +#else + W_PRECALC_32_79_0_i386 W20,W0,W28,W24 +#endif + RR0 F2,B,C,D,E,A,24 + W_PRECALC_32_79_1 W8,W24 + RR1 F2,B,C,D,E,A,24 +#if defined (__x86_64__) + W_PRECALC_32_79_2 W24 +#else + W_PRECALC_32_79_2_i386 W24 +#endif + RR0 F2,E,A,B,C,D,26 + W_PRECALC_32_79_3 W24,26,K_XMM + RR1 F2,E,A,B,C,D,26 + + // i=44 : W16,W12,W8,W4,W0,W28,W24,W20 + W_PRECALC_32_79_0 W16,W28,W24,W20 + RR0 F2,C,D,E,A,B,28 + W_PRECALC_32_79_1 W4,W20 + RR1 F2,C,D,E,A,B,28 + W_PRECALC_32_79_2 W20 + RR0 F2,A,B,C,D,E,30 + W_PRECALC_32_79_3 W20,30,K_XMM + RR1 F2,A,B,C,D,E,30 + + // i=48 : W12,W8,W4,W0,W28,W24,W20,W16 + W_PRECALC_32_79_0 W12,W24,W20,W16 + RR0 F2,D,E,A,B,C,32 + W_PRECALC_32_79_1 W0,W16 + RR1 F2,D,E,A,B,C,32 + W_PRECALC_32_79_2 W16 + RR0 F2,B,C,D,E,A,34 + W_PRECALC_32_79_3 W16,34,K_XMM + RR1 F2,B,C,D,E,A,34 + + // i=52 : W8,W4,W0,W28,W24,W20,W16,W12 + W_PRECALC_32_79_0 W8,W20,W16,W12 + RR0 F2,E,A,B,C,D,36 + W_PRECALC_32_79_1 W28,W12 + RR1 F2,E,A,B,C,D,36 + W_PRECALC_32_79_2 W12 + RR0 F2,C,D,E,A,B,38 + W_PRECALC_32_79_3 W12,38,K_XMM + RR1 F2,C,D,E,A,B,38 + + // starting using F3 + + // i=56 : W4,W0,W28,W24,W20,W16,W12,W8 + W_PRECALC_32_79_0 W4,W16,W12,W8 + RR0 F3,A,B,C,D,E,40 + W_PRECALC_32_79_1 W24,W8 + RR1 F3,A,B,C,D,E,40 + W_PRECALC_32_79_2 W8 + RR0 F3,D,E,A,B,C,42 + W_PRECALC_32_79_3 W8,42,K_XMM + RR1 F3,D,E,A,B,C,42 + + // i=60 : W0,W28,W24,W20,W16,W12,W8,W4 + #undef K_XMM + #define K_XMM 48 + W_PRECALC_32_79_0 W0,W12,W8,W4 + RR0 F3,B,C,D,E,A,44 + W_PRECALC_32_79_1 W20,W4 + RR1 F3,B,C,D,E,A,44 + W_PRECALC_32_79_2 W4 + RR0 F3,E,A,B,C,D,46 + W_PRECALC_32_79_3 W4,46,K_XMM + RR1 F3,E,A,B,C,D,46 + + // i=64 : W28,W24,W20,W16,W12,W8,W4,W0 + W_PRECALC_32_79_0 W28,W8,W4,W0 + RR0 F3,C,D,E,A,B,48 + W_PRECALC_32_79_1 W16,W0 + RR1 F3,C,D,E,A,B,48 + W_PRECALC_32_79_2 W0 + RR0 F3,A,B,C,D,E,50 + W_PRECALC_32_79_3 W0,50,K_XMM + RR1 F3,A,B,C,D,E,50 + + // i=68 : W24,W20,W16,W12,W8,W4,W0,W28 +#if defined (__x86_64__) + W_PRECALC_32_79_0 W24,W4,W0,W28 +#else + W_PRECALC_32_79_0_i386 
W24,W4,W0,W28 +#endif + RR0 F3,D,E,A,B,C,52 + W_PRECALC_32_79_1 W12,W28 + RR1 F3,D,E,A,B,C,52 +#if defined (__x86_64__) + W_PRECALC_32_79_2 W28 +#else + W_PRECALC_32_79_2_i386 W28 +#endif + RR0 F3,B,C,D,E,A,54 + W_PRECALC_32_79_3 W28,54,K_XMM + RR1 F3,B,C,D,E,A,54 + + // i=72 : W20,W16,W12,W8,W4,W0,W28,W24 +#if defined (__x86_64__) + W_PRECALC_32_79_0 W20,W0,W28,W24 +#else + W_PRECALC_32_79_0_i386 W20,W0,W28,W24 +#endif + RR0 F3,E,A,B,C,D,56 + W_PRECALC_32_79_1 W8,W24 + RR1 F3,E,A,B,C,D,56 +#if defined (__x86_64__) + W_PRECALC_32_79_2 W24 +#else + W_PRECALC_32_79_2_i386 W24 +#endif + RR0 F3,C,D,E,A,B,58 + W_PRECALC_32_79_3 W24,58,K_XMM + RR1 F3,C,D,E,A,B,58 + + // starting using F4 + + // i=76 : W16,W12,W8,W4,W0,W28,W24,W20 + W_PRECALC_32_79_0 W16,W28,W24,W20 + RR0 F4,A,B,C,D,E,60 + W_PRECALC_32_79_1 W4,W20 + RR1 F4,A,B,C,D,E,60 + W_PRECALC_32_79_2 W20 + RR0 F4,D,E,A,B,C,62 + W_PRECALC_32_79_3 W20,62,K_XMM + RR1 F4,D,E,A,B,C,62 + + .endm + + .macro SOFTWARE_PIPELINING_ssse3 + // i=0 : W28,W24,W20,W16,W12,W8,W4,W0 + W_PRECALC_00_15_0 0 // W_TMP = (BUFFER_PTR) + RR0 F4,B,C,D,E,A,64 + W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP + RR1 F4,B,C,D,E,A,64 + W_PRECALC_00_15_2 // W_TMP = W0 + K + RR0 F4,E,A,B,C,D,66 + W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K + RR1 F4,E,A,B,C,D,66 + + // i=4 : W24,W20,W16,W12,W8,W4,W0,W28 + W_PRECALC_00_15_0 4 // W_TMP = 16(BUFFER_PTR) + RR0 F4,C,D,E,A,B,68 + W_PRECALC_00_15_1 W28 // convert W_TMP to big-endian, and save W28 = W_TMP + RR1 F4,C,D,E,A,B,68 + W_PRECALC_00_15_2 // W_TMP = W28 + K + RR0 F4,A,B,C,D,E,70 + W_PRECALC_00_15_3 7 // 16(sp) = W_TMP = W28 + K[0] + RR1 F4,A,B,C,D,E,70 + + // i=8 : W20,W16,W12,W8,W4,W0,W28,W24 + W_PRECALC_00_15_0 8 // W_TMP = 32(BUFFER_PTR) + RR0 F4,D,E,A,B,C,72 + W_PRECALC_00_15_1 W24 // convert W_TMP to big-endian, and save W24 = W_TMP + RR1 F4,D,E,A,B,C,72 + W_PRECALC_00_15_2 // W_TMP = W24 + K + RR0 F4,B,C,D,E,A,74 + W_PRECALC_00_15_3 11 // 32(sp) = W_TMP = W24 + K + RR1 
F4,B,C,D,E,A,74 + + // i=12 : W16,W12,W8,W4,W0,W28,W24,W20 + W_PRECALC_00_15_0 12 // W_TMP = 48(BUFFER_PTR) + RR0 F4,E,A,B,C,D,76 + W_PRECALC_00_15_1 W20 // convert W_TMP to big-endian, and save W20 = W_TMP + RR1 F4,E,A,B,C,D,76 + W_PRECALC_00_15_2 // W_TMP = W20 + K + RR0 F4,C,D,E,A,B,78 + W_PRECALC_00_15_3 15 // 48(sp) = W_TMP = W20 + K + RR1 F4,C,D,E,A,B,78 + .endm + + + #undef W_PRECALC_00_15_0 + #undef W_PRECALC_00_15_1 + #undef W_PRECALC_16_31_0 + #undef W_PRECALC_32_79_0 + #undef W_PRECALC_32_79_0_i386 + + + + /* + + The following are 3 macro definitions that are no-ssse3 variants of the previous 3 macro definitions. + + INITIAL_W_PRECALC_nossse3 + INTERNAL_nossse3 + SOFTWARE_PIPELINING_nossse3 + + They will be used in a sha1 code main body definition that will be used for system without ssse3 support. + + */ + + #define W_PRECALC_00_15_0 W_PRECALC_00_15_0_nossse3 + #define W_PRECALC_00_15_1 W_PRECALC_00_15_1_nossse3 + #define W_PRECALC_16_31_0 W_PRECALC_16_31_0_nossse3 + #define W_PRECALC_32_79_0 W_PRECALC_32_79_0_nossse3 + #define W_PRECALC_32_79_0_i386 W_PRECALC_32_79_0_i386_nossse3 + + + .macro INITIAL_W_PRECALC_nossse3 + + // i=0 : W28,W24,W20,W16,W12,W8,W4,W0 + W_PRECALC_00_15_0 0 // W_TMP = (BUFFER_PTR) + W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP + W_PRECALC_00_15_2 // W_TMP = W0 + K + W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K + + // i=4 : W24,W20,W16,W12,W8,W4,W0,W28 + W_PRECALC_00_15_0 4 // W_TMP = 16(BUFFER_PTR) + W_PRECALC_00_15_1 W28 // convert W_TMP to big-endian, and save W28 = W_TMP + W_PRECALC_00_15_2 // W_TMP = W28 + K + W_PRECALC_00_15_3 7 // 16(sp) = W_TMP = W28 + K + + // i=8 : W20,W16,W12,W8,W4,W0,W28,W24 + W_PRECALC_00_15_0 8 // W_TMP = 32(BUFFER_PTR) + W_PRECALC_00_15_1 W24 // convert W_TMP to big-endian, and save W24 = W_TMP + W_PRECALC_00_15_2 // W_TMP = W24 + K + W_PRECALC_00_15_3 11 // 32(sp) = W_TMP = W24 + K + + // i=12 : W16,W12,W8,W4,W0,W28,W24,W20 + W_PRECALC_00_15_0 12 // W_TMP = 
48(BUFFER_PTR) + W_PRECALC_00_15_1 W20 // convert W_TMP to big-endian, and save W20 = W_TMP + W_PRECALC_00_15_2 // W_TMP = W20 + K + W_PRECALC_00_15_3 15 // 48(sp) = W_TMP = W20 + K + + .endm + + + .macro INTERNAL_nossse3 + // i=16 + // circular buffer : W12,W8,W4,W0,W28,W24,W20,W16 + W_PRECALC_16_31_0 W0,W28,W24,W20,W16 + RR0 F1,A,B,C,D,E,0 + W_PRECALC_16_31_1 W0,W16 + RR1 F1,A,B,C,D,E,0 + W_PRECALC_16_31_2 W16 + RR0 F1,D,E,A,B,C,2 + W_PRECALC_16_31_3 W16, 2, 0 + RR1 F1,D,E,A,B,C,2 + + // i=20, + // W8,W4,W0,W28,W24,W20,W16,W12 + W_PRECALC_16_31_0 W28,W24,W20,W16,W12 + RR0 F1,B,C,D,E,A,4 + W_PRECALC_16_31_1 W28,W12 + RR1 F1,B,C,D,E,A,4 + + W_PRECALC_16_31_2 W12 + RR0 F1,E,A,B,C,D,6 + W_PRECALC_16_31_3 W12, 6, 16 + RR1 F1,E,A,B,C,D,6 + + // i=24, + // W4,W0,W28,W24,W20,W16,W12,W8 + W_PRECALC_16_31_0 W24,W20,W16,W12,W8 + RR0 F1,C,D,E,A,B,8 + W_PRECALC_16_31_1 W24,W8 + RR1 F1,C,D,E,A,B,8 + + W_PRECALC_16_31_2 W8 + RR0 F1,A,B,C,D,E,10 + W_PRECALC_16_31_3 W8,10,16 + RR1 F1,A,B,C,D,E,10 + + // i=28 + // W0,W28,W24,W20,W16,W12,W8,W4 + W_PRECALC_16_31_0 W20,W16,W12,W8,W4 + RR0 F1,D,E,A,B,C,12 + W_PRECALC_16_31_1 W20,W4 + RR1 F1,D,E,A,B,C,12 + + W_PRECALC_16_31_2 W4 + RR0 F1,B,C,D,E,A,14 + W_PRECALC_16_31_3 W4,14,16 + RR1 F1,B,C,D,E,A,14 + + //i=32 + // W28,W24,W20,W16,W12,W8,W4,W0 + W_PRECALC_32_79_0 W28,W8,W4,W0 + RR0 F1,E,A,B,C,D,16 + W_PRECALC_32_79_1 W16,W0 + RR1 F1,E,A,B,C,D,16 + W_PRECALC_32_79_2 W0 + RR0 F1,C,D,E,A,B,18 + W_PRECALC_32_79_3 W0,18,16 + RR1 F1,C,D,E,A,B,18 + + //i=36 + // W24,W20,W16,W12,W8,W4,W0,W28 +#if defined (__x86_64__) + W_PRECALC_32_79_0 W24,W4,W0,W28 +#else + W_PRECALC_32_79_0_i386 W24,W4,W0,W28 +#endif + RR0 F2,A,B,C,D,E,20 + W_PRECALC_32_79_1 W12,W28 + RR1 F2,A,B,C,D,E,20 +#if defined (__x86_64__) + W_PRECALC_32_79_2 W28 +#else + W_PRECALC_32_79_2_i386 W28 +#endif + RR0 F2,D,E,A,B,C,22 + W_PRECALC_32_79_3 W28,22,16 + RR1 F2,D,E,A,B,C,22 + + //i=40 + #undef K_XMM + #define K_XMM 32 + // W20,W16,W12,W8,W4,W0,W28,W24 +#if defined (__x86_64__) 
+ W_PRECALC_32_79_0 W20,W0,W28,W24 +#else + W_PRECALC_32_79_0_i386 W20,W0,W28,W24 +#endif + RR0 F2,B,C,D,E,A,24 + W_PRECALC_32_79_1 W8,W24 + RR1 F2,B,C,D,E,A,24 +#if defined (__x86_64__) + W_PRECALC_32_79_2 W24 +#else + W_PRECALC_32_79_2_i386 W24 +#endif + RR0 F2,E,A,B,C,D,26 + W_PRECALC_32_79_3 W24,26,K_XMM + RR1 F2,E,A,B,C,D,26 + + //i=44 + // W16,W12,W8,W4,W0,W28,W24,W20 + W_PRECALC_32_79_0 W16,W28,W24,W20 + RR0 F2,C,D,E,A,B,28 + W_PRECALC_32_79_1 W4,W20 + RR1 F2,C,D,E,A,B,28 + W_PRECALC_32_79_2 W20 + RR0 F2,A,B,C,D,E,30 + W_PRECALC_32_79_3 W20,30,K_XMM + RR1 F2,A,B,C,D,E,30 + + //i=48 + // W12,W8,W4,W0,W28,W24,W20,W16 + W_PRECALC_32_79_0 W12,W24,W20,W16 + RR0 F2,D,E,A,B,C,32 + W_PRECALC_32_79_1 W0,W16 + RR1 F2,D,E,A,B,C,32 + W_PRECALC_32_79_2 W16 + RR0 F2,B,C,D,E,A,34 + W_PRECALC_32_79_3 W16,34,K_XMM + RR1 F2,B,C,D,E,A,34 + + //i=52 + // W8,W4,W0,W28,W24,W20,W16,W12 + W_PRECALC_32_79_0 W8,W20,W16,W12 + RR0 F2,E,A,B,C,D,36 + W_PRECALC_32_79_1 W28,W12 + RR1 F2,E,A,B,C,D,36 + W_PRECALC_32_79_2 W12 + RR0 F2,C,D,E,A,B,38 + W_PRECALC_32_79_3 W12,38,K_XMM + RR1 F2,C,D,E,A,B,38 + + //i=56 + // W4,W0,W28,W24,W20,W16,W12,W8 + W_PRECALC_32_79_0 W4,W16,W12,W8 + RR0 F3,A,B,C,D,E,40 + W_PRECALC_32_79_1 W24,W8 + RR1 F3,A,B,C,D,E,40 + W_PRECALC_32_79_2 W8 + RR0 F3,D,E,A,B,C,42 + W_PRECALC_32_79_3 W8,42,K_XMM + RR1 F3,D,E,A,B,C,42 + + //i=60 + #undef K_XMM + #define K_XMM 48 + // W0,W28,W24,W20,W16,W12,W8,W4 + W_PRECALC_32_79_0 W0,W12,W8,W4 + RR0 F3,B,C,D,E,A,44 + W_PRECALC_32_79_1 W20,W4 + RR1 F3,B,C,D,E,A,44 + W_PRECALC_32_79_2 W4 + RR0 F3,E,A,B,C,D,46 + W_PRECALC_32_79_3 W4,46,K_XMM + RR1 F3,E,A,B,C,D,46 + + //i=64 + // W28,W24,W20,W16,W12,W8,W4,W0 + W_PRECALC_32_79_0 W28,W8,W4,W0 + RR0 F3,C,D,E,A,B,48 + W_PRECALC_32_79_1 W16,W0 + RR1 F3,C,D,E,A,B,48 + W_PRECALC_32_79_2 W0 + RR0 F3,A,B,C,D,E,50 + W_PRECALC_32_79_3 W0,50,K_XMM + RR1 F3,A,B,C,D,E,50 + + //i=68 + // W24,W20,W16,W12,W8,W4,W0,W28 +#if defined (__x86_64__) + W_PRECALC_32_79_0 W24,W4,W0,W28 +#else + 
W_PRECALC_32_79_0_i386 W24,W4,W0,W28 +#endif + RR0 F3,D,E,A,B,C,52 + W_PRECALC_32_79_1 W12,W28 + RR1 F3,D,E,A,B,C,52 +#if defined (__x86_64__) + W_PRECALC_32_79_2 W28 +#else + W_PRECALC_32_79_2_i386 W28 +#endif + RR0 F3,B,C,D,E,A,54 + W_PRECALC_32_79_3 W28,54,K_XMM + RR1 F3,B,C,D,E,A,54 + + //i=72 + // W20,W16,W12,W8,W4,W0,W28,W24 +#if defined (__x86_64__) + W_PRECALC_32_79_0 W20,W0,W28,W24 +#else + W_PRECALC_32_79_0_i386 W20,W0,W28,W24 +#endif + RR0 F3,E,A,B,C,D,56 + W_PRECALC_32_79_1 W8,W24 + RR1 F3,E,A,B,C,D,56 +#if defined (__x86_64__) + W_PRECALC_32_79_2 W24 +#else + W_PRECALC_32_79_2_i386 W24 +#endif + RR0 F3,C,D,E,A,B,58 + W_PRECALC_32_79_3 W24,58,K_XMM + RR1 F3,C,D,E,A,B,58 + + // starting using F4 + + //i=76 + // W16,W12,W8,W4,W0,W28,W24,W20 + W_PRECALC_32_79_0 W16,W28,W24,W20 + RR0 F4,A,B,C,D,E,60 + W_PRECALC_32_79_1 W4,W20 + RR1 F4,A,B,C,D,E,60 + W_PRECALC_32_79_2 W20 + RR0 F4,D,E,A,B,C,62 + W_PRECALC_32_79_3 W20,62,K_XMM + RR1 F4,D,E,A,B,C,62 + + .endm + + .macro SOFTWARE_PIPELINING_nossse3 + // i=0 : W28,W24,W20,W16,W12,W8,W4,W0 + W_PRECALC_00_15_0 0 // W_TMP = (BUFFER_PTR) + RR0 F4,B,C,D,E,A,64 + W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP + RR1 F4,B,C,D,E,A,64 + W_PRECALC_00_15_2 // W_TMP = W0 + K + RR0 F4,E,A,B,C,D,66 + W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K + RR1 F4,E,A,B,C,D,66 + + // i=4 : W24,W20,W16,W12,W8,W4,W0,W28 + W_PRECALC_00_15_0 4 // W_TMP = 16(BUFFER_PTR) + RR0 F4,C,D,E,A,B,68 + W_PRECALC_00_15_1 W28 // convert W_TMP to big-endian, and save W28 = W_TMP + RR1 F4,C,D,E,A,B,68 + W_PRECALC_00_15_2 // W_TMP = W28 + K + RR0 F4,A,B,C,D,E,70 + W_PRECALC_00_15_3 7 // 16(sp) = W_TMP = W28 + K[0] + RR1 F4,A,B,C,D,E,70 + + // i=8 : W20,W16,W12,W8,W4,W0,W28,W24 + W_PRECALC_00_15_0 8 // W_TMP = 32(BUFFER_PTR) + RR0 F4,D,E,A,B,C,72 + W_PRECALC_00_15_1 W24 // convert W_TMP to big-endian, and save W24 = W_TMP + RR1 F4,D,E,A,B,C,72 + W_PRECALC_00_15_2 // W_TMP = W24 + K + RR0 F4,B,C,D,E,A,74 + W_PRECALC_00_15_3 11 // 
32(sp) = W_TMP = W24 + K + RR1 F4,B,C,D,E,A,74 + + // i=12 : W16,W12,W8,W4,W0,W28,W24,W20 + W_PRECALC_00_15_0 12 // W_TMP = 48(BUFFER_PTR) + RR0 F4,E,A,B,C,D,76 + W_PRECALC_00_15_1 W20 // convert W_TMP to big-endian, and save W20 = W_TMP + RR1 F4,E,A,B,C,D,76 + W_PRECALC_00_15_2 // W_TMP = W20 + K + RR0 F4,C,D,E,A,B,78 + W_PRECALC_00_15_3 15 // 48(sp) = W_TMP = W20 + K + RR1 F4,C,D,E,A,B,78 + .endm + + .macro ENDING // finish up updating hash digests (i=64:79) + //i=80 + RR0 F4,B,C,D,E,A,64 + RR1 F4,B,C,D,E,A,64 + RR0 F4,E,A,B,C,D,66 + RR1 F4,E,A,B,C,D,66 + + //i=84 + RR0 F4,C,D,E,A,B,68 + RR1 F4,C,D,E,A,B,68 + RR0 F4,A,B,C,D,E,70 + RR1 F4,A,B,C,D,E,70 + + //i=88 + RR0 F4,D,E,A,B,C,72 + RR1 F4,D,E,A,B,C,72 + RR0 F4,B,C,D,E,A,74 + RR1 F4,B,C,D,E,A,74 + + //i=92 + RR0 F4,E,A,B,C,D,76 + RR1 F4,E,A,B,C,D,76 + RR0 F4,C,D,E,A,B,78 + RR1 F4,C,D,E,A,B,78 + .endm + + // load hash digests A,B,C,D,E from memory into registers + .macro LOAD_HASH +#if defined (__x86_64__) + mov (HASH_PTR), A + mov 4(HASH_PTR), B + mov 8(HASH_PTR), C + mov 12(HASH_PTR), D + mov 16(HASH_PTR), E +#else + mov HASH_PTR, T1 + mov (T1), A + mov 4(T1), B + mov 8(T1), C + mov 12(T1), D + mov 16(T1), E +#endif + .endm + + .macro UPDATE_HASH + add $0, $1 + mov $1, $0 + .endm + + .macro UPDATE_ALL_HASH +#if defined (__x86_64__) + UPDATE_HASH (HASH_PTR), A + UPDATE_HASH 4(HASH_PTR), B + UPDATE_HASH 8(HASH_PTR), C + UPDATE_HASH 12(HASH_PTR), D + UPDATE_HASH 16(HASH_PTR), E +#else + mov HASH_PTR, T1 + UPDATE_HASH (T1), A + UPDATE_HASH 4(T1), B + UPDATE_HASH 8(T1), C + UPDATE_HASH 12(T1), D + UPDATE_HASH 16(T1), E +#endif + .endm + + + /* + main sha1 code for system without ssse3 support + */ + + .macro SHA1_PIPELINED_MAIN_BODY_nossse3 + LOAD_HASH // load initial hashes into A,B,C,D,E (registers) + INITIAL_W_PRECALC_nossse3 // big_endian_load(W) and W+K (i=0:15) + .align 4,0x90 +0: + INTERNAL_nossse3 // update W (i=16:79) and update ABCDE (i=0:63) +#if Multiple_Blocks + add $$64, BUFFER_PTR // BUFFER_PTR+=64; 
+ sub $$1, cnt // pre-decrement cnt by 1 + jbe 1f // if cnt <= 0, branch to finish off + SOFTWARE_PIPELINING_nossse3 // update ABCDE (i=64:79) || big_endian_load(W) and W+K (i=0:15) + UPDATE_ALL_HASH // update output hashes + jmp 0b // repeat for next block + .align 4,0x90 +1: +#endif + ENDING // update ABCDE (i=64:79) + UPDATE_ALL_HASH // update output hashes + .endm + + /* + main sha1 code for system with ssse3 support + */ + + .macro SHA1_PIPELINED_MAIN_BODY_ssse3 + LOAD_HASH // load initial hashes into A,B,C,D,E + INITIAL_W_PRECALC_ssse3 // big_endian_load(W) and W+K (i=0:15) + .align 4,0x90 +0: + INTERNAL_ssse3 // update W (i=16:79) and update ABCDE (i=0:63) +#if Multiple_Blocks + add $$64, BUFFER_PTR // BUFFER_PTR+=64; + sub $$1, cnt // pre-decrement cnt by 1 + jbe 1f // if cnt <= 0, branch to finish off + SOFTWARE_PIPELINING_ssse3 // update ABCDE (i=64:79) || big_endian_load(W) and W+K (i=0:15) + UPDATE_ALL_HASH // update output hashes + jmp 0b // repeat for next block + .align 4,0x90 +1: +#endif + ENDING // update ABCDE (i=64:79) + UPDATE_ALL_HASH // update output hashes + .endm + +#include + + .text + + .globl _SHA1Transform + .private_extern _SHA1Transform +_SHA1Transform: + + // detect SSSE3 and dispatch appropriate code branch + #if defined __x86_64__ + movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities + mov (%rax), %eax // %eax = __cpu_capabilities + #else // i386 + #if defined KERNEL + leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities + mov (%eax), %eax // %eax = __cpu_capabilities + #else + mov _COMM_PAGE_CPU_CAPABILITIES, %eax + #endif + #endif + test $(kHasSupplementalSSE3), %eax + je _SHA1Transform_nossse3 // branch to no-ssse3 code + + + // start the sha1 code with ssse3 support + + // save callee-save registers +#if defined (__x86_64__) + push %rbx + push %rbp +#else + push %ebx + push %ebp + push %esi + push %edi +#endif + + sub $stack_size, sp // allocate stack memory for use + + // save used xmm register if 
this is for kernel +#if KERNEL + xmov %xmm0, 4*16(sp) + xmov %xmm1, 5*16(sp) + xmov %xmm2, 6*16(sp) + xmov %xmm3, 7*16(sp) + xmov %xmm4, 8*16(sp) + xmov %xmm5, 9*16(sp) + xmov %xmm6, 10*16(sp) + xmov %xmm7, 11*16(sp) +#if defined (__x86_64__) + xmov %xmm8, 12*16(sp) + xmov %xmm9, 13*16(sp) + xmov %xmm10, 14*16(sp) +#endif +#endif + +#if defined (__x86_64__) + + // set up registers to free %edx/%edi/%esi for other use (ABCDE) + mov ctx, HASH_PTR + mov buf, BUFFER_PTR +#if Multiple_Blocks + mov %rdx, cnt +#endif + lea K_XMM_AR(%rip), K_BASE + xmov 0x40(K_BASE), XMM_SHUFB_BSWAP + +#else // __i386__ + +#if KERNEL + lea K_XMM_AR, %eax +#else + // Get address of 0 in R. + call 0f // Push program counter onto stack. + 0: pop %eax // Get program counter. + lea K_XMM_AR-0b(%eax), %eax +#endif + mov %eax, K_BASE + xmov 0x40(%eax), %xmm0 + xmov %xmm0, XMM_SHUFB_BSWAP + +#endif + + SHA1_PIPELINED_MAIN_BODY_ssse3 + + // restore used xmm registers if this is for kernel +#if KERNEL + xmov 4*16(sp), %xmm0 + xmov 5*16(sp), %xmm1 + xmov 6*16(sp), %xmm2 + xmov 7*16(sp), %xmm3 + xmov 8*16(sp), %xmm4 + xmov 9*16(sp), %xmm5 + xmov 10*16(sp), %xmm6 + xmov 11*16(sp), %xmm7 +#if defined (__x86_64__) + xmov 12*16(sp), %xmm8 + xmov 13*16(sp), %xmm9 + xmov 14*16(sp), %xmm10 +#endif +#endif + + add $stack_size, sp // deallocate stack memory + + // restore callee-save registers +#if defined (__x86_64__) + pop %rbp + pop %rbx +#else + pop %edi + pop %esi + pop %ebp + pop %ebx +#endif + + ret // return + + // this is equivalent to the above function _SHA1Transform, but it does not use ssse3 instructions + + .globl _SHA1Transform_nossse3 + .private_extern _SHA1Transform_nossse3 +_SHA1Transform_nossse3: + + // push callee-save registers +#if defined (__x86_64__) + push %rbx + push %rbp +#else + push %ebx + push %ebp + push %esi + push %edi +#endif + + sub $stack_size, sp // allocate stack memory for local use + + // save used xmm registers if this is for kernel +#if KERNEL + xmov %xmm0, 4*16(sp) + 
xmov %xmm1, 5*16(sp) + xmov %xmm2, 6*16(sp) + xmov %xmm3, 7*16(sp) + xmov %xmm4, 8*16(sp) + xmov %xmm5, 9*16(sp) + xmov %xmm6, 10*16(sp) + xmov %xmm7, 11*16(sp) +#if defined (__x86_64__) + xmov %xmm8, 12*16(sp) + xmov %xmm9, 13*16(sp) +#endif +#endif + +#if defined (__x86_64__) + + // set up registers to free %edx/%edi/%esi for other use (ABCDE) + mov ctx, HASH_PTR + mov buf, BUFFER_PTR +#if Multiple_Blocks + mov %rdx, cnt +#endif + lea K_XMM_AR(%rip), K_BASE + +#else // __i386__ + +#if KERNEL + lea K_XMM_AR, %eax +#else + // Get address of 0 in R. + call 0f // Push program counter onto stack. + 0: pop %eax // Get program counter. + lea K_XMM_AR-0b(%eax), %eax +#endif + mov %eax, K_BASE + +#endif + + SHA1_PIPELINED_MAIN_BODY_nossse3 + + // restore used xmm registers if this is for kernel +#if KERNEL + xmov 4*16(sp), %xmm0 + xmov 5*16(sp), %xmm1 + xmov 6*16(sp), %xmm2 + xmov 7*16(sp), %xmm3 + xmov 8*16(sp), %xmm4 + xmov 9*16(sp), %xmm5 + xmov 10*16(sp), %xmm6 + xmov 11*16(sp), %xmm7 +#if defined (__x86_64__) + xmov 12*16(sp), %xmm8 + xmov 13*16(sp), %xmm9 +#endif +#endif + + add $stack_size, sp // deallocate stack memory + + // restore callee-save registers +#if defined (__x86_64__) + pop %rbp + pop %rbx +#else + pop %edi + pop %esi + pop %ebp + pop %ebx +#endif + + ret // return + + .const + .align 4, 0x90 + +#define K1 0x5a827999 +#define K2 0x6ed9eba1 +#define K3 0x8f1bbcdc +#define K4 0xca62c1d6 + +K_XMM_AR: + .long K1 + .long K1 + .long K1 + .long K1 + .long K2 + .long K2 + .long K2 + .long K2 + .long K3 + .long K3 + .long K3 + .long K3 + .long K4 + .long K4 + .long K4 + .long K4 +// bswap_shufb_ctl: invoked thru 0x40(K_XMM_AR) + .long 0x00010203 + .long 0x04050607 + .long 0x08090a0b + .long 0x0c0d0e0f + + + +#endif // architecture x86_64 or i386 diff --git a/libkern/crypto/sha1.c b/libkern/crypto/sha1.c index 69e9eec42..b85cbec96 100644 --- a/libkern/crypto/sha1.c +++ b/libkern/crypto/sha1.c @@ -57,6 +57,11 @@ #include #include #include +#define SHA1_TIMER 0 
// change to nonzero to write timing stamps to profile sha1transform + +#if SHA1_TIMER +#include +#endif #define memset(x, y, z) bzero(x, z); #define memcpy(x, y, z) bcopy(y, x, z) @@ -139,8 +144,11 @@ static unsigned char PADDING[64] = { 0x80, /* zeros */ }; (p) = ROTATE_LEFT(p, 1); \ } -static void SHA1Transform(u_int32_t, u_int32_t, u_int32_t, u_int32_t, - u_int32_t, const u_int8_t *, SHA1_CTX *); +#if (defined (__x86_64__) || defined (__i386__)) +extern void SHA1Transform(SHA1_CTX *, const u_int8_t *, u_int32_t Nblocks); +#else +static void SHA1Transform(SHA1_CTX *, const u_int8_t *); +#endif void _SHA1Update(SHA1_CTX *context, const void *inpp, size_t inputLen); @@ -199,19 +207,36 @@ _SHA1Update(SHA1_CTX *context, const void *inpp, size_t inputLen) if (inputLen >= partLen) { if (index != 0) { memcpy(&context->buffer[index], input, partLen); - SHA1Transform(context->state[0], context->state[1], - context->state[2], context->state[3], - context->state[4], context->buffer, context); +#if (defined (__x86_64__) || defined (__i386__)) + SHA1Transform(context, context->buffer, 1); +#else + SHA1Transform(context, context->buffer); +#endif i = partLen; } +#if SHA1_TIMER + KERNEL_DEBUG_CONSTANT(0xaa800004 | DBG_FUNC_START, 0, 0, 0, 0, 0); +#endif +#if (defined (__x86_64__) || defined (__i386__)) + { + int kk = (inputLen-i)>>6; + if (kk>0) { + SHA1Transform(context, &input[i], kk); + i += (kk<<6); + } + } +#else for (; i + 63 < inputLen; i += 64) - SHA1Transform(context->state[0], context->state[1], - context->state[2], context->state[3], - context->state[4], &input[i], context); + SHA1Transform(context, &input[i]); +#endif - if (inputLen == i) + if (inputLen == i) { +#if SHA1_TIMER + KERNEL_DEBUG_CONSTANT(0xaa800004 | DBG_FUNC_END, 0, 0, 0, 0, 0); +#endif return; + } index = 0; } @@ -358,14 +383,21 @@ SHA1Final(void *digest, SHA1_CTX *context) /* * SHA1 basic transformation. Transforms state based on block. 
*/ +#if !(defined (__x86_64__) || defined (__i386__)) static void -SHA1Transform(u_int32_t a, u_int32_t b, u_int32_t c, u_int32_t d, - u_int32_t e, const u_int8_t block[64], SHA1_CTX *context) +SHA1Transform(SHA1_CTX *context, const u_int8_t block[64]) { /* Register (instead of array) is a win in most cases */ + register u_int32_t a, b, c, d, e; register u_int32_t w0, w1, w2, w3, w4, w5, w6, w7; register u_int32_t w8, w9, w10, w11, w12, w13, w14, w15; + a = context->state[0]; + b = context->state[1]; + c = context->state[2]; + d = context->state[3]; + e = context->state[4]; + w15 = FETCH_32(block + 60); w14 = FETCH_32(block + 56); w13 = FETCH_32(block + 52); @@ -480,3 +512,4 @@ SHA1Transform(u_int32_t a, u_int32_t b, u_int32_t c, u_int32_t d, w15 = w14 = w13 = w12 = w11 = w10 = w9 = w8 = 0; w7 = w6 = w5 = w4 = w3 = w2 = w1 = w0 = 0; } +#endif diff --git a/libkern/gen/OSAtomicOperations.c b/libkern/gen/OSAtomicOperations.c index aeeb09364..3484791d5 100644 --- a/libkern/gen/OSAtomicOperations.c +++ b/libkern/gen/OSAtomicOperations.c @@ -54,8 +54,8 @@ enum { * Like standards, there are a lot of atomic ops to choose from! 
*/ -#if !defined(__ppc__) && !defined(__i386__) && !defined(__x86_64__) -/* Implemented in assembly for ppc and i386 and x86_64 */ +#if !defined(__i386__) && !defined(__x86_64__) +/* Implemented in assembly for i386 and x86_64 */ #undef OSAddAtomic SInt32 OSAddAtomic(SInt32 amount, volatile SInt32 * value) @@ -72,6 +72,7 @@ OSAddAtomic(SInt32 amount, volatile SInt32 * value) return oldValue; } +#undef OSAddAtomicLong long OSAddAtomicLong(long theAmount, volatile long *address) { @@ -82,7 +83,7 @@ OSAddAtomicLong(long theAmount, volatile long *address) #endif } -/* Implemented as an assembly alias for i386 and linker alias for ppc */ +/* Implemented as an assembly alias for i386 */ #undef OSCompareAndSwapPtr Boolean OSCompareAndSwapPtr(void *oldValue, void *newValue, void * volatile *address) @@ -97,9 +98,6 @@ Boolean OSCompareAndSwapPtr(void *oldValue, void *newValue, } #endif -#ifndef __ppc__ -/* Implemented as assembly for ppc */ - #undef OSIncrementAtomic SInt32 OSIncrementAtomic(volatile SInt32 * value) { @@ -111,7 +109,6 @@ SInt32 OSDecrementAtomic(volatile SInt32 * value) { return OSAddAtomic(-1, value); } -#endif /* !__ppc__ */ static UInt32 OSBitwiseAtomic(UInt32 and_mask, UInt32 or_mask, UInt32 xor_mask, volatile UInt32 * value) { diff --git a/libkern/gen/OSDebug.cpp b/libkern/gen/OSDebug.cpp index de1d99372..3e67cfff8 100644 --- a/libkern/gen/OSDebug.cpp +++ b/libkern/gen/OSDebug.cpp @@ -28,6 +28,7 @@ // NOTE: This file is only c++ so I can get static initialisers going #include +#include #include @@ -179,41 +180,23 @@ x86_64_validate_stackptr(vm_offset_t stackptr) } #endif +void +OSPrintBacktrace(void) +{ + void * btbuf[20]; + int tmp = OSBacktrace(btbuf, 20); + int i; + for(i=0;i> 2]; - if ((stackptr - stackptr_prev) > 8 * 1024) // Sanity check - break; - - vm_offset_t addr = mem[(stackptr >> 2) + 2]; - if ((addr & 3) || (addr < 0x8000)) // More sanity checks - break; - bt[i] = (void *) addr; - } - frame = i; - - for ( ; i < maxAddrs; i++) - bt[i] = 
(void *) 0; -#elif __i386__ +#if __i386__ #define SANE_i386_FRAME_SIZE (kernel_stack_size >> 1) vm_offset_t stackptr, stackptr_prev, raddr; unsigned frame_index = 0; diff --git a/libkern/kernel_mach_header.c b/libkern/kernel_mach_header.c index e0830d99d..0edc6b64d 100644 --- a/libkern/kernel_mach_header.c +++ b/libkern/kernel_mach_header.c @@ -68,6 +68,33 @@ getlastaddr(void) return last_addr; } +/* + * Find the UUID load command in the Mach-O headers, and return + * the address of the UUID blob and size in "*size". If the + * Mach-O image is missing a UUID, NULL is returned. + */ +void * +getuuidfromheader(kernel_mach_header_t *mhp, unsigned long *size) +{ + struct uuid_command *uuidp; + unsigned long i; + + uuidp = (struct uuid_command *) + ((uintptr_t)mhp + sizeof(kernel_mach_header_t)); + for(i = 0; i < mhp->ncmds; i++){ + if(uuidp->cmd == LC_UUID) { + if (size) + *size = sizeof(uuidp->uuid); + + return (void *)uuidp->uuid; + } + + uuidp = (struct uuid_command *)((uintptr_t)uuidp + uuidp->cmdsize); + } + + return NULL; +} + /* * This routine returns the a pointer to the data for the named section in the * named segment if it exist in the mach header passed to it. 
Also it returns diff --git a/libkern/kmod/Makefile.kmod b/libkern/kmod/Makefile.kmod index 0c35bdf4d..62ffd893b 100644 --- a/libkern/kmod/Makefile.kmod +++ b/libkern/kmod/Makefile.kmod @@ -18,7 +18,8 @@ LIB_INSTALL_FLAGS = -p -m 444 # -mkernel implies -mlong-branch/-mlong-calls/-mno-red-zone as needed for # code linked into kexts -CFLAGS_KMOD = $(filter-out -O0 -O1 -O2 -O3 -O4 -Os -Oz,$(CFLAGS)) \ +# -fno-stack-protector is necessary for the kernel, but not for kexts +CFLAGS_KMOD = $(filter-out -O0 -O1 -O2 -O3 -O4 -Os -Oz -freorder-blocks -flto -fno-stack-protector,$(CFLAGS)) \ -Os -mkernel -Wall ifneq ($(MACHINE_CONFIG), DEFAULT) @@ -39,8 +40,8 @@ KMODCPP_OFILES = $(KMODCPP_CFILES:.c=.o) ALL_OFILES = $(KMOD_OFILES) $(KMODCPP_OFILES) $(ALL_OFILES): %.o : %.c - @echo CC $@ - $(_v)${KCC} -c ${CFLAGS_KMOD} ${${join $@,_CFLAGS}} ${INCFLAGS} ${${join $@,_INCFLAGS}} -o $(COMPOBJROOT)/$(*F).o $< + @echo LIBKMOD_CC $@ + $(_v)${LIBKMOD_CC} -c ${CFLAGS_KMOD} ${${join $@,_CFLAGS}} ${INCFLAGS} ${${join $@,_INCFLAGS}} -o $(COMPOBJROOT)/$(*F).o $< $(COMPOBJROOT)/$(KMOD_NAME).a: $(KMOD_OFILES) @echo LIBTOOL $(notdir $@) @@ -59,7 +60,11 @@ $(INSTALL_DIR)/%.a: $(INSTOBJROOT)/%.a if [ $(MACHINE_CONFIG) = DEFAULT ]; then \ allarchs=""; \ for onearch in $(INSTALL_ARCHS); do \ - archdir=$(OBJROOT)/$(KERNEL_CONFIG)_$${onearch}/$(COMPONENT); \ + if [ $${onearch} = ARM ] ; then \ + archdir=$(OBJROOT)/$(KERNEL_CONFIG)_$${onearch}_$(DEFAULT_ARM_MACHINE_CONFIG)/$(COMPONENT); \ + else \ + archdir=$(OBJROOT)/$(KERNEL_CONFIG)_$${onearch}/$(COMPONENT); \ + fi; \ if [ -e $${archdir}/kmod/$(*F).a ]; then \ allarchs="$${allarchs} $${archdir}/kmod/$(*F).a"; \ fi; \ @@ -79,6 +84,11 @@ $(INSTALL_DIR)/%.a: $(INSTOBJROOT)/%.a my_aconfig=$${my_config}; \ else \ my_counter=1; \ + if [ $${my_aconfig} = ARM ] ; then \ + if [ $${my_config} = DEFAULT ] ; then \ + my_config=$(DEFAULT_ARM_MACHINE_CONFIG); \ + fi; \ + fi; \ 
inputfile=$(OBJROOT)/$${my_kconfig}_$${my_aconfig}_$${my_config}/$(COMPONENT)/kmod/$(*F).a; \ if [ -e $${inputfile} ]; then \ if [ $${my_innercounter} -eq 1 ]; then \ diff --git a/libkern/kmod/cplus_start.c b/libkern/kmod/cplus_start.c index 8ae7a0193..1a2f3b9a0 100644 --- a/libkern/kmod/cplus_start.c +++ b/libkern/kmod/cplus_start.c @@ -42,7 +42,12 @@ The linkline must look like this. *.o -lkmodc++ kmod_info.o -lkmod */ -#if __i386__ || __ppc__ + +/* The following preprocessor test must match exactly with the architectures + * that define the CONFIG_STATIC_CPPINIT config option. + */ +#if __i386__ + #include #include diff --git a/libkern/kmod/cplus_stop.c b/libkern/kmod/cplus_stop.c index b4ce5236a..2b2bdc688 100644 --- a/libkern/kmod/cplus_stop.c +++ b/libkern/kmod/cplus_stop.c @@ -42,7 +42,12 @@ The linkline must look like this. *.o -lkmodc++ kmod_info.o -lkmod */ -#if __i386__ || __ppc__ + +/* The following preprocessor test must match exactly with the architectures + * that define the CONFIG_STATIC_CPPINIT config option. 
+ */ +#if __i386__ + #include asm(".destructors_used = 0"); diff --git a/libkern/kxld/Makefile b/libkern/kxld/Makefile index 9bc3566c6..0e962487b 100644 --- a/libkern/kxld/Makefile +++ b/libkern/kxld/Makefile @@ -36,15 +36,16 @@ endif PRODUCT_TYPE ?= DYLIB HDRDST=$(DSTROOT)/usr/local/include -LIBDST=$(DSTROOT)/usr/lib/system +DYLIBDST=$(DSTROOT)/usr/lib/system ARCHIVEDST=$(DSTROOT)/usr/local/lib -LIBOBJ=$(OBJROOT)/libkxld.o LIBKXLD_DYLIB=libkxld.dylib LIBKXLD_ARCHIVE=libkxld.a -LIBKXLDNAME=/usr/lib/system/$(LIBKXLD_DYLIB) +LIBKXLD_INSTALLNAME=/usr/lib/system/$(LIBKXLD_DYLIB) LIBKXLDOBJ_DYLIB=$(OBJROOT)/$(LIBKXLD_DYLIB) LIBKXLDOBJ_ARCHIVE=$(OBJROOT)/$(LIBKXLD_ARCHIVE) -LIBKXLDDST_DYLIB=$(LIBDST)/$(LIBKXLD_DYLIB) +LIBKXLDSYM_DYLIB=$(SYMROOT)/$(LIBKXLD_DYLIB) +LIBKXLDSYM_ARCHIVE=$(SYMROOT)/$(LIBKXLD_ARCHIVE) +LIBKXLDDST_DYLIB=$(DYLIBDST)/$(LIBKXLD_DYLIB) LIBKXLDDST_ARCHIVE=$(ARCHIVEDST)/$(LIBKXLD_ARCHIVE) TESTSRC=$(SRCROOT)/tests TESTDST=./BUILD/tests @@ -55,15 +56,20 @@ CFLAGS=-std=c99 -Wall -Wextra -Werror -pedantic -Wformat=2 -Wcast-align \ -Wwrite-strings -Wshorten-64-to-32 -Wshadow -Winit-self -Wpointer-arith \ -Wno-format-y2k -W -Wstrict-prototypes -Wmissing-prototypes -Wreturn-type \ -Wcast-qual -Wwrite-strings -Wswitch -Wcast-align -Wbad-function-cast \ - -Wchar-subscripts -Winline -Wnested-externs -Wredundant-decls -g \ - -isysroot $(SDKROOT) -LDFLAGS=$(ARCHS) -dynamiclib -install_name $(LIBKXLDNAME) \ + -Wchar-subscripts -Winline -Wnested-externs -Wredundant-decls -g +LDFLAGS=$(ARCHS) -dynamiclib -install_name $(LIBKXLD_INSTALLNAME) \ -compatibility_version $(COMPATIBILITY_VERSION) \ - -current_version $(CURRENT_VERSION) -isysroot $(SDKROOT) -lstdc++ -INCLUDES=-I$(HDRSRC) $(INCFLAGS_EXTERN) + -current_version $(CURRENT_VERSION) -lstdc++ +INCLUDES=-I$(HDRSRC) + +ifneq ($(SDKROOT),/) + CFLAGS += -isysroot $(SDKROOT) + LDFLAGS += -isysroot $(SDKROOT) +endif # Tools CC = xcrun -sdk $(SDKROOT) cc +CLANG_ANALYZER = clang --analyze LIBTOOL = xcrun -sdk $(SDKROOT) 
libtool STRIP = xcrun -sdk $(SDKROOT) strip @@ -73,10 +79,11 @@ CFLAGS+=-Wno-cast-align endif # Files -HDR_NAMES=kxld.h kxld_types.h +HDR_NAMES=kxld.h kxld_types.h WKdm.h OBJ_NAMES=kxld.o kxld_array.o kxld_copyright.o kxld_demangle.o kxld_dict.o \ - kxld_kext.o kxld_reloc.o kxld_sect.o kxld_seg.o kxld_sym.o kxld_state.o \ - kxld_symtab.o kxld_util.o kxld_uuid.o kxld_vtable.o + kxld_kext.o kxld_object.o kxld_reloc.o kxld_sect.o kxld_seg.o \ + kxld_sym.o kxld_symtab.o kxld_util.o kxld_uuid.o kxld_vtable.o \ + WKdmCompress.o WKdmDecompress.o HDRS=$(addprefix $(HDRSRC)/, $(HDR_NAMES)) OBJS=$(addprefix $(OBJROOT)/, $(OBJ_NAMES)) @@ -108,11 +115,17 @@ profile: OPTIM=-Os -pg -dynamic profile: build tests: OPTIM=-O0 -DDEBUG -tests: kxld_dict_test copyrighttest +tests: kxld_array_test kxld_dict_test copyrighttest + +build: $(LIBKXLDSYM_$(PRODUCT_TYPE)) -build: $(LIBKXLDOBJ_$(PRODUCT_TYPE)) - @[ -d $(SYMROOT) ] || mkdir -p $(SYMROOT) - install -c -m 644 $< $(SYMROOT) +$(LIBKXLDSYM_DYLIB): $(LIBKXLDOBJ_DYLIB) + @mkdir -p $(SYMROOT) + install -c -m 644 $< $@ + +$(LIBKXLDSYM_ARCHIVE): $(LIBKXLDOBJ_ARCHIVE) + @mkdir -p $(SYMROOT) + install -c -m 644 $< $@ $(LIBKXLDOBJ_DYLIB): $(OBJS) $(CC) $(LDFLAGS) -o $@ $^ @@ -121,39 +134,56 @@ $(LIBKXLDOBJ_ARCHIVE): $(OBJS) $(LIBTOOL) -static -o $@ $^ installhdrs: - @[ -d $(HDRDST) ] || mkdir -p $(HDRDST) + @mkdir -p $(HDRDST) install -o 0 -g 0 -c -m 444 $(HDRS) $(HDRDST) install: release installhdrs $(LIBKXLDDST_$(PRODUCT_TYPE)) -$(LIBKXLDDST_DYLIB): - @[ -d $(LIBDST) ] || mkdir -p $(LIBDST) - install -o 0 -g 0 -c -m 555 $(SYMROOT)/$(LIBKXLD_DYLIB) $(LIBDST) +$(LIBKXLDDST_DYLIB): $(LIBKXLDSYM_DYLIB) + @mkdir -p $(DYLIBDST) + install -o 0 -g 0 -c -m 555 $< $@ $(STRIP) -S -x $@ -$(LIBKXLDDST_ARCHIVE): - @[ -d $(ARCHIVEDST) ] || mkdir -p $(ARCHIVEDST) - install -o 0 -g 0 -c -m 555 $(SYMROOT)/$(LIBKXLD_ARCHIVE) $(ARCHIVEDST) +$(LIBKXLDDST_ARCHIVE): $(LIBKXLDSYM_ARCHIVE) + @mkdir -p $(ARCHIVEDST) + install -o 0 -g 0 -c -m 555 $< $@ 
KEXTCOPYOBJS=$(OBJROOT)/kextcopyright.o $(OBJROOT)/kxld_copyright.o $(OBJROOT)/kxld_util.o -kextcopyright: $(KEXTCOPYOBJS) $(TESTDST) +kextcopyright: $(TESTDST)/kextcopyright +$(TESTDST)/kextcopyright: $(KEXTCOPYOBJS) + @mkdir -p $(TESTDST) $(CC) $(ARCHS) $(KEXTCOPYOBJS) -framework CoreFoundation -o $(OBJROOT)/kextcopyright - install -c -m 755 $(OBJROOT)/kextcopyright $(TESTDST) + install -c -m 755 $(OBJROOT)/kextcopyright $@ + +TESTOBJS=$(OBJROOT)/kxld_test.o $(OBJROOT)/kxld_util.o + +ARRAYOBJS=$(OBJROOT)/kxld_array_test.o $(OBJROOT)/kxld_array.o $(TESTOBJS) +kxld_array_test: $(TESTDST)/kxld_array_test +$(TESTDST)/kxld_array_test: $(ARRAYOBJS) + @mkdir -p $(TESTDST) + $(CC) $(ARCHS) $(ARRAYOBJS) -o $(OBJROOT)/kxld_array_test + install -c -m 755 $(OBJROOT)/kxld_array_test $@ -DICTOBJS=$(OBJROOT)/kxld_dict_test.o $(OBJROOT)/kxld_dict.o $(OBJROOT)/kxld_array.o $(OBJROOT)/kxld_util.o -kxld_dict_test: $(DICTOBJS) $(TESTDST) +DICTOBJS=$(OBJROOT)/kxld_dict_test.o $(OBJROOT)/kxld_dict.o $(OBJROOT)/kxld_array.o $(TESTOBJS) +kxld_dict_test: $(TESTDST)/kxld_dict_test +$(TESTDST)/kxld_dict_test: $(DICTOBJS) + @mkdir -p $(TESTDST) $(CC) $(ARCHS) $(DICTOBJS) -o $(OBJROOT)/kxld_dict_test - install -c -m 755 $(OBJROOT)/kxld_dict_test $(TESTDST) + install -c -m 755 $(OBJROOT)/kxld_dict_test $@ COPYTESTOBJS=$(OBJROOT)/kxld_copyright.o $(OBJROOT)/kxld_util.o copyrighttest: OPTIM+=-DTEST -copyrighttest: $(KEXTCOPYOBJS) $(TESTDST) +copyrighttest: $(TESTDST)/copyrighttest +$(TESTDST)/copyrighttest: $(COPYTESTOBJS) + @mkdir -p $(TESTDST) $(CC) $(ARCHS) $(COPYTESTOBJS) -framework CoreFoundation -framework IOKit -o $(OBJROOT)/copyrighttest - install -c -m 755 $(OBJROOT)/copyrighttest $(TESTDST) + install -c -m 755 $(OBJROOT)/copyrighttest $@ -$(TESTDST): - @[ -d $(TESTDST) ] || mkdir -p $(TESTDST) +analyze: + @$(CLANG_ANALYZER) *.c + @$(CLANG_ANALYZER) -I. 
tests/*.c + @rm -f *.plist clean: @rm -rf $(OBJROOT)/* diff --git a/iokit/Kernel/WKdmCompress.c b/libkern/kxld/WKdmCompress.c similarity index 97% rename from iokit/Kernel/WKdmCompress.c rename to libkern/kxld/WKdmCompress.c index c58477371..db2c5c05b 100644 --- a/iokit/Kernel/WKdmCompress.c +++ b/libkern/kxld/WKdmCompress.c @@ -245,7 +245,7 @@ WKdm_compress (WK_word* src_buf, */ { - unsigned int num_bytes_to_pack = next_qp - (char *) tempQPosArray; + unsigned int num_bytes_to_pack = (unsigned int)(next_qp - (char *) tempQPosArray); unsigned int num_packed_words = (num_bytes_to_pack + 7) >> 3; // ceil((double) num_bytes_to_pack / 8); unsigned int num_source_words = num_packed_words * 2; WK_word* endQPosArray = tempQPosArray + num_source_words; @@ -296,7 +296,7 @@ WKdm_compress (WK_word* src_buf, { unsigned int num_tenbits_to_pack = - next_low_bits - tempLowBitsArray; + (unsigned int)(next_low_bits - tempLowBitsArray); unsigned int num_packed_words = (num_tenbits_to_pack + 2) / 3; //ceil((double) num_tenbits_to_pack / 3); unsigned int num_source_words = num_packed_words * 3; WK_word* endLowBitsArray = tempLowBitsArray + num_source_words; @@ -324,5 +324,5 @@ WKdm_compress (WK_word* src_buf, } - return ((char *) boundary_tmp - (char *) dest_buf); + return (unsigned int)((char *) boundary_tmp - (char *) dest_buf); } diff --git a/iokit/Kernel/WKdmDecompress.c b/libkern/kxld/WKdmDecompress.c similarity index 100% rename from iokit/Kernel/WKdmDecompress.c rename to libkern/kxld/WKdmDecompress.c diff --git a/libkern/kxld/i386/WKdmCompress.s b/libkern/kxld/i386/WKdmCompress.s new file mode 100644 index 000000000..f7d98440c --- /dev/null +++ b/libkern/kxld/i386/WKdmCompress.s @@ -0,0 +1,597 @@ +// $Id: WKdmCompress.intel.s,v 1.1 2010/01/28 22:33:24 cclee Exp cclee $ +// +// This file contains i386 and x86_64 (no SSE) optimized implementation of WKdm Compressor. 
The function prototype is +// +// unsigned int WKdm_compress (WK_word* src_buf, WK_word* dest_buf, unsigned int num_input_words); +// +// The implementation assumes the input buffer is a memory page (4096 bytes or 1024 words), or something less than 4KB. +// +// The WKdm compression algorithm is briefly stated as follows: +// +// There is a dynamically updated dictionary of 16 words, each initialized with "1". +// +// the dictionary is indexed as follows, +// 0, x = input_word +// 1, hash_index = (x>>10)&255 +// 2, dict_location = (char*) dictionary + hashLookupTable[hash_index] +// 3, dict_word = *dict_location +// +// Sequentially, each input word is classified/tagged into one of 4 classes +// 0 : the input word is 0 +// 1 : the higher 22 bits of the input word are identical to the higher 22 bits of the dictionary word (hash table indexed) +// 2 : the above condition (higher 22 bits matched) is not met, a dictionary miss condition +// 3 : the input word exactly matches the word from the dictionary (hash table indexed) +// +// after each input word is classified, each tag is represented by 2 bits. Furthermore, for each class +// 0 : no further info is needed +// 1 : the dictionary word index is represented by 4 bits (8 packed into a word), +// the lower 10 bits are sent to the decompressor (3 packed into a word) +// 2 : the 32-bit word is sent to the decompressor +// 3 : the dictionary word index is represented by 4 bits (8 packed into a word) +// +// for classes 1 and 2, the input word is used to update the dictionary after it is classified/tagged +// +// the following implementation was started from compiling (gcc -O3) the original C code (WKdmCompress.c) +// and then subsequently improved and documented. 
+// For i386, it speeds up ~ 1.5 times +// For x86_64, it speeds up ~ 1.3 times +// +// cclee, 1/28/10 + +#if !(defined __i386__ || defined __x86_64__) + +typedef char DummyDefinition; + +#else // i386 or x86_64 architectures + +#if defined __i386__ // 32-bit implementation + + .text + .align 4,0x90 + +.globl _WKdm_compress +_WKdm_compress: + + pushl %ebp + movl %esp, %ebp + + pushl %edi + pushl %esi + pushl %ebx + + // allocate stack memory for local variables + + subl $6316, %esp + + leal _hashLookupTable, %ebx // hashTable + + movl 8(%ebp), %edx // %edx = src_buf + movl 12(%ebp), %esi // %esi = dest_buf + movl 16(%ebp), %eax // %eax = num_input_words + + leal -1112(%ebp), %ecx // tempTagsArray + movl %ecx, -6272(%ebp) // a copy of char* next_tag = (char *) tempTagsArray; + + leal -2136(%ebp), %ecx // tempQPosArray + movl %ecx, -6264(%ebp) // saved (char *) tempQPosArray, reloaded when packing the queue positions + movl %ecx, -6252(%ebp) // char* next_qp = (char *) tempQPosArray; the search loop updates it as 76(%esp) + + leal (%edx,%eax,4), %ecx // src_buf + num_input_words*4 + movl %ecx, -6244(%ebp) // end_of_input = src_buf + num_input_words; + + // PRELOAD_DICTIONARY; + movl $1, -88(%ebp) + movl $1, -84(%ebp) + movl $1, -80(%ebp) + movl $1, -76(%ebp) + movl $1, -72(%ebp) + movl $1, -68(%ebp) + movl $1, -64(%ebp) + movl $1, -60(%ebp) + movl $1, -56(%ebp) + movl $1, -52(%ebp) + movl $1, -48(%ebp) + movl $1, -44(%ebp) + movl $1, -40(%ebp) + movl $1, -36(%ebp) + movl $1, -32(%ebp) + movl $1, -28(%ebp) + + shrl $4, %eax // (num_input_words / 16) + leal 16(%esi,%eax,4), %eax // dest_buf + [TAGS_AREA_OFFSET + (num_input_words / 16)]*4 + movl %eax, -6256(%ebp) // next_full_patt = dest_buf + TAGS_AREA_OFFSET + (num_input_words / 16); + + leal -6232(%ebp), %eax // &tempLowBitsArray[0] + movl %eax, -6260(%ebp) // save a copy of &tempLowBitsArray[0] + movl %eax, -6248(%ebp) // next_low_bits = &tempLowBitsArray[0]; the search loop updates it as 80(%esp) + + cmpl %ecx, %edx // next_input_word (%edx) vs end_of_input (%ecx) + jae L_done_search // if (next_input_word >= end_of_input) skip the following search loop + + leal 
-1111(%ebp), %esi // &next_tag[1] + leal -88(%ebp), %ebp // dictionary + + movl %edx, %edi // next_input_word + + #define next_input_word %edi + #define dictionary %ebp + #define next_tag %esi + + jmp L5 + + .align 4,0x90 +L_RECORD_ZERO: + movb $0, -1(next_tag) // *next_tag = ZERO; +L8: + addl $4, next_input_word // next_input_word++; + incl next_tag // next_tag++ + cmpl next_input_word, 84(%esp) // end_of_input vs next_input_word + jbe L_done_search // if (next_input_word>=end_of_input), skip to L_done_search +L5: + movl (next_input_word), %ecx // input_word = *next_input_word; + movl %ecx, %eax // a copy of input_word + testl %ecx, %ecx // input_word + je L_RECORD_ZERO // if (input_word==0) RECORD_ZERO + shrl $10, %eax // input_high_bits = HIGH_BITS(input_word); + movl %eax, (%esp) // save a copy of input_high_bits; + andl $255, %eax // 8 bits index to Hash Table + movsbl (%ebx,%eax),%edx // HASH_TO_DICT_BYTE_OFFSET(input_word) + addl dictionary, %edx // ((char*) dictionary) + HASH_TO_DICT_BYTE_OFFSET(input_word)); + movl (%edx), %eax // dict_word = *dict_location; + cmpl %eax, %ecx // cmp input_word vs dict_word + je L_RECORD_EXACT // if (input_word == dict_word) RECORD_EXACT + shrl $10, %eax // HIGH_BITS(dict_word) + cmpl %eax, (%esp) // input_high_bits vs HIGH_BITS(dict_word) + je L_RECORD_PARTIAL // if (input_high_bits == HIGH_BITS(dict_word)) RECORD_PARTIAL + +L_RECORD_MISS: + movb $2, -1(next_tag) // *next_tag = 2 for miss + movl 72(%esp), %eax // next_full_patt + movl %ecx, (%eax) // *next_full_patt = input_word; + addl $4, %eax // next_full_patt++; + movl %eax, 72(%esp) // save next_full_patt + movl %ecx, (%edx) // *dict_location = input_word + jmp L8 + + .align 4,0x90 +L_RECORD_EXACT: + movb $3, -1(next_tag) // *next_tag = 3 for exact + subl dictionary, %edx // dict_location - dictionary + sarl $2, %edx // divide by 4 for word offset + movl 76(%esp), %eax // next_qp + movb %dl, (%eax) // *next_qp = word offset (4-bit) + incl %eax // next_qp++ + movl %eax, 76(%esp) // save next_qp + jmp L8 + 
+L_done_search: + + // restore %ebp as normal use (was used as dictionary) + movl %esp, %ebp + addl $6328, %ebp // 6328 = 6316 bytes of locals plus 12 bytes of pushed registers + + // SET_QPOS_AREA_START(dest_buf,next_full_patt); + movl -6256(%ebp), %edi // next_full_patt + subl 12(%ebp), %edi // next_full_patt - dest_buf + movl %edi, %eax // next_full_patt - dest_buf + sarl $2, %eax // in 4-byte words + movl %eax, -6240(%ebp) // save (next_full_patt - dest_buf) in words + movl 12(%ebp), %edx // dest_buf + movl %eax, 4(%edx) // dest_buf[1] = next_full_patt - dest_buf + + movl -6272(%ebp), %ecx // &tempTagsArray[0] + decl next_tag // the search loop kept next_tag one byte ahead of the tag slot it writes + cmpl next_tag, %ecx // next_tag vs &tempTagsArray[0] + jae L13 // if &tempTagsArray[0] >= next_tag, skip the following WK_pack_2bits + + movl %edx, %ebx // a copy of dest_buf + + // boundary_tmp = WK_pack_2bits(tempTagsArray, (WK_word *) next_tag, dest_buf + HEADER_SIZE_IN_WORDS); + + .align 4,0x90 +L_WK_pack_2bits: + movl 4(%ecx), %eax // w1 + sall $2, %eax // w1 << 2 + movl 8(%ecx), %edx // w2 + sall $4, %edx // w2 << 4 + orl %edx, %eax // (w1<<2) | (w2<<4) + orl (%ecx), %eax // (w0) | (w1<<2) | (w2<<4) + movl 12(%ecx), %edx // w3 + sall $6, %edx // (w3<<6) + orl %edx, %eax // (w0) | (w1<<2) | (w2<<4) | (w3<<6) + movl %eax, 16(%ebx) // save at *(dest_buf + HEADER_SIZE_IN_WORDS) + addl $16, %ecx // advance the tag read pointer by 16 bytes (16 tags) + addl $4, %ebx // advance the destination by 4 bytes (one packed word) + cmpl %ecx, next_tag // cmp next_tag vs the tag read pointer (%ecx) + ja L_WK_pack_2bits // if (next_tag > tag read pointer) repeat L_WK_pack_2bits + + /* Pack the queue positions into the area just after the full words. 
*/ +L13: + movl -6252(%ebp), %eax // next_qp + movl -6264(%ebp), %ecx // (char *) tempQPosArray + movl %eax, %esi // next_qp + subl %ecx, %eax // num_bytes_to_pack = next_qp - (char *) tempQPosArray; + addl $7, %eax // num_bytes_to_pack + 7 + andl $-8, %eax // clear lower 3 bits, (num_packed_words<<3) + addl %eax, %ecx // endQPosArray = tempQPosArray + num_source_words; + cmpl %ecx, %esi // next_qp vs endQPosArray + jae L16 // if (next_qp >= endQPosArray) skip the zero padding + .align 4,0x90 +L30: + movb $0, (%esi) // *next_qp = 0; + incl %esi // next_qp++ + cmpl %ecx, %esi // next_qp vs endQPosArray + jne L30 // repeat until next_qp == endQPosArray + +L16: + movl -6256(%ebp), %ebx // next_full_patt + cmpl -6264(%ebp), %ecx // endQPosArray vs tempQPosArray + jbe L20 // if (endQPosArray<=tempQPosArray) skip L_WK_pack_4bits + movl -6264(%ebp), %edx // tempQPosArray + + + // boundary_tmp = WK_pack_4bits(tempQPosArray, endQPosArray, next_full_patt); + + .align 4,0x90 +L21: + movl 4(%edx), %eax // src_next[1] + sall $4, %eax // (src_next[1] << 4) + orl (%edx), %eax // temp = src_next[0] | (src_next[1] << 4) + movl %eax, (%ebx) // dest_next[0] = temp; + addl $4, %ebx // dest_next++; + addl $8, %edx // src_next += 2; + cmpl %edx, %ecx // source_end vs src_next + ja L21 // while (src_next < source_end) repeat the loop + + movl %ebx, %edi // boundary_tmp + + subl 12(%ebp), %edi // boundary_tmp - dest_buf + movl %edi, %eax // boundary_tmp - dest_buf + sarl $2, %eax // translate into word offset + + movl %eax, -6240(%ebp) // save (boundary_tmp - dest_buf) in words, replacing the earlier next_full_patt offset + +L20: + // SET_LOW_BITS_AREA_START(dest_buf,boundary_tmp); + movl -6240(%ebp), %ecx // boundary_tmp - dest_buf + movl 12(%ebp), %edx // dest_buf + movl %ecx, 8(%edx) // dest_buf[2] = boundary_tmp - dest_buf + + movl -6260(%ebp), %ecx // tempLowBitsArray + movl -6248(%ebp), %edx // next_low_bits + subl %ecx, %edx // next_low_bits - tempLowBitsArray + sarl $2, %edx // num_tenbits_to_pack + + subl $3, %edx // pre-decrement num_tenbits_to_pack by 3 + jl 1f // if num_tenbits_to_pack < 3, skip the 
following loop + .align 4,0x90 +0: + movl 4(%ecx), %eax // w1 + sall $10, %eax // w1<<10 + movl 8(%ecx), %esi // w2 + sall $20, %esi // w2<<20 + orl %esi, %eax // (w1<<10) | (w2<<20) + orl (%ecx), %eax // (w0) | (w1<<10) | (w2<<20) + movl %eax, (%ebx) // pack w0,w1,w2 into 1 dest_buf word + addl $4, %ebx // dest_buf++ + addl $12, %ecx // next w0/w1/w2 triplet + subl $3, %edx // num_tenbits_to_pack-=3 + jge 0b // if no less than 3 elements, back to loop head + +1: addl $3, %edx // post-increment num_tenbits_to_pack by 3 + je 3f // if num_tenbits_to_pack is a multiple of 3, skip the following + movl (%ecx), %eax // w0 + subl $1, %edx // num_tenbits_to_pack -- + je 2f // only w0 is left over, write it alone + movl 4(%ecx), %esi // w1 + sall $10, %esi // w1<<10 + orl %esi, %eax // w0 | (w1<<10) +2: + movl %eax, (%ebx) // write the final dest_buf word + addl $4, %ebx // dest_buf++ +3: + movl %ebx, %eax // boundary_tmp + subl 12(%ebp), %eax // boundary_tmp - dest_buf + sarl $2, %eax // boundary_tmp - dest_buf in terms of words + movl 12(%ebp), %esi // dest_buf + movl %eax, 12(%esi) // SET_LOW_BITS_AREA_END(dest_buf,boundary_tmp); + sall $2, %eax // boundary_tmp - dest_buf in terms of bytes + addl $6316, %esp // pop out stack memory + popl %ebx + popl %esi + popl %edi + leave + ret + + .align 4,0x90 + +L_RECORD_PARTIAL: + movb $1, -1(next_tag) // *next_tag = 1 for partial matched + movl %edx, %eax // dict_location + subl dictionary, %eax // %eax = dict_location - dictionary + movl %ecx, (%edx) // *dict_location = input_word; + sarl $2, %eax // offset in 32-bit word + movl 76(%esp), %edx // next_qp + movb %al, (%edx) // update *next_qp + incl %edx // next_qp++ + movl %edx, 76(%esp) // save next_qp + movl %ecx, %eax // a copy of input_word + andl $1023, %eax // lower 10 bits + movl 80(%esp), %edx // next_low_bits + movl %eax, (%edx) // EMIT_WORD(next_low_bits,(low_bits_pattern)) + addl $4, %edx // next_low_bits++ + movl %edx, 80(%esp) // save next_low_bits + jmp L8 + +#endif // i386 architectures + +#if defined __x86_64__ // 
64-bit implementation + .text + .align 4,0x90 + +.globl _WKdm_compress +_WKdm_compress: + pushq %rbp + movq %rsp, %rbp + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbx + subq $6112, %rsp // NOTE(review): locals below reach -6272(%rbp) while %rsp ends at -6152(%rbp) (5 pushes and 6112 bytes); presumably relies on the x86_64 red zone, confirm + + #define tempTagsArray -6264(%rbp) + #define tempLowBitsArray -6272(%rbp) + #define next_tag %r8 + #define next_input_word %rdi + #define end_of_input %r13 + #define next_full_patt %rbx + #define dict_location %rcx + #define next_qp %r10 + #define dictionary %r11 + #define dest_buf %r12 + #define hashTable %r14 + #define tempQPosArray %r15 + #define next_low_bits %rsi + + movq %rsi, %r12 // dest_buf + + leaq -1136(%rbp), %rax // &tempTagsArray[0] + movq %rax, tempTagsArray // keep &tempTagsArray[0] in its stack slot for the packing stage + leaq 1(%rax), next_tag // next_tag always points to the one following the current tag + + leaq -2160(%rbp), %r15 // &tempQPosArray[0] + movq %r15, next_qp // next_qp + + mov %edx, %eax // num_input_words + leaq (%rdi,%rax,4), end_of_input // end_of_input = src_buf + num_input_words + + // PRELOAD_DICTIONARY; + movl $1, -112(%rbp) + movl $1, -108(%rbp) + movl $1, -104(%rbp) + movl $1, -100(%rbp) + movl $1, -96(%rbp) + movl $1, -92(%rbp) + movl $1, -88(%rbp) + movl $1, -84(%rbp) + movl $1, -80(%rbp) + movl $1, -76(%rbp) + movl $1, -72(%rbp) + movl $1, -68(%rbp) + movl $1, -64(%rbp) + movl $1, -60(%rbp) + movl $1, -56(%rbp) + movl $1, -52(%rbp) + + shrl $4, %edx // (num_input_words / 16) + mov %edx, %edx // zero extension into quad word + leaq 16(%rsi,%rdx,4), %rbx // dest_buf + [TAGS_AREA_OFFSET + (num_input_words / 16)]*4 + + leaq -6256(%rbp), %rax // &tempLowBitsArray[0] + movq %rax, tempLowBitsArray // save for later reference + movq %rax, next_low_bits // next_low_bits + + cmpq end_of_input, next_input_word // next_input_word vs end_of_input + jae L_done_search // if (next_input_word>=end_of_input) no work to do in search + leaq -112(%rbp), dictionary // dictionary + leaq _hashLookupTable(%rip), hashTable // hash look up table + jmp L5 + + .align 4,0x90 +L_RECORD_ZERO: + movb $0, 
-1(next_tag) // *next_tag = ZERO; +L8: + addq $4, next_input_word // next_input_word++; + incq next_tag // next_tag++ + cmpq next_input_word, end_of_input // end_of_input vs next_input_word + jbe L_done_search // if (next_input_word>=end_of_input) the search is done +L5: + movl (next_input_word), %edx // input_word = *next_input_word; + movl %edx, %r9d // a copy of input_word + testl %edx, %edx // input_word + je L_RECORD_ZERO // if (input_word==0) RECORD_ZERO + shrl $10, %r9d // input_high_bits = HIGH_BITS(input_word); + movzbl %r9b, %eax // 8-bit index to the Hash Table + movsbq (hashTable,%rax),%rax // HASH_TO_DICT_BYTE_OFFSET(input_word) + leaq (dictionary, %rax), dict_location // ((char*) dictionary) + HASH_TO_DICT_BYTE_OFFSET(input_word)); + movl (dict_location), %eax // dict_word = *dict_location; + cmpl %eax, %edx // dict_word vs input_word + je L_RECORD_EXACT // if identical, RECORD_EXACT + shrl $10, %eax // HIGH_BITS(dict_word) + cmpl %eax, %r9d // input_high_bits vs HIGH_BITS(dict_word) + je L_RECORD_PARTIAL // if identical, RECORD_PARTIAL + +L_RECORD_MISS: + movb $2, -1(next_tag) // *next_tag = 2 for miss + movl %edx, (next_full_patt) // *next_full_patt = input_word; + addq $4, next_full_patt // next_full_patt++ + movl %edx, (dict_location) // *dict_location = input_word + addq $4, next_input_word // next_input_word++ + incq next_tag // next_tag++ + cmpq next_input_word, end_of_input // end_of_input vs next_input_word + ja L5 // if (end_of_input>next_input_word) repeat from L5 + +L_done_search: + + // SET_QPOS_AREA_START(dest_buf,next_full_patt); + //movq next_full_patt, %r11 // next_full_patt + movq next_full_patt, %rax // next_full_patt + subq dest_buf, %rax // next_full_patt - dest_buf + sarq $2, %rax // offset in 4-bytes + movl %eax, %r13d // r13d = (next_full_patt - dest_buf) + movl %eax, 4(dest_buf) // dest_buf[1] = next_full_patt - dest_buf + + decq next_tag // next_tag was kept one byte ahead of the tag slot being written + cmpq next_tag, tempTagsArray // &tempTagsArray[0] vs next_tag + jae L13 // if (&tempTagsArray[0] >= next_tag), skip the following + + // 
boundary_tmp = WK_pack_2bits(tempTagsArray, (WK_word *) next_tag, dest_buf + HEADER_SIZE_IN_WORDS); + + movq dest_buf, %rdi // dest_buf + movq tempTagsArray, %rcx // &tempTagsArray[0] + + .align 4,0x90 +L_pack_2bits: + movl 4(%rcx), %eax // w1 + sall $2, %eax // w1 << 2 + movl 8(%rcx), %edx // w2 + sall $4, %edx // w2 << 4 + orl %edx, %eax // (w1<<2) | (w2<<4) + orl (%rcx), %eax // (w0) | (w1<<2) | (w2<<4) + movl 12(%rcx), %edx // w3 + sall $6, %edx // w3 << 6 + orl %edx, %eax // (w0) | (w1<<2) | (w2<<4) | (w3<<6) + movl %eax, 16(%rdi) // save at *(dest_buf + HEADER_SIZE_IN_WORDS) + addq $16, %rcx // advance the tag read pointer by 16 bytes (16 tags) + addq $4, %rdi // advance the destination by 4 bytes (one packed word) + cmpq %rcx, next_tag // cmp next_tag vs the tag read pointer (%rcx) + ja L_pack_2bits // if (next_tag > tag read pointer) repeat L_pack_2bits + + /* Pack the queue positions into the area just after the full words. */ + +L13: + movl %r10d, %eax // next_qp + subl %r15d, %eax // num_bytes_to_pack = next_qp - (char *) tempQPosArray; + addl $7, %eax // num_bytes_to_pack+7 + shrl $3, %eax // num_packed_words = (num_bytes_to_pack + 7) >> 3 + addl %eax, %eax // num_source_words = num_packed_words * 2; + mov %eax, %eax // zero-extend num_source_words into %rax + leaq (tempQPosArray,%rax,4), %rcx // endQPosArray = tempQPosArray + num_source_words + cmpq %rcx, %r10 // next_qp vs endQPosArray + jae L16 // if (next_qp >= endQPosArray) skip the following zero paddings + .align 4,0x90 +L30: + movb $0, (next_qp) // *next_qp = 0 + incq next_qp // next_qp++ + cmpq %rcx, next_qp // next_qp vs endQPosArray + jne L30 // repeat until next_qp == endQPosArray +L16: + movq %rbx, %rdi // next_full_patt + cmpq tempQPosArray, %rcx // endQPosArray vs tempQPosArray + jbe L20 // if (endQPosArray <= tempQPosArray) skip the following + movq tempQPosArray, %rdx // tempQPosArray + + .align 4,0x90 +L_pack_4bits: + movl 4(%rdx), %eax // src_next[1] + sall $4, %eax // (src_next[1] << 4) + orl (%rdx), %eax // temp = src_next[0] | (src_next[1] << 4) + movl %eax, (%rdi) // dest_next[0] = temp; + addq $4, %rdi // 
dest_next++; + addq $8, %rdx // src_next += 2; + cmpq %rdx, %rcx // source_end vs src_next + ja L_pack_4bits // while (src_next < source_end) repeat the loop + + // SET_LOW_BITS_AREA_START(dest_buf,boundary_tmp); + //movq %rdi, %r11 // boundary_tmp + movq %rdi, %rax // boundary_tmp + subq dest_buf, %rax // boundary_tmp - dest_buf + movq %rax, %r13 // boundary_tmp - dest_buf + shrq $2, %r13 // boundary_tmp - dest_buf in words +L20: + movl %r13d, 8(dest_buf) // dest_buf[2] = boundary_tmp - dest_buf + + movq tempLowBitsArray, %rcx // tempLowBitsArray + movq next_low_bits, %rbx // next_low_bits + subq %rcx, %rbx // next_low_bits - tempLowBitsArray (in bytes) + sarq $2, %rbx // num_tenbits_to_pack (in words) + + #define size %ebx + + subl $3, size // pre-decrement num_tenbits_to_pack by 3 + jl 1f // if num_tenbits_to_pack < 3, skip the following loop + + .align 4,0x90 +0: + movl 4(%rcx), %eax // w1 + sall $10, %eax // w1 << 10 + movl 8(%rcx), %edx // w2 + sall $20, %edx // w2 << 20 + orl %edx, %eax // (w1<<10) | (w2<<20) + orl (%rcx), %eax // (w0) | (w1<<10) | (w2<<20) + movl %eax, (%rdi) // pack w0,w1,w2 into 1 dest_buf word + addq $4, %rdi // dest_buf++ + addq $12, %rcx // next w0/w1/w2 triplet + subl $3, size // num_tenbits_to_pack-=3 + jge 0b // if no less than 3 elements, back to loop head + +1: addl $3, size // post-increment num_tenbits_to_pack by 3 + je 3f // if num_tenbits_to_pack is a multiple of 3, skip the following + movl (%rcx), %eax // w0 + subl $1, size // num_tenbits_to_pack-- + je 2f // only w0 is left over, write it alone + movl 4(%rcx), %edx // w1 + sall $10, %edx // w1 << 10 + orl %edx, %eax // w0 | (w1<<10) + +2: movl %eax, (%rdi) // write the final dest_buf word + addq $4, %rdi // dest_buf++ + +3: movq %rdi, %rax // boundary_tmp + subq dest_buf, %rax // boundary_tmp - dest_buf + shrq $2, %rax // boundary_tmp - dest_buf in terms of words + movl %eax, 12(dest_buf) // SET_LOW_BITS_AREA_END(dest_buf,boundary_tmp) + shlq $2, %rax // boundary_tmp - dest_buf in terms of bytes + + // restore 
registers and return + addq $6112, %rsp + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + leave + ret + + .align 4,0x90 +L_RECORD_EXACT: + movb $3, -1(next_tag) // *next_tag = 3 for exact + subq dictionary, %rcx // dict_location - dictionary + sarq $2, %rcx // divide by 4 for word offset + movb %cl, (next_qp) // *next_qp = word offset (4-bit) + incq next_qp // next_qp++ + jmp L8 + + .align 4,0x90 +L_RECORD_PARTIAL: + movb $1, -1(next_tag) // *next_tag = 1 for partial matched + movq %rcx, %rax // dict_location + subq dictionary, %rax // dict_location - dictionary + movl %edx, (%rcx) // *dict_location = input_word; + sarq $2, %rax // offset in 32-bit word + movb %al, (next_qp) // update *next_qp + incq next_qp // next_qp++ + andl $1023, %edx // lower 10 bits + movl %edx, (next_low_bits) // *next_low_bits = lower 10 bits of input_word + addq $4, next_low_bits // next_low_bits++ + jmp L8 + + // for some reason, keeping the following never executed code yields a better performance +L41: + leaq -6256(%rbp), %rax + movq %rax, -6272(%rbp) + movq %rax, %rsi + jmp L_done_search +#endif // x86_64 architectures +#endif // i386 or x86_64 architectures diff --git a/libkern/kxld/i386/WKdmDecompress.s b/libkern/kxld/i386/WKdmDecompress.s new file mode 100644 index 000000000..c2e6e9345 --- /dev/null +++ b/libkern/kxld/i386/WKdmDecompress.s @@ -0,0 +1,675 @@ +// $Id: WKdmDecompress.intel.s,v 1.1 2010/01/30 00:39:21 cclee Exp cclee $ + +// This file contains i386 and x86_64 (no SSE) optimized implementation of WKdm Decompressor. +// The implementation is derived by compiling (gcc -O3) the original C code (WKdmDecompress.c) +// followed by hand tweaking of the compiled assembly code. 
+// cclee, 1/29/10 + +#if defined __i386__ + .text + .align 4,0x90 + + .globl _WKdm_decompress +_WKdm_decompress: + + // save registers, set up base pointer %ebp, and allocate stack memory for local variables + + pushl %ebp + movl %esp, %ebp + pushl %edi + pushl %esi + pushl %ebx + subl $7324, %esp + + // PRELOAD_DICTIONARY; dictionary starting address : -88(%ebp) + movl $1, -88(%ebp) + movl $1, -84(%ebp) + movl $1, -80(%ebp) + movl $1, -76(%ebp) + movl $1, -72(%ebp) + movl $1, -68(%ebp) + movl $1, -64(%ebp) + movl $1, -60(%ebp) + movl $1, -56(%ebp) + movl $1, -52(%ebp) + movl $1, -48(%ebp) + movl $1, -44(%ebp) + movl $1, -40(%ebp) + movl $1, -36(%ebp) + movl $1, -32(%ebp) + movl $1, -28(%ebp) + + #define dictionary_addr -88(%ebp) + #define TAGS_AREA_END -7292(%ebp) + #define tempTagsArray -7300(%ebp) + #define tempQPosArray -2488(%ebp) + #define tempLowBitsArray -7288(%ebp) + #define next_low_bits -7296(%ebp) + #define dictionary -7308(%ebp) + #define tag_area_end -7304(%ebp) + + // WK_unpack_2bits(TAGS_AREA_START(src_buf), TAGS_AREA_END(src_buf), tempTagsArray); + + movl 8(%ebp), %eax // src_buf + addl $272, %eax // src_buf + 16 (WKdm Header) + 256 (Tags) + movl %eax, TAGS_AREA_END // TAGS_AREA_END(src_buf) + movl 8(%ebp), %eax // src_buf + movl %eax, %edi // src_buf + addl $16, %eax // TAGS_AREA_START(src_buf) = src_buf + 16 (WKdm Header) + leal -1288(%ebp), %edx // tempTagsArray + movl %edx, tempTagsArray // save a copy of tempTagsArray[] at the said location + cmpl %eax, TAGS_AREA_END // TAGS_AREA_END vs TAGS_AREA_START + jbe 1f // if TAGS_AREA_END<=TAGS_AREA_START, no need for WK_unpack_2bits + movl %edx, %ecx // %ecx -> tempTagsArray[0] + xorl %esi, %esi // i=0 + movl $50529027, %ebx // 0x03030303, mask to extract 4 2-bit tags + .align 4,0x90 +L_WK_unpack_2bits: + movl 16(%edi,%esi,4), %edx // src_buf[i] for 16 tags, 16 (WKdm header) + movl %edx, %eax // w = src_buf[i] + andl %ebx, %eax // 1st 4 tags, each in bytes + movl %eax, (%ecx) // save 1st 4 tags + 
movl %edx, %eax // w = src_buf[i] + shrl $2, %eax // shift down 2 bits + andl %ebx, %eax // 2nd 4 tags, each in bytes + movl %eax, 4(%ecx) // save 2nd 4 tags + shrl $4, %edx // shift down w by 4 bits + movl %edx, %eax // w>>4 + andl %ebx, %eax // 3rd 4 tags + movl %eax, 8(%ecx) // save 3rd 4 tags + shrl $2, %edx // w>>6 + andl %ebx, %edx // 4th 4 tags + movl %edx, 12(%ecx) // save 4th 4 tags + addl $16, %ecx // point to next tempTagsArray[i*16] + incl %esi // i++ + cmpl $64, %esi // i vs 64 + jne L_WK_unpack_2bits // repeat the loop until i==64 +1: + + // WK_unpack_4bits(QPOS_AREA_START(src_buf), QPOS_AREA_END(src_buf), tempQPosArray); + + movl 8(%edi), %eax // WKdm header qpos end + leal (%edi,%eax,4), %esi // QPOS_AREA_END + movl 4(%edi), %eax // WKdm header qpos start + leal (%edi,%eax,4), %ecx // QPOS_AREA_START + cmpl %ecx, %esi // QPOS_AREA_END vs QPOS_AREA_START + jbe 1f // if QPOS_AREA_END <= QPOS_AREA_START, skip WK_unpack_4bits + leal tempQPosArray, %edi // tempQPosArray + movl $252645135, %ebx // 0x0f0f0f0f : mask to extract 4 4-bit qpos +L_WK_unpack_4bits: + movl (%ecx), %eax // w + movl %eax, %edx // w + andl %ebx, %edx // 1st 4 qpos + movl %edx, (%edi) // save 1st 4 qpos + shrl $4, %eax // w>>4 + andl %ebx, %eax // 2nd 4 qpos + movl %eax, 4(%edi) // save 2nd 4 qpos + addl $4, %ecx // point to next word w + addl $8, %edi // qpos += 8 + cmpl %ecx, %esi // QPOS_AREA_END vs qpos_pointer + ja L_WK_unpack_4bits // repeat until qpos_pointer >= QPOS_AREA_END + + // WK_unpack_3_tenbits(LOW_BITS_AREA_START(src_buf), LOW_BITS_AREA_END(src_buf), tempLowBitsArray); + +1: + movl 8(%ebp), %edx // src_buf + movl 12(%edx), %eax // LOW_BITS_AREA_END offset + leal (%edx,%eax,4), %edi // LOW_BITS_AREA_END + cmpl %edi, %esi // LOW_BITS_AREA_START(=QPOS_AREA_END) vs LOW_BITS_AREA_END + jae 1f // if (LOW_BITS_AREA_START>=LOW_BITS_AREA_END) skip unpack_3_tenbits + leal tempLowBitsArray, %ecx // tempLowBitsArray + movl $1023, %ebx // 0x03ff to extract lower 10-bits + + .align 
4,0x90 +L_WK_unpack_3_tenbits: + movl (%esi), %eax // w = *next_low_bits + movl %eax, %edx // w + andl %ebx, %edx // 1st 10-bit + movl %edx, (%ecx) // save 1st 10-bit + shrl $10, %eax // (w>>10) + movl %eax, %edx // (w>>10) + andl %ebx, %edx // 2nd 10-bit + movl %edx, 4(%ecx) // save 2nd 10-bit + shrl $10, %eax // (w>>20), no need to and with mask, the top 2 bits should be zero + movl %eax, 8(%ecx) // save 3rd 10-bits + addl $4, %esi // point to next w + addl $12, %ecx // tempLowBitsArray += 3; + cmpl %esi, %edi // LOW_BITS_AREA_END vs next_low_bits + ja L_WK_unpack_3_tenbits // repeat until next_low_bits>=LOW_BITS_AREA_END +1: + call Lhash +Lhash: + popl %ebx // %ebx = address of Lhash, the base used for the Hash Table lookup + + #define next_tag %esi + #define next_qpos %edi + + movl tempTagsArray, next_tag // next_tag = tempTagsArray + leal tempQPosArray, next_qpos // next_qpos = tempQPosArray + movl 12(%ebp), %ecx // dest_buf + addl $4, %ecx // for some reason, performance is better if we point to the next one + leal tempLowBitsArray, %eax // tempLowBitsArray + movl %eax, next_low_bits // next_low_bits = tempLowBitsArray; + leal -264(%ebp), %edx // 1024 bytes past the start of the tags array at -1288(%ebp) + movl %edx, tag_area_end // tag_area_end + leal dictionary_addr, %eax // dictionary starting address + movl %eax, dictionary // dictionary + jmp L11 + .align 4,0x90 +L29: + jle L_ZERO_TAG // flags come from the cmpb $1 at L11; tag != 1 here, so this is taken when tag < 1 + cmpb $2, %al // MISS_TAG + je L_MISS_TAG +L_EXACT_TAG: + movsbl (next_qpos),%eax // qpos = *next_qpos + incl next_qpos // next_qpos++ + movl dictionary, %edx // dictionary + movl (%edx,%eax,4), %eax // w = dictionary[qpos] + movl %eax, -4(%ecx) // *dest_buf = w + .align 4,0x90 +L_next: + incl next_tag // next_tag++ + addl $4, %ecx // dest_buf++ + cmpl tag_area_end, next_tag // next_tag vs tag_area_end + jae L_done // if (next_tag>=tag_area_end) +L11: + movzbl (next_tag), %eax // tag = *next_tag + cmpb $1, %al // Partial match? 
+ jne L29 +L_PARTIAL_TAG: + movsbl (next_qpos),%edx // qpos = *next_qpos + movl dictionary, %eax // dictionary + leal (%eax,%edx,4), %edx // dict_location = &dictionary[qpos] + movl %edx, -7324(%ebp) // save dict_location to release %edx + incl next_qpos // next_qpos++ + movl (%edx), %eax // read dictionary word + andl $-1024, %eax // keep only higher 22-bits + movl next_low_bits, %edx // %edx = next_low_bits (pointer) + orl (%edx), %eax // construct the new partially matched word + addl $4, %edx // advance to the next low-bits word + movl %edx, next_low_bits // next_low_bits++ + movl -7324(%ebp), %edx // dict_location + movl %eax, (%edx) // update *dict_location with the newly constructed word + movl %eax, -4(%ecx) // *dest_buf = the newly constructed word + incl next_tag // next_tag++ + addl $4, %ecx // dest_buf++ + cmpl tag_area_end, next_tag // next_tag vs tag_area_end + jb L11 // if next_tag < tag_area_end, repeat the loop +L_done: + + // release stack memory, restore registers, and return + addl $7324, %esp + popl %ebx + popl %esi + popl %edi + leave + ret + + #define next_full_patt -7292(%ebp) /* next_full_patt starts with initial value of TAGS_AREA_END */ + + .align 4,0x90 +L_MISS_TAG: + movl next_full_patt, %edx // next_full_patt + movl (%edx), %eax // word = *next_full_patt + addl $4, %edx // next_full_patt++ + movl %edx, next_full_patt // save next_full_patt + movl %eax, %edx // word + shrl $10, %edx // word>>10 + andl $255, %edx // 8-bit hash table index + movsbl _hashLookupTable-Lhash(%ebx,%edx),%edx // dictionary byte offset for this hash index + movl %eax, -88(%ebp,%edx) // store word at that byte offset into the dictionary + movl %eax, -4(%ecx) // *dest_buf = word + jmp L_next // repeat the loop + + .align 4,0x90 +L_ZERO_TAG: + movl $0, -4(%ecx) // *dest_buf = 0 + jmp L_next // repeat the loop + +#endif // __i386__ + +#if defined __x86_64__ + + + .text + .align 4,0x90 + + .globl _WKdm_decompress +_WKdm_decompress: + + // save registers, and allocate stack memory for local variables + + pushq %rbp + movq %rsp, %rbp + pushq %r12 + pushq %rbx + subq 
$7144, %rsp + + movq %rsi, %r12 // dest_buf + + // PRELOAD_DICTIONARY; dictionary starting address : starting address -80(%rbp) + movl $1, -80(%rbp) + movl $1, -76(%rbp) + movl $1, -72(%rbp) + movl $1, -68(%rbp) + movl $1, -64(%rbp) + movl $1, -60(%rbp) + movl $1, -56(%rbp) + movl $1, -52(%rbp) + movl $1, -48(%rbp) + movl $1, -44(%rbp) + movl $1, -40(%rbp) + movl $1, -36(%rbp) + movl $1, -32(%rbp) + movl $1, -28(%rbp) + movl $1, -24(%rbp) + movl $1, -20(%rbp) + + // WK_unpack_2bits(TAGS_AREA_START(src_buf), TAGS_AREA_END(src_buf), tempTagsArray); + leaq 272(%rdi), %r10 // TAGS_AREA_END + leaq 16(%rdi), %rax // TAGS_AREA_START + leaq -1280(%rbp), %rsi // tempTagsArray + cmpq %rax, %r10 // TAGS_AREA_END vs TAGS_AREA_START + jbe 1f // if TAGS_AREA_END <= TAGS_AREA_START, skip L_WK_unpack_2bits + movq %rsi, %rcx // next_word + xorl %r8d, %r8d // i = 0 + .align 4,0x90 +L_WK_unpack_2bits: + movl 16(%rdi,%r8,4), %edx // w = *next_word + movl %edx, %eax // w + andl $50529027, %eax // 1st 4 tags + movl %eax, (%rcx) // write 1st 4 tags + movl %edx, %eax // w + shrl $2, %eax // w>>2 + andl $50529027, %eax // 2nd 4 tags + movl %eax, 4(%rcx) // write 2nd 4 tags + shrl $4, %edx // w>>4 + movl %edx, %eax // w>>4 + andl $50529027, %eax // 3rd 4 tags + movl %eax, 8(%rcx) // write 3rd 4 tags + shrl $2, %edx // w>>6 + andl $50529027, %edx // 4th 4 tags + movl %edx, 12(%rcx) // write 4th 4 tags + addq $16, %rcx // next_tags += 16 + incq %r8 // i++ + cmpq $64, %r8 // i vs 64 + jne L_WK_unpack_2bits // repeat loop until i==64 +1: + + // WK_unpack_4bits(QPOS_AREA_START(src_buf), QPOS_AREA_END(src_buf), tempQPosArray); + + mov 8(%rdi), %eax // WKdm header qpos end + leaq (%rdi,%rax,4), %r9 // QPOS_AREA_END + mov 4(%rdi), %eax // WKdm header qpos start + leaq (%rdi,%rax,4), %r8 // QPOS_AREA_START + leaq -2480(%rbp), %rbx // tempQPosArray + cmpq %r8, %r9 // QPOS_AREA_END vs QPOS_AREA_START + jbe 1f // if QPOS_AREA_END <= QPOS_AREA_START, skip L_WK_unpack_4bits + leaq 8(%rbx), %rcx // 
next_qpos +L_WK_unpack_4bits: + movl (%r8), %eax // w = *next_word + movl %eax, %edx // w + andl $252645135, %edx // 1st 4 qpos + movl %edx, -8(%rcx) // write 1st 4 qpos + shrl $4, %eax // w>>4 + andl $252645135, %eax // 2nd 4 qpos + movl %eax, -4(%rcx) // write 2nd 4 qpos + addq $4, %r8 // next_word++ + addq $8, %rcx // next_qpos+=8 + cmpq %r8, %r9 // QPOS_AREA_END vs QPOS_AREA_START + ja L_WK_unpack_4bits // repeat loop until QPOS_AREA_END <= QPOS_AREA_START +1: + + // WK_unpack_3_tenbits(LOW_BITS_AREA_START(src_buf), LOW_BITS_AREA_END(src_buf), tempLowBitsArray); + + mov 12(%rdi), %eax // LOW_BITS_AREA_END offset + leaq (%rdi,%rax,4), %rdi // LOW_BITS_AREA_END + leaq -7280(%rbp), %r11 // tempLowBitsArray + cmpq %rdi, %r9 // LOW_BITS_AREA_START vs LOW_BITS_AREA_END + jae 1f // if START>=END, skip L_WK_unpack_3_tenbits + leaq 12(%r11), %rcx // next_low_bits +L_WK_unpack_3_tenbits: + movl (%r9), %eax // w = *next_word + movl %eax, %edx // w + andl $1023, %edx // 1st tenbits + movl %edx, -12(%rcx) // write 1st tenbits + shrl $10, %eax // w >> 10 + movl %eax, %edx // w >> 10 + andl $1023, %edx // 2nd tenbits + movl %edx, -8(%rcx) // write 2nd tenbits + shrl $10, %eax // w >> 20, 3rd tenbits + movl %eax, -4(%rcx) // write 3rd tenbits + addq $4, %r9 // next_word++ + addq $12, %rcx // next_low_bits += 3 + cmpq %r9, %rdi // LOW_BITS_AREA_END vs next_word + ja L_WK_unpack_3_tenbits // repeat loop if LOW_BITS_AREA_END > next_word +1: + movq %rsi, %rdi // next_tag + movq %rbx, %r8 // next_qpos + leaq 4(%r12), %rcx // dest_buf + movq %r11, %r9 // next_low_bits + leaq -80(%rbp), %r11 // dictionary + leaq _hashLookupTable(%rip), %rbx // hash look up table + leaq 1024(%rsi), %rsi // tag_area_end + + jmp L11 + .align 4,0x90 +L31: + jle L_ZERO_TAG + cmpb $2, %al // MISS_TAG + je L_MISS_TAG +L_EXACT_TAG: + movsbq (%r8),%rax // qpos = *next_qpos + incq %r8 // next_qpos++ + movl (%r11,%rax,4), %eax // w = dictionary[qpos] + movl %eax, -4(%rcx) // *dest_buf = w + .align 4,0x90 
+L_next: + incq %rdi // next_tag++ + addq $4, %rcx // dest_buf++ + cmpq %rsi, %rdi // next_tag vs tag_area_end + jae L_done // if next_tag >= tag_area_end, we're done +L11: + movzbl (%rdi), %eax // tag = *next_tag + cmpb $1, %al // partial match tag ? + jne L31 +L_PARTIAL_TAG: + movsbq (%r8),%rdx // qpos = *next_qpos + leaq (%r11,%rdx,4), %rdx // dict_location = &dictionary[qpos] + incq %r8 // next_qpos++ + movl (%rdx), %eax // read dictionary word + andl $-1024, %eax // clear lower 10 bits + orl (%r9), %eax // pad the lower 10-bits from *next_low_bits + addq $4, %r9 // next_low_bits++ + movl %eax, (%rdx) // *dict_location = newly formed word + movl %eax, -4(%rcx) // *dest_buf = newly formed word + cmpq %rsi, %rdi // compare next_tag vs tag_area_end + jne L_next // repeat loop until next_tag==tag_area_end +L_done: + + // release stack memory, restore registers, and return + addq $7144, %rsp + popq %rbx + popq %r12 + leave + ret + + .align 4,0x90 +L_MISS_TAG: + movl (%r10), %eax // w = *next_full_patt + addq $4, %r10 // next_full_patt++ + movl %eax, %edx // w + shrl $10, %edx // w>>10 + movzbl %dl, %edx // 8-bit hash table index + movsbq (%rbx,%rdx),%rdx // qpos + movl %eax, -80(%rbp,%rdx) // dictionary[qpos] = word + movl %eax, -4(%rcx) // *dest_buf = word + jmp L_next // repeat the loop + + .align 4,0x90 +L_ZERO_TAG: + movl $0, -4(%rcx) // *dest_buf = 0 + jmp L_next // repeat the loop + +#endif // --X86_64__ + +.globl _hashLookupTable + .const + .align 5 +_hashLookupTable: + .byte 0 + .byte 52 + .byte 8 + .byte 56 + .byte 16 + .byte 12 + .byte 28 + .byte 20 + .byte 4 + .byte 36 + .byte 48 + .byte 24 + .byte 44 + .byte 40 + .byte 32 + .byte 60 + .byte 8 + .byte 12 + .byte 28 + .byte 20 + .byte 4 + .byte 60 + .byte 16 + .byte 36 + .byte 24 + .byte 48 + .byte 44 + .byte 32 + .byte 52 + .byte 56 + .byte 40 + .byte 12 + .byte 8 + .byte 48 + .byte 16 + .byte 52 + .byte 60 + .byte 28 + .byte 56 + .byte 32 + .byte 20 + .byte 24 + .byte 36 + .byte 40 + .byte 44 + .byte 4 + 
.byte 8 + .byte 40 + .byte 60 + .byte 32 + .byte 20 + .byte 44 + .byte 4 + .byte 36 + .byte 52 + .byte 24 + .byte 16 + .byte 56 + .byte 48 + .byte 12 + .byte 28 + .byte 16 + .byte 8 + .byte 40 + .byte 36 + .byte 28 + .byte 32 + .byte 12 + .byte 4 + .byte 44 + .byte 52 + .byte 20 + .byte 24 + .byte 48 + .byte 60 + .byte 56 + .byte 40 + .byte 48 + .byte 8 + .byte 32 + .byte 28 + .byte 36 + .byte 4 + .byte 44 + .byte 20 + .byte 56 + .byte 60 + .byte 24 + .byte 52 + .byte 16 + .byte 12 + .byte 12 + .byte 4 + .byte 48 + .byte 20 + .byte 8 + .byte 52 + .byte 16 + .byte 60 + .byte 24 + .byte 36 + .byte 44 + .byte 28 + .byte 56 + .byte 40 + .byte 32 + .byte 36 + .byte 20 + .byte 24 + .byte 60 + .byte 40 + .byte 44 + .byte 52 + .byte 16 + .byte 32 + .byte 4 + .byte 48 + .byte 8 + .byte 28 + .byte 56 + .byte 12 + .byte 28 + .byte 32 + .byte 40 + .byte 52 + .byte 36 + .byte 16 + .byte 20 + .byte 48 + .byte 8 + .byte 4 + .byte 60 + .byte 24 + .byte 56 + .byte 44 + .byte 12 + .byte 8 + .byte 36 + .byte 24 + .byte 28 + .byte 16 + .byte 60 + .byte 20 + .byte 56 + .byte 32 + .byte 40 + .byte 48 + .byte 12 + .byte 4 + .byte 44 + .byte 52 + .byte 44 + .byte 40 + .byte 12 + .byte 56 + .byte 8 + .byte 36 + .byte 24 + .byte 60 + .byte 28 + .byte 48 + .byte 4 + .byte 32 + .byte 20 + .byte 16 + .byte 52 + .byte 60 + .byte 12 + .byte 24 + .byte 36 + .byte 8 + .byte 4 + .byte 16 + .byte 56 + .byte 48 + .byte 44 + .byte 40 + .byte 52 + .byte 32 + .byte 20 + .byte 28 + .byte 32 + .byte 12 + .byte 36 + .byte 28 + .byte 24 + .byte 56 + .byte 40 + .byte 16 + .byte 52 + .byte 44 + .byte 4 + .byte 20 + .byte 60 + .byte 8 + .byte 48 + .byte 48 + .byte 52 + .byte 12 + .byte 20 + .byte 32 + .byte 44 + .byte 36 + .byte 28 + .byte 4 + .byte 40 + .byte 24 + .byte 8 + .byte 56 + .byte 60 + .byte 16 + .byte 36 + .byte 32 + .byte 8 + .byte 40 + .byte 4 + .byte 52 + .byte 24 + .byte 44 + .byte 20 + .byte 12 + .byte 28 + .byte 48 + .byte 56 + .byte 16 + .byte 60 + .byte 4 + .byte 52 + .byte 60 + .byte 48 + 
.byte 20 + .byte 16 + .byte 56 + .byte 44 + .byte 24 + .byte 8 + .byte 40 + .byte 12 + .byte 32 + .byte 28 + .byte 36 + .byte 24 + .byte 32 + .byte 12 + .byte 4 + .byte 20 + .byte 16 + .byte 60 + .byte 36 + .byte 28 + .byte 8 + .byte 52 + .byte 40 + .byte 48 + .byte 44 + .byte 56 diff --git a/libkern/kxld/kxld.c b/libkern/kxld/kxld.c index 3d9de9588..ada1cf3cf 100644 --- a/libkern/kxld/kxld.c +++ b/libkern/kxld/kxld.c @@ -43,7 +43,7 @@ #include "kxld_array.h" #include "kxld_dict.h" #include "kxld_kext.h" -#include "kxld_state.h" +#include "kxld_object.h" #include "kxld_sym.h" #include "kxld_symtab.h" #include "kxld_util.h" @@ -54,11 +54,12 @@ struct kxld_vtable; struct kxld_context { KXLDKext *kext; KXLDArray *section_order; - KXLDArray deps; - KXLDArray tmps; - KXLDDict defined_symbols; - KXLDDict obsolete_symbols; - KXLDDict vtables; + KXLDArray objects; + KXLDArray dependencies; + KXLDDict defined_symbols_by_name; + KXLDDict defined_cxx_symbols_by_value; + KXLDDict obsolete_symbols_by_name; + KXLDDict vtables_by_name; KXLDFlags flags; KXLDAllocateCallback allocate_callback; cpu_type_t cputype; @@ -88,6 +89,14 @@ static KXLDDict *s_order_dict; * Prototypes *******************************************************************************/ +static kern_return_t init_context(KXLDContext *context, u_int ndependencies); +static kern_return_t init_kext_objects(KXLDContext *context, u_char *file, + u_long size, const char *name, KXLDDependency *dependencies, + u_int ndependencies); +static KXLDObject * get_object_for_file(KXLDContext *context, + u_char *file, u_long size, const char *name); +static u_char * allocate_kext(KXLDContext *context, void *callback_data, + kxld_addr_t *vmaddr, u_long *vmsize, u_char **linked_object_alloc_out); static void clear_context(KXLDContext *context); /******************************************************************************* @@ -98,10 +107,10 @@ kxld_create_context(KXLDContext **_context, KXLDFlags flags, cpu_type_t cputype, 
cpu_subtype_t cpusubtype) { kern_return_t rval = KERN_FAILURE; - KXLDContext *context = NULL; - KXLDArray *section_order = NULL; + KXLDContext * context = NULL; + KXLDArray * section_order = NULL; #if !KERNEL - cpu_type_t *cputype_p = NULL; + cpu_type_t * cputype_p = NULL; #endif check(_context); @@ -177,7 +186,7 @@ kxld_create_context(KXLDContext **_context, context = NULL; finish: - if (context) kxld_free(context, sizeof(*context)); + if (context) kxld_destroy_context(context); if (section_order) kxld_free(section_order, sizeof(*section_order)); #if !KERNEL if (cputype_p) kxld_free(cputype_p, sizeof(*cputype_p)); @@ -191,24 +200,30 @@ kxld_create_context(KXLDContext **_context, void kxld_destroy_context(KXLDContext *context) { - KXLDState *dep = NULL; + KXLDObject *object = NULL; + KXLDKext *dep = NULL; u_int i = 0; check(context); kxld_kext_deinit(context->kext); - for (i = 0; i < context->deps.maxitems; ++i) { - dep = kxld_array_get_slot(&context->deps, i); - kxld_state_deinit(dep); + for (i = 0; i < context->objects.maxitems; ++i) { + object = kxld_array_get_slot(&context->objects, i); + kxld_object_deinit(object); } + kxld_array_deinit(&context->objects); - kxld_array_deinit(&context->deps); - kxld_array_deinit(&context->tmps); + for (i = 0; i < context->dependencies.maxitems; ++i) { + dep = kxld_array_get_slot(&context->dependencies, i); + kxld_kext_deinit(dep); + } + kxld_array_deinit(&context->dependencies); - kxld_dict_deinit(&context->defined_symbols); - kxld_dict_deinit(&context->obsolete_symbols); - kxld_dict_deinit(&context->vtables); + kxld_dict_deinit(&context->defined_symbols_by_name); + kxld_dict_deinit(&context->defined_cxx_symbols_by_value); + kxld_dict_deinit(&context->obsolete_symbols_by_name); + kxld_dict_deinit(&context->vtables_by_name); kxld_free(context->kext, kxld_kext_sizeof()); kxld_free(context, sizeof(*context)); @@ -220,229 +235,262 @@ kxld_destroy_context(KXLDContext *context) 
*******************************************************************************/ kern_return_t kxld_link_file( - KXLDContext *context, - u_char *file, - u_long size, - const char *name, - void *callback_data, - u_char **deps, - u_int ndeps, - u_char **_linked_object, - kxld_addr_t *kmod_info_kern, - u_char **_link_state, - u_long *_link_state_size, - u_char **_symbol_file __unused, - u_long *_symbol_file_size __unused) + KXLDContext * context, + u_char * file, + u_long size, + const char * name, + void * callback_data, + KXLDDependency * dependencies, + u_int ndependencies, + u_char ** linked_object_out, + kxld_addr_t * kmod_info_kern) { - kern_return_t rval = KERN_FAILURE; - KXLDState *state = NULL; - KXLDAllocateFlags flags = 0; - kxld_addr_t vmaddr = 0; - u_long header_size = 0; - u_long vmsize = 0; - u_int nsyms = 0; - u_int nvtables = 0; - u_int i = 0; - u_char *linked_object = NULL; - u_char *linked_object_alloc = NULL; - u_char *link_state = NULL; - u_char *symbol_file = NULL; - u_long link_state_size = 0; - u_long symbol_file_size = 0; + kern_return_t rval = KERN_FAILURE; + kxld_addr_t vmaddr = 0; + u_long vmsize = 0; + u_char * linked_object = NULL; + u_char * linked_object_alloc = NULL; kxld_set_logging_callback_data(name, callback_data); + kxld_log(kKxldLogLinking, kKxldLogBasic, "Linking kext %s", name); + require_action(context, finish, rval=KERN_INVALID_ARGUMENT); require_action(file, finish, rval=KERN_INVALID_ARGUMENT); require_action(size, finish, rval=KERN_INVALID_ARGUMENT); + require_action(dependencies, finish, rval=KERN_INVALID_ARGUMENT); + require_action(ndependencies, finish, rval=KERN_INVALID_ARGUMENT); + require_action(linked_object_out, finish, rval=KERN_INVALID_ARGUMENT); + require_action(kmod_info_kern, finish, rval=KERN_INVALID_ARGUMENT); - rval = kxld_array_init(&context->deps, sizeof(struct kxld_state), ndeps); + rval = init_context(context, ndependencies); require_noerr(rval, finish); - if (deps) { - /* Initialize the dependencies */ 
- for (i = 0; i < ndeps; ++i) { - state = kxld_array_get_item(&context->deps, i); - - rval = kxld_state_init_from_file(state, deps[i], - context->section_order); - require_noerr(rval, finish); - } - } - - rval = kxld_kext_init(context->kext, file, size, name, - context->flags, (deps == 0) /* is_kernel */, context->section_order, - context->cputype, context->cpusubtype); + rval = init_kext_objects(context, file, size, name, + dependencies, ndependencies); require_noerr(rval, finish); - if (deps) { + linked_object = allocate_kext(context, callback_data, + &vmaddr, &vmsize, &linked_object_alloc); + require_action(linked_object, finish, rval=KERN_RESOURCE_SHORTAGE); - /* Calculate the base number of symbols and vtables in the kext */ + rval = kxld_kext_relocate(context->kext, vmaddr, + &context->vtables_by_name, + &context->defined_symbols_by_name, + &context->obsolete_symbols_by_name, + &context->defined_cxx_symbols_by_value); + require_noerr(rval, finish); - nsyms += kxld_kext_get_num_symbols(context->kext); - nvtables += kxld_kext_get_num_vtables(context->kext); + rval = kxld_kext_export_linked_object(context->kext, + linked_object, kmod_info_kern); + require_noerr(rval, finish); - /* Extract the symbol and vtable counts from the dependencies. 
- */ + *linked_object_out = linked_object; + linked_object_alloc = NULL; - for (i = 0; i < ndeps; ++i) { - cpu_type_t cputype; - cpu_subtype_t cpusubtype; + rval = KERN_SUCCESS; +finish: + if (linked_object_alloc) { + kxld_page_free_untracked(linked_object_alloc, vmsize); + } - state = kxld_array_get_item(&context->deps, i); + clear_context(context); + kxld_set_logging_callback_data(NULL, NULL); - kxld_state_get_cputype(state, &cputype, &cpusubtype); + return rval; +} - rval = kxld_kext_validate_cputype(context->kext, - cputype, cpusubtype); - require_noerr(rval, finish); +/******************************************************************************* +*******************************************************************************/ +static kern_return_t +init_context(KXLDContext *context, u_int ndependencies) +{ + kern_return_t rval = KERN_FAILURE; - nsyms += kxld_state_get_num_symbols(state); - nvtables += kxld_state_get_num_vtables(state); - } + /* Create an array of objects large enough to hold an object + * for every dependency, an interface for each dependency, and a kext. 
*/ + rval = kxld_array_init(&context->objects, + kxld_object_sizeof(), 2 * ndependencies + 1); + require_noerr(rval, finish); - /* Create the global symbol and vtable tables */ + rval = kxld_array_init(&context->dependencies, + kxld_kext_sizeof(), ndependencies); + require_noerr(rval, finish); - rval = kxld_dict_init(&context->defined_symbols, kxld_dict_string_hash, - kxld_dict_string_cmp, nsyms); - require_noerr(rval, finish); + rval = kxld_dict_init(&context->defined_symbols_by_name, + kxld_dict_string_hash, kxld_dict_string_cmp, 0); + require_noerr(rval, finish); - rval = kxld_dict_init(&context->obsolete_symbols, kxld_dict_string_hash, - kxld_dict_string_cmp, 0); - require_noerr(rval, finish); + rval = kxld_dict_init(&context->defined_cxx_symbols_by_value, + kxld_dict_kxldaddr_hash, kxld_dict_kxldaddr_cmp, 0); + require_noerr(rval, finish); - rval = kxld_dict_init(&context->vtables, kxld_dict_string_hash, - kxld_dict_string_cmp, nvtables); - require_noerr(rval, finish); + rval = kxld_dict_init(&context->obsolete_symbols_by_name, + kxld_dict_string_hash, kxld_dict_string_cmp, 0); + require_noerr(rval, finish); - /* Populate the global tables */ + rval = kxld_dict_init(&context->vtables_by_name, kxld_dict_string_hash, + kxld_dict_string_cmp, 0); + require_noerr(rval, finish); - for (i = 0; i < ndeps; ++i) { - state = kxld_array_get_item(&context->deps, i); + rval = KERN_SUCCESS; +finish: + return rval; +} - rval = kxld_state_get_symbols(state, &context->defined_symbols, - &context->obsolete_symbols); - require_noerr(rval, finish); +/******************************************************************************* +*******************************************************************************/ +static kern_return_t +init_kext_objects(KXLDContext *context, u_char *file, u_long size, + const char *name, KXLDDependency *dependencies, u_int ndependencies) +{ + kern_return_t rval = KERN_FAILURE; + KXLDKext *kext = NULL; + KXLDObject *kext_object = NULL; + KXLDObject 
*interface_object = NULL; + u_int i = 0; - rval = kxld_state_get_vtables(state, &context->vtables); - require_noerr(rval, finish); + /* Create a kext object for each dependency. If it's a direct dependency, + * export its symbols by name and by value. If it's indirect, just export the + * C++ symbols by value. + */ + for (i = 0; i < ndependencies; ++i) { kext = + kxld_array_get_item(&context->dependencies, i); kext_object = NULL; + interface_object = NULL; + + kext_object = get_object_for_file(context, dependencies[i].kext, + dependencies[i].kext_size, dependencies[i].kext_name); + require_action(kext_object, finish, rval=KERN_FAILURE); + + if (dependencies[i].interface) { + interface_object = get_object_for_file(context, + dependencies[i].interface, dependencies[i].interface_size, + dependencies[i].interface_name); + require_action(interface_object, finish, rval=KERN_FAILURE); } - if (kxld_kext_is_true_kext(context->kext)) { - - /* Allocate the kext object */ - - kxld_kext_get_vmsize(context->kext, &header_size, &vmsize); - vmaddr = context->allocate_callback(vmsize, &flags, callback_data); - require_action(!(vmaddr & (PAGE_SIZE-1)), finish, rval=KERN_FAILURE; - kxld_log(kKxldLogLinking, kKxldLogErr, - "Load address %p is not page-aligned.", - (void *) (uintptr_t) vmaddr)); - - if (flags & kKxldAllocateWritable) { - linked_object = (u_char *) (u_long) vmaddr; - } else { - linked_object_alloc = kxld_page_alloc_untracked(vmsize); - require_action(linked_object_alloc, finish, rval=KERN_RESOURCE_SHORTAGE); - linked_object = linked_object_alloc; - } - - /* Zero out the memory before we fill it. We fill this buffer in a - * sparse fashion, and it's simpler to clear it now rather than - * track and zero any pieces we didn't touch after we've written - * all of the sections to memory. 
- */ - bzero(linked_object, vmsize); - - /* Relocate to the new link address */ + rval = kxld_kext_init(kext, kext_object, interface_object); + require_noerr(rval, finish); - rval = kxld_kext_relocate(context->kext, vmaddr, &context->vtables, - &context->defined_symbols, &context->obsolete_symbols); + if (dependencies[i].is_direct_dependency) { + rval = kxld_kext_export_symbols(kext, + &context->defined_symbols_by_name, + &context->obsolete_symbols_by_name, + &context->defined_cxx_symbols_by_value); require_noerr(rval, finish); - - /* Generate linked object if requested */ - - if (_linked_object) { - check(kmod_info_kern); - *_linked_object = NULL; - *kmod_info_kern = 0; - - rval = kxld_kext_export_linked_object(context->kext, linked_object, - kmod_info_kern); - require_noerr(rval, finish); - } - - } else { - /* Resolve the pseudokext's symbols */ - - rval = kxld_kext_resolve(context->kext, &context->vtables, - &context->defined_symbols); + } else { + rval = kxld_kext_export_symbols(kext, + /* defined_symbols */ NULL, /* obsolete_symbols */ NULL, + &context->defined_cxx_symbols_by_value); require_noerr(rval, finish); } } - /* Generate link state if requested */ - - if (_link_state) { - check(_link_state_size); - *_link_state = NULL; - *_link_state_size = 0; + /* Export the vtables for all of the dependencies. 
*/ + for (i = 0; i < context->dependencies.nitems; ++i) { + kext = kxld_array_get_item(&context->dependencies, i); - kxld_dict_clear(&context->defined_symbols); - rval = kxld_state_export_kext_to_file(context->kext, &link_state, - &link_state_size, &context->defined_symbols, &context->tmps); + rval = kxld_kext_export_vtables(kext, + &context->defined_cxx_symbols_by_value, + &context->defined_symbols_by_name, + &context->vtables_by_name); require_noerr(rval, finish); } -#if !KERNEL - /* Generate symbol file if requested */ - - if (_symbol_file) { - check(_symbol_file_size); - *_symbol_file = NULL; - *_symbol_file_size = 0; + /* Create a kext object for the kext we're linking and export its locally + * defined C++ symbols. + */ + kext_object = get_object_for_file(context, file, size, name); + require_action(kext_object, finish, rval=KERN_FAILURE); - rval = kxld_kext_export_symbol_file(context->kext, &symbol_file, - &symbol_file_size); - require_noerr(rval, finish); - } -#endif /* !KERNEL */ + rval = kxld_kext_init(context->kext, kext_object, /* interface */ NULL); + require_noerr(rval, finish); - /* Commit output to return variables */ + rval = kxld_kext_export_symbols(context->kext, + /* defined_symbols */ NULL, /* obsolete_symbols */ NULL, + &context->defined_cxx_symbols_by_value); + require_noerr(rval, finish); - if (_linked_object) { - *_linked_object = linked_object; - linked_object = NULL; - linked_object_alloc = NULL; - } + rval = KERN_SUCCESS; +finish: + return rval; +} - if (_link_state) { - *_link_state = link_state; - *_link_state_size = link_state_size; - link_state = NULL; - } +/******************************************************************************* +*******************************************************************************/ +static KXLDObject * +get_object_for_file(KXLDContext *context, u_char *file, u_long size, + const char *name) +{ + KXLDObject *rval = NULL; + KXLDObject *object = NULL; + kern_return_t result = 0; + u_int i = 0; -#if 
!KERNEL - if (_symbol_file) { - *_symbol_file = symbol_file; - *_symbol_file_size = symbol_file_size; - symbol_file = NULL; - } -#endif + for (i = 0; i < context->objects.nitems; ++i) { + object = kxld_array_get_item(&context->objects, i); - rval = KERN_SUCCESS; + if (!kxld_object_get_file(object)) { + result = kxld_object_init_from_macho(object, file, size, name, + context->section_order, context->cputype, context->cpusubtype); + require_noerr(result, finish); -finish: + rval = object; + break; + } - if (linked_object_alloc) kxld_page_free_untracked(linked_object_alloc, vmsize); - if (link_state) kxld_page_free_untracked(link_state, link_state_size); - if (symbol_file) kxld_page_free_untracked(symbol_file, symbol_file_size); + if (kxld_object_get_file(object) == file) { + rval = object; + break; + } + } - clear_context(context); +finish: + return rval; +} + +/******************************************************************************* +*******************************************************************************/ +static u_char * +allocate_kext(KXLDContext *context, void *callback_data, + kxld_addr_t *vmaddr_out, u_long *vmsize_out, + u_char **linked_object_alloc_out) +{ + KXLDAllocateFlags flags = 0; + kxld_addr_t vmaddr = 0; + u_long vmsize = 0; + u_long header_size = 0; + u_char * linked_object = NULL; + + *linked_object_alloc_out = NULL; + + kxld_kext_get_vmsize(context->kext, &header_size, &vmsize); + vmaddr = context->allocate_callback(vmsize, &flags, callback_data); + require_action(!(vmaddr & (PAGE_SIZE-1)), finish, + kxld_log(kKxldLogLinking, kKxldLogErr, + "Load address %p is not page-aligned.", + (void *) (uintptr_t) vmaddr)); + + if (flags & kKxldAllocateWritable) { + linked_object = (u_char *) (u_long) vmaddr; + } else { + linked_object = kxld_page_alloc_untracked(vmsize); + require(linked_object, finish); + + *linked_object_alloc_out = linked_object; + } - kxld_set_logging_callback_data(NULL, NULL); + /* Zero out the memory before we fill it. 
We fill this buffer in a + * sparse fashion, and it's simpler to clear it now rather than + * track and zero any pieces we didn't touch after we've written + * all of the sections to memory. + */ + bzero(linked_object, vmsize); + *vmaddr_out = vmaddr; + *vmsize_out = vmsize; - return rval; +finish: + return linked_object; } /******************************************************************************* @@ -450,21 +498,29 @@ kxld_link_file( static void clear_context(KXLDContext *context) { - KXLDState *state = NULL; + KXLDObject * object = NULL; + KXLDKext * dep = NULL; u_int i = 0; check(context); kxld_kext_clear(context->kext); - for (i = 0; i < context->deps.nitems; ++i) { - state = kxld_array_get_item(&context->deps, i); - kxld_state_clear(state); + + for (i = 0; i < context->objects.nitems; ++i) { + object = kxld_array_get_item(&context->objects, i); + kxld_object_clear(object); + } + kxld_array_reset(&context->objects); + + for (i = 0; i < context->dependencies.nitems; ++i) { + dep = kxld_array_get_item(&context->dependencies, i); + kxld_kext_clear(dep); } - kxld_array_reset(&context->deps); + kxld_array_reset(&context->dependencies); - kxld_array_clear(&context->tmps); - kxld_dict_clear(&context->defined_symbols); - kxld_dict_clear(&context->obsolete_symbols); - kxld_dict_clear(&context->vtables); + kxld_dict_clear(&context->defined_symbols_by_name); + kxld_dict_clear(&context->defined_cxx_symbols_by_value); + kxld_dict_clear(&context->obsolete_symbols_by_name); + kxld_dict_clear(&context->vtables_by_name); } diff --git a/libkern/kxld/kxld_array.c b/libkern/kxld/kxld_array.c index 9720f3d08..55d009ba4 100644 --- a/libkern/kxld/kxld_array.c +++ b/libkern/kxld/kxld_array.c @@ -139,6 +139,9 @@ array_init(KXLDArray *array, size_t itemsize, u_int nitems) { kern_return_t rval = KERN_FAILURE; KXLDArrayPool *pool = NULL; + + require_action(itemsize, finish, rval=KERN_INVALID_ARGUMENT); + require_action(array->npools < 2, finish, rval=KERN_INVALID_ARGUMENT); 
array->itemsize = itemsize; diff --git a/libkern/kxld/kxld_copyright.c b/libkern/kxld/kxld_copyright.c index 9b70348e8..e1f13c257 100644 --- a/libkern/kxld/kxld_copyright.c +++ b/libkern/kxld/kxld_copyright.c @@ -1,3 +1,31 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + #include #include #include @@ -186,7 +214,7 @@ dates_are_valid(const char *str, const u_long len) if (is_token_break(token_ptr)) { if (!token_index) continue; - token_buffer[token_index++] = '\0'; + token_buffer[token_index] = '\0'; if (!token_is_year(token_buffer) && !token_is_yearRange(token_buffer)) @@ -230,7 +258,7 @@ kxld_validate_copyright_string(const char *str) str = copyright + const_strlen(kCopyrightToken); len = rights - str; - date_str = kxld_alloc(len); + date_str = kxld_alloc(len+1); if (!date_str) goto finish; strncpy(date_str, str, len); @@ -240,7 +268,7 @@ kxld_validate_copyright_string(const char *str) result = TRUE; finish: - if (date_str) kxld_free(date_str, len); + if (date_str) kxld_free(date_str, len+1); return result; } diff --git a/libkern/kxld/kxld_demangle.c b/libkern/kxld/kxld_demangle.c index 98ca4d55a..c0bb5e276 100644 --- a/libkern/kxld/kxld_demangle.c +++ b/libkern/kxld/kxld_demangle.c @@ -1,3 +1,31 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + #if !KERNEL #include diff --git a/libkern/kxld/kxld_demangle.h b/libkern/kxld/kxld_demangle.h index 1fee33193..5c38abc8f 100644 --- a/libkern/kxld/kxld_demangle.h +++ b/libkern/kxld/kxld_demangle.h @@ -1,3 +1,31 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + #ifndef _KXLD_DEMANGLE_H_ #define _KXLD_DEMANGLE_H_ diff --git a/libkern/kxld/kxld_kext.c b/libkern/kxld/kxld_kext.c index a5520711e..b2be1535a 100644 --- a/libkern/kxld/kxld_kext.c +++ b/libkern/kxld/kxld_kext.c @@ -54,10 +54,10 @@ #include "kxld_demangle.h" #include "kxld_dict.h" #include "kxld_kext.h" +#include "kxld_object.h" #include "kxld_reloc.h" #include "kxld_sect.h" #include "kxld_seg.h" -#include "kxld_state.h" #include "kxld_symtab.h" #include "kxld_util.h" #include "kxld_uuid.h" @@ -65,125 +65,44 @@ struct symtab_command; -enum kxld_link_type { - KXLD_LINK_KERNEL, - KXLD_LINK_PSEUDO_KEXT, - KXLD_LINK_KEXT, - KXLD_LINK_UNKNOWN -}; - -typedef enum kxld_link_type KXLDLinkType; - struct kxld_kext { - u_char *file; - u_long size; - const char *name; - uint32_t filetype; - KXLDArray segs; - KXLDArray sects; + KXLDObject *kext; + KXLDObject *interface; KXLDArray vtables; - KXLDArray extrelocs; - KXLDArray locrelocs; KXLDDict vtable_index; - KXLDRelocator relocator; - KXLDuuid uuid; - KXLDSymtab *symtab; - kxld_addr_t link_addr; - kmod_info_t *kmod_info; - kxld_addr_t kmod_link_addr; - cpu_type_t cputype; - cpu_subtype_t cpusubtype; - KXLDLinkType link_type; - KXLDFlags flags; - boolean_t is_final_image; - boolean_t got_is_created; - struct dysymtab_command *dysymtab_hdr; -#if KXLD_USER_OR_OBJECT - KXLDArray *section_order; -#endif -#if !KERNEL - enum NXByteOrder host_order; - enum NXByteOrder target_order; -#endif + boolean_t vtables_created; + boolean_t vtable_index_created; }; /******************************************************************************* * Prototypes *******************************************************************************/ -static kern_return_t get_target_machine_info(KXLDKext *kext, cpu_type_t cputype, - cpu_subtype_t cpusubtype); -static kern_return_t 
get_file_for_arch(KXLDKext *kext, u_char *file, u_long size); - -static u_long get_macho_header_size(const KXLDKext *kext); -static u_long get_macho_data_size(const KXLDKext *kext); -static kern_return_t export_macho_header(const KXLDKext *kext, u_char *buf, - u_int ncmds, u_long *header_offset, u_long header_size); - -static kern_return_t init_from_execute(KXLDKext *kext); -static kern_return_t init_from_final_linked_image(KXLDKext *kext, u_int *filetype_out, - struct symtab_command **symtab_hdr_out); - -static boolean_t target_supports_protected_segments(const KXLDKext *kext) - __attribute__((pure)); - -#if KXLD_USER_OR_OBJECT -static boolean_t target_supports_object(const KXLDKext *kext) __attribute((pure)); -static kern_return_t init_from_object(KXLDKext *kext); -static kern_return_t process_relocs_from_sections(KXLDKext *kext); -#endif /* KXLD_USER_OR_OBJECT */ - -#if KXLD_USER_OR_BUNDLE -static boolean_t target_supports_bundle(const KXLDKext *kext) __attribute((pure)); -static kern_return_t init_from_bundle(KXLDKext *kext); -static kern_return_t process_relocs_from_tables(KXLDKext *kext); -static kern_return_t process_symbol_pointers(KXLDKext *kext); -static void add_to_ptr(u_char *symptr, kxld_addr_t val, boolean_t is_32_bit); -#endif /* KXLD_USER_OR_BUNDLE */ - -static kern_return_t get_metaclass_symbol_from_super_meta_class_pointer_symbol( - KXLDKext *kext, KXLDSym *super_metaclass_pointer_sym, KXLDSym **meta_class); - -static kern_return_t resolve_symbols(KXLDKext *kext, KXLDDict *defined_symbols, - KXLDDict *obsolete_symbols); +static kern_return_t export_symbols_through_interface( + const KXLDObject *kext, const KXLDObject *interface, + KXLDDict *defined_symbols_by_name, + KXLDDict *defined_cxx_symbol_by_value, + KXLDDict *obsolete_symbols_by_name); +static kern_return_t export_symbols(const KXLDObject *kext, + KXLDDict *defined_symbols_by_name, + KXLDDict *defined_cxx_symbols_by_value); + +static kern_return_t create_vtables(KXLDKext *kext, + const 
KXLDDict *defined_symbols, const KXLDDict *defined_cxx_symbols); +static kern_return_t get_vtable_syms_from_smcp(KXLDKext *kext, + const KXLDDict *defined_symbols, KXLDSym *super_metaclass_ptr_sym, + KXLDSym **vtable_sym_out, KXLDSym **meta_vtable_sym_out); + +static kern_return_t resolve_symbols(KXLDKext *kext, + const KXLDDict *defined_symbols, const KXLDDict *obsolete_symbols); + static kern_return_t patch_vtables(KXLDKext *kext, KXLDDict *patched_vtables, - KXLDDict *defined_symbols); + const KXLDDict *defined_symbols); +static const KXLDSym *get_metaclass_symbol_from_super_meta_class_pointer_symbol( + KXLDKext *kext, KXLDSym *super_metaclass_pointer_sym); +static kern_return_t create_vtable_index(KXLDKext *kext); + static kern_return_t validate_symbols(KXLDKext *kext); -static kern_return_t populate_kmod_info(KXLDKext *kext); -static kern_return_t copy_vtables(KXLDKext *kext, const KXLDDict *patched_vtables); -static kern_return_t create_vtables(KXLDKext *kext); -static void restrict_private_symbols(KXLDKext *kext); - -#if KXLD_USER_OR_GOT || KXLD_USER_OR_COMMON -static kern_return_t add_section(KXLDKext *kext, KXLDSect **sect); -#endif /* KXLD_USER_OR_GOT || KXLD_USER_OR_COMMON */ - -#if KXLD_USER_OR_GOT -static boolean_t target_has_got(const KXLDKext *kext) __attribute__((pure)); -static kern_return_t create_got(KXLDKext *kext); -static kern_return_t populate_got(KXLDKext *kext); -#endif /* KXLD_USER_OR_GOT */ - -static boolean_t target_supports_common(const KXLDKext *kext) __attribute((pure)); -#if KXLD_USER_OR_COMMON -static kern_return_t resolve_common_symbols(KXLDKext *kext); -#endif /* KXLD_USER_OR_COMMON */ - -static boolean_t target_supports_strict_patching(KXLDKext *kext) - __attribute__((pure)); - -#if KXLD_USER_OR_ILP32 -static u_long get_macho_cmd_data_32(u_char *file, u_long offset, - u_int *filetype, u_int *ncmds); -static kern_return_t export_macho_header_32(const KXLDKext *kext, u_char *buf, - u_int ncmds, u_long *header_offset, u_long 
header_size); -#endif /* KXLD_USER_OR_ILP32 */ -#if KXLD_USER_OR_LP64 -static u_long get_macho_cmd_data_64(u_char *file, u_long offset, - u_int *filetype, u_int *ncmds); -static kern_return_t export_macho_header_64(const KXLDKext *kext, u_char *buf, - u_int ncmds, u_long *header_offset, u_long header_size); -#endif /* KXLD_USER_OR_LP64 */ /******************************************************************************* *******************************************************************************/ @@ -196,109 +115,93 @@ kxld_kext_sizeof(void) /******************************************************************************* *******************************************************************************/ kern_return_t -kxld_kext_init(KXLDKext *kext, u_char *file, u_long size, - const char *name, KXLDFlags flags, boolean_t is_kernel, - KXLDArray *section_order __unused, - cpu_type_t cputype, cpu_subtype_t cpusubtype) +kxld_kext_init(KXLDKext *kext, KXLDObject *kext_object, + KXLDObject *interface_object) { kern_return_t rval = KERN_FAILURE; - KXLDSeg *seg = NULL; - u_int i = 0; check(kext); - check(file); - check(size); + check(kext_object); - kext->name = name; - kext->flags = flags; -#if KXLD_USER_OR_OBJECT - kext->section_order = section_order; -#endif + kext->kext = kext_object; - /* Find the local architecture */ + if (interface_object) { + kext->interface = interface_object; - rval = get_target_machine_info(kext, cputype, cpusubtype); - require_noerr(rval, finish); - - /* Find the Mach-O file for the target architecture */ + rval = kxld_object_index_symbols_by_name(kext->kext); + require_noerr(rval, finish); + } + + rval = KERN_SUCCESS; +finish: + return rval; +} - rval = get_file_for_arch(kext, file, size); - require_noerr(rval, finish); +/******************************************************************************* +*******************************************************************************/ +void +kxld_kext_clear(KXLDKext *kext) +{ + KXLDVTable *vtable = 
NULL; + u_int i; - /* Build the relocator */ + check(kext); - rval = kxld_relocator_init(&kext->relocator, kext->cputype, - kext->cpusubtype, kxld_kext_target_needs_swap(kext)); - require_noerr(rval, finish); + for (i = 0; i < kext->vtables.nitems; ++i) { + vtable = kxld_array_get_item(&kext->vtables, i); + kxld_vtable_clear(vtable); + } + kxld_array_reset(&kext->vtables); + kxld_dict_clear(&kext->vtable_index); - /* Allocate the symbol table */ + kext->kext = NULL; + kext->interface = NULL; + kext->vtables_created = FALSE; + kext->vtable_index_created = FALSE; +} - if (!kext->symtab) { - kext->symtab = kxld_alloc(kxld_symtab_sizeof()); - require_action(kext->symtab, finish, rval=KERN_RESOURCE_SHORTAGE); - bzero(kext->symtab, kxld_symtab_sizeof()); - } - if (is_kernel) { - kext->link_type = KXLD_LINK_KERNEL; - } else { - kext->link_type = KXLD_LINK_UNKNOWN; - } +/******************************************************************************* +*******************************************************************************/ +void +kxld_kext_deinit(KXLDKext *kext) +{ + KXLDVTable *vtable = NULL; + u_int i; - /* There are four types of Mach-O files that we can support: - * 1) 32-bit MH_OBJECT - All pre-SnowLeopard systems - * 2) 32-bit MH_KEXT_BUNDLE - Not supported - * 3) 64-bit MH_OBJECT - Needed for K64 bringup - * 4) 64-bit MH_KEXT_BUNDLE - The likely 64-bit kext filetype - */ + check(kext); - if (kxld_kext_is_32_bit(kext)) { - struct mach_header *mach_hdr = (struct mach_header *) kext->file; - kext->filetype = mach_hdr->filetype; - } else { - struct mach_header_64 *mach_hdr = (struct mach_header_64 *) kext->file; - kext->filetype = mach_hdr->filetype; + for (i = 0; i < kext->vtables.maxitems; ++i) { + vtable = kxld_array_get_slot(&kext->vtables, i); + kxld_vtable_deinit(vtable); } + kxld_array_deinit(&kext->vtables); + kxld_dict_deinit(&kext->vtable_index); - switch (kext->filetype) { -#if KXLD_USER_OR_OBJECT - case MH_OBJECT: - rval = init_from_object(kext); - 
require_noerr(rval, finish); - break; -#endif /* KXLD_USER_OR_OBJECT */ -#if KXLD_USER_OR_BUNDLE - case MH_KEXT_BUNDLE: - rval = init_from_bundle(kext); - require_noerr(rval, finish); - break; -#endif /* KXLD_USER_OR_BUNDLE */ - case MH_EXECUTE: - rval = init_from_execute(kext); - require_noerr(rval, finish); - break; - default: - rval = KERN_FAILURE; - kxld_log(kKxldLogLinking, kKxldLogErr, - kKxldLogFiletypeNotSupported, kext->filetype); - goto finish; - } + bzero(kext, sizeof(*kext)); +} - for (i = 0; i < kext->segs.nitems; ++i) { - seg = kxld_array_get_item(&kext->segs, i); - kxld_seg_set_vm_protections(seg, target_supports_protected_segments(kext)); - } +/******************************************************************************* +*******************************************************************************/ +kern_return_t +kxld_kext_export_symbols(const KXLDKext *kext, + struct kxld_dict *defined_symbols_by_name, + struct kxld_dict *obsolete_symbols_by_name, + struct kxld_dict *defined_cxx_symbols_by_value) +{ + kern_return_t rval = KERN_FAILURE; + + check(kext); - switch (kext->link_type) { - case KXLD_LINK_KEXT: - (void) restrict_private_symbols(kext); - /* Fallthrough */ - case KXLD_LINK_KERNEL: - rval = create_vtables(kext); + if (kext->interface) { + rval = export_symbols_through_interface(kext->kext, kext->interface, + defined_symbols_by_name, obsolete_symbols_by_name, + defined_cxx_symbols_by_value); + require_noerr(rval, finish); + } else { + rval = export_symbols(kext->kext, defined_symbols_by_name, + defined_cxx_symbols_by_value); require_noerr(rval, finish); - break; - default: - break; } rval = KERN_SUCCESS; @@ -309,189 +212,114 @@ kxld_kext_init(KXLDKext *kext, u_char *file, u_long size, /******************************************************************************* *******************************************************************************/ kern_return_t -get_target_machine_info(KXLDKext *kext, cpu_type_t cputype __unused, - 
cpu_subtype_t cpusubtype __unused) +export_symbols_through_interface(const KXLDObject *kext, + const KXLDObject *interface, KXLDDict *defined_symbols_by_name, + KXLDDict *obsolete_symbols_by_name, KXLDDict *defined_cxx_symbols_by_value) { -#if KERNEL - - /* Because the kernel can only link for its own architecture, we know what - * the host and target architectures are at compile time, so we can use - * a vastly simplified version of this function. - */ - - check(kext); - -#if defined(__i386__) - kext->cputype = CPU_TYPE_I386; - kext->cpusubtype = CPU_SUBTYPE_I386_ALL; - return KERN_SUCCESS; -#elif defined(__ppc__) - kext->cputype = CPU_TYPE_POWERPC; - kext->cpusubtype = CPU_SUBTYPE_POWERPC_ALL; - return KERN_SUCCESS; -#elif defined(__x86_64__) - kext->cputype = CPU_TYPE_X86_64; - kext->cpusubtype = CPU_SUBTYPE_X86_64_ALL; - return KERN_SUCCESS; -#else - kxld_log(kKxldLogLinking, kKxldLogErr, - kKxldLogArchNotSupported, _mh_execute_header->cputype); - return KERN_NOT_SUPPORTED; -#endif /* Supported architecture defines */ - - -#else /* !KERNEL */ - - /* User-space must look up the architecture it's running on and the target - * architecture at run-time. 
- */ - kern_return_t rval = KERN_FAILURE; - const NXArchInfo *host_arch = NULL; + KXLDSymtabIterator iter; + const KXLDSymtab *kext_symtab = NULL; + const KXLDSymtab *interface_symtab = NULL; + KXLDSym *kext_sym = NULL; + const KXLDSym *interface_sym = NULL; check(kext); + check(interface); + + kext_symtab = kxld_object_get_symtab(kext); + interface_symtab = kxld_object_get_symtab(interface); + + if (defined_symbols_by_name) { + /* Add exported symbols */ + (void) kxld_symtab_iterator_init(&iter, interface_symtab, + kxld_sym_is_undefined, FALSE); + while ((interface_sym = kxld_symtab_iterator_get_next(&iter))) { + kext_sym = kxld_symtab_get_locally_defined_symbol_by_name(kext_symtab, + interface_sym->name); + if (!kext_sym) { + kxld_log(kKxldLogLinking, kKxldLogWarn, + "In interface %s of %s, couldn't find symbol %s\n", + kxld_object_get_name(interface), kxld_object_get_name(kext), + interface_sym->name); + continue; + } - host_arch = NXGetLocalArchInfo(); - require_action(host_arch, finish, rval=KERN_FAILURE); - - kext->host_order = host_arch->byteorder; + rval = kxld_dict_insert(defined_symbols_by_name, + kext_sym->name, kext_sym); + require_noerr(rval, finish); + } - /* If the user did not specify a cputype, use the local architecture. 
- */ + /* Add indirect symbols */ + (void) kxld_symtab_iterator_init(&iter, interface_symtab, + kxld_sym_is_indirect, FALSE); + while ((interface_sym = kxld_symtab_iterator_get_next(&iter))) { + kext_sym = kxld_symtab_get_locally_defined_symbol_by_name(kext_symtab, + interface_sym->alias); + if (!kext_sym) { + kxld_log(kKxldLogLinking, kKxldLogWarn, + "In interface %s of %s, couldn't find indirect symbol %s (%s)\n", + kxld_object_get_name(interface), kxld_object_get_name(kext), + interface_sym->alias, interface_sym->name); + continue; + } - if (cputype) { - kext->cputype = cputype; - kext->cpusubtype = cpusubtype; - } else { - kext->cputype = host_arch->cputype; - kext->target_order = kext->host_order; - - switch (kext->cputype) { - case CPU_TYPE_I386: - kext->cpusubtype = CPU_SUBTYPE_I386_ALL; - break; - case CPU_TYPE_POWERPC: - kext->cpusubtype = CPU_SUBTYPE_POWERPC_ALL; - break; - case CPU_TYPE_X86_64: - kext->cpusubtype = CPU_SUBTYPE_X86_64_ALL; - break; - case CPU_TYPE_ARM: - kext->cpusubtype = CPU_SUBTYPE_ARM_ALL; - break; - default: - kext->cpusubtype = 0; + rval = kxld_dict_insert(defined_symbols_by_name, + interface_sym->name, kext_sym); + require_noerr(rval, finish); } } - /* Validate that we support the target architecture and record its - * endianness. 
- */ + /* Add obsolete symbols */ + if (obsolete_symbols_by_name) { + (void) kxld_symtab_iterator_init(&iter, interface_symtab, + kxld_sym_is_obsolete, FALSE); + while ((kext_sym = kxld_symtab_iterator_get_next(&iter))) { + rval = kxld_dict_insert(obsolete_symbols_by_name, + kext_sym->name, kext_sym); + require_noerr(rval, finish); + } + } - switch(kext->cputype) { - case CPU_TYPE_ARM: - case CPU_TYPE_I386: - case CPU_TYPE_X86_64: - kext->target_order = NX_LittleEndian; - break; - case CPU_TYPE_POWERPC: - kext->target_order = NX_BigEndian; - break; - default: - rval = KERN_NOT_SUPPORTED; - kxld_log(kKxldLogLinking, kKxldLogErr, - kKxldLogArchNotSupported, kext->cputype); - goto finish; + /* Add C++ symbols */ + if (defined_cxx_symbols_by_value) { + (void) kxld_symtab_iterator_init(&iter, kext_symtab, + kxld_sym_is_cxx, FALSE); + while ((kext_sym = kxld_symtab_iterator_get_next(&iter))) { + rval = kxld_dict_insert(defined_cxx_symbols_by_value, + &kext_sym->link_addr, kext_sym); + require_noerr(rval, finish); + } } rval = KERN_SUCCESS; - finish: return rval; -#endif /* KERNEL */ } /******************************************************************************* *******************************************************************************/ -static kern_return_t -get_file_for_arch(KXLDKext *kext, u_char *file, u_long size) +kern_return_t +export_symbols(const KXLDObject *kext, KXLDDict *defined_symbols_by_name, + KXLDDict *defined_cxx_symbols_by_value) { kern_return_t rval = KERN_FAILURE; - struct mach_header *mach_hdr = NULL; -#if !KERNEL - struct fat_header *fat = (struct fat_header *) file; - struct fat_arch *archs = (struct fat_arch *) &fat[1]; - boolean_t swap = FALSE; -#endif /* KERNEL */ - - check(kext); - check(file); - check(size); - - kext->file = file; - kext->size = size; - - /* We are assuming that we will never receive a fat file in the kernel */ - -#if !KERNEL - require_action(size >= sizeof(*fat), finish, - rval=KERN_FAILURE; - 
kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogTruncatedMachO)); - - /* The fat header is always big endian, so swap if necessary */ - if (fat->magic == FAT_CIGAM) { - (void) swap_fat_header(fat, kext->host_order); - swap = TRUE; - } - - if (fat->magic == FAT_MAGIC) { - struct fat_arch *arch = NULL; - - require_action(size >= (sizeof(*fat) + (fat->nfat_arch * sizeof(*archs))), - finish, - rval=KERN_FAILURE; - kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogTruncatedMachO)); + KXLDSymtabIterator iter; + KXLDSym *sym = NULL; - /* Swap the fat_arch structures if necessary */ - if (swap) { - (void) swap_fat_arch(archs, fat->nfat_arch, kext->host_order); + (void) kxld_symtab_iterator_init(&iter, kxld_object_get_symtab(kext), + kxld_sym_is_exported, FALSE); + while ((sym = kxld_symtab_iterator_get_next(&iter))) { + if (defined_symbols_by_name) { + rval = kxld_dict_insert(defined_symbols_by_name, sym->name, sym); + require_noerr(rval, finish); } - /* Locate the Mach-O for the requested architecture */ - - arch = NXFindBestFatArch(kext->cputype, kext->cpusubtype, archs, - fat->nfat_arch); - require_action(arch, finish, rval=KERN_FAILURE; - kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogArchNotFound)); - require_action(size >= arch->offset + arch->size, finish, - rval=KERN_FAILURE; - kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogTruncatedMachO)); - - kext->file = file + arch->offset; - kext->size = arch->size; - } -#endif /* !KERNEL */ - - /* Swap the Mach-O's headers to this architecture if necessary */ - if (kxld_kext_is_32_bit(kext)) { - rval = validate_and_swap_macho_32(kext->file, kext->size -#if !KERNEL - , kext->host_order -#endif /* !KERNEL */ - ); - } else { - rval = validate_and_swap_macho_64(kext->file, kext->size -#if !KERNEL - , kext->host_order -#endif /* !KERNEL */ - ); + if (kxld_sym_is_cxx(sym) && defined_cxx_symbols_by_value) { + rval = kxld_dict_insert(defined_cxx_symbols_by_value, + &sym->link_addr, sym); + require_noerr(rval, finish); + } } - 
require_noerr(rval, finish); - - mach_hdr = (struct mach_header *) kext->file; - require_action(kext->cputype == mach_hdr->cputype, finish, - rval=KERN_FAILURE; - kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogTruncatedMachO)); rval = KERN_SUCCESS; finish: @@ -500,395 +328,204 @@ get_file_for_arch(KXLDKext *kext, u_char *file, u_long size) /******************************************************************************* *******************************************************************************/ -boolean_t -kxld_kext_is_32_bit(const KXLDKext *kext) +kern_return_t +kxld_kext_export_vtables(KXLDKext *kext, const KXLDDict *defined_cxx_symbols, + const KXLDDict *defined_symbols, KXLDDict *vtables) { + kern_return_t rval = KERN_FAILURE; + KXLDVTable *vtable = NULL; + u_int i = 0; + check(kext); + check(defined_symbols); + check(defined_cxx_symbols); + check(vtables); - return kxld_is_32_bit(kext->cputype); -} + rval = create_vtables(kext, defined_cxx_symbols, defined_symbols); + require_noerr(rval, finish); -/******************************************************************************* -*******************************************************************************/ -void -kxld_kext_get_cputype(const KXLDKext *kext, cpu_type_t *cputype, - cpu_subtype_t *cpusubtype) -{ - check(kext); - check(cputype); - check(cpusubtype); + for (i = 0; i < kext->vtables.nitems; ++i) { + vtable = kxld_array_get_item(&kext->vtables, i); - *cputype = kext->cputype; - *cpusubtype = kext->cpusubtype; + rval = kxld_dict_insert(vtables, vtable->name, vtable); + require_noerr(rval, finish); + } + + rval = KERN_SUCCESS; +finish: + return rval; } /******************************************************************************* *******************************************************************************/ -kern_return_t -kxld_kext_validate_cputype(const KXLDKext *kext, cpu_type_t cputype, - cpu_subtype_t cpusubtype __unused) +void +kxld_kext_get_vmsize(const KXLDKext *kext, + u_long 
*header_size, u_long *vmsize) { - if (kext->cputype != cputype) return KERN_FAILURE; - return KERN_SUCCESS; + (void) kxld_object_get_vmsize(kext->kext, header_size, vmsize); } - + /******************************************************************************* *******************************************************************************/ -static boolean_t -target_supports_protected_segments(const KXLDKext *kext) +kern_return_t +kxld_kext_export_linked_object(const KXLDKext *kext, + u_char *linked_object, kxld_addr_t *kmod_info) { - return (kext->is_final_image && - kext->cputype == CPU_TYPE_X86_64); -} + kern_return_t rval = KERN_FAILURE; + const KXLDSym *kmodsym = NULL; -#if KXLD_USER_OR_OBJECT -/******************************************************************************* -*******************************************************************************/ -static boolean_t target_supports_object(const KXLDKext *kext) -{ - return (kext->cputype == CPU_TYPE_POWERPC || - kext->cputype == CPU_TYPE_I386 || - kext->cputype == CPU_TYPE_ARM); + kmodsym = kxld_symtab_get_locally_defined_symbol_by_name( + kxld_object_get_symtab(kext->kext), KXLD_KMOD_INFO_SYMBOL); + require_action(kmodsym, finish, rval=KERN_FAILURE; + kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogNoKmodInfo)); + + *kmod_info = kmodsym->link_addr; + + rval = kxld_object_export_linked_object(kext->kext, linked_object); +finish: + return rval; } /******************************************************************************* *******************************************************************************/ -static kern_return_t -init_from_object(KXLDKext *kext) +kern_return_t +kxld_kext_relocate(KXLDKext *kext, kxld_addr_t link_address, + KXLDDict *patched_vtables, const KXLDDict *defined_symbols, + const KXLDDict *obsolete_symbols, const KXLDDict *defined_cxx_symbols) { kern_return_t rval = KERN_FAILURE; - struct load_command *cmd_hdr = NULL; - struct symtab_command *symtab_hdr = NULL; - struct uuid_command 
*uuid_hdr = NULL; - KXLDSect *sect = NULL; - u_long offset = 0; - u_long sect_offset = 0; - u_int filetype = 0; - u_int ncmds = 0; - u_int nsects = 0; - u_int i = 0; - boolean_t has_segment = FALSE; check(kext); - - require_action(target_supports_object(kext), - finish, rval=KERN_FAILURE; - kxld_log(kKxldLogLinking, kKxldLogErr, - kKxldLogFiletypeNotSupported, MH_OBJECT)); - - KXLD_3264_FUNC(kxld_kext_is_32_bit(kext), offset, - get_macho_cmd_data_32, get_macho_cmd_data_64, - kext->file, offset, &filetype, &ncmds); - - require_action(filetype == MH_OBJECT, finish, rval=KERN_FAILURE); - - /* MH_OBJECTs use one unnamed segment to contain all of the sections. We - * loop over all of the load commands to initialize the structures we - * expect. Then, we'll use the unnamed segment to get to all of the - * sections, and then use those sections to create the actual segments. + check(patched_vtables); + check(defined_symbols); + check(obsolete_symbols); + + /* Kexts that are being relocated need symbols indexed by value for vtable + * creation and patching. Note that we don't need to index by value for + * dependencies that have already been linked because their symbols are + * already in the global cxx value table. It's important to index the + * symbols by value before we relocate the symbols because the vtable + * entries will still have unrelocated values. 
*/ + rval = kxld_object_index_cxx_symbols_by_value(kext->kext); + require_noerr(rval, finish); - for (; i < ncmds; ++i, offset += cmd_hdr->cmdsize) { - cmd_hdr = (struct load_command *) (kext->file + offset); - - switch(cmd_hdr->cmd) { -#if KXLD_USER_OR_ILP32 - case LC_SEGMENT: - { - struct segment_command *seg_hdr = - (struct segment_command *) cmd_hdr; - - /* Ignore segments with no vm size */ - if (!seg_hdr->vmsize) continue; - - require_action(kxld_kext_is_32_bit(kext), finish, rval=KERN_FAILURE; - kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO - "LC_SEGMENT in 64-bit kext.")); - require_action(!has_segment, finish, rval=KERN_FAILURE; - kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO - "Multiple segments in an MH_OBJECT kext.")); - - nsects = seg_hdr->nsects; - sect_offset = offset + sizeof(*seg_hdr); - has_segment = TRUE; - } - break; -#endif /* KXLD_USER_OR_ILP32 */ -#if KXLD_USER_OR_LP64 - case LC_SEGMENT_64: - { - struct segment_command_64 *seg_hdr = - (struct segment_command_64 *) cmd_hdr; - - /* Ignore segments with no vm size */ - if (!seg_hdr->vmsize) continue; - - require_action(!kxld_kext_is_32_bit(kext), finish, rval=KERN_FAILURE; - kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO - "LC_SEGMENT_64 in a 32-bit kext.")); - require_action(!has_segment, finish, rval=KERN_FAILURE; - kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO - "Multiple segments in an MH_OBJECT kext.")); - - nsects = seg_hdr->nsects; - sect_offset = offset + sizeof(*seg_hdr); - has_segment = TRUE; - } - break; -#endif /* KXLD_USER_OR_LP64 */ - case LC_SYMTAB: - symtab_hdr = (struct symtab_command *) cmd_hdr; - - KXLD_3264_FUNC(kxld_kext_is_32_bit(kext), rval, - kxld_symtab_init_from_macho_32, kxld_symtab_init_from_macho_64, - kext->symtab, kext->file, symtab_hdr, 0); - require_noerr(rval, finish); - break; - case LC_UUID: - uuid_hdr = (struct uuid_command *) cmd_hdr; - kxld_uuid_init_from_macho(&kext->uuid, uuid_hdr); - break; - 
case LC_UNIXTHREAD: - /* Don't need to do anything with UNIXTHREAD */ - break; - default: - rval = KERN_FAILURE; - kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO - "Invalid segment type in MH_OBJECT kext: %u.", cmd_hdr->cmd); - goto finish; - } - } - - if (has_segment) { - - /* Get the number of sections from the segment and build the section index */ - - rval = kxld_array_init(&kext->sects, sizeof(KXLDSect), nsects); - require_noerr(rval, finish); - - /* Loop over all of the sections to initialize the section index */ - - for (i = 0; i < nsects; ++i) { - sect = kxld_array_get_item(&kext->sects, i); - KXLD_3264_FUNC(kxld_kext_is_32_bit(kext), rval, - kxld_sect_init_from_macho_32, kxld_sect_init_from_macho_64, - sect, kext->file, §_offset, i, &kext->relocator); - require_noerr(rval, finish); - } - - /* Create special sections */ - -#if KXLD_USER_OR_GOT - rval = create_got(kext); - require_noerr(rval, finish); -#endif /* KXLD_USER_OR_GOT */ + rval = kxld_object_index_symbols_by_name(kext->kext); + require_noerr(rval, finish); -#if KXLD_USER_OR_COMMON - rval = resolve_common_symbols(kext); - require_noerr(rval, finish); -#endif /* KXLD_USER_OR_COMMON */ + rval = kxld_object_relocate(kext->kext, link_address); + require_noerr(rval, finish); - /* Create the segments from the section index */ + rval = resolve_symbols(kext, defined_symbols, obsolete_symbols); + require_noerr(rval, finish); - rval = kxld_seg_create_seg_from_sections(&kext->segs, &kext->sects); - require_noerr(rval, finish); + rval = create_vtables(kext, defined_cxx_symbols, /* defined_symbols */ NULL); + require_noerr(rval, finish); - rval = kxld_seg_finalize_object_segment(&kext->segs, - kext->section_order, get_macho_header_size(kext)); - require_noerr(rval, finish); + rval = patch_vtables(kext, patched_vtables, defined_symbols); + require_noerr(rval, finish); + + rval = validate_symbols(kext); + require_noerr(rval, finish); - kext->link_type = KXLD_LINK_KEXT; - } else { - kext->link_type 
= KXLD_LINK_PSEUDO_KEXT; - } + rval = kxld_object_process_relocations(kext->kext, patched_vtables); + require_noerr(rval, finish); rval = KERN_SUCCESS; finish: return rval; } -#endif /* KXLD_USER_OR_OBJECT */ /******************************************************************************* +* The defined symbols argument is optional. When supplied, create_vtables() +* will look for vtable symbols in the defined_symbols dictionary. Otherwise, +* it will look in the kext's symbol table for vtable symbols. +* +* We do this because there are two types of KXLDKext objects that call +* create_vtables(), those that have been linked, and those that haven't. The +* linked kexts export their symbols into the global symbol table that is used +* for symbol resolution, so we can look there for vtable symbols without +* having to index their local symbol table separately. +* +* Unlinked kexts haven't yet had their symbols exported into the global table, +* so we have to index their local symbol table separately. 
*******************************************************************************/ -static kern_return_t -init_from_final_linked_image(KXLDKext *kext, u_int *filetype_out, - struct symtab_command **symtab_hdr_out) +static kern_return_t +create_vtables(KXLDKext *kext, const KXLDDict *defined_cxx_symbols, + const KXLDDict *defined_symbols) { kern_return_t rval = KERN_FAILURE; - KXLDSeg *seg = NULL; - KXLDSect *sect = NULL; - struct load_command *cmd_hdr = NULL; - struct symtab_command *symtab_hdr = NULL; - struct uuid_command *uuid_hdr = NULL; - u_long base_offset = 0; - u_long offset = 0; - u_long sect_offset = 0; - u_int filetype = 0; + const KXLDSymtab *symtab = NULL; + KXLDSymtabIterator iter; + KXLDSym *sym = NULL; + KXLDSym *vtable_sym = NULL; + KXLDSym *meta_vtable_sym = NULL; + KXLDVTable *vtable = NULL; + KXLDVTable *meta_vtable = NULL; u_int i = 0; - u_int j = 0; - u_int segi = 0; - u_int secti = 0; - u_int nsegs = 0; - u_int nsects = 0; - u_int ncmds = 0; - - KXLD_3264_FUNC(kxld_kext_is_32_bit(kext), base_offset, - get_macho_cmd_data_32, get_macho_cmd_data_64, - kext->file, offset, &filetype, &ncmds); - - /* First pass to count segments and sections */ - - offset = base_offset; - for (i = 0; i < ncmds; ++i, offset += cmd_hdr->cmdsize) { - cmd_hdr = (struct load_command *) (kext->file + offset); - - switch(cmd_hdr->cmd) { -#if KXLD_USER_OR_ILP32 - case LC_SEGMENT: - { - struct segment_command *seg_hdr = - (struct segment_command *) cmd_hdr; - - /* Ignore segments with no vm size */ - if (!seg_hdr->vmsize) continue; - - ++nsegs; - nsects += seg_hdr->nsects; - } - break; -#endif /* KXLD_USER_OR_ILP32 */ -#if KXLD_USER_OR_LP64 - case LC_SEGMENT_64: - { - struct segment_command_64 *seg_hdr = - (struct segment_command_64 *) cmd_hdr; - - /* Ignore segments with no vm size */ - if (!seg_hdr->vmsize) continue; - - ++nsegs; - nsects += seg_hdr->nsects; - } - break; -#endif /* KXLD_USER_OR_LP64 */ - default: - continue; - } - } - - /* Allocate the segments and sections 
*/ - - if (nsegs) { - rval = kxld_array_init(&kext->segs, sizeof(KXLDSeg), nsegs); - require_noerr(rval, finish); + u_int nvtables = 0; - rval = kxld_array_init(&kext->sects, sizeof(KXLDSect), nsects); - require_noerr(rval, finish); + if (kext->vtables_created) { + rval = KERN_SUCCESS; + goto finish; } - /* Initialize the segments and sections */ + symtab = kxld_object_get_symtab(kext->kext); - offset = base_offset; - for (i = 0; i < ncmds; ++i, offset += cmd_hdr->cmdsize) { - cmd_hdr = (struct load_command *) (kext->file + offset); - seg = NULL; + if (kxld_object_is_linked(kext->kext)) { + /* Create a vtable object for every vtable symbol */ + kxld_symtab_iterator_init(&iter, symtab, kxld_sym_is_vtable, FALSE); + nvtables = kxld_symtab_iterator_get_num_remaining(&iter); + } else { + /* We walk over the super metaclass pointer symbols because classes + * with them are the only ones that need patching. Then we double the + * number of vtables we're expecting, because every pointer will have a + * class vtable and a MetaClass vtable. 
+ */ + kxld_symtab_iterator_init(&iter, symtab, + kxld_sym_is_super_metaclass_pointer, FALSE); + nvtables = kxld_symtab_iterator_get_num_remaining(&iter) * 2; + } - switch(cmd_hdr->cmd) { -#if KXLD_USER_OR_ILP32 - case LC_SEGMENT: - { - struct segment_command *seg_hdr = - (struct segment_command *) cmd_hdr; + rval = kxld_array_init(&kext->vtables, sizeof(KXLDVTable), nvtables); + require_noerr(rval, finish); - /* Ignore segments with no vm size */ - if (!seg_hdr->vmsize) continue; + while ((sym = kxld_symtab_iterator_get_next(&iter))) { + if (kxld_object_is_linked(kext->kext)) { + vtable_sym = sym; + meta_vtable_sym = NULL; + meta_vtable = NULL; + } else { + rval = get_vtable_syms_from_smcp(kext, defined_symbols, sym, + &vtable_sym, &meta_vtable_sym); + require_noerr(rval, finish); + } - seg = kxld_array_get_item(&kext->segs, segi++); + vtable = kxld_array_get_item(&kext->vtables, i++); + rval = kxld_vtable_init(vtable, vtable_sym, kext->kext, + defined_cxx_symbols); + require_noerr(rval, finish); - rval = kxld_seg_init_from_macho_32(seg, seg_hdr); + /* meta_vtable_sym will be null when we don't support strict + * patching and can't find the metaclass vtable. If that's the + * case, we just reduce the expect number of vtables by 1. 
+ */ + if (!kxld_object_is_linked(kext->kext)) { + if (meta_vtable_sym) { + meta_vtable = kxld_array_get_item(&kext->vtables, i++); + rval = kxld_vtable_init(meta_vtable, meta_vtable_sym, + kext->kext, defined_cxx_symbols); require_noerr(rval, finish); - - sect_offset = offset + sizeof(*seg_hdr); + } else { + kxld_array_resize(&kext->vtables, --nvtables); + meta_vtable = NULL; } - break; -#endif /* KXLD_USER_OR_ILP32 */ -#if KXLD_USER_OR_LP64 - case LC_SEGMENT_64: - { - struct segment_command_64 *seg_hdr = - (struct segment_command_64 *) cmd_hdr; - - /* Ignore segments with no vm size */ - if (!seg_hdr->vmsize) continue; + } + } + require_action(i == kext->vtables.nitems, finish, + rval=KERN_FAILURE); - seg = kxld_array_get_item(&kext->segs, segi++); - - rval = kxld_seg_init_from_macho_64(seg, seg_hdr); - require_noerr(rval, finish); - - sect_offset = offset + sizeof(*seg_hdr); - } - break; -#endif /* KXLD_USER_OR_LP64 */ - case LC_SYMTAB: - symtab_hdr = (struct symtab_command *) cmd_hdr; - break; - case LC_UUID: - uuid_hdr = (struct uuid_command *) cmd_hdr; - kxld_uuid_init_from_macho(&kext->uuid, uuid_hdr); - break; - case LC_DYSYMTAB: - kext->dysymtab_hdr = (struct dysymtab_command *) cmd_hdr; - - rval = kxld_reloc_create_macho(&kext->extrelocs, &kext->relocator, - (struct relocation_info *) (kext->file + kext->dysymtab_hdr->extreloff), - kext->dysymtab_hdr->nextrel); - require_noerr(rval, finish); - - rval = kxld_reloc_create_macho(&kext->locrelocs, &kext->relocator, - (struct relocation_info *) (kext->file + kext->dysymtab_hdr->locreloff), - kext->dysymtab_hdr->nlocrel); - require_noerr(rval, finish); - - break; - case LC_UNIXTHREAD: - /* Don't need to do anything with UNIXTHREAD for the kernel */ - require_action(kext->link_type == KXLD_LINK_KERNEL, finish, - rval=KERN_FAILURE; - kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO - "LC_UNIXTHREAD segment is not valid in a kext.")); - break; - default: - rval=KERN_FAILURE; - 
kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO - "Invalid segment type in MH_KEXT_BUNDLE kext: %u.", cmd_hdr->cmd); - goto finish; - } - - if (seg) { - - /* Initialize the sections */ - for (j = 0; j < seg->sects.nitems; ++j, ++secti) { - sect = kxld_array_get_item(&kext->sects, secti); - KXLD_3264_FUNC(kxld_kext_is_32_bit(kext), rval, - kxld_sect_init_from_macho_32, kxld_sect_init_from_macho_64, - sect, kext->file, &sect_offset, secti, &kext->relocator); - require_noerr(rval, finish); - - /* Add the section to the segment. This will also make sure - * that the sections and segments have the same segname. - */ - rval = kxld_seg_add_section(seg, sect); - require_noerr(rval, finish); - } - rval = kxld_seg_finish_init(seg); - require_noerr(rval, finish); - } - } - - if (filetype_out) *filetype_out = filetype; - if (symtab_hdr_out) *symtab_hdr_out = symtab_hdr; - kext->is_final_image = TRUE; + kext->vtables_created = TRUE; rval = KERN_SUCCESS; finish: return rval; @@ -897,1433 +534,82 @@ init_from_final_linked_image(KXLDKext *kext, u_int *filetype_out, /******************************************************************************* *******************************************************************************/ static kern_return_t -init_from_execute(KXLDKext *kext) -{ - kern_return_t rval = KERN_FAILURE; - struct symtab_command *symtab_hdr = NULL; - kxld_addr_t linkedit_offset = 0; - u_int filetype = 0; -#if KERNEL - KXLDSeg *textseg = NULL; - KXLDSeg *linkeditseg = NULL; -#endif /*KERNEL */ -#if KXLD_USER_OR_OBJECT - KXLDSeg *seg = NULL; - KXLDSect *sect = NULL; - KXLDSectionName *sname = NULL; - u_int i = 0, j = 0, k = 0; -#endif /* KXLD_USER_OR_OBJECT */ - - check(kext); - - require_action(kext->link_type == KXLD_LINK_KERNEL, finish, - rval=KERN_FAILURE); - - rval = init_from_final_linked_image(kext, &filetype, &symtab_hdr); - require_noerr(rval, finish); - - require_action(filetype == MH_EXECUTE, finish, rval=KERN_FAILURE; - kxld_log(kKxldLogLinking,
kKxldLogErr, kKxldLogMalformedMachO - "The kernel file is not of type MH_EXECUTE.")); - -#if KERNEL - /* When we're in the kernel, the symbol table can no longer be found by the - * symtab_command alone because the command specifies offsets for the file - * on disk, not the file mapped into memory. We can find the additional - * offset necessary by finding the difference between the linkedit segment's - * vm address and the text segment's vm address. - */ - - textseg = kxld_kext_get_seg_by_name(kext, SEG_TEXT); - require_action(textseg, finish, rval=KERN_FAILURE; - kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO)); - - linkeditseg = kxld_kext_get_seg_by_name(kext, SEG_LINKEDIT); - require_action(linkeditseg, finish, rval=KERN_FAILURE; - kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO)); - - linkedit_offset = linkeditseg->base_addr - textseg->base_addr - - linkeditseg->fileoff; -#endif /* KERNEL */ - - /* Initialize the symbol table */ - - KXLD_3264_FUNC(kxld_kext_is_32_bit(kext), rval, - kxld_symtab_init_from_macho_32, kxld_symtab_init_from_macho_64, - kext->symtab, kext->file, symtab_hdr, linkedit_offset); - require_noerr(rval, finish); - -#if KXLD_USER_OR_OBJECT - /* Save off the order of section names so that we can lay out kext - * sections for MH_OBJECT-based systems. - */ - if (target_supports_object(kext)) { - - rval = kxld_array_init(kext->section_order, sizeof(KXLDSectionName), - kext->sects.nitems); - require_noerr(rval, finish); - - /* Copy the section names into the section_order array for future kext - * section ordering. 
- */ - for (i = 0, k = 0; i < kext->segs.nitems; ++i) { - seg = kxld_array_get_item(&kext->segs, i); - - for (j = 0; j < seg->sects.nitems; ++j, ++k) { - sect = *(KXLDSect **) kxld_array_get_item(&seg->sects, j); - sname = kxld_array_get_item(kext->section_order, k); - - strlcpy(sname->segname, sect->segname, sizeof(sname->segname)); - strlcpy(sname->sectname, sect->sectname, sizeof(sname->sectname)); - } - } - } -#endif /* KXLD_USER_OR_OBJECT */ - - rval = KERN_SUCCESS; -finish: - return rval; -} - -#if KXLD_USER_OR_BUNDLE -/******************************************************************************* -*******************************************************************************/ -static boolean_t -target_supports_bundle(const KXLDKext *kext) -{ - return (kext->cputype == CPU_TYPE_X86_64); -} - -/******************************************************************************* -*******************************************************************************/ -static kern_return_t -init_from_bundle(KXLDKext *kext) -{ - kern_return_t rval = KERN_FAILURE; - KXLDSeg *seg = NULL; - struct symtab_command *symtab_hdr = NULL; - u_int filetype = 0; - u_int idx = 0; - - check(kext); - - require_action(target_supports_bundle(kext), finish, - rval=KERN_FAILURE; - kxld_log(kKxldLogLinking, kKxldLogErr, - kKxldLogFiletypeNotSupported, MH_KEXT_BUNDLE)); - - rval = init_from_final_linked_image(kext, &filetype, &symtab_hdr); - require_noerr(rval, finish); - - require_action(filetype == MH_KEXT_BUNDLE, finish, - rval=KERN_FAILURE); - - KXLD_3264_FUNC(kxld_kext_is_32_bit(kext), rval, - kxld_symtab_init_from_macho_32, kxld_symtab_init_from_macho_64, - kext->symtab, kext->file, symtab_hdr, /* linkedit offset */ 0); - require_noerr(rval, finish); - - if (kext->segs.nitems) { - /* Remove the __LINKEDIT segment, since we never keep the symbol - * table around in memory for kexts. 
- */ - seg = kxld_kext_get_seg_by_name(kext, SEG_LINKEDIT); - if (seg) { - rval = kxld_array_get_index(&kext->segs, seg, &idx); - require_noerr(rval, finish); - - kxld_seg_deinit(seg); - - rval = kxld_array_remove(&kext->segs, idx); - require_noerr(rval, finish); - } - - kext->link_type = KXLD_LINK_KEXT; - } else { - kext->link_type = KXLD_LINK_PSEUDO_KEXT; - } - - rval = KERN_SUCCESS; -finish: - return rval; -} -#endif /* KXLD_USER_OR_BUNDLE */ - -#if KXLD_USER_OR_ILP32 -/******************************************************************************* -*******************************************************************************/ -static u_long -get_macho_cmd_data_32(u_char *file, u_long offset, u_int *filetype, u_int *ncmds) -{ - struct mach_header *mach_hdr = (struct mach_header *) (file + offset); - - if (filetype) *filetype = mach_hdr->filetype; - if (ncmds) *ncmds = mach_hdr->ncmds; - - return sizeof(*mach_hdr); -} - -#endif /* KXLD_USER_OR_ILP32 */ - -#if KXLD_USER_OR_LP64 -/******************************************************************************* -*******************************************************************************/ -static u_long -get_macho_cmd_data_64(u_char *file, u_long offset, u_int *filetype, u_int *ncmds) -{ - struct mach_header_64 *mach_hdr = (struct mach_header_64 *) (file + offset); - - if (filetype) *filetype = mach_hdr->filetype; - if (ncmds) *ncmds = mach_hdr->ncmds; - - return sizeof(*mach_hdr); -} -#endif /* KXLD_USER_OR_LP64 */ - -/******************************************************************************* -*******************************************************************************/ -static kern_return_t -create_vtables(KXLDKext *kext) +get_vtable_syms_from_smcp(KXLDKext *kext, const KXLDDict *defined_symbols, + KXLDSym *super_metaclass_ptr_sym, KXLDSym **vtable_sym_out, + KXLDSym **meta_vtable_sym_out) { kern_return_t rval = KERN_FAILURE; - KXLDSymtabIterator iter; - KXLDSym *sym = NULL; + const KXLDSymtab *symtab 
= NULL; KXLDSym *vtable_sym = NULL; KXLDSym *meta_vtable_sym = NULL; - KXLDSect *vtable_sect = NULL; - KXLDSect *meta_vtable_sect = NULL; - KXLDVTable *vtable = NULL; - KXLDVTable *meta_vtable = NULL; char class_name[KXLD_MAX_NAME_LEN]; char vtable_name[KXLD_MAX_NAME_LEN]; char meta_vtable_name[KXLD_MAX_NAME_LEN]; - char *demangled_name1 = NULL; - char *demangled_name2 = NULL; - size_t demangled_length1 = 0; - size_t demangled_length2 = 0; - u_int i = 0; - u_int nvtables = 0; - - if (kext->link_type == KXLD_LINK_KERNEL) { - /* Create a vtable object for every vtable symbol */ - kxld_symtab_iterator_init(&iter, kext->symtab, kxld_sym_is_vtable, FALSE); - nvtables = kxld_symtab_iterator_get_num_remaining(&iter); - } else { - /* We walk over the super metaclass pointer symbols, because classes - * with them are the only ones that need patching. Then we double the - * number of vtables we're expecting, because every pointer will have a - * class vtable and a MetaClass vtable. - */ - kxld_symtab_iterator_init(&iter, kext->symtab, - kxld_sym_is_super_metaclass_pointer, FALSE); - nvtables = kxld_symtab_iterator_get_num_remaining(&iter) * 2; - } - - /* Allocate the array of vtable objects. 
- */ - rval = kxld_array_init(&kext->vtables, sizeof(KXLDVTable), nvtables); - require_noerr(rval, finish); - - /* Initialize from each vtable symbol */ - while ((sym = kxld_symtab_iterator_get_next(&iter))) { - - if (kext->link_type == KXLD_LINK_KERNEL) { - vtable_sym = sym; - } else { - /* Get the class name from the smc pointer */ - rval = kxld_sym_get_class_name_from_super_metaclass_pointer( - sym, class_name, sizeof(class_name)); - require_noerr(rval, finish); - - /* Get the vtable name from the class name */ - rval = kxld_sym_get_vtable_name_from_class_name(class_name, - vtable_name, sizeof(vtable_name)); - require_noerr(rval, finish); - - /* Get the vtable symbol */ - vtable_sym = kxld_symtab_get_symbol_by_name(kext->symtab, vtable_name); - require_action(vtable_sym, finish, rval=KERN_FAILURE; - kxld_log(kKxldLogPatching, kKxldLogErr, kKxldLogMissingVtable, - vtable_name, class_name)); - - /* Get the meta vtable name from the class name */ - rval = kxld_sym_get_meta_vtable_name_from_class_name(class_name, - meta_vtable_name, sizeof(meta_vtable_name)); - require_noerr(rval, finish); - - /* Get the meta vtable symbol */ - meta_vtable_sym = kxld_symtab_get_symbol_by_name(kext->symtab, - meta_vtable_name); - if (!meta_vtable_sym) { - /* If we don't support strict patching and we can't find the vtable, - * log a warning and reduce the expected number of vtables by 1. 
- */ - if (target_supports_strict_patching(kext)) { - kxld_log(kKxldLogPatching, kKxldLogErr, kKxldLogMissingVtable, - meta_vtable_name, class_name); - rval = KERN_FAILURE; - goto finish; - } else { - kxld_log(kKxldLogPatching, kKxldLogErr, - "Warning: " kKxldLogMissingVtable, - kxld_demangle(meta_vtable_name, &demangled_name1, - &demangled_length1), - kxld_demangle(class_name, &demangled_name2, - &demangled_length2)); - kxld_array_resize(&kext->vtables, --nvtables); - } - } - } - - /* Get the vtable's section */ - vtable_sect = kxld_array_get_item(&kext->sects, vtable_sym->sectnum); - require_action(vtable_sect, finish, rval=KERN_FAILURE); - - vtable = kxld_array_get_item(&kext->vtables, i++); - - if (kext->link_type == KXLD_LINK_KERNEL) { - /* Initialize the kernel vtable */ - rval = kxld_vtable_init_from_kernel_macho(vtable, vtable_sym, - vtable_sect, kext->symtab, &kext->relocator); - require_noerr(rval, finish); - } else { - /* Initialize the class vtable */ - if (kext->is_final_image) { - rval = kxld_vtable_init_from_final_macho(vtable, vtable_sym, - vtable_sect, kext->symtab, &kext->relocator, &kext->extrelocs); - require_noerr(rval, finish); - } else { - rval = kxld_vtable_init_from_object_macho(vtable, vtable_sym, - vtable_sect, kext->symtab, &kext->relocator); - require_noerr(rval, finish); - } - - /* meta_vtable_sym will be null when we don't support strict patching - * and can't find the metaclass vtable. 
- */ - if (meta_vtable_sym) { - /* Get the vtable's section */ - meta_vtable_sect = kxld_array_get_item(&kext->sects, - meta_vtable_sym->sectnum); - require_action(vtable_sect, finish, rval=KERN_FAILURE); - - meta_vtable = kxld_array_get_item(&kext->vtables, i++); - - /* Initialize the metaclass vtable */ - if (kext->is_final_image) { - rval = kxld_vtable_init_from_final_macho(meta_vtable, meta_vtable_sym, - meta_vtable_sect, kext->symtab, &kext->relocator, &kext->extrelocs); - require_noerr(rval, finish); - } else { - rval = kxld_vtable_init_from_object_macho(meta_vtable, meta_vtable_sym, - meta_vtable_sect, kext->symtab, &kext->relocator); - require_noerr(rval, finish); - } - } - } - } - require_action(i == kext->vtables.nitems, finish, - rval=KERN_FAILURE); - - /* Map vtable names to the vtable structures */ - rval = kxld_dict_init(&kext->vtable_index, kxld_dict_string_hash, - kxld_dict_string_cmp, kext->vtables.nitems); - require_noerr(rval, finish); - - for (i = 0; i < kext->vtables.nitems; ++i) { - vtable = kxld_array_get_item(&kext->vtables, i); - rval = kxld_dict_insert(&kext->vtable_index, vtable->name, vtable); - require_noerr(rval, finish); - } - - rval = KERN_SUCCESS; - -finish: - - if (demangled_name1) kxld_free(demangled_name1, demangled_length1); - if (demangled_name2) kxld_free(demangled_name2, demangled_length2); - - return rval; -} - -/******************************************************************************* -* Temporary workaround for PR-6668105 -* new, new[], delete, and delete[] may be overridden globally in a kext. -* We should do this with some sort of weak symbols, but we'll use a whitelist -* for now to minimize risk. 
-*******************************************************************************/ -static void -restrict_private_symbols(KXLDKext *kext) -{ - const char *private_symbols[] = { - KXLD_KMOD_INFO_SYMBOL, - KXLD_OPERATOR_NEW_SYMBOL, - KXLD_OPERATOR_NEW_ARRAY_SYMBOL, - KXLD_OPERATOR_DELETE_SYMBOL, - KXLD_OPERATOR_DELETE_ARRAY_SYMBOL - }; - KXLDSymtabIterator iter; - KXLDSym *sym = NULL; - const char *name = NULL; - u_int i = 0; - - kxld_symtab_iterator_init(&iter, kext->symtab, kxld_sym_is_exported, FALSE); - while ((sym = kxld_symtab_iterator_get_next(&iter))) { - for (i = 0; i < const_array_len(private_symbols); ++i) { - name = private_symbols[i]; - if (!streq(sym->name, name)) { - continue; - } - - kxld_sym_mark_private(sym); - } - } -} - -/******************************************************************************* -*******************************************************************************/ -void -kxld_kext_clear(KXLDKext *kext) -{ - KXLDSeg *seg = NULL; - KXLDSect *sect = NULL; - KXLDVTable *vtable = NULL; - u_int i; - - check(kext); - -#if !KERNEL - if (kext->link_type == KXLD_LINK_KERNEL) { - unswap_macho(kext->file, kext->host_order, kext->target_order); - } -#endif /* !KERNEL */ - - for (i = 0; i < kext->segs.nitems; ++i) { - seg = kxld_array_get_item(&kext->segs, i); - kxld_seg_clear(seg); - } - kxld_array_reset(&kext->segs); - - for (i = 0; i < kext->sects.nitems; ++i) { - sect = kxld_array_get_item(&kext->sects, i); - kxld_sect_clear(sect); - } - kxld_array_reset(&kext->sects); - - for (i = 0; i < kext->vtables.nitems; ++i) { - vtable = kxld_array_get_item(&kext->vtables, i); - kxld_vtable_clear(vtable); - } - kxld_array_reset(&kext->vtables); - - kxld_array_reset(&kext->extrelocs); - kxld_array_reset(&kext->locrelocs); - kxld_dict_clear(&kext->vtable_index); - kxld_relocator_clear(&kext->relocator); - kxld_uuid_clear(&kext->uuid); - - if (kext->symtab) kxld_symtab_clear(kext->symtab); - - kext->link_addr = 0; - kext->kmod_link_addr = 0; - 
kext->cputype = 0; - kext->cpusubtype = 0; - kext->link_type = KXLD_LINK_UNKNOWN; - kext->is_final_image = FALSE; - kext->got_is_created = FALSE; -} - - - -/******************************************************************************* -*******************************************************************************/ -void -kxld_kext_deinit(KXLDKext *kext) -{ - KXLDSeg *seg = NULL; - KXLDSect *sect = NULL; - KXLDVTable *vtable = NULL; - u_int i; - - check(kext); - -#if !KERNEL - if (kext->link_type == KXLD_LINK_KERNEL) { - unswap_macho(kext->file, kext->host_order, kext->target_order); - } -#endif /* !KERNEL */ - - for (i = 0; i < kext->segs.maxitems; ++i) { - seg = kxld_array_get_slot(&kext->segs, i); - kxld_seg_deinit(seg); - } - kxld_array_deinit(&kext->segs); - - for (i = 0; i < kext->sects.maxitems; ++i) { - sect = kxld_array_get_slot(&kext->sects, i); - kxld_sect_deinit(sect); - } - kxld_array_deinit(&kext->sects); - - for (i = 0; i < kext->vtables.maxitems; ++i) { - vtable = kxld_array_get_slot(&kext->vtables, i); - kxld_vtable_deinit(vtable); - } - kxld_array_deinit(&kext->vtables); - - kxld_array_deinit(&kext->extrelocs); - kxld_array_deinit(&kext->locrelocs); - kxld_dict_deinit(&kext->vtable_index); - - if (kext->symtab) { - kxld_symtab_deinit(kext->symtab); - kxld_free(kext->symtab, kxld_symtab_sizeof()); - } - - bzero(kext, sizeof(*kext)); -} - -/******************************************************************************* -*******************************************************************************/ -boolean_t -kxld_kext_is_true_kext(const KXLDKext *kext) -{ - return (kext->link_type == KXLD_LINK_KEXT); -} - -/******************************************************************************* -*******************************************************************************/ -void -kxld_kext_get_vmsize(const KXLDKext *kext, u_long *header_size, u_long *vmsize) -{ - check(kext); - check(header_size); - check(vmsize); - *header_size = 0; - *vmsize = 0; - 
- /* vmsize is the padded header page(s) + segment vmsizes */ - - *header_size = (kext->is_final_image) ? - 0 : round_page(get_macho_header_size(kext)); - *vmsize = *header_size + get_macho_data_size(kext); - -} - -/******************************************************************************* -*******************************************************************************/ -const struct kxld_symtab * -kxld_kext_get_symtab(const KXLDKext *kext) -{ - check(kext); - - return kext->symtab; -} - -/******************************************************************************* -*******************************************************************************/ -u_int -kxld_kext_get_num_symbols(const KXLDKext *kext) -{ - check(kext); - - return kxld_symtab_get_num_symbols(kext->symtab); -} - -/******************************************************************************* -*******************************************************************************/ -void -kxld_kext_get_vtables(KXLDKext *kext, const KXLDArray **vtables) -{ - check(kext); - check(vtables); - - *vtables = &kext->vtables; -} - -/******************************************************************************* -*******************************************************************************/ -u_int -kxld_kext_get_num_vtables(const KXLDKext *kext) -{ - check(kext); - - return kext->vtables.nitems; -} - -/******************************************************************************* -*******************************************************************************/ -KXLDSeg * -kxld_kext_get_seg_by_name(const KXLDKext *kext, const char *segname) -{ - KXLDSeg *seg = NULL; - u_int i = 0; - - for (i = 0; i < kext->segs.nitems; ++i) { - seg = kxld_array_get_item(&kext->segs, i); - - if (streq(segname, seg->segname)) break; - - seg = NULL; - } - - return seg; -} - -/******************************************************************************* 
-*******************************************************************************/ -KXLDSect * -kxld_kext_get_sect_by_name(const KXLDKext *kext, const char *segname, - const char *sectname) -{ - KXLDSect *sect = NULL; - u_int i = 0; - - for (i = 0; i < kext->sects.nitems; ++i) { - sect = kxld_array_get_item(&kext->sects, i); - - if (streq(segname, sect->segname) && streq(sectname, sect->sectname)) { - break; - } - - sect = NULL; - } - - return sect; -} - -/******************************************************************************* -*******************************************************************************/ -int -kxld_kext_get_sectnum_for_sect(const KXLDKext *kext, const KXLDSect *sect) -{ - kern_return_t rval = KERN_FAILURE; - u_int idx = -1; - - rval = kxld_array_get_index(&kext->sects, sect, &idx); - if (rval) idx = -1; - - return idx; -} - -/******************************************************************************* -*******************************************************************************/ -const KXLDArray * -kxld_kext_get_section_order(const KXLDKext *kext __unused) -{ -#if KXLD_USER_OR_OBJECT - if (kext->link_type == KXLD_LINK_KERNEL && target_supports_object(kext)) { - return kext->section_order; - } -#endif /* KXLD_USER_OR_OBJECT */ - - return NULL; -} - -/******************************************************************************* -*******************************************************************************/ -static u_long -get_macho_header_size(const KXLDKext *kext) -{ - KXLDSeg *seg = NULL; - u_long header_size = 0; - u_int i = 0; - - check(kext); - - /* Mach, segment, and UUID headers */ - - if (kxld_kext_is_32_bit(kext)) { - header_size += sizeof(struct mach_header); - } else { - header_size += sizeof(struct mach_header_64); - } - - for (i = 0; i < kext->segs.nitems; ++i) { - seg = kxld_array_get_item(&kext->segs, i); - header_size += kxld_seg_get_macho_header_size(seg, kxld_kext_is_32_bit(kext)); - } - - if 
(kext->uuid.has_uuid) { - header_size += kxld_uuid_get_macho_header_size(); - } - - return header_size; -} - -/******************************************************************************* -*******************************************************************************/ -static u_long -get_macho_data_size(const KXLDKext *kext) -{ - KXLDSeg *seg = NULL; - u_long data_size = 0; - u_int i = 0; - - check(kext); - - for (i = 0; i < kext->segs.nitems; ++i) { - seg = kxld_array_get_item(&kext->segs, i); - data_size += (u_long) kxld_seg_get_vmsize(seg); - } - - return data_size; -} - -/******************************************************************************* -*******************************************************************************/ -kern_return_t kxld_kext_export_linked_object(const KXLDKext *kext, - u_char *linked_object, kxld_addr_t *kmod_info_kern) -{ - kern_return_t rval = KERN_FAILURE; - KXLDSeg *seg = NULL; - u_long size = 0; - u_long header_size = 0; - u_long header_offset = 0; - u_long data_offset = 0; - u_int ncmds = 0; - u_int i = 0; - - check(kext); - check(linked_object); - check(kmod_info_kern); - *kmod_info_kern = 0; - - /* Calculate the size of the headers and data */ - - header_size = get_macho_header_size(kext); - data_offset = (kext->is_final_image) ? 
header_size : round_page(header_size); - size = data_offset + get_macho_data_size(kext); - - /* Copy data to the file */ - - ncmds = kext->segs.nitems + (kext->uuid.has_uuid == TRUE); - - rval = export_macho_header(kext, linked_object, ncmds, - &header_offset, header_size); - require_noerr(rval, finish); - - for (i = 0; i < kext->segs.nitems; ++i) { - seg = kxld_array_get_item(&kext->segs, i); - - rval = kxld_seg_export_macho_to_vm(seg, linked_object, &header_offset, - header_size, size, kext->link_addr, kxld_kext_is_32_bit(kext)); - require_noerr(rval, finish); - } - - if (kext->uuid.has_uuid) { - rval = kxld_uuid_export_macho(&kext->uuid, linked_object, - &header_offset, header_size); - require_noerr(rval, finish); - } - - *kmod_info_kern = kext->kmod_link_addr; - -#if !KERNEL - unswap_macho(linked_object, kext->host_order, kext->target_order); -#endif /* KERNEL */ - - rval = KERN_SUCCESS; - -finish: - return rval; -} - -#if !KERNEL -/******************************************************************************* -*******************************************************************************/ -kern_return_t -kxld_kext_export_symbol_file(const KXLDKext *kext, - u_char **_symbol_file, u_long *_filesize) -{ - kern_return_t rval = KERN_FAILURE; - KXLDSeg *seg = NULL; - u_char *file = NULL; - u_long size = 0; - u_long header_size = 0; - u_long header_offset = 0; - u_long data_offset = 0; - u_int ncmds = 0; - u_int i = 0; - - check(kext); - check(_symbol_file); - *_symbol_file = NULL; - - /* Calculate the size of the file */ - - if (kxld_kext_is_32_bit(kext)) { - header_size += sizeof(struct mach_header); - } else { - header_size += sizeof(struct mach_header_64); - } - - for (i = 0; i < kext->segs.nitems; ++i) { - seg = kxld_array_get_item(&kext->segs, i); - header_size += kxld_seg_get_macho_header_size(seg, kxld_kext_is_32_bit(kext)); - size += kxld_seg_get_macho_data_size(seg); - } - - header_size += kxld_symtab_get_macho_header_size(); - size += 
kxld_symtab_get_macho_data_size(kext->symtab, FALSE, - kxld_kext_is_32_bit(kext)); - - if (kext->uuid.has_uuid) { - header_size += kxld_uuid_get_macho_header_size(); - } - - data_offset = round_page(header_size); - size += data_offset; - - /* Allocate the symbol file */ - - file = kxld_page_alloc_untracked(size); - require_action(file, finish, rval=KERN_RESOURCE_SHORTAGE); - bzero(file, size); - - /* Copy data to the file */ - - ncmds = kext->segs.nitems + (kext->uuid.has_uuid == TRUE) + 1; /* +1 for symtab */ - rval = export_macho_header(kext, file, ncmds, &header_offset, header_size); - require_noerr(rval, finish); - - for (i = 0; i < kext->segs.nitems; ++i) { - seg = kxld_array_get_item(&kext->segs, i); - rval = kxld_seg_export_macho_to_file_buffer(seg, file, &header_offset, - header_size, &data_offset, size, kxld_kext_is_32_bit(kext)); - require_noerr(rval, finish); - } - - rval = kxld_symtab_export_macho(kext->symtab, file, &header_offset, - header_size, &data_offset, size, FALSE, kxld_kext_is_32_bit(kext)); - require_noerr(rval, finish); - - if (kext->uuid.has_uuid) { - rval = kxld_uuid_export_macho(&kext->uuid, file, &header_offset, - header_size); - require_noerr(rval, finish); - } - - header_offset = header_size; - - /* Commit */ - - unswap_macho(file, kext->host_order, kext->target_order); - - *_filesize = size; - *_symbol_file = file; - file = NULL; - rval = KERN_SUCCESS; - -finish: - - if (file) { - kxld_page_free_untracked(file, size); - file = NULL; - } - - check(!file); - check((!rval) ^ (!*_symbol_file)); - - return rval; -} -#endif - -/******************************************************************************* -*******************************************************************************/ -boolean_t -kxld_kext_target_needs_swap(const KXLDKext *kext __unused) -{ -#if KERNEL - return FALSE; -#else - return (kext->target_order != kext->host_order); -#endif /* KERNEL */ -} - 
-/******************************************************************************* -*******************************************************************************/ -static kern_return_t -export_macho_header(const KXLDKext *kext, u_char *buf, u_int ncmds, - u_long *header_offset, u_long header_size) -{ - kern_return_t rval = KERN_FAILURE; - - check(kext); - check(buf); - check(header_offset); - - KXLD_3264_FUNC(kxld_kext_is_32_bit(kext), rval, - export_macho_header_32, export_macho_header_64, - kext, buf, ncmds, header_offset, header_size); - require_noerr(rval, finish); - - rval = KERN_SUCCESS; - -finish: - return rval; -} - -#if KXLD_USER_OR_ILP32 -/******************************************************************************* -*******************************************************************************/ -static kern_return_t -export_macho_header_32(const KXLDKext *kext, u_char *buf, u_int ncmds, - u_long *header_offset, u_long header_size) -{ - kern_return_t rval = KERN_FAILURE; - struct mach_header *mach = NULL; - - check(kext); - check(buf); - check(header_offset); - - require_action(sizeof(*mach) <= header_size - *header_offset, finish, - rval=KERN_FAILURE); - mach = (struct mach_header *) (buf + *header_offset); - - mach->magic = MH_MAGIC; - mach->cputype = kext->cputype; - mach->filetype = kext->filetype; - mach->ncmds = ncmds; - mach->sizeofcmds = (uint32_t) (header_size - sizeof(*mach)); - mach->flags = MH_NOUNDEFS; - - *header_offset += sizeof(*mach); - - rval = KERN_SUCCESS; - -finish: - return rval; -} -#endif /* KXLD_USER_OR_ILP32 */ - -#if KXLD_USER_OR_LP64 -/******************************************************************************* -*******************************************************************************/ -static kern_return_t -export_macho_header_64(const KXLDKext *kext, u_char *buf, u_int ncmds, - u_long *header_offset, u_long header_size) -{ - kern_return_t rval = KERN_FAILURE; - struct mach_header_64 *mach = NULL; - - check(kext); 
- check(buf); - check(header_offset); - - require_action(sizeof(*mach) <= header_size - *header_offset, finish, - rval=KERN_FAILURE); - mach = (struct mach_header_64 *) (buf + *header_offset); - - mach->magic = MH_MAGIC_64; - mach->cputype = kext->cputype; - mach->cpusubtype = kext->cpusubtype; - mach->filetype = kext->filetype; - mach->ncmds = ncmds; - mach->sizeofcmds = (uint32_t) (header_size - sizeof(*mach)); - mach->flags = MH_NOUNDEFS; - - *header_offset += sizeof(*mach); - - rval = KERN_SUCCESS; - -finish: - return rval; -} -#endif /* KXLD_USER_OR_LP64 */ - -/******************************************************************************* -*******************************************************************************/ -kern_return_t -kxld_kext_resolve(KXLDKext *kext, struct kxld_dict *patched_vtables, - struct kxld_dict *defined_symbols) -{ - kern_return_t rval = KERN_FAILURE; - - require_action(kext->link_type == KXLD_LINK_PSEUDO_KEXT, finish, - rval=KERN_FAILURE); - - /* Resolve symbols */ - rval = resolve_symbols(kext, defined_symbols, NULL); - require_noerr(rval, finish); - - /* Validate symbols */ - rval = validate_symbols(kext); - require_noerr(rval, finish); - - /* Pseudokexts re-export their dependencies' vtables */ - rval = copy_vtables(kext, patched_vtables); - require_noerr(rval, finish); - - rval = KERN_SUCCESS; - -finish: - return rval; -} - -/******************************************************************************* -*******************************************************************************/ -kern_return_t -kxld_kext_relocate(KXLDKext *kext, kxld_addr_t link_address, - KXLDDict *patched_vtables, KXLDDict *defined_symbols, - KXLDDict *obsolete_symbols) -{ - kern_return_t rval = KERN_FAILURE; - KXLDSeg *seg = NULL; - u_int i = 0; - - check(kext); - check(patched_vtables); - check(defined_symbols); - - require_action(kext->link_type == KXLD_LINK_KEXT, finish, rval=KERN_FAILURE); - - kext->link_addr = link_address; - - /* Relocate 
segments (which relocates the sections) */ - for (i = 0; i < kext->segs.nitems; ++i) { - seg = kxld_array_get_item(&kext->segs, i); - kxld_seg_relocate(seg, link_address); - } - - /* Relocate symbols */ - rval = kxld_symtab_relocate(kext->symtab, &kext->sects); - require_noerr(rval, finish); - - /* Populate kmod info structure */ - rval = populate_kmod_info(kext); - require_noerr(rval, finish); - - /* Resolve symbols */ - rval = resolve_symbols(kext, defined_symbols, obsolete_symbols); - require_noerr(rval, finish); - - /* Patch vtables */ - rval = patch_vtables(kext, patched_vtables, defined_symbols); - require_noerr(rval, finish); - - /* Validate symbols */ - rval = validate_symbols(kext); - require_noerr(rval, finish); - - /* Process relocation entries and populate the global offset table. - * - * For final linked images: the relocation entries are contained in a couple - * of tables hanging off the end of the symbol table. The GOT has its own - * section created by the linker; we simply need to fill it. - * - * For object files: the relocation entries are bound to each section. - * The GOT, if it exists for the target architecture, is created by kxld, - * and we must populate it according to our internal structures. 
- */ - if (kext->is_final_image) { -#if KXLD_USER_OR_BUNDLE - rval = process_symbol_pointers(kext); - require_noerr(rval, finish); - - rval = process_relocs_from_tables(kext); - require_noerr(rval, finish); -#else - require_action(FALSE, finish, rval=KERN_FAILURE); -#endif /* KXLD_USER_OR_BUNDLE */ - } else { -#if KXLD_USER_OR_GOT - /* Populate GOT */ - rval = populate_got(kext); - require_noerr(rval, finish); -#endif /* KXLD_USER_OR_GOT */ -#if KXLD_USER_OR_OBJECT - rval = process_relocs_from_sections(kext); - require_noerr(rval, finish); -#else - require_action(FALSE, finish, rval=KERN_FAILURE); -#endif /* KXLD_USER_OR_OBJECT */ - } - - rval = KERN_SUCCESS; - -finish: - return rval; -} - -/******************************************************************************* -*******************************************************************************/ -static kern_return_t -resolve_symbols(KXLDKext *kext, KXLDDict *defined_symbols, - KXLDDict *obsolete_symbols) -{ - kern_return_t rval = KERN_FAILURE; - KXLDSymtabIterator iter; - KXLDSym *sym = NULL; - void *addrp = NULL; - kxld_addr_t addr = 0; - const char *name = NULL; - boolean_t tests_for_weak = FALSE; - boolean_t error = FALSE; - boolean_t warning = FALSE; - char *demangled_name = NULL; - size_t demangled_length = 0; - - check(kext); - check(defined_symbols); - - /* Check if the kext tests for weak symbols */ - sym = kxld_symtab_get_symbol_by_name(kext->symtab, KXLD_WEAK_TEST_SYMBOL); - tests_for_weak = (sym != NULL); - - /* Check for duplicate symbols */ - kxld_symtab_iterator_init(&iter, kext->symtab, kxld_sym_is_exported, FALSE); - while ((sym = kxld_symtab_iterator_get_next(&iter))) { - addrp = kxld_dict_find(defined_symbols, sym->name); - if (addrp) { - /* Convert to a kxld_addr_t */ - if (kxld_kext_is_32_bit(kext)) { - addr = (kxld_addr_t) (*(uint32_t*)addrp); - } else { - addr = (kxld_addr_t) (*(uint64_t*)addrp); - } - - /* Not a problem if the symbols have the same address */ - if (addr == 
sym->link_addr) { - continue; - } - - if (!error) { - error = TRUE; - kxld_log(kKxldLogLinking, kKxldLogErr, - "The following symbols were defined more than once:"); - } - - kxld_log(kKxldLogLinking, kKxldLogErr, "\t%s: %p - %p", - kxld_demangle(sym->name, &demangled_name, &demangled_length), - (void *) (uintptr_t) sym->link_addr, - (void *) (uintptr_t) addr); - } - } - require_noerr_action(error, finish, rval=KERN_FAILURE); - - /* Resolve undefined and indirect symbols */ - - /* Iterate over all unresolved symbols */ - kxld_symtab_iterator_init(&iter, kext->symtab, - kxld_sym_is_unresolved, FALSE); - while ((sym = kxld_symtab_iterator_get_next(&iter))) { - - /* Common symbols are not supported */ - if (kxld_sym_is_common(sym)) { - - if (!error) { - error = TRUE; - if (target_supports_common(kext)) { - kxld_log(kKxldLogLinking, kKxldLogErr, - "The following common symbols were not resolved:"); - } else { - kxld_log(kKxldLogLinking, kKxldLogErr, - "Common symbols are not supported in kernel extensions. " - "Use -fno-common to build your kext. " - "The following are common symbols:"); - } - } - kxld_log(kKxldLogLinking, kKxldLogErr, "\t%s", - kxld_demangle(sym->name, &demangled_name, &demangled_length)); - - } else { - - /* Find the address of the defined symbol */ - if (kxld_sym_is_undefined(sym)) { - name = sym->name; - } else { - name = sym->alias; - } - addrp = kxld_dict_find(defined_symbols, name); - - /* Resolve the symbol. If a definition cannot be found, then: - * 1) Psuedokexts log a warning and proceed - * 2) Actual kexts delay the error until validation in case vtable - * patching replaces the undefined symbol. 
- */ - - if (addrp) { - - /* Convert to a kxld_addr_t */ - if (kxld_kext_is_32_bit(kext)) { - addr = (kxld_addr_t) (*(uint32_t*)addrp); - } else { - addr = (kxld_addr_t) (*(uint64_t*)addrp); - } - - boolean_t is_exported = (kext->link_type == KXLD_LINK_PSEUDO_KEXT); - - rval = kxld_sym_resolve(sym, addr, is_exported); - require_noerr(rval, finish); - - if (obsolete_symbols && kxld_dict_find(obsolete_symbols, name)) { - kxld_log(kKxldLogLinking, kKxldLogWarn, - "This kext uses obsolete symbol %s.", - kxld_demangle(name, &demangled_name, &demangled_length)); - } - - } else if (kext->link_type == KXLD_LINK_PSEUDO_KEXT) { - /* Pseudokexts ignore undefined symbols, because any actual - * kexts that need those symbols will fail to link anyway, so - * there's no need to block well-behaved kexts. - */ - if (!warning) { - kxld_log(kKxldLogLinking, kKxldLogWarn, - "This symbol set has the following unresolved symbols:"); - warning = TRUE; - } - kxld_log(kKxldLogLinking, kKxldLogErr, "\t%s", - kxld_demangle(sym->name, &demangled_name, &demangled_length)); - kxld_sym_delete(sym); - - } else if (kxld_sym_is_weak(sym)) { - /* Make sure that the kext has referenced gOSKextUnresolved. - */ - require_action(tests_for_weak, finish, - rval=KERN_FAILURE; - kxld_log(kKxldLogLinking, kKxldLogErr, - "This kext has weak references but does not test for " - "them. Test for weak references with " - "OSKextIsSymbolResolved().")); - -#if KERNEL - /* Get the address of the default weak address. - */ - addr = (kxld_addr_t) &kext_weak_symbol_referenced; -#else - /* This is run during symbol generation only, so we only - * need a filler value here. 
- */ - addr = kext->link_addr; -#endif /* KERNEL */ - - rval = kxld_sym_resolve(sym, addr, /* exported */ FALSE); - require_noerr(rval, finish); - } - } - } - require_noerr_action(error, finish, rval=KERN_FAILURE); - - rval = KERN_SUCCESS; - -finish: - if (demangled_name) kxld_free(demangled_name, demangled_length); - - return rval; -} - -/******************************************************************************* -*******************************************************************************/ -static boolean_t -target_supports_strict_patching(KXLDKext *kext) -{ - check(kext); - - return (kext->cputype != CPU_TYPE_I386 && - kext->cputype != CPU_TYPE_POWERPC); -} - -/******************************************************************************* -* We must patch vtables to ensure binary compatibility, and to perform that -* patching, we have to determine the vtables' inheritance relationships. The -* MetaClass system gives us a way to do that: -* 1) Iterate over all of the super MetaClass pointer symbols. Every class -* that inherits from OSObject will have a pointer in its MetaClass that -* points to the MetaClass's super MetaClass. -* 2) Derive the name of the class from the super MetaClass pointer. -* 3) Derive the name of the class's vtable from the name of the class -* 4) Follow the super MetaClass pointer to get the address of the super -* MetaClass's symbol -* 5) Look up the super MetaClass symbol by address -* 6) Derive the super class's name from the super MetaClass name -* 7) Derive the super class's vtable from the super class's name -* This procedure will allow us to find all of the OSObject-derived classes and -* their super classes, and thus patch all of the vtables. -* -* We also have to take care to patch up the MetaClass's vtables. 
The -* MetaClasses follow a parallel hierarchy to the classes, so once we have the -* class name and super class name, we can also derive the MetaClass name and -* the super MetaClass name, and thus find and patch their vtables as well. -*******************************************************************************/ - -#define kOSMetaClassVTableName "__ZTV11OSMetaClass" - -static kern_return_t -patch_vtables(KXLDKext *kext, KXLDDict *patched_vtables, - KXLDDict *defined_symbols) -{ - kern_return_t rval = KERN_FAILURE; - KXLDSymtabIterator iter; - KXLDSym *metaclass = NULL; - KXLDSym *super_metaclass_pointer = NULL; - KXLDSym *final_sym = NULL; - KXLDVTable *vtable = NULL; - KXLDVTable *super_vtable = NULL; - char class_name[KXLD_MAX_NAME_LEN]; - char super_class_name[KXLD_MAX_NAME_LEN]; - char vtable_name[KXLD_MAX_NAME_LEN]; - char super_vtable_name[KXLD_MAX_NAME_LEN]; - char final_sym_name[KXLD_MAX_NAME_LEN]; - char *demangled_name1 = NULL; - char *demangled_name2 = NULL; - size_t demangled_length1 = 0;; - size_t demangled_length2 = 0; - size_t len = 0; - u_int nvtables = 0; - u_int npatched = 0; - u_int nprogress = 0; - boolean_t failure = FALSE; - - check(kext); - check(patched_vtables); - - /* Find each super meta class pointer symbol */ - - kxld_symtab_iterator_init(&iter, kext->symtab, - kxld_sym_is_super_metaclass_pointer, FALSE); - nvtables = kxld_symtab_iterator_get_num_remaining(&iter); - - while (npatched < nvtables) { - npatched = 0; - nprogress = 0; - kxld_symtab_iterator_reset(&iter); - while((super_metaclass_pointer = kxld_symtab_iterator_get_next(&iter))) - { - /* Get the class name from the smc pointer */ - rval = kxld_sym_get_class_name_from_super_metaclass_pointer( - super_metaclass_pointer, class_name, sizeof(class_name)); - require_noerr(rval, finish); - - /* Get the vtable name from the class name */ - rval = kxld_sym_get_vtable_name_from_class_name(class_name, - vtable_name, sizeof(vtable_name)); - require_noerr(rval, finish); - - /* Get the 
vtable and make sure it hasn't been patched */ - vtable = kxld_dict_find(&kext->vtable_index, vtable_name); - require_action(vtable, finish, rval=KERN_FAILURE; - kxld_log(kKxldLogPatching, kKxldLogErr, kKxldLogMissingVtable, - vtable_name, class_name)); - - if (!vtable->is_patched) { - - /* Find the SMCP's meta class symbol */ - rval = get_metaclass_symbol_from_super_meta_class_pointer_symbol( - kext, super_metaclass_pointer, &metaclass); - require_noerr(rval, finish); - - /* Get the super class name from the super metaclass */ - rval = kxld_sym_get_class_name_from_metaclass(metaclass, - super_class_name, sizeof(super_class_name)); - require_noerr(rval, finish); - - /* Get the super vtable name from the class name */ - rval = kxld_sym_get_vtable_name_from_class_name(super_class_name, - super_vtable_name, sizeof(super_vtable_name)); - require_noerr(rval, finish); - - if (failure) { - kxld_log(kKxldLogPatching, kKxldLogErr, - "\t'%s' (super vtable '%s')", - kxld_demangle(vtable_name, &demangled_name1, - &demangled_length1), - kxld_demangle(super_vtable_name, &demangled_name2, - &demangled_length2)); - continue; - } - - /* Get the super vtable if it's been patched */ - super_vtable = kxld_dict_find(patched_vtables, super_vtable_name); - if (!super_vtable) continue; - - /* Get the final symbol's name from the super vtable */ - rval = kxld_sym_get_final_sym_name_from_class_name(super_class_name, - final_sym_name, sizeof(final_sym_name)); - require_noerr(rval, finish); - - /* Verify that the final symbol does not exist. First check - * all the externally defined symbols, then check locally. 
- */ - final_sym = kxld_dict_find(defined_symbols, final_sym_name); - if (!final_sym) { - final_sym = kxld_symtab_get_symbol_by_name(kext->symtab, - final_sym_name); - } - require_action(!final_sym, finish, - rval=KERN_FAILURE; - kxld_log(kKxldLogPatching, kKxldLogErr, - "Class '%s' is a subclass of final class '%s'.", - kxld_demangle(class_name, &demangled_name1, - &demangled_length1), - kxld_demangle(super_class_name, &demangled_name2, - &demangled_length2))); - - /* Patch the class's vtable */ - rval = kxld_vtable_patch(vtable, super_vtable, kext->symtab, - target_supports_strict_patching(kext)); - require_noerr(rval, finish); + char *demangled_name1 = NULL; + char *demangled_name2 = NULL; + size_t demangled_length1 = 0; + size_t demangled_length2 = 0; - /* Add the class's vtable to the set of patched vtables */ - rval = kxld_dict_insert(patched_vtables, vtable->name, vtable); - require_noerr(rval, finish); + check(kext); + check(vtable_sym_out); + check(meta_vtable_sym_out); - /* Get the meta vtable name from the class name */ - rval = kxld_sym_get_meta_vtable_name_from_class_name(class_name, - vtable_name, sizeof(vtable_name)); - require_noerr(rval, finish); + require(!kxld_object_is_kernel(kext->kext), finish); - /* Get the meta vtable. Whether or not it should exist has already - * been tested in create_vtables(), so if it doesn't exist and we're - * still running, we can safely skip it. - */ - vtable = kxld_dict_find(&kext->vtable_index, vtable_name); - if (!vtable) { - ++nprogress; - ++npatched; - continue; - } - require_action(!vtable->is_patched, finish, rval=KERN_FAILURE); + symtab = kxld_object_get_symtab(kext->kext); - /* There is no way to look up a metaclass vtable at runtime, but - * we know that every class's metaclass inherits directly from - * OSMetaClass, so we just hardcode that vtable name here. 
- */ - len = strlcpy(super_vtable_name, kOSMetaClassVTableName, - sizeof(super_vtable_name)); - require_action(len == const_strlen(kOSMetaClassVTableName), - finish, rval=KERN_FAILURE); - - /* Get the super meta vtable */ - super_vtable = kxld_dict_find(patched_vtables, super_vtable_name); - require_action(super_vtable && super_vtable->is_patched, - finish, rval=KERN_FAILURE); + /* Get the class name from the smc pointer */ + rval = kxld_sym_get_class_name_from_super_metaclass_pointer( + super_metaclass_ptr_sym, class_name, sizeof(class_name)); + require_noerr(rval, finish); - /* Patch the meta class's vtable */ - rval = kxld_vtable_patch(vtable, super_vtable, - kext->symtab, target_supports_strict_patching(kext)); - require_noerr(rval, finish); + /* Get the vtable name from the class name */ + rval = kxld_sym_get_vtable_name_from_class_name(class_name, + vtable_name, sizeof(vtable_name)); + require_noerr(rval, finish); - /* Add the MetaClass's vtable to the set of patched vtables */ - rval = kxld_dict_insert(patched_vtables, vtable->name, vtable); - require_noerr(rval, finish); - - ++nprogress; - } + /* Get the vtable symbol */ + if (defined_symbols) { + vtable_sym = kxld_dict_find(defined_symbols, vtable_name); + } else { + vtable_sym = kxld_symtab_get_locally_defined_symbol_by_name(symtab, + vtable_name); + } + require_action(vtable_sym, finish, rval=KERN_FAILURE; + kxld_log(kKxldLogPatching, kKxldLogErr, kKxldLogMissingVtable, + vtable_name, class_name)); - ++npatched; - } + /* Get the meta vtable name from the class name */ + rval = kxld_sym_get_meta_vtable_name_from_class_name(class_name, + meta_vtable_name, sizeof(meta_vtable_name)); + require_noerr(rval, finish); - require_action(!failure, finish, rval=KERN_FAILURE); - if (!nprogress) { - failure = TRUE; + /* Get the meta vtable symbol */ + if (defined_symbols) { + meta_vtable_sym = kxld_dict_find(defined_symbols, meta_vtable_name); + } else { + meta_vtable_sym = 
kxld_symtab_get_locally_defined_symbol_by_name(symtab, + meta_vtable_name); + } + if (!meta_vtable_sym) { + if (kxld_object_target_supports_strict_patching(kext->kext)) { + kxld_log(kKxldLogPatching, kKxldLogErr, + kKxldLogMissingVtable, + meta_vtable_name, class_name); + rval = KERN_FAILURE; + goto finish; + } else { kxld_log(kKxldLogPatching, kKxldLogErr, - "The following vtables were unpatchable because each one's " - "parent vtable either was not found or also was not patchable:"); + "Warning: " kKxldLogMissingVtable, + kxld_demangle(meta_vtable_name, &demangled_name1, + &demangled_length1), + kxld_demangle(class_name, &demangled_name2, + &demangled_length2)); } } - + + *vtable_sym_out = vtable_sym; + *meta_vtable_sym_out = meta_vtable_sym; rval = KERN_SUCCESS; finish: if (demangled_name1) kxld_free(demangled_name1, demangled_length1); @@ -2335,570 +621,454 @@ patch_vtables(KXLDKext *kext, KXLDDict *patched_vtables, /******************************************************************************* *******************************************************************************/ static kern_return_t -validate_symbols(KXLDKext *kext) +resolve_symbols(KXLDKext *kext, const KXLDDict *defined_symbols, + const KXLDDict *obsolete_symbols) { kern_return_t rval = KERN_FAILURE; + const KXLDSymtab *symtab = NULL; KXLDSymtabIterator iter; KXLDSym *sym = NULL; - u_int error = FALSE; + KXLDSym *defined_sym = NULL; + const char *name = NULL; + boolean_t tests_for_weak = FALSE; + boolean_t error = FALSE; char *demangled_name = NULL; size_t demangled_length = 0; - - /* Check for any unresolved symbols */ - kxld_symtab_iterator_init(&iter, kext->symtab, kxld_sym_is_unresolved, FALSE); - while ((sym = kxld_symtab_iterator_get_next(&iter))) { - if (!error) { - error = TRUE; - kxld_log(kKxldLogLinking, kKxldLogErr, - "The following symbols are unresolved for this kext:"); - } - kxld_log(kKxldLogLinking, kKxldLogErr, "\t%s", - kxld_demangle(sym->name, &demangled_name, 
&demangled_length)); - } - require_noerr_action(error, finish, rval=KERN_FAILURE); - rval = KERN_SUCCESS; + check(kext->kext); + check(defined_symbols); + check(obsolete_symbols); -finish: - if (demangled_name) kxld_free(demangled_name, demangled_length); - return rval; -} + symtab = kxld_object_get_symtab(kext->kext); -#if KXLD_USER_OR_GOT || KXLD_USER_OR_COMMON -/******************************************************************************* -*******************************************************************************/ -static kern_return_t -add_section(KXLDKext *kext, KXLDSect **sect) -{ - kern_return_t rval = KERN_FAILURE; - u_int nsects = kext->sects.nitems; + /* Check if the kext tests for weak symbols */ + sym = kxld_symtab_get_symbol_by_name(symtab, KXLD_WEAK_TEST_SYMBOL); + tests_for_weak = (sym != NULL); - rval = kxld_array_resize(&kext->sects, nsects + 1); - require_noerr(rval, finish); + /* Check for duplicate symbols */ + kxld_symtab_iterator_init(&iter, symtab, kxld_sym_is_exported, FALSE); + while ((sym = kxld_symtab_iterator_get_next(&iter))) { + defined_sym = kxld_dict_find(defined_symbols, sym->name); + if (defined_sym) { + /* Not a problem if the symbols have the same address */ + if (defined_sym->link_addr == sym->link_addr) { + continue; + } - *sect = kxld_array_get_item(&kext->sects, nsects); + if (!error) { + error = TRUE; + kxld_log(kKxldLogLinking, kKxldLogErr, + "The following symbols were defined more than once:"); + } - rval = KERN_SUCCESS; + kxld_log(kKxldLogLinking, kKxldLogErr, "\t%s: %p - %p", + kxld_demangle(sym->name, &demangled_name, &demangled_length), + (void *) (uintptr_t) sym->link_addr, + (void *) (uintptr_t) defined_sym->link_addr); + } + } + require_noerr_action(error, finish, rval=KERN_FAILURE); -finish: - return rval; -} -#endif /* KXLD_USER_OR_GOT || KXLD_USER_OR_COMMON */ + /* Resolve undefined and indirect symbols */ -#if KXLD_USER_OR_GOT 
-/******************************************************************************* -*******************************************************************************/ -static boolean_t -target_has_got(const KXLDKext *kext) -{ - return FALSE: -} + /* Iterate over all unresolved symbols */ + kxld_symtab_iterator_init(&iter, symtab, + kxld_sym_is_unresolved, FALSE); + while ((sym = kxld_symtab_iterator_get_next(&iter))) { -/******************************************************************************* -* Create and initialize the Global Offset Table -*******************************************************************************/ -static kern_return_t -create_got(KXLDKext *kext) -{ - kern_return_t rval = KERN_FAILURE; - KXLDSect *sect = NULL; - u_int ngots = 0; - u_int i = 0; + /* Common symbols are not supported */ + if (kxld_sym_is_common(sym)) { - if (!target_has_got(kext)) { - rval = KERN_SUCCESS; - goto finish; - } + if (!error) { + error = TRUE; + if (kxld_object_target_supports_common_symbols(kext->kext)) { + kxld_log(kKxldLogLinking, kKxldLogErr, + "The following common symbols were not resolved:"); + } else { + kxld_log(kKxldLogLinking, kKxldLogErr, + "Common symbols are not supported in kernel extensions. " + "Use -fno-common to build your kext. " + "The following are common symbols:"); + } + } + kxld_log(kKxldLogLinking, kKxldLogErr, "\t%s", + kxld_demangle(sym->name, &demangled_name, &demangled_length)); - for (i = 0; i < kext->sects.nitems; ++i) { - sect = kxld_array_get_item(&kext->sects, i); - ngots += kxld_sect_get_ngots(sect, &kext->relocator, - kext->symtab); - } + } else { - rval = add_section(kext, §); - require_noerr(rval, finish); + /* Find the address of the defined symbol */ + if (kxld_sym_is_undefined(sym)) { + name = sym->name; + } else { + name = sym->alias; + } + defined_sym = kxld_dict_find(defined_symbols, name); + + /* Resolve the symbol. 
If a definition cannot be found, then: + * 1) Psuedokexts log a warning and proceed + * 2) Actual kexts delay the error until validation in case vtable + * patching replaces the undefined symbol. + */ - rval = kxld_sect_init_got(sect, ngots); - require_noerr(rval, finish); + if (defined_sym) { - kext->got_is_created = TRUE; - rval = KERN_SUCCESS; + rval = kxld_sym_resolve(sym, defined_sym->link_addr); + require_noerr(rval, finish); -finish: - return rval; -} + if (obsolete_symbols && kxld_dict_find(obsolete_symbols, name)) { + kxld_log(kKxldLogLinking, kKxldLogWarn, + "This kext uses obsolete symbol %s.", + kxld_demangle(name, &demangled_name, &demangled_length)); + } -/******************************************************************************* -*******************************************************************************/ -static kern_return_t -populate_got(KXLDKext *kext) -{ - kern_return_t rval = KERN_FAILURE; - KXLDSect *sect = NULL; - u_int i = 0; + } else if (kxld_sym_is_weak(sym)) { + kxld_addr_t addr = 0; - if (!target_has_got(kext) || !kext->got_is_created) { - rval = KERN_SUCCESS; - goto finish; - } + /* Make sure that the kext has referenced gOSKextUnresolved. + */ + require_action(tests_for_weak, finish, + rval=KERN_FAILURE; + kxld_log(kKxldLogLinking, kKxldLogErr, + "This kext has weak references but does not test for " + "them. Test for weak references with " + "OSKextIsSymbolResolved().")); - for (i = 0; i < kext->sects.nitems; ++i) { - sect = kxld_array_get_item(&kext->sects, i); - if (streq_safe(sect->segname, KXLD_SEG_GOT, sizeof(KXLD_SEG_GOT)) && - streq_safe(sect->sectname, KXLD_SECT_GOT, sizeof(KXLD_SECT_GOT))) - { - kxld_sect_populate_got(sect, kext->symtab, - kxld_kext_target_needs_swap(kext)); - break; +#if KERNEL + /* Get the address of the default weak address. + */ + addr = (kxld_addr_t) &kext_weak_symbol_referenced; +#else + /* This is run during symbol generation only, so we only + * need a filler value here. 
+ */ + addr = 0xF00DD00D; +#endif /* KERNEL */ + + rval = kxld_sym_resolve(sym, addr); + require_noerr(rval, finish); + } } } - - require_action(i < kext->sects.nitems, finish, rval=KXLD_MISSING_GOT); + require_noerr_action(error, finish, rval=KERN_FAILURE); rval = KERN_SUCCESS; finish: + if (demangled_name) kxld_free(demangled_name, demangled_length); + return rval; } -#endif /* KXLD_USER_OR_GOT */ /******************************************************************************* +* We must patch vtables to ensure binary compatibility, and to perform that +* patching, we have to determine the vtables' inheritance relationships. The +* MetaClass system gives us a way to do that: +* 1) Iterate over all of the super MetaClass pointer symbols. Every class +* that inherits from OSObject will have a pointer in its MetaClass that +* points to the MetaClass's super MetaClass. +* 2) Derive the name of the class from the super MetaClass pointer. +* 3) Derive the name of the class's vtable from the name of the class +* 4) Follow the super MetaClass pointer to get the address of the super +* MetaClass's symbol +* 5) Look up the super MetaClass symbol by address +* 6) Derive the super class's name from the super MetaClass name +* 7) Derive the super class's vtable from the super class's name +* This procedure will allow us to find all of the OSObject-derived classes and +* their super classes, and thus patch all of the vtables. +* +* We also have to take care to patch up the MetaClass's vtables. The +* MetaClasses follow a parallel hierarchy to the classes, so once we have the +* class name and super class name, we can also derive the MetaClass name and +* the super MetaClass name, and thus find and patch their vtables as well. 
*******************************************************************************/ -static boolean_t -target_supports_common(const KXLDKext *kext) -{ - check(kext); - return (kext->cputype == CPU_TYPE_I386 || - kext->cputype == CPU_TYPE_POWERPC); -} -#if KXLD_USER_OR_COMMON -/******************************************************************************* -* If there are common symbols, calculate how much space they'll need -* and create/grow the __DATA __common section to accommodate them. -* Then, resolve them against that section. -*******************************************************************************/ +#define kOSMetaClassVTableName "__ZTV11OSMetaClass" + static kern_return_t -resolve_common_symbols(KXLDKext *kext) +patch_vtables(KXLDKext *kext, KXLDDict *patched_vtables, + const KXLDDict *defined_symbols) { kern_return_t rval = KERN_FAILURE; KXLDSymtabIterator iter; - KXLDSym *sym = NULL; - KXLDSect *sect = NULL; - kxld_addr_t base_addr = 0; - kxld_size_t size = 0; - kxld_size_t total_size = 0; - u_int align = 0; - u_int max_align = 0; - u_int sectnum = 0; - - if (!target_supports_common(kext)) { - rval = KERN_SUCCESS; - goto finish; - } - - /* Iterate over the common symbols to calculate their total aligned size */ - kxld_symtab_iterator_init(&iter, kext->symtab, kxld_sym_is_common, FALSE); - while ((sym = kxld_symtab_iterator_get_next(&iter))) { - align = kxld_sym_get_common_align(sym); - size = kxld_sym_get_common_size(sym); - - if (align > max_align) max_align = align; - - total_size = kxld_align_address(total_size, align) + size; - } + const KXLDSymtab *symtab = NULL; + const KXLDSym *metaclass = NULL; + KXLDSym *super_metaclass_pointer = NULL; + KXLDSym *final_sym = NULL; + KXLDVTable *vtable = NULL; + KXLDVTable *super_vtable = NULL; + char class_name[KXLD_MAX_NAME_LEN]; + char super_class_name[KXLD_MAX_NAME_LEN]; + char vtable_name[KXLD_MAX_NAME_LEN]; + char super_vtable_name[KXLD_MAX_NAME_LEN]; + char final_sym_name[KXLD_MAX_NAME_LEN]; + char 
*demangled_name1 = NULL; + char *demangled_name2 = NULL; + size_t demangled_length1 = 0;; + size_t demangled_length2 = 0; + size_t len = 0; + u_int nvtables = 0; + u_int npatched = 0; + u_int nprogress = 0; + boolean_t failure = FALSE; - /* If there are common symbols, grow or create the __DATA __common section - * to hold them. - */ - if (total_size) { - sect = kxld_kext_get_sect_by_name(kext, SEG_DATA, SECT_COMMON); - if (sect) { - base_addr = sect->base_addr + sect->size; + check(kext); + check(patched_vtables); - kxld_sect_grow(sect, total_size, max_align); - } else { - base_addr = 0; + symtab = kxld_object_get_symtab(kext->kext); - rval = add_section(kext, §); - require_noerr(rval, finish); + rval = create_vtable_index(kext); + require_noerr(rval, finish); - kxld_sect_init_zerofill(sect, SEG_DATA, SECT_COMMON, - total_size, max_align); - } + /* Find each super meta class pointer symbol */ - /* Resolve the common symbols against the new section */ - rval = kxld_array_get_index(&kext->sects, sect, §num); - require_noerr(rval, finish); + kxld_symtab_iterator_init(&iter, symtab, + kxld_sym_is_super_metaclass_pointer, FALSE); + nvtables = kxld_symtab_iterator_get_num_remaining(&iter); + while (npatched < nvtables) { + npatched = 0; + nprogress = 0; kxld_symtab_iterator_reset(&iter); - while ((sym = kxld_symtab_iterator_get_next(&iter))) { - align = kxld_sym_get_common_align(sym); - size = kxld_sym_get_common_size(sym); + while((super_metaclass_pointer = kxld_symtab_iterator_get_next(&iter))) + { + /* Get the class name from the smc pointer */ + rval = kxld_sym_get_class_name_from_super_metaclass_pointer( + super_metaclass_pointer, class_name, sizeof(class_name)); + require_noerr(rval, finish); - base_addr = kxld_align_address(base_addr, align); - kxld_sym_resolve_common(sym, sectnum, base_addr); + /* Get the vtable name from the class name */ + rval = kxld_sym_get_vtable_name_from_class_name(class_name, + vtable_name, sizeof(vtable_name)); + require_noerr(rval, 
finish); - base_addr += size; - } - } + /* Get the vtable and make sure it hasn't been patched */ + vtable = kxld_dict_find(&kext->vtable_index, vtable_name); + require_action(vtable, finish, rval=KERN_FAILURE; + kxld_log(kKxldLogPatching, kKxldLogErr, kKxldLogMissingVtable, + vtable_name, class_name)); - rval = KERN_SUCCESS; + if (!vtable->is_patched) { -finish: - return rval; -} -#endif /* KXLD_USER_OR_COMMON */ + /* Find the SMCP's meta class symbol */ + metaclass = get_metaclass_symbol_from_super_meta_class_pointer_symbol( + kext, super_metaclass_pointer); + require_action(metaclass, finish, rval=KERN_FAILURE); -/******************************************************************************* -*******************************************************************************/ -static kern_return_t -get_metaclass_symbol_from_super_meta_class_pointer_symbol(KXLDKext *kext, - KXLDSym *super_metaclass_pointer_sym, KXLDSym **metaclass) -{ - kern_return_t rval = KERN_FAILURE; - KXLDSect *sect = NULL; - KXLDReloc *reloc = NULL; - uint32_t offset = 0; - - check(kext); - check(super_metaclass_pointer_sym); - check(metaclass); - *metaclass = NULL; + /* Get the super class name from the super metaclass */ + rval = kxld_sym_get_class_name_from_metaclass(metaclass, + super_class_name, sizeof(super_class_name)); + require_noerr(rval, finish); - sect = kxld_array_get_item(&kext->sects, super_metaclass_pointer_sym->sectnum); - require_action(sect, finish, rval=KERN_FAILURE); + /* Get the super vtable name from the class name */ + rval = kxld_sym_get_vtable_name_from_class_name(super_class_name, + super_vtable_name, sizeof(super_vtable_name)); + require_noerr(rval, finish); - /* Find the relocation entry for the super metaclass pointer and get the - * symbol associated with that relocation entry - */ + /* Get the super vtable if it's been patched */ + super_vtable = kxld_dict_find(patched_vtables, super_vtable_name); - if (kext->is_final_image) { - /* The relocation entry could be 
in either the external or local - * relocation entries. kxld_reloc_get_symbol() can handle either - * type. - */ - reloc = kxld_reloc_get_reloc_by_offset(&kext->extrelocs, - super_metaclass_pointer_sym->base_addr); - if (!reloc) { - reloc = kxld_reloc_get_reloc_by_offset(&kext->locrelocs, - super_metaclass_pointer_sym->base_addr); - } - require_action(reloc, finish, rval=KERN_FAILURE); + if (failure) { + const KXLDVTable *unpatched_super_vtable; + unpatched_super_vtable = kxld_dict_find(&kext->vtable_index, + super_vtable_name); + + /* If the parent's vtable hasn't been patched, warn that + * this vtable is unpatchable because of the parent. + */ + if (!super_vtable) { + kxld_log(kKxldLogPatching, kKxldLogErr, + "The %s was not patched because its parent, " + "the %s, was not %s.", + kxld_demangle(vtable_name, &demangled_name1, + &demangled_length1), + kxld_demangle(super_vtable_name, &demangled_name2, + &demangled_length2), + (unpatched_super_vtable) ? "patchable" : "found"); + } + continue; + } - *metaclass = kxld_reloc_get_symbol(&kext->relocator, reloc, kext->file, - kext->symtab); - } else { - offset = kxld_sym_get_section_offset(super_metaclass_pointer_sym, sect); + if (!super_vtable) continue; - reloc = kxld_reloc_get_reloc_by_offset(§->relocs, offset); - require_action(reloc, finish, rval=KERN_FAILURE); + /* Get the final symbol's name from the super vtable */ + rval = kxld_sym_get_final_sym_name_from_class_name(super_class_name, + final_sym_name, sizeof(final_sym_name)); + require_noerr(rval, finish); - *metaclass = kxld_reloc_get_symbol(&kext->relocator, reloc, sect->data, - kext->symtab); - } - require_action(*metaclass, finish, rval=KERN_FAILURE); + /* Verify that the final symbol does not exist. First check + * all the externally defined symbols, then check locally. 
+ */ + final_sym = kxld_dict_find(defined_symbols, final_sym_name); + if (!final_sym) { + final_sym = kxld_symtab_get_locally_defined_symbol_by_name( + symtab, final_sym_name); + } + if (final_sym) { + kxld_log(kKxldLogPatching, kKxldLogErr, + "Class '%s' is a subclass of final class '%s'.", + kxld_demangle(class_name, &demangled_name1, + &demangled_length1), + kxld_demangle(super_class_name, &demangled_name2, + &demangled_length2)); + continue; + } - rval = KERN_SUCCESS; + /* Patch the class's vtable */ + rval = kxld_vtable_patch(vtable, super_vtable, kext->kext); + if (rval) continue; -finish: - return rval; -} + /* Add the class's vtable to the set of patched vtables */ + rval = kxld_dict_insert(patched_vtables, vtable->name, vtable); + require_noerr(rval, finish); -/******************************************************************************* -*******************************************************************************/ -static kern_return_t -copy_vtables(KXLDKext *kext, const KXLDDict *patched_vtables) -{ - kern_return_t rval = KERN_FAILURE; - KXLDSymtabIterator iter; - KXLDSym *sym = NULL; - KXLDVTable *vtable = NULL, *src = NULL; - u_int i = 0; - u_int nvtables = 0; - char class_name[KXLD_MAX_NAME_LEN]; - char meta_vtable_name[KXLD_MAX_NAME_LEN]; + /* Get the meta vtable name from the class name */ + rval = kxld_sym_get_meta_vtable_name_from_class_name(class_name, + vtable_name, sizeof(vtable_name)); + require_noerr(rval, finish); - kxld_symtab_iterator_init(&iter, kext->symtab, - kxld_sym_is_class_vtable, FALSE); - - /* The iterator tracks all the class vtables, so we double the number of - * vtables we're expecting because we use the class vtables to find the - * MetaClass vtables. 
- */ - nvtables = kxld_symtab_iterator_get_num_remaining(&iter) * 2; - rval = kxld_array_init(&kext->vtables, sizeof(KXLDVTable), nvtables); - require_noerr(rval, finish); - - while ((sym = kxld_symtab_iterator_get_next(&iter))) { - src = kxld_dict_find(patched_vtables, sym->name); - require_action(src, finish, rval=KERN_FAILURE); + /* Get the meta vtable. Whether or not it should exist has already + * been tested in create_vtables(), so if it doesn't exist and we're + * still running, we can safely skip it. + */ + vtable = kxld_dict_find(&kext->vtable_index, vtable_name); + if (!vtable) { + ++nprogress; + ++npatched; + continue; + } + require_action(!vtable->is_patched, finish, rval=KERN_FAILURE); - vtable = kxld_array_get_item(&kext->vtables, i++); - rval = kxld_vtable_copy(vtable, src); - require_noerr(rval, finish); + /* There is no way to look up a metaclass vtable at runtime, but + * we know that every class's metaclass inherits directly from + * OSMetaClass, so we just hardcode that vtable name here. 
+ */ + len = strlcpy(super_vtable_name, kOSMetaClassVTableName, + sizeof(super_vtable_name)); + require_action(len == const_strlen(kOSMetaClassVTableName), + finish, rval=KERN_FAILURE); + + /* Get the super meta vtable */ + super_vtable = kxld_dict_find(patched_vtables, super_vtable_name); + require_action(super_vtable && super_vtable->is_patched, + finish, rval=KERN_FAILURE); - rval = kxld_sym_get_class_name_from_vtable(sym, - class_name, sizeof(class_name)); - require_noerr(rval, finish); + /* Patch the meta class's vtable */ + rval = kxld_vtable_patch(vtable, super_vtable, kext->kext); + require_noerr(rval, finish); - rval = kxld_sym_get_meta_vtable_name_from_class_name(class_name, - meta_vtable_name, sizeof(meta_vtable_name)); - require_noerr(rval, finish); + /* Add the MetaClass's vtable to the set of patched vtables */ + rval = kxld_dict_insert(patched_vtables, vtable->name, vtable); + require_noerr(rval, finish); + + ++nprogress; + } - /* Some classes don't have a MetaClass, so when we run across one - * of those, we shrink the vtable array by 1. 
- */ - src = kxld_dict_find(patched_vtables, meta_vtable_name); - if (src) { - vtable = kxld_array_get_item(&kext->vtables, i++); - rval = kxld_vtable_copy(vtable, src); - require_noerr(rval, finish); - } else { - kxld_array_resize(&kext->vtables, kext->vtables.nitems - 1); + ++npatched; } - } - - rval = KERN_SUCCESS; - -finish: - return rval; -} - -#if KXLD_USER_OR_OBJECT -/******************************************************************************* -*******************************************************************************/ -static kern_return_t -process_relocs_from_sections(KXLDKext *kext) -{ - kern_return_t rval = KERN_FAILURE; - KXLDSect *sect = NULL; - u_int i = 0; - for (i = 0; i < kext->sects.nitems; ++i) { - sect = kxld_array_get_item(&kext->sects, i); - rval = kxld_sect_process_relocs(sect, &kext->relocator, - &kext->sects, kext->symtab); - require_noerr_action(rval, finish, - kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogInvalidSectReloc, - i, sect->segname, sect->sectname)); + require_action(!failure, finish, rval=KERN_FAILURE); + failure = (nprogress == 0); } rval = KERN_SUCCESS; - finish: + if (demangled_name1) kxld_free(demangled_name1, demangled_length1); + if (demangled_name2) kxld_free(demangled_name2, demangled_length2); + return rval; } -#endif /* KXLD_USER_OR_OBJECT */ -#if KXLD_USER_OR_BUNDLE /******************************************************************************* *******************************************************************************/ static kern_return_t -process_relocs_from_tables(KXLDKext *kext) +create_vtable_index(KXLDKext *kext) { kern_return_t rval = KERN_FAILURE; - KXLDReloc *reloc = NULL; - KXLDSeg *seg = NULL; + KXLDVTable *vtable = NULL; u_int i = 0; - /* Offsets for relocations in relocation tables are based on the vm - * address of the first segment. 
- */ - seg = kxld_array_get_item(&kext->segs, 0); - - /* Process external relocations */ - for (i = 0; i < kext->extrelocs.nitems; ++i) { - reloc = kxld_array_get_item(&kext->extrelocs, i); - - rval = kxld_relocator_process_table_reloc(&kext->relocator, reloc, seg, - kext->file, &kext->sects, kext->symtab); - require_noerr_action(rval, finish, - kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogInvalidExtReloc, i)); + if (kext->vtable_index_created) { + rval = KERN_SUCCESS; + goto finish; } - /* Process local relocations */ - for (i = 0; i < kext->locrelocs.nitems; ++i) { - reloc = kxld_array_get_item(&kext->locrelocs, i); + /* Map vtable names to the vtable structures */ + rval = kxld_dict_init(&kext->vtable_index, kxld_dict_string_hash, + kxld_dict_string_cmp, kext->vtables.nitems); + require_noerr(rval, finish); - rval = kxld_relocator_process_table_reloc(&kext->relocator, reloc, seg, - kext->file, &kext->sects, kext->symtab); - require_noerr_action(rval, finish, - kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogInvalidIntReloc, i)); + for (i = 0; i < kext->vtables.nitems; ++i) { + vtable = kxld_array_get_item(&kext->vtables, i); + rval = kxld_dict_insert(&kext->vtable_index, vtable->name, vtable); + require_noerr(rval, finish); } + kext->vtable_index_created = TRUE; rval = KERN_SUCCESS; - finish: return rval; } /******************************************************************************* *******************************************************************************/ -static void -add_to_ptr(u_char *symptr, kxld_addr_t val, boolean_t is_32_bit) -{ - if (is_32_bit) { - uint32_t *ptr = (uint32_t *) symptr; - *ptr += (uint32_t) val; - } else { - uint64_t *ptr = (uint64_t *) symptr; - *ptr += (uint64_t) val; - } -} - -#define SECT_SYM_PTRS "__nl_symbol_ptr" - -/******************************************************************************* -* Final linked images create an __nl_symbol_ptr section for the global offset -* table and for symbol pointer lookups in 
general. Rather than use relocation -* entries, the linker creates an "indirect symbol table" which stores indexes -* into the symbol table corresponding to the entries of this section. This -* function populates the section with the relocated addresses of those symbols. -*******************************************************************************/ -static kern_return_t -process_symbol_pointers(KXLDKext *kext) +static const KXLDSym * +get_metaclass_symbol_from_super_meta_class_pointer_symbol(KXLDKext *kext, + KXLDSym *super_metaclass_pointer_sym) { kern_return_t rval = KERN_FAILURE; - KXLDSect *sect = NULL; - KXLDSym *sym = NULL; - int32_t *symidx = NULL; - u_char *symptr = NULL; - u_long symptrsize = 0; - u_int nsyms = 0; - u_int firstsym = 0; - u_int i = 0; - + const KXLDReloc *reloc = NULL; + const KXLDSect *sect = NULL; + const KXLDSym *metaclass = NULL; + check(kext); + check(super_metaclass_pointer_sym); - require_action(kext->is_final_image && kext->dysymtab_hdr, - finish, rval=KERN_FAILURE); - - /* Get the __DATA,__nl_symbol_ptr section. If it doesn't exist, we have - * nothing to do. 
- */ - - sect = kxld_kext_get_sect_by_name(kext, SEG_DATA, SECT_SYM_PTRS); - if (!sect) { - rval = KERN_SUCCESS; - goto finish; - } - - require_action(sect->flags & S_NON_LAZY_SYMBOL_POINTERS, - finish, rval=KERN_FAILURE; - kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO - "Section %s,%s does not have S_NON_LAZY_SYMBOL_POINTERS flag.", - SEG_DATA, SECT_SYM_PTRS)); - - /* Calculate the table offset and number of entries in the section */ - - if (kxld_kext_is_32_bit(kext)) { - symptrsize = sizeof(uint32_t); - } else { - symptrsize = sizeof(uint64_t); - } - - nsyms = (u_int) (sect->size / symptrsize); - firstsym = sect->reserved1; - - require_action(firstsym + nsyms <= kext->dysymtab_hdr->nindirectsyms, - finish, rval=KERN_FAILURE; - kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO)); - - /* Iterate through the indirect symbol table and fill in the section of - * symbol pointers. There are three cases: - * 1) A normal symbol - put its value directly in the table - * 2) An INDIRECT_SYMBOL_LOCAL - symbols that are local and already have - * their offset from the start of the file in the section. Simply - * add the file's link address to fill this entry. - * 3) An INDIRECT_SYMBOL_ABS - prepopulated absolute symbols. No - * action is required. - */ - - symidx = (int32_t *) (kext->file + kext->dysymtab_hdr->indirectsymoff); - symidx += firstsym; - symptr = sect->data; - for (i = 0; i < nsyms; ++i, ++symidx, symptr+=symptrsize) { - if (*symidx & INDIRECT_SYMBOL_LOCAL) { - if (*symidx & INDIRECT_SYMBOL_ABS) continue; - - add_to_ptr(symptr, kext->link_addr, kxld_kext_is_32_bit(kext)); - } else { - sym = kxld_symtab_get_symbol_by_index(kext->symtab, *symidx); - require_action(sym, finish, rval=KERN_FAILURE); + /* Get the relocation entry that fills in the super metaclass pointer. 
*/ + reloc = kxld_object_get_reloc_at_symbol(kext->kext, + super_metaclass_pointer_sym); + require_action(reloc, finish, rval=KERN_FAILURE); - add_to_ptr(symptr, sym->link_addr, kxld_kext_is_32_bit(kext)); - } - } + /* Get the section of the super metaclass pointer. */ + sect = kxld_object_get_section_by_index(kext->kext, + super_metaclass_pointer_sym->sectnum); + require_action(sect, finish, rval=KERN_FAILURE); - rval = KERN_SUCCESS; + /* Get the symbol that will be filled into the super metaclass pointer. */ + metaclass = kxld_object_get_symbol_of_reloc(kext->kext, reloc, sect); finish: - return rval; + return metaclass; } -#endif /* KXLD_USER_OR_BUNDLE */ /******************************************************************************* *******************************************************************************/ static kern_return_t -populate_kmod_info(KXLDKext *kext) +validate_symbols(KXLDKext *kext) { kern_return_t rval = KERN_FAILURE; - KXLDSect *kmodsect = NULL; - KXLDSym *kmodsym = NULL; - u_long kmod_offset = 0; - u_long header_size; - u_long size; - - if (kext->link_type != KXLD_LINK_KEXT) { - rval = KERN_SUCCESS; - goto finish; - } - - kxld_kext_get_vmsize(kext, &header_size, &size); - - kmodsym = kxld_symtab_get_symbol_by_name(kext->symtab, KXLD_KMOD_INFO_SYMBOL); - require_action(kmodsym, finish, rval=KERN_FAILURE; - kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogNoKmodInfo)); - - kmodsect = kxld_array_get_item(&kext->sects, kmodsym->sectnum); - kmod_offset = (u_long) (kmodsym->base_addr - kmodsect->base_addr); - - kext->kmod_info = (kmod_info_t *) (kmodsect->data + kmod_offset); - kext->kmod_link_addr = kmodsym->link_addr; - - if (kxld_kext_is_32_bit(kext)) { - kmod_info_32_v1_t *kmod = (kmod_info_32_v1_t *) (kext->kmod_info); - kmod->address = (uint32_t) kext->link_addr; - kmod->size = (uint32_t) size; - kmod->hdr_size = (uint32_t) header_size; - -#if !KERNEL - if (kxld_kext_target_needs_swap(kext)) { - kmod->address = OSSwapInt32(kmod->address); - 
kmod->size = OSSwapInt32(kmod->size); - kmod->hdr_size = OSSwapInt32(kmod->hdr_size); - } -#endif /* !KERNEL */ - } else { - kmod_info_64_v1_t *kmod = (kmod_info_64_v1_t *) (kext->kmod_info); - kmod->address = kext->link_addr; - kmod->size = size; - kmod->hdr_size = header_size; - -#if !KERNEL - if (kxld_kext_target_needs_swap(kext)) { - kmod->address = OSSwapInt64(kmod->address); - kmod->size = OSSwapInt64(kmod->size); - kmod->hdr_size = OSSwapInt64(kmod->hdr_size); + KXLDSymtabIterator iter; + KXLDSym *sym = NULL; + u_int error = FALSE; + char *demangled_name = NULL; + size_t demangled_length = 0; + + /* Check for any unresolved symbols */ + kxld_symtab_iterator_init(&iter, kxld_object_get_symtab(kext->kext), + kxld_sym_is_unresolved, FALSE); + while ((sym = kxld_symtab_iterator_get_next(&iter))) { + if (!error) { + error = TRUE; + kxld_log(kKxldLogLinking, kKxldLogErr, + "The following symbols are unresolved for this kext:"); } -#endif /* !KERNEL */ + kxld_log(kKxldLogLinking, kKxldLogErr, "\t%s", + kxld_demangle(sym->name, &demangled_name, &demangled_length)); } - + require_noerr_action(error, finish, rval=KERN_FAILURE); rval = KERN_SUCCESS; finish: + if (demangled_name) kxld_free(demangled_name, demangled_length); return rval; } diff --git a/libkern/kxld/kxld_kext.h b/libkern/kxld/kxld_kext.h index 20eeaf501..f2b80c0f6 100644 --- a/libkern/kxld/kxld_kext.h +++ b/libkern/kxld/kxld_kext.h @@ -39,6 +39,7 @@ struct kxld_array; struct kxld_kext; struct kxld_dict; +struct kxld_object; struct kxld_sect; struct kxld_seg; struct kxld_symtab; @@ -50,12 +51,11 @@ typedef struct kxld_kext KXLDKext; *******************************************************************************/ size_t kxld_kext_sizeof(void) - __attribute__((const, nonnull, visibility("hidden"))); + __attribute__((const, visibility("hidden"))); -kern_return_t kxld_kext_init(KXLDKext *kext, u_char *file, u_long size, - const char *name, KXLDFlags flags, boolean_t is_kernel, KXLDArray *seg_order, - 
cpu_type_t cputype, cpu_subtype_t cpusubtype) - __attribute__((nonnull(1,2,4), visibility("hidden"))); +kern_return_t kxld_kext_init(KXLDKext *kext, struct kxld_object *kext_object, + struct kxld_object *interface_object) + __attribute__((nonnull(1,2), visibility("hidden"))); void kxld_kext_clear(KXLDKext *kext) __attribute__((nonnull, visibility("hidden"))); @@ -67,76 +67,33 @@ void kxld_kext_deinit(KXLDKext *kext) * Accessors *******************************************************************************/ -boolean_t kxld_kext_is_true_kext(const KXLDKext *kext) - __attribute__((pure, nonnull, visibility("hidden"))); +kern_return_t kxld_kext_export_symbols(const KXLDKext *kext, + struct kxld_dict *defined_symbols_by_name, + struct kxld_dict *obsolete_symbols_by_name, + struct kxld_dict *defined_cxx_symbols_by_value) + __attribute__((nonnull(1), visibility("hidden"))); -boolean_t kxld_kext_is_32_bit(const KXLDKext *kext) - __attribute__((pure, nonnull, visibility("hidden"))); - -void kxld_kext_get_cputype(const KXLDKext *kext, cpu_type_t *cputype, - cpu_subtype_t *cpusubtype) - __attribute__((nonnull, visibility("hidden"))); - -kern_return_t kxld_kext_validate_cputype(const KXLDKext *kext, cpu_type_t cputype, - cpu_subtype_t cpusubtype) - __attribute__((pure, nonnull, visibility("hidden"))); - -void kxld_kext_get_vmsize(const KXLDKext *kext, u_long *header_size, - u_long *vmsize) - __attribute__((nonnull, visibility("hidden"))); - -const struct kxld_symtab * kxld_kext_get_symtab(const KXLDKext *kext) - __attribute__((pure, nonnull, visibility("hidden"))); - -u_int kxld_kext_get_num_symbols(const KXLDKext *kext) - __attribute__((pure, nonnull, visibility("hidden"))); - -void kxld_kext_get_vtables(KXLDKext *kext, const struct kxld_array **vtables) +void kxld_kext_get_vmsize(const KXLDKext *kext, + u_long *header_size, u_long *vmsize) __attribute__((nonnull, visibility("hidden"))); - -u_int kxld_kext_get_num_vtables(const KXLDKext *kext) - __attribute__((pure, nonnull, 
visibility("hidden"))); - -struct kxld_seg * kxld_kext_get_seg_by_name(const KXLDKext *kext, - const char *segname) - __attribute__((pure, nonnull, visibility("hidden"))); - -struct kxld_sect * kxld_kext_get_sect_by_name(const KXLDKext *kext, - const char *segname, const char *sectname) - __attribute__((pure, nonnull, visibility("hidden"))); - -int kxld_kext_get_sectnum_for_sect(const KXLDKext *kext, - const struct kxld_sect *sect) - __attribute__((pure, nonnull, visibility("hidden"))); - -const struct kxld_array * kxld_kext_get_section_order(const KXLDKext *kext) - __attribute__((pure, nonnull, visibility("hidden"))); - -/* This will be the same size as kxld_kext_get_vmsize */ + kern_return_t kxld_kext_export_linked_object(const KXLDKext *kext, - u_char *linked_object, kxld_addr_t *kmod_info_kern) + u_char *linked_object, kxld_addr_t *kmod_info) __attribute__((nonnull, visibility("hidden"))); -#if !KERNEL -kern_return_t kxld_kext_export_symbol_file(const KXLDKext *kext, - u_char **symbol_file, u_long *filesize) - __attribute__((nonnull, visibility("hidden"))); -#endif - -boolean_t kxld_kext_target_needs_swap(const KXLDKext *kext) - __attribute__((pure, nonnull, visibility("hidden"))); - /******************************************************************************* * Modifiers *******************************************************************************/ - -kern_return_t kxld_kext_resolve(KXLDKext *kext, struct kxld_dict *patched_vtables, - struct kxld_dict *defined_symbols) +kern_return_t kxld_kext_export_vtables(KXLDKext *kext, + const struct kxld_dict *defined_cxx_symbols, + const struct kxld_dict *defined_symbols, + struct kxld_dict *vtables) __attribute__((nonnull, visibility("hidden"))); kern_return_t kxld_kext_relocate(KXLDKext *kext, kxld_addr_t link_address, - struct kxld_dict *patched_vtables, struct kxld_dict *defined_symbols, - struct kxld_dict *obsolete_symbols) + struct kxld_dict *patched_vtables, const struct kxld_dict *defined_symbols, + const 
struct kxld_dict *obsolete_symbols, + const struct kxld_dict *defined_cxx_symbols) __attribute__((nonnull(1,3,4), visibility("hidden"))); #endif /* _KXLD_KEXT_H_ */ diff --git a/libkern/kxld/kxld_object.c b/libkern/kxld/kxld_object.c new file mode 100644 index 000000000..24b589912 --- /dev/null +++ b/libkern/kxld/kxld_object.c @@ -0,0 +1,2185 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include + +#include +#include +#include +#include + +#if KERNEL + #include + #include + #include +#else /* !KERNEL */ + #include + #include + #include + #include +#endif /* KERNEL */ + +#define DEBUG_ASSERT_COMPONENT_NAME_STRING "kxld" +#include + +#include "kxld_demangle.h" +#include "kxld_dict.h" +#include "kxld_reloc.h" +#include "kxld_sect.h" +#include "kxld_seg.h" +#include "kxld_symtab.h" +#include "kxld_util.h" +#include "kxld_uuid.h" +#include "kxld_vtable.h" + +#include "kxld_object.h" + +/******************************************************************************* +* Data structures +*******************************************************************************/ + +struct kxld_object { + u_char *file; + u_long size; + const char *name; + uint32_t filetype; + cpu_type_t cputype; + cpu_subtype_t cpusubtype; + KXLDArray segs; + KXLDArray sects; + KXLDArray extrelocs; + KXLDArray locrelocs; + KXLDRelocator relocator; + KXLDuuid uuid; + KXLDSymtab *symtab; + struct dysymtab_command *dysymtab_hdr; + kxld_addr_t link_addr; + boolean_t is_kernel; + boolean_t is_final_image; + boolean_t is_linked; + boolean_t got_is_created; +#if KXLD_USER_OR_OBJECT + KXLDArray *section_order; +#endif +#if !KERNEL + enum NXByteOrder host_order; + enum NXByteOrder target_order; +#endif +}; + +/******************************************************************************* +* Prototypes +*******************************************************************************/ + +static kern_return_t get_target_machine_info(KXLDObject *object, + cpu_type_t cputype, cpu_subtype_t cpusubtype); +static kern_return_t get_macho_slice_for_arch(KXLDObject *object, + u_char *file, u_long size); + +static u_long get_macho_header_size(const KXLDObject *object); +static u_long get_macho_data_size(const KXLDObject *object) __unused; + +static kern_return_t init_from_execute(KXLDObject *object); +static kern_return_t 
init_from_final_linked_image(KXLDObject *object, + u_int *filetype_out, struct symtab_command **symtab_hdr_out); + +static boolean_t target_supports_protected_segments(const KXLDObject *object) + __attribute__((pure)); +static void set_is_object_linked(KXLDObject *object); + +#if KXLD_USER_OR_BUNDLE +static boolean_t target_supports_bundle(const KXLDObject *object) + __attribute((pure)); +static kern_return_t init_from_bundle(KXLDObject *object); +static kern_return_t process_relocs_from_tables(KXLDObject *object); +static KXLDSeg *get_seg_by_base_addr(KXLDObject *object, + kxld_addr_t base_addr); +static kern_return_t process_symbol_pointers(KXLDObject *object); +static void add_to_ptr(u_char *symptr, kxld_addr_t val, boolean_t is_32_bit); +#endif /* KXLD_USER_OR_BUNDLE */ + +#if KXLD_USER_OR_OBJECT +static boolean_t target_supports_object(const KXLDObject *object) + __attribute((pure)); +static kern_return_t init_from_object(KXLDObject *object); +static kern_return_t process_relocs_from_sections(KXLDObject *object); +#endif /* KXLD_USER_OR_OBJECT */ + +static kern_return_t export_macho_header(const KXLDObject *object, u_char *buf, + u_int ncmds, u_long *header_offset, u_long header_size); +#if KXLD_USER_OR_ILP32 +static u_long get_macho_cmd_data_32(u_char *file, u_long offset, + u_int *filetype, u_int *ncmds); +static kern_return_t export_macho_header_32(const KXLDObject *object, + u_char *buf, u_int ncmds, u_long *header_offset, u_long header_size); +#endif /* KXLD_USER_OR_ILP32 */ +#if KXLD_USER_OR_LP64 +static u_long get_macho_cmd_data_64(u_char *file, u_long offset, + u_int *filetype, u_int *ncmds); +static kern_return_t export_macho_header_64(const KXLDObject *object, + u_char *buf, u_int ncmds, u_long *header_offset, u_long header_size); +#endif /* KXLD_USER_OR_LP64 */ + +#if KXLD_USER_OR_GOT || KXLD_USER_OR_COMMON +static kern_return_t add_section(KXLDObject *object, KXLDSect **sect); +#endif /* KXLD_USER_OR_GOT || KXLD_USER_OR_COMMON */ + +#if 
KXLD_USER_OR_COMMON +static kern_return_t resolve_common_symbols(KXLDObject *object); +#endif /* KXLD_USER_OR_COMMON */ + +#if KXLD_USER_OR_GOT +static boolean_t target_has_got(const KXLDObject *object) __attribute__((pure)); +static kern_return_t create_got(KXLDObject *object); +static kern_return_t populate_got(KXLDObject *object); +#endif /* KXLD_USER_OR_GOT */ + +static KXLDSym *get_mutable_sym(const KXLDObject *object, const KXLDSym *sym); + +static kern_return_t populate_kmod_info(KXLDObject *object); + +/******************************************************************************* +* Prototypes that may need to be exported +*******************************************************************************/ +static boolean_t kxld_object_target_needs_swap(const KXLDObject *object __unused); +static KXLDSeg * kxld_object_get_seg_by_name(const KXLDObject *object, const char *segname); +static KXLDSect * kxld_object_get_sect_by_name(const KXLDObject *object, const char *segname, + const char *sectname); + +/******************************************************************************* +*******************************************************************************/ +size_t +kxld_object_sizeof(void) +{ + return sizeof(KXLDObject); +} + +/******************************************************************************* +*******************************************************************************/ +kern_return_t +kxld_object_init_from_macho(KXLDObject *object, u_char *file, u_long size, + const char *name, KXLDArray *section_order __unused, + cpu_type_t cputype, cpu_subtype_t cpusubtype) +{ + kern_return_t rval = KERN_FAILURE; + KXLDSeg * seg = NULL; + u_int i = 0; + + check(object); + check(file); + check(name); + + object->name = name; + +#if KXLD_USER_OR_OBJECT + object->section_order = section_order; +#endif + /* Find the local architecture */ + + rval = get_target_machine_info(object, cputype, cpusubtype); + require_noerr(rval, finish); + + /* Find the Mach-O 
slice for the target architecture */ + + rval = get_macho_slice_for_arch(object, file, size); + require_noerr(rval, finish); + + /* Allocate the symbol table */ + + if (!object->symtab) { + object->symtab = kxld_alloc(kxld_symtab_sizeof()); + require_action(object->symtab, finish, rval=KERN_RESOURCE_SHORTAGE); + bzero(object->symtab, kxld_symtab_sizeof()); + } + + /* Build the relocator */ + + rval = kxld_relocator_init(&object->relocator, object->file, + object->symtab, &object->sects, object->cputype, + object->cpusubtype, kxld_object_target_needs_swap(object)); + require_noerr(rval, finish); + + /* There are four types of Mach-O files that we can support: + * 1) 32-bit MH_OBJECT - Snow Leopard and earlier + * 2) 32-bit MH_KEXT_BUNDLE - Lion and Later + * 3) 64-bit MH_OBJECT - Unsupported + * 4) 64-bit MH_KEXT_BUNDLE - Snow Leopard and Later + */ + + if (kxld_object_is_32_bit(object)) { + struct mach_header *mach_hdr = (struct mach_header *) object->file; + object->filetype = mach_hdr->filetype; + } else { + struct mach_header_64 *mach_hdr = (struct mach_header_64 *) object->file; + object->filetype = mach_hdr->filetype; + } + + switch (object->filetype) { +#if KXLD_USER_OR_BUNDLE + case MH_KEXT_BUNDLE: + rval = init_from_bundle(object); + require_noerr(rval, finish); + break; +#endif /* KXLD_USER_OR_BUNDLE */ +#if KXLD_USER_OR_OBJECT + case MH_OBJECT: + rval = init_from_object(object); + require_noerr(rval, finish); + break; +#endif /* KXLD_USER_OR_OBJECT */ + case MH_EXECUTE: + object->is_kernel = TRUE; + rval = init_from_execute(object); + require_noerr(rval, finish); + break; + default: + rval = KERN_FAILURE; + kxld_log(kKxldLogLinking, kKxldLogErr, + kKxldLogFiletypeNotSupported, object->filetype); + goto finish; + } + + if (!kxld_object_is_kernel(object)) { + for (i = 0; i < object->segs.nitems; ++i) { + seg = kxld_array_get_item(&object->segs, i); + kxld_seg_set_vm_protections(seg, + target_supports_protected_segments(object)); + } + + seg = 
kxld_object_get_seg_by_name(object, SEG_LINKEDIT); + if (seg) { + (void) kxld_seg_populate_linkedit(seg, object->symtab, + kxld_object_is_32_bit(object)); + } + } + + (void) set_is_object_linked(object); + + rval = KERN_SUCCESS; +finish: + return rval; +} + +/******************************************************************************* +*******************************************************************************/ +kern_return_t +get_target_machine_info(KXLDObject *object, cpu_type_t cputype __unused, + cpu_subtype_t cpusubtype __unused) +{ +#if KERNEL + + /* Because the kernel can only link for its own architecture, we know what + * the host and target architectures are at compile time, so we can use + * a vastly simplified version of this function. + */ + + check(object); + +#if defined(__i386__) + object->cputype = CPU_TYPE_I386; + object->cpusubtype = CPU_SUBTYPE_I386_ALL; + return KERN_SUCCESS; +#elif defined(__x86_64__) + object->cputype = CPU_TYPE_X86_64; + object->cpusubtype = CPU_SUBTYPE_X86_64_ALL; + return KERN_SUCCESS; +#else + kxld_log(kKxldLogLinking, kKxldLogErr, + kKxldLogArchNotSupported, _mh_execute_header->cputype); + return KERN_NOT_SUPPORTED; +#endif /* Supported architecture defines */ + + +#else /* !KERNEL */ + + /* User-space must look up the architecture it's running on and the target + * architecture at run-time. + */ + + kern_return_t rval = KERN_FAILURE; + const NXArchInfo *host_arch = NULL; + + check(object); + + host_arch = NXGetLocalArchInfo(); + require_action(host_arch, finish, rval=KERN_FAILURE); + + object->host_order = host_arch->byteorder; + + /* If the user did not specify a cputype, use the local architecture. 
+ */ + + if (cputype) { + object->cputype = cputype; + object->cpusubtype = cpusubtype; + } else { + object->cputype = host_arch->cputype; + object->target_order = object->host_order; + + switch (object->cputype) { + case CPU_TYPE_I386: + object->cpusubtype = CPU_SUBTYPE_I386_ALL; + break; + case CPU_TYPE_POWERPC: + object->cpusubtype = CPU_SUBTYPE_POWERPC_ALL; + break; + case CPU_TYPE_X86_64: + object->cpusubtype = CPU_SUBTYPE_X86_64_ALL; + break; + case CPU_TYPE_ARM: + object->cpusubtype = CPU_SUBTYPE_ARM_ALL; + break; + default: + object->cpusubtype = 0; + } + } + + /* Validate that we support the target architecture and record its + * endianness. + */ + + switch(object->cputype) { + case CPU_TYPE_ARM: + case CPU_TYPE_I386: + case CPU_TYPE_X86_64: + object->target_order = NX_LittleEndian; + break; + case CPU_TYPE_POWERPC: + object->target_order = NX_BigEndian; + break; + default: + rval = KERN_NOT_SUPPORTED; + kxld_log(kKxldLogLinking, kKxldLogErr, + kKxldLogArchNotSupported, object->cputype); + goto finish; + } + + rval = KERN_SUCCESS; + +finish: + return rval; +#endif /* KERNEL */ +} + +/******************************************************************************* +*******************************************************************************/ +static kern_return_t +get_macho_slice_for_arch(KXLDObject *object, u_char *file, u_long size) +{ + kern_return_t rval = KERN_FAILURE; + struct mach_header *mach_hdr = NULL; +#if !KERNEL + struct fat_header *fat = (struct fat_header *) file; + struct fat_arch *archs = (struct fat_arch *) &fat[1]; + boolean_t swap = FALSE; +#endif /* KERNEL */ + + check(object); + check(file); + check(size); + + object->file = file; + object->size = size; + + /* We are assuming that we will never receive a fat file in the kernel */ + +#if !KERNEL + require_action(size >= sizeof(*fat), finish, + rval=KERN_FAILURE; + kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogTruncatedMachO)); + + /* The fat header is always big endian, so swap if 
necessary */ + if (fat->magic == FAT_CIGAM) { + (void) swap_fat_header(fat, object->host_order); + swap = TRUE; + } + + if (fat->magic == FAT_MAGIC) { + struct fat_arch *arch = NULL; + + require_action(size >= (sizeof(*fat) + (fat->nfat_arch * sizeof(*archs))), + finish, + rval=KERN_FAILURE; + kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogTruncatedMachO)); + + /* Swap the fat_arch structures if necessary */ + if (swap) { + (void) swap_fat_arch(archs, fat->nfat_arch, object->host_order); + } + + /* Locate the Mach-O for the requested architecture */ + + arch = NXFindBestFatArch(object->cputype, object->cpusubtype, archs, + fat->nfat_arch); + require_action(arch, finish, rval=KERN_FAILURE; + kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogArchNotFound)); + require_action(size >= arch->offset + arch->size, finish, + rval=KERN_FAILURE; + kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogTruncatedMachO)); + + object->file = file + arch->offset; + object->size = arch->size; + } +#endif /* !KERNEL */ + + /* Swap the Mach-O's headers to this architecture if necessary */ + if (kxld_object_is_32_bit(object)) { + rval = validate_and_swap_macho_32(object->file, object->size +#if !KERNEL + , object->host_order +#endif /* !KERNEL */ + ); + } else { + rval = validate_and_swap_macho_64(object->file, object->size +#if !KERNEL + , object->host_order +#endif /* !KERNEL */ + ); + } + require_noerr(rval, finish); + + mach_hdr = (struct mach_header *) object->file; + require_action(object->cputype == mach_hdr->cputype, finish, + rval=KERN_FAILURE; + kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogTruncatedMachO)); + + rval = KERN_SUCCESS; +finish: + return rval; +} + +/******************************************************************************* +*******************************************************************************/ +static kern_return_t +init_from_final_linked_image(KXLDObject *object, u_int *filetype_out, + struct symtab_command **symtab_hdr_out) +{ + kern_return_t rval = 
KERN_FAILURE; + KXLDSeg *seg = NULL; + KXLDSect *sect = NULL; + struct load_command *cmd_hdr = NULL; + struct symtab_command *symtab_hdr = NULL; + struct uuid_command *uuid_hdr = NULL; + u_long base_offset = 0; + u_long offset = 0; + u_long sect_offset = 0; + u_int filetype = 0; + u_int i = 0; + u_int j = 0; + u_int segi = 0; + u_int secti = 0; + u_int nsegs = 0; + u_int nsects = 0; + u_int ncmds = 0; + + KXLD_3264_FUNC(kxld_object_is_32_bit(object), base_offset, + get_macho_cmd_data_32, get_macho_cmd_data_64, + object->file, offset, &filetype, &ncmds); + + /* First pass to count segments and sections */ + + offset = base_offset; + for (i = 0; i < ncmds; ++i, offset += cmd_hdr->cmdsize) { + cmd_hdr = (struct load_command *) (object->file + offset); + + switch(cmd_hdr->cmd) { +#if KXLD_USER_OR_ILP32 + case LC_SEGMENT: + { + struct segment_command *seg_hdr = + (struct segment_command *) cmd_hdr; + + /* Ignore segments with no vm size */ + if (!seg_hdr->vmsize) continue; + + ++nsegs; + nsects += seg_hdr->nsects; + } + break; +#endif /* KXLD_USER_OR_ILP32 */ +#if KXLD_USER_OR_LP64 + case LC_SEGMENT_64: + { + struct segment_command_64 *seg_hdr = + (struct segment_command_64 *) cmd_hdr; + + /* Ignore segments with no vm size */ + if (!seg_hdr->vmsize) continue; + + ++nsegs; + nsects += seg_hdr->nsects; + } + break; +#endif /* KXLD_USER_OR_LP64 */ + default: + continue; + } + } + + /* Allocate the segments and sections */ + + if (nsegs) { + rval = kxld_array_init(&object->segs, sizeof(KXLDSeg), nsegs); + require_noerr(rval, finish); + + rval = kxld_array_init(&object->sects, sizeof(KXLDSect), nsects); + require_noerr(rval, finish); + } + + /* Initialize the segments and sections */ + + offset = base_offset; + for (i = 0; i < ncmds; ++i, offset += cmd_hdr->cmdsize) { + cmd_hdr = (struct load_command *) (object->file + offset); + seg = NULL; + + switch(cmd_hdr->cmd) { +#if KXLD_USER_OR_ILP32 + case LC_SEGMENT: + { + struct segment_command *seg_hdr = + (struct 
segment_command *) cmd_hdr; + + /* Ignore segments with no vm size */ + if (!seg_hdr->vmsize) continue; + + seg = kxld_array_get_item(&object->segs, segi++); + + rval = kxld_seg_init_from_macho_32(seg, seg_hdr); + require_noerr(rval, finish); + + sect_offset = offset + sizeof(*seg_hdr); + } + break; +#endif /* KXLD_USER_OR_ILP32 */ +#if KXLD_USER_OR_LP64 + case LC_SEGMENT_64: + { + struct segment_command_64 *seg_hdr = + (struct segment_command_64 *) cmd_hdr; + + /* Ignore segments with no vm size */ + if (!seg_hdr->vmsize) continue; + + seg = kxld_array_get_item(&object->segs, segi++); + + rval = kxld_seg_init_from_macho_64(seg, seg_hdr); + require_noerr(rval, finish); + + sect_offset = offset + sizeof(*seg_hdr); + } + break; +#endif /* KXLD_USER_OR_LP64 */ + case LC_SYMTAB: + symtab_hdr = (struct symtab_command *) cmd_hdr; + break; + case LC_UUID: + uuid_hdr = (struct uuid_command *) cmd_hdr; + kxld_uuid_init_from_macho(&object->uuid, uuid_hdr); + break; + case LC_DYSYMTAB: + object->dysymtab_hdr = (struct dysymtab_command *) cmd_hdr; + + rval = kxld_reloc_create_macho(&object->extrelocs, &object->relocator, + (struct relocation_info *) (object->file + object->dysymtab_hdr->extreloff), + object->dysymtab_hdr->nextrel); + require_noerr(rval, finish); + + rval = kxld_reloc_create_macho(&object->locrelocs, &object->relocator, + (struct relocation_info *) (object->file + object->dysymtab_hdr->locreloff), + object->dysymtab_hdr->nlocrel); + require_noerr(rval, finish); + + break; + case LC_UNIXTHREAD: + /* Don't need to do anything with UNIXTHREAD for the kernel */ + require_action(kxld_object_is_kernel(object), + finish, rval=KERN_FAILURE; + kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO + "LC_UNIXTHREAD segment is not valid in a kext.")); + break; + default: + rval=KERN_FAILURE; + kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO + "Invalid segment type in MH_KEXT_BUNDLE kext: %u.", cmd_hdr->cmd); + goto finish; + } + + if (seg) { + + /* 
Initialize the sections */ + for (j = 0; j < seg->sects.nitems; ++j, ++secti) { + sect = kxld_array_get_item(&object->sects, secti); + KXLD_3264_FUNC(kxld_object_is_32_bit(object), rval, + kxld_sect_init_from_macho_32, kxld_sect_init_from_macho_64, + sect, object->file, &sect_offset, secti, &object->relocator); + require_noerr(rval, finish); + + /* Add the section to the segment. This will also make sure + * that the sections and segments have the same segname. + */ + rval = kxld_seg_add_section(seg, sect); + require_noerr(rval, finish); + } + rval = kxld_seg_finish_init(seg); + require_noerr(rval, finish); + } + } + + if (filetype_out) *filetype_out = filetype; + if (symtab_hdr_out) *symtab_hdr_out = symtab_hdr; + object->is_final_image = TRUE; + rval = KERN_SUCCESS; +finish: + return rval; +} + +/******************************************************************************* +*******************************************************************************/ +static kern_return_t +init_from_execute(KXLDObject *object) +{ + kern_return_t rval = KERN_FAILURE; + struct symtab_command *symtab_hdr = NULL; + u_int filetype = 0; + KXLDSeg * kernel_linkedit_seg = NULL; // used if running kernel +#if KXLD_USER_OR_OBJECT + KXLDSeg *seg = NULL; + KXLDSect *sect = NULL; + KXLDSectionName *sname = NULL; + u_int i = 0, j = 0, k = 0; +#endif /* KXLD_USER_OR_OBJECT */ + + check(object); + + require_action(kxld_object_is_kernel(object), finish, rval=KERN_FAILURE); + + rval = init_from_final_linked_image(object, &filetype, &symtab_hdr); + require_noerr(rval, finish); + + require_action(filetype == MH_EXECUTE, finish, rval=KERN_FAILURE; + kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO + "The kernel file is not of type MH_EXECUTE.")); + + /* Initialize the symbol table. If this is the running kernel + * we will work from the in-memory linkedit segment; + * otherwise we work from the whole mach-o image. 
+ */ +#if KERNEL + kernel_linkedit_seg = kxld_object_get_seg_by_name(object, SEG_LINKEDIT); + require_action(kernel_linkedit_seg, finish, rval=KERN_FAILURE; + kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO)); +#endif + + KXLD_3264_FUNC(kxld_object_is_32_bit(object), rval, + kxld_symtab_init_from_macho_32, kxld_symtab_init_from_macho_64, + object->symtab, symtab_hdr, object->file, kernel_linkedit_seg); + require_noerr(rval, finish); + +#if KXLD_USER_OR_OBJECT + /* Save off the order of section names so that we can lay out kext + * sections for MH_OBJECT-based systems. + */ + if (target_supports_object(object)) { + + rval = kxld_array_init(object->section_order, sizeof(KXLDSectionName), + object->sects.nitems); + require_noerr(rval, finish); + + /* Copy the section names into the section_order array for future kext + * section ordering. + */ + for (i = 0, k = 0; i < object->segs.nitems; ++i) { + seg = kxld_array_get_item(&object->segs, i); + + for (j = 0; j < seg->sects.nitems; ++j, ++k) { + sect = *(KXLDSect **) kxld_array_get_item(&seg->sects, j); + sname = kxld_array_get_item(object->section_order, k); + + strlcpy(sname->segname, sect->segname, sizeof(sname->segname)); + strlcpy(sname->sectname, sect->sectname, sizeof(sname->sectname)); + } + } + } +#endif /* KXLD_USER_OR_OBJECT */ + + rval = KERN_SUCCESS; +finish: + return rval; +} + +#if KXLD_USER_OR_BUNDLE +/******************************************************************************* +*******************************************************************************/ +static boolean_t +target_supports_bundle(const KXLDObject *object) +{ + return (object->cputype == CPU_TYPE_I386 || + object->cputype == CPU_TYPE_X86_64 || + object->cputype == CPU_TYPE_ARM); +} + +/******************************************************************************* +*******************************************************************************/ +static kern_return_t +init_from_bundle(KXLDObject *object) +{ + 
kern_return_t rval = KERN_FAILURE; + struct symtab_command *symtab_hdr = NULL; + u_int filetype = 0; + + check(object); + + require_action(target_supports_bundle(object), finish, + rval=KERN_FAILURE; + kxld_log(kKxldLogLinking, kKxldLogErr, + kKxldLogFiletypeNotSupported, MH_KEXT_BUNDLE)); + + rval = init_from_final_linked_image(object, &filetype, &symtab_hdr); + require_noerr(rval, finish); + + require_action(filetype == MH_KEXT_BUNDLE, finish, + rval=KERN_FAILURE); + + KXLD_3264_FUNC(kxld_object_is_32_bit(object), rval, + kxld_symtab_init_from_macho_32, kxld_symtab_init_from_macho_64, + object->symtab, symtab_hdr, object->file, + /* kernel_linkedit_seg */ NULL); + require_noerr(rval, finish); + + rval = KERN_SUCCESS; +finish: + return rval; +} +#endif /* KXLD_USER_OR_BUNDLE */ + +#if KXLD_USER_OR_OBJECT +/******************************************************************************* +*******************************************************************************/ +static boolean_t target_supports_object(const KXLDObject *object) +{ + return (object->cputype == CPU_TYPE_POWERPC || + object->cputype == CPU_TYPE_I386 || + object->cputype == CPU_TYPE_ARM); +} + +/******************************************************************************* +*******************************************************************************/ +static kern_return_t +init_from_object(KXLDObject *object) +{ + kern_return_t rval = KERN_FAILURE; + struct load_command *cmd_hdr = NULL; + struct symtab_command *symtab_hdr = NULL; + struct uuid_command *uuid_hdr = NULL; + KXLDSect *sect = NULL; + u_long offset = 0; + u_long sect_offset = 0; + u_int filetype = 0; + u_int ncmds = 0; + u_int nsects = 0; + u_int i = 0; + boolean_t has_segment = FALSE; + + check(object); + + require_action(target_supports_object(object), + finish, rval=KERN_FAILURE; + kxld_log(kKxldLogLinking, kKxldLogErr, + kKxldLogFiletypeNotSupported, MH_OBJECT)); + + KXLD_3264_FUNC(kxld_object_is_32_bit(object), offset, + 
get_macho_cmd_data_32, get_macho_cmd_data_64, + object->file, offset, &filetype, &ncmds); + + require_action(filetype == MH_OBJECT, finish, rval=KERN_FAILURE); + + /* MH_OBJECTs use one unnamed segment to contain all of the sections. We + * loop over all of the load commands to initialize the structures we + * expect. Then, we'll use the unnamed segment to get to all of the + * sections, and then use those sections to create the actual segments. + */ + + for (; i < ncmds; ++i, offset += cmd_hdr->cmdsize) { + cmd_hdr = (struct load_command *) (object->file + offset); + + switch(cmd_hdr->cmd) { +#if KXLD_USER_OR_ILP32 + case LC_SEGMENT: + { + struct segment_command *seg_hdr = + (struct segment_command *) cmd_hdr; + + /* Ignore segments with no vm size */ + if (!seg_hdr->vmsize) continue; + + /* Ignore LINKEDIT segments */ + if (streq_safe(seg_hdr->segname, SEG_LINKEDIT, + const_strlen(SEG_LINKEDIT))) + { + continue; + } + + require_action(kxld_object_is_32_bit(object), finish, rval=KERN_FAILURE; + kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO + "LC_SEGMENT in 64-bit kext.")); + require_action(!has_segment, finish, rval=KERN_FAILURE; + kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO + "Multiple segments in an MH_OBJECT kext.")); + + nsects = seg_hdr->nsects; + sect_offset = offset + sizeof(*seg_hdr); + has_segment = TRUE; + } + break; +#endif /* KXLD_USER_OR_ILP32 */ +#if KXLD_USER_OR_LP64 + case LC_SEGMENT_64: + { + struct segment_command_64 *seg_hdr = + (struct segment_command_64 *) cmd_hdr; + + /* Ignore segments with no vm size */ + if (!seg_hdr->vmsize) continue; + + /* Ignore LINKEDIT segments */ + if (streq_safe(seg_hdr->segname, SEG_LINKEDIT, + const_strlen(SEG_LINKEDIT))) + { + continue; + } + + require_action(!kxld_object_is_32_bit(object), finish, rval=KERN_FAILURE; + kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO + "LC_SEGMENT_64 in a 32-bit kext.")); + require_action(!has_segment, finish, rval=KERN_FAILURE; + 
kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO + "Multiple segments in an MH_OBJECT kext.")); + + nsects = seg_hdr->nsects; + sect_offset = offset + sizeof(*seg_hdr); + has_segment = TRUE; + } + break; +#endif /* KXLD_USER_OR_LP64 */ + case LC_SYMTAB: + symtab_hdr = (struct symtab_command *) cmd_hdr; + + KXLD_3264_FUNC(kxld_object_is_32_bit(object), rval, + kxld_symtab_init_from_macho_32, kxld_symtab_init_from_macho_64, + object->symtab, symtab_hdr, object->file, + /* kernel_linkedit_seg */ NULL); + require_noerr(rval, finish); + break; + case LC_UUID: + uuid_hdr = (struct uuid_command *) cmd_hdr; + kxld_uuid_init_from_macho(&object->uuid, uuid_hdr); + break; + case LC_UNIXTHREAD: + /* Don't need to do anything with UNIXTHREAD */ + break; + default: + rval = KERN_FAILURE; + kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO + "Invalid segment type in MH_OBJECT kext: %u.", cmd_hdr->cmd); + goto finish; + } + } + + if (has_segment) { + + /* Get the number of sections from the segment and build the section index */ + + rval = kxld_array_init(&object->sects, sizeof(KXLDSect), nsects); + require_noerr(rval, finish); + + /* Loop over all of the sections to initialize the section index */ + + for (i = 0; i < nsects; ++i) { + sect = kxld_array_get_item(&object->sects, i); + KXLD_3264_FUNC(kxld_object_is_32_bit(object), rval, + kxld_sect_init_from_macho_32, kxld_sect_init_from_macho_64, + sect, object->file, &sect_offset, i, &object->relocator); + require_noerr(rval, finish); + } + + /* Create special sections */ + +#if KXLD_USER_OR_GOT + rval = create_got(object); + require_noerr(rval, finish); +#endif /* KXLD_USER_OR_GOT */ + +#if KXLD_USER_OR_COMMON + rval = resolve_common_symbols(object); + require_noerr(rval, finish); +#endif /* KXLD_USER_OR_COMMON */ + + /* Create the segments from the section index */ + + rval = kxld_seg_create_seg_from_sections(&object->segs, &object->sects); + require_noerr(rval, finish); + + rval = 
kxld_seg_finalize_object_segment(&object->segs, + object->section_order, get_macho_header_size(object)); + require_noerr(rval, finish); + + rval = kxld_seg_init_linkedit(&object->segs); + require_noerr(rval, finish); + } + + rval = KERN_SUCCESS; +finish: + return rval; +} +#endif /* KXLD_USER_OR_OBJECT */ + +#if KXLD_USER_OR_ILP32 +/******************************************************************************* +*******************************************************************************/ +static u_long +get_macho_cmd_data_32(u_char *file, u_long offset, u_int *filetype, u_int *ncmds) +{ + struct mach_header *mach_hdr = (struct mach_header *) (file + offset); + + if (filetype) *filetype = mach_hdr->filetype; + if (ncmds) *ncmds = mach_hdr->ncmds; + + return sizeof(*mach_hdr); +} + +#endif /* KXLD_USER_OR_ILP32 */ + +#if KXLD_USER_OR_LP64 +/******************************************************************************* +*******************************************************************************/ +static u_long +get_macho_cmd_data_64(u_char *file, u_long offset, u_int *filetype, u_int *ncmds) +{ + struct mach_header_64 *mach_hdr = (struct mach_header_64 *) (file + offset); + + if (filetype) *filetype = mach_hdr->filetype; + if (ncmds) *ncmds = mach_hdr->ncmds; + + return sizeof(*mach_hdr); +} +#endif /* KXLD_USER_OR_LP64 */ + +/******************************************************************************* +*******************************************************************************/ +static u_long +get_macho_header_size(const KXLDObject *object) +{ + KXLDSeg *seg = NULL; + u_long header_size = 0; + u_int i = 0; + + check(object); + + /* Mach, segment, symtab, and UUID headers */ + + if (kxld_object_is_32_bit(object)) { + header_size += sizeof(struct mach_header); + } else { + header_size += sizeof(struct mach_header_64); + } + + for (i = 0; i < object->segs.nitems; ++i) { + seg = kxld_array_get_item(&object->segs, i); + header_size += 
kxld_seg_get_macho_header_size(seg, kxld_object_is_32_bit(object)); + } + + header_size += kxld_symtab_get_macho_header_size(); + + if (object->uuid.has_uuid) { + header_size += kxld_uuid_get_macho_header_size(); + } + + return header_size; +} + +/******************************************************************************* +*******************************************************************************/ +static u_long +get_macho_data_size(const KXLDObject *object) +{ + KXLDSeg *seg = NULL; + u_long data_size = 0; + u_int i = 0; + + check(object); + + for (i = 0; i < object->segs.nitems; ++i) { + seg = kxld_array_get_item(&object->segs, i); + data_size += (u_long) kxld_seg_get_vmsize(seg); + } + + return data_size; +} + +/******************************************************************************* +*******************************************************************************/ +boolean_t +kxld_object_target_needs_swap(const KXLDObject *object __unused) +{ +#if KERNEL + return FALSE; +#else + return (object->target_order != object->host_order); +#endif /* KERNEL */ +} + +/******************************************************************************* +*******************************************************************************/ +KXLDSeg * +kxld_object_get_seg_by_name(const KXLDObject *object, const char *segname) +{ + KXLDSeg *seg = NULL; + u_int i = 0; + + for (i = 0; i < object->segs.nitems; ++i) { + seg = kxld_array_get_item(&object->segs, i); + + if (streq_safe(segname, seg->segname, sizeof(seg->segname))) break; + + seg = NULL; + } + + return seg; +} + +/******************************************************************************* +*******************************************************************************/ +const KXLDRelocator * +kxld_object_get_relocator(const KXLDObject * object) +{ + check(object); + + return &object->relocator; +} + +/******************************************************************************* 
+*******************************************************************************/
+/* Walks object->sects in order and returns the first section whose segname
+ * and sectname both compare equal to the requested names, or NULL when no
+ * section matches.
+ */
+KXLDSect *
+kxld_object_get_sect_by_name(const KXLDObject *object, const char *segname,
+    const char *sectname)
+{
+    KXLDSect *sect = NULL;
+    u_int i = 0;
+
+    for (i = 0; i < object->sects.nitems; ++i) {
+        sect = kxld_array_get_item(&object->sects, i);
+
+        if (streq_safe(segname, sect->segname, sizeof(sect->segname)) &&
+            streq_safe(sectname, sect->sectname, sizeof(sect->sectname)))
+        {
+            break;
+        }
+
+        /* Not a match: reset so that NULL is returned if the loop runs out */
+        sect = NULL;
+    }
+
+    return sect;
+}
+
+/*******************************************************************************
+* Returns the relocation entry located at the given symbol, or NULL if the
+* symbol's section index is out of range or no entry is found.
+*
+* For a final linked image the lookup key is sym->base_addr, tried against the
+* external relocations first and the local relocations second.  Otherwise the
+* key is the symbol's offset within its section and the section's own
+* relocation array is searched.
+*******************************************************************************/
+const KXLDReloc *
+kxld_object_get_reloc_at_symbol(const KXLDObject *object, const KXLDSym *sym)
+{
+    const KXLDReloc *reloc = NULL;
+    const KXLDSect *sect = NULL;
+    uint32_t offset = 0;
+
+    check(object);
+    check(sym);
+
+    sect = kxld_object_get_section_by_index(object, sym->sectnum);
+    require(sect, finish);
+
+    if (kxld_object_is_final_image(object)) {
+        reloc = kxld_reloc_get_reloc_by_offset(&object->extrelocs,
+            sym->base_addr);
+        if (!reloc) {
+            reloc = kxld_reloc_get_reloc_by_offset(&object->locrelocs,
+                sym->base_addr);
+        }
+    } else {
+        offset = kxld_sym_get_section_offset(sym, sect);
+        /* Pass the address of the section's relocation array */
+        reloc = kxld_reloc_get_reloc_by_offset(&sect->relocs, offset);
+    }
+
+finish:
+    return reloc;
+}
+
+/*******************************************************************************
+* Returns the symbol referenced by a relocation entry.  The data buffer handed
+* to kxld_reloc_get_symbol() is the whole file for a final linked image and
+* the section's data otherwise.
+*******************************************************************************/
+const KXLDSym *
+kxld_object_get_symbol_of_reloc(const KXLDObject *object,
+    const KXLDReloc *reloc, const KXLDSect *sect)
+{
+    const KXLDSym *sym = NULL;
+
+    if (kxld_object_is_final_image(object)) {
+        sym = kxld_reloc_get_symbol(&object->relocator, reloc, object->file);
+    } else {
+        sym = kxld_reloc_get_symbol(&object->relocator, reloc, sect->data);
+    }
+
+    return sym;
+}
+
+/******************************************************************************* +*******************************************************************************/ +const KXLDSect * +kxld_object_get_section_by_index(const KXLDObject *object, u_int sectnum) +{ + KXLDSect *sect = NULL; + + check(object); + + if (sectnum < object->sects.nitems) { + sect = kxld_array_get_item(&object->sects, sectnum); + } + + return sect; +} + +/******************************************************************************* +*******************************************************************************/ +const KXLDArray * +kxld_object_get_extrelocs(const KXLDObject *object) +{ + const KXLDArray *rval = NULL; + + check(object); + + if (kxld_object_is_final_image(object)) { + rval = &object->extrelocs; + } + + return rval; +} + +/******************************************************************************* +*******************************************************************************/ +const KXLDSymtab * +kxld_object_get_symtab(const KXLDObject *object) +{ + check(object); + + return object->symtab; +} + +#if KXLD_USER_OR_GOT || KXLD_USER_OR_COMMON +/******************************************************************************* +*******************************************************************************/ +static kern_return_t +add_section(KXLDObject *object, KXLDSect **sect) +{ + kern_return_t rval = KERN_FAILURE; + u_int nsects = object->sects.nitems; + + rval = kxld_array_resize(&object->sects, nsects + 1); + require_noerr(rval, finish); + + *sect = kxld_array_get_item(&object->sects, nsects); + + rval = KERN_SUCCESS; + +finish: + return rval; +} +#endif /* KXLD_USER_OR_GOT || KXLD_USER_OR_COMMON */ + +#if KXLD_USER_OR_COMMON +/******************************************************************************* +* If there are common symbols, calculate how much space they'll need +* and create/grow the __DATA __common section to accommodate them. 
+* Then, resolve them against that section. +*******************************************************************************/ +static kern_return_t +resolve_common_symbols(KXLDObject *object) +{ + kern_return_t rval = KERN_FAILURE; + KXLDSymtabIterator iter; + KXLDSym *sym = NULL; + KXLDSect *sect = NULL; + kxld_addr_t base_addr = 0; + kxld_size_t size = 0; + kxld_size_t total_size = 0; + u_int align = 0; + u_int max_align = 0; + u_int sectnum = 0; + + if (!kxld_object_target_supports_common_symbols(object)) { + rval = KERN_SUCCESS; + goto finish; + } + + /* Iterate over the common symbols to calculate their total aligned size */ + kxld_symtab_iterator_init(&iter, object->symtab, kxld_sym_is_common, FALSE); + while ((sym = kxld_symtab_iterator_get_next(&iter))) { + align = kxld_sym_get_common_align(sym); + size = kxld_sym_get_common_size(sym); + + if (align > max_align) max_align = align; + + total_size = kxld_align_address(total_size, align) + size; + } + + /* If there are common symbols, grow or create the __DATA __common section + * to hold them. 
+ */ + if (total_size) { + sect = kxld_object_get_sect_by_name(object, SEG_DATA, SECT_COMMON); + if (sect) { + base_addr = sect->base_addr + sect->size; + + kxld_sect_grow(sect, total_size, max_align); + } else { + base_addr = 0; + + rval = add_section(object, §); + require_noerr(rval, finish); + + kxld_sect_init_zerofill(sect, SEG_DATA, SECT_COMMON, + total_size, max_align); + } + + /* Resolve the common symbols against the new section */ + rval = kxld_array_get_index(&object->sects, sect, §num); + require_noerr(rval, finish); + + kxld_symtab_iterator_reset(&iter); + while ((sym = kxld_symtab_iterator_get_next(&iter))) { + align = kxld_sym_get_common_align(sym); + size = kxld_sym_get_common_size(sym); + + base_addr = kxld_align_address(base_addr, align); + kxld_sym_resolve_common(sym, sectnum, base_addr); + + base_addr += size; + } + } + + rval = KERN_SUCCESS; + +finish: + return rval; +} +#endif /* KXLD_USER_OR_COMMON */ + +#if KXLD_USER_OR_GOT +/******************************************************************************* +*******************************************************************************/ +static boolean_t +target_has_got(const KXLDObject *object) +{ + return FALSE: +} + +/******************************************************************************* +* Create and initialize the Global Offset Table +*******************************************************************************/ +static kern_return_t +create_got(KXLDObject *object) +{ + kern_return_t rval = KERN_FAILURE; + KXLDSect *sect = NULL; + u_int ngots = 0; + u_int i = 0; + + if (!target_has_got(object)) { + rval = KERN_SUCCESS; + goto finish; + } + + for (i = 0; i < object->sects.nitems; ++i) { + sect = kxld_array_get_item(&object->sects, i); + ngots += kxld_sect_get_ngots(sect, &object->relocator, + object->symtab); + } + + rval = add_section(object, §); + require_noerr(rval, finish); + + rval = kxld_sect_init_got(sect, ngots); + require_noerr(rval, finish); + + object->got_is_created = 
TRUE; + rval = KERN_SUCCESS; + +finish: + return rval; +} + +/******************************************************************************* +*******************************************************************************/ +static kern_return_t +populate_got(KXLDObject *object) +{ + kern_return_t rval = KERN_FAILURE; + KXLDSect *sect = NULL; + u_int i = 0; + + if (!target_has_got(object) || !object->got_is_created) { + rval = KERN_SUCCESS; + goto finish; + } + + for (i = 0; i < object->sects.nitems; ++i) { + sect = kxld_array_get_item(&object->sects, i); + if (streq_safe(sect->segname, KXLD_SEG_GOT, sizeof(KXLD_SEG_GOT)) && + streq_safe(sect->sectname, KXLD_SECT_GOT, sizeof(KXLD_SECT_GOT))) + { + kxld_sect_populate_got(sect, object->symtab, + kxld_object_target_needs_swap(object)); + break; + } + } + + require_action(i < object->sects.nitems, finish, rval=KXLD_MISSING_GOT); + + rval = KERN_SUCCESS; + +finish: + return rval; +} +#endif /* KXLD_USER_OR_GOT */ + +/******************************************************************************* +*******************************************************************************/ +static boolean_t +target_supports_protected_segments(const KXLDObject *object) +{ + return (object->is_final_image && + object->cputype == CPU_TYPE_X86_64); +} + +/******************************************************************************* +*******************************************************************************/ +static void +set_is_object_linked(KXLDObject *object) +{ + u_int i = 0; + + if (kxld_object_is_kernel(object)) { + object->is_linked = TRUE; + return; + } + + if (object->is_final_image) { + object->is_linked = !object->extrelocs.nitems && !object->locrelocs.nitems; + return; + } + + object->is_linked = TRUE; + for (i = 0; i < object->sects.nitems; ++i) { + KXLDSect *sect = kxld_array_get_item(&object->sects, i); + if (sect->relocs.nitems) { + object->is_linked = FALSE; + break; + } + } +} + + 
+/******************************************************************************* +*******************************************************************************/ +void kxld_object_clear(KXLDObject *object __unused) +{ + KXLDSeg *seg = NULL; + KXLDSect *sect = NULL; + u_int i; + + check(object); + +#if !KERNEL + if (kxld_object_is_kernel(object)) { + unswap_macho(object->file, object->host_order, object->target_order); + } +#endif /* !KERNEL */ + + for (i = 0; i < object->segs.nitems; ++i) { + seg = kxld_array_get_item(&object->segs, i); + kxld_seg_clear(seg); + } + kxld_array_reset(&object->segs); + + for (i = 0; i < object->sects.nitems; ++i) { + sect = kxld_array_get_item(&object->sects, i); + kxld_sect_clear(sect); + } + kxld_array_reset(&object->sects); + + kxld_array_reset(&object->extrelocs); + kxld_array_reset(&object->locrelocs); + kxld_relocator_clear(&object->relocator); + kxld_uuid_clear(&object->uuid); + + if (object->symtab) kxld_symtab_clear(object->symtab); + + object->file = NULL; + object->size = 0; + object->filetype = 0; + object->cputype = 0; + object->cpusubtype = 0; + object->is_kernel = FALSE; + object->is_final_image = FALSE; + object->is_linked = FALSE; + object->got_is_created = FALSE; + +#if KXLD_USER_OR_OBJECT + object->section_order = NULL; +#endif +#if !KERNEL + object->host_order = 0; + object->target_order = 0; +#endif +} + +/******************************************************************************* +*******************************************************************************/ +void kxld_object_deinit(KXLDObject *object __unused) +{ + KXLDSeg *seg = NULL; + KXLDSect *sect = NULL; + u_int i; + + check(object); + +#if !KERNEL + if (object->file && kxld_object_is_kernel(object)) { + unswap_macho(object->file, object->host_order, object->target_order); + } +#endif /* !KERNEL */ + + for (i = 0; i < object->segs.maxitems; ++i) { + seg = kxld_array_get_slot(&object->segs, i); + kxld_seg_deinit(seg); + } + 
kxld_array_deinit(&object->segs); + + for (i = 0; i < object->sects.maxitems; ++i) { + sect = kxld_array_get_slot(&object->sects, i); + kxld_sect_deinit(sect); + } + kxld_array_deinit(&object->sects); + + kxld_array_deinit(&object->extrelocs); + kxld_array_deinit(&object->locrelocs); + + if (object->symtab) { + kxld_symtab_deinit(object->symtab); + kxld_free(object->symtab, kxld_symtab_sizeof()); + } + + bzero(object, sizeof(*object)); +} + +/******************************************************************************* +*******************************************************************************/ +const u_char * +kxld_object_get_file(const KXLDObject *object) +{ + check(object); + + return object->file; +} + +/******************************************************************************* +*******************************************************************************/ +const char * +kxld_object_get_name(const KXLDObject *object) +{ + check(object); + + return object->name; +} + +/******************************************************************************* +*******************************************************************************/ +boolean_t +kxld_object_is_32_bit(const KXLDObject *object) +{ + check(object); + + return kxld_is_32_bit(object->cputype); +} + +/******************************************************************************* +*******************************************************************************/ +boolean_t +kxld_object_is_final_image(const KXLDObject *object) +{ + check(object); + + return object->is_final_image; +} + +/******************************************************************************* +*******************************************************************************/ +boolean_t +kxld_object_is_kernel(const KXLDObject *object) +{ + check(object); + + return object->is_kernel; +} + +/******************************************************************************* 
+*******************************************************************************/
+boolean_t
+kxld_object_is_linked(const KXLDObject *object)
+{
+    check(object);
+
+    return object->is_linked; /* plain accessor for the is_linked flag */
+}
+
+/*******************************************************************************
+* Returns TRUE unless the object's cputype is CPU_TYPE_I386 or
+* CPU_TYPE_POWERPC.
+*******************************************************************************/
+boolean_t
+kxld_object_target_supports_strict_patching(const KXLDObject *object)
+{
+    check(object);
+
+    return (object->cputype != CPU_TYPE_I386 &&
+            object->cputype != CPU_TYPE_POWERPC);
+}
+
+/*******************************************************************************
+* Returns TRUE only when the object's cputype is CPU_TYPE_I386 or
+* CPU_TYPE_POWERPC (the exact complement of the strict-patching predicate).
+*******************************************************************************/
+boolean_t
+kxld_object_target_supports_common_symbols(const KXLDObject *object)
+{
+    check(object);
+
+    return (object->cputype == CPU_TYPE_I386 ||
+            object->cputype == CPU_TYPE_POWERPC);
+}
+
+/*******************************************************************************
+* Reports the sizes needed for the linked object.
+*   header_size (out): 0 for a final image, otherwise the Mach-O header size
+*                      rounded up with round_page().
+*   vmsize (out):      *header_size plus get_macho_data_size(object).
+*******************************************************************************/
+void
+kxld_object_get_vmsize(const KXLDObject *object, u_long *header_size,
+    u_long *vmsize)
+{
+    check(object);
+    check(header_size);
+    check(vmsize);
+    *header_size = 0;
+    *vmsize = 0;
+
+    /* vmsize is the padded header page(s) + segment vmsizes */
+
+    *header_size = (object->is_final_image) ?
+        0 : round_page(get_macho_header_size(object));
+    *vmsize = *header_size + get_macho_data_size(object);
+
+}
+
+/*******************************************************************************
+* Writes the linked object into the caller-supplied buffer linked_object:
+* the Mach-O header, one load command per segment, the symbol table placed at
+* the __LINKEDIT segment's offset from the link address, and the UUID command
+* when the object has one. In non-KERNEL builds the result is then passed to
+* unswap_macho(). Returns KERN_SUCCESS, or the first failure encountered
+* (including KERN_FAILURE when no __LINKEDIT segment is found).
+*******************************************************************************/
+kern_return_t
+kxld_object_export_linked_object(const KXLDObject *object,
+    u_char *linked_object)
+{
+    kern_return_t rval = KERN_FAILURE;
+    KXLDSeg *seg = NULL;
+    u_long size = 0;
+    u_long header_size = 0;
+    u_long header_offset = 0;
+    u_long data_offset = 0;
+    u_int ncmds = 0;
+    u_int i = 0;
+
+    check(object);
+    check(linked_object);
+
+    /* Calculate the size of the headers and data */
+
+    header_size = get_macho_header_size(object);
+    data_offset = (object->is_final_image) ? header_size : round_page(header_size);
+    size = data_offset + get_macho_data_size(object);
+
+    /* Copy data to the file */
+
+    ncmds = object->segs.nitems + (object->uuid.has_uuid == TRUE) + 1 /* linkedit */;
+
+    rval = export_macho_header(object, linked_object, ncmds,
+        &header_offset, header_size);
+    require_noerr(rval, finish);
+
+    for (i = 0; i < object->segs.nitems; ++i) {
+        seg = kxld_array_get_item(&object->segs, i);
+
+        rval = kxld_seg_export_macho_to_vm(seg, linked_object, &header_offset,
+            header_size, size, object->link_addr, kxld_object_is_32_bit(object));
+        require_noerr(rval, finish);
+    }
+
+    seg = kxld_object_get_seg_by_name(object, SEG_LINKEDIT);
+    /* Fail cleanly instead of dereferencing a missing __LINKEDIT segment. */
+    require_action(seg, finish, rval=KERN_FAILURE);
+    data_offset = (u_long) (seg->link_addr - object->link_addr);
+    rval = kxld_symtab_export_macho(object->symtab, linked_object, &header_offset,
+        header_size, &data_offset, size, kxld_object_is_32_bit(object));
+    require_noerr(rval, finish);
+
+    if (object->uuid.has_uuid) {
+        rval = kxld_uuid_export_macho(&object->uuid, linked_object,
+            &header_offset, header_size);
+        require_noerr(rval, finish);
+    }
+
+#if !KERNEL
+    unswap_macho(linked_object, object->host_order, object->target_order);
+#endif /* !KERNEL */
+
+    rval = KERN_SUCCESS;
+
+finish:
+    return rval;
+}
+
+/*******************************************************************************
+* Dispatches through KXLD_3264_FUNC to export_macho_header_32 or
+* export_macho_header_64, selected by kxld_object_is_32_bit(object), and
+* returns that helper's result.
+*******************************************************************************/
+static kern_return_t
+export_macho_header(const KXLDObject *object, u_char *buf, u_int ncmds,
+    u_long *header_offset, u_long header_size)
+{
+    kern_return_t rval = KERN_FAILURE;
+
+    check(object);
+    check(buf);
+    check(header_offset);
+
+    KXLD_3264_FUNC(kxld_object_is_32_bit(object), rval,
+        export_macho_header_32, export_macho_header_64,
+        object, buf, ncmds, header_offset, header_size);
+    require_noerr(rval, finish);
+
+    rval = KERN_SUCCESS;
+
+finish:
+    return rval;
+}
+
+#if KXLD_USER_OR_ILP32
+/*******************************************************************************
+* Writes a struct mach_header at buf + *header_offset and advances
+* *header_offset past it. Fails with KERN_FAILURE when the header does not fit
+* in header_size - *header_offset. sizeofcmds is set to header_size minus the
+* size of the mach_header itself.
+*******************************************************************************/
+static kern_return_t
+export_macho_header_32(const KXLDObject *object, u_char *buf, u_int ncmds,
+    u_long *header_offset, u_long header_size)
+{
+    kern_return_t rval = KERN_FAILURE;
+    struct mach_header *mach = NULL;
+
+    check(object);
+    check(buf);
+    check(header_offset);
+
+    require_action(sizeof(*mach) <= header_size - *header_offset, finish, /* NOTE(review): unsigned subtraction; presumably callers keep *header_offset <= header_size - confirm */
+        rval=KERN_FAILURE);
+    mach = (struct mach_header *) (buf + *header_offset);
+
+    mach->magic = MH_MAGIC;
+    mach->cputype = object->cputype;
+    mach->cpusubtype = object->cpusubtype;
+    mach->filetype = object->filetype;
+    mach->ncmds = ncmds;
+    mach->sizeofcmds = (uint32_t) (header_size - sizeof(*mach));
+    mach->flags = MH_NOUNDEFS;
+
+    *header_offset += sizeof(*mach);
+
+    rval = KERN_SUCCESS;
+
+finish:
+    return rval;
+}
+#endif /* KXLD_USER_OR_ILP32 */
+
+#if KXLD_USER_OR_LP64
+/*******************************************************************************
+* 64-bit counterpart of export_macho_header_32: writes a struct mach_header_64
+* with MH_MAGIC_64 at buf + *header_offset and advances *header_offset past
+* it, failing with KERN_FAILURE when the header does not fit.
+*******************************************************************************/
+static kern_return_t
+export_macho_header_64(const KXLDObject *object, u_char *buf, u_int ncmds,
+    u_long *header_offset, u_long header_size)
+{
+    kern_return_t rval =
KERN_FAILURE;
+    struct mach_header_64 *mach = NULL;
+
+    check(object);
+    check(buf);
+    check(header_offset);
+
+    require_action(sizeof(*mach) <= header_size - *header_offset, finish,
+        rval=KERN_FAILURE);
+    mach = (struct mach_header_64 *) (buf + *header_offset);
+
+    mach->magic = MH_MAGIC_64;
+    mach->cputype = object->cputype;
+    mach->cpusubtype = object->cpusubtype;
+    mach->filetype = object->filetype;
+    mach->ncmds = ncmds;
+    mach->sizeofcmds = (uint32_t) (header_size - sizeof(*mach));
+    mach->flags = MH_NOUNDEFS;
+
+    *header_offset += sizeof(*mach);
+
+    rval = KERN_SUCCESS;
+
+finish:
+    return rval;
+}
+#endif /* KXLD_USER_OR_LP64 */
+
+/*******************************************************************************
+* Forwards to kxld_symtab_index_symbols_by_name() on the object's symbol
+* table and returns its result.
+*******************************************************************************/
+kern_return_t
+kxld_object_index_symbols_by_name(KXLDObject *object)
+{
+    return kxld_symtab_index_symbols_by_name(object->symtab);
+}
+
+/*******************************************************************************
+* Forwards to kxld_symtab_index_cxx_symbols_by_value() on the object's symbol
+* table and returns its result.
+*******************************************************************************/
+kern_return_t
+kxld_object_index_cxx_symbols_by_value(KXLDObject *object)
+{
+    return kxld_symtab_index_cxx_symbols_by_value(object->symtab);
+}
+
+/*******************************************************************************
+* Records link_address as the object's link address, relocates every segment
+* to it, then relocates the symbol table against the object's sections.
+* Returns the symbol table relocation's error, or KERN_SUCCESS.
+*******************************************************************************/
+kern_return_t
+kxld_object_relocate(KXLDObject *object, kxld_addr_t link_address)
+{
+    kern_return_t rval = KERN_FAILURE;
+    KXLDSeg *seg = NULL;
+    u_int i = 0;
+
+    check(object);
+
+    object->link_addr = link_address;
+
+    /* Relocate segments (which relocates the sections) */
+    for (i = 0; i < object->segs.nitems; ++i) {
+        seg = kxld_array_get_item(&object->segs, i);
+        kxld_seg_relocate(seg, link_address);
+    }
+
+    /* Relocate symbols */
+    rval = kxld_symtab_relocate(object->symtab, &object->sects);
+    require_noerr(rval, finish);
+
+    rval = KERN_SUCCESS;
+finish:
+
return rval;
+}
+
+/*******************************************************************************
+* Maps a const symbol pointer back to a mutable pointer owned by the object's
+* symbol table: looks up sym's index, fetches the symbol stored at that index,
+* and returns it only if it is the very same pointer as sym. Returns NULL if
+* the index lookup fails or the pointers differ.
+*******************************************************************************/
+static KXLDSym *
+get_mutable_sym(const KXLDObject *object, const KXLDSym *sym)
+{
+    KXLDSym *rval = NULL;
+    kern_return_t result = KERN_FAILURE;
+    u_int i = 0;
+
+    result = kxld_symtab_get_sym_index(object->symtab, sym, &i);
+    require_noerr(result, finish);
+
+    rval = kxld_symtab_get_symbol_by_index(object->symtab, i);
+    require_action(rval == sym, finish, rval=NULL); /* identity check: reject anything that is not this exact symbol */
+
+finish:
+    return rval;
+}
+
+/*******************************************************************************
+* Resolves sym to addr via kxld_sym_resolve(). Returns KERN_FAILURE when
+* get_mutable_sym() cannot map sym to a symbol of this object, otherwise the
+* result of the resolve call.
+*******************************************************************************/
+kern_return_t
+kxld_object_resolve_symbol(KXLDObject *object,
+    const KXLDSym *sym, kxld_addr_t addr)
+{
+    kern_return_t rval = KERN_FAILURE;
+    KXLDSym *resolved_sym = NULL;
+
+    resolved_sym = get_mutable_sym(object, sym);
+    require_action(resolved_sym, finish, rval=KERN_FAILURE);
+
+    rval = kxld_sym_resolve(resolved_sym, addr);
+    require_noerr(rval, finish);
+
+    rval = KERN_SUCCESS;
+finish:
+    return rval;
+}
+
+/*******************************************************************************
+* Calls kxld_sym_patch() on the object's own copy of sym. Returns KERN_FAILURE
+* when get_mutable_sym() cannot map sym to a symbol of this object, otherwise
+* KERN_SUCCESS; the result of kxld_sym_patch() is discarded.
+*******************************************************************************/
+kern_return_t
+kxld_object_patch_symbol(KXLDObject *object, const struct kxld_sym *sym)
+{
+    kern_return_t rval = KERN_FAILURE;
+    KXLDSym *patched_sym = NULL;
+
+    patched_sym = get_mutable_sym(object, sym);
+    require_action(patched_sym, finish, rval=KERN_FAILURE);
+
+    (void) kxld_sym_patch(patched_sym);
+    rval = KERN_SUCCESS;
+finish:
+    return rval;
+}
+
+/*******************************************************************************
+* Adds a symbol with the given name and link address to the object's symbol
+* table via kxld_symtab_add_symbol(). On success, *sym_out receives the new
+* symbol; on failure *sym_out is left untouched and the error is returned.
+*******************************************************************************/
+kern_return_t
+kxld_object_add_symbol(KXLDObject *object, char *name, kxld_addr_t link_addr,
+    const KXLDSym **sym_out)
+{
+    kern_return_t rval =
KERN_FAILURE;
+    KXLDSym *sym = NULL;
+
+    rval = kxld_symtab_add_symbol(object->symtab, name, link_addr, &sym);
+    require_noerr(rval, finish);
+
+    *sym_out = sym;
+    rval = KERN_SUCCESS;
+finish:
+    return rval;
+}
+
+/*******************************************************************************
+* Hands patched_vtables to the object's relocator, processes the object's
+* relocations (by tables for final images, by sections for object files), and
+* then fills in the kmod info structure. Returns the first error encountered,
+* or KERN_SUCCESS.
+*******************************************************************************/
+kern_return_t
+kxld_object_process_relocations(KXLDObject *object,
+    const KXLDDict *patched_vtables)
+{
+    kern_return_t rval = KERN_FAILURE;
+
+    (void) kxld_relocator_set_vtables(&object->relocator, patched_vtables);
+
+    /* Process relocation entries and populate the global offset table.
+     *
+     * For final linked images: the relocation entries are contained in a couple
+     * of tables hanging off the end of the symbol table. The GOT has its own
+     * section created by the linker; we simply need to fill it.
+     *
+     * For object files: the relocation entries are bound to each section.
+     * The GOT, if it exists for the target architecture, is created by kxld,
+     * and we must populate it according to our internal structures.
+     */
+    if (object->is_final_image) {
+#if KXLD_USER_OR_BUNDLE
+        rval = process_symbol_pointers(object);
+        require_noerr(rval, finish);
+
+        rval = process_relocs_from_tables(object);
+        require_noerr(rval, finish);
+#else
+        require_action(FALSE, finish, rval=KERN_FAILURE); /* final images cannot be processed without KXLD_USER_OR_BUNDLE */
+#endif /* KXLD_USER_OR_BUNDLE */
+    } else {
+#if KXLD_USER_OR_GOT
+        /* Populate GOT */
+        rval = populate_got(object);
+        require_noerr(rval, finish);
+#endif /* KXLD_USER_OR_GOT */
+#if KXLD_USER_OR_OBJECT
+        rval = process_relocs_from_sections(object);
+        require_noerr(rval, finish);
+#else
+        require_action(FALSE, finish, rval=KERN_FAILURE); /* object files cannot be processed without KXLD_USER_OR_OBJECT */
+#endif /* KXLD_USER_OR_OBJECT */
+    }
+
+    /* Populate kmod info structure */
+    rval = populate_kmod_info(object);
+    require_noerr(rval, finish);
+
+    rval = KERN_SUCCESS;
+finish:
+    return rval;
+}
+
+#if KXLD_USER_OR_BUNDLE
+
+#define SECT_SYM_PTRS "__nl_symbol_ptr"
+
+/*******************************************************************************
+* Final linked images create an __nl_symbol_ptr section for the global offset
+* table and for symbol pointer lookups in general. Rather than use relocation
+* entries, the linker creates an "indirect symbol table" which stores indexes
+* into the symbol table corresponding to the entries of this section. This
+* function populates the section with the relocated addresses of those symbols.
+*
+* Fails unless the object is a final image with a dysymtab header. Succeeds
+* without doing anything when the section is absent. Otherwise the section
+* must carry S_NON_LAZY_SYMBOL_POINTERS and its entries must lie within the
+* indirect symbol table (firstsym + nsyms <= nindirectsyms).
+*******************************************************************************/
+static kern_return_t
+process_symbol_pointers(KXLDObject *object)
+{
+    kern_return_t rval = KERN_FAILURE;
+    KXLDSect *sect = NULL;
+    KXLDSym *sym = NULL;
+    int32_t *symidx = NULL;
+    u_char *symptr = NULL;
+    u_long symptrsize = 0;
+    u_int nsyms = 0;
+    u_int firstsym = 0;
+    u_int i = 0;
+
+    check(object);
+
+    require_action(object->is_final_image && object->dysymtab_hdr,
+        finish, rval=KERN_FAILURE);
+
+    /* Get the __DATA,__nl_symbol_ptr section. If it doesn't exist, we have
+     * nothing to do.
+     */
+
+    sect = kxld_object_get_sect_by_name(object, SEG_DATA, SECT_SYM_PTRS);
+    if (!sect) {
+        rval = KERN_SUCCESS;
+        goto finish;
+    }
+
+    require_action(sect->flags & S_NON_LAZY_SYMBOL_POINTERS,
+        finish, rval=KERN_FAILURE;
+        kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO
+            "Section %s,%s does not have S_NON_LAZY_SYMBOL_POINTERS flag.",
+            SEG_DATA, SECT_SYM_PTRS));
+
+    /* Calculate the table offset and number of entries in the section */
+
+    if (kxld_object_is_32_bit(object)) {
+        symptrsize = sizeof(uint32_t);
+    } else {
+        symptrsize = sizeof(uint64_t);
+    }
+
+    nsyms = (u_int) (sect->size / symptrsize);
+    firstsym = sect->reserved1; /* reserved1 is used as the section's first index into the indirect symbol table */
+
+    require_action(firstsym + nsyms <= object->dysymtab_hdr->nindirectsyms, /* NOTE(review): u_int addition; wrap-around is not checked */
+        finish, rval=KERN_FAILURE;
+        kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO));
+
+    /* Iterate through the indirect symbol table and fill in the section of
+     * symbol pointers. There are three cases:
+     * 1) A normal symbol - put its value directly in the table
+     * 2) An INDIRECT_SYMBOL_LOCAL - symbols that are local and already have
+     *    their offset from the start of the file in the section. Simply
+     *    add the file's link address to fill this entry.
+     * 3) An INDIRECT_SYMBOL_ABS - prepopulated absolute symbols. No
+     *    action is required.
+     */
+
+    symidx = (int32_t *) (object->file + object->dysymtab_hdr->indirectsymoff);
+    symidx += firstsym;
+    symptr = sect->data;
+    for (i = 0; i < nsyms; ++i, ++symidx, symptr+=symptrsize) { /* advances the index entry and the pointer slot in lock step */
+        if (*symidx & INDIRECT_SYMBOL_LOCAL) {
+            if (*symidx & INDIRECT_SYMBOL_ABS) continue; /* ABS is only skipped here, i.e. when LOCAL is also set */
+
+            add_to_ptr(symptr, object->link_addr, kxld_object_is_32_bit(object));
+        } else { /* NOTE(review): an entry with ABS set but LOCAL clear reaches this symbol lookup - confirm intended */
+            sym = kxld_symtab_get_symbol_by_index(object->symtab, *symidx);
+            require_action(sym, finish, rval=KERN_FAILURE);
+
+            add_to_ptr(symptr, sym->link_addr, kxld_object_is_32_bit(object));
+        }
+    }
+
+    rval = KERN_SUCCESS;
+finish:
+    return rval;
+}
+
+/*******************************************************************************
+* Returns the first segment whose range [base_addr, base_addr + vmsize)
+* contains the given base address, or NULL when no segment contains it.
+*******************************************************************************/
+static KXLDSeg *
+get_seg_by_base_addr(KXLDObject *object, kxld_addr_t base_addr)
+{
+    KXLDSeg *seg = NULL;
+    kxld_addr_t start = 0;
+    kxld_addr_t end = 0;
+    u_int i = 0;
+
+    for (i = 0; i < object->segs.nitems; ++i) {
+        seg = kxld_array_get_item(&object->segs, i);
+        start = seg->base_addr;
+        end = seg->base_addr + seg->vmsize;
+
+        if (start <= base_addr && base_addr < end) return seg; /* half-open interval: end itself is excluded */
+    }
+
+    return NULL;
+}
+
+/*******************************************************************************
+* Processes the object's external and then local relocation tables. For each
+* entry, the segment containing the relocation's address is looked up and the
+* entry is passed to kxld_relocator_process_table_reloc() with the object's
+* link address. Fails when an address lies in no segment or when the
+* relocator reports an error.
+*******************************************************************************/
+static kern_return_t
+process_relocs_from_tables(KXLDObject *object)
+{
+    kern_return_t rval = KERN_FAILURE;
+    KXLDReloc *reloc = NULL;
+    KXLDSeg *seg = NULL;
+    u_int i = 0;
+
+    /* Process external relocations */
+    for (i = 0; i < object->extrelocs.nitems; ++i) {
+        reloc = kxld_array_get_item(&object->extrelocs, i);
+
+        seg = get_seg_by_base_addr(object, reloc->address);
+        require_action(seg, finish, rval=KERN_FAILURE);
+
+        rval = kxld_relocator_process_table_reloc(&object->relocator, reloc,
+            seg, object->link_addr);
+        require_noerr(rval, finish);
+    }
+
+    /* Process local relocations */
+    for (i = 0; i < object->locrelocs.nitems;
++i) {
+        reloc = kxld_array_get_item(&object->locrelocs, i);
+
+        seg = get_seg_by_base_addr(object, reloc->address);
+        require_action(seg, finish, rval=KERN_FAILURE);
+
+        rval = kxld_relocator_process_table_reloc(&object->relocator, reloc,
+            seg, object->link_addr);
+        require_noerr(rval, finish);
+    }
+
+    rval = KERN_SUCCESS;
+finish:
+    return rval;
+}
+
+/*******************************************************************************
+* Adds val, in place, to the pointer-sized integer stored at symptr: a
+* uint32_t when is_32_bit is true, a uint64_t otherwise.
+*******************************************************************************/
+static void
+add_to_ptr(u_char *symptr, kxld_addr_t val, boolean_t is_32_bit)
+{
+    if (is_32_bit) {
+        uint32_t *ptr = (uint32_t *) symptr;
+        *ptr += (uint32_t) val;
+    } else {
+        uint64_t *ptr = (uint64_t *) symptr;
+        *ptr += (uint64_t) val;
+    }
+}
+#endif /* KXLD_USER_OR_BUNDLE */
+
+#if KXLD_USER_OR_OBJECT
+/*******************************************************************************
+* Runs kxld_sect_process_relocs() with the object's relocator on every
+* section, stopping at and returning the first error.
+*******************************************************************************/
+static kern_return_t
+process_relocs_from_sections(KXLDObject *object)
+{
+    kern_return_t rval = KERN_FAILURE;
+    KXLDSect *sect = NULL;
+    u_int i = 0;
+
+    for (i = 0; i < object->sects.nitems; ++i) {
+        sect = kxld_array_get_item(&object->sects, i);
+        rval = kxld_sect_process_relocs(sect, &object->relocator);
+        require_noerr(rval, finish);
+    }
+
+    rval = KERN_SUCCESS;
+finish:
+    return rval;
+}
+#endif /* KXLD_USER_OR_OBJECT */
+
+/*******************************************************************************
+* Fills in the address, size and hdr_size fields of the object's kmod info
+* structure. Does nothing (and succeeds) for the kernel itself. Otherwise the
+* locally defined KXLD_KMOD_INFO_SYMBOL is located inside its section's data
+* and written with the object's link address and the sizes reported by
+* kxld_object_get_vmsize(), using the 32- or 64-bit v1 layout. In non-KERNEL
+* builds the written fields are byte swapped when the target needs it.
+* Fails (and logs) when the symbol is not found.
+*******************************************************************************/
+static kern_return_t
+populate_kmod_info(KXLDObject *object)
+{
+    kern_return_t rval = KERN_FAILURE;
+    KXLDSect *kmodsect = NULL;
+    KXLDSym *kmodsym = NULL;
+    kmod_info_t *kmod_info = NULL;
+    u_long kmod_offset = 0;
+    u_long header_size;
+    u_long size;
+
+    if (kxld_object_is_kernel(object)) {
+        rval = KERN_SUCCESS;
+        goto finish;
+    }
+
+    kxld_object_get_vmsize(object, &header_size, &size);
+
+    kmodsym =
kxld_symtab_get_locally_defined_symbol_by_name(object->symtab,
+        KXLD_KMOD_INFO_SYMBOL);
+    require_action(kmodsym, finish, rval=KERN_FAILURE;
+        kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogNoKmodInfo));
+
+    kmodsect = kxld_array_get_item(&object->sects, kmodsym->sectnum); /* NOTE(review): result is dereferenced below without a NULL check */
+    kmod_offset = (u_long) (kmodsym->base_addr - kmodsect->base_addr); /* symbol's offset within its section */
+    kmod_info = (kmod_info_t *) (kmodsect->data + kmod_offset);
+
+    if (kxld_object_is_32_bit(object)) {
+        kmod_info_32_v1_t *kmod = (kmod_info_32_v1_t *) (kmod_info);
+        kmod->address = (uint32_t) object->link_addr;
+        kmod->size = (uint32_t) size;
+        kmod->hdr_size = (uint32_t) header_size;
+
+#if !KERNEL
+        if (kxld_object_target_needs_swap(object)) {
+            kmod->address = OSSwapInt32(kmod->address);
+            kmod->size = OSSwapInt32(kmod->size);
+            kmod->hdr_size = OSSwapInt32(kmod->hdr_size);
+        }
+#endif /* !KERNEL */
+    } else {
+        kmod_info_64_v1_t *kmod = (kmod_info_64_v1_t *) (kmod_info);
+        kmod->address = object->link_addr;
+        kmod->size = size;
+        kmod->hdr_size = header_size;
+
+#if !KERNEL
+        if (kxld_object_target_needs_swap(object)) {
+            kmod->address = OSSwapInt64(kmod->address);
+            kmod->size = OSSwapInt64(kmod->size);
+            kmod->hdr_size = OSSwapInt64(kmod->hdr_size);
+        }
+#endif /* !KERNEL */
+    }
+
+
+    rval = KERN_SUCCESS;
+
+finish:
+    return rval;
+}
+
diff --git a/libkern/kxld/kxld_object.h b/libkern/kxld/kxld_object.h
new file mode 100644
index 000000000..5b6b5064d
--- /dev/null
+++ b/libkern/kxld/kxld_object.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2009 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License.
The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+#ifndef _KXLD_OBJECT_H_
+#define _KXLD_OBJECT_H_
+
+#include /* NOTE(review): include target is missing here (presumably stripped during extraction) - restore from upstream */
+#include /* NOTE(review): include target missing - restore from upstream */
+#if KERNEL
+    #include /* NOTE(review): include target missing - restore from upstream */
+#else
+    #include "kxld_types.h"
+#endif
+
+struct kxld_array;
+struct kxld_dict;
+struct kxld_reloc;
+struct kxld_relocator;
+struct kxld_sect;
+struct kxld_sym;
+struct kxld_symtab;
+
+typedef struct kxld_object KXLDObject;
+
+/*******************************************************************************
+* Constructors and destructors
+*******************************************************************************/
+
+size_t kxld_object_sizeof(void)
+    __attribute__((const, visibility("hidden")));
+
+kern_return_t kxld_object_init_from_macho(KXLDObject *object,
+    u_char *file, u_long size, const char *name,
+    struct kxld_array *section_order,
+    cpu_type_t cputype, cpu_subtype_t cpusubtype)
+    __attribute__((nonnull(1,2,4), visibility("hidden"))); /* attribute lists are comma-separated */
+
+void kxld_object_clear(KXLDObject *object)
+    __attribute__((nonnull, visibility("hidden")));
+
+void kxld_object_deinit(KXLDObject *object)
+    __attribute__((nonnull, visibility("hidden")));
+
+/*******************************************************************************
+* Accessors
+*******************************************************************************/
+
+const u_char * kxld_object_get_file(const KXLDObject *object)
+    __attribute__((pure, nonnull, visibility("hidden")));
+
+const char * kxld_object_get_name(const KXLDObject *object)
+    __attribute__((pure, nonnull, visibility("hidden")));
+
+boolean_t kxld_object_is_32_bit(const KXLDObject *object)
+    __attribute__((pure, nonnull, visibility("hidden")));
+
+boolean_t kxld_object_is_final_image(const KXLDObject *object)
+    __attribute__((pure, nonnull, visibility("hidden")));
+
+boolean_t kxld_object_is_kernel(const KXLDObject *object)
+    __attribute__((pure, nonnull, visibility("hidden")));
+
+boolean_t kxld_object_is_linked(const KXLDObject *object)
+    __attribute__((pure, nonnull, visibility("hidden")));
+
+boolean_t kxld_object_target_supports_strict_patching(const KXLDObject *object)
+    __attribute__((pure, nonnull, visibility("hidden")));
+
+boolean_t kxld_object_target_supports_common_symbols(const KXLDObject *object)
+    __attribute__((pure, nonnull, visibility("hidden")));
+
+const struct kxld_relocator * kxld_object_get_relocator(
+    const KXLDObject * object)
+    __attribute__((pure, nonnull, visibility("hidden")));
+
+const struct kxld_reloc * kxld_object_get_reloc_at_symbol(
+    const KXLDObject *object, const struct kxld_sym *sym)
+    __attribute__((pure, nonnull, visibility("hidden")));
+
+const struct kxld_sym * kxld_object_get_symbol_of_reloc(
+    const KXLDObject *object, const struct kxld_reloc *reloc,
+    const struct kxld_sect *sect)
+    __attribute__((pure, nonnull, visibility("hidden")));
+
+const struct kxld_sect * kxld_object_get_section_by_index(
+    const KXLDObject *object, u_int sectnum)
+    __attribute__((pure, nonnull, visibility("hidden")));
+
+const struct kxld_array * kxld_object_get_extrelocs(
+    const KXLDObject *object)
+    __attribute__((pure, nonnull, visibility("hidden")));
+
+const struct kxld_symtab * kxld_object_get_symtab(const KXLDObject *object)
+    __attribute__((pure, nonnull, visibility("hidden")));
+
+void kxld_object_get_vmsize(const KXLDObject *object, u_long *header_size,
+    u_long *vmsize)
+    __attribute__((nonnull, visibility("hidden")));
+
+/* This will be the same size as kxld_kext_get_vmsize */
+kern_return_t kxld_object_export_linked_object(const KXLDObject *object,
+    u_char *linked_object)
+    __attribute__((nonnull, visibility("hidden")));
+
+/*******************************************************************************
+* Modifiers
+*******************************************************************************/
+
+kern_return_t kxld_object_index_symbols_by_name(KXLDObject *object)
+    __attribute__((nonnull, visibility("hidden")));
+
+kern_return_t kxld_object_index_cxx_symbols_by_value(KXLDObject *object)
+    __attribute__((nonnull, visibility("hidden")));
+
+kern_return_t kxld_object_relocate(KXLDObject *object, kxld_addr_t link_address)
+    __attribute__((nonnull, visibility("hidden")));
+
+kern_return_t kxld_object_resolve_symbol(KXLDObject *object,
+    const struct kxld_sym *sym, kxld_addr_t addr)
+    __attribute__((nonnull, visibility("hidden")));
+
+kern_return_t kxld_object_patch_symbol(KXLDObject *object,
+    const struct kxld_sym *sym)
+    __attribute__((nonnull, visibility("hidden")));
+
+kern_return_t kxld_object_add_symbol(KXLDObject *object, char *name,
+    kxld_addr_t link_addr, const struct kxld_sym **sym_out)
+    __attribute__((nonnull, visibility("hidden")));
+
+kern_return_t kxld_object_process_relocations(KXLDObject *object,
+    const struct kxld_dict *patched_vtables)
+    __attribute__((nonnull, visibility("hidden")));
+
+#endif /* _KXLD_OBJECT_H_ */
+
diff --git a/libkern/kxld/kxld_reloc.c b/libkern/kxld/kxld_reloc.c
index c781d6dc2..4867c8c78 100644
--- a/libkern/kxld/kxld_reloc.c
+++ b/libkern/kxld/kxld_reloc.c
@@ -41,12 +41,15 @@
 #include
 #include "
"kxld_array.h" +#include "kxld_demangle.h" +#include "kxld_dict.h" #include "kxld_reloc.h" #include "kxld_sect.h" #include "kxld_seg.h" #include "kxld_sym.h" #include "kxld_symtab.h" #include "kxld_util.h" +#include "kxld_vtable.h" /* include target-specific relocation prototypes */ #include @@ -102,9 +105,10 @@ static boolean_t generic_reloc_is_pair(u_int _type, u_int _prev_type) __attribute__((const)); static boolean_t generic_reloc_has_got(u_int _type) __attribute__((const)); -static kern_return_t generic_process_reloc(u_char *instruction, u_int length, - u_int pcrel, kxld_addr_t base_pc, kxld_addr_t link_pc, kxld_addr_t link_disp, - u_int type, kxld_addr_t target, kxld_addr_t pair_target, boolean_t swap); +static kern_return_t generic_process_reloc(const KXLDRelocator *relocator, + u_char *instruction, u_int length, u_int pcrel, kxld_addr_t base_pc, + kxld_addr_t link_pc, kxld_addr_t link_disp, u_int type, kxld_addr_t target, + kxld_addr_t pair_target, boolean_t swap); #endif /* KXLD_USER_OR_I386 */ #if KXLD_USER_OR_PPC @@ -114,9 +118,10 @@ static boolean_t ppc_reloc_is_pair(u_int _type, u_int _prev_type) __attribute__((const)); static boolean_t ppc_reloc_has_got(u_int _type) __attribute__((const)); -static kern_return_t ppc_process_reloc(u_char *instruction, u_int length, - u_int pcrel, kxld_addr_t base_pc, kxld_addr_t link_pc, kxld_addr_t link_disp, - u_int type, kxld_addr_t target, kxld_addr_t pair_target, boolean_t swap); +static kern_return_t ppc_process_reloc(const KXLDRelocator *relocator, + u_char *instruction, u_int length, u_int pcrel, kxld_addr_t base_pc, + kxld_addr_t link_pc, kxld_addr_t link_disp, u_int type, kxld_addr_t target, + kxld_addr_t pair_target, boolean_t swap); #endif /* KXLD_USER_OR_PPC */ #if KXLD_USER_OR_X86_64 @@ -126,9 +131,10 @@ static boolean_t x86_64_reloc_is_pair(u_int _type, u_int _prev_type) __attribute__((const)); static boolean_t x86_64_reloc_has_got(u_int _type) __attribute__((const)); -static kern_return_t 
x86_64_process_reloc(u_char *instruction, u_int length, - u_int pcrel, kxld_addr_t base_pc, kxld_addr_t link_pc, kxld_addr_t link_disp, - u_int type, kxld_addr_t target, kxld_addr_t pair_target, boolean_t swap); +static kern_return_t x86_64_process_reloc(const KXLDRelocator *relocator, + u_char *instruction, u_int length, u_int pcrel, kxld_addr_t base_pc, + kxld_addr_t link_pc, kxld_addr_t link_disp, u_int type, kxld_addr_t target, + kxld_addr_t pair_target, boolean_t swap); static kern_return_t calculate_displacement_x86_64(uint64_t target, uint64_t adjustment, int32_t *instr32); #endif /* KXLD_USER_OR_X86_64 */ @@ -140,19 +146,20 @@ static boolean_t arm_reloc_is_pair(u_int _type, u_int _prev_type) __attribute__((const)); static boolean_t arm_reloc_has_got(u_int _type) __attribute__((const)); -static kern_return_t arm_process_reloc(u_char *instruction, u_int length, - u_int pcrel, kxld_addr_t base_pc, kxld_addr_t link_pc, kxld_addr_t link_disp, - u_int type, kxld_addr_t target, kxld_addr_t pair_target, boolean_t swap); +static kern_return_t arm_process_reloc(const KXLDRelocator *relocator, + u_char *instruction, u_int length, u_int pcrel, kxld_addr_t base_pc, + kxld_addr_t link_pc, kxld_addr_t link_disp, u_int type, kxld_addr_t target, + kxld_addr_t pair_target, boolean_t swap); #endif /* KXLD_USER_OR_ARM */ #if KXLD_USER_OR_ILP32 -static kxld_addr_t get_pointer_at_addr_32(u_char *data, u_long offset, - const KXLDRelocator *relocator __unused) +static kxld_addr_t get_pointer_at_addr_32(const KXLDRelocator *relocator, + const u_char *data, u_long offset) __attribute__((pure, nonnull)); #endif /* KXLD_USER_OR_ILP32 */ #if KXLD_USER_OR_LP64 -static kxld_addr_t get_pointer_at_addr_64(u_char *data, u_long offset, - const KXLDRelocator *relocator __unused) +static kxld_addr_t get_pointer_at_addr_64(const KXLDRelocator *relocator, + const u_char *data, u_long offset) __attribute__((pure, nonnull)); #endif /* KXLD_USER_OR_LP64 */ @@ -160,16 +167,23 @@ static u_int 
count_relocatable_relocs(const KXLDRelocator *relocator, const struct relocation_info *relocs, u_int nrelocs) __attribute__((pure)); -static kern_return_t calculate_targets(kxld_addr_t *_target, - kxld_addr_t *_pair_target, const KXLDReloc *reloc, - const KXLDArray *sectarray, const KXLDSymtab *symtab); +static kern_return_t calculate_targets(KXLDRelocator *relocator, + kxld_addr_t *_target, kxld_addr_t *_pair_target, const KXLDReloc *reloc); + +static kxld_addr_t align_raw_function_address(const KXLDRelocator *relocator, + kxld_addr_t value); + static kern_return_t get_target_by_address_lookup(kxld_addr_t *target, kxld_addr_t addr, const KXLDArray *sectarray); +static kern_return_t check_for_direct_pure_virtual_call( + const KXLDRelocator *relocator, u_long offset); + /******************************************************************************* *******************************************************************************/ kern_return_t -kxld_relocator_init(KXLDRelocator *relocator, cpu_type_t cputype, +kxld_relocator_init(KXLDRelocator *relocator, u_char *file, + const KXLDSymtab *symtab, const KXLDArray *sectarray, cpu_type_t cputype, cpu_subtype_t cpusubtype __unused, boolean_t swap) { kern_return_t rval = KERN_FAILURE; @@ -183,6 +197,7 @@ kxld_relocator_init(KXLDRelocator *relocator, cpu_type_t cputype, relocator->reloc_is_pair = generic_reloc_is_pair; relocator->reloc_has_got = generic_reloc_has_got; relocator->process_reloc = generic_process_reloc; + relocator->function_align = 0; relocator->is_32_bit = TRUE; break; #endif /* KXLD_USER_OR_I386 */ @@ -192,6 +207,7 @@ kxld_relocator_init(KXLDRelocator *relocator, cpu_type_t cputype, relocator->reloc_is_pair = ppc_reloc_is_pair; relocator->reloc_has_got = ppc_reloc_has_got; relocator->process_reloc = ppc_process_reloc; + relocator->function_align = 0; relocator->is_32_bit = TRUE; break; #endif /* KXLD_USER_OR_PPC */ @@ -201,6 +217,7 @@ kxld_relocator_init(KXLDRelocator *relocator, cpu_type_t cputype, 
relocator->reloc_is_pair = x86_64_reloc_is_pair; relocator->reloc_has_got = x86_64_reloc_has_got; relocator->process_reloc = x86_64_process_reloc; + relocator->function_align = 0; relocator->is_32_bit = FALSE; break; #endif /* KXLD_USER_OR_X86_64 */ @@ -210,6 +227,7 @@ kxld_relocator_init(KXLDRelocator *relocator, cpu_type_t cputype, relocator->reloc_is_pair = arm_reloc_is_pair; relocator->reloc_has_got = arm_reloc_has_got; relocator->process_reloc = arm_process_reloc; + relocator->function_align = 1; relocator->is_32_bit = TRUE; break; #endif /* KXLD_USER_OR_ARM */ @@ -220,6 +238,9 @@ kxld_relocator_init(KXLDRelocator *relocator, cpu_type_t cputype, goto finish; } + relocator->file = file; + relocator->symtab = symtab; + relocator->sectarray = sectarray; relocator->is_32_bit = kxld_is_32_bit(cputype); relocator->swap = swap; @@ -238,8 +259,8 @@ kxld_reloc_create_macho(KXLDArray *relocarray, const KXLDRelocator *relocator, kern_return_t rval = KERN_FAILURE; KXLDReloc *reloc = NULL; u_int nrelocs = 0; - const struct relocation_info *src = NULL, *prev_src = NULL; - const struct scattered_relocation_info *scatsrc = NULL, *prev_scatsrc = NULL; + const struct relocation_info *src = NULL; + const struct scattered_relocation_info *scatsrc = NULL; u_int i = 0; u_int reloc_index = 0; @@ -313,9 +334,7 @@ kxld_reloc_create_macho(KXLDArray *relocarray, const KXLDRelocator *relocator, ++i; require_action(i < nsrcs, finish, rval=KERN_FAILURE); - prev_src = src; src = srcs + i; - prev_scatsrc = (const struct scattered_relocation_info *) prev_src; scatsrc = (const struct scattered_relocation_info *) src; if (src->r_address & R_SCATTERED) { @@ -447,25 +466,23 @@ kxld_relocator_has_got(const KXLDRelocator *relocator, u_int r_type) /******************************************************************************* *******************************************************************************/ KXLDSym * -kxld_reloc_get_symbol(const KXLDRelocator *relocator, const KXLDReloc *reloc, - 
u_char *data, const KXLDSymtab *symtab) +kxld_reloc_get_symbol(const KXLDRelocator *relocator, const KXLDReloc *reloc, + const u_char *data) { KXLDSym *sym = NULL; kxld_addr_t value = 0; check(reloc); - check(symtab); switch (reloc->target_type) { case KXLD_TARGET_SYMBOLNUM: - sym = kxld_symtab_get_symbol_by_index(symtab, reloc->target); + sym = kxld_symtab_get_symbol_by_index(relocator->symtab, reloc->target); break; case KXLD_TARGET_SECTNUM: - if (data) { - KXLD_3264_FUNC(relocator->is_32_bit, value, - get_pointer_at_addr_32, get_pointer_at_addr_64, - data, reloc->address, relocator); - sym = kxld_symtab_get_cxx_symbol_by_value(symtab, value); + if (data) { + value = kxld_relocator_get_pointer_at_addr(relocator, data, + reloc->address); + sym = kxld_symtab_get_cxx_symbol_by_value(relocator->symtab, value); } break; default: @@ -521,26 +538,40 @@ kxld_reloc_get_reloc_by_offset(const KXLDArray *relocs, kxld_addr_t offset) return reloc; } +/******************************************************************************* +*******************************************************************************/ +kxld_addr_t +kxld_relocator_get_pointer_at_addr(const KXLDRelocator *relocator, + const u_char *data, u_long offset) +{ + kxld_addr_t value; + + KXLD_3264_FUNC(relocator->is_32_bit, value, + get_pointer_at_addr_32, get_pointer_at_addr_64, + relocator, data, offset); + + return value; +} + #if KXLD_USER_OR_ILP32 /******************************************************************************* *******************************************************************************/ static kxld_addr_t -get_pointer_at_addr_32(u_char *data, u_long offset, - const KXLDRelocator *relocator __unused) +get_pointer_at_addr_32(const KXLDRelocator *relocator, + const u_char *data, u_long offset) { uint32_t addr = 0; check(relocator); - check(data); - addr = *(uint32_t *) (data + offset); + addr = *(const uint32_t *) (data + offset); #if !KERNEL if (relocator->swap) { addr = 
OSSwapInt32(addr); } #endif - return (kxld_addr_t) addr; + return align_raw_function_address(relocator, addr); } #endif /* KXLD_USER_OR_ILP32 */ @@ -548,31 +579,55 @@ get_pointer_at_addr_32(u_char *data, u_long offset, /******************************************************************************* *******************************************************************************/ static kxld_addr_t -get_pointer_at_addr_64(u_char *data, u_long offset, - const KXLDRelocator *relocator __unused) +get_pointer_at_addr_64(const KXLDRelocator *relocator, + const u_char *data, u_long offset) { uint64_t addr = 0; check(relocator); - check(data); - addr = *(uint64_t *) (data + offset); + addr = *(const uint64_t *) (data + offset); #if !KERNEL if (relocator->swap) { addr = OSSwapInt64(addr); } #endif - return (kxld_addr_t) addr; + return align_raw_function_address(relocator, addr); } #endif /* KXLD_USER_OR_LP64 */ +/******************************************************************************* +*******************************************************************************/ +void +kxld_relocator_set_vtables(KXLDRelocator *relocator, + const struct kxld_dict *vtables) +{ + relocator->vtables = vtables; +} + +/******************************************************************************* +* When we're inspecting the raw binary and not the symbol table, value may +* hold a THUMB address (with bit 0 set to 1) but the index will have the real +* address (bit 0 set to 0). So if bit 0 is set here, we clear it. This only +* impacts ARM for now, but it's implemented as a generic function alignment +* mask. 
+*******************************************************************************/ +static kxld_addr_t +align_raw_function_address(const KXLDRelocator *relocator, kxld_addr_t value) +{ + if (relocator->function_align) { + value &= ~((1ULL << relocator->function_align) - 1); + } + + return value; +} + /******************************************************************************* *******************************************************************************/ kern_return_t -kxld_relocator_process_sect_reloc(const KXLDRelocator *relocator, - const KXLDReloc *reloc, const struct kxld_sect *sect, - const KXLDArray *sectarray, const struct kxld_symtab *symtab) +kxld_relocator_process_sect_reloc(KXLDRelocator *relocator, + const KXLDReloc *reloc, const struct kxld_sect *sect) { kern_return_t rval = KERN_FAILURE; u_char *instruction = NULL; @@ -585,8 +640,6 @@ kxld_relocator_process_sect_reloc(const KXLDRelocator *relocator, check(relocator); check(reloc); check(sect); - check(sectarray); - check(symtab); /* Find the instruction */ @@ -594,7 +647,7 @@ kxld_relocator_process_sect_reloc(const KXLDRelocator *relocator, /* Calculate the target */ - rval = calculate_targets(&target, &pair_target, reloc, sectarray, symtab); + rval = calculate_targets(relocator, &target, &pair_target, reloc); require_noerr(rval, finish); base_pc = reloc->address; @@ -603,13 +656,14 @@ kxld_relocator_process_sect_reloc(const KXLDRelocator *relocator, /* Relocate */ - rval = relocator->process_reloc(instruction, reloc->length, reloc->pcrel, - base_pc, link_pc, link_disp, reloc->reloc_type, target, pair_target, - relocator->swap); + rval = relocator->process_reloc(relocator, instruction, reloc->length, + reloc->pcrel, base_pc, link_pc, link_disp, reloc->reloc_type, target, + pair_target, relocator->swap); require_noerr(rval, finish); /* Return */ + relocator->current_vtable = NULL; rval = KERN_SUCCESS; finish: @@ -637,9 +691,8 @@ kxld_reloc_update_symindex(KXLDReloc *reloc, u_int symindex) 
/******************************************************************************* *******************************************************************************/ kern_return_t -kxld_relocator_process_table_reloc(const KXLDRelocator *relocator, - const KXLDReloc *reloc, const KXLDSeg *seg, u_char *file, - const struct kxld_array *sectarray, const struct kxld_symtab *symtab) +kxld_relocator_process_table_reloc(KXLDRelocator *relocator, + const KXLDReloc *reloc, const KXLDSeg *seg, kxld_addr_t link_addr) { kern_return_t rval = KERN_FAILURE; u_char *instruction = NULL; @@ -647,36 +700,34 @@ kxld_relocator_process_table_reloc(const KXLDRelocator *relocator, kxld_addr_t pair_target = 0; kxld_addr_t base_pc = 0; kxld_addr_t link_pc = 0; - kxld_addr_t link_disp = 0; + u_long offset = 0; check(relocator); check(reloc); - check(file); - check(sectarray); - check(symtab); /* Find the instruction */ - instruction = file + seg->fileoff + reloc->address; + offset = (u_long)(seg->fileoff + (reloc->address - seg->base_addr)); + instruction = relocator->file + offset; /* Calculate the target */ - rval = calculate_targets(&target, &pair_target, reloc, sectarray, symtab); + rval = calculate_targets(relocator, &target, &pair_target, reloc); require_noerr(rval, finish); base_pc = reloc->address; - link_pc = base_pc + seg->link_addr; - link_disp = seg->link_addr - seg->base_addr; + link_pc = base_pc + link_addr; /* Relocate */ - rval = relocator->process_reloc(instruction, reloc->length, reloc->pcrel, - base_pc, link_pc, link_disp, reloc->reloc_type, target, pair_target, - relocator->swap); + rval = relocator->process_reloc(relocator, instruction, reloc->length, + reloc->pcrel, base_pc, link_pc, link_addr, reloc->reloc_type, target, + pair_target, relocator->swap); require_noerr(rval, finish); /* Return */ + relocator->current_vtable = NULL; rval = KERN_SUCCESS; finish: @@ -686,19 +737,19 @@ kxld_relocator_process_table_reloc(const KXLDRelocator *relocator, 
/******************************************************************************* *******************************************************************************/ static kern_return_t -calculate_targets(kxld_addr_t *_target, kxld_addr_t *_pair_target, - const KXLDReloc *reloc, const KXLDArray *sectarray, const KXLDSymtab *symtab) +calculate_targets(KXLDRelocator *relocator, kxld_addr_t *_target, + kxld_addr_t *_pair_target, const KXLDReloc *reloc) { kern_return_t rval = KERN_FAILURE; const KXLDSect *sect = NULL; const KXLDSym *sym = NULL; kxld_addr_t target = 0; kxld_addr_t pair_target = 0; + char *demangled_name = NULL; + size_t demangled_length = 0; check(_target); check(_pair_target); - check(sectarray); - check(symtab); *_target = 0; *_pair_target = 0; @@ -711,12 +762,13 @@ calculate_targets(kxld_addr_t *_target, kxld_addr_t *_pair_target, reloc->pair_target_type == KXLD_TARGET_VALUE, finish, rval=KERN_FAILURE); - rval = get_target_by_address_lookup(&target, reloc->target, sectarray); + rval = get_target_by_address_lookup(&target, reloc->target, + relocator->sectarray); require_noerr(rval, finish); if (reloc->pair_target_type == KXLD_TARGET_LOOKUP) { rval = get_target_by_address_lookup(&pair_target, - reloc->pair_target, sectarray); + reloc->pair_target, relocator->sectarray); require_noerr(rval, finish); } else if (reloc->pair_target_type == KXLD_TARGET_VALUE) { pair_target = reloc->pair_target; @@ -728,7 +780,7 @@ calculate_targets(kxld_addr_t *_target, kxld_addr_t *_pair_target, finish, rval=KERN_FAILURE); /* Get the target's section by section number */ - sect = kxld_array_get_item(sectarray, reloc->target); + sect = kxld_array_get_item(relocator->sectarray, reloc->target); require_action(sect, finish, rval=KERN_FAILURE); /* target is the change in the section's address */ @@ -751,10 +803,27 @@ calculate_targets(kxld_addr_t *_target, kxld_addr_t *_pair_target, rval=KERN_FAILURE); /* Get the target's symbol by symbol number */ - sym = 
kxld_symtab_get_symbol_by_index(symtab, reloc->target); + sym = kxld_symtab_get_symbol_by_index(relocator->symtab, reloc->target); require_action(sym, finish, rval=KERN_FAILURE); + + /* If this symbol is a padslot that has already been replaced, then the + * only way a relocation entry can still reference it is if there is a + * vtable that has not been patched. The vtable patcher uses the + * MetaClass structure to find classes for patching, so an unpatched + * vtable means that there is an OSObject-derived class that is missing + * its OSDeclare/OSDefine macros. + */ + require_action(!kxld_sym_is_padslot(sym) || !kxld_sym_is_replaced(sym), + finish, rval=KERN_FAILURE; + kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogRelocatingPatchedSym, + kxld_demangle(sym->name, &demangled_name, &demangled_length))); + target = sym->link_addr; + if (kxld_sym_is_vtable(sym)) { + relocator->current_vtable = kxld_dict_find(relocator->vtables, sym->name); + } + /* Some relocation types need the GOT entry address instead of the * symbol's actual address. These types don't have pair relocation * entries, so we store the GOT entry address as the pair target. 
@@ -762,7 +831,8 @@ calculate_targets(kxld_addr_t *_target, kxld_addr_t *_pair_target, if (reloc->pair_target_type == KXLD_TARGET_VALUE) { pair_target = reloc->pair_target; } else if (reloc->pair_target_type == KXLD_TARGET_SYMBOLNUM ) { - sym = kxld_symtab_get_symbol_by_index(symtab, reloc->pair_target); + sym = kxld_symtab_get_symbol_by_index(relocator->symtab, + reloc->pair_target); require_action(sym, finish, rval=KERN_FAILURE); pair_target = sym->link_addr; } else if (reloc->pair_target_type == KXLD_TARGET_GOT) { @@ -779,6 +849,7 @@ calculate_targets(kxld_addr_t *_target, kxld_addr_t *_pair_target, rval = KERN_SUCCESS; finish: + if (demangled_name) kxld_free(demangled_name, demangled_length); return rval; } @@ -804,9 +875,10 @@ get_target_by_address_lookup(kxld_addr_t *target, kxld_addr_t addr, end = start + sect->size; if (start <= addr && addr < end) break; + + sect = NULL; } - require_action(i < sectarray->nitems, finish, - rval=KERN_FAILURE); + require_action(sect, finish, rval=KERN_FAILURE); *target = sect->link_addr - sect->base_addr; rval = KERN_SUCCESS; @@ -815,6 +887,29 @@ get_target_by_address_lookup(kxld_addr_t *target, kxld_addr_t addr, return rval; } +/******************************************************************************* +*******************************************************************************/ +static kern_return_t +check_for_direct_pure_virtual_call(const KXLDRelocator *relocator, u_long offset) +{ + kern_return_t rval = KERN_FAILURE; + const KXLDVTableEntry *entry = NULL; + + if (relocator->current_vtable) { + entry = kxld_vtable_get_entry_for_offset(relocator->current_vtable, + offset, relocator->is_32_bit); + require_action(!entry || !entry->patched.name || + !kxld_sym_name_is_pure_virtual(entry->patched.name), + finish, rval=KERN_FAILURE; + kxld_log(kKxldLogLinking, kKxldLogErr, + kKxldLogDirectPureVirtualCall)); + } + + rval = KERN_SUCCESS; +finish: + return rval; +} + #if KXLD_USER_OR_I386 
/******************************************************************************* *******************************************************************************/ @@ -847,10 +942,10 @@ static boolean_t generic_reloc_has_got(u_int _type __unused) /******************************************************************************* *******************************************************************************/ static kern_return_t -generic_process_reloc(u_char *instruction, u_int length, u_int pcrel, - kxld_addr_t _base_pc, kxld_addr_t _link_pc, kxld_addr_t _link_disp __unused, - u_int _type, kxld_addr_t _target, kxld_addr_t _pair_target, - boolean_t swap __unused) +generic_process_reloc(const KXLDRelocator *relocator, u_char *instruction, + u_int length, u_int pcrel, kxld_addr_t _base_pc, kxld_addr_t _link_pc, + kxld_addr_t _link_disp __unused, u_int _type, kxld_addr_t _target, + kxld_addr_t _pair_target, boolean_t swap __unused) { kern_return_t rval = KERN_FAILURE; uint32_t base_pc = (uint32_t) _base_pc; @@ -873,6 +968,9 @@ generic_process_reloc(u_char *instruction, u_int length, u_int pcrel, if (swap) instr_data = OSSwapInt32(instr_data); #endif + rval = check_for_direct_pure_virtual_call(relocator, instr_data); + require_noerr(rval, finish); + switch (type) { case GENERIC_RELOC_VANILLA: instr_data += target; @@ -944,10 +1042,10 @@ static boolean_t ppc_reloc_has_got(u_int _type __unused) /******************************************************************************* *******************************************************************************/ static kern_return_t -ppc_process_reloc(u_char *instruction, u_int length, u_int pcrel, - kxld_addr_t _base_pc, kxld_addr_t _link_pc, kxld_addr_t _link_disp __unused, - u_int _type, kxld_addr_t _target, kxld_addr_t _pair_target __unused, - boolean_t swap __unused) +ppc_process_reloc(const KXLDRelocator *relocator __unused, u_char *instruction, + u_int length, u_int pcrel, kxld_addr_t _base_pc, kxld_addr_t _link_pc, + 
kxld_addr_t _link_disp __unused, u_int _type, kxld_addr_t _target, + kxld_addr_t _pair_target __unused, boolean_t swap __unused) { kern_return_t rval = KERN_FAILURE; uint32_t *instr_addr = NULL; @@ -975,6 +1073,9 @@ ppc_process_reloc(u_char *instruction, u_int length, u_int pcrel, if (swap) instr_data = OSSwapInt32(instr_data); #endif + rval = check_for_direct_pure_virtual_call(relocator, instr_data); + require_noerr(rval, finish); + switch (type) { case PPC_RELOC_VANILLA: require_action(!pcrel, finish, rval=KERN_FAILURE); @@ -1123,10 +1224,10 @@ x86_64_reloc_has_got(u_int _type) /******************************************************************************* *******************************************************************************/ static kern_return_t -x86_64_process_reloc(u_char *instruction, u_int length, u_int pcrel, - kxld_addr_t _base_pc __unused, kxld_addr_t _link_pc, kxld_addr_t _link_disp, - u_int _type, kxld_addr_t _target, kxld_addr_t _pair_target, - boolean_t swap __unused) +x86_64_process_reloc(const KXLDRelocator *relocator __unused, u_char *instruction, + u_int length, u_int pcrel, kxld_addr_t _base_pc __unused, + kxld_addr_t _link_pc, kxld_addr_t _link_disp, u_int _type, + kxld_addr_t _target, kxld_addr_t _pair_target, boolean_t swap __unused) { kern_return_t rval = KERN_FAILURE; enum reloc_type_x86_64 type = _type; @@ -1152,6 +1253,9 @@ x86_64_process_reloc(u_char *instruction, u_int length, u_int pcrel, if (swap) instr32 = OSSwapInt32(instr32); #endif + rval = check_for_direct_pure_virtual_call(relocator, instr32); + require_noerr(rval, finish); + /* There are a number of different small adjustments for pc-relative * relocation entries. 
The general case is to subtract the size of the * relocation (represented by the length parameter), and it applies to @@ -1251,6 +1355,9 @@ x86_64_process_reloc(u_char *instruction, u_int length, u_int pcrel, if (swap) instr64 = OSSwapInt64(instr64); #endif + rval = check_for_direct_pure_virtual_call(relocator, (u_long) instr64); + require_noerr(rval, finish); + switch (type) { case X86_64_RELOC_UNSIGNED: require_action(!pcrel, finish, rval=KERN_FAILURE); @@ -1349,10 +1456,11 @@ arm_reloc_has_got(u_int _type __unused) /******************************************************************************* *******************************************************************************/ static kern_return_t -arm_process_reloc(u_char *instruction, u_int length, u_int pcrel, - kxld_addr_t _base_pc __unused, kxld_addr_t _link_pc __unused, kxld_addr_t _link_disp __unused, - u_int _type __unused, kxld_addr_t _target __unused, kxld_addr_t _pair_target __unused, - boolean_t swap __unused) +arm_process_reloc(const KXLDRelocator *relocator __unused, u_char *instruction, + u_int length, u_int pcrel, kxld_addr_t _base_pc __unused, + kxld_addr_t _link_pc __unused, kxld_addr_t _link_disp __unused, + u_int _type __unused, kxld_addr_t _target __unused, + kxld_addr_t _pair_target __unused, boolean_t swap __unused) { kern_return_t rval = KERN_FAILURE; uint32_t *instr_addr = NULL; @@ -1375,9 +1483,11 @@ arm_process_reloc(u_char *instruction, u_int length, u_int pcrel, if (swap) instr_data = OSSwapInt32(instr_data); #endif + rval = check_for_direct_pure_virtual_call(relocator, instr_data); + require_noerr(rval, finish); + switch (type) { case ARM_RELOC_VANILLA: - require_action(!pcrel, finish, rval=KERN_FAILURE); instr_data += target; break; diff --git a/libkern/kxld/kxld_reloc.h b/libkern/kxld/kxld_reloc.h index 679a95870..40a610d1a 100644 --- a/libkern/kxld/kxld_reloc.h +++ b/libkern/kxld/kxld_reloc.h @@ -37,16 +37,23 @@ #endif struct kxld_array; +struct kxld_dict; +struct kxld_sect; +struct 
kxld_seg; struct kxld_sym; struct kxld_symtab; +struct kxld_vtable; +struct relocation_info; + typedef struct kxld_relocator KXLDRelocator; typedef struct kxld_reloc KXLDReloc; typedef boolean_t (*RelocHasPair)(u_int r_type); typedef boolean_t (*RelocIsPair)(u_int r_type, u_int prev_r_type); typedef boolean_t (*RelocHasGot)(u_int r_type); -typedef kern_return_t(*ProcessReloc)(u_char *instruction, u_int length, u_int pcrel, - kxld_addr_t base_pc, kxld_addr_t link_pc, kxld_addr_t link_disp, u_int type, +typedef kern_return_t(*ProcessReloc)(const KXLDRelocator *relocator, + u_char *instruction, u_int length, u_int pcrel, kxld_addr_t base_pc, + kxld_addr_t link_pc, kxld_addr_t link_disp, u_int type, kxld_addr_t target, kxld_addr_t pair_target, boolean_t swap); struct kxld_relocator { @@ -54,6 +61,12 @@ struct kxld_relocator { RelocIsPair reloc_is_pair; RelocHasGot reloc_has_got; ProcessReloc process_reloc; + const struct kxld_symtab *symtab; + const struct kxld_array *sectarray; + const struct kxld_dict *vtables; + const struct kxld_vtable *current_vtable; + u_char *file; + u_int function_align; /* Power of two alignment of functions */ boolean_t is_32_bit; boolean_t swap; }; @@ -69,18 +82,12 @@ struct kxld_reloc { u_int pcrel:1; }; -struct kxld_array; -struct kxld_sect; -struct kxld_seg; -struct kxld_symtab; -struct relocation_info; - /******************************************************************************* * Constructors and Destructors *******************************************************************************/ - -kern_return_t kxld_relocator_init(KXLDRelocator *relocator, cpu_type_t cputype, - cpu_subtype_t cpusubtype, boolean_t swap) +kern_return_t kxld_relocator_init(KXLDRelocator *relocator, u_char *file, + const struct kxld_symtab *symtab, const struct kxld_array *sectarray, + cpu_type_t cputype, cpu_subtype_t cpusubtype, boolean_t swap) __attribute__((nonnull,visibility("hidden"))); kern_return_t kxld_reloc_create_macho(struct kxld_array 
*relocarray, @@ -104,10 +111,13 @@ boolean_t kxld_relocator_is_pair(const KXLDRelocator *relocator, u_int r_type, boolean_t kxld_relocator_has_got(const KXLDRelocator *relocator, u_int r_type) __attribute__((pure, nonnull,visibility("hidden"))); +kxld_addr_t kxld_relocator_get_pointer_at_addr(const KXLDRelocator *relocator, + const u_char *data, u_long offset) + __attribute__((pure, nonnull,visibility("hidden"))); + struct kxld_sym * kxld_reloc_get_symbol(const KXLDRelocator *relocator, - const KXLDReloc *reloc, u_char *data, - const struct kxld_symtab *symtab) - __attribute__((pure, nonnull(1,2,4), visibility("hidden"))); + const KXLDReloc *reloc, const u_char *data) + __attribute__((pure, nonnull(1,2), visibility("hidden"))); kern_return_t kxld_reloc_get_reloc_index_by_offset(const struct kxld_array *relocs, kxld_size_t offset, u_int *idx) @@ -124,16 +134,18 @@ KXLDReloc * kxld_reloc_get_reloc_by_offset(const struct kxld_array *relocs, kern_return_t kxld_reloc_update_symindex(KXLDReloc *reloc, u_int symindex) __attribute__((nonnull,visibility("hidden"))); -kern_return_t kxld_relocator_process_sect_reloc(const KXLDRelocator *relocator, - const KXLDReloc *reloc, const struct kxld_sect *sect, - const struct kxld_array *sectarray, const struct kxld_symtab *symtab) +void kxld_relocator_set_vtables(KXLDRelocator *relocator, + const struct kxld_dict *vtables) + __attribute__((nonnull,visibility("hidden"))); + +kern_return_t kxld_relocator_process_sect_reloc(KXLDRelocator *relocator, + const KXLDReloc *reloc, const struct kxld_sect *sect) __attribute__((nonnull,visibility("hidden"))); -kern_return_t kxld_relocator_process_table_reloc(const KXLDRelocator *relocator, - const KXLDReloc *reloc, const struct kxld_seg *seg, u_char *file, - const struct kxld_array *sectarray, - const struct kxld_symtab *symtab) +kern_return_t kxld_relocator_process_table_reloc(KXLDRelocator *relocator, + const KXLDReloc *reloc, const struct kxld_seg *seg, kxld_addr_t link_addr) 
__attribute__((nonnull,visibility("hidden"))); #endif /* _KXLD_RELOC_H */ + diff --git a/libkern/kxld/kxld_sect.c b/libkern/kxld/kxld_sect.c index 0c286b5b6..d00d6596d 100644 --- a/libkern/kxld/kxld_sect.c +++ b/libkern/kxld/kxld_sect.c @@ -40,7 +40,7 @@ #include "kxld_util.h" static kern_return_t export_macho(const KXLDSect *sect, u_char *buf, u_long offset, - u_long bufsize, boolean_t is_32_bit); + u_long bufsize); #if KXLD_USER_OR_ILP32 static kern_return_t sect_export_macho_header_32(const KXLDSect *sect, u_char *buf, u_long *header_offset, u_long header_size, u_long data_offset); @@ -326,7 +326,7 @@ kxld_sect_align_address(const KXLDSect *sect, kxld_addr_t address) kern_return_t kxld_sect_export_macho_to_file_buffer(const KXLDSect *sect, u_char *buf, u_long *header_offset, u_long header_size, u_long *data_offset, - u_long data_size, boolean_t is_32_bit) + u_long data_size, boolean_t is_32_bit __unused) { kern_return_t rval = KERN_FAILURE; @@ -351,7 +351,7 @@ kxld_sect_export_macho_to_file_buffer(const KXLDSect *sect, u_char *buf, sect, buf, header_offset, header_size, *data_offset); require_noerr(rval, finish); - rval = export_macho(sect, buf, *data_offset, data_size, is_32_bit); + rval = export_macho(sect, buf, *data_offset, data_size); require_noerr(rval, finish); *data_offset += (u_long) sect->size; @@ -369,7 +369,7 @@ kern_return_t kxld_sect_export_macho_to_vm(const KXLDSect *sect, u_char *buf, u_long *header_offset, u_long header_size, kxld_addr_t link_addr, u_long data_size, - boolean_t is_32_bit) + boolean_t is_32_bit __unused) { kern_return_t rval = KERN_FAILURE; u_long data_offset = (u_long) (sect->link_addr - link_addr); @@ -383,7 +383,7 @@ kxld_sect_export_macho_to_vm(const KXLDSect *sect, u_char *buf, sect, buf, header_offset, header_size, data_offset); require_noerr(rval, finish); - rval = export_macho(sect, buf, data_offset, data_size, is_32_bit); + rval = export_macho(sect, buf, data_offset, data_size); require_noerr(rval, finish); rval = 
KERN_SUCCESS; @@ -395,8 +395,7 @@ kxld_sect_export_macho_to_vm(const KXLDSect *sect, u_char *buf, /******************************************************************************* *******************************************************************************/ static kern_return_t -export_macho(const KXLDSect *sect, u_char *buf, u_long offset, u_long bufsize, - boolean_t is_32_bit) +export_macho(const KXLDSect *sect, u_char *buf, u_long offset, u_long bufsize) { kern_return_t rval = KERN_FAILURE; @@ -424,11 +423,6 @@ export_macho(const KXLDSect *sect, u_char *buf, u_long offset, u_long bufsize, case S_NON_LAZY_SYMBOL_POINTERS: case S_MOD_INIT_FUNC_POINTERS: case S_MOD_TERM_FUNC_POINTERS: - require_action(!is_32_bit, finish, rval=KERN_FAILURE; - kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO - "Invalid section type in 32-bit kext: %u.", - sect->flags & SECTION_TYPE)); - /* Fall through */ case S_REGULAR: case S_CSTRING_LITERALS: case S_4BYTE_LITERALS: @@ -607,8 +601,7 @@ kxld_sect_populate_got(KXLDSect *sect, KXLDSymtab *symtab, /******************************************************************************* *******************************************************************************/ kern_return_t -kxld_sect_process_relocs(KXLDSect *sect, const KXLDRelocator *relocator, - const KXLDArray *sectarray, const KXLDSymtab *symtab) +kxld_sect_process_relocs(KXLDSect *sect, KXLDRelocator *relocator) { kern_return_t rval = KERN_FAILURE; KXLDReloc *reloc = NULL; @@ -616,13 +609,11 @@ kxld_sect_process_relocs(KXLDSect *sect, const KXLDRelocator *relocator, for (i = 0; i < sect->relocs.nitems; ++i) { reloc = kxld_array_get_item(§->relocs, i); - rval = kxld_relocator_process_sect_reloc(relocator, reloc, sect, - sectarray, symtab); + rval = kxld_relocator_process_sect_reloc(relocator, reloc, sect); require_noerr(rval, finish); } rval = KERN_SUCCESS; - finish: return rval; } diff --git a/libkern/kxld/kxld_sect.h b/libkern/kxld/kxld_sect.h index 
cf79fde75..2f655b4af 100644 --- a/libkern/kxld/kxld_sect.h +++ b/libkern/kxld/kxld_sect.h @@ -125,7 +125,7 @@ kxld_addr_t kxld_sect_align_address(const KXLDSect *sect, kxld_addr_t address) /* Returns the space required by the exported Mach-O header */ u_long kxld_sect_get_macho_header_size(boolean_t is_32_bit) - __attribute__((const, nonnull, visibility("hidden"))); + __attribute__((const, visibility("hidden"))); /* Returns the space required by the exported Mach-O data */ u_long kxld_sect_get_macho_data_size(const KXLDSect *sect) @@ -176,8 +176,7 @@ kern_return_t kxld_sect_populate_got(KXLDSect *sect, struct kxld_symtab *symtab, /* Processes all of a section's relocation entries */ kern_return_t kxld_sect_process_relocs(KXLDSect *sect, - const struct kxld_relocator *relocator, const KXLDArray *sectarray, - const struct kxld_symtab *symtab) + struct kxld_relocator *relocator) __attribute__((nonnull, visibility("hidden"))); #endif /* _KXLD_SECT_H_ */ diff --git a/libkern/kxld/kxld_seg.c b/libkern/kxld/kxld_seg.c index 5c11a1f9a..ba14b4917 100644 --- a/libkern/kxld/kxld_seg.c +++ b/libkern/kxld/kxld_seg.c @@ -41,6 +41,7 @@ #include "kxld_sect.h" #include "kxld_seg.h" +#include "kxld_symtab.h" #include "kxld_util.h" #define MAX_SEGS 20 @@ -402,6 +403,32 @@ reorder_section(KXLDArray *sects, u_int *sect_reorder_index, ++(*sect_reorder_index); } + +/******************************************************************************* +*******************************************************************************/ +kern_return_t +kxld_seg_init_linkedit(KXLDArray *segs) +{ + kern_return_t rval = KERN_FAILURE; + KXLDSeg *seg = NULL; + KXLDSeg *le = NULL; + + rval = kxld_array_resize(segs, 2); + require_noerr(rval, finish); + + seg = kxld_array_get_item(segs, 0); + le = kxld_array_get_item(segs, 1); + + strlcpy(le->segname, SEG_LINKEDIT, sizeof(le->segname)); + le->link_addr = round_page(seg->link_addr + seg->vmsize); + le->maxprot = VM_PROT_ALL; + le->initprot = VM_PROT_DEFAULT; 
+ + rval = KERN_SUCCESS; + +finish: + return rval; +} #endif /* KXLD_USER_OR_OBJECT */ /******************************************************************************* @@ -742,8 +769,14 @@ kxld_seg_finish_init(KXLDSeg *seg) void kxld_seg_set_vm_protections(KXLDSeg *seg, boolean_t strict_protections) { + /* This is unnecessary except to make the clang analyzer happy. When + * the analyzer no longer ignores nonnull attributes for if statements, + * we can remove this line. + */ + if (!seg) return; + if (strict_protections) { - if (streq_safe(seg->segname, SEG_TEXT, sizeof(SEG_TEXT))) { + if (streq_safe(seg->segname, SEG_TEXT, const_strlen(SEG_TEXT))) { seg->initprot = TEXT_SEG_PROT; seg->maxprot = VM_PROT_ALL; } else { @@ -771,3 +804,12 @@ kxld_seg_relocate(KXLDSeg *seg, kxld_addr_t link_addr) } } +/******************************************************************************* +*******************************************************************************/ +void +kxld_seg_populate_linkedit(KXLDSeg *seg, + const KXLDSymtab *symtab, boolean_t is_32_bit) +{ + seg->vmsize = round_page(kxld_symtab_get_macho_data_size(symtab, is_32_bit)); +} + diff --git a/libkern/kxld/kxld_seg.h b/libkern/kxld/kxld_seg.h index e6484bf1a..ab5abcdc6 100644 --- a/libkern/kxld/kxld_seg.h +++ b/libkern/kxld/kxld_seg.h @@ -39,6 +39,7 @@ #include "kxld_array.h" struct kxld_sect; +struct kxld_symtab; struct segment_command; struct segment_command_64; typedef struct kxld_seg KXLDSeg; @@ -77,6 +78,9 @@ kern_return_t kxld_seg_create_seg_from_sections(KXLDArray *segarray, kern_return_t kxld_seg_finalize_object_segment(KXLDArray *segarray, KXLDArray *section_order, u_long hdrsize) __attribute__((nonnull, visibility("hidden"))); + +kern_return_t kxld_seg_init_linkedit(KXLDArray *segs) + __attribute__((nonnull, visibility("hidden"))); #endif /* KXLD_USER_OR_OBJECT */ void kxld_seg_clear(KXLDSeg *seg) @@ -127,7 +131,11 @@ kern_return_t kxld_seg_finish_init(KXLDSeg *seg) void 
kxld_seg_set_vm_protections(KXLDSeg *seg, boolean_t strict_protections) __attribute__((nonnull, visibility("hidden"))); -void kxld_seg_relocate(KXLDSeg *Seg, kxld_addr_t link_addr) +void kxld_seg_relocate(KXLDSeg *seg, kxld_addr_t link_addr) + __attribute__((nonnull, visibility("hidden"))); + +void kxld_seg_populate_linkedit(KXLDSeg *seg, + const struct kxld_symtab *symtab, boolean_t is_32_bit) __attribute__((nonnull, visibility("hidden"))); #endif /* _KXLD_SEG_H_ */ diff --git a/libkern/kxld/kxld_state.c b/libkern/kxld/kxld_state.c deleted file mode 100644 index d3a06da19..000000000 --- a/libkern/kxld/kxld_state.c +++ /dev/null @@ -1,1072 +0,0 @@ -/* - * Copyright (c) 2008 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#include - -#if !KERNEL - #include -#endif - -#define DEBUG_ASSERT_COMPONENT_NAME_STRING "kxld" -#include - -#include "kxld_array.h" -#include "kxld_dict.h" -#include "kxld_kext.h" -#include "kxld_state.h" -#include "kxld_sym.h" -#include "kxld_symtab.h" -#include "kxld_util.h" -#include "kxld_vtable.h" - -#define LINK_STATE_MAGIC 0xF00DD00D -#define CIGAM_ETATS_KNIL 0x0DD00DF0 - -#define LINK_STATE_MAGIC_64 0xCAFEF00D -#define CIGAM_ETATS_KNIL_64 0x0DF0FECA - -#define LINK_STATE_VERSION 1 - -static kern_return_t init_string_index(KXLDDict *strings, KXLDArray *tmps, - KXLDSymtabIterator *iter, const KXLDArray *vtables, u_int nsymentries, - u_long *strsize); -static kern_return_t add_string_to_index(KXLDDict *strings, const char *str, - KXLDArray *tmps, u_int *tmpi, u_long *stroff); -static kern_return_t create_link_state(u_char **_file, u_long *_filesize, - const KXLDKext *kext, KXLDSymtabIterator *iter, const KXLDArray *vtables, - KXLDDict *strings, u_int nsyms, u_int nsymentries, u_long strsize); -static boolean_t state_is_32_bit(KXLDLinkStateHdr *state); - -#if KXLD_USER_OR_ILP32 -static kern_return_t get_symbols_32(KXLDState *state, KXLDDict *defined_symbols, - KXLDDict *obsolete_symbols); -static kern_return_t copy_symbols_32(u_char *file, u_long *data_offset, - KXLDSymtabIterator *iter, const KXLDDict *strings); -static kern_return_t copy_vtables_32(u_char *file, u_long *header_offset, - u_long *data_offset, const KXLDArray *vtables, const KXLDDict *strings); -#endif /* KXLD_USER_OR_ILP32*/ -#if KXLD_USER_OR_LP64 -static kern_return_t get_symbols_64(KXLDState *state, KXLDDict *defined_symbols, - KXLDDict *obsolete_symbols); -static kern_return_t copy_symbols_64(u_char *file, u_long *data_offset, - KXLDSymtabIterator *iter, const KXLDDict *strings); -static kern_return_t copy_vtables_64(u_char *file, u_long *header_offset, - u_long *data_offset, const KXLDArray *vtables, const KXLDDict *strings); -#endif /* 
KXLD_USER_OR_ILP64 */ - -#if !KERNEL -static boolean_t swap_link_state(u_char *state); -static void swap_link_state_32(u_char *state); -static void swap_link_state_64(u_char *state); -static boolean_t unswap_link_state(u_char *state); -static void unswap_link_state_32(u_char *state); -static void unswap_link_state_64(u_char *state); -static void swap_state_hdr(KXLDLinkStateHdr *state_hdr); -static void swap_vtable_hdr(KXLDVTableHdr *vtable_hdr); -static void swap_sym_entry_32(KXLDSymEntry32 *entry); -static void swap_sym_entry_64(KXLDSymEntry64 *entry); -#endif - -/******************************************************************************* -*******************************************************************************/ -kern_return_t -kxld_state_init_from_file(KXLDState *state, u_char *file, - KXLDArray *section_order __unused) -{ - kern_return_t rval = KERN_FAILURE; - KXLDLinkStateHdr *hdr = (KXLDLinkStateHdr *) file; -#if KXLD_USER_OR_OBJECT - KXLDSectionName *dstname = NULL; - KXLDSectionName *srcname = NULL; -#endif - KXLDVTableHdr *vhdr = NULL; - KXLDVTable *vtable = NULL; - u_int i = 0; - - check(state); - check(file); - -#if !KERNEL - /* Swap the link state file to host byte order for as long this kxld_state - * object owns the file. 
- */ - state->swap = swap_link_state(file); -#endif - require_action(hdr->magic == LINK_STATE_MAGIC || - hdr->magic == LINK_STATE_MAGIC_64, - finish, rval=KERN_FAILURE); - - state->file = file; - -#if KXLD_USER_OR_OBJECT - if (section_order && !section_order->nitems && hdr->nsects) { - rval = kxld_array_init(section_order, sizeof(*dstname), hdr->nsects); - require_noerr(rval, finish); - - srcname = (KXLDSectionName *) (file + hdr->sectoff); - for (i = 0; i < hdr->nsects; ++i, ++srcname) { - dstname = kxld_array_get_item(section_order, i); - memcpy(dstname, srcname, sizeof(*srcname)); - } - } -#endif - - rval = kxld_array_init(&state->vtables, sizeof(*vtable), hdr->nvtables); - require_noerr(rval, finish); - - vhdr = (KXLDVTableHdr *) (file + hdr->voff); - for (i = 0; i < hdr->nvtables; ++i, ++vhdr) { - vtable = kxld_array_get_item(&state->vtables, i); - KXLD_3264_FUNC(kxld_is_32_bit(hdr->cputype), rval, - kxld_vtable_init_from_link_state_32, - kxld_vtable_init_from_link_state_64, - vtable, file, vhdr); - require_noerr(rval, finish); - } - - rval = KERN_SUCCESS; - -finish: - return rval; -} - -/******************************************************************************* -*******************************************************************************/ -void -kxld_state_clear(KXLDState *state) -{ - KXLDVTable *vtable = NULL; - u_int i = 0; - - check(state); - -#if !KERNEL - /* We use kxld_state objects to wrap the link state files. Whenever the - * file is wrapped by a kxld_state object, the file is kept in host byte - * order. Once we are done, we must return it to target byte order. 
- */ - if (state->swap) (void)unswap_link_state(state->file); -#endif - - state->file = NULL; - state->swap = FALSE; - for (i = 0; i < state->vtables.nitems; ++i) { - vtable = kxld_array_get_item(&state->vtables, i); - kxld_vtable_clear(vtable); - } - kxld_array_reset(&state->vtables); -} - -/******************************************************************************* -*******************************************************************************/ -void -kxld_state_deinit(KXLDState *state) -{ - KXLDVTable *vtable = NULL; - u_int i = 0; - - check(state); - -#if !KERNEL - if (state->file && state->swap) (void)unswap_link_state(state->file); -#endif - - for (i = 0; i < state->vtables.maxitems; ++i) { - vtable = kxld_array_get_slot(&state->vtables, i); - kxld_vtable_deinit(vtable); - } - kxld_array_deinit(&state->vtables); - bzero(state, sizeof(*state)); -} - -/******************************************************************************* -*******************************************************************************/ -u_int -kxld_state_get_num_symbols(KXLDState *state) -{ - KXLDLinkStateHdr *hdr = (KXLDLinkStateHdr *) state->file; - - return hdr->nsyms; -} - -/******************************************************************************* -*******************************************************************************/ -kern_return_t -kxld_state_get_symbols(KXLDState *state, KXLDDict *defined_symbols, - KXLDDict *obsolete_symbols) -{ - KXLDLinkStateHdr * hdr = (KXLDLinkStateHdr *) state->file; - kern_return_t rval = KERN_FAILURE; - - check(state); - check(defined_symbols); - check(obsolete_symbols); - - require_action(hdr->magic == LINK_STATE_MAGIC || - hdr->magic == LINK_STATE_MAGIC_64, - finish, rval=KERN_FAILURE); - - KXLD_3264_FUNC(state_is_32_bit(hdr), rval, - get_symbols_32, get_symbols_64, - state, defined_symbols, obsolete_symbols); - require_noerr(rval, finish); - - rval = KERN_SUCCESS; - -finish: - return rval; -} - -#if KXLD_USER_OR_ILP32 
-/******************************************************************************* -*******************************************************************************/ -static kern_return_t -get_symbols_32(KXLDState *state, KXLDDict *defined_symbols, - KXLDDict *obsolete_symbols) -{ - kern_return_t rval = KERN_FAILURE; - KXLDLinkStateHdr *hdr = (KXLDLinkStateHdr *) state->file; - KXLDSymEntry32 *entry = NULL; - const char *name = NULL; - u_int i = 0; - - entry = (KXLDSymEntry32 *) (state->file + hdr->symoff); - for (i = 0; i < hdr->nsyms; ++i, ++entry) { - name = (const char *) (state->file + entry->nameoff); - rval = kxld_dict_insert(defined_symbols, name, &entry->addr); - require_noerr(rval, finish); - - if (entry->flags & KXLD_SYM_OBSOLETE) { - rval = kxld_dict_insert(obsolete_symbols, name, &entry->addr); - require_noerr(rval, finish); - } - } - - rval = KERN_SUCCESS; - -finish: - return rval; -} -#endif /* KXLD_USER_OR_ILP32 */ - -#if KXLD_USER_OR_LP64 -/******************************************************************************* -*******************************************************************************/ -static kern_return_t -get_symbols_64(KXLDState *state, KXLDDict *defined_symbols, - KXLDDict *obsolete_symbols) -{ - kern_return_t rval = KERN_FAILURE; - KXLDLinkStateHdr *hdr = (KXLDLinkStateHdr *) state->file; - KXLDSymEntry64 *entry = NULL; - const char *name = NULL; - u_int i = 0; - - entry = (KXLDSymEntry64 *) (state->file + hdr->symoff); - for (i = 0; i < hdr->nsyms; ++i, ++entry) { - name = (const char *) (state->file + entry->nameoff); - rval = kxld_dict_insert(defined_symbols, name, &entry->addr); - require_noerr(rval, finish); - - if (entry->flags & KXLD_SYM_OBSOLETE) { - rval = kxld_dict_insert(obsolete_symbols, name, &entry->addr); - require_noerr(rval, finish); - } - } - - rval = KERN_SUCCESS; - -finish: - return rval; -} -#endif /* KXLD_USER_OR_LP64 */ - -/******************************************************************************* 
-*******************************************************************************/ -u_int -kxld_state_get_num_vtables(KXLDState *state) -{ - return state->vtables.nitems; -} - -/******************************************************************************* -*******************************************************************************/ -kern_return_t -kxld_state_get_vtables(KXLDState *state, KXLDDict *patched_vtables) -{ - kern_return_t rval = KERN_FAILURE; - KXLDVTable *vtable = NULL; - u_int i = 0; - - check(state); - check(patched_vtables); - - for (i = 0; i < state->vtables.nitems; ++i) { - vtable = kxld_array_get_item(&state->vtables, i); - rval = kxld_dict_insert(patched_vtables, vtable->name, vtable); - require_noerr(rval, finish); - } - - rval = KERN_SUCCESS; - -finish: - return rval; -} - -/******************************************************************************* -*******************************************************************************/ -void -kxld_state_get_cputype(const KXLDState *state, cpu_type_t *cputype, - cpu_subtype_t *cpusubtype) -{ - KXLDLinkStateHdr *hdr = (KXLDLinkStateHdr *) state->file; - - check(state); - check(cputype); - check(cpusubtype); - - *cputype = hdr->cputype; - *cpusubtype = hdr->cpusubtype; -} - -/******************************************************************************* -*******************************************************************************/ -kern_return_t -kxld_state_export_kext_to_file(KXLDKext *kext, u_char **file, u_long *filesize, - KXLDDict *strings, KXLDArray *tmps) -{ - kern_return_t rval = KERN_FAILURE; - KXLDSymtabIterator iter; - const KXLDSymtab *symtab = NULL; - const KXLDArray *vtables = NULL; - const KXLDVTable *vtable = NULL; - u_int nsyms = 0; - u_int nsymentries = 0; - u_int i = 0; - u_long strsize = 0; - - check(kext); - check(file); - check(tmps); - - bzero(&iter, sizeof(iter)); - - /* Get the vtables and symbol tables from the kext */ - - kxld_kext_get_vtables(kext, &vtables); - 
symtab = kxld_kext_get_symtab(kext); - require_action(symtab, finish, rval=KERN_FAILURE); - - /* Count the number of symentries we'll need in the linkstate */ - - kxld_symtab_iterator_init(&iter, symtab, kxld_sym_is_exported, FALSE); - - nsyms = kxld_symtab_iterator_get_num_remaining(&iter); - nsymentries = nsyms; - for (i = 0; i < vtables->nitems; ++i) { - vtable = kxld_array_get_item(vtables, i); - nsymentries += vtable->entries.nitems; - } - - /* Initialize the string index */ - - rval = init_string_index(strings, tmps, &iter, vtables, nsymentries, - &strsize); - require_noerr(rval, finish); - - /* Create the linkstate file */ - - rval = create_link_state(file, filesize, kext, &iter, vtables, - strings, nsyms, nsymentries, strsize); - require_noerr(rval, finish); - - /* Swap if necessary */ - -#if !KERNEL - if (kxld_kext_target_needs_swap(kext)) unswap_link_state(*file); -#endif /* !KERNEL */ - - rval = KERN_SUCCESS; - -finish: - return rval; -} - -/******************************************************************************* -*******************************************************************************/ -static kern_return_t -init_string_index(KXLDDict *strings, KXLDArray *tmps, KXLDSymtabIterator *iter, - const KXLDArray *vtables, u_int nsymentries, u_long *_strsize) -{ - kern_return_t rval = KERN_SUCCESS; - const KXLDSym *sym = NULL; - const KXLDVTable *vtable = NULL; - const KXLDVTableEntry *ventry = NULL; - u_long strsize = 0; - u_int tmpi = 0; - u_int i = 0; - u_int j = 0; - - check(strings); - check(tmps); - check(iter); - check(vtables); - check(_strsize); - - *_strsize = 0; - - /* Initialize the string dictionary and string offset array */ - - rval = kxld_dict_init(strings, kxld_dict_string_hash, kxld_dict_string_cmp, - nsymentries); - require_noerr(rval, finish); - - rval = kxld_array_init(tmps, sizeof(u_long), nsymentries); - require_noerr(rval, finish); - - /* Add all of the strings from the symbol table to the dictionary */ - - 
kxld_symtab_iterator_reset(iter); - while ((sym = kxld_symtab_iterator_get_next(iter))) { - rval = add_string_to_index(strings, sym->name, tmps, &tmpi, &strsize); - require_noerr(rval, finish); - } - - /* Add all of the strings from the vtables entries to the dictionary */ - - for (i = 0; i < vtables->nitems; ++i) { - vtable = kxld_array_get_item(vtables, i); - rval = add_string_to_index(strings, vtable->name, tmps, &tmpi, &strsize); - require_noerr(rval, finish); - - for (j = 0; j < vtable->entries.nitems; ++j) { - ventry = kxld_array_get_item(&vtable->entries, j); - if (ventry->patched.name) { - rval = add_string_to_index(strings, ventry->patched.name, tmps, - &tmpi, &strsize); - require_noerr(rval, finish); - } - } - } - - *_strsize = strsize; - rval = KERN_SUCCESS; - -finish: - return rval; -} - -/******************************************************************************* -*******************************************************************************/ -static kern_return_t -add_string_to_index(KXLDDict *strings, const char *str, KXLDArray *tmps, - u_int *tmpi, u_long *stroff) -{ - kern_return_t rval = KERN_FAILURE; - u_long *tmpp = NULL; - - if (!kxld_dict_find(strings, str)) { - tmpp = kxld_array_get_item(tmps, (*tmpi)++); - *tmpp = *stroff; - - rval = kxld_dict_insert(strings, str, tmpp); - require_noerr(rval, finish); - - *stroff += strlen(str) + 1; - } - - rval = KERN_SUCCESS; - -finish: - return rval; -} - -/******************************************************************************* -*******************************************************************************/ -static boolean_t -state_is_32_bit(KXLDLinkStateHdr *state) -{ - return kxld_is_32_bit(state->cputype); -} - -/******************************************************************************* -*******************************************************************************/ -static kern_return_t -create_link_state(u_char **_file, u_long *_filesize, const KXLDKext *kext, - KXLDSymtabIterator 
*iter, const KXLDArray *vtables, KXLDDict *strings, - u_int nsyms, u_int nsymentries, u_long strsize) -{ - kern_return_t rval = KERN_SUCCESS; - u_char *file = NULL; - KXLDLinkStateHdr *hdr = NULL; - KXLDDictIterator striter; -#if KXLD_USER_OR_OBJECT - KXLDSectionName *dstsectname = NULL; - KXLDSectionName *srcsectname = NULL; - const KXLDArray *section_order = NULL; - u_int i = 0; -#endif - const char *name = NULL; - char *dstname = NULL; - u_long *stridx = 0; - u_long hsize = 0; - u_long dsize = 0; - u_long filesize = 0; - u_long hoff = 0; - u_long doff = 0; - u_long stroff = 0; - - check(_file); - check(iter); - check(vtables); - check(strings); - - *_file = NULL; - *_filesize = 0; - -#if KXLD_USER_OR_OBJECT - section_order = kxld_kext_get_section_order(kext); -#endif - - /* Calculate header and data size */ - - hsize = sizeof(KXLDLinkStateHdr); - hsize += vtables->nitems * sizeof(KXLDVTableHdr); -#if KXLD_USER_OR_OBJECT - if (section_order) { - hsize += section_order->nitems * sizeof(KXLDSectionName); - } -#endif - - if (kxld_kext_is_32_bit(kext)) { - dsize = nsymentries * sizeof(KXLDSymEntry32); - } else { - dsize = nsymentries * sizeof(KXLDSymEntry64); - } - - filesize = hsize + dsize + strsize; - - hoff = 0; - doff = hsize; - stroff = hsize + dsize; - - /* Allocate the link state */ - - file = kxld_alloc_pageable(filesize); - require_action(file, finish, rval=KERN_RESOURCE_SHORTAGE); - - /* Initialize link state header */ - - hdr = (KXLDLinkStateHdr *) file; - hoff += sizeof(*hdr); - - if (state_is_32_bit(hdr)) { - hdr->magic = LINK_STATE_MAGIC; - } else { - hdr->magic = LINK_STATE_MAGIC_64; - } - hdr->version = LINK_STATE_VERSION; - kxld_kext_get_cputype(kext, &hdr->cputype, &hdr->cpusubtype); - hdr->nsects = 0; - hdr->nvtables = vtables->nitems; - hdr->nsyms = nsyms; - -#if KXLD_USER_OR_OBJECT - if (section_order) { - hdr->nsects = section_order->nitems; - hdr->sectoff = (uint32_t) hoff; - - dstsectname = (KXLDSectionName *) (file + hoff); - hoff += 
section_order->nitems * sizeof(*dstsectname); - - for (i = 0; i < section_order->nitems; ++i, ++dstsectname) { - srcsectname = kxld_array_get_item(section_order, i); - memcpy(dstsectname, srcsectname, sizeof(*srcsectname)); - } - } -#endif - - hdr->voff = (uint32_t) hoff; - hdr->symoff = (uint32_t) doff; - - /* Copy strings */ - - kxld_dict_iterator_init(&striter, strings); - kxld_dict_iterator_get_next(&striter, (const void **) &name, (void **) &stridx); - while (name) { - *stridx += stroff; - dstname = (char *) (file + *stridx); - strlcpy(dstname, name, filesize - *stridx); - kxld_dict_iterator_get_next(&striter, (const void **) &name, (void **) &stridx); - } - - /* Copy symbols */ - - KXLD_3264_FUNC(state_is_32_bit(hdr), rval, - copy_symbols_32, copy_symbols_64, - file, &doff, iter, strings); - require_noerr(rval, finish); - - /* Copy vtables */ - - KXLD_3264_FUNC(state_is_32_bit(hdr), rval, - copy_vtables_32, copy_vtables_64, - file, &hoff, &doff, vtables, strings); - require_noerr(rval, finish); - - *_file = file; - *_filesize = filesize; - file = NULL; - rval = KERN_SUCCESS; - -finish: - - if (file) { - kxld_page_free(file, filesize); - file = NULL; - } - - return rval; -} - -#if KXLD_USER_OR_ILP32 -/******************************************************************************* -*******************************************************************************/ -static kern_return_t -copy_symbols_32(u_char *file, u_long *data_offset, KXLDSymtabIterator *iter, - const KXLDDict *strings) -{ - kern_return_t rval = KERN_FAILURE; - KXLDSymEntry32 *symentry = NULL; - const KXLDSym *sym = NULL; - u_long *stridx = 0; - - kxld_symtab_iterator_reset(iter); - while ((sym = kxld_symtab_iterator_get_next(iter))) { - symentry = (KXLDSymEntry32 *) (file + *data_offset); - stridx = kxld_dict_find(strings, sym->name); - require_action(stridx, finish, rval=KERN_FAILURE); - - /* Initialize the symentry */ - - symentry->nameoff = (uint32_t) *stridx; - if (sym->predicates.is_thumb) 
{ - symentry->addr = (uint32_t) sym->link_addr | 1; - } else { - symentry->addr = (uint32_t) sym->link_addr; - } - symentry->flags = 0; - - /* Set any flags */ - - symentry->flags |= (kxld_sym_is_obsolete(sym)) ? KXLD_SYM_OBSOLETE : 0; - - *data_offset += sizeof(*symentry); - } - - rval = KERN_SUCCESS; - -finish: - return rval; -} -#endif /* KXLD_USER_OR_ILP32 */ - -#if KXLD_USER_OR_LP64 -/******************************************************************************* -*******************************************************************************/ -static kern_return_t -copy_symbols_64(u_char *file, u_long *data_offset, KXLDSymtabIterator *iter, - const KXLDDict *strings) -{ - kern_return_t rval = KERN_FAILURE; - KXLDSymEntry64 *symentry = NULL; - const KXLDSym *sym = NULL; - u_long *stridx = 0; - - kxld_symtab_iterator_reset(iter); - while ((sym = kxld_symtab_iterator_get_next(iter))) { - symentry = (KXLDSymEntry64 *) (file + *data_offset); - stridx = kxld_dict_find(strings, sym->name); - require_action(stridx, finish, rval=KERN_FAILURE); - - /* Initialize the symentry */ - - symentry->nameoff = (uint32_t) *stridx; - symentry->addr = (uint64_t) sym->link_addr; - symentry->flags = 0; - - /* Set any flags */ - - symentry->flags |= (kxld_sym_is_obsolete(sym)) ? 
KXLD_SYM_OBSOLETE : 0; - - *data_offset += sizeof(*symentry); - } - - rval = KERN_SUCCESS; - -finish: - return rval; -} -#endif /* KXLD_USER_OR_LP64 */ - -#if KXLD_USER_OR_ILP32 -/******************************************************************************* -*******************************************************************************/ -static kern_return_t -copy_vtables_32(u_char *file, u_long *header_offset, u_long *data_offset, - const KXLDArray *vtables, const KXLDDict *strings) -{ - kern_return_t rval = KERN_FAILURE; - KXLDVTable *vtable = NULL; - KXLDVTableHdr *vhdr = NULL; - KXLDVTableEntry *ventry = NULL; - KXLDSymEntry32 *symentry = NULL; - u_long *stridx = 0; - u_int i = 0; - u_int j = 0; - - for (i = 0; i < vtables->nitems; ++i) { - vtable = kxld_array_get_item(vtables, i); - stridx = kxld_dict_find(strings, vtable->name); - require_action(stridx, finish, rval=KERN_FAILURE); - - vhdr = (KXLDVTableHdr *) (file + *header_offset); - vhdr->nameoff = (uint32_t) *stridx; - vhdr->nentries = vtable->entries.nitems; - vhdr->vtableoff = (uint32_t) (*data_offset); - - *header_offset += sizeof(*vhdr); - - for(j = 0; j < vtable->entries.nitems; ++j) { - - ventry = kxld_array_get_item(&vtable->entries, j); - symentry = (KXLDSymEntry32 *) (file + *data_offset); - - if (ventry->patched.name) { - stridx = kxld_dict_find(strings, ventry->patched.name); - require_action(stridx, finish, rval=KERN_FAILURE); - - symentry->nameoff = (uint32_t) *stridx; - symentry->addr = (uint32_t) ventry->patched.addr; - } else { - symentry->nameoff = 0; - symentry->addr = 0; - } - - *data_offset += sizeof(*symentry); - } - } - - rval = KERN_SUCCESS; - -finish: - return rval; -} -#endif /* KXLD_USER_OR_ILP32 */ - -#if KXLD_USER_OR_LP64 -/******************************************************************************* -*******************************************************************************/ -static kern_return_t -copy_vtables_64(u_char *file, u_long *header_offset, u_long 
*data_offset, - const KXLDArray *vtables, const KXLDDict *strings) -{ - kern_return_t rval = KERN_FAILURE; - KXLDVTable *vtable = NULL; - KXLDVTableHdr *vhdr = NULL; - KXLDVTableEntry *ventry = NULL; - KXLDSymEntry64 *symentry = NULL; - u_long *stridx = 0; - u_int i = 0; - u_int j = 0; - - for (i = 0; i < vtables->nitems; ++i) { - vtable = kxld_array_get_item(vtables, i); - stridx = kxld_dict_find(strings, vtable->name); - require_action(stridx, finish, rval=KERN_FAILURE); - - vhdr = (KXLDVTableHdr *) (file + *header_offset); - vhdr->nameoff = (uint32_t) *stridx; - vhdr->nentries = vtable->entries.nitems; - vhdr->vtableoff = (uint32_t) (*data_offset); - - *header_offset += sizeof(*vhdr); - - for(j = 0; j < vtable->entries.nitems; ++j) { - - ventry = kxld_array_get_item(&vtable->entries, j); - symentry = (KXLDSymEntry64 *) (file + *data_offset); - - if (ventry->patched.name) { - stridx = kxld_dict_find(strings, ventry->patched.name); - require_action(stridx, finish, rval=KERN_FAILURE); - - symentry->nameoff = (uint32_t) *stridx; - symentry->addr = (uint64_t) ventry->patched.addr; - } else { - symentry->nameoff = 0; - symentry->addr = 0; - } - - *data_offset += sizeof(*symentry); - } - } - - rval = KERN_SUCCESS; - -finish: - return rval; -} -#endif /* KXLD_USER_OR_LP64 */ - -#if !KERNEL -/******************************************************************************* -*******************************************************************************/ -static boolean_t -swap_link_state(u_char *state) -{ - KXLDLinkStateHdr *state_hdr = (KXLDLinkStateHdr *) state; - - if (state_hdr->magic == CIGAM_ETATS_KNIL) { - swap_link_state_32(state); - return TRUE; - } else if (state_hdr->magic == CIGAM_ETATS_KNIL_64) { - swap_link_state_64(state); - return TRUE; - } - - return FALSE; -} - -/******************************************************************************* -*******************************************************************************/ -static void 
-swap_link_state_32(u_char *state) -{ - KXLDLinkStateHdr *state_hdr = NULL; - KXLDVTableHdr *vtable_hdr = NULL; - KXLDSymEntry32 *entry = NULL; - u_int i = 0; - u_int j = 0; - - state_hdr = (KXLDLinkStateHdr *) state; - - if (state_hdr->magic != CIGAM_ETATS_KNIL) return; - - /* Swap the header */ - swap_state_hdr(state_hdr); - - /* Swap the symbols */ - entry = (KXLDSymEntry32 *) (state + state_hdr->symoff); - for (i = 0; i < state_hdr->nsyms; ++i, ++entry) { - swap_sym_entry_32(entry); - } - - /* Swap the vtable headers and entries */ - vtable_hdr = (KXLDVTableHdr *) (state + state_hdr->voff); - for (i = 0; i < state_hdr->nvtables; ++i, ++vtable_hdr) { - swap_vtable_hdr(vtable_hdr); - - entry = (KXLDSymEntry32 *) (state + vtable_hdr->vtableoff); - for (j = 0; j < vtable_hdr->nentries; ++j, ++entry) { - swap_sym_entry_32(entry); - } - } -} - -/******************************************************************************* -*******************************************************************************/ -static void -swap_link_state_64(u_char *state) -{ - KXLDLinkStateHdr *state_hdr = NULL; - KXLDVTableHdr *vtable_hdr = NULL; - KXLDSymEntry64 *entry = NULL; - u_int i = 0; - u_int j = 0; - - state_hdr = (KXLDLinkStateHdr *) state; - - if (state_hdr->magic != CIGAM_ETATS_KNIL_64) return; - - /* Swap the header */ - swap_state_hdr(state_hdr); - - /* Swap the symbols */ - entry = (KXLDSymEntry64 *) (state + state_hdr->symoff); - for (i = 0; i < state_hdr->nsyms; ++i, ++entry) { - swap_sym_entry_64(entry); - } - - /* Swap the vtable headers and entries */ - vtable_hdr = (KXLDVTableHdr *) (state + state_hdr->voff); - for (i = 0; i < state_hdr->nvtables; ++i, ++vtable_hdr) { - swap_vtable_hdr(vtable_hdr); - - entry = (KXLDSymEntry64 *) (state + vtable_hdr->vtableoff); - for (j = 0; j < vtable_hdr->nentries; ++j, ++entry) { - swap_sym_entry_64(entry); - } - } -} - -/******************************************************************************* 
-*******************************************************************************/ -static boolean_t -unswap_link_state(u_char *state) -{ - KXLDLinkStateHdr *state_hdr = (KXLDLinkStateHdr *) state; - - if (state_hdr->magic == LINK_STATE_MAGIC) { - unswap_link_state_32(state); - return TRUE; - } else if (state_hdr->magic == LINK_STATE_MAGIC_64) { - unswap_link_state_64(state); - return TRUE; - } - - return FALSE; -} - -/******************************************************************************* -*******************************************************************************/ -static void -unswap_link_state_32(u_char *state) -{ - KXLDLinkStateHdr *state_hdr = NULL; - KXLDVTableHdr *vtable_hdr = NULL; - KXLDSymEntry32 *entry = NULL; - u_int i = 0; - u_int j = 0; - - state_hdr = (KXLDLinkStateHdr *) state; - - if (state_hdr->magic != LINK_STATE_MAGIC) return; - - /* Unswap the vtables and their headers */ - vtable_hdr = (KXLDVTableHdr *) (state + state_hdr->voff); - for (i = 0; i < state_hdr->nvtables; ++i, ++vtable_hdr) { - entry = (KXLDSymEntry32 *) (state + vtable_hdr->vtableoff); - for (j = 0; j < vtable_hdr->nentries; ++j, ++entry) { - swap_sym_entry_32(entry); - } - - swap_vtable_hdr(vtable_hdr); - } - - /* Unswap the symbols themselves */ - entry = (KXLDSymEntry32 *) (state + state_hdr->symoff); - for (i = 0; i < state_hdr->nsyms; ++i, ++entry) { - swap_sym_entry_32(entry); - } - - /* Unswap the header */ - swap_state_hdr(state_hdr); -} - -/******************************************************************************* -*******************************************************************************/ -static void -unswap_link_state_64(u_char *state) -{ - KXLDLinkStateHdr *state_hdr = NULL; - KXLDVTableHdr *vtable_hdr = NULL; - KXLDSymEntry64 *entry = NULL; - u_int i = 0; - u_int j = 0; - - state_hdr = (KXLDLinkStateHdr *) state; - - if (state_hdr->magic != LINK_STATE_MAGIC_64) return; - - /* Unswap the vtables and their headers */ - vtable_hdr = (KXLDVTableHdr 
*) (state + state_hdr->voff); - for (i = 0; i < state_hdr->nvtables; ++i, ++vtable_hdr) { - entry = (KXLDSymEntry64 *) (state + vtable_hdr->vtableoff); - for (j = 0; j < vtable_hdr->nentries; ++j, ++entry) { - swap_sym_entry_64(entry); - } - - swap_vtable_hdr(vtable_hdr); - } - - /* Unswap the symbols themselves */ - entry = (KXLDSymEntry64 *) (state + state_hdr->symoff); - for (i = 0; i < state_hdr->nsyms; ++i, ++entry) { - swap_sym_entry_64(entry); - } - - /* Unswap the header */ - swap_state_hdr(state_hdr); -} - -/******************************************************************************* -*******************************************************************************/ -static void -swap_state_hdr(KXLDLinkStateHdr *state_hdr) -{ - state_hdr->magic = OSSwapInt32(state_hdr->magic); - state_hdr->version = OSSwapInt32(state_hdr->version); - state_hdr->cputype = OSSwapInt32(state_hdr->cputype); - state_hdr->cpusubtype = OSSwapInt32(state_hdr->cpusubtype); - state_hdr->nsects = OSSwapInt32(state_hdr->nsects); - state_hdr->sectoff = OSSwapInt32(state_hdr->sectoff); - state_hdr->nvtables = OSSwapInt32(state_hdr->nvtables); - state_hdr->voff = OSSwapInt32(state_hdr->voff); - state_hdr->nsyms = OSSwapInt32(state_hdr->nsyms); - state_hdr->symoff = OSSwapInt32(state_hdr->symoff); -} - -/******************************************************************************* -*******************************************************************************/ -static void -swap_vtable_hdr(KXLDVTableHdr *vtable_hdr) -{ - vtable_hdr->nameoff = OSSwapInt32(vtable_hdr->nameoff); - vtable_hdr->vtableoff = OSSwapInt32(vtable_hdr->vtableoff); - vtable_hdr->nentries = OSSwapInt32(vtable_hdr->nentries); -} - -/******************************************************************************* -*******************************************************************************/ -static void -swap_sym_entry_32(KXLDSymEntry32 *entry) -{ - entry->nameoff = OSSwapInt32(entry->nameoff); - entry->addr = 
OSSwapInt32(entry->addr); -} - -/******************************************************************************* -*******************************************************************************/ -static void -swap_sym_entry_64(KXLDSymEntry64 *entry) -{ - entry->nameoff = OSSwapInt32(entry->nameoff); - entry->addr = OSSwapInt64(entry->addr); -} -#endif /* !KERNEL */ - diff --git a/libkern/kxld/kxld_state.h b/libkern/kxld/kxld_state.h deleted file mode 100644 index 22878159c..000000000 --- a/libkern/kxld/kxld_state.h +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Copyright (c) 2008 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#ifndef _KXLD_STATE_H_ -#define _KXLD_STATE_H_ - -#include -#if KERNEL - #include -#else - #include "kxld_types.h" -#endif - -#include "kxld_array.h" -#include "kxld_util.h" - -struct kxld_dict; -struct kxld_kext; -struct kxld_link_state_hdr; -typedef struct kxld_state KXLDState; -typedef struct kxld_link_state_hdr KXLDLinkStateHdr; -typedef struct kxld_vtable_hdr KXLDVTableHdr; -typedef struct kxld_sym_entry_32 KXLDSymEntry32; -typedef struct kxld_sym_entry_64 KXLDSymEntry64; - -struct kxld_state { - u_char *file; - KXLDArray vtables; - boolean_t swap; -}; - -/* - * The format of the link state object is as follows: - - * Field *** Type * - ************************************************** - * Link state header *** KXLDLinkStateHdr * - ************************************************** - * Section order entries *** KXLDSectionName * - ************************************************** - * Vtable headers *** KXLDVTableHdr * - ************************************************** - * VTables *** KXLDSymEntry[32|64] * - ************************************************** - * Exported symbols *** KXLDSymEntry[32|64] * - ************************************************** - * String table *** char[] * - ************************************************** - - */ - -struct kxld_link_state_hdr { - uint32_t magic; - uint32_t version; - cpu_type_t cputype; - cpu_subtype_t cpusubtype; - uint32_t nsects; - uint32_t sectoff; - uint32_t nvtables; - uint32_t voff; - uint32_t nsyms; - uint32_t symoff; -}; - -struct kxld_vtable_hdr { - uint32_t nameoff; - uint32_t vtableoff; - uint32_t nentries; -}; - -struct kxld_sym_entry_32 { - uint32_t addr; - uint32_t nameoff; - uint32_t flags; -}; - -struct kxld_sym_entry_64 { - uint64_t addr; - uint32_t nameoff; - uint32_t flags; -} __attribute__((aligned(16))); - -#define KXLD_SYM_OBSOLETE 0x1 - -/******************************************************************************* -* Constructors 
and destructors -*******************************************************************************/ - -kern_return_t kxld_state_init_from_file(KXLDState *state, u_char *file, - KXLDArray *section_order) - __attribute__((nonnull(1,2), visibility("hidden"))); - -void kxld_state_clear(KXLDState *state) - __attribute__((nonnull, visibility("hidden"))); - -void kxld_state_deinit(KXLDState *state) - __attribute__((nonnull, visibility("hidden"))); - -/******************************************************************************* -* Accessors -*******************************************************************************/ - -u_int kxld_state_get_num_symbols(KXLDState *state) - __attribute__((pure, nonnull, visibility("hidden"))); - -kern_return_t kxld_state_get_symbols(KXLDState *state, - struct kxld_dict *defined_symbols, - struct kxld_dict *obsolete_symbols) - __attribute__((nonnull, visibility("hidden"))); - -u_int kxld_state_get_num_vtables(KXLDState *state) - __attribute__((pure, nonnull, visibility("hidden"))); - -kern_return_t kxld_state_get_vtables(KXLDState *state, - struct kxld_dict *patched_vtables) - __attribute__((nonnull, visibility("hidden"))); - -void kxld_state_get_cputype(const KXLDState *state, - cpu_type_t *cputype, cpu_subtype_t *cpusubtype) - __attribute__((nonnull, visibility("hidden"))); - -/******************************************************************************* -* Exporters -*******************************************************************************/ - -kern_return_t kxld_state_export_kext_to_file(struct kxld_kext *kext, u_char **file, - u_long *filesize, struct kxld_dict *tmpdict, KXLDArray *tmps) - __attribute__((nonnull, visibility("hidden"))); - -#endif /* _KXLD_STATE_H_ */ - diff --git a/libkern/kxld/kxld_stubs.c b/libkern/kxld/kxld_stubs.c index 511e82a10..2b10ce687 100644 --- a/libkern/kxld/kxld_stubs.c +++ b/libkern/kxld/kxld_stubs.c @@ -30,6 +30,7 @@ * These kxld stubs panic if the kernel is built without kxld support but * 
something tries to use it anyway. */ +#if KERNEL #if !CONFIG_KXLD @@ -56,19 +57,15 @@ kxld_destroy_context(KXLDContext *context __unused) kern_return_t kxld_link_file( - KXLDContext *context __unused, - u_char *file __unused, - u_long size __unused, - const char *name, - void *callback_data __unused, - u_char **deps __unused, - u_int ndeps __unused, - u_char **_linked_object __unused, - kxld_addr_t *kmod_info_kern __unused, - u_char **_link_state __unused, - u_long *_link_state_size __unused, - u_char **_symbol_file __unused, - u_long *_symbol_file_size __unused) + KXLDContext * context __unused, + u_char * file __unused, + u_long size __unused, + const char * name __unused, + void * callback_data __unused, + KXLDDependency * dependencies __unused, + u_int ndependencies __unused, + u_char ** linked_object_out __unused, + kxld_addr_t * kmod_info_kern __unused) { panic("%s (%s) called in kernel without kxld support", __PRETTY_FUNCTION__, name); return KERN_SUCCESS; @@ -81,3 +78,5 @@ kxld_validate_copyright_string(const char *str __unused) } #endif + +#endif /* KERNEL */ diff --git a/libkern/kxld/kxld_sym.c b/libkern/kxld/kxld_sym.c index 5dbd6b860..2e9cb16e9 100644 --- a/libkern/kxld/kxld_sym.c +++ b/libkern/kxld/kxld_sym.c @@ -2,7 +2,7 @@ * Copyright (c) 2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ #include @@ -146,7 +146,7 @@ kxld_sym_init_absolute(KXLDSym *sym, char *name, kxld_addr_t link_addr) sym->sectnum = NO_SECT; init_predicates(sym, N_ABS | N_EXT, 0); - sym->predicates.is_resolved = TRUE; + sym->is_resolved = TRUE; } /******************************************************************************* @@ -160,7 +160,7 @@ init_predicates(KXLDSym *sym, u_char n_type, u_short n_desc) /* The type field is interpreted differently for normal symbols and stabs */ if (n_type & N_STAB) { - sym->predicates.is_stab = 1; + sym->is_stab = 1; switch (n_type) { /* Labeled as NO_SECT in stab.h */ @@ -180,7 +180,7 @@ init_predicates(KXLDSym *sym, u_char n_type, u_short n_desc) case N_LENG: case N_OPT: case N_OSO: - sym->predicates.is_absolute = 1; + sym->is_absolute = 1; break; /* Labeled as n_sect in stab.h */ case N_FUN: @@ -198,9 +198,9 @@ init_predicates(KXLDSym *sym, u_char n_type, u_short n_desc) * section-based on OS X. We must mark them as such so they get * relocated. 
*/ - case N_LBRAC: case N_RBRAC: - sym->predicates.is_section = 1; + case N_LBRAC: + sym->is_section = 1; break; default: rval = KERN_FAILURE; @@ -214,42 +214,25 @@ init_predicates(KXLDSym *sym, u_char n_type, u_short n_desc) } else { u_char type = n_type & N_TYPE; - /* Set the type-independent fields */ - if ((n_type & N_EXT) && !(n_type & N_PEXT)) { - sym->predicates.is_external = 1; - } - - if (n_desc & N_DESC_DISCARDED) { - sym->predicates.is_obsolete = 1; - } - - if (n_desc & N_WEAK_REF) { - sym->predicates.is_weak = 1; - } - - if (n_desc & N_ARM_THUMB_DEF) { - sym->predicates.is_thumb = 1; - } - /* The first set of type fields are mutually exclusive, so they can be * set with a switch statement. */ switch (type) { case N_ABS: - sym->predicates.is_absolute = 1; + sym->is_absolute = 1; break; case N_SECT: - sym->predicates.is_section = 1; + sym->is_section = 1; break; case N_UNDF: if (sym->base_addr) { - sym->predicates.is_common = 1; + sym->is_common = 1; } else { - sym->predicates.is_undefined = 1; + sym->is_undefined = 1; } break; case N_INDR: - sym->predicates.is_indirect = 1; + sym->is_indirect = 1; break; default: rval = KERN_FAILURE; @@ -258,28 +241,47 @@ init_predicates(KXLDSym *sym, u_char n_type, u_short n_desc) goto finish; } + /* Set the type-independent fields */ + if ((n_type & N_EXT) && !(n_type & N_PEXT)) { + sym->is_external = 1; + } + + if (n_desc & N_DESC_DISCARDED) { + sym->is_obsolete = 1; + } + + if (n_desc & N_WEAK_REF) { + sym->is_weak = 1; + } + + if (n_desc & N_ARM_THUMB_DEF) { + sym->is_thumb = 1; + sym->base_addr |= 1; + sym->link_addr |= 1; + } + /* Set the C++-specific fields */ - if ((0 == strncmp(CXX_PREFIX, sym->name, const_strlen(CXX_PREFIX)))) { - sym->predicates.is_cxx = 1; + if ((streq_safe(CXX_PREFIX, sym->name, const_strlen(CXX_PREFIX)))) { + sym->is_cxx = 1; - if (0 == strncmp(sym->name, METACLASS_VTABLE_PREFIX, + if (streq_safe(sym->name, METACLASS_VTABLE_PREFIX, const_strlen(METACLASS_VTABLE_PREFIX))) { - 
sym->predicates.is_meta_vtable = 1; - } else if (0 == strncmp(sym->name, VTABLE_PREFIX, + sym->is_meta_vtable = 1; + } else if (streq_safe(sym->name, VTABLE_PREFIX, const_strlen(VTABLE_PREFIX))) { - sym->predicates.is_class_vtable = 1; + sym->is_class_vtable = 1; } else if (kxld_strstr(sym->name, RESERVED_TOKEN)) { - sym->predicates.is_padslot = 1; + sym->is_padslot = 1; } else if (kxld_strstr(sym->name, METACLASS_TOKEN)) { - sym->predicates.is_metaclass = 1; + sym->is_metaclass = 1; } else if (kxld_strstr(sym->name, SUPER_METACLASS_POINTER_TOKEN)) { - sym->predicates.is_super_metaclass_pointer = 1; + sym->is_super_metaclass_pointer = 1; } - } else if (streq_safe(CXX_PURE_VIRTUAL, sym->name, sizeof(CXX_PURE_VIRTUAL))) { - sym->predicates.is_cxx = 1; - sym->predicates.is_pure_virtual = 1; + } else if (kxld_sym_name_is_pure_virtual(sym->name)) { + sym->is_cxx = 1; + sym->is_pure_virtual = 1; } } @@ -305,8 +307,8 @@ init_sym_sectnum(KXLDSym *sym, u_int n_sect) /* Convert the section number to an index into the section index */ sym->sectnum = n_sect - 1; } else { - sym->predicates.is_absolute = 1; - sym->predicates.is_section = 0; + sym->is_absolute = 1; + sym->is_section = 0; } } @@ -338,7 +340,7 @@ kxld_sym_is_absolute(const KXLDSym *sym) { check(sym); - return (0 != sym->predicates.is_absolute); + return (0 != sym->is_absolute); } /******************************************************************************* @@ -348,7 +350,7 @@ kxld_sym_is_section(const KXLDSym *sym) { check(sym); - return (0 != sym->predicates.is_section); + return (0 != sym->is_section); } /******************************************************************************* @@ -359,7 +361,7 @@ kxld_sym_is_defined(const KXLDSym *sym) check(sym); return ((kxld_sym_is_absolute(sym) || kxld_sym_is_section(sym)) && - !sym->predicates.is_replaced); + !kxld_sym_is_replaced(sym)); } @@ -370,7 +372,7 @@ kxld_sym_is_defined_locally(const KXLDSym *sym) { check(sym); - return (kxld_sym_is_defined(sym) && 
!sym->predicates.is_resolved); + return (kxld_sym_is_defined(sym) && !sym->is_resolved); } /******************************************************************************* @@ -380,7 +382,7 @@ kxld_sym_is_external(const KXLDSym *sym) { check(sym); - return (0 != sym->predicates.is_external); + return (0 != sym->is_external); } /******************************************************************************* @@ -400,7 +402,7 @@ kxld_sym_is_undefined(const KXLDSym *sym) { check(sym); - return (0 != sym->predicates.is_undefined); + return (0 != sym->is_undefined); } /******************************************************************************* @@ -410,7 +412,17 @@ kxld_sym_is_indirect(const KXLDSym *sym) { check(sym); - return (0 != sym->predicates.is_indirect); + return (0 != sym->is_indirect); +} + +/******************************************************************************* +*******************************************************************************/ +boolean_t +kxld_sym_is_replaced(const KXLDSym *sym) +{ + check(sym); + + return (0 != sym->is_replaced); } /******************************************************************************* @@ -420,7 +432,7 @@ kxld_sym_is_common(const KXLDSym *sym) { check(sym); - return (0 != sym->predicates.is_common); + return (0 != sym->is_common); } /******************************************************************************* @@ -428,7 +440,7 @@ kxld_sym_is_common(const KXLDSym *sym) boolean_t kxld_sym_is_unresolved(const KXLDSym *sym) { - return ((kxld_sym_is_undefined(sym) && !sym->predicates.is_replaced) || + return ((kxld_sym_is_undefined(sym) && !kxld_sym_is_replaced(sym)) || kxld_sym_is_indirect(sym) || kxld_sym_is_common(sym)); } @@ -437,7 +449,7 @@ kxld_sym_is_unresolved(const KXLDSym *sym) boolean_t kxld_sym_is_obsolete(const KXLDSym *sym) { - return (0 != sym->predicates.is_obsolete); + return (0 != sym->is_obsolete); } #if KXLD_USER_OR_GOT @@ -448,7 +460,7 @@ kxld_sym_is_got(const KXLDSym *sym) { check(sym); - 
return (0 != sym->predicates.is_got); + return (0 != sym->is_got); } #endif /* KXLD_USER_OR_GOT */ @@ -459,7 +471,7 @@ kxld_sym_is_stab(const KXLDSym *sym) { check(sym); - return (0 != sym->predicates.is_stab); + return (0 != sym->is_stab); } /******************************************************************************* @@ -469,7 +481,7 @@ kxld_sym_is_weak(const KXLDSym *sym) { check(sym); - return (0 != sym->predicates.is_weak); + return (0 != sym->is_weak); } /******************************************************************************* @@ -479,7 +491,7 @@ kxld_sym_is_cxx(const KXLDSym *sym) { check(sym); - return (0 != sym->predicates.is_cxx); + return (0 != sym->is_cxx); } /******************************************************************************* @@ -487,7 +499,7 @@ kxld_sym_is_cxx(const KXLDSym *sym) boolean_t kxld_sym_is_pure_virtual(const KXLDSym *sym) { - return (0 != sym->predicates.is_pure_virtual); + return (0 != sym->is_pure_virtual); } /******************************************************************************* @@ -507,7 +519,7 @@ kxld_sym_is_class_vtable(const KXLDSym *sym) { check(sym); - return (0 != sym->predicates.is_class_vtable); + return (0 != sym->is_class_vtable); } /******************************************************************************* @@ -517,7 +529,7 @@ kxld_sym_is_metaclass_vtable(const KXLDSym *sym) { check(sym); - return (0 != sym->predicates.is_meta_vtable); + return (0 != sym->is_meta_vtable); } /******************************************************************************* @@ -527,7 +539,7 @@ kxld_sym_is_padslot(const KXLDSym *sym) { check(sym); - return (0 != sym->predicates.is_padslot); + return (0 != sym->is_padslot); } /******************************************************************************* @@ -537,7 +549,7 @@ kxld_sym_is_metaclass(const KXLDSym *sym) { check(sym); - return (0 != sym->predicates.is_metaclass); + return (0 != sym->is_metaclass); } 
/******************************************************************************* @@ -547,7 +559,15 @@ kxld_sym_is_super_metaclass_pointer(const KXLDSym *sym) { check(sym); - return (0 != sym->predicates.is_super_metaclass_pointer); + return (0 != sym->is_super_metaclass_pointer); +} + +/******************************************************************************* +*******************************************************************************/ +boolean_t +kxld_sym_name_is_pure_virtual(const char *name) +{ + return streq_safe(CXX_PURE_VIRTUAL, name, sizeof(CXX_PURE_VIRTUAL)); } /******************************************************************************* @@ -813,7 +833,7 @@ extract_inner_string(const char *str, const char *prefix, const char *suffix, void kxld_sym_set_got(KXLDSym *sym) { - sym->predicates.is_got = 1; + sym->is_got = 1; } #endif /* KXLD_USER_OR_GOT */ @@ -833,7 +853,7 @@ kxld_sym_relocate(KXLDSym *sym, const KXLDSect *sect) *******************************************************************************/ kern_return_t kxld_sym_export_macho_32(const KXLDSym *sym, u_char *_nl, char *strtab, - u_long *stroff, u_long strsize, boolean_t is_link_state) + u_long *stroff, u_long strsize) { kern_return_t rval = KERN_FAILURE; struct nlist *nl = (struct nlist *) _nl; @@ -849,17 +869,14 @@ kxld_sym_export_macho_32(const KXLDSym *sym, u_char *_nl, char *strtab, require_action((u_long)bytes <= strsize - *stroff, finish, rval = KERN_FAILURE); - if (is_link_state) { - nl->n_type = N_ABS | N_EXT; - nl->n_sect = NO_SECT; - nl->n_desc = 0; - } else { - nl->n_type = sym->type; - nl->n_sect = (kxld_sym_is_section(sym)) ? sym->relocated_sectnum + 1 : 0; - nl->n_desc = sym->desc; - } + nl->n_type = sym->type; + nl->n_sect = (kxld_sym_is_section(sym)) ? 
sym->relocated_sectnum + 1 : 0; + nl->n_desc = sym->desc; nl->n_un.n_strx = (uint32_t) *stroff; nl->n_value = (uint32_t) sym->link_addr; + if (sym->is_thumb) { + nl->n_value &= ~0x1U; + } str = (char *) (strtab + *stroff); strlcpy(str, sym->name, strsize - *stroff); @@ -877,7 +894,7 @@ kxld_sym_export_macho_32(const KXLDSym *sym, u_char *_nl, char *strtab, *******************************************************************************/ kern_return_t kxld_sym_export_macho_64(const KXLDSym *sym, u_char *_nl, char *strtab, - u_long *stroff, u_long strsize, boolean_t is_link_state) + u_long *stroff, u_long strsize) { kern_return_t rval = KERN_FAILURE; struct nlist_64 *nl = (struct nlist_64 *) _nl; @@ -893,17 +910,14 @@ kxld_sym_export_macho_64(const KXLDSym *sym, u_char *_nl, char *strtab, require_action((u_long)bytes <= strsize - *stroff, finish, rval = KERN_FAILURE); - if (is_link_state) { - nl->n_type = N_ABS | N_EXT; - nl->n_sect = NO_SECT; - nl->n_desc = 0; - } else { - nl->n_type = sym->type; - nl->n_sect = (kxld_sym_is_section(sym)) ? sym->relocated_sectnum + 1 : 0; - nl->n_desc = sym->desc; - } + nl->n_type = sym->type; + nl->n_sect = (kxld_sym_is_section(sym)) ? 
sym->relocated_sectnum + 1 : 0; + nl->n_desc = sym->desc; nl->n_un.n_strx = (uint32_t) *stroff; nl->n_value = (uint64_t) sym->link_addr; + if (sym->is_thumb) { + nl->n_value &= ~0x1ULL; + } str = (char *) (strtab + *stroff); strlcpy(str, sym->name, strsize - *stroff); @@ -919,7 +933,7 @@ kxld_sym_export_macho_64(const KXLDSym *sym, u_char *_nl, char *strtab, /******************************************************************************* *******************************************************************************/ kern_return_t -kxld_sym_resolve(KXLDSym *sym, kxld_addr_t addr, boolean_t export_sym) +kxld_sym_resolve(KXLDSym *sym, kxld_addr_t addr) { kern_return_t rval = KERN_FAILURE; @@ -934,18 +948,16 @@ kxld_sym_resolve(KXLDSym *sym, kxld_addr_t addr, boolean_t export_sym) sym->type = N_ABS | N_EXT; sym->sectnum = NO_SECT; - /* Set the predicate bits for an externally resolved symbol. We re-export - * indirect symbols and any symbols that the caller wants re-exported (for - * example, symbols from a pseudo-kext). */ + /* Set the predicate bits for an externally resolved symbol. 
*/ - sym->predicates.is_external = TRUE; - sym->predicates.is_absolute = TRUE; - sym->predicates.is_resolved = !(kxld_sym_is_indirect(sym) || export_sym); - + sym->is_external = TRUE; + sym->is_absolute = TRUE; + sym->is_resolved = TRUE; + /* Clear the predicate bits for types that can be resolved */ - sym->predicates.is_undefined = FALSE; - sym->predicates.is_indirect = FALSE; + sym->is_undefined = FALSE; + sym->is_indirect = FALSE; rval = KERN_SUCCESS; @@ -973,12 +985,12 @@ kxld_sym_resolve_common(KXLDSym *sym, u_int sectnum, kxld_addr_t base_addr) sym->sectnum = sectnum; sym->desc = 0; - sym->predicates.is_absolute = FALSE; - sym->predicates.is_section = TRUE; - sym->predicates.is_undefined = FALSE; - sym->predicates.is_indirect = FALSE; - sym->predicates.is_common = FALSE; - sym->predicates.is_external = TRUE; + sym->is_absolute = FALSE; + sym->is_section = TRUE; + sym->is_undefined = FALSE; + sym->is_indirect = FALSE; + sym->is_common = FALSE; + sym->is_external = TRUE; rval = KERN_SUCCESS; @@ -996,7 +1008,7 @@ kxld_sym_delete(KXLDSym *sym) check(sym); bzero(sym, sizeof(*sym)); - sym->predicates.is_replaced = TRUE; + sym->is_replaced = TRUE; } @@ -1007,7 +1019,7 @@ kxld_sym_patch(KXLDSym *sym) { check(sym); - sym->predicates.is_replaced = TRUE; + sym->is_replaced = TRUE; } /******************************************************************************* @@ -1018,6 +1030,6 @@ kxld_sym_mark_private(KXLDSym *sym) check(sym); sym->type |= N_PEXT; - sym->predicates.is_external = FALSE; + sym->is_external = FALSE; } diff --git a/libkern/kxld/kxld_sym.h b/libkern/kxld/kxld_sym.h index 237586263..69cb8cbf7 100644 --- a/libkern/kxld/kxld_sym.h +++ b/libkern/kxld/kxld_sym.h @@ -48,12 +48,11 @@ struct kxld_sym { kxld_addr_t base_addr; // The symbol's base address kxld_addr_t link_addr; // The relocated address kxld_addr_t got_addr; // The address of this symbol's GOT entry + uint16_t desc; uint8_t type; uint8_t sectnum; // The symbol's section number uint8_t 
relocated_sectnum; - uint16_t desc; - struct { - u_int is_absolute:1, // Set for absolute symbols + u_int is_absolute:1, // Set for absolute symbols is_section:1, // Set for section symbols is_undefined:1, // Set for undefined symbols is_indirect:1, // Set for indirect symbols @@ -74,7 +73,6 @@ struct kxld_sym { is_metaclass:1, // Set for metaclass symbols is_super_metaclass_pointer:1, // Set for super metaclass pointer syms is_thumb:1; // Set for thumb symbols (ARM only) - } predicates; }; /******************************************************************************* @@ -128,6 +126,9 @@ boolean_t kxld_sym_is_undefined(const KXLDSym *sym) boolean_t kxld_sym_is_indirect(const KXLDSym *sym) __attribute__((pure, nonnull, visibility("hidden"))); +boolean_t kxld_sym_is_replaced(const KXLDSym *sym) + __attribute__((pure, nonnull, visibility("hidden"))); + /* We don't wrap this in KXLD_USER_OR_COMMON because even though common symbols * aren't always supported, we always need to be able to detect them. 
*/ @@ -175,6 +176,9 @@ boolean_t kxld_sym_is_metaclass(const KXLDSym *sym) boolean_t kxld_sym_is_super_metaclass_pointer(const KXLDSym *sym) __attribute__((pure, nonnull, visibility("hidden"))); +boolean_t kxld_sym_name_is_pure_virtual(const char *name) + __attribute__((pure, nonnull, visibility("hidden"))); + boolean_t kxld_sym_name_is_padslot(const char *name) __attribute__((pure, nonnull, visibility("hidden"))); @@ -224,13 +228,13 @@ u_long kxld_sym_get_function_prefix_from_class_name(const char *class_name, #if KXLD_USER_OR_ILP32 kern_return_t kxld_sym_export_macho_32(const KXLDSym *sym, u_char *nl, - char *strtab, u_long *stroff, u_long strsize, boolean_t is_link_state) + char *strtab, u_long *stroff, u_long strsize) __attribute__((nonnull, visibility("hidden"))); #endif #if KXLD_USER_OR_LP64 kern_return_t kxld_sym_export_macho_64(const KXLDSym *sym, u_char *nl, - char *strtab, u_long *stroff, u_long strsize, boolean_t is_link_state) + char *strtab, u_long *stroff, u_long strsize) __attribute__((nonnull, visibility("hidden"))); #endif @@ -246,8 +250,7 @@ void kxld_sym_set_got(KXLDSym *sym) __attribute__((nonnull, visibility("hidden"))); #endif /* KXLD_USER_OR_GOT */ -kern_return_t kxld_sym_resolve(KXLDSym *sym, const kxld_addr_t addr, - boolean_t export_sym) +kern_return_t kxld_sym_resolve(KXLDSym *sym, const kxld_addr_t addr) __attribute__((nonnull, visibility("hidden"))); #if KXLD_USER_OR_COMMON diff --git a/libkern/kxld/kxld_symtab.c b/libkern/kxld/kxld_symtab.c index 569bd1bbe..6700774f4 100644 --- a/libkern/kxld/kxld_symtab.c +++ b/libkern/kxld/kxld_symtab.c @@ -47,33 +47,33 @@ struct kxld_symtab { KXLDDict name_index; char *strings; u_int strsize; + boolean_t cxx_index_initialized; + boolean_t name_index_initialized; }; /******************************************************************************* * Prototypes *******************************************************************************/ -static kern_return_t init_macho(KXLDSymtab *symtab, u_char 
*macho, - struct symtab_command *src, kxld_addr_t linkedit_offset, boolean_t is_32_bit) - __attribute__((nonnull)); +static kern_return_t init_macho(KXLDSymtab *symtab, struct symtab_command *src, + u_char *macho, KXLDSeg * kernel_linkedit_seg, + boolean_t is_32_bit) + __attribute__((nonnull(1,2))); #if KXLD_USER_OR_ILP32 -static kern_return_t init_syms_32(KXLDSymtab *symtab, u_char *macho, u_long offset, +static kern_return_t init_syms_32(KXLDSymtab *symtab, u_char *macho, u_long offset, u_int nsyms); #endif #if KXLD_USER_OR_LP64 -static kern_return_t init_syms_64(KXLDSymtab *symtab, u_char *macho, u_long offset, +static kern_return_t init_syms_64(KXLDSymtab *symtab, u_char *macho, u_long offset, u_int nsyms); #endif -static kern_return_t make_cxx_index(KXLDSymtab *symtab) +static void restrict_private_symbols(KXLDSymtab *symtab) __attribute__((nonnull)); static boolean_t sym_is_defined_cxx(const KXLDSym *sym); -static kern_return_t make_name_index(KXLDSymtab *symtab) - __attribute__((nonnull)); static boolean_t sym_is_name_indexed(const KXLDSym *sym); - /******************************************************************************* *******************************************************************************/ size_t @@ -86,10 +86,11 @@ kxld_symtab_sizeof() /******************************************************************************* *******************************************************************************/ kern_return_t -kxld_symtab_init_from_macho_32(KXLDSymtab *symtab, u_char *macho, - struct symtab_command *src, kxld_addr_t linkedit_offset) +kxld_symtab_init_from_macho_32(KXLDSymtab *symtab, struct symtab_command *src, + u_char *macho, KXLDSeg * kernel_linkedit_seg) { - return init_macho(symtab, macho, src, linkedit_offset, TRUE); + return init_macho(symtab, src, macho, kernel_linkedit_seg, + /* is_32_bit */ TRUE); } #endif /* KXLD_USER_ILP32 */ @@ -97,24 +98,28 @@ kxld_symtab_init_from_macho_32(KXLDSymtab *symtab, u_char *macho, 
/******************************************************************************* *******************************************************************************/ kern_return_t -kxld_symtab_init_from_macho_64(KXLDSymtab *symtab, u_char *macho, - struct symtab_command *src, kxld_addr_t linkedit_offset) +kxld_symtab_init_from_macho_64(KXLDSymtab *symtab, struct symtab_command *src, + u_char *macho, KXLDSeg * kernel_linkedit_seg) { - return init_macho(symtab, macho, src, linkedit_offset, FALSE); + return init_macho(symtab, src, macho, kernel_linkedit_seg, + /* is_32_bit */ FALSE); } #endif /* KXLD_USER_OR_LP64 */ /******************************************************************************* *******************************************************************************/ static kern_return_t -init_macho(KXLDSymtab *symtab, u_char *macho, struct symtab_command *src, - kxld_addr_t linkedit_offset, boolean_t is_32_bit __unused) +init_macho(KXLDSymtab *symtab, struct symtab_command *src, + u_char *macho, KXLDSeg * kernel_linkedit_seg, + boolean_t is_32_bit __unused) { kern_return_t rval = KERN_FAILURE; + u_long symoff; + u_char * macho_or_linkedit = macho; check(symtab); - check(macho); check(src); + check(macho); /* Initialize the symbol array */ @@ -123,26 +128,45 @@ init_macho(KXLDSymtab *symtab, u_char *macho, struct symtab_command *src, /* Initialize the string table */ - symtab->strings = (char *) (macho + src->stroff + linkedit_offset); + if (kernel_linkedit_seg) { + + /* If initing the kernel file in memory, we can't trust + * the symtab offsets directly, because the kernel file has been mapped + * into memory and the mach-o offsets are disk-based. + * + * The symoff is an offset relative to the linkedit segment + * so we just subtract the fileoffset of the linkedit segment + * to get its relative start. + * + * The strings table is an actual pointer, so we calculate that from + * the linkedit's vmaddr. + * + * Further, the init_syms_... 
functions need an adjusted base + * pointer instead of the beginning of the macho, so we substitute + * the base of the linkedit segment. + */ + + symoff = (u_long)(src->symoff - kernel_linkedit_seg->fileoff); + symtab->strings = (char *)(uintptr_t)kernel_linkedit_seg->base_addr + + src->stroff - kernel_linkedit_seg->fileoff; + macho_or_linkedit = (u_char *)(uintptr_t)kernel_linkedit_seg->base_addr; + } else { + symoff = (u_long)src->symoff; + symtab->strings = (char *) (macho + src->stroff); + } + symtab->strsize = src->strsize; /* Initialize the symbols */ KXLD_3264_FUNC(is_32_bit, rval, init_syms_32, init_syms_64, - symtab, macho, (u_long) (src->symoff + linkedit_offset), src->nsyms); - require_noerr(rval, finish); - - /* Create the C++ index */ - - rval = make_cxx_index(symtab); - require_noerr(rval, finish); - - /* Create the name index */ - - rval = make_name_index(symtab); + symtab, macho_or_linkedit, symoff, src->nsyms); require_noerr(rval, finish); + /* Some symbols must be forced private for compatibility */ + (void) restrict_private_symbols(symtab); + /* Save the output */ rval = KERN_SUCCESS; @@ -153,6 +177,7 @@ init_macho(KXLDSymtab *symtab, u_char *macho, struct symtab_command *src, #if KXLD_USER_OR_ILP32 /******************************************************************************* +* In the running kernel, 'macho' is actually the start of the linkedit segment. *******************************************************************************/ static kern_return_t init_syms_32(KXLDSymtab *symtab, u_char *macho, u_long offset, u_int nsyms) @@ -179,6 +204,7 @@ init_syms_32(KXLDSymtab *symtab, u_char *macho, u_long offset, u_int nsyms) #if KXLD_USER_OR_LP64 /******************************************************************************* +* In the running kernel, 'macho' is actually the start of the linkedit segment. 
*******************************************************************************/ static kern_return_t init_syms_64(KXLDSymtab *symtab, u_char *macho, u_long offset, u_int nsyms) @@ -203,6 +229,41 @@ init_syms_64(KXLDSymtab *symtab, u_char *macho, u_long offset, u_int nsyms) } #endif /* KXLD_USER_OR_LP64 */ +/******************************************************************************* +* Temporary workaround for PR-6668105 +* new, new[], delete, and delete[] may be overridden globally in a kext. +* We should do this with some sort of weak symbols, but we'll use a whitelist +* for now to minimize risk. +*******************************************************************************/ +static void +restrict_private_symbols(KXLDSymtab *symtab) +{ + const char *private_symbols[] = { + KXLD_KMOD_INFO_SYMBOL, + KXLD_OPERATOR_NEW_SYMBOL, + KXLD_OPERATOR_NEW_ARRAY_SYMBOL, + KXLD_OPERATOR_DELETE_SYMBOL, + KXLD_OPERATOR_DELETE_ARRAY_SYMBOL + }; + KXLDSymtabIterator iter; + KXLDSym *sym = NULL; + const char *name = NULL; + u_int i = 0; + + kxld_symtab_iterator_init(&iter, symtab, kxld_sym_is_exported, FALSE); + while ((sym = kxld_symtab_iterator_get_next(&iter))) { + for (i = 0; i < const_array_len(private_symbols); ++i) { + name = private_symbols[i]; + if (!streq(sym->name, name)) { + continue; + } + + kxld_sym_mark_private(sym); + } + } +} + + /******************************************************************************* *******************************************************************************/ void @@ -229,6 +290,10 @@ kxld_symtab_clear(KXLDSymtab *symtab) kxld_array_clear(&symtab->syms); kxld_dict_clear(&symtab->cxx_index); kxld_dict_clear(&symtab->name_index); + symtab->strings = NULL; + symtab->strsize = 0; + symtab->cxx_index_initialized = 0; + symtab->name_index_initialized = 0; } /******************************************************************************* @@ -241,6 +306,7 @@ kxld_symtab_deinit(KXLDSymtab *symtab) kxld_array_deinit(&symtab->syms); 
kxld_dict_deinit(&symtab->cxx_index); kxld_dict_deinit(&symtab->name_index); + bzero(symtab, sizeof(*symtab)); } /******************************************************************************* @@ -265,8 +331,28 @@ kxld_symtab_get_symbol_by_index(const KXLDSymtab *symtab, u_int idx) /******************************************************************************* *******************************************************************************/ -KXLDSym * +KXLDSym * kxld_symtab_get_symbol_by_name(const KXLDSymtab *symtab, const char *name) +{ + KXLDSym *sym = NULL; + u_int i = 0; + + for (i = 0; i < symtab->syms.nitems; ++i) { + sym = kxld_array_get_item(&symtab->syms, i); + + if (streq(sym->name, name)) { + return sym; + } + } + + return NULL; +} + +/******************************************************************************* +*******************************************************************************/ +KXLDSym * +kxld_symtab_get_locally_defined_symbol_by_name(const KXLDSymtab *symtab, + const char *name) { check(symtab); check(name); @@ -281,14 +367,7 @@ kxld_symtab_get_cxx_symbol_by_value(const KXLDSymtab *symtab, kxld_addr_t value) { check(symtab); - /* - * value may hold a THUMB address (with bit 0 set to 1) but the index will - * have the real address (bit 0 set to 0). So if bit 0 is set here, - * we clear it (should impact no architectures but ARM). 
- */ - kxld_addr_t v = value & ~1; - - return kxld_dict_find(&symtab->cxx_index, &v); + return kxld_dict_find(&symtab->cxx_index, &value); } /******************************************************************************* @@ -319,8 +398,7 @@ kxld_symtab_get_macho_header_size(void) /******************************************************************************* *******************************************************************************/ u_long -kxld_symtab_get_macho_data_size(const KXLDSymtab *symtab, - boolean_t is_link_state, boolean_t is_32_bit) +kxld_symtab_get_macho_data_size(const KXLDSymtab *symtab, boolean_t is_32_bit) { KXLDSymtabIterator iter; KXLDSym *sym = NULL; @@ -329,12 +407,8 @@ kxld_symtab_get_macho_data_size(const KXLDSymtab *symtab, check(symtab); - if (is_link_state) { - kxld_symtab_iterator_init(&iter, symtab, kxld_sym_is_exported, FALSE); - } else { - kxld_symtab_iterator_init(&iter, symtab, - kxld_sym_is_defined_locally, FALSE); - } + kxld_symtab_iterator_init(&iter, symtab, + kxld_sym_is_defined_locally, FALSE); while ((sym = kxld_symtab_iterator_get_next(&iter))) { size += strlen(sym->name) + 1; @@ -356,7 +430,7 @@ kern_return_t kxld_symtab_export_macho(const KXLDSymtab *symtab, u_char *buf, u_long *header_offset, u_long header_size, u_long *data_offset, u_long data_size, - boolean_t is_link_state, boolean_t is_32_bit) + boolean_t is_32_bit) { kern_return_t rval = KERN_FAILURE; KXLDSymtabIterator iter; @@ -386,12 +460,8 @@ kxld_symtab_export_macho(const KXLDSymtab *symtab, u_char *buf, /* Find the size of the symbol and string tables */ - if (is_link_state) { - kxld_symtab_iterator_init(&iter, symtab, kxld_sym_is_exported, FALSE); - } else { - kxld_symtab_iterator_init(&iter, symtab, - kxld_sym_is_defined_locally, FALSE); - } + kxld_symtab_iterator_init(&iter, symtab, + kxld_sym_is_defined_locally, FALSE); while ((sym = kxld_symtab_iterator_get_next(&iter))) { symtabhdr->nsyms++; @@ -421,7 +491,7 @@ kxld_symtab_export_macho(const 
KXLDSymtab *symtab, u_char *buf, KXLD_3264_FUNC(is_32_bit, rval, kxld_sym_export_macho_32, kxld_sym_export_macho_64, - sym, nl, strtab, &stroff, symtabhdr->strsize, is_link_state); + sym, nl, strtab, &stroff, symtabhdr->strsize); require_noerr(rval, finish); nl += nlistsize; @@ -447,8 +517,6 @@ kxld_symtab_iterator_get_num_remaining(const KXLDSymtabIterator *iter) check(iter); - idx = iter->idx; - for (idx = iter->idx; idx < iter->symtab->syms.nitems; ++idx) { count += iter->test(kxld_array_get_item(&iter->symtab->syms, idx)); } @@ -458,8 +526,8 @@ kxld_symtab_iterator_get_num_remaining(const KXLDSymtabIterator *iter) /******************************************************************************* *******************************************************************************/ -static kern_return_t -make_cxx_index(KXLDSymtab *symtab) +kern_return_t +kxld_symtab_index_cxx_symbols_by_value(KXLDSymtab *symtab) { kern_return_t rval = KERN_FAILURE; KXLDSymtabIterator iter; @@ -468,6 +536,11 @@ make_cxx_index(KXLDSymtab *symtab) check(symtab); + if (symtab->cxx_index_initialized) { + rval = KERN_SUCCESS; + goto finish; + } + /* Count the number of C++ symbols */ kxld_symtab_iterator_init(&iter, symtab, sym_is_defined_cxx, FALSE); nsyms = kxld_symtab_iterator_get_num_remaining(&iter); @@ -483,10 +556,9 @@ make_cxx_index(KXLDSymtab *symtab) require_noerr(rval, finish); } + symtab->cxx_index_initialized = TRUE; rval = KERN_SUCCESS; - finish: - return rval; } @@ -500,8 +572,8 @@ sym_is_defined_cxx(const KXLDSym *sym) /******************************************************************************* *******************************************************************************/ -static kern_return_t -make_name_index(KXLDSymtab *symtab) +kern_return_t +kxld_symtab_index_symbols_by_name(KXLDSymtab *symtab) { kern_return_t rval = KERN_FAILURE; KXLDSymtabIterator iter; @@ -510,6 +582,11 @@ make_name_index(KXLDSymtab *symtab) check(symtab); + if (symtab->name_index_initialized) { + 
rval = KERN_SUCCESS; + goto finish; + } + /* Count the number of symbols we need to index by name */ kxld_symtab_iterator_init(&iter, symtab, sym_is_name_indexed, FALSE); nsyms = kxld_symtab_iterator_get_num_remaining(&iter); @@ -525,23 +602,18 @@ make_name_index(KXLDSymtab *symtab) require_noerr(rval, finish); } + symtab->name_index_initialized = TRUE; rval = KERN_SUCCESS; - finish: return rval; } - /******************************************************************************* *******************************************************************************/ static boolean_t sym_is_name_indexed(const KXLDSym *sym) { - return (kxld_sym_is_vtable(sym) || - streq_safe(sym->name, KXLD_KMOD_INFO_SYMBOL, - const_strlen(KXLD_KMOD_INFO_SYMBOL)) || - streq_safe(sym->name, KXLD_WEAK_TEST_SYMBOL, - const_strlen(KXLD_WEAK_TEST_SYMBOL))); + return (kxld_sym_is_defined_locally(sym) && !kxld_sym_is_stab(sym)); } /******************************************************************************* diff --git a/libkern/kxld/kxld_symtab.h b/libkern/kxld/kxld_symtab.h index cc2d91cec..a5a038756 100644 --- a/libkern/kxld/kxld_symtab.h +++ b/libkern/kxld/kxld_symtab.h @@ -37,6 +37,7 @@ #endif #include "kxld_sym.h" +#include "kxld_seg.h" struct kxld_array; struct symtab_command; @@ -55,18 +56,18 @@ struct kxld_symtab_iterator { *******************************************************************************/ size_t kxld_symtab_sizeof(void) - __attribute__((const, nonnull, visibility("hidden"))); + __attribute__((const, visibility("hidden"))); #if KXLD_USER_OR_ILP32 -kern_return_t kxld_symtab_init_from_macho_32(KXLDSymtab *symtab, u_char *macho, - struct symtab_command *src, kxld_addr_t linkedit_offset) - __attribute__((nonnull, visibility("hidden"))); +kern_return_t kxld_symtab_init_from_macho_32(KXLDSymtab *symtab, + struct symtab_command *src, u_char *macho, KXLDSeg * kernel_linkedit_seg) + __attribute__((nonnull(1,2), visibility("hidden"))); #endif /* KXLD_USER_OR_ILP32 */ #if 
KXLD_USER_OR_LP64 -kern_return_t kxld_symtab_init_from_macho_64(KXLDSymtab *symtab, u_char *macho, - struct symtab_command *src, kxld_addr_t linkedit_offset) - __attribute__((nonnull, visibility("hidden"))); +kern_return_t kxld_symtab_init_from_macho_64(KXLDSymtab *symtab, + struct symtab_command *src, u_char *macho, KXLDSeg * kernel_linkedit_seg) + __attribute__((nonnull(1,2), visibility("hidden"))); #endif /* KXLD_USER_OR_ILP64 */ void kxld_symtab_iterator_init(KXLDSymtabIterator *iter, @@ -89,30 +90,34 @@ u_int kxld_symtab_get_num_symbols(const KXLDSymtab *symtab) KXLDSym * kxld_symtab_get_symbol_by_index(const KXLDSymtab *symtab, u_int idx) __attribute__((pure, nonnull, visibility("hidden"))); -KXLDSym * kxld_symtab_get_symbol_by_name(const KXLDSymtab *symtab, +KXLDSym * kxld_symtab_get_symbol_by_name(const KXLDSymtab *symtab, const char *name) __attribute__((pure, nonnull, visibility("hidden"))); +KXLDSym * kxld_symtab_get_locally_defined_symbol_by_name( + const KXLDSymtab *symtab, const char *name) + __attribute__((pure, nonnull, visibility("hidden"))); + KXLDSym * kxld_symtab_get_cxx_symbol_by_value(const KXLDSymtab *symtab, kxld_addr_t value) __attribute__((pure, nonnull, visibility("hidden"))); kern_return_t kxld_symtab_get_sym_index(const KXLDSymtab *symtab, const KXLDSym * sym, u_int *idx) - __attribute__((pure, nonnull, visibility("hidden"))); + __attribute__((nonnull, visibility("hidden"))); u_long kxld_symtab_get_macho_header_size(void) __attribute__((pure, visibility("hidden"))); u_long kxld_symtab_get_macho_data_size(const KXLDSymtab *symtab, - boolean_t is_link_state, boolean_t is_32_bit) + boolean_t is_32_bit) __attribute__((pure, nonnull, visibility("hidden"))); kern_return_t kxld_symtab_export_macho(const KXLDSymtab *symtab, u_char *buf, u_long *header_offset, u_long header_size, u_long *data_offset, u_long data_size, - boolean_t is_link_state, boolean_t is_32_bit) + boolean_t is_32_bit) __attribute__((nonnull, visibility("hidden"))); u_int 
kxld_symtab_iterator_get_num_remaining(const KXLDSymtabIterator *iter) @@ -122,6 +127,12 @@ u_int kxld_symtab_iterator_get_num_remaining(const KXLDSymtabIterator *iter) * Modifiers *******************************************************************************/ +kern_return_t kxld_symtab_index_symbols_by_name(KXLDSymtab *symtab) + __attribute__((nonnull, visibility("hidden"))); + +kern_return_t kxld_symtab_index_cxx_symbols_by_value(KXLDSymtab *symtab) + __attribute__((nonnull, visibility("hidden"))); + kern_return_t kxld_symtab_relocate(KXLDSymtab *symtab, const struct kxld_array *sectarray) __attribute__((nonnull, visibility("hidden"))); diff --git a/libkern/kxld/kxld_util.c b/libkern/kxld/kxld_util.c index 35dc1066b..2f7a10643 100644 --- a/libkern/kxld/kxld_util.c +++ b/libkern/kxld/kxld_util.c @@ -105,8 +105,8 @@ kxld_log(KXLDLogSubsystem subsystem, KXLDLogLevel level, alloc_buffer = kxld_alloc(length); if (!alloc_buffer) return; - snprintf(alloc_buffer, sizeof(alloc_buffer), "kxld[%s]: %s", - name, format); + snprintf(alloc_buffer, length, "kxld[%s]: %s", + name, in_format); format = alloc_buffer; } @@ -310,6 +310,17 @@ validate_and_swap_macho_32(u_char *file, u_long size kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO "Invalid magic number: 0x%x.", mach_hdr->magic)); + /* If in the running kernel, and asked to validate the kernel + * (which is the only file of type MH_EXECUTE we should ever see), + * then just assume it's ok or we wouldn't be running to begin with. 
+ */ +#if KERNEL + if (mach_hdr->filetype == MH_EXECUTE) { + rval = KERN_SUCCESS; + goto finish; + } +#endif /* KERNEL */ + /* Validate and potentially swap the load commands */ for(i = 0; i < mach_hdr->ncmds; ++i, offset += cmdsize) { @@ -470,6 +481,17 @@ validate_and_swap_macho_64(u_char *file, u_long size kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO "Invalid magic number: 0x%x.", mach_hdr->magic)); + /* If in the running kernel, and asked to validate the kernel + * (which is the only file of type MH_EXECUTE we should ever see), + * then just assume it's ok or we wouldn't be running to begin with. + */ +#if KERNEL + if (mach_hdr->filetype == MH_EXECUTE) { + rval = KERN_SUCCESS; + goto finish; + } +#endif /* KERNEL */ + /* Validate and potentially swap the load commands */ for(i = 0; i < mach_hdr->ncmds; ++i, offset += cmdsize) { /* Get the load command and size */ @@ -717,6 +739,8 @@ kxld_align_address(kxld_addr_t address, u_int align) kxld_addr_t alignment = (1 << align); kxld_addr_t low_bits = 0; + if (!align) return address; + low_bits = (address) & (alignment - 1); if (low_bits) { address += (alignment - low_bits); diff --git a/libkern/kxld/kxld_util.h b/libkern/kxld/kxld_util.h index 3392b4a74..9d5720f04 100644 --- a/libkern/kxld/kxld_util.h +++ b/libkern/kxld/kxld_util.h @@ -119,13 +119,15 @@ void kxld_log(KXLDLogSubsystem subsystem, KXLDLogLevel level, #define kKxldLogMalformedMachO "The Mach-O file is malformed: " #define kKxldLogMalformedVTable "The vtable '%s' is malformed. Make sure your kext has been built against the correct headers." #define kKxldLogMissingVtable "Cannot find the vtable '%s' for class '%s'. This vtable symbol is required for binary compatibility, and it may have been stripped." +#define kKxldLogDirectPureVirtualCall "This kext calls a pure virtual function. Make sure your kext's OSObject-derived classes implement all pure virtual functions." 
#define kKxldLogParentOutOfDate "The super class vtable '%s' for vtable '%s' is out of date. Make sure your kext has been built against the correct headers." #define kKxldLogNoKmodInfo "The kext is missing its kmod_info structure." -#define kKxldLogInvalidSectReloc "Relocation entry %u from section %s,%s cannot be processed." -#define kKxldLogInvalidExtReloc "External relocation entry %u cannot be processed." -#define kKxldLogInvalidIntReloc "Internal relocation entry %u cannot be processed." #define kKxldLogRelocationOverflow "A relocation entry has overflowed. The kext may be too far from one " \ "of its dependencies. Check your kext's load address." +#define kKxldLogRelocatingPatchedSym "Relocation failed because some class in this kext " \ + "didn't use the OSDeclareDefaultStructors and OSDefineMetaClassAndStructors, so it still " \ + "references %s, which has been patched with another symbol for binary compatibility. " \ + "Please make sure all classes that inherit from OSObject use these macros." 
/******************************************************************************* * Allocators @@ -183,10 +185,10 @@ void unswap_macho(u_char *file, enum NXByteOrder host_order, *******************************************************************************/ kxld_addr_t kxld_align_address(kxld_addr_t address, u_int align) - __attribute__((const, nonnull, visibility("hidden"))); + __attribute__((const, visibility("hidden"))); boolean_t kxld_is_32_bit(cpu_type_t) - __attribute__((const, nonnull, visibility("hidden"))); + __attribute__((const, visibility("hidden"))); const char * kxld_strstr(const char *s, const char *find) __attribute__((pure, nonnull, visibility("hidden"))); diff --git a/libkern/kxld/kxld_vtable.c b/libkern/kxld/kxld_vtable.c index 208c030d9..e792d3842 100644 --- a/libkern/kxld/kxld_vtable.c +++ b/libkern/kxld/kxld_vtable.c @@ -29,13 +29,24 @@ #include #include +#if KERNEL + #ifdef MACH_ASSERT + #undef MACH_ASSERT + #endif + #define MACH_ASSERT 1 + #include +#else + #include +#endif + #define DEBUG_ASSERT_COMPONENT_NAME_STRING "kxld" #include #include "kxld_demangle.h" +#include "kxld_dict.h" +#include "kxld_object.h" #include "kxld_reloc.h" #include "kxld_sect.h" -#include "kxld_state.h" #include "kxld_sym.h" #include "kxld_symtab.h" #include "kxld_util.h" @@ -49,235 +60,102 @@ #define VTABLE_HEADER_LEN_64 2 #define VTABLE_HEADER_SIZE_64 (VTABLE_HEADER_LEN_64 * VTABLE_ENTRY_SIZE_64) -static kern_return_t init_by_relocs(KXLDVTable *vtable, const KXLDSym *sym, - const KXLDSect *sect, const KXLDSymtab *symtab, - const KXLDRelocator *relocator); - -static kern_return_t init_by_entries_and_relocs(KXLDVTable *vtable, - const KXLDSym *sym, const KXLDSymtab *symtab, - const KXLDRelocator *relocator, const KXLDArray *relocs); - -static kxld_addr_t get_entry_value(u_char *entry, const KXLDRelocator *relocator) - __attribute__((pure)); -#if !KERNEL -static kxld_addr_t swap_entry_value(kxld_addr_t entry_value, - const KXLDRelocator *relocator) 
__attribute__((const)); -#endif /* !KERNEL */ -static kern_return_t init_by_entries(KXLDVTable *vtable, const KXLDSymtab *symtab, - const KXLDRelocator *relocator); +static void get_vtable_base_sizes(boolean_t is_32_bit, u_int *vtable_entry_size, + u_int *vtable_header_size); -/******************************************************************************* -*******************************************************************************/ -kern_return_t -kxld_vtable_init_from_kernel_macho(KXLDVTable *vtable, const KXLDSym *sym, - const KXLDSect *sect, const KXLDSymtab *symtab, - const KXLDRelocator *relocator) -{ - kern_return_t rval = KERN_FAILURE; - char *demangled_name = NULL; - size_t demangled_length = 0; - - check(vtable); - check(sym); - check(sect); - check(symtab); +static kern_return_t init_by_relocs(KXLDVTable *vtable, const KXLDSym *vtable_sym, + const KXLDSect *sect, const KXLDRelocator *relocator); - vtable->name = sym->name; - vtable->vtable = sect->data + kxld_sym_get_section_offset(sym, sect); - vtable->is_patched = FALSE; - - require_action(kxld_sect_get_num_relocs(sect) == 0, finish, - rval=KERN_FAILURE; - kxld_log(kKxldLogPatching, kKxldLogErr, - kKxldLogMalformedVTable, - kxld_demangle(vtable->name, &demangled_name, &demangled_length))); - - rval = init_by_entries(vtable, symtab, relocator); - require_noerr(rval, finish); - - vtable->is_patched = TRUE; - - rval = KERN_SUCCESS; - -finish: - if (rval) kxld_vtable_deinit(vtable); - if (demangled_name) kxld_free(demangled_name, demangled_length); +static kern_return_t init_by_entries_and_relocs(KXLDVTable *vtable, + const KXLDSym *vtable_sym, const KXLDRelocator *relocator, + const KXLDArray *relocs, const KXLDDict *defined_cxx_symbols); - return rval; -} +static kern_return_t init_by_entries(KXLDVTable *vtable, + const KXLDRelocator *relocator, const KXLDDict *defined_cxx_symbols); /******************************************************************************* 
*******************************************************************************/ -kern_return_t -kxld_vtable_init_from_object_macho(KXLDVTable *vtable, const KXLDSym *sym, - const KXLDSect *sect, const KXLDSymtab *symtab, - const KXLDRelocator *relocator) +kern_return_t +kxld_vtable_init(KXLDVTable *vtable, const KXLDSym *vtable_sym, + const KXLDObject *object, const KXLDDict *defined_cxx_symbols) { kern_return_t rval = KERN_FAILURE; + const KXLDArray *extrelocs = NULL; + const KXLDRelocator *relocator = NULL; + const KXLDSect *vtable_sect = NULL; char *demangled_name = NULL; size_t demangled_length = 0; check(vtable); - check(sym); - check(sect); - check(symtab); + check(vtable_sym); + check(object); - vtable->name = sym->name; - vtable->vtable = sect->data + kxld_sym_get_section_offset(sym, sect); - vtable->is_patched = FALSE; + relocator = kxld_object_get_relocator(object); - require_action(kxld_sect_get_num_relocs(sect) > 0, finish, - rval=KERN_FAILURE; - kxld_log(kKxldLogPatching, kKxldLogErr, - kKxldLogMalformedVTable, - kxld_demangle(vtable->name, &demangled_name, &demangled_length))); - - rval = init_by_relocs(vtable, sym, sect, symtab, relocator); - require_noerr(rval, finish); + vtable_sect = kxld_object_get_section_by_index(object, + vtable_sym->sectnum); + require_action(vtable_sect, finish, rval=KERN_FAILURE); - rval = KERN_SUCCESS; - -finish: - if (rval) kxld_vtable_deinit(vtable); - if (demangled_name) kxld_free(demangled_name, demangled_length); - - return rval; -} - -/******************************************************************************* -*******************************************************************************/ -kern_return_t -kxld_vtable_init_from_final_macho(KXLDVTable *vtable, const KXLDSym *sym, - const KXLDSect *sect, const KXLDSymtab *symtab, - const KXLDRelocator *relocator, const KXLDArray *relocs) -{ - kern_return_t rval = KERN_FAILURE; - char *demangled_name = NULL; - size_t demangled_length = 0; + vtable->name = 
vtable_sym->name; + vtable->vtable = vtable_sect->data + + kxld_sym_get_section_offset(vtable_sym, vtable_sect); - check(vtable); - check(sym); - check(sect); - check(symtab); + if (kxld_object_is_linked(object)) { + rval = init_by_entries(vtable, relocator, defined_cxx_symbols); + require_noerr(rval, finish); - vtable->name = sym->name; - vtable->vtable = sect->data + kxld_sym_get_section_offset(sym, sect); - vtable->is_patched = FALSE; + vtable->is_patched = TRUE; + } else { + if (kxld_object_is_final_image(object)) { + extrelocs = kxld_object_get_extrelocs(object); + require_action(extrelocs, finish, + rval=KERN_FAILURE; + kxld_log(kKxldLogPatching, kKxldLogErr, + kKxldLogMalformedVTable, + kxld_demangle(vtable->name, + &demangled_name, &demangled_length))); - require_action(kxld_sect_get_num_relocs(sect) == 0, finish, - rval=KERN_FAILURE; - kxld_log(kKxldLogPatching, kKxldLogErr, - kKxldLogMalformedVTable, - kxld_demangle(vtable->name, &demangled_name, &demangled_length))); + rval = init_by_entries_and_relocs(vtable, vtable_sym, + relocator, extrelocs, defined_cxx_symbols); + require_noerr(rval, finish); + } else { + require_action(kxld_sect_get_num_relocs(vtable_sect) > 0, finish, + rval=KERN_FAILURE; + kxld_log(kKxldLogPatching, kKxldLogErr, + kKxldLogMalformedVTable, + kxld_demangle(vtable->name, + &demangled_name, &demangled_length))); - rval = init_by_entries_and_relocs(vtable, sym, symtab, - relocator, relocs); - require_noerr(rval, finish); + rval = init_by_relocs(vtable, vtable_sym, vtable_sect, relocator); + require_noerr(rval, finish); + } + + vtable->is_patched = FALSE; + } rval = KERN_SUCCESS; - finish: - if (rval) kxld_vtable_deinit(vtable); if (demangled_name) kxld_free(demangled_name, demangled_length); return rval; } -#if KXLD_USER_OR_ILP32 -/******************************************************************************* -*******************************************************************************/ -kern_return_t 
-kxld_vtable_init_from_link_state_32(KXLDVTable *vtable, u_char *file, - KXLDVTableHdr *hdr) -{ - kern_return_t rval = KERN_FAILURE; - KXLDSymEntry32 *sym = NULL; - KXLDVTableEntry *entry = NULL; - u_int i = 0; - - check(vtable); - check(file); - check(hdr); - - vtable->name = (char *) (file + hdr->nameoff); - vtable->is_patched = TRUE; - - rval = kxld_array_init(&vtable->entries, sizeof(KXLDVTableEntry), - hdr->nentries); - require_noerr(rval, finish); - - sym = (KXLDSymEntry32 *) (file + hdr->vtableoff); - for (i = 0; i < vtable->entries.nitems; ++i, ++sym) { - entry = kxld_array_get_item(&vtable->entries, i); - entry->patched.name = (char *) (file + sym->nameoff); - entry->patched.addr = sym->addr; - } - - rval = KERN_SUCCESS; - -finish: - return rval; -} -#endif /* KXLD_USER_OR_ILP32 */ - -#if KXLD_USER_OR_LP64 /******************************************************************************* *******************************************************************************/ -kern_return_t -kxld_vtable_init_from_link_state_64(KXLDVTable *vtable, u_char *file, - KXLDVTableHdr *hdr) +static void +get_vtable_base_sizes(boolean_t is_32_bit, u_int *vtable_entry_size, + u_int *vtable_header_size) { - kern_return_t rval = KERN_FAILURE; - KXLDSymEntry64 *sym = NULL; - KXLDVTableEntry *entry = NULL; - u_int i = 0; - - check(vtable); - check(file); - check(hdr); - - vtable->name = (char *) (file + hdr->nameoff); - vtable->is_patched = TRUE; + check(vtable_entry_size); + check(vtable_header_size); - rval = kxld_array_init(&vtable->entries, sizeof(KXLDVTableEntry), - hdr->nentries); - require_noerr(rval, finish); - - sym = (KXLDSymEntry64 *) (file + hdr->vtableoff); - for (i = 0; i < vtable->entries.nitems; ++i, ++sym) { - entry = kxld_array_get_item(&vtable->entries, i); - entry->patched.name = (char *) (file + sym->nameoff); - entry->patched.addr = sym->addr; + if (is_32_bit) { + *vtable_entry_size = VTABLE_ENTRY_SIZE_32; + *vtable_header_size = VTABLE_HEADER_SIZE_32; + } 
else { + *vtable_entry_size = VTABLE_ENTRY_SIZE_64; + *vtable_header_size = VTABLE_HEADER_SIZE_64; } - - rval = KERN_SUCCESS; - -finish: - return rval; -} -#endif /* KXLD_USER_OR_LP64 */ - -/******************************************************************************* -*******************************************************************************/ -kern_return_t -kxld_vtable_copy(KXLDVTable *vtable, const KXLDVTable *src) -{ - kern_return_t rval = KERN_FAILURE; - - check(vtable); - check(src); - - vtable->vtable = src->vtable; - vtable->name = src->name; - vtable->is_patched = src->is_patched; - - rval = kxld_array_copy(&vtable->entries, &src->entries); - require_noerr(rval, finish); - - rval = KERN_SUCCESS; - -finish: - return rval; } /******************************************************************************* @@ -285,38 +163,35 @@ kxld_vtable_copy(KXLDVTable *vtable, const KXLDVTable *src) * entries and finding the corresponding symbols. *******************************************************************************/ static kern_return_t -init_by_relocs(KXLDVTable *vtable, const KXLDSym *sym, const KXLDSect *sect, - const KXLDSymtab *symtab, const KXLDRelocator *relocator) +init_by_relocs(KXLDVTable *vtable, const KXLDSym *vtable_sym, + const KXLDSect *sect, const KXLDRelocator *relocator) { kern_return_t rval = KERN_FAILURE; KXLDReloc *reloc = NULL; KXLDVTableEntry *entry = NULL; - KXLDSym *tmpsym = NULL; + KXLDSym *sym = NULL; kxld_addr_t vtable_base_offset = 0; kxld_addr_t entry_offset = 0; u_int i = 0; u_int nentries = 0; u_int vtable_entry_size = 0; + u_int vtable_header_size = 0; u_int base_reloc_index = 0; u_int reloc_index = 0; check(vtable); - check(sym); + check(vtable_sym); check(sect); - check(symtab); check(relocator); /* Find the first entry past the vtable padding */ - vtable_base_offset = kxld_sym_get_section_offset(sym, sect); - if (relocator->is_32_bit) { - vtable_entry_size = VTABLE_ENTRY_SIZE_32; - vtable_base_offset += 
VTABLE_HEADER_SIZE_32; - } else { - vtable_entry_size = VTABLE_ENTRY_SIZE_64; - vtable_base_offset += VTABLE_HEADER_SIZE_64; - } + (void) get_vtable_base_sizes(relocator->is_32_bit, + &vtable_entry_size, &vtable_header_size); + vtable_base_offset = kxld_sym_get_section_offset(vtable_sym, sect) + + vtable_header_size; + /* Find the relocation entry at the start of the vtable */ rval = kxld_reloc_get_reloc_index_by_offset(&sect->relocs, @@ -359,9 +234,9 @@ init_by_relocs(KXLDVTable *vtable, const KXLDSym *sym, const KXLDSect *sect, * skip it. We won't be able to patch subclasses with this symbol, * but there isn't much we can do about that. */ - tmpsym = kxld_reloc_get_symbol(relocator, reloc, sect->data, symtab); + sym = kxld_reloc_get_symbol(relocator, reloc, sect->data); - entry->unpatched.sym = tmpsym; + entry->unpatched.sym = sym; entry->unpatched.reloc = reloc; } @@ -370,77 +245,42 @@ init_by_relocs(KXLDVTable *vtable, const KXLDSym *sym, const KXLDSect *sect, return rval; } -/******************************************************************************* -*******************************************************************************/ -static kxld_addr_t -get_entry_value(u_char *entry, const KXLDRelocator *relocator) -{ - kxld_addr_t entry_value; - - if (relocator->is_32_bit) { - entry_value = *(uint32_t *)entry; - } else { - entry_value = *(uint64_t *)entry; - } - - return entry_value; -} - -#if !KERNEL -/******************************************************************************* -*******************************************************************************/ -static kxld_addr_t -swap_entry_value(kxld_addr_t entry_value, const KXLDRelocator *relocator) -{ - if (relocator->is_32_bit) { - entry_value = OSSwapInt32((uint32_t) entry_value); - } else { - entry_value = OSSwapInt64((uint64_t) entry_value); - } - - return entry_value; -} -#endif /* KERNEL */ - /******************************************************************************* * Initializes a vtable 
object by reading the symbol values out of the vtable * entries and performing reverse symbol lookups on those values. *******************************************************************************/ static kern_return_t -init_by_entries(KXLDVTable *vtable, const KXLDSymtab *symtab, - const KXLDRelocator *relocator) +init_by_entries(KXLDVTable *vtable, const KXLDRelocator *relocator, + const KXLDDict *defined_cxx_symbols) { kern_return_t rval = KERN_FAILURE; KXLDVTableEntry *tmpentry = NULL; KXLDSym *sym = NULL; - u_char *base_entry = NULL; - u_char *entry = NULL; kxld_addr_t entry_value = 0; + u_long entry_offset; u_int vtable_entry_size = 0; u_int vtable_header_size = 0; u_int nentries = 0; u_int i = 0; - if (relocator->is_32_bit) { - vtable_entry_size = VTABLE_ENTRY_SIZE_32; - vtable_header_size = VTABLE_HEADER_SIZE_32; - } else { - vtable_entry_size = VTABLE_ENTRY_SIZE_64; - vtable_header_size = VTABLE_HEADER_SIZE_64; - } + check(vtable); + check(relocator); - base_entry = vtable->vtable + vtable_header_size; + (void) get_vtable_base_sizes(relocator->is_32_bit, + &vtable_entry_size, &vtable_header_size); /* Count the number of entries (the vtable is null-terminated) */ - entry = base_entry; - entry_value = get_entry_value(entry, relocator); - while (entry_value) { + entry_offset = vtable_header_size; + while (1) { + entry_value = kxld_relocator_get_pointer_at_addr(relocator, + vtable->vtable, entry_offset); + if (!entry_value) break; + + entry_offset += vtable_entry_size; ++nentries; - entry += vtable_entry_size; - entry_value = get_entry_value(entry, relocator); } - + /* Allocate the symbol index */ rval = kxld_array_init(&vtable->entries, sizeof(KXLDVTableEntry), nentries); @@ -448,24 +288,19 @@ init_by_entries(KXLDVTable *vtable, const KXLDSymtab *symtab, /* Look up the symbols for each entry */ - entry = base_entry; - rval = KERN_SUCCESS; - for (i = 0; i < vtable->entries.nitems; ++i) { - entry = base_entry + (i * vtable_entry_size); - entry_value = 
get_entry_value(entry, relocator); + for (i = 0, entry_offset = vtable_header_size; + i < vtable->entries.nitems; + ++i, entry_offset += vtable_entry_size) + { + entry_value = kxld_relocator_get_pointer_at_addr(relocator, + vtable->vtable, entry_offset); -#if !KERNEL - if (relocator->swap) { - entry_value = swap_entry_value(entry_value, relocator); - } -#endif /* !KERNEL */ - /* If we can't find the symbol, it means that the virtual function was * defined inline. There's not much I can do about this; it just means * I can't patch this function. */ tmpentry = kxld_array_get_item(&vtable->entries, i); - sym = kxld_symtab_get_cxx_symbol_by_value(symtab, entry_value); + sym = kxld_dict_find(defined_cxx_symbols, &entry_value); if (sym) { tmpentry->patched.name = sym->name; @@ -477,7 +312,6 @@ init_by_entries(KXLDVTable *vtable, const KXLDSymtab *symtab, } rval = KERN_SUCCESS; - finish: return rval; } @@ -493,63 +327,49 @@ init_by_entries(KXLDVTable *vtable, const KXLDSymtab *symtab, * external symbols. 
*******************************************************************************/ static kern_return_t -init_by_entries_and_relocs(KXLDVTable *vtable, const KXLDSym *sym, - const KXLDSymtab *symtab, const KXLDRelocator *relocator, - const KXLDArray *relocs) +init_by_entries_and_relocs(KXLDVTable *vtable, const KXLDSym *vtable_sym, + const KXLDRelocator *relocator, const KXLDArray *relocs, + const KXLDDict *defined_cxx_symbols) { kern_return_t rval = KERN_FAILURE; KXLDReloc *reloc = NULL; KXLDVTableEntry *tmpentry = NULL; - KXLDSym *tmpsym = NULL; + KXLDSym *sym = NULL; u_int vtable_entry_size = 0; u_int vtable_header_size = 0; - u_char *base_entry = NULL; - u_char *entry = NULL; kxld_addr_t entry_value = 0; - kxld_addr_t base_entry_offset = 0; - kxld_addr_t entry_offset = 0; + u_long entry_offset = 0; u_int nentries = 0; u_int i = 0; char *demangled_name1 = NULL; size_t demangled_length1 = 0; check(vtable); - check(sym); - check(symtab); + check(vtable_sym); + check(relocator); check(relocs); /* Find the first entry and its offset past the vtable padding */ - if (relocator->is_32_bit) { - vtable_entry_size = VTABLE_ENTRY_SIZE_32; - vtable_header_size = VTABLE_HEADER_SIZE_32; - } else { - vtable_entry_size = VTABLE_ENTRY_SIZE_64; - vtable_header_size = VTABLE_HEADER_SIZE_64; - } - - base_entry = vtable->vtable + vtable_header_size; - - base_entry_offset = sym->base_addr; - base_entry_offset += vtable_header_size; + (void) get_vtable_base_sizes(relocator->is_32_bit, + &vtable_entry_size, &vtable_header_size); /* In a final linked image, a vtable slot is valid if it is nonzero - * (meaning the userspace linker has already resolved it, or if it has + * (meaning the userspace linker has already resolved it) or if it has * a relocation entry. We'll know the end of the vtable when we find a * slot that meets neither of these conditions. 
*/ - entry = base_entry; - entry_value = get_entry_value(entry, relocator); - entry_offset = base_entry_offset; + entry_offset = vtable_header_size; while (1) { - entry_value = get_entry_value(entry, relocator); + entry_value = kxld_relocator_get_pointer_at_addr(relocator, + vtable->vtable, entry_offset); if (!entry_value) { - reloc = kxld_reloc_get_reloc_by_offset(relocs, entry_offset); + reloc = kxld_reloc_get_reloc_by_offset(relocs, + vtable_sym->base_addr + entry_offset); if (!reloc) break; } ++nentries; - entry += vtable_entry_size; entry_offset += vtable_entry_size; } @@ -560,11 +380,12 @@ init_by_entries_and_relocs(KXLDVTable *vtable, const KXLDSym *sym, /* Find the symbols for each vtable entry */ - entry = base_entry; - entry_value = get_entry_value(entry, relocator); - entry_offset = base_entry_offset; - for (i = 0; i < vtable->entries.nitems; ++i) { - entry_value = get_entry_value(entry, relocator); + for (i = 0, entry_offset = vtable_header_size; + i < vtable->entries.nitems; + ++i, entry_offset += vtable_entry_size) + { + entry_value = kxld_relocator_get_pointer_at_addr(relocator, + vtable->vtable, entry_offset); /* If we can't find a symbol, it means it is a locally-defined, * non-external symbol that has been stripped. We don't patch over @@ -573,16 +394,11 @@ init_by_entries_and_relocs(KXLDVTable *vtable, const KXLDSym *sym, * but there isn't much we can do about that. 
*/ if (entry_value) { -#if !KERNEL - if (relocator->swap) { - entry_value = swap_entry_value(entry_value, relocator); - } -#endif /* !KERNEL */ - reloc = NULL; - tmpsym = kxld_symtab_get_cxx_symbol_by_value(symtab, entry_value); + sym = kxld_dict_find(defined_cxx_symbols, &entry_value); } else { - reloc = kxld_reloc_get_reloc_by_offset(relocs, entry_offset); + reloc = kxld_reloc_get_reloc_by_offset(relocs, + vtable_sym->base_addr + entry_offset); require_action(reloc, finish, rval=KERN_FAILURE; kxld_log(kKxldLogPatching, kKxldLogErr, @@ -590,20 +406,15 @@ init_by_entries_and_relocs(KXLDVTable *vtable, const KXLDSym *sym, kxld_demangle(vtable->name, &demangled_name1, &demangled_length1))); - tmpsym = kxld_reloc_get_symbol(relocator, reloc, - /* data */ NULL, symtab); + sym = kxld_reloc_get_symbol(relocator, reloc, /* data */ NULL); } - + tmpentry = kxld_array_get_item(&vtable->entries, i); tmpentry->unpatched.reloc = reloc; - tmpentry->unpatched.sym = tmpsym; - - entry += vtable_entry_size; - entry_offset += vtable_entry_size; + tmpentry->unpatched.sym = sym; } rval = KERN_SUCCESS; - finish: return rval; } @@ -632,17 +443,42 @@ kxld_vtable_deinit(KXLDVTable *vtable) bzero(vtable, sizeof(*vtable)); } +/******************************************************************************* +*******************************************************************************/ +KXLDVTableEntry * +kxld_vtable_get_entry_for_offset(const KXLDVTable *vtable, u_long offset, + boolean_t is_32_bit) +{ + KXLDVTableEntry *rval = NULL; + u_int vtable_entry_size = 0; + u_int vtable_header_size = 0; + u_int vtable_entry_idx = 0; + + (void) get_vtable_base_sizes(is_32_bit, + &vtable_entry_size, &vtable_header_size); + + if (offset % vtable_entry_size) { + goto finish; + } + + vtable_entry_idx = (u_int) ((offset - vtable_header_size) / vtable_entry_size); + rval = kxld_array_get_item(&vtable->entries, vtable_entry_idx); +finish: + return rval; +} + 
/******************************************************************************* * Patching vtables allows us to preserve binary compatibility across releases. *******************************************************************************/ kern_return_t kxld_vtable_patch(KXLDVTable *vtable, const KXLDVTable *super_vtable, - KXLDSymtab *symtab, boolean_t strict_patching __unused) + KXLDObject *object) { kern_return_t rval = KERN_FAILURE; + const KXLDSymtab *symtab = NULL; + const KXLDSym *sym = NULL; KXLDVTableEntry *child_entry = NULL; KXLDVTableEntry *parent_entry = NULL; - KXLDSym *sym = NULL; u_int symindex = 0; u_int i = 0; char *demangled_name1 = NULL; @@ -651,10 +487,13 @@ kxld_vtable_patch(KXLDVTable *vtable, const KXLDVTable *super_vtable, size_t demangled_length1 = 0; size_t demangled_length2 = 0; size_t demangled_length3 = 0; + boolean_t failure = FALSE; check(vtable); check(super_vtable); + symtab = kxld_object_get_symtab(object); + require_action(!vtable->is_patched, finish, rval=KERN_SUCCESS); require_action(vtable->entries.nitems >= super_vtable->entries.nitems, finish, rval=KERN_FAILURE; @@ -679,7 +518,7 @@ kxld_vtable_patch(KXLDVTable *vtable, const KXLDVTable *super_vtable, */ if (!parent_entry->patched.name) continue; - + /* 1) If the symbol is defined locally, do not patch */ if (kxld_sym_is_defined_locally(child_entry->unpatched.sym)) continue; @@ -726,7 +565,8 @@ kxld_vtable_patch(KXLDVTable *vtable, const KXLDVTable *super_vtable, * should not patch it. 
*/ - if (strict_patching && !kxld_sym_is_defined(child_entry->unpatched.sym)) + if (kxld_object_target_supports_strict_patching(object) && + !kxld_sym_is_defined(child_entry->unpatched.sym)) { char class_name[KXLD_MAX_NAME_LEN]; char function_prefix[KXLD_MAX_NAME_LEN]; @@ -744,6 +584,14 @@ kxld_vtable_patch(KXLDVTable *vtable, const KXLDVTable *super_vtable, if (!strncmp(child_entry->unpatched.sym->name, function_prefix, function_prefix_len)) { + failure = TRUE; + kxld_log(kKxldLogPatching, kKxldLogErr, + "The %s is unpatchable because its class declares the " + "method '%s' without providing an implementation.", + kxld_demangle(vtable->name, + &demangled_name1, &demangled_length1), + kxld_demangle(child_entry->unpatched.sym->name, + &demangled_name2, &demangled_length2)); continue; } } @@ -758,9 +606,10 @@ kxld_vtable_patch(KXLDVTable *vtable, const KXLDVTable *super_vtable, * that. */ - sym = kxld_symtab_get_symbol_by_name(symtab, parent_entry->patched.name); + sym = kxld_symtab_get_locally_defined_symbol_by_name(symtab, + parent_entry->patched.name); if (!sym) { - rval = kxld_symtab_add_symbol(symtab, parent_entry->patched.name, + rval = kxld_object_add_symbol(object, parent_entry->patched.name, parent_entry->patched.addr, &sym); require_noerr(rval, finish); } @@ -771,7 +620,6 @@ kxld_vtable_patch(KXLDVTable *vtable, const KXLDVTable *super_vtable, rval = kxld_reloc_update_symindex(child_entry->unpatched.reloc, symindex); require_noerr(rval, finish); - kxld_log(kKxldLogPatching, kKxldLogDetail, "In vtable '%s', patching '%s' with '%s'.", kxld_demangle(vtable->name, &demangled_name1, &demangled_length1), @@ -779,13 +627,28 @@ kxld_vtable_patch(KXLDVTable *vtable, const KXLDVTable *super_vtable, &demangled_name2, &demangled_length2), kxld_demangle(sym->name, &demangled_name3, &demangled_length3)); - kxld_sym_patch(child_entry->unpatched.sym); + rval = kxld_object_patch_symbol(object, child_entry->unpatched.sym); + require_noerr(rval, finish); + 
child_entry->unpatched.sym = sym; + + /* + * The C++ ABI requires that functions be aligned on a 2-byte boundary: + * http://www.codesourcery.com/public/cxx-abi/abi.html#member-pointers + * If the LSB of any virtual function's link address is 1, then the + * compiler has violated that part of the ABI, and we're going to panic + * in _ptmf2ptf() (in OSMetaClass.h). Better to panic here with some + * context. + */ + assert(kxld_sym_is_pure_virtual(sym) || !(sym->link_addr & 1)); } + require_action(!failure, finish, rval=KERN_FAILURE); + /* Change the vtable representation from the unpatched layout to the * patched layout. */ + for (i = 0; i < vtable->entries.nitems; ++i) { char *name; kxld_addr_t addr; diff --git a/libkern/kxld/kxld_vtable.h b/libkern/kxld/kxld_vtable.h index 124756994..4dd304a76 100644 --- a/libkern/kxld/kxld_vtable.h +++ b/libkern/kxld/kxld_vtable.h @@ -38,6 +38,7 @@ #include "kxld_array.h" struct kxld_array; +struct kxld_object; struct kxld_reloc; struct kxld_relocator; struct kxld_sect; @@ -62,7 +63,7 @@ struct kxld_vtable_patched_entry { }; struct kxld_vtable_unpatched_entry { - struct kxld_sym *sym; + const struct kxld_sym *sym; struct kxld_reloc *reloc; }; @@ -75,31 +76,9 @@ union kxld_vtable_entry { * Constructors and destructors *******************************************************************************/ -kern_return_t kxld_vtable_init_from_kernel_macho(KXLDVTable *vtable, - const struct kxld_sym *sym, const struct kxld_sect *sect, - const struct kxld_symtab *symtab, const struct kxld_relocator *relocator) - __attribute__((nonnull, visibility("hidden"))); - -kern_return_t kxld_vtable_init_from_final_macho(KXLDVTable *vtable, - const struct kxld_sym *sym, const struct kxld_sect *sect, - const struct kxld_symtab *symtab, const struct kxld_relocator *relocator, - const struct kxld_array *relocs) - __attribute__((nonnull, visibility("hidden"))); - -kern_return_t kxld_vtable_init_from_object_macho(KXLDVTable *vtable, - const struct kxld_sym 
*sym, const struct kxld_sect *sect, - const struct kxld_symtab *symtab, const struct kxld_relocator *relocator) - __attribute__((nonnull, visibility("hidden"))); - -kern_return_t kxld_vtable_init_from_link_state_32(KXLDVTable *vtable, u_char *state, - struct kxld_vtable_hdr *hdr) - __attribute__((nonnull, visibility("hidden"))); - -kern_return_t kxld_vtable_init_from_link_state_64(KXLDVTable *vtable, u_char *state, - struct kxld_vtable_hdr *hdr) - __attribute__((nonnull, visibility("hidden"))); - -kern_return_t kxld_vtable_copy(KXLDVTable *vtable, const KXLDVTable *src) +kern_return_t kxld_vtable_init(KXLDVTable *vtable, + const struct kxld_sym *vtable_sym, const struct kxld_object *object, + const struct kxld_dict *defined_cxx_symbols) __attribute__((nonnull, visibility("hidden"))); void kxld_vtable_clear(KXLDVTable *vtable) @@ -108,13 +87,21 @@ void kxld_vtable_clear(KXLDVTable *vtable) void kxld_vtable_deinit(KXLDVTable *vtable) __attribute__((visibility("hidden"))); +/******************************************************************************* +* Accessors +*******************************************************************************/ + +KXLDVTableEntry * kxld_vtable_get_entry_for_offset(const KXLDVTable *vtable, + u_long offset, boolean_t is_32_bit) + __attribute__((pure,nonnull,visibility("hidden"))); + /******************************************************************************* * Modifiers *******************************************************************************/ /* With strict patching, the vtable patcher with only patch pad slots */ kern_return_t kxld_vtable_patch(KXLDVTable *vtable, const KXLDVTable *super_vtable, - struct kxld_symtab *symtab, boolean_t strict_patching) + struct kxld_object *object) __attribute__((nonnull, visibility("hidden"))); #endif /* _KXLD_VTABLE_H_ */ diff --git a/libkern/kxld/tests/kextcopyright.c b/libkern/kxld/tests/kextcopyright.c index dffbdbc22..7e545d328 100644 --- a/libkern/kxld/tests/kextcopyright.c +++ 
b/libkern/kxld/tests/kextcopyright.c @@ -1,3 +1,31 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + #include #include @@ -84,6 +112,7 @@ convert_cfstring(CFStringRef the_string) result = converted_string; finish: + CFRelease(the_data); return result; } diff --git a/libkern/kxld/tests/kxld_array_test.c b/libkern/kxld/tests/kxld_array_test.c new file mode 100644 index 000000000..4791712e1 --- /dev/null +++ b/libkern/kxld/tests/kxld_array_test.c @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include +#include +#include + +#include + +#include "kxld_array.h" +#include "kxld_test.h" +#include "kxld_util.h" + +#define kNumStorageTestItems (u_int) (4 * PAGE_SIZE / sizeof(u_int)) + +int +main(int argc __unused, char *argv[] __unused) +{ + kern_return_t rval = KERN_FAILURE; + KXLDArray array; + u_int *item = 0; + u_int test_num = 0; + u_int idx = 0; + u_int titems = 0; + u_int storageTestItems[kNumStorageTestItems]; + u_int i = 0; + + bzero(&array, sizeof(array)); + + kxld_set_logging_callback(kxld_test_log); + kxld_set_logging_callback_data("kxld_array_test", NULL); + + kxld_log(0, 0, "%d: Initialize", ++test_num); + + titems = PAGE_SIZE / sizeof(u_int); + rval = kxld_array_init(&array, sizeof(u_int), titems); + assert(rval == KERN_SUCCESS); + assert(array.nitems == titems); + + kxld_log(0, 0, "%d: Get item", ++test_num); + idx = 0; + item = kxld_array_get_item(&array, idx); + assert(item); + assert(item == kxld_array_get_slot(&array, idx)); + + idx = titems - 1; + item = kxld_array_get_item(&array, idx); + assert(item); + assert(item == kxld_array_get_slot(&array, idx)); + + idx = titems; + item = kxld_array_get_item(&array, idx); + assert(!item); + /* We allocated the max number of items that could be stored in a page, + * so get_slot() and get_item() are equivalent. 
+ */ + assert(item == kxld_array_get_slot(&array, idx)); + + kxld_log(0, 0, "%d: Resize", ++test_num); + + titems = 2 * PAGE_SIZE / sizeof(u_int) + 100; + rval = kxld_array_resize(&array, titems); + assert(rval == KERN_SUCCESS); + assert(array.nitems == titems); + + kxld_log(0, 0, "%d: Get more items", ++test_num); + idx = 0; + item = kxld_array_get_item(&array, idx); + assert(item); + assert(item == kxld_array_get_slot(&array, idx)); + + idx = titems - 1; + item = kxld_array_get_item(&array, idx); + assert(item); + assert(item == kxld_array_get_slot(&array, idx)); + + idx = titems; + item = kxld_array_get_item(&array, idx); + assert(!item); + /* We allocated fewer items than could fit in a page, so get_slot() will + * return items even when get_item() does not. See below for details. + */ + assert(item != kxld_array_get_slot(&array, idx)); + + kxld_log(0, 0, "%d: Clear and attempt to get an item", ++test_num); + (void) kxld_array_clear(&array); + item = kxld_array_get_item(&array, 0); + assert(!item); + + kxld_log(0, 0, "%d: Get slot", ++test_num); + /* The array allocates its internal storage in pages. Because get_slot() + * fetches items based on the allocated size, not the logical size, we + * calculate the max items get_slot() can retrieve based on page size. 
+ */ + titems = (u_int) (round_page(titems * sizeof(u_int)) / sizeof(u_int)); + assert(!item); + item = kxld_array_get_slot(&array, 0); + assert(item); + item = kxld_array_get_slot(&array, titems - 1); + assert(item); + item = kxld_array_get_slot(&array, titems); + assert(!item); + + kxld_log(0, 0, "%d: Reinitialize", ++test_num); + + titems = kNumStorageTestItems; + rval = kxld_array_init(&array, sizeof(u_int), titems); + assert(rval == KERN_SUCCESS); + assert(array.nitems == titems); + + kxld_log(0, 0, "%d: Storage test - %d insertions and finds", + ++test_num, kNumStorageTestItems); + for (i = 0; i < titems; ++i) { + item = kxld_array_get_item(&array, i); + assert(item); + + *item = (u_int) (random() % UINT_MAX); + storageTestItems[i] = *item; + } + + for (i = 0; i < titems; ++i) { + item = kxld_array_get_item(&array, i); + assert(item); + assert(*item == storageTestItems[i]); + } + + (void) kxld_array_deinit(&array); + + kxld_log(0, 0, " "); + kxld_log(0, 0, "All tests passed! Now check for memory leaks..."); + + kxld_print_memory_report(); + + return 0; +} diff --git a/libkern/kxld/tests/kxld_dict_test.c b/libkern/kxld/tests/kxld_dict_test.c index d831a44ed..a9b2f5f23 100644 --- a/libkern/kxld/tests/kxld_dict_test.c +++ b/libkern/kxld/tests/kxld_dict_test.c @@ -27,11 +27,10 @@ */ #include #include -#include #include #include "kxld_dict.h" -#include "kxld_util.h" +#include "kxld_test.h" #define KEYLEN 40 #define STRESSNUM 10000 @@ -41,22 +40,6 @@ typedef struct { int * value; } Stress; - -void kxld_test_log(KXLDLogSubsystem sys, KXLDLogLevel level, - const char *format, va_list ap, void *user_data); - -void -kxld_test_log(KXLDLogSubsystem sys __unused, KXLDLogLevel level __unused, - const char *format, va_list ap, void *user_data __unused) -{ - va_list args; - - va_copy(args, ap); - vfprintf(stderr, format, args); - fprintf(stderr, "\n"); - va_end(args); -} - int main(int argc __unused, char *argv[] __unused) { @@ -69,20 +52,21 @@ main(int argc __unused, char 
*argv[] __unused) Stress stress_test[STRESSNUM]; kxld_set_logging_callback(kxld_test_log); + kxld_set_logging_callback_data("kxld_dict_test", NULL); bzero(&dict, sizeof(dict)); - fprintf(stderr, "%d: Initialize\n", ++test_num); + kxld_log(0, 0, "%d: Initialize", ++test_num); result = kxld_dict_init(&dict, kxld_dict_string_hash, kxld_dict_string_cmp, 10); assert(result == KERN_SUCCESS); size = kxld_dict_get_num_entries(&dict); assert(size == 0); - fprintf(stderr, "%d: Find nonexistant key\n", ++test_num); + kxld_log(0, 0, "%d: Find nonexistant key", ++test_num); b = kxld_dict_find(&dict, "hi"); assert(b == NULL); - fprintf(stderr, "%d: Insert and find\n", ++test_num); + kxld_log(0, 0, "%d: Insert and find", ++test_num); result = kxld_dict_insert(&dict, "hi", &a1); assert(result == KERN_SUCCESS); b = kxld_dict_find(&dict, "hi"); @@ -90,7 +74,7 @@ main(int argc __unused, char *argv[] __unused) size = kxld_dict_get_num_entries(&dict); assert(size == 1); - fprintf(stderr, "%d: Insert same key with different values\n", ++test_num); + kxld_log(0, 0, "%d: Insert same key with different values", ++test_num); result = kxld_dict_insert(&dict, "hi", &a2); assert(result == KERN_SUCCESS); b = kxld_dict_find(&dict, "hi"); @@ -98,15 +82,16 @@ main(int argc __unused, char *argv[] __unused) size = kxld_dict_get_num_entries(&dict); assert(size == 1); - fprintf(stderr, "%d: Clear and find of nonexistant key\n", ++test_num); + kxld_log(0, 0, "%d: Clear and find of nonexistant key", ++test_num); kxld_dict_clear(&dict); result = kxld_dict_init(&dict, kxld_dict_string_hash, kxld_dict_string_cmp, 10); + assert(result == KERN_SUCCESS); b = kxld_dict_find(&dict, "hi"); assert(b == NULL); size = kxld_dict_get_num_entries(&dict); assert(size == 0); - fprintf(stderr, "%d: Insert multiple keys\n", ++test_num); + kxld_log(0, 0, "%d: Insert multiple keys", ++test_num); result = kxld_dict_insert(&dict, "hi", &a1); assert(result == KERN_SUCCESS); result = kxld_dict_insert(&dict, "hello", &a2); @@ 
-119,7 +104,7 @@ main(int argc __unused, char *argv[] __unused) size = kxld_dict_get_num_entries(&dict); assert(size == 2); - fprintf(stderr, "%d: Remove keys\n", ++test_num); + kxld_log(0, 0, "%d: Remove keys", ++test_num); kxld_dict_remove(&dict, "hi", &b); assert(b && *(int*)b == a1); b = kxld_dict_find(&dict, "hi"); @@ -129,17 +114,18 @@ main(int argc __unused, char *argv[] __unused) size = kxld_dict_get_num_entries(&dict); assert(size == 1); - fprintf(stderr, "%d: Stress test - %d insertions and finds\n", ++test_num, STRESSNUM); + kxld_log(0, 0, "%d: Stress test - %d insertions and finds", ++test_num, STRESSNUM); kxld_dict_clear(&dict); result = kxld_dict_init(&dict, kxld_dict_string_hash, kxld_dict_string_cmp, 10); + assert(result == KERN_SUCCESS); for (i = 0; i < STRESSNUM; ++i) { int * tmp_value = kxld_alloc(sizeof(int)); char * tmp_key = kxld_alloc(sizeof(char) * (KEYLEN + 1)); *tmp_value = i; for (j = 0; j < KEYLEN; ++j) { - tmp_key[j] = (rand() % 26) + 'a'; + tmp_key[j] = (random() % 26) + 'a'; } tmp_key[KEYLEN] = '\0'; @@ -161,10 +147,10 @@ main(int argc __unused, char *argv[] __unused) kxld_free(stress_test[i].value, sizeof(int)); } - fprintf(stderr, "%d: Destroy\n", ++test_num); + kxld_log(0, 0, "%d: Destroy", ++test_num); kxld_dict_deinit(&dict); - fprintf(stderr, "\nAll tests passed! Now check for memory leaks...\n"); + kxld_log(0, 0, "\nAll tests passed! Now check for memory leaks..."); kxld_print_memory_report(); diff --git a/bsd/ppc/psl.h b/libkern/kxld/tests/kxld_test.c similarity index 76% rename from bsd/ppc/psl.h rename to libkern/kxld/tests/kxld_test.c index 14abec125..d802cc7a1 100644 --- a/bsd/ppc/psl.h +++ b/libkern/kxld/tests/kxld_test.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,19 +25,21 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* Copyright (c) 1993 NeXT Computer, Inc. 
All rights reserved. - * - * File: bsd/ppc/psl.h - * - */ +#include -#if KERNEL_PRIVATE +#include "kxld_test.h" +#include "kxld_util.h" -#ifndef _BSD_PPC_PSL_H_ -#define _BSD_PPC_PSL_H_ +void +kxld_test_log(KXLDLogSubsystem sys __unused, KXLDLogLevel level __unused, + const char *format, va_list ap, void *user_data __unused) +{ + va_list args; -/* empty */ + va_copy(args, ap); + vfprintf(stderr, format, args); + fprintf(stderr, "\n"); + va_end(args); +} -#endif /* _BSD_PPC_PSL_H_ */ -#endif /* KERNEL_PRIVATE */ diff --git a/osfmk/ppc/machine_rpc.h b/libkern/kxld/tests/kxld_test.h similarity index 86% rename from osfmk/ppc/machine_rpc.h rename to libkern/kxld/tests/kxld_test.h index ffbf6c762..98e05c778 100644 --- a/osfmk/ppc/machine_rpc.h +++ b/libkern/kxld/tests/kxld_test.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002,2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,14 +25,9 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - * @OSF_COPYRIGHT@ - * - */ - -#ifndef _MACHINE_RPC_H_ -#define _MACHINE_RPC_H_ -#endif /* _MACHINE_RPC_H_ */ +#include "kxld_util.h" +void kxld_test_log(KXLDLogSubsystem sys, KXLDLogLevel level, + const char *format, va_list ap, void *user_data); diff --git a/libkern/kxld/tests/loadtest.py b/libkern/kxld/tests/loadtest.py index def56cfed..ce7a95d96 100644 --- a/libkern/kxld/tests/loadtest.py +++ b/libkern/kxld/tests/loadtest.py @@ -1,3 +1,31 @@ +## +# Copyright (c) 2009 Apple Inc. All rights reserved. +# +# @APPLE_OSREFERENCE_LICENSE_HEADER_START@ +# +# This file contains Original Code and/or Modifications of Original Code +# as defined in and that are subject to the Apple Public Source License +# Version 2.0 (the 'License'). You may not use this file except in +# compliance with the License. 
The rights granted to you under the License +# may not be used to create, or enable the creation or redistribution of, +# unlawful or unlicensed copies of an Apple operating system, or to +# circumvent, violate, or enable the circumvention or violation of, any +# terms of an Apple operating system software license agreement. +# +# Please obtain a copy of the License at +# http://www.opensource.apple.com/apsl/ and read it before using this file. +# +# The Original Code and all software distributed under the License are +# distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER +# EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +# INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. +# Please see the License for the specific language governing rights and +# limitations under the License. +# +# @APPLE_OSREFERENCE_LICENSE_HEADER_END@ +## + #!/usr/bin/env python import sys diff --git a/libkern/libkern/Makefile b/libkern/libkern/Makefile index 76e4d9f99..2d86d6882 100644 --- a/libkern/libkern/Makefile +++ b/libkern/libkern/Makefile @@ -11,21 +11,14 @@ INSTINC_SUBDIRS = \ machine \ c++ \ crypto - -INSTINC_SUBDIRS_PPC = ${INSTINC_SUBDIRS} \ - ppc - INSTINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS} \ i386 - INSTINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS} \ i386 - INSTINC_SUBDIRS_ARM = ${INSTINC_SUBDIRS} \ arm EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} -EXPINC_SUBDIRS_PPC = ${INSTINC_SUBDIRS_PPC} EXPINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS_I386} EXPINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS_X86_64} EXPINC_SUBDIRS_ARM = ${INSTINC_SUBDIRS_ARM} @@ -42,15 +35,16 @@ DATAFILES = \ OSTypes.h \ locks.h \ sysctl.h \ + tree.h \ zconf.h \ zlib.h PRIVATE_DATAFILES = \ - OSKextLibPrivate.h \ OSKextLibPrivate.h \ kext_request_keys.h \ mkext.h \ - prelink.h + prelink.h \ + WKdm.h INSTALL_MI_LIST = \ OSByteOrder.h \ @@ -65,6 +59,7 @@ INSTALL_MI_DIR = libkern INSTALL_MI_LCL_LIST = \ 
${INSTALL_MI_LIST} \ ${PRIVATE_DATAFILES} \ + tree.h \ kext_panic_report.h \ OSCrossEndian.h diff --git a/libkern/libkern/OSAtomic.h b/libkern/libkern/OSAtomic.h index 36d9127e2..72ff30594 100644 --- a/libkern/libkern/OSAtomic.h +++ b/libkern/libkern/OSAtomic.h @@ -41,6 +41,22 @@ extern "C" { #endif +#ifdef XNU_KERNEL_PRIVATE +/* + * The macro SAFE_CAST_PTR() casts one type of pointer to another type, making sure + * the data the pointer is referencing is the same size. If it is not, it will cause + * a division by zero compiler warning. This is to work around "SInt32" being defined + * as "long" on ILP32 and as "int" on LP64, which would require an explicit cast to + * "SInt32*" when for instance passing an "int*" to OSAddAtomic() - which masks size + * mismatches. + * -- var is used, but sizeof does not evaluate the + * argument, i.e. we're safe against "++" etc. in var -- + */ +#define __SAFE_CAST_PTR(type, var) (((type)(var))+(0/(sizeof(*var) == sizeof(*(type)0) ? 1 : 0))) +#else +#define __SAFE_CAST_PTR(type, var) ((type)(var)) +#endif + /*! * @header * @@ -64,6 +80,8 @@ extern Boolean OSCompareAndSwap64( UInt64 oldValue, UInt64 newValue, volatile UInt64 * address); +#define OSCompareAndSwap64(a, b, c) \ + (OSCompareAndSwap64(a, b, __SAFE_CAST_PTR(volatile UInt64*,c))) #endif /* defined(__i386__) || defined(__x86_64__) */ @@ -81,6 +99,8 @@ extern Boolean OSCompareAndSwap64( extern SInt64 OSAddAtomic64( SInt64 theAmount, volatile SInt64 * address); +#define OSAddAtomic64(a, b) \ + (OSAddAtomic64(a, __SAFE_CAST_PTR(volatile SInt64*,b))) /*! * @function OSIncrementAtomic64 @@ -126,6 +146,8 @@ inline static SInt64 OSDecrementAtomic64(volatile SInt64 * address) extern long OSAddAtomicLong( long theAmount, volatile long * address); +#define OSAddAtomicLong(a, b) \ + (OSAddAtomicLong(a, __SAFE_CAST_PTR(volatile long*,b))) /* Not to be included in headerdoc. 
* @@ -156,22 +178,6 @@ inline static long OSDecrementAtomicLong(volatile long * address) } #endif /* XNU_KERNEL_PRIVATE */ -/* - * The macro SAFE_CAST_PTR() casts one type of pointer to another type, making sure - * the data the pointer is referencing is the same size. If it is not, it will cause - * a division by zero compiler warning. This is to work around "SInt32" being defined - * as "long" on ILP32 and as "int" on LP64, which would require an explicit cast to - * "SInt32*" when for instance passing an "int*" to OSAddAtomic() - which masks size - * mismatches. - * -- var is used twice, but sizeof does not evaluate the - * argument, i.e. we're safe against "++" etc. in var -- - */ -#ifdef XNU_KERNEL_PRIVATE -#define SAFE_CAST_PTR(type, var) (((type)(var))+(0/(sizeof(*var) == sizeof(*(type)0) ? 1 : 0))) -#else -#define SAFE_CAST_PTR(type, var) ((type)(var)) -#endif - /*! * @function OSCompareAndSwap * @@ -193,7 +199,7 @@ extern Boolean OSCompareAndSwap( UInt32 newValue, volatile UInt32 * address); #define OSCompareAndSwap(a, b, c) \ - (OSCompareAndSwap(a, b, SAFE_CAST_PTR(volatile UInt32*,c))) + (OSCompareAndSwap(a, b, __SAFE_CAST_PTR(volatile UInt32*,c))) /*! * @function OSCompareAndSwapPtr @@ -215,7 +221,7 @@ extern Boolean OSCompareAndSwapPtr( void * newValue, void * volatile * address); #define OSCompareAndSwapPtr(a, b, c) \ - (OSCompareAndSwapPtr(a, b, SAFE_CAST_PTR(void * volatile *,c))) + (OSCompareAndSwapPtr(a, b, __SAFE_CAST_PTR(void * volatile *,c))) /*! * @function OSAddAtomic @@ -235,7 +241,7 @@ extern SInt32 OSAddAtomic( SInt32 amount, volatile SInt32 * address); #define OSAddAtomic(a, b) \ - (OSAddAtomic(a, SAFE_CAST_PTR(volatile SInt32*,b))) + (OSAddAtomic(a, __SAFE_CAST_PTR(volatile SInt32*,b))) /*! 
* @function OSAddAtomic16 @@ -288,7 +294,7 @@ extern SInt8 OSAddAtomic8( */ extern SInt32 OSIncrementAtomic(volatile SInt32 * address); #define OSIncrementAtomic(a) \ - (OSIncrementAtomic(SAFE_CAST_PTR(volatile SInt32*,a))) + (OSIncrementAtomic(__SAFE_CAST_PTR(volatile SInt32*,a))) /*! * @function OSIncrementAtomic16 @@ -335,7 +341,7 @@ extern SInt8 OSIncrementAtomic8(volatile SInt8 * address); */ extern SInt32 OSDecrementAtomic(volatile SInt32 * address); #define OSDecrementAtomic(a) \ - (OSDecrementAtomic(SAFE_CAST_PTR(volatile SInt32*,a))) + (OSDecrementAtomic(__SAFE_CAST_PTR(volatile SInt32*,a))) /*! * @function OSDecrementAtomic16 @@ -385,7 +391,7 @@ extern UInt32 OSBitAndAtomic( UInt32 mask, volatile UInt32 * address); #define OSBitAndAtomic(a, b) \ - (OSBitAndAtomic(a, SAFE_CAST_PTR(volatile UInt32*,b))) + (OSBitAndAtomic(a, __SAFE_CAST_PTR(volatile UInt32*,b))) /*! * @function OSBitAndAtomic16 @@ -441,7 +447,7 @@ extern UInt32 OSBitOrAtomic( UInt32 mask, volatile UInt32 * address); #define OSBitOrAtomic(a, b) \ - (OSBitOrAtomic(a, SAFE_CAST_PTR(volatile UInt32*,b))) + (OSBitOrAtomic(a, __SAFE_CAST_PTR(volatile UInt32*,b))) /*! * @function OSBitOrAtomic16 @@ -497,7 +503,7 @@ extern UInt32 OSBitXorAtomic( UInt32 mask, volatile UInt32 * address); #define OSBitXorAtomic(a, b) \ - (OSBitXorAtomic(a, SAFE_CAST_PTR(volatile UInt32*,b))) + (OSBitXorAtomic(a, __SAFE_CAST_PTR(volatile UInt32*,b))) /*! * @function OSBitXorAtomic16 @@ -571,44 +577,54 @@ extern Boolean OSTestAndClear( UInt32 bit, volatile UInt8 * startAddress); -#ifdef __ppc__ /*! - * @function OSEnqueueAtomic + * @defined OS_SPINLOCK_INIT * * @abstract - * Singly linked list head insertion, performed atomically with respect to all devices that participate in the coherency architecture of the platform. + * The default value for an OSSpinLock. 
* * @discussion - * The OSEnqueueAtomic function places an element at the head of a single linked list, which is specified with the address of a head pointer, listHead. The element structure has a next field whose offset is specified. + * The convention is that unlocked is zero, locked is nonzero. + */ +#define OS_SPINLOCK_INIT 0 + +/*! + * @typedef OSSpinLock * - * This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. Additionally, this function incorporates a memory barrier on systems with weakly-ordered memory architectures. - * @param listHead The address of a head pointer for the list . - * @param element The list element to insert at the head of the list. - * @param elementNextFieldOffset The byte offset into the element where a pointer to the next element in the list is stored. + * @abstract + * Data type for a spinlock. + * + * @discussion + * You should always initialize a spinlock to OS_SPINLOCK_INIT before using it. */ -extern void OSEnqueueAtomic( - void * volatile * listHead, - void * element, - SInt32 elementNextFieldOffset); +typedef SInt32 OSSpinLock; +#ifdef PRIVATE /*! - * @function OSDequeueAtomic + * @function OSSpinLockTry * * @abstract - * Singly linked list element head removal, performed atomically with respect to all devices that participate in the coherency architecture of the platform. + * Locks a spinlock if it would not block. * * @discussion - * The OSDequeueAtomic function removes an element from the head of a single linked list, which is specified with the address of a head pointer, listHead. The element structure has a next field whose offset is specified. + * Multiprocessor locks used within the shared memory area between the kernel and event system. These must work in both user and kernel mode. 
* - * This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. Additionally, this function incorporates a memory barrier on systems with weakly-ordered memory architectures. - * @param listHead The address of a head pointer for the list . - * @param elementNextFieldOffset The byte offset into the element where a pointer to the next element in the list is stored. - * @result A removed element, or zero if the list is empty. + * @result + * Returns false if the lock was already held by another thread, true if it took the lock successfully. + */ +extern Boolean OSSpinLockTry(volatile OSSpinLock * lock); + +/*! + * @function OSSpinLockUnlock + * + * @abstract + * Unlocks a spinlock. + * + * @discussion + * Unlocks a spinlock. */ -extern void * OSDequeueAtomic( - void * volatile * listHead, - SInt32 elementNextFieldOffset); -#endif /* __ppc__ */ +extern void OSSpinLockUnlock(volatile OSSpinLock * lock); +#endif /* PRIVATE */ /*! * @function OSSynchronizeIO @@ -621,9 +637,6 @@ extern void * OSDequeueAtomic( */ static __inline__ void OSSynchronizeIO(void) { -#if defined(__ppc__) - __asm__ ("eieio"); -#endif } #if defined(__cplusplus) diff --git a/libkern/libkern/OSAtomic.h.save b/libkern/libkern/OSAtomic.h.save deleted file mode 100644 index 1870272b5..000000000 --- a/libkern/libkern/OSAtomic.h.save +++ /dev/null @@ -1,305 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. 
- * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. - * - * HISTORY - * - */ - -#ifndef _OS_OSATOMIC_H -#define _OS_OSATOMIC_H - -#include - -#if defined(__cplusplus) -extern "C" { -#endif - -/*! @function OSCompareAndSwap - @abstract Compare and swap operation, performed atomically with respect to all devices that participate in the coherency architecture of the platform. - @discussion The OSCompareAndSwap function compares the value at the specified address with oldVal. The value of newValue is written to the address only if oldValue and the value at the address are equal. OSCompareAndSwap returns true if newValue is written to the address; otherwise, it returns false. - - This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. - @param oldValue The value to compare at address. - @param newValue The value to write to address if oldValue compares true. - @param address The 4-byte aligned address of the data to update atomically. - @result true if newValue was written to the address. */ - -extern Boolean OSCompareAndSwap( UInt32 oldValue, UInt32 newValue, UInt32 * address ); - -/*! @function OSAddAtomic - @abstract 32-bit add operation, performed atomically with respect to all devices that participate in the coherency architecture of the platform. 
- @discussion The OSAddAtomic function adds the specified amount to the value at the specified address and returns the original value. - - This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. - @param amount The amount to add. - @param address The 4-byte aligned address of the value to update atomically. - @result The value before the addition */ - -extern SInt32 OSAddAtomic(SInt32 amount, SInt32 * address); - -/*! @function OSAddAtomic16 - @abstract 16-bit add operation, performed atomically with respect to all devices that participate in the coherency architecture of the platform. - @discussion The OSAddAtomic16 function adds the specified amount to the value at the specified address and returns the original value. - - This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. - @param amount The amount to add. - @param address The 2-byte aligned address of the value to update atomically. - @result The value before the addition */ - -extern SInt16 OSAddAtomic16(SInt32 amount, SInt16 * address); - -/*! @function OSAddAtomic8 - @abstract 8-bit add operation, performed atomically with respect to all devices that participate in the coherency architecture of the platform. - @discussion The OSAddAtomic8 function adds the specified amount to the value at the specified address and returns the original value. - - This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. - @param amount The amount to add. 
- @param address The address of the value to update atomically. - @result The value before the addition */ - -extern SInt8 OSAddAtomic8(SInt32 amount, SInt8 * address); - -/*! @function OSIncrementAtomic - @abstract 32-bit increment operation, performed atomically with respect to all devices that participate in the coherency architecture of the platform. - @discussion The OSIncrementAtomic function increments the value at the specified address by one and returns the original value. - - This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. - @param address The 4-byte aligned address of the value to update atomically. - @result The value before the increment. */ - -extern SInt32 OSIncrementAtomic(SInt32 * address); - -/*! @function OSIncrementAtomic16 - @abstract 16-bit increment operation, performed atomically with respect to all devices that participate in the coherency architecture of the platform. - @discussion The OSIncrementAtomic16 function increments the value at the specified address by one and returns the original value. - - This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. - @param address The 2-byte aligned address of the value to update atomically. - @result The value before the increment. */ - -extern SInt16 OSIncrementAtomic16(SInt16 * address); - -/*! @function OSIncrementAtomic8 - @abstract 8-bit increment operation, performed atomically with respect to all devices that participate in the coherency architecture of the platform. - @discussion The OSIncrementAtomic8 function increments the value at the specified address by one and returns the original value. 
- - This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. - @param address The address of the value to update atomically. - @result The value before the increment. */ - -extern SInt8 OSIncrementAtomic8(SInt8 * address); - -/*! @function OSDecrementAtomic - @abstract 32-bit decrement operation, performed atomically with respect to all devices that participate in the coherency architecture of the platform. - @discussion The OSDecrementAtomic function decrements the value at the specified address by one and returns the original value. - - This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. - @param address The 4-byte aligned address of the value to update atomically. - @result The value before the decrement. */ - -extern SInt32 OSDecrementAtomic(SInt32 * address); - -/*! @function OSDecrementAtomic16 - @abstract 16-bit decrement operation, performed atomically with respect to all devices that participate in the coherency architecture of the platform. - @discussion The OSDecrementAtomic16 function decrements the value at the specified address by one and returns the original value. - - This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. - @param address The 2-byte aligned address of the value to update atomically. - @result The value before the decrement. */ - -extern SInt16 OSDecrementAtomic16(SInt16 * address); - -/*! 
@function OSDecrementAtomic8 - @abstract 8-bit decrement operation, performed atomically with respect to all devices that participate in the coherency architecture of the platform. - @discussion The OSDecrementAtomic8 function decrements the value at the specified address by one and returns the original value. - - This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. - @param address The address of the value to update atomically. - @result The value before the decrement. */ - -extern SInt8 OSDecrementAtomic8(SInt8 * address); - -/*! @function OSBitAndAtomic - @abstract 32-bit logical and operation, performed atomically with respect to all devices that participate in the coherency architecture of the platform. - @discussion The OSBitAndAtomic function logically ands the bits of the specified mask into the value at the specified address and returns the original value. - - This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. - @param mask The mask to logically and with the value. - @param address The 4-byte aligned address of the value to update atomically. - @result The value before the bitwise operation */ - -extern UInt32 OSBitAndAtomic(UInt32 mask, UInt32 * address); - -/*! @function OSBitAndAtomic16 - @abstract 16-bit logical and operation, performed atomically with respect to all devices that participate in the coherency architecture of the platform. - @discussion The OSBitAndAtomic16 function logically ands the bits of the specified mask into the value at the specified address and returns the original value. - - This function guarantees atomicity only with main system memory. 
It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. - @param mask The mask to logically and with the value. - @param address The 2-byte aligned address of the value to update atomically. - @result The value before the bitwise operation. */ - -extern UInt16 OSBitAndAtomic16(UInt32 mask, UInt16 * address); - -/*! @function OSBitAndAtomic8 - @abstract 8-bit logical and operation, performed atomically with respect to all devices that participate in the coherency architecture of the platform. - @discussion The OSBitAndAtomic8 function logically ands the bits of the specified mask into the value at the specified address and returns the original value. - - This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. - @param mask The mask to logically and with the value. - @param address The address of the value to update atomically. - @result The value before the bitwise operation. */ - -extern UInt8 OSBitAndAtomic8(UInt32 mask, UInt8 * address); - -/*! @function OSBitOrAtomic - @abstract 32-bit logical or operation, performed atomically with respect to all devices that participate in the coherency architecture of the platform. - @discussion The OSBitOrAtomic function logically ors the bits of the specified mask into the value at the specified address and returns the original value. - - This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. - @param mask The mask to logically or with the value. - @param address The 4-byte aligned address of the value to update atomically. 
- @result The value before the bitwise operation. */ - -extern UInt32 OSBitOrAtomic(UInt32 mask, UInt32 * address); - -/*! @function OSBitOrAtomic16 - @abstract 16-bit logical or operation, performed atomically with respect to all devices that participate in the coherency architecture of the platform. - @discussion The OSBitOrAtomic16 function logically ors the bits of the specified mask into the value at the specified address and returns the original value. - - This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. - @param mask The mask to logically or with the value. - @param address The 2-byte aligned address of the value to update atomically. - @result The value before the bitwise operation. */ - -extern UInt16 OSBitOrAtomic16(UInt32 mask, UInt16 * address); - -/*! @function OSBitOrAtomic8 - @abstract 8-bit logical or operation, performed atomically with respect to all devices that participate in the coherency architecture of the platform. - - This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. - @discussion The OSBitOrAtomic8 function logically ors the bits of the specified mask into the value at the specified address and returns the original value. - @param mask The mask to logically or with the value. - @param address The address of the value to update atomically. - @result The value before the bitwise operation. */ - -extern UInt8 OSBitOrAtomic8(UInt32 mask, UInt8 * address); - -/*! @function OSBitXorAtomic - @abstract 32-bit logical xor operation, performed atomically with respect to all devices that participate in the coherency architecture of the platform. 
- - This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. - @discussion The OSBitXorAtomic function logically xors the bits of the specified mask into the value at the specified address and returns the original value. - @param mask The mask to logically or with the value. - @param address The 4-byte aligned address of the value to update atomically. - @result The value before the bitwise operation. */ - -extern UInt32 OSBitXorAtomic(UInt32 mask, UInt32 * address); - -/*! @function OSBitXorAtomic16 - @abstract 16-bit logical xor operation, performed atomically with respect to all devices that participate in the coherency architecture of the platform. - @discussion The OSBitXorAtomic16 function logically xors the bits of the specified mask into the value at the specified address and returns the original value. - - This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. - @param mask The mask to logically or with the value. - @param address The 2-byte aligned address of the value to update atomically. - @result The value before the bitwise operation. */ - -extern UInt16 OSBitXorAtomic16(UInt32 mask, UInt16 * address); - -/*! @function OSBitXorAtomic8 - @abstract 8-bit logical xor operation, performed atomically with respect to all devices that participate in the coherency architecture of the platform. - - This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. 
- @discussion The OSBitXorAtomic8 function logically xors the bits of the specified mask into the value at the specified address and returns the original value. - @param mask The mask to logically or with the value. - @param address The address of the value to update atomically. - @result The value before the bitwise operation. */ - -extern UInt8 OSBitXorAtomic8(UInt32 mask, UInt8 * address); - -/*! @function OSTestAndSet - @abstract Bit test and set operation, performed atomically with respect to all devices that participate in the coherency architecture of the platform. - - This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. - @discussion The OSTestAndSet function sets a single bit in a byte at a specified address. It returns true if the bit was already set, false otherwise. - @param bit The bit number in the range 0 through 7. - @param address The address of the byte to update atomically. - @result true if the bit was already set, false otherwise. */ - -extern Boolean OSTestAndSet(UInt32 bit, UInt8 * startAddress); - -/*! @function OSTestAndClear - @abstract Bit test and clear operation, performed atomically with respect to all devices that participate in the coherency architecture of the platform. - @discussion The OSTestAndClear function clears a single bit in a byte at a specified address. It returns true if the bit was already clear, false otherwise. - - This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. - @param bit The bit number in the range 0 through 7. - @param address The address of the byte to update atomically. - @result true if the bit was already clear, false otherwise. 
*/ - -extern Boolean OSTestAndClear(UInt32 bit, UInt8 * startAddress); - -#ifdef __ppc__ -/*! @function OSEnqueueAtomic - @abstract Singly linked list head insertion, performed atomically with respect to all devices that participate in the coherency architecture of the platform. - @discussion The OSEnqueueAtomic function places an element at the head of a single linked list, which is specified with the address of a head pointer, listHead. The element structure has a next field whose offset is specified. - - This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. - @param listHead The address of a head pointer for the list . - @param element The list element to insert at the head of the list. - @param elementNextFieldOffset The byte offset into the element where a pointer to the next element in the list is stored. */ - -extern void OSEnqueueAtomic(void ** listHead, void * element, - SInt32 elementNextFieldOffset); - -/*! @function OSDequeueAtomic - @abstract Singly linked list element head removal, performed atomically with respect to all devices that participate in the coherency architecture of the platform. - @discussion The OSDequeueAtomic function removes an element from the head of a single linked list, which is specified with the address of a head pointer, listHead. The element structure has a next field whose offset is specified. - - This function guarantees atomicity only with main system memory. It is specifically unsuitable for use on noncacheable memory such as that in devices; this function cannot guarantee atomicity, for example, on memory mapped from a PCI device. - @param listHead The address of a head pointer for the list . - @param elementNextFieldOffset The byte offset into the element where a pointer to the next element in the list is stored. 
- @result A removed element, or zero if the list is empty. */ - -extern void * OSDequeueAtomic(void ** listHead, - SInt32 elementNextFieldOffset); -#endif /* __ppc__ */ - -/*! @function OSSynchronizeIO - @abstract The OSSynchronizeIO routine ensures orderly load and store operations to noncached memory mapped I/O devices. - @discussion The OSSynchronizeIO routine ensures orderly load and store operations to noncached memory mapped I/O devices. It executes the eieio instruction on PowerPC processors. */ - -static __inline__ void OSSynchronizeIO(void) -{ -#if defined(__ppc__) - __asm__ ("eieio"); -#endif -} - -#if defined(__cplusplus) -} -#endif - -#endif /* ! _OS_OSATOMIC_H */ diff --git a/libkern/libkern/OSByteOrder.h b/libkern/libkern/OSByteOrder.h index cb12cb31a..8ae2c33b8 100644 --- a/libkern/libkern/OSByteOrder.h +++ b/libkern/libkern/OSByteOrder.h @@ -39,9 +39,7 @@ #if defined(__GNUC__) -#if (defined(__ppc__) || defined(__ppc64__)) -#include -#elif (defined(__i386__) || defined(__x86_64__)) +#if (defined(__i386__) || defined(__x86_64__)) #include #else #include diff --git a/libkern/libkern/OSCrossEndian.h b/libkern/libkern/OSCrossEndian.h index 0bbbf58e3..6038319ef 100644 --- a/libkern/libkern/OSCrossEndian.h +++ b/libkern/libkern/OSCrossEndian.h @@ -60,29 +60,8 @@ #include -#if __ppc__ - -static __inline__ int -_OSRosettaCheck(void) -{ - int isCrossEndian = 0; - int val = 0; - size_t size = sizeof val; - - if (sysctlbyname("sysctl.proc_native", &val, &size, NULL, 0) == -1) - isCrossEndian = 0; - else - isCrossEndian = val ? 
0 : 1; - - return isCrossEndian; -} - -#else /* __ppc__ */ - static __inline__ int _OSRosettaCheck(void) { return 0; } -#endif /* __ppc__ */ - #define IF_ROSETTA() if (__builtin_expect(_OSRosettaCheck(), 0) ) #define ROSETTA_ONLY(exprs) \ diff --git a/libkern/libkern/OSDebug.h b/libkern/libkern/OSDebug.h index eaeefc129..84611f320 100644 --- a/libkern/libkern/OSDebug.h +++ b/libkern/libkern/OSDebug.h @@ -48,6 +48,9 @@ extern void trace_backtrace(unsigned int debugid, unsigned int debugid2, unsigne extern void OSReportWithBacktrace(const char *str, ...); extern unsigned OSBacktrace(void **bt, unsigned maxAddrs); +/* Simple dump of 20 backtrace entries */ +extern void OSPrintBacktrace(void); + /*! @function OSKernelStackRemaining @abstract Returns bytes available below the current stack frame. @discussion Returns bytes available below the current stack frame. Safe for interrupt or thread context. diff --git a/libkern/libkern/OSKextLib.h b/libkern/libkern/OSKextLib.h index 9842b7546..6ecc3548d 100644 --- a/libkern/libkern/OSKextLib.h +++ b/libkern/libkern/OSKextLib.h @@ -671,6 +671,11 @@ OSReturn OSKextReleaseKextWithLoadTag(OSKextLoadTag loadTag); #pragma mark Kext Requests /********************************************************************/ #endif +/*! + * @group Kext Requests to User Space + * Functions for making requests to kextd in user space. + */ + /*! * @typedef OSKextRequestTag * @@ -679,9 +684,15 @@ OSReturn OSKextReleaseKextWithLoadTag(OSKextLoadTag loadTag); */ typedef uint32_t OSKextRequestTag; +/*! + * @define kOSKextRequestTagInvalid + * + * @abstract + * A request tag value that will never be used for a kext request; + * indicates failure to create/queue the request. + */ #define kOSKextRequestTagInvalid ((OSKextRequestTag)-1) - /*! * @typedef OSKextRequestResourceCallback * @@ -732,7 +743,10 @@ typedef void (* OSKextRequestResourceCallback)( * when it is invoked. May be NULL. 
* @param requestTagOut If non-NULL, * filled on success with a tag identifying the - * pending request; can be used with + * pending request + * (or on failure with @link kOSKextRequestTagInvalid + * kOSKextRequestTagInvalid@/link); + * can be used with * @link OSKextCancelRequest * OSKextCancelRequest@/link. * @@ -748,12 +762,23 @@ typedef void (* OSKextRequestResourceCallback)( * Other OSKextReturn... errors are possible. * * @discussion - * This function queues a request to the user-space kext daemon + * This function queues an asynchronous request to the user-space kext daemon * @link //apple_ref/doc/man/8/kextd kextd(8)@/link; * requests for resources early in system startup * will not be fulfilled until that daemon starts. - * Note also that the localization context of the kext daemon - * (namely tha tof the superuser) + * Requests made by a kext while that kext is loading + * (specifically in the kext's module start routine) + * will not be fulfilled until after the start routine returns and + * the kext is completely loaded. + * Kexts requesting resources should be sure to perform appropriate locking + * in the callback function. + * + * Kext resources are stored in the kext's on-disk bundle under the + * Resources subdirectory. + * See {@linkdoc //apple_ref/doc/uid/10000123i Bundle Programming Guide} + * for an overview of bundle structure. + * The localization context of the kext daemon + * (namely that of the superuser) * will be used in retrieving resources; * kext resources intended for use in the kernel * should generally not be localized. @@ -828,15 +853,12 @@ OSReturn OSKextCancelRequest( void ** contextOut); -#if (__x86_64__) - #if PRAGMA_MARK #pragma mark - /********************************************************************/ #pragma mark Weak linking /********************************************************************/ #endif - /*! * @group Weak Linking * Support for weak references to symbols in kexts. 
@@ -894,8 +916,6 @@ extern const void * gOSKextUnresolved; #define OSKextSymbolIsResolved(weak_sym) \ (&(weak_sym) != gOSKextUnresolved) -#endif /* (__x86_64__) */ - #endif /* KERNEL */ __END_DECLS diff --git a/libkern/libkern/OSKextLibPrivate.h b/libkern/libkern/OSKextLibPrivate.h index cc4f3aa29..53fbc3921 100644 --- a/libkern/libkern/OSKextLibPrivate.h +++ b/libkern/libkern/OSKextLibPrivate.h @@ -30,6 +30,7 @@ #define _LIBKERN_OSKEXTLIBPRIVATE_H #include +#include __BEGIN_DECLS #ifdef KERNEL @@ -38,6 +39,7 @@ __BEGIN_DECLS #include #else #include +#include #endif /* KERNEL */ __END_DECLS @@ -85,15 +87,17 @@ typedef uint8_t OSKextExcludeLevel; #endif /********************************************************************* * In addition to the keys defined here, you will find: -* CFBundleIdentifier -* CFBundleVersion -* OSBundleCompatibleVersion -* OSKernelResource -* OSBundleInterface +* kCFBundleIdentifierKey +* kCFBundleVersionKey +* kOSBundleCompatibleVersionKey +* kOSBundleIsInterfaceKey +* kOSKernelResourceKey *********************************************************************/ +#define kOSBundleMachOHeadersKey "OSBundleMachOHeaders" #define kOSBundleCPUTypeKey "OSBundleCPUType" #define kOSBundleCPUSubtypeKey "OSBundleCPUSubtype" #define kOSBundlePathKey "OSBundlePath" +#define kOSBundleExecutablePathKey "OSBundleExecutablePath" #define kOSBundleUUIDKey "OSBundleUUID" #define kOSBundleStartedKey "OSBundleStarted" #define kOSBundlePrelinkedKey "OSBundlePrelinked" @@ -104,8 +108,13 @@ typedef uint8_t OSKextExcludeLevel; #define kOSBundleDependenciesKey "OSBundleDependencies" #define kOSBundleRetainCountKey "OSBundleRetainCount" +/* Dictionary of metaclass info keyed by classname. + */ #define kOSBundleClassesKey "OSBundleClasses" +/* These are contained in kOSBundleClassesKey. kOSMetaClassSuperclassNameKey + * may be absent (for the root class). 
+ */ #define kOSMetaClassNameKey "OSMetaClassName" #define kOSMetaClassSuperclassNameKey "OSMetaClassSuperclassName" #define kOSMetaClassTrackingCountKey "OSMetaClassTrackingCount" @@ -653,15 +662,6 @@ Boolean OSKextVersionGetString( void kext_weak_symbol_referenced(void); #endif /* XNU_KERNEL_PRIVATE */ -#if !(__x86_64__) - -extern const void *gOSKextUnresolved; - -#define OSKextSymbolIsResolved(weak_sym) \ - (&(weak_sym) != gOSKextUnresolved) - -#endif /* !(__x86_64__) */ - #if PRAGMA_MARK #pragma mark - /********************************************************************/ @@ -681,6 +681,17 @@ vm_map_t kext_get_vm_map(kmod_info_t * info); #ifdef XNU_KERNEL_PRIVATE +#if CONFIG_DTRACE +/*! + * @function OSKextRegisterKextsWithDTrace + * @abstract + * DTrace calls this once when it has started up so that the kext system + * will register any already-loaded kexts with it. + */ +void OSKextRegisterKextsWithDTrace(void); + +#endif /* CONFIG_DTRACE */ + /*! * @function kext_dump_panic_lists * @abstract Prints compacted lists of last unloaded & all loaded kexts @@ -787,6 +798,89 @@ OSReturn OSKextUnloadKextWithLoadTag(uint32_t loadTag); #endif /* KERNEL */ +#if PRAGMA_MARK +#pragma mark - +/********************************************************************/ +#pragma mark Loaded Kext Summary +/********************************************************************/ +#endif + +/*! + * @define kOSKextLoadedKextSummaryVersion + * @abstract The current version of the loaded kext summary headers. + */ +#define kOSKextLoadedKextSummaryVersion 2 + +/*! + * @typedef OSKextLoadedKextSummary + * @abstract A structure that describes a loaded kext. + * + * @field name The kext's bundle identifier. + * @field uuid The kext's UUID; + * @field address The kext's load address. + * @field size The kext's load size. + * @field version The binary format (OSKextVersion) version of the kext. + * @field loadTag The kext's load tag. + * @field flags Internal tracking flags. 
+ * @field reference_list List of the kexts this kext references (links against). + * + * @discussion + * The OSKextLoadedKextSummary structure contains a basic set of information + * about the kext to facilitate kext debugging and panic debug log output. + */ +typedef struct _loaded_kext_summary { + char name[KMOD_MAX_NAME]; + uuid_t uuid; + uint64_t address; + uint64_t size; + uint64_t version; + uint32_t loadTag; + uint32_t flags; + uint64_t reference_list; +} OSKextLoadedKextSummary; + +/*! + * @typedef OSKextLoadedKextSummaryHeader + * @abstract A structure that describes the set of loaded kext summaries. + * + * @field version The version of the loaded kext summary structures. + * @field entry_size The size of each entry in summaries. + * @field numSummaries The number of OSKextLoadedKextSummary structures + * following the header. + * @field summaries A convenience pointer to the array of summaries following + * the header. + * + * @discussion + * The OSKextLoadedKextSummaryHeader describes the set of loaded kext summaries + * available for use by the debugger or panic log routine. + * The array of summaries contains one OSKextLoadedKextSummary for every kext + * that declares an executable and is not an interface to the kernel. + */ +typedef struct _loaded_kext_summary_header { + uint32_t version; + uint32_t entry_size; + uint32_t numSummaries; + uint32_t reserved; /* explicit alignment for gdb */ + OSKextLoadedKextSummary summaries[0]; +} OSKextLoadedKextSummaryHeader; + +/*! + * @var gLoadedKextSummaries + * @abstract The global pointer to the current set of loaded kext summaries. + */ +extern OSKextLoadedKextSummaryHeader * gLoadedKextSummaries; + +/*! + * @function OSKextLoadedKextSummariesUpdated + * @abstract Called when gLoadedKextSummaries has been updated. + * + * @discussion + * gLoadedKextSummaries is updated when a kext is loaded or unloaded. + * When the update is complete, OSKextLoadedKextSummariesUpdated is called. 
+ * gdb can set a breakpoint on this function to detect kext loads and unloads. + */ +void OSKextLoadedKextSummariesUpdated(void); + __END_DECLS #endif /* ! _LIBKERN_OSKEXTLIBPRIVATE_H */ diff --git a/iokit/Kernel/WKdm.h b/libkern/libkern/WKdm.h similarity index 97% rename from iokit/Kernel/WKdm.h rename to libkern/libkern/WKdm.h index fc73454ae..f88b9971b 100644 --- a/iokit/Kernel/WKdm.h +++ b/libkern/libkern/WKdm.h @@ -68,11 +68,11 @@ typedef unsigned int WK_word; /* the next few are used during compression to write the header */ #define SET_QPOS_AREA_START(compr_dest_buf,qpos_start_addr) \ - (compr_dest_buf[1] = qpos_start_addr - compr_dest_buf) + (compr_dest_buf[1] = (unsigned int)(qpos_start_addr - compr_dest_buf)) #define SET_LOW_BITS_AREA_START(compr_dest_buf,lb_start_addr) \ - (compr_dest_buf[2] = lb_start_addr - compr_dest_buf) + (compr_dest_buf[2] = (unsigned int)(lb_start_addr - compr_dest_buf)) #define SET_LOW_BITS_AREA_END(compr_dest_buf,lb_end_addr) \ - (compr_dest_buf[3] = lb_end_addr - compr_dest_buf) + (compr_dest_buf[3] = (unsigned int)(lb_end_addr - compr_dest_buf)) /* the next few are only use during decompression to read the header */ #define TAGS_AREA_START(decomp_src_buf) \ diff --git a/libkern/libkern/_OSByteOrder.h b/libkern/libkern/_OSByteOrder.h index f01425b02..3ceec32eb 100644 --- a/libkern/libkern/_OSByteOrder.h +++ b/libkern/libkern/_OSByteOrder.h @@ -69,7 +69,7 @@ #define __DARWIN_OSSwapInt16(x) \ - (__builtin_constant_p(x) ? __DARWIN_OSSwapConstInt16(x) : _OSSwapInt16(x)) + ((__uint16_t)(__builtin_constant_p(x) ? __DARWIN_OSSwapConstInt16(x) : _OSSwapInt16(x))) #define __DARWIN_OSSwapInt32(x) \ (__builtin_constant_p(x) ? 
__DARWIN_OSSwapConstInt32(x) : _OSSwapInt32(x)) diff --git a/libkern/libkern/c++/Makefile b/libkern/libkern/c++/Makefile index 4d2eb7d29..8045763a1 100644 --- a/libkern/libkern/c++/Makefile +++ b/libkern/libkern/c++/Makefile @@ -8,8 +8,6 @@ include $(MakeInc_def) INSTINC_SUBDIRS = -INSTINC_SUBDIRS_PPC = - INSTINC_SUBDIRS_I386 = INSTINC_SUBDIRS_X86_64 = @@ -18,8 +16,6 @@ INSTINC_SUBDIRS_ARM = EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} -EXPINC_SUBDIRS_PPC = ${INSTINC_SUBDIRS_PPC} - EXPINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS_I386} EXPINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS_X86_64} diff --git a/libkern/libkern/c++/OSKext.h b/libkern/libkern/c++/OSKext.h index 312d53993..d3f0fa232 100644 --- a/libkern/libkern/c++/OSKext.h +++ b/libkern/libkern/c++/OSKext.h @@ -32,6 +32,7 @@ extern "C" { #include #include +#include #include #include @@ -96,11 +97,11 @@ kern_return_t is_io_catalog_send_data( void kmod_dump_log(vm_offset_t*, unsigned int); -#if __ppc__ || __i386__ +#if __i386__ kern_return_t kext_get_kmod_info( kmod_info_array_t * kmod_list, mach_msg_type_number_t * kmodCount); -#endif /* __ppc__ || __i386__ */ +#endif /* __i386__ */ #endif /* XNU_KERNEL_PRIVATE */ }; @@ -123,7 +124,6 @@ class OSKext : public OSObject /**************************************/ #endif friend class IOCatalogue; - friend class IOPMrootDomain; friend class KLDBootstrap; friend class OSMetaClass; @@ -183,11 +183,11 @@ class OSKext : public OSObject friend void kmod_dump_log(vm_offset_t*, unsigned int); friend void kext_dump_panic_lists(int (*printf_func)(const char * fmt, ...)); -#if __ppc__ || __i386__ +#if __i386__ friend kern_return_t kext_get_kmod_info( kmod_info_array_t * kmod_list, mach_msg_type_number_t * kmodCount); -#endif /* __ppc__ || __i386__ */ +#endif /* __i386__ */ #endif /* XNU_KERNEL_PRIVATE */ @@ -200,6 +200,7 @@ class OSKext : public OSObject const OSSymbol * bundleID; OSString * path; // not necessarily correct :-/ + OSString * executableRelPath; // relative to bundle OSKextVersion 
version; // parsed OSKextVersion compatibleVersion; // parsed @@ -213,14 +214,13 @@ class OSKext : public OSObject OSArray * dependencies; // kernel resource does not have any; // links directly to kernel - OSData * linkState; // only kept for libraries /* Only real kexts have these; interface kexts do not. */ OSData * linkedExecutable; OSSet * metaClasses; // for C++/OSMetaClass kexts - /* Only interface kexts have these; interface kexts can get at them + /* Only interface kexts have these; non-interface kexts can get at them * in the linked Executable. */ OSData * interfaceUUID; @@ -229,11 +229,13 @@ class OSKext : public OSObject unsigned int loggingEnabled:1; unsigned int hasAllDependencies:1; + unsigned int hasBleedthrough:1; unsigned int interface:1; unsigned int kernelComponent:1; unsigned int prelinked:1; unsigned int loaded:1; + unsigned int dtraceInitialized:1; unsigned int starting:1; unsigned int started:1; unsigned int stopping:1; @@ -250,15 +252,18 @@ class OSKext : public OSObject #pragma mark Private Functions /**************************************/ #endif -private: +#ifdef XNU_KERNEL_PRIVATE /* Startup/shutdown phases. */ +public: static void initialize(void); static OSDictionary * copyKexts(void); static OSReturn removeKextBootstrap(void); static void willShutdown(void); // called by IOPMrootDomain on shutdown +#endif /* XNU_KERNEL_PRIVATE */ +private: /* Called by power management at sleep/shutdown. */ static bool setLoadEnabled(bool flag); @@ -338,7 +343,6 @@ class OSKext : public OSObject const void * mkextFileBase, const void * entry); - /* Dependencies. 
*/ virtual bool resolveDependencies( @@ -377,20 +381,33 @@ class OSKext : public OSObject OSKextExcludeLevel startMatchingOpt = kOSKextExcludeAll, OSArray * personalityNames = NULL); // priv/prot virtual OSReturn unload(void); + virtual OSReturn queueKextNotification( + const char * notificationName, + OSString * kextIdentifier); static void recordIdentifierRequest( OSString * kextIdentifier); virtual OSReturn loadExecutable(void); + virtual void jettisonLinkeditSegment(void); + virtual OSReturn removeLinkeditHeaders(kernel_segment_command_t *linkedit); static void considerDestroyingLinkContext(void); - static OSData * getKernelLinkState(void); virtual OSData * getExecutable(void); virtual void setLinkedExecutable(OSData * anExecutable); + +#if CONFIG_DTRACE + friend void OSKextRegisterKextsWithDTrace(void); + static void registerKextsWithDTrace(void); + virtual void registerWithDTrace(void); + virtual void unregisterWithDTrace(void); +#endif /* CONFIG_DTRACE */ virtual OSReturn start(bool startDependenciesFlag = true); virtual OSReturn stop(void); virtual OSReturn setVMProtections(void); + virtual boolean_t segmentShouldBeWired(kernel_segment_command_t *seg); virtual OSReturn validateKextMapping(bool startFlag); + virtual boolean_t verifySegmentMapping(kernel_segment_command_t *seg); static OSArray * copyAllKextPersonalities( bool filterSafeBootFlag = false); @@ -409,10 +426,18 @@ class OSKext : public OSObject static OSReturn autounloadKext(OSKext * aKext); + /* Sync with user space. + */ + static OSReturn pingKextd(void); + /* Getting info about loaded kexts (kextstat). */ - static OSArray * copyLoadedKextInfo(OSArray * kextIdentifiers); - virtual OSDictionary * copyInfo(void); + static OSDictionary * copyLoadedKextInfo( + OSArray * kextIdentifiers = NULL, + OSArray * keys = NULL); + virtual OSDictionary * copyInfo(OSArray * keys = NULL); + + static OSData * copySanitizedKernelImage(void); /* Logging to user space. 
*/ @@ -437,6 +462,8 @@ class OSKext : public OSObject virtual void reportOSMetaClassInstances( OSKextLogSpec msgLogSpec); + /* Resource requests and other callback stuff. + */ static OSReturn dispatchResource(OSDictionary * requestDict); static OSReturn dequeueCallbackForRequestTag( @@ -460,6 +487,14 @@ class OSKext : public OSObject unsigned int cnt, int (* printf_func)(const char *fmt, ...), bool lockFlag); + static boolean_t summaryIsInBacktrace( + OSKextLoadedKextSummary * summary, + vm_offset_t * addr, + unsigned int cnt); + static void printSummary( + OSKextLoadedKextSummary * summary, + int (* printf_func)(const char *fmt, ...)); + static uint32_t saveLoadedKextPanicListTyped( const char * prefix, int invertFlag, @@ -468,21 +503,25 @@ class OSKext : public OSObject uint32_t list_size, uint32_t * list_length_ptr); static void saveLoadedKextPanicList(void); - static void saveUnloadedKextPanicList(OSKext * aKext); + void savePanicString(bool isLoading); static void printKextPanicLists(int (*printf_func)(const char *fmt, ...)); + /* Kext summary support. + */ + static void updateLoadedKextSummaries(void); + void updateLoadedKextSummary(OSKextLoadedKextSummary *summary); + /* C++ Initialization. */ - virtual void setCPPInitialized(bool initialized=true); -#if __ppc__ || __i386__ +#if __i386__ /* Backward compatibility for kmod_get_info() MIG call. 
*/ static kern_return_t getKmodInfo( kmod_info_array_t * kmodList, mach_msg_type_number_t * kmodCount); -#endif /* __ppc__ || __i386__ */ +#endif /* __i386__ */ #if PRAGMA_MARK @@ -530,29 +569,41 @@ class OSKext : public OSObject OSKextRequestTag requestTag, void ** contextOut); - static void considerUnloads(Boolean rescheduleOnlyFlag = false); - static void flushNonloadedKexts(Boolean flushPrelinkedKexts); - static void setKextdActive(Boolean active = true); - static void setDeferredLoadSucceeded(Boolean succeeded = true); - static void considerRebuildOfPrelinkedKernel(void); + static void considerUnloads(Boolean rescheduleOnlyFlag = false); + static void flushNonloadedKexts(Boolean flushPrelinkedKexts); + static void setKextdActive(Boolean active = true); + static void setDeferredLoadSucceeded(Boolean succeeded = true); + static void considerRebuildOfPrelinkedKernel(OSString * moduleName); - virtual bool setAutounloadEnabled(bool flag); + virtual bool setAutounloadEnabled(bool flag); virtual const OSSymbol * getIdentifier(void); virtual const char * getIdentifierCString(void); virtual OSKextVersion getVersion(void); virtual OSKextVersion getCompatibleVersion(void); + virtual bool isLibrary(void); virtual bool isCompatibleWithVersion(OSKextVersion aVersion); virtual OSObject * getPropertyForHostArch(const char * key); virtual OSKextLoadTag getLoadTag(void); + virtual void getSizeInfo(uint32_t *loadSize, uint32_t *wiredSize); virtual OSData * copyUUID(void); virtual OSArray * copyPersonalitiesArray(void); + + /* This removes personalities naming the kext (by CFBundleIdentifier), + * not all personalities defined by the kext (IOPersonalityPublisher or CFBundleIdentifier). + */ virtual void removePersonalitiesFromCatalog(void); + /* Converts common string-valued properties to OSSymbols for lower memory consumption. 
+ */ + static void uniquePersonalityProperties(OSDictionary * personalityDict); + virtual bool declaresExecutable(void); // might be missing virtual bool isInterface(void); + virtual bool isKernel(void); virtual bool isKernelComponent(void); + virtual bool isExecutable(void); virtual bool isLoadableInSafeBoot(void); virtual bool isPrelinked(void); virtual bool isLoaded(void); diff --git a/libkern/libkern/c++/OSMetaClass.h b/libkern/libkern/c++/OSMetaClass.h index 85f9553e0..662021550 100644 --- a/libkern/libkern/c++/OSMetaClass.h +++ b/libkern/libkern/c++/OSMetaClass.h @@ -51,12 +51,8 @@ class OSSerialize; */ -#if !defined(__ppc__) || __GNUC__ < 3 /*! @parseOnly */ #define APPLE_KEXT_COMPATIBILITY -#else -#define APPLE_KEXT_COMPATIBILITY __attribute__ ((apple_kext_compatibility)) -#endif /*! @parseOnly */ #define APPLE_KEXT_VTABLE_PADDING 1 @@ -846,6 +842,9 @@ _ptmf2ptf(const OSMetaClassBase *self, void (OSMetaClassBase::*func)(void)) class OSMetaClass : private OSMetaClassBase { friend class OSKext; +#if IOKITSTATS + friend class IOStatistics; +#endif private: // Can never be allocated must be created at compile time @@ -862,7 +861,7 @@ class OSMetaClass : private OSMetaClassBase /* className OSSymbol of the class' name. */ const OSSymbol *className; - /* classSize How big is a single instancde of this class. */ + /* classSize How big is a single instance of this class. */ unsigned int classSize; /* instanceCount Roughly number of instances of the object, diff --git a/libkern/libkern/c++/OSObject.h b/libkern/libkern/c++/OSObject.h index cfd75269c..b33ed3c47 100644 --- a/libkern/libkern/c++/OSObject.h +++ b/libkern/libkern/c++/OSObject.h @@ -164,6 +164,9 @@ class OSString; class OSObject : public OSMetaClassBase { OSDeclareAbstractStructors(OSObject) +#if IOKITSTATS + friend class IOStatistics; +#endif private: /* Not to be included in headerdoc. 
@@ -435,24 +438,6 @@ class OSObject : public OSMetaClassBase OSMetaClassDeclareReservedUnused(OSObject, 14); OSMetaClassDeclareReservedUnused(OSObject, 15); -#ifdef __ppc__ - OSMetaClassDeclareReservedUnused(OSObject, 16); - OSMetaClassDeclareReservedUnused(OSObject, 17); - OSMetaClassDeclareReservedUnused(OSObject, 18); - OSMetaClassDeclareReservedUnused(OSObject, 19); - OSMetaClassDeclareReservedUnused(OSObject, 20); - OSMetaClassDeclareReservedUnused(OSObject, 21); - OSMetaClassDeclareReservedUnused(OSObject, 22); - OSMetaClassDeclareReservedUnused(OSObject, 23); - OSMetaClassDeclareReservedUnused(OSObject, 24); - OSMetaClassDeclareReservedUnused(OSObject, 25); - OSMetaClassDeclareReservedUnused(OSObject, 26); - OSMetaClassDeclareReservedUnused(OSObject, 27); - OSMetaClassDeclareReservedUnused(OSObject, 28); - OSMetaClassDeclareReservedUnused(OSObject, 29); - OSMetaClassDeclareReservedUnused(OSObject, 30); - OSMetaClassDeclareReservedUnused(OSObject, 31); -#endif }; #endif /* !_LIBKERN_OSOBJECT_H */ diff --git a/libkern/libkern/c++/OSOrderedSet.h b/libkern/libkern/c++/OSOrderedSet.h index 64609d863..8819f9332 100644 --- a/libkern/libkern/c++/OSOrderedSet.h +++ b/libkern/libkern/c++/OSOrderedSet.h @@ -111,8 +111,8 @@ class OSOrderedSet : public OSCollection * @result * A comparison result of the object: *
<ul>
- * <li>a positive value if obj2 should precede obj1,</li>
- * <li>a negative value if obj1 should precede obj2,</li>
+ * <li>a negative value if obj2 should precede obj1,</li>
+ * <li>a positive value if obj1 should precede obj2,</li>
 * <li>and 0 if obj1 and obj2 have an equivalent ordering.</li>
 * </ul>
*/ diff --git a/libkern/libkern/c++/OSSet.h b/libkern/libkern/c++/OSSet.h index 65fd45d6e..0e82f7a87 100644 --- a/libkern/libkern/c++/OSSet.h +++ b/libkern/libkern/c++/OSSet.h @@ -537,23 +537,21 @@ class OSSet : public OSCollection * @param array The OSArray object containing the objects to be added. * * @result - * true if any object from array - * was successfully added the receiver, + * true if all objects from array + * are successfully added the receiver (or were already present), * false otherwise. * * @discussion * This functions adds to the receiving set * all objects from array - * that are not already in the set. - * Objects successfully added to the receiver are retained. + * that are not already in the receiving set. + * Objects added to the receiver are retained. * - * A false return value can mean either - * that all the objects in array are already present in the set, - * or that a memory allocation failure occurred. - * If you need to know whether the objects - * are already present, use - * @link containsObject containsObject@/link - * for each object. + * In releases prior to 10.7, this function would return false + * if an object from array was already present in the set, + * or if array was empty. + * This is no longer the case, so this function correctly returns true + * when the semantic of merging is met. */ virtual bool merge(const OSArray * array); @@ -568,22 +566,20 @@ class OSSet : public OSCollection * * @result * true if any object from set - * was successfully added the receiver, + * are successfully added the receiver (or were already present), * false otherwise. * * @discussion * This functions adds to the receiving set * all objects from set * that are not already in the receiving set. - * Objects successfully added to the receiver are retained. + * Objects added to the receiver are retained. 
* - * A false return value can mean either - * that all the objects in array are already present in the set, - * or that a memory allocation failure occurred. - * If you need to know whether the objects - * are already present, use - * @link containsObject containsObject@/link - * for each object. + * In releases prior to 10.7, this function would return false + * if an object from set was already present in the set, + * or if set was empty. + * This is no longer the case, so this function correctly returns true + * when the semantic of merging is met. */ virtual bool merge(const OSSet * set); diff --git a/libkern/libkern/crypto/Makefile b/libkern/libkern/crypto/Makefile index 5c8103efa..38aaa055e 100644 --- a/libkern/libkern/crypto/Makefile +++ b/libkern/libkern/crypto/Makefile @@ -8,14 +8,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = -INSTINC_SUBDIRS_PPC = - INSTINC_SUBDIRS_I386 = EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} -EXPINC_SUBDIRS_PPC = ${INSTINC_SUBDIRS_PPC} - EXPINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS_I386} DATAFILES = md5.h sha1.h diff --git a/libkern/libkern/crypto/sha1.h b/libkern/libkern/crypto/sha1.h index 9acc46b91..ad6b798f9 100644 --- a/libkern/libkern/crypto/sha1.h +++ b/libkern/libkern/crypto/sha1.h @@ -60,7 +60,9 @@ typedef struct sha1_ctxt { extern void SHA1Init(SHA1_CTX *); extern void SHA1Update(SHA1_CTX *, const void *, size_t); +#ifdef XNU_KERNEL_PRIVATE extern void SHA1UpdateUsePhysicalAddress(SHA1_CTX *, const void *, size_t); +#endif extern void SHA1Final(void *, SHA1_CTX *); #ifdef __cplusplus diff --git a/libkern/libkern/kernel_mach_header.h b/libkern/libkern/kernel_mach_header.h index 03e95969f..6588b9b09 100644 --- a/libkern/libkern/kernel_mach_header.h +++ b/libkern/libkern/kernel_mach_header.h @@ -57,8 +57,6 @@ typedef struct segment_command_64 kernel_segment_command_t; typedef struct section_64 kernel_section_t; #define LC_SEGMENT_KERNEL LC_SEGMENT_64 -#define SECT_CONSTRUCTOR "__mod_init_func" -#define SECT_DESTRUCTOR "__mod_term_func" 
#else @@ -72,6 +70,9 @@ typedef struct section kernel_section_t; #endif +#define SECT_MODINITFUNC "__mod_init_func" +#define SECT_MODTERMFUNC "__mod_term_func" + extern kernel_mach_header_t _mh_execute_header; vm_offset_t getlastaddr(void); @@ -94,6 +95,7 @@ kernel_section_t *getsectbynamefromheader( void *getsectdatafromheader(kernel_mach_header_t *, const char *, const char *, unsigned long *); kernel_section_t *firstsect(kernel_segment_command_t *sgp); kernel_section_t *nextsect(kernel_segment_command_t *sgp, kernel_section_t *sp); +void *getuuidfromheader(kernel_mach_header_t *, unsigned long *); #if MACH_KDB boolean_t getsymtab(kernel_mach_header_t *header, diff --git a/libkern/libkern/kext_request_keys.h b/libkern/libkern/kext_request_keys.h index aa5c1da4a..6b908f133 100644 --- a/libkern/libkern/kext_request_keys.h +++ b/libkern/libkern/kext_request_keys.h @@ -88,19 +88,15 @@ extern "C" { */ #define kKextRequestPredicateGetLoaded "Get Loaded Kext Info" -/* Predicate: Get Kernel Link State +/* Predicate: Get Kernel Image * Argument: None - * Response: Raw bytes + length containing the link state of the kernel. + * Response: Raw bytes + length containing the sanitized image of the kernel. * Op result: OSReturn indicating any errors in processing (see OSKextLib.h) * - * Retrieves the link state of the running kernel for use in generating + * Retrieves a sanitized image of the running kernel for use in generating * debug symbols in user space. - * - * xxx - Should this allow retrieval of any kext's link state (maybe for - * xxx - debugging)? Could rename "Get Kext Link State" and take a bundle ID - * xxx - or none for kernel, just like with "Get Kext UUID". 
*/ -#define kKextRequestPredicateGetKernelLinkState "Get Kernel Link State" +#define kKextRequestPredicateGetKernelImage "Get Kernel Image" /* Predicate: Get Kernel Load Address * Argument: None @@ -223,6 +219,26 @@ extern "C" { */ #define kKextRequestPredicateRequestLoad "Kext Load Request" +/* Predicate: Kext Load Notification + * Argument: kext identifier + * Response: None + * Op result: OSReturn indicating result (see OSKextLib.h) + * + * Informs kextd that the kernel has successfully loaded and started + * a kext. + */ +#define kKextRequestPredicateLoadNotification "Kext Load Notification" + +/* Predicate: Kext Unload Notification + * Argument: kext identifier + * Response: None + * Op result: OSReturn indicating result (see OSKextLib.h) + * + * Informs kextd that the kernel has successfully stopped and unloaded + * a kext. + */ +#define kKextRequestPredicateUnloadNotification "Kext Unload Notification" + /* Predicate: Prelinked Kernel Request * Argument: None * Response: None @@ -281,6 +297,14 @@ extern "C" { */ #define kKextRequestArgumentBundleIdentifierKey "CFBundleIdentifier" +/* Argument: OSReturn + * Type: Dictionary + * Used by: OSKext::copyInfo() + * + * Used to specify a subset of all possible info to be returned. + */ +#define kKextRequestArgumentInfoKeysKey "Kext Request Info Keys" + /* Argument: OSReturn * Type: Number (OSReturn) * Used by: several @@ -358,7 +382,7 @@ extern "C" { * either the primary kext, or the whole load list (any that weren't * already loaded & started). */ -#define kKextKextRequestArgumentStartExcludeKey "Start Exclude Level" +#define kKextRequestArgumentStartExcludeKey "Start Exclude Level" /* Argument: Start Matching Exclude Level * Type: Integer, corresponding to OSKextExcludeLevel diff --git a/libkern/libkern/kxld.h b/libkern/libkern/kxld.h index 6fa11e422..4fa1e9021 100644 --- a/libkern/libkern/kxld.h +++ b/libkern/libkern/kxld.h @@ -78,45 +78,28 @@ void kxld_destroy_context( * size The size of the kext in memory. 
Must be nonzero. * name The name, usually the bundle identifier, of the kext * callback_data Data that is to be passed to the callback functions. -* deps An array of pointers to the link state of kexts upon -* which this kext is dependent. -* ndeps Number of entries in the 'deps' array. -* linked_object If this is not null, it will be set to the address of -* the linked kext object. If the address provided by the -* kxld_alloc_callback is considered writable, this pointer -* will be set to that address. Otherwise, the linked -* object will be written to a temporary buffer that should -* be freed by the caller. +* dependencies An array of pointers to the kexts upon which this kext +* is dependent. +* num_dependencies Number of entries in the 'dependencies' array. +* linked_object This will be set to the address of the linked kext +* object. If the address provided by the +* kxld_alloc_callback is considered writable, this +* pointer will be set to that address. Otherwise, the +* linked object will be written to a temporary buffer +* that should be freed by the caller. * kmod_info_kern Kernel address of the kmod_info_t structure. -* link_state If this is not null, it will be set to the address of a -* block of memory that contains state generated by the -* linking process for use by links of dependent kexts. -* The link state object is serialized and can be written -* directly to disk. This memory should be freed by the -* caller when no longer needed. -* link_state_size The size of the returned link state buffer. -* symbol_file If this is not null, it will be set to the address of a -* buffer containing a Mach-O symbol file that may be -* written to disk. This should be freed by the caller -* when no longer needed. -* Note: symbol files are never generated in the kernel -* symbol_file_size The size of the returned symbol file buffer. 
-*******************************************************************************/ +******************************************************************************/ kern_return_t kxld_link_file( KXLDContext *context, u_char *file, u_long size, const char *name, void *callback_data, - u_char **deps, - u_int ndeps, + KXLDDependency *dependencies, + u_int num_dependencies, u_char **linked_object, - kxld_addr_t *kmod_info_kern, - u_char **link_state, - u_long *link_state_size, - u_char **symbol_file, - u_long *symbol_file_size) - __attribute__((nonnull(1, 2), visibility("default"))); + kxld_addr_t *kmod_info_kern) + __attribute__((nonnull(1,2,4,6,8,9), visibility("default"))); /******************************************************************************* *******************************************************************************/ diff --git a/libkern/libkern/kxld_types.h b/libkern/libkern/kxld_types.h index cd7153c8b..1578b5859 100644 --- a/libkern/libkern/kxld_types.h +++ b/libkern/libkern/kxld_types.h @@ -30,6 +30,7 @@ #include #include +#include // boolean_t #include /******************************************************************************* @@ -82,7 +83,7 @@ #endif /* For linking code specific to architectures that use MH_KEXT_BUNDLE */ -#if (!KERNEL || __x86_64__) +#if (!KERNEL || __i386__ || __x86_64__ || __arm__) #define KXLD_USER_OR_BUNDLE 1 #endif @@ -115,14 +116,14 @@ typedef uint64_t kxld_size_t; /* Flags for general linker behavior */ enum kxld_flags { - kKxldFlagDefault = 0x0 + kKxldFlagDefault = 0x0, }; typedef enum kxld_flags KXLDFlags; /* Flags for the allocation callback */ enum kxld_allocate_flags { kKxldAllocateDefault = 0x0, - kKxldAllocateWritable = 0x1 /* kxld may write into the allocated memory */ + kKxldAllocateWritable = 0x1, /* kxld may write into the allocated memory */ }; typedef enum kxld_allocate_flags KXLDAllocateFlags; @@ -149,6 +150,25 @@ typedef enum kxld_log_level { kKxldLogDebug = 0x5 } KXLDLogLevel; +/* This structure is 
used to describe a dependency kext. The kext field + * is a pointer to the binary executable of the dependency. The interface + * field is a pointer to an optional interface kext that restricts the + * symbols that may be accessed in the dependency kext. + * + * For example, to use this structure with the KPIs, set the kext field + * to point to the kernel's Mach-O binary, and set interface to point + * to the KPI's Mach-O binary. + */ +typedef struct kxld_dependency { + u_char * kext; + u_long kext_size; + char * kext_name; + u_char * interface; + u_long interface_size; + char * interface_name; + boolean_t is_direct_dependency; +} KXLDDependency; + typedef void (*KXLDLoggingCallback) (KXLDLogSubsystem sys, KXLDLogLevel level, const char *format, va_list ap, void *user_data); diff --git a/libkern/libkern/machine/Makefile b/libkern/libkern/machine/Makefile index 0a072f9f9..e4d4ce152 100644 --- a/libkern/libkern/machine/Makefile +++ b/libkern/libkern/machine/Makefile @@ -8,8 +8,6 @@ include $(MakeInc_def) INSTINC_SUBDIRS = -INSTINC_SUBDIRS_PPC = - INSTINC_SUBDIRS_I386 = INSTINC_SUBDIRS_X86_64 = @@ -18,8 +16,6 @@ INSTINC_SUBDIRS_ARM = EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} -EXPINC_SUBDIRS_PPC = ${INSTINC_SUBDIRS_PPC} - EXPINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS_I386} EXPINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS_X86_64} diff --git a/libkern/libkern/mkext.h b/libkern/libkern/mkext.h index 6b43a6b82..0024e84a0 100644 --- a/libkern/libkern/mkext.h +++ b/libkern/libkern/mkext.h @@ -149,6 +149,7 @@ typedef struct mkext_basic_header { #define kMKEXTInfoDictionariesKey "_MKEXTInfoDictionaries" #define kMKEXTBundlePathKey "_MKEXTBundlePath" +#define kMKEXTExecutableRelativePathKey "_MKEXTExecutableRelativePath" #define kMKEXTExecutableKey "_MKEXTExecutable" #define kMKEXTLoadRequestKey "_MKEXTLoadRequest" diff --git a/libkern/libkern/ppc/Makefile b/libkern/libkern/ppc/Makefile deleted file mode 100644 index e892ce42f..000000000 --- a/libkern/libkern/ppc/Makefile +++ /dev/null @@ -1,31 
+0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - -include $(MakeInc_cmd) -include $(MakeInc_def) - -INSTINC_SUBDIRS = - -INSTINC_SUBDIRS_PPC = - -EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} - -EXPINC_SUBDIRS_PPC = ${INSTINC_SUBDIRS_PPC} - -DATAFILES = \ - OSByteOrder.h - -INSTALL_MD_LIST = ${DATAFILES} - -INSTALL_MD_DIR = libkern/ppc - -EXPORT_MD_LIST = ${DATAFILES} - -EXPORT_MD_DIR = libkern/ppc - -include $(MakeInc_rule) -include $(MakeInc_dir) - - diff --git a/libkern/libkern/ppc/OSByteOrder.h b/libkern/libkern/ppc/OSByteOrder.h deleted file mode 100644 index c6666859d..000000000 --- a/libkern/libkern/ppc/OSByteOrder.h +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef _OS_OSBYTEORDERPPC_H -#define _OS_OSBYTEORDERPPC_H - -#include - -#if !defined(OS_INLINE) -# if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L -# define OS_INLINE static inline -# elif defined(__MWERKS__) || defined(__cplusplus) -# define OS_INLINE static inline -# else -# define OS_INLINE static __inline__ -# endif -#endif - -/* Functions for byte reversed loads. */ - -OS_INLINE -uint16_t -OSReadSwapInt16( - const volatile void * base, - uintptr_t byteOffset -) -{ - uint16_t result; - volatile uint16_t *addr = (volatile uint16_t *)((uintptr_t)base + byteOffset); - -#if defined(__llvm__) - result = *addr; - result = ((result << 8) | (result >> 8)); -#else - __asm__ ("lhbrx %0, %2, %1" - : "=r" (result) - : "r" (base), "bO" (byteOffset), "m" (*addr)); -#endif - - return result; -} - -OS_INLINE -uint32_t -OSReadSwapInt32( - const volatile void * base, - uintptr_t byteOffset -) -{ - uint32_t result; - volatile uint32_t *addr = (volatile uint32_t *)((uintptr_t)base + byteOffset); - -#if defined(__llvm__) - result = __builtin_bswap32(*addr); -#else - __asm__ ("lwbrx %0, %2, %1" - : "=r" (result) - : "r" (base), "bO" (byteOffset), "m" (*addr)); -#endif - - return result; -} - -OS_INLINE -uint64_t -OSReadSwapInt64( - const volatile void * base, - uintptr_t byteOffset -) -{ - volatile uint64_t *addr = (volatile uint64_t *)((uintptr_t)base + byteOffset); - union { - uint64_t u64; - uint32_t u32[2]; - } u; - -#if defined(__llvm__) - 
u.u64 = __builtin_bswap64(*addr); -#else - __asm__ ("lwbrx %0, %3, %2\n\t" - "lwbrx %1, %4, %2" - : "=&r" (u.u32[1]), "=r" (u.u32[0]) - : "r" (base), "bO" (byteOffset), "b" (byteOffset + 4), "m" (*addr)); -#endif - - return u.u64; -} - -/* Functions for byte reversed stores. */ - -OS_INLINE -void -OSWriteSwapInt16( - volatile void * base, - uintptr_t byteOffset, - uint16_t data -) -{ - volatile uint16_t *addr = (volatile uint16_t *)((uintptr_t)base + byteOffset); - -#if defined(__llvm__) - *addr = ((data >> 8) | (data << 8)); -#else - __asm__ ("sthbrx %1, %3, %2" - : "=m" (*addr) - : "r" (data), "r" (base), "bO" (byteOffset)); -#endif -} - -OS_INLINE -void -OSWriteSwapInt32( - volatile void * base, - uintptr_t byteOffset, - uint32_t data -) -{ - volatile uint32_t *addr = (volatile uint32_t *)((uintptr_t)base + byteOffset); - -#if defined(__llvm__) - *addr = __builtin_bswap32(data); -#else - __asm__ ("stwbrx %1, %3, %2" - : "=m" (*addr) - : "r" (data), "r" (base), "bO" (byteOffset)); -#endif -} - -OS_INLINE -void -OSWriteSwapInt64( - volatile void * base, - uintptr_t byteOffset, - uint64_t data -) -{ - volatile uint64_t *addr = (volatile uint64_t *)((uintptr_t)base + byteOffset); - -#if defined(__llvm__) - *addr = __builtin_bswap64(data); -#else - uint32_t hi = (uint32_t)(data >> 32); - uint32_t lo = (uint32_t)(data & 0xffffffff); - - __asm__ ("stwbrx %1, %4, %3\n\t" - "stwbrx %2, %5, %3" - : "=m" (*addr) - : "r" (lo), "r" (hi), "r" (base), "bO" (byteOffset), "b" (byteOffset + 4)); -#endif -} - -/* Generic byte swapping functions. */ - -OS_INLINE -uint16_t -_OSSwapInt16( - uint16_t data -) -{ - return OSReadSwapInt16(&data, 0); -} - -OS_INLINE -uint32_t -_OSSwapInt32( - uint32_t data -) -{ - return OSReadSwapInt32(&data, 0); -} - -OS_INLINE -uint64_t -_OSSwapInt64( - uint64_t data -) -{ - return OSReadSwapInt64(&data, 0); -} - -#endif /* ! 
_OS_OSBYTEORDERPPC_H */ diff --git a/libkern/libkern/prelink.h b/libkern/libkern/prelink.h index e8f37e1f0..59aefd3a2 100644 --- a/libkern/libkern/prelink.h +++ b/libkern/libkern/prelink.h @@ -40,6 +40,7 @@ #define kPrelinkInfoSection "__info" #define kPrelinkBundlePathKey "_PrelinkBundlePath" +#define kPrelinkExecutableRelativePathKey "_PrelinkExecutableRelativePath" #define kPrelinkExecutableLoadKey "_PrelinkExecutableLoadAddr" #define kPrelinkExecutableSourceKey "_PrelinkExecutableSourceAddr" #define kPrelinkExecutableSizeKey "_PrelinkExecutableSize" @@ -48,7 +49,6 @@ #define kPrelinkKmodInfoKey "_PrelinkKmodInfo" #define kPrelinkLinkStateKey "_PrelinkLinkState" #define kPrelinkLinkStateSizeKey "_PrelinkLinkStateSize" -#define kPrelinkPersonalitiesKey "_PrelinkPersonalities" #endif /* _PRELINK_H_ */ diff --git a/libkern/libkern/tree.h b/libkern/libkern/tree.h new file mode 100644 index 000000000..3a26162bd --- /dev/null +++ b/libkern/libkern/tree.h @@ -0,0 +1,802 @@ +/* + * Copyright (c) 2009-2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* $NetBSD: tree.h,v 1.13 2006/08/27 22:32:38 christos Exp $ */ +/* $OpenBSD: tree.h,v 1.7 2002/10/17 21:51:54 art Exp $ */ +/* + * Copyright 2002 Niels Provos + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _LIBKERN_TREE_H_ +#define _LIBKERN_TREE_H_ + +/* + * This file defines data structures for different types of trees: + * splay trees and red-black trees. + * + * A splay tree is a self-organizing data structure. Every operation + * on the tree causes a splay to happen. The splay moves the requested + * node to the root of the tree and partly rebalances it. + * + * This has the benefit that request locality causes faster lookups as + * the requested nodes move to the top of the tree. On the other hand, + * every lookup causes memory writes. + * + * The Balance Theorem bounds the total access time for m operations + * and n inserts on an initially empty tree as O((m + n)lg n). The + * amortized cost of each of m accesses to a splay tree is O(lg n). + * + * A red-black tree is a binary search tree with the node color as an + * extra attribute. It fulfills a set of conditions: + * - every search path from the root to a leaf consists of the + * same number of black nodes, + * - each red node (except for the root) has a black parent, + * - each leaf node is black. + * + * Every operation on a red-black tree is bounded as O(lg n). + * The maximum height of a red-black tree is 2lg (n+1).
+ */ + +#define SPLAY_HEAD(name, type) \ +struct name { \ + struct type *sph_root; /* root of the tree */ \ +} + +#define SPLAY_INITIALIZER(root) \ + { NULL } + +#define SPLAY_INIT(root) do { \ + (root)->sph_root = NULL; \ +} while (/*CONSTCOND*/ 0) + +#define SPLAY_ENTRY(type) \ +struct { \ + struct type *spe_left; /* left element */ \ + struct type *spe_right; /* right element */ \ +} + +#define SPLAY_LEFT(elm, field) (elm)->field.spe_left +#define SPLAY_RIGHT(elm, field) (elm)->field.spe_right +#define SPLAY_ROOT(head) (head)->sph_root +#define SPLAY_EMPTY(head) (SPLAY_ROOT(head) == NULL) + +/* SPLAY_ROTATE_{LEFT,RIGHT} expect tmp to hold SPLAY_{RIGHT,LEFT} of the root */ +#define SPLAY_ROTATE_RIGHT(head, tmp, field) do { \ + SPLAY_LEFT((head)->sph_root, field) = SPLAY_RIGHT(tmp, field); \ + SPLAY_RIGHT(tmp, field) = (head)->sph_root; \ + (head)->sph_root = tmp; \ +} while (/*CONSTCOND*/ 0) + +#define SPLAY_ROTATE_LEFT(head, tmp, field) do { \ + SPLAY_RIGHT((head)->sph_root, field) = SPLAY_LEFT(tmp, field); \ + SPLAY_LEFT(tmp, field) = (head)->sph_root; \ + (head)->sph_root = tmp; \ +} while (/*CONSTCOND*/ 0) + +#define SPLAY_LINKLEFT(head, tmp, field) do { \ + SPLAY_LEFT(tmp, field) = (head)->sph_root; \ + tmp = (head)->sph_root; \ + (head)->sph_root = SPLAY_LEFT((head)->sph_root, field); \ +} while (/*CONSTCOND*/ 0) + +#define SPLAY_LINKRIGHT(head, tmp, field) do { \ + SPLAY_RIGHT(tmp, field) = (head)->sph_root; \ + tmp = (head)->sph_root; \ + (head)->sph_root = SPLAY_RIGHT((head)->sph_root, field); \ +} while (/*CONSTCOND*/ 0) + +#define SPLAY_ASSEMBLE(head, node, left, right, field) do { \ + SPLAY_RIGHT(left, field) = SPLAY_LEFT((head)->sph_root, field); \ + SPLAY_LEFT(right, field) = SPLAY_RIGHT((head)->sph_root, field);\ + SPLAY_LEFT((head)->sph_root, field) = SPLAY_RIGHT(node, field); \ + SPLAY_RIGHT((head)->sph_root, field) = SPLAY_LEFT(node, field); \ +} while (/*CONSTCOND*/ 0) + +/* Generates prototypes and inline functions */ + +#define SPLAY_PROTOTYPE(name, 
type, field, cmp) \ +void name##_SPLAY(struct name *, struct type *); \ +void name##_SPLAY_MINMAX(struct name *, int); \ +struct type *name##_SPLAY_INSERT(struct name *, struct type *); \ +struct type *name##_SPLAY_REMOVE(struct name *, struct type *); \ + \ +/* Finds the node with the same key as elm */ \ +static __inline struct type * \ +name##_SPLAY_FIND(struct name *head, struct type *elm) \ +{ \ + if (SPLAY_EMPTY(head)) \ + return(NULL); \ + name##_SPLAY(head, elm); \ + if ((cmp)(elm, (head)->sph_root) == 0) \ + return (head->sph_root); \ + return (NULL); \ +} \ + \ +static __inline struct type * \ +name##_SPLAY_NEXT(struct name *head, struct type *elm) \ +{ \ + name##_SPLAY(head, elm); \ + if (SPLAY_RIGHT(elm, field) != NULL) { \ + elm = SPLAY_RIGHT(elm, field); \ + while (SPLAY_LEFT(elm, field) != NULL) { \ + elm = SPLAY_LEFT(elm, field); \ + } \ + } else \ + elm = NULL; \ + return (elm); \ +} \ + \ +static __inline struct type * \ +name##_SPLAY_MIN_MAX(struct name *head, int val) \ +{ \ + name##_SPLAY_MINMAX(head, val); \ + return (SPLAY_ROOT(head)); \ +} + +/* Main splay operation. 
+ * Moves node close to the key of elm to top + */ +#define SPLAY_GENERATE(name, type, field, cmp) \ +struct type * \ +name##_SPLAY_INSERT(struct name *head, struct type *elm) \ +{ \ + if (SPLAY_EMPTY(head)) { \ + SPLAY_LEFT(elm, field) = SPLAY_RIGHT(elm, field) = NULL; \ + } else { \ + int __comp; \ + name##_SPLAY(head, elm); \ + __comp = (cmp)(elm, (head)->sph_root); \ + if(__comp < 0) { \ + SPLAY_LEFT(elm, field) = SPLAY_LEFT((head)->sph_root, field);\ + SPLAY_RIGHT(elm, field) = (head)->sph_root; \ + SPLAY_LEFT((head)->sph_root, field) = NULL; \ + } else if (__comp > 0) { \ + SPLAY_RIGHT(elm, field) = SPLAY_RIGHT((head)->sph_root, field);\ + SPLAY_LEFT(elm, field) = (head)->sph_root; \ + SPLAY_RIGHT((head)->sph_root, field) = NULL; \ + } else \ + return ((head)->sph_root); \ + } \ + (head)->sph_root = (elm); \ + return (NULL); \ +} \ + \ +struct type * \ +name##_SPLAY_REMOVE(struct name *head, struct type *elm) \ +{ \ + struct type *__tmp; \ + if (SPLAY_EMPTY(head)) \ + return (NULL); \ + name##_SPLAY(head, elm); \ + if ((cmp)(elm, (head)->sph_root) == 0) { \ + if (SPLAY_LEFT((head)->sph_root, field) == NULL) { \ + (head)->sph_root = SPLAY_RIGHT((head)->sph_root, field);\ + } else { \ + __tmp = SPLAY_RIGHT((head)->sph_root, field); \ + (head)->sph_root = SPLAY_LEFT((head)->sph_root, field);\ + name##_SPLAY(head, elm); \ + SPLAY_RIGHT((head)->sph_root, field) = __tmp; \ + } \ + return (elm); \ + } \ + return (NULL); \ +} \ + \ +void \ +name##_SPLAY(struct name *head, struct type *elm) \ +{ \ + struct type __node, *__left, *__right, *__tmp; \ + int __comp; \ +\ + SPLAY_LEFT(&__node, field) = SPLAY_RIGHT(&__node, field) = NULL;\ + __left = __right = &__node; \ +\ + while ((__comp = (cmp)(elm, (head)->sph_root)) != 0) { \ + if (__comp < 0) { \ + __tmp = SPLAY_LEFT((head)->sph_root, field); \ + if (__tmp == NULL) \ + break; \ + if ((cmp)(elm, __tmp) < 0){ \ + SPLAY_ROTATE_RIGHT(head, __tmp, field); \ + if (SPLAY_LEFT((head)->sph_root, field) == NULL)\ + break; \ + } 
\ + SPLAY_LINKLEFT(head, __right, field); \ + } else if (__comp > 0) { \ + __tmp = SPLAY_RIGHT((head)->sph_root, field); \ + if (__tmp == NULL) \ + break; \ + if ((cmp)(elm, __tmp) > 0){ \ + SPLAY_ROTATE_LEFT(head, __tmp, field); \ + if (SPLAY_RIGHT((head)->sph_root, field) == NULL)\ + break; \ + } \ + SPLAY_LINKRIGHT(head, __left, field); \ + } \ + } \ + SPLAY_ASSEMBLE(head, &__node, __left, __right, field); \ +} \ + \ +/* Splay with either the minimum or the maximum element \ + * Used to find minimum or maximum element in tree. \ + */ \ +void name##_SPLAY_MINMAX(struct name *head, int __comp) \ +{ \ + struct type __node, *__left, *__right, *__tmp; \ +\ + SPLAY_LEFT(&__node, field) = SPLAY_RIGHT(&__node, field) = NULL;\ + __left = __right = &__node; \ +\ + while (1) { \ + if (__comp < 0) { \ + __tmp = SPLAY_LEFT((head)->sph_root, field); \ + if (__tmp == NULL) \ + break; \ + if (__comp < 0){ \ + SPLAY_ROTATE_RIGHT(head, __tmp, field); \ + if (SPLAY_LEFT((head)->sph_root, field) == NULL)\ + break; \ + } \ + SPLAY_LINKLEFT(head, __right, field); \ + } else if (__comp > 0) { \ + __tmp = SPLAY_RIGHT((head)->sph_root, field); \ + if (__tmp == NULL) \ + break; \ + if (__comp > 0) { \ + SPLAY_ROTATE_LEFT(head, __tmp, field); \ + if (SPLAY_RIGHT((head)->sph_root, field) == NULL)\ + break; \ + } \ + SPLAY_LINKRIGHT(head, __left, field); \ + } \ + } \ + SPLAY_ASSEMBLE(head, &__node, __left, __right, field); \ +} + +#define SPLAY_NEGINF -1 +#define SPLAY_INF 1 + +#define SPLAY_INSERT(name, x, y) name##_SPLAY_INSERT(x, y) +#define SPLAY_REMOVE(name, x, y) name##_SPLAY_REMOVE(x, y) +#define SPLAY_FIND(name, x, y) name##_SPLAY_FIND(x, y) +#define SPLAY_NEXT(name, x, y) name##_SPLAY_NEXT(x, y) +#define SPLAY_MIN(name, x) (SPLAY_EMPTY(x) ? NULL \ + : name##_SPLAY_MIN_MAX(x, SPLAY_NEGINF)) +#define SPLAY_MAX(name, x) (SPLAY_EMPTY(x) ? 
NULL \ + : name##_SPLAY_MIN_MAX(x, SPLAY_INF)) + +#define SPLAY_FOREACH(x, name, head) \ + for ((x) = SPLAY_MIN(name, head); \ + (x) != NULL; \ + (x) = SPLAY_NEXT(name, head, x)) + +/* Macros that define a red-black tree */ +#define RB_HEAD(name, type) \ +struct name { \ + struct type *rbh_root; /* root of the tree */ \ +} + +#define RB_INITIALIZER(root) \ + { NULL } + +#define RB_INIT(root) do { \ + (root)->rbh_root = NULL; \ +} while (/*CONSTCOND*/ 0) + +#define RB_BLACK 0 +#define RB_RED 1 +#define RB_PLACEHOLDER NULL +#define RB_ENTRY(type) \ +struct { \ + struct type *rbe_left; /* left element */ \ + struct type *rbe_right; /* right element */ \ + struct type *rbe_parent; /* parent element */ \ +} + +#define RB_COLOR_MASK (uintptr_t)0x1 +#define RB_LEFT(elm, field) (elm)->field.rbe_left +#define RB_RIGHT(elm, field) (elm)->field.rbe_right +#define _RB_PARENT(elm, field) (elm)->field.rbe_parent +#define RB_ROOT(head) (head)->rbh_root +#define RB_EMPTY(head) (RB_ROOT(head) == NULL) + +#define RB_SET(name, elm, parent, field) do { \ + name##_RB_SETPARENT(elm, parent); \ + RB_LEFT(elm, field) = RB_RIGHT(elm, field) = NULL; \ + name##_RB_SETCOLOR(elm, RB_RED); \ +} while (/*CONSTCOND*/ 0) + +#define RB_SET_BLACKRED(name, black, red, field) do { \ + name##_RB_SETCOLOR(black, RB_BLACK); \ + name##_RB_SETCOLOR(red, RB_RED); \ +} while (/*CONSTCOND*/ 0) + +#ifndef RB_AUGMENT +#define RB_AUGMENT(x) (void)(x) +#endif + +#define RB_ROTATE_LEFT(name, head, elm, tmp, field) do { \ + (tmp) = RB_RIGHT(elm, field); \ + if ((RB_RIGHT(elm, field) = RB_LEFT(tmp, field)) != NULL) { \ + name##_RB_SETPARENT(RB_LEFT(tmp, field),(elm)); \ + } \ + RB_AUGMENT(elm); \ + if (name##_RB_SETPARENT(tmp, name##_RB_GETPARENT(elm)) != NULL) { \ + if ((elm) == RB_LEFT(name##_RB_GETPARENT(elm), field)) \ + RB_LEFT(name##_RB_GETPARENT(elm), field) = (tmp); \ + else \ + RB_RIGHT(name##_RB_GETPARENT(elm), field) = (tmp); \ + } else \ + (head)->rbh_root = (tmp); \ + RB_LEFT(tmp, field) = (elm); \ + 
name##_RB_SETPARENT(elm, (tmp)); \ + RB_AUGMENT(tmp); \ + if ((name##_RB_GETPARENT(tmp))) \ + RB_AUGMENT(name##_RB_GETPARENT(tmp)); \ +} while (/*CONSTCOND*/ 0) + +#define RB_ROTATE_RIGHT(name, head, elm, tmp, field) do { \ + (tmp) = RB_LEFT(elm, field); \ + if ((RB_LEFT(elm, field) = RB_RIGHT(tmp, field)) != NULL) { \ + name##_RB_SETPARENT(RB_RIGHT(tmp, field), (elm)); \ + } \ + RB_AUGMENT(elm); \ + if (name##_RB_SETPARENT(tmp, name##_RB_GETPARENT(elm)) != NULL) { \ + if ((elm) == RB_LEFT(name##_RB_GETPARENT(elm), field)) \ + RB_LEFT(name##_RB_GETPARENT(elm), field) = (tmp); \ + else \ + RB_RIGHT(name##_RB_GETPARENT(elm), field) = (tmp); \ + } else \ + (head)->rbh_root = (tmp); \ + RB_RIGHT(tmp, field) = (elm); \ + name##_RB_SETPARENT(elm, tmp); \ + RB_AUGMENT(tmp); \ + if ((name##_RB_GETPARENT(tmp))) \ + RB_AUGMENT(name##_RB_GETPARENT(tmp)); \ +} while (/*CONSTCOND*/ 0) + +/* Generates prototypes and inline functions */ +#define RB_PROTOTYPE(name, type, field, cmp) \ +void name##_RB_INSERT_COLOR(struct name *, struct type *); \ +void name##_RB_REMOVE_COLOR(struct name *, struct type *, struct type *);\ +struct type *name##_RB_REMOVE(struct name *, struct type *); \ +struct type *name##_RB_INSERT(struct name *, struct type *); \ +struct type *name##_RB_FIND(struct name *, struct type *); \ +struct type *name##_RB_NEXT(struct type *); \ +struct type *name##_RB_MINMAX(struct name *, int); \ +struct type *name##_RB_GETPARENT(struct type*); \ +struct type *name##_RB_SETPARENT(struct type*, struct type*); \ +int name##_RB_GETCOLOR(struct type*); \ +void name##_RB_SETCOLOR(struct type*,int); + +/* Generates prototypes (with storage class) and inline functions */ +#define RB_PROTOTYPE_SC(_sc_, name, type, field, cmp) \ +_sc_ void name##_RB_INSERT_COLOR(struct name *, struct type *); \ +_sc_ void name##_RB_REMOVE_COLOR(struct name *, struct type *, struct type *); \ +_sc_ struct type *name##_RB_REMOVE(struct name *, struct type *); \ +_sc_ struct type 
*name##_RB_INSERT(struct name *, struct type *); \ +_sc_ struct type *name##_RB_FIND(struct name *, struct type *); \ +_sc_ struct type *name##_RB_NEXT(struct type *); \ +_sc_ struct type *name##_RB_MINMAX(struct name *, int); \ +_sc_ struct type *name##_RB_GETPARENT(struct type*); \ +_sc_ struct type *name##_RB_SETPARENT(struct type*, struct type*); \ +_sc_ int name##_RB_GETCOLOR(struct type*); \ +_sc_ void name##_RB_SETCOLOR(struct type*,int); + + +/* Main rb operations. + * Generates the definitions of the functions declared by RB_PROTOTYPE + */ +#define RB_GENERATE(name, type, field, cmp) \ +struct type *name##_RB_GETPARENT(struct type *elm) { \ + struct type *parent = _RB_PARENT(elm, field); \ + if( parent != NULL) { \ + parent = (struct type*)((uintptr_t)parent & ~RB_COLOR_MASK);\ + return( (struct type*) ( (parent == (struct type*) RB_PLACEHOLDER) ? NULL: parent));\ + } \ + return((struct type*)NULL); \ +} \ +int name##_RB_GETCOLOR(struct type *elm) { \ + int color = 0; \ + color = (int)((uintptr_t)_RB_PARENT(elm,field) & RB_COLOR_MASK);\ + return(color); \ +} \ +void name##_RB_SETCOLOR(struct type *elm,int color) { \ + struct type *parent = name##_RB_GETPARENT(elm); \ + if(parent == (struct type*)NULL) \ + parent = (struct type*) RB_PLACEHOLDER; \ + _RB_PARENT(elm, field) = (struct type*)((uintptr_t)parent | (unsigned int)color);\ +} \ +struct type *name##_RB_SETPARENT(struct type *elm, struct type *parent) { \ + int color = name##_RB_GETCOLOR(elm); \ + _RB_PARENT(elm, field) = parent; \ + if(color) name##_RB_SETCOLOR(elm, color); \ + return(name##_RB_GETPARENT(elm)); \ +} \ + \ +void \ +name##_RB_INSERT_COLOR(struct name *head, struct type *elm) \ +{ \ + struct type *parent, *gparent, *tmp; \ + while ((parent = name##_RB_GETPARENT(elm)) != NULL && \ + name##_RB_GETCOLOR(parent) == RB_RED) { \ + gparent = name##_RB_GETPARENT(parent); \ + if (parent == RB_LEFT(gparent, field)) { \ + tmp = RB_RIGHT(gparent, field); \ + if (tmp && name##_RB_GETCOLOR(tmp) == RB_RED) { \ + 
name##_RB_SETCOLOR(tmp, RB_BLACK); \ + RB_SET_BLACKRED(name, parent, gparent, field);\ + elm = gparent; \ + continue; \ + } \ + if (RB_RIGHT(parent, field) == elm) { \ + RB_ROTATE_LEFT(name, head, parent, tmp, field);\ + tmp = parent; \ + parent = elm; \ + elm = tmp; \ + } \ + RB_SET_BLACKRED(name, parent, gparent, field); \ + RB_ROTATE_RIGHT(name,head, gparent, tmp, field); \ + } else { \ + tmp = RB_LEFT(gparent, field); \ + if (tmp && name##_RB_GETCOLOR(tmp) == RB_RED) { \ + name##_RB_SETCOLOR(tmp, RB_BLACK); \ + RB_SET_BLACKRED(name, parent, gparent, field);\ + elm = gparent; \ + continue; \ + } \ + if (RB_LEFT(parent, field) == elm) { \ + RB_ROTATE_RIGHT(name, head, parent, tmp, field);\ + tmp = parent; \ + parent = elm; \ + elm = tmp; \ + } \ + RB_SET_BLACKRED(name, parent, gparent, field); \ + RB_ROTATE_LEFT(name, head, gparent, tmp, field); \ + } \ + } \ + name##_RB_SETCOLOR(head->rbh_root, RB_BLACK); \ +} \ + \ +void \ +name##_RB_REMOVE_COLOR(struct name *head, struct type *parent, struct type *elm) \ +{ \ + struct type *tmp; \ + while ((elm == NULL || name##_RB_GETCOLOR(elm) == RB_BLACK) && \ + elm != RB_ROOT(head)) { \ + if (RB_LEFT(parent, field) == elm) { \ + tmp = RB_RIGHT(parent, field); \ + if (name##_RB_GETCOLOR(tmp) == RB_RED) { \ + RB_SET_BLACKRED(name, tmp, parent, field); \ + RB_ROTATE_LEFT(name, head, parent, tmp, field);\ + tmp = RB_RIGHT(parent, field); \ + } \ + if ((RB_LEFT(tmp, field) == NULL || \ + name##_RB_GETCOLOR(RB_LEFT(tmp, field)) == RB_BLACK) &&\ + (RB_RIGHT(tmp, field) == NULL || \ + name##_RB_GETCOLOR(RB_RIGHT(tmp, field)) == RB_BLACK)) {\ + name##_RB_SETCOLOR(tmp, RB_RED); \ + elm = parent; \ + parent = name##_RB_GETPARENT(elm); \ + } else { \ + if (RB_RIGHT(tmp, field) == NULL || \ + name##_RB_GETCOLOR(RB_RIGHT(tmp, field)) == RB_BLACK) {\ + struct type *oleft; \ + if ((oleft = RB_LEFT(tmp, field)) \ + != NULL) \ + name##_RB_SETCOLOR(oleft, RB_BLACK);\ + name##_RB_SETCOLOR(tmp, RB_RED); \ + RB_ROTATE_RIGHT(name, head, tmp, 
oleft, field);\ + tmp = RB_RIGHT(parent, field); \ + } \ + name##_RB_SETCOLOR(tmp, (name##_RB_GETCOLOR(parent)));\ + name##_RB_SETCOLOR(parent, RB_BLACK); \ + if (RB_RIGHT(tmp, field)) \ + name##_RB_SETCOLOR(RB_RIGHT(tmp, field),RB_BLACK);\ + RB_ROTATE_LEFT(name, head, parent, tmp, field);\ + elm = RB_ROOT(head); \ + break; \ + } \ + } else { \ + tmp = RB_LEFT(parent, field); \ + if (name##_RB_GETCOLOR(tmp) == RB_RED) { \ + RB_SET_BLACKRED(name, tmp, parent, field); \ + RB_ROTATE_RIGHT(name, head, parent, tmp, field);\ + tmp = RB_LEFT(parent, field); \ + } \ + if ((RB_LEFT(tmp, field) == NULL || \ + name##_RB_GETCOLOR(RB_LEFT(tmp, field)) == RB_BLACK) &&\ + (RB_RIGHT(tmp, field) == NULL || \ + name##_RB_GETCOLOR(RB_RIGHT(tmp, field)) == RB_BLACK)) {\ + name##_RB_SETCOLOR(tmp, RB_RED); \ + elm = parent; \ + parent = name##_RB_GETPARENT(elm); \ + } else { \ + if (RB_LEFT(tmp, field) == NULL || \ + name##_RB_GETCOLOR(RB_LEFT(tmp, field)) == RB_BLACK) {\ + struct type *oright; \ + if ((oright = RB_RIGHT(tmp, field)) \ + != NULL) \ + name##_RB_SETCOLOR(oright, RB_BLACK);\ + name##_RB_SETCOLOR(tmp, RB_RED); \ + RB_ROTATE_LEFT(name, head, tmp, oright, field);\ + tmp = RB_LEFT(parent, field); \ + } \ + name##_RB_SETCOLOR(tmp,(name##_RB_GETCOLOR(parent)));\ + name##_RB_SETCOLOR(parent, RB_BLACK); \ + if (RB_LEFT(tmp, field)) \ + name##_RB_SETCOLOR(RB_LEFT(tmp, field), RB_BLACK);\ + RB_ROTATE_RIGHT(name, head, parent, tmp, field);\ + elm = RB_ROOT(head); \ + break; \ + } \ + } \ + } \ + if (elm) \ + name##_RB_SETCOLOR(elm, RB_BLACK); \ +} \ + \ +struct type * \ +name##_RB_REMOVE(struct name *head, struct type *elm) \ +{ \ + struct type *child, *parent, *old = elm; \ + int color; \ + if (RB_LEFT(elm, field) == NULL) \ + child = RB_RIGHT(elm, field); \ + else if (RB_RIGHT(elm, field) == NULL) \ + child = RB_LEFT(elm, field); \ + else { \ + struct type *left; \ + elm = RB_RIGHT(elm, field); \ + while ((left = RB_LEFT(elm, field)) != NULL) \ + elm = left; \ + child = 
RB_RIGHT(elm, field); \ + parent = name##_RB_GETPARENT(elm); \ + color = name##_RB_GETCOLOR(elm); \ + if (child) \ + name##_RB_SETPARENT(child, parent); \ + if (parent) { \ + if (RB_LEFT(parent, field) == elm) \ + RB_LEFT(parent, field) = child; \ + else \ + RB_RIGHT(parent, field) = child; \ + RB_AUGMENT(parent); \ + } else \ + RB_ROOT(head) = child; \ + if (name##_RB_GETPARENT(elm) == old) \ + parent = elm; \ + (elm)->field = (old)->field; \ + if (name##_RB_GETPARENT(old)) { \ + if (RB_LEFT(name##_RB_GETPARENT(old), field) == old)\ + RB_LEFT(name##_RB_GETPARENT(old), field) = elm;\ + else \ + RB_RIGHT(name##_RB_GETPARENT(old), field) = elm;\ + RB_AUGMENT(name##_RB_GETPARENT(old)); \ + } else \ + RB_ROOT(head) = elm; \ + name##_RB_SETPARENT(RB_LEFT(old, field), elm); \ + if (RB_RIGHT(old, field)) \ + name##_RB_SETPARENT(RB_RIGHT(old, field), elm); \ + if (parent) { \ + left = parent; \ + do { \ + RB_AUGMENT(left); \ + } while ((left = name##_RB_GETPARENT(left)) != NULL); \ + } \ + goto color; \ + } \ + parent = name##_RB_GETPARENT(elm); \ + color = name##_RB_GETCOLOR(elm); \ + if (child) \ + name##_RB_SETPARENT(child, parent); \ + if (parent) { \ + if (RB_LEFT(parent, field) == elm) \ + RB_LEFT(parent, field) = child; \ + else \ + RB_RIGHT(parent, field) = child; \ + RB_AUGMENT(parent); \ + } else \ + RB_ROOT(head) = child; \ +color: \ + if (color == RB_BLACK) \ + name##_RB_REMOVE_COLOR(head, parent, child); \ + return (old); \ +} \ + \ +/* Inserts a node into the RB tree */ \ +struct type * \ +name##_RB_INSERT(struct name *head, struct type *elm) \ +{ \ + struct type *tmp; \ + struct type *parent = NULL; \ + int comp = 0; \ + tmp = RB_ROOT(head); \ + while (tmp) { \ + parent = tmp; \ + comp = (cmp)(elm, parent); \ + if (comp < 0) \ + tmp = RB_LEFT(tmp, field); \ + else if (comp > 0) \ + tmp = RB_RIGHT(tmp, field); \ + else \ + return (tmp); \ + } \ + RB_SET(name, elm, parent, field); \ + if (parent != NULL) { \ + if (comp < 0) \ + RB_LEFT(parent, field) = elm; \ 
+ else \ + RB_RIGHT(parent, field) = elm; \ + RB_AUGMENT(parent); \ + } else \ + RB_ROOT(head) = elm; \ + name##_RB_INSERT_COLOR(head, elm); \ + return (NULL); \ +} \ + \ +/* Finds the node with the same key as elm */ \ +struct type * \ +name##_RB_FIND(struct name *head, struct type *elm) \ +{ \ + struct type *tmp = RB_ROOT(head); \ + int comp; \ + while (tmp) { \ + comp = cmp(elm, tmp); \ + if (comp < 0) \ + tmp = RB_LEFT(tmp, field); \ + else if (comp > 0) \ + tmp = RB_RIGHT(tmp, field); \ + else \ + return (tmp); \ + } \ + return (NULL); \ +} \ + \ +/* ARGSUSED */ \ +struct type * \ +name##_RB_NEXT(struct type *elm) \ +{ \ + if (RB_RIGHT(elm, field)) { \ + elm = RB_RIGHT(elm, field); \ + while (RB_LEFT(elm, field)) \ + elm = RB_LEFT(elm, field); \ + } else { \ + if (name##_RB_GETPARENT(elm) && \ + (elm == RB_LEFT(name##_RB_GETPARENT(elm), field))) \ + elm = name##_RB_GETPARENT(elm); \ + else { \ + while (name##_RB_GETPARENT(elm) && \ + (elm == RB_RIGHT(name##_RB_GETPARENT(elm), field)))\ + elm = name##_RB_GETPARENT(elm); \ + elm = name##_RB_GETPARENT(elm); \ + } \ + } \ + return (elm); \ +} \ + \ +struct type * \ +name##_RB_MINMAX(struct name *head, int val) \ +{ \ + struct type *tmp = RB_ROOT(head); \ + struct type *parent = NULL; \ + while (tmp) { \ + parent = tmp; \ + if (val < 0) \ + tmp = RB_LEFT(tmp, field); \ + else \ + tmp = RB_RIGHT(tmp, field); \ + } \ + return (parent); \ +} + + +#define RB_PROTOTYPE_PREV(name, type, field, cmp) \ + RB_PROTOTYPE(name, type, field, cmp) \ +struct type *name##_RB_PREV(struct type *); + + +#define RB_PROTOTYPE_SC_PREV(_sc_, name, type, field, cmp) \ + RB_PROTOTYPE_SC(_sc_, name, type, field, cmp) \ +_sc_ struct type *name##_RB_PREV(struct type *); + +#define RB_GENERATE_PREV(name, type, field, cmp) \ + RB_GENERATE(name, type, field, cmp) \ +struct type * \ +name##_RB_PREV(struct type *elm) \ +{ \ + if (RB_LEFT(elm, field)) { \ + elm = RB_LEFT(elm, field); \ + while (RB_RIGHT(elm, field)) \ + elm = RB_RIGHT(elm, field); \ 
+ } else { \ + if (name##_RB_GETPARENT(elm) && \ + (elm == RB_RIGHT(name##_RB_GETPARENT(elm), field))) \ + elm = name##_RB_GETPARENT(elm); \ + else { \ + while (name##_RB_GETPARENT(elm) && \ + (elm == RB_LEFT(name##_RB_GETPARENT(elm), field)))\ + elm = name##_RB_GETPARENT(elm); \ + elm = name##_RB_GETPARENT(elm); \ + } \ + } \ + return (elm); \ +} \ + +#define RB_NEGINF -1 +#define RB_INF 1 + +#define RB_INSERT(name, x, y) name##_RB_INSERT(x, y) +#define RB_REMOVE(name, x, y) name##_RB_REMOVE(x, y) +#define RB_FIND(name, x, y) name##_RB_FIND(x, y) +#define RB_NEXT(name, x, y) name##_RB_NEXT(y) +#define RB_PREV(name, x, y) name##_RB_PREV(y) +#define RB_MIN(name, x) name##_RB_MINMAX(x, RB_NEGINF) +#define RB_MAX(name, x) name##_RB_MINMAX(x, RB_INF) + +#define RB_FOREACH(x, name, head) \ + for ((x) = RB_MIN(name, head); \ + (x) != NULL; \ + (x) = name##_RB_NEXT(x)) + +#define RB_FOREACH_FROM(x, name, y) \ + for ((x) = (y); \ + ((x) != NULL) && ((y) = name##_RB_NEXT(x), (x) != NULL); \ + (x) = (y)) + +#define RB_FOREACH_REVERSE_FROM(x, name, y) \ + for ((x) = (y); \ + ((x) != NULL) && ((y) = name##_RB_PREV(x), (x) != NULL); \ + (x) = (y)) + +#define RB_FOREACH_SAFE(x, name, head, y) \ + for ((x) = RB_MIN(name, head); \ + ((x) != NULL) && ((y) = name##_RB_NEXT(x), (x) != NULL); \ + (x) = (y)) + +#endif /* _LIBKERN_TREE_H_ */ diff --git a/libkern/libkern/version.h.template b/libkern/libkern/version.h.template index 32793fe63..57b97d48a 100644 --- a/libkern/libkern/version.h.template +++ b/libkern/libkern/version.h.template @@ -23,10 +23,6 @@ #ifndef LIBKERN_VERSION_H #define LIBKERN_VERSION_H -#if defined(__cplusplus) -extern "C" { -#endif - /* Kernel versions conform to kext version strings, as described in: * http://developer.apple.com/technotes/tn/tn1132.html */ @@ -35,25 +31,21 @@ extern "C" { * of the kernel */ #define VERSION_MAJOR ###KERNEL_VERSION_MAJOR### -extern const int version_major; /* VERSION_MINOR, version_minor is an integer that represents the minor 
version * of the kernel */ #define VERSION_MINOR ###KERNEL_VERSION_MINOR### -extern const int version_minor; /* VERSION_VARIANT, version_variant is a string that contains the revision, * stage, and prerelease level of the kernel */ #define VERSION_VARIANT "###KERNEL_VERSION_VARIANT###" -extern const char version_variant[]; /* VERSION_REVISION, version_revision is an integer that represents the revision * of the kernel */ #define VERSION_REVISION ###KERNEL_VERSION_REVISION### -extern const int version_revision; /* VERSION_STAGE, version_stage, is an integer set to one of the following: */ #define VERSION_STAGE_DEV 0x20 @@ -61,20 +53,46 @@ extern const int version_revision; #define VERSION_STAGE_BETA 0x60 #define VERSION_STAGE_RELEASE 0x80 #define VERSION_STAGE ###KERNEL_VERSION_STAGE### -extern const int version_stage; /* VERSION_PRERELEASE_LEVEL, version_prerelease_level, is an integer sequence * number to distinguish between pre-release builds */ #define VERSION_PRERELEASE_LEVEL ###KERNEL_VERSION_PRERELEASE_LEVEL### -extern const int version_prerelease_level; /* OSTYPE, ostype, is a string as returned by uname -s */ #define OSTYPE "Darwin" -extern const char ostype[]; /* OSRELEASE, osrelease, is a string as returned by uname -r */ #define OSRELEASE "###KERNEL_VERSION_LONG###" + +#ifndef ASSEMBLER + +#if defined(__cplusplus) +extern "C" { +#endif + +/* Build-time value of VERSION_MAJOR */ +extern const int version_major; + +/* Build-time value of VERSION_MINOR */ +extern const int version_minor; + +/* Build-time value of VERSION_VARIANT */ +extern const char version_variant[]; + +/* Build-time value of VERSION_REVISION */ +extern const int version_revision; + +/* Build-time value of VERSION_STAGE */ +extern const int version_stage; + +/* Build-time value of VERSION_PRERELEASE_LEVEL */ +extern const int version_prerelease_level; + +/* Build-time value of OSTYPE */ +extern const char ostype[]; + +/* Build-time value of OSRELEASE */ extern const char osrelease[]; /* 
osbuilder is a string as returned by uname -r */ @@ -94,4 +112,6 @@ extern char osversion[]; } #endif +#endif /* !ASSEMBLER */ + #endif /* LIBKERN_VERSION_H */ diff --git a/libkern/ppc/OSAtomic.s b/libkern/ppc/OSAtomic.s deleted file mode 100644 index 82b1f3c99..000000000 --- a/libkern/ppc/OSAtomic.s +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - /* - * Copyright (c) 1997-1998 Apple Computer, Inc. 
- * - * - * HISTORY - * - * sdouglas 22 Oct 97 - first checked in from DriverServices - * sdouglas 28 Jul 98 - start IOKit - */ - -#include - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; -; ENTRY functionName -; -; Assembly directives to begin an exported function. -; -; Takes: functionName - name of the exported function -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -.macro ENTRY - .text - .align 2 - .globl $0 -$0: -.endmacro - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -/* -int OSCompareAndSwap( UInt32 oldVal, UInt32 newVal, UInt32 * addr ) - This is now an alias to hw_compare_and_store, see xnu/libkern/Makefile. - -void * OSDequeueAtomic(void ** inList, SInt32 inOffset) - This is also aliased, to hw_dequeue_atomic. - -void OSEnqueueAtomic(void ** inList, void * inNewLink, SInt32 inOffset) - This is aliased to hw_queue_atomic. -*/ - -/* -Note: We can not use the hw_atomic routines provided by osfmk/ppc as -the return the result of the addition not the original value. -*/ -/* -SInt32 OSDecrementAtomic(SInt32 * value) -*/ - ENTRY _OSDecrementAtomic - mr r4, r3 - li r3, -1 - b _OSAddAtomic - -/* -SInt32 OSIncrementAtomic(SInt32 * value) -*/ - - .align 5 - - ENTRY _OSIncrementAtomic - mr r4, r3 - li r3, 1 - -/* -SInt32 OSAddAtomic(SInt32 amount, SInt32 * value) -*/ - - ENTRY _OSAddAtomic - ENTRY _OSAddAtomicLong - - mr r5,r3 /* Save the increment */ -.L_AAretry: - lwarx r3, 0, r4 /* Grab the area value */ - add r6, r3, r5 /* Add the value */ - stwcx. r6, 0, r4 /* Try to save the new value */ - bne- .L_AAretry /* Didn't get it, try again... */ - blr /* Return the original value */ diff --git a/libkern/ppc/bcmp.s b/libkern/ppc/bcmp.s deleted file mode 100644 index 901850379..000000000 --- a/libkern/ppc/bcmp.s +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) 2000-2001 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -; -#include -#include -; -; int bcmp(const void *LHS, const void *RHS, size_t len); -; -; Because bcmp returns zero if equal and nonzero otherwise, it is slightly -; faster than memcmp, which returns the difference between the first different -; bytes. -; r3 - LHS -; r4 - RHS -; r5 - len - - .align 5 - .globl EXT(bcmp) -LEXT(bcmp) - - cmpwi cr1,r5,6 ; six chars long? - mr r6,r3 ; copy LHS ptr so we can use r3 as result - mr. r3,r5 ; test length and move to r3 - bgt cr1,Llong ; more than 6 chars long - blt cr1,Lshort ; less than 6 - - ; most common operand length is 6 chars (enet addrs) - - lwz r8,0(r6) ; first 4 bytes of LHS - lwz r7,0(r4) ; and RHS - lhz r9,4(r6) ; next 2 of LHS - sub. 
r3,r8,r7 ; compare first 4 - bnelr ; first 4 differed (r3!=0) - lhz r10,4(r4) ; next 2 of RHS - sub r3,r9,r10 ; compare last 2 - blr ; done, result in r3 - - ; handle long strings -Llong: - srwi r0,r5,2 ; r0 = word len - mtctr r0 ; set up for loop -Llongloop: - lwz r8,0(r6) ; next 4 bytes from LHS - addi r6,r6,4 - lwz r7,0(r4) ; next 4 from RHS - addi r4,r4,4 - sub. r3,r8,r7 ; compare next 4 bytes - bdnzt+ eq,Llongloop ; loop if ctr!=0 and cr0_eq - bnelr ; done if not equal (r3!=0) - - andi. r5,r5,3 ; more to go? - - ; compare short strings (0-5 bytes long) - ; r5 = length remaining - ; cr0= set on length - ; r3 = zero if length is zero -Lshort: - beqlr ; done (r3=0) - mtctr r5 -Lshortloop: - lbz r8,0(r6) ; get next byte from LHS - addi r6,r6,1 - lbz r7,0(r4) ; and next byte from RHS - addi r4,r4,1 - sub. r3,r8,r7 ; compare - bdnzt+ eq,Lshortloop ; loop if ctr!=0 and cr0_eq - blr ; done, r3 set correctly by the subtract - diff --git a/libkern/ppc/memcmp.s b/libkern/ppc/memcmp.s deleted file mode 100644 index 9968bf6f4..000000000 --- a/libkern/ppc/memcmp.s +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) 2000-2001 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -; -#include -#include -; -; int memcmp(const void *LHS, const void *RHS, size_t len); -; -; Memcmp returns the difference between the first two different bytes, -; or 0 if the two strings are equal. Because we compare a word at a -; time, this requires a little additional processing once we find a -; difference. -; r3 - LHS -; r4 - RHS -; r5 - len - - .align 5 - .globl EXT(memcmp) -LEXT(memcmp) - - cmpwi cr1,r5,6 ; six is the most common length - mr r6,r3 ; we want to use r3 for compare result - mr. r3,r5 ; test length for 0 - bgt cr1,Llong ; handle long strings - blt cr1,Lshort ; and short strings - - ; six char strings are special cased because they are the most common -Lsix: - lwz r8,0(r6) ; first 4 bytes of LHS - lwz r7,0(r4) ; and RHS - xor. r3,r8,r7 ; compare first 4 - bne Ldifferent ; first 4 differed - lhz r8,4(r6) ; last 2 of LHS - lhz r7,4(r4) ; last 2 of RHS - xor. r3,r8,r7 ; compare last 2 - beqlr ; done if equal - - ; strings differ, so we must compute difference between first two - ; differing bytes. 
- ; r8 = LHS bytes - ; r7 = RHS bytes - ; r3 = r8 xor r7 (r3!=0) -Ldifferent: - cntlzw r9,r3 ; count leading 0s in xor - rlwinm r10,r9,0,0,28 ; mask off low 3 bits, so r10 = 0, 8, 16, or 24 - subfic r6,r10,24 ; r6 := (24 - r10) - srw r4,r8,r6 ; r4 = LHS differing byte - srw r5,r7,r6 ; r5 = RHS differing byte - sub r3,r4,r5 ; r3 = difference - blr - - ; handle long strings -Llong: - srwi r0,r5,2 ; r0 = word length - mtctr r0 ; set up for loop -Llongloop: - lwz r8,0(r6) ; next 4 bytes from LHS - addi r6,r6,4 - lwz r7,0(r4) ; next 4 from RHS - addi r4,r4,4 - xor. r3,r8,r7 ; compare next 4 bytes - bdnzt+ eq,Llongloop ; loop if ctr!=0 and cr0_eq - bne Ldifferent ; these 4 bytes not equal - - andi. r5,r5,3 ; more to go? - - ; compare short strings (0-5 bytes long) - ; r5 = length (0-5) - ; cr0= set on length - ; r3 = if r5=0, then r3=0 -Lshort: - beqlr ; 0-length strings are defined to be equal (r3=0) - mtctr r5 -Lshortloop: - lbz r8,0(r6) ; get next byte from LHS - addi r6,r6,1 - lbz r7,0(r4) ; and next byte from RHS - addi r4,r4,1 - sub. r3,r8,r7 ; compare - bdnzt+ eq,Lshortloop ; lloop if ctr!=0 and cr0_eq - blr ; done, r3 set correctly by the subtract diff --git a/libkern/ppc/strlen.s b/libkern/ppc/strlen.s deleted file mode 100644 index 0bb80cf99..000000000 --- a/libkern/ppc/strlen.s +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) 2000-2001 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -; -; -; Strlen, optimized for PPC. The routine we use is 2-3x faster -; then the simple loop which checks each byte for zero. -; For 0- and 1-byte strings, the simple routine is faster, but -; only by a few cycles. The algorithm used was adapted from the -; Mac OS 9 stdCLib strcopy routine, which was originally -; written by Gary Davidian. It relies on the following rather -; inobvious but very efficient test: -; -; y = dataWord + 0xFEFEFEFF -; z = ~dataWord & 0x80808080 -; if ( y & z ) = 0 then all bytes in dataWord are non-zero -; -; The test maps any non-zero byte to zeros and any zero byte to 0x80, -; with one exception: 0x01 bytes preceeding the first zero are also -; mapped to 0x80. -; -#include -#include -; -; int strlen(ptr) -; -; - - .align 5 - .globl EXT(strlen) -LEXT(strlen) - - andi. r4,r3,0x03 ; test alignment first - mr r9,r3 ; store the original address for later use.... 
- bne LalignSource ; align the source addr if not already aligned -Llentry: - lis r5,hi16(0xFEFEFEFF) - lis r6,hi16(0x80808080) - subi r3,r3,0x04 ; pre-decrement r3 for the lwzu - ori r5,r5,lo16(0xFEFEFEFF) ; r5=0xFEFEFEFF - ori r6,r6,lo16(0x80808080) ; r6=0x80808080 - -LLoop: - lwzu r8,4(r3) ; get the first 4 bytes and increment address - add r4,r5,r8 ; r4= data + 0xFEFEFEFF - andc r7,r6,r8 ; r7= ~data & 0x80808080 - and. r4,r4,r7 ; r4= r4 & r7 - beq LLoop ; if r4 is zero, then all bytes are non-zero - -; Now we know one of the bytes in r8 is zero, -; we just have to figure out which one. -; We have mapped 0 bytes to 0x80, and nonzero bytes to 0x00, -; with one exception: -; 0x01 bytes preceeding the first zero are also mapped to 0x80. -; So we have to mask out the 0x80s caused by 0x01s before -; counting leading zeroes to get the bytes in last word. - - rlwinm r5,r8,7,0,31 ; move 0x01 bits to 0x80 position - subf r3,r9,r3 ; start to compute string length - andc r4,r4,r5 ; turn off false hits from 0x0100 worst case - cntlzw r7,r4 ; now we can count leading 0s - srwi r7,r7,3 ; convert 0,8,16,24 to 0,1,2,3 - add r3,r3,r7 ; add in nonzero bytes in last word - blr - -; We must align the source address for two reasons: to avoid spurious page -; faults, and for speed. -; r4 = low 2 bits of address (1,2, or 3) -; r3 = address -; r9 = original address (still same as r3) - -LalignSource: - lbz r5,0(r3) ; get the first byte... - subic. r4,r4,2 ; test for 1, 2 or 3 bytes - addi r3,r3,1 ; increment address - addi r6,r9,1 ; now r6==r3 - cmpwi cr1,r5,0 ; zero? - beq cr1,Lreturn ; if its zero return zero - bgt Llentry ; address is aligned now if low bits were 3 - - lbz r5,0(r3) ; get the next byte... - addi r3,r3,1 ; increment address - cmpwi cr1,r5,0 ; zero? - beq cr1,Lreturn ; if its zero return one - beq Llentry ; addr is aligned now if low bits were 2 - - lbz r5,0(r3) ; get the next byte... - addi r3,r3,1 ; increment address - cmpwi cr1,r5,0 ; zero? 
- bne cr1,Llentry ; not zero, continue check (now aligned) -Lreturn: - sub r3,r3,r6 ; get string length (0, 1, or 2) - blr - diff --git a/libkern/uuid/Makefile b/libkern/uuid/Makefile index c7c467538..13a3f1969 100644 --- a/libkern/uuid/Makefile +++ b/libkern/uuid/Makefile @@ -9,8 +9,6 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ INSTINC_SUBDIRS_X86_64 = \ @@ -19,8 +17,6 @@ INSTINC_SUBDIRS_ARM = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS_X86_64 = \ diff --git a/libkern/uuid/uuid.c b/libkern/uuid/uuid.c index ac9db3f84..ffc5c8059 100644 --- a/libkern/uuid/uuid.c +++ b/libkern/uuid/uuid.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2004-2010 Apple Inc. All rights reserved. * * %Begin-Header% * Redistribution and use in source and binary forms, with or without @@ -51,19 +51,22 @@ read_node(uint8_t *node) { #if NETWORKING struct ifnet *ifp; - struct ifaddr *ifa; struct sockaddr_dl *sdl; ifnet_head_lock_shared(); TAILQ_FOREACH(ifp, &ifnet_head, if_link) { - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - sdl = (struct sockaddr_dl *)ifa->ifa_addr; - if (sdl && sdl->sdl_family == AF_LINK && sdl->sdl_type == IFT_ETHER) { - memcpy(node, LLADDR(sdl), 6); - ifnet_head_done(); - return; - } + ifnet_lock_shared(ifp); + IFA_LOCK_SPIN(ifp->if_lladdr); + sdl = (struct sockaddr_dl *)ifp->if_lladdr->ifa_addr; + if (sdl->sdl_type == IFT_ETHER) { + memcpy(node, LLADDR(sdl), 6); + IFA_UNLOCK(ifp->if_lladdr); + ifnet_lock_done(ifp); + ifnet_head_done(); + return; } + IFA_UNLOCK(ifp->if_lladdr); + ifnet_lock_done(ifp); } ifnet_head_done(); #endif /* NETWORKING */ diff --git a/libkern/x86_64/OSAtomic.s b/libkern/x86_64/OSAtomic.s index b3b26164a..30713ef3d 100644 --- a/libkern/x86_64/OSAtomic.s +++ b/libkern/x86_64/OSAtomic.s @@ -31,13 +31,11 @@ #;*************************************************************************** .globl 
_OSCompareAndSwap - -// TODO FIXME!! _OSCompareAndSwap: #;oldValue, newValue, ptr movl %edi, %eax lock - cmpxchgl %esi, 0(%rdx) #; CAS (eax is an implicit operand) - sete %al #; did CAS succeed? (TZ=1) + cmpxchgl %esi, (%rdx) #; CAS (eax is an implicit operand) + sete %al #; did CAS succeed? (TZ=1) movzbq %al, %rax #; clear out the high bytes ret @@ -50,10 +48,10 @@ _OSCompareAndSwap: #;oldValue, newValue, ptr _OSCompareAndSwap64: _OSCompareAndSwapPtr: #;oldValue, newValue, ptr - movq %rdi, %rax + movq %rdi, %rax lock - cmpxchgq %rsi, 0(%rdx) #; CAS (eax is an implicit operand) - sete %al #; did CAS succeed? (TZ=1) + cmpxchgq %rsi, (%rdx) #; CAS (rax is an implicit operand) + sete %al #; did CAS succeed? (TZ=1) movzbq %al, %rax #; clear out the high bytes ret @@ -66,7 +64,7 @@ _OSCompareAndSwapPtr: #;oldValue, newValue, ptr _OSAddAtomic64: _OSAddAtomicLong: lock - xaddq %rdi, 0(%rsi) #; Atomic exchange and add + xaddq %rdi, (%rsi) #; Atomic exchange and add movq %rdi, %rax; ret @@ -78,6 +76,6 @@ _OSAddAtomicLong: .globl _OSAddAtomic _OSAddAtomic: lock - xaddl %edi, 0(%rsi) #; Atomic exchange and add + xaddl %edi, (%rsi) #; Atomic exchange and add movl %edi, %eax; ret diff --git a/libkern/zlib/adler32.c b/libkern/zlib/adler32.c index bf0d9723a..00214cd2e 100644 --- a/libkern/zlib/adler32.c +++ b/libkern/zlib/adler32.c @@ -32,8 +32,6 @@ /* @(#) $Id$ */ -#include // For uintptr_t. - #define ZLIB_INTERNAL #if KERNEL @@ -42,8 +40,9 @@ #include "zlib.h" #endif /* KERNEL */ -#if defined _ARM_ARCH_6 - extern uLong adler32_vec(uLong adler, uLong sum2, const Bytef *buf, uInt len); +#if defined __x86_64__ || defined __i386__ || defined _ARM_ARCH_6 +#include // For uintptr_t. 
+ extern uLong adler32_vec(uLong adler, uLong sum2, const Bytef *buf, uInt len); #endif #define BASE 65521UL /* largest prime smaller than 65536 */ @@ -98,9 +97,7 @@ uLong ZEXPORT adler32(adler, buf, len) uInt len; { unsigned long sum2; -#if !defined _ARM_ARCH_6 unsigned n; -#endif /* split Adler-32 into component sums */ sum2 = (adler >> 16) & 0xffff; @@ -133,8 +130,10 @@ uLong ZEXPORT adler32(adler, buf, len) return adler | (sum2 << 16); } -#if defined _ARM_ARCH_6 - /* align buf to 16-byte boundary */ +#if defined __x86_64__ || defined __i386__ || defined _ARM_ARCH_6 + + if (len>=32000) { /* use vector code only if len is sufficiently large to compensate registers save/restore */ + /* align buf to 16-byte boundary */ while (((uintptr_t)buf)&15) { /* not on a 16-byte boundary */ len--; adler += *buf++; @@ -143,9 +142,10 @@ uLong ZEXPORT adler32(adler, buf, len) MOD4(sum2); /* only added so many BASE's */ } - return adler32_vec(adler, sum2, buf, len); // armv7 neon vectorized implementation + return adler32_vec(adler, sum2, buf, len); // x86_64 or i386 (up to SSE3) or armv6 or up + } -#else // _ARM_ARCH_6 +#endif // defined __x86_64__ || defined __i386__ || defined _ARM_ARCH_6 /* do length NMAX blocks -- requires just one modulo operation */ while (len >= NMAX) { @@ -176,8 +176,6 @@ uLong ZEXPORT adler32(adler, buf, len) /* return recombined sums */ return adler | (sum2 << 16); - -#endif // _ARM_ARCH_6 } /* ========================================================================= */ diff --git a/libkern/zlib/arm/adler32vec.s b/libkern/zlib/arm/adler32vec.s deleted file mode 100644 index 3af072caa..000000000 --- a/libkern/zlib/arm/adler32vec.s +++ /dev/null @@ -1,428 +0,0 @@ -#include - -#define BASE 65521 /* largest prime smaller than 65536 */ -#define NMAX 5552 /* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */ - -// Note: buf should have been 16-byte aligned in the caller function, - -// uLong adler32_vec(unsigned int adler, unsigned int 
sum2, const Bytef* buf, int len) { -// unsigned n; -// while (len >= NMAX) { -// len -= NMAX; -// n = NMAX / 16; /* NMAX is divisible by 16 */ -// do { -// DO16(buf); /* 16 sums unrolled */ -// buf += 16; -// } while (--n); -// MOD(adler); -// MOD(sum2); -// } -// if (len) { /* avoid modulos if none remaining */ -// while (len >= 16) { -// len -= 16; -// DO16(buf); -// buf += 16; -// } -// while (len--) { -// adler += *buf++; -// sum2 += adler; -// } -// MOD(adler); -// MOD(sum2); -// } -// return adler | (sum2 << 16); /* return recombined sums */ -// } - - -/* - DO16 vectorization: - given initial unsigned int sum2 and adler, and a new set of 16 input bytes (x[0:15]), it can be shown that - sum2 += (16*adler + 16*x[0] + 15*x[1] + ... + 1*x[15]); - adler += (x[0] + x[1] + ... + x[15]); - - therefore, this is what can be done to vectorize the above computation - 1. 16-byte aligned vector load into q2 (x[0:x15]) - 2. sum2 += (adler<<4); - 3. vmull.u8 (q9,q8),q2,d2 where d2 = (1,1,1,1...,1), (q9,q8) + 16 16-bit elements x[0:15] - 4. vmull.u8 (q11,q10),q2,q0 where q0 = (1,2,3,4...,16), (q11,q10) + 16 16-bit elements (16:1)*x[0:15] - 5. parallel add (with once expansion to 32-bit) (q9,q8) and (q11,q10) all the way to accumulate to adler and sum2 - - In this revision, whenever possible, 2 DO16 loops are combined into a DO32 loop. - 1. 32-byte aligned vector load into q2,q14 (x[0:x31]) - 2. sum2 += (adler<<5); - 3. vmull.u8 (4 q registers),(q2,q14),d2 where d2 = (1,1,1,1...,1), (4 q registers) : 32 16-bit elements x[0:31] - 4. vmull.u8 (4 q registers),(q2,q14),(q0,q15) where q0 = (1,...,32), (4 q regs) : 32 16-bit elements (32:1)*x[0:31] - 5. parallel add (with once expansion to 32-bit) the pair of (4 q regs) all the way to accumulate to adler and sum2 - - This change improves the performance by ~ 0.55 cycle/uncompress byte on ARM Cortex-A8. - -*/ - -/* - MOD implementation: - adler%BASE = adler - floor(adler*(1/BASE))*BASE; where (1/BASE) = 0x80078071 in Q47 - 1. 
vmull.u32 q2,(adler,sum2),(1/BASE) // *(1/BASE) in Q47 - 2. vshr.u64 q2,q2,#47 // floor function - 3. vpadd.u32 d4,d4,d5 // merge into a double word in d4 - 4. vmls.u32 (adler,sum2),d4,d3[0] // (adler,sum2) -= floor[(adler,sum2)/BASE]*BASE - -*/ - -#if defined _ARM_ARCH_6 // this file would be used only for armv6 or above - - - .text - .align 2 - .globl _adler32_vec -_adler32_vec: - -#if (!KERNEL_SUPPORT_NEON) || (!defined _ARM_ARCH_7) // for armv6 or armv7 without neon support - - - #define adler r0 - #define sum2 r1 - #define buf r2 - #define len r3 - #define one_by_base r4 - #define base r5 - #define nmax r6 - #define t r12 - #define vecs lr - #define x0 r8 - #define x1 r10 - #define x2 r11 - #define x3 r12 - #define zero r9 - - // this macro performs adler/sum2 update for 4 input bytes - - .macro DO4 - add sum2, adler, lsl #2 // sum2 += 4*adler; - ldr x0,[buf] // 4 bytes in 1 32-bit word - usada8 adler, x0, zero, adler // adler += sum(x0:x3) - ldrb x0,[buf], #4 // x0 - ldrb x2,[buf,#-2] // x2 - ldrb x1,[buf,#-3] // x1 - ldrb x3,[buf,#-1] // x3 - add sum2, x0, lsl #2 // sum2 += 4*x0 - add x3, x3, x1, lsl #1 // x3+2*x1 - add sum2, x2, lsl #1 // sum2 += 2*x2 - add x3, x1 // x3+3*x1 - add sum2, x3 // sum2 += x3+3*x1 - .endm - - // the following macro cascades 4 DO4 into a adler/sum2 update for 16 bytes - .macro DO16 - DO4 // adler/sum2 update for 4 input bytes - DO4 // adler/sum2 update for 4 input bytes - DO4 // adler/sum2 update for 4 input bytes - DO4 // adler/sum2 update for 4 input bytes - .endm - - // the following macro performs adler sum2 modulo BASE - .macro modulo_base - umull x0,x1,adler,one_by_base // adler/BASE in Q47 - umull x2,x3,sum2,one_by_base // sum2/BASE in Q47 - lsr x1, #15 // x1 >> 15 = floor(adler/BASE) - lsr x3, #15 // x3 >> 15 = floor(sum2/BASE) - mla adler, x1, base, adler // adler %= base; - mla sum2, x3, base, sum2 // sum2 %= base; - .endm - - adr t, coeffs - push {r4-r6, r8-r11, lr} - ldmia t, {one_by_base, base, nmax} // load up 
coefficients - - subs len, nmax // pre-subtract len by NMAX - eor zero, zero // a dummy zero register to use usada8 instruction - blt len_lessthan_NMAX // if (len < NMAX) skip the while loop - -while_lengenmax_loop: // do { - lsr vecs, nmax, #4 // vecs = NMAX/16; - -len16_loop: // do { - - DO16 - - subs vecs, #1 // vecs--; - bgt len16_loop // } while (vec>0); - - modulo_base // adler sum2 modulo BASE - - subs len, nmax // len -= NMAX - bge while_lengenmax_loop // } while (len >= NMAX); - -len_lessthan_NMAX: - adds len, nmax // post-subtract len by NMAX - - subs len, #16 // pre-decrement len by 16 - blt len_lessthan_16 - -len16_loop2: - - DO16 - - subs len, #16 - bge len16_loop2 - -len_lessthan_16: - adds len, #16 // post-increment len by 16 - beq len_is_zero - -remaining_buf: - ldrb x0, [buf], #1 - subs len, #1 - add adler, x0 - add sum2, adler - bgt remaining_buf - -len_is_zero: - - modulo_base // adler sum2 modulo BASE - - add r0, adler, sum2, lsl #16 // to return sum2<<16 | adler - - pop {r4-r6, r8-r11, pc} - - .align 2 -coeffs: - .long -2146992015 - .long -BASE - .long NMAX - -#else // KERNEL_SUPPORT_NEON - - - - #define adler r0 - #define sum2 r1 - #define buf r2 - #define len r3 - #define nmax r4 - #define vecs lr // vecs = NMAX/16 - #define n r5 - - #define t r12 - - #define sum2_coeff q0 - #define sum2_coeff0 d0 - #define sum2_coeff1 d1 - #define alder_coeff q1 - #define ones d2 - #define x0_x15 q2 - #define x0_x7 d4 - #define x8_x15 d5 - #define adlersum2 d6 - #define adler16 d25 - -#if defined _ARM_ARCH_7 - - adr t, vec_table // address to vec_table[] - stmfd sp!, {r4, r5, lr} - - vld1.32 {q0-q1},[t,:128]! // loading up coefficients for adler/sum2 computation - vld1.32 {q15},[t,:128]! 
// for sum2 computation - ldr nmax, [t] // NMAX - - vmov adlersum2, sum2, adler // pack up adler/sum2 into a double register - - cmp len, nmax // len vs NMAX - lsr vecs, nmax, #4 // vecs = NMAX/16; - blt len_lessthan_NMAX // if (len < NMAX) skip the while loop - - sub len, nmax // pre-decrement len by NMAX - -while_len_ge_NMAX_loop: // while (len>=NMAX) { - - mov n, vecs, lsr #1 // n = NMAX/16; - -do_loop: // do { - - vshll.u32 q12, adlersum2, #5 // d25 = (0,32*adler) to be added into (adler,sum2) - vld1.32 {x0_x15},[buf,:128]! // 16-byte input x0:x15 - vmull.u8 q8, x0_x7, ones // 16-bit x0-x7 - vld1.32 {q14}, [buf,:128]! // x16:x31 - vmull.u8 q9, x8_x15, ones // 16-bit x8-x15 - vadd.u32 adlersum2,adler16 // sum2 += old adler*32; - vmull.u8 q12, d28, ones // 16-bit x16-x23 - vmull.u8 q13, d29, ones // 16-bit x24-x31 - vmull.u8 q10, d28, sum2_coeff0 // 16-bit x16*16, x17*15, ..., x23*9 - vmull.u8 q11, d29, sum2_coeff1 // 16-bit x24*8, x25*7, ..., x31*1 - vadd.u16 q8, q8, q9 // q8 = (x0+x8):(x7+x15) 8 16-bit elements for adler - vmull.u8 q9, x0_x7, d30 // 16-bit x0*32,...,x7*25 - vmull.u8 q14, x8_x15, d31 // 16-bit x8*24,...,x15*17 - vadd.u16 q12, q12, q13 // q12 = (x16+x24):(x23+x31) 8 16-bit elements for adler - vadd.u16 q10, q11 // 8 16-bit elements for sum2 - vadd.u16 q8, q12 // 8 16-bit elements for adler - vadd.u16 q9, q14 // 8 16-bit elements for sum2 - vadd.u16 q10, q9 // 8 16-bit elements for sum2 - vpaddl.u16 q8, q8 // 4 32-bit elements for adler - vpaddl.u16 q10, q10 // 4 32-bit elements for sum2 - vpadd.u32 d16,d16,d17 // 2 32-bit elements for adler - vpadd.u32 d17,d20,d21 // 2 32-bit elements for sum2 - subs n, #1 // --n - vpadd.u32 d4,d17,d16 // s8 : 32-bit elements for sum2, s9 : 32-bit element for adler - vadd.u32 adlersum2,d4 // update adler/sum2 with the new 16 bytes input - - bgt do_loop // } while (--n); - - vshll.u32 q12, adlersum2, #4 // d25 = (0,16*adler) to be added into (adler,sum2) - - vld1.32 {x0_x15},[buf,:128]! 
// 16-byte input - - vmull.u8 q8, x0_x7, ones // 16-bit x0-x7 - vmull.u8 q9, x8_x15, ones // 16-bit x8-x15 - vmull.u8 q10, x0_x7, sum2_coeff0 // 16-bit x0*16, x1*15, ..., x7*9 - vmull.u8 q11, x8_x15, sum2_coeff1 // 16-bit x8*8, x9*7, ..., x15*1 - - vadd.u16 q8, q8, q9 // 8 16-bit elements for adler - vadd.u16 q10, q10, q11 // 8 16-bit elements for sum2 - vpaddl.u16 q8, q8 // 4 32-bit elements for adler - vpaddl.u16 q10, q10 // 4 32-bit elements for sum2 - vpadd.u32 d16,d16,d17 // 2 32-bit elements for adler - vpadd.u32 d17,d20,d21 // 2 32-bit elements for sum2 - vadd.u32 adlersum2,adler16 // sum2 += old adler; - vpadd.u32 d4,d17,d16 // s8 : 32-bit elements for sum2, s9 : 32-bit element for adler - vadd.u32 adlersum2,d4 // update adler/sum2 with the new 16 bytes input - - // mod(alder,BASE); mod(sum2,BASE); - vmull.u32 q2,adlersum2,d3[1] // alder/BASE, sum2/BASE in Q47 - vshr.u64 q2,q2,#47 // take the integer part - vpadd.u32 d4,d4,d5 // merge into a double word in d4 - vmls.u32 adlersum2,d4,d3[0] // (adler,sum2) -= floor[(adler,sum2)/BASE]*BASE - - subs len, nmax // len -= NMAX; - bge while_len_ge_NMAX_loop // repeat while len >= NMAX - - add len, nmax // post-increment len by NMAX - -len_lessthan_NMAX: - - cmp len, #0 - beq len_is_zero // if len==0, branch to skip the following - - - subs len, #32 // pre-decrement len by 32 - blt len_lessthan_32 // if len < 32, branch to len16_loop - -len32_loop: - - vshll.u32 q12, adlersum2, #5 // d25 = (0,32*adler) to be added into (adler,sum2) - vld1.32 {x0_x15},[buf,:128]! // 16-byte input x0:x15 - vmull.u8 q8, x0_x7, ones // 16-bit x0-x7 - vld1.32 {q14}, [buf,:128]! 
// x16:x31 - vmull.u8 q9, x8_x15, ones // 16-bit x8-x15 - vadd.u32 adlersum2,adler16 // sum2 += old adler*32; - vmull.u8 q12, d28, ones // 16-bit x16-x23 - vmull.u8 q13, d29, ones // 16-bit x24-x31 - vmull.u8 q10, d28, sum2_coeff0 // 16-bit x16*16, x17*15, ..., x23*9 - vmull.u8 q11, d29, sum2_coeff1 // 16-bit x24*8, x25*7, ..., x31*1 - vadd.u16 q8, q8, q9 // q8 = (x0+x8):(x7+x15) 8 16-bit elements for adler - vmull.u8 q9, x0_x7, d30 // 16-bit x0*32,...,x7*25 - vmull.u8 q14, x8_x15, d31 // 16-bit x8*24,...,x15*17 - vadd.u16 q12, q12, q13 // q12 = (x16+x24):(x23+x31) 8 16-bit elements for adler - vadd.u16 q10, q11 // 8 16-bit elements for sum2 - vadd.u16 q8, q12 // 8 16-bit elements for adler - vadd.u16 q9, q14 // 8 16-bit elements for sum2 - vadd.u16 q10, q9 // 8 16-bit elements for sum2 - vpaddl.u16 q8, q8 // 4 32-bit elements for adler - vpaddl.u16 q10, q10 // 4 32-bit elements for sum2 - vpadd.u32 d16,d16,d17 // 2 32-bit elements for adler - vpadd.u32 d17,d20,d21 // 2 32-bit elements for sum2 - subs len, #32 // len -= 32; - vpadd.u32 d4,d17,d16 // s8 : 32-bit elements for sum2, s9 : 32-bit element for adler - vadd.u32 adlersum2,d4 // update adler/sum2 with the new 16 bytes input - - bge len32_loop - -len_lessthan_32: - - adds len, #(32-16) // post-increment len by 32, then pre-decrement by 16 - blt len_lessthan_16 // if len < 16, branch to len_lessthan_16 - - vshll.u32 q12, adlersum2, #4 // d25 = (0,16*adler) to be added into (adler,sum2) - - vld1.32 {x0_x15},[buf,:128]! 
// 16-byte input - - - vmull.u8 q8, x0_x7, ones // 16-bit x0-x7 - vmull.u8 q9, x8_x15, ones // 16-bit x8-x15 - vmull.u8 q10, x0_x7, sum2_coeff0 // 16-bit x0*16, x1*15, ..., x7*9 - vmull.u8 q11, x8_x15, sum2_coeff1 // 16-bit x8*8, x9*7, ..., x15*1 - - vadd.u16 q8, q8, q9 // 8 16-bit elements for adler - vadd.u16 q10, q10, q11 // 8 16-bit elements for sum2 - vpaddl.u16 q8, q8 // 4 32-bit elements for adler - vpaddl.u16 q10, q10 // 4 32-bit elements for sum2 - vpadd.u32 d16,d16,d17 // 2 32-bit elements for adler - vpadd.u32 d17,d20,d21 // 2 32-bit elements for sum2 - subs len, #16 // decrement len by 16 - vadd.u32 adlersum2,adler16 // sum2 += old adler; - vpadd.u32 d4,d17,d16 // s8 : 32-bit elements for sum2, s9 : 32-bit element for adler - vadd.u32 adlersum2,d4 // update adler/sum2 with the new 16 bytes input - -len_lessthan_16: - adds len, #16 // post-increment len by 16 - beq len_is_zero_internal // if len==0, branch to len_is_zero_internal - - // restore adler/sum2 into general registers for remaining (<16) bytes - - vmov sum2, adler, adlersum2 -remaining_len_loop: - ldrb t, [buf], #1 // *buf++; - subs len, #1 // len--; - add adler,t // adler += *buf - add sum2,adler // sum2 += adler - bgt remaining_len_loop // break if len<=0 - - vmov adlersum2, sum2, adler // move to double register for modulo operation - -len_is_zero_internal: - - // mod(alder,BASE); mod(sum2,BASE); - - vmull.u32 q2,adlersum2,d3[1] // alder/BASE, sum2/BASE in Q47 - vshr.u64 q2,q2,#47 // take the integer part - vpadd.u32 d4,d4,d5 // merge into a double word in d4 - vmls.u32 adlersum2,d4,d3[0] // (adler,sum2) -= floor[(adler,sum2)/BASE]*BASE - -len_is_zero: - - vmov sum2, adler, adlersum2 // restore adler/sum2 from (s12=sum2, s13=adler) - add r0, adler, sum2, lsl #16 // to return adler | (sum2 << 16); - ldmfd sp!, {r4, r5, pc} // restore registers and return - - - // constants to be loaded into q registers - .align 4 // 16 byte aligned - -vec_table: - - // coefficients for computing sum2 - .long 
0x0d0e0f10 // s0 - .long 0x090a0b0c // s1 - .long 0x05060708 // s2 - .long 0x01020304 // s3 - - // coefficients for computing adler - .long 0x01010101 // s4/d2 - .long 0x01010101 // s5 - - .long BASE // s6 : BASE - .long 0x80078071 // s7 : 1/BASE in Q47 - - // q15 : d30.d31 - .long 0x1d1e1f20 // s0 - .long 0x191a1b1c // s1 - .long 0x15161718 // s2 - .long 0x11121314 // s3 - -NMAX_loc: - .long NMAX // NMAX - -#endif // _ARM_ARCH_7 - -#endif // (!KERNEL_SUPPORT_NEON) || (!defined _ARM_ARCH_7) - -#endif // _ARM_ARCH_6 - diff --git a/libkern/zlib/arm/inffastS.s b/libkern/zlib/arm/inffastS.s deleted file mode 100644 index fcf018e82..000000000 --- a/libkern/zlib/arm/inffastS.s +++ /dev/null @@ -1,565 +0,0 @@ -#include - -// the follow assembly code was hard wired to POSTINC not defined, - -#if 0 // #ifdef POSTINC -# define OFF 0 -# define PUP(a) *(a)++ -#else -# define OFF 1 -# define PUP(a) *++(a) -#endif - -// the code uses r9, therefore, it does not meet the register protocol for armv5 and below -// the code can only be used for armv6 and above - -#if defined _ARM_ARCH_6 - - .cstring - .align 2 -LC0: - .ascii "invalid distance too far back\0" - .align 2 -LC1: - .ascii "invalid distance code\0" - .align 2 -LC2: - .ascii "invalid literal/length code\0" - - // renaming the register and stack memory use - - #define out r0 - #define strm r10 - #define state r5 - #define in r11 - #define write r9 - #define distcode r8 - #define bits lr - #define hold r4 - - // stack memory allocation - - #define window_loc [sp,#0] - #define last_loc [sp,#4] - #define beg_loc [sp,#8] - #define end_loc [sp,#12] - #define wsize_loc [sp,#16] - #define whave_loc [sp,#20] - #define windowm1_loc [sp,#28] - #define lmask_loc [sp,#32] - #define dmask_loc [sp,#36] - #define op_loc [sp,#44] - #define dist_loc [sp,#48] - - #define local_size 52 - - // the following defines the variable offset in the inflate_state structure (in inflate.h) - - #define state_mode [state, #0] - #define state_last [state, 
#4] - #define state_wrap [state, #8] - #define state_havedict [state, #12] - #define state_flags [state, #16] - #define state_dmax [state, #20] - #define state_wbits [state, #36] - #define state_wsize [state, #40] - #define state_whave [state, #44] - #define state_write [state, #48] - #define state_window [state, #52] - #define state_hold [state, #56] - #define state_bits [state, #60] - #define state_lencode [state, #76] - #define state_distcode [state, #80] - #define state_lenbits [state, #84] - #define state_distbits [state, #88] - - -// void inflate_fast(z_streamp strm, unsigned start) -// input : -// r0 = strm, (move to r10) -// r1 = start - - .text - .align 2 - .globl _inflate_fast -_inflate_fast: - - stmfd sp!, {r4-r6,r8-r11,lr} - sub sp, sp, #local_size - -#if defined(_ARM_ARCH_5) - ldrd r2,r3,[r0, #0] // r2 = strm->next_in, r3 = strm->avail_in -#else - ldmia r0, {r2-r3} -#endif - - sub in, r2, #OFF // in = strm->next_in - OFF; - sub r2, #(OFF+5) // next_in -= (OFF+5); - ldr state, [r0, #28] // state = (struct inflate_state FAR *)strm->state; - add r3, r3, r2 // last = next_in - OFF + (avail_in - 5); next_in already updated - mov strm, r0 - str r3, last_loc // store last to release r3 - - ldr r3, [r0, #12] // next_out - ldr r2, [strm, #16] // avail_out - - sub out, r3, #OFF // out = strm->next_out - OFF; r0 is used as out from this point on - - sub r3, r3, #256 // next_out - 256 - rsb r1, r2, r1 // start - avail_out - sub r3, r3, #(1+OFF) // next_out-OFF-257 - add r3, r3, r2 // r3 = end = avail_out + (next_out-OFF) - 257 = avail_out + out - 257 - rsb r2, r1, out // r2 = beg = out - (start - avail_out); -#if defined(_ARM_ARCH_5) - strd r2,r3, beg_loc // store beg/end - ldrd r2,r3, state_wsize // wsize/whave - strd r2,r3, wsize_loc // store wsize/whave - //ldrd r6,hold, state_window // window/hold, hold use r7 - ldr r6, state_window // state->window - ldr hold, state_hold // state->hold - nop -#else - // for architecture < armv5, ldrd/strd is not available - 
str r2, beg_loc // store beg - str r3, end_loc // store end - ldr r2, state_wsize // state->wsize - ldr r3, state_whave // state->whave - str r2, wsize_loc // store wsize - str r3, whave_loc // store whave - ldr r6, state_window // state->window - ldr hold, state_hold // state->hold -#endif - - ldr ip, state_lencode // lencode - mov r3, #1 // used to derive lmask and dmask - ldr write, state_write // write (r9 from this point on) : window write index - nop - str ip, [sp, #40] // save lencode - sub ip, r6, #1 // window-1 - str r6, window_loc // store window - str ip, windowm1_loc // store window-1 - ldr r2, state_lenbits // lenbits - ldr bits, state_bits // bits, use lr from this point on - ldr distcode, state_distcode// distcode, use r8 - mov r2, r3, asl r2 // (1<lenbits) - 1; - mov r3, r3, asl r12 // (1U << state->distbits) - sub r3, r3, #1 // dmask = (1U << state->distbits) - 1; - -#if defined(_ARM_ARCH_5) - strd r2, r3, lmask_loc // store lmask/dmask -#else - str r2, lmask_loc // lmask - str r3, dmask_loc // dmask -#endif - - // start the do loop decoding literals and length/distances - // until end-of-block or not enough input data or output space - -do_loop: - cmp bits, #15 // bits vs 15 - ldr r1, lmask_loc // lmask - bge bitsge15 // if bits >= 15, skip loading new 16 bits - - // this is a shortcut with the processor reads data in little-endian mode - ldrh r3, [in,#1] // read 2 bytes - add in, #2 // in pointer += 2 - add hold, hold, r3, asl bits // deposit the new 2 bytes into hold - add bits, #16 // bits count += 16 - -bitsge15: - ldr ip, [sp, #40] // restore lencode - and r3, hold, r1 // r3 = hold & lmask - b dolen - -op_not_zero: - - tst r2, #16 // if (op&16) - bne length_base // branch to length_base - - tst r2, #64 // else if (op&64) - bne end_of_block // branch to end_of_block processing - - // 2nd-level length code, this is the part where if ((op & 64) == 0) { ... 
} - - // this.val + (hold & ((1U << op) - 1)); - // r3 = r1 + hold & ((1< 8-bit code, 8-bit bits, 16-bit val - ldrb r2, [ip,r3,asl #2] // op = (unsigned)(this.bits); - add r3, ip, r3, asl #2 // r3 = this - ldrb ip, [r3, #1] // ip = this.bits - ldrh r1, [r3, #2] // r1 = this.value - cmp r2, #0 // op == 0 ? - - mov hold, hold, lsr ip // hold >>= this.bits - rsb bits, ip, bits // bits -= this.bits - bne op_not_zero // branch to op_not_zero if this.op != 0 - - strb r1, [out, #1]! // PUP(out) = (unsigned char)(this.val); - -do_loop_while: - ldr r1, last_loc // last - ldr r2, end_loc // end - cmp in, r1 // compare in vs last - cmpcc out, r2 // if in < last, compare out vs end - bcc do_loop // if (in < last && out < end) go back to do_loop - -update_state_and_return: - - sub r2, in, bits, lsr #3 // r2 = in - (bits>>3) - - add r3, r2, #OFF // r3 = (in - (bits>>3)) + OFF - str r3, [strm, #0] // strm->next_in = in + OFF; - - add r3, out, #OFF // out + OFF - str r3, [strm, #12] // strm->next_out = out + OFF; - - ldr r3, last_loc // r3 = last - ldr ip, end_loc // ip = end - - cmp r3, r2 // compare last vs in - addhi r3, r3, #5 // if last > in, last +=5 - movls r6, r3 // o.w., r6 = last - rsbls r3, r6, r2 // r3 = in-last - rsbhi r3, r2, r3 // r3 = (last+5) - in - rsbls r3, r3, #5 // r3 = 5 - (in-last); - cmp out, ip // compare out vs end - str r3, [strm, #4] // strm->avail_in = (unsigned)(in < last ? 5 + (last - in) : 5 - (in - last)); - movcs r2, ip // if out=end, r3 = end+256 - rsbcs r3, r2, out // if out=end, r3 = end+257 - rsbcs r3, r3, #256 // if out>3) << 3; - rsbcc r3, out, r3 // if out=end, r3 = 257 + (end-out) - str r3, [strm, #16] // strm->avail_out = (unsigned)(out < end ? 
257 + (end - out) : 257 - (out - end)); - - // hold &= (1U << bits) - 1; - - rsb ip, bits, #32 // 32-bits - ror hold, hold, bits // this is equivalent to hold<<(32-bits) - lsr hold, hold, ip // logical shift right by (32-bits), hold now only keeps the bits LSBs - - str bits, state_bits // state->bits = bits; - str hold, state_hold // state->hold = hold; - - add sp, #local_size // pop out stack memory - ldmfd sp!,{r4-r6,r8-r11,pc} // restore registers and return - -length_base: // r2=op, r1=lmask - ands r2, r2, #15 // op&=15; - mov r6, r1 // len = (unsigned) this.val; - beq op_is_zero // if op==0, branch to op_is_zero - cmp r2, bits // op vs bits - ldrhib r3, [in, #1]! // if (op>bits) r3 = (PUP(in)); - addhi hold, hold, r3, asl bits // if (op>bits) hold += (unsigned long)(PUP(in)) << bits; - - rsb ip, r2, #32 // 32-op - ror r3, hold, r2 // (hold<<(32-op)) - add r6, r1, r3, lsr ip // len += (unsigned)hold & ((1U << op) - 1); - - addhi bits, bits, #8 // if (op>bits) bits += 8; - - mov hold, hold, lsr r2 // hold >>= op; - rsb bits, r2, bits // bits -= op; - -op_is_zero: - cmp bits, #14 - ldrh r3,[in,#1] // if (bits < 15) { 2 (PUP(in)); no condition code for better performance - addls in, #2 // in+=2; - addls hold, hold, r3, asl bits // twice hold += (unsigned long)(PUP(in)) << bits; - addls bits, #16 // 2 bits += 8; } - -dodist: - - ldr r2, dmask_loc // r2 = dmask - and r3, hold, r2 // r3 = hold & dmask - mov r2, r3, asl #2 - add r3, r2, distcode // &dcode[hold&dmask]; - ldrb ip, [r2, distcode] // op - ldrh r1, [r3, #2] // dist = (unsigned)(this.val); - tst ip, #16 // op vs 16 - ldrb r3, [r3, #1] // this.bits - mov hold, hold, lsr r3 // hold >>= this.bits; - rsb bits, r3, bits // bits -= this.bits; - bne distance_base // if (op&16) { distance base processing } - tst ip, #64 // - beq second_distance_code // else if ((op&64)==0) branch to 2nd level distance code - - b invalide_distance_code - -check_2nd_level_distance_code: - - tst r2, #64 // check for esle if ((op & 64) 
== 0) for 2nd level distance code - bne invalide_distance_code - -second_distance_code: - - rsb r2, ip, #32 // 32-op - ror r3, hold, ip // hold<<(32-op) - add r3, r1, r3, lsr r2 // this.val + (hold & ((1U << op) - 1)) - - mov r2, r3, asl #2 - add r3, r2, distcode // this = dcode[this.val + (hold & ((1U << op) - 1))]; - ldrb r2, [r2, distcode] // this.op - ldrh r1, [r3, #2] // this.val - - tst r2, #16 // op&16 - ldrb r3, [r3, #1] // this.bits - mov ip, r2 // op - mov hold, hold, lsr r3 // hold >> = this.bits - rsb bits, r3, bits // bits -= this.bits - beq check_2nd_level_distance_code - -distance_base: // this is invoked from if ((op&16)!=0) - - and r2, ip, #15 // op &= 15; - cmp r2, bits // op vs bits - ldrhib r3, [in, #1]! // if (op > bits) (PUP(in)) - addhi hold, hold, r3, asl bits // hold += (unsigned long)(PUP(in)) << bits; - addhi bits, bits, #8 // bits += 8; - cmphi r2, bits // internel (bits < op) - ldrhib r3, [in, #1]! // if (op > bits) (PUP(in)) - addhi hold, hold, r3, asl bits // hold += (unsigned long)(PUP(in)) << bits; - addhi bits, bits, #8 // bits += 8 - - rsb ip, r2, #32 // (32-op) - ror r3, hold, r2 // hold<<(32-op) - add r3, r1, r3, lsr ip // dist += (unsigned)hold & ((1U << op) - 1); - str r3, dist_loc // save dist - - -#ifdef INFLATE_STRICT - ldr r1, state_dmax // r1 = dmax - cmp r3, r1 // dist vs dmax - bgt invalid_distance_too_far_back // if dist > dmax, set up msg/mode = bad and break -#endif - - mov hold, hold, lsr r2 // hold >>= op ; - rsb bits, r2, bits // bits -= op; - - ldr ip, beg_loc // beg - ldr r1, dist_loc // dist - rsb r3, ip, out // (out - beg); - - cmp r1, r3 // dist vs (out - beg) - - rsbls r2, r1, out // if (dist<=op) r2 = from = out-dist - bls copy_direct_from_output // if (dist<=op) branch to copy_direct_from_output - - ldr r2, whave_loc // whave - rsb r1, r3, r1 // op = dist-op - cmp r2, r1 // whave vs op - str r1, op_loc // save a copy of op - bcc invalid_distance_too_far_back // if whave < op, message invalid distance too 
far back, and break - - cmp write, #0 // write - bne non_very_common_case // if (write ==0) non_very_common_case - - // the following : if (write == 0) { /* very common case */ } - ldr r1, op_loc // restore op in r1 - ldr ip, wsize_loc // wsize - cmp r6, r1 // len vs op - rsb r3, r1, ip // wsize - op - ldr ip, windowm1_loc // window - 1 - add r2, ip, r3 // from = window - 1 + wsize - op : setup for using PUP(from) - //movhi r3, r1 // if len > op, r3 = op - //movhi r1, out // if len > op, r1 = out - bhi some_from_window // if (len > op), branch to some_from_window - -finish_copy: - - // while (len > 2) { - // PUP(out) = PUP(from); - // PUP(out) = PUP(from); - // PUP(out) = PUP(from); - // len -= 3; - // } - // if (len) { - // PUP(out) = PUP(from); - // if (len > 1) - // PUP(out) = PUP(from); - // } - - cmp r6, #2 // len > 2 ? - movls r1, r6 // if (len<=2) r1 = len - bls lenle2 // if (len<=2) branch to lenle2 - mov r1, r6 -fcopy_per3bytes: - ldrb r3, [r2, #1] // 1st PUP(from) - sub r1, r1, #3 // len-=3 - cmp r1, #2 // len > 2 ? - strb r3, [out, #1] // 1st PUP(out) = PUP(from); - ldrb r3, [r2, #2] // 2nd PUP(from) - add r2, r2, #3 // from+=3 - strb r3, [out, #2] // 2nd PUP(out) = PUP(from); - ldrb r3, [r2, #0] // 3rd PUP(from) - add out, out, #3 // out+=3 - strb r3, [out, #0] // 3rd PUP(out) = PUP(from); - bgt fcopy_per3bytes // while (len>3) back to loop head -lenle2: - cmp r1, #0 // len - beq do_loop_while // back to while loop head if len==0 - ldrb r3, [r2, #1] // PUP(from) - cmp r1, #2 // check whether len==2 - strb r3, [out, #1]! // PUP(out) = PUP(from); - bne do_loop_while // back to while loop head if len==1 - ldrb r3, [r2, #2] // 2nd PUP(from) - strb r3, [out, #1]! // 2nd PUP(out) = PUP(from); - b do_loop_while // back to while loop head - -end_of_block: - tst r2, #32 // if (op&32) - movne r3, #11 // TYPE? 
- strne r3, state_mode // state-mode = TYPE - bne update_state_and_return // break the do loop and branch to get ready to return - ldr r3, messages // "invalid literal/length code" message -L75: - add r3, pc, r3 - str r3, [strm, #24] // strm->msg = (char *)"invalid literal/length code"; - mov r3, #27 // BAD? - str r3, state_mode // state->mode = BAD; - b update_state_and_return // break the do loop and branch to get ready to return - -//Read_2_bytes: -// ldrh r3,[in,#1] // 2 (PUP(in)) together -// add in, #2 // 2 in++ -// add hold, hold, r3, asl bits // twice hold += (unsigned long)(PUP(in)) << bits; -// add bits, #16 // 2 bits += 8; -// b dodist // branch to dodist - nop // a pad dummy instruction to give better performance - -copy_direct_from_output: // r2 = from = out - dist ; - - // do { - ldrb r3, [r2, #1] // 1st PUP(from) - sub r6, r6, #3 // len-=3 - cmp r6, #2 // len vs 2 - strb r3, [out, #1] // 1st PUP(out) = PUP(from); - ldrb r3, [r2, #2] // 2nd PUP(from) - add r2, r2, #3 // update from+=3 - strb r3, [out, #2] // 2nd PUP(out) = PUP(from); - ldrb r3, [r2, #0] // 3rd PUP(from); - add out, out, #3 // update out+=3 - strb r3, [out, #0] // 3rd PUP(out) = PUP(from); - bhi copy_direct_from_output // while (len>2); - - // len in r6 can now be 0 1 or 2 - - subs r6,#1 // len--; - ldrb r3, [r2, #1] // PUP(from) - blt do_loop_while // if len<0 back to while loop head - strb r3, [out, #1]! // PUP(out) = PUP(from); - subs r6, #1 // len--; - ldrb r3, [r2, #2] // 2nd PUP(from) - blt do_loop_while // if len<0 back to while loop head - strb r3, [out, #1]! 
// 2nd PUP(out) = PUP(from); - b do_loop_while // back to while loop head - - -invalide_distance_code: - ldr r3, messages+4 // "invalid distance code" -L72: - add r3, pc, r3 - str r3, [strm, #24] // strm->msg = (char *)"invalid distance code"; - mov r3, #27 - str r3, state_mode // state->mode = BAD; - b update_state_and_return // break, restore registers, and return - - -some_from_window: - ldr r3, dist_loc // dist - rsb r6, r1, r6 // len -= op -some_from_window_loop: // do { - ldrb ip, [r2, #1]! // PUP(from); - subs r1, #1 // --op - strb ip, [out, #1]! // PUP(out) = PUP(from); - bne some_from_window_loop // } while(op); - rsb r2, r3, out // from = out - dist; - b finish_copy - -non_very_common_case: - ldr r1, op_loc // restore op in r1 - cmp write, r1 // write vs op - bcs contiguous_in_window // if (write >= op) branch to contiguous_in_window - - /* wrap around window */ - - ldr r2, wsize_loc // wsize - ldr ip, windowm1_loc // window-1 - add r3, write, r2 // r3 = wsize+write - rsb r3, r1, r3 // r3 = wsize+write-op - add r2, ip, r3 // r2 = from = wsize+write-op+window-1; - rsb r1, write, r1 // op -= write; - - cmp r6, r1 // len vs op - bls finish_copy // if (len <= op) branch to finish_copy - rsb r6, r1, r6 // len -= op -waw_loop: // do { - ldrb r3, [r2, #1]! // PUP(from) - subs r1, r1, #1 // --op; - strb r3, [out, #1]! // PUP(out) = PUP(from); - bne waw_loop // } while (op); - - cmp write, r6 // write vs len - ldr r2, windowm1_loc // if (write>=len) r2 = from = window-1; - bcs finish_copy // if (write>=len) branch to finish_copy - - // some from start of window - - mov r1, write // op = write - sub r6, write // len -= op -sow_loop: // do { - ldrb r3,[r2, #1]! // PUP(from) - subs r1, #1 // --op; - strb r3, [out,#1]! 
// PUP(out) = PUP(from); - bne sow_loop // } while (op); - - ldr r2, dist_loc // dist - rsb r2, r2, out // r2 = from = out-dist - b finish_copy // continue to finish_copy - - -contiguous_in_window: - ldr ip, windowm1_loc // window-1 - cmp r6, r1 // len vs op - rsb r3, r1, write // r3 = write-op - add r2, ip, r3 // r2 = from = window+write-op-1 - bls finish_copy // if (len <= op) branch to finish_copy - rsb r6, r1, r6 // len -= op - ldr r3, dist_loc // dist -ciw_loop: - ldrb ip, [r2, #1]! // PUP(from) - subs r1, r1, #1 // op-- - strb ip, [out, #1]! // PUP(out) = PUP(from); - bne ciw_loop // while (--op); - rsb r2, r3, out // from = out - dist; - b finish_copy - -invalid_distance_too_far_back: - ldr r3, messages+8 // "invalid distance too far back" -L42: - add r3, pc, r3 - str r3, [strm, #24] // strm->msg = (char *)"invalid distance too far back"; - mov r3, #27 - str r3, state_mode // state->mode = BAD; - b update_state_and_return // break, restore registers, and return - - .align 2 -messages: - .long LC2-8-(L75) - .long LC1-8-(L72) - .long LC0-8-(L42) - -#endif // defined _ARM_ARCH_6 diff --git a/libkern/zlib/inffast.c b/libkern/zlib/inffast.c index 54f0ee815..8be51094c 100644 --- a/libkern/zlib/inffast.c +++ b/libkern/zlib/inffast.c @@ -31,13 +31,14 @@ */ -#if defined _ARM_ARCH_6 +#if defined __x86_64__ || defined __i386__ || defined _ARM_ARCH_6 - // dummy definition, for armv6 or above, compile code from inffastS.s - typedef char DummyDefinition; + // dummy definition, for x86_64 or i386 or armv6 or up, compile code from inffastS.s + typedef char DummyDefinition; #else // architecture + #include "zutil.h" #include "inftrees.h" #include "inflate.h" @@ -352,4 +353,4 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ #endif /* !ASMINF */ -#endif // architecture +#endif // architecture diff --git a/libkern/zlib/intel/adler32vec.s b/libkern/zlib/intel/adler32vec.s new file mode 100644 index 000000000..df9dcf328 --- /dev/null +++ 
b/libkern/zlib/intel/adler32vec.s @@ -0,0 +1,1050 @@ +/* Apple Copyright 2009 + CoreOS - vector & Numerics, cclee 10-22-09 + + This following source code implements a vectorized version of adler32 computation that is defined in zlib. + The target architectures are x86_64 and i386. + + Given 2 unsigned 32-bit alder and sum2 (both pre-modulo by BASE=65521) and a sequence of input bytes x[0],...x[N-1]. + The adler-sum2 pair is updated according to + + for (i=0;i= NMAX) { +// len -= NMAX; +// n = NMAX / 16; /* NMAX is divisible by 16 */ +// do { +// DO16(buf); /* 16 sums unrolled */ +// buf += 16; +// } while (--n); +// MOD(adler); +// MOD(sum2); +// } +// if (len) { /* avoid modulos if none remaining */ +// while (len >= 16) { +// len -= 16; +// DO16(buf); +// buf += 16; +// } +// while (len--) { +// adler += *buf++; +// sum2 += adler; +// } +// MOD(adler); +// MOD(sum2); +// } +// return adler | (sum2 << 16); +// } + +#if (defined __i386__ || defined __x86_64__) + +#include + + .text + .align 4,0x90 +.globl _adler32_vec +_adler32_vec: + +#if (defined __i386__) + + pushl %ebp + movl %esp, %ebp + + pushl %ebx + pushl %edi + pushl %esi + +#ifdef KERNEL // if this is for kernel, need to save xmm registers + subl $140, %esp // to save %xmm0-%xmm7 into stack, extra 12 to align %esp to 16-byte boundary + movaps %xmm0, 0(%esp) // save xmm0, offset -12 for ebx/edi/esi + movaps %xmm1, 16(%esp) // save xmm1 + movaps %xmm2, 32(%esp) // save xmm2 + movaps %xmm3, 48(%esp) // save xmm3 + movaps %xmm4, 64(%esp) // save xmm4 + movaps %xmm5, 80(%esp) // save xmm5 + movaps %xmm6, 96(%esp) // save xmm6 + movaps %xmm7, 112(%esp) // save xmm7, if this is for SSSE3 or above +#endif + + #define adler %edi // 8(%ebp) + #define sum2 %esi // 12(%ebp) + #define buf %ecx // 16(%ebp) + #define len %ebx // 20(%ebp) + #define zero %xmm0 + #define ones %xmm5 + + movl 8(%ebp), adler + movl 12(%ebp), sum2 + movl 16(%ebp), buf // use ecx as buf pointer + movl 20(%ebp), len + + .macro modulo_BASE + movl 
$$-2146992015, %eax // 1/BASE in Q47 + mull adler // edx:eax = adler divided by BASE in Q47 + shrl $$15, %edx // edx is now the floor integer of adler and BASE + imull $$BASE, %edx, %edx // edx * BASE + subl %edx, adler // adler -= edx*BASE + movl $$-2146992015, %eax // 1/BASE in Q47 + mull sum2 // edx:eax = sum2 divided by BASE in Q47 + shrl $$15, %edx // edx is now the floor integer of sum2 and BASE + imull $$BASE, %edx, %eax // eax = edx * BASE + subl %eax, sum2 // sum2 -= sdx*BASE + .endmacro + + // update adler/sum2 according to a new 16-byte vector + .macro DO16 + movaps (buf), %xmm1 // 16 bytes vector, in xmm1 + movaps %xmm1, %xmm3 // a copy of the vector, used for unsigned byte in the destination of pmaddubsw + addl $$16, buf // buf -> next vector + psadbw zero, %xmm1 // 2 16-bit words to be added for adler in xmm1 + pmaddubsw %xmm4, %xmm3 // 8 16-bit words to be added for sum2 in xmm3 + imull $$16, adler, %edx // edx = 16*adler; + movhlps %xmm1, %xmm2 // higher 16-bit word (for adler) in xmm2 + pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3 + paddq %xmm2, %xmm1 // xmm1 lower 32-bit to be added to adler + addl %edx, sum2 // sum2 += adler*16; + movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements + movd %xmm1, %edx // to be added to adler + paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2 + addl %edx, adler // update adler + movd %xmm3, %edx // to be added to sum2 + psrlq $$32, %xmm3 // another 32-bit to be added to sum2 + addl %edx, sum2 // sum2 += 1st half of update + movd %xmm3, %edx // to be added to sum2 + addl %edx, sum2 // sum2 += 2nd half of update + .endm + + // update adler/sum2 according to a new 32-byte vector + .macro DO32 + imull $$32, adler, %edx // edx = 32*adler + movaps (buf), %xmm1 // 1st 16 bytes vector + movaps 16(buf), %xmm7 // 2nd 16 bytes vector + movaps %xmm1, %xmm3 // a copy of 1st vector, used for unsigned byte in the destination of pmaddubsw + 
movaps %xmm7, %xmm2 // a copy of 2nd vector, used for unsigned byte in the destination of pmaddubsw + psadbw zero, %xmm1 // 2 16-bit words to be added for adler in xmm1 + psadbw zero, %xmm7 // 2 16-bit words to be added for adler in xmm7 + addl %edx, sum2 // sum2 += adler*32; + pmaddubsw %xmm6, %xmm3 // 8 16-bit words to be added for sum2 in xmm3 + pmaddubsw %xmm4, %xmm2 // 8 16-bit words to be added for sum2 in xmm2 + paddd %xmm7, %xmm1 // 2 16-bit words to be added for adler in xmm1 + paddd %xmm2, %xmm3 // 8 16-bit words to be added for sum2 in xmm3 + addl $$32, buf // buf -> vector for next iteration + movhlps %xmm1, %xmm2 // higher 16-bit word (for adler) in xmm2 + pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3 + paddq %xmm2, %xmm1 // xmm1 lower 32-bit to be added to adler + movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements + movd %xmm1, %edx // to be added to adler + paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2 + addl %edx, adler // update adler + movd %xmm3, %edx // to be added to sum2 + psrlq $$32, %xmm3 // another 32-bit to be added to sum2 + addl %edx, sum2 // sum2 += 1st half of update + movd %xmm3, %edx // to be added to sum2 + addl %edx, sum2 // sum2 += 2nd half of update + .endm + + // this defines the macro DO16 for SSSE3 not supported + .macro DO16_nossse3 + movaps (buf), %xmm1 // 16 bytes vector + movaps %xmm1, %xmm3 // a copy of the vector, the lower 8 bytes to be shuffled into 8 words + movaps %xmm1, %xmm2 // a copy of the vector, the higher 8 bytes to be shuffled into 8 words + psrldq $$8, %xmm2 // shift down 8 bytes, to reuse the shuffle vector + punpcklbw zero, %xmm3 // convert lower 8 bytes into 8 words + punpcklbw zero, %xmm2 // convert higher 8 bytes into 8 words + pmullw %xmm6, %xmm3 // lower 8 words * 16:9 + pmullw %xmm4, %xmm2 // higher 8 words * 8:1 + addl $$16, buf // buf -> next vector + psadbw zero, %xmm1 // 2 16-bit words to be added for 
adler in xmm1 + paddw %xmm2, %xmm3 // 8 16-bit words to be added for sum2 in xmm3 + imull $$16, adler, %edx // edx = 16*adler; + movhlps %xmm1, %xmm2 // higher 16-bit word (for adler) in xmm2 + pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3 + paddq %xmm2, %xmm1 // xmm1 lower 32-bit to be added to adler + addl %edx, sum2 // sum2 += adler*16; + movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements + movd %xmm1, %edx // to be added to adler + paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2 + addl %edx, adler // update adler + movd %xmm3, %edx // to be added to sum2 + psrlq $$32, %xmm3 // another 32-bit to be added to sum2 + addl %edx, sum2 // sum2 += 1st half of update + movd %xmm3, %edx // to be added to sum2 + addl %edx, sum2 // sum2 += 2nd half of update + .endm + +#ifdef KERNEL + leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities + mov (%eax), %eax // %eax = __cpu_capabilities +#else + mov _COMM_PAGE_CPU_CAPABILITIES, %eax +#endif + test $(kHasSupplementalSSE3), %eax // __cpu_capabilities & kHasAES + je L_no_ssse3 + + // i386 adler32 with ssse3 + + // need to fill up xmm4/xmm5/xmm6 only if len>=16 + cmpl $16, len + jl L_skip_loading_tables + + // set up table starting address to %eax + leal sum2_coefficients, %eax + + // reading coefficients + pxor zero, zero + movaps (%eax), %xmm6 // coefficients for computing sum2 : pmaddubsw 32:17 + movaps 16(%eax), %xmm4 // coefficients for computing sum2 : pmaddubsw 16:1 + movaps 32(%eax), ones // coefficients for computing sum2 : pmaddwd 1,1,...,1 + +L_skip_loading_tables: + + cmpl $NMAX, len // len vs NMAX + jl len_lessthan_NMAX // if (len < NMAX), skip the following NMAX batches processing + +len_ge_NMAX_loop: // while (len>=NMAX) { + + subl $NMAX, len // len -= NMAX + movl $(NMAX/32), %eax // n = NMAX/32 + +n_loop: // do { + DO32 // update adler/sum2 for a 32-byte input + decl %eax // n--; + jg n_loop // } while (n); + DO16 // 
update adler/sum2 for a 16-byte input + modulo_BASE // (adler/sum2) modulo BASE; + cmpl $NMAX, len // + jge len_ge_NMAX_loop // } /* len>=NMAX */ + +len_lessthan_NMAX: + + subl $32, len // pre-decrement len by 32 + jl len_lessthan_32 // if len < 32, skip the 32-vector code +len32_loop: // while (len>=32) { + DO32 // update adler/sum2 for a 32-byte input + subl $32, len // len -= 32; + jge len32_loop // } + +len_lessthan_32: + + addl $(32-16), len // post-increment by 32 + pre-decrement by 16 on len + jl L_len_lessthan_16 // if len < 16, skip the 16-vector code + DO16 // update adler/sum2 for a 16-byte input + subl $16, len // len -= 16; + +L_len_lessthan_16: + addl $16, len // post-increment len by 16 + jz len_is_zero // if len==0, branch over scalar processing + +0: // while (len) { + movzbl (buf), %edx // new input byte + incl buf // buf++ + addl %edx, adler // adler += *buf + addl adler, sum2 // sum2 += adler + subl $1, len // len-- + jg 0b // } + +len_is_zero: + + modulo_BASE // (adler/sum2) modulo BASE; + + // construct 32-bit (sum2<<16 | adler) to be returned + + sall $16, sum2 // sum2 <<16 + movl adler, %eax // adler + orl sum2, %eax // sum2<<16 | adler + + +#ifdef KERNEL // if this is for kernel code, need to restore xmm registers + movaps (%esp), %xmm0 // restore xmm0, offset -12 for ebx/edi/esi + movaps 16(%esp), %xmm1 // restore xmm1 + movaps 32(%esp), %xmm2 // restore xmm2 + movaps 48(%esp), %xmm3 // restore xmm3 + movaps 64(%esp), %xmm4 // restore xmm4 + movaps 80(%esp), %xmm5 // restore xmm5 + movaps 96(%esp), %xmm6 // restore xmm6 + movaps 112(%esp), %xmm7 // restore xmm7, if this is for SSSE3 or above + addl $140, %esp // we've already restored %xmm0-%xmm7 from stack +#endif + + popl %esi + popl %edi + popl %ebx + leave // pop ebp out from stack + ret + + +L_no_ssse3: + + // i386 adler32 without ssse3 + + // need to fill up xmm4/xmm5/xmm6 only if len>=16 + cmpl $16, len + jl 2f + + // set up table starting address to %eax + leal sum2_coefficients, 
%eax + + // reading coefficients + pxor zero, zero + movaps 48(%eax), %xmm6 // coefficients for computing sum2 : pmaddubsw 16:9 + movaps 64(%eax), %xmm4 // coefficients for computing sum2 : pmaddubsw 8:1 + movaps 80(%eax), ones // coefficients for computing sum2 : pmaddwd 1,1,...,1 + +2: + + cmpl $NMAX, len // len vs NMAX + jl 3f // if (len < NMAX), skip the following NMAX batches processing + +0: // while (len>=NMAX) { + + subl $NMAX, len // len -= NMAX + movl $(NMAX/16), %eax // n = NMAX/16 + +1: // do { + DO16_nossse3 // update adler/sum2 for a 16-byte input + decl %eax // n--; + jg 1b // } while (n); + + modulo_BASE // (adler/sum2) modulo BASE; + + cmpl $NMAX, len // + jge 0b // } /* len>=NMAX */ + +3: + + subl $16, len // pre-decrement len by 16 + jl L_len_lessthan_16 // if len < 16, skip the 16-vector code + DO16_nossse3 // update adler/sum2 for a 16-byte input + subl $16, len // len -= 16; + jmp L_len_lessthan_16 + + + .const + .align 4 +sum2_coefficients: // used for vectorizing adler32 computation + + .byte 32 + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 15 + .byte 14 + .byte 13 + .byte 12 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 7 + .byte 6 + .byte 5 + .byte 4 + .byte 3 + .byte 2 + .byte 1 + + // coefficients for pmaddwd, to combine into 4 32-bit elements for sum2 + .word 1 + .word 1 + .word 1 + .word 1 + .word 1 + .word 1 + .word 1 + .word 1 + + + // data for without ssse3 + + .word 16 + .word 15 + .word 14 + .word 13 + .word 12 + .word 11 + .word 10 + .word 9 + .word 8 + .word 7 + .word 6 + .word 5 + .word 4 + .word 3 + .word 2 + .word 1 + + // coefficients for pmaddwd, to combine into 4 32-bit elements for sum2 + .word 1 + .word 1 + .word 1 + .word 1 + .word 1 + .word 1 + .word 1 + .word 1 + +#else // (defined __x86_64__) + + movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities + mov 
(%rax), %eax // %eax = __cpu_capabilities + test $(kHasSupplementalSSE3), %eax // __cpu_capabilities & kHasSupplementalSSE3 + jne L_has_ssse3 + + // ---------------------------------------------------------------------------------- + // the following is added for x86_64 without SSSE3 support + // it is essentially a translated copy of the i386 code without SSSE3 code + // ---------------------------------------------------------------------------------- + + // input : + // adler : rdi + // sum2 : rsi + // buf : rdx + // len : rcx + + pushq %rbp + movq %rsp, %rbp + pushq %rbx + +#ifdef KERNEL // if for kernel, save %xmm0-%xmm6 (the only xmm registers this non-SSSE3 path uses) + subq $200, %rsp // same 200-byte frame as the SSSE3 path (192 bytes sized for %xmm0-%xmm11, extra 8 to align %rsp to 16-byte boundary); only the %xmm0-%xmm6 slots are used here + movaps %xmm0, -32(%rbp) + movaps %xmm1, -48(%rbp) + movaps %xmm2, -64(%rbp) + movaps %xmm3, -80(%rbp) + movaps %xmm4, -96(%rbp) + movaps %xmm5, -112(%rbp) + movaps %xmm6, -128(%rbp) +#endif + + #define adler %rdi // input adler, used in place in %rdi + #define sum2 %rsi // input sum2, used in place in %rsi + #define buf %rcx // input buf arrives in %rdx and is moved to %rcx below + #define len %rbx // input len arrives in %rcx and is moved to %rbx below + #define zero %xmm0 + #define ones %xmm5 + + movq %rcx, len // copy len out of %rcx first, the next instruction overwrites %rcx + movq %rdx, buf + + .macro modulo_BASE + movl $$-2146992015, %eax // 1/BASE in Q47 + mull %edi // edx:eax = adler divided by BASE in Q47 + shrl $$15, %edx // edx = floor(adler / BASE): the high product word shifted 15 more bits is the product >> 47 + imull $$BASE, %edx, %edx // edx * BASE + subq %rdx, adler // adler -= edx*BASE + movl $$-2146992015, %eax // 1/BASE in Q47 + mull %esi // edx:eax = sum2 divided by BASE in Q47 + shrl $$15, %edx // edx = floor(sum2 / BASE) + imull $$BASE, %edx, %eax // eax = edx * BASE + subq %rax, sum2 // sum2 -= eax (= edx*BASE) + .endmacro + + // update adler/sum2 according to a new 16-byte vector, no ssse3 + .macro DO16_nossse3 + movaps (buf), %xmm1 // 16 bytes vector + movaps %xmm1, %xmm3 // a copy of the vector, the lower 8 bytes to be shuffled into 8 words + movaps %xmm1, %xmm2 // a copy of the vector, the higher 8 bytes to be shuffled into 8 words + psrldq $$8,
%xmm2 // shift down 8 bytes, so the higher 8 bytes can be unpacked with the same punpcklbw + punpcklbw zero, %xmm3 // convert lower 8 bytes into 8 words + punpcklbw zero, %xmm2 // convert higher 8 bytes into 8 words + pmullw %xmm6, %xmm3 // lower 8 words * 16:9 + pmullw %xmm4, %xmm2 // higher 8 words * 8:1 + add $$16, buf // buf -> next vector + psadbw zero, %xmm1 // 2 16-bit words to be added for adler in xmm1 + paddw %xmm2, %xmm3 // 8 16-bit words to be added for sum2 in xmm3 + imulq $$16, adler, %rdx // rdx = 16*adler; + movhlps %xmm1, %xmm2 // higher 16-bit word (for adler) in xmm2 + pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3 + paddq %xmm2, %xmm1 // xmm1 lower 32-bit to be added to adler + add %rdx, sum2 // sum2 += adler*16; + movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements + movd %xmm1, %edx // to be added to adler + paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2 + addq %rdx, adler // update adler + movd %xmm3, %edx // to be added to sum2 + psrlq $$32, %xmm3 // another 32-bit to be added to sum2 + addq %rdx, sum2 // sum2 += 1st half of update + movd %xmm3, %edx // to be added to sum2 + addq %rdx, sum2 // sum2 += 2nd half of update + .endm + + // need to fill up xmm4/xmm5/xmm6 (and zero xmm0) only if len>=16 + cmpq $16, len + jl 0f + + // set up table starting address to %rax + leaq sum2_coefficients_nossse3(%rip), %rax + + // reading coefficients + pxor zero, zero + movaps (%rax), %xmm6 // coefficients for computing sum2 : pmullw 16:9 + movaps 16(%rax), %xmm4 // coefficients for computing sum2 : pmullw 8:1 + movaps 32(%rax), ones // coefficients for computing sum2 : pmaddwd 1,1,...,1 +0: + + cmp $NMAX, len // len vs NMAX + jl 3f // if (len < NMAX), skip the following NMAX batches processing + +0: // while (len>=NMAX) { + + sub $NMAX, len // len -= NMAX + mov $(NMAX/16), %eax // n = NMAX/16 + +1: // do { + DO16_nossse3 // update adler/sum2 for a 16-byte input + decl %eax // n--; + jg 1b // } while (n > 0); +
modulo_BASE // (adler/sum2) modulo BASE; + + cmp $NMAX, len // len vs NMAX + jge 0b // } /* len>=NMAX */ + +3: + + sub $16, len // pre-decrement len by 16 + jl 2f // if len < 16, skip the 16-vector code + DO16_nossse3 // update adler/sum2 for a 16-byte input + sub $16, len // len -= 16; NOTE(review): there is no branch back to DO16_nossse3 here, so at most one 16-byte step runs and any remaining bytes go through the scalar loop below - confirm this is intended + +2: + add $16, len // post-increment len by 16 + jz 1f // if len==0, branch over scalar processing + +0: // while (len) { + movzbq (buf), %rdx // new input byte + incq buf // buf++ + addq %rdx, adler // adler += *buf + addq adler, sum2 // sum2 += adler + decq len // len-- + jg 0b // } + +1: + + modulo_BASE // (adler/sum2) modulo BASE; + + // construct 32-bit (sum2<<16 | adler) to be returned + + salq $16, sum2 // sum2 <<16 + movq adler, %rax // adler + orq sum2, %rax // sum2<<16 | adler + +#ifdef KERNEL // if this is for kernel code, need to restore xmm registers + movaps -32(%rbp), %xmm0 + movaps -48(%rbp), %xmm1 + movaps -64(%rbp), %xmm2 + movaps -80(%rbp), %xmm3 + movaps -96(%rbp), %xmm4 + movaps -112(%rbp), %xmm5 + movaps -128(%rbp), %xmm6 + addq $200, %rsp // release the 200-byte save area; %xmm0-%xmm6 have been restored above +#endif + + popq %rbx + leave // restore %rsp from %rbp and pop %rbp + ret + + + + .const + .align 4 // presumably 2^4 = 16-byte alignment (as required by the movaps loads of this table) - TODO confirm assembler semantics +sum2_coefficients_nossse3: // used for vectorizing adler32 computation + + // coefficients for the non-SSSE3 path : pmullw weights 16:9 (loaded into xmm6) followed by 8:1 (loaded into xmm4) + + .word 16 + .word 15 + .word 14 + .word 13 + .word 12 + .word 11 + .word 10 + .word 9 + .word 8 + .word 7 + .word 6 + .word 5 + .word 4 + .word 3 + .word 2 + .word 1 + + // coefficients for pmaddwd, to combine into 4 32-bit elements for sum2 + .word 1 + .word 1 + .word 1 + .word 1 + .word 1 + .word 1 + .word 1 + .word 1 + + + .text + + // ---------------------------------------------------------------------------------- + // the following is the original x86_64 adler32_vec code that uses SSSE3 instructions + // ---------------------------------------------------------------------------------- + +L_has_ssse3: + + // input : + // adler : rdi + // sum2 : rsi + // buf : rdx + // len : rcx + + pushq %rbp + movq %rsp, %rbp +
pushq %rbx + +#ifdef KERNEL // if for kernel, save %xmm0-%xmm11 + subq $200, %rsp // allocate for %xmm0-%xmm11 (192 bytes), extra 8 to align %rsp to 16-byte boundary + movaps %xmm0, -32(%rbp) + movaps %xmm1, -48(%rbp) + movaps %xmm2, -64(%rbp) + movaps %xmm3, -80(%rbp) + movaps %xmm4, -96(%rbp) + movaps %xmm5, -112(%rbp) + movaps %xmm6, -128(%rbp) + movaps %xmm7, -144(%rbp) + movaps %xmm8, -160(%rbp) + movaps %xmm9, -176(%rbp) + movaps %xmm10, -192(%rbp) + movaps %xmm11, -208(%rbp) +#endif + + #define adler %rdi // input adler, used in place in %rdi + #define sum2 %rsi // input sum2, used in place in %rsi + #define buf %rcx // input buf arrives in %rdx and is moved to %rcx below + #define len %rbx // input len arrives in %rcx and is moved to %rbx below + #define zero %xmm0 + #define ones %xmm5 + + movq %rcx, len // copy len out of %rcx first, the next instruction overwrites %rcx + movq %rdx, buf + + // update adler/sum2 according to a new 16-byte vector + .macro DO16 + movaps (buf), %xmm1 // 16 bytes vector + movaps %xmm1, %xmm3 // a copy of the vector, used for unsigned byte in the destination of pmaddubsw + addq $$16, buf // buf -> next vector + psadbw zero, %xmm1 // 2 16-bit words to be added for adler in xmm1 + pmaddubsw %xmm4, %xmm3 // 8 16-bit words to be added for sum2 in xmm3 + imulq $$16, adler, %rdx // rdx = 16*adler; + movhlps %xmm1, %xmm2 // higher 16-bit word (for adler) in xmm2 + pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3 + paddq %xmm2, %xmm1 // xmm1 lower 32-bit to be added to adler + addq %rdx, sum2 // sum2 += adler*16; + movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements + movd %xmm1, %edx // to be added to adler + paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2 + addq %rdx, adler // update adler + movd %xmm3, %edx // to be added to sum2 + psrlq $$32, %xmm3 // another 32-bit to be added to sum2 + addq %rdx, sum2 // sum2 += 1st half of update + movd %xmm3, %edx // to be added to sum2 + addq %rdx, sum2 // sum2 += 2nd half of update + .endm + + // update adler/sum2 according to a new 32-byte vector + .macro DO32 + imulq $$32, adler, %rdx // rdx = 32*adler +
movaps (buf), %xmm1 // 1st 16 bytes vector + movaps 16(buf), %xmm7 // 2nd 16 bytes vector + movaps %xmm1, %xmm3 // a copy of 1st vector, used for unsigned byte in the destination of pmaddubsw + movaps %xmm7, %xmm2 // a copy of 2nd vector, used for unsigned byte in the destination of pmaddubsw + psadbw zero, %xmm1 // 2 16-bit words to be added for adler in xmm1 + psadbw zero, %xmm7 // 2 16-bit words to be added for adler in xmm7 + addq %rdx, sum2 // sum2 += adler*32; + pmaddubsw %xmm6, %xmm3 // 8 16-bit words to be added for sum2 in xmm3 + pmaddubsw %xmm4, %xmm2 // 8 16-bit words to be added for sum2 in xmm2 + paddd %xmm7, %xmm1 // 2 16-bit words to be added for adler in xmm1 + paddw %xmm2, %xmm3 // 8 16-bit words to be added for sum2 in xmm3 + addq $$32, buf // buf -> vector for next iteration + movhlps %xmm1, %xmm2 // higher 16-bit word (for adler) in xmm2 + pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3 + paddq %xmm2, %xmm1 // xmm1 lower 32-bit to be added to adler + movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements + movd %xmm1, %edx // to be added to adler + paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2 + addq %rdx, adler // update adler + movd %xmm3, %edx // to be added to sum2 + psrlq $$32, %xmm3 // another 32-bit to be added to sum2 + addq %rdx, sum2 // sum2 += 1st half of update + movd %xmm3, %edx // to be added to sum2 + addq %rdx, sum2 // sum2 += 2nd half of update + .endm + + // update adler/sum2 according to a new 48-byte vector + + .macro DO48 + imulq $$48, adler, %rdx // rdx = 48*adler + + movaps (buf), %xmm7 // 1st 16 bytes vector + movaps 16(buf), %xmm10 // 2nd 16 bytes vector + movaps 32(buf), %xmm11 // 3rd 16 bytes vector + + movaps %xmm7, %xmm1 // 1st vector + movaps %xmm10, %xmm2 // 2nd vector + movaps %xmm11, %xmm3 // 3rd vector + + psadbw zero, %xmm7 // 1st vector for adler + psadbw zero, %xmm10 // 2nd vector for adler + psadbw zero, %xmm11 // 3rd
vector for adler + + addq %rdx, sum2 // sum2 += adler*48; + + pmaddubsw %xmm9, %xmm1 // 8 16-bit words to be added for sum2 : 1st vector + pmaddubsw %xmm6, %xmm2 // 8 16-bit words to be added for sum2 : 2nd vector + pmaddubsw %xmm4, %xmm3 // 8 16-bit words to be added for sum2 : 3rd vector + + pmaddwd ones, %xmm1 // 4 32-bit elements to be added for sum2 in xmm1 + pmaddwd ones, %xmm2 // 4 32-bit elements to be added for sum2 in xmm2 + pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3 + + paddd %xmm10, %xmm7 // 2 16-bit words to be added for adler + paddd %xmm11, %xmm7 // 2 16-bit words to be added for adler + + paddd %xmm1, %xmm3 // 4 32-bit elements to be added for sum2 + paddd %xmm2, %xmm3 // 4 32-bit elements to be added for sum2 + + addq $$48, buf // buf -> vector for next iteration + + movhlps %xmm7, %xmm2 // higher 16-bit word (for adler) in xmm2 + paddq %xmm2, %xmm7 // xmm7 lower 32-bit to be added to adler + + movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements + movd %xmm7, %edx // to be added to adler + paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2 + addq %rdx, adler // update adler + movd %xmm3, %edx // to be added to sum2 + psrlq $$32, %xmm3 // another 32-bit to be added to sum2 + addq %rdx, sum2 // sum2 += 1st half of update + movd %xmm3, %edx // to be added to sum2 + addq %rdx, sum2 // sum2 += 2nd half of update + .endm + + // update adler/sum2 according to a new 64-byte vector + .macro DO64 + imulq $$64, adler, %rdx // rdx = 64*adler + + movaps (buf), %xmm1 // 1st 16 bytes vector + movaps 16(buf), %xmm7 // 2nd 16 bytes vector + movaps 32(buf), %xmm10 // 3rd 16 bytes vector + movaps 48(buf), %xmm11 // 4th 16 bytes vector + + movaps %xmm1, %xmm3 // 1st vector + movaps %xmm11, %xmm2 // 4th vector + psadbw zero, %xmm1 // 1st vector for adler + psadbw zero, %xmm11 // 4th vector for adler + + addq %rdx, sum2 // sum2 += adler*64; + + pmaddubsw %xmm8, %xmm3 // 8 16-bit
words to be added for sum2 : 1st vector + pmaddubsw %xmm4, %xmm2 // 8 16-bit words to be added for sum2 : 4th vector + pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3 + pmaddwd ones, %xmm2 // 4 32-bit elements to be added for sum2 in xmm2 + + paddd %xmm11, %xmm1 // 2 16-bit words to be added for adler in xmm1 + paddd %xmm2, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3 + + movaps %xmm7, %xmm2 // 2nd vector + movaps %xmm10, %xmm11 // 3rd vector + + psadbw zero, %xmm7 // 2nd vector for adler + psadbw zero, %xmm10 // 3rd vector for adler + + pmaddubsw %xmm9, %xmm2 // 8 16-bit words to be added for sum2 : 2nd vector + pmaddubsw %xmm6, %xmm11 // 8 16-bit words to be added for sum2 : 3rd vector + pmaddwd ones, %xmm2 // 4 32-bit elements to be added for sum2 in xmm2 + pmaddwd ones, %xmm11 // 4 32-bit elements to be added for sum2 in xmm11 + + paddd %xmm7, %xmm1 // 2 16-bit words to be added for adler in xmm1 + paddd %xmm10, %xmm1 // 2 16-bit words to be added for adler in xmm1 + + paddd %xmm2, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3 + paddd %xmm11, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3 + + addq $$64, buf // buf -> vector for next iteration + + movhlps %xmm1, %xmm2 // higher 16-bit word (for adler) in xmm2 + paddq %xmm2, %xmm1 // xmm1 lower 32-bit to be added to adler + movhlps %xmm3, %xmm2 // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements + movd %xmm1, %edx // to be added to adler + paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2 + addq %rdx, adler // update adler + movd %xmm3, %edx // to be added to sum2 + psrlq $$32, %xmm3 // another 32-bit to be added to sum2 + addq %rdx, sum2 // sum2 += 1st half of update + movd %xmm3, %edx // to be added to sum2 + addq %rdx, sum2 // sum2 += 2nd half of update + .endm + + // need to fill up xmm4/xmm5/xmm6/xmm8/xmm9 (and zero xmm0) only if len>=16 + cmpq $16, len + jl skip_loading_tables + + // set up table starting address to %rax + leaq
sum2_coefficients(%rip), %rax + + // reading coefficients + pxor zero, zero + movaps (%rax), %xmm8 // coefficients for computing sum2 : pmaddubsw 64:49 + movaps 16(%rax), %xmm9 // coefficients for computing sum2 : pmaddubsw 48:33 + movaps 32(%rax), %xmm6 // coefficients for computing sum2 : pmaddubsw 32:17 + movaps 48(%rax), %xmm4 // coefficients for computing sum2 : pmaddubsw 16:1 + movaps 64(%rax), ones // coefficients for computing sum2 : pmaddwd 1,1,...,1 + +skip_loading_tables: + + + cmpq $NMAX, len // len vs NMAX + jl len_lessthan_NMAX // if (len < NMAX), skip the following NMAX batches processing + +len_ge_NMAX_loop: // while (len>=NMAX) { + + subq $NMAX, len // len -= NMAX + movq $(NMAX/64), %rax // n = NMAX/64 + +n_loop: // do { + DO64 // update adler/sum2 for a 64-byte input + decq %rax // n--; + jg n_loop // } while (n > 0); + + DO48 // update adler/sum2 for a 48-byte input (presumably the NMAX % 64 bytes left in this batch; NMAX is defined outside this view - TODO confirm) + + modulo_BASE // (adler/sum2) modulo BASE; + + cmpq $NMAX, len // len vs NMAX + jge len_ge_NMAX_loop // } /* len>=NMAX */ + +len_lessthan_NMAX: + + subq $64, len // pre-decrement len by 64 + jl len_lessthan_64 // if len < 64, skip the 64-vector code +len64_loop: // while (len>=64) { + DO64 // update adler/sum2 for a 64-byte input + subq $64, len // len -= 64; + jge len64_loop // } + +len_lessthan_64: + addq $(64-32), len // post-increment 64 + pre-decrement 32 of len + jl len_lessthan_32 // if len < 32, skip the 32-vector code + DO32 // update adler/sum2 for a 32-byte input + subq $32, len // len -= 32; + +len_lessthan_32: + + addq $(32-16), len // post-increment by 32 + pre-decrement by 16 on len + jl len_lessthan_16 // if len < 16, skip the 16-vector code + DO16 // update adler/sum2 for a 16-byte input + subq $16, len // len -= 16; + +len_lessthan_16: + addq $16, len // post-increment len by 16 + jz len_is_zero // if len==0, branch over scalar processing + +scalar_loop: // while (len) { + movzbq (buf), %rdx // new input byte + incq buf // buf++ + addq %rdx, adler // adler += *buf + addq adler,
sum2 // sum2 += adler + decq len // len-- + jg scalar_loop // } + +len_is_zero: + + modulo_BASE // (adler/sum2) modulo BASE; + + // construct 32-bit (sum2<<16 | adler) to be returned + + salq $16, sum2 // sum2 <<16 + movq adler, %rax // adler + orq sum2, %rax // sum2<<16 | adler + + +#ifdef KERNEL // if for kernel, restore %xmm0-%xmm11 + movaps -32(%rbp), %xmm0 + movaps -48(%rbp), %xmm1 + movaps -64(%rbp), %xmm2 + movaps -80(%rbp), %xmm3 + movaps -96(%rbp), %xmm4 + movaps -112(%rbp), %xmm5 + movaps -128(%rbp), %xmm6 + movaps -144(%rbp), %xmm7 + movaps -160(%rbp), %xmm8 + movaps -176(%rbp), %xmm9 + movaps -192(%rbp), %xmm10 + movaps -208(%rbp), %xmm11 + addq $200, %rsp // release the 200-byte save area; %xmm0-%xmm11 have been restored above +#endif + + popq %rbx + leave // restore %rsp from %rbp and pop %rbp + ret + + + .const + .align 4 // presumably 2^4 = 16-byte alignment (as required by the movaps loads of this table) - TODO confirm assembler semantics +sum2_coefficients: // used for vectorizing adler32 computation + + // coefficients for pmaddubsw instruction, used to generate 16-bit elements for sum2 + + .byte 64 + .byte 63 + .byte 62 + .byte 61 + .byte 60 + .byte 59 + .byte 58 + .byte 57 + .byte 56 + .byte 55 + .byte 54 + .byte 53 + .byte 52 + .byte 51 + .byte 50 + .byte 49 + .byte 48 + .byte 47 + .byte 46 + .byte 45 + .byte 44 + .byte 43 + .byte 42 + .byte 41 + .byte 40 + .byte 39 + .byte 38 + .byte 37 + .byte 36 + .byte 35 + .byte 34 + .byte 33 + .byte 32 + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 15 + .byte 14 + .byte 13 + .byte 12 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 7 + .byte 6 + .byte 5 + .byte 4 + .byte 3 + .byte 2 + .byte 1 + + // coefficients for pmaddwd, to combine into 4 32-bit elements for sum2 + .word 1 + .word 1 + .word 1 + .word 1 + .word 1 + .word 1 + .word 1 + .word 1 + +#endif // (defined __i386__) + +#endif // (defined __i386__ || defined __x86_64__) diff --git a/libkern/zlib/intel/inffastS.s b/libkern/zlib/intel/inffastS.s new file mode
100644 index 000000000..4252121bf --- /dev/null +++ b/libkern/zlib/intel/inffastS.s @@ -0,0 +1,1179 @@ +#if (defined __i386__) + +/* this assembly was 1st compiled from inffast.c (assuming POSTINC defined, OFF=0) and then hand optimized */ + + .cstring +LC0: + .ascii "invalid distance too far back\0" +LC1: + .ascii "invalid distance code\0" +LC2: + .ascii "invalid literal/length code\0" + .text + .align 4,0x90 + + +#ifdef INFLATE_STRICT + .byte 0 + .byte 0 + .byte 0 + .byte 0 + .byte 0 + .byte 0 + .byte 0 + .byte 0 + .byte 0 + .byte 0 +#endif +.globl _inflate_fast +_inflate_fast: + + // set up ebp to refer to arguments strm (at 8(%ebp)) and start (at 12(%ebp)) + pushl %ebp + movl %esp, %ebp + + // push edi/esi/ebx into stack + pushl %edi + pushl %esi + pushl %ebx + + // allocate for local variables 92-12=80, + 12 to align %esp to 16-byte boundary + subl $92, %esp + movl 8(%ebp), %ebx // strm = 1st argument + + /* definitions to help code readability */ + + #define bits %edi // bits = state->bits; kept in a register + #define strm %ebx // strm, kept in a register + #define state 28(strm) // state = (struct inflate_state FAR *)strm->state; + #define in -84(%ebp) // in = strm->next_in - OFF; OFF=0 + #define last -80(%ebp) // last = in + (strm->avail_in - 5); + #define out -28(%ebp) // out = strm->next_out - OFF; + #define beg -76(%ebp) // beg = out - (start - strm->avail_out); + #define end -72(%ebp) // end = out + (strm->avail_out - 257); + #define wsize -68(%ebp) // wsize = state->wsize; + #define whave -64(%ebp) // whave = state->whave; + #define write -60(%ebp) // write = state->write; + #define window -56(%ebp) // window = state->window; + #define hold -52(%ebp) // hold = state->hold; + #define lcode -48(%ebp) // lcode = state->lencode; + #define dcode -44(%ebp) // dcode = state->distcode; + #define lmask -40(%ebp) // lmask = (1U << state->lenbits) - 1; + #define dmask -36(%ebp) // dmask = (1U << state->distbits) - 1; + #define len -32(%ebp) // len, the match length being copied + #define dmax -20(%ebp) // dmax = state->dmax; only loaded when INFLATE_STRICT is defined + #define dist -16(%ebp) // dist + #define write_wsize -24(%ebp) // write+wsize + #define write_1 -88(%ebp) //
write-1 + #define op -92(%ebp) // op + + movl (strm), %eax // strm->next_in + movl %eax, in // in = strm->next_in - OFF; OFF=0 + + subl $5, %eax // in - 5; + movl 4(strm), %ecx // strm->avail_in + addl %ecx, %eax // in + (strm->avail_in - 5); + movl %eax, last // last = in + (strm->avail_in - 5); + + movl 12(strm), %esi // strm->next_out + movl %esi, out // out = strm->next_out - OFF; + + movl 16(strm), %ecx // strm->avail_out + movl %esi, %eax // out + subl 12(%ebp), %eax // out - start + addl %ecx, %eax // out - (start - strm->avail_out); + movl %eax, beg // beg = out - (start - strm->avail_out); + + leal -257(%esi,%ecx), %ecx // out + (strm->avail_out - 257); + movl %ecx, end // end = out + (strm->avail_out - 257); + + movl state, %edx // edx = strm->state, base for the field loads below + +#ifdef INFLATE_STRICT + movl 20(%edx), %ecx // state->dmax + movl %ecx, dmax // dmax = state->dmax; +#endif + + movl 40(%edx), %ecx // state->wsize + movl %ecx, wsize // wsize = state->wsize; + + movl 44(%edx), %ecx // state->whave + movl %ecx, whave // whave = state->whave; + + movl 48(%edx), %esi // state->write + movl %esi, write // write = state->write; + + movl 52(%edx), %eax // state->window + movl %eax, window // window = state->window; + + + movl 56(%edx), %ecx // state->hold + movl %ecx, hold // hold = state->hold + + movl 60(%edx), bits // bits = state->bits; + + movl 76(%edx), %esi // state->lencode + movl %esi, lcode // lcode = state->lencode; + + movl 80(%edx), %eax // state->distcode + movl %eax, dcode // dcode = state->distcode; + + movl 84(%edx), %ecx // state->lenbits + movl $1, %eax + movl %eax, %esi // a copy of 1 + sall %cl, %esi // 1 << state->lenbits + decl %esi // (1U << state->lenbits) - 1; + movl %esi, lmask // lmask = (1U << state->lenbits) - 1; + + movl 88(%edx), %ecx // state->distbits + sall %cl, %eax // 1 << state->distbits + decl %eax // (1U << state->distbits) - 1; + movl %eax, dmask // dmask = (1U << state->distbits) - 1; + + + // these 2 might be used often, precomputed and saved in stack +
movl write, %eax + addl wsize, %eax + movl %eax, write_wsize // write+wsize + + movl write, %edx + decl %edx + movl %edx, write_1 // write-1 + + +L_do_while_loop: // do { + + cmpl $15, bits + jae bits_ge_15 // if (bits < 15) { +#if 0 + leal 8(bits), %esi // esi = bits+8 + movl in, %eax // eax = in + movzbl (%eax), %edx // edx = *in++ + movl bits, %ecx // cl = bits + sall %cl, %edx // 1st *in << bits + addl hold, %edx // hold += 1st *in << bits + movzbl 1(%eax), %eax // 2nd *in + movl %esi, %ecx // cl = bits+8 + sall %cl, %eax // 2nd *in << (bits+8) + addl %eax, %edx // hold += 2nd *in << (bits+8) + movl %edx, hold // update hold + addl $2, in // in += 2 + addl $16, bits // bits += 16; +#else + /* from simulation, this code segment performs better than the other case + possibly, we are more often hit with aligned memory access */ + movl in, %ecx // unsigned short *inp = (unsigned short *) (in+OFF); + movzwl (%ecx), %eax // *((unsigned short *) in); + movl bits, %ecx // bits + sall %cl, %eax // *((unsigned short *) in) << bits + addl %eax, hold // hold += (unsigned long) *((unsigned short *) in) << bits; + addl $2, in // in += 2; + addl $16, bits // bits += 16; +#endif + +bits_ge_15: // } /* bits < 15 */ + + movl hold, %eax // hold + andl lmask, %eax // hold & lmask; + movl lcode, %esi // lcode[] : 4-byte aligned + movl (%esi,%eax,4), %eax // this = lcode[hold&lmask]; + jmp dolen + .align 4,0x90 +op_nonzero: + movzbl %al, %ecx // a copy of op to cl + testb $16, %cl // if op&16 + jne Llength_base // branch to length_base + + testb $64, %cl // elif op&64 + jne length_2nd_level_else // branch to 2nd level length code else conditions + + // 2nd level length code + + movl $1, %eax + sall %cl, %eax // 1 << op + decl %eax // ((1<>= op; [NOTE(review): the text between the less-than sign and the greater-than sign here appears to have been stripped during extraction, and the dolen label targeted by the jmp above is not defined anywhere in view - restore this span from the upstream inffastS.s] + subl %ecx, bits // bits -= op; + testb %al, %al // op = (unsigned)(this.op); + jne op_nonzero // if op!=0, branch to op_nonzero + + movl %esi, %ecx // this.val; + movl out, %eax // out + movb %cl, (%eax) // PUP(out) = (unsigned
char)(this.val); + incl %eax // out++; + movl %eax, out // save out + +L_tst_do_while_loop_end: + movl last, %eax // last + cmpl %eax, in // in vs last + jae return_unused_bytes // branch to return_unused_bytes if in >= last + movl end, %edx // end + cmpl %edx, out // out vs end + jb L_do_while_loop // branch to do loop if out < end + +return_unused_bytes: + + movl bits, %eax // bits + shrl $3, %eax // len = bits >> 3 + movl in, %edx // in + subl %eax, %edx // in -= len + sall $3, %eax // len << 3 + movl bits, %ecx // bits + subl %eax, %ecx // bits -= len << 3 + + movl %edx, (strm) // strm->next_in = in + OFF; + movl out, %eax // out + movl %eax, 12(strm) // strm->next_out = out + OFF; + + cmpl %edx, last // last vs in + jbe L67 // if (last <= in) branch to L67 and return to L69 + movl last, %eax // last + addl $5, %eax // 5 + last + subl %edx, %eax // 5 + last - in +L69: + movl %eax, 4(strm) // update strm->avail_in + + movl end, %eax // end + cmpl %eax, out // out vs end + jae L70 // if (out>=end) branch to L70, and return to L72 + addl $257, %eax // 257 + end + subl out, %eax // 257 + end - out +L72: + movl %eax, 16(strm) // update strm->avail_out + + movl $1, %eax + sall %cl, %eax // 1 << bits + decl %eax // (1 << bits) -1 + andl hold, %eax // hold &= (1U << bits) - 1; + movl state, %esi // esi = strm->state + movl %eax, 56(%esi) // state->hold = hold; + movl %ecx, 60(%esi) // state->bits = bits; + + addl $92, %esp // release the 92 bytes of locals allocated at entry + + // restore saved registers and return + popl %ebx + popl %esi + popl %edi + leave + ret + + // this code segment is branched in from op_nonzero, with op in cl and this.value in esi +Llength_base: + movzwl %si, %esi // this instruction might not be needed, pad here to give better performance + movl %esi, len // len = (unsigned)(this.val); + + movl %ecx, %esi // leave a copy of op at ecx + andl $15, %esi // op&=15; + je Lop_is_zero // if (op) { + cmpl bits, %esi // op vs bits + jbe Lop_be_bits // if (bits < op) { + movl in, %edx // in + movzbl (%edx),
%eax // *in + movl bits, %ecx // bits + sall %cl, %eax // *in << bits + addl %eax, hold // hold += (unsigned long)(PUP(in)) << bits; + incl %edx // in++ + movl %edx, in // update in + addl $8, bits // bits += 8 +Lop_be_bits: // } + movl $1, %eax // 1 + movl %esi, %ecx // op + sall %cl, %eax // 1 << op + decl %eax // (1<>= op; [NOTE(review): the text between the less-than sign and the greater-than sign here appears to have been stripped during extraction - restore this span from the upstream inffastS.s] + subl %esi, bits // bits -= op; +Lop_is_zero: // } + cmpl $14, bits // if (bits < 15) { + jbe bits_le_14 // branch to refill 16-bit into hold, and branch back to next +L19: // } + movl hold, %eax // hold + andl dmask, %eax // hold&dmask + movl dcode, %esi // dcode[] : 4-byte aligned + movl (%esi,%eax,4), %eax // this = dcode[hold & dmask]; + jmp dodist + +Lop_16_zero: + testb $64, %cl // op&64 + jne Linvalid_distance_code // if (op&64)!=0, branch to invalid distance code + movl $1, %eax // 1 + sall %cl, %eax // (1<>= op; [NOTE(review): the text between the less-than sign and the greater-than sign here appears to have been stripped during extraction, and the dodist label targeted by the jmp above is not defined anywhere in view - restore this span from the upstream inffastS.s] + subl %ecx, bits // bits -= op; + movzbl %al, %ecx // op = (unsigned)(this.op); + testb $16, %cl // op & 16 + je Lop_16_zero // if (op&16)==0 goto test op&64 + +Ldistance_base: // if (op&16) { /* distance base */ + andl $15, %ecx // op &= 15; edx = dist = this.val; + movl %ecx, op // save a copy of op + cmpl bits, %ecx // op vs bits + jbe 0f // if (bits < op) { + movl in, %ecx // in + movzbl (%ecx), %eax // *in + movl bits, %ecx // bits + sall %cl, %eax // *in << bits + addl %eax, hold // hold += (unsigned long)(PUP(in)) << bits; + incl in // in++ + addl $8, bits // bits += 8 + cmpl bits, op // op vs bits + jbe 0f // if (bits < op) { + movl in, %esi // in + movzbl (%esi), %eax // *in + movl bits, %ecx // cl = bits + sall %cl, %eax // *in << bits + addl %eax, hold // hold += (unsigned long)(PUP(in)) << bits; + incl %esi // in++ + movl %esi, in // update in + addl $8, bits // bits += 8 +0: // } } + + movzwl %dx, %edx // dist = (unsigned)(this.val); + movl $1, %eax // 1 + movzbl op, %ecx // cl = op + sall %cl, %eax // 1 << op + decl %eax // ((1U << op) - 1) + andl hold, %eax // (unsigned)hold & ((1U << op) - 1) + addl %edx, %eax
// dist += (unsigned)hold & ((1U << op) - 1); + +#ifdef INFLATE_STRICT + + cmpl dmax, %eax // dist vs dmax + ja Linvalid_distance_too_far_back // if (dist > dmax) break for invalid distance too far back + +#endif + + movl %eax, dist // save a copy of dist in stack + shrl %cl, hold // hold >>= op; + subl %ecx, bits // bits -= op; + + movl out, %eax + subl beg, %eax // eax = op = out - beg + cmpl %eax, dist // dist vs op + jbe Lcopy_direct_from_output // if (dist <= op) branch to copy direct from output + + // if (dist > op) { + movl dist, %ecx // dist + subl %eax, %ecx // ecx = op = dist - op; + cmpl %ecx, whave // whave vs op + jb Linvalid_distance_too_far_back // if (op > whave) break for error; + + movl write, %edx + testl %edx, %edx + jne Lwrite_non_zero // if (write==0) { + movl wsize, %eax // wsize + subl %ecx, %eax // wsize-op + movl window, %esi // from=window-OFF + addl %eax, %esi // from += wsize-op + movl out, %edx // out + cmpl %ecx, len // len vs op + jbe L38 // if !(op < len) skip + subl %ecx, len // len -= op; +0: // do { + movzbl (%esi), %eax // *from + movb %al, (%edx) // *out + incl %edx // out++ + incl %esi // PUP(out) = PUP(from); + decl %ecx // --op; + jne 0b // } while (op); + + movl %edx, out // update out + movl %edx, %esi // out + subl dist, %esi // esi = from = out - dist; + +L38: /* copy from output */ + + // while (len > 2) { + // PUP(out) = PUP(from); + // PUP(out) = PUP(from); + // PUP(out) = PUP(from); + // len -= 3; + // } + // if (len) { + // PUP(out) = PUP(from); + // if (len > 1) + // PUP(out) = PUP(from); + // } + + movl len, %ecx // len + movl out, %edx // out + subl $3, %ecx // pre-decrement len by 3 + jl 1f // if len < 3, branch to 1f for remaining processing +0: // while (len>2) { + movzbl (%esi), %eax + movb %al, (%edx) // PUP(out) = PUP(from); + movzbl 1(%esi), %eax + movb %al, 1(%edx) // PUP(out) = PUP(from); + movzbl 2(%esi), %eax + movb %al, 2(%edx) // PUP(out) = PUP(from); + addl $3, %esi // from += 3; + addl $3, %edx // out += 3; + subl
$3, %ecx // len -= 3; + jge 0b // } + movl %edx, out // update out, in case len == 0 +1: + addl $3, %ecx // post-increment len by 3 + je L_tst_do_while_loop_end // if (len) { + movzbl (%esi), %eax // *from + movb %al, (%edx) // PUP(out) = PUP(from); + incl %edx // out++ + movl %edx, out // update out, in case len == 1 + cmpl $2, %ecx // len vs 2 + jne L_tst_do_while_loop_end // if len==1, break + movzbl 1(%esi), %eax + movb %al, (%edx) // PUP(out) = PUP(from); + incl %edx // out++ + movl %edx, out // update out + jmp L_tst_do_while_loop_end // } + + .align 4,0x90 +length_2nd_level_else: + andl $32, %ecx // test end-of-block + je invalid_literal_length_code // if (op&32)==0, branch for invalid literal/length code break + movl state, %edx // if (op&32), end-of-block is detected + movl $11, (%edx) // state->mode = TYPE + jmp return_unused_bytes + +L70: + movl out, %edx // out + subl %edx, end // (end-out) + movl end, %esi // %esi = (end-out) = -(out - end); + leal 257(%esi), %eax // %eax = 257 + %esi = 257 - (out -end) + jmp L72 // return to update state and return + +L67: // %edx = in, to return 5 - (in - last) in %eax + subl %edx, last // last - in + movl last, %edx // %edx = last - in = - (in - last); + leal 5(%edx), %eax // %eax = 5 + %edx = 5 - (in - last); + jmp L69 // return to update state and return + +bits_le_14: +#if 1 + leal 8(bits), %esi // esi = bits+8 + movl in, %eax // eax = in + movzbl (%eax), %edx // edx = *in++ + movl bits, %ecx // cl = bits + sall %cl, %edx // 1st *in << bits + addl hold, %edx // hold += 1st *in << bits + movzbl 1(%eax), %eax // 2nd *in + movl %esi, %ecx // cl = bits+8 + sall %cl, %eax // 2nd *in << (bits+8) + addl %eax, %edx // hold += 2nd *in << (bits+8) + movl %edx, hold // update hold + addl $2, in // in += 2 + addl $16, bits // bits += 16; + jmp L19 +#else + /* this code segment does not run as fast as the other original code segment, possibly the processor + need extra time to handle unaligned short access */ + movl in, %edx // unsigned
short *inp = (unsigned short *) (in+OFF); + movzwl (%edx), %eax // *((unsigned short *) in); + movl bits, %ecx // bits + sall %cl, %eax // *((unsigned short *) in) << bits + addl %eax, hold // hold += (unsigned long) *((unsigned short *) in) << bits; + addl $2, %edx // in += 2; + addl $16, %ecx // bits += 16; + movl %edx, in + movl %ecx, bits + jmp L19 +#endif +invalid_literal_length_code: + call 0f // position-independent addressing: push the address of label 0 +0: popl %eax // eax = address of label 0 + leal LC2-0b(%eax), %eax // eax = address of LC2 (the invalid literal/length code message) + movl %eax, 24(strm) // store the message pointer at offset 24 of strm (presumably strm->msg - TODO confirm) + movl state, %esi // esi = strm->state + movl $27, (%esi) // state->mode = 27 (presumably BAD - TODO confirm) + jmp return_unused_bytes +Linvalid_distance_code: + call 0f // position-independent addressing: push the address of label 0 +0: popl %eax // eax = address of label 0 + leal LC1-0b(%eax), %eax // eax = address of LC1 (the invalid distance code message) + movl %eax, 24(strm) // store the message pointer at offset 24 of strm (presumably strm->msg - TODO confirm) + movl state, %eax // eax = strm->state + movl $27, (%eax) // state->mode = 27 (presumably BAD - TODO confirm) + jmp return_unused_bytes + +#ifdef INFLATE_STRICT + .align 4,0x90 + .byte 0 + .byte 0 + .byte 0 + .byte 0 + .byte 0 + .byte 0 + .byte 0 + .byte 0 + .byte 0 +#endif +Lcopy_direct_from_output: + movl out, %edx // out + subl dist, %edx // from = out - dist + movl out, %ecx // out + movl len, %esi // len + subl $3, %esi // pre-decrement len by 3 +0: // do { + movzbl (%edx), %eax + movb %al, (%ecx) // PUP(out) = PUP(from); + movzbl 1(%edx), %eax + movb %al, 1(%ecx) // PUP(out) = PUP(from); + movzbl 2(%edx), %eax + movb %al, 2(%ecx) // PUP(out) = PUP(from); + addl $3, %edx // from += 3 + addl $3, %ecx // out += 3 + subl $3, %esi // len -= 3 + jge 0b // } while (len > 2); + movl %ecx, out // update out in case len == 0 + addl $3, %esi // post-increment len by 3 + je L_tst_do_while_loop_end // if (len) { + movzbl (%edx), %eax + movb %al, (%ecx) // PUP(out) = PUP(from); + incl %ecx + movl %ecx, out // out++ + cmpl $2, %esi // len vs 2 + jne L_tst_do_while_loop_end // if (len > 1) + movzbl 1(%edx), %eax + movb %al, (%ecx) // PUP(out) = PUP(from); + incl %ecx + movl %ecx, out // out++ + jmp L_tst_do_while_loop_end // } + + .align 4,0x90 +Lwrite_non_zero: // %edx = write, %ecx = op + movl window, %esi // from = window - OFF; + cmp %ecx, %edx // write vs op, test for wrap around window or contiguous in window + jae Lcontiguous_in_window
// if (write >= op) branch to contiguous in window + +Lwrap_around_window: // wrap around window + addl write_wsize, %esi // from += write+wsize + subl %ecx, %esi // from += wsize + write - op; + subl %edx, %ecx // op -= write + cmpl %ecx, len // len vs op + jbe L38 // if (len <= op) break to copy from output + subl %ecx, len // len -= op; + movl out, %edx // out +0: // do { + movzbl (%esi), %eax // *from + movb %al, (%edx) // *out + incl %esi // from++ + incl %edx // out++ + decl %ecx // --op + jne 0b // } while (op); + + movl %edx, out // save out in case we need to break to L38 + movl window, %esi // from = window - OFF; + movl len, %eax // len + cmpl %eax, write // write vs len + jae L38 // if (write >= len) break to L38 + + movl write, %ecx // op = write + subl %ecx, len // len -= op; +0: // do { + movzbl (%esi), %eax // *from + movb %al, (%edx) // *out + incl %esi // from++ + incl %edx // out++ + decl %ecx // --op + jne 0b // } while (op); + + movl %edx, %esi // from = out + movl %edx, out // save a copy of out + subl dist, %esi // from = out - dist; + jmp L38 // break to copy from output + +Lcontiguous_in_window: // contiguous in window, edx = write, %ecx = op + subl %ecx, %edx // write - op + addl %edx, %esi // from += write - op; + cmpl %ecx, len // len vs op + jbe L38 // if (len <= op) break to copy from output + movl out, %edx // out + subl %ecx, len // len -= op; + +0: // do { + movzbl (%esi), %eax // *from + movb %al, (%edx) // *out + incl %esi // from++ + incl %edx // out++ + decl %ecx // op-- + jne 0b // } while (op); + + movl %edx, out // update out + movl %edx, %esi // from = out + subl dist, %esi // from = out - dist; + jmp L38 // break to copy from output + +Linvalid_distance_too_far_back: + call 0f // position-independent addressing: push the address of label 0 +0: popl %eax // eax = address of label 0 + leal LC0-0b(%eax), %eax // eax = address of LC0 (the invalid distance too far back message) + movl %eax, 24(strm) // store the message pointer at offset 24 of strm (presumably strm->msg - TODO confirm) + movl state, %ecx // ecx = strm->state + movl $27, (%ecx) // state->mode = 27 (presumably BAD - TODO confirm) + jmp return_unused_bytes + +#endif + +#if (defined __x86_64__) + .cstring +LC0: + .ascii "invalid distance too far back\0" +LC1: + .ascii "invalid distance code\0" +LC2: + .ascii
"invalid literal/length code\0" + .text + .align 4,0x90 + +#ifdef INFLATE_STRICT + .byte 0 + .byte 0 + .byte 0 + .byte 0 + .byte 0 + .byte 0 + .byte 0 + .byte 0 + .byte 0 + .byte 0 + .byte 0 + .byte 0 +#endif + +.globl _inflate_fast +_inflate_fast: + + // set up rbp + pushq %rbp + movq %rsp, %rbp + + // save registers in stack + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbx + + #define strm %r13 + #define state %rdi + #define in %r12 + #define in_d %r12d + #define out %r10 + #define out_d %r10d + #define write %r15d + #define hold %r9 + #define holdd %r9d + #define bits %r8d + #define lcode %r14 + #define len %ebx + #define from %rcx + #define dmax %r11d + + #define last -104(%rbp) + #define beg -96(%rbp) + #define end -88(%rbp) + #define wsize -80(%rbp) + #define whave -76(%rbp) + #define window -72(%rbp) + #define dcode -64(%rbp) + #define lmask -56(%rbp) + #define dmask -112(%rbp) + #define wsize_write -116(%rbp) + #define write_1 -128(%rbp) + #define dist -44(%rbp) + + // reserve stack memory for local variables 128-40=88 + subq $88, %rsp + + movq %rdi, strm + movq 56(%rdi), state // state = (struct inflate_state FAR *)strm->state; + movq (strm), in // in = strm->next_in - OFF; + movl 8(strm), %eax // strm->avail_in + subl $5, %eax // (strm->avail_in - 5) + addq in, %rax // in + (strm->avail_in - 5) + movq %rax, last // last = in + (strm->avail_in - 5) + movq 24(strm), out // out = strm->next_out + movl 32(strm), %eax // strm->avail_out + subl %eax, %esi // (start - strm->avail_out); + movq out, %rdx // strm->next_out + subq %rsi, %rdx // out - (start - strm->avail_out); + movq %rdx, beg // beg = out - (start - strm->avail_out); + subl $257, %eax // (strm->avail_out - 257) + addq out, %rax // out + (strm->avail_out - 257); + movq %rax, end // end = out + (strm->avail_out - 257); + +#ifdef INFLATE_STRICT + movl 20(state), dmax // dmax = state->dmax; +#endif + + movl 52(state), %ecx // state->wsize + movl %ecx, wsize // wsize = state->wsize; + 
movl 56(state), %ebx // state->whave; + movl %ebx, whave // whave = state->whave; + movl 60(state), write // write = state->write; + movq 64(state), %rax // state->window + movq %rax, window // window = state->window; + movq 72(state), hold // hold = state->hold; + movl 80(state), bits // bits = state->bits; + + movq 96(state), lcode // lcode = state->lencode; + movq 104(state), %rdx // state->distcode; + movq %rdx, dcode // dcode = state->distcode; + + movl 116(state), %ecx // state->distbits + movl $1, %eax + movl %eax, %edx // 1 + sall %cl, %edx // (1U << state->distbits) + movl 112(state), %ecx // state->lenbits + sall %cl, %eax // (1U << state->lenbits) + decl %eax // (1U << state->lenbits) - 1 + movq %rax, lmask // lmask = (1U << state->lenbits) - 1 + decl %edx // (1U << state->distbits) - 1 + movq %rdx, dmask // dmask = (1U << state->distbits) - 1 + + movl wsize, %ecx // wsize + addl write, %ecx // wsize + write + movl %ecx, wsize_write // wsize_write = wsize + write + + leal -1(%r15), %ebx // write - 1 + movq %rbx, write_1 // write_1 = write - 1 + +L_do_while_loop: + cmpl $14, bits // bits vs 14 + ja 0f // if (bits < 15) { + movzwl (in), %eax // read 2 bytes from in + movl bits, %ecx // set up cl = bits + salq %cl, %rax // (*in) << bits + addq %rax, hold // hold += (*in) << bits + addq $2, in // in += 2 + addl $16, bits // bits += 16 +0: // } + movq lmask, %rax // lmask + andq hold, %rax // hold & lmask + jmp 1f + .align 4,0x90 +Lop_nonzero: + movzbl %al, %ecx // op in al and cl + testb $16, %cl // check for length base processing (op&16) + jne L_length_base // if (op&16) branch to length base processing + testb $64, %cl // check for 2nd level length code (op&64==0) + jne L_end_of_block // if (op&64)!=0, branch for end-of-block processing + + /* 2nd level length code : (op&64) == 0*/ +L_2nd_level_length_code: + movl $1, %eax // 1 + sall %cl, %eax // 1 << op + decl %eax // ((1U << op) - 1) + andq hold, %rax // (hold & ((1U << op) - 1)) + movzwl %dx, %edx + 
addq %rdx, %rax // this = lcode[this.val + (hold & ((1U << op) - 1))]; +1: + movl (lcode,%rax,4), %eax // this = lcode[hold & lmask]; +Ldolen: + movl %eax, %edx // a copy of this + shrl $16, %edx // edx = this.val; + movzbl %ah, %ecx // op = this.bits + shrq %cl, hold // hold >>= op; + subl %ecx, bits // bits -= op; + testb %al, %al // op = (unsigned)(this.op); + jne Lop_nonzero // if (op!-0) branch for copy operation +L_literal: + movb %dl, (out) // *out = this.val + incq out // out ++ +L_do_while_loop_check: + cmpq last, in // in vs last + jae L_return_unused_byte // if in >= last, break to return unused byte processing + cmpq end, out // out vs end + jb L_do_while_loop // back to do_while_loop if out < end + + /* return unused bytes (on entry, bits < 8, so in won't go too far back) */ + +L_return_unused_byte: + movl out_d, %esi + jmp L34 + +L_length_base: /* al = cl = op, edx = this.val, op&16 = 16 */ + movzwl %dx, len // len = (unsigned)(this.val); + movl %ecx, %edx // op + andl $15, %edx // op &= 15; + je 1f // if (op) { + cmpl bits, %edx // op vs bits + jbe 0f // if (bits < op) { + movzbl (in), %eax // *in + movl bits, %ecx // cl = bits + salq %cl, %rax // *in << bits + addq %rax, hold // hold += (unsigned long)(PUP(in)) << bits; + incq in // in++ + addl $8, bits // bits += 8 +0: // } + movl $1, %eax // 1 + movl %edx, %ecx // cl = op + sall %cl, %eax // 1 << op + decl %eax // (1 << op) - 1 + andl holdd, %eax // (unsigned)hold & ((1U << op) - 1); + addl %eax, len // len += (unsigned)hold & ((1U << op) - 1); + shrq %cl, hold // hold >>= op; + subl %edx, bits // bits -= op; +1: // } + cmpl $14, bits // bits vs 14 + jbe L99 // if (bits < 15) go to loading to hold and return to L19 +L19: // } + movq dmask, %rax // dmask + andq hold, %rax // hold & dmask + movq dcode, %rdx // dcode[] + movl (%rdx,%rax,4), %eax // this = dcode[hold & dmask]; + jmp L_dodist + .align 4,0x90 +0: // op&16 == 0, test (op&64)==0 for 2nd level distance code + testb $64, %cl // op&64 + jne 
L_invalid_distance_code // if ((op&64)==0) { /* 2nd level distance code */ + movl $1, %eax // 1 + sall %cl, %eax // 1 << op + decl %eax // (1 << op) - 1 + andq hold, %rax // (hold & ((1U << op) - 1)) + movzwl %dx, %edx // this.val + addq %rdx, %rax // this.val + (hold & ((1U << op) - 1)) + movq dcode, %rcx // dcode[] + movl (%rcx,%rax,4), %eax // this = dcode[this.val + (hold & ((1U << op) - 1))]; +L_dodist: + movl %eax, %edx // this + shrl $16, %edx // dist = (unsigned)(this.val); + movzbl %ah, %ecx // cl = op = this.bits + shrq %cl, hold // hold >>= op; + subl %ecx, bits // bits -= op; + movzbl %al, %ecx // op = (unsigned)(this.op); + testb $16, %cl // (op & 16) test for distance base + je 0b // if (op&16) == 0, branch to check for 2nd level distance code + +L_distance_base: /* distance base */ + + movl %ecx, %esi // op + andl $15, %esi // op&=15 + cmpl bits, %esi // op vs bits + jbe 1f // if (bits < op) { + movzbl (in), %eax // *in + movl bits, %ecx // cl = bits + salq %cl, %rax // *in << bits + addq %rax, hold // hold += (unsigned long)(PUP(in)) << bits; + incq in // in++ + addl $8, bits // bits += 8 + cmpl bits, %esi // op vs bits + jbe 1f // if (bits < op) { + movzbl (in), %eax // *in + movl bits, %ecx // cl = bits + salq %cl, %rax // *in << bits + addq %rax, hold // hold += (unsigned long)(PUP(in)) << bits; + incq in // in++ + addl $8, bits // bits += 8 +1: // } } + + movzwl %dx, %edx // dist + movl $1, %eax // 1 + movl %esi, %ecx // cl = op + sall %cl, %eax // (1 << op) + decl %eax // (1 << op) - 1 + andl holdd, %eax // (unsigned)hold & ((1U << op) - 1) + addl %edx, %eax // dist += (unsigned)hold & ((1U << op) - 1); + movl %eax, dist // save a copy of dist in stack + +#ifdef INFLATE_STRICT + cmp %eax, dmax // dmax vs dist + jb L_invalid_distance_too_far_back // if (dmax < dist) break for invalid distance too far back +#endif + + shrq %cl, hold // hold >>= op; + subl %esi, bits // bits -= op; + movl out_d, %esi // out + movl out_d, %eax // out + subl beg, 
%eax // op = out - beg + cmpl %eax, dist // dist vs op, /* see if copy from window */ + jbe L_copy_direct_from_output // if (dist <= op) branch to copy direct from output + +L_distance_back_in_window: + + movl dist, %edx // dist + subl %eax, %edx // op = dist - op; /* distance back in window */ + + cmpl %edx, whave // whave vs op + jb L_invalid_distance_too_far_back // if (op > whave), break for invalid distance too far back + + testl write, write // if (write!=0) + jne L_wrap_around_window // branch to wrap around window + +L_very_common_case: + + movl wsize, %eax // wsize + subl %edx, %eax // wsize - op + movq window, from // from = window - OFF; + addq %rax, from // from += wsize - op; + + movl %edx, %esi // op + cmpl %edx, len // len vs op + ja L_some_from_window // if (len > op), branch for aligned code block L_some_from_window +L38: + subl $3, len // pre-decrement len by 3 + jge 0f // if len >= 3, branch to the aligned code block +1: addl $3, len // post-increment len by 3 + je L_do_while_loop_check // if (len==0) break to L_do_while_loop_check + movzbl (from), %eax // *from + movb %al, (out) // *out + incq out // out++ + cmpl $2, len // len vs 2 + jne L_do_while_loop_check // if len!=2 break to L_do_while_loop_check + movzbl 1(from), %eax // *from + movb %al, (out) // *out + incq out // out++ + jmp L_do_while_loop_check // break to L_do_while_loop_check + + .align 4,0x90 +0: // do { + movzbl (from), %eax // *from + movb %al, (out) // *out + movzbl 1(from), %eax // *from + movb %al, 1(out) // *out + movzbl 2(from), %eax // *from + movb %al, 2(out) // *out + addq $3, out // out += 3 + addq $3, from // from += 3 + subl $3, len // len -= 3 + jge 0b // } while (len>=0); + jmp 1b // branch back to the possibly unaligned code + + .align 4,0x90 +L_end_of_block: + andl $32, %ecx // op & 32 + jne L101 // if (op&32) branch to end-of-block break + leaq LC2(%rip), from // address of the LC2 message string + movq from, 48(strm) // store the error-message pointer at 48(strm) + movl $27, (state) // state->mode = BAD; + movl out_d, %esi // esi = out; L34 subtracts it when computing avail_out +
+L34: + movl bits, %eax // bits + shrl $3, %eax // len = bits >> 3; + mov %eax, %edx // len + subq %rdx, in // in -= len + sall $3, %eax // len << 3 + movl bits, %ecx // bits + subl %eax, %ecx // bits -= len << 3 + movq in, (strm) // strm->next_in = in + OFF; + movq out, 24(strm) // strm->next_out = out + OFF; + cmpq in, last // last vs in + jbe L67 // if (last <= in) branch to L67 and return to L69 + movl last, %eax // last + addl $5, %eax // last + 5 + subl in_d, %eax // 5 + last - in +L69: + movl %eax, 8(strm) // update strm->avail_in + + cmpq end, out // out vs end + jae L70 // if (out >= end) branch to L70 and return to L72 + movl end, %eax // end + addl $257, %eax // 257 + end + subl %esi, %eax // 257 + end - out (expects esi = low 32 bits of out on entry to L34) +L72: + movl %eax, 32(strm) // update strm->avail_out + + movl $1, %eax // 1 + sall %cl, %eax // 1 << bits + decl %eax // (1U << bits) - 1 + andq hold, %rax // hold &= (1U << bits) - 1; + movq %rax, 72(state) // state->hold = hold; + movl %ecx, 80(state) // state->bits = bits; + + // release the 88 bytes of stack reserved for local variables + addq $88, %rsp + + // restore registers from stack (reverse of the push order at entry) + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + + // return to caller + leave + ret + + .align 4,0x90 +L99: + leal 8(bits), %esi // esi = bits+8 + movzbl (in), %edx // 1st *in + movl bits, %ecx // cl = bits + salq %cl, %rdx // 1st *in << bits + addq %rdx, hold // 1st hold += (unsigned long)(PUP(in)) << bits; + movzbl 1(in), %eax // 2nd *in + movl %esi, %ecx // cl = bits + 8 + salq %cl, %rax // 2nd *in << bits+8 + addq %rax, hold // 2nd hold += (unsigned long)(PUP(in)) << bits; + addq $2, in // in += 2 + addl $16, bits // bits += 16 + jmp L19 + +L101: + movl $11, (state) // state->mode = 11 (end-of-block exit taken from L_end_of_block when op&32) + movl out_d, %esi // esi = out, as L34 expects + jmp L34 + .align 4,0x90 +L70: + movl end, %eax // end + subl %esi, %eax // end - out + addl $257, %eax // 257 + end - out + jmp L72 + .align 4,0x90 +L67: + movl last, %eax // last + subl in_d, %eax // last - in + addl $5, %eax // 5 + last - in + jmp L69 + + + .align 4,0x90 + + //
stuffing the following 4 bytes to align the major loop to a 16-byte boundary to give the better performance + .byte 0 + .byte 0 + .byte 0 + .byte 0 +L_copy_direct_from_output: + mov dist, %eax // dist + movq out, %rdx // out + subq %rax, %rdx // from = out - dist; + subl $3, len // pre-decrement len by 3 + // do { +0: movzbl (%rdx), %eax // *from + movb %al, (out) // *out + movzbl 1(%rdx), %eax // *from + movb %al, 1(out) // *out + movzbl 2(%rdx), %eax // *from + movb %al, 2(out) // *out + addq $3, out // out+=3 + addq $3, %rdx // from+=3 + subl $3, len // len-=3 + jge 0b // } while (len>=0); +1: addl $3, len // post-increment len by 3 + je L_do_while_loop_check // if len==0, branch to do_while_loop_check + + movzbl (%rdx), %eax // *from + movb %al, (out) // *out + incq out // out++ + cmpl $2, len // len == 2 ? + jne L_do_while_loop_check // if len==1, branch to do_while_loop_check + + movzbl 1(%rdx), %eax // *from + movb %al, (out) // *out + incq out // out++ + jmp L_do_while_loop_check // branch to do_while_loop_check + + .align 4,0x90 +L_some_from_window: // from : from, out, %esi/%edx = op + // do { + movzbl (from), %eax // *from + movb %al, (out) // *out + incq from // from++ + incq out // out++ + decl %esi // --op + jne L_some_from_window // } while (op); + subl %edx, len // len -= op; + mov dist, %eax // dist + movq out, from // out + subq %rax, from // from = out - dist; + jmp L38 // copy from output + + .align 4,0x90 +L_wrap_around_window: + cmpl %edx, write // write vs op + jae L_contiguous_in_window // if (write >= op) branch to contiguous in window + movl wsize_write, %eax // wsize+write + subl %edx, %eax // wsize+write-op + movq window, from // from = window - OFF + addq %rax, from // from += wsize+write-op + subl write, %edx // op -= write + cmpl %edx, len // len vs op + jbe L38 // if (len<=op) branch to copy from output + + subl %edx, len // len -= op; +0: // do { + movzbl (from), %eax // *from + movb %al, (out) // *out + incq from // from++ + incq 
out // out++ + decl %edx // op-- + jne 0b // } while (op); + movq window, from // from = window - OFF + + cmpl len, write // write vs len + jae L38 // if (write >= len) branch to copy from output + movl write, %esi // op = write + subl write, len // len -= op +1: // do { + movzbl (from), %eax // *from + movb %al, (out) // *out + incq from // from++ + incq out // out++ + decl %esi // op-- + jne 1b // } while (op); + mov dist, %eax // dist + movq out, from // out + subq %rax, from // from = out - dist; + jmp L38 + + .align 4,0x90 +L_contiguous_in_window: + movl write, %eax // write + subl %edx, %eax // write - op + movq window, from // from = window - OFF + addq %rax, from // from += write - op + cmpl %edx, len // len vs op + jbe L38 // if (len <= op) branch to copy from output + subl %edx, len // len -= op; +2: // do { + movzbl (from), %eax // *from + movb %al, (out) // *out + incq from // from++ + incq out // out++ + decl %edx // op-- + jne 2b // } while (op); + + mov dist, %eax // dist + movq out, from // out + subq %rax, from // from = out - dist; + jmp L38 // copy from output + + .align 4,0x90 +L_invalid_distance_code: + leaq LC1(%rip), %rdx // address of the LC1 message string + movq %rdx, 48(strm) // error message + movl $27, (state) // state->mode = BAD + movl out_d, %esi // esi = out, as L34 expects + jmp L34 + +L_invalid_distance_too_far_back: + leaq LC0(%rip), %rbx // address of the LC0 message string (len in %ebx is dead on this exit) + movq %rbx, 48(strm) // error message + movl $27, (state) // state->mode = BAD + jmp L_return_unused_byte // reloads esi = out_d before L34; under INFLATE_STRICT esi still holds op here + +#endif diff --git a/libsa/Makefile b/libsa/Makefile index eea21bddd..3815c667f 100644 --- a/libsa/Makefile +++ b/libsa/Makefile @@ -8,20 +8,18 @@ include $(MakeInc_cmd) include $(MakeInc_def) INSTINC_SUBDIRS = libsa -INSTINC_SUBDIRS_PPC = ${INSTINC_SUBDIRS} INSTINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS} INSTINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS} INSTINC_SUBDIRS_ARM = ${INSTINC_SUBDIRS} EXPINC_SUBDIRS = libsa -EXPINC_SUBDIRS_PPC = ${EXPINC_SUBDIRS} EXPINC_SUBDIRS_I386 = ${EXPINC_SUBDIRS} EXPINC_SUBDIRS_X86_64 = ${EXPINC_SUBDIRS} EXPINC_SUBDIRS_ARM = ${EXPINC_SUBDIRS} -SETUP_SUBDIRS = conf +SETUP_SUBDIRS = COMP_SUBDIRS = conf diff
--git a/libsa/bootstrap.cpp b/libsa/bootstrap.cpp index 286255265..9ad023c1a 100644 --- a/libsa/bootstrap.cpp +++ b/libsa/bootstrap.cpp @@ -36,7 +36,7 @@ extern "C" { #include #include #include -#include +#include #include #include @@ -66,6 +66,7 @@ extern "C" { static void bootstrapRecordStartupExtensions(void); static void bootstrapLoadSecurityExtensions(void); + #if PRAGMA_MARK #pragma mark Macros #endif @@ -100,7 +101,7 @@ static const char * sKernelComponentNames[] = { "com.apple.iokit.IOSystemManagementFamily", "com.apple.iokit.ApplePlatformFamily", -#if defined(__ppc__) || defined(__i386__) || defined(__arm__) +#if defined(__i386__) || defined(__arm__) /* These ones are not supported on x86_64 or any newer platforms. * They must be version 7.9.9; check by "com.apple.kernel.", with * the trailing period; "com.apple.kernel" always represents the @@ -163,7 +164,6 @@ KLDBootstrap::KLDBootstrap(void) } record_startup_extensions_function = &bootstrapRecordStartupExtensions; load_security_extensions_function = &bootstrapLoadSecurityExtensions; - OSKext::initialize(); } /********************************************************************* @@ -175,6 +175,8 @@ KLDBootstrap::~KLDBootstrap(void) if (this != &sBootstrapObject) { panic("Attempt to access bootstrap segment."); } + + record_startup_extensions_function = 0; load_security_extensions_function = 0; } @@ -218,16 +220,11 @@ KLDBootstrap::readPrelinkedExtensions( kernel_section_t * prelinkInfoSect) { OSArray * infoDictArray = NULL; // do not release - OSArray * personalitiesArray = NULL; // do not release OSObject * parsedXML = NULL; // must release OSDictionary * prelinkInfoDict = NULL; // do not release OSString * errorString = NULL; // must release OSKext * theKernel = NULL; // must release -#if CONFIG_KXLD - kernel_section_t * kernelLinkStateSection = NULL; // see code -#endif - kernel_segment_command_t * prelinkLinkStateSegment = NULL; // see code kernel_segment_command_t * prelinkTextSegment = NULL; // see 
code kernel_segment_command_t * prelinkInfoSegment = NULL; // see code @@ -235,13 +232,13 @@ KLDBootstrap::readPrelinkedExtensions( * going to fail the boot, so these won't be cleaned up on error. */ void * prelinkData = NULL; // see code - void * prelinkCopy = NULL; // see code vm_size_t prelinkLength = 0; + #if !__LP64__ && !defined(__arm__) vm_map_offset_t prelinkDataMapOffset = 0; -#endif - + void * prelinkCopy = NULL; // see code kern_return_t mem_result = KERN_SUCCESS; +#endif OSDictionary * infoDict = NULL; // do not release @@ -255,57 +252,6 @@ KLDBootstrap::readPrelinkedExtensions( kOSKextLogDirectoryScanFlag | kOSKextLogArchiveFlag, "Starting from prelinked kernel."); - /***** - * Wrap the kernel link state in-place in an OSData. - * This is unnecessary (and the link state may not be present) if the kernel - * does not have kxld support because this information is only used for - * runtime linking. - */ -#if CONFIG_KXLD - kernelLinkStateSection = getsectbyname(kPrelinkLinkStateSegment, - kPrelinkKernelLinkStateSection); - if (!kernelLinkStateSection) { - OSKextLog(/* kext */ NULL, - kOSKextLogErrorLevel | - kOSKextLogArchiveFlag, - "Can't find prelinked kernel link state."); - goto finish; - } - - theKernel = OSKext::lookupKextWithIdentifier(kOSKextKernelIdentifier); - if (!theKernel) { - OSKextLog(/* kext */ NULL, - kOSKextLogErrorLevel | - kOSKextLogArchiveFlag, - "Can't find kernel kext object in prelinked kernel."); - goto finish; - } - - prelinkData = (void *) kernelLinkStateSection->addr; - prelinkLength = kernelLinkStateSection->size; - - mem_result = kmem_alloc_pageable(kernel_map, - (vm_offset_t *) &prelinkCopy, prelinkLength); - if (mem_result != KERN_SUCCESS) { - OSKextLog(/* kext */ NULL, - kOSKextLogErrorLevel | - kOSKextLogGeneralFlag | kOSKextLogArchiveFlag, - "Can't copy prelinked kernel link state."); - goto finish; - } - memcpy(prelinkCopy, prelinkData, prelinkLength); - - theKernel->linkState = OSData::withBytesNoCopy(prelinkCopy, 
prelinkLength); - if (!theKernel->linkState) { - OSKextLog(/* kext */ NULL, - kOSKextLogErrorLevel | - kOSKextLogGeneralFlag | kOSKextLogArchiveFlag, - "Can't create prelinked kernel link state wrapper."); - goto finish; - } - theKernel->linkState->setDeallocFunction(osdata_kmem_free); -#endif - prelinkTextSegment = getsegbyname(kPrelinkTextSegment); if (!prelinkTextSegment) { OSKextLog(/* kext */ NULL, @@ -318,7 +264,9 @@ KLDBootstrap::readPrelinkedExtensions( prelinkData = (void *) prelinkTextSegment->vmaddr; prelinkLength = prelinkTextSegment->vmsize; -#if !__LP64__ +#if !__LP64__ && !__arm__ + /* XXX: arm's pmap implementation doesn't seem to let us do this */ + /* To enable paging and write/execute protections on the kext * executables, we need to copy them out of the booter-created * memory, reallocate that space with VM, then prelinkCopy them back in. @@ -375,7 +323,7 @@ KLDBootstrap::readPrelinkedExtensions( memcpy(prelinkData, prelinkCopy, prelinkLength); kmem_free(kernel_map, (vm_offset_t)prelinkCopy, prelinkLength); -#endif /* !__LP64__ */ +#endif /* !__LP64__ && !__arm__*/ /* Unserialize the info dictionary from the prelink info section. */ @@ -425,21 +373,6 @@ KLDBootstrap::readPrelinkedExtensions( OSSafeReleaseNULL(newKext); } - /* Get all of the personalities for kexts that were not prelinked and - * add them to the catalogue. - */ - personalitiesArray = OSDynamicCast(OSArray, - prelinkInfoDict->getObject(kPrelinkPersonalitiesKey)); - if (!personalitiesArray) { - OSKextLog(/* kext */ NULL, kOSKextLogErrorLevel | kOSKextLogArchiveFlag, - "The prelinked kernel has no personalities array"); - goto finish; - } - - if (personalitiesArray->getCount()) { - gIOCatalogue->addDrivers(personalitiesArray); - } - /* Store the number of prelinked kexts in the registry so we can tell * when the system has been started from a prelinked kernel. 
*/ @@ -454,21 +387,12 @@ KLDBootstrap::readPrelinkedExtensions( registryRoot->setProperty(kOSPrelinkKextCountKey, prelinkCountObj); } - OSSafeReleaseNULL(prelinkCountObj); - prelinkCountObj = OSNumber::withNumber( - (unsigned long long)personalitiesArray->getCount(), - 8 * sizeof(uint32_t)); - assert(prelinkCountObj); - if (prelinkCountObj) { - registryRoot->setProperty(kOSPrelinkPersonalityCountKey, prelinkCountObj); - } - OSKextLog(/* kext */ NULL, kOSKextLogProgressLevel | kOSKextLogGeneralFlag | kOSKextLogKextBookkeepingFlag | kOSKextLogDirectoryScanFlag | kOSKextLogArchiveFlag, - "%u prelinked kexts, and %u additional personalities.", - infoDictArray->getCount(), personalitiesArray->getCount()); + "%u prelinked kexts", + infoDictArray->getCount()); #if __LP64__ /* On LP64 systems, kexts are copied to their own special VM region @@ -477,14 +401,6 @@ KLDBootstrap::readPrelinkedExtensions( ml_static_mfree((vm_offset_t) prelinkData, prelinkLength); #endif /* __LP64__ */ - /* Free the link state segment, kexts have copied out what they need. - */ - prelinkLinkStateSegment = getsegbyname(kPrelinkLinkStateSegment); - if (prelinkLinkStateSegment) { - ml_static_mfree((vm_offset_t)prelinkLinkStateSegment->vmaddr, - (vm_size_t)prelinkLinkStateSegment->vmsize); - } - /* Free the prelink info segment, we're done with it. 
*/ prelinkInfoSegment = getsegbyname(kPrelinkInfoSegment); @@ -946,3 +862,4 @@ static void bootstrapLoadSecurityExtensions(void) sBootstrapObject.loadSecurityExtensions(); return; } + diff --git a/libsa/conf/MASTER b/libsa/conf/MASTER index c2690d207..99865aa3e 100644 --- a/libsa/conf/MASTER +++ b/libsa/conf/MASTER @@ -64,4 +64,3 @@ options CONFIG_NO_PRINTF_STRINGS # options CONFIG_NO_KPRINTF_STRINGS # options CONFIG_KXLD # kxld/runtime linking of kexts # - diff --git a/libsa/conf/MASTER.i386 b/libsa/conf/MASTER.i386 index 66fe402b0..448133126 100644 --- a/libsa/conf/MASTER.i386 +++ b/libsa/conf/MASTER.i386 @@ -4,7 +4,6 @@ # PROFILE = [ RELEASE profile ] # DEBUG = [ RELEASE debug ] # -# # EMBEDDED = [ intel mach libkerncpp ] # DEVELOPMENT = [ EMBEDDED config_dtrace ] # diff --git a/libsa/conf/MASTER.ppc b/libsa/conf/MASTER.ppc deleted file mode 100644 index 5119c4062..000000000 --- a/libsa/conf/MASTER.ppc +++ /dev/null @@ -1,18 +0,0 @@ -# -###################################################################### -# -# Standard Apple MacOS X Configurations: -# -------- ---- -------- --------------- -# -# RELEASE = [ppc mach libkerncpp config_dtrace config_kxld ] -# DEVELOPMENT = [ RELEASE ] -# PROFILE = [RELEASE profile] -# DEBUG = [ppc mach libkerncpp debug] -# RELEASE_TRACE = [ RELEASE kdebug ] -# DEBUG_TRACE = [ DEBUG kdebug ] -# -###################################################################### - -machine "ppc" # -cpu "ppc" # - diff --git a/libsa/conf/MASTER.x86_64 b/libsa/conf/MASTER.x86_64 index 68218d47d..89c745125 100644 --- a/libsa/conf/MASTER.x86_64 +++ b/libsa/conf/MASTER.x86_64 @@ -4,7 +4,6 @@ # PROFILE = [ RELEASE profile ] # DEBUG = [ RELEASE debug ] # -# # EMBEDDED = [ intel mach libkerncpp ] # DEVELOPMENT = [ EMBEDDED ] # diff --git a/libsa/conf/Makefile b/libsa/conf/Makefile index f2daf7618..b463b2528 100644 --- a/libsa/conf/Makefile +++ b/libsa/conf/Makefile @@ -7,8 +7,7 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir include $(MakeInc_cmd) 
include $(MakeInc_def) -SETUP_SUBDIRS = \ - tools +SETUP_SUBDIRS = COMP_SUBDIRS = @@ -24,30 +23,24 @@ else export COMPOBJROOT=$(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)/$(COMPONENT) endif -$(COMPOBJROOT)/doconf: - @make build_setup +MASTER_CPU_PER_SOC = $(SOURCE)/MASTER.$(ARCH_CONFIG_LC).$(MACHINE_CONFIG_LC) $(COMPOBJROOT)/$(LIBSA_KERNEL_CONFIG)/Makefile : $(SOURCE)/MASTER \ $(SOURCE)/MASTER.$(ARCH_CONFIG_LC) \ $(SOURCE)/Makefile.template \ $(SOURCE)/Makefile.$(ARCH_CONFIG_LC) \ $(SOURCE)/files \ - $(SOURCE)/files.$(ARCH_CONFIG_LC) \ - $(COMPOBJROOT)/doconf + $(SOURCE)/files.$(ARCH_CONFIG_LC) $(_v)(doconf_target=$(addsuffix /conf, $(TARGET)); \ $(MKDIR) $${doconf_target}; \ cd $${doconf_target}; \ rm -f $(notdir $?); \ cp $? $${doconf_target}; \ - $(COMPOBJROOT)/doconf -c -cpu $(ARCH_CONFIG_LC) -d $(TARGET)/$(LIBSA_KERNEL_CONFIG) $(LIBSA_KERNEL_CONFIG); \ + if [ -f $(MASTER_CPU_PER_SOC) ]; then cp $(MASTER_CPU_PER_SOC) $${doconf_target}; fi; \ + $(SRCROOT)/SETUP/config/doconf -c -cpu $(ARCH_CONFIG_LC) -soc $(MACHINE_CONFIG_LC) -d $(TARGET)/$(LIBSA_KERNEL_CONFIG) $(LIBSA_KERNEL_CONFIG); \ ); -.ORDER: $(COMPOBJROOT)/$(LIBSA_KERNEL_CONFIG)/Makefile - -do_setup_conf: $(COMPOBJROOT)/doconf \ - $(COMPOBJROOT)/$(LIBSA_KERNEL_CONFIG)/Makefile - -do_all: do_setup_conf +do_all: $(COMPOBJROOT)/$(LIBSA_KERNEL_CONFIG)/Makefile $(_v)next_source=$(subst conf/,,$(SOURCE)); \ ${MAKE} -C $(COMPOBJROOT)/$(LIBSA_KERNEL_CONFIG) \ MAKEFILES=$(TARGET)/$(LIBSA_KERNEL_CONFIG)/Makefile \ diff --git a/libsa/conf/Makefile.i386 b/libsa/conf/Makefile.i386 index 3695a666c..b89fdd145 100644 --- a/libsa/conf/Makefile.i386 +++ b/libsa/conf/Makefile.i386 @@ -2,6 +2,7 @@ #BEGIN Machine dependent Makefile fragment for i386 ###################################################################### + ###################################################################### #END Machine dependent Makefile fragment for i386 ###################################################################### diff --git 
a/libsa/conf/Makefile.ppc b/libsa/conf/Makefile.ppc deleted file mode 100644 index cd79f229a..000000000 --- a/libsa/conf/Makefile.ppc +++ /dev/null @@ -1,7 +0,0 @@ -###################################################################### -#BEGIN Machine dependent Makefile fragment for ppc -###################################################################### - -###################################################################### -#END Machine dependent Makefile fragment for ppc -###################################################################### diff --git a/libsa/conf/Makefile.template b/libsa/conf/Makefile.template index a975da2a5..26aede6b2 100644 --- a/libsa/conf/Makefile.template +++ b/libsa/conf/Makefile.template @@ -27,8 +27,8 @@ include $(MakeInc_def) # # CFLAGS # -CFLAGS+= -imacros meta_features.h -DLIBSA_KERNEL_PRIVATE \ - -Werror $(CFLAGS_INLINE_CONFIG) +CFLAGS+= -include meta_features.h -DLIBSA_KERNEL_PRIVATE \ + $(CFLAGS_INLINE_CONFIG) # # Directories for mig generated files @@ -74,23 +74,26 @@ ${OBJS}: ${OBJSDEPS} LDOBJS = $(OBJS) -$(COMPONENT).o: $(LDOBJS) - $(_v)for kld_file in ${LDOBJS}; do \ +$(COMPONENT).filelist: $(LDOBJS) + $(_v)if [ $(BUILD_MACHO_OBJ) -eq 1 ]; then \ + for kld_file in ${LDOBJS}; do \ $(SEG_HACK) __KLD $${kld_file} -o $${kld_file}__; \ mv $${kld_file}__ $${kld_file} ; \ - done; + done; \ + fi @echo LDFILELIST $(COMPONENT) $(_v)( for obj in ${LDOBJS}; do \ echo $(TARGET)$(COMP_OBJ_DIR)/$(KERNEL_CONFIG)/$${obj}; \ - done; ) > $(COMPONENT).o + done; ) > $(COMPONENT).filelist -do_all: $(COMPONENT).o +do_all: $(COMPONENT).filelist do_depend: do_all $(_v)${MD} -u Makedep -f -d `ls *.d` do_build_all: do_depend + %RULES include $(MakeInc_rule) diff --git a/libsa/conf/Makefile.x86_64 b/libsa/conf/Makefile.x86_64 index 7b0de925d..d7024f6c7 100644 --- a/libsa/conf/Makefile.x86_64 +++ b/libsa/conf/Makefile.x86_64 @@ -2,6 +2,7 @@ #BEGIN Machine dependent Makefile fragment for x86_64 
###################################################################### + ###################################################################### #END Machine dependent Makefile fragment for x86_64 ###################################################################### diff --git a/libsa/conf/files.ppc b/libsa/conf/files.ppc deleted file mode 100644 index 8b1378917..000000000 --- a/libsa/conf/files.ppc +++ /dev/null @@ -1 +0,0 @@ - diff --git a/libsa/conf/tools/Makefile b/libsa/conf/tools/Makefile deleted file mode 100644 index 4f9ccd553..000000000 --- a/libsa/conf/tools/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -SETUP_SUBDIRS = doconf - -COMP_SUBDIRS = doconf - -INST_SUBDIRS = \ - - -setup_build_all: - @echo "[ $(SOURCE) ] make setup_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - -do_build_all: - @echo "[ $(SOURCE) ] make do_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - -setup_build_install: - @echo "[ $(SOURCE) ] make setup_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - -do_build_install: - @echo "[ $(SOURCE) ] make do_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - -include $(MakeInc_rule) -include $(MakeInc_dir) - - diff --git a/libsa/conf/tools/doconf/Makefile b/libsa/conf/tools/doconf/Makefile deleted file mode 100644 index aa55a9419..000000000 --- a/libsa/conf/tools/doconf/Makefile +++ /dev/null @@ -1,47 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -COMP_SUBDIRS = \ - -INST_SUBDIRS = \ - - -# -# Who and where -# -BINDIR= -ifneq ($(MACHINE_CONFIG), 
DEFAULT) -DSTDIR= $(strip $(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)_$(MACHINE_CONFIG)/$(COMPONENT)/) -else -DSTDIR= $(strip $(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)/$(COMPONENT)/) -endif -PROGRAM= $(DSTDIR)doconf - -# -# How to install it -# -IFLAGS= -c -m 555 - -$(PROGRAM): $(DSTDIR)% : $(SOURCE)%.csh - @-$(RM) $(RMFLAGS) $(notdir $(PROGRAM)).VERS - @sed -e "s/#PROGRAM.*/#`vers_string $(notdir $(PROGRAM))`/" \ - < $< >$(notdir $(PROGRAM)).VERS; - @install $(IFLAGS) $(notdir $(PROGRAM)).VERS $(PROGRAM); - @-$(RM) $(RMFLAGS) $(notdir $(PROGRAM)).VERS; - -do_build_setup: $(PROGRAM) - -do_build_all: - -setup_build_install: - -do_build_install: - -include $(MakeInc_rule) -include $(MakeInc_dir) diff --git a/libsa/conf/tools/doconf/doconf.csh b/libsa/conf/tools/doconf/doconf.csh deleted file mode 100755 index 6fedb4786..000000000 --- a/libsa/conf/tools/doconf/doconf.csh +++ /dev/null @@ -1,321 +0,0 @@ -#!/bin/csh -f -set path = ($path .) -###################################################################### -# HISTORY -# 1-Dec-87 Michael Young (mwyoung) at Carnegie-Mellon University -# Added "-verbose" switch, so this script produces no output -# in the normal case. -# -# 10-Oct-87 Mike Accetta (mja) at Carnegie-Mellon University -# Flushed cmu_*.h and spin_locks.h -# [ V5.1(XF18) ] -# -# 6-Apr-87 Avadis Tevanian (avie) at Carnegie-Mellon University -# Use MASTER.local and MASTER..local for generation of -# configuration files in addition to MASTER and MASTER.. -# -# 25-Mar-87 Mike Accetta (mja) at Carnegie-Mellon University -# Removed use of obsolete wb_*.h files when building the feature -# list; modified to save the previous configuration file and -# display the differences between it and the new file. -# [ V5.1(F8) ] -# -# 25-Mar-87 Avadis Tevanian (avie) at Carnegie-Mellon University -# If there is no /etc/machine just print out a message telling -# user to use the -cpu option. 
I thought this script was supposed -# to work even without a /etc/machine, but it doesn't... and this -# is the easiest way out. -# -# 13-Mar-87 Mike Accetta (mja) at Carnegie-Mellon University -# Added "romp_fpa.h" file to extra features for the RT. -# [ V5.1(F7) ] -# -# 11-Mar-87 Mike Accetta (mja) at Carnegie-Mellon University -# Updated to maintain the appropriate configuration features file -# in the "machine" directory whenever the corresponding -# configuration is generated. This replaces the old mechanism of -# storing this directly in the file since it was -# machine dependent and also precluded building programs for more -# than one configuration from the same set of sources. -# [ V5.1(F6) ] -# -# 21-Feb-87 Mike Accetta (mja) at Carnegie-Mellon University -# Fixed to require wired-in cpu type names for only those -# machines where the kernel name differs from that provided by -# /etc/machine (i.e. IBMRT => ca and SUN => sun3); updated to -# permit configuration descriptions in both machine indepedent -# and dependent master configuration files so that attributes can -# be grouped accordingly. -# [ V5.1(F3) ] -# -# 17-Jan-87 Mike Accetta (mja) at Carnegie-Mellon University -# Updated to work from any directory at the same level as -# "conf"; generate configuration from both MASTER and -# MASTER. files; added -cpu switch. -# [ V5.1(F1) ] -# -# 18-Aug-86 Mike Accetta (mja) at Carnegie-Mellon University -# Added -make switch and changed meaning of -config; upgraded to -# allow multiple attributes per configuration and to define -# configurations in terms of these attributes within MASTER. -# -# 14-Apr-83 Mike Accetta (mja) at Carnegie-Mellon University -# Added -config switch to only run /etc/config without -# "make depend" and "make". 
-# -###################################################################### - -set prog=$0 -set prog=$prog:t -set nonomatch -set OBJDIR=../BUILD -if ("`/usr/bin/uname`" == "Rhapsody" ) then -set CONFIG_DIR=/usr/local/bin -else -set CONFIG_DIR=/usr/bin -endif - -unset domake -unset doconfig -unset beverbose -unset MACHINE -unset profile - -while ($#argv >= 1) - if ("$argv[1]" =~ -*) then - switch ("$argv[1]") - case "-c": - case "-config": - set doconfig - breaksw - case "-m": - case "-make": - set domake - breaksw - case "-cpu": - if ($#argv < 2) then - echo "${prog}: missing argument to ${argv[1]}" - exit 1 - endif - set MACHINE="$argv[2]" - shift - breaksw - case "-d": - if ($#argv < 2) then - echo "${prog}: missing argument to ${argv[1]}" - exit 1 - endif - set OBJDIR="$argv[2]" - shift - breaksw - case "-verbose": - set beverbose - breaksw - case "-p": - case "-profile": - set profile - breaksw - default: - echo "${prog}: ${argv[1]}: unknown switch" - exit 1 - breaksw - endsw - shift - else - break - endif -end - -if ($#argv == 0) set argv=(GENERIC) - -if (! $?MACHINE) then - if (-d /NextApps) then - set MACHINE=`hostinfo | awk '/MC680x0/ { printf("m68k") } /MC880x0/ { printf("m88k") }'` - endif -endif - -if (! $?MACHINE) then - if (-f /etc/machine) then - set MACHINE="`/etc/machine`" - else - echo "${prog}: no /etc/machine, specify machine type with -cpu" - echo "${prog}: e.g. 
${prog} -cpu VAX CONFIGURATION" - exit 1 - endif -endif - -set FEATURES_EXTRA= - -switch ("$MACHINE") - case IBMRT: - set cpu=ca - set ID=RT - set FEATURES_EXTRA="romp_dualcall.h romp_fpa.h" - breaksw - case SUN: - set cpu=sun3 - set ID=SUN3 - breaksw - default: - set cpu=`echo $MACHINE | tr A-Z a-z` - set ID=`echo $MACHINE | tr a-z A-Z` - breaksw -endsw -set FEATURES=../h/features.h -set FEATURES_H=(cs_*.h mach_*.h net_*.h\ - cputypes.h cpus.h vice.h\ - $FEATURES_EXTRA) -set MASTER_DIR=../conf -set MASTER = ${MASTER_DIR}/MASTER -set MASTER_CPU=${MASTER}.${cpu} - -set MASTER_LOCAL = ${MASTER}.local -set MASTER_CPU_LOCAL = ${MASTER_CPU}.local -if (! -f $MASTER_LOCAL) set MASTER_LOCAL = "" -if (! -f $MASTER_CPU_LOCAL) set MASTER_CPU_LOCAL = "" - -if (! -d $OBJDIR) then - if ($?beverbose) then - echo "[ creating $OBJDIR ]" - endif - mkdir -p $OBJDIR -endif - -foreach SYS ($argv) - set SYSID=${SYS}_${ID} - set SYSCONF=$OBJDIR/config.$SYSID - set BLDDIR=$OBJDIR - if ($?beverbose) then - echo "[ generating $SYSID from $MASTER_DIR/MASTER{,.$cpu}{,.local} ]" - endif - echo +$SYS \ - | \ - cat $MASTER $MASTER_LOCAL $MASTER_CPU $MASTER_CPU_LOCAL - \ - $MASTER $MASTER_LOCAL $MASTER_CPU $MASTER_CPU_LOCAL \ - | \ - sed -n \ - -e "/^+/{" \ - -e "s;[-+];#&;gp" \ - -e 't loop' \ - -e ': loop' \ - -e 'n' \ - -e '/^#/b loop' \ - -e '/^$/b loop' \ - -e 's;^\([^#]*\).*#[ ]*<\(.*\)>[ ]*$;\2#\1;' \ - -e 't not' \ - -e 's;\([^#]*\).*;#\1;' \ - -e 't not' \ - -e ': not' \ - -e 's;[ ]*$;;' \ - -e 's;^\!\(.*\);\1#\!;' \ - -e 'p' \ - -e 't loop' \ - -e 'b loop' \ - -e '}' \ - -e "/^[^#]/d" \ - -e 's; ; ;g' \ - -e "s;^# *\([^ ]*\)[ ]*=[ ]*\[\(.*\)\].*;\1#\2;p" \ - | \ - awk '-F#' '\ -part == 0 && $1 != "" {\ - m[$1]=m[$1] " " $2;\ - next;\ -}\ -part == 0 && $1 == "" {\ - for (i=NF;i>1;i--){\ - s=substr($i,2);\ - c[++na]=substr($i,1,1);\ - a[na]=s;\ - }\ - while (na > 0){\ - s=a[na];\ - d=c[na--];\ - if (m[s] == "") {\ - f[s]=d;\ - } else {\ - nx=split(m[s],x," ");\ - for (j=nx;j>0;j--) {\ - 
z=x[j];\ - a[++na]=z;\ - c[na]=d;\ - }\ - }\ - }\ - part=1;\ - next;\ -}\ -part != 0 {\ - if ($1 != "") {\ - n=split($1,x,",");\ - ok=0;\ - for (i=1;i<=n;i++) {\ - if (f[x[i]] == "+") {\ - ok=1;\ - }\ - }\ - if (NF > 2 && ok == 0 || NF <= 2 && ok != 0) {\ - print $2; \ - }\ - } else { \ - print $2; \ - }\ -}\ -' >$SYSCONF.new - if (-z $SYSCONF.new) then - echo "${prog}: ${$SYSID}: no such configuration in $MASTER_DIR/MASTER{,.$cpu}" - rm -f $SYSCONF.new - endif - if (! -d $BLDDIR) then - if ($?beverbose) then - echo "[ creating $BLDDIR ]" - endif - mkdir -p $BLDDIR - endif -# -# These paths are used by config. -# -# "builddir" is the name of the directory where kernel binaries -# are put. It is a single path element, never absolute, and is -# always relative to "objectdir". "builddir" is used by config -# solely to determine where to put files created by "config" (e.g. -# the created Makefile and *.h's.) -# -# "objectdir" is the name of the directory which will hold "builddir". -# It is a path; if relative, it is relative to the current directory -# where config is run. It's sole use is to be prepended to "builddir" -# to indicate where config-created files are to be placed (see above). -# -# "sourcedir" is the location of the sources used to build the kernel. -# It is a path; if relative, it is relative to the directory specified -# by the concatenation of "objectdir" and "builddir" (i.e. where the -# kernel binaries are put). 
-# - echo 'builddir "."' >> $SYSCONF.new - set OBJRELDIR=`$RELPATH $OBJROOT $OBJDIR` - echo 'objectdir "'$OBJROOT'/'$OBJRELDIR'"' >> $SYSCONF.new - set SRCDIR=`dirname $SOURCE` - echo 'sourcedir "'$SRCROOT'"' >> $SYSCONF.new - if (-f $SYSCONF) then - diff $SYSCONF $SYSCONF.new - rm -f $SYSCONF.old - mv $SYSCONF $SYSCONF.old - endif - rm -f $SYSCONF - mv $SYSCONF.new $SYSCONF - if ($?doconfig) then - if ($?beverbose) then - echo "[ configuring $SYSID ]" - endif - if ($?profile) then - $CONFIG_DIR/config -c $MASTER_DIR -p $SYSCONF - else - $CONFIG_DIR/config -c $MASTER_DIR $SYSCONF - endif - endif - if ($?domake) then - if ($?beverbose) then - echo "[ making $SYSID ]" - endif - (cd $BLDDIR; make) - endif -end diff --git a/libsa/lastkernelconstructor.c b/libsa/lastkernelconstructor.c index 97980f080..5b62f3fe6 100644 --- a/libsa/lastkernelconstructor.c +++ b/libsa/lastkernelconstructor.c @@ -26,10 +26,10 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -static void last_kernel_constructor(void) __attribute__ ((constructor,section("__TEXT, initcode"))); - extern void iokit_post_constructor_init(void); +static void last_kernel_constructor(void) __attribute__ ((constructor,section("__TEXT, initcode"))); + static void last_kernel_constructor(void) { iokit_post_constructor_init(); diff --git a/libsa/libsa/Makefile b/libsa/libsa/Makefile index c96349565..4554d46ba 100644 --- a/libsa/libsa/Makefile +++ b/libsa/libsa/Makefile @@ -8,13 +8,11 @@ include $(MakeInc_cmd) include $(MakeInc_def) INSTINC_SUBDIRS = -INSTINC_SUBDIRS_PPC = ${INSTINC_SUBDIRS} INSTINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS} INSTINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS} INSTINC_SUBDIRS_ARM = ${INSTINC_SUBDIRS} EXPINC_SUBDIRS = -EXPINC_SUBDIRS_PPC = ${EXPINC_SUBDIRS} EXPINC_SUBDIRS_I386 = ${EXPINC_SUBDIRS} EXPINC_SUBDIRS_X86_64 = ${EXPINC_SUBDIRS} EXPINC_SUBDIRS_ARM = ${EXPINC_SUBDIRS} diff --git a/libsyscall/BSDmakefile b/libsyscall/BSDmakefile deleted file mode 100644 index 57eda28ba..000000000 --- 
a/libsyscall/BSDmakefile +++ /dev/null @@ -1,141 +0,0 @@ -.ifndef DSTROOT -DSTROOT != x=`pwd`/DSTROOT && mkdir -p $$x && echo $$x -.endif -.ifndef OBJROOT -OBJROOT != x=`pwd`/OBJROOT && mkdir -p $$x && echo $$x -.endif -.ifndef SRCROOT -SRCROOT != dirname `pwd` -.endif -.ifndef SYMROOT -SYMROOT != x=`pwd`/SYMROOT && mkdir -p $$x && echo $$x -.endif -ARCH != arch -.ifndef RC_ARCHS -RC_ARCHS = $(ARCH) -RC_$(RC_ARCHS) = 1 -.endif -SDKROOT ?= / -NARCHS != echo $(RC_ARCHS) | wc -w -LIBSYS = $(SDKROOT)/usr/local/lib/system -NJOBS != perl -e '$$n = `/usr/sbin/sysctl -n hw.ncpu`; printf "%d\n", $$n < 2 ? 2 : ($$n * 1.5)' -BSDMAKE = bsdmake -f Makefile -BSDMAKEJ = $(BSDMAKE) -j $(NJOBS) - -# This variables are to guarantee that the left-hand side of an expression is -# always a variable -dynamic = dynamic -static = static - -# Remove the arch stuff, since we know better here. -LOCAL_CFLAGS != echo $(RC_CFLAGS) | sed 's/ *-arch [^ ][^ ]*//g' - -FORMS := debug dynamic profile static - -all: setup build - -build: build-debug build-dynamic build-profile build-static - -# These are the non B&I defaults -.ifndef RC_ProjectName -install: installhdrs install-all - -.else # RC_ProjectName - -install: setup -.for F in $(FORMS) -install: BI-install-$(F) -.endfor # FORMS -install: - install -c -m 444 $(OBJROOT)/sys/libsyscall.list $(DSTROOT)/usr/local/lib/system -.endif # RC_ProjectName - -.for F in $(FORMS) -.if $(dynamic) == $(F) -SUFFIX$(F) = -.else -SUFFIX$(F) = _$(F) -.endif -LIPOARGS$(F) != perl -e 'printf "%s\n", join(" ", map(qq(-arch $$_ \"$(OBJROOT)/obj.$$_/libsyscall$(SUFFIX$(F)).a\"), qw($(RC_ARCHS))))' - -.for A in $(RC_ARCHS) -build-$(F): build-$(A)-$(F) -.endfor # RC_ARCHS -build-$(F): - mkdir -p $(SYMROOT) -.if $(NARCHS) == 1 - cp -p "$(OBJROOT)/obj.$(RC_ARCHS)/libsyscall$(SUFFIX$(F)).a" "$(SYMROOT)" -.else - xcrun -sdk $(SDKROOT) lipo -create $(LIPOARGS$(F)) -output $(SYMROOT)/libsyscall$(SUFFIX$(F)).a -.endif - -.for A in $(RC_ARCHS) -build-$(A)-$(F): - mkdir -p 
$(OBJROOT)/obj.$(A) && \ - MAKEOBJDIR="$(OBJROOT)/obj.$(A)" MACHINE_ARCH="$(A)" \ - DSTROOT='$(DSTROOT)' OBJROOT='$(OBJROOT)' SYMROOT='$(SYMROOT)' \ - MAKEFLAGS="" MIGDEFINES="" CFLAGS="-arch $(A) $(LOCAL_CFLAGS)" $(BSDMAKEJ) libsyscall$(SUFFIX$(F)).a -.endfor # RC_ARCHS -.endfor # FORMS - -installhdrs: - MAKEOBJDIR="$(OBJROOT)" DESTDIR="$(DSTROOT)" MAKEFLAGS="" \ - DSTROOT='$(DSTROOT)' OBJROOT='$(OBJROOT)' SYMROOT='$(SYMROOT)' \ - MIGDEFINES="-DLIBSYSCALL_INTERFACE=1" \ - $(BSDMAKE) installhdrs -.for A in $(RC_ARCHS) - mkdir -p "$(OBJROOT)/obj.$(A)" && \ - MAKEOBJDIR="$(OBJROOT)/obj.$(A)" MACHINE_ARCH="$(A)" \ - DSTROOT='$(DSTROOT)' OBJROOT='$(OBJROOT)' SYMROOT='$(SYMROOT)' \ - MAKEFLAGS="" MIGDEFINES="" $(BSDMAKE) installhdrs-md -.endfor # RC_ARCHS - -.for F in $(FORMS) -BI-install-$(F): build-$(F) - mkdir -p $(DSTROOT)/usr/local/lib/system - if [ -f "$(SYMROOT)/libsyscall$(SUFFIX$(F)).a" ]; then \ - echo "Installing libsyscall$(SUFFIX$(F)).a" && \ - install -c -m 644 "$(SYMROOT)/libsyscall$(SUFFIX$(F)).a" \ - $(DSTROOT)/usr/local/lib/system && \ - ranlib "$(DSTROOT)/usr/local/lib/system/libsyscall$(SUFFIX$(F)).a"; \ - chmod 444 "$(DSTROOT)/usr/local/lib/system/libsyscall$(SUFFIX$(F)).a"; \ - fi -.endfor # FORMS - -install-man: - mkdir -p $(DSTROOT)/usr/share/man/man2 - MAKEOBJDIR="$(OBJROOT)" DESTDIR="$(DSTROOT)" \ - DSTROOT='$(DSTROOT)' OBJROOT='$(OBJROOT)' SYMROOT='$(SYMROOT)' \ - MACHINE_ARCH="$(ARCH)" MAKEFLAGS="" $(BSDMAKE) all-man maninstall - -install-all: setup build install-man -.for F in $(FORMS) -install-all: BI-install-$(F) -.endfor # FORMS - -clean: -.for F in $(FORMS) - rm -f $(OBJROOT)/libsyscall$(SUFFIX$(F)).a -.endfor # FORMS -.for A in $(RC_ARCHS) - rm -rf $(OBJROOT)/obj.$(A) -.endfor # RC_ARCHS - -INCLUDEDIR = $(OBJROOT)/include -SYSDIR = $(OBJROOT)/sys - -setup: $(INCLUDEDIR) $(SYSDIR) - -USR-INCLUDE = /usr/include -MOD-HEADERS = architecture/ppc/mode_independent_asm.h architecture/i386/asm_help.h - -$(INCLUDEDIR): - mkdir -p $(INCLUDEDIR) 
-.for h in $(MOD-HEADERS) - mkdir -p $(INCLUDEDIR)/$(h:H) - sed 's/\.globl/.private_extern/g' $(USR-INCLUDE)/$(h) > $(INCLUDEDIR)/$(h) -.endfor # MOD-HEADERS - -$(SYSDIR): - mkdir -p $(SYSDIR) - $(SRCROOT)/libsyscall/create-syscalls.pl $(SRCROOT)/bsd/kern/syscalls.master $(SRCROOT)/libsyscall/custom $(SYSDIR) diff --git a/libsyscall/GNUmakefile b/libsyscall/GNUmakefile deleted file mode 100644 index 6965e8628..000000000 --- a/libsyscall/GNUmakefile +++ /dev/null @@ -1,8 +0,0 @@ -# This GNUmakefile is only used when running "make" by-hand; it is not -# used by buildit or XBS - -all: - @bsdmake - -.DEFAULT: - @bsdmake $@ diff --git a/libsyscall/Libsyscall.xcconfig b/libsyscall/Libsyscall.xcconfig new file mode 100644 index 000000000..8881d5028 --- /dev/null +++ b/libsyscall/Libsyscall.xcconfig @@ -0,0 +1,31 @@ +#include "/Makefiles/CoreOS/Xcode/BSD.xcconfig" +BUILD_VARIANTS = normal +ONLY_ACTIVE_ARCH = NO +DEBUG_INFORMATION_FORMAT = dwarf-with-dsym +INSTALL_PATH = /usr/lib/system +INSTALL_PATH[sdk=iphoneos*] = /usr/lib/system +INSTALL_PATH[sdk=iphonesimulator*] = $(SDKROOT)/usr/lib/system +INSTALL_PATH[sdk=macosx*] = /usr/lib/system +PUBLIC_HEADERS_FOLDER_PATH = /usr/include/mach +PUBLIC_HEADERS_FOLDER_PATH[sdk=iphoneos*] = /usr/include/mach +PUBLIC_HEADERS_FOLDER_PATH[sdk=iphonesimulator*] = $(SDKROOT)/usr/include/mach +PUBLIC_HEADERS_FOLDER_PATH[sdk=macosx*] = /usr/include/mach +EXECUTABLE_PREFIX = libsystem_ +PRODUCT_NAME = kernel +ALWAYS_SEARCH_USER_PATHS = NO +OTHER_CFLAGS = -fdollars-in-identifiers -no-cpp-precomp -fno-common -fno-stack-protector -pipe -DLIBSYSCALL_INTERFACE -D__DARWIN_VERS_1050=1 +OTHER_CFLAGS[sdk=macosx*] = $(inherited) -DSYSCALL_PRE1050 +OTHER_CFLAGS[sdk=macosx*][arch=x86_64] = $(inherited) -DNO_SYSCALL_LEGACY +OTHER_CFLAGS[sdk=iphoneos*] = $(inherited) -DNO_SYSCALL_LEGACY +GCC_PREPROCESSOR_DEFINITIONS = CF_OPEN_SOURCE CF_EXCLUDE_CSTD_HEADERS DEBUG _FORTIFY_SOURCE=0 +HEADER_SEARCH_PATHS = 
/System/Library/Frameworks/System.framework/PrivateHeaders $(PROJECT_DIR)/mach $(PROJECT_DIR)/wrappers +WARNING_CFLAGS = -Wmost +GCC_TREAT_WARNINGS_AS_ERRORS = YES +GCC_WARN_ABOUT_MISSING_NEWLINE = YES +CODE_SIGN_IDENTITY = - +DYLIB_CURRENT_VERSION = $(RC_ProjectSourceVersion) +OTHER_LDFLAGS = +INSTALLHDRS_SCRIPT_PHASE = YES +USE_HEADERMAP = NO +LINK_WITH_STANDARD_LIBRARIES = NO +ALWAYS_SEARCH_USER_PATHS = YES diff --git a/libsyscall/Libsyscall.xcodeproj/project.pbxproj b/libsyscall/Libsyscall.xcodeproj/project.pbxproj new file mode 100644 index 000000000..6310cd437 --- /dev/null +++ b/libsyscall/Libsyscall.xcodeproj/project.pbxproj @@ -0,0 +1,1029 @@ +// !$*UTF8*$! +{ + archiveVersion = 1; + classes = { + }; + objectVersion = 45; + objects = { + +/* Begin PBXAggregateTarget section */ + 24614EF311E7C98600E78584 /* Syscalls */ = { + isa = PBXAggregateTarget; + buildConfigurationList = 24614EFD11E7C9B900E78584 /* Build configuration list for PBXAggregateTarget "Syscalls" */; + buildPhases = ( + 24614EF211E7C98600E78584 /* Generate Syscalls */, + 24614EF611E7C9A000E78584 /* Compile Syscalls */, + ); + dependencies = ( + ); + name = Syscalls; + productName = Syscalls; + }; + 249C61101194755D00ED73F3 /* Build */ = { + isa = PBXAggregateTarget; + buildConfigurationList = 249C61191194756B00ED73F3 /* Build configuration list for PBXAggregateTarget "Build" */; + buildPhases = ( + ); + dependencies = ( + 249C61151194756A00ED73F3 /* PBXTargetDependency */, + ); + name = Build; + productName = Build; + }; +/* End PBXAggregateTarget section */ + +/* Begin PBXBuildFile section */ + 240BAC4C1214770F000A1719 /* memcpy.c in Sources */ = {isa = PBXBuildFile; fileRef = 24B028D511FF4FBB00CA64A9 /* memcpy.c */; }; + 2419382B12135FF6003CDE41 /* chmod.c in Sources */ = {isa = PBXBuildFile; fileRef = 2419382A12135FF6003CDE41 /* chmod.c */; }; + 242AB66611EBDC1200107336 /* errno.c in Sources */ = {isa = PBXBuildFile; fileRef = 242AB66511EBDC1200107336 /* errno.c */; }; + 
24484A7511F6178E00E10CD2 /* string.c in Sources */ = {isa = PBXBuildFile; fileRef = 24484A7411F51E9800E10CD2 /* string.c */; }; + 24484A9411F61D2B00E10CD2 /* mig_reply_port.c in Sources */ = {isa = PBXBuildFile; fileRef = 24484A9311F61D1900E10CD2 /* mig_reply_port.c */; }; + 24614F0411E7CB5B00E78584 /* syscalls.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 24614F0311E7CB5B00E78584 /* syscalls.a */; }; + 247A08C211F8BDC900E4693F /* _libkernel_init.c in Sources */ = {isa = PBXBuildFile; fileRef = 247A08B311F8B05900E4693F /* _libkernel_init.c */; }; + 247A090011F8E18000E4693F /* abort.h in Headers */ = {isa = PBXBuildFile; fileRef = 247A08FF11F8E18000E4693F /* abort.h */; }; + 247A091711F8E7A800E4693F /* exc_catcher.h in Headers */ = {isa = PBXBuildFile; fileRef = 247A091611F8E7A800E4693F /* exc_catcher.h */; }; + 2485235511582D8F0051B413 /* mach_legacy.c in Sources */ = {isa = PBXBuildFile; fileRef = 2485235411582D8F0051B413 /* mach_legacy.c */; }; + 248AA963122C7B2A0085F5B1 /* unlink.c in Sources */ = {isa = PBXBuildFile; fileRef = 248AA962122C7B2A0085F5B1 /* unlink.c */; }; + 248AA965122C7C330085F5B1 /* rmdir.c in Sources */ = {isa = PBXBuildFile; fileRef = 248AA964122C7C330085F5B1 /* rmdir.c */; }; + 248AA967122C7CDA0085F5B1 /* rename.c in Sources */ = {isa = PBXBuildFile; fileRef = 248AA966122C7CDA0085F5B1 /* rename.c */; }; + 248BA01D121C56BF008C073F /* connect.c in Sources */ = {isa = PBXBuildFile; fileRef = 248BA01C121C56BF008C073F /* connect.c */; }; + 248BA01F121C607E008C073F /* fchmod.c in Sources */ = {isa = PBXBuildFile; fileRef = 248BA01E121C607E008C073F /* fchmod.c */; }; + 248BA04F121C8F06008C073F /* fcntl.c in Sources */ = {isa = PBXBuildFile; fileRef = 248BA04E121C8F06008C073F /* fcntl.c */; }; + 248BA05C121C9649008C073F /* fcntl-cancel.c in Sources */ = {isa = PBXBuildFile; fileRef = 248BA051121C8FE2008C073F /* fcntl-cancel.c */; }; + 248BA069121D9E27008C073F /* getrlimit.c in Sources */ = {isa = PBXBuildFile; fileRef = 
248BA068121D9E27008C073F /* getrlimit.c */; }; + 248BA080121DA36B008C073F /* ioctl.c in Sources */ = {isa = PBXBuildFile; fileRef = 248BA07F121DA36B008C073F /* ioctl.c */; }; + 248BA082121DA4F3008C073F /* kill.c in Sources */ = {isa = PBXBuildFile; fileRef = 248BA081121DA4F3008C073F /* kill.c */; }; + 248BA085121DA5E4008C073F /* kill.c in Sources */ = {isa = PBXBuildFile; fileRef = 248BA084121DA5E4008C073F /* kill.c */; }; + 248BA087121DA72D008C073F /* mmap.c in Sources */ = {isa = PBXBuildFile; fileRef = 248BA086121DA72D008C073F /* mmap.c */; }; + 248BA089121DA8E0008C073F /* mprotect.c in Sources */ = {isa = PBXBuildFile; fileRef = 248BA088121DA8E0008C073F /* mprotect.c */; }; + 248BA08B121DAC86008C073F /* msync.c in Sources */ = {isa = PBXBuildFile; fileRef = 248BA08A121DAC86008C073F /* msync.c */; }; + 248BA08D121DB0E7008C073F /* munmap.c in Sources */ = {isa = PBXBuildFile; fileRef = 248BA08C121DB0E7008C073F /* munmap.c */; }; + 248BA08F121DC545008C073F /* open.c in Sources */ = {isa = PBXBuildFile; fileRef = 248BA08E121DC545008C073F /* open.c */; }; + 248BA093121DE369008C073F /* select.c in Sources */ = {isa = PBXBuildFile; fileRef = 248BA092121DE369008C073F /* select.c */; }; + 248BA095121DE565008C073F /* select-pre1050.c in Sources */ = {isa = PBXBuildFile; fileRef = 248BA094121DE565008C073F /* select-pre1050.c */; }; + 248BA0B3121DE760008C073F /* select-cancel.c in Sources */ = {isa = PBXBuildFile; fileRef = 248BA0B2121DE760008C073F /* select-cancel.c */; }; + 248BA0BE121DE902008C073F /* select.c in Sources */ = {isa = PBXBuildFile; fileRef = 248BA0BC121DE902008C073F /* select.c */; }; + 248BA0CD121DEBEF008C073F /* setrlimit.c in Sources */ = {isa = PBXBuildFile; fileRef = 248BA0CC121DEBEF008C073F /* setrlimit.c */; }; + 249C610B1194750E00ED73F3 /* libsystem_kernel.a in Frameworks */ = {isa = PBXBuildFile; fileRef = D2AAC0630554660B00DB518D /* libsystem_kernel.a */; }; + 249C612F1194828600ED73F3 /* dylib_link.c in Sources */ = {isa = PBXBuildFile; fileRef = 
249C612C1194827D00ED73F3 /* dylib_link.c */; }; + 24A7C5BC11FF8DA6007669EB /* accept.c in Sources */ = {isa = PBXBuildFile; fileRef = 24A7C5AE11FF8DA6007669EB /* accept.c */; }; + 24A7C5BD11FF8DA6007669EB /* bind.c in Sources */ = {isa = PBXBuildFile; fileRef = 24A7C5AF11FF8DA6007669EB /* bind.c */; }; + 24A7C5BF11FF8DA6007669EB /* getattrlist.c in Sources */ = {isa = PBXBuildFile; fileRef = 24A7C5B111FF8DA6007669EB /* getattrlist.c */; }; + 24A7C5C011FF8DA6007669EB /* getpeername.c in Sources */ = {isa = PBXBuildFile; fileRef = 24A7C5B211FF8DA6007669EB /* getpeername.c */; }; + 24A7C5C111FF8DA6007669EB /* getsockname.c in Sources */ = {isa = PBXBuildFile; fileRef = 24A7C5B311FF8DA6007669EB /* getsockname.c */; }; + 24A7C5C211FF8DA6007669EB /* lchown.c in Sources */ = {isa = PBXBuildFile; fileRef = 24A7C5B411FF8DA6007669EB /* lchown.c */; }; + 24A7C5C311FF8DA6007669EB /* listen.c in Sources */ = {isa = PBXBuildFile; fileRef = 24A7C5B511FF8DA6007669EB /* listen.c */; }; + 24A7C5C411FF8DA6007669EB /* recvfrom.c in Sources */ = {isa = PBXBuildFile; fileRef = 24A7C5B611FF8DA6007669EB /* recvfrom.c */; }; + 24A7C5C511FF8DA6007669EB /* recvmsg.c in Sources */ = {isa = PBXBuildFile; fileRef = 24A7C5B711FF8DA6007669EB /* recvmsg.c */; }; + 24A7C5C611FF8DA6007669EB /* sendmsg.c in Sources */ = {isa = PBXBuildFile; fileRef = 24A7C5B811FF8DA6007669EB /* sendmsg.c */; }; + 24A7C5C711FF8DA6007669EB /* sendto.c in Sources */ = {isa = PBXBuildFile; fileRef = 24A7C5B911FF8DA6007669EB /* sendto.c */; }; + 24A7C5C811FF8DA6007669EB /* setattrlist.c in Sources */ = {isa = PBXBuildFile; fileRef = 24A7C5BA11FF8DA6007669EB /* setattrlist.c */; }; + 24A7C5C911FF8DA6007669EB /* socketpair.c in Sources */ = {isa = PBXBuildFile; fileRef = 24A7C5BB11FF8DA6007669EB /* socketpair.c */; }; + 24B028F511FF5C3500CA64A9 /* _libkernel_init.h in Headers */ = {isa = PBXBuildFile; fileRef = 247A08B211F8B05900E4693F /* _libkernel_init.h */; settings = {ATTRIBUTES = (Private, ); }; }; + 
24B223B0121DFD36007DAEDE /* sigsuspend.c in Sources */ = {isa = PBXBuildFile; fileRef = 24B223AF121DFD36007DAEDE /* sigsuspend.c */; }; + 24B223B2121DFE6D007DAEDE /* sigsuspend-cancel.c in Sources */ = {isa = PBXBuildFile; fileRef = 24B223B1121DFE6D007DAEDE /* sigsuspend-cancel.c */; }; + 24B223B5121DFF29007DAEDE /* sigsuspend.c in Sources */ = {isa = PBXBuildFile; fileRef = 24B223B4121DFF29007DAEDE /* sigsuspend.c */; }; + 24B8C2621237F53900D36CC3 /* remove-counter.c in Sources */ = {isa = PBXBuildFile; fileRef = 24B8C2611237F53900D36CC3 /* remove-counter.c */; }; + 24D1158311E671B20063D54D /* SYS.h in Headers */ = {isa = PBXBuildFile; fileRef = 24D1157411E671B20063D54D /* SYS.h */; }; + 24E4782712088267009A384D /* _libc_funcptr.c in Sources */ = {isa = PBXBuildFile; fileRef = 24E47824120881DF009A384D /* _libc_funcptr.c */; }; + C99A4F501305B2BD0054B7B7 /* __get_cpu_capabilities.s in Sources */ = {isa = PBXBuildFile; fileRef = C99A4F4E1305B1B70054B7B7 /* __get_cpu_capabilities.s */; }; + C99A4F531305B43F0054B7B7 /* init_cpu_capabilities.c in Sources */ = {isa = PBXBuildFile; fileRef = C99A4F511305B43F0054B7B7 /* init_cpu_capabilities.c */; }; + C9D9BD17114B00600000D8B9 /* vm_map_compat.c in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCC2114B00600000D8B9 /* vm_map_compat.c */; }; + C9D9BD19114B00600000D8B9 /* clock_priv.defs in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCC5114B00600000D8B9 /* clock_priv.defs */; }; + C9D9BD1A114B00600000D8B9 /* clock_reply.defs in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCC6114B00600000D8B9 /* clock_reply.defs */; }; + C9D9BD1B114B00600000D8B9 /* clock_sleep.c in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCC7114B00600000D8B9 /* clock_sleep.c */; }; + C9D9BD1C114B00600000D8B9 /* clock.defs in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCC8114B00600000D8B9 /* clock.defs */; }; + C9D9BD1D114B00600000D8B9 /* error_codes.c in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCD0114B00600000D8B9 /* 
error_codes.c */; }; + C9D9BD1E114B00600000D8B9 /* errorlib.h in Headers */ = {isa = PBXBuildFile; fileRef = C9D9BCD1114B00600000D8B9 /* errorlib.h */; }; + C9D9BD1F114B00600000D8B9 /* exc_catcher_state_identity.c in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCD2114B00600000D8B9 /* exc_catcher_state_identity.c */; }; + C9D9BD20114B00600000D8B9 /* exc_catcher_state.c in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCD3114B00600000D8B9 /* exc_catcher_state.c */; }; + C9D9BD21114B00600000D8B9 /* exc_catcher.c in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCD4114B00600000D8B9 /* exc_catcher.c */; }; + C9D9BD22114B00600000D8B9 /* exc.defs in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCD5114B00600000D8B9 /* exc.defs */; settings = {ATTRIBUTES = (Client, Server, ); }; }; + C9D9BD23114B00600000D8B9 /* externs.h in Headers */ = {isa = PBXBuildFile; fileRef = C9D9BCD6114B00600000D8B9 /* externs.h */; }; + C9D9BD24114B00600000D8B9 /* fprintf_stderr.c in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCD7114B00600000D8B9 /* fprintf_stderr.c */; }; + C9D9BD25114B00600000D8B9 /* errorlib.h in Headers */ = {isa = PBXBuildFile; fileRef = C9D9BCD9114B00600000D8B9 /* errorlib.h */; }; + C9D9BD26114B00600000D8B9 /* mach.h in Headers */ = {isa = PBXBuildFile; fileRef = C9D9BCDA114B00600000D8B9 /* mach.h */; settings = {ATTRIBUTES = (Public, ); }; }; + C9D9BD27114B00600000D8B9 /* mach_error.h in Headers */ = {isa = PBXBuildFile; fileRef = C9D9BCDB114B00600000D8B9 /* mach_error.h */; settings = {ATTRIBUTES = (Public, ); }; }; + C9D9BD28114B00600000D8B9 /* mach_init.h in Headers */ = {isa = PBXBuildFile; fileRef = C9D9BCDC114B00600000D8B9 /* mach_init.h */; settings = {ATTRIBUTES = (Public, ); }; }; + C9D9BD29114B00600000D8B9 /* mach_interface.h in Headers */ = {isa = PBXBuildFile; fileRef = C9D9BCDD114B00600000D8B9 /* mach_interface.h */; settings = {ATTRIBUTES = (Public, ); }; }; + C9D9BD2B114B00600000D8B9 /* port_obj.h in Headers */ = {isa = PBXBuildFile; fileRef 
= C9D9BCDF114B00600000D8B9 /* port_obj.h */; settings = {ATTRIBUTES = (Public, ); }; }; + C9D9BD2C114B00600000D8B9 /* sync.h in Headers */ = {isa = PBXBuildFile; fileRef = C9D9BCE0114B00600000D8B9 /* sync.h */; settings = {ATTRIBUTES = (Public, ); }; }; + C9D9BD2D114B00600000D8B9 /* task.h in Headers */ = {isa = PBXBuildFile; fileRef = C9D9BCE1114B00600000D8B9 /* task.h */; settings = {ATTRIBUTES = (Public, ); }; }; + C9D9BD2E114B00600000D8B9 /* thread_act.h in Headers */ = {isa = PBXBuildFile; fileRef = C9D9BCE2114B00600000D8B9 /* thread_act.h */; settings = {ATTRIBUTES = (Public, ); }; }; + C9D9BD2F114B00600000D8B9 /* vm_task.h in Headers */ = {isa = PBXBuildFile; fileRef = C9D9BCE3114B00600000D8B9 /* vm_task.h */; settings = {ATTRIBUTES = (Public, ); }; }; + C9D9BD30114B00600000D8B9 /* host_priv.defs in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCE4114B00600000D8B9 /* host_priv.defs */; }; + C9D9BD31114B00600000D8B9 /* host_security.defs in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCE5114B00600000D8B9 /* host_security.defs */; }; + C9D9BD34114B00600000D8B9 /* ledger.defs in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCE9114B00600000D8B9 /* ledger.defs */; }; + C9D9BD35114B00600000D8B9 /* lock_set.defs in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCEA114B00600000D8B9 /* lock_set.defs */; }; + C9D9BD36114B00600000D8B9 /* mach_error_string.c in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCEB114B00600000D8B9 /* mach_error_string.c */; }; + C9D9BD37114B00600000D8B9 /* mach_error.c in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCEC114B00600000D8B9 /* mach_error.c */; }; + C9D9BD38114B00600000D8B9 /* mach_host.defs in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCED114B00600000D8B9 /* mach_host.defs */; }; + C9D9BD3B114B00600000D8B9 /* mach_init.c in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCF0114B00600000D8B9 /* mach_init.c */; }; + C9D9BD3C114B00600000D8B9 /* mach_msg.c in Sources */ = {isa = PBXBuildFile; fileRef = 
C9D9BCF1114B00600000D8B9 /* mach_msg.c */; }; + C9D9BD3D114B00600000D8B9 /* mach_port.defs in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCF2114B00600000D8B9 /* mach_port.defs */; }; + C9D9BD3E114B00600000D8B9 /* mach_traps.s in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCF3114B00600000D8B9 /* mach_traps.s */; }; + C9D9BD3F114B00600000D8B9 /* mach_vm.defs in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCF4114B00600000D8B9 /* mach_vm.defs */; }; + C9D9BD41114B00600000D8B9 /* mig_allocate.c in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCF6114B00600000D8B9 /* mig_allocate.c */; }; + C9D9BD42114B00600000D8B9 /* mig_deallocate.c in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCF7114B00600000D8B9 /* mig_deallocate.c */; }; + C9D9BD43114B00600000D8B9 /* mig_reply_setup.c in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCF8114B00600000D8B9 /* mig_reply_setup.c */; }; + C9D9BD44114B00600000D8B9 /* mig_strncpy.c in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCF9114B00600000D8B9 /* mig_strncpy.c */; }; + C9D9BD45114B00600000D8B9 /* ms_thread_switch.c in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCFA114B00600000D8B9 /* ms_thread_switch.c */; }; + C9D9BD46114B00600000D8B9 /* notify.defs in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCFB114B00600000D8B9 /* notify.defs */; }; + C9D9BD47114B00600000D8B9 /* panic.c in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCFC114B00600000D8B9 /* panic.c */; }; + C9D9BD48114B00600000D8B9 /* port_obj.c in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCFD114B00600000D8B9 /* port_obj.c */; }; + C9D9BD4C114B00600000D8B9 /* processor_set.defs in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BD03114B00600000D8B9 /* processor_set.defs */; }; + C9D9BD4D114B00600000D8B9 /* processor.defs in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BD04114B00600000D8B9 /* processor.defs */; }; + C9D9BD4F114B00600000D8B9 /* semaphore.c in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BD06114B00600000D8B9 /* 
semaphore.c */; }; + C9D9BD50114B00600000D8B9 /* key_defs.h in Headers */ = {isa = PBXBuildFile; fileRef = C9D9BD08114B00600000D8B9 /* key_defs.h */; }; + C9D9BD51114B00600000D8B9 /* ls_defs.h in Headers */ = {isa = PBXBuildFile; fileRef = C9D9BD09114B00600000D8B9 /* ls_defs.h */; }; + C9D9BD53114B00600000D8B9 /* netname.defs in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BD0B114B00600000D8B9 /* netname.defs */; }; + C9D9BD54114B00600000D8B9 /* netname_defs.h in Headers */ = {isa = PBXBuildFile; fileRef = C9D9BD0C114B00600000D8B9 /* netname_defs.h */; }; + C9D9BD55114B00600000D8B9 /* nm_defs.h in Headers */ = {isa = PBXBuildFile; fileRef = C9D9BD0D114B00600000D8B9 /* nm_defs.h */; }; + C9D9BD56114B00600000D8B9 /* slot_name.c in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BD0E114B00600000D8B9 /* slot_name.c */; }; + C9D9BD57114B00600000D8B9 /* task.defs in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BD0F114B00600000D8B9 /* task.defs */; }; + C9D9BD58114B00600000D8B9 /* thread_act.defs in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BD10114B00600000D8B9 /* thread_act.defs */; }; + C9D9BD59114B00600000D8B9 /* vm_map.defs in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BD11114B00600000D8B9 /* vm_map.defs */; }; +/* End PBXBuildFile section */ + +/* Begin PBXContainerItemProxy section */ + 242AB67811ED03ED00107336 /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 08FB7793FE84155DC02AAC07 /* Project object */; + proxyType = 1; + remoteGlobalIDString = 24614EF311E7C98600E78584; + remoteInfo = Syscalls; + }; + 249C61091194750700ED73F3 /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 08FB7793FE84155DC02AAC07 /* Project object */; + proxyType = 1; + remoteGlobalIDString = D2AAC0620554660B00DB518D; + remoteInfo = "Libmach Static"; + }; + 249C61141194756A00ED73F3 /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 08FB7793FE84155DC02AAC07 /* Project object */; + 
proxyType = 1; + remoteGlobalIDString = 249C60FE1194747600ED73F3; + remoteInfo = Libmach; + }; +/* End PBXContainerItemProxy section */ + +/* Begin PBXFileReference section */ + 240D716711933ED300556E97 /* mach_install_mig.sh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.script.sh; path = mach_install_mig.sh; sourceTree = ""; }; + 2419382A12135FF6003CDE41 /* chmod.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = chmod.c; sourceTree = ""; }; + 2427FA821200BCF800EF7A1F /* compat-symlinks.sh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.script.sh; path = "compat-symlinks.sh"; sourceTree = ""; }; + 242AB66511EBDC1200107336 /* errno.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = errno.c; sourceTree = ""; }; + 24484A7311F51E9800E10CD2 /* string.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = string.h; sourceTree = ""; }; + 24484A7411F51E9800E10CD2 /* string.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = string.c; sourceTree = ""; }; + 24484A9211F61D1900E10CD2 /* mig_reply_port.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mig_reply_port.h; sourceTree = ""; }; + 24484A9311F61D1900E10CD2 /* mig_reply_port.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mig_reply_port.c; sourceTree = ""; }; + 24614EA111E7A2ED00E78584 /* compile-syscalls.pl */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.script.perl; path = "compile-syscalls.pl"; sourceTree = ""; }; + 24614F0311E7CB5B00E78584 /* syscalls.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = syscalls.a; path = $BUILD_ROOT/syscalls.a; sourceTree = ""; }; + 247A08B211F8B05900E4693F /* _libkernel_init.h */ = {isa = PBXFileReference; fileEncoding = 4; 
lastKnownFileType = sourcecode.c.h; path = _libkernel_init.h; sourceTree = ""; }; + 247A08B311F8B05900E4693F /* _libkernel_init.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = _libkernel_init.c; sourceTree = ""; }; + 247A08FF11F8E18000E4693F /* abort.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = abort.h; sourceTree = ""; }; + 247A091611F8E7A800E4693F /* exc_catcher.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = exc_catcher.h; sourceTree = ""; }; + 2485235411582D8F0051B413 /* mach_legacy.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mach_legacy.c; sourceTree = ""; }; + 248AA962122C7B2A0085F5B1 /* unlink.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = unlink.c; sourceTree = ""; }; + 248AA964122C7C330085F5B1 /* rmdir.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = rmdir.c; sourceTree = ""; }; + 248AA966122C7CDA0085F5B1 /* rename.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = rename.c; sourceTree = ""; }; + 248BA01C121C56BF008C073F /* connect.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = connect.c; sourceTree = ""; }; + 248BA01E121C607E008C073F /* fchmod.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = fchmod.c; sourceTree = ""; }; + 248BA04B121C8EE4008C073F /* fcntl-base.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "fcntl-base.c"; sourceTree = ""; }; + 248BA04E121C8F06008C073F /* fcntl.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = fcntl.c; sourceTree = ""; }; + 248BA051121C8FE2008C073F /* fcntl-cancel.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = 
sourcecode.c.c; path = "fcntl-cancel.c"; sourceTree = ""; }; + 248BA068121D9E27008C073F /* getrlimit.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = getrlimit.c; sourceTree = ""; }; + 248BA07F121DA36B008C073F /* ioctl.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = ioctl.c; sourceTree = ""; }; + 248BA081121DA4F3008C073F /* kill.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = kill.c; sourceTree = ""; }; + 248BA084121DA5E4008C073F /* kill.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = kill.c; sourceTree = ""; }; + 248BA086121DA72D008C073F /* mmap.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mmap.c; sourceTree = ""; }; + 248BA088121DA8E0008C073F /* mprotect.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mprotect.c; sourceTree = ""; }; + 248BA08A121DAC86008C073F /* msync.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = msync.c; sourceTree = ""; }; + 248BA08C121DB0E7008C073F /* munmap.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = munmap.c; sourceTree = ""; }; + 248BA08E121DC545008C073F /* open.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = open.c; sourceTree = ""; }; + 248BA090121DDD7F008C073F /* select-base.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "select-base.c"; sourceTree = ""; }; + 248BA092121DE369008C073F /* select.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = select.c; sourceTree = ""; }; + 248BA094121DE565008C073F /* select-pre1050.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "select-pre1050.c"; sourceTree = ""; 
}; + 248BA0B2121DE760008C073F /* select-cancel.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "select-cancel.c"; sourceTree = ""; }; + 248BA0BC121DE902008C073F /* select.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = select.c; sourceTree = ""; }; + 248BA0CC121DEBEF008C073F /* setrlimit.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = setrlimit.c; sourceTree = ""; }; + 249C60FF1194747600ED73F3 /* libsystem_kernel.dylib */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.dylib"; includeInIndex = 0; path = libsystem_kernel.dylib; sourceTree = BUILT_PRODUCTS_DIR; }; + 249C612C1194827D00ED73F3 /* dylib_link.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = dylib_link.c; sourceTree = ""; }; + 24A7C5AE11FF8DA6007669EB /* accept.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = accept.c; sourceTree = ""; }; + 24A7C5AF11FF8DA6007669EB /* bind.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = bind.c; sourceTree = ""; }; + 24A7C5B111FF8DA6007669EB /* getattrlist.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = getattrlist.c; sourceTree = ""; }; + 24A7C5B211FF8DA6007669EB /* getpeername.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = getpeername.c; sourceTree = ""; }; + 24A7C5B311FF8DA6007669EB /* getsockname.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = getsockname.c; sourceTree = ""; }; + 24A7C5B411FF8DA6007669EB /* lchown.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = lchown.c; sourceTree = ""; }; + 24A7C5B511FF8DA6007669EB /* listen.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; 
path = listen.c; sourceTree = ""; }; + 24A7C5B611FF8DA6007669EB /* recvfrom.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = recvfrom.c; sourceTree = ""; }; + 24A7C5B711FF8DA6007669EB /* recvmsg.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = recvmsg.c; sourceTree = ""; }; + 24A7C5B811FF8DA6007669EB /* sendmsg.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = sendmsg.c; sourceTree = ""; }; + 24A7C5B911FF8DA6007669EB /* sendto.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = sendto.c; sourceTree = ""; }; + 24A7C5BA11FF8DA6007669EB /* setattrlist.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = setattrlist.c; sourceTree = ""; }; + 24A7C5BB11FF8DA6007669EB /* socketpair.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = socketpair.c; sourceTree = ""; }; + 24A7C5CB11FF973C007669EB /* _errno.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = _errno.h; sourceTree = ""; }; + 24B028D511FF4FBB00CA64A9 /* memcpy.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = memcpy.c; sourceTree = ""; }; + 24B223AF121DFD36007DAEDE /* sigsuspend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = sigsuspend.c; sourceTree = ""; }; + 24B223B1121DFE6D007DAEDE /* sigsuspend-cancel.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "sigsuspend-cancel.c"; sourceTree = ""; }; + 24B223B3121DFF12007DAEDE /* sigsuspend-base.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "sigsuspend-base.c"; sourceTree = ""; }; + 24B223B4121DFF29007DAEDE /* sigsuspend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = 
sourcecode.c.c; path = sigsuspend.c; sourceTree = ""; }; + 24B8C2611237F53900D36CC3 /* remove-counter.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "remove-counter.c"; sourceTree = ""; }; + 24D1156611E671B20063D54D /* __fork.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = __fork.s; sourceTree = ""; }; + 24D1156711E671B20063D54D /* __getpid.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = __getpid.s; sourceTree = ""; }; + 24D1156811E671B20063D54D /* __gettimeofday.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = __gettimeofday.s; sourceTree = ""; }; + 24D1156911E671B20063D54D /* __lseek.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = __lseek.s; sourceTree = ""; }; + 24D1156A11E671B20063D54D /* __pipe.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = __pipe.s; sourceTree = ""; }; + 24D1156B11E671B20063D54D /* __psynch_cvbroad.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = __psynch_cvbroad.s; sourceTree = ""; }; + 24D1156C11E671B20063D54D /* __psynch_cvwait.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = __psynch_cvwait.s; sourceTree = ""; }; + 24D1156D11E671B20063D54D /* __ptrace.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = __ptrace.s; sourceTree = ""; }; + 24D1156E11E671B20063D54D /* __sigaltstack.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = __sigaltstack.s; sourceTree = ""; }; + 24D1156F11E671B20063D54D /* __sigreturn.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = __sigreturn.s; sourceTree = ""; }; + 24D1157011E671B20063D54D /* __syscall.s */ = {isa = PBXFileReference; 
fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = __syscall.s; sourceTree = ""; }; + 24D1157111E671B20063D54D /* __thread_selfid.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = __thread_selfid.s; sourceTree = ""; }; + 24D1157211E671B20063D54D /* __vfork.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = __vfork.s; sourceTree = ""; }; + 24D1157311E671B20063D54D /* custom.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = custom.s; sourceTree = ""; }; + 24D1157411E671B20063D54D /* SYS.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = SYS.h; sourceTree = ""; }; + 24D1158C11E672270063D54D /* syscall.map */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = syscall.map; sourceTree = ""; }; + 24D1158F11E672270063D54D /* syscall.map */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = syscall.map; sourceTree = ""; }; + 24D1159111E672270063D54D /* syscall.map */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = syscall.map; sourceTree = ""; }; + 24D1159711E672270063D54D /* syscall.map */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = syscall.map; sourceTree = ""; }; + 24D1159811E672270063D54D /* syscall.map */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = syscall.map; sourceTree = ""; }; + 24D1159911E6723E0063D54D /* create-syscalls.pl */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.script.perl; path = "create-syscalls.pl"; sourceTree = ""; }; + 24E47824120881DF009A384D /* _libc_funcptr.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = _libc_funcptr.c; sourceTree = ""; }; + C99A4F4E1305B1B70054B7B7 /* __get_cpu_capabilities.s */ = {isa = PBXFileReference; lastKnownFileType = 
sourcecode.asm; path = __get_cpu_capabilities.s; sourceTree = ""; }; + C99A4F511305B43F0054B7B7 /* init_cpu_capabilities.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = init_cpu_capabilities.c; sourceTree = ""; }; + C9D9BCBF114B00600000D8B9 /* .open_source_exclude */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = .open_source_exclude; sourceTree = ""; }; + C9D9BCC2114B00600000D8B9 /* vm_map_compat.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = vm_map_compat.c; sourceTree = ""; }; + C9D9BCC5114B00600000D8B9 /* clock_priv.defs */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.mig; path = clock_priv.defs; sourceTree = ""; }; + C9D9BCC6114B00600000D8B9 /* clock_reply.defs */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.mig; path = clock_reply.defs; sourceTree = ""; }; + C9D9BCC7114B00600000D8B9 /* clock_sleep.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = clock_sleep.c; sourceTree = ""; }; + C9D9BCC8114B00600000D8B9 /* clock.defs */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.mig; path = clock.defs; sourceTree = ""; }; + C9D9BCC9114B00600000D8B9 /* err_iokit.sub */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = err_iokit.sub; sourceTree = ""; }; + C9D9BCCA114B00600000D8B9 /* err_ipc.sub */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = err_ipc.sub; sourceTree = ""; }; + C9D9BCCB114B00600000D8B9 /* err_kern.sub */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = err_kern.sub; sourceTree = ""; }; + C9D9BCCC114B00600000D8B9 /* err_libkern.sub */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = err_libkern.sub; sourceTree = ""; }; + C9D9BCCD114B00600000D8B9 /* err_mach_ipc.sub */ = {isa = 
PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = err_mach_ipc.sub; sourceTree = ""; }; + C9D9BCCE114B00600000D8B9 /* err_server.sub */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = err_server.sub; sourceTree = ""; }; + C9D9BCCF114B00600000D8B9 /* err_us.sub */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = err_us.sub; sourceTree = ""; }; + C9D9BCD0114B00600000D8B9 /* error_codes.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = error_codes.c; sourceTree = ""; }; + C9D9BCD1114B00600000D8B9 /* errorlib.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = errorlib.h; sourceTree = ""; }; + C9D9BCD2114B00600000D8B9 /* exc_catcher_state_identity.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = exc_catcher_state_identity.c; sourceTree = ""; }; + C9D9BCD3114B00600000D8B9 /* exc_catcher_state.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = exc_catcher_state.c; sourceTree = ""; }; + C9D9BCD4114B00600000D8B9 /* exc_catcher.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = exc_catcher.c; sourceTree = ""; }; + C9D9BCD5114B00600000D8B9 /* exc.defs */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.mig; path = exc.defs; sourceTree = ""; }; + C9D9BCD6114B00600000D8B9 /* externs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = externs.h; sourceTree = ""; }; + C9D9BCD7114B00600000D8B9 /* fprintf_stderr.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = fprintf_stderr.c; sourceTree = ""; }; + C9D9BCD9114B00600000D8B9 /* errorlib.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = errorlib.h; sourceTree = ""; }; + C9D9BCDA114B00600000D8B9 
/* mach.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mach.h; sourceTree = ""; }; + C9D9BCDB114B00600000D8B9 /* mach_error.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mach_error.h; sourceTree = ""; }; + C9D9BCDC114B00600000D8B9 /* mach_init.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mach_init.h; sourceTree = ""; }; + C9D9BCDD114B00600000D8B9 /* mach_interface.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mach_interface.h; sourceTree = ""; }; + C9D9BCDF114B00600000D8B9 /* port_obj.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = port_obj.h; sourceTree = ""; }; + C9D9BCE0114B00600000D8B9 /* sync.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sync.h; sourceTree = ""; }; + C9D9BCE1114B00600000D8B9 /* task.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = task.h; sourceTree = ""; }; + C9D9BCE2114B00600000D8B9 /* thread_act.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = thread_act.h; sourceTree = ""; }; + C9D9BCE3114B00600000D8B9 /* vm_task.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = vm_task.h; sourceTree = ""; }; + C9D9BCE4114B00600000D8B9 /* host_priv.defs */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.mig; path = host_priv.defs; sourceTree = ""; }; + C9D9BCE5114B00600000D8B9 /* host_security.defs */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.mig; path = host_security.defs; sourceTree = ""; }; + C9D9BCE9114B00600000D8B9 /* ledger.defs */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.mig; path = ledger.defs; sourceTree = ""; }; + C9D9BCEA114B00600000D8B9 /* 
lock_set.defs */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.mig; path = lock_set.defs; sourceTree = ""; }; + C9D9BCEB114B00600000D8B9 /* mach_error_string.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mach_error_string.c; sourceTree = ""; }; + C9D9BCEC114B00600000D8B9 /* mach_error.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mach_error.c; sourceTree = ""; }; + C9D9BCED114B00600000D8B9 /* mach_host.defs */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.mig; path = mach_host.defs; sourceTree = ""; }; + C9D9BCF0114B00600000D8B9 /* mach_init.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mach_init.c; sourceTree = ""; }; + C9D9BCF1114B00600000D8B9 /* mach_msg.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mach_msg.c; sourceTree = ""; }; + C9D9BCF2114B00600000D8B9 /* mach_port.defs */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.mig; path = mach_port.defs; sourceTree = ""; }; + C9D9BCF3114B00600000D8B9 /* mach_traps.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = mach_traps.s; sourceTree = ""; }; + C9D9BCF4114B00600000D8B9 /* mach_vm.defs */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.mig; path = mach_vm.defs; sourceTree = ""; }; + C9D9BCF6114B00600000D8B9 /* mig_allocate.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mig_allocate.c; sourceTree = ""; }; + C9D9BCF7114B00600000D8B9 /* mig_deallocate.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mig_deallocate.c; sourceTree = ""; }; + C9D9BCF8114B00600000D8B9 /* mig_reply_setup.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = 
mig_reply_setup.c; sourceTree = ""; }; + C9D9BCF9114B00600000D8B9 /* mig_strncpy.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mig_strncpy.c; sourceTree = ""; }; + C9D9BCFA114B00600000D8B9 /* ms_thread_switch.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = ms_thread_switch.c; sourceTree = ""; }; + C9D9BCFB114B00600000D8B9 /* notify.defs */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.mig; path = notify.defs; sourceTree = ""; }; + C9D9BCFC114B00600000D8B9 /* panic.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = panic.c; sourceTree = ""; }; + C9D9BCFD114B00600000D8B9 /* port_obj.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = port_obj.c; sourceTree = ""; }; + C9D9BD03114B00600000D8B9 /* processor_set.defs */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.mig; path = processor_set.defs; sourceTree = ""; }; + C9D9BD04114B00600000D8B9 /* processor.defs */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.mig; path = processor.defs; sourceTree = ""; }; + C9D9BD06114B00600000D8B9 /* semaphore.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = semaphore.c; sourceTree = ""; }; + C9D9BD08114B00600000D8B9 /* key_defs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = key_defs.h; sourceTree = ""; }; + C9D9BD09114B00600000D8B9 /* ls_defs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ls_defs.h; sourceTree = ""; }; + C9D9BD0B114B00600000D8B9 /* netname.defs */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.mig; path = netname.defs; sourceTree = ""; }; + C9D9BD0C114B00600000D8B9 /* netname_defs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType 
= sourcecode.c.h; path = netname_defs.h; sourceTree = ""; }; + C9D9BD0D114B00600000D8B9 /* nm_defs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = nm_defs.h; sourceTree = ""; }; + C9D9BD0E114B00600000D8B9 /* slot_name.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = slot_name.c; sourceTree = ""; }; + C9D9BD0F114B00600000D8B9 /* task.defs */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.mig; path = task.defs; sourceTree = ""; }; + C9D9BD10114B00600000D8B9 /* thread_act.defs */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.mig; path = thread_act.defs; sourceTree = ""; }; + C9D9BD11114B00600000D8B9 /* vm_map.defs */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.mig; path = vm_map.defs; sourceTree = ""; }; + C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.xcconfig; path = Libsyscall.xcconfig; sourceTree = ""; }; + D2AAC0630554660B00DB518D /* libsystem_kernel.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libsystem_kernel.a; sourceTree = BUILT_PRODUCTS_DIR; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + 249C60FD1194747600ED73F3 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + 249C610B1194750E00ED73F3 /* libsystem_kernel.a in Frameworks */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; + D289988505E68E00004EDB86 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + 24614F0411E7CB5B00E78584 /* syscalls.a in Frameworks */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + 08FB7794FE84155DC02AAC07 /* mach */ = { + isa = PBXGroup; + children = ( + 
C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */, + 24D1158911E672270063D54D /* Platforms */, + 24D1156511E671B20063D54D /* custom */, + 08FB7795FE84155DC02AAC07 /* mach */, + 247A08B011F8AF1700E4693F /* wrappers */, + 240D716611933ED300556E97 /* xcodescripts */, + 1AB674ADFE9D54B511CA2CBB /* Products */, + ); + name = mach; + sourceTree = ""; + }; + 08FB7795FE84155DC02AAC07 /* mach */ = { + isa = PBXGroup; + children = ( + C9D9BCBE114B00600000D8B9 /* arm */, + 247A08FF11F8E18000E4693F /* abort.h */, + C9D9BCC5114B00600000D8B9 /* clock_priv.defs */, + C9D9BCC6114B00600000D8B9 /* clock_reply.defs */, + C9D9BCC7114B00600000D8B9 /* clock_sleep.c */, + C9D9BCC8114B00600000D8B9 /* clock.defs */, + C9D9BCC9114B00600000D8B9 /* err_iokit.sub */, + C9D9BCCA114B00600000D8B9 /* err_ipc.sub */, + C9D9BCCB114B00600000D8B9 /* err_kern.sub */, + C9D9BCCC114B00600000D8B9 /* err_libkern.sub */, + C9D9BCCD114B00600000D8B9 /* err_mach_ipc.sub */, + C9D9BCCE114B00600000D8B9 /* err_server.sub */, + C9D9BCCF114B00600000D8B9 /* err_us.sub */, + C9D9BCD0114B00600000D8B9 /* error_codes.c */, + C9D9BCD1114B00600000D8B9 /* errorlib.h */, + 247A091611F8E7A800E4693F /* exc_catcher.h */, + C9D9BCD2114B00600000D8B9 /* exc_catcher_state_identity.c */, + C9D9BCD3114B00600000D8B9 /* exc_catcher_state.c */, + C9D9BCD4114B00600000D8B9 /* exc_catcher.c */, + C9D9BCD5114B00600000D8B9 /* exc.defs */, + C9D9BCD6114B00600000D8B9 /* externs.h */, + C9D9BCD7114B00600000D8B9 /* fprintf_stderr.c */, + C9D9BCD8114B00600000D8B9 /* mach */, + C9D9BCE4114B00600000D8B9 /* host_priv.defs */, + C9D9BCE5114B00600000D8B9 /* host_security.defs */, + C9D9BCE9114B00600000D8B9 /* ledger.defs */, + C9D9BCEA114B00600000D8B9 /* lock_set.defs */, + C9D9BCEB114B00600000D8B9 /* mach_error_string.c */, + C9D9BCEC114B00600000D8B9 /* mach_error.c */, + C9D9BCED114B00600000D8B9 /* mach_host.defs */, + C9D9BCF0114B00600000D8B9 /* mach_init.c */, + 2485235411582D8F0051B413 /* mach_legacy.c */, + C9D9BCF1114B00600000D8B9 /* 
mach_msg.c */, + C9D9BCF2114B00600000D8B9 /* mach_port.defs */, + C9D9BCF3114B00600000D8B9 /* mach_traps.s */, + C9D9BCF4114B00600000D8B9 /* mach_vm.defs */, + C9D9BCF6114B00600000D8B9 /* mig_allocate.c */, + C9D9BCF7114B00600000D8B9 /* mig_deallocate.c */, + C9D9BCF8114B00600000D8B9 /* mig_reply_setup.c */, + C9D9BCF9114B00600000D8B9 /* mig_strncpy.c */, + 24484A9211F61D1900E10CD2 /* mig_reply_port.h */, + 24484A9311F61D1900E10CD2 /* mig_reply_port.c */, + C9D9BCFA114B00600000D8B9 /* ms_thread_switch.c */, + C9D9BCFB114B00600000D8B9 /* notify.defs */, + C9D9BCFC114B00600000D8B9 /* panic.c */, + C9D9BCFD114B00600000D8B9 /* port_obj.c */, + C9D9BD03114B00600000D8B9 /* processor_set.defs */, + C9D9BD04114B00600000D8B9 /* processor.defs */, + C9D9BD06114B00600000D8B9 /* semaphore.c */, + C9D9BD07114B00600000D8B9 /* servers */, + C9D9BD0E114B00600000D8B9 /* slot_name.c */, + 24484A7311F51E9800E10CD2 /* string.h */, + 24484A7411F51E9800E10CD2 /* string.c */, + C9D9BD0F114B00600000D8B9 /* task.defs */, + C9D9BD10114B00600000D8B9 /* thread_act.defs */, + C9D9BD11114B00600000D8B9 /* vm_map.defs */, + 249C612C1194827D00ED73F3 /* dylib_link.c */, + ); + path = mach; + sourceTree = ""; + }; + 1AB674ADFE9D54B511CA2CBB /* Products */ = { + isa = PBXGroup; + children = ( + 24614F0311E7CB5B00E78584 /* syscalls.a */, + D2AAC0630554660B00DB518D /* libsystem_kernel.a */, + 249C60FF1194747600ED73F3 /* libsystem_kernel.dylib */, + ); + name = Products; + sourceTree = ""; + }; + 240D716611933ED300556E97 /* xcodescripts */ = { + isa = PBXGroup; + children = ( + 24D1159911E6723E0063D54D /* create-syscalls.pl */, + 24614EA111E7A2ED00E78584 /* compile-syscalls.pl */, + 240D716711933ED300556E97 /* mach_install_mig.sh */, + 2427FA821200BCF800EF7A1F /* compat-symlinks.sh */, + ); + path = xcodescripts; + sourceTree = ""; + }; + 2419382912135FE1003CDE41 /* unix03 */ = { + isa = PBXGroup; + children = ( + 2419382A12135FF6003CDE41 /* chmod.c */, + 248BA01E121C607E008C073F /* fchmod.c */, + 
248BA068121D9E27008C073F /* getrlimit.c */, + 248BA086121DA72D008C073F /* mmap.c */, + 248BA0CC121DEBEF008C073F /* setrlimit.c */, + ); + path = unix03; + sourceTree = ""; + }; + 247A08B011F8AF1700E4693F /* wrappers */ = { + isa = PBXGroup; + children = ( + 248BA04A121C8EE4008C073F /* cancelable */, + 2419382912135FE1003CDE41 /* unix03 */, + 24A7C6951200AF8A007669EB /* legacy */, + C99A4F4E1305B1B70054B7B7 /* __get_cpu_capabilities.s */, + 247A08B211F8B05900E4693F /* _libkernel_init.h */, + 247A08B311F8B05900E4693F /* _libkernel_init.c */, + 24E47824120881DF009A384D /* _libc_funcptr.c */, + 24A7C5CB11FF973C007669EB /* _errno.h */, + C99A4F511305B43F0054B7B7 /* init_cpu_capabilities.c */, + 248BA07F121DA36B008C073F /* ioctl.c */, + 248BA081121DA4F3008C073F /* kill.c */, + 24B028D511FF4FBB00CA64A9 /* memcpy.c */, + 24B8C2611237F53900D36CC3 /* remove-counter.c */, + 248AA966122C7CDA0085F5B1 /* rename.c */, + 248AA964122C7C330085F5B1 /* rmdir.c */, + 248BA090121DDD7F008C073F /* select-base.c */, + 24B223B3121DFF12007DAEDE /* sigsuspend-base.c */, + 248AA962122C7B2A0085F5B1 /* unlink.c */, + ); + path = wrappers; + sourceTree = ""; + }; + 248BA04A121C8EE4008C073F /* cancelable */ = { + isa = PBXGroup; + children = ( + 248BA04B121C8EE4008C073F /* fcntl-base.c */, + 248BA04E121C8F06008C073F /* fcntl.c */, + 248BA051121C8FE2008C073F /* fcntl-cancel.c */, + 248BA0BC121DE902008C073F /* select.c */, + 248BA0B2121DE760008C073F /* select-cancel.c */, + 24B223AF121DFD36007DAEDE /* sigsuspend.c */, + 24B223B1121DFE6D007DAEDE /* sigsuspend-cancel.c */, + ); + path = cancelable; + sourceTree = ""; + }; + 24A7C6951200AF8A007669EB /* legacy */ = { + isa = PBXGroup; + children = ( + 24A7C5AE11FF8DA6007669EB /* accept.c */, + 24A7C5AF11FF8DA6007669EB /* bind.c */, + 248BA01C121C56BF008C073F /* connect.c */, + 24A7C5B111FF8DA6007669EB /* getattrlist.c */, + 24A7C5B211FF8DA6007669EB /* getpeername.c */, + 24A7C5B311FF8DA6007669EB /* getsockname.c */, + 24A7C5B411FF8DA6007669EB /* 
lchown.c */, + 24A7C5B511FF8DA6007669EB /* listen.c */, + 248BA084121DA5E4008C073F /* kill.c */, + 248BA088121DA8E0008C073F /* mprotect.c */, + 248BA08A121DAC86008C073F /* msync.c */, + 248BA08C121DB0E7008C073F /* munmap.c */, + 248BA08E121DC545008C073F /* open.c */, + 24A7C5B611FF8DA6007669EB /* recvfrom.c */, + 24A7C5B711FF8DA6007669EB /* recvmsg.c */, + 248BA092121DE369008C073F /* select.c */, + 248BA094121DE565008C073F /* select-pre1050.c */, + 24A7C5B811FF8DA6007669EB /* sendmsg.c */, + 24A7C5B911FF8DA6007669EB /* sendto.c */, + 24A7C5BA11FF8DA6007669EB /* setattrlist.c */, + 24A7C5BB11FF8DA6007669EB /* socketpair.c */, + 24B223B4121DFF29007DAEDE /* sigsuspend.c */, + ); + path = legacy; + sourceTree = ""; + }; + 24D1156511E671B20063D54D /* custom */ = { + isa = PBXGroup; + children = ( + 24D1156611E671B20063D54D /* __fork.s */, + 24D1156711E671B20063D54D /* __getpid.s */, + 24D1156811E671B20063D54D /* __gettimeofday.s */, + 24D1156911E671B20063D54D /* __lseek.s */, + 24D1156A11E671B20063D54D /* __pipe.s */, + 24D1156B11E671B20063D54D /* __psynch_cvbroad.s */, + 24D1156C11E671B20063D54D /* __psynch_cvwait.s */, + 24D1156D11E671B20063D54D /* __ptrace.s */, + 24D1156E11E671B20063D54D /* __sigaltstack.s */, + 24D1156F11E671B20063D54D /* __sigreturn.s */, + 24D1157011E671B20063D54D /* __syscall.s */, + 24D1157111E671B20063D54D /* __thread_selfid.s */, + 24D1157211E671B20063D54D /* __vfork.s */, + 24D1157311E671B20063D54D /* custom.s */, + 24D1157411E671B20063D54D /* SYS.h */, + 242AB66511EBDC1200107336 /* errno.c */, + ); + path = custom; + sourceTree = ""; + }; + 24D1158911E672270063D54D /* Platforms */ = { + isa = PBXGroup; + children = ( + 24D1158A11E672270063D54D /* iPhoneOS */, + 24D1158D11E672270063D54D /* MacOSX */, + 24D1159811E672270063D54D /* syscall.map */, + ); + path = Platforms; + sourceTree = ""; + }; + 24D1158A11E672270063D54D /* iPhoneOS */ = { + isa = PBXGroup; + children = ( + 24D1158B11E672270063D54D /* arm */, + ); + path = iPhoneOS; + 
sourceTree = ""; + }; + 24D1158B11E672270063D54D /* arm */ = { + isa = PBXGroup; + children = ( + 24D1158C11E672270063D54D /* syscall.map */, + ); + path = arm; + sourceTree = ""; + }; + 24D1158D11E672270063D54D /* MacOSX */ = { + isa = PBXGroup; + children = ( + 24D1158E11E672270063D54D /* arm */, + 24D1159011E672270063D54D /* i386 */, + 24D1159611E672270063D54D /* x86_64 */, + ); + path = MacOSX; + sourceTree = ""; + }; + 24D1158E11E672270063D54D /* arm */ = { + isa = PBXGroup; + children = ( + 24D1158F11E672270063D54D /* syscall.map */, + ); + path = arm; + sourceTree = ""; + }; + 24D1159011E672270063D54D /* i386 */ = { + isa = PBXGroup; + children = ( + 24D1159111E672270063D54D /* syscall.map */, + ); + path = i386; + sourceTree = ""; + }; + 24D1159611E672270063D54D /* x86_64 */ = { + isa = PBXGroup; + children = ( + 24D1159711E672270063D54D /* syscall.map */, + ); + path = x86_64; + sourceTree = ""; + }; + C9D9BCBE114B00600000D8B9 /* arm */ = { + isa = PBXGroup; + children = ( + C9D9BCBF114B00600000D8B9 /* .open_source_exclude */, + C9D9BCC2114B00600000D8B9 /* vm_map_compat.c */, + ); + path = arm; + sourceTree = ""; + }; + C9D9BCD8114B00600000D8B9 /* mach */ = { + isa = PBXGroup; + children = ( + C9D9BCD9114B00600000D8B9 /* errorlib.h */, + C9D9BCDA114B00600000D8B9 /* mach.h */, + C9D9BCDB114B00600000D8B9 /* mach_error.h */, + C9D9BCDC114B00600000D8B9 /* mach_init.h */, + C9D9BCDD114B00600000D8B9 /* mach_interface.h */, + C9D9BCDF114B00600000D8B9 /* port_obj.h */, + C9D9BCE0114B00600000D8B9 /* sync.h */, + C9D9BCE1114B00600000D8B9 /* task.h */, + C9D9BCE2114B00600000D8B9 /* thread_act.h */, + C9D9BCE3114B00600000D8B9 /* vm_task.h */, + ); + path = mach; + sourceTree = ""; + }; + C9D9BD07114B00600000D8B9 /* servers */ = { + isa = PBXGroup; + children = ( + C9D9BD08114B00600000D8B9 /* key_defs.h */, + C9D9BD09114B00600000D8B9 /* ls_defs.h */, + C9D9BD0B114B00600000D8B9 /* netname.defs */, + C9D9BD0C114B00600000D8B9 /* netname_defs.h */, + 
C9D9BD0D114B00600000D8B9 /* nm_defs.h */, + ); + path = servers; + sourceTree = ""; + }; +/* End PBXGroup section */ + +/* Begin PBXHeadersBuildPhase section */ + D2AAC0600554660B00DB518D /* Headers */ = { + isa = PBXHeadersBuildPhase; + buildActionMask = 2147483647; + files = ( + C9D9BD1E114B00600000D8B9 /* errorlib.h in Headers */, + C9D9BD23114B00600000D8B9 /* externs.h in Headers */, + C9D9BD25114B00600000D8B9 /* errorlib.h in Headers */, + C9D9BD26114B00600000D8B9 /* mach.h in Headers */, + C9D9BD27114B00600000D8B9 /* mach_error.h in Headers */, + C9D9BD28114B00600000D8B9 /* mach_init.h in Headers */, + C9D9BD29114B00600000D8B9 /* mach_interface.h in Headers */, + C9D9BD2B114B00600000D8B9 /* port_obj.h in Headers */, + C9D9BD2C114B00600000D8B9 /* sync.h in Headers */, + C9D9BD2D114B00600000D8B9 /* task.h in Headers */, + C9D9BD2E114B00600000D8B9 /* thread_act.h in Headers */, + C9D9BD2F114B00600000D8B9 /* vm_task.h in Headers */, + C9D9BD50114B00600000D8B9 /* key_defs.h in Headers */, + C9D9BD51114B00600000D8B9 /* ls_defs.h in Headers */, + C9D9BD54114B00600000D8B9 /* netname_defs.h in Headers */, + C9D9BD55114B00600000D8B9 /* nm_defs.h in Headers */, + 24D1158311E671B20063D54D /* SYS.h in Headers */, + 247A090011F8E18000E4693F /* abort.h in Headers */, + 247A091711F8E7A800E4693F /* exc_catcher.h in Headers */, + 24B028F511FF5C3500CA64A9 /* _libkernel_init.h in Headers */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXHeadersBuildPhase section */ + +/* Begin PBXNativeTarget section */ + 249C60FE1194747600ED73F3 /* Libmach Dynamic */ = { + isa = PBXNativeTarget; + buildConfigurationList = 249C6102119474D700ED73F3 /* Build configuration list for PBXNativeTarget "Libmach Dynamic" */; + buildPhases = ( + 249C61281194815000ED73F3 /* Sources */, + 249C60FD1194747600ED73F3 /* Frameworks */, + 2427FA811200BCDA00EF7A1F /* Compat Symlinks */, + ); + buildRules = ( + ); + dependencies = ( + 249C610A1194750700ED73F3 /* PBXTargetDependency */, + ); + name 
= "Libmach Dynamic"; + productName = Libmach; + productReference = 249C60FF1194747600ED73F3 /* libsystem_kernel.dylib */; + productType = "com.apple.product-type.library.dynamic"; + }; + D2AAC0620554660B00DB518D /* Libmach */ = { + isa = PBXNativeTarget; + buildConfigurationList = 1DEB914A08733D8E0010E9CD /* Build configuration list for PBXNativeTarget "Libmach" */; + buildPhases = ( + D2AAC0600554660B00DB518D /* Headers */, + D2AAC0610554660B00DB518D /* Sources */, + D289988505E68E00004EDB86 /* Frameworks */, + 2487545E11629934000975E0 /* Install Headers */, + ); + buildRules = ( + ); + dependencies = ( + 242AB67911ED03ED00107336 /* PBXTargetDependency */, + ); + name = Libmach; + productName = mach; + productReference = D2AAC0630554660B00DB518D /* libsystem_kernel.a */; + productType = "com.apple.product-type.library.static"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + 08FB7793FE84155DC02AAC07 /* Project object */ = { + isa = PBXProject; + buildConfigurationList = 1DEB914E08733D8E0010E9CD /* Build configuration list for PBXProject "Libsyscall" */; + compatibilityVersion = "Xcode 3.1"; + developmentRegion = English; + hasScannedForEncodings = 1; + knownRegions = ( + English, + Japanese, + French, + German, + ); + mainGroup = 08FB7794FE84155DC02AAC07 /* mach */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + 249C61101194755D00ED73F3 /* Build */, + 24614EF311E7C98600E78584 /* Syscalls */, + D2AAC0620554660B00DB518D /* Libmach */, + 249C60FE1194747600ED73F3 /* Libmach Dynamic */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXShellScriptBuildPhase section */ + 2427FA811200BCDA00EF7A1F /* Compat Symlinks */ = { + isa = PBXShellScriptBuildPhase; + buildActionMask = 8; + files = ( + ); + inputPaths = ( + ); + name = "Compat Symlinks"; + outputPaths = ( + ); + runOnlyForDeploymentPostprocessing = 1; + shellPath = /bin/sh; + shellScript = "\"$PROJECT_DIR\"/xcodescripts/compat-symlinks.sh"; + }; + 24614EF211E7C98600E78584 
/* Generate Syscalls */ = { + isa = PBXShellScriptBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + inputPaths = ( + ); + name = "Generate Syscalls"; + outputPaths = ( + ); + runOnlyForDeploymentPostprocessing = 0; + shellPath = /bin/sh; + shellScript = "set -x\n\nmkdir -p $OBJROOT/sys\n\n$SRCROOT/xcodescripts/create-syscalls.pl \\\n\t$SRCROOT/../bsd/kern/syscalls.master \\\n\t$SRCROOT/custom \\\n\t$SRCROOT/Platforms \\\n\t$MAP_PLATFORM \\\n\t$OBJROOT/sys\n"; + }; + 24614EF611E7C9A000E78584 /* Compile Syscalls */ = { + isa = PBXShellScriptBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + inputPaths = ( + ); + name = "Compile Syscalls"; + outputPaths = ( + "$(BUILD_ROOT)/syscalls.a", + ); + runOnlyForDeploymentPostprocessing = 0; + shellPath = /bin/sh; + shellScript = "set -x\n\nmkdir -p $OBJROOT/UninstalledProducts\n\n$SRCROOT/xcodescripts/compile-syscalls.pl \\\n\t$OBJROOT/sys/stubs.list \\\n\t$BUILD_ROOT/syscalls.a"; + }; + 2487545E11629934000975E0 /* Install Headers */ = { + isa = PBXShellScriptBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + inputPaths = ( + ); + name = "Install Headers"; + outputPaths = ( + ); + runOnlyForDeploymentPostprocessing = 0; + shellPath = /bin/sh; + shellScript = "\"$PROJECT_DIR\"/xcodescripts/mach_install_mig.sh"; + }; +/* End PBXShellScriptBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + 249C61281194815000ED73F3 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 24E4782712088267009A384D /* _libc_funcptr.c in Sources */, + 240BAC4C1214770F000A1719 /* memcpy.c in Sources */, + 249C612F1194828600ED73F3 /* dylib_link.c in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; + D2AAC0610554660B00DB518D /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + C9D9BD19114B00600000D8B9 /* clock_priv.defs in Sources */, + C9D9BD1A114B00600000D8B9 /* clock_reply.defs in Sources */, + 
C9D9BD1C114B00600000D8B9 /* clock.defs in Sources */, + C9D9BD22114B00600000D8B9 /* exc.defs in Sources */, + C9D9BD30114B00600000D8B9 /* host_priv.defs in Sources */, + C9D9BD31114B00600000D8B9 /* host_security.defs in Sources */, + C9D9BD34114B00600000D8B9 /* ledger.defs in Sources */, + C9D9BD35114B00600000D8B9 /* lock_set.defs in Sources */, + C9D9BD38114B00600000D8B9 /* mach_host.defs in Sources */, + C9D9BD3D114B00600000D8B9 /* mach_port.defs in Sources */, + C9D9BD3F114B00600000D8B9 /* mach_vm.defs in Sources */, + C9D9BD46114B00600000D8B9 /* notify.defs in Sources */, + C9D9BD4C114B00600000D8B9 /* processor_set.defs in Sources */, + C9D9BD4D114B00600000D8B9 /* processor.defs in Sources */, + C9D9BD53114B00600000D8B9 /* netname.defs in Sources */, + C9D9BD57114B00600000D8B9 /* task.defs in Sources */, + C9D9BD58114B00600000D8B9 /* thread_act.defs in Sources */, + C9D9BD59114B00600000D8B9 /* vm_map.defs in Sources */, + C9D9BD1B114B00600000D8B9 /* clock_sleep.c in Sources */, + C9D9BD1D114B00600000D8B9 /* error_codes.c in Sources */, + C9D9BD1F114B00600000D8B9 /* exc_catcher_state_identity.c in Sources */, + C9D9BD20114B00600000D8B9 /* exc_catcher_state.c in Sources */, + C9D9BD21114B00600000D8B9 /* exc_catcher.c in Sources */, + C9D9BD24114B00600000D8B9 /* fprintf_stderr.c in Sources */, + C9D9BD36114B00600000D8B9 /* mach_error_string.c in Sources */, + C9D9BD37114B00600000D8B9 /* mach_error.c in Sources */, + C9D9BD3B114B00600000D8B9 /* mach_init.c in Sources */, + C9D9BD3C114B00600000D8B9 /* mach_msg.c in Sources */, + C9D9BD3E114B00600000D8B9 /* mach_traps.s in Sources */, + C9D9BD41114B00600000D8B9 /* mig_allocate.c in Sources */, + C9D9BD42114B00600000D8B9 /* mig_deallocate.c in Sources */, + C9D9BD43114B00600000D8B9 /* mig_reply_setup.c in Sources */, + 24484A9411F61D2B00E10CD2 /* mig_reply_port.c in Sources */, + C9D9BD44114B00600000D8B9 /* mig_strncpy.c in Sources */, + C9D9BD45114B00600000D8B9 /* ms_thread_switch.c in Sources */, + 
C9D9BD47114B00600000D8B9 /* panic.c in Sources */, + C9D9BD48114B00600000D8B9 /* port_obj.c in Sources */, + C9D9BD4F114B00600000D8B9 /* semaphore.c in Sources */, + C9D9BD56114B00600000D8B9 /* slot_name.c in Sources */, + 24484A7511F6178E00E10CD2 /* string.c in Sources */, + 2485235511582D8F0051B413 /* mach_legacy.c in Sources */, + C9D9BD17114B00600000D8B9 /* vm_map_compat.c in Sources */, + 242AB66611EBDC1200107336 /* errno.c in Sources */, + 247A08C211F8BDC900E4693F /* _libkernel_init.c in Sources */, + 24A7C5BC11FF8DA6007669EB /* accept.c in Sources */, + 24A7C5BD11FF8DA6007669EB /* bind.c in Sources */, + 24A7C5BF11FF8DA6007669EB /* getattrlist.c in Sources */, + 24A7C5C011FF8DA6007669EB /* getpeername.c in Sources */, + 24A7C5C111FF8DA6007669EB /* getsockname.c in Sources */, + 24A7C5C211FF8DA6007669EB /* lchown.c in Sources */, + 24A7C5C311FF8DA6007669EB /* listen.c in Sources */, + 24A7C5C411FF8DA6007669EB /* recvfrom.c in Sources */, + 24A7C5C511FF8DA6007669EB /* recvmsg.c in Sources */, + 24A7C5C611FF8DA6007669EB /* sendmsg.c in Sources */, + 24A7C5C711FF8DA6007669EB /* sendto.c in Sources */, + 24A7C5C811FF8DA6007669EB /* setattrlist.c in Sources */, + 24A7C5C911FF8DA6007669EB /* socketpair.c in Sources */, + 2419382B12135FF6003CDE41 /* chmod.c in Sources */, + 248BA01D121C56BF008C073F /* connect.c in Sources */, + 248BA01F121C607E008C073F /* fchmod.c in Sources */, + 248BA04F121C8F06008C073F /* fcntl.c in Sources */, + 248BA05C121C9649008C073F /* fcntl-cancel.c in Sources */, + 248BA069121D9E27008C073F /* getrlimit.c in Sources */, + 248BA080121DA36B008C073F /* ioctl.c in Sources */, + 248BA082121DA4F3008C073F /* kill.c in Sources */, + 248BA085121DA5E4008C073F /* kill.c in Sources */, + 248BA087121DA72D008C073F /* mmap.c in Sources */, + 248BA089121DA8E0008C073F /* mprotect.c in Sources */, + 248BA08B121DAC86008C073F /* msync.c in Sources */, + 248BA08D121DB0E7008C073F /* munmap.c in Sources */, + 248BA08F121DC545008C073F /* open.c in Sources */, + 
248BA093121DE369008C073F /* select.c in Sources */, + 248BA095121DE565008C073F /* select-pre1050.c in Sources */, + 248BA0B3121DE760008C073F /* select-cancel.c in Sources */, + 248BA0BE121DE902008C073F /* select.c in Sources */, + 248BA0CD121DEBEF008C073F /* setrlimit.c in Sources */, + 24B223B0121DFD36007DAEDE /* sigsuspend.c in Sources */, + 24B223B2121DFE6D007DAEDE /* sigsuspend-cancel.c in Sources */, + 24B223B5121DFF29007DAEDE /* sigsuspend.c in Sources */, + 248AA963122C7B2A0085F5B1 /* unlink.c in Sources */, + 248AA965122C7C330085F5B1 /* rmdir.c in Sources */, + 248AA967122C7CDA0085F5B1 /* rename.c in Sources */, + 24B8C2621237F53900D36CC3 /* remove-counter.c in Sources */, + C99A4F501305B2BD0054B7B7 /* __get_cpu_capabilities.s in Sources */, + C99A4F531305B43F0054B7B7 /* init_cpu_capabilities.c in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin PBXTargetDependency section */ + 242AB67911ED03ED00107336 /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = 24614EF311E7C98600E78584 /* Syscalls */; + targetProxy = 242AB67811ED03ED00107336 /* PBXContainerItemProxy */; + }; + 249C610A1194750700ED73F3 /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = D2AAC0620554660B00DB518D /* Libmach */; + targetProxy = 249C61091194750700ED73F3 /* PBXContainerItemProxy */; + }; + 249C61151194756A00ED73F3 /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = 249C60FE1194747600ED73F3 /* Libmach Dynamic */; + targetProxy = 249C61141194756A00ED73F3 /* PBXContainerItemProxy */; + }; +/* End PBXTargetDependency section */ + +/* Begin XCBuildConfiguration section */ + 1DEB914C08733D8E0010E9CD /* Release */ = { + isa = XCBuildConfiguration; + baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */; + buildSettings = { + COPY_PHASE_STRIP = NO; + INSTALL_PATH = /usr/local/lib/dyld; + "INSTALL_PATH[sdk=iphoneos*]" = /usr/local/lib/dyld; + 
"INSTALL_PATH[sdk=iphonesimulator*]" = "$(SDKROOT)/usr/local/lib/dyld"; + "INSTALL_PATH[sdk=macosx*]" = /usr/local/lib/dyld; + }; + name = Release; + }; + 1DEB915008733D8E0010E9CD /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + GCC_C_LANGUAGE_STANDARD = gnu99; + GCC_WARN_ABOUT_RETURN_TYPE = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + PREBINDING = NO; + }; + name = Release; + }; + 24614EF411E7C98600E78584 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ARCHS = "$(ARCHS_STANDARD_32_64_BIT)"; + COPY_PHASE_STRIP = YES; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + GCC_ENABLE_FIX_AND_CONTINUE = NO; + MAP_PLATFORM = "$(MAP_PLATFORM_$(PLATFORM_NAME))"; + MAP_PLATFORM_iphoneos = iPhoneOS; + MAP_PLATFORM_macosx = MacOSX; + PRODUCT_NAME = Syscalls; + ZERO_LINK = NO; + }; + name = Release; + }; + 249C61001194747600ED73F3 /* Release */ = { + isa = XCBuildConfiguration; + baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */; + buildSettings = { + OTHER_LDFLAGS = ( + "-umbrella", + System, + "-all_load", + ); + VERSION_INFO_PREFIX = "___"; + }; + name = Release; + }; + 249C61111194755E00ED73F3 /* Release */ = { + isa = XCBuildConfiguration; + baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */; + buildSettings = { + PRODUCT_NAME = Build; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + 1DEB914A08733D8E0010E9CD /* Build configuration list for PBXNativeTarget "Libmach" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 1DEB914C08733D8E0010E9CD /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 1DEB914E08733D8E0010E9CD /* Build configuration list for PBXProject "Libsyscall" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 1DEB915008733D8E0010E9CD /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + 
}; + 24614EFD11E7C9B900E78584 /* Build configuration list for PBXAggregateTarget "Syscalls" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 24614EF411E7C98600E78584 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 249C6102119474D700ED73F3 /* Build configuration list for PBXNativeTarget "Libmach Dynamic" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 249C61001194747600ED73F3 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 249C61191194756B00ED73F3 /* Build configuration list for PBXAggregateTarget "Build" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 249C61111194755E00ED73F3 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + }; + rootObject = 08FB7793FE84155DC02AAC07 /* Project object */; +} diff --git a/libsyscall/Makefile b/libsyscall/Makefile deleted file mode 100644 index 33f3e99ff..000000000 --- a/libsyscall/Makefile +++ /dev/null @@ -1,65 +0,0 @@ -# @(#)Makefile 8.2 (Berkeley) 2/3/94 -# $FreeBSD: src/lib/libc/Makefile,v 1.31 2001/08/13 21:48:43 peter Exp $ -# -# All library objects contain rcsid strings by default; they may be -# excluded as a space-saving measure. To produce a library that does -# not contain these strings, delete -DLIBC_RCS and -DSYSLIBC_RCS -# from CFLAGS below. To remove these strings from just the system call -# stubs, remove just -DSYSLIBC_RCS from CFLAGS. -# -# Yes, we build everything with -g, and strip it out later... 
-# -LIB=syscall -SHLIB_MAJOR= 1 -SHLIB_MINOR= 0 -.if (${MACHINE_ARCH} == unknown) -.ifdef RC_ARCHS -MACHINE_ARCH != echo $(RC_ARCHS) | cut -f 1 -d " " -.else -MACHINE_ARCH != /usr/bin/arch -.endif -.endif -.if !empty $(MACHINE_ARCH:M*64) -LP64 = 1 -.endif -SDKROOT ?= / -CC = xcrun -sdk $(SDKROOT) gcc -MIG = xcrun -sdk $(SDKROOT) mig -MIGCC != xcrun -find -sdk $(SDKROOT) gcc -.ifdef ALTFRAMEWORKSPATH -PRIVINC = -F${ALTFRAMEWORKSPATH} -I${ALTFRAMEWORKSPATH}/System.framework/PrivateHeaders -.else -PRIVINC = -I${SDKROOT}/System/Library/Frameworks/System.framework/PrivateHeaders -.endif -CFLAGS += ${PRIVINC} -CFLAGS += -no-cpp-precomp -CFLAGS += -fno-common -pipe -Wmost -g -CFLAGS += -DCF_EXCLUDE_CSTD_HEADERS -DCF_OPEN_SOURCE -CFLAGS += -isysroot ${SDKROOT} -AINC= -no-cpp-precomp -AINC+= -arch ${MACHINE_ARCH} -g -MIGDEFINES ?= -CLEANFILES+=tags -INSTALL_PIC_ARCHIVE= yes -PRECIOUSLIB= yes - -# workaround for 3649783 -AINC += -fdollars-in-identifiers - -# If these aren't set give it expected defaults -DESTDIR ?= ${DSTROOT} -MAKEOBJDIR ?= ${OBJROOT} - -# add version string -SRCS += libsyscall_version.c -libsyscall_version.c: - ${SDKROOT}/Developer/Makefiles/bin/version.pl Libsyscall > $@ - -CFLAGS += -I${SYMROOT} -.include "${.CURDIR}/Makefile.inc" -.PATH: ${SYMROOT} -.include "Makefile.xbs" -.if exists(/usr/share/mk/bsd.init.mk) -.include -.endif -.include diff --git a/libsyscall/Makefile.inc b/libsyscall/Makefile.inc deleted file mode 100644 index 0bc95b71c..000000000 --- a/libsyscall/Makefile.inc +++ /dev/null @@ -1,52 +0,0 @@ -# $FreeBSD: src/lMakefile.inc,v 1.7 2001/04/04 18:17:25 tmm Exp $ -# -# This file contains make rules that are shared by libc and libc_r. -# -# Define (empty) variables so that make doesn't give substitution -# errors if the included makefiles don't change these: -MDSRCS= -MISRCS= -MDASM= -MIASM= -NOASM= - -# SUPPRESSSRCS is used to prevent machine-independent files from being -# built, when a machine-dependent file defines multiple symbols. 
-# Use MDSRCS to block one file, and SUPPRESSSRCS to block the others. -SUPPRESSSRCS= - -# set object file suffix -.if make(lib${LIB}_static.a) -OBJSUFFIX = o -.endif -.if make(lib${LIB}_profile.a) -OBJSUFFIX = po -.endif -.if make(lib${LIB}_debug.a) -OBJSUFFIX = do -.endif -.if make(lib${LIB}.a) -OBJSUFFIX = So -.endif - -.if exists(${OBJROOT}/sys/Makefile.inc) -.include "${OBJROOT}/sys/Makefile.inc" -.endif -.include "${.CURDIR}/include/Makefile.inc" -.include "${.CURDIR}/mach/Makefile.inc" - -# If there are no machine dependent sources, append all the -# machine-independent sources: -.if empty(MDSRCS) -SRCS+= ${MISRCS} -.else -# Append machine-dependent sources, then append machine-independent sources -# for which there is no machine-dependent variant, and not being suppressed. -SRCS+= ${MDSRCS} -_SUPPRESS= ${MDSRCS} ${SUPPRESSSRCS} -.for _src in ${MISRCS} -.if ${_SUPPRESS:R:M${_src:R}} == "" -SRCS+= ${_src} -.endif -.endfor -.endif diff --git a/libsyscall/Makefile.xbs b/libsyscall/Makefile.xbs deleted file mode 100644 index 556597fef..000000000 --- a/libsyscall/Makefile.xbs +++ /dev/null @@ -1,130 +0,0 @@ -BSDMAKE = bsdmake -f Makefile - -.PATH: . 
-.MAIN: all -all: lib${LIB}.a lib${LIB}_static.a lib${LIB}_debug.a lib${LIB}_profile.a -install: installhdrs install_lib${LIB}.a install_lib${LIB}_static.a \ - install_lib${LIB}_profile.a install_lib${LIB}_debug.a maninstall - -.SUFFIXES: -.SUFFIXES: .o .po .So .do -.SUFFIXES: .S .s .c .cc .cpp .cxx .m .C -.SUFFIXES: .defs .h -.SUFFIXES: User.c User.o User.po User.So User.do -.SUFFIXES: Server.c Server.o Server.po Server.So Server.do - -OBJS+= ${SRCS:N*.h:R:S/$/.o/g} -DOBJS+= ${OBJS:.o=.do} -POBJS+= ${OBJS:.o=.po} ${STATICOBJS:.o=.po} -SOBJS+= ${OBJS:.o=.So} - -#### Standard C Rules ################################################# -.c.o User.cUser.o Server.cServer.o: - ${CC} -static ${PRECFLAGS-${.IMPSRC:T}} ${CFLAGS} \ - ${CFLAGS-${.IMPSRC:T}} -Os ${OPTIMIZE-${.IMPSRC:T}} \ - -c ${.IMPSRC} -o ${.TARGET} -.c.po User.cUser.po Server.cServer.po: - ${CC} -pg ${PRECFLAGS-${.IMPSRC:T}} -DPROFILE ${CFLAGS} \ - ${CFLAGS-${.IMPSRC:T}} -Os ${OPTIMIZE-${.IMPSRC:T}} \ - -c ${.IMPSRC} -o ${.TARGET} -.c.So User.cUser.So Server.cServer.So: - ${CC} ${PRECFLAGS-${.IMPSRC:T}} ${CFLAGS} ${CFLAGS-${.IMPSRC:T}} \ - -Os ${OPTIMIZE-${.IMPSRC:T}} -c ${.IMPSRC} -o ${.TARGET} -.c.do User.cUser.do Server.cServer.do: - ${CC} -g ${PRECFLAGS-${.IMPSRC:T}} -DDEBUG ${CFLAGS} \ - ${CFLAGS-${.IMPSRC:T}} -c ${.IMPSRC} -o ${.TARGET} - -#### Standard Assembler Rules ######################################### -.s.o .S.o: - ${CC} -static -x assembler-with-cpp ${PRECFLAGS-${.IMPSRC:T}} \ - ${AINC} ${CFLAGS:M-[BIDFU]*} ${CFLAGS-${.IMPSRC:T}:M-[BIDFU]*} \ - -Os ${OPTIMIZE-${.IMPSRC:T}} -c ${.IMPSRC} -o ${.TARGET} -.s.po .S.po: - ${CC} -pg -x assembler-with-cpp ${PRECFLAGS-${.IMPSRC:T}} -DPROFILE \ - ${AINC} ${CFLAGS:M-[BIDFU]*} ${CFLAGS-${.IMPSRC:T}:M-[BIDFU]*} \ - -Os ${OPTIMIZE-${.IMPSRC:T}} -c ${.IMPSRC} -o ${.TARGET} -.s.So .S.So: - ${CC} -x assembler-with-cpp ${PRECFLAGS-${.IMPSRC:T}} \ - ${AINC} ${CFLAGS:M-[BIDFU]*} ${CFLAGS-${.IMPSRC:T}:M-[BIDFU]*} \ - -Os ${OPTIMIZE-${.IMPSRC:T}} -c ${.IMPSRC} -o 
${.TARGET} -.s.do .S.do: - ${CC} -g -x assembler-with-cpp ${PRECFLAGS-${.IMPSRC:T}} -DDEBUG \ - ${AINC} ${CFLAGS:M-[BIDFU]*} ${CFLAGS-${.IMPSRC:T}:M-[BIDFU]*} \ - -c ${.IMPSRC} -o ${.TARGET} - -#### mig Rules ######################################################## -.defs.h .defsUser.c .defsServer.c: - $(MIG) ${PRIVINC} ${MIGDEFINES} -arch ${MACHINE_ARCH} -cc ${MIGCC} -user ${.PREFIX}User.c -server ${.PREFIX}Server.c -header ${.PREFIX}.h ${.IMPSRC} - -gen_mig_defs: ${SRVMIGHDRS} ${MIGHDRS} -gen_md_mig_defs: ${MD_MIGHDRS} - -#### Library Rules #################################################### -lib${LIB}_static.a:: ${OBJS} ${STATICOBJS} - @${ECHO} building static ${LIB} library - @rm -f lib${LIB}_static.a - @${AR} cq lib${LIB}_static.a `lorder ${OBJS} ${STATICOBJS} | tsort -q` ${ARADD} - ${RANLIB} lib${LIB}_static.a - -lib${LIB}_profile.a:: ${POBJS} ${POBJS2} - @${ECHO} building profiled ${LIB} library - @rm -f lib${LIB}_profile.a - @${AR} cq lib${LIB}_profile.a `lorder ${POBJS} | tsort -q` ${ARADD} - ${RANLIB} lib${LIB}_profile.a - -lib${LIB}_debug.a:: ${DOBJS} ${DOBJS2} - @${ECHO} building debug ${LIB} library - @rm -f lib${LIB}_debug.a - @${AR} cq lib${LIB}_debug.a `lorder ${DOBJS} | tsort -q` ${ARADD} - ${RANLIB} lib${LIB}_debug.a - -lib${LIB}.a:: ${SOBJS} ${SOBJS2} - @${ECHO} building standard ${LIB} library - @rm -f lib${LIB}.a - @${AR} cq lib${LIB}.a `lorder ${SOBJS} | tsort -q` ${ARADD} - ${RANLIB} lib${LIB}.a - -CLEANFILES += ${DOBJS} lib${LIB}_static.a lib${LIB}_profile.a lib${LIB}_debug.a - -INCDIR = ${DESTDIR}/usr/include -LOCINCDIR = ${DESTDIR}/usr/local/include -SYSTEMFRAMEWORK = ${DESTDIR}/System/Library/Frameworks/System.framework -PRIVHDRS = ${SYSTEMFRAMEWORK}/Versions/B/PrivateHeaders -PRIVHDRSPPC = ${PRIVHDRS}/architecture/ppc -KERNELFRAMEWORK = ${DESTDIR}/System/Library/Frameworks/Kernel.framework -PRIVKERNELHDRS = ${KERNELFRAMEWORK}/Versions/A/PrivateHeaders - -ARCHDIR = ${MACHINE_ARCH:C/^armv.*$/arm/} - -installhdrs-md: gen_md_mig_defs - 
mkdir -p ${INCDIR}/mach/${ARCHDIR} - ${INSTALL} -o 0 -c -m 444 ${MD_MIGHDRS} ${INCDIR}/mach/${ARCHDIR} - mkdir -p ${PRIVHDRSPPC} - ${INSTALL} -c -m 444 ${PRIVHDRSPPCHDRS} ${PRIVHDRSPPC} - -installhdrs: gen_mig_defs - mkdir -p ${INCDIR}/mach - mkdir -p ${INCDIR}/servers - ${INSTALL} -o 0 -c -m 444 ${MACH_INSTHDRS} ${INCDIR}/mach - ${INSTALL} -o 0 -c -m 444 ${SRVHDRS} ${INCDIR}/servers - @for i in `find ${DESTDIR}/usr/include/mach ${DESTDIR}/usr/include/servers -name \*.h`; do \ - x=`fgrep '' $$i | uniq -d`; \ - if [ -n "$$x" ]; then \ - echo patching $$i; \ - ed - $$i < ${SRCROOT}/libsyscall/fixdups.ed; \ - fi; \ - done - -install_lib${LIB}_static.a: - ${INSTALL} -c -m 444 lib${LIB}_static.a ${DESTDIR}/usr/local/lib/system/ -install_lib${LIB}_profile.a: - ${INSTALL} -c -m 444 lib${LIB}_profile.a ${DESTDIR}/usr/local/lib/system -install_lib${LIB}_debug.a: - ${INSTALL} -c -m 444 lib${LIB}_debug.a ${DESTDIR}/usr/local/lib/system/ -install_lib${LIB}.a: - ${INSTALL} -c -m 444 lib${LIB}.a ${DESTDIR}/usr/local/lib/system/ - -clean: - rm -f ${OBJS} ${POBJS} ${DOBJS} ${SOBJS} ${CLEANFILES} - rm -f lib${LIB}.a lib${LIB}_static.a lib${LIB}_profile.a \ - lib${LIB}_debug.a diff --git a/libsyscall/Platforms/MacOSX/i386/syscall.map b/libsyscall/Platforms/MacOSX/i386/syscall.map new file mode 100644 index 000000000..bdfa11aac --- /dev/null +++ b/libsyscall/Platforms/MacOSX/i386/syscall.map @@ -0,0 +1,93 @@ +_accept$NOCANCEL$UNIX2003 ___accept_nocancel +_accept$UNIX2003 ___accept +_aio_suspend ___aio_suspend_nocancel +_aio_suspend$NOCANCEL$UNIX2003 ___aio_suspend_nocancel +_aio_suspend$UNIX2003 ___aio_suspend +_bind$UNIX2003 ___bind +_close ___close_nocancel +_close$NOCANCEL$UNIX2003 ___close_nocancel +_close$UNIX2003 ___close +_chmod ___chmod +_connect$NOCANCEL$UNIX2003 ___connect_nocancel +_connect$UNIX2003 ___connect +_fcntl ___fcntl_nocancel +_fcntl$NOCANCEL$UNIX2003 ___fcntl_nocancel +_fcntl$UNIX2003 ___fcntl +_fstat$INODE64 ___fstat64 +_fstatfs$INODE64 ___fstatfs64 +_fsync 
___fsync_nocancel +_fsync$NOCANCEL$UNIX2003 ___fsync_nocancel +_fsync$UNIX2003 ___fsync +_getattrlist$UNIX2003 ___getattrlist +_getfsstat$INODE64 ___getfsstat64 +_getpeername$UNIX2003 ___getpeername +_getsockname$UNIX2003 ___getsockname +_lchown$UNIX2003 ___lchown +_listen$UNIX2003 ___listen +_lstat$INODE64 ___lstat64 +_mmap ___mmap +_mprotect$UNIX2003 ___mprotect +_msgctl$UNIX2003 ___msgctl +_msgrcv ___msgrcv_nocancel +_msgrcv$NOCANCEL$UNIX2003 ___msgrcv_nocancel +_msgrcv$UNIX2003 ___msgrcv +_msgsnd ___msgsnd_nocancel +_msgsnd$NOCANCEL$UNIX2003 ___msgsnd_nocancel +_msgsnd$UNIX2003 ___msgsnd +_msync$NOCANCEL$UNIX2003 ___msync_nocancel +_msync$UNIX2003 ___msync +_munmap$UNIX2003 ___munmap +_open$NOCANCEL$UNIX2003 ___open_nocancel +_open$UNIX2003 ___open +_poll ___poll_nocancel +_poll$NOCANCEL$UNIX2003 ___poll_nocancel +_poll$UNIX2003 ___poll +_pread ___pread_nocancel +_pread$NOCANCEL$UNIX2003 ___pread_nocancel +_pread$UNIX2003 ___pread +_pwrite ___pwrite_nocancel +_pwrite$NOCANCEL$UNIX2003 ___pwrite_nocancel +_pwrite$UNIX2003 ___pwrite +_read ___read_nocancel +_read$NOCANCEL$UNIX2003 ___read_nocancel +_read$UNIX2003 ___read +_readv ___readv_nocancel +_readv$NOCANCEL$UNIX2003 ___readv_nocancel +_readv$UNIX2003 ___readv +_recvfrom$NOCANCEL$UNIX2003 ___recvfrom_nocancel +_recvfrom$UNIX2003 ___recvfrom +_recvmsg$NOCANCEL$UNIX2003 ___recvmsg_nocancel +_recvmsg$UNIX2003 ___recvmsg +_select$DARWIN_EXTSN ___select +_select$DARWIN_EXTSN$NOCANCEL ___select_nocancel +_sem_wait ___sem_wait_nocancel +_sem_wait$NOCANCEL$UNIX2003 ___sem_wait_nocancel +_sem_wait$UNIX2003 ___sem_wait +_semctl$UNIX2003 ___semctl +_sendmsg$NOCANCEL$UNIX2003 ___sendmsg_nocancel +_sendmsg$UNIX2003 ___sendmsg +_sendto$NOCANCEL$UNIX2003 ___sendto_nocancel +_sendto$UNIX2003 ___sendto +_setattrlist$UNIX2003 ___setattrlist +_setpgrp ___setpgid +_setregid$UNIX2003 ___setregid +_setreuid$UNIX2003 ___setreuid +_shmctl$UNIX2003 ___shmctl +_socketpair$UNIX2003 ___socketpair +_stat$INODE64 ___stat64 
+_statfs$INODE64 ___statfs64 +_waitid ___waitid_nocancel +_waitid$NOCANCEL$UNIX2003 ___waitid_nocancel +_waitid$UNIX2003 ___waitid +_write ___write_nocancel +_write$NOCANCEL$UNIX2003 ___write_nocancel +_write$UNIX2003 ___write +_writev ___writev_nocancel +_writev$NOCANCEL$UNIX2003 ___writev_nocancel +_writev$UNIX2003 ___writev + +_ioctl ___ioctl +_sigaltstack ___sigaltstack +_fchmod ___fchmod +_setrlimit ___setrlimit +_getrlimit ___getrlimit diff --git a/libsyscall/Platforms/MacOSX/x86_64/syscall.map b/libsyscall/Platforms/MacOSX/x86_64/syscall.map new file mode 100644 index 000000000..b8cb6b1e1 --- /dev/null +++ b/libsyscall/Platforms/MacOSX/x86_64/syscall.map @@ -0,0 +1,54 @@ +_accept$NOCANCEL ___accept_nocancel +_aio_suspend$NOCANCEL ___aio_suspend_nocancel +_close$NOCANCEL ___close_nocancel +_connect$NOCANCEL ___connect_nocancel +_fstat$INODE64 ___fstat64 +_fstatfs$INODE64 ___fstatfs64 +_fsync$NOCANCEL ___fsync_nocancel +_getfsstat$INODE64 ___getfsstat64 +_lstat$INODE64 ___lstat64 +_msgrcv$NOCANCEL ___msgrcv_nocancel +_msgsnd$NOCANCEL ___msgsnd_nocancel +_msync$NOCANCEL ___msync_nocancel +_open$NOCANCEL ___open_nocancel +_poll$NOCANCEL ___poll_nocancel +_pread$NOCANCEL ___pread_nocancel +_pwrite$NOCANCEL ___pwrite_nocancel +_read$NOCANCEL ___read_nocancel +_readv$NOCANCEL ___readv_nocancel +_recvfrom$NOCANCEL ___recvfrom_nocancel +_recvmsg$NOCANCEL ___recvmsg_nocancel +_select$DARWIN_EXTSN ___select +_select$DARWIN_EXTSN$NOCANCEL ___select_nocancel +_sem_wait$NOCANCEL ___sem_wait_nocancel +_sendmsg$NOCANCEL ___sendmsg_nocancel +_sendto$NOCANCEL ___sendto_nocancel +_stat$INODE64 ___stat64 +_statfs$INODE64 ___statfs64 +_waitid$NOCANCEL ___waitid_nocancel +_write$NOCANCEL ___write_nocancel +_writev$NOCANCEL ___writev_nocancel + +_accept ___accept +_bind ___bind +_connect ___connect +_getattrlist ___getattrlist +_getpeername ___getpeername +_getsockname ___getsockname +_lchown ___lchown +_listen ___listen +_mprotect ___mprotect +_msgctl ___msgctl +_msync ___msync 
+_munmap ___munmap +_open ___open +_recvfrom ___recvfrom +_recvmsg ___recvmsg +_semctl ___semctl +_sendmsg ___sendmsg +_sendto ___sendto +_setattrlist ___setattrlist +_setregid ___setregid +_setreuid ___setreuid +_shmctl ___shmctl +_socketpair ___socketpair diff --git a/libsyscall/Platforms/syscall.map b/libsyscall/Platforms/syscall.map new file mode 100644 index 000000000..3c24170a5 --- /dev/null +++ b/libsyscall/Platforms/syscall.map @@ -0,0 +1,16 @@ +___sandbox_me ___mac_execve +___sandbox_mm ___mac_mount +___sandbox_ms ___mac_syscall +___sandbox_msp ___mac_set_proc +__exit ___exit +_accessx_np ___access_extended +_getsgroups_np ___getsgroups +_getwgroups_np ___getwgroups +# initgroups wrapper is defined in Libinfo +_initgroups +_posix_madvise ___madvise +_pthread_getugid_np ___gettid +_pthread_setugid_np ___settid +_setsgroups_np ___setsgroups +_setwgroups_np ___setwgroups +_wait4 ___wait4_nocancel diff --git a/libsyscall/create-syscalls.pl b/libsyscall/create-syscalls.pl deleted file mode 100755 index 285a170a0..000000000 --- a/libsyscall/create-syscalls.pl +++ /dev/null @@ -1,266 +0,0 @@ -#!/usr/bin/perl -# -# Copyright (c) 2006 Apple Computer, Inc. All rights reserved. -# -# @APPLE_OSREFERENCE_LICENSE_HEADER_START@ -# -# This file contains Original Code and/or Modifications of Original Code -# as defined in and that are subject to the Apple Public Source License -# Version 2.0 (the 'License'). You may not use this file except in -# compliance with the License. Please obtain a copy of the License at -# http://www.opensource.apple.com/apsl/ and read it before using this -# file. -# -# The Original Code and all software distributed under the License are -# distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER -# EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, -# INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
-# Please see the License for the specific language governing rights and -# limitations under the License. -# -# @APPLE_OSREFERENCE_LICENSE_HEADER_END@ -# -########################################################################## -# -# % create-syscalls.pl syscalls.master custom-directory out-directory -# -# This script fills the the out-directory with a Makefile.inc and *.s -# files to create the double-underbar syscall stubs. It reads the -# syscall.master file to get the symbol names and number of arguments, -# and whether Libsystem should automatically create the (non-double-underbar) -# stubs if Libc doesn't provide a wrapper. Which system calls will get -# the automatic treatment is writen to the libsyscall.list file, also -# written to the out-directory. -# -# The custom-directory contains: -# 1. SYS.h - used by the automatically created *.s and custom files -# 2. custom.s - contains architecture specific additional system calls and -# auxilliary routines (like cerror) -# 3. special case double-underbar stub files - which are copied into -# the out-directory -# -# The BSDmakefile copies /usr/include/architecture/ppc/emode_independent_asm.h -# and /usr/include/architecture/i386/asm_help.h to $(OBJDIR)/include, -# replacing .globl with .private_extern. These headers, along with SYS.h -# make the double-underbar syscall stub private_extern, so that then become -# static in the resulting libSystem.dylib. 
-# -########################################################################## - -use strict; -use File::Basename (); -use File::Copy (); -use File::Spec; -use IO::File; - -my $MyName = File::Basename::basename($0); - -my @CustomSrc = qw(custom.s); - -my @Copy = (qw(SYS.h), @CustomSrc); -my $CustomDir; -my %NoStub; -my $OutDir; -my %Stub = ( - quota => [4, 0], # unimplemented - setquota => [2, 0], # unimplemented - syscall => [0, 0], # custom/__syscall.s will be used -); -my $StubFile = 'libsyscall.list'; -# size in bytes of known types (only used for i386) -my %TypeBytes = ( - 'au_asid_t' => 4, - 'caddr_t' => 4, - 'gid_t' => 4, - 'id_t' => 4, - 'idtype_t' => 4, - 'int' => 4, - 'int32_t' => 4, - 'int64_t' => 8, - 'key_t' => 4, - 'long' => 4, - 'mach_port_name_t' => 4, - 'mode_t' => 4, - 'off_t' => 8, - 'pid_t' => 4, - 'semun_t' => 4, - 'sigset_t' => 4, - 'size_t' => 4, - 'socklen_t' => 4, - 'ssize_t' => 4, - 'u_int' => 4, - 'u_long' => 4, - 'uid_t' => 4, - 'uint32_t' => 4, - 'uint64_t' => 8, - 'user_addr_t' => 4, - 'user_long_t' => 4, - 'user_size_t' => 4, - 'user_ssize_t' => 4, - 'user_ulong_t' => 4, -); - -########################################################################## -# Make a __xxx.s file: if it exists in the $CustomDir, just copy it, otherwise -# create one. We define the macro __SYSCALL_32BIT_ARG_BYTES so that SYS.h could -# use that to define __SYSCALL dependent on the arguments' total size. -########################################################################## -sub make_s { - my($name, $args, $bytes) = @_; - local $_; - my $pseudo = $name; - $pseudo = '__' . $pseudo unless $pseudo =~ /^__/; - my $file = $pseudo . 
'.s'; - my $custom = File::Spec->join($CustomDir, $file); - my $path = File::Spec->join($OutDir, $file); - if(-f $custom) { - File::Copy::copy($custom, $path) || die "$MyName: copy($custom, $path): $!\n"; - print "Copying $path\n"; - } else { - my $f = IO::File->new($path, 'w'); - die "$MyName: $path: $!\n" unless defined($f); - print $f "#define __SYSCALL_32BIT_ARG_BYTES $bytes\n\n"; - print $f "#include \"SYS.h\"\n\n"; - print $f "__SYSCALL($pseudo, $name, $args)\n"; - print "Creating $path\n"; - } - return $file; -} - -sub usage { - die "Usage: $MyName syscalls.master custom-directory out-directory\n"; -} - -########################################################################## -# Read the syscall.master file and collect the system call names and number -# of arguments. It looks for the NO_SYSCALL_STUB quailifier following the -# prototype to determine if no automatic stub should be created by Libsystem. -# System call name that are already prefixed with double-underbar are set as -# if the NO_SYSCALL_STUB qualifier were specified (whether it is or not). -# -# For the #if lines in syscall.master, all macros are assumed to be defined, -# except COMPAT_GETFSSTAT (assumed undefined). -########################################################################## -sub readmaster { - my $file = shift; - local $_; - my $f = IO::File->new($file, 'r'); - die "$MyName: $file: $!\n" unless defined($f); - my $line = 0; - my $skip = 0; - while(<$f>) { - $line++; - if(/^#\s*endif/) { - $skip = 0; - next; - } - if(/^#\s*else/) { - $skip = -$skip; - next; - } - chomp; - if(/^#\s*if\s+(\S+)$/) { - $skip = ($1 eq 'COMPAT_GETFSSTAT') ? 
-1 : 1; - next; - } - next if $skip < 0; - next unless /^\d/; - s/^[^{]*{\s*//; - s/\s*}.*$//; # } - die "$MyName: no function prototype on line $line\n" unless length($_) > 0 && /;$/; - my $no_syscall_stub = /\)\s*NO_SYSCALL_STUB\s*;/; - my($name, $args) = /\s(\S+)\s*\(([^)]*)\)/; - next if $name =~ /e?nosys/; - $args =~ s/^\s+//; - $args =~ s/\s+$//; - my $argbytes = 0; - my $nargs = 0; - if($args ne '' && $args ne 'void') { - my @a = split(',', $args); - $nargs = scalar(@a); - # Calculate the size of all the arguments (only used for i386) - for my $type (@a) { - $type =~ s/\s*\w+$//; # remove the argument name - if($type =~ /\*$/) { - $argbytes += 4; # a pointer type - } else { - $type =~ s/^.*\s//; # remove any type qualifier, like unsigned - my $b = $TypeBytes{$type}; - die "$MyName: $name: unknown type '$type'\n" unless defined($b); - $argbytes += $b; - } - } - } - if($no_syscall_stub || $name =~ /^__/) { - $NoStub{$name} = [$nargs, $argbytes]; - } else { - $Stub{$name} = [$nargs, $argbytes]; - } - } -} - -usage() unless scalar(@ARGV) == 3; -$CustomDir = $ARGV[1]; -die "$MyName: $CustomDir: No such directory\n" unless -d $CustomDir; -$OutDir = $ARGV[2]; -die "$MyName: $OutDir: No such directory\n" unless -d $OutDir; - -readmaster($ARGV[0]); - -########################################################################## -# copy the files specified in @Copy from the $CustomDir to $OutDir -########################################################################## -for(@Copy) { - my $custom = File::Spec->join($CustomDir, $_); - my $path = File::Spec->join($OutDir, $_); - File::Copy::copy($custom, $path) || die "$MyName: copy($custom, $path): $!\n"; -} - -########################################################################## -# make all the *.s files -########################################################################## -my @src; -my($k, $v); -while(($k, $v) = each(%Stub)) { - push(@src, make_s($k, @$v)); -} -while(($k, $v) = each(%NoStub)) { - push(@src, 
make_s($k, @$v)); -} - -########################################################################## -# create the Makefile.inc file from the list for files in @src and @CustomSrc -########################################################################## -my $path = File::Spec->join($OutDir, 'Makefile.inc'); -my $f = IO::File->new($path, 'w'); -die "$MyName: $path: $!\n" unless defined($f); -print $f ".PATH: $OutDir\n\n"; -print $f "SYSCALLSRCS= " . join(" \\\n\t", sort(@src, @CustomSrc)) . "\n\n"; -print $f "MDSRCS+= \$(SYSCALLSRCS)\n\n"; -print $f ".for S in \$(SYSCALLSRCS)\n"; -print $f "PRECFLAGS-\$(S)+= -I\$(OBJROOT)/include\n"; -print $f ".endfor\n"; -undef $f; - -########################################################################## -# create the libsyscall.list file for Libsystem to use. For the one that -# should not have auto-generated stubs, the line begins with #. -########################################################################## -$path = File::Spec->join($OutDir, $StubFile); -$f = IO::File->new($path, 'w'); -die "$MyName: $path: $!\n" unless defined($f); -# Add the %NoStub entries to %Stub, appending '#' to the name, so we can sort -while(($k, $v) = each(%NoStub)) { - $k =~ s/^__//; - $Stub{"$k#"} = $v; -} -for(sort(keys(%Stub))) { - $k = $_; - if($k =~ s/#$//) { - printf $f "#___%s\t%s\n", $k, $Stub{$_}->[0]; - } else { - printf $f "___%s\t%s\n", $_, $Stub{$_}->[0]; - } -} -undef $f; diff --git a/libsyscall/custom/SYS.h b/libsyscall/custom/SYS.h index a4eb976a2..a16f358d8 100644 --- a/libsyscall/custom/SYS.h +++ b/libsyscall/custom/SYS.h @@ -55,46 +55,7 @@ #define SYS_quota 149 #endif -#if defined(__ppc__) || defined(__ppc64__) - -#include - -/* - * Macros. 
- */ - -#define SYSCALL(name, nargs) \ - .globl cerror @\ - MI_ENTRY_POINT(_##name) @\ - li r0,SYS_##name @\ - sc @\ - b 1f @\ - blr @\ -1: MI_BRANCH_EXTERNAL(cerror) - - -#define SYSCALL_NONAME(name, nargs) \ - .globl cerror @\ - li r0,SYS_##name @\ - sc @\ - b 1f @\ - b 2f @\ -1: MI_BRANCH_EXTERNAL(cerror) @\ -2: - - -#define PSEUDO(pseudo, name, nargs) \ - .private_extern _##pseudo @\ - .text @\ - .align 2 @\ -_##pseudo: @\ - SYSCALL_NONAME(name, nargs) - -#define __SYSCALL(pseudo, name, nargs) \ - PSEUDO(pseudo, name, nargs) @\ - blr - -#elif defined(__i386__) +#if defined(__i386__) #include #include @@ -150,11 +111,11 @@ LEAF(_##name, 0) ;\ 2: #define PSEUDO(pseudo, name, nargs) \ -LEAF(_##pseudo, 0) ;\ +LEAF(pseudo, 0) ;\ UNIX_SYSCALL_NONAME(name, nargs) #define PSEUDO_INT(pseudo, name, nargs) \ -LEAF(_##pseudo, 0) ;\ +LEAF(pseudo, 0) ;\ UNIX_SYSCALL_INT_NONAME(name, nargs) #define __SYSCALL(pseudo, name, nargs) \ @@ -192,7 +153,7 @@ LEAF(_##name, 0) ;\ 2: #define PSEUDO(pseudo, name, nargs) \ -LEAF(_##pseudo, 0) ;\ +LEAF(pseudo, 0) ;\ UNIX_SYSCALL_NONAME(name, nargs) #define __SYSCALL(pseudo, name, nargs) \ diff --git a/libsyscall/custom/__fork.s b/libsyscall/custom/__fork.s index baff6eb82..2de3a9a28 100644 --- a/libsyscall/custom/__fork.s +++ b/libsyscall/custom/__fork.s @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2007 Apple Inc. All rights reserved. + * Copyright (c) 1999-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -36,104 +36,26 @@ * 11-Jan-92 Peter King (king@next.com) * Created from M68K sources */ - -#include "SYS.h" - -#if defined(__ppc__) || defined(__ppc64__) - -/* We use mode-independent "g" opcodes such as "srgi". These expand - * into word operations when targeting __ppc__, and into doubleword - * operations when targeting __ppc64__. 
- */ -#include - -MI_ENTRY_POINT(___fork) - MI_PUSH_STACK_FRAME - - MI_CALL_EXTERNAL(__cthread_fork_prepare) - - li r0,SYS_fork - sc // do the fork - b Lbotch // error return - - cmpwi r4,0 // parent (r4==0) or child (r4==1) ? - beq Lparent // parent, since r4==0 - - -/* Here if we are the child. */ - -#if defined(__DYNAMIC__) - .cstring -LC3: - .ascii "__dyld_fork_child\0" - .text - .align 2 - mflr r0 - bcl 20,31,1f -1: mflr r3 - mtlr r0 - addis r3,r3,ha16(LC3-1b) - addi r3,r3,lo16(LC3-1b) - addi r4,r1,SF_LOCAL1 - bl __dyld_func_lookup - lg r3,SF_LOCAL1(r1) - mtspr ctr,r3 - bctrl -#endif - - li r9,0 - MI_GET_ADDRESS(r8,__current_pid) - stw r9,0(r8) // clear cached pid in child - - MI_CALL_EXTERNAL(__cthread_fork_child) - - li r3,0 // flag for "we are the child" - b Lreturn - - -/* Here if we are the parent, with: - * r3 = child's pid + +/* + * All of the asm stubs in this file have been adjusted so the pre/post + * fork handlers and dyld fixup are done in C inside Libc. As such, Libc + * expects the __fork asm to fix up the return code to be -1, 0 or pid + * and errno if needed. */ -Lparent: - stg r3,SF_LOCAL2(r1) // save child pid in stack - - b Lparent_return // clean up and return child's pid - - -/* Here if the fork() syscall failed. We're still the parent. */ - -Lbotch: - MI_CALL_EXTERNAL(cerror) - li r3,-1 // get an error return code - stg r3,SF_LOCAL2(r1) // save return code in stack - - /* - * We use cthread_fork_parent() to clean up after a fork error - * (unlock cthreads and mailloc packages) so the parent - * process can Malloc() after fork() errors without - * deadlocking. 
- */ - -Lparent_return: - MI_CALL_EXTERNAL(__cthread_fork_parent) - lg r3,SF_LOCAL2(r1) // return -1 on error, child's pid on success - -Lreturn: - MI_POP_STACK_FRAME_AND_RETURN +#include "SYS.h" -#elif defined(__i386__) +#if defined(__i386__) LEAF(___fork, 0) subl $28, %esp // Align the stack, with 16 bytes of extra padding that we'll need - CALL_EXTERN(__cthread_fork_prepare) movl $ SYS_fork,%eax; // code for fork -> eax UNIX_SYSCALL_TRAP // do the system call jnc L1 // jump if CF==0 CALL_EXTERN(cerror) - CALL_EXTERN(__cthread_fork_parent) movl $-1,%eax addl $28, %esp // restore the stack ret @@ -143,60 +65,23 @@ L1: jz L2 // parent, since r1 == 0 in parent, 1 in child //child here... -#if defined(__DYNAMIC__) -// Here on the child side of the fork we need to tell the dynamic linker that -// we have forked. To do this we call __dyld_fork_child in the dyanmic -// linker. But since we can't dynamically bind anything until this is done we -// do this by using the private extern __dyld_func_lookup() function to get the -// address of __dyld_fork_child (the 'C' code equivlent): -// -// _dyld_func_lookup("__dyld_fork_child", &address); -// address(); -// -.cstring -LC0: - .ascii "__dyld_fork_child\0" - -.text - leal 0x8(%esp),%eax // get the address where we're going to store the pointer - movl %eax, 0x4(%esp) // copy the address of the pointer - call 1f -1: popl %eax - leal LC0-1b(%eax),%eax - movl %eax, 0x0(%esp) // copy the name of the function to look up - call __dyld_func_lookup - movl 0x8(%esp),%eax // move the value returned in address parameter - call *%eax // call __dyld_fork_child indirectly -#endif - xorl %eax, %eax - REG_TO_EXTERN(%eax, __current_pid) - CALL_EXTERN(__cthread_fork_child) - xorl %eax,%eax // zero eax - addl $28, %esp // restore the stack - ret - - //parent here... 
+ REG_TO_EXTERN(%eax, __current_pid); L2: - movl %eax, 0xc(%esp) // save pid - - CALL_EXTERN_AGAIN(__cthread_fork_parent) - movl 0xc(%esp), %eax // return pid addl $28, %esp // restore the stack - ret + // parent ends up here skipping child portion + ret #elif defined(__x86_64__) LEAF(___fork, 0) subq $24, %rsp // Align the stack, plus room for local storage - CALL_EXTERN(__cthread_fork_prepare) movl $ SYSCALL_CONSTRUCT_UNIX(SYS_fork),%eax; // code for fork -> rax UNIX_SYSCALL_TRAP // do the system call jnc L1 // jump if CF==0 CALL_EXTERN(cerror) - CALL_EXTERN(__cthread_fork_parent) movq $-1, %rax addq $24, %rsp // restore the stack ret @@ -206,42 +91,13 @@ L1: jz L2 // parent, since r1 == 0 in parent, 1 in child //child here... -#if defined(__DYNAMIC__) -// Here on the child side of the fork we need to tell the dynamic linker that -// we have forked. To do this we call __dyld_fork_child in the dyanmic -// linker. But since we can't dynamically bind anything until this is done we -// do this by using the private extern __dyld_func_lookup() function to get the -// address of __dyld_fork_child (the 'C' code equivlent): -// -// _dyld_func_lookup("__dyld_fork_child", &address); -// address(); -// -.cstring -LC0: - .ascii "__dyld_fork_child\0" - -.text - leaq 8(%rsp),%rsi // get the address where we're going to store the pointer - leaq LC0(%rip), %rdi // copy the name of the function to look up - call __dyld_func_lookup - call *8(%rsp) // call __dyld_fork_child indirectly -#endif xorq %rax, %rax - REG_TO_EXTERN(%rax, __current_pid) - CALL_EXTERN(__cthread_fork_child) - - xorq %rax,%rax // zero rax - addq $24, %rsp // restore the stack - ret - - //parent here... 
+ PICIFY(__current_pid) + movl %eax,(%r11) L2: - movl %eax, 16(%rsp) // save pid - - CALL_EXTERN_AGAIN(__cthread_fork_parent) - movl 16(%rsp), %eax // return pid + // parent ends up here skipping child portion addq $24, %rsp // restore the stack - ret + ret #else #error Unsupported architecture diff --git a/libsyscall/custom/__getpid.s b/libsyscall/custom/__getpid.s index 1299a1645..48c85313c 100644 --- a/libsyscall/custom/__getpid.s +++ b/libsyscall/custom/__getpid.s @@ -28,45 +28,7 @@ #include "SYS.h" -#if defined(__ppc__) || defined(__ppc64__) - - .data - .globl __current_pid - .align 2 -__current_pid: - .long 0 - -MI_ENTRY_POINT(___getpid) -#if defined(__DYNAMIC__) - mflr r0 // note we cannot use MI_GET_ADDRESS... - bcl 20,31,1f // ...because we define __current_pid -1: - mflr r5 - mtlr r0 - addis r5, r5, ha16(__current_pid - 1b) - addi r5, r5, lo16(__current_pid - 1b) -#else - lis r5,hi16(__current_pid) - ori r5,r5,lo16(__current_pid) -#endif - lwz r3,0(r5) // get the cached pid - cmpwi r3,0 // if positive, - bgtlr++ // return it - - SYSCALL_NONAME(getpid, 0) - - lwarx r4,0,r5 // see if we can cache it - cmpwi r4,0 // we can't if there are any... - blt-- 1f // ...vforks in progress - - stwcx. r3,0,r5 // ignore cache conflicts - blr -1: - li r6,-4 // on 970, cancel the reservation using red zone... - stwcx. r3,r6,r1 // ...to avoid an errata - blr - -#elif defined(__i386__) +#if defined(__i386__) .data .private_extern __current_pid diff --git a/libsyscall/custom/__gettimeofday.s b/libsyscall/custom/__gettimeofday.s index c43ee761e..1dbf19c77 100644 --- a/libsyscall/custom/__gettimeofday.s +++ b/libsyscall/custom/__gettimeofday.s @@ -29,21 +29,7 @@ #include "SYS.h" -#if defined(__ppc__) || defined(__ppc64__) - -/* This syscall is special cased: the timeval is returned in r3/r4. - * Note also that the "seconds" field of the timeval is a long, so - * it's size is mode dependent. 
- */ -MI_ENTRY_POINT(___gettimeofday) - mr r12,r3 // save ptr to timeval - SYSCALL_NONAME(gettimeofday,0) - stg r3,0(r12) // "stw" in 32-bit mode, "std" in 64-bit mode - stw r4,GPR_BYTES(r12) - li r3,0 - blr - -#elif defined(__i386__) +#if defined(__i386__) /* * This syscall is special cased: the timeval is returned in eax/edx. diff --git a/libsyscall/custom/__lseek.s b/libsyscall/custom/__lseek.s index 909443b17..b051cc5a4 100644 --- a/libsyscall/custom/__lseek.s +++ b/libsyscall/custom/__lseek.s @@ -28,13 +28,13 @@ #include "SYS.h" -#if defined(__ppc__) || defined(__ppc64__) || defined(__x86_64__) +#if defined(__x86_64__) -__SYSCALL(__lseek, lseek, 3) +__SYSCALL(___lseek, lseek, 3) #elif defined(__i386__) -__SYSCALL_INT(__lseek, lseek, 3) +__SYSCALL_INT(___lseek, lseek, 3) #else #error Unsupported architecture diff --git a/libsyscall/custom/__pipe.s b/libsyscall/custom/__pipe.s index 107a37799..0131d476d 100644 --- a/libsyscall/custom/__pipe.s +++ b/libsyscall/custom/__pipe.s @@ -29,19 +29,9 @@ #include "SYS.h" -#if defined(__ppc__) || defined(__ppc64__) +#if defined(__i386__) -MI_ENTRY_POINT(___pipe) - mr r12,r3 // save fildes across syscall - SYSCALL_NONAME(pipe, 0) - stw r3,0(r12) - stw r4,4(r12) - li r3,0 - blr - -#elif defined(__i386__) - -PSEUDO_INT(__pipe, pipe, 0) +PSEUDO_INT(___pipe, pipe, 0) movl 4(%esp),%ecx movl %eax,(%ecx) movl %edx,4(%ecx) @@ -50,7 +40,7 @@ PSEUDO_INT(__pipe, pipe, 0) #elif defined(__x86_64__) -PSEUDO(__pipe, pipe, 0) +PSEUDO(___pipe, pipe, 0) movl %eax, (%rdi) movl %edx, 4(%rdi) xorl %eax, %eax diff --git a/libsyscall/custom/__psynch_cvbroad.s b/libsyscall/custom/__psynch_cvbroad.s index 86d9d8024..037fcfc07 100644 --- a/libsyscall/custom/__psynch_cvbroad.s +++ b/libsyscall/custom/__psynch_cvbroad.s @@ -31,9 +31,9 @@ #define __SYSCALL_32BIT_ARG_BYTES 36 -#if defined(__i386__) || defined(__x86_64__) || defined(__ppc__) +#if defined(__i386__) || defined(__x86_64__) -__SYSCALL(__psynch_cvbroad, psynch_cvbroad, 8) 
+__SYSCALL(___psynch_cvbroad, psynch_cvbroad, 8) #else #error Unsupported architecture diff --git a/libsyscall/custom/__psynch_cvwait.s b/libsyscall/custom/__psynch_cvwait.s index f29bceab4..c5d69ce8c 100644 --- a/libsyscall/custom/__psynch_cvwait.s +++ b/libsyscall/custom/__psynch_cvwait.s @@ -31,9 +31,9 @@ #define __SYSCALL_32BIT_ARG_BYTES 40 -#if defined(__i386__) || defined(__x86_64__) || defined(__ppc__) +#if defined(__i386__) || defined(__x86_64__) -__SYSCALL(__psynch_cvwait, psynch_cvwait, 8) +__SYSCALL(___psynch_cvwait, psynch_cvwait, 8) #else #error Unsupported architecture diff --git a/libsyscall/custom/__ptrace.s b/libsyscall/custom/__ptrace.s index 2fd53b460..9eae221f2 100644 --- a/libsyscall/custom/__ptrace.s +++ b/libsyscall/custom/__ptrace.s @@ -29,16 +29,7 @@ #include "SYS.h" -#if defined(__ppc__) || defined(__ppc64__) - -MI_ENTRY_POINT(___ptrace) - li r7,0 - MI_GET_ADDRESS(r8,_errno) - stw r7,0(r8) - SYSCALL_NONAME(ptrace, 4) - blr - -#elif defined(__i386__) +#if defined(__i386__) .globl _errno @@ -54,7 +45,8 @@ UNIX_SYSCALL_NONAME(ptrace, 4) LEAF(___ptrace, 0) xorq %rax,%rax - REG_TO_EXTERN(%rax,_errno) + PICIFY(_errno) + movl %eax,(%r11) UNIX_SYSCALL_NONAME(ptrace, 4) ret diff --git a/libsyscall/custom/__sigaltstack.s b/libsyscall/custom/__sigaltstack.s index 514822ba2..d5f1803ff 100644 --- a/libsyscall/custom/__sigaltstack.s +++ b/libsyscall/custom/__sigaltstack.s @@ -28,13 +28,13 @@ #include "SYS.h" -#if defined(__ppc__) || defined(__ppc64__) || defined(__x86_64__) +#if defined(__x86_64__) -__SYSCALL(__sigaltstack, sigaltstack, 3) +__SYSCALL(___sigaltstack, sigaltstack, 3) #elif defined(__i386__) -__SYSCALL_INT(__sigaltstack, sigaltstack, 3) +__SYSCALL_INT(___sigaltstack, sigaltstack, 3) #else #error Unsupported architecture diff --git a/libsyscall/custom/__sigreturn.s b/libsyscall/custom/__sigreturn.s index 776351abb..16d5be4fc 100644 --- a/libsyscall/custom/__sigreturn.s +++ b/libsyscall/custom/__sigreturn.s @@ -28,13 +28,13 @@ #include 
"SYS.h" -#if defined(__ppc__) || defined(__ppc64__) || defined(__x86_64__) +#if defined(__x86_64__) -__SYSCALL(__sigreturn, sigreturn, 2) +__SYSCALL(___sigreturn, sigreturn, 2) #elif defined(__i386__) -__SYSCALL_INT(__sigreturn, sigreturn, 2) +__SYSCALL_INT(___sigreturn, sigreturn, 2) #else #error Unsupported architecture diff --git a/libsyscall/custom/__syscall.s b/libsyscall/custom/__syscall.s index dae18a831..73735bd4b 100644 --- a/libsyscall/custom/__syscall.s +++ b/libsyscall/custom/__syscall.s @@ -29,11 +29,7 @@ #include "SYS.h" -#if defined(__ppc__) || defined(__ppc64__) - -__SYSCALL(__syscall, syscall, 7) - -#elif defined(__i386__) +#if defined(__i386__) LEAF(___syscall, 0) popl %ecx // ret addr @@ -52,7 +48,7 @@ END(___syscall) // For x86-64, the kernel slides the argument list for us. // The number of arguments here is variable, but our macros ignore // that value anyway. -__SYSCALL(__syscall, syscall, 0); +__SYSCALL(___syscall, syscall, 0); #else #error Unsupported architecture diff --git a/libsyscall/custom/__thread_selfid.s b/libsyscall/custom/__thread_selfid.s index 5e70787cf..2c4dd934c 100644 --- a/libsyscall/custom/__thread_selfid.s +++ b/libsyscall/custom/__thread_selfid.s @@ -30,10 +30,10 @@ #if defined(__x86_64__) -__SYSCALL(__thread_selfid, thread_selfid, 1) +__SYSCALL(___thread_selfid, thread_selfid, 1) #elif defined(__i386__) -__SYSCALL_INT(__thread_selfid, thread_selfid, 1) +__SYSCALL_INT(___thread_selfid, thread_selfid, 1) #endif diff --git a/libsyscall/custom/__vfork.s b/libsyscall/custom/__vfork.s index 073b90840..91408f9c3 100644 --- a/libsyscall/custom/__vfork.s +++ b/libsyscall/custom/__vfork.s @@ -37,56 +37,7 @@ #include "SYS.h" -#if defined(__ppc__) || defined(__ppc64__) - -/* We use mode-independent "g" opcodes such as "srgi", and/or - * mode-independent macros such as MI_GET_ADDRESS. These expand - * into word operations when targeting __ppc__, and into doubleword - * operations when targeting __ppc64__. 
- */ -#include - -/* In vfork(), the child runs in parent's address space. */ - - -MI_ENTRY_POINT(___vfork) - MI_GET_ADDRESS(r5,__current_pid) // get address of __current_pid in r5 -2: - lwarx r6,0,r5 // don't cache pid across vfork - cmpwi r6,0 - ble-- 3f // is another vfork in progress - li r6,0 // if not, erase the stored pid -3: - addi r6,r6,-1 // count the parallel vforks in - stwcx. r6,0,r5 // negative cached pid values - bne-- 2b - - li r0,SYS_vfork - sc - b Lbotch // error return - - cmpwi r4,0 - beq Lparent // parent, since a1 == 0 in parent, - - li r3,0 // child - blr - -Lparent: // r3 == child's pid - lwarx r6,0,r5 // we're back, decrement vfork count - addi r6,r6,1 - stwcx. r6,0,r5 - bne-- Lparent - blr // return pid - -Lbotch: - lwarx r6,0,r5 // never went, decrement vfork count - addi r6,r6,1 - stwcx. r6,0,r5 - bne-- Lbotch - - MI_BRANCH_EXTERNAL(cerror) - -#elif defined(__i386__) +#if defined(__i386__) #if defined(__DYNAMIC__) #define GET_CURRENT_PID PICIFY(__current_pid) diff --git a/libsyscall/custom/custom.s b/libsyscall/custom/custom.s index 5f34a7434..b9d46ba13 100644 --- a/libsyscall/custom/custom.s +++ b/libsyscall/custom/custom.s @@ -30,35 +30,7 @@ #include "SYS.h" -#if defined(__ppc__) || defined(__ppc64__) - -/* We use mode-independent "g" opcodes such as "srgi", and/or - * mode-independent macros such as MI_GET_ADDRESS. These expand - * into word operations when targeting __ppc__, and into doubleword - * operations when targeting __ppc64__. 
- */ -#include - - .globl _errno - -MI_ENTRY_POINT(cerror) - MI_PUSH_STACK_FRAME - MI_GET_ADDRESS(r12,_errno) - stw r3,0(r12) /* save syscall return code in global */ - MI_CALL_EXTERNAL(_cthread_set_errno_self) - li r3,-1 /* then bug return value */ - li r4,-1 /* in case we're returning a long-long in 32-bit mode, etc */ - MI_POP_STACK_FRAME_AND_RETURN - - - .globl _processor_facilities_used - .align 2 -_processor_facilities_used: - li r0,0x7FF3 - sc - blr - -#elif defined(__i386__) +#if defined(__i386__) .globl _errno @@ -75,9 +47,7 @@ LABEL(cerror) movl $-1,%edx /* in case a 64-bit value is returned */ ret - .private_extern __sysenter_trap - ALIGN -__sysenter_trap: +LABEL(__sysenter_trap) popl %edx movl %esp, %ecx sysenter @@ -87,8 +57,9 @@ __sysenter_trap: .globl _errno LABEL(cerror) - REG_TO_EXTERN(%rax, _errno) - mov %rsp,%rdx + PICIFY(_errno) /* address -> %r11 */ + movl %eax,(%r11) + mov %rsp,%rdx andq $-16,%rsp subq $16,%rsp // Preserve the original stack diff --git a/osfmk/chud/ppc/chud_xnu_glue.h b/libsyscall/custom/errno.c similarity index 95% rename from osfmk/chud/ppc/chud_xnu_glue.h rename to libsyscall/custom/errno.c index 7145052d0..58da2c114 100644 --- a/osfmk/chud/ppc/chud_xnu_glue.h +++ b/libsyscall/custom/errno.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2008 Apple Inc. All rights reserved. + * Copyright (c) 2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -26,3 +26,4 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +int errno; diff --git a/libsyscall/include/Makefile.inc b/libsyscall/include/Makefile.inc deleted file mode 100644 index 7bf41dc30..000000000 --- a/libsyscall/include/Makefile.inc +++ /dev/null @@ -1 +0,0 @@ -PRIVHDRSPPCHDRS += ${.CURDIR}/include/processor_facilities.h diff --git a/libsyscall/include/processor_facilities.h b/libsyscall/include/processor_facilities.h deleted file mode 100644 index 7ba9747bc..000000000 --- a/libsyscall/include/processor_facilities.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* Does the current CPU have Altivec support? */ -extern int _cpu_has_altivec; - -/* What processor facilities is the current thread using? */ -#define floatUsed 0x40000000 -#define vectorUsed 0x20000000 - -extern int processor_facilities_used(void); diff --git a/libsyscall/mach/Makefile.inc b/libsyscall/mach/Makefile.inc deleted file mode 100644 index 516300d2a..000000000 --- a/libsyscall/mach/Makefile.inc +++ /dev/null @@ -1,74 +0,0 @@ -# machine-dependent mach sources -ARCHDIR = ${MACHINE_ARCH:C/^armv.*$/arm/} -.if exists(${.CURDIR}/mach/${ARCHDIR}/Makefile.inc) -.include "${.CURDIR}/mach/${ARCHDIR}/Makefile.inc" -.endif - -.PATH: ${.CURDIR}/mach - -.include "${.CURDIR}/mach/headers/Makefile.inc" -.include "${.CURDIR}/mach/servers/Makefile.inc" - -MD_MIGDEFS += task.defs \ - thread_act.defs - -MD_MIGHDRS += ${MD_MIGDEFS:.defs=.h} - -MIGDEFS += \ - clock.defs \ - clock_priv.defs \ - exc.defs \ - host_priv.defs \ - host_security.defs \ - ledger.defs \ - lock_set.defs \ - mach_port.defs \ - mach_host.defs \ - mach_vm.defs \ - processor.defs \ - processor_set.defs \ - vm_map.defs - -MIGHDRS = ${MIGDEFS:.defs=.h} -MIGHDRS += clock_reply.h -MACH_INSTHDRS += ${MIGHDRS} - -# These files are generated from the .defs files -MIGSRCS = ${MIGDEFS:.defs=User.c} ${MD_MIGDEFS:.defs=User.c} - -MISRCS += ${MIGSRCS} \ - bootstrap_ports.c \ - clock_sleep.c \ - error_codes.c \ - excServer.c \ - excUser.c \ - exc_catcher.c \ - exc_catcher_state.c \ - exc_catcher_state_identity.c \ - fprintf_stderr.c \ - mig_allocate.c \ - mig_deallocate.c \ - mig_reply_setup.c \ - mig_strncpy.c \ - mach_error.c \ - mach_error_string.c \ - mach_init.c \ - mach_init_libSystem.c \ - mach_init_ports.c \ - mach_msg.c \ - mach_traps.s \ - ms_thread_switch.c \ - notifyUser.c \ - panic.c \ - port_obj.c \ - sbrk.c \ - semaphore.c \ - slot_name.c - -CLEANFILES += ${MIGHDRS} ${MIGSRCS} ${MD_MIGDEFS:.defs=Server.c} \ - ${MIGDEFS:.defs=Server.c} exc.h 
excUser.c excServer.c \ - notify.h notifyUser.c notifyServer.c - -MAN2 += brk.2 - -MLINKS += brk.2 sbrk.2 diff --git a/osfmk/ppc/hardclock_entries.h b/libsyscall/mach/abort.h similarity index 83% rename from osfmk/ppc/hardclock_entries.h rename to libsyscall/mach/abort.h index 3a804ea9e..7b99c1cf0 100644 --- a/osfmk/ppc/hardclock_entries.h +++ b/libsyscall/mach/abort.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,13 +25,12 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - * @OSF_COPYRIGHT@ - */ -#ifndef _HARDCLOCK_ENTRIES_H_ -#define _HARDCLOCK_ENTRIES_H_ +// Defined because we don't have Libc +#define __SIGABRT 6 +#define __STDERR_FILENO 2 -extern void hardclock(struct ppc_saved_state*); +int __getpid(void); +int __kill(int pid, int signum, int posix); -#endif /* _HARDCLOCK_ENTRIES_H_ */ +#define abort() __kill(__getpid(), __SIGABRT, 0) diff --git a/libsyscall/mach/bootstrap_ports.c b/libsyscall/mach/bootstrap_ports.c deleted file mode 100644 index 33399332b..000000000 --- a/libsyscall/mach/bootstrap_ports.c +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) 2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. 
- * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ - -#include - -/* - * Stub out the old bootstrap_ports() API, as some applications need - * it to exist. We do not publish a prototype for this, and the stub - * WILL disappear in a future release. 
- */ -kern_return_t -bootstrap_ports( - mach_port_t bootstrap, - mach_port_t *priv_host, - mach_port_t *device_master, - mach_port_t *wired_ledger, - mach_port_t *paged_ledger, - mach_port_t *host_security) -{ - return KERN_FAILURE; -} - diff --git a/libsyscall/mach/brk.2 b/libsyscall/mach/brk.2 deleted file mode 100644 index 9ea4f61c2..000000000 --- a/libsyscall/mach/brk.2 +++ /dev/null @@ -1,150 +0,0 @@ -.\" $NetBSD: brk.2,v 1.7 1995/02/27 12:31:57 cgd Exp $ -.\" -.\" Copyright (c) 1980, 1991, 1993 -.\" The Regents of the University of California. All rights reserved. -.\" -.\" Redistribution and use in source and binary forms, with or without -.\" modification, are permitted provided that the following conditions -.\" are met: -.\" 1. Redistributions of source code must retain the above copyright -.\" notice, this list of conditions and the following disclaimer. -.\" 2. Redistributions in binary form must reproduce the above copyright -.\" notice, this list of conditions and the following disclaimer in the -.\" documentation and/or other materials provided with the distribution. -.\" 3. All advertising materials mentioning features or use of this software -.\" must display the following acknowledgement: -.\" This product includes software developed by the University of -.\" California, Berkeley and its contributors. -.\" 4. Neither the name of the University nor the names of its contributors -.\" may be used to endorse or promote products derived from this software -.\" without specific prior written permission. -.\" -.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND -.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -.\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -.\" SUCH DAMAGE. -.\" -.\" @(#)brk.2 8.2 (Berkeley) 12/11/93 -.\" -.Dd December 11, 1993 -.Dt BRK 2 -.Os BSD 4 -.Sh NAME -.Nm brk , -.Nm sbrk -.Nd change data segment size -.Sh SYNOPSIS -.Fd #include -.Ft void * -.Fn brk "const void *addr" -.Ft void * -.Fn sbrk "int incr" -.Sh DESCRIPTION -.Bf -symbolic -The brk and sbrk functions are historical curiosities -left over from earlier days before the advent of virtual memory management. -.Ef -The -.Fn brk -function -sets the break or lowest address -of a process's data segment (uninitialized data) to -.Fa addr -(immediately above bss). -Data addressing is restricted between -.Fa addr -and the lowest stack pointer to the stack segment. -Memory is allocated by -.Fa brk -in page size pieces; -if -.Fa addr -is not evenly divisible by the system page size, it is -increased to the next page boundary. -.Pp -.\" The -.\" .Nm sbrk -.\" function -.\" allocates chunks of -.\" .Fa incr -.\" bytes -.\" to the process's data space -.\" and returns an address pointer. -.\" The -.\" .Xr malloc 3 -.\" function utilizes -.\" .Nm sbrk . -.\" .Pp -The current value of the program break is reliably returned by -.Dq Li sbrk(0) -(see also -.Xr end 3 ) . -The -.Xr getrlimit 2 -system call may be used to determine -the maximum permissible size of the -.Em data -segment; -it will not be possible to set the break -beyond the -.Em rlim_max -value returned from a call to -.Xr getrlimit , -e.g. -.Dq qetext + rlp\(->rlim_max. 
-(see -.Xr end 3 -for the definition of -.Em etext ) . -.Sh RETURN VALUES -.Nm Brk -returns a pointer to the new end of memory if successful; -otherwise -1 with -.Va errno -set to indicate why the allocation failed. -The -.Nm sbrk -function returns a pointer to the base of the new storage if successful; -otherwise -1 with -.Va errno -set to indicate why the allocation failed. -.Sh ERRORS -.Xr Sbrk -will fail and no additional memory will be allocated if -one of the following are true: -.Bl -tag -width Er -.It Bq Er ENOMEM -The limit, as set by -.Xr setrlimit 2 , -was exceeded. -.It Bq Er ENOMEM -The maximum possible size of a data segment (compiled into the -system) was exceeded. -.It Bq Er ENOMEM -Insufficient space existed in the swap area -to support the expansion. -.El -.Sh SEE ALSO -.Xr execve 2 , -.Xr getrlimit 2 , -.Xr malloc 3 , -.Xr mmap 2 , -.Xr end 3 -.Sh BUGS -Setting the break may fail due to a temporary lack of -swap space. It is not possible to distinguish this -from a failure caused by exceeding the maximum size of -the data segment without consulting -.Xr getrlimit . -.Sh HISTORY -A -.Fn brk -function call appeared in -.At v7 . diff --git a/libsyscall/mach/clock_sleep.c b/libsyscall/mach/clock_sleep.c index 6470f2713..dbcca39d2 100644 --- a/libsyscall/mach/clock_sleep.c +++ b/libsyscall/mach/clock_sleep.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1999-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -29,10 +29,11 @@ #include #include -kern_return_t clock_sleep(mach_port_t clock_name, - sleep_type_t clock_type, - mach_timespec_t sleep_time, - mach_timespec_t *wake_time) { - +kern_return_t +clock_sleep(mach_port_t clock_name, + sleep_type_t clock_type, + mach_timespec_t sleep_time, + mach_timespec_t *wake_time) +{ return clock_sleep_trap(clock_name, clock_type, sleep_time.tv_sec, sleep_time.tv_nsec, wake_time); } diff --git a/iokit/Kernel/ppc/IOSharedLock.s b/libsyscall/mach/dylib_link.c similarity index 90% rename from iokit/Kernel/ppc/IOSharedLock.s rename to libsyscall/mach/dylib_link.c index 69183e016..5aa27f230 100644 --- a/iokit/Kernel/ppc/IOSharedLock.s +++ b/libsyscall/mach/dylib_link.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,5 +25,5 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#include +/* This empty file is here to force the dylib target to actually link */ diff --git a/libsyscall/mach/err_iokit.sub b/libsyscall/mach/err_iokit.sub index 02e657aa8..b5361b8be 100755 --- a/libsyscall/mach/err_iokit.sub +++ b/libsyscall/mach/err_iokit.sub @@ -34,12 +34,12 @@ #include #endif -static struct error_sparse_map err_codes_iokit_common_map[] = { +static const struct error_sparse_map err_codes_iokit_common_map[] = { err_code_map_entry(kIOReturnInvalid, kIOReturnInvalid ), err_code_map_entry(kIOReturnError, kIOReturnNotFound), }; -static const char * err_codes_iokit_common[] = { +static const char * const err_codes_iokit_common[] = { "(iokit/common) invalid - should never be seen", // 0x001 "(iokit/common) general error", // 0x2bc @@ -98,7 +98,7 @@ static const char * err_codes_iokit_common[] = { }; #if !TARGET_OS_EMBEDDED -static struct error_sparse_map err_codes_iokit_usb_map[] = { +static const struct error_sparse_map err_codes_iokit_usb_map[] = { 
err_code_map_entry(kIOUSBCRCErr, kIOUSBDataToggleErr), err_code_map_entry(kIOUSBPIDCheckErr, kIOUSBWrongPIDErr), err_code_map_entry(kIOUSBReserved1Err, kIOUSBLinkErr), @@ -108,7 +108,7 @@ static struct error_sparse_map err_codes_iokit_usb_map[] = { }; // error codes with in 0xe0004000 -static const char * err_codes_iokit_usb[] = { +static const char * const err_codes_iokit_usb[] = { "(iokit/usb) USB Controller Error: bad CRC received", // 0x001 "(iokit/usb) USB Controller Error: bitstuffing", // 0x002 "(iokit/usb) USB Controller Error: Bad data toggle", // 0x003 @@ -144,14 +144,14 @@ static const char * err_codes_iokit_usb[] = { "(iokit/usb) pipe ref not recognized", // 0x061 }; -static struct error_sparse_map err_codes_iokit_fw_map[] = { +static const struct error_sparse_map err_codes_iokit_fw_map[] = { err_code_map_entry(kIOConfigNoEntry, kIOFireWireBusReset), err_code_map_entry(kIOFireWireBogusDCLProgram, kIOFireWireCompleting), err_code_map_entry(kIOFWMessageServiceIsRequestingClose, kIOFWMessageTopologyChanged), }; // error codes with in 0xe0008000 -static const char * err_codes_iokit_fw[] = { +static const char * const err_codes_iokit_fw[] = { "(iokit/firewire) can't find requested entry in config ROM", // 001 "(iokit/firewire) command pending (internal)", // 002 "(iokit/firewire) DCL callback is final callback (internal)", // 003 @@ -196,7 +196,7 @@ static const char * err_codes_iokit_fw[] = { }; // error codes with in 0xe0020000 -static const char * err_codes_iokit_bluetooth[] = { +static const char * const err_codes_iokit_bluetooth[] = { "(iokit/bluetooth) unknown error", // 000 "(iokit/bluetooth) interrupted operation, hardware reset", // 001 "(iokit/bluetooth) connection to device already exists", // 002 @@ -211,7 +211,7 @@ static const struct error_sparse_map err_iokit_sub_map[] = { }; #define err_iokit_null_sub { "(iokit/?", 0 } -static struct error_subsystem err_iokit_sub[] = +static const struct error_subsystem err_iokit_sub[] = { /* 0 */ { 
"(iokit/common)", // 0xe0000000 diff --git a/libsyscall/mach/err_ipc.sub b/libsyscall/mach/err_ipc.sub index f5c0905f4..35cbce0c6 100644 --- a/libsyscall/mach/err_ipc.sub +++ b/libsyscall/mach/err_ipc.sub @@ -36,7 +36,7 @@ * Definitions of error strings for original IPC */ -static const char * err_codes_send[] = { +static const char * const err_codes_send[] = { "(ipc/send) unknown error", /* -100 */ "(ipc/send) invalid memory", /* -101 */ "(ipc/send) invalid port", /* -102 */ @@ -51,7 +51,7 @@ static const char * err_codes_send[] = { "(ipc/send) message size changed while being copied", /* -111 */ }; -static const char * err_codes_rcv[] = { +static const char * const err_codes_rcv[] = { "(ipc/rcv) unknown error", /* -200 */ "(ipc/rcv) invalid memory", /* -201 */ "(ipc/rcv) invalid port", /* -202 */ @@ -63,7 +63,7 @@ static const char * err_codes_rcv[] = { "(ipc/rcv) port receiver changed or port became enabled", /* -208 */ }; -static const char * err_codes_mig[] = { +static const char * const err_codes_mig[] = { "(ipc/mig) type check failure in message interface", /* 0 (-300) */ "(ipc/mig) wrong return message ID", /* 1 */ "(ipc/mig) server detected error", /* 2 */ diff --git a/libsyscall/mach/err_kern.sub b/libsyscall/mach/err_kern.sub index f00943599..bc059a5dd 100644 --- a/libsyscall/mach/err_kern.sub +++ b/libsyscall/mach/err_kern.sub @@ -36,7 +36,7 @@ * error codes for Mach and Unix kernels */ -static const char * err_codes_kern[] = { +static const char * const err_codes_kern[] = { "(os/kern) successful", /* 0 */ "(os/kern) invalid address", "(os/kern) protection failure", @@ -87,7 +87,7 @@ static const char * err_codes_kern[] = { "(os/kern) remote node down", }; -static const char * err_codes_unix[] = { +static const char * const err_codes_unix[] = { NO_SUCH_ERROR, "(os/unix) no rights to object", "(os/unix) file or directory does not exist", diff --git a/libsyscall/mach/err_libkern.sub b/libsyscall/mach/err_libkern.sub index a9a9c27c2..f419d04fa 100644 --- 
a/libsyscall/mach/err_libkern.sub +++ b/libsyscall/mach/err_libkern.sub @@ -34,14 +34,14 @@ /* These codes are specified in decimal in OSReturn.h. */ -static const char * err_codes_libkern_common[] = { +static const char * const err_codes_libkern_common[] = { NO_SUCH_ERROR, "(libkern/common) general/unspecified error", /* 1 */ }; /* These codes are specified in decimal in OSReturn.h. */ -static const char * err_codes_libkern_metaclass[] = { +static const char * const err_codes_libkern_metaclass[] = { NO_SUCH_ERROR, "(libkern/metaclass) runtime internal error", /* 1 */ "(libkern/metaclass) class has instances", /* 2 */ @@ -58,7 +58,7 @@ static const char * err_codes_libkern_metaclass[] = { /* These codes are specified in hexadecimal in OSKextLib.h. */ -static const char * err_codes_libkern_kext[] = { +static const char * const err_codes_libkern_kext[] = { NO_SUCH_ERROR, "(libkern/kext) internal error", /* 0x1 */ "(libkern/kext) allocation failure", /* 0x2 */ diff --git a/libsyscall/mach/err_mach_ipc.sub b/libsyscall/mach/err_mach_ipc.sub index c9e6a79c9..d1e542fae 100644 --- a/libsyscall/mach/err_mach_ipc.sub +++ b/libsyscall/mach/err_mach_ipc.sub @@ -36,7 +36,7 @@ * Error string definitions for the new Mach IPC */ -static const char * err_codes_mach_send[] = { +static const char * const err_codes_mach_send[] = { /* 0 */ "(ipc/send) no error", /* 1 */ "(ipc/send) send in progress", /* 2 */ "(ipc/send) invalid data", @@ -61,7 +61,7 @@ static const char * err_codes_mach_send[] = { /* 21 */ "(ipc/send) out-of-line buffer too large", }; -static const char * err_codes_mach_rcv[] = { +static const char * const err_codes_mach_rcv[] = { /* 0 */ "(ipc/rcv) no error", /* 1 */ "(ipc/rcv) receive in progress", /* 2 */ "(ipc/rcv) invalid name", @@ -81,7 +81,7 @@ static const char * err_codes_mach_rcv[] = { /* 16 */ "(ipc/rcv) DIPC transport error", }; -static const char * err_codes_mach_mig[] = { +static const char * const err_codes_mach_mig[] = { /* 0 */ "(ipc/mig) client type 
check failure", /* 1 */ "(ipc/mig) wrong reply message ID", /* 2 */ "(ipc/mig) server detected error", diff --git a/libsyscall/mach/err_server.sub b/libsyscall/mach/err_server.sub index acac59a0e..3fed18fe1 100644 --- a/libsyscall/mach/err_server.sub +++ b/libsyscall/mach/err_server.sub @@ -36,13 +36,13 @@ * Definitions of Servers error strings */ -static const char * err_codes_netname[] = { /* 0 */ +static const char * const err_codes_netname[] = { /* 0 */ "(server/netname) name is not yours", "(server/netname) name not checked in", "(server/netname) no such host", "(server/netname) host not found", }; -static const char * err_codes_env_mgr[] = { /* 1 */ +static const char * const err_codes_env_mgr[] = { /* 1 */ NO_SUCH_ERROR, "(server/env_mgr) variable not found", "(server/env_mgr) wrong type of variable", @@ -52,23 +52,23 @@ static const char * err_codes_env_mgr[] = { /* 1 */ "(server/env_mgr) port table full", "(server/env_mgr) attempting to enter a null port ", }; -static const char * err_codes_execd[] = { /* 2 */ +static const char * const err_codes_execd[] = { /* 2 */ NO_SUCH_ERROR, "(server/execd) could not find file to run", "(server/execd) userid or password incorrect", "(server/execd) fork failed", }; -static const char * err_codes_netmemory[] = { /* 3 */ +static const char * const err_codes_netmemory[] = { /* 3 */ "(server/netmemory) successful", "(server/netmemory) invalid argument", "(server/netmemory) resource shortage", }; -static const char * err_codes_ufs[] = { /* 4 */ +static const char * const err_codes_ufs[] = { /* 4 */ NO_SUCH_ERROR, /* XXX "(server/ufs) invalid port", */ }; -static const char * err_codes_task_master[] = { /* 5 */ +static const char * const err_codes_task_master[] = { /* 5 */ "(server/task_master) GENERIC ERROR", "(server/task_master) invalid tm_task port", "(server/task_master) invalid task id", @@ -77,7 +77,7 @@ static const char * err_codes_task_master[] = { /* 5 */ "(server/task_master) invalid action", }; -static const 
char * err_codes_ns[] = { /* 6 */ +static const char * const err_codes_ns[] = { /* 6 */ "(server/ns) GENERIC ERROR", "(server/ns) invalid handle", "(server/ns) name not found", @@ -98,7 +98,7 @@ static const char * err_codes_ns[] = { /* 6 */ "(server/ns) entry not reserved", }; -static const char * err_codes_io[] = { /* 7 */ +static const char * const err_codes_io[] = { /* 7 */ "(server/io) GENERIC ERROR", "(server/io) invalid offset", "(server/io) invalid size", @@ -107,7 +107,7 @@ static const char * err_codes_io[] = { /* 7 */ "(server/io) operation rejected under current I/O strategy", }; -static const char * err_codes_auth[] = { /* 8 */ +static const char * const err_codes_auth[] = { /* 8 */ "(server/auth) GENERIC ERROR", "(server/auth) bad private port", "(server/auth) bad name", @@ -119,7 +119,7 @@ static const char * err_codes_auth[] = { /* 8 */ "(server/auth) not secondary", }; -static const char * err_codes_us[] = { /* 9 */ +static const char * const err_codes_us[] = { /* 9 */ "(server/us) GENERIC ERROR", "(server/us) unknown error", "(server/us) object not found", @@ -146,7 +146,7 @@ static const char * err_codes_us[] = { /* 9 */ "(server/us) internal error", }; -static const char * err_codes_sunrpc[] = { /* 10 */ +static const char * const err_codes_sunrpc[] = { /* 10 */ "(server/sunrpc) GENERIC ERROR", "(server/sunrpc) cannot encode arguments", "(server/sunrpc) cannot decode results", @@ -167,7 +167,7 @@ static const char * err_codes_sunrpc[] = { /* 10 */ "(server/sunrpc) unknown protocol", }; -static const char * err_codes_machobj[] = { /* 11 */ +static const char * const err_codes_machobj[] = { /* 11 */ "(server/object system) GENERIC ERROR", "(server/object system) object not found", "(server/object system) no such operation", @@ -176,7 +176,7 @@ static const char * err_codes_machobj[] = { /* 11 */ "(server/object system) bad ipc message format", }; -static const char * err_codes_loader[] = { /* 12 */ +static const char * const err_codes_loader[] = { 
/* 12 */ "(server/loader) GENERIC ERROR", "(server/loader) object file not relocated", "(server/loader) unknown file type", @@ -186,7 +186,7 @@ static const char * err_codes_loader[] = { /* 12 */ }; -static const char * err_codes_exception[] = { /* 13 */ +static const char * const err_codes_exception[] = { /* 13 */ "(server/exception) GENERIC ERROR", "(server/exception) invalid access", "(server/exception) invalid instruction", @@ -196,7 +196,7 @@ static const char * err_codes_exception[] = { /* 13 */ "(server/exception) breakpoint exception", }; -static const char * err_codes_ux_signal[] = { /* 14 */ +static const char * const err_codes_ux_signal[] = { /* 14 */ "(server/unix-signal) GENERIC ERROR", "(server/unix-signal) hangup", "(server/unix-signal) interrupt", @@ -229,7 +229,7 @@ static const char * err_codes_ux_signal[] = { /* 14 */ "(server/unix-signal) user-defined signal 2", }; -static const char * err_codes_xkernel[] = { /* 15 */ +static const char * const err_codes_xkernel[] = { /* 15 */ "(server/xkernel) GENERIC ERROR", "(server/xkernel) map full", "(server/xkernel) inconsistent bind", diff --git a/libsyscall/mach/error_codes.c b/libsyscall/mach/error_codes.c index c87e18b8b..085f468dc 100644 --- a/libsyscall/mach/error_codes.c +++ b/libsyscall/mach/error_codes.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003 Apple Inc. All rights reserved. + * Copyright (c) 2003-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -72,7 +72,7 @@ #include "err_server.sub" #include "err_us.sub" -__private_extern__ struct error_system _mach_errors[err_max_system+1] = { +const struct error_system _mach_errors[err_max_system+1] = { /* 0; err_kern */ { errlib_count(err_os_sub), diff --git a/libsyscall/mach/errorlib.h b/libsyscall/mach/errorlib.h index 0ed4a27c8..931184372 100644 --- a/libsyscall/mach/errorlib.h +++ b/libsyscall/mach/errorlib.h @@ -91,9 +91,9 @@ struct error_sparse_map { #define err_code_map_entry(start, end) { err_get_code(start), err_get_code(end) } struct error_subsystem { - const char * subsys_name; + const char *subsys_name; int max_code; - const char * * codes; + const char * const *codes; const struct error_sparse_map *map_table; int map_count; }; @@ -101,15 +101,14 @@ struct error_subsystem { struct error_system { int max_sub; - const char * bad_sub; - const struct error_subsystem * subsystem; - const struct error_sparse_map * map_table; + const char *bad_sub; + const struct error_subsystem *subsystem; + const struct error_sparse_map *map_table; int map_count; }; #define errorlib_sub_null { NULL, 0, NULL, NULL, 0 } -__private_extern__ struct error_system _mach_errors[err_max_system+1]; - -__private_extern__ char *mach_error_string_int(mach_error_t, boolean_t *); +extern const struct error_system _mach_errors[err_max_system+1]; +char *mach_error_string_int(mach_error_t, boolean_t *); #define errlib_count(s) (sizeof(s)/sizeof(s[0])) diff --git a/libsyscall/mach/exc_catcher.c b/libsyscall/mach/exc_catcher.c index a85086519..9915eb2a0 100644 --- a/libsyscall/mach/exc_catcher.c +++ b/libsyscall/mach/exc_catcher.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1999-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -35,10 +35,12 @@ #include #include #include -#include -#include -__private_extern__ kern_return_t internal_catch_exception_raise ( +#include "abort.h" +#include "exc_catcher.h" + +__private_extern__ kern_return_t +internal_catch_exception_raise( mach_port_t exception_port, mach_port_t thread, mach_port_t task, @@ -47,21 +49,19 @@ __private_extern__ kern_return_t internal_catch_exception_raise ( mach_msg_type_number_t codeCnt) { #if defined(__DYNAMIC__) - static int checkForFunction = 0; - /* This will be non-zero if the user has defined this function */ - static kern_return_t (*func)(mach_port_t, mach_port_t, mach_port_t, exception_type_t, exception_data_t, mach_msg_type_number_t); - if (checkForFunction == 0) { - checkForFunction = 1; - func = dlsym(RTLD_DEFAULT, "catch_exception_raise"); - } - if (func == 0) { - /* The user hasn't defined catch_exception_raise in their binary */ - abort(); - } - return (*func)(exception_port, thread, task, exception, code, codeCnt); + static _libkernel_exc_raise_func_t exc_raise_func = (void*)-1; + + if (exc_raise_func == ((void*)-1)) { + exc_raise_func = _dlsym(RTLD_DEFAULT, "catch_exception_raise"); + } + if (exc_raise_func == 0) { + /* The user hasn't defined catch_exception_raise in their binary */ + abort(); + } + return (*exc_raise_func)(exception_port, thread, task, exception, code, codeCnt); #else - extern kern_return_t catch_exception_raise(mach_port_t, mach_port_t, mach_port_t, exception_type_t, exception_data_t, mach_msg_type_number_t); - return catch_exception_raise(exception_port, thread, task, exception, code, codeCnt); + extern kern_return_t catch_exception_raise(mach_port_t, mach_port_t, mach_port_t, exception_type_t, exception_data_t, mach_msg_type_number_t); + return catch_exception_raise(exception_port, thread, task, exception, code, codeCnt); #endif } diff --git a/libsyscall/mach/exc_catcher.h b/libsyscall/mach/exc_catcher.h new file mode 100644 index 
000000000..28aac2508 --- /dev/null +++ b/libsyscall/mach/exc_catcher.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef __EXC_CATCHER_H +#define __EXC_CATCHER_H + +#include "_libkernel_init.h" + +typedef kern_return_t (*_libkernel_exc_raise_func_t)(mach_port_t, + mach_port_t, + mach_port_t, + exception_type_t, + exception_data_t, + mach_msg_type_number_t); + +typedef kern_return_t (*_libkernel_exc_raise_state_func_t)(mach_port_t, + exception_type_t, + exception_data_t, + mach_msg_type_number_t, + int *, + thread_state_t, + mach_msg_type_number_t, + thread_state_t, + mach_msg_type_number_t *); + +typedef kern_return_t (*_libkernel_exec_raise_state_identity_t)(mach_port_t, + mach_port_t, mach_port_t, + exception_type_t, + exception_data_t, + mach_msg_type_number_t, + int *, thread_state_t, + mach_msg_type_number_t, + thread_state_t, + mach_msg_type_number_t *); + +#define RTLD_DEFAULT ((void *) -2) +extern void* (*_dlsym)(void*, const char*); + +#endif // __EXC_CATCHER_H diff --git a/libsyscall/mach/exc_catcher_state.c b/libsyscall/mach/exc_catcher_state.c index efcb5344c..c6674d56c 100644 --- a/libsyscall/mach/exc_catcher_state.c +++ b/libsyscall/mach/exc_catcher_state.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1999-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -36,9 +36,12 @@ #include #include #include -#include -__private_extern__ kern_return_t internal_catch_exception_raise_state ( +#include "abort.h" +#include "exc_catcher.h" + +__private_extern__ kern_return_t +internal_catch_exception_raise_state( mach_port_t exception_port, exception_type_t exception, exception_data_t code, @@ -50,21 +53,19 @@ __private_extern__ kern_return_t internal_catch_exception_raise_state ( mach_msg_type_number_t *new_stateCnt) { #if defined(__DYNAMIC__) - static int checkForFunction = 0; - /* This will be non-zero if the user has defined this function */ - static kern_return_t (*func)(mach_port_t, exception_type_t, exception_data_t, mach_msg_type_number_t, int *, thread_state_t, mach_msg_type_number_t, thread_state_t, mach_msg_type_number_t *); - if (checkForFunction == 0) { - checkForFunction = 1; - func = dlsym(RTLD_DEFAULT, "catch_exception_raise_state"); - } - if (func == 0) { - /* The user hasn't defined catch_exception_raise in their binary */ - abort(); - } - return (*func)(exception_port, exception, code, codeCnt, flavor, old_state, old_stateCnt, new_state, new_stateCnt); + static _libkernel_exc_raise_state_func_t exc_raise_state_func = (void*)-1; + + if (exc_raise_state_func == ((void*)-1)) { + exc_raise_state_func = _dlsym(RTLD_DEFAULT, "catch_exception_raise_state"); + } + if (exc_raise_state_func == 0) { + /* The user hasn't defined catch_exception_raise in their binary */ + abort(); + } + return (*exc_raise_state_func)(exception_port, exception, code, codeCnt, flavor, old_state, old_stateCnt, new_state, new_stateCnt); #else - extern kern_return_t catch_exception_raise_state(mach_port_t, exception_type_t, exception_data_t, mach_msg_type_number_t, int *, thread_state_t, mach_msg_type_number_t, thread_state_t, mach_msg_type_number_t *); - return catch_exception_raise_state(exception_port, exception, code, codeCnt, flavor, old_state, old_stateCnt, new_state, new_stateCnt); + extern 
kern_return_t catch_exception_raise_state(mach_port_t, exception_type_t, exception_data_t, mach_msg_type_number_t, int *, thread_state_t, mach_msg_type_number_t, thread_state_t, mach_msg_type_number_t *); + return catch_exception_raise_state(exception_port, exception, code, codeCnt, flavor, old_state, old_stateCnt, new_state, new_stateCnt); #endif } diff --git a/libsyscall/mach/exc_catcher_state_identity.c b/libsyscall/mach/exc_catcher_state_identity.c index 1e0c5c0df..b92f5892e 100644 --- a/libsyscall/mach/exc_catcher_state_identity.c +++ b/libsyscall/mach/exc_catcher_state_identity.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1999-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -36,9 +36,12 @@ #include #include #include -#include -__private_extern__ kern_return_t internal_catch_exception_raise_state_identity ( +#include "abort.h" +#include "exc_catcher.h" + +__private_extern__ kern_return_t +internal_catch_exception_raise_state_identity( mach_port_t exception_port, mach_port_t thread, mach_port_t task, @@ -52,21 +55,19 @@ __private_extern__ kern_return_t internal_catch_exception_raise_state_identity ( mach_msg_type_number_t *new_stateCnt) { #if defined(__DYNAMIC__) - static int checkForFunction = 0; - /* This will be non-zero if the user has defined this function */ - static kern_return_t (*func)(mach_port_t, mach_port_t, mach_port_t, exception_type_t, exception_data_t, mach_msg_type_number_t, int *, thread_state_t, mach_msg_type_number_t, thread_state_t, mach_msg_type_number_t *); - if (checkForFunction == 0) { - checkForFunction = 1; - func = dlsym(RTLD_DEFAULT, "catch_exception_raise_state_identity"); - } - if (func == 0) { - /* The user hasn't defined catch_exception_raise in their binary */ - abort(); - } - return (*func)(exception_port, thread, task, exception, code, codeCnt, flavor, old_state, old_stateCnt, new_state, new_stateCnt); + static 
_libkernel_exec_raise_state_identity_t exc_raise_state_identity_func = (void*)-1; + + if (exc_raise_state_identity_func == ((void*)-1)) { + exc_raise_state_identity_func = _dlsym(RTLD_DEFAULT, "catch_exception_raise_state_identity"); + } + if (exc_raise_state_identity_func == 0) { + /* The user hasn't defined catch_exception_raise in their binary */ + abort(); + } + return (*exc_raise_state_identity_func)(exception_port, thread, task, exception, code, codeCnt, flavor, old_state, old_stateCnt, new_state, new_stateCnt); #else - extern kern_return_t catch_exception_raise_state_identity(mach_port_t, mach_port_t, mach_port_t, exception_type_t, exception_data_t, mach_msg_type_number_t, int *, thread_state_t, mach_msg_type_number_t, thread_state_t, mach_msg_type_number_t *); - return catch_exception_raise_state_identity(exception_port, thread, task, exception, code, codeCnt, flavor, old_state, old_stateCnt, new_state, new_stateCnt); + extern kern_return_t catch_exception_raise_state_identity(mach_port_t, mach_port_t, mach_port_t, exception_type_t, exception_data_t, mach_msg_type_number_t, int *, thread_state_t, mach_msg_type_number_t, thread_state_t, mach_msg_type_number_t *); + return catch_exception_raise_state_identity(exception_port, thread, task, exception, code, codeCnt, flavor, old_state, old_stateCnt, new_state, new_stateCnt); #endif } diff --git a/libsyscall/mach/fprintf_stderr.c b/libsyscall/mach/fprintf_stderr.c index e89df1136..4d92bfc1c 100644 --- a/libsyscall/mach/fprintf_stderr.c +++ b/libsyscall/mach/fprintf_stderr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2003-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -32,30 +32,36 @@ #include #include -#include #include +#include "string.h" int (*vprintf_stderr_func)(const char *format, va_list ap); +#define __STDERR_FILENO 2 +int write(int fd, const char* cbuf, int nbyte); /* This function allows the writing of a mach error message to an * application-controllable output method, the default being to * use printf if no other method is specified by the application. * - * To override, set the global (static) function pointer vprintf_stderr to + * To override, set the global function pointer vprintf_stderr to * a function which takes the same parameters as vprintf. */ -int fprintf_stderr(const char *format, ...) +__private_extern__ int +fprintf_stderr(const char *format, ...) { - va_list args; + va_list args; int retval; va_start(args, format); - if (vprintf_stderr_func == NULL) - retval = vprintf(format, args); - else + if (vprintf_stderr_func == NULL) { + char buffer[1024]; + retval = _mach_vsnprintf(buffer, sizeof(buffer), format, args); + write(__STDERR_FILENO, buffer, retval); + } else { retval = (*vprintf_stderr_func)(format, args); + } va_end(args); return retval; diff --git a/libsyscall/mach/headers/Makefile.inc b/libsyscall/mach/headers/Makefile.inc deleted file mode 100644 index f747c3d76..000000000 --- a/libsyscall/mach/headers/Makefile.inc +++ /dev/null @@ -1,10 +0,0 @@ -MACH_INSTHDRS += mach.h \ - mach_error.h \ - mach_init.h \ - mach_interface.h \ - port_obj.h \ - sync.h \ - task.h \ - thread_act.h \ - vm_task.h -MACH_INSTHDRS := ${MACH_INSTHDRS:S/^/${.CURDIR}\/mach\/headers\//} diff --git a/libsyscall/mach/i386/Makefile.inc b/libsyscall/mach/i386/Makefile.inc deleted file mode 100644 index 4afb1ae3a..000000000 --- a/libsyscall/mach/i386/Makefile.inc +++ /dev/null @@ -1,3 +0,0 @@ -.PATH: ${.CURDIR}/mach/i386 - -MDSRCS += mach_absolute_time.S diff --git a/libsyscall/mach/headers/errorlib.h b/libsyscall/mach/mach/errorlib.h similarity index 94% rename from 
libsyscall/mach/headers/errorlib.h rename to libsyscall/mach/mach/errorlib.h index 0c3cc64c2..a5b6daf32 100644 --- a/libsyscall/mach/headers/errorlib.h +++ b/libsyscall/mach/mach/errorlib.h @@ -81,21 +81,21 @@ #define NO_SUCH_ERROR "unknown error code" struct error_subsystem { - char * subsys_name; + const char *subsys_name; int max_code; - char * * codes; + const char * const *codes; }; struct error_system { - int max_sub; - char * bad_sub; - struct error_subsystem * subsystem; + int max_sub; + const char *bad_sub; + const struct error_subsystem *subsystem; }; #include __BEGIN_DECLS -extern struct error_system errors[err_max_system+1]; +extern const struct error_system errors[err_max_system+1]; __END_DECLS #define errlib_count(s) (sizeof(s)/sizeof(s[0])) diff --git a/libsyscall/mach/headers/mach.h b/libsyscall/mach/mach/mach.h similarity index 100% rename from libsyscall/mach/headers/mach.h rename to libsyscall/mach/mach/mach.h diff --git a/libsyscall/mach/headers/mach_error.h b/libsyscall/mach/mach/mach_error.h similarity index 100% rename from libsyscall/mach/headers/mach_error.h rename to libsyscall/mach/mach/mach_error.h diff --git a/libsyscall/mach/headers/mach_init.h b/libsyscall/mach/mach/mach_init.h similarity index 95% rename from libsyscall/mach/headers/mach_init.h rename to libsyscall/mach/mach/mach_init.h index 36a47fac1..9816f1138 100644 --- a/libsyscall/mach/headers/mach_init.h +++ b/libsyscall/mach/mach/mach_init.h @@ -68,12 +68,11 @@ */ __BEGIN_DECLS -extern mach_port_t mach_task_self(void); extern mach_port_t mach_host_self(void); extern mach_port_t mach_thread_self(void); extern kern_return_t host_page_size(host_t, vm_size_t *); -extern mach_port_t mach_task_self_; +extern mach_port_t mach_task_self_; #define mach_task_self() mach_task_self_ #define current_task() mach_task_self() @@ -86,9 +85,6 @@ __BEGIN_DECLS */ extern mach_port_t bootstrap_port; -extern mach_port_t name_server_port; -extern mach_port_t environment_port; -extern mach_port_t 
service_port; /* * Where these ports occur in the "mach_ports_register" @@ -125,6 +121,7 @@ extern int vm_page_shift; * application to point to a user-specified output function */ extern int (*vprintf_stderr_func)(const char *format, va_list ap); + __END_DECLS #endif /* _MACH_INIT_ */ diff --git a/libsyscall/mach/headers/mach_interface.h b/libsyscall/mach/mach/mach_interface.h similarity index 100% rename from libsyscall/mach/headers/mach_interface.h rename to libsyscall/mach/mach/mach_interface.h diff --git a/libsyscall/mach/headers/port_obj.h b/libsyscall/mach/mach/port_obj.h similarity index 100% rename from libsyscall/mach/headers/port_obj.h rename to libsyscall/mach/mach/port_obj.h diff --git a/libsyscall/mach/headers/sync.h b/libsyscall/mach/mach/sync.h similarity index 100% rename from libsyscall/mach/headers/sync.h rename to libsyscall/mach/mach/sync.h diff --git a/libsyscall/mach/headers/task.h b/libsyscall/mach/mach/task.h similarity index 93% rename from libsyscall/mach/headers/task.h rename to libsyscall/mach/mach/task.h index a919ee664..6cef51794 100644 --- a/libsyscall/mach/headers/task.h +++ b/libsyscall/mach/mach/task.h @@ -29,10 +29,6 @@ #include #elif defined(__x86_64__) #include -#elif defined(__ppc__) -#include -#elif defined(__ppc64__) -#include #else #error unknown architecture #endif diff --git a/libsyscall/mach/headers/thread_act.h b/libsyscall/mach/mach/thread_act.h similarity index 92% rename from libsyscall/mach/headers/thread_act.h rename to libsyscall/mach/mach/thread_act.h index 2696b626d..b413f7a7e 100644 --- a/libsyscall/mach/headers/thread_act.h +++ b/libsyscall/mach/mach/thread_act.h @@ -29,10 +29,6 @@ #include #elif defined(__x86_64__) #include -#elif defined(__ppc__) -#include -#elif defined(__ppc64__) -#include #else #error unknown architecture #endif diff --git a/libsyscall/mach/headers/vm_task.h b/libsyscall/mach/mach/vm_task.h similarity index 100% rename from libsyscall/mach/headers/vm_task.h rename to 
libsyscall/mach/mach/vm_task.h diff --git a/libsyscall/mach/mach_error.c b/libsyscall/mach/mach_error.c index b87c0adbf..4b9542726 100644 --- a/libsyscall/mach/mach_error.c +++ b/libsyscall/mach/mach_error.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1999-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -60,26 +60,24 @@ * or returns a descriptive string. */ -#include #include #include #include "errorlib.h" +#include "string.h" int fprintf_stderr(const char *format, ...); void -mach_error( str, err ) - const char *str; - mach_error_t err; +mach_error(const char *str, mach_error_t err) { - char * err_str; + char *err_str; char buf[1024]; boolean_t diag; - err_str=mach_error_string_int(err, &diag); + err_str = mach_error_string_int(err, &diag); - if ( diag ) { - sprintf( buf, "%s %s (%x)", mach_error_type(err), err_str, err ); + if (diag) { + _mach_snprintf(buf, sizeof(buf), "%s %s (%x)", mach_error_type(err), err_str, err); err_str = buf; } diff --git a/libsyscall/mach/mach_error_string.c b/libsyscall/mach/mach_error_string.c index 9240629d9..82dc4da99 100644 --- a/libsyscall/mach/mach_error_string.c +++ b/libsyscall/mach/mach_error_string.c @@ -173,7 +173,6 @@ mach_error_string(mach_error_t err) boolean_t diag; return mach_error_string_int( err, &diag ); - } /* vim: set ts=4: */ diff --git a/libsyscall/mach/mach_init.c b/libsyscall/mach/mach_init.c index ce2eeed8c..c2702539e 100644 --- a/libsyscall/mach/mach_init.c +++ b/libsyscall/mach/mach_init.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1999-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -58,32 +58,26 @@ #include #include #include +#include #include "externs.h" +#include "mig_reply_port.h" mach_port_t mach_task_self_ = MACH_PORT_NULL; -mach_port_t mach_host_self_ = MACH_PORT_NULL; +#ifdef __i386__ +mach_port_t mach_host_self_ = MACH_PORT_NULL; +#endif -__private_extern__ kern_return_t _host_mach_msg_trap_return_; +vm_size_t vm_page_size = PAGE_SIZE; +vm_size_t vm_page_mask = PAGE_MASK; +int vm_page_shift = PAGE_SHIFT; -vm_size_t vm_page_size; -vm_size_t vm_page_mask; -int vm_page_shift; +int mach_init(void); +int _mach_fork_child(void); -/* - * Forward internal declarations for automatic mach_init during - * fork() implementation. - */ -/* fork() calls through atfork_child_routine */ -void (*_atfork_child_routine)(void); +static int mach_init_doit(bool forkchild); -static void mach_atfork_child_routine(void); -static boolean_t first = TRUE; -static void (*previous_atfork_child_routine)(void); -static boolean_t mach_init_inited = FALSE; -extern int mach_init(void); extern void _pthread_set_self(void *); extern void cthread_set_self(void *); -extern void __libc_init(void); /* Libc initialization routine */ kern_return_t host_page_size(__unused host_t host, vm_size_t *out_page_size) @@ -92,114 +86,74 @@ host_page_size(__unused host_t host, vm_size_t *out_page_size) return KERN_SUCCESS; } -static void mach_atfork_child_routine(void) +/* + * mach_init() must be called explicitly in static executables (including dyld). + * called by libSystem_initializer() in dynamic executables + */ +int +mach_init(void) { - /* - * If an (*_atfork_child_routine)() was registered when - * mach_init was first called, then call that routine - * prior to performing our re-initialization. This ensures - * that the post-fork handlers are called in exactly the - * same order as the crt0 (exec) handlers. Any library - * that makes use of the _atfork_child_routine must follow - * the same technique. 
- */ - if (previous_atfork_child_routine) { - (*previous_atfork_child_routine)(); + static bool mach_init_inited = false; + + if (mach_init_inited) { + return 0; } - mach_init_inited = FALSE; - mach_init(); + mach_init_inited = true; + + return mach_init_doit(false); } -mach_port_t -mach_host_self(void) +// called by libSystem_atfork_child() +int +_mach_fork_child(void) { - return(host_self_trap()); + return mach_init_doit(true); } -int mach_init_doit(int forkchild) +int +mach_init_doit(bool forkchild) { - host_t host; - /* * Get the important ports into the cached values, * as required by "mach_init.h". */ - mach_task_self_ = task_self_trap(); - host = host_self_trap(); - - - if (!forkchild) { - /* - * Set up the post-fork child handler in the libc stub - * to invoke this routine if this process forks. Save the - * previous value in order that we can call that handler - * prior to performing our postfork work. - */ - - first = FALSE; - previous_atfork_child_routine = _atfork_child_routine; - _atfork_child_routine = mach_atfork_child_routine; - _pthread_set_self(0); - cthread_set_self(0); - } - + /* * Initialize the single mig reply port */ - mig_init(0); - - /* - * Cache some other valuable system constants - */ - - (void)host_page_size(host, &vm_page_size); - vm_page_mask = vm_page_size - 1; - if (vm_page_size == 0) { - /* guard against unlikely craziness */ - vm_page_shift = 0; - } else { - /* - * Unfortunately there's no kernel interface to get the - * vm_page_shift, but it's easy enough to calculate. 
- */ - for (vm_page_shift = 0; - (vm_page_size & (1 << vm_page_shift)) == 0; - vm_page_shift++) - continue; - } - - mach_port_deallocate(mach_task_self_, host); - - mach_init_ports(); + _pthread_set_self(0); + _mig_init(0); #if WE_REALLY_NEED_THIS_GDB_HACK /* * Check to see if GDB wants us to stop */ { - task_user_data_data_t user_data; - mach_msg_type_number_t user_data_count = TASK_USER_DATA_COUNT; + task_user_data_data_t user_data; + mach_msg_type_number_t user_data_count = TASK_USER_DATA_COUNT; user_data.user_data = 0; (void)task_info(mach_task_self_, TASK_USER_DATA, (task_info_t)&user_data, &user_data_count); #define MACH_GDB_RUN_MAGIC_NUMBER 1 #ifdef MACH_GDB_RUN_MAGIC_NUMBER - /* This magic number is set in mach-aware gdb - * for RUN command to allow us to suspend user's - * executable (linked with this libmach!) - * with the code below. - * This hack should disappear when gdb improves. - */ + /* This magic number is set in mach-aware gdb + * for RUN command to allow us to suspend user's + * executable (linked with this libmach!) + * with the code below. + * This hack should disappear when gdb improves. + */ if ((int)user_data.user_data == MACH_GDB_RUN_MAGIC_NUMBER) { kern_return_t ret; user_data.user_data = 0; - ret = task_suspend (mach_task_self_); + ret = task_suspend(mach_task_self_); if (ret != KERN_SUCCESS) { - while(1) (void)task_terminate(mach_task_self_); + while (1) { + (void)task_terminate(mach_task_self_); + } } } #undef MACH_GDB_RUN_MAGIC_NUMBER @@ -207,62 +161,5 @@ int mach_init_doit(int forkchild) } #endif /* WE_REALLY_NEED_THIS_GDB_HACK */ - /* - * Reserve page 0 so that the program doesn't get it as - * the result of a vm_allocate() or whatever. 
- */ - { - vm_offset_t zero_page_start; - - zero_page_start = 0; - (void)vm_map(mach_task_self_, &zero_page_start, vm_page_size, - 0, FALSE, MEMORY_OBJECT_NULL, 0, TRUE, - VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_COPY); - /* ignore result, we don't care if it failed */ - } - - return(0); -} - - - - -/* - * mach_init() is called explicitly in static executables (including dyld) - * It is called implicitly by libSystem_initializer() in dynamic executables - */ -int mach_init(void) -{ - int ret; - - if (mach_init_inited) - return(0); - mach_init_inited = TRUE; - ret = mach_init_doit(0); - - return ret; -} - - - - -/* called by _cthread_fork_child() */ -int fork_mach_init(void) -{ - /* called only from child */ - return(mach_init_doit(1)); -} - -#undef mach_task_self - -mach_port_t -mach_task_self(void) -{ - return(task_self_trap()); -} - -mach_port_t -mach_thread_self(void) -{ - return(thread_self_trap()); + return 0; } diff --git a/libsyscall/mach/mach_init_libSystem.c b/libsyscall/mach/mach_init_libSystem.c deleted file mode 100644 index 86ca46aca..000000000 --- a/libsyscall/mach/mach_init_libSystem.c +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2007, 2008 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#ifdef __DYNAMIC__ -struct ProgramVars; /* forward reference */ - -extern void pthread_init(void); // from libc.a -extern void __libc_init(const struct ProgramVars* vars); // from libc.a -extern void __keymgr_initializer(void); // from libkeymgr.a -extern void _dyld_initializer(void); // from libdyld.a -extern void libdispatch_init(void); // from libdispatch.a - -/* - * libsyscall_initializer() initializes all of libSystem.dylib - */ -static __attribute__((constructor)) -void libSystem_initializer(int argc, const char* argv[], const char* envp[], const char* apple[], const struct ProgramVars* vars) -{ - mach_init(); - pthread_init(); - __libc_init(vars); - __keymgr_initializer(); - _dyld_initializer(); - libdispatch_init(); -} - -/* - * Old crt1.o glue used to call through mach_init_routine which was used to initialize libSystem. - * LibSystem now auto-initializes but mach_init_routine is left for binary compatibility. - */ -static void mach_init_old() {} -void (*mach_init_routine)(void) = &mach_init_old; - -#endif /* __DYNAMIC__ */ diff --git a/libsyscall/mach/mach_init_ports.c b/libsyscall/mach/mach_init_ports.c deleted file mode 100644 index fcb6d2227..000000000 --- a/libsyscall/mach/mach_init_ports.c +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (c) 2003-2004 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. 
CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ - -#include -#include -#include "externs.h" - -mach_port_t bootstrap_port = MACH_PORT_NULL; -mach_port_t name_server_port = MACH_PORT_NULL; -mach_port_t environment_port = MACH_PORT_NULL; -mach_port_t service_port = MACH_PORT_NULL; -semaphore_t clock_sem = MACH_PORT_NULL; -mach_port_t clock_port = MACH_PORT_NULL; -mach_port_t thread_recycle_port = MACH_PORT_NULL; - -void -mach_init_ports(void) -{ - mach_port_array_t ports; - mach_msg_type_number_t ports_count; - kern_return_t kr; - host_t host; - - /* - * Find those ports important to every task. - */ - kr = task_get_special_port(mach_task_self(), - TASK_BOOTSTRAP_PORT, - &bootstrap_port); - if (kr != KERN_SUCCESS) - return; - - /* Get the clock service port for nanosleep */ - host = mach_host_self(); - kr = host_get_clock_service(host, SYSTEM_CLOCK, &clock_port); - if (kr != KERN_SUCCESS) { - abort(); - } - kr = semaphore_create(mach_task_self(), &clock_sem, SYNC_POLICY_FIFO, 0); - if (kr != KERN_SUCCESS) { - abort(); - } - mach_port_deallocate(mach_task_self(), host); - kr = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &thread_recycle_port); - if (kr != KERN_SUCCESS) { - abort(); - } - - /* - * Find the options service ports. - * XXX - Don't need these on Darwin, should go away. 
- */ - kr = mach_ports_lookup(mach_task_self(), &ports, - &ports_count); - if (kr == KERN_SUCCESS) { - if (ports_count >= MACH_PORTS_SLOTS_USED) { - name_server_port = ports[NAME_SERVER_SLOT]; - environment_port = ports[ENVIRONMENT_SLOT]; - service_port = ports[SERVICE_SLOT]; - } - - /* get rid of out-of-line data */ - (void) vm_deallocate(mach_task_self(), - (vm_offset_t) ports, - (vm_size_t) (ports_count * sizeof *ports)); - } -} - -#ifdef notdef -/* will have problems with dylib build --> not needed anyway */ -#ifndef lint -/* - * Routines which our library must suck in, to avoid - * a later library from referencing them and getting - * the wrong version. - */ -extern void _replacements(void); - -void -_replacements(void) -{ - (void)sbrk(0); /* Pull in our sbrk/brk */ - (void)malloc(0); /* Pull in our malloc package */ -} -#endif /* lint */ -#endif /* notdef */ diff --git a/pexpert/ppc/pe_bootargs.c b/libsyscall/mach/mach_legacy.c similarity index 80% rename from pexpert/ppc/pe_bootargs.c rename to libsyscall/mach/mach_legacy.c index a0d2b2a08..f425d78f8 100644 --- a/pexpert/ppc/pe_bootargs.c +++ b/libsyscall/mach/mach_legacy.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,12 +25,27 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#include -#include -char * -PE_boot_args( - void) +#include + +#undef mach_host_self + +mach_port_t +mach_host_self(void) +{ + return host_self_trap(); +} + +#undef mach_task_self + +mach_port_t +mach_task_self(void) +{ + return task_self_trap(); +} + +mach_port_t +mach_thread_self(void) { - return((char *)((boot_args*)PE_state.bootArgs)->CommandLine); + return thread_self_trap(); } diff --git a/libsyscall/mach/mach_msg.c b/libsyscall/mach/mach_msg.c index 644313d6b..d8b094119 100644 --- a/libsyscall/mach/mach_msg.c +++ b/libsyscall/mach/mach_msg.c @@ -210,28 +210,28 @@ mach_msg_destroy_port(mach_port_t port, mach_msg_type_name_t type) case MACH_MSG_TYPE_MOVE_SEND: case MACH_MSG_TYPE_MOVE_SEND_ONCE: /* destroy the send/send-once right */ - (void) mach_port_deallocate(mach_task_self(), port); + (void) mach_port_deallocate(mach_task_self_, port); break; case MACH_MSG_TYPE_MOVE_RECEIVE: /* destroy the receive right */ - (void) mach_port_mod_refs(mach_task_self(), port, + (void) mach_port_mod_refs(mach_task_self_, port, MACH_PORT_RIGHT_RECEIVE, -1); break; case MACH_MSG_TYPE_MAKE_SEND: /* create a send right and then destroy it */ - (void) mach_port_insert_right(mach_task_self(), port, + (void) mach_port_insert_right(mach_task_self_, port, port, MACH_MSG_TYPE_MAKE_SEND); - (void) mach_port_deallocate(mach_task_self(), port); + (void) mach_port_deallocate(mach_task_self_, port); break; case MACH_MSG_TYPE_MAKE_SEND_ONCE: /* create a send-once right and then destroy it */ - (void) mach_port_extract_right(mach_task_self(), port, + (void) mach_port_extract_right(mach_task_self_, port, MACH_MSG_TYPE_MAKE_SEND_ONCE, &port, &type); - (void) mach_port_deallocate(mach_task_self(), port); + (void) mach_port_deallocate(mach_task_self_, port); break; } } @@ -240,7 +240,7 @@ static void mach_msg_destroy_memory(vm_offset_t addr, vm_size_t size) { if (size != 0) - (void) 
vm_deallocate(mach_task_self(), addr, size); + (void) vm_deallocate(mach_task_self_, addr, size); } @@ -273,43 +273,56 @@ mach_msg_destroy(mach_msg_header_t *msg) mach_msg_destroy_port(msg->msgh_remote_port, MACH_MSGH_BITS_REMOTE(mbits)); if (mbits & MACH_MSGH_BITS_COMPLEX) { - mach_msg_body_t *body; - mach_msg_descriptor_t *saddr, *eaddr; + mach_msg_base_t *base; + mach_msg_type_number_t count, i; + mach_msg_descriptor_t *daddr; - body = (mach_msg_body_t *) (msg + 1); - saddr = (mach_msg_descriptor_t *) - ((mach_msg_base_t *) msg + 1); - eaddr = saddr + body->msgh_descriptor_count; + base = (mach_msg_base_t *) msg; + count = base->body.msgh_descriptor_count; - for ( ; saddr < eaddr; saddr++) { - switch (saddr->type.type) { + daddr = (mach_msg_descriptor_t *) (base + 1); + for (i = 0; i < count; i++) { + + switch (daddr->type.type) { - case MACH_MSG_PORT_DESCRIPTOR: { + case MACH_MSG_PORT_DESCRIPTOR: { mach_msg_port_descriptor_t *dsc; /* * Destroy port rights carried in the message */ - dsc = &saddr->port; - mach_msg_destroy_port(dsc->name, dsc->disposition); + dsc = &daddr->port; + mach_msg_destroy_port(dsc->name, dsc->disposition); + daddr = (mach_msg_descriptor_t *)(dsc + 1); break; - } + } - case MACH_MSG_OOL_DESCRIPTOR : { + case MACH_MSG_OOL_DESCRIPTOR: { mach_msg_ool_descriptor_t *dsc; /* * Destroy memory carried in the message */ - dsc = &saddr->out_of_line; + dsc = &daddr->out_of_line; if (dsc->deallocate) { mach_msg_destroy_memory((vm_offset_t)dsc->address, dsc->size); } + daddr = (mach_msg_descriptor_t *)(dsc + 1); break; - } + } - case MACH_MSG_OOL_PORTS_DESCRIPTOR : { + case MACH_MSG_OOL_VOLATILE_DESCRIPTOR: { + mach_msg_ool_descriptor_t *dsc; + + /* + * Just skip it. 
+ */ + daddr = (mach_msg_descriptor_t *)(dsc + 1); + break; + } + + case MACH_MSG_OOL_PORTS_DESCRIPTOR: { mach_port_t *ports; mach_msg_ool_ports_descriptor_t *dsc; mach_msg_type_number_t j; @@ -317,7 +330,7 @@ mach_msg_destroy(mach_msg_header_t *msg) /* * Destroy port rights carried in the message */ - dsc = &saddr->ool_ports; + dsc = &daddr->ool_ports; ports = (mach_port_t *) dsc->address; for (j = 0; j < dsc->count; j++, ports++) { mach_msg_destroy_port(*ports, dsc->disposition); @@ -330,8 +343,9 @@ mach_msg_destroy(mach_msg_header_t *msg) mach_msg_destroy_memory((vm_offset_t)dsc->address, dsc->count * sizeof(mach_port_t)); } + daddr = (mach_msg_descriptor_t *)(dsc + 1); break; - } + } } } } @@ -362,7 +376,7 @@ mach_msg_server_once( mach_msg_size_t reply_alloc; mach_msg_return_t mr; kern_return_t kr; - mach_port_t self = mach_task_self(); + mach_port_t self = mach_task_self_; options &= ~(MACH_SEND_MSG|MACH_RCV_MSG); @@ -487,7 +501,7 @@ mach_msg_server( mach_msg_size_t reply_alloc; mach_msg_return_t mr; kern_return_t kr; - mach_port_t self = mach_task_self(); + mach_port_t self = mach_task_self_; options &= ~(MACH_SEND_MSG|MACH_RCV_MSG|MACH_RCV_OVERWRITE); diff --git a/libsyscall/mach/mig_allocate.c b/libsyscall/mach/mig_allocate.c index 14b8a2933..ed1288662 100644 --- a/libsyscall/mach/mig_allocate.c +++ b/libsyscall/mach/mig_allocate.c @@ -59,7 +59,7 @@ void mig_allocate(vm_address_t *addr_p, vm_size_t size) { - if (vm_allocate(mach_task_self(), + if (vm_allocate(mach_task_self_, addr_p, size, VM_MAKE_TAG(VM_MEMORY_MACH_MSG)|TRUE) diff --git a/libsyscall/mach/mig_deallocate.c b/libsyscall/mach/mig_deallocate.c index bbcf15e6b..2b58e2c70 100644 --- a/libsyscall/mach/mig_deallocate.c +++ b/libsyscall/mach/mig_deallocate.c @@ -59,7 +59,7 @@ void mig_deallocate(vm_address_t addr, vm_size_t size) { - (void) vm_deallocate(mach_task_self(), + (void) vm_deallocate(mach_task_self_, addr, size); } diff --git a/libsyscall/mach/mig_reply_port.c 
b/libsyscall/mach/mig_reply_port.c new file mode 100644 index 000000000..aa2890ac6 --- /dev/null +++ b/libsyscall/mach/mig_reply_port.c @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include + +//extern mach_port_t _pthread_reply_port(pthread_t); +static mach_port_t _task_reply_port = MACH_PORT_NULL; + +extern mach_port_t _mig_get_reply_port(void); +extern void _mig_set_reply_port(mach_port_t port); + +/* + * Called by mach_init with 0 before cthread_init is + * called and again with 1 at the end of cthread_init. 
+ */ +void +_mig_init(int init_done) +{ + if (init_done == 0) { + _task_reply_port = mach_reply_port(); + } +} + +/* + * Called by mig interface code whenever a reply port is needed. + * Tracing is masked during this call; otherwise, a call to printf() + * can result in a call to malloc() which eventually reenters + * mig_get_reply_port() and deadlocks. + */ +mach_port_t +mig_get_reply_port(void) +{ + register mach_port_t port = _mig_get_reply_port(); + if (port == MACH_PORT_NULL) { + port = mach_reply_port(); + _mig_set_reply_port(port); + } + return port; +} + +/* + * Called by mig interface code after a timeout on the reply port. + * May also be called by user. The new mig calls with port passed in. + */ +void +mig_dealloc_reply_port(mach_port_t migport) +{ + register mach_port_t port; + + port = _mig_get_reply_port(); + if (port != MACH_PORT_NULL && port != _task_reply_port) { + _mig_set_reply_port(_task_reply_port); + (void) mach_port_mod_refs(mach_task_self(), port, MACH_PORT_RIGHT_RECEIVE, -1); + if (migport != port) { + (void) mach_port_deallocate(mach_task_self(), migport); + } + _mig_set_reply_port(MACH_PORT_NULL); + } +} + +/************************************************************* + * Called by mig interfaces after each RPC. + * Could be called by user. + ***********************************************************/ + +void +mig_put_reply_port(mach_port_t reply_port) +{ +} diff --git a/osfmk/mach/ppc/rpc.h b/libsyscall/mach/mig_reply_port.h similarity index 85% rename from osfmk/mach/ppc/rpc.h rename to libsyscall/mach/mig_reply_port.h index b3a274a2d..54e27879c 100644 --- a/osfmk/mach/ppc/rpc.h +++ b/libsyscall/mach/mig_reply_port.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002,2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,11 +25,10 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - * @OSF_COPYRIGHT@ - */ -#ifndef _MACH_PPC_RPC_H_ -#define _MACH_PPC_RPC_H_ +#include + +void _mig_fork_child(void); +void _mig_init(int init_done); -#endif /* _MACH_PPC_RPC_H_ */ +void _mig_reply_port_callbacks(mach_port_t (*get)(void), void (*set)(mach_port_t)); diff --git a/libsyscall/mach/mig_strncpy.c b/libsyscall/mach/mig_strncpy.c index 4366563fa..ed17aaff2 100644 --- a/libsyscall/mach/mig_strncpy.c +++ b/libsyscall/mach/mig_strncpy.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1999-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -73,18 +73,21 @@ int mig_strncpy( - register char *dest, - register const char *src, - register int len) + char *dest, + const char *src, + int len) { - register int i; + int i; - if (len <= 0) - return 0; + if (len <= 0) { + return 0; + } - for (i=1; i #include -#include -#include + +#include "abort.h" +#include "string.h" + +int write(int fd, const char* cbuf, int nbyte); static mach_port_t master_host_port; @@ -72,14 +75,10 @@ panic_init(mach_port_t port) void panic(const char *s, ...) 
{ - va_list listp; - - printf("panic: "); - va_start(listp, s); - vprintf(s, listp); - va_end(listp); - printf("\n"); - + char buffer[1024]; + int len = _mach_snprintf(buffer, sizeof(buffer), "panic: %s\n", s); + write(__STDERR_FILENO, buffer, len+1); + #define RB_DEBUGGER 0x1000 /* enter debugger NOW */ (void) host_reboot(master_host_port, RB_DEBUGGER); diff --git a/libsyscall/mach/port_obj.c b/libsyscall/mach/port_obj.c index 1951d1ce5..b23054a15 100644 --- a/libsyscall/mach/port_obj.c +++ b/libsyscall/mach/port_obj.c @@ -47,7 +47,7 @@ void port_obj_init( { kern_return_t kr; - kr = vm_allocate(mach_task_self(), + kr = vm_allocate(mach_task_self_, (vm_offset_t *)&port_obj_table, (vm_size_t)(maxsize * sizeof (*port_obj_table)), TRUE); diff --git a/libsyscall/mach/ppc/Makefile.inc b/libsyscall/mach/ppc/Makefile.inc deleted file mode 100644 index faa3b19e8..000000000 --- a/libsyscall/mach/ppc/Makefile.inc +++ /dev/null @@ -1,3 +0,0 @@ -.PATH: ${.CURDIR}/mach/ppc - -MDSRCS += mach_absolute_time.s diff --git a/libsyscall/mach/ppc64/Makefile.inc b/libsyscall/mach/ppc64/Makefile.inc deleted file mode 100644 index 302f57141..000000000 --- a/libsyscall/mach/ppc64/Makefile.inc +++ /dev/null @@ -1,4 +0,0 @@ -# searching ppc directory as a fallback to avoid unnecessary code duplication -.PATH: ${.CURDIR}/mach/ppc - -MDSRCS += mach_absolute_time.s diff --git a/libsyscall/mach/sbrk.c b/libsyscall/mach/sbrk.c deleted file mode 100644 index 702534a1b..000000000 --- a/libsyscall/mach/sbrk.c +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * File: sbrk.c - * - * Unix compatibility for sbrk system call. - * - * HISTORY - * 09-Mar-90 Gregg Kellogg (gk) at NeXT. - * include instead of - * - * 14-Feb-89 Avadis Tevanian (avie) at NeXT. - * Total rewrite using a fixed area of VM from break region. - */ - -#include /* for vm_allocate, vm_offset_t */ -#include - -static int sbrk_needs_init = TRUE; -static vm_size_t sbrk_region_size = 4*1024*1024; /* Well, what should it be? */ -static vm_address_t sbrk_curbrk; - -void *sbrk(size) - int size; -{ - kern_return_t ret; - - if (sbrk_needs_init) { - sbrk_needs_init = FALSE; - /* - * Allocate a big region to simulate break region. 
- */ - ret = vm_allocate(mach_task_self(), &sbrk_curbrk, sbrk_region_size, - VM_MAKE_TAG(VM_MEMORY_SBRK)|TRUE); - if (ret != KERN_SUCCESS) - return((void *)-1); - } - - if (size <= 0) - return((void *)sbrk_curbrk); - else if (size > sbrk_region_size) - return((void *)-1); - sbrk_curbrk += size; - sbrk_region_size -= size; - return((void *)(sbrk_curbrk - size)); -} - -void *brk(x) - void *x; -{ - return((void *)-1); -} - diff --git a/libsyscall/mach/servers/Makefile.inc b/libsyscall/mach/servers/Makefile.inc deleted file mode 100644 index 848379a88..000000000 --- a/libsyscall/mach/servers/Makefile.inc +++ /dev/null @@ -1,16 +0,0 @@ -.PATH: ${.CURDIR}/${MACHINE_ARCH}/mach/servers ${.CURDIR}/mach/servers - -SRVMIGDEFS += netname.defs - -SRVMIGHDRS = ${SRVMIGDEFS:S/.defs$/.h/} -#SRVMIGHDRS = ${SRVMIGDEFS:S/.defs$/.h/:S/^/${.CURDIR}\/mach\/servers\//} -SRVMIGSRCS = ${SRVMIGDEFS:S/.defs$/User.c/} - -SRVHDRS = netname_defs.h key_defs.h nm_defs.h ls_defs.h -SRVHDRS := ${SRVHDRS:S/^/${.CURDIR}\/mach\/servers\//} -SRVHDRS += ${SRVMIGHDRS} - -MISRCS+= ${SRVMIGDEFS:S/.defs$/User.defs/} - -CLEANFILES += ${SRVMIGHDRS} ${SRVMIGHDRS:S/.h$/User.c/} \ - ${SRVMIGHDRS:S/.h$/Server.c/} diff --git a/libsyscall/mach/slot_name.c b/libsyscall/mach/slot_name.c index a059c1c59..fa733527c 100644 --- a/libsyscall/mach/slot_name.c +++ b/libsyscall/mach/slot_name.c @@ -46,26 +46,6 @@ #include #include -/* - * Convert the specified cpu_type/cpu_subtype pair to their - * human readable form. 
- */ -void slot_name(cpu_type, cpu_subtype, cpu_name, cpu_subname) - cpu_type_t cpu_type; - cpu_subtype_t cpu_subtype; - char **cpu_name, **cpu_subname; -{ - register char *name = "Unknown CPU"; - register char *subname = ""; - const NXArchInfo *ai = NXGetArchInfoFromCpuType(cpu_type, cpu_subtype); - if (ai != NULL) { - name = (char *)ai->name; - subname = (char *)ai->description; - } - *cpu_name = name; - *cpu_subname = subname; -} - kern_return_t msg_rpc(void) { return KERN_FAILURE; } diff --git a/libsyscall/mach/string.c b/libsyscall/mach/string.c new file mode 100644 index 000000000..000a0f88f --- /dev/null +++ b/libsyscall/mach/string.c @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
/*
 * libsyscall/mach/string.c
 *
 * Minimal, allocation-free string formatting for code that must not
 * depend on Libc.  Only the "%s" and "%x" conversions are understood;
 * there are no width, padding or length modifiers.
 */

#include "string.h"

static const char hex[] = "0123456789abcdef";

/*
 * Length of a NUL-terminated string, not counting the terminator.
 * A null pointer is treated as the empty string.
 */
static int
_mach_strlen(const char *str)
{
	const char *p;

	if (!str) {
		return 0;
	}
	for (p = str; *p != '\0'; p++) {
		continue;
	}
	return (int)(p - str);
}

/*
 * Append the lowercase hexadecimal form of `n' (no "0x" prefix, no
 * leading zeros; zero is written as "0") at *buffer.
 *
 * At most *length characters are written.  *buffer is advanced and
 * *length is reduced by the number of characters actually written.
 */
static void
_mach_hex(char **buffer, int *length, unsigned long long n)
{
	char digits[32];
	char *end = digits + sizeof(digits);
	char *cp = end;

	/* Build the digits right-to-left; do/while so that 0 yields "0". */
	do {
		*--cp = hex[n & 0xf];
		n >>= 4;
	} while (n != 0);

	/* Bound the copy by the remaining space, not by the pointer itself. */
	while (cp < end && *length > 0) {
		*(*buffer)++ = *cp++;
		(*length)--;
	}
}

/*
 * Format `fmt' into `buffer', writing at most `length' bytes including
 * the terminating NUL.  Output that does not fit is silently truncated.
 *
 * Supported conversions:
 *   %s  NUL-terminated string (a null pointer prints nothing)
 *   %x  unsigned int, lowercase hexadecimal
 * Any other character following '%' is consumed and produces no output
 * (no argument is fetched for it).
 *
 * Returns the number of bytes stored in `buffer' INCLUDING the
 * terminating NUL, i.e. one more than the resulting string length.
 * When `length' is not positive nothing is stored and 0 is returned.
 */
int
_mach_vsnprintf(char *buffer, int length, const char *fmt, va_list ap)
{
	int width, max = length;
	char *out_ptr = buffer;

	/* No room even for the terminator: do not touch the buffer. */
	if (length <= 0) {
		return 0;
	}

	// we only ever write n-1 bytes so we can put a \0 at the end
	length--;
	while (length > 0 && *fmt != '\0') {
		if (*fmt != '%') {
			*(out_ptr++) = *(fmt++);
			length--;
			continue;
		}
		fmt++;
		if (*fmt == '\0') {
			/* lone '%' at the end: stop before stepping past the terminator */
			break;
		}
		// only going to support a specific subset of sprintf flags
		// namely %s, %x, with no padding modifiers
		switch (*fmt++) {
		case 's':
		{
			const char *cp = va_arg(ap, char *);
			width = _mach_strlen(cp);
			while (width > 0 && length > 0) {
				*(out_ptr++) = *(cp++);
				width--;
				length--;
			}
			break;
		}
		case 'x':
		{
			_mach_hex(&out_ptr, &length, va_arg(ap, unsigned int));
			break;
		}
		default:
			/* unsupported conversion: dropped */
			break;
		}
	}
	*out_ptr = '\0';
	return max - length;
}

/*
 * Variadic front end to _mach_vsnprintf(); same conversions, same
 * truncation behaviour and same return value.
 */
int
_mach_snprintf(char *buffer, int length, const char *fmt, ...)
{
	int ret;
	va_list ap;

	va_start(ap, fmt);
	ret = _mach_vsnprintf(buffer, length, fmt, ap);
	va_end(ap);
	return ret;
}
All rights reserved. + * Copyright (c) 2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,18 +25,37 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - * @OSF_COPYRIGHT@ - */ -#ifndef _MACH_PPC_THREAD_STATE_H_ -#define _MACH_PPC_THREAD_STATE_H_ +#ifndef _STRING_H_ +#define _STRING_H_ -/* Size of maximum exported thread state in words */ -#define PPC_THREAD_STATE_MAX (144) /* Size of biggest state possible */ +#include +#include <_types.h> -#if defined (__ppc__) || defined (__ppc64__) -#define THREAD_STATE_MAX PPC_THREAD_STATE_MAX +#ifndef SIZE_T +#define SIZE_T +typedef __darwin_size_t size_t; #endif -#endif /* _MACH_PPC_THREAD_STATE_H_ */ +#ifndef NULL +#define NULL __DARWIN_NULL +#endif + +#ifndef _UINTPTR_T +#define _UINTPTR_T +typedef unsigned long uintptr_t; +#endif /* _UINTPTR_T */ + +// We're purposefully called "string.h" in order to superceed any use +// of Libc's string.h (which no one should be using bar MIG) in order +// to override their use of memcpy. + +int _mach_snprintf(char *buffer, int length, const char *fmt, ...); +int _mach_vsnprintf(char *buffer, int length, const char *fmt, va_list ap); + +// Actually in memcpy.c but MIG likes to include string.h + +void *memcpy(void *dst0, const void *src0, size_t length); +int memcmp(const void *s1, const void *s2, size_t n); + +#endif /* _STRING_H_ */ diff --git a/libsyscall/mach/x86_64/Makefile.inc b/libsyscall/mach/x86_64/Makefile.inc deleted file mode 100644 index 475e5a5b8..000000000 --- a/libsyscall/mach/x86_64/Makefile.inc +++ /dev/null @@ -1,3 +0,0 @@ -.PATH: ${.CURDIR}/mach/x86_64 - -MDSRCS += mach_absolute_time.S diff --git a/libsyscall/wrappers/__get_cpu_capabilities.s b/libsyscall/wrappers/__get_cpu_capabilities.s new file mode 100644 index 000000000..12e9c7652 --- /dev/null +++ b/libsyscall/wrappers/__get_cpu_capabilities.s @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. 
+ * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +/* Get the cpu_capabilities bit vector out of the comm page */ + +#define __APPLE_API_PRIVATE +#include +#undef __APPLE_API_PRIVATE + +#if defined(__x86_64__) + + .text + .align 2, 0x90 + .globl __get_cpu_capabilities +__get_cpu_capabilities: + movq $(_COMM_PAGE_CPU_CAPABILITIES), %rax + movl (%rax), %eax + ret + +#elif defined(__i386__) + + .text + .align 2, 0x90 + .globl __get_cpu_capabilities +__get_cpu_capabilities: + movl _COMM_PAGE_CPU_CAPABILITIES, %eax + ret + +#endif diff --git a/osfmk/x86_64/genassym.c b/libsyscall/wrappers/_errno.h similarity index 88% rename from osfmk/x86_64/genassym.c rename to libsyscall/wrappers/_errno.h index 2fc719cff..0c3c2da96 100644 --- a/osfmk/x86_64/genassym.c +++ b/libsyscall/wrappers/_errno.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -26,4 +26,8 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#include "../i386/genassym.c" +#include + +extern int* (*_libc_get_errno)(void); +#undef errno +#define errno (*_libc_get_errno()) diff --git a/osfmk/ppc/testjump.c b/libsyscall/wrappers/_libc_funcptr.c similarity index 55% rename from osfmk/ppc/testjump.c rename to libsyscall/wrappers/_libc_funcptr.c index be2ae5afa..60fd52142 100644 --- a/osfmk/ppc/testjump.c +++ b/libsyscall/wrappers/_libc_funcptr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,56 +25,48 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - * @OSF_FREE_COPYRIGHT@ - */ -#define VERBOSE 0 +#include +#include + +extern void (*_libc_set_errno)(int); -#include +static mach_port_t (*_libc_get_reply_port)(void); +static void (*_libc_set_reply_port)(mach_port_t); -int recursed(jmp_buf_t *bufp, int retval, int depth) +/* + * Called at Libsystem initialise time, sets up callbacks we + * need to get at thread variables inside of Libc + */ +void +_mig_reply_port_callbacks(mach_port_t (*get)(void), void (*set)(mach_port_t)) { - int mumbojumbo[16]; - int i; - -#if VERBOSE - for (i=0;i -#include -#include -#include -#include -#include -#include +#include "_libkernel_init.h" +#include "mig_reply_port.h" -#include "hfs.h" -#include "hfs_cnode.h" +void (*_libc_set_errno)(int) __attribute__((visibility("hidden"))); +int* (*_libc_get_errno)(void) __attribute__((visibility("hidden"))); -int cp_key_store_action(int action __unused) -{ - return ENOTSUP; -} +/* dlsym() funcptr is for legacy support in exc_catcher */ +void* (*_dlsym)(void*, const char*) __attribute__((visibility("hidden"))); - -int cp_register_wraps(cp_wrap_func_t key_store_func __unused) +void +_libkernel_init(_libkernel_functions_t fns) { - return ENOTSUP; + /* libc */ + _libc_set_errno = 
fns.set_errno; + _libc_get_errno = fns.get_errno; + + /* mach */ + _mig_reply_port_callbacks(fns.get_reply_port, fns.set_reply_port); + + /* dlsym */ + _dlsym = fns.dlsym; } - diff --git a/bsd/ppc/disklabel.h b/libsyscall/wrappers/_libkernel_init.h similarity index 65% rename from bsd/ppc/disklabel.h rename to libsyscall/wrappers/_libkernel_init.h index f7cfe1155..609975abd 100644 --- a/bsd/ppc/disklabel.h +++ b/libsyscall/wrappers/_libkernel_init.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,24 +25,30 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ -#ifndef _BSD_PPC_DISKLABEL_H_ -#define _BSD_PPC_DISKLABEL_H_ +#ifndef __LIBKERNEL_INIT_H +#define __LIBKERNEL_INIT_H -#include +#include -#ifdef __APPLE_API_OBSOLETE -#define LABELSECTOR (1024 / DEV_BSIZE) /* sector containing label */ -#define LABELOFFSET 0 /* offset of label in sector */ -#define MAXPARTITIONS 8 /* number of partitions */ -#define RAW_PART 2 /* raw partition: xx?c */ +typedef struct _libkernel_functions { + /* for mach dependencies on libc */ + mach_port_t (*get_reply_port)(void); + void (*set_reply_port)(mach_port_t); + + /* dlsym() for looking up catch_exception_raise */ + void* (*dlsym)(void*, const char*); -/* Just a dummy */ -struct cpu_disklabel { - int cd_dummy; /* must have one element. 
*/ -}; + /* placeholders for struct layout compatibility with Libsystem */ + void *_placeholder_1; + void *_placeholder_2; + + /* for setting errno in libc */ + void (*set_errno)(int); + int* (*get_errno)(void); -#endif /* __APPLE_API_OBSOLETE */ +} _libkernel_functions_t; -#endif /* _BSD_PPC_DISKLABEL_H_ */ +void _libkernel_init(_libkernel_functions_t fns); + +#endif // __LIBKERNEL_INIT_H` diff --git a/libsyscall/wrappers/cancelable/fcntl-base.c b/libsyscall/wrappers/cancelable/fcntl-base.c new file mode 100644 index 000000000..2f48a42c1 --- /dev/null +++ b/libsyscall/wrappers/cancelable/fcntl-base.c @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2004, 2006 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +#include + +int __FCNTL(int, int, void *); + +/* + * Stub function to account for the differences in the size of the third + * argument when int and void * are different sizes. Also add pthread + * cancelability. + * + * This is for LP64 only. + */ +int +fcntl(int fd, int cmd, ...) 
+{ + va_list ap; + void *arg; + + va_start(ap, cmd); + switch(cmd) { + case F_GETLK: + case F_SETLK: + case F_SETLKW: + case F_PREALLOCATE: + case F_SETSIZE: + case F_RDADVISE: + case F_READBOOTSTRAP: + case F_WRITEBOOTSTRAP: + case F_LOG2PHYS: + case F_GETPATH: + case F_GETPATH_MTMINFO: + case F_PATHPKG_CHECK: + case F_OPENFROM: + case F_UNLINKFROM: + case F_ADDSIGS: + arg = va_arg(ap, void *); + break; + default: + arg = (void *)((unsigned long)va_arg(ap, int)); + break; + } + va_end(ap); + return (__FCNTL(fd, cmd, arg)); +} diff --git a/libsyscall/mach/x86_64/mach_absolute_time.S b/libsyscall/wrappers/cancelable/fcntl-cancel.c similarity index 81% rename from libsyscall/mach/x86_64/mach_absolute_time.S rename to libsyscall/wrappers/cancelable/fcntl-cancel.c index 7c53025b6..e5db000a6 100644 --- a/libsyscall/mach/x86_64/mach_absolute_time.S +++ b/libsyscall/wrappers/cancelable/fcntl-cancel.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2010 Apple Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -21,12 +21,11 @@ * @APPLE_LICENSE_HEADER_END@ */ -#include +#if defined(__LP64__) || defined(__arm__) +#include +#define __FCNTL __fcntl - .text - .align 2 - .globl _mach_absolute_time -_mach_absolute_time: - movq $(_COMM_PAGE_NANOTIME), %rax - jmp *%rax +#include "fcntl-base.c" + +#endif diff --git a/libsyscall/wrappers/cancelable/fcntl.c b/libsyscall/wrappers/cancelable/fcntl.c new file mode 100644 index 000000000..f31bff7ef --- /dev/null +++ b/libsyscall/wrappers/cancelable/fcntl.c @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#if defined(__LP64__) || defined(__arm__) + +#undef __DARWIN_NON_CANCELABLE +#define __DARWIN_NON_CANCELABLE 1 + +#include +#define __FCNTL __fcntl_nocancel + +#include "fcntl-base.c" + +#endif diff --git a/libsyscall/wrappers/cancelable/select-cancel.c b/libsyscall/wrappers/cancelable/select-cancel.c new file mode 100644 index 000000000..dba3fc291 --- /dev/null +++ b/libsyscall/wrappers/cancelable/select-cancel.c @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#define VARIANT_CANCELABLE + +#include "../select-base.c" diff --git a/libsyscall/wrappers/cancelable/select.c b/libsyscall/wrappers/cancelable/select.c new file mode 100644 index 000000000..af06d655f --- /dev/null +++ b/libsyscall/wrappers/cancelable/select.c @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#undef __DARWIN_NON_CANCELABLE +#define __DARWIN_NON_CANCELABLE 1 + +#include "../select-base.c" diff --git a/libsyscall/wrappers/cancelable/sigsuspend-cancel.c b/libsyscall/wrappers/cancelable/sigsuspend-cancel.c new file mode 100644 index 000000000..a7e7a320d --- /dev/null +++ b/libsyscall/wrappers/cancelable/sigsuspend-cancel.c @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. 
+ * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#define __SIGSUSPEND __sigsuspend + +#include "../sigsuspend-base.c" diff --git a/libsyscall/mach/i386/mach_absolute_time.S b/libsyscall/wrappers/cancelable/sigsuspend.c similarity index 81% rename from libsyscall/mach/i386/mach_absolute_time.S rename to libsyscall/wrappers/cancelable/sigsuspend.c index 71e746235..2b1e2d877 100644 --- a/libsyscall/mach/i386/mach_absolute_time.S +++ b/libsyscall/wrappers/cancelable/sigsuspend.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2010 Apple Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -21,12 +21,9 @@ * @APPLE_LICENSE_HEADER_END@ */ -#include +#undef __DARWIN_NON_CANCELABLE +#define __DARWIN_NON_CANCELABLE 1 +#define __SIGSUSPEND __sigsuspend_nocancel - .text - .align 2 - .globl _mach_absolute_time -_mach_absolute_time: - movl $(_COMM_PAGE_NANOTIME), %eax - jmpl *%eax +#include "../sigsuspend-base.c" diff --git a/bsd/dev/ppc/sysctl.c b/libsyscall/wrappers/init_cpu_capabilities.c similarity index 61% rename from bsd/dev/ppc/sysctl.c rename to libsyscall/wrappers/init_cpu_capabilities.c index 7bc509e16..7eecac6bf 100644 --- a/bsd/dev/ppc/sysctl.c +++ b/libsyscall/wrappers/init_cpu_capabilities.c @@ -1,19 +1,14 @@ /* * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * @APPLE_LICENSE_HEADER_START@ * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER @@ -23,6 +18,24 @@ * Please see the License for the specific language governing rights and * limitations under the License. 
* - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + * @APPLE_LICENSE_HEADER_END@ */ +#define __APPLE_API_PRIVATE +#include +#undef __APPLE_API_PRIVATE + +#if defined(__i386__) || defined(__x86_64__) + +/* Initialize the "_cpu_capabilities" vector on x86 processors. */ + +int _cpu_has_altivec = 0; // DEPRECATED +int _cpu_capabilities = 0; + +void +_init_cpu_capabilities( void ) +{ + _cpu_capabilities = _get_cpu_capabilities(); +} + +#endif diff --git a/libsyscall/wrappers/ioctl.c b/libsyscall/wrappers/ioctl.c new file mode 100644 index 000000000..eced7e7e1 --- /dev/null +++ b/libsyscall/wrappers/ioctl.c @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#if defined(__LP64__) || defined(__arm__) + +#include +#include + +int __ioctl(int, unsigned long, void *); +/* + * Stub function to account for the third argument being void * + * + * This is for LP64 only. + */ +int +ioctl(int d, unsigned long request, ...) 
+{ + va_list ap; + void *arg; + + va_start(ap, request); + arg = va_arg(ap, void *); + va_end(ap); + return (__ioctl(d, request, arg)); +} + +#endif diff --git a/libsyscall/wrappers/kill.c b/libsyscall/wrappers/kill.c new file mode 100644 index 000000000..74e3ca2fe --- /dev/null +++ b/libsyscall/wrappers/kill.c @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2005 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include + +extern int __kill(pid_t pid, int sig, int posix); + +/* + * kill stub, which wraps a modified kill system call that takes a posix + * behaviour indicator as the third parameter to indicate whether or not + * conformance to standards is needed. We use a trailing parameter in + * case the call is called directly via syscall(), since for most uses, + * it won't matter to the caller. 
+ */ +int +kill(pid_t pid, int sig) +{ +#if __DARWIN_UNIX03 + return(__kill(pid, sig, 1)); +#else /* !__DARWIN_UNIX03 */ + return(__kill(pid, sig, 0)); +#endif /* !__DARWIN_UNIX03 */ +} diff --git a/libsyscall/wrappers/legacy/accept.c b/libsyscall/wrappers/legacy/accept.c new file mode 100644 index 000000000..216b76685 --- /dev/null +++ b/libsyscall/wrappers/legacy/accept.c @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2006-2010 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef NO_SYSCALL_LEGACY + +#define _NONSTD_SOURCE +#include + +/* + * We need conformance on so that EOPNOTSUPP=102. But the routine symbol + * will still be the legacy (undecorated) one. 
+ */ +#undef __DARWIN_UNIX03 +#define __DARWIN_UNIX03 1 + +#include +#include +#include "_errno.h" + +int __accept_nocancel(int, struct sockaddr *, socklen_t *); + +/* + * accept stub, legacy version + */ +int +accept(int s, struct sockaddr *addr, socklen_t *addrlen) +{ + int ret = __accept_nocancel(s, addr, addrlen); + + /* use ENOTSUP for legacy behavior */ + if (ret < 0 && errno == EOPNOTSUPP) + errno = ENOTSUP; + return ret; +} + +#endif /* NO_SYSCALL_LEGACY */ diff --git a/libsyscall/wrappers/legacy/bind.c b/libsyscall/wrappers/legacy/bind.c new file mode 100644 index 000000000..f30281d61 --- /dev/null +++ b/libsyscall/wrappers/legacy/bind.c @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2006-2010 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef NO_SYSCALL_LEGACY + +#define _NONSTD_SOURCE +#include + +/* + * We need conformance on so that EOPNOTSUPP=102. But the routine symbol + * will still be the legacy (undecorated) one. 
+ */ +#undef __DARWIN_UNIX03 +#define __DARWIN_UNIX03 1 + +#include +#include +#include "_errno.h" + +extern int __bind(int, const struct sockaddr *, socklen_t); + +/* + * bind stub, legacy version + */ +int +bind(int s, const struct sockaddr *name, socklen_t namelen) +{ + int ret = __bind(s, name, namelen); + + /* use ENOTSUP for legacy behavior */ + if (ret < 0 && errno == EOPNOTSUPP) + errno = ENOTSUP; + return ret; +} + +#endif /* NO_SYSCALL_LEGACY */ diff --git a/libsyscall/wrappers/legacy/connect.c b/libsyscall/wrappers/legacy/connect.c new file mode 100644 index 000000000..39910566b --- /dev/null +++ b/libsyscall/wrappers/legacy/connect.c @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2006 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef NO_SYSCALL_LEGACY + +#define _NONSTD_SOURCE +#include + +/* + * We need conformance on so that EOPNOTSUPP=102. But the routine symbol + * will still be the legacy (undecorated) one. 
+ */ +#undef __DARWIN_UNIX03 +#define __DARWIN_UNIX03 1 + +#include +#include +#include "_errno.h" + +int __connect_nocancel(int, const struct sockaddr *, socklen_t); + +/* + * connect stub, legacy version + */ +int +connect(int s, const struct sockaddr *name, socklen_t namelen) +{ + int ret = __connect_nocancel(s, name, namelen); + + /* use ENOTSUP for legacy behavior */ + if (ret < 0 && errno == EOPNOTSUPP) + errno = ENOTSUP; + return ret; +} + +#endif /* NO_SYSCALL_LEGACY */ diff --git a/libsyscall/wrappers/legacy/getattrlist.c b/libsyscall/wrappers/legacy/getattrlist.c new file mode 100644 index 000000000..a0444a3da --- /dev/null +++ b/libsyscall/wrappers/legacy/getattrlist.c @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2006-2010 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef NO_SYSCALL_LEGACY + +#define _NONSTD_SOURCE +#include + +/* + * We need conformance on so that EOPNOTSUPP=102. But the routine symbol + * will still be the legacy (undecorated) one. 
+ */ +#undef __DARWIN_UNIX03 +#define __DARWIN_UNIX03 1 + +#include +#include "_errno.h" + +#ifdef __LP64__ +extern int __getattrlist(const char *, void *, void *, size_t, unsigned int); +#else /* !__LP64__ */ +extern int __getattrlist(const char *, void *, void *, size_t, unsigned long); +#endif /* __LP64__ */ + +/* + * getattrlist stub, legacy version + */ +int +#ifdef __LP64__ +getattrlist(const char *path, void *attrList, void *attrBuf, + size_t attrBufSize, unsigned int options) +#else /* !__LP64__ */ +getattrlist(const char *path, void *attrList, void *attrBuf, + size_t attrBufSize, unsigned long options) +#endif /* __LP64__ */ +{ + int ret = __getattrlist(path, attrList, attrBuf, attrBufSize, options); + + /* use ENOTSUP for legacy behavior */ + if (ret < 0 && errno == EOPNOTSUPP) + errno = ENOTSUP; + return ret; +} + +#endif /* NO_SYSCALL_LEGACY */ diff --git a/libsyscall/wrappers/legacy/getpeername.c b/libsyscall/wrappers/legacy/getpeername.c new file mode 100644 index 000000000..a5619ece0 --- /dev/null +++ b/libsyscall/wrappers/legacy/getpeername.c @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2006-2010 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef NO_SYSCALL_LEGACY + +#define _NONSTD_SOURCE +#include + +/* + * We need conformance on so that EOPNOTSUPP=102. But the routine symbol + * will still be the legacy (undecorated) one. + */ +#undef __DARWIN_UNIX03 +#define __DARWIN_UNIX03 1 + +#include +#include "_errno.h" + +extern int __getpeername(int, struct sockaddr * __restrict, socklen_t * __restrict); + +/* + * getpeername stub, legacy version + */ +int +getpeername(int socket, struct sockaddr * __restrict address, + socklen_t * __restrict address_len) +{ + int ret = __getpeername(socket, address, address_len); + + /* use ENOTSUP for legacy behavior */ + if (ret < 0 && errno == EOPNOTSUPP) + errno = ENOTSUP; + return ret; +} + +#endif /* NO_SYSCALL_LEGACY */ diff --git a/libsyscall/wrappers/legacy/getsockname.c b/libsyscall/wrappers/legacy/getsockname.c new file mode 100644 index 000000000..9a2a94cd5 --- /dev/null +++ b/libsyscall/wrappers/legacy/getsockname.c @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2006-2010 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef NO_SYSCALL_LEGACY + +#define _NONSTD_SOURCE +#include + +/* + * We need conformance on so that EOPNOTSUPP=102. But the routine symbol + * will still be the legacy (undecorated) one. + */ +#undef __DARWIN_UNIX03 +#define __DARWIN_UNIX03 1 + +#include +#include "_errno.h" + +extern int __getsockname(int, struct sockaddr * __restrict, socklen_t * __restrict); + +/* + * getsockname stub, legacy version + */ +int +getsockname(int socket, struct sockaddr * __restrict address, + socklen_t * __restrict address_len) +{ + int ret = __getsockname(socket, address, address_len); + + /* use ENOTSUP for legacy behavior */ + if (ret < 0 && errno == EOPNOTSUPP) + errno = ENOTSUP; + return ret; +} + +#endif /* NO_SYSCALL_LEGACY */ diff --git a/libsyscall/wrappers/legacy/kill.c b/libsyscall/wrappers/legacy/kill.c new file mode 100644 index 000000000..1f25079e9 --- /dev/null +++ b/libsyscall/wrappers/legacy/kill.c @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2005 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef NO_SYSCALL_LEGACY + +#define _NONSTD_SOURCE + +#include "../kill.c" + +#endif /* NO_SYSCALL_LEGACY */ diff --git a/libsyscall/wrappers/legacy/lchown.c b/libsyscall/wrappers/legacy/lchown.c new file mode 100644 index 000000000..05279fe4c --- /dev/null +++ b/libsyscall/wrappers/legacy/lchown.c @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2006-2010 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef NO_SYSCALL_LEGACY + +#define _NONSTD_SOURCE +#include + +/* + * We need conformance on so that EOPNOTSUPP=102. But the routine symbol + * will still be the legacy (undecorated) one. 
+ */ +#undef __DARWIN_UNIX03 +#define __DARWIN_UNIX03 1 + +#include +#include "_errno.h" + +int __lchown(const char *, uid_t, gid_t); + +/* + * lchown stub, legacy version + */ +int +lchown(const char *path, uid_t owner, gid_t group) +{ + int ret = __lchown(path, owner, group); + + /* use ENOTSUP for legacy behavior */ + if (ret < 0 && errno == EOPNOTSUPP) + errno = ENOTSUP; + return ret; +} + +#endif diff --git a/osfmk/ppc/xpr.h b/libsyscall/wrappers/legacy/listen.c similarity index 50% rename from osfmk/ppc/xpr.h rename to libsyscall/wrappers/legacy/listen.c index c81865886..0e21db52b 100644 --- a/osfmk/ppc/xpr.h +++ b/libsyscall/wrappers/legacy/listen.c @@ -1,19 +1,14 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2006-2010 Apple Inc. All rights reserved. * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * @APPLE_LICENSE_HEADER_START@ * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. 
* * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER @@ -23,14 +18,38 @@ * Please see the License for the specific language governing rights and * limitations under the License. * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + * @APPLE_LICENSE_HEADER_END@ */ + +#ifndef NO_SYSCALL_LEGACY + +#define _NONSTD_SOURCE +#include + /* - * @OSF_COPYRIGHT@ + * We need conformance on so that EOPNOTSUPP=102. But the routine symbol + * will still be the legacy (undecorated) one. */ +#undef __DARWIN_UNIX03 +#define __DARWIN_UNIX03 1 + +#include +#include "_errno.h" + +extern int __listen(int, int); /* - * Machine dependent module for the XPR tracing facility. + * listen stub, legacy version */ +int +listen(int socket, int backlog) +{ + int ret = __listen(socket, backlog); + + /* use ENOTSUP for legacy behavior */ + if (ret < 0 && errno == EOPNOTSUPP) + errno = ENOTSUP; + return ret; +} -#define XPR_TIMESTAMP (0) +#endif diff --git a/libsyscall/wrappers/legacy/mprotect.c b/libsyscall/wrappers/legacy/mprotect.c new file mode 100644 index 000000000..666b8974d --- /dev/null +++ b/libsyscall/wrappers/legacy/mprotect.c @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef NO_SYSCALL_LEGACY + +#define _NONSTD_SOURCE +#include + +#include "_errno.h" +#include +#include +#include + +/* + * Stub function to account for the differences in standard compliance + * while maintaining binary backward compatibility. + * + * This is only the legacy behavior. + */ +extern int __mprotect(void *, size_t, int); + +int +mprotect(void *addr, size_t len, int prot) +{ + void *aligned_addr; + size_t offset; + int rv; + + /* + * Page-align "addr" since the system now requires it + * for standards compliance. + * Update "len" to reflect the alignment. + */ + offset = ((uintptr_t) addr) & PAGE_MASK; + aligned_addr = (void *) (((uintptr_t) addr) & ~PAGE_MASK); + len += offset; + rv = __mprotect(aligned_addr, len, prot); + if (rv == -1 && errno == ENOMEM) { + /* + * Standards now require that we return ENOMEM if there was + * a hole in the address range. Panther and earlier used + * to return an EINVAL error, so honor backwards compatibility. + */ + errno = EINVAL; + } + return rv; +} + +#endif /* NO_SYSCALL_LEGACY */ diff --git a/libsyscall/wrappers/legacy/msync.c b/libsyscall/wrappers/legacy/msync.c new file mode 100644 index 000000000..7ba2a82c7 --- /dev/null +++ b/libsyscall/wrappers/legacy/msync.c @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2004, 2006 Apple Computer, Inc. All rights reserved. 
+ * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +#ifndef NO_SYSCALL_LEGACY + +#define _NONSTD_SOURCE +#include + +#include +#include +#include + +int __msync_nocancel(void *, size_t, int); + +/* + * Stub function for legacy version + */ +int +msync(void *addr, size_t len, int flags) +{ + size_t offset; + + /* + * Page-align "addr" since the system now requires it + * for standards compliance. + * Update "len" to reflect the alignment. + */ + offset = ((uintptr_t) addr) & PAGE_MASK; + addr = (void *) (((uintptr_t) addr) & ~PAGE_MASK); + len += offset; + return __msync_nocancel(addr, len, flags); +} + +#endif /* NO_SYSCALL_LEGACY */ diff --git a/libsyscall/wrappers/legacy/munmap.c b/libsyscall/wrappers/legacy/munmap.c new file mode 100644 index 000000000..24b5b5eaa --- /dev/null +++ b/libsyscall/wrappers/legacy/munmap.c @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. 
+ * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef NO_SYSCALL_LEGACY + +#define _NONSTD_SOURCE +#include + +#include +#include +#include + +/* + * Stub function to account for the differences in standard compliance + * while maintaining binary backward compatibility. + * + * This is only the legacy behavior. + */ +extern int __munmap(void *, size_t); + +int +munmap(void *addr, size_t len) +{ + size_t offset; + + if (len == 0) { + /* + * Standard compliance now requires the system to return EINVAL + * for munmap(addr, 0). Return success now to maintain + * backwards compatibility. + */ + return 0; + } + /* + * Page-align "addr" since the system now requires it + * for standards compliance. + * Update "len" to reflect the adjustment and still cover the same area. 
+ */ + offset = ((uintptr_t) addr) & PAGE_MASK; + addr = (void *) (((uintptr_t) addr) & ~PAGE_MASK); + len += offset; + return __munmap(addr, len); +} + +#endif /* NO_SYSCALL_LEGACY */ diff --git a/libsyscall/wrappers/legacy/open.c b/libsyscall/wrappers/legacy/open.c new file mode 100644 index 000000000..c11f4e919 --- /dev/null +++ b/libsyscall/wrappers/legacy/open.c @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2005, 2009 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef NO_SYSCALL_LEGACY + +#define _NONSTD_SOURCE +#include + +#include +#include +#include + +int __open_nocancel(const char *path, int flags, mode_t mode); + +/* + * open stub: The legacy interface never automatically associated a controlling + * tty, so we always pass O_NOCTTY. + */ +int +open(const char *path, int flags, ...) 
+{ + mode_t mode = 0; + + if(flags & O_CREAT) { + va_list ap; + va_start(ap, flags); + // compiler warns to pass int (not mode_t) to va_arg + mode = va_arg(ap, int); + va_end(ap); + } + return(__open_nocancel(path, flags | O_NOCTTY, mode)); +} + +#endif /* NO_SYSCALL_LEGACY */ diff --git a/libsyscall/wrappers/legacy/recvfrom.c b/libsyscall/wrappers/legacy/recvfrom.c new file mode 100644 index 000000000..1f53ed8ba --- /dev/null +++ b/libsyscall/wrappers/legacy/recvfrom.c @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2006-2010 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef NO_SYSCALL_LEGACY + +#define _NONSTD_SOURCE +#include + +/* + * We need conformance on so that EOPNOTSUPP=102. But the routine symbol + * will still be the legacy (undecorated) one. 
+ */ +#undef __DARWIN_UNIX03 +#define __DARWIN_UNIX03 1 + +#include +#include "_errno.h" + +ssize_t __recvfrom_nocancel(int, void *, size_t, int, struct sockaddr * __restrict, socklen_t * __restrict); + +/* + * recvfrom stub, legacy version + */ +ssize_t +recvfrom(int s, void *buf, size_t len, int flags, struct sockaddr * __restrict from, socklen_t * __restrict fromlen) +{ + ssize_t ret = __recvfrom_nocancel(s, buf, len, flags, from, fromlen); + + /* use ENOTSUP for legacy behavior */ + if (ret < 0 && errno == EOPNOTSUPP) + errno = ENOTSUP; + return ret; +} + +#endif diff --git a/libsyscall/wrappers/legacy/recvmsg.c b/libsyscall/wrappers/legacy/recvmsg.c new file mode 100644 index 000000000..dea590555 --- /dev/null +++ b/libsyscall/wrappers/legacy/recvmsg.c @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2006-2010 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef NO_SYSCALL_LEGACY + +#define _NONSTD_SOURCE +#include + +/* + * We need conformance on so that EOPNOTSUPP=102. But the routine symbol + * will still be the legacy (undecorated) one. 
+ */ +#undef __DARWIN_UNIX03 +#define __DARWIN_UNIX03 1 + +#include +#include "_errno.h" + +ssize_t __recvmsg_nocancel(int, struct msghdr *, int); + +/* + * recvmsg stub, legacy version + */ +ssize_t +recvmsg(int s, struct msghdr *msg, int flags) +{ + ssize_t ret = __recvmsg_nocancel(s, msg, flags); + + /* use ENOTSUP for legacy behavior */ + if (ret < 0 && errno == EOPNOTSUPP) + errno = ENOTSUP; + return ret; +} + +#endif diff --git a/libsyscall/wrappers/legacy/select-pre1050.c b/libsyscall/wrappers/legacy/select-pre1050.c new file mode 100644 index 000000000..2b8bd8e17 --- /dev/null +++ b/libsyscall/wrappers/legacy/select-pre1050.c @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_LICENSE_HEADER_END@ + */ + +#if defined(SYSCALL_PRE1050) && defined(__LP64__) + +#undef __DARWIN_VERS_1050 +#define __DARWIN_VERS_1050 0 +#define VARIANT_PRE1050 + +#include "../select-base.c" + +#endif diff --git a/libsyscall/wrappers/legacy/select.c b/libsyscall/wrappers/legacy/select.c new file mode 100644 index 000000000..2ababf8cc --- /dev/null +++ b/libsyscall/wrappers/legacy/select.c @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef NO_SYSCALL_LEGACY + +#define _NONSTD_SOURCE +#define VARIANT_LEGACY + +#include "../select-base.c" + +#endif /* NO_SYSCALL_LEGACY */ diff --git a/libsyscall/wrappers/legacy/sendmsg.c b/libsyscall/wrappers/legacy/sendmsg.c new file mode 100644 index 000000000..e337f2515 --- /dev/null +++ b/libsyscall/wrappers/legacy/sendmsg.c @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2006-2010 Apple Inc. All rights reserved. 
ssize_t __sendmsg_nocancel(int, const struct msghdr *, int);

/*
 * sendmsg stub, legacy version
 *
 * Forwards to __sendmsg_nocancel() and returns its result unchanged.
 * When the call fails with errno set to EOPNOTSUPP, errno is rewritten
 * to ENOTSUP, which is the legacy behavior.
 */
ssize_t
sendmsg(int s, const struct msghdr *msg, int flags)
{
	/*
	 * Hold the result in ssize_t, the type both this function and the
	 * underlying call return; an int would truncate values that do not
	 * fit in 32 bits on LP64.
	 */
	ssize_t ret = __sendmsg_nocancel(s, msg, flags);

	/* use ENOTSUP for legacy behavior */
	if (ret < 0 && errno == EOPNOTSUPP)
		errno = ENOTSUP;
	return ret;
}
ssize_t __sendto_nocancel(int, const void *, size_t, int, const struct sockaddr *, socklen_t);

/*
 * sendto stub, legacy version
 *
 * Forwards to __sendto_nocancel() and returns its result unchanged.
 * When the call fails with errno set to EOPNOTSUPP, errno is rewritten
 * to ENOTSUP, which is the legacy behavior.
 */
ssize_t
sendto(int s, const void *msg, size_t len, int flags, const struct sockaddr *to, socklen_t tolen)
{
	/*
	 * Hold the result in ssize_t, the type both this function and the
	 * underlying call return; an int would truncate values that do not
	 * fit in 32 bits on LP64.
	 */
	ssize_t ret = __sendto_nocancel(s, msg, len, flags, to, tolen);

	/* use ENOTSUP for legacy behavior */
	if (ret < 0 && errno == EOPNOTSUPP)
		errno = ENOTSUP;
	return ret;
}
+ * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef NO_SYSCALL_LEGACY + +#define _NONSTD_SOURCE +#include + +/* + * We need conformance on so that EOPNOTSUPP=102. But the routine symbol + * will still be the legacy (undecorated) one. 
/* The options argument is unsigned int on LP64 and unsigned long otherwise. */
#ifdef __LP64__
extern int __setattrlist(const char *, void *, void *, size_t, unsigned int);
#else /* !__LP64__ */
extern int __setattrlist(const char *, void *, void *, size_t, unsigned long);
#endif /* __LP64__ */

/*
 * setattrlist stub, legacy version
 *
 * Passes every argument straight through to __setattrlist().  A failure
 * reported with errno == EOPNOTSUPP is re-reported as ENOTSUP; the
 * return value itself is never altered.
 */
int
setattrlist(const char *path, void *attrList, void *attrBuf,
	size_t attrBufSize,
#ifdef __LP64__
	unsigned int options
#else /* !__LP64__ */
	unsigned long options
#endif /* __LP64__ */
	)
{
	const int status = __setattrlist(path, attrList, attrBuf, attrBufSize, options);

	if (status >= 0)
		return status;

	/* legacy callers see ENOTSUP rather than EOPNOTSUPP */
	if (errno == EOPNOTSUPP)
		errno = ENOTSUP;
	return status;
}
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef NO_SYSCALL_LEGACY + +#define _NONSTD_SOURCE +#define __SIGSUSPEND __sigsuspend_nocancel + +#include "../sigsuspend-base.c" + +#endif /* NO_SYSCALL_LEGACY */ diff --git a/libsyscall/wrappers/legacy/socketpair.c b/libsyscall/wrappers/legacy/socketpair.c new file mode 100644 index 000000000..8249814e8 --- /dev/null +++ b/libsyscall/wrappers/legacy/socketpair.c @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2006-2010 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef NO_SYSCALL_LEGACY + +#define _NONSTD_SOURCE +#include + +/* + * We need conformance on so that EOPNOTSUPP=102. But the routine symbol + * will still be the legacy (undecorated) one. 
extern int __socketpair(int, int, int, int [2]);

/*
 * socketpair stub, legacy version
 *
 * Passes every argument straight through to __socketpair().  A failure
 * reported with errno == EOPNOTSUPP is re-reported as ENOTSUP; the
 * return value itself is never altered.
 */
int
socketpair(int domain, int type, int protocol, int socket_vector[2])
{
	const int status = __socketpair(domain, type, protocol, socket_vector);

	if (status >= 0)
		return status;

	/* legacy callers see ENOTSUP rather than EOPNOTSUPP */
	if (errno == EOPNOTSUPP)
		errno = ENOTSUP;
	return status;
}
/*
 * sizeof(word) MUST BE A POWER OF TWO
 * SO THAT wmask BELOW IS ALL ONES
 */
typedef	int word;		/* unit used for the bulk of the copy */

#define	wsize	sizeof(word)
#define	wmask	(wsize - 1)

/*
 * Copy length bytes from src0 to dst0 and return dst0.
 *
 * Overlapping regions are handled: when the destination lies below the
 * source the copy runs upwards, otherwise it starts at the ends of both
 * regions and runs downwards.  In either direction the copy is done in
 * three phases: leading single bytes until both pointers are word
 * aligned (or the whole request, when they can never be aligned
 * together), then whole words, then the remaining single bytes.
 */
__private_extern__
void * memcpy(void *dst0, const void *src0, size_t length)
{
	char *to = dst0;
	const char *from = src0;
	size_t count;

	if (length == 0 || to == from)		/* nothing to do */
		return (dst0);

	if ((unsigned long)to < (unsigned long)from) {
		/*
		 * Copy forward.
		 */
		count = (uintptr_t)from;	/* only the low bits matter */
		if ((count | (uintptr_t)to) & wmask) {
			/*
			 * At least one pointer is unaligned.  They can only be
			 * brought into alignment together when their low bits
			 * agree and there is at least a word to copy; failing
			 * that, the whole request is done bytewise here.
			 */
			if (((count ^ (uintptr_t)to) & wmask) != 0 || length < wsize)
				count = length;
			else
				count = wsize - (count & wmask);
			length -= count;
			for (; count != 0; count--)
				*to++ = *from++;
		}
		/* whole words */
		for (count = length / wsize; count != 0; count--) {
			*(word *)to = *(const word *)from;
			from += wsize;
			to += wsize;
		}
		/* trailing bytes */
		for (count = length & wmask; count != 0; count--)
			*to++ = *from++;
		return (dst0);
	}

	/*
	 * Copy backwards.  Same three phases, except that reaching
	 * alignment takes (address & wmask) bytes rather than
	 * wsize - (address & wmask).
	 */
	from += length;
	to += length;
	count = (uintptr_t)from;
	if ((count | (uintptr_t)to) & wmask) {
		if (((count ^ (uintptr_t)to) & wmask) != 0 || length <= wsize)
			count = length;
		else
			count &= wmask;
		length -= count;
		for (; count != 0; count--)
			*--to = *--from;
	}
	for (count = length / wsize; count != 0; count--) {
		from -= wsize;
		to -= wsize;
		*(word *)to = *(const word *)from;
	}
	for (count = length & wmask; count != 0; count--)
		*--to = *--from;
	return (dst0);
}

/*
 * memmove is the same routine: memcpy above already copes with overlap.
 */
__private_extern__ void *
memmove(void *s1, const void *s2, size_t n)
{
	void *const result = memcpy(s1, s2, n);

	return result;
}

/*
 * bcopy takes (source, destination) -- the reverse of memcpy's order --
 * and returns nothing.
 */
__private_extern__ void
bcopy(const void *s1, void *s2, size_t n)
{
	(void)memcpy(s2, s1, n);
}

/*
 * Compare memory regions.
 *
 * Returns 0 when the first n bytes are equal (or n is 0); otherwise the
 * difference between the first pair of differing bytes, each taken as
 * unsigned char.
 */
__private_extern__ int
memcmp(const void *s1, const void *s2, size_t n)
{
	const unsigned char *lhs = s1;
	const unsigned char *rhs = s2;

	for (; n != 0; n--, lhs++, rhs++) {
		if (*lhs != *rhs)
			return (*lhs - *rhs);
	}
	return (0);
}
/*
 * Counter bumped by __inc_remove_counter().  It is 64 bits wide on
 * ppc64, i386 and x86_64 and 32 bits wide everywhere else.
 */
#if !defined(__ppc64__) && !defined(__i386__) && !defined(__x86_64__)
static int32_t __remove_counter = 0;
#else
static int64_t __remove_counter = 0;
#endif

/*
 * Return the current value of the counter.
 *
 * Except on ARM without _ARM_ARCH_6, the value is read through a __sync
 * builtin (an atomic add of zero); on that ARM configuration it is a
 * plain load.
 */
__uint64_t
__get_remove_counter(void)
{
#if !defined(__arm__) || defined(_ARM_ARCH_6)
	/* adding 0 leaves the counter untouched and yields its value */
	return __sync_fetch_and_add(&__remove_counter, 0);
#else
	return __remove_counter;
#endif
}

/*
 * Add one to the counter, using a __sync builtin except on ARM without
 * _ARM_ARCH_6, where a plain increment is used.
 */
void
__inc_remove_counter(void)
{
#if !defined(__arm__) || defined(_ARM_ARCH_6)
	(void)__sync_fetch_and_add(&__remove_counter, 1);
#else
	__remove_counter += 1;
#endif
}
void __inc_remove_counter(void);
int __rename(const char *old, const char *new);

/*
 * rename wrapper: forwards to __rename() and, only when that call
 * returns 0, bumps the remove counter.  The result of __rename() is
 * returned unchanged.
 */
int
rename(const char *old, const char *new)
{
	const int status = __rename(old, new);

	if (status != 0)
		return status;

	__inc_remove_counter();
	return status;
}
void __inc_remove_counter(void);
int __rmdir(const char *path);

/*
 * rmdir wrapper: forwards to __rmdir() and, only when that call
 * returns 0, bumps the remove counter.  The result of __rmdir() is
 * returned unchanged.
 */
int
rmdir(const char *path)
{
	const int status = __rmdir(path);

	if (status != 0)
		return status;

	__inc_remove_counter();
	return status;
}
/* The cancelable and pre-10.5 variants call __select; the rest call __select_nocancel. */
#if !defined(VARIANT_CANCELABLE) && !defined(VARIANT_PRE1050)
int __select_nocancel(int, fd_set * __restrict, fd_set * __restrict,
	fd_set * __restrict, struct timeval * __restrict);
#else /* VARIANT_CANCELABLE || VARIANT_PRE1050 */
extern int __select(int, fd_set * __restrict, fd_set * __restrict,
	fd_set * __restrict, struct timeval * __restrict);
#endif /* !VARIANT_CANCELABLE && !VARIANT_PRE1050 */

/*
 * select stub, return error if nfds > FD_SETSIZE
 * add pthread cancelability
 * mandated for conformance.
 *
 * This is only for (non DARWINEXTSN) UNIX03 (both cancelable and
 * non-cancelable) and for legacy
 *
 * Which checks are compiled in depends on the variant macros:
 *   - VARIANT_LEGACY / VARIANT_PRE1050: no nfds check; a timeout of
 *     0 seconds and between 1 and 9999 microseconds is replaced by
 *     10000 microseconds.
 *   - otherwise: nfds > FD_SETSIZE fails with -1 and errno = EINVAL.
 */
int
select(int nfds, fd_set * __restrict readfds, fd_set * __restrict writefds,
	fd_set * __restrict exceptfds, struct timeval * __restrict
#if !defined(VARIANT_LEGACY) && !defined(VARIANT_PRE1050)
	timeout
#else /* VARIANT_LEGACY || VARIANT_PRE1050 */
	intimeout
#endif /* !VARIANT_LEGACY && !VARIANT_PRE1050 */
	)
{

#if !defined(VARIANT_LEGACY) && !defined(VARIANT_PRE1050)
	if (nfds > FD_SETSIZE) {
		errno = EINVAL;
		return -1;
	}
#else /* VARIANT_LEGACY || VARIANT_PRE1050 */
	struct timeval floor_tv;
	struct timeval *timeout = intimeout;

	/*
	 * Legacy select behavior is minimum 10 msec when tv_usec is non-zero
	 */
	if (intimeout != NULL && intimeout->tv_sec == 0 &&
	    intimeout->tv_usec > 0 && intimeout->tv_usec < 10000) {
		/* the caller's timeval is left untouched; a local copy is passed down */
		floor_tv.tv_sec = 0;
		floor_tv.tv_usec = 10000;
		timeout = &floor_tv;
	}
#endif /* !VARIANT_LEGACY && !VARIANT_PRE1050 */

#if !defined(VARIANT_CANCELABLE) && !defined(VARIANT_PRE1050)
	return __select_nocancel(nfds, readfds, writefds, exceptfds, timeout);
#else /* VARIANT_CANCELABLE || VARIANT_PRE1050 */
	return __select(nfds, readfds, writefds, exceptfds, timeout);
#endif /* !VARIANT_CANCELABLE && !VARIANT_PRE1050 */
}
+ * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include + +#if __DARWIN_UNIX03 + +#include +#include +#include "_errno.h" + +extern int __chmod(const char *path, mode_t mode); + +/* + * chmod stub, ignore S_ISUID and/or S_ISGID on EPERM, + * mandated for conformance. + * + * This is for UNIX03 only. 
/*
 * chmod stub, ignore S_ISUID and/or S_ISGID on EPERM,
 * mandated for conformance.
 *
 * This is for UNIX03 only.
 *
 * If the first __chmod() fails with EPERM and the mode asks for S_ISUID
 * and/or S_ISGID, the call is retried with S_ISGID cleared, then with
 * S_ISUID cleared, then with both cleared, skipping any step whose bits
 * were not requested.  The first result that is a success or a failure
 * other than EPERM is returned; otherwise the result of the last
 * attempt is returned.
 */
int
chmod(const char *path, mode_t mode)
{
	/* bit sets to drop, in the order they are attempted */
	const mode_t drop[] = { S_ISGID, S_ISUID, S_ISUID | S_ISGID };
	unsigned int i;
	int res;

	res = __chmod(path, mode);
	if (res >= 0 || errno != EPERM)
		return res;
	if ((mode & (S_ISUID | S_ISGID)) == 0)
		return res;

	for (i = 0; i < sizeof(drop) / sizeof(drop[0]); i++) {
		/* only retry without bits the caller actually asked for */
		if ((mode & drop[i]) != drop[i])
			continue;
		res = __chmod(path, mode ^ drop[i]);
		if (res >= 0 || errno != EPERM)
			break;
	}
	return res;
}
/*
 * fchmod stub, ignore S_ISUID and/or S_ISGID on EPERM,
 * mandated for conformance.
 *
 * This is for UNIX03 only.
 *
 * If the first __fchmod() fails with EPERM and the mode asks for
 * S_ISUID and/or S_ISGID, the call is retried with S_ISGID cleared,
 * then with S_ISUID cleared, then with both cleared, skipping any step
 * whose bits were not requested.  The first result that is a success or
 * a failure other than EPERM is returned; otherwise the result of the
 * last attempt is returned.
 */
int
fchmod(int fd, mode_t mode)
{
	const mode_t special = mode & (S_ISUID | S_ISGID);
	int res = __fchmod(fd, mode);

	/* done unless this is an EPERM failure that involved set-id bits */
	if (!(res < 0 && errno == EPERM) || special == 0)
		return res;

	if (special & S_ISGID) {
		res = __fchmod(fd, mode ^ S_ISGID);
		if (!(res < 0 && errno == EPERM))
			return res;
	}
	if (special & S_ISUID) {
		res = __fchmod(fd, mode ^ S_ISUID);
		if (!(res < 0 && errno == EPERM))
			return res;
	}
	/* last resort: drop both bits, when both were requested */
	if (special == (S_ISUID | S_ISGID))
		res = __fchmod(fd, mode ^ (S_ISUID | S_ISGID));
	return res;
}
+ * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include + +#if __DARWIN_UNIX03 + +#include +#include +#include + +extern int __getrlimit(int resource, struct rlimit *rlp); + +/* + * getrlimit stub, for conformance, OR in _RLIMIT_POSIX_FLAG + * + * This is for UNIX03 only. + */ +int +getrlimit(int resource, struct rlimit *rlp) +{ + resource |= _RLIMIT_POSIX_FLAG; + return(__getrlimit(resource, rlp)); +} + +#endif /* __DARWIN_UNIX03 */ diff --git a/libsyscall/wrappers/unix03/mmap.c b/libsyscall/wrappers/unix03/mmap.c new file mode 100644 index 000000000..60c6bba32 --- /dev/null +++ b/libsyscall/wrappers/unix03/mmap.c @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include + +#if __DARWIN_UNIX03 + +#include +#include +#include + +void *__mmap(void *addr, size_t len, int prot, int flags, int fildes, off_t off); + +/* + * mmap stub, with preemptory failures due to extra parameter checking + * mandated for conformance. + * + * This is for UNIX03 only. 
+ */ +void * +mmap(void *addr, size_t len, int prot, int flags, int fildes, off_t off) +{ + /* + * Preemptory failures: + * + * o off is not a multiple of the page size + * o flags does not contain either MAP_PRIVATE or MAP_SHARED + * o len is zero + */ + extern void cthread_set_errno_self(int); + if ((off & PAGE_MASK) || + (((flags & MAP_PRIVATE) != MAP_PRIVATE) && + ((flags & MAP_SHARED) != MAP_SHARED)) || + (len == 0)) { + cthread_set_errno_self(EINVAL); + return(MAP_FAILED); + } + + return(__mmap(addr, len, prot, flags, fildes, off)); +} + +#endif /* __DARWIN_UNIX03 */ diff --git a/libsyscall/wrappers/unix03/setrlimit.c b/libsyscall/wrappers/unix03/setrlimit.c new file mode 100644 index 000000000..ebc872deb --- /dev/null +++ b/libsyscall/wrappers/unix03/setrlimit.c @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2005 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include + +#if __DARWIN_UNIX03 + +#include +#include +#include + +extern int __setrlimit(int resource, const struct rlimit *rlp); + +/* + * setrlimit stub, for conformance, OR in _RLIMIT_POSIX_FLAG + * + * This is for UNIX03 only. + */ +int +setrlimit(int resource, const struct rlimit *rlp) +{ + resource |= _RLIMIT_POSIX_FLAG; + return(__setrlimit(resource, rlp)); +} + +#endif /* __DARWIN_UNIX03 */ diff --git a/libsyscall/wrappers/unlink.c b/libsyscall/wrappers/unlink.c new file mode 100644 index 000000000..8f2144a82 --- /dev/null +++ b/libsyscall/wrappers/unlink.c @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
/*
 * unlink.c -- unlink() wrapper that counts successful removals.
 */

void __inc_remove_counter(void);
int __unlink(const char *path);

/*
 * Call __unlink() on path.  When it reports success (returns 0), call
 * __inc_remove_counter() before returning.  The value returned is always
 * the value __unlink() produced.
 */
int
unlink(const char *path)
{
	int status;

	status = __unlink(path);
	if (status != 0) {
		/* failure: leave the counter alone */
		return status;
	}

	__inc_remove_counter();
	return status;
}
# compat-symlinks.sh
#
# For every ACTION other than "installhdrs", (re)create the
# libsystem_mach.a symlink pointing at libsystem_kernel.a inside
# $DSTROOT/usr/local/lib/dyld.

case "x$ACTION" in
xinstallhdrs)
	# header-only install: nothing to link
	;;
*)
	ln -sf libsystem_kernel.a "$DSTROOT/usr/local/lib/dyld/libsystem_mach.a"
	;;
esac
#!/usr/bin/perl
#
# compile-syscalls.pl <source-list> <output-file>
#
# Reads a list of assembly sources (one path per line) from <source-list>,
# assembles each one with the SDK's cc for every architecture in $ARCHS,
# running several compiles in parallel, and then combines the resulting
# objects into the static library <output-file> with libtool.
#
# Environment: SDKROOT (passed to xcrun, used for the include path) and
# ARCHS (space separated architecture names).
#

use warnings;
use strict;

use Data::Dumper;
use File::Spec;
use IO::File;
use File::Basename ();

my $basename = File::Basename::basename($0);

# Print the expected command line and exit with status 1.
sub usage {
	print "usage: $basename <source-list> <output-file>\n";
	exit 1;
}

usage unless scalar(@ARGV) == 2;

my $sourceList = $ARGV[0];
my $outputFile = $ARGV[1];

my $f = IO::File->new($sourceList, 'r');
die "$basename: $sourceList: $!\n" unless defined($f);

my @objects;
# An unset ARCHS simply yields no -arch flags instead of a warning.
my @archs = split / /, (defined($ENV{"ARCHS"}) ? $ENV{"ARCHS"} : '');
my @sources = <$f>;
chomp @sources;

undef $f;

# compiler options
chomp(my $CC = `xcrun -sdk "$ENV{'SDKROOT'}" -find cc`);
my @CFLAGS = (
	"-x assembler-with-cpp",
	"-c",
	"-I".$ENV{"SDKROOT"}."/System/Library/Frameworks/System.framework/PrivateHeaders",
);

chomp(my $LIBTOOL = `xcrun -sdk "$ENV{'SDKROOT'}" -find libtool`);
my @LIBTOOLFLAGS = (
	"-static",
);

# architectures
for my $arch (@archs) {
	push(@CFLAGS, "-arch $arch");
}

# do each compile; $jobs is the number of child slots still available
my $jobs = `sysctl -n hw.ncpu` + 2;

for my $src (@sources) {
	if ($jobs == 0) {
		# all slots busy: reap one child before starting another
		if (wait != -1) {
			$jobs++;
		} else {
			printf "wait exited with -1 (no children) and exhausted allowed jobs. Exiting.\n";
			exit 1;
		}

		if ($? != 0) {
			printf "$CC exited with value %d\n", $? >> 8;
			exit 1;
		}
	}

	(my $o = $src) =~ s/\.s$/\.o/;
	my $compileCommand = "$CC " . join(' ', @CFLAGS) . " -o $o $src";
	# print, not printf: the command line is data, not a format string
	print $compileCommand . "\n";

	$jobs--;
	my $pid = fork();
	# fork returns undef on failure; without this check the parent would
	# take the "child" branch below and replace itself with the compiler.
	die "$basename: fork: $!\n" unless defined($pid);
	if ($pid == 0) {
		# exec only returns on failure; the child must not fall through
		# and continue the parent's loop.
		exec($compileCommand) or die "$basename: exec $CC: $!\n";
	}
	push(@objects, $o);
}

# reap the remaining children, stopping at the first failure
while (wait != -1) {
	if ($? != 0) {
		printf "$CC exited with value %d\n", $? >> 8;
		exit 1;
	}
}

printf "Finished assembly, beginning link.\n";

# final link

if (-f $outputFile) {
	unlink($outputFile);
}

my $linkCommand = "$LIBTOOL " . join(' ', @LIBTOOLFLAGS) . " -o $outputFile " . join(' ', @objects);

print $linkCommand . "\n";
system($linkCommand);
if ($? != 0) {
	# printf (was print): the message carries a %d conversion
	printf "$LIBTOOL exited with value %d\n", $? >> 8;
	exit 1;
}
#!/usr/bin/perl
##########################################################################
#
# % create-syscalls.pl syscalls.master custom-directory \
#       platforms-directory platform-name out-directory
#
# This script fills the out-directory with *.s files to create the
# double-underbar syscall stubs, plus a stubs.list file naming every
# generated source.  It reads the syscall.master file to get the symbol
# names and number of arguments, and whether an automatic
# (non-double-underbar) stub should be created for each system call.
#
# The custom-directory contains:
# 1. SYS.h - used by the automatically created *.s and custom files
# 2. custom.s - contains architecture specific additional system calls and
#    auxiliary routines (like cerror)
# 3. special case double-underbar stub files - which are copied into
#    the out-directory
#
# The platforms-directory contains a generic syscall.map and, under
# <platform-name>/<arch>/, a per-architecture syscall.map; both supply
# alias symbols for the stubs.
#
##########################################################################

use strict;
use File::Basename ();
use File::Copy ();
use File::Spec;
use IO::File;

my $MyName = File::Basename::basename($0);

my @CustomSrc = qw(custom.s);

my @Architectures = split /\s/, $ENV{"ARCHS"};
my @Copy = (qw(SYS.h), @CustomSrc);
my $CustomDir;
my $PlatformsDir;
my $PlatformName;
my $OutDir;
# size in bytes of known types (only used for i386)
my %TypeBytes = (
    'au_asid_t'		=> 4,
    'caddr_t'		=> 4,
    'gid_t'		=> 4,
    'id_t'		=> 4,
    'idtype_t'		=> 4,
    'int'		=> 4,
    'int32_t'		=> 4,
    'int64_t'		=> 8,
    'key_t'		=> 4,
    'long'		=> 4,
    'mach_port_name_t'	=> 4,
    'mode_t'		=> 4,
    'off_t'		=> 8,
    'pid_t'		=> 4,
    'semun_t'		=> 4,
    'sigset_t'		=> 4,
    'size_t'		=> 4,
    'socklen_t'		=> 4,
    'ssize_t'		=> 4,
    'u_int'		=> 4,
    'u_long'		=> 4,
    'uid_t'		=> 4,
    'uint32_t'		=> 4,
    'uint64_t'		=> 8,
    'user_addr_t'	=> 4,
    'user_long_t'	=> 4,
    'user_size_t'	=> 4,
    'user_ssize_t'	=> 4,
    'user_ulong_t'	=> 4,
);

# Moving towards storing all data in this hash, then we always know
# if data is aliased or not, or promoted or not.
#
# Each entry carries the same keys readMaster() produces, including the
# 'except' list of architectures on which another symbol's alias has
# taken over this symbol's name.
my %Symbols = (
    "quota" => {
        c_sym => "quota",
        syscall => "quota",
        asm_sym => "_quota",
        is_private => undef,
        is_custom => undef,
        nargs => 4,
        bytes => 0,
        aliases => {},
        except => [],
    },
    "setquota" => {
        c_sym => "setquota",
        syscall => "setquota",
        asm_sym => "_setquota",
        is_private => undef,
        is_custom => undef,
        nargs => 2,
        bytes => 0,
        aliases => {},
        except => [],
    },
    "syscall" => {
        c_sym => "syscall",
        syscall => "syscall",
        asm_sym => "_syscall",
        is_private => undef,
        is_custom => undef,
        nargs => 0,
        bytes => 0,
        aliases => {},
        except => [],
    },
);

# Die with the expected command line (five operands, matching the
# argument-count check in the main program below).
sub usage {
    die "Usage: $MyName syscalls.master custom-directory platforms-directory platform-name out-directory\n";
}

##########################################################################
# Read the syscall.master file and collect the system call names and number
# of arguments.  It looks for the NO_SYSCALL_STUB qualifier following the
# prototype to determine if no automatic stub should be created.
#
# For the #if lines in syscall.master, all macros are assumed to be defined,
# except COMPAT_GETFSSTAT (assumed undefined).
#
# Fills %Symbols, one entry per system call; dies on a line that has no
# prototype or on an argument type missing from %TypeBytes.
##########################################################################
sub readMaster {
    my $file = shift;
    local $_;
    my $f = IO::File->new($file, 'r');
    die "$MyName: $file: $!\n" unless defined($f);
    my $line = 0;
    # $skip: 0 outside any #if, 1 inside a kept branch, -1 inside a
    # skipped branch; #else flips the sign.
    my $skip = 0;
    while(<$f>) {
        $line++;
        if(/^#\s*endif/) {
            $skip = 0;
            next;
        }
        if(/^#\s*else/) {
            $skip = -$skip;
            next;
        }
        chomp;
        if(/^#\s*if\s+(\S+)$/) {
            $skip = ($1 eq 'COMPAT_GETFSSTAT') ? -1 : 1;
            next;
        }
        next if $skip < 0;
        next unless /^\d/;
        # keep only the text between the braces (the prototype)
        s/^[^{]*\{\s*//;
        s/\s*}.*$//; # }
        die "$MyName: no function prototype on line $line\n" unless length($_) > 0 && /;$/;
        my $no_syscall_stub = /\)\s*NO_SYSCALL_STUB\s*;/;
        my($name, $args) = /\s(\S+)\s*\(([^)]*)\)/;
        next if $name =~ /e?nosys/;
        $args =~ s/^\s+//;
        $args =~ s/\s+$//;
        my $argbytes = 0;
        my $nargs = 0;
        if($args ne '' && $args ne 'void') {
            my @a = split(',', $args);
            $nargs = scalar(@a);
            # Calculate the size of all the arguments (only used for i386)
            for my $type (@a) {
                $type =~ s/\s*\w+$//; # remove the argument name
                if($type =~ /\*$/) {
                    $argbytes += 4; # a pointer type
                } else {
                    $type =~ s/^.*\s//; # remove any type qualifier, like unsigned
                    my $b = $TypeBytes{$type};
                    die "$MyName: $name: unknown type '$type'\n" unless defined($b);
                    $argbytes += $b;
                }
            }
        }
        $Symbols{$name} = {
            c_sym => $name,
            syscall => $name,
            asm_sym => $no_syscall_stub ? "___$name" : "_$name",
            is_private => $no_syscall_stub,
            is_custom => undef,
            nargs => $nargs,
            bytes => $argbytes,
            aliases => {},
            except => [],
        };
    }
}

##########################################################################
# For every symbol that has a hand-written __<name>.s in $dir, record the
# file in is_custom.  A symbol that was public until now is made private:
# its public asm name becomes an alias (for every architecture) of the
# double-underbar name.
##########################################################################
sub checkForCustomStubs {
    my ($dir) = @_;

    my ($c_sym_name, $sym);
    while (($c_sym_name, $sym) = each %Symbols) {
        my $source = "__".$$sym{c_sym}.".s";
        my $custom = File::Spec->join($dir, $source);
        next unless -f $custom;

        $$sym{is_custom} = $source;
        if (!$$sym{is_private}) {
            foreach my $subarch (@Architectures) {
                (my $arch = $subarch) =~ s/arm(.*)/arm/;
                $$sym{aliases}{$arch} = [] unless $$sym{aliases}{$arch};
                push(@{$$sym{aliases}{$arch}}, $$sym{asm_sym});
            }
            $$sym{asm_sym} = "__".$$sym{asm_sym};
            $$sym{is_private} = 1;
        }
    }
}

##########################################################################
# Read "<alias> <target>" pairs from the per-architecture syscall.map and
# from the generic syscall.map, and attach each alias to the symbol whose
# asm name matches the target.  Dies on a duplicate alias.  When an alias
# has the same name as another symbol's asm name, that symbol gets the
# architecture added to its 'except' list.
##########################################################################
sub readAliases {
    my ($platformDir, $platformName) = @_;
    my $genericMap = File::Spec->join($platformDir, "syscall.map");

    # asm name -> %Symbols key, as it stands before any alias is added
    my %sym_to_c;
    foreach my $k (keys %Symbols) {
        $sym_to_c{$Symbols{$k}{asm_sym}} = $k;
    }

    # unique architecture names, with every arm* variant folded to "arm"
    my @a = ();
    for my $arch (@Architectures) {
        (my $new_arch = $arch) =~ s/arm(.*)/arm/g;
        push(@a, $new_arch) unless grep { $_ eq $new_arch } @a;
    }

    foreach my $arch (@a) {
        my $syscallFile = File::Spec->join($platformDir, $platformName, $arch, "syscall.map");

        my @files = ();
        push(@files, IO::File->new($syscallFile, 'r'));
        die "$MyName: $syscallFile: $!\n" unless defined($files[$#files]);
        push(@files, IO::File->new($genericMap, 'r'));
        die "$MyName: $genericMap: $!\n" unless defined($files[$#files]);

        foreach my $f (@files) {
            while (<$f>) {
                next if /^#/;
                chomp;

                my ($alias, $target_symbol) = split;
                if (defined($target_symbol)) {
                    foreach my $sym (values %Symbols) {
                        # I've eliminated most of the ugly from this script except
                        # the need to try stripping underbars here.
                        if ($$sym{is_private}) {
                            next unless $$sym{asm_sym} eq $target_symbol;
                        } else {
                            (my $target = $target_symbol) =~ s/^__//;
                            next unless ($$sym{asm_sym} eq $target || $$sym{asm_sym} eq $target_symbol);
                        }
                        $$sym{aliases}{$arch} = [] unless $$sym{aliases}{$arch};

                        die "$MyName: $arch $$sym{asm_sym} -> $alias: Duplicate alias.\n" if grep { $_ eq $alias } @{$$sym{aliases}{$arch}};
                        push(@{$$sym{aliases}{$arch}}, $alias);

                        # last thing to do, if we aliased over a first class symbol, we need
                        # to mark it
                        my $c = $sym_to_c{$alias};
                        if (defined($c) && $Symbols{$c}) {
                            push(@{$Symbols{$c}{except}}, $arch);
                        }
                    }
                }
            }
        }
    }
}

##########################################################################
# Write the generated stub for one symbol to the open handle $f.  We define
# the macro __SYSCALL_32BIT_ARG_BYTES so that SYS.h could use that to define
# __SYSCALL dependent on the arguments' total size.
#
# The stub is emitted under an #if covering the architectures that are not
# in the symbol's 'except' list; for a public symbol excepted on some
# architectures, the #else branch emits it under its double-underbar name.
##########################################################################
sub writeStubForSymbol {
    my ($f, $symbol) = @_;

    my @conditions;
    for my $subarch (@Architectures) {
        (my $arch = $subarch) =~ s/arm(.*)/arm/;
        push(@conditions, "defined(__${arch}__)") unless grep { $_ eq $arch } @{$$symbol{except}};
    }

    print $f "#define __SYSCALL_32BIT_ARG_BYTES $$symbol{bytes}\n";
    print $f "#include \"SYS.h\"\n\n";
    if (scalar(@conditions)) {
        print $f "#if " . join(" || ", @conditions) . "\n";
        printf $f "__SYSCALL(%s, %s, %d)\n", $$symbol{asm_sym}, $$symbol{syscall}, $$symbol{nargs};
        if (!$$symbol{is_private} && (scalar(@conditions) < scalar(@Architectures))) {
            print $f "#else\n";
            printf $f "__SYSCALL(%s, %s, %d)\n", "__".$$symbol{asm_sym}, $$symbol{syscall}, $$symbol{nargs};
        }
        print $f "#endif\n\n";
    } else {
        # actually this isnt an inconsistency. kernel can expose what it wants but if all our arches
        # override it we need to honour that.
    }
}

##########################################################################
# Write the .globl/.set directives for every alias of one symbol to the
# open handle $f, one #if block per architecture that has aliases.  On an
# architecture in the symbol's 'except' list the aliases point at the
# double-underbar form of the asm name.
##########################################################################
sub writeAliasesForSymbol {
    my ($f, $symbol) = @_;

    foreach my $subarch (@Architectures) {
        (my $arch = $subarch) =~ s/arm(.*)/arm/;

        next unless scalar($$symbol{aliases}{$arch});

        print $f "#if defined(__${arch}__)\n";
        foreach my $alias_sym (@{$$symbol{aliases}{$arch}}) {
            my $sym = (grep { $_ eq $arch } @{$$symbol{except}}) ? "__".$$symbol{asm_sym} : $$symbol{asm_sym};

            print $f "\t.globl\t$alias_sym\n";
            print $f "\t.set\t$alias_sym, $sym\n";
        }
        print $f "#endif\n\n";
    }
}

usage() unless scalar(@ARGV) == 5;
$CustomDir = $ARGV[1];
die "$MyName: $CustomDir: No such directory\n" unless -d $CustomDir;
$PlatformsDir = $ARGV[2];
die "$MyName: $PlatformsDir: No such directory\n" unless -d $PlatformsDir;
$PlatformName = $ARGV[3];
die "$MyName: $PlatformsDir/$PlatformName: No such directory\n" unless -d "$PlatformsDir/$PlatformName";
$OutDir = $ARGV[4];
die "$MyName: $OutDir: No such directory\n" unless -d $OutDir;

readMaster($ARGV[0]);
checkForCustomStubs($CustomDir);
readAliases($PlatformsDir, $PlatformName);

##########################################################################
# copy the files specified in @Copy from the $CustomDir to $OutDir
##########################################################################
for(@Copy) {
    my $custom = File::Spec->join($CustomDir, $_);
    my $path = File::Spec->join($OutDir, $_);
    print "Copy $custom -> $path\n";
    File::Copy::copy($custom, $path) || die "$MyName: copy($custom, $path): $!\n";
}

##########################################################################
# make all the *.s files
##########################################################################
my @src;
my($k, $sym);
while (($k, $sym) = each %Symbols)
{
    my $srcname = $$sym{asm_sym} . ".s";
    my $outpath = File::Spec->join($OutDir, $srcname);

    if ($$sym{is_custom}) {
        my $custom = File::Spec->join($CustomDir, $$sym{is_custom});
        # A failed copy must stop the build: the append below would
        # otherwise create a stub file that holds nothing but aliases.
        File::Copy::copy($custom, $outpath) || die "$MyName: copy($custom, $outpath): $!\n";
        print "Copied $outpath\n";

        print "Writing aliases for $srcname\n";
        my $f = IO::File->new($outpath, 'a');
        die "$MyName: $outpath: $!\n" unless defined($f);
        writeAliasesForSymbol($f, $sym);
        undef $f;
    } else {
        my $f = IO::File->new($outpath, 'w');
        die "$MyName: $outpath: $!\n" unless defined($f);

        print "Creating $outpath\n";
        writeStubForSymbol($f, $sym);
        writeAliasesForSymbol($f, $sym);
        undef $f;
    }
    push(@src, $srcname);
}

##########################################################################
# create the stubs.list file from the list of files in @src and @CustomSrc
##########################################################################
my $path = File::Spec->join($OutDir, 'stubs.list');
my $f = IO::File->new($path, 'w');
die "$MyName: $path: $!\n" unless defined($f);
my @sources = sort(@src, @CustomSrc);
for my $s (@sources) {
    # print, not printf: a path is data, not a format string
    print $f File::Spec->join($OutDir, $s) . "\n";
}
undef $f;
undef $path;
The rights granted to you under the License +# may not be used to create, or enable the creation or redistribution of, +# unlawful or unlicensed copies of an Apple operating system, or to +# circumvent, violate, or enable the circumvention or violation of, any +# terms of an Apple operating system software license agreement. +# +# Please obtain a copy of the License at +# http://www.opensource.apple.com/apsl/ and read it before using this file. +# +# The Original Code and all software distributed under the License are +# distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER +# EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +# INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. +# Please see the License for the specific language governing rights and +# limitations under the License. +# +# @APPLE_OSREFERENCE_LICENSE_HEADER_END@ +# + +# build inside OBJROOT +cd $OBJROOT + +# check if we're building for the simulator +[ "$RC_ProjectName" == "Libmach_Sim" ] && DSTROOT="$DSTROOT$SDKROOT" + +MIG=`xcrun -sdk "$SDKROOT" -find mig` +MIGCC=`xcrun -sdk "$SDKROOT" -find cc` +export MIGCC +MIG_DEFINES="-DLIBSYSCALL_INTERFACE" +MIG_HEADER_DST="$DSTROOT/usr/include/mach" +SERVER_HEADER_DST="$DSTROOT/usr/include/servers" +# from old Libsystem makefiles +MACHINE_ARCH=`echo $ARCHS | cut -d' ' -f 1` +SRC="$SRCROOT/mach" + +MIGS="clock.defs + clock_priv.defs + clock_reply.defs + exc.defs + host_priv.defs + host_security.defs + ledger.defs + lock_set.defs + mach_port.defs + mach_host.defs + mach_vm.defs + processor.defs + processor_set.defs + vm_map.defs" + +MIGS_ARCH="thread_act.defs + task.defs" + +SERVER_HDRS="key_defs.h + ls_defs.h + netname_defs.h + nm_defs.h" + +# install /usr/include/server headers +mkdir -p $SERVER_HEADER_DST +for hdr in $SERVER_HDRS; do + install -o 0 -c -m 444 $SRC/servers/$hdr $SERVER_HEADER_DST +done + +# special case because we only 
have one to do here +$MIG -arch $MACHINE_ARCH -header "$SERVER_HEADER_DST/netname.h" $SRC/servers/netname.defs + +# install /usr/include/mach mig headers + +mkdir -p $MIG_HEADER_DST + +for mig in $MIGS; do + MIG_NAME=`basename $mig .defs` + $MIG -arch $MACHINE_ARCH -cc $MIGCC -header "$MIG_HEADER_DST/$MIG_NAME.h" $MIG_DEFINES $SRC/$mig +done + +ARCHS=`echo $ARCHS | sed -e 's/armv./arm/g'` +for arch in $ARCHS; do + MIG_ARCH_DST="$MIG_HEADER_DST/$arch" + + mkdir -p $MIG_ARCH_DST + + for mig in $MIGS_ARCH; do + MIG_NAME=`basename $mig .defs` + $MIG -arch $MACHINE_ARCH -cc $MIGCC -header "$MIG_ARCH_DST/$MIG_NAME.h" $MIG_DEFINES $SRC/$mig + done +done diff --git a/makedefs/MakeInc.cmd b/makedefs/MakeInc.cmd index 06457b6f2..5fea21d30 100644 --- a/makedefs/MakeInc.cmd +++ b/makedefs/MakeInc.cmd @@ -24,21 +24,88 @@ else endif SDKROOT ?= / +HOST_SDKROOT ?= / -CC := $(XCRUN) -sdk $(SDKROOT) cc -CXX := $(XCRUN) -sdk $(SDKROOT) g++ -MIG := $(XCRUN) -sdk $(SDKROOT) mig +ifeq ($(PLATFORM),) + export PLATFORM := $(shell xcodebuild -sdk $(SDKROOT) -version PlatformPath | head -1 | sed 's,^.*/\([^/]*\)\.platform$$,\1,') + ifeq ($(PLATFORM),) + export PLATFORM := MacOSX + endif +endif + +# CC/CXX get defined by make(1) by default, so we can't check them +# against the empty string to see if they haven't been set +ifeq ($(origin CC),default) +ifneq ($(findstring iPhone,$(PLATFORM)),) + export CC := $(shell $(XCRUN) -sdk $(SDKROOT) -find gcc-4.2) +else + export CC := $(shell $(XCRUN) -sdk $(SDKROOT) -find cc) +endif +endif +ifeq ($(origin CXX),default) +ifneq ($(findstring iPhone,$(PLATFORM)),) + export CXX := $(shell $(XCRUN) -sdk $(SDKROOT) -find g++-4.2) +else + export CXX := $(shell $(XCRUN) -sdk $(SDKROOT) -find c++) +endif +endif +ifeq ($(MIG),) + export MIG := $(shell $(XCRUN) -sdk $(SDKROOT) -find mig) +endif ifeq ($(MIGCC),) - export MIGCC := $(shell $(XCRUN) -sdk $(SDKROOT) -find cc) + export MIGCC := $(CC) endif ifeq ($(RELPATH),) export RELPATH := $(shell $(XCRUN) -sdk 
$(SDKROOT) -find relpath) endif -SEG_HACK := $(XCRUN) -sdk $(SDKROOT) setsegname -KEXT_CREATE_SYMBOL_SET := $(XCRUN) -sdk $(SDKROOT) kextsymboltool +ifeq ($(STRIP),) + export STRIP := $(shell $(XCRUN) -sdk $(SDKROOT) -find strip) +endif +ifeq ($(LIPO),) + export LIPO := $(shell $(XCRUN) -sdk $(SDKROOT) -find lipo) +endif +ifeq ($(LIBTOOL),) + export LIBTOOL := $(shell $(XCRUN) -sdk $(SDKROOT) -find libtool) +endif +ifeq ($(NM),) + export NM := $(shell $(XCRUN) -sdk $(SDKROOT) -find nm) +endif +ifeq ($(UNIFDEF),) + export UNIFDEF := $(shell $(XCRUN) -sdk $(SDKROOT) -find unifdef) +endif +ifeq ($(DECOMMENT),) + export DECOMMENT := $(shell $(XCRUN) -sdk $(SDKROOT) -find decomment) +endif +ifeq ($(DSYMUTIL),) + export DSYMUTIL := $(shell $(XCRUN) -sdk $(SDKROOT) -find dsymutil) +endif +ifeq ($(CTFCONVERT),) + export CTFCONVERT := $(shell $(XCRUN) -sdk $(SDKROOT) -find ctfconvert) +endif +ifeq ($(CTFMERGE),) + export CTFMERGE := $(shell $(XCRUN) -sdk $(SDKROOT) -find ctfmerge) +endif +ifeq ($(CTFSCRUB),) + export CTFSCRUB := $(shell $(XCRUN) -sdk $(SDKROOT) -find ctfdump) -r +endif +ifeq ($(NMEDIT),) + export NMEDIT := $(shell $(XCRUN) -sdk $(SDKROOT) -find nmedit) +endif -MD = /usr/bin/md +# Platform-specific tools +ifneq ($(findstring iPhone,$(PRODUCT)),) +ifeq ($(IPHONEOS_OPTIMIZE),) + export IPHONEOS_OPTIMIZE := $(shell $(XCRUN) -sdk $(SDKROOT) -find iphoneos-optimize) +endif +endif + +# Scripts or tools we build ourselves +SEG_HACK := $(OBJROOT)/SETUP/setsegname/setsegname +KEXT_CREATE_SYMBOL_SET := $(OBJROOT)/SETUP/kextsymboltool/kextsymboltool +NEWVERS = $(SRCROOT)/config/newvers.pl +# Standard BSD tools +MD = /usr/bin/md RM = /bin/rm -f CP = /bin/cp MV = /bin/mv @@ -47,23 +114,43 @@ CAT = /bin/cat MKDIR = /bin/mkdir -p FIND = /usr/bin/find INSTALL = /usr/bin/install - TAR = /usr/bin/gnutar -STRIP = $(XCRUN) -sdk $(SDKROOT) strip -LIPO = $(XCRUN) -sdk $(SDKROOT) lipo -LIBTOOL = $(XCRUN) -sdk $(SDKROOT) libtool -NM = $(XCRUN) -sdk $(SDKROOT) nm - BASENAME = 
/usr/bin/basename TR = /usr/bin/tr -UNIFDEF = $(XCRUN) -sdk $(SDKROOT) unifdef -DECOMMENT = /usr/local/bin/decomment -NEWVERS = $(SRCROOT)/config/newvers.pl +# Platform-specific tools +ifeq (iPhoneOS,$(PLATFORM)) +ifeq ($(IPHONEOS_OPTIMIZE),) + export IPHONEOS_OPTIMIZE := $(shell $(XCRUN) -sdk $(SDKROOT) -find iphoneos-optimize || echo /usr/bin/true) +endif +endif + +CTFINSERT = $(XCRUN) -sdk $(SDKROOT) ctf_insert -DSYMUTIL = $(XCRUN) -sdk $(SDKROOT) dsymutil -CTFCONVERT = $(XCRUN) -sdk $(SDKROOT) ctfconvert -CTFMERGE = $(XCRUN) -sdk $(SDKROOT) ctfmerge -CTFSCRUB = $(XCRUN) -sdk $(SDKROOT) ctfdump -r +# +# Command to generate host binaries. Intentionally not +# $(CC), which controls the target compiler +# +ifeq ($(HOST_CC),) + export HOST_CC := $(shell $(XCRUN) -sdk $(HOST_SDKROOT) -find cc) +endif +ifeq ($(HOST_FLEX),) + export HOST_FLEX := $(shell $(XCRUN) -sdk $(HOST_SDKROOT) -find flex) +endif +ifeq ($(HOST_BISON),) + export HOST_BISON := $(shell $(XCRUN) -sdk $(HOST_SDKROOT) -find bison) +endif +ifeq ($(HOST_CODESIGN),) + export HOST_CODESIGN := $(shell $(XCRUN) -sdk $(HOST_SDKROOT) -find codesign) +endif + +# +# Command to build libkmod.a/libkmodc++.a, which are +# linked into kext binaries, and should be built as if +# they followed system-wide policies +# +ifeq ($(LIBKMOD_CC),) + export LIBKMOD_CC := $(shell $(XCRUN) -sdk $(SDKROOT) -find cc) +endif # vim: set ft=make: diff --git a/makedefs/MakeInc.def b/makedefs/MakeInc.def index 984c4f316..92d80379f 100644 --- a/makedefs/MakeInc.def +++ b/makedefs/MakeInc.def @@ -37,14 +37,14 @@ endif # Architecture options # ifndef SUPPORTED_ARCH_CONFIGS -export SUPPORTED_ARCH_CONFIGS = PPC I386 X86_64 ARM +export SUPPORTED_ARCH_CONFIGS = I386 X86_64 endif ifndef ARCH_CONFIGS ifdef RC_ARCHS -export ARCH_CONFIGS := $(shell printf "%s" "$(RC_ARCHS)" | $(TR) a-z A-Z) +export ARCH_CONFIGS := $(shell printf "%s" "$(RC_ARCHS)" | $(TR) a-z A-Z | sed -e 's/ARMV./ARM/g') else -export ARCH_CONFIGS := $(shell arch | $(TR) a-z A-Z) 
+export ARCH_CONFIGS := $(shell arch | $(TR) a-z A-Z | sed -e 's/ARMV./ARM/g') endif endif ifdef ARCH_CONFIG @@ -53,6 +53,16 @@ export ARCH_CONFIG_LC := $(shell printf "%s" "$(ARCH_CONFIG)" | $(TR) A-Z a-z) endif endif +# +# Platform options +# +ifndef SUPPORTED_PLATFORMS +export SUPPORTED_PLATFORMS = MacOSX iPhoneOS iPhoneSimulator +endif + +# PLATFORM is set earlier in MakeInc.cmd, closer to where decisions about +# platform tools are made + # # Kernel Configuration options # @@ -94,6 +104,11 @@ export MACHINE_CONFIG = DEFAULT endif + +ifndef SUPPORTED_MACHINE_CONFIGS +export SUPPORTED_MACHINE_CONFIGS = DEFAULT +endif + # # Target configuration options. NOTE - target configurations will # override ARCH_CONFIGS and KERNEL_CONFIGS. @@ -105,7 +120,7 @@ endif # seperated by whitespace. # # Example: -# TARGET_CONFIGS="release ppc default debug i386 default release arm MX31ADS" +# TARGET_CONFIGS="release x86_64 default debug i386 default release arm MX31ADS" # Parameters may be in upper or lower case (they are converted to upper). # # "default" parameter is a special case. It means use the default value for @@ -113,10 +128,9 @@ endif # # default kernel configuration = DEFAULT_KERNEL_CONFIG # default architecture configuration = system architecture where you are running make. -# default machine configuration for ppc = none at this time. # default machine configuration for i386 = none at this time. # default machine configuration for x86_64 = none at this time. -# default machine configuration for arm = "S5L8900X". +# default machine configuration for arm = "S5L8920X". 
# ifndef TARGET_CONFIGS_UC ifdef TARGET_CONFIGS @@ -136,24 +150,55 @@ endif export MACHINE_CONFIG_LC := $(shell printf "%s" "$(MACHINE_CONFIG)" | $(TR) A-Z a-z) export KERNEL_CONFIG_LC := $(shell printf "%s" "$(KERNEL_CONFIG)" | $(TR) A-Z a-z) +# +# Validate configuration options +# +ifneq ($(ARCH_CONFIG),) +ifeq ($(filter $(ARCH_CONFIG),$(SUPPORTED_ARCH_CONFIGS)),) +$(error Unsupported ARCH_CONFIG $(ARCH_CONFIG)) +endif +endif + +ifneq ($(KERNEL_CONFIG),) +ifeq ($(filter $(KERNEL_CONFIG),$(SUPPORTED_KERNEL_CONFIGS)),) +$(error Unsupported KERNEL_CONFIG $(KERNEL_CONFIG)) +endif +endif + +ifneq ($(MACHINE_CONFIG),) +ifeq ($(filter $(MACHINE_CONFIG),$(SUPPORTED_MACHINE_CONFIGS)),) +$(error Unsupported MACHINE_CONFIG $(MACHINE_CONFIG)) +endif +endif + +ifneq ($(PLATFORM),) +ifeq ($(filter $(PLATFORM),$(SUPPORTED_PLATFORMS)),) +$(error Unsupported PLATFORM $(PLATFORM)) +endif +endif + # # Kernel Configuration to install # -# supported install architecture : PPC I386 X86_64 ARM +# supported install architecture : I386 X86_64 ARM # export INSTALL_TYPE = $(DEFAULT_KERNEL_CONFIG) ifndef INSTALL_ARCHS -export INSTALL_ARCHS = $(strip $(foreach my_config, $(SUPPORTED_ARCH_CONFIGS), $(findstring $(my_config), $(TARGET_CONFIGS_UC)))) +export INSTALL_ARCHS = $(strip $(foreach my_config, $(SUPPORTED_ARCH_CONFIGS), $(filter $(TARGET_CONFIGS_UC),$(my_config)))) export INSTALL_ARCHS_LC := $(shell printf "%s" "$(INSTALL_ARCHS)" | $(TR) A-Z a-z) endif export INSTALL_ARCH_DEFAULT = $(firstword $(INSTALL_ARCHS)) +ifeq ($(INSTALL_ARCH_DEFAULT),) +$(error Could not determine INSTALL_ARCH_DEFAULT) +endif # # Standard defines list # -export DEFINES = -DAPPLE -DKERNEL -DKERNEL_PRIVATE -DXNU_KERNEL_PRIVATE -DPRIVATE -D__MACHO__=1 -Dvolatile=__volatile $(IDENT) +export DEFINES = -DAPPLE -DKERNEL -DKERNEL_PRIVATE -DXNU_KERNEL_PRIVATE \ + -DPRIVATE -D__MACHO__=1 -Dvolatile=__volatile $(IDENT) # # Compiler command @@ -166,20 +211,35 @@ KC++ := $(CXX) # CWARNFLAGS_STD = \ - -Wall -Wno-format-y2k 
-W -Wstrict-prototypes -Wmissing-prototypes \ - -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch \ - -Wshadow -Wcast-align -Wchar-subscripts -Winline \ - -Wnested-externs -Wredundant-decls + -Wall -Werror -Wno-format-y2k -Wextra -Wstrict-prototypes \ + -Wmissing-prototypes -Wpointer-arith -Wreturn-type -Wcast-qual \ + -Wwrite-strings -Wswitch -Wshadow -Wcast-align -Wchar-subscripts \ + -Winline -Wnested-externs -Wredundant-decls -Wextra-tokens + +# Certain warnings are non-fatal (8474835) +CWARNFLAGS_STD += -Wno-error=cast-align +# Can be overridden in Makefile.template or Makefile.$arch export CWARNFLAGS ?= $(CWARNFLAGS_STD) +define add_perfile_cflags +$(1)_CWARNFLAGS_ADD += $2 +endef + CXXWARNFLAGS_STD = \ - -Wall -Wno-format-y2k -W \ - -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch \ - -Wcast-align -Wchar-subscripts -Wredundant-decls + -Wall -Werror -Wno-format-y2k -Wextra -Wpointer-arith -Wreturn-type \ + -Wcast-qual -Wwrite-strings -Wswitch -Wcast-align -Wchar-subscripts \ + -Wredundant-decls -Wextra-tokens + +# Certain warnings are non-fatal (8474835) +CXXWARNFLAGS_STD += -Wno-error=cast-align +# Can be overridden in Makefile.template or Makefile.$arch export CXXWARNFLAGS ?= $(CXXWARNFLAGS_STD) +define add_perfile_cxxflags +$(1)_CXXWARNFLAGS_ADD += $2 +endef # # Setup for parallel sub-makes based on 2 times number of logical CPUs @@ -191,12 +251,10 @@ endif # # Default ARCH_FLAGS, for use with compiler/linker/assembler/mig drivers -ARCH_FLAGS_PPC = -arch ppc ARCH_FLAGS_I386 = -arch i386 ARCH_FLAGS_X86_64 = -arch x86_64 ARCH_FLAGS_ARM = $($(addsuffix $(MACHINE_CONFIG),ARCH_FLAGS_ARM_)) -ARCH_FLAGS_ALL_PPC = $(ARCH_FLAGS_PPC) ARCH_FLAGS_ALL_I386 = $(ARCH_FLAGS_I386) ARCH_FLAGS_ALL_X86_64 = $(ARCH_FLAGS_X86_64) ARCH_FLAGS_ALL_ARM = -arch arm @@ -209,14 +267,16 @@ ifdef RC_CFLAGS export OTHER_CFLAGS = $(subst $(addprefix -arch ,$(RC_ARCHS)),,$(RC_CFLAGS)) endif +export DSYMRESDIR = ./Contents/Resources/ export DSYMBUILDDIR = 
./Contents/Resources/DWARF/ # # We must not use -fno-keep-inline-functions, or it will remove the dtrace # probes from the kernel. # -export CFLAGS_GEN = -static $(DEBUG_CFLAGS) -nostdinc -nostdlib \ - -fno-builtin -finline -fno-common -msoft-float \ +export CFLAGS_GEN = -static $(DEBUG_CFLAGS) -nostdinc \ + -freorder-blocks \ + -fno-builtin -fno-common -msoft-float \ -fsigned-bitfields -fno-stack-protector $(OTHER_CFLAGS) ifeq ($(BUILD_STABS),1) @@ -234,15 +294,14 @@ export CFLAGS_DEVELOPMENT = export CFLAGS_DEBUG = export CFLAGS_PROFILE = -pg -export CFLAGS_PPC = -Dppc -DPPC -D__PPC__ -DPAGE_SIZE_FIXED \ - -mno-altivec -force_cpusubtype_ALL export CFLAGS_I386 = -Di386 -DI386 -D__I386__ \ - -DPAGE_SIZE_FIXED -force_cpusubtype_ALL + -DPAGE_SIZE_FIXED export CFLAGS_X86_64 = -Dx86_64 -DX86_64 -D__X86_64__ -DLP64 \ -DPAGE_SIZE_FIXED -mkernel export CFLAGS_ARM = -Darm -DARM -D__ARM__ -DPAGE_SIZE_FIXED \ -fno-strict-aliasing -fno-keep-inline-functions + ifeq (-arch armv7,$(ARCH_FLAGS_ARM)) CFLAGS_ARM += -mthumb endif @@ -256,21 +315,16 @@ ifeq (-arch xscale,$(ARCH_FLAGS_ARM)) CFLAGS_ARM += -mthumb endif -export CFLAGS_RELEASEPPC = -O2 -mcpu=750 -mmultiple -export CFLAGS_DEVELOPMENTPPC = -O2 -mcpu=750 -mmultiple -export CFLAGS_DEBUGPPC = -O2 -mcpu=750 -mmultiple -export CFLAGS_PROFILEPPC = -O2 -mcpu=750 -mmultiple +export CFLAGS_RELEASEI386 = -O2 +export CFLAGS_DEVELOPMENTI386 = -O2 +export CFLAGS_DEBUGI386 = -O0 +export CFLAGS_PROFILEI386 = -O2 -export CFLAGS_RELEASEI386 = -Os -export CFLAGS_DEVELOPMENTI386 = -Os -export CFLAGS_DEBUGI386 = -Os -export CFLAGS_PROFILEI386 = -Os - -export CFLAGS_RELEASEX86_64 = -Os -export CFLAGS_DEVELOPMENTX86_64 = -Os +export CFLAGS_RELEASEX86_64 = -O2 +export CFLAGS_DEVELOPMENTX86_64 = -O2 # No space optimization for the DEBUG kernel for the benefit of gdb: export CFLAGS_DEBUGX86_64 = -O0 -export CFLAGS_PROFILEX86_64 = -Os +export CFLAGS_PROFILEX86_64 = -O2 export CFLAGS_RELEASEARM = -O2 export CFLAGS_DEVELOPMENTARM = -O2 @@ -285,14 
+339,35 @@ export CFLAGS = $(CFLAGS_GEN) \ $($(addsuffix $(ARCH_CONFIG), $(addsuffix $(KERNEL_CONFIG),CFLAGS_))) \ $(DEFINES) +# # Default C++ flags # -CXXFLAGS_GEN = -fno-rtti -fno-exceptions -fcheck-new -fapple-kext + +OTHER_CXXFLAGS = + +CXXFLAGS_GEN = -fno-rtti -fno-exceptions -fcheck-new -fapple-kext \ + $(OTHER_CXXFLAGS) CXXFLAGS = $(CXXFLAGS_GEN) \ $($(addsuffix $(ARCH_CONFIG),CXXFLAGS_)) \ $($(addsuffix $(KERNEL_CONFIG),CXXFLAGS_)) + +# +# Support for LLVM Link Time Optimization (LTO) +# + +ifeq ($(BUILD_LTO),1) +export CFLAGS_GEN += -flto +export CXXFLAGS_GEN += -flto +export BUILD_MACHO_OBJ = 0 +export BUILD_LTO = 1 +else +export BUILD_MACHO_OBJ = 1 +export BUILD_LTO = 0 +endif + + # # Assembler command # @@ -309,7 +384,6 @@ export SFLAGS_DEVELOPMENT = export SFLAGS_DEBUG = export SFLAGS_PROFILE = -export SFLAGS_PPC = $(CFLAGS_PPC) -force_cpusubtype_ALL export SFLAGS_I386 = $(CFLAGS_I386) export SFLAGS_ARM = $(CFLAGS_ARM) export SFLAGS_X86_64 = $(CFLAGS_X86_64) @@ -332,6 +406,7 @@ LD = $(KC++) -nostdlib export LDFLAGS_KERNEL_GEN = \ -static \ + -nostdlib \ -fapple-kext \ -Wl,-e,__start \ -Wl,-sectalign,__TEXT,__text,0x1000 \ @@ -342,10 +417,12 @@ export LDFLAGS_KERNEL_GEN = \ -Wl,-sectcreate,__PRELINK_STATE,__kexts,/dev/null \ -Wl,-sectcreate,__PRELINK_INFO,__info,/dev/null -# Availability of DWARF allows DTrace CTF (compressed type format) to be constructed +# Availability of DWARF allows DTrace CTF (compressed type format) to be constructed. +# ctf_insert creates the CTF section. It needs reserved padding in the +# headers for the load command segment and the CTF section structures. 
ifeq ($(BUILD_DWARF),1) export LDFLAGS_KERNEL_GEN += \ - -Wl,-sectcreate,__CTF,__ctf,/dev/null + -Wl,-headerpad,152 endif export LDFLAGS_KERNEL_RELEASE = @@ -353,14 +430,6 @@ export LDFLAGS_KERNEL_DEVELOPMENT = export LDFLAGS_KERNEL_DEBUG = export LDFLAGS_KERNEL_PROFILE = -export LDFLAGS_KERNEL_PPC = \ - -force_cpusubtype_ALL \ - -Wl,-new_linker \ - -Wl,-pagezero_size,0x0 \ - -Wl,-segaddr,__VECTORS,0x0 \ - -Wl,-segaddr,__HIB,0x7000 \ - -Wl,-segaddr,__TEXT,0xe000 - export LDFLAGS_KERNEL_RELEASEI386 = \ -Wl,-new_linker \ -Wl,-pagezero_size,0x0 \ @@ -393,13 +462,13 @@ export LDFLAGS_KERNEL_PROFILEX86_64 = $(LDFLAGS_KERNEL_RELEASEX86_64) export LDFLAGS_KERNEL_RELEASEARM = \ -Wl,-new_linker \ -Wl,-pagezero_size,0x0 \ - -Wl,-image_base,0xC0001000 \ + -Wl,-image_base,0x80001000 \ -Wl,-exported_symbols_list,$(TARGET)/kernel-kpi.exp export LDFLAGS_KERNEL_DEVELOPMENTARM = \ -Wl,-new_linker \ -Wl,-pagezero_size,0x0 \ - -Wl,-image_base,0xC0001000 + -Wl,-image_base,0x80001000 export LDFLAGS_KERNEL_DEBUGARM = $(LDFLAGS_KERNEL_DEVELOPMENTARM) @@ -417,11 +486,6 @@ export LDFLAGS_KERNEL = $(LDFLAGS_KERNEL_GEN) \ # export LD_KERNEL_LIBS = -lcc_kext -# -# Command to generate host binaries. 
Intentionally not -# $(CC), which controls the target compiler -# -HOST_CC = cc # # Default INCFLAGS @@ -462,12 +526,25 @@ DATA_INSTALL_FLAGS = -c -m 0644 # # Header file destinations # -FRAMEDIR = System/Library/Frameworks +ifeq ($(RC_ProjectName),xnu_headers_Sim) + HEADER_INSTALL_PREFIX = $(SDKROOT) +else + HEADER_INSTALL_PREFIX = +endif + +FRAMEDIR = $(HEADER_INSTALL_PREFIX)/System/Library/Frameworks + +SINCVERS = B +SINCFRAME = $(FRAMEDIR)/System.framework +SINCDIR = $(SINCFRAME)/Versions/$(SINCVERS)/Headers +SPINCDIR = $(SINCFRAME)/Versions/$(SINCVERS)/PrivateHeaders +SRESDIR = $(SINCFRAME)/Versions/$(SINCVERS)/Resources + ifndef INCDIR - INCDIR = /usr/include + INCDIR = $(HEADER_INSTALL_PREFIX)/usr/include endif ifndef LCLDIR - LCLDIR = $(FRAMEDIR)/System.framework/Versions/B/PrivateHeaders + LCLDIR = $(SPINCDIR) endif KINCVERS = A @@ -478,10 +555,12 @@ KRESDIR = $(KINCFRAME)/Versions/$(KINCVERS)/Resources XNU_PRIVATE_UNIFDEF = -UMACH_KERNEL_PRIVATE -UBSD_KERNEL_PRIVATE -UIOKIT_KERNEL_PRIVATE -ULIBKERN_KERNEL_PRIVATE -ULIBSA_KERNEL_PRIVATE -UPEXPERT_KERNEL_PRIVATE -UXNU_KERNEL_PRIVATE -SPINCFRAME_UNIFDEF = $(XNU_PRIVATE_UNIFDEF) -UKERNEL_PRIVATE -UKERNEL -DPRIVATE -U_OPEN_SOURCE_ -SINCFRAME_UNIFDEF = $(XNU_PRIVATE_UNIFDEF) -UKERNEL_PRIVATE -UKERNEL -UPRIVATE -D_OPEN_SOURCE_ -KPINCFRAME_UNIFDEF = $(XNU_PRIVATE_UNIFDEF) -DKERNEL_PRIVATE -DPRIVATE -DKERNEL -U_OPEN_SOURCE_ -KINCFRAME_UNIFDEF = $(XNU_PRIVATE_UNIFDEF) -UKERNEL_PRIVATE -UPRIVATE -DKERNEL -D_OPEN_SOURCE_ +PLATFORM_UNIFDEF = $(foreach x,$(SUPPORTED_PLATFORMS),$(if $(filter $(PLATFORM),$(x)),-DPLATFORM_$(x),-UPLATFORM_$(x))) + +SPINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) -UKERNEL_PRIVATE -UKERNEL -DPRIVATE -U_OPEN_SOURCE_ +SINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) -UKERNEL_PRIVATE -UKERNEL -UPRIVATE -D_OPEN_SOURCE_ +KPINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) -DKERNEL_PRIVATE -DPRIVATE -DKERNEL -U_OPEN_SOURCE_ +KINCFRAME_UNIFDEF = 
$(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) -UKERNEL_PRIVATE -UPRIVATE -DKERNEL -D_OPEN_SOURCE_ # @@ -504,7 +583,6 @@ export STRIP_FLAGS = $($(addsuffix $(KERNEL_CONFIG),STRIP_FLAGS_)) # export DSYMUTIL_FLAGS_I386 = --arch=i386 export DSYMUTIL_FLAGS_X86_64 = --arch=x86_64 -export DSYMUTIL_FLAGS_PPC = --arch=ppc export DSYMUTIL_FLAGS_ARM = --arch=arm export DSYMUTIL_FLAGS = $($(addsuffix $(ARCH_CONFIG),DSYMUTIL_FLAGS_)) diff --git a/makedefs/MakeInc.dir b/makedefs/MakeInc.dir index 7f98650e6..b4b594cd6 100644 --- a/makedefs/MakeInc.dir +++ b/makedefs/MakeInc.dir @@ -1,15 +1,25 @@ # # Install kernel header files # +.PHONY: installhdrs + ifeq ($(RC_ProjectName),Libsyscall) installhdrs: - bsdmake -C libsyscall installhdrs + cd libsyscall ; \ + sdk="$(SDKROOT)" ; \ + if [ $${sdk} = / ] ; then \ + sdk="" ; \ + fi; \ + xcrun -sdk "$(SDKROOT)" xcodebuild installhdrs \ + "SRCROOT=$(SRCROOT)/libsyscall" \ + "OBJROOT=$(OBJROOT)" \ + "SYMROOT=$(SYMROOT)" \ + "DSTROOT=$(DSTROOT)" \ + "SDKROOT=$${sdk}" else ifeq ($(findstring libkxld,$(RC_ProjectName)),libkxld) installhdrs: make -C libkern/kxld/ installhdrs -else ifeq ($(RC_ProjectName),xnu_debug) -installhdrs: -else # xnu +else # xnu, xnu_debug, or xnu_headers_Sim installhdrs: exporthdrs installhdrs_mi installhdrs_md @echo "[ $(SRCROOT) ] make installhdrs installing Kernel.framework" $(_v)kincpath=$(DSTROOT)/$(KINCDIR); \ @@ -28,28 +38,52 @@ installhdrs: exporthdrs installhdrs_mi installhdrs_md [ -d $(DSTROOT)/$(KPINCDIR) ] || $(MKDIR) $(DSTROOT)/$(KPINCDIR); \ cd $$kframepath; [ -L PrivateHeaders ] || \ $(LN) Versions/Current/PrivateHeaders PrivateHeaders; +ifeq ($(RC_ProjectName),xnu_headers_Sim) + @echo "[ $(SRCROOT) ] make installhdrs installing System.framework" + $(_v)spincpath=$(DSTROOT)/$(SPINCDIR); \ + sframepath=$(DSTROOT)/$(SINCFRAME); \ + [ -d $$spincpath ] || $(MKDIR) $$spincpath; \ + cd $$sframepath/Versions; \ + [ -L Current ] || $(LN) $(SINCVERS) Current; \ + cd $$sframepath; [ -L PrivateHeaders ] || \ + $(LN) 
Versions/Current/PrivateHeaders PrivateHeaders; +endif +ifeq (iPhoneOS,$(PLATFORM)) + $(_v)$(IPHONEOS_OPTIMIZE) $(DSTROOT)/$(KRESDIR)/Info.plist +endif endif # # Install header files order # .ORDER: installhdrs_mi installhdrs_md +.PHONY: installhdrs_mi installhdrs_md # # Install machine independent header files # installhdrs_mi: - $(_v)rel_path=$(shell $(RELPATH) $(SRCROOT) $(SOURCE)); \ + $(_v)rel_path=$(shell $(RELPATH) $(SRCROOT) $(SOURCE)); \ kernel_config=$(INSTALL_TYPE); \ - arch_config=$(INSTALL_ARCH_DEFAULT); \ - installinc_dir=${OBJROOT}/$${kernel_config}_$${arch_config}/$${rel_path}; \ - [ -d $${installinc_dir} ] ||$(MKDIR) $${installinc_dir}; \ + machine_config=$(MACHINE_CONFIG); \ + arch_config=$(INSTALL_ARCH_DEFAULT); \ + if [ $${arch_config} = ARM ] ; then \ + if [ $${machine_config} = DEFAULT ] ; then \ + machine_config=$(DEFAULT_ARM_MACHINE_CONFIG); \ + fi; \ + fi; \ + if [ $${machine_config} = DEFAULT ] ; then \ + installinc_dir=${OBJROOT}/$${kernel_config}_$${arch_config}/$${rel_path}; \ + else \ + installinc_dir=${OBJROOT}/$${kernel_config}_$${arch_config}_$${machine_config}/$${rel_path}; \ + fi; \ + [ -d $${installinc_dir} ] || $(MKDIR) $${installinc_dir}; \ ${MAKE} ${MAKEJOBS} -C $${installinc_dir} \ KERNEL_CONFIG=$${kernel_config} \ ARCH_CONFIG=$${arch_config} \ MAKEFILES=${SOURCE}/Makefile \ SOURCE=${SOURCE}/ \ - TARGET=${OBJROOT}/$${kernel_config}_$${arch_config}/$${rel_path}/ \ + TARGET=$${installinc_dir}/ \ build_installhdrs_mi; \ # @@ -67,17 +101,17 @@ installhdrs_md: fi; \ fi; \ if [ $${machine_config} = DEFAULT ] ; then \ - objpath=${OBJROOT}/$${kernel_config}_$${arch_config}/$${rel_path}; \ + installinc_dir=${OBJROOT}/$${kernel_config}_$${arch_config}/$${rel_path}; \ else \ - objpath=${OBJROOT}/$${kernel_config}_$${arch_config}_$${machine_config}/$${rel_path}; \ + installinc_dir=${OBJROOT}/$${kernel_config}_$${arch_config}_$${machine_config}/$${rel_path}; \ fi; \ - [ -d $${objpath} ] || $(MKDIR) $${objpath}; \ - ${MAKE} ${MAKEJOBS} -C 
$${objpath} \ + [ -d $${installinc_dir} ] || $(MKDIR) $${installinc_dir}; \ + ${MAKE} ${MAKEJOBS} -C $${installinc_dir} \ KERNEL_CONFIG=$${kernel_config} \ ARCH_CONFIG=$${arch_config} \ MAKEFILES=${SOURCE}/Makefile \ SOURCE=${SOURCE}/ \ - TARGET=$${objpath}/ \ + TARGET=$${installinc_dir}/ \ build_installhdrs_md; \ done; @@ -126,12 +160,15 @@ build_installhdrs_md: $(BUILD_INSTALLHDRS_MD_SUBDIRS_TARGETS) # # Install kernel header files # +.PHONY: exporthdrs + exporthdrs: exporthdrs_mi exporthdrs_md # # Install header files order # .ORDER: exporthdrs_mi exporthdrs_md +.PHONY: exporthdrs_mi exporthdrs_md # # Install machine independent header files @@ -139,11 +176,21 @@ exporthdrs: exporthdrs_mi exporthdrs_md do_exporthdrs_mi: exporthdrs_mi: - $(_v)rel_path=$(shell $(RELPATH) $(SRCROOT) $(SOURCE)); \ + $(_v)rel_path=$(shell $(RELPATH) $(SRCROOT) $(SOURCE)); \ kernel_config=$(INSTALL_TYPE); \ - arch_config=$(INSTALL_ARCH_DEFAULT); \ - exportinc_dir=${OBJROOT}/$${kernel_config}_$${arch_config}/$${rel_path}; \ - [ -d $${exportinc_dir} ] || $(MKDIR) $${exportinc_dir}; \ + machine_config=$(MACHINE_CONFIG); \ + arch_config=$(INSTALL_ARCH_DEFAULT); \ + if [ $${arch_config} = ARM ] ; then \ + if [ $${machine_config} = DEFAULT ] ; then \ + machine_config=$(DEFAULT_ARM_MACHINE_CONFIG); \ + fi; \ + fi; \ + if [ $${machine_config} = DEFAULT ] ; then \ + exportinc_dir=${OBJROOT}/$${kernel_config}_$${arch_config}/$${rel_path}; \ + else \ + exportinc_dir=${OBJROOT}/$${kernel_config}_$${arch_config}_$${machine_config}/$${rel_path}; \ + fi; \ + [ -d $${exportinc_dir} ] || $(MKDIR) $${exportinc_dir}; \ ${MAKE} ${MAKEJOBS} -C $${exportinc_dir} \ KERNEL_CONFIG=$${kernel_config} \ ARCH_CONFIG=$${arch_config} \ @@ -253,23 +300,21 @@ build_exporthdrs_md: $(BUILD_EXPORTHDRS_MD_SUBDIRS_TARGETS) # # Setup pass for all architectures for all Configuration/Architecture options # +.PHONY: setup + setup: - $(_v)rel_path=$(shell $(RELPATH) $(SRCROOT) $(SOURCE)); \ - for kernel_config in 
$(KERNEL_CONFIGS); \ - do \ - for arch_config in $(ARCH_CONFIGS); \ - do \ - setup_subdir=${OBJROOT}/$${kernel_config}_$${arch_config}/$${rel_path}; \ - [ -d $${setup_subdir} ] || $(MKDIR) $${setup_subdir}; \ - ${MAKE} -C $${setup_subdir} \ + $(_v)rel_path=$(shell $(RELPATH) $(SRCROOT) $(SOURCE)); \ + kernel_config=$(INSTALL_TYPE); \ + arch_config=$(INSTALL_ARCH_DEFAULT); \ + setup_subdir=${OBJROOT}/$${rel_path}; \ + [ -d $${setup_subdir} ] || $(MKDIR) $${setup_subdir}; \ + ${MAKE} ${MAKEJOBS} -C $${setup_subdir} \ KERNEL_CONFIG=$${kernel_config} \ ARCH_CONFIG=$${arch_config} \ MAKEFILES=${SOURCE}/Makefile \ SOURCE=${SOURCE}/ \ TARGET=$${setup_subdir}/ \ - build_setup; \ - done; \ - done; + build_setup; do_build_setup: @@ -297,15 +342,32 @@ build_setup: $(BUILD_SETUP_SUBDIRS_TARGETS) # item in the set is the kernel configuration. The second item in the set is the architecture # and the third item is the machine configuration. There may be multiple sets to build. # +.PHONY: all + ifeq ($(RC_ProjectName),Libsyscall) all: - bsdmake -C libsyscall install -else ifeq ($(findstring libkxld,$(RC_ProjectName)),libkxld) + cd libsyscall ; \ + sdk="$(SDKROOT)" ; \ + if [ $${sdk} = / ] ; then \ + sdk="" ; \ + fi; \ + xcrun -sdk "$(SDKROOT)" xcodebuild install \ + "SRCROOT=$(SRCROOT)/libsyscall" \ + "OBJROOT=$(OBJROOT)" \ + "SYMROOT=$(SYMROOT)" \ + "DSTROOT=$(DSTROOT)" \ + "SDKROOT=$${sdk}" +else ifeq ($(RC_ProjectName),libkxld) all: make -C libkern/kxld/ install +else ifeq ($(RC_ProjectName),libkxld_host) +all: + make -C libkern/kxld/ install PRODUCT_TYPE=ARCHIVE +else ifeq ($(RC_ProjectName),xnu_headers_Sim) +all: exporthdrs else # xnu or xnu_debug ifeq ($(COMPONENT), .) 
-all: exporthdrs +all: exporthdrs setup else all: endif @@ -462,6 +524,8 @@ install: installhdrs all installman installmachinekernels ifeq ($(RC_ProjectName),Libsyscall) # nothing to do else ifeq ($(findstring libkxld,$(RC_ProjectName)),libkxld) +# nothing to do, work performed in "all" action +else ifeq ($(RC_ProjectName),xnu_headers_Sim) # nothing to do else # xnu or xnu_debug $(_v)rel_path=$(shell $(RELPATH) $(SRCROOT) $(SOURCE)); \ @@ -523,10 +587,10 @@ installmachinekernels: machine_config=$${my_config}; \ if [ $${machine_config} != DEFAULT ] ; then \ build_subdir=${OBJROOT}/$${kernel_config}_$${arch_config}_$${machine_config}; \ - install_file_list=mach.`printf "%s" "$${kernel_config}" | $(TR) A-Z a-z`.`printf "%s" "$${machine_config}" | $(TR) A-Z a-z`; \ + install_kernel_file=mach.`printf "%s" "$${kernel_config}" | $(TR) A-Z a-z`.`printf "%s" "$${machine_config}" | $(TR) A-Z a-z`; \ [ -d $${build_subdir} ] || $(MKDIR) $${build_subdir}; \ ${MAKE} ${MAKEJOBS} -C $${build_subdir} \ - INSTALL_FILE_LIST=$${install_file_list} \ + INSTALL_KERNEL_FILE=$${install_kernel_file} \ KERNEL_CONFIG=$${kernel_config} \ ARCH_CONFIG=$${arch_config} \ MACHINE_CONFIG=$${machine_config} \ @@ -578,13 +642,17 @@ build_install: $(BUILD_INSTALL_SUBDIRS_TARGETS) # # Install source tree # +.PHONY: installsrc + installsrc: - $(_v)($(TAR) -c --mode go=r,+X --no-ignore-case --exclude .svn --exclude cscope.\* --exclude BUILD --exclude \*~ -f - .) | (cd $(SRCROOT) && $(TAR) --no-same-owner -xf -) + $(_v)($(TAR) -c --mode go=r,+X --no-ignore-case --exclude .svn --exclude .git --exclude cscope.\* --exclude BUILD --exclude \*~ -f - .) 
| (cd $(SRCROOT) && $(TAR) --no-same-owner -xf -) # # Clean up source tree # +.PHONY: clean + clean: # @@ -626,12 +694,17 @@ TAGS: cscope.files # # Install Man Pages # +.PHONY: installman + installman: ifeq ($(RC_ProjectName),Libsyscall) - bsdmake -C libsyscall install-man +# nothing to do else ifeq ($(findstring libkxld,$(RC_ProjectName)),libkxld) # nothing to do -else # xnu or xnu_debug +else ifeq ($(findstring xnu_,$(RC_ProjectName)),xnu_) +installman: +# nothing to do +else # xnu @echo "[ $(SRCROOT) ] Installing man pages" $(_v)manpath=$(DSTROOT)/$(MANDIR); \ [ -d $$manpath ] || $(MKDIR) $$manpath; \ diff --git a/makedefs/MakeInc.rule b/makedefs/MakeInc.rule index 3ba713083..b2d7e3af3 100644 --- a/makedefs/MakeInc.rule +++ b/makedefs/MakeInc.rule @@ -523,7 +523,9 @@ S_RULE_3= C_RULE_1A=$(_v)${KCC} -c ${filter-out ${$@_CFLAGS_RM}, ${CFLAGS} ${CWARNFLAGS}} -MD ${$@_CFLAGS_ADD} ${$@_CWARNFLAGS_ADD} ${INCFLAGS} ${$@_INCFLAGS} C_RULE_1B=$*.c C_RULE_2=@echo CC $@ -ifeq ($(BUILD_STABS),1) +ifeq ($(BUILD_MACHO_OBJ),0) +C_RULE_3= +else ifeq ($(BUILD_STABS),1) C_RULE_3= else C_RULE_3=$(_v)${CTFCONVERT} -l xnu -v -o $(TARGET)$(COMP_OBJ_DIR)/$(KERNEL_CONFIG)/$@.ctf $@ > /dev/null && $(CTFSCRUB) `cat $(SRCROOT)/config/DtraceIgnored.symbols` $(TARGET)$(COMP_OBJ_DIR)/$(KERNEL_CONFIG)/$@.ctf || true; @@ -547,7 +549,9 @@ P_RULE_1A=$(_v)${KC++} -o $@ -c ${CXXFLAGS} ${filter-out ${$@_CFLAGS_RM}, ${CFLA P_RULE_1B=$( $(@:.cpo=.d~) && mv $(@:.cpo=.d~) $(@:.cpo=.d) P_RULE_3=@echo C++ $@ -ifeq ($(BUILD_STABS),1) +ifeq ($(BUILD_MACHO_OBJ),0) +P_RULE_4= +else ifeq ($(BUILD_STABS),1) P_RULE_4= else P_RULE_4=$(_v)${CTFCONVERT} -l xnu -v -o $(TARGET)$(COMP_OBJ_DIR)/$(KERNEL_CONFIG)/$@.ctf $@ > /dev/null && $(CTFSCRUB) `cat $(SRCROOT)/config/DtraceIgnored.symbols` $(TARGET)$(COMP_OBJ_DIR)/$(KERNEL_CONFIG)/$@.ctf || true; @@ -568,25 +572,37 @@ STATIC_KMODS = $(SRCROOT)/kmods.a do_build_mach_kernel: $(TARGET)/kgmacros $(TARGET)/mach_kernel -$(TARGET)/mach_kernel: $(addprefix 
$(TARGET)/,$(foreach component,$(COMPONENT_LIST), $(addprefix $(component)/$(firstword $($(addsuffix _KERNEL_CONFIG, $(shell printf $(component) | tr a-z A-Z))) $(KERNEL_CONFIG))/, $(addsuffix .o, $(component))))) lastkernelconstructor.o +$(TARGET)/mach_kernel: $(addprefix $(TARGET)/,$(foreach component,$(COMPONENT_LIST), $(addprefix $(component)/$(firstword $($(addsuffix _KERNEL_CONFIG, $(shell printf $(component) | tr a-z A-Z))) $(KERNEL_CONFIG))/, $(addsuffix .filelist, $(component))))) lastkernelconstructor.o $(_v)${MAKE} version.o $(_v)${MAKE} build_mach_kernel_exports @echo LD mach_kernel.sys - $(_v)$(CAT) $(addprefix $(TARGET)/,$(foreach component,$(COMPONENT_LIST), $(addprefix $(component)/$(firstword $($(addsuffix _KERNEL_CONFIG, $(shell printf $(component) | tr a-z A-Z))) $(KERNEL_CONFIG))/, $(addsuffix .o, $(component))))) > mach_kernel.filelist - $(_v)$(LD) $(LDFLAGS_KERNEL) -filelist mach_kernel.filelist version.o lastkernelconstructor.o `if [ -e $(STATIC_KMODS) ]; then echo $(STATIC_KMODS); fi` \ + $(_v)$(CAT) $(addprefix $(TARGET)/,$(foreach component,$(COMPONENT_LIST), $(addprefix $(component)/$(firstword $($(addsuffix _KERNEL_CONFIG, $(shell printf $(component) | tr a-z A-Z))) $(KERNEL_CONFIG))/, $(addsuffix .filelist, $(component))))) < /dev/null > link.filelist + $(_v)$(LD) $(LDFLAGS_KERNEL) -filelist link.filelist version.o lastkernelconstructor.o `if [ -e $(STATIC_KMODS) ]; then echo $(STATIC_KMODS); fi` \ -o $(TARGET)/mach_kernel.sys $(LD_KERNEL_LIBS) - @echo DSYMUTIL mach_kernel.sys $(_v)if [ $(BUILD_DWARF) -eq 1 ]; then \ + echo DSYMUTIL mach_kernel.sys; \ $(DSYMUTIL) $(DSYMUTIL_FLAGS) $(TARGET)/mach_kernel.sys -o $(TARGET)/mach_kernel.sys.dSYM > /dev/null; \ + $(INSTALL) $(INSTALL_FLAGS) $(SRCROOT)/kgmacros $(TARGET)/mach_kernel.sys.dSYM/$(DSYMRESDIR)/kgmacros; \ + fi; + $(_v)if [ $(MACHINE_CONFIG) != DEFAULT ] ; then \ + kernel_file_name=mach.`printf "%s" "$(KERNEL_CONFIG)" | $(TR) A-Z a-z`.`printf "%s" "$(MACHINE_CONFIG)" | $(TR) A-Z 
a-z`; \ + echo kernel_file_name $${kernel_file_name}; \ + [ -h ${OBJROOT}/$${kernel_file_name} ] || $(LN) $(TARGET)/mach_kernel ${OBJROOT}/$${kernel_file_name}; \ fi; @echo STRIP mach_kernel $(_v)$(STRIP) $(STRIP_FLAGS) $(TARGET)/mach_kernel.sys -o $(TARGET)/mach_kernel - @echo CTFMERGE mach_kernel - $(_v)if [ $(BUILD_DWARF) -eq 1 ]; then \ + $(_v)if [ $(BUILD_MACHO_OBJ) -eq 1 -a $(BUILD_DWARF) -eq 1 ]; then \ + echo CTFMERGE mach_kernel; \ $(FIND) $(OBJPATH)/ -name \*.ctf -size 0 \ -exec $(RM) -rf {} \; ; \ - $(CTFMERGE) -l xnu -o $(TARGET)/mach_kernel \ - $(OBJPATH)/*/$(KERNEL_CONFIG)/*.*o.ctf || true; \ + $(CTFMERGE) -l xnu -o $(TARGET)/mach_kernel \ + -Z $(TARGET)/mach_kernel.ctfdata \ + $(OBJPATH)/*/$(KERNEL_CONFIG)/*.*o.ctf || true; \ + echo CTFINSERT mach_kernel; \ + $(CTFINSERT) $(TARGET)/mach_kernel \ + $($(addsuffix $(ARCH_CONFIG),ARCH_FLAGS_)) $(TARGET)/mach_kernel.ctfdata \ + -o $(TARGET)/mach_kernel || true; \ + $(RM) -f $(TARGET)/mach_kernel.ctfdata > /dev/null || true; \ fi; \ version.o: $(OBJPATH)/version.c @@ -599,9 +615,13 @@ $(OBJPATH)/version.c: $(SRCROOT)/config/version.c $(NEWVERS) $(SRCROOT)/config/M $(_v)$(CP) $< $@ $(_v)$(NEWVERS) $(OBJPATH)/version.c > /dev/null; +# "/libsa" needed because TARGET ends in "/." +lastkernelconstructor.o: COMP_OBJ_DIR=/libsa lastkernelconstructor.o: $(SRCROOT)/libsa/lastkernelconstructor.c + $(_v)$(MKDIR) $(TARGET)$(COMP_OBJ_DIR)/$(KERNEL_CONFIG) ${C_RULE_1A}$< ${C_RULE_2} + ${C_RULE_3} ${C_RULE_4} $(TARGET)/kgmacros: $(SRCROOT)/kgmacros @@ -615,65 +635,19 @@ build_mach_kernel_exports: TARGET=$${TARGET} \ build_mach_kernel_exports; -# Special rules to install machine configuration variants - -$(DSTROOT)$(INSTALL_FILE_DIR)mach.$(KERNEL_CONFIG_LC).$(MACHINE_CONFIG_LC): $(TARGET)/mach_kernel force_file_install - @echo Installing $< in $@; - $(_v)if [ ! 
-e $(DSTROOT)$(INSTALL_FILE_DIR) ]; then \ - $(MKDIR) $(DSTROOT)$(INSTALL_FILE_DIR); \ - fi; \ - if [ "`echo $(INSTALL_ARCHS_LC) | wc -w`" -eq 1 ]; then \ - $(RM) $(RMFLAGS) $@; \ - $(INSTALL) $(FILE_INSTALL_FLAGS) $< $@; \ - else \ - if [ ! -e $@ ]; then \ - print "" >empty_file_$(notdir $@); \ - lipo_arg="$(foreach lipo_arch,$(INSTALL_ARCHS),$(ARCH_FLAGS_$(lipo_arch)) empty_file_$(notdir $@))"; \ - $(LIPO) $${lipo_arg} -create -output $@; \ - $(RM) $(RMFLAGS) empty_file_$(notdir $@); \ - fi; \ - $(LIPO) $@ -replace $(subst -arch,,$(ARCH_FLAGS_$(ARCH_CONFIG))) $< -o $@; \ - fi - -$(SYMROOT)$(INSTALL_FILE_DIR)mach.$(KERNEL_CONFIG_LC).$(MACHINE_CONFIG_LC): $(TARGET)/mach_kernel.sys force_file_install - @echo Installing $< in $@; - $(_v)if [ ! -e $(SYMROOT)$(INSTALL_FILE_DIR) ]; then \ - $(MKDIR) $(SYMROOT)$(INSTALL_FILE_DIR); \ - fi; \ - if [ "`echo $(INSTALL_ARCHS_LC) | wc -w`" -eq 1 ]; then \ - $(RM) $(RMFLAGS) $@; \ - $(INSTALL) $(FILE_INSTALL_FLAGS) $< $@; \ - if [ $(BUILD_DWARF) -eq 1 ]; then \ - $(RM) -rf $@.dSYM; \ - $(MKDIR) -p -m 0755 $@.dSYM/$(DSYMBUILDDIR); \ - $(INSTALL) $(INSTALL_FLAGS) \ - $<.dSYM/$(DSYMBUILDDIR)/$(notdir $<) \ - $@.dSYM/$(DSYMBUILDDIR)/$(notdir $@); \ - fi; \ - else \ - if [ ! 
-e $@ ]; then \ - printf "" >empty_file_$(notdir $@); \ - lipo_arg="$(foreach lipo_arch,$(INSTALL_ARCHS),$(ARCH_FLAGS_$(lipo_arch)) empty_file_$(notdir $@))"; \ - $(LIPO) $${lipo_arg} -create -output $@; \ - $(RM) $(RMFLAGS) empty_file_$(notdir $@); \ - fi; \ - $(LIPO) $@ -replace $(subst -arch,,$(ARCH_FLAGS_$(ARCH_CONFIG))) $< -o $@; \ - fi - endif # mach_kernel-specific build rules # -# Generic Install rules +# Kernel Install rules # -INSTALL_FILE_FILES = $(addprefix $(DSTROOT)$(INSTALL_FILE_DIR), $(INSTALL_FILE_LIST)) -INSTALL_FILE_FILES_GENERIC = $(filter-out $(DSTROOT)$(INSTALL_FILE_DIR)mach.$(KERNEL_CONFIG_LC).$(MACHINE_CONFIG_LC), $(INSTALL_FILE_FILES)) +INSTALL_KERNEL_FILE_FILES = $(addprefix $(DSTROOT)$(INSTALL_KERNEL_DIR), $(INSTALL_KERNEL_FILE)) -force_file_install: +force_kernel_file_install: -$(INSTALL_FILE_FILES_GENERIC): $(DSTROOT)$(INSTALL_FILE_DIR)% : $(TARGET)/% force_file_install +$(INSTALL_KERNEL_FILE_FILES): $(TARGET)/mach_kernel force_kernel_file_install @echo Installing $< in $@; - $(_v)if [ ! -e $(DSTROOT)$(INSTALL_FILE_DIR) ]; then \ - $(MKDIR) $(DSTROOT)$(INSTALL_FILE_DIR); \ + $(_v)if [ ! 
-e $(DSTROOT)$(INSTALL_KERNEL_DIR) ]; then \ + $(MKDIR) $(DSTROOT)$(INSTALL_KERNEL_DIR); \ fi; \ if [ "`echo $(INSTALL_ARCHS_LC) | wc -w`" -eq 1 ]; then \ $(RM) $(RMFLAGS) $@; \ @@ -688,28 +662,27 @@ $(INSTALL_FILE_FILES_GENERIC): $(DSTROOT)$(INSTALL_FILE_DIR)% : $(TARGET)/% forc $(LIPO) $@ -replace $(subst -arch,,$(ARCH_FLAGS_$(ARCH_CONFIG))) $< -o $@; \ fi -INSTALL_FILESYS_FILES = $(addprefix $(SYMROOT)$(INSTALL_FILE_DIR), $(INSTALL_FILE_LIST)) -INSTALL_FILESYS_FILES_GENERIC = $(filter-out $(SYMROOT)$(INSTALL_FILE_DIR)mach.$(KERNEL_CONFIG_LC).$(MACHINE_CONFIG_LC), $(INSTALL_FILESYS_FILES)) +INSTALL_KERNEL_FILESYS_FILES = $(addprefix $(SYMROOT)$(INSTALL_KERNEL_DIR), $(INSTALL_KERNEL_FILE)) -force_filesys_install: +force_kernel_filesys_install: -$(INSTALL_FILESYS_FILES_GENERIC): $(SYMROOT)$(INSTALL_FILE_DIR)% : $(TARGET)/%.sys force_filesys_install +$(INSTALL_KERNEL_FILESYS_FILES): $(TARGET)/mach_kernel.sys force_kernel_filesys_install @echo Installing $< in $@; - $(_v)if [ ! -e $(SYMROOT)$(INSTALL_FILE_DIR) ]; then \ - $(MKDIR) $(SYMROOT)$(INSTALL_FILE_DIR); \ + $(_v)if [ ! -e $(SYMROOT)$(INSTALL_KERNEL_DIR) ]; then \ + $(MKDIR) $(SYMROOT)$(INSTALL_KERNEL_DIR); \ fi; \ if [ "`echo $(INSTALL_ARCHS_LC) | wc -w`" -eq 1 ]; then \ $(RM) $(RMFLAGS) $@; \ $(INSTALL) $(INSTALL_FLAGS) $< $@; \ if [ $(BUILD_DWARF) -eq 1 ]; then \ - $(DSYMUTIL) $(DSYMUTIL_FLAGS) \ - $(TARGET)/mach_kernel.sys \ - -o $(TARGET)/mach_kernel.sys.dSYM; \ $(RM) -rf $@.dSYM; \ $(MKDIR) -p -m 0755 $@.dSYM/$(DSYMBUILDDIR); \ $(INSTALL) $(INSTALL_FLAGS) \ $<.dSYM/$(DSYMBUILDDIR)/$(notdir $<) \ $@.dSYM/$(DSYMBUILDDIR)/$(notdir $@); \ + $(INSTALL) $(INSTALL_FLAGS) \ + $<.dSYM/$(DSYMRESDIR)/kgmacros \ + $@.dSYM/$(DSYMRESDIR)/kgmacros; \ fi; \ else \ if [ ! 
-e $@ ]; then \ @@ -730,29 +703,21 @@ $(INSTALL_FILESYS_FILES_GENERIC): $(SYMROOT)$(INSTALL_FILE_DIR)% : $(TARGET)/%.s $@.dSYM/$(DSYMBUILDDIR)/$(notdir $@); \ $(RM) $(RMFLAGS) empty_filesys_$(notdir $@); \ fi; \ - $(DSYMUTIL) $(DSYMUTIL_FLAGS) \ - $(TARGET)/mach_kernel.sys \ - -o $(TARGET)/mach_kernel.sys.dSYM; \ $(LIPO) $@.dSYM/$(DSYMBUILDDIR)/$(notdir $@) \ -replace $(subst -arch,,$(ARCH_FLAGS_$(ARCH_CONFIG))) \ $<.dSYM/$(DSYMBUILDDIR)/$(notdir $<) \ -o $@.dSYM/$(DSYMBUILDDIR)/$(notdir $@); \ + $(INSTALL) $(INSTALL_FLAGS) \ + $<.dSYM/$(DSYMRESDIR)/kgmacros \ + $@.dSYM/$(DSYMRESDIR)/kgmacros; \ fi; \ fi $(INSTALL) $(INSTALL_FLAGS) $(SOURCE)kgmacros $(SYMROOT)$(INSTALL_FILE_DIR) -INSTALL_DATA_FILES = $(addprefix $(DSTROOT)$(INSTALL_DATA_DIR), $(INSTALL_DATA_LIST)) - -$(INSTALL_DATA_FILES): $(DSTROOT)$(INSTALL_DATA_DIR)% : $(SOURCE)/% - @echo Installing $< in $@; - $(_v)[ -d $(dir $@) ] ||$(MKDIR) $(dir $@); \ - $(RM) $(RMFLAGS) $@; \ - $(INSTALL) $(DATA_INSTALL_FLAGS) $< $(dir $@); - setup_build_install: @echo "[ $(SOURCE) ] make setup_build_install $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" -do_build_install: $(INSTALL_FILESYS_FILES) $(INSTALL_FILE_FILES) $(INSTALL_DATA_FILES) +do_build_install: $(INSTALL_KERNEL_FILESYS_FILES) $(INSTALL_KERNEL_FILE_FILES) @echo "[ $(SOURCE) ] make do_build_install $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" INSTALL_MAN_FILES = $(addprefix $(DSTROOT)/$(MANDIR)/$(INSTALL_MAN_DIR)/, $(INSTALL_MAN_LIST)) diff --git a/osfmk/Makefile b/osfmk/Makefile index f07b7e1f3..7b609b5ae 100644 --- a/osfmk/Makefile +++ b/osfmk/Makefile @@ -22,20 +22,13 @@ INSTINC_SUBDIRS = \ libsa \ kdp \ pmc - -INSTINC_SUBDIRS_PPC = \ - mach \ - ppc - INSTINC_SUBDIRS_I386 = \ mach \ i386 - INSTINC_SUBDIRS_X86_64 = \ mach \ i386 \ x86_64 - INSTINC_SUBDIRS_ARM = \ mach \ arm @@ -58,25 +51,18 @@ EXPINC_SUBDIRS = \ kdp \ pmc -EXPINC_SUBDIRS_PPC = \ - mach \ - ppc - EXPINC_SUBDIRS_I386 = \ mach \ i386 - EXPINC_SUBDIRS_X86_64 = \ mach \ i386 \ x86_64 - 
EXPINC_SUBDIRS_ARM = \ mach \ arm -SETUP_SUBDIRS = \ - conf +SETUP_SUBDIRS = COMP_SUBDIRS = \ conf diff --git a/osfmk/UserNotification/Makefile b/osfmk/UserNotification/Makefile index edc1f17b4..98a5377e8 100644 --- a/osfmk/UserNotification/Makefile +++ b/osfmk/UserNotification/Makefile @@ -8,16 +8,12 @@ include $(MakeInc_def) INSTINC_SUBDIRS = -INSTINC_SUBDIRS_PPC = - INSTINC_SUBDIRS_I386 = INSTINC_SUBDIRS_ARM = EXPINC_SUBDIRS = -EXPINC_SUBDIRS_PPC = - EXPINC_SUBDIRS_I386 = EXPINC_SUBDIRS_ARM = diff --git a/osfmk/UserNotification/UNDRequest.defs b/osfmk/UserNotification/UNDRequest.defs index 9a3a7c954..bfe925c80 100644 --- a/osfmk/UserNotification/UNDRequest.defs +++ b/osfmk/UserNotification/UNDRequest.defs @@ -74,10 +74,7 @@ simpleroutine UNDDisplayCustomFromBundle_rpc( in messageKey: UNDKey; in tokenKey: UNDPath); -simpleroutine UNDDisplayCustomFromDictionary_rpc( - server: UNDServerRef; - in reply: UNDReplyRef; - in data: xmlData); +skip; /* was UNDDisplayCustomFromDictionary_rpc */ simpleroutine UNDCancelNotification_rpc( server: UNDServerRef; diff --git a/osfmk/chud/chud_cpu.c b/osfmk/chud/chud_cpu.c index 19b639cd7..c21a40eb2 100644 --- a/osfmk/chud/chud_cpu.c +++ b/osfmk/chud/chud_cpu.c @@ -73,24 +73,12 @@ chudxnu_cpu_number(void) #pragma mark **** interrupts enable/disable **** #endif -__private_extern__ boolean_t -chudxnu_get_interrupts_enabled(void) -{ - return ml_get_interrupts_enabled(); -} - __private_extern__ boolean_t chudxnu_set_interrupts_enabled(boolean_t enable) { return ml_set_interrupts_enabled(enable); } -__private_extern__ boolean_t -chudxnu_at_interrupt_context(void) -{ - return ml_at_interrupt_context(); -} - __private_extern__ void chudxnu_cause_interrupt(void) { diff --git a/osfmk/chud/chud_thread.c b/osfmk/chud/chud_thread.c index 0f955bb6e..97c07757b 100644 --- a/osfmk/chud/chud_thread.c +++ b/osfmk/chud/chud_thread.c @@ -46,8 +46,6 @@ // include the correct file to find real_ncpus #if defined(__i386__) || defined(__x86_64__) # include 
-#elif defined(__ppc__) || defined(__ppc64__) -# include #else // fall back on declaring it extern. The linker will sort us out. extern unsigned int real_ncpus; @@ -124,6 +122,51 @@ chudxnu_thread_get_idle(thread_t thread) { return ((thread->state & TH_IDLE) == TH_IDLE); } +__private_extern__ int +chudxnu_thread_get_scheduler_state(thread_t thread) { + /* + * Instantaneous snapshot of the scheduler state of + * a given thread. + * + * MUST ONLY be called on an interrupted or + * locked thread, to avoid a race. + */ + + int state = 0; + int schedulerState = (volatile int)(thread->state); + processor_t lastProcessor = (volatile processor_t)(thread->last_processor); + + if ((PROCESSOR_NULL != lastProcessor) && (thread == lastProcessor->active_thread)) { + state |= CHUDXNU_TS_RUNNING; + } + + if (schedulerState & TH_RUN) { + state |= CHUDXNU_TS_RUNNABLE; + } + + if (schedulerState & TH_WAIT) { + state |= CHUDXNU_TS_WAIT; + } + + if (schedulerState & TH_UNINT) { + state |= CHUDXNU_TS_UNINT; + } + + if (schedulerState & TH_SUSP) { + state |= CHUDXNU_TS_SUSP; + } + + if (schedulerState & TH_TERMINATE) { + state |= CHUDXNU_TS_TERMINATE; + } + + if (schedulerState & TH_IDLE) { + state |= CHUDXNU_TS_IDLE; + } + + return state; +} + #if 0 #pragma mark **** task and thread info **** #endif diff --git a/osfmk/chud/chud_xnu.h b/osfmk/chud/chud_xnu.h index 91465bd61..2e8168577 100644 --- a/osfmk/chud/chud_xnu.h +++ b/osfmk/chud/chud_xnu.h @@ -71,7 +71,6 @@ extern kern_return_t chudxnu_unbind_thread(thread_t thread, int options); extern kern_return_t chudxnu_thread_get_state(thread_t thread, thread_flavor_t flavor, thread_state_t tstate, mach_msg_type_number_t *count, boolean_t user_only); extern kern_return_t chudxnu_thread_set_state(thread_t thread, thread_flavor_t flavor, thread_state_t tstate, mach_msg_type_number_t count, boolean_t user_only); -extern kern_return_t chudxnu_thread_user_state_available(thread_t thread); extern kern_return_t 
chudxnu_thread_get_callstack64(thread_t thread, uint64_t *callStack, mach_msg_type_number_t *count, boolean_t user_only); @@ -84,12 +83,22 @@ extern kern_return_t chudxnu_free_thread_list(thread_array_t *thread_list, mach_ extern kern_return_t chudxnu_thread_info( thread_t thread, thread_flavor_t flavor, thread_info_t thread_info_out, mach_msg_type_number_t *thread_info_count); -extern kern_return_t chudxnu_thread_last_context_switch(thread_t thread, uint64_t *timestamp); - extern boolean_t chudxnu_thread_set_marked(thread_t thread, boolean_t marked); extern boolean_t chudxnu_thread_get_marked(thread_t thread); extern boolean_t chudxnu_thread_get_idle(thread_t thread); +enum { + CHUDXNU_TS_RUNNING = 0x1, + CHUDXNU_TS_RUNNABLE = 0x2, + CHUDXNU_TS_WAIT = 0x4, + CHUDXNU_TS_UNINT = 0x8, + CHUDXNU_TS_SUSP = 0x10, + CHUDXNU_TS_TERMINATE = 0x20, + CHUDXNU_TS_IDLE = 0x40 +}; + +extern int chudxnu_thread_get_scheduler_state(thread_t thread); + #if 0 #pragma mark **** memory **** #endif @@ -114,9 +123,7 @@ extern int chudxnu_cpu_number(void); extern kern_return_t chudxnu_enable_cpu(int cpu, boolean_t enable); -extern boolean_t chudxnu_get_interrupts_enabled(void); extern boolean_t chudxnu_set_interrupts_enabled(boolean_t enable); -extern boolean_t chudxnu_at_interrupt_context(void); extern void chudxnu_cause_interrupt(void); extern void chudxnu_enable_preemption(void); @@ -246,15 +253,9 @@ extern kern_return_t chudxnu_set_shadowed_spr64(int cpu, int spr, uint64_t val); extern kern_return_t chudxnu_enable_cpu_nap(int cpu, boolean_t enable); extern boolean_t chudxnu_cpu_nap_enabled(int cpu); -extern uint32_t chudxnu_get_orig_cpu_l2cr(int cpu); -extern uint32_t chudxnu_get_orig_cpu_l3cr(int cpu); - extern kern_return_t chudxnu_read_spr(int cpu, int spr, uint32_t *val_p); extern kern_return_t chudxnu_read_spr64(int cpu, int spr, uint64_t *val_p); extern kern_return_t chudxnu_write_spr(int cpu, int spr, uint32_t val); extern kern_return_t chudxnu_write_spr64(int cpu, int spr, 
uint64_t val); -extern void chudxnu_flush_caches(void); -extern void chudxnu_enable_caches(boolean_t enable); - #endif /* _CHUD_XNU_H_ */ diff --git a/osfmk/chud/chud_xnu_glue.h b/osfmk/chud/chud_xnu_glue.h index 20626c064..b2ac2189c 100644 --- a/osfmk/chud/chud_xnu_glue.h +++ b/osfmk/chud/chud_xnu_glue.h @@ -26,9 +26,7 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#if defined (__ppc__) -#include "ppc/chud_xnu_glue.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "i386/chud_xnu_glue.h" #else #error architecture not supported diff --git a/osfmk/chud/chud_xnu_private.h b/osfmk/chud/chud_xnu_private.h index 0932a6497..56b6eb22c 100644 --- a/osfmk/chud/chud_xnu_private.h +++ b/osfmk/chud/chud_xnu_private.h @@ -33,9 +33,7 @@ #include #include -#if defined (__ppc__) -#include "chud/ppc/chud_xnu_private.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "chud/i386/chud_xnu_private.h" #else #error architecture not supported diff --git a/osfmk/chud/i386/chud_osfmk_callback_i386.c b/osfmk/chud/i386/chud_osfmk_callback_i386.c index b3fc4d685..aa576cbc7 100644 --- a/osfmk/chud/i386/chud_osfmk_callback_i386.c +++ b/osfmk/chud/i386/chud_osfmk_callback_i386.c @@ -73,6 +73,11 @@ void chudxnu_cancel_all_callbacks(void) chudxnu_dtrace_callback_cancel(); } +static lck_grp_t chud_request_lck_grp; +static lck_grp_attr_t chud_request_lck_grp_attr; +static lck_attr_t chud_request_lck_attr; + + static chudcpu_data_t chudcpu_boot_cpu; void * chudxnu_cpu_alloc(boolean_t boot_processor) @@ -81,6 +86,11 @@ chudxnu_cpu_alloc(boolean_t boot_processor) if (boot_processor) { chud_proc_info = &chudcpu_boot_cpu; + + lck_attr_setdefault(&chud_request_lck_attr); + lck_grp_attr_setdefault(&chud_request_lck_grp_attr); + lck_grp_init(&chud_request_lck_grp, "chud_request", &chud_request_lck_grp_attr); + } else { chud_proc_info = (chudcpu_data_t *) kalloc(sizeof(chudcpu_data_t)); @@ 
-90,7 +100,8 @@ chudxnu_cpu_alloc(boolean_t boot_processor) } bzero((char *)chud_proc_info, sizeof(chudcpu_data_t)); chud_proc_info->t_deadline = 0xFFFFFFFFFFFFFFFFULL; - mpqueue_init(&chud_proc_info->cpu_request_queue); + + mpqueue_init(&chud_proc_info->cpu_request_queue, &chud_request_lck_grp, &chud_request_lck_attr); return (void *)chud_proc_info; @@ -161,7 +172,8 @@ chudxnu_cpu_timer_callback_enter( timer_call_setup(&(chud_proc_info->cpu_timer_call), chudxnu_private_cpu_timer_callback, NULL); timer_call_enter(&(chud_proc_info->cpu_timer_call), - chud_proc_info->t_deadline); + chud_proc_info->t_deadline, + TIMER_CALL_CRITICAL|TIMER_CALL_LOCAL); ml_set_interrupts_enabled(oldlevel); return KERN_SUCCESS; @@ -316,46 +328,40 @@ static kern_return_t chud_null_ast(thread_flavor_t flavor __unused, } static kern_return_t -chudxnu_private_chud_ast_callback( - int trapno, - void *regs, - int unused1, - int unused2) -{ -#pragma unused (trapno) -#pragma unused (regs) -#pragma unused (unused1) -#pragma unused (unused2) - boolean_t oldlevel = ml_set_interrupts_enabled(FALSE); - ast_t *myast = ast_pending(); - kern_return_t retval = KERN_FAILURE; +chudxnu_private_chud_ast_callback(ast_t reasons, ast_t *myast) +{ + boolean_t oldlevel = ml_set_interrupts_enabled(FALSE); + kern_return_t retval = KERN_FAILURE; chudxnu_perfmon_ast_callback_func_t fn = perfmon_ast_callback_fn; - - if (*myast & AST_CHUD_URGENT) { - *myast &= ~(AST_CHUD_URGENT | AST_CHUD); - if ((*myast & AST_PREEMPTION) != AST_PREEMPTION) - *myast &= ~(AST_URGENT); - retval = KERN_SUCCESS; - } else if (*myast & AST_CHUD) { - *myast &= ~(AST_CHUD); - retval = KERN_SUCCESS; - } - + if (fn) { - x86_thread_state_t state; - mach_msg_type_number_t count; - count = x86_THREAD_STATE_COUNT; - - if (chudxnu_thread_get_state( - current_thread(), - x86_THREAD_STATE, - (thread_state_t) &state, &count, - TRUE) == KERN_SUCCESS) { - - (fn)( - x86_THREAD_STATE, - (thread_state_t) &state, - count); + if ((*myast & AST_CHUD_URGENT) && 
(reasons & (AST_URGENT | AST_CHUD_URGENT))) { // Only execute urgent callbacks if reasons specifies an urgent context. + *myast &= ~AST_CHUD_URGENT; + + if (AST_URGENT == *myast) { // If the only flag left is AST_URGENT, we can clear it; we know that we set it, but if there are also other bits set in reasons then someone else might still need AST_URGENT, so we'll leave it set. The normal machinery in ast_taken will ensure it gets cleared eventually, as necessary. + *myast = AST_NONE; + } + + retval = KERN_SUCCESS; + } + + if ((*myast & AST_CHUD) && (reasons & AST_CHUD)) { // Only execute non-urgent callbacks if reasons actually specifies AST_CHUD. This implies non-urgent callbacks since the only time this'll happen is if someone either calls ast_taken with AST_CHUD explicitly (not done at time of writing, but possible) or with AST_ALL, which of course includes AST_CHUD. + *myast &= ~AST_CHUD; + retval = KERN_SUCCESS; + } + + if (KERN_SUCCESS == retval) { + x86_thread_state_t state; + mach_msg_type_number_t count = x86_THREAD_STATE_COUNT; + thread_t thread = current_thread(); + + if (KERN_SUCCESS == chudxnu_thread_get_state(thread, + x86_THREAD_STATE, + (thread_state_t)&state, + &count, + (thread->task != kernel_task))) { + (fn)(x86_THREAD_STATE, (thread_state_t)&state, count); + } } } @@ -425,6 +431,9 @@ static kern_return_t chud_null_int(uint32_t trapentry __unused, thread_flavor_t return KERN_FAILURE; } +static void +chudxnu_private_interrupt_callback(void *foo) __attribute__((used)); + static void chudxnu_private_interrupt_callback(void *foo) { @@ -460,7 +469,6 @@ chudxnu_interrupt_callback_enter(chudxnu_interrupt_callback_func_t func) if(OSCompareAndSwapPtr(chud_null_int, func, (void * volatile *)&interrupt_callback_fn)) { lapic_set_pmi_func((i386_intr_func_t)chudxnu_private_interrupt_callback); - return KERN_SUCCESS; } return KERN_FAILURE; diff --git a/osfmk/chud/i386/chud_thread_i386.c b/osfmk/chud/i386/chud_thread_i386.c index f5c992fef..a8edff8fa 100644 --- 
a/osfmk/chud/i386/chud_thread_i386.c +++ b/osfmk/chud/i386/chud_thread_i386.c @@ -49,13 +49,6 @@ #pragma mark **** thread state **** #endif -__private_extern__ kern_return_t -chudxnu_thread_user_state_available(thread_t thread) -{ -#pragma unused (thread) - return KERN_SUCCESS; -} - __private_extern__ kern_return_t chudxnu_thread_get_state( thread_t thread, diff --git a/osfmk/chud/ppc/chud_cpu_asm.h b/osfmk/chud/ppc/chud_cpu_asm.h deleted file mode 100644 index a385f7664..000000000 --- a/osfmk/chud/ppc/chud_cpu_asm.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2003-2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef _CHUD_CPU_ASM_H_ -#define _CHUD_CPU_ASM_H_ - -kern_return_t mfspr64(uint64_t *val, int spr); -kern_return_t mfmsr64(uint64_t *val); - -kern_return_t mtspr64(int spr, uint64_t *val); -kern_return_t mtmsr64(uint64_t *val); - -#endif // _CHUD_CPU_ASM_H_ diff --git a/osfmk/chud/ppc/chud_cpu_asm.s b/osfmk/chud/ppc/chud_cpu_asm.s deleted file mode 100644 index 81482361a..000000000 --- a/osfmk/chud/ppc/chud_cpu_asm.s +++ /dev/null @@ -1,593 +0,0 @@ -/* - * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include - -/* - * kern_return_t mfspr64(uint64_t *val, int spr); - * - * r3: address to store value in - * r4: spr to read from - * - */ - -; Force a line boundry here - .align 5 - .globl EXT(mfspr64) - -EXT(mfspr64): - ;; generic PPC 64-bit wide SPRs - cmpwi r4,chud_ppc_srr0 - beq mfspr64_srr0 - cmpwi r4,chud_ppc_srr1 - beq mfspr64_srr1 - cmpwi r4,chud_ppc_dar - beq mfspr64_dar - cmpwi r4,chud_ppc_sdr1 - beq mfspr64_sdr1 - cmpwi r4,chud_ppc_sprg0 - beq mfspr64_sprg0 - cmpwi r4,chud_ppc_sprg1 - beq mfspr64_sprg1 - cmpwi r4,chud_ppc_sprg2 - beq mfspr64_sprg2 - cmpwi r4,chud_ppc_sprg3 - beq mfspr64_sprg3 - cmpwi r4,chud_ppc64_asr - beq mfspr64_asr - cmpwi r4,chud_ppc_dabr - beq mfspr64_dabr - - ;; GPUL specific 64-bit wide SPRs - cmpwi r4,chud_970_hid0 - beq mfspr64_hid0 - cmpwi r4,chud_970_hid1 - beq mfspr64_hid1 - cmpwi r4,chud_970_hid4 - beq mfspr64_hid4 - cmpwi r4,chud_970_hid5 - beq mfspr64_hid5 - cmpwi r4,chud_970_mmcr0 - beq mfspr64_mmcr0 - cmpwi r4,chud_970_mmcr1 - beq mfspr64_mmcr1 - cmpwi r4,chud_970_mmcra - beq mfspr64_mmcra - cmpwi r4,chud_970_siar - beq mfspr64_siar - cmpwi r4,chud_970_sdar - beq mfspr64_sdar - cmpwi r4,chud_970_imc - beq mfspr64_imc - cmpwi r4,chud_970_rmor - beq mfspr64_rmor - cmpwi r4,chud_970_hrmor - beq mfspr64_hrmor - cmpwi r4,chud_970_hior - beq mfspr64_hior - cmpwi r4,chud_970_lpidr - beq mfspr64_lpidr - cmpwi r4,chud_970_lpcr - beq mfspr64_lpcr - cmpwi r4,chud_970_dabrx - beq mfspr64_dabrx - cmpwi r4,chud_970_hsprg0 - beq mfspr64_hsprg0 - cmpwi r4,chud_970_hsprg1 - beq mfspr64_hsprg1 - cmpwi r4,chud_970_hsrr0 - beq mfspr64_hsrr0 - cmpwi r4,chud_970_hsrr1 - beq mfspr64_hsrr1 - cmpwi r4,chud_970_hdec - beq mfspr64_hdec - cmpwi r4,chud_970_trig0 - beq mfspr64_trig0 - cmpwi r4,chud_970_trig1 - beq mfspr64_trig1 - cmpwi r4,chud_970_trig2 - beq mfspr64_trig2 - cmpwi r4,chud_ppc64_accr - beq mfspr64_accr - cmpwi r4,chud_970_scomc - beq mfspr64_scomc - cmpwi 
r4,chud_970_scomd - beq mfspr64_scomd - - b mfspr64_failure - -mfspr64_srr0: - mfspr r5,chud_ppc_srr0 - std r5,0(r3) - b mfspr64_success -mfspr64_srr1: - mfspr r5,chud_ppc_srr1 - std r5,0(r3) - b mfspr64_success -mfspr64_dar: - mfspr r5,chud_ppc_dar - std r5,0(r3) - b mfspr64_success -mfspr64_sdr1: - mfspr r5,chud_ppc_sdr1 - std r5,0(r3) - b mfspr64_success -mfspr64_sprg0: - mfspr r5,chud_ppc_sprg0 - std r5,0(r3) - b mfspr64_success -mfspr64_sprg1: - mfspr r5,chud_ppc_sprg1 - std r5,0(r3) - b mfspr64_success -mfspr64_sprg2: - mfspr r5,chud_ppc_sprg2 - std r5,0(r3) - b mfspr64_success -mfspr64_sprg3: - mfspr r5,chud_ppc_sprg3 - std r5,0(r3) - b mfspr64_success -mfspr64_asr: - mfspr r5,chud_ppc64_asr - std r5,0(r3) - b mfspr64_success -mfspr64_dabr: - mfspr r5,chud_ppc_dabr - std r5,0(r3) - b mfspr64_success -mfspr64_hid0: - mfspr r5,chud_970_hid0 - std r5,0(r3) - b mfspr64_success -mfspr64_hid1: - mfspr r5,chud_970_hid1 - std r5,0(r3) - b mfspr64_success -mfspr64_hid4: - mfspr r5,chud_970_hid4 - std r5,0(r3) - b mfspr64_success -mfspr64_hid5: - mfspr r5,chud_970_hid5 - std r5,0(r3) - b mfspr64_success -mfspr64_mmcr0: - mfspr r5,chud_970_mmcr0 - std r5,0(r3) - b mfspr64_success -mfspr64_mmcr1: - mfspr r5,chud_970_mmcr1 - std r5,0(r3) - b mfspr64_success -mfspr64_mmcra: - mfspr r5,chud_970_mmcra - std r5,0(r3) - b mfspr64_success -mfspr64_siar: - mfspr r5,chud_970_siar - std r5,0(r3) - b mfspr64_success -mfspr64_sdar: - mfspr r5,chud_970_sdar - std r5,0(r3) - b mfspr64_success -mfspr64_imc: - mfspr r5,chud_970_imc - std r5,0(r3) - b mfspr64_success -mfspr64_rmor: - mfspr r5,chud_970_rmor - std r5,0(r3) - b mfspr64_success -mfspr64_hrmor: - mfspr r5,chud_970_hrmor - std r5,0(r3) - b mfspr64_success -mfspr64_hior: - mfspr r5,chud_970_hior - std r5,0(r3) - b mfspr64_success -mfspr64_lpidr: - mfspr r5,chud_970_lpidr - std r5,0(r3) - b mfspr64_success -mfspr64_lpcr: - mfspr r5,chud_970_lpcr - std r5,0(r3) - b mfspr64_success -mfspr64_dabrx: - mfspr r5,chud_970_dabrx - std 
r5,0(r3) - b mfspr64_success -mfspr64_hsprg0: - mfspr r5,chud_970_hsprg0 - std r5,0(r3) - b mfspr64_success -mfspr64_hsprg1: - mfspr r5,chud_970_hsprg1 - std r5,0(r3) - b mfspr64_success -mfspr64_hsrr0: - mfspr r5,chud_970_hsrr0 - std r5,0(r3) - b mfspr64_success -mfspr64_hsrr1: - mfspr r5,chud_970_hsrr1 - std r5,0(r3) - b mfspr64_success -mfspr64_hdec: - mfspr r5,chud_970_hdec - std r5,0(r3) - b mfspr64_success -mfspr64_trig0: - mfspr r5,chud_970_trig0 - std r5,0(r3) - b mfspr64_success -mfspr64_trig1: - mfspr r5,chud_970_trig1 - std r5,0(r3) - b mfspr64_success -mfspr64_trig2: - mfspr r5,chud_970_trig2 - std r5,0(r3) - b mfspr64_success -mfspr64_accr: - mfspr r5,chud_ppc64_accr - std r5,0(r3) - b mfspr64_success -mfspr64_scomc: - mfspr r5,chud_970_scomc - std r5,0(r3) - b mfspr64_success -mfspr64_scomd: - mfspr r5,chud_970_scomd - std r5,0(r3) - b mfspr64_success - -mfspr64_failure: - li r3,KERN_FAILURE - blr - -mfspr64_success: - li r3,KERN_SUCCESS - blr - - -/* - * kern_return_t mtspr64(int spr, uint64_t *val); - * - * r3: spr to write to - * r4: address to get value from - * - */ - -; Force a line boundry here - .align 5 - .globl EXT(mtspr64) - -EXT(mtspr64): - ;; generic PPC 64-bit wide SPRs - cmpwi r3,chud_ppc_srr0 - beq mtspr64_srr0 - cmpwi r3,chud_ppc_srr1 - beq mtspr64_srr1 - cmpwi r3,chud_ppc_dar - beq mtspr64_dar - cmpwi r3,chud_ppc_sdr1 - beq mtspr64_sdr1 - cmpwi r3,chud_ppc_sprg0 - beq mtspr64_sprg0 - cmpwi r3,chud_ppc_sprg1 - beq mtspr64_sprg1 - cmpwi r3,chud_ppc_sprg2 - beq mtspr64_sprg2 - cmpwi r3,chud_ppc_sprg3 - beq mtspr64_sprg3 - cmpwi r3,chud_ppc64_asr - beq mtspr64_asr - cmpwi r3,chud_ppc_dabr - beq mtspr64_dabr - - ;; GPUL specific 64-bit wide SPRs - cmpwi r3,chud_970_hid0 - beq mtspr64_hid0 - cmpwi r3,chud_970_hid1 - beq mtspr64_hid1 - cmpwi r3,chud_970_hid4 - beq mtspr64_hid4 - cmpwi r3,chud_970_hid5 - beq mtspr64_hid5 - cmpwi r3,chud_970_mmcr0 - beq mtspr64_mmcr0 - cmpwi r3,chud_970_mmcr1 - beq mtspr64_mmcr1 - cmpwi r3,chud_970_mmcra - 
beq mtspr64_mmcra - cmpwi r3,chud_970_siar - beq mtspr64_siar - cmpwi r3,chud_970_sdar - beq mtspr64_sdar - cmpwi r3,chud_970_imc - beq mtspr64_imc - cmpwi r3,chud_970_rmor - beq mtspr64_rmor - cmpwi r3,chud_970_hrmor - beq mtspr64_hrmor - cmpwi r3,chud_970_hior - beq mtspr64_hior - cmpwi r3,chud_970_lpidr - beq mtspr64_lpidr - cmpwi r3,chud_970_lpcr - beq mtspr64_lpcr - cmpwi r3,chud_970_dabrx - beq mtspr64_dabrx - cmpwi r3,chud_970_hsprg0 - beq mtspr64_hsprg0 - cmpwi r3,chud_970_hsprg1 - beq mtspr64_hsprg1 - cmpwi r3,chud_970_hsrr0 - beq mtspr64_hsrr0 - cmpwi r3,chud_970_hsrr1 - beq mtspr64_hsrr1 - cmpwi r3,chud_970_hdec - beq mtspr64_hdec - cmpwi r3,chud_970_trig0 - beq mtspr64_trig0 - cmpwi r3,chud_970_trig1 - beq mtspr64_trig1 - cmpwi r3,chud_970_trig2 - beq mtspr64_trig2 - cmpwi r3,chud_ppc64_accr - beq mtspr64_accr - cmpwi r3,chud_970_scomc - beq mtspr64_scomc - cmpwi r3,chud_970_scomd - beq mtspr64_scomd - - b mtspr64_failure - -mtspr64_srr0: - ld r5,0(r4) - mtspr chud_ppc_srr0,r5 - b mtspr64_success -mtspr64_srr1: - ld r5,0(r4) - mtspr chud_ppc_srr1,r5 - b mtspr64_success -mtspr64_dar: - ld r5,0(r4) - mtspr chud_ppc_dar,r5 - b mtspr64_success -mtspr64_sdr1: - ld r5,0(r4) - mtspr chud_ppc_sdr1,r5 - b mtspr64_success -mtspr64_sprg0: - ld r5,0(r4) - mtspr chud_ppc_sprg0,r5 - b mtspr64_success -mtspr64_sprg1: - ld r5,0(r4) - mtspr chud_ppc_sprg1,r5 - b mtspr64_success -mtspr64_sprg2: - ld r5,0(r4) - mtspr chud_ppc_sprg2,r5 - b mtspr64_success -mtspr64_sprg3: - ld r5,0(r4) - mtspr chud_ppc_sprg3,r5 - b mtspr64_success -mtspr64_asr: - ld r5,0(r4) - mtspr chud_ppc64_asr,r5 - b mtspr64_success -mtspr64_dabr: - ld r5,0(r4) - mtspr chud_ppc_dabr,r5 - b mtspr64_success -mtspr64_hid0: - ld r5,0(r4) - sync - mtspr chud_970_hid0,r5 - mfspr r5,chud_970_hid0 /* syncronization requirements */ - mfspr r5,chud_970_hid0 - mfspr r5,chud_970_hid0 - mfspr r5,chud_970_hid0 - mfspr r5,chud_970_hid0 - mfspr r5,chud_970_hid0 - b mtspr64_success -mtspr64_hid1: - ld r5,0(r4) - mtspr 
chud_970_hid1,r5 /* tell you twice */ - mtspr chud_970_hid1,r5 - isync - b mtspr64_success -mtspr64_hid4: - ld r5,0(r4) - sync /* syncronization requirements */ - mtspr chud_970_hid4,r5 - isync - b mtspr64_success -mtspr64_hid5: - ld r5,0(r4) - mtspr chud_970_hid5,r5 - b mtspr64_success -mtspr64_mmcr0: - ld r5,0(r4) - mtspr chud_970_mmcr0,r5 - b mtspr64_success -mtspr64_mmcr1: - ld r5,0(r4) - mtspr chud_970_mmcr1,r5 - b mtspr64_success -mtspr64_mmcra: - ld r5,0(r4) - mtspr chud_970_mmcra,r5 - b mtspr64_success -mtspr64_siar: - ld r5,0(r4) - mtspr chud_970_siar,r5 - b mtspr64_success -mtspr64_sdar: - ld r5,0(r4) - mtspr chud_970_sdar,r5 - b mtspr64_success -mtspr64_imc: - ld r5,0(r4) - mtspr chud_970_imc,r5 - b mtspr64_success -mtspr64_rmor: - ld r5,0(r4) - mtspr chud_970_rmor,r5 - b mtspr64_success -mtspr64_hrmor: - ld r5,0(r4) - mtspr chud_970_hrmor,r5 - b mtspr64_success -mtspr64_hior: - ld r5,0(r4) - mtspr chud_970_hior,r5 - b mtspr64_success -mtspr64_lpidr: - ld r5,0(r4) - mtspr chud_970_lpidr,r5 - b mtspr64_success -mtspr64_lpcr: - ld r5,0(r4) - mtspr chud_970_lpcr,r5 - b mtspr64_success -mtspr64_dabrx: - ld r5,0(r4) - mtspr chud_970_dabrx,r5 - b mtspr64_success -mtspr64_hsprg0: - ld r5,0(r4) - mtspr chud_970_hsprg0,r5 - b mtspr64_success -mtspr64_hsprg1: - ld r5,0(r4) - mtspr chud_970_hsprg1,r5 - b mtspr64_success -mtspr64_hsrr0: - ld r5,0(r4) - mtspr chud_970_hsrr0,r5 - b mtspr64_success -mtspr64_hsrr1: - ld r5,0(r4) - mtspr chud_970_hsrr1,r5 - b mtspr64_success -mtspr64_hdec: - ld r5,0(r4) - mtspr chud_970_hdec,r5 - b mtspr64_success -mtspr64_trig0: - ld r5,0(r4) - mtspr chud_970_trig0,r5 - b mtspr64_success -mtspr64_trig1: - ld r5,0(r4) - mtspr chud_970_trig1,r5 - b mtspr64_success -mtspr64_trig2: - ld r5,0(r4) - mtspr chud_970_trig2,r5 - b mtspr64_success -mtspr64_accr: - ld r5,0(r4) - mtspr chud_ppc64_accr,r5 - b mtspr64_success -mtspr64_scomc: - ld r5,0(r4) - mtspr chud_970_scomc,r5 - b mtspr64_success -mtspr64_scomd: - ld r5,0(r4) - mtspr 
chud_970_scomd,r5 - b mtspr64_success - -mtspr64_failure: - li r3,KERN_FAILURE - blr - -mtspr64_success: - li r3,KERN_SUCCESS - blr - - -/* - * kern_return_t mfmsr64(uint64_t *val); - * - * r3: address to store value in - * - */ - -; Force a line boundry here - .align 5 - .globl EXT(mfmsr64) - -EXT(mfmsr64): - mfmsr r5 - std r5,0(r3) -mfmsr64_success: - li r3,KERN_SUCCESS - blr - -mfmsr64_failure: - li r3,KERN_FAILURE - blr - - -/* - * kern_return_t mtmsr64(uint64_t *val); - * - * r3: address to load value from - * - */ - -; Force a line boundry here - .align 5 - .globl EXT(mtmsr64) - -EXT(mtmsr64): - ld r5,0(r3) - mtmsrd r5 - b mtmsr64_success - -mtmsr64_success: - li r3,KERN_SUCCESS - blr - -mtmsr64_failure: - li r3,KERN_FAILURE - blr - -.L_end: diff --git a/osfmk/chud/ppc/chud_cpu_ppc.c b/osfmk/chud/ppc/chud_cpu_ppc.c deleted file mode 100644 index 60f279c3f..000000000 --- a/osfmk/chud/ppc/chud_cpu_ppc.c +++ /dev/null @@ -1,1182 +0,0 @@ -/* - * Copyright (c) 2003-2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -// the macros in proc_reg.h fail with "expression must be absolute" - -#undef mtsprg -#undef mfsprg -#define mtsprg(n, reg) __asm__ volatile("mtsprg " # n ", %0" : : "r" (reg)) -#define mfsprg(reg, n) __asm__ volatile("mfsprg %0, " # n : "=r" (reg)) - -#undef mtspr -#undef mfspr -#define mtspr(spr, reg) __asm__ volatile ("mtspr %0, %1" : : "n" (spr), "r" (reg)) -#define mfspr(reg, spr) __asm__ volatile("mfspr %0, %1" : "=r" (reg) : "n" (spr)); - -#undef mtsr -#undef mfsr -#define mtsr(sr, reg) __asm__ volatile("sync" "@" "mtsr sr%0, %1 " "@" "isync" : : "i" (sr), "r" (reg)); -#define mfsr(reg, sr) __asm__ volatile("mfsr %0, sr%1" : "=r" (reg) : "i" (sr)); - -#if 0 -#pragma mark **** cpu enable/disable **** -#endif - -extern kern_return_t processor_start(processor_t processor); // osfmk/kern/processor.c -extern kern_return_t processor_exit(processor_t processor); // osfmk/kern/processor.c - -__private_extern__ -kern_return_t chudxnu_enable_cpu(int cpu, boolean_t enable) -{ - chudxnu_unbind_thread(current_thread(), 0); - - if(cpu<0 || cpu>=chudxnu_phys_cpu_count()) { // check sanity of cpu argument - return KERN_FAILURE; - } - - if((PerProcTable[cpu].ppe_vaddr != (struct per_proc_info *)NULL) - && cpu != master_cpu) { - processor_t processor = cpu_to_processor(cpu); - - if(enable) { - return processor_start(processor); - } else { - 
return processor_exit(processor); - } - } - return KERN_FAILURE; -} - -#if 0 -#pragma mark **** nap **** -#endif - -__private_extern__ -kern_return_t chudxnu_enable_cpu_nap(int cpu, boolean_t enable) -{ - if(cpu<0 || cpu>=chudxnu_phys_cpu_count()) { // check sanity of cpu argument - return KERN_FAILURE; - } - - if(PerProcTable[cpu].ppe_vaddr != (struct per_proc_info *)NULL) { - ml_enable_nap(cpu, enable); - return KERN_SUCCESS; - } - - return KERN_FAILURE; -} - -__private_extern__ -boolean_t chudxnu_cpu_nap_enabled(int cpu) -{ - boolean_t prev; - - if(cpu<0 || cpu>=chudxnu_phys_cpu_count()) { // check sanity of cpu argument - cpu = 0; - } - - prev = ml_enable_nap(cpu, TRUE); - ml_enable_nap(cpu, prev); - - return prev; -} - -#if 0 -#pragma mark **** shadowed spr **** -#endif - -__private_extern__ -kern_return_t chudxnu_set_shadowed_spr(int cpu, int spr, uint32_t val) -{ - cpu_subtype_t target_cpu_subtype; - uint32_t available; - kern_return_t retval = KERN_FAILURE; - struct per_proc_info *per_proc; - boolean_t didBind = FALSE; - - if(cpu>=chudxnu_phys_cpu_count()) { // check sanity of cpu argument - return KERN_FAILURE; - } - - if(cpu<0) { // cpu<0 means don't bind (current cpu) - cpu = chudxnu_cpu_number(); - didBind = FALSE; - } else { - chudxnu_bind_thread(current_thread(), cpu, 0); - didBind = TRUE; - } - - per_proc = PerProcTable[cpu].ppe_vaddr; - available = per_proc->pf.Available; - target_cpu_subtype = per_proc->cpu_subtype; - - if(spr==chud_750_l2cr) { - switch(target_cpu_subtype) { - case CPU_SUBTYPE_POWERPC_750: - case CPU_SUBTYPE_POWERPC_7400: - case CPU_SUBTYPE_POWERPC_7450: - if(available & pfL2) { -// int enable = (val & 0x80000000) ? TRUE : FALSE; -// if(enable) { -// per_proc->pf.l2cr = val; -// } else { -// per_proc->pf.l2cr = 0; -// } - per_proc->pf.l2cr = val; - cacheInit(); - // mtspr(l2cr, per_proc->pf.l2cr); // XXXXXXX why is this necessary? 
XXXXXXX - retval = KERN_SUCCESS; - } else { - retval = KERN_FAILURE; - } - break; - default: - retval = KERN_INVALID_ARGUMENT; - break; - } - } - else if(spr==chud_7450_l3cr) { - switch(target_cpu_subtype) { - case CPU_SUBTYPE_POWERPC_7450: - if(available & pfL3) { - int enable = (val & 0x80000000) ? TRUE : FALSE; - if(enable) { - per_proc->pf.l3cr = val; - } else { - per_proc->pf.l3cr = 0; - } - cacheInit(); - retval = KERN_SUCCESS; - } else { - retval = KERN_FAILURE; - } - break; - default: - retval = KERN_INVALID_ARGUMENT; - break; - } - } - else if(spr==chud_750_hid0) { - switch(target_cpu_subtype) { - case CPU_SUBTYPE_POWERPC_750: - cacheInit(); - cacheDisable(); /* disable caches */ - mtspr(chud_750_hid0, val); - per_proc->pf.pfHID0 = val; - cacheInit(); /* reenable caches */ - retval = KERN_SUCCESS; - break; - case CPU_SUBTYPE_POWERPC_7400: - case CPU_SUBTYPE_POWERPC_7450: - mtspr(chud_750_hid0, val); - per_proc->pf.pfHID0 = val; - retval = KERN_SUCCESS; - break; - default: - retval = KERN_INVALID_ARGUMENT; - break; - } - } - else if(spr==chud_750_hid1) { - switch(target_cpu_subtype) { - case CPU_SUBTYPE_POWERPC_750: - case CPU_SUBTYPE_POWERPC_7400: - case CPU_SUBTYPE_POWERPC_7450: - mtspr(chud_750_hid1, val); - per_proc->pf.pfHID1 = val; - retval = KERN_SUCCESS; - break; - default: - retval = KERN_INVALID_ARGUMENT; - break; - } - } - else if(spr==chud_750fx_hid2 && target_cpu_subtype==CPU_SUBTYPE_POWERPC_750) { - mtspr(chud_750fx_hid2, val); - per_proc->pf.pfHID2 = val; - retval = KERN_SUCCESS; - } - else if(spr==chud_7400_msscr0 && (target_cpu_subtype==CPU_SUBTYPE_POWERPC_7400 || target_cpu_subtype==CPU_SUBTYPE_POWERPC_7450)) { - mtspr(chud_7400_msscr0, val); - per_proc->pf.pfMSSCR0 = val; - retval = KERN_SUCCESS; - } - else if(spr==chud_7400_msscr1 && (target_cpu_subtype==CPU_SUBTYPE_POWERPC_7400 || target_cpu_subtype==CPU_SUBTYPE_POWERPC_7450)) { // called msssr0 on 7450 - mtspr(chud_7400_msscr1, val); - per_proc->pf.pfMSSCR1 = val; - retval = 
KERN_SUCCESS; - } - else if(spr==chud_7450_ldstcr && target_cpu_subtype==CPU_SUBTYPE_POWERPC_7450) { - mtspr(chud_7450_ldstcr, val); - per_proc->pf.pfLDSTCR = val; - retval = KERN_SUCCESS; - } - else if(spr==chud_7450_ictrl && target_cpu_subtype==CPU_SUBTYPE_POWERPC_7450) { - mtspr(chud_7450_ictrl, val); - per_proc->pf.pfICTRL = val; - retval = KERN_SUCCESS; - } else { - retval = KERN_INVALID_ARGUMENT; - } - - if(didBind) { - chudxnu_unbind_thread(current_thread(), 0); - } - - return retval; -} - -__private_extern__ -kern_return_t chudxnu_set_shadowed_spr64(int cpu, int spr, uint64_t val) -{ - cpu_subtype_t target_cpu_subtype; - kern_return_t retval = KERN_FAILURE; - struct per_proc_info *per_proc; - boolean_t didBind = FALSE; - - if(cpu>=chudxnu_phys_cpu_count()) { // check sanity of cpu argument - return KERN_FAILURE; - } - - if(cpu<0) { // cpu<0 means don't bind (current cpu) - cpu = chudxnu_cpu_number(); - didBind = FALSE; - } else { - chudxnu_bind_thread(current_thread(), cpu, 0); - didBind = TRUE; - } - - per_proc = PerProcTable[cpu].ppe_vaddr; - target_cpu_subtype = per_proc->cpu_subtype; - - if(spr==chud_970_hid0) { - switch(target_cpu_subtype) { - case CPU_SUBTYPE_POWERPC_970: - mtspr64(chud_970_hid0, &val); - per_proc->pf.pfHID0 = val; - retval = KERN_SUCCESS; - break; - default: - retval = KERN_INVALID_ARGUMENT; - break; - } - } - else if(spr==chud_970_hid1) { - switch(target_cpu_subtype) { - case CPU_SUBTYPE_POWERPC_970: - mtspr64(chud_970_hid1, &val); - per_proc->pf.pfHID1 = val; - retval = KERN_SUCCESS; - break; - default: - retval = KERN_INVALID_ARGUMENT; - break; - } - } - else if(spr==chud_970_hid4) { - switch(target_cpu_subtype) { - case CPU_SUBTYPE_POWERPC_970: - mtspr64(chud_970_hid4, &val); - per_proc->pf.pfHID4 = val; - retval = KERN_SUCCESS; - break; - default: - retval = KERN_INVALID_ARGUMENT; - break; - } - } - else if(spr==chud_970_hid5) { - switch(target_cpu_subtype) { - case CPU_SUBTYPE_POWERPC_970: - mtspr64(chud_970_hid5, &val); - 
per_proc->pf.pfHID5 = val; - retval = KERN_SUCCESS; - break; - default: - retval = KERN_INVALID_ARGUMENT; - break; - } - } else { - retval = KERN_INVALID_ARGUMENT; - } - - if(didBind) { - chudxnu_unbind_thread(current_thread(), 0); - } - - return retval; -} - -__private_extern__ -uint32_t chudxnu_get_orig_cpu_l2cr(int cpu) -{ - if(cpu<0 || cpu>=chudxnu_phys_cpu_count()) { // check sanity of cpu argument - cpu = 0; - } - return PerProcTable[cpu].ppe_vaddr->pf.l2crOriginal; -} - -__private_extern__ -uint32_t chudxnu_get_orig_cpu_l3cr(int cpu) -{ - if(cpu<0 || cpu>=chudxnu_phys_cpu_count()) { // check sanity of cpu argument - cpu = 0; - } - return PerProcTable[cpu].ppe_vaddr->pf.l3crOriginal; -} - -#if 0 -#pragma mark **** spr **** -#endif - -__private_extern__ -kern_return_t chudxnu_read_spr(int cpu, int spr, uint32_t *val_p) -{ - kern_return_t retval = KERN_SUCCESS; - boolean_t oldlevel; - uint32_t val = 0xFFFFFFFF; - - /* bind to requested CPU */ - if(cpu>=0 && !(ml_at_interrupt_context() && cpu_number() == cpu)) { // cpu<0 means don't bind - if(chudxnu_bind_thread(current_thread(), cpu, 0)!=KERN_SUCCESS) { - return KERN_INVALID_ARGUMENT; - } - } - - oldlevel = chudxnu_set_interrupts_enabled(FALSE); /* disable interrupts */ - - do { - /* PPC SPRs - 32-bit and 64-bit implementations */ - if(spr==chud_ppc_srr0) { mfspr(val, chud_ppc_srr0); break; } - if(spr==chud_ppc_srr1) { mfspr(val, chud_ppc_srr1); break; } - if(spr==chud_ppc_dsisr) { mfspr(val, chud_ppc_dsisr); break; } - if(spr==chud_ppc_dar) { mfspr(val, chud_ppc_dar); break; } - if(spr==chud_ppc_dec) { mfspr(val, chud_ppc_dec); break; } - if(spr==chud_ppc_sdr1) { mfspr(val, chud_ppc_sdr1); break; } - if(spr==chud_ppc_sprg0) { mfspr(val, chud_ppc_sprg0); break; } - if(spr==chud_ppc_sprg1) { mfspr(val, chud_ppc_sprg1); break; } - if(spr==chud_ppc_sprg2) { mfspr(val, chud_ppc_sprg2); break; } - if(spr==chud_ppc_sprg3) { mfspr(val, chud_ppc_sprg3); break; } - if(spr==chud_ppc_ear) { mfspr(val, chud_ppc_ear); 
break; } - if(spr==chud_ppc_tbl) { mfspr(val, 268); break; } /* timebase consists of read registers and write registers */ - if(spr==chud_ppc_tbu) { mfspr(val, 269); break; } - if(spr==chud_ppc_pvr) { mfspr(val, chud_ppc_pvr); break; } - if(spr==chud_ppc_ibat0u) { mfspr(val, chud_ppc_ibat0u); break; } - if(spr==chud_ppc_ibat0l) { mfspr(val, chud_ppc_ibat0l); break; } - if(spr==chud_ppc_ibat1u) { mfspr(val, chud_ppc_ibat1u); break; } - if(spr==chud_ppc_ibat1l) { mfspr(val, chud_ppc_ibat1l); break; } - if(spr==chud_ppc_ibat2u) { mfspr(val, chud_ppc_ibat2u); break; } - if(spr==chud_ppc_ibat2l) { mfspr(val, chud_ppc_ibat2l); break; } - if(spr==chud_ppc_ibat3u) { mfspr(val, chud_ppc_ibat3u); break; } - if(spr==chud_ppc_ibat3l) { mfspr(val, chud_ppc_ibat3l); break; } - if(spr==chud_ppc_dbat0u) { mfspr(val, chud_ppc_dbat0u); break; } - if(spr==chud_ppc_dbat0l) { mfspr(val, chud_ppc_dbat0l); break; } - if(spr==chud_ppc_dbat1u) { mfspr(val, chud_ppc_dbat1u); break; } - if(spr==chud_ppc_dbat1l) { mfspr(val, chud_ppc_dbat1l); break; } - if(spr==chud_ppc_dbat2u) { mfspr(val, chud_ppc_dbat2u); break; } - if(spr==chud_ppc_dbat2l) { mfspr(val, chud_ppc_dbat2l); break; } - if(spr==chud_ppc_dbat3u) { mfspr(val, chud_ppc_dbat3u); break; } - if(spr==chud_ppc_dbat3l) { mfspr(val, chud_ppc_dbat3l); break; } - if(spr==chud_ppc_dabr) { mfspr(val, chud_ppc_dabr); break; } - if(spr==chud_ppc_msr) { /* this is the MSR for the calling process */ - struct ppc_thread_state64 state; - mach_msg_type_number_t count = PPC_THREAD_STATE64_COUNT; - kern_return_t kr; - kr = chudxnu_thread_get_state(current_thread(), PPC_THREAD_STATE64, (thread_state_t)&state, &count, TRUE /* user only */); - if(KERN_SUCCESS==kr) { - val = state.srr1; - } else { - retval = KERN_FAILURE; - } - break; - } - - /* PPC SPRs - 32-bit implementations */ - if(spr==chud_ppc32_sr0) { mfsr(val, 0); break; } - if(spr==chud_ppc32_sr1) { mfsr(val, 1); break; } - if(spr==chud_ppc32_sr2) { mfsr(val, 2); break; } - 
if(spr==chud_ppc32_sr3) { mfsr(val, 3); break; } - if(spr==chud_ppc32_sr4) { mfsr(val, 4); break; } - if(spr==chud_ppc32_sr5) { mfsr(val, 5); break; } - if(spr==chud_ppc32_sr6) { mfsr(val, 6); break; } - if(spr==chud_ppc32_sr7) { mfsr(val, 7); break; } - if(spr==chud_ppc32_sr8) { mfsr(val, 8); break; } - if(spr==chud_ppc32_sr9) { mfsr(val, 9); break; } - if(spr==chud_ppc32_sr10) { mfsr(val, 10); break; } - if(spr==chud_ppc32_sr11) { mfsr(val, 11); break; } - if(spr==chud_ppc32_sr12) { mfsr(val, 12); break; } - if(spr==chud_ppc32_sr13) { mfsr(val, 13); break; } - if(spr==chud_ppc32_sr14) { mfsr(val, 14); break; } - if(spr==chud_ppc32_sr15) { mfsr(val, 15); break; } - - /* PPC SPRs - 64-bit implementations */ - if(spr==chud_ppc64_ctrl) { mfspr(val, chud_ppc64_ctrl); break; } - - /* Implementation Specific SPRs */ - if(cpu_subtype()==CPU_SUBTYPE_POWERPC_750) { - if(spr==chud_750_mmcr0) { mfspr(val, chud_750_mmcr0); break; } - if(spr==chud_750_pmc1) { mfspr(val, chud_750_pmc1); break; } - if(spr==chud_750_pmc2) { mfspr(val, chud_750_pmc2); break; } - if(spr==chud_750_sia) { mfspr(val, chud_750_sia); break; } - if(spr==chud_750_mmcr1) { mfspr(val, chud_750_mmcr1); break; } - if(spr==chud_750_pmc3) { mfspr(val, chud_750_pmc3); break; } - if(spr==chud_750_pmc4) { mfspr(val, chud_750_pmc4); break; } - if(spr==chud_750_hid0) { mfspr(val, chud_750_hid0); break; } - if(spr==chud_750_hid1) { mfspr(val, chud_750_hid1); break; } - if(spr==chud_750_iabr) { mfspr(val, chud_750_iabr); break; } - if(spr==chud_750_ictc) { mfspr(val, chud_750_ictc); break; } - if(spr==chud_750_thrm1) { mfspr(val, chud_750_thrm1); break; } - if(spr==chud_750_thrm2) { mfspr(val, chud_750_thrm2); break; } - if(spr==chud_750_thrm3) { mfspr(val, chud_750_thrm3); break; } - if(spr==chud_750_l2cr) { mfspr(val, chud_750_l2cr); break; } - - // 750FX only - if(spr==chud_750fx_ibat4u) { mfspr(val, chud_750fx_ibat4u); break; } - if(spr==chud_750fx_ibat4l) { mfspr(val, chud_750fx_ibat4l); break; } - 
if(spr==chud_750fx_ibat5u) { mfspr(val, chud_750fx_ibat5u); break; } - if(spr==chud_750fx_ibat5l) { mfspr(val, chud_750fx_ibat5l); break; } - if(spr==chud_750fx_ibat6u) { mfspr(val, chud_750fx_ibat6u); break; } - if(spr==chud_750fx_ibat6l) { mfspr(val, chud_750fx_ibat6l); break; } - if(spr==chud_750fx_ibat7u) { mfspr(val, chud_750fx_ibat7u); break; } - if(spr==chud_750fx_ibat7l) { mfspr(val, chud_750fx_ibat7l); break; } - if(spr==chud_750fx_dbat4u) { mfspr(val, chud_750fx_dbat4u); break; } - if(spr==chud_750fx_dbat4l) { mfspr(val, chud_750fx_dbat4l); break; } - if(spr==chud_750fx_dbat5u) { mfspr(val, chud_750fx_dbat5u); break; } - if(spr==chud_750fx_dbat5l) { mfspr(val, chud_750fx_dbat5l); break; } - if(spr==chud_750fx_dbat6u) { mfspr(val, chud_750fx_dbat6u); break; } - if(spr==chud_750fx_dbat6l) { mfspr(val, chud_750fx_dbat6l); break; } - if(spr==chud_750fx_dbat7u) { mfspr(val, chud_750fx_dbat7u); break; } - if(spr==chud_750fx_dbat7l) { mfspr(val, chud_750fx_dbat7l); break; } - - // 750FX >= DDR2.x only - if(spr==chud_750fx_hid2) { mfspr(val, chud_750fx_hid2); break; } - } - - if(cpu_subtype()==CPU_SUBTYPE_POWERPC_7400) { - if(spr==chud_7400_mmcr2) { mfspr(val, chud_7400_mmcr2); break; } - if(spr==chud_7400_bamr) { mfspr(val, chud_7400_bamr); break; } - if(spr==chud_7400_mmcr0) { mfspr(val, chud_7400_mmcr0); break; } - if(spr==chud_7400_pmc1) { mfspr(val, chud_7400_pmc1); break; } - if(spr==chud_7400_pmc2) { mfspr(val, chud_7400_pmc2); break; } - if(spr==chud_7400_siar) { mfspr(val, chud_7400_siar); break; } - if(spr==chud_7400_mmcr1) { mfspr(val, chud_7400_mmcr1); break; } - if(spr==chud_7400_pmc3) { mfspr(val, chud_7400_pmc3); break; } - if(spr==chud_7400_pmc4) { mfspr(val, chud_7400_pmc4); break; } - if(spr==chud_7400_hid0) { mfspr(val, chud_7400_hid0); break; } - if(spr==chud_7400_hid1) { mfspr(val, chud_7400_hid1); break; } - if(spr==chud_7400_iabr) { mfspr(val, chud_7400_iabr); break; } - if(spr==chud_7400_msscr0) { mfspr(val, chud_7400_msscr0); break; } - 
if(spr==chud_7400_msscr1) { mfspr(val, chud_7400_msscr1); break; } /* private */ - if(spr==chud_7400_ictc) { mfspr(val, chud_7400_ictc); break; } - if(spr==chud_7400_thrm1) { mfspr(val, chud_7400_thrm1); break; } - if(spr==chud_7400_thrm2) { mfspr(val, chud_7400_thrm2); break; } - if(spr==chud_7400_thrm3) { mfspr(val, chud_7400_thrm3); break; } - if(spr==chud_7400_pir) { mfspr(val, chud_7400_pir); break; } - if(spr==chud_7400_l2cr) { mfspr(val, chud_7400_l2cr); break; } - - // 7410 only - if(spr==chud_7410_l2pmcr) { mfspr(val, chud_7410_l2pmcr); break; } - } - - if(cpu_subtype()==CPU_SUBTYPE_POWERPC_7450) { - if(spr==chud_7450_mmcr2) { mfspr(val, chud_7450_mmcr2); break; } - if(spr==chud_7450_pmc5) { mfspr(val, chud_7450_pmc5); break; } - if(spr==chud_7450_pmc6) { mfspr(val, chud_7450_pmc6); break; } - if(spr==chud_7450_bamr) { mfspr(val, chud_7450_bamr); break; } - if(spr==chud_7450_mmcr0) { mfspr(val, chud_7450_mmcr0); break; } - if(spr==chud_7450_pmc1) { mfspr(val, chud_7450_pmc1); break; } - if(spr==chud_7450_pmc2) { mfspr(val, chud_7450_pmc2); break; } - if(spr==chud_7450_siar) { mfspr(val, chud_7450_siar); break; } - if(spr==chud_7450_mmcr1) { mfspr(val, chud_7450_mmcr1); break; } - if(spr==chud_7450_pmc3) { mfspr(val, chud_7450_pmc3); break; } - if(spr==chud_7450_pmc4) { mfspr(val, chud_7450_pmc4); break; } - if(spr==chud_7450_tlbmiss) { mfspr(val, chud_7450_tlbmiss); break; } - if(spr==chud_7450_ptehi) { mfspr(val, chud_7450_ptehi); break; } - if(spr==chud_7450_ptelo) { mfspr(val, chud_7450_ptelo); break; } - if(spr==chud_7450_l3pm) { mfspr(val, chud_7450_l3pm); break; } - if(spr==chud_7450_hid0) { mfspr(val, chud_7450_hid0); break; } - if(spr==chud_7450_hid1) { mfspr(val, chud_7450_hid1); break; } - if(spr==chud_7450_iabr) { mfspr(val, chud_7450_iabr); break; } - if(spr==chud_7450_ldstdb) { mfspr(val, chud_7450_ldstdb); break; } - if(spr==chud_7450_msscr0) { mfspr(val, chud_7450_msscr0); break; } - if(spr==chud_7450_msssr0) { mfspr(val, chud_7450_msssr0); 
break; } - if(spr==chud_7450_ldstcr) { mfspr(val, chud_7450_ldstcr); break; } - if(spr==chud_7450_ictc) { mfspr(val, chud_7450_ictc); break; } - if(spr==chud_7450_ictrl) { mfspr(val, chud_7450_ictrl); break; } - if(spr==chud_7450_thrm1) { mfspr(val, chud_7450_thrm1); break; } - if(spr==chud_7450_thrm2) { mfspr(val, chud_7450_thrm2); break; } - if(spr==chud_7450_thrm3) { mfspr(val, chud_7450_thrm3); break; } - if(spr==chud_7450_pir) { mfspr(val, chud_7450_pir); break; } - if(spr==chud_7450_l2cr) { mfspr(val, chud_7450_l2cr); break; } - if(spr==chud_7450_l3cr) { mfspr(val, chud_7450_l3cr); break; } - - // 7455/7457 only - if(spr==chud_7455_sprg4) { mfspr(val, chud_7455_sprg4); break; } - if(spr==chud_7455_sprg5) { mfspr(val, chud_7455_sprg5); break; } - if(spr==chud_7455_sprg6) { mfspr(val, chud_7455_sprg6); break; } - if(spr==chud_7455_sprg7) { mfspr(val, chud_7455_sprg7); break; } - if(spr==chud_7455_ibat4u) { mfspr(val, chud_7455_ibat4u); break; } - if(spr==chud_7455_ibat4l) { mfspr(val, chud_7455_ibat4l); break; } - if(spr==chud_7455_ibat5u) { mfspr(val, chud_7455_ibat5u); break; } - if(spr==chud_7455_ibat5l) { mfspr(val, chud_7455_ibat5l); break; } - if(spr==chud_7455_ibat6u) { mfspr(val, chud_7455_ibat6u); break; } - if(spr==chud_7455_ibat6l) { mfspr(val, chud_7455_ibat6l); break; } - if(spr==chud_7455_ibat7u) { mfspr(val, chud_7455_ibat7u); break; } - if(spr==chud_7455_ibat7l) { mfspr(val, chud_7455_ibat7l); break; } - if(spr==chud_7455_dbat4u) { mfspr(val, chud_7455_dbat4u); break; } - if(spr==chud_7455_dbat4l) { mfspr(val, chud_7455_dbat4l); break; } - if(spr==chud_7455_dbat5u) { mfspr(val, chud_7455_dbat5u); break; } - if(spr==chud_7455_dbat5l) { mfspr(val, chud_7455_dbat5l); break; } - if(spr==chud_7455_dbat6u) { mfspr(val, chud_7455_dbat6u); break; } - if(spr==chud_7455_dbat6l) { mfspr(val, chud_7455_dbat6l); break; } - if(spr==chud_7455_dbat7u) { mfspr(val, chud_7455_dbat7u); break; } - if(spr==chud_7455_dbat7l) { mfspr(val, chud_7455_dbat7l); break; } - 
} - - if(cpu_subtype()==CPU_SUBTYPE_POWERPC_970) { - if(spr==chud_970_pir) { mfspr(val, chud_970_pir); break; } - if(spr==chud_970_pmc1) { mfspr(val, chud_970_pmc1); break; } - if(spr==chud_970_pmc2) { mfspr(val, chud_970_pmc2); break; } - if(spr==chud_970_pmc3) { mfspr(val, chud_970_pmc3); break; } - if(spr==chud_970_pmc4) { mfspr(val, chud_970_pmc4); break; } - if(spr==chud_970_pmc5) { mfspr(val, chud_970_pmc5); break; } - if(spr==chud_970_pmc6) { mfspr(val, chud_970_pmc6); break; } - if(spr==chud_970_pmc7) { mfspr(val, chud_970_pmc7); break; } - if(spr==chud_970_pmc8) { mfspr(val, chud_970_pmc8); break; } - if(spr==chud_970_hdec) { mfspr(val, chud_970_hdec); break; } - } - - /* we only get here if none of the above cases qualify */ - retval = KERN_INVALID_ARGUMENT; - } while(0); - - chudxnu_set_interrupts_enabled(oldlevel); /* enable interrupts */ - - if(cpu>=0) { // cpu<0 means don't bind - chudxnu_unbind_thread(current_thread(), 0); - } - - *val_p = val; - - return retval; -} - -__private_extern__ -kern_return_t chudxnu_read_spr64(int cpu, int spr, uint64_t *val_p) -{ - kern_return_t retval = KERN_SUCCESS; - boolean_t oldlevel; - - /* bind to requested CPU */ - if(cpu>=0 && !(ml_at_interrupt_context() && cpu_number() == cpu)) { // cpu<0 means don't bind - if(chudxnu_bind_thread(current_thread(), cpu, 0)!=KERN_SUCCESS) { - return KERN_INVALID_ARGUMENT; - } - } - - oldlevel = chudxnu_set_interrupts_enabled(FALSE); /* disable interrupts */ - - do { - /* PPC SPRs - 32-bit and 64-bit implementations */ - if(spr==chud_ppc_srr0) { retval = mfspr64(val_p, chud_ppc_srr0); break; } - if(spr==chud_ppc_srr1) { retval = mfspr64(val_p, chud_ppc_srr1); break; } - if(spr==chud_ppc_dar) { retval = mfspr64(val_p, chud_ppc_dar); break; } - if(spr==chud_ppc_dsisr) { retval = mfspr64(val_p, chud_ppc_dsisr); break; } - if(spr==chud_ppc_sdr1) { retval = mfspr64(val_p, chud_ppc_sdr1); break; } - if(spr==chud_ppc_sprg0) { retval = mfspr64(val_p, chud_ppc_sprg0); break; } - 
if(spr==chud_ppc_sprg1) { retval = mfspr64(val_p, chud_ppc_sprg1); break; } - if(spr==chud_ppc_sprg2) { retval = mfspr64(val_p, chud_ppc_sprg2); break; } - if(spr==chud_ppc_sprg3) { retval = mfspr64(val_p, chud_ppc_sprg3); break; } - if(spr==chud_ppc_dabr) { retval = mfspr64(val_p, chud_ppc_dabr); break; } - if(spr==chud_ppc_msr) { /* this is the MSR for the calling process */ - struct ppc_thread_state64 state; - mach_msg_type_number_t count = PPC_THREAD_STATE64_COUNT; - kern_return_t kr; - kr = chudxnu_thread_get_state(current_thread(), PPC_THREAD_STATE64, (thread_state_t)&state, &count, TRUE /* user only */); - if(KERN_SUCCESS==kr) { - *val_p = state.srr1; - } else { - retval = KERN_FAILURE; - } - break; - } - - /* PPC SPRs - 64-bit implementations */ - if(spr==chud_ppc64_asr) { retval = mfspr64(val_p, chud_ppc64_asr); break; } - if(spr==chud_ppc64_accr) { retval = mfspr64(val_p, chud_ppc64_accr); break; } - - /* Implementation Specific SPRs */ - if(cpu_subtype()==CPU_SUBTYPE_POWERPC_970) { - if(spr==chud_970_hid0) { retval = mfspr64(val_p, chud_970_hid0); break; } - if(spr==chud_970_hid1) { retval = mfspr64(val_p, chud_970_hid1); break; } - if(spr==chud_970_hid4) { retval = mfspr64(val_p, chud_970_hid4); break; } - if(spr==chud_970_hid5) { retval = mfspr64(val_p, chud_970_hid5); break; } - if(spr==chud_970_mmcr0) { retval = mfspr64(val_p, chud_970_mmcr0); break; } - if(spr==chud_970_mmcr1) { retval = mfspr64(val_p, chud_970_mmcr1); break; } - if(spr==chud_970_mmcra) { retval = mfspr64(val_p, chud_970_mmcra); break; } - if(spr==chud_970_siar) { retval = mfspr64(val_p, chud_970_siar); break; } - if(spr==chud_970_sdar) { retval = mfspr64(val_p, chud_970_sdar); break; } - if(spr==chud_970_imc) { retval = mfspr64(val_p, chud_970_imc); break; } - if(spr==chud_970_rmor) { retval = mfspr64(val_p, chud_970_rmor); break; } - if(spr==chud_970_hrmor) { retval = mfspr64(val_p, chud_970_hrmor); break; } - if(spr==chud_970_hior) { retval = mfspr64(val_p, chud_970_hior); break; 
} - if(spr==chud_970_lpidr) { retval = mfspr64(val_p, chud_970_lpidr); break; } - if(spr==chud_970_lpcr) { retval = mfspr64(val_p, chud_970_lpcr); break; } - if(spr==chud_970_dabrx) { retval = mfspr64(val_p, chud_970_dabrx); break; } - if(spr==chud_970_hsprg0) { retval = mfspr64(val_p, chud_970_hsprg0); break; } - if(spr==chud_970_hsprg1) { retval = mfspr64(val_p, chud_970_hsprg1); break; } - if(spr==chud_970_hsrr0) { retval = mfspr64(val_p, chud_970_hsrr0); break; } - if(spr==chud_970_hsrr1) { retval = mfspr64(val_p, chud_970_hsrr1); break; } - if(spr==chud_970_hdec) { retval = mfspr64(val_p, chud_970_hdec); break; } - if(spr==chud_970_trig0) { retval = mfspr64(val_p, chud_970_trig0); break; } - if(spr==chud_970_trig1) { retval = mfspr64(val_p, chud_970_trig1); break; } - if(spr==chud_970_trig2) { retval = mfspr64(val_p, chud_970_trig2); break; } - if(spr==chud_970_scomc) { retval = mfspr64(val_p, chud_970_scomc); break; } - if(spr==chud_970_scomd) { retval = mfspr64(val_p, chud_970_scomd); break; } - } - - /* we only get here if none of the above cases qualify */ - *val_p = 0xFFFFFFFFFFFFFFFFLL; - retval = KERN_INVALID_ARGUMENT; - } while(0); - - chudxnu_set_interrupts_enabled(oldlevel); /* enable interrupts */ - - if(cpu>=0) { // cpu<0 means don't bind - chudxnu_unbind_thread(current_thread(), 0); - } - - return retval; -} - -__private_extern__ -kern_return_t chudxnu_write_spr(int cpu, int spr, uint32_t val) -{ - kern_return_t retval = KERN_SUCCESS; - boolean_t oldlevel; - - /* bind to requested CPU */ - if(cpu>=0 && !(ml_at_interrupt_context() && cpu_number() == cpu)) { // cpu<0 means don't bind - if(chudxnu_bind_thread(current_thread(), cpu, 0)!=KERN_SUCCESS) { - return KERN_INVALID_ARGUMENT; - } - } - - oldlevel = chudxnu_set_interrupts_enabled(FALSE); /* disable interrupts */ - - do { - /* PPC SPRs - 32-bit and 64-bit implementations */ - if(spr==chud_ppc_srr0) { mtspr(chud_ppc_srr0, val); break; } - if(spr==chud_ppc_srr1) { mtspr(chud_ppc_srr1, val); break; 
} - if(spr==chud_ppc_dsisr) { mtspr(chud_ppc_dsisr, val); break; } - if(spr==chud_ppc_dar) { mtspr(chud_ppc_dar, val); break; } - if(spr==chud_ppc_dec) { mtspr(chud_ppc_dec, val); break; } - if(spr==chud_ppc_sdr1) { mtspr(chud_ppc_sdr1, val); break; } - if(spr==chud_ppc_sprg0) { mtspr(chud_ppc_sprg0, val); break; } - if(spr==chud_ppc_sprg1) { mtspr(chud_ppc_sprg1, val); break; } - if(spr==chud_ppc_sprg2) { mtspr(chud_ppc_sprg2, val); break; } - if(spr==chud_ppc_sprg3) { mtspr(chud_ppc_sprg3, val); break; } - if(spr==chud_ppc_ear) { mtspr(chud_ppc_ear, val); break; } - if(spr==chud_ppc_tbl) { mtspr(284, val); break; } /* timebase consists of read registers and write registers */ - if(spr==chud_ppc_tbu) { mtspr(285, val); break; } - if(spr==chud_ppc_pvr) { mtspr(chud_ppc_pvr, val); break; } - if(spr==chud_ppc_ibat0u) { mtspr(chud_ppc_ibat0u, val); break; } - if(spr==chud_ppc_ibat0l) { mtspr(chud_ppc_ibat0l, val); break; } - if(spr==chud_ppc_ibat1u) { mtspr(chud_ppc_ibat1u, val); break; } - if(spr==chud_ppc_ibat1l) { mtspr(chud_ppc_ibat1l, val); break; } - if(spr==chud_ppc_ibat2u) { mtspr(chud_ppc_ibat2u, val); break; } - if(spr==chud_ppc_ibat2l) { mtspr(chud_ppc_ibat2l, val); break; } - if(spr==chud_ppc_ibat3u) { mtspr(chud_ppc_ibat3u, val); break; } - if(spr==chud_ppc_ibat3l) { mtspr(chud_ppc_ibat3l, val); break; } - if(spr==chud_ppc_dbat0u) { mtspr(chud_ppc_dbat0u, val); break; } - if(spr==chud_ppc_dbat0l) { mtspr(chud_ppc_dbat0l, val); break; } - if(spr==chud_ppc_dbat1u) { mtspr(chud_ppc_dbat1u, val); break; } - if(spr==chud_ppc_dbat1l) { mtspr(chud_ppc_dbat1l, val); break; } - if(spr==chud_ppc_dbat2u) { mtspr(chud_ppc_dbat2u, val); break; } - if(spr==chud_ppc_dbat2l) { mtspr(chud_ppc_dbat2l, val); break; } - if(spr==chud_ppc_dbat3u) { mtspr(chud_ppc_dbat3u, val); break; } - if(spr==chud_ppc_dbat3l) { mtspr(chud_ppc_dbat3l, val); break; } - if(spr==chud_ppc_dabr) { mtspr(chud_ppc_dabr, val); break; } - if(spr==chud_ppc_msr) { /* this is the MSR for the calling 
process */ - struct ppc_thread_state64 state; - mach_msg_type_number_t count = PPC_THREAD_STATE64_COUNT; - kern_return_t kr; - kr = chudxnu_thread_get_state(current_thread(), PPC_THREAD_STATE64, (thread_state_t)&state, &count, TRUE /* user only */); - if(KERN_SUCCESS==kr) { - state.srr1 = val; - kr = chudxnu_thread_set_state(current_thread(), PPC_THREAD_STATE64, (thread_state_t)&state, count, TRUE /* user only */); - if(KERN_SUCCESS!=kr) { - retval = KERN_FAILURE; - } - } else { - retval = KERN_FAILURE; - } - break; - } - - /* PPC SPRs - 32-bit implementations */ - if(spr==chud_ppc32_sr0) { mtsr(0, val); break; } - if(spr==chud_ppc32_sr1) { mtsr(1, val); break; } - if(spr==chud_ppc32_sr2) { mtsr(2, val); break; } - if(spr==chud_ppc32_sr3) { mtsr(3, val); break; } - if(spr==chud_ppc32_sr4) { mtsr(4, val); break; } - if(spr==chud_ppc32_sr5) { mtsr(5, val); break; } - if(spr==chud_ppc32_sr6) { mtsr(6, val); break; } - if(spr==chud_ppc32_sr7) { mtsr(7, val); break; } - if(spr==chud_ppc32_sr8) { mtsr(8, val); break; } - if(spr==chud_ppc32_sr9) { mtsr(9, val); break; } - if(spr==chud_ppc32_sr10) { mtsr(10, val); break; } - if(spr==chud_ppc32_sr11) { mtsr(11, val); break; } - if(spr==chud_ppc32_sr12) { mtsr(12, val); break; } - if(spr==chud_ppc32_sr13) { mtsr(13, val); break; } - if(spr==chud_ppc32_sr14) { mtsr(14, val); break; } - if(spr==chud_ppc32_sr15) { mtsr(15, val); break; } - - /* Implementation Specific SPRs */ - if(cpu_subtype()==CPU_SUBTYPE_POWERPC_750) { - if(spr==chud_750_mmcr0) { mtspr(chud_750_mmcr0, val); break; } - if(spr==chud_750_pmc1) { mtspr(chud_750_pmc1, val); break; } - if(spr==chud_750_pmc2) { mtspr(chud_750_pmc2, val); break; } - if(spr==chud_750_sia) { mtspr(chud_750_sia, val); break; } - if(spr==chud_750_mmcr1) { mtspr(chud_750_mmcr1, val); break; } - if(spr==chud_750_pmc3) { mtspr(chud_750_pmc3, val); break; } - if(spr==chud_750_pmc4) { mtspr(chud_750_pmc4, val); break; } - if(spr==chud_750_iabr) { mtspr(chud_750_iabr, val); break; } - 
if(spr==chud_750_ictc) { mtspr(chud_750_ictc, val); break; } - if(spr==chud_750_thrm1) { mtspr(chud_750_thrm1, val); break; } - if(spr==chud_750_thrm2) { mtspr(chud_750_thrm2, val); break; } - if(spr==chud_750_thrm3) { mtspr(chud_750_thrm3, val); break; } - if(spr==chud_750_l2cr) { - retval = chudxnu_set_shadowed_spr(cpu, spr, val); - break; - } - if(spr==chud_750_hid0) { - retval = chudxnu_set_shadowed_spr(cpu, spr, val); - break; - } - if(spr==chud_750_hid1) { - retval = chudxnu_set_shadowed_spr(cpu, spr, val); - break; - } - - // 750FX only - if(spr==chud_750fx_ibat4u) { mtspr(chud_750fx_ibat4u, val); break; } - if(spr==chud_750fx_ibat4l) { mtspr(chud_750fx_ibat4l, val); break; } - if(spr==chud_750fx_ibat5u) { mtspr(chud_750fx_ibat5u, val); break; } - if(spr==chud_750fx_ibat5l) { mtspr(chud_750fx_ibat5l, val); break; } - if(spr==chud_750fx_ibat6u) { mtspr(chud_750fx_ibat6u, val); break; } - if(spr==chud_750fx_ibat6l) { mtspr(chud_750fx_ibat6l, val); break; } - if(spr==chud_750fx_ibat7u) { mtspr(chud_750fx_ibat7u, val); break; } - if(spr==chud_750fx_ibat7l) { mtspr(chud_750fx_ibat7l, val); break; } - if(spr==chud_750fx_dbat4u) { mtspr(chud_750fx_dbat4u, val); break; } - if(spr==chud_750fx_dbat4l) { mtspr(chud_750fx_dbat4l, val); break; } - if(spr==chud_750fx_dbat5u) { mtspr(chud_750fx_dbat5u, val); break; } - if(spr==chud_750fx_dbat5l) { mtspr(chud_750fx_dbat5l, val); break; } - if(spr==chud_750fx_dbat6u) { mtspr(chud_750fx_dbat6u, val); break; } - if(spr==chud_750fx_dbat6l) { mtspr(chud_750fx_dbat6l, val); break; } - if(spr==chud_750fx_dbat7u) { mtspr(chud_750fx_dbat7u, val); break; } - if(spr==chud_750fx_dbat7l) { mtspr(chud_750fx_dbat7l, val); break; } - - // 750FX >= DDR2.x - if(spr==chud_750fx_hid2) { mtspr(chud_750fx_hid2, val); break; } - } - - if(cpu_subtype()==CPU_SUBTYPE_POWERPC_7400) { - if(spr==chud_7400_mmcr2) { mtspr(chud_7400_mmcr2, val); break; } - if(spr==chud_7400_bamr) { mtspr(chud_7400_bamr, val); break; } - if(spr==chud_7400_mmcr0) { 
mtspr(chud_7400_mmcr0, val); break; } - if(spr==chud_7400_pmc1) { mtspr(chud_7400_pmc1, val); break; } - if(spr==chud_7400_pmc2) { mtspr(chud_7400_pmc2, val); break; } - if(spr==chud_7400_siar) { mtspr(chud_7400_siar, val); break; } - if(spr==chud_7400_mmcr1) { mtspr(chud_7400_mmcr1, val); break; } - if(spr==chud_7400_pmc3) { mtspr(chud_7400_pmc3, val); break; } - if(spr==chud_7400_pmc4) { mtspr(chud_7400_pmc4, val); break; } - if(spr==chud_7400_iabr) { mtspr(chud_7400_iabr, val); break; } - if(spr==chud_7400_ictc) { mtspr(chud_7400_ictc, val); break; } - if(spr==chud_7400_thrm1) { mtspr(chud_7400_thrm1, val); break; } - if(spr==chud_7400_thrm2) { mtspr(chud_7400_thrm2, val); break; } - if(spr==chud_7400_thrm3) { mtspr(chud_7400_thrm3, val); break; } - if(spr==chud_7400_pir) { mtspr(chud_7400_pir, val); break; } - - if(spr==chud_7400_l2cr) { - retval = chudxnu_set_shadowed_spr(cpu, spr, val); - break; - } - if(spr==chud_7400_hid0) { - retval = chudxnu_set_shadowed_spr(cpu, spr, val); - break; - } - if(spr==chud_7400_hid1) { - retval = chudxnu_set_shadowed_spr(cpu, spr, val); - break; - } - if(spr==chud_7400_msscr0) { - retval = chudxnu_set_shadowed_spr(cpu, spr, val); - break; - } - if(spr==chud_7400_msscr1) { /* private */ - retval = chudxnu_set_shadowed_spr(cpu, spr, val); - break; - } - - // 7410 only - if(spr==chud_7410_l2pmcr) { mtspr(chud_7410_l2pmcr, val); break; } - } - - if(cpu_subtype()==CPU_SUBTYPE_POWERPC_7450) { - if(spr==chud_7450_mmcr2) { mtspr(chud_7450_mmcr2, val); break; } - if(spr==chud_7450_pmc5) { mtspr(chud_7450_pmc5, val); break; } - if(spr==chud_7450_pmc6) { mtspr(chud_7450_pmc6, val); break; } - if(spr==chud_7450_bamr) { mtspr(chud_7450_bamr, val); break; } - if(spr==chud_7450_mmcr0) { mtspr(chud_7450_mmcr0, val); break; } - if(spr==chud_7450_pmc1) { mtspr(chud_7450_pmc1, val); break; } - if(spr==chud_7450_pmc2) { mtspr(chud_7450_pmc2, val); break; } - if(spr==chud_7450_siar) { mtspr(chud_7450_siar, val); break; } - if(spr==chud_7450_mmcr1) 
{ mtspr(chud_7450_mmcr1, val); break; } - if(spr==chud_7450_pmc3) { mtspr(chud_7450_pmc3, val); break; } - if(spr==chud_7450_pmc4) { mtspr(chud_7450_pmc4, val); break; } - if(spr==chud_7450_tlbmiss) { mtspr(chud_7450_tlbmiss, val); break; } - if(spr==chud_7450_ptehi) { mtspr(chud_7450_ptehi, val); break; } - if(spr==chud_7450_ptelo) { mtspr(chud_7450_ptelo, val); break; } - if(spr==chud_7450_l3pm) { mtspr(chud_7450_l3pm, val); break; } - if(spr==chud_7450_iabr) { mtspr(chud_7450_iabr, val); break; } - if(spr==chud_7450_ldstdb) { mtspr(chud_7450_ldstdb, val); break; } - if(spr==chud_7450_ictc) { mtspr(chud_7450_ictc, val); break; } - if(spr==chud_7450_thrm1) { mtspr(chud_7450_thrm1, val); break; } - if(spr==chud_7450_thrm2) { mtspr(chud_7450_thrm2, val); break; } - if(spr==chud_7450_thrm3) { mtspr(chud_7450_thrm3, val); break; } - if(spr==chud_7450_pir) { mtspr(chud_7450_pir, val); break; } - - if(spr==chud_7450_l2cr) { - retval = chudxnu_set_shadowed_spr(cpu, spr, val); - break; - } - - if(spr==chud_7450_l3cr) { - retval = chudxnu_set_shadowed_spr(cpu, spr, val); - break; - } - if(spr==chud_7450_ldstcr) { - retval = chudxnu_set_shadowed_spr(cpu, spr, val); - break; - } - if(spr==chud_7450_hid0) { - retval = chudxnu_set_shadowed_spr(cpu, spr, val); - break; - } - if(spr==chud_7450_hid1) { - retval = chudxnu_set_shadowed_spr(cpu, spr, val); - break; - } - if(spr==chud_7450_msscr0) { - retval = chudxnu_set_shadowed_spr(cpu, spr, val); - break; - } - if(spr==chud_7450_msssr0) { - retval = chudxnu_set_shadowed_spr(cpu, spr, val); - break; - } - if(spr==chud_7450_ictrl) { - retval = chudxnu_set_shadowed_spr(cpu, spr, val); - break; - } - - // 7455/7457 only - if(spr==chud_7455_sprg4) { mtspr(chud_7455_sprg4, val); break; } - if(spr==chud_7455_sprg5) { mtspr(chud_7455_sprg5, val); break; } - if(spr==chud_7455_sprg6) { mtspr(chud_7455_sprg6, val); break; } - if(spr==chud_7455_sprg7) { mtspr(chud_7455_sprg7, val); break; } - if(spr==chud_7455_ibat4u) { 
mtspr(chud_7455_ibat4u, val); break; } - if(spr==chud_7455_ibat4l) { mtspr(chud_7455_ibat4l, val); break; } - if(spr==chud_7455_ibat5u) { mtspr(chud_7455_ibat5u, val); break; } - if(spr==chud_7455_ibat5l) { mtspr(chud_7455_ibat5l, val); break; } - if(spr==chud_7455_ibat6u) { mtspr(chud_7455_ibat6u, val); break; } - if(spr==chud_7455_ibat6l) { mtspr(chud_7455_ibat6l, val); break; } - if(spr==chud_7455_ibat7u) { mtspr(chud_7455_ibat7u, val); break; } - if(spr==chud_7455_ibat7l) { mtspr(chud_7455_ibat7l, val); break; } - if(spr==chud_7455_dbat4u) { mtspr(chud_7455_dbat4u, val); break; } - if(spr==chud_7455_dbat4l) { mtspr(chud_7455_dbat4l, val); break; } - if(spr==chud_7455_dbat5u) { mtspr(chud_7455_dbat5u, val); break; } - if(spr==chud_7455_dbat5l) { mtspr(chud_7455_dbat5l, val); break; } - if(spr==chud_7455_dbat6u) { mtspr(chud_7455_dbat6u, val); break; } - if(spr==chud_7455_dbat6l) { mtspr(chud_7455_dbat6l, val); break; } - if(spr==chud_7455_dbat7u) { mtspr(chud_7455_dbat7u, val); break; } - if(spr==chud_7455_dbat7l) { mtspr(chud_7455_dbat7l, val); break; } - } - - if(cpu_subtype()==CPU_SUBTYPE_POWERPC_970) { - if(spr==chud_970_pir) { mtspr(chud_970_pir, val); break; } - if(spr==chud_970_pmc1) { mtspr(chud_970_pmc1, val); break; } - if(spr==chud_970_pmc2) { mtspr(chud_970_pmc2, val); break; } - if(spr==chud_970_pmc3) { mtspr(chud_970_pmc3, val); break; } - if(spr==chud_970_pmc4) { mtspr(chud_970_pmc4, val); break; } - if(spr==chud_970_pmc5) { mtspr(chud_970_pmc5, val); break; } - if(spr==chud_970_pmc6) { mtspr(chud_970_pmc6, val); break; } - if(spr==chud_970_pmc7) { mtspr(chud_970_pmc7, val); break; } - if(spr==chud_970_pmc8) { mtspr(chud_970_pmc8, val); break; } - if(spr==chud_970_hdec) { mtspr(chud_970_hdec, val); break; } - } - - /* we only get here if none of the above cases qualify */ - retval = KERN_INVALID_ARGUMENT; - } while(0); - - chudxnu_set_interrupts_enabled(oldlevel); /* re-enable interrupts */ - - if(cpu>=0) { // cpu<0 means don't bind - 
chudxnu_unbind_thread(current_thread(), 0); - } - - return retval; -} - -__private_extern__ -kern_return_t chudxnu_write_spr64(int cpu, int spr, uint64_t val) -{ - kern_return_t retval = KERN_SUCCESS; - boolean_t oldlevel; - uint64_t *val_p = &val; - - /* bind to requested CPU */ - if(cpu>=0 && !(ml_at_interrupt_context() && cpu_number() == cpu)) { // cpu<0 means don't bind - if(chudxnu_bind_thread(current_thread(), cpu, 0)!=KERN_SUCCESS) { - return KERN_INVALID_ARGUMENT; - } - } - - oldlevel = ml_set_interrupts_enabled(FALSE); /* disable interrupts */ - - do { - /* PPC SPRs - 32-bit and 64-bit implementations */ - if(spr==chud_ppc_srr0) { retval = mtspr64(chud_ppc_srr0, val_p); break; } - if(spr==chud_ppc_srr1) { retval = mtspr64(chud_ppc_srr1, val_p); break; } - if(spr==chud_ppc_dar) { retval = mtspr64(chud_ppc_dar, val_p); break; } - if(spr==chud_ppc_dsisr) { retval = mtspr64(chud_ppc_dsisr, val_p); break; } - if(spr==chud_ppc_sdr1) { retval = mtspr64(chud_ppc_sdr1, val_p); break; } - if(spr==chud_ppc_sprg0) { retval = mtspr64(chud_ppc_sprg0, val_p); break; } - if(spr==chud_ppc_sprg1) { retval = mtspr64(chud_ppc_sprg1, val_p); break; } - if(spr==chud_ppc_sprg2) { retval = mtspr64(chud_ppc_sprg2, val_p); break; } - if(spr==chud_ppc_sprg3) { retval = mtspr64(chud_ppc_sprg3, val_p); break; } - if(spr==chud_ppc_dabr) { retval = mtspr64(chud_ppc_dabr, val_p); break; } - if(spr==chud_ppc_msr) { /* this is the MSR for the calling process */ - struct ppc_thread_state64 state; - mach_msg_type_number_t count = PPC_THREAD_STATE64_COUNT; - kern_return_t kr; - kr = chudxnu_thread_get_state(current_thread(), PPC_THREAD_STATE64, (thread_state_t)&state, &count, TRUE /* user only */); - if(KERN_SUCCESS==kr) { - state.srr1 = val; - kr = chudxnu_thread_set_state(current_thread(), PPC_THREAD_STATE64, (thread_state_t)&state, count, TRUE /* user only */); - if(KERN_SUCCESS!=kr) { - retval = KERN_FAILURE; - } - } else { - retval = KERN_FAILURE; - } - break; - } - - /* PPC SPRs - 
64-bit implementations */ - if(spr==chud_ppc64_asr) { retval = mtspr64(chud_ppc64_asr, val_p); break; } - if(spr==chud_ppc64_accr) { retval = mtspr64(chud_ppc64_accr, val_p); break; } - if(spr==chud_ppc64_ctrl) { retval = mtspr64(chud_ppc64_ctrl, val_p); break; } - - /* Implementation Specific SPRs */ - if(cpu_subtype()==CPU_SUBTYPE_POWERPC_970) { - if(spr==chud_970_hid0) { retval = mtspr64(chud_970_hid0, val_p); break; } - if(spr==chud_970_hid1) { retval = mtspr64(chud_970_hid1, val_p); break; } - if(spr==chud_970_hid4) { retval = mtspr64(chud_970_hid4, val_p); break; } - if(spr==chud_970_hid5) { retval = mtspr64(chud_970_hid5, val_p); break; } - if(spr==chud_970_mmcr0) { retval = mtspr64(chud_970_mmcr0, val_p); break; } - if(spr==chud_970_mmcr1) { retval = mtspr64(chud_970_mmcr1, val_p); break; } - if(spr==chud_970_mmcra) { retval = mtspr64(chud_970_mmcra, val_p); break; } - if(spr==chud_970_siar) { retval = mtspr64(chud_970_siar, val_p); break; } - if(spr==chud_970_sdar) { retval = mtspr64(chud_970_sdar, val_p); break; } - if(spr==chud_970_imc) { retval = mtspr64(chud_970_imc, val_p); break; } - - if(spr==chud_970_rmor) { retval = mtspr64(chud_970_rmor, val_p); break; } - if(spr==chud_970_hrmor) { retval = mtspr64(chud_970_hrmor, val_p); break; } - if(spr==chud_970_hior) { retval = mtspr64(chud_970_hior, val_p); break; } - if(spr==chud_970_lpidr) { retval = mtspr64(chud_970_lpidr, val_p); break; } - if(spr==chud_970_lpcr) { retval = mtspr64(chud_970_lpcr, val_p); break; } - if(spr==chud_970_dabrx) { retval = mtspr64(chud_970_dabrx, val_p); break; } - - if(spr==chud_970_hsprg0) { retval = mtspr64(chud_970_hsprg0, val_p); break; } - if(spr==chud_970_hsprg1) { retval = mtspr64(chud_970_hsprg1, val_p); break; } - if(spr==chud_970_hsrr0) { retval = mtspr64(chud_970_hsrr0, val_p); break; } - if(spr==chud_970_hsrr1) { retval = mtspr64(chud_970_hsrr1, val_p); break; } - if(spr==chud_970_hdec) { retval = mtspr64(chud_970_hdec, val_p); break; } - if(spr==chud_970_trig0) { 
retval = mtspr64(chud_970_trig0, val_p); break; } - if(spr==chud_970_trig1) { retval = mtspr64(chud_970_trig1, val_p); break; } - if(spr==chud_970_trig2) { retval = mtspr64(chud_970_trig2, val_p); break; } - if(spr==chud_970_scomc) { retval = mtspr64(chud_970_scomc, val_p); break; } - if(spr==chud_970_scomd) { retval = mtspr64(chud_970_scomd, val_p); break; } - - if(spr==chud_970_hid0) { - retval = chudxnu_set_shadowed_spr64(cpu, spr, val); - break; - } - - if(spr==chud_970_hid1) { - retval = chudxnu_set_shadowed_spr64(cpu, spr, val); - break; - } - - if(spr==chud_970_hid4) { - retval = chudxnu_set_shadowed_spr64(cpu, spr, val); - break; - } - - if(spr==chud_970_hid5) { - retval = chudxnu_set_shadowed_spr64(cpu, spr, val); - break; - } - - } - - /* we only get here if none of the above cases qualify */ - retval = KERN_INVALID_ARGUMENT; - } while(0); - - chudxnu_set_interrupts_enabled(oldlevel); /* re-enable interrupts */ - - if(cpu>=0) { // cpu<0 means don't bind - chudxnu_unbind_thread(current_thread(), 0); - } - - return retval; -} - -#if 0 -#pragma mark **** perfmon facility **** -#endif - -__private_extern__ -kern_return_t chudxnu_perfmon_acquire_facility(task_t task) -{ - return perfmon_acquire_facility(task); -} - -__private_extern__ -kern_return_t chudxnu_perfmon_release_facility(task_t task) -{ - return perfmon_release_facility(task); -} - -#if 0 -#pragma mark **** rupt counters **** -#endif - -__private_extern__ -kern_return_t chudxnu_get_cpu_interrupt_counters(int cpu, interrupt_counters_t *rupts) -{ - if(cpu<0 || cpu>=chudxnu_phys_cpu_count()) { // check sanity of cpu argument - return KERN_FAILURE; - } - - if(rupts) { - boolean_t oldlevel = ml_set_interrupts_enabled(FALSE); - struct per_proc_info *per_proc; - - per_proc = PerProcTable[cpu].ppe_vaddr; - rupts->hwResets = per_proc->hwCtr.hwResets; - rupts->hwMachineChecks = per_proc->hwCtr.hwMachineChecks; - rupts->hwDSIs = per_proc->hwCtr.hwDSIs; - rupts->hwISIs = per_proc->hwCtr.hwISIs; - 
rupts->hwExternals = per_proc->hwCtr.hwExternals; - rupts->hwAlignments = per_proc->hwCtr.hwAlignments; - rupts->hwPrograms = per_proc->hwCtr.hwPrograms; - rupts->hwFloatPointUnavailable = per_proc->hwCtr.hwFloatPointUnavailable; - rupts->hwDecrementers = per_proc->hwCtr.hwDecrementers; - rupts->hwIOErrors = per_proc->hwCtr.hwIOErrors; - rupts->hwSystemCalls = per_proc->hwCtr.hwSystemCalls; - rupts->hwTraces = per_proc->hwCtr.hwTraces; - rupts->hwFloatingPointAssists = per_proc->hwCtr.hwFloatingPointAssists; - rupts->hwPerformanceMonitors = per_proc->hwCtr.hwPerformanceMonitors; - rupts->hwAltivecs = per_proc->hwCtr.hwAltivecs; - rupts->hwInstBreakpoints = per_proc->hwCtr.hwInstBreakpoints; - rupts->hwSystemManagements = per_proc->hwCtr.hwSystemManagements; - rupts->hwAltivecAssists = per_proc->hwCtr.hwAltivecAssists; - rupts->hwThermal = per_proc->hwCtr.hwThermal; - rupts->hwSoftPatches = per_proc->hwCtr.hwSoftPatches; - rupts->hwMaintenances = per_proc->hwCtr.hwMaintenances; - rupts->hwInstrumentations = per_proc->hwCtr.hwInstrumentations; - - ml_set_interrupts_enabled(oldlevel); - return KERN_SUCCESS; - } else { - return KERN_FAILURE; - } -} - -__private_extern__ -kern_return_t chudxnu_clear_cpu_interrupt_counters(int cpu) -{ - if(cpu<0 || cpu>=chudxnu_phys_cpu_count()) { // check sanity of cpu argument - return KERN_FAILURE; - } - - bzero((char *)&(PerProcTable[cpu].ppe_vaddr->hwCtr), sizeof(struct hwCtrs)); - return KERN_SUCCESS; -} - -#if 0 -#pragma mark *** deprecated *** -#endif - -//DEPRECATED -__private_extern__ -void chudxnu_flush_caches(void) -{ - cacheInit(); -} - -//DEPRECATED -__private_extern__ -void chudxnu_enable_caches(boolean_t enable) -{ - if(!enable) { - cacheInit(); - cacheDisable(); - } else { - cacheInit(); - } -} diff --git a/osfmk/chud/ppc/chud_osfmk_callback_ppc.c b/osfmk/chud/ppc/chud_osfmk_callback_ppc.c deleted file mode 100644 index 3077f07c3..000000000 --- a/osfmk/chud/ppc/chud_osfmk_callback_ppc.c +++ /dev/null @@ -1,549 +0,0 @@ 
-/* - * Copyright (c) 2003-2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include - -#include -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include - -#include -#include - -__private_extern__ -void chudxnu_cancel_all_callbacks(void) -{ - chudxnu_cpu_timer_callback_cancel_all(); - chudxnu_trap_callback_cancel(); - chudxnu_interrupt_callback_cancel(); - chudxnu_perfmon_ast_callback_cancel(); - chudxnu_cpusig_callback_cancel(); - chudxnu_kdebug_callback_cancel(); - chudxnu_syscall_callback_cancel(); - chudxnu_dtrace_callback_cancel(); -} - -static chudcpu_data_t chudcpu_boot_cpu; - -void *chudxnu_per_proc_alloc(boolean_t boot_processor) -{ - chudcpu_data_t *chud_proc_info; - - if (boot_processor) { - chud_proc_info = &chudcpu_boot_cpu; - } else { - chud_proc_info = (chudcpu_data_t *)kalloc(sizeof(chudcpu_data_t)); - if (chud_proc_info == (chudcpu_data_t *)NULL) { - return (void *)NULL; - } - } - bzero((char *)chud_proc_info, sizeof(chudcpu_data_t)); - chud_proc_info->t_deadline = 0xFFFFFFFFFFFFFFFFULL; - return (void *)chud_proc_info; -} - -void chudxnu_per_proc_free(void *per_proc_chud) -{ - if (per_proc_chud == (void *)&chudcpu_boot_cpu) { - return; - } else { - kfree(per_proc_chud,sizeof(chudcpu_data_t)); - } -} - -static void -chudxnu_private_cpu_timer_callback(__unused timer_call_param_t param0, - __unused timer_call_param_t param1) -{ - chudcpu_data_t *chud_proc_info; - boolean_t oldlevel; - struct ppc_thread_state64 state; - mach_msg_type_number_t count; - chudxnu_cpu_timer_callback_func_t fn = NULL; - - oldlevel = ml_set_interrupts_enabled(FALSE); - chud_proc_info = (chudcpu_data_t *)(getPerProc()->pp_chud); - - count = PPC_THREAD_STATE64_COUNT; - if(chudxnu_thread_get_state(current_thread(), PPC_THREAD_STATE64, (thread_state_t)&state, &count, FALSE)==KERN_SUCCESS) { - fn = chud_proc_info->cpu_timer_callback_fn; - if(fn) { - (fn)(PPC_THREAD_STATE64, (thread_state_t)&state, count); - } - } - - 
ml_set_interrupts_enabled(oldlevel); -} - -__private_extern__ -kern_return_t chudxnu_cpu_timer_callback_enter(chudxnu_cpu_timer_callback_func_t func, uint32_t time, uint32_t units) -{ - chudcpu_data_t *chud_proc_info; - boolean_t oldlevel; - - oldlevel = ml_set_interrupts_enabled(FALSE); - chud_proc_info = (chudcpu_data_t *)(getPerProc()->pp_chud); - - timer_call_cancel(&(chud_proc_info->cpu_timer_call)); // cancel any existing callback for this cpu - - chud_proc_info->cpu_timer_callback_fn = func; - - clock_interval_to_deadline(time, units, &(chud_proc_info->t_deadline)); - timer_call_setup(&(chud_proc_info->cpu_timer_call), chudxnu_private_cpu_timer_callback, NULL); - timer_call_enter(&(chud_proc_info->cpu_timer_call), chud_proc_info->t_deadline); - - ml_set_interrupts_enabled(oldlevel); - return KERN_SUCCESS; -} - -__private_extern__ -kern_return_t chudxnu_cpu_timer_callback_cancel(void) -{ - chudcpu_data_t *chud_proc_info; - boolean_t oldlevel; - - oldlevel = ml_set_interrupts_enabled(FALSE); - chud_proc_info = (chudcpu_data_t *)(getPerProc()->pp_chud); - - timer_call_cancel(&(chud_proc_info->cpu_timer_call)); - chud_proc_info->t_deadline = chud_proc_info->t_deadline | ~(chud_proc_info->t_deadline); // set to max value - chud_proc_info->cpu_timer_callback_fn = NULL; - - ml_set_interrupts_enabled(oldlevel); - return KERN_SUCCESS; -} - -__private_extern__ -kern_return_t chudxnu_cpu_timer_callback_cancel_all(void) -{ - unsigned int cpu; - chudcpu_data_t *chud_proc_info; - - for(cpu=0; cpupp_chud == 0)) - continue; - chud_proc_info = (chudcpu_data_t *)PerProcTable[cpu].ppe_vaddr->pp_chud; - timer_call_cancel(&(chud_proc_info->cpu_timer_call)); - chud_proc_info->t_deadline = chud_proc_info->t_deadline | ~(chud_proc_info->t_deadline); // set to max value - chud_proc_info->cpu_timer_callback_fn = NULL; - } - return KERN_SUCCESS; -} - -#if 0 -#pragma mark **** trap **** -#endif -static kern_return_t chud_null_trap(uint32_t trapentry, thread_flavor_t flavor, - 
thread_state_t tstate, mach_msg_type_number_t count); -static chudxnu_trap_callback_func_t trap_callback_fn = chud_null_trap; - -static kern_return_t chud_null_trap(uint32_t trapentry __unused, thread_flavor_t flavor __unused, - thread_state_t tstate __unused, mach_msg_type_number_t count __unused) { - return KERN_FAILURE; -} - - -#define TRAP_ENTRY_POINT(t) ((t==T_RESET) ? 0x100 : \ - (t==T_MACHINE_CHECK) ? 0x200 : \ - (t==T_DATA_ACCESS) ? 0x300 : \ - (t==T_DATA_SEGMENT) ? 0x380 : \ - (t==T_INSTRUCTION_ACCESS) ? 0x400 : \ - (t==T_INSTRUCTION_SEGMENT) ? 0x480 : \ - (t==T_INTERRUPT) ? 0x500 : \ - (t==T_ALIGNMENT) ? 0x600 : \ - (t==T_PROGRAM) ? 0x700 : \ - (t==T_FP_UNAVAILABLE) ? 0x800 : \ - (t==T_DECREMENTER) ? 0x900 : \ - (t==T_IO_ERROR) ? 0xa00 : \ - (t==T_RESERVED) ? 0xb00 : \ - (t==T_SYSTEM_CALL) ? 0xc00 : \ - (t==T_TRACE) ? 0xd00 : \ - (t==T_FP_ASSIST) ? 0xe00 : \ - (t==T_PERF_MON) ? 0xf00 : \ - (t==T_VMX) ? 0xf20 : \ - (t==T_INVALID_EXCP0) ? 0x1000 : \ - (t==T_INVALID_EXCP1) ? 0x1100 : \ - (t==T_INVALID_EXCP2) ? 0x1200 : \ - (t==T_INSTRUCTION_BKPT) ? 0x1300 : \ - (t==T_SYSTEM_MANAGEMENT) ? 0x1400 : \ - (t==T_SOFT_PATCH) ? 0x1500 : \ - (t==T_ALTIVEC_ASSIST) ? 0x1600 : \ - (t==T_THERMAL) ? 0x1700 : \ - (t==T_ARCHDEP0) ? 0x1800 : \ - (t==T_INSTRUMENTATION) ? 
0x2000 : \ - 0x0) - -static kern_return_t -chudxnu_private_trap_callback(int trapno, struct savearea *ssp, - __unused unsigned int dsisr, - __unused addr64_t dar) -{ - boolean_t oldlevel = ml_set_interrupts_enabled(FALSE); - kern_return_t retval = KERN_FAILURE; - uint32_t trapentry = TRAP_ENTRY_POINT(trapno); - chudxnu_trap_callback_func_t fn = trap_callback_fn; - - if(trapentry!=0x0) { - if(fn) { - struct ppc_thread_state64 state; - mach_msg_type_number_t count = PPC_THREAD_STATE64_COUNT; - chudxnu_copy_savearea_to_threadstate(PPC_THREAD_STATE64, (thread_state_t)&state, &count, ssp); - retval = (fn)(trapentry, PPC_THREAD_STATE64, (thread_state_t)&state, count); - } - } - - ml_set_interrupts_enabled(oldlevel); - - return retval; -} - -__private_extern__ kern_return_t -chudxnu_trap_callback_enter(chudxnu_trap_callback_func_t func) -{ - if(OSCompareAndSwapPtr(NULL, chudxnu_private_trap_callback, - (void * volatile *)&perfTrapHook)) { - - chudxnu_trap_callback_func_t old = trap_callback_fn; - while(!OSCompareAndSwapPtr(old, func, - (void * volatile *)&trap_callback_fn)) { - old = trap_callback_fn; - } - - return KERN_SUCCESS; - } - return KERN_FAILURE; -} - -__private_extern__ kern_return_t -chudxnu_trap_callback_cancel(void) -{ - if(OSCompareAndSwapPtr(chudxnu_private_trap_callback, NULL, - (void * volatile *)&perfTrapHook)) { - - chudxnu_trap_callback_func_t old = trap_callback_fn; - while(!OSCompareAndSwapPtr(old, chud_null_trap, - (void * volatile *)&trap_callback_fn)) { - old = trap_callback_fn; - } - - return KERN_SUCCESS; - } - return KERN_FAILURE; -} - -#if 0 -#pragma mark **** ast **** -#endif -static kern_return_t chud_null_ast(thread_flavor_t flavor, thread_state_t tstate, - mach_msg_type_number_t count); -static chudxnu_perfmon_ast_callback_func_t perfmon_ast_callback_fn = chud_null_ast; - -static kern_return_t chud_null_ast(thread_flavor_t flavor __unused, - thread_state_t tstate __unused, mach_msg_type_number_t count __unused) { - return KERN_FAILURE; -} 
- - -static kern_return_t -chudxnu_private_chud_ast_callback(__unused int trapno, - __unused struct savearea *ssp, - __unused unsigned int dsisr, - __unused addr64_t dar) -{ - boolean_t oldlevel = ml_set_interrupts_enabled(FALSE); - ast_t *myast = ast_pending(); - kern_return_t retval = KERN_FAILURE; - chudxnu_perfmon_ast_callback_func_t fn = perfmon_ast_callback_fn; - - if(*myast & AST_CHUD_URGENT) { - *myast &= ~(AST_CHUD_URGENT | AST_CHUD); - if((*myast & AST_PREEMPTION) != AST_PREEMPTION) *myast &= ~(AST_URGENT); - retval = KERN_SUCCESS; - } else if(*myast & AST_CHUD) { - *myast &= ~(AST_CHUD); - retval = KERN_SUCCESS; - } - - if(fn) { - struct ppc_thread_state64 state; - mach_msg_type_number_t count; - count = PPC_THREAD_STATE64_COUNT; - - if(chudxnu_thread_get_state(current_thread(), PPC_THREAD_STATE64, (thread_state_t)&state, &count, FALSE)==KERN_SUCCESS) { - (fn)(PPC_THREAD_STATE64, (thread_state_t)&state, count); - } - } - -#if 0 - // ASTs from ihandler go through thandler and are made to look like traps - // always handle AST_CHUD_URGENT if there's a callback - // only handle AST_CHUD if it's the only AST pending - if(perfmon_ast_callback_fn && ((*myast & AST_CHUD_URGENT) || ((*myast & AST_CHUD) && !(*myast & AST_URGENT)))) { - struct ppc_thread_state64 state; - mach_msg_type_number_t count = PPC_THREAD_STATE64_COUNT; - chudxnu_copy_savearea_to_threadstate(PPC_THREAD_STATE64, (thread_state_t)&state, &count, ssp); - if(*myast & AST_CHUD_URGENT) { - *myast &= ~(AST_CHUD_URGENT | AST_CHUD); - if((*myast & AST_PREEMPTION) != AST_PREEMPTION) *myast &= ~(AST_URGENT); - retval = KERN_SUCCESS; - } else if(*myast & AST_CHUD) { - *myast &= ~(AST_CHUD); - retval = KERN_SUCCESS; - } - (perfmon_ast_callback_fn)(PPC_THREAD_STATE64, (thread_state_t)&state, count); - } -#endif - - ml_set_interrupts_enabled(oldlevel); - return retval; -} - -__private_extern__ kern_return_t -chudxnu_perfmon_ast_callback_enter(chudxnu_perfmon_ast_callback_func_t func) -{ - 
if(OSCompareAndSwapPtr(NULL, chudxnu_private_chud_ast_callback, - (void * volatile *)&perfASTHook)) { - chudxnu_perfmon_ast_callback_func_t old = perfmon_ast_callback_fn; - - while(!OSCompareAndSwapPtr(old, func, - (void * volatile *)&perfmon_ast_callback_fn)) { - old = perfmon_ast_callback_fn; - } - - return KERN_SUCCESS; - } - return KERN_FAILURE; -} - -__private_extern__ kern_return_t -chudxnu_perfmon_ast_callback_cancel(void) -{ - if(OSCompareAndSwapPtr(chudxnu_private_chud_ast_callback, NULL, - (void * volatile *)&perfASTHook)) { - chudxnu_perfmon_ast_callback_func_t old = perfmon_ast_callback_fn; - - while(!OSCompareAndSwapPtr(old, chud_null_ast, - (void * volatile *)&perfmon_ast_callback_fn)) { - old = perfmon_ast_callback_fn; - } - - return KERN_SUCCESS; - } - return KERN_FAILURE; -} - -__private_extern__ -kern_return_t chudxnu_perfmon_ast_send_urgent(boolean_t urgent) -{ - boolean_t oldlevel = ml_set_interrupts_enabled(FALSE); - ast_t *myast = ast_pending(); - - if(urgent) { - *myast |= (AST_CHUD_URGENT | AST_URGENT); - } else { - *myast |= (AST_CHUD); - } - - ml_set_interrupts_enabled(oldlevel); - return KERN_SUCCESS; -} - -#if 0 -#pragma mark **** interrupt **** -#endif -static kern_return_t chud_null_int(uint32_t trapentry, thread_flavor_t flavor, - thread_state_t tstate, mach_msg_type_number_t count); -static chudxnu_interrupt_callback_func_t interrupt_callback_fn = chud_null_int; - -static kern_return_t chud_null_int(uint32_t trapentry __unused, thread_flavor_t flavor __unused, - thread_state_t tstate __unused, mach_msg_type_number_t count __unused) { - return KERN_FAILURE; -} - - -static kern_return_t -chudxnu_private_interrupt_callback(int trapno, struct savearea *ssp, - __unused unsigned int dsisr, - __unused addr64_t dar) -{ - chudxnu_interrupt_callback_func_t fn = interrupt_callback_fn; - - if(fn) { - struct ppc_thread_state64 state; - mach_msg_type_number_t count = PPC_THREAD_STATE64_COUNT; - 
chudxnu_copy_savearea_to_threadstate(PPC_THREAD_STATE64, (thread_state_t)&state, &count, ssp); - return (fn)(TRAP_ENTRY_POINT(trapno), PPC_THREAD_STATE64, (thread_state_t)&state, count); - } else { - return KERN_FAILURE; - } -} - -__private_extern__ -kern_return_t chudxnu_interrupt_callback_enter(chudxnu_interrupt_callback_func_t func) -{ - if(OSCompareAndSwapPtr(NULL, chudxnu_private_interrupt_callback, - (void * volatile *)&perfIntHook)) { - chudxnu_interrupt_callback_func_t old = interrupt_callback_fn; - - while(!OSCompareAndSwapPtr(old, func, - (void * volatile *)&interrupt_callback_fn)) { - old = interrupt_callback_fn; - } - - return KERN_SUCCESS; - } - return KERN_FAILURE; -} - -__private_extern__ -kern_return_t chudxnu_interrupt_callback_cancel(void) -{ - if(OSCompareAndSwapPtr(chudxnu_private_interrupt_callback, NULL, - (void * volatile *)&perfIntHook)) { - chudxnu_interrupt_callback_func_t old = interrupt_callback_fn; - - while(!OSCompareAndSwapPtr(old, chud_null_int, - (void * volatile *)&interrupt_callback_fn)) { - old = interrupt_callback_fn; - } - - return KERN_SUCCESS; - } - return KERN_FAILURE; -} - -#if 0 -#pragma mark **** cpu signal **** -#endif -static chudxnu_cpusig_callback_func_t cpusig_callback_fn = NULL; -extern perfCallback perfCpuSigHook; /* function hook into cpu_signal_handler() */ - -static kern_return_t -chudxnu_private_cpu_signal_handler(int request, struct savearea *ssp, - __unused unsigned int arg0, - __unused addr64_t arg1) -{ - chudxnu_cpusig_callback_func_t fn = cpusig_callback_fn; - - if(fn) { - struct ppc_thread_state64 state; - mach_msg_type_number_t count = PPC_THREAD_STATE64_COUNT; - chudxnu_copy_savearea_to_threadstate(PPC_THREAD_STATE64, (thread_state_t)&state, &count, ssp); - (fn)(request, PPC_THREAD_STATE64, (thread_state_t)&state, count); - } - return KERN_SUCCESS; // ignored -} - -__private_extern__ -kern_return_t chudxnu_cpusig_callback_enter(chudxnu_cpusig_callback_func_t func) -{ - if(OSCompareAndSwapPtr(NULL, 
chudxnu_private_cpu_signal_handler, - (void * volatile *)&perfCpuSigHook)) { - chudxnu_cpusig_callback_func_t old = cpusig_callback_fn; - - while(!OSCompareAndSwapPtr(old, func, - (void * volatile *)&cpusig_callback_fn)) { - old = cpusig_callback_fn; - } - - return KERN_SUCCESS; - } - return KERN_FAILURE; -} - -__private_extern__ -kern_return_t chudxnu_cpusig_callback_cancel(void) -{ - if(OSCompareAndSwapPtr(chudxnu_private_cpu_signal_handler, NULL, - (void * volatile *)&perfCpuSigHook)) { - chudxnu_cpusig_callback_func_t old = cpusig_callback_fn; - - while(!OSCompareAndSwapPtr(old, NULL, - (void * volatile *)&cpusig_callback_fn)) { - old = cpusig_callback_fn; - } - - return KERN_SUCCESS; - } - return KERN_FAILURE; -} - -__private_extern__ -kern_return_t chudxnu_cpusig_send(int otherCPU, uint32_t request) -{ - int thisCPU; - kern_return_t retval = KERN_FAILURE; - int retries = 0; - boolean_t oldlevel; - uint32_t temp[2]; - - oldlevel = ml_set_interrupts_enabled(FALSE); - thisCPU = cpu_number(); - - if(thisCPU!=otherCPU) { - temp[0] = 0xFFFFFFFF; /* set sync flag */ - temp[1] = request; /* set request */ - __asm__ volatile("eieio"); /* force order */ - __asm__ volatile("sync"); /* force to memory */ - - do { - retval=cpu_signal(otherCPU, SIGPcpureq, CPRQchud, (uint32_t)&temp); - } while(retval!=KERN_SUCCESS && (retries++)<16); - - if(retries>=16) { - retval = KERN_FAILURE; - } else { - retval = hw_cpu_sync(temp, LockTimeOut); /* wait for the other processor */ - if(!retval) { - retval = KERN_FAILURE; - } else { - retval = KERN_SUCCESS; - } - } - } else { - retval = KERN_INVALID_ARGUMENT; - } - - ml_set_interrupts_enabled(oldlevel); - return retval; -} - diff --git a/osfmk/chud/ppc/chud_spr.h b/osfmk/chud/ppc/chud_spr.h deleted file mode 100644 index 479f664be..000000000 --- a/osfmk/chud/ppc/chud_spr.h +++ /dev/null @@ -1,273 +0,0 @@ -/* - * Copyright (c) 2003-2007 Apple Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef _CHUD_SPR_H_ -#define _CHUD_SPR_H_ - -/* PPC SPRs - 32-bit and 64-bit implementations */ -#define chud_ppc_srr0 26 -#define chud_ppc_srr1 27 -#define chud_ppc_dsisr 18 -#define chud_ppc_dar 19 -#define chud_ppc_dec 22 -#define chud_ppc_sdr1 25 -#define chud_ppc_sprg0 272 -#define chud_ppc_sprg1 273 -#define chud_ppc_sprg2 274 -#define chud_ppc_sprg3 275 -#define chud_ppc_ear 282 -#define chud_ppc_tbl 284 -#define chud_ppc_tbu 285 -#define chud_ppc_pvr 287 -#define chud_ppc_ibat0u 528 -#define chud_ppc_ibat0l 529 -#define chud_ppc_ibat1u 530 -#define chud_ppc_ibat1l 531 -#define chud_ppc_ibat2u 532 -#define chud_ppc_ibat2l 533 -#define chud_ppc_ibat3u 534 -#define chud_ppc_ibat3l 535 -#define chud_ppc_dbat0u 536 -#define chud_ppc_dbat0l 537 -#define chud_ppc_dbat1u 538 -#define chud_ppc_dbat1l 539 -#define chud_ppc_dbat2u 540 -#define chud_ppc_dbat2l 541 -#define chud_ppc_dbat3u 542 -#define chud_ppc_dbat3l 543 -#define chud_ppc_dabr 1013 -#define chud_ppc_msr 10000 /* FAKE */ - -/* PPC SPRs - 32-bit implementations */ -#define chud_ppc32_sr0 20000 /* FAKE */ -#define chud_ppc32_sr1 20001 /* FAKE */ -#define chud_ppc32_sr2 20002 /* FAKE */ -#define chud_ppc32_sr3 20003 /* FAKE */ -#define chud_ppc32_sr4 20004 /* FAKE */ -#define chud_ppc32_sr5 20005 /* FAKE */ -#define chud_ppc32_sr6 20006 /* FAKE */ -#define chud_ppc32_sr7 20007 /* FAKE */ -#define chud_ppc32_sr8 20008 /* FAKE */ -#define chud_ppc32_sr9 20009 /* FAKE */ -#define chud_ppc32_sr10 20010 /* FAKE */ -#define chud_ppc32_sr11 20011 /* FAKE */ -#define chud_ppc32_sr12 20012 /* FAKE */ -#define chud_ppc32_sr13 20013 /* FAKE */ -#define chud_ppc32_sr14 20014 /* FAKE */ -#define chud_ppc32_sr15 20015 /* FAKE */ - -/* PPC SPRs - 64-bit implementations */ -#define chud_ppc64_asr 280 - -/* PPC SPRs - 750/750CX/750CXe/750FX Specific */ -#define chud_750_upmc1 937 -#define chud_750_upmc2 938 -#define chud_750_upmc3 941 -#define chud_750_upmc4 942 
-#define chud_750_mmcr0 952 -#define chud_750_pmc1 953 -#define chud_750_pmc2 954 -#define chud_750_sia 955 -#define chud_750_mmcr1 956 -#define chud_750_pmc3 957 -#define chud_750_pmc4 958 -#define chud_750_hid0 1008 -#define chud_750_hid1 1009 -#define chud_750_iabr 1010 -#define chud_750_l2cr 1017 -#define chud_750_ictc 1019 -#define chud_750_thrm1 1020 -#define chud_750_thrm2 1021 -#define chud_750_thrm3 1022 -#define chud_750fx_ibat4u 560 /* 750FX only */ -#define chud_750fx_ibat4l 561 /* 750FX only */ -#define chud_750fx_ibat5u 562 /* 750FX only */ -#define chud_750fx_ibat5l 563 /* 750FX only */ -#define chud_750fx_ibat6u 564 /* 750FX only */ -#define chud_750fx_ibat6l 565 /* 750FX only */ -#define chud_750fx_ibat7u 566 /* 750FX only */ -#define chud_750fx_ibat7l 567 /* 750FX only */ -#define chud_750fx_dbat4u 568 /* 750FX only */ -#define chud_750fx_dbat4l 569 /* 750FX only */ -#define chud_750fx_dbat5u 570 /* 750FX only */ -#define chud_750fx_dbat5l 571 /* 750FX only */ -#define chud_750fx_dbat6u 572 /* 750FX only */ -#define chud_750fx_dbat6l 573 /* 750FX only */ -#define chud_750fx_dbat7u 574 /* 750FX only */ -#define chud_750fx_dbat7l 575 /* 750FX only */ -#define chud_750fx_hid2 1016 /* 750FX only */ - -/* PPC SPRs - 7400/7410 Specific */ -#define chud_7400_upmc1 937 -#define chud_7400_upmc2 938 -#define chud_7400_upmc3 941 -#define chud_7400_upmc4 942 -#define chud_7400_mmcr2 944 -#define chud_7400_bamr 951 -#define chud_7400_mmcr0 952 -#define chud_7400_pmc1 953 -#define chud_7400_pmc2 954 -#define chud_7400_siar 955 -#define chud_7400_mmcr1 956 -#define chud_7400_pmc3 957 -#define chud_7400_pmc4 958 -#define chud_7400_sda 959 -#define chud_7400_hid0 1008 -#define chud_7400_hid1 1009 -#define chud_7400_iabr 1010 -#define chud_7400_msscr0 1014 -#define chud_7410_l2pmcr 1016 /* 7410 only */ -#define chud_7400_l2cr 1017 -#define chud_7400_ictc 1019 -#define chud_7400_thrm1 1020 -#define chud_7400_thrm2 1021 -#define chud_7400_thrm3 1022 -#define 
chud_7400_pir 1023 - -/* PPC SPRs - 7450/7455 Specific */ -#define chud_7455_sprg4 276 /* 7455 only */ -#define chud_7455_sprg5 277 /* 7455 only */ -#define chud_7455_sprg6 278 /* 7455 only */ -#define chud_7455_sprg7 279 /* 7455 only */ -#define chud_7455_ibat4u 560 /* 7455 only */ -#define chud_7455_ibat4l 561 /* 7455 only */ -#define chud_7455_ibat5u 562 /* 7455 only */ -#define chud_7455_ibat5l 563 /* 7455 only */ -#define chud_7455_ibat6u 564 /* 7455 only */ -#define chud_7455_ibat6l 565 /* 7455 only */ -#define chud_7455_ibat7u 566 /* 7455 only */ -#define chud_7455_ibat7l 567 /* 7455 only */ -#define chud_7455_dbat4u 568 /* 7455 only */ -#define chud_7455_dbat4l 569 /* 7455 only */ -#define chud_7455_dbat5u 570 /* 7455 only */ -#define chud_7455_dbat5l 571 /* 7455 only */ -#define chud_7455_dbat6u 572 /* 7455 only */ -#define chud_7455_dbat6l 573 /* 7455 only */ -#define chud_7455_dbat7u 574 /* 7455 only */ -#define chud_7455_dbat7l 575 /* 7455 only */ -#define chud_7450_upmc5 929 -#define chud_7450_upmc6 930 -#define chud_7450_upmc1 937 -#define chud_7450_upmc2 938 -#define chud_7450_upmc3 941 -#define chud_7450_upmc4 942 -#define chud_7450_mmcr2 944 -#define chud_7450_pmc5 945 -#define chud_7450_pmc6 946 -#define chud_7450_bamr 951 -#define chud_7450_mmcr0 952 -#define chud_7450_pmc1 953 -#define chud_7450_pmc2 954 -#define chud_7450_siar 955 -#define chud_7450_mmcr1 956 -#define chud_7450_pmc3 957 -#define chud_7450_pmc4 958 -#define chud_7450_tlbmiss 980 -#define chud_7450_ptehi 981 -#define chud_7450_ptelo 982 -#define chud_7450_l3pm 983 -#define chud_7450_hid0 1008 -#define chud_7450_hid1 1009 -#define chud_7450_iabr 1010 -#define chud_7450_ldstdb 1012 -#define chud_7450_msscr0 1014 -#define chud_7450_msssr0 1015 -#define chud_7450_ldstcr 1016 -#define chud_7450_l2cr 1017 -#define chud_7450_l3cr 1018 -#define chud_7450_ictc 1019 -#define chud_7450_ictrl 1011 -#define chud_7450_thrm1 1020 -#define chud_7450_thrm2 1021 -#define chud_7450_thrm3 1022 
-#define chud_7450_pir 1023 - -/* PPC SPRs - 970 Specific */ -#define chud_970_vrsave 256 -#define chud_970_ummcra 770 -#define chud_970_upmc1 771 -#define chud_970_upmc2 772 -#define chud_970_upmc3 773 -#define chud_970_upmc4 774 -#define chud_970_upmc5 775 -#define chud_970_upmc6 776 -#define chud_970_upmc7 777 -#define chud_970_upmc8 778 -#define chud_970_ummcr0 779 -#define chud_970_usiar 780 -#define chud_970_usdar 781 -#define chud_970_ummcr1 782 -#define chud_970_uimc 783 -#define chud_970_mmcra 786 -#define chud_970_pmc1 787 -#define chud_970_pmc2 788 -#define chud_970_pmc3 789 -#define chud_970_pmc4 790 -#define chud_970_pmc5 791 -#define chud_970_pmc6 792 -#define chud_970_pmc7 793 -#define chud_970_pmc8 794 -#define chud_970_mmcr0 795 -#define chud_970_siar 796 -#define chud_970_sdar 797 -#define chud_970_mmcr1 798 -#define chud_970_imc 799 - -/* PPC SPRs - 7400/7410 Specific, Private */ -#define chud_7400_msscr1 1015 - -/* PPC SPRs - 64-bit implementations, Private */ -#define chud_ppc64_accr 29 -#define chud_ppc64_ctrl 152 - -/* PPC SPRs - 970 Specific, Private */ -#define chud_970_scomc 276 -#define chud_970_scomd 277 -#define chud_970_hsprg0 304 -#define chud_970_hsprg1 305 -#define chud_970_hdec 310 -#define chud_970_hior 311 -#define chud_970_rmor 312 -#define chud_970_hrmor 313 -#define chud_970_hsrr0 314 -#define chud_970_hsrr1 315 -#define chud_970_lpcr 318 -#define chud_970_lpidr 319 -#define chud_970_trig0 976 -#define chud_970_trig1 977 -#define chud_970_trig2 978 -#define chud_970_hid0 1008 -#define chud_970_hid1 1009 -#define chud_970_hid4 1012 -#define chud_970_hid5 1014 -#define chud_970_dabrx 1015 -#define chud_970_trace 1022 -#define chud_970_pir 1023 - -#endif // _CHUD_SPR_H_ - diff --git a/osfmk/chud/ppc/chud_thread_ppc.c b/osfmk/chud/ppc/chud_thread_ppc.c deleted file mode 100644 index 0bca0ac92..000000000 --- a/osfmk/chud/ppc/chud_thread_ppc.c +++ /dev/null @@ -1,586 +0,0 @@ -/* - * Copyright (c) 2003-2007 Apple Inc. 
All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include - -#include -#include -#include -#include - -#include -#include - -#include -#include - -#include -#include -#include -#include - -#if 0 -#pragma mark **** thread state **** -#endif - -__private_extern__ -kern_return_t chudxnu_copy_savearea_to_threadstate(thread_flavor_t flavor, thread_state_t tstate, mach_msg_type_number_t *count, struct savearea *sv) -{ - struct ppc_thread_state *ts; - struct ppc_thread_state64 *xts; - - switch(flavor) { - case PPC_THREAD_STATE: - if(*count < PPC_THREAD_STATE_COUNT) { /* Is the count ok? 
*/ - *count = 0; - return KERN_INVALID_ARGUMENT; - } - ts = (struct ppc_thread_state *) tstate; - if(sv) { - ts->r0 = (unsigned int)sv->save_r0; - ts->r1 = (unsigned int)sv->save_r1; - ts->r2 = (unsigned int)sv->save_r2; - ts->r3 = (unsigned int)sv->save_r3; - ts->r4 = (unsigned int)sv->save_r4; - ts->r5 = (unsigned int)sv->save_r5; - ts->r6 = (unsigned int)sv->save_r6; - ts->r7 = (unsigned int)sv->save_r7; - ts->r8 = (unsigned int)sv->save_r8; - ts->r9 = (unsigned int)sv->save_r9; - ts->r10 = (unsigned int)sv->save_r10; - ts->r11 = (unsigned int)sv->save_r11; - ts->r12 = (unsigned int)sv->save_r12; - ts->r13 = (unsigned int)sv->save_r13; - ts->r14 = (unsigned int)sv->save_r14; - ts->r15 = (unsigned int)sv->save_r15; - ts->r16 = (unsigned int)sv->save_r16; - ts->r17 = (unsigned int)sv->save_r17; - ts->r18 = (unsigned int)sv->save_r18; - ts->r19 = (unsigned int)sv->save_r19; - ts->r20 = (unsigned int)sv->save_r20; - ts->r21 = (unsigned int)sv->save_r21; - ts->r22 = (unsigned int)sv->save_r22; - ts->r23 = (unsigned int)sv->save_r23; - ts->r24 = (unsigned int)sv->save_r24; - ts->r25 = (unsigned int)sv->save_r25; - ts->r26 = (unsigned int)sv->save_r26; - ts->r27 = (unsigned int)sv->save_r27; - ts->r28 = (unsigned int)sv->save_r28; - ts->r29 = (unsigned int)sv->save_r29; - ts->r30 = (unsigned int)sv->save_r30; - ts->r31 = (unsigned int)sv->save_r31; - ts->cr = (unsigned int)sv->save_cr; - ts->xer = (unsigned int)sv->save_xer; - ts->lr = (unsigned int)sv->save_lr; - ts->ctr = (unsigned int)sv->save_ctr; - ts->srr0 = (unsigned int)sv->save_srr0; - ts->srr1 = (unsigned int)sv->save_srr1; - ts->mq = 0; - ts->vrsave = (unsigned int)sv->save_vrsave; - } else { - bzero((void *)ts, sizeof(struct ppc_thread_state)); - } - *count = PPC_THREAD_STATE_COUNT; /* Pass back the amount we actually copied */ - return KERN_SUCCESS; - break; - case PPC_THREAD_STATE64: - if(*count < PPC_THREAD_STATE64_COUNT) { /* Is the count ok? 
*/ - return KERN_INVALID_ARGUMENT; - } - xts = (struct ppc_thread_state64 *) tstate; - if(sv) { - xts->r0 = sv->save_r0; - xts->r1 = sv->save_r1; - xts->r2 = sv->save_r2; - xts->r3 = sv->save_r3; - xts->r4 = sv->save_r4; - xts->r5 = sv->save_r5; - xts->r6 = sv->save_r6; - xts->r7 = sv->save_r7; - xts->r8 = sv->save_r8; - xts->r9 = sv->save_r9; - xts->r10 = sv->save_r10; - xts->r11 = sv->save_r11; - xts->r12 = sv->save_r12; - xts->r13 = sv->save_r13; - xts->r14 = sv->save_r14; - xts->r15 = sv->save_r15; - xts->r16 = sv->save_r16; - xts->r17 = sv->save_r17; - xts->r18 = sv->save_r18; - xts->r19 = sv->save_r19; - xts->r20 = sv->save_r20; - xts->r21 = sv->save_r21; - xts->r22 = sv->save_r22; - xts->r23 = sv->save_r23; - xts->r24 = sv->save_r24; - xts->r25 = sv->save_r25; - xts->r26 = sv->save_r26; - xts->r27 = sv->save_r27; - xts->r28 = sv->save_r28; - xts->r29 = sv->save_r29; - xts->r30 = sv->save_r30; - xts->r31 = sv->save_r31; - xts->cr = sv->save_cr; - xts->xer = sv->save_xer; - xts->lr = sv->save_lr; - xts->ctr = sv->save_ctr; - xts->srr0 = sv->save_srr0; - xts->srr1 = sv->save_srr1; - xts->vrsave = sv->save_vrsave; - } else { - bzero((void *)xts, sizeof(struct ppc_thread_state64)); - } - *count = PPC_THREAD_STATE64_COUNT; /* Pass back the amount we actually copied */ - return KERN_SUCCESS; - break; - default: - *count = 0; - return KERN_INVALID_ARGUMENT; - break; - } -} - -__private_extern__ -kern_return_t chudxnu_copy_threadstate_to_savearea(struct savearea *sv, thread_flavor_t flavor, thread_state_t tstate, mach_msg_type_number_t *count) -{ - struct ppc_thread_state *ts; - struct ppc_thread_state64 *xts; - - switch(flavor) { - case PPC_THREAD_STATE: - if(*count < PPC_THREAD_STATE_COUNT) { /* Is the count ok? 
*/ - return KERN_INVALID_ARGUMENT; - } - ts = (struct ppc_thread_state *) tstate; - if(sv) { - sv->save_r0 = (uint64_t)ts->r0; - sv->save_r1 = (uint64_t)ts->r1; - sv->save_r2 = (uint64_t)ts->r2; - sv->save_r3 = (uint64_t)ts->r3; - sv->save_r4 = (uint64_t)ts->r4; - sv->save_r5 = (uint64_t)ts->r5; - sv->save_r6 = (uint64_t)ts->r6; - sv->save_r7 = (uint64_t)ts->r7; - sv->save_r8 = (uint64_t)ts->r8; - sv->save_r9 = (uint64_t)ts->r9; - sv->save_r10 = (uint64_t)ts->r10; - sv->save_r11 = (uint64_t)ts->r11; - sv->save_r12 = (uint64_t)ts->r12; - sv->save_r13 = (uint64_t)ts->r13; - sv->save_r14 = (uint64_t)ts->r14; - sv->save_r15 = (uint64_t)ts->r15; - sv->save_r16 = (uint64_t)ts->r16; - sv->save_r17 = (uint64_t)ts->r17; - sv->save_r18 = (uint64_t)ts->r18; - sv->save_r19 = (uint64_t)ts->r19; - sv->save_r20 = (uint64_t)ts->r20; - sv->save_r21 = (uint64_t)ts->r21; - sv->save_r22 = (uint64_t)ts->r22; - sv->save_r23 = (uint64_t)ts->r23; - sv->save_r24 = (uint64_t)ts->r24; - sv->save_r25 = (uint64_t)ts->r25; - sv->save_r26 = (uint64_t)ts->r26; - sv->save_r27 = (uint64_t)ts->r27; - sv->save_r28 = (uint64_t)ts->r28; - sv->save_r29 = (uint64_t)ts->r29; - sv->save_r30 = (uint64_t)ts->r30; - sv->save_r31 = (uint64_t)ts->r31; - sv->save_cr = ts->cr; - sv->save_xer = (uint64_t)ts->xer; - sv->save_lr = (uint64_t)ts->lr; - sv->save_ctr = (uint64_t)ts->ctr; - sv->save_srr0 = (uint64_t)ts->srr0; - sv->save_srr1 = (uint64_t)ts->srr1; - sv->save_vrsave = ts->vrsave; - return KERN_SUCCESS; - } - break; - case PPC_THREAD_STATE64: - if(*count < PPC_THREAD_STATE64_COUNT) { /* Is the count ok? 
*/ - return KERN_INVALID_ARGUMENT; - } - xts = (struct ppc_thread_state64 *) tstate; - if(sv) { - sv->save_r0 = xts->r0; - sv->save_r1 = xts->r1; - sv->save_r2 = xts->r2; - sv->save_r3 = xts->r3; - sv->save_r4 = xts->r4; - sv->save_r5 = xts->r5; - sv->save_r6 = xts->r6; - sv->save_r7 = xts->r7; - sv->save_r8 = xts->r8; - sv->save_r9 = xts->r9; - sv->save_r10 = xts->r10; - sv->save_r11 = xts->r11; - sv->save_r12 = xts->r12; - sv->save_r13 = xts->r13; - sv->save_r14 = xts->r14; - sv->save_r15 = xts->r15; - sv->save_r16 = xts->r16; - sv->save_r17 = xts->r17; - sv->save_r18 = xts->r18; - sv->save_r19 = xts->r19; - sv->save_r20 = xts->r20; - sv->save_r21 = xts->r21; - sv->save_r22 = xts->r22; - sv->save_r23 = xts->r23; - sv->save_r24 = xts->r24; - sv->save_r25 = xts->r25; - sv->save_r26 = xts->r26; - sv->save_r27 = xts->r27; - sv->save_r28 = xts->r28; - sv->save_r29 = xts->r29; - sv->save_r30 = xts->r30; - sv->save_r31 = xts->r31; - sv->save_cr = xts->cr; - sv->save_xer = xts->xer; - sv->save_lr = xts->lr; - sv->save_ctr = xts->ctr; - sv->save_srr0 = xts->srr0; - sv->save_srr1 = xts->srr1; - sv->save_vrsave = xts->vrsave; - return KERN_SUCCESS; - } - } - return KERN_FAILURE; -} - -__private_extern__ -kern_return_t chudxnu_thread_user_state_available(thread_t thread) -{ - if(find_user_regs(thread)) { - return KERN_SUCCESS; - } else { - return KERN_FAILURE; - } -} - -__private_extern__ -kern_return_t chudxnu_thread_get_state(thread_t thread, - thread_flavor_t flavor, - thread_state_t tstate, - mach_msg_type_number_t *count, - boolean_t user_only) -{ - if(flavor==PPC_THREAD_STATE || flavor==PPC_THREAD_STATE64) { // machine_thread_get_state filters out some bits - struct savearea *sv; - if(user_only) { - sv = find_user_regs(thread); - } else { - sv = find_kern_regs(thread); - } - return chudxnu_copy_savearea_to_threadstate(flavor, tstate, count, sv); - } else { - if(user_only) { - return machine_thread_get_state(thread, flavor, tstate, count); - } else { - // doesn't do FP 
or VMX - return machine_thread_get_kern_state(thread, flavor, tstate, count); - } - } -} - -__private_extern__ -kern_return_t chudxnu_thread_set_state(thread_t thread, - thread_flavor_t flavor, - thread_state_t tstate, - mach_msg_type_number_t count, - boolean_t user_only) -{ - if(flavor==PPC_THREAD_STATE || flavor==PPC_THREAD_STATE64) { // machine_thread_set_state filters out some bits - struct savearea *sv; - if(user_only) { - sv = find_user_regs(thread); - } else { - sv = find_kern_regs(thread); - } - return chudxnu_copy_threadstate_to_savearea(sv, flavor, tstate, &count); - } else { - return machine_thread_set_state(thread, flavor, tstate, count); // always user - } -} - -#if 0 -#pragma mark **** task memory read/write **** -#endif - -__private_extern__ -kern_return_t chudxnu_task_read(task_t task, void *kernaddr, uint64_t usraddr, vm_size_t size) -{ - kern_return_t ret = KERN_SUCCESS; - - if(ml_at_interrupt_context()) { - // can't do this on an interrupt stack - return KERN_FAILURE; - } - - if(!chudxnu_is_64bit_task(task)) { // clear any cruft out of upper 32-bits for 32-bit tasks - usraddr &= 0x00000000FFFFFFFFULL; - } - - if(current_task()==task) { - thread_t cur_thr = current_thread(); - vm_offset_t recover_handler = cur_thr->recover; - - if(copyin(usraddr, kernaddr, size)) { - ret = KERN_FAILURE; - } - - cur_thr->recover = recover_handler; - } else { - - vm_map_t map = get_task_map(task); - ret = vm_map_read_user(map, usraddr, kernaddr, size); - } - - return ret; -} - -__private_extern__ -kern_return_t chudxnu_task_write(task_t task, uint64_t useraddr, void *kernaddr, vm_size_t size) -{ - kern_return_t ret = KERN_SUCCESS; - - if(ml_at_interrupt_context()) { - // can't do this on an interrupt stack - return KERN_FAILURE; - } - - if(!chudxnu_is_64bit_task(task)) { // clear any cruft out of upper 32-bits for 32-bit tasks - useraddr &= 0x00000000FFFFFFFFULL; - } - - if(current_task()==task) { - thread_t cur_thr = current_thread(); - vm_offset_t recover_handler 
= cur_thr->recover; - - if(copyout(kernaddr, useraddr, size)) { - ret = KERN_FAILURE; - } - cur_thr->recover = recover_handler; - } else { - - vm_map_t map = get_task_map(task); - ret = vm_map_write_user(map, kernaddr, useraddr, size); - } - - return ret; -} - -__private_extern__ -kern_return_t chudxnu_kern_read(void *dstaddr, vm_offset_t srcaddr, vm_size_t size) -{ - return (ml_nofault_copy(srcaddr, (vm_offset_t) dstaddr, size) == size ? - KERN_SUCCESS: KERN_FAILURE); -} - -__private_extern__ -kern_return_t chudxnu_kern_write(vm_offset_t dstaddr, void *srcaddr, vm_size_t size) -{ - return (ml_nofault_copy((vm_offset_t) srcaddr, dstaddr, size) == size ? - KERN_SUCCESS: KERN_FAILURE); -} - -// chudxnu_thread_get_callstack gathers a raw callstack along with any information needed to -// fix it up later (in case we stopped program as it was saving values into prev stack frame, etc.) -// after sampling has finished. -// -// For an N-entry callstack: -// -// [0] current pc -// [1..N-3] stack frames (including current one) -// [N-2] current LR (return value if we're in a leaf function) -// [N-1] current r0 (in case we've saved LR in r0) -// - -#define FP_LINK_OFFSET 2 -#define STACK_ALIGNMENT_MASK 0xF // PPC stack frames are supposed to be 16-byte aligned -#define INST_ALIGNMENT_MASK 0x3 // Instructions are always 4-bytes wide - -#ifndef USER_MODE -#define USER_MODE(msr) ((msr) & MASK(MSR_PR) ? TRUE : FALSE) -#endif - -#ifndef SUPERVISOR_MODE -#define SUPERVISOR_MODE(msr) ((msr) & MASK(MSR_PR) ? FALSE : TRUE) -#endif - -#define VALID_STACK_ADDRESS(addr) (addr>=0x1000ULL && \ - (addr&STACK_ALIGNMENT_MASK)==0x0 && \ - (supervisor ? 
\ - (addr>=kernStackMin && \ - addr<=kernStackMax) : \ - TRUE)) - - -__private_extern__ -kern_return_t chudxnu_thread_get_callstack64( thread_t thread, - uint64_t *callStack, - mach_msg_type_number_t *count, - boolean_t user_only) -{ - kern_return_t kr; - task_t task = get_threadtask(thread); - uint64_t nextFramePointer = 0; - uint64_t currPC, currLR, currR0; - uint64_t framePointer; - uint64_t prevPC = 0; - uint64_t kernStackMin = thread->kernel_stack; - uint64_t kernStackMax = kernStackMin + kernel_stack_size; - uint64_t *buffer = callStack; - uint32_t tmpWord; - int bufferIndex = 0; - int bufferMaxIndex = *count; - boolean_t supervisor; - boolean_t is64Bit; - struct savearea *sv; - - if(user_only) { - sv = find_user_regs(thread); - } else { - sv = find_kern_regs(thread); - } - - if(!sv) { - *count = 0; - return KERN_FAILURE; - } - - supervisor = SUPERVISOR_MODE(sv->save_srr1); - if(supervisor) { - is64Bit = FALSE; /* XXX assuming task is always 32-bit */ - } else { - is64Bit = chudxnu_is_64bit_task(task); - } - - bufferMaxIndex = bufferMaxIndex - 2; // allot space for saving the LR and R0 on the stack at the end. - if(bufferMaxIndex<2) { - *count = 0; - return KERN_RESOURCE_SHORTAGE; - } - - currPC = sv->save_srr0; - framePointer = sv->save_r1; /* r1 is the stack pointer (no FP on PPC) */ - currLR = sv->save_lr; - currR0 = sv->save_r0; - - bufferIndex = 0; // start with a stack of size zero - buffer[bufferIndex++] = currPC; // save PC in position 0. - - // Now, fill buffer with stack backtraces. - while(bufferIndex<bufferMaxIndex && VALID_STACK_ADDRESS(framePointer)) { - uint64_t pc = 0; - // Above the stack pointer, the following values are saved: - // saved LR - // saved CR - // saved SP - //-> SP - // Here, we'll get the lr from the stack. - uint64_t fp_link; - - if(is64Bit) { - fp_link = framePointer + FP_LINK_OFFSET*sizeof(uint64_t); - } else { - fp_link = framePointer + FP_LINK_OFFSET*sizeof(uint32_t); - } - - // Note that we read the pc even for the first stack frame (which, in theory, - // is always empty because the callee fills it in just before it lowers the - // stack.
However, if we catch the program in between filling in the return - // address and lowering the stack, we want to still have a valid backtrace. - // FixupStack correctly disregards this value if necessary. - - if(supervisor) { - if(is64Bit) { - kr = chudxnu_kern_read(&pc, fp_link, sizeof(uint64_t)); - } else { - kr = chudxnu_kern_read(&tmpWord, fp_link, sizeof(uint32_t)); - pc = tmpWord; - } - } else { - if(is64Bit) { - kr = chudxnu_task_read(task, &pc, fp_link, sizeof(uint64_t)); - } else { - kr = chudxnu_task_read(task, &tmpWord, fp_link, sizeof(uint32_t)); - pc = tmpWord; - } - } - if(kr!=KERN_SUCCESS) { - pc = 0; - break; - } - - // retrieve the contents of the frame pointer and advance to the next stack frame if it's valid - if(supervisor) { - if(is64Bit) { - kr = chudxnu_kern_read(&nextFramePointer, framePointer, sizeof(uint64_t)); - } else { - kr = chudxnu_kern_read(&tmpWord, framePointer, sizeof(uint32_t)); - nextFramePointer = tmpWord; - } - } else { - if(is64Bit) { - kr = chudxnu_task_read(task, &nextFramePointer, framePointer, sizeof(uint64_t)); - } else { - kr = chudxnu_task_read(task, &tmpWord, framePointer, sizeof(uint32_t)); - nextFramePointer = tmpWord; - } - } - if(kr!=KERN_SUCCESS) { - nextFramePointer = 0; - } - - if(nextFramePointer) { - buffer[bufferIndex++] = pc; - prevPC = pc; - } - - if(nextFramePointer<framePointer) { - break; - } else { - framePointer = nextFramePointer; - } - } - - if(bufferIndex>=bufferMaxIndex) { - *count = 0; - return KERN_RESOURCE_SHORTAGE; - } - - // Save link register and R0 at bottom of stack (used for later fixup). - buffer[bufferIndex++] = currLR; - buffer[bufferIndex++] = currR0; - - *count = bufferIndex; - return KERN_SUCCESS; -} - diff --git a/osfmk/chud/ppc/chud_xnu_private.h b/osfmk/chud/ppc/chud_xnu_private.h deleted file mode 100644 index 72b2ed663..000000000 --- a/osfmk/chud/ppc/chud_xnu_private.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2003-2007 Apple Inc. All rights reserved.
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef _PPC_CHUD_XNU_PRIVATE_H_ -#define _PPC_CHUD_XNU_PRIVATE_H_ - -#if 0 -#pragma mark **** thread **** -#endif -// ***************************************************************************** -// thread -// ***************************************************************************** -extern kern_return_t chudxnu_copy_savearea_to_threadstate( - thread_flavor_t flavor, - thread_state_t tstate, - mach_msg_type_number_t *count, - struct savearea *sv); - -extern kern_return_t chudxnu_copy_threadstate_to_savearea( - struct savearea *sv, - thread_flavor_t flavor, - thread_state_t tstate, - mach_msg_type_number_t *count); - -#if 0 -#pragma mark **** cpu timer **** -#endif -typedef struct { - timer_call_data_t cpu_timer_call; - uint64_t t_deadline; - chudxnu_cpu_timer_callback_func_t cpu_timer_callback_fn; -} chudcpu_data_t; - -#endif /* _PPC_CHUD_XNU_PRIVATE_H_ */ diff --git a/osfmk/conf/MASTER b/osfmk/conf/MASTER index cadb1a976..e34f671cb 100644 --- a/osfmk/conf/MASTER +++ b/osfmk/conf/MASTER @@ -65,7 +65,7 @@ ident MACH # option should be on. # options MACH_KERNEL -options MACH_PAGEMAP +options MACH_PAGEMAP # options MACH_LOAD options MACH_RT options TASK_SWAPPER # @@ -108,6 +108,9 @@ options MACH_MP_DEBUG # # # operations on each element. # options ZONE_DEBUG # # + +options CONFIG_ZLEAKS # Live zone leak debugging # + # options ZONE_ALIAS_ADDR # # # @@ -141,7 +144,6 @@ options CONFIG_DTRACE # # # options MACH_COUNTERS # # - ########################################################## # # This defines configuration options that are normally used only during @@ -207,9 +209,6 @@ options CONFIG_ZONE_MAP_MIN=12582912 # options CONFIG_ZONE_MAP_MIN=6291456 # options CONFIG_ZONE_MAP_MIN=1048576 # -options CONFIG_TOKEN_QUEUE_SMALL=1 # -options CONFIG_TOKEN_QUEUE_SMALL=0 # - # # configurable kernel - use these options to strip strings from panic # and printf calls. 
@@ -250,3 +249,23 @@ options CONFIG_CODE_DECRYPTION # # Context switched counters # options CONFIG_COUNTERS # + +# +# Timeshare scheduler implementations +# +options CONFIG_SCHED_TRADITIONAL # +options CONFIG_SCHED_PROTO # +options CONFIG_SCHED_GRRR # +options CONFIG_SCHED_FIXEDPRIORITY # +options CONFIG_SCHED_GRRR_CORE # + +options CONFIG_SCHED_IDLE_IN_PLACE # + +# +# freeze - support app hibernation, used on embedded +# +options CONFIG_FREEZE # + + +options CHECK_CS_VALIDATION_BITMAP # + diff --git a/osfmk/conf/MASTER.i386 b/osfmk/conf/MASTER.i386 index b8cd08e05..42b4294e1 100644 --- a/osfmk/conf/MASTER.i386 +++ b/osfmk/conf/MASTER.i386 @@ -9,11 +9,10 @@ # Standard Apple MacOS X Configurations: # -------- ---- -------- --------------- # -# RELEASE = [ medium intel pc iokit mach_pe mach mach_kdp config_serial_kdp event vol hd pst gdb fixpri simple_clock mkernserv uxpr kernstack ipc_compat ipc_debug fb mk30 mk30_i386 hibernation config_sleep crypto config_dtrace config_mca config_vmx config_counters ] +# RELEASE = [ medium intel pc iokit mach_pe mach mach_kdp config_serial_kdp event vol hd pst gdb fixpri simple_clock mkernserv uxpr kernstack ipc_compat ipc_debug fb mk30 mk30_i386 hibernation config_sleep crypto config_dtrace config_mca config_vmx config_mtrr config_lapic config_counters zleaks config_sched_traditional config_sched_proto config_sched_grrr config_sched_fixedpriority mach_pagemap config_sched_idle_in_place ] # DEBUG= [ RELEASE osf_debug debug mach_kdb mach_assert] # PROFILE = [ RELEASE profile ] # -# # EMBEDDED_BASE = [ bsmall intel pc iokit mach_pe mach mach_kdp config_serial_kdp event vol hd pst gdb fixpri simple_clock mkernserv uxpr kernstack ipc_compat ipc_debug fb mk30 mk30_i386 hibernation config_sleep crypto ] # EMBEDDED = [ EMBEDDED_BASE no_printf_str no_kprintf_str no_kdebug ] # DEVELOPMENT = [ EMBEDDED_BASE mach_assert config_dtrace config_counters ] @@ -58,6 +57,8 @@ options CONFIG_SERIAL_KDP # KDP over serial # options PAE options X86_64 
options DISPATCH_COUNTS +options PAL_I386 +options CONFIG_YONAH # 32-bit Yonah support # # # Note: MAC/AUDIT options must be set in all the bsd/conf, osfmk/conf, and @@ -75,5 +76,6 @@ options CONFIG_CODE_DECRYPTION options CONFIG_MCA # Machine Check Architecture # options CONFIG_VMX # Virtual Machine Extensions # +options CONFIG_MTRR # Memory Type Range Registers # options NO_NESTED_PMAP # diff --git a/osfmk/conf/MASTER.ppc b/osfmk/conf/MASTER.ppc deleted file mode 100644 index 98036b366..000000000 --- a/osfmk/conf/MASTER.ppc +++ /dev/null @@ -1,67 +0,0 @@ -# -# Mach Operating System -# Copyright (c) 1986 Carnegie-Mellon University -# All rights reserved. The CMU software License Agreement -# specifies the terms and conditions for use and redistribution. -# -###################################################################### -# -# Standard Apple MacOS X Configurations: -# -------- ---- -------- --------------- -# -# RELEASE = [ medium mach_bsd mach_kdp iokit mach_pe ppc mach hibernation crypto config_dtrace config_counters ] -# DEVELOPMENT = [ RELEASE ] -# RELEASE_TRACE = [ RELEASE kdebug ] -# DEBUG = [ RELEASE mach_kdb debug mach_assert ] -# DEBUG_TRACE = [ DEBUG kdebug ] -# PROFILE = [ RELEASE profile ] -# -###################################################################### -# -############################################################################## -# -# MACH_PROF enables code for mach profiling. -# -options MACH_PROF # # -############################################################################## -# -# Debug -# -options DEBUG # # - -options PROFILE # kernel profiling # - -machine "ppc" -cpu "ppc" -pseudo-device scc 1 -pseudo-device vc 1 - -options MACHINE_TIMER_ROUTINES - -# Disabled by default, since mklinux does not need this -# unless running multiserver - the atalk stack at time of -# writing inserts a null filter! 
-#options NET_FILTER_COMPILER - -# Turn on the serial console by uncommenting the this: -#options SERIAL_CONSOLE_DEFAULT - -options MACH_KDP # # -options MACH_KDB # # -options MACH_BSD # # -options IOKIT # # -options MACH_PE # # - -# XXX for bringup, turns on mac disklabels, -# and some other nice stuff for the diskshim -options POWERMAC - -options DISPATCH_COUNTS - -# -# Note: MAC/AUDIT options must be set in all the bsd/conf, osfmk/conf, and -# security/conf MASTER files. -# -options CONFIG_MACF # Mandatory Access Control Framework -#options CONFIG_MACF_MACH # MACF applied to Mach services -options CONFIG_AUDIT # Kernel auditing diff --git a/osfmk/conf/MASTER.x86_64 b/osfmk/conf/MASTER.x86_64 index a3f336c06..993fa17ab 100644 --- a/osfmk/conf/MASTER.x86_64 +++ b/osfmk/conf/MASTER.x86_64 @@ -9,10 +9,8 @@ # Standard Apple MacOS X Configurations: # -------- ---- -------- --------------- # -# RELEASE = [ medium intel pc iokit mach_pe mach mach_kdp config_serial_kdp event vol hd pst gdb fixpri simple_clock mkernserv uxpr kernstack ipc_compat ipc_debug fb mk30 mk30_i386 hibernation config_sleep crypto config_dtrace config_mca config_vmx config_counters ] -# DEBUG= [ RELEASE osf_debug debug mach_assert ] -# PROFILE = [ RELEASE profile ] -# +# RELEASE = [ medium intel pc iokit mach_pe mach mach_kdp config_serial_kdp event vol hd pst gdb fixpri simple_clock mkernserv uxpr kernstack ipc_compat ipc_debug fb mk30 mk30_i386 hibernation config_sleep crypto config_dtrace config_mca config_vmx config_mtrr config_lapic config_counters zleaks config_sched_traditional config_sched_proto config_sched_grrr config_sched_fixedpriority mach_pagemap config_sched_idle_in_place ] +# DEBUG = [ RELEASE osf_debug debug mach_assert ] # # EMBEDDED_BASE = [ bsmall intel pc iokit mach_pe mach mach_kdp config_serial_kdp event vol hd pst gdb fixpri simple_clock mkernserv uxpr kernstack ipc_compat ipc_debug fb mk30 mk30_i386 hibernation config_sleep crypto ] # EMBEDDED = [ EMBEDDED_BASE 
no_printf_str no_kprintf_str no_kdebug ] @@ -20,8 +18,8 @@ # ###################################################################### # -machine "x86_64" # -cpu "x86_64" # +machine "x86_64" # +cpu "x86_64" # pseudo-device com 2 pseudo-device vc 1 @@ -56,6 +54,7 @@ options CONFIG_SERIAL_KDP # KDP over serial # options PAE options X86_64 options DISPATCH_COUNTS +options PAL_I386 # # Note: MAC/AUDIT options must be set in all the bsd/conf, osfmk/conf, and @@ -73,6 +72,7 @@ options CONFIG_CODE_DECRYPTION options CONFIG_MCA # Machine Check Architecture # options CONFIG_VMX # Virtual Machine Extensions # +options CONFIG_MTRR # Memory Type Range Registers # options NO_NESTED_PMAP # options CONFIG_NO_NESTED_PMAP # diff --git a/osfmk/conf/Makefile b/osfmk/conf/Makefile index 4010dbcba..330f94ab6 100644 --- a/osfmk/conf/Makefile +++ b/osfmk/conf/Makefile @@ -6,8 +6,7 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir include $(MakeInc_cmd) include $(MakeInc_def) -SETUP_SUBDIRS = \ - tools +SETUP_SUBDIRS = COMP_SUBDIRS = @@ -23,22 +22,21 @@ else export COMPOBJROOT=$(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)/$(COMPONENT) endif -$(COMPOBJROOT)/doconf: - @make build_setup +MASTER_CPU_PER_SOC = $(SOURCE)/MASTER.$(ARCH_CONFIG_LC).$(MACHINE_CONFIG_LC) $(COMPOBJROOT)/$(OSFMK_KERNEL_CONFIG)/Makefile: $(SOURCE)/MASTER \ $(SOURCE)/MASTER.$(ARCH_CONFIG_LC) \ $(SOURCE)/Makefile.template \ $(SOURCE)/Makefile.$(ARCH_CONFIG_LC) \ $(SOURCE)/files \ - $(SOURCE)/files.$(ARCH_CONFIG_LC) \ - $(COMPOBJROOT)/doconf + $(SOURCE)/files.$(ARCH_CONFIG_LC) $(_v)(doconf_target=$(addsuffix /conf, $(TARGET)); \ $(MKDIR) $${doconf_target}; \ cd $${doconf_target}; \ rm -f $(notdir $?); \ cp $? 
$${doconf_target}; \ - $(COMPOBJROOT)/doconf -c -cpu $(ARCH_CONFIG_LC) -d $(TARGET)/$(OSFMK_KERNEL_CONFIG) $(OSFMK_KERNEL_CONFIG); \ + if [ -f $(MASTER_CPU_PER_SOC) ]; then cp $(MASTER_CPU_PER_SOC) $${doconf_target}; fi; \ + $(SRCROOT)/SETUP/config/doconf -c -cpu $(ARCH_CONFIG_LC) -soc $(MACHINE_CONFIG_LC) -d $(TARGET)/$(OSFMK_KERNEL_CONFIG) $(OSFMK_KERNEL_CONFIG); \ ); $(COMPOBJROOT)/$(OSFMK_KERNEL_CONFIG)/platforms.h: $(COMPOBJROOT)/$(OSFMK_KERNEL_CONFIG)/Makefile @@ -47,11 +45,8 @@ $(COMPOBJROOT)/$(OSFMK_KERNEL_CONFIG)/platforms.h: $(COMPOBJROOT)/$(OSFMK_KERNEL ${LN} cputypes.h $@; \ ) -do_setup_conf: $(COMPOBJROOT)/doconf \ - $(COMPOBJROOT)/$(OSFMK_KERNEL_CONFIG)/Makefile \ +do_all: $(COMPOBJROOT)/$(OSFMK_KERNEL_CONFIG)/Makefile \ $(COMPOBJROOT)/$(OSFMK_KERNEL_CONFIG)/platforms.h - -do_all: do_setup_conf $(_v)next_source=$(subst conf/,,$(SOURCE)); \ ${MAKE} -C $(COMPOBJROOT)/$(OSFMK_KERNEL_CONFIG) \ MAKEFILES=$(TARGET)/$(OSFMK_KERNEL_CONFIG)/Makefile \ diff --git a/osfmk/conf/Makefile.i386 b/osfmk/conf/Makefile.i386 index 387d4aafb..e232c0e32 100644 --- a/osfmk/conf/Makefile.i386 +++ b/osfmk/conf/Makefile.i386 @@ -2,28 +2,19 @@ #BEGIN Machine dependent Makefile fragment for i386 ###################################################################### -CFLAGS+= -DAT386=1 -SFLAGS+= -DAT386=1 - -# Enable -Werror for i386 builds -CFLAGS+= $(WERROR) -Wshorten-64-to-32 -CWARNFLAGS= $(filter-out -Wbad-function-cast, $(CWARNFLAGS_STD)) +CWARNFLAGS = $(CWARNFLAGS_STD) -Wshorten-64-to-32 # Objects that don't compile cleanly: OBJS_NO_WERROR= \ - UNDRequest.o \ db_macro.o \ db_print.o \ db_sym.o \ db_variables.o \ db_disasm.o \ db_interface.o \ - db_trace.o \ - gssd_mach.o - -OBJS_WERROR=$(filter-out $(OBJS_NO_WERROR),$(OBJS)) + db_trace.o -$(OBJS_WERROR): WERROR=-Werror +$(foreach file,$(OBJS_NO_WERROR),$(eval $(call add_perfile_cflags,$(file),-Wno-error))) # Files that must go in the __HIB segment: UNCONFIGURED_HIB_FILES= \ diff --git a/osfmk/conf/Makefile.ppc 
b/osfmk/conf/Makefile.ppc deleted file mode 100644 index 35d7f0dd2..000000000 --- a/osfmk/conf/Makefile.ppc +++ /dev/null @@ -1,76 +0,0 @@ -###################################################################### -#BEGIN Machine dependent Makefile fragment for ppc -###################################################################### - -# -# ppc should be (mostly) warning free -# -CFLAGS+= $(WERROR) -CWARNFLAGS= $(filter-out -Wbad-function-cast, $(CWARNFLAGS_STD)) - -# Objects that don't compile cleanly: -OBJS_NO_WERROR= \ - UNDRequest.o \ - machine_routines.o \ - db_examine.o \ - db_macro.o \ - db_print.o \ - db_sym.o \ - db_variables.o \ - ppc_disasm.o \ - db_disasm.o \ - db_trace.o \ - db_low_trace.o \ - gssd_mach.o \ - kdp_machdep.o - -OBJS_WERROR=$(filter-out $(OBJS_NO_WERROR),$(OBJS)) - -$(OBJS_WERROR): WERROR=-Werror - -export bsd_vm.o_CFLAGS_ADD=-Werror -Wshorten-64-to-32 -export device_vm.o_CFLAGS_ADD=-Werror -Wshorten-64-to-32 -export memory_object.o_CFLAGS_ADD=-Werror -Wshorten-64-to-32 -export vm32_user.o_CFLAGS_ADD=-Werror -Wshorten-64-to-32 -export vm_apple_protect.o_CFLAGS_ADD=-Werror -Wshorten-64-to-32 -export vm_debug.o_CFLAGS_ADD=-Werror -Wshorten-64-to-32 -export vm_external.o_CFLAGS_ADD=-Werror -Wshorten-64-to-32 -export vm_fault.o_CFLAGS_ADD=-Werror -Wshorten-64-to-32 -export vm_init.o_CFLAGS_ADD=-Werror -Wshorten-64-to-32 -export vm_kern.o_CFLAGS_ADD=-Werror -Wshorten-64-to-32 -export vm_map.o_CFLAGS_ADD=-Werror -Wshorten-64-to-32 -export vm_object.o_CFLAGS_ADD=-Werror -Wshorten-64-to-32 -export vm_pageout.o_CFLAGS_ADD=-Werror -Wshorten-64-to-32 -export vm_purgeable.o_CFLAGS_ADD=-Werror -Wshorten-64-to-32 -export vm_resident.o_CFLAGS_ADD=-Werror -Wshorten-64-to-32 -export vm_shared_region.o_CFLAGS_ADD=-Werror -Wshorten-64-to-32 -export vm_swapfile_pager.o_CFLAGS_ADD=-Werror -Wshorten-64-to-32 -export vm_user.o_CFLAGS_ADD=-Werror -Wshorten-64-to-32 - -export default_pager.o_CFLAGS_ADD=-Werror -Wshorten-64-to-32 -export 
dp_backing_store.o_CFLAGS_ADD=-Werror -Wshorten-64-to-32 -export dp_memory_object.o_CFLAGS_ADD=-Werror -Wshorten-64-to-32 - -# -# KDB support -# - -makedis: $(SRCROOT)/osfmk/ddb/makedis.c - $(HOST_CC) -Werror -Wall -o $@ $< - -ppc_disasm.o_CFLAGS_ADD = -Dperror=db_printf -Dexit=db_error -Dmalloc=db_disasm_malloc - -ppc_disasm.c ppc_disasm.h : $(SRCROOT)/osfmk/ppc/ppc_disasm.i makedis - ./makedis -w -h ./ppc_disasm.h $(SOURCE_DIR)/osfmk/ppc/ppc_disasm.i > ./ppc_disasm.c - - -db_disasm.o : ppc_disasm.h - -# Files that must go in the __HIB segment: -UNCONFIGURED_HIB_FILES= \ - hibernate_restore.o -HIB_FILES=$(filter $(UNCONFIGURED_HIB_FILES),$(OBJS)) - -###################################################################### -#END Machine dependent Makefile fragment for ppc -###################################################################### diff --git a/osfmk/conf/Makefile.template b/osfmk/conf/Makefile.template index 75f1c7f31..c39e844cf 100644 --- a/osfmk/conf/Makefile.template +++ b/osfmk/conf/Makefile.template @@ -26,7 +26,7 @@ include $(MakeInc_def) # # XXX: CFLAGS # -CFLAGS+= -imacros meta_features.h -DMACH_KERNEL_PRIVATE $(CFLAGS_INLINE_CONFIG) +CFLAGS+= -include meta_features.h -DMACH_KERNEL_PRIVATE $(CFLAGS_INLINE_CONFIG) # # Directories for mig generated files @@ -86,18 +86,20 @@ ${OBJS}: ${OBJSDEPS} LDOBJS = $(OBJS) -$(COMPONENT).o: $(LDOBJS) assym.s - $(_v)for hib_file in ${HIB_FILES}; \ +$(COMPONENT).filelist: $(LDOBJS) assym.s + $(_v)if [ $(BUILD_MACHO_OBJ) -eq 1 ]; then \ + for hib_file in ${HIB_FILES}; \ do \ $(SEG_HACK) __HIB $${hib_file} -o $${hib_file}__; \ mv $${hib_file}__ $${hib_file} ; \ - done; + done; \ + fi @echo LDFILELIST $(COMPONENT) $(_v)( for obj in ${LDOBJS}; do \ echo $(TARGET)$(COMP_OBJ_DIR)/$(KERNEL_CONFIG)/$${obj}; \ - done; ) > $(COMPONENT).o + done; ) > $(COMPONENT).filelist -do_all: $(COMPONENT).o +do_all: $(COMPONENT).filelist do_depend: do_all $(_v)${MD} -u Makedep -f -d `ls *.d`; @@ -108,8 +110,14 @@ do_build_all: do_depend 
# we name it genassym.o to help with the automatic # dependency generation -genassym.o: $(SOURCE_DIR)/$(COMPONENT)/$(ARCH_CONFIG_LC)/genassym.c - $(_v)${KCC} ${CFLAGS} -MD ${_HOST_EXTRA_CFLAGS} -S -o ${@} -c ${INCFLAGS} $< +GENASSYM_LOCATION = $(ARCH_CONFIG_LC) + +ifeq ($(ARCH_CONFIG_LC),x86_64) +GENASSYM_LOCATION = i386 +endif + +genassym.o: $(SOURCE_DIR)/$(COMPONENT)/$(GENASSYM_LOCATION)/genassym.c + $(_v)${KCC} $(subst -flto,,${CFLAGS}) -MD ${_HOST_EXTRA_CFLAGS} -S -o ${@} -c ${INCFLAGS} $< assym.s: genassym.o $(_v)sed -e '/#DEFINITION#/!d' -e 's/^.*#DEFINITION#//' -e 's/\$$//' -e 'p' -e 's/#//2' -e 's/[^A-Za-z0-9_]*\([A-Za-z0-9_]*\)/ \1_NUM/2' genassym.o > ${@} diff --git a/osfmk/conf/Makefile.x86_64 b/osfmk/conf/Makefile.x86_64 index d24ace3bf..768a50845 100644 --- a/osfmk/conf/Makefile.x86_64 +++ b/osfmk/conf/Makefile.x86_64 @@ -2,32 +2,7 @@ #BEGIN Machine dependent Makefile fragment for x86_64 ###################################################################### -CFLAGS+= -DAT386=1 -SFLAGS+= -DAT386=1 - -CFLAGS+= $(WERROR) -Wshorten-64-to-32 -CWARNFLAGS= $(filter-out -Wbad-function-cast, $(CWARNFLAGS_STD)) - -# Objects that don't compile cleanly: -OBJS_NO_WERROR= \ - UNDRequest.o \ - db_examine.o \ - db_macro.o \ - db_print.o \ - db_sym.o \ - db_variables.o \ - db_disasm.o \ - db_interface.o \ - db_trace.o \ - host_priv_server.o \ - mach_host_server.o \ - security_server.o \ - device_server.o \ - gssd_mach.o \ - -OBJS_WERROR=$(filter-out $(OBJS_NO_WERROR),$(OBJS)) - -$(OBJS_WERROR): WERROR=-Werror +CWARNFLAGS = $(CWARNFLAGS_STD) -Wshorten-64-to-32 # Files that must go in the __HIB segment: UNCONFIGURED_HIB_FILES= \ diff --git a/osfmk/conf/files b/osfmk/conf/files index 40e2d16d2..7a97e71c6 100644 --- a/osfmk/conf/files +++ b/osfmk/conf/files @@ -72,6 +72,8 @@ OPTIONS/stack_usage optional stack_usage OPTIONS/config_dtrace optional config_dtrace OPTIONS/config_counters optional config_counters +OPTIONS/no_kextd optional no_kextd + # Default pager and system 
pager files, to be moved to separate component osfmk/default_pager/default_pager.c standard @@ -93,7 +95,7 @@ osfmk/default_pager/dp_memory_object.c standard # # kextd files # -./kextd/kextd_mach.c standard +./kextd/kextd_mach.c optional not no_kextd # # UserNotification files @@ -152,13 +154,14 @@ osfmk/kern/clock_oldops.c standard osfmk/kern/counters.c standard osfmk/kern/debug.c standard osfmk/kern/exception.c standard +osfmk/kern/extmod_statistics.c standard osfmk/kern/host.c standard osfmk/kern/host_notify.c standard osfmk/kern/ipc_clock.c standard osfmk/kern/ipc_host.c standard osfmk/kern/ipc_kobject.c standard osfmk/kern/ipc_mig.c standard -osfmk/kern/ipc_misc.c optional config_embedded +osfmk/kern/ipc_misc.c standard osfmk/kern/ipc_sync.c standard osfmk/kern/ipc_tt.c standard osfmk/kern/kalloc.c standard @@ -176,6 +179,9 @@ osfmk/kern/processor_data.c standard osfmk/kern/queue.c standard osfmk/kern/sched_average.c standard osfmk/kern/sched_prim.c standard +osfmk/kern/sched_proto.c optional config_sched_proto +osfmk/kern/sched_grrr.c optional config_sched_grrr_core +osfmk/kern/sched_fixedpriority.c optional config_sched_fixedpriority osfmk/kern/security.c optional config_macf osfmk/kern/stack.c standard osfmk/kern/startup.c standard @@ -235,6 +241,7 @@ osfmk/pmc/pmc.c standard ./mach/security_server.c optional config_macf osfmk/vm/bsd_vm.c optional mach_bsd +osfmk/vm/default_freezer.c optional config_freeze osfmk/vm/device_vm.c standard osfmk/vm/memory_object.c standard osfmk/vm/vm_debug.c standard @@ -243,6 +250,9 @@ osfmk/vm/vm_fault.c standard osfmk/vm/vm_init.c standard osfmk/vm/vm_kern.c standard osfmk/vm/vm_map.c standard +osfmk/vm/vm_map_store.c standard +osfmk/vm/vm_map_store_ll.c standard +osfmk/vm/vm_map_store_rb.c standard osfmk/vm/vm_object.c standard osfmk/vm/vm_pageout.c standard osfmk/vm/vm_purgeable.c standard @@ -280,4 +290,3 @@ osfmk/chud/chud_osfmk_callback.c standard osfmk/chud/chud_thread.c standard osfmk/console/serial_general.c 
standard - diff --git a/osfmk/conf/files.i386 b/osfmk/conf/files.i386 index 9fe585040..8c2864527 100644 --- a/osfmk/conf/files.i386 +++ b/osfmk/conf/files.i386 @@ -22,11 +22,15 @@ osfmk/vm/vm_apple_protect.c standard osfmk/i386/pmap.c standard osfmk/i386/pmap_x86_common.c standard +osfmk/i386/pmap_common.c standard +osfmk/i386/pal_routines.c optional pal_i386 +osfmk/i386/pal_routines_asm.s optional pal_i386 osfmk/ddb/db_aout.c optional mach_kdb osfmk/i386/bsd_i386.c optional mach_bsd +osfmk/i386/bsd_i386_native.c optional mach_bsd osfmk/i386/machdep_call.c optional mach_bsd osfmk/i386/_setjmp.s standard @@ -50,7 +54,8 @@ osfmk/i386/idt.s standard osfmk/i386/io_map.c standard osfmk/i386/ktss.c standard osfmk/i386/ldt.c standard -osfmk/i386/loose_ends.c standard +osfmk/i386/loose_ends.c standard +osfmk/i386/copyio.c standard osfmk/i386/locks_i386.c standard osfmk/i386/locore.s standard osfmk/i386/start.s standard @@ -64,9 +69,12 @@ osfmk/i386/mcount.s optional profile osfmk/i386/mp_desc.c standard #osfmk/i386/ntoh.s standard osfmk/i386/pcb.c standard +osfmk/i386/pcb_native.c standard osfmk/i386/phys.c standard osfmk/i386/rtclock.c standard +osfmk/i386/rtclock_native.c standard osfmk/i386/trap.c standard +osfmk/i386/trap_native.c standard osfmk/i386/user_ldt.c standard osfmk/i386/Diagnostics.c standard osfmk/i386/pmCPU.c standard @@ -74,41 +82,21 @@ osfmk/i386/tsc.c standard osfmk/i386/commpage/commpage.c standard osfmk/i386/commpage/commpage_asm.s standard -osfmk/i386/commpage/atomic.s standard -osfmk/i386/commpage/cpu_number.s standard -osfmk/i386/commpage/commpage_mach_absolute_time.s standard -osfmk/i386/commpage/spinlocks.s standard osfmk/i386/commpage/pthreads.s standard -osfmk/i386/commpage/cacheflush.s standard -osfmk/i386/commpage/commpage_gettimeofday.s standard -osfmk/i386/commpage/bcopy_scalar.s standard -osfmk/i386/commpage/bcopy_sse2.s standard -osfmk/i386/commpage/bcopy_sse3x.s standard -osfmk/i386/commpage/bcopy_sse3x_64.s standard 
-osfmk/i386/commpage/bcopy_sse42.s standard -osfmk/i386/commpage/bcopy_sse42_64.s standard -osfmk/i386/commpage/bzero_scalar.s standard -osfmk/i386/commpage/bzero_sse2.s standard -osfmk/i386/commpage/bzero_sse2_64.s standard -osfmk/i386/commpage/bzero_sse42.s standard -osfmk/i386/commpage/bzero_sse42_64.s standard -osfmk/i386/commpage/memset_pattern_sse2.s standard -osfmk/i386/commpage/memset_pattern_sse2_64.s standard -osfmk/i386/commpage/longcopy_sse3x.s standard -osfmk/i386/commpage/longcopy_sse3x_64.s standard -osfmk/i386/commpage/commpage_sigs.c standard osfmk/i386/commpage/fifo_queues.s standard osfmk/i386/AT386/conf.c standard osfmk/i386/AT386/model_dep.c standard osfmk/i386/lapic.c standard +osfmk/i386/lapic_native.c standard osfmk/i386/mp.c standard +osfmk/i386/mp_native.c standard osfmk/i386/acpi.c standard osfmk/i386/acpi_wakeup.s standard -osfmk/i386/mtrr.c standard +osfmk/i386/mtrr.c optional config_mtrr osfmk/console/i386/serial_console.c optional com device-driver @@ -134,6 +122,8 @@ osfmk/chud/i386/chud_osfmk_callback_i386.c standard osfmk/chud/i386/chud_cpu_i386.c standard osfmk/chud/i386/chud_thread_i386.c standard +osfmk/i386/ucode.c standard + osfmk/i386/vmx/vmx_cpu.c optional config_vmx osfmk/i386/vmx/vmx_shims.c optional config_vmx @@ -145,6 +135,6 @@ osfmk/i386/vmx/vmx_shims.c optional config_vmx #osfmk/OPTIONS/hi_res_clock optional hi_res_clock -osfmk/i386/startup64.c optional x86_64 -osfmk/i386/start64.s optional x86_64 -osfmk/i386/idt64.s optional x86_64 +osfmk/i386/startup64.c standard +osfmk/i386/start64.s standard +osfmk/i386/idt64.s standard diff --git a/osfmk/conf/files.ppc b/osfmk/conf/files.ppc deleted file mode 100644 index 2866dd820..000000000 --- a/osfmk/conf/files.ppc +++ /dev/null @@ -1,120 +0,0 @@ -# @OSF_COPYRIGHT@ -# - -OPTIONS/db_machine_commands optional db_machine_commands -OPTIONS/gprof optional gprof -OPTIONS/fpe optional fpe -OPTIONS/fddi optional fddi -OPTIONS/serial_console_default optional serial_console_default 
-OPTIONS/mp optional mp - -# lowmem_vectors.s must be at head of link line. -# template.mk treats this as a special case and makes sure -# that the file is placed at the front of the line - - -osfmk/ddb/db_aout.c optional mach_kdb -./ppc_disasm.c optional mach_kdb -osfmk/ppc/db_disasm.c optional mach_kdb -osfmk/ppc/db_interface.c optional mach_kdb -osfmk/ppc/db_trace.c optional mach_kdb -osfmk/ppc/db_low_trace.c optional mach_kdb -osfmk/ppc/bcopytest.c optional mach_kdb - -osfmk/ppc/lowmem_vectors.s standard -osfmk/ppc/start.s standard -osfmk/ppc/_setjmp.s standard -osfmk/ppc/mcount.s optional profile - -osfmk/ppc/cpu.c standard -osfmk/ppc/ppc_init.c standard -osfmk/ppc/ppc_vm_init.c standard -osfmk/ppc/model_dep.c standard -osfmk/ppc/locks_ppc.c standard -osfmk/ppc/pmap.c standard -osfmk/ppc/mappings.c standard -osfmk/ppc/savearea.c standard -osfmk/ppc/savearea_asm.s standard -osfmk/ppc/hw_vm.s standard -osfmk/ppc/skiplists.s standard -osfmk/ppc/hw_lock.s standard -osfmk/ppc/misc_asm.s standard -osfmk/ppc/status.c standard -osfmk/ppc/io_map.c standard -osfmk/ppc/trap.c standard -osfmk/ppc/pcb.c standard -osfmk/ppc/bits.s standard -osfmk/ppc/cswtch.s standard -osfmk/ppc/cache.s standard -osfmk/ppc/movc.s standard -osfmk/ppc/hw_exception.s standard -osfmk/ppc/bzero.s standard -osfmk/ppc/bcopy.s standard -osfmk/ppc/atomic_switch.s standard -osfmk/ppc/PseudoKernel.c standard -osfmk/ppc/interrupt.c standard -osfmk/ppc/machine_routines.c standard -osfmk/ppc/machine_routines_asm.s standard -osfmk/ppc/machine_task.c standard -osfmk/ppc/Emulate.s standard -osfmk/ppc/Emulate64.s standard -osfmk/ppc/AltiAssist.s standard -osfmk/ppc/conf.c standard -osfmk/ppc/etimer.c standard -osfmk/ppc/rtclock.c standard -osfmk/ppc/Diagnostics.c standard -osfmk/ppc/PPCcalls.c standard -osfmk/ppc/vmachmon.c standard -osfmk/ppc/vmachmon_asm.s standard -osfmk/ppc/pms.c standard -osfmk/ppc/pmsCPU.c standard - -osfmk/ppc/Firmware.s standard -osfmk/ppc/FirmwareC.c standard - 
-osfmk/ppc/aligned_data.s standard - -osfmk/ppc/hw_perfmon.c standard - -osfmk/ppc/commpage/commpage.c standard -osfmk/ppc/commpage/commpage_asm.s standard -osfmk/ppc/commpage/bcopy_g3.s standard -osfmk/ppc/commpage/bcopy_g4.s standard -osfmk/ppc/commpage/bcopy_970.s standard -osfmk/ppc/commpage/bcopy_64.s standard -osfmk/ppc/commpage/bzero_32.s standard -osfmk/ppc/commpage/bzero_128.s standard -osfmk/ppc/commpage/cacheflush.s standard -osfmk/ppc/commpage/gettimeofday.s standard -osfmk/ppc/commpage/mach_absolute_time.s standard -osfmk/ppc/commpage/pthread.s standard -osfmk/ppc/commpage/spinlocks.s standard -osfmk/ppc/commpage/bigcopy_970.s standard -osfmk/ppc/commpage/atomic.s standard -osfmk/ppc/commpage/memset_64.s standard -osfmk/ppc/commpage/memset_g3.s standard -osfmk/ppc/commpage/memset_g4.s standard -osfmk/ppc/commpage/memset_g5.s standard - -osfmk/chud/ppc/chud_cpu_asm.s standard -osfmk/chud/ppc/chud_cpu_ppc.c standard -osfmk/chud/ppc/chud_osfmk_callback_ppc.c standard -osfmk/chud/ppc/chud_thread_ppc.c standard - -osfmk/kdp/ml/ppc/kdp_machdep.c optional mach_kdp -osfmk/kdp/ml/ppc/kdp_vm.c optional mach_kdp -osfmk/kdp/ml/ppc/kdp_misc.s optional mach_kdp - -osfmk/console/ppc/serial_console.c optional scc device-driver -osfmk/ppc/serial_io.c optional scc device-driver - -osfmk/console/panic_dialog.c optional vc device-driver -osfmk/console/video_console.c optional vc device-driver -osfmk/console/ppc/video_scroll.s optional vc device-driver - -osfmk/ppc/hibernate_ppc.c optional hibernation -osfmk/ppc/hibernate_restore.s optional hibernation - -# DUMMIES TO FORCE GENERATION OF .h FILES -OPTIONS/bm optional bm -OPTIONS/debug optional debug diff --git a/osfmk/conf/files.x86_64 b/osfmk/conf/files.x86_64 index fbdaf097a..a147f68de 100644 --- a/osfmk/conf/files.x86_64 +++ b/osfmk/conf/files.x86_64 @@ -24,9 +24,14 @@ osfmk/vm/vm_apple_protect.c standard osfmk/x86_64/pmap.c standard osfmk/i386/pmap_x86_common.c standard +osfmk/i386/pmap_common.c standard 
+osfmk/x86_64/pmap_pcid.c standard +osfmk/i386/pal_routines.c optional pal_i386 +osfmk/x86_64/pal_routines_asm.s optional pal_i386 osfmk/i386/bsd_i386.c optional mach_bsd +osfmk/i386/bsd_i386_native.c optional mach_bsd osfmk/i386/machdep_call.c optional mach_bsd osfmk/x86_64/bcopy.s standard @@ -46,6 +51,7 @@ osfmk/i386/io_map.c standard osfmk/i386/ktss.c standard osfmk/i386/ldt.c standard osfmk/x86_64/loose_ends.c standard +osfmk/x86_64/copyio.c standard osfmk/i386/locks_i386.c standard osfmk/x86_64/locore.s standard osfmk/x86_64/start.s standard @@ -59,9 +65,12 @@ osfmk/x86_64/mcount.s optional profile osfmk/i386/mp_desc.c standard #osfmk/x86_64/ntoh.s standard osfmk/i386/pcb.c standard +osfmk/i386/pcb_native.c standard osfmk/i386/phys.c standard osfmk/i386/rtclock.c standard +osfmk/i386/rtclock_native.c standard osfmk/i386/trap.c standard +osfmk/i386/trap_native.c standard osfmk/i386/user_ldt.c standard osfmk/i386/Diagnostics.c standard osfmk/i386/pmCPU.c standard @@ -69,40 +78,20 @@ osfmk/i386/tsc.c standard osfmk/i386/commpage/commpage.c standard osfmk/i386/commpage/commpage_asm.s standard -osfmk/i386/commpage/atomic.s standard -osfmk/i386/commpage/cpu_number.s standard -osfmk/i386/commpage/commpage_mach_absolute_time.s standard -osfmk/i386/commpage/spinlocks.s standard osfmk/i386/commpage/pthreads.s standard -osfmk/i386/commpage/cacheflush.s standard -osfmk/i386/commpage/commpage_gettimeofday.s standard -osfmk/i386/commpage/bcopy_scalar.s standard -osfmk/i386/commpage/bcopy_sse2.s standard -osfmk/i386/commpage/bcopy_sse3x.s standard -osfmk/i386/commpage/bcopy_sse3x_64.s standard -osfmk/i386/commpage/bcopy_sse42.s standard -osfmk/i386/commpage/bcopy_sse42_64.s standard -osfmk/i386/commpage/bzero_scalar.s standard -osfmk/i386/commpage/bzero_sse2.s standard -osfmk/i386/commpage/bzero_sse2_64.s standard -osfmk/i386/commpage/bzero_sse42.s standard -osfmk/i386/commpage/bzero_sse42_64.s standard -osfmk/i386/commpage/memset_pattern_sse2.s standard 
-osfmk/i386/commpage/memset_pattern_sse2_64.s standard -osfmk/i386/commpage/longcopy_sse3x.s standard -osfmk/i386/commpage/longcopy_sse3x_64.s standard -osfmk/i386/commpage/commpage_sigs.c standard osfmk/i386/commpage/fifo_queues.s standard osfmk/i386/AT386/conf.c standard osfmk/i386/AT386/model_dep.c standard osfmk/i386/lapic.c standard +osfmk/i386/lapic_native.c standard osfmk/i386/mp.c standard +osfmk/i386/mp_native.c standard osfmk/i386/acpi.c standard -osfmk/i386/mtrr.c standard +osfmk/i386/mtrr.c optional config_mtrr osfmk/console/i386/serial_console.c optional com device-driver @@ -128,6 +117,8 @@ osfmk/chud/i386/chud_osfmk_callback_i386.c standard osfmk/chud/i386/chud_cpu_i386.c standard osfmk/chud/i386/chud_thread_i386.c standard +osfmk/i386/ucode.c standard + osfmk/i386/vmx/vmx_cpu.c optional config_vmx osfmk/i386/vmx/vmx_shims.c optional config_vmx @@ -139,5 +130,5 @@ osfmk/i386/vmx/vmx_shims.c optional config_vmx #osfmk/OPTIONS/hi_res_clock optional hi_res_clock -osfmk/i386/startup64.c optional x86_64 -osfmk/x86_64/idt64.s optional x86_64 +osfmk/i386/startup64.c standard +osfmk/x86_64/idt64.s standard diff --git a/osfmk/conf/tools/Makefile b/osfmk/conf/tools/Makefile deleted file mode 100644 index 4f9ccd553..000000000 --- a/osfmk/conf/tools/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -SETUP_SUBDIRS = doconf - -COMP_SUBDIRS = doconf - -INST_SUBDIRS = \ - - -setup_build_all: - @echo "[ $(SOURCE) ] make setup_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - -do_build_all: - @echo "[ $(SOURCE) ] make do_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - -setup_build_install: - @echo "[ $(SOURCE) ] make setup_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - -do_build_install: - @echo "[ 
$(SOURCE) ] make do_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - -include $(MakeInc_rule) -include $(MakeInc_dir) - - diff --git a/osfmk/conf/tools/doconf/Makefile b/osfmk/conf/tools/doconf/Makefile deleted file mode 100644 index aa55a9419..000000000 --- a/osfmk/conf/tools/doconf/Makefile +++ /dev/null @@ -1,47 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -COMP_SUBDIRS = \ - -INST_SUBDIRS = \ - - -# -# Who and where -# -BINDIR= -ifneq ($(MACHINE_CONFIG), DEFAULT) -DSTDIR= $(strip $(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)_$(MACHINE_CONFIG)/$(COMPONENT)/) -else -DSTDIR= $(strip $(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)/$(COMPONENT)/) -endif -PROGRAM= $(DSTDIR)doconf - -# -# How to install it -# -IFLAGS= -c -m 555 - -$(PROGRAM): $(DSTDIR)% : $(SOURCE)%.csh - @-$(RM) $(RMFLAGS) $(notdir $(PROGRAM)).VERS - @sed -e "s/#PROGRAM.*/#`vers_string $(notdir $(PROGRAM))`/" \ - < $< >$(notdir $(PROGRAM)).VERS; - @install $(IFLAGS) $(notdir $(PROGRAM)).VERS $(PROGRAM); - @-$(RM) $(RMFLAGS) $(notdir $(PROGRAM)).VERS; - -do_build_setup: $(PROGRAM) - -do_build_all: - -setup_build_install: - -do_build_install: - -include $(MakeInc_rule) -include $(MakeInc_dir) diff --git a/osfmk/conf/tools/doconf/doconf.csh b/osfmk/conf/tools/doconf/doconf.csh deleted file mode 100755 index 6fedb4786..000000000 --- a/osfmk/conf/tools/doconf/doconf.csh +++ /dev/null @@ -1,321 +0,0 @@ -#!/bin/csh -f -set path = ($path .) -###################################################################### -# HISTORY -# 1-Dec-87 Michael Young (mwyoung) at Carnegie-Mellon University -# Added "-verbose" switch, so this script produces no output -# in the normal case. 
-# -# 10-Oct-87 Mike Accetta (mja) at Carnegie-Mellon University -# Flushed cmu_*.h and spin_locks.h -# [ V5.1(XF18) ] -# -# 6-Apr-87 Avadis Tevanian (avie) at Carnegie-Mellon University -# Use MASTER.local and MASTER..local for generation of -# configuration files in addition to MASTER and MASTER.. -# -# 25-Mar-87 Mike Accetta (mja) at Carnegie-Mellon University -# Removed use of obsolete wb_*.h files when building the feature -# list; modified to save the previous configuration file and -# display the differences between it and the new file. -# [ V5.1(F8) ] -# -# 25-Mar-87 Avadis Tevanian (avie) at Carnegie-Mellon University -# If there is no /etc/machine just print out a message telling -# user to use the -cpu option. I thought this script was supposed -# to work even without a /etc/machine, but it doesn't... and this -# is the easiest way out. -# -# 13-Mar-87 Mike Accetta (mja) at Carnegie-Mellon University -# Added "romp_fpa.h" file to extra features for the RT. -# [ V5.1(F7) ] -# -# 11-Mar-87 Mike Accetta (mja) at Carnegie-Mellon University -# Updated to maintain the appropriate configuration features file -# in the "machine" directory whenever the corresponding -# configuration is generated. This replaces the old mechanism of -# storing this directly in the file since it was -# machine dependent and also precluded building programs for more -# than one configuration from the same set of sources. -# [ V5.1(F6) ] -# -# 21-Feb-87 Mike Accetta (mja) at Carnegie-Mellon University -# Fixed to require wired-in cpu type names for only those -# machines where the kernel name differs from that provided by -# /etc/machine (i.e. IBMRT => ca and SUN => sun3); updated to -# permit configuration descriptions in both machine indepedent -# and dependent master configuration files so that attributes can -# be grouped accordingly. 
-# [ V5.1(F3) ] -# -# 17-Jan-87 Mike Accetta (mja) at Carnegie-Mellon University -# Updated to work from any directory at the same level as -# "conf"; generate configuration from both MASTER and -# MASTER. files; added -cpu switch. -# [ V5.1(F1) ] -# -# 18-Aug-86 Mike Accetta (mja) at Carnegie-Mellon University -# Added -make switch and changed meaning of -config; upgraded to -# allow multiple attributes per configuration and to define -# configurations in terms of these attributes within MASTER. -# -# 14-Apr-83 Mike Accetta (mja) at Carnegie-Mellon University -# Added -config switch to only run /etc/config without -# "make depend" and "make". -# -###################################################################### - -set prog=$0 -set prog=$prog:t -set nonomatch -set OBJDIR=../BUILD -if ("`/usr/bin/uname`" == "Rhapsody" ) then -set CONFIG_DIR=/usr/local/bin -else -set CONFIG_DIR=/usr/bin -endif - -unset domake -unset doconfig -unset beverbose -unset MACHINE -unset profile - -while ($#argv >= 1) - if ("$argv[1]" =~ -*) then - switch ("$argv[1]") - case "-c": - case "-config": - set doconfig - breaksw - case "-m": - case "-make": - set domake - breaksw - case "-cpu": - if ($#argv < 2) then - echo "${prog}: missing argument to ${argv[1]}" - exit 1 - endif - set MACHINE="$argv[2]" - shift - breaksw - case "-d": - if ($#argv < 2) then - echo "${prog}: missing argument to ${argv[1]}" - exit 1 - endif - set OBJDIR="$argv[2]" - shift - breaksw - case "-verbose": - set beverbose - breaksw - case "-p": - case "-profile": - set profile - breaksw - default: - echo "${prog}: ${argv[1]}: unknown switch" - exit 1 - breaksw - endsw - shift - else - break - endif -end - -if ($#argv == 0) set argv=(GENERIC) - -if (! $?MACHINE) then - if (-d /NextApps) then - set MACHINE=`hostinfo | awk '/MC680x0/ { printf("m68k") } /MC880x0/ { printf("m88k") }'` - endif -endif - -if (! 
$?MACHINE) then - if (-f /etc/machine) then - set MACHINE="`/etc/machine`" - else - echo "${prog}: no /etc/machine, specify machine type with -cpu" - echo "${prog}: e.g. ${prog} -cpu VAX CONFIGURATION" - exit 1 - endif -endif - -set FEATURES_EXTRA= - -switch ("$MACHINE") - case IBMRT: - set cpu=ca - set ID=RT - set FEATURES_EXTRA="romp_dualcall.h romp_fpa.h" - breaksw - case SUN: - set cpu=sun3 - set ID=SUN3 - breaksw - default: - set cpu=`echo $MACHINE | tr A-Z a-z` - set ID=`echo $MACHINE | tr a-z A-Z` - breaksw -endsw -set FEATURES=../h/features.h -set FEATURES_H=(cs_*.h mach_*.h net_*.h\ - cputypes.h cpus.h vice.h\ - $FEATURES_EXTRA) -set MASTER_DIR=../conf -set MASTER = ${MASTER_DIR}/MASTER -set MASTER_CPU=${MASTER}.${cpu} - -set MASTER_LOCAL = ${MASTER}.local -set MASTER_CPU_LOCAL = ${MASTER_CPU}.local -if (! -f $MASTER_LOCAL) set MASTER_LOCAL = "" -if (! -f $MASTER_CPU_LOCAL) set MASTER_CPU_LOCAL = "" - -if (! -d $OBJDIR) then - if ($?beverbose) then - echo "[ creating $OBJDIR ]" - endif - mkdir -p $OBJDIR -endif - -foreach SYS ($argv) - set SYSID=${SYS}_${ID} - set SYSCONF=$OBJDIR/config.$SYSID - set BLDDIR=$OBJDIR - if ($?beverbose) then - echo "[ generating $SYSID from $MASTER_DIR/MASTER{,.$cpu}{,.local} ]" - endif - echo +$SYS \ - | \ - cat $MASTER $MASTER_LOCAL $MASTER_CPU $MASTER_CPU_LOCAL - \ - $MASTER $MASTER_LOCAL $MASTER_CPU $MASTER_CPU_LOCAL \ - | \ - sed -n \ - -e "/^+/{" \ - -e "s;[-+];#&;gp" \ - -e 't loop' \ - -e ': loop' \ - -e 'n' \ - -e '/^#/b loop' \ - -e '/^$/b loop' \ - -e 's;^\([^#]*\).*#[ ]*<\(.*\)>[ ]*$;\2#\1;' \ - -e 't not' \ - -e 's;\([^#]*\).*;#\1;' \ - -e 't not' \ - -e ': not' \ - -e 's;[ ]*$;;' \ - -e 's;^\!\(.*\);\1#\!;' \ - -e 'p' \ - -e 't loop' \ - -e 'b loop' \ - -e '}' \ - -e "/^[^#]/d" \ - -e 's; ; ;g' \ - -e "s;^# *\([^ ]*\)[ ]*=[ ]*\[\(.*\)\].*;\1#\2;p" \ - | \ - awk '-F#' '\ -part == 0 && $1 != "" {\ - m[$1]=m[$1] " " $2;\ - next;\ -}\ -part == 0 && $1 == "" {\ - for (i=NF;i>1;i--){\ - s=substr($i,2);\ - 
c[++na]=substr($i,1,1);\ - a[na]=s;\ - }\ - while (na > 0){\ - s=a[na];\ - d=c[na--];\ - if (m[s] == "") {\ - f[s]=d;\ - } else {\ - nx=split(m[s],x," ");\ - for (j=nx;j>0;j--) {\ - z=x[j];\ - a[++na]=z;\ - c[na]=d;\ - }\ - }\ - }\ - part=1;\ - next;\ -}\ -part != 0 {\ - if ($1 != "") {\ - n=split($1,x,",");\ - ok=0;\ - for (i=1;i<=n;i++) {\ - if (f[x[i]] == "+") {\ - ok=1;\ - }\ - }\ - if (NF > 2 && ok == 0 || NF <= 2 && ok != 0) {\ - print $2; \ - }\ - } else { \ - print $2; \ - }\ -}\ -' >$SYSCONF.new - if (-z $SYSCONF.new) then - echo "${prog}: ${$SYSID}: no such configuration in $MASTER_DIR/MASTER{,.$cpu}" - rm -f $SYSCONF.new - endif - if (! -d $BLDDIR) then - if ($?beverbose) then - echo "[ creating $BLDDIR ]" - endif - mkdir -p $BLDDIR - endif -# -# These paths are used by config. -# -# "builddir" is the name of the directory where kernel binaries -# are put. It is a single path element, never absolute, and is -# always relative to "objectdir". "builddir" is used by config -# solely to determine where to put files created by "config" (e.g. -# the created Makefile and *.h's.) -# -# "objectdir" is the name of the directory which will hold "builddir". -# It is a path; if relative, it is relative to the current directory -# where config is run. It's sole use is to be prepended to "builddir" -# to indicate where config-created files are to be placed (see above). -# -# "sourcedir" is the location of the sources used to build the kernel. -# It is a path; if relative, it is relative to the directory specified -# by the concatenation of "objectdir" and "builddir" (i.e. where the -# kernel binaries are put). 
-# - echo 'builddir "."' >> $SYSCONF.new - set OBJRELDIR=`$RELPATH $OBJROOT $OBJDIR` - echo 'objectdir "'$OBJROOT'/'$OBJRELDIR'"' >> $SYSCONF.new - set SRCDIR=`dirname $SOURCE` - echo 'sourcedir "'$SRCROOT'"' >> $SYSCONF.new - if (-f $SYSCONF) then - diff $SYSCONF $SYSCONF.new - rm -f $SYSCONF.old - mv $SYSCONF $SYSCONF.old - endif - rm -f $SYSCONF - mv $SYSCONF.new $SYSCONF - if ($?doconfig) then - if ($?beverbose) then - echo "[ configuring $SYSID ]" - endif - if ($?profile) then - $CONFIG_DIR/config -c $MASTER_DIR -p $SYSCONF - else - $CONFIG_DIR/config -c $MASTER_DIR $SYSCONF - endif - endif - if ($?domake) then - if ($?beverbose) then - echo "[ making $SYSID ]" - endif - (cd $BLDDIR; make) - endif -end diff --git a/osfmk/console/i386/serial_console.c b/osfmk/console/i386/serial_console.c index 234a022b8..2af1a9553 100644 --- a/osfmk/console/i386/serial_console.c +++ b/osfmk/console/i386/serial_console.c @@ -58,9 +58,6 @@ typedef struct console_buf { char buf[CPU_BUFFER_LEN]; } console_buf_t; -extern int serial_getc(void); -extern void serial_putc(int); - static void _serial_putc(int, int, int); struct console_ops cons_ops[] = { @@ -138,6 +135,13 @@ console_cpu_free(void *buf) kfree((void *) buf, sizeof(console_buf_t)); } +/* So we can re-write the serial device functions at boot-time */ +void +console_set_serial_ops( struct console_ops *newops ) +{ + cons_ops[SERIAL_CONS_OPS] = *newops; +} + static inline int console_ring_space(void) { diff --git a/osfmk/console/ppc/serial_console.c b/osfmk/console/ppc/serial_console.c deleted file mode 100644 index 648ea791e..000000000 --- a/osfmk/console/ppc/serial_console.c +++ /dev/null @@ -1,329 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). 
You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * @APPLE_FREE_COPYRIGHT@ - */ - -#include -#include -#include - -#include -#include /* spl definitions */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * A machine MUST have a console. In our case - * things are a little complicated by the graphic - * display: people expect it to be their "console", - * but we'd like to be able to live without it. - * This is not to be confused with the "rconsole" thing: - * that just duplicates the console I/O to - * another place (for debugging/logging purposes). 
- */ - -const int console_unit = 0; -const uint32_t console_chan_default = CONSOLE_PORT; -#define console_chan (console_chan_default) /* ^ cpu_number()) */ - -#define MP_SAFE_CONSOLE 1 /* Set this to 1 to allow more than 1 processor to print at once */ -#if MP_SAFE_CONSOLE -struct ppcbfr { /* Controls multiple processor output */ - unsigned int pos; /* Current position in buffer */ - unsigned int noprompt; /* Set if we skip the prompt */ - unsigned int echo; /* Control character echoing */ - char buffer[256]; /* Fairly big buffer */ -}; -typedef struct ppcbfr ppcbfr_t; - -ppcbfr_t cbfr_boot_cpu; /* Get one for boot cpu */ -volatile unsigned int cbfpend; /* A buffer is pending output */ -volatile unsigned int sconowner=-1; /* Mark who's actually writing */ -#endif /* MP_SAFE_CONSOLE */ - -struct console_ops cons_ops[] = { - { - .putc = scc_putc, - .getc = scc_getc, - }, - { - .putc = vcputc, - .getc = vcgetc, - }, -}; - -uint32_t nconsops = (sizeof cons_ops / sizeof cons_ops[0]); - -uint32_t cons_ops_index = VC_CONS_OPS; - -unsigned int killprint = 0; -unsigned int debcnputc = 0; -extern unsigned int mappingdeb0; -extern int debugger_cpu; - -void *console_per_proc_alloc(boolean_t boot_processor) -{ - ppcbfr_t *cbfr_cpu; - - if (boot_processor) - cbfr_cpu = &cbfr_boot_cpu; - else { - cbfr_cpu = (ppcbfr_t *)kalloc(sizeof(ppcbfr_t)); - if (cbfr_cpu == (ppcbfr_t *)NULL) - return (void *)NULL; - } - bzero((char *)cbfr_cpu, sizeof(ppcbfr_t)); - return (void *)cbfr_cpu; -} - -void console_per_proc_free(void *per_proc_cbfr) -{ - if (per_proc_cbfr == (void *)&cbfr_boot_cpu) - return; - else - kfree(per_proc_cbfr, sizeof(ppcbfr_t)); -} - -static void _cnputc(char c) -{ - cons_ops[cons_ops_index].putc(console_unit, console_chan, c); -} - -void cnputc_unbuffered(char c) { - _cnputc(c); -} - -void cnputcusr(char c) { /* Echo input character directly */ - struct per_proc_info *procinfo; - spl_t s; - - s=splhigh(); - procinfo = getPerProc(); - - 
(void)hw_atomic_add(&(procinfo->debugger_holdoff), 1); /* Don't allow debugger entry just now (this is a HACK) */ - - _cnputc( c); /* Echo the character */ - if(c=='\n') _cnputc( '\r'); /* Add a return if we had a new line */ - - (void)hw_atomic_sub(&(procinfo->debugger_holdoff), 1); /* Don't allow debugger entry just now (this is a HACK) */ - splx(s); - return; -} - -void -cnputc(char c) -{ - unsigned int oldpend, i, cpu, ourbit, sccpu; - struct per_proc_info *procinfo; - ppcbfr_t *cbfr, *cbfr_cpu; - spl_t s; - -#if MP_SAFE_CONSOLE - -/* - * Handle multiple CPU console output. - * Note: this thing has gotten god-awful complicated. We need a better way. - */ - - - if(killprint) { - return; /* If printing is disabled, bail... */ - } - - s=splhigh(); /* Don't bother me */ - procinfo = getPerProc(); - cpu = procinfo->cpu_number; - cbfr = procinfo->pp_cbfr; - - (void)hw_atomic_add(&(procinfo->debugger_holdoff), 1); /* Don't allow debugger entry just now (this is a HACK) */ - - ourbit = 1 << cpu; /* Make a mask for just us */ - if(debugger_cpu != -1) { /* Are we in the debugger with empty buffers? */ - - while(sconowner != cpu) { /* Anyone but us? */ - hw_compare_and_store(-1, cpu, &sconowner); /* Try to mark it for us if idle */ - } - - _cnputc( c); /* Yeah, just write it */ - if(c=='\n') /* Did we just write a new line? */ - _cnputc( '\r'); /* Yeah, just add a return */ - - sconowner=-1; /* Mark it idle */ - (void)hw_atomic_sub(&(procinfo->debugger_holdoff), 1); /* Don't allow debugger entry just now (this is a HACK) */ - - splx(s); - return; /* Leave... */ - } - - - while(ourbit&cbfpend); /* We aren't "double buffered," so we'll just wait until the buffers are written */ - isync(); /* Just in case we had to wait */ - - if(c) { /* If the character is not null */ - cbfr->buffer[cbfr->pos]=c; /* Fill in the buffer for our CPU */ - cbfr->pos++; /* Up the count */ - if(cbfr->pos > 253) { /* Is the buffer full? 
*/ - cbfr->buffer[254]='\n'; /* Yeah, set the second to last as a LF */ - cbfr->buffer[255]='\r'; /* And the last to a CR */ - cbfr->pos=256; /* Push the buffer to the end */ - c='\r'; /* Set character to a CR */ - } - } - - if(c == '\n') { /* Are we finishing a line? */ - cbfr->buffer[cbfr->pos]='\r'; /* And the last to a CR */ - cbfr->pos++; /* Up the count */ - c='\r'; /* Set character to a CR */ - } - -#if 1 - if(cbfr->echo == 1) { /* Did we hit an escape last time? */ - if(c == 'K') { /* Is it a partial clear? */ - cbfr->echo = 2; /* Yes, enter echo mode */ - } - else cbfr->echo = 0; /* Otherwise reset escape */ - } - else if(cbfr->echo == 0) { /* Not in escape sequence, see if we should enter */ - cbfr->echo = 1; /* Set that we are in escape sequence */ - } -#endif - - if((c == 0x00) || (c == '\r') || (cbfr->echo == 2)) { /* Try to push out all buffers if we see CR or null */ - - while(1) { /* Loop until we see who's doing this */ - oldpend=cbfpend; /* Get the currentest pending buffer flags */ - if(hw_compare_and_store(oldpend, oldpend|ourbit, &cbfpend)) /* Swap ours on if no change */ - break; /* Bail the loop if it worked */ - } - - if(!hw_compare_and_store(-1, cpu, &sconowner)) { /* See if someone else has this, and take it if not */ - procinfo->debugger_holdoff = 0; /* Allow debugger entry (this is a HACK) */ - splx(s); /* Let's take some 'rupts now */ - return; /* We leave here, 'cause another processor is already writing the buffers */ - } - - while(1) { /* Loop to dump out all of the finished buffers */ - oldpend=cbfpend; /* Get the most current finished buffers */ - for(sccpu=0; sccpupp_cbfr == 0)) - continue; - - cbfr_cpu = PerProcTable[sccpu].ppe_vaddr->pp_cbfr; - - if(oldpend&(1<noprompt) { /* Don't prompt if there was not CR before */ - _cnputc( '{'); /* Mark CPU number */ - _cnputc( '0'+sccpu); /* Mark CPU number */ - _cnputc( '.'); /* (TEST/DEBUG) */ - _cnputc( '0'+cpu); /* (TEST/DEBUG) */ - _cnputc( '}'); /* Mark CPU number */ - _cnputc( ' '); 
/* Mark CPU number */ - } -#endif - - for(i=0; ipos; i++) { /* Do the whole buffer */ - _cnputc(cbfr_cpu->buffer[i]); /* Write it */ - } - - if(cbfr_cpu->buffer[cbfr_cpu->pos-1]!='\r') { /* Was the last character a return? */ - cbfr_cpu->noprompt = 1; /* Remember not to prompt */ - } - else { /* Last was a return */ - cbfr_cpu->noprompt = 0; /* Otherwise remember to prompt */ - cbfr_cpu->echo = 0; /* And clear echo */ - } - - cbfr_cpu->pos=0; /* Reset the buffer pointer */ - - while(!hw_compare_and_store(cbfpend, cbfpend&~(1<debugger_holdoff), 1); /* Don't allow debugger entry just now (this is a HACK) */ - splx(s); /* Let's take some 'rupts now */ - -#else /* MP_SAFE_CONSOLE */ - _cnputc( c); - if (c == '\n') - _cnputc('\r'); -#endif /* MP_SAFE_CONSOLE */ - -} - -int -cngetc(void) -{ - return cons_ops[cons_ops_index].getc(console_unit, console_chan, - TRUE, FALSE); -} - -int -cnmaygetc(void) -{ - return cons_ops[cons_ops_index].getc(console_unit, console_chan, - FALSE, FALSE); -} - - -int -vcgetc(__unused int l, - __unused int u, - __unused boolean_t wait, - __unused boolean_t raw) -{ - char c; - - if( 0 == (*PE_poll_input)( 0, &c)) - return( c); - else - return( 0); -} diff --git a/osfmk/console/ppc/video_scroll.s b/osfmk/console/ppc/video_scroll.s deleted file mode 100644 index 77e4ffcfc..000000000 --- a/osfmk/console/ppc/video_scroll.s +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_FREE_COPYRIGHT@ - * - */ - -/* Routines to perform high-speed scrolling, assuming that the memory is - * non-cached, and that the amount of memory to be scrolled is a multiple - * of (at least) 16. 
- */ - -#include -#include - -/* - * void video_scroll_up(unsigned long start, - * unsigned long end, - * unsigned long dest) - */ - -ENTRY(video_scroll_up, TAG_NO_FRAME_USED) - - mfmsr r0 /* Get the MSR */ - mflr r6 /* Get the LR */ - ori r7,r0,1<<(31-MSR_FP_BIT) /* Turn on floating point */ - stwu r1,-(FM_SIZE+16)(r1) /* Get space for a couple of registers on stack */ - rlwinm r7,r7,0,MSR_EE_BIT+1,MSR_EE_BIT-1 /* Turn off interrupts */ - stw r6,(FM_SIZE+16+FM_LR_SAVE)(r1) /* Save the return */ - - mtmsr r7 /* Turn on FPU */ - isync /* Wait for it */ - -vsufpuon1: stfd f0,(FM_SIZE+0)(r1) /* Save one register */ - stfd f1,(FM_SIZE+8)(r1) /* and the second */ - -/* ok, now we can use the FPU registers to do some fast copying - */ - -.L_vscr_up_loop: - lfd f0, 0(r3) - lfd f1, 8(r3) - - addi r3, r3, 16 - - stfd f0, 0(r5) - - cmpl cr0, r3, r4 - - stfd f1, 8(r5) - - addi r5, r5, 16 - - blt+ cr0, .L_vscr_up_loop - - lfd f0,(FM_SIZE+0)(r1) /* Load back one register */ - lfd f1,(FM_SIZE+8)(r1) /* and the second */ - lwz r1,0(r1) /* Pop the stack */ - - mtmsr r0 /* Turn off FPU again */ - isync /* Wait for it */ - blr /* Go away, don't bother me... */ - - -/* - * void video_scroll_down(unsigned long start, HIGH address to scroll from - * unsigned long end, LOW address - * unsigned long dest) HIGH address - */ - -ENTRY(video_scroll_down, TAG_NO_FRAME_USED) - - /* Save off the link register, we want to call fpu_save. 
- */ - - - mfmsr r0 /* Get the MSR */ - mflr r6 /* Get the LR */ - ori r7,r0,1<<(31-MSR_FP_BIT) /* Turn on floating point */ - stwu r1,-(FM_SIZE+16)(r1) /* Get space for a couple of registers on stack */ - rlwinm r7,r7,0,MSR_EE_BIT+1,MSR_EE_BIT-1 /* Turn off interrupts */ - stw r6,(FM_SIZE+16+FM_LR_SAVE)(r1) /* Save the return */ - - mtmsr r7 /* Turn on FPU */ - isync /* Wait for it */ - -vsdfpuon1: stfd f0,(FM_SIZE+0)(r1) /* Save one register */ - stfd f1,(FM_SIZE+8)(r1) /* and the second */ - -/* ok, now we can use the FPU registers to do some fast copying */ - -.L_vscr_down_loop: - lfd f0, -16(r3) - lfd f1, -8(r3) - - subi r3, r3, 16 - - stfd f0, -16(r5) - - cmpl cr0, r3, r4 - - stfd f1, -8(r5) - - subi r5, r5, 16 - - bgt+ cr0, .L_vscr_down_loop - - - lfd f0,(FM_SIZE+0)(r1) /* Load back one register */ - lfd f1,(FM_SIZE+8)(r1) /* and the second */ - lwz r1,0(r1) /* Pop the stack */ - - mtmsr r0 /* Turn off FPU again */ - isync /* Wait for it */ - blr /* Go away, don't bother me... */ - diff --git a/osfmk/console/serial_general.c b/osfmk/console/serial_general.c index a2d78e79a..d51e98dab 100644 --- a/osfmk/console/serial_general.c +++ b/osfmk/console/serial_general.c @@ -80,7 +80,6 @@ serial_keyboard_poll(void) int chr; uint64_t next; - while(1) { chr = _serial_getc(0, 1, 0, 1); /* Get a character if there is one */ if(chr < 0) /* The serial buffer is empty */ diff --git a/osfmk/console/serial_protos.h b/osfmk/console/serial_protos.h index 90b691f1d..99da75451 100644 --- a/osfmk/console/serial_protos.h +++ b/osfmk/console/serial_protos.h @@ -46,16 +46,17 @@ extern unsigned int disable_serial_output; int _serial_getc(int unit, int line, boolean_t wait, boolean_t raw); -boolean_t console_is_serial(void); -int switch_to_serial_console(void); -int switch_to_video_console(void); -void switch_to_old_console(int old_console); - struct console_ops { void (*putc)(int, int, int); int (*getc)(int, int, boolean_t, boolean_t); }; +boolean_t console_is_serial(void); +int 
switch_to_serial_console(void); +int switch_to_video_console(void); +void switch_to_old_console(int old_console); +void console_set_serial_ops( struct console_ops *newops ); + #define SERIAL_CONS_OPS 0 #define VC_CONS_OPS 1 diff --git a/osfmk/console/video_console.c b/osfmk/console/video_console.c index 4b088aa41..9c5460016 100644 --- a/osfmk/console/video_console.c +++ b/osfmk/console/video_console.c @@ -187,7 +187,7 @@ MACRO_END #define VCPUTC_LOCK_LOCK() \ MACRO_BEGIN \ - if (!hw_lock_to(&vcputc_lock, LockTimeOut*10)) \ + if (!hw_lock_to(&vcputc_lock, hwLockTimeOut*10))\ { \ panic("VCPUTC_LOCK_LOCK"); \ } \ @@ -1274,7 +1274,7 @@ gc_update_color(int color, boolean_t fore) void vcputc(__unused int l, __unused int u, int c) { - if ( gc_enabled || debug_mode ) + if ( gc_initialized && ( gc_enabled || debug_mode ) ) { spl_t s; @@ -2444,7 +2444,7 @@ initialize_screen(PE_Video * boot_vinfo, unsigned int op) #if defined(__x86_64__) // Adjust the video buffer pointer to point to where it is in high virtual (above the hole) - new_vinfo.v_baseaddr |= VM_MIN_KERNEL_ADDRESS; + new_vinfo.v_baseaddr |= (VM_MIN_KERNEL_ADDRESS & ~LOW_4GB_MASK); #endif /* Update the vinfo structure atomically with respect to the vc_progress task if running */ @@ -2632,6 +2632,10 @@ vcattach(void) for ( index = 0 ; index < msgbufp->msg_bufx ; index++ ) { + if (msgbufp->msg_bufc[index] == '\0') { + continue; + } + vcputc( 0, 0, msgbufp->msg_bufc[index] ); if ( msgbufp->msg_bufc[index] == '\n' ) diff --git a/osfmk/ddb/db_command.c b/osfmk/ddb/db_command.c index 13815b525..7e21b12b4 100644 --- a/osfmk/ddb/db_command.c +++ b/osfmk/ddb/db_command.c @@ -64,9 +64,6 @@ * Command dispatcher. 
*/ #include -#ifdef AT386 -#include -#endif /* AT386 */ #include #include @@ -86,9 +83,6 @@ #include #include #include -#if defined(__ppc__) -#include -#endif #include #include #include @@ -741,7 +735,6 @@ struct db_command db_command_table[] = { .name = "reboot", (db_func)db_reboot, }, -#if !defined(__ppc__) { .name = "ms", .fcn = db_msr, @@ -757,69 +750,6 @@ struct db_command db_command_table[] = { .fcn = db_apic, .flag = CS_MORE, }, -#endif /* !__ppc__ */ -#if defined(__ppc__) - { - .name = "lt", - .fcn = db_low_trace, - .flag = CS_MORE|CS_SET_DOT, - }, - { - .name = "dl", - .fcn = db_display_long, - .flag = CS_MORE|CS_SET_DOT, - }, - { - .name = "dc", - .fcn = db_display_char, - .flag = CS_MORE|CS_SET_DOT, - }, - { - .name = "dv", - .fcn = db_display_virtual, - .flag = CS_MORE|CS_SET_DOT, - }, - { - .name = "dm", - .fcn = db_display_mappings, - .flag = CS_MORE|CS_SET_DOT, - }, - { - .name = "dh", - .fcn = db_display_hash, - .flag = CS_MORE|CS_SET_DOT, - }, - { - .name = "dp", - .fcn = db_display_pmap, - .flag = CS_MORE, - }, - { - .name = "ds", - .fcn = db_display_save, - .flag = CS_MORE|CS_SET_DOT, - }, - { - .name = "dx", - .fcn = db_display_xregs, - .flag = CS_MORE|CS_SET_DOT, - }, - { - .name = "gs", - .fcn = db_gsnoop, - .flag = CS_MORE, - }, - { - .name = "cm", - .fcn = db_check_mappings, - .flag = CS_MORE, - }, - { - .name = "cp", - .fcn = db_check_pmaps, - .flag = CS_MORE, - }, -#endif /* __ppc__ */ { .name = (const char *)NULL, }, diff --git a/osfmk/ddb/db_print.c b/osfmk/ddb/db_print.c index 1a6fac39b..d773823d0 100644 --- a/osfmk/ddb/db_print.c +++ b/osfmk/ddb/db_print.c @@ -278,7 +278,7 @@ db_print_act( db_printf("%s ID: ACT STAT SW STACK SHUTTLE", indent); db_printf(" SUS PRI WAIT_FUNC\n"); } - policy = ((athread && (athread->sched_mode&TH_MODE_TIMESHARE))? 1: 2); + policy = ((athread && (athread->sched_mode == TH_MODE_TIMESHARE))? 1: 2); db_printf("%s%3d%c %0*X %s %s %0*X %0*X %3d %3d/%s ", indent, act_id, (thr_act == current_thread())? 
'#': ':', diff --git a/osfmk/ddb/db_sym.c b/osfmk/ddb/db_sym.c index b99f9674f..1e118054e 100644 --- a/osfmk/ddb/db_sym.c +++ b/osfmk/ddb/db_sym.c @@ -185,7 +185,7 @@ db_add_symbol_table( st->map_pointer = 0; else st->map_pointer = map_pointer; - strcpy(st->name, name); + strlcpy(st->name, name, sizeof (st->name)); st->minsym = minsym; st->maxsym = maxsym; if (maxsym == 0) @@ -1329,7 +1329,7 @@ db_clone_symtabXXX( } *st = *st_src; /* bulk copy src -> dest */ - strcpy(st->name, cloner); /* new name */ + strlcpy(st->name, cloner, sizeof (st->name)); /* new name */ st->private = memp; /* copy symbols */ bcopy((const char *)st_src->private, st->private, size); st->start = memp + sizeof(int); /* fixup pointers to symtab */ diff --git a/osfmk/ddb/db_trap.c b/osfmk/ddb/db_trap.c index 41acbd325..759649b82 100644 --- a/osfmk/ddb/db_trap.c +++ b/osfmk/ddb/db_trap.c @@ -132,11 +132,7 @@ db_task_trap(__unused int type, __unused int code, boolean_t user_space) db_printf("\n\t"); db_print_inst(db_dot, task_space); #else /* !defined(__alpha) */ -#if defined(__ppc__) - db_print_loc_and_inst(db_dot, task_space); -#else /* __ppc__ */ db_print_loc_and_inst(db_dot, task); -#endif /* __ppc__ */ #endif /* defined(__alpha) */ } else db_printf("Trouble printing location %#llX.\n", (unsigned long long)db_dot); diff --git a/osfmk/ddb/db_variables.c b/osfmk/ddb/db_variables.c index 0fe14d1e6..f30e5cad4 100644 --- a/osfmk/ddb/db_variables.c +++ b/osfmk/ddb/db_variables.c @@ -197,7 +197,7 @@ db_cmp_variable_name(struct db_variable *vp, const char *name, || (level > 0 && (ap->suffix[0] < vp->low || (vp->high >= 0 && ap->suffix[0] > vp->high)))) return(FALSE); - strcpy(ap->modif, (*np)? np+1: ""); + strlcpy(ap->modif, (*np)? np+1: "", TOK_STRING_SIZE); ap->thr_act = (db_option(ap->modif, 't')?db_default_act: THREAD_NULL); ap->level = level; ap->hidden_level = -1; @@ -451,7 +451,7 @@ db_show_one_variable(void) return; } - strcpy(aux_param.modif, *p ? p + 1 : ""); + strlcpy(aux_param.modif, *p ? 
p + 1 : "", TOK_STRING_SIZE); aux_param.thr_act = (db_option(aux_param.modif, 't') ? db_default_act : THREAD_NULL); } diff --git a/osfmk/ddb/db_variables.h b/osfmk/ddb/db_variables.h index 8d0b1c817..3ff52cf16 100644 --- a/osfmk/ddb/db_variables.h +++ b/osfmk/ddb/db_variables.h @@ -171,7 +171,7 @@ * auxiliary parameters passed to a variable handler */ struct db_var_aux_param { - char *modif; /* option strings */ + char *modif; /* option strings, must be TOK_STRING_SIZE */ short level; /* number of levels */ short hidden_level; /* hidden level */ short suffix[DB_VAR_LEVEL]; /* suffix */ diff --git a/osfmk/ddb/makedis.c b/osfmk/ddb/makedis.c index 59afa5290..a33bf216e 100644 --- a/osfmk/ddb/makedis.c +++ b/osfmk/ddb/makedis.c @@ -2371,9 +2371,10 @@ void *xmalloc(size_t size) { void *xstrdup(char *s) { char *p; + size_t i = strlen(s) + 1; - p = xmalloc(strlen(s) + 1); - strcpy(p, s); + p = xmalloc(i); + strlcpy(p, s, i); return p; } diff --git a/osfmk/default_pager/default_pager.c b/osfmk/default_pager/default_pager.c index 20f3e361a..5b2ee7b41 100644 --- a/osfmk/default_pager/default_pager.c +++ b/osfmk/default_pager/default_pager.c @@ -396,6 +396,7 @@ default_pager_initialize(void) vstruct_zone = zinit(sizeof(struct vstruct), 10000 * sizeof(struct vstruct), 8192, "vstruct zone"); + zone_change(vstruct_zone, Z_CALLERACCT, FALSE); zone_change(vstruct_zone, Z_NOENCRYPT, TRUE); VSL_LOCK_INIT(); diff --git a/osfmk/default_pager/default_pager_internal.h b/osfmk/default_pager/default_pager_internal.h index d1c4883e1..904643741 100644 --- a/osfmk/default_pager/default_pager_internal.h +++ b/osfmk/default_pager/default_pager_internal.h @@ -87,12 +87,14 @@ #define MACH_PORT_FACE mach_port_t -#if 0 -#ifndef USE_PRECIOUS -#define USE_PRECIOUS TRUE -#endif +#if CONFIG_FREEZE +#define RECLAIM_SWAP 1 +#else +#define RECLAIM_SWAP 0 #endif +#define USE_PRECIOUS 0 + #ifdef USER_PAGER #define UP(stuff) stuff #else /* USER_PAGER */ @@ -737,6 +739,9 @@ extern boolean_t 
bs_add_device(char *, MACH_PORT_FACE); extern vstruct_t ps_vstruct_create(dp_size_t); extern void ps_vstruct_dealloc(vstruct_t); +extern void ps_vstruct_reclaim(vstruct_t, + boolean_t, + boolean_t); extern kern_return_t pvs_cluster_read(vstruct_t, dp_offset_t, dp_size_t, diff --git a/osfmk/default_pager/dp_backing_store.c b/osfmk/default_pager/dp_backing_store.c index ceda0a902..9fcf6a2bd 100644 --- a/osfmk/default_pager/dp_backing_store.c +++ b/osfmk/default_pager/dp_backing_store.c @@ -163,11 +163,15 @@ unsigned int maximum_pages_free = 0; ipc_port_t min_pages_trigger_port = NULL; ipc_port_t max_pages_trigger_port = NULL; +#if CONFIG_FREEZE +boolean_t use_emergency_swap_file_first = TRUE; +#else boolean_t use_emergency_swap_file_first = FALSE; +#endif boolean_t bs_low = FALSE; int backing_store_release_trigger_disable = 0; boolean_t backing_store_stop_compaction = FALSE; - +boolean_t backing_store_abort_compaction = FALSE; /* Have we decided if swap needs to be encrypted yet ? */ boolean_t dp_encryption_inited = FALSE; @@ -176,7 +180,6 @@ boolean_t dp_encryption = FALSE; boolean_t dp_isssd = FALSE; - /* * Object sizes are rounded up to the next power of 2, * unless they are bigger than a given maximum size. 
@@ -205,6 +208,15 @@ unsigned int dp_pages_free = 0; unsigned int dp_pages_reserve = 0; unsigned int cluster_transfer_minimum = 100; +/* + * Trim state + */ +struct ps_vnode_trim_data { + struct vnode *vp; + dp_offset_t offset; + dp_size_t length; +}; + /* forward declarations */ kern_return_t ps_write_file(paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, int); /* forward */ kern_return_t ps_read_file (paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, unsigned int *, int); /* forward */ @@ -227,6 +239,10 @@ vs_map_t vs_get_map_entry( kern_return_t default_pager_backing_store_delete_internal( MACH_PORT_FACE ); +static inline void ps_vnode_trim_init(struct ps_vnode_trim_data *data); +static inline void ps_vnode_trim_now(struct ps_vnode_trim_data *data); +static inline void ps_vnode_trim_more(struct ps_vnode_trim_data *data, struct vs_map *map, unsigned int shift, dp_size_t length); + default_pager_thread_t * get_read_buffer( void ) { @@ -441,7 +457,7 @@ backing_store_lookup( if ((port == MACH_PORT_NULL) || port_is_vs(port)) */ - if ((port == MACH_PORT_NULL)) + if (port == MACH_PORT_NULL) return BACKING_STORE_NULL; BSL_LOCK(); @@ -714,6 +730,10 @@ ps_delete( if ((vs_count != 0) && (vs != NULL)) vs->vs_async_pending += 1; /* hold parties calling */ /* vs_async_wait */ + + if (bs_low == FALSE) + backing_store_abort_compaction = FALSE; + VS_UNLOCK(vs); VSL_UNLOCK(); while((vs_count != 0) && (vs != NULL)) { @@ -736,13 +756,19 @@ ps_delete( vm_object_t transfer_object; unsigned int count; upl_t upl; + int upl_flags; transfer_object = vm_object_allocate((vm_object_size_t)VM_SUPER_CLUSTER); count = 0; + upl_flags = (UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | + UPL_SET_LITE | UPL_SET_INTERNAL); + if (dp_encryption) { + /* mark the pages as "encrypted" when they come in */ + upl_flags |= UPL_ENCRYPT; + } error = vm_object_upl_request(transfer_object, (vm_object_offset_t)0, VM_SUPER_CLUSTER, - &upl, NULL, &count, - UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | 
UPL_SET_LITE | UPL_SET_INTERNAL); + &upl, NULL, &count, upl_flags); if(error == KERN_SUCCESS) { error = ps_vstruct_transfer_from_segment( @@ -754,7 +780,7 @@ ps_delete( } vm_object_deallocate(transfer_object); } - if(error || current_thread_aborted() || backing_store_stop_compaction) { + if(error || current_thread_aborted()) { VS_LOCK(vs); vs->vs_async_pending -= 1; /* release vs_async_wait */ if (vs->vs_async_pending == 0 && vs->vs_waiting_async) { @@ -1408,6 +1434,7 @@ ps_select_segment( trigger = min_pages_trigger_port; min_pages_trigger_port = NULL; bs_low = TRUE; + backing_store_abort_compaction = TRUE; } lps = ps; } @@ -1428,6 +1455,8 @@ ps_select_segment( PSL_UNLOCK(); if (trigger != IP_NULL) { + dprintf(("ps_select_segment - send HI_WAT_ALERT\n")); + default_pager_space_alert(trigger, HI_WAT_ALERT); ipc_port_release_send(trigger); } @@ -1497,6 +1526,8 @@ ps_select_segment( minimum_pages_remaining)) { trigger = min_pages_trigger_port; min_pages_trigger_port = NULL; + bs_low = TRUE; + backing_store_abort_compaction = TRUE; } PS_UNLOCK(ps); /* @@ -1506,6 +1537,8 @@ ps_select_segment( PSL_UNLOCK(); if (trigger != IP_NULL) { + dprintf(("ps_select_segment - send HI_WAT_ALERT\n")); + default_pager_space_alert( trigger, HI_WAT_ALERT); @@ -1592,10 +1625,14 @@ ps_allocate_cluster( (dp_pages_free < minimum_pages_remaining)) { trigger = min_pages_trigger_port; min_pages_trigger_port = NULL; + bs_low = TRUE; + backing_store_abort_compaction = TRUE; } PSL_UNLOCK(); PS_UNLOCK(ps); if (trigger != IP_NULL) { + dprintf(("ps_allocate_cluster - send HI_WAT_ALERT\n")); + default_pager_space_alert(trigger, HI_WAT_ALERT); ipc_port_release_send(trigger); } @@ -1688,9 +1725,12 @@ ps_allocate_cluster( trigger = min_pages_trigger_port; min_pages_trigger_port = NULL; bs_low = TRUE; + backing_store_abort_compaction = TRUE; } PSL_UNLOCK(); if (trigger != IP_NULL) { + dprintf(("ps_allocate_cluster - send HI_WAT_ALERT\n")); + default_pager_space_alert(trigger, HI_WAT_ALERT); 
ipc_port_release_send(trigger); } @@ -1780,10 +1820,23 @@ ps_dealloc_vsmap( dp_size_t size) { unsigned int i; - for (i = 0; i < size; i++) - if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i])) + struct ps_vnode_trim_data trim_data; + + ps_vnode_trim_init(&trim_data); + + for (i = 0; i < size; i++) { + if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i])) { + ps_vnode_trim_more(&trim_data, + &vsmap[i], + VSM_PS(vsmap[i])->ps_clshift, + vm_page_size << VSM_PS(vsmap[i])->ps_clshift); ps_deallocate_cluster(VSM_PS(vsmap[i]), VSM_CLOFF(vsmap[i])); + } else { + ps_vnode_trim_now(&trim_data); + } + } + ps_vnode_trim_now(&trim_data); } void @@ -1826,6 +1879,134 @@ ps_vstruct_dealloc( zfree(vstruct_zone, vs); } +void +ps_vstruct_reclaim( + vstruct_t vs, + boolean_t return_to_vm, + boolean_t reclaim_backing_store) +{ + unsigned int i, j; +// spl_t s; + unsigned int request_flags; + struct vs_map *vsmap; + boolean_t vsmap_all_clear, vsimap_all_clear; + struct vm_object_fault_info fault_info; + int clmap_off; + unsigned int vsmap_size; + kern_return_t kr; + + request_flags = UPL_NO_SYNC | UPL_RET_ONLY_ABSENT | UPL_SET_LITE; + if (reclaim_backing_store) { +#if USE_PRECIOUS + request_flags |= UPL_PRECIOUS | UPL_CLEAN_IN_PLACE; +#else /* USE_PRECIOUS */ + request_flags |= UPL_REQUEST_SET_DIRTY; +#endif /* USE_PRECIOUS */ + } + + VS_MAP_LOCK(vs); + + fault_info.cluster_size = VM_SUPER_CLUSTER; + fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL; + fault_info.user_tag = 0; + fault_info.lo_offset = 0; + fault_info.hi_offset = ptoa_32(vs->vs_size << vs->vs_clshift); + fault_info.io_sync = reclaim_backing_store; + + /* + * If this is an indirect structure, then we walk through the valid + * (non-zero) indirect pointers and deallocate the clusters + * associated with each used map entry (via ps_dealloc_vsmap). + * When all of the clusters in an indirect block have been + * freed, we deallocate the block. 
When all of the indirect + * blocks have been deallocated we deallocate the memory + * holding the indirect pointers. + */ + if (vs->vs_indirect) { + vsimap_all_clear = TRUE; + for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) { + vsmap = vs->vs_imap[i]; + if (vsmap == NULL) + continue; + /* loop on clusters in this indirect map */ + clmap_off = (vm_page_size * CLMAP_ENTRIES * + VSCLSIZE(vs) * i); + if (i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size)) + vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i); + else + vsmap_size = CLMAP_ENTRIES; + vsmap_all_clear = TRUE; + if (return_to_vm) { + for (j = 0; j < vsmap_size;) { + if (VSM_ISCLR(vsmap[j]) || + VSM_ISERR(vsmap[j])) { + j++; + clmap_off += vm_page_size * VSCLSIZE(vs); + continue; + } + VS_MAP_UNLOCK(vs); + kr = pvs_cluster_read( + vs, + clmap_off, + (dp_size_t) -1, /* read whole cluster */ + &fault_info); + VS_MAP_LOCK(vs); /* XXX what if it changed ? */ + if (kr != KERN_SUCCESS) { + vsmap_all_clear = FALSE; + vsimap_all_clear = FALSE; + } + } + } + if (vsmap_all_clear) { + ps_dealloc_vsmap(vsmap, CLMAP_ENTRIES); + kfree(vsmap, CLMAP_THRESHOLD); + vs->vs_imap[i] = NULL; + } + } + if (vsimap_all_clear) { +// kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size)); + } + } else { + /* + * Direct map. Free used clusters, then memory. + */ + vsmap = vs->vs_dmap; + if (vsmap == NULL) { + goto out; + } + vsmap_all_clear = TRUE; + /* loop on clusters in the direct map */ + if (return_to_vm) { + for (j = 0; j < vs->vs_size;) { + if (VSM_ISCLR(vsmap[j]) || + VSM_ISERR(vsmap[j])) { + j++; + continue; + } + clmap_off = vm_page_size * (j << vs->vs_clshift); + VS_MAP_UNLOCK(vs); + kr = pvs_cluster_read( + vs, + clmap_off, + (dp_size_t) -1, /* read whole cluster */ + &fault_info); + VS_MAP_LOCK(vs); /* XXX what if it changed ? 
*/ + if (kr != KERN_SUCCESS) { + vsmap_all_clear = FALSE; + } else { +// VSM_CLR(vsmap[j]); + } + } + } + if (vsmap_all_clear) { + ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size); +// kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size)); + } + } +out: + VS_MAP_UNLOCK(vs); +} + int ps_map_extend(vstruct_t, unsigned int); /* forward */ int ps_map_extend( @@ -2156,6 +2337,9 @@ ps_clunmap( { dp_offset_t cluster; /* The cluster number of offset */ struct vs_map *vsmap; + struct ps_vnode_trim_data trim_data; + + ps_vnode_trim_init(&trim_data); VS_MAP_LOCK(vs); @@ -2173,11 +2357,13 @@ ps_clunmap( else vsmap = vs->vs_dmap; if (vsmap == NULL) { + ps_vnode_trim_now(&trim_data); VS_MAP_UNLOCK(vs); return; } vsmap += cluster%CLMAP_ENTRIES; if (VSM_ISCLR(*vsmap)) { + ps_vnode_trim_now(&trim_data); length -= vm_page_size; offset += vm_page_size; continue; @@ -2206,12 +2392,19 @@ ps_clunmap( /* * If map entry is empty, clear and deallocate cluster. */ - if (!VSM_ALLOC(*vsmap)) { + if (!VSM_BMAP(*vsmap)) { + ps_vnode_trim_more(&trim_data, + vsmap, + vs->vs_clshift, + VSCLSIZE(vs) * vm_page_size); ps_deallocate_cluster(VSM_PS(*vsmap), VSM_CLOFF(*vsmap)); VSM_CLR(*vsmap); + } else { + ps_vnode_trim_now(&trim_data); } } + ps_vnode_trim_now(&trim_data); VS_MAP_UNLOCK(vs); } @@ -2670,16 +2863,31 @@ pvs_object_data_provided( ASSERT(size > 0); GSTAT(global_stats.gs_pages_in += atop_32(size)); - -#if USE_PRECIOUS - ps_clunmap(vs, offset, size); -#endif /* USE_PRECIOUS */ +/* check upl iosync flag instead of using RECLAIM_SWAP*/ +#if RECLAIM_SWAP + if (size != upl->size) { + upl_abort(upl, UPL_ABORT_ERROR); + upl_deallocate(upl); + } else { + ps_clunmap(vs, offset, size); + upl_commit(upl, NULL, 0); + upl_deallocate(upl); + } +#endif /* RECLAIM_SWAP */ } static memory_object_offset_t last_start; static vm_size_t last_length; +/* + * A "cnt" of 0 means that the caller just wants to check if the page at + * offset "vs_offset" exists in the backing store. 
That page hasn't been + * prepared, so no need to release it. + * + * A "cnt" of -1 means that the caller wants to bring back from the backing + * store all existing pages in the cluster containing "vs_offset". + */ kern_return_t pvs_cluster_read( vstruct_t vs, @@ -2707,16 +2915,32 @@ pvs_cluster_read( memory_object_offset_t cluster_start; vm_size_t cluster_length; uint32_t io_streaming; + int i; + boolean_t io_sync = FALSE; pages_in_cl = 1 << vs->vs_clshift; cl_size = pages_in_cl * vm_page_size; cl_mask = cl_size - 1; -#if USE_PRECIOUS - request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS | UPL_RET_ONLY_ABSENT | UPL_SET_LITE; -#else - request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_RET_ONLY_ABSENT | UPL_SET_LITE; -#endif + request_flags = UPL_NO_SYNC | UPL_RET_ONLY_ABSENT | UPL_SET_LITE; + + if (cnt == (dp_size_t) -1) { + /* + * We've been called from ps_vstruct_reclaim() to move all + * the object's swapped pages back to VM pages. + * This can put memory pressure on the system, so we do want + * to wait for free pages, to avoid getting in the way of the + * vm_pageout_scan() thread. + * Let's not use UPL_NOBLOCK in this case. 
+ */ + vs_offset &= ~cl_mask; + i = pages_in_cl; + } else { + i = 1; + request_flags |= UPL_NOBLOCK; + } + +again: cl_index = (vs_offset & cl_mask) / vm_page_size; if ((ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0) == (dp_offset_t)-1) || @@ -2735,6 +2959,16 @@ pvs_cluster_read( */ return KERN_FAILURE; } + if (cnt == (dp_size_t) -1) { + i--; + if (i == 0) { + /* no more pages in this cluster */ + return KERN_FAILURE; + } + /* try the next page in this cluster */ + vs_offset += vm_page_size; + goto again; + } page_list_count = 0; @@ -2762,6 +2996,24 @@ pvs_cluster_read( return KERN_SUCCESS; } + if(((vm_object_fault_info_t)fault_info)->io_sync == TRUE ) { + io_sync = TRUE; + } else { +#if RECLAIM_SWAP + io_sync = TRUE; +#endif /* RECLAIM_SWAP */ + } + + if( io_sync == TRUE ) { + + io_flags |= UPL_IOSYNC | UPL_NOCOMMIT; +#if USE_PRECIOUS + request_flags |= UPL_PRECIOUS | UPL_CLEAN_IN_PLACE; +#else /* USE_PRECIOUS */ + request_flags |= UPL_REQUEST_SET_DIRTY; +#endif /* USE_PRECIOUS */ + } + assert(dp_encryption_inited); if (dp_encryption) { /* @@ -2770,6 +3022,7 @@ pvs_cluster_read( * decryption. */ request_flags |= UPL_ENCRYPT; + io_flags |= UPL_PAGING_ENCRYPTED; } orig_vs_offset = vs_offset; @@ -2970,7 +3223,7 @@ pvs_cluster_read( memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset, xfer_size, xfer_size, &upl, NULL, &page_list_count, - request_flags | UPL_SET_INTERNAL | UPL_NOBLOCK); + request_flags | UPL_SET_INTERNAL); error = ps_read_file(psp[beg_pseg], upl, (upl_offset_t) 0, @@ -3091,15 +3344,33 @@ vs_cluster_write( boolean_t minimal_clustering = FALSE; boolean_t found_dirty; + if (!dp_encryption_inited) { + /* + * ENCRYPTED SWAP: + * Once we've started using swap, we + * can't change our mind on whether + * it needs to be encrypted or + * not. + */ + dp_encryption_inited = TRUE; + } + if (dp_encryption) { + /* + * ENCRYPTED SWAP: + * the UPL will need to be encrypted... 
+ */ + flags |= UPL_PAGING_ENCRYPTED; + } + pages_in_cl = 1 << vs->vs_clshift; cl_size = pages_in_cl * vm_page_size; #if CONFIG_FREEZE minimal_clustering = TRUE; -#endif +#else if (dp_isssd == TRUE) minimal_clustering = TRUE; - +#endif if (!dp_internal) { unsigned int page_list_count; int request_flags; @@ -3124,16 +3395,6 @@ vs_cluster_write( UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE; - if (!dp_encryption_inited) { - /* - * ENCRYPTED SWAP: - * Once we've started using swap, we - * can't change our mind on whether - * it needs to be encrypted or - * not. - */ - dp_encryption_inited = TRUE; - } if (dp_encryption) { /* * ENCRYPTED SWAP: @@ -3143,6 +3404,7 @@ vs_cluster_write( request_flags |= UPL_ENCRYPT; flags |= UPL_PAGING_ENCRYPTED; } + page_list_count = 0; memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)offset, @@ -3168,6 +3430,7 @@ vs_cluster_write( found_dirty = TRUE; for (seg_index = 0, transfer_size = upl->size; transfer_size > 0; ) { + unsigned int seg_pgcnt; seg_pgcnt = seg_size / PAGE_SIZE; @@ -3208,7 +3471,7 @@ vs_cluster_write( page_index += seg_pgcnt; transfer_size -= seg_size; upl_offset_aligned += cl_size; - seg_size = cl_size; + seg_size = cl_size; seg_index++; } else transfer_size = 0; @@ -3588,6 +3851,14 @@ ps_vstruct_transfer_from_segment( vs->vs_xfer_pending = FALSE; VS_UNLOCK(vs); vs_finish_write(vs); + + if (backing_store_abort_compaction || backing_store_stop_compaction) { + backing_store_abort_compaction = FALSE; + dprintf(("ps_vstruct_transfer_from_segment - ABORTED\n")); + return KERN_FAILURE; + } + vnode_pager_throttle(); + VS_LOCK(vs); vs->vs_xfer_pending = TRUE; vs_wait_for_sync_writers(vs); @@ -3810,7 +4081,7 @@ vs_cluster_transfer( /* NEED TO ISSUE WITH SYNC & NO COMMIT */ error = ps_read_file(ps, upl, (upl_offset_t) 0, actual_offset, size, &residual, - (UPL_IOSYNC | UPL_NOCOMMIT)); + (UPL_IOSYNC | UPL_NOCOMMIT | (dp_encryption ? 
UPL_PAGING_ENCRYPTED : 0))); } read_vsmap = *vsmap_ptr; @@ -4028,12 +4299,17 @@ default_pager_add_file( * emergency segment will be back to its original state of * online but not activated (till it's needed the next time). */ - ps = paging_segments[EMERGENCY_PSEG_INDEX]; - if(IS_PS_EMERGENCY_SEGMENT(ps) && IS_PS_OK_TO_USE(ps)) { - if(default_pager_backing_store_delete(emergency_segment_backing_store)) { - dprintf(("Failed to recover emergency paging segment\n")); - } else { - dprintf(("Recovered emergency paging segment\n")); +#if CONFIG_FREEZE + if (!vm_freeze_enabled) +#endif + { + ps = paging_segments[EMERGENCY_PSEG_INDEX]; + if(IS_PS_EMERGENCY_SEGMENT(ps) && IS_PS_OK_TO_USE(ps)) { + if(default_pager_backing_store_delete(emergency_segment_backing_store)) { + dprintf(("Failed to recover emergency paging segment\n")); + } else { + dprintf(("Recovered emergency paging segment\n")); + } } } @@ -4123,6 +4399,49 @@ ps_write_file( return result; } +static inline void ps_vnode_trim_init(struct ps_vnode_trim_data *data) +{ +#if CONFIG_EMBEDDED + data->vp = NULL; + data->offset = 0; + data->length = 0; +#else +#pragma unused(data) +#endif +} + +static inline void ps_vnode_trim_now(struct ps_vnode_trim_data *data) +{ +#if CONFIG_EMBEDDED + if ((data->vp) != NULL) { + vnode_trim(data->vp, + data->offset, + data->length); + ps_vnode_trim_init(data); + } +#else +#pragma unused(data) +#endif +} + +static inline void ps_vnode_trim_more(struct ps_vnode_trim_data *data, struct vs_map *map, unsigned int shift, dp_size_t length) +{ +#if CONFIG_EMBEDDED + struct vnode *vp = VSM_PS(*map)->ps_vnode; + dp_offset_t offset = ptoa_32(VSM_CLOFF(*map)) << shift; + + if ((vp != data->vp) || (offset) != (data->offset + data->length)) { + ps_vnode_trim_now(data); + data->vp = vp; + data->offset = offset; + data->length = 0; + } + data->length += (length); +#else +#pragma unused(data, map, shift, length) +#endif +} + kern_return_t default_pager_triggers( __unused MACH_PORT_FACE default_pager, 
int hi_wat, @@ -4130,7 +4449,7 @@ default_pager_triggers( __unused MACH_PORT_FACE default_pager, int flags, MACH_PORT_FACE trigger_port) { - MACH_PORT_FACE release; + MACH_PORT_FACE release = IPC_PORT_NULL; kern_return_t kr; clock_sec_t now; clock_nsec_t nanoseconds_dummy; @@ -4159,15 +4478,42 @@ default_pager_triggers( __unused MACH_PORT_FACE default_pager, } } else if (flags == HI_WAT_ALERT) { release = min_pages_trigger_port; - min_pages_trigger_port = trigger_port; - minimum_pages_remaining = hi_wat/vm_page_size; - bs_low = FALSE; - kr = KERN_SUCCESS; +#if CONFIG_FREEZE + /* High and low water signals aren't applicable when freeze is */ + /* enabled, so release the trigger ports here and return */ + /* KERN_FAILURE. */ + if (vm_freeze_enabled) { + if (IP_VALID( trigger_port )){ + ipc_port_release_send( trigger_port ); + } + min_pages_trigger_port = IPC_PORT_NULL; + kr = KERN_FAILURE; + } + else +#endif + { + min_pages_trigger_port = trigger_port; + minimum_pages_remaining = hi_wat/vm_page_size; + bs_low = FALSE; + kr = KERN_SUCCESS; + } } else if (flags == LO_WAT_ALERT) { release = max_pages_trigger_port; - max_pages_trigger_port = trigger_port; - maximum_pages_free = lo_wat/vm_page_size; - kr = KERN_SUCCESS; +#if CONFIG_FREEZE + if (vm_freeze_enabled) { + if (IP_VALID( trigger_port )){ + ipc_port_release_send( trigger_port ); + } + max_pages_trigger_port = IPC_PORT_NULL; + kr = KERN_FAILURE; + } + else +#endif + { + max_pages_trigger_port = trigger_port; + maximum_pages_free = lo_wat/vm_page_size; + kr = KERN_SUCCESS; + } } else if (flags == USE_EMERGENCY_SWAP_FILE_FIRST) { use_emergency_swap_file_first = TRUE; release = trigger_port; @@ -4259,6 +4605,8 @@ default_pager_backing_store_monitor(__unused thread_call_param_t p1, } else { VSL_UNLOCK(); } + dprintf(("default_pager_backing_store_monitor - send LO_WAT_ALERT\n")); + default_pager_space_alert(trigger, LO_WAT_ALERT); ipc_port_release_send(trigger); dp_pages_free_low_count = 0; @@ -4267,3 +4615,9 @@ 
default_pager_backing_store_monitor(__unused thread_call_param_t p1, clock_interval_to_deadline(PF_INTERVAL, NSEC_PER_SEC, &deadline); thread_call_enter_delayed(default_pager_backing_store_monitor_callout, deadline); } + +#if CONFIG_FREEZE +unsigned int default_pager_swap_pages_free() { + return dp_pages_free; +} +#endif diff --git a/osfmk/default_pager/dp_memory_object.c b/osfmk/default_pager/dp_memory_object.c index c85278056..e122e7711 100644 --- a/osfmk/default_pager/dp_memory_object.c +++ b/osfmk/default_pager/dp_memory_object.c @@ -369,6 +369,7 @@ const struct memory_object_pager_ops default_pager_ops = { dp_memory_object_synchronize, dp_memory_object_map, dp_memory_object_last_unmap, + dp_memory_object_data_reclaim, "default pager" }; @@ -431,6 +432,33 @@ dp_memory_object_last_unmap( return KERN_FAILURE; } +kern_return_t +dp_memory_object_data_reclaim( + memory_object_t mem_obj, + boolean_t reclaim_backing_store) +{ + vstruct_t vs; + + vs_lookup(mem_obj, vs); + for (;;) { + vs_lock(vs); + vs_async_wait(vs); + if (!vs->vs_xfer_pending) { + break; + } + } + vs->vs_xfer_pending = TRUE; + vs_unlock(vs); + + ps_vstruct_reclaim(vs, TRUE, reclaim_backing_store); + + vs_lock(vs); + vs->vs_xfer_pending = FALSE; + vs_unlock(vs); + + return KERN_SUCCESS; +} + kern_return_t dp_memory_object_terminate( memory_object_t mem_obj) diff --git a/osfmk/device/device.defs b/osfmk/device/device.defs index 5410b050e..2e39dc559 100644 --- a/osfmk/device/device.defs +++ b/osfmk/device/device.defs @@ -215,12 +215,14 @@ routine io_registry_entry_get_parent_iterator( out iterator : io_object_t ); -routine io_service_open( +skip; +/* was routine io_service_open service : io_object_t; in owningTask : task_t; in connect_type : uint32_t; out connection : io_connect_t ); +*/ routine io_service_close( connection : io_connect_t @@ -599,8 +601,8 @@ routine io_connect_method( in ool_input : mach_vm_address_t; in ool_input_size : mach_vm_size_t; - out scalar_output : io_scalar_inband64_t, 
CountInOut; out inband_output : io_struct_inband_t, CountInOut; + out scalar_output : io_scalar_inband64_t, CountInOut; in ool_output : mach_vm_address_t; inout ool_output_size : mach_vm_size_t ); @@ -616,8 +618,8 @@ routine io_connect_async_method( in ool_input : mach_vm_address_t; in ool_input_size : mach_vm_size_t; - out scalar_output : io_scalar_inband64_t, CountInOut; out inband_output : io_struct_inband_t, CountInOut; + out scalar_output : io_scalar_inband64_t, CountInOut; in ool_output : mach_vm_address_t; inout ool_output_size : mach_vm_size_t ); diff --git a/osfmk/device/iokit_rpc.c b/osfmk/device/iokit_rpc.c index 5990a3e5c..5c5f8b742 100644 --- a/osfmk/device/iokit_rpc.c +++ b/osfmk/device/iokit_rpc.c @@ -63,9 +63,6 @@ #include -#ifdef __ppc__ -#include -#endif #if defined(__i386__) || defined(__x86_64__) #include #endif @@ -449,13 +446,16 @@ unsigned int IODefaultCacheBits(addr64_t pa) kern_return_t IOMapPages(vm_map_t map, mach_vm_address_t va, mach_vm_address_t pa, mach_vm_size_t length, unsigned int options) { - vm_prot_t prot; + vm_prot_t prot; unsigned int flags; + ppnum_t pagenum; pmap_t pmap = map->pmap; prot = (options & kIOMapReadOnly) ? VM_PROT_READ : (VM_PROT_READ|VM_PROT_WRITE); + pagenum = (ppnum_t)atop_64(pa); + switch(options & kIOMapCacheMask ) { /* What cache mode do we need? 
*/ case kIOMapDefaultCache: @@ -480,8 +480,13 @@ kern_return_t IOMapPages(vm_map_t map, mach_vm_address_t va, mach_vm_address_t p break; } + pmap_set_cache_attributes(pagenum, flags); + + vm_map_set_cache_attr(map, (vm_map_offset_t)va); + + // Set up a block mapped area - pmap_map_block(pmap, va, (ppnum_t)atop_64(pa), (uint32_t) atop_64(round_page_64(length)), prot, flags, 0); + pmap_map_block(pmap, va, pagenum, (uint32_t) atop_64(round_page_64(length)), prot, 0, 0); return( KERN_SUCCESS ); } @@ -498,10 +503,6 @@ kern_return_t IOUnmapPages(vm_map_t map, mach_vm_address_t va, mach_vm_size_t le kern_return_t IOProtectCacheMode(vm_map_t __unused map, mach_vm_address_t __unused va, mach_vm_size_t __unused length, unsigned int __unused options) { -#if __ppc__ - // can't remap block mappings, but ppc doesn't speculatively read from WC -#else - mach_vm_size_t off; vm_prot_t prot; unsigned int flags; @@ -542,31 +543,25 @@ kern_return_t IOProtectCacheMode(vm_map_t __unused map, mach_vm_address_t __unus pmap_enter(pmap, va + off, ppnum, prot, flags, TRUE); } -#endif - return (KERN_SUCCESS); } ppnum_t IOGetLastPageNumber(void) { - ppnum_t lastPage, highest = 0; - unsigned int idx; - -#if __ppc__ - for (idx = 0; idx < pmap_mem_regions_count; idx++) - { - lastPage = pmap_mem_regions[idx].mrEnd; -#elif __i386__ || __x86_64__ - for (idx = 0; idx < pmap_memory_region_count; idx++) - { - lastPage = pmap_memory_regions[idx].end - 1; +#if __i386__ || __x86_64__ + ppnum_t lastPage, highest = 0; + unsigned int idx; + + for (idx = 0; idx < pmap_memory_region_count; idx++) + { + lastPage = pmap_memory_regions[idx].end - 1; + if (lastPage > highest) + highest = lastPage; + } + return (highest); #else -#error arch +#error unknown arch #endif - if (lastPage > highest) - highest = lastPage; - } - return (highest); } diff --git a/osfmk/device/subrs.c b/osfmk/device/subrs.c index b9aafe509..105edff0f 100644 --- a/osfmk/device/subrs.c +++ b/osfmk/device/subrs.c @@ -261,7 +261,7 @@ 
strncasecmp(const char *s1, const char *s2, size_t n) * Deprecation Warning: * strcpy() is being deprecated. Please use strlcpy() instead. */ - +#if !CONFIG_EMBEDDED char * strcpy( char *to, @@ -274,7 +274,7 @@ strcpy( return ret; } - +#endif /* * Abstract: @@ -428,6 +428,7 @@ itoa( * Deprecation Warning: * strcat() is being deprecated. Please use strlcat() instead. */ +#if !CONFIG_EMBEDDED char * strcat( char *dest, @@ -441,6 +442,7 @@ strcat( ; return (old); } +#endif /* * Appends src to string dst of size siz (unlike strncat, siz is the @@ -535,7 +537,7 @@ strlcpy(char *dst, const char *src, size_t siz) * one should use FREE() with the allocated buffer. * */ -inline char * +char * STRDUP(const char *string, int type) { size_t len; diff --git a/osfmk/gssd/Makefile b/osfmk/gssd/Makefile index 2f7167424..bda924f4b 100644 --- a/osfmk/gssd/Makefile +++ b/osfmk/gssd/Makefile @@ -8,14 +8,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = -INSTINC_SUBDIRS_PPC = - INSTINC_SUBDIRS_I386 = EXPINC_SUBDIRS = -EXPINC_SUBDIRS_PPC = - EXPINC_SUBDIRS_I386 = MIG_DEFS = gssd_mach.defs diff --git a/osfmk/gssd/gssd_mach.defs b/osfmk/gssd/gssd_mach.defs index 014785f3a..abe5ffe08 100644 --- a/osfmk/gssd/gssd_mach.defs +++ b/osfmk/gssd/gssd_mach.defs @@ -32,16 +32,18 @@ #ifdef KERNEL import ; #else -import ; +import ; #endif -type mechtype = int32_t; -type string_t = c_string[*:1024]; -type byte_buffer = array [] of uint8_t; +type gssd_mechtype = int32_t; +type gssd_nametype = int32_t; +type gssd_string = c_string[*:1024]; /* MAX_PRINC_STR must be < 1024 */ +type gssd_dstring = c_string[*:128]; /* MAX_DISPLAY_STR must be < 128 */ +type gssd_byte_buffer = array [] of uint8_t; type gssd_verifier = uint64_t; -type gid_list = array [*:16] of uint32_t; -type gss_ctx = uint64_t; -type gss_cred = uint64_t; +type gssd_gid_list = array [*:16] of uint32_t; +type gssd_ctx = uint64_t; +type gssd_cred = uint64_t; subsystem #if KERNEL_USER @@ -53,43 +55,99 @@ serverprefix svc_; routine 
mach_gss_init_sec_context( server : mach_port_t; - in mech : mechtype; - in intoken : byte_buffer; + in mech : gssd_mechtype; + in intoken : gssd_byte_buffer; in uid : uint32_t; - in princ_namestr : string_t; - in svc_namestr : string_t; + in princ_namestr : gssd_string; + in svc_namestr : gssd_string; in flags : uint32_t; in gssd_flags : uint32_t; - inout context : gss_ctx; - inout cred_handle : gss_cred; + inout context : gssd_ctx; + inout cred_handle : gssd_cred; out ret_flags : uint32_t; - out key : byte_buffer, dealloc; - out outtoken : byte_buffer, dealloc; + out key : gssd_byte_buffer, dealloc; + out outtoken : gssd_byte_buffer, dealloc; out major_stat : uint32_t; out minor_stat : uint32_t ); routine mach_gss_accept_sec_context( server : mach_port_t; - in intoken : byte_buffer; - in svc_namestr : string_t; + in intoken : gssd_byte_buffer; + in svc_namestr : gssd_string; in gssd_flags : uint32_t; - inout context : gss_ctx; - inout cred_handle : gss_cred; + inout context : gssd_ctx; + inout cred_handle : gssd_cred; out flags : uint32_t; out uid : uint32_t; - out gids : gid_list; - out key : byte_buffer, dealloc; - out outtoken : byte_buffer, dealloc; + out gids : gssd_gid_list; + out key : gssd_byte_buffer, dealloc; + out outtoken : gssd_byte_buffer, dealloc; out major_stat : uint32_t; out minor_stat : uint32_t ); simpleroutine mach_gss_log_error( server : mach_port_t; - in mnt : string_t; + in mnt : gssd_string; in uid : uint32_t; - in source : string_t; + in source : gssd_string; in major_stat : uint32_t; in minor_stat : uint32_t ); + +routine mach_gss_init_sec_context_v2( + server : mach_port_t; + in mech : gssd_mechtype; + in intoken : gssd_byte_buffer; + in uid : uint32_t; + in clnt_nt : gssd_nametype; + in clnt_princ : gssd_byte_buffer; + in svc_nt : gssd_nametype; + in svc_princ : gssd_byte_buffer; + in flags : uint32_t; + inout gssd_flags : uint32_t; + inout context : gssd_ctx; + inout cred_handle : gssd_cred; + out ret_flags : uint32_t; + out key : 
gssd_byte_buffer, dealloc; + out outtoken : gssd_byte_buffer, dealloc; + out displayname : gssd_dstring; + out major_stat : uint32_t; + out minor_stat : uint32_t +); + +routine mach_gss_accept_sec_context_v2( + server : mach_port_t; + in intoken : gssd_byte_buffer; + in svc_nt : gssd_nametype; + in svc_princ : gssd_byte_buffer; + inout gssd_flags : uint32_t; + inout context : gssd_ctx; + inout cred_handle : gssd_cred; + out flags : uint32_t; + out uid : uint32_t; + out gids : gssd_gid_list; + out key : gssd_byte_buffer, dealloc; + out outtoken : gssd_byte_buffer, dealloc; + out major_stat : uint32_t; + out minor_stat : uint32_t +); + +routine mach_gss_hold_cred( + server : mach_port_t; + in mech : gssd_mechtype; + in nt : gssd_nametype; + in princ : gssd_byte_buffer; + out major_stat : uint32_t; + out minor_stat : uint32_t +); + +routine mach_gss_unhold_cred( + server : mach_port_t; + in mech : gssd_mechtype; + in nt : gssd_nametype; + in princ : gssd_byte_buffer; + out major_stat : uint32_t; + out minor_stat : uint32_t +); diff --git a/osfmk/gssd/gssd_mach_types.h b/osfmk/gssd/gssd_mach_types.h index c091cc3ef..e3bde951a 100644 --- a/osfmk/gssd/gssd_mach_types.h +++ b/osfmk/gssd/gssd_mach_types.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2006, 2008, 2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -29,34 +29,41 @@ #ifndef _GSSD_MACH_TYPES_H_ #define _GSSD_MACH_TYPES_H_ -typedef enum mechtype { DEFAULT_MECH = 0, KRB5_MECH = 0, SPNEGO_MECH } mechtype; -typedef char *string_t; -typedef uint8_t *byte_buffer; -typedef uint32_t *gid_list; -typedef uint64_t gss_ctx; -typedef uint64_t gss_cred; +#define MAX_DISPLAY_STR 128 +#define MAX_PRINC_STR 1024 + +typedef enum gssd_mechtype { GSSD_NO_MECH = -1, GSSD_KRB5_MECH = 0, + GSSD_SPNEGO_MECH, GSSD_NTLM_MECH } gssd_mechtype; +typedef enum gssd_nametype { GSSD_STRING_NAME = 0, GSSD_EXPORT, + GSSD_ANONYMOUS, GSSD_HOSTBASED, GSSD_USER, GSSD_MACHINE_UID, + GSSD_STRING_UID, GSSD_KRB5_PRINCIPAL, GSSD_KRB5_REFERRAL, + GSSD_NTLM_PRINCIPAL, GSSD_NTLM_BLOB} gssd_nametype; +typedef char *gssd_string; +typedef char *gssd_dstring; +typedef uint8_t *gssd_byte_buffer; +typedef uint32_t *gssd_gid_list; +typedef uint64_t gssd_ctx; +typedef uint64_t gssd_cred; -#define GSSD_GSS_FLAGS_MASK 0x1FF /* The following need to correspond to GSS_C_*_FLAG in gssapi.h */ #define GSSD_DELEG_FLAG 1 -#define GSSD_MUTUAL_FLAG 2 -#define GSSD_REPLAY_FLAG 4 +#define GSSD_MUTUAL_FLAG 2 +#define GSSD_REPLAY_FLAG 4 #define GSSD_SEQUENCE_FLAG 8 #define GSSD_CONF_FLAG 16 #define GSSD_INTEG_FLAG 32 #define GSSD_ANON_FLAG 64 #define GSSD_PROT_FLAG 128 #define GSSD_TRANS_FLAG 256 -#define GSSD_C_DELEG_POLICY_FLAG 32768 +#define GSSD_DELEG_POLICY_FLAG 32768 -#define GSSD_FLAGS_SHIFT 0 -#define GSSD_NO_DEFAULT (1 << GSSD_FLAGS_SHIFT) // Only use principal from uid -#define GSSD_NO_CANON (2 << GSSD_FLAGS_SHIFT) // Don't canononicalize host names -#define GSSD_HOME_ACCESS_OK (4 << GSSD_FLAGS_SHIFT) // OK to access home directory -#define GSSD_UI_OK (8 << GSSD_FLAGS_SHIFT) // OK to bring up UI -#define GSSD_RESTART (16 << GSSD_FLAGS_SHIFT) // Destroy the supplied context and start over -#define GSSD_NFS_1DES (64 << GSSD_FLAGS_SHIFT) // Only get single DES session keys -#define GSSD_WIN2K_HACK (128 << GSSD_FLAGS_SHIFT) 
// Hack for Win2K +#define GSSD_NO_DEFAULT 1 // Only use the supplied principal, do not fallback to the default. +#define GSSD_NO_CANON 2 // Don't canononicalize host names +#define GSSD_HOME_ACCESS_OK 4 // OK to access home directory +#define GSSD_GUEST_ONLY 8 // NTLM Server is forcing guest access +#define GSSD_RESTART 16 // Destroy the supplied context and start over +#define GSSD_NFS_1DES 64 // Only get single DES session keys +#define GSSD_WIN2K_HACK 128 // Hack for Win2K #endif /* _GSSD_MACH_TYPES_H_ */ diff --git a/osfmk/i386/AT386/model_dep.c b/osfmk/i386/AT386/model_dep.c index 6727bae26..c21012552 100644 --- a/osfmk/i386/AT386/model_dep.c +++ b/osfmk/i386/AT386/model_dep.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -93,8 +93,12 @@ #include /* mp_rendezvous_break_lock */ #include #include -#include +#include +#include +#if CONFIG_MTRR #include +#endif +#include #include #include /* inb() */ #include @@ -116,6 +120,10 @@ #include #include +#include + +#define DPRINTF(x...) +//#define DPRINTF(x...) 
kprintf(x) static void machine_conf(void); @@ -130,6 +138,8 @@ volatile int pbtcpu = -1; hw_lock_data_t pbtlock; /* backtrace print lock */ uint32_t pbtcnt = 0; +volatile int panic_double_fault_cpu = -1; + #if defined (__i386__) #define PRINT_ARGS_FROM_STACK_FRAME 1 #elif defined (__x86_64__) @@ -168,10 +178,10 @@ machine_startup(void) #endif if (PE_parse_boot_argn("debug", &debug_boot_arg, sizeof (debug_boot_arg))) { + panicDebugging = TRUE; if (debug_boot_arg & DB_HALT) halt_in_debugger=1; if (debug_boot_arg & DB_PRT) disable_debug_output=FALSE; if (debug_boot_arg & DB_SLOG) systemLogDiags=TRUE; - if (debug_boot_arg & DB_NMI) panicDebugging=TRUE; if (debug_boot_arg & DB_LOG_PI_SCRN) logPanicDataToScreen=TRUE; } else { debug_boot_arg = 0; @@ -369,8 +379,14 @@ efi_set_tables_64(EFI_SYSTEM_TABLE_64 * system_table) uint32_t hdr_cksum; uint32_t cksum; - kprintf("Processing 64-bit EFI tables at %p\n", system_table); + DPRINTF("Processing 64-bit EFI tables at %p\n", system_table); do { + DPRINTF("Header:\n"); + DPRINTF(" Signature: 0x%016llx\n", system_table->Hdr.Signature); + DPRINTF(" Revision: 0x%08x\n", system_table->Hdr.Revision); + DPRINTF(" HeaderSize: 0x%08x\n", system_table->Hdr.HeaderSize); + DPRINTF(" CRC32: 0x%08x\n", system_table->Hdr.CRC32); + DPRINTF("RuntimeServices: 0x%016llx\n", system_table->RuntimeServices); if (system_table->Hdr.Signature != EFI_SYSTEM_TABLE_SIGNATURE) { kprintf("Bad EFI system table signature\n"); break; @@ -380,7 +396,7 @@ efi_set_tables_64(EFI_SYSTEM_TABLE_64 * system_table) system_table->Hdr.CRC32 = 0; cksum = crc32(0L, system_table, system_table->Hdr.HeaderSize); - //kprintf("System table calculated CRC32 = 0x%x, header = 0x%x\n", cksum, hdr_cksum); + DPRINTF("System table calculated CRC32 = 0x%x, header = 0x%x\n", cksum, hdr_cksum); system_table->Hdr.CRC32 = hdr_cksum; if (cksum != hdr_cksum) { kprintf("Bad EFI system table checksum\n"); @@ -389,7 +405,6 @@ efi_set_tables_64(EFI_SYSTEM_TABLE_64 * system_table) 
gPEEFISystemTable = system_table; - if (!cpu_mode_is64bit()) { kprintf("Skipping 64-bit EFI runtime services for 32-bit legacy mode\n"); break; @@ -399,10 +414,10 @@ efi_set_tables_64(EFI_SYSTEM_TABLE_64 * system_table) kprintf("No runtime table present\n"); break; } - kprintf("RuntimeServices table at 0x%qx\n", system_table->RuntimeServices); + DPRINTF("RuntimeServices table at 0x%qx\n", system_table->RuntimeServices); // 64-bit virtual address is OK for 64-bit EFI and 64/32-bit kernel. runtime = (EFI_RUNTIME_SERVICES_64 *) (uintptr_t)system_table->RuntimeServices; - kprintf("Checking runtime services table %p\n", runtime); + DPRINTF("Checking runtime services table %p\n", runtime); if (runtime->Hdr.Signature != EFI_RUNTIME_SERVICES_SIGNATURE) { kprintf("Bad EFI runtime table signature\n"); break; @@ -413,7 +428,7 @@ efi_set_tables_64(EFI_SYSTEM_TABLE_64 * system_table) runtime->Hdr.CRC32 = 0; cksum = crc32(0L, runtime, runtime->Hdr.HeaderSize); - //kprintf("Runtime table calculated CRC32 = 0x%x, header = 0x%x\n", cksum, hdr_cksum); + DPRINTF("Runtime table calculated CRC32 = 0x%x, header = 0x%x\n", cksum, hdr_cksum); runtime->Hdr.CRC32 = hdr_cksum; if (cksum != hdr_cksum) { kprintf("Bad EFI runtime table checksum\n"); @@ -432,8 +447,14 @@ efi_set_tables_32(EFI_SYSTEM_TABLE_32 * system_table) uint32_t hdr_cksum; uint32_t cksum; - kprintf("Processing 32-bit EFI tables at %p\n", system_table); + DPRINTF("Processing 32-bit EFI tables at %p\n", system_table); do { + DPRINTF("Header:\n"); + DPRINTF(" Signature: 0x%016llx\n", system_table->Hdr.Signature); + DPRINTF(" Revision: 0x%08x\n", system_table->Hdr.Revision); + DPRINTF(" HeaderSize: 0x%08x\n", system_table->Hdr.HeaderSize); + DPRINTF(" CRC32: 0x%08x\n", system_table->Hdr.CRC32); + DPRINTF("RuntimeServices: 0x%08x\n", system_table->RuntimeServices); if (system_table->Hdr.Signature != EFI_SYSTEM_TABLE_SIGNATURE) { kprintf("Bad EFI system table signature\n"); break; @@ -441,9 +462,10 @@ 
efi_set_tables_32(EFI_SYSTEM_TABLE_32 * system_table) // Verify signature of the system table hdr_cksum = system_table->Hdr.CRC32; system_table->Hdr.CRC32 = 0; + DPRINTF("System table at %p HeaderSize 0x%x\n", system_table, system_table->Hdr.HeaderSize); cksum = crc32(0L, system_table, system_table->Hdr.HeaderSize); - //kprintf("System table calculated CRC32 = 0x%x, header = 0x%x\n", cksum, hdr_cksum); + DPRINTF("System table calculated CRC32 = 0x%x, header = 0x%x\n", cksum, hdr_cksum); system_table->Hdr.CRC32 = hdr_cksum; if (cksum != hdr_cksum) { kprintf("Bad EFI system table checksum\n"); @@ -452,15 +474,20 @@ efi_set_tables_32(EFI_SYSTEM_TABLE_32 * system_table) gPEEFISystemTable = system_table; - if(system_table->RuntimeServices == 0) { kprintf("No runtime table present\n"); break; } - kprintf("RuntimeServices table at 0x%x\n", system_table->RuntimeServices); + DPRINTF("RuntimeServices table at 0x%x\n", system_table->RuntimeServices); // 32-bit virtual address is OK for 32-bit EFI and 32-bit kernel. 
- // For a 64-bit kernel, booter will ensure pointer is zeroed out - runtime = (EFI_RUNTIME_SERVICES_32 *) (intptr_t)system_table->RuntimeServices; + // For a 64-bit kernel, booter provides a virtual address mod 4G + runtime = (EFI_RUNTIME_SERVICES_32 *) +#ifdef __x86_64__ + (system_table->RuntimeServices | VM_MIN_KERNEL_ADDRESS); +#else + system_table->RuntimeServices; +#endif + DPRINTF("Runtime table addressed at %p\n", runtime); if (runtime->Hdr.Signature != EFI_RUNTIME_SERVICES_SIGNATURE) { kprintf("Bad EFI runtime table signature\n"); break; @@ -471,13 +498,26 @@ efi_set_tables_32(EFI_SYSTEM_TABLE_32 * system_table) runtime->Hdr.CRC32 = 0; cksum = crc32(0L, runtime, runtime->Hdr.HeaderSize); - //kprintf("Runtime table calculated CRC32 = 0x%x, header = 0x%x\n", cksum, hdr_cksum); + DPRINTF("Runtime table calculated CRC32 = 0x%x, header = 0x%x\n", cksum, hdr_cksum); runtime->Hdr.CRC32 = hdr_cksum; if (cksum != hdr_cksum) { kprintf("Bad EFI runtime table checksum\n"); break; } + DPRINTF("Runtime functions\n"); + DPRINTF(" GetTime : 0x%x\n", runtime->GetTime); + DPRINTF(" SetTime : 0x%x\n", runtime->SetTime); + DPRINTF(" GetWakeupTime : 0x%x\n", runtime->GetWakeupTime); + DPRINTF(" SetWakeupTime : 0x%x\n", runtime->SetWakeupTime); + DPRINTF(" SetVirtualAddressMap : 0x%x\n", runtime->SetVirtualAddressMap); + DPRINTF(" ConvertPointer : 0x%x\n", runtime->ConvertPointer); + DPRINTF(" GetVariable : 0x%x\n", runtime->GetVariable); + DPRINTF(" GetNextVariableName : 0x%x\n", runtime->GetNextVariableName); + DPRINTF(" SetVariable : 0x%x\n", runtime->SetVariable); + DPRINTF(" GetNextHighMonotonicCount: 0x%x\n", runtime->GetNextHighMonotonicCount); + DPRINTF(" ResetSystem : 0x%x\n", runtime->ResetSystem); + gPEEFIRuntimeServices = runtime; } while (FALSE); @@ -503,24 +543,41 @@ efi_init(void) msize = args->MemoryMapDescriptorSize; mcount = args->MemoryMapSize / msize; + DPRINTF("efi_init() kernel base: 0x%x size: 0x%x\n", + args->kaddr, args->ksize); + DPRINTF(" 
efiSystemTable physical: 0x%x virtual: %p\n", + args->efiSystemTable, + (void *) ml_static_ptovirt(args->efiSystemTable)); + DPRINTF(" efiRuntimeServicesPageStart: 0x%x\n", + args->efiRuntimeServicesPageStart); + DPRINTF(" efiRuntimeServicesPageCount: 0x%x\n", + args->efiRuntimeServicesPageCount); + DPRINTF(" efiRuntimeServicesVirtualPageStart: 0x%016llx\n", + args->efiRuntimeServicesVirtualPageStart); mptr = (EfiMemoryRange *)ml_static_ptovirt(args->MemoryMap); for (i=0; i < mcount; i++, mptr = (EfiMemoryRange *)(((vm_offset_t)mptr) + msize)) { if (((mptr->Attribute & EFI_MEMORY_RUNTIME) == EFI_MEMORY_RUNTIME) ) { vm_size = (vm_offset_t)i386_ptob((uint32_t)mptr->NumberOfPages); vm_addr = (vm_offset_t) mptr->VirtualStart; - phys_addr = (vm_map_offset_t) mptr->PhysicalStart; -#if defined(__i386__) - pmap_map -#elif defined(__x86_64__) - pmap_map_bd /* K64todo resolve pmap layer inconsistency */ +#ifdef __x86_64__ + /* For K64 on EFI32, shadow-map into high KVA */ + if (vm_addr < VM_MIN_KERNEL_ADDRESS) + vm_addr |= VM_MIN_KERNEL_ADDRESS; #endif - (vm_addr, phys_addr, phys_addr + round_page(vm_size), + phys_addr = (vm_map_offset_t) mptr->PhysicalStart; + DPRINTF(" Type: %x phys: %p EFIv: %p kv: %p size: %p\n", + mptr->Type, + (void *) (uintptr_t) phys_addr, + (void *) (uintptr_t) mptr->VirtualStart, + (void *) vm_addr, + (void *) vm_size); + pmap_map(vm_addr, phys_addr, phys_addr + round_page(vm_size), (mptr->Type == kEfiRuntimeServicesCode) ? VM_PROT_READ | VM_PROT_EXECUTE : VM_PROT_READ|VM_PROT_WRITE, (mptr->Type == EfiMemoryMappedIO) ? 
VM_WIMG_IO : VM_WIMG_USE_DEFAULT); } } - if ((args->Version != kBootArgsVersion1) || (args->Version == kBootArgsVersion1 && args->Revision < kBootArgsRevision1_5 )) + if (args->Version != kBootArgsVersion2) panic("Incompatible boot args version %d revision %d\n", args->Version, args->Revision); kprintf("Boot args version %d revision %d mode %d\n", args->Version, args->Revision, args->efiMode); @@ -543,8 +600,6 @@ hibernate_newruntime_map(void * map, vm_size_t map_size, uint32_t system_table_o kprintf("Reinitializing EFI runtime services\n"); - if (args->Version != kBootArgsVersion1) - return; do { vm_offset_t vm_size, vm_addr; @@ -572,6 +627,11 @@ hibernate_newruntime_map(void * map, vm_size_t map_size, uint32_t system_table_o vm_size = (vm_offset_t)i386_ptob((uint32_t)mptr->NumberOfPages); vm_addr = (vm_offset_t) mptr->VirtualStart; +#ifdef __x86_64__ + /* K64 on EFI32 */ + if (vm_addr < VM_MIN_KERNEL_ADDRESS) + vm_addr |= VM_MIN_KERNEL_ADDRESS; +#endif phys_addr = (vm_map_offset_t) mptr->PhysicalStart; kprintf("mapping[%u] %qx @ %lx, %llu\n", mptr->Type, phys_addr, (unsigned long)vm_addr, mptr->NumberOfPages); @@ -590,22 +650,21 @@ hibernate_newruntime_map(void * map, vm_size_t map_size, uint32_t system_table_o vm_size = (vm_offset_t)i386_ptob((uint32_t)mptr->NumberOfPages); vm_addr = (vm_offset_t) mptr->VirtualStart; +#ifdef __x86_64__ + if (vm_addr < VM_MIN_KERNEL_ADDRESS) + vm_addr |= VM_MIN_KERNEL_ADDRESS; +#endif phys_addr = (vm_map_offset_t) mptr->PhysicalStart; kprintf("mapping[%u] %qx @ %lx, %llu\n", mptr->Type, phys_addr, (unsigned long)vm_addr, mptr->NumberOfPages); -#if defined(__i386__) - pmap_map -#elif defined(__x86_64__) - pmap_map_bd /* K64todo resolve pmap layer inconsistency */ -#endif - (vm_addr, phys_addr, phys_addr + round_page(vm_size), + pmap_map(vm_addr, phys_addr, phys_addr + round_page(vm_size), (mptr->Type == kEfiRuntimeServicesCode) ? VM_PROT_READ | VM_PROT_EXECUTE : VM_PROT_READ|VM_PROT_WRITE, (mptr->Type == EfiMemoryMappedIO) ? 
VM_WIMG_IO : VM_WIMG_USE_DEFAULT); } } - if ((args->Version != kBootArgsVersion1) || (args->Version == kBootArgsVersion1 && args->Revision < kBootArgsRevision1_5 )) + if (args->Version != kBootArgsVersion2) panic("Incompatible boot args version %d revision %d\n", args->Version, args->Revision); kprintf("Boot args version %d revision %d mode %d\n", args->Version, args->Revision, args->efiMode); @@ -655,6 +714,7 @@ machine_init(void) */ clock_config(); +#if CONFIG_MTRR /* * Initialize MTRR from boot processor. */ @@ -664,6 +724,7 @@ machine_init(void) * Set up PAT for boot processor. */ pat_init(); +#endif /* * Free lowmem pages and complete other setup @@ -712,9 +773,25 @@ panic_io_port_read(void) { /* For use with the MP rendezvous mechanism */ +uint64_t panic_restart_timeout = ~(0ULL); + static void -machine_halt_cpu(__unused void *arg) { +machine_halt_cpu(void) { panic_io_port_read(); + + if (panic_restart_timeout != ~(0ULL)) { + uint64_t deadline = mach_absolute_time() + panic_restart_timeout; + while (mach_absolute_time() < deadline) { + cpu_pause(); + } + kprintf("Invoking PE_halt_restart\n"); + /* Attempt restart via ACPI RESET_REG; at the time of this + * writing, this is routine is chained through AppleSMC-> + * AppleACPIPlatform + */ + if (PE_halt_restart) + (*PE_halt_restart)(kPERestartCPU); + } pmCPUHalt(PM_HALT_DEBUG); } @@ -724,6 +801,7 @@ Debugger( { unsigned long pi_size = 0; void *stackptr; + int cn = cpu_number(); hw_atomic_add(&debug_mode, 1); if (!panic_is_inited) { @@ -731,7 +809,6 @@ Debugger( asm("hlt"); } - printf("Debugger called: <%s>\n", message); kprintf("Debugger called: <%s>\n", message); @@ -758,7 +835,7 @@ Debugger( #endif /* Print backtrace - callee is internally synchronized */ - panic_i386_backtrace(stackptr, 64, NULL, FALSE, NULL); + panic_i386_backtrace(stackptr, ((panic_double_fault_cpu == cn) ? 
80: 48), NULL, FALSE, NULL); /* everything should be printed now so copy to NVRAM */ @@ -794,7 +871,7 @@ Debugger( * since we can subsequently halt the system. */ - kprintf("Attempting to commit panic log to NVRAM\n"); + /* The following sequence is a workaround for: * SnowLeopard10A67: AppleEFINVRAM should not invoke * any routines that use floating point (MMX in this case) when saving panic @@ -802,10 +879,12 @@ Debugger( */ cr0 = get_cr0(); clear_ts(); - + + kprintf("Attempting to commit panic log to NVRAM\n"); pi_size = PESavePanicInfo((unsigned char *)debug_buf, (uint32_t)pi_size ); set_cr0(cr0); + /* Uncompress in-place, to permit examination of * the panic log by debuggers. */ @@ -823,20 +902,27 @@ Debugger( draw_panic_dialog(); if (!panicDebugging) { + unsigned cnum; /* Clear the MP rendezvous function lock, in the event * that a panic occurred while in that codepath. */ mp_rendezvous_break_lock(); if (PE_reboot_on_panic()) { - PEHaltRestart(kPEPanicRestartCPU); + if (PE_halt_restart) + (*PE_halt_restart)(kPERestartCPU); } - /* Force all CPUs to disable interrupts and HLT. - * We've panicked, and shouldn't depend on the - * PEHaltRestart() mechanism, which relies on several - * bits of infrastructure. + /* Non-maskably interrupt all other processors + * If a restart timeout is specified, this processor + * will attempt a restart. */ - mp_rendezvous_no_intrs(machine_halt_cpu, NULL); + kprintf("Invoking machine_halt_cpu on CPU %d\n", cn); + for (cnum = 0; cnum < real_ncpus; cnum++) { + if (cnum != (unsigned) cn) { + cpu_NMI_interrupt(cnum); + } + } + machine_halt_cpu(); /* NOT REACHED */ } } @@ -852,26 +938,12 @@ machine_boot_info(char *buf, __unused vm_size_t size) return buf; } - -struct pasc { - unsigned a: 7; - unsigned b: 7; - unsigned c: 7; - unsigned d: 7; - unsigned e: 7; - unsigned f: 7; - unsigned g: 7; - unsigned h: 7; -} __attribute__((packed)); - -typedef struct pasc pasc_t; - /* Routines for address - symbol translation. 
Not called unless the "keepsyms" * boot-arg is supplied. */ static int -panic_print_macho_symbol_name(kernel_mach_header_t *mh, vm_address_t search) +panic_print_macho_symbol_name(kernel_mach_header_t *mh, vm_address_t search, const char *module_name) { kernel_nlist_t *sym = NULL; struct load_command *cmd; @@ -896,7 +968,7 @@ panic_print_macho_symbol_name(kernel_mach_header_t *mh, vm_address_t search) orig_le = orig_sg; else if (strncmp("", orig_sg->segname, sizeof(orig_sg->segname)) == 0) - orig_ts = orig_sg; /* kexts have a single unnamed segment */ + orig_ts = orig_sg; /* pre-Barolo i386 kexts have a single unnamed segment */ } else if (cmd->cmd == LC_SYMTAB) orig_st = (struct symtab_command *) cmd; @@ -907,12 +979,6 @@ panic_print_macho_symbol_name(kernel_mach_header_t *mh, vm_address_t search) if ((orig_ts == NULL) || (orig_st == NULL) || (orig_le == NULL)) return 0; - /* kexts don't have a LINKEDIT segment for now, so we'll never get this far for kexts */ - - vm_offset_t slide = ((vm_address_t)mh) - orig_ts->vmaddr; - if (slide != 0) - search -= slide; /* adjusting search since the binary has slid */ - if ((search < orig_ts->vmaddr) || (search >= orig_ts->vmaddr + orig_ts->vmsize)) { /* search out of range for this mach header */ @@ -938,9 +1004,9 @@ panic_print_macho_symbol_name(kernel_mach_header_t *mh, vm_address_t search) if (bestsym != NULL) { if (diff != 0) { - kdb_printf("%s + 0x%lx", bestsym, (unsigned long)diff); + kdb_printf("%s : %s + 0x%lx", module_name, bestsym, (unsigned long)diff); } else { - kdb_printf("%s", bestsym); + kdb_printf("%s : %s", module_name, bestsym); } return 1; } @@ -952,17 +1018,22 @@ extern kmod_info_t * kmod; /* the list of modules */ static void panic_print_kmod_symbol_name(vm_address_t search) { - kmod_info_t * current_kmod = kmod; - - while (current_kmod != NULL) { - if ((current_kmod->address <= search) && - (current_kmod->address + current_kmod->size > search)) + u_int i; + + if (gLoadedKextSummaries == NULL) + return; + 
for (i = 0; i < gLoadedKextSummaries->numSummaries; ++i) { + OSKextLoadedKextSummary *summary = gLoadedKextSummaries->summaries + i; + + if ((search >= summary->address) && + (search < (summary->address + summary->size))) + { + kernel_mach_header_t *header = (kernel_mach_header_t *)(uintptr_t) summary->address; + if (panic_print_macho_symbol_name(header, search, summary->name) == 0) { + kdb_printf("%s + %llu", summary->name, (unsigned long)search - summary->address); + } break; - current_kmod = current_kmod->next; - } - if (current_kmod != NULL) { - /* if kexts had symbol table loaded, we'd call search_symbol_name again; alas, they don't */ - kdb_printf("%s + %lu \n", current_kmod->name, (unsigned long)search - current_kmod->address); + } } } @@ -970,7 +1041,7 @@ static void panic_print_symbol_name(vm_address_t search) { /* try searching in the kernel */ - if (panic_print_macho_symbol_name(&_mh_execute_header, search) == 0) { + if (panic_print_macho_symbol_name(&_mh_execute_header, search, "mach_kernel") == 0) { /* that failed, now try to search for the right kext */ panic_print_kmod_symbol_name(search); } @@ -994,14 +1065,15 @@ panic_i386_backtrace(void *_frame, int nframes, const char *msg, boolean_t regdu volatile uint32_t *ppbtcnt = &pbtcnt; uint64_t bt_tsc_timeout; boolean_t keepsyms = FALSE; + int cn = cpu_number(); - if(pbtcpu != cpu_number()) { + if(pbtcpu != cn) { hw_atomic_add(&pbtcnt, 1); /* Spin on print backtrace lock, which serializes output * Continue anyway if a timeout occurs. 
*/ - hw_lock_to(&pbtlock, LockTimeOutTSC); - pbtcpu = cpu_number(); + hw_lock_to(&pbtlock, LockTimeOutTSC*2); + pbtcpu = cn; } PE_parse_boot_argn("keepsyms", &keepsyms, sizeof (keepsyms)); @@ -1041,9 +1113,9 @@ panic_i386_backtrace(void *_frame, int nframes, const char *msg, boolean_t regdu kdb_printf("Backtrace (CPU %d), " #if PRINT_ARGS_FROM_STACK_FRAME - "Frame : Return Address (4 potential args on stack)\n", cpu_number()); + "Frame : Return Address (4 potential args on stack)\n", cn); #else - "Frame : Return Address\n", cpu_number()); + "Frame : Return Address\n", cn); #endif for (frame_index = 0; frame_index < nframes; frame_index++) { @@ -1058,7 +1130,7 @@ panic_i386_backtrace(void *_frame, int nframes, const char *msg, boolean_t regdu } if (!kvtophys(curframep) || - !kvtophys(curframep + sizeof(cframe_t))) { + !kvtophys(curframep + sizeof(cframe_t) - 1)) { kdb_printf("No mapping exists for frame pointer\n"); goto invalid; } @@ -1119,5 +1191,3 @@ panic_i386_backtrace(void *_frame, int nframes, const char *msg, boolean_t regdu bt_tsc_timeout = rdtsc64() + PBT_TIMEOUT_CYCLES; while(*ppbtcnt && (rdtsc64() < bt_tsc_timeout)); } - -void *apic_table = NULL; diff --git a/osfmk/i386/Diagnostics.h b/osfmk/i386/Diagnostics.h index c8d385c7b..f5281c604 100644 --- a/osfmk/i386/Diagnostics.h +++ b/osfmk/i386/Diagnostics.h @@ -42,8 +42,8 @@ #ifndef _DIAGNOSTICS_H_ #define _DIAGNOSTICS_H_ -#ifdef __ppc__ -#error This file is not useful on PowerPC. 
+#if !(defined(__i386__) || defined(__x86_64__)) +#error This file is not useful on non-Intel #endif int diagCall(x86_saved_state_t *regs); diff --git a/osfmk/i386/Makefile b/osfmk/i386/Makefile index d07d32aac..270006bdb 100644 --- a/osfmk/i386/Makefile +++ b/osfmk/i386/Makefile @@ -26,14 +26,18 @@ EXPORT_ONLY_FILES = \ mp.h \ mp_desc.h \ mp_events.h \ + pal_native.h \ + pal_routines.h \ + pal_hibernate.h \ pmCPU.h \ pmap.h \ proc_reg.h \ - rtclock.h \ + rtclock_protos.h \ seg.h \ simple_lock.h \ tsc.h \ tss.h \ + ucode.h \ vmx.h INSTALL_MD_DIR = i386 @@ -44,7 +48,7 @@ INSTALL_MD_LCL_LIST = cpu_capabilities.h INSTALL_KF_MD_LIST = asm.h cpuid.h eflags.h locks.h machine_routines.h proc_reg.h vmx.h -INSTALL_KF_MD_LCL_LIST = $(filter-out cpu_data.h, $(EXPORT_ONLY_FILES)) +INSTALL_KF_MD_LCL_LIST = $(filter-out cpu_data.h pal_i386.h, $(EXPORT_ONLY_FILES)) EXPORT_MD_LIST = ${EXPORT_ONLY_FILES} diff --git a/osfmk/i386/acpi.c b/osfmk/i386/acpi.c index fb2cbe334..f13561244 100644 --- a/osfmk/i386/acpi.c +++ b/osfmk/i386/acpi.c @@ -32,10 +32,13 @@ #include #include #include +#if CONFIG_MTRR #include +#endif #if CONFIG_VMX #include #endif +#include #include #include #include @@ -51,6 +54,7 @@ #include #include +#include #include #if HIBERNATION @@ -103,7 +107,6 @@ acpi_hibernate(void *refcon) #if defined(__i386__) cpu_IA32e_enable(current_cpu_datap()); #endif - mode = hibernate_write_image(); if( mode == kIOHibernatePostWriteHalt ) @@ -145,7 +148,8 @@ acpi_hibernate(void *refcon) #endif /* CONFIG_SLEEP */ #endif /* HIBERNATION */ -extern void slave_pstart(void); +extern void slave_pstart(void); + void acpi_sleep_kernel(acpi_sleep_callback func, void *refcon) @@ -161,8 +165,8 @@ acpi_sleep_kernel(acpi_sleep_callback func, void *refcon) uint64_t my_tsc; uint64_t my_abs; - kprintf("acpi_sleep_kernel hib=%d\n", - current_cpu_datap()->cpu_hibernate); + kprintf("acpi_sleep_kernel hib=%d, cpu=%d\n", + current_cpu_datap()->cpu_hibernate, cpu_number()); /* Get all CPUs to be in the 
"off" state */ my_cpu = cpu_number(); @@ -175,7 +179,7 @@ acpi_sleep_kernel(acpi_sleep_callback func, void *refcon) rc, cpu); } - /* shutdown local APIC before passing control to BIOS */ + /* shutdown local APIC before passing control to firmware */ lapic_shutdown(); #if HIBERNATION @@ -238,7 +242,7 @@ acpi_sleep_kernel(acpi_sleep_callback func, void *refcon) */ if (FALSE == disable_serial_output) - serial_init(); + pal_serial_init(); #if HIBERNATION if (current_cpu_datap()->cpu_hibernate) { @@ -263,8 +267,13 @@ acpi_sleep_kernel(acpi_sleep_callback func, void *refcon) mca_cpu_init(); #endif +#if CONFIG_MTRR /* restore MTRR settings */ mtrr_update_cpu(); +#endif + + /* update CPU microcode */ + ucode_update_wake(); #if CONFIG_VMX /* @@ -273,8 +282,10 @@ acpi_sleep_kernel(acpi_sleep_callback func, void *refcon) vmx_resume(); #endif +#if CONFIG_MTRR /* set up PAT following boot processor power up */ pat_init(); +#endif /* * Go through all of the CPUs and mark them as requiring diff --git a/osfmk/i386/asm.h b/osfmk/i386/asm.h index 02a5620ae..a51f8ae92 100644 --- a/osfmk/i386/asm.h +++ b/osfmk/i386/asm.h @@ -305,4 +305,104 @@ #endif /* __NO_UNDERSCORES__ */ #endif /* ASSEMBLER */ +/* + * The following macros make calls into C code. + * They dynamically align the stack to 16 bytes. + */ +#if defined(__i386__) +/* + * Arguments are moved (not pushed) onto the correctly aligned stack. + * NOTE: ESI is destroyed in the process, and hence cannot + * be directly used as a parameter. Users of this macro must + * independently preserve ESI (a non-volatile) if the routine is + * intended to be called from C, for instance. 
+ */ + +#define CCALL(fn) \ + movl %esp, %esi ;\ + andl $0xFFFFFFF0, %esp ;\ + call EXT(fn) ;\ + movl %esi, %esp + +#define CCALL1(fn, arg1) \ + movl %esp, %esi ;\ + subl $4, %esp ;\ + andl $0xFFFFFFF0, %esp ;\ + movl arg1, (%esp) ;\ + call EXT(fn) ;\ + movl %esi, %esp + +#define CCALL2(fn, arg1, arg2) \ + movl %esp, %esi ;\ + subl $8, %esp ;\ + andl $0xFFFFFFF0, %esp ;\ + movl arg2, 4(%esp) ;\ + movl arg1, (%esp) ;\ + call EXT(fn) ;\ + movl %esi, %esp + +/* This variant exists to permit adjustment of the stack by "dtrace" */ +#define CCALL1WITHSP(fn, arg1) \ + movl %esp, %esi ;\ + subl $12, %esp ;\ + andl $0xFFFFFFF0, %esp ;\ + movl %esi, 8(%esp) ;\ + leal 8(%esp), %esi ;\ + movl %esi, 4(%esp) ;\ + movl arg1, (%esp) ;\ + call EXT(fn) ;\ + movl 8(%esp), %esp + +/* + * CCALL5 is used for callee functions with 3 arguments but + * where arg2 (a3:a2) and arg3 (a5:a4) are 64-bit values. + */ +#define CCALL5(fn, a1, a2, a3, a4, a5) \ + movl %esp, %esi ;\ + subl $20, %esp ;\ + andl $0xFFFFFFF0, %esp ;\ + movl a5, 16(%esp) ;\ + movl a4, 12(%esp) ;\ + movl a3, 8(%esp) ;\ + movl a2, 4(%esp) ;\ + movl a1, (%esp) ;\ + call EXT(fn) ;\ + movl %esi, %esp + +#elif defined(__x86_64__) + +/* This variant exists to permit adjustment of the stack by "dtrace" */ +#define CCALLWITHSP(fn) \ + mov %rsp, %r12 ;\ + sub $8, %rsp ;\ + and $0xFFFFFFFFFFFFFFF0, %rsp ;\ + mov %r12, (%rsp) ;\ + leaq (%rsp), %rsi ;\ + call EXT(fn) ;\ + mov (%rsp), %rsp + +#define CCALL(fn) \ + mov %rsp, %r12 ;\ + and $0xFFFFFFFFFFFFFFF0, %rsp ;\ + call EXT(fn) ;\ + mov %r12, %rsp + +#define CCALL1(fn, arg1) \ + mov arg1, %rdi ;\ + CCALL(fn) + +#define CCALL2(fn, arg1, arg2) /* arg1 -> %rdi, then arg2 -> %rsi; arg2 must not be %rdi */ \ + mov arg1, %rdi ; mov arg2, %rsi ;\ + CCALL(fn) + +#define CCALL3(fn, arg1, arg2, arg3) \ + mov arg1, %rdi ;\ + mov arg2, %rsi ;\ + mov arg3, %rdx ;\ + CCALL(fn) + +#else +#error unsupported architecture +#endif + #endif /* _I386_ASM_H_ */ diff --git a/osfmk/i386/bsd_i386.c b/osfmk/i386/bsd_i386.c index 4b933d763..57b222a14 100644 --- a/osfmk/i386/bsd_i386.c 
+++ b/osfmk/i386/bsd_i386.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -73,25 +73,6 @@ extern void mach_kauth_cred_uthread_update(void); #endif -kern_return_t -thread_userstack( - thread_t, - int, - thread_state_t, - unsigned int, - mach_vm_offset_t *, - int * -); - -kern_return_t -thread_entrypoint( - thread_t, - int, - thread_state_t, - unsigned int, - mach_vm_offset_t * -); - void * find_user_regs(thread_t); unsigned int get_msr_exportmask(void); @@ -100,8 +81,7 @@ unsigned int get_msr_nbits(void); unsigned int get_msr_rbits(void); -extern void throttle_lowpri_io(boolean_t); - +extern void throttle_lowpri_io(int); /* * thread_userstack: @@ -115,7 +95,7 @@ thread_userstack( int flavor, thread_state_t tstate, __unused unsigned int count, - user_addr_t *user_stack, + mach_vm_offset_t *user_stack, int *customstack ) { @@ -129,14 +109,15 @@ thread_userstack( state25 = (x86_thread_state32_t *) tstate; - if (state25->esp) + if (state25->esp) { *user_stack = state25->esp; - else + if (customstack) + *customstack = 1; + } else { *user_stack = VM_USRSTACK32; - if (customstack && state25->esp) - *customstack = 1; - else - *customstack = 0; + if (customstack) + *customstack = 0; + } break; } @@ -146,14 +127,15 @@ thread_userstack( state25 = (x86_thread_state64_t *) tstate; - if (state25->rsp) + if (state25->rsp) { *user_stack = state25->rsp; - else + if (customstack) + *customstack = 1; + } else { *user_stack = VM_USRSTACK64; - if (customstack && state25->rsp) - *customstack = 1; - else - *customstack = 0; + if (customstack) + *customstack = 0; + } break; } @@ -202,62 +184,6 @@ thread_entrypoint( return (KERN_SUCCESS); } -/* - * Duplicate parent state in child - * for U**X fork. 
- */ -kern_return_t -machine_thread_dup( - thread_t parent, - thread_t child -) -{ - - pcb_t parent_pcb; - pcb_t child_pcb; - - if ((child_pcb = child->machine.pcb) == NULL || - (parent_pcb = parent->machine.pcb) == NULL) - return (KERN_FAILURE); - /* - * Copy over the x86_saved_state registers - */ - if (cpu_mode_is64bit()) { - if (thread_is_64bit(parent)) - bcopy(USER_REGS64(parent), USER_REGS64(child), sizeof(x86_saved_state64_t)); - else - bcopy(USER_REGS32(parent), USER_REGS32(child), sizeof(x86_saved_state_compat32_t)); - } else - bcopy(USER_REGS32(parent), USER_REGS32(child), sizeof(x86_saved_state32_t)); - - /* - * Check to see if parent is using floating point - * and if so, copy the registers to the child - */ - fpu_dup_fxstate(parent, child); - -#ifdef MACH_BSD - /* - * Copy the parent's cthread id and USER_CTHREAD descriptor, if 32-bit. - */ - child_pcb->cthread_self = parent_pcb->cthread_self; - if (!thread_is_64bit(parent)) - child_pcb->cthread_desc = parent_pcb->cthread_desc; - - /* - * FIXME - should a user specified LDT, TSS and V86 info - * be duplicated as well?? - probably not. - */ - // duplicate any use LDT entry that was set I think this is appropriate. 
- if (parent_pcb->uldt_selector!= 0) { - child_pcb->uldt_selector = parent_pcb->uldt_selector; - child_pcb->uldt_desc = parent_pcb->uldt_desc; - } -#endif - - return (KERN_SUCCESS); -} - /* * FIXME - thread_set_child */ @@ -266,6 +192,7 @@ void thread_set_child(thread_t child, int pid); void thread_set_child(thread_t child, int pid) { + pal_register_cache_state(child, DIRTY); if (thread_is_64bit(child)) { x86_saved_state64_t *iss64; @@ -287,31 +214,6 @@ thread_set_child(thread_t child, int pid) } -void thread_set_parent(thread_t parent, int pid); - -void -thread_set_parent(thread_t parent, int pid) -{ - - if (thread_is_64bit(parent)) { - x86_saved_state64_t *iss64; - - iss64 = USER_REGS64(parent); - - iss64->rax = pid; - iss64->rdx = 0; - iss64->isf.rflags &= ~EFL_CF; - } else { - x86_saved_state32_t *iss32; - - iss32 = USER_REGS32(parent); - - iss32->eax = pid; - iss32->edx = 0; - iss32->efl &= ~EFL_CF; - } -} - /* * System Call handling code @@ -449,142 +351,6 @@ machdep_syscall64(x86_saved_state_t *state) /* NOTREACHED */ } -/* - * thread_fast_set_cthread_self: Sets the machine kernel thread ID of the - * current thread to the given thread ID; fast version for 32-bit processes - * - * Parameters: self Thread ID to set - * - * Returns: 0 Success - * !0 Not success - */ -kern_return_t -thread_fast_set_cthread_self(uint32_t self) -{ - thread_t thread = current_thread(); - pcb_t pcb = thread->machine.pcb; - struct real_descriptor desc = { - .limit_low = 1, - .limit_high = 0, - .base_low = self & 0xffff, - .base_med = (self >> 16) & 0xff, - .base_high = (self >> 24) & 0xff, - .access = ACC_P|ACC_PL_U|ACC_DATA_W, - .granularity = SZ_32|SZ_G, - }; - - current_thread()->machine.pcb->cthread_self = (uint64_t) self; /* preserve old func too */ - - /* assign descriptor */ - mp_disable_preemption(); - pcb->cthread_desc = desc; - *ldt_desc_p(USER_CTHREAD) = desc; - saved_state32(pcb->iss)->gs = USER_CTHREAD; - mp_enable_preemption(); - - return (USER_CTHREAD); -} - -/* - * 
thread_fast_set_cthread_self64: Sets the machine kernel thread ID of the - * current thread to the given thread ID; fast version for 64-bit processes - * - * Parameters: self Thread ID - * - * Returns: 0 Success - * !0 Not success - */ -kern_return_t -thread_fast_set_cthread_self64(uint64_t self) -{ - pcb_t pcb = current_thread()->machine.pcb; - cpu_data_t *cdp; - - /* check for canonical address, set 0 otherwise */ - if (!IS_USERADDR64_CANONICAL(self)) - self = 0ULL; - - pcb->cthread_self = self; - mp_disable_preemption(); - cdp = current_cpu_datap(); -#if defined(__x86_64__) - if ((cdp->cpu_uber.cu_user_gs_base != pcb->cthread_self) || - (pcb->cthread_self != rdmsr64(MSR_IA32_KERNEL_GS_BASE))) - wrmsr64(MSR_IA32_KERNEL_GS_BASE, self); -#endif - cdp->cpu_uber.cu_user_gs_base = self; - mp_enable_preemption(); - return (USER_CTHREAD); -} - -/* - * thread_set_user_ldt routine is the interface for the user level - * settable ldt entry feature. allowing a user to create arbitrary - * ldt entries seems to be too large of a security hole, so instead - * this mechanism is in place to allow user level processes to have - * an ldt entry that can be used in conjunction with the FS register. - * - * Swapping occurs inside the pcb.c file along with initialization - * when a thread is created. The basic functioning theory is that the - * pcb->uldt_selector variable will contain either 0 meaning the - * process has not set up any entry, or the selector to be used in - * the FS register. pcb->uldt_desc contains the actual descriptor the - * user has set up stored in machine usable ldt format. - * - * Currently one entry is shared by all threads (USER_SETTABLE), but - * this could be changed in the future by changing how this routine - * allocates the selector. There seems to be no real reason at this - * time to have this added feature, but in the future it might be - * needed. 
- * - * address is the linear address of the start of the data area size - * is the size in bytes of the area flags should always be set to 0 - * for now. in the future it could be used to set R/W permisions or - * other functions. Currently the segment is created as a data segment - * up to 1 megabyte in size with full read/write permisions only. - * - * this call returns the segment selector or -1 if any error occurs - */ -kern_return_t -thread_set_user_ldt(uint32_t address, uint32_t size, uint32_t flags) -{ - pcb_t pcb; - struct fake_descriptor temp; - int mycpu; - - if (flags != 0) - return -1; // flags not supported - if (size > 0xFFFFF) - return -1; // size too big, 1 meg is the limit - - mp_disable_preemption(); - mycpu = cpu_number(); - - // create a "fake" descriptor so we can use fix_desc() - // to build a real one... - // 32 bit default operation size - // standard read/write perms for a data segment - pcb = (pcb_t)current_thread()->machine.pcb; - temp.offset = address; - temp.lim_or_seg = size; - temp.size_or_wdct = SZ_32; - temp.access = ACC_P|ACC_PL_U|ACC_DATA_W; - - // turn this into a real descriptor - fix_desc(&temp,1); - - // set up our data in the pcb - pcb->uldt_desc = *(struct real_descriptor*)&temp; - pcb->uldt_selector = USER_SETTABLE; // set the selector value - - // now set it up in the current table... 
- *ldt_desc_p(USER_SETTABLE) = *(struct real_descriptor*)&temp; - - mp_enable_preemption(); - - return USER_SETTABLE; -} - #endif /* MACH_BSD */ @@ -791,6 +557,7 @@ thread_setuserstack( thread_t thread, mach_vm_address_t user_stack) { + pal_register_cache_state(thread, DIRTY); if (thread_is_64bit(thread)) { x86_saved_state64_t *iss64; @@ -817,6 +584,7 @@ thread_adjuserstack( thread_t thread, int adjust) { + pal_register_cache_state(thread, DIRTY); if (thread_is_64bit(thread)) { x86_saved_state64_t *iss64; @@ -845,6 +613,7 @@ thread_adjuserstack( void thread_setentrypoint(thread_t thread, mach_vm_address_t entry) { + pal_register_cache_state(thread, DIRTY); if (thread_is_64bit(thread)) { x86_saved_state64_t *iss64; @@ -864,6 +633,7 @@ thread_setentrypoint(thread_t thread, mach_vm_address_t entry) kern_return_t thread_setsinglestep(thread_t thread, int on) { + pal_register_cache_state(thread, DIRTY); if (thread_is_64bit(thread)) { x86_saved_state64_t *iss64; @@ -897,18 +667,15 @@ thread_setsinglestep(thread_t thread, int on) void * find_user_regs(thread_t thread) { + pal_register_cache_state(thread, DIRTY); return USER_STATE(thread); } void * get_user_regs(thread_t th) { - if (th->machine.pcb) - return(USER_STATE(th)); - else { - printf("[get_user_regs: thread does not have pcb]"); - return NULL; - } + pal_register_cache_state(th, DIRTY); + return(USER_STATE(th)); } #if CONFIG_DTRACE diff --git a/osfmk/i386/bsd_i386_native.c b/osfmk/i386/bsd_i386_native.c new file mode 100644 index 000000000..13a7cb0aa --- /dev/null +++ b/osfmk/i386/bsd_i386_native.c @@ -0,0 +1,283 @@ +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include <../bsd/sys/sysent.h> + + +/* + * Duplicate parent state in child + * for U**X fork. 
+ */ +kern_return_t +machine_thread_dup( + thread_t parent, + thread_t child +) +{ + + pcb_t parent_pcb = THREAD_TO_PCB(parent); + pcb_t child_pcb = THREAD_TO_PCB(child); + + /* + * Copy over the x86_saved_state registers + */ + if (cpu_mode_is64bit()) { + if (thread_is_64bit(parent)) + bcopy(USER_REGS64(parent), USER_REGS64(child), sizeof(x86_saved_state64_t)); + else + bcopy(USER_REGS32(parent), USER_REGS32(child), sizeof(x86_saved_state_compat32_t)); + } else + bcopy(USER_REGS32(parent), USER_REGS32(child), sizeof(x86_saved_state32_t)); + + /* + * Check to see if parent is using floating point + * and if so, copy the registers to the child + */ + fpu_dup_fxstate(parent, child); + +#ifdef MACH_BSD + /* + * Copy the parent's cthread id and USER_CTHREAD descriptor, if 32-bit. + */ + child_pcb->cthread_self = parent_pcb->cthread_self; + if (!thread_is_64bit(parent)) + child_pcb->cthread_desc = parent_pcb->cthread_desc; + + /* + * FIXME - should a user specified LDT, TSS and V86 info + * be duplicated as well?? - probably not. + */ + // duplicate any use LDT entry that was set I think this is appropriate. 
+ if (parent_pcb->uldt_selector!= 0) { + child_pcb->uldt_selector = parent_pcb->uldt_selector; + child_pcb->uldt_desc = parent_pcb->uldt_desc; + } +#endif + + return (KERN_SUCCESS); +} + +void thread_set_parent(thread_t parent, int pid); + +void +thread_set_parent(thread_t parent, int pid) +{ + pal_register_cache_state(parent, DIRTY); + + if (thread_is_64bit(parent)) { + x86_saved_state64_t *iss64; + + iss64 = USER_REGS64(parent); + + iss64->rax = pid; + iss64->rdx = 0; + iss64->isf.rflags &= ~EFL_CF; + } else { + x86_saved_state32_t *iss32; + + iss32 = USER_REGS32(parent); + + iss32->eax = pid; + iss32->edx = 0; + iss32->efl &= ~EFL_CF; + } +} + +/* + * thread_fast_set_cthread_self: Sets the machine kernel thread ID of the + * current thread to the given thread ID; fast version for 32-bit processes + * + * Parameters: self Thread ID to set + * + * Returns: 0 Success + * !0 Not success + */ +kern_return_t +thread_fast_set_cthread_self(uint32_t self) +{ + thread_t thread = current_thread(); + pcb_t pcb = THREAD_TO_PCB(thread); + struct real_descriptor desc = { + .limit_low = 1, + .limit_high = 0, + .base_low = self & 0xffff, + .base_med = (self >> 16) & 0xff, + .base_high = (self >> 24) & 0xff, + .access = ACC_P|ACC_PL_U|ACC_DATA_W, + .granularity = SZ_32|SZ_G, + }; + + current_thread()->machine.cthread_self = (uint64_t) self; /* preserve old func too */ + + /* assign descriptor */ + mp_disable_preemption(); + pcb->cthread_desc = desc; + *ldt_desc_p(USER_CTHREAD) = desc; + saved_state32(pcb->iss)->gs = USER_CTHREAD; + mp_enable_preemption(); + + return (USER_CTHREAD); +} + +/* + * thread_fast_set_cthread_self64: Sets the machine kernel thread ID of the + * current thread to the given thread ID; fast version for 64-bit processes + * + * Parameters: self Thread ID + * + * Returns: 0 Success + * !0 Not success + */ +kern_return_t +thread_fast_set_cthread_self64(uint64_t self) +{ + pcb_t pcb = THREAD_TO_PCB(current_thread()); + cpu_data_t *cdp; + + /* check for 
canonical address, set 0 otherwise */ + if (!IS_USERADDR64_CANONICAL(self)) + self = 0ULL; + + pcb->cthread_self = self; + mp_disable_preemption(); + cdp = current_cpu_datap(); +#if defined(__x86_64__) + if ((cdp->cpu_uber.cu_user_gs_base != pcb->cthread_self) || + (pcb->cthread_self != rdmsr64(MSR_IA32_KERNEL_GS_BASE))) + wrmsr64(MSR_IA32_KERNEL_GS_BASE, self); +#endif + cdp->cpu_uber.cu_user_gs_base = self; + mp_enable_preemption(); + return (USER_CTHREAD); /* N.B.: not a kern_return_t! */ +} + +/* + * thread_set_user_ldt routine is the interface for the user level + * settable ldt entry feature. allowing a user to create arbitrary + * ldt entries seems to be too large of a security hole, so instead + * this mechanism is in place to allow user level processes to have + * an ldt entry that can be used in conjunction with the FS register. + * + * Swapping occurs inside the pcb.c file along with initialization + * when a thread is created. The basic functioning theory is that the + * pcb->uldt_selector variable will contain either 0 meaning the + * process has not set up any entry, or the selector to be used in + * the FS register. pcb->uldt_desc contains the actual descriptor the + * user has set up stored in machine usable ldt format. + * + * Currently one entry is shared by all threads (USER_SETTABLE), but + * this could be changed in the future by changing how this routine + * allocates the selector. There seems to be no real reason at this + * time to have this added feature, but in the future it might be + * needed. + * + * address is the linear address of the start of the data area. size + * is the size in bytes of the area. flags should always be set to 0 + * for now. in the future it could be used to set R/W permissions or + * other functions. Currently the segment is created as a data segment + * up to 1 megabyte in size with full read/write permissions only. 
+ * + * this call returns the segment selector or -1 if any error occurs + */ +kern_return_t +thread_set_user_ldt(uint32_t address, uint32_t size, uint32_t flags) +{ + pcb_t pcb; + struct fake_descriptor temp; + + if (flags != 0) + return -1; // flags not supported + if (size > 0xFFFFF) + return -1; // size too big, 1 meg is the limit + + mp_disable_preemption(); + + // create a "fake" descriptor so we can use fix_desc() + // to build a real one... + // 32 bit default operation size + // standard read/write perms for a data segment + pcb = THREAD_TO_PCB(current_thread()); + temp.offset = address; + temp.lim_or_seg = size; + temp.size_or_wdct = SZ_32; + temp.access = ACC_P|ACC_PL_U|ACC_DATA_W; + + // turn this into a real descriptor + fix_desc(&temp,1); + + // set up our data in the pcb + pcb->uldt_desc = *(struct real_descriptor*)&temp; + pcb->uldt_selector = USER_SETTABLE; // set the selector value + + // now set it up in the current table... + *ldt_desc_p(USER_SETTABLE) = *(struct real_descriptor*)&temp; + + mp_enable_preemption(); + + return USER_SETTABLE; +} diff --git a/osfmk/i386/bzero.s b/osfmk/i386/bzero.s index 034a6469c..cb6a0536b 100644 --- a/osfmk/i386/bzero.s +++ b/osfmk/i386/bzero.s @@ -90,7 +90,7 @@ ENTRY(memset) * void bzero(char * addr, size_t length) */ Entry(blkclr) -ENTRY(bzero) +ENTRY2(bzero,__bzero) pushl %edi movl 4+ 4(%esp),%edi /* addr */ movl 4+ 8(%esp),%edx /* length */ diff --git a/osfmk/i386/commpage/atomic.s b/osfmk/i386/commpage/atomic.s deleted file mode 100644 index 769698b0f..000000000 --- a/osfmk/i386/commpage/atomic.s +++ /dev/null @@ -1,396 +0,0 @@ -/* - * Copyright (c) 2004-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include - -/* OSAtomic.h library native implementations. */ - -// This is a regparm(3) subroutine used by: - -// bool OSAtomicCompareAndSwap32( int32_t old, int32_t new, int32_t *value); -// int32_t OSAtomicAnd32( int32_t mask, int32_t *value); -// int32_t OSAtomicOr32( int32_t mask, int32_t *value); -// int32_t OSAtomicXor32( int32_t mask, int32_t *value); - -// It assumes old -> %eax, new -> %edx, value -> %ecx -// on success: returns with ZF set -// on failure: returns with *value in %eax, ZF clear - -// The first word of the routine contains the address of the first instruction, -// so callers can pass parameters in registers by using the absolute: - -// call *_COMPARE_AND_SWAP32 - -// TODO: move the .long onto a separate page to reduce icache pollution (?) 
- -COMMPAGE_FUNCTION_START(compare_and_swap32_mp, 32, 4) -.long _COMM_PAGE_COMPARE_AND_SWAP32+4 - lock - cmpxchgl %edx, (%ecx) - ret -COMMPAGE_DESCRIPTOR(compare_and_swap32_mp,_COMM_PAGE_COMPARE_AND_SWAP32,0,kUP) - -COMMPAGE_FUNCTION_START(compare_and_swap32_up, 32, 4) -.long _COMM_PAGE_COMPARE_AND_SWAP32+4 - cmpxchgl %edx, (%ecx) - ret -COMMPAGE_DESCRIPTOR(compare_and_swap32_up,_COMM_PAGE_COMPARE_AND_SWAP32,kUP,0) - -// This is a subroutine used by: -// bool OSAtomicCompareAndSwap64( int64_t old, int64_t new, int64_t *value); - -// It assumes old -> %eax/%edx, new -> %ebx/%ecx, value -> %esi -// on success: returns with ZF set -// on failure: returns with *value in %eax/%edx, ZF clear - -COMMPAGE_FUNCTION_START(compare_and_swap64_mp, 32, 4) -.long _COMM_PAGE_COMPARE_AND_SWAP64+4 - lock - cmpxchg8b (%esi) - ret -COMMPAGE_DESCRIPTOR(compare_and_swap64_mp,_COMM_PAGE_COMPARE_AND_SWAP64,0,kUP) - -COMMPAGE_FUNCTION_START(compare_and_swap64_up, 32, 4) -.long _COMM_PAGE_COMPARE_AND_SWAP64+4 - cmpxchg8b (%esi) - ret -COMMPAGE_DESCRIPTOR(compare_and_swap64_up,_COMM_PAGE_COMPARE_AND_SWAP64,kUP,0) - -// This is a subroutine used by: -// bool OSAtomicTestAndSet( uint32_t n, void *value ); -// It assumes n -> %eax, value -> %edx - -// Returns: old value of bit in CF - -COMMPAGE_FUNCTION_START(bit_test_and_set_mp, 32, 4) -.long _COMM_PAGE_BTS+4 - lock - btsl %eax, (%edx) - ret -COMMPAGE_DESCRIPTOR(bit_test_and_set_mp,_COMM_PAGE_BTS,0,kUP) - -COMMPAGE_FUNCTION_START(bit_test_and_set_up, 32, 4) -.long _COMM_PAGE_BTS+4 - btsl %eax, (%edx) - ret -COMMPAGE_DESCRIPTOR(bit_test_and_set_up,_COMM_PAGE_BTS,kUP,0) - -// This is a subroutine used by: -// bool OSAtomicTestAndClear( uint32_t n, void *value ); -// It assumes n -> %eax, value -> %edx - -// Returns: old value of bit in CF - -COMMPAGE_FUNCTION_START(bit_test_and_clear_mp, 32, 4) -.long _COMM_PAGE_BTC+4 - lock - btrl %eax, (%edx) - ret -COMMPAGE_DESCRIPTOR(bit_test_and_clear_mp,_COMM_PAGE_BTC,0,kUP) - 
-COMMPAGE_FUNCTION_START(bit_test_and_clear_up, 32, 4) -.long _COMM_PAGE_BTC+4 - btrl %eax, (%edx) - ret -COMMPAGE_DESCRIPTOR(bit_test_and_clear_up,_COMM_PAGE_BTC,kUP,0) - -// This is a subroutine used by: -// int32_t OSAtomicAdd32( int32_t amt, int32_t *value ); -// It assumes amt -> %eax, value -> %edx - -// Returns: old value in %eax -// NB: OSAtomicAdd32 returns the new value, so clients will add amt to %eax - -COMMPAGE_FUNCTION_START(atomic_add32_mp, 32, 4) -.long _COMM_PAGE_ATOMIC_ADD32+4 - lock - xaddl %eax, (%edx) - ret -COMMPAGE_DESCRIPTOR(atomic_add32_mp,_COMM_PAGE_ATOMIC_ADD32,0,kUP) - -COMMPAGE_FUNCTION_START(atomic_add32_up, 32, 4) -.long _COMM_PAGE_ATOMIC_ADD32+4 - xaddl %eax, (%edx) - ret -COMMPAGE_DESCRIPTOR(atomic_add32_up,_COMM_PAGE_ATOMIC_ADD32,kUP,0) - - -// OSMemoryBarrier() -// These are used both in 32 and 64-bit mode. We use a fence even on UP -// machines, so this function can be used with nontemporal stores. - -COMMPAGE_FUNCTION_START(memory_barrier, 32, 4) - lock - addl $0,(%esp) - ret -COMMPAGE_DESCRIPTOR(memory_barrier,_COMM_PAGE_MEMORY_BARRIER,0,kHasSSE2); - -COMMPAGE_FUNCTION_START(memory_barrier_sse2, 32, 4) - mfence - ret -COMMPAGE_DESCRIPTOR(memory_barrier_sse2,_COMM_PAGE_MEMORY_BARRIER,kHasSSE2,0); - - -/* - * typedef volatile struct { - * void *opaque1; <-- ptr to 1st queue element or null - * long opaque2; <-- generation count - * } OSQueueHead; - * - * void OSAtomicEnqueue( OSQueueHead *list, void *new, size_t offset); - */ - -COMMPAGE_FUNCTION_START(AtomicEnqueue, 32, 4) - pushl %edi - pushl %esi - pushl %ebx - movl 16(%esp),%edi // %edi == ptr to list head - movl 20(%esp),%ebx // %ebx == new - movl 24(%esp),%esi // %esi == offset - movl (%edi),%eax // %eax == ptr to 1st element in Q - movl 4(%edi),%edx // %edx == current generation count -1: - movl %eax,(%ebx,%esi)// link to old list head from new element - movl %edx,%ecx - incl %ecx // increment generation count - lock // always lock for now... 
- cmpxchg8b (%edi) // ...push on new element - jnz 1b - popl %ebx - popl %esi - popl %edi - ret -COMMPAGE_DESCRIPTOR(AtomicEnqueue,_COMM_PAGE_ENQUEUE,0,0) - - -/* void* OSAtomicDequeue( OSQueueHead *list, size_t offset); */ - -COMMPAGE_FUNCTION_START(AtomicDequeue, 32, 4) - pushl %edi - pushl %esi - pushl %ebx - movl 16(%esp),%edi // %edi == ptr to list head - movl 20(%esp),%esi // %esi == offset - movl (%edi),%eax // %eax == ptr to 1st element in Q - movl 4(%edi),%edx // %edx == current generation count -1: - testl %eax,%eax // list empty? - jz 2f // yes - movl (%eax,%esi),%ebx // point to 2nd in Q - movl %edx,%ecx - incl %ecx // increment generation count - lock // always lock for now... - cmpxchg8b (%edi) // ...pop off 1st element - jnz 1b -2: - popl %ebx - popl %esi - popl %edi - ret // ptr to 1st element in Q still in %eax -COMMPAGE_DESCRIPTOR(AtomicDequeue,_COMM_PAGE_DEQUEUE,0,0) - - - -/************************* x86_64 versions follow **************************/ - - -// This is a subroutine used by: - -// bool OSAtomicCompareAndSwap32( int32_t old, int32_t new, int32_t *value); -// int32_t OSAtomicAnd32( int32_t mask, int32_t *value); -// int32_t OSAtomicOr32( int32_t mask, int32_t *value); -// int32_t OSAtomicXor32( int32_t mask, int32_t *value); - -// It assumes: old -> %rdi (ie, it follows the ABI parameter conventions) -// new -> %rsi -// value -> %rdx -// on success: returns with ZF set -// on failure: returns with *value in %eax, ZF clear - -COMMPAGE_FUNCTION_START(compare_and_swap32_mp_64, 64, 4) - movl %edi,%eax // put old value where "cmpxchg" wants it - lock - cmpxchgl %esi, (%rdx) - ret -COMMPAGE_DESCRIPTOR(compare_and_swap32_mp_64,_COMM_PAGE_COMPARE_AND_SWAP32,0,kUP) - -COMMPAGE_FUNCTION_START(compare_and_swap32_up_64, 64, 4) - movl %edi,%eax // put old value where "cmpxchg" wants it - cmpxchgl %esi, (%rdx) - ret -COMMPAGE_DESCRIPTOR(compare_and_swap32_up_64,_COMM_PAGE_COMPARE_AND_SWAP32,kUP,0) - -// This is a subroutine used by: -// bool 
OSAtomicCompareAndSwap64( int64_t old, int64_t new, int64_t *value); - -// It assumes: old -> %rdi (ie, it follows the ABI parameter conventions) -// new -> %rsi -// value -> %rdx -// on success: returns with ZF set -// on failure: returns with *value in %rax, ZF clear - -COMMPAGE_FUNCTION_START(compare_and_swap64_mp_64, 64, 4) - movq %rdi,%rax // put old value where "cmpxchg" wants it - lock - cmpxchgq %rsi, (%rdx) - ret -COMMPAGE_DESCRIPTOR(compare_and_swap64_mp_64,_COMM_PAGE_COMPARE_AND_SWAP64,0,kUP) - -COMMPAGE_FUNCTION_START(compare_and_swap64_up_64, 64, 4) - movq %rdi,%rax // put old value where "cmpxchg" wants it - cmpxchgq %rsi, (%rdx) - ret -COMMPAGE_DESCRIPTOR(compare_and_swap64_up_64,_COMM_PAGE_COMPARE_AND_SWAP64,kUP,0) - -// This is a subroutine used by: -// bool OSAtomicTestAndSet( uint32_t n, void *value ); -// It is called with standard register conventions: -// n = %rdi -// value = %rsi -// Returns: old value of bit in CF - -COMMPAGE_FUNCTION_START(bit_test_and_set_mp_64, 64, 4) - lock - btsl %edi, (%rsi) - ret -COMMPAGE_DESCRIPTOR(bit_test_and_set_mp_64,_COMM_PAGE_BTS,0,kUP) - -COMMPAGE_FUNCTION_START(bit_test_and_set_up_64, 64, 4) - btsl %edi, (%rsi) - ret -COMMPAGE_DESCRIPTOR(bit_test_and_set_up_64,_COMM_PAGE_BTS,kUP,0) - -// This is a subroutine used by: -// bool OSAtomicTestAndClear( uint32_t n, void *value ); -// It is called with standard register conventions: -// n = %rdi -// value = %rsi -// Returns: old value of bit in CF - -COMMPAGE_FUNCTION_START(bit_test_and_clear_mp_64, 64, 4) - lock - btrl %edi, (%rsi) - ret -COMMPAGE_DESCRIPTOR(bit_test_and_clear_mp_64,_COMM_PAGE_BTC,0,kUP) - -COMMPAGE_FUNCTION_START(bit_test_and_clear_up_64, 64, 4) - btrl %edi, (%rsi) - ret -COMMPAGE_DESCRIPTOR(bit_test_and_clear_up_64,_COMM_PAGE_BTC,kUP,0) - -// This is a subroutine used by: -// int32_t OSAtomicAdd32( int32_t amt, int32_t *value ); -// It is called with standard register conventions: -// amt = %rdi -// value = %rsi -// Returns: old value in %edi 
-// NB: OSAtomicAdd32 returns the new value, so clients will add amt to %edi - -COMMPAGE_FUNCTION_START(atomic_add32_mp_64, 64, 4) - lock - xaddl %edi, (%rsi) - ret -COMMPAGE_DESCRIPTOR(atomic_add32_mp_64,_COMM_PAGE_ATOMIC_ADD32,0,kUP) - -COMMPAGE_FUNCTION_START(atomic_add32_up_64, 64, 4) - xaddl %edi, (%rsi) - ret -COMMPAGE_DESCRIPTOR(atomic_add32_up_64,_COMM_PAGE_ATOMIC_ADD32,kUP,0) - -// This is a subroutine used by: -// int64_t OSAtomicAdd64( int64_t amt, int64_t *value ); -// It is called with standard register conventions: -// amt = %rdi -// value = %rsi -// Returns: old value in %rdi -// NB: OSAtomicAdd64 returns the new value, so clients will add amt to %rdi - -COMMPAGE_FUNCTION_START(atomic_add64_mp_64, 64, 4) - lock - xaddq %rdi, (%rsi) - ret -COMMPAGE_DESCRIPTOR(atomic_add64_mp_64,_COMM_PAGE_ATOMIC_ADD64,0,kUP) - -COMMPAGE_FUNCTION_START(atomic_add64_up_64, 64, 4) - xaddq %rdi, (%rsi) - ret -COMMPAGE_DESCRIPTOR(atomic_add64_up_64,_COMM_PAGE_ATOMIC_ADD64,kUP,0) - - -/* - * typedef volatile struct { - * void *opaque1; <-- ptr to 1st queue element or null - * long opaque2; <-- generation count - * } OSQueueHead; - * - * void OSAtomicEnqueue( OSQueueHead *list, void *new, size_t offset); - */ - -// %rdi == list head, %rsi == new, %rdx == offset - -COMMPAGE_FUNCTION_START(AtomicEnqueue_64, 64, 4) - pushq %rbx - movq %rsi,%rbx // %rbx == new - movq %rdx,%rsi // %rsi == offset - movq (%rdi),%rax // %rax == ptr to 1st element in Q - movq 8(%rdi),%rdx // %rdx == current generation count -1: - movq %rax,(%rbx,%rsi)// link to old list head from new element - movq %rdx,%rcx - incq %rcx // increment generation count - lock // always lock for now... 
- cmpxchg16b (%rdi) // ...push on new element - jnz 1b - popq %rbx - ret -COMMPAGE_DESCRIPTOR(AtomicEnqueue_64,_COMM_PAGE_ENQUEUE,0,0) - - -/* void* OSAtomicDequeue( OSQueueHead *list, size_t offset); */ - -// %rdi == list head, %rsi == offset - -COMMPAGE_FUNCTION_START(AtomicDequeue_64, 64, 4) - pushq %rbx - movq (%rdi),%rax // %rax == ptr to 1st element in Q - movq 8(%rdi),%rdx // %rdx == current generation count -1: - testq %rax,%rax // list empty? - jz 2f // yes - movq (%rax,%rsi),%rbx // point to 2nd in Q - movq %rdx,%rcx - incq %rcx // increment generation count - lock // always lock for now... - cmpxchg16b (%rdi) // ...pop off 1st element - jnz 1b -2: - popq %rbx - ret // ptr to 1st element in Q still in %rax -COMMPAGE_DESCRIPTOR(AtomicDequeue_64,_COMM_PAGE_DEQUEUE,0,0) diff --git a/osfmk/i386/commpage/bcopy_scalar.s b/osfmk/i386/commpage/bcopy_scalar.s deleted file mode 100644 index f87242ac6..000000000 --- a/osfmk/i386/commpage/bcopy_scalar.s +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2003-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/*- - * Copyright (c) 1990 The Regents of the University of California. - * All rights reserved. - * - * This code is derived from locore.s. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include -#include -#include -#include - - /* - * (ov)bcopy (src,dst,cnt) - * ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800 - */ - -COMMPAGE_FUNCTION_START(bcopy_scalar, 32, 5) - pushl %ebp /* set up a frame for backtraces */ - movl %esp,%ebp - pushl %esi - pushl %edi - movl 8(%ebp),%esi - movl 12(%ebp),%edi - jmp 1f -/* - * These need to be 32 bytes from Lbcopy_scalar - */ - .align 5, 0x90 -Lmemcpy_scalar: -Lmemmove_scalar: - pushl %ebp /* set up a frame for backtraces */ - movl %esp,%ebp - pushl %esi - pushl %edi - movl 8(%ebp),%edi - movl 12(%ebp),%esi - movl %edi,%eax -1: - movl 16(%ebp),%ecx - movl %edi,%edx - subl %esi,%edx - cmpl %ecx,%edx /* overlapping? */ - jb 2f - cld /* nope, copy forwards. */ - movl %ecx,%edx - shrl $2,%ecx /* copy by words */ - rep - movsl - movl %edx,%ecx - andl $3,%ecx /* any bytes left? */ - rep - movsb - popl %edi - popl %esi - popl %ebp - ret -2: - addl %ecx,%edi /* copy backwards. */ - addl %ecx,%esi - std - movl %ecx,%edx - andl $3,%ecx /* any fractional bytes? 
*/ - decl %edi - decl %esi - rep - movsb - movl %edx,%ecx /* copy remainder by words */ - shrl $2,%ecx - subl $3,%esi - subl $3,%edi - rep - movsl - popl %edi - popl %esi - popl %ebp - cld - ret - -COMMPAGE_DESCRIPTOR(bcopy_scalar,_COMM_PAGE_BCOPY,0,kHasSSE2+kHasSupplementalSSE3) diff --git a/osfmk/i386/commpage/bcopy_sse2.s b/osfmk/i386/commpage/bcopy_sse2.s deleted file mode 100644 index 9e19b3892..000000000 --- a/osfmk/i386/commpage/bcopy_sse2.s +++ /dev/null @@ -1,473 +0,0 @@ -/* - * Copyright (c) 2005-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include - -/* - * The bcopy/memcpy loops, tuned for Pentium-M class processors with SSE2 - * and 64-byte cache lines, such as Core and Core 2. - * - * The following #defines are tightly coupled to the u-architecture: - */ - -#define kShort 80 // too short to bother with SSE (must be >=80) -#define kVeryLong (500*1024) // large enough for non-temporal stores (must be >= 8192) -#define kBigChunk (256*1024) // outer loop chunk size for kVeryLong sized operands -#define kFastUCode (16*1024) // cutoff for microcode fastpath for "rep/movsl" - - -// void bcopy(const void *src, void *dst, size_t len); - -COMMPAGE_FUNCTION_START(bcopy_sse2, 32, 5) - pushl %ebp // set up a frame for backtraces - movl %esp,%ebp - pushl %esi - pushl %edi - movl 8(%ebp),%esi // get source ptr - movl 12(%ebp),%edi // get dest ptr - jmp Ljoin - -// -// void *memcpy(void *dst, const void *src, size_t len); -// void *memmove(void *dst, const void *src, size_t len); -// -// NB: These need to be 32 bytes from bcopy(): -// - - .align 5, 0x90 -Lmemcpy: // void *memcpy(void *dst, const void *src, size_t len) -Lmemmove: // void *memmove(void *dst, const void *src, size_t len) - pushl %ebp // set up a frame for backtraces - movl %esp,%ebp - pushl %esi - pushl %edi - movl 8(%ebp),%edi // get dest ptr - movl 12(%ebp),%esi // get source ptr - -Ljoin: // here from bcopy() with esi and edi loaded - movl 16(%ebp),%ecx // get length - movl %edi,%edx - subl %esi,%edx // (dest - source) - cmpl %ecx,%edx // must move in reverse if (dest - source) < length - jb LReverseIsland -Lrejoin: // here from very-long-operand copies - cmpl $(kShort),%ecx // long enough to bother with SSE? - ja LNotShort // yes - -// Handle short forward copies. As the most common case, this is the fall-through path. 
-// ecx = length (<= kShort) -// esi = source ptr -// edi = dest ptr - -Lshort: - movl %ecx,%edx // copy length - shrl $2,%ecx // get #doublewords - jz LLeftovers -2: // loop copying doublewords - movl (%esi),%eax - addl $4,%esi - movl %eax,(%edi) - addl $4,%edi - dec %ecx - jnz 2b -LLeftovers: // handle leftover bytes (0..3) in last word - andl $3,%edx // any leftover bytes? - jz 5f -4: // loop copying bytes - movb (%esi),%al - inc %esi - movb %al,(%edi) - inc %edi - dec %edx - jnz 4b -5: - movl 8(%ebp),%eax // get return value (dst ptr) for memcpy/memmove - popl %edi - popl %esi - popl %ebp - ret - - -LReverseIsland: // keep the "jb" above a short branch... - jmp LReverse // ...because reverse moves are uncommon - - -// Handle forward moves that are long enough to justify use of SSE3. -// First, 16-byte align the destination. -// ecx = length (> kShort) -// esi = source ptr -// edi = dest ptr - -LNotShort: - cmpl $(kVeryLong),%ecx // long enough to justify heavyweight loops? - movl %edi,%edx // copy destination - jae LVeryLong // use very-long-operand path - negl %edx - andl $15,%edx // get #bytes to align destination - jz LDestAligned // already aligned - subl %edx,%ecx // decrement length -1: // loop copying 1..15 bytes - movb (%esi),%al - inc %esi - movb %al,(%edi) - inc %edi - dec %edx - jnz 1b - -// Destination is now aligned. Prepare for forward loops over 64-byte chunks. -// Since kShort>=80 and we've moved at most 15 bytes already, there is at least one chunk. - -LDestAligned: - movl %ecx,%edx // copy length - movl %ecx,%eax // twice - andl $63,%ecx // get remaining bytes for Lshort - andl $-64,%edx // get number of bytes we will copy in inner loop - addl %edx,%esi // point to 1st byte not copied - addl %edx,%edi - negl %edx // now generate offset to 1st byte to be copied - testl $15,%esi // is source aligned too? - jnz LUnalignedLoop // no - - - cmpl $(kFastUCode),%eax // long enough for the fastpath in microcode? 
- jb LAlignedLoop // no, use SSE - cld // we'll move forward - movl %eax,%ecx // copy length again - shrl $2,%ecx // compute #words to move - addl %edx,%esi // restore ptrs to 1st byte of source and dest - addl %edx,%edi - rep // the u-code will optimize this - movsl - movl %eax,%edx // original length - jmp LLeftovers // handle 0..3 leftover bytes - - -// Forward aligned loop for medium length operands (kShort < n < kVeryLong). - - .align 4,0x90 // 16-byte align inner loops -LAlignedLoop: // loop over 64-byte chunks - movdqa (%esi,%edx),%xmm0 - movdqa 16(%esi,%edx),%xmm1 - movdqa 32(%esi,%edx),%xmm2 - movdqa 48(%esi,%edx),%xmm3 - - movdqa %xmm0,(%edi,%edx) - movdqa %xmm1,16(%edi,%edx) - movdqa %xmm2,32(%edi,%edx) - movdqa %xmm3,48(%edi,%edx) - - addl $64,%edx - jnz LAlignedLoop - - jmp Lshort // copy remaining 0..15 bytes and done - - -// Forward unaligned loop for medium length operands (kShort < n < kVeryLong). -// Note that LDDQU==MOVDQU on these machines, ie we don't care when we cross -// source cache lines. - - .align 4,0x90 // 16-byte align inner loops -LUnalignedLoop: // loop over 64-byte chunks - movdqu (%esi,%edx),%xmm0 // the loads are unaligned - movdqu 16(%esi,%edx),%xmm1 - movdqu 32(%esi,%edx),%xmm2 - movdqu 48(%esi,%edx),%xmm3 - - movdqa %xmm0,(%edi,%edx) // we can use aligned stores - movdqa %xmm1,16(%edi,%edx) - movdqa %xmm2,32(%edi,%edx) - movdqa %xmm3,48(%edi,%edx) - - addl $64,%edx - jnz LUnalignedLoop - - jmp Lshort // copy remaining 0..63 bytes and done - - -// Very long forward moves. These are at least several pages, so we loop over big -// chunks of memory (kBigChunk in size.) We first prefetch the chunk, and then copy -// it using non-temporal stores. Hopefully all the reads occur in the prefetch loop, -// so the copy loop reads from L2 and writes directly to memory (with write combining.) -// This minimizes bus turnaround and maintains good DRAM page locality. 
-// Note that for this scheme to work, kVeryLong must be a large fraction of L2 cache -// size. Otherwise, it is counter-productive to bypass L2 on the stores. -// ecx = length (>= kVeryLong bytes) -// edi = dest (aligned) -// esi = source - -LVeryLong: - pushl %ebx // we'll need to use this - movl %edi,%ebx // copy dest ptr - negl %ebx - andl $63,%ebx // get #bytes to cache line align destination - jz LBigChunkLoop // already aligned - -// Cache line align destination, so temporal stores in copy loops work right. - - pushl %ecx // save total length remaining - pushl %ebx // arg3 - #bytes to align destination (1..63) - pushl %esi // arg2 - source - pushl %edi // arg1 - dest - call Lmemcpy // align the destination - movl 12(%esp),%ecx // recover total length - addl $16,%esp - addl %ebx,%esi // adjust ptrs and lengths past copy - addl %ebx,%edi - subl %ebx,%ecx - -// Loop over big chunks. -// ecx = length remaining (>= 4096) -// edi = dest (64-byte aligned) -// esi = source (may be unaligned) - -LBigChunkLoop: - movl $(kBigChunk),%edx // assume we can do a full chunk - cmpl %edx,%ecx // do we have a full chunk left to do? - cmovbl %ecx,%edx // if not, only move what we have left - andl $-4096,%edx // we work in page multiples - xor %eax,%eax // initialize chunk offset - jmp LTouchLoop - -// Because the source may be unaligned, we use byte loads to touch. 
-// ecx = length remaining (including this chunk) -// edi = ptr to start of dest chunk -// esi = ptr to start of source chunk -// edx = chunk length (multiples of pages) -// ebx = scratch reg used to read a byte of each cache line -// eax = chunk offset - - .align 4,0x90 // 16-byte align inner loops -LTouchLoop: - movzb (%esi,%eax),%ebx // touch line 0, 2, 4, or 6 of page - movzb 1*64(%esi,%eax),%ebx // touch line 1, 3, 5, or 7 - movzb 8*64(%esi,%eax),%ebx // touch line 8, 10, 12, or 14 - movzb 9*64(%esi,%eax),%ebx // etc - - movzb 16*64(%esi,%eax),%ebx - movzb 17*64(%esi,%eax),%ebx - movzb 24*64(%esi,%eax),%ebx - movzb 25*64(%esi,%eax),%ebx - - movzb 32*64(%esi,%eax),%ebx - movzb 33*64(%esi,%eax),%ebx - movzb 40*64(%esi,%eax),%ebx - movzb 41*64(%esi,%eax),%ebx - - movzb 48*64(%esi,%eax),%ebx - movzb 49*64(%esi,%eax),%ebx - movzb 56*64(%esi,%eax),%ebx - movzb 57*64(%esi,%eax),%ebx - - subl $-128,%eax // next slice of page (adding 128 w 8-bit immediate) - testl $512,%eax // done with this page? - jz LTouchLoop // no, next of four slices - addl $(4096-512),%eax // move on to next page - cmpl %eax,%edx // done with this chunk? - jnz LTouchLoop // no, do next page - -// The chunk has been pre-fetched, now copy it using non-temporal stores. -// There are two copy loops, depending on whether the source is 16-byte aligned -// or not. - - addl %edx,%esi // increment ptrs by chunk length - addl %edx,%edi - subl %edx,%ecx // adjust remaining length - negl %edx // prepare loop index (counts up to 0) - testl $15,%esi // is source 16-byte aligned? 
- jnz LVeryLongUnaligned // source is not aligned - jmp LVeryLongAligned - - .align 4,0x90 // 16-byte align inner loops -LVeryLongAligned: // aligned loop over 128-bytes - movdqa (%esi,%edx),%xmm0 - movdqa 16(%esi,%edx),%xmm1 - movdqa 32(%esi,%edx),%xmm2 - movdqa 48(%esi,%edx),%xmm3 - movdqa 64(%esi,%edx),%xmm4 - movdqa 80(%esi,%edx),%xmm5 - movdqa 96(%esi,%edx),%xmm6 - movdqa 112(%esi,%edx),%xmm7 - - movntdq %xmm0,(%edi,%edx) - movntdq %xmm1,16(%edi,%edx) - movntdq %xmm2,32(%edi,%edx) - movntdq %xmm3,48(%edi,%edx) - movntdq %xmm4,64(%edi,%edx) - movntdq %xmm5,80(%edi,%edx) - movntdq %xmm6,96(%edi,%edx) - movntdq %xmm7,112(%edi,%edx) - - subl $-128,%edx // add 128 with an 8-bit immediate - jnz LVeryLongAligned - jmp LVeryLongChunkEnd - - .align 4,0x90 // 16-byte align inner loops -LVeryLongUnaligned: // unaligned loop over 128-bytes - movdqu (%esi,%edx),%xmm0 - movdqu 16(%esi,%edx),%xmm1 - movdqu 32(%esi,%edx),%xmm2 - movdqu 48(%esi,%edx),%xmm3 - movdqu 64(%esi,%edx),%xmm4 - movdqu 80(%esi,%edx),%xmm5 - movdqu 96(%esi,%edx),%xmm6 - movdqu 112(%esi,%edx),%xmm7 - - movntdq %xmm0,(%edi,%edx) - movntdq %xmm1,16(%edi,%edx) - movntdq %xmm2,32(%edi,%edx) - movntdq %xmm3,48(%edi,%edx) - movntdq %xmm4,64(%edi,%edx) - movntdq %xmm5,80(%edi,%edx) - movntdq %xmm6,96(%edi,%edx) - movntdq %xmm7,112(%edi,%edx) - - subl $-128,%edx // add 128 with an 8-bit immediate - jnz LVeryLongUnaligned - -LVeryLongChunkEnd: - cmpl $4096,%ecx // at least another page to go? - jae LBigChunkLoop // yes - - sfence // required by non-temporal stores - popl %ebx - jmp Lrejoin // handle remaining (0..4095) bytes - - -// Reverse moves. -// ecx = length -// esi = source ptr -// edi = dest ptr - -LReverse: - addl %ecx,%esi // point to end of strings - addl %ecx,%edi - cmpl $(kShort),%ecx // long enough to bother with SSE? - ja LReverseNotShort // yes - -// Handle reverse short copies. 
-// ecx = length -// esi = one byte past end of source -// edi = one byte past end of dest - -LReverseShort: - movl %ecx,%edx // copy length - shrl $2,%ecx // #words - jz 3f -1: - subl $4,%esi - movl (%esi),%eax - subl $4,%edi - movl %eax,(%edi) - dec %ecx - jnz 1b -3: - andl $3,%edx // bytes? - jz 5f -4: - dec %esi - movb (%esi),%al - dec %edi - movb %al,(%edi) - dec %edx - jnz 4b -5: - movl 8(%ebp),%eax // get return value (dst ptr) for memcpy/memmove - popl %edi - popl %esi - popl %ebp - ret - -// Handle a reverse move long enough to justify using SSE. -// ecx = length -// esi = one byte past end of source -// edi = one byte past end of dest - -LReverseNotShort: - movl %edi,%edx // copy destination - andl $15,%edx // get #bytes to align destination - je LReverseDestAligned // already aligned - subl %edx,%ecx // adjust length -1: // loop copying 1..15 bytes - dec %esi - movb (%esi),%al - dec %edi - movb %al,(%edi) - dec %edx - jnz 1b - -// Destination is now aligned. Prepare for reverse loops. - -LReverseDestAligned: - movl %ecx,%edx // copy length - andl $63,%ecx // get remaining bytes for Lshort - andl $-64,%edx // get number of bytes we will copy in inner loop - subl %edx,%esi // point to endpoint of copy - subl %edx,%edi - testl $15,%esi // is source aligned too? - jnz LReverseUnalignedLoop // no - jmp LReverseAlignedLoop // use aligned loop - - .align 4,0x90 // 16-byte align inner loops -LReverseAlignedLoop: // loop over 64-byte chunks - movdqa -16(%esi,%edx),%xmm0 - movdqa -32(%esi,%edx),%xmm1 - movdqa -48(%esi,%edx),%xmm2 - movdqa -64(%esi,%edx),%xmm3 - - movdqa %xmm0,-16(%edi,%edx) - movdqa %xmm1,-32(%edi,%edx) - movdqa %xmm2,-48(%edi,%edx) - movdqa %xmm3,-64(%edi,%edx) - - subl $64,%edx - jne LReverseAlignedLoop - - jmp LReverseShort // copy remaining 0..63 bytes and done - - -// Reverse, unaligned loop. LDDQU==MOVDQU on these machines. 
- - .align 4,0x90 // 16-byte align inner loops -LReverseUnalignedLoop: // loop over 64-byte chunks - movdqu -16(%esi,%edx),%xmm0 - movdqu -32(%esi,%edx),%xmm1 - movdqu -48(%esi,%edx),%xmm2 - movdqu -64(%esi,%edx),%xmm3 - - movdqa %xmm0,-16(%edi,%edx) - movdqa %xmm1,-32(%edi,%edx) - movdqa %xmm2,-48(%edi,%edx) - movdqa %xmm3,-64(%edi,%edx) - - subl $64,%edx - jne LReverseUnalignedLoop - - jmp LReverseShort // copy remaining 0..63 bytes and done - -COMMPAGE_DESCRIPTOR(bcopy_sse2,_COMM_PAGE_BCOPY,kHasSSE2+kCache64,kHasSupplementalSSE3) diff --git a/osfmk/i386/commpage/bcopy_sse3x.s b/osfmk/i386/commpage/bcopy_sse3x.s deleted file mode 100644 index 017895aab..000000000 --- a/osfmk/i386/commpage/bcopy_sse3x.s +++ /dev/null @@ -1,823 +0,0 @@ -/* - * Copyright (c) 2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include - -/* - * The bcopy/memcpy loops, tuned for Pentium-M class processors with - * Supplemental SSE3 and 64-byte cache lines. - * - * The following #defines are tightly coupled to the u-architecture: - */ - -#define kShort 80 // too short to bother with SSE (must be >=80) -#define kVeryLong (500*1024) // large enough for non-temporal stores (must be >= 8192) -#define kFastUCode ((16*1024)-15) // cutoff for microcode fastpath for "rep/movsl" - -// void bcopy(const void *src, void *dst, size_t len); - -COMMPAGE_FUNCTION_START(bcopy_sse3x, 32, 5) -LZero: - pushl %ebp // set up a frame for backtraces - movl %esp,%ebp - pushl %esi - pushl %edi - movl 8(%ebp),%esi // get source ptr - movl 12(%ebp),%edi // get dest ptr - movl 16(%ebp),%ecx // get length - movl %edi,%edx - subl %esi,%edx // (dest - source) - cmpl %ecx,%edx // must move in reverse if (dest - source) < length - jb LReverseIsland - cmpl $(kShort),%ecx // long enough to bother with SSE? - jbe Lshort // no - jmp LNotShort - -// -// void *memcpy(void *dst, const void *src, size_t len); -// void *memmove(void *dst, const void *src, size_t len); -// -// NB: These need to be 32 bytes from bcopy(): -// - - .align 5, 0x90 -Lmemcpy: // void *memcpy(void *dst, const void *src, size_t len) -Lmemmove: // void *memmove(void *dst, const void *src, size_t len) - pushl %ebp // set up a frame for backtraces - movl %esp,%ebp - pushl %esi - pushl %edi - movl 8(%ebp),%edi // get dest ptr - movl 12(%ebp),%esi // get source ptr - movl 16(%ebp),%ecx // get length - movl %edi,%edx - subl %esi,%edx // (dest - source) - cmpl %ecx,%edx // must move in reverse if (dest - source) < length - jb LReverseIsland - cmpl $(kShort),%ecx // long enough to bother with SSE? - ja LNotShort // yes - -// Handle short forward copies. 
As the most common case, this is the fall-through path. -// ecx = length (<= kShort) -// esi = source ptr -// edi = dest ptr - -Lshort: - movl %ecx,%edx // copy length - shrl $2,%ecx // get #doublewords - jz LLeftovers -2: // loop copying doublewords - movl (%esi),%eax - addl $4,%esi - movl %eax,(%edi) - addl $4,%edi - dec %ecx - jnz 2b -LLeftovers: // handle leftover bytes (0..3) in last word - andl $3,%edx // any leftover bytes? - jz Lexit -4: // loop copying bytes - movb (%esi),%al - inc %esi - movb %al,(%edi) - inc %edi - dec %edx - jnz 4b -Lexit: - movl 8(%ebp),%eax // get return value (dst ptr) for memcpy/memmove - popl %edi - popl %esi - popl %ebp - ret - - -LReverseIsland: // keep the "jb" above a short branch... - jmp LReverse // ...because reverse moves are uncommon - - -// Handle forward moves that are long enough to justify use of SSE3. -// First, 16-byte align the destination. -// ecx = length (> kShort) -// esi = source ptr -// edi = dest ptr - -LNotShort: - cmpl $(kVeryLong),%ecx // long enough to justify heavyweight loops? - movl %edi,%edx // copy destination - jae LVeryLong // use very-long-operand path - negl %edx - andl $15,%edx // get #bytes to align destination - jz LDestAligned // already aligned - subl %edx,%ecx // decrement length -1: // loop copying 1..15 bytes - movb (%esi),%al - inc %esi - movb %al,(%edi) - inc %edi - dec %edx - jnz 1b - -// Destination is now aligned. Dispatch to one of sixteen loops over 64-byte chunks, -// based on the alignment of the source. All vector loads and stores are aligned. -// Even though this means we have to shift and repack vectors, doing so is much faster -// than unaligned loads. Since kShort>=80 and we've moved at most 15 bytes already, -// there is at least one chunk. 
When we enter the copy loops, the following registers -// are set up: -// ecx = residual length (0..63) -// edx = -(length to move), a multiple of 64 -// esi = ptr to 1st source byte not to move (unaligned) -// edi = ptr to 1st dest byte not to move (aligned) - -LDestAligned: - movl %ecx,%edx // copy length - movl %esi,%eax // copy source address - andl $63,%ecx // get remaining bytes for Lshort - andl $-64,%edx // get number of bytes we will copy in inner loop - andl $15,%eax // mask to low 4 bits of source address - addl %edx,%esi // point to 1st byte not copied - addl %edx,%edi - negl %edx // now generate offset to 1st byte to be copied -.set LTableOffset, LTable - LZero - leal (LTableOffset)(,%eax,4), %eax // load jump table entry address, relative to LZero - movl _COMM_PAGE_BCOPY(%eax), %eax // load jump table entry - addl $(_COMM_PAGE_BCOPY), %eax // add runtime address of LZero to get final function - jmp *%eax - - .align 2 -LTable: // table of copy loop addresses -// force generation of assembly-time constants. 
Otherwise assembler -// creates subtractor relocations relative to first external symbol, -// and this file has none -.set LMod0Offset, LMod0 - LZero -.set LMod1Offset, LMod1 - LZero -.set LMod2Offset, LMod2 - LZero -.set LMod3Offset, LMod3 - LZero -.set LMod4Offset, LMod4 - LZero -.set LMod5Offset, LMod5 - LZero -.set LMod6Offset, LMod6 - LZero -.set LMod7Offset, LMod7 - LZero -.set LMod8Offset, LMod8 - LZero -.set LMod9Offset, LMod9 - LZero -.set LMod10Offset, LMod10 - LZero -.set LMod11Offset, LMod11 - LZero -.set LMod12Offset, LMod12 - LZero -.set LMod13Offset, LMod13 - LZero -.set LMod14Offset, LMod14 - LZero -.set LMod15Offset, LMod15 - LZero - .long LMod0Offset - .long LMod1Offset - .long LMod2Offset - .long LMod3Offset - .long LMod4Offset - .long LMod5Offset - .long LMod6Offset - .long LMod7Offset - .long LMod8Offset - .long LMod9Offset - .long LMod10Offset - .long LMod11Offset - .long LMod12Offset - .long LMod13Offset - .long LMod14Offset - .long LMod15Offset - - -// Very long forward moves. These are at least several pages. They are special cased -// and aggressively optimized, not so much because they are common or useful, but -// because they are subject to benchmark. There isn't enough room for them in the -// area reserved on the commpage for bcopy, so we put them elsewhere. We call -// the longcopy routine using the normal ABI. - -LVeryLong: - pushl %ecx // length (>= kVeryLong) - pushl %esi // source ptr - pushl %edi // dest ptr - movl $(_COMM_PAGE_LONGCOPY),%eax - call *%eax // do the long copy - addl $12,%esp // pop off our parameters - jmp Lexit - - -// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 8-byte -// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from -// about 256 bytes up to kVeryLong for cold caches. This is because the microcode -// avoids having to read destination cache lines that will be completely overwritten. 
-// The cutoff we use (ie, kFastUCode) must somehow balance the two cases, since -// we do not know if the destination is in cache or not. - -Lfastpath: - addl %edx,%esi // restore ptrs to 1st byte of source and dest - addl %edx,%edi - negl %edx // make length positive - orl %edx,%ecx // restore total #bytes remaining to move - cld // we'll move forward - movl %ecx,%edx // copy total length to move - shrl $2,%ecx // compute #words to move - rep // the u-code will optimize this - movsl - jmp LLeftovers // handle 0..3 leftover bytes - - -// Forward loop for medium length operands in which low four bits of %esi == 0000 - -LMod0: - cmpl $(-kFastUCode),%edx // %edx == -length, where (length < kVeryLong) - jle Lfastpath // long enough for fastpath in microcode - jmp 1f - .align 4,0x90 // 16-byte align inner loops -1: // loop over 64-byte chunks - movdqa (%esi,%edx),%xmm0 - movdqa 16(%esi,%edx),%xmm1 - movdqa 32(%esi,%edx),%xmm2 - movdqa 48(%esi,%edx),%xmm3 - - movdqa %xmm0,(%edi,%edx) - movdqa %xmm1,16(%edi,%edx) - movdqa %xmm2,32(%edi,%edx) - movdqa %xmm3,48(%edi,%edx) - - addl $64,%edx - jnz 1b - - jmp Lshort // copy remaining 0..63 bytes and done - - -// Forward loop for medium length operands in which low four bits of %esi == 0001 - -LMod1: - movdqa -1(%esi,%edx),%xmm0 // prime the loop by loading 1st quadword -1: // loop over 64-byte chunks - movdqa 15(%esi,%edx),%xmm1 - movdqa 31(%esi,%edx),%xmm2 - movdqa 47(%esi,%edx),%xmm3 - movdqa 63(%esi,%edx),%xmm4 - - movdqa %xmm0,%xmm5 - movdqa %xmm4,%xmm0 - - palignr $1,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) - palignr $1,%xmm2,%xmm3 - palignr $1,%xmm1,%xmm2 - palignr $1,%xmm5,%xmm1 - - movdqa %xmm1,(%edi,%edx) - movdqa %xmm2,16(%edi,%edx) - movdqa %xmm3,32(%edi,%edx) - movdqa %xmm4,48(%edi,%edx) - - addl $64,%edx - jnz 1b - - jmp Lshort // copy remaining 0..63 bytes and done - - -// Forward loop for medium length operands in which low four bits of %esi == 0010 - -LMod2: - movdqa -2(%esi,%edx),%xmm0 // prime the 
loop by loading 1st source dq -1: // loop over 64-byte chunks - movdqa 14(%esi,%edx),%xmm1 - movdqa 30(%esi,%edx),%xmm2 - movdqa 46(%esi,%edx),%xmm3 - movdqa 62(%esi,%edx),%xmm4 - - movdqa %xmm0,%xmm5 - movdqa %xmm4,%xmm0 - - palignr $2,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) - palignr $2,%xmm2,%xmm3 - palignr $2,%xmm1,%xmm2 - palignr $2,%xmm5,%xmm1 - - movdqa %xmm1,(%edi,%edx) - movdqa %xmm2,16(%edi,%edx) - movdqa %xmm3,32(%edi,%edx) - movdqa %xmm4,48(%edi,%edx) - - addl $64,%edx - jnz 1b - - jmp Lshort // copy remaining 0..63 bytes and done - - -// Forward loop for medium length operands in which low four bits of %esi == 0011 - -LMod3: - movdqa -3(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq -1: // loop over 64-byte chunks - movdqa 13(%esi,%edx),%xmm1 - movdqa 29(%esi,%edx),%xmm2 - movdqa 45(%esi,%edx),%xmm3 - movdqa 61(%esi,%edx),%xmm4 - - movdqa %xmm0,%xmm5 - movdqa %xmm4,%xmm0 - - palignr $3,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) - palignr $3,%xmm2,%xmm3 - palignr $3,%xmm1,%xmm2 - palignr $3,%xmm5,%xmm1 - - movdqa %xmm1,(%edi,%edx) - movdqa %xmm2,16(%edi,%edx) - movdqa %xmm3,32(%edi,%edx) - movdqa %xmm4,48(%edi,%edx) - - addl $64,%edx - jnz 1b - - jmp Lshort // copy remaining 0..63 bytes and done - - -// Forward loop for medium length operands in which low four bits of %esi == 0100 -// We use the float single data type in order to use "movss" to merge vectors. 
- -LMod4: - movaps -4(%esi,%edx),%xmm0 // 4-byte aligned: prime the loop - jmp 1f - .align 4,0x90 -1: // loop over 64-byte chunks - movaps 12(%esi,%edx),%xmm1 - movaps 28(%esi,%edx),%xmm2 - movss %xmm1,%xmm0 // copy low 4 bytes of source into destination - pshufd $(0x39),%xmm0,%xmm0 // rotate right 4 bytes (mask -- 00 11 10 01) - movaps 44(%esi,%edx),%xmm3 - movss %xmm2,%xmm1 - pshufd $(0x39),%xmm1,%xmm1 - movaps 60(%esi,%edx),%xmm4 - movss %xmm3,%xmm2 - pshufd $(0x39),%xmm2,%xmm2 - - movaps %xmm0,(%edi,%edx) - movss %xmm4,%xmm3 - pshufd $(0x39),%xmm3,%xmm3 - movaps %xmm1,16(%edi,%edx) - movaps %xmm2,32(%edi,%edx) - movaps %xmm4,%xmm0 - movaps %xmm3,48(%edi,%edx) - - addl $64,%edx - jnz 1b - - jmp Lshort // copy remaining 0..63 bytes and done - - -// Forward loop for medium length operands in which low four bits of %esi == 0101 - -LMod5: - movdqa -5(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq -1: // loop over 64-byte chunks - movdqa 11(%esi,%edx),%xmm1 - movdqa 27(%esi,%edx),%xmm2 - movdqa 43(%esi,%edx),%xmm3 - movdqa 59(%esi,%edx),%xmm4 - - movdqa %xmm0,%xmm5 - movdqa %xmm4,%xmm0 - - palignr $5,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) - palignr $5,%xmm2,%xmm3 - palignr $5,%xmm1,%xmm2 - palignr $5,%xmm5,%xmm1 - - movdqa %xmm1,(%edi,%edx) - movdqa %xmm2,16(%edi,%edx) - movdqa %xmm3,32(%edi,%edx) - movdqa %xmm4,48(%edi,%edx) - - addl $64,%edx - jnz 1b - - jmp Lshort // copy remaining 0..63 bytes and done - - -// Forward loop for medium length operands in which low four bits of %esi == 0110 - -LMod6: - movdqa -6(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq -1: // loop over 64-byte chunks - movdqa 10(%esi,%edx),%xmm1 - movdqa 26(%esi,%edx),%xmm2 - movdqa 42(%esi,%edx),%xmm3 - movdqa 58(%esi,%edx),%xmm4 - - movdqa %xmm0,%xmm5 - movdqa %xmm4,%xmm0 - - palignr $6,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) - palignr $6,%xmm2,%xmm3 - palignr $6,%xmm1,%xmm2 - palignr $6,%xmm5,%xmm1 - - movdqa %xmm1,(%edi,%edx) - movdqa 
%xmm2,16(%edi,%edx) - movdqa %xmm3,32(%edi,%edx) - movdqa %xmm4,48(%edi,%edx) - - addl $64,%edx - jnz 1b - - jmp Lshort // copy remaining 0..63 bytes and done - - -// Forward loop for medium length operands in which low four bits of %esi == 0111 - -LMod7: - movdqa -7(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq -1: // loop over 64-byte chunks - movdqa 9(%esi,%edx),%xmm1 - movdqa 25(%esi,%edx),%xmm2 - movdqa 41(%esi,%edx),%xmm3 - movdqa 57(%esi,%edx),%xmm4 - - movdqa %xmm0,%xmm5 - movdqa %xmm4,%xmm0 - - palignr $7,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) - palignr $7,%xmm2,%xmm3 - palignr $7,%xmm1,%xmm2 - palignr $7,%xmm5,%xmm1 - - movdqa %xmm1,(%edi,%edx) - movdqa %xmm2,16(%edi,%edx) - movdqa %xmm3,32(%edi,%edx) - movdqa %xmm4,48(%edi,%edx) - - addl $64,%edx - jnz 1b - - jmp Lshort // copy remaining 0..63 bytes and done - - -// Forward loop for medium length operands in which low four bits of %esi == 1000 -// We use the float double data type in order to use "shufpd" to shift by 8 bytes. 
- -LMod8: - cmpl $(-kFastUCode),%edx // %edx == -length, where (length < kVeryLong) - jle Lfastpath // long enough for fastpath in microcode - movapd -8(%esi,%edx),%xmm0 // 8-byte aligned: prime the loop - jmp 1f - .align 4,0x90 -1: // loop over 64-byte chunks - movapd 8(%esi,%edx),%xmm1 - movapd 24(%esi,%edx),%xmm2 - shufpd $01,%xmm1,%xmm0 // %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes) - movapd 40(%esi,%edx),%xmm3 - shufpd $01,%xmm2,%xmm1 - movapd 56(%esi,%edx),%xmm4 - shufpd $01,%xmm3,%xmm2 - - movapd %xmm0,(%edi,%edx) - shufpd $01,%xmm4,%xmm3 - movapd %xmm1,16(%edi,%edx) - movapd %xmm2,32(%edi,%edx) - movapd %xmm4,%xmm0 - movapd %xmm3,48(%edi,%edx) - - addl $64,%edx - jnz 1b - - jmp Lshort // copy remaining 0..63 bytes and done - - -// Forward loop for medium length operands in which low four bits of %esi == 1001 - -LMod9: - movdqa -9(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq -1: // loop over 64-byte chunks - movdqa 7(%esi,%edx),%xmm1 - movdqa 23(%esi,%edx),%xmm2 - movdqa 39(%esi,%edx),%xmm3 - movdqa 55(%esi,%edx),%xmm4 - - movdqa %xmm0,%xmm5 - movdqa %xmm4,%xmm0 - - palignr $9,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) - palignr $9,%xmm2,%xmm3 - palignr $9,%xmm1,%xmm2 - palignr $9,%xmm5,%xmm1 - - movdqa %xmm1,(%edi,%edx) - movdqa %xmm2,16(%edi,%edx) - movdqa %xmm3,32(%edi,%edx) - movdqa %xmm4,48(%edi,%edx) - - addl $64,%edx - jnz 1b - - jmp Lshort // copy remaining 0..63 bytes and done - - -// Forward loop for medium length operands in which low four bits of %esi == 1010 - -LMod10: - movdqa -10(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq -1: // loop over 64-byte chunks - movdqa 6(%esi,%edx),%xmm1 - movdqa 22(%esi,%edx),%xmm2 - movdqa 38(%esi,%edx),%xmm3 - movdqa 54(%esi,%edx),%xmm4 - - movdqa %xmm0,%xmm5 - movdqa %xmm4,%xmm0 - - palignr $10,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) - palignr $10,%xmm2,%xmm3 - palignr $10,%xmm1,%xmm2 - palignr $10,%xmm5,%xmm1 - - movdqa %xmm1,(%edi,%edx) - movdqa 
%xmm2,16(%edi,%edx) - movdqa %xmm3,32(%edi,%edx) - movdqa %xmm4,48(%edi,%edx) - - addl $64,%edx - jnz 1b - - jmp Lshort // copy remaining 0..63 bytes and done - - -// Forward loop for medium length operands in which low four bits of %esi == 1011 - -LMod11: - movdqa -11(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq -1: // loop over 64-byte chunks - movdqa 5(%esi,%edx),%xmm1 - movdqa 21(%esi,%edx),%xmm2 - movdqa 37(%esi,%edx),%xmm3 - movdqa 53(%esi,%edx),%xmm4 - - movdqa %xmm0,%xmm5 - movdqa %xmm4,%xmm0 - - palignr $11,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) - palignr $11,%xmm2,%xmm3 - palignr $11,%xmm1,%xmm2 - palignr $11,%xmm5,%xmm1 - - movdqa %xmm1,(%edi,%edx) - movdqa %xmm2,16(%edi,%edx) - movdqa %xmm3,32(%edi,%edx) - movdqa %xmm4,48(%edi,%edx) - - addl $64,%edx - jnz 1b - - jmp Lshort // copy remaining 0..63 bytes and done - - -// Forward loop for medium length operands in which low four bits of %esi == 1100 -// We use the float single data type in order to use "movss" to merge vectors. 
- -LMod12: - movss (%esi,%edx),%xmm0 // prefetch 1st four bytes of source, right justified - jmp 1f - .align 4,0x90 -1: // loop over 64-byte chunks - pshufd $(0x93),4(%esi,%edx),%xmm1 // load and rotate right 12 bytes (mask -- 10 01 00 11) - pshufd $(0x93),20(%esi,%edx),%xmm2 - pshufd $(0x93),36(%esi,%edx),%xmm3 - pshufd $(0x93),52(%esi,%edx),%xmm4 - - movaps %xmm4,%xmm5 - movss %xmm3,%xmm4 // copy low 4 bytes of source into destination - movss %xmm2,%xmm3 - movss %xmm1,%xmm2 - movss %xmm0,%xmm1 - - movaps %xmm1,(%edi,%edx) - movaps %xmm2,16(%edi,%edx) - movaps %xmm5,%xmm0 - movaps %xmm3,32(%edi,%edx) - movaps %xmm4,48(%edi,%edx) - - addl $64,%edx - jnz 1b - - jmp Lshort // copy remaining 0..63 bytes and done - - -// Forward loop for medium length operands in which low four bits of %esi == 1101 - -LMod13: - movdqa -13(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq -1: // loop over 64-byte chunks - movdqa 3(%esi,%edx),%xmm1 - movdqa 19(%esi,%edx),%xmm2 - movdqa 35(%esi,%edx),%xmm3 - movdqa 51(%esi,%edx),%xmm4 - - movdqa %xmm0,%xmm5 - movdqa %xmm4,%xmm0 - - palignr $13,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) - palignr $13,%xmm2,%xmm3 - palignr $13,%xmm1,%xmm2 - palignr $13,%xmm5,%xmm1 - - movdqa %xmm1,(%edi,%edx) - movdqa %xmm2,16(%edi,%edx) - movdqa %xmm3,32(%edi,%edx) - movdqa %xmm4,48(%edi,%edx) - - addl $64,%edx - jnz 1b - - jmp Lshort // copy remaining 0..63 bytes and done - - -// Forward loop for medium length operands in which low four bits of %esi == 1110 - -LMod14: - movdqa -14(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq -1: // loop over 64-byte chunks - movdqa 2(%esi,%edx),%xmm1 - movdqa 18(%esi,%edx),%xmm2 - movdqa 34(%esi,%edx),%xmm3 - movdqa 50(%esi,%edx),%xmm4 - - movdqa %xmm0,%xmm5 - movdqa %xmm4,%xmm0 - - palignr $14,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) - palignr $14,%xmm2,%xmm3 - palignr $14,%xmm1,%xmm2 - palignr $14,%xmm5,%xmm1 - - movdqa %xmm1,(%edi,%edx) - movdqa %xmm2,16(%edi,%edx) - movdqa 
%xmm3,32(%edi,%edx) - movdqa %xmm4,48(%edi,%edx) - - addl $64,%edx - jnz 1b - - jmp Lshort // copy remaining 0..63 bytes and done - - -// Forward loop for medium length operands in which low four bits of %esi == 1111 - -LMod15: - movdqa -15(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq -1: // loop over 64-byte chunks - movdqa 1(%esi,%edx),%xmm1 - movdqa 17(%esi,%edx),%xmm2 - movdqa 33(%esi,%edx),%xmm3 - movdqa 49(%esi,%edx),%xmm4 - - movdqa %xmm0,%xmm5 - movdqa %xmm4,%xmm0 - - palignr $15,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) - palignr $15,%xmm2,%xmm3 - palignr $15,%xmm1,%xmm2 - palignr $15,%xmm5,%xmm1 - - movdqa %xmm1,(%edi,%edx) - movdqa %xmm2,16(%edi,%edx) - movdqa %xmm3,32(%edi,%edx) - movdqa %xmm4,48(%edi,%edx) - - addl $64,%edx - jnz 1b - - jmp Lshort // copy remaining 0..63 bytes and done - - -// Reverse moves. These are not optimized as aggressively as their forward -// counterparts, as they are only used with destructive overlap. -// ecx = length -// esi = source ptr -// edi = dest ptr - -LReverse: - addl %ecx,%esi // point to end of strings - addl %ecx,%edi - cmpl $(kShort),%ecx // long enough to bother with SSE? - ja LReverseNotShort // yes - -// Handle reverse short copies. -// ecx = length -// esi = one byte past end of source -// edi = one byte past end of dest - -LReverseShort: - movl %ecx,%edx // copy length - shrl $2,%ecx // #words - jz 3f -1: - subl $4,%esi - movl (%esi),%eax - subl $4,%edi - movl %eax,(%edi) - dec %ecx - jnz 1b -3: - andl $3,%edx // bytes? - jz 5f -4: - dec %esi - movb (%esi),%al - dec %edi - movb %al,(%edi) - dec %edx - jnz 4b -5: - movl 8(%ebp),%eax // get return value (dst ptr) for memcpy/memmove - popl %edi - popl %esi - popl %ebp - ret - -// Handle a reverse move long enough to justify using SSE. 
-// ecx = length -// esi = one byte past end of source -// edi = one byte past end of dest - -LReverseNotShort: - movl %edi,%edx // copy destination - andl $15,%edx // get #bytes to align destination - je LReverseDestAligned // already aligned - subl %edx,%ecx // adjust length -1: // loop copying 1..15 bytes - dec %esi - movb (%esi),%al - dec %edi - movb %al,(%edi) - dec %edx - jnz 1b - -// Destination is now aligned. Prepare for reverse loops. - -LReverseDestAligned: - movl %ecx,%edx // copy length - andl $63,%ecx // get remaining bytes for Lshort - andl $-64,%edx // get number of bytes we will copy in inner loop - subl %edx,%esi // point to endpoint of copy - subl %edx,%edi - testl $15,%esi // is source aligned too? - jnz LReverseUnalignedLoop // no - -LReverseAlignedLoop: // loop over 64-byte chunks - movdqa -16(%esi,%edx),%xmm0 - movdqa -32(%esi,%edx),%xmm1 - movdqa -48(%esi,%edx),%xmm2 - movdqa -64(%esi,%edx),%xmm3 - - movdqa %xmm0,-16(%edi,%edx) - movdqa %xmm1,-32(%edi,%edx) - movdqa %xmm2,-48(%edi,%edx) - movdqa %xmm3,-64(%edi,%edx) - - subl $64,%edx - jne LReverseAlignedLoop - - jmp LReverseShort // copy remaining 0..63 bytes and done - - -// Reverse, unaligned loop. LDDQU==MOVDQU on these machines. - -LReverseUnalignedLoop: // loop over 64-byte chunks - movdqu -16(%esi,%edx),%xmm0 - movdqu -32(%esi,%edx),%xmm1 - movdqu -48(%esi,%edx),%xmm2 - movdqu -64(%esi,%edx),%xmm3 - - movdqa %xmm0,-16(%edi,%edx) - movdqa %xmm1,-32(%edi,%edx) - movdqa %xmm2,-48(%edi,%edx) - movdqa %xmm3,-64(%edi,%edx) - - subl $64,%edx - jne LReverseUnalignedLoop - - jmp LReverseShort // copy remaining 0..63 bytes and done - -COMMPAGE_DESCRIPTOR(bcopy_sse3x,_COMM_PAGE_BCOPY,kHasSSE2+kHasSupplementalSSE3+kCache64,kHasSSE4_2) diff --git a/osfmk/i386/commpage/bcopy_sse3x_64.s b/osfmk/i386/commpage/bcopy_sse3x_64.s deleted file mode 100644 index 2a0e46be9..000000000 --- a/osfmk/i386/commpage/bcopy_sse3x_64.s +++ /dev/null @@ -1,820 +0,0 @@ -/* - * Copyright (c) 2006 Apple Computer, Inc. 
All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include - -/* - * The bcopy/memcpy loops, tuned for 64-bit Pentium-M class processors with - * Supplemental SSE3 and 64-byte cache lines. This is the 64-bit version. 
- * - * The following #defines are tightly coupled to the u-architecture: - */ - -#define kShort 80 // too short to bother with SSE (must be >=80) -#define kVeryLong (500*1024) // large enough for non-temporal stores (>=8192 and <2GB) -#define kFastUCode ((16*1024)-15) // cutoff for microcode fastpath for "rep/movsl" - -// void bcopy(const void *src, void *dst, size_t len); - -COMMPAGE_FUNCTION_START(bcopy_sse3x_64, 64, 5) -LZero: - pushq %rbp // set up a frame for backtraces - movq %rsp,%rbp - movq %rsi,%rax // copy dest ptr - movq %rdi,%rsi // xchange source and dest ptrs - movq %rax,%rdi - subq %rsi,%rax // (dest - source) - cmpq %rdx,%rax // must move in reverse if (dest - source) < length - jb LReverseIsland - cmpq $(kShort),%rdx // long enough to bother with SSE? - jbe LShort // no - jmp LNotShort - -// -// void *memcpy(void *dst, const void *src, size_t len); -// void *memmove(void *dst, const void *src, size_t len); -// -// NB: These need to be 32 bytes from bcopy(): -// - - .align 5, 0x90 -Lmemcpy: // void *memcpy(void *dst, const void *src, size_t len) -Lmemmove: // void *memmove(void *dst, const void *src, size_t len) - pushq %rbp // set up a frame for backtraces - movq %rsp,%rbp - movq %rdi,%r11 // save return value here - movq %rdi,%rax - subq %rsi,%rax // (dest - source) - cmpq %rdx,%rax // must move in reverse if (dest - source) < length - jb LReverseIsland - cmpq $(kShort),%rdx // long enough to bother with SSE? - ja LNotShort // yes - -// Handle short forward copies. As the most common case, this is the fall-through path. -// rdx = length (<= kShort) -// rsi = source ptr -// rdi = dest ptr - -LShort: - movl %edx,%ecx // copy length using 32-bit operation - shrl $2,%ecx // get #doublewords - jz LLeftovers -2: // loop copying doublewords - movl (%rsi),%eax - addq $4,%rsi - movl %eax,(%rdi) - addq $4,%rdi - decl %ecx - jnz 2b -LLeftovers: // handle leftover bytes (0..3) in last word - andl $3,%edx // any leftover bytes? 
- jz 5f -4: // loop copying bytes - movb (%rsi),%al - incq %rsi - movb %al,(%rdi) - incq %rdi - decl %edx - jnz 4b -5: - movq %r11,%rax // get return value (dst ptr) for memcpy/memmove - popq %rbp - ret - - -LReverseIsland: // keep the "jb" above a short branch... - jmp LReverse // ...because reverse moves are uncommon - - -// Handle forward moves that are long enough to justify use of SSE. -// First, 16-byte align the destination. -// rdx = length (> kShort) -// rsi = source ptr -// rdi = dest ptr - -LNotShort: - cmpq $(kVeryLong),%rdx // long enough to justify heavyweight loops? - jae LVeryLong // use very-long-operand path - movl %edi,%ecx // copy low half of destination ptr - negl %ecx - andl $15,%ecx // get #bytes to align destination - jz LDestAligned // already aligned - subl %ecx,%edx // decrement length - rep // align destination - movsb - - -// Destination is now aligned. Dispatch to one of sixteen loops over 64-byte chunks, -// based on the alignment of the source. All vector loads and stores are aligned. -// Even though this means we have to shift and repack vectors, doing so is much faster -// than unaligned loads. Since kShort>=80 and we've moved at most 15 bytes already, -// there is at least one chunk. When we enter the copy loops, the following registers -// are set up: -// rdx = residual length (0..63) -// rcx = -(length to move), a multiple of 64 less than 2GB -// rsi = ptr to 1st source byte not to move (unaligned) -// rdi = ptr to 1st dest byte not to move (aligned) - -LDestAligned: - movq %rdx,%rcx // copy length - movl %esi,%eax // copy low half of source address - andl $63,%edx // get remaining bytes for LShort - andl $15,%eax // mask to low 4 bits of source address - andq $-64,%rcx // get number of bytes we will copy in inner loop -// We'd like to use lea with rip-relative addressing, but cannot in a .code64 block in -// a 32-bit object file (4586528). Generate the leaq opcode manually. 
-#if defined(__i386__) - .byte 0x4c - .byte 0x8d - .byte 0x05 - .long LTable-LRIP -LRIP: -#elif defined(__x86_64__) - leaq LTable(%rip), %r8 -#else -#error Unsupported architecture -#endif - addq %rcx,%rsi // point to 1st byte not copied - addq %rcx,%rdi - movl (%r8,%rax,4),%eax // get offset of routine - negq %rcx // now generate offset to 1st byte to be copied - addq %r8,%rax // generate address of copy loop - jmp *%rax // enter copy loop, selected by source alignment - - .align 2 -LTable: // table of copy loop addresses -// force generation of assembly-time constants. Otherwise assembler -// creates subtractor relocations relative to first external symbol, -// and this file has none -.set LMod0Offset, LMod0 - LTable -.set LMod1Offset, LMod1 - LTable -.set LMod2Offset, LMod2 - LTable -.set LMod3Offset, LMod3 - LTable -.set LMod4Offset, LMod4 - LTable -.set LMod5Offset, LMod5 - LTable -.set LMod6Offset, LMod6 - LTable -.set LMod7Offset, LMod7 - LTable -.set LMod8Offset, LMod8 - LTable -.set LMod9Offset, LMod9 - LTable -.set LMod10Offset, LMod10 - LTable -.set LMod11Offset, LMod11 - LTable -.set LMod12Offset, LMod12 - LTable -.set LMod13Offset, LMod13 - LTable -.set LMod14Offset, LMod14 - LTable -.set LMod15Offset, LMod15 - LTable - .long LMod0Offset - .long LMod1Offset - .long LMod2Offset - .long LMod3Offset - .long LMod4Offset - .long LMod5Offset - .long LMod6Offset - .long LMod7Offset - .long LMod8Offset - .long LMod9Offset - .long LMod10Offset - .long LMod11Offset - .long LMod12Offset - .long LMod13Offset - .long LMod14Offset - .long LMod15Offset - - -// Very long forward moves. These are at least several pages. They are special cased -// and aggressively optimized, not so much because they are common or useful, but -// because they are subject to benchmark. There isn't enough room for them in the -// area reserved on the commpage for bcopy, so we put them elsewhere. 
We call -// the longcopy routine using the normal ABI: -// rdi = dest -// rsi = source -// rdx = length (>= kVeryLong bytes) - -LVeryLong: - pushq %r11 // save return value - movq $_COMM_PAGE_32_TO_64(_COMM_PAGE_LONGCOPY),%rax - call *%rax // call very long operand routine - popq %rax // pop return value - popq %rbp - ret - - -// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 16-byte -// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from -// about 256 bytes up to kVeryLong for cold caches. This is because the microcode -// avoids having to read destination cache lines that will be completely overwritten. -// The cutoff we use (ie, kFastUCode) must somehow balance the two cases, since -// we do not know if the destination is in cache or not. - -Lfastpath: - addq %rcx,%rsi // restore ptrs to 1st byte of source and dest - addq %rcx,%rdi - negl %ecx // make length positive (known to be < 2GB) - orl %edx,%ecx // restore total #bytes remaining to move - cld // we'll move forward - shrl $2,%ecx // compute #words to move - rep // the u-code will optimize this - movsl - jmp LLeftovers // handle 0..3 leftover bytes - - -// Forward loop for medium length operands in which low four bits of %rsi == 0000 - -LMod0: - cmpl $(-kFastUCode),%ecx // %rcx == -length, where (length < kVeryLong) - jle Lfastpath // long enough for fastpath in microcode - jmp 1f - .align 4,0x90 // 16-byte align inner loops -1: // loop over 64-byte chunks - movdqa (%rsi,%rcx),%xmm0 - movdqa 16(%rsi,%rcx),%xmm1 - movdqa 32(%rsi,%rcx),%xmm2 - movdqa 48(%rsi,%rcx),%xmm3 - - movdqa %xmm0,(%rdi,%rcx) - movdqa %xmm1,16(%rdi,%rcx) - movdqa %xmm2,32(%rdi,%rcx) - movdqa %xmm3,48(%rdi,%rcx) - - addq $64,%rcx - jnz 1b - - jmp LShort // copy remaining 0..63 bytes and done - - -// Forward loop for medium length operands in which low four bits of %rsi == 0001 - -LMod1: - movdqa -1(%rsi,%rcx),%xmm0 // prime the loop by loading 1st quadword -1: // loop over 64-byte chunks 
- movdqa 15(%rsi,%rcx),%xmm1 - movdqa 31(%rsi,%rcx),%xmm2 - movdqa 47(%rsi,%rcx),%xmm3 - movdqa 63(%rsi,%rcx),%xmm4 - - movdqa %xmm0,%xmm5 - movdqa %xmm4,%xmm0 - - palignr $1,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) - palignr $1,%xmm2,%xmm3 - palignr $1,%xmm1,%xmm2 - palignr $1,%xmm5,%xmm1 - - movdqa %xmm1,(%rdi,%rcx) - movdqa %xmm2,16(%rdi,%rcx) - movdqa %xmm3,32(%rdi,%rcx) - movdqa %xmm4,48(%rdi,%rcx) - - addq $64,%rcx - jnz 1b - - jmp LShort // copy remaining 0..63 bytes and done - - -// Forward loop for medium length operands in which low four bits of %rsi == 0010 - -LMod2: - movdqa -2(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq -1: // loop over 64-byte chunks - movdqa 14(%rsi,%rcx),%xmm1 - movdqa 30(%rsi,%rcx),%xmm2 - movdqa 46(%rsi,%rcx),%xmm3 - movdqa 62(%rsi,%rcx),%xmm4 - - movdqa %xmm0,%xmm5 - movdqa %xmm4,%xmm0 - - palignr $2,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) - palignr $2,%xmm2,%xmm3 - palignr $2,%xmm1,%xmm2 - palignr $2,%xmm5,%xmm1 - - movdqa %xmm1,(%rdi,%rcx) - movdqa %xmm2,16(%rdi,%rcx) - movdqa %xmm3,32(%rdi,%rcx) - movdqa %xmm4,48(%rdi,%rcx) - - addq $64,%rcx - jnz 1b - - jmp LShort // copy remaining 0..63 bytes and done - - -// Forward loop for medium length operands in which low four bits of %rsi == 0011 - -LMod3: - movdqa -3(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq -1: // loop over 64-byte chunks - movdqa 13(%rsi,%rcx),%xmm1 - movdqa 29(%rsi,%rcx),%xmm2 - movdqa 45(%rsi,%rcx),%xmm3 - movdqa 61(%rsi,%rcx),%xmm4 - - movdqa %xmm0,%xmm5 - movdqa %xmm4,%xmm0 - - palignr $3,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) - palignr $3,%xmm2,%xmm3 - palignr $3,%xmm1,%xmm2 - palignr $3,%xmm5,%xmm1 - - movdqa %xmm1,(%rdi,%rcx) - movdqa %xmm2,16(%rdi,%rcx) - movdqa %xmm3,32(%rdi,%rcx) - movdqa %xmm4,48(%rdi,%rcx) - - addq $64,%rcx - jnz 1b - - jmp LShort // copy remaining 0..63 bytes and done - - -// Forward loop for medium length operands in which low four bits of %rsi == 0100 -// We use 
the float single data type in order to use "movss" to merge vectors. - -LMod4: - movaps -4(%rsi,%rcx),%xmm0 // 4-byte aligned: prime the loop - jmp 1f - .align 4,0x90 -1: // loop over 64-byte chunks - movaps 12(%rsi,%rcx),%xmm1 - movaps 28(%rsi,%rcx),%xmm2 - movss %xmm1,%xmm0 // copy low 4 bytes of source into destination - pshufd $(0x39),%xmm0,%xmm0 // rotate right 4 bytes (mask -- 00 11 10 01) - movaps 44(%rsi,%rcx),%xmm3 - movss %xmm2,%xmm1 - pshufd $(0x39),%xmm1,%xmm1 - movaps 60(%rsi,%rcx),%xmm4 - movss %xmm3,%xmm2 - pshufd $(0x39),%xmm2,%xmm2 - - movaps %xmm0,(%rdi,%rcx) - movss %xmm4,%xmm3 - pshufd $(0x39),%xmm3,%xmm3 - movaps %xmm1,16(%rdi,%rcx) - movaps %xmm2,32(%rdi,%rcx) - movaps %xmm4,%xmm0 - movaps %xmm3,48(%rdi,%rcx) - - addq $64,%rcx - jnz 1b - - jmp LShort // copy remaining 0..63 bytes and done - - -// Forward loop for medium length operands in which low four bits of %rsi == 0101 - -LMod5: - movdqa -5(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq -1: // loop over 64-byte chunks - movdqa 11(%rsi,%rcx),%xmm1 - movdqa 27(%rsi,%rcx),%xmm2 - movdqa 43(%rsi,%rcx),%xmm3 - movdqa 59(%rsi,%rcx),%xmm4 - - movdqa %xmm0,%xmm5 - movdqa %xmm4,%xmm0 - - palignr $5,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) - palignr $5,%xmm2,%xmm3 - palignr $5,%xmm1,%xmm2 - palignr $5,%xmm5,%xmm1 - - movdqa %xmm1,(%rdi,%rcx) - movdqa %xmm2,16(%rdi,%rcx) - movdqa %xmm3,32(%rdi,%rcx) - movdqa %xmm4,48(%rdi,%rcx) - - addq $64,%rcx - jnz 1b - - jmp LShort // copy remaining 0..63 bytes and done - - -// Forward loop for medium length operands in which low four bits of %rsi == 0110 - -LMod6: - movdqa -6(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq -1: // loop over 64-byte chunks - movdqa 10(%rsi,%rcx),%xmm1 - movdqa 26(%rsi,%rcx),%xmm2 - movdqa 42(%rsi,%rcx),%xmm3 - movdqa 58(%rsi,%rcx),%xmm4 - - movdqa %xmm0,%xmm5 - movdqa %xmm4,%xmm0 - - palignr $6,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) - palignr $6,%xmm2,%xmm3 - palignr $6,%xmm1,%xmm2 
- palignr $6,%xmm5,%xmm1 - - movdqa %xmm1,(%rdi,%rcx) - movdqa %xmm2,16(%rdi,%rcx) - movdqa %xmm3,32(%rdi,%rcx) - movdqa %xmm4,48(%rdi,%rcx) - - addq $64,%rcx - jnz 1b - - jmp LShort // copy remaining 0..63 bytes and done - - -// Forward loop for medium length operands in which low four bits of %rsi == 0111 - -LMod7: - movdqa -7(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq -1: // loop over 64-byte chunks - movdqa 9(%rsi,%rcx),%xmm1 - movdqa 25(%rsi,%rcx),%xmm2 - movdqa 41(%rsi,%rcx),%xmm3 - movdqa 57(%rsi,%rcx),%xmm4 - - movdqa %xmm0,%xmm5 - movdqa %xmm4,%xmm0 - - palignr $7,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) - palignr $7,%xmm2,%xmm3 - palignr $7,%xmm1,%xmm2 - palignr $7,%xmm5,%xmm1 - - movdqa %xmm1,(%rdi,%rcx) - movdqa %xmm2,16(%rdi,%rcx) - movdqa %xmm3,32(%rdi,%rcx) - movdqa %xmm4,48(%rdi,%rcx) - - addq $64,%rcx - jnz 1b - - jmp LShort // copy remaining 0..63 bytes and done - - -// Forward loop for medium length operands in which low four bits of %rsi == 1000 -// We use the float double data type in order to use "shufpd" to shift by 8 bytes. 
- -LMod8: - cmpl $(-kFastUCode),%ecx // %rcx == -length, where (length < kVeryLong) - jle Lfastpath // long enough for fastpath in microcode - movapd -8(%rsi,%rcx),%xmm0 // 8-byte aligned: prime the loop - jmp 1f - .align 4,0x90 -1: // loop over 64-byte chunks - movapd 8(%rsi,%rcx),%xmm1 - movapd 24(%rsi,%rcx),%xmm2 - shufpd $01,%xmm1,%xmm0 // %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes) - movapd 40(%rsi,%rcx),%xmm3 - shufpd $01,%xmm2,%xmm1 - movapd 56(%rsi,%rcx),%xmm4 - shufpd $01,%xmm3,%xmm2 - - movapd %xmm0,(%rdi,%rcx) - shufpd $01,%xmm4,%xmm3 - movapd %xmm1,16(%rdi,%rcx) - movapd %xmm2,32(%rdi,%rcx) - movapd %xmm4,%xmm0 - movapd %xmm3,48(%rdi,%rcx) - - addq $64,%rcx - jnz 1b - - jmp LShort // copy remaining 0..63 bytes and done - - -// Forward loop for medium length operands in which low four bits of %rsi == 1001 - -LMod9: - movdqa -9(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq -1: // loop over 64-byte chunks - movdqa 7(%rsi,%rcx),%xmm1 - movdqa 23(%rsi,%rcx),%xmm2 - movdqa 39(%rsi,%rcx),%xmm3 - movdqa 55(%rsi,%rcx),%xmm4 - - movdqa %xmm0,%xmm5 - movdqa %xmm4,%xmm0 - - palignr $9,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) - palignr $9,%xmm2,%xmm3 - palignr $9,%xmm1,%xmm2 - palignr $9,%xmm5,%xmm1 - - movdqa %xmm1,(%rdi,%rcx) - movdqa %xmm2,16(%rdi,%rcx) - movdqa %xmm3,32(%rdi,%rcx) - movdqa %xmm4,48(%rdi,%rcx) - - addq $64,%rcx - jnz 1b - - jmp LShort // copy remaining 0..63 bytes and done - - -// Forward loop for medium length operands in which low four bits of %rsi == 1010 - -LMod10: - movdqa -10(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq -1: // loop over 64-byte chunks - movdqa 6(%rsi,%rcx),%xmm1 - movdqa 22(%rsi,%rcx),%xmm2 - movdqa 38(%rsi,%rcx),%xmm3 - movdqa 54(%rsi,%rcx),%xmm4 - - movdqa %xmm0,%xmm5 - movdqa %xmm4,%xmm0 - - palignr $10,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) - palignr $10,%xmm2,%xmm3 - palignr $10,%xmm1,%xmm2 - palignr $10,%xmm5,%xmm1 - - movdqa %xmm1,(%rdi,%rcx) - movdqa 
%xmm2,16(%rdi,%rcx) - movdqa %xmm3,32(%rdi,%rcx) - movdqa %xmm4,48(%rdi,%rcx) - - addq $64,%rcx - jnz 1b - - jmp LShort // copy remaining 0..63 bytes and done - - -// Forward loop for medium length operands in which low four bits of %rsi == 1011 - -LMod11: - movdqa -11(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq -1: // loop over 64-byte chunks - movdqa 5(%rsi,%rcx),%xmm1 - movdqa 21(%rsi,%rcx),%xmm2 - movdqa 37(%rsi,%rcx),%xmm3 - movdqa 53(%rsi,%rcx),%xmm4 - - movdqa %xmm0,%xmm5 - movdqa %xmm4,%xmm0 - - palignr $11,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) - palignr $11,%xmm2,%xmm3 - palignr $11,%xmm1,%xmm2 - palignr $11,%xmm5,%xmm1 - - movdqa %xmm1,(%rdi,%rcx) - movdqa %xmm2,16(%rdi,%rcx) - movdqa %xmm3,32(%rdi,%rcx) - movdqa %xmm4,48(%rdi,%rcx) - - addq $64,%rcx - jnz 1b - - jmp LShort // copy remaining 0..63 bytes and done - - -// Forward loop for medium length operands in which low four bits of %rsi == 1100 -// We use the float single data type in order to use "movss" to merge vectors. 
- -LMod12: - movss (%rsi,%rcx),%xmm0 // prefetch 1st four bytes of source, right justified - jmp 1f - .align 4,0x90 -1: // loop over 64-byte chunks - pshufd $(0x93),4(%rsi,%rcx),%xmm1 // load and rotate right 12 bytes (mask -- 10 01 00 11) - pshufd $(0x93),20(%rsi,%rcx),%xmm2 - pshufd $(0x93),36(%rsi,%rcx),%xmm3 - pshufd $(0x93),52(%rsi,%rcx),%xmm4 - - movaps %xmm4,%xmm5 - movss %xmm3,%xmm4 // copy low 4 bytes of source into destination - movss %xmm2,%xmm3 - movss %xmm1,%xmm2 - movss %xmm0,%xmm1 - - movaps %xmm1,(%rdi,%rcx) - movaps %xmm2,16(%rdi,%rcx) - movaps %xmm5,%xmm0 - movaps %xmm3,32(%rdi,%rcx) - movaps %xmm4,48(%rdi,%rcx) - - addq $64,%rcx - jnz 1b - - jmp LShort // copy remaining 0..63 bytes and done - - -// Forward loop for medium length operands in which low four bits of %rsi == 1101 - -LMod13: - movdqa -13(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq -1: // loop over 64-byte chunks - movdqa 3(%rsi,%rcx),%xmm1 - movdqa 19(%rsi,%rcx),%xmm2 - movdqa 35(%rsi,%rcx),%xmm3 - movdqa 51(%rsi,%rcx),%xmm4 - - movdqa %xmm0,%xmm5 - movdqa %xmm4,%xmm0 - - palignr $13,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) - palignr $13,%xmm2,%xmm3 - palignr $13,%xmm1,%xmm2 - palignr $13,%xmm5,%xmm1 - - movdqa %xmm1,(%rdi,%rcx) - movdqa %xmm2,16(%rdi,%rcx) - movdqa %xmm3,32(%rdi,%rcx) - movdqa %xmm4,48(%rdi,%rcx) - - addq $64,%rcx - jnz 1b - - jmp LShort // copy remaining 0..63 bytes and done - - -// Forward loop for medium length operands in which low four bits of %rsi == 1110 - -LMod14: - movdqa -14(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq -1: // loop over 64-byte chunks - movdqa 2(%rsi,%rcx),%xmm1 - movdqa 18(%rsi,%rcx),%xmm2 - movdqa 34(%rsi,%rcx),%xmm3 - movdqa 50(%rsi,%rcx),%xmm4 - - movdqa %xmm0,%xmm5 - movdqa %xmm4,%xmm0 - - palignr $14,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) - palignr $14,%xmm2,%xmm3 - palignr $14,%xmm1,%xmm2 - palignr $14,%xmm5,%xmm1 - - movdqa %xmm1,(%rdi,%rcx) - movdqa %xmm2,16(%rdi,%rcx) - movdqa 
%xmm3,32(%rdi,%rcx) - movdqa %xmm4,48(%rdi,%rcx) - - addq $64,%rcx - jnz 1b - - jmp LShort // copy remaining 0..63 bytes and done - - -// Forward loop for medium length operands in which low four bits of %rsi == 1111 - -LMod15: - movdqa -15(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq -1: // loop over 64-byte chunks - movdqa 1(%rsi,%rcx),%xmm1 - movdqa 17(%rsi,%rcx),%xmm2 - movdqa 33(%rsi,%rcx),%xmm3 - movdqa 49(%rsi,%rcx),%xmm4 - - movdqa %xmm0,%xmm5 - movdqa %xmm4,%xmm0 - - palignr $15,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) - palignr $15,%xmm2,%xmm3 - palignr $15,%xmm1,%xmm2 - palignr $15,%xmm5,%xmm1 - - movdqa %xmm1,(%rdi,%rcx) - movdqa %xmm2,16(%rdi,%rcx) - movdqa %xmm3,32(%rdi,%rcx) - movdqa %xmm4,48(%rdi,%rcx) - - addq $64,%rcx - jnz 1b - - jmp LShort // copy remaining 0..63 bytes and done - - -// Reverse moves. These are not optimized as aggressively as their forward -// counterparts, as they are only used with destructive overlap. -// rdx = length -// rsi = source ptr -// rdi = dest ptr - -LReverse: - addq %rdx,%rsi // point to end of strings - addq %rdx,%rdi - cmpq $(kShort),%rdx // long enough to bother with SSE? - ja LReverseNotShort // yes - -// Handle reverse short copies. -// edx = length (<= kShort) -// rsi = one byte past end of source -// rdi = one byte past end of dest - -LReverseShort: - movl %edx,%ecx // copy length - shrl $3,%ecx // #quadwords - jz 3f -1: - subq $8,%rsi - movq (%rsi),%rax - subq $8,%rdi - movq %rax,(%rdi) - decl %ecx - jnz 1b -3: - andl $7,%edx // bytes? - jz 5f -4: - decq %rsi - movb (%rsi),%al - decq %rdi - movb %al,(%rdi) - decl %edx - jnz 4b -5: - movq %r11,%rax // get return value (dst ptr) for memcpy/memmove - popq %rbp - ret - -// Handle a reverse move long enough to justify using SSE. 
-// rdx = length (> kShort) -// rsi = one byte past end of source -// rdi = one byte past end of dest - -LReverseNotShort: - movl %edi,%ecx // copy destination - andl $15,%ecx // get #bytes to align destination - je LReverseDestAligned // already aligned - subq %rcx,%rdx // adjust length -1: // loop copying 1..15 bytes - decq %rsi - movb (%rsi),%al - decq %rdi - movb %al,(%rdi) - decl %ecx - jnz 1b - -// Destination is now aligned. Prepare for reverse loops. - -LReverseDestAligned: - movq %rdx,%rcx // copy length - andl $63,%edx // get remaining bytes for LReverseShort - andq $-64,%rcx // get number of bytes we will copy in inner loop - subq %rcx,%rsi // point to endpoint of copy - subq %rcx,%rdi - testl $15,%esi // is source aligned too? - jnz LReverseUnalignedLoop // no - -LReverseAlignedLoop: // loop over 64-byte chunks - movdqa -16(%rsi,%rcx),%xmm0 - movdqa -32(%rsi,%rcx),%xmm1 - movdqa -48(%rsi,%rcx),%xmm2 - movdqa -64(%rsi,%rcx),%xmm3 - - movdqa %xmm0,-16(%rdi,%rcx) - movdqa %xmm1,-32(%rdi,%rcx) - movdqa %xmm2,-48(%rdi,%rcx) - movdqa %xmm3,-64(%rdi,%rcx) - - subq $64,%rcx - jne LReverseAlignedLoop - - jmp LReverseShort // copy remaining 0..63 bytes and done - - -// Reverse, unaligned loop. LDDQU==MOVDQU on these machines. 
- -LReverseUnalignedLoop: // loop over 64-byte chunks - movdqu -16(%rsi,%rcx),%xmm0 - movdqu -32(%rsi,%rcx),%xmm1 - movdqu -48(%rsi,%rcx),%xmm2 - movdqu -64(%rsi,%rcx),%xmm3 - - movdqa %xmm0,-16(%rdi,%rcx) - movdqa %xmm1,-32(%rdi,%rcx) - movdqa %xmm2,-48(%rdi,%rcx) - movdqa %xmm3,-64(%rdi,%rcx) - - subq $64,%rcx - jne LReverseUnalignedLoop - - jmp LReverseShort // copy remaining 0..63 bytes and done - -COMMPAGE_DESCRIPTOR(bcopy_sse3x_64,_COMM_PAGE_BCOPY,kHasSSE2+kHasSupplementalSSE3+kCache64,kHasSSE4_2) diff --git a/osfmk/i386/commpage/bcopy_sse42.s b/osfmk/i386/commpage/bcopy_sse42.s deleted file mode 100644 index 6a0bcd528..000000000 --- a/osfmk/i386/commpage/bcopy_sse42.s +++ /dev/null @@ -1,311 +0,0 @@ -/* - * Copyright (c) 2008 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include - -/* - * The bcopy/memcpy loops, tuned for Nehalem. - * - * The following #defines are tightly coupled to the u-architecture: - */ - -#define kShort 80 // too short to bother with SSE (must be >=80) - - -// void bcopy(const void *src, void *dst, size_t len); - -COMMPAGE_FUNCTION_START(bcopy_sse42, 32, 5) - pushl %ebp // set up a frame for backtraces - movl %esp,%ebp - pushl %esi - pushl %edi - movl 8(%ebp),%esi // get source ptr - movl 12(%ebp),%edi // get dest ptr - movl 16(%ebp),%ecx // get length - movl %edi,%edx - subl %esi,%edx // (dest - source) - cmpl %ecx,%edx // must move in reverse if (dest - source) < length - jb LReverseIsland - cmpl $(kShort),%ecx // long enough to bother with SSE? - jbe Lshort // no - jmp LNotShort - -// -// void *memcpy(void *dst, const void *src, size_t len); -// void *memmove(void *dst, const void *src, size_t len); -// -// NB: These need to be 32 bytes from bcopy(): -// - - .align 5, 0x90 -Lmemcpy: // void *memcpy(void *dst, const void *src, size_t len) -Lmemmove: // void *memmove(void *dst, const void *src, size_t len) - pushl %ebp // set up a frame for backtraces - movl %esp,%ebp - pushl %esi - pushl %edi - movl 8(%ebp),%edi // get dest ptr - movl 12(%ebp),%esi // get source ptr - movl 16(%ebp),%ecx // get length - movl %edi,%edx - subl %esi,%edx // (dest - source) - cmpl %ecx,%edx // must move in reverse if (dest - source) < length - jb LReverseIsland - cmpl $(kShort),%ecx // long enough to bother with SSE? - ja LNotShort // yes - -// Handle short forward copies. As the most common case, this is the fall-through path. 
-// ecx = length (<= kShort) -// esi = source ptr -// edi = dest ptr - -Lshort: - movl %ecx,%edx // copy length - shrl $2,%ecx // get #doublewords - jz 3f -2: // loop copying doublewords - movl (%esi),%eax - addl $4,%esi - movl %eax,(%edi) - addl $4,%edi - dec %ecx - jnz 2b -3: // handle leftover bytes (0..3) in last word - andl $3,%edx // any leftover bytes? - jz Lexit -4: // loop copying bytes - movb (%esi),%al - inc %esi - movb %al,(%edi) - inc %edi - dec %edx - jnz 4b -Lexit: - movl 8(%ebp),%eax // get return value (dst ptr) for memcpy/memmove - popl %edi - popl %esi - popl %ebp - ret - - -LReverseIsland: // keep the "jb" above a short branch... - jmp LReverse // ...because reverse moves are uncommon - - -// Handle forward moves that are long enough to justify use of SSE. -// First, 16-byte align the destination. -// ecx = length (> kShort) -// esi = source ptr -// edi = dest ptr - -LNotShort: - movl %edi,%edx // copy destination - negl %edx - andl $15,%edx // get #bytes to align destination - jz LDestAligned // already aligned - subl %edx,%ecx // decrement length -1: // loop copying 1..15 bytes - movb (%esi),%al - inc %esi - movb %al,(%edi) - inc %edi - dec %edx - jnz 1b - -// Destination is now aligned. Nehalem does a great job with unaligned SSE loads, -// so we use MOVDQU rather than aligned loads and shifts. Since kShort>=80, we -// know there is at least one 64-byte chunk to move. 
-// When we enter the copy loops, the following registers are set up: -// ecx = residual length (0..63) -// edx = -(length to move), a multiple of 64 -// esi = ptr to 1st source byte not to move (unaligned) -// edi = ptr to 1st dest byte not to move (aligned) - -LDestAligned: - movl %ecx,%edx // copy length - andl $63,%ecx // get remaining bytes for Lshort - andl $-64,%edx // get number of bytes we will copy in inner loop - addl %edx,%esi // point to 1st byte not copied - addl %edx,%edi - negl %edx // now generate offset to 1st byte to be copied - testl $15,%esi // source also aligned? - jnz LUnalignedLoop - jmp LAlignedLoop - - -// Forward loop for aligned operands. - - .align 4,0x90 // 16-byte align inner loops -LAlignedLoop: // loop over 64-byte chunks - movdqa (%esi,%edx),%xmm0 - movdqa 16(%esi,%edx),%xmm1 - movdqa 32(%esi,%edx),%xmm2 - movdqa 48(%esi,%edx),%xmm3 - - movdqa %xmm0,(%edi,%edx) - movdqa %xmm1,16(%edi,%edx) - movdqa %xmm2,32(%edi,%edx) - movdqa %xmm3,48(%edi,%edx) - - addl $64,%edx - jnz LAlignedLoop - - jmp Lshort // copy remaining 0..63 bytes and done - - -// Forward loop for unaligned operands. - - .align 4,0x90 // 16-byte align inner loops -LUnalignedLoop: // loop over 64-byte chunks - movdqu (%esi,%edx),%xmm0 - movdqu 16(%esi,%edx),%xmm1 - movdqu 32(%esi,%edx),%xmm2 - movdqu 48(%esi,%edx),%xmm3 - - movdqa %xmm0,(%edi,%edx) - movdqa %xmm1,16(%edi,%edx) - movdqa %xmm2,32(%edi,%edx) - movdqa %xmm3,48(%edi,%edx) - - addl $64,%edx - jnz LUnalignedLoop - - jmp Lshort // copy remaining 0..63 bytes and done - - -// Reverse moves. They are only used with destructive overlap. -// ecx = length -// esi = source ptr -// edi = dest ptr - -LReverse: - addl %ecx,%esi // point to end of strings - addl %ecx,%edi - cmpl $(kShort),%ecx // long enough to bother with SSE? - ja LReverseNotShort // yes - -// Handle reverse short copies. 
-// ecx = length -// esi = one byte past end of source -// edi = one byte past end of dest - -LReverseShort: - movl %ecx,%edx // copy length - shrl $2,%ecx // #words - jz 3f -1: - subl $4,%esi - movl (%esi),%eax - subl $4,%edi - movl %eax,(%edi) - dec %ecx - jnz 1b -3: - andl $3,%edx // bytes? - jz 5f -4: - dec %esi - movb (%esi),%al - dec %edi - movb %al,(%edi) - dec %edx - jnz 4b -5: - movl 8(%ebp),%eax // get return value (dst ptr) for memcpy/memmove - popl %edi - popl %esi - popl %ebp - ret - -// Handle a reverse move long enough to justify using SSE. -// ecx = length -// esi = one byte past end of source -// edi = one byte past end of dest - -LReverseNotShort: - movl %edi,%edx // copy destination - andl $15,%edx // get #bytes to align destination - je LReverseDestAligned // already aligned - subl %edx,%ecx // adjust length -1: // loop copying 1..15 bytes - dec %esi - movb (%esi),%al - dec %edi - movb %al,(%edi) - dec %edx - jnz 1b - -// Destination is now aligned. Prepare for reverse loops. - -LReverseDestAligned: - movl %ecx,%edx // copy length - andl $63,%ecx // get remaining bytes for Lshort - andl $-64,%edx // get number of bytes we will copy in inner loop - subl %edx,%esi // point to endpoint of copy - subl %edx,%edi - testl $15,%esi // is source aligned too? - jnz LReverseUnalignedLoop // no - -LReverseAlignedLoop: // loop over 64-byte chunks - movdqa -16(%esi,%edx),%xmm0 - movdqa -32(%esi,%edx),%xmm1 - movdqa -48(%esi,%edx),%xmm2 - movdqa -64(%esi,%edx),%xmm3 - - movdqa %xmm0,-16(%edi,%edx) - movdqa %xmm1,-32(%edi,%edx) - movdqa %xmm2,-48(%edi,%edx) - movdqa %xmm3,-64(%edi,%edx) - - subl $64,%edx - jne LReverseAlignedLoop - - jmp LReverseShort // copy remaining 0..63 bytes and done - - -// Reverse, unaligned loop. LDDQU==MOVDQU on these machines. 
- -LReverseUnalignedLoop: // loop over 64-byte chunks - movdqu -16(%esi,%edx),%xmm0 - movdqu -32(%esi,%edx),%xmm1 - movdqu -48(%esi,%edx),%xmm2 - movdqu -64(%esi,%edx),%xmm3 - - movdqa %xmm0,-16(%edi,%edx) - movdqa %xmm1,-32(%edi,%edx) - movdqa %xmm2,-48(%edi,%edx) - movdqa %xmm3,-64(%edi,%edx) - - subl $64,%edx - jne LReverseUnalignedLoop - - jmp LReverseShort // copy remaining 0..63 bytes and done - - - COMMPAGE_DESCRIPTOR(bcopy_sse42,_COMM_PAGE_BCOPY,kHasSSE4_2,0) diff --git a/osfmk/i386/commpage/bcopy_sse42_64.s b/osfmk/i386/commpage/bcopy_sse42_64.s deleted file mode 100644 index c8817d955..000000000 --- a/osfmk/i386/commpage/bcopy_sse42_64.s +++ /dev/null @@ -1,301 +0,0 @@ -/* - * Copyright (c) 2008 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include - -/* - * The bcopy/memcpy loops, tuned for Nehalem. This is the 64-bit version. - * - * The following #defines are tightly coupled to the u-architecture: - */ - -#define kShort 80 // too short to bother with SSE (must be >=80) - - -// void bcopy(const void *src, void *dst, size_t len); - -COMMPAGE_FUNCTION_START(bcopy_sse42_64, 64, 5) - pushq %rbp // set up a frame for backtraces - movq %rsp,%rbp - movq %rsi,%rax // copy dest ptr - movq %rdi,%rsi // xchange source and dest ptrs - movq %rax,%rdi - subq %rsi,%rax // (dest - source) - cmpq %rdx,%rax // must move in reverse if (dest - source) < length - jb LReverseIsland - cmpq $(kShort),%rdx // long enough to bother with SSE? - jbe LShort // no - jmp LNotShort - -// -// void *memcpy(void *dst, const void *src, size_t len); -// void *memmove(void *dst, const void *src, size_t len); -// -// NB: These need to be 32 bytes from bcopy(): -// - - .align 5, 0x90 -Lmemcpy: // void *memcpy(void *dst, const void *src, size_t len) -Lmemmove: // void *memmove(void *dst, const void *src, size_t len) - pushq %rbp // set up a frame for backtraces - movq %rsp,%rbp - movq %rdi,%r11 // save return value here - movq %rdi,%rax - subq %rsi,%rax // (dest - source) - cmpq %rdx,%rax // must move in reverse if (dest - source) < length - jb LReverseIsland - cmpq $(kShort),%rdx // long enough to bother with SSE? - ja LNotShort // yes - -// Handle short forward copies. As the most common case, this is the fall-through path. -// rdx = length (<= kShort) -// rsi = source ptr -// rdi = dest ptr - -LShort: - movl %edx,%ecx // copy length using 32-bit operation - shrl $2,%ecx // get #doublewords - jz 3f -2: // loop copying doublewords - movl (%rsi),%eax - addq $4,%rsi - movl %eax,(%rdi) - addq $4,%rdi - decl %ecx - jnz 2b -3: // handle leftover bytes (0..3) in last word - andl $3,%edx // any leftover bytes? 
- jz 5f -4: // loop copying bytes - movb (%rsi),%al - incq %rsi - movb %al,(%rdi) - incq %rdi - decl %edx - jnz 4b -5: - movq %r11,%rax // get return value (dst ptr) for memcpy/memmove - popq %rbp - ret - - -LReverseIsland: // keep the "jb" above a short branch... - jmp LReverse // ...because reverse moves are uncommon - - -// Handle forward moves that are long enough to justify use of SSE. -// First, 16-byte align the destination. -// rdx = length (> kShort) -// rsi = source ptr -// rdi = dest ptr - -LNotShort: - movl %edi,%ecx // copy low half of destination ptr - negl %ecx - andl $15,%ecx // get #bytes to align destination - jz LDestAligned // already aligned - subl %ecx,%edx // decrement length -1: // loop copying 1..15 bytes - movb (%rsi),%al - inc %rsi - movb %al,(%rdi) - inc %rdi - dec %ecx - jnz 1b - - -// Destination is now aligned. Nehalem does a great job with unaligned SSE loads, -// so we use MOVDQU rather than aligned loads and shifts. Since kShort>=80, we -// know there is at least one 64-byte chunk to move. -// When we enter the copy loops, the following registers are set up: -// rdx = residual length (0..63) -// rcx = -(length to move), a multiple of 64 less than 2GB -// rsi = ptr to 1st source byte not to move (unaligned) -// rdi = ptr to 1st dest byte not to move (aligned) - -LDestAligned: - movq %rdx,%rcx // copy length - andl $63,%edx // get remaining bytes for LShort - andq $-64,%rcx // get number of bytes we will copy in inner loop - addq %rcx,%rsi // point to 1st byte not copied - addq %rcx,%rdi - negq %rcx // now generate offset to 1st byte to be copied - testl $15,%esi // source also aligned? - jnz LUnalignedLoop - jmp LAlignedLoop - - -// Forward loop for aligned operands. 
- - .align 4,0x90 // 16-byte align inner loops -LAlignedLoop: // loop over 64-byte chunks - movdqa (%rsi,%rcx),%xmm0 - movdqa 16(%rsi,%rcx),%xmm1 - movdqa 32(%rsi,%rcx),%xmm2 - movdqa 48(%rsi,%rcx),%xmm3 - - movdqa %xmm0,(%rdi,%rcx) - movdqa %xmm1,16(%rdi,%rcx) - movdqa %xmm2,32(%rdi,%rcx) - movdqa %xmm3,48(%rdi,%rcx) - - addq $64,%rcx - jnz LAlignedLoop - - jmp LShort // copy remaining 0..63 bytes and done - - -// Forward loop for unaligned operands. - - .align 4,0x90 // 16-byte align inner loops -LUnalignedLoop: // loop over 64-byte chunks - movdqu (%rsi,%rcx),%xmm0 - movdqu 16(%rsi,%rcx),%xmm1 - movdqu 32(%rsi,%rcx),%xmm2 - movdqu 48(%rsi,%rcx),%xmm3 - - movdqa %xmm0,(%rdi,%rcx) - movdqa %xmm1,16(%rdi,%rcx) - movdqa %xmm2,32(%rdi,%rcx) - movdqa %xmm3,48(%rdi,%rcx) - - addq $64,%rcx - jnz LUnalignedLoop - - jmp LShort // copy remaining 0..63 bytes and done - - -// Reverse moves. These are only used with destructive overlap. -// rdx = length -// rsi = source ptr -// rdi = dest ptr - -LReverse: - addq %rdx,%rsi // point to end of strings - addq %rdx,%rdi - cmpq $(kShort),%rdx // long enough to bother with SSE? - ja LReverseNotShort // yes - -// Handle reverse short copies. -// edx = length (<= kShort) -// rsi = one byte past end of source -// rdi = one byte past end of dest - -LReverseShort: - movl %edx,%ecx // copy length - shrl $3,%ecx // #quadwords - jz 3f -1: - subq $8,%rsi - movq (%rsi),%rax - subq $8,%rdi - movq %rax,(%rdi) - decl %ecx - jnz 1b -3: - andl $7,%edx // bytes? - jz 5f -4: - decq %rsi - movb (%rsi),%al - decq %rdi - movb %al,(%rdi) - decl %edx - jnz 4b -5: - movq %r11,%rax // get return value (dst ptr) for memcpy/memmove - popq %rbp - ret - -// Handle a reverse move long enough to justify using SSE. 
-// rdx = length (> kShort) -// rsi = one byte past end of source -// rdi = one byte past end of dest - -LReverseNotShort: - movl %edi,%ecx // copy destination - andl $15,%ecx // get #bytes to align destination - jz LReverseDestAligned // already aligned - subq %rcx,%rdx // adjust length -1: // loop copying 1..15 bytes - decq %rsi - movb (%rsi),%al - decq %rdi - movb %al,(%rdi) - decl %ecx - jnz 1b - -// Destination is now aligned. Prepare for reverse loops. - -LReverseDestAligned: - movq %rdx,%rcx // copy length - andl $63,%edx // get remaining bytes for LReverseShort - andq $-64,%rcx // get number of bytes we will copy in inner loop - subq %rcx,%rsi // point to endpoint of copy - subq %rcx,%rdi - testl $15,%esi // is source aligned too? - jnz LReverseUnalignedLoop // no - -LReverseAlignedLoop: // loop over 64-byte chunks - movdqa -16(%rsi,%rcx),%xmm0 - movdqa -32(%rsi,%rcx),%xmm1 - movdqa -48(%rsi,%rcx),%xmm2 - movdqa -64(%rsi,%rcx),%xmm3 - - movdqa %xmm0,-16(%rdi,%rcx) - movdqa %xmm1,-32(%rdi,%rcx) - movdqa %xmm2,-48(%rdi,%rcx) - movdqa %xmm3,-64(%rdi,%rcx) - - subq $64,%rcx - jne LReverseAlignedLoop - - jmp LReverseShort // copy remaining 0..63 bytes and done - - -// Reverse, unaligned loop. LDDQU==MOVDQU on these machines. - -LReverseUnalignedLoop: // loop over 64-byte chunks - movdqu -16(%rsi,%rcx),%xmm0 - movdqu -32(%rsi,%rcx),%xmm1 - movdqu -48(%rsi,%rcx),%xmm2 - movdqu -64(%rsi,%rcx),%xmm3 - - movdqa %xmm0,-16(%rdi,%rcx) - movdqa %xmm1,-32(%rdi,%rcx) - movdqa %xmm2,-48(%rdi,%rcx) - movdqa %xmm3,-64(%rdi,%rcx) - - subq $64,%rcx - jne LReverseUnalignedLoop - - jmp LReverseShort // copy remaining 0..63 bytes and done - - - COMMPAGE_DESCRIPTOR(bcopy_sse42_64,_COMM_PAGE_BCOPY,kHasSSE4_2,0) diff --git a/osfmk/i386/commpage/bzero_scalar.s b/osfmk/i386/commpage/bzero_scalar.s deleted file mode 100644 index 6c496b9e9..000000000 --- a/osfmk/i386/commpage/bzero_scalar.s +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright (c) 2003-2006 Apple Computer, Inc. 
All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1993 Winning Strategies, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by Winning Strategies, Inc. - * 4. The name of the author may not be used to endorse or promote products - * derived from this software withough specific prior written permission - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -#include -#include -#include - -/* - * bzero (void *b, size_t len) - * write len zero bytes to the string b. - * - * Written by: - * J.T. Conklin (jtc@wimsey.com), Winning Strategies, Inc. - */ - -COMMPAGE_FUNCTION_START(bzero_scalar, 32, 4) - pushl %ebp /* set up a frame for backtraces */ - movl %esp,%ebp - pushl %edi - pushl %ebx - movl 8(%ebp),%edi - movl 12(%ebp),%ecx - - cld /* set fill direction forward */ - xorl %eax,%eax /* set fill data to 0 */ - - /* - * if the string is too short, it's really not worth the overhead - * of aligning to word boundries, etc. So we jump to a plain - * unaligned set. 
- */ - cmpl $0x0f,%ecx - jbe L1 - - movl %edi,%edx /* compute misalignment */ - negl %edx - andl $3,%edx - movl %ecx,%ebx - subl %edx,%ebx - - movl %edx,%ecx /* zero until word aligned */ - rep - stosb - - movl %ebx,%ecx /* zero by words */ - shrl $2,%ecx - rep - stosl - - movl %ebx,%ecx - andl $3,%ecx /* zero remainder by bytes */ -L1: rep - stosb - - popl %ebx - popl %edi - popl %ebp - ret - -COMMPAGE_DESCRIPTOR(bzero_scalar,_COMM_PAGE_BZERO,0,kHasSSE2) diff --git a/osfmk/i386/commpage/bzero_sse2.s b/osfmk/i386/commpage/bzero_sse2.s deleted file mode 100644 index be5facd29..000000000 --- a/osfmk/i386/commpage/bzero_sse2.s +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Copyright (c) 2005-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include - -/* - * Bzero, tuned for Pentium-M class processors with SSE2 - * and 64-byte cache lines. - * - * This routine is also used for memset(p,0,n), which is a common case - * since gcc sometimes silently maps bzero() into memset(). As a result, - * we always load the original ptr into %eax before returning. - */ - -#define kShort 80 // too short to bother with SSE (must be >=80) -#define kVeryLong (1024*1024) - -// void bzero(void *b, size_t len); - -COMMPAGE_FUNCTION_START(bzero_sse2, 32, 5) - pushl %ebp // set up a frame for backtraces - movl %esp,%ebp - pushl %edi - movl 8(%ebp),%edi // get ptr - movl 12(%ebp),%edx // get length - - xorl %eax,%eax // set fill data to 0 - cmpl $(kShort),%edx // long enough for SSE? - jg LNotShort // yes - -// Here for short operands or the end of long ones. -// %edx = length -// %edi = ptr -// %eax = zero - -Lshort: - cmpl $16,%edx // long enough to word align? - jge 3f // yes - test %edx,%edx // length==0? - jz 6f -1: - movb %al,(%edi) // zero a byte - inc %edi - dec %edx - jnz 1b - jmp 6f -2: - movb %al,(%edi) // zero a byte - inc %edi - dec %edx -3: - test $3,%edi // is ptr doubleword aligned? - jnz 2b // no - movl %edx,%ecx // copy length - shrl $2,%edx // #doublewords to store -4: - movl %eax,(%edi) // zero an aligned doubleword - addl $4,%edi - dec %edx - jnz 4b - andl $3,%ecx // mask down to #bytes at end (0..3) - jz 6f // none -5: - movb %al,(%edi) // zero a byte - inc %edi - dec %ecx - jnz 5b -6: - movl 8(%ebp),%eax // get return value in case this was a call of memset() - popl %edi - popl %ebp - ret - - -// We will be using SSE, so align ptr. 
- -LNotShort: - movl %edi,%ecx - negl %ecx - andl $15,%ecx // mask down to #bytes to 16-byte align - jz LDestAligned // already aligned - subl %ecx,%edx // decrement length -0: // loop storing bytes to align the ptr - movb %al,(%edi) // pack in a byte - inc %edi - dec %ecx - jnz 0b - -// Destination is now 16-byte aligned. Prepare to loop over 64-byte chunks. -// %edx = length -// %edi = ptr -// %eax = zero - -LDestAligned: - movl %edx,%ecx - andl $63,%edx // mask down to residual length (0..63) - andl $-64,%ecx // get #bytes we will zero in this loop - pxor %xmm0,%xmm0 // zero an SSE register - addl %ecx,%edi // increment ptr by length to move - cmpl $(kVeryLong),%ecx // long enough to justify non-temporal stores? - jae LVeryLong // yes - negl %ecx // negate length to move - jmp 1f - -// Loop over 64-byte chunks, storing into cache. - - .align 4,0x90 // keep inner loops 16-byte aligned -1: - movdqa %xmm0,(%edi,%ecx) - movdqa %xmm0,16(%edi,%ecx) - movdqa %xmm0,32(%edi,%ecx) - movdqa %xmm0,48(%edi,%ecx) - addl $64,%ecx - jne 1b - - jmp Lshort - -// Very long operands: use non-temporal stores to bypass cache. - -LVeryLong: - negl %ecx // negate length to move - jmp 1f - - .align 4,0x90 // keep inner loops 16-byte aligned -1: - movntdq %xmm0,(%edi,%ecx) - movntdq %xmm0,16(%edi,%ecx) - movntdq %xmm0,32(%edi,%ecx) - movntdq %xmm0,48(%edi,%ecx) - addl $64,%ecx - jne 1b - - sfence // required by non-temporal stores - jmp Lshort - -COMMPAGE_DESCRIPTOR(bzero_sse2,_COMM_PAGE_BZERO,kHasSSE2,kHasSSE4_2) diff --git a/osfmk/i386/commpage/bzero_sse2_64.s b/osfmk/i386/commpage/bzero_sse2_64.s deleted file mode 100644 index c0ec8a458..000000000 --- a/osfmk/i386/commpage/bzero_sse2_64.s +++ /dev/null @@ -1,161 +0,0 @@ -/* - * Copyright (c) 2006 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include - -/* - * Bzero, tuned for Pentium-M class processors with SSE2 - * and 64-byte cache lines. This is the 64-bit version. - * - * This routine is also used for memset(p,0,n), which is a common case - * since gcc sometimes silently maps bzero() into memset(). As a result, - * we always load the original ptr into %eax before returning. 
- */ - -#define kShort 80 // too short to bother with SSE (must be >=80) -#define kVeryLong (1024*1024) - -// void bzero(void *b, size_t len); - -COMMPAGE_FUNCTION_START(bzero_sse2_64, 64, 5) - pushq %rbp // set up a frame for backtraces - movq %rsp,%rbp - xorl %eax,%eax // set fill data to 0 - movq %rdi,%r11 // save original ptr as return value - cmpq $(kShort),%rsi // long enough for SSE? - jg LNotShort // yes - -// Here for short operands or the end of long ones. -// %esi = length (<= kShort) -// %rdi = ptr -// %eax = zero - -Lshort: - cmpl $16,%esi // long enough to word align? - jge 3f // yes - test %esi,%esi // length==0? - jz 6f -1: - movb %al,(%rdi) // zero a byte - incq %rdi - decl %esi - jnz 1b - jmp 6f -2: - movb %al,(%rdi) // zero a byte - incq %rdi - decl %esi -3: - testl $3,%edi // is ptr doubleword aligned? - jnz 2b // no - movl %esi,%ecx // copy length - shrl $2,%esi // #doublewords to store -4: - movl %eax,(%rdi) // zero an aligned doubleword - addq $4,%rdi - decl %esi - jnz 4b - andl $3,%ecx // mask down to #bytes at end (0..3) - jz 6f // none -5: - movb %al,(%rdi) // zero a byte - incq %rdi - decl %ecx - jnz 5b -6: - movq %r11,%rax // set return value in case this was a call of memset() - popq %rbp - ret - - -// We will be using SSE, so align ptr. -// %rsi = length (> kShort) -// %rdi = ptr -// %eax = zero - -LNotShort: - movl %edi,%ecx // get #bytes to 16-byte align ptr - negl %ecx - andl $15,%ecx - jz LDestAligned // already aligned - subq %rcx,%rsi // decrement length -0: // loop storing bytes to align the ptr - movb %al,(%rdi) // pack in a byte - incq %rdi - decl %ecx - jnz 0b - -// Destination is now 16-byte aligned. Prepare to loop over 64-byte chunks. 
-// %rsi = length (> (kShort-15)) -// %rdi = ptr (aligned) -// %eax = zero - -LDestAligned: - movq %rsi,%rcx - andl $63,%esi // mask down to residual length (0..63) - andq $-64,%rcx // get #bytes we will zero in this loop - pxor %xmm0,%xmm0 // zero an SSE register - addq %rcx,%rdi // increment ptr by length to move - cmpq $(kVeryLong),%rcx // long enough to justify non-temporal stores? - jae LVeryLong // yes - negq %rcx // negate length to move - jmp 1f - -// Loop over 64-byte chunks, storing into cache. - - .align 4,0x90 // keep inner loops 16-byte aligned -1: - movdqa %xmm0,(%rdi,%rcx) - movdqa %xmm0,16(%rdi,%rcx) - movdqa %xmm0,32(%rdi,%rcx) - movdqa %xmm0,48(%rdi,%rcx) - addq $64,%rcx - jne 1b - - jmp Lshort - -// Very long operands: use non-temporal stores to bypass cache. - -LVeryLong: - negq %rcx // negate length to move - jmp 1f - - .align 4,0x90 // keep inner loops 16-byte aligned -1: - movntdq %xmm0,(%rdi,%rcx) - movntdq %xmm0,16(%rdi,%rcx) - movntdq %xmm0,32(%rdi,%rcx) - movntdq %xmm0,48(%rdi,%rcx) - addq $64,%rcx - jne 1b - - sfence // required by non-temporal stores - jmp Lshort - -COMMPAGE_DESCRIPTOR(bzero_sse2_64,_COMM_PAGE_BZERO,kHasSSE2,kHasSSE4_2) diff --git a/osfmk/i386/commpage/bzero_sse42.s b/osfmk/i386/commpage/bzero_sse42.s deleted file mode 100644 index 32e8ea65f..000000000 --- a/osfmk/i386/commpage/bzero_sse42.s +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Copyright (c) 2008 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include - -/* - * Bzero, tuned for processors with SSE4.2 and 64-byte cache lines, ie Nehalem. - * We don't actually use SSE4.2, but rather use it to identify Nehalem. - * - * We do not use nontemporal operations, but use MOVDQA in preference to REP/STOS. - * - * This routine is also used for memset(p,0,n), which is a common case - * since gcc sometimes silently maps bzero() into memset(). As a result, - * we always load the original ptr into %eax before returning. - */ - -#define kShort 80 // too short to bother with SSE (must be >=80) - - -COMMPAGE_FUNCTION_START(bzero_sse42, 32, 5) - pushl %ebp // set up a frame for backtraces - movl %esp,%ebp - pushl %edi - movl 8(%ebp),%edi // get ptr - movl 12(%ebp),%edx // get length - - xorl %eax,%eax // set fill data to 0 - cmpl $(kShort),%edx // long enough for SSE? - jg LNotShort // yes - -// Here for short operands or the end of long ones. 
-// %edx = length -// %edi = ptr -// %eax = zero - -Lshort: - cmpl $12,%edx // long enough to word align? - jge 3f // yes - test %edx,%edx // length==0? - jz 6f -1: - movb %al,(%edi) // zero a byte - inc %edi - dec %edx - jnz 1b - jmp 6f -2: - movb %al,(%edi) // zero a byte - inc %edi - dec %edx -3: - test $3,%edi // is ptr doubleword aligned? - jnz 2b // no - movl %edx,%ecx // copy length - shrl $2,%edx // #doublewords to store -4: - movl %eax,(%edi) // zero an aligned doubleword - addl $4,%edi - dec %edx - jnz 4b - andl $3,%ecx // mask down to #bytes at end (0..3) - jz 6f // none -5: - movb %al,(%edi) // zero a byte - inc %edi - dec %ecx - jnz 5b -6: - movl 8(%ebp),%eax // get return value in case this was a call of memset() - popl %edi - popl %ebp - ret - - -// We will be using SSE, so align ptr. -// %edx = length -// %edi = ptr -// %eax = zero - -LNotShort: - testl $3,%edi // 4-byte aligned? - jz 2f // yes - movb %al,(%edi) // zero another byte - incl %edi - decl %edx - jmp LNotShort -1: // zero doublewords until 16-byte aligned - movl %eax,(%edi) - addl $4,%edi - subl $4,%edx -2: - testl $15,%edi // 16-byte aligned? - jnz 1b // no - - -// Destination is now 16-byte aligned. Prepare to loop over 64-byte chunks. -// %edx = length -// %edi = ptr -// %eax = zero - -LDestAligned: - movl %edx,%ecx - andl $63,%edx // mask down to residual length (0..63) - andl $-64,%ecx // get #bytes we will zero in this loop - pxor %xmm0,%xmm0 // zero an SSE register - addl %ecx,%edi // increment ptr by length to move - negl %ecx // negate length to move - jmp 1f - -// Loop over 64-byte chunks, storing into cache. 
- - .align 4,0x90 // keep inner loops 16-byte aligned -1: - movdqa %xmm0,(%edi,%ecx) - movdqa %xmm0,16(%edi,%ecx) - movdqa %xmm0,32(%edi,%ecx) - movdqa %xmm0,48(%edi,%ecx) - addl $64,%ecx - jne 1b - - jmp Lshort - - - - COMMPAGE_DESCRIPTOR(bzero_sse42,_COMM_PAGE_BZERO,kHasSSE4_2,0) diff --git a/osfmk/i386/commpage/bzero_sse42_64.s b/osfmk/i386/commpage/bzero_sse42_64.s deleted file mode 100644 index 999b9311a..000000000 --- a/osfmk/i386/commpage/bzero_sse42_64.s +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) 2008 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include - -/* - * Bzero, tuned for processors with SSE4.2 and 64-byte cache lines, ie Nehalem. 
- * We don't actually use SSE4.2, but rather use it to identify Nehalem. - * This is the 64-bit version. - * - * We do not use nontemporal operations, but use MOVDQA in preference to REP/STOS. - * - * This routine is also used for memset(p,0,n), which is a common case - * since gcc sometimes silently maps bzero() into memset(). As a result, - * we always load the original ptr into %eax before returning. - */ - -#define kShort 80 // too short to bother with SSE (must be >=80) - - -// void bzero(void *b, size_t len); - -COMMPAGE_FUNCTION_START(bzero_sse42_64, 64, 5) - pushq %rbp // set up a frame for backtraces - movq %rsp,%rbp - xorl %eax,%eax // set fill data to 0 - movq %rdi,%r11 // save original ptr as return value - cmpq $(kShort),%rsi // long enough for SSE? - jg LNotShort // yes - -// Here for short operands or the end of long ones. -// %esi = length (<= kShort) -// %rdi = ptr -// %eax = zero - -Lshort: - cmpl $12,%esi // long enough to word align? - jge 3f // yes - test %esi,%esi // length==0? - jz 6f -1: - movb %al,(%rdi) // zero a byte - incq %rdi - decl %esi - jnz 1b - jmp 6f -2: - movb %al,(%rdi) // zero a byte - incq %rdi - decl %esi -3: - testl $3,%edi // is ptr doubleword aligned? - jnz 2b // no - movl %esi,%ecx // copy length - shrl $2,%esi // #doublewords to store -4: - movl %eax,(%rdi) // zero an aligned doubleword - addq $4,%rdi - decl %esi - jnz 4b - andl $3,%ecx // mask down to #bytes at end (0..3) - jz 6f // none -5: - movb %al,(%rdi) // zero a byte - incq %rdi - decl %ecx - jnz 5b -6: - movq %r11,%rax // set return value in case this was a call of memset() - popq %rbp - ret - - -// We will be using SSE, so align ptr. -// %rsi = length (> kShort) -// %rdi = ptr -// %eax = zero - -LNotShort: - testl $3,%edi // 4-byte aligned? 
- jz 2f // yes - movb %al,(%rdi) // zero another byte - incq %rdi - decq %rsi - jmp LNotShort -1: // zero doublewords until 16-byte aligned - movl %eax,(%rdi) - addq $4,%rdi - subq $4,%rsi -2: - testl $15,%edi // 16-byte aligned? - jnz 1b // no - -// Destination is now 16-byte aligned. Prepare to loop over 64-byte chunks. -// %rsi = length (> (kShort-15)) -// %rdi = ptr (aligned) -// %eax = zero - -LDestAligned: - movq %rsi,%rcx - andl $63,%esi // mask down to residual length (0..63) - andq $-64,%rcx // get #bytes we will zero in this loop - pxor %xmm0,%xmm0 // zero an SSE register - addq %rcx,%rdi // increment ptr by length to move - negq %rcx // negate length to move - jmp 1f - -// Loop over 64-byte chunks, storing into cache. - - .align 4,0x90 // keep inner loops 16-byte aligned -1: - movdqa %xmm0,(%rdi,%rcx) - movdqa %xmm0,16(%rdi,%rcx) - movdqa %xmm0,32(%rdi,%rcx) - movdqa %xmm0,48(%rdi,%rcx) - addq $64,%rcx - jne 1b - - jmp Lshort - - - COMMPAGE_DESCRIPTOR(bzero_sse42_64,_COMM_PAGE_BZERO,kHasSSE4_2,0) diff --git a/osfmk/i386/commpage/cacheflush.s b/osfmk/i386/commpage/cacheflush.s deleted file mode 100644 index 4d9e98b0b..000000000 --- a/osfmk/i386/commpage/cacheflush.s +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2003-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. 
- * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include - -// void sysFlushDcache( void *p, size_t len ); -// 32-bit version - -COMMPAGE_FUNCTION_START(sys_flush_dcache, 32, 4) - movl 8(%esp),%ecx // get length - movl 4(%esp),%edx // get ptr - testl %ecx,%ecx // length 0? - jz 2f // yes - mfence // ensure previous stores make it to memory - clflush -1(%edx,%ecx) // make sure last line is flushed -1: - clflush (%edx) // flush a line - addl $64,%edx - subl $64,%ecx - ja 1b - mfence // make sure memory is updated before we return -2: - ret -COMMPAGE_DESCRIPTOR(sys_flush_dcache,_COMM_PAGE_FLUSH_DCACHE,kCache64,0) - - -// void sysFlushDcache( void *p, size_t len ); -// 64-bit version -// %rdi = ptr, %rsi = length -COMMPAGE_FUNCTION_START(sys_flush_dcache_64, 64, 4) - testq %rsi,%rsi // length 0? 
- jz 2f // yes - mfence // ensure previous stores make it to memory - clflush -1(%rdi,%rsi) // make sure last line is flushed -1: - clflush (%rdi) // flush a line - addq $64,%rdi - subq $64,%rsi - ja 1b - mfence // make sure memory is updated before we return -2: - ret -COMMPAGE_DESCRIPTOR(sys_flush_dcache_64,_COMM_PAGE_FLUSH_DCACHE,kCache64,0) - -// void sysIcacheInvalidate( void *p, size_t len ); - -COMMPAGE_FUNCTION_START(sys_icache_invalidate, 32, 4) - // This is a NOP on intel processors, since the intent of the API - // is to make data executable, and Intel L1Is are coherent with L1D. - // We can use same routine both in 32 and 64-bit mode, since it is - // just a RET instruction. - ret -COMMPAGE_DESCRIPTOR(sys_icache_invalidate,_COMM_PAGE_FLUSH_ICACHE,0,0) diff --git a/osfmk/i386/commpage/commpage.c b/osfmk/i386/commpage/commpage.c index 53030645b..cc52576c5 100644 --- a/osfmk/i386/commpage/commpage.c +++ b/osfmk/i386/commpage/commpage.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2008 Apple Inc. All rights reserved. + * Copyright (c) 2003-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -52,7 +52,7 @@ #include #include #include -#include +#include #include #include #include @@ -66,21 +66,18 @@ #include #include +#include /* the lists of commpage routines are in commpage_asm.s */ extern commpage_descriptor* commpage_32_routines[]; extern commpage_descriptor* commpage_64_routines[]; -/* translated commpage descriptors from commpage_sigs.c */ -extern commpage_descriptor sigdata_descriptor; -extern commpage_descriptor *ba_descriptors[]; - extern vm_map_t commpage32_map; // the shared submap, set up in vm init extern vm_map_t commpage64_map; // the shared submap, set up in vm init char *commPagePtr32 = NULL; // virtual addr in kernel map of 32-bit commpage char *commPagePtr64 = NULL; // ...and of 64-bit commpage -int _cpu_capabilities = 0; // define the capability vector +uint32_t _cpu_capabilities = 0; // define the capability vector int noVMX = 0; /* if true, do not set kHasAltivec in ppc _cpu_capabilities */ @@ -96,6 +93,8 @@ static commpage_address_t commPageBaseOffset; // subtract from 32-bit runtime ad static commpage_time_data *time_data32 = NULL; static commpage_time_data *time_data64 = NULL; +decl_simple_lock_data(static,commpage_active_cpus_lock); + /* Allocate the commpage and add to the shared submap created by vm: * 1. allocate a page in the kernel map (RW) * 2. 
wire it down @@ -157,6 +156,13 @@ commpage_allocate( panic("cannot map commpage"); ipc_port_release(handle); + + // Initialize the text section of the commpage with INT3 + char *commpage_ptr = (char*)(intptr_t)kernel_addr; + vm_size_t i; + for( i = _COMM_PAGE_TEXT_START - _COMM_PAGE_START_ADDRESS; i < size; i++ ) + // This is the hex for the X86 opcode INT3 + commpage_ptr[i] = 0xCC; return (void*)(intptr_t)kernel_addr; // return address in kernel map } @@ -193,7 +199,7 @@ commpage_cpus( void ) static void commpage_init_cpu_capabilities( void ) { - int bits; + uint32_t bits; int cpus; ml_cpu_info_t cpu_info; @@ -201,6 +207,9 @@ commpage_init_cpu_capabilities( void ) ml_cpu_get_info(&cpu_info); switch (cpu_info.vector_unit) { + case 9: + bits |= kHasAVX1_0; + /* fall thru */ case 8: bits |= kHasSSE4_2; /* fall thru */ @@ -275,48 +284,13 @@ commpage_stuff( void *dest = commpage_addr_of(address); if (address < next) - panic("commpage overlap at address 0x%p, 0x%x < 0x%x", dest, address, next); + panic("commpage overlap at address 0x%p, 0x%x < 0x%x", dest, address, next); bcopy(source,dest,length); next = address + length; } -static void -commpage_stuff_swap( - commpage_address_t address, - void *source, - int length, - int legacy ) -{ - if ( legacy ) { - void *dest = commpage_addr_of(address); - dest = (void *)((uintptr_t) dest + _COMM_PAGE_SIGS_OFFSET); - switch (length) { - case 2: - OSWriteSwapInt16(dest, 0, *(uint16_t *)source); - break; - case 4: - OSWriteSwapInt32(dest, 0, *(uint32_t *)source); - break; - case 8: - OSWriteSwapInt64(dest, 0, *(uint64_t *)source); - break; - } - } -} - -static void -commpage_stuff2( - commpage_address_t address, - void *source, - int length, - int legacy ) -{ - commpage_stuff_swap(address, source, length, legacy); - commpage_stuff(address, source, length); -} - /* Copy a routine into comm page if it matches running machine. */ static void @@ -345,8 +319,6 @@ commpage_stuff_routine( } /* Fill in the 32- or 64-bit commpage. 
Called once for each. - * The 32-bit ("legacy") commpage has a bunch of stuff added to it - * for translated processes, some of which is byte-swapped. */ static void @@ -356,17 +328,16 @@ commpage_populate_one( size_t area_used, // _COMM_PAGE32_AREA_USED or _COMM_PAGE64_AREA_USED commpage_address_t base_offset, // will become commPageBaseOffset commpage_descriptor** commpage_routines, // list of routine ptrs for this commpage - boolean_t legacy, // true if 32-bit commpage commpage_time_data** time_data, // &time_data32 or &time_data64 const char* signature ) // "commpage 32-bit" or "commpage 64-bit" { + uint8_t c1; short c2; - int c4; - static double two52 = 1048576.0 * 1048576.0 * 4096.0; // 2**52 - static double ten6 = 1000000.0; // 10**6 + int c4; + uint64_t c8; + uint32_t cfamily; commpage_descriptor **rd; short version = _COMM_PAGE_THIS_VERSION; - int swapcaps; next = 0; cur_routine = 0; @@ -380,25 +351,11 @@ commpage_populate_one( * ascending order, so we can check for overlap and panic if so. */ commpage_stuff(_COMM_PAGE_SIGNATURE,signature,(int)strlen(signature)); - commpage_stuff2(_COMM_PAGE_VERSION,&version,sizeof(short),legacy); + commpage_stuff(_COMM_PAGE_VERSION,&version,sizeof(short)); commpage_stuff(_COMM_PAGE_CPU_CAPABILITIES,&_cpu_capabilities,sizeof(int)); - /* excuse our magic constants, we cannot include ppc/cpu_capabilities.h */ - /* always set kCache32 and kDcbaAvailable */ - swapcaps = 0x44; - if ( _cpu_capabilities & kUP ) - swapcaps |= (kUP + (1 << kNumCPUsShift)); - else - swapcaps |= 2 << kNumCPUsShift; /* limit #cpus to 2 */ - if ( ! noVMX ) /* if rosetta will be emulating altivec... 
*/ - swapcaps |= 0x101; /* ...then set kHasAltivec and kDataStreamsAvailable too */ - commpage_stuff_swap(_COMM_PAGE_CPU_CAPABILITIES, &swapcaps, sizeof(int), legacy); - c2 = 32; - commpage_stuff_swap(_COMM_PAGE_CACHE_LINESIZE,&c2,2,legacy); - - if (_cpu_capabilities & kCache32) - c2 = 32; - else if (_cpu_capabilities & kCache64) + c2 = 32; // default + if (_cpu_capabilities & kCache64) c2 = 64; else if (_cpu_capabilities & kCache128) c2 = 128; @@ -407,10 +364,17 @@ commpage_populate_one( c4 = MP_SPIN_TRIES; commpage_stuff(_COMM_PAGE_SPIN_COUNT,&c4,4); - if ( legacy ) { - commpage_stuff2(_COMM_PAGE_2_TO_52,&two52,8,legacy); - commpage_stuff2(_COMM_PAGE_10_TO_6,&ten6,8,legacy); - } + /* machine_info valid after ml_get_max_cpus() */ + c1 = machine_info.physical_cpu_max; + commpage_stuff(_COMM_PAGE_PHYSICAL_CPUS,&c1,1); + c1 = machine_info.logical_cpu_max; + commpage_stuff(_COMM_PAGE_LOGICAL_CPUS,&c1,1); + + c8 = ml_cpu_cache_size(0); + commpage_stuff(_COMM_PAGE_MEMORY_SIZE, &c8, 8); + + cfamily = cpuid_info()->cpuid_cpufamily; + commpage_stuff(_COMM_PAGE_CPUFAMILY, &cfamily, 4); for( rd = commpage_routines; *rd != NULL ; rd++ ) commpage_stuff_routine(*rd); @@ -421,14 +385,6 @@ commpage_populate_one( if (next > _COMM_PAGE_END) panic("commpage overflow: next = 0x%08x, commPagePtr = 0x%p", next, commPagePtr); - if ( legacy ) { - next = 0; - for( rd = ba_descriptors; *rd != NULL ; rd++ ) - commpage_stuff_routine(*rd); - - next = 0; - commpage_stuff_routine(&sigdata_descriptor); - } } @@ -449,7 +405,6 @@ commpage_populate( void ) _COMM_PAGE32_AREA_USED, _COMM_PAGE32_BASE_ADDRESS, commpage_32_routines, - TRUE, /* legacy (32-bit) commpage */ &time_data32, "commpage 32-bit"); #ifndef __LP64__ @@ -464,7 +419,6 @@ commpage_populate( void ) _COMM_PAGE64_AREA_USED, _COMM_PAGE32_START_ADDRESS, /* commpage address are relative to 32-bit commpage placement */ commpage_64_routines, - FALSE, /* not a legacy commpage */ &time_data64, "commpage 64-bit"); #ifndef __LP64__ @@ -473,6 
+427,9 @@ commpage_populate( void ) #endif } + simple_lock_init(&commpage_active_cpus_lock, 0); + + commpage_update_active_cpus(); rtc_nanotime_init_commpage(); } @@ -629,6 +586,34 @@ commpage_set_spin_count( } +/* Updated every time a logical CPU goes offline/online */ +void +commpage_update_active_cpus(void) +{ + char *cp; + volatile uint8_t *ip; + + /* At least 32-bit commpage must be initialized */ + if (!commPagePtr32) + return; + + simple_lock(&commpage_active_cpus_lock); + + cp = commPagePtr32; + cp += (_COMM_PAGE_ACTIVE_CPUS - _COMM_PAGE32_BASE_ADDRESS); + ip = (volatile uint8_t*) cp; + *ip = (uint8_t) processor_avail_count; + + cp = commPagePtr64; + if ( cp ) { + cp += (_COMM_PAGE_ACTIVE_CPUS - _COMM_PAGE32_START_ADDRESS); + ip = (volatile uint8_t*) cp; + *ip = (uint8_t) processor_avail_count; + } + + simple_unlock(&commpage_active_cpus_lock); +} + /* Check to see if a given address is in the Preemption Free Zone (PFZ) */ diff --git a/osfmk/i386/commpage/commpage.h b/osfmk/i386/commpage/commpage.h index 013ca246e..c8369d78d 100644 --- a/osfmk/i386/commpage/commpage.h +++ b/osfmk/i386/commpage/commpage.h @@ -160,6 +160,7 @@ extern void commpage_set_nanotime(uint64_t tsc_base, uint64_t ns_base, uint32_t extern void commpage_set_memory_pressure(unsigned int pressure); extern void commpage_set_spin_count(unsigned int count); extern void commpage_sched_gen_inc(void); +extern void commpage_update_active_cpus(void); extern uint32_t commpage_is_in_pfz32(uint32_t); extern uint32_t commpage_is_in_pfz64(addr64_t); diff --git a/osfmk/i386/commpage/commpage_asm.s b/osfmk/i386/commpage/commpage_asm.s index 4e3ad82e2..af6227f72 100644 --- a/osfmk/i386/commpage/commpage_asm.s +++ b/osfmk/i386/commpage/commpage_asm.s @@ -90,49 +90,8 @@ _commpage_sched_gen_inc: .align 3 .globl _commpage_32_routines _commpage_32_routines: - COMMPAGE_DESCRIPTOR_REFERENCE(compare_and_swap32_mp) - COMMPAGE_DESCRIPTOR_REFERENCE(compare_and_swap32_up) - 
COMMPAGE_DESCRIPTOR_REFERENCE(compare_and_swap64_mp) - COMMPAGE_DESCRIPTOR_REFERENCE(compare_and_swap64_up) - COMMPAGE_DESCRIPTOR_REFERENCE(AtomicEnqueue) - COMMPAGE_DESCRIPTOR_REFERENCE(AtomicDequeue) - COMMPAGE_DESCRIPTOR_REFERENCE(memory_barrier) - COMMPAGE_DESCRIPTOR_REFERENCE(memory_barrier_sse2) - COMMPAGE_DESCRIPTOR_REFERENCE(atomic_add32_mp) - COMMPAGE_DESCRIPTOR_REFERENCE(atomic_add32_up) - COMMPAGE_DESCRIPTOR_REFERENCE(cpu_number) - COMMPAGE_DESCRIPTOR_REFERENCE(mach_absolute_time) - COMMPAGE_DESCRIPTOR_REFERENCE(spin_lock_try_mp) - COMMPAGE_DESCRIPTOR_REFERENCE(spin_lock_try_up) - COMMPAGE_DESCRIPTOR_REFERENCE(spin_lock_mp) - COMMPAGE_DESCRIPTOR_REFERENCE(spin_lock_up) - COMMPAGE_DESCRIPTOR_REFERENCE(spin_unlock) - COMMPAGE_DESCRIPTOR_REFERENCE(pthread_getspecific) - COMMPAGE_DESCRIPTOR_REFERENCE(gettimeofday) - COMMPAGE_DESCRIPTOR_REFERENCE(sys_flush_dcache) - COMMPAGE_DESCRIPTOR_REFERENCE(sys_icache_invalidate) - COMMPAGE_DESCRIPTOR_REFERENCE(pthread_self) COMMPAGE_DESCRIPTOR_REFERENCE(preempt) -// COMMPAGE_DESCRIPTOR_REFERENCE(relinquish) - COMMPAGE_DESCRIPTOR_REFERENCE(bit_test_and_set_mp) - COMMPAGE_DESCRIPTOR_REFERENCE(bit_test_and_set_up) - COMMPAGE_DESCRIPTOR_REFERENCE(bit_test_and_clear_mp) - COMMPAGE_DESCRIPTOR_REFERENCE(bit_test_and_clear_up) - COMMPAGE_DESCRIPTOR_REFERENCE(bzero_scalar) - COMMPAGE_DESCRIPTOR_REFERENCE(bzero_sse2) - COMMPAGE_DESCRIPTOR_REFERENCE(bzero_sse42) - COMMPAGE_DESCRIPTOR_REFERENCE(bcopy_scalar) - COMMPAGE_DESCRIPTOR_REFERENCE(bcopy_sse2) - COMMPAGE_DESCRIPTOR_REFERENCE(bcopy_sse3x) - COMMPAGE_DESCRIPTOR_REFERENCE(bcopy_sse42) - COMMPAGE_DESCRIPTOR_REFERENCE(memset_pattern_sse2) - COMMPAGE_DESCRIPTOR_REFERENCE(longcopy_sse3x) COMMPAGE_DESCRIPTOR_REFERENCE(backoff) - COMMPAGE_DESCRIPTOR_REFERENCE(AtomicFifoEnqueue) - COMMPAGE_DESCRIPTOR_REFERENCE(AtomicFifoDequeue) - COMMPAGE_DESCRIPTOR_REFERENCE(nanotime) - COMMPAGE_DESCRIPTOR_REFERENCE(nanotime_slow) - COMMPAGE_DESCRIPTOR_REFERENCE(pthread_mutex_lock) 
COMMPAGE_DESCRIPTOR_REFERENCE(pfz_enqueue) COMMPAGE_DESCRIPTOR_REFERENCE(pfz_dequeue) COMMPAGE_DESCRIPTOR_REFERENCE(pfz_mutex_lock) @@ -151,45 +110,8 @@ _commpage_32_routines: .align 3 .globl _commpage_64_routines _commpage_64_routines: - COMMPAGE_DESCRIPTOR_REFERENCE(compare_and_swap32_mp_64) - COMMPAGE_DESCRIPTOR_REFERENCE(compare_and_swap32_up_64) - COMMPAGE_DESCRIPTOR_REFERENCE(compare_and_swap64_mp_64) - COMMPAGE_DESCRIPTOR_REFERENCE(compare_and_swap64_up_64) - COMMPAGE_DESCRIPTOR_REFERENCE(AtomicEnqueue_64) - COMMPAGE_DESCRIPTOR_REFERENCE(AtomicDequeue_64) - COMMPAGE_DESCRIPTOR_REFERENCE(memory_barrier_sse2) /* same routine as 32-bit version */ - COMMPAGE_DESCRIPTOR_REFERENCE(atomic_add32_mp_64) - COMMPAGE_DESCRIPTOR_REFERENCE(atomic_add32_up_64) - COMMPAGE_DESCRIPTOR_REFERENCE(atomic_add64_mp_64) - COMMPAGE_DESCRIPTOR_REFERENCE(atomic_add64_up_64) - COMMPAGE_DESCRIPTOR_REFERENCE(cpu_number_64) - COMMPAGE_DESCRIPTOR_REFERENCE(mach_absolute_time) - COMMPAGE_DESCRIPTOR_REFERENCE(spin_lock_try_mp_64) - COMMPAGE_DESCRIPTOR_REFERENCE(spin_lock_try_up_64) - COMMPAGE_DESCRIPTOR_REFERENCE(spin_lock_mp_64) - COMMPAGE_DESCRIPTOR_REFERENCE(spin_lock_up_64) - COMMPAGE_DESCRIPTOR_REFERENCE(spin_unlock_64) - COMMPAGE_DESCRIPTOR_REFERENCE(pthread_getspecific_64) - COMMPAGE_DESCRIPTOR_REFERENCE(gettimeofday_64) - COMMPAGE_DESCRIPTOR_REFERENCE(sys_flush_dcache_64) - COMMPAGE_DESCRIPTOR_REFERENCE(sys_icache_invalidate) /* same routine as 32-bit version, just a "ret" */ - COMMPAGE_DESCRIPTOR_REFERENCE(pthread_self_64) COMMPAGE_DESCRIPTOR_REFERENCE(preempt_64) - COMMPAGE_DESCRIPTOR_REFERENCE(bit_test_and_set_mp_64) - COMMPAGE_DESCRIPTOR_REFERENCE(bit_test_and_set_up_64) - COMMPAGE_DESCRIPTOR_REFERENCE(bit_test_and_clear_mp_64) - COMMPAGE_DESCRIPTOR_REFERENCE(bit_test_and_clear_up_64) - COMMPAGE_DESCRIPTOR_REFERENCE(bzero_sse2_64) - COMMPAGE_DESCRIPTOR_REFERENCE(bzero_sse42_64) - COMMPAGE_DESCRIPTOR_REFERENCE(bcopy_sse3x_64) - COMMPAGE_DESCRIPTOR_REFERENCE(bcopy_sse42_64) - 
COMMPAGE_DESCRIPTOR_REFERENCE(memset_pattern_sse2_64) - COMMPAGE_DESCRIPTOR_REFERENCE(longcopy_sse3x_64) COMMPAGE_DESCRIPTOR_REFERENCE(backoff_64) - COMMPAGE_DESCRIPTOR_REFERENCE(AtomicFifoEnqueue_64) - COMMPAGE_DESCRIPTOR_REFERENCE(AtomicFifoDequeue_64) - COMMPAGE_DESCRIPTOR_REFERENCE(nanotime_64) - COMMPAGE_DESCRIPTOR_REFERENCE(pthread_mutex_lock_64) COMMPAGE_DESCRIPTOR_REFERENCE(pfz_enqueue_64) COMMPAGE_DESCRIPTOR_REFERENCE(pfz_dequeue_64) COMMPAGE_DESCRIPTOR_REFERENCE(pfz_mutex_lock_64) diff --git a/osfmk/i386/commpage/commpage_gettimeofday.s b/osfmk/i386/commpage/commpage_gettimeofday.s deleted file mode 100644 index afa87ca02..000000000 --- a/osfmk/i386/commpage/commpage_gettimeofday.s +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 2003-2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include - -#define NSEC_PER_SEC 1000*1000*1000 -#define NSEC_PER_USEC 1000 - -COMMPAGE_FUNCTION_START(gettimeofday, 32, 4) - push %ebp - mov %esp,%ebp - push %esi - push %ebx - -0: - movl _COMM_PAGE_GTOD_GENERATION,%esi /* get generation (0 if disabled) */ - testl %esi,%esi /* disabled? */ - jz 4f - - mov $ _COMM_PAGE_NANOTIME,%eax - call *%eax /* get ns in %edx:%eax */ - - - sub _COMM_PAGE_GTOD_NS_BASE,%eax - sbb _COMM_PAGE_GTOD_NS_BASE+4,%edx - mov _COMM_PAGE_GTOD_SEC_BASE,%ebx /* load all the data before checking generation */ - mov $ NSEC_PER_SEC,%ecx - - cmpl _COMM_PAGE_GTOD_GENERATION,%esi /* has time data changed out from under us? */ - jne 0b - - div %ecx - add %eax,%ebx - - mov $ NSEC_PER_USEC,%ecx - mov %edx,%eax - xor %edx,%edx - div %ecx - - mov 8(%ebp),%ecx - mov %ebx,(%ecx) - mov %eax,4(%ecx) - xor %eax,%eax - -3: - pop %ebx - pop %esi - pop %ebp - ret -4: /* fail */ - movl $1,%eax - jmp 3b -COMMPAGE_DESCRIPTOR(gettimeofday,_COMM_PAGE_GETTIMEOFDAY,0,0) - - -COMMPAGE_FUNCTION_START(gettimeofday_64, 64, 4) - // %rdi = ptr to timeval - pushq %rbp // set up a frame for backtraces - movq %rsp,%rbp - movq %rdi,%r9 // save ptr to timeval - movq $_COMM_PAGE_32_TO_64(_COMM_PAGE_TIME_DATA_START),%r10 -0: - movl _GTOD_GENERATION(%r10),%r11d // get generation (0 if disabled) - testl %r11d,%r11d // disabled? - jz 4f - - movq $_COMM_PAGE_32_TO_64(_COMM_PAGE_NANOTIME),%rax - call *%rax // get %rax <- nanotime(), preserving %r9, %r10 and %r11 - - movl _GTOD_SEC_BASE(%r10),%r8d // get _COMM_PAGE_TIMESTAMP - subq _GTOD_NS_BASE(%r10),%rax // generate nanoseconds since timestamp - cmpl _GTOD_GENERATION(%r10),%r11d // has data changed out from under us? 
- jne 0b - - movl $ NSEC_PER_SEC,%ecx - movq %rax,%rdx - shrq $32,%rdx // get high half of delta in %edx - divl %ecx // %eax <- seconds since timestamp, %edx <- nanoseconds - addl %eax,%r8d // add seconds elapsed to timestamp seconds - - movl $ NSEC_PER_USEC,%ecx - movl %edx,%eax - xorl %edx,%edx - divl %ecx // divide residual ns by 1000 to get residual us in %eax - - movq %r8,(%r9) // store 64-bit seconds into timeval - movl %eax,8(%r9) // store 32-bit useconds into timeval - xorl %eax,%eax // return 0 for success -3: - popq %rbp - ret -4: // fail - movl $1,%eax - jmp 3b -COMMPAGE_DESCRIPTOR(gettimeofday_64,_COMM_PAGE_GETTIMEOFDAY,0,0) diff --git a/osfmk/i386/commpage/commpage_mach_absolute_time.s b/osfmk/i386/commpage/commpage_mach_absolute_time.s deleted file mode 100644 index 590e4d7b6..000000000 --- a/osfmk/i386/commpage/commpage_mach_absolute_time.s +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Copyright (c) 2003-2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include -#include - -#include - -COMMPAGE_FUNCTION_START(mach_absolute_time, 32, 4) - int $0x3 - ret -COMMPAGE_DESCRIPTOR(mach_absolute_time,_COMM_PAGE_ABSOLUTE_TIME,0,0) - - -/* return nanotime in %edx:%eax */ - -COMMPAGE_FUNCTION_START(nanotime, 32, 4) - pushl %ebp - movl %esp,%ebp - pushl %esi - pushl %ebx - -0: - movl _COMM_PAGE_NT_GENERATION,%esi /* get generation (0 if being changed) */ - testl %esi,%esi /* if being updated, loop until stable */ - jz 0b - - lfence - rdtsc /* get TSC in %edx:%eax */ - lfence - - subl _COMM_PAGE_NT_TSC_BASE,%eax - sbbl _COMM_PAGE_NT_TSC_BASE+4,%edx - - movl _COMM_PAGE_NT_SCALE,%ecx - - movl %edx,%ebx - mull %ecx - movl %ebx,%eax - movl %edx,%ebx - mull %ecx - addl %ebx,%eax - adcl $0,%edx - - addl _COMM_PAGE_NT_NS_BASE,%eax - adcl _COMM_PAGE_NT_NS_BASE+4,%edx - - cmpl _COMM_PAGE_NT_GENERATION,%esi /* have the parameters changed? 
*/ - jne 0b /* yes, loop until stable */ - - popl %ebx - popl %esi - popl %ebp - ret -COMMPAGE_DESCRIPTOR(nanotime,_COMM_PAGE_NANOTIME,0,kSlow) - - -/* nanotime routine for machines slower than ~1Gz (SLOW_TSC_THRESHOLD) */ -COMMPAGE_FUNCTION_START(nanotime_slow, 32, 4) - push %ebp - mov %esp,%ebp - push %esi - push %edi - push %ebx - -0: - movl _COMM_PAGE_NT_GENERATION,%esi - testl %esi,%esi /* if generation is 0, data being changed */ - jz 0b /* so loop until stable */ - - lfence - rdtsc /* get TSC in %edx:%eax */ - lfence - subl _COMM_PAGE_NT_TSC_BASE,%eax - sbbl _COMM_PAGE_NT_TSC_BASE+4,%edx - - pushl %esi /* save generation */ - /* - * Do the math to convert tsc ticks to nanoseconds. We first - * do long multiply of 1 billion times the tsc. Then we do - * long division by the tsc frequency - */ - mov $1000000000, %ecx /* number of nanoseconds in a second */ - mov %edx, %ebx - mul %ecx - mov %edx, %edi - mov %eax, %esi - mov %ebx, %eax - mul %ecx - add %edi, %eax - adc $0, %edx /* result in edx:eax:esi */ - mov %eax, %edi - mov _COMM_PAGE_NT_SHIFT,%ecx /* overloaded as the low 32 tscFreq */ - xor %eax, %eax - xchg %edx, %eax - div %ecx - xor %eax, %eax - mov %edi, %eax - div %ecx - mov %eax, %ebx - mov %esi, %eax - div %ecx - mov %ebx, %edx /* result in edx:eax */ - popl %esi /* recover generation */ - - add _COMM_PAGE_NT_NS_BASE,%eax - adc _COMM_PAGE_NT_NS_BASE+4,%edx - - cmpl _COMM_PAGE_NT_GENERATION,%esi /* have the parameters changed? */ - jne 0b /* yes, loop until stable */ - - pop %ebx - pop %edi - pop %esi - pop %ebp - ret /* result in edx:eax */ -COMMPAGE_DESCRIPTOR(nanotime_slow,_COMM_PAGE_NANOTIME,kSlow,0) - - -/* The 64-bit version. We return the 64-bit nanotime in %rax, - * and by convention we must preserve %r9, %r10, and %r11. 
- */ -COMMPAGE_FUNCTION_START(nanotime_64, 64, 4) - pushq %rbp // set up a frame for backtraces - movq %rsp,%rbp - movq $_COMM_PAGE_32_TO_64(_COMM_PAGE_TIME_DATA_START),%rsi -1: - movl _NT_GENERATION(%rsi),%r8d // get generation - testl %r8d,%r8d // if 0, data is being changed... - jz 1b // ...so loop until stable - lfence - rdtsc // edx:eax := tsc - lfence - shlq $32,%rdx // rax := ((edx << 32) | eax), ie 64-bit tsc - orq %rdx,%rax - subq _NT_TSC_BASE(%rsi), %rax // rax := (tsc - base_tsc) - movl _NT_SCALE(%rsi),%ecx - mulq %rcx // rdx:rax := (tsc - base_tsc) * scale - shrdq $32,%rdx,%rax // _COMM_PAGE_NT_SHIFT is always 32 - addq _NT_NS_BASE(%rsi),%rax // (((tsc - base_tsc) * scale) >> 32) + ns_base - - cmpl _NT_GENERATION(%rsi),%r8d // did the data change during computation? - jne 1b - popq %rbp - ret -COMMPAGE_DESCRIPTOR(nanotime_64,_COMM_PAGE_NANOTIME,0,kSlow) diff --git a/osfmk/i386/commpage/commpage_sigs.c b/osfmk/i386/commpage/commpage_sigs.c deleted file mode 100644 index 0c100a276..000000000 --- a/osfmk/i386/commpage/commpage_sigs.c +++ /dev/null @@ -1,189 +0,0 @@ -#include "commpage.h" - -static unsigned int sigdata[] = -{ - 0x06004018, 0x01000000, 0x6d6f635f, 0x65726170, 0x646e615f, 0x6177735f, - 0x00323370, 0x06004018, 0x2828e07c, 0x4018077c, 0x14008240, 0x2d29807c, - 0xf0ffa240, 0x01006038, 0x2000804e, 0x00006038, 0x2000804e, 0x06004018, - 0x00000000, 0x6d6f635f, 0x65726170, 0x646e615f, 0x6177735f, 0x00323370, - 0x06004018, 0x06004018, 0x01000000, 0x6d6f635f, 0x65726170, 0x646e615f, - 0x6177735f, 0x62323370, 0x00000000, 0x06004018, 0xac06007c, 0x2828e07c, - 0x4018077c, 0x18008240, 0x2d29807c, 0xf0ffa240, 0x2c01004c, 0x01006038, - 0x2000804e, 0x00006038, 0x2000804e, 0x2000804e, 0x06004018, 0x00000000, - 0x6d6f635f, 0x65726170, 0x646e615f, 0x6177735f, 0x62323370, 0x00000000, - 0x06004018, 0x06004018, 0x01000000, 0x6d6f635f, 0x65726170, 0x646e615f, - 0x6177735f, 0x62343670, 0x00000000, 0x06004018, 0xac04207c, 0xa828e07c, - 0x4018277c, 0x1800c240, 
0xad29807c, 0xf0ffc240, 0x2c01004c, 0x01006038, - 0x2000804e, 0xf8ff0039, 0x00006038, 0xad09887c, 0x2000804e, 0x2000804e, - 0x06004018, 0x00000000, 0x6d6f635f, 0x65726170, 0x646e615f, 0x6177735f, - 0x62343670, 0x00000000, 0x06004018, 0x06004018, 0x01000000, 0x6d656d5f, - 0x5f746573, 0x74746170, 0x006e7265, 0x06004018, 0xc0008428, 0xa642407c, - 0x00804064, 0x10000571, 0xa643007c, 0x1000a038, 0xce48007c, 0x2000c038, - 0x7c008441, 0x30002039, 0x10008241, 0xce41007c, 0x10000839, 0xf0ff8438, - 0xbed18054, 0xec45007c, 0xbe068454, 0xffff0030, 0x40004039, 0x60006039, - 0xa603097c, 0xec45067c, 0x10000048, 0x00000060, 0x00000060, 0x00000060, - 0xec450a7c, 0xec450b7c, 0xce41007c, 0xce41057c, 0xce41067c, 0xce41097c, - 0x40000839, 0xe4ff0042, 0xce41007c, 0xce41057c, 0xce41067c, 0xce41097c, - 0x40000839, 0x7fd98054, 0xfe068454, 0x18008241, 0xa603097c, 0xce41007c, - 0xce41057c, 0x20000839, 0xf4ff0042, 0xa643407c, 0x2000804e, 0x2000804e, - 0x06004018, 0x00000000, 0x6d656d5f, 0x5f746573, 0x74746170, 0x006e7265, - 0x06004018, 0x06004018, 0x01000000, 0x6f74615f, 0x5f63696d, 0x75716e65, - 0x00657565, 0x06004018, 0x2818c07c, 0x2e29c47c, 0x00000060, 0x2d19807c, - 0x2000e24d, 0xecffff4b, 0x2000804e, 0x06004018, 0x00000000, 0x6f74615f, - 0x5f63696d, 0x75716e65, 0x00657565, 0x06004018, 0x06004018, 0x01000000, - 0x6f74615f, 0x5f63696d, 0x75716564, 0x00657565, 0x06004018, 0x781b657c, - 0x2828607c, 0x0000032c, 0x2000824d, 0x2e20c37c, 0x2d29c07c, 0xecffc240, - 0x00000060, 0x2000804e, 0x06004018, 0x00000000, 0x6f74615f, 0x5f63696d, - 0x75716564, 0x00657565, 0x06004018, 0x06004018, 0x01000000, 0x6f74615f, - 0x5f63696d, 0x72726162, 0x00726569, 0x06004018, 0x2000804e, 0x06004018, - 0x00000000, 0x6f74615f, 0x5f63696d, 0x72726162, 0x00726569, 0x06004018, - 0x06004018, 0x01000000, 0x6f74615f, 0x5f63696d, 0x5f646461, 0x00003233, - 0x06004018, 0x2820a07c, 0x142ac37c, 0x2d21c07c, 0xf4ffc240, 0x7833c37c, - 0x2000804e, 0x06004018, 0x00000000, 0x6f74615f, 0x5f63696d, 0x5f646461, - 0x00003233, 0x06004018, 
0x06004018, 0x01000000, 0x63616d5f, 0x62615f68, - 0x756c6f73, 0x745f6574, 0x00656d69, 0x06004018, 0x00004018, 0x04006000, - 0x00000000, 0x00000000, 0x63616d5f, 0x62615f68, 0x756c6f73, 0x745f6574, - 0x5f656d69, 0x68676968, 0x00000000, 0x00004018, 0x00004018, 0x04008000, - 0x00000000, 0x00000000, 0x63616d5f, 0x62615f68, 0x756c6f73, 0x745f6574, - 0x5f656d69, 0x00776f6c, 0x00004018, 0x2000804e, 0x06004018, 0x00000000, - 0x63616d5f, 0x62615f68, 0x756c6f73, 0x745f6574, 0x00656d69, 0x06004018, - 0x06004018, 0x01000000, 0x6970735f, 0x6f6c5f6e, 0x745f6b63, 0x00007972, - 0x06004018, 0x05004018, 0x04006000, 0x05004018, 0x00004018, 0x04006000, - 0x00000000, 0x00000000, 0x6970735f, 0x6f6c5f6e, 0x745f6b63, 0x775f7972, - 0x70706172, 0x00007265, 0x00004018, 0x2000804e, 0x06004018, 0x00000000, - 0x6970735f, 0x6f6c5f6e, 0x745f6b63, 0x00007972, 0x06004018, 0x06004018, - 0x01000000, 0x6970735f, 0x6f6c5f6e, 0x00006b63, 0x06004018, 0x05004018, - 0x04006000, 0x05004018, 0x00004018, 0x00000000, 0x00000000, 0x00000000, - 0x6970735f, 0x6f6c5f6e, 0x775f6b63, 0x70706172, 0x00007265, 0x00004018, - 0x2000804e, 0x06004018, 0x00000000, 0x6970735f, 0x6f6c5f6e, 0x00006b63, - 0x06004018, 0x06004018, 0x01000000, 0x6970735f, 0x6e755f6e, 0x6b636f6c, - 0x00000000, 0x06004018, 0x05004018, 0x04006000, 0x05004018, 0x00004018, - 0x00000000, 0x00000000, 0x00000000, 0x6970735f, 0x6e755f6e, 0x6b636f6c, - 0x00000000, 0x00004018, 0x2000804e, 0x06004018, 0x00000000, 0x6970735f, - 0x6e755f6e, 0x6b636f6c, 0x00000000, 0x06004018, 0x06004018, 0x01000000, - 0x6874705f, 0x64616572, 0x7465675f, 0x63657073, 0x63696669, 0x00000000, - 0x06004018, 0x02004018, 0xc082ffff, 0x02004018, 0x2000804e, 0x06004018, - 0x00000000, 0x6874705f, 0x64616572, 0x7465675f, 0x63657073, 0x63696669, - 0x00000000, 0x06004018, 0x06004018, 0x01000000, 0x7465675f, 0x656d6974, - 0x6164666f, 0x00000079, 0x06004018, 0x05004018, 0x04006000, 0x05004018, - 0x00004018, 0x04006000, 0x00000000, 0x00000000, 0x7465675f, 0x656d6974, - 0x6164666f, 0x72775f79, 
0x65707061, 0x00000072, 0x00004018, 0x2000804e, - 0x06004018, 0x00000000, 0x7465675f, 0x656d6974, 0x6164666f, 0x00000079, - 0x06004018, 0x06004018, 0x01000000, 0x7379735f, 0x6163645f, 0x5f656863, - 0x73756c66, 0x00000068, 0x06004018, 0x05004018, 0x04006000, 0x05004018, - 0x05004018, 0x04008000, 0x05004018, 0x00004018, 0x00000000, 0x00000000, - 0x00000000, 0x7379735f, 0x6163645f, 0x5f656863, 0x73756c66, 0x00000068, - 0x00004018, 0x2000804e, 0x06004018, 0x00000000, 0x7379735f, 0x6163645f, - 0x5f656863, 0x73756c66, 0x00000068, 0x06004018, 0x06004018, 0x01000000, - 0x7379735f, 0x6163695f, 0x5f656863, 0x61766e69, 0x6164696c, 0x00006574, - 0x06004018, 0x05004018, 0x04006000, 0x05004018, 0x05004018, 0x04008000, - 0x05004018, 0x00004018, 0x00000000, 0x00000000, 0x00000000, 0x7379735f, - 0x6163695f, 0x5f656863, 0x61766e69, 0x6164696c, 0x775f6574, 0x70706172, - 0x00007265, 0x00004018, 0x2000804e, 0x06004018, 0x00000000, 0x7379735f, - 0x6163695f, 0x5f656863, 0x61766e69, 0x6164696c, 0x00006574, 0x06004018, - 0x06004018, 0x01000000, 0x6874705f, 0x64616572, 0x6c65735f, 0x00000066, - 0x06004018, 0x02004018, 0x8085ffff, 0x02004018, 0x2000804e, 0x06004018, - 0x00000000, 0x6874705f, 0x64616572, 0x6c65735f, 0x00000066, 0x06004018, - 0x06004018, 0x01000000, 0x657a625f, 0x00006f72, 0x06004018, 0x05004018, - 0x04006000, 0x05004018, 0x05004018, 0x04008000, 0x05004018, 0x00004018, - 0x00000000, 0x00000000, 0x00000000, 0x657a625f, 0x00006f72, 0x00004018, - 0x2000804e, 0x06004018, 0x00000000, 0x657a625f, 0x00006f72, 0x06004018, - 0x06004018, 0x01000000, 0x6f63625f, 0x00007970, 0x06004018, 0x05004018, - 0x04006000, 0x05004018, 0x05004018, 0x04008000, 0x05004018, 0x05004018, - 0x0400a000, 0x05004018, 0x00004018, 0x00000000, 0x00000000, 0x00000000, - 0x6f63625f, 0x00007970, 0x00004018, 0x2000804e, 0x06004018, 0x00000000, - 0x6f63625f, 0x00007970, 0x06004018, 0x06004018, 0x01000000, 0x6d656d5f, - 0x65766f6d, 0x00000000, 0x06004018, 0x05004018, 0x04006000, 0x05004018, - 0x05004018, 0x04008000, 
0x05004018, 0x05004018, 0x0400a000, 0x05004018, - 0x00004018, 0x00000000, 0x00000000, 0x00000000, 0x6d656d5f, 0x65766f6d, - 0x00000000, 0x00004018, 0x2000804e, 0x06004018, 0x00000000, 0x6d656d5f, - 0x65766f6d, 0x00000000, 0x06004018, 0x06004018, 0x01000000, 0x6e616e5f, - 0x6d69746f, 0x00000065, 0x06004018, 0x00004018, 0x04006000, 0x00000000, - 0x00000000, 0x6e616e5f, 0x6d69746f, 0x69685f65, 0x00006867, 0x00004018, - 0x00004018, 0x04008000, 0x00000000, 0x00000000, 0x6e616e5f, 0x6d69746f, - 0x6f6c5f65, 0x00000077, 0x00004018, 0x2000804e, 0x06004018, 0x00000000, - 0x6e616e5f, 0x6d69746f, 0x00000065, 0x06004018, 0x06004018, 0x01000000, - 0x6a626f5f, 0x736d5f63, 0x6e655367, 0x00000064, 0x06004018, 0x00004018, - 0x00000000, 0x00000400, 0x00000000, 0x6a626f5f, 0x736d5f63, 0x6e655367, - 0x00000064, 0x00004018, 0x06004018, 0x00000000, 0x6a626f5f, 0x736d5f63, - 0x6e655367, 0x00000064, 0x06004018, 0x06004018, 0x01000000, 0x6a626f5f, - 0x73615f63, 0x6e676973, 0x6176695f, 0x00000072, 0x06004018, 0x00004018, - 0x00000000, 0x00000400, 0x00000000, 0x6a626f5f, 0x73615f63, 0x6e676973, - 0x6176695f, 0x65675f72, 0x6972656e, 0x00000063, 0x00004018, 0x06004018, - 0x00000000, 0x6a626f5f, 0x73615f63, 0x6e676973, 0x6176695f, 0x00000072, - 0x06004018, 0x06004018, 0x01000000, 0x6a626f5f, 0x73615f63, 0x6e676973, - 0x6f6c675f, 0x006c6162, 0x06004018, 0x00004018, 0x00000000, 0x00000400, - 0x00000000, 0x6a626f5f, 0x73615f63, 0x6e676973, 0x6f6c675f, 0x5f6c6162, - 0x656e6567, 0x00636972, 0x00004018, 0x06004018, 0x00000000, 0x6a626f5f, - 0x73615f63, 0x6e676973, 0x6f6c675f, 0x006c6162, 0x06004018, 0x06004018, - 0x01000000, 0x6a626f5f, 0x73615f63, 0x6e676973, 0x7274735f, 0x43676e6f, - 0x00747361, 0x06004018, 0x00004018, 0x00000000, 0x00000400, 0x00000000, - 0x6a626f5f, 0x73615f63, 0x6e676973, 0x7274735f, 0x43676e6f, 0x5f747361, - 0x656e6567, 0x00636972, 0x00004018, 0x06004018, 0x00000000, 0x6a626f5f, - 0x73615f63, 0x6e676973, 0x7274735f, 0x43676e6f, 0x00747361, 0x06004018, -}; -commpage_descriptor 
sigdata_descriptor = -{ sigdata, sizeof(sigdata), 0xffff3000, 0, 0 }; - -static unsigned int badata[] = -{ - 0xae3aff4b, 0x3e3aff4b, 0xce39ff4b, 0x7239ff4b, 0x0230ff4b, 0x5e32ff4b, - 0xb232ff4b, 0x0e33ff4b, 0x4a33ff4b, 0x9a33ff4b, 0x3a34ff4b, 0xae34ff4b, - 0x1635ff4b, 0x8235ff4b, 0xda35ff4b, 0x4e36ff4b, 0xd236ff4b, 0x6a37ff4b, - 0xb237ff4b, 0x1238ff4b, 0x7e38ff4b, 0x6630ff4b, 0xde30ff4b, 0x5e31ff4b, -}; -static commpage_descriptor badata_descriptor_ary[] = -{ - { &badata[ 0], 4, 0xfffefea0, 0, 0 }, - { &badata[ 1], 4, 0xfffefeb0, 0, 0 }, - { &badata[ 2], 4, 0xfffefec0, 0, 0 }, - { &badata[ 3], 4, 0xfffeff00, 0, 0 }, - { &badata[ 4], 4, 0xffff8080, 0, 0 }, - { &badata[ 5], 4, 0xffff8100, 0, 0 }, - { &badata[ 6], 4, 0xffff8140, 0, 0 }, - { &badata[ 7], 4, 0xffff8180, 0, 0 }, - { &badata[ 8], 4, 0xffff81a0, 0, 0 }, - { &badata[ 9], 4, 0xffff8200, 0, 0 }, - { &badata[10], 4, 0xffff8220, 0, 0 }, - { &badata[11], 4, 0xffff8260, 0, 0 }, - { &badata[12], 4, 0xffff82a0, 0, 0 }, - { &badata[13], 4, 0xffff82c0, 0, 0 }, - { &badata[14], 4, 0xffff82e0, 0, 0 }, - { &badata[15], 4, 0xffff84e0, 0, 0 }, - { &badata[16], 4, 0xffff8520, 0, 0 }, - { &badata[17], 4, 0xffff8580, 0, 0 }, - { &badata[18], 4, 0xffff8600, 0, 0 }, - { &badata[19], 4, 0xffff8780, 0, 0 }, - { &badata[20], 4, 0xffff87a0, 0, 0 }, - { &badata[21], 4, 0xffff8f80, 0, 0 }, - { &badata[22], 4, 0xffff8fc0, 0, 0 }, - { &badata[23], 4, 0xffff9000, 0, 0 }, -}; -commpage_descriptor *ba_descriptors[] = -{ - &badata_descriptor_ary[ 0], - &badata_descriptor_ary[ 1], - &badata_descriptor_ary[ 2], - &badata_descriptor_ary[ 3], - &badata_descriptor_ary[ 4], - &badata_descriptor_ary[ 5], - &badata_descriptor_ary[ 6], - &badata_descriptor_ary[ 7], - &badata_descriptor_ary[ 8], - &badata_descriptor_ary[ 9], - &badata_descriptor_ary[10], - &badata_descriptor_ary[11], - &badata_descriptor_ary[12], - &badata_descriptor_ary[13], - &badata_descriptor_ary[14], - &badata_descriptor_ary[15], - &badata_descriptor_ary[16], - 
&badata_descriptor_ary[17], - &badata_descriptor_ary[18], - &badata_descriptor_ary[19], - &badata_descriptor_ary[20], - &badata_descriptor_ary[21], - &badata_descriptor_ary[22], - &badata_descriptor_ary[23], - 0 -}; diff --git a/osfmk/i386/commpage/cpu_number.s b/osfmk/i386/commpage/cpu_number.s deleted file mode 100644 index d86b13ba1..000000000 --- a/osfmk/i386/commpage/cpu_number.s +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2008 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include -#include - -#include - -/* - * These commpage routines provide fast access to the logical cpu number - * of the calling processor assuming no pre-emption occurs. 
This number - * is encoded in the bottom 12-bits of the limit field of the IDTR (the - * Interrupt Descriptor Table Register). The SIDT instruction is used in - * userspace to read this register and thus to gain access to the cpu number. - * The IDTR is loaded by the kernel for each processor at startup - see - * osfmk/i386/mp_desc.c. - */ - -/* return logical cpu number in %eax */ - -COMMPAGE_FUNCTION_START(cpu_number, 32, 4) - push %ebp - mov %esp,%ebp - sub $8, %esp // space to read IDTR - - sidt (%esp) // store limit:base on stack - movw (%esp), %ax // get limit - and $0xfff, %eax // mask off lower 12 bits to return - - mov %ebp,%esp - pop %ebp - ret -COMMPAGE_DESCRIPTOR(cpu_number,_COMM_PAGE_CPU_NUMBER,0,0) - - -/* The 64-bit version. - */ -COMMPAGE_FUNCTION_START(cpu_number_64, 64, 4) - push %rbp - mov %rsp,%rbp - sub $16,%rsp // space to read IDTR - - sidt (%rsp) // store limit:base on stack - movw (%rsp), %rax // get limit - and $0xfff, %rax // mask off lower 12 bits to return - - mov %rbp,%rsp - pop %rbp - ret -COMMPAGE_DESCRIPTOR(cpu_number_64,_COMM_PAGE_CPU_NUMBER,0,0) diff --git a/osfmk/i386/commpage/fifo_queues.s b/osfmk/i386/commpage/fifo_queues.s index e390a3b17..fa2bbf82c 100644 --- a/osfmk/i386/commpage/fifo_queues.s +++ b/osfmk/i386/commpage/fifo_queues.s @@ -66,48 +66,6 @@ * void OSAtomicFifoEnqueue( OSFifoQueueHead *list, void *new, size_t offset); */ -COMMPAGE_FUNCTION_START(AtomicFifoEnqueue, 32, 4) - pushl %edi - pushl %esi - pushl %ebx - xorl %ebx,%ebx // clear "preemption pending" flag - movl 16(%esp),%edi // %edi == ptr to list head - movl 20(%esp),%esi // %esi == new - movl 24(%esp),%edx // %edx == offset - COMMPAGE_CALL(_COMM_PAGE_PFZ_ENQUEUE,_COMM_PAGE_FIFO_ENQUEUE,AtomicFifoEnqueue) - testl %ebx,%ebx // pending preemption? 
- jz 1f - COMMPAGE_CALL(_COMM_PAGE_PREEMPT,_COMM_PAGE_FIFO_ENQUEUE,AtomicFifoEnqueue) -1: - popl %ebx - popl %esi - popl %edi - ret -COMMPAGE_DESCRIPTOR(AtomicFifoEnqueue,_COMM_PAGE_FIFO_ENQUEUE,0,0) - - -/* void* OSAtomicFifoDequeue( OSFifoQueueHead *list, size_t offset); */ - -COMMPAGE_FUNCTION_START(AtomicFifoDequeue, 32, 4) - pushl %edi - pushl %esi - pushl %ebx - xorl %ebx,%ebx // clear "preemption pending" flag - movl 16(%esp),%edi // %edi == ptr to list head - movl 20(%esp),%edx // %edx == offset - COMMPAGE_CALL(_COMM_PAGE_PFZ_DEQUEUE,_COMM_PAGE_FIFO_DEQUEUE,AtomicFifoDequeue) - testl %ebx,%ebx // pending preemption? - jz 1f - pushl %eax // save return value across sysenter - COMMPAGE_CALL(_COMM_PAGE_PREEMPT,_COMM_PAGE_FIFO_DEQUEUE,AtomicFifoDequeue) - popl %eax -1: - popl %ebx - popl %esi - popl %edi - ret // ptr to 1st element in Q still in %eax -COMMPAGE_DESCRIPTOR(AtomicFifoDequeue,_COMM_PAGE_FIFO_DEQUEUE,0,0) - /* Subroutine to make a preempt syscall. Called when we notice %ebx is * nonzero after returning from a PFZ subroutine. @@ -255,38 +213,6 @@ COMMPAGE_DESCRIPTOR(pfz_dequeue,_COMM_PAGE_PFZ_DEQUEUE,0,0) * void OSAtomicFifoEnqueue( OSFifoQueueHead *list, void *new, size_t offset); */ -// %rdi == list head, %rsi == new, %rdx == offset - -COMMPAGE_FUNCTION_START(AtomicFifoEnqueue_64, 64, 4) - pushq %rbx - xorl %ebx,%ebx // clear "preemption pending" flag - COMMPAGE_CALL(_COMM_PAGE_PFZ_ENQUEUE,_COMM_PAGE_FIFO_ENQUEUE,AtomicFifoEnqueue_64) - testl %ebx,%ebx // pending preemption? 
- jz 1f - COMMPAGE_CALL(_COMM_PAGE_PREEMPT,_COMM_PAGE_FIFO_ENQUEUE,AtomicFifoEnqueue_64) -1: - popq %rbx - ret -COMMPAGE_DESCRIPTOR(AtomicFifoEnqueue_64,_COMM_PAGE_FIFO_ENQUEUE,0,0) - - -/* void* OSAtomicDequeue( OSQueueHead *list, size_t offset); */ - -// %rdi == list head, %rsi == offset - -COMMPAGE_FUNCTION_START(AtomicFifoDequeue_64, 64, 4) - pushq %rbx - xorl %ebx,%ebx // clear "preemption pending" flag - movq %rsi,%rdx // move offset to %rdx to be like the Enqueue case - COMMPAGE_CALL(_COMM_PAGE_PFZ_DEQUEUE,_COMM_PAGE_FIFO_DEQUEUE,AtomicFifoDequeue_64) - testl %ebx,%ebx // pending preemption? - jz 1f - COMMPAGE_CALL(_COMM_PAGE_PREEMPT,_COMM_PAGE_FIFO_DEQUEUE,AtomicFifoDequeue_64) -1: - popq %rbx - ret // ptr to 1st element in Q in %rax -COMMPAGE_DESCRIPTOR(AtomicFifoDequeue_64,_COMM_PAGE_FIFO_DEQUEUE,0,0) - /* Subroutine to make a preempt syscall. Called when we notice %ebx is * nonzero after returning from a PFZ subroutine. Not in PFZ. diff --git a/osfmk/i386/commpage/longcopy_sse3x.s b/osfmk/i386/commpage/longcopy_sse3x.s deleted file mode 100644 index 3a1de25ed..000000000 --- a/osfmk/i386/commpage/longcopy_sse3x.s +++ /dev/null @@ -1,221 +0,0 @@ -/* - * Copyright (c) 2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. 
- * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include - - -/* - * The bcopy/memcpy loops for very long operands, tuned for Pentium-M - * class processors with Supplemental SSE3 and 64-byte cache lines. - * - * The following #defines are tightly coupled to the u-architecture: - */ - -#define kBigChunk (256*1024) // outer loop chunk size for kVeryLong sized operands - - -// Very long forward moves. These are at least several pages, so we loop over big -// chunks of memory (kBigChunk in size.) We first prefetch the chunk, and then copy -// it using non-temporal stores. Hopefully all the reads occur in the prefetch loop, -// so the copy loop reads from L2 and writes directly to memory (with write combining.) -// This minimizes bus turnaround and maintains good DRAM page locality. -// Note that for this scheme to work, kVeryLong must be a large fraction of L2 cache -// size. Otherwise, it is counter-productive to bypass L2 on the stores. -// -// We are called from the commpage bcopy loops when they encounter very long -// operands, with the standard ABI. 
-// -// void longcopy(const void *dest, void *sou, size_t len) - -// void longcopy(const void *dest, void *sou, size_t len) - -COMMPAGE_FUNCTION_START(longcopy_sse3x, 32, 5) - pushl %ebp // set up a frame for backtraces - movl %esp,%ebp - pushl %esi - pushl %edi - pushl %ebx // we'll need to use this too - movl 8(%ebp),%edi // get dest ptr - movl 12(%ebp),%esi // get source ptr - movl 16(%ebp),%ecx // get length - movl %edi,%ebx // copy dest ptr - negl %ebx - andl $63,%ebx // get #bytes to cache line align destination - jz LBigChunkLoop // already aligned - -// Cache line align destination, so temporal stores in copy loops work right. - - pushl %ebx // arg3 - #bytes to align destination (1..63) - pushl %esi // arg2 - source - pushl %edi // arg1 - dest - movl $(_COMM_PAGE_MEMCPY),%eax - call *%eax // align the destination - addl $12,%esp - movl 8(%ebp),%edi // recover dest ptr - movl 12(%ebp),%esi // recover source ptr - movl 16(%ebp),%ecx // recover length - addl %ebx,%esi // adjust ptrs and lengths past copy - addl %ebx,%edi - subl %ebx,%ecx - -// Loop over big chunks. -// ecx = length remaining (>= 4096) -// edi = dest (64-byte aligned) -// esi = source (may be unaligned) - -LBigChunkLoop: - movl $(kBigChunk),%edx // assume we can do a full chunk - cmpl %edx,%ecx // do we have a full chunk left to do? - cmovbl %ecx,%edx // if not, only move what we have left - andl $-4096,%edx // we work in page multiples - xor %eax,%eax // initialize chunk offset - jmp LTouchLoop - -// Touch in the next chunk. We try to keep the prefetch unit in "kick-start" mode, -// by touching two adjacent cache lines every 8 lines of each page, in four slices. -// Because the source may be unaligned, we use byte loads to touch. 
-// ecx = length remaining (including this chunk) -// edi = ptr to start of dest chunk -// esi = ptr to start of source chunk -// edx = chunk length (multiples of pages) -// ebx = scratch reg used to read a byte of each cache line -// eax = chunk offset - - .align 4,0x90 // 16-byte align inner loops -LTouchLoop: - movzb (%esi,%eax),%ebx // touch line 0, 2, 4, or 6 of page - movzb 1*64(%esi,%eax),%ebx // touch line 1, 3, 5, or 7 - movzb 8*64(%esi,%eax),%ebx // touch line 8, 10, 12, or 14 - movzb 9*64(%esi,%eax),%ebx // etc - - movzb 16*64(%esi,%eax),%ebx - movzb 17*64(%esi,%eax),%ebx - movzb 24*64(%esi,%eax),%ebx - movzb 25*64(%esi,%eax),%ebx - - movzb 32*64(%esi,%eax),%ebx - movzb 33*64(%esi,%eax),%ebx - movzb 40*64(%esi,%eax),%ebx - movzb 41*64(%esi,%eax),%ebx - - movzb 48*64(%esi,%eax),%ebx - movzb 49*64(%esi,%eax),%ebx - movzb 56*64(%esi,%eax),%ebx - movzb 57*64(%esi,%eax),%ebx - - subl $-128,%eax // next slice of page (adding 128 w 8-bit immediate) - testl $512,%eax // done with this page? - jz LTouchLoop // no, next of four slices - addl $(4096-512),%eax // move on to next page - cmpl %eax,%edx // done with this chunk? - jnz LTouchLoop // no, do next page - -// The chunk has been pre-fetched, now copy it using non-temporal stores. -// There are two copy loops, depending on whether the source is 16-byte aligned -// or not. - - addl %edx,%esi // increment ptrs by chunk length - addl %edx,%edi - subl %edx,%ecx // adjust remaining length - negl %edx // prepare loop index (counts up to 0) - testl $15,%esi // is source 16-byte aligned? 
- jnz LVeryLongUnaligned // source is not aligned - jmp LVeryLongAligned - - .align 4,0x90 // 16-byte align inner loops -LVeryLongAligned: // aligned loop over 128-bytes - movdqa (%esi,%edx),%xmm0 - movdqa 16(%esi,%edx),%xmm1 - movdqa 32(%esi,%edx),%xmm2 - movdqa 48(%esi,%edx),%xmm3 - movdqa 64(%esi,%edx),%xmm4 - movdqa 80(%esi,%edx),%xmm5 - movdqa 96(%esi,%edx),%xmm6 - movdqa 112(%esi,%edx),%xmm7 - - movntdq %xmm0,(%edi,%edx) - movntdq %xmm1,16(%edi,%edx) - movntdq %xmm2,32(%edi,%edx) - movntdq %xmm3,48(%edi,%edx) - movntdq %xmm4,64(%edi,%edx) - movntdq %xmm5,80(%edi,%edx) - movntdq %xmm6,96(%edi,%edx) - movntdq %xmm7,112(%edi,%edx) - - subl $-128,%edx // add 128 with an 8-bit immediate - jnz LVeryLongAligned - jmp LVeryLongChunkEnd - - .align 4,0x90 // 16-byte align inner loops -LVeryLongUnaligned: // unaligned loop over 128-bytes - movdqu (%esi,%edx),%xmm0 - movdqu 16(%esi,%edx),%xmm1 - movdqu 32(%esi,%edx),%xmm2 - movdqu 48(%esi,%edx),%xmm3 - movdqu 64(%esi,%edx),%xmm4 - movdqu 80(%esi,%edx),%xmm5 - movdqu 96(%esi,%edx),%xmm6 - movdqu 112(%esi,%edx),%xmm7 - - movntdq %xmm0,(%edi,%edx) - movntdq %xmm1,16(%edi,%edx) - movntdq %xmm2,32(%edi,%edx) - movntdq %xmm3,48(%edi,%edx) - movntdq %xmm4,64(%edi,%edx) - movntdq %xmm5,80(%edi,%edx) - movntdq %xmm6,96(%edi,%edx) - movntdq %xmm7,112(%edi,%edx) - - subl $-128,%edx // add 128 with an 8-bit immediate - jnz LVeryLongUnaligned - -LVeryLongChunkEnd: - cmpl $4096,%ecx // at least another page to go? - jae LBigChunkLoop // yes - -// Done. Call memcpy() again to handle the 0-4095 bytes at the end. - - sfence // required by non-temporal stores - testl %ecx,%ecx // anything left to copy? 
- jz 1f - pushl %ecx // arg3 - #bytes to align destination (1..63) - pushl %esi // arg2 - source - pushl %edi // arg1 - dest - movl $(_COMM_PAGE_MEMCPY),%eax - call *%eax // align the destination - addl $12,%esp // pop off arguments -1: - popl %ebx - popl %edi - popl %esi - popl %ebp - ret - -/* always match for now, as commpage_stuff_routine() will panic if no match */ -COMMPAGE_DESCRIPTOR(longcopy_sse3x, _COMM_PAGE_LONGCOPY, 0 ,0) diff --git a/osfmk/i386/commpage/longcopy_sse3x_64.s b/osfmk/i386/commpage/longcopy_sse3x_64.s deleted file mode 100644 index 439c4447f..000000000 --- a/osfmk/i386/commpage/longcopy_sse3x_64.s +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Copyright (c) 2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include - - -/* - * The bcopy/memcpy loops for very long operands, tuned for 64-bit - * Pentium-M class processors with Supplemental SSE3 and 64-byte cache lines. - * This is the 64-bit version. - * - * The following #defines are tightly coupled to the u-architecture: - */ - -#define kBigChunk (256*1024) // outer loop chunk size for kVeryLong sized operands - - -// Very long forward moves. These are at least several pages, so we loop over big -// chunks of memory (kBigChunk in size.) We first prefetch the chunk, and then copy -// it using non-temporal stores. Hopefully all the reads occur in the prefetch loop, -// so the copy loop reads from L2 and writes directly to memory (with write combining.) -// This minimizes bus turnaround and maintains good DRAM page locality. -// Note that for this scheme to work, kVeryLong must be a large fraction of L2 cache -// size. Otherwise, it is counter-productive to bypass L2 on the stores. -// -// We are called from the commpage bcopy loops when they encounter very long -// operands, with the standard ABI: -// rdi = dest ptr -// rsi = source ptr -// rdx = length (>= 8kb, probably much bigger) - -// void longcopy(const void *dest, void *sou, size_t len) - -COMMPAGE_FUNCTION_START(longcopy_sse3x_64, 64, 5) - pushq %rbp // set up a frame for backtraces - movq %rsp,%rbp - movl %edi,%eax // copy dest ptr - negl %eax - andl $63,%eax // get #bytes to cache line align destination - jz LBigChunkLoop // already aligned - -// Cache line align destination, so temporal stores in copy loops work right. -// The recursive call returns with the source and dest ptrs properly updated. - - subq %rax,%rdx // get length remaining after dest is aligned - pushq %rdx // save length remaining - movl %eax,%edx // #bytes to copy to align destination - movq $_COMM_PAGE_32_TO_64(_COMM_PAGE_MEMCPY),%rax - call *%rax - popq %rdx // recover adjusted length - -// Loop over big chunks. 
-// rdx = length remaining (>= 4096) -// rdi = dest (64-byte aligned) -// rsi = source (may be unaligned) - -LBigChunkLoop: - movl $(kBigChunk),%r8d // assume we can do a full chunk - cmpq %r8,%rdx // do we have a full chunk left to do? - cmovbl %edx,%r8d // if not, only move what we have left - andl $-4096,%r8d // we work in page multiples - xorl %eax,%eax // initialize chunk offset - jmp LTouchLoop - -// Touch in the next chunk. We try to keep the prefetch unit in "kick-start" mode, -// by touching two adjacent cache lines every 8 lines of each page, in four slices. -// Because the source may be unaligned, we use byte loads to touch. -// rdx = length remaining (including this chunk) -// rdi = ptr to start of dest chunk -// rsi = ptr to start of source chunk -// r8d = chunk length (multiples of pages, less than 2**32) -// ecx = scratch reg used to read a byte of each cache line -// eax = chunk offset - - .align 4,0x90 // 16-byte align inner loops -LTouchLoop: - movzb (%rsi,%rax),%ecx // touch line 0, 2, 4, or 6 of page - movzb 1*64(%rsi,%rax),%ecx // touch line 1, 3, 5, or 7 - movzb 8*64(%rsi,%rax),%ecx // touch line 8, 10, 12, or 14 - movzb 9*64(%rsi,%rax),%ecx // etc - - movzb 16*64(%rsi,%rax),%ecx - movzb 17*64(%rsi,%rax),%ecx - movzb 24*64(%rsi,%rax),%ecx - movzb 25*64(%rsi,%rax),%ecx - - movzb 32*64(%rsi,%rax),%ecx - movzb 33*64(%rsi,%rax),%ecx - movzb 40*64(%rsi,%rax),%ecx - movzb 41*64(%rsi,%rax),%ecx - - movzb 48*64(%rsi,%rax),%ecx - movzb 49*64(%rsi,%rax),%ecx - movzb 56*64(%rsi,%rax),%ecx - movzb 57*64(%rsi,%rax),%ecx - - subl $-128,%eax // next slice of page (adding 128 w 8-bit immediate) - testl $512,%eax // done with this page? - jz LTouchLoop // no, next of four slices - addl $(4096-512),%eax // move on to next page - cmpl %eax,%r8d // done with this chunk? - jnz LTouchLoop // no, do next page - -// The chunk has been pre-fetched, now copy it using non-temporal stores. 
-// There are two copy loops, depending on whether the source is 16-byte aligned -// or not. - - movl %r8d,%ecx // copy chunk size to a reg that doesn't use REX prefix - addq %rcx,%rsi // increment ptrs by chunk length - addq %rcx,%rdi - subq %rcx,%rdx // adjust remaining length - negq %rcx // prepare loop index (counts up to 0) - testl $15,%esi // is source 16-byte aligned? - jnz LVeryLongUnaligned // no - jmp LVeryLongAligned - - .align 4,0x90 // 16-byte align inner loops -LVeryLongAligned: // aligned loop over 128-bytes - movdqa (%rsi,%rcx),%xmm0 - movdqa 16(%rsi,%rcx),%xmm1 - movdqa 32(%rsi,%rcx),%xmm2 - movdqa 48(%rsi,%rcx),%xmm3 - movdqa 64(%rsi,%rcx),%xmm4 - movdqa 80(%rsi,%rcx),%xmm5 - movdqa 96(%rsi,%rcx),%xmm6 - movdqa 112(%rsi,%rcx),%xmm7 - - movntdq %xmm0,(%rdi,%rcx) - movntdq %xmm1,16(%rdi,%rcx) - movntdq %xmm2,32(%rdi,%rcx) - movntdq %xmm3,48(%rdi,%rcx) - movntdq %xmm4,64(%rdi,%rcx) - movntdq %xmm5,80(%rdi,%rcx) - movntdq %xmm6,96(%rdi,%rcx) - movntdq %xmm7,112(%rdi,%rcx) - - subq $-128,%rcx // add 128 with an 8-bit immediate - jnz LVeryLongAligned - jmp LVeryLongChunkEnd - - .align 4,0x90 // 16-byte align inner loops -LVeryLongUnaligned: // unaligned loop over 128-bytes - movdqu (%rsi,%rcx),%xmm0 - movdqu 16(%rsi,%rcx),%xmm1 - movdqu 32(%rsi,%rcx),%xmm2 - movdqu 48(%rsi,%rcx),%xmm3 - movdqu 64(%rsi,%rcx),%xmm4 - movdqu 80(%rsi,%rcx),%xmm5 - movdqu 96(%rsi,%rcx),%xmm6 - movdqu 112(%rsi,%rcx),%xmm7 - - movntdq %xmm0,(%rdi,%rcx) - movntdq %xmm1,16(%rdi,%rcx) - movntdq %xmm2,32(%rdi,%rcx) - movntdq %xmm3,48(%rdi,%rcx) - movntdq %xmm4,64(%rdi,%rcx) - movntdq %xmm5,80(%rdi,%rcx) - movntdq %xmm6,96(%rdi,%rcx) - movntdq %xmm7,112(%rdi,%rcx) - - subq $-128,%rcx // add 128 with an 8-bit immediate - jnz LVeryLongUnaligned - -LVeryLongChunkEnd: - cmpq $4096,%rdx // at least another page to go? - jae LBigChunkLoop // yes - -// Done. Call memcpy() again to handle the 0-4095 bytes at the end. 
-// We still have the args in the right registers: -// rdi = destination ptr -// rsi = source ptr -// rdx = length remaining (0..4095) - - sfence // required by non-temporal stores - testl %edx,%edx // anything left to copy? - jz 1f - movq $_COMM_PAGE_32_TO_64(_COMM_PAGE_MEMCPY),%rax - call *%rax -1: - popq %rbp // restore frame ptr - ret - -/* always match for now, as commpage_stuff_routine() will panic if no match */ -COMMPAGE_DESCRIPTOR(longcopy_sse3x_64, _COMM_PAGE_LONGCOPY, 0 ,0) diff --git a/osfmk/i386/commpage/memset_pattern_sse2.s b/osfmk/i386/commpage/memset_pattern_sse2.s deleted file mode 100644 index 3025ef62b..000000000 --- a/osfmk/i386/commpage/memset_pattern_sse2.s +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Copyright (c) 2005-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include - -/* The common path for nonzero memset and the memset_pattern routines, - * tuned for Pentium-M class processors with SSE2 and 64-byte cache lines. - * This is used by the following functions: - * - * void *memset(void *b, int c, size_t len); // when c!=0 - * void memset_pattern4(void *b, const void *c4, size_t len); - * void memset_pattern8(void *b, const void *c8, size_t len); - * void memset_pattern16(void *b, const void *c16, size_t len); - * - * Note bzero() and memset() of 0 are handled separately. - */ - -#define kShort 63 -#define kVeryLong (1024*1024) - -// Initial entry from Libc with parameters passed in registers. Although we -// correctly handle misaligned ptrs and short operands, they are inefficient. -// Therefore our caller should filter out short operands and exploit local -// knowledge (ie, original pattern length) to align the ptr if possible. -// When called, we expect: -// %edi = ptr to memory to set (not necessarily aligned) -// %edx = length (may be short or even 0) -// %xmm0 = the pattern to store -// Return conditions: -// %eax, %edi, %esi, %ecx, and %edx all trashed - -COMMPAGE_FUNCTION_START(memset_pattern_sse2, 32, 5) - cmpl $(kShort),%edx // long enough to bother aligning? - ja LNotShort // yes - jmp LShort // no - -// Here for short operands or the end of long ones. -// %edx = length -// %edi = ptr (may not be not aligned) -// %xmm0 = pattern - -LUnalignedStore16: - movdqu %xmm0,(%edi) // stuff in another 16 bytes - subl $16,%edx - addl $16,%edi -LShort: - cmpl $16,%edx // room for another vector? - jge LUnalignedStore16 // yes -LLessThan16: // here at end of copy with < 16 bytes remaining - test $8,%dl // 8-byte store required? 
- jz 2f // no - movq %xmm0,(%edi) // pack in 8 low bytes - psrldq $8,%xmm0 // then shift vector down 8 bytes - addl $8,%edi -2: - test $4,%dl // 4-byte store required? - jz 3f // no - movd %xmm0,(%edi) // pack in 4 low bytes - psrldq $4,%xmm0 // then shift vector down 4 bytes - addl $4,%edi -3: - andl $3,%edx // more to go? - jz 5f // no - movd %xmm0,%eax // move remainders out into %eax -4: // loop on up to three bytes - movb %al,(%edi) // pack in next byte - shrl $8,%eax // shift next byte into position - inc %edi - dec %edx - jnz 4b -5: ret - -// Long enough to justify aligning ptr. Note that we have to rotate the -// pattern to account for any alignment. We do this by doing two unaligned -// stores, and then an aligned load from the middle of the two stores. -// This will stall on store forwarding alignment mismatch, and the unaligned -// stores can be pretty slow too, but the alternatives aren't any better. -// Fortunately, in most cases our caller has already aligned the ptr. -// %edx = length (> kShort) -// %edi = ptr (may not be aligned) -// %xmm0 = pattern - -LNotShort: - movl %edi,%ecx // copy dest ptr - negl %ecx - andl $15,%ecx // mask down to #bytes to 16-byte align - jz LAligned // skip if already aligned - movdqu %xmm0,(%edi) // store 16 unaligned bytes - movdqu %xmm0,16(%edi) // and 16 more, to be sure we have an aligned chunk - addl %ecx,%edi // now point to the aligned chunk - subl %ecx,%edx // adjust remaining count - movdqa (%edi),%xmm0 // get the rotated pattern (probably stalling) - addl $16,%edi // skip past the aligned chunk - subl $16,%edx - -// Set up for 64-byte loops. 
-// %edx = length remaining -// %edi = ptr (aligned) -// %xmm0 = rotated pattern - -LAligned: - movl %edx,%ecx // copy length remaining - andl $63,%edx // mask down to residual length (0..63) - andl $-64,%ecx // %ecx <- #bytes we will zero in by-64 loop - jz LNoMoreChunks // no 64-byte chunks - addl %ecx,%edi // increment ptr by length to move - cmpl $(kVeryLong),%ecx // long enough to justify non-temporal stores? - jge LVeryLong // yes - negl %ecx // negate length to move - jmp 1f - -// Loop over 64-byte chunks, storing into cache. - - .align 4,0x90 // keep inner loops 16-byte aligned -1: - movdqa %xmm0,(%edi,%ecx) - movdqa %xmm0,16(%edi,%ecx) - movdqa %xmm0,32(%edi,%ecx) - movdqa %xmm0,48(%edi,%ecx) - addl $64,%ecx - jne 1b - - jmp LNoMoreChunks - -// Very long operands: use non-temporal stores to bypass cache. - -LVeryLong: - negl %ecx // negate length to move - jmp 1f - - .align 4,0x90 // keep inner loops 16-byte aligned -1: - movntdq %xmm0,(%edi,%ecx) - movntdq %xmm0,16(%edi,%ecx) - movntdq %xmm0,32(%edi,%ecx) - movntdq %xmm0,48(%edi,%ecx) - addl $64,%ecx - jne 1b - - sfence // required by non-temporal stores - jmp LNoMoreChunks - -// Handle leftovers: loop by 16. -// %edx = length remaining (<64) -// %edi = ptr (aligned) -// %xmm0 = rotated pattern - -LLoopBy16: - movdqa %xmm0,(%edi) // pack in 16 more bytes - subl $16,%edx // decrement count - addl $16,%edi // increment ptr -LNoMoreChunks: - cmpl $16,%edx // more to go? - jge LLoopBy16 // yes - jmp LLessThan16 // handle up to 15 remaining bytes - -COMMPAGE_DESCRIPTOR(memset_pattern_sse2,_COMM_PAGE_MEMSET_PATTERN,kHasSSE2,0) diff --git a/osfmk/i386/commpage/memset_pattern_sse2_64.s b/osfmk/i386/commpage/memset_pattern_sse2_64.s deleted file mode 100644 index e2d1bb007..000000000 --- a/osfmk/i386/commpage/memset_pattern_sse2_64.s +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright (c) 2006 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include - -/* The common path for nonzero memset and the memset_pattern routines, - * tuned for Pentium-M class processors with SSE2 and 64-byte cache lines. - * This is the 64-bit bersion. It is used by the following functions: - * - * void *memset(void *b, int c, size_t len); // when c!=0 - * void memset_pattern4(void *b, const void *c4, size_t len); - * void memset_pattern8(void *b, const void *c8, size_t len); - * void memset_pattern16(void *b, const void *c16, size_t len); - * - * Note bzero() and memset() of 0 are handled separately. - */ - -#define kShort 63 -#define kVeryLong (1024*1024) - -// Initial entry from Libc with parameters passed in registers. 
Although we -// correctly handle misaligned ptrs and short operands, they are inefficient. -// Therefore our caller should filter out short operands and exploit local -// knowledge (ie, original pattern length) to align the ptr if possible. -// When called, we expect: -// %rdi = ptr to memory to set (not necessarily aligned) -// %rdx = length (may be short or even 0) -// %xmm0 = the pattern to store -// Return conditions: -// %rax, %rdi, %rsi, %rcx, and %rdx all trashed -// we preserve %r8, %r9, %r10, and %r11 - -COMMPAGE_FUNCTION_START(memset_pattern_sse2_64, 64, 5) - cmpq $(kShort),%rdx // long enough to bother aligning? - ja LNotShort // yes - jmp LShort // no - -// Here for short operands or the end of long ones. -// %rdx = length (<= kShort) -// %rdi = ptr (may not be not aligned) -// %xmm0 = pattern - -LUnalignedStore16: - movdqu %xmm0,(%rdi) // stuff in another 16 bytes - subl $16,%edx - addq $16,%rdi -LShort: - cmpl $16,%edx // room for another vector? - jge LUnalignedStore16 // yes -LLessThan16: // here at end of copy with < 16 bytes remaining - test $8,%dl // 8-byte store required? - jz 2f // no - movq %xmm0,(%rdi) // pack in 8 low bytes - psrldq $8,%xmm0 // then shift vector down 8 bytes - addq $8,%rdi -2: - test $4,%dl // 4-byte store required? - jz 3f // no - movd %xmm0,(%rdi) // pack in 4 low bytes - psrldq $4,%xmm0 // then shift vector down 4 bytes - addq $4,%rdi -3: - andl $3,%edx // more to go? - jz 5f // no - movd %xmm0,%eax // move remainders out into %eax -4: // loop on up to three bytes - movb %al,(%rdi) // pack in next byte - shrl $8,%eax // shift next byte into position - incq %rdi - dec %edx - jnz 4b -5: ret - -// Long enough to justify aligning ptr. Note that we have to rotate the -// pattern to account for any alignment. We do this by doing two unaligned -// stores, and then an aligned load from the middle of the two stores. 
-// This will stall on store forwarding alignment mismatch, and the unaligned -// stores can be pretty slow too, but the alternatives aren't any better. -// Fortunately, in most cases our caller has already aligned the ptr. -// %rdx = length (> kShort) -// %rdi = ptr (may not be aligned) -// %xmm0 = pattern - -LNotShort: - movl %edi,%ecx // copy low bits of dest ptr - negl %ecx - andl $15,%ecx // mask down to #bytes to 16-byte align - jz LAligned // skip if already aligned - movdqu %xmm0,(%rdi) // store 16 unaligned bytes - movdqu %xmm0,16(%rdi) // and 16 more, to be sure we have an aligned chunk - addq %rcx,%rdi // now point to the aligned chunk - subq %rcx,%rdx // adjust remaining count - movdqa (%rdi),%xmm0 // get the rotated pattern (probably stalling) - addq $16,%rdi // skip past the aligned chunk - subq $16,%rdx - -// Set up for 64-byte loops. -// %rdx = length remaining -// %rdi = ptr (aligned) -// %xmm0 = rotated pattern - -LAligned: - movq %rdx,%rcx // copy length remaining - andl $63,%edx // mask down to residual length (0..63) - andq $-64,%rcx // %ecx <- #bytes we will zero in by-64 loop - jz LNoMoreChunks // no 64-byte chunks - addq %rcx,%rdi // increment ptr by length to move - cmpq $(kVeryLong),%rcx // long enough to justify non-temporal stores? - jge LVeryLong // yes - negq %rcx // negate length to move - jmp 1f - -// Loop over 64-byte chunks, storing into cache. - - .align 4,0x90 // keep inner loops 16-byte aligned -1: - movdqa %xmm0,(%rdi,%rcx) - movdqa %xmm0,16(%rdi,%rcx) - movdqa %xmm0,32(%rdi,%rcx) - movdqa %xmm0,48(%rdi,%rcx) - addq $64,%rcx - jne 1b - - jmp LNoMoreChunks - -// Very long operands: use non-temporal stores to bypass cache. 
- -LVeryLong: - negq %rcx // negate length to move - jmp 1f - - .align 4,0x90 // keep inner loops 16-byte aligned -1: - movntdq %xmm0,(%rdi,%rcx) - movntdq %xmm0,16(%rdi,%rcx) - movntdq %xmm0,32(%rdi,%rcx) - movntdq %xmm0,48(%rdi,%rcx) - addq $64,%rcx - jne 1b - - sfence // required by non-temporal stores - jmp LNoMoreChunks - -// Handle leftovers: loop by 16. -// %edx = length remaining (<64) -// %edi = ptr (aligned) -// %xmm0 = rotated pattern - -LLoopBy16: - movdqa %xmm0,(%rdi) // pack in 16 more bytes - subl $16,%edx // decrement count - addq $16,%rdi // increment ptr -LNoMoreChunks: - cmpl $16,%edx // more to go? - jge LLoopBy16 // yes - jmp LLessThan16 // handle up to 15 remaining bytes - -COMMPAGE_DESCRIPTOR(memset_pattern_sse2_64,_COMM_PAGE_MEMSET_PATTERN,kHasSSE2,0) diff --git a/osfmk/i386/commpage/pthreads.s b/osfmk/i386/commpage/pthreads.s index 217662445..1794228ff 100644 --- a/osfmk/i386/commpage/pthreads.s +++ b/osfmk/i386/commpage/pthreads.s @@ -31,37 +31,6 @@ #include #include -#define _PTHREAD_TSD_OFFSET32 0x48 -#define _PTHREAD_TSD_OFFSET64 0x60 - - -/* These routines do not need to be on the copmmpage on Intel. They are for now - * to avoid revlock, but the code should move to Libc, and we should eventually remove - * these. 
- */ -COMMPAGE_FUNCTION_START(pthread_getspecific, 32, 4) - movl 4(%esp), %eax - movl %gs:_PTHREAD_TSD_OFFSET32(,%eax,4), %eax - ret -COMMPAGE_DESCRIPTOR(pthread_getspecific,_COMM_PAGE_PTHREAD_GETSPECIFIC,0,0) - -COMMPAGE_FUNCTION_START(pthread_self, 32, 4) - movl %gs:_PTHREAD_TSD_OFFSET32, %eax - ret -COMMPAGE_DESCRIPTOR(pthread_self,_COMM_PAGE_PTHREAD_SELF,0,0) - -/* the 64-bit versions: */ -COMMPAGE_FUNCTION_START(pthread_getspecific_64, 64, 4) - movq %gs:_PTHREAD_TSD_OFFSET64(,%rdi,8), %rax - ret -COMMPAGE_DESCRIPTOR(pthread_getspecific_64,_COMM_PAGE_PTHREAD_GETSPECIFIC,0,0) - -COMMPAGE_FUNCTION_START(pthread_self_64, 64, 4) - movq %gs:_PTHREAD_TSD_OFFSET64, %rax - ret -COMMPAGE_DESCRIPTOR(pthread_self_64,_COMM_PAGE_PTHREAD_SELF,0,0) - - /* Temporary definitions. Replace by #including the correct file when available. */ #define PTHRW_EBIT 0x01 @@ -114,47 +83,6 @@ COMMPAGE_DESCRIPTOR(pthread_self_64,_COMM_PAGE_PTHREAD_SELF,0,0) */ -/* int // we return 0 on acquire, 1 on syscall - * pthread_mutex_lock( uint32_t *lvalp, // ptr to mutex LVAL/UVAL pair - * int flags, // flags to pass kernel if we do syscall - * uint64_t mtid, // my Thread ID - * uint32_t mask, // bits to test in LVAL (ie, EBIT etc) - * uint64_t *tidp, // ptr to TID field of mutex - * int *syscall_return ); // if syscall, return value stored here - */ -COMMPAGE_FUNCTION_START(pthread_mutex_lock, 32, 4) - pushl %ebp // set up frame for backtrace - movl %esp,%ebp - pushl %esi - pushl %edi - pushl %ebx - xorl %ebx,%ebx // clear "preemption pending" flag - movl 20(%esp),%edi // %edi == ptr to LVAL/UVAL structure - lea 20(%esp),%esi // %esi == ptr to argument list - movl _COMM_PAGE_SPIN_COUNT, %edx - movl 16(%esi),%ecx // get mask (ie, PTHRW_EBIT etc) -1: - testl PTHRW_LVAL(%edi),%ecx // is mutex available? 
- jz 2f // yes, it is available - pause - decl %edx // decrement max spin count - jnz 1b // keep spinning -2: - COMMPAGE_CALL(_COMM_PAGE_PFZ_MUTEX_LOCK,_COMM_PAGE_MUTEX_LOCK,pthread_mutex_lock) - testl %ebx,%ebx // pending preemption? - jz 3f - pushl %eax // save return value across sysenter - COMMPAGE_CALL(_COMM_PAGE_PREEMPT,_COMM_PAGE_MUTEX_LOCK,pthread_mutex_lock) - popl %eax -3: - popl %ebx - popl %edi - popl %esi - popl %ebp - ret -COMMPAGE_DESCRIPTOR(pthread_mutex_lock,_COMM_PAGE_MUTEX_LOCK,0,0) - - /* Internal routine to handle pthread mutex lock operation. This is in the PFZ. * %edi == ptr to LVAL/UVAL pair * %esi == ptr to argument list on stack @@ -233,45 +161,6 @@ COMMPAGE_DESCRIPTOR(pfz_mutex_lock,_COMM_PAGE_PFZ_MUTEX_LOCK,0,0) -/* int // we return 0 on acquire, 1 on syscall - * pthread_mutex_lock( uint32_t *lvalp, // ptr to mutex LVAL/UVAL pair - * int flags, // flags to pass kernel if we do syscall - * uint64_t mtid, // my Thread ID - * uint32_t mask, // bits to test in LVAL (ie, EBIT etc) - * uint64_t *tidp, // ptr to TID field of mutex - * int *syscall_return ); // if syscall, return value stored here - * - * %rdi = lvalp - * %esi = flags - * %rdx = mtid - * %ecx = mask - * %r8 = tidp - * %r9 = &syscall_return - */ -COMMPAGE_FUNCTION_START(pthread_mutex_lock_64, 64, 4) - pushq %rbp // set up frame for backtrace - movq %rsp,%rbp - pushq %rbx - xorl %ebx,%ebx // clear "preemption pending" flag - movl _COMM_PAGE_32_TO_64(_COMM_PAGE_SPIN_COUNT), %eax -1: - testl PTHRW_LVAL(%rdi),%ecx // is mutex available? - jz 2f // yes, it is available - pause - decl %eax // decrement max spin count - jnz 1b // keep spinning -2: - COMMPAGE_CALL(_COMM_PAGE_PFZ_MUTEX_LOCK,_COMM_PAGE_MUTEX_LOCK,pthread_mutex_lock_64) - testl %ebx,%ebx // pending preemption? 
- jz 1f // no - COMMPAGE_CALL(_COMM_PAGE_PREEMPT,_COMM_PAGE_MUTEX_LOCK,pthread_mutex_lock_64) -1: - popq %rbx - popq %rbp - ret -COMMPAGE_DESCRIPTOR(pthread_mutex_lock_64,_COMM_PAGE_MUTEX_LOCK,0,0) - - /* Internal routine to handle pthread mutex lock operation. This is in the PFZ. * %rdi = lvalp * %esi = flags diff --git a/osfmk/i386/commpage/spinlocks.s b/osfmk/i386/commpage/spinlocks.s deleted file mode 100644 index a0e98bcb3..000000000 --- a/osfmk/i386/commpage/spinlocks.s +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Copyright (c) 2003-2009 Apple, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include -#include - - -COMMPAGE_FUNCTION_START(spin_lock_try_up, 32, 4) - movl 4(%esp), %ecx - xorl %eax, %eax - orl $-1, %edx - cmpxchgl %edx, (%ecx) - setz %dl - movzbl %dl, %eax - ret -COMMPAGE_DESCRIPTOR(spin_lock_try_up,_COMM_PAGE_SPINLOCK_TRY,kUP,0) - - -COMMPAGE_FUNCTION_START(spin_lock_try_mp, 32, 4) - movl 4(%esp), %ecx - xorl %eax, %eax - orl $-1, %edx - lock - cmpxchgl %edx, (%ecx) - setz %dl - movzbl %dl, %eax - ret -COMMPAGE_DESCRIPTOR(spin_lock_try_mp,_COMM_PAGE_SPINLOCK_TRY,0,kUP) - - -COMMPAGE_FUNCTION_START(spin_lock_up, 32, 4) - movl 4(%esp), %ecx - xorl %eax, %eax - orl $-1, %edx - cmpxchgl %edx, (%ecx) - jnz 1f - ret -1: - /* failed to get lock so relinquish the processor immediately on UP */ - pushl $1 /* 1 ms */ - pushl $1 /* SWITCH_OPTION_DEPRESS */ - pushl $0 /* THREAD_NULL */ - pushl $0 /* push dummy stack ret addr */ - movl $-61,%eax /* SYSCALL_THREAD_SWITCH */ - int $(MACH_INT) - addl $16, %esp /* adjust stack*/ - jmp Lspin_lock_up -COMMPAGE_DESCRIPTOR(spin_lock_up,_COMM_PAGE_SPINLOCK_LOCK,kUP,0) - - -COMMPAGE_FUNCTION_START(spin_lock_mp, 32, 4) - movl 4(%esp), %ecx - xorl %eax, %eax -0: - orl $-1, %edx - lock - cmpxchgl %edx, (%ecx) - jnz 1f - ret -1: - xorl %eax, %eax - movl $(MP_SPIN_TRIES), %edx -2: - pause - cmpl %eax, (%ecx) - jz 0b /* favor success and slow down spin loop */ - decl %edx - jnz 2b - /* failed to get lock after spinning so relinquish */ - pushl $1 /* 1 ms */ - pushl $1 /* SWITCH_OPTION_DEPRESS */ - pushl $0 /* THREAD_NULL */ - pushl $0 /* push dummy stack ret addr */ - movl $-61,%eax /* SYSCALL_THREAD_SWITCH */ - int $(MACH_INT) - addl $16, %esp /* adjust stack*/ - jmp Lspin_lock_mp -COMMPAGE_DESCRIPTOR(spin_lock_mp,_COMM_PAGE_SPINLOCK_LOCK,0,kUP) - - -COMMPAGE_FUNCTION_START(spin_unlock, 32, 4) - movl 4(%esp), %ecx - movl $0, (%ecx) - ret -COMMPAGE_DESCRIPTOR(spin_unlock,_COMM_PAGE_SPINLOCK_UNLOCK,0,0) - - -/* ============================ 
64-bit versions follow ===================== */ - - -COMMPAGE_FUNCTION_START(spin_lock_try_up_64, 64, 4) - xorl %eax, %eax - orl $-1, %edx - cmpxchgl %edx, (%rdi) - setz %dl - movzbl %dl, %eax - ret -COMMPAGE_DESCRIPTOR(spin_lock_try_up_64,_COMM_PAGE_SPINLOCK_TRY,kUP,0) - - -COMMPAGE_FUNCTION_START(spin_lock_try_mp_64, 64, 4) - xorl %eax, %eax - orl $-1, %edx - lock - cmpxchgl %edx, (%rdi) - setz %dl - movzbl %dl, %eax - ret -COMMPAGE_DESCRIPTOR(spin_lock_try_mp_64,_COMM_PAGE_SPINLOCK_TRY,0,kUP) - - -COMMPAGE_FUNCTION_START(spin_lock_up_64, 64, 4) - movq %rdi,%r8 -0: - xorl %eax, %eax - orl $-1, %edx - cmpxchgl %edx, (%r8) - jnz 1f - ret -1: - /* failed to get lock so relinquish the processor immediately on UP */ - xorl %edi,%edi /* THREAD_NULL */ - movl $1,%esi /* SWITCH_OPTION_DEPRESS */ - movl $1,%edx /* 1 ms */ - movl $(SYSCALL_CONSTRUCT_MACH(61)),%eax /* 61 = thread_switch */ - syscall - jmp 0b -COMMPAGE_DESCRIPTOR(spin_lock_up_64,_COMM_PAGE_SPINLOCK_LOCK,kUP,0) - - -COMMPAGE_FUNCTION_START(spin_lock_mp_64, 64, 4) - movq %rdi,%r8 -0: - xorl %eax, %eax - orl $-1, %edx - lock - cmpxchgl %edx, (%r8) - jnz 1f - ret -1: - xorl %eax, %eax - movl $(MP_SPIN_TRIES), %edx -2: /* spin for awhile before relinquish */ - pause - cmpl %eax, (%r8) - jz 0b - decl %edx - jnz 2b - /* failed to get lock after spinning so relinquish */ - xorl %edi,%edi /* THREAD_NULL */ - movl $1,%esi /* SWITCH_OPTION_DEPRESS */ - movl $1,%edx /* 1 ms */ - movl $(SYSCALL_CONSTRUCT_MACH(61)),%eax /* 61 = thread_switch */ - syscall - jmp 0b -COMMPAGE_DESCRIPTOR(spin_lock_mp_64,_COMM_PAGE_SPINLOCK_LOCK,0,kUP) - - -COMMPAGE_FUNCTION_START(spin_unlock_64, 64, 4) - movl $0, (%rdi) - ret -COMMPAGE_DESCRIPTOR(spin_unlock_64,_COMM_PAGE_SPINLOCK_UNLOCK,0,0) diff --git a/osfmk/i386/copyio.c b/osfmk/i386/copyio.c new file mode 100644 index 000000000..82516b196 --- /dev/null +++ b/osfmk/i386/copyio.c @@ -0,0 +1,621 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * the copy engine has the following characteristics + * - copyio handles copies to/from user or kernel space + * - copypv deals with physical or virtual addresses + * + * implementation details as follows + * - a cache of up to NCOPY_WINDOWS is maintained per thread for + * access of user virtual space + * - the window size is determined by the amount of virtual space + * that can be mapped by a single page table + * - the mapping is done by copying the page table pointer from + * the user's directory entry corresponding to the window's + * address in user space to the directory entry corresponding + * to the window slot in the kernel's address space + * - the set of mappings is preserved across context switches, + * so the copy can run with pre-emption enabled + * - there is a gdt entry set up to anchor the kernel window on + * each processor + * - the copies are done using the selector corresponding to the + * gdt entry + * - the addresses corresponding to the user virtual address are + * relative to the beginning of the window being used to map + * that region... thus the thread can be pre-empted and switched + * to a different processor while in the midst of a copy + * - the window caches must be invalidated if the pmap changes out + * from under the thread... this can happen during vfork/exec... + * inval_copy_windows is the invalidation routine to be used + * - the copyio engine has 4 different states associated with it + * that allow for lazy tlb flushes and the ability to avoid + * a flush altogether if we've just come from user space + * the 4 states are as follows...
+ * + * WINDOWS_OPENED - set by copyio to indicate to the context + * switch code that it is necessary to do a tlbflush after + * switching the windows since we're in the middle of a copy + * + * WINDOWS_CLOSED - set by copyio to indicate that it's done + * using the windows, so that the context switch code need + * not do the tlbflush... instead it will set the state to... + * + * WINDOWS_DIRTY - set by the context switch code to indicate + * to the copy engine that it is responsible for doing a + * tlbflush before using the windows again... it's also + * set by the inval_copy_windows routine to indicate the + * same responsibility. + * + * WINDOWS_CLEAN - set by the return to user path to indicate + * that a tlbflush has happened and that there is no need + * for copyio to do another when it is entered next... + * + * - a window for mapping single physical pages is provided for copypv + * - this window is maintained across context switches and has the + * same characteristics as the user space windows w/r to pre-emption + */ + +extern int copyout_user(const char *, vm_offset_t, vm_size_t); +extern int copyout_kern(const char *, vm_offset_t, vm_size_t); +extern int copyin_user(const vm_offset_t, char *, vm_size_t); +extern int copyin_kern(const vm_offset_t, char *, vm_size_t); +extern int copyoutphys_user(const char *, vm_offset_t, vm_size_t); +extern int copyoutphys_kern(const char *, vm_offset_t, vm_size_t); +extern int copyinphys_user(const vm_offset_t, char *, vm_size_t); +extern int copyinphys_kern(const vm_offset_t, char *, vm_size_t); +extern int copyinstr_user(const vm_offset_t, char *, vm_size_t, vm_size_t *); +extern int copyinstr_kern(const vm_offset_t, char *, vm_size_t, vm_size_t *); + +static int copyio(int, user_addr_t, char *, vm_size_t, vm_size_t *, int); +static int copyio_phys(addr64_t, addr64_t, vm_size_t, int); + + +#define COPYIN 0 +#define COPYOUT 1 +#define COPYINSTR 2 +#define COPYINPHYS 3 +#define COPYOUTPHYS 4 + +void 
inval_copy_windows(thread_t thread) +{ + int i; + + for (i = 0; i < NCOPY_WINDOWS; i++) { + thread->machine.copy_window[i].user_base = -1; + } + thread->machine.nxt_window = 0; + thread->machine.copyio_state = WINDOWS_DIRTY; + + KERNEL_DEBUG(0xeff70058 | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), (int)thread->map, 0, 0, 0); +} + + +static int +copyio(int copy_type, user_addr_t user_addr, char *kernel_addr, + vm_size_t nbytes, vm_size_t *lencopied, int use_kernel_map) +{ + thread_t thread; + pmap_t pmap; + pt_entry_t *updp; + pt_entry_t *kpdp; + user_addr_t user_base; + vm_offset_t user_offset; + vm_offset_t kern_vaddr; + vm_size_t cnt; + vm_size_t bytes_copied; + int error = 0; + int window_index; + int copyio_state; + boolean_t istate; +#if KDEBUG + int debug_type = 0xeff70010; + debug_type += (copy_type << 2); +#endif + + thread = current_thread(); + + KERNEL_DEBUG(debug_type | DBG_FUNC_START, (int)(user_addr >> 32), (int)user_addr, + (int)nbytes, thread->machine.copyio_state, 0); + + if (nbytes == 0) { + KERNEL_DEBUG(debug_type | DBG_FUNC_END, (unsigned)user_addr, + (unsigned)kernel_addr, (unsigned)nbytes, 0, 0); + return (0); + } + pmap = thread->map->pmap; + + if (pmap == kernel_pmap || use_kernel_map) { + + kern_vaddr = (vm_offset_t)user_addr; + + switch (copy_type) { + + case COPYIN: + error = copyin_kern(kern_vaddr, kernel_addr, nbytes); + break; + + case COPYOUT: + error = copyout_kern(kernel_addr, kern_vaddr, nbytes); + break; + + case COPYINSTR: + error = copyinstr_kern(kern_vaddr, kernel_addr, nbytes, lencopied); + break; + + case COPYINPHYS: + error = copyinphys_kern(kern_vaddr, kernel_addr, nbytes); + break; + + case COPYOUTPHYS: + error = copyoutphys_kern(kernel_addr, kern_vaddr, nbytes); + break; + } + KERNEL_DEBUG(debug_type | DBG_FUNC_END, (unsigned)kern_vaddr, + (unsigned)kernel_addr, (unsigned)nbytes, + error | 0x80000000, 0); + return (error); + } + +#if CONFIG_DTRACE + thread->machine.specFlags |= CopyIOActive; +#endif /* CONFIG_DTRACE */ + 
+ if ((nbytes && (user_addr + nbytes <= user_addr)) || + (user_addr < vm_map_min(thread->map)) || + (user_addr + nbytes > vm_map_max(thread->map))) { + error = EFAULT; + goto done; + } + + user_base = user_addr & ~((user_addr_t)(NBPDE - 1)); + user_offset = (vm_offset_t)(user_addr & (NBPDE - 1)); + + KERNEL_DEBUG(debug_type | DBG_FUNC_NONE, (int)(user_base >> 32), (int)user_base, + (int)user_offset, 0, 0); + + cnt = NBPDE - user_offset; + + if (cnt > nbytes) + cnt = nbytes; + + istate = ml_set_interrupts_enabled(FALSE); + + copyio_state = thread->machine.copyio_state; + thread->machine.copyio_state = WINDOWS_OPENED; + + (void) ml_set_interrupts_enabled(istate); + + + for (;;) { + + for (window_index = 0; window_index < NCOPY_WINDOWS; window_index++) { + if (thread->machine.copy_window[window_index].user_base == user_base) + break; + } + if (window_index >= NCOPY_WINDOWS) { + + window_index = thread->machine.nxt_window; + thread->machine.nxt_window++; + + if (thread->machine.nxt_window >= NCOPY_WINDOWS) + thread->machine.nxt_window = 0; + + /* + * it's necessary to disable pre-emption + * since I have to compute the kernel descriptor pointer + * for the new window + */ + istate = ml_set_interrupts_enabled(FALSE); + + thread->machine.copy_window[window_index].user_base = user_base; + + updp = pmap_pde(pmap, user_base); + + kpdp = current_cpu_datap()->cpu_copywindow_pdp; + kpdp += window_index; + + pmap_store_pte(kpdp, updp ? 
*updp : 0); + + (void) ml_set_interrupts_enabled(istate); + + copyio_state = WINDOWS_DIRTY; + + KERNEL_DEBUG(0xeff70040 | DBG_FUNC_NONE, window_index, + (unsigned)user_base, (unsigned)updp, + (unsigned)kpdp, 0); + + } +#if JOE_DEBUG + else { + istate = ml_set_interrupts_enabled(FALSE); + + updp = pmap_pde(pmap, user_base); + + kpdp = current_cpu_datap()->cpu_copywindow_pdp; + + kpdp += window_index; + + if ((*kpdp & PG_FRAME) != (*updp & PG_FRAME)) { + panic("copyio: user pdp mismatch - kpdp = 0x%qx, updp = 0x%qx\n", *kpdp, *updp); + } + (void) ml_set_interrupts_enabled(istate); + } +#endif + if (copyio_state == WINDOWS_DIRTY) { + flush_tlb(); + + copyio_state = WINDOWS_CLEAN; + + KERNEL_DEBUG(0xeff70054 | DBG_FUNC_NONE, window_index, 0, 0, 0, 0); + } + user_offset += (window_index * NBPDE); + + KERNEL_DEBUG(0xeff70044 | DBG_FUNC_NONE, (unsigned)user_offset, + (unsigned)kernel_addr, cnt, 0, 0); + + switch (copy_type) { + + case COPYIN: + error = copyin_user(user_offset, kernel_addr, cnt); + break; + + case COPYOUT: + error = copyout_user(kernel_addr, user_offset, cnt); + break; + + case COPYINPHYS: + error = copyinphys_user(user_offset, kernel_addr, cnt); + break; + + case COPYOUTPHYS: + error = copyoutphys_user(kernel_addr, user_offset, cnt); + break; + + case COPYINSTR: + error = copyinstr_user(user_offset, kernel_addr, cnt, &bytes_copied); + + /* + * lencopied should be updated on success + * or ENAMETOOLONG... but not EFAULT + */ + if (error != EFAULT) + *lencopied += bytes_copied; + + /* + * if we still have room, then the ENAMETOOLONG + * is just an artifact of the buffer straddling + * a window boundary and we should continue + */ + if (error == ENAMETOOLONG && nbytes > cnt) + error = 0; + + if (error) { +#if KDEBUG + nbytes = *lencopied; +#endif + break; + } + if (*(kernel_addr + bytes_copied - 1) == 0) { + /* + * we found a NULL terminator... 
we're done + */ +#if KDEBUG + nbytes = *lencopied; +#endif + goto done; + } + if (cnt == nbytes) { + /* + * no more room in the buffer and we haven't + * yet come across a NULL terminator + */ +#if KDEBUG + nbytes = *lencopied; +#endif + error = ENAMETOOLONG; + break; + } + assert(cnt == bytes_copied); + + break; + } + if (error) + break; + if ((nbytes -= cnt) == 0) + break; + + kernel_addr += cnt; + user_base += NBPDE; + user_offset = 0; + + if (nbytes > NBPDE) + cnt = NBPDE; + else + cnt = nbytes; + } +done: + thread->machine.copyio_state = WINDOWS_CLOSED; + + KERNEL_DEBUG(debug_type | DBG_FUNC_END, (unsigned)user_addr, + (unsigned)kernel_addr, (unsigned)nbytes, error, 0); + +#if CONFIG_DTRACE + thread->machine.specFlags &= ~CopyIOActive; +#endif /* CONFIG_DTRACE */ + + return (error); +} + +static int +copyio_phys(addr64_t source, addr64_t sink, vm_size_t csize, int which) +{ + pmap_paddr_t paddr; + user_addr_t vaddr; + char *window_offset; + pt_entry_t pentry; + int ctype; + int retval; + boolean_t istate; + + + if (which & cppvPsnk) { + paddr = (pmap_paddr_t)sink; + vaddr = (user_addr_t)source; + ctype = COPYINPHYS; + pentry = (pt_entry_t)(INTEL_PTE_VALID | (paddr & PG_FRAME) | INTEL_PTE_RW); + } else { + paddr = (pmap_paddr_t)source; + vaddr = (user_addr_t)sink; + ctype = COPYOUTPHYS; + pentry = (pt_entry_t)(INTEL_PTE_VALID | (paddr & PG_FRAME)); + } + /* Fold in cache attributes for this physical page */ + pentry |= pmap_get_cache_attributes(i386_btop(paddr)); + window_offset = (char *)(uintptr_t)((uint32_t)paddr & (PAGE_SIZE - 1)); + + assert(!((current_thread()->machine.specFlags & CopyIOActive) && ((which & cppvKmap) == 0))); + + if (current_thread()->machine.physwindow_busy) { + pt_entry_t old_pentry; + + KERNEL_DEBUG(0xeff70048 | DBG_FUNC_NONE, paddr, csize, 0, -1, 0); + /* + * we had better be targeting wired memory at this point + * we will not be able to handle a fault with interrupts + * disabled... 
we disable them because we can't tolerate + * being preempted during this nested use of the window + */ + istate = ml_set_interrupts_enabled(FALSE); + + old_pentry = *(current_cpu_datap()->cpu_physwindow_ptep); + pmap_store_pte((current_cpu_datap()->cpu_physwindow_ptep), pentry); + + invlpg((uintptr_t)current_cpu_datap()->cpu_physwindow_base); + + retval = copyio(ctype, vaddr, window_offset, csize, NULL, which & cppvKmap); + + pmap_store_pte((current_cpu_datap()->cpu_physwindow_ptep), old_pentry); + + invlpg((uintptr_t)current_cpu_datap()->cpu_physwindow_base); + + (void) ml_set_interrupts_enabled(istate); + } else { + /* + * mark the window as in use... if an interrupt hits while we're + * busy, or we trigger another copypv from the fault path into + * the driver on a user address space page fault due to a copyin/out + * then we need to save and restore the current window state instead + * of caching the window preserving it across context switches + */ + current_thread()->machine.physwindow_busy = 1; + + if (current_thread()->machine.physwindow_pte != pentry) { + KERNEL_DEBUG(0xeff70048 | DBG_FUNC_NONE, paddr, csize, 0, 0, 0); + + current_thread()->machine.physwindow_pte = pentry; + + /* + * preemption at this point would be bad since we + * could end up on the other processor after we grabbed the + * pointer to the current cpu data area, but before we finished + * using it to stuff the page table entry since we would + * be modifying a window that no longer belonged to us + * the invlpg can be done unprotected since it only flushes + * this page address from the tlb...
if it flushes the wrong + * one, no harm is done, and the context switch that moved us + * to the other processor will have already taken care of + * flushing the tlb after it reloaded the page table from machine.physwindow_pte + */ + istate = ml_set_interrupts_enabled(FALSE); + + pmap_store_pte((current_cpu_datap()->cpu_physwindow_ptep), pentry); + (void) ml_set_interrupts_enabled(istate); + + invlpg((uintptr_t)current_cpu_datap()->cpu_physwindow_base); + } +#if JOE_DEBUG + else { + if (pentry != + (*(current_cpu_datap()->cpu_physwindow_ptep) & (INTEL_PTE_VALID | PG_FRAME | INTEL_PTE_RW))) + panic("copyio_phys: pentry != *physwindow_ptep"); + } +#endif + retval = copyio(ctype, vaddr, window_offset, csize, NULL, which & cppvKmap); + + current_thread()->machine.physwindow_busy = 0; + } + return (retval); +} + +int +copyinmsg(const user_addr_t user_addr, char *kernel_addr, mach_msg_size_t nbytes) +{ + return (copyio(COPYIN, user_addr, kernel_addr, nbytes, NULL, 0)); +} + +int +copyin(const user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes) +{ + return (copyio(COPYIN, user_addr, kernel_addr, nbytes, NULL, 0)); +} + +int +copyinstr(const user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes, vm_size_t *lencopied) +{ + *lencopied = 0; + + return (copyio(COPYINSTR, user_addr, kernel_addr, nbytes, lencopied, 0)); +} + +int +copyoutmsg(const char *kernel_addr, user_addr_t user_addr, mach_msg_size_t nbytes) +{ + return (copyio(COPYOUT, user_addr, (char *)(uintptr_t)kernel_addr, nbytes, NULL, 0)); +} + +int +copyout(const void *kernel_addr, user_addr_t user_addr, vm_size_t nbytes) +{ + return (copyio(COPYOUT, user_addr, (char *)(uintptr_t)kernel_addr, nbytes, NULL, 0)); +} + + +kern_return_t +copypv(addr64_t src64, addr64_t snk64, unsigned int size, int which) +{ + unsigned int lop, csize; + int bothphys = 0; + + KERNEL_DEBUG(0xeff7004c | DBG_FUNC_START, (unsigned)src64, + (unsigned)snk64, size, which, 0); + + if ((which & (cppvPsrc | cppvPsnk)) == 0 ) /* Make
sure that only one is virtual */ + panic("copypv: no more than 1 parameter may be virtual\n"); /* Not allowed */ + + if ((which & (cppvPsrc | cppvPsnk)) == (cppvPsrc | cppvPsnk)) + bothphys = 1; /* both are physical */ + + while (size) { + + if (bothphys) { + lop = (unsigned int)(PAGE_SIZE - (snk64 & (PAGE_SIZE - 1))); /* Assume sink smallest */ + + if (lop > (unsigned int)(PAGE_SIZE - (src64 & (PAGE_SIZE - 1)))) + lop = (unsigned int)(PAGE_SIZE - (src64 & (PAGE_SIZE - 1))); /* No, source is smaller */ + } else { + /* + * only need to compute the resid for the physical page + * address... we don't care about where we start/finish in + * the virtual since we just call the normal copyin/copyout + */ + if (which & cppvPsrc) + lop = (unsigned int)(PAGE_SIZE - (src64 & (PAGE_SIZE - 1))); + else + lop = (unsigned int)(PAGE_SIZE - (snk64 & (PAGE_SIZE - 1))); + } + csize = size; /* Assume we can copy it all */ + if (lop < size) + csize = lop; /* Nope, we can't do it all */ +#if 0 + /* + * flush_dcache64 is currently a nop on the i386... + * it's used when copying to non-system memory such + * as video capture cards... on PPC there was a need + * to flush due to how we mapped this memory... not + * sure if it's needed on i386. 
+ */ + if (which & cppvFsrc) + flush_dcache64(src64, csize, 1); /* If requested, flush source before move */ + if (which & cppvFsnk) + flush_dcache64(snk64, csize, 1); /* If requested, flush sink before move */ +#endif + if (bothphys) { + bcopy_phys(src64, snk64, csize); /* Do a physical copy, virtually */ + } + else { + if (copyio_phys(src64, snk64, csize, which)) { + return (KERN_FAILURE); + } + } +#if 0 + if (which & cppvFsrc) + flush_dcache64(src64, csize, 1); /* If requested, flush source after move */ + if (which & cppvFsnk) + flush_dcache64(snk64, csize, 1); /* If requested, flush sink after move */ +#endif + size -= csize; /* Calculate what is left */ + snk64 += csize; /* Bump sink to next physical address */ + src64 += csize; /* Bump source to next physical address */ + } + KERNEL_DEBUG(0xeff7004c | DBG_FUNC_END, (unsigned)src64, + (unsigned)snk64, size, which, 0); + + return KERN_SUCCESS; +} +void +copy_window_fault(thread_t thread, vm_map_t map, int window) +{ + pt_entry_t *updp; + pt_entry_t *kpdp; + + /* + * in case there was no page table assigned + * for the user base address and the pmap + * got 'expanded' due to this fault, we'll + * copy in the descriptor + * + * we're either setting the page table descriptor + * to the same value or it was 0... 
no need + * for a TLB flush in either case + */ + + updp = pmap_pde(map->pmap, thread->machine.copy_window[window].user_base); + assert(updp); + if (0 == updp) panic("trap: updp 0"); /* XXX DEBUG */ + kpdp = current_cpu_datap()->cpu_copywindow_pdp; + kpdp += window; + +#if JOE_DEBUG + if (*kpdp && (*kpdp & PG_FRAME) != (*updp & PG_FRAME)) + panic("kernel_fault: user pdp doesn't match - updp = 0x%qx, kpdp = 0x%qx\n", *updp, *kpdp); +#endif + pmap_store_pte(kpdp, *updp); +} diff --git a/osfmk/i386/cpu.c b/osfmk/i386/cpu.c index 0bcfbb77f..4cdeed647 100644 --- a/osfmk/i386/cpu.c +++ b/osfmk/i386/cpu.c @@ -40,12 +40,14 @@ #include #include #include -#include +#include #include #if CONFIG_VMX #include #endif #include +#include +#include struct processor processor_master; @@ -101,6 +103,9 @@ cpu_init(void) { cpu_data_t *cdp = current_cpu_datap(); + timer_call_initialize_queue(&cdp->rtclock_timer.queue); + cdp->rtclock_timer.deadline = EndOfAllTime; + cdp->cpu_type = cpuid_cputype(); cdp->cpu_subtype = cpuid_cpusubtype(); @@ -167,13 +172,6 @@ cpu_machine_init( PE_cpu_machine_init(cdp->cpu_id, !cdp->cpu_boot_complete); cdp->cpu_boot_complete = TRUE; cdp->cpu_running = TRUE; -#if 0 - if (cpu_datap(cpu)->hibernate) - { - cpu_datap(cpu)->hibernate = 0; - hibernate_machine_init(); - } -#endif ml_init_interrupt(); #if CONFIG_VMX @@ -246,8 +244,6 @@ slot_threadtype( return (cpu_datap(slot_num)->cpu_threadtype); } - - cpu_type_t cpu_type(void) { diff --git a/osfmk/i386/cpu_capabilities.h b/osfmk/i386/cpu_capabilities.h index 470e8a3e7..a820ea7aa 100644 --- a/osfmk/i386/cpu_capabilities.h +++ b/osfmk/i386/cpu_capabilities.h @@ -53,11 +53,11 @@ #define kHasSSE4_1 0x00000400 #define kHasSSE4_2 0x00000800 #define kHasAES 0x00001000 -#define kInOrderPipeline 0x00002000 /* in-order execution */ +#define kInOrderPipeline 0x00002000 #define kSlow 0x00004000 /* tsc < nanosecond */ #define kUP 0x00008000 /* set if (kNumCPUs == 1) */ #define kNumCPUs 0x00FF0000 /* number of CPUs (see 
_NumCPUs() below) */ - +#define kHasAVX1_0 0x01000000 #define kNumCPUsShift 16 /* see _NumCPUs() below */ #ifndef __ASSEMBLER__ @@ -75,6 +75,13 @@ int _NumCPUs( void ) #endif /* __ASSEMBLER__ */ +/* The following macro is used to generate the 64-bit commpage address for a given + * routine, based on its 32-bit address. This is used in the kernel to compile + * the 64-bit commpage. Since the kernel can be a 32-bit object, cpu_capabilities.h + * only defines the 32-bit address. + */ +#define _COMM_PAGE_32_TO_64( ADDRESS ) ( ADDRESS + _COMM_PAGE64_START_ADDRESS - _COMM_PAGE32_START_ADDRESS ) + /* * The shared kernel/user "comm page(s)": @@ -86,15 +93,19 @@ int _NumCPUs( void ) * Because Mach VM cannot map the last page of an address space, we don't use it. */ -#define _COMM_PAGE32_AREA_LENGTH ( 19 * 4096 ) /* reserved length of entire comm area */ -#define _COMM_PAGE32_BASE_ADDRESS ( 0xfffec000 ) /* base address of allocated memory, -20 pages */ -#define _COMM_PAGE32_START_ADDRESS ( 0xffff0000 ) /* address traditional commpage code starts on, -16 pages */ -#define _COMM_PAGE32_AREA_USED ( 19 * 4096 ) /* this is the amt actually allocated */ -#define _COMM_PAGE32_SIGS_OFFSET 0x8000 /* offset to routine signatures */ +#define _COMM_PAGE32_AREA_LENGTH ( 2 * 4096 ) /* reserved length of entire comm area */ +#define _COMM_PAGE32_BASE_ADDRESS ( 0xffff0000 ) /* base address of allocated memory */ +#define _COMM_PAGE32_START_ADDRESS ( _COMM_PAGE32_BASE_ADDRESS ) /* address traditional commpage code starts on */ +#define _COMM_PAGE32_AREA_USED ( 2 * 4096 ) /* this is the amt actually allocated */ +#define _COMM_PAGE32_SIGS_OFFSET 0x8000 /* offset to routine signatures */ #define _COMM_PAGE64_AREA_LENGTH ( 2 * 1024 * 1024 ) /* reserved length of entire comm area (2MB) */ -#define _COMM_PAGE64_BASE_ADDRESS ( 0x00007fffffe00000ULL ) /* base address of allocated memory */ -#define _COMM_PAGE64_START_ADDRESS ( _COMM_PAGE64_BASE_ADDRESS ) /* address traditional commpage code starts 
on */ +#ifdef __ASSEMBLER__ +#define _COMM_PAGE64_BASE_ADDRESS ( 0x00007fffffe00000 ) /* base address of allocated memory */ +#else /* __ASSEMBLER__ */ +#define _COMM_PAGE64_BASE_ADDRESS ( 0x00007fffffe00000ULL ) /* base address of allocated memory */ +#endif /* __ASSEMBLER__ */ +#define _COMM_PAGE64_START_ADDRESS ( _COMM_PAGE64_BASE_ADDRESS ) /* address traditional commpage code starts on */ #define _COMM_PAGE64_AREA_USED ( 2 * 4096 ) /* this is the amt actually populated */ /* no need for an Objective-C area on Intel */ @@ -139,34 +150,25 @@ int _NumCPUs( void ) #define _COMM_PAGE_SIGNATURE (_COMM_PAGE_START_ADDRESS+0x000) /* first few bytes are a signature */ #define _COMM_PAGE_VERSION (_COMM_PAGE_START_ADDRESS+0x01E) /* 16-bit version# */ -#define _COMM_PAGE_THIS_VERSION 11 /* version of the commarea format */ +#define _COMM_PAGE_THIS_VERSION 12 /* version of the commarea format */ #define _COMM_PAGE_CPU_CAPABILITIES (_COMM_PAGE_START_ADDRESS+0x020) /* uint32_t _cpu_capabilities */ -#define _COMM_PAGE_NCPUS (_COMM_PAGE_START_ADDRESS+0x022) /* uint8_t number of configured CPUs */ +#define _COMM_PAGE_NCPUS (_COMM_PAGE_START_ADDRESS+0x022) /* uint8_t number of configured CPUs (hw.logicalcpu at boot time) */ +#define _COMM_PAGE_UNUSED0 (_COMM_PAGE_START_ADDRESS+0x024) /* 2 unused bytes, reserved for future expansion of cpu_capabilities */ #define _COMM_PAGE_CACHE_LINESIZE (_COMM_PAGE_START_ADDRESS+0x026) /* uint16_t cache line size */ #define _COMM_PAGE_SCHED_GEN (_COMM_PAGE_START_ADDRESS+0x028) /* uint32_t scheduler generation number (count of pre-emptions) */ #define _COMM_PAGE_MEMORY_PRESSURE (_COMM_PAGE_START_ADDRESS+0x02c) /* uint32_t copy of vm_memory_pressure */ #define _COMM_PAGE_SPIN_COUNT (_COMM_PAGE_START_ADDRESS+0x030) /* uint32_t max spin count for mutex's */ -#define _COMM_PAGE_UNUSED1 (_COMM_PAGE_START_ADDRESS+0x034) /* 12 unused bytes */ +#define _COMM_PAGE_ACTIVE_CPUS (_COMM_PAGE_START_ADDRESS+0x034) /* uint8_t number of active CPUs (hw.activecpu) 
*/ +#define _COMM_PAGE_PHYSICAL_CPUS (_COMM_PAGE_START_ADDRESS+0x035) /* uint8_t number of physical CPUs (hw.physicalcpu_max) */ +#define _COMM_PAGE_LOGICAL_CPUS (_COMM_PAGE_START_ADDRESS+0x036) /* uint8_t number of logical CPUs (hw.logicalcpu_max) */ +#define _COMM_PAGE_UNUSED1 (_COMM_PAGE_START_ADDRESS+0x037) /* 1 unused bytes */ +#define _COMM_PAGE_MEMORY_SIZE (_COMM_PAGE_START_ADDRESS+0x038) /* uint64_t max memory size */ -#ifdef KERNEL_PRIVATE - -/* slots defined in all cases, but commpage setup code must not populate for 64-bit commpage */ -#define _COMM_PAGE_2_TO_52 (_COMM_PAGE_START_ADDRESS+0x040) /* double float constant 2**52 */ -#define _COMM_PAGE_10_TO_6 (_COMM_PAGE_START_ADDRESS+0x048) /* double float constant 10**6 */ - -#else /* !KERNEL_PRIVATE */ - -#if defined(__i386__) /* following are not defined in 64-bit */ -#define _COMM_PAGE_2_TO_52 (_COMM_PAGE_START_ADDRESS+0x040) /* double float constant 2**52 */ -#define _COMM_PAGE_10_TO_6 (_COMM_PAGE_START_ADDRESS+0x048) /* double float constant 10**6 */ -#else -#define _COMM_PAGE_UNUSED2 (_COMM_PAGE_START_ADDRESS+0x040) /* 16 unused bytes */ -#endif - -#endif /* !KERNEL_PRIVATE */ +#define _COMM_PAGE_CPUFAMILY (_COMM_PAGE_START_ADDRESS+0x040) /* uint32_t hw.cpufamily, x86*/ +#define _COMM_PAGE_UNUSED2 (_COMM_PAGE_START_ADDRESS+0x044) /* [0x44,0x50) unused */ #define _COMM_PAGE_TIME_DATA_START (_COMM_PAGE_START_ADDRESS+0x050) /* base of offsets below (_NT_SCALE etc) */ #define _COMM_PAGE_NT_TSC_BASE (_COMM_PAGE_START_ADDRESS+0x050) /* used by nanotime() */ @@ -194,49 +196,10 @@ int _NumCPUs( void ) /* When new jump table entries are added, corresponding symbols should be added below */ /* New slots should be allocated with at least 16-byte alignment. 
Some like bcopy require */ /* 32-byte alignment, and should be aligned as such in the assembly source before they are relocated */ -#define _COMM_PAGE_COMPARE_AND_SWAP32 (_COMM_PAGE_START_ADDRESS+0x080) /* compare-and-swap word */ -#define _COMM_PAGE_COMPARE_AND_SWAP64 (_COMM_PAGE_START_ADDRESS+0x0c0) /* compare-and-swap doubleword */ -#define _COMM_PAGE_ENQUEUE (_COMM_PAGE_START_ADDRESS+0x100) /* enqueue */ -#define _COMM_PAGE_DEQUEUE (_COMM_PAGE_START_ADDRESS+0x140) /* dequeue */ -#define _COMM_PAGE_MEMORY_BARRIER (_COMM_PAGE_START_ADDRESS+0x180) /* memory barrier */ -#define _COMM_PAGE_ATOMIC_ADD32 (_COMM_PAGE_START_ADDRESS+0x1a0) /* add atomic word */ -#define _COMM_PAGE_ATOMIC_ADD64 (_COMM_PAGE_START_ADDRESS+0x1c0) /* add atomic doubleword */ - -#define _COMM_PAGE_CPU_NUMBER (_COMM_PAGE_START_ADDRESS+0x1e0) /* user-level cpu_number() */ - -#define _COMM_PAGE_ABSOLUTE_TIME (_COMM_PAGE_START_ADDRESS+0x200) /* mach_absolute_time() */ -#define _COMM_PAGE_SPINLOCK_TRY (_COMM_PAGE_START_ADDRESS+0x220) /* spinlock_try() */ -#define _COMM_PAGE_SPINLOCK_LOCK (_COMM_PAGE_START_ADDRESS+0x260) /* spinlock_lock() */ -#define _COMM_PAGE_SPINLOCK_UNLOCK (_COMM_PAGE_START_ADDRESS+0x2a0) /* spinlock_unlock() */ -#define _COMM_PAGE_PTHREAD_GETSPECIFIC (_COMM_PAGE_START_ADDRESS+0x2c0) /* pthread_getspecific() */ -#define _COMM_PAGE_GETTIMEOFDAY (_COMM_PAGE_START_ADDRESS+0x2e0) /* used by gettimeofday() */ -#define _COMM_PAGE_FLUSH_DCACHE (_COMM_PAGE_START_ADDRESS+0x4e0) /* sys_dcache_flush() */ -#define _COMM_PAGE_FLUSH_ICACHE (_COMM_PAGE_START_ADDRESS+0x520) /* sys_icache_invalidate() */ -#define _COMM_PAGE_PTHREAD_SELF (_COMM_PAGE_START_ADDRESS+0x580) /* pthread_self() */ +#define _COMM_PAGE_TEXT_START (_COMM_PAGE_START_ADDRESS+0x080) /* start of text section */ #define _COMM_PAGE_PREEMPT (_COMM_PAGE_START_ADDRESS+0x5a0) /* used by PFZ code */ - -#define _COMM_PAGE_RELINQUISH (_COMM_PAGE_START_ADDRESS+0x5c0) /* used by spinlocks */ -#define _COMM_PAGE_BTS 
(_COMM_PAGE_START_ADDRESS+0x5e0) /* bit test-and-set */ -#define _COMM_PAGE_BTC (_COMM_PAGE_START_ADDRESS+0x5f0) /* bit test-and-clear */ - -#define _COMM_PAGE_BZERO (_COMM_PAGE_START_ADDRESS+0x600) /* bzero() */ -#define _COMM_PAGE_BCOPY (_COMM_PAGE_START_ADDRESS+0x780) /* bcopy() */ -#define _COMM_PAGE_MEMCPY (_COMM_PAGE_START_ADDRESS+0x7a0) /* memcpy() */ -#define _COMM_PAGE_MEMMOVE (_COMM_PAGE_START_ADDRESS+0x7a0) /* memmove() */ -#define _COMM_PAGE_BCOPY_END (_COMM_PAGE_START_ADDRESS+0xfff) /* used by rosetta */ - -#define _COMM_PAGE_MEMSET_PATTERN (_COMM_PAGE_START_ADDRESS+0x1000) /* used by nonzero memset() */ -#define _COMM_PAGE_LONGCOPY (_COMM_PAGE_START_ADDRESS+0x1200) /* used by bcopy() for very long operands */ -#define _COMM_PAGE_LONGCOPY_END (_COMM_PAGE_START_ADDRESS+0x15ff) /* used by rosetta */ - #define _COMM_PAGE_BACKOFF (_COMM_PAGE_START_ADDRESS+0x1600) /* called from PFZ */ -#define _COMM_PAGE_FIFO_ENQUEUE (_COMM_PAGE_START_ADDRESS+0x1680) /* FIFO enqueue */ -#define _COMM_PAGE_FIFO_DEQUEUE (_COMM_PAGE_START_ADDRESS+0x16c0) /* FIFO dequeue */ -#define _COMM_PAGE_NANOTIME (_COMM_PAGE_START_ADDRESS+0x1700) /* nanotime() */ -#define _COMM_PAGE_MUTEX_LOCK (_COMM_PAGE_START_ADDRESS+0x1780) /* pthread_mutex_lock() */ - -#define _COMM_PAGE_UNUSED5 (_COMM_PAGE_START_ADDRESS+0x17e0) /* unused space for regular code up to 0x1c00 */ #define _COMM_PAGE_PFZ_START (_COMM_PAGE_START_ADDRESS+0x1c00) /* start of Preemption Free Zone */ @@ -265,38 +228,8 @@ symbol_name: nop .text /* Required to make a well behaved symbol file */ - CREATE_COMM_PAGE_SYMBOL(___compare_and_swap32, _COMM_PAGE_COMPARE_AND_SWAP32) - CREATE_COMM_PAGE_SYMBOL(___compare_and_swap64, _COMM_PAGE_COMPARE_AND_SWAP64) - CREATE_COMM_PAGE_SYMBOL(___atomic_enqueue, _COMM_PAGE_ENQUEUE) - CREATE_COMM_PAGE_SYMBOL(___atomic_dequeue, _COMM_PAGE_DEQUEUE) - CREATE_COMM_PAGE_SYMBOL(___memory_barrier, _COMM_PAGE_MEMORY_BARRIER) - CREATE_COMM_PAGE_SYMBOL(___atomic_add32, _COMM_PAGE_ATOMIC_ADD32) - 
CREATE_COMM_PAGE_SYMBOL(___atomic_add64, _COMM_PAGE_ATOMIC_ADD64) - CREATE_COMM_PAGE_SYMBOL(___cpu_number, _COMM_PAGE_CPU_NUMBER) - CREATE_COMM_PAGE_SYMBOL(___mach_absolute_time, _COMM_PAGE_ABSOLUTE_TIME) - CREATE_COMM_PAGE_SYMBOL(___spin_lock_try, _COMM_PAGE_SPINLOCK_TRY) - CREATE_COMM_PAGE_SYMBOL(___spin_lock, _COMM_PAGE_SPINLOCK_LOCK) - CREATE_COMM_PAGE_SYMBOL(___spin_unlock, _COMM_PAGE_SPINLOCK_UNLOCK) - CREATE_COMM_PAGE_SYMBOL(___pthread_getspecific, _COMM_PAGE_PTHREAD_GETSPECIFIC) - CREATE_COMM_PAGE_SYMBOL(___gettimeofday, _COMM_PAGE_GETTIMEOFDAY) - CREATE_COMM_PAGE_SYMBOL(___sys_dcache_flush, _COMM_PAGE_FLUSH_DCACHE) - CREATE_COMM_PAGE_SYMBOL(___sys_icache_invalidate, _COMM_PAGE_FLUSH_ICACHE) - CREATE_COMM_PAGE_SYMBOL(___pthread_self, _COMM_PAGE_PTHREAD_SELF) - CREATE_COMM_PAGE_SYMBOL(___pfz_preempt, _COMM_PAGE_PREEMPT) - CREATE_COMM_PAGE_SYMBOL(___spin_lock_relinquish, _COMM_PAGE_RELINQUISH) - CREATE_COMM_PAGE_SYMBOL(___bit_test_and_set, _COMM_PAGE_BTS) - CREATE_COMM_PAGE_SYMBOL(___bit_test_and_clear, _COMM_PAGE_BTC) - CREATE_COMM_PAGE_SYMBOL(___bzero, _COMM_PAGE_BZERO) - CREATE_COMM_PAGE_SYMBOL(___bcopy, _COMM_PAGE_BCOPY) - CREATE_COMM_PAGE_SYMBOL(___memcpy, _COMM_PAGE_MEMCPY) -/* CREATE_COMM_PAGE_SYMBOL(___memmove, _COMM_PAGE_MEMMOVE) */ - CREATE_COMM_PAGE_SYMBOL(___memset_pattern, _COMM_PAGE_MEMSET_PATTERN) - CREATE_COMM_PAGE_SYMBOL(___longcopy, _COMM_PAGE_LONGCOPY) + CREATE_COMM_PAGE_SYMBOL(___preempt, _COMM_PAGE_PREEMPT) CREATE_COMM_PAGE_SYMBOL(___backoff, _COMM_PAGE_BACKOFF) - CREATE_COMM_PAGE_SYMBOL(___fifo_enqueue, _COMM_PAGE_FIFO_ENQUEUE) - CREATE_COMM_PAGE_SYMBOL(___fifo_dequeue, _COMM_PAGE_FIFO_DEQUEUE) - CREATE_COMM_PAGE_SYMBOL(___nanotime, _COMM_PAGE_NANOTIME) - CREATE_COMM_PAGE_SYMBOL(___mutex_lock, _COMM_PAGE_MUTEX_LOCK) CREATE_COMM_PAGE_SYMBOL(___pfz_enqueue, _COMM_PAGE_PFZ_ENQUEUE) CREATE_COMM_PAGE_SYMBOL(___pfz_dequeue, _COMM_PAGE_PFZ_DEQUEUE) CREATE_COMM_PAGE_SYMBOL(___pfz_mutex_lock, _COMM_PAGE_PFZ_MUTEX_LOCK) diff --git 
a/osfmk/i386/cpu_data.h b/osfmk/i386/cpu_data.h index 63eb4446b..22de8b2b0 100644 --- a/osfmk/i386/cpu_data.h +++ b/osfmk/i386/cpu_data.h @@ -43,7 +43,7 @@ #include #include #include -#include +#include #include #include @@ -51,6 +51,8 @@ #include #endif +#include + /* * Data structures referenced (anonymously) from per-cpu data: */ @@ -58,14 +60,13 @@ struct cpu_cons_buffer; struct cpu_desc_table; struct mca_state; - /* * Data structures embedded in per-cpu data: */ typedef struct rtclock_timer { - queue_head_t queue; + mpqueue_head_t queue; uint64_t deadline; - boolean_t is_set; + uint64_t when_set; boolean_t has_expired; } rtclock_timer_t; @@ -125,9 +126,11 @@ typedef enum { typedef struct { addr64_t cu_isf; /* thread->pcb->iss.isf */ uint64_t cu_tmp; /* temporary scratch */ - addr64_t cu_user_gs_base; + addr64_t cu_user_gs_base; } cpu_uber_t; +typedef uint16_t pcid_t; +typedef uint8_t pcid_ref_t; /* * Per-cpu data. * @@ -143,16 +146,17 @@ typedef struct { */ typedef struct cpu_data { + struct pal_cpu_data cpu_pal_data; /* PAL-specific data */ +#define cpu_pd cpu_pal_data /* convenience alias */ struct cpu_data *cpu_this; /* pointer to myself */ thread_t cpu_active_thread; + int cpu_preemption_level; + int cpu_number; /* Logical CPU */ void *cpu_int_state; /* interrupt state */ vm_offset_t cpu_active_stack; /* kernel stack base */ vm_offset_t cpu_kernel_stack; /* kernel stack top */ vm_offset_t cpu_int_stack_top; - int cpu_preemption_level; - int cpu_simple_lock_count; int cpu_interrupt_level; - int cpu_number; /* Logical CPU */ int cpu_phys_number; /* Physical CPU */ cpu_id_t cpu_id; /* Platform Expert */ int cpu_signals; /* IPI events */ @@ -167,9 +171,16 @@ typedef struct cpu_data int cpu_running; rtclock_timer_t rtclock_timer; boolean_t cpu_is64bit; - task_map_t cpu_task_map; + volatile addr64_t cpu_active_cr3 __attribute((aligned(64))); + union { + volatile uint32_t cpu_tlb_invalid; + struct { + volatile uint16_t cpu_tlb_invalid_local; + volatile uint16_t 
cpu_tlb_invalid_global; + }; + }; + volatile task_map_t cpu_task_map; volatile addr64_t cpu_task_cr3; - volatile addr64_t cpu_active_cr3; addr64_t cpu_kernel_cr3; cpu_uber_t cpu_uber; void *cpu_chud; @@ -195,20 +206,17 @@ typedef struct cpu_data boolean_t cpu_iflag; boolean_t cpu_boot_complete; int cpu_hibernate; - #if NCOPY_WINDOWS > 0 vm_offset_t cpu_copywindow_base; uint64_t *cpu_copywindow_pdp; vm_offset_t cpu_physwindow_base; uint64_t *cpu_physwindow_ptep; - void *cpu_hi_iss; #endif + void *cpu_hi_iss; - - - volatile boolean_t cpu_tlb_invalid; - uint32_t cpu_hwIntCnt[256]; /* Interrupt counts */ +#define HWINTCNT_SIZE 256 + uint32_t cpu_hwIntCnt[HWINTCNT_SIZE]; /* Interrupt counts */ uint64_t cpu_dr7; /* debug control register */ uint64_t cpu_int_event_time; /* intr entry/exit time */ #if CONFIG_VMX @@ -226,11 +234,26 @@ typedef struct cpu_data * arg store * validity flag. */ - rtc_nanotime_t *cpu_nanotime; /* Nanotime info */ + pal_rtc_nanotime_t *cpu_nanotime; /* Nanotime info */ thread_t csw_old_thread; thread_t csw_new_thread; - uint64_t cpu_max_observed_int_latency; - int cpu_max_observed_int_latency_vector; +#if defined(__x86_64__) + uint32_t cpu_pmap_pcid_enabled; + pcid_t cpu_active_pcid; + pcid_t cpu_last_pcid; + volatile pcid_ref_t *cpu_pmap_pcid_coherentp; + volatile pcid_ref_t *cpu_pmap_pcid_coherentp_kernel; +#define PMAP_PCID_MAX_PCID (0x1000) + pcid_t cpu_pcid_free_hint; + pcid_ref_t cpu_pcid_refcounts[PMAP_PCID_MAX_PCID]; + pmap_t cpu_pcid_last_pmap_dispatched[PMAP_PCID_MAX_PCID]; +#ifdef PCID_STATS + uint64_t cpu_pmap_pcid_flushes; + uint64_t cpu_pmap_pcid_preserves; +#endif +#endif /* x86_64 */ + uint64_t cpu_max_observed_int_latency; + int cpu_max_observed_int_latency_vector; uint64_t debugger_entry_time; volatile boolean_t cpu_NMI_acknowledged; /* A separate nested interrupt stack flag, to account @@ -240,6 +263,8 @@ typedef struct cpu_data */ uint32_t cpu_nested_istack; uint32_t cpu_nested_istack_events; + x86_saved_state64_t 
*cpu_fatal_trap_state; + x86_saved_state64_t *cpu_post_fatal_trap_state; } cpu_data_t; extern cpu_data_t *cpu_data_ptr[]; @@ -256,6 +281,24 @@ extern cpu_data_t cpu_data_master; : "i" (offsetof(cpu_data_t,member))); \ return ret; +#define CPU_DATA_GET_INDEX(member,index,type) \ + type ret; \ + __asm__ volatile ("mov %%gs:(%1),%0" \ + : "=r" (ret) \ + : "r" (offsetof(cpu_data_t,member[index]))); \ + return ret; + +#define CPU_DATA_SET(member,value) \ + __asm__ volatile ("mov %0,%%gs:%P1" \ + : \ + : "r" (value), "i" (offsetof(cpu_data_t,member))); +#define CPU_DATA_XCHG(member,value,type) \ + type ret; \ + __asm__ volatile ("xchg %0,%%gs:%P1" \ + : "=r" (ret) \ + : "i" (offsetof(cpu_data_t,member)), "0" (value)); \ + return ret; + /* * Everyone within the osfmk part of the kernel can use the fast * inline versions of these routines. Everyone outside, must call @@ -269,14 +312,14 @@ get_active_thread(void) #define current_thread_fast() get_active_thread() #define current_thread() current_thread_fast() -#if defined(__i386__) static inline boolean_t get_is64bit(void) { CPU_DATA_GET(cpu_is64bit, boolean_t) } +#if CONFIG_YONAH #define cpu_mode_is64bit() get_is64bit() -#elif defined(__x86_64__) +#else #define cpu_mode_is64bit() TRUE #endif @@ -286,11 +329,6 @@ get_preemption_level(void) CPU_DATA_GET(cpu_preemption_level,int) } static inline int -get_simple_lock_count(void) -{ - CPU_DATA_GET(cpu_simple_lock_count,int) -} -static inline int get_interrupt_level(void) { CPU_DATA_GET(cpu_interrupt_level,int) diff --git a/osfmk/i386/cpuid.c b/osfmk/i386/cpuid.c index b836ba88a..c6891aefb 100644 --- a/osfmk/i386/cpuid.c +++ b/osfmk/i386/cpuid.c @@ -235,7 +235,7 @@ static void cpuid_fn(uint32_t selector, uint32_t *result) #else static void cpuid_fn(uint32_t selector, uint32_t *result) { - if (cpu_mode_is64bit()) { + if (get_is64bit()) { asm("call _cpuid64" : "=a" (result[0]), "=b" (result[1]), @@ -353,7 +353,15 @@ cpuid_set_cache_info( i386_cpu_info_t * info_p ) 
info_p->cache_sharing[type] = cache_sharing; info_p->cache_partitions[type] = cache_partitions; linesizes[type] = cache_linesize; - + + /* + * Overwrite associativity determined via + * CPUID.0x80000006 -- this leaf is more + * accurate + */ + if (type == L2U) + info_p->cpuid_cache_L2_associativity = cache_associativity; + /* Compute the number of page colors for this cache, * which is: * ( linesize * sets ) / page_size @@ -501,10 +509,24 @@ cpuid_set_generic_info(i386_cpu_info_t *info_p) /* Get cache and addressing info. */ if (info_p->cpuid_max_ext >= 0x80000006) { + uint32_t assoc; cpuid_fn(0x80000006, reg); info_p->cpuid_cache_linesize = bitfield32(reg[ecx], 7, 0); - info_p->cpuid_cache_L2_associativity = - bitfield32(reg[ecx],15,12); + assoc = bitfield32(reg[ecx],15,12); + /* + * L2 associativity is encoded, though in an insufficiently + * descriptive fashion, e.g. 24-way is mapped to 16-way. + * Represent a fully associative cache as 0xFFFF. + * Overwritten by associativity as determined via CPUID.4 + * if available. + */ + if (assoc == 6) + assoc = 8; + else if (assoc == 8) + assoc = 16; + else if (assoc == 0xF) + assoc = 0xFFFF; + info_p->cpuid_cache_L2_associativity = assoc; info_p->cpuid_cache_size = bitfield32(reg[ecx],31,16); cpuid_fn(0x80000008, reg); info_p->cpuid_address_bits_physical = @@ -513,8 +535,15 @@ cpuid_set_generic_info(i386_cpu_info_t *info_p) bitfield32(reg[eax],15, 8); } - /* get processor signature and decode */ + /* + * Get processor signature and decode + * and bracket this with the approved procedure for reading the + * microcode version number a.k.a. signature a.k.a. 
BIOS ID + */ + wrmsr64(MSR_IA32_BIOS_SIGN_ID, 0); cpuid_fn(1, reg); + info_p->cpuid_microcode_version = + (uint32_t) (rdmsr64(MSR_IA32_BIOS_SIGN_ID) >> 32); info_p->cpuid_signature = reg[eax]; info_p->cpuid_stepping = bitfield32(reg[eax], 3, 0); info_p->cpuid_model = bitfield32(reg[eax], 7, 4); @@ -525,6 +554,9 @@ cpuid_set_generic_info(i386_cpu_info_t *info_p) info_p->cpuid_brand = bitfield32(reg[ebx], 7, 0); info_p->cpuid_features = quad(reg[ecx], reg[edx]); + /* Get "processor flag"; necessary for microcode update matching */ + info_p->cpuid_processor_flag = (rdmsr64(MSR_IA32_PLATFORM_ID)>> 50) & 3; + /* Fold extensions into family/model */ if (info_p->cpuid_family == 0x0f) info_p->cpuid_family += info_p->cpuid_extfamily; @@ -550,10 +582,6 @@ cpuid_set_generic_info(i386_cpu_info_t *info_p) reg[edx] & (uint32_t)CPUID_EXTFEATURE_TSCI; } - /* Find the microcode version number a.k.a. signature a.k.a. BIOS ID */ - info_p->cpuid_microcode_version = - (uint32_t) (rdmsr64(MSR_IA32_BIOS_SIGN_ID) >> 32); - if (info_p->cpuid_max_basic >= 0x5) { cpuid_mwait_leaf_t *cmp = &info_p->cpuid_mwait_leaf; @@ -625,12 +653,11 @@ cpuid_set_cpufamily(i386_cpu_info_t *info_p) switch (info_p->cpuid_family) { case 6: switch (info_p->cpuid_model) { - case 13: - cpufamily = CPUFAMILY_INTEL_6_13; - break; +#if CONFIG_YONAH case 14: cpufamily = CPUFAMILY_INTEL_YONAH; break; +#endif case 15: cpufamily = CPUFAMILY_INTEL_MEROM; break; @@ -681,7 +708,7 @@ cpuid_set_info(void) info_p->cpuid_cpu_type = CPU_TYPE_X86; info_p->cpuid_cpu_subtype = CPU_SUBTYPE_X86_ARCH1; - + /* Must be invoked after set_generic_info */ cpuid_set_cache_info(&cpuid_cpu_info); /* @@ -764,11 +791,11 @@ static struct { {CPUID_FEATURE_MOVBE, "MOVBE"}, {CPUID_FEATURE_POPCNT, "POPCNT"}, {CPUID_FEATURE_AES, "AES"}, + {CPUID_FEATURE_VMM, "VMM"}, + {CPUID_FEATURE_PCID, "PCID"}, {CPUID_FEATURE_XSAVE, "XSAVE"}, {CPUID_FEATURE_OSXSAVE, "OSXSAVE"}, - {CPUID_FEATURE_VMM, "VMM"}, {CPUID_FEATURE_SEGLIM64, "SEGLIM64"}, - 
{CPUID_FEATURE_PCID, "PCID"}, {CPUID_FEATURE_TSCTMR, "TSCTMR"}, {CPUID_FEATURE_AVX1_0, "AVX1.0"}, {0, 0} diff --git a/osfmk/i386/cpuid.h b/osfmk/i386/cpuid.h index 4c3c329c0..51bd428f6 100644 --- a/osfmk/i386/cpuid.h +++ b/osfmk/i386/cpuid.h @@ -147,7 +147,6 @@ #define CPUID_MODEL_DALES_32NM 0x25 /* Clarkdale, Arrandale */ #define CPUID_MODEL_WESTMERE 0x2C /* Gulftown, Westmere-EP, Westmere-WS */ #define CPUID_MODEL_WESTMERE_EX 0x2F -/* Additional internal models go here */ #define CPUID_MODEL_SANDYBRIDGE 0x2A #define CPUID_MODEL_JAKETOWN 0x2D @@ -266,6 +265,7 @@ typedef struct { uint64_t cpuid_extfeatures; uint32_t cpuid_signature; uint8_t cpuid_brand; + uint8_t cpuid_processor_flag; uint32_t cache_size[LCACHE_MAX]; uint32_t cache_linesize; diff --git a/osfmk/i386/cswitch.s b/osfmk/i386/cswitch.s index 3110cc2c6..6651e5404 100644 --- a/osfmk/i386/cswitch.s +++ b/osfmk/i386/cswitch.s @@ -60,15 +60,9 @@ #include #include +#include #include -#ifdef SYMMETRY -#include -#endif - -#if AT386 -#include -#endif /* AT386 */ #define CX(addr, reg) addr(,reg,4) @@ -80,7 +74,7 @@ Entry(Load_context) movl S_ARG0,%ecx /* get thread */ movl TH_KERNEL_STACK(%ecx),%ecx /* get kernel stack */ - lea -IKS_SIZE-IEL_SIZE(%ecx),%edx + lea -IKS_SIZE(%ecx),%edx add EXT(kernel_stack_size),%edx /* point to stack top */ movl %ecx,%gs:CPU_ACTIVE_STACK /* store stack address */ movl %edx,%gs:CPU_KERNEL_STACK /* store stack top */ @@ -116,7 +110,7 @@ Entry(Switch_context) movl 8(%esp),%ecx /* get new thread */ movl %ecx,%gs:CPU_ACTIVE_THREAD /* new thread is active */ movl TH_KERNEL_STACK(%ecx),%ebx /* get its kernel stack */ - lea -IKS_SIZE-IEL_SIZE(%ebx),%ecx + lea -IKS_SIZE(%ebx),%ecx add EXT(kernel_stack_size),%ecx /* point to stack top */ diff --git a/osfmk/i386/db_interface.c b/osfmk/i386/db_interface.c index e4c025bdf..9e76b5406 100644 --- a/osfmk/i386/db_interface.c +++ b/osfmk/i386/db_interface.c @@ -806,6 +806,7 @@ db_machdep_init(void) int c; db_simple_lock_init(&kdb_lock, 0); +#if 
MACH_KDB /*this only works for legacy 32-bit machines */ for (c = 0; c < real_ncpus; ++c) { if (c == master_cpu) { master_dbtss.esp0 = (int)(db_task_stack_store + @@ -818,6 +819,7 @@ db_machdep_init(void) */ } } +#endif } /* diff --git a/osfmk/i386/db_machdep.h b/osfmk/i386/db_machdep.h index ca046869d..e57dfca36 100644 --- a/osfmk/i386/db_machdep.h +++ b/osfmk/i386/db_machdep.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -187,7 +187,7 @@ extern void db_task_name( /* macro for checking if a thread has used floating-point */ -#define db_act_fp_used(act) (act && act->machine.pcb->ifps) +#define db_act_fp_used(act) (act && act->machine.ifps) extern void db_tss_to_frame( int tss_sel, diff --git a/osfmk/i386/db_trace.c b/osfmk/i386/db_trace.c index a14bb16b5..136418ea2 100644 --- a/osfmk/i386/db_trace.c +++ b/osfmk/i386/db_trace.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -255,9 +255,9 @@ db_i386_reg_value( } } if (dp == 0) { - if (!thr_act || thr_act->machine.pcb == 0) + if (!thr_act) db_error("no pcb\n"); - dp = (unsigned int *)((unsigned int)(thr_act->machine.pcb->iss) + + dp = (unsigned int *)((unsigned int)(thr_act->machine.iss) + ((unsigned int)vp->valuep - (unsigned int)&ddb_regs)); } } @@ -409,8 +409,8 @@ db_nextframe( break; case SYSCALL: - if (thr_act != THREAD_NULL && thr_act->machine.pcb) { - iss32 = (x86_saved_state32_t *)thr_act->machine.pcb->iss; + if (thr_act != THREAD_NULL) { + iss32 = (x86_saved_state32_t *)thr_act->machine.iss; *ip = (db_addr_t)(iss32->eip); *fp = (struct i386_frame *)(iss32->ebp); @@ -548,10 +548,6 @@ db_stack_trace_cmd( frame = (struct i386_frame *)ddb_regs.ebp; callpc = (db_addr_t)ddb_regs.eip; } else { - if (th->machine.pcb == 0) { - db_printf("thread has no pcb\n"); - return; - } if (!th) { db_printf("thread has no shuttle\n"); @@ -565,7 +561,7 @@ db_stack_trace_cmd( DB_STGY_PROC, task); db_printf("\n"); - iss32 = (x86_saved_state32_t *)th->machine.pcb->iss; + iss32 = (x86_saved_state32_t *)th->machine.iss; frame = (struct i386_frame *) (iss32->ebp); callpc = (db_addr_t) (iss32->eip); @@ -586,7 +582,7 @@ db_stack_trace_cmd( * which is not the top_most one in the RPC chain: * use the activation's pcb. 
*/ - iss32 = (x86_saved_state32_t *)th->machine.pcb->iss; + iss32 = (x86_saved_state32_t *)th->machine.iss; frame = (struct i386_frame *) (iss32->ebp); callpc = (db_addr_t) (iss32->eip); diff --git a/osfmk/i386/endian.h b/osfmk/i386/endian.h index 88fbc2cd5..62dedd9e9 100644 --- a/osfmk/i386/endian.h +++ b/osfmk/i386/endian.h @@ -50,8 +50,8 @@ */ #if !defined(ntohs) -unsigned short ntohs(unsigned short); -extern __inline__ +static __inline__ unsigned short ntohs(unsigned short); +static __inline__ unsigned short ntohs(unsigned short w_int) { @@ -67,8 +67,8 @@ unsigned short htons(unsigned short); #endif #if !defined(ntohl) -unsigned long ntohl(unsigned long); -extern __inline__ +static __inline__ unsigned long ntohl(unsigned long); +static __inline__ unsigned long ntohl(register unsigned long value) { diff --git a/osfmk/i386/etimer.c b/osfmk/i386/etimer.c index 72d3c94b7..c196f8b9f 100644 --- a/osfmk/i386/etimer.c +++ b/osfmk/i386/etimer.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -76,40 +76,46 @@ etimer_intr(int user_mode, pp = current_cpu_datap(); - abstime = mach_absolute_time(); /* Get the time now */ + SCHED_STATS_TIMER_POP(current_processor()); + + abstime = mach_absolute_time(); /* Get the time now */ /* has a pending clock timer expired? */ - mytimer = &pp->rtclock_timer; + mytimer = &pp->rtclock_timer; /* Point to the event timer */ if (mytimer->deadline <= abstime) { - /* + /* * Log interrupt service latency (-ve value expected by tool) * a non-PM event is expected next. + * The requested deadline may be earlier than when it was set + * - use MAX to avoid reporting bogus latencies. 
*/ - latency = (int32_t) (abstime - mytimer->deadline); + latency = (int32_t) (abstime - MAX(mytimer->deadline, + mytimer->when_set)); KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_EXCP_DECI, 0) | DBG_FUNC_NONE, - -latency, - (uint32_t)rip, user_mode, 0, 0); + DECR_TRAP_LATENCY | DBG_FUNC_NONE, + -latency, rip, user_mode, 0, 0); - mytimer->has_expired = TRUE; /* Remember that we popped */ + mytimer->has_expired = TRUE; /* Remember that we popped */ mytimer->deadline = timer_queue_expire(&mytimer->queue, abstime); mytimer->has_expired = FALSE; - /* Get the time again since we ran for a bit */ + /* Get the time again since we ran a bit */ abstime = mach_absolute_time(); + mytimer->when_set = abstime; } /* is it time for power management state change? */ if ((pmdeadline = pmCPUGetDeadline(pp)) && (pmdeadline <= abstime)) { KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_EXCP_DECI, 3) | DBG_FUNC_START, - 0, 0, 0, 0, 0); + DECR_PM_DEADLINE | DBG_FUNC_START, + 0, 0, 0, 0, 0); pmCPUDeadline(pp); KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_EXCP_DECI, 3) | DBG_FUNC_END, - 0, 0, 0, 0, 0); + DECR_PM_DEADLINE | DBG_FUNC_END, + 0, 0, 0, 0, 0); } + /* schedule our next deadline */ etimer_resync_deadlines(); } @@ -126,7 +132,8 @@ void etimer_set_deadline(uint64_t deadline) pp = current_cpu_datap(); mytimer = &pp->rtclock_timer; /* Point to the timer itself */ - mytimer->deadline = deadline; /* Set the new expiration time */ + mytimer->deadline = deadline; /* Set new expiration time */ + mytimer->when_set = mach_absolute_time(); etimer_resync_deadlines(); @@ -164,7 +171,7 @@ etimer_resync_deadlines(void) */ pmdeadline = pmCPUGetDeadline(pp); if (0 < pmdeadline && pmdeadline < deadline) - deadline = pmdeadline; + deadline = pmdeadline; /* * Go and set the "pop" event. 
@@ -173,10 +180,10 @@ etimer_resync_deadlines(void) /* Record non-PM deadline for latency tool */ if (deadline != pmdeadline) { - KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_EXCP_DECI, 1) | DBG_FUNC_NONE, - decr, 2, - deadline, (uint32_t)(deadline >> 32), 0); + KERNEL_DEBUG_CONSTANT( + DECR_SET_DEADLINE | DBG_FUNC_NONE, + decr, 2, + deadline, (uint32_t)(deadline >> 32), 0); } splx(s); } @@ -199,32 +206,45 @@ __unused void *arg) mytimer->has_expired = TRUE; mytimer->deadline = timer_queue_expire(&mytimer->queue, abstime); mytimer->has_expired = FALSE; + mytimer->when_set = mach_absolute_time(); etimer_resync_deadlines(); } -queue_t +uint64_t +timer_call_slop( + uint64_t deadline) +{ + uint64_t now = mach_absolute_time(); + if (deadline > now) { + return MIN((deadline - now) >> 3, NSEC_PER_MSEC); /* Min of 12.5% and 1ms */ + } + + return 0; +} + +mpqueue_head_t * timer_queue_assign( uint64_t deadline) { - cpu_data_t *cdp = current_cpu_datap(); - rtclock_timer_t *timer; + cpu_data_t *cdp = current_cpu_datap(); + mpqueue_head_t *queue; if (cdp->cpu_running) { - timer = &cdp->rtclock_timer; + queue = &cdp->rtclock_timer.queue; - if (deadline < timer->deadline) + if (deadline < cdp->rtclock_timer.deadline) etimer_set_deadline(deadline); } else - timer = &cpu_datap(master_cpu)->rtclock_timer; + queue = &cpu_datap(master_cpu)->rtclock_timer.queue; - return (&timer->queue); + return queue; } void timer_queue_cancel( - queue_t queue, + mpqueue_head_t *queue, uint64_t deadline, uint64_t new_deadline) { @@ -233,3 +253,53 @@ timer_queue_cancel( etimer_set_deadline(new_deadline); } } + +/* + * etimer_queue_migrate() is called from the Power-Management kext + * when a logical processor goes idle (in a deep C-state) with a distant + * deadline so that its timer queue can be moved to another processor. + * This target processor should be the least idle (most busy) -- + * currently this is the primary processor for the calling thread's package. 
+ * Locking restrictions demand that the target cpu must be the boot cpu. + */ +uint32_t +etimer_queue_migrate(int target_cpu) +{ + cpu_data_t *target_cdp = cpu_datap(target_cpu); + cpu_data_t *cdp = current_cpu_datap(); + int ntimers_moved; + + assert(!ml_get_interrupts_enabled()); + assert(target_cpu != cdp->cpu_number); + assert(target_cpu == master_cpu); + + KERNEL_DEBUG_CONSTANT( + DECR_TIMER_MIGRATE | DBG_FUNC_START, + target_cpu, + cdp->rtclock_timer.deadline, (cdp->rtclock_timer.deadline >>32), + 0, 0); + + /* + * Move timer requests from the local queue to the target processor's. + * The return value is the number of requests moved. If this is 0, + * it indicates that the first (i.e. earliest) timer is earlier than + * the earliest for the target processor. Since this would force a + * resync, the move of this and all later requests is aborted. + */ + ntimers_moved = timer_queue_migrate(&cdp->rtclock_timer.queue, + &target_cdp->rtclock_timer.queue); + + /* + * Assuming we moved stuff, clear local deadline. + */ + if (ntimers_moved > 0) { + cdp->rtclock_timer.deadline = EndOfAllTime; + setPop(EndOfAllTime); + } + + KERNEL_DEBUG_CONSTANT( + DECR_TIMER_MIGRATE | DBG_FUNC_END, + target_cpu, ntimers_moved, 0, 0, 0); + + return ntimers_moved; +} diff --git a/osfmk/i386/fpu.c b/osfmk/i386/fpu.c index 7b4be4ebe..7227b93a2 100644 --- a/osfmk/i386/fpu.c +++ b/osfmk/i386/fpu.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -61,6 +61,7 @@ #include #include #include +#include #include #include @@ -434,7 +435,7 @@ fpu_save_context(thread_t thread) struct x86_fx_thread_state *ifps; assert(ml_get_interrupts_enabled() == FALSE); - ifps = (thread)->machine.pcb->ifps; + ifps = (thread)->machine.ifps; #if DEBUG if (ifps && ((ifps->fp_valid != FALSE) && (ifps->fp_valid != TRUE))) { panic("ifps->fp_valid: %u\n", ifps->fp_valid); @@ -448,7 +449,7 @@ fpu_save_context(thread_t thread) */ clear_ts(); /* registers are in FPU - save to memory */ - fpu_store_registers(ifps, (thread_is_64bit(thread) && is_saved_state64(thread->machine.pcb->iss))); + fpu_store_registers(ifps, (thread_is_64bit(thread) && is_saved_state64(thread->machine.iss))); ifps->fp_valid = TRUE; } set_ts(); @@ -492,7 +493,7 @@ fpu_set_fxstate( state = (x86_float_state64_t *)tstate; assert(thr_act != THREAD_NULL); - pcb = thr_act->machine.pcb; + pcb = THREAD_TO_PCB(thr_act); if (state == NULL) { /* @@ -598,7 +599,7 @@ fpu_get_fxstate( state = (x86_float_state64_t *)tstate; assert(thr_act != THREAD_NULL); - pcb = thr_act->machine.pcb; + pcb = THREAD_TO_PCB(thr_act); simple_lock(&pcb->lock); @@ -657,12 +658,12 @@ fpu_dup_fxstate( boolean_t intr; pcb_t ppcb; - ppcb = parent->machine.pcb; + ppcb = THREAD_TO_PCB(parent); if (ppcb->ifps == NULL) return; - if (child->machine.pcb->ifps) + if (child->machine.ifps) panic("fpu_dup_fxstate: child's ifps non-null"); new_ifps = fp_state_alloc(); @@ -683,11 +684,11 @@ fpu_dup_fxstate( (void)ml_set_interrupts_enabled(intr); if (ifps->fp_valid) { - child->machine.pcb->ifps = new_ifps; + child->machine.ifps = new_ifps; assert((fp_register_state_size == sizeof(struct x86_fx_thread_state)) || (fp_register_state_size == sizeof(struct x86_avx_thread_state))); bcopy((char *)(ppcb->ifps), - (char *)(child->machine.pcb->ifps), fp_register_state_size); + (char *)(child->machine.ifps), fp_register_state_size); /* Mark the new fp saved state as non-live. 
*/ /* Temporarily disabled: radar 4647827 @@ -750,7 +751,7 @@ fpnoextflt(void) struct x86_fx_thread_state *ifps = 0; thr_act = current_thread(); - pcb = thr_act->machine.pcb; + pcb = THREAD_TO_PCB(thr_act); assert(fp_register_state_size != 0); @@ -769,7 +770,7 @@ fpnoextflt(void) clear_ts(); /* Enable FPU use */ - if (get_interrupt_level()) { + if (__improbable(get_interrupt_level())) { /* * Save current coprocessor context if valid * Initialize coprocessor live context @@ -816,7 +817,7 @@ fpextovrflt(void) * This is a non-recoverable error. * Invalidate the thread`s FPU state. */ - pcb = thr_act->machine.pcb; + pcb = THREAD_TO_PCB(thr_act); simple_lock(&pcb->lock); ifps = pcb->ifps; pcb->ifps = 0; @@ -853,7 +854,7 @@ void fpexterrflt(void) { thread_t thr_act = current_thread(); - struct x86_fx_thread_state *ifps = thr_act->machine.pcb->ifps; + struct x86_fx_thread_state *ifps = thr_act->machine.ifps; boolean_t intr; intr = ml_set_interrupts_enabled(FALSE); @@ -896,7 +897,7 @@ void fp_save( thread_t thr_act) { - pcb_t pcb = thr_act->machine.pcb; + pcb_t pcb = THREAD_TO_PCB(thr_act); struct x86_fx_thread_state *ifps = pcb->ifps; assert(ifps != 0); @@ -918,7 +919,7 @@ void fp_load( thread_t thr_act) { - pcb_t pcb = thr_act->machine.pcb; + pcb_t pcb = THREAD_TO_PCB(thr_act); struct x86_fx_thread_state *ifps = pcb->ifps; assert(ifps); @@ -941,7 +942,7 @@ void fpSSEexterrflt(void) { thread_t thr_act = current_thread(); - struct x86_fx_thread_state *ifps = thr_act->machine.pcb->ifps; + struct x86_fx_thread_state *ifps = thr_act->machine.ifps; boolean_t intr; intr = ml_set_interrupts_enabled(FALSE); @@ -972,7 +973,7 @@ fpSSEexterrflt(void) void fp_setvalid(boolean_t value) { thread_t thr_act = current_thread(); - struct x86_fx_thread_state *ifps = thr_act->machine.pcb->ifps; + struct x86_fx_thread_state *ifps = thr_act->machine.ifps; if (ifps) { ifps->fp_valid = value; @@ -985,7 +986,7 @@ fp_setvalid(boolean_t value) { } } -boolean_t +__private_extern__ boolean_t 
ml_fpu_avx_enabled(void) { return (fpu_YMM_present == TRUE); } diff --git a/osfmk/i386/fpu.h b/osfmk/i386/fpu.h index a606aab41..5b0658f60 100644 --- a/osfmk/i386/fpu.h +++ b/osfmk/i386/fpu.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -68,6 +68,14 @@ #include #include +typedef enum { + FXSAVE32 = 1, + FXSAVE64 = 2, + XSAVE32 = 3, + XSAVE64 = 4, + FP_UNUSED = 5 + } fp_save_layout_t; + extern int fp_kind; extern void init_fpu(void); diff --git a/osfmk/i386/gdt.c b/osfmk/i386/gdt.c index c3502e06b..7677f2488 100644 --- a/osfmk/i386/gdt.c +++ b/osfmk/i386/gdt.c @@ -63,44 +63,44 @@ #include struct real_descriptor master_gdt[GDTSZ] __attribute__ ((section("__INITGDT,__data")))= { - [SEL_TO_INDEX(KERNEL32_CS)] MAKE_REAL_DESCRIPTOR( /* kernel 32-bit code */ + [SEL_TO_INDEX(KERNEL32_CS)] = MAKE_REAL_DESCRIPTOR( /* kernel 32-bit code */ 0, 0xfffff, SZ_32|SZ_G, ACC_P|ACC_PL_K|ACC_CODE_R ), - [SEL_TO_INDEX(KERNEL_DS)] MAKE_REAL_DESCRIPTOR( /* kernel data */ + [SEL_TO_INDEX(KERNEL_DS)] = MAKE_REAL_DESCRIPTOR( /* kernel data */ 0, 0xfffff, SZ_32|SZ_G, ACC_P|ACC_PL_K|ACC_DATA_W ), - [SEL_TO_INDEX(KERNEL64_CS)] MAKE_REAL_DESCRIPTOR( /* kernel 64-bit code */ + [SEL_TO_INDEX(KERNEL64_CS)] = MAKE_REAL_DESCRIPTOR( /* kernel 64-bit code */ 0, 0xfffff, SZ_64|SZ_G, ACC_P|ACC_PL_K|ACC_CODE_R ), - [SEL_TO_INDEX(KERNEL64_SS)] MAKE_REAL_DESCRIPTOR( /* kernel 64-bit syscall stack */ + [SEL_TO_INDEX(KERNEL64_SS)] = MAKE_REAL_DESCRIPTOR( /* kernel 64-bit syscall stack */ 0, 0xfffff, SZ_32|SZ_G, ACC_P|ACC_PL_K|ACC_DATA_W ), #ifdef __x86_64__ - [SEL_TO_INDEX(USER_CS)] MAKE_REAL_DESCRIPTOR( /* 32-bit user code segment */ + [SEL_TO_INDEX(USER_CS)] = MAKE_REAL_DESCRIPTOR( /* 32-bit user code segment */ 0, 0xfffff, SZ_32|SZ_G, ACC_P|ACC_PL_U|ACC_CODE_R ), - [SEL_TO_INDEX(USER_DS)] MAKE_REAL_DESCRIPTOR( /* 32-bit user data segment */ + 
[SEL_TO_INDEX(USER_DS)] = MAKE_REAL_DESCRIPTOR( /* 32-bit user data segment */ 0, 0xfffff, SZ_32|SZ_G, ACC_P|ACC_PL_U|ACC_DATA_W ), - [SEL_TO_INDEX(USER64_CS)] MAKE_REAL_DESCRIPTOR( /* user 64-bit code segment */ + [SEL_TO_INDEX(USER64_CS)] = MAKE_REAL_DESCRIPTOR( /* user 64-bit code segment */ 0, 0xfffff, SZ_64|SZ_G, diff --git a/osfmk/i386/genassym.c b/osfmk/i386/genassym.c index 0f5edf0e5..bb77d38a2 100644 --- a/osfmk/i386/genassym.c +++ b/osfmk/i386/genassym.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -93,7 +93,6 @@ #include <../bsd/sys/lockstat.h> #endif - /* * genassym.c is used to produce an * assembly file which, intermingled with unuseful assembly code, @@ -131,6 +130,8 @@ main( DECLARE("AST_URGENT", AST_URGENT); DECLARE("AST_BSD", AST_BSD); + DECLARE("MAX_CPUS", MAX_CPUS); + /* Simple Lock structure */ DECLARE("SLOCK_ILK", offsetof(usimple_lock_t, interlock)); #if MACH_LDEBUG @@ -149,7 +150,6 @@ main( #ifdef __i386__ DECLARE("MUTEX_TYPE", offsetof(lck_mtx_ext_t *, lck_mtx_deb.type)); DECLARE("MUTEX_PC", offsetof(lck_mtx_ext_t *, lck_mtx_deb.pc)); - DECLARE("MUTEX_THREAD", offsetof(lck_mtx_ext_t *, lck_mtx_deb.thread)); DECLARE("MUTEX_ATTR", offsetof(lck_mtx_ext_t *, lck_mtx_attr)); DECLARE("MUTEX_ATTR_DEBUG", LCK_MTX_ATTR_DEBUG); DECLARE("MUTEX_ATTR_DEBUGb", LCK_MTX_ATTR_DEBUGb); @@ -158,8 +158,6 @@ main( DECLARE("MUTEX_TAG", MUTEX_TAG); #endif DECLARE("MUTEX_IND", LCK_MTX_TAG_INDIRECT); - DECLARE("MUTEX_EXT", LCK_MTX_PTR_EXTENDED); - DECLARE("MUTEX_ITAG", offsetof(lck_mtx_t *, lck_mtx_tag)); DECLARE("MUTEX_PTR", offsetof(lck_mtx_t *, lck_mtx_ptr)); DECLARE("MUTEX_ASSERT_OWNED", LCK_MTX_ASSERT_OWNED); DECLARE("MUTEX_ASSERT_NOTOWNED",LCK_MTX_ASSERT_NOTOWNED); @@ -189,62 +187,55 @@ main( DECLARE("TH_RECOVER", offsetof(thread_t, recover)); DECLARE("TH_CONTINUATION", offsetof(thread_t, continuation)); 
DECLARE("TH_KERNEL_STACK", offsetof(thread_t, kernel_stack)); + DECLARE("TH_MUTEX_COUNT", offsetof(thread_t, mutex_count)); + DECLARE("TH_WAS_PROMOTED_ON_WAKEUP", offsetof(thread_t, was_promoted_on_wakeup)); - DECLARE("TASK_MACH_EXC_PORT", - offsetof(task_t, exc_actions[EXC_MACH_SYSCALL].port)); - DECLARE("TASK_SYSCALLS_MACH", offsetof(struct task *, syscalls_mach)); - DECLARE("TASK_SYSCALLS_UNIX", offsetof(struct task *, syscalls_unix)); + DECLARE("TH_SYSCALLS_MACH", offsetof(thread_t, syscalls_mach)); + DECLARE("TH_SYSCALLS_UNIX", offsetof(thread_t, syscalls_unix)); DECLARE("TASK_VTIMERS", offsetof(struct task *, vtimers)); /* These fields are being added on demand */ - DECLARE("ACT_MACH_EXC_PORT", - offsetof(thread_t, exc_actions[EXC_MACH_SYSCALL].port)); - - DECLARE("ACT_TASK", offsetof(thread_t, task)); - DECLARE("ACT_AST", offsetof(thread_t, ast)); - DECLARE("ACT_PCB", offsetof(thread_t, machine.pcb)); - DECLARE("ACT_SPF", offsetof(thread_t, machine.specFlags)); - DECLARE("ACT_MAP", offsetof(thread_t, map)); - DECLARE("ACT_PCB_ISS", offsetof(thread_t, machine.xxx_pcb.iss)); - DECLARE("ACT_PCB_IDS", offsetof(thread_t, machine.xxx_pcb.ids)); + DECLARE("TH_TASK", offsetof(thread_t, task)); + DECLARE("TH_AST", offsetof(thread_t, ast)); + DECLARE("TH_MAP", offsetof(thread_t, map)); + DECLARE("TH_SPF", offsetof(thread_t, machine.specFlags)); + DECLARE("TH_PCB_ISS", offsetof(thread_t, machine.iss)); + DECLARE("TH_PCB_IDS", offsetof(thread_t, machine.ids)); + DECLARE("TH_PCB_FPS", offsetof(thread_t, machine.ifps)); #if NCOPY_WINDOWS > 0 - DECLARE("ACT_COPYIO_STATE", offsetof(thread_t, machine.copyio_state)); + DECLARE("TH_COPYIO_STATE", offsetof(thread_t, machine.copyio_state)); DECLARE("WINDOWS_CLEAN", WINDOWS_CLEAN); #endif DECLARE("MAP_PMAP", offsetof(vm_map_t, pmap)); #define IEL_SIZE (sizeof(struct i386_exception_link *)) - DECLARE("IEL_SIZE", IEL_SIZE); DECLARE("IKS_SIZE", sizeof(struct x86_kernel_state)); /* * KSS_* are offsets from the top of the kernel stack 
(cpu_kernel_stack) */ #if defined(__i386__) - DECLARE("KSS_EBX", IEL_SIZE + offsetof(struct x86_kernel_state *, k_ebx)); - DECLARE("KSS_ESP", IEL_SIZE + offsetof(struct x86_kernel_state *, k_esp)); - DECLARE("KSS_EBP", IEL_SIZE + offsetof(struct x86_kernel_state *, k_ebp)); - DECLARE("KSS_EDI", IEL_SIZE + offsetof(struct x86_kernel_state *, k_edi)); - DECLARE("KSS_ESI", IEL_SIZE + offsetof(struct x86_kernel_state *, k_esi)); - DECLARE("KSS_EIP", IEL_SIZE + offsetof(struct x86_kernel_state *, k_eip)); + DECLARE("KSS_EBX", offsetof(struct x86_kernel_state *, k_ebx)); + DECLARE("KSS_ESP", offsetof(struct x86_kernel_state *, k_esp)); + DECLARE("KSS_EBP", offsetof(struct x86_kernel_state *, k_ebp)); + DECLARE("KSS_EDI", offsetof(struct x86_kernel_state *, k_edi)); + DECLARE("KSS_ESI", offsetof(struct x86_kernel_state *, k_esi)); + DECLARE("KSS_EIP", offsetof(struct x86_kernel_state *, k_eip)); #elif defined(__x86_64__) - DECLARE("KSS_RBX", IEL_SIZE + offsetof(struct x86_kernel_state *, k_rbx)); - DECLARE("KSS_RSP", IEL_SIZE + offsetof(struct x86_kernel_state *, k_rsp)); - DECLARE("KSS_RBP", IEL_SIZE + offsetof(struct x86_kernel_state *, k_rbp)); - DECLARE("KSS_R12", IEL_SIZE + offsetof(struct x86_kernel_state *, k_r12)); - DECLARE("KSS_R13", IEL_SIZE + offsetof(struct x86_kernel_state *, k_r13)); - DECLARE("KSS_R14", IEL_SIZE + offsetof(struct x86_kernel_state *, k_r14)); - DECLARE("KSS_R15", IEL_SIZE + offsetof(struct x86_kernel_state *, k_r15)); - DECLARE("KSS_RIP", IEL_SIZE + offsetof(struct x86_kernel_state *, k_rip)); + DECLARE("KSS_RBX", offsetof(struct x86_kernel_state *, k_rbx)); + DECLARE("KSS_RSP", offsetof(struct x86_kernel_state *, k_rsp)); + DECLARE("KSS_RBP", offsetof(struct x86_kernel_state *, k_rbp)); + DECLARE("KSS_R12", offsetof(struct x86_kernel_state *, k_r12)); + DECLARE("KSS_R13", offsetof(struct x86_kernel_state *, k_r13)); + DECLARE("KSS_R14", offsetof(struct x86_kernel_state *, k_r14)); + DECLARE("KSS_R15", offsetof(struct x86_kernel_state *, 
k_r15)); + DECLARE("KSS_RIP", offsetof(struct x86_kernel_state *, k_rip)); #else #error Unsupported architecture #endif - DECLARE("PCB_FPS", offsetof(pcb_t, ifps)); - DECLARE("PCB_ISS", offsetof(pcb_t, iss)); - DECLARE("DS_DR0", offsetof(struct x86_debug_state32 *, dr0)); DECLARE("DS_DR1", offsetof(struct x86_debug_state32 *, dr1)); DECLARE("DS_DR2", offsetof(struct x86_debug_state32 *, dr2)); @@ -432,9 +423,7 @@ main( DECLARE("CPU_INTERRUPT_LEVEL", offsetof(cpu_data_t *, cpu_interrupt_level)); DECLARE("CPU_NESTED_ISTACK", - offsetof(cpu_data_t *, cpu_nested_istack)); - DECLARE("CPU_SIMPLE_LOCK_COUNT", - offsetof(cpu_data_t *,cpu_simple_lock_count)); + offsetof(cpu_data_t *, cpu_nested_istack)); DECLARE("CPU_NUMBER_GS", offsetof(cpu_data_t *,cpu_number)); DECLARE("CPU_RUNNING", @@ -500,7 +489,31 @@ main( offsetof(cpu_data_t *, cpu_dr7)); DECLARE("hwIntCnt", offsetof(cpu_data_t *,cpu_hwIntCnt)); - +#if defined(__x86_64__) + DECLARE("CPU_ACTIVE_PCID", + offsetof(cpu_data_t *, cpu_active_pcid)); + DECLARE("CPU_PCID_COHERENTP", + offsetof(cpu_data_t *, cpu_pmap_pcid_coherentp)); + DECLARE("CPU_PCID_COHERENTP_KERNEL", + offsetof(cpu_data_t *, cpu_pmap_pcid_coherentp_kernel)); + DECLARE("CPU_PMAP_PCID_ENABLED", + offsetof(cpu_data_t *, cpu_pmap_pcid_enabled)); + +#ifdef PCID_STATS + DECLARE("CPU_PMAP_USER_RETS", + offsetof(cpu_data_t *, cpu_pmap_user_rets)); + DECLARE("CPU_PMAP_PCID_PRESERVES", + offsetof(cpu_data_t *, cpu_pmap_pcid_preserves)); + DECLARE("CPU_PMAP_PCID_FLUSHES", + offsetof(cpu_data_t *, cpu_pmap_pcid_flushes)); +#endif + DECLARE("CPU_TLB_INVALID", + offsetof(cpu_data_t *, cpu_tlb_invalid)); + DECLARE("CPU_TLB_INVALID_LOCAL", + offsetof(cpu_data_t *, cpu_tlb_invalid_local)); + DECLARE("CPU_TLB_INVALID_GLOBAL", + offsetof(cpu_data_t *, cpu_tlb_invalid_global)); +#endif /* x86_64 */ DECLARE("enaExpTrace", enaExpTrace); DECLARE("enaExpTraceb", enaExpTraceb); DECLARE("enaUsrFCall", enaUsrFCall); @@ -561,15 +574,15 @@ main( DECLARE("DEVICETREEP", 
offsetof(struct boot_args *, deviceTreeP)); DECLARE("RNT_TSC_BASE", - offsetof(rtc_nanotime_t *, tsc_base)); + offsetof(pal_rtc_nanotime_t *, tsc_base)); DECLARE("RNT_NS_BASE", - offsetof(rtc_nanotime_t *, ns_base)); + offsetof(pal_rtc_nanotime_t *, ns_base)); DECLARE("RNT_SCALE", - offsetof(rtc_nanotime_t *, scale)); + offsetof(pal_rtc_nanotime_t *, scale)); DECLARE("RNT_SHIFT", - offsetof(rtc_nanotime_t *, shift)); + offsetof(pal_rtc_nanotime_t *, shift)); DECLARE("RNT_GENERATION", - offsetof(rtc_nanotime_t *, generation)); + offsetof(pal_rtc_nanotime_t *, generation)); /* values from kern/timer.h */ #ifdef __LP64__ diff --git a/osfmk/i386/hibernate_i386.c b/osfmk/i386/hibernate_i386.c index b333db549..6cdc1cec9 100644 --- a/osfmk/i386/hibernate_i386.c +++ b/osfmk/i386/hibernate_i386.c @@ -48,8 +48,6 @@ extern ppnum_t max_ppnum; #define MAX_BANKS 32 -int hibernate_page_list_allocate_avoided; - /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ hibernate_page_list_t * @@ -73,8 +71,6 @@ hibernate_page_list_allocate(void) msize = args->MemoryMapDescriptorSize; mcount = args->MemoryMapSize / msize; - hibernate_page_list_allocate_avoided = 0; - num_banks = 0; for (i = 0; i < mcount; i++, mptr = (EfiMemoryRange *)(((vm_offset_t)mptr) + msize)) { @@ -86,7 +82,7 @@ hibernate_page_list_allocate(void) if ((base + num - 1) > max_ppnum) num = max_ppnum - base + 1; if (!num) - continue; + continue; switch (mptr->Type) { @@ -131,9 +127,6 @@ hibernate_page_list_allocate(void) case kEfiRuntimeServicesData: // contents are volatile once the platform expert starts case kEfiACPIReclaimMemory: - hibernate_page_list_allocate_avoided += num; - break; - // non dram case kEfiReservedMemoryType: case kEfiUnusableMemory: @@ -227,13 +220,8 @@ hibernate_processor_setup(IOHibernateImageHeader * header) header->runtimePages = args->efiRuntimeServicesPageStart; header->runtimePageCount = args->efiRuntimeServicesPageCount; header->runtimeVirtualPages = 
args->efiRuntimeServicesVirtualPageStart; - if (args->Version == kBootArgsVersion1 && args->Revision >= kBootArgsRevision1_6) { - header->performanceDataStart = args->performanceDataStart; - header->performanceDataSize = args->performanceDataSize; - } else { - header->performanceDataStart = 0; - header->performanceDataSize = 0; - } + header->performanceDataStart = args->performanceDataStart; + header->performanceDataSize = args->performanceDataSize; return (KERN_SUCCESS); } diff --git a/osfmk/i386/hibernate_restore.c b/osfmk/i386/hibernate_restore.c index c1dfd4e16..bf0508a69 100644 --- a/osfmk/i386/hibernate_restore.c +++ b/osfmk/i386/hibernate_restore.c @@ -29,19 +29,18 @@ #include #include +#include + extern pd_entry_t BootstrapPTD[2048]; #define TWO_MEG_MASK 0xFFFFFFFFFFE00000ULL - -#define DST_INDEX 2047UL - -static char *dstPtr = (char *)(DST_INDEX << PDSHIFT); +#define FOUR_K_MASK 0xFFFFFFFFFFFFF000ULL // src is virtually mapped, not page aligned, // dst is a physical 4k page aligned ptr, len is one 4K page // src & dst will not overlap -void +uintptr_t hibernate_restore_phys_page(uint64_t src, uint64_t dst, uint32_t len, uint32_t procFlags) { (void)procFlags; @@ -50,25 +49,78 @@ hibernate_restore_phys_page(uint64_t src, uint64_t dst, uint32_t len, uint32_t p uint32_t idx; if (src == 0) - return; + return (uintptr_t)dst; - if (dst < (uint64_t) (uintptr_t)dstPtr) - { - d = (uint64_t *) (uintptr_t)dst; - } - else - { - /* Outside 1-1 4G map so set up the mappings for the dest page using 2MB pages */ - BootstrapPTD[DST_INDEX] = (dst & TWO_MEG_MASK) | INTEL_PTE_PS | INTEL_PTE_VALID | INTEL_PTE_WRITE | INTEL_PTE_WRITE; - - /* Invalidate the page tables for this */ - invlpg((uintptr_t) dstPtr); - - /* Mask off the offset from the 2MB window */ - dst &= ~TWO_MEG_MASK; - d = (uint64_t *) (dstPtr + dst); - } + d = (uint64_t *)pal_hib_map(DEST_COPY_AREA, dst); s = (uint64_t *) (uintptr_t)src; + for (idx = 0; idx < (len / (uint32_t)sizeof(uint64_t)); idx++) d[idx] = 
s[idx]; + + return (uintptr_t)d; +} +#undef hibprintf + +void hibprintf(const char *fmt, ...); +/* Turn physical page `page` into the page table referenced by BootstrapPTD[2047]: reach it through a temporary large-page mapping, clear it, and make slot 0 map the page itself. NOTE(review): assumes HIB_PTES is the virtual address covered by PDE 2047 - confirm. */ +void +pal_hib_window_setup(ppnum_t page) +{ + uint64_t *pp; + uint64_t phys = ptoa_64(page); + int i; + + BootstrapPTD[2047] = (phys & ~((uint64_t)I386_LPGMASK)) | INTEL_PTE_PS | INTEL_PTE_VALID | INTEL_PTE_WRITE; + + invlpg(HIB_PTES); + + pp = (uint64_t *)(uintptr_t)(HIB_PTES + (phys & I386_LPGMASK)); + + for(i=0;i<512;i++) + pp[i] = 0; /* clear every one of the 512 entries, not just the first */ + + pp[0] = phys | INTEL_PTE_VALID | INTEL_PTE_WRITE; + BootstrapPTD[2047] = phys | INTEL_PTE_VALID | INTEL_PTE_WRITE; + + invlpg(HIB_PTES); +} +/* Enter the 4K physical page containing p into the HIB_PTES slot selected by v (DEST_COPY_AREA, SRC_COPY_AREA or COPY_PAGE_AREA), invalidate the TLB entry for v and return v; any other v halts. NOTE(review): assumes slots 1-3 back those three addresses - confirm. */ +uintptr_t +pal_hib_map(uintptr_t v, uint64_t p) +{ + int index; + + switch(v) { + case DEST_COPY_AREA: + index = 1; + break; + case SRC_COPY_AREA: + index = 2; + break; + case COPY_PAGE_AREA: + index = 3; + break; + default: + index = -1; + for (;;) asm("cli;hlt;"); /* unknown area: halt for good; an NMI resumes after hlt and must not reach ptes[-1] */ + } + + uint64_t *ptes = (uint64_t *)HIB_PTES; + + /* Install a 4K mapping for the page containing p in the slot chosen above */ + ptes[index] = (p & FOUR_K_MASK) | INTEL_PTE_VALID | INTEL_PTE_WRITE; + + /* Invalidate the page tables for this */ + invlpg((uintptr_t)v); + + return v; +} + +void hibernateRestorePALState(uint32_t *arg) +{ + (void)arg; +} +void +pal_hib_patchup(void) +{ } diff --git a/osfmk/i386/hw_lock_types.h b/osfmk/i386/hw_lock_types.h index bfeee9407..52f4355f6 100644 --- a/osfmk/i386/hw_lock_types.h +++ b/osfmk/i386/hw_lock_types.h @@ -90,7 +90,7 @@ * later in kern/lock.h.. 
*/ struct hslock { - long lock_data; + uintptr_t lock_data; }; typedef struct hslock hw_lock_data_t, *hw_lock_t; #define hw_lock_addr(hwl) (&((hwl).lock_data)) diff --git a/osfmk/i386/i386_init.c b/osfmk/i386/i386_init.c index 445c6afed..596888b5f 100644 --- a/osfmk/i386/i386_init.c +++ b/osfmk/i386/i386_init.c @@ -78,20 +78,23 @@ #include #include #include +#include #include #include -#include #include #include #include #include #include #include +#if CONFIG_MTRR #include +#endif #include #if CONFIG_MCA #include #endif +#include #include #include #include @@ -103,6 +106,9 @@ #include #endif #endif +#if DEBUG +#include +#endif #if DEBUG #define DBG(x...) kprintf(x) @@ -122,21 +128,16 @@ extern const char version[]; extern const char version_variant[]; extern int nx_enabled; -extern int noVMX; /* if set, rosetta should not emulate altivec */ - #ifdef __x86_64__ extern void *low_eintstack; #endif -extern void serial_init(void); - void *KPTphys; pd_entry_t *IdlePTD; #ifdef __i386__ pd_entry_t *IdlePDPT64; #endif - char *physfree; /* @@ -166,7 +167,7 @@ fillkpt(pt_entry_t *base, int prot, uintptr_t src, int index, int count) } } -extern vm_offset_t first_avail; +extern pmap_paddr_t first_avail; #ifdef __x86_64__ int break_kprintf = 0; @@ -175,8 +176,8 @@ uint64_t x86_64_pre_sleep(void) { IdlePML4[0] = IdlePML4[KERNEL_PML4_INDEX]; - uint64_t oldcr3 = get_cr3(); - set_cr3((uint32_t) (uintptr_t)ID_MAP_VTOP(IdlePML4)); + uint64_t oldcr3 = get_cr3_raw(); + set_cr3_raw((uint32_t) (uintptr_t)ID_MAP_VTOP(IdlePML4)); return oldcr3; } @@ -184,7 +185,7 @@ void x86_64_post_sleep(uint64_t new_cr3) { IdlePML4[0] = 0; - set_cr3((uint32_t) new_cr3); + set_cr3_raw((uint32_t) new_cr3); } #endif @@ -194,7 +195,6 @@ x86_64_post_sleep(uint64_t new_cr3) #endif - #ifdef __x86_64__ // Set up the physical mapping - NPHYSMAP GB of memory mapped at a high address // NPHYSMAP is determined by the maximum supported RAM size plus 4GB to account @@ -227,6 +227,10 @@ physmap_init(void) 
IdlePML4[KERNEL_PHYSMAP_INDEX] = ((uintptr_t)ID_MAP_VTOP(physmapL3)) | INTEL_PTE_VALID | INTEL_PTE_WRITE; + if (cpuid_extfeatures() & CPUID_EXTFEATURE_XD) { + IdlePML4[KERNEL_PHYSMAP_INDEX] |= INTEL_PTE_NX; + } + DBG("physical map idlepml4[%d]: 0x%llx\n", KERNEL_PHYSMAP_INDEX, IdlePML4[KERNEL_PHYSMAP_INDEX]); } @@ -267,7 +271,7 @@ Idle_PTs_init(void) #endif // Flush the TLB now we're done rewriting the page tables.. - set_cr3(get_cr3()); + set_cr3_raw(get_cr3_raw()); } /* @@ -302,7 +306,7 @@ vstart(vm_offset_t boot_args_start) lphysfree = kernelBootArgs->kaddr + kernelBootArgs->ksize; physfree = (void *)(uintptr_t)((lphysfree + PAGE_SIZE - 1) &~ (PAGE_SIZE - 1)); #if DEBUG - serial_init(); + pal_serial_init(); #endif DBG("revision 0x%x\n", kernelBootArgs->Revision); DBG("version 0x%x\n", kernelBootArgs->Version); @@ -316,7 +320,13 @@ vstart(vm_offset_t boot_args_start) kernelBootArgs, &kernelBootArgs->ksize, &kernelBootArgs->kaddr); - +#ifdef __x86_64__ + /* enable NX/XD, boot processor */ + if (cpuid_extfeatures() & CPUID_EXTFEATURE_XD) { + wrmsr64(MSR_IA32_EFER, rdmsr64(MSR_IA32_EFER) | MSR_IA32_EFER_NXE); + DBG("vstart() NX/XD enabled\n"); + } +#endif postcode(PSTART_PAGE_TABLES); Idle_PTs_init(); @@ -324,12 +334,18 @@ vstart(vm_offset_t boot_args_start) first_avail = (vm_offset_t)ID_MAP_VTOP(physfree); cpu = 0; + cpu_data_alloc(TRUE); } else { /* Find our logical cpu number */ cpu = lapic_to_cpu[(LAPIC_READ(ID)>>LAPIC_ID_SHIFT) & LAPIC_ID_MASK]; +#ifdef __x86_64__ + if (cpuid_extfeatures() & CPUID_EXTFEATURE_XD) { + wrmsr64(MSR_IA32_EFER, rdmsr64(MSR_IA32_EFER) | MSR_IA32_EFER_NXE); + DBG("vstart() NX/XD enabled, non-boot\n"); + } +#endif } - if(is_boot_cpu) cpu_data_alloc(TRUE); #ifdef __x86_64__ if(is_boot_cpu) cpu_desc_init64(cpu_datap(cpu)); @@ -339,14 +355,11 @@ vstart(vm_offset_t boot_args_start) cpu_desc_init(cpu_datap(cpu)); cpu_desc_load(cpu_datap(cpu)); #endif - cpu_mode_init(current_cpu_datap()); - - /* enable NX/XD */ - if (cpuid_extfeatures() & 
CPUID_EXTFEATURE_XD) - wrmsr64(MSR_IA32_EFER, rdmsr64(MSR_IA32_EFER) | MSR_IA32_EFER_NXE); - DBG("vstart() NX/XD enabled\n"); - - + if (is_boot_cpu) + cpu_mode_init(current_cpu_datap()); /* cpu_mode_init() will be + * invoked on the APs + * via i386_init_slave() + */ #ifdef __x86_64__ /* Done with identity mapping */ IdlePML4[0] = 0; @@ -354,6 +367,11 @@ vstart(vm_offset_t boot_args_start) postcode(VSTART_EXIT); #ifdef __i386__ + if (cpuid_extfeatures() & CPUID_EXTFEATURE_XD) { + wrmsr64(MSR_IA32_EFER, rdmsr64(MSR_IA32_EFER) | MSR_IA32_EFER_NXE); + DBG("vstart() NX/XD enabled, i386\n"); + } + if (is_boot_cpu) i386_init(boot_args_start); else @@ -394,13 +412,12 @@ i386_init(vm_offset_t boot_args_start) uint64_t maxmemtouse; unsigned int cpus = 0; boolean_t fidn; -#ifdef __i386__ - boolean_t legacy_mode; -#endif boolean_t IA32e = TRUE; postcode(I386_INIT_ENTRY); + pal_i386_init(); + #if CONFIG_MCA /* Initialize machine-check handling */ mca_cpu_init(); @@ -414,20 +431,19 @@ i386_init(vm_offset_t boot_args_start) DBG("i386_init(0x%lx) kernelBootArgs=%p\n", (unsigned long)boot_args_start, kernelBootArgs); + PE_init_platform(FALSE, kernelBootArgs); + postcode(PE_INIT_PLATFORM_D); + + kernel_early_bootstrap(); + master_cpu = 0; cpu_init(); postcode(CPU_INIT_D); - - PE_init_platform(FALSE, kernelBootArgs); - postcode(PE_INIT_PLATFORM_D); - - printf_init(); /* Init this in case we need debugger */ panic_init(); /* Init this in case we need debugger */ - /* setup debugging output if one has been chosen */ PE_init_kprintf(FALSE); @@ -460,7 +476,6 @@ i386_init(vm_offset_t boot_args_start) max_ncpus = cpus; } - /* * debug support for > 4G systems */ @@ -471,12 +486,21 @@ i386_init(vm_offset_t boot_args_start) force_immediate_debugger_NMI = FALSE; else force_immediate_debugger_NMI = fidn; -#ifdef __i386__ + +#if DEBUG + nanoseconds_to_absolutetime(URGENCY_NOTIFICATION_ASSERT_NS, &urgency_notification_assert_abstime_threshold); +#endif + 
PE_parse_boot_argn("urgency_notification_abstime", + &urgency_notification_assert_abstime_threshold, + sizeof(urgency_notification_assert_abstime_threshold)); + +#if CONFIG_YONAH /* * At this point we check whether we are a 64-bit processor * and that we're not restricted to legacy mode, 32-bit operation. */ if (cpuid_extfeatures() & CPUID_EXTFEATURE_EM64T) { + boolean_t legacy_mode; kprintf("EM64T supported"); if (PE_parse_boot_argn("-legacy", &legacy_mode, sizeof (legacy_mode))) { kprintf(" but legacy mode forced\n"); @@ -491,27 +515,20 @@ i386_init(vm_offset_t boot_args_start) if (!(cpuid_extfeatures() & CPUID_EXTFEATURE_XD)) nx_enabled = 0; - /* Obtain "lcks" options:this currently controls lock statistics */ - if (!PE_parse_boot_argn("lcks", &LcksOpts, sizeof (LcksOpts))) - LcksOpts = 0; - /* * VM initialization, after this we're using page tables... * The maximum number of cpus must be set beforehand. */ i386_vm_init(maxmemtouse, IA32e, kernelBootArgs); - if ( ! PE_parse_boot_argn("novmx", &noVMX, sizeof (noVMX))) - noVMX = 0; /* OK to support Altivec in rosetta? */ + /* create the console for verbose or pretty mode */ + /* Note: doing this prior to tsc_init() allows for graceful panic! 
*/ + PE_init_platform(TRUE, kernelBootArgs); + PE_create_console(); tsc_init(); power_management_init(); - PE_init_platform(TRUE, kernelBootArgs); - - /* create the console for verbose or pretty mode */ - PE_create_console(); - processor_bootstrap(); thread_bootstrap(); @@ -546,17 +563,24 @@ do_init_slave(boolean_t fast_restart) init_fpu(); +#if CONFIG_MTRR mtrr_update_cpu(); +#endif } else init_param = FAST_SLAVE_INIT; + /* update CPU microcode */ + ucode_update_wake(); + #if CONFIG_VMX /* resume VT operation */ vmx_resume(); #endif +#if CONFIG_MTRR if (!fast_restart) pat_init(); +#endif cpu_thread_init(); /* not strictly necessary */ diff --git a/osfmk/i386/i386_lock.s b/osfmk/i386/i386_lock.s index 267b4b0db..9ea9f982b 100644 --- a/osfmk/i386/i386_lock.s +++ b/osfmk/i386/i386_lock.s @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -48,11 +48,7 @@ #define PAUSE rep; nop - -#define PUSHF pushf -#define POPF popf -#define CLI cli - +#include /* * When performance isn't the only concern, it's @@ -124,18 +120,23 @@ /* For x86_64, the varargs ABI requires that %al indicate * how many SSE register contain arguments. 
In our case, 0 */ #if __i386__ -#define LOAD_STRING_ARG0(label) pushl $##label ; -#define LOAD_ARG1(x) pushl x ; +#define ALIGN_STACK() subl $8, %esp; andl $0xFFFFFFF0, %esp ; +#define LOAD_STRING_ARG0(label) movl $##label, (%esp) ; +#define LOAD_ARG1(x) mov x, 4(%esp) ; +#define LOAD_PTR_ARG1(x) mov x, 4(%esp) ; #define CALL_PANIC() call EXT(panic) ; #else +#define ALIGN_STACK() and $0xFFFFFFFFFFFFFFF0, %rsp ; #define LOAD_STRING_ARG0(label) leaq label(%rip), %rdi ; -#define LOAD_ARG1(x) movq x, %rsi ; +#define LOAD_ARG1(x) mov x, %esi ; +#define LOAD_PTR_ARG1(x) mov x, %rsi ; #define CALL_PANIC() xorb %al,%al ; call EXT(panic) ; #endif #define CHECK_UNLOCK(current, owner) \ cmp current, owner ; \ je 1f ; \ + ALIGN_STACK() ; \ LOAD_STRING_ARG0(2f) ; \ CALL_PANIC() ; \ hlt ; \ @@ -157,6 +158,7 @@ #define CHECK_MUTEX_TYPE() \ cmpl $ MUTEX_TAG,M_TYPE ; \ je 1f ; \ + ALIGN_STACK() ; \ LOAD_STRING_ARG0(2f) ; \ CALL_PANIC() ; \ hlt ; \ @@ -177,7 +179,9 @@ jne 1f ; \ cmpl $0,%gs:CPU_PREEMPTION_LEVEL ; \ je 1f ; \ - LOAD_ARG1(%gs:CPU_PREEMPTION_LEVEL) ; \ + ALIGN_STACK() ; \ + movl %gs:CPU_PREEMPTION_LEVEL, %eax ; \ + LOAD_ARG1(%eax) ; \ LOAD_STRING_ARG0(2f) ; \ CALL_PANIC() ; \ hlt ; \ @@ -192,6 +196,7 @@ #define CHECK_MYLOCK(current, owner) \ cmp current, owner ; \ jne 1f ; \ + ALIGN_STACK() ; \ LOAD_STRING_ARG0(2f) ; \ CALL_PANIC() ; \ hlt ; \ @@ -206,32 +211,47 @@ #define CHECK_MYLOCK(thd) #endif /* MACH_LDEBUG */ - #define PREEMPTION_DISABLE \ - incl %gs:CPU_PREEMPTION_LEVEL - - + incl %gs:CPU_PREEMPTION_LEVEL + +#if MACH_LDEBUG || 1 +#define PREEMPTION_LEVEL_DEBUG 1 +#endif +#if PREEMPTION_LEVEL_DEBUG #define PREEMPTION_ENABLE \ decl %gs:CPU_PREEMPTION_LEVEL ; \ - jne 9f ; \ + js 17f ; \ + jnz 19f ; \ + testl $AST_URGENT,%gs:CPU_PENDING_AST ; \ + jz 19f ; \ PUSHF ; \ - testl $ EFL_IF,S_PC ; \ - je 8f ; \ - CLI ; \ - movl %gs:CPU_PENDING_AST,%eax ; \ - testl $ AST_URGENT,%eax ; \ - je 8f ; \ - movl %gs:CPU_INTERRUPT_LEVEL,%eax ; \ - testl %eax,%eax ; \ - jne 8f ; \ + 
testl $EFL_IF, S_PC ; \ + jz 18f ; \ POPF ; \ int $(T_PREEMPT) ; \ - jmp 9f ; \ -8: \ + jmp 19f ; \ +17: \ + call _preemption_underflow_panic ; \ +18: \ POPF ; \ -9: +19: +#else +#define PREEMPTION_ENABLE \ + decl %gs:CPU_PREEMPTION_LEVEL ; \ + jnz 19f ; \ + testl $AST_URGENT,%gs:CPU_PENDING_AST ; \ + jz 19f ; \ + PUSHF ; \ + testl $EFL_IF, S_PC ; \ + jz 18f ; \ + POPF ; \ + int $(T_PREEMPT) ; \ + jmp 19f ; \ +18: \ + POPF ; \ +19: +#endif - #if CONFIG_DTRACE @@ -646,8 +666,10 @@ Entry(lck_rw_lock_shared) */ LOCKSTAT_LABEL(_lck_rw_lock_shared_lockstat_patch_point) ret - /* Fall thru when patched, counting on lock pointer in LCK_RW_REGISTER */ - LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, LCK_RW_REGISTER) + /* + Fall thru when patched, counting on lock pointer in LCK_RW_REGISTER + */ + LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, LCK_RW_REGISTER) #endif ret 2: @@ -972,6 +994,7 @@ Entry(lck_rw_done) PAUSE jmp 1b 8: + ALIGN_STACK() LOAD_STRING_ARG0(rwl_release_error_str) CALL_PANIC() @@ -1121,13 +1144,11 @@ Entry(lck_rw_held_read_or_upgrade) #define LMTX_A_REG32 %eax #define LMTX_C_REG %ecx #define LMTX_C_REG32 %ecx -#define LMTX_D_REG %edx #define LMTX_RET_REG %eax +#define LMTX_RET_REG32 %eax #define LMTX_LGROUP_REG %esi #define LMTX_SSTATE_REG %edi #define LOAD_LMTX_REG(arg) mov arg, LMTX_REG -#define LOAD_REG_ARG0(reg) push reg -#define LOAD_REG_ARG1(reg) push reg #define LMTX_CHK_EXTENDED cmp LMTX_REG, LMTX_ARG0 #define LMTX_ASSERT_OWNED cmpl $(MUTEX_ASSERT_OWNED), LMTX_ARG1 @@ -1222,13 +1243,11 @@ Entry(lck_rw_held_read_or_upgrade) #define LMTX_A_REG32 %eax #define LMTX_C_REG %rcx #define LMTX_C_REG32 %ecx -#define LMTX_D_REG %rdx #define LMTX_RET_REG %rax +#define LMTX_RET_REG32 %eax #define LMTX_LGROUP_REG %r10 #define LMTX_SSTATE_REG %r11 #define LOAD_LMTX_REG(arg) mov %rdi, %rdx -#define LOAD_REG_ARG0(reg) mov reg, %rdi -#define LOAD_REG_ARG1(reg) mov reg, %rsi #define LMTX_CHK_EXTENDED cmp LMTX_REG, LMTX_REG_ORIG #define LMTX_ASSERT_OWNED cmp 
$(MUTEX_ASSERT_OWNED), LMTX_ARG1 @@ -1319,7 +1338,7 @@ Entry(lck_rw_held_read_or_upgrade) pop LMTX_SSTATE_REG ; \ pop LMTX_LGROUP_REG ; \ 12: - + #else #error Unsupported architecture #endif @@ -1332,8 +1351,6 @@ Entry(lck_rw_held_read_or_upgrade) #define M_PROMOTED_MSK 0x04000000 #define M_SPIN_MSK 0x08000000 - - /* * void lck_mtx_assert(lck_mtx_t* l, unsigned int) * Takes the address of a lock, and an assertion type as parameters. @@ -1348,10 +1365,11 @@ NONLEAF_ENTRY(lck_mtx_assert) LOAD_LMTX_REG(B_ARG0) /* Load lock address */ mov %gs:CPU_ACTIVE_THREAD, LMTX_A_REG /* Load current thread */ - mov M_OWNER(LMTX_REG), LMTX_C_REG - cmp $(MUTEX_IND), LMTX_C_REG /* Is this an indirect mutex? */ - cmove M_PTR(LMTX_REG), LMTX_REG /* If so, take indirection */ - + mov M_STATE(LMTX_REG), LMTX_C_REG32 + cmp $(MUTEX_IND), LMTX_C_REG32 /* Is this an indirect mutex? */ + jne 0f + mov M_PTR(LMTX_REG), LMTX_REG /* If so, take indirection */ +0: mov M_OWNER(LMTX_REG), LMTX_C_REG /* Load owner */ LMTX_ASSERT_OWNED jne 2f /* Assert ownership? */ @@ -1364,18 +1382,21 @@ NONLEAF_ENTRY(lck_mtx_assert) 2: cmp LMTX_A_REG, LMTX_C_REG /* Current thread match? 
*/ jne 1b /* No, return */ - LOAD_REG_ARG1(LMTX_REG) + ALIGN_STACK() + LOAD_PTR_ARG1(LMTX_REG) LOAD_STRING_ARG0(mutex_assert_owned_str) jmp 4f 3: - LOAD_REG_ARG1(LMTX_REG) + ALIGN_STACK() + LOAD_PTR_ARG1(LMTX_REG) LOAD_STRING_ARG0(mutex_assert_not_owned_str) 4: CALL_PANIC() lck_mtx_destroyed: - LOAD_REG_ARG1(LMTX_REG) + ALIGN_STACK() + LOAD_PTR_ARG1(LMTX_REG) LOAD_STRING_ARG0(mutex_interlock_destroyed_str) CALL_PANIC() @@ -1396,54 +1417,38 @@ mutex_interlock_destroyed_str: * lck_mtx_try_lock() * lck_mtx_unlock() * lck_mtx_lock_spin() + * lck_mtx_lock_spin_always() * lck_mtx_convert_spin() */ - +NONLEAF_ENTRY(lck_mtx_lock_spin_always) + LOAD_LMTX_REG(B_ARG0) /* fetch lock pointer */ + jmp Llmls_avoid_check + NONLEAF_ENTRY(lck_mtx_lock_spin) LOAD_LMTX_REG(B_ARG0) /* fetch lock pointer */ CHECK_PREEMPTION_LEVEL() - - mov M_STATE(LMTX_REG), LMTX_C_REG32 - test $(M_ILOCKED_MSK), LMTX_C_REG /* is the interlock held */ - je Llmls_enter /* no - can't be INDIRECT or DESTROYED */ - - mov M_OWNER(LMTX_REG), LMTX_A_REG - cmp $(MUTEX_DESTROYED), LMTX_A_REG /* check to see if its marked destroyed */ - je lck_mtx_destroyed - cmp $(MUTEX_IND), LMTX_A_REG /* Is this an indirect mutex */ - jne Llmls_loop - - LMTX_ENTER_EXTENDED - - mov M_STATE(LMTX_REG), LMTX_C_REG32 - test $(M_SPIN_MSK), LMTX_C_REG - je Llmls_loop - - LMTX_UPDATE_MISS -Llmls_loop: - PAUSE +Llmls_avoid_check: mov M_STATE(LMTX_REG), LMTX_C_REG32 - - test $(M_ILOCKED_MSK), LMTX_C_REG /* is the interlock held */ - jne Llmls_loop -Llmls_enter: - test $(M_MLOCKED_MSK), LMTX_C_REG /* is the mutex locked */ - jne Llml_contended /* fall back to normal mutex handling */ - - PUSHF /* save interrupt state */ + test $(M_ILOCKED_MSK | M_MLOCKED_MSK), LMTX_C_REG32 /* is the interlock or mutex held */ + jnz Llmls_slow +Llmls_try: /* no - can't be INDIRECT, DESTROYED or locked */ mov LMTX_C_REG, LMTX_A_REG /* eax contains snapshot for cmpxchgl */ - or $(M_ILOCKED_MSK | M_SPIN_MSK), LMTX_C_REG - CLI /* disable interrupts */ + or 
$(M_ILOCKED_MSK | M_SPIN_MSK), LMTX_C_REG32 + + PREEMPTION_DISABLE lock cmpxchg LMTX_C_REG32, M_STATE(LMTX_REG) /* atomic compare and exchange */ - jne 1f + jne Llmls_busy_disabled mov %gs:CPU_ACTIVE_THREAD, LMTX_A_REG mov LMTX_A_REG, M_OWNER(LMTX_REG) /* record owner of interlock */ - - PREEMPTION_DISABLE - POPF /* restore interrupt state */ +#if MACH_LDEBUG + test LMTX_A_REG, LMTX_A_REG + jz 1f + incl TH_MUTEX_COUNT(LMTX_A_REG) /* lock statistic */ +1: +#endif /* MACH_LDEBUG */ LMTX_CHK_EXTENDED_EXIT /* return with the interlock held and preemption disabled */ @@ -1456,59 +1461,73 @@ Llmls_enter: #endif ret -1: - POPF /* restore interrupt state */ - jmp Llmls_loop - - - -NONLEAF_ENTRY(lck_mtx_lock) - LOAD_LMTX_REG(B_ARG0) /* fetch lock pointer */ - - CHECK_PREEMPTION_LEVEL() +Llmls_slow: + test $M_ILOCKED_MSK, LMTX_C_REG32 /* is the interlock held */ + jz Llml_contended /* no, must have been the mutex */ - mov M_STATE(LMTX_REG), LMTX_C_REG32 - test $(M_ILOCKED_MSK), LMTX_C_REG /* is the interlock held */ - je Llml_enter /* no - can't be INDIRECT or DESTROYED */ - - mov M_OWNER(LMTX_REG), LMTX_A_REG - cmp $(MUTEX_DESTROYED), LMTX_A_REG /* check to see if its marked destroyed */ + cmp $(MUTEX_DESTROYED), LMTX_C_REG32 /* check to see if its marked destroyed */ je lck_mtx_destroyed - cmp $(MUTEX_IND), LMTX_A_REG /* Is this an indirect mutex? */ - jne Llml_loop + cmp $(MUTEX_IND), LMTX_C_REG32 /* Is this an indirect mutex */ + jne Llmls_loop /* no... 
must be interlocked */ LMTX_ENTER_EXTENDED mov M_STATE(LMTX_REG), LMTX_C_REG32 - test $(M_SPIN_MSK), LMTX_C_REG - je Llml_loop + test $(M_SPIN_MSK), LMTX_C_REG32 + jz Llmls_loop1 - LMTX_UPDATE_MISS -Llml_loop: + LMTX_UPDATE_MISS /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */ +Llmls_loop: PAUSE mov M_STATE(LMTX_REG), LMTX_C_REG32 +Llmls_loop1: + test $(M_ILOCKED_MSK | M_MLOCKED_MSK), LMTX_C_REG32 + jz Llmls_try + test $(M_MLOCKED_MSK), LMTX_C_REG32 + jnz Llml_contended /* mutex owned by someone else, go contend for it */ + jmp Llmls_loop + +Llmls_busy_disabled: + PREEMPTION_ENABLE + jmp Llmls_loop - test $(M_ILOCKED_MSK), LMTX_C_REG - jne Llml_loop -Llml_enter: - test $(M_MLOCKED_MSK), LMTX_C_REG - jne Llml_contended /* mutex owned by someone else, go contend for it */ + +NONLEAF_ENTRY(lck_mtx_lock) + LOAD_LMTX_REG(B_ARG0) /* fetch lock pointer */ + + CHECK_PREEMPTION_LEVEL() + + mov M_STATE(LMTX_REG), LMTX_C_REG32 + test $(M_ILOCKED_MSK | M_MLOCKED_MSK), LMTX_C_REG32 /* is the interlock or mutex held */ + jnz Llml_slow +Llml_try: /* no - can't be INDIRECT, DESTROYED or locked */ mov LMTX_C_REG, LMTX_A_REG /* eax contains snapshot for cmpxchgl */ - or $(M_MLOCKED_MSK), LMTX_C_REG + or $(M_ILOCKED_MSK | M_MLOCKED_MSK), LMTX_C_REG32 + + PREEMPTION_DISABLE lock cmpxchg LMTX_C_REG32, M_STATE(LMTX_REG) /* atomic compare and exchange */ - jne Llml_loop + jne Llml_busy_disabled mov %gs:CPU_ACTIVE_THREAD, LMTX_A_REG mov LMTX_A_REG, M_OWNER(LMTX_REG) /* record owner of mutex */ +#if MACH_LDEBUG + test LMTX_A_REG, LMTX_A_REG + jz 1f + incl TH_MUTEX_COUNT(LMTX_A_REG) /* lock statistic */ +1: +#endif /* MACH_LDEBUG */ -Llml_acquired: testl $(M_WAITERS_MSK), M_STATE(LMTX_REG) - je 1f + jz Llml_finish LMTX_CALLEXT1(lck_mtx_lock_acquire_x86) -1: + +Llml_finish: + andl $(~M_ILOCKED_MSK), M_STATE(LMTX_REG) + PREEMPTION_ENABLE + LMTX_CHK_EXTENDED /* is this an extended mutex */ jne 2f @@ -1530,8 +1549,39 @@ Llml_acquired: LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_ACQUIRE, 
LMTX_REG) #endif ret + + +Llml_slow: + test $M_ILOCKED_MSK, LMTX_C_REG32 /* is the interlock held */ + jz Llml_contended /* no, must have been the mutex */ + cmp $(MUTEX_DESTROYED), LMTX_C_REG32 /* check to see if its marked destroyed */ + je lck_mtx_destroyed + cmp $(MUTEX_IND), LMTX_C_REG32 /* Is this an indirect mutex? */ + jne Llml_loop /* no... must be interlocked */ + + LMTX_ENTER_EXTENDED + + mov M_STATE(LMTX_REG), LMTX_C_REG32 + test $(M_SPIN_MSK), LMTX_C_REG32 + jz Llml_loop1 + + LMTX_UPDATE_MISS /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */ +Llml_loop: + PAUSE + mov M_STATE(LMTX_REG), LMTX_C_REG32 +Llml_loop1: + test $(M_ILOCKED_MSK | M_MLOCKED_MSK), LMTX_C_REG32 + jz Llml_try + test $(M_MLOCKED_MSK), LMTX_C_REG32 + jnz Llml_contended /* mutex owned by someone else, go contend for it */ + jmp Llml_loop + +Llml_busy_disabled: + PREEMPTION_ENABLE + jmp Llml_loop + Llml_contended: LMTX_CHK_EXTENDED /* is this an extended mutex */ je 0f @@ -1540,7 +1590,8 @@ Llml_contended: LMTX_CALLEXT1(lck_mtx_lock_spinwait_x86) test LMTX_RET_REG, LMTX_RET_REG - je Llml_acquired /* acquired mutex */ + jz Llml_acquired /* acquired mutex, interlock held and preemption disabled */ + cmp $1, LMTX_RET_REG /* check for direct wait status */ je 2f LMTX_CHK_EXTENDED /* is this an extended mutex */ @@ -1548,32 +1599,43 @@ Llml_contended: LMTX_UPDATE_DIRECT_WAIT 2: mov M_STATE(LMTX_REG), LMTX_C_REG32 - test $(M_ILOCKED_MSK), LMTX_C_REG - jne 6f + test $(M_ILOCKED_MSK), LMTX_C_REG32 + jnz 6f - PUSHF /* save state of interrupt mask */ mov LMTX_C_REG, LMTX_A_REG /* eax contains snapshot for cmpxchgl */ - or $(M_ILOCKED_MSK), LMTX_C_REG /* try to take the interlock */ - CLI /* disable interrupts */ + or $(M_ILOCKED_MSK), LMTX_C_REG32 /* try to take the interlock */ + + PREEMPTION_DISABLE lock cmpxchg LMTX_C_REG32, M_STATE(LMTX_REG) /* atomic compare and exchange */ jne 5f - test $(M_MLOCKED_MSK), LMTX_C_REG /* we've got the interlock and */ - jne 3f - or 
$(M_MLOCKED_MSK), LMTX_C_REG /* the mutex is free... grab it directly */ - and $(~M_ILOCKED_MSK), LMTX_C_REG + test $(M_MLOCKED_MSK), LMTX_C_REG32 /* we've got the interlock and */ + jnz 3f + or $(M_MLOCKED_MSK), LMTX_C_REG32 /* the mutex is free... grab it directly */ + mov LMTX_C_REG32, M_STATE(LMTX_REG) mov %gs:CPU_ACTIVE_THREAD, LMTX_A_REG mov LMTX_A_REG, M_OWNER(LMTX_REG) /* record owner of mutex */ - mov LMTX_C_REG32, M_STATE(LMTX_REG) /* now drop the interlock */ +#if MACH_LDEBUG + test LMTX_A_REG, LMTX_A_REG + jz 1f + incl TH_MUTEX_COUNT(LMTX_A_REG) /* lock statistic */ +1: +#endif /* MACH_LDEBUG */ - POPF /* restore interrupt state */ - jmp Llml_acquired -3: /* interlock held, mutex busy */ - PREEMPTION_DISABLE - POPF /* restore interrupt state */ +Llml_acquired: + testl $(M_WAITERS_MSK), M_STATE(LMTX_REG) + jnz 1f + mov M_OWNER(LMTX_REG), LMTX_A_REG + mov TH_WAS_PROMOTED_ON_WAKEUP(LMTX_A_REG), LMTX_A_REG32 + test LMTX_A_REG32, LMTX_A_REG32 + jz Llml_finish +1: + LMTX_CALLEXT1(lck_mtx_lock_acquire_x86) + jmp Llml_finish +3: /* interlock held, mutex busy */ LMTX_CHK_EXTENDED /* is this an extended mutex */ je 4f LMTX_UPDATE_WAIT @@ -1581,7 +1643,7 @@ Llml_contended: LMTX_CALLEXT1(lck_mtx_lock_wait_x86) jmp Llml_contended 5: - POPF /* restore interrupt state */ + PREEMPTION_ENABLE 6: PAUSE jmp 2b @@ -1592,38 +1654,25 @@ NONLEAF_ENTRY(lck_mtx_try_lock_spin) LOAD_LMTX_REG(B_ARG0) /* fetch lock pointer */ mov M_STATE(LMTX_REG), LMTX_C_REG32 - test $(M_ILOCKED_MSK), LMTX_C_REG /* is the interlock held */ - je Llmts_enter /* no - can't be INDIRECT or DESTROYED */ - - mov M_OWNER(LMTX_REG), LMTX_A_REG - cmp $(MUTEX_DESTROYED), LMTX_A_REG /* check to see if its marked destroyed */ - je lck_mtx_destroyed - cmp $(MUTEX_IND), LMTX_A_REG /* Is this an indirect mutex? 
*/ - jne Llmts_enter - - LMTX_ENTER_EXTENDED -Llmts_loop: - PAUSE - mov M_STATE(LMTX_REG), LMTX_C_REG32 -Llmts_enter: - test $(M_MLOCKED_MSK | M_SPIN_MSK), LMTX_C_REG - jne Llmts_fail - test $(M_ILOCKED_MSK), LMTX_C_REG - jne Llmts_loop - - PUSHF /* save interrupt state */ + test $(M_ILOCKED_MSK | M_MLOCKED_MSK), LMTX_C_REG32 /* is the interlock or mutex held */ + jnz Llmts_slow +Llmts_try: /* no - can't be INDIRECT, DESTROYED or locked */ mov LMTX_C_REG, LMTX_A_REG /* eax contains snapshot for cmpxchgl */ or $(M_ILOCKED_MSK | M_SPIN_MSK), LMTX_C_REG - CLI /* disable interrupts */ + + PREEMPTION_DISABLE lock cmpxchg LMTX_C_REG32, M_STATE(LMTX_REG) /* atomic compare and exchange */ - jne 3f + jne Llmts_busy_disabled mov %gs:CPU_ACTIVE_THREAD, LMTX_A_REG mov LMTX_A_REG, M_OWNER(LMTX_REG) /* record owner of mutex */ - - PREEMPTION_DISABLE - POPF /* restore interrupt state */ +#if MACH_LDEBUG + test LMTX_A_REG, LMTX_A_REG + jz 1f + incl TH_MUTEX_COUNT(LMTX_A_REG) /* lock statistic */ +1: +#endif /* MACH_LDEBUG */ LMTX_CHK_EXTENDED_EXIT leave @@ -1637,52 +1686,68 @@ Llmts_enter: #endif mov $1, LMTX_RET_REG /* return success */ ret -3: - POPF /* restore interrupt state */ - jmp Llmts_loop - - -NONLEAF_ENTRY(lck_mtx_try_lock) - LOAD_LMTX_REG(B_ARG0) /* fetch lock pointer */ +Llmts_slow: + test $(M_ILOCKED_MSK), LMTX_C_REG32 /* is the interlock held */ + jz Llmts_fail /* no, must be held as a mutex */ - mov M_STATE(LMTX_REG), LMTX_C_REG32 - test $(M_ILOCKED_MSK), LMTX_C_REG /* is the interlock held */ - je Llmt_enter /* no - can't be INDIRECT or DESTROYED */ - - mov M_OWNER(LMTX_REG), LMTX_A_REG - cmp $(MUTEX_DESTROYED), LMTX_A_REG /* check to see if its marked destroyed */ + cmp $(MUTEX_DESTROYED), LMTX_C_REG32 /* check to see if its marked destroyed */ je lck_mtx_destroyed - cmp $(MUTEX_IND), LMTX_A_REG /* Is this an indirect mutex? */ - jne Llmt_enter + cmp $(MUTEX_IND), LMTX_C_REG32 /* Is this an indirect mutex? 
*/ + jne Llmts_loop1 LMTX_ENTER_EXTENDED -Llmt_loop: +Llmts_loop: PAUSE mov M_STATE(LMTX_REG), LMTX_C_REG32 -Llmt_enter: - test $(M_MLOCKED_MSK | M_SPIN_MSK), LMTX_C_REG - jne Llmt_fail - test $(M_ILOCKED_MSK), LMTX_C_REG - jne Llmt_loop +Llmts_loop1: + test $(M_MLOCKED_MSK | M_SPIN_MSK), LMTX_C_REG32 + jnz Llmts_fail + test $(M_ILOCKED_MSK), LMTX_C_REG32 + jz Llmts_try + jmp Llmts_loop + +Llmts_busy_disabled: + PREEMPTION_ENABLE + jmp Llmts_loop + + + +NONLEAF_ENTRY(lck_mtx_try_lock) + LOAD_LMTX_REG(B_ARG0) /* fetch lock pointer */ + mov M_STATE(LMTX_REG), LMTX_C_REG32 + test $(M_ILOCKED_MSK | M_MLOCKED_MSK), LMTX_C_REG32 /* is the interlock or mutex held */ + jnz Llmt_slow +Llmt_try: /* no - can't be INDIRECT, DESTROYED or locked */ mov LMTX_C_REG, LMTX_A_REG /* eax contains snapshot for cmpxchgl */ - or $(M_MLOCKED_MSK), LMTX_C_REG + or $(M_ILOCKED_MSK | M_MLOCKED_MSK), LMTX_C_REG32 + + PREEMPTION_DISABLE lock cmpxchg LMTX_C_REG32, M_STATE(LMTX_REG) /* atomic compare and exchange */ - jne Llmt_loop + jne Llmt_busy_disabled mov %gs:CPU_ACTIVE_THREAD, LMTX_A_REG mov LMTX_A_REG, M_OWNER(LMTX_REG) /* record owner of mutex */ +#if MACH_LDEBUG + test LMTX_A_REG, LMTX_A_REG + jz 1f + incl TH_MUTEX_COUNT(LMTX_A_REG) /* lock statistic */ +1: +#endif /* MACH_LDEBUG */ LMTX_CHK_EXTENDED_EXIT - test $(M_WAITERS_MSK), LMTX_C_REG - je 2f + test $(M_WAITERS_MSK), LMTX_C_REG32 + jz 0f + LMTX_CALLEXT1(lck_mtx_lock_acquire_x86) -2: - leave +0: + andl $(~M_ILOCKED_MSK), M_STATE(LMTX_REG) + PREEMPTION_ENABLE + leave #if CONFIG_DTRACE mov $1, LMTX_RET_REG /* return success */ /* Dtrace probe: LS_LCK_MTX_TRY_LOCK_ACQUIRE */ @@ -1694,6 +1759,30 @@ Llmt_enter: mov $1, LMTX_RET_REG /* return success */ ret +Llmt_slow: + test $(M_ILOCKED_MSK), LMTX_C_REG32 /* is the interlock held */ + jz Llmt_fail /* no, must be held as a mutex */ + + cmp $(MUTEX_DESTROYED), LMTX_C_REG32 /* check to see if its marked destroyed */ + je lck_mtx_destroyed + cmp $(MUTEX_IND), LMTX_C_REG32 /* Is this an 
indirect mutex? */ + jne Llmt_loop + + LMTX_ENTER_EXTENDED +Llmt_loop: + PAUSE + mov M_STATE(LMTX_REG), LMTX_C_REG32 +Llmt_loop1: + test $(M_MLOCKED_MSK | M_SPIN_MSK), LMTX_C_REG32 + jnz Llmt_fail + test $(M_ILOCKED_MSK), LMTX_C_REG32 + jz Llmt_try + jmp Llmt_loop + +Llmt_busy_disabled: + PREEMPTION_ENABLE + jmp Llmt_loop + Llmt_fail: Llmts_fail: @@ -1710,34 +1799,36 @@ Llmts_fail: NONLEAF_ENTRY(lck_mtx_convert_spin) LOAD_LMTX_REG(B_ARG0) /* fetch lock pointer */ - mov M_OWNER(LMTX_REG), LMTX_A_REG - cmp $(MUTEX_IND), LMTX_A_REG /* Is this an indirect mutex? */ - cmove M_PTR(LMTX_REG), LMTX_REG /* If so, take indirection */ + mov M_STATE(LMTX_REG), LMTX_C_REG32 + cmp $(MUTEX_IND), LMTX_C_REG32 /* Is this an indirect mutex? */ + jne 0f + mov M_PTR(LMTX_REG), LMTX_REG /* If so, take indirection */ + mov M_STATE(LMTX_REG), LMTX_C_REG32 +0: + test $(M_MLOCKED_MSK), LMTX_C_REG32 /* already owned as a mutex, just return */ + jnz 2f + test $(M_WAITERS_MSK), LMTX_C_REG32 /* are there any waiters? */ + jz 1f + LMTX_CALLEXT1(lck_mtx_lock_acquire_x86) mov M_STATE(LMTX_REG), LMTX_C_REG32 - test $(M_MLOCKED_MSK), LMTX_C_REG /* already owned as a mutex, just return */ - jne 2f 1: - and $(~(M_ILOCKED_MSK | M_SPIN_MSK)), LMTX_C_REG /* convert from spin version to mutex */ - or $(M_MLOCKED_MSK), LMTX_C_REG + and $(~(M_ILOCKED_MSK | M_SPIN_MSK)), LMTX_C_REG32 /* convert from spin version to mutex */ + or $(M_MLOCKED_MSK), LMTX_C_REG32 mov LMTX_C_REG32, M_STATE(LMTX_REG) /* since I own the interlock, I don't need an atomic update */ - PREEMPTION_ENABLE /* only %eax is consumed */ - - test $(M_WAITERS_MSK), LMTX_C_REG /* are there any waiters? 
*/ - je 2f - - LMTX_CALLEXT1(lck_mtx_lock_acquire_x86) + PREEMPTION_ENABLE 2: NONLEAF_RET + #if defined(__i386__) NONLEAF_ENTRY(lck_mtx_unlock) LOAD_LMTX_REG(B_ARG0) /* fetch lock pointer */ mov M_OWNER(LMTX_REG), LMTX_A_REG test LMTX_A_REG, LMTX_A_REG - jnz Llmu_prim + jnz Llmu_entry leave ret NONLEAF_ENTRY(lck_mtx_unlock_darwin10) @@ -1745,49 +1836,56 @@ NONLEAF_ENTRY(lck_mtx_unlock_darwin10) NONLEAF_ENTRY(lck_mtx_unlock) #endif LOAD_LMTX_REG(B_ARG0) /* fetch lock pointer */ - mov M_OWNER(LMTX_REG), LMTX_A_REG +Llmu_entry: + mov M_STATE(LMTX_REG), LMTX_C_REG32 Llmu_prim: - cmp $(MUTEX_IND), LMTX_A_REG /* Is this an indirect mutex? */ + cmp $(MUTEX_IND), LMTX_C_REG32 /* Is this an indirect mutex? */ je Llmu_ext -0: - mov M_STATE(LMTX_REG), LMTX_C_REG32 - test $(M_MLOCKED_MSK), LMTX_C_REG /* check for full mutex */ - jne 1f - xor LMTX_A_REG, LMTX_A_REG - mov LMTX_A_REG, M_OWNER(LMTX_REG) - mov LMTX_C_REG, LMTX_A_REG /* keep original state in %ecx for later evaluation */ - and $(~(M_ILOCKED_MSK | M_SPIN_MSK | M_PROMOTED_MSK)), LMTX_A_REG - mov LMTX_A_REG32, M_STATE(LMTX_REG) /* since I own the interlock, I don't need an atomic update */ - - PREEMPTION_ENABLE /* need to re-enable preemption - clobbers eax */ - jmp 2f -1: +Llmu_chktype: + test $(M_MLOCKED_MSK), LMTX_C_REG32 /* check for full mutex */ + jz Llmu_unlock +Llmu_mutex: test $(M_ILOCKED_MSK), LMTX_C_REG /* have to wait for interlock to clear */ - jne 7f + jnz Llmu_busy - PUSHF /* save interrupt state */ mov LMTX_C_REG, LMTX_A_REG /* eax contains snapshot for cmpxchgl */ - and $(~M_MLOCKED_MSK), LMTX_C_REG /* drop mutex */ - or $(M_ILOCKED_MSK), LMTX_C_REG /* pick up interlock */ - CLI + and $(~M_MLOCKED_MSK), LMTX_C_REG32 /* drop mutex */ + or $(M_ILOCKED_MSK), LMTX_C_REG32 /* pick up interlock */ + + PREEMPTION_DISABLE lock cmpxchg LMTX_C_REG32, M_STATE(LMTX_REG) /* atomic compare and exchange */ - jne 6f /* branch on failure to spin loop */ + jne Llmu_busy_disabled /* branch on failure to spin loop */ 
+Llmu_unlock: xor LMTX_A_REG, LMTX_A_REG mov LMTX_A_REG, M_OWNER(LMTX_REG) mov LMTX_C_REG, LMTX_A_REG /* keep original state in %ecx for later evaluation */ - and $(~(M_ILOCKED_MSK | M_PROMOTED_MSK)), LMTX_A_REG - mov LMTX_A_REG32, M_STATE(LMTX_REG) /* since I own the interlock, I don't need an atomic update */ - POPF /* restore interrupt state */ + and $(~(M_ILOCKED_MSK | M_SPIN_MSK | M_PROMOTED_MSK)), LMTX_A_REG + + test $(M_WAITERS_MSK), LMTX_A_REG32 + jz 2f + dec LMTX_A_REG32 /* decrement waiter count */ 2: - test $(M_PROMOTED_MSK | M_WAITERS_MSK), LMTX_C_REG - je 3f - and $(M_PROMOTED_MSK), LMTX_C_REG + mov LMTX_A_REG32, M_STATE(LMTX_REG) /* since I own the interlock, I don't need an atomic update */ + +#if MACH_LDEBUG + /* perform lock statistics after drop to prevent delay */ + mov %gs:CPU_ACTIVE_THREAD, LMTX_A_REG + test LMTX_A_REG, LMTX_A_REG + jz 1f + decl TH_MUTEX_COUNT(LMTX_A_REG) /* lock statistic */ +1: +#endif /* MACH_LDEBUG */ + + test $(M_PROMOTED_MSK | M_WAITERS_MSK), LMTX_C_REG32 + jz 3f LMTX_CALLEXT2(lck_mtx_unlock_wakeup_x86, LMTX_C_REG) 3: + PREEMPTION_ENABLE + LMTX_CHK_EXTENDED jne 4f @@ -1810,77 +1908,25 @@ Llmu_prim: LOCKSTAT_RECORD(LS_LCK_MTX_EXT_UNLOCK_RELEASE, LMTX_REG) #endif ret -6: - POPF /* restore interrupt state */ -7: + + +Llmu_busy_disabled: + PREEMPTION_ENABLE +Llmu_busy: PAUSE mov M_STATE(LMTX_REG), LMTX_C_REG32 - jmp 1b + jmp Llmu_mutex + Llmu_ext: mov M_PTR(LMTX_REG), LMTX_REG mov M_OWNER(LMTX_REG), LMTX_A_REG mov %gs:CPU_ACTIVE_THREAD, LMTX_C_REG CHECK_UNLOCK(LMTX_C_REG, LMTX_A_REG) - jmp 0b - - -LEAF_ENTRY(lck_mtx_lock_decr_waiter) - LOAD_LMTX_REG(L_ARG0) /* fetch lock pointer - no indirection here */ -1: mov M_STATE(LMTX_REG), LMTX_C_REG32 + jmp Llmu_chktype - test $(M_WAITERS_MSK), LMTX_C_REG - je 2f - test $(M_ILOCKED_MSK), LMTX_C_REG /* have to wait for interlock to clear */ - jne 3f - - mov LMTX_C_REG, LMTX_A_REG /* eax contains snapshot for cmpxchgl */ - dec LMTX_C_REG /* decrement waiter count */ - lock - cmpxchg 
LMTX_C_REG32, M_STATE(LMTX_REG) /* atomic compare and exchange */ - jne 3f /* branch on failure to spin loop */ - mov $1, LMTX_RET_REG - LEAF_RET -2: - xor LMTX_RET_REG, LMTX_RET_REG - LEAF_RET -3: - PAUSE - jmp 1b - - -LEAF_ENTRY(lck_mtx_lock_get_pri) - LOAD_LMTX_REG(L_ARG0) /* fetch lock pointer - no indirection here */ -1: - mov M_STATE(LMTX_REG), LMTX_C_REG32 - - test $(M_WAITERS_MSK), LMTX_C_REG - jne 2f - test $(M_ILOCKED_MSK), LMTX_C_REG /* have to wait for interlock to clear */ - jne 3f - - mov LMTX_C_REG, LMTX_A_REG /* eax contains snapshot for cmpxchgl */ - and $(~M_PRIORITY_MSK), LMTX_C_REG /* no waiters, reset mutex priority to 0 */ - lock - cmpxchg LMTX_C_REG32, M_STATE(LMTX_REG) /* atomic compare and exchange */ - jne 3f /* branch on failure to spin loop */ - - xor LMTX_RET_REG, LMTX_RET_REG /* return mutex priority == 0 */ - LEAF_RET -2: - mov LMTX_C_REG, LMTX_RET_REG - and $(M_PRIORITY_MSK), LMTX_RET_REG - shr $16, LMTX_RET_REG /* return current mutex priority */ - LEAF_RET -3: - PAUSE - jmp 1b - - - - LEAF_ENTRY(lck_mtx_ilk_unlock) LOAD_LMTX_REG(L_ARG0) /* fetch lock pointer - no indirection here */ @@ -1897,93 +1943,80 @@ LEAF_ENTRY(lck_mtx_lock_grab_mutex) mov M_STATE(LMTX_REG), LMTX_C_REG32 - test $(M_ILOCKED_MSK | M_MLOCKED_MSK), LMTX_C_REG /* can't have the mutex yet */ - jne 2f + test $(M_ILOCKED_MSK | M_MLOCKED_MSK), LMTX_C_REG32 /* can't have the mutex yet */ + jnz 3f mov LMTX_C_REG, LMTX_A_REG /* eax contains snapshot for cmpxchgl */ - or $(M_MLOCKED_MSK), LMTX_C_REG + or $(M_ILOCKED_MSK | M_MLOCKED_MSK), LMTX_C_REG32 + + PREEMPTION_DISABLE lock cmpxchg LMTX_C_REG32, M_STATE(LMTX_REG) /* atomic compare and exchange */ jne 2f /* branch on failure to spin loop */ mov %gs:CPU_ACTIVE_THREAD, LMTX_A_REG mov LMTX_A_REG, M_OWNER(LMTX_REG) /* record owner of mutex */ +#if MACH_LDEBUG + test LMTX_A_REG, LMTX_A_REG + jz 1f + incl TH_MUTEX_COUNT(LMTX_A_REG) /* lock statistic */ +1: +#endif /* MACH_LDEBUG */ mov $1, LMTX_RET_REG /* return success */ 
LEAF_RET 2: + PREEMPTION_ENABLE +3: xor LMTX_RET_REG, LMTX_RET_REG /* return failure */ LEAF_RET -LEAF_ENTRY(lck_mtx_lock_mark_promoted) - LOAD_LMTX_REG(L_ARG0) /* fetch lock pointer - no indirection here */ -1: - mov M_STATE(LMTX_REG), LMTX_C_REG32 - - test $(M_PROMOTED_MSK), LMTX_C_REG - jne 3f - test $(M_ILOCKED_MSK), LMTX_C_REG /* have to wait for interlock to clear */ - jne 2f - - mov LMTX_C_REG, LMTX_A_REG /* eax contains snapshot for cmpxchgl */ - or $(M_PROMOTED_MSK), LMTX_C_REG - lock - cmpxchg LMTX_C_REG32, M_STATE(LMTX_REG) /* atomic compare and exchange */ - jne 2f /* branch on failure to spin loop */ - - mov $1, LMTX_RET_REG - LEAF_RET -2: - PAUSE - jmp 1b -3: - xor LMTX_RET_REG, LMTX_RET_REG - LEAF_RET - - - LEAF_ENTRY(lck_mtx_lock_mark_destroyed) LOAD_LMTX_REG(L_ARG0) 1: - mov M_OWNER(LMTX_REG), LMTX_A_REG - - cmp $(MUTEX_DESTROYED), LMTX_A_REG /* check to see if its marked destroyed */ - je 3f - cmp $(MUTEX_IND), LMTX_A_REG /* Is this an indirect mutex? */ + mov M_STATE(LMTX_REG), LMTX_C_REG32 + cmp $(MUTEX_IND), LMTX_C_REG32 /* Is this an indirect mutex? 
*/ jne 2f - movl $(MUTEX_DESTROYED), M_OWNER(LMTX_REG) /* convert to destroyed state */ + movl $(MUTEX_DESTROYED), M_STATE(LMTX_REG) /* convert to destroyed state */ jmp 3f 2: - mov M_STATE(LMTX_REG), LMTX_C_REG32 - test $(M_ILOCKED_MSK), LMTX_C_REG /* have to wait for interlock to clear */ - jne 5f + jnz 5f - PUSHF /* save interrupt state */ + PREEMPTION_DISABLE mov LMTX_C_REG, LMTX_A_REG /* eax contains snapshot for cmpxchgl */ - or $(M_ILOCKED_MSK), LMTX_C_REG - CLI + or $(M_ILOCKED_MSK), LMTX_C_REG32 lock cmpxchg LMTX_C_REG32, M_STATE(LMTX_REG) /* atomic compare and exchange */ jne 4f /* branch on failure to spin loop */ - movl $(MUTEX_DESTROYED), M_OWNER(LMTX_REG) /* convert to destroyed state */ - POPF /* restore interrupt state */ + movl $(MUTEX_DESTROYED), M_STATE(LMTX_REG) /* convert to destroyed state */ + PREEMPTION_ENABLE 3: LEAF_RET /* return with M_ILOCKED set */ 4: - POPF /* restore interrupt state */ + PREEMPTION_ENABLE 5: PAUSE jmp 1b - - +LEAF_ENTRY(preemption_underflow_panic) + FRAME + incl %gs:CPU_PREEMPTION_LEVEL + ALIGN_STACK() + LOAD_STRING_ARG0(16f) + CALL_PANIC() + hlt + .data +16: String "Preemption level underflow, possible cause unlocking an unlocked mutex or spinlock" + .text + + LEAF_ENTRY(_disable_preemption) #if MACH_RT - _DISABLE_PREEMPTION + PREEMPTION_DISABLE #endif /* MACH_RT */ LEAF_RET @@ -1997,6 +2030,7 @@ LEAF_ENTRY(_enable_preemption) #else movl %gs:CPU_PREEMPTION_LEVEL,%esi #endif + ALIGN_STACK() LOAD_STRING_ARG0(_enable_preemption_less_than_zero) CALL_PANIC() hlt @@ -2006,7 +2040,7 @@ _enable_preemption_less_than_zero: .text 1: #endif /* MACH_ASSERT */ - _ENABLE_PREEMPTION + PREEMPTION_ENABLE #endif /* MACH_RT */ LEAF_RET @@ -2015,6 +2049,7 @@ LEAF_ENTRY(_enable_preemption_no_check) #if MACH_ASSERT cmpl $0,%gs:CPU_PREEMPTION_LEVEL jg 1f + ALIGN_STACK() LOAD_STRING_ARG0(_enable_preemption_no_check_less_than_zero) CALL_PANIC() hlt @@ -2031,7 +2066,7 @@ _enable_preemption_no_check_less_than_zero: 
LEAF_ENTRY(_mp_disable_preemption) #if MACH_RT - _DISABLE_PREEMPTION + PREEMPTION_DISABLE #endif /* MACH_RT */ LEAF_RET @@ -2045,6 +2080,7 @@ LEAF_ENTRY(_mp_enable_preemption) #else movl %gs:CPU_PREEMPTION_LEVEL,%esi #endif + ALIGN_PANIC() LOAD_STRING_ARG0(_mp_enable_preemption_less_than_zero) CALL_PANIC() hlt @@ -2054,7 +2090,7 @@ _mp_enable_preemption_less_than_zero: .text 1: #endif /* MACH_ASSERT */ - _ENABLE_PREEMPTION + PREEMPTION_ENABLE #endif /* MACH_RT */ LEAF_RET @@ -2063,6 +2099,7 @@ LEAF_ENTRY(_mp_enable_preemption_no_check) #if MACH_ASSERT cmpl $0,%gs:CPU_PREEMPTION_LEVEL jg 1f + ALIGN_STACK() LOAD_STRING_ARG0(_mp_enable_preemption_no_check_less_than_zero) CALL_PANIC() hlt diff --git a/osfmk/i386/i386_vm_init.c b/osfmk/i386/i386_vm_init.c index 301d02274..989895eb0 100644 --- a/osfmk/i386/i386_vm_init.c +++ b/osfmk/i386/i386_vm_init.c @@ -75,31 +75,24 @@ #include #include #include -#include #include #include #include #include #include +#include #include #include -#if DEBUG -#define DBG(x...) kprintf("DBG: " x) -#define PRINT_PMAP_MEMORY_TABLE -#else -#define DBG(x...) 
-#endif - vm_size_t mem_size = 0; -vm_offset_t first_avail = 0;/* first after page tables */ +pmap_paddr_t first_avail = 0;/* first after page tables */ uint64_t max_mem; /* Size of physical memory (bytes), adjusted by maxmem */ uint64_t mem_actual; uint64_t sane_size = 0; /* Memory size to use for defaults calculations */ -#define MAXLORESERVE ( 32 * 1024 * 1024) +#define MAXLORESERVE (32 * 1024 * 1024) ppnum_t max_ppnum = 0; ppnum_t lowest_lo = 0; @@ -119,11 +112,6 @@ vm_offset_t static_memory_end = 0; vm_offset_t sHIB, eHIB, stext, etext, sdata, edata, end; -boolean_t kernel_text_ps_4K = TRUE; -boolean_t wpkernel = TRUE; - -extern void *KPTphys; - /* * _mh_execute_header is the mach_header for the currently executing kernel */ @@ -135,6 +123,9 @@ void *sectPRELINKB; unsigned long sectSizePRELINK; void *sectHIBB; unsigned long sectSizeHIB; void *sectINITPTB; unsigned long sectSizeINITPT; +kernel_segment_command_t *segTEXT; +kernel_section_t *cursectTEXT, *lastsectTEXT; + extern uint64_t firmware_Conventional_bytes; extern uint64_t firmware_RuntimeServices_bytes; extern uint64_t firmware_ACPIReclaim_bytes; @@ -145,6 +136,9 @@ extern uint64_t firmware_Unusable_bytes; extern uint64_t firmware_other_bytes; uint64_t firmware_MMIO_bytes; +#if DEBUG +#define PRINT_PMAP_MEMORY_TABLE +#endif /* DEBUG */ /* * Basic VM initialization. 
*/ @@ -163,7 +157,19 @@ i386_vm_init(uint64_t maxmem, unsigned int safeboot; ppnum_t maxpg = 0; uint32_t pmap_type; + uint32_t maxloreserve; uint32_t maxdmaaddr; + uint32_t mbuf_reserve = 0; + boolean_t mbuf_override = FALSE; + +#if DEBUG + kprintf("Boot args revision: %d version: %d", + args->Revision, args->Version); + kprintf(" commandline: \""); + for(i=0; i<BOOT_LINE_LENGTH; i++) + kprintf("%c", args->CommandLine[i]); + kprintf("\"\n"); +#endif /* * Now retrieve addresses for end, edata, and etext @@ -185,11 +191,18 @@ i386_vm_init(uint64_t maxmem, sectPRELINKB = (void *) getsegdatafromheader( &_mh_execute_header, "__PRELINK_TEXT", &sectSizePRELINK); + segTEXT = getsegbynamefromheader(&_mh_execute_header, "__TEXT"); + cursectTEXT = lastsectTEXT = firstsect(segTEXT); + /* Discover the last TEXT section within the TEXT segment */ + while ((cursectTEXT = nextsect(segTEXT, cursectTEXT)) != NULL) { + lastsectTEXT = cursectTEXT; + } + sHIB = (vm_offset_t) sectHIBB; eHIB = (vm_offset_t) sectHIBB + sectSizeHIB; /* Zero-padded from ehib to stext if text is 2M-aligned */ stext = (vm_offset_t) sectTEXTB; - etext = (vm_offset_t) sectTEXTB + sectSizeTEXT; + etext = (vm_offset_t) round_page_64(lastsectTEXT->addr + lastsectTEXT->size); /* Zero-padded from etext to sdata if text is 2M-aligned */ sdata = (vm_offset_t) sectDATAB; edata = (vm_offset_t) sectDATAB + sectSizeDATA; @@ -307,8 +320,10 @@ i386_vm_init(uint64_t maxmem, break; } +#if DEBUG kprintf("EFI region %d: type %u/%d, base 0x%x, top 0x%x\n", i, mptr->Type, pmap_type, base, top); +#endif if (maxpg) { if (base >= maxpg) @@ -327,7 +342,8 @@ i386_vm_init(uint64_t maxmem, /* * Usable memory region */ - if (top < I386_LOWMEM_RESERVED) { + if (top < I386_LOWMEM_RESERVED || + !pal_is_usable_memory(base, top)) { prev_pmptr = 0; continue; } @@ -408,8 +424,11 @@ i386_vm_init(uint64_t maxmem, if (prev_pmptr && pmptr->type == prev_pmptr->type && pmptr->base == pmptr->alloc && - pmptr->base == (prev_pmptr->end + 1)) { - prev_pmptr->end = pmptr->end; + pmptr->base == 
(prev_pmptr->end + 1)) + { + if(prev_pmptr->end == prev_pmptr->alloc) + prev_pmptr->alloc = pmptr->base; + prev_pmptr->end = pmptr->end; } else { pmap_memory_region_count++; prev_pmptr = pmptr; @@ -462,7 +481,7 @@ i386_vm_init(uint64_t maxmem, sane_size = (sane_size + 128 * MB - 1) & ~((uint64_t)(128 * MB - 1)); /* - * We cap at KERNEL_MAXMEM bytes (currently 32GB for K32, 64GB for K64). + * We cap at KERNEL_MAXMEM bytes (currently 32GB for K32, 96GB for K64). * Unless overriden by the maxmem= boot-arg * -- which is a non-zero maxmem argument to this function. */ @@ -532,9 +551,6 @@ i386_vm_init(uint64_t maxmem, max_valid_low_ppnum = (ppnum_t)(max_valid_dma_address / PAGE_SIZE); } if (avail_end >= max_valid_dma_address) { - uint32_t maxloreserve; - uint32_t mbuf_reserve = 0; - boolean_t mbuf_override = FALSE; if (!PE_parse_boot_argn("maxloreserve", &maxloreserve, sizeof (maxloreserve))) { @@ -562,10 +578,12 @@ i386_vm_init(uint64_t maxmem, vm_lopage_needed = TRUE; } } + /* * Initialize kernel physical map. * Kernel virtual address starts at VM_KERNEL_MIN_ADDRESS. */ + kprintf("avail_remaining = 0x%lx\n", (unsigned long)avail_remaining); pmap_bootstrap(0, IA32e); } @@ -576,6 +594,7 @@ pmap_free_pages(void) return (unsigned int)avail_remaining; } + boolean_t pmap_next_page_reserved(ppnum_t *); /* @@ -685,209 +704,3 @@ pmap_valid_page( return FALSE; } -/* - * Called once VM is fully initialized so that we can release unused - * sections of low memory to the general pool. - * Also complete the set-up of identity-mapped sections of the kernel: - * 1) write-protect kernel text - * 2) map kernel text using large pages if possible - * 3) read and write-protect page zero (for K32) - * 4) map the global page at the appropriate virtual address. - * - * Use of large pages - * ------------------ - * To effectively map and write-protect all kernel text pages, the text - * must be 2M-aligned at the base, and the data section above must also be - * 2M-aligned. 
That is, there's padding below and above. This is achieved - * through linker directives. Large pages are used only if this alignment - * exists (and not overriden by the -kernel_text_page_4K boot-arg). The - * memory layout is: - * - * : : - * | __DATA | - * sdata: ================== 2Meg - * | | - * | zero-padding | - * | | - * etext: ------------------ - * | | - * : : - * | | - * | __TEXT | - * | | - * : : - * | | - * stext: ================== 2Meg - * | | - * | zero-padding | - * | | - * eHIB: ------------------ - * | __HIB | - * : : - * - * Prior to changing the mapping from 4K to 2M, the zero-padding pages - * [eHIB,stext] and [etext,sdata] are ml_static_mfree()'d. Then all the - * 4K pages covering [stext,etext] are coalesced as 2M large pages. - * The now unused level-1 PTE pages are also freed. - */ -void -pmap_lowmem_finalize(void) -{ - spl_t spl; - int i; - - /* Check the kernel is linked at the expected base address */ - if (i386_btop(kvtophys((vm_offset_t) &IdlePML4)) != - I386_KERNEL_IMAGE_BASE_PAGE) - panic("pmap_lowmem_finalize() unexpected kernel base address"); - - /* - * Free all pages in pmap regions below the base: - * rdar://6332712 - * We can't free all the pages to VM that EFI reports available. - * Pages in the range 0xc0000-0xff000 aren't safe over sleep/wake. - * There's also a size miscalculation here: pend is one page less - * than it should be but this is not fixed to be backwards - * compatible. - * Due to this current EFI limitation, we take only the first - * entry in the memory region table. However, the loop is retained - * (with the intended termination criteria commented out) in the - * hope that some day we can free all low-memory ranges. - * This loop assumes the first range does not span the kernel - * image base & avail_start. We skip this process on systems - * with "kernel reserved" ranges, as the low memory reclamation - * is handled in the initial memory map processing loop on - * such systems. 
- */ - for (i = 0; -// pmap_memory_regions[i].end <= I386_KERNEL_IMAGE_BASE_PAGE; - i < 1 && (pmap_reserved_ranges == 0); - i++) { - vm_offset_t pbase = (vm_offset_t)i386_ptob(pmap_memory_regions[i].base); - vm_offset_t pend = (vm_offset_t)i386_ptob(pmap_memory_regions[i].end); -// vm_offset_t pend = i386_ptob(pmap_memory_regions[i].end+1); - - DBG("ml_static_mfree(%p,%p) for pmap region %d\n", - (void *) ml_static_ptovirt(pbase), - (void *) (pend - pbase), i); - ml_static_mfree(ml_static_ptovirt(pbase), pend - pbase); - } - - /* - * If text and data are both 2MB-aligned, - * we can map text with large-pages, - * unless the -kernel_text_ps_4K boot-arg overrides. - */ - if ((stext & I386_LPGMASK) == 0 && (sdata & I386_LPGMASK) == 0) { - kprintf("Kernel text is 2MB aligned"); - kernel_text_ps_4K = FALSE; - if (PE_parse_boot_argn("-kernel_text_ps_4K", - &kernel_text_ps_4K, - sizeof (kernel_text_ps_4K))) - kprintf(" but will be mapped with 4K pages\n"); - else - kprintf(" and will be mapped with 2M pages\n"); - } - - (void) PE_parse_boot_argn("wpkernel", &wpkernel, sizeof (wpkernel)); - if (wpkernel) - kprintf("Kernel text %p-%p to be write-protected\n", - (void *) stext, (void *) etext); - - spl = splhigh(); - - /* - * Scan over text if mappings are to be changed: - * - Remap kernel text readonly unless the "wpkernel" boot-arg is 0 - * - Change to large-pages if possible and not overriden. - */ - if (kernel_text_ps_4K && wpkernel) { - vm_offset_t myva; - for (myva = stext; myva < etext; myva += PAGE_SIZE) { - pt_entry_t *ptep; - - ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva); - if (ptep) - pmap_store_pte(ptep, *ptep & ~INTEL_PTE_RW); - } - } - - if (!kernel_text_ps_4K) { - vm_offset_t myva; - - /* - * Release zero-filled page padding used for 2M-alignment. 
- */ - DBG("ml_static_mfree(%p,%p) for padding below text\n", - (void *) eHIB, (void *) (stext - eHIB)); - ml_static_mfree(eHIB, stext - eHIB); - DBG("ml_static_mfree(%p,%p) for padding above text\n", - (void *) etext, (void *) (sdata - etext)); - ml_static_mfree(etext, sdata - etext); - - /* - * Coalesce text pages into large pages. - */ - for (myva = stext; myva < sdata; myva += I386_LPGBYTES) { - pt_entry_t *ptep; - vm_offset_t pte_phys; - pt_entry_t *pdep; - pt_entry_t pde; - - pdep = pmap_pde(kernel_pmap, (vm_map_offset_t)myva); - ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva); - DBG("myva: %p pdep: %p ptep: %p\n", - (void *) myva, (void *) pdep, (void *) ptep); - if ((*ptep & INTEL_PTE_VALID) == 0) - continue; - pte_phys = (vm_offset_t)(*ptep & PG_FRAME); - pde = *pdep & PTMASK; /* page attributes from pde */ - pde |= INTEL_PTE_PS; /* make it a 2M entry */ - pde |= pte_phys; /* take page frame from pte */ - - if (wpkernel) - pde &= ~INTEL_PTE_RW; - DBG("pmap_store_pte(%p,0x%llx)\n", - (void *)pdep, pde); - pmap_store_pte(pdep, pde); - - /* - * Free the now-unused level-1 pte. - * Note: ptep is a virtual address to the pte in the - * recursive map. We can't use this address to free - * the page. Instead we need to compute its address - * in the Idle PTEs in "low memory". 
- */ - vm_offset_t vm_ptep = (vm_offset_t) KPTphys - + (pte_phys >> PTPGSHIFT); - DBG("ml_static_mfree(%p,0x%x) for pte\n", - (void *) vm_ptep, PAGE_SIZE); - ml_static_mfree(vm_ptep, PAGE_SIZE); - } - - /* Change variable read by sysctl machdep.pmap */ - pmap_kernel_text_ps = I386_LPGBYTES; - } - -#if defined(__i386__) - /* no matter what, kernel page zero is not accessible */ - pmap_store_pte(pmap_pte(kernel_pmap, 0), INTEL_PTE_INVALID); -#endif - - /* map lowmem global page into fixed addr */ - pt_entry_t *pte = NULL; - if (0 == (pte = pmap_pte(kernel_pmap, - VM_MIN_KERNEL_LOADED_ADDRESS + 0x2000))) - panic("lowmem pte"); - /* make sure it is defined on page boundary */ - assert(0 == ((vm_offset_t) &lowGlo & PAGE_MASK)); - pmap_store_pte(pte, kvtophys((vm_offset_t)&lowGlo) - | INTEL_PTE_REF - | INTEL_PTE_MOD - | INTEL_PTE_WIRED - | INTEL_PTE_VALID - | INTEL_PTE_RW); - splx(spl); - flush_tlb(); -} - diff --git a/osfmk/i386/idle_pt.c b/osfmk/i386/idle_pt.c index ebbfc556d..663375acf 100644 --- a/osfmk/i386/idle_pt.c +++ b/osfmk/i386/idle_pt.c @@ -27,10 +27,15 @@ */ #include -pml4_entry_t IdlePML4[PTE_PER_PAGE] __attribute__((section("__INITPT, __data"))) = { +#define PML4_PROT (INTEL_PTE_VALID | INTEL_PTE_WRITE) +pml4_entry_t IdlePML4[PTE_PER_PAGE] __attribute__((section("__INITPT, __data"))) = { #ifdef __x86_64__ - [ 0] = ((uint64_t)(INITPT_SEG_BASE + PAGE_SIZE) | INTEL_PTE_VALID | INTEL_PTE_WRITE), - [KERNEL_PML4_INDEX] = ((uint64_t)(INITPT_SEG_BASE + PAGE_SIZE) | INTEL_PTE_VALID | INTEL_PTE_WRITE), + [ 0] + = ((uint64_t)(INITPT_SEG_BASE + PAGE_SIZE) | PML4_PROT), +#if KERNEL_PML4_INDEX != 0 + [KERNEL_PML4_INDEX] + = ((uint64_t)(INITPT_SEG_BASE + PAGE_SIZE) | PML4_PROT), +#endif #endif }; @@ -39,7 +44,7 @@ pml4_entry_t IdlePML4[PTE_PER_PAGE] __attribute__((section("__INITPT, __data")) #elif defined(__i386__) #define PDPT_PROT (INTEL_PTE_VALID) #endif -pdpt_entry_t IdlePDPT[PTE_PER_PAGE] __attribute__((section("__INITPT, __data"))) = { +pdpt_entry_t 
IdlePDPT[PTE_PER_PAGE] __attribute__((section("__INITPT, __data"))) = { [0] = ((uint64_t)(INITPT_SEG_BASE + 2*PAGE_SIZE) | PDPT_PROT), [1] = ((uint64_t)(INITPT_SEG_BASE + 3*PAGE_SIZE) | PDPT_PROT), [2] = ((uint64_t)(INITPT_SEG_BASE + 4*PAGE_SIZE) | PDPT_PROT), @@ -50,6 +55,8 @@ pdpt_entry_t IdlePDPT[PTE_PER_PAGE] __attribute__((section("__INITPT, __data")) #error Please update idle_pt.c to reflect the new value of NPGPTD #endif +#if MACHINE_BOOTSTRAPPTD + #define ID_MAP_2MEG(x) [(x)] = ((((uint64_t)(x)) << 21) | (INTEL_PTE_PS | INTEL_PTE_VALID | INTEL_PTE_WRITE)), #define L0(x,n) x(n) @@ -70,3 +77,4 @@ pdpt_entry_t IdlePDPT[PTE_PER_PAGE] __attribute__((section("__INITPT, __data")) pd_entry_t BootstrapPTD[2048] __attribute__((section("__INITPT, __data"))) = { FOR_0_TO_2047(ID_MAP_2MEG) }; +#endif /* MACHINE_BOOTSTRAPPTD */ diff --git a/osfmk/i386/idt.s b/osfmk/i386/idt.s index 17e6331ea..362b783a4 100644 --- a/osfmk/i386/idt.s +++ b/osfmk/i386/idt.s @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -60,6 +60,24 @@ #include #include #include +#include +#define _ARCH_I386_ASM_HELP_H_ /* Prevent inclusion of user header */ +#include +#include +#include +#include + +/* + * Low-memory handlers. 
+ */ +#define LO_ALLINTRS EXT(lo_allintrs32) +#define LO_ALLTRAPS EXT(lo_alltraps32) +#define LO_SYSENTER EXT(lo_sysenter32) +#define LO_UNIX_SCALL EXT(lo_unix_scall32) +#define LO_MACH_SCALL EXT(lo_mach_scall32) +#define LO_MDEP_SCALL EXT(lo_mdep_scall32) +#define LO_DIAG_SCALL EXT(lo_diag_scall32) + #define HI_DATA(lo_addr) ( (EXT(lo_addr) - EXT(hi_remap_data)) + HIGH_IDT_BASE ) #define HI_TEXT(lo_text) ( (EXT(lo_text) - EXT(hi_remap_text)) + HIGH_MEM_BASE ) @@ -103,7 +121,7 @@ Entry(name) ;\ pushl $0 ;\ pushl $(n) ;\ pusha ;\ - movl $ EXT(lo_alltraps),%ebx ;\ + movl $(LO_ALLTRAPS),%ebx ;\ jmp enter_lohandler @@ -116,7 +134,7 @@ Entry(name) ;\ pushl $0 ;\ pushl $(n) ;\ pusha ;\ - movl $ EXT(lo_alltraps),%ebx ;\ + movl $(LO_ALLTRAPS),%ebx ;\ jmp enter_lohandler @@ -152,11 +170,11 @@ Entry(name) ;\ * Error code has been pushed. Push trap number. */ #define EXCEP_ERR(n,name) \ - IDT_ENTRY(name,K_INTR_GATE);\ -Entry(name) ;\ - pushl $(n) ;\ - pusha ;\ - movl $ EXT(lo_alltraps),%ebx ;\ + IDT_ENTRY(name,K_INTR_GATE) ;\ +Entry(name) ;\ + pushl $(n) ;\ + pusha ;\ + movl $(LO_ALLTRAPS),%ebx ;\ jmp enter_lohandler @@ -170,7 +188,7 @@ L_ ## n: ;\ pushl $0 ;\ pushl $(n) ;\ pusha ;\ - movl $ EXT(lo_allintrs),%ebx ;\ + movl $(LO_ALLINTRS),%ebx ;\ jmp enter_lohandler @@ -471,8 +489,7 @@ Entry(lo_kernel_cr3) .text -/******************************************************************************************************* - * +/* * Trap/interrupt entry points. 
* * All traps must create the following save area on the PCB "stack": @@ -498,14 +515,21 @@ Entry(lo_kernel_cr3) * user ss - if from user */ - +ret_to_kernel: + jmp *1f +1: .long HI_TEXT(hi_ret_to_kernel) + +ret_to_user: + jmp *1f +1: .long HI_TEXT(hi_ret_to_user) + Entry(hi_ret_to_user) movl %esp,%ebx movl %gs:CPU_ACTIVE_THREAD,%ecx - subl ACT_PCB_ISS(%ecx),%ebx - movl $(WINDOWS_CLEAN),ACT_COPYIO_STATE(%ecx) + subl TH_PCB_ISS(%ecx),%ebx + movl $(WINDOWS_CLEAN),TH_COPYIO_STATE(%ecx) - movl ACT_PCB_IDS(%ecx),%eax /* get debug state struct */ + movl TH_PCB_IDS(%ecx),%eax /* get debug state struct */ cmpl $0,%eax /* is there a debug state */ je 1f /* branch if not */ movl DS_DR0(%eax), %ecx /* Load the 32 bit debug registers */ @@ -562,7 +586,7 @@ Entry(hi_unix_scall) pushl %eax /* save system call number */ pushl $0 /* clear trap number slot */ pusha /* save the general registers */ - movl $ EXT(lo_unix_scall),%ebx + movl $(LO_UNIX_SCALL),%ebx jmp enter_lohandler @@ -570,7 +594,7 @@ Entry(hi_mach_scall) pushl %eax /* save system call number */ pushl $0 /* clear trap number slot */ pusha /* save the general registers */ - movl $ EXT(lo_mach_scall),%ebx + movl $(LO_MACH_SCALL),%ebx jmp enter_lohandler @@ -578,7 +602,7 @@ Entry(hi_mdep_scall) pushl %eax /* save system call number */ pushl $0 /* clear trap number slot */ pusha /* save the general registers */ - movl $ EXT(lo_mdep_scall),%ebx + movl $(LO_MDEP_SCALL),%ebx jmp enter_lohandler @@ -586,7 +610,7 @@ Entry(hi_diag_scall) pushl %eax // Save sselector pushl $0 // Clear trap number slot pusha // save the general registers - movl $EXT(lo_diag_scall),%ebx // Get the function down low to transfer to + movl $(LO_DIAG_SCALL),%ebx // Get the function down low to transfer to jmp enter_lohandler // Leap to it... 
@@ -622,7 +646,7 @@ hi_sysenter_2: pushl $0 /* clear trap number slot */ pusha /* save the general registers */ orl $(EFL_IF),R32_EFLAGS-R32_EDI(%esp) /* (edi was last reg pushed) */ - movl $ EXT(lo_sysenter),%ebx + movl $(LO_SYSENTER),%ebx enter_lohandler: pushl %ds pushl %es @@ -650,11 +674,12 @@ enter_lohandler1: testb $3,R32_CS(%esp) jz 2f movl %esp,%edx /* came from user mode */ + xor %ebp, %ebp subl %gs:CPU_HI_ISS,%edx movl %gs:CPU_ACTIVE_THREAD,%ecx - addl ACT_PCB_ISS(%ecx),%edx /* rebase the high stack to a low address */ + addl TH_PCB_ISS(%ecx),%edx /* rebase the high stack to a low address */ movl %edx,%esp - cmpl $0, ACT_PCB_IDS(%ecx) /* Is there a debug register state? */ + cmpl $0, TH_PCB_IDS(%ecx) /* Is there a debug register state? */ je 2f movl $0, %ecx /* If so, reset DR7 (the control) */ movl %ecx, %dr7 @@ -673,7 +698,7 @@ Entry(hi_page_fault) movl %cr2,%eax /* get the faulting address */ movl %eax,R32_CR2-R32_EDI(%esp)/* save in esp save slot */ - movl $ EXT(lo_alltraps),%ebx + movl $(LO_ALLTRAPS),%ebx jmp enter_lohandler @@ -728,7 +753,7 @@ hi_debug_trap: pushl $0 pushl $(T_DEBUG) /* handle as user trap */ pusha /* save the general registers */ - movl $ EXT(lo_alltraps),%ebx + movl $(LO_ALLTRAPS),%ebx jmp enter_lohandler @@ -769,7 +794,7 @@ trap_check_kernel_exit: je fault_popl_gs hi_take_trap: pusha /* save the general registers */ - movl $ EXT(lo_alltraps),%ebx + movl $(LO_ALLTRAPS),%ebx jmp enter_lohandler @@ -798,7 +823,7 @@ fault_iret: popl %eax /* restore eax */ /* now treat as fault from user */ pusha /* save the general registers */ - movl $ EXT(lo_alltraps),%ebx + movl $(LO_ALLTRAPS),%ebx jmp enter_lohandler /* @@ -840,19 +865,471 @@ push_none: /* now treat as fault from user */ /* except that segment registers are */ /* already pushed */ - movl $ EXT(lo_alltraps),%ebx + movl $(LO_ALLTRAPS),%ebx jmp enter_lohandler1 .text -Entry(lo_ret_to_user) - jmp *1f -1: .long HI_TEXT(hi_ret_to_user) +Entry(hi_remap_etext) -Entry(lo_ret_to_kernel) - 
jmp *1f -1: .long HI_TEXT(hi_ret_to_kernel) -Entry(hi_remap_etext) +/* + * All 32 bit task 'exceptions' enter lo_alltraps: + * esp -> x86_saved_state_t + * + * The rest of the state is set up as: + * cr3 -> kernel directory + * esp -> low based stack + * gs -> CPU_DATA_GS + * cs -> KERNEL32_CS + * ss/ds/es -> KERNEL_DS + * + * interrupts disabled + * direction flag cleared + */ +Entry(lo_alltraps32) + movl R32_CS(%esp),%eax /* assume 32-bit state */ + cmpl $(SS_64),SS_FLAVOR(%esp)/* 64-bit? */ + jne 1f + movl R64_CS(%esp),%eax /* 64-bit user mode */ +1: + testb $3,%al + jz trap_from_kernel + /* user mode trap */ + TIME_TRAP_UENTRY + + movl %gs:CPU_ACTIVE_THREAD,%ecx + movl TH_TASK(%ecx),%ebx + + /* Check for active vtimers in the current task */ + TASK_VTIMER_CHECK(%ebx, %ecx) + + movl %gs:CPU_KERNEL_STACK,%ebx + xchgl %ebx,%esp /* switch to kernel stack */ + + CCALL1(user_trap, %ebx) /* call user trap routine */ + /* user_trap() unmasks interrupts */ + cli /* hold off intrs - critical section */ + xorl %ecx,%ecx /* don't check if we're in the PFZ */ + +/* + * Return from trap or system call, checking for ASTs. + * On lowbase PCB stack with intrs disabled + */ +Entry(return_from_trap32) + movl %gs:CPU_ACTIVE_THREAD, %esp + movl TH_PCB_ISS(%esp), %esp /* switch back to PCB stack */ + movl %gs:CPU_PENDING_AST, %eax + testl %eax, %eax + je EXT(return_to_user) /* branch if no AST */ +LEXT(return_from_trap_with_ast) + movl %gs:CPU_KERNEL_STACK, %ebx + xchgl %ebx, %esp /* switch to kernel stack */ + + testl %ecx, %ecx /* see if we need to check for an EIP in the PFZ */ + je 2f /* no, go handle the AST */ + cmpl $(SS_64), SS_FLAVOR(%ebx) /* are we a 64-bit task? */ + je 1f + /* no... 32-bit user mode */ + movl R32_EIP(%ebx), %eax + pushl %ebx /* save PCB stack */ + xorl %ebp, %ebp /* clear frame pointer */ + CCALL1(commpage_is_in_pfz32, %eax) + popl %ebx /* retrieve pointer to PCB stack */ + testl %eax, %eax + je 2f /* not in the PFZ... 
go service AST */ + movl %eax, R32_EBX(%ebx) /* let the PFZ know we've pended an AST */ + xchgl %ebx, %esp /* switch back to PCB stack */ + jmp EXT(return_to_user) +1: /* 64-bit user mode */ + movl R64_RIP(%ebx), %ecx + movl R64_RIP+4(%ebx), %eax + pushl %ebx /* save PCB stack */ + xorl %ebp, %ebp /* clear frame pointer */ + CCALL2(commpage_is_in_pfz64, %ecx, %eax) + popl %ebx /* retrieve pointer to PCB stack */ + testl %eax, %eax + je 2f /* not in the PFZ... go service AST */ + movl %eax, R64_RBX(%ebx) /* let the PFZ know we've pended an AST */ + xchgl %ebx, %esp /* switch back to PCB stack */ + jmp EXT(return_to_user) +2: + sti /* interrupts always enabled on return to user mode */ + xorl %ebp, %ebp /* Clear framepointer */ + CCALL1(i386_astintr, $0) /* take the AST */ + cli + xorl %ecx, %ecx /* don't check if we're in the PFZ */ + jmp EXT(return_from_trap32) /* and check again (rare) */ + + +/* + * Trap from kernel mode. No need to switch stacks. + * Interrupts must be off here - we will set them to state at time of trap + * as soon as it's safe for us to do so and not recurse doing preemption + */ +trap_from_kernel: + movl %esp, %eax /* saved state addr */ + pushl R32_EIP(%esp) /* Simulate a CALL from fault point */ + pushl %ebp /* Extend framepointer chain */ + movl %esp, %ebp + CCALL1WITHSP(kernel_trap, %eax) /* Call kernel trap handler */ + popl %ebp + addl $4, %esp + cli + + movl %gs:CPU_PENDING_AST,%eax /* get pending asts */ + testl $ AST_URGENT,%eax /* any urgent preemption? */ + je ret_to_kernel /* no, nothing to do */ + cmpl $ T_PREEMPT,R32_TRAPNO(%esp) + je ret_to_kernel /* T_PREEMPT handled in kernel_trap() */ + testl $ EFL_IF,R32_EFLAGS(%esp) /* interrupts disabled? */ + je ret_to_kernel + cmpl $0,%gs:CPU_PREEMPTION_LEVEL /* preemption disabled? */ + jne ret_to_kernel + movl %gs:CPU_KERNEL_STACK,%eax + movl %esp,%ecx + xorl %eax,%ecx + and EXT(kernel_stack_mask),%ecx + testl %ecx,%ecx /* are we on the kernel stack? 
*/ + jne ret_to_kernel /* no, skip it */ + + CCALL1(i386_astintr, $1) /* take the AST */ + + jmp ret_to_kernel + + +/* + * All interrupts on all tasks enter here with: + * esp-> -> x86_saved_state_t + * + * cr3 -> kernel directory + * esp -> low based stack + * gs -> CPU_DATA_GS + * cs -> KERNEL32_CS + * ss/ds/es -> KERNEL_DS + * + * interrupts disabled + * direction flag cleared + */ +Entry(lo_allintrs32) + /* + * test whether already on interrupt stack + */ + movl %gs:CPU_INT_STACK_TOP,%ecx + cmpl %esp,%ecx + jb 1f + leal -INTSTACK_SIZE(%ecx),%edx + cmpl %esp,%edx + jb int_from_intstack +1: + xchgl %ecx,%esp /* switch to interrupt stack */ + + movl %cr0,%eax /* get cr0 */ + orl $(CR0_TS),%eax /* or in TS bit */ + movl %eax,%cr0 /* set cr0 */ + + subl $8, %esp /* for 16-byte stack alignment */ + pushl %ecx /* save pointer to old stack */ + movl %ecx,%gs:CPU_INT_STATE /* save intr state */ + + TIME_INT_ENTRY /* do timing */ + + movl %gs:CPU_ACTIVE_THREAD,%ecx + movl TH_TASK(%ecx),%ebx + + /* Check for active vtimers in the current task */ + TASK_VTIMER_CHECK(%ebx, %ecx) + + incl %gs:CPU_PREEMPTION_LEVEL + incl %gs:CPU_INTERRUPT_LEVEL + + movl %gs:CPU_INT_STATE, %eax + CCALL1(interrupt, %eax) /* call generic interrupt routine */ + + cli /* just in case we returned with intrs enabled */ + xorl %eax,%eax + movl %eax,%gs:CPU_INT_STATE /* clear intr state pointer */ + + decl %gs:CPU_INTERRUPT_LEVEL + decl %gs:CPU_PREEMPTION_LEVEL + + TIME_INT_EXIT /* do timing */ + + movl %gs:CPU_ACTIVE_THREAD,%eax + movl TH_PCB_FPS(%eax),%eax /* get pcb's ifps */ + testl %eax, %eax /* Is there a context */ + je 1f /* Branch if not */ + cmpl $0, FP_VALID(%eax) /* Check fp_valid */ + jne 1f /* Branch if valid */ + clts /* Clear TS */ + jmp 2f +1: + movl %cr0,%eax /* get cr0 */ + orl $(CR0_TS),%eax /* or in TS bit */ + movl %eax,%cr0 /* set cr0 */ +2: + popl %esp /* switch back to old stack */ + + /* Load interrupted code segment into %eax */ + movl R32_CS(%esp),%eax /* assume 32-bit 
state */ + cmpl $(SS_64),SS_FLAVOR(%esp)/* 64-bit? */ + jne 3f + movl R64_CS(%esp),%eax /* 64-bit user mode */ +3: + testb $3,%al /* user mode, */ + jnz ast_from_interrupt_user /* go handle potential ASTs */ + /* + * we only want to handle preemption requests if + * the interrupt fell in the kernel context + * and preemption isn't disabled + */ + movl %gs:CPU_PENDING_AST,%eax + testl $ AST_URGENT,%eax /* any urgent requests? */ + je ret_to_kernel /* no, nothing to do */ + + cmpl $0,%gs:CPU_PREEMPTION_LEVEL /* preemption disabled? */ + jne ret_to_kernel /* yes, skip it */ + + movl %gs:CPU_KERNEL_STACK,%eax + movl %esp,%ecx + xorl %eax,%ecx + and EXT(kernel_stack_mask),%ecx + testl %ecx,%ecx /* are we on the kernel stack? */ + jne ret_to_kernel /* no, skip it */ + + /* + * Take an AST from kernel space. We don't need (and don't want) + * to do as much as the case where the interrupt came from user + * space. + */ + CCALL1(i386_astintr, $1) + + jmp ret_to_kernel + + +/* + * nested int - simple path, can't preempt etc on way out + */ +int_from_intstack: + incl %gs:CPU_PREEMPTION_LEVEL + incl %gs:CPU_INTERRUPT_LEVEL + + movl %esp, %edx /* x86_saved_state */ + CCALL1(interrupt, %edx) + + decl %gs:CPU_INTERRUPT_LEVEL + decl %gs:CPU_PREEMPTION_LEVEL + + jmp ret_to_kernel + +/* + * Take an AST from an interrupted user + */ +ast_from_interrupt_user: + movl %gs:CPU_PENDING_AST,%eax + testl %eax,%eax /* pending ASTs? 
*/ + je ret_to_user /* no, nothing to do */ + + TIME_TRAP_UENTRY + + movl $1, %ecx /* check if we're in the PFZ */ + jmp EXT(return_from_trap_with_ast) /* return */ + + +/* + * 32bit Tasks + * System call entries via INTR_GATE or sysenter: + * + * esp -> x86_saved_state32_t + * cr3 -> kernel directory + * esp -> low based stack + * gs -> CPU_DATA_GS + * cs -> KERNEL32_CS + * ss/ds/es -> KERNEL_DS + * + * interrupts disabled + * direction flag cleared + */ + +Entry(lo_sysenter32) + /* + * We can be here either for a mach syscall or a unix syscall, + * as indicated by the sign of the code: + */ + movl R32_EAX(%esp),%eax + testl %eax,%eax + js EXT(lo_mach_scall32) /* < 0 => mach */ + /* > 0 => unix */ + +Entry(lo_unix_scall32) + TIME_TRAP_UENTRY + + movl %gs:CPU_KERNEL_STACK,%edi + xchgl %edi,%esp /* switch to kernel stack */ + movl %gs:CPU_ACTIVE_THREAD,%ecx /* get current thread */ + movl TH_TASK(%ecx),%ebx /* point to current task */ + incl TH_SYSCALLS_UNIX(%ecx) /* increment call count */ + + /* Check for active vtimers in the current task */ + TASK_VTIMER_CHECK(%ebx, %ecx) + + sti + + CCALL1(unix_syscall, %edi) + /* + * always returns through thread_exception_return + */ + + +Entry(lo_mach_scall32) + TIME_TRAP_UENTRY + + movl %gs:CPU_KERNEL_STACK,%edi + xchgl %edi,%esp /* switch to kernel stack */ + movl %gs:CPU_ACTIVE_THREAD,%ecx /* get current thread */ + movl TH_TASK(%ecx),%ebx /* point to current task */ + incl TH_SYSCALLS_MACH(%ecx) /* increment call count */ + + /* Check for active vtimers in the current task */ + TASK_VTIMER_CHECK(%ebx, %ecx) + + sti + + CCALL1(mach_call_munger, %edi) + /* + * always returns through thread_exception_return + */ + + +Entry(lo_mdep_scall32) + TIME_TRAP_UENTRY + + movl %gs:CPU_KERNEL_STACK,%edi + xchgl %edi,%esp /* switch to kernel stack */ + movl %gs:CPU_ACTIVE_THREAD,%ecx /* get current thread */ + movl TH_TASK(%ecx),%ebx /* point to current task */ + + /* Check for active vtimers in the current task */ + 
TASK_VTIMER_CHECK(%ebx, %ecx) + + sti + + CCALL1(machdep_syscall, %edi) + /* + * always returns through thread_exception_return + */ + + +Entry(lo_diag_scall32) + TIME_TRAP_UENTRY + + movl %gs:CPU_KERNEL_STACK,%edi + xchgl %edi,%esp /* switch to kernel stack */ + movl %gs:CPU_ACTIVE_THREAD,%ecx /* get current thread */ + movl TH_TASK(%ecx),%ebx /* point to current task */ + + /* Check for active vtimers in the current task */ + TASK_VTIMER_CHECK(%ebx, %ecx) + + pushl %edi /* push pbc stack for later */ + + CCALL1(diagCall, %edi) // Call diagnostics + + cli // Disable interruptions just in case + popl %esp // Get back the original stack + cmpl $0,%eax // What kind of return is this? + jne EXT(return_to_user) // Normal return, do not check asts... + + CCALL5(i386_exception, $EXC_SYSCALL, $0x6000, $0, $1, $0) + // pass what would be the diag syscall + // error return - cause an exception + /* no return */ + + +LEXT(return_to_user) + TIME_TRAP_UEXIT + jmp ret_to_user + + +/* + * Double-fault exception handler task. The last gasp... + */ +Entry(df_task_start) + CCALL1(panic_double_fault32, $(T_DOUBLE_FAULT)) + hlt + + +/* + * machine-check handler task. The last gasp... + */ +Entry(mc_task_start) + CCALL1(panic_machine_check32, $(T_MACHINE_CHECK)) + hlt + +#if MACH_KDB +#include +#define CX(addr,reg) addr(,reg,4) +#if 0 +/* + * Note that the per-fault entry points are not currently + * functional. The only way to make them work would be to + * set up separate TSS's for each fault type, which doesn't + * currently seem worthwhile. (The offset part of a task + * gate is always ignored.) So all faults that task switch + * currently resume at db_task_start. 
+ */ +/* + * Double fault (Murphy's point) - error code (0) on stack + */ +Entry(db_task_dbl_fault) + popl %eax + movl $(T_DOUBLE_FAULT),%ebx + jmp db_task_start +/* + * Segment not present - error code on stack + */ +Entry(db_task_seg_np) + popl %eax + movl $(T_SEGMENT_NOT_PRESENT),%ebx + jmp db_task_start +/* + * Stack fault - error code on (current) stack + */ +Entry(db_task_stk_fault) + popl %eax + movl $(T_STACK_FAULT),%ebx + jmp db_task_start +/* + * General protection fault - error code on stack + */ +Entry(db_task_gen_prot) + popl %eax + movl $(T_GENERAL_PROTECTION),%ebx + jmp db_task_start +#endif /* 0 */ +/* + * The entry point where execution resumes after last-ditch debugger task + * switch. + */ +Entry(db_task_start) + movl %esp,%edx + subl $(ISS32_SIZE),%edx + movl %edx,%esp /* allocate x86_saved_state on stack */ + movl %eax,R32_ERR(%esp) + movl %ebx,R32_TRAPNO(%esp) + pushl %edx + CPU_NUMBER(%edx) + movl CX(EXT(master_dbtss),%edx),%edx + movl TSS_LINK(%edx),%eax + pushl %eax /* pass along selector of previous TSS */ + call EXT(db_tss_to_frame) + popl %eax /* get rid of TSS selector */ + call EXT(db_trap_from_asm) + addl $0x4,%esp + /* + * And now...? + */ + iret /* ha, ha, ha... */ +#endif /* MACH_KDB */ diff --git a/osfmk/i386/idt64.s b/osfmk/i386/idt64.s index 8efe5ed44..4d91cb82f 100644 --- a/osfmk/i386/idt64.s +++ b/osfmk/i386/idt64.s @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -31,13 +31,16 @@ #include #include #include +#include #define _ARCH_I386_ASM_HELP_H_ /* Prevent inclusion of user header */ #include #include #include +#include + /* - * Locore handlers. + * Low-memory compatibility-mode handlers.
*/ #define LO_ALLINTRS EXT(lo_allintrs) #define LO_ALLTRAPS EXT(lo_alltraps) @@ -79,8 +82,8 @@ #define EXCEP64_ERR(n,name) \ IDT64_ENTRY(name,0,K_INTR_GATE) ;\ Entry(name) ;\ + push $(LO_ALLTRAPS) ;\ push $(n) ;\ - movl $(LO_ALLTRAPS), 4(%rsp) ;\ jmp L_enter_lohandler @@ -92,8 +95,8 @@ Entry(name) ;\ IDT64_ENTRY(name,0,K_INTR_GATE) ;\ Entry(name) ;\ push $0 ;\ + push $(LO_ALLTRAPS) ;\ push $(n) ;\ - movl $(LO_ALLTRAPS), 4(%rsp) ;\ jmp L_enter_lohandler @@ -106,8 +109,8 @@ Entry(name) ;\ IDT64_ENTRY(name,0,U_INTR_GATE) ;\ Entry(name) ;\ push $0 ;\ + push $(LO_ALLTRAPS) ;\ push $(n) ;\ - movl $(LO_ALLTRAPS), 4(%rsp) ;\ jmp L_enter_lohandler @@ -138,8 +141,8 @@ Entry(name) ;\ .align FALIGN ;\ L_ ## n: ;\ push $0 ;\ + push $(LO_ALLINTRS) ;\ push $(n) ;\ - movl $(LO_ALLINTRS), 4(%rsp) ;\ jmp L_enter_lohandler @@ -474,16 +477,17 @@ EXCEPTION64(0xff,t64_preempt) */ .code32 + /* * Control is passed here to return to the compatibility mode user. * At this stage we're in kernel space in compatibility mode * but we need to switch into 64-bit mode in the 4G-based trampoline * space before performing the iret. */ -Entry(lo64_ret_to_user) +ret_to_user: movl %gs:CPU_ACTIVE_THREAD,%ecx - movl ACT_PCB_IDS(%ecx),%eax /* Obtain this thread's debug state */ + movl TH_PCB_IDS(%ecx),%eax /* Obtain this thread's debug state */ cmpl $0,%eax /* Is there a debug register context? */ je 2f /* branch if not */ cmpl $(TASK_MAP_32BIT), %gs:CPU_TASK_MAP /* Are we a 32-bit task? 
*/ @@ -529,7 +533,7 @@ Entry(lo64_ret_to_user) je 1f /* flag the copyio engine state as WINDOWS_CLEAN */ mov %gs:CPU_ACTIVE_THREAD,%eax - movl $(WINDOWS_CLEAN),ACT_COPYIO_STATE(%eax) + movl $(WINDOWS_CLEAN),TH_COPYIO_STATE(%eax) mov %rcx,%cr3 /* switch to user's address space */ 1: @@ -551,7 +555,7 @@ Entry(lo64_ret_to_user) jne L_64bit_return jmp L_32bit_return -Entry(lo64_ret_to_kernel) +ret_to_kernel: ENTER_64BIT_MODE() ENTER_UBERSPACE() @@ -610,9 +614,9 @@ EXT(ret32_set_fs): EXT(ret32_set_gs): movw R32_GS(%rsp), %gs - add $(ISC32_OFFSET)+8+8, %rsp /* pop compat frame + - trapno/trapfn and error */ - cmp $(SYSENTER_CS),ISF64_CS-8-8(%rsp) + add $(ISC32_OFFSET)+8+8+8, %rsp /* pop compat frame + + trapno, trapfn and error */ + cmp $(SYSENTER_CS),ISF64_CS-8-8-8(%rsp) /* test for fast entry/exit */ je L_fast_exit EXT(ret32_iret): @@ -661,9 +665,9 @@ L_64bit_return: mov R64_RCX(%rsp), %rcx mov R64_RAX(%rsp), %rax - add $(ISS64_OFFSET)+8+8, %rsp /* pop saved state frame + - trapno/trapfn and error */ - cmpl $(SYSCALL_CS),ISF64_CS-8-8(%rsp) + add $(ISS64_OFFSET)+8+8+8, %rsp /* pop saved state frame + + trapno, trapfn and error */ + cmpl $(SYSCALL_CS),ISF64_CS-8-8-8(%rsp) /* test for fast entry/exit */ je L_sysret EXT(ret64_iret): @@ -676,9 +680,9 @@ L_sysret: * r1 user rflags * rsp user stack pointer */ - mov ISF64_RIP-16(%rsp), %rcx - mov ISF64_RFLAGS-16(%rsp), %r11 - mov ISF64_RSP-16(%rsp), %rsp + mov ISF64_RIP-8-8-8(%rsp), %rcx + mov ISF64_RFLAGS-8-8-8(%rsp), %r11 + mov ISF64_RSP-8-8-8(%rsp), %rsp sysretq /* return from system call */ /* @@ -704,8 +708,8 @@ Entry(hi64_unix_scall) swapgs /* switch to kernel gs (cpu_data) */ L_unix_scall_continue: push %rax /* save system call number */ + push $(LO_UNIX_SCALL) push $(UNIX_INT) - movl $(LO_UNIX_SCALL), 4(%rsp) jmp L_32bit_enter_check @@ -713,8 +717,8 @@ Entry(hi64_mach_scall) swapgs /* switch to kernel gs (cpu_data) */ L_mach_scall_continue: push %rax /* save system call number */ + push $(LO_MACH_SCALL) push 
$(MACH_INT) - movl $(LO_MACH_SCALL), 4(%rsp) jmp L_32bit_enter_check @@ -722,8 +726,8 @@ Entry(hi64_mdep_scall) swapgs /* switch to kernel gs (cpu_data) */ L_mdep_scall_continue: push %rax /* save system call number */ + push $(LO_MDEP_SCALL) push $(MACHDEP_INT) - movl $(LO_MDEP_SCALL), 4(%rsp) jmp L_32bit_enter_check @@ -731,8 +735,8 @@ Entry(hi64_diag_scall) swapgs /* switch to kernel gs (cpu_data) */ L_diag_scall_continue: push %rax /* save system call number */ + push $(LO_DIAG_SCALL) push $(DIAG_INT) - movl $(LO_DIAG_SCALL), 4(%rsp) jmp L_32bit_enter_check Entry(hi64_syscall) @@ -794,6 +798,7 @@ Entry(hi64_sysenter) L_sysenter_continue: push %rdx /* eip */ push %rax /* err/eax - syscall code */ + push $0 push $(T_SYSENTER) orl $(EFL_IF), ISF64_RFLAGS(%rsp) movl $(LO_MACH_SCALL), ISF64_TRAPFN(%rsp) @@ -830,6 +835,7 @@ EXT(hi64_sysenter_user_arg_copy): /* Fall through to 32-bit handler */ L_32bit_enter: + cld /* * Make space for the compatibility save area. */ @@ -908,6 +914,12 @@ L_enter_lohandler2: mov %rcx, %cr3 mov %rcx, %gs:CPU_ACTIVE_CR3 2: + movl %gs:CPU_ACTIVE_THREAD,%ecx /* Get the active thread */ + cmpl $0, TH_PCB_IDS(%ecx) /* Is there a debug register state? */ + jz 21f + xor %ecx, %ecx /* If so, reset DR7 (the control) */ + mov %rcx, %dr7 +21: /* * Switch to compatibility mode. * Then establish kernel segments. @@ -931,13 +943,8 @@ L_enter_lohandler2: mov $(CPU_DATA_GS), %eax mov %eax, %gs - movl %gs:CPU_ACTIVE_THREAD,%ecx /* Get the active thread */ - cmpl $0, ACT_PCB_IDS(%ecx) /* Is there a debug register state? 
*/ - je 1f - movl $0, %ecx /* If so, reset DR7 (the control) */ - movl %ecx, %dr7 -1: - addl $1,%gs:hwIntCnt(,%ebx,4) // Bump the trap/intr count + incl %gs:hwIntCnt(,%ebx,4) /* Bump the trap/intr count */ + /* Dispatch the designated lo handler */ jmp *%edx @@ -958,6 +965,7 @@ L_64bit_enter: sub $(ISS64_OFFSET), %rsp movl $(SS_64), SS_FLAVOR(%rsp) + cld /* * Save segment regs */ @@ -1001,12 +1009,12 @@ L_64bit_enter_after_fault: jmp L_enter_lohandler2 Entry(hi64_page_fault) + push $(LO_ALLTRAPS) push $(T_PAGE_FAULT) - movl $(LO_ALLTRAPS), 4(%rsp) cmpl $(KERNEL_UBER_BASE_HI32), ISF64_RIP+4(%rsp) jne L_enter_lohandler cmpl $(EXT(hi64_sysenter_user_arg_copy)), ISF64_RIP(%rsp) - jne L_enter_lohandler + jne hi64_kernel_trap mov ISF64_RSP(%rsp), %rsp jmp L_32bit_enter @@ -1019,8 +1027,8 @@ Entry(hi64_page_fault) Entry(hi64_debug) swapgs /* set %gs for cpu data */ push $0 /* error code */ + push $(LO_ALLTRAPS) push $(T_DEBUG) - movl $(LO_ALLTRAPS), ISF64_TRAPFN(%rsp) testb $3, ISF64_CS(%rsp) jnz L_enter_lohandler_continue @@ -1052,7 +1060,7 @@ Entry(hi64_debug) * Interrupt stack frame has been pushed on the temporary stack. * We have to switch to pcb stack and copy eflags. */ - add $32,%rsp /* remove trapno/trapfn/err/rip/cs */ + add $40,%rsp /* remove trapno/trapfn/err/rip/cs */ push %rcx /* save %rcx - user stack pointer */ mov 32(%rsp),%rcx /* top of intr stack -> pcb stack */ xchg %rcx,%rsp /* switch to pcb stack */ @@ -1066,8 +1074,8 @@ Entry(hi64_debug) Entry(hi64_double_fault) swapgs /* set %gs for cpu data */ + push $(LO_DOUBLE_FAULT) push $(T_DOUBLE_FAULT) - movl $(LO_DOUBLE_FAULT), ISF64_TRAPFN(%rsp) cmpl $(KERNEL_UBER_BASE_HI32), ISF64_RIP+4(%rsp) jne L_enter_lohandler_continue /* trap not in uber-space */ @@ -1084,96 +1092,133 @@ Entry(hi64_double_fault) * Check for a GP/NP fault in the kernel_return * sequence; if there, report it as a GP/NP fault on the user's instruction. 
* - * rsp-> 0: trap code (NP or GP) and trap function - * 8: segment number in error (error code) - * 16 rip - * 24 cs - * 32 rflags - * 40 rsp - * 48 ss - * 56 old registers (trap is from kernel) + * rsp-> 0 ISF64_TRAPNO: trap code (NP or GP) + * 8 ISF64_TRAPFN: trap function + * 16 ISF64_ERR: segment number in error (error code) + * 24 ISF64_RIP: rip + * 32 ISF64_CS: cs + * 40 ISF64_RFLAGS: rflags + * 48 ISF64_RSP: rsp + * 56 ISF64_SS: ss + * 64 old registers (trap is from kernel) */ Entry(hi64_gen_prot) + push $(LO_ALLTRAPS) push $(T_GENERAL_PROTECTION) jmp trap_check_kernel_exit /* check for kernel exit sequence */ Entry(hi64_stack_fault) + push $(LO_ALLTRAPS) push $(T_STACK_FAULT) jmp trap_check_kernel_exit /* check for kernel exit sequence */ Entry(hi64_segnp) + push $(LO_ALLTRAPS) push $(T_SEGMENT_NOT_PRESENT) /* indicate fault type */ trap_check_kernel_exit: - movl $(LO_ALLTRAPS), 4(%rsp) - testb $3,24(%rsp) - jnz hi64_take_trap + testb $3,ISF64_CS(%rsp) + jnz L_enter_lohandler /* trap was from kernel mode, so */ /* check for the kernel exit sequence */ - cmpl $(KERNEL_UBER_BASE_HI32), 16+4(%rsp) - jne hi64_take_trap /* trap not in uber-space */ + cmpl $(KERNEL_UBER_BASE_HI32), ISF64_RIP+4(%rsp) + jne L_enter_lohandler_continue /* trap not in uber-space */ - cmpl $(EXT(ret32_iret)), 16(%rsp) + cmpl $(EXT(ret32_iret)), ISF64_RIP(%rsp) je L_fault_iret32 - cmpl $(EXT(ret32_set_ds)), 16(%rsp) + cmpl $(EXT(ret32_set_ds)), ISF64_RIP(%rsp) je L_32bit_fault_set_seg - cmpl $(EXT(ret32_set_es)), 16(%rsp) + cmpl $(EXT(ret32_set_es)), ISF64_RIP(%rsp) je L_32bit_fault_set_seg - cmpl $(EXT(ret32_set_fs)), 16(%rsp) + cmpl $(EXT(ret32_set_fs)), ISF64_RIP(%rsp) je L_32bit_fault_set_seg - cmpl $(EXT(ret32_set_gs)), 16(%rsp) + cmpl $(EXT(ret32_set_gs)), ISF64_RIP(%rsp) je L_32bit_fault_set_seg - cmpl $(EXT(ret64_iret)), 16(%rsp) + cmpl $(EXT(ret64_iret)), ISF64_RIP(%rsp) je L_fault_iret64 cmpl $(EXT(hi64_sysenter_user_arg_copy)), ISF64_RIP(%rsp) - jne hi64_take_trap - mov 
ISF64_RSP(%rsp), %rsp - jmp L_32bit_enter -hi64_take_trap: - jmp L_enter_lohandler + cmove ISF64_RSP(%rsp), %rsp + je L_32bit_enter + +hi64_kernel_trap: + /* + * Here after taking an unexpected trap from kernel mode - perhaps + * while running in the trampolines hereabouts. + * Make sure we're not on the PCB stack, if so move to the kernel stack. + * This is likely a fatal condition. + * But first, try to be sure we have the kernel gs base active... + */ + cmpq $0, %gs:CPU_THIS /* test gs_base */ + js 1f /* -ve kernel addr, no swap */ + swapgs /* +ve user addr, swap */ +1: + movq %rax, %gs:CPU_UBER_TMP /* save %rax */ + movq %gs:CPU_UBER_ISF, %rax /* PCB stack addr */ + subq %rsp, %rax + cmpq $(PAGE_SIZE), %rax /* current stack in PCB? */ + movq %gs:CPU_UBER_TMP, %rax /* restore %rax */ + ja L_enter_lohandler_continue /* stack not in PCB */ + + /* + * Here if %rsp is in the PCB + * Copy the interrupt stack frame from PCB stack to kernel stack + */ + movq %gs:CPU_KERNEL_STACK, %rax /* note: %rax restored below */ + xchgq %rax, %rsp + pushq ISF64_SS(%rax) + pushq ISF64_RSP(%rax) + pushq ISF64_RFLAGS(%rax) + pushq ISF64_CS(%rax) + pushq ISF64_RIP(%rax) + pushq ISF64_ERR(%rax) + pushq ISF64_TRAPFN(%rax) + pushq ISF64_TRAPNO(%rax) + movq %gs:CPU_UBER_TMP, %rax /* restore %rax */ + jmp L_enter_lohandler_continue + - /* * GP/NP fault on IRET: CS or SS is in error. * All registers contain the user's values. 
* * on SP is - * 0 trap number/function - * 8 errcode - * 16 rip - * 24 cs - * 32 rflags - * 40 rsp - * 48 ss --> new trapno/trapfn - * 56 (16-byte padding) --> new errcode - * 64 user rip - * 72 user cs - * 80 user rflags - * 88 user rsp - * 96 user ss + * 0 ISF64_TRAPNO: trap code (NP or GP) + * 8 ISF64_TRAPFN: trap function + * 16 ISF64_ERR: segment number in error (error code) + * 24 ISF64_RIP: rip + * 32 ISF64_CS: cs + * 40 ISF64_RFLAGS: rflags + * 48 ISF64_RSP: rsp + * 56 ISF64_SS: ss --> new trapno/trapfn + * 64 pad --> new errcode + * 72 user rip + * 80 user cs + * 88 user rflags + * 96 user rsp + * 104 user ss (16-byte aligned) */ L_fault_iret32: - mov %rax, 16(%rsp) /* save rax (we don`t need saved rip) */ - mov 0(%rsp), %rax /* get trap number */ - mov %rax, 48(%rsp) /* put in user trap number */ - mov 8(%rsp), %rax /* get error code */ - mov %rax, 56(%rsp) /* put in user errcode */ - mov 16(%rsp), %rax /* restore rax */ - add $48, %rsp /* reset to original frame */ + mov %rax, ISF64_RIP(%rsp) /* save rax (we don`t need saved rip) */ + mov ISF64_TRAPNO(%rsp), %rax + mov %rax, ISF64_SS(%rsp) /* put in user trap number */ + mov ISF64_ERR(%rsp), %rax + mov %rax, 8+ISF64_SS(%rsp) /* put in user errcode */ + mov ISF64_RIP(%rsp), %rax /* restore rax */ + add $(ISF64_SS), %rsp /* reset to original frame */ /* now treat as fault from user */ swapgs jmp L_32bit_enter L_fault_iret64: - mov %rax, 16(%rsp) /* save rax (we don`t need saved rip) */ - mov 0(%rsp), %rax /* get trap number */ - mov %rax, 48(%rsp) /* put in user trap number */ - mov 8(%rsp), %rax /* get error code */ - mov %rax, 56(%rsp) /* put in user errcode */ - mov 16(%rsp), %rax /* restore rax */ - add $48, %rsp /* reset to original frame */ + mov %rax, ISF64_RIP(%rsp) /* save rax (we don`t need saved rip) */ + mov ISF64_TRAPNO(%rsp), %rax + mov %rax, ISF64_SS(%rsp) /* put in user trap number */ + mov ISF64_ERR(%rsp), %rax + mov %rax, 8+ISF64_SS(%rsp) /* put in user errcode */ + mov
ISF64_RIP(%rsp), %rax /* restore rax */ + add $(ISF64_SS), %rsp /* reset to original frame */ /* now treat as fault from user */ swapgs jmp L_64bit_enter @@ -1183,9 +1228,9 @@ L_fault_iret64: * on the stack untouched since we didn't move the stack pointer. */ L_32bit_fault_set_seg: - mov 0(%rsp), %rax /* get trap number/function */ - mov 8(%rsp), %rdx /* get error code */ - mov 40(%rsp), %rsp /* reload stack prior to fault */ + mov ISF64_TRAPNO(%rsp), %rax + mov ISF64_ERR(%rsp), %rdx + mov ISF64_RSP(%rsp), %rsp /* reload stack prior to fault */ mov %rax,ISC32_TRAPNO(%rsp) mov %rdx,ISC32_ERR(%rsp) /* now treat as fault from user */ @@ -1201,17 +1246,503 @@ L_32bit_fault_set_seg: * Fatal exception handlers: */ Entry(db_task_dbl_fault64) + push $(LO_DOUBLE_FAULT) push $(T_DOUBLE_FAULT) - movl $(LO_DOUBLE_FAULT), ISF64_TRAPFN(%rsp) jmp L_enter_lohandler Entry(db_task_stk_fault64) + push $(LO_DOUBLE_FAULT) push $(T_STACK_FAULT) - movl $(LO_DOUBLE_FAULT), ISF64_TRAPFN(%rsp) jmp L_enter_lohandler Entry(mc64) push $(0) /* Error */ + push $(LO_MACHINE_CHECK) push $(T_MACHINE_CHECK) - movl $(LO_MACHINE_CHECK), ISF64_TRAPFN(%rsp) jmp L_enter_lohandler + + + .code32 + +/* + * All task 'exceptions' enter lo_alltraps: + * esp -> x86_saved_state_t + * + * The rest of the state is set up as: + * cr3 -> kernel directory + * esp -> low based stack + * gs -> CPU_DATA_GS + * cs -> KERNEL32_CS + * ss/ds/es -> KERNEL_DS + * + * interrupts disabled + * direction flag cleared + */ +Entry(lo_alltraps) + movl R32_CS(%esp),%eax /* assume 32-bit state */ + cmpl $(SS_64),SS_FLAVOR(%esp)/* 64-bit? 
*/ + jne 1f + movl R64_CS(%esp),%eax /* 64-bit user mode */ +1: + testb $3,%al + jz trap_from_kernel + /* user mode trap */ + TIME_TRAP_UENTRY + + movl %gs:CPU_ACTIVE_THREAD,%ecx + movl TH_TASK(%ecx),%ebx + + /* Check for active vtimers in the current task */ + TASK_VTIMER_CHECK(%ebx, %ecx) + + movl %gs:CPU_KERNEL_STACK,%ebx + xchgl %ebx,%esp /* switch to kernel stack */ + + CCALL1(user_trap, %ebx) /* call user trap routine */ + /* user_trap() unmasks interrupts */ + cli /* hold off intrs - critical section */ + xorl %ecx,%ecx /* don't check if we're in the PFZ */ + +/* + * Return from trap or system call, checking for ASTs. + * On lowbase PCB stack with intrs disabled + */ +Entry(return_from_trap) + movl %gs:CPU_ACTIVE_THREAD, %esp + movl TH_PCB_ISS(%esp),%esp /* switch back to PCB stack */ + movl %gs:CPU_PENDING_AST, %eax + testl %eax, %eax + je return_to_user /* branch if no AST */ +LEXT(return_from_trap_with_ast) + movl %gs:CPU_KERNEL_STACK, %ebx + xchgl %ebx, %esp /* switch to kernel stack */ + + testl %ecx, %ecx /* see if we need to check for an EIP in the PFZ */ + je 2f /* no, go handle the AST */ + cmpl $(SS_64), SS_FLAVOR(%ebx) /* are we a 64-bit task? */ + je 1f + /* no... 32-bit user mode */ + movl R32_EIP(%ebx), %eax + pushl %ebx /* save PCB stack */ + xorl %ebp, %ebp /* clear frame pointer */ + CCALL1(commpage_is_in_pfz32, %eax) + popl %ebx /* retrieve pointer to PCB stack */ + testl %eax, %eax + je 2f /* not in the PFZ... go service AST */ + movl %eax, R32_EBX(%ebx) /* let the PFZ know we've pended an AST */ + xchgl %ebx, %esp /* switch back to PCB stack */ + jmp return_to_user +1: /* 64-bit user mode */ + movl R64_RIP(%ebx), %ecx + movl R64_RIP+4(%ebx), %eax + pushl %ebx /* save PCB stack */ + xorl %ebp, %ebp /* clear frame pointer */ + CCALL2(commpage_is_in_pfz64, %ecx, %eax) + popl %ebx /* retrieve pointer to PCB stack */ + testl %eax, %eax + je 2f /* not in the PFZ... 
go service AST */ + movl %eax, R64_RBX(%ebx) /* let the PFZ know we've pended an AST */ + xchgl %ebx, %esp /* switch back to PCB stack */ + jmp return_to_user +2: + sti /* interrupts always enabled on return to user mode */ + pushl %ebx /* save PCB stack */ + xorl %ebp, %ebp /* Clear framepointer */ + CCALL1(i386_astintr, $0) /* take the AST */ + cli + + popl %esp /* switch back to PCB stack (w/exc link) */ + + xorl %ecx, %ecx /* don't check if we're in the PFZ */ + jmp EXT(return_from_trap) /* and check again (rare) */ + + + +/* + * Trap from kernel mode. No need to switch stacks. + * Interrupts must be off here - we will set them to state at time of trap + * as soon as it's safe for us to do so and not recurse doing preemption + */ +trap_from_kernel: + movl %esp, %eax /* saved state addr */ + pushl R32_EIP(%esp) /* Simulate a CALL from fault point */ + pushl %ebp /* Extend framepointer chain */ + movl %esp, %ebp + CCALL1WITHSP(kernel_trap, %eax) /* Call kernel trap handler */ + popl %ebp + addl $4, %esp + cli + + movl %gs:CPU_PENDING_AST,%eax /* get pending asts */ + testl $ AST_URGENT,%eax /* any urgent preemption? */ + je ret_to_kernel /* no, nothing to do */ + cmpl $ T_PREEMPT,R32_TRAPNO(%esp) + je ret_to_kernel /* T_PREEMPT handled in kernel_trap() */ + testl $ EFL_IF,R32_EFLAGS(%esp) /* interrupts disabled? */ + je ret_to_kernel + cmpl $0,%gs:CPU_PREEMPTION_LEVEL /* preemption disabled? */ + jne ret_to_kernel + movl %gs:CPU_KERNEL_STACK,%eax + movl %esp,%ecx + xorl %eax,%ecx + and EXT(kernel_stack_mask),%ecx + testl %ecx,%ecx /* are we on the kernel stack?
*/ + jne ret_to_kernel /* no, skip it */ + + CCALL1(i386_astintr, $1) /* take the AST */ + + jmp ret_to_kernel /* AST taken: return to the trapped kernel context, never fall through into lo_allintrs */ + + +/* + * All interrupts on all tasks enter here with: + * esp-> -> x86_saved_state_t + * + * cr3 -> kernel directory + * esp -> low based stack + * gs -> CPU_DATA_GS + * cs -> KERNEL32_CS + * ss/ds/es -> KERNEL_DS + * + * interrupts disabled + * direction flag cleared + */ +Entry(lo_allintrs) + /* + * test whether already on interrupt stack + */ + movl %gs:CPU_INT_STACK_TOP,%ecx + cmpl %esp,%ecx + jb 1f + leal -INTSTACK_SIZE(%ecx),%edx + cmpl %esp,%edx + jb int_from_intstack +1: + xchgl %ecx,%esp /* switch to interrupt stack */ + + movl %cr0,%eax /* get cr0 */ + orl $(CR0_TS),%eax /* or in TS bit */ + movl %eax,%cr0 /* set cr0 */ + + subl $8, %esp /* for 16-byte stack alignment */ + pushl %ecx /* save pointer to old stack */ + movl %ecx,%gs:CPU_INT_STATE /* save intr state */ + + TIME_INT_ENTRY /* do timing */ + + movl %gs:CPU_ACTIVE_THREAD,%ecx + movl TH_TASK(%ecx),%ebx + + /* Check for active vtimers in the current task */ + TASK_VTIMER_CHECK(%ebx, %ecx) + + incl %gs:CPU_PREEMPTION_LEVEL + incl %gs:CPU_INTERRUPT_LEVEL + + movl %gs:CPU_INT_STATE, %eax + CCALL1(interrupt, %eax) /* call generic interrupt routine */ + + cli /* just in case we returned with intrs enabled */ + xorl %eax,%eax + movl %eax,%gs:CPU_INT_STATE /* clear intr state pointer */ + + decl %gs:CPU_INTERRUPT_LEVEL + decl %gs:CPU_PREEMPTION_LEVEL + + TIME_INT_EXIT /* do timing */ + + movl %gs:CPU_ACTIVE_THREAD,%eax + movl TH_PCB_FPS(%eax),%eax /* get pcb's ifps */ + testl %eax, %eax /* Is there a context */ + je 1f /* Branch if not */ + cmpl $0, FP_VALID(%eax) /* Check fp_valid */ + jne 1f /* Branch if valid */ + clts /* Clear TS */ + jmp 2f +1: + movl %cr0,%eax /* get cr0 */ + orl $(CR0_TS),%eax /* or in TS bit */ + movl %eax,%cr0 /* set cr0 */ +2: + popl %esp /* switch back to old stack */ + + /* Load interrupted code segment into %eax */ + movl R32_CS(%esp),%eax /* assume 32-bit state */ + cmpl
$(SS_64),SS_FLAVOR(%esp)/* 64-bit? */ + jne 3f + movl R64_CS(%esp),%eax /* 64-bit user mode */ +3: + testb $3,%al /* user mode, */ + jnz ast_from_interrupt_user /* go handle potential ASTs */ + /* + * we only want to handle preemption requests if + * the interrupt fell in the kernel context + * and preemption isn't disabled + */ + movl %gs:CPU_PENDING_AST,%eax + testl $ AST_URGENT,%eax /* any urgent requests? */ + je ret_to_kernel /* no, nothing to do */ + + cmpl $0,%gs:CPU_PREEMPTION_LEVEL /* preemption disabled? */ + jne ret_to_kernel /* yes, skip it */ + + movl %gs:CPU_KERNEL_STACK,%eax + movl %esp,%ecx + xorl %eax,%ecx + and EXT(kernel_stack_mask),%ecx + testl %ecx,%ecx /* are we on the kernel stack? */ + jne ret_to_kernel /* no, skip it */ + + /* + * Take an AST from kernel space. We don't need (and don't want) + * to do as much as the case where the interrupt came from user + * space. + */ + CCALL1(i386_astintr, $1) + + jmp ret_to_kernel + + +/* + * nested int - simple path, can't preempt etc on way out + */ +int_from_intstack: + incl %gs:CPU_PREEMPTION_LEVEL + incl %gs:CPU_INTERRUPT_LEVEL + incl %gs:CPU_NESTED_ISTACK + + movl %esp, %edx /* x86_saved_state */ + CCALL1(interrupt, %edx) + + decl %gs:CPU_INTERRUPT_LEVEL + decl %gs:CPU_PREEMPTION_LEVEL + decl %gs:CPU_NESTED_ISTACK + + jmp ret_to_kernel + +/* + * Take an AST from an interrupted user + */ +ast_from_interrupt_user: + movl %gs:CPU_PENDING_AST,%eax + testl %eax,%eax /* pending ASTs? 
*/ + je ret_to_user /* no, nothing to do */ + + TIME_TRAP_UENTRY + + movl $1, %ecx /* check if we're in the PFZ */ + jmp EXT(return_from_trap_with_ast) /* return */ + + +/* + * 32bit Tasks + * System call entries via INTR_GATE or sysenter: + * + * esp -> x86_saved_state32_t + * cr3 -> kernel directory + * esp -> low based stack + * gs -> CPU_DATA_GS + * cs -> KERNEL32_CS + * ss/ds/es -> KERNEL_DS + * + * interrupts disabled + * direction flag cleared + */ + +Entry(lo_unix_scall) + TIME_TRAP_UENTRY + + movl %gs:CPU_KERNEL_STACK,%edi + xchgl %edi,%esp /* switch to kernel stack */ + movl %gs:CPU_ACTIVE_THREAD,%ecx /* get current thread */ + movl TH_TASK(%ecx),%ebx /* point to current task */ + incl TH_SYSCALLS_UNIX(%ecx) /* increment call count */ + + /* Check for active vtimers in the current task */ + TASK_VTIMER_CHECK(%ebx, %ecx) + + sti + + CCALL1(unix_syscall, %edi) + /* + * always returns through thread_exception_return + */ + + +Entry(lo_mach_scall) + TIME_TRAP_UENTRY + + movl %gs:CPU_KERNEL_STACK,%edi + xchgl %edi,%esp /* switch to kernel stack */ + movl %gs:CPU_ACTIVE_THREAD,%ecx /* get current thread */ + movl TH_TASK(%ecx),%ebx /* point to current task */ + incl TH_SYSCALLS_MACH(%ecx) /* increment call count */ + + /* Check for active vtimers in the current task */ + TASK_VTIMER_CHECK(%ebx, %ecx) + + sti + + CCALL1(mach_call_munger, %edi) + /* + * always returns through thread_exception_return + */ + + +Entry(lo_mdep_scall) + TIME_TRAP_UENTRY + + movl %gs:CPU_KERNEL_STACK,%edi + xchgl %edi,%esp /* switch to kernel stack */ + movl %gs:CPU_ACTIVE_THREAD,%ecx /* get current thread */ + movl TH_TASK(%ecx),%ebx /* point to current task */ + + /* Check for active vtimers in the current task */ + TASK_VTIMER_CHECK(%ebx, %ecx) + + sti + + CCALL1(machdep_syscall, %edi) + /* + * always returns through thread_exception_return + */ + + +Entry(lo_diag_scall) + TIME_TRAP_UENTRY + + movl %gs:CPU_KERNEL_STACK,%edi + xchgl %edi,%esp /* switch to kernel stack */ + movl 
%gs:CPU_ACTIVE_THREAD,%ecx /* get current thread */ + movl TH_TASK(%ecx),%ebx /* point to current task */ + + /* Check for active vtimers in the current task */ + TASK_VTIMER_CHECK(%ebx, %ecx) + + pushl %edi /* push pbc stack for later */ + + CCALL1(diagCall, %edi) // Call diagnostics + + cli // Disable interruptions just in case + cmpl $0,%eax // What kind of return is this? + je 1f // - branch if bad (zero) + popl %esp // Get back the original stack + jmp return_to_user // Normal return, do not check asts... +1: + CCALL5(i386_exception, $EXC_SYSCALL, $0x6000, $0, $1, $0) + // pass what would be the diag syscall + // error return - cause an exception + /* no return */ + + +return_to_user: + TIME_TRAP_UEXIT + jmp ret_to_user + + +/* + * 64bit Tasks + * System call entries via syscall only: + * + * esp -> x86_saved_state64_t + * cr3 -> kernel directory + * esp -> low based stack + * gs -> CPU_DATA_GS + * cs -> KERNEL32_CS + * ss/ds/es -> KERNEL_DS + * + * interrupts disabled + * direction flag cleared + */ + +Entry(lo_syscall) + TIME_TRAP_UENTRY + + movl %gs:CPU_KERNEL_STACK,%edi + xchgl %edi,%esp /* switch to kernel stack */ + + movl %gs:CPU_ACTIVE_THREAD,%ecx /* get current thread */ + movl TH_TASK(%ecx),%ebx /* point to current task */ + + /* Check for active vtimers in the current task */ + TASK_VTIMER_CHECK(%ebx, %ecx) + + /* + * We can be here either for a mach, unix machdep or diag syscall, + * as indicated by the syscall class: + */ + movl R64_RAX(%edi), %eax /* syscall number/class */ + movl %eax, %edx + andl $(SYSCALL_CLASS_MASK), %edx /* syscall class */ + cmpl $(SYSCALL_CLASS_MACH< - -#define SPL0 0 -#define SPL1 1 -#define SPL2 2 -#define SPL3 3 -#define SPL4 4 -#define SPL5 5 -#define SPL6 6 - -#define SPLPP 5 -#define SPLTTY 6 -#define SPLNI 6 - -#define IPLHI 8 -#define SPLHI IPLHI - -#if MACH_KPROF -#define SPL7 7 -#else -#define SPL7 IPLHI -#endif - -#define SPL_CMP_GT(a, b) ((unsigned)(a) > (unsigned)(b)) -#define SPL_CMP_LT(a, b) ((unsigned)(a) < 
(unsigned)(b)) -#define SPL_CMP_GE(a, b) ((unsigned)(a) >= (unsigned)(b)) -#define SPL_CMP_LE(a, b) ((unsigned)(a) <= (unsigned)(b)) - diff --git a/osfmk/i386/lapic.c b/osfmk/i386/lapic.c index e98665f04..b365d6070 100644 --- a/osfmk/i386/lapic.c +++ b/osfmk/i386/lapic.c @@ -32,16 +32,6 @@ #include #include -#include -#include -#include -#include -#include -#include - -#include -#include - #include #include #include @@ -52,53 +42,17 @@ #include #include #include -#if CONFIG_MCA -#include -#endif - -#if CONFIG_COUNTERS -#include -#endif - -#if MACH_KDB -#include -#endif #include -#if MP_DEBUG -#define PAUSE delay(1000000) -#define DBG(x...) kprintf(x) -#else -#define DBG(x...) -#define PAUSE -#endif /* MP_DEBUG */ - /* Base vector for local APIC interrupt sources */ int lapic_interrupt_base = LAPIC_DEFAULT_INTERRUPT_BASE; -lapic_ops_table_t *lapic_ops; /* Lapic operations switch */ - #define MAX_LAPICIDS (LAPIC_ID_MAX+1) int lapic_to_cpu[MAX_LAPICIDS]; int cpu_to_lapic[MAX_CPUS]; -static vm_offset_t lapic_pbase; /* Physical base memory-mapped regs */ -static vm_offset_t lapic_vbase; /* Virtual base memory-mapped regs */ - -static i386_intr_func_t lapic_intr_func[LAPIC_FUNC_TABLE_SIZE]; - -/* TRUE if local APIC was enabled by the OS not by the BIOS */ -static boolean_t lapic_os_enabled = FALSE; - -static boolean_t lapic_errors_masked = FALSE; -static uint64_t lapic_last_master_error = 0; -static uint64_t lapic_error_time_threshold = 0; -static unsigned lapic_master_error_count = 0; -static unsigned lapic_error_count_threshold = 5; -static boolean_t lapic_dont_panic = FALSE; - -static void +void lapic_cpu_map_init(void) { int i; @@ -146,821 +100,3 @@ ml_get_cpuid(uint32_t lapic_index) return (uint32_t)lapic_to_cpu[lapic_index]; } - -#ifdef MP_DEBUG -void -lapic_cpu_map_dump(void) -{ - int i; - - for (i = 0; i < MAX_CPUS; i++) { - if (cpu_to_lapic[i] == -1) - continue; - kprintf("cpu_to_lapic[%d]: %d\n", - i, cpu_to_lapic[i]); - } - for (i = 0; i < MAX_LAPICIDS; i++) { 
- if (lapic_to_cpu[i] == -1) - continue; - kprintf("lapic_to_cpu[%d]: %d\n", - i, lapic_to_cpu[i]); - } -} -#endif /* MP_DEBUG */ - -static void -legacy_init(void) -{ - int result; - vm_map_entry_t entry; - - /* Establish a map to the local apic */ - lapic_vbase = (vm_offset_t)vm_map_min(kernel_map); - result = vm_map_find_space(kernel_map, - (vm_map_address_t *) &lapic_vbase, - round_page(LAPIC_SIZE), 0, - VM_MAKE_TAG(VM_MEMORY_IOKIT), &entry); - if (result != KERN_SUCCESS) { - panic("legacy_init: vm_map_find_entry FAILED (err=%d)", result); - } - vm_map_unlock(kernel_map); -/* Map in the local APIC non-cacheable, as recommended by Intel - * in section 8.4.1 of the "System Programming Guide". - */ - pmap_enter(pmap_kernel(), - lapic_vbase, - (ppnum_t) i386_btop(lapic_pbase), - VM_PROT_READ|VM_PROT_WRITE, - VM_WIMG_IO, - TRUE); -} - - -static uint32_t -legacy_read(lapic_register_t reg) -{ - return *LAPIC_MMIO(reg); -} - -static void -legacy_write(lapic_register_t reg, uint32_t value) -{ - *LAPIC_MMIO(reg) = value; -} - -static lapic_ops_table_t legacy_ops = { - legacy_init, - legacy_read, - legacy_write -}; - -static void -x2apic_init(void) -{ -} - -static uint32_t -x2apic_read(lapic_register_t reg) -{ - uint32_t lo; - uint32_t hi; - - rdmsr(LAPIC_MSR(reg), lo, hi); - return lo; -} - -static void -x2apic_write(lapic_register_t reg, uint32_t value) -{ - wrmsr(LAPIC_MSR(reg), value, 0); -} - -static lapic_ops_table_t x2apic_ops = { - x2apic_init, - x2apic_read, - x2apic_write -}; - - -void -lapic_init(void) -{ - uint32_t lo; - uint32_t hi; - boolean_t is_boot_processor; - boolean_t is_lapic_enabled; - boolean_t is_x2apic; - - /* Examine the local APIC state */ - rdmsr(MSR_IA32_APIC_BASE, lo, hi); - is_boot_processor = (lo & MSR_IA32_APIC_BASE_BSP) != 0; - is_lapic_enabled = (lo & MSR_IA32_APIC_BASE_ENABLE) != 0; - is_x2apic = (lo & MSR_IA32_APIC_BASE_EXTENDED) != 0; - lapic_pbase = (lo & MSR_IA32_APIC_BASE_BASE); - kprintf("MSR_IA32_APIC_BASE %p %s %s mode %s\n", 
(void *) lapic_pbase, - is_lapic_enabled ? "enabled" : "disabled", - is_x2apic ? "extended" : "legacy", - is_boot_processor ? "BSP" : "AP"); - if (!is_boot_processor || !is_lapic_enabled) - panic("Unexpected local APIC state\n"); - - lapic_ops = is_x2apic ? &x2apic_ops : &legacy_ops; - - lapic_ops->init(); - - if ((LAPIC_READ(VERSION)&LAPIC_VERSION_MASK) < 0x14) { - panic("Local APIC version 0x%x, 0x14 or more expected\n", - (LAPIC_READ(VERSION)&LAPIC_VERSION_MASK)); - } - - /* Set up the lapic_id <-> cpu_number map and add this boot processor */ - lapic_cpu_map_init(); - lapic_cpu_map((LAPIC_READ(ID)>>LAPIC_ID_SHIFT)&LAPIC_ID_MASK, 0); - kprintf("Boot cpu local APIC id 0x%x\n", cpu_to_lapic[0]); -} - - -static int -lapic_esr_read(void) -{ - /* write-read register */ - LAPIC_WRITE(ERROR_STATUS, 0); - return LAPIC_READ(ERROR_STATUS); -} - -static void -lapic_esr_clear(void) -{ - LAPIC_WRITE(ERROR_STATUS, 0); - LAPIC_WRITE(ERROR_STATUS, 0); -} - -static const char *DM_str[8] = { - "Fixed", - "Lowest Priority", - "Invalid", - "Invalid", - "NMI", - "Reset", - "Invalid", - "ExtINT"}; - -static const char *TMR_str[] = { - "OneShot", - "Periodic", - "TSC-Deadline", - "Illegal" -}; - -void -lapic_dump(void) -{ - int i; - -#define BOOL(a) ((a)?' ':'!') -#define VEC(lvt) \ - LAPIC_READ(lvt)&LAPIC_LVT_VECTOR_MASK -#define DS(lvt) \ - (LAPIC_READ(lvt)&LAPIC_LVT_DS_PENDING)?" SendPending" : "Idle" -#define DM(lvt) \ - DM_str[(LAPIC_READ(lvt)>>LAPIC_LVT_DM_SHIFT)&LAPIC_LVT_DM_MASK] -#define MASK(lvt) \ - BOOL(LAPIC_READ(lvt)&LAPIC_LVT_MASKED) -#define TM(lvt) \ - (LAPIC_READ(lvt)&LAPIC_LVT_TM_LEVEL)? "Level" : "Edge" -#define IP(lvt) \ - (LAPIC_READ(lvt)&LAPIC_LVT_IP_PLRITY_LOW)? 
"Low " : "High" - - kprintf("LAPIC %d at %p version 0x%x\n", - (LAPIC_READ(ID)>>LAPIC_ID_SHIFT)&LAPIC_ID_MASK, - (void *) lapic_vbase, - LAPIC_READ(VERSION)&LAPIC_VERSION_MASK); - kprintf("Priorities: Task 0x%x Arbitration 0x%x Processor 0x%x\n", - LAPIC_READ(TPR)&LAPIC_TPR_MASK, - LAPIC_READ(APR)&LAPIC_APR_MASK, - LAPIC_READ(PPR)&LAPIC_PPR_MASK); - kprintf("Destination Format 0x%x Logical Destination 0x%x\n", - LAPIC_READ(DFR)>>LAPIC_DFR_SHIFT, - LAPIC_READ(LDR)>>LAPIC_LDR_SHIFT); - kprintf("%cEnabled %cFocusChecking SV 0x%x\n", - BOOL(LAPIC_READ(SVR)&LAPIC_SVR_ENABLE), - BOOL(!(LAPIC_READ(SVR)&LAPIC_SVR_FOCUS_OFF)), - LAPIC_READ(SVR) & LAPIC_SVR_MASK); -#if CONFIG_MCA - if (mca_is_cmci_present()) - kprintf("LVT_CMCI: Vector 0x%02x [%s] %s %cmasked\n", - VEC(LVT_CMCI), - DM(LVT_CMCI), - DS(LVT_CMCI), - MASK(LVT_CMCI)); -#endif - kprintf("LVT_TIMER: Vector 0x%02x %s %cmasked %s\n", - VEC(LVT_TIMER), - DS(LVT_TIMER), - MASK(LVT_TIMER), - TMR_str[(LAPIC_READ(LVT_TIMER) >> LAPIC_LVT_TMR_SHIFT) - & LAPIC_LVT_TMR_MASK]); - kprintf(" Initial Count: 0x%08x \n", LAPIC_READ(TIMER_INITIAL_COUNT)); - kprintf(" Current Count: 0x%08x \n", LAPIC_READ(TIMER_CURRENT_COUNT)); - kprintf(" Divide Config: 0x%08x \n", LAPIC_READ(TIMER_DIVIDE_CONFIG)); - kprintf("LVT_PERFCNT: Vector 0x%02x [%s] %s %cmasked\n", - VEC(LVT_PERFCNT), - DM(LVT_PERFCNT), - DS(LVT_PERFCNT), - MASK(LVT_PERFCNT)); - kprintf("LVT_THERMAL: Vector 0x%02x [%s] %s %cmasked\n", - VEC(LVT_THERMAL), - DM(LVT_THERMAL), - DS(LVT_THERMAL), - MASK(LVT_THERMAL)); - kprintf("LVT_LINT0: Vector 0x%02x [%s][%s][%s] %s %cmasked\n", - VEC(LVT_LINT0), - DM(LVT_LINT0), - TM(LVT_LINT0), - IP(LVT_LINT0), - DS(LVT_LINT0), - MASK(LVT_LINT0)); - kprintf("LVT_LINT1: Vector 0x%02x [%s][%s][%s] %s %cmasked\n", - VEC(LVT_LINT1), - DM(LVT_LINT1), - TM(LVT_LINT1), - IP(LVT_LINT1), - DS(LVT_LINT1), - MASK(LVT_LINT1)); - kprintf("LVT_ERROR: Vector 0x%02x %s %cmasked\n", - VEC(LVT_ERROR), - DS(LVT_ERROR), - MASK(LVT_ERROR)); - kprintf("ESR: %08x 
\n", lapic_esr_read()); - kprintf(" "); - for(i=0xf; i>=0; i--) - kprintf("%x%x%x%x",i,i,i,i); - kprintf("\n"); - kprintf("TMR: 0x"); - for(i=7; i>=0; i--) - kprintf("%08x",LAPIC_READ_OFFSET(TMR_BASE, i)); - kprintf("\n"); - kprintf("IRR: 0x"); - for(i=7; i>=0; i--) - kprintf("%08x",LAPIC_READ_OFFSET(IRR_BASE, i)); - kprintf("\n"); - kprintf("ISR: 0x"); - for(i=7; i >= 0; i--) - kprintf("%08x",LAPIC_READ_OFFSET(ISR_BASE, i)); - kprintf("\n"); -} - -#if MACH_KDB -/* - * Displays apic junk - * - * da - */ -void -db_apic(__unused db_expr_t addr, - __unused int have_addr, - __unused db_expr_t count, - __unused char *modif) -{ - - lapic_dump(); - - return; -} - -#endif - -boolean_t -lapic_probe(void) -{ - uint32_t lo; - uint32_t hi; - - if (cpuid_features() & CPUID_FEATURE_APIC) - return TRUE; - - if (cpuid_family() == 6 || cpuid_family() == 15) { - /* - * Mobile Pentiums: - * There may be a local APIC which wasn't enabled by BIOS. - * So we try to enable it explicitly. - */ - rdmsr(MSR_IA32_APIC_BASE, lo, hi); - lo &= ~MSR_IA32_APIC_BASE_BASE; - lo |= MSR_IA32_APIC_BASE_ENABLE | LAPIC_START; - lo |= MSR_IA32_APIC_BASE_ENABLE; - wrmsr(MSR_IA32_APIC_BASE, lo, hi); - - /* - * Re-initialize cpu features info and re-check. 
- */ - cpuid_set_info(); - if (cpuid_features() & CPUID_FEATURE_APIC) { - printf("Local APIC discovered and enabled\n"); - lapic_os_enabled = TRUE; - lapic_interrupt_base = LAPIC_REDUCED_INTERRUPT_BASE; - return TRUE; - } - } - - return FALSE; -} - -void -lapic_shutdown(void) -{ - uint32_t lo; - uint32_t hi; - uint32_t value; - - /* Shutdown if local APIC was enabled by OS */ - if (lapic_os_enabled == FALSE) - return; - - mp_disable_preemption(); - - /* ExtINT: masked */ - if (get_cpu_number() == master_cpu) { - value = LAPIC_READ(LVT_LINT0); - value |= LAPIC_LVT_MASKED; - LAPIC_WRITE(LVT_LINT0, value); - } - - /* Error: masked */ - LAPIC_WRITE(LVT_ERROR, LAPIC_READ(LVT_ERROR) | LAPIC_LVT_MASKED); - - /* Timer: masked */ - LAPIC_WRITE(LVT_TIMER, LAPIC_READ(LVT_TIMER) | LAPIC_LVT_MASKED); - - /* Perfmon: masked */ - LAPIC_WRITE(LVT_PERFCNT, LAPIC_READ(LVT_PERFCNT) | LAPIC_LVT_MASKED); - - /* APIC software disabled */ - LAPIC_WRITE(SVR, LAPIC_READ(SVR) & ~LAPIC_SVR_ENABLE); - - /* Bypass the APIC completely and update cpu features */ - rdmsr(MSR_IA32_APIC_BASE, lo, hi); - lo &= ~MSR_IA32_APIC_BASE_ENABLE; - wrmsr(MSR_IA32_APIC_BASE, lo, hi); - cpuid_set_info(); - - mp_enable_preemption(); -} - -void -lapic_configure(void) -{ - int value; - - if (lapic_error_time_threshold == 0 && cpu_number() == 0) { - nanoseconds_to_absolutetime(NSEC_PER_SEC >> 2, &lapic_error_time_threshold); - if (!PE_parse_boot_argn("lapic_dont_panic", &lapic_dont_panic, sizeof(lapic_dont_panic))) { - lapic_dont_panic = FALSE; - } - } - - /* Set flat delivery model, logical processor id */ - LAPIC_WRITE(DFR, LAPIC_DFR_FLAT); - LAPIC_WRITE(LDR, (get_cpu_number()) << LAPIC_LDR_SHIFT); - - /* Accept all */ - LAPIC_WRITE(TPR, 0); - - LAPIC_WRITE(SVR, LAPIC_VECTOR(SPURIOUS) | LAPIC_SVR_ENABLE); - - /* ExtINT */ - if (get_cpu_number() == master_cpu) { - value = LAPIC_READ(LVT_LINT0); - value &= ~LAPIC_LVT_MASKED; - value |= LAPIC_LVT_DM_EXTINT; - LAPIC_WRITE(LVT_LINT0, value); - } - - /* Timer: 
unmasked, one-shot */ - LAPIC_WRITE(LVT_TIMER, LAPIC_VECTOR(TIMER)); - - /* Perfmon: unmasked */ - LAPIC_WRITE(LVT_PERFCNT, LAPIC_VECTOR(PERFCNT)); - - /* Thermal: unmasked */ - LAPIC_WRITE(LVT_THERMAL, LAPIC_VECTOR(THERMAL)); - -#if CONFIG_MCA - /* CMCI, if available */ - if (mca_is_cmci_present()) - LAPIC_WRITE(LVT_CMCI, LAPIC_VECTOR(CMCI)); -#endif - - if (((cpu_number() == master_cpu) && lapic_errors_masked == FALSE) || - (cpu_number() != master_cpu)) { - lapic_esr_clear(); - LAPIC_WRITE(LVT_ERROR, LAPIC_VECTOR(ERROR)); - } -} - -void -lapic_set_timer( - boolean_t interrupt_unmasked, - lapic_timer_mode_t mode, - lapic_timer_divide_t divisor, - lapic_timer_count_t initial_count) -{ - uint32_t timer_vector; - - mp_disable_preemption(); - timer_vector = LAPIC_READ(LVT_TIMER); - timer_vector &= ~(LAPIC_LVT_MASKED|LAPIC_LVT_PERIODIC);; - timer_vector |= interrupt_unmasked ? 0 : LAPIC_LVT_MASKED; - timer_vector |= (mode == periodic) ? LAPIC_LVT_PERIODIC : 0; - LAPIC_WRITE(LVT_TIMER, timer_vector); - LAPIC_WRITE(TIMER_DIVIDE_CONFIG, divisor); - LAPIC_WRITE(TIMER_INITIAL_COUNT, initial_count); - mp_enable_preemption(); -} - -void -lapic_config_timer( - boolean_t interrupt_unmasked, - lapic_timer_mode_t mode, - lapic_timer_divide_t divisor) -{ - uint32_t timer_vector; - - mp_disable_preemption(); - timer_vector = LAPIC_READ(LVT_TIMER); - timer_vector &= ~(LAPIC_LVT_MASKED | - LAPIC_LVT_PERIODIC | - LAPIC_LVT_TSC_DEADLINE); - timer_vector |= interrupt_unmasked ? 0 : LAPIC_LVT_MASKED; - timer_vector |= (mode == periodic) ? LAPIC_LVT_PERIODIC : 0; - LAPIC_WRITE(LVT_TIMER, timer_vector); - LAPIC_WRITE(TIMER_DIVIDE_CONFIG, divisor); - mp_enable_preemption(); -} - -/* - * Configure TSC-deadline timer mode. The lapic interrupt is always unmasked. 
- */ -void -lapic_config_tsc_deadline_timer(void) -{ - uint32_t timer_vector; - - DBG("lapic_config_tsc_deadline_timer()\n"); - mp_disable_preemption(); - timer_vector = LAPIC_READ(LVT_TIMER); - timer_vector &= ~(LAPIC_LVT_MASKED | - LAPIC_LVT_PERIODIC); - timer_vector |= LAPIC_LVT_TSC_DEADLINE; - LAPIC_WRITE(LVT_TIMER, timer_vector); - - /* Serialize writes per Intel OSWG */ - do { - lapic_set_tsc_deadline_timer(rdtsc64() + (1ULL<<32)); - } while (lapic_get_tsc_deadline_timer() == 0); - lapic_set_tsc_deadline_timer(0); - - mp_enable_preemption(); - DBG("lapic_config_tsc_deadline_timer() done\n"); -} - -void -lapic_set_timer_fast( - lapic_timer_count_t initial_count) -{ - LAPIC_WRITE(LVT_TIMER, LAPIC_READ(LVT_TIMER) & ~LAPIC_LVT_MASKED); - LAPIC_WRITE(TIMER_INITIAL_COUNT, initial_count); -} - -void -lapic_set_tsc_deadline_timer(uint64_t deadline) -{ - wrmsr64(MSR_IA32_TSC_DEADLINE, deadline); -} - -uint64_t -lapic_get_tsc_deadline_timer(void) -{ - return rdmsr64(MSR_IA32_TSC_DEADLINE); -} - -void -lapic_get_timer( - lapic_timer_mode_t *mode, - lapic_timer_divide_t *divisor, - lapic_timer_count_t *initial_count, - lapic_timer_count_t *current_count) -{ - mp_disable_preemption(); - if (mode) - *mode = (LAPIC_READ(LVT_TIMER) & LAPIC_LVT_PERIODIC) ? - periodic : one_shot; - if (divisor) - *divisor = LAPIC_READ(TIMER_DIVIDE_CONFIG) & LAPIC_TIMER_DIVIDE_MASK; - if (initial_count) - *initial_count = LAPIC_READ(TIMER_INITIAL_COUNT); - if (current_count) - *current_count = LAPIC_READ(TIMER_CURRENT_COUNT); - mp_enable_preemption(); -} - -static inline void -_lapic_end_of_interrupt(void) -{ - LAPIC_WRITE(EOI, 0); -} - -void -lapic_end_of_interrupt(void) -{ - _lapic_end_of_interrupt(); -} - -void lapic_unmask_perfcnt_interrupt(void) { - LAPIC_WRITE(LVT_PERFCNT, LAPIC_VECTOR(PERFCNT)); -} - -void lapic_set_perfcnt_interrupt_mask(boolean_t mask) { - uint32_t m = (mask ? 
LAPIC_LVT_MASKED : 0); - LAPIC_WRITE(LVT_PERFCNT, LAPIC_VECTOR(PERFCNT) | m); -} - -void -lapic_set_intr_func(int vector, i386_intr_func_t func) -{ - if (vector > lapic_interrupt_base) - vector -= lapic_interrupt_base; - - switch (vector) { - case LAPIC_NMI_INTERRUPT: - case LAPIC_INTERPROCESSOR_INTERRUPT: - case LAPIC_TIMER_INTERRUPT: - case LAPIC_THERMAL_INTERRUPT: - case LAPIC_PERFCNT_INTERRUPT: - case LAPIC_CMCI_INTERRUPT: - case LAPIC_PM_INTERRUPT: - lapic_intr_func[vector] = func; - break; - default: - panic("lapic_set_intr_func(%d,%p) invalid vector\n", - vector, func); - } -} - -void lapic_set_pmi_func(i386_intr_func_t func) { - lapic_set_intr_func(LAPIC_VECTOR(PERFCNT), func); -} - -int -lapic_interrupt(int interrupt_num, x86_saved_state_t *state) -{ - int retval = 0; - int esr = -1; - - interrupt_num -= lapic_interrupt_base; - if (interrupt_num < 0) { - if (interrupt_num == (LAPIC_NMI_INTERRUPT - lapic_interrupt_base) && - lapic_intr_func[LAPIC_NMI_INTERRUPT] != NULL) { - retval = (*lapic_intr_func[LAPIC_NMI_INTERRUPT])(state); - return retval; - } - else - return 0; - } - - switch(interrupt_num) { - case LAPIC_TIMER_INTERRUPT: - case LAPIC_THERMAL_INTERRUPT: - case LAPIC_INTERPROCESSOR_INTERRUPT: - case LAPIC_PM_INTERRUPT: - if (lapic_intr_func[interrupt_num] != NULL) - (void) (*lapic_intr_func[interrupt_num])(state); - _lapic_end_of_interrupt(); - retval = 1; - break; - case LAPIC_PERFCNT_INTERRUPT: - /* If a function has been registered, invoke it. Otherwise, - * pass up to IOKit. - */ - if (lapic_intr_func[interrupt_num] != NULL) { - (void) (*lapic_intr_func[interrupt_num])(state); - /* Unmask the interrupt since we don't expect legacy users - * to be responsible for it. 
- */ - lapic_unmask_perfcnt_interrupt(); - _lapic_end_of_interrupt(); - retval = 1; - } - break; - case LAPIC_CMCI_INTERRUPT: - if (lapic_intr_func[interrupt_num] != NULL) - (void) (*lapic_intr_func[interrupt_num])(state); - /* return 0 for plaform expert to handle */ - break; - case LAPIC_ERROR_INTERRUPT: - /* We treat error interrupts on APs as fatal. - * The current interrupt steering scheme directs most - * external interrupts to the BSP (HPET interrupts being - * a notable exception); hence, such an error - * on an AP may signify LVT corruption (with "may" being - * the operative word). On the BSP, we adopt a more - * lenient approach, in the interests of enhancing - * debuggability and reducing fragility. - * If "lapic_error_count_threshold" error interrupts - * occur within "lapic_error_time_threshold" absolute - * time units, we mask the error vector and log. The - * error interrupts themselves are likely - * side effects of issues which are beyond the purview of - * the local APIC interrupt handler, however. The Error - * Status Register value (the illegal destination - * vector code is one observed in practice) indicates - * the immediate cause of the error. 
- */ - esr = lapic_esr_read(); - lapic_dump(); - - if ((debug_boot_arg && (lapic_dont_panic == FALSE)) || - cpu_number() != master_cpu) { - panic("Local APIC error, ESR: %d\n", esr); - } - - if (cpu_number() == master_cpu) { - uint64_t abstime = mach_absolute_time(); - if ((abstime - lapic_last_master_error) < lapic_error_time_threshold) { - if (lapic_master_error_count++ > lapic_error_count_threshold) { - lapic_errors_masked = TRUE; - LAPIC_WRITE(LVT_ERROR, LAPIC_READ(LVT_ERROR) | LAPIC_LVT_MASKED); - printf("Local APIC: errors masked\n"); - } - } - else { - lapic_last_master_error = abstime; - lapic_master_error_count = 0; - } - printf("Local APIC error on master CPU, ESR: %d, error count this run: %d\n", esr, lapic_master_error_count); - } - - _lapic_end_of_interrupt(); - retval = 1; - break; - case LAPIC_SPURIOUS_INTERRUPT: - kprintf("SPIV\n"); - /* No EOI required here */ - retval = 1; - break; - case LAPIC_PMC_SW_INTERRUPT: - { -#if CONFIG_COUNTERS - thread_t old, new; - ml_get_csw_threads(&old, &new); - - if (pmc_context_switch(old, new) == TRUE) { - retval = 1; - /* No EOI required for SWI */ - } -#endif /* CONFIG_COUNTERS */ - } - break; - } - - return retval; -} - -void -lapic_smm_restore(void) -{ - boolean_t state; - - if (lapic_os_enabled == FALSE) - return; - - state = ml_set_interrupts_enabled(FALSE); - - if (LAPIC_ISR_IS_SET(LAPIC_REDUCED_INTERRUPT_BASE, TIMER)) { - /* - * Bogus SMI handler enables interrupts but does not know about - * local APIC interrupt sources. When APIC timer counts down to - * zero while in SMM, local APIC will end up waiting for an EOI - * but no interrupt was delivered to the OS. - */ - _lapic_end_of_interrupt(); - - /* - * timer is one-shot, trigger another quick countdown to trigger - * another timer interrupt. 
- */ - if (LAPIC_READ(TIMER_CURRENT_COUNT) == 0) { - LAPIC_WRITE(TIMER_INITIAL_COUNT, 1); - } - - kprintf("lapic_smm_restore\n"); - } - - ml_set_interrupts_enabled(state); -} - -void -lapic_send_ipi(int cpu, int vector) -{ - boolean_t state; - - if (vector < lapic_interrupt_base) - vector += lapic_interrupt_base; - - state = ml_set_interrupts_enabled(FALSE); - - /* Wait for pending outgoing send to complete */ - while (LAPIC_READ(ICR) & LAPIC_ICR_DS_PENDING) { - cpu_pause(); - } - - LAPIC_WRITE(ICRD, cpu_to_lapic[cpu] << LAPIC_ICRD_DEST_SHIFT); - LAPIC_WRITE(ICR, vector | LAPIC_ICR_DM_FIXED); - - (void) ml_set_interrupts_enabled(state); -} - -/* - * The following interfaces are privately exported to AICPM. - */ - -boolean_t -lapic_is_interrupt_pending(void) -{ - int i; - - for (i = 0; i < 8; i += 1) { - if ((LAPIC_READ_OFFSET(IRR_BASE, i) != 0) || - (LAPIC_READ_OFFSET(ISR_BASE, i) != 0)) - return (TRUE); - } - - return (FALSE); -} - -boolean_t -lapic_is_interrupting(uint8_t vector) -{ - int i; - int bit; - uint32_t irr; - uint32_t isr; - - i = vector / 32; - bit = 1 << (vector % 32); - - irr = LAPIC_READ_OFFSET(IRR_BASE, i); - isr = LAPIC_READ_OFFSET(ISR_BASE, i); - - if ((irr | isr) & bit) - return (TRUE); - - return (FALSE); -} - -void -lapic_interrupt_counts(uint64_t intrs[256]) -{ - int i; - int j; - int bit; - uint32_t irr; - uint32_t isr; - - if (intrs == NULL) - return; - - for (i = 0; i < 8; i += 1) { - irr = LAPIC_READ_OFFSET(IRR_BASE, i); - isr = LAPIC_READ_OFFSET(ISR_BASE, i); - - if ((isr | irr) == 0) - continue; - - for (j = (i == 0) ? 16 : 0; j < 32; j += 1) { - bit = (32 * i) + j; - if ((isr | irr) & (1 << j)) - intrs[bit] += 1; - } - } -} - -void -lapic_disable_timer(void) -{ - uint32_t lvt_timer; - - /* - * If we're in deadline timer mode, - * simply clear the deadline timer, otherwise - * mask the timer interrupt and clear the countdown. 
- */ - lvt_timer = LAPIC_READ(LVT_TIMER); - if (lvt_timer & LAPIC_LVT_TSC_DEADLINE) { - wrmsr64(MSR_IA32_TSC_DEADLINE, 0); - } else { - LAPIC_WRITE(LVT_TIMER, lvt_timer | LAPIC_LVT_MASKED); - LAPIC_WRITE(TIMER_INITIAL_COUNT, 0); - lvt_timer = LAPIC_READ(LVT_TIMER); - } -} diff --git a/osfmk/i386/lapic.h b/osfmk/i386/lapic.h index 655864230..9d6f53abb 100644 --- a/osfmk/i386/lapic.h +++ b/osfmk/i386/lapic.h @@ -246,6 +246,7 @@ extern void lapic_shutdown(void); extern void lapic_smm_restore(void); extern boolean_t lapic_probe(void); extern void lapic_dump(void); +extern void lapic_cpu_map_dump(void); extern int lapic_interrupt( int interrupt, x86_saved_state_t *state); extern void lapic_end_of_interrupt(void); @@ -256,6 +257,7 @@ extern void lapic_send_ipi(int cpu, int interupt); extern int lapic_to_cpu[]; extern int cpu_to_lapic[]; extern int lapic_interrupt_base; +extern void lapic_cpu_map_init(void); extern void lapic_cpu_map(int lapic, int cpu_num); extern uint32_t ml_get_apicid(uint32_t cpu); extern uint32_t ml_get_cpuid(uint32_t lapic_index); diff --git a/osfmk/i386/lapic_native.c b/osfmk/i386/lapic_native.c new file mode 100644 index 000000000..7142be269 --- /dev/null +++ b/osfmk/i386/lapic_native.c @@ -0,0 +1,919 @@ +/* + * Copyright (c) 2008-2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. 
+ * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * @OSF_COPYRIGHT@ + */ + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if CONFIG_MCA +#include +#endif + +#if CONFIG_COUNTERS +#include +#endif + +#if MACH_KDB +#include +#endif + +#include + +#if MP_DEBUG +#define PAUSE delay(1000000) +#define DBG(x...) kprintf(x) +#else +#define DBG(x...) 
+#define PAUSE +#endif /* MP_DEBUG */ + +lapic_ops_table_t *lapic_ops; /* Lapic operations switch */ + +static vm_map_offset_t lapic_pbase; /* Physical base memory-mapped regs */ +static vm_offset_t lapic_vbase; /* Virtual base memory-mapped regs */ + +static i386_intr_func_t lapic_intr_func[LAPIC_FUNC_TABLE_SIZE]; + +/* TRUE if local APIC was enabled by the OS not by the BIOS */ +static boolean_t lapic_os_enabled = FALSE; + +static boolean_t lapic_errors_masked = FALSE; +static uint64_t lapic_last_master_error = 0; +static uint64_t lapic_error_time_threshold = 0; +static unsigned lapic_master_error_count = 0; +static unsigned lapic_error_count_threshold = 5; +static boolean_t lapic_dont_panic = FALSE; + +#ifdef MP_DEBUG +void +lapic_cpu_map_dump(void) +{ + int i; + + for (i = 0; i < MAX_CPUS; i++) { + if (cpu_to_lapic[i] == -1) + continue; + kprintf("cpu_to_lapic[%d]: %d\n", + i, cpu_to_lapic[i]); + } + for (i = 0; i < MAX_LAPICIDS; i++) { + if (lapic_to_cpu[i] == -1) + continue; + kprintf("lapic_to_cpu[%d]: %d\n", + i, lapic_to_cpu[i]); + } +} +#endif /* MP_DEBUG */ + +static void +legacy_init(void) +{ + int result; + vm_map_entry_t entry; + vm_map_offset_t lapic_vbase64; + /* Establish a map to the local apic */ + + lapic_vbase64 = (vm_offset_t)vm_map_min(kernel_map); + result = vm_map_find_space(kernel_map, + &lapic_vbase64, + round_page(LAPIC_SIZE), 0, + VM_MAKE_TAG(VM_MEMORY_IOKIT), &entry); + /* Convert 64-bit vm_map_offset_t to "pointer sized" vm_offset_t + */ + lapic_vbase = (vm_offset_t) lapic_vbase64; + if (result != KERN_SUCCESS) { + panic("legacy_init: vm_map_find_space FAILED (err=%d)", result); + } + vm_map_unlock(kernel_map); +/* Map in the local APIC non-cacheable, as recommended by Intel + * in section 8.4.1 of the "System Programming Guide".
+ */ + pmap_enter(pmap_kernel(), + lapic_vbase, + (ppnum_t) i386_btop(lapic_pbase), + VM_PROT_READ|VM_PROT_WRITE, + VM_WIMG_IO, + TRUE); +} + + +static uint32_t +legacy_read(lapic_register_t reg) +{ + return *LAPIC_MMIO(reg); +} + +static void +legacy_write(lapic_register_t reg, uint32_t value) +{ + *LAPIC_MMIO(reg) = value; +} + +static lapic_ops_table_t legacy_ops = { + legacy_init, + legacy_read, + legacy_write +}; + +static void +x2apic_init(void) +{ +} + +static uint32_t +x2apic_read(lapic_register_t reg) +{ + uint32_t lo; + uint32_t hi; + + rdmsr(LAPIC_MSR(reg), lo, hi); + return lo; +} + +static void +x2apic_write(lapic_register_t reg, uint32_t value) +{ + wrmsr(LAPIC_MSR(reg), value, 0); +} + +static lapic_ops_table_t x2apic_ops = { + x2apic_init, + x2apic_read, + x2apic_write +}; + + +void +lapic_init(void) +{ + uint32_t lo; + uint32_t hi; + boolean_t is_boot_processor; + boolean_t is_lapic_enabled; + boolean_t is_x2apic; + + /* Examine the local APIC state */ + rdmsr(MSR_IA32_APIC_BASE, lo, hi); + is_boot_processor = (lo & MSR_IA32_APIC_BASE_BSP) != 0; + is_lapic_enabled = (lo & MSR_IA32_APIC_BASE_ENABLE) != 0; + is_x2apic = (lo & MSR_IA32_APIC_BASE_EXTENDED) != 0; + lapic_pbase = (lo & MSR_IA32_APIC_BASE_BASE); + kprintf("MSR_IA32_APIC_BASE 0x%llx %s %s mode %s\n", lapic_pbase, + is_lapic_enabled ? "enabled" : "disabled", + is_x2apic ? "extended" : "legacy", + is_boot_processor ? "BSP" : "AP"); + if (!is_boot_processor || !is_lapic_enabled) + panic("Unexpected local APIC state\n"); + + lapic_ops = is_x2apic ? 
&x2apic_ops : &legacy_ops; + + lapic_ops->init(); + + if ((LAPIC_READ(VERSION)&LAPIC_VERSION_MASK) < 0x14) { + panic("Local APIC version 0x%x, 0x14 or more expected\n", + (LAPIC_READ(VERSION)&LAPIC_VERSION_MASK)); + } + + /* Set up the lapic_id <-> cpu_number map and add this boot processor */ + lapic_cpu_map_init(); + lapic_cpu_map((LAPIC_READ(ID)>>LAPIC_ID_SHIFT)&LAPIC_ID_MASK, 0); + kprintf("Boot cpu local APIC id 0x%x\n", cpu_to_lapic[0]); +} + + +static int +lapic_esr_read(void) +{ + /* write-read register */ + LAPIC_WRITE(ERROR_STATUS, 0); + return LAPIC_READ(ERROR_STATUS); +} + +static void +lapic_esr_clear(void) +{ + LAPIC_WRITE(ERROR_STATUS, 0); + LAPIC_WRITE(ERROR_STATUS, 0); +} + +static const char *DM_str[8] = { + "Fixed", + "Lowest Priority", + "Invalid", + "Invalid", + "NMI", + "Reset", + "Invalid", + "ExtINT"}; + +/* Timer-mode names; lapic_dump() indexes this with (LVT_TIMER >> LAPIC_LVT_TMR_SHIFT) & LAPIC_LVT_TMR_MASK, one entry per value */ +static const char *TMR_str[] = { + "OneShot", + "Periodic", + "TSC-Deadline", + "Illegal" +}; + +void +lapic_dump(void) +{ + int i; + +#define BOOL(a) ((a)?' ':'!') +#define VEC(lvt) \ + LAPIC_READ(lvt)&LAPIC_LVT_VECTOR_MASK +#define DS(lvt) \ + (LAPIC_READ(lvt)&LAPIC_LVT_DS_PENDING)?" SendPending" : "Idle" +#define DM(lvt) \ + DM_str[(LAPIC_READ(lvt)>>LAPIC_LVT_DM_SHIFT)&LAPIC_LVT_DM_MASK] +#define MASK(lvt) \ + BOOL(LAPIC_READ(lvt)&LAPIC_LVT_MASKED) +#define TM(lvt) \ + (LAPIC_READ(lvt)&LAPIC_LVT_TM_LEVEL)? "Level" : "Edge" +#define IP(lvt) \ + (LAPIC_READ(lvt)&LAPIC_LVT_IP_PLRITY_LOW)?
"Low " : "High" + + kprintf("LAPIC %d at %p version 0x%x\n", + (LAPIC_READ(ID)>>LAPIC_ID_SHIFT)&LAPIC_ID_MASK, + (void *) lapic_vbase, + LAPIC_READ(VERSION)&LAPIC_VERSION_MASK); + kprintf("Priorities: Task 0x%x Arbitration 0x%x Processor 0x%x\n", + LAPIC_READ(TPR)&LAPIC_TPR_MASK, + LAPIC_READ(APR)&LAPIC_APR_MASK, + LAPIC_READ(PPR)&LAPIC_PPR_MASK); + kprintf("Destination Format 0x%x Logical Destination 0x%x\n", + LAPIC_READ(DFR)>>LAPIC_DFR_SHIFT, + LAPIC_READ(LDR)>>LAPIC_LDR_SHIFT); + kprintf("%cEnabled %cFocusChecking SV 0x%x\n", + BOOL(LAPIC_READ(SVR)&LAPIC_SVR_ENABLE), + BOOL(!(LAPIC_READ(SVR)&LAPIC_SVR_FOCUS_OFF)), + LAPIC_READ(SVR) & LAPIC_SVR_MASK); +#if CONFIG_MCA + if (mca_is_cmci_present()) + kprintf("LVT_CMCI: Vector 0x%02x [%s] %s %cmasked\n", + VEC(LVT_CMCI), + DM(LVT_CMCI), + DS(LVT_CMCI), + MASK(LVT_CMCI)); +#endif + kprintf("LVT_TIMER: Vector 0x%02x %s %cmasked %s\n", + VEC(LVT_TIMER), + DS(LVT_TIMER), + MASK(LVT_TIMER), + TMR_str[(LAPIC_READ(LVT_TIMER) >> LAPIC_LVT_TMR_SHIFT) + & LAPIC_LVT_TMR_MASK]); + kprintf(" Initial Count: 0x%08x \n", LAPIC_READ(TIMER_INITIAL_COUNT)); + kprintf(" Current Count: 0x%08x \n", LAPIC_READ(TIMER_CURRENT_COUNT)); + kprintf(" Divide Config: 0x%08x \n", LAPIC_READ(TIMER_DIVIDE_CONFIG)); + kprintf("LVT_PERFCNT: Vector 0x%02x [%s] %s %cmasked\n", + VEC(LVT_PERFCNT), + DM(LVT_PERFCNT), + DS(LVT_PERFCNT), + MASK(LVT_PERFCNT)); + kprintf("LVT_THERMAL: Vector 0x%02x [%s] %s %cmasked\n", + VEC(LVT_THERMAL), + DM(LVT_THERMAL), + DS(LVT_THERMAL), + MASK(LVT_THERMAL)); + kprintf("LVT_LINT0: Vector 0x%02x [%s][%s][%s] %s %cmasked\n", + VEC(LVT_LINT0), + DM(LVT_LINT0), + TM(LVT_LINT0), + IP(LVT_LINT0), + DS(LVT_LINT0), + MASK(LVT_LINT0)); + kprintf("LVT_LINT1: Vector 0x%02x [%s][%s][%s] %s %cmasked\n", + VEC(LVT_LINT1), + DM(LVT_LINT1), + TM(LVT_LINT1), + IP(LVT_LINT1), + DS(LVT_LINT1), + MASK(LVT_LINT1)); + kprintf("LVT_ERROR: Vector 0x%02x %s %cmasked\n", + VEC(LVT_ERROR), + DS(LVT_ERROR), + MASK(LVT_ERROR)); + kprintf("ESR: %08x 
\n", lapic_esr_read()); + kprintf(" "); + for(i=0xf; i>=0; i--) + kprintf("%x%x%x%x",i,i,i,i); + kprintf("\n"); + kprintf("TMR: 0x"); + for(i=7; i>=0; i--) + kprintf("%08x",LAPIC_READ_OFFSET(TMR_BASE, i)); + kprintf("\n"); + kprintf("IRR: 0x"); + for(i=7; i>=0; i--) + kprintf("%08x",LAPIC_READ_OFFSET(IRR_BASE, i)); + kprintf("\n"); + kprintf("ISR: 0x"); + for(i=7; i >= 0; i--) + kprintf("%08x",LAPIC_READ_OFFSET(ISR_BASE, i)); + kprintf("\n"); +} + +#if MACH_KDB +/* + * Displays apic junk + * + * da + */ +void +db_apic(__unused db_expr_t addr, + __unused int have_addr, + __unused db_expr_t count, + __unused char *modif) +{ + + lapic_dump(); + + return; +} + +#endif + +boolean_t +lapic_probe(void) +{ + uint32_t lo; + uint32_t hi; + + if (cpuid_features() & CPUID_FEATURE_APIC) + return TRUE; + + if (cpuid_family() == 6 || cpuid_family() == 15) { + /* + * Mobile Pentiums: + * There may be a local APIC which wasn't enabled by BIOS. + * So we try to enable it explicitly. + */ + rdmsr(MSR_IA32_APIC_BASE, lo, hi); + lo &= ~MSR_IA32_APIC_BASE_BASE; + lo |= MSR_IA32_APIC_BASE_ENABLE | LAPIC_START; + lo |= MSR_IA32_APIC_BASE_ENABLE; + wrmsr(MSR_IA32_APIC_BASE, lo, hi); + + /* + * Re-initialize cpu features info and re-check. 
+ */ + cpuid_set_info(); + if (cpuid_features() & CPUID_FEATURE_APIC) { + printf("Local APIC discovered and enabled\n"); + lapic_os_enabled = TRUE; + lapic_interrupt_base = LAPIC_REDUCED_INTERRUPT_BASE; + return TRUE; + } + } + + return FALSE; +} + +void +lapic_shutdown(void) +{ + uint32_t lo; + uint32_t hi; + uint32_t value; + + /* Shutdown if local APIC was enabled by OS */ + if (lapic_os_enabled == FALSE) + return; + + mp_disable_preemption(); + + /* ExtINT: masked */ + if (get_cpu_number() == master_cpu) { + value = LAPIC_READ(LVT_LINT0); + value |= LAPIC_LVT_MASKED; + LAPIC_WRITE(LVT_LINT0, value); + } + + /* Error: masked */ + LAPIC_WRITE(LVT_ERROR, LAPIC_READ(LVT_ERROR) | LAPIC_LVT_MASKED); + + /* Timer: masked */ + LAPIC_WRITE(LVT_TIMER, LAPIC_READ(LVT_TIMER) | LAPIC_LVT_MASKED); + + /* Perfmon: masked */ + LAPIC_WRITE(LVT_PERFCNT, LAPIC_READ(LVT_PERFCNT) | LAPIC_LVT_MASKED); + + /* APIC software disabled */ + LAPIC_WRITE(SVR, LAPIC_READ(SVR) & ~LAPIC_SVR_ENABLE); + + /* Bypass the APIC completely and update cpu features */ + rdmsr(MSR_IA32_APIC_BASE, lo, hi); + lo &= ~MSR_IA32_APIC_BASE_ENABLE; + wrmsr(MSR_IA32_APIC_BASE, lo, hi); + cpuid_set_info(); + + mp_enable_preemption(); +} + +void +lapic_configure(void) +{ + int value; + + if (lapic_error_time_threshold == 0 && cpu_number() == 0) { + nanoseconds_to_absolutetime(NSEC_PER_SEC >> 2, &lapic_error_time_threshold); + if (!PE_parse_boot_argn("lapic_dont_panic", &lapic_dont_panic, sizeof(lapic_dont_panic))) { + lapic_dont_panic = FALSE; + } + } + + /* Set flat delivery model, logical processor id */ + LAPIC_WRITE(DFR, LAPIC_DFR_FLAT); + LAPIC_WRITE(LDR, (get_cpu_number()) << LAPIC_LDR_SHIFT); + + /* Accept all */ + LAPIC_WRITE(TPR, 0); + + LAPIC_WRITE(SVR, LAPIC_VECTOR(SPURIOUS) | LAPIC_SVR_ENABLE); + + /* ExtINT */ + if (get_cpu_number() == master_cpu) { + value = LAPIC_READ(LVT_LINT0); + value &= ~LAPIC_LVT_MASKED; + value |= LAPIC_LVT_DM_EXTINT; + LAPIC_WRITE(LVT_LINT0, value); + } + + /* Timer: 
unmasked, one-shot */ + LAPIC_WRITE(LVT_TIMER, LAPIC_VECTOR(TIMER)); + + /* Perfmon: unmasked */ + LAPIC_WRITE(LVT_PERFCNT, LAPIC_VECTOR(PERFCNT)); + + /* Thermal: unmasked */ + LAPIC_WRITE(LVT_THERMAL, LAPIC_VECTOR(THERMAL)); + +#if CONFIG_MCA + /* CMCI, if available */ + if (mca_is_cmci_present()) + LAPIC_WRITE(LVT_CMCI, LAPIC_VECTOR(CMCI)); +#endif + + if (((cpu_number() == master_cpu) && lapic_errors_masked == FALSE) || + (cpu_number() != master_cpu)) { + lapic_esr_clear(); + LAPIC_WRITE(LVT_ERROR, LAPIC_VECTOR(ERROR)); + } +} + +void +lapic_set_timer( + boolean_t interrupt_unmasked, + lapic_timer_mode_t mode, + lapic_timer_divide_t divisor, + lapic_timer_count_t initial_count) +{ + uint32_t timer_vector; + + mp_disable_preemption(); + timer_vector = LAPIC_READ(LVT_TIMER); + timer_vector &= ~(LAPIC_LVT_MASKED|LAPIC_LVT_PERIODIC);; + timer_vector |= interrupt_unmasked ? 0 : LAPIC_LVT_MASKED; + timer_vector |= (mode == periodic) ? LAPIC_LVT_PERIODIC : 0; + LAPIC_WRITE(LVT_TIMER, timer_vector); + LAPIC_WRITE(TIMER_DIVIDE_CONFIG, divisor); + LAPIC_WRITE(TIMER_INITIAL_COUNT, initial_count); + mp_enable_preemption(); +} + +void +lapic_config_timer( + boolean_t interrupt_unmasked, + lapic_timer_mode_t mode, + lapic_timer_divide_t divisor) +{ + uint32_t timer_vector; + + mp_disable_preemption(); + timer_vector = LAPIC_READ(LVT_TIMER); + timer_vector &= ~(LAPIC_LVT_MASKED | + LAPIC_LVT_PERIODIC | + LAPIC_LVT_TSC_DEADLINE); + timer_vector |= interrupt_unmasked ? 0 : LAPIC_LVT_MASKED; + timer_vector |= (mode == periodic) ? LAPIC_LVT_PERIODIC : 0; + LAPIC_WRITE(LVT_TIMER, timer_vector); + LAPIC_WRITE(TIMER_DIVIDE_CONFIG, divisor); + mp_enable_preemption(); +} + +/* + * Configure TSC-deadline timer mode. The lapic interrupt is always unmasked. 
+ */ +__private_extern__ +void +lapic_config_tsc_deadline_timer(void) +{ + uint32_t timer_vector; + + DBG("lapic_config_tsc_deadline_timer()\n"); + mp_disable_preemption(); + timer_vector = LAPIC_READ(LVT_TIMER); + timer_vector &= ~(LAPIC_LVT_MASKED | + LAPIC_LVT_PERIODIC); + timer_vector |= LAPIC_LVT_TSC_DEADLINE; + LAPIC_WRITE(LVT_TIMER, timer_vector); + + /* Serialize writes per Intel OSWG */ + do { + lapic_set_tsc_deadline_timer(rdtsc64() + (1ULL<<32)); + } while (lapic_get_tsc_deadline_timer() == 0); + lapic_set_tsc_deadline_timer(0); + + mp_enable_preemption(); + DBG("lapic_config_tsc_deadline_timer() done\n"); +} + +void +lapic_set_timer_fast( + lapic_timer_count_t initial_count) +{ + LAPIC_WRITE(LVT_TIMER, LAPIC_READ(LVT_TIMER) & ~LAPIC_LVT_MASKED); + LAPIC_WRITE(TIMER_INITIAL_COUNT, initial_count); +} + +__private_extern__ +void +lapic_set_tsc_deadline_timer(uint64_t deadline) +{ + /* Don't bother disarming: wrmsr64(MSR_IA32_TSC_DEADLINE, 0); */ + wrmsr64(MSR_IA32_TSC_DEADLINE, deadline); +} + +__private_extern__ +uint64_t +lapic_get_tsc_deadline_timer(void) +{ + return rdmsr64(MSR_IA32_TSC_DEADLINE); +} + +void +lapic_get_timer( + lapic_timer_mode_t *mode, + lapic_timer_divide_t *divisor, + lapic_timer_count_t *initial_count, + lapic_timer_count_t *current_count) +{ + mp_disable_preemption(); + if (mode) + *mode = (LAPIC_READ(LVT_TIMER) & LAPIC_LVT_PERIODIC) ? 
+ periodic : one_shot; + if (divisor) + *divisor = LAPIC_READ(TIMER_DIVIDE_CONFIG) & LAPIC_TIMER_DIVIDE_MASK; + if (initial_count) + *initial_count = LAPIC_READ(TIMER_INITIAL_COUNT); + if (current_count) + *current_count = LAPIC_READ(TIMER_CURRENT_COUNT); + mp_enable_preemption(); +} + +static inline void +_lapic_end_of_interrupt(void) +{ + LAPIC_WRITE(EOI, 0); +} + +void +lapic_end_of_interrupt(void) +{ + _lapic_end_of_interrupt(); +} + +void lapic_unmask_perfcnt_interrupt(void) { + LAPIC_WRITE(LVT_PERFCNT, LAPIC_VECTOR(PERFCNT)); +} + +void lapic_set_perfcnt_interrupt_mask(boolean_t mask) { + uint32_t m = (mask ? LAPIC_LVT_MASKED : 0); + LAPIC_WRITE(LVT_PERFCNT, LAPIC_VECTOR(PERFCNT) | m); +} + +void +lapic_set_intr_func(int vector, i386_intr_func_t func) +{ + if (vector > lapic_interrupt_base) + vector -= lapic_interrupt_base; + + switch (vector) { + case LAPIC_NMI_INTERRUPT: + case LAPIC_INTERPROCESSOR_INTERRUPT: + case LAPIC_TIMER_INTERRUPT: + case LAPIC_THERMAL_INTERRUPT: + case LAPIC_PERFCNT_INTERRUPT: + case LAPIC_CMCI_INTERRUPT: + case LAPIC_PM_INTERRUPT: + lapic_intr_func[vector] = func; + break; + default: + panic("lapic_set_intr_func(%d,%p) invalid vector\n", + vector, func); + } +} + +void lapic_set_pmi_func(i386_intr_func_t func) { + lapic_set_intr_func(LAPIC_VECTOR(PERFCNT), func); +} + +int +lapic_interrupt(int interrupt_num, x86_saved_state_t *state) +{ + int retval = 0; + int esr = -1; + + interrupt_num -= lapic_interrupt_base; + if (interrupt_num < 0) { + if (interrupt_num == (LAPIC_NMI_INTERRUPT - lapic_interrupt_base) && + lapic_intr_func[LAPIC_NMI_INTERRUPT] != NULL) { + retval = (*lapic_intr_func[LAPIC_NMI_INTERRUPT])(state); + return retval; + } + else + return 0; + } + + switch(interrupt_num) { + case LAPIC_TIMER_INTERRUPT: + case LAPIC_THERMAL_INTERRUPT: + case LAPIC_INTERPROCESSOR_INTERRUPT: + case LAPIC_PM_INTERRUPT: + if (lapic_intr_func[interrupt_num] != NULL) + (void) (*lapic_intr_func[interrupt_num])(state); + 
_lapic_end_of_interrupt(); + retval = 1; + break; + case LAPIC_PERFCNT_INTERRUPT: + /* If a function has been registered, invoke it. Otherwise, + * pass up to IOKit. + */ + if (lapic_intr_func[interrupt_num] != NULL) { + (void) (*lapic_intr_func[interrupt_num])(state); + /* Unmask the interrupt since we don't expect legacy users + * to be responsible for it. + */ + lapic_unmask_perfcnt_interrupt(); + _lapic_end_of_interrupt(); + retval = 1; + } + break; + case LAPIC_CMCI_INTERRUPT: + if (lapic_intr_func[interrupt_num] != NULL) + (void) (*lapic_intr_func[interrupt_num])(state); + /* return 0 for plaform expert to handle */ + break; + case LAPIC_ERROR_INTERRUPT: + /* We treat error interrupts on APs as fatal. + * The current interrupt steering scheme directs most + * external interrupts to the BSP (HPET interrupts being + * a notable exception); hence, such an error + * on an AP may signify LVT corruption (with "may" being + * the operative word). On the BSP, we adopt a more + * lenient approach, in the interests of enhancing + * debuggability and reducing fragility. + * If "lapic_error_count_threshold" error interrupts + * occur within "lapic_error_time_threshold" absolute + * time units, we mask the error vector and log. The + * error interrupts themselves are likely + * side effects of issues which are beyond the purview of + * the local APIC interrupt handler, however. The Error + * Status Register value (the illegal destination + * vector code is one observed in practice) indicates + * the immediate cause of the error. 
+ */ + esr = lapic_esr_read(); + lapic_dump(); + + if ((debug_boot_arg && (lapic_dont_panic == FALSE)) || + cpu_number() != master_cpu) { + panic("Local APIC error, ESR: %d\n", esr); + } + + if (cpu_number() == master_cpu) { + uint64_t abstime = mach_absolute_time(); + if ((abstime - lapic_last_master_error) < lapic_error_time_threshold) { + if (lapic_master_error_count++ > lapic_error_count_threshold) { + lapic_errors_masked = TRUE; + LAPIC_WRITE(LVT_ERROR, LAPIC_READ(LVT_ERROR) | LAPIC_LVT_MASKED); + printf("Local APIC: errors masked\n"); + } + } + else { + lapic_last_master_error = abstime; + lapic_master_error_count = 0; + } + printf("Local APIC error on master CPU, ESR: %d, error count this run: %d\n", esr, lapic_master_error_count); + } + + _lapic_end_of_interrupt(); + retval = 1; + break; + case LAPIC_SPURIOUS_INTERRUPT: + kprintf("SPIV\n"); + /* No EOI required here */ + retval = 1; + break; + case LAPIC_PMC_SW_INTERRUPT: + { +#if CONFIG_COUNTERS + thread_t old, new; + ml_get_csw_threads(&old, &new); + + if (pmc_context_switch(old, new) == TRUE) { + retval = 1; + /* No EOI required for SWI */ + } +#endif /* CONFIG_COUNTERS */ + } + break; + } + + return retval; +} + +void +lapic_smm_restore(void) +{ + boolean_t state; + + if (lapic_os_enabled == FALSE) + return; + + state = ml_set_interrupts_enabled(FALSE); + + if (LAPIC_ISR_IS_SET(LAPIC_REDUCED_INTERRUPT_BASE, TIMER)) { + /* + * Bogus SMI handler enables interrupts but does not know about + * local APIC interrupt sources. When APIC timer counts down to + * zero while in SMM, local APIC will end up waiting for an EOI + * but no interrupt was delivered to the OS. + */ + _lapic_end_of_interrupt(); + + /* + * timer is one-shot, trigger another quick countdown to trigger + * another timer interrupt. 
+ */ + if (LAPIC_READ(TIMER_CURRENT_COUNT) == 0) { + LAPIC_WRITE(TIMER_INITIAL_COUNT, 1); + } + + kprintf("lapic_smm_restore\n"); + } + + ml_set_interrupts_enabled(state); +} + +void +lapic_send_ipi(int cpu, int vector) +{ + boolean_t state; + + if (vector < lapic_interrupt_base) + vector += lapic_interrupt_base; + + state = ml_set_interrupts_enabled(FALSE); + + /* Wait for pending outgoing send to complete */ + while (LAPIC_READ(ICR) & LAPIC_ICR_DS_PENDING) { + cpu_pause(); + } + + LAPIC_WRITE(ICRD, cpu_to_lapic[cpu] << LAPIC_ICRD_DEST_SHIFT); + LAPIC_WRITE(ICR, vector | LAPIC_ICR_DM_FIXED); + + (void) ml_set_interrupts_enabled(state); +} + +/* + * The following interfaces are privately exported to AICPM. + */ + +boolean_t +lapic_is_interrupt_pending(void) +{ + int i; + + for (i = 0; i < 8; i += 1) { + if ((LAPIC_READ_OFFSET(IRR_BASE, i) != 0) || + (LAPIC_READ_OFFSET(ISR_BASE, i) != 0)) + return (TRUE); + } + + return (FALSE); +} + +boolean_t +lapic_is_interrupting(uint8_t vector) +{ + int i; + int bit; + uint32_t irr; + uint32_t isr; + + i = vector / 32; + bit = 1 << (vector % 32); + + irr = LAPIC_READ_OFFSET(IRR_BASE, i); + isr = LAPIC_READ_OFFSET(ISR_BASE, i); + + if ((irr | isr) & bit) + return (TRUE); + + return (FALSE); +} + +void +lapic_interrupt_counts(uint64_t intrs[256]) +{ + int i; + int j; + int bit; + uint32_t irr; + uint32_t isr; + + if (intrs == NULL) + return; + + for (i = 0; i < 8; i += 1) { + irr = LAPIC_READ_OFFSET(IRR_BASE, i); + isr = LAPIC_READ_OFFSET(ISR_BASE, i); + + if ((isr | irr) == 0) + continue; + + for (j = (i == 0) ? 16 : 0; j < 32; j += 1) { + bit = (32 * i) + j; + if ((isr | irr) & (1 << j)) + intrs[bit] += 1; + } + } +} + +void +lapic_disable_timer(void) +{ + uint32_t lvt_timer; + + /* + * If we're in deadline timer mode, + * simply clear the deadline timer, otherwise + * mask the timer interrupt and clear the countdown. 
+ */ + lvt_timer = LAPIC_READ(LVT_TIMER); + if (lvt_timer & LAPIC_LVT_TSC_DEADLINE) { + wrmsr64(MSR_IA32_TSC_DEADLINE, 0); + } else { + LAPIC_WRITE(LVT_TIMER, lvt_timer | LAPIC_LVT_MASKED); + LAPIC_WRITE(TIMER_INITIAL_COUNT, 0); + lvt_timer = LAPIC_READ(LVT_TIMER); + } +} diff --git a/osfmk/i386/ldt.c b/osfmk/i386/ldt.c index 91416fd20..ff1facd34 100644 --- a/osfmk/i386/ldt.c +++ b/osfmk/i386/ldt.c @@ -65,38 +65,38 @@ struct real_descriptor master_ldt[LDTSZ] __attribute__ ((aligned (4096))) = { #ifdef __i386__ - [SEL_TO_INDEX(SYSENTER_CS)] MAKE_REAL_DESCRIPTOR( /* kernel code (sysenter) */ + [SEL_TO_INDEX(SYSENTER_CS)] = MAKE_REAL_DESCRIPTOR( /* kernel code (sysenter) */ 0, 0xfffff, SZ_32|SZ_G, ACC_P|ACC_PL_K|ACC_CODE_R ), - [SEL_TO_INDEX(SYSENTER_DS)] MAKE_REAL_DESCRIPTOR( /* kernel data (sysenter) */ + [SEL_TO_INDEX(SYSENTER_DS)] = MAKE_REAL_DESCRIPTOR( /* kernel data (sysenter) */ 0, 0xfffff, SZ_32|SZ_G, ACC_P|ACC_PL_K|ACC_DATA_W ), - [SEL_TO_INDEX(USER_CS)] MAKE_REAL_DESCRIPTOR( /* user code segment */ + [SEL_TO_INDEX(USER_CS)] = MAKE_REAL_DESCRIPTOR( /* user code segment */ 0, 0xfffff, SZ_32|SZ_G, ACC_P|ACC_PL_U|ACC_CODE_R ), - [SEL_TO_INDEX(USER_DS)] MAKE_REAL_DESCRIPTOR( /* user data segment */ + [SEL_TO_INDEX(USER_DS)] = MAKE_REAL_DESCRIPTOR( /* user data segment */ 0, 0xfffff, SZ_32|SZ_G, ACC_P|ACC_PL_U|ACC_DATA_W ), - [SEL_TO_INDEX(USER64_CS)] MAKE_REAL_DESCRIPTOR( /* user 64-bit code segment */ + [SEL_TO_INDEX(USER64_CS)] = MAKE_REAL_DESCRIPTOR( /* user 64-bit code segment */ 0, 0xfffff, SZ_64|SZ_G, ACC_P|ACC_PL_U|ACC_CODE_R ), #endif - [SEL_TO_INDEX(USER_CTHREAD)] MAKE_REAL_DESCRIPTOR( /* user cthread segment */ + [SEL_TO_INDEX(USER_CTHREAD)] = MAKE_REAL_DESCRIPTOR( /* user cthread segment */ 0, 0xfffff, SZ_32|SZ_G, diff --git a/osfmk/i386/locks.h b/osfmk/i386/locks.h index d74e94156..a0409d257 100644 --- a/osfmk/i386/locks.h +++ b/osfmk/i386/locks.h @@ -41,47 +41,61 @@ extern unsigned int LcksOpts; #define enaLkDeb 0x00000001 /* Request debug in 
default attribute */ #define enaLkStat 0x00000002 /* Request statistic in default attribute */ -#endif +#endif /* MACH_KERNEL_PRIVATE */ -#ifdef MACH_KERNEL_PRIVATE +#if defined(MACH_KERNEL_PRIVATE) typedef struct { - unsigned long interlock; - unsigned long lck_spin_pad[9]; /* XXX - usimple_lock_data_t */ + volatile uintptr_t interlock; +#if MACH_LDEBUG + unsigned long lck_spin_pad[9]; /* XXX - usimple_lock_data_t */ +#endif } lck_spin_t; #define LCK_SPIN_TAG_DESTROYED 0x00002007 /* lock marked as Destroyed */ -#else +#else /* MACH_KERNEL_PRIVATE */ #ifdef KERNEL_PRIVATE typedef struct { unsigned long opaque[10]; } lck_spin_t; -#else +#else /* KERNEL_PRIVATE */ typedef struct __lck_spin_t__ lck_spin_t; #endif #endif #ifdef MACH_KERNEL_PRIVATE +/* The definition of this structure, including the layout of the + * state bitfield, is tailored to the asm implementation in i386_lock.s + */ typedef struct _lck_mtx_ { union { struct { volatile uintptr_t lck_mtxd_owner; - unsigned long lck_mtxd_ptr; - volatile uint32_t lck_mtxd_waiters:16, - lck_mtxd_pri:8, - lck_mtxd_ilocked:1, - lck_mtxd_mlocked:1, - lck_mtxd_promoted:1, - lck_mtxd_spin:1, - lck_mtxd_pad4:4; /* padding */ -#ifdef __x86_64__ - unsigned int lck_mtxd_pad; -#endif + union { + struct { + volatile uint32_t + lck_mtxd_waiters:16, + lck_mtxd_pri:8, + lck_mtxd_ilocked:1, + lck_mtxd_mlocked:1, + lck_mtxd_promoted:1, + lck_mtxd_spin:1, + lck_mtxd_is_ext:1, + lck_mtxd_pad3:3; + }; + uint32_t lck_mtxd_state; + }; +#if defined(__x86_64__) + /* Pad field used as a canary, initialized to ~0 */ + uint32_t lck_mtxd_pad32; +#endif } lck_mtxd; struct { - unsigned long lck_mtxi_tag; struct _lck_mtx_ext_ *lck_mtxi_ptr; - unsigned long lck_mtxi_pad; + uint32_t lck_mtxi_tag; +#if defined(__x86_64__) + uint32_t lck_mtxi_pad32; +#endif } lck_mtxi; } lck_mtx_sw; } lck_mtx_t; @@ -89,31 +103,25 @@ typedef struct _lck_mtx_ { #define lck_mtx_owner lck_mtx_sw.lck_mtxd.lck_mtxd_owner #define lck_mtx_waiters 
lck_mtx_sw.lck_mtxd.lck_mtxd_waiters #define lck_mtx_pri lck_mtx_sw.lck_mtxd.lck_mtxd_pri -#define lck_mtx_ilocked lck_mtx_sw.lck_mtxd.lck_mtxd_ilocked -#define lck_mtx_mlocked lck_mtx_sw.lck_mtxd.lck_mtxd_mlocked #define lck_mtx_promoted lck_mtx_sw.lck_mtxd.lck_mtxd_promoted -#define lck_mtx_spin lck_mtx_sw.lck_mtxd.lck_mtxd_spin +#define lck_mtx_is_ext lck_mtx_sw.lck_mtxd.lck_mtxd_is_ext #define lck_mtx_tag lck_mtx_sw.lck_mtxi.lck_mtxi_tag #define lck_mtx_ptr lck_mtx_sw.lck_mtxi.lck_mtxi_ptr -#define lck_mtx_state lck_mtx_sw.lck_mtxi.lck_mtxi_pad - -#define LCK_MTX_TAG_INDIRECT 0x00001007 /* lock marked as Indirect */ -#define LCK_MTX_TAG_DESTROYED 0x00002007 /* lock marked as Destroyed */ -#define LCK_MTX_PTR_EXTENDED 0x00003007 /* lock is extended version */ +#define lck_mtx_state lck_mtx_sw.lck_mtxd.lck_mtxd_state +/* This pattern must subsume the interlocked, mlocked and spin bits */ +#define LCK_MTX_TAG_INDIRECT 0x07ff1007 /* lock marked as Indirect */ +#define LCK_MTX_TAG_DESTROYED 0x07fe2007 /* lock marked as Destroyed */ /* Adaptive spin before blocking */ extern unsigned int MutexSpin; extern int lck_mtx_lock_spinwait_x86(lck_mtx_t *mutex); extern void lck_mtx_lock_wait_x86(lck_mtx_t *mutex); extern void lck_mtx_lock_acquire_x86(lck_mtx_t *mutex); -extern void lck_mtx_unlock_wakeup_x86(lck_mtx_t *mutex, int owner_was_promoted); +extern void lck_mtx_unlock_wakeup_x86(lck_mtx_t *mutex, int prior_lock_state); extern void lck_mtx_lock_mark_destroyed(lck_mtx_t *mutex); -extern int lck_mtx_lock_mark_promoted(lck_mtx_t *mutex); -extern int lck_mtx_lock_decr_waiter(lck_mtx_t *mutex); extern int lck_mtx_lock_grab_mutex(lck_mtx_t *mutex); -extern integer_t lck_mtx_lock_get_pri(lck_mtx_t *mutex); extern void hw_lock_byte_init(uint8_t *lock_byte); extern void hw_lock_byte_lock(uint8_t *lock_byte); @@ -153,10 +161,19 @@ typedef struct _lck_mtx_ext_ { #define LCK_MTX_ATTR_STAT 0x2 #define LCK_MTX_ATTR_STATb 1 +#else /* MACH_KERNEL_PRIVATE */ +#ifdef XNU_KERNEL_PRIVATE 
+typedef struct { + unsigned long opaque[2]; +} lck_mtx_t; + +typedef struct { + unsigned long opaque[10]; +} lck_mtx_ext_t; #else #ifdef KERNEL_PRIVATE typedef struct { - unsigned long opaque[3]; + unsigned long opaque[2]; } lck_mtx_t; typedef struct { @@ -168,6 +185,7 @@ typedef struct __lck_mtx_t__ lck_mtx_t; typedef struct __lck_mtx_ext_t__ lck_mtx_ext_t; #endif #endif +#endif #ifdef MACH_KERNEL_PRIVATE #pragma pack(1) /* Make sure the structure stays as we defined it */ diff --git a/osfmk/i386/locks_i386.c b/osfmk/i386/locks_i386.c index 8c715d086..3f94ba02b 100644 --- a/osfmk/i386/locks_i386.c +++ b/osfmk/i386/locks_i386.c @@ -88,6 +88,7 @@ #include #include +#include /* * We need only enough declarations from the BSD-side to be able to @@ -211,7 +212,6 @@ lck_rw_type_t lck_rw_done_gen( lck_rw_t *lck, int prior_lock_state); - /* * Routine: lck_spin_alloc_init */ @@ -329,7 +329,7 @@ static uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) { for (i = 0; i < real_ncpus; i++) { if ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr) { spinlock_owner_cpu = i; - if ((uint32_t)cpu_number() == i) + if ((uint32_t) cpu_number() == i) break; cpu_datap(i)->cpu_NMI_acknowledged = FALSE; cpu_NMI_interrupt(i); @@ -359,14 +359,15 @@ usimple_lock( OBTAIN_PC(pc); USLDBG(usld_lock_pre(l, pc)); -/* Try to get the lock with a timeout */ - if(!hw_lock_to(&l->interlock, LockTimeOutTSC)) { + + if(__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC) == 0)) { boolean_t uslock_acquired = FALSE; while (machine_timeout_suspended()) { enable_preemption(); if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC))) break; - } + } + if (uslock_acquired == FALSE) { uint32_t lock_cpu; spinlock_timed_out = l; @@ -903,8 +904,8 @@ lck_rw_destroy( ((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_pad8)))) /* - * We need to disable interrupts while holding the mutex interlock - * to prevent an IPI intervening. 
+ * We disable interrupts while holding the RW interlock to prevent an + * interrupt from exacerbating hold time. * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock(). */ static boolean_t @@ -1635,6 +1636,9 @@ lck_rw_assert( panic("rw lock (%p) not held (mode=%u), first word %08x\n", lck, type, *(uint32_t *)lck); } +#ifdef MUTEX_ZONE +extern zone_t lck_mtx_zone; +#endif /* * Routine: lck_mtx_alloc_init */ @@ -1644,10 +1648,13 @@ lck_mtx_alloc_init( lck_attr_t *attr) { lck_mtx_t *lck; - +#ifdef MUTEX_ZONE + if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0) + lck_mtx_init(lck, grp, attr); +#else if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0) lck_mtx_init(lck, grp, attr); - +#endif return(lck); } @@ -1660,7 +1667,11 @@ lck_mtx_free( lck_grp_t *grp) { lck_mtx_destroy(lck, grp); +#ifdef MUTEX_ZONE + zfree(lck_mtx_zone, lck); +#else kfree(lck, sizeof(lck_mtx_t)); +#endif } /* @@ -1682,9 +1693,12 @@ lck_mtx_ext_init( lck->lck_mtx_grp = grp; if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT) - lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT; + lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT; - lck->lck_mtx.lck_mtx_ptr = (void *)LCK_MTX_PTR_EXTENDED; + lck->lck_mtx.lck_mtx_is_ext = 1; +#if defined(__x86_64__) + lck->lck_mtx.lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF; +#endif } /* @@ -1709,18 +1723,14 @@ lck_mtx_init( lck_mtx_ext_init(lck_ext, grp, lck_attr); lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT; lck->lck_mtx_ptr = lck_ext; - lck->lck_mtx_ilocked = 1; } } else { lck->lck_mtx_owner = 0; - lck->lck_mtx_ptr = 0; - lck->lck_mtx_waiters = 0; - lck->lck_mtx_pri = 0; - lck->lck_mtx_ilocked = 0; - lck->lck_mtx_mlocked = 0; - lck->lck_mtx_promoted = 0; - lck->lck_mtx_spin = 0; + lck->lck_mtx_state = 0; } +#if defined(__x86_64__) + lck->lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF; +#endif lck_grp_reference(grp); lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX); } @@ -1746,17 +1756,14 @@ lck_mtx_init_ext( lck_mtx_ext_init(lck_ext, grp, lck_attr); lck->lck_mtx_tag = 
LCK_MTX_TAG_INDIRECT; lck->lck_mtx_ptr = lck_ext; - lck->lck_mtx_ilocked = 1; } else { lck->lck_mtx_owner = 0; - lck->lck_mtx_ptr = 0; - lck->lck_mtx_waiters = 0; - lck->lck_mtx_pri = 0; - lck->lck_mtx_ilocked = 0; - lck->lck_mtx_mlocked = 0; - lck->lck_mtx_promoted = 0; - lck->lck_mtx_spin = 0; + lck->lck_mtx_state = 0; } +#if defined(__x86_64__) + lck->lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF; +#endif + lck_grp_reference(grp); lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX); } @@ -1795,50 +1802,67 @@ lck_mtx_destroy( /* * Routine: lck_mtx_unlock_wakeup_x86 * - * Invoked on unlock when there is contention. + * Invoked on unlock when there is + * contention (i.e. the assembly routine sees that + * that mutex->lck_mtx_waiters != 0 or + * that mutex->lck_mtx_promoted != 0... * + * neither the mutex or interlock is held */ void lck_mtx_unlock_wakeup_x86 ( lck_mtx_t *mutex, - int owner_was_promoted) + int prior_lock_state) { + lck_mtx_t fake_lck; + + /* + * prior_lock state is a snapshot of the 2nd word of the + * lock in question... 
we'll fake up a lock with the bits + * copied into place and carefully not access anything + * beyond whats defined in the second word of a lck_mtx_t + */ + fake_lck.lck_mtx_state = prior_lock_state; + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START, + mutex, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0); - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START, (int)mutex, owner_was_promoted, mutex->lck_mtx_waiters, 0, 0); + if (__probable(fake_lck.lck_mtx_waiters)) { - if (lck_mtx_lock_decr_waiter(mutex)) - thread_wakeup_one((event_t)(((unsigned int*)mutex)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int))); + if (fake_lck.lck_mtx_waiters > 1) + thread_wakeup_one_with_pri((event_t)(((unsigned int*)mutex)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int)), fake_lck.lck_mtx_pri); + else + thread_wakeup_one((event_t)(((unsigned int*)mutex)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int))); + } - if (owner_was_promoted) { + if (__improbable(fake_lck.lck_mtx_promoted)) { thread_t thread = current_thread(); - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), thread->promotions, - thread->sched_mode & TH_MODE_PROMOTED, 0, 0); + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) | DBG_FUNC_NONE, + thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, 0, 0); if (thread->promotions > 0) { spl_t s = splsched(); thread_lock(thread); - if (--thread->promotions == 0 && (thread->sched_mode & TH_MODE_PROMOTED)) { + if (--thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED)) { - thread->sched_mode &= ~TH_MODE_PROMOTED; + thread->sched_flags &= ~TH_SFLAG_PROMOTED; - if (thread->sched_mode & TH_MODE_ISDEPRESSED) { - KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_SCHED,MACH_DEMOTE) | DBG_FUNC_NONE, - thread->sched_pri, DEPRESSPRI, 0, mutex, 0); + if (thread->sched_flags & 
TH_SFLAG_DEPRESSED_MASK) { + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE, + thread->sched_pri, DEPRESSPRI, 0, mutex, 0); set_sched_pri(thread, DEPRESSPRI); } else { if (thread->priority < thread->sched_pri) { - KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_SCHED,MACH_DEMOTE) | DBG_FUNC_NONE, - thread->sched_pri, thread->priority, 0, mutex, 0); + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE, + thread->sched_pri, thread->priority, 0, mutex, 0); - compute_priority(thread, FALSE); + SCHED(compute_priority)(thread, FALSE); } } } @@ -1846,7 +1870,8 @@ lck_mtx_unlock_wakeup_x86 ( splx(s); } } - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END, (int)mutex, 0, mutex->lck_mtx_waiters, 0, 0); + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END, + mutex, 0, mutex->lck_mtx_waiters, 0, 0); } @@ -1854,43 +1879,54 @@ lck_mtx_unlock_wakeup_x86 ( * Routine: lck_mtx_lock_acquire_x86 * * Invoked on acquiring the mutex when there is - * contention. - * mutex is owned... interlock is not held + * contention (i.e. the assembly routine sees that + * that mutex->lck_mtx_waiters != 0 or + * thread->was_promoted_on_wakeup != 0)... + * + * mutex is owned... interlock is held... 
preemption is disabled */ void lck_mtx_lock_acquire_x86( lck_mtx_t *mutex) { - thread_t thread = current_thread(); + thread_t thread; integer_t priority; + spl_t s; - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START, (int)mutex, 0, mutex->lck_mtx_waiters, 0, 0); + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START, + mutex, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0); - priority = lck_mtx_lock_get_pri(mutex); + if (mutex->lck_mtx_waiters) + priority = mutex->lck_mtx_pri; + else + priority = 0; - if (thread->sched_pri < priority) { + thread = (thread_t)mutex->lck_mtx_owner; /* faster then current_thread() */ - if (lck_mtx_lock_mark_promoted(mutex)) { - spl_t s = splsched(); + if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) { - thread_lock(thread); + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE, + thread->sched_pri, priority, thread->was_promoted_on_wakeup, mutex, 0); - if (thread->sched_pri < priority) { + s = splsched(); + thread_lock(thread); - KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_SCHED,MACH_PROMOTE) | DBG_FUNC_NONE, - thread->sched_pri, priority, 0, mutex, 0); + if (thread->sched_pri < priority) + set_sched_pri(thread, priority); - set_sched_pri(thread, priority); - } + if (mutex->lck_mtx_promoted == 0) { + mutex->lck_mtx_promoted = 1; + thread->promotions++; - thread->sched_mode |= TH_MODE_PROMOTED; - - thread_unlock(thread); - splx(s); + thread->sched_flags |= TH_SFLAG_PROMOTED; } + thread->was_promoted_on_wakeup = 0; + + thread_unlock(thread); + splx(s); } - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END, (int)mutex, 0, mutex->lck_mtx_waiters, 0, 0); + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END, + mutex, 0, mutex->lck_mtx_waiters, 0, 0); } @@ -1903,6 +1939,9 @@ lck_mtx_lock_acquire_x86( * time waiting for the lock to 
be released. * * Called with the interlock unlocked. + * returns 0 if mutex acquired + * returns 1 if we spun + * returns 2 if we didn't spin due to the holder not running */ int lck_mtx_lock_spinwait_x86( @@ -1913,9 +1952,9 @@ lck_mtx_lock_spinwait_x86( int retval = 1; int loopcount = 0; - KERNEL_DEBUG( - MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START, - (int)mutex, (int)mutex->lck_mtx_owner, mutex->lck_mtx_waiters, 0, 0); + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START, + mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, 0, 0); deadline = mach_absolute_time() + MutexSpin; @@ -1928,7 +1967,7 @@ lck_mtx_lock_spinwait_x86( * - we haven't spun for long enough. */ do { - if (lck_mtx_lock_grab_mutex(mutex)) { + if (__probable(lck_mtx_lock_grab_mutex(mutex))) { retval = 0; break; } @@ -1959,7 +1998,7 @@ lck_mtx_lock_spinwait_x86( * penalize only lock groups that have debug/stats enabled * with dtrace processing if desired. */ - if (mutex->lck_mtx_ptr != (void *)LCK_MTX_PTR_EXTENDED) { + if (__probable(mutex->lck_mtx_is_ext == 0)) { LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex, mach_absolute_time() - (deadline - MutexSpin)); } else { @@ -1969,9 +2008,8 @@ lck_mtx_lock_spinwait_x86( /* The lockstat acquire event is recorded by the assembly code beneath us. */ #endif - KERNEL_DEBUG( - MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END, - (int)mutex, (int)mutex->lck_mtx_owner, mutex->lck_mtx_waiters, retval, 0); + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END, + mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, retval, 0); return retval; } @@ -1984,7 +2022,8 @@ lck_mtx_lock_spinwait_x86( * Invoked in order to wait on contention. * * Called with the interlock locked and - * returns it unlocked. + * preemption disabled... 
+ * returns it unlocked and with preemption enabled */ void lck_mtx_lock_wait_x86 ( @@ -1993,7 +2032,6 @@ lck_mtx_lock_wait_x86 ( thread_t self = current_thread(); thread_t holder; integer_t priority; - integer_t old_lck_mtx_pri; spl_t s; #if CONFIG_DTRACE uint64_t sleep_start = 0; @@ -2002,7 +2040,8 @@ lck_mtx_lock_wait_x86 ( sleep_start = mach_absolute_time(); } #endif - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START, (int)mutex, (int)mutex->lck_mtx_owner, mutex->lck_mtx_waiters, 0, 0); + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START, + mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0); priority = self->sched_pri; @@ -2011,45 +2050,41 @@ lck_mtx_lock_wait_x86 ( if (priority < BASEPRI_DEFAULT) priority = BASEPRI_DEFAULT; - if (mutex->lck_mtx_waiters == 0) - old_lck_mtx_pri = 0; - else - old_lck_mtx_pri = mutex->lck_mtx_pri; - - if (old_lck_mtx_pri < priority) + if (mutex->lck_mtx_waiters == 0 || priority > mutex->lck_mtx_pri) mutex->lck_mtx_pri = priority; + mutex->lck_mtx_waiters++; - if ( (holder = (thread_t)mutex->lck_mtx_owner) ) { + if ( (holder = (thread_t)mutex->lck_mtx_owner) && + holder->sched_pri < mutex->lck_mtx_pri ) { s = splsched(); thread_lock(holder); - if (holder->sched_pri < priority) { + if (holder->sched_pri < mutex->lck_mtx_pri) { KERNEL_DEBUG_CONSTANT( MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE, - holder->sched_pri, priority, holder, mutex, 0); + holder->sched_pri, priority, thread_tid(holder), mutex, 0); set_sched_pri(holder, priority); if (mutex->lck_mtx_promoted == 0) { holder->promotions++; - holder->sched_mode |= TH_MODE_PROMOTED; - + holder->sched_flags |= TH_SFLAG_PROMOTED; + mutex->lck_mtx_promoted = 1; } } thread_unlock(holder); splx(s); } - mutex->lck_mtx_waiters++; - assert_wait((event_t)(((unsigned int*)mutex)+((sizeof(lck_mtx_t)-1)/sizeof(unsigned int))), THREAD_UNINT); lck_mtx_ilk_unlock(mutex); 
thread_block(THREAD_CONTINUE_NULL); - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END, (int)mutex, (int)mutex->lck_mtx_owner, mutex->lck_mtx_waiters, 0, 0); + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END, + mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0); #if CONFIG_DTRACE /* @@ -2057,7 +2092,7 @@ lck_mtx_lock_wait_x86 ( * measured from when we were entered. */ if (sleep_start) { - if (mutex->lck_mtx_ptr != (void *)LCK_MTX_PTR_EXTENDED) { + if (mutex->lck_mtx_is_ext == 0) { LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex, mach_absolute_time() - sleep_start); } else { diff --git a/osfmk/i386/locore.s b/osfmk/i386/locore.s index 65f7006c6..6e8e3d3a2 100644 --- a/osfmk/i386/locore.s +++ b/osfmk/i386/locore.s @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -66,23 +66,12 @@ #include #include #include -#include -#include #include #include #include -#include -#include - -#define _ARCH_I386_ASM_HELP_H_ /* Prevent inclusion of user header */ -#include - -#include +#include -#define CLI cli -#define STI sti - /* * PTmap is recursive pagemap at top of virtual address space. * Within PTmap, the page directory can be found (third indirection). @@ -92,15 +81,6 @@ .set _PTD,_PTmap + (PTDPTDI * NBPG) .set _PTDpde,_PTD + (PTDPTDI * PDESIZE) -/* - * APTmap, APTD is the alternate recursive pagemap. - * It's used when modifying another process's page tables. - */ - .globl _APTmap,_APTD,_APTDpde - .set _APTmap,(APTDPTDI << PDESHIFT) - .set _APTD,_APTmap + (APTDPTDI * NBPG) - .set _APTDpde,_PTD + (APTDPTDI * PDESIZE) - #if __MACHO__ /* Under Mach-O, etext is a variable which contains * the last text address @@ -113,56 +93,6 @@ #define ETEXT_ADDR $ EXT(etext) #endif -#define CX(addr,reg) addr(,reg,4) - -/* - * The following macros make calls into C code. 
- * They dynamically align the stack to 16 bytes. - * Arguments are moved (not pushed) onto the correctly aligned stack. - * NOTE: EDI is destroyed in the process, and hence cannot - * be directly used as a parameter. Users of this macro must - * independently preserve EDI (a non-volatile) if the routine is - * intended to be called from C, for instance. - */ - -#define CCALL(fn) \ - movl %esp, %edi ;\ - andl $0xFFFFFFF0, %esp ;\ - call EXT(fn) ;\ - movl %edi, %esp - -#define CCALL1(fn, arg1) \ - movl %esp, %edi ;\ - subl $4, %esp ;\ - andl $0xFFFFFFF0, %esp ;\ - movl arg1, 0(%esp) ;\ - call EXT(fn) ;\ - movl %edi, %esp - -#define CCALL2(fn, arg1, arg2) \ - movl %esp, %edi ;\ - subl $8, %esp ;\ - andl $0xFFFFFFF0, %esp ;\ - movl arg2, 4(%esp) ;\ - movl arg1, 0(%esp) ;\ - call EXT(fn) ;\ - movl %edi, %esp - -/* - * CCALL5 is used for callee functions with 3 arguments but - * where arg2 (a3:a2) and arg3 (a5:a4) are 64-bit values. - */ -#define CCALL5(fn, a1, a2, a3, a4, a5) \ - movl %esp, %edi ;\ - subl $20, %esp ;\ - andl $0xFFFFFFF0, %esp ;\ - movl a5, 16(%esp) ;\ - movl a4, 12(%esp) ;\ - movl a3, 8(%esp) ;\ - movl a2, 4(%esp) ;\ - movl a1, 0(%esp) ;\ - call EXT(fn) ;\ - movl %edi, %esp .text locore_start: @@ -212,275 +142,6 @@ LEXT(recover_table_end) ;\ RECOVERY_SECTION RECOVER_TABLE_START -/* - * Timing routines. - */ -Entry(timer_update) - movl 4(%esp),%ecx - movl 8(%esp),%eax - movl 12(%esp),%edx - movl %eax,TIMER_HIGHCHK(%ecx) - movl %edx,TIMER_LOW(%ecx) - movl %eax,TIMER_HIGH(%ecx) - ret - -Entry(timer_grab) - movl 4(%esp),%ecx -0: movl TIMER_HIGH(%ecx),%edx - movl TIMER_LOW(%ecx),%eax - cmpl TIMER_HIGHCHK(%ecx),%edx - jne 0b - ret - -#if STAT_TIME - -#define TIME_TRAP_UENTRY -#define TIME_TRAP_UEXIT -#define TIME_INT_ENTRY -#define TIME_INT_EXIT - -#else -/* - * Nanosecond timing. - */ - -/* - * Nanotime returned in %edx:%eax. - * Computed from tsc based on the scale factor - * and an implicit 32 bit shift. - * - * Uses %eax, %ebx, %ecx, %edx, %esi, %edi. 
- */ -#define NANOTIME \ - mov %gs:CPU_NANOTIME,%edi ; \ - RTC_NANOTIME_READ_FAST() - - -/* - * Add 64-bit delta in register dreg : areg to timer pointed to by register treg. - */ -#define TIMER_UPDATE(treg,dreg,areg,offset) \ - addl (TIMER_LOW+(offset))(treg),areg /* add low bits */ ;\ - adcl dreg,(TIMER_HIGH+(offset))(treg) /* add carry high bits */ ;\ - movl areg,(TIMER_LOW+(offset))(treg) /* store updated low bit */ ;\ - movl (TIMER_HIGH+(offset))(treg),dreg /* copy high bits */ ;\ - movl dreg,(TIMER_HIGHCHK+(offset))(treg) /* to high check */ - -/* - * Add time delta to old timer and start new. - */ -#define TIMER_EVENT(old,new) \ - NANOTIME /* edx:eax nanosecs */ ; \ - movl %eax,%esi /* save timestamp */ ; \ - movl %edx,%edi /* save timestamp */ ; \ - movl %gs:CPU_ACTIVE_THREAD,%ecx /* get current thread */ ; \ - subl (old##_TIMER)+TIMER_TSTAMP(%ecx),%eax /* compute elapsed time */ ; \ - sbbl (old##_TIMER)+TIMER_TSTAMP+4(%ecx),%edx /* compute elapsed time */ ; \ - TIMER_UPDATE(%ecx,%edx,%eax,old##_TIMER) /* update timer */ ; \ - movl %esi,(new##_TIMER)+TIMER_TSTAMP(%ecx) /* set timestamp */ ; \ - movl %edi,(new##_TIMER)+TIMER_TSTAMP+4(%ecx) /* set timestamp */ ; \ - leal (new##_TIMER)(%ecx), %ecx /* compute new timer pointer */ ; \ - movl %gs:CPU_PROCESSOR,%ebx /* get current processor */ ; \ - movl %ecx,THREAD_TIMER(%ebx) /* set current timer */ ; \ - movl %esi,%eax /* restore timestamp */ ; \ - movl %edi,%edx /* restore timestamp */ ; \ - subl (old##_STATE)+TIMER_TSTAMP(%ebx),%eax /* compute elapsed time */ ; \ - sbbl (old##_STATE)+TIMER_TSTAMP+4(%ebx),%edx /* compute elapsed time */ ; \ - TIMER_UPDATE(%ebx,%edx,%eax,old##_STATE) /* update timer */ ; \ - leal (new##_STATE)(%ebx),%ecx /* compute new state pointer */ ; \ - movl %ecx,CURRENT_STATE(%ebx) /* set current state */ ; \ - movl %esi,TIMER_TSTAMP(%ecx) /* set timestamp */ ; \ - movl %edi,TIMER_TSTAMP+4(%ecx) /* set timestamp */ - -/* - * Update time on user trap entry. 
- * Uses %eax,%ebx,%ecx,%edx,%esi,%edi. - */ -#define TIME_TRAP_UENTRY TIMER_EVENT(USER,SYSTEM) - -/* - * update time on user trap exit. - * Uses %eax,%ebx,%ecx,%edx,%esi,%edi. - */ -#define TIME_TRAP_UEXIT TIMER_EVENT(SYSTEM,USER) - -/* - * update time on interrupt entry. - * Uses %eax,%ebx,%ecx,%edx,%esi,%edi. - * Saves processor state info on stack. - */ -#define TIME_INT_ENTRY \ - NANOTIME /* edx:eax nanosecs */ ; \ - movl %eax,%gs:CPU_INT_EVENT_TIME /* save in cpu data */ ; \ - movl %edx,%gs:CPU_INT_EVENT_TIME+4 /* save in cpu data */ ; \ - movl %eax,%esi /* save timestamp */ ; \ - movl %edx,%edi /* save timestamp */ ; \ - movl %gs:CPU_PROCESSOR,%ebx /* get current processor */ ; \ - movl THREAD_TIMER(%ebx),%ecx /* get current timer */ ; \ - subl TIMER_TSTAMP(%ecx),%eax /* compute elapsed time */ ; \ - sbbl TIMER_TSTAMP+4(%ecx),%edx /* compute elapsed time */ ; \ - TIMER_UPDATE(%ecx,%edx,%eax,0) /* update timer */ ; \ - movl KERNEL_TIMER(%ebx),%ecx /* point to kernel timer */ ; \ - movl %esi,TIMER_TSTAMP(%ecx) /* set timestamp */ ; \ - movl %edi,TIMER_TSTAMP+4(%ecx) /* set timestamp */ ; \ - movl %esi,%eax /* restore timestamp */ ; \ - movl %edi,%edx /* restore timestamp */ ; \ - movl CURRENT_STATE(%ebx),%ecx /* get current state */ ; \ - pushl %ecx /* save state */ ; \ - subl TIMER_TSTAMP(%ecx),%eax /* compute elapsed time */ ; \ - sbbl TIMER_TSTAMP+4(%ecx),%edx /* compute elapsed time */ ; \ - TIMER_UPDATE(%ecx,%edx,%eax,0) /* update timer */ ; \ - leal IDLE_STATE(%ebx),%eax /* get idle state */ ; \ - cmpl %eax,%ecx /* compare current state */ ; \ - je 0f /* skip if equal */ ; \ - leal SYSTEM_STATE(%ebx),%ecx /* get system state */ ; \ - movl %ecx,CURRENT_STATE(%ebx) /* set current state */ ; \ -0: movl %esi,TIMER_TSTAMP(%ecx) /* set timestamp */ ; \ - movl %edi,TIMER_TSTAMP+4(%ecx) /* set timestamp */ - -/* - * update time on interrupt exit. - * Uses %eax,%ebx,%ecx,%edx,%esi,%edi. - * Restores processor state info from stack. 
- */ -#define TIME_INT_EXIT \ - NANOTIME /* edx:eax nanosecs */ ; \ - movl %eax,%gs:CPU_INT_EVENT_TIME /* save in cpu data */ ; \ - movl %edx,%gs:CPU_INT_EVENT_TIME+4 /* save in cpu data */ ; \ - movl %eax,%esi /* save timestamp */ ; \ - movl %edx,%edi /* save timestamp */ ; \ - movl %gs:CPU_PROCESSOR,%ebx /* get current processor */ ; \ - movl KERNEL_TIMER(%ebx),%ecx /* point to kernel timer */ ; \ - subl TIMER_TSTAMP(%ecx),%eax /* compute elapsed time */ ; \ - sbbl TIMER_TSTAMP+4(%ecx),%edx /* compute elapsed time */ ; \ - TIMER_UPDATE(%ecx,%edx,%eax,0) /* update timer */ ; \ - movl THREAD_TIMER(%ebx),%ecx /* interrupted timer */ ; \ - movl %esi,TIMER_TSTAMP(%ecx) /* set timestamp */ ; \ - movl %edi,TIMER_TSTAMP+4(%ecx) /* set timestamp */ ; \ - movl %esi,%eax /* restore timestamp */ ; \ - movl %edi,%edx /* restore timestamp */ ; \ - movl CURRENT_STATE(%ebx),%ecx /* get current state */ ; \ - subl TIMER_TSTAMP(%ecx),%eax /* compute elapsed time */ ; \ - sbbl TIMER_TSTAMP+4(%ecx),%edx /* compute elapsed time */ ; \ - TIMER_UPDATE(%ecx,%edx,%eax,0) /* update timer */ ; \ - popl %ecx /* restore state */ ; \ - movl %ecx,CURRENT_STATE(%ebx) /* set current state */ ; \ - movl %esi,TIMER_TSTAMP(%ecx) /* set timestamp */ ; \ - movl %edi,TIMER_TSTAMP+4(%ecx) /* set timestamp */ - -#endif /* STAT_TIME */ - -#undef PDEBUG - -#ifdef PDEBUG - -/* - * Traditional, not ANSI. 
- */ -#define CAH(label) \ - .data ;\ - .globl label/**/count ;\ -label/**/count: ;\ - .long 0 ;\ - .globl label/**/limit ;\ -label/**/limit: ;\ - .long 0 ;\ - .text ;\ - addl $1,%ss:label/**/count ;\ - cmpl $0,label/**/limit ;\ - jz label/**/exit ;\ - pushl %eax ;\ -label/**/loop: ;\ - movl %ss:label/**/count,%eax ;\ - cmpl %eax,%ss:label/**/limit ;\ - je label/**/loop ;\ - popl %eax ;\ -label/**/exit: - -#else /* PDEBUG */ - -#define CAH(label) - -#endif /* PDEBUG */ - -#if MACH_KDB -/* - * Last-ditch debug code to handle faults that might result - * from entering kernel (from collocated server) on an invalid - * stack. On collocated entry, there's no hardware-initiated - * stack switch, so a valid stack must be in place when an - * exception occurs, or we may double-fault. - * - * In case of a double-fault, our only recourse is to switch - * hardware "tasks", so that we avoid using the current stack. - * - * The idea here is just to get the processor into the debugger, - * post-haste. No attempt is made to fix up whatever error got - * us here, so presumably continuing from the debugger will - * simply land us here again -- at best. - */ -#if 0 -/* - * Note that the per-fault entry points are not currently - * functional. The only way to make them work would be to - * set up separate TSS's for each fault type, which doesn't - * currently seem worthwhile. (The offset part of a task - * gate is always ignored.) So all faults that task switch - * currently resume at db_task_start. 
- */ -/* - * Double fault (Murphy's point) - error code (0) on stack - */ -Entry(db_task_dbl_fault) - popl %eax - movl $(T_DOUBLE_FAULT),%ebx - jmp db_task_start -/* - * Segment not present - error code on stack - */ -Entry(db_task_seg_np) - popl %eax - movl $(T_SEGMENT_NOT_PRESENT),%ebx - jmp db_task_start -/* - * Stack fault - error code on (current) stack - */ -Entry(db_task_stk_fault) - popl %eax - movl $(T_STACK_FAULT),%ebx - jmp db_task_start -/* - * General protection fault - error code on stack - */ -Entry(db_task_gen_prot) - popl %eax - movl $(T_GENERAL_PROTECTION),%ebx - jmp db_task_start -#endif /* 0 */ -/* - * The entry point where execution resumes after last-ditch debugger task - * switch. - */ -Entry(db_task_start) - movl %esp,%edx - subl $(ISS32_SIZE),%edx - movl %edx,%esp /* allocate x86_saved_state on stack */ - movl %eax,R32_ERR(%esp) - movl %ebx,R32_TRAPNO(%esp) - pushl %edx - CPU_NUMBER(%edx) - movl CX(EXT(master_dbtss),%edx),%edx - movl TSS_LINK(%edx),%eax - pushl %eax /* pass along selector of previous TSS */ - call EXT(db_tss_to_frame) - popl %eax /* get rid of TSS selector */ - call EXT(db_trap_from_asm) - addl $0x4,%esp - /* - * And now...? - */ - iret /* ha, ha, ha... 
*/ -#endif /* MACH_KDB */ /* * Called as a function, makes the current thread @@ -497,650 +158,17 @@ LEXT(thread_bootstrap_return) #endif LEXT(thread_exception_return) - CLI - movl %gs:CPU_KERNEL_STACK,%ecx - - movl (%ecx),%esp /* switch back to PCB stack */ - xorl %ecx,%ecx /* don't check if we're in the PFZ */ - jmp EXT(return_from_trap) - -Entry(call_continuation) - movl S_ARG0,%eax /* get continuation */ - movl S_ARG1,%edx /* continuation param */ - movl S_ARG2,%ecx /* wait result */ - movl %gs:CPU_KERNEL_STACK,%esp /* pop the stack */ - xorl %ebp,%ebp /* zero frame pointer */ - subl $8,%esp /* align the stack */ - pushl %ecx - pushl %edx - call *%eax /* call continuation */ - addl $16,%esp - movl %gs:CPU_ACTIVE_THREAD,%eax - pushl %eax - call EXT(thread_terminate) - - - -/******************************************************************************************************* - * - * All 64 bit task 'exceptions' enter lo_alltraps: - * esp -> x86_saved_state_t - * - * The rest of the state is set up as: - * cr3 -> kernel directory - * esp -> low based stack - * gs -> CPU_DATA_GS - * cs -> KERNEL32_CS - * ss/ds/es -> KERNEL_DS - * - * interrupts disabled - * direction flag cleared - */ -Entry(lo_alltraps) - movl R32_CS(%esp),%eax /* assume 32-bit state */ - cmpl $(SS_64),SS_FLAVOR(%esp)/* 64-bit? 
*/ - jne 1f - movl R64_CS(%esp),%eax /* 64-bit user mode */ -1: - testb $3,%al - jz trap_from_kernel - /* user mode trap */ - TIME_TRAP_UENTRY - - movl %gs:CPU_ACTIVE_THREAD,%ecx - movl ACT_TASK(%ecx),%ebx - - /* Check for active vtimers in the current task */ - cmpl $0,TASK_VTIMERS(%ebx) - jz 1f - - /* Set a pending AST */ - orl $(AST_BSD),%gs:CPU_PENDING_AST - - /* Set a thread AST (atomic) */ - lock - orl $(AST_BSD),ACT_AST(%ecx) - -1: - movl %gs:CPU_KERNEL_STACK,%ebx - xchgl %ebx,%esp /* switch to kernel stack */ - sti - - CCALL1(user_trap, %ebx) /* call user trap routine */ - cli /* hold off intrs - critical section */ - popl %esp /* switch back to PCB stack */ - xorl %ecx,%ecx /* don't check if we're in the PFZ */ - -/* - * Return from trap or system call, checking for ASTs. - * On lowbase PCB stack with intrs disabled - */ -LEXT(return_from_trap) - movl %gs:CPU_PENDING_AST, %eax - testl %eax, %eax - je EXT(return_to_user) /* branch if no AST */ - -LEXT(return_from_trap_with_ast) - movl %gs:CPU_KERNEL_STACK, %ebx - xchgl %ebx, %esp /* switch to kernel stack */ - - testl %ecx, %ecx /* see if we need to check for an EIP in the PFZ */ - je 2f /* no, go handle the AST */ - cmpl $(SS_64), SS_FLAVOR(%ebx) /* are we a 64-bit task? */ - je 1f - /* no... 32-bit user mode */ - movl R32_EIP(%ebx), %eax - pushl %ebx /* save PCB stack */ - xorl %ebp, %ebp /* clear frame pointer */ - CCALL1(commpage_is_in_pfz32, %eax) - popl %ebx /* retrieve pointer to PCB stack */ - testl %eax, %eax - je 2f /* not in the PFZ... go service AST */ - movl %eax, R32_EBX(%ebx) /* let the PFZ know we've pended an AST */ - xchgl %ebx, %esp /* switch back to PCB stack */ - jmp EXT(return_to_user) -1: /* 64-bit user mode */ - movl R64_RIP(%ebx), %ecx - movl R64_RIP+4(%ebx), %eax - pushl %ebx /* save PCB stack */ - xorl %ebp, %ebp /* clear frame pointer */ - CCALL2(commpage_is_in_pfz64, %ecx, %eax) - popl %ebx /* retrieve pointer to PCB stack */ - testl %eax, %eax - je 2f /* not in the PFZ... 
go service AST */ - movl %eax, R64_RBX(%ebx) /* let the PFZ know we've pended an AST */ - xchgl %ebx, %esp /* switch back to PCB stack */ - jmp EXT(return_to_user) -2: - STI /* interrupts always enabled on return to user mode */ - pushl %ebx /* save PCB stack */ - xorl %ebp, %ebp /* Clear framepointer */ - CCALL1(i386_astintr, $0) /* take the AST */ - CLI - - popl %esp /* switch back to PCB stack (w/exc link) */ - - xorl %ecx, %ecx /* don't check if we're in the PFZ */ - jmp EXT(return_from_trap) /* and check again (rare) */ - -LEXT(return_to_user) - TIME_TRAP_UEXIT - -LEXT(ret_to_user) - cmpl $0, %gs:CPU_IS64BIT - je EXT(lo_ret_to_user) - jmp EXT(lo64_ret_to_user) - - - -/* - * Trap from kernel mode. No need to switch stacks. - * Interrupts must be off here - we will set them to state at time of trap - * as soon as it's safe for us to do so and not recurse doing preemption - */ -trap_from_kernel: - movl %esp, %eax /* saved state addr */ - pushl R32_EIP(%esp) /* Simulate a CALL from fault point */ - pushl %ebp /* Extend framepointer chain */ - movl %esp, %ebp - CCALL1(kernel_trap, %eax) /* Call kernel trap handler */ - popl %ebp - addl $4, %esp cli + xorl %ecx,%ecx /* don't check if in the PFZ */ + cmpl $0, %gs:CPU_IS64BIT + je EXT(return_from_trap32) + jmp EXT(return_from_trap) - movl %gs:CPU_PENDING_AST,%eax /* get pending asts */ - testl $ AST_URGENT,%eax /* any urgent preemption? */ - je ret_to_kernel /* no, nothing to do */ - cmpl $ T_PREEMPT,R32_TRAPNO(%esp) - je ret_to_kernel /* T_PREEMPT handled in kernel_trap() */ - testl $ EFL_IF,R32_EFLAGS(%esp) /* interrupts disabled? */ - je ret_to_kernel - cmpl $0,%gs:CPU_PREEMPTION_LEVEL /* preemption disabled? */ - jne ret_to_kernel - movl %gs:CPU_KERNEL_STACK,%eax - movl %esp,%ecx - xorl %eax,%ecx - and EXT(kernel_stack_mask),%ecx - testl %ecx,%ecx /* are we on the kernel stack? 
*/ - jne ret_to_kernel /* no, skip it */ - - CCALL1(i386_astintr, $1) /* take the AST */ - -ret_to_kernel: - cmpl $0, %gs:CPU_IS64BIT - je EXT(lo_ret_to_kernel) - jmp EXT(lo64_ret_to_kernel) - - - -/******************************************************************************************************* - * - * All interrupts on all tasks enter here with: - * esp-> -> x86_saved_state_t - * - * cr3 -> kernel directory - * esp -> low based stack - * gs -> CPU_DATA_GS - * cs -> KERNEL32_CS - * ss/ds/es -> KERNEL_DS - * - * interrupts disabled - * direction flag cleared - */ -Entry(lo_allintrs) - /* - * test whether already on interrupt stack - */ - movl %gs:CPU_INT_STACK_TOP,%ecx - cmpl %esp,%ecx - jb 1f - leal -INTSTACK_SIZE(%ecx),%edx - cmpl %esp,%edx - jb int_from_intstack -1: - xchgl %ecx,%esp /* switch to interrupt stack */ - - movl %cr0,%eax /* get cr0 */ - orl $(CR0_TS),%eax /* or in TS bit */ - movl %eax,%cr0 /* set cr0 */ - - subl $8, %esp /* for 16-byte stack alignment */ - pushl %ecx /* save pointer to old stack */ - movl %ecx,%gs:CPU_INT_STATE /* save intr state */ - - TIME_INT_ENTRY /* do timing */ - - movl %gs:CPU_ACTIVE_THREAD,%ecx - movl ACT_TASK(%ecx),%ebx - - /* Check for active vtimers in the current task */ - cmpl $0,TASK_VTIMERS(%ebx) - jz 1f - - /* Set a pending AST */ - orl $(AST_BSD),%gs:CPU_PENDING_AST - - /* Set a thread AST (atomic) */ - lock - orl $(AST_BSD),ACT_AST(%ecx) - -1: - incl %gs:CPU_PREEMPTION_LEVEL - incl %gs:CPU_INTERRUPT_LEVEL - - movl %gs:CPU_INT_STATE, %eax - CCALL1(interrupt, %eax) /* call generic interrupt routine */ - - cli /* just in case we returned with intrs enabled */ - xorl %eax,%eax - movl %eax,%gs:CPU_INT_STATE /* clear intr state pointer */ - - decl %gs:CPU_INTERRUPT_LEVEL - decl %gs:CPU_PREEMPTION_LEVEL - - TIME_INT_EXIT /* do timing */ - - movl %gs:CPU_ACTIVE_THREAD,%eax - movl ACT_PCB(%eax),%eax /* get act`s PCB */ - movl PCB_FPS(%eax),%eax /* get pcb's ims.ifps */ - cmpl $0,%eax /* Is there a context */ - je 1f 
/* Branch if not */ - movl FP_VALID(%eax),%eax /* Load fp_valid */ - cmpl $0,%eax /* Check if valid */ - jne 1f /* Branch if valid */ - clts /* Clear TS */ - jmp 2f -1: - movl %cr0,%eax /* get cr0 */ - orl $(CR0_TS),%eax /* or in TS bit */ - movl %eax,%cr0 /* set cr0 */ -2: - popl %esp /* switch back to old stack */ - - /* Load interrupted code segment into %eax */ - movl R32_CS(%esp),%eax /* assume 32-bit state */ - cmpl $(SS_64),SS_FLAVOR(%esp)/* 64-bit? */ - jne 3f - movl R64_CS(%esp),%eax /* 64-bit user mode */ -3: - testb $3,%al /* user mode, */ - jnz ast_from_interrupt_user /* go handle potential ASTs */ - /* - * we only want to handle preemption requests if - * the interrupt fell in the kernel context - * and preemption isn't disabled - */ - movl %gs:CPU_PENDING_AST,%eax - testl $ AST_URGENT,%eax /* any urgent requests? */ - je ret_to_kernel /* no, nothing to do */ - - cmpl $0,%gs:CPU_PREEMPTION_LEVEL /* preemption disabled? */ - jne ret_to_kernel /* yes, skip it */ - - movl %gs:CPU_KERNEL_STACK,%eax - movl %esp,%ecx - xorl %eax,%ecx - and EXT(kernel_stack_mask),%ecx - testl %ecx,%ecx /* are we on the kernel stack? */ - jne ret_to_kernel /* no, skip it */ - - /* - * Take an AST from kernel space. We don't need (and don't want) - * to do as much as the case where the interrupt came from user - * space. - */ - CCALL1(i386_astintr, $1) - - jmp ret_to_kernel - - -/* - * nested int - simple path, can't preempt etc on way out - */ -int_from_intstack: - incl %gs:CPU_PREEMPTION_LEVEL - incl %gs:CPU_INTERRUPT_LEVEL - incl %gs:CPU_NESTED_ISTACK - - movl %esp, %edx /* x86_saved_state */ - CCALL1(interrupt, %edx) - - decl %gs:CPU_INTERRUPT_LEVEL - decl %gs:CPU_PREEMPTION_LEVEL - decl %gs:CPU_NESTED_ISTACK - jmp ret_to_kernel - -/* - * Take an AST from an interrupted user - */ -ast_from_interrupt_user: - movl %gs:CPU_PENDING_AST,%eax - testl %eax,%eax /* pending ASTs? 
*/ - je EXT(ret_to_user) /* no, nothing to do */ - - TIME_TRAP_UENTRY - - movl $1, %ecx /* check if we're in the PFZ */ - jmp EXT(return_from_trap_with_ast) /* return */ - - -/******************************************************************************************************* - * - * 32bit Tasks - * System call entries via INTR_GATE or sysenter: - * - * esp -> x86_saved_state32_t - * cr3 -> kernel directory - * esp -> low based stack - * gs -> CPU_DATA_GS - * cs -> KERNEL32_CS - * ss/ds/es -> KERNEL_DS - * - * interrupts disabled - * direction flag cleared - */ - -Entry(lo_sysenter) - /* - * We can be here either for a mach syscall or a unix syscall, - * as indicated by the sign of the code: - */ - movl R32_EAX(%esp),%eax - testl %eax,%eax - js EXT(lo_mach_scall) /* < 0 => mach */ - /* > 0 => unix */ - -Entry(lo_unix_scall) - TIME_TRAP_UENTRY - - movl %gs:CPU_ACTIVE_THREAD,%ecx /* get current thread */ - movl ACT_TASK(%ecx),%ebx /* point to current task */ - addl $1,TASK_SYSCALLS_UNIX(%ebx) /* increment call count */ - - /* Check for active vtimers in the current task */ - cmpl $0,TASK_VTIMERS(%ebx) - jz 1f - - /* Set a pending AST */ - orl $(AST_BSD),%gs:CPU_PENDING_AST - - /* Set a thread AST (atomic) */ - lock - orl $(AST_BSD),ACT_AST(%ecx) - -1: - movl %gs:CPU_KERNEL_STACK,%ebx - xchgl %ebx,%esp /* switch to kernel stack */ - - sti - - CCALL1(unix_syscall, %ebx) - /* - * always returns through thread_exception_return - */ - - -Entry(lo_mach_scall) - TIME_TRAP_UENTRY - - movl %gs:CPU_ACTIVE_THREAD,%ecx /* get current thread */ - movl ACT_TASK(%ecx),%ebx /* point to current task */ - addl $1,TASK_SYSCALLS_MACH(%ebx) /* increment call count */ - - /* Check for active vtimers in the current task */ - cmpl $0,TASK_VTIMERS(%ebx) - jz 1f - - /* Set a pending AST */ - orl $(AST_BSD),%gs:CPU_PENDING_AST - - /* Set a thread AST (atomic) */ - lock - orl $(AST_BSD),ACT_AST(%ecx) - -1: - movl %gs:CPU_KERNEL_STACK,%ebx - xchgl %ebx,%esp /* switch to kernel stack */ - - 
sti - - CCALL1(mach_call_munger, %ebx) - /* - * always returns through thread_exception_return - */ - - -Entry(lo_mdep_scall) - TIME_TRAP_UENTRY - - movl %gs:CPU_ACTIVE_THREAD,%ecx /* get current thread */ - movl ACT_TASK(%ecx),%ebx /* point to current task */ - - /* Check for active vtimers in the current task */ - cmpl $0,TASK_VTIMERS(%ebx) - jz 1f - - /* Set a pending AST */ - orl $(AST_BSD),%gs:CPU_PENDING_AST - - /* Set a thread AST (atomic) */ - lock - orl $(AST_BSD),ACT_AST(%ecx) - -1: - movl %gs:CPU_KERNEL_STACK,%ebx - xchgl %ebx,%esp /* switch to kernel stack */ - - sti - - CCALL1(machdep_syscall, %ebx) - /* - * always returns through thread_exception_return - */ - - -Entry(lo_diag_scall) - TIME_TRAP_UENTRY - - movl %gs:CPU_ACTIVE_THREAD,%ecx /* get current thread */ - movl ACT_TASK(%ecx),%ebx /* point to current task */ - - /* Check for active vtimers in the current task */ - cmpl $0,TASK_VTIMERS(%ebx) - jz 1f - - /* Set a pending AST */ - orl $(AST_BSD),%gs:CPU_PENDING_AST - - /* Set a thread AST (atomic) */ - lock - orl $(AST_BSD),ACT_AST(%ecx) - -1: - movl %gs:CPU_KERNEL_STACK,%ebx // Get the address of the kernel stack - xchgl %ebx,%esp // Switch to it, saving the previous - - CCALL1(diagCall, %ebx) // Call diagnostics - - cmpl $0,%eax // What kind of return is this? - je 2f - cli // Disable interruptions just in case they were enabled - popl %esp // Get back the original stack - jmp EXT(return_to_user) // Normal return, do not check asts... 
-2: - CCALL5(i386_exception, $EXC_SYSCALL, $0x6000, $0, $1, $0) - // pass what would be the diag syscall - // error return - cause an exception - /* no return */ - - - -/******************************************************************************************************* - * - * 64bit Tasks - * System call entries via syscall only: - * - * esp -> x86_saved_state64_t - * cr3 -> kernel directory - * esp -> low based stack - * gs -> CPU_DATA_GS - * cs -> KERNEL32_CS - * ss/ds/es -> KERNEL_DS - * - * interrupts disabled - * direction flag cleared - */ - -Entry(lo_syscall) - TIME_TRAP_UENTRY - - /* - * We can be here either for a mach, unix machdep or diag syscall, - * as indicated by the syscall class: - */ - movl R64_RAX(%esp), %eax /* syscall number/class */ - movl %eax, %ebx - andl $(SYSCALL_CLASS_MASK), %ebx /* syscall class */ - cmpl $(SYSCALL_CLASS_MACH<> *scale) >= divisor)) - * (*scale)++; - * *scale = 32 - *scale; - * return ((dividend << *scale) / divisor); - */ -ENTRY(div_scale) - PUSH_FRAME - xorl %ecx, %ecx /* *scale = 0 */ - xorl %eax, %eax - movl ARG0, %edx /* get dividend */ -0: - cmpl ARG1, %edx /* if (divisor > dividend) */ - jle 1f /* goto 1f */ - addl $1, %ecx /* (*scale)++ */ - shrdl $1, %edx, %eax /* dividend >> 1 */ - shrl $1, %edx /* dividend >> 1 */ - jmp 0b /* goto 0b */ -1: - divl ARG1 /* (dividend << (32 - *scale)) / divisor */ - movl ARG2, %edx /* get scale */ - movl $32, (%edx) /* *scale = 32 */ - subl %ecx, (%edx) /* *scale -= %ecx */ - POP_FRAME - ret - -/* - * unsigned int - * mul_scale(unsigned int multiplicand, - * unsigned int multiplier, - * unsigned int *scale) - * - * This function returns ((multiplicand * multiplier) >> *scale) where - * scale is the largest possible value before overflow. This is used in - * computation where precision must be achieved in order to avoid - * floating point usage. 
- * - * Algorithm: - * *scale = 0; - * while (overflow((multiplicand * multiplier) >> *scale)) - * (*scale)++; - * return ((multiplicand * multiplier) >> *scale); - */ -ENTRY(mul_scale) - PUSH_FRAME - xorl %ecx, %ecx /* *scale = 0 */ - movl ARG0, %eax /* get multiplicand */ - mull ARG1 /* multiplicand * multiplier */ -0: - cmpl $0, %edx /* if (!overflow()) */ - je 1f /* goto 1 */ - addl $1, %ecx /* (*scale)++ */ - shrdl $1, %edx, %eax /* (multiplicand * multiplier) >> 1 */ - shrl $1, %edx /* (multiplicand * multiplier) >> 1 */ - jmp 0b -1: - movl ARG2, %edx /* get scale */ - movl %ecx, (%edx) /* set *scale */ - POP_FRAME - ret - - - -/* - * Double-fault exception handler task. The last gasp... - */ -Entry(df_task_start) - CCALL1(panic_double_fault32, $(T_DOUBLE_FAULT)) - hlt - - -/* - * machine-check handler task. The last gasp... - */ -Entry(mc_task_start) - CCALL1(panic_machine_check32, $(T_MACHINE_CHECK)) - hlt - -/* - * Compatibility mode's last gasp... - */ -Entry(lo_df64) - movl %esp, %eax - CCALL1(panic_double_fault64, %eax) - hlt - -Entry(lo_mc64) - movl %esp, %eax - CCALL1(panic_machine_check64, %eax) - hlt - diff --git a/osfmk/i386/loose_ends.c b/osfmk/i386/loose_ends.c index bc205fe1e..ee59d599a 100644 --- a/osfmk/i386/loose_ends.c +++ b/osfmk/i386/loose_ends.c @@ -78,7 +78,6 @@ #include #include - #if 0 #undef KERNEL_DEBUG @@ -105,8 +104,7 @@ void machine_callstack(natural_t *buf, vm_size_t callstack_max); #define value_64bit(value) ((value) & 0xFFFFFFFF00000000LL) #define low32(x) ((unsigned int)((x) & 0x00000000FFFFFFFFLL)) - - +#define JOE_DEBUG 0 void bzero_phys_nc( @@ -166,6 +164,38 @@ bcopy_phys( mp_enable_preemption(); } +/* + * allow a function to get a quick virtual mapping of a physical page + */ + +int +apply_func_phys( + addr64_t dst64, + vm_size_t bytes, + int (*func)(void * buffer, vm_size_t bytes, void * arg), + void * arg) +{ + mapwindow_t *dst_map; + int rc = -1; + + /* ensure we stay within a page */ + if ( ((((uint32_t)dst64 & 
(NBPG-1)) + bytes) > NBPG) ) { + panic("apply_func_phys alignment"); + } + mp_disable_preemption(); + + dst_map = pmap_get_mapwindow((pt_entry_t)(INTEL_PTE_VALID | INTEL_PTE_RW | ((pmap_paddr_t)dst64 & PG_FRAME) | + INTEL_PTE_REF | INTEL_PTE_MOD)); + + rc = func((void *)((uintptr_t)dst_map->prv_CADDR | ((uint32_t)dst64 & INTEL_OFFMASK)), bytes, arg); + + pmap_put_mapwindow(dst_map); + + mp_enable_preemption(); + + return rc; +} + /* * ovbcopy - like bcopy, but recognizes overlapping ranges and handles * them correctly. @@ -324,7 +354,7 @@ ml_phys_write_data(pmap_paddr_t paddr, unsigned long data, int size) break; case 4: default: - *(unsigned int *)((uintptr_t)map->prv_CADDR | ((uint32_t)paddr & INTEL_OFFMASK)) = data; + *(unsigned int *)((uintptr_t)map->prv_CADDR | ((uint32_t)paddr & INTEL_OFFMASK)) = (uint32_t)data; break; } pmap_put_mapwindow(map); @@ -455,7 +485,7 @@ int bcmp( break; while (--len); - return len; + return (int)len; } int @@ -569,7 +599,7 @@ void dcache_incoherent_io_store64(addr64_t pa, unsigned int count) count += offset; offset = (uint32_t)(addr & ((addr64_t) (page_size - 1))); - chunk = page_size - offset; + chunk = (uint32_t)page_size - offset; do { @@ -581,7 +611,7 @@ void dcache_incoherent_io_store64(addr64_t pa, unsigned int count) count -= chunk; addr += chunk; - chunk = page_size; + chunk = (uint32_t) page_size; offset = 0; if (count) { @@ -603,11 +633,23 @@ void dcache_incoherent_io_flush64(addr64_t pa, unsigned int count) return(dcache_incoherent_io_store64(pa,count)); } + void -flush_dcache64(__unused addr64_t addr, - __unused unsigned count, - __unused int phys) +flush_dcache64(addr64_t addr, unsigned count, int phys) { + if (phys) { + dcache_incoherent_io_flush64(addr, count); + } + else { + uint64_t linesize = cpuid_info()->cache_linesize; /* 64-bit so ~(linesize - 1) keeps the upper address bits of bound */ + addr64_t bound = (addr + count + linesize - 1) & ~(linesize - 1); + __mfence(); + while (addr < bound) { + __clflush((void *) (uintptr_t) addr); + addr += linesize; + } + __mfence(); + } } 
void @@ -661,570 +703,78 @@ cache_flush_page_phys(ppnum_t pa) } -/* - * the copy engine has the following characteristics - * - copyio handles copies to/from user or kernel space - * - copypv deals with physical or virtual addresses - * - * implementation details as follows - * - a cache of up to NCOPY_WINDOWS is maintained per thread for - * access of user virutal space - * - the window size is determined by the amount of virtual space - * that can be mapped by a single page table - * - the mapping is done by copying the page table pointer from - * the user's directory entry corresponding to the window's - * address in user space to the directory entry corresponding - * to the window slot in the kernel's address space - * - the set of mappings is preserved across context switches, - * so the copy can run with pre-emption enabled - * - there is a gdt entry set up to anchor the kernel window on - * each processor - * - the copies are done using the selector corresponding to the - * gdt entry - * - the addresses corresponding to the user virtual address are - * relative to the beginning of the window being used to map - * that region... thus the thread can be pre-empted and switched - * to a different processor while in the midst of a copy - * - the window caches must be invalidated if the pmap changes out - * from under the thread... this can happen during vfork/exec... - * inval_copy_windows is the invalidation routine to be used - * - the copyio engine has 4 different states associated with it - * that allows for lazy tlb flushes and the ability to avoid - * a flush all together if we've just come from user space - * the 4 states are as follows... 
- * - * WINDOWS_OPENED - set by copyio to indicate to the context - * switch code that it is necessary to do a tlbflush after - * switching the windows since we're in the middle of a copy - * - * WINDOWS_CLOSED - set by copyio to indicate that it's done - * using the windows, so that the context switch code need - * not do the tlbflush... instead it will set the state to... - * - * WINDOWS_DIRTY - set by the context switch code to indicate - * to the copy engine that it is responsible for doing a - * tlbflush before using the windows again... it's also - * set by the inval_copy_windows routine to indicate the - * same responsibility. - * - * WINDOWS_CLEAN - set by the return to user path to indicate - * that a tlbflush has happened and that there is no need - * for copyio to do another when it is entered next... - * - * - a window for mapping single physical pages is provided for copypv - * - this window is maintained across context switches and has the - * same characteristics as the user space windows w/r to pre-emption - */ - -extern int copyout_user(const char *, vm_offset_t, vm_size_t); -extern int copyout_kern(const char *, vm_offset_t, vm_size_t); -extern int copyin_user(const vm_offset_t, char *, vm_size_t); -extern int copyin_kern(const vm_offset_t, char *, vm_size_t); -extern int copyoutphys_user(const char *, vm_offset_t, vm_size_t); -extern int copyoutphys_kern(const char *, vm_offset_t, vm_size_t); -extern int copyinphys_user(const vm_offset_t, char *, vm_size_t); -extern int copyinphys_kern(const vm_offset_t, char *, vm_size_t); -extern int copyinstr_user(const vm_offset_t, char *, vm_size_t, vm_size_t *); -extern int copyinstr_kern(const vm_offset_t, char *, vm_size_t, vm_size_t *); - -static int copyio(int, user_addr_t, char *, vm_size_t, vm_size_t *, int); -static int copyio_phys(addr64_t, addr64_t, vm_size_t, int); - - -#define COPYIN 0 -#define COPYOUT 1 -#define COPYINSTR 2 -#define COPYINPHYS 3 -#define COPYOUTPHYS 4 - - -void 
inval_copy_windows(thread_t thread) +#if !MACH_KDP +void +kdp_register_callout(void) { - int i; - - for (i = 0; i < NCOPY_WINDOWS; i++) { - thread->machine.copy_window[i].user_base = -1; - } - thread->machine.nxt_window = 0; - thread->machine.copyio_state = WINDOWS_DIRTY; - - KERNEL_DEBUG(0xeff70058 | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), (int)thread->map, 0, 0, 0); } - - -static int -copyio(int copy_type, user_addr_t user_addr, char *kernel_addr, - vm_size_t nbytes, vm_size_t *lencopied, int use_kernel_map) -{ - thread_t thread; - pmap_t pmap; - pt_entry_t *updp; - pt_entry_t *kpdp; - user_addr_t user_base; - vm_offset_t user_offset; - vm_offset_t kern_vaddr; - vm_size_t cnt; - vm_size_t bytes_copied; - int error = 0; - int window_index; - int copyio_state; - boolean_t istate; -#if KDEBUG - int debug_type = 0xeff70010; - debug_type += (copy_type << 2); -#endif - - thread = current_thread(); - - KERNEL_DEBUG(debug_type | DBG_FUNC_START, (int)(user_addr >> 32), (int)user_addr, - (int)nbytes, thread->machine.copyio_state, 0); - - if (nbytes == 0) { - KERNEL_DEBUG(debug_type | DBG_FUNC_END, (unsigned)user_addr, - (unsigned)kernel_addr, (unsigned)nbytes, 0, 0); - return (0); - } - pmap = thread->map->pmap; - - if (pmap == kernel_pmap || use_kernel_map) { - - kern_vaddr = (vm_offset_t)user_addr; - - switch (copy_type) { - - case COPYIN: - error = copyin_kern(kern_vaddr, kernel_addr, nbytes); - break; - - case COPYOUT: - error = copyout_kern(kernel_addr, kern_vaddr, nbytes); - break; - - case COPYINSTR: - error = copyinstr_kern(kern_vaddr, kernel_addr, nbytes, lencopied); - break; - - case COPYINPHYS: - error = copyinphys_kern(kern_vaddr, kernel_addr, nbytes); - break; - - case COPYOUTPHYS: - error = copyoutphys_kern(kernel_addr, kern_vaddr, nbytes); - break; - } - KERNEL_DEBUG(debug_type | DBG_FUNC_END, (unsigned)kern_vaddr, - (unsigned)kernel_addr, (unsigned)nbytes, - error | 0x80000000, 0); - return (error); - } - -#if CONFIG_DTRACE - thread->machine.specFlags 
|= CopyIOActive; -#endif /* CONFIG_DTRACE */ - - if ((nbytes && (user_addr + nbytes <= user_addr)) || - (user_addr < vm_map_min(thread->map)) || - (user_addr + nbytes > vm_map_max(thread->map))) { - error = EFAULT; - goto done; - } - - user_base = user_addr & ~((user_addr_t)(NBPDE - 1)); - user_offset = (vm_offset_t)(user_addr & (NBPDE - 1)); - - KERNEL_DEBUG(debug_type | DBG_FUNC_NONE, (int)(user_base >> 32), (int)user_base, - (int)user_offset, 0, 0); - - cnt = NBPDE - user_offset; - - if (cnt > nbytes) - cnt = nbytes; - - istate = ml_set_interrupts_enabled(FALSE); - - copyio_state = thread->machine.copyio_state; - thread->machine.copyio_state = WINDOWS_OPENED; - - (void) ml_set_interrupts_enabled(istate); - - - for (;;) { - - for (window_index = 0; window_index < NCOPY_WINDOWS; window_index++) { - if (thread->machine.copy_window[window_index].user_base == user_base) - break; - } - if (window_index >= NCOPY_WINDOWS) { - - window_index = thread->machine.nxt_window; - thread->machine.nxt_window++; - - if (thread->machine.nxt_window >= NCOPY_WINDOWS) - thread->machine.nxt_window = 0; - thread->machine.copy_window[window_index].user_base = user_base; - - /* - * it's necessary to disable pre-emption - * since I have to compute the kernel descriptor pointer - * for the new window - */ - istate = ml_set_interrupts_enabled(FALSE); - - updp = pmap_pde(pmap, user_base); - - kpdp = current_cpu_datap()->cpu_copywindow_pdp; - kpdp += window_index; - - pmap_store_pte(kpdp, updp ? 
*updp : 0); - - (void) ml_set_interrupts_enabled(istate); - - copyio_state = WINDOWS_DIRTY; - - KERNEL_DEBUG(0xeff70040 | DBG_FUNC_NONE, window_index, - (unsigned)user_base, (unsigned)updp, - (unsigned)kpdp, 0); - - } -#if JOE_DEBUG - else { - istate = ml_set_interrupts_enabled(FALSE); - - updp = pmap_pde(pmap, user_base); - - kpdp = current_cpu_datap()->cpu_copywindow_pdp; - - kpdp += window_index; - - if ((*kpdp & PG_FRAME) != (*updp & PG_FRAME)) { - panic("copyio: user pdp mismatch - kpdp = 0x%qx, updp = 0x%qx\n", *kpdp, *updp); - } - (void) ml_set_interrupts_enabled(istate); - } -#endif - if (copyio_state == WINDOWS_DIRTY) { - flush_tlb(); - - copyio_state = WINDOWS_CLEAN; - - KERNEL_DEBUG(0xeff70054 | DBG_FUNC_NONE, window_index, 0, 0, 0, 0); - } - user_offset += (window_index * NBPDE); - - KERNEL_DEBUG(0xeff70044 | DBG_FUNC_NONE, (unsigned)user_offset, - (unsigned)kernel_addr, cnt, 0, 0); - - switch (copy_type) { - - case COPYIN: - error = copyin_user(user_offset, kernel_addr, cnt); - break; - - case COPYOUT: - error = copyout_user(kernel_addr, user_offset, cnt); - break; - - case COPYINPHYS: - error = copyinphys_user(user_offset, kernel_addr, cnt); - break; - - case COPYOUTPHYS: - error = copyoutphys_user(kernel_addr, user_offset, cnt); - break; - - case COPYINSTR: - error = copyinstr_user(user_offset, kernel_addr, cnt, &bytes_copied); - - /* - * lencopied should be updated on success - * or ENAMETOOLONG... but not EFAULT - */ - if (error != EFAULT) - *lencopied += bytes_copied; - - /* - * if we still have room, then the ENAMETOOLONG - * is just an artifact of the buffer straddling - * a window boundary and we should continue - */ - if (error == ENAMETOOLONG && nbytes > cnt) - error = 0; - - if (error) { -#if KDEBUG - nbytes = *lencopied; -#endif - break; - } - if (*(kernel_addr + bytes_copied - 1) == 0) { - /* - * we found a NULL terminator... 
we're done - */ -#if KDEBUG - nbytes = *lencopied; -#endif - goto done; - } - if (cnt == nbytes) { - /* - * no more room in the buffer and we haven't - * yet come across a NULL terminator - */ -#if KDEBUG - nbytes = *lencopied; #endif - error = ENAMETOOLONG; - break; - } - assert(cnt == bytes_copied); - - break; - } - if (error) - break; - if ((nbytes -= cnt) == 0) - break; - - kernel_addr += cnt; - user_base += NBPDE; - user_offset = 0; - - if (nbytes > NBPDE) - cnt = NBPDE; - else - cnt = nbytes; - } -done: - thread->machine.copyio_state = WINDOWS_CLOSED; - - KERNEL_DEBUG(debug_type | DBG_FUNC_END, (unsigned)user_addr, - (unsigned)kernel_addr, (unsigned)nbytes, error, 0); -#if CONFIG_DTRACE - thread->machine.specFlags &= ~CopyIOActive; -#endif /* CONFIG_DTRACE */ - - return (error); +#if !CONFIG_VMX +int host_vmxon(boolean_t exclusive __unused) +{ + return VMX_UNSUPPORTED; } - -static int -copyio_phys(addr64_t source, addr64_t sink, vm_size_t csize, int which) +void host_vmxoff(void) { - pmap_paddr_t paddr; - user_addr_t vaddr; - char *window_offset; - pt_entry_t pentry; - int ctype; - int retval; - boolean_t istate; - - if (which & cppvPsnk) { - paddr = (pmap_paddr_t)sink; - vaddr = (user_addr_t)source; - ctype = COPYINPHYS; - pentry = (pt_entry_t)(INTEL_PTE_VALID | (paddr & PG_FRAME) | INTEL_PTE_RW); - } else { - paddr = (pmap_paddr_t)source; - vaddr = (user_addr_t)sink; - ctype = COPYOUTPHYS; - pentry = (pt_entry_t)(INTEL_PTE_VALID | (paddr & PG_FRAME)); - } - window_offset = (char *)((uint32_t)paddr & (PAGE_SIZE - 1)); - - assert(!((current_thread()->machine.specFlags & CopyIOActive) && ((which & cppvKmap) == 0))); - - if (current_thread()->machine.physwindow_busy) { - pt_entry_t old_pentry; - - KERNEL_DEBUG(0xeff70048 | DBG_FUNC_NONE, paddr, csize, 0, -1, 0); - /* - * we had better be targeting wired memory at this point - * we will not be able to handle a fault with interrupts - * disabled... 
we disable them because we can't tolerate - * being preempted during this nested use of the window - */ - istate = ml_set_interrupts_enabled(FALSE); - - old_pentry = *(current_cpu_datap()->cpu_physwindow_ptep); - pmap_store_pte((current_cpu_datap()->cpu_physwindow_ptep), pentry); - - invlpg((uintptr_t)current_cpu_datap()->cpu_physwindow_base); - - retval = copyio(ctype, vaddr, window_offset, csize, NULL, which & cppvKmap); - - pmap_store_pte((current_cpu_datap()->cpu_physwindow_ptep), old_pentry); - - invlpg((uintptr_t)current_cpu_datap()->cpu_physwindow_base); - - (void) ml_set_interrupts_enabled(istate); - } else { - /* - * mark the window as in use... if an interrupt hits while we're - * busy, or we trigger another coyppv from the fault path into - * the driver on a user address space page fault due to a copyin/out - * then we need to save and restore the current window state instead - * of caching the window preserving it across context switches - */ - current_thread()->machine.physwindow_busy = 1; - - if (current_thread()->machine.physwindow_pte != pentry) { - KERNEL_DEBUG(0xeff70048 | DBG_FUNC_NONE, paddr, csize, 0, 0, 0); - - current_thread()->machine.physwindow_pte = pentry; - - /* - * preemption at this point would be bad since we - * could end up on the other processor after we grabbed the - * pointer to the current cpu data area, but before we finished - * using it to stuff the page table entry since we would - * be modifying a window that no longer belonged to us - * the invlpg can be done unprotected since it only flushes - * this page address from the tlb... 
if it flushes the wrong - * one, no harm is done, and the context switch that moved us - * to the other processor will have already take care of - * flushing the tlb after it reloaded the page table from machine.physwindow_pte - */ - istate = ml_set_interrupts_enabled(FALSE); - - pmap_store_pte((current_cpu_datap()->cpu_physwindow_ptep), pentry); - (void) ml_set_interrupts_enabled(istate); - - invlpg((uintptr_t)current_cpu_datap()->cpu_physwindow_base); - } -#if JOE_DEBUG - else { - if (pentry != - (*(current_cpu_datap()->cpu_physwindow_ptep) & (INTEL_PTE_VALID | PG_FRAME | INTEL_PTE_RW))) - panic("copyio_phys: pentry != *physwindow_ptep"); - } + return; +} #endif - retval = copyio(ctype, vaddr, window_offset, csize, NULL, which & cppvKmap); - current_thread()->machine.physwindow_busy = 0; - } - return (retval); -} +#ifdef __LP64__ -int -copyinmsg(const user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes) -{ - return (copyio(COPYIN, user_addr, kernel_addr, nbytes, NULL, 0)); -} +#define INT_SIZE (BYTE_SIZE * sizeof (int)) -int -copyin(const user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes) +/* + * Set indicated bit in bit string. + */ +void +setbit(int bitno, int *s) { - return (copyio(COPYIN, user_addr, kernel_addr, nbytes, NULL, 0)); + s[bitno / INT_SIZE] |= 1 << (bitno % INT_SIZE); } -int -copyinstr(const user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes, vm_size_t *lencopied) +/* + * Clear indicated bit in bit string. + */ +void +clrbit(int bitno, int *s) { - *lencopied = 0; - - return (copyio(COPYINSTR, user_addr, kernel_addr, nbytes, lencopied, 0)); + s[bitno / INT_SIZE] &= ~(1 << (bitno % INT_SIZE)); } +/* + * Test if indicated bit is set in bit string. 
+ */ int -copyoutmsg(const char *kernel_addr, user_addr_t user_addr, vm_size_t nbytes) +testbit(int bitno, int *s) { - return (copyio(COPYOUT, user_addr, (char *)(uintptr_t)kernel_addr, nbytes, NULL, 0)); + return s[bitno / INT_SIZE] & (1 << (bitno % INT_SIZE)); } +/* + * Find first bit set in bit string. + */ int -copyout(const void *kernel_addr, user_addr_t user_addr, vm_size_t nbytes) -{ - return (copyio(COPYOUT, user_addr, (char *)(uintptr_t)kernel_addr, nbytes, NULL, 0)); -} - - -kern_return_t -copypv(addr64_t src64, addr64_t snk64, unsigned int size, int which) +ffsbit(int *s) { - unsigned int lop, csize; - int bothphys = 0; - - KERNEL_DEBUG(0xeff7004c | DBG_FUNC_START, (unsigned)src64, - (unsigned)snk64, size, which, 0); - - if ((which & (cppvPsrc | cppvPsnk)) == 0 ) /* Make sure that only one is virtual */ - panic("copypv: no more than 1 parameter may be virtual\n"); /* Not allowed */ - - if ((which & (cppvPsrc | cppvPsnk)) == (cppvPsrc | cppvPsnk)) - bothphys = 1; /* both are physical */ - - while (size) { - - if (bothphys) { - lop = (unsigned int)(PAGE_SIZE - (snk64 & (PAGE_SIZE - 1))); /* Assume sink smallest */ - - if (lop > (unsigned int)(PAGE_SIZE - (src64 & (PAGE_SIZE - 1)))) - lop = (unsigned int)(PAGE_SIZE - (src64 & (PAGE_SIZE - 1))); /* No, source is smaller */ - } else { - /* - * only need to compute the resid for the physical page - * address... we don't care about where we start/finish in - * the virtual since we just call the normal copyin/copyout - */ - if (which & cppvPsrc) - lop = (unsigned int)(PAGE_SIZE - (src64 & (PAGE_SIZE - 1))); - else - lop = (unsigned int)(PAGE_SIZE - (snk64 & (PAGE_SIZE - 1))); - } - csize = size; /* Assume we can copy it all */ - if (lop < size) - csize = lop; /* Nope, we can't do it all */ -#if 0 - /* - * flush_dcache64 is currently a nop on the i386... - * it's used when copying to non-system memory such - * as video capture cards... on PPC there was a need - * to flush due to how we mapped this memory... 
not - * sure if it's needed on i386. - */ - if (which & cppvFsrc) - flush_dcache64(src64, csize, 1); /* If requested, flush source before move */ - if (which & cppvFsnk) - flush_dcache64(snk64, csize, 1); /* If requested, flush sink before move */ -#endif - if (bothphys) - bcopy_phys(src64, snk64, csize); /* Do a physical copy, virtually */ - else { - if (copyio_phys(src64, snk64, csize, which)) - return (KERN_FAILURE); - } -#if 0 - if (which & cppvFsrc) - flush_dcache64(src64, csize, 1); /* If requested, flush source after move */ - if (which & cppvFsnk) - flush_dcache64(snk64, csize, 1); /* If requested, flush sink after move */ -#endif - size -= csize; /* Calculate what is left */ - snk64 += csize; /* Bump sink to next physical address */ - src64 += csize; /* Bump source to next physical address */ - } - KERNEL_DEBUG(0xeff7004c | DBG_FUNC_END, (unsigned)src64, - (unsigned)snk64, size, which, 0); + int offset; - return KERN_SUCCESS; -} - -#if !MACH_KDP -void -kdp_register_callout(void) -{ + for (offset = 0; !*s; offset += (int)INT_SIZE, ++s); + return offset + __builtin_ctz(*s); } -#endif -#if !CONFIG_VMX -int host_vmxon(boolean_t exclusive __unused) +int +ffs(unsigned int mask) { - return VMX_UNSUPPORTED; -} + if (mask == 0) + return 0; -void host_vmxoff(void) -{ - return; + /* + * NOTE: cannot use __builtin_ffs because it generates a call to + * 'ffs' + */ + return 1 + __builtin_ctz(mask); } #endif diff --git a/osfmk/i386/machine_check.c b/osfmk/i386/machine_check.c index 6eaff9d8f..77681d340 100644 --- a/osfmk/i386/machine_check.c +++ b/osfmk/i386/machine_check.c @@ -45,6 +45,7 @@ static uint32_t mca_family = 0; static unsigned int mca_error_bank_count = 0; static boolean_t mca_control_MSR_present = FALSE; static boolean_t mca_threshold_status_present = FALSE; +static boolean_t mca_sw_error_recovery_present = FALSE; static boolean_t mca_extended_MSRs_present = FALSE; static unsigned int mca_extended_MSRs_count = 0; static boolean_t mca_cmci_present = FALSE; @@ 
-89,6 +90,7 @@ mca_get_availability(void) mca_error_bank_count = ia32_mcg_cap.bits.count; mca_control_MSR_present = ia32_mcg_cap.bits.mcg_ctl_p; mca_threshold_status_present = ia32_mcg_cap.bits.mcg_tes_p; + mca_sw_error_recovery_present = ia32_mcg_cap.bits.mcg_ser_p; mca_cmci_present = ia32_mcg_cap.bits.mcg_ext_corr_err_p; if (family == 0x0F) { mca_extended_MSRs_present = ia32_mcg_cap.bits.mcg_ext_p; @@ -269,25 +271,22 @@ static void mca_dump_32bit_state(void) static void mca_report_cpu_info(void) { - uint64_t microcode; i386_cpu_info_t *infop = cpuid_info(); - // microcode revision is top 32 bits of MSR_IA32_UCODE_REV - microcode = rdmsr64(MSR_IA32_UCODE_REV) >> 32; kdb_printf(" family: %d model: %d stepping: %d microcode: %d\n", infop->cpuid_family, infop->cpuid_model, infop->cpuid_stepping, - (uint32_t) microcode); + infop->cpuid_microcode_version); kdb_printf(" %s\n", infop->cpuid_brand_string); } static const char *mc8_memory_operation[] = { - [MC8_MMM_GENERIC] "generic", - [MC8_MMM_READ] "read", - [MC8_MMM_WRITE] "write", - [MC8_MMM_ADDRESS_COMMAND] "address/command", - [MC8_MMM_RESERVED] "reserved" + [MC8_MMM_GENERIC] = "generic", + [MC8_MMM_READ] = "read", + [MC8_MMM_WRITE] = "write", + [MC8_MMM_ADDRESS_COMMAND] = "address/command", + [MC8_MMM_RESERVED] = "reserved" }; static void @@ -312,19 +311,20 @@ mca_dump_bank_mc8(mca_state_t *state, int i) kdb_printf( " Channel number: %d%s\n" " Memory Operation: %s\n" - " Machine-specific error: %s%s%s%s%s%s%s%s\n" + " Machine-specific error: %s%s%s%s%s%s%s%s%s\n" " COR_ERR_CNT: %d\n", mc8.channel_number, IF(mc8.channel_number == 15, " (unknown)"), mc8_memory_operation[mmm], - IF(mc8.read_ecc, "Read ECC"), - IF(mc8.ecc_on_a_scrub, "ECC on scrub"), - IF(mc8.write_parity, "Write parity"), - IF(mc8.redundant_memory, "Redundant memory"), - IF(mc8.sparing, "Sparing/Resilvering"), - IF(mc8.access_out_of_range, "Access out of Range"), - IF(mc8.address_parity, "Address Parity"), - IF(mc8.byte_enable_parity, "Byte Enable 
Parity"), + IF(mc8.read_ecc, "Read ECC "), + IF(mc8.ecc_on_a_scrub, "ECC on scrub "), + IF(mc8.write_parity, "Write parity "), + IF(mc8.redundant_memory, "Redundant memory "), + IF(mc8.sparing, "Sparing/Resilvering "), + IF(mc8.access_out_of_range, "Access out of Range "), + IF(mc8.rtid_out_of_range, "RTID out of Range "), + IF(mc8.address_parity, "Address Parity "), + IF(mc8.byte_enable_parity, "Byte Enable Parity "), mc8.cor_err_cnt); kdb_printf( " Status bits:\n%s%s%s%s%s%s", @@ -344,10 +344,12 @@ mca_dump_bank_mc8(mca_state_t *state, int i) mc8_misc.u64 = bank->mca_mci_misc; kdb_printf( " IA32_MC%d_MISC(0x%x): 0x%016qx\n" + " RTID: %d\n" " DIMM: %d\n" " Channel: %d\n" " Syndrome: 0x%x\n", i, IA32_MCi_MISC(i), mc8_misc.u64, + mc8_misc.bits.rtid, mc8_misc.bits.dimm, mc8_misc.bits.channel, (int) mc8_misc.bits.syndrome); @@ -355,10 +357,10 @@ mca_dump_bank_mc8(mca_state_t *state, int i) } static const char *mca_threshold_status[] = { - [THRESHOLD_STATUS_NO_TRACKING] "No tracking", - [THRESHOLD_STATUS_GREEN] "Green", - [THRESHOLD_STATUS_YELLOW] "Yellow", - [THRESHOLD_STATUS_RESERVED] "Reserved" + [THRESHOLD_STATUS_NO_TRACKING] = "No tracking", + [THRESHOLD_STATUS_GREEN] = "Green", + [THRESHOLD_STATUS_YELLOW] = "Yellow", + [THRESHOLD_STATUS_RESERVED] = "Reserved" }; static void @@ -395,6 +397,13 @@ mca_dump_bank(mca_state_t *state, int i) mca_threshold_status[threshold] : "Undefined"); } + if (mca_threshold_status_present && + mca_sw_error_recovery_present) { + kdb_printf( + " Software Error Recovery:\n%s%s", + IF(status.bits_tes_p.ar, " Recovery action reqd\n"), + IF(status.bits_tes_p.s, " Signaling UCR error\n")); + } kdb_printf( " Status bits:\n%s%s%s%s%s%s", IF(status.bits.pcc, " Processor context corrupt\n"), diff --git a/osfmk/i386/machine_check.h b/osfmk/i386/machine_check.h index e940fa8c0..70c75c826 100644 --- a/osfmk/i386/machine_check.h +++ b/osfmk/i386/machine_check.h @@ -58,6 +58,7 @@ typedef union { uint64_t mcg_ecms :BIT1(12); uint64_t mcg_reserved2 
:BITS(15,13); uint64_t mcg_ext_cnt :BITS(23,16); + uint64_t mcg_ser_p :BIT1(24); } bits; uint64_t u64; } ia32_mcg_cap_t; @@ -128,11 +129,13 @@ typedef union { uint64_t over :BIT1(62); uint64_t val :BIT1(63); } bits; - struct { /* Variant if threshold-based error status present: */ + struct { /* Variant if threshold-based error status present: */ uint64_t mca_error :BITS(15,0); uint64_t model_specific_error :BITS(31,16); uint64_t other_information :BITS(52,32); uint64_t threshold :BITS(54,53); + uint64_t ar :BIT1(55); + uint64_t s :BIT1(56); uint64_t pcc :BIT1(57); uint64_t addrv :BIT1(58); uint64_t miscv :BIT1(59); @@ -151,6 +154,7 @@ typedef union { uint64_t redundant_memory :BIT1(19); uint64_t sparing :BIT1(20); uint64_t access_out_of_range :BIT1(21); + uint64_t rtid_out_of_range :BIT1(22); uint64_t address_parity :BIT1(23); uint64_t byte_enable_parity :BIT1(24); uint64_t reserved :BITS(37,25); @@ -173,7 +177,8 @@ typedef union { #define MC8_MMM_RESERVED 4 typedef union { struct { - uint64_t reserved1 :BITS(15,0); + uint64_t rtid :BITS(7,0); + uint64_t reserved1 :BITS(15,8); uint64_t dimm :BITS(17,16); uint64_t channel :BITS(19,18); uint64_t reserved2 :BITS(31,20); diff --git a/osfmk/i386/machine_cpu.h b/osfmk/i386/machine_cpu.h index 2460bf606..82532f088 100644 --- a/osfmk/i386/machine_cpu.h +++ b/osfmk/i386/machine_cpu.h @@ -41,6 +41,7 @@ void cpu_machine_init( void handle_pending_TLB_flushes( void); +int cpu_signal_handler(x86_saved_state_t *regs); kern_return_t cpu_register( int *slot_nump); diff --git a/osfmk/i386/machine_routines.c b/osfmk/i386/machine_routines.c index 4525c8a31..b7d3f559a 100644 --- a/osfmk/i386/machine_routines.c +++ b/osfmk/i386/machine_routines.c @@ -38,16 +38,16 @@ #include #include #include +#include #include #include +#include #include #include #include #include #include #include -#include - #if MACH_KDB #include #include @@ -65,7 +65,6 @@ #define DBG(x...) 
#endif - extern void wakeup(void *); static int max_cpus_initialized = 0; @@ -75,6 +74,10 @@ unsigned int LockTimeOutTSC; unsigned int MutexSpin; uint64_t LastDebuggerEntryAllowance; +extern uint64_t panic_restart_timeout; + +boolean_t virtualized = FALSE; + #define MAX_CPUS_SET 0x1 #define MAX_CPUS_WAIT 0x2 @@ -131,7 +134,6 @@ ml_static_mfree( assert((vaddr & (PAGE_SIZE-1)) == 0); /* must be page aligned */ - for (vaddr_cur = vaddr; vaddr_cur < round_page_64(vaddr+size); vaddr_cur += PAGE_SIZE) { @@ -207,7 +209,6 @@ void ml_init_interrupt(void) } - /* Get Interrupts Enabled */ boolean_t ml_get_interrupts_enabled(void) { @@ -220,27 +221,25 @@ boolean_t ml_get_interrupts_enabled(void) /* Set Interrupts Enabled */ boolean_t ml_set_interrupts_enabled(boolean_t enable) { - unsigned long flags; - - __asm__ volatile("pushf; pop %0" : "=r" (flags)); + unsigned long flags; + boolean_t istate; + + __asm__ volatile("pushf; pop %0" : "=r" (flags)); - if (enable) { - ast_t *myast; + istate = ((flags & EFL_IF) != 0); - myast = ast_pending(); + if (enable) { + __asm__ volatile("sti;nop"); - if ( (get_preemption_level() == 0) && (*myast & AST_URGENT) ) { - __asm__ volatile("sti"); - __asm__ volatile ("int $0xff"); - } else { - __asm__ volatile ("sti"); + if ((get_preemption_level() == 0) && (*ast_pending() & AST_URGENT)) + __asm__ volatile ("int $0xff"); + } + else { + if (istate) + __asm__ volatile("cli"); } - } - else { - __asm__ volatile("cli"); - } - return (flags & EFL_IF) != 0; + return istate; } /* Check if running at interrupt context */ @@ -435,7 +434,10 @@ ml_cpu_get_info(ml_cpu_info_t *cpu_infop) * As distinct from whether the cpu has these capabilities. 
*/ os_supports_sse = !!(get_cr4() & CR4_OSXMM); - if ((cpuid_features() & CPUID_FEATURE_SSE4_2) && os_supports_sse) + + if (ml_fpu_avx_enabled()) + cpu_infop->vector_unit = 9; + else if ((cpuid_features() & CPUID_FEATURE_SSE4_2) && os_supports_sse) cpu_infop->vector_unit = 8; else if ((cpuid_features() & CPUID_FEATURE_SSE4_1) && os_supports_sse) cpu_infop->vector_unit = 7; @@ -525,7 +527,8 @@ ml_init_lock_timeout(void) uint32_t mtxspin; uint64_t default_timeout_ns = NSEC_PER_SEC>>2; uint32_t slto; - + uint32_t prt; + if (PE_parse_boot_argn("slto_us", &slto, sizeof (slto))) default_timeout_ns = slto * NSEC_PER_USEC; @@ -544,6 +547,9 @@ ml_init_lock_timeout(void) MutexSpin = (unsigned int)abstime; nanoseconds_to_absolutetime(4ULL * NSEC_PER_SEC, &LastDebuggerEntryAllowance); + if (PE_parse_boot_argn("panic_restart_timeout", &prt, sizeof (prt))) + nanoseconds_to_absolutetime(prt * NSEC_PER_SEC, &panic_restart_timeout); + virtualized = ((cpuid_features() & CPUID_FEATURE_VMM) != 0); interrupt_latency_tracker_setup(); } @@ -649,8 +655,33 @@ vm_offset_t ml_stack_remaining(void) } } +void +kernel_preempt_check(void) +{ + boolean_t intr; + unsigned long flags; + + assert(get_preemption_level() == 0); + + __asm__ volatile("pushf; pop %0" : "=r" (flags)); + + intr = ((flags & EFL_IF) != 0); + + if ((*ast_pending() & AST_URGENT) && intr == TRUE) { + /* + * can handle interrupts and preemptions + * at this point + */ + + /* + * now cause the PRE-EMPTION trap + */ + __asm__ volatile ("int %0" :: "N" (T_PREEMPT)); + } +} + boolean_t machine_timeout_suspended(void) { - return (mp_recent_debugger_activity() || panic_active() || pmap_tlb_flush_timeout || spinlock_timed_out); + return (virtualized || pmap_tlb_flush_timeout || spinlock_timed_out || panic_active() || mp_recent_debugger_activity()); } #if MACH_KDB diff --git a/osfmk/i386/machine_routines.h b/osfmk/i386/machine_routines.h index e222fb18d..42f77f6c4 100644 --- a/osfmk/i386/machine_routines.h +++ 
b/osfmk/i386/machine_routines.h @@ -67,7 +67,6 @@ void ml_cpu_set_ldt(int); /* Initialize Interrupts */ void ml_init_interrupt(void); - /* Generate a fake interrupt */ void ml_cause_interrupt(void); @@ -132,11 +131,9 @@ void ml_get_bouncepool_info( boolean_t machine_timeout_suspended(void); #endif /* PEXPERT_KERNEL_PRIVATE || MACH_KERNEL_PRIVATE */ +/* Warm up a CPU to receive an interrupt */ +kern_return_t ml_interrupt_prewarm(uint64_t deadline); -void interrupt_latency_tracker_setup(void); -void interrupt_reset_latency_stats(void); -void interrupt_populate_latency_stats(char *, unsigned); -boolean_t ml_fpu_avx_enabled(void); #endif /* XNU_KERNEL_PRIVATE */ #ifdef KERNEL_PRIVATE @@ -235,14 +232,14 @@ void ml_phys_write_double_64( /* Struct for ml_cpu_get_info */ struct ml_cpu_info { - unsigned long vector_unit; - unsigned long cache_line_size; - unsigned long l1_icache_size; - unsigned long l1_dcache_size; - unsigned long l2_settings; - unsigned long l2_cache_size; - unsigned long l3_settings; - unsigned long l3_cache_size; + uint32_t vector_unit; + uint32_t cache_line_size; + uint32_t l1_icache_size; + uint32_t l1_dcache_size; + uint32_t l2_settings; + uint32_t l2_cache_size; + uint32_t l3_settings; + uint32_t l3_cache_size; }; typedef struct ml_cpu_info ml_cpu_info_t; @@ -273,6 +270,7 @@ extern void ml_set_maxbusdelay(uint32_t mdelay); extern uint32_t ml_get_maxbusdelay(void); extern void ml_set_maxintdelay(uint64_t mdelay); extern uint64_t ml_get_maxintdelay(void); +extern boolean_t ml_get_interrupt_prewake_applicable(void); extern uint64_t tmrCvt(uint64_t time, uint64_t conversion); @@ -304,6 +302,11 @@ void ml_get_csw_threads(thread_t * /*old*/, thread_t * /*new*/); __END_DECLS +#ifdef XNU_KERNEL_PRIVATE +boolean_t ml_fpu_avx_enabled(void); +void interrupt_latency_tracker_setup(void); +void interrupt_reset_latency_stats(void); +void interrupt_populate_latency_stats(char *, unsigned); - +#endif /* XNU_KERNEL_PRIVATE */ #endif /* _I386_MACHINE_ROUTINES_H_ */ 
diff --git a/osfmk/i386/machine_routines_asm.s b/osfmk/i386/machine_routines_asm.s index ae2e8aaf1..0e3d9fb68 100644 --- a/osfmk/i386/machine_routines_asm.s +++ b/osfmk/i386/machine_routines_asm.s @@ -27,12 +27,11 @@ */ #include -#include -#include +#include #include - +#include #include -#include +#include #include /* @@ -136,51 +135,6 @@ LEXT(tmrCvt) ret // Leave... -/* void _rtc_nanotime_store(uint64_t tsc, - uint64_t nsec, - uint32_t scale, - uint32_t shift, - rtc_nanotime_t *dst) ; -*/ - .globl EXT(_rtc_nanotime_store) - .align FALIGN - -LEXT(_rtc_nanotime_store) - push %ebp - movl %esp,%ebp - push %esi - - mov 32(%ebp),%edx /* get ptr to rtc_nanotime_info */ - - movl RNT_GENERATION(%edx),%esi /* get current generation */ - movl $0,RNT_GENERATION(%edx) /* flag data as being updated */ - - mov 8(%ebp),%eax - mov %eax,RNT_TSC_BASE(%edx) - mov 12(%ebp),%eax - mov %eax,RNT_TSC_BASE+4(%edx) - - mov 24(%ebp),%eax - mov %eax,RNT_SCALE(%edx) - - mov 28(%ebp),%eax - mov %eax,RNT_SHIFT(%edx) - - mov 16(%ebp),%eax - mov %eax,RNT_NS_BASE(%edx) - mov 20(%ebp),%eax - mov %eax,RNT_NS_BASE+4(%edx) - - incl %esi /* next generation */ - jnz 1f - incl %esi /* skip 0, which is a flag */ -1: movl %esi,RNT_GENERATION(%edx) /* update generation and make usable */ - - pop %esi - pop %ebp - ret - - /* void _rtc_nanotime_adjust( uint64_t tsc_base_delta, rtc_nanotime_t *dst); @@ -252,7 +206,7 @@ LEXT(_rtc_nanotime_read) jnz Lslow /* Processor whose TSC frequency is faster than SLOW_TSC_THRESHOLD */ - RTC_NANOTIME_READ_FAST() + PAL_RTC_NANOTIME_READ_FAST() popl %ebx popl %edi @@ -316,3 +270,42 @@ Lslow: pop %ebp ret /* result in edx:eax */ + + +/* + * Timing routines. 
+ */ +Entry(timer_update) + movl 4(%esp),%ecx + movl 8(%esp),%eax + movl 12(%esp),%edx + movl %eax,TIMER_HIGHCHK(%ecx) + movl %edx,TIMER_LOW(%ecx) + movl %eax,TIMER_HIGH(%ecx) + ret + +Entry(timer_grab) + movl 4(%esp),%ecx +0: movl TIMER_HIGH(%ecx),%edx + movl TIMER_LOW(%ecx),%eax + cmpl TIMER_HIGHCHK(%ecx),%edx + jne 0b + ret + + +Entry(call_continuation) + movl S_ARG0,%eax /* get continuation */ + movl S_ARG1,%edx /* continuation param */ + movl S_ARG2,%ecx /* wait result */ + movl %gs:CPU_KERNEL_STACK,%esp /* pop the stack */ + xorl %ebp,%ebp /* zero frame pointer */ + subl $8,%esp /* align the stack */ + pushl %ecx + pushl %edx + call *%eax /* call continuation */ + addl $16,%esp + movl %gs:CPU_ACTIVE_THREAD,%eax + pushl %eax + call EXT(thread_terminate) + + diff --git a/osfmk/i386/misc_protos.h b/osfmk/i386/misc_protos.h index 71e707c07..724490e22 100644 --- a/osfmk/i386/misc_protos.h +++ b/osfmk/i386/misc_protos.h @@ -105,6 +105,9 @@ extern unsigned int mul_scale( /* Move arbitrarily-aligned data from one physical address to another */ extern void bcopy_phys(addr64_t from, addr64_t to, vm_size_t nbytes); +/* allow a function to get a quick virtual mapping of a physical page */ +extern int apply_func_phys(addr64_t src64, vm_size_t bytes, int (*func)(void * buffer, vm_size_t bytes, void * arg), void * arg); + extern void ml_copy_phys(addr64_t, addr64_t, vm_size_t); /* Flush all cachelines for a page. */ @@ -161,6 +164,8 @@ copy_debug_state32(x86_debug_state32_t *src, x86_debug_state32_t *target, boolea void copy_debug_state64(x86_debug_state64_t *src, x86_debug_state64_t *target, boolean_t all); +extern void act_machine_switch_pcb(thread_t old, thread_t new); + /* Fast-restart parameters */ #define FULL_SLAVE_INIT (NULL) #define FAST_SLAVE_INIT ((void *)(uintptr_t)1) diff --git a/osfmk/i386/mp.c b/osfmk/i386/mp.c index 021f0638f..e90a298f9 100644 --- a/osfmk/i386/mp.c +++ b/osfmk/i386/mp.c @@ -1,5 +1,4 @@ /* - * Copyright (c) 2000-2009 Apple Inc. 
All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -49,6 +48,8 @@ #include #include #include +#include +#include #include #include @@ -65,11 +66,9 @@ #include #include #include -#include #include #include #include -#include #include #if CONFIG_MCA #include @@ -99,10 +98,17 @@ #define PAUSE #endif /* MP_DEBUG */ +/* Debugging/test trace events: */ +#define TRACE_MP_TLB_FLUSH MACHDBG_CODE(DBG_MACH_MP, 0) +#define TRACE_MP_CPUS_CALL MACHDBG_CODE(DBG_MACH_MP, 1) +#define TRACE_MP_CPUS_CALL_LOCAL MACHDBG_CODE(DBG_MACH_MP, 2) +#define TRACE_MP_CPUS_CALL_ACTION MACHDBG_CODE(DBG_MACH_MP, 3) +#define TRACE_MP_CPUS_CALL_NOBUF MACHDBG_CODE(DBG_MACH_MP, 4) #define ABS(v) (((v) > 0)?(v):-(v)) void slave_boot_init(void); +void i386_cpu_IPI(int cpu); #if MACH_KDB static void mp_kdb_wait(void); @@ -115,7 +121,6 @@ static void mp_rendezvous_action(void); static void mp_broadcast_action(void); static boolean_t cpu_signal_pending(int cpu, mp_event_t event); -static int cpu_signal_handler(x86_saved_state_t *regs); static int NMIInterruptHandler(x86_saved_state_t *regs); boolean_t smp_initialized = FALSE; @@ -165,11 +170,18 @@ lck_mtx_ext_t mp_bc_lock_ext; static volatile int debugger_cpu = -1; volatile long NMIPI_acks = 0; +static void mp_cpus_call_init(void); +static void mp_cpus_call_cpu_init(void); static void mp_cpus_call_action(void); static void mp_call_PM(void); char mp_slave_stack[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); // Temp stack for slave init +/* PAL-related routines */ +boolean_t i386_smp_init(int nmi_vector, i386_intr_func_t nmi_handler, + int ipi_vector, i386_intr_func_t ipi_handler); +void i386_start_cpu(int lapic_id, int cpu_num); +void i386_send_NMI(int cpu); #if GPROF /* @@ -193,7 +205,22 @@ struct profile_vars *_profile_vars_cpus[MAX_CPUS] = { &_profile_vars }; static lck_grp_t smp_lck_grp; static lck_grp_attr_t smp_lck_grp_attr; -extern void slave_pstart(void); +#define NUM_CPU_WARM_CALLS 20 +struct timer_call 
cpu_warm_call_arr[NUM_CPU_WARM_CALLS]; +queue_head_t cpu_warm_call_list; +decl_simple_lock_data(static, cpu_warm_lock); + +typedef struct cpu_warm_data { + timer_call_t cwd_call; + uint64_t cwd_deadline; + int cwd_result; +} *cpu_warm_data_t; + +static void cpu_prewarm_init(void); +static void cpu_warm_timer_call_func(call_entry_param_t p0, call_entry_param_t p1); +static void _cpu_warm_setup(void *arg); +static timer_call_t grab_warm_timer_call(void); +static void free_warm_timer_call(timer_call_t call); void smp_init(void) @@ -206,27 +233,25 @@ smp_init(void) lck_mtx_init_ext(&mp_bc_lock, &mp_bc_lock_ext, &smp_lck_grp, LCK_ATTR_NULL); console_init(); - /* Local APIC? */ - if (!lapic_probe()) + if(!i386_smp_init(LAPIC_NMI_INTERRUPT, NMIInterruptHandler, + LAPIC_VECTOR(INTERPROCESSOR), cpu_signal_handler)) return; - lapic_init(); - lapic_configure(); - lapic_set_intr_func(LAPIC_NMI_INTERRUPT, NMIInterruptHandler); - lapic_set_intr_func(LAPIC_VECTOR(INTERPROCESSOR), cpu_signal_handler); - cpu_thread_init(); GPROF_INIT(); DBGLOG_CPU_INIT(master_cpu); - install_real_mode_bootstrap(slave_pstart); + mp_cpus_call_init(); + mp_cpus_call_cpu_init(); if (PE_parse_boot_argn("TSC_sync_margin", &TSC_sync_margin, sizeof(TSC_sync_margin))) kprintf("TSC sync Margin 0x%x\n", TSC_sync_margin); smp_initialized = TRUE; + cpu_prewarm_init(); + return; } @@ -285,6 +310,7 @@ intel_startCPU_fast(int slot_num) * longer than a full restart would require so it should be more * than long enough. 
*/ + mp_wait_for_cpu_up(slot_num, 30000, 1); mp_enable_preemption(); @@ -328,12 +354,7 @@ start_cpu(void *arg) if (cpu_number() != psip->starter_cpu) return; - LAPIC_WRITE(ICRD, psip->target_lapic << LAPIC_ICRD_DEST_SHIFT); - LAPIC_WRITE(ICR, LAPIC_ICR_DM_INIT); - delay(100); - - LAPIC_WRITE(ICRD, psip->target_lapic << LAPIC_ICRD_DEST_SHIFT); - LAPIC_WRITE(ICR, LAPIC_ICR_DM_STARTUP|(REAL_MODE_BOOTSTRAP_OFFSET>>12)); + i386_start_cpu(psip->target_lapic, psip->target_cpu); #ifdef POSTCODE_DELAY /* Wait much longer if postcodes are displayed for a delay period. */ @@ -391,7 +412,7 @@ intel_startCPU( DBGLOG_CPU_INIT(slot_num); DBG("intel_startCPU(%d) lapic_id=%d\n", slot_num, lapic); - DBG("IdlePTD(%p): 0x%x\n", &IdlePTD, (int) IdlePTD); + DBG("IdlePTD(%p): 0x%x\n", &IdlePTD, (int) (uintptr_t)IdlePTD); /* * Initialize (or re-initialize) the descriptor tables for this cpu. @@ -459,7 +480,7 @@ cpu_signal_handler(x86_saved_state_t *regs) int i=100; #endif /* MACH_KDB && MACH_ASSERT */ - mp_disable_preemption(); + SCHED_STATS_IPI(current_processor()); my_cpu = cpu_number(); my_word = &cpu_data_ptr[my_cpu]->cpu_signals; @@ -467,6 +488,7 @@ cpu_signal_handler(x86_saved_state_t *regs) * signals could arrive while these are being processed * so it's no more than a hint. 
*/ + cpu_data_ptr[my_cpu]->cpu_prior_signals = *my_word; do { @@ -530,8 +552,6 @@ cpu_signal_handler(x86_saved_state_t *regs) } } while (*my_word); - mp_enable_preemption(); - return 0; } @@ -540,6 +560,13 @@ NMIInterruptHandler(x86_saved_state_t *regs) { void *stackptr; + if (panic_active() && !panicDebugging) { + if (pmsafe_debug) + pmSafeMode(¤t_cpu_datap()->lcpu, PM_SAFE_FL_SAFE); + for(;;) + cpu_pause(); + } + atomic_incl(&NMIPI_acks, 1); sync_iss_to_iks_unconditionally(regs); #if defined (__i386__) @@ -555,11 +582,10 @@ NMIInterruptHandler(x86_saved_state_t *regs) char pstr[160]; snprintf(&pstr[0], sizeof(pstr), "Panic(CPU %d): NMIPI for spinlock acquisition timeout, spinlock: %p, spinlock owner: %p, current_thread: %p, spinlock_owner_cpu: 0x%x\n", cpu_number(), spinlock_timed_out, (void *) spinlock_timed_out->interlock.lock_data, current_thread(), spinlock_owner_cpu); panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs); - } else if (pmap_tlb_flush_timeout == TRUE) { char pstr[128]; - snprintf(&pstr[0], sizeof(pstr), "Panic(CPU %d): Unresponsive processor, TLB state:%d\n", cpu_number(), current_cpu_datap()->cpu_tlb_invalid); - panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs); + snprintf(&pstr[0], sizeof(pstr), "Panic(CPU %d): Unresponsive processor (this CPU did not acknowledge interrupts) TLB state:%d\n", cpu_number(), current_cpu_datap()->cpu_tlb_invalid); + panic_i386_backtrace(stackptr, 48, &pstr[0], TRUE, regs); } #if MACH_KDP @@ -574,51 +600,6 @@ NMIInterruptHandler(x86_saved_state_t *regs) return 1; } -#ifdef MP_DEBUG -int max_lock_loops = 100000000; -int trappedalready = 0; /* (BRINGUP) */ -#endif /* MP_DEBUG */ - -static void -i386_cpu_IPI(int cpu) -{ - boolean_t state; - -#ifdef MP_DEBUG - if(cpu_datap(cpu)->cpu_signals & 6) { /* (BRINGUP) */ - kprintf("i386_cpu_IPI: sending enter debugger signal (%08X) to cpu %d\n", cpu_datap(cpu)->cpu_signals, cpu); - } -#endif /* MP_DEBUG */ - -#if MACH_KDB -#ifdef MP_DEBUG - if(!trappedalready && 
(cpu_datap(cpu)->cpu_signals & 6)) { /* (BRINGUP) */ - if(kdb_cpu != cpu_number()) { - trappedalready = 1; - panic("i386_cpu_IPI: sending enter debugger signal (%08X) to cpu %d and I do not own debugger, owner = %08X\n", - cpu_datap(cpu)->cpu_signals, cpu, kdb_cpu); - } - } -#endif /* MP_DEBUG */ -#endif - - /* Wait for previous interrupt to be delivered... */ -#ifdef MP_DEBUG - int pending_busy_count = 0; - while (LAPIC_READ(ICR) & LAPIC_ICR_DS_PENDING) { - if (++pending_busy_count > max_lock_loops) - panic("i386_cpu_IPI() deadlock\n"); -#else - while (LAPIC_READ(ICR) & LAPIC_ICR_DS_PENDING) { -#endif /* MP_DEBUG */ - cpu_pause(); - } - - state = ml_set_interrupts_enabled(FALSE); - LAPIC_WRITE(ICRD, cpu_to_lapic[cpu] << LAPIC_ICRD_DEST_SHIFT); - LAPIC_WRITE(ICR, LAPIC_VECTOR(INTERPROCESSOR) | LAPIC_ICR_DM_FIXED); - (void) ml_set_interrupts_enabled(state); -} /* * cpu_interrupt is really just to be used by the scheduler to @@ -628,10 +609,15 @@ i386_cpu_IPI(int cpu) void cpu_interrupt(int cpu) { + boolean_t did_IPI = FALSE; + if (smp_initialized && pmCPUExitIdle(cpu_datap(cpu))) { i386_cpu_IPI(cpu); + did_IPI = TRUE; } + + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), cpu, did_IPI, 0, 0, 0); } /* @@ -640,17 +626,8 @@ cpu_interrupt(int cpu) void cpu_NMI_interrupt(int cpu) { - boolean_t state; - if (smp_initialized) { - state = ml_set_interrupts_enabled(FALSE); -/* Program the interrupt command register */ - LAPIC_WRITE(ICRD, cpu_to_lapic[cpu] << LAPIC_ICRD_DEST_SHIFT); -/* The vector is ignored in this case--the target CPU will enter on the - * NMI vector. 
- */ - LAPIC_WRITE(ICR, LAPIC_VECTOR(INTERPROCESSOR)|LAPIC_ICR_DM_NMI); - (void) ml_set_interrupts_enabled(state); + i386_send_NMI(cpu); } } @@ -695,7 +672,7 @@ i386_signal_cpu(int cpu, mp_event_t event, mp_sync_t mode) return; if (event == MP_TLB_FLUSH) - KERNEL_DEBUG(0xef800020 | DBG_FUNC_START, cpu, 0, 0, 0, 0); + KERNEL_DEBUG(TRACE_MP_TLB_FLUSH | DBG_FUNC_START, cpu, 0, 0, 0, 0); DBGLOG(cpu_signal, cpu, event); @@ -714,7 +691,7 @@ i386_signal_cpu(int cpu, mp_event_t event, mp_sync_t mode) } } if (event == MP_TLB_FLUSH) - KERNEL_DEBUG(0xef800020 | DBG_FUNC_END, cpu, 0, 0, 0, 0); + KERNEL_DEBUG(TRACE_MP_TLB_FLUSH | DBG_FUNC_END, cpu, 0, 0, 0, 0); } /* @@ -780,7 +757,6 @@ mp_rendezvous_action(void) intrs_enabled = ml_get_interrupts_enabled(); - /* spin on entry rendezvous */ atomic_incl(&mp_rv_entry, 1); while (mp_rv_entry < mp_rv_ncpus) { @@ -789,9 +765,11 @@ mp_rendezvous_action(void) handle_pending_TLB_flushes(); cpu_pause(); } + /* action function */ if (mp_rv_action_func != NULL) mp_rv_action_func(mp_rv_func_arg); + /* spin on exit rendezvous */ atomic_incl(&mp_rv_exit, 1); while (mp_rv_exit < mp_rv_ncpus) { @@ -799,6 +777,7 @@ mp_rendezvous_action(void) handle_pending_TLB_flushes(); cpu_pause(); } + /* teardown function */ if (mp_rv_teardown_func != NULL) mp_rv_teardown_func(mp_rv_func_arg); @@ -907,38 +886,186 @@ mp_rendezvous_no_intrs( arg); } -void -handle_pending_TLB_flushes(void) + +typedef struct { + queue_chain_t link; /* queue linkage */ + void (*func)(void *,void *); /* routine to call */ + void *arg0; /* routine's 1st arg */ + void *arg1; /* routine's 2nd arg */ + volatile long *countp; /* completion counter */ +} mp_call_t; + +#define MP_CPUS_CALL_BUFS_PER_CPU MAX_CPUS +static queue_head_t mp_cpus_call_freelist; +static queue_head_t mp_cpus_call_queue[MAX_CPUS]; +/* + * The free list and the per-cpu call queues are protected by the following + * lock which is taken wil interrupts disabled. 
+ */ +decl_simple_lock_data(,mp_cpus_call_lock); + +static inline boolean_t +mp_call_lock(void) +{ + boolean_t intrs_enabled; + + intrs_enabled = ml_set_interrupts_enabled(FALSE); + simple_lock(&mp_cpus_call_lock); + + return intrs_enabled; +} + +static inline boolean_t +mp_call_is_locked(void) +{ + return !ml_get_interrupts_enabled() && + hw_lock_held((hw_lock_t)&mp_cpus_call_lock); +} + +static inline void +mp_call_unlock(boolean_t intrs_enabled) +{ + simple_unlock(&mp_cpus_call_lock); + ml_set_interrupts_enabled(intrs_enabled); +} + +static inline mp_call_t * +mp_call_alloc(void) +{ + mp_call_t *callp; + + assert(mp_call_is_locked()); + if (queue_empty(&mp_cpus_call_freelist)) + return NULL; + queue_remove_first(&mp_cpus_call_freelist, callp, typeof(callp), link); + return callp; +} + +static inline void +mp_call_free(mp_call_t *callp) { - volatile int *my_word = ¤t_cpu_datap()->cpu_signals; + assert(mp_call_is_locked()); + queue_enter_first(&mp_cpus_call_freelist, callp, typeof(callp), link); +} + +static inline mp_call_t * +mp_call_dequeue(queue_t call_queue) +{ + mp_call_t *callp; - if (i_bit(MP_TLB_FLUSH, my_word) && (pmap_tlb_flush_timeout == FALSE)) { - DBGLOG(cpu_handle, cpu_number(), MP_TLB_FLUSH); - i_bit_clear(MP_TLB_FLUSH, my_word); - pmap_update_interrupt(); + assert(mp_call_is_locked()); + if (queue_empty(call_queue)) + return NULL; + queue_remove_first(call_queue, callp, typeof(callp), link); + return callp; +} + +/* Called on the boot processor to initialize global structures */ +static void +mp_cpus_call_init(void) +{ + DBG("mp_cpus_call_init()\n"); + simple_lock_init(&mp_cpus_call_lock, 0); + queue_init(&mp_cpus_call_freelist); +} + +/* + * Called by each processor to add call buffers to the free list + * and to initialize the per-cpu call queue. + * Also called but ignored on slave processors on re-start/wake. 
+ */ +static void +mp_cpus_call_cpu_init(void) +{ + boolean_t intrs_enabled; + int i; + mp_call_t *callp; + + if (mp_cpus_call_queue[cpu_number()].next != NULL) + return; /* restart/wake case: called already */ + + queue_init(&mp_cpus_call_queue[cpu_number()]); + for (i = 0; i < MP_CPUS_CALL_BUFS_PER_CPU; i++) { + callp = (mp_call_t *) kalloc(sizeof(mp_call_t)); + intrs_enabled = mp_call_lock(); + mp_call_free(callp); + mp_call_unlock(intrs_enabled); } + + DBG("mp_cpus_call_init() done on cpu %d\n", cpu_number()); } /* * This is called from cpu_signal_handler() to process an MP_CALL signal. + * And also from i386_deactivate_cpu() when a cpu is being taken offline. */ static void mp_cpus_call_action(void) { - if (mp_rv_action_func != NULL) - mp_rv_action_func(mp_rv_func_arg); - atomic_incl(&mp_rv_complete, 1); + queue_t cpu_head; + boolean_t intrs_enabled; + mp_call_t *callp; + mp_call_t call; + + assert(!ml_get_interrupts_enabled()); + cpu_head = &mp_cpus_call_queue[cpu_number()]; + intrs_enabled = mp_call_lock(); + while ((callp = mp_call_dequeue(cpu_head)) != NULL) { + /* Copy call request to the stack to free buffer */ + call = *callp; + mp_call_free(callp); + if (call.func != NULL) { + mp_call_unlock(intrs_enabled); + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPUS_CALL_ACTION, + call.func, call.arg0, call.arg1, call.countp, 0); + call.func(call.arg0, call.arg1); + (void) mp_call_lock(); + } + if (call.countp != NULL) + atomic_incl(call.countp, 1); + } + mp_call_unlock(intrs_enabled); +} + +static boolean_t +mp_call_queue( + int cpu, + void (*action_func)(void *, void *), + void *arg0, + void *arg1, + volatile long *countp) +{ + queue_t cpu_head = &mp_cpus_call_queue[cpu]; + mp_call_t *callp; + + assert(mp_call_is_locked()); + callp = mp_call_alloc(); + if (callp == NULL) + return FALSE; + + callp->func = action_func; + callp->arg0 = arg0; + callp->arg1 = arg1; + callp->countp = countp; + + queue_enter(cpu_head, callp, typeof(callp), link); + + return TRUE; } /* * 
mp_cpus_call() runs a given function on cpus specified in a given cpu mask. - * If the mode is SYNC, the function is called serially on the target cpus - * in logical cpu order. If the mode is ASYNC, the function is called in - * parallel over the specified cpus. + * Possible modes are: + * SYNC: function is called serially on target cpus in logical cpu order + * waiting for each call to be acknowledged before proceeding + * ASYNC: function call is queued to the specified cpus + * waiting for all calls to complete in parallel before returning + * NOSYNC: function calls are queued + * but we return before confirmation of calls completing. * The action function may be NULL. * The cpu mask may include the local cpu. Offline cpus are ignored. - * Return does not occur until the function has completed on all cpus. - * The return value is the number of cpus on which the function was called. + * The return value is the number of cpus on which the call was made or queued. */ cpu_t mp_cpus_call( @@ -946,32 +1073,77 @@ mp_cpus_call( mp_sync_t mode, void (*action_func)(void *), void *arg) +{ + return mp_cpus_call1( + cpus, + mode, + (void (*)(void *,void *))action_func, + arg, + NULL, + NULL, + NULL); +} + +static void +mp_cpus_call_wait(boolean_t intrs_enabled, + long mp_cpus_signals, + volatile long *mp_cpus_calls) +{ + queue_t cpu_head; + + cpu_head = &mp_cpus_call_queue[cpu_number()]; + + while (*mp_cpus_calls < mp_cpus_signals) { + if (!intrs_enabled) { + if (!queue_empty(cpu_head)) + mp_cpus_call_action(); + + handle_pending_TLB_flushes(); + } + cpu_pause(); + } +} + +cpu_t +mp_cpus_call1( + cpumask_t cpus, + mp_sync_t mode, + void (*action_func)(void *, void *), + void *arg0, + void *arg1, + cpumask_t *cpus_calledp, + cpumask_t *cpus_notcalledp) { cpu_t cpu; - boolean_t intrs_enabled = ml_get_interrupts_enabled(); + boolean_t intrs_enabled = FALSE; boolean_t call_self = FALSE; + cpumask_t cpus_called = 0; + cpumask_t cpus_notcalled = 0; + long mp_cpus_signals = 0; + 
volatile long mp_cpus_calls = 0; + + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPUS_CALL | DBG_FUNC_START, + cpus, mode, action_func, arg0, arg1); if (!smp_initialized) { if ((cpus & CPUMASK_SELF) == 0) - return 0; + goto out; if (action_func != NULL) { - (void) ml_set_interrupts_enabled(FALSE); - action_func(arg); + intrs_enabled = ml_set_interrupts_enabled(FALSE); + action_func(arg0, arg1); ml_set_interrupts_enabled(intrs_enabled); } - return 1; + call_self = TRUE; + goto out; } - - /* obtain rendezvous lock */ - simple_lock(&mp_rv_lock); - - /* Use the rendezvous data structures for this call */ - mp_rv_action_func = action_func; - mp_rv_func_arg = arg; - mp_rv_ncpus = 0; - mp_rv_complete = 0; - simple_lock(&x86_topo_lock); + /* + * Queue the call for each non-local requested cpu. + * The topo lock is not taken. Instead we sniff the cpu_running state + * and then re-check it after taking the call lock. A cpu being taken + * offline runs the action function after clearing the cpu_running. + */ for (cpu = 0; cpu < (cpu_t) real_ncpus; cpu++) { if (((cpu_to_cpumask(cpu) & cpus) == 0) || !cpu_datap(cpu)->cpu_running) @@ -982,61 +1154,92 @@ mp_cpus_call( * we defer our call until we have signalled all others. */ call_self = TRUE; + cpus_called |= cpu_to_cpumask(cpu); if (mode == SYNC && action_func != NULL) { - (void) ml_set_interrupts_enabled(FALSE); - action_func(arg); - ml_set_interrupts_enabled(intrs_enabled); + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPUS_CALL_LOCAL, + action_func, arg0, arg1, 0, 0); + action_func(arg0, arg1); } } else { /* - * Bump count of other cpus called and signal this cpu. - * Note: we signal asynchronously regardless of mode - * because we wait on mp_rv_complete either here - * (if mode == SYNC) or later (if mode == ASYNC). - * While spinning, poll for TLB flushes if interrupts - * are disabled. + * Here to queue a call to cpu and IPI. + * Spinning for request buffer unless NOSYNC. 
*/ - mp_rv_ncpus++; - i386_signal_cpu(cpu, MP_CALL, ASYNC); - if (mode == SYNC) { - simple_unlock(&x86_topo_lock); - while (mp_rv_complete < mp_rv_ncpus) { - if (!intrs_enabled) + queue_call: + intrs_enabled = mp_call_lock(); + if (!cpu_datap(cpu)->cpu_running) { + mp_call_unlock(intrs_enabled); + continue; + } + if (mode == NOSYNC) { + if (!mp_call_queue(cpu, action_func, arg0, arg1, + NULL)) { + cpus_notcalled |= cpu_to_cpumask(cpu); + mp_call_unlock(intrs_enabled); + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPUS_CALL_NOBUF, + cpu, 0, 0, 0, 0); + continue; + } + } else { + if (!mp_call_queue(cpu, action_func, arg0, arg1, + &mp_cpus_calls)) { + mp_call_unlock(intrs_enabled); + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPUS_CALL_NOBUF, + cpu, 0, 0, 0, 0); + if (!intrs_enabled) { + mp_cpus_call_action(); handle_pending_TLB_flushes(); + } cpu_pause(); + goto queue_call; } - simple_lock(&x86_topo_lock); + } + mp_cpus_signals++; + cpus_called |= cpu_to_cpumask(cpu); + i386_signal_cpu(cpu, MP_CALL, ASYNC); + mp_call_unlock(intrs_enabled); + if (mode == SYNC) { + mp_cpus_call_wait(intrs_enabled, mp_cpus_signals, &mp_cpus_calls); } } } - simple_unlock(&x86_topo_lock); - /* - * If calls are being made asynchronously, - * make the local call now if needed, and then - * wait for all other cpus to finish their calls. - */ - if (mode == ASYNC) { - if (call_self && action_func != NULL) { - (void) ml_set_interrupts_enabled(FALSE); - action_func(arg); + /* Call locally if mode not SYNC */ + if (mode != SYNC && call_self ) { + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPUS_CALL_LOCAL, + action_func, arg0, arg1, 0, 0); + if (action_func != NULL) { + ml_set_interrupts_enabled(FALSE); + action_func(arg0, arg1); ml_set_interrupts_enabled(intrs_enabled); } - while (mp_rv_complete < mp_rv_ncpus) { - if (!intrs_enabled) - handle_pending_TLB_flushes(); - cpu_pause(); - } } - - /* Determine the number of cpus called */ - cpu = mp_rv_ncpus + (call_self ? 
1 : 0); - simple_unlock(&mp_rv_lock); + /* For ASYNC, now wait for all signaled cpus to complete their calls */ + if (mode == ASYNC) { + mp_cpus_call_wait(intrs_enabled, mp_cpus_signals, &mp_cpus_calls); + } + +out: + cpu = (cpu_t) mp_cpus_signals + (call_self ? 1 : 0); + + if (cpus_calledp) + *cpus_calledp = cpus_called; + if (cpus_notcalledp) + *cpus_notcalledp = cpus_notcalled; + + KERNEL_DEBUG_CONSTANT( + TRACE_MP_CPUS_CALL | DBG_FUNC_END, + cpu, cpus_called, cpus_notcalled, 0, 0); return cpu; } + static void mp_broadcast_action(void) { @@ -1156,7 +1359,7 @@ void mp_kdp_enter(void) { unsigned int cpu; - unsigned int ncpus; + unsigned int ncpus = 0; unsigned int my_cpu; uint64_t tsc_timeout; @@ -1170,7 +1373,6 @@ mp_kdp_enter(void) mp_kdp_state = ml_set_interrupts_enabled(FALSE); my_cpu = cpu_number(); cpu_datap(my_cpu)->debugger_entry_time = mach_absolute_time(); - simple_lock(&mp_kdp_lock); if (pmsafe_debug && !kdp_snapshot) @@ -1184,7 +1386,6 @@ mp_kdp_enter(void) #endif simple_lock(&mp_kdp_lock); } - my_cpu = cpu_number(); debugger_cpu = my_cpu; ncpus = 1; mp_kdp_ncpus = 1; /* self */ @@ -1246,7 +1447,7 @@ mp_kdp_enter(void) } DBG("mp_kdp_enter() %lu processors done %s\n", - mp_kdp_ncpus, (mp_kdp_ncpus == ncpus) ? "OK" : "timed out"); + (int)mp_kdp_ncpus, (mp_kdp_ncpus == ncpus) ? 
"OK" : "timed out"); postcode(MP_KDP_ENTER); } @@ -1353,6 +1554,8 @@ mp_kdp_exit(void) if (pmsafe_debug && !kdp_snapshot) pmSafeMode(¤t_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL); + debugger_exit_time = mach_absolute_time(); + DBG("mp_kdp_exit() done\n"); (void) ml_set_interrupts_enabled(mp_kdp_state); postcode(0); @@ -1381,6 +1584,7 @@ cause_ast_check( if (cpu != cpu_number()) { i386_signal_cpu(cpu, MP_AST, ASYNC); + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), cpu, 1, 0, 0, 0); } } @@ -1419,7 +1623,7 @@ remote_kdb(void) cpu_pause(); } - DBG("mp_kdp_enter() %d processors done %s\n", + DBG("mp_kdp_enter() %lu processors done %s\n", mp_kdb_ncpus, (mp_kdb_ncpus == kdb_ncpus) ? "OK" : "timed out"); } @@ -1495,8 +1699,8 @@ slave_machine_init(void *param) * Cold start */ clock_init(); - cpu_machine_init(); /* Interrupts enabled hereafter */ + mp_cpus_call_cpu_init(); } } @@ -1554,3 +1758,117 @@ db_trap_hist(void) #endif /* TRAP_DEBUG */ #endif /* MACH_KDB */ +static void +cpu_prewarm_init() +{ + int i; + + simple_lock_init(&cpu_warm_lock, 0); + queue_init(&cpu_warm_call_list); + for (i = 0; i < NUM_CPU_WARM_CALLS; i++) { + enqueue_head(&cpu_warm_call_list, (queue_entry_t)&cpu_warm_call_arr[i]); + } +} + +static timer_call_t +grab_warm_timer_call() +{ + spl_t x; + timer_call_t call = NULL; + + x = splsched(); + simple_lock(&cpu_warm_lock); + if (!queue_empty(&cpu_warm_call_list)) { + call = (timer_call_t) dequeue_head(&cpu_warm_call_list); + } + simple_unlock(&cpu_warm_lock); + splx(x); + + return call; +} + +static void +free_warm_timer_call(timer_call_t call) +{ + spl_t x; + + x = splsched(); + simple_lock(&cpu_warm_lock); + enqueue_head(&cpu_warm_call_list, (queue_entry_t)call); + simple_unlock(&cpu_warm_lock); + splx(x); +} + +/* + * Runs in timer call context (interrupts disabled). 
+ */ +static void +cpu_warm_timer_call_func( + call_entry_param_t p0, + __unused call_entry_param_t p1) +{ + free_warm_timer_call((timer_call_t)p0); + return; +} + +/* + * Runs with interrupts disabled on the CPU we wish to warm (i.e. CPU 0). + */ +static void +_cpu_warm_setup( + void *arg) +{ + cpu_warm_data_t cwdp = (cpu_warm_data_t)arg; + + timer_call_enter(cwdp->cwd_call, cwdp->cwd_deadline, TIMER_CALL_CRITICAL | TIMER_CALL_LOCAL); + cwdp->cwd_result = 0; + + return; +} + +/* + * Not safe to call with interrupts disabled. + */ +kern_return_t +ml_interrupt_prewarm( + uint64_t deadline) +{ + struct cpu_warm_data cwd; + timer_call_t call; + cpu_t ct; + + if (ml_get_interrupts_enabled() == FALSE) { + panic("%s: Interrupts disabled?\n", __FUNCTION__); + } + + /* + * If the platform doesn't need our help, say that we succeeded. + */ + if (!ml_get_interrupt_prewake_applicable()) { + return KERN_SUCCESS; + } + + /* + * Grab a timer call to use. + */ + call = grab_warm_timer_call(); + if (call == NULL) { + return KERN_RESOURCE_SHORTAGE; + } + + timer_call_setup(call, cpu_warm_timer_call_func, call); + cwd.cwd_call = call; + cwd.cwd_deadline = deadline; + cwd.cwd_result = 0; + + /* + * For now, non-local interrupts happen on the master processor. 
+ */ + ct = mp_cpus_call(cpu_to_cpumask(master_cpu), SYNC, _cpu_warm_setup, &cwd); + if (ct == 0) { + free_warm_timer_call(call); + return KERN_FAILURE; + } else { + return cwd.cwd_result; + } +} diff --git a/osfmk/i386/mp.h b/osfmk/i386/mp.h index 8a2abbd0a..6974ef256 100644 --- a/osfmk/i386/mp.h +++ b/osfmk/i386/mp.h @@ -106,7 +106,7 @@ extern volatile boolean_t mp_kdp_trap; extern volatile boolean_t force_immediate_debugger_NMI; extern volatile boolean_t pmap_tlb_flush_timeout; extern volatile usimple_lock_t spinlock_timed_out; -extern volatile uint32_t spinlock_owner_cpu; +extern volatile uint32_t spinlock_owner_cpu; extern uint64_t LastDebuggerEntryAllowance; @@ -163,10 +163,12 @@ cpu_to_cpumask(cpu_t cpu) * Invoke a function (possibly NULL) on a set of cpus specified by a mask. * The mask may include the local cpu. * If the mode is: - * - ASYNC: other cpus make their calls in parallel. - * - SYNC: the calls are performed serially in logical cpu order. - * This call returns when the function has been run on all specified cpus. - * The return value is the number of cpus on which the call was made. + * - ASYNC: other cpus make their calls in parallel + * - SYNC: the calls are performed serially in logical cpu order + * - NOSYNC: the calls are queued + * Unless the mode is NOSYNC, mp_cpus_call() returns when the function has been + * called on all specified cpus. + * The return value is the number of cpus where the call was made or queued. * The action function is called with interrupts disabled. 
*/ extern cpu_t mp_cpus_call( @@ -174,6 +176,14 @@ extern cpu_t mp_cpus_call( mp_sync_t mode, void (*action_func)(void *), void *arg); +extern cpu_t mp_cpus_call1( + cpumask_t cpus, + mp_sync_t mode, + void (*action_func)(void *, void*), + void *arg0, + void *arg1, + cpumask_t *cpus_calledp, + cpumask_t *cpus_notcalledp); /* * Power-management-specific SPI to: @@ -183,7 +193,6 @@ extern cpu_t mp_cpus_call( extern void PM_interrupt_register(void (*fn)(void)); extern void cpu_PM_interrupt(int cpu); - __END_DECLS #if MP_DEBUG @@ -249,38 +258,14 @@ extern cpu_signal_event_log_t *cpu_handle[]; #ifdef ASSEMBLER #define i_bit(bit, word) ((long)(*(word)) & (1L << (bit))) #else -// Workaround for 6640051 -static inline long +__attribute__((always_inline)) static inline long i_bit_impl(long word, long bit) { - return word & 1L << bit; + long bitmask = 1L << bit; + return word & bitmask; } #define i_bit(bit, word) i_bit_impl((long)(*(word)), bit) #endif - -/* - * Device driver synchronization. - * - * at386_io_lock(op) and at386_io_unlock() are called - * by device drivers when accessing H/W. The underlying - * Processing is machine dependant. 
But the op argument - * to the at386_io_lock is generic - */ - -#define MP_DEV_OP_MAX 4 -#define MP_DEV_WAIT MP_DEV_OP_MAX /* Wait for the lock */ - -/* - * If the caller specifies an op value different than MP_DEV_WAIT, the - * at386_io_lock function must return true if lock was successful else - * false - */ - -#define MP_DEV_OP_START 0 /* If lock busy, register a pending start op */ -#define MP_DEV_OP_INTR 1 /* If lock busy, register a pending intr */ -#define MP_DEV_OP_TIMEO 2 /* If lock busy, register a pending timeout */ -#define MP_DEV_OP_CALLB 3 /* If lock busy, register a pending callback */ - #if MACH_RT #if defined(__i386__) diff --git a/osfmk/i386/mp_desc.c b/osfmk/i386/mp_desc.c index 084038e4b..2421dc734 100644 --- a/osfmk/i386/mp_desc.c +++ b/osfmk/i386/mp_desc.c @@ -81,7 +81,6 @@ #include - #ifdef __x86_64__ #define K_INTR_GATE (ACC_P|ACC_PL_K|ACC_INTR_GATE) #define U_INTR_GATE (ACC_P|ACC_PL_U|ACC_INTR_GATE) @@ -108,7 +107,7 @@ #undef USER_TRAP_SPC #define TRAP(n, name) \ - [n] { \ + [n] = { \ (uintptr_t)&name, \ KERNEL64_CS, \ 0, \ @@ -120,7 +119,7 @@ #define TRAP_SPC TRAP #define TRAP_IST(n, name) \ - [n] { \ + [n] = { \ (uintptr_t)&name, \ KERNEL64_CS, \ 1, \ @@ -129,7 +128,7 @@ }, #define INTERRUPT(n) \ - [n] { \ + [n] = { \ (uintptr_t)&_intr_ ## n,\ KERNEL64_CS, \ 0, \ @@ -138,7 +137,7 @@ }, #define USER_TRAP(n, name) \ - [n] { \ + [n] = { \ (uintptr_t)&name, \ KERNEL64_CS, \ 0, \ @@ -174,7 +173,7 @@ extern uint32_t low_eintstack[]; /* top */ */ cpu_data_t cpu_data_master = { .cpu_this = &cpu_data_master, - .cpu_nanotime = &rtc_nanotime_info, + .cpu_nanotime = &pal_rtc_nanotime_info, .cpu_int_stack_top = (vm_offset_t) low_eintstack, #ifdef __i386__ .cpu_is64bit = FALSE, @@ -182,7 +181,7 @@ cpu_data_t cpu_data_master = { .cpu_is64bit = TRUE #endif }; -cpu_data_t *cpu_data_ptr[MAX_CPUS] = { [0] &cpu_data_master }; +cpu_data_t *cpu_data_ptr[MAX_CPUS] = { [0] = &cpu_data_master }; decl_simple_lock_data(,ncpus_lock); /* protects real_ncpus */ 
unsigned int real_ncpus = 1; @@ -383,21 +382,21 @@ fix_desc64(void *descp, int count) case ACC_CALL_GATE: case ACC_INTR_GATE: case ACC_TRAP_GATE: - real.gate.offset_low16 = fakep->offset64 & 0xFFFF; + real.gate.offset_low16 = (uint16_t)(fakep->offset64 & 0xFFFF); real.gate.selector16 = fakep->lim_or_seg & 0xFFFF; real.gate.IST = fakep->size_or_IST & 0x7; real.gate.access8 = fakep->access; - real.gate.offset_high16 = (fakep->offset64>>16)&0xFFFF; + real.gate.offset_high16 = (uint16_t)((fakep->offset64>>16) & 0xFFFF); real.gate.offset_top32 = (uint32_t)(fakep->offset64>>32); break; default: /* Otherwise */ real.desc.limit_low16 = fakep->lim_or_seg & 0xFFFF; - real.desc.base_low16 = fakep->offset64 & 0xFFFF; - real.desc.base_med8 = (fakep->offset64 >> 16) & 0xFF; + real.desc.base_low16 = (uint16_t)(fakep->offset64 & 0xFFFF); + real.desc.base_med8 = (uint8_t)((fakep->offset64 >> 16) & 0xFF); real.desc.access8 = fakep->access; real.desc.limit_high4 = (fakep->lim_or_seg >> 16) & 0xFF; real.desc.granularity4 = fakep->size_or_IST; - real.desc.base_high8 = (fakep->offset64 >> 24) & 0xFF; + real.desc.base_high8 = (uint8_t)((fakep->offset64 >> 24) & 0xFF); real.desc.base_top32 = (uint32_t)(fakep->offset64>>32); } @@ -536,13 +535,13 @@ cpu_desc_init(cpu_data_t *cdp) cdt->gdt[sel_idx(CPU_DATA_GS)].offset = (vm_offset_t) cdp; fix_desc(&cdt->gdt[sel_idx(CPU_DATA_GS)], 1); -#if MACH_KDB +#if MACH_KDB /* this only works for legacy 32-bit machines */ cdt->gdt[sel_idx(DEBUG_TSS)] = tss_desc_pattern; cdt->gdt[sel_idx(DEBUG_TSS)].offset = (vm_offset_t) cdi->cdi_dbtss; fix_desc(&cdt->gdt[sel_idx(DEBUG_TSS)], 1); cdt->dbtss.esp0 = (int)(db_task_stack_store + - (INTSTACK_SIZE * (cdp->cpu_number)) - sizeof (natural_t)); + (INTSTACK_SIZE * (cdp->cpu_number + 1)) - sizeof (natural_t)); cdt->dbtss.esp = cdt->dbtss.esp0; cdt->dbtss.eip = (int)&db_task_start; #endif /* MACH_KDB */ @@ -635,7 +634,8 @@ cpu_desc_init64(cpu_data_t *cdp) kernel_tss_desc64; fix_desc64(&cdt->gdt[sel_idx(KERNEL_TSS)], 
1); - /* Set double-fault stack as IST1 */ + /* Set (zeroed) double-fault stack as IST1 */ + bzero((void *) cdt->dfstk, sizeof(cdt->dfstk)); cdt->ktss.ist1 = UBER64((unsigned long)cdt->dfstk + sizeof(cdt->dfstk)); #ifdef __i386__ cdt->gdt[sel_idx(CPU_DATA_GS)] = cpudata_desc_pattern; @@ -775,6 +775,7 @@ fast_syscall_init64(__unused cpu_data_t *cdp) #endif } + cpu_data_t * cpu_data_alloc(boolean_t is_boot_cpu) { @@ -790,8 +791,6 @@ cpu_data_alloc(boolean_t is_boot_cpu) #if NCOPY_WINDOWS > 0 cdp->cpu_pmap = pmap_cpu_alloc(TRUE); #endif - queue_init(&cdp->rtclock_timer.queue); - cdp->rtclock_timer.deadline = EndOfAllTime; } return cdp; } @@ -823,7 +822,6 @@ cpu_data_alloc(boolean_t is_boot_cpu) bzero((void*) cdp->cpu_int_stack_top, INTSTACK_SIZE); cdp->cpu_int_stack_top += INTSTACK_SIZE; - /* * Allocate descriptor table: * Size depends on cpu mode. @@ -860,9 +858,7 @@ cpu_data_alloc(boolean_t is_boot_cpu) real_ncpus++; simple_unlock(&ncpus_lock); - cdp->cpu_nanotime = &rtc_nanotime_info; - queue_init(&cdp->rtclock_timer.queue); - cdp->rtclock_timer.deadline = EndOfAllTime; + cdp->cpu_nanotime = &pal_rtc_nanotime_info; kprintf("cpu_data_alloc(%d) %p desc_table: %p " "ldt: %p " @@ -885,6 +881,64 @@ cpu_data_alloc(boolean_t is_boot_cpu) return NULL; } +boolean_t +valid_user_data_selector(uint16_t selector) +{ + sel_t sel = selector_to_sel(selector); + + if (selector == 0) + return (TRUE); + + if (sel.ti == SEL_LDT) + return (TRUE); + else if (sel.index < GDTSZ) { + if ((gdt_desc_p(selector)->access & ACC_PL_U) == ACC_PL_U) + return (TRUE); + } + + return (FALSE); +} + +boolean_t +valid_user_code_selector(uint16_t selector) +{ + sel_t sel = selector_to_sel(selector); + + if (selector == 0) + return (FALSE); + + if (sel.ti == SEL_LDT) { + if (sel.rpl == USER_PRIV) + return (TRUE); + } + else if (sel.index < GDTSZ && sel.rpl == USER_PRIV) { + if ((gdt_desc_p(selector)->access & ACC_PL_U) == ACC_PL_U) + return (TRUE); + } + + return (FALSE); +} + +boolean_t 
+valid_user_stack_selector(uint16_t selector) +{ + sel_t sel = selector_to_sel(selector); + + if (selector == 0) + return (FALSE); + + if (sel.ti == SEL_LDT) { + if (sel.rpl == USER_PRIV) + return (TRUE); + } + else if (sel.index < GDTSZ && sel.rpl == USER_PRIV) { + if ((gdt_desc_p(selector)->access & ACC_PL_U) == ACC_PL_U) + return (TRUE); + } + + return (FALSE); +} + boolean_t valid_user_segment_selectors(uint16_t cs, uint16_t ss, @@ -901,7 +955,6 @@ valid_user_segment_selectors(uint16_t cs, valid_user_data_selector(gs); } - #if NCOPY_WINDOWS > 0 static vm_offset_t user_window_base = 0; @@ -954,6 +1007,10 @@ cpu_userwindow_init(int cpu) user_window = user_window_base + (cpu * NCOPY_WINDOWS * NBPDE); cdp->cpu_copywindow_base = user_window; + /* + * Abuse this pdp entry, the pdp now actually points to + * an array of copy windows addresses. + */ cdp->cpu_copywindow_pdp = pmap_pde(kernel_pmap, user_window); #ifdef __i386__ @@ -1005,7 +1062,7 @@ void cpu_mode_init(cpu_data_t *cdp) { #ifdef __i386__ - if (cpu_mode_is64bit()) { + if (cdp->cpu_is64bit) { cpu_IA32e_enable(cdp); cpu_desc_load64(cdp); fast_syscall_init64(cdp); diff --git a/osfmk/i386/mp_desc.h b/osfmk/i386/mp_desc.h index 14d186eb1..97b04c9cb 100644 --- a/osfmk/i386/mp_desc.h +++ b/osfmk/i386/mp_desc.h @@ -121,63 +121,14 @@ extern void cpu_desc_init64(cpu_data_t *cdp); extern void cpu_desc_load(cpu_data_t *cdp); extern void cpu_desc_load64(cpu_data_t *cdp); -static inline boolean_t -valid_user_data_selector(uint16_t selector) -{ - sel_t sel = selector_to_sel(selector); - - if (selector == 0) - return (TRUE); - - if (sel.ti == SEL_LDT) - return (TRUE); - else if (sel.index < GDTSZ) { - if ((gdt_desc_p(selector)->access & ACC_PL_U) == ACC_PL_U) - return (TRUE); - } - - return (FALSE); -} - -static inline boolean_t -valid_user_code_selector(uint16_t selector) -{ - sel_t sel = selector_to_sel(selector); - - if (selector == 0) - return (FALSE); - - if (sel.ti == SEL_LDT) { - if (sel.rpl == USER_PRIV) - return 
(TRUE); - } - else if (sel.index < GDTSZ && sel.rpl == USER_PRIV) { - if ((gdt_desc_p(selector)->access & ACC_PL_U) == ACC_PL_U) - return (TRUE); - } - - return (FALSE); -} - -static inline boolean_t -valid_user_stack_selector(uint16_t selector) -{ - sel_t sel = selector_to_sel(selector); - - if (selector == 0) - return (FALSE); - - if (sel.ti == SEL_LDT) { - if (sel.rpl == USER_PRIV) - return (TRUE); - } - else if (sel.index < GDTSZ && sel.rpl == USER_PRIV) { - if ((gdt_desc_p(selector)->access & ACC_PL_U) == ACC_PL_U) - return (TRUE); - } - - return (FALSE); -} +extern boolean_t +valid_user_data_selector(uint16_t selector); + +extern boolean_t +valid_user_code_selector(uint16_t selector); + +extern boolean_t +valid_user_stack_selector(uint16_t selector); extern boolean_t valid_user_segment_selectors(uint16_t cs, diff --git a/osfmk/i386/mp_events.h b/osfmk/i386/mp_events.h index e870b0d03..32fde7cc5 100644 --- a/osfmk/i386/mp_events.h +++ b/osfmk/i386/mp_events.h @@ -65,7 +65,7 @@ const char *mp_event_name[] = { \ "MP_LAST" \ } -typedef enum { SYNC, ASYNC } mp_sync_t; +typedef enum { SYNC, ASYNC, NOSYNC } mp_sync_t; __BEGIN_DECLS diff --git a/osfmk/i386/mp_native.c b/osfmk/i386/mp_native.c new file mode 100644 index 000000000..73d3b1ca0 --- /dev/null +++ b/osfmk/i386/mp_native.c @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include /* install_real_mode_bootstrap */ +#include +#include /* lapic_* functions */ +#include +#include +#include + +/* PAL-related routines */ +void i386_cpu_IPI(int cpu); +boolean_t i386_smp_init(int nmi_vector, i386_intr_func_t nmi_handler, + int ipi_vector, i386_intr_func_t ipi_handler); +void i386_start_cpu(int lapic_id, int cpu_num); +void i386_send_NMI(int cpu); +void handle_pending_TLB_flushes(void); + +extern void slave_pstart(void); + +#ifdef MP_DEBUG +int trappedalready = 0; /* (BRINGUP) */ +#endif /* MP_DEBUG */ + +boolean_t +i386_smp_init(int nmi_vector, i386_intr_func_t nmi_handler, int ipi_vector, i386_intr_func_t ipi_handler) +{ + /* Local APIC? 
*/ + if (!lapic_probe()) + return FALSE; + + lapic_init(); + lapic_configure(); + lapic_set_intr_func(nmi_vector, nmi_handler); + lapic_set_intr_func(ipi_vector, ipi_handler); + + install_real_mode_bootstrap(slave_pstart); + + return TRUE; +} + +void +i386_start_cpu(int lapic_id, __unused int cpu_num ) +{ + LAPIC_WRITE(ICRD, lapic_id << LAPIC_ICRD_DEST_SHIFT); + LAPIC_WRITE(ICR, LAPIC_ICR_DM_INIT); + delay(100); + + LAPIC_WRITE(ICRD, lapic_id << LAPIC_ICRD_DEST_SHIFT); + LAPIC_WRITE(ICR, LAPIC_ICR_DM_STARTUP|(REAL_MODE_BOOTSTRAP_OFFSET>>12)); +} + +void +i386_send_NMI(int cpu) +{ + boolean_t state = ml_set_interrupts_enabled(FALSE); + /* Program the interrupt command register */ + LAPIC_WRITE(ICRD, cpu_to_lapic[cpu] << LAPIC_ICRD_DEST_SHIFT); + /* The vector is ignored in this case--the target CPU will enter on the + * NMI vector. + */ + LAPIC_WRITE(ICR, LAPIC_VECTOR(INTERPROCESSOR)|LAPIC_ICR_DM_NMI); + (void) ml_set_interrupts_enabled(state); +} + +void +handle_pending_TLB_flushes(void) +{ + volatile int *my_word = ¤t_cpu_datap()->cpu_signals; + + if (i_bit(MP_TLB_FLUSH, my_word) && (pmap_tlb_flush_timeout == FALSE)) { + DBGLOG(cpu_handle, cpu_number(), MP_TLB_FLUSH); + i_bit_clear(MP_TLB_FLUSH, my_word); + pmap_update_interrupt(); + } +} + +void +i386_cpu_IPI(int cpu) +{ +#ifdef MP_DEBUG + if(cpu_datap(cpu)->cpu_signals & 6) { /* (BRINGUP) */ + kprintf("i386_cpu_IPI: sending enter debugger signal (%08X) to cpu %d\n", cpu_datap(cpu)->cpu_signals, cpu); + } +#endif /* MP_DEBUG */ + +#if MACH_KDB +#ifdef MP_DEBUG + if(!trappedalready && (cpu_datap(cpu)->cpu_signals & 6)) { /* (BRINGUP) */ + if(kdb_cpu != cpu_number()) { + trappedalready = 1; + panic("i386_cpu_IPI: sending enter debugger signal (%08X) to cpu %d and I do not own debugger, owner = %08X\n", + cpu_datap(cpu)->cpu_signals, cpu, kdb_cpu); + } + } +#endif /* MP_DEBUG */ +#endif + + lapic_send_ipi(cpu, LAPIC_VECTOR(INTERPROCESSOR)); +} diff --git a/osfmk/i386/mtrr.c b/osfmk/i386/mtrr.c index 
9129f40c6..63a19c6a2 100644 --- a/osfmk/i386/mtrr.c +++ b/osfmk/i386/mtrr.c @@ -333,7 +333,7 @@ mtrr_update_action(void * cache_control_type) set_cr4(cr4 & ~CR4_PGE); /* flush TLBs */ - flush_tlb(); + flush_tlb_raw(); if (CACHE_CONTROL_PAT == cache_control_type) { /* Change PA6 attribute field to WC */ @@ -365,7 +365,7 @@ mtrr_update_action(void * cache_control_type) /* flush all caches and TLBs a second time */ wbinvd(); - flush_tlb(); + flush_tlb_raw(); /* restore normal cache mode */ set_cr0(cr0); @@ -486,7 +486,6 @@ mtrr_range_add(addr64_t address, uint64_t length, uint32_t type) return KERN_NOT_SUPPORTED; } - /* check memory type (GPF exception for undefined types) */ if ((type != MTRR_TYPE_UNCACHEABLE) && (type != MTRR_TYPE_WRITECOMBINE) && diff --git a/osfmk/i386/pal_hibernate.h b/osfmk/i386/pal_hibernate.h new file mode 100644 index 000000000..025e56ea4 --- /dev/null +++ b/osfmk/i386/pal_hibernate.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef _I386_PAL_HIBERNATE_H +#define _I386_PAL_HIBERNATE_H + +#define HIB_PTES (4*GB - 1*I386_LPGBYTES) /*4GB - 2m */ +#define DEST_COPY_AREA (HIB_PTES + 1*I386_PGBYTES) +#define SRC_COPY_AREA (HIB_PTES + 2*I386_PGBYTES) +#define COPY_PAGE_AREA (HIB_PTES + 3*I386_PGBYTES) + +#define HIB_BASE sectINITPTB +#define HIB_ENTRYPOINT acpi_wake_prot_entry + +void pal_hib_window_setup(ppnum_t page); +uintptr_t pal_hib_map(uintptr_t v, uint64_t p); +void hibernateRestorePALState(uint32_t *src); +void pal_hib_patchup(void); +#define PAL_HIBERNATE_MAGIC_1 0xfeedfacedeadbeef +#define PAL_HIBERNATE_MAGIC_2 0x41b312133714 +#endif /* _I386_PAL_HIBERNATE_H */ diff --git a/osfmk/ppc/cpu_number.h b/osfmk/i386/pal_lock_asm.h similarity index 82% rename from osfmk/ppc/cpu_number.h rename to osfmk/i386/pal_lock_asm.h index cd38aa6d6..a7baad7f9 100644 --- a/osfmk/ppc/cpu_number.h +++ b/osfmk/i386/pal_lock_asm.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2009 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,16 +25,15 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - * @OSF_COPYRIGHT@ - */ -#ifdef KERNEL_PRIVATE +#ifndef _I386_PAL_LOCK_ASM_H +#define _I386_PAL_LOCK_ASM_H -#ifndef _PPC_CPU_NUMBER_H_ -#define _PPC_CPU_NUMBER_H_ +#ifdef XNU_KERNEL_PRIVATE -extern int cpu_number(void); +#define PUSHF pushf +#define POPF popf +#define CLI cli -#endif /* _PPC_CPU_NUMBER_H_ */ +#endif /* XNU_KERNEL_PRIVATE */ -#endif /* KERNEL_PRIVATE */ +#endif /* _I386_PAL_LOCK_ASM_H */ diff --git a/osfmk/i386/pal_native.h b/osfmk/i386/pal_native.h new file mode 100644 index 000000000..13cbf69fb --- /dev/null +++ b/osfmk/i386/pal_native.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef _I386_PAL_I386_H +#define _I386_PAL_I386_H + +#ifdef XNU_KERNEL_PRIVATE + +/* No-op on bare-metal */ +#define pal_dbg_page_fault(x, y, z) +#define pal_dbg_set_task_name( x ) +#define pal_set_signal_delivery( x ) + +#define pal_is_usable_memory(b, t) (TRUE) + +#define pal_hlt() __asm__ volatile ("sti; hlt") +#define pal_sti() __asm__ volatile ("sti") +#define pal_cli() __asm__ volatile ("cli") + +static inline +void pal_stop_cpu(boolean_t cli) +{ + if( cli ) + __asm__ volatile ( "cli" ); + __asm__ volatile ( "wbinvd; hlt" ); +} + +#define pal_register_cache_state(t, v) + +#define pal_execve_return(t) +#define pal_thread_terminate_self(t) +#define pal_ast_check(t) +#define pal_switch_pmap(t,u,v) + +#define panic_display_pal_info() do { } while(0) +#define pal_kernel_announce() do { } while(0) + +#define PAL_AICPM_PROPERTY_VALUE 0 + +#define pal_pmc_swi() __asm__ __volatile__("int %0"::"i"(LAPIC_PMC_SWI_VECTOR):"memory") + +/* Macro used by non-native xnus for access to low globals when it may + * have moved. + */ +#define PAL_KDP_ADDR(x) (x) + +struct pal_rtc_nanotime { + volatile uint64_t tsc_base; /* timestamp */ + volatile uint64_t ns_base; /* nanoseconds */ + uint32_t scale; /* tsc -> nanosec multiplier */ + uint32_t shift; /* tsc -> nanosec shift/div */ + /* shift is overloaded with + * lower 32bits of tsc_freq + * on slower machines (SLOW_TSC_THRESHOLD) */ + volatile uint32_t generation; /* 0 == being updated */ + uint32_t spare1; +}; + + +#ifdef MACH_KERNEL_PRIVATE + +struct pal_cpu_data { + +}; + +struct pal_pcb { + +}; + +struct pal_apic_table { + +}; + +#endif /* MACH_KERNEL_PRIVATE */ + +#endif /* XNU_KERNEL_PRIVATE */ + +#endif /* _I386_PAL_I386_H */ diff --git a/osfmk/i386/pal_routines.c b/osfmk/i386/pal_routines.c new file mode 100644 index 000000000..34e5bd0a5 --- /dev/null +++ b/osfmk/i386/pal_routines.c @@ -0,0 +1,349 @@ +/* + * Copyright (c) 2009-2010 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * file: pal_routines.c + * Platform Abstraction Layer routines for bare-metal i386 and x86_64 + */ + + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +//#define PAL_DEBUG 1 +#ifdef PAL_DEBUG +#define DBG(x...) kprintf("PAL_DBG: " x) +#else +#define DBG(x...) 
+#endif /* PAL_DEBUG */ + +extern void *gPEEFIRuntimeServices; +extern void *gPEEFISystemTable; + +/* nanotime conversion information */ +pal_rtc_nanotime_t pal_rtc_nanotime_info = {0,0,0,0,1,0}; + +/* APIC kext may use this to access xnu internal state */ +struct pal_apic_table *apic_table = NULL; + +decl_simple_lock_data(static , pal_efi_lock); +#ifdef __x86_64__ +#define PML4_PROT (INTEL_PTE_VALID | INTEL_PTE_WRITE) +#define INIT_PDPT_BASE (INITPT_SEG_BASE + PAGE_SIZE) +static pml4_entry_t IDPML4[PTE_PER_PAGE] __attribute__ ((aligned (4096))) = { + [0] = (uint64_t)(INIT_PDPT_BASE | PML4_PROT), + [KERNEL_PML4_INDEX] = (uint64_t)(INIT_PDPT_BASE | PML4_PROT), +}; +uint64_t pal_efi_saved_cr0; +uint64_t pal_efi_saved_cr3; +#endif + + +/* Serial routines */ +int +pal_serial_init(void) +{ + return serial_init(); +} + +void +pal_serial_putc(char c) +{ + serial_putc(c); +} + +int +pal_serial_getc(void) +{ + return serial_getc(); +} + + +/* Generic routines */ +void +pal_i386_init(void) +{ + simple_lock_init(&pal_efi_lock, 0); +} + +void +pal_get_control_registers( pal_cr_t *cr0, pal_cr_t *cr2, + pal_cr_t *cr3, pal_cr_t *cr4 ) +{ + *cr0 = get_cr0(); + *cr2 = get_cr2(); + *cr3 = get_cr3_raw(); + *cr4 = get_cr4(); +} + + +/* + * define functions below here to ensure we have symbols for these, + * even though they're not used on this platform. 
+ */ +#undef pal_dbg_page_fault +void +pal_dbg_page_fault( thread_t thread __unused, + user_addr_t vaddr __unused, + kern_return_t kr __unused ) +{ +} + +#undef pal_dbg_set_task_name +void +pal_dbg_set_task_name( task_t task __unused ) +{ +} + +#undef pal_set_signal_delivery +void +pal_set_signal_delivery(thread_t thread __unused) +{ +} + +/* EFI thunks */ +extern void +_pal_efi_call_in_64bit_mode_asm(uint64_t func, + struct pal_efi_registers *efi_reg, + void *stack_contents, + size_t stack_contents_size); + +kern_return_t +pal_efi_call_in_64bit_mode(uint64_t func, + struct pal_efi_registers *efi_reg, + void *stack_contents, + size_t stack_contents_size, /* 16-byte multiple */ + uint64_t *efi_status) +{ + DBG("pal_efi_call_in_64bit_mode(0x%016llx, %p, %p, %lu, %p)\n", + func, efi_reg, stack_contents, stack_contents_size, efi_status); + + if (func == 0) { + return KERN_INVALID_ADDRESS; + } + + if ((efi_reg == NULL) + || (stack_contents == NULL) + || (stack_contents_size % 16 != 0)) { + return KERN_INVALID_ARGUMENT; + } + + if (!gPEEFISystemTable || !gPEEFIRuntimeServices) { + return KERN_NOT_SUPPORTED; + } + + _pal_efi_call_in_64bit_mode_asm(func, + efi_reg, + stack_contents, + stack_contents_size); + + *efi_status = efi_reg->rax; + + return KERN_SUCCESS; +} + +extern void +_pal_efi_call_in_32bit_mode_asm(uint32_t func, + struct pal_efi_registers *efi_reg, + void *stack_contents, + size_t stack_contents_size); + +kern_return_t +pal_efi_call_in_32bit_mode(uint32_t func, + struct pal_efi_registers *efi_reg, + void *stack_contents, + size_t stack_contents_size, /* 16-byte multiple */ + uint32_t *efi_status) +{ + DBG("pal_efi_call_in_32bit_mode(0x%08x, %p, %p, %lu, %p)\n", + func, efi_reg, stack_contents, stack_contents_size, efi_status); + + if (func == 0) { + return KERN_INVALID_ADDRESS; + } + + if ((efi_reg == NULL) + || (stack_contents == NULL) + || (stack_contents_size % 16 != 0)) { + return KERN_INVALID_ARGUMENT; + } + + if (!gPEEFISystemTable || 
!gPEEFIRuntimeServices) { + return KERN_NOT_SUPPORTED; + } + + DBG("pal_efi_call_in_32bit_mode() efi_reg:\n"); + DBG(" rcx: 0x%016llx\n", efi_reg->rcx); + DBG(" rdx: 0x%016llx\n", efi_reg->rdx); + DBG(" r8: 0x%016llx\n", efi_reg->r8); + DBG(" r9: 0x%016llx\n", efi_reg->r9); + DBG(" rax: 0x%016llx\n", efi_reg->rax); + + DBG("pal_efi_call_in_32bit_mode() stack:\n"); +#if PAL_DEBUG + size_t i; + for (i = 0; i < stack_contents_size; i += sizeof(uint32_t)) { + uint32_t *p = (uint32_t *) ((uintptr_t)stack_contents + i); + DBG(" %p: 0x%08x\n", p, *p); + } +#endif + +#ifdef __x86_64__ + /* + * Ensure no interruptions. + * Taking a spinlock for serialization is technically unnecessary + * because the EFIRuntime kext should serialize. + */ + boolean_t istate = ml_set_interrupts_enabled(FALSE); + simple_lock(&pal_efi_lock); + + /* + * Switch to special page tables with the entire high kernel space + * double-mapped into the bottom 4GB. + * + * NB: We assume that all data passed exchanged with RuntimeServices is + * located in the 4GB of KVA based at VM_MIN_ADDRESS. In particular, kexts + * loaded the basement (below VM_MIN_ADDRESS) cannot pass static data. + * Kernel stack and heap space is OK. 
+ */ + MARK_CPU_IDLE(cpu_number()); + pal_efi_saved_cr3 = get_cr3_raw(); + pal_efi_saved_cr0 = get_cr0(); + clear_ts(); + set_cr3_raw((uint64_t) ID_MAP_VTOP(IDPML4)); + + swapgs(); /* Save kernel's GS base */ + + /* Set segment state ready for compatibility mode */ + set_gs(NULL_SEG); + set_fs(NULL_SEG); + set_es(KERNEL_DS); + set_ds(KERNEL_DS); + set_ss(KERNEL_DS); + + _pal_efi_call_in_32bit_mode_asm(func, + efi_reg, + stack_contents, + stack_contents_size); + + /* Restore NULL segment state */ + set_ss(NULL_SEG); + set_es(NULL_SEG); + set_ds(NULL_SEG); + + swapgs(); /* Restore kernel's GS base */ + + /* Restore the 64-bit user GS base we just destroyed */ + wrmsr64(MSR_IA32_KERNEL_GS_BASE, + current_cpu_datap()->cpu_uber.cu_user_gs_base); + + /* End of mapping games */ + set_cr3_raw(pal_efi_saved_cr3); + set_cr0(pal_efi_saved_cr0); + MARK_CPU_ACTIVE(cpu_number()); + + simple_unlock(&pal_efi_lock); + ml_set_interrupts_enabled(istate); +#else + _pal_efi_call_in_32bit_mode_asm(func, + efi_reg, + stack_contents, + stack_contents_size); +#endif + + *efi_status = (uint32_t)efi_reg->rax; + DBG("pal_efi_call_in_32bit_mode() efi_status: 0x%x\n", *efi_status); + + return KERN_SUCCESS; +} + +/* wind-back a syscall instruction */ +void +pal_syscall_restart(thread_t thread __unused, x86_saved_state_t *state) +{ + /* work out which flavour thread it is */ + if( is_saved_state32(state) ) + { + x86_saved_state32_t *regs32; + regs32 = saved_state32(state); + + if (regs32->cs == SYSENTER_CS || regs32->cs == SYSENTER_TF_CS) + regs32->eip -= 5; + else + regs32->eip -= 2; + } + else + { + x86_saved_state64_t *regs64; + + assert( is_saved_state64(state) ); + regs64 = saved_state64(state); + + /* Only one instruction for 64-bit threads */ + regs64->isf.rip -= 2; + } + +} + +/* Helper function to put the machine to sleep (or shutdown) */ + +boolean_t +pal_machine_sleep(uint8_t type_a __unused, uint8_t type_b __unused, uint32_t bit_position __unused, + uint32_t disable_mask __unused, 
uint32_t enable_mask __unused) +{ + return 0; +} + + +/* shouldn't be used on native */ +void +pal_get_kern_regs( x86_saved_state_t *state ) +{ + panic( "pal_get_kern_regs called. state %p\n", state ); +} + +void +pal_preemption_assert(void) +{ +} + +void +hibernate_pal_prepare(void) +{ +} diff --git a/osfmk/i386/pal_routines.h b/osfmk/i386/pal_routines.h new file mode 100644 index 000000000..dc59735b9 --- /dev/null +++ b/osfmk/i386/pal_routines.h @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef _I386_PAL_ROUTINES_H +#define _I386_PAL_ROUTINES_H + +#include +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +/* PAL routines exported to kexts */ + +/* + * Load registers with these values. In 32-bit mode, + * only the low-order half is loaded (if applicable) + */ +struct pal_efi_registers { + uint64_t rcx; + uint64_t rdx; + uint64_t r8; + uint64_t r9; + uint64_t rax; +}; + +/* + * Load registers and stack with these values before + * executing "call" instruction + */ +kern_return_t +pal_efi_call_in_64bit_mode(uint64_t func, + struct pal_efi_registers *efi_reg, + void *stack_contents, + size_t stack_contents_size, /* 16-byte multiple */ + uint64_t *efi_status); + +kern_return_t +pal_efi_call_in_32bit_mode(uint32_t func, + struct pal_efi_registers *efi_reg, + void *stack_contents, + size_t stack_contents_size, /* 16-byte multiple */ + uint32_t *efi_status); + +/* Go into ACPI sleep */ + +boolean_t pal_machine_sleep(uint8_t type_a, + uint8_t type_b, + uint32_t bit_position, + uint32_t disable_mask, + uint32_t enable_mask); + +/* xnu internal PAL routines */ +#ifdef XNU_KERNEL_PRIVATE + +/* Define any PAL-specific types for x86 */ +#ifdef __i386__ +typedef uint32_t pal_cr_t; +#else +typedef uint64_t pal_cr_t; +#endif + +struct pal_cpu_data; /* Defined per-platform */ +struct pal_pcb; /* Defined per-platform */ +struct pal_apic_table; /* Defined per-platform */ + +/* For use by APIC kext */ +extern struct pal_apic_table *apic_table; + +/* serial / debug output routines */ +extern int pal_serial_init(void); +extern void pal_serial_putc(char); +extern int pal_serial_getc(void); + +/* Generic I386 PAL functions go here */ +extern void pal_i386_init(void); +extern void pal_set_signal_delivery(thread_t); + +/* Get values for cr0..4 */ +extern void pal_get_control_registers( pal_cr_t *cr0, pal_cr_t *cr2, + pal_cr_t *cr3, pal_cr_t *cr4 ); + +/* Debug hook invoked in the page-fault path */ 
+extern void pal_dbg_page_fault( thread_t thread, user_addr_t vadddr, + kern_return_t kr ); + +/* Set a task's name in the platform kernel debugger */ +extern void pal_dbg_set_task_name( task_t task ); + +/* wind-back to the start of a system call */ +void pal_syscall_restart(thread_t thread, x86_saved_state_t *state); + +/* Hook for non-vfork exec */ +void pal_execve_return(thread_t thread); + +/* Called by thread_terminate_self() */ +void pal_thread_terminate_self(thread_t thread); + +/* Called by ast_check() */ +void pal_ast_check(thread_t thread); + +/* Called by sync_iss_to_iks */ +extern void pal_get_kern_regs( x86_saved_state_t *state ); + +/* Called by load_machfile */ +void pal_switch_pmap(thread_t, pmap_t, boolean_t); + +/* + * Platform-specific hlt/sti. + */ +extern void pal_hlt(void); +extern void pal_sti(void); +extern void pal_cli(void); + +/* + * Mark in-memory thread register cache state validity. + */ +typedef enum { DIRTY, VALID } pal_cache_state_t; +void pal_register_cache_state(thread_t thread, pal_cache_state_t state); + + +/* Catch code running on the except thread that shouldn't be */ +void pal_preemption_assert(void); + +void hibernate_pal_prepare(void); +void pal_efi_hibernate_prepare(void); + +/* Include a PAL-specific header, too, for xnu-internal overrides */ +#include + + +/* Allow for tricky IOKit property matching */ +#define PAL_AICPM_PROPERTY_NAME "intel_cpupm_matching" +static inline void +pal_get_resource_property(const char **property_name, int *property_value) +{ + *property_name = PAL_AICPM_PROPERTY_NAME; + *property_value = PAL_AICPM_PROPERTY_VALUE; +} + +/* assembly function to update TSC / timebase info */ +extern void _pal_rtc_nanotime_store( + uint64_t tsc, + uint64_t nsec, + uint32_t scale, + uint32_t shift, + struct pal_rtc_nanotime *dst); + +/* global nanotime info */ +extern struct pal_rtc_nanotime pal_rtc_nanotime_info; + +#endif /* XNU_KERNEL_PRIVATE */ + +#if defined(__cplusplus) +} +#endif + +#endif /* 
_I386_PAL_ROUTINES_H */ diff --git a/osfmk/i386/pal_routines_asm.s b/osfmk/i386/pal_routines_asm.s new file mode 100644 index 000000000..0c4089af9 --- /dev/null +++ b/osfmk/i386/pal_routines_asm.s @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include + +#include + +/* + * Copy "count" bytes from "src" to %esp, using + * "tmpindex" for a scratch counter and %eax + */ +#define COPY_STACK(src, count, tmpindex) \ + mov $0, tmpindex /* initial scratch counter */ ; \ +1: \ + mov 0(src,tmpindex,1), %eax /* copy one 32-bit word from source... */ ; \ + mov %eax, 0(%esp,tmpindex,1) /* ... 
to stack */ ; \ + add $4, tmpindex /* increment counter */ ; \ + cmp count, tmpindex /* exit if stack has been copied */ ; \ + jne 1b + +/* + void + pal_efi_call_in_64bit_mode_asm(uint64_t func, + struct pal_efi_registers *efi_reg, + void *stack_contents, + size_t stack_contents_size) + + * Switch from compatibility mode to long mode, and + * then execute the function pointer with the specified + * register and stack contents (based at %rsp). Afterwards, + * collect the return value, restore the original state, + * and return. +*/ +ENTRY(_pal_efi_call_in_64bit_mode_asm) + FRAME + + /* save non-volatile registers */ + push %ebx + push %esi + push %edi + + sub $12, %esp /* align to 16-byte boundary */ + mov 16(%ebp), %esi /* load efi_reg into %esi */ + mov 20(%ebp), %edx /* load stack_contents into %edx */ + mov 24(%ebp), %ecx /* load s_c_s into %ecx */ + sub %ecx, %esp /* make room for stack contents */ + + COPY_STACK(%edx, %ecx, %edi) + + ENTER_64BIT_MODE() + + /* load efi_reg into real registers */ + mov 0(%rsi), %rcx + mov 8(%rsi), %rdx + mov 16(%rsi), %r8 + mov 24(%rsi), %r9 + mov 32(%rsi), %rax + + mov 8(%rbp), %rdi /* load func pointer */ + call *%rdi /* call EFI runtime */ + + mov 16(%rbp), %esi /* load efi_reg into %esi */ + mov %rax, 32(%rsi) /* save RAX back */ + + ENTER_COMPAT_MODE() + + add 24(%ebp), %esp /* discard stack contents */ + add $12, %esp /* restore stack pointer */ + + pop %edi + pop %esi + pop %ebx + + EMARF + ret + +/* + void + pal_efi_call_in_32bit_mode_asm(uint32_t func, + struct pal_efi_registers *efi_reg, + void *stack_contents, + size_t stack_contents_size) +*/ +ENTRY(_pal_efi_call_in_32bit_mode_asm) + FRAME + + /* save non-volatile registers */ + push %ebx + push %esi + push %edi + + sub $12, %esp /* align to 16-byte boundary */ + mov 12(%ebp), %esi /* load efi_reg into %esi */ + mov 16(%ebp), %edx /* load stack_contents into %edx */ + mov 20(%ebp), %ecx /* load s_c_s into %ecx */ + sub %ecx, %esp /* make room for stack contents */ + 
+ COPY_STACK(%edx, %ecx, %edi) + + /* load efi_reg into real registers */ + mov 0(%esi), %ecx + mov 8(%esi), %edx + mov 32(%esi), %eax + + mov 8(%ebp), %edi /* load func pointer */ + call *%edi /* call EFI runtime */ + + mov 12(%ebp), %esi /* load efi_reg into %esi */ + mov %eax, 32(%esi) /* save RAX back */ + movl $0, 36(%esi) /* zero out high bits of RAX */ + + add 20(%ebp), %esp /* discard stack contents */ + add $12, %esp /* restore stack pointer */ + + pop %edi + pop %esi + pop %ebx + + EMARF + ret + + +/* void _pal_rtc_nanotime_store(uint64_t tsc, + uint64_t nsec, + uint32_t scale, + uint32_t shift, + struct pal_rtc_nanotime *dst) ; +*/ + +ENTRY(_pal_rtc_nanotime_store) + push %ebp + movl %esp,%ebp + push %esi + + mov 32(%ebp),%edx /* get ptr to rtc_nanotime_info */ + + movl RNT_GENERATION(%edx),%esi /* get current generation */ + movl $0,RNT_GENERATION(%edx) /* flag data as being updated */ + + mov 8(%ebp),%eax + mov %eax,RNT_TSC_BASE(%edx) + mov 12(%ebp),%eax + mov %eax,RNT_TSC_BASE+4(%edx) + + mov 24(%ebp),%eax + mov %eax,RNT_SCALE(%edx) + + mov 28(%ebp),%eax + mov %eax,RNT_SHIFT(%edx) + + mov 16(%ebp),%eax + mov %eax,RNT_NS_BASE(%edx) + mov 20(%ebp),%eax + mov %eax,RNT_NS_BASE+4(%edx) + + incl %esi /* next generation */ + jnz 1f + incl %esi /* skip 0, which is a flag */ +1: movl %esi,RNT_GENERATION(%edx) /* update generation and make usable */ + + pop %esi + pop %ebp + + ret + + diff --git a/osfmk/ppc/mp.h b/osfmk/i386/pal_rtclock_asm.h similarity index 86% rename from osfmk/ppc/mp.h rename to osfmk/i386/pal_rtclock_asm.h index 9b2dde5a2..69070bebb 100644 --- a/osfmk/ppc/mp.h +++ b/osfmk/i386/pal_rtclock_asm.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2009-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,14 +25,9 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - * @OSF_COPYRIGHT@ - */ - -#ifndef _PPC_MP_H_ -#define _PPC_MP_H_ +#ifndef _I386_PAL_RTCLOCK_ASM_H +#define _I386_PAL_RTCLOCK_ASM_H -#include -#include +#include -#endif /* _PPC_MP_H_ */ +#endif /* _I386_PAL_RTCLOCK_ASM_H */ diff --git a/osfmk/i386/pcb.c b/osfmk/i386/pcb.c index 421cc3f53..caf0c68db 100644 --- a/osfmk/i386/pcb.c +++ b/osfmk/i386/pcb.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -87,21 +87,16 @@ #include #include #include -#include -#include #include -#include #include +#include #include #if defined(__i386__) #include #endif -#include #include #include /* LAPIC_PMC_SWI_VECTOR */ -#include - #if CONFIG_COUNTERS #include #endif /* CONFIG_COUNTERS */ @@ -134,9 +129,6 @@ zone_t ids_zone; /* zone for debug_state area */ /* Forward */ -void act_machine_throughcall(thread_t thr_act); -void act_machine_return(int); - extern void Thread_continue(void); extern void Load_context( thread_t thread); @@ -185,7 +177,7 @@ static inline void pmc_swi(thread_t old, thread_t new) { current_cpu_datap()->csw_old_thread = old; current_cpu_datap()->csw_new_thread = new; - __asm__ __volatile__("int %0"::"i"(LAPIC_PMC_SWI_VECTOR):"memory"); + pal_pmc_swi(); } static inline void @@ -366,7 +358,7 @@ set_debug_state32(thread_t thread, x86_debug_state32_t *ds) x86_debug_state32_t *ids; pcb_t pcb; - pcb = thread->machine.pcb; + pcb = THREAD_TO_PCB(thread); ids = pcb->ids; if (debug_state_is_valid32(ds) != TRUE) { @@ -400,7 +392,7 @@ set_debug_state64(thread_t thread, x86_debug_state64_t *ds) x86_debug_state64_t *ids; pcb_t pcb; - pcb = thread->machine.pcb; + pcb = THREAD_TO_PCB(thread); ids = pcb->ids; if (debug_state_is_valid64(ds) != TRUE) { @@ -432,7 +424,7 @@ get_debug_state32(thread_t thread, x86_debug_state32_t *ds) 
{ x86_debug_state32_t *saved_state; - saved_state = thread->machine.pcb->ids; + saved_state = thread->machine.ids; if (saved_state) { copy_debug_state32(saved_state, ds, TRUE); @@ -445,7 +437,7 @@ get_debug_state64(thread_t thread, x86_debug_state64_t *ds) { x86_debug_state64_t *saved_state; - saved_state = (x86_debug_state64_t *)thread->machine.pcb->ids; + saved_state = (x86_debug_state64_t *)thread->machine.ids; if (saved_state) { copy_debug_state64(saved_state, ds, TRUE); @@ -467,318 +459,6 @@ void consider_machine_adjust(void) { } -extern void *get_bsduthreadarg(thread_t th); - -#if defined(__x86_64__) -static void -act_machine_switch_pcb( thread_t new ) -{ - pcb_t pcb = new->machine.pcb; - struct real_descriptor *ldtp; - mach_vm_offset_t pcb_stack_top; - cpu_data_t *cdp = current_cpu_datap(); - - assert(new->kernel_stack != 0); - - if (!cpu_mode_is64bit()) { - panic("K64 is 64bit!"); - } else if (is_saved_state64(pcb->iss)) { - /* - * The test above is performed against the thread save state - * flavor and not task's 64-bit feature flag because of the - * thread/task 64-bit state divergence that can arise in - * task_set_64bit() x86: the task state is changed before - * the individual thread(s). - */ - x86_saved_state64_tagged_t *iss64; - vm_offset_t isf; - - assert(is_saved_state64(pcb->iss)); - - iss64 = (x86_saved_state64_tagged_t *) pcb->iss; - - /* - * Set pointer to PCB's interrupt stack frame in cpu data. - * Used by syscall and double-fault trap handlers. - */ - isf = (vm_offset_t) &iss64->state.isf; - cdp->cpu_uber.cu_isf = isf; - pcb_stack_top = (vm_offset_t) (iss64 + 1); - /* require 16-byte alignment */ - assert((pcb_stack_top & 0xF) == 0); - - /* Interrupt stack is pcb */ - current_ktss64()->rsp0 = pcb_stack_top; - - /* - * Top of temporary sysenter stack points to pcb stack. - * Although this is not normally used by 64-bit users, - * it needs to be set in case a sysenter is attempted. 
- */ - *current_sstk64() = pcb_stack_top; - - cdp->cpu_task_map = new->map->pmap->pm_task_map; - - /* - * Enable the 64-bit user code segment, USER64_CS. - * Disable the 32-bit user code segment, USER_CS. - */ - ldt_desc_p(USER64_CS)->access |= ACC_PL_U; - ldt_desc_p(USER_CS)->access &= ~ACC_PL_U; - - /* - * Switch user's GS base if necessary - * by setting the Kernel GS base MSR - * - this will become the user's on the swapgs when - * returning to user-space. Avoid this for - * kernel threads (no user TLS support required) - * and verify the memory shadow of the segment base - * in the event it was altered in user space. - */ - if ((pcb->cthread_self != 0) || (new->task != kernel_task)) { - if ((cdp->cpu_uber.cu_user_gs_base != pcb->cthread_self) || (pcb->cthread_self != rdmsr64(MSR_IA32_KERNEL_GS_BASE))) { - cdp->cpu_uber.cu_user_gs_base = pcb->cthread_self; - wrmsr64(MSR_IA32_KERNEL_GS_BASE, pcb->cthread_self); - } - } - } else { - x86_saved_state_compat32_t *iss32compat; - vm_offset_t isf; - - assert(is_saved_state32(pcb->iss)); - iss32compat = (x86_saved_state_compat32_t *) pcb->iss; - - pcb_stack_top = (uintptr_t) (iss32compat + 1); - /* require 16-byte alignment */ - assert((pcb_stack_top & 0xF) == 0); - - /* - * Set pointer to PCB's interrupt stack frame in cpu data. - * Used by debug trap handler. - */ - isf = (vm_offset_t) &iss32compat->isf64; - cdp->cpu_uber.cu_isf = isf; - - /* Top of temporary sysenter stack points to pcb stack */ - *current_sstk64() = pcb_stack_top; - - /* Interrupt stack is pcb */ - current_ktss64()->rsp0 = pcb_stack_top; - - cdp->cpu_task_map = TASK_MAP_32BIT; - /* Precalculate pointers to syscall argument store, for use - * in the trampolines. 
- */ - cdp->cpu_uber_arg_store = (vm_offset_t)get_bsduthreadarg(new); - cdp->cpu_uber_arg_store_valid = (vm_offset_t)&pcb->arg_store_valid; - pcb->arg_store_valid = 0; - - /* - * Disable USER64_CS - * Enable USER_CS - */ - ldt_desc_p(USER64_CS)->access &= ~ACC_PL_U; - ldt_desc_p(USER_CS)->access |= ACC_PL_U; - - /* - * Set the thread`s cthread (a.k.a pthread) - * For 32-bit user this involves setting the USER_CTHREAD - * descriptor in the LDT to point to the cthread data. - * The involves copying in the pre-initialized descriptor. - */ - ldtp = (struct real_descriptor *)current_ldt(); - ldtp[sel_idx(USER_CTHREAD)] = pcb->cthread_desc; - if (pcb->uldt_selector != 0) - ldtp[sel_idx(pcb->uldt_selector)] = pcb->uldt_desc; - cdp->cpu_uber.cu_user_gs_base = pcb->cthread_self; - - /* - * Set the thread`s LDT or LDT entry. - */ - if (new->task == TASK_NULL || new->task->i386_ldt == 0) { - /* - * Use system LDT. - */ - ml_cpu_set_ldt(KERNEL_LDT); - } else { - /* - * Task has its own LDT. - */ - user_ldt_set(new); - } - } - - /* - * Bump the scheduler generation count in the commpage. - * This can be read by user code to detect its preemption. - */ - commpage_sched_gen_inc(); -} -#else -static void -act_machine_switch_pcb( thread_t new ) -{ - pcb_t pcb = new->machine.pcb; - struct real_descriptor *ldtp; - vm_offset_t pcb_stack_top; - vm_offset_t hi_pcb_stack_top; - vm_offset_t hi_iss; - cpu_data_t *cdp = current_cpu_datap(); - - assert(new->kernel_stack != 0); - STACK_IEL(new->kernel_stack)->saved_state = pcb->iss; - - if (!cpu_mode_is64bit()) { - x86_saved_state32_tagged_t *hi_iss32; - /* - * Save a pointer to the top of the "kernel" stack - - * actually the place in the PCB where a trap into - * kernel mode will push the registers. 
- */ - hi_iss = (vm_offset_t)((unsigned long) - pmap_cpu_high_map_vaddr(cpu_number(), HIGH_CPU_ISS0) | - ((unsigned long)pcb->iss & PAGE_MASK)); - - cdp->cpu_hi_iss = (void *)hi_iss; - - pmap_high_map(pcb->iss_pte0, HIGH_CPU_ISS0); - pmap_high_map(pcb->iss_pte1, HIGH_CPU_ISS1); - - hi_iss32 = (x86_saved_state32_tagged_t *) hi_iss; - assert(hi_iss32->tag == x86_SAVED_STATE32); - - hi_pcb_stack_top = (int) (hi_iss32 + 1); - - /* - * For fast syscall, top of interrupt stack points to pcb stack - */ - *(vm_offset_t *) current_sstk() = hi_pcb_stack_top; - - current_ktss()->esp0 = hi_pcb_stack_top; - - } else if (is_saved_state64(pcb->iss)) { - /* - * The test above is performed against the thread save state - * flavor and not task's 64-bit feature flag because of the - * thread/task 64-bit state divergence that can arise in - * task_set_64bit() x86: the task state is changed before - * the individual thread(s). - */ - x86_saved_state64_tagged_t *iss64; - vm_offset_t isf; - - assert(is_saved_state64(pcb->iss)); - - iss64 = (x86_saved_state64_tagged_t *) pcb->iss; - - /* - * Set pointer to PCB's interrupt stack frame in cpu data. - * Used by syscall and double-fault trap handlers. - */ - isf = (vm_offset_t) &iss64->state.isf; - cdp->cpu_uber.cu_isf = UBER64(isf); - pcb_stack_top = (vm_offset_t) (iss64 + 1); - /* require 16-byte alignment */ - assert((pcb_stack_top & 0xF) == 0); - /* Interrupt stack is pcb */ - current_ktss64()->rsp0 = UBER64(pcb_stack_top); - - /* - * Top of temporary sysenter stack points to pcb stack. - * Although this is not normally used by 64-bit users, - * it needs to be set in case a sysenter is attempted. - */ - *current_sstk64() = UBER64(pcb_stack_top); - - cdp->cpu_task_map = new->map->pmap->pm_task_map; - - /* - * Enable the 64-bit user code segment, USER64_CS. - * Disable the 32-bit user code segment, USER_CS. 
- */ - ldt_desc_p(USER64_CS)->access |= ACC_PL_U; - ldt_desc_p(USER_CS)->access &= ~ACC_PL_U; - - } else { - x86_saved_state_compat32_t *iss32compat; - vm_offset_t isf; - - assert(is_saved_state32(pcb->iss)); - iss32compat = (x86_saved_state_compat32_t *) pcb->iss; - - pcb_stack_top = (int) (iss32compat + 1); - /* require 16-byte alignment */ - assert((pcb_stack_top & 0xF) == 0); - - /* - * Set pointer to PCB's interrupt stack frame in cpu data. - * Used by debug trap handler. - */ - isf = (vm_offset_t) &iss32compat->isf64; - cdp->cpu_uber.cu_isf = UBER64(isf); - - /* Top of temporary sysenter stack points to pcb stack */ - *current_sstk64() = UBER64(pcb_stack_top); - - /* Interrupt stack is pcb */ - current_ktss64()->rsp0 = UBER64(pcb_stack_top); - - cdp->cpu_task_map = TASK_MAP_32BIT; - /* Precalculate pointers to syscall argument store, for use - * in the trampolines. - */ - cdp->cpu_uber_arg_store = UBER64((vm_offset_t)get_bsduthreadarg(new)); - cdp->cpu_uber_arg_store_valid = UBER64((vm_offset_t)&pcb->arg_store_valid); - pcb->arg_store_valid = 0; - - /* - * Disable USER64_CS - * Enable USER_CS - */ - ldt_desc_p(USER64_CS)->access &= ~ACC_PL_U; - ldt_desc_p(USER_CS)->access |= ACC_PL_U; - } - - /* - * Set the thread`s cthread (a.k.a pthread) - * For 32-bit user this involves setting the USER_CTHREAD - * descriptor in the LDT to point to the cthread data. - * The involves copying in the pre-initialized descriptor. - */ - ldtp = (struct real_descriptor *)current_ldt(); - ldtp[sel_idx(USER_CTHREAD)] = pcb->cthread_desc; - if (pcb->uldt_selector != 0) - ldtp[sel_idx(pcb->uldt_selector)] = pcb->uldt_desc; - - - /* - * For 64-bit, we additionally set the 64-bit User GS base - * address. On return to 64-bit user, the GS.Base MSR will be written. - */ - cdp->cpu_uber.cu_user_gs_base = pcb->cthread_self; - - /* - * Set the thread`s LDT or LDT entry. - */ - if (new->task == TASK_NULL || new->task->i386_ldt == 0) { - /* - * Use system LDT. 
- */ - ml_cpu_set_ldt(KERNEL_LDT); - } else { - /* - * Task has its own LDT. - */ - user_ldt_set(new); - } - - /* - * Bump the scheduler generation count in the commpage. - * This can be read by user code to detect its preemption. - */ - commpage_sched_gen_inc(); -} -#endif /* * Switch to the first thread on a CPU. @@ -791,7 +471,7 @@ machine_load_context( machine_pmc_cswitch(NULL, new); #endif new->machine.specFlags |= OnProc; - act_machine_switch_pcb(new); + act_machine_switch_pcb(NULL, new); Load_context(new); } @@ -817,7 +497,6 @@ machine_switch_context( */ fpu_save_context(old); - old->machine.specFlags &= ~OnProc; new->machine.specFlags |= OnProc; @@ -837,12 +516,12 @@ machine_switch_context( * Switch address maps if need be, even if not switching tasks. * (A server activation may be "borrowing" a client map.) */ - PMAP_SWITCH_CONTEXT(old, new, cpu_number()) + PMAP_SWITCH_CONTEXT(old, new, cpu_number()); /* * Load the rest of the user state for the new thread */ - act_machine_switch_pcb(new); + act_machine_switch_pcb(old, new); return(Switch_context(old, continuation, new)); } @@ -861,16 +540,6 @@ machine_processor_shutdown( return(Shutdown_context(thread, doshutdown, processor)); } -/* - * act_machine_sv_free - * release saveareas associated with an act. if flag is true, release - * user level savearea(s) too, else don't - */ -void -act_machine_sv_free(__unused thread_t act, __unused int flag) -{ -} - /* * This is where registers that are not normally specified by the mach-o @@ -885,16 +554,16 @@ machine_thread_state_initialize( * The initialized state will then be lazily faulted-in, if required. * And if we're target, re-arm the no-fpu trap. 
*/ - if (thread->machine.pcb->ifps) { + if (thread->machine.ifps) { (void) fpu_set_fxstate(thread, NULL, x86_FLOAT_STATE64); if (thread == current_thread()) clear_fpu(); } - if (thread->machine.pcb->ids) { - zfree(ids_zone, thread->machine.pcb->ids); - thread->machine.pcb->ids = NULL; + if (thread->machine.ids) { + zfree(ids_zone, thread->machine.ids); + thread->machine.ids = NULL; } return KERN_SUCCESS; @@ -940,6 +609,7 @@ get_exception_state64(thread_t thread, x86_exception_state64_t *es) saved_state = USER_REGS64(thread); es->trapno = saved_state->isf.trapno; + es->cpu = saved_state->isf.cpu; es->err = (typeof(es->err))saved_state->isf.err; es->faultvaddr = saved_state->cr2; } @@ -952,6 +622,7 @@ get_exception_state32(thread_t thread, x86_exception_state32_t *es) saved_state = USER_REGS32(thread); es->trapno = saved_state->trapno; + es->cpu = saved_state->cpu; es->err = saved_state->err; es->faultvaddr = saved_state->cr2; } @@ -962,6 +633,7 @@ set_thread_state32(thread_t thread, x86_thread_state32_t *ts) { x86_saved_state32_t *saved_state; + pal_register_cache_state(thread, DIRTY); saved_state = USER_REGS32(thread); @@ -1027,6 +699,7 @@ set_thread_state64(thread_t thread, x86_thread_state64_t *ts) { x86_saved_state64_t *saved_state; + pal_register_cache_state(thread, DIRTY); saved_state = USER_REGS64(thread); @@ -1066,6 +739,7 @@ get_thread_state32(thread_t thread, x86_thread_state32_t *ts) { x86_saved_state32_t *saved_state; + pal_register_cache_state(thread, VALID); saved_state = USER_REGS32(thread); @@ -1093,6 +767,7 @@ get_thread_state64(thread_t thread, x86_thread_state64_t *ts) { x86_saved_state64_t *saved_state; + pal_register_cache_state(thread, VALID); saved_state = USER_REGS64(thread); @@ -1120,87 +795,6 @@ get_thread_state64(thread_t thread, x86_thread_state64_t *ts) } -void -thread_set_wq_state32(thread_t thread, thread_state_t tstate) -{ - x86_thread_state32_t *state; - x86_saved_state32_t *saved_state; - thread_t curth = current_thread(); - spl_t 
s=0; - - - saved_state = USER_REGS32(thread); - - state = (x86_thread_state32_t *)tstate; - - if (curth != thread) { - s = splsched(); - thread_lock(thread); - } - - saved_state->ebp = 0; - saved_state->eip = state->eip; - saved_state->eax = state->eax; - saved_state->ebx = state->ebx; - saved_state->ecx = state->ecx; - saved_state->edx = state->edx; - saved_state->edi = state->edi; - saved_state->esi = state->esi; - saved_state->uesp = state->esp; - saved_state->efl = EFL_USER_SET; - - saved_state->cs = USER_CS; - saved_state->ss = USER_DS; - saved_state->ds = USER_DS; - saved_state->es = USER_DS; - - - if (curth != thread) { - thread_unlock(thread); - splx(s); - } -} - - -void -thread_set_wq_state64(thread_t thread, thread_state_t tstate) -{ - x86_thread_state64_t *state; - x86_saved_state64_t *saved_state; - thread_t curth = current_thread(); - spl_t s=0; - - - saved_state = USER_REGS64(thread); - state = (x86_thread_state64_t *)tstate; - - if (curth != thread) { - s = splsched(); - thread_lock(thread); - } - - saved_state->rbp = 0; - saved_state->rdi = state->rdi; - saved_state->rsi = state->rsi; - saved_state->rdx = state->rdx; - saved_state->rcx = state->rcx; - saved_state->r8 = state->r8; - saved_state->r9 = state->r9; - - saved_state->isf.rip = state->rip; - saved_state->isf.rsp = state->rsp; - saved_state->isf.cs = USER64_CS; - saved_state->isf.rflags = EFL_USER_SET; - - - if (curth != thread) { - thread_unlock(thread); - splx(s); - } -} - - - /* * act_machine_set_state: * @@ -1237,6 +831,7 @@ machine_thread_set_state( state->gs)) return KERN_INVALID_ARGUMENT; + pal_register_cache_state(thr_act, DIRTY); saved_state = USER_REGS32(thr_act); @@ -1307,6 +902,7 @@ machine_thread_set_state( !IS_USERADDR64_CANONICAL(state->isf.rip)) return KERN_INVALID_ARGUMENT; + pal_register_cache_state(thr_act, DIRTY); saved_state = USER_REGS64(thr_act); @@ -1757,6 +1353,11 @@ machine_thread_get_state( *count = x86_EXCEPTION_STATE32_COUNT; get_exception_state32(thr_act, 
(x86_exception_state32_t *)tstate); + /* + * Suppress the cpu number for binary compatibility + * of this deprecated state. + */ + ((x86_exception_state32_t *)tstate)->cpu = 0; break; } @@ -1771,6 +1372,11 @@ machine_thread_get_state( *count = x86_EXCEPTION_STATE64_COUNT; get_exception_state64(thr_act, (x86_exception_state64_t *)tstate); + /* + * Suppress the cpu number for binary compatibility + * of this deprecated state. + */ + ((x86_exception_state64_t *)tstate)->cpu = 0; break; } @@ -2029,156 +1635,6 @@ machine_thread_get_kern_state( } -/* - * Initialize the machine-dependent state for a new thread. - */ -kern_return_t -machine_thread_create( - thread_t thread, - task_t task) -{ - pcb_t pcb = &thread->machine.xxx_pcb; - x86_saved_state_t *iss; - -#if NCOPY_WINDOWS > 0 - inval_copy_windows(thread); - - thread->machine.physwindow_pte = 0; - thread->machine.physwindow_busy = 0; -#endif - - /* - * Allocate pcb only if required. - */ - if (pcb->sf == NULL) { - pcb->sf = zalloc(iss_zone); - if (pcb->sf == NULL) - panic("iss_zone"); - } - - if (task_has_64BitAddr(task)) { - x86_sframe64_t *sf64; - - sf64 = (x86_sframe64_t *) pcb->sf; - - bzero((char *)sf64, sizeof(x86_sframe64_t)); - - iss = (x86_saved_state_t *) &sf64->ssf; - iss->flavor = x86_SAVED_STATE64; - /* - * Guarantee that the bootstrapped thread will be in user - * mode. 
- */ - iss->ss_64.isf.rflags = EFL_USER_SET; - iss->ss_64.isf.cs = USER64_CS; - iss->ss_64.isf.ss = USER_DS; - iss->ss_64.fs = USER_DS; - iss->ss_64.gs = USER_DS; - } else { - if (cpu_mode_is64bit()) { - x86_sframe_compat32_t *sfc32; - - sfc32 = (x86_sframe_compat32_t *)pcb->sf; - - bzero((char *)sfc32, sizeof(x86_sframe_compat32_t)); - - iss = (x86_saved_state_t *) &sfc32->ssf.iss32; - iss->flavor = x86_SAVED_STATE32; -#if defined(__i386__) -#if DEBUG - { - x86_saved_state_compat32_t *xssc; - - xssc = (x86_saved_state_compat32_t *) iss; - - xssc->pad_for_16byte_alignment[0] = 0x64326432; - xssc->pad_for_16byte_alignment[1] = 0x64326432; - } -#endif /* DEBUG */ - } else { - x86_sframe32_t *sf32; - struct real_descriptor *ldtp; - pmap_paddr_t paddr; - - sf32 = (x86_sframe32_t *) pcb->sf; - - bzero((char *)sf32, sizeof(x86_sframe32_t)); - - iss = (x86_saved_state_t *) &sf32->ssf; - iss->flavor = x86_SAVED_STATE32; - pcb->iss_pte0 = pte_kernel_rw(kvtophys((vm_offset_t)iss)); - if (0 == (paddr = pa_to_pte(kvtophys((vm_offset_t)iss + PAGE_SIZE)))) - pcb->iss_pte1 = INTEL_PTE_INVALID; - else - pcb->iss_pte1 = pte_kernel_rw(paddr); - - - ldtp = (struct real_descriptor *) - pmap_index_to_virt(HIGH_FIXED_LDT_BEGIN); - pcb->cthread_desc = ldtp[sel_idx(USER_DS)]; - pcb->uldt_desc = ldtp[sel_idx(USER_DS)]; -#endif /* __i386__ */ - } - /* - * Guarantee that the bootstrapped thread will be in user - * mode. - */ - iss->ss_32.cs = USER_CS; - iss->ss_32.ss = USER_DS; - iss->ss_32.ds = USER_DS; - iss->ss_32.es = USER_DS; - iss->ss_32.fs = USER_DS; - iss->ss_32.gs = USER_DS; - iss->ss_32.efl = EFL_USER_SET; - - } - pcb->iss = iss; - - thread->machine.pcb = pcb; - simple_lock_init(&pcb->lock, 0); - - pcb->arg_store_valid = 0; - pcb->cthread_self = 0; - pcb->uldt_selector = 0; - - /* Ensure that the "cthread" descriptor describes a valid - * segment. 
- */ - if ((pcb->cthread_desc.access & ACC_P) == 0) { - struct real_descriptor *ldtp; - ldtp = (struct real_descriptor *)current_ldt(); - pcb->cthread_desc = ldtp[sel_idx(USER_DS)]; - } - - - return(KERN_SUCCESS); -} - -/* - * Machine-dependent cleanup prior to destroying a thread - */ -void -machine_thread_destroy( - thread_t thread) -{ - register pcb_t pcb = thread->machine.pcb; - - assert(pcb); - - if (pcb->ifps != 0) - fpu_free(pcb->ifps); - if (pcb->sf != 0) { - zfree(iss_zone, pcb->sf); - pcb->sf = 0; - } - if (pcb->ids) { - zfree(ids_zone, pcb->ids); - pcb->ids = NULL; - } - thread->machine.pcb = (pcb_t)0; - -} - void machine_thread_switch_addrmode(thread_t thread) { @@ -2189,17 +1645,20 @@ machine_thread_switch_addrmode(thread_t thread) disable_preemption(); /* - * Reset the state saveareas. + * Reset the state saveareas. As we're resetting, we anticipate no + * memory allocations in this path. */ machine_thread_create(thread, thread->task); /* If we're switching ourselves, reset the pcb addresses etc. */ if (thread == current_thread()) { + boolean_t istate = ml_set_interrupts_enabled(FALSE); #if defined(__i386__) - if (current_cpu_datap()->cpu_active_cr3 != kernel_pmap->pm_cr3) - pmap_load_kernel_cr3(); + if (current_cpu_datap()->cpu_active_cr3 != kernel_pmap->pm_cr3) + pmap_load_kernel_cr3(); #endif /* defined(__i386) */ - act_machine_switch_pcb(thread); + act_machine_switch_pcb(NULL, thread); + ml_set_interrupts_enabled(istate); } enable_preemption(); } @@ -2238,30 +1697,6 @@ machine_thread_terminate_self(void) } } -void -act_machine_return( - int code - ) -{ - /* - * This code is called with nothing locked. - * It also returns with nothing locked, if it returns. - * - * This routine terminates the current thread activation. - * If this is the only activation associated with its - * thread shuttle, then the entire thread (shuttle plus - * activation) is terminated. 
- */ - assert( code == KERN_TERMINATED ); - - thread_terminate_self(); - - /*NOTREACHED*/ - - panic("act_machine_return(%d): TALKING ZOMBIE! (1)", code); -} - - /* * Perform machine-dependent per-thread initializations */ @@ -2329,9 +1764,6 @@ dump_handlers(thread_t thr_act) void dump_regs(thread_t thr_act) { - if (thr_act->machine.pcb == NULL) - return; - if (thread_is_64bit(thr_act)) { x86_saved_state64_t *ssp; @@ -2371,14 +1803,14 @@ dump_act(thread_t thr_act) printf("\tsusp=%d user_stop=%d active=%x ast=%x\n", thr_act->suspend_count, thr_act->user_stop_count, thr_act->active, thr_act->ast); - printf("\tpcb=%p\n", thr_act->machine.pcb); + printf("\tpcb=%p\n", &thr_act->machine); if (thr_act->kernel_stack) { vm_offset_t stack = thr_act->kernel_stack; printf("\tk_stk %lx eip %x ebx %x esp %x iss %p\n", (long)stack, STACK_IKS(stack)->k_eip, STACK_IKS(stack)->k_ebx, - STACK_IKS(stack)->k_esp, STACK_IEL(stack)->saved_state); + STACK_IKS(stack)->k_esp, thr_act->machine.iss); } dump_handlers(thr_act); @@ -2392,9 +1824,6 @@ get_useraddr(void) { thread_t thr_act = current_thread(); - if (thr_act->machine.pcb == NULL) - return(0); - if (thread_is_64bit(thr_act)) { x86_saved_state64_t *iss64; @@ -2452,11 +1881,11 @@ machine_stack_attach( #if defined(__x86_64__) statep->k_rip = (unsigned long) Thread_continue; statep->k_rbx = (unsigned long) thread_continue; - statep->k_rsp = (unsigned long) STACK_IEL(stack); + statep->k_rsp = (unsigned long) (STACK_IKS(stack) - 1); #else statep->k_eip = (unsigned long) Thread_continue; statep->k_ebx = (unsigned long) thread_continue; - statep->k_esp = (unsigned long) STACK_IEL(stack); + statep->k_esp = (unsigned long) (STACK_IKS(stack) - 1); #endif return; @@ -2494,12 +1923,11 @@ machine_stack_handoff(thread_t old, fpu_save_context(old); - old->machine.specFlags &= ~OnProc; new->machine.specFlags |= OnProc; PMAP_SWITCH_CONTEXT(old, new, cpu_number()); - act_machine_switch_pcb(new); + act_machine_switch_pcb(old, new); 
machine_set_current_thread(new); @@ -2644,13 +2072,13 @@ void act_thread_cfree(__unused void *ctx) } void x86_toggle_sysenter_arg_store(thread_t thread, boolean_t valid); void x86_toggle_sysenter_arg_store(thread_t thread, boolean_t valid) { - thread->machine.pcb->arg_store_valid = valid; + thread->machine.arg_store_valid = valid; } boolean_t x86_sysenter_arg_store_isvalid(thread_t thread); boolean_t x86_sysenter_arg_store_isvalid(thread_t thread) { - return (thread->machine.pcb->arg_store_valid); + return (thread->machine.arg_store_valid); } /* @@ -2702,4 +2130,3 @@ copy_debug_state64( target->dr6 = src->dr6; target->dr7 = src->dr7; } - diff --git a/osfmk/i386/pcb_native.c b/osfmk/i386/pcb_native.c new file mode 100644 index 000000000..8ce815029 --- /dev/null +++ b/osfmk/i386/pcb_native.c @@ -0,0 +1,652 @@ +/* + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * @OSF_COPYRIGHT@ + */ +/* + * Mach Operating System + * Copyright (c) 1991,1990 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. 
+ */ + +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(__i386__) +#include +#endif +#include +#include + +#define ASSERT_IS_16BYTE_MULTIPLE_SIZEOF(_type_) \ +extern char assert_is_16byte_multiple_sizeof_ ## _type_ \ + [(sizeof(_type_) % 16) == 0 ? 1 : -1] + +/* Compile-time checks for vital save area sizing: */ +ASSERT_IS_16BYTE_MULTIPLE_SIZEOF(x86_64_intr_stack_frame_t); +ASSERT_IS_16BYTE_MULTIPLE_SIZEOF(x86_sframe64_t); +ASSERT_IS_16BYTE_MULTIPLE_SIZEOF(x86_saved_state_compat32_t); +ASSERT_IS_16BYTE_MULTIPLE_SIZEOF(x86_saved_state_t); + +#define DIRECTION_FLAG_DEBUG (DEBUG | DEVELOPMENT) + +extern zone_t iss_zone; /* zone for saved_state area */ +extern zone_t ids_zone; /* zone for debug_state area */ + +extern void *get_bsduthreadarg(thread_t); +void +act_machine_switch_pcb(__unused thread_t old, thread_t new) +{ + pcb_t pcb = THREAD_TO_PCB(new); + cpu_data_t *cdp = current_cpu_datap(); + struct real_descriptor *ldtp; + mach_vm_offset_t pcb_stack_top; + + assert(new->kernel_stack != 0); + assert(ml_get_interrupts_enabled() == FALSE); +#ifdef DIRECTION_FLAG_DEBUG + if (x86_get_flags() & EFL_DF) { + panic("Direction flag detected: 0x%lx", x86_get_flags()); + } +#endif + +#if defined(__x86_64__) + /* + * Clear segment state + * unconditionally for DS/ES/FS but more carefully for GS whose + * cached state we track. 
+ */ + set_ds(NULL_SEG); + set_es(NULL_SEG); + set_fs(NULL_SEG); + if (get_gs() != NULL_SEG) { + swapgs(); /* switch to user's GS context */ + set_gs(NULL_SEG); + swapgs(); /* and back to kernel */ + + /* record the active machine state lost */ + cdp->cpu_uber.cu_user_gs_base = 0; + } + + if (is_saved_state64(pcb->iss)) { + /* + * The test above is performed against the thread save state + * flavor and not task's 64-bit feature flag because of the + * thread/task 64-bit state divergence that can arise in + * task_set_64bit() x86: the task state is changed before + * the individual thread(s). + */ + x86_saved_state64_tagged_t *iss64; + vm_offset_t isf; + + assert(is_saved_state64(pcb->iss)); + + iss64 = (x86_saved_state64_tagged_t *) pcb->iss; + + /* + * Set pointer to PCB's interrupt stack frame in cpu data. + * Used by syscall and double-fault trap handlers. + */ + isf = (vm_offset_t) &iss64->state.isf; + cdp->cpu_uber.cu_isf = isf; + pcb_stack_top = (vm_offset_t) (iss64 + 1); + /* require 16-byte alignment */ + assert((pcb_stack_top & 0xF) == 0); + + /* Interrupt stack is pcb */ + current_ktss64()->rsp0 = pcb_stack_top; + + /* + * Top of temporary sysenter stack points to pcb stack. + * Although this is not normally used by 64-bit users, + * it needs to be set in case a sysenter is attempted. + */ + *current_sstk64() = pcb_stack_top; + + cdp->cpu_task_map = new->map->pmap->pm_task_map; + + /* + * Enable the 64-bit user code segment, USER64_CS. + * Disable the 32-bit user code segment, USER_CS. + */ + ldt_desc_p(USER64_CS)->access |= ACC_PL_U; + ldt_desc_p(USER_CS)->access &= ~ACC_PL_U; + + /* + * Switch user's GS base if necessary + * by setting the Kernel's GS base MSR + * - this will become the user's on the swapgs when + * returning to user-space. Avoid this for + * kernel threads (no user TLS support required) + * and verify the memory shadow of the segment base + * in the event it was altered in user space. 
+ */ + if ((pcb->cthread_self != 0) || (new->task != kernel_task)) { + if ((cdp->cpu_uber.cu_user_gs_base != pcb->cthread_self) || (pcb->cthread_self != rdmsr64(MSR_IA32_KERNEL_GS_BASE))) { + cdp->cpu_uber.cu_user_gs_base = pcb->cthread_self; + wrmsr64(MSR_IA32_KERNEL_GS_BASE, pcb->cthread_self); + } + } + } else { + x86_saved_state_compat32_t *iss32compat; + vm_offset_t isf; + + assert(is_saved_state32(pcb->iss)); + iss32compat = (x86_saved_state_compat32_t *) pcb->iss; + + pcb_stack_top = (uintptr_t) (iss32compat + 1); + /* require 16-byte alignment */ + assert((pcb_stack_top & 0xF) == 0); + + /* + * Set pointer to PCB's interrupt stack frame in cpu data. + * Used by debug trap handler. + */ + isf = (vm_offset_t) &iss32compat->isf64; + cdp->cpu_uber.cu_isf = isf; + + /* Top of temporary sysenter stack points to pcb stack */ + *current_sstk64() = pcb_stack_top; + + /* Interrupt stack is pcb */ + current_ktss64()->rsp0 = pcb_stack_top; + + cdp->cpu_task_map = TASK_MAP_32BIT; + /* Precalculate pointers to syscall argument store, for use + * in the trampolines. + */ + cdp->cpu_uber_arg_store = (vm_offset_t)get_bsduthreadarg(new); + cdp->cpu_uber_arg_store_valid = (vm_offset_t)&pcb->arg_store_valid; + pcb->arg_store_valid = 0; + + /* + * Disable USER64_CS + * Enable USER_CS + */ + ldt_desc_p(USER64_CS)->access &= ~ACC_PL_U; + ldt_desc_p(USER_CS)->access |= ACC_PL_U; + + /* + * Set the thread`s cthread (a.k.a pthread) + * For 32-bit user this involves setting the USER_CTHREAD + * descriptor in the LDT to point to the cthread data. + * The involves copying in the pre-initialized descriptor. + */ + ldtp = (struct real_descriptor *)current_ldt(); + ldtp[sel_idx(USER_CTHREAD)] = pcb->cthread_desc; + if (pcb->uldt_selector != 0) + ldtp[sel_idx(pcb->uldt_selector)] = pcb->uldt_desc; + cdp->cpu_uber.cu_user_gs_base = pcb->cthread_self; + + /* + * Set the thread`s LDT or LDT entry. + */ + if (new->task == TASK_NULL || new->task->i386_ldt == 0) { + /* + * Use system LDT. 
+ */ + ml_cpu_set_ldt(KERNEL_LDT); + } else { + /* + * Task has its own LDT. + */ + user_ldt_set(new); + } + } + +#else /* !__x86_64__ */ + + vm_offset_t hi_pcb_stack_top; + vm_offset_t hi_iss; + + if (!cpu_mode_is64bit()) { + x86_saved_state32_tagged_t *hi_iss32; + /* + * Save a pointer to the top of the "kernel" stack - + * actually the place in the PCB where a trap into + * kernel mode will push the registers. + */ + hi_iss = (vm_offset_t)((unsigned long) + pmap_cpu_high_map_vaddr(cpu_number(), HIGH_CPU_ISS0) | + ((unsigned long)pcb->iss & PAGE_MASK)); + + cdp->cpu_hi_iss = (void *)hi_iss; + + pmap_high_map(pcb->iss_pte0, HIGH_CPU_ISS0); + pmap_high_map(pcb->iss_pte1, HIGH_CPU_ISS1); + + hi_iss32 = (x86_saved_state32_tagged_t *) hi_iss; + assert(hi_iss32->tag == x86_SAVED_STATE32); + + hi_pcb_stack_top = (int) (hi_iss32 + 1); + + /* + * For fast syscall, top of interrupt stack points to pcb stack + */ + *(vm_offset_t *) current_sstk() = hi_pcb_stack_top; + + current_ktss()->esp0 = hi_pcb_stack_top; + + } else if (is_saved_state64(pcb->iss)) { + /* + * The test above is performed against the thread save state + * flavor and not task's 64-bit feature flag because of the + * thread/task 64-bit state divergence that can arise in + * task_set_64bit() x86: the task state is changed before + * the individual thread(s). + */ + x86_saved_state64_tagged_t *iss64; + vm_offset_t isf; + + assert(is_saved_state64(pcb->iss)); + + iss64 = (x86_saved_state64_tagged_t *) pcb->iss; + + /* + * Set pointer to PCB's interrupt stack frame in cpu data. + * Used by syscall and double-fault trap handlers. + */ + isf = (vm_offset_t) &iss64->state.isf; + cdp->cpu_uber.cu_isf = UBER64(isf); + pcb_stack_top = (vm_offset_t) (iss64 + 1); + /* require 16-byte alignment */ + assert((pcb_stack_top & 0xF) == 0); + /* Interrupt stack is pcb */ + current_ktss64()->rsp0 = UBER64(pcb_stack_top); + + /* + * Top of temporary sysenter stack points to pcb stack. 
+ * Although this is not normally used by 64-bit users, + * it needs to be set in case a sysenter is attempted. + */ + *current_sstk64() = UBER64(pcb_stack_top); + + cdp->cpu_task_map = new->map->pmap->pm_task_map; + + /* + * Enable the 64-bit user code segment, USER64_CS. + * Disable the 32-bit user code segment, USER_CS. + */ + ldt_desc_p(USER64_CS)->access |= ACC_PL_U; + ldt_desc_p(USER_CS)->access &= ~ACC_PL_U; + + } else { + x86_saved_state_compat32_t *iss32compat; + vm_offset_t isf; + + assert(is_saved_state32(pcb->iss)); + iss32compat = (x86_saved_state_compat32_t *) pcb->iss; + + pcb_stack_top = (int) (iss32compat + 1); + /* require 16-byte alignment */ + assert((pcb_stack_top & 0xF) == 0); + + /* + * Set pointer to PCB's interrupt stack frame in cpu data. + * Used by debug trap handler. + */ + isf = (vm_offset_t) &iss32compat->isf64; + cdp->cpu_uber.cu_isf = UBER64(isf); + + /* Top of temporary sysenter stack points to pcb stack */ + *current_sstk64() = UBER64(pcb_stack_top); + + /* Interrupt stack is pcb */ + current_ktss64()->rsp0 = UBER64(pcb_stack_top); + + cdp->cpu_task_map = TASK_MAP_32BIT; + /* Precalculate pointers to syscall argument store, for use + * in the trampolines. + */ + cdp->cpu_uber_arg_store = UBER64((vm_offset_t)get_bsduthreadarg(new)); + cdp->cpu_uber_arg_store_valid = UBER64((vm_offset_t)&pcb->arg_store_valid); + pcb->arg_store_valid = 0; + + /* + * Disable USER64_CS + * Enable USER_CS + */ + ldt_desc_p(USER64_CS)->access &= ~ACC_PL_U; + ldt_desc_p(USER_CS)->access |= ACC_PL_U; + } + + /* + * Set the thread`s cthread (a.k.a pthread) + * For 32-bit user this involves setting the USER_CTHREAD + * descriptor in the LDT to point to the cthread data. + * The involves copying in the pre-initialized descriptor. 
+ */ + ldtp = (struct real_descriptor *)current_ldt(); + ldtp[sel_idx(USER_CTHREAD)] = pcb->cthread_desc; + if (pcb->uldt_selector != 0) + ldtp[sel_idx(pcb->uldt_selector)] = pcb->uldt_desc; + + /* + * For 64-bit, we additionally set the 64-bit User GS base + * address. On return to 64-bit user, the GS.Base MSR will be written. + */ + cdp->cpu_uber.cu_user_gs_base = pcb->cthread_self; + + /* + * Set the thread`s LDT or LDT entry. + */ + if (new->task == TASK_NULL || new->task->i386_ldt == 0) { + /* + * Use system LDT. + */ + ml_cpu_set_ldt(KERNEL_LDT); + } else { + /* + * Task has its own LDT. + */ + user_ldt_set(new); + } +#endif + + /* + * Bump the scheduler generation count in the commpage. + * This can be read by user code to detect its preemption. + */ + commpage_sched_gen_inc(); +} +void +thread_set_wq_state32(thread_t thread, thread_state_t tstate) +{ + x86_thread_state32_t *state; + x86_saved_state32_t *saved_state; + thread_t curth = current_thread(); + spl_t s=0; + + pal_register_cache_state(thread, DIRTY); + + saved_state = USER_REGS32(thread); + + state = (x86_thread_state32_t *)tstate; + + if (curth != thread) { + s = splsched(); + thread_lock(thread); + } + + saved_state->ebp = 0; + saved_state->eip = state->eip; + saved_state->eax = state->eax; + saved_state->ebx = state->ebx; + saved_state->ecx = state->ecx; + saved_state->edx = state->edx; + saved_state->edi = state->edi; + saved_state->esi = state->esi; + saved_state->uesp = state->esp; + saved_state->efl = EFL_USER_SET; + + saved_state->cs = USER_CS; + saved_state->ss = USER_DS; + saved_state->ds = USER_DS; + saved_state->es = USER_DS; + + if (curth != thread) { + thread_unlock(thread); + splx(s); + } +} + + +void +thread_set_wq_state64(thread_t thread, thread_state_t tstate) +{ + x86_thread_state64_t *state; + x86_saved_state64_t *saved_state; + thread_t curth = current_thread(); + spl_t s=0; + + pal_register_cache_state(thread, DIRTY); + + saved_state = USER_REGS64(thread); + state = 
(x86_thread_state64_t *)tstate; + + if (curth != thread) { + s = splsched(); + thread_lock(thread); + } + + saved_state->rbp = 0; + saved_state->rdi = state->rdi; + saved_state->rsi = state->rsi; + saved_state->rdx = state->rdx; + saved_state->rcx = state->rcx; + saved_state->r8 = state->r8; + saved_state->r9 = state->r9; + + saved_state->isf.rip = state->rip; + saved_state->isf.rsp = state->rsp; + saved_state->isf.cs = USER64_CS; + saved_state->isf.rflags = EFL_USER_SET; + + if (curth != thread) { + thread_unlock(thread); + splx(s); + } +} + +/* + * Initialize the machine-dependent state for a new thread. + */ +kern_return_t +machine_thread_create( + thread_t thread, + task_t task) +{ + pcb_t pcb = THREAD_TO_PCB(thread); + x86_saved_state_t *iss; + +#if NCOPY_WINDOWS > 0 + inval_copy_windows(thread); + + thread->machine.physwindow_pte = 0; + thread->machine.physwindow_busy = 0; +#endif + + /* + * Allocate save frame only if required. + */ + if (pcb->sf == NULL) { + assert((get_preemption_level() == 0)); + pcb->sf = zalloc(iss_zone); + if (pcb->sf == NULL) + panic("iss_zone"); + } + + if (task_has_64BitAddr(task)) { + x86_sframe64_t *sf64; + + sf64 = (x86_sframe64_t *) pcb->sf; + + bzero((char *)sf64, sizeof(x86_sframe64_t)); + + iss = (x86_saved_state_t *) &sf64->ssf; + iss->flavor = x86_SAVED_STATE64; + /* + * Guarantee that the bootstrapped thread will be in user + * mode. 
+ */ + iss->ss_64.isf.rflags = EFL_USER_SET; + iss->ss_64.isf.cs = USER64_CS; + iss->ss_64.isf.ss = USER_DS; + iss->ss_64.fs = USER_DS; + iss->ss_64.gs = USER_DS; + } else { + if (cpu_mode_is64bit()) { + x86_sframe_compat32_t *sfc32; + + sfc32 = (x86_sframe_compat32_t *)pcb->sf; + + bzero((char *)sfc32, sizeof(x86_sframe_compat32_t)); + + iss = (x86_saved_state_t *) &sfc32->ssf.iss32; + iss->flavor = x86_SAVED_STATE32; +#if defined(__i386__) +#if DEBUG + { + sfc32->pad_for_16byte_alignment[0] = 0x64326432; + sfc32->pad_for_16byte_alignment[1] = 0x64326432; + } +#endif /* DEBUG */ + } else { + x86_sframe32_t *sf32; + struct real_descriptor *ldtp; + pmap_paddr_t paddr; + + sf32 = (x86_sframe32_t *) pcb->sf; + + bzero((char *)sf32, sizeof(x86_sframe32_t)); + + iss = (x86_saved_state_t *) &sf32->ssf; + iss->flavor = x86_SAVED_STATE32; + + pcb->iss_pte0 = pte_kernel_rw(kvtophys((vm_offset_t)iss)); + if (0 == (paddr = pa_to_pte(kvtophys((vm_offset_t)iss + PAGE_SIZE)))) + pcb->iss_pte1 = INTEL_PTE_INVALID; + else + pcb->iss_pte1 = pte_kernel_rw(paddr); + + ldtp = (struct real_descriptor *) + pmap_index_to_virt(HIGH_FIXED_LDT_BEGIN); + pcb->cthread_desc = ldtp[sel_idx(USER_DS)]; + pcb->uldt_desc = ldtp[sel_idx(USER_DS)]; +#endif /* __i386__ */ + } + /* + * Guarantee that the bootstrapped thread will be in user + * mode. + */ + iss->ss_32.cs = USER_CS; + iss->ss_32.ss = USER_DS; + iss->ss_32.ds = USER_DS; + iss->ss_32.es = USER_DS; + iss->ss_32.fs = USER_DS; + iss->ss_32.gs = USER_DS; + iss->ss_32.efl = EFL_USER_SET; + + } + pcb->iss = iss; + + simple_lock_init(&pcb->lock, 0); + + pcb->arg_store_valid = 0; + pcb->cthread_self = 0; + pcb->uldt_selector = 0; + + /* Ensure that the "cthread" descriptor describes a valid + * segment. 
+ */ + if ((pcb->cthread_desc.access & ACC_P) == 0) { + struct real_descriptor *ldtp; + ldtp = (struct real_descriptor *)current_ldt(); + pcb->cthread_desc = ldtp[sel_idx(USER_DS)]; + } + + return(KERN_SUCCESS); +} + +/* + * Machine-dependent cleanup prior to destroying a thread + */ +void +machine_thread_destroy( + thread_t thread) +{ + register pcb_t pcb = THREAD_TO_PCB(thread); + + if (pcb->ifps != 0) + fpu_free(pcb->ifps); + if (pcb->sf != 0) { + zfree(iss_zone, pcb->sf); + pcb->sf = 0; + } + if (pcb->ids) { + zfree(ids_zone, pcb->ids); + pcb->ids = NULL; + } +} diff --git a/osfmk/i386/pmCPU.c b/osfmk/i386/pmCPU.c index c469d7a1c..22eafd1b8 100644 --- a/osfmk/i386/pmCPU.c +++ b/osfmk/i386/pmCPU.c @@ -42,13 +42,15 @@ #include #include #include -#include #include #include #include -#include +#include #include #include +#include + +#include extern int disableConsoleOutput; @@ -57,7 +59,7 @@ decl_simple_lock_data(,pm_init_lock); /* * The following is set when the KEXT loads and initializes. */ -pmDispatch_t *pmDispatch = NULL; +pmDispatch_t *pmDispatch = NULL; static uint32_t pmInitDone = 0; static boolean_t earlyTopology = FALSE; @@ -111,7 +113,10 @@ machine_idle(void) * cause problems in some MP configurations w.r.t. the APIC * stopping during a GV3 transition). */ - __asm__ volatile ("sti; hlt"); + pal_hlt(); + + /* Once woken, re-disable interrupts. */ + pal_cli(); } /* @@ -125,7 +130,7 @@ machine_idle(void) * Re-enable interrupts. 
*/ out: - __asm__ volatile("sti"); + pal_sti(); } /* @@ -140,19 +145,19 @@ pmCPUHalt(uint32_t reason) switch (reason) { case PM_HALT_DEBUG: cpup->lcpu.state = LCPU_PAUSE; - __asm__ volatile ("wbinvd; hlt"); + pal_stop_cpu(FALSE); break; case PM_HALT_PANIC: cpup->lcpu.state = LCPU_PAUSE; - __asm__ volatile ("cli; wbinvd; hlt"); + pal_stop_cpu(TRUE); break; case PM_HALT_NORMAL: default: - __asm__ volatile ("cli"); + pal_cli(); - if (pmInitDone + if (pmInitDone && pmDispatch != NULL && pmDispatch->pmCPUHalt != NULL) { /* @@ -166,7 +171,8 @@ pmCPUHalt(uint32_t reason) i386_init_slave_fast(); panic("init_slave_fast returned"); - } else { + } else + { /* * If no power managment and a processor is taken off-line, * then invalidate the cache and halt it (it will not be able @@ -174,10 +180,11 @@ pmCPUHalt(uint32_t reason) */ __asm__ volatile ("wbinvd"); cpup->lcpu.state = LCPU_HALT; - __asm__ volatile ( "wbinvd; hlt" ); + pal_stop_cpu(FALSE); panic("back from Halt"); } + break; } } @@ -269,13 +276,15 @@ pmLockCPUTopology(int lock) /* * Called to get the next deadline that has been set by the * power management code. + * Note: a return of 0 from AICPM and this routine signifies + * that no deadline is set. */ uint64_t pmCPUGetDeadline(cpu_data_t *cpu) { uint64_t deadline = 0; - if (pmInitDone + if (pmInitDone && pmDispatch != NULL && pmDispatch->GetDeadline != NULL) deadline = (*pmDispatch->GetDeadline)(&cpu->lcpu); @@ -500,6 +509,19 @@ ml_set_maxintdelay(uint64_t mdelay) pmDispatch->setMaxIntDelay(mdelay); } +boolean_t +ml_get_interrupt_prewake_applicable() +{ + boolean_t applicable = FALSE; + + if (pmInitDone + && pmDispatch != NULL + && pmDispatch->pmInterruptPrewakeApplicable != NULL) + applicable = pmDispatch->pmInterruptPrewakeApplicable(); + + return applicable; +} + /* * Put a CPU into "safe" mode with respect to power. 
* @@ -604,26 +626,58 @@ machine_choose_processor(processor_set_t pset, } static int -pmThreadGetUrgency(__unused uint64_t *rt_period, __unused uint64_t *rt_deadline) +pmThreadGetUrgency(uint64_t *rt_period, uint64_t *rt_deadline) { - return(0); + return(thread_get_urgency(rt_period, rt_deadline)); } +#if DEBUG +uint32_t urgency_stats[64][THREAD_URGENCY_MAX]; +#endif + +#define URGENCY_NOTIFICATION_ASSERT_NS (5 * 1000 * 1000) +uint64_t urgency_notification_assert_abstime_threshold, urgency_notification_max_recorded; + void thread_tell_urgency(int urgency, - uint64_t rt_period, - uint64_t rt_deadline) -{ - KERNEL_DEBUG_CONSTANT(0x1400054, - urgency, rt_period, (rt_deadline >> 32), rt_deadline, 0); - - if (!pmInitDone - || pmDispatch == NULL - || pmDispatch->pmThreadTellUrgency == NULL) - return; + uint64_t rt_period, + uint64_t rt_deadline) +{ + uint64_t urgency_notification_time_start, delta; + boolean_t urgency_assert = (urgency_notification_assert_abstime_threshold != 0); + assert(get_preemption_level() > 0 || ml_get_interrupts_enabled() == FALSE); +#if DEBUG + urgency_stats[cpu_number() % 64][urgency]++; +#endif + if (!pmInitDone + || pmDispatch == NULL + || pmDispatch->pmThreadTellUrgency == NULL) + return; + + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED,MACH_URGENCY) | DBG_FUNC_START, urgency, rt_period, (rt_deadline >> 32), rt_deadline, 0); + + if (__improbable((urgency_assert == TRUE))) + urgency_notification_time_start = mach_absolute_time(); + + pmDispatch->pmThreadTellUrgency(urgency, rt_period, rt_deadline); + + if (__improbable((urgency_assert == TRUE))) { + delta = mach_absolute_time() - urgency_notification_time_start; + + if (__improbable(delta > urgency_notification_max_recorded)) { + /* This is not synchronized, but it doesn't matter + * if we (rarely) miss an event, as it is statistically + * unlikely that it will never recur. 
+ */ + urgency_notification_max_recorded = delta; + + if (__improbable((delta > urgency_notification_assert_abstime_threshold) && !machine_timeout_suspended())) + panic("Urgency notification callout %p exceeded threshold, 0x%llx abstime units", pmDispatch->pmThreadTellUrgency, delta); + } + } - pmDispatch->pmThreadTellUrgency(urgency, rt_period, rt_deadline); + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED,MACH_URGENCY) | DBG_FUNC_END, urgency, rt_period, (rt_deadline >> 32), rt_deadline, 0); } void @@ -693,21 +747,25 @@ pmGetNanotimeInfo(pm_rtc_nanotime_t *rtc_nanotime) * Make sure that nanotime didn't change while we were reading it. */ do { - rtc_nanotime->generation = rtc_nanotime_info.generation; /* must be first */ - rtc_nanotime->tsc_base = rtc_nanotime_info.tsc_base; - rtc_nanotime->ns_base = rtc_nanotime_info.ns_base; - rtc_nanotime->scale = rtc_nanotime_info.scale; - rtc_nanotime->shift = rtc_nanotime_info.shift; - } while(rtc_nanotime_info.generation != 0 - && rtc_nanotime->generation != rtc_nanotime_info.generation); + rtc_nanotime->generation = pal_rtc_nanotime_info.generation; /* must be first */ + rtc_nanotime->tsc_base = pal_rtc_nanotime_info.tsc_base; + rtc_nanotime->ns_base = pal_rtc_nanotime_info.ns_base; + rtc_nanotime->scale = pal_rtc_nanotime_info.scale; + rtc_nanotime->shift = pal_rtc_nanotime_info.shift; + } while(pal_rtc_nanotime_info.generation != 0 + && rtc_nanotime->generation != pal_rtc_nanotime_info.generation); } static uint32_t -pmTimerQueueMigrate(__unused int target_cpu) +pmTimerQueueMigrate(int target_cpu) { - return (0); + /* Call the etimer code to do this. */ + return (target_cpu != cpu_number()) + ? etimer_queue_migrate(target_cpu) + : 0; } + /* * Called by the power management kext to register itself and to get the * callbacks it might need into other kernel functions. 
This interface @@ -736,19 +794,18 @@ pmKextRegister(uint32_t version, pmDispatch_t *cpuFuncs, callbacks->LCPUtoProcessor = pmLCPUtoProcessor; callbacks->ThreadBind = thread_bind; callbacks->GetSavedRunCount = pmGetSavedRunCount; - callbacks->pmSendIPI = pmSendIPI; callbacks->GetNanotimeInfo = pmGetNanotimeInfo; callbacks->ThreadGetUrgency = pmThreadGetUrgency; callbacks->RTCClockAdjust = rtc_clock_adjust; callbacks->timerQueueMigrate = pmTimerQueueMigrate; callbacks->topoParms = &topoParms; + callbacks->pmSendIPI = pmSendIPI; callbacks->InterruptPending = lapic_is_interrupt_pending; callbacks->IsInterrupting = lapic_is_interrupting; callbacks->InterruptStats = lapic_interrupt_counts; callbacks->DisableApicTimer = lapic_disable_timer; } else { - panic("Version mis-match between Kernel (%d) and CPU PM (%d)", - PM_DISPATCH_VERSION, version); + panic("Version mis-match between Kernel and CPU PM"); } if (cpuFuncs != NULL) { diff --git a/osfmk/i386/pmCPU.h b/osfmk/i386/pmCPU.h index 55041fc10..c443c1efa 100644 --- a/osfmk/i386/pmCPU.h +++ b/osfmk/i386/pmCPU.h @@ -30,15 +30,14 @@ #define _I386_PMCPU_H_ #include -#include #ifndef ASSEMBLER /* - * This value should be changed each time that pmDsipatch_t or pmCallBacks_t + * This value should be changed each time that pmDispatch_t or pmCallBacks_t * changes. */ -#define PM_DISPATCH_VERSION 23 +#define PM_DISPATCH_VERSION 102 /* * Dispatch table for functions that get installed when the power @@ -79,11 +78,10 @@ typedef struct int (*pmIPIHandler)(void *state); void (*pmThreadTellUrgency)(int urgency, uint64_t rt_period, uint64_t rt_deadline); void (*pmActiveRTThreads)(boolean_t active); + boolean_t (*pmInterruptPrewakeApplicable)(void); } pmDispatch_t; - -/* - * common time fields exported to PM code. This structure may be +/* common time fields exported to PM code. This structure may be * allocated on the stack, so avoid making it unnecessarily large. 
*/ typedef struct pm_rtc_nanotime { @@ -115,9 +113,8 @@ typedef struct { void (*pmSendIPI)(int cpu); void (*GetNanotimeInfo)(pm_rtc_nanotime_t *); int (*ThreadGetUrgency)(uint64_t *rt_period, uint64_t *rt_deadline); - uint32_t (*timeQueueMigrate)(int cpu); - void (*RTCClockAdjust)(uint64_t adjustment); uint32_t (*timerQueueMigrate)(int cpu); + void (*RTCClockAdjust)(uint64_t adjustment); x86_topology_parameters_t *topoParms; boolean_t (*InterruptPending)(void); boolean_t (*IsInterrupting)(uint8_t vector); @@ -144,8 +141,6 @@ void pmTimerSave(void); void pmTimerRestore(void); kern_return_t pmCPUExitHalt(int cpu); kern_return_t pmCPUExitHaltToOff(int cpu); -void thread_tell_urgency(int urgency, uint64_t rt_period, uint64_t rt_deadline); -void active_rt_threads(boolean_t active); #define PM_HALT_NORMAL 0 /* normal halt path */ #define PM_HALT_DEBUG 1 /* debug code wants to halt */ @@ -160,7 +155,9 @@ void pmSafeMode(x86_lcpu_t *lcpu, uint32_t flags); #define PM_SAFE_FL_RESUME 0x00000020 /* resume execution on the CPU */ extern int pmsafe_debug; -extern int idlehalt; +/* Default urgency timing threshold for the DEBUG build */ +#define URGENCY_NOTIFICATION_ASSERT_NS (5 * 1000 * 1000) +extern uint64_t urgency_notification_assert_abstime_threshold; /****************************************************************************** * diff --git a/osfmk/i386/pmap.c b/osfmk/i386/pmap.c index 3d12ba9f2..b672bdc6b 100644 --- a/osfmk/i386/pmap.c +++ b/osfmk/i386/pmap.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -144,6 +144,7 @@ #include #include #include +#include /* #define DEBUGINTERRUPTS 1 uncomment to ensure pmap callers have interrupts enabled */ @@ -160,35 +161,6 @@ #include #endif /* IWANTTODEBUG */ -/* - * Forward declarations for internal functions. 
- */ - -void pmap_remove_range( - pmap_t pmap, - vm_map_offset_t va, - pt_entry_t *spte, - pt_entry_t *epte); - -void phys_attribute_clear( - ppnum_t phys, - int bits); - -int phys_attribute_test( - ppnum_t phys, - int bits); - -void phys_attribute_set( - ppnum_t phys, - int bits); - -void pmap_set_reference( - ppnum_t pn); - -boolean_t phys_page_exists( - ppnum_t pn); - - #ifdef PMAP_DEBUG void dump_pmap(pmap_t); void dump_4GB_pdpt(pmap_t p); @@ -203,26 +175,19 @@ int allow_data_exec = VM_ABI_32; /* 32-bit apps may execute data by default, 64 #endif int allow_stack_exec = 0; /* No apps may execute from the stack by default */ +#if CONFIG_YONAH boolean_t cpu_64bit = FALSE; +#else +const boolean_t cpu_64bit = TRUE; +#endif boolean_t pmap_trace = FALSE; -/* - * when spinning through pmap_remove - * ensure that we don't spend too much - * time with preemption disabled. - * I'm setting the current threshold - * to 20us - */ -#define MAX_PREEMPTION_LATENCY_NS 20000 - uint64_t max_preemption_latency_tsc = 0; - pv_hashed_entry_t *pv_hash_table; /* hash lists */ uint32_t npvhash = 0; - /* * pv_list entries are kept on a list that can only be accessed * with the pmap system locked (at SPLVM, not in the cpus_active set). @@ -235,27 +200,10 @@ decl_simple_lock_data(,pv_hashed_free_list_lock) decl_simple_lock_data(,pv_hashed_kern_free_list_lock) decl_simple_lock_data(,pv_hash_table_lock) -int pv_free_count = 0; -int pv_hashed_free_count = 0; -int pv_kern_free_count = 0; -int pv_hashed_kern_free_count = 0; - zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry structures */ static zone_t pdpt_zone; -/* - * Each entry in the pv_head_table is locked by a bit in the - * pv_lock_table. The lock bits are accessed by the physical - * address of the page they lock. 
- */ - -char *pv_lock_table; /* pointer to array of bits */ -#define pv_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE) - -char *pv_hash_lock_table; -#define pv_hash_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE) - /* * First and last physical addresses that we maintain any information * for. Initialized to zero so that pmap operations done before @@ -266,6 +214,10 @@ boolean_t pmap_initialized = FALSE;/* Has pmap_init completed? */ static struct vm_object kptobj_object_store; static vm_object_t kptobj; +/* + * Index into pv_head table, its lock bits, and the modify/reference and managed bits + */ + /* * Array of physical page attribites for managed pages. * One byte per physical page. @@ -273,11 +225,6 @@ static vm_object_t kptobj; char *pmap_phys_attributes; unsigned int last_managed_page = 0; -/* - * Amount of virtual memory mapped by one - * page-directory entry. - */ -#define PDE_MAPPED_SIZE (pdetova(1)) uint64_t pde_mapped_size; /* @@ -369,30 +316,37 @@ struct zone *pmap_zone; /* zone of pmap structures */ int pmap_debug = 0; /* flag for debugging prints */ unsigned int inuse_ptepages_count = 0; +long long alloc_ptepages_count __attribute__((aligned(8))) = 0LL; /* aligned for atomic access */ +unsigned int bootstrap_wired_pages = 0; +int pt_fake_zone_index = -1; + +extern long NMIPI_acks; + +static inline void +PMAP_ZINFO_SALLOC(vm_size_t bytes) +{ + current_thread()->tkm_shared.alloc += bytes; +} + +static inline void +PMAP_ZINFO_SFREE(vm_size_t bytes) +{ + current_thread()->tkm_shared.free += (bytes); +} addr64_t kernel64_cr3; boolean_t no_shared_cr3 = FALSE; /* -no_shared_cr3 boot arg */ - -/* - * Pmap cache. Cache is threaded through ref_count field of pmap. - * Max will eventually be constant -- variable for experimentation. 
- */ -int pmap_cache_max = 32; -int pmap_alloc_chunk = 8; -pmap_t pmap_cache_list; -int pmap_cache_count; -decl_simple_lock_data(,pmap_cache_lock) +boolean_t kernel_text_ps_4K = TRUE; +boolean_t wpkernel = TRUE; extern char end; - static int nkpt; -extern long NMIPI_acks; - pt_entry_t *DMAP1, *DMAP2; caddr_t DADDR1; caddr_t DADDR2; + /* * for legacy, returns the address of the pde entry. * for 64 bit, causes the pdpt page containing the pde entry to be mapped, @@ -412,7 +366,6 @@ pmap_pde(pmap_t m, vm_map_offset_t v) return pde; } - /* * the single pml4 page per pmap is allocated at pmap create time and exists * for the duration of the pmap. we allocate this page in kernel vm (to save us one @@ -644,7 +597,6 @@ pmap_map_bd( if (prot & VM_PROT_WRITE) template |= INTEL_PTE_WRITE; - while (start_addr < end_addr) { spl = splhigh(); pte = pmap_pte(kernel_pmap, (vm_map_offset_t)virt); @@ -658,14 +610,20 @@ pmap_map_bd( start_addr += PAGE_SIZE; } - flush_tlb(); return(virt); } -extern char *first_avail; +extern pmap_paddr_t first_avail; extern vm_offset_t virtual_avail, virtual_end; extern pmap_paddr_t avail_start, avail_end; +extern vm_offset_t sHIB; +extern vm_offset_t eHIB; +extern vm_offset_t stext; +extern vm_offset_t etext; +extern vm_offset_t sdata; + +extern void *KPTphys; void pmap_cpu_init(void) @@ -834,7 +792,6 @@ pmap_bootstrap( boolean_t IA32e) { vm_offset_t va; - pt_entry_t *pte; int i; pdpt_entry_t *pdpt; spl_t s; @@ -872,7 +829,10 @@ pmap_bootstrap( pmap_store_pte(pdpt, pa | INTEL_PTE_VALID); } +#if CONFIG_YONAH + /* 32-bit and legacy support depends on IA32e mode being disabled */ cpu_64bit = IA32e; +#endif lo_kernel_cr3 = kernel_pmap->pm_cr3; current_cpu_datap()->cpu_kernel_cr3 = (addr64_t) kernel_pmap->pm_cr3; @@ -887,6 +847,8 @@ pmap_bootstrap( nkpt = NKPT; OSAddAtomic(NKPT, &inuse_ptepages_count); + OSAddAtomic64(NKPT, &alloc_ptepages_count); + bootstrap_wired_pages = NKPT; virtual_avail = (vm_offset_t)VADDR(KPTDI,0) + (vm_offset_t)first_avail; 
virtual_end = (vm_offset_t)(VM_MAX_KERNEL_ADDRESS); @@ -895,11 +857,11 @@ pmap_bootstrap( * Reserve some special page table entries/VA space for temporary * mapping of pages. */ -#define SYSMAP(c, p, v, n) \ - v = (c)va; va += ((n)*INTEL_PGBYTES); p = pte; pte += (n) - va = virtual_avail; + pt_entry_t *pte; pte = vtopte(va); +#define SYSMAP(c, p, v, n) \ + v = (c)va; va += ((n)*INTEL_PGBYTES); p = pte; pte += (n) for (i=0; i> PTPGSHIFT); + DBG("ml_static_mfree(%p,0x%x) for pte\n", + (void *) vm_ptep, PAGE_SIZE); + ml_static_mfree(vm_ptep, PAGE_SIZE); + } + + /* Change variable read by sysctl machdep.pmap */ + pmap_kernel_text_ps = I386_LPGBYTES; + } + /* no matter what, kernel page zero is not accessible */ + pmap_store_pte(pmap_pte(kernel_pmap, 0), INTEL_PTE_INVALID); + + /* map lowmem global page into fixed addr */ + pt_entry_t *pte = NULL; + if (0 == (pte = pmap_pte(kernel_pmap, + VM_MIN_KERNEL_LOADED_ADDRESS + 0x2000))) + panic("lowmem pte"); + /* make sure it is defined on page boundary */ + assert(0 == ((vm_offset_t) &lowGlo & PAGE_MASK)); + pmap_store_pte(pte, kvtophys((vm_offset_t)&lowGlo) + | INTEL_PTE_REF + | INTEL_PTE_MOD + | INTEL_PTE_WIRED + | INTEL_PTE_VALID + | INTEL_PTE_RW); + splx(spl); + flush_tlb(); +} #define managed_page(x) ( (unsigned int)x <= last_managed_page && (pmap_phys_attributes[x] & PHYS_MANAGED) ) @@ -1324,6 +1489,8 @@ pmap_create( va = (vm_offset_t)p->dirbase; p->pdirbase = kvtophys(va); + PMAP_ZINFO_SALLOC(NBPTD); + template = INTEL_PTE_VALID; for (i = 0; i< NPGPTD; i++, pdpt++ ) { pmap_paddr_t pa; @@ -1347,6 +1514,8 @@ pmap_create( p->pm_cr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_hold); OSAddAtomic(1, &inuse_ptepages_count); + OSAddAtomic64(1, &alloc_ptepages_count); + PMAP_ZINFO_SALLOC(PAGE_SIZE); /* allocate the vm_objs to hold the pdpt, pde and pte pages */ @@ -1362,7 +1531,7 @@ pmap_create( /* uber space points to uber mapped kernel */ s = splhigh(); pml4p = pmap64_pml4(p, 0ULL); - 
pmap_store_pte((pml4p+KERNEL_UBER_PML4_INDEX), *kernel_pmap->pm_pml4); + pmap_store_pte((pml4p+KERNEL_UBER_PML4_INDEX),*kernel_pmap->pm_pml4); if (!is_64bit) { @@ -1458,6 +1627,7 @@ void pmap_clear_4GB_pagezero(pmap_t p) { pdpt_entry_t *user_pdptp; + boolean_t istate; if (p->pm_task_map != TASK_MAP_64BIT_SHARED) return; @@ -1466,6 +1636,9 @@ pmap_clear_4GB_pagezero(pmap_t p) p->pm_task_map = TASK_MAP_64BIT; + istate = ml_set_interrupts_enabled(FALSE); + if (current_cpu_datap()->cpu_task_map == TASK_MAP_64BIT_SHARED) + current_cpu_datap()->cpu_task_map = TASK_MAP_64BIT; pmap_load_kernel_cr3(); user_pdptp = pmap64_pdpt(p, 0x0); @@ -1474,6 +1647,8 @@ pmap_clear_4GB_pagezero(pmap_t p) pmap_store_pte(user_pdptp+2, 0); pmap_store_pte(user_pdptp+3, 0); + ml_set_interrupts_enabled(istate); + PMAP_UNLOCK(p); } @@ -1542,8 +1717,11 @@ pmap_destroy( */ if (!cpu_64bit) { OSAddAtomic(-p->pm_obj->resident_page_count, &inuse_ptepages_count); + PMAP_ZINFO_PFREE(p->pm_obj->resident_page_count * PAGE_SIZE); kmem_free(kernel_map, (vm_offset_t)p->dirbase, NBPTD); + PMAP_ZINFO_SFREE(NBPTD); + zfree(pdpt_zone, (void *)p->pm_hold); vm_object_deallocate(p->pm_obj); @@ -1552,8 +1730,8 @@ pmap_destroy( int inuse_ptepages = 0; /* free 64 bit mode structs */ - inuse_ptepages++; kmem_free(kernel_map, (vm_offset_t)p->pm_hold, PAGE_SIZE); + PMAP_ZINFO_SFREE(PAGE_SIZE); inuse_ptepages += p->pm_obj_pml4->resident_page_count; vm_object_deallocate(p->pm_obj_pml4); @@ -1564,8 +1742,10 @@ pmap_destroy( inuse_ptepages += p->pm_obj->resident_page_count; vm_object_deallocate(p->pm_obj); - OSAddAtomic(-inuse_ptepages, &inuse_ptepages_count); + OSAddAtomic(-(inuse_ptepages+1), &inuse_ptepages_count); + PMAP_ZINFO_PFREE(inuse_ptepages * PAGE_SIZE); } + zfree(pmap_zone, p); PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END, @@ -1589,7 +1769,6 @@ pmap_reference( } } - /* * Remove phys addr if mapped in specified map * @@ -1604,22 +1783,6 @@ pmap_remove_some_phys( } -/* - * Routine: - * pmap_disconnect - * - * 
Function: - * Disconnect all mappings for this page and return reference and change status - * in generic format. - * - */ -unsigned int pmap_disconnect( - ppnum_t pa) -{ - pmap_page_protect(pa, 0); /* disconnect the page */ - return (pmap_get_refmod(pa)); /* return ref/chg status */ -} - /* * Set the physical protection on the * specified range of this map as requested. @@ -1694,7 +1857,9 @@ pmap_protect( sva = lva; } if (num_found) + { PMAP_UPDATE_TLBS(map, orig_sva, eva); + } PMAP_UNLOCK(map); @@ -1723,51 +1888,6 @@ pmap_map_block( } } - -/* - * Routine: pmap_change_wiring - * Function: Change the wiring attribute for a map/virtual-address - * pair. - * In/out conditions: - * The mapping must already exist in the pmap. - */ -void -pmap_change_wiring( - register pmap_t map, - vm_map_offset_t vaddr, - boolean_t wired) -{ - register pt_entry_t *pte; - - /* - * We must grab the pmap system lock because we may - * change a pte_page queue. - */ - PMAP_LOCK(map); - - if ((pte = pmap_pte(map, vaddr)) == PT_ENTRY_NULL) - panic("pmap_change_wiring: pte missing"); - - if (wired && !iswired(*pte)) { - /* - * wiring down mapping - */ - OSAddAtomic(+1, &map->stats.wired_count); - pmap_update_pte(pte, *pte, (*pte | INTEL_PTE_WIRED)); - } - else if (!wired && iswired(*pte)) { - /* - * unwiring mapping - */ - assert(map->stats.wired_count >= 1); - OSAddAtomic(-1, &map->stats.wired_count); - pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_WIRED)); - } - - PMAP_UNLOCK(map); -} - - /* * Routine: pmap_extract * Function: @@ -1838,6 +1958,8 @@ pmap_expand_pml4( vm_page_unlock_queues(); OSAddAtomic(1, &inuse_ptepages_count); + OSAddAtomic64(1, &alloc_ptepages_count); + PMAP_ZINFO_PALLOC(PAGE_SIZE); /* Take the oject lock (mutex) before the PMAP_LOCK (spinlock) */ vm_object_lock(map->pm_obj_pml4); @@ -1853,6 +1975,7 @@ pmap_expand_pml4( VM_PAGE_FREE(m); OSAddAtomic(-1, &inuse_ptepages_count); + PMAP_ZINFO_PFREE(PAGE_SIZE); return; } pmap_set_noencrypt(pn); @@ -1928,6 +2051,8 @@ 
pmap_expand_pdpt( vm_page_unlock_queues(); OSAddAtomic(1, &inuse_ptepages_count); + OSAddAtomic64(1, &alloc_ptepages_count); + PMAP_ZINFO_PALLOC(PAGE_SIZE); /* Take the oject lock (mutex) before the PMAP_LOCK (spinlock) */ vm_object_lock(map->pm_obj_pdpt); @@ -1943,6 +2068,7 @@ pmap_expand_pdpt( VM_PAGE_FREE(m); OSAddAtomic(-1, &inuse_ptepages_count); + PMAP_ZINFO_PFREE(PAGE_SIZE); return; } pmap_set_noencrypt(pn); @@ -2040,6 +2166,8 @@ pmap_expand( vm_page_unlock_queues(); OSAddAtomic(1, &inuse_ptepages_count); + OSAddAtomic64(1, &alloc_ptepages_count); + PMAP_ZINFO_PALLOC(PAGE_SIZE); /* Take the oject lock (mutex) before the PMAP_LOCK (spinlock) */ vm_object_lock(map->pm_obj); @@ -2056,6 +2184,7 @@ pmap_expand( VM_PAGE_FREE(m); OSAddAtomic(-1, &inuse_ptepages_count); + PMAP_ZINFO_PFREE(PAGE_SIZE); return; } pmap_set_noencrypt(pn); @@ -2205,11 +2334,12 @@ pmap_collect( if (m == VM_PAGE_NULL) panic("pmap_collect: pte page not in object"); + vm_object_unlock(p->pm_obj); + VM_PAGE_FREE(m); OSAddAtomic(-1, &inuse_ptepages_count); - - vm_object_unlock(p->pm_obj); + PMAP_ZINFO_PFREE(PAGE_SIZE); } PMAP_LOCK(p); @@ -2261,319 +2391,6 @@ pmap_pageable( #endif /* lint */ } -/* - * Clear specified attribute bits. - */ -void -phys_attribute_clear( - ppnum_t pn, - int bits) -{ - pv_rooted_entry_t pv_h; - register pv_hashed_entry_t pv_e; - register pt_entry_t *pte; - int pai; - register pmap_t pmap; - - pmap_intr_assert(); - assert(pn != vm_page_fictitious_addr); - if (pn == vm_page_guard_addr) - return; - - pai = ppn_to_pai(pn); - - if (!managed_page(pai)) { - /* - * Not a managed page. - */ - return; - } - - - PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, - (int) pn, bits, 0, 0, 0); - - pv_h = pai_to_pvh(pai); - - LOCK_PVH(pai); - - /* - * Walk down PV list, clearing all modify or reference bits. - * We do not have to lock the pv_list because we have - * the entire pmap system locked. - */ - if (pv_h->pmap != PMAP_NULL) { - /* - * There are some mappings. 
- */ - - pv_e = (pv_hashed_entry_t)pv_h; - - do { - pmap = pv_e->pmap; - - { - vm_map_offset_t va; - - va = pv_e->va; - - /* - * Clear modify and/or reference bits. - */ - - pte = pmap_pte(pmap, va); - pmap_update_pte(pte, *pte, (*pte & ~bits)); - /* Ensure all processors using this translation - * invalidate this TLB entry. The invalidation *must* follow - * the PTE update, to ensure that the TLB shadow of the - * 'D' bit (in particular) is synchronized with the - * updated PTE. - */ - PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE); - } - - pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink); - - } while (pv_e != (pv_hashed_entry_t)pv_h); - } - pmap_phys_attributes[pai] &= ~bits; - - UNLOCK_PVH(pai); - - PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END, - 0, 0, 0, 0, 0); - -} - -/* - * Check specified attribute bits. - */ -int -phys_attribute_test( - ppnum_t pn, - int bits) -{ - pv_rooted_entry_t pv_h; - register pv_hashed_entry_t pv_e; - register pt_entry_t *pte; - int pai; - register pmap_t pmap; - int attributes = 0; - - pmap_intr_assert(); - assert(pn != vm_page_fictitious_addr); - if (pn == vm_page_guard_addr) - return 0; - - pai = ppn_to_pai(pn); - - if (!managed_page(pai)) { - /* - * Not a managed page. - */ - return (0); - } - - /* - * super fast check... if bits already collected - * no need to take any locks... - * if not set, we need to recheck after taking - * the lock in case they got pulled in while - * we were waiting for the lock - */ - if ( (pmap_phys_attributes[pai] & bits) == bits) - return (bits); - - pv_h = pai_to_pvh(pai); - - LOCK_PVH(pai); - - attributes = pmap_phys_attributes[pai] & bits; - - - /* - * Walk down PV list, checking the mappings until we - * reach the end or we've found the attributes we've asked for - * We do not have to lock the pv_list because we have - * the entire pmap system locked. - */ - if (pv_h->pmap != PMAP_NULL) { - /* - * There are some mappings. 
- */ - pv_e = (pv_hashed_entry_t)pv_h; - if (attributes != bits) do { - - pmap = pv_e->pmap; - - { - vm_map_offset_t va; - - va = pv_e->va; - /* - * first make sure any processor actively - * using this pmap, flushes its TLB state - */ - PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE); - - /* - * pick up modify and/or reference bits from this mapping - */ - pte = pmap_pte(pmap, va); - attributes |= (int)(*pte & bits); - - } - - pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink); - - } while ((attributes != bits) && (pv_e != (pv_hashed_entry_t)pv_h)); - } - - UNLOCK_PVH(pai); - return (attributes); -} - -/* - * Set specified attribute bits. - */ -void -phys_attribute_set( - ppnum_t pn, - int bits) -{ - int pai; - - pmap_intr_assert(); - assert(pn != vm_page_fictitious_addr); - if (pn == vm_page_guard_addr) - return; - - pai = ppn_to_pai(pn); - - if (!managed_page(pai)) { - /* - * Not a managed page. - */ - return; - } - - LOCK_PVH(pai); - - pmap_phys_attributes[pai] |= bits; - - UNLOCK_PVH(pai); -} - -/* - * Set the modify bit on the specified physical page. - */ - -void pmap_set_modify( - ppnum_t pn) -{ - phys_attribute_set(pn, PHYS_MODIFIED); -} - -/* - * Clear the modify bits on the specified physical page. - */ - -void -pmap_clear_modify( - ppnum_t pn) -{ - phys_attribute_clear(pn, PHYS_MODIFIED); -} - -/* - * pmap_is_modified: - * - * Return whether or not the specified physical page is modified - * by any physical maps. - */ - -boolean_t -pmap_is_modified( - ppnum_t pn) -{ - if (phys_attribute_test(pn, PHYS_MODIFIED)) - return TRUE; - - return FALSE; -} - -/* - * pmap_clear_reference: - * - * Clear the reference bit on the specified physical page. - */ - -void -pmap_clear_reference( - ppnum_t pn) -{ - phys_attribute_clear(pn, PHYS_REFERENCED); -} - -void -pmap_set_reference(ppnum_t pn) -{ - phys_attribute_set(pn, PHYS_REFERENCED); -} - -/* - * pmap_is_referenced: - * - * Return whether or not the specified physical page is referenced - * by any physical maps. 
- */ - -boolean_t -pmap_is_referenced( - ppnum_t pn) -{ - if (phys_attribute_test(pn, PHYS_REFERENCED)) - return TRUE; - - return FALSE; -} - -/* - * pmap_get_refmod(phys) - * returns the referenced and modified bits of the specified - * physical page. - */ -unsigned int -pmap_get_refmod(ppnum_t pa) -{ - int refmod; - unsigned int retval = 0; - - refmod = phys_attribute_test(pa, PHYS_MODIFIED | PHYS_REFERENCED); - - if (refmod & PHYS_MODIFIED) - retval |= VM_MEM_MODIFIED; - if (refmod & PHYS_REFERENCED) - retval |= VM_MEM_REFERENCED; - - return (retval); -} - -/* - * pmap_clear_refmod(phys, mask) - * clears the referenced and modified bits as specified by the mask - * of the specified physical page. - */ -void -pmap_clear_refmod(ppnum_t pa, unsigned int mask) -{ - unsigned int x86Mask; - - x86Mask = ( ((mask & VM_MEM_MODIFIED)? PHYS_MODIFIED : 0) - | ((mask & VM_MEM_REFERENCED)? PHYS_REFERENCED : 0)); - phys_attribute_clear(pa, x86Mask); -} - void invalidate_icache(__unused vm_offset_t addr, __unused unsigned cnt, @@ -2879,7 +2696,6 @@ pmap_cpu_free(struct cpu_pmap *cp) } } - mapwindow_t * pmap_get_mapwindow(pt_entry_t pentry) { @@ -2887,7 +2703,8 @@ pmap_get_mapwindow(pt_entry_t pentry) int i; assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0); - + /* fold in cache attributes for this physical page */ + pentry |= pmap_get_cache_attributes(i386_btop(pte_to_pa(pentry))); /* * Note: 0th map reserved for pmap_pte() */ @@ -2895,11 +2712,11 @@ pmap_get_mapwindow(pt_entry_t pentry) mp = ¤t_cpu_datap()->cpu_pmap->mapwindow[i]; if (*mp->prv_CMAP == 0) { - pmap_store_pte(mp->prv_CMAP, pentry); + pmap_store_pte(mp->prv_CMAP, pentry); - invlpg((uintptr_t)mp->prv_CADDR); + invlpg((uintptr_t)mp->prv_CADDR); - return (mp); + return (mp); } } panic("pmap_get_mapwindow: no windows available"); @@ -2937,17 +2754,26 @@ void pmap_disable_NX(pmap_t pmap) { } void -pt_fake_zone_info(int *count, vm_size_t *cur_size, vm_size_t *max_size, vm_size_t *elem_size, - 
vm_size_t *alloc_size, int *collectable, int *exhaustable) +pt_fake_zone_init(int zone_index) +{ + pt_fake_zone_index = zone_index; +} + +void +pt_fake_zone_info(int *count, + vm_size_t *cur_size, vm_size_t *max_size, vm_size_t *elem_size, vm_size_t *alloc_size, + uint64_t *sum_size, int *collectable, int *exhaustable, int *caller_acct) { *count = inuse_ptepages_count; *cur_size = PAGE_SIZE * inuse_ptepages_count; *max_size = PAGE_SIZE * (inuse_ptepages_count + vm_page_inactive_count + vm_page_active_count + vm_page_free_count); *elem_size = PAGE_SIZE; *alloc_size = PAGE_SIZE; + *sum_size = alloc_ptepages_count * PAGE_SIZE; *collectable = 1; *exhaustable = 0; + *caller_acct = 1; } vm_offset_t pmap_cpu_high_map_vaddr(int cpu, enum high_cpu_types e) @@ -2986,7 +2812,7 @@ pmap_cpuset_NMIPI(cpu_set cpu_mask) { if (cpu_mask & cpu_bit) cpu_NMI_interrupt(cpu); } - deadline = mach_absolute_time() + (LockTimeOut * 2); + deadline = mach_absolute_time() + (((uint64_t)LockTimeOut) * 3); while (mach_absolute_time() < deadline) cpu_pause(); } @@ -3001,7 +2827,7 @@ pmap_cpuset_NMIPI(cpu_set cpu_mask) { * - return ... 
the caller will unlock the pmap */ void -pmap_flush_tlbs(pmap_t pmap) +pmap_flush_tlbs(pmap_t pmap, vm_map_offset_t startv, vm_map_offset_t endv) { unsigned int cpu; unsigned int cpu_bit; @@ -3044,8 +2870,8 @@ pmap_flush_tlbs(pmap_t pmap) } } - PMAP_TRACE(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_START, - (int) pmap, cpus_to_signal, flush_self, 0, 0); + PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_START, + (uintptr_t) pmap, cpus_to_signal, flush_self, startv, 0); if (cpus_to_signal) { cpu_set cpus_to_respond = cpus_to_signal; @@ -3056,6 +2882,7 @@ pmap_flush_tlbs(pmap_t pmap) */ while (cpus_to_respond != 0) { long orig_acks = 0; + for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) { if ((cpus_to_respond & cpu_bit) != 0) { if (!cpu_datap(cpu)->cpu_running || @@ -3068,7 +2895,8 @@ pmap_flush_tlbs(pmap_t pmap) if (cpus_to_respond == 0) break; } - if (mach_absolute_time() > deadline) { + + if (cpus_to_respond && (mach_absolute_time() > deadline)) { if (machine_timeout_suspended()) continue; pmap_tlb_flush_timeout = TRUE; @@ -3094,8 +2922,8 @@ pmap_flush_tlbs(pmap_t pmap) panic("pmap_flush_tlbs: pmap == kernel_pmap && flush_self != TRUE; kernel CR3: 0x%llX, CPU active CR3: 0x%llX, CPU Task Map: %d", kernel_pmap->pm_cr3, current_cpu_datap()->cpu_active_cr3, current_cpu_datap()->cpu_task_map); } - PMAP_TRACE(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_END, - (int) pmap, cpus_to_signal, flush_self, 0, 0); + PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_END, + (uintptr_t) pmap, cpus_to_signal, startv, endv, 0); } void @@ -3120,16 +2948,6 @@ pmap_update_interrupt(void) PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_END, 0, 0, 0, 0, 0); } - - -unsigned int pmap_cache_attributes(ppnum_t pn) { - - if (!managed_page(ppn_to_pai(pn))) - return (VM_WIMG_IO); - - return (VM_WIMG_COPYBACK); -} - #ifdef PMAP_DEBUG void pmap_dump(pmap_t p) @@ -3222,4 +3040,3 @@ void dump_4GB_pdpt_thread(thread_t tp) #endif - diff --git a/osfmk/i386/pmap.h 
b/osfmk/i386/pmap.h index 5d3ac764e..44b6bf742 100644 --- a/osfmk/i386/pmap.h +++ b/osfmk/i386/pmap.h @@ -80,10 +80,13 @@ #include #include #include +#include #include #include +#include + /* * Define the generic in terms of the specific */ @@ -172,6 +175,24 @@ typedef uint64_t pt_entry_t; typedef uint64_t pmap_paddr_t; +#if DEBUG +#define PMAP_ASSERT 1 +#endif +#if PMAP_ASSERT +#define pmap_assert(ex) ((ex) ? (void)0 : Assert(__FILE__, __LINE__, # ex)) + +#define pmap_assert2(ex, fmt, args...) \ + do { \ + if (!(ex)) { \ + kprintf("Assertion %s failed (%s:%d, caller %p) " fmt , #ex, __FILE__, __LINE__, __builtin_return_address(0), ##args); \ + panic("Assertion %s failed (%s:%d, caller %p) " fmt , #ex, __FILE__, __LINE__, __builtin_return_address(0), ##args); \ + } \ + } while(0) +#else +#define pmap_assert(ex) +#define pmap_assert2(ex, fmt, args...) +#endif + /* superpages */ #ifdef __x86_64__ #define SUPERPAGE_NBASEPAGES 512 @@ -385,19 +406,10 @@ enum high_fixed_addresses { #define pdenum(pmap, a) (((vm_offset_t)(a) >> PDESHIFT) & PDEMASK) #define PMAP_INVALID_PDPTNUM (~0ULL) -#ifdef __i386__ #define pdeidx(pmap, a) (((a) >> PDSHIFT) & ((1ULL<<(48 - PDSHIFT)) -1)) #define pdptidx(pmap, a) (((a) >> PDPTSHIFT) & ((1ULL<<(48 - PDPTSHIFT)) -1)) #define pml4idx(pmap, a) (((a) >> PML4SHIFT) & ((1ULL<<(48 - PML4SHIFT)) -1)) -#else -#define VAMASK ((1ULL<<48)-1) -#define pml4idx(pmap, a) ((((a) & VAMASK) >> PML4SHIFT) & \ - ((1ULL<<(48 - PML4SHIFT))-1)) -#define pdptidx(pmap, a) ((((a) & PML4MASK) >> PDPTSHIFT) & \ - ((1ULL<<(48 - PDPTSHIFT))-1)) -#define pdeidx(pmap, a) ((((a) & PML4MASK) >> PDSHIFT) & \ - ((1ULL<<(48 - PDSHIFT)) - 1)) -#endif + /* * Convert page descriptor index to user virtual address @@ -433,7 +445,8 @@ enum high_fixed_addresses { #define INTEL_PTE_INVALID 0 /* This is conservative, but suffices */ -#define INTEL_PTE_RSVD ((1ULL << 8) | (1ULL << 9) | (1ULL << 10) | (1ULL << 11) | (0x1FFULL << 54)) +#define INTEL_PTE_RSVD ((1ULL << 10) | (1ULL << 11) 
| (0x1FFULL << 54)) + #define pa_to_pte(a) ((a) & INTEL_PTE_PFN) /* XXX */ #define pte_to_pa(p) ((p) & INTEL_PTE_PFN) /* XXX */ #define pte_increment_pa(p) ((p) += INTEL_OFFMASK+1) @@ -513,26 +526,26 @@ struct md_page { */ struct pmap { + decl_simple_lock_data(,lock) /* lock on map */ + pmap_paddr_t pm_cr3; /* physical addr */ + boolean_t pm_shared; pd_entry_t *dirbase; /* page directory pointer */ #ifdef __i386__ pmap_paddr_t pdirbase; /* phys. address of dirbase */ + vm_offset_t pm_hold; /* true pdpt zalloc addr */ #endif vm_object_t pm_obj; /* object to hold pde's */ - int ref_count; /* reference count */ - int nx_enabled; task_map_t pm_task_map; - decl_simple_lock_data(,lock) /* lock on map */ - struct pmap_statistics stats; /* map statistics */ -#ifdef __i386__ - vm_offset_t pm_hold; /* true pdpt zalloc addr */ -#endif - pmap_paddr_t pm_cr3; /* physical addr */ pdpt_entry_t *pm_pdpt; /* KVA of 3rd level page */ pml4_entry_t *pm_pml4; /* VKA of top level */ vm_object_t pm_obj_pdpt; /* holds pdpt pages */ vm_object_t pm_obj_pml4; /* holds pml4 pages */ - vm_object_t pm_obj_top; /* holds single top level page */ - boolean_t pm_shared; +#define PMAP_PCID_MAX_CPUS (48) /* Must be a multiple of 8 */ + pcid_t pmap_pcid_cpus[PMAP_PCID_MAX_CPUS]; + volatile uint8_t pmap_pcid_coherency_vector[PMAP_PCID_MAX_CPUS]; + struct pmap_statistics stats; /* map statistics */ + int ref_count; /* reference count */ + int nx_enabled; }; @@ -578,23 +591,30 @@ extern unsigned pmap_memory_region_current; #define PMAP_MEMORY_REGIONS_SIZE 128 extern pmap_memory_region_t pmap_memory_regions[]; +#include static inline void set_dirbase(pmap_t tpmap, __unused thread_t thread) { - current_cpu_datap()->cpu_task_cr3 = tpmap->pm_cr3; - current_cpu_datap()->cpu_task_map = tpmap->pm_task_map; + int ccpu = cpu_number(); + cpu_datap(ccpu)->cpu_task_cr3 = tpmap->pm_cr3; + cpu_datap(ccpu)->cpu_task_map = tpmap->pm_task_map; #ifndef __i386__ /* * Switch cr3 if necessary * - unless running with 
no_shared_cr3 debugging mode * and we're not on the kernel's cr3 (after pre-empted copyio) */ - if (!no_shared_cr3) { - if (get_cr3() != tpmap->pm_cr3) - set_cr3(tpmap->pm_cr3); + if (__probable(!no_shared_cr3)) { + if (get_cr3_base() != tpmap->pm_cr3) { + if (pmap_pcid_ncpus) { + pmap_pcid_activate(tpmap, ccpu); + } + else + set_cr3_raw(tpmap->pm_cr3); + } } else { - if (get_cr3() != current_cpu_datap()->cpu_kernel_cr3) - set_cr3(current_cpu_datap()->cpu_kernel_cr3); + if (get_cr3_base() != cpu_datap(ccpu)->cpu_kernel_cr3) + set_cr3_raw(cpu_datap(ccpu)->cpu_kernel_cr3); } #endif } @@ -616,7 +636,7 @@ extern addr64_t (kvtophys)( extern void pmap_expand( pmap_t pmap, vm_map_offset_t addr); - +#if !defined(__x86_64__) extern pt_entry_t *pmap_pte( struct pmap *pmap, vm_map_offset_t addr); @@ -632,7 +652,7 @@ extern pd_entry_t *pmap64_pde( extern pdpt_entry_t *pmap64_pdpt( struct pmap *pmap, vm_map_offset_t addr); - +#endif extern vm_offset_t pmap_map( vm_offset_t virt, vm_map_offset_t start, @@ -670,7 +690,10 @@ extern void pmap_commpage64_init( int count); #endif - +/* + * Get cache attributes (as pagetable bits) for the specified phys page + */ +extern unsigned pmap_get_cache_attributes(ppnum_t); #if NCOPY_WINDOWS > 0 extern struct cpu_pmap *pmap_cpu_alloc( boolean_t is_boot_cpu); @@ -704,10 +727,11 @@ extern vm_offset_t pmap_cpu_high_shared_remap(int, enum high_cpu_types, vm_offse extern vm_offset_t pmap_high_shared_remap(enum high_fixed_addresses, vm_offset_t, int); #endif -extern void pt_fake_zone_info(int *, vm_size_t *, vm_size_t *, vm_size_t *, vm_size_t *, int *, int *); +extern void pt_fake_zone_init(int); +extern void pt_fake_zone_info(int *, vm_size_t *, vm_size_t *, vm_size_t *, vm_size_t *, + uint64_t *, int *, int *, int *); extern void pmap_pagetable_corruption_msg_log(int (*)(const char * fmt, ...)__printflike(1,2)); - /* * Macros for speed. 
*/ @@ -727,8 +751,11 @@ extern void pmap_pagetable_corruption_msg_log(int (*)(const char * fmt, ...)__pr #define PMAP_DEACTIVATE_MAP(map, thread) \ if (vm_map_pmap(map)->pm_task_map == TASK_MAP_64BIT_SHARED) \ pmap_load_kernel_cr3(); +#elif defined(__x86_64__) +#define PMAP_DEACTIVATE_MAP(map, thread) \ + pmap_assert(pmap_pcid_ncpus ? (pcid_for_pmap_cpu_tuple(map->pmap, cpu_number()) == (get_cr3_raw() & 0xFFF)) : TRUE); #else -#define PMAP_DEACTIVATE_MAP(map, my_cpu) +#define PMAP_DEACTIVATE_MAP(map, thread) #endif #if defined(__i386__) @@ -772,18 +799,16 @@ extern void pmap_pagetable_corruption_msg_log(int (*)(const char * fmt, ...)__pr #else /* __x86_64__ */ #define PMAP_SWITCH_CONTEXT(old_th, new_th, my_cpu) { \ - spl_t spl; \ \ - spl = splhigh(); \ + pmap_assert(ml_get_interrupts_enabled() == FALSE); \ if (old_th->map != new_th->map) { \ PMAP_DEACTIVATE_MAP(old_th->map, old_th); \ PMAP_ACTIVATE_MAP(new_th->map, new_th); \ } \ - splx(spl); \ } #endif /* __i386__ */ -#ifdef __i386__ +#if NCOPY_WINDOWS > 0 #define PMAP_SWITCH_USER(th, new_map, my_cpu) { \ spl_t spl; \ \ @@ -792,7 +817,7 @@ extern void pmap_pagetable_corruption_msg_log(int (*)(const char * fmt, ...)__pr th->map = new_map; \ PMAP_ACTIVATE_MAP(th->map, th); \ splx(spl); \ - inval_copy_windows(th); \ + inval_copy_windows(th); \ } #else #define PMAP_SWITCH_USER(th, new_map, my_cpu) { \ @@ -810,7 +835,7 @@ extern void pmap_pagetable_corruption_msg_log(int (*)(const char * fmt, ...)__pr * Marking the current cpu's cr3 inactive is achieved by setting its lsb. * Marking the current cpu's cr3 active once more involves clearng this bit. * Note that valid page tables are page-aligned and so the bottom 12 bits - * are noramlly zero. + * are normally zero, modulo PCID. * We can only mark the current cpu active/inactive but we can test any cpu. 
*/ #define CPU_CR3_MARK_INACTIVE() \ @@ -837,13 +862,13 @@ extern void pmap_pagetable_corruption_msg_log(int (*)(const char * fmt, ...)__pr */ #if defined(__x86_64__) #define MARK_CPU_IDLE(my_cpu) { \ - int s = splhigh(); \ + assert(ml_get_interrupts_enabled() == FALSE); \ CPU_CR3_MARK_INACTIVE(); \ __asm__ volatile("mfence"); \ - splx(s); \ } #else /* __i386__ native */ #define MARK_CPU_IDLE(my_cpu) { \ + assert(ml_get_interrupts_enabled() == FALSE); \ /* \ * Mark this cpu idle, and remove it from the active set, \ * since it is not actively using any pmap. Signal_cpus \ @@ -851,20 +876,17 @@ extern void pmap_pagetable_corruption_msg_log(int (*)(const char * fmt, ...)__pr * but will queue the update request for when the cpu \ * becomes active. \ */ \ - int s = splhigh(); \ if (!cpu_mode_is64bit() || no_shared_cr3) \ process_pmap_updates(); \ else \ pmap_load_kernel_cr3(); \ CPU_CR3_MARK_INACTIVE(); \ __asm__ volatile("mfence"); \ - splx(s); \ } #endif /* __i386__ */ #define MARK_CPU_ACTIVE(my_cpu) { \ - \ - int s = splhigh(); \ + assert(ml_get_interrupts_enabled() == FALSE); \ /* \ * If a kernel_pmap update was requested while this cpu \ * was idle, process it as if we got the interrupt. 
\ @@ -880,7 +902,6 @@ extern void pmap_pagetable_corruption_msg_log(int (*)(const char * fmt, ...)__pr \ if (current_cpu_datap()->cpu_tlb_invalid) \ process_pmap_updates(); \ - splx(s); \ } #define PMAP_CONTEXT(pmap, thread) @@ -898,11 +919,13 @@ extern void pmap_pagetable_corruption_msg_log(int (*)(const char * fmt, ...)__pr #define pmap_attribute_cache_sync(addr,size,attr,value) \ (KERN_INVALID_ADDRESS) -#define MACHINE_PMAP_IS_EMPTY 1 +#define MACHINE_PMAP_IS_EMPTY 1 extern boolean_t pmap_is_empty(pmap_t pmap, vm_map_offset_t start, vm_map_offset_t end); +#define MACHINE_BOOTSTRAPPTD 1 /* Static bootstrap page-tables */ + #endif /* ASSEMBLER */ diff --git a/osfmk/i386/pmap_common.c b/osfmk/i386/pmap_common.c new file mode 100644 index 000000000..d81248dae --- /dev/null +++ b/osfmk/i386/pmap_common.c @@ -0,0 +1,505 @@ +/* + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include + +/* + * Each entry in the pv_head_table is locked by a bit in the + * pv_lock_table. The lock bits are accessed by the physical + * address of the page they lock. + */ + +char *pv_lock_table; /* pointer to array of bits */ +char *pv_hash_lock_table; + +pv_rooted_entry_t pv_head_table; /* array of entries, one per + * page */ +uint32_t pv_hashed_free_count = 0; +uint32_t pv_hashed_kern_free_count = 0; + +pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[PMAP_PAGETABLE_CORRUPTION_MAX_LOG]; +uint32_t pmap_pagetable_corruption_incidents; +uint64_t pmap_pagetable_corruption_last_abstime = (~(0ULL) >> 1); +uint64_t pmap_pagetable_corruption_interval_abstime; +thread_call_t pmap_pagetable_corruption_log_call; +static thread_call_data_t pmap_pagetable_corruption_log_call_data; +boolean_t pmap_pagetable_corruption_timeout = FALSE; + +volatile uint32_t mappingrecurse = 0; + +uint32_t pv_hashed_low_water_mark, pv_hashed_kern_low_water_mark, pv_hashed_alloc_chunk, pv_hashed_kern_alloc_chunk; + +thread_t mapping_replenish_thread; +event_t mapping_replenish_event, pmap_user_pv_throttle_event; + +uint64_t pmap_pv_throttle_stat, pmap_pv_throttled_waiters; + +unsigned int pmap_cache_attributes(ppnum_t pn) { + if (pmap_get_cache_attributes(pn) & INTEL_PTE_NCACHE) + return (VM_WIMG_IO); + else + return (VM_WIMG_COPYBACK); +} + +void pmap_set_cache_attributes(ppnum_t pn, unsigned int cacheattr) { + unsigned int current, 
template = 0; + int pai; + + if (cacheattr & VM_MEM_NOT_CACHEABLE) { + if(!(cacheattr & VM_MEM_GUARDED)) + template |= PHYS_PTA; + template |= PHYS_NCACHE; + } + + pmap_intr_assert(); + + assert((pn != vm_page_fictitious_addr) && (pn != vm_page_guard_addr)); + + pai = ppn_to_pai(pn); + + if (!IS_MANAGED_PAGE(pai)) { + return; + } + + /* override cache attributes for this phys page + * Does not walk through existing mappings to adjust, + * assumes page is disconnected + */ + + LOCK_PVH(pai); + + pmap_update_cache_attributes_locked(pn, template); + + current = pmap_phys_attributes[pai] & PHYS_CACHEABILITY_MASK; + pmap_phys_attributes[pai] &= ~PHYS_CACHEABILITY_MASK; + pmap_phys_attributes[pai] |= template; + + UNLOCK_PVH(pai); + + if ((template & PHYS_NCACHE) && !(current & PHYS_NCACHE)) { + pmap_sync_page_attributes_phys(pn); + } +} + +unsigned pmap_get_cache_attributes(ppnum_t pn) { + if (last_managed_page == 0) + return 0; + + if (!IS_MANAGED_PAGE(ppn_to_pai(pn))) { + return INTEL_PTE_NCACHE; + } + + /* + * The cache attributes are read locklessly for efficiency. 
+ */ + unsigned int attr = pmap_phys_attributes[ppn_to_pai(pn)]; + unsigned int template = 0; + + if (attr & PHYS_PTA) + template |= INTEL_PTE_PTA; + if (attr & PHYS_NCACHE) + template |= INTEL_PTE_NCACHE; + return template; +} + + + +boolean_t +pmap_is_noencrypt(ppnum_t pn) +{ + int pai; + + pai = ppn_to_pai(pn); + + if (!IS_MANAGED_PAGE(pai)) + return (TRUE); + + if (pmap_phys_attributes[pai] & PHYS_NOENCRYPT) + return (TRUE); + + return (FALSE); +} + + +void +pmap_set_noencrypt(ppnum_t pn) +{ + int pai; + + pai = ppn_to_pai(pn); + + if (IS_MANAGED_PAGE(pai)) { + LOCK_PVH(pai); + + pmap_phys_attributes[pai] |= PHYS_NOENCRYPT; + + UNLOCK_PVH(pai); + } +} + + +void +pmap_clear_noencrypt(ppnum_t pn) +{ + int pai; + + pai = ppn_to_pai(pn); + + if (IS_MANAGED_PAGE(pai)) { + LOCK_PVH(pai); + + pmap_phys_attributes[pai] &= ~PHYS_NOENCRYPT; + + UNLOCK_PVH(pai); + } +} + +void +compute_pmap_gc_throttle(void *arg __unused) +{ + +} + + +__private_extern__ void +pmap_pagetable_corruption_msg_log(int (*log_func)(const char * fmt, ...)__printflike(1,2)) { + if (pmap_pagetable_corruption_incidents > 0) { + int i, e = MIN(pmap_pagetable_corruption_incidents, PMAP_PAGETABLE_CORRUPTION_MAX_LOG); + (*log_func)("%u pagetable corruption incident(s) detected, timeout: %u\n", pmap_pagetable_corruption_incidents, pmap_pagetable_corruption_timeout); + for (i = 0; i < e; i++) { + (*log_func)("Incident 0x%x, reason: 0x%x, action: 0x%x, time: 0x%llx\n", pmap_pagetable_corruption_records[i].incident, pmap_pagetable_corruption_records[i].reason, pmap_pagetable_corruption_records[i].action, pmap_pagetable_corruption_records[i].abstime); + } + } +} + +static inline void +pmap_pagetable_corruption_log_setup(void) { + if (pmap_pagetable_corruption_log_call == NULL) { + nanotime_to_absolutetime(PMAP_PAGETABLE_CORRUPTION_INTERVAL, 0, &pmap_pagetable_corruption_interval_abstime); + thread_call_setup(&pmap_pagetable_corruption_log_call_data, + (thread_call_func_t) pmap_pagetable_corruption_msg_log, + 
(thread_call_param_t) &printf); + pmap_pagetable_corruption_log_call = &pmap_pagetable_corruption_log_call_data; + } +} + +void +mapping_free_prime(void) +{ + unsigned i; + pv_hashed_entry_t pvh_e; + pv_hashed_entry_t pvh_eh; + pv_hashed_entry_t pvh_et; + int pv_cnt; + + /* Scale based on DRAM size */ + pv_hashed_low_water_mark = MAX(PV_HASHED_LOW_WATER_MARK_DEFAULT, ((uint32_t)(sane_size >> 30)) * 2000); + pv_hashed_low_water_mark = MIN(pv_hashed_low_water_mark, 16000); + /* Alterable via sysctl */ + pv_hashed_kern_low_water_mark = MAX(PV_HASHED_KERN_LOW_WATER_MARK_DEFAULT, ((uint32_t)(sane_size >> 30)) * 1000); + pv_hashed_kern_low_water_mark = MIN(pv_hashed_kern_low_water_mark, 16000); + pv_hashed_kern_alloc_chunk = PV_HASHED_KERN_ALLOC_CHUNK_INITIAL; + pv_hashed_alloc_chunk = PV_HASHED_ALLOC_CHUNK_INITIAL; + + pv_cnt = 0; + pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL; + + for (i = 0; i < (5 * PV_HASHED_ALLOC_CHUNK_INITIAL); i++) { + pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone); + + pvh_e->qlink.next = (queue_entry_t)pvh_eh; + pvh_eh = pvh_e; + + if (pvh_et == PV_HASHED_ENTRY_NULL) + pvh_et = pvh_e; + pv_cnt++; + } + PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt); + + pv_cnt = 0; + pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL; + for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK_INITIAL; i++) { + pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone); + + pvh_e->qlink.next = (queue_entry_t)pvh_eh; + pvh_eh = pvh_e; + + if (pvh_et == PV_HASHED_ENTRY_NULL) + pvh_et = pvh_e; + pv_cnt++; + } + PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt); +} + +void mapping_replenish(void); + +void mapping_adjust(void) { + kern_return_t mres; + + pmap_pagetable_corruption_log_setup(); + + mres = kernel_thread_start_priority((thread_continue_t)mapping_replenish, NULL, MAXPRI_KERNEL, &mapping_replenish_thread); + if (mres != KERN_SUCCESS) { + panic("pmap: mapping_replenish_thread creation failed"); + } + thread_deallocate(mapping_replenish_thread); +} + +unsigned 
pmap_mapping_thread_wakeups;			/* times mapping_replenish() resumed after thread_block() */
+unsigned pmap_kernel_reserve_replenish_stat;	/* entries mapping_replenish() added to the kernel reserve */
+unsigned pmap_user_reserve_replenish_stat;	/* entries mapping_replenish() added to pv_hashed_free_list */
+unsigned pmap_kern_reserve_alloc_stat;		/* bumped by PV_HASHED_KERN_ALLOC() for each entry handed out */
+/*
+ * mapping_replenish:
+ *
+ *	Body of the thread that mapping_adjust() starts at MAXPRI_KERNEL to
+ *	keep the two pv_hashed_entry free lists stocked.  It never returns.
+ *
+ *	Each pass of the outer loop:
+ *	  1. refills the kernel reserve, pv_hashed_kern_alloc_chunk entries
+ *	     at a time, until pv_hashed_kern_free_count reaches
+ *	     pv_hashed_kern_low_water_mark;
+ *	  2. adds at most one batch of pv_hashed_alloc_chunk entries to
+ *	     pv_hashed_free_list if pv_hashed_free_count is below
+ *	     pv_hashed_low_water_mark;
+ *	  3. wakes waiters on pmap_user_pv_throttle_event if
+ *	     pmap_pv_throttled_waiters is non-zero;
+ *	  4. blocks on mapping_replenish_event, unless the kernel reserve
+ *	     has already fallen below its mark again.
+ *
+ *	A batch is built as a chain linked through qlink.next (pvh_eh is
+ *	the head, pvh_et the tail) and handed to PV_HASHED_FREE_LIST() or
+ *	PV_HASHED_KERN_FREE_LIST() in a single call.
+ */
+void mapping_replenish(void)
+{
+	pv_hashed_entry_t	pvh_e;
+	pv_hashed_entry_t	pvh_eh;
+	pv_hashed_entry_t	pvh_et;
+	int			pv_cnt;
+	unsigned		i;
+
+	/* We qualify for VM privileges...*/
+	current_thread()->options |= TH_OPT_VMPRIV;
+
+	for (;;) {
+		/* Kernel reserve first: keep adding batches until it is back
+		 * at its low-water mark.
+		 */
+		while (pv_hashed_kern_free_count < pv_hashed_kern_low_water_mark) {
+			pv_cnt = 0;
+			pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
+
+			for (i = 0; i < pv_hashed_kern_alloc_chunk; i++) {
+				pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
+				pvh_e->qlink.next = (queue_entry_t)pvh_eh;
+				pvh_eh = pvh_e;
+				/* Entries are pushed at the head, so the first
+				 * one allocated is the tail of the chain.
+				 */
+				if (pvh_et == PV_HASHED_ENTRY_NULL)
+					pvh_et = pvh_e;
+				pv_cnt++;
+			}
+			pmap_kernel_reserve_replenish_stat += pv_cnt;
+			PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
+		}
+		/* pv_hashed_free_list: an `if', not a `while' -- at most one
+		 * batch per pass before the kernel reserve is rechecked below.
+		 */
+		pv_cnt = 0;
+		pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
+
+		if (pv_hashed_free_count < pv_hashed_low_water_mark) {
+			for (i = 0; i < pv_hashed_alloc_chunk; i++) {
+				pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
+
+				pvh_e->qlink.next = (queue_entry_t)pvh_eh;
+				pvh_eh = pvh_e;
+
+				if (pvh_et == PV_HASHED_ENTRY_NULL)
+					pvh_et = pvh_e;
+				pv_cnt++;
+			}
+			pmap_user_reserve_replenish_stat += pv_cnt;
+			PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
+		}
+		/* Wake threads throttled while the kernel reserve was being replenished.
+		 */
+		if (pmap_pv_throttled_waiters) {
+			pmap_pv_throttled_waiters = 0;
+			thread_wakeup(&pmap_user_pv_throttle_event);
+		}
+		/* Check if the kernel pool has been depleted since the
+		 * first pass, to reduce refill latency.
+		 */
+		if (pv_hashed_kern_free_count < pv_hashed_kern_low_water_mark)
+			continue;
+		/* Block sans continuation to avoid yielding kernel stack */
+		assert_wait(&mapping_replenish_event, THREAD_UNINT);
+		mappingrecurse = 0;	/* re-arm: the PV_HASHED_*ALLOC() helpers only issue a wakeup on a 0 -> 1 transition */
+		thread_block(THREAD_CONTINUE_NULL);
+		pmap_mapping_thread_wakeups++;
+	}
+}
+
+/*
+ *	Set specified attribute bits.
+ */
+/*
+ * phys_attribute_set:
+ *
+ *	ORs `bits' into the pmap_phys_attributes[] entry for physical page
+ *	`pn', between LOCK_PVH() and UNLOCK_PVH() on that page's index.
+ *
+ *	`pn' must not be vm_page_fictitious_addr (asserted).  The call
+ *	returns without doing anything for vm_page_guard_addr and for pages
+ *	that fail IS_MANAGED_PAGE().
+ */
+void
+phys_attribute_set(
+	ppnum_t		pn,
+	int		bits)
+{
+	int		pai;	/* index of pn in pmap_phys_attributes[] */
+
+	pmap_intr_assert();	/* NOTE(review): defined elsewhere; presumably an interrupt-state check -- confirm */
+	assert(pn != vm_page_fictitious_addr);
+	if (pn == vm_page_guard_addr)
+		return;
+
+	pai = ppn_to_pai(pn);
+
+	if (!IS_MANAGED_PAGE(pai)) {
+		/* Not a managed page. */
+		return;
+	}
+
+	LOCK_PVH(pai);
+	pmap_phys_attributes[pai] |= bits;
+	UNLOCK_PVH(pai);
+}
+
+/*
+ *	Set the modify bit on the specified physical page.
+ *	Equivalent to phys_attribute_set(pn, PHYS_MODIFIED).
+ */
+
+void
+pmap_set_modify(ppnum_t pn)
+{
+	phys_attribute_set(pn, PHYS_MODIFIED);
+}
+
+/*
+ *	Clear the modify bits on the specified physical page.
+ *	Equivalent to phys_attribute_clear(pn, PHYS_MODIFIED).
+ */
+
+void
+pmap_clear_modify(ppnum_t pn)
+{
+	phys_attribute_clear(pn, PHYS_MODIFIED);
+}
+
+/*
+ *	pmap_is_modified:
+ *
+ *	Return whether or not the specified physical page is modified
+ *	by any physical maps.
+ *
+ *	Collapses the int result of phys_attribute_test(pn, PHYS_MODIFIED)
+ *	to TRUE or FALSE.
+ */
+
+boolean_t
+pmap_is_modified(ppnum_t pn)
+{
+	if (phys_attribute_test(pn, PHYS_MODIFIED))
+		return TRUE;
+	return FALSE;
+}
+
+
+/*
+ *	pmap_clear_reference:
+ *
+ *	Clear the reference bit on the specified physical page.
+ *	Equivalent to phys_attribute_clear(pn, PHYS_REFERENCED).
+ */
+
+void
+pmap_clear_reference(ppnum_t pn)
+{
+	phys_attribute_clear(pn, PHYS_REFERENCED);
+}
+/*
+ *	pmap_set_reference:
+ *
+ *	Set the reference bit on the specified physical page.
+ *	Equivalent to phys_attribute_set(pn, PHYS_REFERENCED).
+ */
+void
+pmap_set_reference(ppnum_t pn)
+{
+	phys_attribute_set(pn, PHYS_REFERENCED);
+}
+
+/*
+ *	pmap_is_referenced:
+ *
+ *	Return whether or not the specified physical page is referenced
+ *	by any physical maps.
+ *
+ *	Collapses the int result of phys_attribute_test(pn, PHYS_REFERENCED)
+ *	to TRUE or FALSE.
+ */
+
+boolean_t
+pmap_is_referenced(ppnum_t pn)
+{
+	if (phys_attribute_test(pn, PHYS_REFERENCED))
+		return TRUE;
+	return FALSE;
+}
+
+
+/*
+ * pmap_get_refmod(phys)
+ *  returns the referenced and modified bits of the specified
+ *  physical page.
+ */ +unsigned int +pmap_get_refmod(ppnum_t pn) +{ + int refmod; + unsigned int retval = 0; + + refmod = phys_attribute_test(pn, PHYS_MODIFIED | PHYS_REFERENCED); + + if (refmod & PHYS_MODIFIED) + retval |= VM_MEM_MODIFIED; + if (refmod & PHYS_REFERENCED) + retval |= VM_MEM_REFERENCED; + + return (retval); +} + +/* + * pmap_clear_refmod(phys, mask) + * clears the referenced and modified bits as specified by the mask + * of the specified physical page. + */ +void +pmap_clear_refmod(ppnum_t pn, unsigned int mask) +{ + unsigned int x86Mask; + + x86Mask = ( ((mask & VM_MEM_MODIFIED)? PHYS_MODIFIED : 0) + | ((mask & VM_MEM_REFERENCED)? PHYS_REFERENCED : 0)); + phys_attribute_clear(pn, x86Mask); +} + +/* + * Routine: + * pmap_disconnect + * + * Function: + * Disconnect all mappings for this page and return reference and change status + * in generic format. + * + */ +unsigned int +pmap_disconnect(ppnum_t pa) +{ + unsigned refmod, vmrefmod = 0; + + pmap_page_protect(pa, 0); /* disconnect the page */ + + pmap_assert(pa != vm_page_fictitious_addr); + if ((pa == vm_page_guard_addr) || !IS_MANAGED_PAGE(pa)) + return 0; + refmod = pmap_phys_attributes[pa] & (PHYS_MODIFIED | PHYS_REFERENCED); + + if (refmod & PHYS_MODIFIED) + vmrefmod |= VM_MEM_MODIFIED; + if (refmod & PHYS_REFERENCED) + vmrefmod |= VM_MEM_REFERENCED; + + return vmrefmod; +} diff --git a/osfmk/i386/pmap_internal.h b/osfmk/i386/pmap_internal.h index 63bebc3ab..37757f191 100644 --- a/osfmk/i386/pmap_internal.h +++ b/osfmk/i386/pmap_internal.h @@ -28,7 +28,6 @@ #include #include -#include #ifdef MACH_KERNEL_PRIVATE @@ -44,9 +43,8 @@ simple_unlock(&(pmap)->lock); \ } - #define PMAP_UPDATE_TLBS(pmap, s, e) \ - pmap_flush_tlbs(pmap) + pmap_flush_tlbs(pmap, s, e) #define iswired(pte) ((pte) & INTEL_PTE_WIRED) @@ -60,6 +58,9 @@ extern boolean_t pmap_trace; #define PMAP_TRACE(x,a,b,c,d,e) KERNEL_DEBUG(x,a,b,c,d,e) #endif /* PMAP_TRACES */ +#define PMAP_TRACE_CONSTANT(x,a,b,c,d,e) \ + KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e); \ 
+ void pmap_expand_pml4( pmap_t map, vm_map_offset_t v); @@ -67,12 +68,26 @@ void pmap_expand_pml4( void pmap_expand_pdpt( pmap_t map, vm_map_offset_t v); -extern void pmap_flush_tlbs(pmap_t pmap); -#if defined(__x86_64__) -extern const boolean_t cpu_64bit; -#else +void phys_attribute_set( + ppnum_t phys, + int bits); + +void pmap_set_reference( + ppnum_t pn); + +boolean_t phys_page_exists( + ppnum_t pn); + +void pmap_flush_tlbs(pmap_t, vm_map_offset_t, vm_map_offset_t); + +void +pmap_update_cache_attributes_locked(ppnum_t, unsigned); + +#if CONFIG_YONAH extern boolean_t cpu_64bit; +#else +extern const boolean_t cpu_64bit; #endif /* @@ -99,8 +114,8 @@ extern boolean_t cpu_64bit; PV HASHING Changes - JK 1/2007 Pve's establish physical to virtual mappings. These are used for aliasing of a -physical page to (potentially many) virtual addresses within pmaps. In the previous -implementation the structure of the pv_entries (each 16 bytes in size) was +physical page to (potentially many) virtual addresses within pmaps. In the +previous implementation the structure of the pv_entries (each 16 bytes in size) was typedef struct pv_entry { struct pv_entry_t next; @@ -108,20 +123,23 @@ typedef struct pv_entry { vm_map_offset_t va; } *pv_entry_t; -An initial array of these is created at boot time, one per physical page of memory, -indexed by the physical page number. Additionally, a pool of entries is created from a -pv_zone to be used as needed by pmap_enter() when it is creating new mappings. -Originally, we kept this pool around because the code in pmap_enter() was unable to -block if it needed an entry and none were available - we'd panic. Some time ago I -restructured the pmap_enter() code so that for user pmaps it can block while zalloc'ing -a pv structure and restart, removing a panic from the code (in the case of the kernel -pmap we cannot block and still panic, so, we keep a separate hot pool for use only on -kernel pmaps). 
The pool has not been removed since there is a large performance gain -keeping freed pv's around for reuse and not suffering the overhead of zalloc for every new pv we need. - -As pmap_enter() created new mappings it linked the new pve's for them off the fixed -pv array for that ppn (off the next pointer). These pve's are accessed for several -operations, one of them being address space teardown. In that case, we basically do this +An initial array of these is created at boot time, one per physical page of +memory, indexed by the physical page number. Additionally, a pool of entries +is created from a pv_zone to be used as needed by pmap_enter() when it is +creating new mappings. Originally, we kept this pool around because the code +in pmap_enter() was unable to block if it needed an entry and none were +available - we'd panic. Some time ago I restructured the pmap_enter() code +so that for user pmaps it can block while zalloc'ing a pv structure and restart, +removing a panic from the code (in the case of the kernel pmap we cannot block +and still panic, so, we keep a separate hot pool for use only on kernel pmaps). +The pool has not been removed since there is a large performance gain keeping +freed pv's around for reuse and not suffering the overhead of zalloc for every +new pv we need. + +As pmap_enter() created new mappings it linked the new pve's for them off the +fixed pv array for that ppn (off the next pointer). These pve's are accessed +for several operations, one of them being address space teardown. In that case, +we basically do this for (every page/pte in the space) { calc pve_ptr from the ppn in the pte @@ -133,124 +151,197 @@ operations, one of them being address space teardown. In that case, we basicall } } -The problem arose when we were running, say 8000 (or even 2000) apache or other processes -and one or all terminate. The list hanging off each pv array entry could have thousands of -entries. 
We were continuously linearly searching each of these lists as we stepped through -the address space we were tearing down. Because of the locks we hold, likely taking a cache -miss for each node, and interrupt disabling for MP issues the system became completely -unresponsive for many seconds while we did this. - -Realizing that pve's are accessed in two distinct ways (linearly running the list by ppn -for operations like pmap_page_protect and finding and modifying/removing a single pve as -part of pmap_enter processing) has led to modifying the pve structures and databases. - -There are now two types of pve structures. A "rooted" structure which is basically the -original structure accessed in an array by ppn, and a ''hashed'' structure accessed on a -hash list via a hash of [pmap, vaddr]. These have been designed with the two goals of -minimizing wired memory and making the lookup of a ppn faster. Since a vast majority of -pages in the system are not aliased and hence represented by a single pv entry I've kept -the rooted entry size as small as possible because there is one of these dedicated for -every physical page of memory. The hashed pve's are larger due to the addition of the hash -link and the ppn entry needed for matching while running the hash list to find the entry we -are looking for. This way, only systems that have lots of aliasing (like 2000+ httpd procs) -will pay the extra memory price. Both structures have the same first three fields allowing -some simplification in the code. +The problem arose when we were running, say 8000 (or even 2000) apache or +other processes and one or all terminate. The list hanging off each pv array +entry could have thousands of entries. We were continuously linearly searching +each of these lists as we stepped through the address space we were tearing +down. 
Because of the locks we hold, likely taking a cache miss for each node, +and interrupt disabling for MP issues the system became completely unresponsive +for many seconds while we did this. + +Realizing that pve's are accessed in two distinct ways (linearly running the +list by ppn for operations like pmap_page_protect and finding and +modifying/removing a single pve as part of pmap_enter processing) has led to +modifying the pve structures and databases. + +There are now two types of pve structures. A "rooted" structure which is +basically the original structure accessed in an array by ppn, and a ''hashed'' +structure accessed on a hash list via a hash of [pmap, vaddr]. These have been +designed with the two goals of minimizing wired memory and making the lookup of +a ppn faster. Since a vast majority of pages in the system are not aliased +and hence represented by a single pv entry I've kept the rooted entry size as +small as possible because there is one of these dedicated for every physical +page of memory. The hashed pve's are larger due to the addition of the hash +link and the ppn entry needed for matching while running the hash list to find +the entry we are looking for. This way, only systems that have lots of +aliasing (like 2000+ httpd procs) will pay the extra memory price. Both +structures have the same first three fields allowing some simplification in +the code. They have these shapes typedef struct pv_rooted_entry { - queue_head_t qlink; - vm_map_offset_t va; - pmap_t pmap; + queue_head_t qlink; + vm_map_offset_t va; + pmap_t pmap; } *pv_rooted_entry_t; typedef struct pv_hashed_entry { - queue_head_t qlink; - vm_map_offset_t va; - pmap_t pmap; - ppnum_t ppn; - struct pv_hashed_entry *nexth; + queue_head_t qlink; + vm_map_offset_t va; + pmap_t pmap; + ppnum_t ppn; + struct pv_hashed_entry *nexth; } *pv_hashed_entry_t; -The main flow difference is that the code is now aware of the rooted entry and the hashed -entries. 
Code that runs the pv list still starts with the rooted entry and then continues -down the qlink onto the hashed entries. Code that is looking up a specific pv entry first -checks the rooted entry and then hashes and runs the hash list for the match. The hash list -lengths are much smaller than the original pv lists that contained all aliases for the specific ppn. +The main flow difference is that the code is now aware of the rooted entry and +the hashed entries. Code that runs the pv list still starts with the rooted +entry and then continues down the qlink onto the hashed entries. Code that is +looking up a specific pv entry first checks the rooted entry and then hashes +and runs the hash list for the match. The hash list lengths are much smaller +than the original pv lists that contained all aliases for the specific ppn. */ -typedef struct pv_rooted_entry { /* first three entries must match pv_hashed_entry_t */ - queue_head_t qlink; - vm_map_offset_t va; /* virtual address for mapping */ - pmap_t pmap; /* pmap where mapping lies */ +typedef struct pv_rooted_entry { + /* first three entries must match pv_hashed_entry_t */ + queue_head_t qlink; + vm_map_offset_t va; /* virtual address for mapping */ + pmap_t pmap; /* pmap where mapping lies */ } *pv_rooted_entry_t; #define PV_ROOTED_ENTRY_NULL ((pv_rooted_entry_t) 0) - -typedef struct pv_hashed_entry { /* first three entries must match pv_rooted_entry_t */ - queue_head_t qlink; - vm_map_offset_t va; - pmap_t pmap; - ppnum_t ppn; - struct pv_hashed_entry *nexth; +typedef struct pv_hashed_entry { + /* first three entries must match pv_rooted_entry_t */ + queue_head_t qlink; + vm_map_offset_t va; + pmap_t pmap; + ppnum_t ppn; + struct pv_hashed_entry *nexth; } *pv_hashed_entry_t; #define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0) -/* #define PV_DEBUG 1 uncomment to enable some PV debugging code */ +//#define PV_DEBUG 1 /* uncomment to enable some PV debugging code */ #ifdef PV_DEBUG #define CHK_NPVHASH() if(0 == 
npvhash) panic("npvhash uninitialized"); #else -#define CHK_NPVHASH() +#define CHK_NPVHASH(x) #endif #define NPVHASH 4095 /* MUST BE 2^N - 1 */ -#define PV_HASHED_LOW_WATER_MARK 5000 -#define PV_HASHED_KERN_LOW_WATER_MARK 400 -#define PV_HASHED_ALLOC_CHUNK 2000 -#define PV_HASHED_KERN_ALLOC_CHUNK 200 - -#define PV_HASHED_ALLOC(pvh_e) { \ - simple_lock(&pv_hashed_free_list_lock); \ - if ((pvh_e = pv_hashed_free_list) != 0) { \ - pv_hashed_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \ - pv_hashed_free_count--; \ - if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) \ - if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \ - thread_call_enter(mapping_adjust_call); \ - } \ - simple_unlock(&pv_hashed_free_list_lock); \ +#define PV_HASHED_LOW_WATER_MARK_DEFAULT 5000 +#define PV_HASHED_KERN_LOW_WATER_MARK_DEFAULT 2000 +#define PV_HASHED_ALLOC_CHUNK_INITIAL 2000 +#define PV_HASHED_KERN_ALLOC_CHUNK_INITIAL 200 + +extern volatile uint32_t mappingrecurse; +extern uint32_t pv_hashed_low_water_mark, pv_hashed_kern_low_water_mark; + +/* + * PV hash locking + */ + +#define LOCK_PV_HASH(hash) lock_hash_hash(hash) +#define UNLOCK_PV_HASH(hash) unlock_hash_hash(hash) +extern uint32_t npvhash; +extern pv_hashed_entry_t *pv_hash_table; /* hash lists */ +extern pv_hashed_entry_t pv_hashed_free_list; +extern pv_hashed_entry_t pv_hashed_kern_free_list; +decl_simple_lock_data(extern, pv_hashed_free_list_lock) +decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock) +decl_simple_lock_data(extern, pv_hash_table_lock) + +extern zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry + * structures */ + +extern uint32_t pv_hashed_free_count; +extern uint32_t pv_hashed_kern_free_count; +/* + * Each entry in the pv_head_table is locked by a bit in the + * pv_lock_table. The lock bits are accessed by the address of + * the frame they lock. 
+ */ +#define pv_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE) +#define pv_hash_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE) +extern char *pv_lock_table; /* pointer to array of bits */ +extern char *pv_hash_lock_table; +extern pv_rooted_entry_t pv_head_table; /* array of entries, one per page */ + +extern event_t mapping_replenish_event; + +static inline void PV_HASHED_ALLOC(pv_hashed_entry_t *pvh_ep) { + + simple_lock(&pv_hashed_free_list_lock); + /* If the kernel reserved pool is low, let non-kernel mappings allocate + * synchronously, possibly subject to a throttle. + */ + if ((pv_hashed_kern_free_count >= pv_hashed_kern_low_water_mark) && + (*pvh_ep = pv_hashed_free_list) != 0) { + pv_hashed_free_list = (pv_hashed_entry_t)(*pvh_ep)->qlink.next; + pv_hashed_free_count--; + } + + simple_unlock(&pv_hashed_free_list_lock); + + if (pv_hashed_free_count < pv_hashed_low_water_mark) { + if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse)) + thread_wakeup(&mapping_replenish_event); + } } -#define PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt) { \ - simple_lock(&pv_hashed_free_list_lock); \ - pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list; \ - pv_hashed_free_list = pvh_eh; \ - pv_hashed_free_count += pv_cnt; \ - simple_unlock(&pv_hashed_free_list_lock); \ +static inline void PV_HASHED_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) { + simple_lock(&pv_hashed_free_list_lock); + pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list; + pv_hashed_free_list = pvh_eh; + pv_hashed_free_count += pv_cnt; + simple_unlock(&pv_hashed_free_list_lock); } -#define PV_HASHED_KERN_ALLOC(pvh_e) { \ - simple_lock(&pv_hashed_kern_free_list_lock); \ - if ((pvh_e = pv_hashed_kern_free_list) != 0) { \ - pv_hashed_kern_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \ - pv_hashed_kern_free_count--; \ - if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) \ - if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \ - 
thread_call_enter(mapping_adjust_call); \ - } \ - simple_unlock(&pv_hashed_kern_free_list_lock); \ +extern unsigned pmap_kern_reserve_alloc_stat; + +static inline void PV_HASHED_KERN_ALLOC(pv_hashed_entry_t *pvh_e) { + simple_lock(&pv_hashed_kern_free_list_lock); + + if ((*pvh_e = pv_hashed_kern_free_list) != 0) { + pv_hashed_kern_free_list = (pv_hashed_entry_t)(*pvh_e)->qlink.next; + pv_hashed_kern_free_count--; + pmap_kern_reserve_alloc_stat++; + } + + simple_unlock(&pv_hashed_kern_free_list_lock); + + if (pv_hashed_kern_free_count < pv_hashed_kern_low_water_mark) { + if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse)) + thread_wakeup(&mapping_replenish_event); + } } -#define PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt) { \ - simple_lock(&pv_hashed_kern_free_list_lock); \ - pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list; \ - pv_hashed_kern_free_list = pvh_eh; \ - pv_hashed_kern_free_count += pv_cnt; \ - simple_unlock(&pv_hashed_kern_free_list_lock); \ +static inline void PV_HASHED_KERN_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) { + simple_lock(&pv_hashed_kern_free_list_lock); + pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list; + pv_hashed_kern_free_list = pvh_eh; + pv_hashed_kern_free_count += pv_cnt; + simple_unlock(&pv_hashed_kern_free_list_lock); +} + +extern uint64_t pmap_pv_throttle_stat, pmap_pv_throttled_waiters; +extern event_t pmap_user_pv_throttle_event; + +static inline void pmap_pv_throttle(__unused pmap_t p) { + pmap_assert(p != kernel_pmap); + /* Apply throttle on non-kernel mappings */ + if (pv_hashed_kern_free_count < (pv_hashed_kern_low_water_mark / 2)) { + pmap_pv_throttle_stat++; + /* This doesn't need to be strictly accurate, merely a hint + * to eliminate the timeout when the reserve is replenished. 
+ */ + pmap_pv_throttled_waiters++; + assert_wait_timeout(&pmap_user_pv_throttle_event, THREAD_UNINT, 1, 1000 * NSEC_PER_USEC); + thread_block(THREAD_CONTINUE_NULL); + } } /* @@ -264,7 +355,6 @@ typedef struct pv_hashed_entry { /* first three entries must match pv_rooted #define lock_pvh_pai(pai) bit_lock(pai, (void *)pv_lock_table) #define unlock_pvh_pai(pai) bit_unlock(pai, (void *)pv_lock_table) #define pvhash(idx) (&pv_hash_table[idx]) - #define lock_hash_hash(hash) bit_lock(hash, (void *)pv_hash_lock_table) #define unlock_hash_hash(hash) bit_unlock(hash, (void *)pv_hash_lock_table) @@ -279,6 +369,9 @@ typedef struct pv_hashed_entry { /* first three entries must match pv_rooted #define PHYS_REFERENCED INTEL_PTE_REF /* page referenced */ #define PHYS_MANAGED INTEL_PTE_VALID /* page is managed */ #define PHYS_NOENCRYPT INTEL_PTE_USER /* no need to encrypt this page in the hibernation image */ +#define PHYS_NCACHE INTEL_PTE_NCACHE +#define PHYS_PTA INTEL_PTE_PTA +#define PHYS_CACHEABILITY_MASK (INTEL_PTE_PTA | INTEL_PTE_NCACHE) /* * Amount of virtual memory mapped by one @@ -325,31 +418,7 @@ typedef struct pv_hashed_entry { /* first three entries must match pv_rooted unlock_pvh_pai(index); \ mp_enable_preemption(); \ } -/* - * PV hash locking - */ - -#define LOCK_PV_HASH(hash) lock_hash_hash(hash) -#define UNLOCK_PV_HASH(hash) unlock_hash_hash(hash) -extern uint32_t npvhash; -extern pv_hashed_entry_t *pv_hash_table; /* hash lists */ -extern pv_hashed_entry_t pv_hashed_free_list; -extern pv_hashed_entry_t pv_hashed_kern_free_list; -decl_simple_lock_data(extern, pv_hashed_free_list_lock) -decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock) -decl_simple_lock_data(extern, pv_hash_table_lock) -extern zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry structures */ - -extern int pv_hashed_free_count; -extern int pv_hashed_kern_free_count; -#define pv_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE) -#define pv_hash_lock_table_size(n) 
(((n)+BYTE_SIZE-1)/BYTE_SIZE) -extern char *pv_lock_table; /* pointer to array of bits */ - -extern char *pv_hash_lock_table; -extern pv_rooted_entry_t pv_head_table; /* array of entries, one - * per page */ extern uint64_t pde_mapped_size; extern char *pmap_phys_attributes; @@ -379,23 +448,23 @@ extern uint64_t max_preemption_latency_tsc; #define pmap_intr_assert() #endif -extern int nx_enabled; -extern unsigned int inuse_ptepages_count; +extern int nx_enabled; +extern unsigned int inuse_ptepages_count; static inline uint32_t pvhashidx(pmap_t pmap, vm_map_offset_t va) { return ((uint32_t)(uintptr_t)pmap ^ - ((uint32_t)((uint64_t)va >> PAGE_SHIFT) & 0xFFFFFFFF)) & + ((uint32_t)(va >> PAGE_SHIFT) & 0xFFFFFFFF)) & npvhash; } + /* * unlinks the pv_hashed_entry_t pvh from the singly linked hash chain. * properly deals with the anchor. * must be called with the hash locked, does not unlock it */ - static inline void pmap_pvh_unlink(pv_hashed_entry_t pvh) { @@ -457,7 +526,7 @@ pv_hash_remove(pv_hashed_entry_t pvh_e) remque(&pvh_e->qlink); pmap_pvh_unlink(pvh_e); UNLOCK_PV_HASH(pvhash_idx); -} +} static inline boolean_t popcnt1(uint64_t distance) { return ((distance & (distance - 1)) == 0); @@ -639,16 +708,16 @@ pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t * pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva); return action; } + /* * Remove pv list entry. * Called with pv_head_table entry locked. * Returns pv entry to be freed (or NULL). 
*/ - static inline __attribute__((always_inline)) pv_hashed_entry_t -pmap_pv_remove( pmap_t pmap, - vm_map_offset_t vaddr, - ppnum_t *ppnp, +pmap_pv_remove(pmap_t pmap, + vm_map_offset_t vaddr, + ppnum_t *ppnp, pt_entry_t *pte) { pv_hashed_entry_t pvh_e; @@ -721,7 +790,8 @@ pmap_pv_remove( pmap_t pmap, LOCK_PV_HASH(pvhash_idx); pprevh = pvhash(pvhash_idx); if (PV_HASHED_ENTRY_NULL == *pprevh) { - panic("pmap_pv_remove(%p,0x%llx,0x%x): empty hash", pmap, vaddr, ppn); + panic("pmap_pv_remove(%p,0x%llx,0x%x): empty hash", + pmap, vaddr, ppn); } pvh_e = *pprevh; pmap_pv_hashlist_walks++; @@ -735,6 +805,7 @@ pmap_pv_remove( pmap_t pmap, pprevh = &pvh_e->nexth; pvh_e = pvh_e->nexth; } + if (PV_HASHED_ENTRY_NULL == pvh_e) { pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_PRESENT); @@ -755,6 +826,7 @@ pmap_pv_remove( pmap_t pmap, } } } + pmap_pv_hashlist_cnts += pv_cnt; if (pmap_pv_hashlist_max < pv_cnt) pmap_pv_hashlist_max = pv_cnt; @@ -766,4 +838,161 @@ pmap_pv_remove( pmap_t pmap, return pvh_e; } + +extern int pt_fake_zone_index; +static inline void +PMAP_ZINFO_PALLOC(vm_size_t bytes) +{ + thread_t thr = current_thread(); + task_t task; + zinfo_usage_t zinfo; + + thr->tkm_private.alloc += bytes; + if (pt_fake_zone_index != -1 && + (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) + OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].alloc); +} + +static inline void +PMAP_ZINFO_PFREE(vm_size_t bytes) +{ + thread_t thr = current_thread(); + task_t task; + zinfo_usage_t zinfo; + + thr->tkm_private.free += bytes; + if (pt_fake_zone_index != -1 && + (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) + OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].free); +} + +extern boolean_t pmap_initialized;/* Has pmap_init completed? 
*/ +#define valid_page(x) (pmap_initialized && pmap_valid_page(x)) + +// XXX +#define HIGH_MEM_BASE ((uint32_t)( -NBPDE) ) /* shared gdt etc seg addr */ /* XXX64 ?? */ +// XXX + + +int phys_attribute_test( + ppnum_t phys, + int bits); +void phys_attribute_clear( + ppnum_t phys, + int bits); + +//#define PCID_DEBUG 1 +#if PCID_DEBUG +#define pmap_pcid_log(fmt, args...) \ + do { \ + kprintf(fmt, ##args); \ + printf(fmt, ##args); \ + } while(0) +#else +#define pmap_pcid_log(fmt, args...) +#endif +void pmap_pcid_configure(void); + +#if defined(__x86_64__) +/* + * The single pml4 page per pmap is allocated at pmap create time and exists + * for the duration of the pmap. we allocate this page in kernel vm. + * this returns the address of the requested pml4 entry in the top level page. + */ +static inline +pml4_entry_t * +pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr) +{ +#if PMAP_ASSERT + return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_cr3)[(vaddr >> PML4SHIFT) & (NPML4PG-1)]); +#else + return &pmap->pm_pml4[(vaddr >> PML4SHIFT) & (NPML4PG-1)]; +#endif +} + +/* + * Returns address of requested PDPT entry in the physmap. + */ +static inline pdpt_entry_t * +pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr) +{ + pml4_entry_t newpf; + pml4_entry_t *pml4; + + assert(pmap); + if ((vaddr > 0x00007FFFFFFFFFFFULL) && + (vaddr < 0xFFFF800000000000ULL)) { + return (0); + } + + pml4 = pmap64_pml4(pmap, vaddr); + if (pml4 && ((*pml4 & INTEL_PTE_VALID))) { + newpf = *pml4 & PG_FRAME; + return &((pdpt_entry_t *) PHYSMAP_PTOV(newpf)) + [(vaddr >> PDPTSHIFT) & (NPDPTPG-1)]; + } + return (NULL); +} +/* + * Returns the address of the requested PDE entry in the physmap. 
+ */ +static inline pd_entry_t * +pmap64_pde(pmap_t pmap, vm_map_offset_t vaddr) +{ + pdpt_entry_t newpf; + pdpt_entry_t *pdpt; + + assert(pmap); + if ((vaddr > 0x00007FFFFFFFFFFFULL) && + (vaddr < 0xFFFF800000000000ULL)) { + return (0); + } + + pdpt = pmap64_pdpt(pmap, vaddr); + + if (pdpt && ((*pdpt & INTEL_PTE_VALID))) { + newpf = *pdpt & PG_FRAME; + return &((pd_entry_t *) PHYSMAP_PTOV(newpf)) + [(vaddr >> PDSHIFT) & (NPDPG-1)]; + } + return (NULL); +} + +static inline pd_entry_t * +pmap_pde(pmap_t m, vm_map_offset_t v) +{ + pd_entry_t *pde; + + assert(m); + pde = pmap64_pde(m, v); + + return pde; +} + + +/* + * return address of mapped pte for vaddr va in pmap pmap. + * + * In case the pde maps a superpage, return the pde, which, in this case + * is the actual page table entry. + */ +static inline pt_entry_t * +pmap_pte(pmap_t pmap, vm_map_offset_t vaddr) +{ + pd_entry_t *pde; + pd_entry_t newpf; + + assert(pmap); + pde = pmap_pde(pmap, vaddr); + + if (pde && ((*pde & INTEL_PTE_VALID))) { + if (*pde & INTEL_PTE_PS) + return pde; + newpf = *pde & PG_FRAME; + return &((pt_entry_t *)PHYSMAP_PTOV(newpf)) + [i386_btop(vaddr) & (ppnum_t)(NPTEPG-1)]; + } + return (NULL); +} +#endif #endif /* MACH_KERNEL_PRIVATE */ diff --git a/osfmk/i386/pmap_pcid.h b/osfmk/i386/pmap_pcid.h new file mode 100644 index 000000000..0e16f3e2d --- /dev/null +++ b/osfmk/i386/pmap_pcid.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _PMAP_PCID_ +#define _PMAP_PCID_ 1 +#if defined(__x86_64__) +void pmap_pcid_initialize(pmap_t); +void pmap_pcid_initialize_kernel(pmap_t); +pcid_t pmap_pcid_allocate_pcid(int); +void pmap_pcid_deallocate_pcid(int, pmap_t); +void pmap_destroy_pcid_sync_action(void *); +void pmap_destroy_pcid_sync(pmap_t); +void pmap_pcid_lazy_flush(pmap_t); +void pmap_pcid_activate(pmap_t, int); +pcid_t pcid_for_pmap_cpu_tuple(pmap_t, int); + +#define PMAP_INVALID ((pmap_t)0xDEAD7347) +#define PMAP_PCID_INVALID_PCID (0xDEAD) +#define PMAP_PCID_MAX_REFCOUNT (0xF0) +#define PMAP_PCID_MIN_PCID (1) + +extern uint32_t pmap_pcid_ncpus; + +static inline void +tlb_flush_global(void) { + uintptr_t cr4 = get_cr4(); + pmap_assert(ml_get_interrupts_enabled() == FALSE || get_preemption_level() !=0); + pmap_assert2(((cr4 & CR4_PGE) || ml_at_interrupt_context()), "CR4: 0x%lx", cr4); + /* + * We are, unfortunately, forced to rely on this expensive + * read-modify-write-write scheme due to the inadequate + * TLB 
invalidation ISA. The read is necessary as + * the kernel does not "own" the contents of CR4, the VMX + * feature in particular. It may be possible to + * avoid a global flush and instead track a generation + * count of kernel invalidations, but that scheme + * has its disadvantages as well. + */ + set_cr4(cr4 & ~CR4_PGE); + set_cr4(cr4 | CR4_PGE); + return; +} + +static inline void pmap_pcid_invalidate_all_cpus(pmap_t tpmap) { + unsigned i; + + pmap_assert((sizeof(tpmap->pmap_pcid_coherency_vector) >= real_ncpus) && (!(sizeof(tpmap->pmap_pcid_coherency_vector) & 7))); + + for (i = 0; i < real_ncpus; i+=8) { + *(uint64_t *)(uintptr_t)&tpmap->pmap_pcid_coherency_vector[i] = (~0ULL); + } +} + +static inline void pmap_pcid_validate_current(void) { + int ccpu = cpu_number(); + volatile uint8_t *cptr = cpu_datap(ccpu)->cpu_pmap_pcid_coherentp; +#ifdef PMAP_MODULE + pmap_assert(cptr == &(current_thread()->map->pmap->pmap_pcid_coherency_vector[ccpu])); +#endif + if (cptr) { + *cptr = 0; + } + +} + +static inline void pmap_pcid_invalidate_cpu(pmap_t tpmap, int ccpu) { + tpmap->pmap_pcid_coherency_vector[ccpu] = 0xFF; +} + +static inline void pmap_pcid_validate_cpu(pmap_t tpmap, int ccpu) { + tpmap->pmap_pcid_coherency_vector[ccpu] = 0; +} +#endif /* x86_64 */ +#endif diff --git a/osfmk/i386/pmap_x86_common.c b/osfmk/i386/pmap_x86_common.c index a8c3423b4..d7e63d6b0 100644 --- a/osfmk/i386/pmap_x86_common.c +++ b/osfmk/i386/pmap_x86_common.c @@ -29,27 +29,12 @@ #include #include - void pmap_remove_range( pmap_t pmap, vm_map_offset_t va, pt_entry_t *spte, pt_entry_t *epte); -pv_rooted_entry_t pv_head_table; /* array of entries, one per - * page */ -thread_call_t mapping_adjust_call; -static thread_call_data_t mapping_adjust_call_data; -uint32_t mappingrecurse = 0; - -pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[PMAP_PAGETABLE_CORRUPTION_MAX_LOG]; -uint32_t pmap_pagetable_corruption_incidents; -uint64_t pmap_pagetable_corruption_last_abstime = (~(0ULL) 
>> 1); -uint64_t pmap_pagetable_corruption_interval_abstime; -thread_call_t pmap_pagetable_corruption_log_call; -static thread_call_data_t pmap_pagetable_corruption_log_call_data; -boolean_t pmap_pagetable_corruption_timeout = FALSE; - /* * The Intel platform can nest at the PDE level, so NBPDE (i.e. 2MB) at a time, * on a NBPDE boundary. @@ -103,8 +88,8 @@ kern_return_t pmap_nest(pmap_t grand, pmap_t subord, addr64_t va_start, addr64_t panic("pmap_nest: va_start(0x%llx) != nstart(0x%llx)\n", va_start, nstart); PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_START, - (int) grand, (int) subord, - (int) (va_start>>32), (int) va_start, 0); + (uintptr_t) grand, (uintptr_t) subord, + (uintptr_t) (va_start>>32), (uintptr_t) va_start, 0); nvaddr = (vm_map_offset_t)nstart; num_pde = size >> PDESHIFT; @@ -216,8 +201,8 @@ kern_return_t pmap_unnest(pmap_t grand, addr64_t vaddr, uint64_t size) { uint64_t npdpt = PMAP_INVALID_PDPTNUM; PMAP_TRACE(PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START, - (int) grand, - (int) (vaddr>>32), (int) vaddr, 0, 0); + (uintptr_t) grand, + (uintptr_t) (vaddr>>32), (uintptr_t) vaddr, 0, 0); if ((size & (pmap_nesting_size_min-1)) || (vaddr & (pmap_nesting_size_min-1))) { @@ -337,6 +322,67 @@ pmap_find_phys(pmap_t pmap, addr64_t va) return ppn; } +/* + * Update cache attributes for all extant managed mappings. + * Assumes PV for this page is locked, and that the page + * is managed. + */ + +void +pmap_update_cache_attributes_locked(ppnum_t pn, unsigned attributes) { + pv_rooted_entry_t pv_h, pv_e; + pv_hashed_entry_t pvh_e, nexth; + vm_map_offset_t vaddr; + pmap_t pmap; + pt_entry_t *ptep; + + assert(IS_MANAGED_PAGE(pn)); + + pv_h = pai_to_pvh(pn); + /* TODO: translate the PHYS_* bits to PTE bits, while they're + * currently identical, they may not remain so + * Potential optimization (here and in page_protect), + * parallel shootdowns, check for redundant + * attribute modifications. 
+ */ + + /* + * Alter attributes on all mappings + */ + if (pv_h->pmap != PMAP_NULL) { + pv_e = pv_h; + pvh_e = (pv_hashed_entry_t)pv_e; + + do { + pmap = pv_e->pmap; + vaddr = pv_e->va; + ptep = pmap_pte(pmap, vaddr); + + if (0 == ptep) + panic("pmap_update_cache_attributes_locked: Missing PTE, pmap: %p, pn: 0x%x vaddr: 0x%llx kernel_pmap: %p", pmap, pn, vaddr, kernel_pmap); + + nexth = (pv_hashed_entry_t)queue_next(&pvh_e->qlink); + pmap_update_pte(ptep, *ptep, (*ptep & ~PHYS_CACHEABILITY_MASK) | attributes); + PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE); + pvh_e = nexth; + } while ((pv_e = (pv_rooted_entry_t)nexth) != pv_h); + } +} + +void x86_filter_TLB_coherency_interrupts(boolean_t dofilter) { + assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0); + + if (dofilter) { + CPU_CR3_MARK_INACTIVE(); + } else { + CPU_CR3_MARK_ACTIVE(); + __asm__ volatile("mfence"); + if (current_cpu_datap()->cpu_tlb_invalid) + process_pmap_updates(); + } +} + + /* * Insert the given physical page (p) at * the specified virtual address (v) in the @@ -444,7 +490,6 @@ pmap_enter( *pte = 0; } - old_pa = pte_to_pa(*pte); pai = pa_index(old_pa); old_pa_locked = FALSE; @@ -469,12 +514,15 @@ pmap_enter( * at this address. 
*/ if (old_pa == pa) { + pt_entry_t old_attributes = + *pte & ~(INTEL_PTE_REF | INTEL_PTE_MOD); /* * May be changing its wired attribute or protection */ template = pa_to_pte(pa) | INTEL_PTE_VALID; + template |= pmap_get_cache_attributes(pa_index(pa)); if (VM_MEM_NOT_CACHEABLE == (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT))) { @@ -492,11 +540,11 @@ pmap_enter( if (wired) { template |= INTEL_PTE_WIRED; - if (!iswired(*pte)) + if (!iswired(old_attributes)) OSAddAtomic(+1, &pmap->stats.wired_count); } else { - if (iswired(*pte)) { + if (iswired(old_attributes)) { assert(pmap->stats.wired_count >= 1); OSAddAtomic(-1, &pmap->stats.wired_count); @@ -504,6 +552,9 @@ pmap_enter( } if (superpage) /* this path can not be used */ template |= INTEL_PTE_PS; /* to change the page size! */ + /* Determine delta, PV locked */ + need_tlbflush = + ((old_attributes ^ template) != INTEL_PTE_WIRED); /* store modified PTE and preserve RC bits */ pmap_update_pte(pte, *pte, @@ -512,7 +563,6 @@ pmap_enter( UNLOCK_PVH(pai); old_pa_locked = FALSE; } - need_tlbflush = TRUE; goto Done; } @@ -548,19 +598,12 @@ pmap_enter( pmap_store_pte(pte, 0); if (IS_MANAGED_PAGE(pai)) { -#if TESTING - if (pmap->stats.resident_count < 1) - panic("pmap_enter: resident_count"); -#endif + pmap_assert(old_pa_locked == TRUE); assert(pmap->stats.resident_count >= 1); OSAddAtomic(-1, &pmap->stats.resident_count); if (iswired(*pte)) { -#if TESTING - if (pmap->stats.wired_count < 1) - panic("pmap_enter: wired_count"); -#endif assert(pmap->stats.wired_count >= 1); OSAddAtomic(-1, &pmap->stats.wired_count); @@ -624,7 +667,7 @@ pmap_enter( pvh_e = pvh_new; pvh_new = PV_HASHED_ENTRY_NULL; } else if (PV_HASHED_ENTRY_NULL == pvh_e) { - PV_HASHED_ALLOC(pvh_e); + PV_HASHED_ALLOC(&pvh_e); if (PV_HASHED_ENTRY_NULL == pvh_e) { /* * the pv list is empty. if we are on @@ -636,10 +679,11 @@ pmap_enter( * us. 
*/ if (kernel_pmap == pmap) { - PV_HASHED_KERN_ALLOC(pvh_e); + PV_HASHED_KERN_ALLOC(&pvh_e); } else { UNLOCK_PVH(pai); PMAP_UNLOCK(pmap); + pmap_pv_throttle(pmap); pvh_new = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone); goto Retry; } @@ -664,7 +708,7 @@ pmap_enter( * only count the mapping * for 'managed memory' */ - OSAddAtomic(+1, & pmap->stats.resident_count); + OSAddAtomic(+1, &pmap->stats.resident_count); if (pmap->stats.resident_count > pmap->stats.resident_max) { pmap->stats.resident_max = pmap->stats.resident_count; } @@ -681,7 +725,13 @@ pmap_enter( * only the pfn changes. */ template = pa_to_pte(pa) | INTEL_PTE_VALID; + /* + * DRK: It may be worth asserting on cache attribute flags that diverge + * from the existing physical page attributes. + */ + template |= pmap_get_cache_attributes(pa_index(pa)); + if (flags & VM_MEM_NOT_CACHEABLE) { if (!(flags & VM_MEM_GUARDED)) template |= INTEL_PTE_PTA; @@ -728,9 +778,10 @@ pmap_enter( m = vm_page_lookup(delpage_pm_obj, delpage_pde_index); if (m == VM_PAGE_NULL) panic("pmap_enter: pte page not in object"); + vm_object_unlock(delpage_pm_obj); VM_PAGE_FREE(m); OSAddAtomic(-1, &inuse_ptepages_count); - vm_object_unlock(delpage_pm_obj); + PMAP_ZINFO_PFREE(PAGE_SIZE); } PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, 0, 0, 0, 0, 0); @@ -1076,10 +1127,9 @@ pmap_page_protect( vaddr = pv_e->va; pte = pmap_pte(pmap, vaddr); -#if DEBUG - if (pa_index(pte_to_pa(*pte)) != pn) - panic("pmap_page_protect: PTE mismatch, pn: 0x%x, pmap: %p, vaddr: 0x%llx, pte: 0x%llx", pn, pmap, vaddr, *pte); -#endif + pmap_assert2((pa_index(pte_to_pa(*pte)) == pn), + "pmap_page_protect: PTE mismatch, pn: 0x%x, pmap: %p, vaddr: 0x%llx, pte: 0x%llx", pn, pmap, vaddr, *pte); + if (0 == pte) { panic("pmap_page_protect() " "pmap=%p pn=0x%x vaddr=0x%llx\n", @@ -1089,16 +1139,21 @@ pmap_page_protect( /* * Remove the mapping if new protection is NONE - * or if write-protecting a kernel mapping. 
*/ - if (remove || pmap == kernel_pmap) { + if (remove) { /* * Remove the mapping, collecting dirty bits. */ pmap_update_pte(pte, *pte, *pte & ~INTEL_PTE_VALID); + + /* Remove per-pmap wired count */ + if (iswired(*pte)) { + OSAddAtomic(-1, &pmap->stats.wired_count); + } + PMAP_UPDATE_TLBS(pmap, vaddr, vaddr+PAGE_SIZE); pmap_phys_attributes[pai] |= - *pte & (PHYS_MODIFIED|PHYS_REFERENCED); + *pte & (PHYS_MODIFIED|PHYS_REFERENCED); pmap_store_pte(pte, 0); #if TESTING @@ -1117,8 +1172,6 @@ pmap_page_protect( * Fix up head later. */ pv_h->pmap = PMAP_NULL; - - pmap_phys_attributes[pai] &= ~PHYS_NOENCRYPT; } else { /* * Delete this entry. @@ -1133,8 +1186,11 @@ pmap_page_protect( } } else { /* - * Write-protect. + * Write-protect, after opportunistic refmod collect */ + pmap_phys_attributes[pai] |= + *pte & (PHYS_MODIFIED|PHYS_REFERENCED); + pmap_update_pte(pte, *pte, *pte & ~INTEL_PTE_WRITE); PMAP_UPDATE_TLBS(pmap, vaddr, vaddr+PAGE_SIZE); } @@ -1170,180 +1226,206 @@ pmap_page_protect( 0, 0, 0, 0, 0); } -__private_extern__ void -pmap_pagetable_corruption_msg_log(int (*log_func)(const char * fmt, ...)__printflike(1,2)) { - if (pmap_pagetable_corruption_incidents > 0) { - int i, e = MIN(pmap_pagetable_corruption_incidents, PMAP_PAGETABLE_CORRUPTION_MAX_LOG); - (*log_func)("%u pagetable corruption incident(s) detected, timeout: %u\n", pmap_pagetable_corruption_incidents, pmap_pagetable_corruption_timeout); - for (i = 0; i < e; i++) { - (*log_func)("Incident 0x%x, reason: 0x%x, action: 0x%x, time: 0x%llx\n", pmap_pagetable_corruption_records[i].incident, pmap_pagetable_corruption_records[i].reason, pmap_pagetable_corruption_records[i].action, pmap_pagetable_corruption_records[i].abstime); - } - } -} - +/* + * Clear specified attribute bits. 
+ */ void -mapping_free_prime(void) +phys_attribute_clear( + ppnum_t pn, + int bits) { - int i; - pv_hashed_entry_t pvh_e; - pv_hashed_entry_t pvh_eh; - pv_hashed_entry_t pvh_et; - int pv_cnt; - - pv_cnt = 0; - pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL; - for (i = 0; i < (5 * PV_HASHED_ALLOC_CHUNK); i++) { - pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone); + pv_rooted_entry_t pv_h; + pv_hashed_entry_t pv_e; + pt_entry_t *pte; + int pai; + pmap_t pmap; + char attributes = 0; + + pmap_intr_assert(); + assert(pn != vm_page_fictitious_addr); + if (pn == vm_page_guard_addr) + return; - pvh_e->qlink.next = (queue_entry_t)pvh_eh; - pvh_eh = pvh_e; + pai = ppn_to_pai(pn); - if (pvh_et == PV_HASHED_ENTRY_NULL) - pvh_et = pvh_e; - pv_cnt++; + if (!IS_MANAGED_PAGE(pai)) { + /* + * Not a managed page. + */ + return; } - PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt); - pv_cnt = 0; - pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL; - for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) { - pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone); + PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, + pn, bits, 0, 0, 0); - pvh_e->qlink.next = (queue_entry_t)pvh_eh; - pvh_eh = pvh_e; + pv_h = pai_to_pvh(pai); - if (pvh_et == PV_HASHED_ENTRY_NULL) - pvh_et = pvh_e; - pv_cnt++; - } - PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt); + LOCK_PVH(pai); -} + /* + * Walk down PV list, clearing all modify or reference bits. + * We do not have to lock the pv_list because we have + * the entire pmap system locked. + */ + if (pv_h->pmap != PMAP_NULL) { + /* + * There are some mappings. 
+ */ -static inline void -pmap_pagetable_corruption_log_setup(void) { - if (pmap_pagetable_corruption_log_call == NULL) { - nanotime_to_absolutetime(PMAP_PAGETABLE_CORRUPTION_INTERVAL, 0, &pmap_pagetable_corruption_interval_abstime); - thread_call_setup(&pmap_pagetable_corruption_log_call_data, - (thread_call_func_t) pmap_pagetable_corruption_msg_log, - (thread_call_param_t) &printf); - pmap_pagetable_corruption_log_call = &pmap_pagetable_corruption_log_call_data; - } -} + pv_e = (pv_hashed_entry_t)pv_h; -void -mapping_adjust(void) -{ - pv_hashed_entry_t pvh_e; - pv_hashed_entry_t pvh_eh; - pv_hashed_entry_t pvh_et; - int pv_cnt; - int i; - - if (mapping_adjust_call == NULL) { - thread_call_setup(&mapping_adjust_call_data, - (thread_call_func_t) mapping_adjust, - (thread_call_param_t) NULL); - mapping_adjust_call = &mapping_adjust_call_data; - } + do { + vm_map_offset_t va; - pmap_pagetable_corruption_log_setup(); + pmap = pv_e->pmap; + va = pv_e->va; - pv_cnt = 0; - pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL; - if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) { - for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) { - pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone); + /* + * Clear modify and/or reference bits. + */ + pte = pmap_pte(pmap, va); + attributes |= *pte & (PHYS_MODIFIED|PHYS_REFERENCED); - pvh_e->qlink.next = (queue_entry_t)pvh_eh; - pvh_eh = pvh_e; + pmap_update_pte(pte, *pte, (*pte & ~bits)); + /* Ensure all processors using this translation + * invalidate this TLB entry. The invalidation *must* + * follow the PTE update, to ensure that the TLB + * shadow of the 'D' bit (in particular) is + * synchronized with the updated PTE. 
+ */ + PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE); - if (pvh_et == PV_HASHED_ENTRY_NULL) - pvh_et = pvh_e; - pv_cnt++; - } - PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt); + pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink); + + } while (pv_e != (pv_hashed_entry_t)pv_h); } + /* Opportunistic refmod collection, annulled + * if both REF and MOD are being cleared. + */ - pv_cnt = 0; - pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL; - if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) { - for (i = 0; i < PV_HASHED_ALLOC_CHUNK; i++) { - pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone); + pmap_phys_attributes[pai] |= attributes; + pmap_phys_attributes[pai] &= (~bits); - pvh_e->qlink.next = (queue_entry_t)pvh_eh; - pvh_eh = pvh_e; + UNLOCK_PVH(pai); - if (pvh_et == PV_HASHED_ENTRY_NULL) - pvh_et = pvh_e; - pv_cnt++; - } - PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt); - } - mappingrecurse = 0; + PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END, + 0, 0, 0, 0, 0); } - -boolean_t -pmap_is_noencrypt(ppnum_t pn) +/* + * Check specified attribute bits. + */ +int +phys_attribute_test( + ppnum_t pn, + int bits) { - int pai; + pv_rooted_entry_t pv_h; + pv_hashed_entry_t pv_e; + pt_entry_t *pte; + int pai; + pmap_t pmap; + int attributes = 0; + + pmap_intr_assert(); + assert(pn != vm_page_fictitious_addr); + if (pn == vm_page_guard_addr) + return 0; pai = ppn_to_pai(pn); - if (!IS_MANAGED_PAGE(pai)) - return (TRUE); + if (!IS_MANAGED_PAGE(pai)) { + /* + * Not a managed page. + */ + return 0; + } - if (pmap_phys_attributes[pai] & PHYS_NOENCRYPT) - return (TRUE); + /* + * Fast check... if bits already collected + * no need to take any locks... 
+ * if not set, we need to recheck after taking + * the lock in case they got pulled in while + * we were waiting for the lock + */ + if ((pmap_phys_attributes[pai] & bits) == bits) + return bits; - return (FALSE); -} + pv_h = pai_to_pvh(pai); + LOCK_PVH(pai); -void -pmap_set_noencrypt(ppnum_t pn) -{ - int pai; + attributes = pmap_phys_attributes[pai] & bits; - pai = ppn_to_pai(pn); - if (IS_MANAGED_PAGE(pai)) { - LOCK_PVH(pai); + /* + * Walk down PV list, checking the mappings until we + * reach the end or we've found the desired attributes. + */ + if (attributes != bits && + pv_h->pmap != PMAP_NULL) { + /* + * There are some mappings. + */ + pv_e = (pv_hashed_entry_t)pv_h; + do { + vm_map_offset_t va; - pmap_phys_attributes[pai] |= PHYS_NOENCRYPT; + pmap = pv_e->pmap; + va = pv_e->va; + /* + * pick up modify and/or reference bits from mapping + */ - UNLOCK_PVH(pai); + pte = pmap_pte(pmap, va); + attributes |= (int)(*pte & bits); + + pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink); + + } while ((attributes != bits) && + (pv_e != (pv_hashed_entry_t)pv_h)); } -} + pmap_phys_attributes[pai] |= attributes; + UNLOCK_PVH(pai); + return (attributes); +} +/* + * Routine: pmap_change_wiring + * Function: Change the wiring attribute for a map/virtual-address + * pair. + * In/out conditions: + * The mapping must already exist in the pmap. 
+ */ void -pmap_clear_noencrypt(ppnum_t pn) +pmap_change_wiring( + pmap_t map, + vm_map_offset_t vaddr, + boolean_t wired) { - int pai; + pt_entry_t *pte; - pai = ppn_to_pai(pn); - - if (IS_MANAGED_PAGE(pai)) { - LOCK_PVH(pai); + PMAP_LOCK(map); - pmap_phys_attributes[pai] &= ~PHYS_NOENCRYPT; + if ((pte = pmap_pte(map, vaddr)) == PT_ENTRY_NULL) + panic("pmap_change_wiring: pte missing"); - UNLOCK_PVH(pai); + if (wired && !iswired(*pte)) { + /* + * wiring down mapping + */ + OSAddAtomic(+1, &map->stats.wired_count); + pmap_update_pte(pte, *pte, (*pte | INTEL_PTE_WIRED)); } -} - -void x86_filter_TLB_coherency_interrupts(boolean_t dofilter) { - assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0); - - if (dofilter) { - CPU_CR3_MARK_INACTIVE(); - } else { - CPU_CR3_MARK_ACTIVE(); - __asm__ volatile("mfence"); - if (current_cpu_datap()->cpu_tlb_invalid) - process_pmap_updates(); + else if (!wired && iswired(*pte)) { + /* + * unwiring mapping + */ + assert(map->stats.wired_count >= 1); + OSAddAtomic(-1, &map->stats.wired_count); + pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_WIRED)); } -} + PMAP_UNLOCK(map); +} diff --git a/osfmk/i386/proc_reg.h b/osfmk/i386/proc_reg.h index 54fca68e3..b35d5c0a0 100644 --- a/osfmk/i386/proc_reg.h +++ b/osfmk/i386/proc_reg.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -146,6 +146,7 @@ * CR4 */ #define CR4_OSXSAVE 0x00040000 /* OS supports XSAVE */ +#define CR4_PCIDE 0x00020000 /* PCID Enable */ #define CR4_SMXE 0x00004000 /* Enable SMX operation */ #define CR4_VMXE 0x00002000 /* Enable VMX operation */ #define CR4_OSXMM 0x00000400 /* SSE/SSE2 exceptions supported in OS */ @@ -170,6 +171,9 @@ #define XFEM_SSE XCR0_SSE #define XFEM_X87 XCR0_X87 #define XCR0 (0) + +#define PMAP_PCID_PRESERVE (1ULL << 63) +#define PMAP_PCID_MASK (0xFFF) #ifndef ASSEMBLER #include @@ -179,6 +183,66 @@ __BEGIN_DECLS #define set_ts() set_cr0(get_cr0() | CR0_TS) +static inline uint16_t get_es(void) +{ + uint16_t es; + __asm__ volatile("mov %%es, %0" : "=r" (es)); + return es; +} + +static inline void set_es(uint16_t es) +{ + __asm__ volatile("mov %0, %%es" : : "r" (es)); +} + +static inline uint16_t get_ds(void) +{ + uint16_t ds; + __asm__ volatile("mov %%ds, %0" : "=r" (ds)); + return ds; +} + +static inline void set_ds(uint16_t ds) +{ + __asm__ volatile("mov %0, %%ds" : : "r" (ds)); +} + +static inline uint16_t get_fs(void) +{ + uint16_t fs; + __asm__ volatile("mov %%fs, %0" : "=r" (fs)); + return fs; +} + +static inline void set_fs(uint16_t fs) +{ + __asm__ volatile("mov %0, %%fs" : : "r" (fs)); +} + +static inline uint16_t get_gs(void) +{ + uint16_t gs; + __asm__ volatile("mov %%gs, %0" : "=r" (gs)); + return gs; +} + +static inline void set_gs(uint16_t gs) +{ + __asm__ volatile("mov %0, %%gs" : : "r" (gs)); +} + +static inline uint16_t get_ss(void) +{ + uint16_t ss; + __asm__ volatile("mov %%ss, %0" : "=r" (ss)); + return ss; +} + +static inline void set_ss(uint16_t ss) +{ + __asm__ volatile("mov %0, %%ss" : : "r" (ss)); +} + static inline uintptr_t get_cr0(void) { uintptr_t cr0; @@ -198,6 +262,19 @@ static inline uintptr_t get_cr2(void) return(cr2); } +static inline uintptr_t get_cr3_raw(void) +{ + register uintptr_t cr3; + __asm__ volatile("mov %%cr3, %0" : "=r" (cr3)); + return(cr3); +} + +static 
inline void set_cr3_raw(uintptr_t value) +{ + __asm__ volatile("mov %0, %%cr3" : : "r" (value)); +} + +#if defined(__i386__) static inline uintptr_t get_cr3(void) { register uintptr_t cr3; @@ -209,7 +286,20 @@ static inline void set_cr3(uintptr_t value) { __asm__ volatile("mov %0, %%cr3" : : "r" (value)); } +#else +static inline uintptr_t get_cr3_base(void) +{ + register uintptr_t cr3; + __asm__ volatile("mov %%cr3, %0" : "=r" (cr3)); + return(cr3 & ~(0xFFFULL)); +} + +static inline void set_cr3_composed(uintptr_t base, uint16_t pcid, uint32_t preserve) +{ + __asm__ volatile("mov %0, %%cr3" : : "r" (base | pcid | ( ( (uint64_t)preserve) << 63) ) ); +} +#endif static inline uintptr_t get_cr4(void) { uintptr_t cr4; @@ -222,6 +312,13 @@ static inline void set_cr4(uintptr_t value) __asm__ volatile("mov %0, %%cr4" : : "r" (value)); } +static inline uintptr_t x86_get_flags(void) +{ + uintptr_t erflags; + __asm__ volatile("pushf; pop %0" : "=r" (erflags)); + return erflags; +} + static inline void clear_ts(void) { __asm__ volatile("clts"); @@ -268,8 +365,6 @@ static inline void swapgs(void) #ifdef MACH_KERNEL_PRIVATE - - #ifdef __i386__ #include @@ -286,16 +381,17 @@ static inline void flush_tlb(void) set_cr3(get_cr3()); } } +static inline void flush_tlb_raw(void) +{ + flush_tlb(); +} + #elif defined(__x86_64__) -static inline void flush_tlb(void) +static inline void flush_tlb_raw(void) { - set_cr3(get_cr3()); + set_cr3_raw(get_cr3_raw()); } -#else -#error Unsupported architecture #endif - - #endif /* MACH_KERNEL_PRIVATE */ static inline void wbinvd(void) @@ -375,19 +471,19 @@ static inline void wrmsr64(uint32_t msr, uint64_t val) static inline uint64_t rdtsc64(void) { - uint32_t lo, hi; + uint64_t lo, hi; rdtsc(lo, hi); - return (((uint64_t)hi) << 32) | ((uint64_t)lo); + return ((hi) << 32) | (lo); } static inline uint64_t rdtscp64(uint32_t *aux) { - uint32_t lo, hi; + uint64_t lo, hi; __asm__ volatile("rdtscp; mov %%ecx, %1" : "=a" (lo), "=d" (hi), "=m" (*aux) : : 
"ecx"); - return (((uint64_t)hi) << 32) | ((uint64_t)lo); + return ((hi) << 32) | (lo); } #else diff --git a/osfmk/i386/rtclock.c b/osfmk/i386/rtclock.c index 72b1f556f..d9de63185 100644 --- a/osfmk/i386/rtclock.c +++ b/osfmk/i386/rtclock.c @@ -56,22 +56,21 @@ #include #include #include /* for kernel_map */ -#include #include #include #include #include #include #include +#include #include #include -#include #include #include #include #include #include -#include +#include #define UI_CPUFREQ_ROUNDING_FACTOR 10000000 @@ -81,165 +80,9 @@ int rtclock_init(void); uint64_t tsc_rebase_abs_time = 0; -void rtclock_intr(x86_saved_state_t *regs); - static void rtc_set_timescale(uint64_t cycles); static uint64_t rtc_export_speed(uint64_t cycles); -rtc_nanotime_t rtc_nanotime_info = {0,0,0,0,1,0}; - -static uint64_t rtc_decrementer_min; -static uint64_t rtc_decrementer_max; - -static uint64_t -deadline_to_decrementer( - uint64_t deadline, - uint64_t now) -{ - uint64_t delta; - - if (deadline <= now) - return rtc_decrementer_min; - else { - delta = deadline - now; - return MIN(MAX(rtc_decrementer_min,delta),rtc_decrementer_max); - } -} - -static inline uint64_t -_absolutetime_to_tsc(uint64_t ns) -{ - uint32_t generation; - uint64_t tsc; - - do { - generation = rtc_nanotime_info.generation; - tsc = tmrCvt(ns - rtc_nanotime_info.ns_base, tscFCvtn2t) - + rtc_nanotime_info.tsc_base; - } while (generation == 0 || - generation != rtc_nanotime_info.generation); - - return tsc; -} - -/* - * Regular local APIC timer case: - */ -static void -rtc_lapic_config_timer(void) -{ - lapic_config_timer(TRUE, one_shot, divide_by_1); -} -static uint64_t -rtc_lapic_set_timer(uint64_t deadline, uint64_t now) -{ - uint64_t count; - uint64_t set = 0; - - if (deadline > 0) { - /* - * Convert delta to bus ticks - * - time now is not relevant - */ - count = deadline_to_decrementer(deadline, now); - set = now + count; - lapic_set_timer_fast((uint32_t) tmrCvt(count, busFCvtn2t)); - } else { - 
lapic_set_timer(FALSE, one_shot, divide_by_1, 0); - } - return set; -} - -/* - * TSC-deadline timer case: - */ -static void -rtc_lapic_config_tsc_deadline_timer(void) -{ - lapic_config_tsc_deadline_timer(); -} -static uint64_t -rtc_lapic_set_tsc_deadline_timer(uint64_t deadline, uint64_t now) -{ - uint64_t set = 0; - - if (deadline > 0) { - /* - * Convert to TSC - */ - set = now + deadline_to_decrementer(deadline, now); - lapic_set_tsc_deadline_timer(_absolutetime_to_tsc(set)); - } else { - lapic_set_tsc_deadline_timer(0); - } - return set; -} - -/* - * Definitions for timer operations table - */ -typedef struct { - void (*config)(void); - uint64_t (*set) (uint64_t, uint64_t); -} rtc_timer_t; - -rtc_timer_t rtc_timer_lapic = { - rtc_lapic_config_timer, - rtc_lapic_set_timer -}; - -rtc_timer_t rtc_timer_tsc_deadline = { - rtc_lapic_config_tsc_deadline_timer, - rtc_lapic_set_tsc_deadline_timer -}; - -rtc_timer_t *rtc_timer = &rtc_timer_lapic; /* defaults to LAPIC timer */ - -/* - * rtc_timer_init() is called at startup on the boot processor only. - */ -static void -rtc_timer_init(void) -{ - int TSC_deadline_timer = 0; - - /* See whether we can use the local apic in TSC-deadline mode */ - if ((cpuid_features() & CPUID_FEATURE_TSCTMR)) { - TSC_deadline_timer = 1; - PE_parse_boot_argn("TSC_deadline_timer", &TSC_deadline_timer, - sizeof(TSC_deadline_timer)); - printf("TSC Deadline Timer supported %s enabled\n", - TSC_deadline_timer ? "and" : "but not"); - } - - if (TSC_deadline_timer) { - rtc_timer = &rtc_timer_tsc_deadline; - rtc_decrementer_max = UINT64_MAX; /* effectively none */ - /* - * The min could be as low as 1nsec, - * but we're being conservative for now and making it the same - * as for the local apic timer. - */ - rtc_decrementer_min = 1*NSEC_PER_USEC; /* 1 usec */ - } else { - /* - * Compute the longest interval using LAPIC timer. 
- */ - rtc_decrementer_max = tmrCvt(0x7fffffffULL, busFCvtt2n); - kprintf("maxDec: %lld\n", rtc_decrementer_max); - rtc_decrementer_min = 1*NSEC_PER_USEC; /* 1 usec */ - } - - /* Point LAPIC interrupts to hardclock() */ - lapic_set_timer_func((i386_intr_func_t) rtclock_intr); -} - -static inline uint64_t -rtc_timer_set(uint64_t deadline, uint64_t now) -{ - return rtc_timer->set(deadline, now); -} - void rtc_timer_start(void) { @@ -268,7 +111,7 @@ _tsc_to_nanoseconds(uint64_t value) "addl %%edi,%%eax ;" "adcl $0,%%edx " : "+A" (value) - : "c" (rtc_nanotime_info.scale) + : "c" (pal_rtc_nanotime_info.scale) : "esi", "edi"); #elif defined(__x86_64__) asm volatile("mul %%rcx;" @@ -276,7 +119,7 @@ _tsc_to_nanoseconds(uint64_t value) "shlq $32, %%rdx;" "orq %%rdx, %%rax;" : "=a"(value) - : "a"(value), "c"(rtc_nanotime_info.scale) + : "a"(value), "c"(pal_rtc_nanotime_info.scale) : "rdx", "cc" ); #else #error Unsupported architecture @@ -359,7 +202,7 @@ rtclock_config(void) * be guaranteed by the caller. */ static inline void -rtc_nanotime_set_commpage(rtc_nanotime_t *rntp) +rtc_nanotime_set_commpage(pal_rtc_nanotime_t *rntp) { commpage_set_nanotime(rntp->tsc_base, rntp->ns_base, rntp->scale, rntp->shift); } @@ -370,18 +213,18 @@ rtc_nanotime_set_commpage(rtc_nanotime_t *rntp) * Intialize the nanotime info from the base time. 
*/ static inline void -_rtc_nanotime_init(rtc_nanotime_t *rntp, uint64_t base) +_rtc_nanotime_init(pal_rtc_nanotime_t *rntp, uint64_t base) { uint64_t tsc = rdtsc64(); - _rtc_nanotime_store(tsc, base, rntp->scale, rntp->shift, rntp); + _pal_rtc_nanotime_store(tsc, base, rntp->scale, rntp->shift, rntp); } static void rtc_nanotime_init(uint64_t base) { - _rtc_nanotime_init(&rtc_nanotime_info, base); - rtc_nanotime_set_commpage(&rtc_nanotime_info); + _rtc_nanotime_init(&pal_rtc_nanotime_info, base); + rtc_nanotime_set_commpage(&pal_rtc_nanotime_info); } /* @@ -396,8 +239,7 @@ rtc_nanotime_init_commpage(void) { spl_t s = splclock(); - rtc_nanotime_set_commpage(&rtc_nanotime_info); - + rtc_nanotime_set_commpage(&pal_rtc_nanotime_info); splx(s); } @@ -416,7 +258,7 @@ rtc_nanotime_read(void) return _rtc_nanotime_read(&rtc_nanotime_info, 1); /* slow processor */ else #endif - return _rtc_nanotime_read(&rtc_nanotime_info, 0); /* assume fast processor */ + return _rtc_nanotime_read(&pal_rtc_nanotime_info, 0); /* assume fast processor */ } /* @@ -429,7 +271,7 @@ rtc_nanotime_read(void) void rtc_clock_napped(uint64_t base, uint64_t tsc_base) { - rtc_nanotime_t *rntp = &rtc_nanotime_info; + pal_rtc_nanotime_t *rntp = &pal_rtc_nanotime_info; uint64_t oldnsecs; uint64_t newnsecs; uint64_t tsc; @@ -444,30 +286,29 @@ rtc_clock_napped(uint64_t base, uint64_t tsc_base) * is later than the time using the old base values. */ if (oldnsecs < newnsecs) { - _rtc_nanotime_store(tsc_base, base, rntp->scale, rntp->shift, rntp); + _pal_rtc_nanotime_store(tsc_base, base, rntp->scale, rntp->shift, rntp); rtc_nanotime_set_commpage(rntp); + trace_set_timebases(tsc_base, base); } } - /* * Invoked from power management to correct the SFLM TSC entry drift problem: - * a small delta is added to the tsc_base. This is equivalent to nudging time - * backwards. We require this of the order of a TSC quantum which won't cause - * callers of mach_absolute_time() to see time going backwards! 
+ * a small delta is added to the tsc_base. This is equivalent to nudging time + * backwards. We require this to be on the order of a TSC quantum which won't + * cause callers of mach_absolute_time() to see time going backwards! */ void rtc_clock_adjust(uint64_t tsc_base_delta) { - rtc_nanotime_t *rntp = &rtc_nanotime_info; + pal_rtc_nanotime_t *rntp = &pal_rtc_nanotime_info; - assert(!ml_get_interrupts_enabled()); - assert(tsc_base_delta < 100ULL); /* i.e. it's small */ - _rtc_nanotime_adjust(tsc_base_delta, rntp); - rtc_nanotime_set_commpage(rntp); + assert(!ml_get_interrupts_enabled()); + assert(tsc_base_delta < 100ULL); /* i.e. it's small */ + _rtc_nanotime_adjust(tsc_base_delta, rntp); + rtc_nanotime_set_commpage(rntp); } - void rtc_clock_stepping(__unused uint32_t new_frequency, __unused uint32_t old_frequency) @@ -485,7 +326,7 @@ rtc_clock_stepped(__unused uint32_t new_frequency, /* * rtc_sleep_wakeup: * - * Invoked from power manageent when we have awoken from a sleep (S3) + * Invoked from power management when we have awoken from a sleep (S3) * and the TSC has been reset. The nanotime data is updated based on * the passed in value. 
* @@ -539,9 +380,8 @@ rtclock_init(void) ml_init_lock_timeout(); } - /* Set fixed configuration for lapic timers */ + /* Set fixed configuration for lapic timers */ rtc_timer->config(); - rtc_timer_start(); return (1); @@ -553,12 +393,14 @@ rtclock_init(void) static void rtc_set_timescale(uint64_t cycles) { - rtc_nanotime_t *rntp = &rtc_nanotime_info; + pal_rtc_nanotime_t *rntp = &pal_rtc_nanotime_info; rntp->scale = (uint32_t)(((uint64_t)NSEC_PER_SEC << 32) / cycles); +#if CONFIG_EMBEDDED if (cycles <= SLOW_TSC_THRESHOLD) rntp->shift = (uint32_t)cycles; else +#endif rntp->shift = 32; if (tsc_rebase_abs_time == 0) @@ -680,26 +522,23 @@ uint64_t setPop( uint64_t time) { - uint64_t now; - uint64_t pop; + uint64_t now; + uint64_t pop; /* 0 and EndOfAllTime are special-cases for "clear the timer" */ - if (time == 0 || time == EndOfAllTime) { + if (time == 0 || time == EndOfAllTime ) { time = EndOfAllTime; now = 0; - pop = rtc_timer_set(0, 0); + pop = rtc_timer->set(0, 0); } else { - now = rtc_nanotime_read(); - pop = rtc_timer_set(time, now); + now = rtc_nanotime_read(); /* The time in nanoseconds */ + pop = rtc_timer->set(time, now); } - /* Record actual deadline set */ + /* Record requested and actual deadlines set */ x86_lcpu()->rtcDeadline = time; - x86_lcpu()->rtcPop = pop; + x86_lcpu()->rtcPop = pop; - /* - * Pass back the delta we set - */ return pop - now; } diff --git a/osfmk/i386/rtclock_asm.h b/osfmk/i386/rtclock_asm.h new file mode 100644 index 000000000..fedf7a4f4 --- /dev/null +++ b/osfmk/i386/rtclock_asm.h @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2004-2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * @OSF_COPYRIGHT@ + */ +/* + * @APPLE_FREE_COPYRIGHT@ + */ +/* + * File: rtclock_asm.h + * Purpose: Assembly routines for handling the machine dependent + * real-time clock. + */ + +#ifndef _I386_RTCLOCK_H_ +#define _I386_RTCLOCK_H_ + +#include + +#if defined(__i386__) + +/* + * Nanotime returned in %edx:%eax. + * Computed from tsc based on the scale factor + * and an implicit 32 bit shift. + * + * Uses %eax, %ebx, %ecx, %edx, %esi, %edi. + */ +#define NANOTIME \ + mov %gs:CPU_NANOTIME,%edi ; \ + PAL_RTC_NANOTIME_READ_FAST() + + +/* + * Add 64-bit delta in register dreg : areg to timer pointed to by register treg. 
+ */ +#define TIMER_UPDATE(treg,dreg,areg,offset) \ + addl (TIMER_LOW+(offset))(treg),areg /* add low bits */ ; \ + adcl dreg,(TIMER_HIGH+(offset))(treg) /* carry high bits */; \ + movl areg,(TIMER_LOW+(offset))(treg) /* updated low bit */; \ + movl (TIMER_HIGH+(offset))(treg),dreg /* copy high bits */ ; \ + movl dreg,(TIMER_HIGHCHK+(offset))(treg) /* to high check */ + +/* + * Add time delta to old timer and start new. + */ +#define TIMER_EVENT(old,new) \ + NANOTIME /* edx:eax nanosecs */ ; \ + movl %eax,%esi /* save timestamp */ ; \ + movl %edx,%edi /* save timestamp */ ; \ + movl %gs:CPU_ACTIVE_THREAD,%ecx /* get current thread */ ; \ + subl (old##_TIMER)+TIMER_TSTAMP(%ecx),%eax /* elapsed */ ; \ + sbbl (old##_TIMER)+TIMER_TSTAMP+4(%ecx),%edx /* time */ ; \ + TIMER_UPDATE(%ecx,%edx,%eax,old##_TIMER) /* update timer */ ; \ + movl %esi,(new##_TIMER)+TIMER_TSTAMP(%ecx) /* set timestamp */ ; \ + movl %edi,(new##_TIMER)+TIMER_TSTAMP+4(%ecx) /* set timestamp */ ; \ + leal (new##_TIMER)(%ecx), %ecx /* compute new timer pointer */ ; \ + movl %gs:CPU_PROCESSOR,%ebx /* get current processor */ ; \ + movl %ecx,THREAD_TIMER(%ebx) /* set current timer */ ; \ + movl %esi,%eax /* restore timestamp */ ; \ + movl %edi,%edx /* restore timestamp */ ; \ + subl (old##_STATE)+TIMER_TSTAMP(%ebx),%eax /* elapsed */ ; \ + sbbl (old##_STATE)+TIMER_TSTAMP+4(%ebx),%edx /* time */ ; \ + TIMER_UPDATE(%ebx,%edx,%eax,old##_STATE)/* update timer */ ; \ + leal (new##_STATE)(%ebx),%ecx /* new state pointer */ ; \ + movl %ecx,CURRENT_STATE(%ebx) /* set current state */ ; \ + movl %esi,TIMER_TSTAMP(%ecx) /* set timestamp */ ; \ + movl %edi,TIMER_TSTAMP+4(%ecx) /* set timestamp */ + +/* + * Update time on user trap entry. + * Uses %eax,%ebx,%ecx,%edx,%esi,%edi. + */ +#define TIME_TRAP_UENTRY TIMER_EVENT(USER,SYSTEM) + +/* + * update time on user trap exit. + * Uses %eax,%ebx,%ecx,%edx,%esi,%edi. + */ +#define TIME_TRAP_UEXIT TIMER_EVENT(SYSTEM,USER) + +/* + * update time on interrupt entry. 
+ * Uses %eax,%ebx,%ecx,%edx,%esi,%edi. + * Saves processor state info on stack. + */ +#define TIME_INT_ENTRY \ + NANOTIME /* edx:eax nanosecs */ ; \ + movl %eax,%gs:CPU_INT_EVENT_TIME /* save in cpu data */ ; \ + movl %edx,%gs:CPU_INT_EVENT_TIME+4 /* save in cpu data */ ; \ + movl %eax,%esi /* save timestamp */ ; \ + movl %edx,%edi /* save timestamp */ ; \ + movl %gs:CPU_PROCESSOR,%ebx /* get current processor */ ; \ + movl THREAD_TIMER(%ebx),%ecx /* get current timer */ ; \ + subl TIMER_TSTAMP(%ecx),%eax /* compute elapsed time */ ; \ + sbbl TIMER_TSTAMP+4(%ecx),%edx /* compute elapsed time */ ; \ + TIMER_UPDATE(%ecx,%edx,%eax,0) /* update timer */ ; \ + movl KERNEL_TIMER(%ebx),%ecx /* point to kernel timer */ ; \ + movl %esi,TIMER_TSTAMP(%ecx) /* set timestamp */ ; \ + movl %edi,TIMER_TSTAMP+4(%ecx) /* set timestamp */ ; \ + movl %esi,%eax /* restore timestamp */ ; \ + movl %edi,%edx /* restore timestamp */ ; \ + movl CURRENT_STATE(%ebx),%ecx /* get current state */ ; \ + pushl %ecx /* save state */ ; \ + subl TIMER_TSTAMP(%ecx),%eax /* compute elapsed time */ ; \ + sbbl TIMER_TSTAMP+4(%ecx),%edx /* compute elapsed time */ ; \ + TIMER_UPDATE(%ecx,%edx,%eax,0) /* update timer */ ; \ + leal IDLE_STATE(%ebx),%eax /* get idle state */ ; \ + cmpl %eax,%ecx /* compare current state */ ; \ + je 0f /* skip if equal */ ; \ + leal SYSTEM_STATE(%ebx),%ecx /* get system state */ ; \ + movl %ecx,CURRENT_STATE(%ebx) /* set current state */ ; \ +0: movl %esi,TIMER_TSTAMP(%ecx) /* set timestamp */ ; \ + movl %edi,TIMER_TSTAMP+4(%ecx) /* set timestamp */ + +/* + * update time on interrupt exit. + * Uses %eax,%ebx,%ecx,%edx,%esi,%edi. + * Restores processor state info from stack. 
+ */ +#define TIME_INT_EXIT \ + NANOTIME /* edx:eax nanosecs */ ; \ + movl %eax,%gs:CPU_INT_EVENT_TIME /* save in cpu data */ ; \ + movl %edx,%gs:CPU_INT_EVENT_TIME+4 /* save in cpu data */ ; \ + movl %eax,%esi /* save timestamp */ ; \ + movl %edx,%edi /* save timestamp */ ; \ + movl %gs:CPU_PROCESSOR,%ebx /* get current processor */ ; \ + movl KERNEL_TIMER(%ebx),%ecx /* point to kernel timer */ ; \ + subl TIMER_TSTAMP(%ecx),%eax /* compute elapsed time */ ; \ + sbbl TIMER_TSTAMP+4(%ecx),%edx /* compute elapsed time */ ; \ + TIMER_UPDATE(%ecx,%edx,%eax,0) /* update timer */ ; \ + movl THREAD_TIMER(%ebx),%ecx /* interrupted timer */ ; \ + movl %esi,TIMER_TSTAMP(%ecx) /* set timestamp */ ; \ + movl %edi,TIMER_TSTAMP+4(%ecx) /* set timestamp */ ; \ + movl %esi,%eax /* restore timestamp */ ; \ + movl %edi,%edx /* restore timestamp */ ; \ + movl CURRENT_STATE(%ebx),%ecx /* get current state */ ; \ + subl TIMER_TSTAMP(%ecx),%eax /* compute elapsed time */ ; \ + sbbl TIMER_TSTAMP+4(%ecx),%edx /* compute elapsed time */ ; \ + TIMER_UPDATE(%ecx,%edx,%eax,0) /* update timer */ ; \ + popl %ecx /* restore state */ ; \ + movl %ecx,CURRENT_STATE(%ebx) /* set current state */ ; \ + movl %esi,TIMER_TSTAMP(%ecx) /* set timestamp */ ; \ + movl %edi,TIMER_TSTAMP+4(%ecx) /* set timestamp */ + +#elif defined(__x86_64__) + +/* + * Nanotime returned in %rax. + * Computed from tsc based on the scale factor and an implicit 32 bit shift. + * This code must match what _rtc_nanotime_read does in + * machine_routines_asm.s. Failure to do so can + * result in "weird" timing results. + * + * Uses: %rsi, %rdi, %rdx, %rcx + */ +#define NANOTIME \ + movq %gs:CPU_NANOTIME,%rdi ; \ + PAL_RTC_NANOTIME_READ_FAST() + +/* + * Add 64-bit delta in register reg to timer pointed to by register treg. + */ +#define TIMER_UPDATE(treg,reg,offset) \ + addq reg,(offset)+TIMER_ALL(treg) /* add timer */ + +/* + * Add time delta to old timer and start new. 
+ * Uses: %rsi, %rdi, %rdx, %rcx, %rax + */ +#define TIMER_EVENT(old,new) \ + NANOTIME /* %rax := nanosecs */ ; \ + movq %rax,%rsi /* save timestamp */ ; \ + movq %gs:CPU_ACTIVE_THREAD,%rcx /* get thread */ ; \ + subq (old##_TIMER)+TIMER_TSTAMP(%rcx),%rax /* compute elapsed */; \ + TIMER_UPDATE(%rcx,%rax,old##_TIMER) /* update timer */ ; \ + leaq (new##_TIMER)(%rcx),%rcx /* point to new timer */ ; \ + movq %rsi,TIMER_TSTAMP(%rcx) /* set timestamp */ ; \ + movq %gs:CPU_PROCESSOR,%rdx /* get processor */ ; \ + movq %rcx,THREAD_TIMER(%rdx) /* set current timer */ ; \ + movq %rsi,%rax /* restore timestamp */ ; \ + subq (old##_STATE)+TIMER_TSTAMP(%rdx),%rax /* compute elapsed */; \ + TIMER_UPDATE(%rdx,%rax,old##_STATE) /* update timer */ ; \ + leaq (new##_STATE)(%rdx),%rcx /* point to new state */ ; \ + movq %rcx,CURRENT_STATE(%rdx) /* set current state */ ; \ + movq %rsi,TIMER_TSTAMP(%rcx) /* set timestamp */ + +/* + * Update time on user trap entry. + * Uses: %rsi, %rdi, %rdx, %rcx, %rax + */ +#define TIME_TRAP_UENTRY TIMER_EVENT(USER,SYSTEM) + +/* + * update time on user trap exit. + * Uses: %rsi, %rdi, %rdx, %rcx, %rax + */ +#define TIME_TRAP_UEXIT TIMER_EVENT(SYSTEM,USER) + +/* + * update time on interrupt entry. + * Uses: %rsi, %rdi, %rdx, %rcx, %rax + * Saves processor state info on stack. 
+ */ +#define TIME_INT_ENTRY \ + NANOTIME /* %rax := nanosecs */ ; \ + movq %rax,%gs:CPU_INT_EVENT_TIME /* save in cpu data */ ; \ + movq %rax,%rsi /* save timestamp */ ; \ + movq %gs:CPU_PROCESSOR,%rdx /* get processor */ ; \ + movq THREAD_TIMER(%rdx),%rcx /* get current timer */ ; \ + subq TIMER_TSTAMP(%rcx),%rax /* compute elapsed */ ; \ + TIMER_UPDATE(%rcx,%rax,0) /* update timer */ ; \ + movq KERNEL_TIMER(%rdx),%rcx /* get kernel timer */ ; \ + movq %rsi,TIMER_TSTAMP(%rcx) /* set timestamp */ ; \ + movq %rsi,%rax /* restore timestamp */ ; \ + movq CURRENT_STATE(%rdx),%rcx /* get current state */ ; \ + pushq %rcx /* save state */ ; \ + subq TIMER_TSTAMP(%rcx),%rax /* compute elapsed */ ; \ + TIMER_UPDATE(%rcx,%rax,0) /* update timer */ ; \ + leaq IDLE_STATE(%rdx),%rax /* get idle state */ ; \ + cmpq %rax,%rcx /* compare current */ ; \ + je 0f /* skip if equal */ ; \ + leaq SYSTEM_STATE(%rdx),%rcx /* get system state */ ; \ + movq %rcx,CURRENT_STATE(%rdx) /* set current state */ ; \ +0: movq %rsi,TIMER_TSTAMP(%rcx) /* set timestamp */ + +/* + * update time on interrupt exit. + * Uses: %rsi, %rdi, %rdx, %rcx, %rax + * Restores processor state info from stack. 
+ */ +#define TIME_INT_EXIT \ + NANOTIME /* %rax := nanosecs */ ; \ + movq %rax,%gs:CPU_INT_EVENT_TIME /* save in cpu data */ ; \ + movq %rax,%rsi /* save timestamp */ ; \ + movq %gs:CPU_PROCESSOR,%rdx /* get processor */ ; \ + movq KERNEL_TIMER(%rdx),%rcx /* get kernel timer */ ; \ + subq TIMER_TSTAMP(%rcx),%rax /* compute elapsed */ ; \ + TIMER_UPDATE(%rcx,%rax,0) /* update timer */ ; \ + movq THREAD_TIMER(%rdx),%rcx /* interrupted timer */ ; \ + movq %rsi,TIMER_TSTAMP(%rcx) /* set timestamp */ ; \ + movq %rsi,%rax /* restore timestamp */ ; \ + movq CURRENT_STATE(%rdx),%rcx /* get current state */ ; \ + subq TIMER_TSTAMP(%rcx),%rax /* compute elapsed */ ; \ + TIMER_UPDATE(%rcx,%rax,0) /* update timer */ ; \ + popq %rcx /* restore state */ ; \ + movq %rcx,CURRENT_STATE(%rdx) /* set current state */ ; \ + movq %rsi,TIMER_TSTAMP(%rcx) /* set timestamp */ + +#endif + +/* + * Check for vtimers for task. + * task_reg is register pointing to current task + * thread_reg is register pointing to current thread + */ +#define TASK_VTIMER_CHECK(task_reg,thread_reg) \ + cmpl $0,TASK_VTIMERS(task_reg) ; \ + jz 1f ; \ + orl $(AST_BSD),%gs:CPU_PENDING_AST /* Set pending AST */ ; \ + lock ; \ + orl $(AST_BSD),TH_AST(thread_reg) /* Set thread AST */ ; \ +1: ; \ + +#endif /* _I386_RTCLOCK_H_ */ diff --git a/osfmk/i386/rtclock.h b/osfmk/i386/rtclock_asm_native.h similarity index 67% rename from osfmk/i386/rtclock.h rename to osfmk/i386/rtclock_asm_native.h index d98b8808f..c17320b7a 100644 --- a/osfmk/i386/rtclock.h +++ b/osfmk/i386/rtclock_asm_native.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2010 Apple Inc. All rights reserved. + * Copyright (c) 2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -32,54 +32,13 @@ * @APPLE_FREE_COPYRIGHT@ */ /* - * File: rtclock.h - * Purpose: Routines for handling the machine dependent - * real-time clock. 
+ * File: rtclock_asm_native.h + * Purpose: Native routines for reading nanotime */ -#ifndef _I386_RTCLOCK_H_ -#define _I386_RTCLOCK_H_ +#ifndef _PAL_RTCLOCK_ASM_NATIVE_H_ +#define _PAL_RTCLOCK_ASM_NATIVE_H_ -#ifndef ASSEMBLER -typedef struct rtc_nanotime { - volatile uint64_t tsc_base; /* timestamp */ - volatile uint64_t ns_base; /* nanoseconds */ - uint32_t scale; /* tsc -> nanosec multiplier */ - uint32_t shift; /* tsc -> nanosec shift/div */ - /* shift is overloaded with - * lower 32bits of tsc_freq - * on slower machines (SLOW_TSC_THRESHOLD) */ - volatile uint32_t generation; /* 0 == being updated */ - uint32_t spare1; -} rtc_nanotime_t; - -#if 0 -#include -#endif - -struct cpu_data; - -extern uint64_t tsc_rebase_abs_time; - -extern void _rtc_nanotime_store( - uint64_t tsc, - uint64_t nsec, - uint32_t scale, - uint32_t shift, - rtc_nanotime_t *dst); - -extern void _rtc_nanotime_adjust( - uint64_t tsc_base_delta, - rtc_nanotime_t *dst); - -extern uint64_t _rtc_nanotime_read( - rtc_nanotime_t *rntp, - int slow); - -extern rtc_nanotime_t rtc_nanotime_info; -#endif - -#define SLOW_TSC_THRESHOLD 1000067800 /* TSC is too slow for regular nanotime() algorithm */ #if defined(__i386__) /* @@ -87,7 +46,7 @@ extern rtc_nanotime_t rtc_nanotime_info; * %edi points to nanotime info struct * %edx:%eax returns nanotime */ -#define RTC_NANOTIME_READ_FAST() \ +#define PAL_RTC_NANOTIME_READ_FAST() \ 0: movl RNT_GENERATION(%edi),%esi /* being updated? */ ; \ testl %esi,%esi ; \ jz 0b /* wait until done */ ; \ @@ -116,10 +75,10 @@ extern rtc_nanotime_t rtc_nanotime_info; * %rdi points to nanotime info struct. * %rax returns nanotime */ -#define RTC_NANOTIME_READ_FAST() \ +#define PAL_RTC_NANOTIME_READ_FAST() \ 0: movl RNT_GENERATION(%rdi),%esi ; \ - test %esi,%esi /* info updating? */ ; \ - jz 0b /* - wait if so */ ; \ + test %esi,%esi /* info updating? 
*/ ; \ + jz 0b /* - wait if so */ ; \ lfence ; \ rdtsc ; \ lfence ; \ @@ -134,6 +93,7 @@ extern rtc_nanotime_t rtc_nanotime_info; cmpl RNT_GENERATION(%rdi),%esi /* repeat if changed */ ; \ jne 0b -#endif +#endif /* !defined(x86_64) */ + -#endif /* _I386_RTCLOCK_H_ */ +#endif /* _PAL_RTCLOCK_ASM_NATIVE_H_ */ diff --git a/osfmk/i386/rtclock_native.c b/osfmk/i386/rtclock_native.c new file mode 100644 index 000000000..5ffaf91d8 --- /dev/null +++ b/osfmk/i386/rtclock_native.c @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * @OSF_COPYRIGHT@ + */ + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static uint64_t rtc_decrementer_min; +static uint64_t rtc_decrementer_max; + +static uint64_t +deadline_to_decrementer( + uint64_t deadline, + uint64_t now) +{ + uint64_t delta; + + if (deadline <= now) + return rtc_decrementer_min; + else { + delta = deadline - now; + return MIN(MAX(rtc_decrementer_min,delta),rtc_decrementer_max); + } +} + +static inline uint64_t +_absolutetime_to_tsc(uint64_t ns) +{ + uint32_t generation; + uint64_t tsc; + + do { + generation = pal_rtc_nanotime_info.generation; + tsc = tmrCvt(ns - pal_rtc_nanotime_info.ns_base, tscFCvtn2t) + + pal_rtc_nanotime_info.tsc_base; + } while (generation == 0 || + generation != pal_rtc_nanotime_info.generation); + + return tsc; +} + +/* + * Regular local APIC timer case: + */ +static void +rtc_lapic_config_timer(void) +{ + lapic_config_timer(TRUE, one_shot, divide_by_1); +} +static uint64_t +rtc_lapic_set_timer(uint64_t deadline, uint64_t now) +{ + uint64_t count; + uint64_t set = 0; + + if (deadline > 0) { + /* + * Convert delta to bus ticks + * - time now is not relevant + */ + count = deadline_to_decrementer(deadline, now); + set = now + count; + lapic_set_timer_fast((uint32_t) tmrCvt(count, busFCvtn2t)); + } else { + lapic_set_timer(FALSE, one_shot, divide_by_1, 0); + } + return set; +} + +/* + * TSC-deadline timer case: + */ +static void +rtc_lapic_config_tsc_deadline_timer(void) +{ + lapic_config_tsc_deadline_timer(); +} +static uint64_t +rtc_lapic_set_tsc_deadline_timer(uint64_t deadline, uint64_t now) +{ + uint64_t set = 0; + + if (deadline > 0) { + /* + * Convert to TSC + */ + set = now + deadline_to_decrementer(deadline, now); + lapic_set_tsc_deadline_timer(_absolutetime_to_tsc(set)); + } else { + 
lapic_set_tsc_deadline_timer(0); + } + + KERNEL_DEBUG_CONSTANT( + DECR_SET_TSC_DEADLINE | DBG_FUNC_NONE, + now, deadline, + rdtsc64(), lapic_get_tsc_deadline_timer(), + 0); + + return set; +} + +/* + * Definitions for timer operations table + */ + +rtc_timer_t rtc_timer_lapic = { + rtc_lapic_config_timer, + rtc_lapic_set_timer +}; + +rtc_timer_t rtc_timer_tsc_deadline = { + rtc_lapic_config_tsc_deadline_timer, + rtc_lapic_set_tsc_deadline_timer +}; + +rtc_timer_t *rtc_timer = &rtc_timer_lapic; /* defaults to LAPIC timer */ + +/* + * rtc_timer_init() is called at startup on the boot processor only. + */ +void +rtc_timer_init(void) +{ + int TSC_deadline_timer = 0; + + /* See whether we can use the local apic in TSC-deadline mode */ + if ((cpuid_features() & CPUID_FEATURE_TSCTMR)) { + TSC_deadline_timer = 1; + PE_parse_boot_argn("TSC_deadline_timer", &TSC_deadline_timer, + sizeof(TSC_deadline_timer)); + printf("TSC Deadline Timer supported %s enabled\n", + TSC_deadline_timer ? "and" : "but not"); + } + + if (TSC_deadline_timer) { + rtc_timer = &rtc_timer_tsc_deadline; + rtc_decrementer_max = UINT64_MAX; /* effectively none */ + /* + * The min could be as low as 1nsec, + * but we're being conservative for now and making it the same + * as for the local apic timer. + */ + rtc_decrementer_min = 1*NSEC_PER_USEC; /* 1 usec */ + } else { + /* + * Compute the longest interval using LAPIC timer. + */ + rtc_decrementer_max = tmrCvt(0x7fffffffULL, busFCvtt2n); + kprintf("maxDec: %lld\n", rtc_decrementer_max); + rtc_decrementer_min = 1*NSEC_PER_USEC; /* 1 usec */ + } + + /* Point LAPIC interrupts to hardclock() */ + lapic_set_timer_func((i386_intr_func_t) rtclock_intr); +} diff --git a/osfmk/ppc/rtclock.h b/osfmk/i386/rtclock_protos.h similarity index 64% rename from osfmk/ppc/rtclock.h rename to osfmk/i386/rtclock_protos.h index 77f287ead..2d944765d 100644 --- a/osfmk/ppc/rtclock.h +++ b/osfmk/i386/rtclock_protos.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2008 Apple Inc. 
All rights reserved. + * Copyright (c) 2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -32,30 +32,37 @@ * @APPLE_FREE_COPYRIGHT@ */ /* - * File: rtclock.h - * Purpose: Routines for handling the machine dependent + * File: rtclock_protos.h + * Purpose: C Routines for handling the machine dependent * real-time clock. */ -#ifndef _PPC_RTCLOCK_H_ -#define _PPC_RTCLOCK_H_ +#ifndef _I386_RTCLOCK_PROTOS_H_ +#define _I386_RTCLOCK_PROTOS_H_ -#include +typedef struct pal_rtc_nanotime pal_rtc_nanotime_t; +extern uint64_t tsc_rebase_abs_time; -#define EndOfAllTime 0xFFFFFFFFFFFFFFFFULL +extern void _rtc_nanotime_adjust( + uint64_t tsc_base_delta, + pal_rtc_nanotime_t *dst); -extern void rtclock_intr(struct savearea *ssp); +extern uint64_t _rtc_nanotime_read( + pal_rtc_nanotime_t *rntp, + int slow); -#pragma pack(push,4) -struct rtclock_timer_t { - queue_head_t queue; - uint64_t deadline; - uint32_t - /*boolean_t*/ is_set:1, - has_expired:1, - :0; -}; -#pragma pack(pop) -typedef struct rtclock_timer_t rtclock_timer_t; +extern void rtclock_intr(x86_saved_state_t *regs); -#endif /* _PPC_RTCLOCK_H_ */ + +/* + * Timer control. 
+ */ +typedef struct { + void (*config)(void); + uint64_t (*set) (uint64_t, uint64_t); +} rtc_timer_t; +extern rtc_timer_t *rtc_timer; + +extern void rtc_timer_init(void); + +#endif /* _I386_RTCLOCK_PROTOS_H_ */ diff --git a/osfmk/i386/seg.h b/osfmk/i386/seg.h index 89643edf2..df191c5d1 100644 --- a/osfmk/i386/seg.h +++ b/osfmk/i386/seg.h @@ -220,7 +220,6 @@ extern struct i386_tss master_mctss; extern void mc_task_start(void); #if MACH_KDB -extern char db_stack_store[]; extern char db_task_stack_store[]; extern struct i386_tss master_dbtss; extern void db_task_start(void); @@ -345,9 +344,10 @@ __END_DECLS #endif #ifdef __i386__ +#if !defined(USER_WINDOW_SEL) #define USER_WINDOW_SEL 0x70 /* 14: window for copyin/copyout */ #define PHYS_WINDOW_SEL 0x78 /* 15: window for copyin/copyout */ - +#endif #define KERNEL64_CS 0x80 /* 16: kernel 64-bit code */ #define KERNEL64_SS 0x88 /* 17: kernel 64-bit (syscall) stack */ #else // __x86_64__ diff --git a/osfmk/i386/serial_io.h b/osfmk/i386/serial_io.h index 58e75e4a3..1640256a2 100644 --- a/osfmk/i386/serial_io.h +++ b/osfmk/i386/serial_io.h @@ -37,6 +37,8 @@ #include -int serial_init(void); +int serial_init(void); +void serial_putc(char); +int serial_getc(void); #endif /* _I386_SERIAL_IO_H_ */ diff --git a/osfmk/i386/simple_lock.h b/osfmk/i386/simple_lock.h index fb30ba83f..563c17739 100644 --- a/osfmk/i386/simple_lock.h +++ b/osfmk/i386/simple_lock.h @@ -72,9 +72,6 @@ #if defined(MACH_KERNEL_PRIVATE) && defined(__APPLE_API_PRIVATE) #include #include -#endif - -#if defined(MACH_KERNEL_PRIVATE) && defined(__APPLE_API_PRIVATE) #if MACH_LDEBUG #define USLOCK_DEBUG 1 @@ -85,19 +82,21 @@ typedef struct uslock_debug { void *lock_pc; /* pc where lock operation began */ void *lock_thread; /* thread that acquired lock */ + void *unlock_thread; /* last thread to release lock */ + void *unlock_pc; /* pc where lock operation ended */ unsigned long duration[2]; unsigned short state; unsigned char lock_cpu; - void *unlock_thread; /* last 
thread to release lock */ unsigned char unlock_cpu; - void *unlock_pc; /* pc where lock operation ended */ } uslock_debug; typedef struct slock { hw_lock_data_t interlock; /* must be first... see lock.c */ +#if USLOCK_DEBUG unsigned short lock_type; /* must be second... see lock.c */ #define USLOCK_TAG 0x5353 uslock_debug debug; +#endif } usimple_lock_data_t, *usimple_lock_t; extern void i386_lock_unlock_with_flush( diff --git a/osfmk/i386/start.s b/osfmk/i386/start.s index b0ba8110a..5472ffde3 100644 --- a/osfmk/i386/start.s +++ b/osfmk/i386/start.s @@ -65,7 +65,6 @@ #define CX(addr,reg) addr(,reg,4) -#include #include #include @@ -82,7 +81,7 @@ EXT(low_intstack): .globl EXT(gIOHibernateRestoreStack) EXT(gIOHibernateRestoreStack): - .set ., .+INTSTACK_SIZE + .space INTSTACK_SIZE .globl EXT(low_eintstack) EXT(low_eintstack:) @@ -110,7 +109,7 @@ LEXT(gdtptr) .align 12 .globl EXT(df_task_stack) EXT(df_task_stack): - .set ., .+INTSTACK_SIZE + .space INTSTACK_SIZE .globl EXT(df_task_stack_end) EXT(df_task_stack_end): @@ -121,38 +120,22 @@ EXT(df_task_stack_end): .align 12 .globl EXT(mc_task_stack) EXT(mc_task_stack): - .set ., .+INTSTACK_SIZE + .space INTSTACK_SIZE .globl EXT(mc_task_stack_end) EXT(mc_task_stack_end): #if MACH_KDB -/* - * Kernel debugger stack for each processor. - */ - .align 12 - .globl EXT(db_stack_store) -EXT(db_stack_store): - .set ., .+(INTSTACK_SIZE*MAX_CPUS) - /* * Stack for last-ditch debugger task for each processor. */ .align 12 .globl EXT(db_task_stack_store) EXT(db_task_stack_store): - .set ., .+(INTSTACK_SIZE*MAX_CPUS) + .space (INTSTACK_SIZE*MAX_CPUS) -/* - * per-processor kernel debugger stacks - */ - .align ALIGN - .globl EXT(kgdb_stack_store) -EXT(kgdb_stack_store): - .set ., .+(INTSTACK_SIZE*MAX_CPUS) #endif /* MACH_KDB */ - /* * BSP CPU start here. 
* eax points to kernbootstruct @@ -229,7 +212,7 @@ LEXT(slave_pstart) mov $EXT(mp_slave_stack)+PAGE_SIZE, %esp; jmp paging - + /* Code to get from real mode to protected mode */ #define operand_size_prefix .byte 0x66 diff --git a/osfmk/i386/startup64.c b/osfmk/i386/startup64.c index c85bf1955..b445882cd 100644 --- a/osfmk/i386/startup64.c +++ b/osfmk/i386/startup64.c @@ -264,11 +264,7 @@ dump_frame64(x86_saved_state64_t *sp) kprintf("%p: 0x%016llx\n", ip, *ip); kprintf("sp->isf.trapno: 0x%08x\n", sp->isf.trapno); -#ifdef __i386__ - kprintf("sp->isf.trapfn: 0x%08x\n", sp->isf.trapfn); -#else kprintf("sp->isf.trapfn: 0x%016llx\n", sp->isf.trapfn); -#endif kprintf("sp->isf.err: 0x%016llx\n", sp->isf.err); kprintf("sp->isf.rip: 0x%016llx\n", sp->isf.rip); kprintf("sp->isf.cs: 0x%016llx\n", sp->isf.cs); diff --git a/osfmk/i386/thread.h b/osfmk/i386/thread.h index faab785af..bbccc7832 100644 --- a/osfmk/i386/thread.h +++ b/osfmk/i386/thread.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -80,34 +80,7 @@ #include - -/* - * i386_saved_state: - * - * Has been exported to servers. See: mach/i386/thread_status.h - * - * This structure corresponds to the state of user registers - * as saved upon kernel entry. It lives in the pcb. - * It is also pushed onto the stack for exceptions in the kernel. - * For performance, it is also used directly in syscall exceptions - * if the server has requested i386_THREAD_STATE flavor for the exception - * port. - */ - -/* - * Save area for user floating-point state. - * Allocated only when necessary. 
- */ - -typedef enum { - FXSAVE32 = 1, - FXSAVE64 = 2, - XSAVE32 = 3, - XSAVE64 = 4, - FP_UNUSED = 5 - } fp_save_layout_t; - - +#include /* * x86_kernel_state: @@ -118,92 +91,89 @@ typedef enum { #ifdef __i386__ struct x86_kernel_state { - int k_ebx; /* kernel context */ - int k_esp; - int k_ebp; - int k_edi; - int k_esi; - int k_eip; + uint32_t k_ebx; /* kernel context */ + uint32_t k_esp; + uint32_t k_ebp; + uint32_t k_edi; + uint32_t k_esi; + uint32_t k_eip; /* - * Kernel stacks are 16-byte aligned with a 4-byte i386_exception_link at - * the top, followed by an x86_kernel_state. After both structs have - * been pushed, we want to be 16-byte aligned. A dummy int gets us there. + * Kernel stacks are 16-byte aligned with x86_kernel_state at the top, + * so we need a couple of dummy 32-bit words here. */ - int dummy; + uint32_t dummy[2]; }; #else struct x86_kernel_state { - unsigned long k_rbx; /* kernel context */ - unsigned long k_rsp; - unsigned long k_rbp; - unsigned long k_r12; - unsigned long k_r13; - unsigned long k_r14; - unsigned long k_r15; - unsigned long k_rip; - unsigned long dummy; + uint64_t k_rbx; /* kernel context */ + uint64_t k_rsp; + uint64_t k_rbp; + uint64_t k_r12; + uint64_t k_r13; + uint64_t k_r14; + uint64_t k_r15; + uint64_t k_rip; }; #endif -typedef struct pcb { - void *sf; - x86_saved_state_t *iss; - void *ifps; -#ifdef MACH_BSD - uint64_t cthread_self; /* for use of cthread package */ - struct real_descriptor cthread_desc; - unsigned long uldt_selector; /* user ldt selector to set */ - struct real_descriptor uldt_desc; /* the actual user setable ldt data */ -#endif - decl_simple_lock_data(,lock); - uint64_t iss_pte0; - uint64_t iss_pte1; - void *ids; - uint32_t arg_store_valid; -} *pcb_t; - /* * Maps state flavor to number of words in the state: */ __private_extern__ unsigned int _MachineStateCount[]; -#define USER_STATE(ThrAct) ((ThrAct)->machine.pcb->iss) -#define USER_REGS32(ThrAct) (saved_state32(USER_STATE(ThrAct))) -#define 
USER_REGS64(ThrAct) (saved_state64(USER_STATE(ThrAct))) - -#define user_pc(ThrAct) (is_saved_state32(USER_STATE(ThrAct)) ? \ - USER_REGS32(ThrAct)->eip : \ - USER_REGS64(ThrAct)->isf.rip ) - - +/* + * The machine-dependent thread state - registers and all platform-dependent + * state - is saved in the machine thread structure which is embedded in + * the thread data structure. For historical reasons this is also referred to + * as the PCB. + */ struct machine_thread { - /* - * pointer to process control block - * (actual storage may as well be here, too) - */ - struct pcb xxx_pcb; - pcb_t pcb; + void *sf; + x86_saved_state_t *iss; + void *ifps; + void *ids; + decl_simple_lock_data(,lock); /* protects ifps and ids */ + uint64_t iss_pte0; + uint64_t iss_pte1; + uint32_t arg_store_valid; +#ifdef MACH_BSD + uint64_t cthread_self; /* for use of cthread package */ + struct real_descriptor cthread_desc; + unsigned long uldt_selector; /* user ldt selector to set */ + struct real_descriptor uldt_desc; /* actual user setable ldt */ +#endif - uint32_t specFlags; + struct pal_pcb pal_pcb; + + uint32_t specFlags; #define OnProc 0x1 #define CopyIOActive 0x2 /* Checked to ensure DTrace actions do not re-enter copyio(). */ #if NCOPY_WINDOWS > 0 - struct { user_addr_t user_base; } copy_window[NCOPY_WINDOWS]; - int nxt_window; - int copyio_state; + int nxt_window; + int copyio_state; #define WINDOWS_DIRTY 0 #define WINDOWS_CLEAN 1 #define WINDOWS_CLOSED 2 #define WINDOWS_OPENED 3 - uint64_t physwindow_pte; - int physwindow_busy; + uint64_t physwindow_pte; + int physwindow_busy; #endif }; +typedef struct machine_thread *pcb_t; +#define THREAD_TO_PCB(Thr) (&(Thr)->machine) + +#define USER_STATE(Thr) ((Thr)->machine.iss) +#define USER_REGS32(Thr) (saved_state32(USER_STATE(Thr))) +#define USER_REGS64(Thr) (saved_state64(USER_STATE(Thr))) + +#define user_pc(Thr) (is_saved_state32(USER_STATE(Thr)) ? 
\ + USER_REGS32(Thr)->eip : \ + USER_REGS64(Thr)->isf.rip ) extern void *get_user_regs(thread_t); @@ -211,33 +181,19 @@ extern void *act_thread_csave(void); extern void act_thread_catt(void *ctx); extern void act_thread_cfree(void *ctx); -/* - * i386_exception_link: - * - * This structure lives at the high end of the kernel stack. - * It points to the current thread`s user registers. - */ -struct i386_exception_link { - x86_saved_state_t *saved_state; -}; - /* * On the kernel stack is: * stack: ... - * struct i386_exception_link (pointer to user state) * struct x86_kernel_state * stack+kernel_stack_size */ #define STACK_IKS(stack) \ ((struct x86_kernel_state *)((stack) + kernel_stack_size) - 1) -#define STACK_IEL(stack) \ - ((struct i386_exception_link *)STACK_IKS(stack) - 1) /* - * Return the current stack depth - * including x86_kernel_state and i386_exception_link + * Return the current stack depth including x86_kernel_state */ static inline vm_offset_t current_stack_depth(void) @@ -253,7 +209,6 @@ current_stack_depth(void) #endif return (current_cpu_datap()->cpu_kernel_stack + sizeof(struct x86_kernel_state) - + sizeof(struct i386_exception_link *) - stack_ptr); } @@ -263,11 +218,4 @@ current_stack_depth(void) */ #define GET_RETURN_PC(addr) (__builtin_return_address(0)) -/* - * Defining this indicates that MD code will supply an exception() - * routine, conformant with kern/exception.c (dependency alert!) - * but which does wonderfully fast, machine-dependent magic. 
- */ -#define MACHINE_FAST_EXCEPTION 1 - #endif /* _I386_THREAD_H_ */ diff --git a/osfmk/i386/trap.c b/osfmk/i386/trap.c index 07b3cf479..55be4fc75 100644 --- a/osfmk/i386/trap.c +++ b/osfmk/i386/trap.c @@ -118,8 +118,10 @@ #include -extern void throttle_lowpri_io(boolean_t); +#include +extern void throttle_lowpri_io(int); +extern void kprint_state(x86_saved_state64_t *saved_state); /* * Forward declarations @@ -128,14 +130,13 @@ static void user_page_fault_continue(kern_return_t kret); #ifdef __i386__ static void panic_trap(x86_saved_state32_t *saved_state); static void set_recovery_ip(x86_saved_state32_t *saved_state, vm_offset_t ip); -static void panic_64(x86_saved_state_t *, int, const char *, boolean_t); +extern void panic_64(x86_saved_state_t *, int, const char *, boolean_t); #else static void panic_trap(x86_saved_state64_t *saved_state); static void set_recovery_ip(x86_saved_state64_t *saved_state, vm_offset_t ip); #endif volatile perfCallback perfTrapHook = NULL; /* Pointer to CHUD trap hook routine */ -volatile perfCallback perfASTHook = NULL; /* Pointer to CHUD AST hook routine */ #if CONFIG_DTRACE /* See */ @@ -152,6 +153,7 @@ thread_syscall_return( boolean_t is_mach; int code; + pal_register_cache_state(thr_act, DIRTY); if (thread_is_64bit(thr_act)) { x86_saved_state64_t *regs; @@ -222,6 +224,7 @@ thread_kdb_return(void) thread_t thr_act = current_thread(); x86_saved_state_t *iss = USER_STATE(thr_act); + pal_register_cache_state(thr_act, DIRTY); if (is_saved_state64(iss)) { x86_saved_state64_t *regs; @@ -247,16 +250,13 @@ thread_kdb_return(void) #endif /* MACH_KDB */ -void +static inline void user_page_fault_continue( kern_return_t kr) { thread_t thread = current_thread(); - ast_t *myast; - boolean_t intr; user_addr_t vaddr; - #if MACH_KDB x86_saved_state_t *regs = USER_STATE(thread); int err; @@ -288,7 +288,7 @@ user_page_fault_continue( vaddr = uregs->cr2; } - if ((kr == KERN_SUCCESS) || (kr == KERN_ABORTED)) { + if (__probable((kr == KERN_SUCCESS) || 
(kr == KERN_ABORTED))) { #if MACH_KDB if (!db_breakpoints_inserted) { db_set_breakpoints(); @@ -301,15 +301,6 @@ user_page_fault_continue( saved_state32(regs))) kdb_trap(T_WATCHPOINT, 0, saved_state32(regs)); #endif /* MACH_KDB */ - intr = ml_set_interrupts_enabled(FALSE); - myast = ast_pending(); - while (*myast & AST_ALL) { - ast_taken(AST_ALL, intr); - ml_set_interrupts_enabled(FALSE); - myast = ast_pending(); - } - ml_set_interrupts_enabled(intr); - thread_exception_return(); /*NOTREACHED*/ } @@ -322,6 +313,8 @@ user_page_fault_continue( } #endif /* MACH_KDB */ + /* PAL debug hook */ + pal_dbg_page_fault( thread, vaddr, kr ); i386_exception(EXC_BAD_ACCESS, kr, vaddr); /*NOTREACHED*/ @@ -341,9 +334,11 @@ extern struct recovery recover_table_end[]; const char * trap_type[] = {TRAP_NAMES}; unsigned TRAP_TYPES = sizeof(trap_type)/sizeof(trap_type[0]); +extern void PE_incoming_interrupt(int interrupt); + #if defined(__x86_64__) && DEBUG -static void -print_state(x86_saved_state64_t *saved_state) +void +kprint_state(x86_saved_state64_t *saved_state) { kprintf("current_cpu_datap() 0x%lx\n", (uintptr_t)current_cpu_datap()); kprintf("Current GS base MSR 0x%llx\n", rdmsr64(MSR_IA32_GS_BASE)); @@ -385,21 +380,9 @@ print_state(x86_saved_state64_t *saved_state) kprintf(" isf.rsp 0x%llx\n", saved_state->isf.rsp); kprintf(" isf.ss 0x%llx\n", saved_state->isf.ss); } -/* - * K64 debug - fatal handler for debug code in the trap vectors. - */ -extern void -panic_idt64(x86_saved_state_t *rsp); -void -panic_idt64(x86_saved_state_t *rsp) -{ - print_state(saved_state64(rsp)); - panic("panic_idt64"); -} #endif - /* * Non-zero indicates latency assert is enabled and capped at valued * absolute time units. 
@@ -442,9 +425,6 @@ void interrupt_populate_latency_stats(char *buf, unsigned bufsize) { if (tcpu < real_ncpus) snprintf(buf, bufsize, "0x%x 0x%x 0x%llx", tcpu, cpu_data_ptr[tcpu]->cpu_max_observed_int_latency_vector, cpu_data_ptr[tcpu]->cpu_max_observed_int_latency); } - - -extern void PE_incoming_interrupt(int interrupt); /* * Handle interrupts: @@ -458,6 +438,7 @@ interrupt(x86_saved_state_t *state) uint64_t rsp; int interrupt_num; boolean_t user_mode = FALSE; + int ipl; int cnum = cpu_number(); if (is_saved_state64(state) == TRUE) { @@ -484,27 +465,34 @@ interrupt(x86_saved_state_t *state) KERNEL_DEBUG_CONSTANT( MACHDBG_CODE(DBG_MACH_EXCP_INTR, 0) | DBG_FUNC_START, - interrupt_num, (long) rip, user_mode, 0, 0); + interrupt_num, rip, user_mode, 0, 0); + + SCHED_STATS_INTERRUPT(current_processor()); + + ipl = get_preemption_level(); /* * Handle local APIC interrupts * else call platform expert for devices. - */ - if (!lapic_interrupt(interrupt_num, state)) { + */ + if (!lapic_interrupt(interrupt_num, state)) PE_incoming_interrupt(interrupt_num); + + if (__improbable(get_preemption_level() != ipl)) { + panic("Preemption level altered by interrupt vector 0x%x: initial 0x%x, final: 0x%x\n", interrupt_num, ipl, get_preemption_level()); } KERNEL_DEBUG_CONSTANT( MACHDBG_CODE(DBG_MACH_EXCP_INTR, 0) | DBG_FUNC_END, - 0, 0, 0, 0, 0); + interrupt_num, 0, 0, 0, 0); if (cpu_data_ptr[cnum]->cpu_nested_istack) { cpu_data_ptr[cnum]->cpu_nested_istack_events++; } - else { + else { uint64_t int_latency = mach_absolute_time() - cpu_data_ptr[cnum]->cpu_int_event_time; if (ilat_assert && (int_latency > interrupt_latency_cap) && !machine_timeout_suspended()) { - panic("Interrupt vector 0x%x exceeded interrupt latency threshold, 0x%llx absolute time delta, prior signals: 0x%x", interrupt_num, int_latency, cpu_data_ptr[cnum]->cpu_prior_signals); + panic("Interrupt vector 0x%x exceeded interrupt latency threshold, 0x%llx absolute time delta, prior signals: 0x%x, current signals: 0x%x", 
interrupt_num, int_latency, cpu_data_ptr[cnum]->cpu_prior_signals, cpu_data_ptr[cnum]->cpu_signals); } if (int_latency > cpu_data_ptr[cnum]->cpu_max_observed_int_latency) { cpu_data_ptr[cnum]->cpu_max_observed_int_latency = int_latency; @@ -512,7 +500,6 @@ interrupt(x86_saved_state_t *state) } } - /* * Having serviced the interrupt first, look at the interrupted stack depth. */ @@ -550,7 +537,8 @@ unsigned kdp_has_active_watchpoints = 0; void kernel_trap( - x86_saved_state_t *state) + x86_saved_state_t *state, + uintptr_t *lo_spp) { #ifdef __i386__ x86_saved_state32_t *saved_state; @@ -579,19 +567,28 @@ kernel_trap( thread = current_thread(); #ifdef __i386__ - if (is_saved_state64(state)) { + if (__improbable(is_saved_state64(state))) { panic_64(state, 0, "Kernel trap with 64-bit state", FALSE); } + saved_state = saved_state32(state); + + /* Record cpu where state was captured (trampolines don't set this) */ + saved_state->cpu = cpu_number(); + vaddr = (user_addr_t)saved_state->cr2; type = saved_state->trapno; code = saved_state->err & 0xffff; intr = (saved_state->efl & EFL_IF) != 0; /* state of ints at trap */ kern_ip = (vm_offset_t)saved_state->eip; #else - if (is_saved_state32(state)) + if (__improbable(is_saved_state32(state))) panic("kernel_trap(%p) with 32-bit state", state); saved_state = saved_state64(state); + + /* Record cpu where state was captured */ + saved_state->isf.cpu = cpu_number(); + vaddr = (user_addr_t)saved_state->cr2; type = saved_state->isf.trapno; code = (int)(saved_state->isf.err & 0xffff); @@ -601,18 +598,18 @@ kernel_trap( myast = ast_pending(); - perfCallback fn = perfASTHook; - if (fn) { + perfASTCallback astfn = perfASTHook; + if (__improbable(astfn != NULL)) { if (*myast & AST_CHUD_ALL) - fn(type, NULL, 0, 0); + astfn(AST_CHUD_ALL, myast); } else *myast &= ~AST_CHUD_ALL; /* * Is there a hook? 
*/ - fn = perfTrapHook; - if (fn) { + perfCallback fn = perfTrapHook; + if (__improbable(fn != NULL)) { if (fn(type, NULL, 0, 0) == KERN_SUCCESS) { /* * If it succeeds, we are done... @@ -622,8 +619,8 @@ kernel_trap( } #if CONFIG_DTRACE - if (tempDTraceTrapHook) { - if (tempDTraceTrapHook(type, state, 0, 0) == KERN_SUCCESS) { + if (__improbable(tempDTraceTrapHook != NULL)) { + if (tempDTraceTrapHook(type, state, lo_spp, 0) == KERN_SUCCESS) { /* * If it succeeds, we are done... */ @@ -637,7 +634,7 @@ kernel_trap( * on preemption below. but we do want to re-enable interrupts * as soon we possibly can to hold latency down */ - if (T_PREEMPT == type) { + if (__improbable(T_PREEMPT == type)) { ast_taken(AST_PREEMPTION, FALSE); KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_EXCP_KTRAP_x86, type)) | DBG_FUNC_NONE, @@ -651,7 +648,7 @@ kernel_trap( */ map = kernel_map; - if (thread != THREAD_NULL && thread->map != kernel_map) { + if (__probable(thread != THREAD_NULL && thread->map != kernel_map)) { #if NCOPY_WINDOWS > 0 vm_offset_t copy_window_base; vm_offset_t kvaddr; @@ -665,11 +662,11 @@ kernel_trap( * we only need to look at the window * associated with this processor */ - copy_window_base = current_cpu_datap()->cpu_copywindow_base; + copy_window_base = current_cpu_datap()->cpu_copywindow_base; if (kvaddr >= copy_window_base && kvaddr < (copy_window_base + (NBPDE * NCOPY_WINDOWS)) ) { - window_index = (kvaddr - copy_window_base) / NBPDE; + window_index = (int)((kvaddr - copy_window_base) / NBPDE); if (thread->machine.copy_window[window_index].user_base != (user_addr_t)-1) { @@ -693,8 +690,9 @@ kernel_trap( */ if (no_shared_cr3 && (thread->machine.specFlags&CopyIOActive) && - map->pmap->pm_cr3 != get_cr3()) { - set_cr3(map->pmap->pm_cr3); + map->pmap->pm_cr3 != get_cr3_base()) { + pmap_assert(current_cpu_datap()->cpu_pmap_pcid_enabled == FALSE); + set_cr3_raw(map->pmap->pm_cr3); return; } } @@ -782,7 +780,7 @@ kernel_trap( #endif /* MACH_KDB */ #if CONFIG_DTRACE - if 
(thread->options & TH_OPT_DTRACE) { /* Executing under dtrace_probe? */ + if (thread != THREAD_NULL && thread->options & TH_OPT_DTRACE) { /* Executing under dtrace_probe? */ if (dtrace_tally_fault(vaddr)) { /* Should a fault under dtrace be ignored? */ /* * DTrace has "anticipated" the possibility of this fault, and has @@ -815,33 +813,9 @@ kernel_trap( if (result == KERN_SUCCESS) { #if NCOPY_WINDOWS > 0 if (fault_in_copy_window != -1) { - pt_entry_t *updp; - pt_entry_t *kpdp; - - /* - * in case there was no page table assigned - * for the user base address and the pmap - * got 'expanded' due to this fault, we'll - * copy in the descriptor - * - * we're either setting the page table descriptor - * to the same value or it was 0... no need - * for a TLB flush in either case - */ - - ml_set_interrupts_enabled(FALSE); - updp = pmap_pde(map->pmap, thread->machine.copy_window[fault_in_copy_window].user_base); - assert(updp); - if (0 == updp) panic("trap: updp 0"); /* XXX DEBUG */ - kpdp = current_cpu_datap()->cpu_copywindow_pdp; - kpdp += fault_in_copy_window; - -#if JOE_DEBUG - if (*kpdp && (*kpdp & PG_FRAME) != (*updp & PG_FRAME)) - panic("kernel_fault: user pdp doesn't match - updp = 0x%qx, kpdp = 0x%qx\n", *updp, *kpdp); -#endif - pmap_store_pte(kpdp, *updp); - + ml_set_interrupts_enabled(FALSE); + copy_window_fault(thread, map, + fault_in_copy_window); (void) ml_set_interrupts_enabled(intr); } #endif /* NCOPY_WINDOWS > 0 */ @@ -855,9 +829,6 @@ kernel_trap( #endif /* CONFIG_DTRACE */ case T_GENERAL_PROTECTION: -#if defined(__x86_64__) && DEBUG - print_state(saved_state); -#endif /* * If there is a failure recovery address * for this fault, go there. @@ -872,7 +843,7 @@ kernel_trap( /* * Check thread recovery address also. */ - if (thread->recover) { + if (thread != THREAD_NULL && thread->recover) { set_recovery_ip(saved_state, thread->recover); thread->recover = 0; return; @@ -883,7 +854,6 @@ kernel_trap( * * fall through... 
*/ - default: /* * Exception 15 is reserved but some chips may generate it @@ -893,6 +863,9 @@ kernel_trap( kprintf("kernel_trap() ignoring spurious trap 15\n"); return; } +#if defined(__x86_64__) && DEBUG + kprint_state(saved_state); +#endif debugger_entry: /* Ensure that the i386_kernel_state at the base of the * current thread's stack (if any) is synchronized with the @@ -923,7 +896,7 @@ kernel_trap( } #endif } - + __asm__ volatile("cli":::"cc"); panic_trap(saved_state); /* * NO RETURN @@ -951,10 +924,10 @@ static void panic_trap(x86_saved_state32_t *regs) { const char *trapname = "Unknown"; - uint32_t cr0 = get_cr0(); - uint32_t cr2 = get_cr2(); - uint32_t cr3 = get_cr3(); - uint32_t cr4 = get_cr4(); + pal_cr_t cr0, cr2, cr3, cr4; + + pal_get_control_registers( &cr0, &cr2, &cr3, &cr4 ); + /* * Issue an I/O port read if one has been requested - this is an * event logic analyzers can use as a trigger point. @@ -977,7 +950,7 @@ panic_trap(x86_saved_state32_t *regs) regs->eip, regs->trapno, trapname, cr0, cr2, cr3, cr4, regs->eax,regs->ebx,regs->ecx,regs->edx, regs->cr2,regs->ebp,regs->esi,regs->edi, - regs->efl,regs->eip,regs->cs, regs->ds, regs->err); + regs->efl,regs->eip,regs->cs & 0xFFFF, regs->ds & 0xFFFF, regs->err); /* * This next statement is not executed, * but it's needed to stop the compiler using tail call optimization @@ -990,11 +963,11 @@ static void panic_trap(x86_saved_state64_t *regs) { const char *trapname = "Unknown"; - uint64_t cr0 = get_cr0(); - uint64_t cr2 = get_cr2(); - uint64_t cr3 = get_cr3(); - uint64_t cr4 = get_cr4(); + pal_cr_t cr0, cr2, cr3, cr4; + pal_get_control_registers( &cr0, &cr2, &cr3, &cr4 ); + assert(ml_get_interrupts_enabled() == FALSE); + current_cpu_datap()->cpu_fatal_trap_state = regs; /* * Issue an I/O port read if one has been requested - this is an * event logic analyzers can use as a trigger point. 
@@ -1016,15 +989,15 @@ panic_trap(x86_saved_state64_t *regs) "R8: 0x%016llx, R9: 0x%016llx, R10: 0x%016llx, R11: 0x%016llx\n" "R12: 0x%016llx, R13: 0x%016llx, R14: 0x%016llx, R15: 0x%016llx\n" "RFL: 0x%016llx, RIP: 0x%016llx, CS: 0x%016llx, SS: 0x%016llx\n" - "Error code: 0x%016llx\n", + "CR2: 0x%016llx, Error code: 0x%016llx, Faulting CPU: 0x%x\n", regs->isf.rip, regs->isf.trapno, trapname, cr0, cr2, cr3, cr4, regs->rax, regs->rbx, regs->rcx, regs->rdx, regs->isf.rsp, regs->rbp, regs->rsi, regs->rdi, regs->r8, regs->r9, regs->r10, regs->r11, regs->r12, regs->r13, regs->r14, regs->r15, - regs->isf.rflags, regs->isf.rip, regs->isf.cs, regs->isf.ss, - regs->isf.err); + regs->isf.rflags, regs->isf.rip, regs->isf.cs & 0xFFFF, + regs->isf.ss & 0xFFFF,regs->cr2, regs->isf.err, regs->isf.cpu); /* * This next statement is not executed, * but it's needed to stop the compiler using tail call optimization @@ -1034,181 +1007,6 @@ panic_trap(x86_saved_state64_t *regs) } #endif -extern void kprintf_break_lock(void); - -#ifdef __i386__ -static void -panic_32(__unused int code, __unused int pc, __unused const char *msg, boolean_t do_mca_dump, boolean_t do_bt) -{ - struct i386_tss *my_ktss = current_ktss(); - - /* Set postcode (DEBUG only) */ - postcode(pc); - - /* - * Issue an I/O port read if one has been requested - this is an - * event logic analyzers can use as a trigger point. - */ - panic_io_port_read(); - - /* - * Break kprintf lock in case of recursion, - * and record originally faulted instruction address. - */ - kprintf_break_lock(); - - if (do_mca_dump) { -#if CONFIG_MCA - /* - * Dump the contents of the machine check MSRs (if any). 
- */ - mca_dump(); -#endif - } - -#if MACH_KDP - /* - * Print backtrace leading to first fault: - */ - if (do_bt) - panic_i386_backtrace((void *) my_ktss->ebp, 10, NULL, FALSE, NULL); -#endif - - panic("%s at 0x%08x, thread:%p, code:0x%x, " - "registers:\n" - "CR0: 0x%08x, CR2: 0x%08x, CR3: 0x%08x, CR4: 0x%08x\n" - "EAX: 0x%08x, EBX: 0x%08x, ECX: 0x%08x, EDX: 0x%08x\n" - "ESP: 0x%08x, EBP: 0x%08x, ESI: 0x%08x, EDI: 0x%08x\n" - "EFL: 0x%08x, EIP: 0x%08x\n", - msg, - my_ktss->eip, current_thread(), code, - (uint32_t)get_cr0(), (uint32_t)get_cr2(), (uint32_t)get_cr3(), (uint32_t)get_cr4(), - my_ktss->eax, my_ktss->ebx, my_ktss->ecx, my_ktss->edx, - my_ktss->esp, my_ktss->ebp, my_ktss->esi, my_ktss->edi, - my_ktss->eflags, my_ktss->eip); -} - -/* - * Called from locore on a special reserved stack after a double-fault - * is taken in kernel space. - * Kernel stack overflow is one route here. - */ -void -panic_double_fault32(int code) -{ - panic_32(code, PANIC_DOUBLE_FAULT, "Double fault", FALSE, TRUE); -} - -/* - * Called from locore on a special reserved stack after a machine-check - */ -void -panic_machine_check32(int code) -{ - panic_32(code, PANIC_MACHINE_CHECK, "Machine-check", TRUE, FALSE); -} -#endif /* __i386__ */ - -static void -panic_64(x86_saved_state_t *sp, __unused int pc, __unused const char *msg, boolean_t do_mca_dump) -{ - /* Set postcode (DEBUG only) */ - postcode(pc); - - /* - * Issue an I/O port read if one has been requested - this is an - * event logic analyzers can use as a trigger point. - */ - panic_io_port_read(); - - /* - * Break kprintf lock in case of recursion, - * and record originally faulted instruction address. - */ - kprintf_break_lock(); - - if (do_mca_dump) { -#if CONFIG_MCA - /* - * Dump the contents of the machine check MSRs (if any). - */ - mca_dump(); -#endif - } - -#ifdef __i386__ - /* - * Dump the interrupt stack frame at last kernel entry. 
- */ - if (is_saved_state64(sp)) { - x86_saved_state64_t *ss64p = saved_state64(sp); - panic("%s thread:%p, trapno:0x%x, err:0x%qx, " - "registers:\n" - "CR0: 0x%08x, CR2: 0x%08x, CR3: 0x%08x, CR4: 0x%08x\n" - "RAX: 0x%016qx, RBX: 0x%016qx, RCX: 0x%016qx, RDX: 0x%016qx\n" - "RSP: 0x%016qx, RBP: 0x%016qx, RSI: 0x%016qx, RDI: 0x%016qx\n" - "R8: 0x%016qx, R9: 0x%016qx, R10: 0x%016qx, R11: 0x%016qx\n" - "R12: 0x%016qx, R13: 0x%016qx, R14: 0x%016qx, R15: 0x%016qx\n" - "RFL: 0x%016qx, RIP: 0x%016qx, CR2: 0x%016qx\n", - msg, - current_thread(), ss64p->isf.trapno, ss64p->isf.err, - (uint32_t)get_cr0(), (uint32_t)get_cr2(), (uint32_t)get_cr3(), (uint32_t)get_cr4(), - ss64p->rax, ss64p->rbx, ss64p->rcx, ss64p->rdx, - ss64p->isf.rsp, ss64p->rbp, ss64p->rsi, ss64p->rdi, - ss64p->r8, ss64p->r9, ss64p->r10, ss64p->r11, - ss64p->r12, ss64p->r13, ss64p->r14, ss64p->r15, - ss64p->isf.rflags, ss64p->isf.rip, ss64p->cr2); - } else { - x86_saved_state32_t *ss32p = saved_state32(sp); - panic("%s at 0x%08x, thread:%p, trapno:0x%x, err:0x%x," - "registers:\n" - "CR0: 0x%08x, CR2: 0x%08x, CR3: 0x%08x, CR4: 0x%08x\n" - "EAX: 0x%08x, EBX: 0x%08x, ECX: 0x%08x, EDX: 0x%08x\n" - "ESP: 0x%08x, EBP: 0x%08x, ESI: 0x%08x, EDI: 0x%08x\n" - "EFL: 0x%08x, EIP: 0x%08x\n", - msg, - ss32p->eip, current_thread(), ss32p->trapno, ss32p->err, - (uint32_t)get_cr0(), (uint32_t)get_cr2(), (uint32_t)get_cr3(), (uint32_t)get_cr4(), - ss32p->eax, ss32p->ebx, ss32p->ecx, ss32p->edx, - ss32p->uesp, ss32p->ebp, ss32p->esi, ss32p->edi, - ss32p->efl, ss32p->eip); - } -#else - x86_saved_state64_t *regs = saved_state64(sp); - panic("%s thread:%p at 0x%016llx, registers:\n" - "CR0: 0x%016lx, CR2: 0x%016lx, CR3: 0x%016lx, CR4: 0x%016lx\n" - "RAX: 0x%016llx, RBX: 0x%016llx, RCX: 0x%016llx, RDX: 0x%016llx\n" - "RSP: 0x%016llx, RBP: 0x%016llx, RSI: 0x%016llx, RDI: 0x%016llx\n" - "R8: 0x%016llx, R9: 0x%016llx, R10: 0x%016llx, R11: 0x%016llx\n" - "R12: 0x%016llx, R13: 0x%016llx, R14: 0x%016llx, R15: 0x%016llx\n" - "RFL: 
0x%016llx, RIP: 0x%016llx, CS: 0x%016llx, SS: 0x%016llx\n" - "Error code: 0x%016llx\n", - msg, - current_thread(), regs->isf.rip, - get_cr0(), get_cr2(), get_cr3(), get_cr4(), - regs->rax, regs->rbx, regs->rcx, regs->rdx, - regs->isf.rsp, regs->rbp, regs->rsi, regs->rdi, - regs->r8, regs->r9, regs->r10, regs->r11, - regs->r12, regs->r13, regs->r14, regs->r15, - regs->isf.rflags, regs->isf.rip, regs->isf.cs, regs->isf.ss, - regs->isf.err); -#endif -} - -void -panic_double_fault64(x86_saved_state_t *sp) -{ - panic_64(sp, PANIC_DOUBLE_FAULT, "Double fault", FALSE); - -} -void - -panic_machine_check64(x86_saved_state_t *sp) -{ - panic_64(sp, PANIC_MACHINE_CHECK, "Machine Check", TRUE); - -} - #if CONFIG_DTRACE extern kern_return_t dtrace_user_probe(x86_saved_state_t *); #endif @@ -1231,6 +1029,7 @@ user_trap( ast_t *myast; kern_return_t kret; user_addr_t rip; + unsigned long dr6 = 0; /* 32 bit for i386, 64 bit for x86_64 */ assert((is_saved_state32(saved_state) && !thread_is_64bit(thread)) || (is_saved_state64(saved_state) && thread_is_64bit(thread))); @@ -1240,6 +1039,9 @@ user_trap( regs = saved_state64(saved_state); + /* Record cpu where state was captured */ + regs->isf.cpu = cpu_number(); + type = regs->isf.trapno; err = (int)regs->isf.err & 0xffff; vaddr = (user_addr_t)regs->cr2; @@ -1249,12 +1051,26 @@ user_trap( regs = saved_state32(saved_state); + /* Record cpu where state was captured */ + regs->cpu = cpu_number(); + type = regs->trapno; err = regs->err & 0xffff; vaddr = (user_addr_t)regs->cr2; rip = (user_addr_t)regs->eip; } + if ((type == T_DEBUG) && thread->machine.ids) { + unsigned long clear = 0; + /* Stash and clear this processor's DR6 value, in the event + * this was a debug register match + */ + __asm__ volatile ("mov %%db6, %0" : "=r" (dr6)); + __asm__ volatile ("mov %0, %%db6" : : "r" (clear)); + } + + pal_sti(); + KERNEL_DEBUG_CONSTANT( (MACHDBG_CODE(DBG_MACH_EXCP_UTRAP_x86, type)) | DBG_FUNC_NONE, (unsigned)(vaddr>>32), (unsigned)vaddr, @@ 
-1268,17 +1084,18 @@ user_trap( kprintf("user_trap(0x%08x) type=%d vaddr=0x%016llx\n", saved_state, type, vaddr); #endif - perfCallback fn = perfASTHook; - if (fn) { + + perfASTCallback astfn = perfASTHook; + if (__improbable(astfn != NULL)) { myast = ast_pending(); if (*myast & AST_CHUD_ALL) { - fn(type, saved_state, 0, 0); + astfn(AST_CHUD_ALL, myast); } } /* Is there a hook? */ - fn = perfTrapHook; - if (fn) { + perfCallback fn = perfTrapHook; + if (__improbable(fn != NULL)) { if (fn(type, saved_state, 0, 0) == KERN_SUCCESS) return; /* If it succeeds, we are done... */ } @@ -1291,7 +1108,7 @@ user_trap( DEBUG_KPRINT_SYSCALL_MASK(1, "user_trap: type=0x%x(%s) err=0x%x cr2=%p rip=%p\n", type, trap_type[type], err, (void *)(long) vaddr, (void *)(long) rip); - + switch (type) { case T_DIVIDE_ERROR: @@ -1302,12 +1119,11 @@ user_trap( case T_DEBUG: { pcb_t pcb; - long clear = 0; /* 32 bit for i386, 64 bit for x86_64 */ /* - * get dr6 and set it in the thread's pcb before - * returning to userland + * Update the PCB with this processor's DR6 value + * in the event this was a debug register match. 
*/ - pcb = thread->machine.pcb; + pcb = THREAD_TO_PCB(thread); if (pcb->ids) { /* * We can get and set the status register @@ -1315,16 +1131,13 @@ user_trap( * because the high order bits are not * used on x86_64 */ - unsigned long dr6_temp; /* 32 bit for i386, 64 bit for x86_64 */ - __asm__ volatile ("mov %%db6, %0" : "=r" (dr6_temp)); /* Register constraint by necessity */ if (thread_is_64bit(thread)) { x86_debug_state64_t *ids = pcb->ids; - ids->dr6 = dr6_temp; + ids->dr6 = dr6; } else { /* 32 bit thread */ x86_debug_state32_t *ids = pcb->ids; - ids->dr6 = (uint32_t) dr6_temp; + ids->dr6 = (uint32_t) dr6; } - __asm__ volatile ("mov %0, %%db6" : : "r" (clear)); } exc = EXC_BREAKPOINT; code = EXC_I386_SGL; @@ -1406,7 +1219,7 @@ user_trap( if (err & T_PF_WRITE) prot |= VM_PROT_WRITE; #if PAE - if (err & T_PF_EXECUTE) + if (__improbable(err & T_PF_EXECUTE)) prot |= VM_PROT_EXECUTE; #endif kret = vm_fault(thread->map, vm_map_trunc_page(vaddr), @@ -1462,8 +1275,6 @@ user_trap( /* * Handle AST traps for i386. - * Check for delayed floating-point exception from - * AT-bus machines. */ extern void log_thread_action (thread_t, char *); @@ -1511,44 +1322,6 @@ i386_exception( } - -void -kernel_preempt_check(void) -{ - ast_t *myast; - boolean_t intr; - - /* - * disable interrupts to both prevent pre-emption - * and to keep the ast state from changing via - * an interrupt handler making something runnable - */ - intr = ml_set_interrupts_enabled(FALSE); - - myast = ast_pending(); - - if ((*myast & AST_URGENT) && intr == TRUE && get_interrupt_level() == 0) { - /* - * can handle interrupts and preemptions - * at this point - */ - ml_set_interrupts_enabled(intr); - - /* - * now cause the PRE-EMPTION trap - */ - __asm__ volatile (" int $0xff"); - } else { - /* - * if interrupts were already disabled or - * we're in an interrupt context, we can't - * preempt... 
of course if AST_URGENT - * isn't set we also don't want to - */ - ml_set_interrupts_enabled(intr); - } -} - #if MACH_KDB extern void db_i386_state(x86_saved_state32_t *regs); @@ -1595,6 +1368,10 @@ sync_iss_to_iks(x86_saved_state_t *saved_state) vm_offset_t kstack; boolean_t record_active_regs = FALSE; + /* The PAL may have a special way to sync registers */ + if( saved_state->flavor == THREAD_STATE_NONE ) + pal_get_kern_regs( saved_state ); + if ((kstack = current_thread()->kernel_stack) != 0) { #ifdef __i386__ x86_saved_state32_t *regs = saved_state32(saved_state); @@ -1604,8 +1381,7 @@ sync_iss_to_iks(x86_saved_state_t *saved_state) iks = STACK_IKS(kstack); - - /* Did we take the trap/interrupt in kernel mode? */ + /* Did we take the trap/interrupt in kernel mode? */ #ifdef __i386__ if (regs == USER_REGS32(current_thread())) record_active_regs = TRUE; diff --git a/osfmk/i386/trap.h b/osfmk/i386/trap.h index ff00c7476..f24141112 100644 --- a/osfmk/i386/trap.h +++ b/osfmk/i386/trap.h @@ -114,7 +114,6 @@ #define T_PF_EXECUTE 0x10 /* instruction fetch when NX */ #endif - #if !defined(ASSEMBLER) && defined(MACH_KERNEL) #include @@ -129,7 +128,7 @@ extern void sync_iss_to_iks(x86_saved_state_t *regs); extern void sync_iss_to_iks_unconditionally( x86_saved_state_t *regs); -extern void kernel_trap(x86_saved_state_t *regs); +extern void kernel_trap(x86_saved_state_t *regs, uintptr_t *lo_spp); extern void user_trap(x86_saved_state_t *regs); @@ -148,11 +147,13 @@ extern void i386_astintr(int preemption); typedef kern_return_t (*perfCallback)( int trapno, void *regs, - int unused1, - int unused2); + uintptr_t *lo_spp, + int); + +typedef kern_return_t (*perfASTCallback)(ast_t reasons, ast_t *myast); extern volatile perfCallback perfTrapHook; -extern volatile perfCallback perfASTHook; +extern volatile perfASTCallback perfASTHook; extern volatile perfCallback perfIntHook; extern void panic_i386_backtrace(void *, int, const char *, boolean_t, x86_saved_state_t *); diff --git 
a/osfmk/i386/trap_native.c b/osfmk/i386/trap_native.c new file mode 100644 index 000000000..26a9cbf07 --- /dev/null +++ b/osfmk/i386/trap_native.c @@ -0,0 +1,295 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* +* @OSF_COPYRIGHT@ +*/ +/* +* Mach Operating System +* Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University +* All Rights Reserved. 
+* +* Permission to use, copy, modify and distribute this software and its +* documentation is hereby granted, provided that both the copyright +* notice and this permission notice appear in all copies of the +* software, derivative works or modified versions, and any portions +* thereof, and that both notices appear in supporting documentation. +* +* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" +* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR +* ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. +* +* Carnegie Mellon requests users of this software to return to +* +* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU +* School of Computer Science +* Carnegie Mellon University +* Pittsburgh PA 15213-3890 +* +* any improvements or extensions that they make and grant Carnegie Mellon +* the rights to redistribute these changes. +*/ +/* +*/ + +/* +* Hardware trap/fault handler. + */ + +#include +#include +#include +#include +#include +#include /* panic_io_port_read() */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#if CONFIG_MCA +#include +#endif +#include + +#include +#include + +extern void kprintf_break_lock(void); +extern void kprint_state(x86_saved_state64_t *saved_state); +void panic_64(x86_saved_state_t *, int, const char *, boolean_t); + +extern volatile int panic_double_fault_cpu; + +#if defined(__x86_64__) && DEBUG +/* + * K64 debug - fatal handler for debug code in the trap vectors. 
+ */ +extern void +panic_idt64(x86_saved_state_t *rsp); +void +panic_idt64(x86_saved_state_t *rsp) +{ + kprint_state(saved_state64(rsp)); + panic("panic_idt64"); +} +#endif + +#ifdef __i386__ +static void +panic_32(__unused int code, __unused int pc, __unused const char *msg, boolean_t do_mca_dump, boolean_t do_bt) +{ + struct i386_tss *my_ktss = current_ktss(); + + /* Set postcode (DEBUG only) */ + postcode(pc); + + /* + * Issue an I/O port read if one has been requested - this is an + * event logic analyzers can use as a trigger point. + */ + panic_io_port_read(); + + /* + * Break kprintf lock in case of recursion, + * and record originally faulted instruction address. + */ + kprintf_break_lock(); + + if (do_mca_dump) { +#if CONFIG_MCA + /* + * Dump the contents of the machine check MSRs (if any). + */ + mca_dump(); +#endif + } + +#if MACH_KDP + /* + * Print backtrace leading to first fault: + */ + if (do_bt) + panic_i386_backtrace((void *) my_ktss->ebp, 10, NULL, FALSE, NULL); +#endif + + panic("%s at 0x%08x, code:0x%x, " + "registers:\n" + "CR0: 0x%08x, CR2: 0x%08x, CR3: 0x%08x, CR4: 0x%08x\n" + "EAX: 0x%08x, EBX: 0x%08x, ECX: 0x%08x, EDX: 0x%08x\n" + "ESP: 0x%08x, EBP: 0x%08x, ESI: 0x%08x, EDI: 0x%08x\n" + "EFL: 0x%08x, EIP: 0x%08x\n", + msg, + my_ktss->eip, code, + (uint32_t)get_cr0(), (uint32_t)get_cr2(), (uint32_t)get_cr3(), (uint32_t)get_cr4(), + my_ktss->eax, my_ktss->ebx, my_ktss->ecx, my_ktss->edx, + my_ktss->esp, my_ktss->ebp, my_ktss->esi, my_ktss->edi, + my_ktss->eflags, my_ktss->eip); +} + +/* + * Called from locore on a special reserved stack after a double-fault + * is taken in kernel space. + * Kernel stack overflow is one route here. 
+ */ +void +panic_double_fault32(int code) +{ + (void)OSCompareAndSwap((UInt32) -1, (UInt32) cpu_number(), (volatile UInt32 *)&panic_double_fault_cpu); + panic_32(code, PANIC_DOUBLE_FAULT, "Double fault", FALSE, TRUE); +} + +/* + * Called from locore on a special reserved stack after a machine-check + */ +void +panic_machine_check32(int code) +{ + panic_32(code, PANIC_MACHINE_CHECK, "Machine-check", TRUE, FALSE); +} +#endif /* __i386__ */ + +void +panic_64(x86_saved_state_t *sp, __unused int pc, __unused const char *msg, boolean_t do_mca_dump) +{ + /* Set postcode (DEBUG only) */ + postcode(pc); + + /* + * Issue an I/O port read if one has been requested - this is an + * event logic analyzers can use as a trigger point. + */ + panic_io_port_read(); + + + /* + * Break kprintf lock in case of recursion, + * and record originally faulted instruction address. + */ + kprintf_break_lock(); + + if (do_mca_dump) { +#if CONFIG_MCA + /* + * Dump the contents of the machine check MSRs (if any). + */ + mca_dump(); +#endif + } + +#ifdef __i386__ + /* + * Dump the interrupt stack frame at last kernel entry. 
+ */ + if (is_saved_state64(sp)) { + x86_saved_state64_t *ss64p = saved_state64(sp); + panic("%s trapno:0x%x, err:0x%qx, " + "registers:\n" + "CR0: 0x%08x, CR2: 0x%08x, CR3: 0x%08x, CR4: 0x%08x\n" + "RAX: 0x%016qx, RBX: 0x%016qx, RCX: 0x%016qx, RDX: 0x%016qx\n" + "RSP: 0x%016qx, RBP: 0x%016qx, RSI: 0x%016qx, RDI: 0x%016qx\n" + "R8: 0x%016qx, R9: 0x%016qx, R10: 0x%016qx, R11: 0x%016qx\n" + "R12: 0x%016qx, R13: 0x%016qx, R14: 0x%016qx, R15: 0x%016qx\n" + "RFL: 0x%016qx, RIP: 0x%016qx, CR2: 0x%016qx\n", + msg, + ss64p->isf.trapno, ss64p->isf.err, + (uint32_t)get_cr0(), (uint32_t)get_cr2(), (uint32_t)get_cr3(), (uint32_t)get_cr4(), + ss64p->rax, ss64p->rbx, ss64p->rcx, ss64p->rdx, + ss64p->isf.rsp, ss64p->rbp, ss64p->rsi, ss64p->rdi, + ss64p->r8, ss64p->r9, ss64p->r10, ss64p->r11, + ss64p->r12, ss64p->r13, ss64p->r14, ss64p->r15, + ss64p->isf.rflags, ss64p->isf.rip, ss64p->cr2); + } else { + x86_saved_state32_t *ss32p = saved_state32(sp); + panic("%s at 0x%08x, trapno:0x%x, err:0x%x," + "registers:\n" + "CR0: 0x%08x, CR2: 0x%08x, CR3: 0x%08x, CR4: 0x%08x\n" + "EAX: 0x%08x, EBX: 0x%08x, ECX: 0x%08x, EDX: 0x%08x\n" + "ESP: 0x%08x, EBP: 0x%08x, ESI: 0x%08x, EDI: 0x%08x\n" + "EFL: 0x%08x, EIP: 0x%08x\n", + msg, + ss32p->eip, ss32p->trapno, ss32p->err, + (uint32_t)get_cr0(), (uint32_t)get_cr2(), (uint32_t)get_cr3(), (uint32_t)get_cr4(), + ss32p->eax, ss32p->ebx, ss32p->ecx, ss32p->edx, + ss32p->uesp, ss32p->ebp, ss32p->esi, ss32p->edi, + ss32p->efl, ss32p->eip); + } +#else + x86_saved_state64_t *regs = saved_state64(sp); + panic("%s at 0x%016llx, registers:\n" + "CR0: 0x%016lx, CR2: 0x%016lx, CR3: 0x%016lx, CR4: 0x%016lx\n" + "RAX: 0x%016llx, RBX: 0x%016llx, RCX: 0x%016llx, RDX: 0x%016llx\n" + "RSP: 0x%016llx, RBP: 0x%016llx, RSI: 0x%016llx, RDI: 0x%016llx\n" + "R8: 0x%016llx, R9: 0x%016llx, R10: 0x%016llx, R11: 0x%016llx\n" + "R12: 0x%016llx, R13: 0x%016llx, R14: 0x%016llx, R15: 0x%016llx\n" + "RFL: 0x%016llx, RIP: 0x%016llx, CS: 0x%016llx, SS: 0x%016llx\n" + "Error code: 
0x%016llx\n", + msg, + regs->isf.rip, + get_cr0(), get_cr2(), get_cr3_raw(), get_cr4(), + regs->rax, regs->rbx, regs->rcx, regs->rdx, + regs->isf.rsp, regs->rbp, regs->rsi, regs->rdi, + regs->r8, regs->r9, regs->r10, regs->r11, + regs->r12, regs->r13, regs->r14, regs->r15, + regs->isf.rflags, regs->isf.rip, regs->isf.cs & 0xFFFF, regs->isf.ss & 0xFFFF, + regs->isf.err); +#endif +} + +void +panic_double_fault64(x86_saved_state_t *sp) +{ + (void)OSCompareAndSwap((UInt32) -1, (UInt32) cpu_number(), (volatile UInt32 *)&panic_double_fault_cpu); + panic_64(sp, PANIC_DOUBLE_FAULT, "Double fault", FALSE); + +} +void + +panic_machine_check64(x86_saved_state_t *sp) +{ + panic_64(sp, PANIC_MACHINE_CHECK, "Machine Check", TRUE); + +} diff --git a/osfmk/i386/tsc.c b/osfmk/i386/tsc.c index 9e794797b..5205da23e 100644 --- a/osfmk/i386/tsc.c +++ b/osfmk/i386/tsc.c @@ -51,7 +51,6 @@ #include #include #include /* for kernel_map */ -#include #include #include #include diff --git a/osfmk/i386/tsc.h b/osfmk/i386/tsc.h index f6c5eba78..2f6011b93 100644 --- a/osfmk/i386/tsc.h +++ b/osfmk/i386/tsc.h @@ -42,7 +42,9 @@ #define BASE_NHM_CLOCK_SOURCE 133333333ULL #define IA32_PERF_STS 0x198 +#define SLOW_TSC_THRESHOLD 1000067800 /* TSC is too slow for regular nanotime() algorithm */ +#ifndef ASSEMBLER extern uint64_t busFCvtt2n; extern uint64_t busFCvtn2t; extern uint64_t tscFreq; @@ -74,6 +76,6 @@ typedef struct tscInfo tscInfo_t; extern void tsc_get_info(tscInfo_t *info); extern void tsc_init(void); - +#endif /* ASSEMBLER */ #endif /* _I386_TSC_H_ */ #endif /* KERNEL_PRIVATE */ diff --git a/osfmk/i386/ucode.c b/osfmk/i386/ucode.c new file mode 100644 index 000000000..e416cc5ff --- /dev/null +++ b/osfmk/i386/ucode.c @@ -0,0 +1,201 @@ +/* + * ucode.c + * + * Microcode updater interface sysctl + */ + +#include +#include +#include +#include +#include +#include +#include // mp_broadcast +#include // cpu_number + +#define IA32_BIOS_UPDT_TRIG (0x79) /* microcode update trigger MSR */ + +struct 
intel_ucupdate *global_update = NULL; + +/* Execute the actual update! */ +static void +update_microcode(void) +{ + /* SDM Example 9-8 code shows that we load the + * address of the UpdateData within the microcode blob, + * not the address of the header. + */ + wrmsr64(IA32_BIOS_UPDT_TRIG, (uint64_t)(uintptr_t)&global_update->data); +} + +/* locks */ +static lck_grp_attr_t *ucode_slock_grp_attr = NULL; +static lck_grp_t *ucode_slock_grp = NULL; +static lck_attr_t *ucode_slock_attr = NULL; +static lck_spin_t *ucode_slock = NULL; + +static kern_return_t +register_locks(void) +{ + /* already allocated? */ + if (ucode_slock_grp_attr && ucode_slock_grp && ucode_slock_attr && ucode_slock) + return KERN_SUCCESS; + + /* allocate lock group attribute and group */ + if (!(ucode_slock_grp_attr = lck_grp_attr_alloc_init())) + goto nomem_out; + + lck_grp_attr_setstat(ucode_slock_grp_attr); + + if (!(ucode_slock_grp = lck_grp_alloc_init("uccode_lock", ucode_slock_grp_attr))) + goto nomem_out; + + /* Allocate lock attribute */ + if (!(ucode_slock_attr = lck_attr_alloc_init())) + goto nomem_out; + + /* Allocate the spin lock */ + /* We keep one global spin-lock. We could have one per update + * request... but srsly, why would you update microcode like that? 
+ */ + if (!(ucode_slock = lck_spin_alloc_init(ucode_slock_grp, ucode_slock_attr))) + goto nomem_out; + + return KERN_SUCCESS; + +nomem_out: + /* clean up */ + if (ucode_slock) + lck_spin_free(ucode_slock, ucode_slock_grp); + if (ucode_slock_attr) + lck_attr_free(ucode_slock_attr); + if (ucode_slock_grp) + lck_grp_free(ucode_slock_grp); + if (ucode_slock_grp_attr) + lck_grp_attr_free(ucode_slock_grp_attr); + + return KERN_NO_SPACE; +} + +/* Copy in an update */ +static int +copyin_update(uint64_t inaddr) +{ + struct intel_ucupdate update_header; + struct intel_ucupdate *update; + vm_size_t size; + kern_return_t ret; + int error; + + /* Copy in enough header to peek at the size */ + error = copyin((user_addr_t)inaddr, (void *)&update_header, sizeof(update_header)); + if (error) + return error; + + /* Get the actual, alleged size */ + size = update_header.total_size; + + /* huge bogus piece of data that somehow made it through? */ + if (size >= 1024 * 1024) + return ENOMEM; + + /* Old microcodes? */ + if (size == 0) + size = 2048; /* default update size; see SDM */ + + /* + * create the buffer for the update + * It need only be aligned to 16-bytes, according to the SDM. + * This also wires it down + */ + ret = kmem_alloc_kobject(kernel_map, (vm_offset_t *)&update, size); + if (ret != KERN_SUCCESS) + return ENOMEM; + + /* Copy it in */ + error = copyin((user_addr_t)inaddr, (void*)update, size); + if (error) { + kmem_free(kernel_map, (vm_offset_t)update, size); + return error; + } + + global_update = update; + return 0; +} + +/* + * This is called once by every CPU on a wake from sleep/hibernate + * and is meant to re-apply a microcode update that got lost + * by sleeping. 
+ */ +void +ucode_update_wake() +{ + if (global_update) { + kprintf("ucode: Re-applying update after wake (CPU #%d)\n", cpu_number()); + update_microcode(); +#ifdef DEBUG + } else { + kprintf("ucode: No update to apply (CPU #%d)\n", cpu_number()); +#endif + } +} + +static void +cpu_update(__unused void *arg) +{ + /* grab the lock */ + lck_spin_lock(ucode_slock); + + /* execute the update */ + update_microcode(); + + /* if CPU #0, update global CPU information */ + if (!cpu_number()) + cpuid_set_info(); + + /* release the lock */ + lck_spin_unlock(ucode_slock); +} + +/* Farm an update out to all CPUs */ +static void +xcpu_update(void) +{ + if (register_locks() != KERN_SUCCESS) + return; + + /* Get all CPUs to perform the update */ + mp_broadcast(cpu_update, NULL); +} + +/* + * sysctl function + * + */ +int +ucode_interface(uint64_t addr) +{ + int error; + +#if !DEBUG + /* + * Userland may only call this once per boot. Anything else + * would not make sense (all updates are cumulative), and also + * leak memory, because we don't free previous updates. 
+ */ + if (global_update) + return EPERM; +#endif + + /* Get the whole microcode */ + error = copyin_update(addr); + + if (error) + return error; + + /* Farm out the updates */ + xcpu_update(); + + return 0; +} diff --git a/osfmk/i386/ucode.h b/osfmk/i386/ucode.h new file mode 100644 index 000000000..55dc70645 --- /dev/null +++ b/osfmk/i386/ucode.h @@ -0,0 +1,30 @@ +/* + * ucode.h + * + * Interface definitions for the microcode updater interface sysctl + */ + +/* Intel defined microcode format */ +struct intel_ucupdate { + /* Header information */ + uint32_t header_version; + uint32_t update_revision; + uint32_t date; + uint32_t processor_signature; + uint32_t checksum; + uint32_t loader_revision; + uint32_t processor_flags; + uint32_t data_size; + uint32_t total_size; + + /* Reserved for future expansion */ + uint32_t reserved0; + uint32_t reserved1; + uint32_t reserved2; + + /* First word of the update data */ + uint32_t data; +}; + +extern int ucode_interface(uint64_t addr); +extern void ucode_update_wake(void); diff --git a/osfmk/i386/user_ldt.c b/osfmk/i386/user_ldt.c index 45f51361f..8b5791455 100644 --- a/osfmk/i386/user_ldt.c +++ b/osfmk/i386/user_ldt.c @@ -237,7 +237,7 @@ i386_set_ldt( case 0: case ACC_P: /* valid empty descriptor, clear Present preemptively */ - dp->access &= ~ACC_P; + dp->access &= (~ACC_P & 0xff); break; case ACC_P | ACC_PL_U | ACC_DATA: case ACC_P | ACC_PL_U | ACC_DATA_W: @@ -389,7 +389,7 @@ user_ldt_set( bcopy(user_ldt->ldt, &ldtp[user_ldt->start], sizeof(struct real_descriptor) * (user_ldt->count)); - gdt_desc_p(USER_LDT)->limit_low = (sizeof(struct real_descriptor) * (user_ldt->start + user_ldt->count)) - 1; + gdt_desc_p(USER_LDT)->limit_low = (uint16_t)((sizeof(struct real_descriptor) * (user_ldt->start + user_ldt->count)) - 1); ml_cpu_set_ldt(USER_LDT); } else { diff --git a/osfmk/i386/vmx/vmx_asm.h b/osfmk/i386/vmx/vmx_asm.h index bd0de4688..c295f6b03 100644 --- a/osfmk/i386/vmx/vmx_asm.h +++ b/osfmk/i386/vmx/vmx_asm.h @@ -39,7 
+39,7 @@ #define VMX_FAIL_VALID -2 #define VMX_SUCCEED 0 -static inline void enter_64bit_mode(void) { +__attribute__((always_inline)) static inline void enter_64bit_mode(void) { __asm__ __volatile__ ( ".byte 0xea /* far jump longmode */ \n\t" ".long 1f \n\t" @@ -49,7 +49,7 @@ static inline void enter_64bit_mode(void) { :: "i" (KERNEL64_CS) ); } -static inline void enter_compat_mode(void) { +__attribute__((always_inline)) static inline void enter_compat_mode(void) { asm( "ljmp *4f \n\t" "4: \n\t" diff --git a/osfmk/i386/vmx/vmx_cpu.c b/osfmk/i386/vmx/vmx_cpu.c index 34bd07acc..22cebe2d8 100644 --- a/osfmk/i386/vmx/vmx_cpu.c +++ b/osfmk/i386/vmx/vmx_cpu.c @@ -35,7 +35,6 @@ #include #include #include -#include #include /* for host_info() */ #define VMX_KPRINTF(x...) /* kprintf("vmx: " x) */ @@ -190,7 +189,7 @@ vmx_get_specs() Enter VMX root operation on this CPU. -------------------------------------------------------------------------- */ static void -vmx_on(void) +vmx_on(void *arg __unused) { vmx_cpu_t *cpu = &current_cpu_datap()->cpu_vmx; addr64_t vmxon_region_paddr; @@ -222,7 +221,7 @@ vmx_on(void) Leave VMX root operation on this CPU. 
-------------------------------------------------------------------------- */ static void -vmx_off(void) +vmx_off(void *arg __unused) { int result; @@ -322,7 +321,7 @@ host_vmxon(boolean_t exclusive) if (do_it) { vmx_allocate_vmxon_regions(); - mp_rendezvous(NULL, (void (*)(void *))vmx_on, NULL, NULL); + mp_rendezvous(NULL, vmx_on, NULL, NULL); } return error; } @@ -348,7 +347,7 @@ host_vmxoff() simple_unlock(&vmx_use_count_lock); if (do_it) { - mp_rendezvous(NULL, (void (*)(void *))vmx_off, NULL, NULL); + mp_rendezvous(NULL, vmx_off, NULL, NULL); vmx_free_vmxon_regions(); } @@ -365,7 +364,7 @@ vmx_suspend() { VMX_KPRINTF("vmx_suspend\n"); if (vmx_use_count) - vmx_off(); + vmx_off(NULL); } /* ----------------------------------------------------------------------------- @@ -378,5 +377,5 @@ vmx_resume() VMX_KPRINTF("vmx_resume\n"); vmx_init(); /* init VMX on CPU #0 */ if (vmx_use_count) - vmx_on(); + vmx_on(NULL); } diff --git a/osfmk/ipc/ipc_entry.c b/osfmk/ipc/ipc_entry.c index b468943ac..595660239 100644 --- a/osfmk/ipc/ipc_entry.c +++ b/osfmk/ipc/ipc_entry.c @@ -236,7 +236,7 @@ ipc_entry_get( gen = IE_BITS_NEW_GEN(free_entry->ie_bits); free_entry->ie_bits = gen; - free_entry->ie_request = 0; + free_entry->ie_request = IE_REQ_NONE; /* * The new name can't be MACH_PORT_NULL because index @@ -377,7 +377,7 @@ ipc_entry_alloc_name( table[next_index].ie_next; entry->ie_bits = gen; - entry->ie_request = 0; + entry->ie_request = IE_REQ_NONE; *entryp = entry; assert(entry->ie_object == IO_NULL); @@ -516,7 +516,12 @@ ipc_entry_dealloc( assert(space->is_active); assert(entry->ie_object == IO_NULL); - assert(entry->ie_request == 0); + assert(entry->ie_request == IE_REQ_NONE); + +#if 1 + if (entry->ie_request != IE_REQ_NONE) + panic("ipc_entry_dealloc()\n"); +#endif index = MACH_PORT_INDEX(name); table = space->is_table; diff --git a/osfmk/ipc/ipc_entry.h b/osfmk/ipc/ipc_entry.h index e1c01d154..14d7d1846 100644 --- a/osfmk/ipc/ipc_entry.h +++ b/osfmk/ipc/ipc_entry.h @@ 
-114,6 +114,8 @@ struct ipc_entry { #define ie_next index.next #define ie_index hash.table +#define IE_REQ_NONE 0 /* no request */ + #define IE_BITS_UREFS_MASK 0x0000ffff /* 16 bits of user-reference */ #define IE_BITS_UREFS(bits) ((bits) & IE_BITS_UREFS_MASK) diff --git a/osfmk/ipc/ipc_init.c b/osfmk/ipc/ipc_init.c index 8e4773748..72f01383c 100644 --- a/osfmk/ipc/ipc_init.c +++ b/osfmk/ipc/ipc_init.c @@ -151,10 +151,6 @@ ipc_bootstrap(void) ipc_space_max * sizeof(struct ipc_space), sizeof(struct ipc_space), "ipc spaces"); -#if 0 - /* make it exhaustible */ - zone_change(ipc_space_zone, Z_EXHAUST, TRUE); -#endif zone_change(ipc_space_zone, Z_NOENCRYPT, TRUE); ipc_tree_entry_zone = @@ -162,10 +158,6 @@ ipc_bootstrap(void) ipc_tree_entry_max * sizeof(struct ipc_tree_entry), sizeof(struct ipc_tree_entry), "ipc tree entries"); -#if 0 - /* make it exhaustible */ - zone_change(ipc_tree_entry_zone, Z_EXHAUST, TRUE); -#endif zone_change(ipc_tree_entry_zone, Z_NOENCRYPT, TRUE); /* @@ -176,11 +168,8 @@ ipc_bootstrap(void) ipc_port_max * sizeof(struct ipc_port), sizeof(struct ipc_port), "ipc ports"); - /* - * XXX Can't make the port zone exhaustible because the kernel - * XXX panics when port allocation for an internal object fails. 
- *zone_change(ipc_object_zones[IOT_PORT], Z_EXHAUST, TRUE); - */ + /* cant charge callers for port allocations (references passed) */ + zone_change(ipc_object_zones[IOT_PORT], Z_CALLERACCT, FALSE); zone_change(ipc_object_zones[IOT_PORT], Z_NOENCRYPT, TRUE); ipc_object_zones[IOT_PORT_SET] = @@ -188,8 +177,6 @@ ipc_bootstrap(void) ipc_pset_max * sizeof(struct ipc_pset), sizeof(struct ipc_pset), "ipc port sets"); - /* make it exhaustible */ - zone_change(ipc_object_zones[IOT_PORT_SET], Z_EXHAUST, TRUE); zone_change(ipc_object_zones[IOT_PORT_SET], Z_NOENCRYPT, TRUE); /* @@ -201,6 +188,7 @@ ipc_bootstrap(void) IKM_SAVED_KMSG_SIZE, IKM_SAVED_KMSG_SIZE, "ipc kmsgs"); + zone_change(ipc_kmsg_zone, Z_CALLERACCT, FALSE); zone_change(ipc_kmsg_zone, Z_NOENCRYPT, TRUE); #if CONFIG_MACF_MACH @@ -209,6 +197,9 @@ ipc_bootstrap(void) ipc_port_max * sizeof(struct ipc_labelh), sizeof(struct ipc_labelh), "label handles"); + /* cant charge callers for label allocations (port refs passed) */ + zone_change(ipc_labelh_zone, Z_CALLERACCT, FALSE); + #endif /* create special spaces */ diff --git a/osfmk/ipc/ipc_kmsg.c b/osfmk/ipc/ipc_kmsg.c index d61a26a77..167d42145 100644 --- a/osfmk/ipc/ipc_kmsg.c +++ b/osfmk/ipc/ipc_kmsg.c @@ -808,51 +808,59 @@ void ipc_kmsg_destroy( ipc_kmsg_t kmsg) { - ipc_kmsg_queue_t queue; - boolean_t empty; - /* - * ipc_kmsg_clean can cause more messages to be destroyed. - * Curtail recursion by queueing messages. If a message - * is already queued, then this is a recursive call. + * Destroying a message can cause more messages to be destroyed. + * Curtail recursion by putting messages on the deferred + * destruction queue. If this was the first message on the + * queue, this instance must process the full queue. 
*/ + if (ipc_kmsg_delayed_destroy(kmsg)) + ipc_kmsg_reap_delayed(); +} - queue = &(current_thread()->ith_messages); - empty = ipc_kmsg_queue_empty(queue); - ipc_kmsg_enqueue(queue, kmsg); +/* + * Routine: ipc_kmsg_delayed_destroy + * Purpose: + * Enqueues a kernel message for deferred destruction. + * Returns: + * Boolean indicator that the caller is responsible to reap + * deferred messages. + */ - if (empty) { - /* must leave kmsg in queue while cleaning it */ +boolean_t ipc_kmsg_delayed_destroy( + ipc_kmsg_t kmsg) +{ + ipc_kmsg_queue_t queue = &(current_thread()->ith_messages); + boolean_t first = ipc_kmsg_queue_empty(queue); - while ((kmsg = ipc_kmsg_queue_first(queue)) != IKM_NULL) { - ipc_kmsg_clean(kmsg); - ipc_kmsg_rmqueue(queue, kmsg); - ipc_kmsg_free(kmsg); - } - } + ipc_kmsg_enqueue(queue, kmsg); + return first; } /* - * Routine: ipc_kmsg_destroy_dest + * Routine: ipc_kmsg_reap_delayed * Purpose: - * Destroys a kernel message. Releases all rights, - * references, and memory held by the message (including - * the destination port reference. - * Frees the message. + * Destroys messages from the per-thread + * deferred reaping queue. * Conditions: * No locks held. */ + void -ipc_kmsg_destroy_dest( - ipc_kmsg_t kmsg) +ipc_kmsg_reap_delayed(void) { - ipc_port_t port; - - port = kmsg->ikm_header->msgh_remote_port; + ipc_kmsg_queue_t queue = &(current_thread()->ith_messages); + ipc_kmsg_t kmsg; - ipc_port_release(port); - kmsg->ikm_header->msgh_remote_port = MACH_PORT_NULL; - ipc_kmsg_destroy(kmsg); + /* + * must leave kmsg in queue while cleaning it to assure + * no nested calls recurse into here. + */ + while ((kmsg = ipc_kmsg_queue_first(queue)) != IKM_NULL) { + ipc_kmsg_clean(kmsg); + ipc_kmsg_rmqueue(queue, kmsg); + ipc_kmsg_free(kmsg); + } } /* @@ -864,7 +872,7 @@ ipc_kmsg_destroy_dest( * Conditions: * No locks held. 
*/ - +static unsigned int _ipc_kmsg_clean_invalid_desc = 0; void ipc_kmsg_clean_body( __unused ipc_kmsg_t kmsg, @@ -943,7 +951,7 @@ ipc_kmsg_clean_body( break; } default : { - printf("cleanup: don't understand this type of descriptor\n"); + _ipc_kmsg_clean_invalid_desc++; /* don't understand this type of descriptor */ } } } @@ -975,7 +983,7 @@ ipc_kmsg_clean_partial( object = (ipc_object_t) kmsg->ikm_header->msgh_remote_port; assert(IO_VALID(object)); - ipc_object_destroy(object, MACH_MSGH_BITS_REMOTE(mbits)); + ipc_object_destroy_dest(object, MACH_MSGH_BITS_REMOTE(mbits)); object = (ipc_object_t) kmsg->ikm_header->msgh_local_port; if (IO_VALID(object)) @@ -1007,7 +1015,7 @@ ipc_kmsg_clean( mbits = kmsg->ikm_header->msgh_bits; object = (ipc_object_t) kmsg->ikm_header->msgh_remote_port; if (IO_VALID(object)) - ipc_object_destroy(object, MACH_MSGH_BITS_REMOTE(mbits)); + ipc_object_destroy_dest(object, MACH_MSGH_BITS_REMOTE(mbits)); object = (ipc_object_t) kmsg->ikm_header->msgh_local_port; if (IO_VALID(object)) @@ -1244,7 +1252,6 @@ ipc_kmsg_get_from_kernel( assert(size >= sizeof(mach_msg_header_t)); // assert((size & 3) == 0); - assert(IP_VALID((ipc_port_t) msg->msgh_remote_port)); dest_port = (ipc_port_t)msg->msgh_remote_port; msg_and_trailer_size = size + MAX_TRAILER_SIZE; @@ -1254,7 +1261,7 @@ ipc_kmsg_get_from_kernel( * clients. These are set up for those kernel clients * which cannot afford to wait. */ - if (IP_PREALLOC(dest_port)) { + if (IP_VALID(dest_port) && IP_PREALLOC(dest_port)) { mach_msg_size_t max_desc = 0; ip_lock(dest_port); @@ -1332,6 +1339,7 @@ ipc_kmsg_get_from_kernel( * MACH_MSG_SUCCESS The message was accepted. * MACH_SEND_TIMED_OUT Caller still has message. * MACH_SEND_INTERRUPTED Caller still has message. + * MACH_SEND_INVALID_DEST Caller still has message. 
*/ mach_msg_return_t ipc_kmsg_send( @@ -1346,9 +1354,6 @@ ipc_kmsg_send( port = (ipc_port_t) kmsg->ikm_header->msgh_remote_port; assert(IP_VALID(port)); - if ((option & ~(MACH_SEND_TIMEOUT|MACH_SEND_ALWAYS)) != 0) - printf("ipc_kmsg_send: bad option 0x%x\n", option); - ip_lock(port); if (port->ip_receiver == ipc_space_kernel) { @@ -1543,16 +1548,6 @@ ipc_kmsg_put_to_kernel( * and the bits field is updated. The destination port * will be a valid port pointer. * - * The notify argument implements the MACH_SEND_CANCEL option. - * If it is not MACH_PORT_NULL, it should name a receive right. - * If the processing of the destination port would generate - * a port-deleted notification (because the right for the - * destination port is destroyed and it had a request for - * a dead-name notification registered), and the port-deleted - * notification would be sent to the named receive right, - * then it isn't sent and the send-once right for the notify - * port is quietly destroyed. - * * Conditions: * Nothing locked. * Returns: @@ -1560,9 +1555,6 @@ ipc_kmsg_put_to_kernel( * MACH_SEND_INVALID_HEADER * Illegal value in the message header bits. * MACH_SEND_INVALID_DEST The space is dead. - * MACH_SEND_INVALID_NOTIFY - * Notify is non-null and doesn't name a receive right. - * (Either KERN_INVALID_NAME or KERN_INVALID_RIGHT.) * MACH_SEND_INVALID_DEST Can't copyin destination port. * (Either KERN_INVALID_NAME or KERN_INVALID_RIGHT.) * MACH_SEND_INVALID_REPLY Can't copyin reply port. 
@@ -1573,7 +1565,7 @@ mach_msg_return_t ipc_kmsg_copyin_header( mach_msg_header_t *msg, ipc_space_t space, - mach_port_name_t notify) + boolean_t notify) { mach_msg_bits_t mbits = msg->msgh_bits & MACH_MSGH_BITS_USER; mach_port_name_t dest_name = CAST_MACH_PORT_TO_NAME(msg->msgh_remote_port); @@ -1584,8 +1576,7 @@ ipc_kmsg_copyin_header( mach_msg_type_name_t reply_type = MACH_MSGH_BITS_LOCAL(mbits); ipc_object_t dest_port, reply_port; ipc_port_t dest_soright, reply_soright; - ipc_port_t notify_port; - ipc_entry_t entry; + ipc_entry_t dest_entry, reply_entry; if ((mbits != msg->msgh_bits) || (!MACH_MSG_TYPE_PORT_ANY_SEND(dest_type)) || @@ -1609,10 +1600,10 @@ ipc_kmsg_copyin_header( * because copying the header involves copying the port rights too * and we need to do the send check before anything is actually copied. */ - entry = ipc_entry_lookup(space, dest_name); - if (entry != IE_NULL) { + dest_entry = ipc_entry_lookup(space, dest_name); + if (dest_entry != IE_NULL) { int error = 0; - ipc_port_t port = (ipc_port_t) entry->ie_object; + ipc_port_t port = (ipc_port_t) dest_entry->ie_object; if (port == IP_NULL) goto invalid_dest; ip_lock(port); @@ -1629,20 +1620,6 @@ ipc_kmsg_copyin_header( } #endif - if (notify != MACH_PORT_NULL) { - if ((entry = ipc_entry_lookup(space, notify)) == IE_NULL) { - is_write_unlock(space); - return MACH_SEND_INVALID_NOTIFY; - } - if((entry->ie_bits & MACH_PORT_TYPE_RECEIVE) == 0) { - is_write_unlock(space); - return MACH_SEND_INVALID_NOTIFY; - } - - notify_port = (ipc_port_t) entry->ie_object; - } else - notify_port = IP_NULL; - if (dest_name == reply_name) { mach_port_name_t name = dest_name; @@ -1658,13 +1635,14 @@ ipc_kmsg_copyin_header( * copy-send and make-send. 
*/ - entry = ipc_entry_lookup(space, name); - if (entry == IE_NULL) + dest_entry = ipc_entry_lookup(space, name); + if (dest_entry == IE_NULL) goto invalid_dest; + reply_entry = dest_entry; assert(reply_type != 0); /* because name not null */ - if (!ipc_right_copyin_check(space, name, entry, reply_type)) + if (!ipc_right_copyin_check(space, name, reply_entry, reply_type)) goto invalid_reply; if ((dest_type == MACH_MSG_TYPE_MOVE_SEND_ONCE) || @@ -1685,7 +1663,7 @@ ipc_kmsg_copyin_header( (dest_type == MACH_MSG_TYPE_MAKE_SEND_ONCE) || (reply_type == MACH_MSG_TYPE_MAKE_SEND) || (reply_type == MACH_MSG_TYPE_MAKE_SEND_ONCE)) { - kr = ipc_right_copyin(space, name, entry, + kr = ipc_right_copyin(space, name, dest_entry, dest_type, FALSE, &dest_port, &dest_soright); if (kr != KERN_SUCCESS) @@ -1702,16 +1680,15 @@ ipc_kmsg_copyin_header( */ assert(IO_VALID(dest_port)); - assert(entry->ie_bits & MACH_PORT_TYPE_RECEIVE); assert(dest_soright == IP_NULL); - kr = ipc_right_copyin(space, name, entry, + kr = ipc_right_copyin(space, name, reply_entry, reply_type, TRUE, &reply_port, &reply_soright); assert(kr == KERN_SUCCESS); assert(reply_port == dest_port); - assert(entry->ie_bits & MACH_PORT_TYPE_RECEIVE); + assert(reply_entry->ie_bits & MACH_PORT_TYPE_RECEIVE); assert(reply_soright == IP_NULL); } else if ((dest_type == MACH_MSG_TYPE_COPY_SEND) && (reply_type == MACH_MSG_TYPE_COPY_SEND)) { @@ -1720,13 +1697,13 @@ ipc_kmsg_copyin_header( * and dup the send right we get out. */ - kr = ipc_right_copyin(space, name, entry, + kr = ipc_right_copyin(space, name, dest_entry, dest_type, FALSE, &dest_port, &dest_soright); if (kr != KERN_SUCCESS) goto invalid_dest; - assert(entry->ie_bits & MACH_PORT_TYPE_SEND); + assert(dest_entry->ie_bits & MACH_PORT_TYPE_SEND); assert(dest_soright == IP_NULL); /* @@ -1746,14 +1723,16 @@ ipc_kmsg_copyin_header( * to get two send rights for the price of one. 
*/ - kr = ipc_right_copyin_two(space, name, entry, + kr = ipc_right_copyin_two(space, name, dest_entry, &dest_port, &dest_soright); if (kr != KERN_SUCCESS) goto invalid_dest; /* the entry might need to be deallocated */ - if (IE_BITS_TYPE(entry->ie_bits) == MACH_PORT_TYPE_NONE) - ipc_entry_dealloc(space, name, entry); + if (IE_BITS_TYPE(dest_entry->ie_bits) == MACH_PORT_TYPE_NONE) { + ipc_entry_dealloc(space, name, dest_entry); + dest_entry = IE_NULL; + } reply_port = dest_port; reply_soright = IP_NULL; @@ -1770,7 +1749,7 @@ ipc_kmsg_copyin_header( * and dup the send right we get out. */ - kr = ipc_right_copyin(space, name, entry, + kr = ipc_right_copyin(space, name, dest_entry, MACH_MSG_TYPE_MOVE_SEND, FALSE, &dest_port, &soright); if (kr != KERN_SUCCESS) @@ -1778,8 +1757,10 @@ ipc_kmsg_copyin_header( /* the entry might need to be deallocated */ - if (IE_BITS_TYPE(entry->ie_bits) == MACH_PORT_TYPE_NONE) - ipc_entry_dealloc(space, name, entry); + if (IE_BITS_TYPE(dest_entry->ie_bits) == MACH_PORT_TYPE_NONE) { + ipc_entry_dealloc(space, name, dest_entry); + dest_entry = IE_NULL; + } /* * It's OK if the port we got is dead now, @@ -1804,26 +1785,25 @@ ipc_kmsg_copyin_header( * to make atomic. Just copyin the destination. 
*/ - entry = ipc_entry_lookup(space, dest_name); - if (entry == IE_NULL) + dest_entry = ipc_entry_lookup(space, dest_name); + if (dest_entry == IE_NULL) goto invalid_dest; - kr = ipc_right_copyin(space, dest_name, entry, + kr = ipc_right_copyin(space, dest_name, dest_entry, dest_type, FALSE, &dest_port, &dest_soright); if (kr != KERN_SUCCESS) goto invalid_dest; /* the entry might need to be deallocated */ - - if (IE_BITS_TYPE(entry->ie_bits) == MACH_PORT_TYPE_NONE) - ipc_entry_dealloc(space, dest_name, entry); + if (IE_BITS_TYPE(dest_entry->ie_bits) == MACH_PORT_TYPE_NONE) { + ipc_entry_dealloc(space, dest_name, dest_entry); + dest_entry = IE_NULL; + } reply_port = (ipc_object_t)CAST_MACH_NAME_TO_PORT(reply_name); reply_soright = IP_NULL; } else { - ipc_entry_t dest_entry, reply_entry; - /* * This is the tough case to make atomic. * The difficult problem is serializing with port death. @@ -1856,7 +1836,7 @@ ipc_kmsg_copyin_header( * JMM - The code to handle this was too expensive and, anyway, * we intend to separate the dest lookup from the reply copyin * by a wide margin, so the user will have to learn to deal! - * I will be making the change soon! + * I will be making the change soon in rdar://problem/6275821. */ dest_entry = ipc_entry_lookup(space, dest_name); @@ -1890,29 +1870,36 @@ ipc_kmsg_copyin_header( /* the entries might need to be deallocated */ - if (IE_BITS_TYPE(reply_entry->ie_bits) == MACH_PORT_TYPE_NONE) + if (IE_BITS_TYPE(reply_entry->ie_bits) == MACH_PORT_TYPE_NONE) { ipc_entry_dealloc(space, reply_name, reply_entry); + reply_entry = IE_NULL; + } - if (IE_BITS_TYPE(dest_entry->ie_bits) == MACH_PORT_TYPE_NONE) + if (IE_BITS_TYPE(dest_entry->ie_bits) == MACH_PORT_TYPE_NONE) { ipc_entry_dealloc(space, dest_name, dest_entry); + dest_entry = IE_NULL; + } } + dest_type = ipc_object_copyin_type(dest_type); + reply_type = ipc_object_copyin_type(reply_type); + /* - * At this point, dest_port, reply_port, - * dest_soright, reply_soright are all initialized. 
- * Any defunct entries have been deallocated. - * The space is still write-locked, and we need to - * make the MACH_SEND_CANCEL check. The notify_port pointer - * is still usable, because the copyin code above won't ever - * deallocate a receive right, so its entry still exists - * and holds a ref. Note notify_port might even equal - * dest_port or reply_port. + * JMM - Without rdar://problem/6275821, this is the last place we can + * re-arm the send-possible notifications. It may trigger unexpectedly + * early (send may NOT have failed), but better than missing. */ - - if ((notify != MACH_PORT_NULL) && - (dest_soright == notify_port)) { - ipc_port_release_sonce(dest_soright); - dest_soright = IP_NULL; + if (notify && dest_type != MACH_MSG_TYPE_PORT_SEND_ONCE && + dest_entry != IE_NULL && dest_entry->ie_request != IE_REQ_NONE) { + ipc_port_t dport = (ipc_port_t)dest_port; + + assert(dport != IP_NULL); + ip_lock(dport); + if (ip_active(dport) && + dport->ip_receiver != ipc_space_kernel && ip_full(dport)) { + ipc_port_request_sparm(dport, dest_name, dest_entry->ie_request); + } + ip_unlock(dport); } is_write_unlock(space); @@ -1923,9 +1910,6 @@ ipc_kmsg_copyin_header( if (reply_soright != IP_NULL) ipc_notify_port_deleted(reply_soright, reply_name); - dest_type = ipc_object_copyin_type(dest_type); - reply_type = ipc_object_copyin_type(reply_type); - msg->msgh_bits = (MACH_MSGH_BITS_OTHER(mbits) | MACH_MSGH_BITS(dest_type, reply_type)); msg->msgh_remote_port = (ipc_port_t)dest_port; @@ -2472,7 +2456,6 @@ ipc_kmsg_copyin_body( * MACH_MSG_SUCCESS Successful copyin. * MACH_SEND_INVALID_HEADER * Illegal value in the message header bits. - * MACH_SEND_INVALID_NOTIFY Bad notify port. * MACH_SEND_INVALID_DEST Can't copyin destination port. * MACH_SEND_INVALID_REPLY Can't copyin reply port. * MACH_SEND_INVALID_MEMORY Can't grab out-of-line memory. 
@@ -2486,7 +2469,7 @@ ipc_kmsg_copyin( ipc_kmsg_t kmsg, ipc_space_t space, vm_map_t map, - mach_port_name_t notify) + boolean_t notify) { mach_msg_return_t mr; @@ -2535,7 +2518,7 @@ ipc_kmsg_copyin( * Nothing locked. */ -void +mach_msg_return_t ipc_kmsg_copyin_from_kernel( ipc_kmsg_t kmsg) { @@ -2546,6 +2529,8 @@ ipc_kmsg_copyin_from_kernel( ipc_object_t local = (ipc_object_t) kmsg->ikm_header->msgh_local_port; /* translate the destination and reply ports */ + if (!IO_VALID(remote)) + return MACH_SEND_INVALID_DEST; ipc_object_copyin_from_kernel(remote, rname); if (IO_VALID(local)) @@ -2569,7 +2554,7 @@ ipc_kmsg_copyin_from_kernel( kmsg->ikm_header->msgh_bits = bits; if ((bits & MACH_MSGH_BITS_COMPLEX) == 0) - return; + return MACH_MSG_SUCCESS; } { mach_msg_descriptor_t *saddr; @@ -2663,10 +2648,11 @@ ipc_kmsg_copyin_from_kernel( } } } + return MACH_MSG_SUCCESS; } #if IKM_SUPPORT_LEGACY -void +mach_msg_return_t ipc_kmsg_copyin_from_kernel_legacy( ipc_kmsg_t kmsg) { @@ -2677,6 +2663,8 @@ ipc_kmsg_copyin_from_kernel_legacy( ipc_object_t local = (ipc_object_t) kmsg->ikm_header->msgh_local_port; /* translate the destination and reply ports */ + if (!IO_VALID(remote)) + return MACH_SEND_INVALID_DEST; ipc_object_copyin_from_kernel(remote, rname); if (IO_VALID(local)) @@ -2700,7 +2688,7 @@ ipc_kmsg_copyin_from_kernel_legacy( kmsg->ikm_header->msgh_bits = bits; if ((bits & MACH_MSGH_BITS_COMPLEX) == 0) - return; + return MACH_MSG_SUCCESS; } { mach_msg_legacy_descriptor_t *saddr; @@ -2833,6 +2821,7 @@ ipc_kmsg_copyin_from_kernel_legacy( } } } + return MACH_MSG_SUCCESS; } #endif /* IKM_SUPPORT_LEGACY */ @@ -2845,13 +2834,6 @@ ipc_kmsg_copyin_from_kernel_legacy( * If it does succeed the remote/local port fields * contain port names instead of object pointers, * and the bits field is updated. - * - * The notify argument implements the MACH_RCV_NOTIFY option. - * If it is not MACH_PORT_NULL, it should name a receive right. 
- * If the process of receiving the reply port creates a - * new right in the receiving task, then the new right is - * automatically registered for a dead-name notification, - * with the notify port supplying the send-once right. * Conditions: * Nothing locked. * Returns: @@ -2872,14 +2854,20 @@ ipc_kmsg_copyin_from_kernel_legacy( mach_msg_return_t ipc_kmsg_copyout_header( mach_msg_header_t *msg, - ipc_space_t space, - mach_port_name_t notify) + ipc_space_t space) { mach_msg_bits_t mbits = msg->msgh_bits; ipc_port_t dest = (ipc_port_t) msg->msgh_remote_port; assert(IP_VALID(dest)); + /* + * While we still hold a reference on the received-from port, + * process all send-possible notifications we received along with + * the message. + */ + ipc_port_spnotify(dest); + { mach_msg_type_name_t dest_type = MACH_MSGH_BITS_REMOTE(mbits); mach_msg_type_name_t reply_type = MACH_MSGH_BITS_LOCAL(mbits); @@ -2887,64 +2875,27 @@ ipc_kmsg_copyout_header( mach_port_name_t dest_name, reply_name; if (IP_VALID(reply)) { - ipc_port_t notify_port; ipc_entry_t entry; kern_return_t kr; /* - * Handling notify (for MACH_RCV_NOTIFY) is tricky. - * The problem is atomically making a send-once right - * from the notify port and installing it for a - * dead-name request in the new entry, because this - * requires two port locks (on the notify port and - * the reply port). However, we can safely make - * and consume send-once rights for the notify port - * as long as we hold the space locked. This isn't - * an atomicity problem, because the only way - * to detect that a send-once right has been created - * and then consumed if it wasn't needed is by getting - * at the receive right to look at ip_sorights, and - * because the space is write-locked status calls can't - * lookup the notify port receive right. When we make - * the send-once right, we lock the notify port, - * so any status calls in progress will be done. + * Get reply port entry (if none, skip to dest port + * copyout).
This may require growing the space. */ is_write_lock(space); for (;;) { - ipc_port_request_index_t request; - if (!space->is_active) { is_write_unlock(space); return (MACH_RCV_HEADER_ERROR| MACH_MSG_IPC_SPACE); } - if (notify != MACH_PORT_NULL) { - notify_port = ipc_port_lookup_notify(space, - notify); - if (notify_port == IP_NULL) { - printf("ipc_kmsg_copyout_header: no notify port\n"); - is_write_unlock(space); - return MACH_RCV_INVALID_NOTIFY; - } - } else - notify_port = IP_NULL; - if ((reply_type != MACH_MSG_TYPE_PORT_SEND_ONCE) && ipc_right_reverse(space, (ipc_object_t) reply, &reply_name, &entry)) { /* reply port is locked and active */ - - /* - * We don't need the notify_port - * send-once right, but we can't release - * it here because reply port is locked. - * Wait until after the copyout to - * release the notify port right. - */ - assert(entry->ie_bits & MACH_PORT_TYPE_SEND_RECEIVE); break; @@ -2955,9 +2906,6 @@ ipc_kmsg_copyout_header( ip_release(reply); ip_check_unlock(reply); - if (notify_port != IP_NULL) - ipc_port_release_sonce(notify_port); - ip_lock(dest); is_write_unlock(space); @@ -2971,24 +2919,12 @@ ipc_kmsg_copyout_header( if (kr != KERN_SUCCESS) { ip_unlock(reply); - if (notify_port != IP_NULL) - ipc_port_release_sonce(notify_port); - /* space is locked */ kr = ipc_entry_grow_table(space, ITS_SIZE_NONE); if (kr != KERN_SUCCESS) { - /* space is unlocked */ - - if (kr == KERN_RESOURCE_SHORTAGE) { - printf("ipc_kmsg_copyout_header: can't grow kernel ipc space\n"); - return (MACH_RCV_HEADER_ERROR| - MACH_MSG_IPC_KERNEL); - } else { - printf("ipc_kmsg_copyout_header: can't grow user ipc space\n"); - return (MACH_RCV_HEADER_ERROR| - MACH_MSG_IPC_SPACE); - } + return (MACH_RCV_HEADER_ERROR| + MACH_MSG_IPC_SPACE); } /* space is locked again; start over */ @@ -2998,48 +2934,7 @@ ipc_kmsg_copyout_header( MACH_PORT_TYPE_NONE); assert(entry->ie_object == IO_NULL); - if (notify_port == IP_NULL) { - /* not making a dead-name request */ - - 
entry->ie_object = (ipc_object_t) reply; - break; - } - - kr = ipc_port_dnrequest(reply, reply_name, - notify_port, &request); - if (kr != KERN_SUCCESS) { - ip_unlock(reply); - - ipc_port_release_sonce(notify_port); - - ipc_entry_dealloc(space, reply_name, entry); - is_write_unlock(space); - - ip_lock(reply); - if (!ip_active(reply)) { - /* will fail next time around loop */ - - ip_unlock(reply); - is_write_lock(space); - continue; - } - - kr = ipc_port_dngrow(reply, ITS_SIZE_NONE); - /* port is unlocked */ - if (kr != KERN_SUCCESS) { - printf("ipc_kmsg_copyout_header: can't grow kernel ipc space2\n"); - return (MACH_RCV_HEADER_ERROR| - MACH_MSG_IPC_KERNEL); - } - - is_write_lock(space); - continue; - } - - notify_port = IP_NULL; /* don't release right below */ - entry->ie_object = (ipc_object_t) reply; - entry->ie_request = request; break; } @@ -3052,17 +2947,13 @@ ipc_kmsg_copyout_header( /* reply port is unlocked */ assert(kr == KERN_SUCCESS); - if (notify_port != IP_NULL) - ipc_port_release_sonce(notify_port); - ip_lock(dest); is_write_unlock(space); } else { /* * No reply port! This is an easy case. * We only need to have the space locked - * when checking notify and when locking - * the destination (to ensure atomicity). + * when locking the destination. 
*/ is_read_lock(space); @@ -3071,24 +2962,6 @@ ipc_kmsg_copyout_header( return MACH_RCV_HEADER_ERROR|MACH_MSG_IPC_SPACE; } - if (notify != MACH_PORT_NULL) { - ipc_entry_t entry; - - /* must check notify even though it won't be used */ - - if ((entry = ipc_entry_lookup(space, notify)) == IE_NULL) { - printf("ipc_kmsg_copyout_header: ipc_entry_lookup failed\n"); - is_read_unlock(space); - return MACH_RCV_INVALID_NOTIFY; - } - - if ((entry->ie_bits & MACH_PORT_TYPE_RECEIVE) == 0) { - printf("ipc_kmsg_copyout_header: MACH_PORT_TYPE_RECEIVE not set!\n"); - is_read_unlock(space); - return MACH_RCV_INVALID_NOTIFY; - } - } - ip_lock(dest); is_read_unlock(space); @@ -3704,8 +3577,6 @@ ipc_kmsg_copyout_size( * Nothing locked. * Returns: * MACH_MSG_SUCCESS Copied out all rights and memory. - * MACH_RCV_INVALID_NOTIFY Bad notify port. - * Rights and memory in the message are intact. * MACH_RCV_HEADER_ERROR + special bits * Rights and memory in the message are intact. * MACH_RCV_BODY_ERROR + special bits @@ -3718,12 +3589,11 @@ ipc_kmsg_copyout( ipc_kmsg_t kmsg, ipc_space_t space, vm_map_t map, - mach_port_name_t notify, mach_msg_body_t *slist) { mach_msg_return_t mr; - mr = ipc_kmsg_copyout_header(kmsg->ikm_header, space, notify); + mr = ipc_kmsg_copyout_header(kmsg->ikm_header, space); if (mr != MACH_MSG_SUCCESS) { return mr; } diff --git a/osfmk/ipc/ipc_kmsg.h b/osfmk/ipc/ipc_kmsg.h index 8687cafbf..6fb07b6dd 100644 --- a/osfmk/ipc/ipc_kmsg.h +++ b/osfmk/ipc/ipc_kmsg.h @@ -269,9 +269,12 @@ extern void ipc_kmsg_free( extern void ipc_kmsg_destroy( ipc_kmsg_t kmsg); -/* destroy kernel message and a reference on the dest */ -extern void ipc_kmsg_destroy_dest( - ipc_kmsg_t kmsg); +/* Enqueue kernel message for deferred destruction */ +extern boolean_t ipc_kmsg_delayed_destroy( + ipc_kmsg_t kmsg); + +/* Process all the delayed message destroys */ +extern void ipc_kmsg_reap_delayed(void); /* Preallocate a kernel message buffer */ extern ipc_kmsg_t ipc_kmsg_prealloc( @@ -321,29 
+324,28 @@ extern void ipc_kmsg_put_to_kernel( extern mach_msg_return_t ipc_kmsg_copyin_header( mach_msg_header_t *msg, ipc_space_t space, - mach_port_name_t notify); + boolean_t notify); /* Copyin port rights and out-of-line memory from a user message */ extern mach_msg_return_t ipc_kmsg_copyin( ipc_kmsg_t kmsg, ipc_space_t space, vm_map_t map, - mach_port_name_t notify); + boolean_t notify); /* Copyin port rights and out-of-line memory from a kernel message */ -extern void ipc_kmsg_copyin_from_kernel( +extern mach_msg_return_t ipc_kmsg_copyin_from_kernel( ipc_kmsg_t kmsg); #if IKM_SUPPORT_LEGACY -extern void ipc_kmsg_copyin_from_kernel_legacy( +extern mach_msg_return_t ipc_kmsg_copyin_from_kernel_legacy( ipc_kmsg_t kmsg); #endif /* Copyout port rights in the header of a message */ extern mach_msg_return_t ipc_kmsg_copyout_header( mach_msg_header_t *msg, - ipc_space_t space, - mach_port_name_t notify); + ipc_space_t space); /* Copyout a port right returning a name */ extern mach_msg_return_t ipc_kmsg_copyout_object( @@ -357,7 +359,6 @@ extern mach_msg_return_t ipc_kmsg_copyout( ipc_kmsg_t kmsg, ipc_space_t space, vm_map_t map, - mach_port_name_t notify, mach_msg_body_t *slist); /* Copyout port rights and out-of-line memory from the body of a message */ diff --git a/osfmk/ipc/ipc_labelh.c b/osfmk/ipc/ipc_labelh.c index e6763aa36..934eaf7e4 100644 --- a/osfmk/ipc/ipc_labelh.c +++ b/osfmk/ipc/ipc_labelh.c @@ -79,6 +79,9 @@ labelh_new_user(ipc_space_t space, struct label *inl, mach_port_name_t *namep) /* XXX - perform entrypoint check here? */ + /* JMM - redo as port allocation, kobject set, and then copyout */ + assert(!CONFIG_MACF_MACH); + /* * Note: the calling task will have a receive right for the port. 
* This is different from label handles that reference tasks @@ -94,6 +97,7 @@ labelh_new_user(ipc_space_t space, struct label *inl, mach_port_name_t *namep) port->ip_mscount++; port->ip_srights++; is_write_lock(space); + /* XXX - must validate space is still active and unwind if not */ entry = ipc_entry_lookup(space, *namep); if (entry != IE_NULL) entry->ie_bits |= MACH_PORT_TYPE_SEND; diff --git a/osfmk/ipc/ipc_mqueue.c b/osfmk/ipc/ipc_mqueue.c index 9d17b81b9..406b5ae93 100644 --- a/osfmk/ipc/ipc_mqueue.c +++ b/osfmk/ipc/ipc_mqueue.c @@ -949,7 +949,7 @@ ipc_mqueue_select_on_thread( * Locks may be held by callers, so this routine cannot block. * Caller holds reference on the message queue. */ -int +unsigned ipc_mqueue_peek(ipc_mqueue_t mq) { wait_queue_link_t wql; @@ -963,10 +963,7 @@ ipc_mqueue_peek(ipc_mqueue_t mq) * Don't block trying to get the lock. */ s = splsched(); - if (!imq_lock_try(mq)) { - splx(s); - return -1; - } + imq_lock(mq); /* * peek at the contained port message queues, return as soon as @@ -992,7 +989,8 @@ ipc_mqueue_peek(ipc_mqueue_t mq) /* * Routine: ipc_mqueue_destroy * Purpose: - * Destroy a message queue. Set any blocked senders running. + * Destroy a (non-set) message queue. + * Set any blocked senders running. * Destroy the kmsgs in the queue. * Conditions: * Nothing locked. @@ -1000,10 +998,11 @@ ipc_mqueue_peek(ipc_mqueue_t mq) */ void ipc_mqueue_destroy( - ipc_mqueue_t mqueue) + ipc_mqueue_t mqueue) { ipc_kmsg_queue_t kmqueue; ipc_kmsg_t kmsg; + boolean_t reap = FALSE; spl_t s; @@ -1019,19 +1018,27 @@ ipc_mqueue_destroy( THREAD_RESTART, FALSE); + /* + * Move messages from the specified queue to the per-thread + * clean/drain queue while we have the mqueue lock. 
+ */ kmqueue = &mqueue->imq_messages; - while ((kmsg = ipc_kmsg_dequeue(kmqueue)) != IKM_NULL) { - imq_unlock(mqueue); - splx(s); - - ipc_kmsg_destroy_dest(kmsg); - - s = splsched(); - imq_lock(mqueue); + boolean_t first; + first = ipc_kmsg_delayed_destroy(kmsg); + if (first) + reap = first; } + imq_unlock(mqueue); splx(s); + + /* + * Destroy the messages we enqueued if we aren't nested + * inside some other attempt to drain the same queue. + */ + if (reap) + ipc_kmsg_reap_delayed(); } /* diff --git a/osfmk/ipc/ipc_mqueue.h b/osfmk/ipc/ipc_mqueue.h index 90d3322cf..c8a3f7a2e 100644 --- a/osfmk/ipc/ipc_mqueue.h +++ b/osfmk/ipc/ipc_mqueue.h @@ -213,7 +213,7 @@ extern void ipc_mqueue_select_on_thread( thread_t thread); /* Peek into a messaqe queue to see if there are messages */ -extern int ipc_mqueue_peek( +extern unsigned ipc_mqueue_peek( ipc_mqueue_t mqueue); /* Clear a message count reservation */ diff --git a/osfmk/ipc/ipc_notify.c b/osfmk/ipc/ipc_notify.c index 25a26aa63..498401cc0 100644 --- a/osfmk/ipc/ipc_notify.c +++ b/osfmk/ipc/ipc_notify.c @@ -88,6 +88,24 @@ ipc_notify_port_deleted( /* send-once right consumed */ } +/* + * Routine: ipc_notify_send_possible + * Purpose: + * Send a send-possible notification. + * Conditions: + * Nothing locked. + * Consumes a ref/soright for port. 
+ */ + +void +ipc_notify_send_possible( + ipc_port_t port, + mach_port_name_t name) +{ + (void)mach_notify_send_possible(port, name); + /* send-once right consumed */ +} + /* * Routine: ipc_notify_port_destroyed * Purpose: diff --git a/osfmk/ipc/ipc_notify.h b/osfmk/ipc/ipc_notify.h index a2da35065..0d87a6ec0 100644 --- a/osfmk/ipc/ipc_notify.h +++ b/osfmk/ipc/ipc_notify.h @@ -75,6 +75,11 @@ extern void ipc_notify_port_deleted( ipc_port_t port, mach_port_name_t name); +/* Send a send-possible notification */ +extern void ipc_notify_send_possible( + ipc_port_t port, + mach_port_name_t name); + /* Send a port-destroyed notification */ extern void ipc_notify_port_destroyed( ipc_port_t port, diff --git a/osfmk/ipc/ipc_object.c b/osfmk/ipc/ipc_object.c index 95a1cfbf3..176e80ec8 100644 --- a/osfmk/ipc/ipc_object.c +++ b/osfmk/ipc/ipc_object.c @@ -90,6 +90,7 @@ #include #include #include +#include #include #include @@ -680,6 +681,42 @@ ipc_object_destroy( } } +/* + * Routine: ipc_object_destroy_dest + * Purpose: + * Destroys a naked capability for the destination + * of a message. Consumes a ref for the object. + * + * Conditions: + * Nothing locked.
+ */ + +void +ipc_object_destroy_dest( + ipc_object_t object, + mach_msg_type_name_t msgt_name) +{ + assert(IO_VALID(object)); + assert(io_otype(object) == IOT_PORT); + + switch (msgt_name) { + case MACH_MSG_TYPE_PORT_SEND: + ipc_port_release_send((ipc_port_t) object); + break; + + case MACH_MSG_TYPE_PORT_SEND_ONCE: + if (io_active(object) && + !ip_full_kernel((ipc_port_t) object)) + ipc_notify_send_once((ipc_port_t) object); + else + ipc_port_release_sonce((ipc_port_t) object); + break; + + default: + panic("ipc_object_destroy_dest: strange rights"); + } +} + /* * Routine: ipc_object_copyout * Purpose: @@ -1033,14 +1070,7 @@ io_free( if (otype == IOT_PORT) { port = (ipc_port_t) object; -#if MACH_ASSERT - ipc_port_track_dealloc(port); -#endif /* MACH_ASSERT */ - -#if CONFIG_MACF_MACH - /* Port label should have been initialized after creation. */ - mac_port_label_destroy(&port->ip_label); -#endif + ipc_port_finalize(port); } io_lock_destroy(object); zfree(ipc_object_zones[otype], object); diff --git a/osfmk/ipc/ipc_object.h b/osfmk/ipc/ipc_object.h index 003707f59..a813b29bf 100644 --- a/osfmk/ipc/ipc_object.h +++ b/osfmk/ipc/ipc_object.h @@ -300,6 +300,11 @@ extern void ipc_object_destroy( ipc_object_t object, mach_msg_type_name_t msgt_name); +/* Destroy a naked destination capability */ +extern void ipc_object_destroy_dest( + ipc_object_t object, + mach_msg_type_name_t msgt_name); + /* Copyout a capability, placing it into a space */ extern kern_return_t ipc_object_copyout( ipc_space_t space, diff --git a/osfmk/ipc/ipc_port.c b/osfmk/ipc/ipc_port.c index 76185c9ba..0ece0705c 100644 --- a/osfmk/ipc/ipc_port.c +++ b/osfmk/ipc/ipc_port.c @@ -139,9 +139,9 @@ ipc_port_timestamp(void) } /* - * Routine: ipc_port_dnrequest + * Routine: ipc_port_request_alloc * Purpose: - * Try to allocate a dead-name request slot. + * Try to allocate a request slot. * If successful, returns the request index. * Otherwise returns zero. 
* Conditions: @@ -152,20 +152,24 @@ ipc_port_timestamp(void) */ kern_return_t -ipc_port_dnrequest( +ipc_port_request_alloc( ipc_port_t port, mach_port_name_t name, ipc_port_t soright, + boolean_t send_possible, + boolean_t immediate, ipc_port_request_index_t *indexp) { ipc_port_request_t ipr, table; ipc_port_request_index_t index; + uintptr_t mask = 0; assert(ip_active(port)); assert(name != MACH_PORT_NULL); assert(soright != IP_NULL); - table = port->ip_dnrequests; + table = port->ip_requests; + if (table == IPR_NULL) return KERN_NO_SPACE; @@ -178,16 +182,25 @@ ipc_port_dnrequest( table->ipr_next = ipr->ipr_next; ipr->ipr_name = name; - ipr->ipr_soright = soright; + + if (send_possible) { + mask |= IPR_SOR_SPREQ_MASK; + if (immediate) { + mask |= IPR_SOR_SPARM_MASK; + port->ip_sprequests = TRUE; + } + } + ipr->ipr_soright = IPR_SOR_MAKE(soright, mask); *indexp = index; + return KERN_SUCCESS; } /* - * Routine: ipc_port_dngrow + * Routine: ipc_port_request_grow * Purpose: - * Grow a port's table of dead-name requests. + * Grow a port's table of requests. * Conditions: * The port must be locked and active. * Nothing else locked; will allocate memory. @@ -201,7 +214,7 @@ ipc_port_dnrequest( */ kern_return_t -ipc_port_dngrow( +ipc_port_request_grow( ipc_port_t port, ipc_table_elems_t target_size) { @@ -210,9 +223,9 @@ ipc_port_dngrow( assert(ip_active(port)); - otable = port->ip_dnrequests; + otable = port->ip_requests; if (otable == IPR_NULL) - its = &ipc_table_dnrequests[0]; + its = &ipc_table_requests[0]; else its = otable->ipr_size + 1; @@ -235,7 +248,7 @@ ipc_port_dngrow( ip_unlock(port); if ((its->its_size == 0) || - ((ntable = it_dnrequests_alloc(its)) == IPR_NULL)) { + ((ntable = it_requests_alloc(its)) == IPR_NULL)) { ipc_port_release(port); return KERN_RESOURCE_SHORTAGE; } @@ -246,12 +259,11 @@ ipc_port_dngrow( /* * Check that port is still active and that nobody else * has slipped in and grown the table on us. 
Note that - * just checking port->ip_dnrequests == otable isn't - * sufficient; must check ipr_size. + * just checking if the current table pointer == otable + * isn't sufficient; must check ipr_size. */ - if (ip_active(port) && - (port->ip_dnrequests == otable) && + if (ip_active(port) && (port->ip_requests == otable) && ((otable == IPR_NULL) || (otable->ipr_size+1 == its))) { ipc_table_size_t oits; ipc_table_elems_t osize, nsize; @@ -288,55 +300,125 @@ ipc_port_dngrow( ntable->ipr_next = free; ntable->ipr_size = its; - port->ip_dnrequests = ntable; + port->ip_requests = ntable; ip_unlock(port); if (otable != IPR_NULL) { - it_dnrequests_free(oits, otable); + it_requests_free(oits, otable); } } else { ip_check_unlock(port); - it_dnrequests_free(its, ntable); + it_requests_free(its, ntable); } return KERN_SUCCESS; } /* - * Routine: ipc_port_dncancel + * Routine: ipc_port_request_sparm + * Purpose: + * Arm delayed send-possible request. + * Conditions: + * The port must be locked and active. + */ + +void +ipc_port_request_sparm( + ipc_port_t port, + __assert_only mach_port_name_t name, + ipc_port_request_index_t index) +{ + if (index != IE_REQ_NONE) { + ipc_port_request_t ipr, table; + + assert(ip_active(port)); + + table = port->ip_requests; + assert(table != IPR_NULL); + + ipr = &table[index]; + assert(ipr->ipr_name == name); + + if (IPR_SOR_SPREQ(ipr->ipr_soright)) { + ipr->ipr_soright = IPR_SOR_MAKE(ipr->ipr_soright, IPR_SOR_SPARM_MASK); + port->ip_sprequests = TRUE; + } + } +} + +/* + * Routine: ipc_port_request_type * Purpose: - * Cancel a dead-name request and return the send-once right. + * Determine the type(s) of port requests enabled for a name. * Conditions: - * The port must locked and active. + * The port must be locked or inactive (to avoid table growth). + * The index must not be IE_REQ_NONE and for the name in question. 
+ */ +mach_port_type_t +ipc_port_request_type( + ipc_port_t port, + __assert_only mach_port_name_t name, + ipc_port_request_index_t index) +{ + ipc_port_request_t ipr, table; + mach_port_type_t type = 0; + + table = port->ip_requests; + assert (table != IPR_NULL); + + assert(index != IE_REQ_NONE); + ipr = &table[index]; + assert(ipr->ipr_name == name); + + if (IP_VALID(IPR_SOR_PORT(ipr->ipr_soright))) { + type |= MACH_PORT_TYPE_DNREQUEST; + + if (IPR_SOR_SPREQ(ipr->ipr_soright)) { + type |= MACH_PORT_TYPE_SPREQUEST; + + if (!IPR_SOR_SPARMED(ipr->ipr_soright)) { + type |= MACH_PORT_TYPE_SPREQUEST_DELAYED; + } else { + assert(port->ip_sprequests == TRUE); + } + } + } + return type; +} + +/* + * Routine: ipc_port_request_cancel + * Purpose: + * Cancel a dead-name/send-possible request and return the send-once right. + * Conditions: + * The port must be locked and active. + * The index must not be IE_REQ_NONE and must correspond with name. */ ipc_port_t -ipc_port_dncancel( - ipc_port_t port, +ipc_port_request_cancel( + ipc_port_t port, __assert_only mach_port_name_t name, - ipc_port_request_index_t index) + ipc_port_request_index_t index) { ipc_port_request_t ipr, table; - ipc_port_t dnrequest; + ipc_port_t request = IP_NULL; assert(ip_active(port)); - assert(name != MACH_PORT_NULL); - assert(index != 0); - - table = port->ip_dnrequests; + table = port->ip_requests; assert(table != IPR_NULL); + assert (index != IE_REQ_NONE); ipr = &table[index]; - dnrequest = ipr->ipr_soright; assert(ipr->ipr_name == name); + request = IPR_SOR_PORT(ipr->ipr_soright); /* return ipr to the free list inside the table */ - ipr->ipr_name = MACH_PORT_NULL; ipr->ipr_next = table->ipr_next; table->ipr_next = index; - return dnrequest; + return request; } /* @@ -470,7 +552,7 @@ ipc_port_init( port->ip_nsrequest = IP_NULL; port->ip_pdrequest = IP_NULL; - port->ip_dnrequests = IPR_NULL; + port->ip_requests = IPR_NULL; port->ip_pset_count = 0; port->ip_premsg = IKM_NULL; @@ -578,37 +660,105 @@
ipc_port_alloc_name( } /* - * Generate dead name notifications. Called from ipc_port_destroy. - * Port is unlocked but still has reference(s); - * dnrequests was taken from port while the port - * was locked but the port now has port->ip_dnrequests set to IPR_NULL. + * Routine: ipc_port_spnotify + * Purpose: + * Generate send-possible port notifications. + * Conditions: + * Nothing locked, reference held on port. */ void -ipc_port_dnnotify( - __unused ipc_port_t port, - ipc_port_request_t dnrequests) +ipc_port_spnotify( + ipc_port_t port) { - ipc_table_size_t its = dnrequests->ipr_size; - ipc_table_elems_t size = its->its_size; - ipc_port_request_index_t index; - - for (index = 1; index < size; index++) { - ipc_port_request_t ipr = &dnrequests[index]; - mach_port_name_t name = ipr->ipr_name; - ipc_port_t soright; + ipc_port_request_index_t index = 0; + ipc_table_elems_t size = 0; - if (name == MACH_PORT_NULL) - continue; + /* + * If the port has no send-possible request + * armed, don't bother to lock the port. + */ + if (!port->ip_sprequests) + return; - soright = ipr->ipr_soright; - assert(soright != IP_NULL); + ip_lock(port); + if (!port->ip_sprequests) { + ip_unlock(port); + return; + } + port->ip_sprequests = FALSE; - ipc_notify_dead_name(soright, name); + revalidate: + if (ip_active(port)) { + ipc_port_request_t requests; + + /* table may change each time port unlocked (reload) */ + requests = port->ip_requests; + assert(requests != IPR_NULL); + + /* + * no need to go beyond table size when first + * we entered - those are future notifications. 
+ */ + if (size == 0) + size = requests->ipr_size->its_size; + + /* no need to backtrack either */ + while (++index < size) { + ipc_port_request_t ipr = &requests[index]; + mach_port_name_t name = ipr->ipr_name; + ipc_port_t soright = IPR_SOR_PORT(ipr->ipr_soright); + boolean_t armed = IPR_SOR_SPARMED(ipr->ipr_soright); + + if (MACH_PORT_VALID(name) && armed && IP_VALID(soright)) { + /* claim send-once right - slot still inuse */ + ipr->ipr_soright = IP_NULL; + ip_unlock(port); + + ipc_notify_send_possible(soright, name); + + ip_lock(port); + goto revalidate; + } + } } + ip_unlock(port); +} - it_dnrequests_free(its, dnrequests); +/* + * Routine: ipc_port_dnnotify + * Purpose: + * Generate dead name notifications for + * all outstanding dead-name and send- + * possible requests. + * Conditions: + * Nothing locked. + * Port must be inactive. + * Reference held on port. + */ +void +ipc_port_dnnotify( + ipc_port_t port) +{ + ipc_port_request_t requests = port->ip_requests; + + assert(!ip_active(port)); + if (requests != IPR_NULL) { + ipc_table_size_t its = requests->ipr_size; + ipc_table_elems_t size = its->its_size; + ipc_port_request_index_t index; + for (index = 1; index < size; index++) { + ipc_port_request_t ipr = &requests[index]; + mach_port_name_t name = ipr->ipr_name; + ipc_port_t soright = IPR_SOR_PORT(ipr->ipr_soright); + + if (MACH_PORT_VALID(name) && IP_VALID(soright)) { + ipc_notify_dead_name(soright, name); + } + } + } } + /* * Routine: ipc_port_destroy * Purpose: @@ -629,7 +779,6 @@ ipc_port_destroy( ipc_port_t pdrequest, nsrequest; ipc_mqueue_t mqueue; ipc_kmsg_t kmsg; - ipc_port_request_t dnrequests; assert(ip_active(port)); /* port->ip_receiver_name is garbage */ @@ -659,10 +808,6 @@ ipc_port_destroy( port->ip_object.io_bits &= ~IO_BITS_ACTIVE; port->ip_timestamp = ipc_port_timestamp(); - /* save for later */ - dnrequests = port->ip_dnrequests; - port->ip_dnrequests = IPR_NULL; - /* * If the port has a preallocated message buffer and that buffer * is 
not inuse, free it. If it has an inuse one, then the kmsg @@ -679,7 +824,6 @@ ipc_port_destroy( ip_unlock(port); /* throw away no-senders request */ - nsrequest = port->ip_nsrequest; if (nsrequest != IP_NULL) ipc_notify_send_once(nsrequest); /* consumes ref */ @@ -689,9 +833,7 @@ ipc_port_destroy( ipc_mqueue_destroy(mqueue); /* generate dead-name notifications */ - if (dnrequests != IPR_NULL) { - ipc_port_dnnotify(port, dnrequests); - } + ipc_port_dnnotify(port); ipc_kobject_destroy(port); @@ -1001,7 +1143,7 @@ ipc_port_copyout_send( /* * Routine: ipc_port_release_send * Purpose: - * Release a (valid) naked send right. + * Release a naked send right. * Consumes a ref for the port. * Conditions: * Nothing locked. @@ -1014,7 +1156,8 @@ ipc_port_release_send( ipc_port_t nsrequest = IP_NULL; mach_port_mscount_t mscount; - assert(IP_VALID(port)); + if (!IP_VALID(port)) + return; ip_lock(port); ip_release(port); @@ -1049,7 +1192,8 @@ ipc_port_t ipc_port_make_sonce( ipc_port_t port) { - assert(IP_VALID(port)); + if (!IP_VALID(port)) + return port; ip_lock(port); assert(ip_active(port)); @@ -1078,7 +1222,8 @@ void ipc_port_release_sonce( ipc_port_t port) { - assert(IP_VALID(port)); + if (!IP_VALID(port)) + return; ip_lock(port); @@ -1111,7 +1256,8 @@ ipc_port_release_receive( { ipc_port_t dest; - assert(IP_VALID(port)); + if (!IP_VALID(port)) + return; ip_lock(port); assert(ip_active(port)); @@ -1200,6 +1346,37 @@ ipc_port_dealloc_special( ipc_port_destroy(port); } +/* + * Routine: ipc_port_finalize + * Purpose: + * Called on last reference deallocate to + * free any remaining data associated with the + * port. + * Conditions: + * Nothing locked. 
+ */ +void +ipc_port_finalize( + ipc_port_t port) +{ + ipc_port_request_t requests = port->ip_requests; + + assert(!ip_active(port)); + if (requests != IPR_NULL) { + ipc_table_size_t its = requests->ipr_size; + it_requests_free(its, requests); + port->ip_requests = IPR_NULL; + } + +#if MACH_ASSERT + ipc_port_track_dealloc(port); +#endif /* MACH_ASSERT */ + +#if CONFIG_MACF_MACH + /* Port label should have been initialized after creation. */ + mac_port_label_destroy(&port->ip_label); +#endif +} #if MACH_ASSERT #include @@ -1314,6 +1491,7 @@ ipc_port_track_dealloc( } #endif + #endif /* MACH_ASSERT */ @@ -1396,7 +1574,7 @@ ipc_port_print( iprintf("nsrequest=0x%x", port->ip_nsrequest); printf(", pdrequest=0x%x", port->ip_pdrequest); - printf(", dnrequests=0x%x\n", port->ip_dnrequests); + printf(", requests=0x%x\n", port->ip_requests); iprintf("pset_count=0x%x", port->ip_pset_count); printf(", seqno=%d", port->ip_messages.imq_seqno); diff --git a/osfmk/ipc/ipc_port.h b/osfmk/ipc/ipc_port.h index 4998a84bc..34aab79d8 100644 --- a/osfmk/ipc/ipc_port.h +++ b/osfmk/ipc/ipc_port.h @@ -127,13 +127,15 @@ struct ipc_port { } data; ipc_kobject_t ip_kobject; + mach_port_mscount_t ip_mscount; mach_port_rights_t ip_srights; mach_port_rights_t ip_sorights; struct ipc_port *ip_nsrequest; struct ipc_port *ip_pdrequest; - struct ipc_port_request *ip_dnrequests; + struct ipc_port_request *ip_requests; + boolean_t ip_sprequests; unsigned int ip_pset_count; struct ipc_kmsg *ip_premsg; @@ -191,6 +193,9 @@ struct ipc_port { #define ip_kotype(port) io_kotype(&(port)->ip_object) +#define ip_full_kernel(port) imq_full_kernel(&(port)->ip_messages) +#define ip_full(port) imq_full(&(port)->ip_messages) + /* * JMM - Preallocation flag * This flag indicates that there is a message buffer preallocated for this @@ -215,7 +220,7 @@ MACRO_BEGIN \ (port)->ip_premsg = IKM_NULL; \ MACRO_END - +/* JMM - address alignment/packing for LP64 */ struct ipc_port_request { union { struct ipc_port *port; @@ -234,6 
+239,17 @@ struct ipc_port_request { #define ipr_soright notify.port #define ipr_name name.name +/* + * Use the low bits in the ipr_soright to specify the request type + */ +#define IPR_SOR_SPARM_MASK 1 /* send-possible armed */ +#define IPR_SOR_SPREQ_MASK 2 /* send-possible requested */ +#define IPR_SOR_SPBIT_MASK 3 /* combo */ +#define IPR_SOR_SPARMED(sor) (((uintptr_t)(sor) & IPR_SOR_SPARM_MASK) != 0) +#define IPR_SOR_SPREQ(sor) (((uintptr_t)(sor) & IPR_SOR_SPREQ_MASK) != 0) +#define IPR_SOR_PORT(sor) ((ipc_port_t)((uintptr_t)(sor) & ~IPR_SOR_SPBIT_MASK)) +#define IPR_SOR_MAKE(p,m) ((ipc_port_t)((uintptr_t)(p) | (m))) + extern lck_grp_t ipc_lck_grp; extern lck_attr_t ipc_lck_attr; @@ -297,32 +313,47 @@ extern ipc_port_timestamp_t ipc_port_timestamp(void); MACH_PORT_RIGHT_SEND, \ (ipc_object_t *) (portp)) -/* Allocate a dead-name request slot */ +/* Allocate a notification request slot */ extern kern_return_t -ipc_port_dnrequest( +ipc_port_request_alloc( ipc_port_t port, mach_port_name_t name, ipc_port_t soright, + boolean_t send_possible, + boolean_t immediate, ipc_port_request_index_t *indexp); -/* Grow a port's table of dead-name requests */ -extern kern_return_t ipc_port_dngrow( +/* Grow one of a port's tables of notification requests */ +extern kern_return_t ipc_port_request_grow( ipc_port_t port, ipc_table_elems_t target_size); -/* Cancel a dead-name request and return the send-once right */ -extern ipc_port_t ipc_port_dncancel( +/* Return the type(s) of notification requests outstanding */ +extern mach_port_type_t ipc_port_request_type( ipc_port_t port, mach_port_name_t name, ipc_port_request_index_t index); -#define ipc_port_dnrename(port, index, oname, nname) \ +/* Cancel a notification request and return the send-once right */ +extern ipc_port_t ipc_port_request_cancel( + ipc_port_t port, + mach_port_name_t name, + ipc_port_request_index_t index); + +/* Arm any delayed send-possible notification */ +extern void ipc_port_request_sparm( + ipc_port_t port,
+ mach_port_name_t name, + ipc_port_request_index_t index); + +/* Macros for manipulating a port's dead name notification requests */ +#define ipc_port_request_rename(port, index, oname, nname) \ MACRO_BEGIN \ ipc_port_request_t ipr, table; \ \ assert(ip_active(port)); \ \ - table = port->ip_dnrequests; \ + table = port->ip_requests; \ assert(table != IPR_NULL); \ \ ipr = &table[index]; \ @@ -331,6 +362,7 @@ MACRO_BEGIN \ ipr->ipr_name = nname; \ MACRO_END + /* Make a port-deleted request */ extern void ipc_port_pdrequest( ipc_port_t port, @@ -375,8 +407,11 @@ extern kern_return_t ipc_port_alloc_name( /* Generate dead name notifications */ extern void ipc_port_dnnotify( - ipc_port_t port, - ipc_port_request_t dnrequests); + ipc_port_t port); + +/* Generate send-possible notifications */ +extern void ipc_port_spnotify( + ipc_port_t port); /* Destroy a port */ extern void ipc_port_destroy( @@ -435,6 +470,10 @@ extern void ipc_port_release_sonce( extern void ipc_port_release_receive( ipc_port_t port); +/* finalize the destruction of a port before it gets freed */ +extern void ipc_port_finalize( + ipc_port_t port); + /* Allocate a port in a special space */ extern ipc_port_t ipc_port_alloc_special( ipc_space_t space); diff --git a/osfmk/ipc/ipc_pset.c b/osfmk/ipc/ipc_pset.c index 4e0dd2b68..462527119 100644 --- a/osfmk/ipc/ipc_pset.c +++ b/osfmk/ipc/ipc_pset.c @@ -307,7 +307,7 @@ static int filt_machportattach(struct knote *kn); static void filt_machportdetach(struct knote *kn); static int filt_machport(struct knote *kn, long hint); static void filt_machporttouch(struct knote *kn, struct kevent64_s *kev, long type); -static int filt_machportpeek(struct knote *kn); +static unsigned filt_machportpeek(struct knote *kn); struct filterops machport_filtops = { .f_attach = filt_machportattach, .f_detach = filt_machportdetach, @@ -515,7 +515,7 @@ filt_machporttouch(struct knote *kn, struct kevent64_s *kev, long type) * will catch changes in this status when the event gets 
posted * up to the knote's kqueue). */ -static int +static unsigned filt_machportpeek(struct knote *kn) { ipc_pset_t pset = kn->kn_ptr.p_pset; diff --git a/osfmk/ipc/ipc_right.c b/osfmk/ipc/ipc_right.c index e7ffd94ec..d3db278b8 100644 --- a/osfmk/ipc/ipc_right.c +++ b/osfmk/ipc/ipc_right.c @@ -243,12 +243,6 @@ ipc_right_reverse( * registered send-once right. If notify is IP_NULL, * just cancels the previously registered request. * - * This interacts with the IE_BITS_COMPAT, because they - * both use ie_request. If this is a compat entry, then - * previous always gets IP_NULL. If notify is IP_NULL, - * then the entry remains a compat entry. Otherwise - * the real dead-name request is registered and the entry - * is no longer a compat entry. * Conditions: * Nothing locked. May allocate memory. * Only consumes/returns refs if successful. @@ -265,28 +259,39 @@ ipc_right_reverse( */ kern_return_t -ipc_right_dnrequest( +ipc_right_request_alloc( ipc_space_t space, mach_port_name_t name, boolean_t immediate, + boolean_t send_possible, ipc_port_t notify, ipc_port_t *previousp) { - ipc_port_t previous; + ipc_port_request_index_t prev_request; + ipc_port_t previous = IP_NULL; + ipc_entry_t entry; + kern_return_t kr; for (;;) { - ipc_entry_t entry; - ipc_entry_bits_t bits; - kern_return_t kr; - kr = ipc_right_lookup_write(space, name, &entry); if (kr != KERN_SUCCESS) return kr; + /* space is write-locked and active */ - bits = entry->ie_bits; - if (bits & MACH_PORT_TYPE_PORT_RIGHTS) { + + prev_request = entry->ie_request; + + /* if nothing to do or undo, we're done */ + if (notify == IP_NULL && prev_request == IE_REQ_NONE) { + is_write_unlock(space); + *previousp = IP_NULL; + return KERN_SUCCESS; + } + + /* see if the entry is of proper type for requests */ + if (entry->ie_bits & MACH_PORT_TYPE_PORT_RIGHTS) { + ipc_port_request_index_t new_request; ipc_port_t port; - ipc_port_request_index_t request; port = (ipc_port_t) entry->ie_object; assert(port != IP_NULL); @@ -294,70 
+299,71 @@ ipc_right_dnrequest( if (!ipc_right_check(space, port, name, entry)) { /* port is locked and active */ + /* if no new request, just cancel previous */ if (notify == IP_NULL) { - previous = ipc_right_dncancel_macro( - space, port, name, entry); - + if (prev_request != IE_REQ_NONE) + previous = ipc_port_request_cancel(port, name, prev_request); ip_unlock(port); + entry->ie_request = IE_REQ_NONE; is_write_unlock(space); break; } /* - * If a registered soright exists, - * want to atomically switch with it. - * If ipc_port_dncancel finds us a - * soright, then the following - * ipc_port_dnrequest will reuse - * that slot, so we are guaranteed - * not to unlock and retry. + * send-once rights, kernel objects, and non-full other queues + * fire immediately (if immediate specified). */ + if (send_possible && immediate && + ((entry->ie_bits & MACH_PORT_TYPE_SEND_ONCE) || + port->ip_receiver == ipc_space_kernel || !ip_full(port))) { + if (prev_request != IE_REQ_NONE) + previous = ipc_port_request_cancel(port, name, prev_request); + ip_unlock(port); + entry->ie_request = IE_REQ_NONE; + is_write_unlock(space); - previous = ipc_right_dncancel_macro(space, - port, name, entry); + ipc_notify_send_possible(notify, name); + break; + } - kr = ipc_port_dnrequest(port, name, notify, - &request); + /* + * If there is a previous request, free it. Any subsequent + * allocation cannot fail, thus assuring an atomic swap. 
+ */ + if (prev_request != IE_REQ_NONE) + previous = ipc_port_request_cancel(port, name, prev_request); + + kr = ipc_port_request_alloc(port, name, notify, + send_possible, immediate, + &new_request); if (kr != KERN_SUCCESS) { assert(previous == IP_NULL); is_write_unlock(space); - kr = ipc_port_dngrow(port, - ITS_SIZE_NONE); + kr = ipc_port_request_grow(port, ITS_SIZE_NONE); /* port is unlocked */ + if (kr != KERN_SUCCESS) return kr; continue; } - assert(request != 0); + assert(new_request != IE_REQ_NONE); ip_unlock(port); - - entry->ie_request = request; + entry->ie_request = new_request; is_write_unlock(space); break; - } else { - - /* - * Our capability bits were changed by ipc_right_check - * because it found an inactive port and removed our - * references to it (converting our entry into a dead - * one). Reload the bits (and obviously we can't use - * the port name anymore). - */ - bits = entry->ie_bits; - } + /* entry may have changed to dead-name by ipc_right_check() */ - assert(bits & MACH_PORT_TYPE_DEAD_NAME); } - if ((bits & MACH_PORT_TYPE_DEAD_NAME) && - immediate && (notify != IP_NULL)) { - mach_port_urefs_t urefs = IE_BITS_UREFS(bits); + /* treat send_possible requests as immediate w.r.t. dead-name */ + if ((send_possible || immediate) && notify != IP_NULL && + (entry->ie_bits & MACH_PORT_TYPE_DEAD_NAME)) { + mach_port_urefs_t urefs = IE_BITS_UREFS(entry->ie_bits); - assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_DEAD_NAME); assert(urefs > 0); if (MACH_PORT_UREFS_OVERFLOW(urefs, 1)) { @@ -374,7 +380,7 @@ ipc_right_dnrequest( } is_write_unlock(space); - if (bits & MACH_PORT_TYPE_PORT_OR_DEAD) + if (entry->ie_bits & MACH_PORT_TYPE_PORT_OR_DEAD) return KERN_INVALID_ARGUMENT; else return KERN_INVALID_RIGHT; @@ -385,9 +391,9 @@ ipc_right_dnrequest( } /* - * Routine: ipc_right_dncancel + * Routine: ipc_right_request_cancel * Purpose: - * Cancel a dead-name request and return the send-once right. + * Cancel a notification request and return the send-once right. 
* Afterwards, entry->ie_request == 0. * Conditions: * The space must be write-locked; the port must be locked. @@ -395,21 +401,23 @@ ipc_right_dnrequest( */ ipc_port_t -ipc_right_dncancel( +ipc_right_request_cancel( __unused ipc_space_t space, ipc_port_t port, mach_port_name_t name, ipc_entry_t entry) { - ipc_port_t dnrequest; + ipc_port_t previous; assert(ip_active(port)); assert(port == (ipc_port_t) entry->ie_object); - dnrequest = ipc_port_dncancel(port, name, entry->ie_request); - entry->ie_request = 0; + if (entry->ie_request == IE_REQ_NONE) + return IP_NULL; - return dnrequest; + previous = ipc_port_request_cancel(port, name, entry->ie_request); + entry->ie_request = IE_REQ_NONE; + return previous; } /* @@ -478,8 +486,6 @@ ipc_right_check( } - ipc_port_release(port); - /* convert entry to dead name */ if ((bits & MACH_PORT_TYPE_SEND) && !(bits & MACH_PORT_TYPE_RECEIVE)) @@ -489,27 +495,32 @@ ipc_right_check( /* * If there was a notification request outstanding on this - * name, and since the port went dead, that notification - * must already be on its way up from the port layer. We - * don't need the index of the notification port anymore. + * name, and the port went dead, that notification + * must already be on its way up from the port layer. + * + * Add the reference that the notification carries. It + * is done here, and not in the notification delivery, + * because the latter doesn't have a space reference and + * trying to actually move a send-right reference would + * get short-circuited into a MACH_PORT_DEAD by IPC. Since + * all calls that deal with the right eventually come + * through here, it has the same result. * - * JMM - We also add a reference to the entry since the - * notification only carries the name and NOT a reference - * (or right). This makes for pretty loose reference - * counting, since it is only happenstance that we - * detected the notification in progress like this. - * But most (all?) 
calls that try to deal with this entry - * will also come through here, so the reference gets added - * before the entry gets used eventually (I would rather it - * be explicit in the notification generation, though) + * Once done, clear the request index so we only account + * for it once. */ - if (entry->ie_request != 0) { - assert(IE_BITS_UREFS(bits) < MACH_PORT_UREFS_MAX); - entry->ie_request = 0; - bits++; + if (entry->ie_request != IE_REQ_NONE) { + if (ipc_port_request_type(port, name, entry->ie_request) != 0) { + assert(IE_BITS_UREFS(bits) < MACH_PORT_UREFS_MAX); + bits++; + } + entry->ie_request = IE_REQ_NONE; } entry->ie_bits = bits; entry->ie_object = IO_NULL; + + ipc_port_release(port); + return TRUE; } @@ -548,14 +559,14 @@ ipc_right_clean( switch (type) { case MACH_PORT_TYPE_DEAD_NAME: - assert(entry->ie_request == 0); + assert(entry->ie_request == IE_REQ_NONE); assert(entry->ie_object == IO_NULL); break; case MACH_PORT_TYPE_PORT_SET: { ipc_pset_t pset = (ipc_pset_t) entry->ie_object; - assert(entry->ie_request == 0); + assert(entry->ie_request == IE_REQ_NONE); assert(pset != IPS_NULL); ips_lock(pset); @@ -570,7 +581,7 @@ ipc_right_clean( case MACH_PORT_TYPE_SEND_RECEIVE: case MACH_PORT_TYPE_SEND_ONCE: { ipc_port_t port = (ipc_port_t) entry->ie_object; - ipc_port_t dnrequest; + ipc_port_t request; ipc_port_t nsrequest = IP_NULL; mach_port_mscount_t mscount = 0; @@ -583,7 +594,7 @@ ipc_right_clean( break; } - dnrequest = ipc_right_dncancel_macro(space, port, + request = ipc_right_request_cancel_macro(space, port, name, entry); if (type & MACH_PORT_TYPE_SEND) { @@ -619,8 +630,8 @@ ipc_right_clean( if (nsrequest != IP_NULL) ipc_notify_no_senders(nsrequest, mscount); - if (dnrequest != IP_NULL) - ipc_notify_port_deleted(dnrequest, name); + if (request != IP_NULL) + ipc_notify_port_deleted(request, name); break; } @@ -657,7 +668,7 @@ ipc_right_destroy( switch (type) { case MACH_PORT_TYPE_DEAD_NAME: - assert(entry->ie_request == 0); + 
assert(entry->ie_request == IE_REQ_NONE); assert(entry->ie_object == IO_NULL); ipc_entry_dealloc(space, name, entry); @@ -666,7 +677,7 @@ ipc_right_destroy( case MACH_PORT_TYPE_PORT_SET: { ipc_pset_t pset = (ipc_pset_t) entry->ie_object; - assert(entry->ie_request == 0); + assert(entry->ie_request == IE_REQ_NONE); assert(pset != IPS_NULL); entry->ie_object = IO_NULL; @@ -686,7 +697,7 @@ ipc_right_destroy( ipc_port_t port = (ipc_port_t) entry->ie_object; ipc_port_t nsrequest = IP_NULL; mach_port_mscount_t mscount = 0; - ipc_port_t dnrequest; + ipc_port_t request; assert(port != IP_NULL); @@ -701,14 +712,14 @@ ipc_right_destroy( ip_release(port); ip_check_unlock(port); - entry->ie_request = 0; + entry->ie_request = IE_REQ_NONE; entry->ie_object = IO_NULL; ipc_entry_dealloc(space, name, entry); break; } - dnrequest = ipc_right_dncancel_macro(space, port, name, entry); + request = ipc_right_request_cancel_macro(space, port, name, entry); entry->ie_object = IO_NULL; ipc_entry_dealloc(space, name, entry); @@ -745,8 +756,8 @@ ipc_right_destroy( if (nsrequest != IP_NULL) ipc_notify_no_senders(nsrequest, mscount); - if (dnrequest != IP_NULL) - ipc_notify_port_deleted(dnrequest, name); + if (request != IP_NULL) + ipc_notify_port_deleted(request, name); break; } @@ -792,7 +803,7 @@ ipc_right_dealloc( dead_name: assert(IE_BITS_UREFS(bits) > 0); - assert(entry->ie_request == 0); + assert(entry->ie_request == IE_REQ_NONE); assert(entry->ie_object == IO_NULL); if (IE_BITS_UREFS(bits) == 1) { @@ -806,7 +817,7 @@ ipc_right_dealloc( } case MACH_PORT_TYPE_SEND_ONCE: { - ipc_port_t port, dnrequest; + ipc_port_t port, request; assert(IE_BITS_UREFS(bits) == 1); @@ -823,7 +834,7 @@ ipc_right_dealloc( assert(port->ip_sorights > 0); - dnrequest = ipc_right_dncancel_macro(space, port, name, entry); + request = ipc_right_request_cancel_macro(space, port, name, entry); ip_unlock(port); entry->ie_object = IO_NULL; @@ -833,14 +844,14 @@ ipc_right_dealloc( ipc_notify_send_once(port); - if 
(dnrequest != IP_NULL) - ipc_notify_port_deleted(dnrequest, name); + if (request != IP_NULL) + ipc_notify_port_deleted(request, name); break; } case MACH_PORT_TYPE_SEND: { ipc_port_t port; - ipc_port_t dnrequest = IP_NULL; + ipc_port_t request = IP_NULL; ipc_port_t nsrequest = IP_NULL; mach_port_mscount_t mscount = 0; @@ -868,7 +879,7 @@ ipc_right_dealloc( } } - dnrequest = ipc_right_dncancel_macro(space, port, + request = ipc_right_request_cancel_macro(space, port, name, entry); ipc_hash_delete(space, (ipc_object_t) port, name, entry); @@ -887,8 +898,8 @@ ipc_right_dealloc( if (nsrequest != IP_NULL) ipc_notify_no_senders(nsrequest, mscount); - if (dnrequest != IP_NULL) - ipc_notify_port_deleted(dnrequest, name); + if (request != IP_NULL) + ipc_notify_port_deleted(request, name); break; } @@ -988,7 +999,7 @@ ipc_right_delta( assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_PORT_SET); assert(IE_BITS_UREFS(bits) == 0); - assert(entry->ie_request == 0); + assert(entry->ie_request == IE_REQ_NONE); if (delta == 0) goto success; @@ -1015,7 +1026,7 @@ ipc_right_delta( case MACH_PORT_RIGHT_RECEIVE: { ipc_port_t port; - ipc_port_t dnrequest = IP_NULL; + ipc_port_t request = IP_NULL; if ((bits & MACH_PORT_TYPE_RECEIVE) == 0) goto invalid_right; @@ -1047,26 +1058,43 @@ ipc_right_delta( assert(IE_BITS_UREFS(bits) < MACH_PORT_UREFS_MAX); assert(port->ip_srights > 0); - /* - * The remaining send right turns into a - * dead name. Notice we don't decrement - * ip_srights, generate a no-senders notif, - * or use ipc_right_dncancel, because the - * port is destroyed "first". - */ - bits &= ~IE_BITS_TYPE_MASK; - bits |= MACH_PORT_TYPE_DEAD_NAME; - if (entry->ie_request) { - entry->ie_request = 0; - bits++; + if (port->ip_pdrequest != NULL) { + /* + * Since another task has requested a + * destroy notification for this port, it + * isn't actually being destroyed - the receive + * right is just being moved to another task. 
+ * Since we still have one or more send rights, + * we need to record the loss of the receive + * right and enter the remaining send right + * into the hash table. + */ + entry->ie_bits &= ~MACH_PORT_TYPE_RECEIVE; + ipc_hash_insert(space, (ipc_object_t) port, + name, entry); + ip_reference(port); + } else { + /* + * The remaining send right turns into a + * dead name. Notice we don't decrement + * ip_srights, generate a no-senders notif, + * or use ipc_right_dncancel, because the + * port is destroyed "first". + */ + bits &= ~IE_BITS_TYPE_MASK; + bits |= MACH_PORT_TYPE_DEAD_NAME; + if (entry->ie_request) { + entry->ie_request = IE_REQ_NONE; + bits++; + } + entry->ie_bits = bits; + entry->ie_object = IO_NULL; } - entry->ie_bits = bits; - entry->ie_object = IO_NULL; } else { assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_RECEIVE); assert(IE_BITS_UREFS(bits) == 0); - dnrequest = ipc_right_dncancel_macro(space, port, + request = ipc_right_request_cancel_macro(space, port, name, entry); entry->ie_object = IO_NULL; ipc_entry_dealloc(space, name, entry); @@ -1076,13 +1104,13 @@ ipc_right_delta( ipc_port_clear_receiver(port); ipc_port_destroy(port); /* consumes ref, unlocks */ - if (dnrequest != IP_NULL) - ipc_notify_port_deleted(dnrequest, name); + if (request != IP_NULL) + ipc_notify_port_deleted(request, name); break; } case MACH_PORT_RIGHT_SEND_ONCE: { - ipc_port_t port, dnrequest; + ipc_port_t port, request; if ((bits & MACH_PORT_TYPE_SEND_ONCE) == 0) goto invalid_right; @@ -1111,7 +1139,7 @@ ipc_right_delta( goto success; } - dnrequest = ipc_right_dncancel_macro(space, port, name, entry); + request = ipc_right_request_cancel_macro(space, port, name, entry); ip_unlock(port); entry->ie_object = IO_NULL; @@ -1121,8 +1149,8 @@ ipc_right_delta( ipc_notify_send_once(port); - if (dnrequest != IP_NULL) - ipc_notify_port_deleted(dnrequest, name); + if (request != IP_NULL) + ipc_notify_port_deleted(request, name); break; } @@ -1147,7 +1175,7 @@ ipc_right_delta( 
assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_DEAD_NAME); assert(IE_BITS_UREFS(bits) > 0); assert(entry->ie_object == IO_NULL); - assert(entry->ie_request == 0); + assert(entry->ie_request == IE_REQ_NONE); urefs = IE_BITS_UREFS(bits); if (MACH_PORT_UREFS_UNDERFLOW(urefs, delta)) @@ -1169,7 +1197,7 @@ ipc_right_delta( case MACH_PORT_RIGHT_SEND: { mach_port_urefs_t urefs; ipc_port_t port; - ipc_port_t dnrequest = IP_NULL; + ipc_port_t request = IP_NULL; ipc_port_t nsrequest = IP_NULL; mach_port_mscount_t mscount = 0; @@ -1220,7 +1248,7 @@ ipc_right_delta( assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_SEND); - dnrequest = ipc_right_dncancel_macro(space, port, + request = ipc_right_request_cancel_macro(space, port, name, entry); ipc_hash_delete(space, (ipc_object_t) port, name, entry); @@ -1240,8 +1268,8 @@ ipc_right_delta( if (nsrequest != IP_NULL) ipc_notify_no_senders(nsrequest, mscount); - if (dnrequest != IP_NULL) - ipc_notify_port_deleted(dnrequest, name); + if (request != IP_NULL) + ipc_notify_port_deleted(request, name); break; } @@ -1287,27 +1315,42 @@ ipc_right_info( mach_port_type_t *typep, mach_port_urefs_t *urefsp) { + ipc_port_t port; ipc_entry_bits_t bits; - mach_port_type_t type; + mach_port_type_t type = 0; ipc_port_request_index_t request; bits = entry->ie_bits; + request = entry->ie_request; + port = (ipc_port_t) entry->ie_object; - if (bits & MACH_PORT_TYPE_SEND_RIGHTS) { - ipc_port_t port = (ipc_port_t) entry->ie_object; + if (bits & MACH_PORT_TYPE_RECEIVE) { + assert(IP_VALID(port)); - if (ipc_right_check(space, port, name, entry)) { + if (request != IE_REQ_NONE) { + ip_lock(port); + assert(ip_active(port)); + type |= ipc_port_request_type(port, name, request); + ip_unlock(port); + } + + } else if (bits & MACH_PORT_TYPE_SEND_RIGHTS) { + /* + * validate port is still alive - if so, get request + * types while we still have it locked. Otherwise, + * recapture the (now dead) bits. 
+ */ + if (!ipc_right_check(space, port, name, entry)) { + if (request != IE_REQ_NONE) + type |= ipc_port_request_type(port, name, request); + ip_unlock(port); + } else { bits = entry->ie_bits; assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_DEAD_NAME); - } else - ip_unlock(port); + } } - type = IE_BITS_TYPE(bits); - request = entry->ie_request; - - if (request != 0) - type |= MACH_PORT_TYPE_DNREQUEST; + type |= IE_BITS_TYPE(bits); *typep = type; *urefsp = IE_BITS_UREFS(bits); @@ -1564,7 +1607,7 @@ ipc_right_copyin( case MACH_MSG_TYPE_MOVE_RECEIVE: { ipc_port_t port; - ipc_port_t dnrequest = IP_NULL; + ipc_port_t request = IP_NULL; if ((bits & MACH_PORT_TYPE_RECEIVE) == 0) goto invalid_right; @@ -1601,7 +1644,7 @@ ipc_right_copyin( assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_RECEIVE); assert(IE_BITS_UREFS(bits) == 0); - dnrequest = ipc_right_dncancel_macro(space, port, + request = ipc_right_request_cancel_macro(space, port, name, entry); entry->ie_object = IO_NULL; } @@ -1614,7 +1657,7 @@ ipc_right_copyin( ip_unlock(port); *objectp = (ipc_object_t) port; - *sorightp = dnrequest; + *sorightp = request; break; } @@ -1671,7 +1714,7 @@ ipc_right_copyin( case MACH_MSG_TYPE_MOVE_SEND: { ipc_port_t port; - ipc_port_t dnrequest = IP_NULL; + ipc_port_t request = IP_NULL; if (bits & MACH_PORT_TYPE_DEAD_NAME) goto move_dead; @@ -1725,7 +1768,7 @@ ipc_right_copyin( assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_SEND); - dnrequest = ipc_right_dncancel_macro(space, port, + request = ipc_right_request_cancel_macro(space, port, name, entry); ipc_hash_delete(space, (ipc_object_t) port, name, entry); @@ -1742,13 +1785,13 @@ ipc_right_copyin( ip_unlock(port); *objectp = (ipc_object_t) port; - *sorightp = dnrequest; + *sorightp = request; break; } case MACH_MSG_TYPE_MOVE_SEND_ONCE: { ipc_port_t port; - ipc_port_t dnrequest; + ipc_port_t request; if (bits & MACH_PORT_TYPE_DEAD_NAME) goto move_dead; @@ -1792,7 +1835,7 @@ ipc_right_copyin( assert(IE_BITS_UREFS(bits) == 1); 
assert(port->ip_sorights > 0); - dnrequest = ipc_right_dncancel_macro(space, port, name, entry); + request = ipc_right_request_cancel_macro(space, port, name, entry); ip_unlock(port); entry->ie_object = IO_NULL; @@ -1800,7 +1843,7 @@ ipc_right_copyin( (IE_BITS_UREFS_MASK | MACH_PORT_TYPE_SEND_ONCE); *objectp = (ipc_object_t) port; - *sorightp = dnrequest; + *sorightp = request; break; } @@ -1814,7 +1857,7 @@ ipc_right_copyin( copy_dead: assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_DEAD_NAME); assert(IE_BITS_UREFS(bits) > 0); - assert(entry->ie_request == 0); + assert(entry->ie_request == IE_REQ_NONE); assert(entry->ie_object == 0); if (!deadok) @@ -1827,7 +1870,7 @@ ipc_right_copyin( move_dead: assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_DEAD_NAME); assert(IE_BITS_UREFS(bits) > 0); - assert(entry->ie_request == 0); + assert(entry->ie_request == IE_REQ_NONE); assert(entry->ie_object == 0); if (!deadok) @@ -1950,7 +1993,7 @@ ipc_right_copyin_two( ipc_entry_bits_t bits; mach_port_urefs_t urefs; ipc_port_t port; - ipc_port_t dnrequest = IP_NULL; + ipc_port_t request = IP_NULL; #if CONFIG_MACF_MACH task_t self = current_task(); int rc; @@ -2000,7 +2043,7 @@ ipc_right_copyin_two( } else { assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_SEND); - dnrequest = ipc_right_dncancel_macro(space, port, + request = ipc_right_request_cancel_macro(space, port, name, entry); port->ip_srights++; @@ -2019,7 +2062,7 @@ ipc_right_copyin_two( ip_unlock(port); *objectp = (ipc_object_t) port; - *sorightp = dnrequest; + *sorightp = request; return KERN_SUCCESS; invalid_right: @@ -2257,7 +2300,7 @@ ipc_right_rename( * Note IE_BITS_COMPAT implies ie_request != 0. 
*/ - if (request != 0) { + if (request != IE_REQ_NONE) { ipc_port_t port; assert(bits & MACH_PORT_TYPE_PORT_RIGHTS); @@ -2265,17 +2308,17 @@ ipc_right_rename( assert(port != IP_NULL); if (ipc_right_check(space, port, oname, oentry)) { - request = 0; + request = IE_REQ_NONE; object = IO_NULL; bits = oentry->ie_bits; assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_DEAD_NAME); - assert(oentry->ie_request == 0); + assert(oentry->ie_request == IE_REQ_NONE); } else { /* port is locked and active */ - ipc_port_dnrename(port, request, oname, nname); + ipc_port_request_rename(port, request, oname, nname); ip_unlock(port); - oentry->ie_request = 0; + oentry->ie_request = IE_REQ_NONE; } } @@ -2341,7 +2384,7 @@ ipc_right_rename( panic("ipc_right_rename: strange rights"); } - assert(oentry->ie_request == 0); + assert(oentry->ie_request == IE_REQ_NONE); oentry->ie_object = IO_NULL; ipc_entry_dealloc(space, oname, oentry); is_write_unlock(space); diff --git a/osfmk/ipc/ipc_right.h b/osfmk/ipc/ipc_right.h index b7affe7b3..8b12cd895 100644 --- a/osfmk/ipc/ipc_right.h +++ b/osfmk/ipc/ipc_right.h @@ -95,24 +95,25 @@ extern boolean_t ipc_right_reverse( mach_port_name_t *namep, ipc_entry_t *entryp); -/* Make a dead-name request, returning the registered send-once right */ -extern kern_return_t ipc_right_dnrequest( +/* Make a notification request, returning the previous send-once right */ +extern kern_return_t ipc_right_request_alloc( ipc_space_t space, mach_port_name_t name, boolean_t immediate, + boolean_t send_possible, ipc_port_t notify, ipc_port_t *previousp); -/* Cancel a dead-name request and return the send-once right */ -extern ipc_port_t ipc_right_dncancel( +/* Cancel a notification request and return the send-once right */ +extern ipc_port_t ipc_right_request_cancel( ipc_space_t space, ipc_port_t port, mach_port_name_t name, ipc_entry_t entry); -#define ipc_right_dncancel_macro(space, port, name, entry) \ - ((entry->ie_request == 0) ? 
IP_NULL : \ - ipc_right_dncancel((space), (port), (name), (entry))) +#define ipc_right_request_cancel_macro(space, port, name, entry) \ + ((entry->ie_request == IE_REQ_NONE) ? IP_NULL : \ + ipc_right_request_cancel((space), (port), (name), (entry))) /* Check if an entry is being used */ extern boolean_t ipc_right_inuse( diff --git a/osfmk/ipc/ipc_space.c b/osfmk/ipc/ipc_space.c index 434580250..1aaecc594 100644 --- a/osfmk/ipc/ipc_space.c +++ b/osfmk/ipc/ipc_space.c @@ -249,6 +249,11 @@ ipc_space_clean( while (space->is_growing) is_write_sleep(space); + if (!space->is_active) { + is_write_unlock(space); + return; + } + /* * Now we can futz with it since we have the write lock. */ diff --git a/osfmk/ipc/ipc_table.c b/osfmk/ipc/ipc_table.c index bfbac619e..4e19f8844 100644 --- a/osfmk/ipc/ipc_table.c +++ b/osfmk/ipc/ipc_table.c @@ -90,8 +90,8 @@ extern vm_map_t kalloc_map; ipc_table_size_t ipc_table_entries; unsigned int ipc_table_entries_size = 512; -ipc_table_size_t ipc_table_dnrequests; -unsigned int ipc_table_dnrequests_size = 64; +ipc_table_size_t ipc_table_requests; +unsigned int ipc_table_requests_size = 64; void ipc_table_fill( @@ -151,17 +151,17 @@ ipc_table_init(void) ipc_table_entries[ipc_table_entries_size - 2].its_size; - ipc_table_dnrequests = (ipc_table_size_t) + ipc_table_requests = (ipc_table_size_t) kalloc(sizeof(struct ipc_table_size) * - ipc_table_dnrequests_size); - assert(ipc_table_dnrequests != ITS_NULL); + ipc_table_requests_size); + assert(ipc_table_requests != ITS_NULL); - ipc_table_fill(ipc_table_dnrequests, ipc_table_dnrequests_size - 1, + ipc_table_fill(ipc_table_requests, ipc_table_requests_size - 1, 2, sizeof(struct ipc_port_request)); /* the last element should have zero size */ - ipc_table_dnrequests[ipc_table_dnrequests_size - 1].its_size = 0; + ipc_table_requests[ipc_table_requests_size - 1].its_size = 0; } /* diff --git a/osfmk/ipc/ipc_table.h b/osfmk/ipc/ipc_table.h index 1c5eb1bcf..fee56f778 100644 --- a/osfmk/ipc/ipc_table.h +++ 
b/osfmk/ipc/ipc_table.h @@ -106,7 +106,7 @@ struct ipc_table_size { }; extern ipc_table_size_t ipc_table_entries; -extern ipc_table_size_t ipc_table_dnrequests; +extern ipc_table_size_t ipc_table_requests; /* Initialize IPC capabilities table storage */ extern void ipc_table_init(void) __attribute__((section("__TEXT, initcode"))); @@ -161,12 +161,12 @@ extern void ipc_table_free( (void *)(table) \ ) -#define it_dnrequests_alloc(its) \ +#define it_requests_alloc(its) \ ((ipc_port_request_t) \ ipc_table_alloc((its)->its_size * \ sizeof(struct ipc_port_request))) -#define it_dnrequests_free(its, table) \ +#define it_requests_free(its, table) \ ipc_table_free((its)->its_size * \ sizeof(struct ipc_port_request), \ (void *)(table)) diff --git a/osfmk/ipc/ipc_types.h b/osfmk/ipc/ipc_types.h index a7ac3475e..5857e5ecf 100644 --- a/osfmk/ipc/ipc_types.h +++ b/osfmk/ipc/ipc_types.h @@ -70,6 +70,7 @@ typedef struct ipc_kmsg *ipc_kmsg_t; #define IKM_NULL ((ipc_kmsg_t) 0) typedef void (*mach_msg_continue_t)(mach_msg_return_t); /* after wakeup */ +#define MACH_MSG_CONTINUE_NULL ((mach_msg_continue_t) 0) #else /* MACH_KERNEL_PRIVATE */ diff --git a/osfmk/ipc/mach_debug.c b/osfmk/ipc/mach_debug.c index 04442f1fd..f255df6f5 100644 --- a/osfmk/ipc/mach_debug.c +++ b/osfmk/ipc/mach_debug.c @@ -326,8 +326,16 @@ mach_port_space_info( iin->iin_name = MACH_PORT_MAKE(index, IE_BITS_GEN(bits)); iin->iin_collision = (bits & IE_BITS_COLLISION) ? 
TRUE : FALSE; iin->iin_type = IE_BITS_TYPE(bits); - if (entry->ie_request) - iin->iin_type |= MACH_PORT_TYPE_DNREQUEST; + if ((entry->ie_bits & MACH_PORT_TYPE_PORT_RIGHTS) != MACH_PORT_TYPE_NONE && + entry->ie_request != IE_REQ_NONE) { + ipc_port_t port = (ipc_port_t) entry->ie_object; + + assert(IP_VALID(port)); + ip_lock(port); + iin->iin_type |= ipc_port_request_type(port, iin->iin_name, entry->ie_request); + ip_unlock(port); + } + iin->iin_urefs = IE_BITS_UREFS(bits); iin->iin_object = (natural_t)(uintptr_t)entry->ie_object; iin->iin_next = entry->ie_next; @@ -349,8 +357,16 @@ mach_port_space_info( iin->iin_name = tentry->ite_name; iin->iin_collision = (bits & IE_BITS_COLLISION) ? TRUE : FALSE; iin->iin_type = IE_BITS_TYPE(bits); - if (entry->ie_request) - iin->iin_type |= MACH_PORT_TYPE_DNREQUEST; + if ((entry->ie_bits & MACH_PORT_TYPE_PORT_RIGHTS) != MACH_PORT_TYPE_NONE && + entry->ie_request != IE_REQ_NONE) { + ipc_port_t port = (ipc_port_t) entry->ie_object; + + assert(IP_VALID(port)); + ip_lock(port); + iin->iin_type |= ipc_port_request_type(port, iin->iin_name, entry->ie_request); + ip_unlock(port); + } + iin->iin_urefs = IE_BITS_UREFS(bits); iin->iin_object = (natural_t)(uintptr_t)entry->ie_object; iin->iin_next = entry->ie_next; @@ -456,18 +472,18 @@ mach_port_dnrequest_info( return kr; /* port is locked and active */ - if (port->ip_dnrequests == IPR_NULL) { + if (port->ip_requests == IPR_NULL) { total = 0; used = 0; } else { - ipc_port_request_t dnrequests = port->ip_dnrequests; + ipc_port_request_t requests = port->ip_requests; ipc_port_request_index_t index; - total = dnrequests->ipr_size->its_size; + total = requests->ipr_size->its_size; for (index = 1, used = 0; index < total; index++) { - ipc_port_request_t ipr = &dnrequests[index]; + ipc_port_request_t ipr = &requests[index]; if (ipr->ipr_name != MACH_PORT_NULL) used++; diff --git a/osfmk/ipc/mach_msg.c b/osfmk/ipc/mach_msg.c index 8137915f0..b83ef8191 100644 --- a/osfmk/ipc/mach_msg.c +++ 
b/osfmk/ipc/mach_msg.c @@ -172,10 +172,6 @@ mach_msg_format_0_trailer_t trailer_template = { * MACH_SEND_INVALID_REPLY Can't copyin reply port. * MACH_SEND_TIMED_OUT Timeout expired without delivery. * MACH_SEND_INTERRUPTED Delivery interrupted. - * MACH_SEND_NO_NOTIFY Can't allocate a msg-accepted request. - * MACH_SEND_WILL_NOTIFY Msg-accepted notif. requested. - * MACH_SEND_NOTIFY_IN_PROGRESS - * This space has already forced a message to this port. */ mach_msg_return_t @@ -184,7 +180,7 @@ mach_msg_send( mach_msg_option_t option, mach_msg_size_t send_size, mach_msg_timeout_t send_timeout, - mach_port_name_t notify) + __unused mach_port_name_t notify) { ipc_space_t space = current_space(); vm_map_t map = current_map(); @@ -222,20 +218,13 @@ mach_msg_send( trailer->msgh_trailer_type = MACH_MSG_TRAILER_FORMAT_0; trailer->msgh_trailer_size = MACH_MSG_TRAILER_MINIMUM_SIZE; - if (option & MACH_SEND_CANCEL) { - if (notify == MACH_PORT_NULL) - mr = MACH_SEND_INVALID_NOTIFY; - else - mr = ipc_kmsg_copyin(kmsg, space, map, notify); - } else - mr = ipc_kmsg_copyin(kmsg, space, map, MACH_PORT_NULL); + mr = ipc_kmsg_copyin(kmsg, space, map, option & MACH_SEND_NOTIFY); if (mr != MACH_MSG_SUCCESS) { ipc_kmsg_free(kmsg); return mr; } mr = ipc_kmsg_send(kmsg, option & MACH_SEND_TIMEOUT, send_timeout); - if (mr != MACH_MSG_SUCCESS) { mr |= ipc_kmsg_copyout_pseudo(kmsg, space, map, MACH_MSG_BODY_NULL); (void) memcpy((void *) msg, (const void *) kmsg->ikm_header, @@ -247,7 +236,7 @@ mach_msg_send( } /* - * Routine: mach_msg_receive + * Routine: mach_msg_receive_results * Purpose: * Receive a message. 
* Conditions: @@ -381,11 +370,10 @@ mach_msg_receive_results(void) mach_msg_body_t *slist; slist = ipc_kmsg_get_scatter(msg_addr, slist_size, kmsg); - mr = ipc_kmsg_copyout(kmsg, space, map, MACH_PORT_NULL, slist); + mr = ipc_kmsg_copyout(kmsg, space, map, slist); ipc_kmsg_free_scatter(slist, slist_size); } else { - mr = ipc_kmsg_copyout(kmsg, space, map, - MACH_PORT_NULL, MACH_MSG_BODY_NULL); + mr = ipc_kmsg_copyout(kmsg, space, map, MACH_MSG_BODY_NULL); } if (mr != MACH_MSG_SUCCESS) { @@ -473,7 +461,7 @@ mach_msg_overwrite_trap( mach_msg_size_t rcv_size = args->rcv_size; mach_port_name_t rcv_name = args->rcv_name; mach_msg_timeout_t msg_timeout = args->timeout; - mach_port_name_t notify = args->notify; + __unused mach_port_name_t notify = args->notify; mach_vm_address_t rcv_msg_addr = args->rcv_msg; mach_msg_size_t scatter_list_size = 0; /* NOT INITIALIZED - but not used in pactice */ __unused mach_port_seqno_t temp_seqno = 0; @@ -490,13 +478,7 @@ mach_msg_overwrite_trap( if (mr != MACH_MSG_SUCCESS) return mr; - if (option & MACH_SEND_CANCEL) { - if (notify == MACH_PORT_NULL) - mr = MACH_SEND_INVALID_NOTIFY; - else - mr = ipc_kmsg_copyin(kmsg, space, map, notify); - } else - mr = ipc_kmsg_copyin(kmsg, space, map, MACH_PORT_NULL); + mr = ipc_kmsg_copyin(kmsg, space, map, option & MACH_SEND_NOTIFY); if (mr != MACH_MSG_SUCCESS) { ipc_kmsg_free(kmsg); return mr; diff --git a/osfmk/ipc/mach_port.c b/osfmk/ipc/mach_port.c index 389e80bb1..adfc70bcb 100644 --- a/osfmk/ipc/mach_port.c +++ b/osfmk/ipc/mach_port.c @@ -128,6 +128,9 @@ static mach_port_qos_t qos_template; * Routine: mach_port_names_helper * Purpose: * A helper function for mach_port_names. + * + * Conditions: + * Space containing entry is [at least] read-locked. 
*/ void @@ -141,44 +144,51 @@ mach_port_names_helper( { ipc_entry_bits_t bits; ipc_port_request_index_t request; - mach_port_type_t type; + mach_port_type_t type = 0; ipc_entry_num_t actual; + ipc_port_t port; bits = entry->ie_bits; request = entry->ie_request; - if (bits & MACH_PORT_TYPE_SEND_RIGHTS) { - ipc_port_t port; - boolean_t died; + port = (ipc_port_t) entry->ie_object; - port = (ipc_port_t) entry->ie_object; - assert(port != IP_NULL); + if (bits & MACH_PORT_TYPE_RECEIVE) { + assert(IP_VALID(port)); - /* - * The timestamp serializes mach_port_names - * with ipc_port_destroy. If the port died, - * but after mach_port_names started, pretend - * that it isn't dead. - */ + if (request != IE_REQ_NONE) { + ip_lock(port); + assert(ip_active(port)); + type |= ipc_port_request_type(port, name, request); + ip_unlock(port); + } - ip_lock(port); - died = (!ip_active(port) && - IP_TIMESTAMP_ORDER(port->ip_timestamp, timestamp)); - ip_unlock(port); + } else if (bits & MACH_PORT_TYPE_SEND_RIGHTS) { + mach_port_type_t reqtype; - if (died) { - /* pretend this is a dead-name entry */ + assert(IP_VALID(port)); + ip_lock(port); + reqtype = (request != IE_REQ_NONE) ? + ipc_port_request_type(port, name, request) : 0; + + /* + * If the port is alive, or was alive when the mach_port_names + * started, then return that fact. Otherwise, pretend we found + * a dead name entry. 
+ */ + if (ip_active(port) || IP_TIMESTAMP_ORDER(timestamp, port->ip_timestamp)) { + type |= reqtype; + } else { bits &= ~(IE_BITS_TYPE_MASK); bits |= MACH_PORT_TYPE_DEAD_NAME; - if (request != 0) + /* account for additional reference for dead-name notification */ + if (reqtype != 0) bits++; - request = 0; } + ip_unlock(port); } - type = IE_BITS_TYPE(bits); - if (request != 0) - type |= MACH_PORT_TYPE_DNREQUEST; + type |= IE_BITS_TYPE(bits); actual = *actualp; names[actual] = name; @@ -436,6 +446,11 @@ mach_port_type( kr = ipc_right_info(space, name, entry, typep, &urefs); if (kr == KERN_SUCCESS) is_write_unlock(space); +#if 1 + /* JMM - workaround rdar://problem/9121297 (CF being too picky on these bits). */ + *typep &= ~(MACH_PORT_TYPE_SPREQUEST | MACH_PORT_TYPE_SPREQUEST_DELAYED); +#endif + /* space is unlocked */ return kr; } @@ -1472,6 +1487,18 @@ mach_port_request_notification( break; } + case MACH_NOTIFY_SEND_POSSIBLE: + + if (!MACH_PORT_VALID(name)) { + return KERN_INVALID_ARGUMENT; + } + + kr = ipc_right_request_alloc(space, name, sync != 0, + TRUE, notify, previousp); + if (kr != KERN_SUCCESS) + return kr; + break; + case MACH_NOTIFY_DEAD_NAME: if (!MACH_PORT_VALID(name)) { @@ -1483,8 +1510,8 @@ mach_port_request_notification( return KERN_INVALID_ARGUMENT; } - kr = ipc_right_dnrequest(space, name, sync != 0, - notify, previousp); + kr = ipc_right_request_alloc(space, name, sync != 0, + FALSE, notify, previousp); if (kr != KERN_SUCCESS) return kr; break; @@ -1677,7 +1704,7 @@ mach_port_get_attributes( return kr; /* port is locked and active */ - table = port->ip_dnrequests; + table = port->ip_requests; if (table == IPR_NULL) *(int *)info = 0; else @@ -1744,7 +1771,7 @@ mach_port_set_attributes( return kr; /* port is locked and active */ - kr = ipc_port_dngrow(port, *(int *)info); + kr = ipc_port_request_grow(port, *(int *)info); if (kr != KERN_SUCCESS) return kr; break; @@ -1870,6 +1897,12 @@ task_set_port_space( kern_return_t kr; is_write_lock(space); + + 
if (!space->is_active) { + is_write_unlock(space); + return KERN_INVALID_TASK; + } + kr = ipc_entry_grow_table(space, table_entries); if (kr == KERN_SUCCESS) is_write_unlock(space); diff --git a/osfmk/kdp/kdp.c b/osfmk/kdp/kdp.c index 5536038e7..7eb3459ac 100644 --- a/osfmk/kdp/kdp.c +++ b/osfmk/kdp/kdp.c @@ -37,11 +37,13 @@ #include #include +#include #include /* bcopy */ #include #include +#include #include #include @@ -151,8 +153,6 @@ kdp_stackshot(int pid, void *tracebuf, uint32_t tracebuf_size, uint32_t trace_fl boolean_t kdp_copyin(pmap_t, uint64_t, void *, size_t); extern void bcopy_phys(addr64_t, addr64_t, vm_size_t); -extern char version[]; - boolean_t kdp_packet( unsigned char *pkt, @@ -247,10 +247,10 @@ kdp_connect( rp->error = KDPERR_ALREADY_CONNECTED; } else { - kdp.reply_port = rport; - kdp.exception_port = eport; - kdp.is_conn = TRUE; - kdp.conn_seq = seq; + kdp.reply_port = rport; + kdp.exception_port = eport; + kdp.is_conn = TRUE; + kdp.conn_seq = seq; kdp.session_key = key; rp->error = KDPERR_NO_ERROR; @@ -375,7 +375,7 @@ kdp_kernelversion( rp->hdr.len = sizeof (*rp); dprintf(("kdp_kernelversion\n")); - slen = strlcpy(rp->version, version, MAX_KDP_DATA_SIZE); + slen = strlcpy(rp->version, kdp_kernelversion_string, MAX_KDP_DATA_SIZE); rp->hdr.len += slen + 1; /* strlcpy returns the amount copied with NUL */ @@ -547,8 +547,8 @@ kdp_readmem( size_t plen = *len; kdp_readmem_reply_t *rp = &pkt->readmem_reply; mach_vm_size_t cnt; -#if __i386__ || __arm__ - void *pversion = &version; +#if __i386__ + void *pversion = &kdp_kernelversion_string; #endif if (plen < sizeof (*rq)) @@ -563,9 +563,9 @@ kdp_readmem( unsigned int n = rq->nbytes; dprintf(("kdp_readmem addr %x size %d\n", rq->address, n)); -#if __i386__ || __arm__ +#if __i386__ /* XXX This is a hack to facilitate the "showversion" macro - * on i386/ARM, which is used to obtain the kernel version without + * on i386, which is used to obtain the kernel version without * symbols - a pointer to the 
version string should eventually * be pinned at a fixed address when an equivalent of the * VECTORS segment (loaded at a fixed load address, and contains @@ -1066,6 +1066,42 @@ kdp_copyin(pmap_t p, uint64_t uaddr, void *dest, size_t size) { return (rem == 0); } + +static void +kdp_mem_snapshot(struct mem_snapshot *mem_snap) +{ + mem_snap->snapshot_magic = STACKSHOT_MEM_SNAPSHOT_MAGIC; + mem_snap->free_pages = vm_page_free_count; + mem_snap->active_pages = vm_page_active_count; + mem_snap->inactive_pages = vm_page_inactive_count; + mem_snap->purgeable_pages = vm_page_purgeable_count; + mem_snap->wired_pages = vm_page_wire_count; + mem_snap->speculative_pages = vm_page_speculative_count; + mem_snap->throttled_pages = vm_page_throttled_count; +} + + +/* + * Method for grabbing timer values safely, in the sense that no infinite loop will occur + * Certain flavors of the timer_grab function, which would seem to be the thing to use, + * can loop infinitely if called while the timer is in the process of being updated. + * Unfortunately, it is (rarely) possible to get inconsistent top and bottom halves of + * the timer using this method. This seems insoluble, since stackshot runs in a context + * where the timer might be half-updated, and has no way of yielding control just long + * enough to finish the update. 
+ */ + +static uint64_t safe_grab_timer_value(struct timer *t) +{ +#if defined(__LP64__) + return t->all_bits; +#else + uint64_t time = t->high_bits; /* endian independent grab */ + time = (time << 32) | t->low_bits; + return time; +#endif +} + int kdp_stackshot(int pid, void *tracebuf, uint32_t tracebuf_size, uint32_t trace_flags, uint32_t dispatch_offset, uint32_t *pbytesTraced) { @@ -1080,41 +1116,74 @@ kdp_stackshot(int pid, void *tracebuf, uint32_t tracebuf_size, uint32_t trace_fl unsigned framesize = 2 * sizeof(vm_offset_t); struct task ctask; struct thread cthread; + struct _vm_map cmap; + struct pmap cpmap; + + queue_head_t *task_list = &tasks; + boolean_t is_active_list = TRUE; boolean_t dispatch_p = ((trace_flags & STACKSHOT_GET_DQ) != 0); boolean_t save_loadinfo_p = ((trace_flags & STACKSHOT_SAVE_LOADINFO) != 0); - queue_iterate(&tasks, task, task_t, tasks) { + if(trace_flags & STACKSHOT_GET_GLOBAL_MEM_STATS) { + if(tracepos + sizeof(struct mem_snapshot) > tracebound) { + error = -1; + goto error_exit; + } + kdp_mem_snapshot((struct mem_snapshot *)tracepos); + tracepos += sizeof(struct mem_snapshot); + } + +walk_list: + queue_iterate(task_list, task, task_t, tasks) { if ((task == NULL) || (ml_nofault_copy((vm_offset_t) task, (vm_offset_t) &ctask, sizeof(struct task)) != sizeof(struct task))) goto error_exit; int task_pid = pid_from_task(task); boolean_t task64 = task_has_64BitAddr(task); + if (!task->active) { + /* + * Not interested in terminated tasks without threads, and + * at the moment, stackshot can't handle a task without a name. 
+ */ + if (queue_empty(&task->threads) || task_pid == -1) { + continue; + } + } + /* Trace everything, unless a process was specified */ if ((pid == -1) || (pid == task_pid)) { task_snapshot_t task_snap; - uint32_t uuid_info_count; - mach_vm_address_t uuid_info_addr; - - if (save_loadinfo_p && task_pid > 0) { + uint32_t uuid_info_count = 0; + mach_vm_address_t uuid_info_addr = 0; + boolean_t have_map = (task->map != NULL) && + (ml_nofault_copy((vm_offset_t)(task->map), (vm_offset_t)&cmap, sizeof(struct _vm_map)) == sizeof(struct _vm_map)); + boolean_t have_pmap = have_map && (cmap.pmap != NULL) && + (ml_nofault_copy((vm_offset_t)(cmap.pmap), (vm_offset_t)&cpmap, sizeof(struct pmap)) == sizeof(struct pmap)); + + if (have_pmap && task->active && save_loadinfo_p && task_pid > 0) { // Read the dyld_all_image_infos struct from the task memory to get UUID array count and location if (task64) { struct dyld_all_image_infos64 task_image_infos; - if (!kdp_copyin(task->map->pmap, task->all_image_info_addr, &task_image_infos, sizeof(struct dyld_all_image_infos64))) - goto error_exit; - uuid_info_count = (uint32_t)task_image_infos.uuidArrayCount; - uuid_info_addr = task_image_infos.uuidArray; + if (kdp_copyin(task->map->pmap, task->all_image_info_addr, &task_image_infos, sizeof(struct dyld_all_image_infos64))) { + uuid_info_count = (uint32_t)task_image_infos.uuidArrayCount; + uuid_info_addr = task_image_infos.uuidArray; + } } else { struct dyld_all_image_infos task_image_infos; - if (!kdp_copyin(task->map->pmap, task->all_image_info_addr, &task_image_infos, sizeof(struct dyld_all_image_infos))) - goto error_exit; - uuid_info_count = task_image_infos.uuidArrayCount; - uuid_info_addr = task_image_infos.uuidArray; + if (kdp_copyin(task->map->pmap, task->all_image_info_addr, &task_image_infos, sizeof(struct dyld_all_image_infos))) { + uuid_info_count = task_image_infos.uuidArrayCount; + uuid_info_addr = task_image_infos.uuidArray; + } + } + + // If we get a NULL uuid_info_addr 
(which can happen when we catch dyld in the middle of updating + // this data structure), we zero the uuid_info_count so that we won't even try to save load info + // for this task. + if (!uuid_info_addr) { + uuid_info_count = 0; } - } else { - uuid_info_count = 0; - uuid_info_addr = 0; } if (tracepos + sizeof(struct task_snapshot) > tracebound) { @@ -1134,7 +1203,17 @@ kdp_stackshot(int pid, void *tracebuf, uint32_t tracebuf_size, uint32_t trace_fl task_snap->ss_flags = 0; if (task64) task_snap->ss_flags |= kUser64_p; + if (!task->active) + task_snap->ss_flags |= kTerminatedSnapshot; + + task_snap->suspend_count = task->suspend_count; + task_snap->task_size = have_pmap ? pmap_resident_count(task->map->pmap) : 0; + task_snap->faults = task->faults; + task_snap->pageins = task->pageins; + task_snap->cow_faults = task->cow_faults; + task_snap->user_time_in_terminated_threads = task->total_user_time; + task_snap->system_time_in_terminated_threads = task->total_system_time; tracepos += sizeof(struct task_snapshot); if (task_pid > 0 && uuid_info_count > 0) { @@ -1147,10 +1226,11 @@ kdp_stackshot(int pid, void *tracebuf, uint32_t tracebuf_size, uint32_t trace_fl } // Copy in the UUID info array - if (!kdp_copyin(task->map->pmap, uuid_info_addr, tracepos, uuid_info_array_size)) - goto error_exit; - - tracepos += uuid_info_array_size; + // It may be nonresident, in which case just fix up nloadinfos to 0 in the task_snap + if (have_pmap && !kdp_copyin(task->map->pmap, uuid_info_addr, tracepos, uuid_info_array_size)) + task_snap->nloadinfos = 0; + else + tracepos += uuid_info_array_size; } queue_iterate(&task->threads, thread, thread_t, task_threads){ @@ -1167,12 +1247,13 @@ kdp_stackshot(int pid, void *tracebuf, uint32_t tracebuf_size, uint32_t trace_fl tsnap->state = thread->state; tsnap->wait_event = thread->wait_event; tsnap->continuation = (uint64_t) (uintptr_t) thread->continuation; - + tsnap->user_time = safe_grab_timer_value(&thread->user_timer); + tsnap->system_time 
= safe_grab_timer_value(&thread->system_timer); tsnap->snapshot_magic = STACKSHOT_THREAD_SNAPSHOT_MAGIC; tracepos += sizeof(struct thread_snapshot); tsnap->ss_flags = 0; - if (dispatch_p && (task != kernel_task) && (task->active) && (task->map)) { + if (dispatch_p && (task != kernel_task) && (task->active) && have_pmap) { uint64_t dqkeyaddr = thread_dispatchqaddr(thread); if (dqkeyaddr != 0) { uint64_t dqaddr = 0; @@ -1190,6 +1271,7 @@ kdp_stackshot(int pid, void *tracebuf, uint32_t tracebuf_size, uint32_t trace_fl /* Call through to the machine specific trace routines * Frames are added past the snapshot header. */ + tracebytes = 0; if (thread->kernel_stack != 0) { #if defined(__LP64__) tracebytes = machine_trace_thread64(thread, tracepos, tracebound, MAX_FRAMES, FALSE); @@ -1204,7 +1286,7 @@ kdp_stackshot(int pid, void *tracebuf, uint32_t tracebuf_size, uint32_t trace_fl tracepos += tracebytes; tracebytes = 0; /* Trace user stack, if any */ - if (thread->task->map != kernel_map) { + if (task->active && thread->task->map != kernel_map) { /* 64-bit task? 
*/ if (task_has_64BitAddr(thread->task)) { tracebytes = machine_trace_thread64(thread, tracepos, tracebound, MAX_FRAMES, TRUE); @@ -1223,6 +1305,12 @@ kdp_stackshot(int pid, void *tracebuf, uint32_t tracebuf_size, uint32_t trace_fl } } + if (is_active_list) { + is_active_list = FALSE; + task_list = &terminated_tasks; + goto walk_list; + } + error_exit: /* Release stack snapshot wait indicator */ kdp_snapshot_postflight(); diff --git a/osfmk/kdp/kdp_core.h b/osfmk/kdp/kdp_core.h index 99a278720..d99f2bbcd 100644 --- a/osfmk/kdp/kdp_core.h +++ b/osfmk/kdp/kdp_core.h @@ -44,12 +44,11 @@ #define KDP_ERROR 5 /* error code */ #define KDP_SEEK 6 /* Seek to specified offset */ #define KDP_EOF 7 /* signal end of file */ - -#if defined(__LP64__) #define KDP_FEATURE_MASK_STRING "features" -enum {KDP_FEATURE_LARGE_CRASHDUMPS = 1}; -extern uint32_t kdp_crashdump_feature_mask; -#endif + +enum {KDP_FEATURE_LARGE_CRASHDUMPS = 1, KDP_FEATURE_LARGE_PKT_SIZE = 2}; +extern uint32_t kdp_feature_large_crashdumps, kdp_feature_large_pkt_size; + struct corehdr { short th_opcode; /* packet type */ union { @@ -57,7 +56,7 @@ struct corehdr { unsigned int tu_code; /* error code */ char tu_rpl[1]; /* request packet payload */ } th_u; - char th_data[1]; /* data or error string */ + char th_data[0]; /* data or error string */ }__attribute__((packed)); #define th_block th_u.tu_block @@ -93,4 +92,6 @@ int kdp_send_crashdump_pkt(unsigned int request, char *corename, uint64_t length, void *panic_data); int kdp_send_crashdump_data(unsigned int request, char *corename, - uint64_t length, caddr_t txstart); + int64_t length, caddr_t txstart); + +#define KDP_CRASHDUMP_POLL_COUNT (2500) diff --git a/osfmk/kdp/kdp_dyld.h b/osfmk/kdp/kdp_dyld.h index ef228574e..910565f2e 100644 --- a/osfmk/kdp/kdp_dyld.h +++ b/osfmk/kdp/kdp_dyld.h @@ -81,4 +81,4 @@ struct dyld_all_image_infos64 { user64_addr_t systemOrderFlag; user64_size_t uuidArrayCount; // dyld defines this as a uintptr_t despite it being a count 
user64_addr_t uuidArray; -}; +}; \ No newline at end of file diff --git a/osfmk/kdp/kdp_en_debugger.h b/osfmk/kdp/kdp_en_debugger.h index dd63d30e2..c8a99822f 100644 --- a/osfmk/kdp/kdp_en_debugger.h +++ b/osfmk/kdp/kdp_en_debugger.h @@ -33,6 +33,7 @@ typedef void (*kdp_send_t)(void * pkt, unsigned int pkt_len); typedef void (*kdp_receive_t)(void * pkt, unsigned int * pkt_len, unsigned int timeout); + void kdp_register_send_receive(kdp_send_t send, kdp_receive_t receive); diff --git a/osfmk/kdp/kdp_private.h b/osfmk/kdp/kdp_private.h index 07e5123ff..bcd2f3399 100644 --- a/osfmk/kdp/kdp_private.h +++ b/osfmk/kdp/kdp_private.h @@ -29,6 +29,7 @@ /* * Private functions for kdp.c */ +extern char kdp_kernelversion_string[]; static boolean_t kdp_unknown( diff --git a/osfmk/kdp/kdp_udp.c b/osfmk/kdp/kdp_udp.c index 3b298fe6e..22bf8978a 100644 --- a/osfmk/kdp/kdp_udp.c +++ b/osfmk/kdp/kdp_udp.c @@ -57,19 +57,33 @@ #include /* kernel_map */ #include +#include #include +/* we just want the link status flags, so undef KERNEL_PRIVATE for this + * header file. 
*/ +#undef KERNEL_PRIVATE +#include +#define KERNEL_PRIVATE + #include -#define DO_ALIGN 1 /* align all packet data accesses */ +#include +#include + +#define DO_ALIGN 1 /* align all packet data accesses */ +#define KDP_SERIAL_IPADDR 0xABADBABE /* IP address used for serial KDP */ +#define LINK_UP_STATUS (IFM_AVALID | IFM_ACTIVE) extern int kdp_getc(void); extern int reattach_wait; -extern int serial_getc(void); -extern void serial_putc(char); -extern int serial_init(void); +/* only used by IONetworkingFamily */ +typedef uint32_t (*kdp_link_t)(void); +typedef boolean_t (*kdp_mode_t)(boolean_t); +void kdp_register_link(kdp_link_t link, kdp_mode_t mode); +void kdp_unregister_link(kdp_link_t link, kdp_mode_t mode); static u_short ip_id; /* ip packet ctr, for ids */ @@ -115,9 +129,17 @@ static const char volatile int kdp_flag = 0; -static kdp_send_t kdp_en_send_pkt; +static kdp_send_t kdp_en_send_pkt; static kdp_receive_t kdp_en_recv_pkt; +static kdp_link_t kdp_en_linkstatus; +static kdp_mode_t kdp_en_setmode; +#if CONFIG_SERIAL_KDP +static void kdp_serial_send(void *rpkt, unsigned int rpkt_len); +#define KDP_SERIAL_ENABLED() (kdp_en_send_pkt == kdp_serial_send) +#else +#define KDP_SERIAL_ENABLED() (0) +#endif static uint32_t kdp_current_ip_address = 0; static struct ether_addr kdp_current_mac_address = {{0, 0, 0, 0, 0, 0}}; @@ -130,6 +152,8 @@ static uint32_t parsed_router_ip = 0; static uint32_t router_ip = 0; static uint32_t target_ip = 0; +static boolean_t save_ip_in_nvram = FALSE; + static volatile boolean_t panicd_specified = FALSE; static boolean_t router_specified = FALSE; static boolean_t corename_specified = FALSE; @@ -151,8 +175,10 @@ static boolean_t flag_arp_resolved = FALSE; static unsigned int panic_timeout = 100000; static unsigned int last_panic_port = CORE_REMOTE_PORT; -unsigned int SEGSIZE = 512; +#define KDP_THROTTLE_VALUE (10ULL * NSEC_PER_SEC) +uint32_t kdp_crashdump_pkt_size = 512; +#define KDP_LARGE_CRASHDUMP_PKT_SIZE (1440 - 6 - sizeof(struct 
udpiphdr)) static char panicd_ip_str[20]; static char router_ip_str[20]; static char corename_str[50]; @@ -169,10 +195,13 @@ extern void kdp_call(void); extern boolean_t kdp_call_kdb(void); extern int kern_dump(void); +extern int inet_aton(const char *cp, struct in_addr *pin); +extern int inet_ntoa2(struct in_addr * pin, char * cp, const int len); + void * kdp_get_interface(void); -void kdp_set_gateway_mac(void *); -void kdp_set_ip_and_mac_addresses(struct in_addr *, struct ether_addr *); -void kdp_set_interface(void *); +void kdp_set_gateway_mac(void *gatewaymac); +void kdp_set_ip_and_mac_addresses(struct in_addr *ipaddr, struct ether_addr *); +void kdp_set_interface(void *interface, const struct ether_addr *macaddr); void kdp_disable_arp(void); static void kdp_arp_reply(struct ether_arp *); @@ -180,10 +209,11 @@ static void kdp_process_arp_reply(struct ether_arp *); static boolean_t kdp_arp_resolve(uint32_t, struct ether_addr *); static volatile unsigned kdp_reentry_deadline; -#if defined(__LP64__) -uint32_t kdp_crashdump_feature_mask = KDP_FEATURE_LARGE_CRASHDUMPS; -static uint32_t kdp_feature_large_crashdumps; -#endif + +static uint32_t kdp_crashdump_feature_mask = KDP_FEATURE_LARGE_CRASHDUMPS | KDP_FEATURE_LARGE_PKT_SIZE; +uint32_t kdp_feature_large_crashdumps, kdp_feature_large_pkt_size; + +char kdp_kernelversion_string[256]; static boolean_t gKDPDebug = FALSE; #define KDP_DEBUG(...) 
if (gKDPDebug) printf(__VA_ARGS__); @@ -200,6 +230,13 @@ static uint32_t stack_snapshot_dispatch_offset; static unsigned int old_debugger; +#define SBLOCKSZ (2048) +uint64_t kdp_dump_start_time = 0; +uint64_t kdp_min_superblock_dump_time = ~1ULL; +uint64_t kdp_max_superblock_dump_time = 0; +uint64_t kdp_superblock_dump_time = 0; +uint64_t kdp_superblock_dump_start_time = 0; + void kdp_snapshot_preflight(int pid, void * tracebuf, uint32_t tracebuf_size, uint32_t flags, uint32_t dispatch_offset); @@ -231,6 +268,53 @@ kdp_timer_callout_init(void) { } +/* only send/receive data if the link is up */ +inline static void wait_for_link(void) +{ + static int first = 0; + + if (!kdp_en_linkstatus) + return; + + while (((*kdp_en_linkstatus)() & LINK_UP_STATUS) != LINK_UP_STATUS) { + if (first) + continue; + + first = 1; + printf("Waiting for link to become available.\n"); + kprintf("Waiting for link to become available.\n"); + } +} + + +inline static void kdp_send_data(void *packet, unsigned int len) +{ + wait_for_link(); + (*kdp_en_send_pkt)(packet, len); +} + + +inline static void kdp_receive_data(void *packet, unsigned int *len, + unsigned int timeout) +{ + wait_for_link(); + (*kdp_en_recv_pkt)(packet, len, timeout); +} + + + +void kdp_register_link(kdp_link_t link, kdp_mode_t mode) +{ + kdp_en_linkstatus = link; + kdp_en_setmode = mode; +} + +void kdp_unregister_link(__unused kdp_link_t link, __unused kdp_mode_t mode) +{ + kdp_en_linkstatus = NULL; + kdp_en_setmode = NULL; +} + void kdp_register_send_receive( kdp_send_t send, @@ -243,15 +327,14 @@ kdp_register_send_receive( kdp_timer_callout_init(); PE_parse_boot_argn("debug", &debug, sizeof (debug)); -#if defined(__LP64__) kdp_crashdump_feature_mask = htonl(kdp_crashdump_feature_mask); -#endif + if (!debug) return; - kdp_en_send_pkt = send; - kdp_en_recv_pkt = receive; + kdp_en_send_pkt = send; + kdp_en_recv_pkt = receive; if (debug & DB_KDP_BP_DIS) kdp_flag |= KDP_BP_DIS; @@ -303,8 +386,8 @@ kdp_unregister_send_receive( 
if (current_debugger == KDP_CUR_DB) current_debugger = NO_CUR_DB; kdp_flag &= ~KDP_READY; - kdp_en_send_pkt = NULL; - kdp_en_recv_pkt = NULL; + kdp_en_send_pkt = NULL; + kdp_en_recv_pkt = NULL; } /* Cache stack snapshot parameters in preparation for a trace */ @@ -449,10 +532,11 @@ kdp_reply( pkt.len += (unsigned int)sizeof (struct ether_header); // save reply for possible retransmission + assert(pkt.len <= KDP_MAXPACKET); if (!sideband) - bcopy((char *)&pkt, (char *)&saved_reply, sizeof(pkt)); + bcopy((char *)&pkt, (char *)&saved_reply, sizeof(saved_reply)); - (*kdp_en_send_pkt)(&pkt.data[pkt.off], pkt.len); + kdp_send_data(&pkt.data[pkt.off], pkt.len); // increment expected sequence number if (!sideband) @@ -515,15 +599,66 @@ kdp_send( eh->ether_type = htons(ETHERTYPE_IP); pkt.len += (unsigned int)sizeof (struct ether_header); - (*kdp_en_send_pkt)(&pkt.data[pkt.off], pkt.len); + kdp_send_data(&pkt.data[pkt.off], pkt.len); } -/* We don't interpret this pointer, we just give it to the -bsd stack so it can decide when to set the MAC and IP info. */ + +inline static void debugger_if_necessary(void) +{ + if ((current_debugger == KDP_CUR_DB) && halt_in_debugger) { + kdp_call(); + halt_in_debugger=0; + } +} + + +/* We don't interpret this pointer, we just give it to the bsd stack + so it can decide when to set the MAC and IP info. We'll + early initialize the MAC/IP info if we can so that we can use + KDP early in boot. These values may subsequently get over-written + when the interface gets initialized for real. 
+*/ void -kdp_set_interface(void *ifp) +kdp_set_interface(void *ifp, const struct ether_addr *macaddr) { + char kdpstr[80]; + struct in_addr addr = { 0 }; + unsigned int len; + kdp_current_ifp = ifp; + + if (PE_parse_boot_argn("kdp_ip_addr", kdpstr, sizeof(kdpstr))) { + /* look for a static ip address */ + if (inet_aton(kdpstr, &addr) == FALSE) + goto done; + + goto config_network; + } + + /* use saved ip address */ + save_ip_in_nvram = TRUE; + + len = sizeof(kdpstr); + if (PEReadNVRAMProperty("_kdp_ipstr", kdpstr, &len) == FALSE) + goto done; + + kdpstr[len < sizeof(kdpstr) ? len : sizeof(kdpstr) - 1] = '\0'; + if (inet_aton(kdpstr, &addr) == FALSE) + goto done; + +config_network: + kdp_current_ip_address = addr.s_addr; + if (macaddr) + kdp_current_mac_address = *macaddr; + + /* we can't drop into the debugger at this point because the + link will likely not be up. when getDebuggerLinkStatus() support gets + added to the appropriate network drivers, adding the + following will enable this capability: + debugger_if_necessary(); + */ +done: + return; } void * @@ -537,19 +672,48 @@ kdp_set_ip_and_mac_addresses( struct in_addr *ipaddr, struct ether_addr *macaddr) { - kdp_current_ip_address = ipaddr->s_addr; - kdp_current_mac_address = *macaddr; - if ((current_debugger == KDP_CUR_DB) && halt_in_debugger) { - kdp_call(); - halt_in_debugger=0; - } + static uint64_t last_time = (uint64_t) -1; + static uint64_t throttle_val = 0; + uint64_t cur_time; + char addr[16]; + + if (kdp_current_ip_address == ipaddr->s_addr) + goto done; + + /* don't replace if serial debugging is configured */ + if (!KDP_SERIAL_ENABLED() || + (kdp_current_ip_address != KDP_SERIAL_IPADDR)) { + kdp_current_mac_address = *macaddr; + kdp_current_ip_address = ipaddr->s_addr; + } + + if (save_ip_in_nvram == FALSE) + goto done; + + if (inet_ntoa2(ipaddr, addr, sizeof(addr)) == FALSE) + goto done; + + /* throttle writes if needed */ + if (!throttle_val) + nanoseconds_to_absolutetime(KDP_THROTTLE_VALUE, 
&throttle_val); + + cur_time = mach_absolute_time(); + if (last_time == (uint64_t) -1 || + ((cur_time - last_time) > throttle_val)) { + PEWriteNVRAMProperty("_kdp_ipstr", addr, + (const unsigned int) strlen(addr)); + } + last_time = cur_time; + +done: + debugger_if_necessary(); } void kdp_set_gateway_mac(void *gatewaymac) { - router_mac = *(struct ether_addr *)gatewaymac; - flag_router_mac_initialized = TRUE; + router_mac = *(struct ether_addr *)gatewaymac; + flag_router_mac_initialized = TRUE; } struct ether_addr @@ -657,7 +821,7 @@ kdp_arp_reply(struct ether_arp *ea) (void)memcpy(&pkt.data[pkt.off], ea, sizeof(*ea)); pkt.off -= (unsigned int)sizeof (struct ether_header); /* pkt.len is still the length we want, ether_header+ether_arp */ - (*kdp_en_send_pkt)(&pkt.data[pkt.off], pkt.len); + kdp_send_data(&pkt.data[pkt.off], pkt.len); } } @@ -681,7 +845,7 @@ kdp_poll(void) } pkt.off = pkt.len = 0; - (*kdp_en_recv_pkt)(pkt.data, &pkt.len, 3/* ms */); + kdp_receive_data(pkt.data, &pkt.len, 3/* ms */); if (pkt.len == 0) return; @@ -795,7 +959,7 @@ transmit_ARP_request(uint32_t ip_addr) pkt.off = 0; pkt.len = sizeof(struct ether_header) + sizeof(struct ether_arp); /* Transmit */ - (*kdp_en_send_pkt)(&pkt.data[pkt.off], pkt.len); + kdp_send_data(&pkt.data[pkt.off], pkt.len); } static boolean_t @@ -878,8 +1042,8 @@ kdp_handler( // check for retransmitted request if (hdr->seq == (exception_seq - 1)) { /* retransmit last reply */ - (*kdp_en_send_pkt)(&saved_reply.data[saved_reply.off], - saved_reply.len); + kdp_send_data(&saved_reply.data[saved_reply.off], + saved_reply.len); goto again; } else if ((hdr->seq != exception_seq) && (hdr->request != KDP_CONNECT)) { @@ -946,33 +1110,38 @@ kdp_connection_wait(void) * the panic.log */ - printf( "ethernet MAC address: %02x:%02x:%02x:%02x:%02x:%02x\n", - kdp_mac_addr.ether_addr_octet[0] & 0xff, - kdp_mac_addr.ether_addr_octet[1] & 0xff, - kdp_mac_addr.ether_addr_octet[2] & 0xff, - kdp_mac_addr.ether_addr_octet[3] & 0xff, - 
kdp_mac_addr.ether_addr_octet[4] & 0xff, - kdp_mac_addr.ether_addr_octet[5] & 0xff); + if (KDP_SERIAL_ENABLED()) { + printf("Using serial KDP.\n"); + kprintf("Using serial KDP.\n"); + } else { + printf( "ethernet MAC address: %02x:%02x:%02x:%02x:%02x:%02x\n", + kdp_mac_addr.ether_addr_octet[0] & 0xff, + kdp_mac_addr.ether_addr_octet[1] & 0xff, + kdp_mac_addr.ether_addr_octet[2] & 0xff, + kdp_mac_addr.ether_addr_octet[3] & 0xff, + kdp_mac_addr.ether_addr_octet[4] & 0xff, + kdp_mac_addr.ether_addr_octet[5] & 0xff); - kprintf( "ethernet MAC address: %02x:%02x:%02x:%02x:%02x:%02x\n", - kdp_mac_addr.ether_addr_octet[0] & 0xff, - kdp_mac_addr.ether_addr_octet[1] & 0xff, - kdp_mac_addr.ether_addr_octet[2] & 0xff, - kdp_mac_addr.ether_addr_octet[3] & 0xff, - kdp_mac_addr.ether_addr_octet[4] & 0xff, - kdp_mac_addr.ether_addr_octet[5] & 0xff); + kprintf( "ethernet MAC address: %02x:%02x:%02x:%02x:%02x:%02x\n", + kdp_mac_addr.ether_addr_octet[0] & 0xff, + kdp_mac_addr.ether_addr_octet[1] & 0xff, + kdp_mac_addr.ether_addr_octet[2] & 0xff, + kdp_mac_addr.ether_addr_octet[3] & 0xff, + kdp_mac_addr.ether_addr_octet[4] & 0xff, + kdp_mac_addr.ether_addr_octet[5] & 0xff); - printf( "ip address: %d.%d.%d.%d\n", - (ip_addr & 0xff000000) >> 24, - (ip_addr & 0xff0000) >> 16, - (ip_addr & 0xff00) >> 8, - (ip_addr & 0xff)); + printf( "ip address: %d.%d.%d.%d\n", + (ip_addr & 0xff000000) >> 24, + (ip_addr & 0xff0000) >> 16, + (ip_addr & 0xff00) >> 8, + (ip_addr & 0xff)); - kprintf( "ip address: %d.%d.%d.%d\n", - (ip_addr & 0xff000000) >> 24, - (ip_addr & 0xff0000) >> 16, - (ip_addr & 0xff00) >> 8, - (ip_addr & 0xff)); + kprintf( "ip address: %d.%d.%d.%d\n", + (ip_addr & 0xff000000) >> 24, + (ip_addr & 0xff0000) >> 16, + (ip_addr & 0xff00) >> 8, + (ip_addr & 0xff)); + } printf("\nWaiting for remote debugger connection.\n"); @@ -1145,10 +1314,12 @@ kdp_raise_exception( kdp.kdp_cpu = cpu_number(); kdp.kdp_thread = current_thread(); + if (kdp_en_setmode) + (*kdp_en_setmode)(TRUE); /* enabling 
link mode */ + if (pkt.input) kdp_panic("kdp_raise_exception"); - if (((kdp_flag & KDP_PANIC_DUMP_ENABLED) || (kdp_flag & PANIC_LOG_DUMP)) && (panicstr != (char *) 0)) { kdp_panic_dump(); @@ -1223,6 +1394,8 @@ kdp_raise_exception( goto again; exit_raise_exception: + if (kdp_en_setmode) + (*kdp_en_setmode)(FALSE); /* link cleanup */ enable_preemption(); } @@ -1245,11 +1418,9 @@ create_panic_header(unsigned int request, const char *corename, struct corehdr *coreh; const char *mode = "octet"; char modelen = strlen(mode); -#if defined(__LP64__) + size_t fmask_size = sizeof(KDP_FEATURE_MASK_STRING) + sizeof(kdp_crashdump_feature_mask); -#else - size_t fmask_size = 0; -#endif + pkt.off = sizeof (struct ether_header); pkt.len = (unsigned int)(length + ((request == KDP_WRQ) ? modelen + fmask_size : 0) + (corename ? strlen(corename): 0) + sizeof(struct corehdr)); @@ -1303,11 +1474,13 @@ create_panic_header(unsigned int request, const char *corename, *cp++ = '\0'; cp += strlcpy (cp, mode, KDP_MAXPACKET - strlen(corename)); *cp++ = '\0'; -#if defined(__LP64__) cp += strlcpy(cp, KDP_FEATURE_MASK_STRING, sizeof(KDP_FEATURE_MASK_STRING)); *cp++ = '\0'; /* Redundant */ bcopy(&kdp_crashdump_feature_mask, cp, sizeof(kdp_crashdump_feature_mask)); -#endif + kdp_crashdump_pkt_size = KDP_LARGE_CRASHDUMP_PKT_SIZE; + PE_parse_boot_argn("kdp_crashdump_pkt_size", &kdp_crashdump_pkt_size, sizeof(kdp_crashdump_pkt_size)); + cp += sizeof(kdp_crashdump_feature_mask); + *(uint32_t *)cp = htonl(kdp_crashdump_pkt_size); } else { @@ -1330,14 +1503,11 @@ static int kdp_send_crashdump_seek(char *corename, uint64_t seek_off) { int panic_error; -#if defined(__LP64__) if (kdp_feature_large_crashdumps) { panic_error = kdp_send_crashdump_pkt(KDP_SEEK, corename, sizeof(seek_off), &seek_off); - } else -#endif - { + } else { uint32_t off = (uint32_t) seek_off; panic_error = kdp_send_crashdump_pkt(KDP_SEEK, corename, sizeof(off), &off); @@ -1353,40 +1523,44 @@ static int kdp_send_crashdump_seek(char 
*corename, uint64_t seek_off) } int kdp_send_crashdump_data(unsigned int request, char *corename, - uint64_t length, caddr_t txstart) + int64_t length, caddr_t txstart) { int panic_error = 0; while (length > 0) { - uint64_t chunk = MIN(SEGSIZE, length); - + uint64_t chunk = MIN(kdp_crashdump_pkt_size, length); + panic_error = kdp_send_crashdump_pkt(request, corename, chunk, - (caddr_t) txstart); + txstart); if (panic_error < 0) { printf ("kdp_send_crashdump_pkt failed with error %d\n", panic_error); return panic_error; } - if (!(panic_block % 2000)) - kdb_printf_unbuffered("."); - txstart += chunk; length -= chunk; } return 0; } +uint32_t kdp_crashdump_short_pkt; + int kdp_send_crashdump_pkt(unsigned int request, char *corename, uint64_t length, void *panic_data) { + int poll_count; struct corehdr *th = NULL; - int poll_count = 2500; - - char rretries = 0, tretries = 0; + char rretries, tretries; + + if (kdp_dump_start_time == 0) { + kdp_dump_start_time = mach_absolute_time(); + kdp_superblock_dump_start_time = kdp_dump_start_time; + } + tretries = rretries = 0; + poll_count = KDP_CRASHDUMP_POLL_COUNT; pkt.off = pkt.len = 0; - if (request == KDP_WRQ) /* longer timeout for initial request */ poll_count += 1000; @@ -1409,27 +1583,34 @@ kdp_send_crashdump_pkt(unsigned int request, char *corename, th = create_panic_header(request, corename, (unsigned)length, panic_block); if (request == KDP_DATA) { - /* as all packets are SEGSIZE in length, the last packet + /* as all packets are kdp_crashdump_pkt_size in length, the last packet * may end up with trailing bits. make sure that those * bits aren't confusing. 
*/ - if (length < SEGSIZE) - memset(th->th_data + length, 'X', - SEGSIZE - (uint32_t) length); + if (length < kdp_crashdump_pkt_size) { + kdp_crashdump_short_pkt++; + memset(th->th_data + length, 'Y', + kdp_crashdump_pkt_size - (uint32_t) length); + } - if (!kdp_machine_vm_read((mach_vm_address_t)(intptr_t)panic_data, (caddr_t) th->th_data, length)) { - memset ((caddr_t) th->th_data, 'X', (size_t)length); + if (!kdp_machine_vm_read((mach_vm_address_t)(uintptr_t)panic_data, (caddr_t) th->th_data, length)) { + uintptr_t next_page = round_page((uintptr_t)panic_data); + memset((caddr_t) th->th_data, 'X', (size_t)length); + if ((next_page - ((uintptr_t) panic_data)) < length) { + uint64_t resid = length - (next_page - (intptr_t) panic_data); + if (!kdp_machine_vm_read((mach_vm_address_t)(uintptr_t)next_page, (caddr_t) th->th_data + (length - resid), resid)) { + memset((caddr_t) th->th_data + (length - resid), 'X', (size_t)resid); + } + } } } else if (request == KDP_SEEK) { -#if defined(__LP64__) if (kdp_feature_large_crashdumps) *(uint64_t *) th->th_data = OSSwapHostToBigInt64((*(uint64_t *) panic_data)); else -#endif - *(unsigned int *) th->th_data = htonl(*(unsigned int *) panic_data); + *(unsigned int *) th->th_data = htonl(*(unsigned int *) panic_data); } - (*kdp_en_send_pkt)(&pkt.data[pkt.off], pkt.len); + kdp_send_data(&pkt.data[pkt.off], pkt.len); /* Listen for the ACK */ RECEIVE_RETRY: @@ -1443,17 +1624,22 @@ kdp_send_crashdump_pkt(unsigned int request, char *corename, pkt.input = FALSE; th = (struct corehdr *) &pkt.data[pkt.off]; -#if defined(__LP64__) if (request == KDP_WRQ) { uint16_t opcode64 = ntohs(th->th_opcode); uint16_t features64 = (opcode64 & 0xFF00)>>8; if ((opcode64 & 0xFF) == KDP_ACK) { kdp_feature_large_crashdumps = features64 & KDP_FEATURE_LARGE_CRASHDUMPS; + if (features64 & KDP_FEATURE_LARGE_PKT_SIZE) { + kdp_feature_large_pkt_size = 1; + } + else { + kdp_feature_large_pkt_size = 0; + kdp_crashdump_pkt_size = 512; + } printf("Protocol features: 
0x%x\n", (uint32_t) features64); th->th_opcode = htons(KDP_ACK); } } -#endif if (ntohs(th->th_opcode) == KDP_ACK && ntohl(th->th_block) == panic_block) { } else @@ -1485,12 +1671,25 @@ kdp_send_crashdump_pkt(unsigned int request, char *corename, kdp_us_spin ((tretries%4) * panic_timeout); /* capped linear backoff */ goto TRANSMIT_RETRY; } - - panic_block++; - - if (request == KDP_EOF) + + if (!(++panic_block % SBLOCKSZ)) { + uint64_t ctime; + kdb_printf_unbuffered("."); + ctime = mach_absolute_time(); + kdp_superblock_dump_time = ctime - kdp_superblock_dump_start_time; + kdp_superblock_dump_start_time = ctime; + if (kdp_superblock_dump_time > kdp_max_superblock_dump_time) + kdp_max_superblock_dump_time = kdp_superblock_dump_time; + if (kdp_superblock_dump_time < kdp_min_superblock_dump_time) + kdp_min_superblock_dump_time = kdp_superblock_dump_time; + } + + if (request == KDP_EOF) { printf("\nTotal number of packets transmitted: %d\n", panic_block); - + printf("Avg. superblock transfer abstime 0x%llx\n", ((mach_absolute_time() - kdp_dump_start_time) / panic_block) * SBLOCKSZ); + printf("Minimum superblock transfer abstime: 0x%llx\n", kdp_min_superblock_dump_time); + printf("Maximum superblock transfer abstime: 0x%llx\n", kdp_max_superblock_dump_time); + } return 1; } @@ -1521,8 +1720,6 @@ strnstr(char *s, const char *find, size_t slen) return (s); } -extern char version[]; - /* Horrid hack to extract xnu version if possible - a much cleaner approach * would be to have the integrator run a script which would copy the * xnu version into a string or an int somewhere at project submission @@ -1541,10 +1738,9 @@ kdp_get_xnu_version(char *versionbuf) char *vptr; strlcpy(vstr, "custom", 10); - if (kdp_machine_vm_read((mach_vm_address_t)(uintptr_t)version, versionbuf, 128)) { - versionbuf[127] = '\0'; - versionpos = strnstr(versionbuf, "xnu-", 115); + versionbuf[127] = '\0'; + versionpos = strnstr(versionbuf, "xnu-", 115); if (versionpos) { strncpy(vstr, versionpos, 
sizeof(vstr)); vstr[sizeof(vstr)-1] = '\0'; @@ -1562,8 +1758,6 @@ kdp_get_xnu_version(char *versionbuf) return retval; } -extern char *inet_aton(const char *cp, struct in_addr *pin); - void kdp_set_dump_info(const uint32_t flags, const char *filename, const char *destipstr, const char *routeripstr, @@ -1685,23 +1879,23 @@ kdp_panic_dump(void) char coreprefix[10]; int panic_error; - uint64_t abstime; + uint64_t abstime; uint32_t current_ip = ntohl((uint32_t)kdp_current_ip_address); if (flag_panic_dump_in_progress) { - printf("System dump aborted.\n"); + kdb_printf("System dump aborted.\n"); goto panic_dump_exit; } printf("Entering system dump routine\n"); if (!kdp_en_recv_pkt || !kdp_en_send_pkt) { - printf("Error: No transport device registered for kernel crashdump\n"); - return; + kdb_printf("Error: No transport device registered for kernel crashdump\n"); + return; } if (!panicd_specified) { - printf("A dump server was not specified in the boot-args, terminating kernel core dump.\n"); + kdb_printf("A dump server was not specified in the boot-args, terminating kernel core dump.\n"); goto panic_dump_exit; } @@ -1734,27 +1928,27 @@ kdp_panic_dump(void) } if (0 == inet_aton(panicd_ip_str, (struct in_addr *) &panic_server_ip)) { - printf("inet_aton() failed interpreting %s as a panic server IP\n", panicd_ip_str); + kdb_printf("inet_aton() failed interpreting %s as a panic server IP\n", panicd_ip_str); } else - printf("Attempting connection to panic server configured at IP %s, port %d\n", panicd_ip_str, panicd_port); + kdb_printf("Attempting connection to panic server configured at IP %s, port %d\n", panicd_ip_str, panicd_port); destination_mac = router_mac; if (kdp_arp_resolve(panic_server_ip, &temp_mac)) { - printf("Resolved %s's (or proxy's) link level address\n", panicd_ip_str); + kdb_printf("Resolved %s's (or proxy's) link level address\n", panicd_ip_str); destination_mac = temp_mac; } else { if (!flag_panic_dump_in_progress) goto panic_dump_exit; if 
(router_specified) { if (0 == inet_aton(router_ip_str, (struct in_addr *) &parsed_router_ip)) - printf("inet_aton() failed interpreting %s as an IP\n", router_ip_str); + kdb_printf("inet_aton() failed interpreting %s as an IP\n", router_ip_str); else { router_ip = parsed_router_ip; if (kdp_arp_resolve(router_ip, &temp_mac)) { destination_mac = temp_mac; - printf("Routing through specified router IP %s (%d)\n", router_ip_str, router_ip); + kdb_printf("Routing through specified router IP %s (%d)\n", router_ip_str, router_ip); } } } @@ -1762,7 +1956,7 @@ kdp_panic_dump(void) if (!flag_panic_dump_in_progress) goto panic_dump_exit; - printf("Transmitting packets to link level address: %02x:%02x:%02x:%02x:%02x:%02x\n", + kdb_printf("Transmitting packets to link level address: %02x:%02x:%02x:%02x:%02x:%02x\n", destination_mac.ether_addr_octet[0] & 0xff, destination_mac.ether_addr_octet[1] & 0xff, destination_mac.ether_addr_octet[2] & 0xff, @@ -1770,17 +1964,17 @@ kdp_panic_dump(void) destination_mac.ether_addr_octet[4] & 0xff, destination_mac.ether_addr_octet[5] & 0xff); - printf("Kernel map size is %llu\n", (unsigned long long) get_vmmap_size(kernel_map)); - printf("Sending write request for %s\n", corename_str); + kdb_printf("Kernel map size is %llu\n", (unsigned long long) get_vmmap_size(kernel_map)); + kdb_printf("Sending write request for %s\n", corename_str); if ((panic_error = kdp_send_crashdump_pkt(KDP_WRQ, corename_str, 0 , NULL)) < 0) { - printf ("kdp_send_crashdump_pkt failed with error %d\n", panic_error); + kdb_printf ("kdp_send_crashdump_pkt failed with error %d\n", panic_error); goto panic_dump_exit; } /* Just the panic log requested */ if ((panicstr != (char *) 0) && (kdp_flag & PANIC_LOG_DUMP)) { - printf("Transmitting panic log, please wait: "); + kdb_printf_unbuffered("Transmitting panic log, please wait: "); kdp_send_crashdump_data(KDP_DATA, corename_str, debug_buf_ptr - debug_buf, debug_buf); @@ -1794,15 +1988,13 @@ kdp_panic_dump(void) long start_off 
= msgbufp->msg_bufx; long len; - printf("Transmitting system log, please wait: "); + kdb_printf_unbuffered("Transmitting system log, please wait: "); if (start_off >= msgbufp->msg_bufr) { len = msgbufp->msg_size - start_off; kdp_send_crashdump_data(KDP_DATA, corename_str, len, msgbufp->msg_bufc + start_off); - /* seek to remove trailing bytes */ - if (len & (SEGSIZE - 1)) - kdp_send_crashdump_seek(corename_str, len); + kdp_send_crashdump_seek(corename_str, len); start_off = 0; } @@ -1843,14 +2035,8 @@ static boolean_t needs_serial_init = TRUE; static void kdp_serial_send(void *rpkt, unsigned int rpkt_len) { - if (needs_serial_init) - { - serial_init(); - needs_serial_init = FALSE; - } - // printf("tx\n"); - kdp_serialize_packet((unsigned char *)rpkt, rpkt_len, serial_putc); + kdp_serialize_packet((unsigned char *)rpkt, rpkt_len, pal_serial_putc); } static void @@ -1859,18 +2045,12 @@ kdp_serial_receive(void *rpkt, unsigned int *rpkt_len, unsigned int timeout) int readkar; uint64_t now, deadline; - if (needs_serial_init) - { - serial_init(); - needs_serial_init = FALSE; - } - clock_interval_to_deadline(timeout, 1000 * 1000 /* milliseconds */, &deadline); // printf("rx\n"); for(clock_get_uptime(&now); now < deadline; clock_get_uptime(&now)) { - readkar = serial_getc(); + readkar = pal_serial_getc(); if(readkar >= 0) { unsigned char *packet; @@ -1885,6 +2065,21 @@ kdp_serial_receive(void *rpkt, unsigned int *rpkt_len, unsigned int timeout) *rpkt_len = 0; } +static boolean_t +kdp_serial_setmode(boolean_t active) +{ + if (active == FALSE) /* leaving KDP */ + return TRUE; + + if (!needs_serial_init) + return TRUE; + + pal_serial_init(); + needs_serial_init = FALSE; + return TRUE; +} + + static void kdp_serial_callout(__unused void *arg, kdp_event_t event) { /* When we stop KDP, set the bit to re-initialize the console serial port @@ -1912,6 +2107,21 @@ static void kdp_serial_callout(__unused void *arg, kdp_event_t event) void kdp_init(void) { + 
strlcpy(kdp_kernelversion_string, version, sizeof(kdp_kernelversion_string)); + + /* Relies on platform layer calling panic_init() before kdp_init() */ + if (kernel_uuid[0] != '\0') { + /* + * Update kdp_kernelversion_string with our UUID + * generated at link time. + */ + + strlcat(kdp_kernelversion_string, "; UUID=", sizeof(kdp_kernelversion_string)); + strlcat(kdp_kernelversion_string, kernel_uuid, sizeof(kdp_kernelversion_string)); + } + + if (debug_boot_arg & DB_REBOOT_POST_CORE) + kdp_flag |= REBOOT_POST_CORE; #if CONFIG_SERIAL_KDP char kdpname[80]; struct in_addr ipaddr; @@ -1928,9 +2138,10 @@ kdp_init(void) return; #endif - kprintf("Intializing serial KDP\n"); + kprintf("Initializing serial KDP\n"); kdp_register_callout(kdp_serial_callout, NULL); + kdp_register_link(NULL, kdp_serial_setmode); kdp_register_send_receive(kdp_serial_send, kdp_serial_receive); /* fake up an ip and mac for early serial debugging */ @@ -1940,7 +2151,8 @@ kdp_init(void) macaddr.ether_addr_octet[3] = 'i'; macaddr.ether_addr_octet[4] = 'a'; macaddr.ether_addr_octet[5] = 'l'; - ipaddr.s_addr = 0xABADBABE; + ipaddr.s_addr = KDP_SERIAL_IPADDR; kdp_set_ip_and_mac_addresses(&ipaddr, &macaddr); + #endif /* CONFIG_SERIAL_KDP */ } diff --git a/osfmk/kdp/ml/i386/kdp_vm.c b/osfmk/kdp/ml/i386/kdp_vm.c index 752db7b2b..5633c73b9 100644 --- a/osfmk/kdp/ml/i386/kdp_vm.c +++ b/osfmk/kdp/ml/i386/kdp_vm.c @@ -44,9 +44,6 @@ #include #include -extern vm_offset_t sectTEXTB, sectDATAB, sectLINKB, sectPRELINKB; -extern unsigned long sectSizeTEXT, sectSizeDATA, sectSizeLINK, sectSizePRELINK; - int kern_dump(void); int kdp_dump_trap(int type, x86_saved_state32_t *regs); @@ -156,15 +153,16 @@ kern_dump(void) vm_map_t map; unsigned int thread_count, segment_count; unsigned int command_size = 0, header_size = 0, tstate_size = 0; - unsigned int hoffset = 0, foffset = 0, nfoffset = 0, vmoffset = 0; - unsigned int max_header_size = 0; - vm_offset_t header; + + uint64_t hoffset = 0, foffset = 0, nfoffset = 0, 
max_header_size; + vm_offset_t header, txstart; + vm_address_t vmoffset; + struct mach_header *mh; struct segment_command *sc; vm_size_t size; vm_prot_t prot = 0; vm_prot_t maxprot = 0; - vm_inherit_t inherit = 0; mythread_state_flavor_t flavors[MAX_TSTATE_FLAVORS]; vm_size_t nflavors; vm_size_t i; @@ -176,9 +174,6 @@ kern_dump(void) int error = 0; int panic_error = 0; - unsigned int txstart = 0; - unsigned int mach_section_count = 4; - unsigned int num_sects_txed = 0; map = kernel_map; @@ -194,7 +189,7 @@ kern_dump(void) tstate_size += sizeof(mythread_state_flavor_t) + (flavors[i].count * sizeof(int)); - command_size = (segment_count + mach_section_count) * + command_size = (segment_count) * sizeof(struct segment_command) + thread_count * sizeof(struct thread_command) + tstate_size * thread_count; @@ -212,7 +207,7 @@ kern_dump(void) mh->cputype = cpu_type(); mh->cpusubtype = cpu_subtype(); mh->filetype = MH_CORE; - mh->ncmds = segment_count + thread_count + mach_section_count; + mh->ncmds = segment_count + thread_count; mh->sizeofcmds = command_size; mh->flags = 0; @@ -225,7 +220,7 @@ kern_dump(void) max_header_size = foffset; - vmoffset = VM_MIN_ADDRESS; /* offset into VM */ + vmoffset = VM_MIN_KERNEL_ADDRESS; /* offset into VM */ /* Transmit the Mach-O MH_CORE header, and seek forward past the * area reserved for the segment and thread commands @@ -249,64 +244,36 @@ kern_dump(void) error = panic_error; goto out; } - printf ("Transmitting kernel state, please wait: "); - - while ((segment_count > 0) || (kret == KERN_SUCCESS)){ - /* Check if we've transmitted all the kernel sections */ - if (num_sects_txed == mach_section_count) { - - while (1) { - - /* - * Get region information for next region. 
- */ - - vbrcount = VM_REGION_SUBMAP_INFO_COUNT_64; - if((kret = vm_region_recurse_64(map, - &vmoffset, &size, &nesting_depth, - (vm_region_recurse_info_t)&vbr, - &vbrcount)) != KERN_SUCCESS) { - break; - } - - if(vbr.is_submap) { - nesting_depth++; - continue; - } else { - break; - } - } + printf ("Transmitting kernel state:\n"); - if(kret != KERN_SUCCESS) - break; + while ((segment_count > 0) || (kret == KERN_SUCCESS)) { + while (1) { - prot = vbr.protection; - maxprot = vbr.max_protection; - inherit = vbr.inheritance; - } - else - { - switch (num_sects_txed) { - case 0: - /* Transmit the kernel text section */ - vmoffset = sectTEXTB; - size = sectSizeTEXT; - break; - case 1: - vmoffset = sectDATAB; - size = sectSizeDATA; - break; - case 2: - vmoffset = sectPRELINKB; - size = sectSizePRELINK; + /* + * Get region information for next region. + */ + + vbrcount = VM_REGION_SUBMAP_INFO_COUNT_64; + if((kret = vm_region_recurse_64(map, + &vmoffset, &size, &nesting_depth, + (vm_region_recurse_info_t)&vbr, + &vbrcount)) != KERN_SUCCESS) { break; - case 3: - vmoffset = sectLINKB; - size = sectSizeLINK; + } + + if(vbr.is_submap) { + nesting_depth++; + continue; + } else { break; } - num_sects_txed++; } + + if(kret != KERN_SUCCESS) + break; + + prot = vbr.protection; + maxprot = vbr.max_protection; /* * Fill in segment command structure. 
*/ @@ -319,7 +286,7 @@ kern_dump(void) sc->segname[0] = 0; sc->vmaddr = vmoffset; sc->vmsize = size; - sc->fileoff = foffset; + sc->fileoff = (uint32_t) foffset; sc->filesize = size; sc->maxprot = maxprot; sc->initprot = prot; @@ -392,8 +359,7 @@ kern_dump(void) } /* last packet */ - if ((panic_error = kdp_send_crashdump_pkt (KDP_EOF, NULL, 0, ((void *) 0))) < 0) - { + if ((panic_error = kdp_send_crashdump_pkt (KDP_EOF, NULL, 0, ((void *) 0))) < 0) { printf ("kdp_send_crashdump_pkt failed with error %d\n", panic_error); error = panic_error; goto out; diff --git a/osfmk/kdp/ml/i386/kdp_x86_common.c b/osfmk/kdp/ml/i386/kdp_x86_common.c index 8f08df116..221d683ac 100644 --- a/osfmk/kdp/ml/i386/kdp_x86_common.c +++ b/osfmk/kdp/ml/i386/kdp_x86_common.c @@ -44,6 +44,8 @@ #include #include +#include + // #define KDP_VM_READ_DEBUG 1 // #define KDP_VM_WRITE_DEBUG 1 @@ -73,8 +75,8 @@ kdp_vtophys( mach_vm_size_t kdp_machine_vm_read( mach_vm_address_t src, caddr_t dst, mach_vm_size_t len) { - addr64_t cur_virt_src = (addr64_t)src; - addr64_t cur_virt_dst = (addr64_t)(intptr_t)dst; + addr64_t cur_virt_src = PAL_KDP_ADDR((addr64_t)src); + addr64_t cur_virt_dst = PAL_KDP_ADDR((addr64_t)(intptr_t)dst); addr64_t cur_phys_dst, cur_phys_src; mach_vm_size_t resid = len; mach_vm_size_t cnt = 0, cnt_src, cnt_dst; @@ -201,8 +203,8 @@ kdp_machine_vm_write( caddr_t src, mach_vm_address_t dst, mach_vm_size_t len) printf("kdp_vm_write: src %p dst %llx len %llx - %08X %08X\n", (void *)src, dst, len, ((unsigned int *)src)[0], ((unsigned int *)src)[1]); #endif - cur_virt_src = (addr64_t)(intptr_t)src; - cur_virt_dst = (addr64_t)dst; + cur_virt_src = PAL_KDP_ADDR((addr64_t)(intptr_t)src); + cur_virt_dst = PAL_KDP_ADDR((addr64_t)dst); resid = (unsigned)len; diff --git a/osfmk/kdp/ml/ppc/kdp_asm.s b/osfmk/kdp/ml/ppc/kdp_asm.s deleted file mode 100644 index cdc0cfc5f..000000000 --- a/osfmk/kdp/ml/ppc/kdp_asm.s +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. 
All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#include -#include -#include -#include -#include -#include -#include - -/* void kdp_call_with_ctx(int type, struct ppc_thread_state *ssp) - * - * Switch on kdp stack and enter the debugger. On return, - * switch back to the previous stack - * - * If the kdp stack is not free, we allocate ourselves a frame below - * the current kdp frame. This should never occur in a perfect world. 
- */ - -ENTRY(kdp_call_with_ctx, TAG_NO_FRAME_USED) - - lis r2,hi16(MASK(MSR_VEC)) ; Get the vector enable - mfmsr r7 ; Get the MSR - ori r2,r2,lo16(MASK(MSR_EE)|MASK(MSR_FP)) ; Get FP and EE - mflr r0 - andc r7,r7,r2 ; Clear FP, VEC, and EE - mtmsr r7 - isync ; Need this because we may have ditched fp/vec - mfsprg r8,0 /* Get the per_proc block address */ - stw r0, FM_LR_SAVE(r1) /* save lr in the current frame */ - - lwz r9, PP_DEBSTACKPTR(r8) /* get kdp stack pointer */ - cmpwi r9, 0 - bne 0f - -#ifdef LET_KDP_REENTER - mr r9, r1 /* get current stack pointer */ - subi r9, r9, FM_REDZONE + FM_SIZE -#else - bl EXT(kdp_print_backtrace) -#endif - -0: - stw r1, FM_ARG0(r9) /* Store old stack pointer */ - li r0, 0 - stw r0, PP_DEBSTACKPTR(r8) /* Mark kdp stack as busy */ - - subi r1, r9, FM_SIZE - stw r0, FM_BACKPTR(r1) - - bl EXT(kdp_trap) - - lis r2,hi16(MASK(MSR_VEC)) ; Get the vector enable - mfmsr r0 /* Get the MSR */ - ori r2,r2,lo16(MASK(MSR_EE)|MASK(MSR_FP)) ; Get FP and EE - addi r1, r1, FM_SIZE - andc r0,r0,r2 ; Clear FP, VEC, and EE - mtmsr r0 - isync ; Need this because we may have ditched fp/vec - - mfsprg r8,0 /* Get the per_proc block address */ - - stw r1, PP_DEBSTACKPTR(r8) /* Mark gdb stack as free */ - lwz r1, FM_ARG0(r1) - lwz r0, FM_LR_SAVE(r1) - mtlr r0 - - blr - - diff --git a/osfmk/kdp/ml/ppc/kdp_machdep.c b/osfmk/kdp/ml/ppc/kdp_machdep.c deleted file mode 100644 index e1e89331d..000000000 --- a/osfmk/kdp/ml/ppc/kdp_machdep.c +++ /dev/null @@ -1,827 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#define KDP_TEST_HARNESS 0 -#if KDP_TEST_HARNESS -#define dprintf(x) kprintf x -#else -#define dprintf(x) -#endif - -void print_saved_state(void *); -void kdp_call(void); -int kdp_getc(void); -boolean_t kdp_call_kdb(void); - -extern pmap_t kdp_pmap; - -int -machine_trace_thread(thread_t thread, char *tracepos, char *tracebound, int nframes, boolean_t user_p); - -int -machine_trace_thread64(thread_t thread, char *tracepos, char *tracebound, int nframes, boolean_t user_p); - -unsigned -machine_read64(addr64_t srcaddr, caddr_t dstaddr, uint32_t len); - -void -kdp_exception( - unsigned char *pkt, - int *len, - unsigned short *remote_port, - unsigned int exception, - unsigned int code, - unsigned int subcode -) -{ - struct { - kdp_exception_t pkt; - kdp_exc_info_t exc; - } aligned_pkt; - kdp_exception_t *rq = (kdp_exception_t *)&aligned_pkt; - - bcopy((char *)pkt, (char 
*)rq, sizeof(*rq)); - rq->hdr.request = KDP_EXCEPTION; - rq->hdr.is_reply = 0; - rq->hdr.seq = kdp.exception_seq; - rq->hdr.key = 0; - rq->hdr.len = sizeof (*rq) + sizeof(kdp_exc_info_t); - - rq->n_exc_info = 1; - rq->exc_info[0].cpu = 0; - rq->exc_info[0].exception = exception; - rq->exc_info[0].code = code; - rq->exc_info[0].subcode = subcode; - - rq->hdr.len += rq->n_exc_info * sizeof (kdp_exc_info_t); - - bcopy((char *)rq, (char *)pkt, rq->hdr.len); - - kdp.exception_ack_needed = TRUE; - - *remote_port = kdp.exception_port; - *len = rq->hdr.len; -} - -boolean_t -kdp_exception_ack( - unsigned char *pkt, - int len -) -{ - kdp_exception_ack_t aligned_pkt; - kdp_exception_ack_t *rq = (kdp_exception_ack_t *)&aligned_pkt; - - if ((size_t)len < sizeof (*rq)) - return(FALSE); - - bcopy((char *)pkt, (char *)rq, sizeof(*rq)); - - if (!rq->hdr.is_reply || rq->hdr.request != KDP_EXCEPTION) - return(FALSE); - - dprintf(("kdp_exception_ack seq %x %x\n", rq->hdr.seq, kdp.exception_seq)); - - if (rq->hdr.seq == kdp.exception_seq) { - kdp.exception_ack_needed = FALSE; - kdp.exception_seq++; - } - return(TRUE); -} - -static void -kdp_getintegerstate( - struct ppc_thread_state *state -) -{ - struct savearea *saved_state; - - saved_state = kdp.saved_state; - - bzero((char *)state,sizeof (struct ppc_thread_state)) ; - - state->srr0 = (unsigned int)saved_state->save_srr0; - state->srr1 = (unsigned int)saved_state->save_srr1; - state->r0 = (unsigned int)saved_state->save_r0; - state->r1 = (unsigned int)saved_state->save_r1; - state->r2 = (unsigned int)saved_state->save_r2; - state->r3 = (unsigned int)saved_state->save_r3; - state->r4 = (unsigned int)saved_state->save_r4; - state->r5 = (unsigned int)saved_state->save_r5; - state->r6 = (unsigned int)saved_state->save_r6; - state->r7 = (unsigned int)saved_state->save_r7; - state->r8 = (unsigned int)saved_state->save_r8; - state->r9 = (unsigned int)saved_state->save_r9; - state->r10 = (unsigned int)saved_state->save_r10; - state->r11 = 
(unsigned int)saved_state->save_r11; - state->r12 = (unsigned int)saved_state->save_r12; - state->r13 = (unsigned int)saved_state->save_r13; - state->r14 = (unsigned int)saved_state->save_r14; - state->r15 = (unsigned int)saved_state->save_r15; - state->r16 = (unsigned int)saved_state->save_r16; - state->r17 = (unsigned int)saved_state->save_r17; - state->r18 = (unsigned int)saved_state->save_r18; - state->r19 = (unsigned int)saved_state->save_r19; - state->r20 = (unsigned int)saved_state->save_r20; - state->r21 = (unsigned int)saved_state->save_r21; - state->r22 = (unsigned int)saved_state->save_r22; - state->r23 = (unsigned int)saved_state->save_r23; - state->r24 = (unsigned int)saved_state->save_r24; - state->r25 = (unsigned int)saved_state->save_r25; - state->r26 = (unsigned int)saved_state->save_r26; - state->r27 = (unsigned int)saved_state->save_r27; - state->r28 = (unsigned int)saved_state->save_r28; - state->r29 = (unsigned int)saved_state->save_r29; - state->r30 = (unsigned int)saved_state->save_r30; - state->r31 = (unsigned int)saved_state->save_r31; - state->cr = (unsigned int)saved_state->save_cr; - state->xer = (unsigned int)saved_state->save_xer; - state->lr = (unsigned int)saved_state->save_lr; - state->ctr = (unsigned int)saved_state->save_ctr; -} - -static void -kdp_getintegerstate64( - struct ppc_thread_state64 *state -) -{ - struct savearea *saved_state; - - saved_state = kdp.saved_state; - - bzero((char *)state,sizeof (struct ppc_thread_state64)) ; - - state->srr0 = saved_state->save_srr0; - state->srr1 = saved_state->save_srr1; - state->r0 = saved_state->save_r0; - state->r1 = saved_state->save_r1; - state->r2 = saved_state->save_r2; - state->r3 = saved_state->save_r3; - state->r4 = saved_state->save_r4; - state->r5 = saved_state->save_r5; - state->r6 = saved_state->save_r6; - state->r7 = saved_state->save_r7; - state->r8 = saved_state->save_r8; - state->r9 = saved_state->save_r9; - state->r10 = saved_state->save_r10; - state->r11 = 
saved_state->save_r11; - state->r12 = saved_state->save_r12; - state->r13 = saved_state->save_r13; - state->r14 = saved_state->save_r14; - state->r15 = saved_state->save_r15; - state->r16 = saved_state->save_r16; - state->r17 = saved_state->save_r17; - state->r18 = saved_state->save_r18; - state->r19 = saved_state->save_r19; - state->r20 = saved_state->save_r20; - state->r21 = saved_state->save_r21; - state->r22 = saved_state->save_r22; - state->r23 = saved_state->save_r23; - state->r24 = saved_state->save_r24; - state->r25 = saved_state->save_r25; - state->r26 = saved_state->save_r26; - state->r27 = saved_state->save_r27; - state->r28 = saved_state->save_r28; - state->r29 = saved_state->save_r29; - state->r30 = saved_state->save_r30; - state->r31 = saved_state->save_r31; - state->cr = saved_state->save_cr; - state->xer = saved_state->save_xer; - state->lr = saved_state->save_lr; - state->ctr = saved_state->save_ctr; -} - -kdp_error_t -kdp_machine_read_regs( - __unused unsigned int cpu, - unsigned int flavor, - char *data, - int *size -) -{ - switch (flavor) { - - case PPC_THREAD_STATE: - dprintf(("kdp_readregs THREAD_STATE\n")); - kdp_getintegerstate((struct ppc_thread_state *)data); - *size = PPC_THREAD_STATE_COUNT * sizeof(int); - return KDPERR_NO_ERROR; - - case PPC_THREAD_STATE64: - dprintf(("kdp_readregs THREAD_STATE\n")); - kdp_getintegerstate64((struct ppc_thread_state64 *)data); - *size = PPC_THREAD_STATE64_COUNT * sizeof(int); - return KDPERR_NO_ERROR; - - case PPC_FLOAT_STATE: - dprintf(("kdp_readregs THREAD_FPSTATE\n")); - bzero((char *)data ,sizeof(struct ppc_float_state)); - *size = PPC_FLOAT_STATE_COUNT * sizeof(int); - return KDPERR_NO_ERROR; - - default: - dprintf(("kdp_readregs bad flavor %d\n")); - return KDPERR_BADFLAVOR; - } -} - -static void -kdp_setintegerstate( - struct ppc_thread_state *state -) -{ - struct savearea *saved_state; - - saved_state = kdp.saved_state; - - saved_state->save_srr0 = state->srr0; - saved_state->save_srr1 = 
state->srr1; - saved_state->save_r0 = state->r0; - saved_state->save_r1 = state->r1; - saved_state->save_r2 = state->r2; - saved_state->save_r3 = state->r3; - saved_state->save_r4 = state->r4; - saved_state->save_r5 = state->r5; - saved_state->save_r6 = state->r6; - saved_state->save_r7 = state->r7; - saved_state->save_r8 = state->r8; - saved_state->save_r9 = state->r9; - saved_state->save_r10 = state->r10; - saved_state->save_r11 = state->r11; - saved_state->save_r12 = state->r12; - saved_state->save_r13 = state->r13; - saved_state->save_r14 = state->r14; - saved_state->save_r15 = state->r15; - saved_state->save_r16 = state->r16; - saved_state->save_r17 = state->r17; - saved_state->save_r18 = state->r18; - saved_state->save_r19 = state->r19; - saved_state->save_r20 = state->r20; - saved_state->save_r21 = state->r21; - saved_state->save_r22 = state->r22; - saved_state->save_r23 = state->r23; - saved_state->save_r24 = state->r24; - saved_state->save_r25 = state->r25; - saved_state->save_r26 = state->r26; - saved_state->save_r27 = state->r27; - saved_state->save_r28 = state->r28; - saved_state->save_r29 = state->r29; - saved_state->save_r30 = state->r30; - saved_state->save_r31 = state->r31; - saved_state->save_cr = state->cr; - saved_state->save_xer = state->xer; - saved_state->save_lr = state->lr; - saved_state->save_ctr = state->ctr; -} - -static void -kdp_setintegerstate64( - struct ppc_thread_state64 *state -) -{ - struct savearea *saved_state; - - saved_state = kdp.saved_state; - - saved_state->save_srr0 = state->srr0; - saved_state->save_srr1 = state->srr1; - saved_state->save_r0 = state->r0; - saved_state->save_r1 = state->r1; - saved_state->save_r2 = state->r2; - saved_state->save_r3 = state->r3; - saved_state->save_r4 = state->r4; - saved_state->save_r5 = state->r5; - saved_state->save_r6 = state->r6; - saved_state->save_r7 = state->r7; - saved_state->save_r8 = state->r8; - saved_state->save_r9 = state->r9; - saved_state->save_r10 = state->r10; - 
saved_state->save_r11 = state->r11; - saved_state->save_r12 = state->r12; - saved_state->save_r13 = state->r13; - saved_state->save_r14 = state->r14; - saved_state->save_r15 = state->r15; - saved_state->save_r16 = state->r16; - saved_state->save_r17 = state->r17; - saved_state->save_r18 = state->r18; - saved_state->save_r19 = state->r19; - saved_state->save_r20 = state->r20; - saved_state->save_r21 = state->r21; - saved_state->save_r22 = state->r22; - saved_state->save_r23 = state->r23; - saved_state->save_r24 = state->r24; - saved_state->save_r25 = state->r25; - saved_state->save_r26 = state->r26; - saved_state->save_r27 = state->r27; - saved_state->save_r28 = state->r28; - saved_state->save_r29 = state->r29; - saved_state->save_r30 = state->r30; - saved_state->save_r31 = state->r31; - saved_state->save_cr = state->cr; - saved_state->save_xer = state->xer; - saved_state->save_lr = state->lr; - saved_state->save_ctr = state->ctr; -} - -kdp_error_t -kdp_machine_write_regs( - __unused unsigned int cpu, - unsigned int flavor, - char *data, - __unused int *size -) -{ - switch (flavor) { - - case PPC_THREAD_STATE: - dprintf(("kdp_writeregs THREAD_STATE\n")); - kdp_setintegerstate((struct ppc_thread_state *)data); - -#if KDP_TEST_HARNESS - DumpTheSave((struct savearea *)data); /* (TEST/DEBUG) */ -#endif - return KDPERR_NO_ERROR; - - case PPC_THREAD_STATE64: - dprintf(("kdp_writeregs THREAD_STATE64\n")); - kdp_setintegerstate64((struct ppc_thread_state64 *)data); - -#if KDP_TEST_HARNESS - DumpTheSave((struct savearea *)data); /* (TEST/DEBUG) */ -#endif - return KDPERR_NO_ERROR; - case PPC_FLOAT_STATE: - dprintf(("kdp_writeregs THREAD_FPSTATE\n")); - return KDPERR_NO_ERROR; - - default: - dprintf(("kdp_writeregs bad flavor %d\n")); - return KDPERR_BADFLAVOR; - } -} - -void -kdp_machine_hostinfo( - kdp_hostinfo_t *hostinfo -) -{ - int i; - - hostinfo->cpus_mask = 0; - hostinfo->cpu_type = 0; - - for (i = 0; i < machine_info.max_cpus; i++) { - if ((PerProcTable[i].ppe_vaddr 
== (struct per_proc_info *)NULL) || - !(PerProcTable[i].ppe_vaddr->running)) - continue; - - hostinfo->cpus_mask |= (1 << i); - if (hostinfo->cpu_type == 0) { - hostinfo->cpu_type = slot_type(i); - hostinfo->cpu_subtype = slot_subtype(i); - } - } -} - -void -kdp_panic( - const char *msg -) -{ - printf("kdp panic: %s\n", msg); - while(1) {} -} - -extern void halt_all_cpus(boolean_t); - -void -kdp_machine_reboot(void) -{ - printf("Attempting system restart..."); - /* Call the platform specific restart*/ - if (PE_halt_restart) - (*PE_halt_restart)(kPERestartCPU); - /* If we do reach this, give up */ - halt_all_cpus(TRUE); -} - -int -kdp_intr_disbl(void) -{ - return (splhigh()); -} - -void -kdp_intr_enbl(int s) -{ - splx(s); -} - -void -kdp_us_spin(int usec) -{ - delay(usec/100); -} - -void print_saved_state(void *state) -{ - struct ppc_thread_state *saved_state; - - saved_state = state; - - printf("pc = 0x%x\n", saved_state->srr0); - printf("msr = 0x%x\n", saved_state->srr1); - printf("rp = 0x%x\n", saved_state->lr); - printf("sp = 0x%x\n", saved_state->r1); - -} - -void -kdp_call(void) -{ - Debugger("inline call to debugger(machine_startup)"); -} - -/* - * table to convert system specific code to generic codes for kdb - */ -int kdp_trap_codes[] = { - EXC_BAD_ACCESS, /* 0x0000 INVALID EXCEPTION */ - EXC_BAD_ACCESS, /* 0x0100 System reset */ - EXC_BAD_ACCESS, /* 0x0200 Machine check */ - EXC_BAD_ACCESS, /* 0x0300 Data access */ - EXC_BAD_ACCESS, /* 0x0400 Instruction access */ - EXC_BAD_ACCESS, /* 0x0500 External interrupt */ - EXC_BAD_ACCESS, /* 0x0600 Alignment */ - EXC_BREAKPOINT, /* 0x0700 Program - fp exc, ill/priv instr, trap */ - EXC_ARITHMETIC, /* 0x0800 Floating point disabled */ - EXC_SOFTWARE, /* 0x0900 Decrementer */ - EXC_BAD_ACCESS, /* 0x0A00 I/O controller interface */ - EXC_BAD_ACCESS, /* 0x0B00 INVALID EXCEPTION */ - EXC_SOFTWARE, /* 0x0C00 System call exception */ - EXC_BREAKPOINT, /* 0x0D00 Trace */ - EXC_SOFTWARE, /* 0x0E00 FP assist */ - 
EXC_SOFTWARE, /* 0x0F00 Performance monitoring */ - EXC_ARITHMETIC, /* 0x0F20 Altivec disabled */ - EXC_BAD_ACCESS, /* 0x1000 Instruction PTE miss */ - EXC_BAD_ACCESS, /* 0x1100 Data load PTE miss */ - EXC_BAD_ACCESS, /* 0x1200 Data store PTE miss */ - EXC_BREAKPOINT, /* 0x1300 Instruction bkpt */ - EXC_SOFTWARE, /* 0x1400 System management */ - EXC_BAD_ACCESS, /* 0x1500 INVALID EXCEPTION */ - EXC_ARITHMETIC, /* 0x1600 Altivec Assist */ - EXC_BAD_ACCESS, /* 0x1700 INVALID EXCEPTION */ - EXC_BAD_ACCESS, /* 0x1800 INVALID EXCEPTION */ - EXC_BAD_ACCESS, /* 0x1900 INVALID EXCEPTION */ - EXC_BAD_ACCESS, /* 0x1A00 INVALID EXCEPTION */ - EXC_BAD_ACCESS, /* 0x1B00 INVALID EXCEPTION */ - EXC_BAD_ACCESS, /* 0x1C00 INVALID EXCEPTION */ - EXC_BAD_ACCESS, /* 0x1D00 INVALID EXCEPTION */ - EXC_BAD_ACCESS, /* 0x1E00 INVALID EXCEPTION */ - EXC_BAD_ACCESS, /* 0x1F00 INVALID EXCEPTION */ - EXC_BREAKPOINT, /* 0x2000 Run Mode/Trace */ - EXC_BAD_ACCESS, /* 0x2100 INVALID EXCEPTION */ - EXC_BAD_ACCESS, /* 0x2200 INVALID EXCEPTION */ - EXC_BAD_ACCESS, /* 0x2300 INVALID EXCEPTION */ - EXC_BAD_ACCESS, /* 0x2400 INVALID EXCEPTION */ - EXC_BAD_ACCESS, /* 0x2500 INVALID EXCEPTION */ - EXC_BAD_ACCESS, /* 0x2600 INVALID EXCEPTION */ - EXC_BAD_ACCESS, /* 0x2700 INVALID EXCEPTION */ - EXC_BAD_ACCESS, /* 0x2800 INVALID EXCEPTION */ - EXC_BAD_ACCESS, /* 0x2900 INVALID EXCEPTION */ - EXC_BAD_ACCESS, /* 0x2A00 INVALID EXCEPTION */ - EXC_BAD_ACCESS, /* 0x2B00 INVALID EXCEPTION */ - EXC_BAD_ACCESS, /* 0x2C00 INVALID EXCEPTION */ - EXC_BAD_ACCESS, /* 0x2D00 INVALID EXCEPTION */ - EXC_BAD_ACCESS, /* 0x2E00 INVALID EXCEPTION */ - EXC_BAD_ACCESS, /* 0x2F00 INVALID EXCEPTION */ - EXC_SOFTWARE /* 0x3000 AST trap (software) */ -}; - -int -kdp_getc(void) -{ - return(cnmaygetc()); -} - -int kdp_backtrace; -int kdp_sr_dump; -int kdp_dabr; -int kdp_noisy; - -#define kdp_code(x) kdp_trap_codes[((x)==T_AST?0x31:(x)/T_VECTOR_SIZE)] - -void -kdp_trap( - unsigned int exception, - struct savearea *saved_state -) -{ - 
unsigned int *fp; - unsigned int sp; - - if (kdp_noisy) { - if (kdp_backtrace) { - printf("\nvector=%x, \n", exception/4); - sp = saved_state->save_r1; - printf("stack backtrace - sp(%x) ", sp); - fp = (unsigned int *) *((unsigned int *)sp); - while (fp) { - printf("0x%08x ", fp[2]); - fp = (unsigned int *)*fp; - } - printf("\n"); - } -#ifdef XXX - if (kdp_sr_dump) { - dump_segment_registers(); - } -#endif - - printf("vector=%d ", exception/4); - } - kdp_raise_exception(kdp_code(exception), 0, 0, saved_state); - - if (kdp_noisy) - printf("kdp_trap: kdp_raise_exception() ret\n"); - - if ((unsigned int)(saved_state->save_srr0) == 0x7c800008) - saved_state->save_srr0 += 4; /* BKPT_SIZE */ - - if(saved_state->save_srr1 & (MASK(MSR_SE) | MASK(MSR_BE))) { /* Are we just stepping or continuing */ - db_run_mode = STEP_ONCE; /* We are stepping */ - } - else db_run_mode = STEP_CONTINUE; /* Otherwise we are continuing */ - -#ifdef XXX - mtspr(dabr, kdp_dabr); -#endif -} - -boolean_t -kdp_call_kdb( - void) -{ - switch_debugger=1; - return(TRUE); -} - -static void kdp_print_registers(struct savearea *state) -{ - int i; - for (i=0; i<32; i++) { - if ((i % 8) == 0) - printf("\n%4d :",i); - printf(" %08llx",*(&state->save_r0+i)); - } - printf("\n"); - printf("cr = 0x%08x\t\t",state->save_cr); - printf("xer = 0x%08llx\n",state->save_xer); - printf("lr = 0x%08llx\t\t",state->save_lr); - printf("ctr = 0x%08llx\n",state->save_ctr); - printf("srr0(iar) = 0x%08llx\t\t",state->save_srr0); - printf("srr1(msr) = 0x%08llx\n",state->save_srr1); - printf("\n"); -} - -void kdp_print_backtrace(unsigned, struct savearea *); - -void -kdp_print_backtrace( - unsigned int exception, - struct savearea *saved_state) -{ - disable_debug_output = FALSE; - debug_mode = TRUE; - printf("re-entering kdp:\n"); - printf("vector=%x, \n", exception/4); - kdp_print_registers(saved_state); - print_backtrace(saved_state); - printf("panic: We are hanging here...\n"); - while(1); -} - -void 
-kdp_machine_get_breakinsn( - uint8_t *bytes, - uint32_t *size -) -{ - *(uint32_t *)bytes = 0x7fe00008; - *size = sizeof(uint32_t); -} - -#define LR_OFFSET 8 -#define LR_OFFSET64 16 - -int -machine_trace_thread(thread_t thread, char *tracepos, char *tracebound, int nframes, boolean_t user_p) -{ - uint32_t *tracebuf = (uint32_t *)tracepos; - uint32_t fence = 0; - uint32_t stackptr = 0; - uint32_t stacklimit = 0xb0000000; - int framecount = 0; - uint32_t init_srr0 = 0; - uint32_t prevsp = 0; - uint32_t framesize = 2 * sizeof(vm_offset_t); - - if (user_p) { - /* Examine the user savearea */ - init_srr0 = thread->machine.upcb->save_srr0; - stackptr = thread->machine.upcb->save_r1; - /* This bound isn't useful, but it doesn't hinder us */ - stacklimit = 0xffffffff; - kdp_pmap = thread->task->map->pmap; - } - else { - stackptr = thread->machine.pcb->save_r1; - init_srr0 = thread->machine.pcb->save_srr0; - } - /* Fill in the "current" program counter */ - *tracebuf++ = init_srr0; - - for (framecount = 0; framecount < nframes; framecount++) { -/* Bounds check */ - if ((uint32_t) (tracebound - ((char *)tracebuf)) < (4 * framesize)) { - tracebuf--; - break; - } - - *tracebuf++ = stackptr; -/* Invalid frame, or hit fence */ - if (!stackptr || (stackptr == fence)) { - break; - } -/* Stack grows downward */ - if (stackptr < prevsp) { - break; - } -/* Unaligned frame */ - if (stackptr & 0x000000F) { - break; - } - if (stackptr > stacklimit) { - break; - } -/* Assume there's a saved link register, and read it */ - if (kdp_machine_vm_read((caddr_t) (stackptr + LR_OFFSET), (caddr_t) tracebuf, sizeof(caddr_t)) != sizeof(caddr_t)) { - break; - } - - tracebuf++; - prevsp = stackptr; -/* Next frame */ - if (kdp_machine_vm_read((caddr_t) stackptr, (caddr_t) &stackptr, sizeof(caddr_t)) != sizeof(caddr_t)) { - *tracebuf++ = 0; - break; - } - } -/* Reset the target pmap */ - kdp_pmap = NULL; - return (uint32_t) (((char *) tracebuf) - tracepos); -} - -/* Routine to encapsulate the 64-bit 
address read hack*/ -unsigned -machine_read64(addr64_t srcaddr, caddr_t dstaddr, uint32_t len) -{ - unsigned retval; - - retval = kdp_machine_vm_read(srcaddr, dstaddr, len); - return retval; -} - -int -machine_trace_thread64(thread_t thread, char *tracepos, char *tracebound, int nframes, boolean_t user_p) -{ - uint64_t *tracebuf = (uint64_t *)tracepos; - uint32_t fence = 0; - addr64_t stackptr = 0; - uint64_t stacklimit = 0xb0000000; - int framecount = 0; - addr64_t init_srr0 = 0; - addr64_t prevsp = 0; - unsigned framesize = 2 * sizeof(addr64_t); - - if (user_p) { - init_srr0 = thread->machine.upcb->save_srr0; - stackptr = thread->machine.upcb->save_r1; - stacklimit = 0xffffffffffffffffULL; - kdp_pmap = thread->task->map->pmap; - } - else { - stackptr = thread->machine.pcb->save_r1; - init_srr0 = thread->machine.pcb->save_srr0; - } - - *tracebuf++ = init_srr0; - - for (framecount = 0; framecount < nframes; framecount++) { - - if ((uint32_t)(tracebound - ((char *)tracebuf)) < (4 * framesize)) { - tracebuf--; - break; - } - - *tracebuf++ = stackptr; - - if (!stackptr || (stackptr == fence)){ - break; - } - if (stackptr < prevsp) { - break; - } - if (stackptr & 0x000000F) { - break; - } - if (stackptr > stacklimit) { - break; - } - - if (machine_read64(stackptr+LR_OFFSET64, (caddr_t) tracebuf, sizeof(addr64_t)) != sizeof(addr64_t)) { - break; - } - tracebuf++; - - prevsp = stackptr; - if (machine_read64(stackptr, (caddr_t) &stackptr, sizeof(addr64_t)) != sizeof(addr64_t)) { - *tracebuf++ = 0; - break; - } - } - - kdp_pmap = NULL; - return (uint32_t) (((char *) tracebuf) - tracepos); -} - - -void -kdp_ml_enter_debugger(void) -{ - __asm__ __volatile__("tw 4,r3,r3"); -} - -int -kdp_machine_ioport_read(kdp_readioport_req_t *rq, caddr_t data, uint16_t lcpu) -{ - return 0; -} - -int -kdp_machine_ioport_write(kdp_writeioport_req_t *rq, caddr_t data, uint16_t lcpu) -{ - return 0; -} - -int -kdp_machine_msr64_read(kdp_readmsr64_req_t *rq, caddr_t data, uint16_t lcpu) -{ - 
return 0; -} - -int -kdp_machine_msr64_write(kdp_writemsr64_req_t *rq, __unused caddr_t data, uint16_t lcpu) -{ - return 0; -} diff --git a/osfmk/kdp/ml/ppc/kdp_misc.s b/osfmk/kdp/ml/ppc/kdp_misc.s deleted file mode 100644 index a007a296b..000000000 --- a/osfmk/kdp/ml/ppc/kdp_misc.s +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#include -#include -#include -#include -#include - -ENTRY(kdp_sync_cache, TAG_NO_FRAME_USED) - sync /* data sync */ - isync /* inst sync */ - blr /* return nothing */ - - -; -; This is a really stupid physical copy. 1 whole byte at a time... -; Source and dest are long longs. We do this with 64-bit on if -; supported. 
-; - - .align 5 - .globl EXT(kdp_copy_phys) - -LEXT(kdp_copy_phys) - - mflr r12 ; Save return - - bl EXT(ml_set_physical_disabled) ; No DR and get 64-bit - - rlwinm r3,r3,0,1,0 ; Dup low to high source - rlwinm r5,r5,0,1,0 ; Dup low to high dest - rlwimi r3,r4,0,0,31 ; Copy bottom on in source - rlwimi r5,r6,0,0,31 ; Copy bottom on in dest - -kcpagain: addic. r7,r7,-1 ; Drop count - blt-- kcpdone ; All done... - lbz r0,0(r3) ; Grab a whole one - stb r0,0(r5) ; Lay it gently down - addi r3,r3,1 ; Next source - addi r5,r5,1 ; Next destination - b kcpagain ; Once more with feeling... - -kcpdone: bl EXT(ml_restore) ; Put trans, etc back - mtlr r12 ; Restore return - blr ; Come again please... - diff --git a/osfmk/kdp/ml/ppc/kdp_vm.c b/osfmk/kdp/ml/ppc/kdp_vm.c deleted file mode 100644 index 737fd862d..000000000 --- a/osfmk/kdp/ml/ppc/kdp_vm.c +++ /dev/null @@ -1,570 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include - - -pmap_t kdp_pmap; -boolean_t kdp_trans_off; -boolean_t kdp_read_io; - -extern vm_offset_t sectTEXTB, sectDATAB, sectLINKB, sectPRELINKB; -extern unsigned long sectSizeTEXT, sectSizeDATA, sectSizeLINK, sectSizePRELINK; - -static addr64_t kdp_vtophys(pmap_t pmap, addr64_t va); -int kern_dump(void); - -typedef struct { - int flavor; /* the number for this flavor */ - mach_msg_type_number_t count; /* count of ints in this flavor */ -} mythread_state_flavor_t; - -static mythread_state_flavor_t thread_flavor_array[] = { - {PPC_THREAD_STATE , PPC_THREAD_STATE_COUNT}, -}; - -static int kdp_mynum_flavors = 1; -static int MAX_TSTATE_FLAVORS = 1; - -typedef struct { - vm_offset_t header; - int hoffset; - mythread_state_flavor_t *flavors; - int tstate_size; -} tir_t; - -char command_buffer[512]; - -/* - * - */ -static addr64_t -kdp_vtophys( - pmap_t pmap, - addr64_t va) -{ - addr64_t pa; - ppnum_t pp; - - pp = pmap_find_phys(pmap, va); /* Get the page number */ - if(!pp) return 0; /* Just return if no translation */ - - pa = ((addr64_t)pp << 12) | (va & 0x0000000000000FFFULL); /* Shove in the page offset */ - return(pa); -} -/* Verify that src is valid, and physically copy len bytes from src to - * dst, translating if 
necessary. If translation is enabled - * (kdp_trans_off is 0), a non-zero kdp_pmap specifies the pmap to use - * when translating src. - */ - -mach_vm_size_t -kdp_machine_vm_read( mach_vm_address_t src, caddr_t dst, mach_vm_size_t len) -{ - addr64_t cur_virt_src, cur_virt_dst; - addr64_t cur_phys_src, cur_phys_dst; - unsigned resid, cnt; - unsigned int dummy; - pmap_t pmap; - -#ifdef KDP_VM_READ_DEBUG - kprintf("kdp_machine_vm_read1: src %llx dst %llx len %x - %08X %08X\n", src, dst, len, ((unsigned long *)src)[0], ((unsigned long *)src)[1]); -#endif - - cur_virt_src = (addr64_t)src; - cur_virt_dst = (addr64_t)(intptr_t)dst; - - if (kdp_trans_off) { - resid = len; /* Get the length to copy */ - - while (resid != 0) { - - if((cur_phys_dst = kdp_vtophys(kernel_pmap, cur_virt_dst)) == 0) - goto exit; - - if(kdp_read_io == 0) - if(!mapping_phys_lookup((ppnum_t)(cur_virt_src >> 12), &dummy)) return 0; /* Can't read where there's not any memory */ - - cnt = 4096 - (cur_virt_src & 0xFFF); /* Get length left on page */ - if (cnt > (4096 - (cur_virt_dst & 0xFFF))) - cnt = 4096 - (cur_virt_dst & 0xFFF); - - if (cnt > resid) cnt = resid; - - bcopy_phys(cur_virt_src, cur_phys_dst, cnt); /* Copy stuff over */ - - cur_virt_src += cnt; - cur_virt_dst += cnt; - resid -= cnt; - } - - } else { - - resid = len; - - if(kdp_pmap) pmap = kdp_pmap; /* If special pmap, use it */ - else pmap = kernel_pmap; /* otherwise, use kernel's */ - - while (resid != 0) { -/* Always translate the destination using the kernel_pmap. 
*/ - if((cur_phys_dst = kdp_vtophys(kernel_pmap, cur_virt_dst)) == 0) - goto exit; - - if((cur_phys_src = kdp_vtophys(pmap, cur_virt_src)) == 0) - goto exit; - - if(kdp_read_io == 0) - if(!mapping_phys_lookup((ppnum_t)(cur_phys_src >> 12), &dummy)) goto exit; /* Can't read where there's not any memory */ - - cnt = 4096 - (cur_virt_src & 0xFFF); /* Get length left on page */ - if (cnt > (4096 - (cur_virt_dst & 0xFFF))) - cnt = 4096 - (cur_virt_dst & 0xFFF); - - if (cnt > resid) cnt = resid; - -#ifdef KDP_VM_READ_DEBUG - kprintf("kdp_machine_vm_read2: pmap %08X, virt %016LLX, phys %016LLX\n", - pmap, cur_virt_src, cur_phys_src); -#endif - - bcopy_phys(cur_phys_src, cur_phys_dst, cnt); /* Copy stuff over */ - - cur_virt_src +=cnt; - cur_virt_dst +=cnt; - resid -= cnt; - } - } -exit: -#ifdef KDP_VM_READ_DEBUG - kprintf("kdp_machine_vm_read: ret %08X\n", len-resid); -#endif - return (len - resid); -} - -mach_vm_size_t -kdp_machine_phys_read(kdp_readphysmem64_req_t *rq __unused, caddr_t dst __unused, uint16_t lcpu __unused) -{ - return 0; /* unimplemented */ -} - -/* - * - */ -mach_vm_size_t -kdp_machine_vm_write( caddr_t src, mach_vm_address_t dst, mach_vm_size_t len) -{ - addr64_t cur_virt_src, cur_virt_dst; - addr64_t cur_phys_src, cur_phys_dst; - unsigned resid, cnt, cnt_src, cnt_dst; - -#ifdef KDP_VM_WRITE_DEBUG - printf("kdp_vm_write: src %x dst %x len %x - %08X %08X\n", src, dst, len, ((unsigned long *)src)[0], ((unsigned long *)src)[1]); -#endif - - cur_virt_src = (addr64_t)(intptr_t)src; - cur_virt_dst = (addr64_t)dst; - - resid = len; - - while (resid != 0) { - if ((cur_phys_dst = kdp_vtophys(kernel_pmap, cur_virt_dst)) == 0) - goto exit; - - if ((cur_phys_src = kdp_vtophys(kernel_pmap, cur_virt_src)) == 0) - goto exit; - - cnt_src = ((cur_phys_src + NBPG) & (-NBPG)) - cur_phys_src; - cnt_dst = ((cur_phys_dst + NBPG) & (-NBPG)) - cur_phys_dst; - - if (cnt_src > cnt_dst) - cnt = cnt_dst; - else - cnt = cnt_src; - if (cnt > resid) - cnt = resid; - - 
bcopy_phys(cur_phys_src, cur_phys_dst, cnt); /* Copy stuff over */ - sync_cache64(cur_phys_dst, cnt); /* Sync caches */ - - cur_virt_src +=cnt; - cur_virt_dst +=cnt; - resid -= cnt; - } -exit: - return (len - resid); -} - -mach_vm_size_t -kdp_machine_phys_write(kdp_writephysmem64_req_t *rq __unused, caddr_t src __unused, - uint16_t lcpu __unused) -{ - return 0; /* unimplemented */ -} - -static void -kern_collectth_state(thread_t thread, tir_t *t) -{ - vm_offset_t header; - int hoffset, i ; - mythread_state_flavor_t *flavors; - struct thread_command *tc; - /* - * Fill in thread command structure. - */ - header = t->header; - hoffset = t->hoffset; - flavors = t->flavors; - - tc = (struct thread_command *) (header + hoffset); - tc->cmd = LC_THREAD; - tc->cmdsize = sizeof(struct thread_command) - + t->tstate_size; - hoffset += sizeof(struct thread_command); - /* - * Follow with a struct thread_state_flavor and - * the appropriate thread state struct for each - * thread state flavor. - */ - for (i = 0; i < kdp_mynum_flavors; i++) { - *(mythread_state_flavor_t *)(header+hoffset) = - flavors[i]; - hoffset += sizeof(mythread_state_flavor_t); - - if (machine_thread_get_kern_state(thread, flavors[i].flavor, - (thread_state_t) (header+hoffset), - &flavors[i].count) != KERN_SUCCESS) - printf ("Failure in machine_thread_get_kern_state()\n"); - hoffset += flavors[i].count*sizeof(int); - } - - t->hoffset = hoffset; -} - -int -kdp_dump_trap( - int type, - __unused struct savearea *regs) -{ - printf ("An unexpected trap (type %d) occurred during the kernel dump, terminating.\n", type); - kdp_send_crashdump_pkt(KDP_EOF, NULL, 0, ((void *) 0)); - abort_panic_transfer(); - kdp_flag &= ~KDP_PANIC_DUMP_ENABLED; - kdp_flag &= ~PANIC_CORE_ON_NMI; - kdp_flag &= ~PANIC_LOG_DUMP; - - kdp_reset(); - - kdp_raise_exception(EXC_BAD_ACCESS, 0, 0, kdp.saved_state); - return( 0 ); -} - -/* - * Kernel dump (limited to currently executing 32 bit mach_kernel only) - */ -int -kern_dump(void) -{ - int 
error = 0; - vm_map_t map; - unsigned int thread_count, segment_count; - unsigned int command_size = 0, header_size = 0, tstate_size = 0; - unsigned int hoffset = 0, foffset = 0, nfoffset = 0, vmoffset = 0; - unsigned int max_header_size = 0; - vm_offset_t header; - struct mach_header *mh; - struct segment_command *sc; - vm_size_t size; - vm_prot_t prot = 0; - vm_prot_t maxprot = 0; - vm_inherit_t inherit = 0; - int error1 = 0; - mythread_state_flavor_t flavors[MAX_TSTATE_FLAVORS]; - vm_size_t nflavors; - vm_size_t i; - uint32_t nesting_depth = 0; - kern_return_t kret = 0; - struct vm_region_submap_info_64 vbr; - mach_msg_type_number_t vbrcount = 0; - tir_t tir1; - - int panic_error = 0; - unsigned int txstart = 0; - unsigned int mach_section_count = 4; - unsigned int num_sects_txed = 0; - - map = kernel_map; - - thread_count = 1; - segment_count = get_vmmap_entries(map); - - printf("Kernel map has %d entries\n", segment_count); - - nflavors = kdp_mynum_flavors; - bcopy((char *)thread_flavor_array,(char *) flavors,sizeof(thread_flavor_array)); - - for (i = 0; i < nflavors; i++) - tstate_size += sizeof(mythread_state_flavor_t) + - (flavors[i].count * sizeof(int)); - - command_size = (segment_count + mach_section_count) * - sizeof(struct segment_command) + - thread_count*sizeof(struct thread_command) + - tstate_size*thread_count; - - header_size = command_size + sizeof(struct mach_header); - header = (vm_offset_t) command_buffer; - - /* - * Set up Mach-O header for currently executing 32 bit kernel. 
- */ - printf ("Generated Mach-O header size was %d\n", header_size); - - mh = (struct mach_header *) header; - mh->magic = MH_MAGIC; - mh->cputype = cpu_type(); - mh->cpusubtype = cpu_subtype(); /* XXX incorrect; should match kernel */ - mh->filetype = MH_CORE; - mh->ncmds = segment_count + thread_count + mach_section_count; - mh->sizeofcmds = command_size; - mh->flags = 0; - - hoffset = sizeof(struct mach_header); /* offset into header */ - foffset = round_page_32(header_size); /* offset into file */ - /* Padding.. */ - if ((foffset - header_size) < (4*sizeof(struct segment_command))) { - /* Hack */ - foffset += ((4*sizeof(struct segment_command)) - (foffset-header_size)); - } - - max_header_size = foffset; - - vmoffset = VM_MIN_ADDRESS; /* offset into VM */ - - /* Transmit the Mach-O MH_CORE header, and seek forward past the - * area reserved for the segment and thread commands - * to begin data transmission - */ - - if ((panic_error = kdp_send_crashdump_pkt(KDP_SEEK, NULL, sizeof(nfoffset) , &nfoffset)) < 0) { - printf ("kdp_send_crashdump_pkt failed with error %d\n", panic_error); - return -1; - } - - if ((panic_error = kdp_send_crashdump_data(KDP_DATA, NULL, sizeof(struct mach_header), (caddr_t) mh) < 0)) { - printf ("kdp_send_crashdump_data failed with error %d\n", panic_error); - return -1 ; - } - - if ((panic_error = kdp_send_crashdump_pkt(KDP_SEEK, NULL, sizeof(foffset) , &foffset) < 0)) { - printf ("kdp_send_crashdump_pkt failed with error %d\n", panic_error); - return (-1); - } - printf ("Transmitting kernel state, please wait: "); - - while ((segment_count > 0) || (kret == KERN_SUCCESS)){ - /* Check if we've transmitted all the kernel sections */ - if (num_sects_txed == mach_section_count) { - - while (1) { - - /* - * Get region information for next region. 
- */ - - vbrcount = VM_REGION_SUBMAP_INFO_COUNT_64; - if((kret = vm_region_recurse_64(map, - &vmoffset, &size, &nesting_depth, - (vm_region_recurse_info_t)&vbr, - &vbrcount)) != KERN_SUCCESS) { - break; - } - - if(vbr.is_submap) { - nesting_depth++; - continue; - } else { - break; - } - } - - if(kret != KERN_SUCCESS) - break; - - prot = vbr.protection; - maxprot = vbr.max_protection; - inherit = vbr.inheritance; - } - else - { - switch (num_sects_txed) { - case 0: - { - /* Transmit the kernel text section */ - vmoffset = sectTEXTB; - size = sectSizeTEXT; - } - break; - case 1: - { - vmoffset = sectDATAB; - size = sectSizeDATA; - } - break; - case 2: - { - vmoffset = sectPRELINKB; - size = sectSizePRELINK; - } - break; - case 3: - { - vmoffset = sectLINKB; - size = sectSizeLINK; - } - break; - /* TODO the lowmem vector area may be useful, but its transmission is - * disabled for now. The traceback table area should be transmitted - * as well - that's indirected from 0x5080. - */ - } - num_sects_txed++; - } - /* - * Fill in segment command structure. 
- */ - - if (hoffset > max_header_size) - break; - sc = (struct segment_command *) (header); - sc->cmd = LC_SEGMENT; - sc->cmdsize = sizeof(struct segment_command); - sc->segname[0] = 0; - sc->vmaddr = vmoffset; - sc->vmsize = size; - sc->fileoff = foffset; - sc->filesize = size; - sc->maxprot = maxprot; - sc->initprot = prot; - sc->nsects = 0; - - if ((panic_error = kdp_send_crashdump_pkt(KDP_SEEK, NULL, sizeof(hoffset) , &hoffset)) < 0) { - printf ("kdp_send_crashdump_pkt failed with error %d\n", panic_error); - return -1; - } - - if ((panic_error = kdp_send_crashdump_data(KDP_DATA, NULL, sizeof(struct segment_command) , (caddr_t) sc)) < 0) { - printf ("kdp_send_crashdump_data failed with error %d\n", panic_error); - return -1 ; - } - - /* Do not transmit memory tagged VM_MEMORY_IOKIT - instead, seek past that - * region on the server - this creates a hole in the file - */ - - if ((vbr.user_tag != VM_MEMORY_IOKIT)) { - - if ((panic_error = kdp_send_crashdump_pkt(KDP_SEEK, NULL, sizeof(foffset) , &foffset)) < 0) { - printf ("kdp_send_crashdump_pkt failed with error %d\n", panic_error); - return (-1); - } - - txstart = vmoffset; - - if ((panic_error = kdp_send_crashdump_data(KDP_DATA, NULL, size, (caddr_t) txstart)) < 0) { - printf ("kdp_send_crashdump_data failed with error %d\n", panic_error); - return -1 ; - } - } - - hoffset += sizeof(struct segment_command); - foffset += size; - vmoffset += size; - segment_count--; - } - tir1.header = header; - tir1.hoffset = 0; - tir1.flavors = flavors; - tir1.tstate_size = tstate_size; - - /* Now send out the LC_THREAD load command, with the thread information - * for the current activation. - * Note that the corefile can contain LC_SEGMENT commands with file offsets - * that point past the edge of the corefile, in the event that the last N - * VM regions were all I/O mapped or otherwise non-transferable memory, - * not followed by a normal VM region; i.e. there will be no hole that - * reaches to the end of the core file. 
- */ - kern_collectth_state (current_thread(), &tir1); - - if ((panic_error = kdp_send_crashdump_pkt(KDP_SEEK, NULL, sizeof(hoffset) , &hoffset)) < 0) { - printf ("kdp_send_crashdump_pkt failed with error %d\n", panic_error); - return -1; - } - - if ((panic_error = kdp_send_crashdump_data(KDP_DATA, NULL, tir1.hoffset , (caddr_t) header)) < 0) { - printf ("kdp_send_crashdump_data failed with error %d\n", panic_error); - return -1 ; - } - - /* last packet */ - if ((panic_error = kdp_send_crashdump_pkt(KDP_EOF, NULL, 0, ((void *) 0))) < 0) - { - printf ("kdp_send_crashdump_pkt failed with error %d\n", panic_error); - return (-1) ; - } - - if (error == 0) - error = error1; - return (error); -} diff --git a/osfmk/kdp/ml/x86_64/kdp_machdep.c b/osfmk/kdp/ml/x86_64/kdp_machdep.c index 1da2a0133..d7e071569 100644 --- a/osfmk/kdp/ml/x86_64/kdp_machdep.c +++ b/osfmk/kdp/ml/x86_64/kdp_machdep.c @@ -467,6 +467,11 @@ kdp_i386_trap( break; } + if (current_cpu_datap()->cpu_fatal_trap_state) { + current_cpu_datap()->cpu_post_fatal_trap_state = saved_state; + saved_state = current_cpu_datap()->cpu_fatal_trap_state; + } + kdp_raise_exception(exception, code, subcode, saved_state); /* If the instruction single step bit is set, disable kernel preemption */ diff --git a/osfmk/kdp/ml/x86_64/kdp_vm.c b/osfmk/kdp/ml/x86_64/kdp_vm.c index 8a80e7a3a..a76167621 100644 --- a/osfmk/kdp/ml/x86_64/kdp_vm.c +++ b/osfmk/kdp/ml/x86_64/kdp_vm.c @@ -45,9 +45,6 @@ #include #include -extern vm_offset_t sectTEXTB, sectDATAB, sectLINKB, sectPRELINKB; -extern unsigned long sectSizeTEXT, sectSizeDATA, sectSizeLINK, sectSizePRELINK; - int kern_dump(void); int kdp_dump_trap(int type, x86_saved_state64_t *regs); @@ -108,8 +105,31 @@ kern_collectth_state(thread_t thread, tir_t *t) if (flavors[i].flavor == x86_THREAD_STATE64) { x86_thread_state64_t *tstate = (x86_thread_state64_t *) (header + hoffset); vm_offset_t kstack; + x86_saved_state64_t *cpstate = current_cpu_datap()->cpu_fatal_trap_state; bzero(tstate, 
x86_THREAD_STATE64_COUNT * sizeof(int)); - if ((kstack = thread->kernel_stack) != 0){ + if ((current_thread() == thread) && (cpstate != NULL)) { + tstate->rax = cpstate->rax; + tstate->rbx = cpstate->rbx; + tstate->rcx = cpstate->rcx; + tstate->rdx = cpstate->rdx; + tstate->rdi = cpstate->rdi; + tstate->rsi = cpstate->rsi; + tstate->rbp = cpstate->rbp; + tstate->r8 = cpstate->r8; + tstate->r9 = cpstate->r9; + tstate->r10 = cpstate->r10; + tstate->r11 = cpstate->r11; + tstate->r12 = cpstate->r12; + tstate->r13 = cpstate->r13; + tstate->r14 = cpstate->r14; + tstate->r15 = cpstate->r15; + tstate->rip = cpstate->isf.rip; + tstate->rsp = cpstate->isf.rsp; + tstate->rflags = cpstate->isf.rflags; + tstate->cs = cpstate->isf.cs; + tstate->fs = cpstate->fs; + tstate->gs = cpstate->gs; + } else if ((kstack = thread->kernel_stack) != 0){ struct x86_kernel_state *iks = STACK_IKS(kstack); tstate->rbx = iks->k_rbx; tstate->rsp = iks->k_rsp; @@ -119,7 +139,7 @@ kern_collectth_state(thread_t thread, tir_t *t) tstate->r14 = iks->k_r14; tstate->r15 = iks->k_r15; tstate->rip = iks->k_rip; - } + } } else if (machine_thread_get_kern_state(thread, flavors[i].flavor, (thread_state_t) (header+hoffset), @@ -168,7 +188,6 @@ kern_dump(void) mach_vm_size_t size = 0; vm_prot_t prot = 0; vm_prot_t maxprot = 0; - vm_inherit_t inherit = 0; mythread_state_flavor_t flavors[MAX_TSTATE_FLAVORS]; vm_size_t nflavors; vm_size_t i; @@ -180,7 +199,6 @@ kern_dump(void) int error = 0; int panic_error = 0; - unsigned int mach_section_count = 0; map = kernel_map; @@ -196,7 +214,7 @@ kern_dump(void) tstate_size += (uint32_t)(sizeof(mythread_state_flavor_t) + (flavors[i].count * sizeof(int))); - command_size = (uint32_t)((segment_count + mach_section_count) * + command_size = (uint32_t)((segment_count) * sizeof(struct segment_command_64) + thread_count * sizeof(struct thread_command) + tstate_size * thread_count); @@ -214,7 +232,7 @@ kern_dump(void) mh64->cputype = cpu_type(); mh64->cpusubtype = cpu_subtype(); 
mh64->filetype = MH_CORE; - mh64->ncmds = segment_count + thread_count + mach_section_count; + mh64->ncmds = segment_count + thread_count; mh64->sizeofcmds = command_size; mh64->flags = 0; mh64->reserved = 0; @@ -281,7 +299,6 @@ kern_dump(void) prot = vbr.protection; maxprot = vbr.max_protection; - inherit = vbr.inheritance; /* * Fill in segment command structure. diff --git a/osfmk/kern/Makefile b/osfmk/kern/Makefile index 3b742c63d..cf8f5539e 100644 --- a/osfmk/kern/Makefile +++ b/osfmk/kern/Makefile @@ -18,6 +18,7 @@ EXPORT_ONLY_FILES = \ cpu_data.h \ debug.h \ etimer.h \ + extmod_statistics.h \ ipc_mig.h \ ipc_misc.h \ kalloc.h \ diff --git a/osfmk/kern/ast.c b/osfmk/kern/ast.c index b6540f92d..e7b895598 100644 --- a/osfmk/kern/ast.c +++ b/osfmk/kern/ast.c @@ -80,6 +80,10 @@ #include #include #include // for CHUD AST hook +#include + + +volatile perfASTCallback perfASTHook; void @@ -99,14 +103,14 @@ ast_taken( boolean_t preempt_trap = (reasons == AST_PREEMPTION); ast_t *myast = ast_pending(); thread_t thread = current_thread(); - perfCallback perf_hook = perfASTHook; + perfASTCallback perf_hook = perfASTHook; /* * CHUD hook - all threads including idle processor threads */ if (perf_hook) { if (*myast & AST_CHUD_ALL) { - (*perf_hook)(0, NULL, 0, 0); + (*perf_hook)(reasons, myast); if (*myast == AST_NONE) return; @@ -189,6 +193,7 @@ ast_check( thread_t thread = processor->active_thread; processor->current_pri = thread->sched_pri; + processor->current_thmode = thread->sched_mode; if ( processor->state == PROCESSOR_RUNNING || processor->state == PROCESSOR_SHUTDOWN ) { ast_t preempt; @@ -196,6 +201,8 @@ ast_check( /* * Propagate thread ast to processor. 
*/ + pal_ast_check(thread); + ast_propagate(thread->ast); /* diff --git a/osfmk/kern/audit_sessionport.c b/osfmk/kern/audit_sessionport.c index f42000464..7e8ee9c30 100644 --- a/osfmk/kern/audit_sessionport.c +++ b/osfmk/kern/audit_sessionport.c @@ -30,12 +30,13 @@ #include #include #include +#include #if CONFIG_AUDIT /* * audit_session_mksend * - * Description: Obtain a send right for given audit session information. + * Description: Obtain a send right for given audit session. * * Parameters: *aia_p Audit session information to assosiate with * the new port. @@ -45,48 +46,60 @@ * Returns: !NULL Resulting send right. * NULL Failed to allocate port (due to lack of memory * resources). - * - * *sessionport The session port that may have been allocated. - * - * Notes: On return, sendport will be set to the new send right on success, - * or null/dead on error. + + * Assumptions: Caller holds a reference on the session during the call. + * If there were no outstanding send rights against the port, + * hold a reference on the session and arm a new no-senders + * notification to determine when to release that reference. + * Otherwise, by creating an additional send right, we share + * the port's reference until all send rights go away. */ ipc_port_t audit_session_mksend(struct auditinfo_addr *aia_p, ipc_port_t *sessionport) { - ipc_port_t notifyport; ipc_port_t sendport = IPC_PORT_NULL; + ipc_port_t port; /* - * If we have an existing, active session port then use it. + * If we don't have an existing session port, then create one. 
*/ - sendport = ipc_port_make_send(*sessionport); - if (IP_VALID(sendport)) { - ip_lock(sendport); - if (ip_active(sendport) && - IKOT_AU_SESSIONPORT == ip_kotype(sendport)) { - ip_unlock(sendport); - return (sendport); - } - ip_unlock(sendport); - ipc_port_release_send(sendport); + port = *sessionport; + if (!IP_VALID(port)) { + ipc_port_t new_port = ipc_port_alloc_kernel(); + if (!IP_VALID(new_port)) + return new_port; + ipc_kobject_set(new_port, (ipc_kobject_t)aia_p, IKOT_AU_SESSIONPORT); + if (!OSCompareAndSwapPtr(port, new_port, sessionport)) + ipc_port_dealloc_kernel(new_port); + port = *sessionport; } + assert(ip_active(port) && IKOT_AU_SESSIONPORT == ip_kotype(port)); + sendport = ipc_port_make_send(port); + /* - * Otherwise, create a new one for this session. + * If we don't have a no-senders notification outstanding against + * the port, take a reference on the session and request one. */ - *sessionport = ipc_port_alloc_kernel(); - if (IP_VALID(*sessionport)) { - ipc_kobject_set(*sessionport, (ipc_kobject_t)aia_p, - IKOT_AU_SESSIONPORT); - - /* Request a no-senders notification. 
*/ - notifyport = ipc_port_make_sonce(*sessionport); - ip_lock(*sessionport); - /* unlocked by ipc_port_nsrequest */ - ipc_port_nsrequest(*sessionport, 1, notifyport, &notifyport); + if (IP_NULL == port->ip_nsrequest) { + ipc_port_t notifyport; + + audit_session_aiaref(aia_p); + + /* Need a send-once right for the target of the notification */ + notifyport = ipc_port_make_sonce(port); + + /* Request a no-senders notification (at the new make-send threshold) */ + ip_lock(port); + ipc_port_nsrequest(port, port->ip_mscount, notifyport, &notifyport); + /* port unlocked */ + + if (IP_NULL != notifyport) { + /* race requesting notification */ + audit_session_aiaunref(aia_p); + ipc_port_release_sonce(notifyport); + } } - sendport = ipc_port_make_send(*sessionport); return (sendport); } @@ -113,10 +126,12 @@ audit_session_porttoaia(ipc_port_t port) if (IP_VALID(port)) { ip_lock(port); - if (ip_active(port) && IKOT_AU_SESSIONPORT == ip_kotype(port)) + if (IKOT_AU_SESSIONPORT == ip_kotype(port)) { + assert(ip_active(port)); aia_p = (struct auditinfo_addr *)port->ip_kobject; + } ip_unlock(port); - } + } return (aia_p); } @@ -149,28 +164,50 @@ audit_session_nosenders(mach_msg_header_t *msg) ipc_port_t notifyport; struct auditinfo_addr *port_aia_p = NULL; - if (!IP_VALID(port)) - return; + assert(IKOT_AU_SESSIONPORT == ip_kotype(port)); ip_lock(port); - if (ip_active(port) && IKOT_AU_SESSIONPORT == ip_kotype(port)) { - port_aia_p = (struct auditinfo_addr *)port->ip_kobject; - assert(NULL != port_aia_p); - if (port->ip_mscount <= notification->not_count) - ipc_kobject_set_atomically(port, IKO_NULL, IKOT_NONE); - else { - /* re-arm the notification */ - ip_unlock(port); - notifyport = ipc_port_make_sonce(port); - ip_lock(port); - /* unlocked by ipc_port_nsrequest */ - ipc_port_nsrequest(port, port->ip_mscount, notifyport, - &notifyport); - return; + assert(ip_active(port)); + port_aia_p = (struct auditinfo_addr *)port->ip_kobject; + assert(NULL != port_aia_p); + + /* + * if new send 
rights have been made since the last notify + * request, re-arm the notification with the new threshold. + */ + if (port->ip_mscount > notification->not_count) { + ip_unlock(port); + notifyport = ipc_port_make_sonce(port); + ip_lock(port); + ipc_port_nsrequest(port, port->ip_mscount, notifyport, &notifyport); + /* port unlocked */ + + if (IP_NULL != notifyport) { + /* race re-arming the notification */ + ipc_port_release_sonce(notifyport); + audit_session_aiaunref(port_aia_p); } + return; } + + /* + * Otherwise, no more extant send rights, so release the + * reference held on the session by those send rights. + */ ip_unlock(port); - if (NULL != port_aia_p) - audit_session_portaiadestroy(port_aia_p); - ipc_port_dealloc_kernel(port); + audit_session_aiaunref(port_aia_p); +} + +void +audit_session_portdestroy(ipc_port_t *sessionport) +{ + ipc_port_t port = *sessionport; + + if (IP_VALID(port)) { + assert (ip_active(port)); + assert(IKOT_AU_SESSIONPORT == ip_kotype(port)); + ipc_kobject_set_atomically(port, IKO_NULL, IKOT_NONE); + ipc_port_dealloc_kernel(port); + *sessionport = IP_NULL; + } } #endif /* CONFIG_AUDIT */ diff --git a/osfmk/kern/audit_sessionport.h b/osfmk/kern/audit_sessionport.h index 5a26f3451..8b6a85477 100644 --- a/osfmk/kern/audit_sessionport.h +++ b/osfmk/kern/audit_sessionport.h @@ -31,11 +31,11 @@ struct auditinfo_addr; -ipc_port_t audit_session_mksend(struct auditinfo_addr *aia_p, - ipc_port_t *sessionport); +ipc_port_t audit_session_mksend(struct auditinfo_addr *, ipc_port_t *); struct auditinfo_addr *audit_session_porttoaia(ipc_port_t); -void audit_session_portaiadestroy(struct auditinfo_addr *); +void audit_session_portdestroy(ipc_port_t *); void audit_session_nosenders(mach_msg_header_t *); - +void audit_session_aiaref(struct auditinfo_addr *); +void audit_session_aiaunref(struct auditinfo_addr *); #endif /* _KERN_AUDIT_SESSIONPORT_H_ */ #endif /* KERNEL_PRIVATE */ diff --git a/osfmk/kern/bsd_kern.c b/osfmk/kern/bsd_kern.c index 
822c07ce9..07de86ef0 100644 --- a/osfmk/kern/bsd_kern.c +++ b/osfmk/kern/bsd_kern.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -95,6 +95,15 @@ void *get_bsdthread_info(thread_t th) return(th->uthread); } +/* + * XXX + */ +int get_thread_lock_count(thread_t th); /* forced forward */ +int get_thread_lock_count(thread_t th) +{ + return(th->mutex_count); +} + /* * XXX: wait for BSD to fix signal code * Until then, we cannot block here. We know the task @@ -135,7 +144,7 @@ get_signalact( !queue_end(&task->threads, (queue_entry_t)inc); ) { thread_mtx_lock(inc); if (inc->active && - (inc->sched_mode & TH_MODE_ISABORTED) != TH_MODE_ABORT) { + (inc->sched_flags & TH_SFLAG_ABORTED_MASK) != TH_SFLAG_ABORT) { thread = inc; break; } @@ -185,7 +194,7 @@ check_actforsig( thread_mtx_lock(inc); if (inc->active && - (inc->sched_mode & TH_MODE_ISABORTED) != TH_MODE_ABORT) { + (inc->sched_flags & TH_SFLAG_ABORTED_MASK) != TH_SFLAG_ABORT) { result = KERN_SUCCESS; break; } @@ -282,7 +291,7 @@ int is_64signalregset(void) * returned. 
*/ vm_map_t -swap_task_map(task_t task, thread_t thread, vm_map_t map) +swap_task_map(task_t task, thread_t thread, vm_map_t map, boolean_t doswitch) { vm_map_t old_map; @@ -290,8 +299,12 @@ swap_task_map(task_t task, thread_t thread, vm_map_t map) panic("swap_task_map"); task_lock(task); + mp_disable_preemption(); old_map = task->map; thread->map = task->map = map; + if (doswitch) + pmap_switch(map->pmap); + mp_enable_preemption(); task_unlock(task); #if (defined(__i386__) || defined(__x86_64__)) && NCOPY_WINDOWS > 0 @@ -452,7 +465,7 @@ boolean_t thread_should_abort( thread_t th) { - return ((th->sched_mode & TH_MODE_ISABORTED) == TH_MODE_ABORT); + return ((th->sched_flags & TH_SFLAG_ABORTED_MASK) == TH_SFLAG_ABORT); } /* @@ -470,14 +483,14 @@ current_thread_aborted ( thread_t th = current_thread(); spl_t s; - if ((th->sched_mode & TH_MODE_ISABORTED) == TH_MODE_ABORT && + if ((th->sched_flags & TH_SFLAG_ABORTED_MASK) == TH_SFLAG_ABORT && (th->options & TH_OPT_INTMASK) != THREAD_UNINT) return (TRUE); - if (th->sched_mode & TH_MODE_ABORTSAFELY) { + if (th->sched_flags & TH_SFLAG_ABORTSAFELY) { s = splsched(); thread_lock(th); - if (th->sched_mode & TH_MODE_ABORTSAFELY) - th->sched_mode &= ~TH_MODE_ISABORTED; + if (th->sched_flags & TH_SFLAG_ABORTSAFELY) + th->sched_flags &= ~TH_SFLAG_ABORTED_MASK; thread_unlock(th); splx(s); } @@ -532,7 +545,9 @@ fill_taskprocinfo(task_t task, struct proc_taskinfo_internal * ptinfo) vm_map_t map; task_absolutetime_info_data_t tinfo; thread_t thread; - int cswitch = 0, numrunning = 0; + uint32_t cswitch = 0, numrunning = 0; + uint32_t syscalls_unix = 0; + uint32_t syscalls_mach = 0; map = (task == kernel_task)? 
kernel_map: task->map; @@ -563,6 +578,9 @@ fill_taskprocinfo(task_t task, struct proc_taskinfo_internal * ptinfo) tval = timer_grab(&thread->system_timer); tinfo.threads_system += tval; tinfo.total_system += tval; + + syscalls_unix += thread->syscalls_unix; + syscalls_mach += thread->syscalls_mach; } ptinfo->pti_total_system = tinfo.total_system; @@ -575,8 +593,8 @@ fill_taskprocinfo(task_t task, struct proc_taskinfo_internal * ptinfo) ptinfo->pti_cow_faults = task->cow_faults; ptinfo->pti_messages_sent = task->messages_sent; ptinfo->pti_messages_received = task->messages_received; - ptinfo->pti_syscalls_mach = task->syscalls_mach; - ptinfo->pti_syscalls_unix = task->syscalls_unix; + ptinfo->pti_syscalls_mach = task->syscalls_mach + syscalls_mach; + ptinfo->pti_syscalls_unix = task->syscalls_unix + syscalls_unix; ptinfo->pti_csw = task->c_switch + cswitch; ptinfo->pti_threadnum = task->thread_count; ptinfo->pti_numrunning = numrunning; @@ -598,13 +616,7 @@ fill_taskthreadinfo(task_t task, uint64_t thaddr, struct proc_threadinfo_interna for (thact = (thread_t)queue_first(&task->threads); !queue_end(&task->threads, (queue_entry_t)thact); ) { -#if defined(__ppc__) || defined(__arm__) if (thact->machine.cthread_self == thaddr) -#elif defined (__i386__) || defined (__x86_64__) - if (thact->machine.pcb->cthread_self == thaddr) -#else -#error architecture not supported -#endif { count = THREAD_BASIC_INFO_COUNT; @@ -658,13 +670,7 @@ fill_taskthreadlist(task_t task, void * buffer, int thcount) for (thact = (thread_t)queue_first(&task->threads); !queue_end(&task->threads, (queue_entry_t)thact); ) { -#if defined(__ppc__) || defined(__arm__) thaddr = thact->machine.cthread_self; -#elif defined (__i386__) || defined (__x86_64__) - thaddr = thact->machine.pcb->cthread_self; -#else -#error architecture not supported -#endif *uptr++ = thaddr; numthr++; if (numthr >= thcount) diff --git a/osfmk/kern/call_entry.h b/osfmk/kern/call_entry.h index 57ab51d5e..36f47a31b 100644 --- 
a/osfmk/kern/call_entry.h +++ b/osfmk/kern/call_entry.h @@ -35,41 +35,116 @@ #ifdef MACH_KERNEL_PRIVATE #include -typedef void *call_entry_param_t; -typedef void (*call_entry_func_t)( - call_entry_param_t param0, - call_entry_param_t param1); +typedef void *call_entry_param_t; +typedef void (*call_entry_func_t)( + call_entry_param_t param0, + call_entry_param_t param1); typedef struct call_entry { - queue_chain_t q_link; - queue_t queue; + queue_chain_t q_link; + queue_head_t *queue; call_entry_func_t func; call_entry_param_t param0; call_entry_param_t param1; - uint64_t deadline; + uint64_t deadline; } call_entry_data_t; -typedef struct call_entry *call_entry_t; +typedef struct call_entry *call_entry_t; -extern queue_t call_entry_enqueue_deadline( - call_entry_t entry, - queue_t queue, - uint64_t deadline); -extern queue_t call_entry_enqueue_tail( - call_entry_t entry, - queue_t queue); - -extern queue_t call_entry_dequeue( - call_entry_t entry); - -#define call_entry_setup(entry, pfun, p0) \ -MACRO_BEGIN \ +#define call_entry_setup(entry, pfun, p0) \ +MACRO_BEGIN \ (entry)->func = (call_entry_func_t)(pfun); \ - (entry)->param0 = (call_entry_param_t)(p0); \ - (entry)->queue = NULL; \ + (entry)->param0 = (call_entry_param_t)(p0); \ + (entry)->queue = NULL; \ MACRO_END +#define qe(x) ((queue_entry_t)(x)) +#define CE(x) ((call_entry_t)(x)) + +static __inline__ queue_head_t * +call_entry_enqueue_tail( + call_entry_t entry, + queue_t queue) +{ + queue_t old_queue = entry->queue; + + if (old_queue != NULL) + (void)remque(qe(entry)); + + enqueue_tail(queue, qe(entry)); + + entry->queue = queue; + + return (old_queue); +} + +static __inline__ queue_head_t * +call_entry_dequeue( + call_entry_t entry) +{ + queue_t old_queue = entry->queue; + + if (old_queue != NULL) { + (void)remque(qe(entry)); + + entry->queue = NULL; + } + return (old_queue); +} + +static __inline__ queue_head_t * +call_entry_enqueue_deadline( + call_entry_t entry, + queue_head_t *queue, + uint64_t 
deadline) +{ + queue_t old_queue = entry->queue; + call_entry_t current; + + if (old_queue != queue || entry->deadline < deadline) { + if (old_queue == NULL) { + current = CE(queue_first(queue)); + } else if (old_queue != queue) { + (void)remque(qe(entry)); + current = CE(queue_first(queue)); + } else { + current = CE(queue_next(qe(entry))); + (void)remque(qe(entry)); + } + + while (TRUE) { + if (queue_end(queue, qe(current)) || + deadline < current->deadline) { + current = CE(queue_prev(qe(current))); + break; + } + + current = CE(queue_next(qe(current))); + } + insque(qe(entry), qe(current)); + } + else + if (deadline < entry->deadline) { + current = CE(queue_prev(qe(entry))); + + (void)remque(qe(entry)); + + while (TRUE) { + if (queue_end(queue, qe(current)) || + current->deadline <= deadline) { + break; + } + + current = CE(queue_prev(qe(current))); + } + insque(qe(entry), qe(current)); + } + entry->queue = queue; + entry->deadline = deadline; + + return (old_queue); +} #endif /* MACH_KERNEL_PRIVATE */ #endif /* _KERN_CALL_ENTRY_H_ */ diff --git a/osfmk/kern/clock.c b/osfmk/kern/clock.c index fd2e29797..e9c487ad6 100644 --- a/osfmk/kern/clock.c +++ b/osfmk/kern/clock.c @@ -50,7 +50,7 @@ uint32_t hz_tick_interval = 1; -decl_simple_lock_data(static,clock_lock) +decl_simple_lock_data(,clock_lock) #define clock_lock() \ simple_lock(&clock_lock) @@ -72,7 +72,6 @@ decl_simple_lock_data(static,clock_lock) * where CONV converts absolute time units into seconds and a fraction. */ static struct clock_calend { - uint64_t epoch; uint64_t offset; @@ -161,11 +160,6 @@ clock_config(void) thread_call_setup(&calend_wakecall, (thread_call_func_t)IOKitResetTime, NULL); clock_oldconfig(); - - /* - * Initialize the timer callouts. 
- */ - timer_call_initialize(); } /* @@ -246,6 +240,15 @@ clock_get_calendar_microtime( if (clock_calend.adjdelta < 0) { uint32_t t32; + /* + * Since offset is decremented during a negative adjustment, + * ensure that time increases monotonically without going + * temporarily backwards. + * If the delta has not yet passed, now is set to the start + * of the current adjustment period; otherwise, we're between + * the expiry of the delta and the next call to calend_adjust(), + * and we offset accordingly. + */ if (now > clock_calend.adjstart) { t32 = (uint32_t)(now - clock_calend.adjstart); @@ -305,6 +308,7 @@ clock_get_calendar_nanotime( now += clock_calend.offset; absolutetime_to_microtime(now, secs, nanosecs); + *nanosecs *= NSEC_PER_USEC; *secs += (clock_sec_t)clock_calend.epoch; @@ -408,6 +412,7 @@ clock_set_calendar_microtime( * Set the new calendar epoch. */ clock_calend.epoch = secs; + nanoseconds_to_absolutetime((uint64_t)microsecs * NSEC_PER_USEC, &clock_calend.offset); /* @@ -473,6 +478,7 @@ clock_initialize_calendar(void) * Set the new calendar epoch. */ clock_calend.epoch = secs; + nanoseconds_to_absolutetime((uint64_t)microsecs * NSEC_PER_USEC, &clock_calend.offset); /* @@ -538,7 +544,7 @@ clock_adjtime( interval = calend_set_adjustment(secs, microsecs); if (interval != 0) { calend_adjdeadline = mach_absolute_time() + interval; - if (!timer_call_enter(&calend_adjcall, calend_adjdeadline)) + if (!timer_call_enter(&calend_adjcall, calend_adjdeadline, TIMER_CALL_CRITICAL)) calend_adjactive++; } else @@ -558,47 +564,103 @@ calend_set_adjustment( int64_t total, ototal; uint32_t interval = 0; + /* + * Compute the total adjustment time in nanoseconds. + */ total = (int64_t)*secs * NSEC_PER_SEC + *microsecs * NSEC_PER_USEC; + /* + * Disable commpage gettimeofday(). + */ commpage_disable_timestamp(); + /* + * Get current absolute time. + */ now = mach_absolute_time(); + /* + * Save the old adjustment total for later return. 
+ */ ototal = calend_adjtotal; + /* + * Is a new correction specified? + */ if (total != 0) { + /* + * Set delta to the standard, small, adjustment skew. + */ int32_t delta = calend_adjskew; if (total > 0) { + /* + * Positive adjustment. If greater than the preset 'big' + * threshold, slew at a faster rate, capping if necessary. + */ if (total > calend_adjbig) delta *= 10; if (delta > total) delta = (int32_t)total; + /* + * Convert the delta back from ns to absolute time and store in adjoffset. + */ nanoseconds_to_absolutetime((uint64_t)delta, &t64); clock_calend.adjoffset = (uint32_t)t64; } else { + /* + * Negative adjustment; therefore, negate the delta. If + * greater than the preset 'big' threshold, slew at a faster + * rate, capping if necessary. + */ if (total < -calend_adjbig) delta *= 10; delta = -delta; if (delta < total) delta = (int32_t)total; + /* + * Save the current absolute time. Subsequent time operations occuring + * during this negative correction can make use of this value to ensure + * that time increases monotonically. + */ clock_calend.adjstart = now; + /* + * Convert the delta back from ns to absolute time and store in adjoffset. + */ nanoseconds_to_absolutetime((uint64_t)-delta, &t64); clock_calend.adjoffset = (uint32_t)t64; } + /* + * Store the total adjustment time in ns. + */ calend_adjtotal = total; + + /* + * Store the delta for this adjustment period in ns. + */ clock_calend.adjdelta = delta; + /* + * Set the interval in absolute time for later return. + */ interval = calend_adjinterval; } - else + else { + /* + * No change; clear any prior adjustment. + */ calend_adjtotal = clock_calend.adjdelta = 0; + } + /* + * If an prior correction was in progress, return the + * remaining uncorrected time from it. 
+ */ if (ototal != 0) { *secs = (long)(ototal / NSEC_PER_SEC); *microsecs = (int)((ototal % NSEC_PER_SEC) / NSEC_PER_USEC); @@ -627,7 +689,7 @@ calend_adjust_call(void) if (interval != 0) { clock_deadline_for_periodic_event(interval, mach_absolute_time(), &calend_adjdeadline); - if (!timer_call_enter(&calend_adjcall, calend_adjdeadline)) + if (!timer_call_enter(&calend_adjcall, calend_adjdeadline, TIMER_CALL_CRITICAL)) calend_adjactive++; } } @@ -661,21 +723,21 @@ calend_adjust(void) } } else - if (delta < 0) { - clock_calend.offset -= clock_calend.adjoffset; + if (delta < 0) { + clock_calend.offset -= clock_calend.adjoffset; - calend_adjtotal -= delta; - if (delta < calend_adjtotal) { - clock_calend.adjdelta = delta = (int32_t)calend_adjtotal; + calend_adjtotal -= delta; + if (delta < calend_adjtotal) { + clock_calend.adjdelta = delta = (int32_t)calend_adjtotal; - nanoseconds_to_absolutetime((uint64_t)-delta, &t64); - clock_calend.adjoffset = (uint32_t)t64; + nanoseconds_to_absolutetime((uint64_t)-delta, &t64); + clock_calend.adjoffset = (uint32_t)t64; + } + + if (clock_calend.adjdelta != 0) + clock_calend.adjstart = now; } - if (clock_calend.adjdelta != 0) - clock_calend.adjstart = now; - } - if (clock_calend.adjdelta != 0) interval = calend_adjinterval; diff --git a/osfmk/kern/clock_oldops.c b/osfmk/kern/clock_oldops.c index c268382d9..9a3e6c93f 100644 --- a/osfmk/kern/clock_oldops.c +++ b/osfmk/kern/clock_oldops.c @@ -766,7 +766,7 @@ set_alarm( uint64_t abstime; nanotime_to_absolutetime(alarm_time->tv_sec, alarm_time->tv_nsec, &abstime); - timer_call_enter(&alarm_expire_timer, abstime); + timer_call_enter(&alarm_expire_timer, abstime, 0); } /* diff --git a/osfmk/kern/debug.c b/osfmk/kern/debug.c index 39aa1f425..b6d146746 100644 --- a/osfmk/kern/debug.c +++ b/osfmk/kern/debug.c @@ -75,20 +75,19 @@ #include #endif -#ifdef __ppc__ -#include -#include -#endif - #if defined(__i386__) || defined(__x86_64__) #include #include #endif #include +#include #include 
#include +#include +#include +#include unsigned int halt_in_debugger = 0; unsigned int switch_debugger = 0; @@ -122,6 +121,7 @@ char *debug_buf_ptr = debug_buf; unsigned int debug_buf_size = sizeof(debug_buf); static char model_name[64]; +/* uuid_string_t */ char kernel_uuid[37]; struct pasc { unsigned a: 7; @@ -184,6 +184,14 @@ MACRO_END void panic_init(void) { + unsigned long uuidlen = 0; + void *uuid; + + uuid = getuuidfromheader(&_mh_execute_header, &uuidlen); + if ((uuid != NULL) && (uuidlen == sizeof(uuid_t))) { + uuid_unparse_upper(*(uuid_t *)uuid, kernel_uuid); + } + simple_lock_init(&panic_lock, 0); panic_is_inited = 1; panic_caller = 0; @@ -216,7 +224,7 @@ debug_log_init(void) void _consume_panic_args(int a __unused, ...) { - panic(NULL); + panic("panic"); } void @@ -227,7 +235,15 @@ panic(const char *str, ...) thread_t thread; wait_queue_t wq; +#if defined(__i386__) || defined(__x86_64__) + /* Attempt to display the unparsed panic string */ + const char *tstr = str; + kprintf("Panic initiated, string: "); + while (tstr && *tstr) + kprintf("%c", *tstr++); + kprintf("\n"); +#endif if (kdebug_enable) kdbg_dump_trace_to_file("/var/tmp/panic.trace"); @@ -236,10 +252,6 @@ panic(const char *str, ...) panic_safe(); -#ifdef __ppc__ - lastTrace = LLTraceSet(0); /* Disable low-level tracing */ -#endif - thread = current_thread(); /* Get failing thread */ wq = thread->wait_queue; /* Save the old value */ thread->wait_queue = NULL; /* Clear the wait so we do not get double panics when we try locks */ @@ -340,6 +352,7 @@ debug_putc(char c) } /* In-place packing routines -- inefficient, but they're called at most once. + * Assumes "buflen" is a multiple of 8. 
*/ int packA(char *inbuf, uint32_t length, uint32_t buflen) @@ -347,7 +360,7 @@ int packA(char *inbuf, uint32_t length, uint32_t buflen) unsigned int i, j = 0; pasc_t pack; - length = MIN(((length & ~7) +8), buflen); + length = MIN(((length + 7) & ~7), buflen); for (i = 0; i < length; i+=8) { @@ -362,7 +375,7 @@ int packA(char *inbuf, uint32_t length, uint32_t buflen) bcopy ((char *) &pack, inbuf + j, 7); j += 7; } - return ((length * 7)/8); + return j; } void unpackA(char *inbuf, uint32_t length) @@ -414,10 +427,20 @@ static void panic_display_model_name(void) { if (ml_nofault_copy((vm_offset_t) &model_name, (vm_offset_t) &tmp_model_name, sizeof(model_name)) != sizeof(model_name)) return; - model_name[sizeof(model_name) - 1] = '\0'; + tmp_model_name[sizeof(tmp_model_name) - 1] = '\0'; + + if (tmp_model_name[0] != 0) + kdb_printf("System model name: %s\n", tmp_model_name); +} + +static void panic_display_kernel_uuid(void) { + char tmp_kernel_uuid[sizeof(kernel_uuid)]; + + if (ml_nofault_copy((vm_offset_t) &kernel_uuid, (vm_offset_t) &tmp_kernel_uuid, sizeof(kernel_uuid)) != sizeof(kernel_uuid)) + return; - if (model_name[0] != 0) - kdb_printf("System model name: %s\n", model_name); + if (tmp_kernel_uuid[0] != '\0') + kdb_printf("Kernel UUID: %s\n", tmp_kernel_uuid); } static void panic_display_uptime(void) { @@ -430,30 +453,37 @@ static void panic_display_uptime(void) { extern const char version[]; extern char osversion[]; +static volatile uint32_t config_displayed = 0; + __private_extern__ void panic_display_system_configuration(void) { - static volatile boolean_t config_displayed = FALSE; panic_display_process_name(); - if (config_displayed == FALSE) { - config_displayed = TRUE; + if (OSCompareAndSwap(0, 1, &config_displayed)) { + char buf[256]; + if (strlcpy(buf, PE_boot_args(), sizeof(buf))) + kdb_printf("Boot args: %s\n", buf); kdb_printf("\nMac OS version:\n%s\n", (osversion[0] != 0) ? 
osversion : "Not yet set"); kdb_printf("\nKernel version:\n%s\n",version); + panic_display_kernel_uuid(); + panic_display_pal_info(); panic_display_model_name(); panic_display_uptime(); -#if defined(__i386__) || defined(__x86_64__) - pmap_pagetable_corruption_msg_log(&kdb_printf); -#endif /* i386 || x86_64 */ panic_display_zprint(); +#if CONFIG_ZLEAKS + panic_display_ztrace(); +#endif /* CONFIG_ZLEAKS */ kext_dump_panic_lists(&kdb_log); } } extern zone_t first_zone; extern unsigned int num_zones, stack_total; +extern unsigned long long stack_allocs; #if defined(__i386__) || defined (__x86_64__) extern unsigned int inuse_ptepages_count; +extern long long alloc_ptepages_count; #endif extern boolean_t panic_include_zprint; @@ -493,6 +523,37 @@ __private_extern__ void panic_display_zprint() } } +#if CONFIG_ZLEAKS +extern boolean_t panic_include_ztrace; +extern struct ztrace* top_ztrace; +/* + * Prints the backtrace most suspected of being a leaker, if we paniced in the zone allocator. + * top_ztrace and panic_include_ztrace comes from osfmk/kern/zalloc.c + */ +__private_extern__ void panic_display_ztrace(void) +{ + if(panic_include_ztrace == TRUE) { + unsigned int i = 0; + struct ztrace top_ztrace_copy; + + /* Make sure not to trip another panic if there's something wrong with memory */ + if(ml_nofault_copy((vm_offset_t)top_ztrace, (vm_offset_t)&top_ztrace_copy, sizeof(struct ztrace)) == sizeof(struct ztrace)) { + kdb_printf("\nBacktrace suspected of leaking: (outstanding bytes: %lu)\n", (uintptr_t)top_ztrace_copy.zt_size); + /* Print the backtrace addresses */ + for (i = 0; (i < top_ztrace_copy.zt_depth && i < MAX_ZTRACE_DEPTH) ; i++) { + kdb_printf("%p\n", top_ztrace_copy.zt_stack[i]); + } + /* Print any kexts in that backtrace, along with their link addresses so we can properly blame them */ + kmod_panic_dump((vm_offset_t *)&top_ztrace_copy.zt_stack[0], top_ztrace_copy.zt_depth); + } + else { + kdb_printf("\nCan't access top_ztrace...\n"); + } + kdb_printf("\n"); + 
} +} +#endif /* CONFIG_ZLEAKS */ + #if !MACH_KDP static struct ether_addr kdp_current_mac_address = {{0, 0, 0, 0, 0, 0}}; diff --git a/osfmk/kern/debug.h b/osfmk/kern/debug.h index 308435ece..66702bc16 100644 --- a/osfmk/kern/debug.h +++ b/osfmk/kern/debug.h @@ -40,8 +40,10 @@ struct thread_snapshot { uint32_t nkern_frames; uint32_t nuser_frames; uint64_t wait_event; - uint64_t continuation; + uint64_t continuation; uint64_t thread_id; + uint64_t user_time; + uint64_t system_time; int32_t state; char ss_flags; } __attribute__ ((packed)); @@ -50,6 +52,13 @@ struct task_snapshot { uint32_t snapshot_magic; int32_t pid; uint32_t nloadinfos; + uint64_t user_time_in_terminated_threads; + uint64_t system_time_in_terminated_threads; + int suspend_count; + int task_size; // pages + int faults; // number of page faults + int pageins; // number of actual pageins + int cow_faults; // number of copy-on-write faults char ss_flags; /* We restrict ourselves to a statically defined * (current as of 2009) length for the @@ -59,19 +68,34 @@ struct task_snapshot { char p_comm[17]; } __attribute__ ((packed)); + +struct mem_snapshot { + uint32_t snapshot_magic; + uint32_t free_pages; + uint32_t active_pages; + uint32_t inactive_pages; + uint32_t purgeable_pages; + uint32_t wired_pages; + uint32_t speculative_pages; + uint32_t throttled_pages; +} __attribute__((packed)); + enum { kUser64_p = 0x1, kKernel64_p = 0x2, - kHasDispatchSerial = 0x4 + kHasDispatchSerial = 0x4, + kTerminatedSnapshot = 0x8 }; enum { - STACKSHOT_GET_DQ = 0x1, - STACKSHOT_SAVE_LOADINFO = 0x2 + STACKSHOT_GET_DQ = 0x1, + STACKSHOT_SAVE_LOADINFO = 0x2, + STACKSHOT_GET_GLOBAL_MEM_STATS = 0x4 }; #define STACKSHOT_THREAD_SNAPSHOT_MAGIC 0xfeedface #define STACKSHOT_TASK_SNAPSHOT_MAGIC 0xdecafbad +#define STACKSHOT_MEM_SNAPSHOT_MAGIC 0xabcddcba #endif /* __APPLE_API_UNSTABLE */ #endif /* __APPLE_API_PRIVATE */ @@ -81,6 +105,7 @@ enum { extern unsigned int systemLogDiags; extern char debug_buf[]; extern unsigned int 
debug_boot_arg; +extern char kernel_uuid[]; #ifdef MACH_KERNEL_PRIVATE @@ -93,12 +118,12 @@ extern unsigned int current_debugger; #define KDP_CUR_DB 0x1 #define KDB_CUR_DB 0x2 -extern unsigned int active_debugger; +extern unsigned int active_debugger; extern unsigned int debug_mode; extern unsigned int disable_debug_output; -extern unsigned int panicDebugging; -extern unsigned int logPanicDataToScreen; +extern unsigned int panicDebugging; +extern unsigned int logPanicDataToScreen; extern int db_run_mode; @@ -130,7 +155,9 @@ void unpackA(char *inbuf, uint32_t length); void panic_display_system_configuration(void); void panic_display_zprint(void); - +#if CONFIG_ZLEAKS +void panic_display_ztrace(void); +#endif /* CONFIG_ZLEAKS */ #endif /* MACH_KERNEL_PRIVATE */ #define DB_HALT 0x1 diff --git a/osfmk/kern/etimer.h b/osfmk/kern/etimer.h index de66f9749..8c3674288 100644 --- a/osfmk/kern/etimer.h +++ b/osfmk/kern/etimer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2010 Apple Inc. All rights reserved. + * Copyright (c) 2004-2008 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -58,6 +58,17 @@ extern int setPop(uint64_t time); extern void etimer_resync_deadlines(void); +extern uint32_t etimer_queue_migrate(int target_cpu); + +/* Kernel trace events associated with timers */ +#define DECR_TRAP_LATENCY MACHDBG_CODE(DBG_MACH_EXCP_DECI, 0) +#define DECR_SET_DEADLINE MACHDBG_CODE(DBG_MACH_EXCP_DECI, 1) +#define DECR_TIMER_CALLOUT MACHDBG_CODE(DBG_MACH_EXCP_DECI, 2) +#define DECR_PM_DEADLINE MACHDBG_CODE(DBG_MACH_EXCP_DECI, 3) +#define DECR_TIMER_MIGRATE MACHDBG_CODE(DBG_MACH_EXCP_DECI, 4) +#define DECR_RDHPET MACHDBG_CODE(DBG_MACH_EXCP_DECI, 5) +#define DECR_SET_TSC_DEADLINE MACHDBG_CODE(DBG_MACH_EXCP_DECI, 6) + #endif /* _KERN_ETIMER_H_ */ #endif /* KERNEL_PRIVATE */ diff --git a/osfmk/kern/exception.c b/osfmk/kern/exception.c index a3578e1a0..27082522f 100644 --- a/osfmk/kern/exception.c +++ b/osfmk/kern/exception.c @@ -438,7 +438,6 @@ kern_return_t abnormal_exit_notify(mach_exception_data_type_t exccode, */ kern_return_t sys_perf_notify(thread_t thread, int pid) { - host_priv_t hostp; struct exception_action *excp; ipc_port_t xport; diff --git a/osfmk/kern/extmod_statistics.c b/osfmk/kern/extmod_statistics.c new file mode 100644 index 000000000..4eb26d1ce --- /dev/null +++ b/osfmk/kern/extmod_statistics.c @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include + +#include +#include +#include +#include +#include + +#include + +/* + * This code module adds statistics to track when + * a userspace task is modified by another userspace + * task. This can facilitate triage of crashes + * and abberant behavior, which are not expected + * to occur when the program is running in its + * qualified environment. + * + * We assume the target task has a lifecycle lock + * that will prevent it from exiting + * (task_reference/task_reference_internal), which + * should be called either explicitly, or implicitly + * via MIG glue code (convert_port_to_task). + * + * Host-wide statistics don't asssume any locks are + * held, and use atomic operations. + * + * If we can detect that the kernel proper is + * performing these operations, don't count + * it as an external modification. 
Some of the + * external modification routines are called + * by the kernel during thread setup, in which + * case we rename the userspace entrypoint called + * by the MIG demuxer to have a "_from_user" suffix. + */ + +/* externs for BSD kernel */ +extern void fslog_extmod_msgtracer(void *, void *); + +/* local routines */ +static void +extmod_statistics_log(task_t current_task, task_t target); + +void +extmod_statistics_incr_task_for_pid(task_t target) +{ + task_t ctask = current_task(); + + if ((ctask == kernel_task) || (target == TASK_NULL)) + return; + + if (target != ctask) { + ctask->extmod_statistics.task_for_pid_caller_count++; + target->extmod_statistics.task_for_pid_count++; + OSIncrementAtomic64(&host_extmod_statistics.task_for_pid_count); + } +} + +void +extmod_statistics_incr_thread_set_state(thread_t target) +{ + task_t ctask = current_task(); + task_t ttask; + + if ((ctask == kernel_task) || (target == THREAD_NULL)) + return; + + ttask = get_threadtask(target); + + if (ttask == TASK_NULL) + return; + + if (ttask != ctask) { + ctask->extmod_statistics.thread_set_state_caller_count++; + ttask->extmod_statistics.thread_set_state_count++; + OSIncrementAtomic64(&host_extmod_statistics.thread_set_state_count); + } +} + +void +extmod_statistics_incr_thread_create(task_t target) +{ + task_t ctask = current_task(); + + if ((ctask == kernel_task) || (target == TASK_NULL)) + return; + + if (target != ctask) { + ctask->extmod_statistics.thread_creation_caller_count++; + target->extmod_statistics.thread_creation_count++; + OSIncrementAtomic64(&host_extmod_statistics.thread_creation_count); + + extmod_statistics_log(ctask, target); + } +} + +static void +extmod_statistics_log(task_t current_task, task_t target) +{ + void *c_proc; + void *t_proc; + + c_proc = get_bsdtask_info(current_task); + t_proc = get_bsdtask_info(target); + if (c_proc && t_proc) { + fslog_extmod_msgtracer(c_proc, t_proc); + } +} diff --git a/osfmk/ppc/PPCcalls.c 
b/osfmk/kern/extmod_statistics.h similarity index 71% rename from osfmk/ppc/PPCcalls.c rename to osfmk/kern/extmod_statistics.h index 39203ec5d..5bf20066a 100644 --- a/osfmk/ppc/PPCcalls.c +++ b/osfmk/kern/extmod_statistics.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,25 +25,22 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - /* - * To add a new entry: - * Add an "PPCcall(routine)" to the table in ppc/PPCcalls.h + * kern/extmod_statistics.h * - * Add trap definition to mach/ppc/syscall_sw.h and - * recompile user library. + * Definitions for statistics related to external + * modification of a task by another agent on the system. * */ -#include +#ifndef _KERN_EXTMOD_STATISTICS_H_ +#define _KERN_EXTMODE_STATISTICS_H_ + +#include +#include + +extern void extmod_statistics_incr_task_for_pid(task_t target); +extern void extmod_statistics_incr_thread_set_state(thread_t target); +extern void extmod_statistics_incr_thread_create(task_t target); + +#endif /* _KERN_EXTMOD_STATISTICS_H_ */ diff --git a/osfmk/kern/hibernate.c b/osfmk/kern/hibernate.c index b56317bf9..018ea6b78 100644 --- a/osfmk/kern/hibernate.c +++ b/osfmk/kern/hibernate.c @@ -51,14 +51,17 @@ hibernate_setup(IOHibernateImageHeader * header, boolean_t vmflush, hibernate_page_list_t ** page_list_ret, hibernate_page_list_t ** page_list_wired_ret, + hibernate_page_list_t ** page_list_pal_ret, boolean_t * encryptedswap) { hibernate_page_list_t * page_list = NULL; hibernate_page_list_t * page_list_wired = NULL; + hibernate_page_list_t * page_list_pal = NULL; uint32_t gobble_count; *page_list_ret = NULL; *page_list_wired_ret = NULL; + *page_list_pal_ret = NULL; if (vmflush) hibernate_flush_memory(); @@ -72,6 +75,13 @@ hibernate_setup(IOHibernateImageHeader * header, 
kfree(page_list, page_list->list_size); return (KERN_RESOURCE_SHORTAGE); } + page_list_pal = hibernate_page_list_allocate(); + if (!page_list_pal) + { + kfree(page_list, page_list->list_size); + kfree(page_list_wired, page_list_wired->list_size); + return (KERN_RESOURCE_SHORTAGE); + } *encryptedswap = dp_encryption; @@ -90,6 +100,7 @@ hibernate_setup(IOHibernateImageHeader * header, *page_list_ret = page_list; *page_list_wired_ret = page_list_wired; + *page_list_pal_ret = page_list_pal; return (KERN_SUCCESS); } diff --git a/osfmk/kern/host.c b/osfmk/kern/host.c index f77cccb71..15b742050 100644 --- a/osfmk/kern/host.c +++ b/osfmk/kern/host.c @@ -90,6 +90,8 @@ host_data_t realhost; +vm_extmod_statistics_data_t host_extmod_statistics; + kern_return_t host_processors( host_priv_t host_priv, @@ -189,6 +191,8 @@ host_info( case HOST_SCHED_INFO: { register host_sched_info_t sched_info; + uint32_t quantum_time; + uint64_t quantum_ns; /* * Return scheduler information. @@ -198,8 +202,11 @@ host_info( sched_info = (host_sched_info_t) info; + quantum_time = SCHED(initial_quantum_size)(THREAD_NULL); + absolutetime_to_nanoseconds(quantum_time, &quantum_ns); + sched_info->min_timeout = - sched_info->min_quantum = std_quantum_us / 1000; + sched_info->min_quantum = (uint32_t)(quantum_ns / 1000 / 1000); *count = HOST_SCHED_INFO_COUNT; @@ -397,23 +404,29 @@ MACRO_END cpu_load_info->cpu_ticks[CPU_STATE_IDLE] = 0; cpu_load_info->cpu_ticks[CPU_STATE_NICE] = 0; - processor = processor_list; - GET_TICKS_VALUE(processor, CPU_STATE_USER, user_state); - GET_TICKS_VALUE(processor, CPU_STATE_SYSTEM, system_state); - GET_TICKS_VALUE(processor, CPU_STATE_IDLE, idle_state); + simple_lock(&processor_list_lock); - if (processor_count > 1) { - simple_lock(&processor_list_lock); + for (processor = processor_list; processor != NULL; processor = processor->processor_list) { + timer_data_t idle_temp; + timer_t idle_state; - while ((processor = processor->processor_list) != NULL) { - 
GET_TICKS_VALUE(processor, CPU_STATE_USER, user_state); - GET_TICKS_VALUE(processor, CPU_STATE_SYSTEM, system_state); + GET_TICKS_VALUE(processor, CPU_STATE_USER, user_state); + GET_TICKS_VALUE(processor, CPU_STATE_SYSTEM, system_state); + + idle_state = &PROCESSOR_DATA(processor, idle_state); + idle_temp = *idle_state; + + if (PROCESSOR_DATA(processor, current_state) != idle_state || + timer_grab(&idle_temp) != timer_grab(idle_state)) GET_TICKS_VALUE(processor, CPU_STATE_IDLE, idle_state); - } + else { + timer_advance(&idle_temp, mach_absolute_time() - idle_temp.tstamp); - simple_unlock(&processor_list_lock); + cpu_load_info->cpu_ticks[CPU_STATE_IDLE] += + (uint32_t)(timer_grab(&idle_temp) / hz_tick_interval); + } } - + simple_unlock(&processor_list_lock); *count = HOST_CPU_LOAD_INFO_COUNT; return (KERN_SUCCESS); @@ -512,6 +525,21 @@ host_statistics64( return(KERN_SUCCESS); } + case HOST_EXTMOD_INFO64: /* We were asked to get vm_statistics64 */ + { + vm_extmod_statistics_t out_extmod_statistics; + + if (*count < HOST_EXTMOD_INFO64_COUNT) + return (KERN_FAILURE); + + out_extmod_statistics = (vm_extmod_statistics_t) info; + *out_extmod_statistics = host_extmod_statistics; + + *count = HOST_EXTMOD_INFO64_COUNT; + + return(KERN_SUCCESS); + } + default: /* If we didn't recognize the flavor, send to host_statistics */ return(host_statistics(host, flavor, (host_info_t) info, count)); } @@ -532,6 +560,73 @@ host_priv_statistics( return(host_statistics((host_t)host_priv, flavor, info, count)); } +kern_return_t +set_sched_stats_active( + boolean_t active) +{ + sched_stats_active = active; + return KERN_SUCCESS; +} + + +kern_return_t +get_sched_statistics( + struct _processor_statistics_np *out, + uint32_t *count) +{ + processor_t processor; + + if (!sched_stats_active) { + return KERN_FAILURE; + } + + simple_lock(&processor_list_lock); + + if (*count < (processor_count + 2) * sizeof(struct _processor_statistics_np)) { /* One for RT, one for FS */ + 
simple_unlock(&processor_list_lock); + return KERN_FAILURE; + } + + processor = processor_list; + while (processor) { + struct processor_sched_statistics *stats = &processor->processor_data.sched_stats; + + out->ps_cpuid = processor->cpu_id; + out->ps_csw_count = stats->csw_count; + out->ps_preempt_count = stats->preempt_count; + out->ps_preempted_rt_count = stats->preempted_rt_count; + out->ps_preempted_by_rt_count = stats->preempted_by_rt_count; + out->ps_rt_sched_count = stats->rt_sched_count; + out->ps_interrupt_count = stats->interrupt_count; + out->ps_ipi_count = stats->ipi_count; + out->ps_timer_pop_count = stats->timer_pop_count; + out->ps_runq_count_sum = SCHED(processor_runq_stats_count_sum)(processor); + out->ps_idle_transitions = stats->idle_transitions; + + out++; + processor = processor->processor_list; + } + + *count = (uint32_t) (processor_count * sizeof(struct _processor_statistics_np)); + + simple_unlock(&processor_list_lock); + + /* And include RT Queue information */ + bzero(out, sizeof(*out)); + out->ps_cpuid = (-1); + out->ps_runq_count_sum = rt_runq.runq_stats.count_sum; + out++; + *count += (uint32_t)sizeof(struct _processor_statistics_np); + + /* And include Fair Share Queue information at the end */ + bzero(out, sizeof(*out)); + out->ps_cpuid = (-2); + out->ps_runq_count_sum = SCHED(fairshare_runq_stats_count_sum)(); + *count += (uint32_t)sizeof(struct _processor_statistics_np); + + return KERN_SUCCESS; +} + kern_return_t host_page_size( host_t host, diff --git a/osfmk/kern/host.h b/osfmk/kern/host.h index 3c64c3b08..24b052648 100644 --- a/osfmk/kern/host.h +++ b/osfmk/kern/host.h @@ -74,7 +74,7 @@ #include #include #include - +#include struct host { decl_lck_mtx_data(,lock) /* lock to protect exceptions */ @@ -89,6 +89,8 @@ extern host_data_t realhost; #define host_lock(host) lck_mtx_lock(&(host)->lock) #define host_unlock(host) lck_mtx_unlock(&(host)->lock) +extern vm_extmod_statistics_data_t host_extmod_statistics; + #endif /* 
MACH_KERNEL_PRIVATE */ /* diff --git a/osfmk/kern/host_notify.c b/osfmk/kern/host_notify.c index 769d1cc2e..1ca87dcaf 100644 --- a/osfmk/kern/host_notify.c +++ b/osfmk/kern/host_notify.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2003-2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,14 +25,6 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. - * - * HISTORY - * - * 16 January 2003 (debo) - * Created. - */ #include #include @@ -143,7 +135,7 @@ host_notify_port_destroy( ip_unlock(port); assert(entry->port == port); - remqueue(NULL, (queue_entry_t)entry); + remqueue((queue_entry_t)entry); lck_mtx_unlock(&host_notify_lock); zfree(host_notify_zone, entry); diff --git a/osfmk/kern/host_statistics.h b/osfmk/kern/host_statistics.h index a1471c6f4..efe59d74e 100644 --- a/osfmk/kern/host_statistics.h +++ b/osfmk/kern/host_statistics.h @@ -43,16 +43,9 @@ #include -#if defined(__ppc__) /* On ppc, vm statistics are still 32-bit */ -#define VM_STAT_INCR(event) \ -MACRO_BEGIN \ - OSAddAtomic(1, (SInt32 *) (&(PROCESSOR_DATA(current_processor(), vm_stat).event))); \ -MACRO_END -#else /* !(defined(__ppc__)) */ #define VM_STAT_INCR(event) \ MACRO_BEGIN \ OSAddAtomic64(1, (SInt64 *) (&(PROCESSOR_DATA(current_processor(), vm_stat).event))); \ MACRO_END -#endif /* !(defined(__ppc__)) */ #endif /* _KERN_HOST_STATISTICS_H_ */ diff --git a/osfmk/kern/ipc_kobject.c b/osfmk/kern/ipc_kobject.c index a0cc915a8..8963abea6 100644 --- a/osfmk/kern/ipc_kobject.c +++ b/osfmk/kern/ipc_kobject.c @@ -560,12 +560,10 @@ ipc_kobject_notify( return TRUE; } #endif -#if CONFIG_EMBEDDED if (ip_kotype(port) == IKOT_FILEPORT) { fileport_notify(request_header); return TRUE; } -#endif break; diff --git a/osfmk/kern/ipc_mig.c b/osfmk/kern/ipc_mig.c index b437edd9b..1b2a9163d 100644 --- a/osfmk/kern/ipc_mig.c +++ b/osfmk/kern/ipc_mig.c @@ -79,6 
+79,7 @@ #include #include #include +#include #include #include @@ -115,14 +116,15 @@ mach_msg_send_from_kernel( ipc_kmsg_t kmsg; mach_msg_return_t mr; - if (!MACH_PORT_VALID(CAST_MACH_PORT_TO_NAME(msg->msgh_remote_port))) - return MACH_SEND_INVALID_DEST; - mr = ipc_kmsg_get_from_kernel(msg, send_size, &kmsg); if (mr != MACH_MSG_SUCCESS) return mr; - ipc_kmsg_copyin_from_kernel_legacy(kmsg); + mr = ipc_kmsg_copyin_from_kernel_legacy(kmsg); + if (mr != MACH_MSG_SUCCESS) { + ipc_kmsg_free(kmsg); + return mr; + } mr = ipc_kmsg_send_always(kmsg); if (mr != MACH_MSG_SUCCESS) { @@ -142,14 +144,15 @@ mach_msg_send_from_kernel_proper( ipc_kmsg_t kmsg; mach_msg_return_t mr; - if (!MACH_PORT_VALID(CAST_MACH_PORT_TO_NAME(msg->msgh_remote_port))) - return MACH_SEND_INVALID_DEST; - mr = ipc_kmsg_get_from_kernel(msg, send_size, &kmsg); if (mr != MACH_MSG_SUCCESS) return mr; - ipc_kmsg_copyin_from_kernel(kmsg); + mr = ipc_kmsg_copyin_from_kernel(kmsg); + if (mr != MACH_MSG_SUCCESS) { + ipc_kmsg_free(kmsg); + return mr; + } mr = ipc_kmsg_send_always(kmsg); if (mr != MACH_MSG_SUCCESS) { @@ -171,14 +174,16 @@ mach_msg_send_from_kernel_with_options( ipc_kmsg_t kmsg; mach_msg_return_t mr; - if (!MACH_PORT_VALID(CAST_MACH_PORT_TO_NAME(msg->msgh_remote_port))) - return MACH_SEND_INVALID_DEST; - mr = ipc_kmsg_get_from_kernel(msg, send_size, &kmsg); if (mr != MACH_MSG_SUCCESS) return mr; - ipc_kmsg_copyin_from_kernel_legacy(kmsg); + mr = ipc_kmsg_copyin_from_kernel_legacy(kmsg); + if (mr != MACH_MSG_SUCCESS) { + ipc_kmsg_free(kmsg); + return mr; + } + mr = ipc_kmsg_send(kmsg, option, timeout_val); if (mr != MACH_MSG_SUCCESS) { ipc_kmsg_destroy(kmsg); @@ -252,7 +257,6 @@ mach_msg_rpc_from_kernel_body( mach_port_seqno_t seqno; mach_msg_return_t mr; - assert(MACH_PORT_VALID(CAST_MACH_PORT_TO_NAME(msg->msgh_remote_port))); assert(msg->msgh_local_port == MACH_PORT_NULL); mr = ipc_kmsg_get_from_kernel(msg, send_size, &kmsg); @@ -277,13 +281,16 @@ mach_msg_rpc_from_kernel_body( #if 
IKM_SUPPORT_LEGACY if(legacy) - ipc_kmsg_copyin_from_kernel_legacy(kmsg); + mr = ipc_kmsg_copyin_from_kernel_legacy(kmsg); else - ipc_kmsg_copyin_from_kernel(kmsg); + mr = ipc_kmsg_copyin_from_kernel(kmsg); #else - ipc_kmsg_copyin_from_kernel(kmsg); + mr = ipc_kmsg_copyin_from_kernel(kmsg); #endif - + if (mr != MACH_MSG_SUCCESS) { + ipc_kmsg_free(kmsg); + return mr; + } mr = ipc_kmsg_send_always(kmsg); if (mr != MACH_MSG_SUCCESS) { ipc_kmsg_destroy(kmsg); @@ -446,7 +453,7 @@ mach_msg_overwrite( max_trailer->msgh_trailer_type = MACH_MSG_TRAILER_FORMAT_0; max_trailer->msgh_trailer_size = MACH_MSG_TRAILER_MINIMUM_SIZE; - mr = ipc_kmsg_copyin(kmsg, space, map, MACH_PORT_NULL); + mr = ipc_kmsg_copyin(kmsg, space, map, FALSE); if (mr != MACH_MSG_SUCCESS) { ipc_kmsg_free(kmsg); return mr; @@ -504,8 +511,7 @@ mach_msg_overwrite( return MACH_RCV_TOO_LARGE; } - mr = ipc_kmsg_copyout(kmsg, space, map, MACH_PORT_NULL, - MACH_MSG_BODY_NULL); + mr = ipc_kmsg_copyout(kmsg, space, map, MACH_MSG_BODY_NULL); if (mr != MACH_MSG_SUCCESS) { if ((mr &~ MACH_MSG_MASK) == MACH_RCV_BODY_ERROR) { ipc_kmsg_put_to_kernel(msg, kmsg, diff --git a/osfmk/kern/ipc_misc.c b/osfmk/kern/ipc_misc.c index 9eeefc347..547abeaad 100644 --- a/osfmk/kern/ipc_misc.c +++ b/osfmk/kern/ipc_misc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008, 2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -31,6 +31,11 @@ #include #include +#include +#include +#include +#include + extern void fileport_releasefg(struct fileglob *); /* @@ -140,6 +145,96 @@ fileport_notify(mach_msg_header_t *msg) } else { ip_unlock(port); } +} + +/* + * fileport_invoke + * + * Description: Invoke a function with the fileglob underlying the fileport. + * Returns the error code related to the fileglob lookup. 
+ * + * Parameters: task The target task + * action The function to invoke with the fileglob + * arg Anonymous pointer to caller state + * rval The value returned from calling 'action' + */ +kern_return_t +fileport_invoke(task_t task, mach_port_name_t name, + int (*action)(mach_port_name_t, struct fileglob *, void *), + void *arg, int *rval) +{ + kern_return_t kr; + ipc_port_t fileport; + struct fileglob *fg; + + kr = ipc_object_copyin(task->itk_space, name, + MACH_MSG_TYPE_COPY_SEND, (ipc_object_t *)&fileport); + if (kr != KERN_SUCCESS) + return (kr); + + if ((fg = fileport_port_to_fileglob(fileport)) != NULL) + *rval = (*action)(name, fg, arg); + else + kr = KERN_FAILURE; + ipc_port_release_send(fileport); + return (kr); +} + +/* + * fileport_walk + * + * Description: Invoke the action function on every fileport in the task. + * + * This could be more efficient if we refactored mach_port_names() + * so that (a) it didn't compute the type information unless asked + * and (b) it could be asked to -not- unwire/copyout the memory + * and (c) if we could ask for port names by kobject type. Not + * clear that it's worth all that complexity, though. + * + * Parameters: task The target task + * action The function to invoke on each fileport + * arg Anonymous pointer to caller state. + */ +kern_return_t +fileport_walk(task_t task, + int (*action)(mach_port_name_t, struct fileglob *, void *arg), + void *arg) +{ + mach_port_name_t *names; + mach_msg_type_number_t ncnt, tcnt; + vm_map_copy_t map_copy_names, map_copy_types; + vm_map_address_t map_names; + kern_return_t kr; + uint_t i; + int rval; + + /* + * mach_port_names returns the 'name' and 'types' in copied-in + * form. Discard 'types' immediately, then copyout 'names' + * back into the kernel before walking the array. 
+ */ + + kr = mach_port_names(task->itk_space, + (mach_port_name_t **)&map_copy_names, &ncnt, + (mach_port_type_t **)&map_copy_types, &tcnt); + if (kr != KERN_SUCCESS) + return (kr); + + vm_map_copy_discard(map_copy_types); + + kr = vm_map_copyout(ipc_kernel_map, &map_names, map_copy_names); + if (kr != KERN_SUCCESS) { + vm_map_copy_discard(map_copy_names); + return (kr); + } + names = (mach_port_name_t *)(uintptr_t)map_names; + + for (rval = 0, i = 0; i < ncnt; i++) + if (fileport_invoke(task, names[i], action, arg, + &rval) == KERN_SUCCESS && -1 == rval) + break; /* early termination clause */ - return; + vm_deallocate(ipc_kernel_map, + (vm_address_t)names, ncnt * sizeof (*names)); + return (KERN_SUCCESS); } diff --git a/osfmk/kern/ipc_misc.h b/osfmk/kern/ipc_misc.h index 0e572e030..04fbd6505 100644 --- a/osfmk/kern/ipc_misc.h +++ b/osfmk/kern/ipc_misc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009 Apple Inc. All rights reserved. + * Copyright (c) 2009, 2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -33,6 +33,10 @@ struct fileglob; ipc_port_t fileport_alloc(struct fileglob *); struct fileglob *fileport_port_to_fileglob(ipc_port_t); void fileport_notify(mach_msg_header_t *); +kern_return_t fileport_invoke(task_t, mach_port_name_t, + int (*)(mach_port_name_t, struct fileglob *, void *), void *, int *); +kern_return_t fileport_walk(task_t, + int (*)(mach_port_name_t, struct fileglob *, void *), void *); #endif /* _KERN_IPC_MISC_H_ */ #endif /* KERNEL_PRIVATE */ diff --git a/osfmk/kern/kalloc.c b/osfmk/kern/kalloc.c index 03c55052d..f84a19956 100644 --- a/osfmk/kern/kalloc.c +++ b/osfmk/kern/kalloc.c @@ -92,11 +92,47 @@ vm_size_t kalloc_kernmap_size; /* size of kallocs that can come from kernel map unsigned int kalloc_large_inuse; vm_size_t kalloc_large_total; vm_size_t kalloc_large_max; -volatile vm_size_t kalloc_largest_allocated = 0; +vm_size_t kalloc_largest_allocated = 0; +uint64_t kalloc_large_sum; + +int kalloc_fake_zone_index = -1; /* index of our fake zone in statistics arrays */ vm_offset_t kalloc_map_min; vm_offset_t kalloc_map_max; +#ifdef MUTEX_ZONE +/* + * Diagnostic code to track mutexes separately rather than via the 2^ zones + */ + zone_t lck_mtx_zone; +#endif + +static void +KALLOC_ZINFO_SALLOC(vm_size_t bytes) +{ + thread_t thr = current_thread(); + task_t task; + zinfo_usage_t zinfo; + + thr->tkm_shared.alloc += bytes; + if (kalloc_fake_zone_index != -1 && + (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) + zinfo[kalloc_fake_zone_index].alloc += bytes; +} + +static void +KALLOC_ZINFO_SFREE(vm_size_t bytes) +{ + thread_t thr = current_thread(); + task_t task; + zinfo_usage_t zinfo; + + thr->tkm_shared.free += bytes; + if (kalloc_fake_zone_index != -1 && + (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) + zinfo[kalloc_fake_zone_index].free += bytes; +} + /* * All allocations of size less than kalloc_max are rounded to the * next highest power of 2. 
This allocator is built on top of @@ -158,11 +194,23 @@ void * kalloc_canblock( boolean_t canblock); +lck_grp_t *kalloc_lck_grp; +lck_mtx_t kalloc_lock; + +#define kalloc_spin_lock() lck_mtx_lock_spin(&kalloc_lock) +#define kalloc_unlock() lck_mtx_unlock(&kalloc_lock) + + /* OSMalloc local data declarations */ static queue_head_t OSMalloc_tag_list; -decl_simple_lock_data(static,OSMalloc_tag_lock) +lck_grp_t *OSMalloc_tag_lck_grp; +lck_mtx_t OSMalloc_tag_lock; + +#define OSMalloc_tag_spin_lock() lck_mtx_lock_spin(&OSMalloc_tag_lock) +#define OSMalloc_tag_unlock() lck_mtx_unlock(&OSMalloc_tag_lock) + /* OSMalloc forward declarations */ void OSMalloc_init(void); @@ -225,7 +273,9 @@ kalloc_init( /* * Allocate a zone for each size we are going to handle. - * We specify non-paged memory. + * We specify non-paged memory. Don't charge the caller + * for the allocation, as we aren't sure how the memory + * will be handled. */ for (i = 0, size = 1; size < kalloc_max; i++, size <<= 1) { if (size < KALLOC_MINSIZE) { @@ -237,8 +287,15 @@ kalloc_init( } k_zone[i] = zinit(size, k_zone_max[i] * size, size, k_zone_name[i]); + zone_change(k_zone[i], Z_CALLERACCT, FALSE); } + kalloc_lck_grp = lck_grp_alloc_init("kalloc.large", LCK_GRP_ATTR_NULL); + lck_mtx_init(&kalloc_lock, kalloc_lck_grp, LCK_ATTR_NULL); OSMalloc_init(); +#ifdef MUTEX_ZONE + lck_mtx_zone = zinit(sizeof(struct _lck_mtx_), 1024*256, 4096, "lck_mtx"); +#endif + } void * @@ -261,36 +318,42 @@ kalloc_canblock( /* kmem_alloc could block so we return if noblock */ if (!canblock) { - return(NULL); + return(NULL); } - if (size >= kalloc_kernmap_size) { - volatile vm_offset_t prev_largest; + if (size >= kalloc_kernmap_size) alloc_map = kernel_map; - /* Thread-safe version of the workaround for 4740071 - * (a double FREE()) - */ - do { - prev_largest = kalloc_largest_allocated; - } while ((size > prev_largest) && !OSCompareAndSwap((UInt32)prev_largest, (UInt32)size, (volatile UInt32 *) &kalloc_largest_allocated)); - } else + 
else alloc_map = kalloc_map; if (kmem_alloc(alloc_map, (vm_offset_t *)&addr, size) != KERN_SUCCESS) { if (alloc_map != kernel_map) { if (kmem_alloc(kernel_map, (vm_offset_t *)&addr, size) != KERN_SUCCESS) addr = NULL; - } + } else addr = NULL; } if (addr != NULL) { + kalloc_spin_lock(); + /* + * Thread-safe version of the workaround for 4740071 + * (a double FREE()) + */ + if (size > kalloc_largest_allocated) + kalloc_largest_allocated = size; + kalloc_large_inuse++; kalloc_large_total += size; + kalloc_large_sum += size; if (kalloc_large_total > kalloc_large_max) kalloc_large_max = kalloc_large_total; + + kalloc_unlock(); + + KALLOC_ZINFO_SALLOC(size); } return(addr); } @@ -374,6 +437,7 @@ krealloc( kmem_free(alloc_map, (vm_offset_t)*addrp, old_size); kalloc_large_total += (new_size - old_size); + kalloc_large_sum += (new_size - old_size); if (kalloc_large_total > kalloc_large_max) kalloc_large_max = kalloc_large_total; @@ -412,11 +476,18 @@ krealloc( *addrp = NULL; return; } + kalloc_spin_lock(); + kalloc_large_inuse++; + kalloc_large_sum += new_size; kalloc_large_total += new_size; if (kalloc_large_total > kalloc_large_max) kalloc_large_max = kalloc_large_total; + + kalloc_unlock(); + + KALLOC_ZINFO_SALLOC(new_size); } else { register int new_zindex; @@ -515,9 +586,14 @@ kfree( } kmem_free(alloc_map, (vm_offset_t)data, size); + kalloc_spin_lock(); + kalloc_large_total -= size; kalloc_large_inuse--; + kalloc_unlock(); + + KALLOC_ZINFO_SFREE(size); return; } @@ -560,18 +636,32 @@ kalloc_zone( } #endif +void +kalloc_fake_zone_init(int zone_index) +{ + kalloc_fake_zone_index = zone_index; +} void -kalloc_fake_zone_info(int *count, vm_size_t *cur_size, vm_size_t *max_size, vm_size_t *elem_size, - vm_size_t *alloc_size, int *collectable, int *exhaustable) +kalloc_fake_zone_info(int *count, + vm_size_t *cur_size, vm_size_t *max_size, vm_size_t *elem_size, vm_size_t *alloc_size, + uint64_t *sum_size, int *collectable, int *exhaustable, int *caller_acct) { *count = 
kalloc_large_inuse; *cur_size = kalloc_large_total; *max_size = kalloc_large_max; - *elem_size = kalloc_large_total / kalloc_large_inuse; - *alloc_size = kalloc_large_total / kalloc_large_inuse; + + if (kalloc_large_inuse) { + *elem_size = kalloc_large_total / kalloc_large_inuse; + *alloc_size = kalloc_large_total / kalloc_large_inuse; + } else { + *elem_size = 0; + *alloc_size = 0; + } + *sum_size = kalloc_large_sum; *collectable = 0; *exhaustable = 0; + *caller_acct = 0; } @@ -580,7 +670,9 @@ OSMalloc_init( void) { queue_init(&OSMalloc_tag_list); - simple_lock_init(&OSMalloc_tag_lock, 0); + + OSMalloc_tag_lck_grp = lck_grp_alloc_init("OSMalloc_tag", LCK_GRP_ATTR_NULL); + lck_mtx_init(&OSMalloc_tag_lock, OSMalloc_tag_lck_grp, LCK_ATTR_NULL); } OSMallocTag @@ -601,9 +693,9 @@ OSMalloc_Tagalloc( strncpy(OSMTag->OSMT_name, str, OSMT_MAX_NAME); - simple_lock(&OSMalloc_tag_lock); + OSMalloc_tag_spin_lock(); enqueue_tail(&OSMalloc_tag_list, (queue_entry_t)OSMTag); - simple_unlock(&OSMalloc_tag_lock); + OSMalloc_tag_unlock(); OSMTag->OSMT_state = OSMT_VALID; return(OSMTag); } @@ -627,9 +719,9 @@ OSMalloc_Tagrele( if (hw_atomic_sub(&tag->OSMT_refcnt, 1) == 0) { if (hw_compare_and_store(OSMT_VALID|OSMT_RELEASED, OSMT_VALID|OSMT_RELEASED, &tag->OSMT_state)) { - simple_lock(&OSMalloc_tag_lock); + OSMalloc_tag_spin_lock(); (void)remque((queue_entry_t)tag); - simple_unlock(&OSMalloc_tag_lock); + OSMalloc_tag_unlock(); kfree((void*)tag, sizeof(*tag)); } else panic("OSMalloc_Tagrele(): refcnt 0\n"); @@ -644,9 +736,9 @@ OSMalloc_Tagfree( panic("OSMalloc_Tagfree(): bad state 0x%08X\n", tag->OSMT_state); if (hw_atomic_sub(&tag->OSMT_refcnt, 1) == 0) { - simple_lock(&OSMalloc_tag_lock); + OSMalloc_tag_spin_lock(); (void)remque((queue_entry_t)tag); - simple_unlock(&OSMalloc_tag_lock); + OSMalloc_tag_unlock(); kfree((void*)tag, sizeof(*tag)); } } diff --git a/osfmk/kern/kalloc.h b/osfmk/kern/kalloc.h index 7966959f2..9fcb07edc 100644 --- a/osfmk/kern/kalloc.h +++ b/osfmk/kern/kalloc.h 
@@ -88,14 +88,18 @@ extern void krealloc(void **addrp, vm_size_t new_size, simple_lock_t lock); +extern void kalloc_fake_zone_init( int ); + extern void kalloc_fake_zone_info( int *count, vm_size_t *cur_size, vm_size_t *max_size, vm_size_t *elem_size, vm_size_t *alloc_size, + uint64_t *sum_size, int *collectable, - int *exhaustable); + int *exhaustable, + int *caller_acct); extern vm_size_t kalloc_max_prerounded; extern vm_size_t kalloc_large_total; diff --git a/osfmk/kern/kern_types.h b/osfmk/kern/kern_types.h index 7c3e93616..d8e98aa6f 100644 --- a/osfmk/kern/kern_types.h +++ b/osfmk/kern/kern_types.h @@ -110,6 +110,15 @@ typedef struct pset_node *pset_node_t; typedef struct affinity_set *affinity_set_t; #define AFFINITY_SET_NULL ((affinity_set_t) 0) +typedef struct run_queue *run_queue_t; +#define RUN_QUEUE_NULL ((run_queue_t) 0) + +typedef struct grrr_run_queue *grrr_run_queue_t; +#define GRRR_RUN_QUEUE_NULL ((grrr_run_queue_t) 0) + +typedef struct grrr_group *grrr_group_t; +#define GRRR_GROUP_NULL ((grrr_group_t) 0) + #else /* MACH_KERNEL_PRIVATE */ struct wait_queue_set ; diff --git a/osfmk/kern/kext_alloc.c b/osfmk/kern/kext_alloc.c index 407efcf16..1d3aea127 100644 --- a/osfmk/kern/kext_alloc.c +++ b/osfmk/kern/kext_alloc.c @@ -39,7 +39,6 @@ #include #include -#define KEXT_ALLOC_MAX_OFFSET (2 * 1024 * 1024 * 1024UL) vm_map_t g_kext_map = 0; static mach_vm_offset_t kext_alloc_base = 0; @@ -70,15 +69,15 @@ kext_alloc_init(void) text_end = vm_map_round_page(text->vmaddr + text->vmsize); text_size = text_end - text_start; - kext_alloc_base = text_end - KEXT_ALLOC_MAX_OFFSET; - kext_alloc_size = KEXT_ALLOC_MAX_OFFSET - text_size; + kext_alloc_base = KEXT_ALLOC_BASE(text_end); + kext_alloc_size = KEXT_ALLOC_SIZE(text_size); kext_alloc_max = kext_alloc_base + kext_alloc_size; /* Allocate the subblock of the kernel map */ rval = kmem_suballoc(kernel_map, (vm_offset_t *) &kext_alloc_base, kext_alloc_size, /* pageable */ TRUE, - 
VM_FLAGS_FIXED|VM_FLAGS_OVERWRITE|VM_FLAGS_BELOW_MIN, + VM_FLAGS_FIXED|VM_FLAGS_OVERWRITE, &g_kext_map); if (rval != KERN_SUCCESS) { panic("kext_alloc_init: kmem_suballoc failed 0x%x\n", rval); diff --git a/osfmk/kern/kmod.c b/osfmk/kern/kmod.c index 121967342..d0563ce09 100644 --- a/osfmk/kern/kmod.c +++ b/osfmk/kern/kmod.c @@ -85,7 +85,7 @@ extern void proc_selfname(char * buf, int size); __FUNCTION__, procname); \ } while (0) -#if __ppc__ || __i386__ +#if __i386__ // in libkern/OSKextLib.cpp extern kern_return_t kext_get_kmod_info( kmod_info_array_t * kmod_list, @@ -93,7 +93,7 @@ extern kern_return_t kext_get_kmod_info( #define KMOD_MIG_UNUSED #else #define KMOD_MIG_UNUSED __unused -#endif /* __ppc__ || __i386__ */ +#endif /* __i386__ */ /********************************************************************* @@ -148,7 +148,7 @@ kmod_get_info( kmod_info_array_t * kmod_list KMOD_MIG_UNUSED, mach_msg_type_number_t * kmodCount KMOD_MIG_UNUSED) { -#if __ppc__ || __i386__ +#if __i386__ if (current_task() != kernel_task && task_has_64BitAddr(current_task())) { NOT_SUPPORTED_USER64(); return KERN_NOT_SUPPORTED; @@ -157,5 +157,5 @@ kmod_get_info( #else NOT_SUPPORTED_KERNEL(); return KERN_NOT_SUPPORTED; -#endif /* __ppc__ || __i386__ */ +#endif /* __i386__ */ } diff --git a/osfmk/kern/locks.c b/osfmk/kern/locks.c index e31e970c6..07b9924a1 100644 --- a/osfmk/kern/locks.c +++ b/osfmk/kern/locks.c @@ -107,6 +107,12 @@ void lck_mod_init( void) { + /* + * Obtain "lcks" options:this currently controls lock statistics + */ + if (!PE_parse_boot_argn("lcks", &LcksOpts, sizeof (LcksOpts))) + LcksOpts = 0; + queue_init(&lck_grp_queue); /* @@ -537,8 +543,12 @@ lck_mtx_sleep_deadline( if (res == THREAD_WAITING) { lck_mtx_unlock(lck); res = thread_block(THREAD_CONTINUE_NULL); - if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) - lck_mtx_lock(lck); + if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) { + if ((lck_sleep_action & LCK_SLEEP_SPIN)) + lck_mtx_lock_spin(lck); + else + lck_mtx_lock(lck); + 
} } else if (lck_sleep_action & LCK_SLEEP_UNLOCK) @@ -590,7 +600,7 @@ lck_mtx_lock_wait ( thread_lock(holder); if (mutex->lck_mtx_pri == 0) holder->promotions++; - holder->sched_mode |= TH_MODE_PROMOTED; + holder->sched_flags |= TH_SFLAG_PROMOTED; if ( mutex->lck_mtx_pri < priority && holder->sched_pri < priority ) { KERNEL_DEBUG_CONSTANT( @@ -672,7 +682,7 @@ lck_mtx_lock_acquire( thread_lock(thread); thread->promotions++; - thread->sched_mode |= TH_MODE_PROMOTED; + thread->sched_flags |= TH_SFLAG_PROMOTED; if (thread->sched_pri < priority) { KERNEL_DEBUG_CONSTANT( MACHDBG_CODE(DBG_MACH_SCHED,MACH_PROMOTE) | DBG_FUNC_NONE, @@ -709,20 +719,22 @@ lck_mtx_unlock_wakeup ( else mutex = &lck->lck_mtx_ptr->lck_mtx; + if (thread != holder) + panic("lck_mtx_unlock_wakeup: mutex %p holder %p\n", mutex, holder); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_START, (int)lck, (int)holder, 0, 0, 0); - if (thread != holder) - panic("lck_mtx_unlock_wakeup: mutex %p holder %p\n", mutex, holder); + assert(mutex->lck_mtx_waiters > 0); + thread_wakeup_one((event_t)(((unsigned int*)lck)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int))); if (thread->promotions > 0) { spl_t s = splsched(); thread_lock(thread); if ( --thread->promotions == 0 && - (thread->sched_mode & TH_MODE_PROMOTED) ) { - thread->sched_mode &= ~TH_MODE_PROMOTED; - if (thread->sched_mode & TH_MODE_ISDEPRESSED) { + (thread->sched_flags & TH_SFLAG_PROMOTED) ) { + thread->sched_flags &= ~TH_SFLAG_PROMOTED; + if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) { KERNEL_DEBUG_CONSTANT( MACHDBG_CODE(DBG_MACH_SCHED,MACH_DEMOTE) | DBG_FUNC_NONE, thread->sched_pri, DEPRESSPRI, 0, lck, 0); @@ -738,14 +750,12 @@ lck_mtx_unlock_wakeup ( 0, lck, 0); } - compute_priority(thread, FALSE); + SCHED(compute_priority)(thread, FALSE); } } thread_unlock(thread); splx(s); } - assert(mutex->lck_mtx_waiters > 0); - thread_wakeup_one((event_t)(((unsigned int*)lck)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int))); 
KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0); } diff --git a/osfmk/kern/locks.h b/osfmk/kern/locks.h index d23fbc36e..659336194 100644 --- a/osfmk/kern/locks.h +++ b/osfmk/kern/locks.h @@ -292,6 +292,7 @@ extern void lck_mtx_unlock(lck_mtx_t *lck) __DARWIN10_ALIAS(lck_mtx_unlock); extern void lck_mtx_unlock( lck_mtx_t *lck); #endif /* __i386__ */ + extern void lck_mtx_destroy( lck_mtx_t *lck, lck_grp_t *grp); @@ -323,18 +324,26 @@ extern void mutex_pause(uint32_t); extern void lck_mtx_yield ( lck_mtx_t *lck); -#if defined(i386) || defined(x86_64) +#if defined(__i386__) || defined(__x86_64__) extern boolean_t lck_mtx_try_lock_spin( lck_mtx_t *lck); +extern void lck_mtx_lock_spin_always( + lck_mtx_t *lck); + extern void lck_mtx_lock_spin( lck_mtx_t *lck); extern void lck_mtx_convert_spin( lck_mtx_t *lck); + +#define lck_mtx_unlock_always(l) lck_mtx_unlock(l) + #else #define lck_mtx_try_lock_spin(l) lck_mtx_try_lock(l) #define lck_mtx_lock_spin(l) lck_mtx_lock(l) +#define lck_mtx_lock_spin_always(l) lck_spin_lock(l) +#define lck_mtx_unlock_always(l) lck_spin_unlock(l) #define lck_mtx_convert_spin(l) do {} while (0) #endif diff --git a/osfmk/kern/mach_param.h b/osfmk/kern/mach_param.h index 1afd09bf5..44b21a9da 100644 --- a/osfmk/kern/mach_param.h +++ b/osfmk/kern/mach_param.h @@ -80,7 +80,7 @@ extern int thread_max, task_threadmax, task_max; + 40000) /* slop for objects */ /* Number of ports, system-wide */ -#define SET_MAX (task_max + thread_max + 200) +#define SET_MAX (task_max + (thread_max * 2) + 200) /* Max number of port sets */ #define ITE_MAX (1 << 16) /* Max number of splay tree entries */ diff --git a/osfmk/kern/machine.c b/osfmk/kern/machine.c index 72d91647e..9b310f031 100644 --- a/osfmk/kern/machine.c +++ b/osfmk/kern/machine.c @@ -88,6 +88,8 @@ #include #include +#include + #if HIBERNATION #include #endif @@ -120,11 +122,14 @@ processor_up( init_ast_check(processor); pset = processor->processor_set; 
pset_lock(pset); - if (++pset->processor_count == 1) - pset->low_pri = pset->low_count = processor; + if (++pset->online_processor_count == 1) { + pset_pri_init_hint(pset, processor); + pset_count_init_hint(pset, processor); + } enqueue_tail(&pset->active_queue, (queue_entry_t)processor); processor->state = PROCESSOR_RUNNING; (void)hw_atomic_add(&processor_avail_count, 1); + commpage_update_active_cpus(); pset_unlock(pset); ml_cpu_up(); splx(s); @@ -214,10 +219,10 @@ processor_shutdown( } if (processor->state == PROCESSOR_IDLE) - remqueue(&pset->idle_queue, (queue_entry_t)processor); + remqueue((queue_entry_t)processor); else if (processor->state == PROCESSOR_RUNNING) - remqueue(&pset->active_queue, (queue_entry_t)processor); + remqueue((queue_entry_t)processor); processor->state = PROCESSOR_SHUTDOWN; @@ -283,6 +288,7 @@ processor_offline( new_thread = processor->idle_thread; processor->active_thread = new_thread; processor->current_pri = IDLEPRI; + processor->current_thmode = TH_MODE_NONE; processor->deadline = UINT64_MAX; new_thread->last_processor = processor; @@ -298,10 +304,13 @@ processor_offline( pset = processor->processor_set; pset_lock(pset); processor->state = PROCESSOR_OFF_LINE; - if (--pset->processor_count == 0) - pset->low_pri = pset->low_count = PROCESSOR_NULL; + if (--pset->online_processor_count == 0) { + pset_pri_init_hint(pset, PROCESSOR_NULL); + pset_count_init_hint(pset, PROCESSOR_NULL); + } (void)hw_atomic_sub(&processor_avail_count, 1); - processor_queue_shutdown(processor); + commpage_update_active_cpus(); + SCHED(processor_queue_shutdown)(processor); /* pset lock dropped */ ml_cpu_down(); diff --git a/osfmk/kern/misc_protos.h b/osfmk/kern/misc_protos.h index 0b7d5a0cc..f7fb46b3c 100644 --- a/osfmk/kern/misc_protos.h +++ b/osfmk/kern/misc_protos.h @@ -109,7 +109,8 @@ extern int copyoutmsg( mach_msg_size_t nbytes); /* Invalidate copy window(s) cache */ -extern void inval_copy_windows(thread_t); +extern void inval_copy_windows(thread_t); 
+extern void copy_window_fault(thread_t, vm_map_t, int); extern int sscanf(const char *input, const char *fmt, ...) __scanflike(2,3); @@ -166,6 +167,8 @@ extern void cnputcusr(char); extern void conslog_putc(char); +extern void cons_putc_locked(char); + extern void consdebug_putc(char); extern void consdebug_log(char); diff --git a/osfmk/kern/mk_sp.c b/osfmk/kern/mk_sp.c index 78a6371fd..d8e86124b 100644 --- a/osfmk/kern/mk_sp.c +++ b/osfmk/kern/mk_sp.c @@ -79,23 +79,27 @@ thread_policy_common( if (thread->static_param) return (KERN_SUCCESS); + if ((policy == POLICY_TIMESHARE) + && !SCHED(supports_timeshare_mode)()) + policy = TH_MODE_FIXED; + s = splsched(); thread_lock(thread); - if ( !(thread->sched_mode & TH_MODE_REALTIME) && - !(thread->safe_mode & TH_MODE_REALTIME) ) { - if (!(thread->sched_mode & TH_MODE_FAILSAFE)) { - integer_t oldmode = (thread->sched_mode & TH_MODE_TIMESHARE); + if ( (thread->sched_mode != TH_MODE_REALTIME) && + (thread->saved_mode != TH_MODE_REALTIME) ) { + if (!(thread->sched_flags & TH_SFLAG_DEMOTED_MASK)) { + boolean_t oldmode = thread->sched_mode == TH_MODE_TIMESHARE; if (policy == POLICY_TIMESHARE && !oldmode) { - thread->sched_mode |= TH_MODE_TIMESHARE; + thread->sched_mode = TH_MODE_TIMESHARE; if ((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN) sched_share_incr(); } else if (policy != POLICY_TIMESHARE && oldmode) { - thread->sched_mode &= ~TH_MODE_TIMESHARE; + thread->sched_mode = TH_MODE_FIXED; if ((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN) sched_share_decr(); @@ -103,9 +107,9 @@ thread_policy_common( } else { if (policy == POLICY_TIMESHARE) - thread->safe_mode |= TH_MODE_TIMESHARE; + thread->saved_mode = TH_MODE_TIMESHARE; else - thread->safe_mode &= ~TH_MODE_TIMESHARE; + thread->saved_mode = TH_MODE_FIXED; } if (priority >= thread->max_priority) @@ -129,6 +133,12 @@ thread_policy_common( thread->importance = priority - thread->task_priority; +#if CONFIG_EMBEDDED + /* No one can have a base priority less than MAXPRI_THROTTLE */ 
+ if (priority < MAXPRI_THROTTLE) + priority = MAXPRI_THROTTLE; +#endif /* CONFIG_EMBEDDED */ + set_priority(thread, priority); } diff --git a/osfmk/kern/pms.h b/osfmk/kern/pms.h index dceb5bbe5..990c71b2a 100644 --- a/osfmk/kern/pms.h +++ b/osfmk/kern/pms.h @@ -145,9 +145,6 @@ typedef struct { } pmsctl_t; extern pmsCtl pmsCtls; /* Power Management Stepper control */ -#ifdef __ppc__ -extern uint32_t pmsCtlp; -#endif extern uint32_t pmsBroadcastWait; /* Number of outstanding broadcasts */ extern int pmsInstalled; extern int pmsExperimental; @@ -157,12 +154,6 @@ extern pmsSetFunc_t pmsFuncTab[pmsSetFuncMax]; extern pmsQueryFunc_t pmsQueryFunc; extern uint32_t pmsPlatformData; -#ifdef __ppc__ -# ifdef XNU_KERNEL_PRIVATE -# include -# endif /* XNU_KERNEL_PRIVATE */ -extern int pmsCntrl(struct savearea *save); -#endif /* __ppc__ */ extern kern_return_t pmsControl(uint32_t request, user_addr_t reqaddr, uint32_t reqsize); extern void pmsInit(void); extern void pmsStep(int timer); @@ -188,10 +179,8 @@ extern kern_return_t pmsBuild(pmsDef *pd, uint32_t pdsize, pmsSetFunc_t *functab extern void pmsRun(uint32_t nstep); extern void pmsPark(void); extern void pmsStart(void); -# ifndef __ppc__ extern kern_return_t pmsCPULoadVIDTable(uint16_t *tablep, int nstates); /* i386 only */ extern kern_return_t pmsCPUSetPStateLimit(uint32_t limit); -# endif #ifdef __cplusplus } #endif diff --git a/osfmk/kern/printf.c b/osfmk/kern/printf.c index fd04f883a..730be5c81 100644 --- a/osfmk/kern/printf.c +++ b/osfmk/kern/printf.c @@ -172,17 +172,12 @@ #endif #include -#ifdef __ppc__ -#include -#endif - #define isdigit(d) ((d) >= '0' && (d) <= '9') #define Ctod(c) ((c) - '0') #define MAXBUF (sizeof(long long int) * 8) /* enough for binary */ static char digs[] = "0123456789abcdef"; - #if CONFIG_NO_PRINTF_STRINGS /* Prevent CPP from breaking the definition below */ #undef printf @@ -762,6 +757,14 @@ conslog_putc( #endif } +void +cons_putc_locked( + char c) +{ + if ((debug_mode && 
!disable_debug_output) || !disableConsoleOutput) + cnputc(c); +} + #if MACH_KDB extern void db_putchar(char c); #endif @@ -860,6 +863,8 @@ kdb_printf_unbuffered(const char *fmt, ...) return 0; } +#if !CONFIG_EMBEDDED + static void copybyte(int c, void *arg) { @@ -891,3 +896,4 @@ sprintf(char *buf, const char *fmt, ...) *copybyte_str = '\0'; return (int)strlen(buf); } +#endif /* !CONFIG_EMBEDDED */ diff --git a/osfmk/kern/priority.c b/osfmk/kern/priority.c index b46c5bee3..74b90dfa8 100644 --- a/osfmk/kern/priority.c +++ b/osfmk/kern/priority.c @@ -69,6 +69,7 @@ #include #include #include +#include #include #include #include @@ -93,63 +94,58 @@ thread_quantum_expire( thread_lock(thread); + /* + * We've run up until our quantum expiration, and will (potentially) + * continue without re-entering the scheduler, so update this now. + */ + thread->last_run_time = processor->quantum_end; + /* * Check for fail-safe trip. */ - if (!(thread->sched_mode & (TH_MODE_TIMESHARE|TH_MODE_PROMOTED))) { - uint64_t new_computation; + if ((thread->sched_mode == TH_MODE_REALTIME || thread->sched_mode == TH_MODE_FIXED) && + !(thread->sched_flags & TH_SFLAG_PROMOTED) && + !(thread->options & TH_OPT_SYSTEM_CRITICAL)) { + uint64_t new_computation; - new_computation = processor->quantum_end; - new_computation -= thread->computation_epoch; - if (new_computation + thread->computation_metered > - max_unsafe_computation) { + new_computation = processor->quantum_end - thread->computation_epoch; + new_computation += thread->computation_metered; + if (new_computation > max_unsafe_computation) { - if (thread->sched_mode & TH_MODE_REALTIME) { - thread->priority = DEPRESSPRI; + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_FAILSAFE)|DBG_FUNC_NONE, + (uintptr_t)thread->sched_pri, (uintptr_t)thread->sched_mode, 0, 0, 0); - thread->safe_mode |= TH_MODE_REALTIME; - thread->sched_mode &= ~TH_MODE_REALTIME; + if (thread->sched_mode == TH_MODE_REALTIME) { + thread->priority = DEPRESSPRI; + } + + 
thread->saved_mode = thread->sched_mode; + + if (SCHED(supports_timeshare_mode)) { + sched_share_incr(); + thread->sched_mode = TH_MODE_TIMESHARE; + } else { + /* XXX handle fixed->fixed case */ + thread->sched_mode = TH_MODE_FIXED; } - sched_share_incr(); - - thread->safe_release = sched_tick + sched_safe_duration; - thread->sched_mode |= (TH_MODE_FAILSAFE|TH_MODE_TIMESHARE); + thread->safe_release = processor->quantum_end + sched_safe_duration; + thread->sched_flags |= TH_SFLAG_FAILSAFE; } } /* * Recompute scheduled priority if appropriate. */ - if (thread->sched_stamp != sched_tick) - update_priority(thread); + if (SCHED(can_update_priority)(thread)) + SCHED(update_priority)(thread); else - if (thread->sched_mode & TH_MODE_TIMESHARE) { - register uint32_t delta; - - thread_timer_delta(thread, delta); - - /* - * Accumulate timesharing usage only - * during contention for processor - * resources. - */ - if (thread->pri_shift < INT8_MAX) - thread->sched_usage += delta; - - thread->cpu_delta += delta; - - /* - * Adjust the scheduled priority if - * the thread has not been promoted - * and is not depressed. - */ - if ( !(thread->sched_mode & TH_MODE_PROMOTED) && - !(thread->sched_mode & TH_MODE_ISDEPRESSED) ) - compute_my_priority(thread); - } + SCHED(lightweight_update_priority)(thread); + SCHED(quantum_expire)(thread); + processor->current_pri = thread->sched_pri; + processor->current_thmode = thread->sched_mode; /* * This quantum is up, give this thread another. @@ -158,9 +154,11 @@ thread_quantum_expire( processor->timeslice--; thread_quantum_init(thread); + thread->last_quantum_refill_time = processor->quantum_end; + processor->quantum_end += thread->current_quantum; timer_call_enter1(&processor->quantum_timer, - thread, processor->quantum_end); + thread, processor->quantum_end, 0); /* * Context switch check. 
@@ -173,7 +171,7 @@ thread_quantum_expire( pset_lock(pset); pset_pri_hint(pset, processor, processor->current_pri); - pset_count_hint(pset, processor, processor->runq.count); + pset_count_hint(pset, processor, SCHED(processor_runq_count)(processor)); pset_unlock(pset); } @@ -181,6 +179,46 @@ thread_quantum_expire( thread_unlock(thread); } +#if defined(CONFIG_SCHED_TRADITIONAL) + +void +sched_traditional_quantum_expire(thread_t thread __unused) +{ + /* + * No special behavior when a timeshare, fixed, or realtime thread + * uses up its entire quantum + */ +} + +void +lightweight_update_priority(thread_t thread) +{ + if (thread->sched_mode == TH_MODE_TIMESHARE) { + register uint32_t delta; + + thread_timer_delta(thread, delta); + + /* + * Accumulate timesharing usage only + * during contention for processor + * resources. + */ + if (thread->pri_shift < INT8_MAX) + thread->sched_usage += delta; + + thread->cpu_delta += delta; + + /* + * Adjust the scheduled priority if + * the thread has not been promoted + * and is not depressed. 
+ */ + if ( !(thread->sched_flags & TH_SFLAG_PROMOTED) && + !(thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) ) + compute_my_priority(thread); + } +} + /* * Define shifts for simulating (5/8) ** n * @@ -236,6 +274,8 @@ static struct shift_data sched_decay_shifts[SCHED_DECAY_TICKS] = { (pri) = MAXPRI_KERNEL; \ MACRO_END +#endif /* defined(CONFIG_SCHED_TRADITIONAL) */ + #endif /* @@ -252,9 +292,11 @@ set_priority( register int priority) { thread->priority = priority; - compute_priority(thread, FALSE); + SCHED(compute_priority)(thread, FALSE); } +#if defined(CONFIG_SCHED_TRADITIONAL) + /* * compute_priority: * @@ -271,10 +313,10 @@ compute_priority( { register int priority; - if ( !(thread->sched_mode & TH_MODE_PROMOTED) && - (!(thread->sched_mode & TH_MODE_ISDEPRESSED) || + if ( !(thread->sched_flags & TH_SFLAG_PROMOTED) && + (!(thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) || override_depress ) ) { - if (thread->sched_mode & TH_MODE_TIMESHARE) + if (thread->sched_mode == TH_MODE_TIMESHARE) do_priority_computation(thread, priority); else priority = thread->priority; @@ -305,6 +347,23 @@ compute_my_priority( thread->sched_pri = priority; } +/* + * can_update_priority + * + * Make sure we don't do re-dispatches more frequently than a scheduler tick. + * + * Called with the thread locked. + */ +boolean_t +can_update_priority( + thread_t thread) +{ + if (sched_tick == thread->sched_stamp) + return (FALSE); + else + return (TRUE); +} + /* * update_priority * @@ -368,43 +427,45 @@ update_priority( /* * Check for fail-safe release. 
*/ - if ( (thread->sched_mode & TH_MODE_FAILSAFE) && - thread->sched_stamp >= thread->safe_release ) { - if (!(thread->safe_mode & TH_MODE_TIMESHARE)) { - if (thread->safe_mode & TH_MODE_REALTIME) { + if ( (thread->sched_flags & TH_SFLAG_FAILSAFE) && + mach_absolute_time() >= thread->safe_release ) { + if (thread->saved_mode != TH_MODE_TIMESHARE) { + if (thread->saved_mode == TH_MODE_REALTIME) { thread->priority = BASEPRI_RTQUEUES; - - thread->sched_mode |= TH_MODE_REALTIME; } - thread->sched_mode &= ~TH_MODE_TIMESHARE; + thread->sched_mode = thread->saved_mode; + thread->saved_mode = TH_MODE_NONE; if ((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN) sched_share_decr(); - if (!(thread->sched_mode & TH_MODE_ISDEPRESSED)) + if (!(thread->sched_flags & TH_SFLAG_DEPRESSED_MASK)) set_sched_pri(thread, thread->priority); } - thread->safe_mode = 0; - thread->sched_mode &= ~TH_MODE_FAILSAFE; + thread->sched_flags &= ~TH_SFLAG_FAILSAFE; } /* * Recompute scheduled priority if appropriate. */ - if ( (thread->sched_mode & TH_MODE_TIMESHARE) && - !(thread->sched_mode & TH_MODE_PROMOTED) && - !(thread->sched_mode & TH_MODE_ISDEPRESSED) ) { + if ( (thread->sched_mode == TH_MODE_TIMESHARE) && + !(thread->sched_flags & TH_SFLAG_PROMOTED) && + !(thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) ) { register int new_pri; do_priority_computation(thread, new_pri); if (new_pri != thread->sched_pri) { - boolean_t removed = run_queue_remove(thread); + boolean_t removed = thread_run_queue_remove(thread); thread->sched_pri = new_pri; if (removed) thread_setrun(thread, SCHED_TAILQ); } } + + return; } + +#endif /* CONFIG_SCHED_TRADITIONAL */ diff --git a/osfmk/kern/processor.c b/osfmk/kern/processor.c index ca65ceca6..b0771351f 100644 --- a/osfmk/kern/processor.c +++ b/osfmk/kern/processor.c @@ -89,6 +89,7 @@ struct pset_node pset_node0; decl_simple_lock_data(static,pset_node_lock) queue_head_t tasks; +queue_head_t terminated_tasks; /* To be used ONLY for stackshot. 
*/ int tasks_count; queue_head_t threads; int threads_count; @@ -103,6 +104,7 @@ uint32_t processor_avail_count; processor_t master_processor; int master_cpu = 0; +boolean_t sched_stats_active = FALSE; /* Forwards */ kern_return_t processor_set_things( @@ -120,6 +122,7 @@ processor_bootstrap(void) simple_lock_init(&pset_node_lock, 0); queue_init(&tasks); + queue_init(&terminated_tasks); queue_init(&threads); simple_lock_init(&processor_list_lock, 0); @@ -140,12 +143,16 @@ processor_init( int cpu_id, processor_set_t pset) { - run_queue_init(&processor->runq); + if (processor != master_processor) { + /* Scheduler state deferred until sched_init() */ + SCHED(processor_init)(processor); + } processor->state = PROCESSOR_OFF_LINE; processor->active_thread = processor->next_thread = processor->idle_thread = THREAD_NULL; processor->processor_set = pset; processor->current_pri = MINPRI; + processor->current_thmode = TH_MODE_NONE; processor->cpu_id = cpu_id; timer_call_setup(&processor->quantum_timer, thread_quantum_expire, processor); processor->deadline = UINT64_MAX; @@ -236,10 +243,16 @@ pset_init( processor_set_t pset, pset_node_t node) { + if (pset != &pset0) { + /* Scheduler state deferred until sched_init() */ + SCHED(pset_init)(pset); + } + queue_init(&pset->active_queue); queue_init(&pset->idle_queue); - pset->processor_count = 0; - pset->low_pri = pset->low_count = PROCESSOR_NULL; + pset->online_processor_count = 0; + pset_pri_init_hint(pset, PROCESSOR_NULL); + pset_count_init_hint(pset, PROCESSOR_NULL); pset->cpu_set_low = pset->cpu_set_hi = 0; pset->cpu_set_count = 0; pset_lock_init(pset); @@ -321,16 +334,32 @@ processor_info( { register processor_cpu_load_info_t cpu_load_info; - if (*count < PROCESSOR_CPU_LOAD_INFO_COUNT) + if (*count < PROCESSOR_CPU_LOAD_INFO_COUNT) return (KERN_FAILURE); - cpu_load_info = (processor_cpu_load_info_t) info; + cpu_load_info = (processor_cpu_load_info_t) info; cpu_load_info->cpu_ticks[CPU_STATE_USER] = 
(uint32_t)(timer_grab(&PROCESSOR_DATA(processor, user_state)) / hz_tick_interval); cpu_load_info->cpu_ticks[CPU_STATE_SYSTEM] = (uint32_t)(timer_grab(&PROCESSOR_DATA(processor, system_state)) / hz_tick_interval); - cpu_load_info->cpu_ticks[CPU_STATE_IDLE] = + { + timer_data_t idle_temp; + timer_t idle_state; + + idle_state = &PROCESSOR_DATA(processor, idle_state); + idle_temp = *idle_state; + + if (PROCESSOR_DATA(processor, current_state) != idle_state || + timer_grab(&idle_temp) != timer_grab(idle_state)) + cpu_load_info->cpu_ticks[CPU_STATE_IDLE] = (uint32_t)(timer_grab(&PROCESSOR_DATA(processor, idle_state)) / hz_tick_interval); + else { + timer_advance(&idle_temp, mach_absolute_time() - idle_temp.tstamp); + + cpu_load_info->cpu_ticks[CPU_STATE_IDLE] = + (uint32_t)(timer_grab(&idle_temp) / hz_tick_interval); + } + } cpu_load_info->cpu_ticks[CPU_STATE_NICE] = 0; *count = PROCESSOR_CPU_LOAD_INFO_COUNT; diff --git a/osfmk/kern/processor.h b/osfmk/kern/processor.h index 342a90081..0407b8541 100644 --- a/osfmk/kern/processor.h +++ b/osfmk/kern/processor.h @@ -87,13 +87,19 @@ struct processor_set { processor_t low_pri, low_count; - int processor_count; + int online_processor_count; int cpu_set_low, cpu_set_hi; int cpu_set_count; decl_simple_lock_data(,sched_lock) /* lock for above */ +#if defined(CONFIG_SCHED_TRADITIONAL) || defined(CONFIG_SCHED_FIXEDPRIORITY) + struct run_queue pset_runq; /* runq for this processor set */ + int pset_runq_bound_count; + /* # of threads in runq bound to any processor in pset */ +#endif + struct ipc_port * pset_self; /* port for operations */ struct ipc_port * pset_name_self; /* port for information */ @@ -114,7 +120,7 @@ struct pset_node { extern struct pset_node pset_node0; -extern queue_head_t tasks, threads; +extern queue_head_t tasks, terminated_tasks, threads; /* Terminated tasks are ONLY for stackshot */ extern int tasks_count, threads_count; decl_lck_mtx_data(extern,tasks_threads_lock) @@ -138,6 +144,7 @@ struct processor { 
processor_set_t processor_set; /* assigned set */ int current_pri; /* priority of current thread */ + sched_mode_t current_thmode; /* sched mode of current thread */ int cpu_id; /* platform numeric id */ timer_call_data_t quantum_timer; /* timer for quantum expiration */ @@ -147,7 +154,13 @@ struct processor { uint64_t deadline; /* current deadline */ int timeslice; /* quanta before timeslice ends */ +#if defined(CONFIG_SCHED_TRADITIONAL) || defined(CONFIG_SCHED_FIXEDPRIORITY) struct run_queue runq; /* runq for this processor */ + int runq_bound_count; /* # of threads bound to this processor */ +#endif +#if defined(CONFIG_SCHED_GRRR) + struct grrr_run_queue grrr_runq; /* Group Ratio Round-Robin runq */ +#endif processor_meta_t processor_meta; struct ipc_port * processor_self; /* port for operations */ @@ -164,6 +177,8 @@ extern uint32_t processor_avail_count; extern processor_t master_processor; +extern boolean_t sched_stats_active; + /* * Processor state is accessed by locking the scheduling lock * for the assigned processor set. 
@@ -203,7 +218,7 @@ MACRO_END #define pset_count_hint(ps, p, cnt) \ MACRO_BEGIN \ if ((p) != (ps)->low_count) { \ - if ((cnt) < (ps)->low_count->runq.count) \ + if ((cnt) < SCHED(processor_runq_count)((ps)->low_count)) \ (ps)->low_count = (p); \ else \ if ((ps)->low_count->state < PROCESSOR_IDLE) \ @@ -211,6 +226,17 @@ MACRO_BEGIN \ } \ MACRO_END +#define pset_pri_init_hint(ps, p) \ +MACRO_BEGIN \ + (ps)->low_pri = (p); \ +MACRO_END + +#define pset_count_init_hint(ps, p) \ +MACRO_BEGIN \ + (ps)->low_count = (p); \ +MACRO_END + + extern void processor_bootstrap(void) __attribute__((section("__TEXT, initcode"))); extern void processor_init( diff --git a/osfmk/kern/processor_data.h b/osfmk/kern/processor_data.h index 200ec35f3..eda5bcce5 100644 --- a/osfmk/kern/processor_data.h +++ b/osfmk/kern/processor_data.h @@ -41,6 +41,18 @@ #include #include +struct processor_sched_statistics { + uint32_t csw_count; + uint32_t preempt_count; + uint32_t preempted_rt_count; + uint32_t preempted_by_rt_count; + uint32_t rt_sched_count; + uint32_t interrupt_count; + uint32_t ipi_count; + uint32_t timer_pop_count; + uint32_t idle_transitions; +}; + struct processor_data { /* Processor state statistics */ timer_data_t idle_state; @@ -72,6 +84,8 @@ struct processor_data { unsigned long page_grab_count; int start_color; void *free_pages; + + struct processor_sched_statistics sched_stats; }; typedef struct processor_data processor_data_t; @@ -82,6 +96,34 @@ typedef struct processor_data processor_data_t; extern void processor_data_init( processor_t processor); +#define SCHED_STATS_INTERRUPT(p) \ +MACRO_BEGIN \ + if (__builtin_expect(sched_stats_active, 0)) { \ + (p)->processor_data.sched_stats.interrupt_count++; \ + } \ +MACRO_END + +#define SCHED_STATS_TIMER_POP(p) \ +MACRO_BEGIN \ + if (__builtin_expect(sched_stats_active, 0)) { \ + (p)->processor_data.sched_stats.timer_pop_count++; \ + } \ +MACRO_END + +#define SCHED_STATS_IPI(p) \ +MACRO_BEGIN \ + if 
(__builtin_expect(sched_stats_active, 0)) { \ + (p)->processor_data.sched_stats.ipi_count++; \ + } \ +MACRO_END + +#define SCHED_STATS_CPU_IDLE_START(p) \ +MACRO_BEGIN \ + if (__builtin_expect(sched_stats_active, 0)) { \ + (p)->processor_data.sched_stats.idle_transitions++; \ + } \ +MACRO_END + #endif /* MACH_KERNEL_PRIVATE */ #endif /* _KERN_PROCESSOR_DATA_H_ */ diff --git a/osfmk/kern/queue.c b/osfmk/kern/queue.c index 06eba9ebc..052770f7a 100644 --- a/osfmk/kern/queue.c +++ b/osfmk/kern/queue.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -28,66 +28,6 @@ /* * @OSF_COPYRIGHT@ */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:33 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.1.1.1 1998/03/07 02:25:55 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.1.10.3 1995/03/15 17:21:19 bruel - * compile only if !__GNUC__. - * [95/03/09 bruel] - * - * Revision 1.1.10.2 1995/01/06 19:48:05 devrcs - * mk6 CR668 - 1.3b26 merge - * * Revision 1.1.3.5 1994/05/06 18:51:43 tmt - * Merge in DEC Alpha changes to osc1.3b19. - * Merge Alpha changes into osc1.312b source code. - * Remove ifdef sun around insque and remque. - * * End1.3merge - * [1994/11/04 09:29:15 dwm] - * - * Revision 1.1.10.1 1994/09/23 02:25:00 ezf - * change marker to not FREE - * [1994/09/22 21:35:34 ezf] - * - * Revision 1.1.3.3 1993/07/28 17:16:26 bernard - * CR9523 -- Prototypes. - * [1993/07/21 17:00:38 bernard] - * - * Revision 1.1.3.2 1993/06/02 23:39:41 jeffc - * Added to OSF/1 R1.3 from NMK15.0. - * [1993/06/02 21:13:58 jeffc] - * - * Revision 1.1 1992/09/30 02:09:52 robert - * Initial revision - * - * $EndLog$ - */ -/* CMU_HIST */ -/* - * Revision 2.4 91/05/14 16:45:45 mrt - * Correcting copyright - * - * Revision 2.3 91/05/08 12:48:22 dbg - * Compile queue routines on vax. 
- * [91/03/26 dbg] - * - * Revision 2.2 91/02/05 17:28:38 mrt - * Changed to new Mach copyright - * [91/02/01 16:16:22 mrt] - * - * Revision 2.1 89/08/03 15:51:47 rwd - * Created. - * - * 17-Mar-87 David Golub (dbg) at Carnegie-Mellon University - * Created from routines written by David L. Black. - * - */ -/* CMU_ENDHIST */ /* * Mach Operating System * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University @@ -190,14 +130,13 @@ dequeue_tail( /* * Remove arbitrary element from queue. - * Does not check whether element is on queue - the world + * Does not check whether element is on a queue - the world * will go haywire if it isn't. */ /*ARGSUSED*/ void remqueue( - queue_t que, register queue_entry_t elt) { elt->next->prev = elt->prev; diff --git a/osfmk/kern/queue.h b/osfmk/kern/queue.h index d0bab0c61..836b55293 100644 --- a/osfmk/kern/queue.h +++ b/osfmk/kern/queue.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -106,7 +106,7 @@ typedef struct queue_entry *queue_entry_t; /* * enqueue puts "elt" on the "queue". * dequeue returns the first element in the "queue". - * remqueue removes the specified "elt" from the specified "queue". + * remqueue removes the specified "elt" from its queue. 
*/ #define enqueue(queue,elt) enqueue_tail(queue, elt) @@ -137,7 +137,6 @@ extern queue_entry_t dequeue_tail( /* Dequeue element */ extern void remqueue( - queue_t que, queue_entry_t elt); /* Enqueue element after a particular elem */ @@ -153,6 +152,15 @@ __END_DECLS #else /* !__GNUC__ */ +#ifdef XNU_KERNEL_PRIVATE +#define __DEQUEUE_ELT_CLEANUP(elt) do { \ + (elt)->next = (queue_entry_t) 0; \ + (elt)->prev = (queue_entry_t) 0; \ + } while (0) +#else +#define __DEQUEUE_ELT_CLEANUP(elt) do { } while(0) +#endif /* !XNU_KERNEL_PRIVATE */ + static __inline__ void enqueue_head( queue_t que, @@ -185,6 +193,7 @@ dequeue_head( elt = que->next; elt->next->prev = que; que->next = elt->next; + __DEQUEUE_ELT_CLEANUP(elt); } return (elt); @@ -200,6 +209,7 @@ dequeue_tail( elt = que->prev; elt->prev->next = que; que->prev = elt->prev; + __DEQUEUE_ELT_CLEANUP(elt); } return (elt); @@ -207,11 +217,11 @@ dequeue_tail( static __inline__ void remqueue( - __unused queue_t que, queue_entry_t elt) { elt->next->prev = elt->prev; elt->prev->next = elt->next; + __DEQUEUE_ELT_CLEANUP(elt); } static __inline__ void @@ -231,6 +241,7 @@ remque( { (elt->next)->prev = elt->prev; (elt->prev)->next = elt->next; + __DEQUEUE_ELT_CLEANUP(elt); } #endif /* !__GNUC__ */ @@ -603,34 +614,53 @@ MACRO_END */ struct mpqueue_head { struct queue_entry head; /* header for queue */ - decl_simple_lock_data(, lock) /* lock for queue */ + lck_mtx_t lock_data; + lck_mtx_ext_t lock_data_ext; }; typedef struct mpqueue_head mpqueue_head_t; #define round_mpq(size) (size) -#define mpqueue_init(q) \ + +#if defined(__i386__) || defined(__x86_64__) + +#define mpqueue_init(q, lck_grp, lck_attr) \ +MACRO_BEGIN \ + queue_init(&(q)->head); \ + lck_mtx_init_ext(&(q)->lock_data, \ + &(q)->lock_data_ext, \ + lck_grp, \ + lck_attr); \ +MACRO_END + +#else + +#define mpqueue_init(q, lck_grp, lck_attr) \ MACRO_BEGIN \ queue_init(&(q)->head); \ - simple_lock_init(&(q)->lock, 0); \ + lck_spin_init(&(q)->lock_data, \ + lck_grp, \ + 
lck_attr); \ MACRO_END +#endif + #define mpenqueue_tail(q, elt) \ MACRO_BEGIN \ - simple_lock(&(q)->lock); \ + lck_mtx_lock_spin_always(&(q)->lock_data); \ enqueue_tail(&(q)->head, elt); \ - simple_unlock(&(q)->lock); \ + lck_mtx_unlock_always(&(q)->lock_data); \ MACRO_END #define mpdequeue_head(q, elt) \ MACRO_BEGIN \ - simple_lock(&(q)->lock); \ + lck_mtx_lock_spin_always(&(q)->lock_data); \ if (queue_empty(&(q)->head)) \ *(elt) = 0; \ else \ *(elt) = dequeue_head(&(q)->head); \ - simple_unlock(&(q)->lock); \ + lck_mtx_unlock_always(&(q)->lock_data); \ MACRO_END #endif /* MACH_KERNEL_PRIVATE */ diff --git a/osfmk/kern/sched.h b/osfmk/kern/sched.h index 10f7b4680..9532f4095 100644 --- a/osfmk/kern/sched.h +++ b/osfmk/kern/sched.h @@ -166,40 +166,108 @@ #define DEPRESSPRI MINPRI /* depress priority */ #endif +/* Type used for thread->sched_mode and saved_mode */ +typedef enum { + TH_MODE_NONE = 0, /* unassigned, usually for saved_mode only */ + TH_MODE_REALTIME, /* time constraints supplied */ + TH_MODE_FIXED, /* use fixed priorities, no decay */ + TH_MODE_TIMESHARE, /* use timesharing algorithm */ + TH_MODE_FAIRSHARE /* use fair-share scheduling */ +} sched_mode_t; + /* * Macro to check for invalid priorities. 
*/ #define invalid_pri(pri) ((pri) < MINPRI || (pri) > MAXPRI) +struct runq_stats { + uint64_t count_sum; + uint64_t last_change_timestamp; +}; + +#if defined(CONFIG_SCHED_TRADITIONAL) || defined(CONFIG_SCHED_PROTO) || defined(CONFIG_SCHED_FIXEDPRIORITY) + struct run_queue { int highq; /* highest runnable queue */ int bitmap[NRQBM]; /* run queue bitmap array */ int count; /* # of threads total */ int urgency; /* level of preemption urgency */ queue_head_t queues[NRQS]; /* one for each priority */ + + struct runq_stats runq_stats; }; -typedef struct run_queue *run_queue_t; -#define RUN_QUEUE_NULL ((run_queue_t) 0) +#endif /* defined(CONFIG_SCHED_TRADITIONAL) || defined(CONFIG_SCHED_PROTO) || defined(CONFIG_SCHED_FIXEDPRIORITY) */ -#define first_timeslice(processor) ((processor)->timeslice > 0) +struct rt_queue { + int count; /* # of threads total */ + queue_head_t queue; /* all runnable RT threads */ -#define thread_quantum_init(thread) \ -MACRO_BEGIN \ - (thread)->current_quantum = \ - ((thread)->sched_mode & TH_MODE_REALTIME)? \ - (thread)->realtime.computation: std_quantum; \ -MACRO_END + struct runq_stats runq_stats; +}; + +#if defined(CONFIG_SCHED_TRADITIONAL) || defined(CONFIG_SCHED_PROTO) || defined(CONFIG_SCHED_FIXEDPRIORITY) +struct fairshare_queue { + int count; /* # of threads total */ + queue_head_t queue; /* all runnable threads demoted to fairshare scheduling */ + + struct runq_stats runq_stats; +}; +#endif -extern struct run_queue rt_runq; +#if defined(CONFIG_SCHED_GRRR_CORE) /* - * Scheduler routines. + * We map standard Mach priorities to an abstract scale that more properly + * indicates how we want processor time allocated under contention. 
*/ +typedef uint8_t grrr_proportional_priority_t; +typedef uint8_t grrr_group_index_t; + +#define NUM_GRRR_PROPORTIONAL_PRIORITIES 256 +#define MAX_GRRR_PROPORTIONAL_PRIORITY ((grrr_proportional_priority_t)255) + +#if 0 +#define NUM_GRRR_GROUPS 8 /* log(256) */ +#endif + +#define NUM_GRRR_GROUPS 64 /* 256/4 */ + +struct grrr_group { + queue_chain_t priority_order; /* next greatest weight group */ + grrr_proportional_priority_t minpriority; + grrr_group_index_t index; + + queue_head_t clients; + int count; + uint32_t weight; +#if 0 + uint32_t deferred_removal_weight; +#endif + uint32_t work; + thread_t current_client; +}; + +struct grrr_run_queue { + int count; + uint32_t last_rescale_tick; + struct grrr_group groups[NUM_GRRR_GROUPS]; + queue_head_t sorted_group_list; + uint32_t weight; + grrr_group_t current_group; + + struct runq_stats runq_stats; +}; + +#endif /* defined(CONFIG_SCHED_GRRR_CORE) */ + +#define first_timeslice(processor) ((processor)->timeslice > 0) -/* Remove thread from its run queue */ -extern boolean_t run_queue_remove( - thread_t thread); +extern struct rt_queue rt_runq; + +/* + * Scheduler routines. + */ /* Handle quantum expiration for an executing thread */ extern void thread_quantum_expire( @@ -209,13 +277,21 @@ extern void thread_quantum_expire( /* Context switch check for current processor */ extern ast_t csw_check(processor_t processor); +#if defined(CONFIG_SCHED_TRADITIONAL) extern uint32_t std_quantum, min_std_quantum; extern uint32_t std_quantum_us; +#endif + +extern uint32_t thread_depress_time; +extern uint32_t default_timeshare_computation; +extern uint32_t default_timeshare_constraint; extern uint32_t max_rt_quantum, min_rt_quantum; extern uint32_t sched_cswtime; +#if defined(CONFIG_SCHED_TRADITIONAL) + /* * Age usage (1 << SCHED_TICK_SHIFT) times per second. 
*/ @@ -224,6 +300,10 @@ extern uint32_t sched_cswtime; extern unsigned sched_tick; extern uint32_t sched_tick_interval; +#endif /* CONFIG_SCHED_TRADITIONAL */ + +extern uint64_t sched_one_second_interval; + /* Periodic computation of various averages */ extern void compute_averages(void); @@ -236,16 +316,24 @@ extern void compute_stack_target( extern void compute_memory_pressure( void *arg); +extern void compute_zone_gc_throttle( + void *arg); + +extern void compute_pmap_gc_throttle( + void *arg); + /* * Conversion factor from usage * to priority. */ +#if defined(CONFIG_SCHED_TRADITIONAL) extern uint32_t sched_pri_shift; extern uint32_t sched_fixed_shift; extern int8_t sched_load_shifts[NRQS]; +#endif extern int32_t sched_poll_yield_shift; -extern uint32_t sched_safe_duration; +extern uint64_t sched_safe_duration; extern uint32_t sched_run_count, sched_share_count; extern uint32_t sched_load_average, sched_mach_factor; @@ -256,13 +344,13 @@ extern uint64_t max_unsafe_computation; extern uint64_t max_poll_computation; #define sched_run_incr() \ -MACRO_BEGIN \ - machine_run_count(hw_atomic_add(&sched_run_count, 1)); \ +MACRO_BEGIN \ + hw_atomic_add(&sched_run_count, 1); \ MACRO_END #define sched_run_decr() \ -MACRO_BEGIN \ - machine_run_count(hw_atomic_sub(&sched_run_count, 1)); \ +MACRO_BEGIN \ + hw_atomic_sub(&sched_run_count, 1); \ MACRO_END #define sched_share_incr() \ diff --git a/osfmk/kern/sched_average.c b/osfmk/kern/sched_average.c index e20ddff73..5db621937 100644 --- a/osfmk/kern/sched_average.c +++ b/osfmk/kern/sched_average.c @@ -72,6 +72,7 @@ uint32_t avenrun[3] = {0, 0, 0}; uint32_t mach_factor[3] = {0, 0, 0}; +#if defined(CONFIG_SCHED_TRADITIONAL) /* * Values are scaled by LOAD_SCALE, defined in processor_info.h */ @@ -87,22 +88,24 @@ static uint32_t fract[3] = { #undef base #undef frac +#endif /* CONFIG_SCHED_TRADITIONAL */ + static unsigned int sched_nrun; typedef void (*sched_avg_comp_t)( void *param); -#define SCHED_AVG_SECS(n) ((n) << 
SCHED_TICK_SHIFT) - static struct sched_average { sched_avg_comp_t comp; void *param; - int period; - int tick; + int period; /* in seconds */ + uint64_t deadline; } sched_average[] = { - { compute_averunnable, &sched_nrun, SCHED_AVG_SECS(5), 0 }, - { compute_stack_target, NULL, SCHED_AVG_SECS(5), 1 }, - { compute_memory_pressure, NULL, SCHED_AVG_SECS(1), 0 }, + { compute_averunnable, &sched_nrun, 5, 0 }, + { compute_stack_target, NULL, 5, 1 }, + { compute_memory_pressure, NULL, 1, 0 }, + { compute_zone_gc_throttle, NULL, 1, 0 }, + { compute_pmap_gc_throttle, NULL, 60, 0 }, { NULL, NULL, 0, 0 } }; @@ -114,7 +117,8 @@ compute_averages(void) int ncpus, nthreads, nshared; uint32_t factor_now, average_now, load_now = 0; sched_average_t avg; - + uint64_t abstime; + /* * Retrieve counts, ignoring * the current thread. @@ -154,6 +158,13 @@ compute_averages(void) load_now = NRQS - 1; } + /* + * Sample total running threads. + */ + sched_nrun = nthreads; + +#if defined(CONFIG_SCHED_TRADITIONAL) + /* * The conversion factor consists of * two components: a fixed value based @@ -167,11 +178,6 @@ compute_averages(void) */ sched_pri_shift = sched_fixed_shift - sched_load_shifts[load_now]; - /* - * Sample total running threads. - */ - sched_nrun = nthreads; - /* * Compute old-style Mach load averages. */ @@ -186,14 +192,16 @@ compute_averages(void) (average_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE; } } +#endif /* CONFIG_SCHED_TRADITIONAL */ /* * Compute averages in other components. 
*/ + abstime = mach_absolute_time(); for (avg = sched_average; avg->comp != NULL; ++avg) { - if (++avg->tick >= avg->period) { + if (abstime >= avg->deadline) { (*avg->comp)(avg->param); - avg->tick = 0; + avg->deadline = abstime + avg->period * sched_one_second_interval; } } } diff --git a/osfmk/kern/sched_fixedpriority.c b/osfmk/kern/sched_fixedpriority.c new file mode 100644 index 000000000..1eca4aaac --- /dev/null +++ b/osfmk/kern/sched_fixedpriority.c @@ -0,0 +1,727 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include + +static void +sched_fixedpriority_init(void); + +static void +sched_fixedpriority_with_pset_runqueue_init(void); + +static void +sched_fixedpriority_timebase_init(void); + +static void +sched_fixedpriority_processor_init(processor_t processor); + +static void +sched_fixedpriority_pset_init(processor_set_t pset); + +static void +sched_fixedpriority_maintenance_continuation(void); + +static thread_t +sched_fixedpriority_choose_thread(processor_t processor, + int priority); + +static thread_t +sched_fixedpriority_steal_thread(processor_set_t pset); + +static void +sched_fixedpriority_compute_priority(thread_t thread, + boolean_t override_depress); + +static processor_t +sched_fixedpriority_choose_processor( processor_set_t pset, + processor_t processor, + thread_t thread); + + +static boolean_t +sched_fixedpriority_processor_enqueue( + processor_t processor, + thread_t thread, + integer_t options); + +static void +sched_fixedpriority_processor_queue_shutdown( + processor_t processor); + +static boolean_t +sched_fixedpriority_processor_queue_remove( + processor_t processor, + thread_t thread); + +static boolean_t +sched_fixedpriority_processor_queue_empty(processor_t processor); + +static boolean_t +sched_fixedpriority_processor_queue_has_priority(processor_t processor, + int priority, + boolean_t gte); + +static boolean_t +sched_fixedpriority_priority_is_urgent(int priority); + +static ast_t +sched_fixedpriority_processor_csw_check(processor_t processor); + +static uint32_t +sched_fixedpriority_initial_quantum_size(thread_t thread); + +static sched_mode_t 
+sched_fixedpriority_initial_thread_sched_mode(task_t parent_task); + +static boolean_t +sched_fixedpriority_supports_timeshare_mode(void); + +static boolean_t +sched_fixedpriority_can_update_priority(thread_t thread); + +static void +sched_fixedpriority_update_priority(thread_t thread); + +static void +sched_fixedpriority_lightweight_update_priority(thread_t thread); + +static void +sched_fixedpriority_quantum_expire(thread_t thread); + +static boolean_t +sched_fixedpriority_should_current_thread_rechoose_processor(processor_t processor); + +static int +sched_fixedpriority_processor_runq_count(processor_t processor); + +static uint64_t +sched_fixedpriority_processor_runq_stats_count_sum(processor_t processor); + +const struct sched_dispatch_table sched_fixedpriority_dispatch = { + sched_fixedpriority_init, + sched_fixedpriority_timebase_init, + sched_fixedpriority_processor_init, + sched_fixedpriority_pset_init, + sched_fixedpriority_maintenance_continuation, + sched_fixedpriority_choose_thread, + sched_fixedpriority_steal_thread, + sched_fixedpriority_compute_priority, + sched_fixedpriority_choose_processor, + sched_fixedpriority_processor_enqueue, + sched_fixedpriority_processor_queue_shutdown, + sched_fixedpriority_processor_queue_remove, + sched_fixedpriority_processor_queue_empty, + sched_fixedpriority_priority_is_urgent, + sched_fixedpriority_processor_csw_check, + sched_fixedpriority_processor_queue_has_priority, + sched_fixedpriority_initial_quantum_size, + sched_fixedpriority_initial_thread_sched_mode, + sched_fixedpriority_supports_timeshare_mode, + sched_fixedpriority_can_update_priority, + sched_fixedpriority_update_priority, + sched_fixedpriority_lightweight_update_priority, + sched_fixedpriority_quantum_expire, + sched_fixedpriority_should_current_thread_rechoose_processor, + sched_fixedpriority_processor_runq_count, + sched_fixedpriority_processor_runq_stats_count_sum, + sched_traditional_fairshare_init, + sched_traditional_fairshare_runq_count, + 
sched_traditional_fairshare_runq_stats_count_sum, + sched_traditional_fairshare_enqueue, + sched_traditional_fairshare_dequeue, + sched_traditional_fairshare_queue_remove, + TRUE /* direct_dispatch_to_idle_processors */ +}; + +const struct sched_dispatch_table sched_fixedpriority_with_pset_runqueue_dispatch = { + sched_fixedpriority_with_pset_runqueue_init, + sched_fixedpriority_timebase_init, + sched_fixedpriority_processor_init, + sched_fixedpriority_pset_init, + sched_fixedpriority_maintenance_continuation, + sched_fixedpriority_choose_thread, + sched_fixedpriority_steal_thread, + sched_fixedpriority_compute_priority, + sched_fixedpriority_choose_processor, + sched_fixedpriority_processor_enqueue, + sched_fixedpriority_processor_queue_shutdown, + sched_fixedpriority_processor_queue_remove, + sched_fixedpriority_processor_queue_empty, + sched_fixedpriority_priority_is_urgent, + sched_fixedpriority_processor_csw_check, + sched_fixedpriority_processor_queue_has_priority, + sched_fixedpriority_initial_quantum_size, + sched_fixedpriority_initial_thread_sched_mode, + sched_fixedpriority_supports_timeshare_mode, + sched_fixedpriority_can_update_priority, + sched_fixedpriority_update_priority, + sched_fixedpriority_lightweight_update_priority, + sched_fixedpriority_quantum_expire, + sched_fixedpriority_should_current_thread_rechoose_processor, + sched_fixedpriority_processor_runq_count, + sched_fixedpriority_processor_runq_stats_count_sum, + sched_traditional_fairshare_init, + sched_traditional_fairshare_runq_count, + sched_traditional_fairshare_runq_stats_count_sum, + sched_traditional_fairshare_enqueue, + sched_traditional_fairshare_dequeue, + sched_traditional_fairshare_queue_remove, + FALSE /* direct_dispatch_to_idle_processors */ +}; + +extern int max_unsafe_quanta; + +#define SCHED_FIXEDPRIORITY_DEFAULT_QUANTUM 5 /* in ms */ +static uint32_t sched_fixedpriority_quantum_ms = SCHED_FIXEDPRIORITY_DEFAULT_QUANTUM; +static uint32_t sched_fixedpriority_quantum; + 
+#define SCHED_FIXEDPRIORITY_DEFAULT_FAIRSHARE_MINIMUM_BLOCK_TIME 100 /* ms */ +static uint32_t fairshare_minimum_blocked_time_ms = SCHED_FIXEDPRIORITY_DEFAULT_FAIRSHARE_MINIMUM_BLOCK_TIME; +static uint32_t fairshare_minimum_blocked_time; + +static uint32_t sched_fixedpriority_tick; +static uint64_t sched_fixedpriority_tick_deadline; +extern uint32_t grrr_rescale_tick; + +static boolean_t sched_fixedpriority_use_pset_runqueue = FALSE; + +__attribute__((always_inline)) +static inline run_queue_t runq_for_processor(processor_t processor) +{ + if (sched_fixedpriority_use_pset_runqueue) + return &processor->processor_set->pset_runq; + else + return &processor->runq; +} + +__attribute__((always_inline)) +static inline void runq_consider_incr_bound_count(processor_t processor, thread_t thread) +{ + if (thread->bound_processor == PROCESSOR_NULL) + return; + + assert(thread->bound_processor == processor); + + if (sched_fixedpriority_use_pset_runqueue) + processor->processor_set->pset_runq_bound_count++; + + processor->runq_bound_count++; +} + +__attribute__((always_inline)) +static inline void runq_consider_decr_bound_count(processor_t processor, thread_t thread) +{ + if (thread->bound_processor == PROCESSOR_NULL) + return; + + assert(thread->bound_processor == processor); + + if (sched_fixedpriority_use_pset_runqueue) + processor->processor_set->pset_runq_bound_count--; + + processor->runq_bound_count--; +} + +static void +sched_fixedpriority_init(void) +{ + if (!PE_parse_boot_argn("fixedpriority_quantum", &sched_fixedpriority_quantum_ms, sizeof (sched_fixedpriority_quantum_ms))) { + sched_fixedpriority_quantum_ms = SCHED_FIXEDPRIORITY_DEFAULT_QUANTUM; + } + + if (sched_fixedpriority_quantum_ms < 1) + sched_fixedpriority_quantum_ms = SCHED_FIXEDPRIORITY_DEFAULT_QUANTUM; + + printf("standard fixed priority timeslicing quantum is %u ms\n", sched_fixedpriority_quantum_ms); +} + +static void +sched_fixedpriority_with_pset_runqueue_init(void) +{ + sched_fixedpriority_init(); + 
sched_fixedpriority_use_pset_runqueue = TRUE; +} + +static void +sched_fixedpriority_timebase_init(void) +{ + uint64_t abstime; + + /* standard timeslicing quantum */ + clock_interval_to_absolutetime_interval( + sched_fixedpriority_quantum_ms, NSEC_PER_MSEC, &abstime); + assert((abstime >> 32) == 0 && (uint32_t)abstime != 0); + sched_fixedpriority_quantum = (uint32_t)abstime; + + thread_depress_time = 1 * sched_fixedpriority_quantum; + default_timeshare_computation = sched_fixedpriority_quantum / 2; + default_timeshare_constraint = sched_fixedpriority_quantum; + + max_unsafe_computation = max_unsafe_quanta * sched_fixedpriority_quantum; + sched_safe_duration = 2 * max_unsafe_quanta * sched_fixedpriority_quantum; + + if (!PE_parse_boot_argn("fairshare_minblockedtime", &fairshare_minimum_blocked_time_ms, sizeof (fairshare_minimum_blocked_time_ms))) { + fairshare_minimum_blocked_time_ms = SCHED_FIXEDPRIORITY_DEFAULT_FAIRSHARE_MINIMUM_BLOCK_TIME; + } + + clock_interval_to_absolutetime_interval( + fairshare_minimum_blocked_time_ms, NSEC_PER_MSEC, &abstime); + + assert((abstime >> 32) == 0 && (uint32_t)abstime != 0); + fairshare_minimum_blocked_time = (uint32_t)abstime; +} + +static void +sched_fixedpriority_processor_init(processor_t processor) +{ + if (!sched_fixedpriority_use_pset_runqueue) { + run_queue_init(&processor->runq); + } + processor->runq_bound_count = 0; +} + +static void +sched_fixedpriority_pset_init(processor_set_t pset) +{ + if (sched_fixedpriority_use_pset_runqueue) { + run_queue_init(&pset->pset_runq); + } + pset->pset_runq_bound_count = 0; +} + + +static void +sched_fixedpriority_maintenance_continuation(void) +{ + uint64_t abstime = mach_absolute_time(); + + sched_fixedpriority_tick++; + grrr_rescale_tick++; + + /* + * Compute various averages. 
+ */ + compute_averages(); + + if (sched_fixedpriority_tick_deadline == 0) + sched_fixedpriority_tick_deadline = abstime; + + clock_deadline_for_periodic_event(10*sched_one_second_interval, abstime, + &sched_fixedpriority_tick_deadline); + + assert_wait_deadline((event_t)sched_fixedpriority_maintenance_continuation, THREAD_UNINT, sched_fixedpriority_tick_deadline); + thread_block((thread_continue_t)sched_fixedpriority_maintenance_continuation); + /*NOTREACHED*/ +} + + +static thread_t +sched_fixedpriority_choose_thread(processor_t processor, + int priority) +{ + thread_t thread; + + thread = choose_thread(processor, runq_for_processor(processor), priority); + if (thread != THREAD_NULL) { + runq_consider_decr_bound_count(processor, thread); + } + + return thread; +} + +static thread_t +sched_fixedpriority_steal_thread(processor_set_t pset) +{ + pset_unlock(pset); + + return (THREAD_NULL); + +} + +static void +sched_fixedpriority_compute_priority(thread_t thread, + boolean_t override_depress) +{ + /* Reset current priority to base priority */ + if ( !(thread->sched_flags & TH_SFLAG_PROMOTED) && + (!(thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) || + override_depress ) ) { + set_sched_pri(thread, thread->priority); + } +} + +static processor_t +sched_fixedpriority_choose_processor( processor_set_t pset, + processor_t processor, + thread_t thread) +{ + return choose_processor(pset, processor, thread); +} +static boolean_t +sched_fixedpriority_processor_enqueue( + processor_t processor, + thread_t thread, + integer_t options) +{ + run_queue_t rq = runq_for_processor(processor); + boolean_t result; + + result = run_queue_enqueue(rq, thread, options); + thread->runq = processor; + runq_consider_incr_bound_count(processor, thread); + + return (result); +} + +static void +sched_fixedpriority_processor_queue_shutdown( + processor_t processor) +{ + processor_set_t pset = processor->processor_set; + thread_t thread; + queue_head_t tqueue, bqueue; + + queue_init(&tqueue); + 
queue_init(&bqueue); + + while ((thread = sched_fixedpriority_choose_thread(processor, IDLEPRI)) != THREAD_NULL) { + if (thread->bound_processor == PROCESSOR_NULL) { + enqueue_tail(&tqueue, (queue_entry_t)thread); + } else { + enqueue_tail(&bqueue, (queue_entry_t)thread); + } + } + + while ((thread = (thread_t)dequeue_head(&bqueue)) != THREAD_NULL) { + sched_fixedpriority_processor_enqueue(processor, thread, SCHED_TAILQ); + } + + pset_unlock(pset); + + while ((thread = (thread_t)dequeue_head(&tqueue)) != THREAD_NULL) { + thread_lock(thread); + + thread_setrun(thread, SCHED_TAILQ); + + thread_unlock(thread); + } +} + +static boolean_t +sched_fixedpriority_processor_queue_remove( + processor_t processor, + thread_t thread) +{ + void * rqlock; + run_queue_t rq; + + rqlock = &processor->processor_set->sched_lock; + rq = runq_for_processor(processor); + + simple_lock(rqlock); + if (processor == thread->runq) { + /* + * Thread is on a run queue and we have a lock on + * that run queue. + */ + runq_consider_decr_bound_count(processor, thread); + run_queue_remove(rq, thread); + } + else { + /* + * The thread left the run queue before we could + * lock the run queue. 
+ */ + assert(thread->runq == PROCESSOR_NULL); + processor = PROCESSOR_NULL; + } + + simple_unlock(rqlock); + + return (processor != PROCESSOR_NULL); +} + +static boolean_t +sched_fixedpriority_processor_queue_empty(processor_t processor) +{ + /* + * See sched_traditional_with_pset_runqueue_processor_queue_empty + * for algorithm + */ + int count = runq_for_processor(processor)->count; + + if (sched_fixedpriority_use_pset_runqueue) { + processor_set_t pset = processor->processor_set; + + count -= pset->pset_runq_bound_count; + count += processor->runq_bound_count; + } + + return count == 0; +} + +static boolean_t +sched_fixedpriority_processor_queue_has_priority(processor_t processor, + int priority, + boolean_t gte) +{ + if (gte) + return runq_for_processor(processor)->highq >= priority; + else + return runq_for_processor(processor)->highq > priority; +} + +/* Implement sched_preempt_pri in code */ +static boolean_t +sched_fixedpriority_priority_is_urgent(int priority) +{ + if (priority <= BASEPRI_FOREGROUND) + return FALSE; + + if (priority < MINPRI_KERNEL) + return TRUE; + + if (priority >= BASEPRI_PREEMPT) + return TRUE; + + return FALSE; +} + +static ast_t +sched_fixedpriority_processor_csw_check(processor_t processor) +{ + run_queue_t runq; + + runq = runq_for_processor(processor); + if (runq->highq > processor->current_pri) { + if (runq->urgency > 0) + return (AST_PREEMPT | AST_URGENT); + + if (processor->active_thread && thread_eager_preemption(processor->active_thread)) + return (AST_PREEMPT | AST_URGENT); + + return AST_PREEMPT; + } else if (processor->current_thmode == TH_MODE_FAIRSHARE) { + if (!sched_fixedpriority_processor_queue_empty(processor)) { + /* Allow queued threads to run if the current thread got demoted to fairshare */ + return (AST_PREEMPT | AST_URGENT); + } else if ((!first_timeslice(processor)) && SCHED(fairshare_runq_count)() > 0) { + /* Allow other fairshare threads to run */ + return AST_PREEMPT | AST_URGENT; + } + } + + return 
AST_NONE; +} + +static uint32_t +sched_fixedpriority_initial_quantum_size(thread_t thread __unused) +{ + return sched_fixedpriority_quantum; +} + +static sched_mode_t +sched_fixedpriority_initial_thread_sched_mode(task_t parent_task) +{ + if (parent_task == kernel_task) + return TH_MODE_FIXED; + else + return TH_MODE_TIMESHARE; +} + +static boolean_t +sched_fixedpriority_supports_timeshare_mode(void) +{ + return TRUE; +} + +static boolean_t +sched_fixedpriority_can_update_priority(thread_t thread __unused) +{ + return ((thread->sched_flags & TH_SFLAG_PRI_UPDATE) == 0); +} + +static void +sched_fixedpriority_update_priority(thread_t thread) +{ + uint64_t current_time = mach_absolute_time(); + + thread->sched_flags |= TH_SFLAG_PRI_UPDATE; + + if (thread->sched_flags & TH_SFLAG_FAIRSHARE_TRIPPED) { + + /* + * Make sure we've waited fairshare_minimum_blocked_time both from the time + * we were throttled into the fairshare band, and the last time + * we ran. + */ + if (current_time >= thread->last_run_time + fairshare_minimum_blocked_time) { + + boolean_t removed = thread_run_queue_remove(thread); + + thread->sched_flags &= ~TH_SFLAG_FAIRSHARE_TRIPPED; + thread->sched_mode = thread->saved_mode; + thread->saved_mode = TH_MODE_NONE; + + if (removed) + thread_setrun(thread, SCHED_TAILQ); + + KERNEL_DEBUG_CONSTANT1( + MACHDBG_CODE(DBG_MACH_SCHED,MACH_FAIRSHARE_EXIT) | DBG_FUNC_NONE, (uint32_t)(thread->last_run_time & 0xFFFFFFFF), (uint32_t)(thread->last_run_time >> 32), (uint32_t)(current_time & 0xFFFFFFFF), (uint32_t)(current_time >> 32), thread_tid(thread)); + + } + } else if ((thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) && (thread->bound_processor == PROCESSOR_NULL)) { + boolean_t removed = thread_run_queue_remove(thread); + + thread->sched_flags |= TH_SFLAG_FAIRSHARE_TRIPPED; + thread->saved_mode = thread->sched_mode; + thread->sched_mode = TH_MODE_FAIRSHARE; + + thread->last_quantum_refill_time = thread->last_run_time - 2 * sched_fixedpriority_quantum - 1; + + if 
(removed) + thread_setrun(thread, SCHED_TAILQ); + + KERNEL_DEBUG_CONSTANT( + MACHDBG_CODE(DBG_MACH_SCHED,MACH_FAIRSHARE_ENTER) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), 0xFFFFFFFF, 0, 0, 0); + + } + + /* + * Check for fail-safe release. + */ + if ( (thread->sched_flags & TH_SFLAG_FAILSAFE) && + current_time >= thread->safe_release ) { + + + thread->sched_flags &= ~TH_SFLAG_FAILSAFE; + + if (!(thread->sched_flags & TH_SFLAG_DEMOTED_MASK)) { + /* Restore to previous */ + + thread->sched_mode = thread->saved_mode; + thread->saved_mode = TH_MODE_NONE; + + if (thread->sched_mode == TH_MODE_REALTIME) { + thread->priority = BASEPRI_RTQUEUES; + + } + + if (!(thread->sched_flags & TH_SFLAG_DEPRESSED_MASK)) + set_sched_pri(thread, thread->priority); + } + } + + thread->sched_flags &= ~TH_SFLAG_PRI_UPDATE; + return; +} + +static void +sched_fixedpriority_lightweight_update_priority(thread_t thread __unused) +{ + return; +} + +static void +sched_fixedpriority_quantum_expire( + thread_t thread) +{ + /* Put thread into fairshare class, core scheduler will manage runqueue */ + if ((thread->sched_mode == TH_MODE_TIMESHARE) && (thread->task != kernel_task) && !(thread->sched_flags & TH_SFLAG_DEMOTED_MASK)) { + uint64_t elapsed = thread->last_run_time - thread->last_quantum_refill_time; + + /* If we managed to use our quantum in less than 2*quantum wall clock time, + * we are considered CPU bound and eligible for demotion. Since the quantum + * is reset when thread_unblock() is called, we are only really considering + * threads that elongate their execution time due to preemption. 
+ */ + if ((elapsed < 2 * sched_fixedpriority_quantum) && (thread->bound_processor == PROCESSOR_NULL)) { + + thread->saved_mode = thread->sched_mode; + thread->sched_mode = TH_MODE_FAIRSHARE; + thread->sched_flags |= TH_SFLAG_FAIRSHARE_TRIPPED; + KERNEL_DEBUG_CONSTANT( + MACHDBG_CODE(DBG_MACH_SCHED,MACH_FAIRSHARE_ENTER) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), (uint32_t)(elapsed & 0xFFFFFFFF), (uint32_t)(elapsed >> 32), 0, 0); + } + } +} + + +static boolean_t +sched_fixedpriority_should_current_thread_rechoose_processor(processor_t processor __unused) +{ + return (TRUE); +} + + +static int +sched_fixedpriority_processor_runq_count(processor_t processor) +{ + return runq_for_processor(processor)->count; +} + +static uint64_t +sched_fixedpriority_processor_runq_stats_count_sum(processor_t processor) +{ + return runq_for_processor(processor)->runq_stats.count_sum; +} diff --git a/osfmk/kern/sched_grrr.c b/osfmk/kern/sched_grrr.c new file mode 100644 index 000000000..d27b29e87 --- /dev/null +++ b/osfmk/kern/sched_grrr.c @@ -0,0 +1,956 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include + +#if defined(CONFIG_SCHED_GRRR_CORE) + +static void +grrr_priority_mapping_init(void); + +static boolean_t +grrr_enqueue( + grrr_run_queue_t rq, + thread_t thread); + +static thread_t +grrr_select( + grrr_run_queue_t rq); + +static void +grrr_remove( + grrr_run_queue_t rq, + thread_t thread); + + +static void +grrr_sorted_list_insert_group(grrr_run_queue_t rq, + grrr_group_t group); + +static void +grrr_rescale_work(grrr_run_queue_t rq); + +static void +grrr_runqueue_init(grrr_run_queue_t runq); + +/* Map Mach priorities to ones suitable for proportional sharing */ +static grrr_proportional_priority_t grrr_priority_mapping[NRQS]; + +/* Map each proportional priority to its group */ +static grrr_group_index_t grrr_group_mapping[NUM_GRRR_PROPORTIONAL_PRIORITIES]; + +uint32_t grrr_rescale_tick; + +#endif /* defined(CONFIG_SCHED_GRRR_CORE) */ + +#if defined(CONFIG_SCHED_GRRR) + +static void +sched_grrr_init(void); + +static void +sched_grrr_timebase_init(void); + +static void +sched_grrr_processor_init(processor_t processor); + +static void +sched_grrr_pset_init(processor_set_t pset); + +static void 
+sched_grrr_maintenance_continuation(void); + +static thread_t +sched_grrr_choose_thread(processor_t processor, + int priority); + +static thread_t +sched_grrr_steal_thread(processor_set_t pset); + +static void +sched_grrr_compute_priority(thread_t thread, + boolean_t override_depress); + +static processor_t +sched_grrr_choose_processor( processor_set_t pset, + processor_t processor, + thread_t thread); + +static boolean_t +sched_grrr_processor_enqueue( + processor_t processor, + thread_t thread, + integer_t options); + +static void +sched_grrr_processor_queue_shutdown( + processor_t processor); + +static boolean_t +sched_grrr_processor_queue_remove( + processor_t processor, + thread_t thread); + +static boolean_t +sched_grrr_processor_queue_empty(processor_t processor); + +static boolean_t +sched_grrr_processor_queue_has_priority(processor_t processor, + int priority, + boolean_t gte); + +static boolean_t +sched_grrr_priority_is_urgent(int priority); + +static ast_t +sched_grrr_processor_csw_check(processor_t processor); + +static uint32_t +sched_grrr_initial_quantum_size(thread_t thread); + +static sched_mode_t +sched_grrr_initial_thread_sched_mode(task_t parent_task); + +static boolean_t +sched_grrr_supports_timeshare_mode(void); + +static boolean_t +sched_grrr_can_update_priority(thread_t thread); + +static void +sched_grrr_update_priority(thread_t thread); + +static void +sched_grrr_lightweight_update_priority(thread_t thread); + +static void +sched_grrr_quantum_expire(thread_t thread); + +static boolean_t +sched_grrr_should_current_thread_rechoose_processor(processor_t processor); + +static int +sched_grrr_processor_runq_count(processor_t processor); + +static uint64_t +sched_grrr_processor_runq_stats_count_sum(processor_t processor); + +const struct sched_dispatch_table sched_grrr_dispatch = { + sched_grrr_init, + sched_grrr_timebase_init, + sched_grrr_processor_init, + sched_grrr_pset_init, + sched_grrr_maintenance_continuation, + sched_grrr_choose_thread, 
+ sched_grrr_steal_thread, + sched_grrr_compute_priority, + sched_grrr_choose_processor, + sched_grrr_processor_enqueue, + sched_grrr_processor_queue_shutdown, + sched_grrr_processor_queue_remove, + sched_grrr_processor_queue_empty, + sched_grrr_priority_is_urgent, + sched_grrr_processor_csw_check, + sched_grrr_processor_queue_has_priority, + sched_grrr_initial_quantum_size, + sched_grrr_initial_thread_sched_mode, + sched_grrr_supports_timeshare_mode, + sched_grrr_can_update_priority, + sched_grrr_update_priority, + sched_grrr_lightweight_update_priority, + sched_grrr_quantum_expire, + sched_grrr_should_current_thread_rechoose_processor, + sched_grrr_processor_runq_count, + sched_grrr_processor_runq_stats_count_sum, + sched_grrr_fairshare_init, + sched_grrr_fairshare_runq_count, + sched_grrr_fairshare_runq_stats_count_sum, + sched_grrr_fairshare_enqueue, + sched_grrr_fairshare_dequeue, + sched_grrr_fairshare_queue_remove, + TRUE /* direct_dispatch_to_idle_processors */ +}; + +extern int default_preemption_rate; +extern int max_unsafe_quanta; + +static uint32_t grrr_quantum_us; +static uint32_t grrr_quantum; + +static uint64_t sched_grrr_tick_deadline; + +/* Pick the timeslice length in microseconds from default_preemption_rate (reset to 100 when it is below 1), report it, and build the priority mapping tables. */ +static void +sched_grrr_init(void) +{ + if (default_preemption_rate < 1) + default_preemption_rate = 100; + grrr_quantum_us = (1000 * 1000) / default_preemption_rate; + + /* grrr_quantum_us is a uint32_t, so print it with %u */ + printf("standard grrr timeslicing quantum is %u us\n", grrr_quantum_us); + + grrr_priority_mapping_init(); +} + +/* Convert the microsecond quantum to absolute-time units and derive the scheduler intervals that are expressed as multiples of it. */ +static void +sched_grrr_timebase_init(void) +{ + uint64_t abstime; + + /* standard timeslicing quantum */ + clock_interval_to_absolutetime_interval( + grrr_quantum_us, NSEC_PER_USEC, &abstime); + assert((abstime >> 32) == 0 && (uint32_t)abstime != 0); + grrr_quantum = (uint32_t)abstime; + + thread_depress_time = 1 * grrr_quantum; + default_timeshare_computation = grrr_quantum / 2; + default_timeshare_constraint = grrr_quantum; + + /* Widen before multiplying: int * uint32_t is evaluated in 32 bits and would wrap before being stored in the uint64_t destinations. */ + max_unsafe_computation = (uint64_t)max_unsafe_quanta * grrr_quantum; + sched_safe_duration = 2 * (uint64_t)max_unsafe_quanta *
grrr_quantum; + +} + +static void +sched_grrr_processor_init(processor_t processor) +{ + grrr_runqueue_init(&processor->grrr_runq); +} + +static void +sched_grrr_pset_init(processor_set_t pset __unused) +{ +} + +static void +sched_grrr_maintenance_continuation(void) +{ + uint64_t abstime = mach_absolute_time(); + + grrr_rescale_tick++; + + /* + * Compute various averages. + */ + compute_averages(); + + if (sched_grrr_tick_deadline == 0) + sched_grrr_tick_deadline = abstime; + + clock_deadline_for_periodic_event(10*sched_one_second_interval, abstime, + &sched_grrr_tick_deadline); + + assert_wait_deadline((event_t)sched_grrr_maintenance_continuation, THREAD_UNINT, sched_grrr_tick_deadline); + thread_block((thread_continue_t)sched_grrr_maintenance_continuation); + /*NOTREACHED*/ +} + + +static thread_t +sched_grrr_choose_thread(processor_t processor, + int priority __unused) +{ + grrr_run_queue_t rq = &processor->grrr_runq; + + return grrr_select(rq); +} + +static thread_t +sched_grrr_steal_thread(processor_set_t pset) +{ + pset_unlock(pset); + + return (THREAD_NULL); + +} + +static void +sched_grrr_compute_priority(thread_t thread, + boolean_t override_depress __unused) +{ + set_sched_pri(thread, thread->priority); +} + +static processor_t +sched_grrr_choose_processor( processor_set_t pset, + processor_t processor, + thread_t thread) +{ + return choose_processor(pset, processor, thread); +} + +static boolean_t +sched_grrr_processor_enqueue( + processor_t processor, + thread_t thread, + integer_t options __unused) +{ + grrr_run_queue_t rq = &processor->grrr_runq; + boolean_t result; + + result = grrr_enqueue(rq, thread); + + thread->runq = processor; + + return result; +} + +static void +sched_grrr_processor_queue_shutdown( + processor_t processor) +{ + processor_set_t pset = processor->processor_set; + thread_t thread; + queue_head_t tqueue, bqueue; + + queue_init(&tqueue); + queue_init(&bqueue); + + while ((thread = sched_grrr_choose_thread(processor, IDLEPRI)) != 
THREAD_NULL) { + if (thread->bound_processor == PROCESSOR_NULL) { + enqueue_tail(&tqueue, (queue_entry_t)thread); + } else { + enqueue_tail(&bqueue, (queue_entry_t)thread); + } + } + + while ((thread = (thread_t)dequeue_head(&bqueue)) != THREAD_NULL) { + sched_grrr_processor_enqueue(processor, thread, SCHED_TAILQ); + } + + pset_unlock(pset); + + while ((thread = (thread_t)dequeue_head(&tqueue)) != THREAD_NULL) { + thread_lock(thread); + + thread_setrun(thread, SCHED_TAILQ); + + thread_unlock(thread); + } +} + +static boolean_t +sched_grrr_processor_queue_remove( + processor_t processor, + thread_t thread) +{ + void * rqlock; + + rqlock = &processor->processor_set->sched_lock; + simple_lock(rqlock); + + if (processor == thread->runq) { + /* + * Thread is on a run queue and we have a lock on + * that run queue. + */ + grrr_run_queue_t rq = &processor->grrr_runq; + + grrr_remove(rq, thread); + } else { + /* + * The thread left the run queue before we could + * lock the run queue. + */ + assert(thread->runq == PROCESSOR_NULL); + processor = PROCESSOR_NULL; + } + + simple_unlock(rqlock); + + return (processor != PROCESSOR_NULL); +} + +/* TRUE when this processor's GRRR run queue holds no threads. The parameter is read, so it is not tagged __unused. */ +static boolean_t +sched_grrr_processor_queue_empty(processor_t processor) +{ + boolean_t result; + + result = (processor->grrr_runq.count == 0); + + return result; +} + +/* Reports whether any group from the one containing this priority upward has queued threads; gte is ignored. */ +static boolean_t +sched_grrr_processor_queue_has_priority(processor_t processor, + int priority, + boolean_t gte __unused) +{ + grrr_run_queue_t rq = &processor->grrr_runq; + unsigned int i; + + i = grrr_group_mapping[grrr_priority_mapping[priority]]; + for ( ; i < NUM_GRRR_GROUPS; i++) { + if (rq->groups[i].count > 0) + return (TRUE); + } + + return (FALSE); +} + +/* Implement sched_preempt_pri in code */ +static boolean_t +sched_grrr_priority_is_urgent(int priority) +{ + if (priority <= BASEPRI_FOREGROUND) + return FALSE; + + if (priority < MINPRI_KERNEL) + return TRUE; + + if (priority >= BASEPRI_PREEMPT) + return TRUE; + + return FALSE; +} + +static ast_t
+sched_grrr_processor_csw_check(processor_t processor) +{ + int count; + + count = sched_grrr_processor_runq_count(processor); + + if (count > 0) { + + return AST_PREEMPT; + } + + return AST_NONE; +} + +static uint32_t +sched_grrr_initial_quantum_size(thread_t thread __unused) +{ + return grrr_quantum; +} + +static sched_mode_t +sched_grrr_initial_thread_sched_mode(task_t parent_task) +{ + if (parent_task == kernel_task) + return TH_MODE_FIXED; + else + return TH_MODE_TIMESHARE; +} + +static boolean_t +sched_grrr_supports_timeshare_mode(void) +{ + return TRUE; +} + +static boolean_t +sched_grrr_can_update_priority(thread_t thread __unused) +{ + return FALSE; +} + +static void +sched_grrr_update_priority(thread_t thread __unused) +{ + +} + +static void +sched_grrr_lightweight_update_priority(thread_t thread __unused) +{ + return; +} + +static void +sched_grrr_quantum_expire( + thread_t thread __unused) +{ +} + + +static boolean_t +sched_grrr_should_current_thread_rechoose_processor(processor_t processor __unused) +{ + return (TRUE); +} + +static int +sched_grrr_processor_runq_count(processor_t processor) +{ + return processor->grrr_runq.count; +} + +static uint64_t +sched_grrr_processor_runq_stats_count_sum(processor_t processor) +{ + return processor->grrr_runq.runq_stats.count_sum; +} + +#endif /* defined(CONFIG_SCHED_GRRR) */ + +#if defined(CONFIG_SCHED_GRRR_CORE) + +static void +grrr_priority_mapping_init(void) +{ + unsigned int i; + + /* Map 0->0 up to 10->20 */ + for (i=0; i <= 10; i++) { + grrr_priority_mapping[i] = 2*i; + } + + /* Map user priorities 11->33 up to 51 -> 153 */ + for (i=11; i <= 51; i++) { + grrr_priority_mapping[i] = 3*i; + } + + /* Map high priorities 52->180 up to 127->255 */ + for (i=52; i <= 127; i++) { + grrr_priority_mapping[i] = 128 + i; + } + + for (i = 0; i < NUM_GRRR_PROPORTIONAL_PRIORITIES; i++) { + +#if 0 + unsigned j, k; + /* Calculate log(i); */ + for (j=0, k=1; k <= i; j++, k *= 2); +#endif + + /* Groups of 4 */ + 
grrr_group_mapping[i] = i >> 2; + } + +} + +static thread_t +grrr_intragroup_schedule(grrr_group_t group) +{ + thread_t thread; + + if (group->count == 0) { + return THREAD_NULL; + } + + thread = group->current_client; + if (thread == THREAD_NULL) { + thread = (thread_t)queue_first(&group->clients); + } + + if (1 /* deficit */) { + group->current_client = (thread_t)queue_next((queue_entry_t)thread); + if (queue_end(&group->clients, (queue_entry_t)group->current_client)) { + group->current_client = (thread_t)queue_first(&group->clients); + } + + thread = group->current_client; + } + + return thread; +} + +static thread_t +grrr_intergroup_schedule(grrr_run_queue_t rq) +{ + thread_t thread; + grrr_group_t group; + + if (rq->count == 0) { + return THREAD_NULL; + } + + group = rq->current_group; + + if (group == GRRR_GROUP_NULL) { + group = (grrr_group_t)queue_first(&rq->sorted_group_list); + } + + thread = grrr_intragroup_schedule(group); + + if ((group->work >= (UINT32_MAX-256)) || (rq->last_rescale_tick != grrr_rescale_tick)) { + grrr_rescale_work(rq); + } + group->work++; + + if (queue_end(&rq->sorted_group_list, queue_next((queue_entry_t)group))) { + /* last group, go back to beginning */ + group = (grrr_group_t)queue_first(&rq->sorted_group_list); + } else { + grrr_group_t nextgroup = (grrr_group_t)queue_next((queue_entry_t)group); + uint64_t orderleft, orderright; + + /* + * The well-ordering condition for intergroup selection is: + * + * (group->work+1) / (nextgroup->work+1) > (group->weight) / (nextgroup->weight) + * + * Multiply both sides by their denominators to avoid division + * + */ + orderleft = (group->work + 1) * ((uint64_t)nextgroup->weight); + orderright = (nextgroup->work + 1) * ((uint64_t)group->weight); + if (orderleft > orderright) { + group = nextgroup; + } else { + group = (grrr_group_t)queue_first(&rq->sorted_group_list); + } + } + + rq->current_group = group; + + return thread; +} + +static void +grrr_runqueue_init(grrr_run_queue_t runq) +{ + 
grrr_group_index_t index; + + runq->count = 0; + + for (index = 0; index < NUM_GRRR_GROUPS; index++) { + unsigned int prisearch; + + for (prisearch = 0; + prisearch < NUM_GRRR_PROPORTIONAL_PRIORITIES; + prisearch++) { + if (grrr_group_mapping[prisearch] == index) { + runq->groups[index].minpriority = (grrr_proportional_priority_t)prisearch; + break; + } + } + + runq->groups[index].index = index; + + queue_init(&runq->groups[index].clients); + runq->groups[index].count = 0; + runq->groups[index].weight = 0; + runq->groups[index].work = 0; + runq->groups[index].current_client = THREAD_NULL; + } + + queue_init(&runq->sorted_group_list); + runq->weight = 0; + runq->current_group = GRRR_GROUP_NULL; +} + +static void +grrr_rescale_work(grrr_run_queue_t rq) +{ + grrr_group_index_t index; + + /* avoid overflow by scaling by 1/8th */ + for (index = 0; index < NUM_GRRR_GROUPS; index++) { + rq->groups[index].work >>= 3; + } + + rq->last_rescale_tick = grrr_rescale_tick; +} + +/* Add thread to the group chosen by its sched_pri and account for it in that group's count and weight. */ +static boolean_t +grrr_enqueue( + grrr_run_queue_t rq, + thread_t thread) +{ + grrr_proportional_priority_t gpriority; + grrr_group_index_t gindex; + grrr_group_t group; + + gpriority = grrr_priority_mapping[thread->sched_pri]; + gindex = grrr_group_mapping[gpriority]; + group = &rq->groups[gindex]; + +#if 0 + thread->grrr_deficit = 0; +#endif + + /* Record the pre-insert queue depth on every enqueue, including the first client of an empty group, matching grrr_select() and grrr_remove(); rq->count is only incremented at the end of this function. */ + SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); + + if (group->count == 0) { + /* Empty group, this is the first client */ + enqueue_tail(&group->clients, (queue_entry_t)thread); + group->count = 1; + group->weight = gpriority; + group->current_client = thread; + } else { + /* Insert before the current client */ + if (group->current_client == THREAD_NULL || + queue_first(&group->clients) == (queue_entry_t)group->current_client) { + enqueue_head(&group->clients, (queue_entry_t)thread); + } else { + insque((queue_entry_t)thread, queue_prev((queue_entry_t)group->current_client)); + } + group->count++; + group->weight += gpriority; + + /* Since there was already
a client, this is on the per-processor sorted list already */ + remqueue((queue_entry_t)group); + } + + grrr_sorted_list_insert_group(rq, group); + + rq->count++; + rq->weight += gpriority; + + return (FALSE); +} + +static thread_t +grrr_select(grrr_run_queue_t rq) +{ + thread_t thread; + + thread = grrr_intergroup_schedule(rq); + if (thread != THREAD_NULL) { + grrr_proportional_priority_t gpriority; + grrr_group_index_t gindex; + grrr_group_t group; + + gpriority = grrr_priority_mapping[thread->sched_pri]; + gindex = grrr_group_mapping[gpriority]; + group = &rq->groups[gindex]; + + remqueue((queue_entry_t)thread); + SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); + group->count--; + group->weight -= gpriority; + if (group->current_client == thread) { + group->current_client = THREAD_NULL; + } + + remqueue((queue_entry_t)group); + if (group->count == 0) { + if (rq->current_group == group) { + rq->current_group = GRRR_GROUP_NULL; + } + } else { + /* Need to re-insert in sorted location */ + grrr_sorted_list_insert_group(rq, group); + } + + rq->count--; + rq->weight -= gpriority; + + thread->runq = PROCESSOR_NULL; + } + + + return (thread); +} + +static void +grrr_remove( + grrr_run_queue_t rq, + thread_t thread) +{ + grrr_proportional_priority_t gpriority; + grrr_group_index_t gindex; + grrr_group_t group; + + gpriority = grrr_priority_mapping[thread->sched_pri]; + gindex = grrr_group_mapping[gpriority]; + group = &rq->groups[gindex]; + + remqueue((queue_entry_t)thread); + SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); + group->count--; + group->weight -= gpriority; + if (group->current_client == thread) { + group->current_client = THREAD_NULL; + } + + remqueue((queue_entry_t)group); + if (group->count == 0) { + if (rq->current_group == group) { + rq->current_group = GRRR_GROUP_NULL; + } + } else { + /* Need to re-insert in sorted location */ + grrr_sorted_list_insert_group(rq, group); + } + + rq->count--; + rq->weight -= gpriority; + + thread->runq = 
PROCESSOR_NULL; +} + +/* Insert group into rq->sorted_group_list, which is kept ordered from the head (heaviest weight) downward, with the group index breaking ties between equal weights. */ +static void +grrr_sorted_list_insert_group(grrr_run_queue_t rq, + grrr_group_t group) +{ + grrr_group_t search_group; + + /* Simple insertion sort: start from the head (heaviest weight) and stop at the first element that sorts after us. */ + search_group = (grrr_group_t)queue_first(&rq->sorted_group_list); + while (!queue_end(&rq->sorted_group_list, (queue_entry_t)search_group)) { + if ((search_group->weight < group->weight) || + ((search_group->weight == group->weight) && + (search_group->index < group->index))) { + /* Link in right behind search_group's predecessor. That predecessor is the list head when we outrank the first element, so insert here: stepping back and re-testing with queue_end() would mistake that case for running off the end and append us at the tail. */ + insque((queue_entry_t)group, queue_prev((queue_entry_t)search_group)); + return; + } + + /* otherwise, our weight is too small, keep going */ + search_group = (grrr_group_t)queue_next((queue_entry_t)search_group); + } + + /* Empty list, or every existing group sorts ahead of us: append */ + enqueue_tail(&rq->sorted_group_list, (queue_entry_t)group); +} + +#endif /* defined(CONFIG_SCHED_GRRR_CORE) */ + +#if defined(CONFIG_SCHED_GRRR) || defined(CONFIG_SCHED_FIXEDPRIORITY) + +static struct grrr_run_queue fs_grrr_runq; +#define FS_GRRR_RUNQ ((processor_t)-2) +decl_simple_lock_data(static,fs_grrr_lock); + +void +sched_grrr_fairshare_init(void) +{ + grrr_priority_mapping_init(); + + simple_lock_init(&fs_grrr_lock, 0); + grrr_runqueue_init(&fs_grrr_runq); +} + + +int +sched_grrr_fairshare_runq_count(void) +{ + return fs_grrr_runq.count; +} + +uint64_t +sched_grrr_fairshare_runq_stats_count_sum(void) +{ + return fs_grrr_runq.runq_stats.count_sum; +} + +void +sched_grrr_fairshare_enqueue(thread_t thread) +{ + simple_lock(&fs_grrr_lock); + +
(void)grrr_enqueue(&fs_grrr_runq, thread); + + thread->runq = FS_GRRR_RUNQ; + + simple_unlock(&fs_grrr_lock); +} + +thread_t sched_grrr_fairshare_dequeue(void) +{ + thread_t thread; + + simple_lock(&fs_grrr_lock); + if (fs_grrr_runq.count > 0) { + thread = grrr_select(&fs_grrr_runq); + + simple_unlock(&fs_grrr_lock); + + return (thread); + } + simple_unlock(&fs_grrr_lock); + + return THREAD_NULL; +} + +boolean_t sched_grrr_fairshare_queue_remove(thread_t thread) +{ + + simple_lock(&fs_grrr_lock); + + if (FS_GRRR_RUNQ == thread->runq) { + grrr_remove(&fs_grrr_runq, thread); + + simple_unlock(&fs_grrr_lock); + return (TRUE); + } + else { + /* + * The thread left the run queue before we could + * lock the run queue. + */ + assert(thread->runq == PROCESSOR_NULL); + simple_unlock(&fs_grrr_lock); + return (FALSE); + } +} + +#endif /* defined(CONFIG_SCHED_GRRR) || defined(CONFIG_SCHED_FIXEDPRIORITY) */ diff --git a/osfmk/kern/sched_prim.c b/osfmk/kern/sched_prim.c index dddaf47d7..c73ef0f3d 100644 --- a/osfmk/kern/sched_prim.c +++ b/osfmk/kern/sched_prim.c @@ -73,10 +73,12 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -107,10 +109,16 @@ #include -struct run_queue rt_runq; +struct rt_queue rt_runq; #define RT_RUNQ ((processor_t)-1) decl_simple_lock_data(static,rt_lock); +#if defined(CONFIG_SCHED_TRADITIONAL) || defined(CONFIG_SCHED_PROTO) || defined(CONFIG_SCHED_GRRR) || defined(CONFIG_SCHED_FIXEDPRIORITY) +static struct fairshare_queue fs_runq; +#define FS_RUNQ ((processor_t)-2) +decl_simple_lock_data(static,fs_lock); +#endif + #define DEFAULT_PREEMPTION_RATE 100 /* (1/s) */ int default_preemption_rate = DEFAULT_PREEMPTION_RATE; @@ -123,57 +131,203 @@ int max_poll_quanta = MAX_POLL_QUANTA; #define SCHED_POLL_YIELD_SHIFT 4 /* 1/16 */ int sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT; -uint64_t max_unsafe_computation; -uint32_t sched_safe_duration; uint64_t max_poll_computation; +uint64_t max_unsafe_computation; 
+uint64_t sched_safe_duration; + +#if defined(CONFIG_SCHED_TRADITIONAL) + uint32_t std_quantum; uint32_t min_std_quantum; uint32_t std_quantum_us; +#endif /* CONFIG_SCHED_TRADITIONAL */ + +uint32_t thread_depress_time; +uint32_t default_timeshare_computation; +uint32_t default_timeshare_constraint; + uint32_t max_rt_quantum; uint32_t min_rt_quantum; uint32_t sched_cswtime; +#if defined(CONFIG_SCHED_TRADITIONAL) + unsigned sched_tick; uint32_t sched_tick_interval; uint32_t sched_pri_shift = INT8_MAX; uint32_t sched_fixed_shift; +static boolean_t sched_traditional_use_pset_runqueue = FALSE; + +__attribute__((always_inline)) +static inline run_queue_t runq_for_processor(processor_t processor) +{ + if (sched_traditional_use_pset_runqueue) + return &processor->processor_set->pset_runq; + else + return &processor->runq; +} + +__attribute__((always_inline)) +static inline void runq_consider_incr_bound_count(processor_t processor, thread_t thread) +{ + if (thread->bound_processor == PROCESSOR_NULL) + return; + + assert(thread->bound_processor == processor); + + if (sched_traditional_use_pset_runqueue) + processor->processor_set->pset_runq_bound_count++; + + processor->runq_bound_count++; +} + +__attribute__((always_inline)) +static inline void runq_consider_decr_bound_count(processor_t processor, thread_t thread) +{ + if (thread->bound_processor == PROCESSOR_NULL) + return; + + assert(thread->bound_processor == processor); + + if (sched_traditional_use_pset_runqueue) + processor->processor_set->pset_runq_bound_count--; + + processor->runq_bound_count--; +} + +#endif /* CONFIG_SCHED_TRADITIONAL */ + +uint64_t sched_one_second_interval; + uint32_t sched_run_count, sched_share_count; uint32_t sched_load_average, sched_mach_factor; /* Forwards */ + +#if defined(CONFIG_SCHED_TRADITIONAL) + static void load_shift_init(void) __attribute__((section("__TEXT, initcode"))); static void preempt_pri_init(void) __attribute__((section("__TEXT, initcode"))); -static thread_t 
run_queue_dequeue( - run_queue_t runq, - integer_t options); +#endif /* CONFIG_SCHED_TRADITIONAL */ -static thread_t choose_thread( - processor_t processor, - int priority); +static thread_t thread_select( + thread_t thread, + processor_t processor); +#if CONFIG_SCHED_IDLE_IN_PLACE static thread_t thread_select_idle( thread_t thread, processor_t processor); +#endif -static thread_t processor_idle( +thread_t processor_idle( thread_t thread, processor_t processor); +#if defined(CONFIG_SCHED_TRADITIONAL) + static thread_t steal_thread( processor_set_t pset); +static thread_t steal_thread_disabled( + processor_set_t pset) __attribute__((unused)); + + static thread_t steal_processor_thread( processor_t processor); static void thread_update_scan(void); +static void processor_setrun( + processor_t processor, + thread_t thread, + integer_t options); + +static boolean_t +processor_enqueue( + processor_t processor, + thread_t thread, + integer_t options); + +static boolean_t +processor_queue_remove( + processor_t processor, + thread_t thread); + +static boolean_t processor_queue_empty(processor_t processor); + +static boolean_t priority_is_urgent(int priority); + +static ast_t processor_csw_check(processor_t processor); + +static boolean_t processor_queue_has_priority(processor_t processor, + int priority, + boolean_t gte); + +static boolean_t should_current_thread_rechoose_processor(processor_t processor); + +static int sched_traditional_processor_runq_count(processor_t processor); + +static boolean_t sched_traditional_with_pset_runqueue_processor_queue_empty(processor_t processor); + +static uint64_t sched_traditional_processor_runq_stats_count_sum(processor_t processor); + +static uint64_t sched_traditional_with_pset_runqueue_processor_runq_stats_count_sum(processor_t processor); +#endif + + +#if defined(CONFIG_SCHED_TRADITIONAL) + +static void +sched_traditional_init(void); + +static void +sched_traditional_timebase_init(void); + +static void 
+sched_traditional_processor_init(processor_t processor); + +static void +sched_traditional_pset_init(processor_set_t pset); + +static void +sched_traditional_with_pset_runqueue_init(void); + +#endif + +static void +sched_realtime_init(void) __attribute__((section("__TEXT, initcode"))); + +static void +sched_realtime_timebase_init(void); + +#if defined(CONFIG_SCHED_TRADITIONAL) +static void +sched_traditional_tick_continue(void); + +static uint32_t +sched_traditional_initial_quantum_size(thread_t thread); + +static sched_mode_t +sched_traditional_initial_thread_sched_mode(task_t parent_task); + +static boolean_t +sched_traditional_supports_timeshare_mode(void); + +static thread_t +sched_traditional_choose_thread( + processor_t processor, + int priority); + +#endif + #if DEBUG extern int debug_task; #define TLOG(a, fmt, args...) if(debug_task & a) kprintf(fmt, ## args) @@ -214,11 +368,221 @@ boolean_t thread_runnable( * */ +#if defined(CONFIG_SCHED_TRADITIONAL) int8_t sched_load_shifts[NRQS]; int sched_preempt_pri[NRQBM]; +#endif + + +#if defined(CONFIG_SCHED_TRADITIONAL) + +const struct sched_dispatch_table sched_traditional_dispatch = { + sched_traditional_init, + sched_traditional_timebase_init, + sched_traditional_processor_init, + sched_traditional_pset_init, + sched_traditional_tick_continue, + sched_traditional_choose_thread, + steal_thread, + compute_priority, + choose_processor, + processor_enqueue, + processor_queue_shutdown, + processor_queue_remove, + processor_queue_empty, + priority_is_urgent, + processor_csw_check, + processor_queue_has_priority, + sched_traditional_initial_quantum_size, + sched_traditional_initial_thread_sched_mode, + sched_traditional_supports_timeshare_mode, + can_update_priority, + update_priority, + lightweight_update_priority, + sched_traditional_quantum_expire, + should_current_thread_rechoose_processor, + sched_traditional_processor_runq_count, + sched_traditional_processor_runq_stats_count_sum, + 
sched_traditional_fairshare_init, + sched_traditional_fairshare_runq_count, + sched_traditional_fairshare_runq_stats_count_sum, + sched_traditional_fairshare_enqueue, + sched_traditional_fairshare_dequeue, + sched_traditional_fairshare_queue_remove, + TRUE /* direct_dispatch_to_idle_processors */ +}; + +const struct sched_dispatch_table sched_traditional_with_pset_runqueue_dispatch = { + sched_traditional_with_pset_runqueue_init, + sched_traditional_timebase_init, + sched_traditional_processor_init, + sched_traditional_pset_init, + sched_traditional_tick_continue, + sched_traditional_choose_thread, + steal_thread, + compute_priority, + choose_processor, + processor_enqueue, + processor_queue_shutdown, + processor_queue_remove, + sched_traditional_with_pset_runqueue_processor_queue_empty, + priority_is_urgent, + processor_csw_check, + processor_queue_has_priority, + sched_traditional_initial_quantum_size, + sched_traditional_initial_thread_sched_mode, + sched_traditional_supports_timeshare_mode, + can_update_priority, + update_priority, + lightweight_update_priority, + sched_traditional_quantum_expire, + should_current_thread_rechoose_processor, + sched_traditional_processor_runq_count, + sched_traditional_with_pset_runqueue_processor_runq_stats_count_sum, + sched_traditional_fairshare_init, + sched_traditional_fairshare_runq_count, + sched_traditional_fairshare_runq_stats_count_sum, + sched_traditional_fairshare_enqueue, + sched_traditional_fairshare_dequeue, + sched_traditional_fairshare_queue_remove, + FALSE /* direct_dispatch_to_idle_processors */ +}; + +#endif + +const struct sched_dispatch_table *sched_current_dispatch = NULL; + +/* + * Statically allocate a buffer to hold the longest possible + * scheduler description string, as currently implemented. + * bsd/kern/kern_sysctl.c has a corresponding definition in bsd/ + * to export to userspace via sysctl(3). If either version + * changes, update the other. 
+ * + * Note that in addition to being an upper bound on the strings + * in the kernel, it's also an exact parameter to PE_get_default(), + * which interrogates the device tree on some platforms. That + * API requires the caller know the exact size of the device tree + * property, so we need both a legacy size (32) and the current size + * (48) to deal with old and new device trees. The device tree property + * is similarly padded to a fixed size so that the same kernel image + * can run on multiple devices with different schedulers configured + * in the device tree. + */ +#define SCHED_STRING_MAX_LENGTH (48) + +char sched_string[SCHED_STRING_MAX_LENGTH]; +static enum sched_enum _sched_enum = sched_enum_unknown; void sched_init(void) +{ + char sched_arg[SCHED_STRING_MAX_LENGTH] = { '\0' }; + + /* Check for runtime selection of the scheduler algorithm */ + if (!PE_parse_boot_argn("sched", sched_arg, sizeof (sched_arg))) { + /* If no boot-args override, look in device tree */ + if (!PE_get_default("kern.sched", sched_arg, + SCHED_STRING_MAX_LENGTH)) { + sched_arg[0] = '\0'; + } + } + + if (strlen(sched_arg) > 0) { + if (0) { + /* Allow pattern below */ +#if defined(CONFIG_SCHED_TRADITIONAL) + } else if (0 == strcmp(sched_arg, kSchedTraditionalString)) { + sched_current_dispatch = &sched_traditional_dispatch; + _sched_enum = sched_enum_traditional; + strlcpy(sched_string, kSchedTraditionalString, sizeof(sched_string)); + kprintf("Scheduler: Runtime selection of %s\n", kSchedTraditionalString); + } else if (0 == strcmp(sched_arg, kSchedTraditionalWithPsetRunqueueString)) { + sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch; + _sched_enum = sched_enum_traditional_with_pset_runqueue; + strlcpy(sched_string, kSchedTraditionalWithPsetRunqueueString, sizeof(sched_string)); + kprintf("Scheduler: Runtime selection of %s\n", kSchedTraditionalWithPsetRunqueueString); +#endif +#if defined(CONFIG_SCHED_PROTO) + } else if (0 == strcmp(sched_arg, 
kSchedProtoString)) { + sched_current_dispatch = &sched_proto_dispatch; + _sched_enum = sched_enum_proto; + strlcpy(sched_string, kSchedProtoString, sizeof(sched_string)); + kprintf("Scheduler: Runtime selection of %s\n", kSchedProtoString); +#endif +#if defined(CONFIG_SCHED_GRRR) + } else if (0 == strcmp(sched_arg, kSchedGRRRString)) { + sched_current_dispatch = &sched_grrr_dispatch; + _sched_enum = sched_enum_grrr; + strlcpy(sched_string, kSchedGRRRString, sizeof(sched_string)); + kprintf("Scheduler: Runtime selection of %s\n", kSchedGRRRString); +#endif +#if defined(CONFIG_SCHED_FIXEDPRIORITY) + } else if (0 == strcmp(sched_arg, kSchedFixedPriorityString)) { + sched_current_dispatch = &sched_fixedpriority_dispatch; + _sched_enum = sched_enum_fixedpriority; + strlcpy(sched_string, kSchedFixedPriorityString, sizeof(sched_string)); + kprintf("Scheduler: Runtime selection of %s\n", kSchedFixedPriorityString); + } else if (0 == strcmp(sched_arg, kSchedFixedPriorityWithPsetRunqueueString)) { + sched_current_dispatch = &sched_fixedpriority_with_pset_runqueue_dispatch; + _sched_enum = sched_enum_fixedpriority_with_pset_runqueue; + strlcpy(sched_string, kSchedFixedPriorityWithPsetRunqueueString, sizeof(sched_string)); + kprintf("Scheduler: Runtime selection of %s\n", kSchedFixedPriorityWithPsetRunqueueString); +#endif + } else { + panic("Unrecognized scheduler algorithm: %s", sched_arg); + } + } else { +#if defined(CONFIG_SCHED_TRADITIONAL) + sched_current_dispatch = &sched_traditional_dispatch; + _sched_enum = sched_enum_traditional; + strlcpy(sched_string, kSchedTraditionalString, sizeof(sched_string)); + kprintf("Scheduler: Default of %s\n", kSchedTraditionalString); +#elif defined(CONFIG_SCHED_PROTO) + sched_current_dispatch = &sched_proto_dispatch; + _sched_enum = sched_enum_proto; + strlcpy(sched_string, kSchedProtoString, sizeof(sched_string)); + kprintf("Scheduler: Default of %s\n", kSchedProtoString); +#elif defined(CONFIG_SCHED_GRRR) + sched_current_dispatch = 
&sched_grrr_dispatch; + _sched_enum = sched_enum_grrr; + strlcpy(sched_string, kSchedGRRRString, sizeof(sched_string)); + kprintf("Scheduler: Default of %s\n", kSchedGRRRString); +#elif defined(CONFIG_SCHED_FIXEDPRIORITY) + sched_current_dispatch = &sched_fixedpriority_dispatch; + _sched_enum = sched_enum_fixedpriority; + strlcpy(sched_string, kSchedFixedPriorityString, sizeof(sched_string)); + kprintf("Scheduler: Default of %s\n", kSchedFixedPriorityString); +#else +#error No default scheduler implementation +#endif + } + + SCHED(init)(); + SCHED(fairshare_init)(); + sched_realtime_init(); + ast_init(); + + SCHED(pset_init)(&pset0); + SCHED(processor_init)(master_processor); +} + +void +sched_timebase_init(void) +{ + uint64_t abstime; + + clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC, &abstime); + sched_one_second_interval = abstime; + + SCHED(timebase_init)(); + sched_realtime_timebase_init(); +} + +#if defined(CONFIG_SCHED_TRADITIONAL) + +static void +sched_traditional_init(void) { /* * Calculate the timeslicing quantum @@ -230,19 +594,13 @@ sched_init(void) printf("standard timeslicing quantum is %d us\n", std_quantum_us); - sched_safe_duration = (2 * max_unsafe_quanta / default_preemption_rate) * - (1 << SCHED_TICK_SHIFT); - load_shift_init(); preempt_pri_init(); - simple_lock_init(&rt_lock, 0); - run_queue_init(&rt_runq); sched_tick = 0; - ast_init(); } -void -sched_timebase_init(void) +static void +sched_traditional_timebase_init(void) { uint64_t abstime; uint32_t shift; @@ -258,17 +616,6 @@ sched_timebase_init(void) assert((abstime >> 32) == 0 && (uint32_t)abstime != 0); min_std_quantum = (uint32_t)abstime; - /* smallest rt computaton (50 us) */ - clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime); - assert((abstime >> 32) == 0 && (uint32_t)abstime != 0); - min_rt_quantum = (uint32_t)abstime; - - /* maximum rt computation (50 ms) */ - clock_interval_to_absolutetime_interval( - 50, 1000*NSEC_PER_USEC, &abstime); - assert((abstime 
>> 32) == 0 && (uint32_t)abstime != 0); - max_rt_quantum = (uint32_t)abstime; - /* scheduler tick interval */ clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT, NSEC_PER_USEC, &abstime); @@ -285,9 +632,82 @@ sched_timebase_init(void) sched_fixed_shift = shift; max_unsafe_computation = max_unsafe_quanta * std_quantum; + sched_safe_duration = 2 * max_unsafe_quanta * std_quantum; + max_poll_computation = max_poll_quanta * std_quantum; + thread_depress_time = 1 * std_quantum; + default_timeshare_computation = std_quantum / 2; + default_timeshare_constraint = std_quantum; + +} + +static void +sched_traditional_processor_init(processor_t processor) +{ + if (!sched_traditional_use_pset_runqueue) { + run_queue_init(&processor->runq); + } + processor->runq_bound_count = 0; +} + +static void +sched_traditional_pset_init(processor_set_t pset) +{ + if (sched_traditional_use_pset_runqueue) { + run_queue_init(&pset->pset_runq); + } + pset->pset_runq_bound_count = 0; +} + +static void +sched_traditional_with_pset_runqueue_init(void) +{ + sched_traditional_init(); + sched_traditional_use_pset_runqueue = TRUE; +} + +#endif /* CONFIG_SCHED_TRADITIONAL */ + +#if defined(CONFIG_SCHED_TRADITIONAL) || defined(CONFIG_SCHED_PROTO) || defined(CONFIG_SCHED_GRRR) || defined(CONFIG_SCHED_FIXEDPRIORITY) +void +sched_traditional_fairshare_init(void) +{ + simple_lock_init(&fs_lock, 0); + + fs_runq.count = 0; + queue_init(&fs_runq.queue); +} +#endif + +static void +sched_realtime_init(void) +{ + simple_lock_init(&rt_lock, 0); + + rt_runq.count = 0; + queue_init(&rt_runq.queue); } +static void +sched_realtime_timebase_init(void) +{ + uint64_t abstime; + + /* smallest rt computaton (50 us) */ + clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime); + assert((abstime >> 32) == 0 && (uint32_t)abstime != 0); + min_rt_quantum = (uint32_t)abstime; + + /* maximum rt computation (50 ms) */ + clock_interval_to_absolutetime_interval( + 50, 1000*NSEC_PER_USEC, &abstime); 
+ assert((abstime >> 32) == 0 && (uint32_t)abstime != 0); + max_rt_quantum = (uint32_t)abstime; + +} + +#if defined(CONFIG_SCHED_TRADITIONAL) + /* * Set up values for timeshare * loading factors. @@ -318,6 +738,8 @@ preempt_pri_init(void) setbit(i, p); } +#endif /* CONFIG_SCHED_TRADITIONAL */ + /* * Thread wait timer expiration. */ @@ -363,7 +785,7 @@ thread_set_timer( thread_lock(thread); if ((thread->state & TH_WAIT) != 0) { clock_interval_to_deadline(interval, scale_factor, &deadline); - if (!timer_call_enter(&thread->wait_timer, deadline)) + if (!timer_call_enter(&thread->wait_timer, deadline, thread->sched_pri >= BASEPRI_RTQUEUES ? TIMER_CALL_CRITICAL : 0)) thread->wait_timer_active++; thread->wait_timer_is_set = TRUE; } @@ -381,7 +803,7 @@ thread_set_timer_deadline( s = splsched(); thread_lock(thread); if ((thread->state & TH_WAIT) != 0) { - if (!timer_call_enter(&thread->wait_timer, deadline)) + if (!timer_call_enter(&thread->wait_timer, deadline, thread->sched_pri >= BASEPRI_RTQUEUES ? TIMER_CALL_CRITICAL : 0)) thread->wait_timer_active++; thread->wait_timer_is_set = TRUE; } @@ -453,26 +875,31 @@ thread_unblock( * Update run counts. */ sched_run_incr(); - if (thread->sched_mode & TH_MODE_TIMESHARE) + if (thread->sched_mode == TH_MODE_TIMESHARE) sched_share_incr(); } else { /* * Signal if idling on another processor. */ +#if CONFIG_SCHED_IDLE_IN_PLACE if (thread->state & TH_IDLE) { processor_t processor = thread->last_processor; if (processor != current_processor()) machine_signal_idle(processor); } +#else + assert((thread->state & TH_IDLE) == 0); +#endif + result = TRUE; } /* * Calculate deadline for real-time threads. 
*/ - if (thread->sched_mode & TH_MODE_REALTIME) { + if (thread->sched_mode == TH_MODE_REALTIME) { thread->realtime.deadline = mach_absolute_time(); thread->realtime.deadline += thread->realtime.constraint; } @@ -554,9 +981,9 @@ thread_mark_wait_locked( at_safe_point = (interruptible == THREAD_ABORTSAFE); if ( interruptible == THREAD_UNINT || - !(thread->sched_mode & TH_MODE_ABORT) || + !(thread->sched_flags & TH_SFLAG_ABORT) || (!at_safe_point && - (thread->sched_mode & TH_MODE_ABORTSAFELY))) { + (thread->sched_flags & TH_SFLAG_ABORTSAFELY))) { DTRACE_SCHED(sleep); @@ -565,8 +992,8 @@ thread_mark_wait_locked( return (thread->wait_result = THREAD_WAITING); } else - if (thread->sched_mode & TH_MODE_ABORTSAFELY) - thread->sched_mode &= ~TH_MODE_ISABORTED; + if (thread->sched_flags & TH_SFLAG_ABORTSAFELY) + thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK; return (thread->wait_result = THREAD_INTERRUPTED); } @@ -1033,7 +1460,18 @@ kern_return_t thread_wakeup_prim( event_t event, boolean_t one_thread, - wait_result_t result) + wait_result_t result) +{ + return (thread_wakeup_prim_internal(event, one_thread, result, -1)); +} + + +kern_return_t +thread_wakeup_prim_internal( + event_t event, + boolean_t one_thread, + wait_result_t result, + int priority) { register wait_queue_t wq; register int index; @@ -1041,9 +1479,9 @@ thread_wakeup_prim( index = wait_hash(event); wq = &wait_queues[index]; if (one_thread) - return (wait_queue_wakeup_one(wq, event, result)); + return (wait_queue_wakeup_one(wq, event, result, priority)); else - return (wait_queue_wakeup_all(wq, event, result)); + return (wait_queue_wakeup_all(wq, event, result)); } /* @@ -1092,17 +1530,23 @@ thread_select( thread_t new_thread = THREAD_NULL; boolean_t inactive_state; + assert(processor == current_processor()); + do { /* * Update the priority. 
*/ - if (thread->sched_stamp != sched_tick) - update_priority(thread); - + if (SCHED(can_update_priority)(thread)) + SCHED(update_priority)(thread); + processor->current_pri = thread->sched_pri; + processor->current_thmode = thread->sched_mode; pset_lock(pset); + assert(pset->low_count); + assert(pset->low_pri); + inactive_state = processor->state != PROCESSOR_SHUTDOWN && machine_processor_is_inactive(processor); simple_lock(&rt_lock); @@ -1113,12 +1557,7 @@ thread_select( * bound to a different processor, nor be in the wrong * processor set. */ - if ( -#if CONFIG_EMBEDDED - ((thread->state & ~TH_SUSP) == TH_RUN) && -#else - thread->state == TH_RUN && -#endif + if ( ((thread->state & ~TH_SUSP) == TH_RUN) && (thread->sched_pri >= BASEPRI_RTQUEUES || processor->processor_meta == PROCESSOR_META_NULL || processor->processor_meta->primary == processor) && @@ -1128,24 +1567,16 @@ thread_select( thread->affinity_set->aset_pset == pset) ) { if ( thread->sched_pri >= BASEPRI_RTQUEUES && first_timeslice(processor) ) { - if (rt_runq.highq >= BASEPRI_RTQUEUES) { - register run_queue_t runq = &rt_runq; + if (rt_runq.count > 0) { register queue_t q; - q = runq->queues + runq->highq; + q = &rt_runq.queue; if (((thread_t)q->next)->realtime.deadline < processor->deadline) { - thread = (thread_t)q->next; - ((queue_entry_t)thread)->next->prev = q; - q->next = ((queue_entry_t)thread)->next; + thread = (thread_t)dequeue_head(q); thread->runq = PROCESSOR_NULL; - runq->count--; runq->urgency--; - assert(runq->urgency >= 0); - if (queue_empty(q)) { - if (runq->highq != IDLEPRI) - clrbit(MAXPRI - runq->highq, runq->bitmap); - runq->highq = MAXPRI - ffsbit(runq->bitmap); - } + SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count); + rt_runq.count--; } } @@ -1158,8 +1589,8 @@ thread_select( return (thread); } - if (!inactive_state && rt_runq.highq < thread->sched_pri && - (new_thread = choose_thread(processor, thread->sched_pri)) == THREAD_NULL) { + if (!inactive_state && 
(thread->sched_mode != TH_MODE_FAIRSHARE || SCHED(fairshare_runq_count)() == 0) && (rt_runq.count == 0 || BASEPRI_RTQUEUES < thread->sched_pri) && + (new_thread = SCHED(choose_thread)(processor, thread->sched_mode == TH_MODE_FAIRSHARE ? MINPRI : thread->sched_pri)) == THREAD_NULL) { simple_unlock(&rt_lock); @@ -1167,7 +1598,7 @@ thread_select( pset_pri_hint(pset, processor, processor->current_pri); - pset_count_hint(pset, processor, processor->runq.count); + pset_count_hint(pset, processor, SCHED(processor_runq_count)(processor)); processor->deadline = UINT64_MAX; @@ -1178,14 +1609,14 @@ thread_select( } if (new_thread != THREAD_NULL || - (processor->runq.highq >= rt_runq.highq && - (new_thread = choose_thread(processor, MINPRI)) != THREAD_NULL)) { + (SCHED(processor_queue_has_priority)(processor, rt_runq.count == 0 ? IDLEPRI : BASEPRI_RTQUEUES, TRUE) && + (new_thread = SCHED(choose_thread)(processor, MINPRI)) != THREAD_NULL)) { simple_unlock(&rt_lock); if (!inactive_state) { pset_pri_hint(pset, processor, new_thread->sched_pri); - pset_count_hint(pset, processor, processor->runq.count); + pset_count_hint(pset, processor, SCHED(processor_runq_count)(processor)); } processor->deadline = UINT64_MAX; @@ -1195,7 +1626,12 @@ thread_select( } if (rt_runq.count > 0) { - thread = run_queue_dequeue(&rt_runq, SCHED_HEADQ); + thread = (thread_t)dequeue_head(&rt_runq.queue); + + thread->runq = PROCESSOR_NULL; + SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count); + rt_runq.count--; + simple_unlock(&rt_lock); processor->deadline = thread->realtime.deadline; @@ -1206,6 +1642,17 @@ thread_select( simple_unlock(&rt_lock); + /* No realtime threads and no normal threads on the per-processor + * runqueue. Finally check for global fairshare threads. 
+ */ + if ((new_thread = SCHED(fairshare_dequeue)()) != THREAD_NULL) { + + processor->deadline = UINT64_MAX; + pset_unlock(pset); + + return (new_thread); + } + processor->deadline = UINT64_MAX; /* @@ -1214,10 +1661,10 @@ thread_select( */ if (inactive_state) { if (processor->state == PROCESSOR_RUNNING) - remqueue(&pset->active_queue, (queue_entry_t)processor); + remqueue((queue_entry_t)processor); else if (processor->state == PROCESSOR_IDLE) - remqueue(&pset->idle_queue, (queue_entry_t)processor); + remqueue((queue_entry_t)processor); processor->state = PROCESSOR_INACTIVE; @@ -1230,15 +1677,16 @@ thread_select( * No runnable threads, attempt to steal * from other processors. */ - new_thread = steal_thread(pset); - if (new_thread != THREAD_NULL) + new_thread = SCHED(steal_thread)(pset); + if (new_thread != THREAD_NULL) { return (new_thread); + } /* * If other threads have appeared, shortcut * around again. */ - if (processor->runq.count > 0 || rt_runq.count > 0) + if (!SCHED(processor_queue_empty)(processor) || rt_runq.count > 0 || SCHED(fairshare_runq_count)() > 0) continue; pset_lock(pset); @@ -1248,26 +1696,28 @@ thread_select( * was running. */ if (processor->state == PROCESSOR_RUNNING) { - remqueue(&pset->active_queue, (queue_entry_t)processor); + remqueue((queue_entry_t)processor); processor->state = PROCESSOR_IDLE; if (processor->processor_meta == PROCESSOR_META_NULL || processor->processor_meta->primary == processor) { enqueue_head(&pset->idle_queue, (queue_entry_t)processor); - pset->low_pri = pset->low_count = processor; + pset_pri_init_hint(pset, processor); + pset_count_init_hint(pset, processor); } else { enqueue_head(&processor->processor_meta->idle_queue, (queue_entry_t)processor); - pset_unlock(pset); - return (processor->idle_thread); + pset_unlock(pset); + return (processor->idle_thread); } } pset_unlock(pset); +#if CONFIG_SCHED_IDLE_IN_PLACE /* * Choose idle thread if fast idle is not possible. 
*/ - if ((thread->state & (TH_IDLE|TH_TERMINATE|TH_SUSP)) || !(thread->state & TH_WAIT) || thread->wake_active) + if ((thread->state & (TH_IDLE|TH_TERMINATE|TH_SUSP)) || !(thread->state & TH_WAIT) || thread->wake_active || thread->sched_pri >= BASEPRI_RTQUEUES) return (processor->idle_thread); /* @@ -1277,11 +1727,23 @@ thread_select( */ new_thread = thread_select_idle(thread, processor); +#else /* !CONFIG_SCHED_IDLE_IN_PLACE */ + + /* + * Do a full context switch to idle so that the current + * thread can start running on another processor without + * waiting for the fast-idled processor to wake up. + */ + return (processor->idle_thread); + +#endif /* !CONFIG_SCHED_IDLE_IN_PLACE */ + } while (new_thread == THREAD_NULL); return (new_thread); } +#if CONFIG_SCHED_IDLE_IN_PLACE /* * thread_select_idle: * @@ -1296,12 +1758,13 @@ thread_select_idle( { thread_t new_thread; - if (thread->sched_mode & TH_MODE_TIMESHARE) + if (thread->sched_mode == TH_MODE_TIMESHARE) sched_share_decr(); sched_run_decr(); thread->state |= TH_IDLE; processor->current_pri = IDLEPRI; + processor->current_thmode = TH_MODE_NONE; thread_unlock(thread); @@ -1309,6 +1772,7 @@ thread_select_idle( * Switch execution timing to processor idle thread. */ processor->last_dispatch = mach_absolute_time(); + thread->last_run_time = processor->last_dispatch; thread_timer_event(processor->last_dispatch, &processor->idle_thread->system_timer); PROCESSOR_DATA(processor, kernel_timer) = &processor->idle_thread->system_timer; @@ -1320,6 +1784,8 @@ thread_select_idle( (*thread->sched_call)(SCHED_CALL_BLOCK, thread); + thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0); + /* * Enable interrupts and perform idling activities. No * preemption due to TH_IDLE being set. 
@@ -1333,6 +1799,23 @@ thread_select_idle( thread_lock(thread); + /* + * If we idled in place, simulate a context switch back + * to the original priority of the thread so that the + * platform layer cannot distinguish this from a true + * switch to the idle thread. + */ + if (thread->sched_mode == TH_MODE_REALTIME) + thread_tell_urgency(THREAD_URGENCY_REAL_TIME, thread->realtime.period, thread->realtime.deadline); + /* Identify non-promoted threads which have requested a + * "background" priority. + */ + else if ((thread->sched_pri <= MAXPRI_THROTTLE) && + (thread->priority <= MAXPRI_THROTTLE)) + thread_tell_urgency(THREAD_URGENCY_BACKGROUND, thread->sched_pri, thread->priority); + else + thread_tell_urgency(THREAD_URGENCY_NORMAL, thread->sched_pri, thread->priority); + /* * If awakened, switch to thread timer and start a new quantum. * Otherwise skip; we will context switch to another thread or return here. @@ -1343,9 +1826,10 @@ thread_select_idle( PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer; thread_quantum_init(thread); + thread->last_quantum_refill_time = processor->last_dispatch; processor->quantum_end = processor->last_dispatch + thread->current_quantum; - timer_call_enter1(&processor->quantum_timer, thread, processor->quantum_end); + timer_call_enter1(&processor->quantum_timer, thread, processor->quantum_end, 0); processor->timeslice = 1; thread->computation_epoch = processor->last_dispatch; @@ -1354,11 +1838,32 @@ thread_select_idle( thread->state &= ~TH_IDLE; sched_run_incr(); - if (thread->sched_mode & TH_MODE_TIMESHARE) + if (thread->sched_mode == TH_MODE_TIMESHARE) sched_share_incr(); return (new_thread); } +#endif /* CONFIG_SCHED_IDLE_IN_PLACE */ + +#if defined(CONFIG_SCHED_TRADITIONAL) +static thread_t +sched_traditional_choose_thread( + processor_t processor, + int priority) +{ + thread_t thread; + + thread = choose_thread(processor, runq_for_processor(processor), priority); + if (thread != THREAD_NULL) { + 
runq_consider_decr_bound_count(processor, thread); + } + + return thread; +} + +#endif /* defined(CONFIG_SCHED_TRADITIONAL) */ + +#if defined(CONFIG_SCHED_TRADITIONAL) || defined(CONFIG_SCHED_FIXEDPRIORITY) /* * choose_thread: @@ -1370,12 +1875,12 @@ thread_select_idle( * Associated pset must be locked. Returns THREAD_NULL * on failure. */ -static thread_t +thread_t choose_thread( processor_t processor, + run_queue_t rq, int priority) { - run_queue_t rq = &processor->runq; queue_t queue = rq->queues + rq->highq; int pri = rq->highq, count = rq->count; thread_t thread; @@ -1385,11 +1890,12 @@ choose_thread( while (!queue_end(queue, (queue_entry_t)thread)) { if (thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor) { - remqueue(queue, (queue_entry_t)thread); + remqueue((queue_entry_t)thread); thread->runq = PROCESSOR_NULL; + SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); rq->count--; - if (testbit(pri, sched_preempt_pri)) { + if (SCHED(priority_is_urgent)(pri)) { rq->urgency--; assert(rq->urgency >= 0); } if (queue_empty(queue)) { @@ -1411,6 +1917,8 @@ choose_thread( return (THREAD_NULL); } +#endif /* defined(CONFIG_SCHED_TRADITIONAL) || defined(CONFIG_SCHED_FIXEDPRIORITY) */ + /* * Perform a context switch and start executing the new thread. * @@ -1478,7 +1986,7 @@ thread_invoke( * Allow time constraint threads to hang onto * a stack. 
*/ - if ((self->sched_mode & TH_MODE_REALTIME) && !self->reserved_stack) + if ((self->sched_mode == TH_MODE_REALTIME) && !self->reserved_stack) self->reserved_stack = self->kernel_stack; if (continuation != NULL) { @@ -1500,6 +2008,7 @@ thread_invoke( processor = current_processor(); processor->active_thread = thread; processor->current_pri = thread->sched_pri; + processor->current_thmode = thread->sched_mode; if (thread->last_processor != processor && thread->last_processor != NULL) { if (thread->last_processor->processor_set != processor->processor_set) thread->ps_switch++; @@ -1513,16 +2022,24 @@ thread_invoke( self->reason = reason; processor->last_dispatch = mach_absolute_time(); + self->last_run_time = processor->last_dispatch; thread_timer_event(processor->last_dispatch, &thread->system_timer); PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer; KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF)|DBG_FUNC_NONE, self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0); + if ((thread->chosen_processor != processor) && (thread->chosen_processor != NULL)) { + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)|DBG_FUNC_NONE, + (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0); + } + DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, thread->task->bsd_info); - TLOG(1, "thread_invoke: calling machine_stack_handoff\n"); - machine_stack_handoff(self, thread); + SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri); + + TLOG(1, "thread_invoke: calling stack_handoff\n"); + stack_handoff(self, thread); DTRACE_SCHED(on__cpu); @@ -1545,6 +2062,9 @@ thread_invoke( counter(++c_thread_invoke_same); thread_unlock(self); + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE, + self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0); + self->continuation = self->parameter = NULL; 
funnel_refunnel_check(self, 3); @@ -1571,6 +2091,10 @@ thread_invoke( ast_context(self); counter(++c_thread_invoke_same); thread_unlock(self); + + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE, + self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0); + return (TRUE); } } @@ -1581,6 +2105,7 @@ thread_invoke( processor = current_processor(); processor->active_thread = thread; processor->current_pri = thread->sched_pri; + processor->current_thmode = thread->sched_mode; if (thread->last_processor != processor && thread->last_processor != NULL) { if (thread->last_processor->processor_set != processor->processor_set) thread->ps_switch++; @@ -1597,14 +2122,22 @@ thread_invoke( self->reason = reason; processor->last_dispatch = mach_absolute_time(); + self->last_run_time = processor->last_dispatch; thread_timer_event(processor->last_dispatch, &thread->system_timer); PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer; KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE, self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0); + if ((thread->chosen_processor != processor) && (thread->chosen_processor != NULL)) { + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)|DBG_FUNC_NONE, + (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0); + } + DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, thread->task->bsd_info); + SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri); + /* * This is where we actually switch register context, * and address space if required. We will next run @@ -1671,7 +2204,7 @@ thread_dispatch( else thread->current_quantum = 0; - if (thread->sched_mode & TH_MODE_REALTIME) { + if (thread->sched_mode == TH_MODE_REALTIME) { /* * Cancel the deadline if the thread has * consumed the entire quantum. 
@@ -1681,6 +2214,7 @@ thread_dispatch( thread->reason |= AST_QUANTUM; } } else { +#if defined(CONFIG_SCHED_TRADITIONAL) /* * For non-realtime threads treat a tiny * remaining quantum as an expired quantum @@ -1690,6 +2224,7 @@ thread_dispatch( thread->reason |= AST_QUANTUM; thread->current_quantum += std_quantum; } +#endif } /* @@ -1738,7 +2273,7 @@ thread_dispatch( thread->state &= ~TH_RUN; - if (thread->sched_mode & TH_MODE_TIMESHARE) + if (thread->sched_mode == TH_MODE_TIMESHARE) sched_share_decr(); sched_run_decr(); @@ -1762,17 +2297,30 @@ thread_dispatch( } if (!(self->state & TH_IDLE)) { + + if (self->sched_mode == TH_MODE_REALTIME) + thread_tell_urgency(THREAD_URGENCY_REAL_TIME, self->realtime.period, self->realtime.deadline); + /* Identify non-promoted threads which have requested a + * "background" priority. + */ + else if ((self->sched_pri <= MAXPRI_THROTTLE) && + (self->priority <= MAXPRI_THROTTLE)) + thread_tell_urgency(THREAD_URGENCY_BACKGROUND, self->sched_pri, self->priority); + else + thread_tell_urgency(THREAD_URGENCY_NORMAL, self->sched_pri, self->priority); /* * Get a new quantum if none remaining. */ - if (self->current_quantum == 0) + if (self->current_quantum == 0) { thread_quantum_init(self); + self->last_quantum_refill_time = processor->last_dispatch; + } /* * Set up quantum timer and timeslice. 
*/ processor->quantum_end = (processor->last_dispatch + self->current_quantum); - timer_call_enter1(&processor->quantum_timer, self, processor->quantum_end); + timer_call_enter1(&processor->quantum_timer, self, processor->quantum_end, 0); processor->timeslice = 1; @@ -1781,6 +2329,8 @@ thread_dispatch( else { timer_call_cancel(&processor->quantum_timer); processor->timeslice = 0; + + thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0); } } @@ -1832,7 +2382,7 @@ thread_block_reason( self->continuation = continuation; self->parameter = parameter; - if (kdebug_thread_block && kdebug_enable && self->state != TH_RUN) { + if (__improbable(kdebug_thread_block && kdebug_enable && self->state != TH_RUN)) { uint32_t bt[8]; OSBacktrace((void **)&bt[0], 8); @@ -1944,18 +2494,52 @@ thread_continue( /*NOTREACHED*/ } -/* - * run_queue_init: - * - * Initialize a run queue before first use. - */ void -run_queue_init( - run_queue_t rq) +thread_quantum_init(thread_t thread) { - int i; + if (thread->sched_mode == TH_MODE_REALTIME) { + thread->current_quantum = thread->realtime.computation; + } else { + thread->current_quantum = SCHED(initial_quantum_size)(thread); + } +} - rq->highq = IDLEPRI; +#if defined(CONFIG_SCHED_TRADITIONAL) +static uint32_t +sched_traditional_initial_quantum_size(thread_t thread __unused) +{ + return std_quantum; +} + +static sched_mode_t +sched_traditional_initial_thread_sched_mode(task_t parent_task) +{ + if (parent_task == kernel_task) + return TH_MODE_FIXED; + else + return TH_MODE_TIMESHARE; +} + +static boolean_t +sched_traditional_supports_timeshare_mode(void) +{ + return TRUE; +} + +#endif /* CONFIG_SCHED_TRADITIONAL */ + +/* + * run_queue_init: + * + * Initialize a run queue before first use. 
+ */ +void +run_queue_init( + run_queue_t rq) +{ + int i; + + rq->highq = IDLEPRI; for (i = 0; i < NRQBM; i++) rq->bitmap[i] = 0; setbit(MAXPRI - IDLEPRI, rq->bitmap); @@ -1964,16 +2548,97 @@ run_queue_init( queue_init(&rq->queues[i]); } +#if defined(CONFIG_SCHED_TRADITIONAL) || defined(CONFIG_SCHED_PROTO) || defined(CONFIG_SCHED_GRRR) || defined(CONFIG_SCHED_FIXEDPRIORITY) +int +sched_traditional_fairshare_runq_count(void) +{ + return fs_runq.count; +} + +uint64_t +sched_traditional_fairshare_runq_stats_count_sum(void) +{ + return fs_runq.runq_stats.count_sum; +} + +void +sched_traditional_fairshare_enqueue(thread_t thread) +{ + queue_t queue = &fs_runq.queue; + + simple_lock(&fs_lock); + + enqueue_tail(queue, (queue_entry_t)thread); + + thread->runq = FS_RUNQ; + SCHED_STATS_RUNQ_CHANGE(&fs_runq.runq_stats, fs_runq.count); + fs_runq.count++; + + simple_unlock(&fs_lock); +} + +thread_t +sched_traditional_fairshare_dequeue(void) +{ + thread_t thread; + + simple_lock(&fs_lock); + if (fs_runq.count > 0) { + thread = (thread_t)dequeue_head(&fs_runq.queue); + + thread->runq = PROCESSOR_NULL; + SCHED_STATS_RUNQ_CHANGE(&fs_runq.runq_stats, fs_runq.count); + fs_runq.count--; + + simple_unlock(&fs_lock); + + return (thread); + } + simple_unlock(&fs_lock); + + return THREAD_NULL; +} + +boolean_t +sched_traditional_fairshare_queue_remove(thread_t thread) +{ + queue_t q; + + simple_lock(&fs_lock); + q = &fs_runq.queue; + + if (FS_RUNQ == thread->runq) { + remqueue((queue_entry_t)thread); + SCHED_STATS_RUNQ_CHANGE(&fs_runq.runq_stats, fs_runq.count); + fs_runq.count--; + + thread->runq = PROCESSOR_NULL; + simple_unlock(&fs_lock); + return (TRUE); + } + else { + /* + * The thread left the run queue before we could + * lock the run queue. 
+ */ + assert(thread->runq == PROCESSOR_NULL); + simple_unlock(&fs_lock); + return (FALSE); + } +} + +#endif /* defined(CONFIG_SCHED_TRADITIONAL) || defined(CONFIG_SCHED_PROTO) || defined(CONFIG_SCHED_GRRR) || defined(CONFIG_SCHED_FIXEDPRIORITY) */ + /* * run_queue_dequeue: * * Perform a dequeue operation on a run queue, * and return the resulting thread. * - * The run queue must be locked (see run_queue_remove() + * The run queue must be locked (see thread_run_queue_remove() * for more info), and not empty. */ -static thread_t +thread_t run_queue_dequeue( run_queue_t rq, integer_t options) @@ -1982,19 +2647,16 @@ run_queue_dequeue( queue_t queue = rq->queues + rq->highq; if (options & SCHED_HEADQ) { - thread = (thread_t)queue->next; - ((queue_entry_t)thread)->next->prev = queue; - queue->next = ((queue_entry_t)thread)->next; + thread = (thread_t)dequeue_head(queue); } else { - thread = (thread_t)queue->prev; - ((queue_entry_t)thread)->prev->next = queue; - queue->prev = ((queue_entry_t)thread)->prev; + thread = (thread_t)dequeue_tail(queue); } thread->runq = PROCESSOR_NULL; + SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); rq->count--; - if (testbit(rq->highq, sched_preempt_pri)) { + if (SCHED(priority_is_urgent)(rq->highq)) { rq->urgency--; assert(rq->urgency >= 0); } if (queue_empty(queue)) { @@ -2006,6 +2668,103 @@ run_queue_dequeue( return (thread); } +/* + * run_queue_enqueue: + * + * Perform a enqueue operation on a run queue. + * + * The run queue must be locked (see thread_run_queue_remove() + * for more info). 
+ */ +boolean_t +run_queue_enqueue( + run_queue_t rq, + thread_t thread, + integer_t options) +{ + queue_t queue = rq->queues + thread->sched_pri; + boolean_t result = FALSE; + + if (queue_empty(queue)) { + enqueue_tail(queue, (queue_entry_t)thread); + + setbit(MAXPRI - thread->sched_pri, rq->bitmap); + if (thread->sched_pri > rq->highq) { + rq->highq = thread->sched_pri; + result = TRUE; + } + } + else + if (options & SCHED_TAILQ) + enqueue_tail(queue, (queue_entry_t)thread); + else + enqueue_head(queue, (queue_entry_t)thread); + + if (SCHED(priority_is_urgent)(thread->sched_pri)) + rq->urgency++; + SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); + rq->count++; + + return (result); + +} + +/* + * run_queue_remove: + * + * Remove a specific thread from a runqueue. + * + * The run queue must be locked. + */ +void +run_queue_remove( + run_queue_t rq, + thread_t thread) +{ + + remqueue((queue_entry_t)thread); + SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); + rq->count--; + if (SCHED(priority_is_urgent)(thread->sched_pri)) { + rq->urgency--; assert(rq->urgency >= 0); + } + + if (queue_empty(rq->queues + thread->sched_pri)) { + /* update run queue status */ + if (thread->sched_pri != IDLEPRI) + clrbit(MAXPRI - thread->sched_pri, rq->bitmap); + rq->highq = MAXPRI - ffsbit(rq->bitmap); + } + + thread->runq = PROCESSOR_NULL; +} + +/* + * fairshare_setrun: + * + * Dispatch a thread for round-robin execution. + * + * Thread must be locked. Associated pset must + * be locked, and is returned unlocked. 
+ */ +static void +fairshare_setrun( + processor_t processor, + thread_t thread) +{ + processor_set_t pset = processor->processor_set; + + thread->chosen_processor = processor; + + SCHED(fairshare_enqueue)(thread); + + if (processor != current_processor()) + machine_signal_idle(processor); + + pset_unlock(pset); + +} + /* * realtime_queue_insert: * @@ -2015,8 +2774,7 @@ static boolean_t realtime_queue_insert( thread_t thread) { - run_queue_t rq = &rt_runq; - queue_t queue = rq->queues + thread->sched_pri; + queue_t queue = &rt_runq.queue; uint64_t deadline = thread->realtime.deadline; boolean_t preempt = FALSE; @@ -2024,10 +2782,6 @@ realtime_queue_insert( if (queue_empty(queue)) { enqueue_tail(queue, (queue_entry_t)thread); - - setbit(MAXPRI - thread->sched_pri, rq->bitmap); - if (thread->sched_pri > rq->highq) - rq->highq = thread->sched_pri; preempt = TRUE; } else { @@ -2050,7 +2804,8 @@ realtime_queue_insert( } thread->runq = RT_RUNQ; - rq->count++; rq->urgency++; + SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count); + rt_runq.count++; simple_unlock(&rt_lock); @@ -2072,11 +2827,14 @@ realtime_setrun( { processor_set_t pset = processor->processor_set; + thread->chosen_processor = processor; + /* * Dispatch directly onto idle processor. 
*/ - if (processor->state == PROCESSOR_IDLE) { - remqueue(&pset->idle_queue, (queue_entry_t)processor); + if ( (thread->bound_processor == processor) + && processor->state == PROCESSOR_IDLE) { + remqueue((queue_entry_t)processor); enqueue_tail(&pset->active_queue, (queue_entry_t)processor); processor->next_thread = thread; @@ -2090,8 +2848,11 @@ realtime_setrun( } if (realtime_queue_insert(thread)) { + int prstate = processor->state; if (processor == current_processor()) ast_on(AST_PREEMPT | AST_URGENT); + else if ((prstate == PROCESSOR_DISPATCHING) || (prstate == PROCESSOR_IDLE)) + machine_signal_idle(processor); else cause_ast_check(processor); } @@ -2099,6 +2860,14 @@ realtime_setrun( pset_unlock(pset); } +#if defined(CONFIG_SCHED_TRADITIONAL) + +static boolean_t +priority_is_urgent(int priority) +{ + return testbit(priority, sched_preempt_pri) ? TRUE : FALSE; +} + /* * processor_enqueue: * @@ -2108,7 +2877,7 @@ realtime_setrun( * Returns TRUE if a preemption is indicated based on the state * of the run queue. * - * The run queue must be locked (see run_queue_remove() + * The run queue must be locked (see thread_run_queue_remove() * for more info). 
*/ static boolean_t @@ -2117,33 +2886,18 @@ processor_enqueue( thread_t thread, integer_t options) { - run_queue_t rq = &processor->runq; - queue_t queue = rq->queues + thread->sched_pri; - boolean_t result = FALSE; + run_queue_t rq = runq_for_processor(processor); + boolean_t result; - if (queue_empty(queue)) { - enqueue_tail(queue, (queue_entry_t)thread); - - setbit(MAXPRI - thread->sched_pri, rq->bitmap); - if (thread->sched_pri > rq->highq) { - rq->highq = thread->sched_pri; - result = TRUE; - } - } - else - if (options & SCHED_TAILQ) - enqueue_tail(queue, (queue_entry_t)thread); - else - enqueue_head(queue, (queue_entry_t)thread); - + result = run_queue_enqueue(rq, thread, options); thread->runq = processor; - if (testbit(thread->sched_pri, sched_preempt_pri)) - rq->urgency++; - rq->count++; + runq_consider_incr_bound_count(processor, thread); return (result); } +#endif /* CONFIG_SCHED_TRADITIONAL */ + /* * processor_setrun: * @@ -2162,11 +2916,15 @@ processor_setrun( processor_set_t pset = processor->processor_set; ast_t preempt; + thread->chosen_processor = processor; + /* * Dispatch directly onto idle processor. */ - if (processor->state == PROCESSOR_IDLE) { - remqueue(&pset->idle_queue, (queue_entry_t)processor); + if ( (SCHED(direct_dispatch_to_idle_processors) || + thread->bound_processor == processor) + && processor->state == PROCESSOR_IDLE) { + remqueue((queue_entry_t)processor); enqueue_tail(&pset->active_queue, (queue_entry_t)processor); processor->next_thread = thread; @@ -2182,15 +2940,17 @@ processor_setrun( /* * Set preemption mode. 
*/ - if (testbit(thread->sched_pri, sched_preempt_pri)) + if (SCHED(priority_is_urgent)(thread->sched_pri) && thread->sched_pri > processor->current_pri) + preempt = (AST_PREEMPT | AST_URGENT); + else if(processor->active_thread && thread_eager_preemption(processor->active_thread)) preempt = (AST_PREEMPT | AST_URGENT); else - if (thread->sched_mode & TH_MODE_TIMESHARE && thread->sched_pri < thread->priority) + if ((thread->sched_mode == TH_MODE_TIMESHARE) && thread->sched_pri < thread->priority) preempt = AST_NONE; else preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE; - if (!processor_enqueue(processor, thread, options)) + if (!SCHED(processor_enqueue)(processor, thread, options)) preempt = AST_NONE; if (preempt != AST_NONE) { @@ -2199,9 +2959,14 @@ processor_setrun( ast_on(preempt); } else + if ( processor->state == PROCESSOR_IDLE || processor->state == PROCESSOR_DISPATCHING) { + machine_signal_idle(processor); + } + else if ( (processor->state == PROCESSOR_RUNNING || processor->state == PROCESSOR_SHUTDOWN) && - thread->sched_pri >= processor->current_pri ) { + (thread->sched_pri >= processor->current_pri || + processor->current_thmode == TH_MODE_FAIRSHARE)) { cause_ast_check(processor); } } @@ -2210,10 +2975,112 @@ processor_setrun( thread->sched_pri >= processor->current_pri ) { cause_ast_check(processor); } + else + if ( processor->state == PROCESSOR_IDLE && + processor != current_processor() ) { + machine_signal_idle(processor); + } pset_unlock(pset); } +#if defined(CONFIG_SCHED_TRADITIONAL) + +static boolean_t +processor_queue_empty(processor_t processor) +{ + return runq_for_processor(processor)->count == 0; + +} + +static boolean_t +sched_traditional_with_pset_runqueue_processor_queue_empty(processor_t processor) +{ + processor_set_t pset = processor->processor_set; + int count = runq_for_processor(processor)->count; + + /* + * The pset runq contains the count of all runnable threads + * for all processors in the pset. 
However, for threads that + * are bound to another processor, the current "processor" + * is not eligible to execute the thread. So we only + * include bound threads that our bound to the current + * "processor". This allows the processor to idle when the + * count of eligible threads drops to 0, even if there's + * a runnable thread bound to a different processor in the + * shared runq. + */ + + count -= pset->pset_runq_bound_count; + count += processor->runq_bound_count; + + return count == 0; +} + +static ast_t +processor_csw_check(processor_t processor) +{ + run_queue_t runq; + + assert(processor->active_thread != NULL); + + runq = runq_for_processor(processor); + if (runq->highq > processor->current_pri) { + if (runq->urgency > 0) + return (AST_PREEMPT | AST_URGENT); + + if (processor->active_thread && thread_eager_preemption(processor->active_thread)) + return (AST_PREEMPT | AST_URGENT); + + return AST_PREEMPT; + } + + return AST_NONE; +} + +static boolean_t +processor_queue_has_priority(processor_t processor, + int priority, + boolean_t gte) +{ + if (gte) + return runq_for_processor(processor)->highq >= priority; + else + return runq_for_processor(processor)->highq > priority; +} + +static boolean_t +should_current_thread_rechoose_processor(processor_t processor) +{ + return (processor->current_pri < BASEPRI_RTQUEUES + && processor->processor_meta != PROCESSOR_META_NULL + && processor->processor_meta->primary != processor); +} + +static int +sched_traditional_processor_runq_count(processor_t processor) +{ + return runq_for_processor(processor)->count; +} + + +static uint64_t +sched_traditional_processor_runq_stats_count_sum(processor_t processor) +{ + return runq_for_processor(processor)->runq_stats.count_sum; +} + +static uint64_t +sched_traditional_with_pset_runqueue_processor_runq_stats_count_sum(processor_t processor) +{ + if (processor->cpu_id == processor->processor_set->cpu_set_low) + return runq_for_processor(processor)->runq_stats.count_sum; + else 
+ return 0ULL; +} + +#endif /* CONFIG_SCHED_TRADITIONAL */ + #define next_pset(p) (((p)->pset_list != PROCESSOR_SET_NULL)? (p)->pset_list: (p)->node->psets) /* @@ -2233,7 +3100,7 @@ choose_next_pset( do { nset = next_pset(nset); - } while (nset->processor_count < 1 && nset != pset); + } while (nset->online_processor_count < 1 && nset != pset); return (nset); } @@ -2250,7 +3117,7 @@ choose_next_pset( * The thread must be locked. The pset must be locked, * and the resulting pset is locked on return. */ -static processor_t +processor_t choose_processor( processor_set_t pset, processor_t processor, @@ -2258,7 +3125,7 @@ choose_processor( { processor_set_t nset, cset = pset; processor_meta_t pmeta = PROCESSOR_META_NULL; - processor_t mprocessor; + processor_t mprocessor; /* * Prefer the hinted processor, when appropriate. @@ -2308,8 +3175,8 @@ choose_processor( lp_processor = cset->low_pri; /* Consider hinted processor */ if (lp_processor != PROCESSOR_NULL && - ((lp_processor->processor_meta == PROCESSOR_META_NULL) || - ((lp_processor == lp_processor->processor_meta->primary) && + ((lp_processor->processor_meta == PROCESSOR_META_NULL) || + ((lp_processor == lp_processor->processor_meta->primary) && !queue_empty(&lp_processor->processor_meta->idle_queue))) && lp_processor->state != PROCESSOR_INACTIVE && lp_processor->state != PROCESSOR_SHUTDOWN && @@ -2359,6 +3226,7 @@ choose_processor( return lp_processor; if (thread->realtime.deadline < furthest_deadline) return fd_processor; + processor = PROCESSOR_NULL; } else { @@ -2375,7 +3243,7 @@ choose_processor( if (cset->low_count != PROCESSOR_NULL && cset->low_count->state != PROCESSOR_INACTIVE && cset->low_count->state != PROCESSOR_SHUTDOWN && cset->low_count->state != PROCESSOR_OFF_LINE && (processor == PROCESSOR_NULL || (thread->sched_pri <= BASEPRI_DEFAULT && - cset->low_count->runq.count < processor->runq.count))) { + SCHED(processor_runq_count)(cset->low_count) < SCHED(processor_runq_count)(processor)))) { processor = 
cset->low_count; } @@ -2387,9 +3255,10 @@ choose_processor( if (processor != PROCESSOR_NULL) enqueue_tail(&cset->active_queue, (queue_entry_t)processor); } + if (processor != PROCESSOR_NULL && pmeta == PROCESSOR_META_NULL) { if (processor->processor_meta != PROCESSOR_META_NULL && - !queue_empty(&processor->processor_meta->idle_queue)) + !queue_empty(&processor->processor_meta->idle_queue)) pmeta = processor->processor_meta; } } @@ -2493,8 +3362,8 @@ thread_setrun( /* * Update priority if needed. */ - if (thread->sched_stamp != sched_tick) - update_priority(thread); + if (SCHED(can_update_priority)(thread)) + SCHED(update_priority)(thread); assert(thread->runq == PROCESSOR_NULL); @@ -2509,7 +3378,7 @@ thread_setrun( pset = thread->affinity_set->aset_pset; pset_lock(pset); - processor = choose_processor(pset, PROCESSOR_NULL, thread); + processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread); } else if (thread->last_processor != PROCESSOR_NULL) { @@ -2519,7 +3388,13 @@ thread_setrun( processor = thread->last_processor; pset = processor->processor_set; pset_lock(pset); - processor = choose_processor(pset, processor, thread); + processor = SCHED(choose_processor)(pset, processor, thread); + + if ((thread->last_processor != processor) && (thread->last_processor != PROCESSOR_NULL)) { + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LPA_BROKEN)|DBG_FUNC_NONE, + (uintptr_t)thread_tid(thread), (uintptr_t)thread->last_processor->cpu_id, (uintptr_t)processor->cpu_id, thread->last_processor->state, 0); + } + } else { /* @@ -2537,7 +3412,7 @@ thread_setrun( pset = choose_next_pset(pset); pset_lock(pset); - processor = choose_processor(pset, PROCESSOR_NULL, thread); + processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread); task->pset_hint = processor->processor_set; } } @@ -2557,6 +3432,8 @@ thread_setrun( */ if (thread->sched_pri >= BASEPRI_RTQUEUES) realtime_setrun(processor, thread); + else if (thread->sched_mode == TH_MODE_FAIRSHARE) + 
fairshare_setrun(processor, thread); else processor_setrun(processor, thread, options); } @@ -2573,6 +3450,8 @@ task_choose_pset( return (pset); } +#if defined(CONFIG_SCHED_TRADITIONAL) + /* * processor_queue_shutdown: * @@ -2587,7 +3466,7 @@ processor_queue_shutdown( processor_t processor) { processor_set_t pset = processor->processor_set; - run_queue_t rq = &processor->runq; + run_queue_t rq = runq_for_processor(processor); queue_t queue = rq->queues + rq->highq; int pri = rq->highq, count = rq->count; thread_t next, thread; @@ -2601,11 +3480,13 @@ processor_queue_shutdown( next = (thread_t)queue_next((queue_entry_t)thread); if (thread->bound_processor == PROCESSOR_NULL) { - remqueue(queue, (queue_entry_t)thread); + remqueue((queue_entry_t)thread); thread->runq = PROCESSOR_NULL; + SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); + runq_consider_decr_bound_count(processor, thread); rq->count--; - if (testbit(pri, sched_preempt_pri)) { + if (SCHED(priority_is_urgent)(pri)) { rq->urgency--; assert(rq->urgency >= 0); } if (queue_empty(queue)) { @@ -2635,6 +3516,8 @@ processor_queue_shutdown( } } +#endif /* CONFIG_SCHED_TRADITIONAL */ + /* * Check for a preemption point in * the current context. 
@@ -2646,53 +3529,30 @@ csw_check( processor_t processor) { ast_t result = AST_NONE; - run_queue_t runq; if (first_timeslice(processor)) { - runq = &rt_runq; - if (runq->highq >= BASEPRI_RTQUEUES) + if (rt_runq.count > 0) return (AST_PREEMPT | AST_URGENT); - if (runq->highq > processor->current_pri) { - if (runq->urgency > 0) - return (AST_PREEMPT | AST_URGENT); - - result |= AST_PREEMPT; - } - - runq = &processor->runq; - if (runq->highq > processor->current_pri) { - if (runq->urgency > 0) - return (AST_PREEMPT | AST_URGENT); - - result |= AST_PREEMPT; - } + result |= SCHED(processor_csw_check)(processor); + if (result & AST_URGENT) + return result; } else { - runq = &rt_runq; - if (runq->highq >= processor->current_pri) { - if (runq->urgency > 0) - return (AST_PREEMPT | AST_URGENT); - - result |= AST_PREEMPT; - } - - runq = &processor->runq; - if (runq->highq >= processor->current_pri) { - if (runq->urgency > 0) - return (AST_PREEMPT | AST_URGENT); + if (rt_runq.count > 0 && BASEPRI_RTQUEUES >= processor->current_pri) + return (AST_PREEMPT | AST_URGENT); - result |= AST_PREEMPT; - } + result |= SCHED(processor_csw_check)(processor); + if (result & AST_URGENT) + return result; } if (result != AST_NONE) return (result); - if (processor->current_pri < BASEPRI_RTQUEUES && processor->processor_meta != PROCESSOR_META_NULL && - processor->processor_meta->primary != processor) + if (SCHED(should_current_thread_rechoose_processor)(processor)) return (AST_PREEMPT); - + if (machine_processor_is_inactive(processor)) return (AST_PREEMPT); @@ -2716,7 +3576,7 @@ set_sched_pri( thread_t thread, int priority) { - boolean_t removed = run_queue_remove(thread); + boolean_t removed = thread_run_queue_remove(thread); thread->sched_pri = priority; if (removed) @@ -2729,6 +3589,7 @@ set_sched_pri( ast_t preempt; processor->current_pri = priority; + processor->current_thmode = thread->sched_mode; if ((preempt = csw_check(processor)) != AST_NONE) ast_on(preempt); } @@ -2769,8 +3630,48 @@ 
run_queue_check( #endif /* DEBUG */ +#if defined(CONFIG_SCHED_TRADITIONAL) + +/* locks the runqueue itself */ + +static boolean_t +processor_queue_remove( + processor_t processor, + thread_t thread) +{ + void * rqlock; + run_queue_t rq; + + rqlock = &processor->processor_set->sched_lock; + rq = runq_for_processor(processor); + + simple_lock(rqlock); + if (processor == thread->runq) { + /* + * Thread is on a run queue and we have a lock on + * that run queue. + */ + runq_consider_decr_bound_count(processor, thread); + run_queue_remove(rq, thread); + } + else { + /* + * The thread left the run queue before we could + * lock the run queue. + */ + assert(thread->runq == PROCESSOR_NULL); + processor = PROCESSOR_NULL; + } + + simple_unlock(rqlock); + + return (processor != PROCESSOR_NULL); +} + +#endif /* CONFIG_SCHED_TRADITIONAL */ + /* - * run_queue_remove: + * thread_run_queue_remove: * * Remove a thread from a current run queue and * return TRUE if successful. @@ -2778,7 +3679,7 @@ run_queue_check( * Thread must be locked. */ boolean_t -run_queue_remove( +thread_run_queue_remove( thread_t thread) { processor_t processor = thread->runq; @@ -2790,41 +3691,32 @@ run_queue_remove( * and removed. */ if (processor != PROCESSOR_NULL) { - void * rqlock; - run_queue_t rq; + queue_t q; /* * The processor run queues are locked by the * processor set. Real-time priorities use a * global queue with a dedicated lock. */ - if (thread->sched_pri < BASEPRI_RTQUEUES) { - rqlock = &processor->processor_set->sched_lock; - rq = &processor->runq; + if (thread->sched_mode == TH_MODE_FAIRSHARE) { + return SCHED(fairshare_queue_remove)(thread); } - else { - rqlock = &rt_lock; rq = &rt_runq; + + if (thread->sched_pri < BASEPRI_RTQUEUES) { + return SCHED(processor_queue_remove)(processor, thread); } - simple_lock(rqlock); + simple_lock(&rt_lock); + q = &rt_runq.queue; if (processor == thread->runq) { /* * Thread is on a run queue and we have a lock on * that run queue. 
*/ - remqueue(&rq->queues[0], (queue_entry_t)thread); - rq->count--; - if (testbit(thread->sched_pri, sched_preempt_pri)) { - rq->urgency--; assert(rq->urgency >= 0); - } - - if (queue_empty(rq->queues + thread->sched_pri)) { - /* update run queue status */ - if (thread->sched_pri != IDLEPRI) - clrbit(MAXPRI - thread->sched_pri, rq->bitmap); - rq->highq = MAXPRI - ffsbit(rq->bitmap); - } + remqueue((queue_entry_t)thread); + SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count); + rt_runq.count--; thread->runq = PROCESSOR_NULL; } @@ -2837,12 +3729,14 @@ run_queue_remove( processor = PROCESSOR_NULL; } - simple_unlock(rqlock); + simple_unlock(&rt_lock); } return (processor != PROCESSOR_NULL); } +#if defined(CONFIG_SCHED_TRADITIONAL) + /* * steal_processor_thread: * @@ -2856,7 +3750,7 @@ static thread_t steal_processor_thread( processor_t processor) { - run_queue_t rq = &processor->runq; + run_queue_t rq = runq_for_processor(processor); queue_t queue = rq->queues + rq->highq; int pri = rq->highq, count = rq->count; thread_t thread; @@ -2865,11 +3759,13 @@ steal_processor_thread( thread = (thread_t)queue_first(queue); while (!queue_end(queue, (queue_entry_t)thread)) { if (thread->bound_processor == PROCESSOR_NULL) { - remqueue(queue, (queue_entry_t)thread); + remqueue((queue_entry_t)thread); thread->runq = PROCESSOR_NULL; + SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); + runq_consider_decr_bound_count(processor, thread); rq->count--; - if (testbit(pri, sched_preempt_pri)) { + if (SCHED(priority_is_urgent)(pri)) { rq->urgency--; assert(rq->urgency >= 0); } if (queue_empty(queue)) { @@ -2912,10 +3808,10 @@ steal_thread( do { processor = (processor_t)queue_first(&cset->active_queue); while (!queue_end(&cset->active_queue, (queue_entry_t)processor)) { - if (processor->runq.count > 0) { + if (runq_for_processor(processor)->count > 0) { thread = steal_processor_thread(processor); if (thread != THREAD_NULL) { - remqueue(&cset->active_queue, 
(queue_entry_t)processor); + remqueue((queue_entry_t)processor); enqueue_tail(&cset->active_queue, (queue_entry_t)processor); pset_unlock(cset); @@ -2942,6 +3838,55 @@ steal_thread( return (THREAD_NULL); } +static thread_t steal_thread_disabled( + processor_set_t pset) +{ + pset_unlock(pset); + + return (THREAD_NULL); +} + +#endif /* CONFIG_SCHED_TRADITIONAL */ + + +int +thread_get_urgency(uint64_t *rt_period, uint64_t *rt_deadline) +{ + processor_t processor; + thread_t thread; + + processor = current_processor(); + + thread = processor->next_thread; + + if (thread != NULL) { + if (thread->sched_mode == TH_MODE_REALTIME) { + + if (rt_period != NULL) + *rt_period = thread->realtime.period; + if (rt_deadline != NULL) + *rt_deadline = thread->realtime.deadline; + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_GET_URGENCY), THREAD_URGENCY_REAL_TIME, thread->realtime.period, + (thread->realtime.deadline >> 32), thread->realtime.deadline, 0); + + return (THREAD_URGENCY_REAL_TIME); + } else if ((thread->sched_pri <= MAXPRI_THROTTLE) && + (thread->priority <= MAXPRI_THROTTLE)) { + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_GET_URGENCY), THREAD_URGENCY_BACKGROUND, thread->sched_pri, thread->priority, 0, 0); + return (THREAD_URGENCY_BACKGROUND); + } + else + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_GET_URGENCY), THREAD_URGENCY_NORMAL, 0, 0, 0, 0); + + return (THREAD_URGENCY_NORMAL); + } + else + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_GET_URGENCY), THREAD_URGENCY_NONE, 0, 0, 0, 0); + return (THREAD_URGENCY_NONE); +} + + /* * This is the processor idle loop, which just looks for other threads * to execute. Processor idle threads invoke this without supplying a @@ -2949,7 +3894,14 @@ steal_thread( * * Returns a the next thread to execute if dispatched directly. */ -static thread_t + +#if 0 +#define IDLE_KERNEL_DEBUG_CONSTANT(...) KERNEL_DEBUG_CONSTANT(__VA_ARGS__) +#else +#define IDLE_KERNEL_DEBUG_CONSTANT(...) 
do { } while(0) +#endif + +thread_t processor_idle( thread_t thread, processor_t processor) @@ -2957,22 +3909,29 @@ processor_idle( processor_set_t pset = processor->processor_set; thread_t new_thread; int state; - (void)splsched(); KERNEL_DEBUG_CONSTANT( MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_START, (uintptr_t)thread_tid(thread), 0, 0, 0, 0); + SCHED_STATS_CPU_IDLE_START(processor); + timer_switch(&PROCESSOR_DATA(processor, system_state), mach_absolute_time(), &PROCESSOR_DATA(processor, idle_state)); PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, idle_state); - while (processor->next_thread == THREAD_NULL && processor->runq.count == 0 && rt_runq.count == 0 && + while (processor->next_thread == THREAD_NULL && SCHED(processor_queue_empty)(processor) && rt_runq.count == 0 && SCHED(fairshare_runq_count)() == 0 && (thread == THREAD_NULL || ((thread->state & (TH_WAIT|TH_SUSP)) == TH_WAIT && !thread->wake_active))) { + IDLE_KERNEL_DEBUG_CONSTANT( + MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq.count, SCHED(processor_runq_count)(processor), -1, 0); + machine_idle(); (void)splsched(); + IDLE_KERNEL_DEBUG_CONSTANT( + MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq.count, SCHED(processor_runq_count)(processor), -2, 0); + if (processor->state == PROCESSOR_INACTIVE && !machine_processor_is_inactive(processor)) break; } @@ -2992,19 +3951,20 @@ processor_idle( processor->next_thread = THREAD_NULL; processor->state = PROCESSOR_RUNNING; - if ( processor->runq.highq > new_thread->sched_pri || - (rt_runq.highq > 0 && rt_runq.highq >= new_thread->sched_pri) ) { + if (SCHED(processor_queue_has_priority)(processor, new_thread->sched_pri, FALSE) || + (rt_runq.count > 0 && BASEPRI_RTQUEUES >= new_thread->sched_pri) ) { processor->deadline = UINT64_MAX; pset_unlock(pset); thread_lock(new_thread); + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, 
MACH_REDISPATCH), (uintptr_t)thread_tid(new_thread), new_thread->sched_pri, rt_runq.count, 0, 0); thread_setrun(new_thread, SCHED_HEADQ); thread_unlock(new_thread); KERNEL_DEBUG_CONSTANT( MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END, (uintptr_t)thread_tid(thread), state, 0, 0, 0); - + return (THREAD_NULL); } @@ -3012,12 +3972,12 @@ processor_idle( KERNEL_DEBUG_CONSTANT( MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END, (uintptr_t)thread_tid(thread), state, (uintptr_t)thread_tid(new_thread), 0, 0); - + return (new_thread); } else if (state == PROCESSOR_IDLE) { - remqueue(&pset->idle_queue, (queue_entry_t)processor); + remqueue((queue_entry_t)processor); processor->state = PROCESSOR_RUNNING; enqueue_tail(&pset->active_queue, (queue_entry_t)processor); @@ -3045,7 +4005,7 @@ processor_idle( KERNEL_DEBUG_CONSTANT( MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END, (uintptr_t)thread_tid(thread), state, 0, 0, 0); - + return (THREAD_NULL); } } @@ -3054,7 +4014,7 @@ processor_idle( KERNEL_DEBUG_CONSTANT( MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END, (uintptr_t)thread_tid(thread), state, 0, 0, 0); - + return (THREAD_NULL); } @@ -3105,8 +4065,6 @@ idle_thread_create( return (KERN_SUCCESS); } -static uint64_t sched_tick_deadline; - /* * sched_startup: * @@ -3120,14 +4078,16 @@ sched_startup(void) kern_return_t result; thread_t thread; - result = kernel_thread_start_priority((thread_continue_t)sched_tick_thread, NULL, MAXPRI_KERNEL, &thread); + result = kernel_thread_start_priority((thread_continue_t)sched_init_thread, + (void *)SCHED(maintenance_continuation), + MAXPRI_KERNEL, &thread); if (result != KERN_SUCCESS) panic("sched_startup"); thread_deallocate(thread); /* - * Yield to the sched_tick_thread while it times + * Yield to the sched_init_thread while it times * a series of context switches back. It stores * the baseline value in sched_cswtime. 
* @@ -3136,20 +4096,20 @@ sched_startup(void) */ while (sched_cswtime == 0) thread_block(THREAD_CONTINUE_NULL); +} - thread_daemon_init(); +#if defined(CONFIG_SCHED_TRADITIONAL) - thread_call_initialize(); -} +static uint64_t sched_tick_deadline = 0; /* - * sched_tick_thread: + * sched_traditional_tick_continue: * * Perform periodic bookkeeping functions about ten * times per second. */ static void -sched_tick_continue(void) +sched_traditional_tick_continue(void) { uint64_t abstime = mach_absolute_time(); @@ -3166,14 +4126,44 @@ sched_tick_continue(void) */ thread_update_scan(); + if (sched_tick_deadline == 0) + sched_tick_deadline = abstime; + clock_deadline_for_periodic_event(sched_tick_interval, abstime, &sched_tick_deadline); - assert_wait_deadline((event_t)sched_tick_thread, THREAD_UNINT, sched_tick_deadline); - thread_block((thread_continue_t)sched_tick_continue); + assert_wait_deadline((event_t)sched_traditional_tick_continue, THREAD_UNINT, sched_tick_deadline); + thread_block((thread_continue_t)sched_traditional_tick_continue); /*NOTREACHED*/ } +#endif /* CONFIG_SCHED_TRADITIONAL */ + +static uint32_t +time_individual_cswitch(void) +{ + uint32_t switches = 0; + uint64_t newtime, starttime; + + /* Wait for absolute time to increase. */ + starttime = mach_absolute_time(); + do { + newtime = mach_absolute_time(); + } while (newtime == starttime); + + /* Measure one or more context switches until time increases again. + * This ensures we get non-zero timings even if absolute time + * increases very infrequently compared to CPU clock. */ + starttime = newtime; + do { + thread_block(THREAD_CONTINUE_NULL); + newtime = mach_absolute_time(); + ++switches; + } while (newtime == starttime); + /* Round up. */ + return (uint32_t) ((newtime - starttime + switches - 1) / switches); +} + /* * Time a series of context switches to determine * a baseline. 
Toss the high and low and return @@ -3183,15 +4173,11 @@ static uint32_t time_cswitch(void) { uint32_t new, hi, low, accum; - uint64_t abstime; - int i, tries = 7; + int i, tries = 7, denom; accum = hi = low = 0; for (i = 0; i < tries; ++i) { - abstime = mach_absolute_time(); - thread_block(THREAD_CONTINUE_NULL); - - new = (uint32_t)(mach_absolute_time() - abstime); + new = time_individual_cswitch(); if (i == 0) accum = hi = low = new; @@ -3204,21 +4190,24 @@ time_cswitch(void) accum += new; } } - - return ((accum - hi - low) / (2 * (tries - 2))); + /* Round up. */ + denom = 2 * (tries - 2); + return (accum - hi - low + denom - 1) / denom; } void -sched_tick_thread(void) +sched_init_thread(void (*continuation)(void)) { sched_cswtime = time_cswitch(); + assert(sched_cswtime > 0); - sched_tick_deadline = mach_absolute_time(); + continuation(); - sched_tick_continue(); /*NOTREACHED*/ } +#if defined(CONFIG_SCHED_TRADITIONAL) + /* * thread_update_scan / runq_scan: * @@ -3258,7 +4247,7 @@ runq_scan( while (count > 0) { queue_iterate(q, thread, thread_t, links) { if ( thread->sched_stamp != sched_tick && - (thread->sched_mode & TH_MODE_TIMESHARE) ) { + (thread->sched_mode == TH_MODE_TIMESHARE) ) { if (thread_update_count == THREAD_UPDATE_SIZE) return (TRUE); @@ -3292,7 +4281,7 @@ thread_update_scan(void) s = splsched(); pset_lock(pset); - restart_needed = runq_scan(&processor->runq); + restart_needed = runq_scan(runq_for_processor(processor)); pset_unlock(pset); splx(s); @@ -3321,9 +4310,10 @@ thread_update_scan(void) s = splsched(); thread_lock(thread); - if ( !(thread->state & (TH_WAIT|TH_SUSP)) && - thread->sched_stamp != sched_tick ) - update_priority(thread); + if ( !(thread->state & (TH_WAIT)) ) { + if (SCHED(can_update_priority)(thread)) + SCHED(update_priority)(thread); + } thread_unlock(thread); splx(s); @@ -3331,20 +4321,115 @@ thread_update_scan(void) } } while (restart_needed); } + +#endif /* CONFIG_SCHED_TRADITIONAL */ + +boolean_t 
+thread_eager_preemption(thread_t thread) +{ + return ((thread->sched_flags & TH_SFLAG_EAGERPREEMPT) != 0); +} + +void +thread_set_eager_preempt(thread_t thread) +{ + spl_t x; + processor_t p; + ast_t ast = AST_NONE; + + x = splsched(); + p = current_processor(); + + thread_lock(thread); + thread->sched_flags |= TH_SFLAG_EAGERPREEMPT; + + if (thread == current_thread()) { + thread_unlock(thread); + + ast = csw_check(p); + if (ast != AST_NONE) { + (void) thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast); + } + } else { + p = thread->last_processor; + + if (p != PROCESSOR_NULL && p->state == PROCESSOR_RUNNING && + p->active_thread == thread) { + cause_ast_check(p); + } + thread_unlock(thread); + } + + splx(x); +} + +void +thread_clear_eager_preempt(thread_t thread) +{ + spl_t x; + + x = splsched(); + thread_lock(thread); + + thread->sched_flags &= ~TH_SFLAG_EAGERPREEMPT; + + thread_unlock(thread); + splx(x); +} +/* + * Scheduling statistics + */ +void +sched_stats_handle_csw(processor_t processor, int reasons, int selfpri, int otherpri) +{ + struct processor_sched_statistics *stats; + boolean_t to_realtime = FALSE; + + stats = &processor->processor_data.sched_stats; + stats->csw_count++; + + if (otherpri >= BASEPRI_REALTIME) { + stats->rt_sched_count++; + to_realtime = TRUE; + } + + if ((reasons & AST_PREEMPT) != 0) { + stats->preempt_count++; + + if (selfpri >= BASEPRI_REALTIME) { + stats->preempted_rt_count++; + } + + if (to_realtime) { + stats->preempted_by_rt_count++; + } + + } +} + +void +sched_stats_handle_runq_change(struct runq_stats *stats, int old_count) +{ + uint64_t timestamp = mach_absolute_time(); + + stats->count_sum += (timestamp - stats->last_change_timestamp) * old_count; + stats->last_change_timestamp = timestamp; +} + /* - * Just in case someone doesn't use the macro + * For calls from assembly code */ -#undef thread_wakeup +#undef thread_wakeup void thread_wakeup( - event_t x); + event_t x); void thread_wakeup( - event_t x) + event_t x) { - 
thread_wakeup_with_result(x, THREAD_AWAKENED); + thread_wakeup_with_result(x, THREAD_AWAKENED); } boolean_t diff --git a/osfmk/kern/sched_prim.h b/osfmk/kern/sched_prim.h index 9f1c95347..0f89239ae 100644 --- a/osfmk/kern/sched_prim.h +++ b/osfmk/kern/sched_prim.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -147,12 +147,20 @@ extern void compute_my_priority( thread_t thread); /* Periodic scheduler activity */ -extern void sched_tick_thread(void); +extern void sched_init_thread(void (*)(void)); /* Perform sched_tick housekeeping activities */ -extern void update_priority( +extern boolean_t can_update_priority( thread_t thread); +extern void update_priority( + thread_t thread); + +extern void lightweight_update_priority( + thread_t thread); + +extern void sched_traditional_quantum_expire(thread_t thread); + /* Idle processor thread */ extern void idle_thread(void); @@ -185,13 +193,80 @@ extern processor_set_t task_choose_pset( extern processor_t thread_bind( processor_t processor); +/* Choose the best processor to run a thread */ +extern processor_t choose_processor( + processor_set_t pset, + processor_t processor, + thread_t thread); + +/* Choose a thread from a processor's priority-based runq */ +extern thread_t choose_thread( + processor_t processor, + run_queue_t runq, + int priority); + + +extern void thread_quantum_init( + thread_t thread); + extern void run_queue_init( run_queue_t runq); +extern thread_t run_queue_dequeue( + run_queue_t runq, + integer_t options); + +extern boolean_t run_queue_enqueue( + run_queue_t runq, + thread_t thread, + integer_t options); + +extern void run_queue_remove( + run_queue_t runq, + thread_t thread); + +/* Remove thread from its run queue */ +extern boolean_t thread_run_queue_remove( + thread_t thread); + extern void thread_timer_expire( void *thread, void *p1); +extern boolean_t 
thread_eager_preemption( + thread_t thread); + +/* Fair Share routines */ +#if defined(CONFIG_SCHED_TRADITIONAL) || defined(CONFIG_SCHED_PROTO) || defined(CONFIG_SCHED_FIXEDPRIORITY) +void sched_traditional_fairshare_init(void); + +int sched_traditional_fairshare_runq_count(void); + +uint64_t sched_traditional_fairshare_runq_stats_count_sum(void); + +void sched_traditional_fairshare_enqueue(thread_t thread); + +thread_t sched_traditional_fairshare_dequeue(void); + +boolean_t sched_traditional_fairshare_queue_remove(thread_t thread); +#endif + +#if defined(CONFIG_SCHED_GRRR) || defined(CONFIG_SCHED_FIXEDPRIORITY) +void sched_grrr_fairshare_init(void); + +int sched_grrr_fairshare_runq_count(void); + +uint64_t sched_grrr_fairshare_runq_stats_count_sum(void); + +void sched_grrr_fairshare_enqueue(thread_t thread); + +thread_t sched_grrr_fairshare_dequeue(void); + +boolean_t sched_grrr_fairshare_queue_remove(thread_t thread); +#endif + +extern boolean_t sched_generic_direct_dispatch_to_idle_processors; + /* Set the maximum interrupt level for the thread */ __private_extern__ wait_interrupt_t thread_interrupt_level( wait_interrupt_t interruptible); @@ -205,6 +280,55 @@ __private_extern__ kern_return_t clear_wait_internal( thread_t thread, wait_result_t result); +extern void sched_stats_handle_csw( + processor_t processor, + int reasons, + int selfpri, + int otherpri); + +extern void sched_stats_handle_runq_change( + struct runq_stats *stats, + int old_count); + + + +#define SCHED_STATS_CSW(processor, reasons, selfpri, otherpri) \ +do { \ + if (__builtin_expect(sched_stats_active, 0)) { \ + sched_stats_handle_csw((processor), \ + (reasons), (selfpri), (otherpri)); \ + } \ +} while (0) + + +#define SCHED_STATS_RUNQ_CHANGE(stats, old_count) \ +do { \ + if (__builtin_expect(sched_stats_active, 0)) { \ + sched_stats_handle_runq_change((stats), \ + (old_count)); \ + } \ +} while (0) + +#define THREAD_URGENCY_NONE 0 /* indicates that there is no currently runnable thread */ +#define 
THREAD_URGENCY_BACKGROUND 1 /* indicates that the thread is marked as a "background" thread */ +#define THREAD_URGENCY_NORMAL 2 /* indicates that the thread is marked as a "normal" thread */ +#define THREAD_URGENCY_REAL_TIME 3 /* indicates that the thread is marked as a "real-time" or urgent thread */ +#define THREAD_URGENCY_MAX 4 /* Marker */ +/* Returns the "urgency" of the currently running thread (provided by scheduler) */ +extern int thread_get_urgency( + uint64_t *rt_period, + uint64_t *rt_deadline); + +/* Tells the "urgency" of the just scheduled thread (provided by CPU PM) */ +extern void thread_tell_urgency( + int urgency, + uint64_t rt_period, + uint64_t rt_deadline); + +/* Tells if there are "active" RT threads in the system (provided by CPU PM) */ +extern void active_rt_threads( + boolean_t active); + #endif /* MACH_KERNEL_PRIVATE */ __BEGIN_DECLS @@ -259,14 +383,27 @@ extern wait_result_t assert_wait_deadline( extern kern_return_t thread_wakeup_prim( event_t event, boolean_t one_thread, - wait_result_t result); + wait_result_t result); + +#ifdef MACH_KERNEL_PRIVATE +extern kern_return_t thread_wakeup_prim_internal( + event_t event, + boolean_t one_thread, + wait_result_t result, + int priority); +#endif #define thread_wakeup(x) \ - thread_wakeup_prim((x), FALSE, THREAD_AWAKENED) + thread_wakeup_prim((x), FALSE, THREAD_AWAKENED) #define thread_wakeup_with_result(x, z) \ - thread_wakeup_prim((x), FALSE, (z)) + thread_wakeup_prim((x), FALSE, (z)) #define thread_wakeup_one(x) \ - thread_wakeup_prim((x), TRUE, THREAD_AWAKENED) + thread_wakeup_prim((x), TRUE, THREAD_AWAKENED) + +#ifdef MACH_KERNEL_PRIVATE +#define thread_wakeup_one_with_pri(x, pri) \ + thread_wakeup_prim_internal((x), TRUE, THREAD_AWAKENED, pri) +#endif extern boolean_t preemption_enabled(void); @@ -302,6 +439,223 @@ extern void thread_cancel_timer(void); #endif /* KERNEL_PRIVATE */ +#ifdef MACH_KERNEL_PRIVATE + +/* + * Scheduler algorithm indirection. 
If only one algorithm is + * enabled at compile-time, a direct function call is used. + * If more than one is enabled, calls are dispatched through + * a function pointer table. + */ + +#if !defined(CONFIG_SCHED_TRADITIONAL) && !defined(CONFIG_SCHED_PROTO) && !defined(CONFIG_SCHED_GRRR) && !defined(CONFIG_SCHED_FIXEDPRIORITY) +#error Enable at least one scheduler algorithm in osfmk/conf/MASTER.XXX +#endif + +#define SCHED(f) (sched_current_dispatch->f) + +struct sched_dispatch_table { + void (*init)(void); /* Init global state */ + void (*timebase_init)(void); /* Timebase-dependent initialization */ + void (*processor_init)(processor_t processor); /* Per-processor scheduler init */ + void (*pset_init)(processor_set_t pset); /* Per-processor set scheduler init */ + + void (*maintenance_continuation)(void); /* Function called regularly */ + + /* + * Choose a thread of greater or equal priority from the per-processor + * runqueue for timeshare/fixed threads + */ + thread_t (*choose_thread)( + processor_t processor, + int priority); + + /* + * Steal a thread from another processor in the pset so that it can run + * immediately + */ + thread_t (*steal_thread)( + processor_set_t pset); + + /* + * Recalculate sched_pri based on base priority, past running time, + * and scheduling class. + */ + void (*compute_priority)( + thread_t thread, + boolean_t override_depress); + + /* + * Pick the best processor for a thread (any kind of thread) to run on. 
+ */ + processor_t (*choose_processor)( + processor_set_t pset, + processor_t processor, + thread_t thread); + /* + * Enqueue a timeshare or fixed priority thread onto the per-processor + * runqueue + */ + boolean_t (*processor_enqueue)( + processor_t processor, + thread_t thread, + integer_t options); + + /* Migrate threads away in preparation for processor shutdown */ + void (*processor_queue_shutdown)( + processor_t processor); + + /* Remove the specific thread from the per-processor runqueue */ + boolean_t (*processor_queue_remove)( + processor_t processor, + thread_t thread); + + /* + * Does the per-processor runqueue have any timeshare or fixed priority + * threads on it? Called without pset lock held, so should + * not assume immutability while executing. + */ + boolean_t (*processor_queue_empty)(processor_t processor); + + /* + * Would this priority trigger an urgent preemption if it's sitting + * on the per-processor runqueue? + */ + boolean_t (*priority_is_urgent)(int priority); + + /* + * Does the per-processor runqueue contain runnable threads that + * should cause the currently-running thread to be preempted? + */ + ast_t (*processor_csw_check)(processor_t processor); + + /* + * Does the per-processor runqueue contain a runnable thread + * of > or >= priority, as a preflight for choose_thread() or other + * thread selection + */ + boolean_t (*processor_queue_has_priority)(processor_t processor, + int priority, + boolean_t gte); + + /* Quantum size for the specified non-realtime thread. */ + uint32_t (*initial_quantum_size)(thread_t thread); + + /* Scheduler mode for a new thread */ + sched_mode_t (*initial_thread_sched_mode)(task_t parent_task); + + /* Scheduler algorithm supports timeshare (decay) mode */ + boolean_t (*supports_timeshare_mode)(void); + + /* + * Is it safe to call update_priority, which may change a thread's + * runqueue or other state. This can be used to throttle changes + * to dynamic priority. 
+ */ + boolean_t (*can_update_priority)(thread_t thread); + + /* + * Update both scheduled priority and other persistent state. + * Side effects may include migration to another processor's runqueue. + */ + void (*update_priority)(thread_t thread); + + /* Lower overhead update to scheduled priority and state. */ + void (*lightweight_update_priority)(thread_t thread); + + /* Callback for non-realtime threads when the quantum timer fires */ + void (*quantum_expire)(thread_t thread); + + /* + * Even though we could continue executing on this processor, does the + * topology (SMT, for instance) indicate that a better processor could be + * chosen + */ + boolean_t (*should_current_thread_rechoose_processor)(processor_t processor); + + /* + * Runnable threads on per-processor runqueue. Should only + * be used for relative comparisons of load between processors. + */ + int (*processor_runq_count)(processor_t processor); + + /* Aggregate runcount statistics for per-processor runqueue */ + uint64_t (*processor_runq_stats_count_sum)(processor_t processor); + + /* Initialize structures to track demoted fairshare threads */ + void (*fairshare_init)(void); + + /* Number of runnable fairshare threads */ + int (*fairshare_runq_count)(void); + + /* Aggregate runcount statistics for fairshare runqueue */ + uint64_t (*fairshare_runq_stats_count_sum)(void); + + void (*fairshare_enqueue)(thread_t thread); + + thread_t (*fairshare_dequeue)(void); + + boolean_t (*fairshare_queue_remove)(thread_t thread); + + /* + * Use processor->next_thread to pin a thread to an idle + * processor. If FALSE, threads are enqueued and can + * be stolen by other processors. 
+ */ + boolean_t direct_dispatch_to_idle_processors; +}; + +#if defined(CONFIG_SCHED_TRADITIONAL) +#define kSchedTraditionalString "traditional" +#define kSchedTraditionalWithPsetRunqueueString "traditional_with_pset_runqueue" +extern const struct sched_dispatch_table sched_traditional_dispatch; +extern const struct sched_dispatch_table sched_traditional_with_pset_runqueue_dispatch; +#endif + +#if defined(CONFIG_SCHED_PROTO) +#define kSchedProtoString "proto" +extern const struct sched_dispatch_table sched_proto_dispatch; +#endif + +#if defined(CONFIG_SCHED_GRRR) +#define kSchedGRRRString "grrr" +extern const struct sched_dispatch_table sched_grrr_dispatch; +#endif + +#if defined(CONFIG_SCHED_FIXEDPRIORITY) +#define kSchedFixedPriorityString "fixedpriority" +#define kSchedFixedPriorityWithPsetRunqueueString "fixedpriority_with_pset_runqueue" +extern const struct sched_dispatch_table sched_fixedpriority_dispatch; +extern const struct sched_dispatch_table sched_fixedpriority_with_pset_runqueue_dispatch; +#endif + +/* + * It is an error to invoke any scheduler-related code + * before this is set up + */ +enum sched_enum { + sched_enum_unknown = 0, +#if defined(CONFIG_SCHED_TRADITIONAL) + sched_enum_traditional = 1, + sched_enum_traditional_with_pset_runqueue = 2, +#endif +#if defined(CONFIG_SCHED_PROTO) + sched_enum_proto = 3, +#endif +#if defined(CONFIG_SCHED_GRRR) + sched_enum_grrr = 4, +#endif +#if defined(CONFIG_SCHED_FIXEDPRIORITY) + sched_enum_fixedpriority = 5, + sched_enum_fixedpriority_with_pset_runqueue = 6, +#endif + sched_enum_max = 7 +}; + +extern const struct sched_dispatch_table *sched_current_dispatch; + +#endif /* MACH_KERNEL_PRIVATE */ + __END_DECLS #endif /* _KERN_SCHED_PRIM_H_ */ diff --git a/osfmk/kern/sched_proto.c b/osfmk/kern/sched_proto.c new file mode 100644 index 000000000..e31cb0590 --- /dev/null +++ b/osfmk/kern/sched_proto.c @@ -0,0 +1,597 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include + +static void +sched_proto_init(void); + +static void +sched_proto_timebase_init(void); + +static void +sched_proto_processor_init(processor_t processor); + +static void +sched_proto_pset_init(processor_set_t pset); + +static void +sched_proto_maintenance_continuation(void); + +static thread_t +sched_proto_choose_thread(processor_t processor, + int priority); + +static thread_t +sched_proto_steal_thread(processor_set_t pset); + +static void +sched_proto_compute_priority(thread_t thread, + boolean_t override_depress); + +static processor_t +sched_proto_choose_processor( processor_set_t pset, + processor_t processor, + thread_t thread); + + +static boolean_t +sched_proto_processor_enqueue( + processor_t processor, + thread_t thread, + integer_t options); + +static void +sched_proto_processor_queue_shutdown( + processor_t processor); + +static boolean_t +sched_proto_processor_queue_remove( + processor_t processor, + thread_t thread); + +static boolean_t +sched_proto_processor_queue_empty(processor_t processor); + +static boolean_t +sched_proto_processor_queue_has_priority(processor_t processor, + int priority, + boolean_t gte); + +static boolean_t +sched_proto_priority_is_urgent(int priority); + +static ast_t +sched_proto_processor_csw_check(processor_t processor); + +static uint32_t +sched_proto_initial_quantum_size(thread_t thread); + +static sched_mode_t +sched_proto_initial_thread_sched_mode(task_t parent_task); + +static boolean_t +sched_proto_supports_timeshare_mode(void); + +static boolean_t +sched_proto_can_update_priority(thread_t thread); + +static void +sched_proto_update_priority(thread_t 
thread); + +static void +sched_proto_lightweight_update_priority(thread_t thread); + +static void +sched_proto_quantum_expire(thread_t thread); + +static boolean_t +sched_proto_should_current_thread_rechoose_processor(processor_t processor); + +static int +sched_proto_processor_runq_count(processor_t processor); + +static uint64_t +sched_proto_processor_runq_stats_count_sum(processor_t processor); + +const struct sched_dispatch_table sched_proto_dispatch = { + sched_proto_init, + sched_proto_timebase_init, + sched_proto_processor_init, + sched_proto_pset_init, + sched_proto_maintenance_continuation, + sched_proto_choose_thread, + sched_proto_steal_thread, + sched_proto_compute_priority, + sched_proto_choose_processor, + sched_proto_processor_enqueue, + sched_proto_processor_queue_shutdown, + sched_proto_processor_queue_remove, + sched_proto_processor_queue_empty, + sched_proto_priority_is_urgent, + sched_proto_processor_csw_check, + sched_proto_processor_queue_has_priority, + sched_proto_initial_quantum_size, + sched_proto_initial_thread_sched_mode, + sched_proto_supports_timeshare_mode, + sched_proto_can_update_priority, + sched_proto_update_priority, + sched_proto_lightweight_update_priority, + sched_proto_quantum_expire, + sched_proto_should_current_thread_rechoose_processor, + sched_proto_processor_runq_count, + sched_proto_processor_runq_stats_count_sum, + sched_traditional_fairshare_init, + sched_traditional_fairshare_runq_count, + sched_traditional_fairshare_runq_stats_count_sum, + sched_traditional_fairshare_enqueue, + sched_traditional_fairshare_dequeue, + sched_traditional_fairshare_queue_remove, + TRUE /* direct_dispatch_to_idle_processors */ +}; + +static struct run_queue *global_runq; +static struct run_queue global_runq_storage; + +#define GLOBAL_RUNQ ((processor_t)-2) +decl_simple_lock_data(static,global_runq_lock); + +extern int max_unsafe_quanta; + +static uint32_t proto_quantum_us; +static uint32_t proto_quantum; + +static uint32_t 
runqueue_generation; + +static processor_t proto_processor; + +static uint64_t sched_proto_tick_deadline; +static uint32_t sched_proto_tick; + +static void +sched_proto_init(void) +{ + proto_quantum_us = 10*1000; + + printf("standard proto timeslicing quantum is %d us\n", proto_quantum_us); + + simple_lock_init(&global_runq_lock, 0); + global_runq = &global_runq_storage; + run_queue_init(global_runq); + runqueue_generation = 0; + + proto_processor = master_processor; +} + +static void +sched_proto_timebase_init(void) +{ + uint64_t abstime; + + /* standard timeslicing quantum */ + clock_interval_to_absolutetime_interval( + proto_quantum_us, NSEC_PER_USEC, &abstime); + assert((abstime >> 32) == 0 && (uint32_t)abstime != 0); + proto_quantum = (uint32_t)abstime; + + thread_depress_time = 1 * proto_quantum; + default_timeshare_computation = proto_quantum / 2; + default_timeshare_constraint = proto_quantum; + + max_unsafe_computation = max_unsafe_quanta * proto_quantum; + sched_safe_duration = 2 * max_unsafe_quanta * proto_quantum; + +} + +static void +sched_proto_processor_init(processor_t processor __unused) +{ + /* No per-processor state */ +} + +static void +sched_proto_pset_init(processor_set_t pset __unused) +{ +} + +static void +sched_proto_maintenance_continuation(void) +{ + uint64_t abstime = mach_absolute_time(); + + sched_proto_tick++; + + /* Every 8 seconds, switch to another processor */ + if ((sched_proto_tick & 0x7) == 0) { + processor_t new_processor; + + new_processor = proto_processor->processor_list; + if (new_processor == PROCESSOR_NULL) + proto_processor = master_processor; + else + proto_processor = new_processor; + } + + + /* + * Compute various averages. 
+ */ + compute_averages(); + + if (sched_proto_tick_deadline == 0) + sched_proto_tick_deadline = abstime; + + clock_deadline_for_periodic_event(sched_one_second_interval, abstime, + &sched_proto_tick_deadline); + + assert_wait_deadline((event_t)sched_proto_maintenance_continuation, THREAD_UNINT, sched_proto_tick_deadline); + thread_block((thread_continue_t)sched_proto_maintenance_continuation); + /*NOTREACHED*/ +} + +static thread_t +sched_proto_choose_thread(processor_t processor, + int priority) +{ + run_queue_t rq = global_runq; + queue_t queue; + int pri, count; + thread_t thread; + + + simple_lock(&global_runq_lock); + + queue = rq->queues + rq->highq; /* start the scan at the highest occupied priority level */ + pri = rq->highq; + count = rq->count; /* number of queued threads still to be examined */ + + /* + * Since we don't depress priorities, a high priority thread + * may get selected over and over again. Put a runqueue + * generation number in the thread structure so that we + * can ensure that we've cycled through all runnable tasks + * before coming back to a high priority thread. This isn't + * perfect, especially if the number of runnable threads always + * stays high, but is a workable approximation + */ + + while (count > 0 && pri >= priority) { + thread = (thread_t)queue_first(queue); + while (!queue_end(queue, (queue_entry_t)thread)) { + if ((thread->bound_processor == PROCESSOR_NULL || + thread->bound_processor == processor) && + runqueue_generation != thread->runqueue_generation) { + remqueue((queue_entry_t)thread); + + thread->runq = PROCESSOR_NULL; + thread->runqueue_generation = runqueue_generation; /* stamp the thread so it is skipped until the generation advances */ + SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); + rq->count--; /* NOTE(review): rq->urgency is not adjusted on this dequeue path, while sched_proto_processor_queue_remove decrements it for urgent priorities -- confirm this is intended */ + if (queue_empty(queue)) { + if (pri != IDLEPRI) + clrbit(MAXPRI - pri, rq->bitmap); + rq->highq = MAXPRI - ffsbit(rq->bitmap); + } + + simple_unlock(&global_runq_lock); + return (thread); + } + count--; /* this thread was examined and rejected */ + + thread = (thread_t)queue_next((queue_entry_t)thread); + } + + queue--; pri--; /* step down to the next lower priority level */ + } + + runqueue_generation++; /* nothing eligible was found: advance the generation so already-stamped threads no longer match it */ + + simple_unlock(&global_runq_lock); + return (THREAD_NULL); +} + 
+static thread_t +sched_proto_steal_thread(processor_set_t pset) +{ + pset_unlock(pset); + + return (THREAD_NULL); + +} + +static void +sched_proto_compute_priority(thread_t thread, + boolean_t override_depress __unused) +{ + set_sched_pri(thread, thread->priority); +} + +static processor_t +sched_proto_choose_processor( processor_set_t pset, + processor_t processor, + thread_t thread __unused) +{ + processor = proto_processor; + + /* + * Check that the correct processor set is + * returned locked. + */ + if (pset != processor->processor_set) { + pset_unlock(pset); + + pset = processor->processor_set; + pset_lock(pset); + } + + return (processor); +} + +static boolean_t +sched_proto_processor_enqueue( + processor_t processor __unused, + thread_t thread, + integer_t options) +{ + run_queue_t rq = global_runq; + boolean_t result; + + simple_lock(&global_runq_lock); + result = run_queue_enqueue(rq, thread, options); + thread->runq = GLOBAL_RUNQ; + simple_unlock(&global_runq_lock); + + return (result); +} + +static void +sched_proto_processor_queue_shutdown( + processor_t processor) +{ + /* With a global runqueue, just stop choosing this processor */ + (void)processor; +} + +static boolean_t +sched_proto_processor_queue_remove( + processor_t processor, + thread_t thread) +{ + void * rqlock; + run_queue_t rq; + + rqlock = &global_runq_lock; + rq = global_runq; + + simple_lock(rqlock); + if (processor == thread->runq) { + /* + * Thread is on a run queue and we have a lock on + * that run queue. 
+ */ + remqueue((queue_entry_t)thread); + SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count); + rq->count--; + if (SCHED(priority_is_urgent)(thread->sched_pri)) { + rq->urgency--; assert(rq->urgency >= 0); + } + + if (queue_empty(rq->queues + thread->sched_pri)) { + /* update run queue status */ + if (thread->sched_pri != IDLEPRI) + clrbit(MAXPRI - thread->sched_pri, rq->bitmap); + rq->highq = MAXPRI - ffsbit(rq->bitmap); + } + + thread->runq = PROCESSOR_NULL; + } + else { + /* + * The thread left the run queue before we could + * lock the run queue. + */ + assert(thread->runq == PROCESSOR_NULL); + processor = PROCESSOR_NULL; + } + + simple_unlock(rqlock); + + return (processor != PROCESSOR_NULL); +} + +static boolean_t +sched_proto_processor_queue_empty(processor_t processor __unused) +{ + boolean_t result; + + result = (global_runq->count == 0); + + return result; +} + +static boolean_t +sched_proto_processor_queue_has_priority(processor_t processor __unused, + int priority, + boolean_t gte) +{ + boolean_t result; + + simple_lock(&global_runq_lock); /* highq is read under the global run queue lock */ + + if (gte) /* caller asks for "greater than or equal" */ + result = global_runq->highq >= priority; + else /* caller asks for "strictly greater" */ + result = global_runq->highq > priority; + + simple_unlock(&global_runq_lock); + + return result; +} + +/* Implement sched_preempt_pri in code */ +static boolean_t +sched_proto_priority_is_urgent(int priority) +{ + if (priority <= BASEPRI_FOREGROUND) + return FALSE; + + if (priority < MINPRI_KERNEL) + return TRUE; + + if (priority >= BASEPRI_PREEMPT) + return TRUE; + + return FALSE; +} + +static ast_t +sched_proto_processor_csw_check(processor_t processor __unused) +{ + run_queue_t runq; + int count, urgency; + + runq = global_runq; + count = runq->count; + urgency = runq->urgency; + + if (count > 0) { + if (urgency > 0) + return (AST_PREEMPT | AST_URGENT); + + return AST_PREEMPT; + } + + return AST_NONE; +} + +static uint32_t +sched_proto_initial_quantum_size(thread_t thread __unused) +{ + return proto_quantum; +} + +static sched_mode_t 
+sched_proto_initial_thread_sched_mode(task_t parent_task) +{ + if (parent_task == kernel_task) + return TH_MODE_FIXED; + else + return TH_MODE_TIMESHARE; +} + +static boolean_t +sched_proto_supports_timeshare_mode(void) +{ + return TRUE; +} + +static boolean_t +sched_proto_can_update_priority(thread_t thread __unused) +{ + return FALSE; +} + +static void +sched_proto_update_priority(thread_t thread __unused) +{ + +} + +static void +sched_proto_lightweight_update_priority(thread_t thread __unused) +{ + +} + +static void +sched_proto_quantum_expire(thread_t thread __unused) +{ + +} + +static boolean_t +sched_proto_should_current_thread_rechoose_processor(processor_t processor) +{ + return (proto_processor != processor); +} + +static int +sched_proto_processor_runq_count(processor_t processor) +{ + if (master_processor == processor) { + return global_runq->count; + } else { + return 0; + } +} + +static uint64_t +sched_proto_processor_runq_stats_count_sum(processor_t processor) +{ + if (master_processor == processor) { + return global_runq->runq_stats.count_sum; + } else { + return 0ULL; + } +} + diff --git a/osfmk/kern/stack.c b/osfmk/kern/stack.c index a59122569..6b5ea8302 100644 --- a/osfmk/kern/stack.c +++ b/osfmk/kern/stack.c @@ -63,6 +63,9 @@ static vm_offset_t stack_free_list; static unsigned int stack_free_count, stack_free_hiwat; /* free list count */ static unsigned int stack_hiwat; unsigned int stack_total; /* current total count */ +unsigned long long stack_allocs; /* total count of allocations */ + +static int stack_fake_zone_index = -1; /* index in zone_info array */ static unsigned int stack_free_target; static int stack_free_delta; @@ -76,6 +79,51 @@ vm_offset_t kernel_stack_size = KERNEL_STACK_SIZE; vm_offset_t kernel_stack_mask = -KERNEL_STACK_SIZE; vm_offset_t kernel_stack_depth_max = 0; +static inline void +STACK_ZINFO_PALLOC(thread_t thread) +{ + task_t task; + zinfo_usage_t zinfo; + + thread->tkm_private.alloc += kernel_stack_size; + if 
(stack_fake_zone_index != -1 && + (task = thread->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) + OSAddAtomic64(kernel_stack_size, + (int64_t *)&zinfo[stack_fake_zone_index].alloc); +} + +static inline void +STACK_ZINFO_PFREE(thread_t thread) +{ + task_t task; + zinfo_usage_t zinfo; + + thread->tkm_private.free += kernel_stack_size; + if (stack_fake_zone_index != -1 && + (task = thread->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) + OSAddAtomic64(kernel_stack_size, + (int64_t *)&zinfo[stack_fake_zone_index].free); +} + +static inline void +STACK_ZINFO_HANDOFF(thread_t from, thread_t to) +{ + from->tkm_private.free += kernel_stack_size; + to->tkm_private.alloc += kernel_stack_size; + if (stack_fake_zone_index != -1) { + task_t task; + zinfo_usage_t zinfo; + + if ((task = from->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) + OSAddAtomic64(kernel_stack_size, + (int64_t *)&zinfo[stack_fake_zone_index].free); + + if ((task = to->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) + OSAddAtomic64(kernel_stack_size, + (int64_t *)&zinfo[stack_fake_zone_index].alloc); + } +} + /* * The next field is at the base of the stack, * so the low end is left unsullied. @@ -98,6 +146,9 @@ roundup_pow2(vm_offset_t size) return 1UL << (log2(size - 1) + 1); } +static vm_offset_t stack_alloc_internal(void); +static void stack_free_stack(vm_offset_t); + void stack_init(void) { @@ -125,18 +176,17 @@ stack_init(void) * Allocate a stack for a thread, may * block. 
*/ -void -stack_alloc( - thread_t thread) + +static vm_offset_t +stack_alloc_internal(void) { vm_offset_t stack; spl_t s; int guard_flags; - assert(thread->kernel_stack == 0); - s = splsched(); stack_lock(); + stack_allocs++; stack = stack_free_list; if (stack != 0) { stack_free_list = stack_next(stack); @@ -174,8 +224,25 @@ stack_alloc( stack += PAGE_SIZE; } + return stack; +} - machine_stack_attach(thread, stack); +void +stack_alloc( + thread_t thread) +{ + + assert(thread->kernel_stack == 0); + machine_stack_attach(thread, stack_alloc_internal()); + STACK_ZINFO_PALLOC(thread); +} + +void +stack_handoff(thread_t from, thread_t to) +{ + assert(from == current_thread()); + machine_stack_handoff(from, to); + STACK_ZINFO_HANDOFF(from, to); } /* @@ -190,11 +257,23 @@ stack_free( vm_offset_t stack = machine_stack_detach(thread); assert(stack); - if (stack != thread->reserved_stack) + if (stack != thread->reserved_stack) { + STACK_ZINFO_PFREE(thread); stack_free_stack(stack); + } } void +stack_free_reserved( + thread_t thread) +{ + if (thread->reserved_stack != thread->kernel_stack) { + stack_free_stack(thread->reserved_stack); + STACK_ZINFO_PFREE(thread); + } +} + +static void stack_free_stack( vm_offset_t stack) { @@ -240,6 +319,7 @@ stack_alloc_try( cache = &PROCESSOR_DATA(current_processor(), stack_cache); stack = cache->free; if (stack != 0) { + STACK_ZINFO_PALLOC(thread); cache->free = stack_next(stack); cache->count--; } @@ -248,6 +328,7 @@ stack_alloc_try( stack_lock(); stack = stack_free_list; if (stack != 0) { + STACK_ZINFO_PALLOC(thread); stack_free_list = stack_next(stack); stack_free_count--; stack_free_delta--; @@ -360,14 +441,23 @@ __unused void *arg) } void -stack_fake_zone_info(int *count, vm_size_t *cur_size, vm_size_t *max_size, vm_size_t *elem_size, - vm_size_t *alloc_size, int *collectable, int *exhaustable) +stack_fake_zone_init(int zone_index) +{ + stack_fake_zone_index = zone_index; +} + +void +stack_fake_zone_info(int *count, + vm_size_t 
*cur_size, vm_size_t *max_size, vm_size_t *elem_size, vm_size_t *alloc_size, + uint64_t *sum_size, int *collectable, int *exhaustable, int *caller_acct) { unsigned int total, hiwat, free; + unsigned long long all; spl_t s; s = splsched(); stack_lock(); + all = stack_allocs; total = stack_total; hiwat = stack_hiwat; free = stack_free_count; @@ -379,8 +469,11 @@ stack_fake_zone_info(int *count, vm_size_t *cur_size, vm_size_t *max_size, vm_si *max_size = kernel_stack_size * hiwat; *elem_size = kernel_stack_size; *alloc_size = kernel_stack_size; + *sum_size = all * kernel_stack_size; + *collectable = 1; *exhaustable = 0; + *caller_acct = 1; } /* OBSOLETE */ diff --git a/osfmk/kern/startup.c b/osfmk/kern/startup.c index fb673da76..47290e3d8 100644 --- a/osfmk/kern/startup.c +++ b/osfmk/kern/startup.c @@ -105,6 +105,7 @@ #include #include #include +#include #if MACH_KDP #include @@ -118,11 +119,6 @@ #include #endif -#ifdef __ppc__ -#include -#include -#endif - static void kernel_bootstrap_thread(void); static void load_context( @@ -135,36 +131,58 @@ extern void cpu_physwindow_init(int); // libkern/OSKextLib.cpp extern void OSKextRemoveKextBootstrap(void); -void srv_setup(void); -extern void bsd_srv_setup(int); +void scale_setup(void); +extern void bsd_scale_setup(int); extern unsigned int semaphore_max; - /* * Running in virtual memory, on the interrupt stack. */ -extern int srv; +extern int serverperfmode; + +/* size of kernel trace buffer, disabled by default */ +unsigned int new_nkdbufs = 0; + +/* mach leak logging */ +int log_leaks = 0; +int turn_on_log_leaks = 0; + + +void +kernel_early_bootstrap(void) +{ + + lck_mod_init(); + + /* + * Initialize the timer callout world + */ + timer_call_initialize(); +} + void kernel_bootstrap(void) { kern_return_t result; - thread_t thread; + thread_t thread; + char namep[16]; printf("%s\n", version); /* log kernel version */ #define kernel_bootstrap_kprintf(x...) 
/* kprintf("kernel_bootstrap: " x) */ - /* i386_vm_init already checks for this ; do it aagin anyway */ - if (PE_parse_boot_argn("srv", &srv, sizeof (srv))) { - srv = 1; - } + if (PE_parse_boot_argn("-l", namep, sizeof (namep))) /* leaks logging */ + turn_on_log_leaks = 1; - srv_setup(); + PE_parse_boot_argn("trace", &new_nkdbufs, sizeof (new_nkdbufs)); - kernel_bootstrap_kprintf("calling lck_mod_init\n"); - lck_mod_init(); + /* i386_vm_init already checks for this ; do it again anyway */ + if (PE_parse_boot_argn("serverperfmode", &serverperfmode, sizeof (serverperfmode))) { + serverperfmode = 1; + } + scale_setup(); kernel_bootstrap_kprintf("calling vm_mem_bootstrap\n"); vm_mem_bootstrap(); @@ -232,6 +250,13 @@ kernel_bootstrap(void) thread->state = TH_RUN; thread_deallocate(thread); + /* transfer statistics from init thread to kernel */ + thread_t init_thread = current_thread(); + kernel_task->tkm_private.alloc = init_thread->tkm_private.alloc; + kernel_task->tkm_private.free = init_thread->tkm_private.free; + kernel_task->tkm_shared.alloc = init_thread->tkm_shared.alloc; + kernel_task->tkm_shared.free = init_thread->tkm_shared.free; + kernel_bootstrap_kprintf("calling load_context - done\n"); load_context(thread); /*NOTREACHED*/ @@ -264,6 +289,18 @@ kernel_bootstrap_thread(void) kernel_bootstrap_thread_kprintf("calling sched_startup\n"); sched_startup(); + /* + * Thread lifecycle maintenance (teardown, stack allocation) + */ + kernel_bootstrap_thread_kprintf("calling thread_daemon_init\n"); + thread_daemon_init(); + + /* + * Thread callout service. + */ + kernel_bootstrap_thread_kprintf("calling thread_call_initialize\n"); + thread_call_initialize(); + /* * Remain on current processor as * additional processors come online. 
@@ -307,6 +344,14 @@ kernel_bootstrap_thread(void) pmc_bootstrap(); #endif +#if (defined(__i386__) || defined(__x86_64__)) + if (turn_on_log_leaks && !new_nkdbufs) + new_nkdbufs = 200000; + start_kern_tracing(new_nkdbufs); + if (turn_on_log_leaks) + log_leaks = 1; +#endif + #ifdef IOKIT PE_init_iokit(); #endif @@ -324,6 +369,14 @@ kernel_bootstrap_thread(void) cpu_userwindow_init(0); #endif +#if (!defined(__i386__) && !defined(__x86_64__)) + if (turn_on_log_leaks && !new_nkdbufs) + new_nkdbufs = 200000; + start_kern_tracing(new_nkdbufs); + if (turn_on_log_leaks) + log_leaks = 1; +#endif + /* * Initialize the shared region module. */ @@ -459,6 +512,7 @@ load_context( processor->active_thread = thread; processor->current_pri = thread->sched_pri; + processor->current_thmode = thread->sched_mode; processor->deadline = UINT64_MAX; thread->last_processor = processor; @@ -477,23 +531,32 @@ load_context( } void -srv_setup() +scale_setup() { int scale = 0; #if defined(__LP64__) - /* if memory is more than 16G, then apply rules for processes */ - if ((srv != 0) && ((uint64_t)sane_size >= (uint64_t)(16 * 1024 * 1024 *1024ULL))) { + typeof(task_max) task_max_base = task_max; + + /* Raise limits for servers with >= 16G */ + if ((serverperfmode != 0) && ((uint64_t)sane_size >= (uint64_t)(16 * 1024 * 1024 *1024ULL))) { scale = (int)((uint64_t)sane_size / (uint64_t)(8 * 1024 * 1024 *1024ULL)); /* limit to 128 G */ if (scale > 16) scale = 16; - task_max = 2500 * scale; + task_max_base = 2500; + } else if ((uint64_t)sane_size >= (uint64_t)(3 * 1024 * 1024 *1024ULL)) + scale = 2; + + task_max = MAX(task_max, task_max_base * scale); + + if (scale != 0) { task_threadmax = task_max; - thread_max = task_max * 5; - } else - scale = 0; + thread_max = task_max * 5; + } + #endif - bsd_srv_setup(scale); + + bsd_scale_setup(scale); ipc_space_max = SPACE_MAX; ipc_tree_entry_max = ITE_MAX; diff --git a/osfmk/kern/startup.h b/osfmk/kern/startup.h index bb60c7d40..7c239784f 100644 --- 
a/osfmk/kern/startup.h +++ b/osfmk/kern/startup.h @@ -42,6 +42,7 @@ __BEGIN_DECLS */ /* Initialize kernel */ +extern void kernel_early_bootstrap(void) __attribute__((section("__TEXT, initcode"))); extern void kernel_bootstrap(void) __attribute__((section("__TEXT, initcode"))); /* Initialize machine dependent stuff */ diff --git a/osfmk/kern/sync_lock.c b/osfmk/kern/sync_lock.c index 174381f5f..b69958ad7 100644 --- a/osfmk/kern/sync_lock.c +++ b/osfmk/kern/sync_lock.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -72,12 +72,10 @@ th = (ul)->holder; \ if ((th)->active) { \ thread_mtx_lock(th); \ - remqueue(&th->held_ulocks, \ - (queue_entry_t) (ul)); \ + remqueue((queue_entry_t) (ul)); \ thread_mtx_unlock(th); \ } else { \ - remqueue(&th->held_ulocks, \ - (queue_entry_t) (ul)); \ + remqueue((queue_entry_t) (ul)); \ } \ (ul)->holder = THREAD_NULL; \ MACRO_END @@ -98,7 +96,7 @@ #define lock_set_ownership_clear(ls, t) \ MACRO_BEGIN \ task_lock((t)); \ - remqueue(&(t)->lock_set_list, (queue_entry_t) (ls)); \ + remqueue((queue_entry_t) (ls)); \ (t)->lock_sets_owned--; \ task_unlock((t)); \ MACRO_END diff --git a/osfmk/kern/sync_sema.c b/osfmk/kern/sync_sema.c index a072684ad..80ffb8199 100644 --- a/osfmk/kern/sync_sema.c +++ b/osfmk/kern/sync_sema.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2009 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -241,7 +241,7 @@ semaphore_destroy( task_unlock(task); return KERN_INVALID_ARGUMENT; } - remqueue(&task->semaphore_list, (queue_entry_t) semaphore); + remqueue((queue_entry_t) semaphore); semaphore->owner = TASK_NULL; task->semaphores_owned--; task_unlock(task); diff --git a/osfmk/kern/syscall_subr.c b/osfmk/kern/syscall_subr.c index 3daf1ec38..e45f99f17 100644 --- a/osfmk/kern/syscall_subr.c +++ b/osfmk/kern/syscall_subr.c @@ -115,7 +115,7 @@ swtch_continue(void) disable_preemption(); myprocessor = current_processor(); - result = myprocessor->runq.count > 0 || rt_runq.count > 0; + result = !SCHED(processor_queue_empty)(myprocessor) || rt_runq.count > 0; enable_preemption(); thread_syscall_return(result); @@ -131,7 +131,7 @@ swtch( disable_preemption(); myprocessor = current_processor(); - if (myprocessor->runq.count == 0 && rt_runq.count == 0) { + if (SCHED(processor_queue_empty)(myprocessor) && rt_runq.count == 0) { mp_enable_preemption(); return (FALSE); @@ -144,7 +144,7 @@ swtch( disable_preemption(); myprocessor = current_processor(); - result = myprocessor->runq.count > 0 || rt_runq.count > 0; + result = !SCHED(processor_queue_empty)(myprocessor) || rt_runq.count > 0; enable_preemption(); return (result); @@ -160,7 +160,7 @@ swtch_pri_continue(void) disable_preemption(); myprocessor = current_processor(); - result = myprocessor->runq.count > 0 || rt_runq.count > 0; + result = !SCHED(processor_queue_empty)(myprocessor) || rt_runq.count > 0; mp_enable_preemption(); thread_syscall_return(result); @@ -176,7 +176,7 @@ __unused struct swtch_pri_args *args) disable_preemption(); myprocessor = current_processor(); - if (myprocessor->runq.count == 0 && rt_runq.count == 0) { + if (SCHED(processor_queue_empty)(myprocessor) && rt_runq.count == 0) { mp_enable_preemption(); return (FALSE); @@ -185,7 +185,7 @@ __unused struct swtch_pri_args *args) counter(c_swtch_pri_block++); - thread_depress_abstime(std_quantum); + 
thread_depress_abstime(thread_depress_time); thread_block_reason((thread_continue_t)swtch_pri_continue, NULL, AST_YIELD); @@ -193,7 +193,7 @@ __unused struct swtch_pri_args *args) disable_preemption(); myprocessor = current_processor(); - result = myprocessor->runq.count > 0 || rt_runq.count > 0; + result = !SCHED(processor_queue_empty)(myprocessor) || rt_runq.count > 0; enable_preemption(); return (result); @@ -290,7 +290,7 @@ thread_switch( thread->sched_pri < BASEPRI_RTQUEUES && (thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor) && - run_queue_remove(thread) ) { + thread_run_queue_remove(thread) ) { /* * Hah, got it!! */ @@ -347,16 +347,16 @@ thread_depress_abstime( s = splsched(); thread_lock(self); - if (!(self->sched_mode & TH_MODE_ISDEPRESSED)) { + if (!(self->sched_flags & TH_SFLAG_DEPRESSED_MASK)) { processor_t myprocessor = self->last_processor; self->sched_pri = DEPRESSPRI; myprocessor->current_pri = self->sched_pri; - self->sched_mode |= TH_MODE_DEPRESS; + self->sched_flags |= TH_SFLAG_DEPRESS; if (interval != 0) { clock_absolutetime_interval_to_deadline(interval, &deadline); - if (!timer_call_enter(&self->depress_timer, deadline)) + if (!timer_call_enter(&self->depress_timer, deadline, 0)) self->depress_timer_active++; } } @@ -389,8 +389,8 @@ thread_depress_expire( s = splsched(); thread_lock(thread); if (--thread->depress_timer_active == 0) { - thread->sched_mode &= ~TH_MODE_ISDEPRESSED; - compute_priority(thread, FALSE); + thread->sched_flags &= ~TH_SFLAG_DEPRESSED_MASK; + SCHED(compute_priority)(thread, FALSE); } thread_unlock(thread); splx(s); @@ -408,10 +408,10 @@ thread_depress_abort_internal( s = splsched(); thread_lock(thread); - if (!(thread->sched_mode & TH_MODE_POLLDEPRESS)) { - if (thread->sched_mode & TH_MODE_ISDEPRESSED) { - thread->sched_mode &= ~TH_MODE_ISDEPRESSED; - compute_priority(thread, FALSE); + if (!(thread->sched_flags & TH_SFLAG_POLLDEPRESS)) { + if (thread->sched_flags & 
TH_SFLAG_DEPRESSED_MASK) { + thread->sched_flags &= ~TH_SFLAG_DEPRESSED_MASK; + SCHED(compute_priority)(thread, FALSE); result = KERN_SUCCESS; } @@ -433,7 +433,7 @@ thread_poll_yield( assert(self == current_thread()); s = splsched(); - if (!(self->sched_mode & (TH_MODE_REALTIME|TH_MODE_TIMESHARE))) { + if (self->sched_mode == TH_MODE_FIXED) { uint64_t total_computation, abstime; abstime = mach_absolute_time(); @@ -444,16 +444,16 @@ thread_poll_yield( ast_t preempt; thread_lock(self); - if (!(self->sched_mode & TH_MODE_ISDEPRESSED)) { + if (!(self->sched_flags & TH_SFLAG_DEPRESSED_MASK)) { self->sched_pri = DEPRESSPRI; myprocessor->current_pri = self->sched_pri; } self->computation_epoch = abstime; self->computation_metered = 0; - self->sched_mode |= TH_MODE_POLLDEPRESS; + self->sched_flags |= TH_SFLAG_POLLDEPRESS; abstime += (total_computation >> sched_poll_yield_shift); - if (!timer_call_enter(&self->depress_timer, abstime)) + if (!timer_call_enter(&self->depress_timer, abstime, 0)) self->depress_timer_active++; thread_unlock(self); @@ -473,7 +473,7 @@ thread_yield_internal( disable_preemption(); myprocessor = current_processor(); - if (myprocessor->runq.count == 0 && rt_runq.count == 0) { + if (SCHED(processor_queue_empty)(myprocessor) && rt_runq.count == 0) { mp_enable_preemption(); return; diff --git a/osfmk/kern/syscall_sw.c b/osfmk/kern/syscall_sw.c index 59a402aa3..7dc2d61fd 100644 --- a/osfmk/kern/syscall_sw.c +++ b/osfmk/kern/syscall_sw.c @@ -135,7 +135,11 @@ mach_trap_t mach_trap_table[MACH_TRAP_TABLE_COUNT] = { /* 40 */ MACH_TRAP(kern_invalid, 0, NULL, NULL), /* 41 */ MACH_TRAP(kern_invalid, 0, NULL, NULL), /* 42 */ MACH_TRAP(kern_invalid, 0, NULL, NULL), +#if !defined(CONFIG_EMBEDDED) /* 43 */ MACH_TRAP(map_fd, 5, munge_wwwww, munge_ddddd), +#else +/* 43 */ MACH_TRAP(kern_invalid, 0, NULL, NULL), +#endif /* !defined(CONFIG_EMBEDDED) */ /* 44 */ MACH_TRAP(task_name_for_pid, 3, munge_www, munge_ddd), /* 45 */ MACH_TRAP(task_for_pid, 3, munge_www, 
munge_ddd), /* 46 */ MACH_TRAP(pid_for_task, 2, munge_ww,munge_dd), diff --git a/osfmk/kern/syscall_sw.h b/osfmk/kern/syscall_sw.h index c6259eeb0..d186546d5 100644 --- a/osfmk/kern/syscall_sw.h +++ b/osfmk/kern/syscall_sw.h @@ -71,17 +71,13 @@ typedef void mach_munge_t(const void *, void *); typedef struct { int mach_trap_arg_count; int (*mach_trap_function)(void); -#if defined(__i386__) - boolean_t mach_trap_stack; -#else +#if 0 /* no active architectures use mungers for mach traps */ mach_munge_t *mach_trap_arg_munge32; /* system call arguments for 32-bit */ mach_munge_t *mach_trap_arg_munge64; /* system call arguments for 64-bit */ #endif -#if !MACH_ASSERT - int mach_trap_unused; -#else +#if MACH_ASSERT const char* mach_trap_name; -#endif /* !MACH_ASSERT */ +#endif /* MACH_ASSERT */ } mach_trap_t; #define MACH_TRAP_TABLE_COUNT 128 @@ -90,23 +86,16 @@ typedef struct { extern mach_trap_t mach_trap_table[]; extern int mach_trap_count; -#if defined(__i386__) -#if !MACH_ASSERT -#define MACH_TRAP(name, arg_count, munge32, munge64) \ - { (arg_count), (int (*)(void)) (name), FALSE, 0 } -#else -#define MACH_TRAP(name, arg_count, munge32, munge64) \ - { (arg_count), (int (*)(void)) (name), FALSE, #name } -#endif /* !MACH_ASSERT */ -#else /* !defined(__i386__) */ +#if defined(__i386__) || defined(__x86_64__) #if !MACH_ASSERT #define MACH_TRAP(name, arg_count, munge32, munge64) \ - { (arg_count), (int (*)(void)) (name), (munge32), (munge64), 0 } + { (arg_count), (int (*)(void)) (name) } #else #define MACH_TRAP(name, arg_count, munge32, munge64) \ - { (arg_count), (int (*)(void)) (name), (munge32), (munge64), #name } + { (arg_count), (int (*)(void)) (name), #name } #endif /* !MACH_ASSERT */ - -#endif /* !defined(__i386__) */ +#else /* !defined(__i386__) && !defined(__x86_64__) && !defined(__arm__) */ +#error Unsupported architecture +#endif /* !defined(__i386__) && !defined(__x86_64__) && !defined(__arm__) */ #endif /* _KERN_SYSCALL_SW_H_ */ diff --git a/osfmk/kern/task.c 
b/osfmk/kern/task.c index c5efca2a7..985f3c144 100644 --- a/osfmk/kern/task.c +++ b/osfmk/kern/task.c @@ -131,12 +131,6 @@ #include #endif /* MACH_KDB */ -#ifdef __ppc__ -#include -#include -#endif - - /* * Exported interfaces */ @@ -163,8 +157,14 @@ lck_attr_t task_lck_attr; lck_grp_t task_lck_grp; lck_grp_attr_t task_lck_grp_attr; +zinfo_usage_store_t tasks_tkm_private; +zinfo_usage_store_t tasks_tkm_shared; + int task_max = CONFIG_TASK_MAX; /* Max number of tasks */ +/* externs for BSD kernel */ +extern void proc_getexecutableuuid(void *, unsigned char *, unsigned long); + /* Forwards */ void task_hold_locked( @@ -226,17 +226,6 @@ task_set_64bit( (vm_map_offset_t) VM_MAX_ADDRESS, MACH_VM_MAX_ADDRESS, 0); -#ifdef __ppc__ - /* - * PPC51: ppc64 is limited to 51-bit addresses. - * Memory mapped above that limit is handled specially - * at the pmap level, so let pmap clean the commpage mapping - * explicitly... - */ - pmap_unmap_sharedpage(task->map->pmap); /* Unmap commpage */ - /* ... and avoid regular pmap cleanup */ - vm_flags |= VM_MAP_REMOVE_NO_PMAP_CLEANUP; -#endif /* __ppc__ */ /* remove the higher VM mappings */ (void) vm_map_remove(task->map, MACH_VM_MAX_ADDRESS, @@ -285,6 +274,7 @@ task_init(void) task_max * sizeof(struct task), TASK_CHUNK * sizeof(struct task), "tasks"); + zone_change(task_zone, Z_NOENCRYPT, TRUE); /* @@ -409,6 +399,13 @@ task_create_internal( new_task->taskFeatures[0] = 0; /* Init task features */ new_task->taskFeatures[1] = 0; /* Init task features */ + new_task->tkm_private.alloc = 0; + new_task->tkm_private.free = 0; + new_task->tkm_shared.alloc = 0; + new_task->tkm_shared.free = 0; + + zinfo_task_init(new_task); + #ifdef MACH_BSD new_task->bsd_info = NULL; #endif /* MACH_BSD */ @@ -416,12 +413,8 @@ task_create_internal( #if defined(__i386__) || defined(__x86_64__) new_task->i386_ldt = 0; new_task->task_debug = NULL; - #endif -#ifdef __ppc__ - if(BootProcInfo.pf.Available & pf64Bit) new_task->taskFeatures[0] |= tf64BitData; /* If 
64-bit machine, show we have 64-bit registers at least */ -#endif queue_init(&new_task->semaphore_list); queue_init(&new_task->lock_set_list); @@ -473,6 +466,16 @@ task_create_internal( task_affinity_create(parent_task, new_task); new_task->pset_hint = parent_task->pset_hint = task_choose_pset(parent_task); + new_task->policystate = parent_task->policystate; + /* inherit the self action state */ + new_task->actionstate = parent_task->actionstate; + new_task->ext_policystate = parent_task->ext_policystate; +#if NOTYET + /* till the child lifecycle is cleared do not inherit external action */ + new_task->ext_actionstate = parent_task->ext_actionstate; +#else + new_task->ext_actionstate = default_task_null_policy; +#endif } else { new_task->sec_token = KERNEL_SECURITY_TOKEN; @@ -483,8 +486,14 @@ task_create_internal( if(is_64bit) task_set_64BitAddr(new_task); #endif + new_task->all_image_info_addr = (mach_vm_address_t)0; + new_task->all_image_info_size = (mach_vm_size_t)0; new_task->pset_hint = PROCESSOR_SET_NULL; + new_task->policystate = default_task_proc_policy; + new_task->ext_policystate = default_task_proc_policy; + new_task->actionstate = default_task_null_policy; + new_task->ext_actionstate = default_task_null_policy; } if (kernel_task == TASK_NULL) { @@ -495,6 +504,8 @@ task_create_internal( new_task->priority = BASEPRI_DEFAULT; new_task->max_priority = MAXPRI_USER; } + + bzero(&new_task->extmod_statistics, sizeof(new_task->extmod_statistics)); lck_mtx_lock(&tasks_threads_lock); queue_enter(&tasks, new_task, task_t, tasks); @@ -525,6 +536,10 @@ task_deallocate( if (task_deallocate_internal(task) > 0) return; + lck_mtx_lock(&tasks_threads_lock); + queue_remove(&terminated_tasks, task, task_t, tasks); + lck_mtx_unlock(&tasks_threads_lock); + ipc_task_terminate(task); if (task->affinity_space) @@ -538,6 +553,11 @@ task_deallocate( #if CONFIG_MACF_MACH labelh_release(task->label); #endif + OSAddAtomic64(task->tkm_private.alloc, (int64_t 
*)&tasks_tkm_private.alloc); + OSAddAtomic64(task->tkm_private.free, (int64_t *)&tasks_tkm_private.free); + OSAddAtomic64(task->tkm_shared.alloc, (int64_t *)&tasks_tkm_shared.alloc); + OSAddAtomic64(task->tkm_shared.free, (int64_t *)&tasks_tkm_shared.free); + zinfo_task_free(task); zfree(task_zone, task); } @@ -603,9 +623,9 @@ task_terminate_internal( task_lock(task); } - if (!task->active || !self->active) { + if (!task->active) { /* - * Task or current act is already being terminated. + * Task is already being terminated. * Just return an error. If we are dying, this will * just get us to our AST special handler and that * will get us to finalize the termination of ourselves. @@ -665,13 +685,6 @@ task_terminate_internal( */ ipc_space_destroy(task->itk_space); -#ifdef __ppc__ - /* - * PPC51: ppc64 is limited to 51-bit addresses. - */ - pmap_unmap_sharedpage(task->map->pmap); /* Unmap commpage */ -#endif /* __ppc__ */ - if (vm_map_has_4GB_pagezero(task->map)) vm_map_clear_4GB_pagezero(task->map); @@ -693,6 +706,7 @@ task_terminate_internal( lck_mtx_lock(&tasks_threads_lock); queue_remove(&tasks, task, task_t, tasks); + queue_enter(&terminated_tasks, task, task_t, tasks); tasks_count--; lck_mtx_unlock(&tasks_threads_lock); @@ -702,10 +716,6 @@ task_terminate_internal( */ thread_interrupt_level(interrupt_save); -#if __ppc__ - perfmon_release_facility(task); // notify the perfmon facility -#endif - /* * Get rid of the task active reference on itself. */ @@ -1162,8 +1172,9 @@ task_resume( } if (task->user_stop_count > 0) { - if (--task->user_stop_count == 0) + if (--task->user_stop_count == 0) { release = TRUE; + } } else { task_unlock(task); @@ -1182,6 +1193,60 @@ task_resume( return (KERN_SUCCESS); } +#if CONFIG_FREEZE + +/* + * task_freeze: + * + * Freeze a currently suspended task. 
+ * + * Conditions: + * The caller holds a reference to the task + */ +kern_return_t +task_freeze( + register task_t task, + uint32_t *purgeable_count, + uint32_t *wired_count, + uint32_t *clean_count, + uint32_t *dirty_count, + boolean_t *shared, + boolean_t walk_only) +{ + if (task == TASK_NULL || task == kernel_task) + return (KERN_INVALID_ARGUMENT); + + if (walk_only) { + vm_map_freeze_walk(task->map, purgeable_count, wired_count, clean_count, dirty_count, shared); + } else { + vm_map_freeze(task->map, purgeable_count, wired_count, clean_count, dirty_count, shared); + } + + return (KERN_SUCCESS); +} + +/* + * task_thaw: + * + * Thaw a currently frozen task. + * + * Conditions: + * The caller holds a reference to the task + */ +kern_return_t +task_thaw( + register task_t task) +{ + if (task == TASK_NULL || task == kernel_task) + return (KERN_INVALID_ARGUMENT); + + vm_map_thaw(task->map); + + return (KERN_SUCCESS); +} + +#endif /* CONFIG_FREEZE */ + kern_return_t host_security_set_task_token( host_security_t host_security, @@ -1439,17 +1504,126 @@ task_info( { task_dyld_info_t info; - if (*task_info_count < TASK_DYLD_INFO_COUNT) { + /* + * We added the format field to TASK_DYLD_INFO output. For + * temporary backward compatibility, accept the fact that + * clients may ask for the old version - distinguished by the + * size of the expected result structure. + */ +#define TASK_LEGACY_DYLD_INFO_COUNT \ + offsetof(struct task_dyld_info, all_image_info_format)/sizeof(natural_t) + + if (*task_info_count < TASK_LEGACY_DYLD_INFO_COUNT) { error = KERN_INVALID_ARGUMENT; break; } + info = (task_dyld_info_t)task_info_out; info->all_image_info_addr = task->all_image_info_addr; info->all_image_info_size = task->all_image_info_size; - *task_info_count = TASK_DYLD_INFO_COUNT; + + /* only set format on output for those expecting it */ + if (*task_info_count >= TASK_DYLD_INFO_COUNT) { + info->all_image_info_format = task_has_64BitAddr(task) ? 
+ TASK_DYLD_ALL_IMAGE_INFO_64 : + TASK_DYLD_ALL_IMAGE_INFO_32 ; + *task_info_count = TASK_DYLD_INFO_COUNT; + } else { + *task_info_count = TASK_LEGACY_DYLD_INFO_COUNT; + } break; } + case TASK_EXTMOD_INFO: + { + task_extmod_info_t info; + void *p; + + if (*task_info_count < TASK_EXTMOD_INFO_COUNT) { + error = KERN_INVALID_ARGUMENT; + break; + } + + info = (task_extmod_info_t)task_info_out; + + p = get_bsdtask_info(task); + if (p) { + proc_getexecutableuuid(p, info->task_uuid, sizeof(info->task_uuid)); + } else { + bzero(info->task_uuid, sizeof(info->task_uuid)); + } + info->extmod_statistics = task->extmod_statistics; + *task_info_count = TASK_EXTMOD_INFO_COUNT; + + break; + } + + case TASK_KERNELMEMORY_INFO: + { + task_kernelmemory_info_t tkm_info; + thread_t thread; + + if (*task_info_count < TASK_KERNELMEMORY_INFO_COUNT) { + error = KERN_INVALID_ARGUMENT; + break; + } + + tkm_info = (task_kernelmemory_info_t) task_info_out; + + if (task == kernel_task) { + /* + * All shared allocs/frees from other tasks count against + * the kernel private memory usage. If we are looking up + * info for the kernel task, gather from everywhere. 
+ */ + task_unlock(task); + + /* start by accounting for all the terminated tasks against the kernel */ + tkm_info->total_palloc = tasks_tkm_private.alloc + tasks_tkm_shared.alloc; + tkm_info->total_pfree = tasks_tkm_private.free + tasks_tkm_shared.free; + tkm_info->total_salloc = 0; + tkm_info->total_sfree = 0; + + /* count all other task/thread shared alloc/free against the kernel */ + lck_mtx_lock(&tasks_threads_lock); + queue_iterate(&tasks, task, task_t, tasks) { + if (task == kernel_task) { + tkm_info->total_palloc += task->tkm_private.alloc; + tkm_info->total_pfree += task->tkm_private.free; + } + tkm_info->total_palloc += task->tkm_shared.alloc; + tkm_info->total_pfree += task->tkm_shared.free; + } + queue_iterate(&threads, thread, thread_t, threads) { + if (thread->task == kernel_task) { + tkm_info->total_palloc += thread->tkm_private.alloc; + tkm_info->total_pfree += thread->tkm_private.free; + } + tkm_info->total_palloc += thread->tkm_shared.alloc; + tkm_info->total_pfree += thread->tkm_shared.free; + } + lck_mtx_unlock(&tasks_threads_lock); + } else { + /* account for all the terminated threads in the process */ + tkm_info->total_palloc = task->tkm_private.alloc; + tkm_info->total_pfree = task->tkm_private.free; + tkm_info->total_salloc = task->tkm_shared.alloc; + tkm_info->total_sfree = task->tkm_shared.free; + + /* then add in all the running threads */ + queue_iterate(&task->threads, thread, thread_t, task_threads) { + tkm_info->total_palloc += thread->tkm_private.alloc; + tkm_info->total_pfree += thread->tkm_private.free; + tkm_info->total_salloc += thread->tkm_shared.alloc; + tkm_info->total_sfree += thread->tkm_shared.free; + } + task_unlock(task); + } + + *task_info_count = TASK_KERNELMEMORY_INFO_COUNT; + return KERN_SUCCESS; + } + /* OBSOLETE */ case TASK_SCHED_FIFO_INFO: { @@ -1460,12 +1634,15 @@ task_info( } error = KERN_INVALID_POLICY; + break; } /* OBSOLETE */ case TASK_SCHED_RR_INFO: { register policy_rr_base_t rr_base; + uint32_t 
quantum_time; + uint64_t quantum_ns; if (*task_info_count < POLICY_RR_BASE_COUNT) { error = KERN_INVALID_ARGUMENT; @@ -1481,7 +1658,10 @@ task_info( rr_base->base_priority = task->priority; - rr_base->quantum = std_quantum_us / 1000; + quantum_time = SCHED(initial_quantum_size)(THREAD_NULL); + absolutetime_to_nanoseconds(quantum_time, &quantum_ns); + + rr_base->quantum = (uint32_t)(quantum_ns / 1000 / 1000); *task_info_count = POLICY_RR_BASE_COUNT; break; @@ -1546,6 +1726,7 @@ task_info( case TASK_SCHED_INFO: error = KERN_INVALID_ARGUMENT; + break; case TASK_EVENTS_INFO: { @@ -1571,7 +1752,9 @@ task_info( events_info->csw = task->c_switch; queue_iterate(&task->threads, thread, thread_t, task_threads) { - events_info->csw += thread->c_switch; + events_info->csw += thread->c_switch; + events_info->syscalls_mach += thread->syscalls_mach; + events_info->syscalls_unix += thread->syscalls_unix; } @@ -1586,8 +1769,8 @@ task_info( } error = task_affinity_info(task, task_info_out, task_info_count); + break; } - default: error = KERN_INVALID_ARGUMENT; } @@ -1942,6 +2125,24 @@ task_reference( task_reference_internal(task); } +/* + * This routine is called always with task lock held. + * And it returns a thread handle without reference as the caller + * operates on it under the task lock held. 
+ */ +thread_t +task_findtid(task_t task, uint64_t tid) +{ + thread_t thread= THREAD_NULL; + + queue_iterate(&task->threads, thread, thread_t, task_threads) { + if (thread->thread_id == tid) + return(thread); /* match: hand back the unreferenced thread, caller holds the task lock */ + } + return(THREAD_NULL); /* no match: never return the loop cursor, which queue_iterate leaves at the queue head */ +} + + #if CONFIG_MACF_MACH /* * Protect 2 task labels against modification by adding a reference on diff --git a/osfmk/kern/task.h b/osfmk/kern/task.h index 0e7ea86e2..af0482aca 100644 --- a/osfmk/kern/task.h +++ b/osfmk/kern/task.h @@ -104,15 +104,117 @@ #include #include #include +#include #include #include #include #include #include -#include #include #include +#endif /* MACH_KERNEL_PRIVATE */ + +#ifdef XNU_KERNEL_PRIVATE + +/* defns for task->rsu_controldata */ +#define TASK_POLICY_CPU_RESOURCE_USAGE 0 +#define TASK_POLICY_WIREDMEM_RESOURCE_USAGE 1 +#define TASK_POLICY_VIRTUALMEM_RESOURCE_USAGE 2 +#define TASK_POLICY_DISK_RESOURCE_USAGE 3 +#define TASK_POLICY_NETWORK_RESOURCE_USAGE 4 +#define TASK_POLICY_POWER_RESOURCE_USAGE 5 + +#define TASK_POLICY_RESOURCE_USAGE_COUNT 6 + +/* + * Process Action and Policy bit definitions + +The bit defns of the policy states +64 60 56 52 48 44 40 36 32 28 24 20 16 12 8 0 +|----|-----|----|----|----|----|----|----|----|----|----|----|----|----|--------| +|RFU | RFU | PWR| NET| DSK| CPU| VM | WM | LVM| RFU| CPU| NET| GPU| DSK| BGRND | +|----|-----|----|----|----|----|----|----|----|----|----|----|----|----|--------| +|<----------- RESOURCE USAGE -------->|< LOWSRC>|<-HARDWARE ACCESS->|BackGrnd| +|----|-----|----|----|----|----|----|----|----|----|----|----|----|----|--------| + +* +*/ + +#define TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE 0x00 +#define TASK_POLICY_BACKGROUND_ATTRIBUTE_LOWPRI 0x01 +#define TASK_POLICY_BACKGROUND_ATTRIBUTE_DISKTHROTTLE 0x02 +#define TASK_POLICY_BACKGROUND_ATTRIBUTE_NETTHROTTLE 0x04 +#define TASK_POLICY_BACKGROUND_ATTRIBUTE_NOGPU 0x08 +#if CONFIG_EMBEDDED +#define TASK_POLICY_BACKGROUND_ATTRIBUTE_ALL 0x0F +#else /* CONFIG_EMBEDDED */ +#define TASK_POLICY_BACKGROUND_ATTRIBUTE_ALL 0x07 
+#endif /* CONFIG_EMBEDDED */ +#define TASK_POLICY_BACKGROUND_ATTRIBUTE_DEFAULT TASK_POLICY_BACKGROUND_ATTRIBUTE_ALL + +/* Hardware disk access attributes, bit different as it should reflect IOPOL_XXX */ +#define TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NONE 0x00 +#define TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL 0x01 +#define TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_PASSIVE 0x02 +#define TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE 0x03 +#define TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_DEFAULT TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL + +/* Hardware disk access attributes */ +#define TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NONE 0x00 +#define TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NORMAL 0x00 +#define TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_FULLACCESS 0x00 +#define TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS 0x01 +#define TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_DEFAULT 0x00 + +/* Hardware Network access attributes */ +#define TASK_POLICY_HWACCESS_NET_ATTRIBUTE_NONE 0x00 +#define TASK_POLICY_HWACCESS_NET_ATTRIBUTE_NORMAL 0x00 +#define TASK_POLICY_HWACCESS_NET_ATTRIBUTE_THROTTLE 0x01 +#define TASK_POLICY_HWACCESS_NET_ATTRIBUTE_DEFAULT 0x00 + +/* Hardware CPU access attributes */ +#define TASK_POLICY_HWACCESS_CPU_ATTRIBUTE_NONE 0x00 +#define TASK_POLICY_HWACCESS_CPU_ATTRIBUTE_NORMAL 0x00 +#define TASK_POLICY_HWACCESS_CPU_ATTRIBUTE_ALL 0x00 +#define TASK_POLICY_HWACCESS_CPU_ATTRIBUTE_ONE 0x01 +#define TASK_POLICY_HWACCESS_CPU_ATTRIBUTE_LLCACHE 0x02 +#define TASK_POLICY_HWACCESS_CPU_ATTRIBUTE_DEFAULT 0x00 + +/* Resource usage/low resource attributes */ +#define TASK_POLICY_RESOURCE_ATTRIBUTE_NONE 0x00 +#define TASK_POLICY_RESOURCE_ATTRIBUTE_THROTTLE 0x01 +#define TASK_POLICY_RESOURCE_ATTRIBUTE_SUSPEND 0x02 +#define TASK_POLICY_RESOURCE_ATTRIBUTE_TERMINATE 0x03 +#define TASK_POLICY_RESOURCE_ATTRIBUTE_NOTIFY 0x04 +#define TASK_POLICY_RESOURCE_ATTRIBUTE_DEFAULT 0x00 + +#endif /* XNU_KERNEL_PRIVATE */ + +#ifdef MACH_KERNEL_PRIVATE + +typedef struct process_policy { + uint64_t apptype:4, + rfu1:4, + 
ru_power:4, /* Resource Usage Power */ + ru_net:4, /* Resource Usage Network */ + ru_disk:4, /* Resource Usage Disk */ + ru_cpu:4, /* Resource Usage CPU */ + ru_virtmem:4, /* Resource Usage VM */ + ru_wiredmem:4,/* Resource Usage Wired Memory */ + low_vm:4, /* Low Virtual Memory */ + rfu2:4, + hw_cpu:4, /* HW Access to CPU */ + hw_net:4, /* HW Access to Network */ + hw_gpu:4, /* HW Access to GPU */ + hw_disk:4, /* HW Access to Disk */ + hw_bg:8; /* Darwin Background Policy */ +} process_policy_t; + +#include + +extern process_policy_t default_task_proc_policy; /* init value for the process policy attributes */ +extern process_policy_t default_task_null_policy; /* none as the value for the process policy attributes */ struct task { /* Synchronization/destruction information */ @@ -193,9 +295,14 @@ struct task { integer_t messages_received; /* messages received counter */ integer_t syscalls_mach; /* mach system call counter */ integer_t syscalls_unix; /* unix system call counter */ - uint32_t c_switch; /* total context switches */ - uint32_t p_switch; /* total processor switches */ - uint32_t ps_switch; /* total pset switches */ + uint32_t c_switch; /* total context switches */ + uint32_t p_switch; /* total processor switches */ + uint32_t ps_switch; /* total pset switches */ + + zinfo_usage_store_t tkm_private;/* private kmem alloc/free stats (reaped threads) */ + zinfo_usage_store_t tkm_shared; /* shared kmem alloc/free stats (reaped threads) */ + zinfo_usage_t tkm_zinfo; /* per-task, per-zone usage statistics */ + #ifdef MACH_BSD void *bsd_info; #endif @@ -221,6 +328,14 @@ struct task { uint32_t t_chud; /* CHUD flags, used for Shark */ #endif + process_policy_t ext_actionstate; /* externally applied actions */ + process_policy_t ext_policystate; /* externally defined process policy states*/ + process_policy_t actionstate; /* self applied acions */ + process_policy_t policystate; /* process wide policy states */ + + uint64_t 
rsu_controldata[TASK_POLICY_RESOURCE_USAGE_COUNT]; + + vm_extmod_statistics_data_t extmod_statistics; }; #define task_lock(task) lck_mtx_lock(&(task)->lock) @@ -293,6 +408,24 @@ extern kern_return_t task_hold( extern kern_return_t task_release( task_t task); +#if CONFIG_FREEZE + +/* Freeze a task's resident pages */ +extern kern_return_t task_freeze( + task_t task, + uint32_t *purgeable_count, + uint32_t *wired_count, + uint32_t *clean_count, + uint32_t *dirty_count, + boolean_t *shared, + boolean_t walk_only); + +/* Thaw a currently frozen task */ +extern kern_return_t task_thaw( + task_t task); + +#endif /* CONFIG_FREEZE */ + /* Halt all other threads in the current task */ extern kern_return_t task_start_halt( task_t task); @@ -352,7 +485,7 @@ extern int get_task_numactivethreads(task_t task); /* JMM - should just be temporary (implementation in bsd_kern still) */ extern void set_bsdtask_info(task_t,void *); extern vm_map_t get_task_map_reference(task_t); -extern vm_map_t swap_task_map(task_t, thread_t, vm_map_t); +extern vm_map_t swap_task_map(task_t, thread_t, vm_map_t, boolean_t); extern pmap_t get_task_pmap(task_t); extern uint64_t get_task_resident_size(task_t); @@ -373,6 +506,74 @@ extern kern_return_t machine_task_set_state( mach_msg_type_number_t state_count); +int proc_get_task_bg_policy(task_t task); +int proc_get_thread_bg_policy(task_t task, uint64_t tid); +int proc_get_self_isbackground(void); +int proc_get_selfthread_isbackground(void); + +int proc_get_darwinbgstate(task_t, uint32_t *); +int proc_set_bgtaskpolicy(task_t task, int intval); +int proc_set1_bgtaskpolicy(task_t task, int intval); +int proc_set_bgthreadpolicy(task_t task, uint64_t tid, int val); +int proc_set1_bgthreadpolicy(task_t task, uint64_t tid, int val); + +int proc_add_bgtaskpolicy(task_t task, int val); +int proc_add_bgthreadpolicy(task_t task, uint64_t tid, int val); +int proc_remove_bgtaskpolicy(task_t task, int policy); +int proc_remove_bgthreadpolicy(task_t task, uint64_t 
tid, int val); + +int proc_apply_bgtaskpolicy(task_t task); +int proc_apply_bgtaskpolicy_external(task_t task); +int proc_apply_bgtaskpolicy_internal(task_t task); +int proc_apply_bgthreadpolicy(task_t task, uint64_t tid); +int proc_apply_bgtask_selfpolicy(void); +int proc_apply_bgthread_selfpolicy(void); +int proc_apply_workq_bgthreadpolicy(thread_t); + +int proc_restore_bgtaskpolicy(task_t task); +int proc_restore_bgthreadpolicy(task_t task, uint64_t tid); +int proc_restore_bgthread_selfpolicy(void); +int proc_restore_workq_bgthreadpolicy(thread_t); + +/* hw access routines */ +int proc_apply_task_diskacc(task_t task, int policy); +int proc_apply_thread_diskacc(task_t task, uint64_t tid, int policy); +int proc_apply_thread_selfdiskacc(int policy); +int proc_get_task_disacc(task_t task); +int proc_get_task_selfdiskacc(void); +int proc_get_thread_selfdiskacc(void); +int proc_denyinherit_policy(task_t task); +int proc_denyselfset_policy(task_t task); + +int proc_get_task_selfgpuacc_deny(void); +int proc_apply_task_gpuacc(task_t task, int prio); + +int proc_get_task_ruse_cpu(task_t task, uint32_t * policyp, uint32_t * percentagep, uint64_t * intervalp, uint64_t * deadlinep); +int proc_set_task_ruse_cpu(task_t task, uint32_t policy, uint32_t percentage, uint64_t interval, uint64_t deadline); +thread_t task_findtid(task_t, uint64_t); + +#define PROC_POLICY_OSX_APPTYPE_NONE 0 +#define PROC_POLICY_OSX_APPTYPE_TAL 1 +#define PROC_POLICY_OSX_APPTYPE_WIDGET 2 +#define PROC_POLICY_OSX_APPTYPE_DBCLIENT 2 /* Not a bug, just rename of widget */ +#define PROC_POLICY_IOS_APPTYPE 3 +#define PROC_POLICY_IOS_NONUITYPE 4 + +void proc_set_task_apptype(task_t, int); +int proc_disable_task_apptype(task_t task, int policy_subtype); +int proc_enable_task_apptype(task_t task, int policy_subtype); + +/* resource handle callback */ +int task_action_cpuusage(task_t); + +/* BSD call back functions */ +extern int proc_apply_resource_actions(void * p, int type, int action); +extern int 
proc_restore_resource_actions(void * p, int type, int action); +extern int task_restore_resource_actions(task_t task, int type); + +extern void proc_apply_task_networkbg(void * bsd_info); +extern void proc_restore_task_networkbg(void * bsd_info); +extern void proc_set_task_networkbg(void * bsd_info, int setbg); #endif /* XNU_KERNEL_PRIVATE */ #ifdef KERNEL_PRIVATE diff --git a/osfmk/kern/task_policy.c b/osfmk/kern/task_policy.c index d3395ddb4..e8f9bc628 100644 --- a/osfmk/kern/task_policy.c +++ b/osfmk/kern/task_policy.c @@ -31,6 +31,53 @@ #include #include +#include +#include +#include +#include + +static int proc_apply_bgtaskpolicy_locked(task_t task, int, int); +static int proc_restore_bgtaskpolicy_locked(task_t, int, int, int); +static int task_get_cpuusage(task_t task, uint32_t * percentagep, uint64_t * intervalp, uint64_t * deadlinep); +static int task_set_cpuusage(task_t task, uint32_t percentage, uint64_t interval, uint64_t deadline); +static int task_apply_resource_actions(task_t task, int type); +static int proc_apply_bgthreadpolicy_locked(thread_t thread, int selfset); +static void restore_bgthreadpolicy_locked(thread_t thread, int selfset); + +process_policy_t default_task_proc_policy = {0, + 0, + TASK_POLICY_RESOURCE_ATTRIBUTE_NONE, + TASK_POLICY_RESOURCE_ATTRIBUTE_NONE, + TASK_POLICY_RESOURCE_ATTRIBUTE_NONE, + TASK_POLICY_RESOURCE_ATTRIBUTE_NONE, + TASK_POLICY_RESOURCE_ATTRIBUTE_NONE, + TASK_POLICY_RESOURCE_ATTRIBUTE_NONE, + TASK_POLICY_RESOURCE_ATTRIBUTE_NONE, + 0, + TASK_POLICY_HWACCESS_CPU_ATTRIBUTE_ALL, + TASK_POLICY_HWACCESS_NET_ATTRIBUTE_NORMAL, + TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_FULLACCESS, + TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL, + TASK_POLICY_BACKGROUND_ATTRIBUTE_ALL + }; + +process_policy_t default_task_null_policy = {0, + 0, + TASK_POLICY_RESOURCE_ATTRIBUTE_NONE, + TASK_POLICY_RESOURCE_ATTRIBUTE_NONE, + TASK_POLICY_RESOURCE_ATTRIBUTE_NONE, + TASK_POLICY_RESOURCE_ATTRIBUTE_NONE, + TASK_POLICY_RESOURCE_ATTRIBUTE_NONE, + 
TASK_POLICY_RESOURCE_ATTRIBUTE_NONE, + TASK_POLICY_RESOURCE_ATTRIBUTE_NONE, + 0, + TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NONE, + TASK_POLICY_HWACCESS_NET_ATTRIBUTE_NONE, + TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NONE, + TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL, + TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE + }; + static void task_priority( @@ -46,6 +93,8 @@ task_policy_set( mach_msg_type_number_t count) { kern_return_t result = KERN_SUCCESS; + void * bsdinfo = NULL; + int setbg = 0; if (task == TASK_NULL || task == kernel_task) return (KERN_INVALID_ARGUMENT); @@ -54,72 +103,151 @@ task_policy_set( case TASK_CATEGORY_POLICY: { - task_category_policy_t info = (task_category_policy_t)policy_info; + task_category_policy_t info = (task_category_policy_t)policy_info; if (count < TASK_CATEGORY_POLICY_COUNT) return (KERN_INVALID_ARGUMENT); +#if CONFIG_EMBEDDED + if ((current_task() == task) && (info != NULL) && + (info->role != TASK_THROTTLE_APPLICATION)) + return (KERN_INVALID_ARGUMENT); +#endif + task_lock(task); + if ( info->role == TASK_FOREGROUND_APPLICATION || + info->role == TASK_BACKGROUND_APPLICATION) { +#if !CONFIG_EMBEDDED + if (task->ext_actionstate.apptype != PROC_POLICY_OSX_APPTYPE_NONE) { + switch (info->role) { + case TASK_FOREGROUND_APPLICATION: + switch (task->ext_actionstate.apptype) { + case PROC_POLICY_OSX_APPTYPE_TAL: + /* Move the app to foreground with no DarwinBG */ + proc_restore_bgtaskpolicy_locked(task, 1, 1, BASEPRI_FOREGROUND); + bsdinfo = task->bsd_info; + setbg = 0; + break; + + case PROC_POLICY_OSX_APPTYPE_DBCLIENT: + /* reset the apptype so enforcement on background/foregound */ + task->ext_actionstate.apptype = PROC_POLICY_OSX_APPTYPE_NONE; + /* Internal application and make it foreground pri */ + proc_restore_bgtaskpolicy_locked(task, 1, 0, BASEPRI_FOREGROUND); + bsdinfo = task->bsd_info; + setbg = 0; + break; + + default: + /* the app types cannot be in CONTROL, GRAPHICS STATE, so it will de default state here */ + task_priority(task, + 
((info->role == TASK_FOREGROUND_APPLICATION)? + BASEPRI_FOREGROUND: BASEPRI_BACKGROUND), + task->max_priority); + break; + } + task->role = TASK_FOREGROUND_APPLICATION; + break; + + case TASK_BACKGROUND_APPLICATION: + switch (task->ext_actionstate.apptype) { + case PROC_POLICY_OSX_APPTYPE_TAL: + /* TAL apps will get Darwin backgrounded if not already set */ + if (task->ext_actionstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) { + /* external application of Darwin BG */ + proc_apply_bgtaskpolicy_locked(task, 1, 1); + bsdinfo = task->bsd_info; + setbg = 1; + } + break; + + default: + task_priority(task, + ((info->role == TASK_FOREGROUND_APPLICATION)? + BASEPRI_FOREGROUND: BASEPRI_BACKGROUND), + task->max_priority); + break; + } + task->role = TASK_BACKGROUND_APPLICATION; + break; - if ( info->role == TASK_FOREGROUND_APPLICATION || - info->role == TASK_BACKGROUND_APPLICATION ) { + default: + /* do nothing */ + break; + + } /* switch info->role */ + } else { /* apptype != PROC_POLICY_OSX_APPTYPE_NONE */ +#endif /* !CONFIG_EMBEDDED */ switch (task->role) { case TASK_FOREGROUND_APPLICATION: case TASK_BACKGROUND_APPLICATION: case TASK_UNSPECIFIED: - task_priority(task, - ((info->role == TASK_FOREGROUND_APPLICATION)? - BASEPRI_FOREGROUND: BASEPRI_BACKGROUND), - task->max_priority); + /* if there are no process wide backgrounding ... */ + if ((task->ext_actionstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) && + (task->actionstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE)) { + task_priority(task, + ((info->role == TASK_FOREGROUND_APPLICATION)? 
+ BASEPRI_FOREGROUND: BASEPRI_BACKGROUND), + task->max_priority); + } task->role = info->role; break; case TASK_CONTROL_APPLICATION: case TASK_RENICED: - /* fail silently */ + /* else fail silently */ break; default: result = KERN_INVALID_ARGUMENT; break; } - } - else - if (info->role == TASK_CONTROL_APPLICATION) { - if ( task != current_task() || - task->sec_token.val[0] != 0 ) +#if !CONFIG_EMBEDDED + } /* apptype != PROC_POLICY_OSX_APPTYPE_NONE */ +#endif /* !CONFIG_EMBEDDED */ + + } else if (info->role == TASK_CONTROL_APPLICATION) { + if (task != current_task()|| + task->sec_token.val[0] != 0) result = KERN_INVALID_ARGUMENT; else { task_priority(task, BASEPRI_CONTROL, task->max_priority); task->role = info->role; } - } - else - if (info->role == TASK_GRAPHICS_SERVER) { - if ( task != current_task() || - task->sec_token.val[0] != 0 ) + } else if (info->role == TASK_GRAPHICS_SERVER) { + if (task != current_task() || + task->sec_token.val[0] != 0) result = KERN_INVALID_ARGUMENT; else { task_priority(task, MAXPRI_RESERVED - 3, MAXPRI_RESERVED); task->role = info->role; } - } - else + } else +#if CONFIG_EMBEDDED if (info->role == TASK_THROTTLE_APPLICATION) { task_priority(task, MAXPRI_THROTTLE, MAXPRI_THROTTLE); task->role = info->role; - } - else - if (info->role == TASK_DEFAULT_APPLICATION) { + } else if (info->role == TASK_DEFAULT_APPLICATION || info->role == TASK_NONUI_APPLICATION) + { task_priority(task, BASEPRI_DEFAULT, MAXPRI_USER); task->role = info->role; - } - else + } else +#else /* CONFIG_EMBEDDED */ + if (info->role == TASK_DEFAULT_APPLICATION) + { + task_priority(task, BASEPRI_DEFAULT, MAXPRI_USER); + task->role = info->role; + } else +#endif /* CONFIG_EMBEDDED */ result = KERN_INVALID_ARGUMENT; task_unlock(task); + /* if backgrounding action ... 
*/ + if (bsdinfo != NULL) + proc_set_task_networkbg(bsdinfo, setbg); + break; } @@ -225,3 +353,979 @@ task_policy_get( return (KERN_SUCCESS); } + +/* task Darwin BG enforcement/settings related routines */ +int +proc_get_task_bg_policy(task_t task) +{ + + int selfset = 0; + int val = 0; + + if (current_task() == task) + selfset = 1; + + if (selfset == 0) { + val = task->ext_policystate.hw_bg; + } else { + val = task->policystate.hw_bg; + } + + return(val); +} + + +int +proc_get_thread_bg_policy(task_t task, uint64_t tid) +{ + thread_t self = current_thread(); + thread_t thread = THREAD_NULL; + int val = 0; + + if (tid == self->thread_id) { + val = self->policystate.hw_bg; + } else { + task_lock(task); + thread = task_findtid(task, tid); + if (thread != NULL) + val = thread->ext_policystate.hw_bg; + task_unlock(task); + } + + return(val); +} + +int +proc_get_self_isbackground(void) +{ + task_t task = current_task();; + thread_t thread = current_thread(); + + if ((task->ext_actionstate.hw_bg != TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) || + (task->actionstate.hw_bg != TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) || + (thread->ext_actionstate.hw_bg != TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) || + (thread->actionstate.hw_bg != TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE)) + return(1); + else + return(0); + +} + +int proc_get_selfthread_isbackground(void) +{ + thread_t thread = current_thread(); + + if ((thread->ext_actionstate.hw_bg != TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) || + (thread->actionstate.hw_bg != TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE)) + return(1); + else + return(0); +} + + +int +proc_set_bgtaskpolicy(task_t task, int intval) +{ + + int selfset = 0; + + if (current_task() == task) + selfset = 1; + + task_lock(task); + + if (selfset == 0) { + /* allready set? 
*/ + if (task->ext_policystate.hw_bg != intval) + task->ext_policystate.hw_bg = intval; + } else { + if (task->policystate.hw_bg != intval) + task->policystate.hw_bg = intval; + } + + task_unlock(task); + return(0); +} + +/* set and apply as well */ +int proc_set1_bgtaskpolicy(task_t task, int prio) +{ + int error = 0; + + if (prio == PRIO_DARWIN_BG) { + error = proc_set_bgtaskpolicy(task, TASK_POLICY_BACKGROUND_ATTRIBUTE_ALL); + if (error == 0) + error = proc_apply_bgtaskpolicy(task); + } else { + error = proc_restore_bgtaskpolicy(task); + } + + return(error); +} + + +int +proc_set_bgthreadpolicy(task_t task, uint64_t tid, int prio) +{ + thread_t self = current_thread(); + thread_t thread = THREAD_NULL; + int reset; + + if (prio == 0) + reset = 1; + task_lock(task); + if (tid == self->thread_id) { + self->policystate.hw_bg = prio; + } else { + thread = task_findtid(task, tid); + if (thread != NULL) + thread->ext_policystate.hw_bg = prio; + } + + task_unlock(task); + + return(0); +} + +int +proc_set1_bgthreadpolicy(task_t task, uint64_t tid, int prio) +{ + int error = 0; + + if (prio == PRIO_DARWIN_BG) { + error = proc_set_bgthreadpolicy(task, tid, TASK_POLICY_BACKGROUND_ATTRIBUTE_ALL); + if (error == 0) + error = proc_apply_bgthreadpolicy(task, tid); + } else { + error = proc_restore_bgthreadpolicy(task, tid); + } + + return(error); +} + +int +proc_add_bgtaskpolicy(task_t task, int val) +{ + int selfset = 0; + + if (current_task() == task) + selfset = 1; + + task_lock(task); + + if (selfset == 0) { + task->policystate.hw_bg |= val; + } else { + task->ext_policystate.hw_bg |= val; + } + + task_unlock(task); + return(0); +} + +int +proc_add_bgthreadpolicy(task_t task, uint64_t tid, int val) +{ + thread_t self = current_thread(); + thread_t thread = THREAD_NULL; + int reset; + + if (val == 0) + reset = 1; + task_lock(task); + if (tid == self->thread_id) { + self->policystate.hw_bg |= val; + } else { + thread = task_findtid(task, tid); + if (thread != NULL) + 
thread->ext_policystate.hw_bg |= val; + } + + task_unlock(task); + + return(val); +} + +int +proc_remove_bgtaskpolicy(task_t task, int intval) +{ + int selfset = 0; + + if (current_task() == task) + selfset = 1; + + task_lock(task); + + if (selfset == 0) { + task->policystate.hw_bg &= ~intval; + } else { + task->ext_policystate.hw_bg &= ~intval; + } + + task_unlock(task); + return(0); +} + +int +proc_remove_bgthreadpolicy(task_t task, uint64_t tid, int val) +{ + thread_t self = current_thread(); + thread_t thread = THREAD_NULL; + int reset; + + if (val == 0) + reset = 1; + task_lock(task); + if (tid == self->thread_id) { + self->policystate.hw_bg &= ~val; + } else { + thread = task_findtid(task, tid); + if (thread != NULL) + thread->ext_policystate.hw_bg &= ~val; + } + + task_unlock(task); + + return(val); +} + +int +proc_apply_bgtask_selfpolicy(void) +{ + return(proc_apply_bgtaskpolicy(current_task())); +} + +int +proc_apply_bgtaskpolicy(task_t task) +{ + int external = 1; + + if (task == current_task()) + external = 0; + + return(proc_apply_bgtaskpolicy_locked(task, 0, external)); +} + +int +proc_apply_bgtaskpolicy_external(task_t task) +{ + return(proc_apply_bgtaskpolicy_locked(task, 0, 1)); + +} + +int +proc_apply_bgtaskpolicy_internal(task_t task) +{ + return(proc_apply_bgtaskpolicy_locked(task, 0, 0)); +} + + +static int +proc_apply_bgtaskpolicy_locked(task_t task, int locked, int external) +{ + if (locked == 0) + task_lock(task); + + if (external != 0) { + /* allready set? 
*/ + if (task->ext_actionstate.hw_bg != task->ext_policystate.hw_bg) { + task->ext_actionstate.hw_bg = task->ext_policystate.hw_bg; + task_priority(task, MAXPRI_THROTTLE, MAXPRI_THROTTLE); + /* background state applied */ + } + } else { + if (task->actionstate.hw_bg != task->policystate.hw_bg) { + task->actionstate.hw_bg = task->policystate.hw_bg; + task_priority(task, MAXPRI_THROTTLE, MAXPRI_THROTTLE); + } + } + if (locked == 0) + task_unlock(task); + return(0); +} + +/* apply the self backgrounding even if the thread is not current thread/task(timer threads) */ +int +proc_apply_workq_bgthreadpolicy(thread_t thread) +{ + int error; + task_t wqtask = TASK_NULL; + + if (thread != THREAD_NULL) { + wqtask = thread->task; + task_lock(wqtask); + /* apply the background as selfset internal one */ + error = proc_apply_bgthreadpolicy_locked(thread, 1); + task_unlock(wqtask); + } else + error = ESRCH; + + return(error); +} + +int +proc_apply_bgthreadpolicy(task_t task, uint64_t tid) +{ + thread_t self = current_thread(); + thread_t thread = THREAD_NULL; + int selfset = 0, error = 0; + task_t localtask = TASK_NULL; + + if (tid == self->thread_id) { + selfset = 1; + localtask = current_task(); + } else { + localtask = task; + } + + task_lock(localtask); + if (selfset != 0) { + thread = self; + } else { + thread = task_findtid(task, tid); + } + + error = proc_apply_bgthreadpolicy_locked(thread, selfset); + task_unlock(localtask); + + return(error); +} + +static int +proc_apply_bgthreadpolicy_locked(thread_t thread, int selfset) +{ + int set = 0; + thread_precedence_policy_data_t policy; + + if (thread != NULL) { + if (selfset != 0) { + /* internal application */ + if (thread->actionstate.hw_bg != thread->policystate.hw_bg) { + thread->actionstate.hw_bg = thread->policystate.hw_bg; + if (thread->ext_actionstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) + set = 1; + + } + } else { + /* external application */ + if (thread->ext_actionstate.hw_bg != 
thread->ext_policystate.hw_bg) { + thread->ext_actionstate.hw_bg = thread->ext_policystate.hw_bg; + if (thread->actionstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) + set = 1; + } + } + + if (set != 0) { + /* set thread priority (we did not save previous value) */ + policy.importance = INT_MIN; + + thread_policy_set_internal(thread, THREAD_PRECEDENCE_POLICY, + (thread_policy_t)&policy, + THREAD_PRECEDENCE_POLICY_COUNT ); + + } + } else + return(ESRCH); + + return(0); +} + +int +proc_apply_bgthread_selfpolicy(void) +{ + return(proc_apply_bgthreadpolicy(current_task(), current_thread()->thread_id)); +} + + +int +proc_restore_bgtaskpolicy(task_t task) +{ + int external = 1; + + if (current_task() == task) + external = 0; + return(proc_restore_bgtaskpolicy_locked(task, 0, external, BASEPRI_DEFAULT)); +} + +static int +proc_restore_bgtaskpolicy_locked(task_t task, int locked, int external, int pri) +{ + if (locked == 0) + task_lock(task); + + if (external != 0) { + task->ext_actionstate.hw_bg = TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE; + /* self BG in flight? */ + if (task->actionstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) { + task_priority(task, pri, MAXPRI_USER); +#if CONFIG_EMBEDDED + /* non embedded users need role for policy reapplication */ + task->role = TASK_DEFAULT_APPLICATION; +#endif /* CONFIG_EMBEDDED */ + } + } else { + task->actionstate.hw_bg = TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE; + /* external BG in flight? 
*/ + if (task->ext_actionstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) { + task_priority(task, pri, MAXPRI_USER); +#if CONFIG_EMBEDDED + /* non embedded users need role for policy reapplication */ + task->role = TASK_DEFAULT_APPLICATION; +#endif /* CONFIG_EMBEDDED */ + } + } + + if (locked == 0) + task_unlock(task); + + return(0); +} + +/* restore the self backgrounding even if the thread is not current thread */ +int +proc_restore_workq_bgthreadpolicy(thread_t thread) +{ + int error = 0; + task_t wqtask = TASK_NULL; + + if (thread != THREAD_NULL) { + wqtask = thread->task; + task_lock(wqtask); + /* remove the background and restore default importance as self(internal) removal */ + restore_bgthreadpolicy_locked(thread, 1); + task_unlock(wqtask); + } else + error = ESRCH; + + return(error); +} + +int proc_restore_bgthread_selfpolicy(void) +{ + return(proc_restore_bgthreadpolicy(current_task(), thread_tid(current_thread()))); + +} + + +int +proc_restore_bgthreadpolicy(task_t task, uint64_t tid) +{ + int selfset = 0; + thread_t self = current_thread(); + thread_t thread = THREAD_NULL; + + task_lock(task); + if (tid == self->thread_id) { + thread = self; + selfset = 1; + } else { + thread = task_findtid(task, tid); + } + + if (thread != NULL) + restore_bgthreadpolicy_locked(thread, selfset); + + task_unlock(task); + + if (thread != NULL) + return(0); + else + return(1); +} + +static void +restore_bgthreadpolicy_locked(thread_t thread, int selfset) +{ + thread_precedence_policy_data_t policy; + int reset = 0; + + if (thread != NULL) { + if (selfset != 0) { + thread->actionstate.hw_bg = TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE; + /* external BG in flight? */ + if (thread->ext_actionstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) + reset = 1; + + } else { + thread->ext_actionstate.hw_bg = TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE; + /* self BG in flight? 
*/ + if (thread->actionstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) + reset = 1; + } + + if (reset != 0) { + /* reset thread priority (we did not save previous value) */ + policy.importance = 0; + thread_policy_set_internal(thread, THREAD_PRECEDENCE_POLICY, + (thread_policy_t)&policy, + THREAD_PRECEDENCE_POLICY_COUNT ); + } + } +} + +void +proc_set_task_apptype(task_t task, int type) +{ + switch (type) { + case PROC_POLICY_OSX_APPTYPE_TAL: + task->ext_policystate.apptype = type; + task->policystate.apptype = type; + proc_apply_bgtaskpolicy_external(task); + /* indicate that BG is set and next foreground needs to reset */ + task->ext_actionstate.apptype = type; + break; + + case PROC_POLICY_OSX_APPTYPE_DBCLIENT: + task->ext_policystate.apptype = type; + task->policystate.apptype = type; + proc_apply_bgtaskpolicy_internal(task); + /* indicate that BG is set and next foreground needs to reset */ + task->ext_actionstate.apptype = type; + break; + + case PROC_POLICY_IOS_APPTYPE: + task->ext_policystate.apptype = type; + task->policystate.apptype = type; + break; + case PROC_POLICY_IOS_NONUITYPE: + task->ext_policystate.apptype = type; + task->policystate.apptype = type; + /* set to deny access to gpu */ + task->ext_actionstate.hw_gpu = TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS; + task->ext_policystate.hw_gpu = TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS; + break; + + default: + break; + } +} + +/* update the darwin backdground action state in the flags field for libproc */ +#define PROC_FLAG_DARWINBG 0x8000 /* process in darwin background */ +#define PROC_FLAG_EXT_DARWINBG 0x10000 /* process in darwin background - external enforcement */ + +int +proc_get_darwinbgstate(task_t task, uint32_t * flagsp) +{ + if (task->ext_actionstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_ALL){ + *flagsp |= PROC_FLAG_EXT_DARWINBG; + } + if (task->actionstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_ALL){ + *flagsp |= PROC_FLAG_DARWINBG; + } + + return(0); +} + +/* + * HW disk 
access realted routines, they need to return + * IOPOL_XXX equivalents for spec_xxx/throttle updates. + */ + +int +proc_get_task_disacc(task_t task) +{ + if ((task->ext_actionstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_DISKTHROTTLE) != 0) + return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE); + if (task->ext_actionstate.hw_disk != TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL) + return(task->ext_actionstate.hw_disk); + if ((task->actionstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_DISKTHROTTLE) != 0) + return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE); + if (task->actionstate.hw_disk != TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL) + return(task->actionstate.hw_disk); + return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL); +} + +int +proc_get_task_selfdiskacc(void) +{ + task_t task = current_task(); + thread_t thread= current_thread(); + + /* + * As per defined iopolicysys behavior, thread trumps task. + * Do we need to follow that for external enforcements of BG or hw access? + * Status quo for now.. 
+ */ + if((thread->ext_actionstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_DISKTHROTTLE) != 0) + return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE); + if (thread->ext_actionstate.hw_disk != TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL) + return(thread->ext_actionstate.hw_disk); + if((thread->actionstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_DISKTHROTTLE) != 0) + return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE); + if (thread->actionstate.hw_disk != TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL) + return(thread->actionstate.hw_disk); + + if ((task->ext_actionstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_DISKTHROTTLE) != 0) + return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE); + if (task->ext_actionstate.hw_disk != TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL) + return(task->ext_actionstate.hw_disk); + if ((task->actionstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_DISKTHROTTLE) != 0) + return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE); + if (task->actionstate.hw_disk != TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL) + return(task->actionstate.hw_disk); + return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL); +} + +int +proc_get_thread_selfdiskacc(void) +{ + thread_t thread = current_thread(); + + if((thread->ext_actionstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_DISKTHROTTLE) != 0) + return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE); + if (thread->ext_actionstate.hw_disk != TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL) + return(thread->ext_actionstate.hw_disk); + if((thread->actionstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_DISKTHROTTLE) != 0) + return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE); + if (thread->actionstate.hw_disk != TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL) + return(thread->actionstate.hw_disk); + return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL); +} + +int proc_apply_task_diskacc(task_t task, int policy) +{ + task_t self = current_task(); + + task_lock(task); + if (task == self) { + task->actionstate.hw_disk = policy; + 
task->policystate.hw_disk = policy; + } else { + task->ext_actionstate.hw_disk = policy; + task->ext_policystate.hw_disk = policy; + } + task_unlock(task); + return(0); +} + +int proc_apply_thread_diskacc(task_t task, uint64_t tid, int policy) +{ + thread_t thread; + + if (tid == TID_NULL) { + thread = current_thread(); + proc_apply_thread_selfdiskacc(policy); + } else { + task_lock(task); + thread = task_findtid(task, tid); + if (thread != NULL) { + thread->ext_actionstate.hw_disk = policy; + thread->ext_policystate.hw_disk = policy; + } + task_unlock(task); + } + if (thread != NULL) + return(0); + else + return(0); +} + +int +proc_apply_thread_selfdiskacc(int policy) +{ + task_t task = current_task(); + thread_t thread = current_thread(); + + task_lock(task); + thread->actionstate.hw_disk = policy; + thread->policystate.hw_disk = policy; + task_unlock(task); + return(0); +} + +int +proc_denyinherit_policy(__unused task_t task) +{ + return(0); +} + +int +proc_denyselfset_policy(__unused task_t task) +{ + return(0); +} + +/* HW GPU access related routines */ +int +proc_get_task_selfgpuacc_deny(void) +{ + task_t task = current_task(); + thread_t thread = current_thread(); + + if (((task->ext_actionstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_NOGPU) != 0) || (task->ext_actionstate.hw_gpu == TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS)) + return(TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS); + if (((task->actionstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_NOGPU) != 0) || (task->actionstate.hw_gpu == TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS)) + return(TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS); + if (((thread->ext_actionstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_NOGPU) != 0) || (thread->ext_actionstate.hw_gpu == TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS)) + return(TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS); + if (((thread->actionstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_NOGPU) != 0) || (thread->actionstate.hw_gpu == 
TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS)) + return(TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS); + + return(TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NORMAL); +} + +int +proc_apply_task_gpuacc(task_t task, int policy) +{ + + task_t self = current_task(); + + task_lock(task); + if (task == self) { + task->actionstate.hw_gpu = policy; + task->policystate.hw_gpu = policy; + } else { + task->ext_actionstate.hw_gpu = policy; + task->ext_policystate.hw_gpu = policy; + } + task_unlock(task); + + return(0); +} + +/* Resource usage , CPU realted routines */ +int +proc_get_task_ruse_cpu(task_t task, uint32_t * policyp, uint32_t * percentagep, uint64_t * intervalp, uint64_t * deadlinep) +{ + + int error = 0; + + task_lock(task); + if (task != current_task()) { + *policyp = task->ext_policystate.ru_cpu; + } else { + *policyp = task->policystate.ru_cpu; + } + + error = task_get_cpuusage(task, percentagep, intervalp, deadlinep); + + return(error); +} + +int +proc_set_task_ruse_cpu(task_t task, uint32_t policy, uint32_t percentage, uint64_t interval, uint64_t deadline) +{ + int error = 0; + + task_lock(task); + if (task != current_task()) { + task->ext_policystate.ru_cpu = policy; + } else { + task->policystate.ru_cpu = policy; + } + error = task_set_cpuusage(task, percentage, interval, deadline); + task_unlock(task); + return(error); +} + + +/* used to apply resource limit related actions */ +static int +task_apply_resource_actions(task_t task, int type) +{ + int action = TASK_POLICY_RESOURCE_ATTRIBUTE_NONE; + void * bsdinfo = NULL; + + switch (type) { + case TASK_POLICY_CPU_RESOURCE_USAGE: + break; + case TASK_POLICY_WIREDMEM_RESOURCE_USAGE: + case TASK_POLICY_VIRTUALMEM_RESOURCE_USAGE: + case TASK_POLICY_DISK_RESOURCE_USAGE: + case TASK_POLICY_NETWORK_RESOURCE_USAGE: + case TASK_POLICY_POWER_RESOURCE_USAGE: + return(0); + + default: + return(1); + }; + + /* only cpu actions for now */ + task_lock(task); + + if (task->ext_actionstate.ru_cpu == TASK_POLICY_RESOURCE_ATTRIBUTE_NONE) 
{ + /* apply action */ + task->ext_actionstate.ru_cpu = task->ext_policystate.ru_cpu; + action = task->ext_actionstate.ru_cpu; + } + if (action != TASK_POLICY_RESOURCE_ATTRIBUTE_NONE) { + bsdinfo = task->bsd_info; + task_unlock(task); + proc_apply_resource_actions(bsdinfo, TASK_POLICY_CPU_RESOURCE_USAGE, action); + } else + task_unlock(task); + + return(0); +} + +int +task_restore_resource_actions(task_t task, int type) +{ + int action; + void * bsdinfo = NULL; + + switch (type) { + case TASK_POLICY_CPU_RESOURCE_USAGE: + break; + case TASK_POLICY_WIREDMEM_RESOURCE_USAGE: + case TASK_POLICY_VIRTUALMEM_RESOURCE_USAGE: + case TASK_POLICY_DISK_RESOURCE_USAGE: + case TASK_POLICY_NETWORK_RESOURCE_USAGE: + case TASK_POLICY_POWER_RESOURCE_USAGE: + return(0); + + default: + return(1); + }; + + /* only cpu actions for now */ + task_lock(task); + + action = task->ext_actionstate.ru_cpu; + if (task->ext_actionstate.ru_cpu != TASK_POLICY_RESOURCE_ATTRIBUTE_NONE) { + /* reset action */ + task->ext_actionstate.ru_cpu = TASK_POLICY_RESOURCE_ATTRIBUTE_NONE; + } + if (action != TASK_POLICY_RESOURCE_ATTRIBUTE_NONE) { + bsdinfo = task->bsd_info; + task_unlock(task); + proc_restore_resource_actions(bsdinfo, TASK_POLICY_CPU_RESOURCE_USAGE, action); + } else + task_unlock(task); + + return(0); + +} + +/* For ledger hookups */ +static int +task_get_cpuusage(__unused task_t task, uint32_t * percentagep, uint64_t * intervalp, uint64_t * deadlinep) +{ + *percentagep = 0; + *intervalp = 0; + *deadlinep = 0; + + return(0); +} + +static int +task_set_cpuusage(__unused task_t task, __unused uint32_t percentage, __unused uint64_t interval, __unused uint64_t deadline) +{ + return(0); +} + +/* called by ledger unit to enforce action due to resource usage criteria being met */ +int +task_action_cpuusage(task_t task) +{ + return(task_apply_resource_actions(task, TASK_POLICY_CPU_RESOURCE_USAGE)); +} + +int +proc_disable_task_apptype(task_t task, int policy_subtype) +{ + void * bsdinfo = NULL; + int 
setbg = 0; + int ret = 0; + int maxpri = BASEPRI_DEFAULT; + + task_lock(task); + + if (task->ext_policystate.apptype != policy_subtype) { + ret = EINVAL; + goto out; + } + +#if !CONFIG_EMBEDDED + switch (task->role) { + case TASK_FOREGROUND_APPLICATION: + maxpri = BASEPRI_FOREGROUND; + break; + case TASK_BACKGROUND_APPLICATION: + maxpri = BASEPRI_BACKGROUND; + break; + default: + maxpri = BASEPRI_DEFAULT; + } +#endif + + if (task->ext_actionstate.apptype != PROC_POLICY_OSX_APPTYPE_NONE) { + switch (task->ext_actionstate.apptype) { + case PROC_POLICY_OSX_APPTYPE_TAL: + /* disable foreground/background handling */ + task->ext_actionstate.apptype = PROC_POLICY_OSX_APPTYPE_NONE; + /* external BG application removal */ + proc_restore_bgtaskpolicy_locked(task, 1, 1, maxpri); + bsdinfo = task->bsd_info; + setbg = 0; + break; + + case PROC_POLICY_OSX_APPTYPE_DBCLIENT: + /* disable foreground/background handling */ + task->ext_actionstate.apptype = PROC_POLICY_OSX_APPTYPE_NONE; + /* internal BG application removal */ + proc_restore_bgtaskpolicy_locked(task, 1, 0, maxpri); + bsdinfo = task->bsd_info; + setbg = 0; + break; + + default: + ret = EINVAL; + break; + } + } else + ret = EINVAL; + +out: + task_unlock(task); + /* if backgrounding action ... 
*/ + if (bsdinfo != NULL) + proc_set_task_networkbg(bsdinfo, setbg); + + return(ret); +} + +int +proc_enable_task_apptype(task_t task, int policy_subtype) +{ + void * bsdinfo = NULL; + int setbg = 0; + int ret = 0; + + task_lock(task); + + if (task->ext_policystate.apptype != policy_subtype) { + ret = EINVAL; + goto out; + } + + if (task->ext_actionstate.apptype == PROC_POLICY_OSX_APPTYPE_NONE) { + switch (task->ext_policystate.apptype) { + case PROC_POLICY_OSX_APPTYPE_TAL: + /* TAL policy is activated again */ + task->ext_actionstate.apptype = task->ext_policystate.apptype; + if (task->role == TASK_BACKGROUND_APPLICATION) { + if (task->role == TASK_BACKGROUND_APPLICATION) { + proc_apply_bgtaskpolicy_locked(task, 1, 1); + bsdinfo = task->bsd_info; + setbg = 1; + } + } + ret = 0; + break; + default: + ret = EINVAL; + } + } else + ret = EINVAL; + +out: + task_unlock(task); + /* if backgrounding action ... */ + if (bsdinfo != NULL) + proc_set_task_networkbg(bsdinfo, setbg); + + return(ret); +} + diff --git a/osfmk/kern/thread.c b/osfmk/kern/thread.c index adff14820..84f7cf817 100644 --- a/osfmk/kern/thread.c +++ b/osfmk/kern/thread.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -91,11 +91,13 @@ #include #include +#include #include #include #include #include +#include #include #include #include @@ -182,8 +184,9 @@ thread_bootstrap(void) thread_template.parameter = NULL; thread_template.importance = 0; - thread_template.sched_mode = 0; - thread_template.safe_mode = 0; + thread_template.sched_mode = TH_MODE_NONE; + thread_template.sched_flags = 0; + thread_template.saved_mode = TH_MODE_NONE; thread_template.safe_release = 0; thread_template.priority = 0; @@ -198,14 +201,18 @@ thread_bootstrap(void) thread_template.realtime.deadline = UINT64_MAX; thread_template.current_quantum = 0; + thread_template.last_run_time = 0; + thread_template.last_quantum_refill_time = 0; thread_template.computation_metered = 0; thread_template.computation_epoch = 0; +#if defined(CONFIG_SCHED_TRADITIONAL) thread_template.sched_stamp = 0; - thread_template.sched_usage = 0; thread_template.pri_shift = INT8_MAX; + thread_template.sched_usage = 0; thread_template.cpu_usage = thread_template.cpu_delta = 0; +#endif thread_template.c_switch = thread_template.p_switch = thread_template.ps_switch = 0; thread_template.bound_processor = PROCESSOR_NULL; @@ -247,6 +254,18 @@ thread_bootstrap(void) thread_template.affinity_set = NULL; + thread_template.syscalls_unix = 0; + thread_template.syscalls_mach = 0; + + thread_template.tkm_private.alloc = 0; + thread_template.tkm_private.free = 0; + thread_template.tkm_shared.alloc = 0; + thread_template.tkm_shared.free = 0; + thread_template.actionstate = default_task_null_policy; + thread_template.ext_actionstate = default_task_null_policy; + thread_template.policystate = default_task_proc_policy; + thread_template.ext_policystate = default_task_proc_policy; + init_thread = thread_template; machine_set_current_thread(&init_thread); } @@ -259,8 +278,9 @@ thread_init(void) thread_max * sizeof(struct thread), THREAD_CHUNK * sizeof(struct thread), "threads"); + zone_change(thread_zone, 
Z_NOENCRYPT, TRUE); - + lck_grp_attr_setdefault(&thread_lck_grp_attr); lck_grp_init(&thread_lck_grp, "thread", &thread_lck_grp_attr); lck_attr_setdefault(&thread_lck_attr); @@ -288,10 +308,13 @@ void thread_terminate_self(void) { thread_t thread = current_thread(); + task_t task; spl_t s; int threadcnt; + pal_thread_terminate_self(thread); + DTRACE_PROC(lwp__exit); thread_mtx_lock(thread); @@ -309,8 +332,8 @@ thread_terminate_self(void) * Cancel priority depression, wait for concurrent expirations * on other processors. */ - if (thread->sched_mode & TH_MODE_ISDEPRESSED) { - thread->sched_mode &= ~TH_MODE_ISDEPRESSED; + if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) { + thread->sched_flags &= ~TH_SFLAG_DEPRESSED_MASK; if (timer_call_cancel(&thread->depress_timer)) thread->depress_timer_active--; @@ -374,8 +397,7 @@ thread_terminate_self(void) * If there is a reserved stack, release it. */ if (thread->reserved_stack != 0) { - if (thread->reserved_stack != thread->kernel_stack) - stack_free_stack(thread->reserved_stack); + stack_free_reserved(thread); thread->reserved_stack = 0; } @@ -404,6 +426,7 @@ thread_deallocate( if (thread_deallocate_internal(thread) > 0) return; + ipc_thread_terminate(thread); task = thread->task; @@ -417,14 +440,14 @@ thread_deallocate( } #endif /* MACH_BSD */ - task_deallocate(task); - if (thread->kernel_stack != 0) stack_free(thread); lck_mtx_destroy(&thread->mutex, &thread_lck_grp); machine_thread_destroy(thread); + task_deallocate(task); + zfree(thread_zone, thread); } @@ -436,8 +459,11 @@ thread_deallocate( static void thread_terminate_daemon(void) { - thread_t thread; - task_t task; + thread_t self, thread; + task_t task; + + self = current_thread(); + self->options |= TH_OPT_SYSTEM_CRITICAL; (void)splsched(); simple_lock(&thread_terminate_lock); @@ -456,6 +482,14 @@ thread_terminate_daemon(void) task->p_switch += thread->p_switch; task->ps_switch += thread->ps_switch; + task->syscalls_unix += thread->syscalls_unix; + 
task->syscalls_mach += thread->syscalls_mach; + + task->tkm_private.alloc += thread->tkm_private.alloc; + task->tkm_private.free += thread->tkm_private.free; + task->tkm_shared.alloc += thread->tkm_shared.alloc; + task->tkm_shared.free += thread->tkm_shared.free; + queue_remove(&task->threads, thread, thread_t, task_threads); task->thread_count--; @@ -483,6 +517,7 @@ thread_terminate_daemon(void) simple_unlock(&thread_terminate_lock); /* splsched */ + self->options &= ~TH_OPT_SYSTEM_CRITICAL; thread_block((thread_continue_t)thread_terminate_daemon); /*NOTREACHED*/ } @@ -561,7 +596,7 @@ void thread_daemon_init(void) { kern_return_t result; - thread_t thread; + thread_t thread = NULL; simple_lock_init(&thread_terminate_lock, 0); queue_init(&thread_terminate_queue); @@ -712,18 +747,25 @@ thread_create_internal( #endif /* Set the thread's scheduling parameters */ - if (parent_task != kernel_task) - new_thread->sched_mode |= TH_MODE_TIMESHARE; + new_thread->sched_mode = SCHED(initial_thread_sched_mode)(parent_task); + new_thread->sched_flags = 0; new_thread->max_priority = parent_task->max_priority; new_thread->task_priority = parent_task->priority; new_thread->priority = (priority < 0)? 
parent_task->priority: priority; if (new_thread->priority > new_thread->max_priority) new_thread->priority = new_thread->max_priority; +#if CONFIG_EMBEDDED + if (new_thread->priority < MAXPRI_THROTTLE) { + new_thread->priority = MAXPRI_THROTTLE; + } +#endif /* CONFIG_EMBEDDED */ new_thread->importance = new_thread->priority - new_thread->task_priority; +#if defined(CONFIG_SCHED_TRADITIONAL) new_thread->sched_stamp = sched_tick; new_thread->pri_shift = sched_pri_shift; - compute_priority(new_thread, FALSE); +#endif + SCHED(compute_priority)(new_thread, FALSE); new_thread->active = TRUE; @@ -751,10 +793,11 @@ thread_create_internal( return (KERN_SUCCESS); } -kern_return_t -thread_create( +static kern_return_t +thread_create_internal2( task_t task, - thread_t *new_thread) + thread_t *new_thread, + boolean_t from_user) { kern_return_t result; thread_t thread; @@ -771,6 +814,9 @@ thread_create( if (task->suspend_count > 0) thread_hold(thread); + if (from_user) + extmod_statistics_incr_thread_create(task); + task_unlock(task); lck_mtx_unlock(&tasks_threads_lock); @@ -779,13 +825,36 @@ thread_create( return (KERN_SUCCESS); } +/* No prototype, since task_server.h has the _from_user version if KERNEL_SERVER */ kern_return_t -thread_create_running( +thread_create( + task_t task, + thread_t *new_thread); + +kern_return_t +thread_create( + task_t task, + thread_t *new_thread) +{ + return thread_create_internal2(task, new_thread, FALSE); +} + +kern_return_t +thread_create_from_user( + task_t task, + thread_t *new_thread) +{ + return thread_create_internal2(task, new_thread, TRUE); +} + +static kern_return_t +thread_create_running_internal2( register task_t task, int flavor, thread_state_t new_state, mach_msg_type_number_t new_state_count, - thread_t *new_thread) + thread_t *new_thread, + boolean_t from_user) { register kern_return_t result; thread_t thread; @@ -812,6 +881,9 @@ thread_create_running( thread_start_internal(thread); thread_mtx_unlock(thread); + if (from_user) + 
extmod_statistics_incr_thread_create(task); + task_unlock(task); lck_mtx_unlock(&tasks_threads_lock); @@ -820,6 +892,41 @@ thread_create_running( return (result); } +/* Prototype, see justification above */ +kern_return_t +thread_create_running( + register task_t task, + int flavor, + thread_state_t new_state, + mach_msg_type_number_t new_state_count, + thread_t *new_thread); + +kern_return_t +thread_create_running( + register task_t task, + int flavor, + thread_state_t new_state, + mach_msg_type_number_t new_state_count, + thread_t *new_thread) +{ + return thread_create_running_internal2( + task, flavor, new_state, new_state_count, + new_thread, FALSE); +} + +kern_return_t +thread_create_running_from_user( + register task_t task, + int flavor, + thread_state_t new_state, + mach_msg_type_number_t new_state_count, + thread_t *new_thread) +{ + return thread_create_running_internal2( + task, flavor, new_state, new_state_count, + new_thread, TRUE); +} + kern_return_t thread_create_workq( task_t task, @@ -977,8 +1084,8 @@ thread_info_internal( /* * Update lazy-evaluated scheduler info because someone wants it. */ - if (thread->sched_stamp != sched_tick) - update_priority(thread); + if (SCHED(can_update_priority)(thread)) + SCHED(update_priority)(thread); basic_info->sleep_time = 0; @@ -987,14 +1094,19 @@ thread_info_internal( * then for 5/8 ageing. The correction factor [3/5] is * (1/(5/8) - 1). 
*/ - basic_info->cpu_usage = (integer_t)(((uint64_t)thread->cpu_usage - * TH_USAGE_SCALE) / sched_tick_interval); - basic_info->cpu_usage = (basic_info->cpu_usage * 3) / 5; - + basic_info->cpu_usage = 0; +#if defined(CONFIG_SCHED_TRADITIONAL) + if (sched_tick_interval) { + basic_info->cpu_usage = (integer_t)(((uint64_t)thread->cpu_usage + * TH_USAGE_SCALE) / sched_tick_interval); + basic_info->cpu_usage = (basic_info->cpu_usage * 3) / 5; + } +#endif + if (basic_info->cpu_usage > TH_USAGE_SCALE) basic_info->cpu_usage = TH_USAGE_SCALE; - basic_info->policy = ((thread->sched_mode & TH_MODE_TIMESHARE)? + basic_info->policy = ((thread->sched_mode == TH_MODE_TIMESHARE)? POLICY_TIMESHARE: POLICY_RR); flags = 0; @@ -1045,11 +1157,7 @@ thread_info_internal( thread_lock(thread); identifier_info->thread_id = thread->thread_id; -#if defined(__ppc__) || defined(__arm__) identifier_info->thread_handle = thread->machine.cthread_self; -#else - identifier_info->thread_handle = thread->machine.pcb->cthread_self; -#endif if(thread->task->bsd_info) { identifier_info->dispatch_qaddr = identifier_info->thread_handle + get_dispatchqueue_offset_from_proc(thread->task->bsd_info); } else { @@ -1074,14 +1182,14 @@ thread_info_internal( s = splsched(); thread_lock(thread); - if (!(thread->sched_mode & TH_MODE_TIMESHARE)) { + if (thread->sched_mode != TH_MODE_TIMESHARE) { thread_unlock(thread); splx(s); return (KERN_INVALID_POLICY); } - ts_info->depressed = (thread->sched_mode & TH_MODE_ISDEPRESSED) != 0; + ts_info->depressed = (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) != 0; if (ts_info->depressed) { ts_info->base_priority = DEPRESSPRI; ts_info->depress_priority = thread->priority; @@ -1111,7 +1219,9 @@ thread_info_internal( else if (flavor == THREAD_SCHED_RR_INFO) { policy_rr_info_t rr_info; - + uint32_t quantum_time; + uint64_t quantum_ns; + if (*thread_info_count < POLICY_RR_INFO_COUNT) return (KERN_INVALID_ARGUMENT); @@ -1120,14 +1230,14 @@ thread_info_internal( s = splsched(); 
thread_lock(thread); - if (thread->sched_mode & TH_MODE_TIMESHARE) { + if (thread->sched_mode == TH_MODE_TIMESHARE) { thread_unlock(thread); splx(s); return (KERN_INVALID_POLICY); } - rr_info->depressed = (thread->sched_mode & TH_MODE_ISDEPRESSED) != 0; + rr_info->depressed = (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) != 0; if (rr_info->depressed) { rr_info->base_priority = DEPRESSPRI; rr_info->depress_priority = thread->priority; @@ -1137,8 +1247,11 @@ thread_info_internal( rr_info->depress_priority = -1; } + quantum_time = SCHED(initial_quantum_size)(THREAD_NULL); + absolutetime_to_nanoseconds(quantum_time, &quantum_ns); + rr_info->max_priority = thread->max_priority; - rr_info->quantum = std_quantum_us / 1000; + rr_info->quantum = (uint32_t)(quantum_ns / 1000 / 1000); thread_unlock(thread); splx(s); @@ -1416,11 +1529,7 @@ thread_dispatchqaddr( uint64_t thread_handle = 0; if (thread != THREAD_NULL) { -#if defined(__ppc__) || defined(__arm__) thread_handle = thread->machine.cthread_self; -#else - thread_handle = thread->machine.pcb->cthread_self; -#endif if (thread->task->bsd_info) dispatchqueue_addr = thread_handle + get_dispatchqueue_offset_from_proc(thread->task->bsd_info); diff --git a/osfmk/kern/thread.h b/osfmk/kern/thread.h index db2c6e352..916391593 100644 --- a/osfmk/kern/thread.h +++ b/osfmk/kern/thread.h @@ -146,6 +146,7 @@ struct thread { #define TH_OPT_INTMASK 0x03 /* interrupt / abort level */ #define TH_OPT_VMPRIV 0x04 /* may allocate reserved memory */ #define TH_OPT_DTRACE 0x08 /* executing under dtrace_probe */ +#define TH_OPT_SYSTEM_CRITICAL 0x10 /* Thread must always be allowed to run - even under heavy load */ /* Data updated during assert_wait/thread_wakeup */ decl_simple_lock_data(,sched_lock) /* scheduling lock (thread_lock()) */ @@ -183,23 +184,37 @@ struct thread { #define TH_IDLE 0x80 /* idling processor */ /* Scheduling information */ - integer_t sched_mode; /* scheduling mode bits */ -#define TH_MODE_REALTIME 0x0001 /* time 
constraints supplied */ -#define TH_MODE_TIMESHARE 0x0002 /* use timesharing algorithm */ -#define TH_MODE_FAILSAFE 0x0004 /* fail-safe has tripped */ -#define TH_MODE_PROMOTED 0x0008 /* sched pri has been promoted */ -#define TH_MODE_ABORT 0x0010 /* abort interruptible waits */ -#define TH_MODE_ABORTSAFELY 0x0020 /* ... but only those at safe point */ -#define TH_MODE_ISABORTED (TH_MODE_ABORT | TH_MODE_ABORTSAFELY) -#define TH_MODE_DEPRESS 0x0040 /* normal depress yield */ -#define TH_MODE_POLLDEPRESS 0x0080 /* polled depress yield */ -#define TH_MODE_ISDEPRESSED (TH_MODE_DEPRESS | TH_MODE_POLLDEPRESS) + sched_mode_t sched_mode; /* scheduling mode */ + sched_mode_t saved_mode; /* saved mode during forced mode demotion */ + + unsigned int sched_flags; /* current flag bits */ +#define TH_SFLAG_FAIRSHARE_TRIPPED 0x0001 /* fairshare scheduling activated */ +#define TH_SFLAG_FAILSAFE 0x0002 /* fail-safe has tripped */ +#define TH_SFLAG_THROTTLED 0x0004 /* owner task in throttled state */ +#define TH_SFLAG_DEMOTED_MASK (TH_SFLAG_THROTTLED | TH_SFLAG_FAILSAFE | TH_SFLAG_FAIRSHARE_TRIPPED) + +#define TH_SFLAG_PROMOTED 0x0008 /* sched pri has been promoted */ +#define TH_SFLAG_ABORT 0x0010 /* abort interruptible waits */ +#define TH_SFLAG_ABORTSAFELY 0x0020 /* ... 
but only those at safe point */ +#define TH_SFLAG_ABORTED_MASK (TH_SFLAG_ABORT | TH_SFLAG_ABORTSAFELY) +#define TH_SFLAG_DEPRESS 0x0040 /* normal depress yield */ +#define TH_SFLAG_POLLDEPRESS 0x0080 /* polled depress yield */ +#define TH_SFLAG_DEPRESSED_MASK (TH_SFLAG_DEPRESS | TH_SFLAG_POLLDEPRESS) +#define TH_SFLAG_PRI_UPDATE 0x0100 /* Updating priority */ +#define TH_SFLAG_EAGERPREEMPT 0x0200 /* Any preemption of this thread should be treated as if AST_URGENT applied */ + integer_t sched_pri; /* scheduled (current) priority */ integer_t priority; /* base priority */ integer_t max_priority; /* max base priority */ integer_t task_priority; /* copy of task base priority */ +#if defined(CONFIG_SCHED_GRRR) +#if 0 + uint16_t grrr_deficit; /* fixed point (1/1000th quantum) fractional deficit */ +#endif +#endif + integer_t promotions; /* level of promotion */ integer_t pending_promoter_index; void *pending_promoter[2]; @@ -216,30 +231,38 @@ struct thread { uint64_t deadline; } realtime; + uint32_t was_promoted_on_wakeup; uint32_t current_quantum; /* duration of current quantum */ + uint64_t last_run_time; /* time when thread was switched away from */ + uint64_t last_quantum_refill_time; /* time when current_quantum was refilled after expiration */ /* Data used during setrun/dispatch */ timer_data_t system_timer; /* system mode timer */ processor_t bound_processor; /* bound to a processor? 
*/ processor_t last_processor; /* processor last dispatched on */ + processor_t chosen_processor; /* Where we want to run this thread */ /* Fail-safe computation since last unblock or qualifying yield */ uint64_t computation_metered; uint64_t computation_epoch; - integer_t safe_mode; /* saved mode during fail-safe */ - natural_t safe_release; /* when to release fail-safe */ + uint64_t safe_release; /* when to release fail-safe */ /* Call out from scheduler */ void (*sched_call)( int type, thread_t thread); - +#if defined(CONFIG_SCHED_PROTO) + uint32_t runqueue_generation; /* last time runqueue was drained */ +#endif + /* Statistics and timesharing calculations */ +#if defined(CONFIG_SCHED_TRADITIONAL) natural_t sched_stamp; /* last scheduler tick */ natural_t sched_usage; /* timesharing cpu usage [sched] */ natural_t pri_shift; /* usage -> priority from pset */ natural_t cpu_usage; /* instrumented cpu usage [%cpu] */ natural_t cpu_delta; /* accumulated cpu_usage delta */ +#endif uint32_t c_switch; /* total context switches */ uint32_t p_switch; /* total processor switches */ uint32_t ps_switch; /* total pset switches */ @@ -366,7 +389,20 @@ struct thread { clock_sec_t t_page_creation_time; uint32_t t_chud; /* CHUD flags, used for Shark */ + + integer_t mutex_count; /* total count of locks held */ + uint64_t thread_id; /*system wide unique thread-id*/ + + /* Statistics accumulated per-thread and aggregated per-task */ + uint32_t syscalls_unix; + uint32_t syscalls_mach; + zinfo_usage_store_t tkm_private; /* private kernel memory allocs/frees */ + zinfo_usage_store_t tkm_shared; /* shared kernel memory allocs/frees */ + struct process_policy ext_actionstate; /* externally applied actions */ + struct process_policy ext_policystate; /* externally defined process policy states*/ + struct process_policy actionstate; /* self applied acions */ + struct process_policy policystate; /* process wide policy states */ }; #define ith_state saved.receive.state @@ -441,11 +477,15 @@ 
extern void thread_release( extern void stack_alloc( thread_t thread); +extern void stack_handoff( + thread_t from, + thread_t to); + extern void stack_free( thread_t thread); -extern void stack_free_stack( - vm_offset_t stack); +extern void stack_free_reserved( + thread_t thread); extern boolean_t stack_alloc_try( thread_t thread); @@ -454,6 +494,7 @@ extern void stack_collect(void); extern void stack_init(void) __attribute__((section("__TEXT, initcode"))); + extern kern_return_t thread_state_initialize( thread_t thread); @@ -684,6 +725,22 @@ extern kern_return_t thread_setsinglestep( thread_t thread, int on); +extern kern_return_t thread_userstack( + thread_t, + int, + thread_state_t, + unsigned int, + mach_vm_offset_t *, + int *); + +kern_return_t thread_entrypoint( + thread_t, + int, + thread_state_t, + unsigned int, + mach_vm_offset_t *); + + extern kern_return_t thread_wire_internal( host_priv_t host_priv, thread_t thread, @@ -775,6 +832,10 @@ extern kern_return_t kernel_thread_start( thread_continue_t continuation, void *parameter, thread_t *new_thread); +#ifdef KERNEL_PRIVATE +void thread_set_eager_preempt(thread_t thread); +void thread_clear_eager_preempt(thread_t thread); +#endif /* KERNEL_PRIVATE */ __END_DECLS diff --git a/osfmk/kern/thread_act.c b/osfmk/kern/thread_act.c index 8c18ffc30..455a0fb01 100644 --- a/osfmk/kern/thread_act.c +++ b/osfmk/kern/thread_act.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #include #include @@ -314,12 +315,12 @@ act_abort( thread_lock(thread); - if (!(thread->sched_mode & TH_MODE_ABORT)) { - thread->sched_mode |= TH_MODE_ABORT; + if (!(thread->sched_flags & TH_SFLAG_ABORT)) { + thread->sched_flags |= TH_SFLAG_ABORT; install_special_handler_locked(thread); } else - thread->sched_mode &= ~TH_MODE_ABORTSAFELY; + thread->sched_flags &= ~TH_SFLAG_ABORTSAFELY; thread_unlock(thread); splx(s); @@ -365,8 +366,8 @@ thread_abort_safely( thread_lock(thread); if (!thread->at_safe_point || 
clear_wait_internal(thread, THREAD_INTERRUPTED) != KERN_SUCCESS) { - if (!(thread->sched_mode & TH_MODE_ABORT)) { - thread->sched_mode |= TH_MODE_ISABORTED; + if (!(thread->sched_flags & TH_SFLAG_ABORT)) { + thread->sched_flags |= TH_SFLAG_ABORTED_MASK; install_special_handler_locked(thread); } } @@ -460,12 +461,13 @@ thread_get_state( * Change thread's machine-dependent state. Called with nothing * locked. Returns same way. */ -kern_return_t -thread_set_state( +static kern_return_t +thread_set_state_internal( register thread_t thread, int flavor, thread_state_t state, - mach_msg_type_number_t state_count) + mach_msg_type_number_t state_count, + boolean_t from_user) { kern_return_t result = KERN_SUCCESS; @@ -500,11 +502,41 @@ thread_set_state( else result = KERN_TERMINATED; + if ((result == KERN_SUCCESS) && from_user) + extmod_statistics_incr_thread_set_state(thread); + thread_mtx_unlock(thread); return (result); } + +/* No prototype, since thread_act_server.h has the _from_user version if KERNEL_SERVER */ +kern_return_t +thread_set_state( + register thread_t thread, + int flavor, + thread_state_t state, + mach_msg_type_number_t state_count); + +kern_return_t +thread_set_state( + register thread_t thread, + int flavor, + thread_state_t state, + mach_msg_type_number_t state_count) +{ + return thread_set_state_internal(thread, flavor, state, state_count, FALSE); +} +kern_return_t +thread_set_state_from_user( + register thread_t thread, + int flavor, + thread_state_t state, + mach_msg_type_number_t state_count) +{ + return thread_set_state_internal(thread, flavor, state, state_count, TRUE); +} /* * Kernel-internal "thread" interfaces used outside this file: @@ -672,8 +704,8 @@ install_special_handler_locked( * a chance to do locking required to * block itself in special_handler(). 
*/ - if (thread->sched_mode & TH_MODE_ISDEPRESSED) - compute_priority(thread, TRUE); + if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) + SCHED(compute_priority)(thread, TRUE); thread_ast_set(thread, AST_APC); @@ -753,7 +785,7 @@ special_handler_continue(void) spl_t s = splsched(); thread_lock(thread); - if (thread->sched_mode & TH_MODE_ISDEPRESSED) { + if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) { processor_t myprocessor = thread->last_processor; thread->sched_pri = DEPRESSPRI; @@ -784,7 +816,7 @@ special_handler( s = splsched(); thread_lock(thread); - thread->sched_mode &= ~TH_MODE_ISABORTED; + thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK; thread_unlock(thread); splx(s); @@ -816,6 +848,14 @@ special_handler( thread_mtx_unlock(thread); } +/* Prototype, see justification above */ +kern_return_t +act_set_state( + thread_t thread, + int flavor, + thread_state_t state, + mach_msg_type_number_t count); + kern_return_t act_set_state( thread_t thread, @@ -830,6 +870,20 @@ act_set_state( } +kern_return_t +act_set_state_from_user( + thread_t thread, + int flavor, + thread_state_t state, + mach_msg_type_number_t count) +{ + if (thread == current_thread()) + return (KERN_INVALID_ARGUMENT); + + return (thread_set_state_from_user(thread, flavor, state, count)); + +} + kern_return_t act_get_state( thread_t thread, diff --git a/osfmk/kern/thread_call.c b/osfmk/kern/thread_call.c index a50c6d7d3..93edbc489 100644 --- a/osfmk/kern/thread_call.c +++ b/osfmk/kern/thread_call.c @@ -46,13 +46,12 @@ #include -decl_simple_lock_data(static,thread_call_lock) static zone_t thread_call_zone; struct thread_call_group { queue_head_t pending_queue; - uint32_t pending_count; + uint32_t pending_count; queue_head_t delayed_queue; @@ -60,7 +59,7 @@ struct thread_call_group { struct wait_queue idle_wqueue; struct wait_queue daemon_wqueue; - uint32_t idle_count, active_count; + uint32_t idle_count, active_count; }; typedef struct thread_call_group *thread_call_group_t; @@ -113,13 +112,32 
@@ static void thread_call_daemon( thread_call_thread( thread_call_group_t group); -static void thread_call_delayed_timer( +extern void thread_call_delayed_timer( timer_call_param_t p0, timer_call_param_t p1); #define qe(x) ((queue_entry_t)(x)) #define TC(x) ((thread_call_t)(x)) + +lck_grp_t thread_call_queues_lck_grp; +lck_grp_t thread_call_lck_grp; +lck_attr_t thread_call_lck_attr; +lck_grp_attr_t thread_call_lck_grp_attr; + +#if defined(__i386__) || defined(__x86_64__) +lck_mtx_t thread_call_lock_data; +#else +lck_spin_t thread_call_lock_data; +#endif + +#define thread_call_lock_spin() \ + lck_mtx_lock_spin_always(&thread_call_lock_data) + +#define thread_call_unlock() \ + lck_mtx_unlock_always(&thread_call_lock_data) + + /* * thread_call_initialize: * @@ -129,7 +147,7 @@ static void thread_call_delayed_timer( void thread_call_initialize(void) { - thread_call_t call; + thread_call_t call; thread_call_group_t group = &thread_call_group0; kern_return_t result; thread_t thread; @@ -138,33 +156,42 @@ thread_call_initialize(void) i = sizeof (thread_call_data_t); thread_call_zone = zinit(i, 4096 * i, 16 * i, "thread_call"); + zone_change(thread_call_zone, Z_CALLERACCT, FALSE); zone_change(thread_call_zone, Z_NOENCRYPT, TRUE); - simple_lock_init(&thread_call_lock, 0); + lck_attr_setdefault(&thread_call_lck_attr); + lck_grp_attr_setdefault(&thread_call_lck_grp_attr); + lck_grp_init(&thread_call_queues_lck_grp, "thread_call_queues", &thread_call_lck_grp_attr); + lck_grp_init(&thread_call_lck_grp, "thread_call", &thread_call_lck_grp_attr); - s = splsched(); - simple_lock(&thread_call_lock); +#if defined(__i386__) || defined(__x86_64__) + lck_mtx_init(&thread_call_lock_data, &thread_call_lck_grp, &thread_call_lck_attr); +#else + lck_spin_init(&thread_call_lock_data, &thread_call_lck_grp, &thread_call_lck_attr); +#endif + queue_init(&group->pending_queue); + queue_init(&group->delayed_queue); - queue_init(&group->pending_queue); - queue_init(&group->delayed_queue); + s = 
splsched(); + thread_call_lock_spin(); timer_call_setup(&group->delayed_timer, thread_call_delayed_timer, group); wait_queue_init(&group->idle_wqueue, SYNC_POLICY_FIFO); wait_queue_init(&group->daemon_wqueue, SYNC_POLICY_FIFO); - queue_init(&thread_call_internal_queue); - for ( + queue_init(&thread_call_internal_queue); + for ( call = internal_call_storage; call < &internal_call_storage[internal_call_count]; call++) { enqueue_tail(&thread_call_internal_queue, qe(call)); - } + } thread_call_daemon_awake = TRUE; - simple_unlock(&thread_call_lock); + thread_call_unlock(); splx(s); result = kernel_thread_start_priority((thread_continue_t)thread_call_daemon, group, BASEPRI_PREEMPT + 1, &thread); @@ -236,7 +263,7 @@ _pending_call_enqueue( thread_call_t call, thread_call_group_t group) { - queue_t old_queue; + queue_head_t *old_queue; old_queue = call_entry_enqueue_tail(call, &group->pending_queue); @@ -261,9 +288,9 @@ static __inline__ boolean_t _delayed_call_enqueue( thread_call_t call, thread_call_group_t group, - uint64_t deadline) + uint64_t deadline) { - queue_t old_queue; + queue_head_t *old_queue; old_queue = call_entry_enqueue_deadline(call, &group->delayed_queue, deadline); @@ -287,7 +314,7 @@ _call_dequeue( thread_call_t call, thread_call_group_t group) { - queue_t old_queue; + queue_head_t *old_queue; old_queue = call_entry_dequeue(call); @@ -310,7 +337,7 @@ _set_delayed_call_timer( thread_call_t call, thread_call_group_t group) { - timer_call_enter(&group->delayed_timer, call->deadline); + timer_call_enter(&group->delayed_timer, call->deadline, 0); } /* @@ -330,7 +357,7 @@ _remove_from_pending_queue( thread_call_param_t param0, boolean_t remove_all) { - boolean_t call_removed = FALSE; + boolean_t call_removed = FALSE; thread_call_t call; thread_call_group_t group = &thread_call_group0; @@ -424,7 +451,7 @@ thread_call_func( spl_t s; s = splsched(); - simple_lock(&thread_call_lock); + thread_call_lock_spin(); call = TC(queue_first(&group->pending_queue)); @@ 
-449,7 +476,7 @@ thread_call_func( thread_call_wake(group); } - simple_unlock(&thread_call_lock); + thread_call_unlock(); splx(s); } @@ -472,7 +499,7 @@ thread_call_func_delayed( spl_t s; s = splsched(); - simple_lock(&thread_call_lock); + thread_call_lock_spin(); call = _internal_call_allocate(); call->func = func; @@ -484,7 +511,7 @@ thread_call_func_delayed( if (queue_first(&group->delayed_queue) == qe(call)) _set_delayed_call_timer(call, group); - simple_unlock(&thread_call_lock); + thread_call_unlock(); splx(s); } @@ -510,7 +537,7 @@ thread_call_func_cancel( spl_t s; s = splsched(); - simple_lock(&thread_call_lock); + thread_call_lock_spin(); if (cancel_all) result = _remove_from_pending_queue(func, param, cancel_all) | @@ -519,7 +546,7 @@ thread_call_func_cancel( result = _remove_from_pending_queue(func, param, cancel_all) || _remove_from_delayed_queue(func, param, cancel_all); - simple_unlock(&thread_call_lock); + thread_call_unlock(); splx(s); return (result); @@ -554,16 +581,16 @@ thread_call_free( spl_t s; s = splsched(); - simple_lock(&thread_call_lock); + thread_call_lock_spin(); if (call->queue != NULL) { - simple_unlock(&thread_call_lock); - splx(s); + thread_call_unlock(); + splx(s); - return (FALSE); + return (FALSE); } - simple_unlock(&thread_call_lock); + thread_call_unlock(); splx(s); zfree(thread_call_zone, call); @@ -585,10 +612,10 @@ thread_call_enter( { boolean_t result = TRUE; thread_call_group_t group = &thread_call_group0; - spl_t s; + spl_t s; - s = splsched(); - simple_lock(&thread_call_lock); + s = splsched(); + thread_call_lock_spin(); if (call->queue != &group->pending_queue) { result = _pending_call_enqueue(call, group); @@ -599,8 +626,8 @@ thread_call_enter( call->param1 = 0; - simple_unlock(&thread_call_lock); - splx(s); + thread_call_unlock(); + splx(s); return (result); } @@ -612,10 +639,10 @@ thread_call_enter1( { boolean_t result = TRUE; thread_call_group_t group = &thread_call_group0; - spl_t s; + spl_t s; - s = splsched(); - 
simple_lock(&thread_call_lock); + s = splsched(); + thread_call_lock_spin(); if (call->queue != &group->pending_queue) { result = _pending_call_enqueue(call, group); @@ -626,8 +653,8 @@ thread_call_enter1( call->param1 = param1; - simple_unlock(&thread_call_lock); - splx(s); + thread_call_unlock(); + splx(s); return (result); } @@ -648,10 +675,10 @@ thread_call_enter_delayed( { boolean_t result = TRUE; thread_call_group_t group = &thread_call_group0; - spl_t s; + spl_t s; - s = splsched(); - simple_lock(&thread_call_lock); + s = splsched(); + thread_call_lock_spin(); result = _delayed_call_enqueue(call, group, deadline); @@ -660,8 +687,8 @@ thread_call_enter_delayed( call->param1 = 0; - simple_unlock(&thread_call_lock); - splx(s); + thread_call_unlock(); + splx(s); return (result); } @@ -674,10 +701,10 @@ thread_call_enter1_delayed( { boolean_t result = TRUE; thread_call_group_t group = &thread_call_group0; - spl_t s; + spl_t s; - s = splsched(); - simple_lock(&thread_call_lock); + s = splsched(); + thread_call_lock_spin(); result = _delayed_call_enqueue(call, group, deadline); @@ -686,8 +713,8 @@ thread_call_enter1_delayed( call->param1 = param1; - simple_unlock(&thread_call_lock); - splx(s); + thread_call_unlock(); + splx(s); return (result); } @@ -706,15 +733,15 @@ thread_call_cancel( { boolean_t result; thread_call_group_t group = &thread_call_group0; - spl_t s; + spl_t s; - s = splsched(); - simple_lock(&thread_call_lock); + s = splsched(); + thread_call_lock_spin(); result = _call_dequeue(call, group); - simple_unlock(&thread_call_lock); - splx(s); + thread_call_unlock(); + splx(s); return (result); } @@ -739,7 +766,7 @@ thread_call_is_delayed( spl_t s; s = splsched(); - simple_lock(&thread_call_lock); + thread_call_lock_spin(); if (call->queue == &group->delayed_queue) { if (deadline != NULL) @@ -747,7 +774,7 @@ thread_call_is_delayed( result = TRUE; } - simple_unlock(&thread_call_lock); + thread_call_unlock(); splx(s); return (result); @@ -769,13 +796,13 @@ 
static __inline__ void thread_call_wake( thread_call_group_t group) { - if (group->idle_count > 0 && wait_queue_wakeup_one(&group->idle_wqueue, NULL, THREAD_AWAKENED) == KERN_SUCCESS) { + if (group->idle_count > 0 && wait_queue_wakeup_one(&group->idle_wqueue, NO_EVENT, THREAD_AWAKENED, -1) == KERN_SUCCESS) { group->idle_count--; group->active_count++; } else if (!thread_call_daemon_awake) { thread_call_daemon_awake = TRUE; - wait_queue_wakeup_one(&group->daemon_wqueue, NULL, THREAD_AWAKENED); + wait_queue_wakeup_one(&group->daemon_wqueue, NO_EVENT, THREAD_AWAKENED, -1); } } @@ -791,7 +818,7 @@ __unused thread_t thread) { thread_call_group_t group = &thread_call_group0; - simple_lock(&thread_call_lock); + thread_call_lock_spin(); switch (type) { @@ -805,7 +832,7 @@ __unused thread_t thread) break; } - simple_unlock(&thread_call_lock); + thread_call_unlock(); } /* @@ -817,8 +844,8 @@ thread_call_thread( { thread_t self = current_thread(); - (void) splsched(); - simple_lock(&thread_call_lock); + (void) splsched(); + thread_call_lock_spin(); thread_sched_call(self, sched_call_thread); @@ -838,7 +865,7 @@ thread_call_thread( _internal_call_release(call); - simple_unlock(&thread_call_lock); + thread_call_unlock(); (void) spllo(); KERNEL_DEBUG_CONSTANT( @@ -847,10 +874,16 @@ thread_call_thread( (*func)(param0, param1); + if (get_preemption_level() != 0) { + int pl = get_preemption_level(); + panic("thread_call_thread: preemption_level %d, last callout %p(%p, %p)", + pl, func, param0, param1); + } + (void)thread_funnel_set(self->funnel_lock, FALSE); /* XXX */ (void) splsched(); - simple_lock(&thread_call_lock); + thread_call_lock_spin(); } thread_sched_call(self, NULL); @@ -859,16 +892,16 @@ thread_call_thread( if (group->idle_count < thread_call_thread_min) { group->idle_count++; - wait_queue_assert_wait(&group->idle_wqueue, NULL, THREAD_UNINT, 0); + wait_queue_assert_wait(&group->idle_wqueue, NO_EVENT, THREAD_UNINT, 0); - simple_unlock(&thread_call_lock); + 
thread_call_unlock(); (void) spllo(); thread_block_parameter((thread_continue_t)thread_call_thread, group); /* NOTREACHED */ } - simple_unlock(&thread_call_lock); + thread_call_unlock(); (void) spllo(); thread_terminate(self); @@ -886,12 +919,12 @@ thread_call_daemon_continue( thread_t thread; (void) splsched(); - simple_lock(&thread_call_lock); + thread_call_lock_spin(); while (group->active_count == 0 && group->pending_count > 0) { group->active_count++; - simple_unlock(&thread_call_lock); + thread_call_unlock(); (void) spllo(); result = kernel_thread_start_priority((thread_continue_t)thread_call_thread, group, BASEPRI_PREEMPT, &thread); @@ -901,13 +934,13 @@ thread_call_daemon_continue( thread_deallocate(thread); (void) splsched(); - simple_lock(&thread_call_lock); + thread_call_lock_spin(); } thread_call_daemon_awake = FALSE; - wait_queue_assert_wait(&group->daemon_wqueue, NULL, THREAD_UNINT, 0); + wait_queue_assert_wait(&group->daemon_wqueue, NO_EVENT, THREAD_UNINT, 0); - simple_unlock(&thread_call_lock); + thread_call_unlock(); (void) spllo(); thread_block_parameter((thread_continue_t)thread_call_daemon_continue, group); @@ -927,7 +960,7 @@ thread_call_daemon( /* NOTREACHED */ } -static void +void thread_call_delayed_timer( timer_call_param_t p0, __unused timer_call_param_t p1 @@ -938,7 +971,7 @@ thread_call_delayed_timer( boolean_t new_pending = FALSE; uint64_t timestamp; - simple_lock(&thread_call_lock); + thread_call_lock_spin(); timestamp = mach_absolute_time(); @@ -961,5 +994,5 @@ thread_call_delayed_timer( if (new_pending && group->active_count == 0) thread_call_wake(group); - simple_unlock(&thread_call_lock); + thread_call_unlock(); } diff --git a/osfmk/kern/thread_policy.c b/osfmk/kern/thread_policy.c index 58028df2d..7ed70a151 100644 --- a/osfmk/kern/thread_policy.c +++ b/osfmk/kern/thread_policy.c @@ -38,6 +38,14 @@ static void thread_recompute_priority( thread_t thread); +#if CONFIG_EMBEDDED +static void +thread_throttle( + thread_t thread, + 
integer_t task_priority); + +extern int mach_do_background_thread(thread_t thread, int prio); +#endif kern_return_t @@ -86,37 +94,40 @@ thread_policy_set_internal( timeshare = info->timeshare; } + if (!SCHED(supports_timeshare_mode)()) + timeshare = FALSE; + s = splsched(); thread_lock(thread); - if (!(thread->sched_mode & TH_MODE_FAILSAFE)) { - integer_t oldmode = (thread->sched_mode & TH_MODE_TIMESHARE); + if (!(thread->sched_flags & TH_SFLAG_DEMOTED_MASK)) { + integer_t oldmode = (thread->sched_mode == TH_MODE_TIMESHARE); - thread->sched_mode &= ~TH_MODE_REALTIME; + if (timeshare) { + thread->sched_mode = TH_MODE_TIMESHARE; - if (timeshare && !oldmode) { - thread->sched_mode |= TH_MODE_TIMESHARE; - - if ((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN) - sched_share_incr(); + if (!oldmode) { + if ((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN) + sched_share_incr(); + } } - else - if (!timeshare && oldmode) { - thread->sched_mode &= ~TH_MODE_TIMESHARE; + else { + thread->sched_mode = TH_MODE_FIXED; - if ((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN) - sched_share_decr(); + if (oldmode) { + if ((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN) + sched_share_decr(); + } } thread_recompute_priority(thread); } else { - thread->safe_mode &= ~TH_MODE_REALTIME; if (timeshare) - thread->safe_mode |= TH_MODE_TIMESHARE; + thread->saved_mode = TH_MODE_TIMESHARE; else - thread->safe_mode &= ~TH_MODE_TIMESHARE; + thread->saved_mode = TH_MODE_FIXED; } thread_unlock(thread); @@ -150,20 +161,23 @@ thread_policy_set_internal( thread->realtime.constraint = info->constraint; thread->realtime.preemptible = info->preemptible; - if (!(thread->sched_mode & TH_MODE_FAILSAFE)) { - if (thread->sched_mode & TH_MODE_TIMESHARE) { - thread->sched_mode &= ~TH_MODE_TIMESHARE; - + if (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) { + thread->saved_mode = TH_MODE_REALTIME; + } +#if CONFIG_EMBEDDED + else if (thread->task_priority <= MAXPRI_THROTTLE) { + thread->saved_mode = TH_MODE_REALTIME; + 
thread->sched_flags |= TH_SFLAG_THROTTLED; + } +#endif + else { + if (thread->sched_mode == TH_MODE_TIMESHARE) { if ((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN) sched_share_decr(); } - thread->sched_mode |= TH_MODE_REALTIME; + thread->sched_mode = TH_MODE_REALTIME; thread_recompute_priority(thread); } - else { - thread->safe_mode &= ~TH_MODE_TIMESHARE; - thread->safe_mode |= TH_MODE_REALTIME; - } thread_unlock(thread); splx(s); @@ -217,6 +231,19 @@ thread_policy_set_internal( thread_mtx_unlock(thread); return thread_affinity_set(thread, info->affinity_tag); } + +#if CONFIG_EMBEDDED + case THREAD_BACKGROUND_POLICY: + { + thread_background_policy_t info; + + info = (thread_background_policy_t) policy_info; + + thread_mtx_unlock(thread); + return mach_do_background_thread(thread, info->priority); + } +#endif /* CONFIG_EMBEDDED */ + default: result = KERN_INVALID_ARGUMENT; break; @@ -232,7 +259,7 @@ thread_recompute_priority( { integer_t priority; - if (thread->sched_mode & TH_MODE_REALTIME) + if (thread->sched_mode == TH_MODE_REALTIME) priority = BASEPRI_RTQUEUES; else { if (thread->importance > MAXPRI) @@ -250,11 +277,75 @@ thread_recompute_priority( else if (priority < MINPRI) priority = MINPRI; +#if CONFIG_EMBEDDED + /* No one can have a base priority less than MAXPRI_THROTTLE */ + if (priority < MAXPRI_THROTTLE) + priority = MAXPRI_THROTTLE; +#endif /* CONFIG_EMBEDDED */ } set_priority(thread, priority); } +#if CONFIG_EMBEDDED +static void +thread_throttle( + thread_t thread, + integer_t task_priority) +{ + if (!(thread->sched_flags & TH_SFLAG_THROTTLED) && + (task_priority <= MAXPRI_THROTTLE)) { + + if (!((thread->sched_mode == TH_MODE_REALTIME) || + (thread->saved_mode == TH_MODE_REALTIME))) { + return; + } + + /* Demote to timeshare if throttling */ + if (thread->sched_mode == TH_MODE_REALTIME) + { + thread->saved_mode = TH_MODE_REALTIME; + + if (thread->sched_mode == TH_MODE_TIMESHARE) { + if ((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN) + 
sched_share_incr(); + } + } + + /* TH_SFLAG_FAILSAFE and TH_SFLAG_THROTTLED are mutually exclusive, + * since a throttled thread is not realtime during the throttle + * and doesn't need the failsafe repromotion. We therefore clear + * the former and set the latter flags here. + */ + thread->sched_flags &= ~TH_SFLAG_FAILSAFE; + thread->sched_flags |= TH_SFLAG_THROTTLED; + + if (SCHED(supports_timeshare_mode)()) + thread->sched_mode = TH_MODE_TIMESHARE; + else + thread->sched_mode = TH_MODE_FIXED; + } + else if ((thread->sched_flags & TH_SFLAG_THROTTLED) && + (task_priority > MAXPRI_THROTTLE)) { + + /* Promote back to real time if unthrottling */ + if (!(thread->saved_mode == TH_MODE_TIMESHARE)) { + + thread->sched_mode = thread->saved_mode; + + if (thread->sched_mode == TH_MODE_TIMESHARE) { + if ((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN) + sched_share_decr(); + } + + thread->saved_mode = TH_MODE_NONE; + } + + thread->sched_flags &= ~TH_SFLAG_THROTTLED; + } +} +#endif + void thread_task_priority( thread_t thread, @@ -268,6 +359,10 @@ thread_task_priority( s = splsched(); thread_lock(thread); +#if CONFIG_EMBEDDED + thread_throttle(thread, priority); +#endif + thread->task_priority = priority; thread->max_priority = max_priority; @@ -286,19 +381,20 @@ thread_policy_reset( s = splsched(); thread_lock(thread); - if (!(thread->sched_mode & TH_MODE_FAILSAFE)) { - thread->sched_mode &= ~TH_MODE_REALTIME; + if (!(thread->sched_flags & TH_SFLAG_DEMOTED_MASK)) { + sched_mode_t oldmode = thread->sched_mode; + + thread->sched_mode = SCHED(initial_thread_sched_mode)(thread->task); - if (!(thread->sched_mode & TH_MODE_TIMESHARE)) { - thread->sched_mode |= TH_MODE_TIMESHARE; + if ((oldmode != TH_MODE_TIMESHARE) && (thread->sched_mode == TH_MODE_TIMESHARE)) { if ((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN) sched_share_incr(); } } else { - thread->safe_mode = 0; - thread->sched_mode &= ~TH_MODE_FAILSAFE; + thread->saved_mode = TH_MODE_NONE; + thread->sched_flags &= 
~TH_SFLAG_DEMOTED_MASK; } thread->importance = 0; @@ -340,12 +436,12 @@ thread_policy_get( s = splsched(); thread_lock(thread); - if ( !(thread->sched_mode & TH_MODE_REALTIME) && - !(thread->safe_mode & TH_MODE_REALTIME) ) { - if (!(thread->sched_mode & TH_MODE_FAILSAFE)) - timeshare = (thread->sched_mode & TH_MODE_TIMESHARE) != 0; + if ( (thread->sched_mode != TH_MODE_REALTIME) && + (thread->saved_mode != TH_MODE_REALTIME) ) { + if (!(thread->sched_flags & TH_SFLAG_DEMOTED_MASK)) + timeshare = (thread->sched_mode == TH_MODE_TIMESHARE) != 0; else - timeshare = (thread->safe_mode & TH_MODE_TIMESHARE) != 0; + timeshare = (thread->saved_mode == TH_MODE_TIMESHARE) != 0; } else *get_default = TRUE; @@ -379,8 +475,8 @@ thread_policy_get( s = splsched(); thread_lock(thread); - if ( (thread->sched_mode & TH_MODE_REALTIME) || - (thread->safe_mode & TH_MODE_REALTIME) ) { + if ( (thread->sched_mode == TH_MODE_REALTIME) || + (thread->saved_mode == TH_MODE_REALTIME) ) { info->period = thread->realtime.period; info->computation = thread->realtime.computation; info->constraint = thread->realtime.constraint; @@ -395,8 +491,8 @@ thread_policy_get( if (*get_default) { info->period = 0; - info->computation = std_quantum / 2; - info->constraint = std_quantum; + info->computation = default_timeshare_computation; + info->constraint = default_timeshare_constraint; info->preemptible = TRUE; } diff --git a/osfmk/kern/timer_call.c b/osfmk/kern/timer_call.c index 74c4534a2..83eb0e43a 100644 --- a/osfmk/kern/timer_call.c +++ b/osfmk/kern/timer_call.c @@ -44,235 +44,432 @@ #include #endif -decl_simple_lock_data(static,timer_call_lock) -#define qe(x) ((queue_entry_t)(x)) -#define TC(x) ((timer_call_t)(x)) +#if DEBUG +#define TIMER_ASSERT 1 +#endif + +//#define TIMER_ASSERT 1 +//#define TIMER_DBG 1 + +#if TIMER_DBG +#define DBG(x...) kprintf("DBG: " x); +#else +#define DBG(x...) 
+#endif + +lck_grp_t timer_call_lck_grp; +lck_attr_t timer_call_lck_attr; +lck_grp_attr_t timer_call_lck_grp_attr; + + +#define timer_call_lock_spin(queue) \ + lck_mtx_lock_spin_always(&queue->lock_data) + +#define timer_call_unlock(queue) \ + lck_mtx_unlock_always(&queue->lock_data) + + +#define QUEUE(x) ((queue_t)(x)) +#define MPQUEUE(x) ((mpqueue_head_t *)(x)) +#define TIMER_CALL(x) ((timer_call_t)(x)) + +static boolean_t timer_call_enter_internal(timer_call_t call, timer_call_param_t param1, uint64_t deadline, uint32_t flags); +boolean_t mach_timer_coalescing_enabled = TRUE; + +mpqueue_head_t *timer_call_enqueue_deadline_unlocked( + timer_call_t call, + mpqueue_head_t *queue, + uint64_t deadline); + +mpqueue_head_t *timer_call_dequeue_unlocked( + timer_call_t call); + void timer_call_initialize(void) { - simple_lock_init(&timer_call_lock, 0); + lck_attr_setdefault(&timer_call_lck_attr); + lck_grp_attr_setdefault(&timer_call_lck_grp_attr); + lck_grp_init(&timer_call_lck_grp, "timer_call", &timer_call_lck_grp_attr); } + +void +timer_call_initialize_queue(mpqueue_head_t *queue) +{ + DBG("timer_call_initialize_queue(%p)\n", queue); + mpqueue_init(queue, &timer_call_lck_grp, &timer_call_lck_attr); +} + + void timer_call_setup( timer_call_t call, timer_call_func_t func, timer_call_param_t param0) { - call_entry_setup(call, func, param0); + DBG("timer_call_setup(%p,%p,%p)\n", call, func, param0); + call_entry_setup(CE(call), func, param0); + simple_lock_init(&(call)->lock, 0); + call->async_dequeue = FALSE; } -__inline__ queue_t -call_entry_enqueue_deadline( - call_entry_t entry, - queue_t queue, - uint64_t deadline) -{ - queue_t old_queue = entry->queue; - timer_call_t current; - - if (old_queue != queue || entry->deadline < deadline) { - if (old_queue != queue) - current = TC(queue_first(queue)); - else - current = TC(queue_next(qe(entry))); - - if (old_queue != NULL) - (void)remque(qe(entry)); +/* + * Timer call entry locking model + * 
============================== + * + * Timer call entries are linked on per-cpu timer queues which are protected + * by the queue lock and the call entry lock. The locking protocol is: + * + * 0) The canonical locking order is timer call entry followed by queue. + * + * 1) With only the entry lock held, entry.queue is valid: + * 1a) NULL: the entry is not queued, or + * 1b) non-NULL: this queue must be locked before the entry is modified. + * After locking the queue, the call.async_dequeue flag must be checked: + * 1c) TRUE: the entry was removed from the queue by another thread + * and we must NULL the entry.queue and reset this flag, or + * 1d) FALSE: (ie. queued), the entry can be manipulated. + * + * 2) If a queue lock is obtained first, the queue is stable: + * 2a) If a try-lock of a queued entry succeeds, the call can be operated on + * and dequeued. + * 2b) If a try-lock fails, it indicates that another thread is attempting + * to change the entry and move it to a different position in this queue + * or to different queue. The entry can be dequeued but it should not be + * operated upon since it is being changed. Furthermore, we don't null + * the entry.queue pointer (protected by the entry lock we don't own). + * Instead, we set the async_dequeue flag -- see (1c). + */ - while (TRUE) { - if ( queue_end(queue, qe(current)) || - deadline < current->deadline ) { - current = TC(queue_prev(qe(current))); - break; - } +/* + * Inlines timer_call_entry_dequeue() and timer_call_entry_enqueue_deadline() + * cast between pointer types (mpqueue_head_t *) and (queue_t) so that + * we can use the call_entry_dequeue() and call_entry_enqueue_deadline() + * methods to operate on timer_call structs as if they are call_entry structs. + * These structures are identical except for their queue head pointer fields. + * + * In the debug case, we assert that the timer call locking protocol + * is being obeyed. 
+ */ +#if TIMER_ASSERT +static __inline__ mpqueue_head_t * +timer_call_entry_dequeue( + timer_call_t entry) +{ + mpqueue_head_t *old_queue = MPQUEUE(CE(entry)->queue); + + if (!hw_lock_held((hw_lock_t)&entry->lock)) + panic("_call_entry_dequeue() " + "entry %p is not locked\n", entry); + /* + * XXX The queue lock is actually a mutex in spin mode + * but there's no way to test for it being held + * so we pretend it's a spinlock! + */ + if (!hw_lock_held((hw_lock_t)&old_queue->lock_data)) + panic("_call_entry_dequeue() " + "queue %p is not locked\n", old_queue); + + call_entry_dequeue(CE(entry)); - current = TC(queue_next(qe(current))); - } + return (old_queue); +} - insque(qe(entry), qe(current)); - } - else - if (deadline < entry->deadline) { - current = TC(queue_prev(qe(entry))); +static __inline__ mpqueue_head_t * +timer_call_entry_enqueue_deadline( + timer_call_t entry, + mpqueue_head_t *queue, + uint64_t deadline) +{ + mpqueue_head_t *old_queue = MPQUEUE(CE(entry)->queue); - (void)remque(qe(entry)); + if (!hw_lock_held((hw_lock_t)&entry->lock)) + panic("_call_entry_enqueue_deadline() " + "entry %p is not locked\n", entry); + /* XXX More lock pretense: */ + if (!hw_lock_held((hw_lock_t)&queue->lock_data)) + panic("_call_entry_enqueue_deadline() " + "queue %p is not locked\n", queue); + if (old_queue != NULL && old_queue != queue) + panic("_call_entry_enqueue_deadline() " + "old_queue %p != queue", old_queue); - while (TRUE) { - if ( queue_end(queue, qe(current)) || - current->deadline <= deadline ) { - break; - } + call_entry_enqueue_deadline(CE(entry), QUEUE(queue), deadline); - current = TC(queue_prev(qe(current))); - } + return (old_queue); +} - insque(qe(entry), qe(current)); - } +#else - entry->queue = queue; - entry->deadline = deadline; +static __inline__ mpqueue_head_t * +timer_call_entry_dequeue( + timer_call_t entry) +{ + return MPQUEUE(call_entry_dequeue(CE(entry))); +} - return (old_queue); +static __inline__ mpqueue_head_t * 
+timer_call_entry_enqueue_deadline( + timer_call_t entry, + mpqueue_head_t *queue, + uint64_t deadline) +{ + return MPQUEUE(call_entry_enqueue_deadline(CE(entry), + QUEUE(queue), deadline)); } -__inline__ queue_t -call_entry_enqueue_tail( - call_entry_t entry, - queue_t queue) +#endif + +#if TIMER_ASSERT +unsigned timer_call_enqueue_deadline_unlocked_async1; +unsigned timer_call_enqueue_deadline_unlocked_async2; +#endif +/* + * Assumes call_entry and queues unlocked, interrupts disabled. + */ +__inline__ mpqueue_head_t * +timer_call_enqueue_deadline_unlocked( + timer_call_t call, + mpqueue_head_t *queue, + uint64_t deadline) { - queue_t old_queue = entry->queue; + call_entry_t entry = CE(call); + mpqueue_head_t *old_queue; - if (old_queue != NULL) - (void)remque(qe(entry)); + DBG("timer_call_enqueue_deadline_unlocked(%p,%p,)\n", call, queue); - enqueue_tail(queue, qe(entry)); + simple_lock(&call->lock); + old_queue = MPQUEUE(entry->queue); + if (old_queue != NULL) { + timer_call_lock_spin(old_queue); + if (call->async_dequeue) { + /* collision (1c): null queue pointer and reset flag */ + call->async_dequeue = FALSE; + entry->queue = NULL; +#if TIMER_ASSERT + timer_call_enqueue_deadline_unlocked_async1++; +#endif + } else if (old_queue != queue) { + (void)remque(qe(entry)); + entry->queue = NULL; +#if TIMER_ASSERT + timer_call_enqueue_deadline_unlocked_async2++; +#endif + } + if (old_queue != queue) { + timer_call_unlock(old_queue); + timer_call_lock_spin(queue); + } + } else { + timer_call_lock_spin(queue); + } - entry->queue = queue; + timer_call_entry_enqueue_deadline(call, queue, deadline); + timer_call_unlock(queue); + simple_unlock(&call->lock); return (old_queue); } -__inline__ queue_t -call_entry_dequeue( - call_entry_t entry) +#if TIMER_ASSERT +unsigned timer_call_dequeue_unlocked_async1; +unsigned timer_call_dequeue_unlocked_async2; +#endif +mpqueue_head_t * +timer_call_dequeue_unlocked( + timer_call_t call) { - queue_t old_queue = entry->queue; + 
call_entry_t entry = CE(call); + mpqueue_head_t *old_queue; - if (old_queue != NULL) - (void)remque(qe(entry)); - - entry->queue = NULL; + DBG("timer_call_dequeue_unlocked(%p)\n", call); + simple_lock(&call->lock); + old_queue = MPQUEUE(entry->queue); + if (old_queue != NULL) { + timer_call_lock_spin(old_queue); + if (call->async_dequeue) { + /* collision (1c): null queue pointer and reset flag */ + call->async_dequeue = FALSE; +#if TIMER_ASSERT + timer_call_dequeue_unlocked_async1++; +#endif + } else { + (void)remque(qe(entry)); +#if TIMER_ASSERT + timer_call_dequeue_unlocked_async2++; +#endif + } + entry->queue = NULL; + timer_call_unlock(old_queue); + } + simple_unlock(&call->lock); return (old_queue); } -boolean_t -timer_call_enter( - timer_call_t call, - uint64_t deadline) +static boolean_t +timer_call_enter_internal( + timer_call_t call, + timer_call_param_t param1, + uint64_t deadline, + uint32_t flags) { - queue_t queue, old_queue; + mpqueue_head_t *queue; + mpqueue_head_t *old_queue; spl_t s; + uint64_t slop = 0; s = splclock(); - simple_lock(&timer_call_lock); + + call->soft_deadline = deadline; + call->flags = flags; + + if ((flags & TIMER_CALL_CRITICAL) == 0 && + mach_timer_coalescing_enabled) { + slop = timer_call_slop(deadline); + deadline += slop; + } queue = timer_queue_assign(deadline); - old_queue = call_entry_enqueue_deadline(call, queue, deadline); + old_queue = timer_call_enqueue_deadline_unlocked(call, queue, deadline); - call->param1 = NULL; + CE(call)->param1 = param1; - simple_unlock(&timer_call_lock); splx(s); return (old_queue != NULL); } +boolean_t +timer_call_enter( + timer_call_t call, + uint64_t deadline, + uint32_t flags) +{ + return timer_call_enter_internal(call, NULL, deadline, flags); +} + boolean_t timer_call_enter1( timer_call_t call, timer_call_param_t param1, - uint64_t deadline) + uint64_t deadline, + uint32_t flags) { - queue_t queue, old_queue; - spl_t s; - - s = splclock(); - simple_lock(&timer_call_lock); - - queue = 
timer_queue_assign(deadline); - - old_queue = call_entry_enqueue_deadline(call, queue, deadline); - - call->param1 = param1; - - simple_unlock(&timer_call_lock); - splx(s); - - return (old_queue != NULL); + return timer_call_enter_internal(call, param1, deadline, flags); } boolean_t timer_call_cancel( timer_call_t call) { - queue_t old_queue; + mpqueue_head_t *old_queue; spl_t s; s = splclock(); - simple_lock(&timer_call_lock); - old_queue = call_entry_dequeue(call); + old_queue = timer_call_dequeue_unlocked(call); if (old_queue != NULL) { - if (!queue_empty(old_queue)) - timer_queue_cancel(old_queue, call->deadline, TC(queue_first(old_queue))->deadline); + timer_call_lock_spin(old_queue); + if (!queue_empty(&old_queue->head)) + timer_queue_cancel(old_queue, CE(call)->deadline, CE(queue_first(&old_queue->head))->deadline); else - timer_queue_cancel(old_queue, call->deadline, UINT64_MAX); + timer_queue_cancel(old_queue, CE(call)->deadline, UINT64_MAX); + timer_call_unlock(old_queue); } - - simple_unlock(&timer_call_lock); splx(s); return (old_queue != NULL); } +uint32_t timer_queue_shutdown_lock_skips; void timer_queue_shutdown( - queue_t queue) + mpqueue_head_t *queue) { - timer_call_t call; - queue_t new_queue; + timer_call_t call; + mpqueue_head_t *new_queue; spl_t s; + DBG("timer_queue_shutdown(%p)\n", queue); + s = splclock(); - simple_lock(&timer_call_lock); - call = TC(queue_first(queue)); + /* Note comma operator in while expression re-locking each iteration */ + while (timer_call_lock_spin(queue), !queue_empty(&queue->head)) { + call = TIMER_CALL(queue_first(&queue->head)); + if (!simple_lock_try(&call->lock)) { + /* + * case (2b) lock order inversion, dequeue and skip + * Don't change the call_entry queue back-pointer + * but set the async_dequeue field. 
+ */ + timer_queue_shutdown_lock_skips++; + (void) remque(qe(call)); + call->async_dequeue = TRUE; + timer_call_unlock(queue); + continue; + } - while (!queue_end(queue, qe(call))) { - new_queue = timer_queue_assign(call->deadline); + /* remove entry from old queue */ + timer_call_entry_dequeue(call); + timer_call_unlock(queue); - call_entry_enqueue_deadline(call, new_queue, call->deadline); + /* and queue it on new */ + new_queue = timer_queue_assign(CE(call)->deadline); + timer_call_lock_spin(new_queue); + timer_call_entry_enqueue_deadline( + call, new_queue, CE(call)->deadline); + timer_call_unlock(new_queue); - call = TC(queue_first(queue)); + simple_unlock(&call->lock); } - simple_unlock(&timer_call_lock); + timer_call_unlock(queue); splx(s); } +uint32_t timer_queue_expire_lock_skips; uint64_t timer_queue_expire( - queue_t queue, + mpqueue_head_t *queue, uint64_t deadline) { timer_call_t call; - simple_lock(&timer_call_lock); + DBG("timer_queue_expire(%p,)\n", queue); + + timer_call_lock_spin(queue); - call = TC(queue_first(queue)); + while (!queue_empty(&queue->head)) { + call = TIMER_CALL(queue_first(&queue->head)); - while (!queue_end(queue, qe(call))) { - if (call->deadline <= deadline) { + if (call->soft_deadline <= deadline) { timer_call_func_t func; timer_call_param_t param0, param1; - call_entry_dequeue(call); + if (!simple_lock_try(&call->lock)) { + /* case (2b) lock inversion, dequeue and skip */ + timer_queue_expire_lock_skips++; + (void) remque(qe(call)); + call->async_dequeue = TRUE; + continue; + } + + timer_call_entry_dequeue(call); - func = call->func; - param0 = call->param0; - param1 = call->param1; + func = CE(call)->func; + param0 = CE(call)->param0; + param1 = CE(call)->param1; - simple_unlock(&timer_call_lock); + simple_unlock(&call->lock); + timer_call_unlock(queue); - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_EXCP_DECI, - 2) - | DBG_FUNC_START, + KERNEL_DEBUG_CONSTANT(DECR_TIMER_CALLOUT | DBG_FUNC_START, func, param0, param1, 0, 0); 
@@ -291,27 +488,120 @@ timer_queue_expire( timer_call_param_t, param1); #endif - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_EXCP_DECI, - 2) - | DBG_FUNC_END, + KERNEL_DEBUG_CONSTANT(DECR_TIMER_CALLOUT | DBG_FUNC_END, func, param0, param1, 0, 0); - simple_lock(&timer_call_lock); + timer_call_lock_spin(queue); } else break; - - call = TC(queue_first(queue)); } - if (!queue_end(queue, qe(call))) - deadline = call->deadline; + if (!queue_empty(&queue->head)) + deadline = CE(call)->deadline; else deadline = UINT64_MAX; - simple_unlock(&timer_call_lock); + timer_call_unlock(queue); return (deadline); } + + +extern int serverperfmode; +uint32_t timer_queue_migrate_lock_skips; +/* + * timer_queue_migrate() is called by etimer_queue_migrate() + * to move timer requests from the local processor (queue_from) + * to a target processor's (queue_to). + */ +int +timer_queue_migrate(mpqueue_head_t *queue_from, mpqueue_head_t *queue_to) +{ + timer_call_t call; + timer_call_t head_to; + int timers_migrated = 0; + + DBG("timer_queue_migrate(%p,%p)\n", queue_from, queue_to); + + assert(!ml_get_interrupts_enabled()); + assert(queue_from != queue_to); + + if (serverperfmode) { + /* + * if we're running a high end server + * avoid migrations... they add latency + * and don't save us power under typical + * server workloads + */ + return -4; + } + + /* + * Take both local (from) and target (to) timer queue locks while + * moving the timers from the local queue to the target processor. + * We assume that the target is always the boot processor. + * But only move if all of the following is true: + * - the target queue is non-empty + * - the local queue is non-empty + * - the local queue's first deadline is later than the target's + * - the local queue contains no non-migrateable "local" call + * so that we need not have the target resync. 
+ */ + + timer_call_lock_spin(queue_to); + + head_to = TIMER_CALL(queue_first(&queue_to->head)); + if (queue_empty(&queue_to->head)) { + timers_migrated = -1; + goto abort1; + } + + timer_call_lock_spin(queue_from); + + if (queue_empty(&queue_from->head)) { + timers_migrated = -2; + goto abort2; + } + + call = TIMER_CALL(queue_first(&queue_from->head)); + if (CE(call)->deadline < CE(head_to)->deadline) { + timers_migrated = 0; + goto abort2; + } + + /* perform scan for non-migratable timers */ + do { + if (call->flags & TIMER_CALL_LOCAL) { + timers_migrated = -3; + goto abort2; + } + call = TIMER_CALL(queue_next(qe(call))); + } while (!queue_end(&queue_from->head, qe(call))); + + /* migration loop itself -- both queues are locked */ + while (!queue_empty(&queue_from->head)) { + call = TIMER_CALL(queue_first(&queue_from->head)); + if (!simple_lock_try(&call->lock)) { + /* case (2b) lock order inversion, dequeue only */ + timer_queue_migrate_lock_skips++; + (void) remque(qe(call)); + call->async_dequeue = TRUE; + continue; + } + timer_call_entry_dequeue(call); + timer_call_entry_enqueue_deadline( + call, queue_to, CE(call)->deadline); + timers_migrated++; + simple_unlock(&call->lock); + } + +abort2: + timer_call_unlock(queue_from); +abort1: + timer_call_unlock(queue_to); + + return timers_migrated; +} diff --git a/osfmk/kern/timer_call.h b/osfmk/kern/timer_call.h index 061e3d96c..f2a074d39 100644 --- a/osfmk/kern/timer_call.h +++ b/osfmk/kern/timer_call.h @@ -36,30 +36,47 @@ #ifdef MACH_KERNEL_PRIVATE -typedef struct call_entry *timer_call_t; +#include + +/* + * NOTE: for now, bsd/dev/dtrace/dtrace_glue.c has its own definition + * of this data structure, and the two had better match. 
+ */ +typedef struct timer_call { + struct call_entry call_entry; + decl_simple_lock_data( ,lock); /* protects call_entry queue */ + uint64_t soft_deadline; + uint32_t flags; + boolean_t async_dequeue; /* this field is protected by + call_entry queue's lock */ +} *timer_call_t; + typedef void *timer_call_param_t; typedef void (*timer_call_func_t)( timer_call_param_t param0, timer_call_param_t param1); - +#define TIMER_CALL_CRITICAL 0x01 +#define TIMER_CALL_LOCAL 0x02 extern boolean_t timer_call_enter( timer_call_t call, - uint64_t deadline); + uint64_t deadline, + uint32_t flags); extern boolean_t timer_call_enter1( timer_call_t call, timer_call_param_t param1, - uint64_t deadline); + uint64_t deadline, + uint32_t flags); extern boolean_t timer_call_cancel( timer_call_t call); -#include - -typedef struct call_entry timer_call_data_t; +typedef struct timer_call timer_call_data_t; extern void timer_call_initialize(void); +extern void timer_call_initialize_queue(mpqueue_head_t *); + extern void timer_call_setup( timer_call_t call, timer_call_func_t func, diff --git a/osfmk/kern/timer_queue.h b/osfmk/kern/timer_queue.h index 050b09afa..3975b3101 100644 --- a/osfmk/kern/timer_queue.h +++ b/osfmk/kern/timer_queue.h @@ -43,14 +43,17 @@ */ /* Request an expiration deadline, returns queue association */ -extern queue_t timer_queue_assign( +extern mpqueue_head_t *timer_queue_assign( + uint64_t deadline); + +extern uint64_t timer_call_slop( uint64_t deadline); /* Cancel an associated expiration deadline and specify new deadline */ -extern void timer_queue_cancel( - queue_t queue, - uint64_t deadline, - uint64_t new_deadline); +extern void timer_queue_cancel( + mpqueue_head_t *queue, + uint64_t deadline, + uint64_t new_deadline); /* * Invoked by platform, implemented by kernel. 
@@ -58,12 +61,17 @@ extern void timer_queue_cancel( /* Process deadline expiration for queue, returns new deadline */ extern uint64_t timer_queue_expire( - queue_t queue, - uint64_t deadline); + mpqueue_head_t *queue, + uint64_t deadline); /* Shutdown a timer queue and reassign existing activities */ -extern void timer_queue_shutdown( - queue_t queue); +extern void timer_queue_shutdown( + mpqueue_head_t *queue); + +/* Move timer requests from one queue to another */ +extern int timer_queue_migrate( + mpqueue_head_t *from, + mpqueue_head_t *to); #endif /* MACH_KERNEL_PRIVATE */ diff --git a/osfmk/kern/wait_queue.c b/osfmk/kern/wait_queue.c index 6763ac65c..14cd08724 100644 --- a/osfmk/kern/wait_queue.c +++ b/osfmk/kern/wait_queue.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -628,6 +628,25 @@ wait_queue_link( return ret; } +wait_queue_link_t +wait_queue_link_allocate(void) +{ + wait_queue_link_t wql; + + wql = zalloc(_wait_queue_link_zone); /* Can't fail */ + bzero(wql, sizeof(*wql)); + wql->wql_type = WAIT_QUEUE_UNLINKED; + + return wql; +} + +kern_return_t +wait_queue_link_free(wait_queue_link_t wql) +{ + zfree(_wait_queue_link_zone, wql); + return KERN_SUCCESS; +} + /* * Routine: wait_queue_unlink_locked @@ -848,6 +867,48 @@ wait_queue_set_unlink_all( return(KERN_SUCCESS); } +kern_return_t +wait_queue_set_unlink_one( + wait_queue_set_t wq_set, + wait_queue_link_t wql) +{ + wait_queue_t wq; + spl_t s; + + assert(wait_queue_is_set(wq_set)); + +retry: + s = splsched(); + wqs_lock(wq_set); + + WAIT_QUEUE_SET_CHECK(wq_set); + + /* Already unlinked, e.g. by selclearthread() */ + if (wql->wql_type == WAIT_QUEUE_UNLINKED) { + goto out; + } + + WAIT_QUEUE_SET_LINK_CHECK(wq_set, wql); + + /* On a wait queue, and we hold set queue lock ... 
*/ + wq = wql->wql_queue; + if (wait_queue_lock_try(wq)) { + wait_queue_unlink_locked(wq, wq_set, wql); + wait_queue_unlock(wq); + } else { + wqs_unlock(wq_set); + splx(s); + delay(1); + goto retry; + } + +out: + wqs_unlock(wq_set); + splx(s); + + return KERN_SUCCESS; +} + /* * Routine: wait_queue_assert_wait64_locked * Purpose: @@ -868,6 +929,7 @@ wait_queue_assert_wait64_locked( thread_t thread) { wait_result_t wait_result; + boolean_t realtime; if (!wait_queue_assert_possible(thread)) panic("wait_queue_assert_wait64_locked"); @@ -878,7 +940,17 @@ wait_queue_assert_wait64_locked( if (event == NO_EVENT64 && wqs_is_preposted(wqs)) return(THREAD_AWAKENED); } - + + /* + * Realtime threads get priority for wait queue placements. + * This allows wait_queue_wakeup_one to prefer a waiting + * realtime thread, similar in principle to performing + * a wait_queue_wakeup_all and allowing scheduler prioritization + * to run the realtime thread, but without causing the + * lock contention of that scenario. + */ + realtime = (thread->sched_pri >= BASEPRI_REALTIME); + /* * This is the extent to which we currently take scheduling attributes * into account. If the thread is vm priviledged, we stick it at @@ -887,7 +959,9 @@ wait_queue_assert_wait64_locked( */ wait_result = thread_mark_wait_locked(thread, interruptible); if (wait_result == THREAD_WAITING) { - if (!wq->wq_fifo || thread->options & TH_OPT_VMPRIV) + if (!wq->wq_fifo + || (thread->options & TH_OPT_VMPRIV) + || realtime) enqueue_head(&wq->wq_queue, (queue_entry_t) thread); else enqueue_tail(&wq->wq_queue, (queue_entry_t) thread); @@ -896,7 +970,11 @@ wait_queue_assert_wait64_locked( thread->wait_queue = wq; if (deadline != 0) { - if (!timer_call_enter(&thread->wait_timer, deadline)) + uint32_t flags; + + flags = realtime ? 
TIMER_CALL_CRITICAL : 0; + + if (!timer_call_enter(&thread->wait_timer, deadline, flags)) thread->wait_timer_active++; thread->wait_timer_is_set = TRUE; } @@ -1035,7 +1113,7 @@ _wait_queue_select64_all( if (t->wait_event == event) { thread_lock(t); - remqueue(q, (queue_entry_t) t); + remqueue((queue_entry_t) t); enqueue (wake_queue, (queue_entry_t) t); t->wait_queue = WAIT_QUEUE_NULL; t->wait_event = NO_EVENT64; @@ -1242,7 +1320,7 @@ _wait_queue_select64_one( t = (thread_t)wq_element; if (t->wait_event == event) { thread_lock(t); - remqueue(q, (queue_entry_t) t); + remqueue((queue_entry_t) t); t->wait_queue = WAIT_QUEUE_NULL; t->wait_event = NO_EVENT64; t->at_safe_point = FALSE; @@ -1278,7 +1356,7 @@ wait_queue_pull_thread_locked( assert(thread->wait_queue == waitq); - remqueue(&waitq->wq_queue, (queue_entry_t)thread ); + remqueue((queue_entry_t)thread ); thread->wait_queue = WAIT_QUEUE_NULL; thread->wait_event = NO_EVENT64; thread->at_safe_point = FALSE; @@ -1314,7 +1392,7 @@ _wait_queue_select64_thread( thread_lock(thread); if ((thread->wait_queue == wq) && (thread->wait_event == event)) { - remqueue(q, (queue_entry_t) thread); + remqueue((queue_entry_t) thread); thread->at_safe_point = FALSE; thread->wait_event = NO_EVENT64; thread->wait_queue = WAIT_QUEUE_NULL; @@ -1448,7 +1526,8 @@ kern_return_t wait_queue_wakeup_one( wait_queue_t wq, event_t event, - wait_result_t result) + wait_result_t result, + int priority) { thread_t thread; spl_t s; @@ -1465,6 +1544,14 @@ wait_queue_wakeup_one( if (thread) { kern_return_t res; + if (thread->sched_pri < priority) { + if (priority <= MAXPRI) { + set_sched_pri(thread, priority); + + thread->was_promoted_on_wakeup = 1; + thread->sched_flags |= TH_SFLAG_PROMOTED; + } + } res = thread_go(thread, result); assert(res == KERN_SUCCESS); thread_unlock(thread); diff --git a/osfmk/kern/wait_queue.h b/osfmk/kern/wait_queue.h index 386bd093c..42675a30b 100644 --- a/osfmk/kern/wait_queue.h +++ b/osfmk/kern/wait_queue.h @@ -43,6 +43,7 
@@ #include #include +#include #include #include /* machine_timeout_suspended() */ @@ -153,7 +154,7 @@ typedef struct _wait_queue_link { #define wait_queue_lock_try(wq) (hw_lock_try(&(wq)->wq_interlock)) /* For x86, the hardware timeout is in TSC units. */ -#if defined(i386) +#if defined(i386) || defined(x86_64) #define hwLockTimeOut LockTimeOutTSC #else #define hwLockTimeOut LockTimeOut @@ -166,8 +167,9 @@ typedef struct _wait_queue_link { */ static inline void wait_queue_lock(wait_queue_t wq) { - if (hw_lock_to(&(wq)->wq_interlock, hwLockTimeOut * 2) == 0) { + if (__improbable(hw_lock_to(&(wq)->wq_interlock, hwLockTimeOut * 2) == 0)) { boolean_t wql_acquired = FALSE; + while (machine_timeout_suspended()) { #if defined(__i386__) || defined(__x86_64__) /* @@ -179,7 +181,6 @@ static inline void wait_queue_lock(wait_queue_t wq) { if ((wql_acquired = hw_lock_to(&(wq)->wq_interlock, hwLockTimeOut * 2))) break; } - if (wql_acquired == FALSE) panic("wait queue deadlock - wq=%p, cpu=%d\n", wq, cpu_number()); } @@ -329,6 +330,15 @@ extern kern_return_t wait_queue_unlink_all( extern kern_return_t wait_queue_set_unlink_all( wait_queue_set_t set_queue); +#ifdef XNU_KERNEL_PRIVATE +extern kern_return_t wait_queue_set_unlink_one( + wait_queue_set_t set_queue, + wait_queue_link_t link); + +extern wait_queue_link_t wait_queue_link_allocate(void); + +#endif /* XNU_KERNEL_PRIVATE */ + /* legacy API */ kern_return_t wait_queue_sub_init( wait_queue_set_t set_queue, @@ -388,7 +398,8 @@ extern wait_result_t wait_queue_assert_wait( extern kern_return_t wait_queue_wakeup_one( wait_queue_t wait_queue, event_t wake_event, - wait_result_t result); + wait_result_t result, + int priority); /* wakeup all the threads waiting on pair */ extern kern_return_t wait_queue_wakeup_all( diff --git a/osfmk/kern/zalloc.c b/osfmk/kern/zalloc.c index c0e567eaf..c6bf2f01e 100644 --- a/osfmk/kern/zalloc.c +++ b/osfmk/kern/zalloc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. 
+ * Copyright (c) 2000-2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -71,6 +71,7 @@ #include #include #include +#include #include #include @@ -96,13 +97,6 @@ #include #include -#if defined(__ppc__) -/* for fake zone stat routines */ -#include -#include -#endif - - /* * Zone Corruption Debugging * @@ -114,7 +108,7 @@ * each other when re-using the zone element, to detect modifications. * (3) poison the freed memory by overwriting it with 0xdeadbeef. * - * The first two checks are farily light weight and are enabled by specifying "-zc" + * The first two checks are fairly light weight and are enabled by specifying "-zc" * in the boot-args. If you want more aggressive checking for use-after-free bugs * and you don't mind the additional overhead, then turn on poisoning by adding * "-zp" to the boot-args in addition to "-zc". If you specify -zp without -zc, @@ -125,6 +119,48 @@ boolean_t check_freed_element = FALSE; /* enabled by -zc in boot-args */ boolean_t zfree_clear = FALSE; /* enabled by -zp in boot-args */ +/* + * Fake zones for things that want to report via zprint but are not actually zones. 
+ */ +struct fake_zone_info { + const char* name; + void (*init)(int); + void (*query)(int *, + vm_size_t *, vm_size_t *, vm_size_t *, vm_size_t *, + uint64_t *, int *, int *, int *); +}; + +static struct fake_zone_info fake_zones[] = { + { + .name = "kernel_stacks", + .init = stack_fake_zone_init, + .query = stack_fake_zone_info, + }, +#if defined(__i386__) || defined (__x86_64__) + { + .name = "page_tables", + .init = pt_fake_zone_init, + .query = pt_fake_zone_info, + }, +#endif /* i386 */ + { + .name = "kalloc.large", + .init = kalloc_fake_zone_init, + .query = kalloc_fake_zone_info, + }, +}; +unsigned int num_fake_zones = sizeof(fake_zones)/sizeof(fake_zones[0]); + +/* + * Zone info options + */ +boolean_t zinfo_per_task = FALSE; /* enabled by -zinfop in boot-args */ +#define ZINFO_SLOTS 200 /* for now */ +#define ZONES_MAX (ZINFO_SLOTS - num_fake_zones - 1) + +/* + * Allocation helper macros + */ #define is_kernel_data_addr(a) (!(a) || ((a) >= vm_min_kernel_address && !((a) & 0x3))) #define ADD_TO_ZONE(zone, element) \ @@ -159,13 +195,14 @@ MACRO_BEGIN \ if (zfree_clear) { \ unsigned int ii; \ for (ii = sizeof(vm_offset_t) / sizeof(uint32_t); \ - ii < zone->elem_size/sizeof(uint32_t) - sizeof(vm_offset_t) / sizeof(uint32_t); \ + ii < (zone)->elem_size/sizeof(uint32_t) - sizeof(vm_offset_t) / sizeof(uint32_t); \ ii++) \ if (((uint32_t *)(ret))[ii] != (uint32_t)0xdeadbeef) \ panic("a freed zone element has been modified");\ } \ } \ (zone)->count++; \ + (zone)->sum_count++; \ (zone)->free_elements = *((vm_offset_t *)(ret)); \ } \ MACRO_END @@ -229,6 +266,8 @@ vm_map_t zone_map = VM_MAP_NULL; zone_t zone_zone = ZONE_NULL; /* the zone containing other zones */ +zone_t zinfo_zone = ZONE_NULL; /* zone of per-task zone info */ + /* * The VM system gives us an initial chunk of memory. 
* It has to be big enough to allocate the zone_zone @@ -320,8 +359,7 @@ unsigned int num_zones; boolean_t zone_gc_allowed = TRUE; boolean_t zone_gc_forced = FALSE; boolean_t panic_include_zprint = FALSE; -unsigned zone_gc_last_tick = 0; -unsigned zone_gc_max_rate = 0; /* in ticks */ +boolean_t zone_gc_allowed_by_time_throttle = TRUE; /* * Zone leak debugging code @@ -366,15 +404,13 @@ static char zone_name_to_log[MAX_ZONE_NAME] = ""; /* the zone name we're logging * but one doesn't generally care about performance when tracking down a leak. The log is capped at 8000 * records since going much larger than this tends to make the system unresponsive and unbootable on small * memory configurations. The default value is 4000 records. - * - * MAX_DEPTH configures how deep of a stack trace is taken on each zalloc in the zone of interrest. 15 - * levels is usually enough to get past all the layers of code in kalloc and IOKit and see who the actual - * caller is up above these lower levels. */ - +#if defined(__LP64__) +#define ZRECORDS_MAX 16000 /* Max records allowed in the log */ +#else #define ZRECORDS_MAX 8000 /* Max records allowed in the log */ +#endif #define ZRECORDS_DEFAULT 4000 /* default records in log if zrecs is not specificed in boot-args */ -#define MAX_DEPTH 15 /* number of levels of the stack trace to record */ /* * Each record in the log contains a pointer to the zone element it refers to, a "time" number that allows @@ -388,7 +424,7 @@ struct zrecord { void *z_element; /* the element that was zalloc'ed of zfree'ed */ uint32_t z_opcode:1, /* whether it was a zalloc or zfree */ z_time:31; /* time index when operation was done */ - void *z_pc[MAX_DEPTH]; /* stack trace of caller */ + void *z_pc[MAX_ZTRACE_DEPTH]; /* stack trace of caller */ }; /* @@ -458,7 +494,526 @@ log_this_zone(const char *zonename, const char *logname) extern boolean_t zlog_ready; +#if CONFIG_ZLEAKS +#pragma mark - +#pragma mark Zone Leak Detection + +/* + * The zone leak detector, 
abbreviated 'zleak', keeps track of a subset of the currently outstanding + * allocations made by the zone allocator. Every z_sample_factor allocations in each zone, we capture a + * backtrace. Every free, we examine the table and determine if the allocation was being tracked, + * and stop tracking it if it was being tracked. + * + * We track the allocations in the zallocations hash table, which stores the address that was returned from + * the zone allocator. Each stored entry in the zallocations table points to an entry in the ztraces table, which + * stores the backtrace associated with that allocation. This provides uniquing for the relatively large + * backtraces - we don't store them more than once. + * + * Data collection begins when the zone map is 50% full, and only occurs for zones that are taking up + * a large amount of virtual space. + */ +#define ZLEAK_STATE_ENABLED 0x01 /* Zone leak monitoring should be turned on if zone_map fills up. */ +#define ZLEAK_STATE_ACTIVE 0x02 /* We are actively collecting traces. */ +#define ZLEAK_STATE_ACTIVATING 0x04 /* Some thread is doing setup; others should move along. */ +#define ZLEAK_STATE_FAILED 0x08 /* Attempt to allocate tables failed. We will not try again. */ +uint32_t zleak_state = 0; /* State of collection, as above */ + +boolean_t panic_include_ztrace = FALSE; /* Enable zleak logging on panic */ +vm_size_t zleak_global_tracking_threshold; /* Size of zone map at which to start collecting data */ +vm_size_t zleak_per_zone_tracking_threshold; /* Size a zone will have before we will collect data on it */ +unsigned int z_sample_factor = 1000; /* Allocations per sample attempt */ + +/* + * Counters for allocation statistics. 
+ */ + +/* Times two active records want to occupy the same spot */ +unsigned int z_alloc_collisions = 0; +unsigned int z_trace_collisions = 0; + +/* Times a new record lands on a spot previously occupied by a freed allocation */ +unsigned int z_alloc_overwrites = 0; +unsigned int z_trace_overwrites = 0; + +/* Times a new alloc or trace is put into the hash table */ +unsigned int z_alloc_recorded = 0; +unsigned int z_trace_recorded = 0; + +/* Times zleak_log returned false due to not being able to acquire the lock */ +unsigned int z_total_conflicts = 0; + + +#pragma mark struct zallocation +/* + * Structure for keeping track of an allocation + * An allocation bucket is in use if its element is not NULL + */ +struct zallocation { + uintptr_t za_element; /* the element that was zalloc'ed or zfree'ed, NULL if bucket unused */ + vm_size_t za_size; /* how much memory did this allocation take up? */ + uint32_t za_trace_index; /* index into ztraces for backtrace associated with allocation */ + /* TODO: #if this out */ + uint32_t za_hit_count; /* for determining effectiveness of hash function */ +}; + +/* Size must be a power of two for the zhash to be able to just mask off bits instead of mod */ +#define ZLEAK_ALLOCATION_MAP_NUM 16384 +#define ZLEAK_TRACE_MAP_NUM 8192 + +uint32_t zleak_alloc_buckets = ZLEAK_ALLOCATION_MAP_NUM; +uint32_t zleak_trace_buckets = ZLEAK_TRACE_MAP_NUM; + +vm_size_t zleak_max_zonemap_size; + +/* Hashmaps of allocations and their corresponding traces */ +static struct zallocation* zallocations; +static struct ztrace* ztraces; + +/* not static so that panic can see this, see kern/debug.c */ +struct ztrace* top_ztrace; + +/* Lock to protect zallocations, ztraces, and top_ztrace from concurrent modification. */ +static lck_mtx_t zleak_lock; +static lck_attr_t zleak_lock_attr; +static lck_grp_t zleak_lock_grp; +static lck_grp_attr_t zleak_lock_grp_attr; + +/* + * Initializes the zone leak monitor. 
Called from zone_init() + */ +static void +zleak_init(vm_size_t max_zonemap_size) +{ + char scratch_buf[16]; + boolean_t zleak_enable_flag = FALSE; + + zleak_max_zonemap_size = max_zonemap_size; + zleak_global_tracking_threshold = max_zonemap_size / 2; + zleak_per_zone_tracking_threshold = zleak_global_tracking_threshold / 8; + + /* -zleakoff (flag to disable zone leak monitor) */ + if (PE_parse_boot_argn("-zleakoff", scratch_buf, sizeof(scratch_buf))) { + zleak_enable_flag = FALSE; + printf("zone leak detection disabled\n"); + } else { + zleak_enable_flag = TRUE; + printf("zone leak detection enabled\n"); + } + + /* zfactor=XXXX (override how often to sample the zone allocator) */ + if (PE_parse_boot_argn("zfactor", &z_sample_factor, sizeof(z_sample_factor))) { + printf("Zone leak factor override:%u\n", z_sample_factor); + } + + /* zleak-allocs=XXXX (override number of buckets in zallocations) */ + if (PE_parse_boot_argn("zleak-allocs", &zleak_alloc_buckets, sizeof(zleak_alloc_buckets))) { + printf("Zone leak alloc buckets override:%u\n", zleak_alloc_buckets); + /* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */ + if (zleak_alloc_buckets == 0 || (zleak_alloc_buckets & (zleak_alloc_buckets-1))) { + printf("Override isn't a power of two, bad things might happen!"); + } + } + + /* zleak-traces=XXXX (override number of buckets in ztraces) */ + if (PE_parse_boot_argn("zleak-traces", &zleak_trace_buckets, sizeof(zleak_trace_buckets))) { + printf("Zone leak trace buckets override:%u\n", zleak_trace_buckets); + /* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */ + if (zleak_trace_buckets == 0 || (zleak_trace_buckets & (zleak_trace_buckets-1))) { + printf("Override isn't a power of two, bad things might happen!"); + } + } + + /* allocate the zleak_lock */ + lck_grp_attr_setdefault(&zleak_lock_grp_attr); + lck_grp_init(&zleak_lock_grp, "zleak_lock", &zleak_lock_grp_attr); + lck_attr_setdefault(&zleak_lock_attr); + lck_mtx_init(&zleak_lock, &zleak_lock_grp, 
&zleak_lock_attr); + + if (zleak_enable_flag) { + zleak_state = ZLEAK_STATE_ENABLED; + } +} + +#if CONFIG_ZLEAKS + +/* + * Support for kern.zleak.active sysctl - a simplified + * version of the zleak_state variable. + */ +int +get_zleak_state(void) +{ + if (zleak_state & ZLEAK_STATE_FAILED) + return (-1); + if (zleak_state & ZLEAK_STATE_ACTIVE) + return (1); + return (0); +} + +#endif + + +kern_return_t +zleak_activate(void) +{ + kern_return_t retval; + vm_size_t z_alloc_size = zleak_alloc_buckets * sizeof(struct zallocation); + vm_size_t z_trace_size = zleak_trace_buckets * sizeof(struct ztrace); + void *allocations_ptr = NULL; + void *traces_ptr = NULL; + + /* Only one thread attempts to activate at a time */ + if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) { + return KERN_SUCCESS; + } + + /* Indicate that we're doing the setup */ + lck_mtx_lock_spin(&zleak_lock); + if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) { + lck_mtx_unlock(&zleak_lock); + return KERN_SUCCESS; + } + + zleak_state |= ZLEAK_STATE_ACTIVATING; + lck_mtx_unlock(&zleak_lock); + + /* Allocate and zero tables */ + retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&allocations_ptr, z_alloc_size); + if (retval != KERN_SUCCESS) { + goto fail; + } + + retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&traces_ptr, z_trace_size); + if (retval != KERN_SUCCESS) { + goto fail; + } + + bzero(allocations_ptr, z_alloc_size); + bzero(traces_ptr, z_trace_size); + + /* Everything's set. Install tables, mark active. */ + zallocations = allocations_ptr; + ztraces = traces_ptr; + + /* + * Initialize the top_ztrace to the first entry in ztraces, + * so we don't have to check for null in zleak_log + */ + top_ztrace = &ztraces[0]; + + /* + * Note that we do need a barrier between installing + * the tables and setting the active flag, because the zfree() + * path accesses the table without a lock if we're active.
+ */ + lck_mtx_lock_spin(&zleak_lock); + zleak_state |= ZLEAK_STATE_ACTIVE; + zleak_state &= ~ZLEAK_STATE_ACTIVATING; + lck_mtx_unlock(&zleak_lock); + + return 0; + +fail: + /* + * If we fail to allocate memory, don't further tax + * the system by trying again. + */ + lck_mtx_lock_spin(&zleak_lock); + zleak_state |= ZLEAK_STATE_FAILED; + zleak_state &= ~ZLEAK_STATE_ACTIVATING; + lck_mtx_unlock(&zleak_lock); + + if (allocations_ptr != NULL) { + kmem_free(kernel_map, (vm_offset_t)allocations_ptr, z_alloc_size); + } + + if (traces_ptr != NULL) { + kmem_free(kernel_map, (vm_offset_t)traces_ptr, z_trace_size); + } + + return retval; +} + +/* + * TODO: What about allocations that never get deallocated, + * especially ones with unique backtraces? Should we wait to record + * until after boot has completed? + * (How many persistent zallocs are there?) + */ + +/* + * This function records the allocation in the allocations table, + * and stores the associated backtrace in the traces table + * (or just increments the refcount if the trace is already recorded) + * If the allocation slot is in use, the old allocation is replaced with the new allocation, and + * the associated trace's refcount is decremented. + * If the trace slot is in use, it returns. + * The refcount is incremented by the amount of memory the allocation consumes. + * The return value indicates whether to try again next time. 
+ */ +static boolean_t +zleak_log(uintptr_t* bt, + uintptr_t addr, + uint32_t depth, + vm_size_t allocation_size) +{ + /* Quit if there's someone else modifying the hash tables */ + if (!lck_mtx_try_lock_spin(&zleak_lock)) { + z_total_conflicts++; + return FALSE; + } + + struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)]; + + uint32_t trace_index = hashbacktrace(bt, depth, zleak_trace_buckets); + struct ztrace* trace = &ztraces[trace_index]; + + allocation->za_hit_count++; + trace->zt_hit_count++; + + /* + * If the allocation bucket we want to be in is occupied, and if the occupier + * has the same trace as us, just bail. + */ + if (allocation->za_element != (uintptr_t) 0 && trace_index == allocation->za_trace_index) { + z_alloc_collisions++; + + lck_mtx_unlock(&zleak_lock); + return TRUE; + } + + /* STEP 1: Store the backtrace in the traces array. */ + /* A size of zero indicates that the trace bucket is free. */ + + if (trace->zt_size > 0 && bcmp(trace->zt_stack, bt, (depth * sizeof(uintptr_t))) != 0 ) { + /* + * Different unique trace with same hash! + * Just bail - if we're trying to record the leaker, hopefully the other trace will be deallocated + * and get out of the way for later chances + */ + trace->zt_collisions++; + z_trace_collisions++; + + lck_mtx_unlock(&zleak_lock); + return TRUE; + } else if (trace->zt_size > 0) { + /* Same trace, already added, so increment refcount */ + trace->zt_size += allocation_size; + } else { + /* Found an unused trace bucket, record the trace here! */ + if (trace->zt_depth != 0) /* if this slot was previously used but not currently in use */ + z_trace_overwrites++; + + z_trace_recorded++; + trace->zt_size = allocation_size; + memcpy(trace->zt_stack, bt, (depth * sizeof(uintptr_t)) ); + + trace->zt_depth = depth; + trace->zt_collisions = 0; + } + + /* STEP 2: Store the allocation record in the allocations array. 
*/ + + if (allocation->za_element != (uintptr_t) 0) { + /* + * Straight up replace any allocation record that was there. We don't want to do the work + * to preserve the allocation entries that were there, because we only record a subset of the + * allocations anyways. + */ + + z_alloc_collisions++; + + struct ztrace* associated_trace = &ztraces[allocation->za_trace_index]; + /* Knock off old allocation's size, not the new allocation */ + associated_trace->zt_size -= allocation->za_size; + } else if (allocation->za_trace_index != 0) { + /* Slot previously used but not currently in use */ + z_alloc_overwrites++; + } + + allocation->za_element = addr; + allocation->za_trace_index = trace_index; + allocation->za_size = allocation_size; + + z_alloc_recorded++; + + if (top_ztrace->zt_size < trace->zt_size) + top_ztrace = trace; + + lck_mtx_unlock(&zleak_lock); + return TRUE; +} + +/* + * Free the allocation record and release the stacktrace. + * This should be as fast as possible because it will be called for every free. + */ +static void +zleak_free(uintptr_t addr, + vm_size_t allocation_size) +{ + if (addr == (uintptr_t) 0) + return; + + struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)]; + + /* Double-checked locking: check to find out if we're interested, lock, check to make + * sure it hasn't changed, then modify it, and release the lock. + */ + if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) { + /* if the allocation was the one, grab the lock, check again, then delete it */ + lck_mtx_lock_spin(&zleak_lock); + + if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) { + struct ztrace *trace; + + /* allocation_size had better match what was passed into zleak_log - otherwise someone is freeing into the wrong zone! 
*/ + if (allocation->za_size != allocation_size) { + panic("Freeing as size %lu memory that was allocated with size %lu\n", + (uintptr_t)allocation_size, (uintptr_t)allocation->za_size); + } + + trace = &ztraces[allocation->za_trace_index]; + + /* size of 0 indicates trace bucket is unused */ + if (trace->zt_size > 0) { + trace->zt_size -= allocation_size; + } + + /* A NULL element means the allocation bucket is unused */ + allocation->za_element = 0; + } + lck_mtx_unlock(&zleak_lock); + } +} + +#endif /* CONFIG_ZLEAKS */ + +/* These functions outside of CONFIG_ZLEAKS because they are also used in + * mbuf.c for mbuf leak-detection. This is why they lack the z_ prefix. + */ + +/* + * This function captures a backtrace from the current stack and + * returns the number of frames captured, limited by max_frames. + * It's fast because it does no checking to make sure there isn't bad data. + * Since it's only called from threads that we're going to keep executing, + * if there's bad data we were going to die eventually. + * This seems to work for x86 and X86_64. + * ARMTODO: Test it on ARM, I think it will work but I can't test it. If it works, remove the ifdef. + * If this function is inlined, it doesn't record the frame of the function it's inside. + * (because there's no stack frame!) 
+ */ +uint32_t +fastbacktrace(uintptr_t* bt, uint32_t max_frames) +{ +#if defined(__x86_64__) || defined(__i386__) + uintptr_t* frameptr = NULL, *frameptr_next = NULL; + uintptr_t retaddr = 0; + uint32_t frame_index = 0, frames = 0; + uintptr_t kstackb, kstackt; + + kstackb = current_thread()->kernel_stack; + kstackt = kstackb + kernel_stack_size; + /* Load stack frame pointer (EBP on x86) into frameptr */ + frameptr = __builtin_frame_address(0); + + while (frameptr != NULL && frame_index < max_frames ) { + /* Next frame pointer is pointed to by the previous one */ + frameptr_next = (uintptr_t*) *frameptr; + + /* Bail if we see a zero in the stack frame, that means we've reached the top of the stack */ + /* That also means the return address is worthless, so don't record it */ + if (frameptr_next == NULL) + break; + /* Verify thread stack bounds */ + if (((uintptr_t)frameptr_next > kstackt) || ((uintptr_t)frameptr_next < kstackb)) + break; + /* Pull return address from one spot above the frame pointer */ + retaddr = *(frameptr + 1); + + /* Store it in the backtrace array */ + bt[frame_index++] = retaddr; + + frameptr = frameptr_next; + } + + /* Save the number of frames captured for return value */ + frames = frame_index; + + /* Fill in the rest of the backtrace with zeros */ + while (frame_index < max_frames) + bt[frame_index++] = 0; + + return frames; +#else + return OSBacktrace((void*)bt, max_frames); +#endif +} + +/* "Thomas Wang's 32/64 bit mix functions." 
http://www.concentric.net/~Ttwang/tech/inthash.htm */ +uintptr_t +hash_mix(uintptr_t x) +{ +#ifndef __LP64__ + x += ~(x << 15); + x ^= (x >> 10); + x += (x << 3 ); + x ^= (x >> 6 ); + x += ~(x << 11); + x ^= (x >> 16); +#else + x += ~(x << 32); + x ^= (x >> 22); + x += ~(x << 13); + x ^= (x >> 8 ); + x += (x << 3 ); + x ^= (x >> 15); + x += ~(x << 27); + x ^= (x >> 31); +#endif + return x; +} + +uint32_t +hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size) +{ + /* Sum the captured frames, mix the sum, and mask it to an index below max_size (a power of 2). */ + uintptr_t hash = 0; + uintptr_t mask = max_size - 1; + + while (depth) { /* test before decrementing: depth == 0 reads nothing, and bt[0] is hashed too */ + hash += bt[--depth]; + } + + hash = hash_mix(hash) & mask; + + assert(hash < max_size); + + return (uint32_t) hash; +} + +/* + * TODO: Determine how well distributed this is + * max_size must be a power of 2. i.e 0x10000 because 0x10000-1 is 0x0FFFF which is a great bitmask + */ +uint32_t +hashaddr(uintptr_t pt, uint32_t max_size) +{ + uintptr_t hash = 0; + uintptr_t mask = max_size - 1; + + hash = hash_mix(pt) & mask; + + assert(hash < max_size); + + return (uint32_t) hash; +} + +/* End of all leak-detection code */ +#pragma mark - + /* * zinit initializes a new zone. The zone data structures themselves * are stored in a zone, which is initially a static structure that @@ -537,6 +1092,7 @@ zinit( z->alloc_size = alloc; z->zone_name = name; z->count = 0; + z->sum_count = 0LL; z->doing_alloc = FALSE; z->doing_gc = FALSE; z->exhaustible = FALSE; @@ -545,8 +1101,16 @@ zinit( z->expandable = TRUE; z->waiting = FALSE; z->async_pending = FALSE; + z->caller_acct = TRUE; z->noencrypt = FALSE; +#if CONFIG_ZLEAKS + z->num_allocs = 0; + z->num_frees = 0; + z->zleak_capture = 0; + z->zleak_on = FALSE; +#endif /* CONFIG_ZLEAKS */ + #if ZONE_DEBUG z->active_zones.next = z->active_zones.prev = NULL; zone_debug_enable(z); @@ -555,13 +1119,20 @@ zinit( /* * Add the zone to the all-zones list. + * If we are tracking zone info per task, and we have + * already used all the available stat slots, then keep + * using the overflow zone slot. 
*/ - z->next_zone = ZONE_NULL; thread_call_setup(&z->call_async_alloc, zalloc_async, z); simple_lock(&all_zones_lock); *last_zone = z; last_zone = &z->next_zone; + z->index = num_zones; + if (zinfo_per_task) { + if (num_zones > ZONES_MAX) + z->index = ZONES_MAX; + } num_zones++; simple_unlock(&all_zones_lock); @@ -782,6 +1353,24 @@ zone_bootstrap(void) vm_offset_t zone_zone_space; char temp_buf[16]; +#if 6094439 + /* enable zone checks by default, to try and catch offenders... */ +#if 0 + /* 7968354: turn "-zc" back off */ + check_freed_element = TRUE; + /* 7995202: turn "-zp" back off */ + zfree_clear = TRUE; +#endif + + /* ... but allow them to be turned off explicitly */ + if (PE_parse_boot_argn("-no_zc", temp_buf, sizeof (temp_buf))) { + check_freed_element = FALSE; + } + if (PE_parse_boot_argn("-no_zp", temp_buf, sizeof (temp_buf))) { + zfree_clear = FALSE; + } +#endif + /* see if we want freed zone element checking and/or poisoning */ if (PE_parse_boot_argn("-zc", temp_buf, sizeof (temp_buf))) { check_freed_element = TRUE; @@ -791,6 +1380,10 @@ zone_bootstrap(void) zfree_clear = TRUE; } + if (PE_parse_boot_argn("-zinfop", temp_buf, sizeof (temp_buf))) { + zinfo_per_task = TRUE; + } + /* * Check for and set up zone leak detection if requested via boot-args. 
We recognized two * boot-args: @@ -834,13 +1427,47 @@ zone_bootstrap(void) zone_zone = zinit(sizeof(struct zone), 128 * sizeof(struct zone), sizeof(struct zone), "zones"); zone_change(zone_zone, Z_COLLECT, FALSE); + zone_change(zone_zone, Z_CALLERACCT, FALSE); zone_change(zone_zone, Z_NOENCRYPT, TRUE); zone_zone_size = zalloc_end_of_space - zalloc_next_space; zget_space(NULL, zone_zone_size, &zone_zone_space); zcram(zone_zone, (void *)zone_zone_space, zone_zone_size); + + /* initialize fake zones and zone info if tracking by task */ + if (zinfo_per_task) { + vm_size_t zisize = sizeof(zinfo_usage_store_t) * ZINFO_SLOTS; + unsigned int i; + + for (i = 0; i < num_fake_zones; i++) + fake_zones[i].init(ZINFO_SLOTS - num_fake_zones + i); + zinfo_zone = zinit(zisize, zisize * CONFIG_TASK_MAX, + zisize, "per task zinfo"); + zone_change(zinfo_zone, Z_CALLERACCT, FALSE); + } +} + +void +zinfo_task_init(task_t task) +{ + if (zinfo_per_task) { + task->tkm_zinfo = zalloc(zinfo_zone); + memset(task->tkm_zinfo, 0, sizeof(zinfo_usage_store_t) * ZINFO_SLOTS); + } else { + task->tkm_zinfo = NULL; + } } +void +zinfo_task_free(task_t task) +{ + assert(task != kernel_task); + if (task->tkm_zinfo != NULL) { + zfree(zinfo_zone, task->tkm_zinfo); + task->tkm_zinfo = NULL; + } +} + void zone_init( vm_size_t max_zonemap_size) @@ -876,10 +1503,20 @@ zone_init( lck_mtx_init_ext(&zone_gc_lock, &zone_lck_ext, &zone_lck_grp, &zone_lck_attr); zone_page_init(zone_min, zone_max - zone_min, ZONE_PAGE_UNUSED); + +#if CONFIG_ZLEAKS + /* + * Initialize the zone leak monitor + */ + zleak_init(max_zonemap_size); +#endif /* CONFIG_ZLEAKS */ } extern volatile SInt32 kfree_nop_count; +#pragma mark - +#pragma mark zalloc_canblock + /* * zalloc returns an element from the specified zone. 
*/ @@ -890,20 +1527,40 @@ zalloc_canblock( { vm_offset_t addr; kern_return_t retval; - void *bt[MAX_DEPTH]; /* only used if zone logging is enabled */ + uintptr_t zbt[MAX_ZTRACE_DEPTH]; /* used in zone leak logging and zone leak detection */ int numsaved = 0; - int i; + int i; + +#if CONFIG_ZLEAKS + uint32_t zleak_tracedepth = 0; /* log this allocation if nonzero */ +#endif /* CONFIG_ZLEAKS */ assert(zone != ZONE_NULL); + + lock_zone(zone); /* * If zone logging is turned on and this is the zone we're tracking, grab a backtrace. */ - + if (DO_LOGGING(zone)) - numsaved = OSBacktrace(&bt[0], MAX_DEPTH); - - lock_zone(zone); + numsaved = OSBacktrace((void*) zbt, MAX_ZTRACE_DEPTH); + +#if CONFIG_ZLEAKS + /* + * Zone leak detection: capture a backtrace every z_sample_factor + * allocations in this zone. + */ + if (zone->zleak_on && (zone->zleak_capture++ % z_sample_factor == 0)) { + zone->zleak_capture = 1; + + /* Avoid backtracing twice if zone logging is on */ + if (numsaved == 0 ) + zleak_tracedepth = fastbacktrace(zbt, MAX_ZTRACE_DEPTH); + else + zleak_tracedepth = numsaved; + } +#endif /* CONFIG_ZLEAKS */ REMOVE_FROM_ZONE(zone, addr, vm_offset_t); @@ -974,6 +1631,26 @@ zalloc_canblock( if (alloc_size == PAGE_SIZE) space = zone_alias_addr(space); #endif + +#if CONFIG_ZLEAKS + if ((zleak_state & (ZLEAK_STATE_ENABLED | ZLEAK_STATE_ACTIVE)) == ZLEAK_STATE_ENABLED) { + if (zone_map->size >= zleak_global_tracking_threshold) { + kern_return_t kr; + + kr = zleak_activate(); + if (kr != KERN_SUCCESS) { + printf("Failed to activate live zone leak debugging (%d).\n", kr); + } + } + } + + if ((zleak_state & ZLEAK_STATE_ACTIVE) && !(zone->zleak_on)) { + if (zone->cur_size > zleak_per_zone_tracking_threshold) { + zone->zleak_on = TRUE; + } + } +#endif /* CONFIG_ZLEAKS */ + zone_page_init(space, alloc_size, ZONE_PAGE_USED); zcram(zone, (void *)space, alloc_size); @@ -987,12 +1664,20 @@ zalloc_canblock( printf("zalloc did gc\n"); zone_display_zprint(); } - if (retry == 3) { + if 
(retry == 3) { panic_include_zprint = TRUE; +#if CONFIG_ZLEAKS + if ((zleak_state & ZLEAK_STATE_ACTIVE)) { + panic_include_ztrace = TRUE; + } +#endif /* CONFIG_ZLEAKS */ + /* TODO: Change this to something more descriptive, perhaps + * 'zone_map exhausted' only if we get retval 3 (KERN_NO_SPACE). + */ panic("zalloc: \"%s\" (%d elements) retry fail %d, kfree_nop_count: %d", zone->zone_name, zone->count, retval, (int)kfree_nop_count); } } else { - break; + break; } } lock_zone(zone); @@ -1021,6 +1706,7 @@ zalloc_canblock( } if (retval == KERN_SUCCESS) { zone->count++; + zone->sum_count++; zone->cur_size += zone->elem_size; #if ZONE_DEBUG if (zone_debug_enabled(zone)) { @@ -1042,6 +1728,18 @@ zalloc_canblock( VM_PAGE_WAIT(); lock_zone(zone); } else { + /* + * Equivalent to a 'retry fail 3', we're out of address space in the zone_map + * (if it returned KERN_NO_SPACE) + */ + if (retval == KERN_NO_SPACE) { + panic_include_zprint = TRUE; +#if CONFIG_ZLEAKS + if ((zleak_state & ZLEAK_STATE_ACTIVE)) { + panic_include_ztrace = TRUE; + } +#endif /* CONFIG_ZLEAKS */ + } panic("zalloc: \"%s\" (%d elements) zget_space returned %d", zone->zone_name, zone->count, retval); } } @@ -1050,6 +1748,20 @@ zalloc_canblock( REMOVE_FROM_ZONE(zone, addr, vm_offset_t); } +#if CONFIG_ZLEAKS + /* Zone leak detection: + * If we're sampling this allocation, add it to the zleaks hash table. + */ + if (addr && zleak_tracedepth > 0) { + /* Sampling can fail if another sample is happening at the same time in a different zone. */ + if (!zleak_log(zbt, addr, zleak_tracedepth, zone->elem_size)) { + /* If it failed, roll back the counter so we sample the next allocation instead. */ + zone->zleak_capture = z_sample_factor; + } + } +#endif /* CONFIG_ZLEAKS */ + + /* * See if we should be logging allocations in this zone. Logging is rarely done except when a leak is * suspected, so this code rarely executes. 
We need to do this code while still holding the zone lock @@ -1109,9 +1821,9 @@ zalloc_canblock( zrecords[zcurrent].z_opcode = ZOP_ALLOC; for (i = 0; i < numsaved; i++) - zrecords[zcurrent].z_pc[i] = bt[i]; + zrecords[zcurrent].z_pc[i] = (void*) zbt[i]; - for (; i < MAX_DEPTH; i++) + for (; i < MAX_ZTRACE_DEPTH; i++) zrecords[zcurrent].z_pc[i] = 0; zcurrent++; @@ -1134,12 +1846,31 @@ zalloc_canblock( addr += ZONE_DEBUG_OFFSET; } #endif + +#if CONFIG_ZLEAKS + if (addr != 0) { + zone->num_allocs++; + } +#endif /* CONFIG_ZLEAKS */ unlock_zone(zone); success: TRACE_MACHLEAKS(ZALLOC_CODE, ZALLOC_CODE_2, zone->elem_size, addr); + if (addr) { + thread_t thr = current_thread(); + task_t task; + zinfo_usage_t zinfo; + + if (zone->caller_acct) + thr->tkm_private.alloc += zone->elem_size; + else + thr->tkm_shared.alloc += zone->elem_size; + + if ((task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) + OSAddAtomic64(zone->elem_size, (int64_t *)&zinfo[zone->index].alloc); + } return((void *)addr); } @@ -1179,17 +1910,36 @@ zalloc_async( * * This form should be used when you can not block (like when * processing an interrupt). + * + * XXX: It seems like only vm_page_grab_fictitious_common uses this, and its + * friend vm_page_more_fictitious can block, so it doesn't seem like + * this is used for interrupts any more.... 
*/ void * zget( register zone_t zone) { register vm_offset_t addr; + +#if CONFIG_ZLEAKS + uintptr_t zbt[MAX_ZTRACE_DEPTH]; /* used for zone leak detection */ + uint32_t zleak_tracedepth = 0; /* log this allocation if nonzero */ +#endif /* CONFIG_ZLEAKS */ assert( zone != ZONE_NULL ); if (!lock_try_zone(zone)) return NULL; + +#if CONFIG_ZLEAKS + /* + * Zone leak detection: capture a backtrace + */ + if (zone->zleak_on && (zone->zleak_capture++ % z_sample_factor == 0)) { + zone->zleak_capture = 1; + zleak_tracedepth = fastbacktrace(zbt, MAX_ZTRACE_DEPTH); + } +#endif /* CONFIG_ZLEAKS */ REMOVE_FROM_ZONE(zone, addr, vm_offset_t); #if ZONE_DEBUG @@ -1198,6 +1948,24 @@ zget( addr += ZONE_DEBUG_OFFSET; } #endif /* ZONE_DEBUG */ + +#if CONFIG_ZLEAKS + /* + * Zone leak detection: record the allocation + */ + if (zone->zleak_on && zleak_tracedepth > 0 && addr) { + /* Sampling can fail if another sample is happening at the same time in a different zone. */ + if (!zleak_log(zbt, addr, zleak_tracedepth, zone->elem_size)) { + /* If it failed, roll back the counter so we sample the next allocation instead. 
*/ + zone->zleak_capture = z_sample_factor; + } + } + + if (addr != 0) { + zone->num_allocs++; + } +#endif /* CONFIG_ZLEAKS */ + unlock_zone(zone); return((void *) addr); @@ -1216,7 +1984,7 @@ zfree( void *addr) { vm_offset_t elem = (vm_offset_t) addr; - void *bt[MAX_DEPTH]; /* only used if zone logging is enable via boot-args */ + void *zbt[MAX_ZTRACE_DEPTH]; /* only used if zone logging is enabled via boot-args */ int numsaved = 0; assert(zone != ZONE_NULL); @@ -1226,7 +1994,7 @@ zfree( */ if (DO_LOGGING(zone)) - numsaved = OSBacktrace(&bt[0], MAX_DEPTH); + numsaved = OSBacktrace(&zbt[0], MAX_ZTRACE_DEPTH); #if MACH_ASSERT /* Basic sanity checks */ @@ -1274,9 +2042,9 @@ zfree( zrecords[zcurrent].z_opcode = ZOP_FREE; for (i = 0; i < numsaved; i++) - zrecords[zcurrent].z_pc[i] = bt[i]; + zrecords[zcurrent].z_pc[i] = zbt[i]; - for (; i < MAX_DEPTH; i++) + for (; i < MAX_ZTRACE_DEPTH; i++) zrecords[zcurrent].z_pc[i] = 0; zcurrent++; @@ -1321,7 +2089,7 @@ zfree( if (elem != (vm_offset_t)tmp_elem) panic("zfree()ing element from wrong zone"); } - remqueue(&zone->active_zones, (queue_t) elem); + remqueue((queue_t) elem); } #endif /* ZONE_DEBUG */ if (zone_check) { @@ -1340,7 +2108,19 @@ zfree( if (zone->count < 0) panic("zfree: count < 0!"); #endif + +#if CONFIG_ZLEAKS + zone->num_frees++; + + /* + * Zone leak detection: un-track the allocation + */ + if (zone->zleak_on) { + zleak_free(elem, zone->elem_size); + } +#endif /* CONFIG_ZLEAKS */ + /* * If elements have one or more pages, and memory is low, * request to run the garbage collection in the zone the next @@ -1351,6 +2131,20 @@ zfree( zone_gc_forced = TRUE; } unlock_zone(zone); + + { + thread_t thr = current_thread(); + task_t task; + zinfo_usage_t zinfo; + + if (zone->caller_acct) + thr->tkm_private.free += zone->elem_size; + else + thr->tkm_shared.free += zone->elem_size; + if ((task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) + OSAddAtomic64(zone->elem_size, + (int64_t *)&zinfo[zone->index].free); 
+ } } @@ -1382,6 +2176,9 @@ zone_change( case Z_FOREIGN: zone->allows_foreign = value; break; + case Z_CALLERACCT: + zone->caller_acct = value; + break; #if MACH_ASSERT default: panic("Zone_change: Wrong Item Type!"); @@ -1886,7 +2683,7 @@ zone_gc(void) while ((zp = zone_free_pages) != NULL) { zone_free_pages = zp->link; #if ZONE_ALIAS_ADDR - z = zone_virtual_addr((vm_map_address_t)z); + z = (zone_t)zone_virtual_addr((vm_map_address_t)z); #endif kmem_free(zone_map, zone_map_min_address + PAGE_SIZE * (zp - zone_page_table), PAGE_SIZE); @@ -1905,57 +2702,334 @@ zone_gc(void) void consider_zone_gc(boolean_t force) { - /* - * By default, don't attempt zone GC more frequently - * than once / 1 minutes. - */ - - if (zone_gc_max_rate == 0) - zone_gc_max_rate = (60 << SCHED_TICK_SHIFT) + 1; if (zone_gc_allowed && - ((sched_tick > (zone_gc_last_tick + zone_gc_max_rate)) || + (zone_gc_allowed_by_time_throttle || zone_gc_forced || force)) { zone_gc_forced = FALSE; - zone_gc_last_tick = sched_tick; + zone_gc_allowed_by_time_throttle = FALSE; /* reset periodically */ zone_gc(); } } -struct fake_zone_info { - const char* name; - void (*func)(int *, vm_size_t *, vm_size_t *, vm_size_t *, vm_size_t *, - int *, int *); -}; +/* + * By default, don't attempt zone GC more frequently + * than once / 1 minutes. 
+ */ +void +compute_zone_gc_throttle(void *arg __unused) +{ + zone_gc_allowed_by_time_throttle = TRUE; +} -static struct fake_zone_info fake_zones[] = { - { - .name = "kernel_stacks", - .func = stack_fake_zone_info, - }, -#ifdef ppc - { - .name = "save_areas", - .func = save_fake_zone_info, - }, - { - .name = "pmap_mappings", - .func = mapping_fake_zone_info, - }, -#endif /* ppc */ -#if defined(__i386__) || defined (__x86_64__) - { - .name = "page_tables", - .func = pt_fake_zone_info, - }, -#endif /* i386 */ - { - .name = "kalloc.large", - .func = kalloc_fake_zone_info, - }, -}; +kern_return_t +task_zone_info( + task_t task, + mach_zone_name_array_t *namesp, + mach_msg_type_number_t *namesCntp, + task_zone_info_array_t *infop, + mach_msg_type_number_t *infoCntp) +{ + mach_zone_name_t *names; + vm_offset_t names_addr; + vm_size_t names_size; + task_zone_info_t *info; + vm_offset_t info_addr; + vm_size_t info_size; + unsigned int max_zones, i; + zone_t z; + mach_zone_name_t *zn; + task_zone_info_t *zi; + kern_return_t kr; + + vm_size_t used; + vm_map_copy_t copy; + + + if (task == TASK_NULL) + return KERN_INVALID_TASK; + + /* + * We assume that zones aren't freed once allocated. + * We won't pick up any zones that are allocated later. 
+ */ + + simple_lock(&all_zones_lock); + max_zones = (unsigned int)(num_zones + num_fake_zones); + z = first_zone; + simple_unlock(&all_zones_lock); + + names_size = round_page(max_zones * sizeof *names); + kr = kmem_alloc_pageable(ipc_kernel_map, + &names_addr, names_size); + if (kr != KERN_SUCCESS) + return kr; + names = (mach_zone_name_t *) names_addr; + + info_size = round_page(max_zones * sizeof *info); + kr = kmem_alloc_pageable(ipc_kernel_map, + &info_addr, info_size); + if (kr != KERN_SUCCESS) { + kmem_free(ipc_kernel_map, + names_addr, names_size); + return kr; + } + + info = (task_zone_info_t *) info_addr; + + zn = &names[0]; + zi = &info[0]; + + for (i = 0; i < max_zones - num_fake_zones; i++) { + struct zone zcopy; + + assert(z != ZONE_NULL); + + lock_zone(z); + zcopy = *z; + unlock_zone(z); + + simple_lock(&all_zones_lock); + z = z->next_zone; + simple_unlock(&all_zones_lock); + + /* assuming here the name data is static */ + (void) strncpy(zn->mzn_name, zcopy.zone_name, + sizeof zn->mzn_name); + zn->mzn_name[sizeof zn->mzn_name - 1] = '\0'; + + zi->tzi_count = (uint64_t)zcopy.count; + zi->tzi_cur_size = (uint64_t)zcopy.cur_size; + zi->tzi_max_size = (uint64_t)zcopy.max_size; + zi->tzi_elem_size = (uint64_t)zcopy.elem_size; + zi->tzi_alloc_size = (uint64_t)zcopy.alloc_size; + zi->tzi_sum_size = zcopy.sum_count * zcopy.elem_size; + zi->tzi_exhaustible = (uint64_t)zcopy.exhaustible; + zi->tzi_collectable = (uint64_t)zcopy.collectable; + zi->tzi_caller_acct = (uint64_t)zcopy.caller_acct; + if (task->tkm_zinfo != NULL) { + zi->tzi_task_alloc = task->tkm_zinfo[zcopy.index].alloc; + zi->tzi_task_free = task->tkm_zinfo[zcopy.index].free; + } else { + zi->tzi_task_alloc = 0; + zi->tzi_task_free = 0; + } + zn++; + zi++; + } + + /* + * loop through the fake zones and fill them using the specialized + * functions + */ + for (i = 0; i < num_fake_zones; i++) { + int count, collectable, exhaustible, caller_acct, index; + vm_size_t cur_size, max_size, elem_size, 
alloc_size; + uint64_t sum_size; + + strncpy(zn->mzn_name, fake_zones[i].name, sizeof zn->mzn_name); + zn->mzn_name[sizeof zn->mzn_name - 1] = '\0'; + fake_zones[i].query(&count, &cur_size, + &max_size, &elem_size, + &alloc_size, &sum_size, + &collectable, &exhaustible, &caller_acct); + zi->tzi_count = (uint64_t)count; + zi->tzi_cur_size = (uint64_t)cur_size; + zi->tzi_max_size = (uint64_t)max_size; + zi->tzi_elem_size = (uint64_t)elem_size; + zi->tzi_alloc_size = (uint64_t)alloc_size; + zi->tzi_sum_size = sum_size; + zi->tzi_collectable = (uint64_t)collectable; + zi->tzi_exhaustible = (uint64_t)exhaustible; + zi->tzi_caller_acct = (uint64_t)caller_acct; + if (task->tkm_zinfo != NULL) { + index = ZINFO_SLOTS - num_fake_zones + i; + zi->tzi_task_alloc = task->tkm_zinfo[index].alloc; + zi->tzi_task_free = task->tkm_zinfo[index].free; + } else { + zi->tzi_task_alloc = 0; + zi->tzi_task_free = 0; + } + zn++; + zi++; + } + + used = max_zones * sizeof *names; + if (used != names_size) + bzero((char *) (names_addr + used), names_size - used); /* zero the slack left by round_page() */ + + kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)names_addr, + (vm_map_size_t)names_size, TRUE, &copy); + assert(kr == KERN_SUCCESS); + + *namesp = (mach_zone_name_t *) copy; + *namesCntp = max_zones; + + used = max_zones * sizeof *info; + + if (used != info_size) + bzero((char *) (info_addr + used), info_size - used); + + kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)info_addr, + (vm_map_size_t)info_size, TRUE, &copy); + assert(kr == KERN_SUCCESS); + + *infop = (task_zone_info_t *) copy; + *infoCntp = max_zones; + + return KERN_SUCCESS; +} + +kern_return_t +mach_zone_info( + host_t host, + mach_zone_name_array_t *namesp, + mach_msg_type_number_t *namesCntp, + mach_zone_info_array_t *infop, + mach_msg_type_number_t *infoCntp) +{ + mach_zone_name_t *names; + vm_offset_t names_addr; + vm_size_t names_size; + mach_zone_info_t *info; + vm_offset_t info_addr; + vm_size_t info_size; + unsigned int max_zones, i; + zone_t z; + 
mach_zone_name_t *zn; + mach_zone_info_t *zi; + kern_return_t kr; + + vm_size_t used; + vm_map_copy_t copy; + + + if (host == HOST_NULL) + return KERN_INVALID_HOST; + + num_fake_zones = sizeof fake_zones / sizeof fake_zones[0]; + + /* + * We assume that zones aren't freed once allocated. + * We won't pick up any zones that are allocated later. + */ + + simple_lock(&all_zones_lock); + max_zones = (unsigned int)(num_zones + num_fake_zones); + z = first_zone; + simple_unlock(&all_zones_lock); + + names_size = round_page(max_zones * sizeof *names); + kr = kmem_alloc_pageable(ipc_kernel_map, + &names_addr, names_size); + if (kr != KERN_SUCCESS) + return kr; + names = (mach_zone_name_t *) names_addr; + + info_size = round_page(max_zones * sizeof *info); + kr = kmem_alloc_pageable(ipc_kernel_map, + &info_addr, info_size); + if (kr != KERN_SUCCESS) { + kmem_free(ipc_kernel_map, + names_addr, names_size); + return kr; + } + + info = (mach_zone_info_t *) info_addr; + + zn = &names[0]; + zi = &info[0]; + + for (i = 0; i < max_zones - num_fake_zones; i++) { + struct zone zcopy; + + assert(z != ZONE_NULL); + + lock_zone(z); + zcopy = *z; + unlock_zone(z); + + simple_lock(&all_zones_lock); + z = z->next_zone; + simple_unlock(&all_zones_lock); + + /* assuming here the name data is static */ + (void) strncpy(zn->mzn_name, zcopy.zone_name, + sizeof zn->mzn_name); + zn->mzn_name[sizeof zn->mzn_name - 1] = '\0'; + + zi->mzi_count = (uint64_t)zcopy.count; + zi->mzi_cur_size = (uint64_t)zcopy.cur_size; + zi->mzi_max_size = (uint64_t)zcopy.max_size; + zi->mzi_elem_size = (uint64_t)zcopy.elem_size; + zi->mzi_alloc_size = (uint64_t)zcopy.alloc_size; + zi->mzi_sum_size = zcopy.sum_count * zcopy.elem_size; + zi->mzi_exhaustible = (uint64_t)zcopy.exhaustible; + zi->mzi_collectable = (uint64_t)zcopy.collectable; + zn++; + zi++; + } + + /* + * loop through the fake zones and fill them using the specialized + * functions + */ + for (i = 0; i < num_fake_zones; i++) { + int count, collectable, 
exhaustible, caller_acct; + vm_size_t cur_size, max_size, elem_size, alloc_size; + uint64_t sum_size; + + strncpy(zn->mzn_name, fake_zones[i].name, sizeof zn->mzn_name); + zn->mzn_name[sizeof zn->mzn_name - 1] = '\0'; + fake_zones[i].query(&count, &cur_size, + &max_size, &elem_size, + &alloc_size, &sum_size, + &collectable, &exhaustible, &caller_acct); + zi->mzi_count = (uint64_t)count; + zi->mzi_cur_size = (uint64_t)cur_size; + zi->mzi_max_size = (uint64_t)max_size; + zi->mzi_elem_size = (uint64_t)elem_size; + zi->mzi_alloc_size = (uint64_t)alloc_size; + zi->mzi_sum_size = sum_size; + zi->mzi_collectable = (uint64_t)collectable; + zi->mzi_exhaustible = (uint64_t)exhaustible; + + zn++; + zi++; + } + + used = max_zones * sizeof *names; + if (used != names_size) + bzero((char *) (names_addr + used), names_size - used); /* zero the slack left by round_page() */ + + kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)names_addr, + (vm_map_size_t)names_size, TRUE, &copy); + assert(kr == KERN_SUCCESS); + + *namesp = (mach_zone_name_t *) copy; + *namesCntp = max_zones; + + used = max_zones * sizeof *info; + + if (used != info_size) + bzero((char *) (info_addr + used), info_size - used); + + kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)info_addr, + (vm_map_size_t)info_size, TRUE, &copy); + assert(kr == KERN_SUCCESS); + + *infop = (mach_zone_info_t *) copy; + *infoCntp = max_zones; + + return KERN_SUCCESS; +} + +/* + * host_zone_info - LEGACY user interface for Mach zone information + * Should use mach_zone_info() instead! 
+ */ kern_return_t host_zone_info( host_t host, @@ -1975,7 +3049,9 @@ host_zone_info( zone_name_t *zn; zone_info_t *zi; kern_return_t kr; - size_t num_fake_zones; + + vm_size_t used; + vm_map_copy_t copy; if (host == HOST_NULL) @@ -2001,40 +3077,28 @@ host_zone_info( z = first_zone; simple_unlock(&all_zones_lock); - if (max_zones <= *namesCntp) { - /* use in-line memory */ - names_size = *namesCntp * sizeof *names; - names = *namesp; - } else { - names_size = round_page(max_zones * sizeof *names); - kr = kmem_alloc_pageable(ipc_kernel_map, - &names_addr, names_size); - if (kr != KERN_SUCCESS) - return kr; - names = (zone_name_t *) names_addr; - } - - if (max_zones <= *infoCntp) { - /* use in-line memory */ - info_size = *infoCntp * sizeof *info; - info = *infop; - } else { - info_size = round_page(max_zones * sizeof *info); - kr = kmem_alloc_pageable(ipc_kernel_map, - &info_addr, info_size); - if (kr != KERN_SUCCESS) { - if (names != *namesp) - kmem_free(ipc_kernel_map, - names_addr, names_size); - return kr; - } - - info = (zone_info_t *) info_addr; + names_size = round_page(max_zones * sizeof *names); + kr = kmem_alloc_pageable(ipc_kernel_map, + &names_addr, names_size); + if (kr != KERN_SUCCESS) + return kr; + names = (zone_name_t *) names_addr; + + info_size = round_page(max_zones * sizeof *info); + kr = kmem_alloc_pageable(ipc_kernel_map, + &info_addr, info_size); + if (kr != KERN_SUCCESS) { + kmem_free(ipc_kernel_map, + names_addr, names_size); + return kr; } + + info = (zone_info_t *) info_addr; + zn = &names[0]; zi = &info[0]; - for (i = 0; i < num_zones; i++) { + for (i = 0; i < max_zones - num_fake_zones; i++) { struct zone zcopy; assert(z != ZONE_NULL); @@ -2069,57 +3133,49 @@ host_zone_info( * functions */ for (i = 0; i < num_fake_zones; i++) { + int caller_acct; + uint64_t sum_space; strncpy(zn->zn_name, fake_zones[i].name, sizeof zn->zn_name); zn->zn_name[sizeof zn->zn_name - 1] = '\0'; - fake_zones[i].func(&zi->zi_count, &zi->zi_cur_size, - 
&zi->zi_max_size, &zi->zi_elem_size, - &zi->zi_alloc_size, &zi->zi_collectable, - &zi->zi_exhaustible); + fake_zones[i].query(&zi->zi_count, &zi->zi_cur_size, + &zi->zi_max_size, &zi->zi_elem_size, + &zi->zi_alloc_size, &sum_space, + &zi->zi_collectable, &zi->zi_exhaustible, &caller_acct); zn++; zi++; } - if (names != *namesp) { - vm_size_t used; - vm_map_copy_t copy; - - used = max_zones * sizeof *names; + used = max_zones * sizeof *names; + if (used != names_size) + bzero((char *) (names_addr + used), names_size - used); /* zero the slack left by round_page() */ - if (used != names_size) - bzero((char *) (names_addr + used), names_size - used); + kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)names_addr, + (vm_map_size_t)names_size, TRUE, &copy); + assert(kr == KERN_SUCCESS); - kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)names_addr, - (vm_map_size_t)names_size, TRUE, &copy); - assert(kr == KERN_SUCCESS); - - *namesp = (zone_name_t *) copy; - } + *namesp = (zone_name_t *) copy; *namesCntp = max_zones; - if (info != *infop) { - vm_size_t used; - vm_map_copy_t copy; + used = max_zones * sizeof *info; + if (used != info_size) + bzero((char *) (info_addr + used), info_size - used); - used = max_zones * sizeof *info; + kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)info_addr, + (vm_map_size_t)info_size, TRUE, &copy); + assert(kr == KERN_SUCCESS); - if (used != info_size) - bzero((char *) (info_addr + used), info_size - used); - - kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)info_addr, - (vm_map_size_t)info_size, TRUE, &copy); - assert(kr == KERN_SUCCESS); - - *infop = (zone_info_t *) copy; - } + *infop = (zone_info_t *) copy; *infoCntp = max_zones; return KERN_SUCCESS; } extern unsigned int stack_total; +extern unsigned long long stack_allocs; #if defined(__i386__) || defined (__x86_64__) extern unsigned int inuse_ptepages_count; +extern long long alloc_ptepages_count; #endif void zone_display_zprint() @@ -2191,6 +3247,8 @@ db_print_zone( db_printf("C"); if (zcopy.expandable) db_printf("X"); + if 
(zcopy.caller_acct) + db_printf("A"); db_printf("\n"); } diff --git a/osfmk/kern/zalloc.h b/osfmk/kern/zalloc.h index 22f9d78b4..d7d722239 100644 --- a/osfmk/kern/zalloc.h +++ b/osfmk/kern/zalloc.h @@ -100,6 +100,7 @@ struct zone { vm_size_t max_size; /* how large can this zone grow */ vm_size_t elem_size; /* size of an element */ vm_size_t alloc_size; /* size used for more memory */ + uint64_t sum_count; /* count of allocs (life of zone) */ unsigned int /* boolean_t */ exhaustible :1, /* (F) merely return if empty? */ /* boolean_t */ collectable :1, /* (F) garbage collect empty pages */ @@ -108,16 +109,38 @@ struct zone { /* boolean_t */ doing_alloc :1, /* is zone expanding now? */ /* boolean_t */ waiting :1, /* is thread waiting for expansion? */ /* boolean_t */ async_pending :1, /* asynchronous allocation pending? */ +#if CONFIG_ZLEAKS + /* boolean_t */ zleak_on :1, /* Are we collecting allocation information? */ +#endif /* CONFIG_ZLEAKS */ + /* boolean_t */ caller_acct: 1, /* do we account allocation/free to the caller? */ /* boolean_t */ doing_gc :1, /* garbage collect in progress? */ /* boolean_t */ noencrypt :1; + int index; /* index into zone_info arrays for this zone */ struct zone * next_zone; /* Link for all-zones list */ call_entry_data_t call_async_alloc; /* callout for asynchronous alloc */ const char *zone_name; /* a name for the zone */ #if ZONE_DEBUG queue_head_t active_zones; /* active elements */ #endif /* ZONE_DEBUG */ + +#if CONFIG_ZLEAKS + uint32_t num_allocs; /* alloc stats for zleak benchmarks */ + uint32_t num_frees; /* free stats for zleak benchmarks */ + uint32_t zleak_capture; /* per-zone counter for capturing every N allocations */ +#endif /* CONFIG_ZLEAKS */ }; +/* + * structure for tracking zone usage + * Used either one per task/thread for all zones or one per task per zone. 
+ */ +typedef struct zinfo_usage_store_t { + /* These fields may be updated atomically, and so must be 8 byte aligned */ + uint64_t alloc __attribute__((aligned(8))); /* allocation counter */ + uint64_t free __attribute__((aligned(8))); /* free counter */ +} zinfo_usage_store_t; +typedef zinfo_usage_store_t *zinfo_usage_t; + extern void zone_gc(void); extern void consider_zone_gc(boolean_t); @@ -131,15 +154,23 @@ extern void zone_bootstrap(void) __attribute__((section("__TEXT, initcode"))); extern void zone_init( vm_size_t map_size) __attribute__((section("__TEXT, initcode"))); +/* Handle per-task zone info */ +extern void zinfo_task_init(task_t task); +extern void zinfo_task_free(task_t task); + + /* Stack use statistics */ +extern void stack_fake_zone_init(int zone_index); extern void stack_fake_zone_info( int *count, vm_size_t *cur_size, vm_size_t *max_size, vm_size_t *elem_size, vm_size_t *alloc_size, + uint64_t *sum_size, int *collectable, - int *exhaustable); + int *exhaustable, + int *caller_acct); #if ZONE_DEBUG @@ -220,6 +251,7 @@ extern void zone_change( #define Z_COLLECT 2 /* Make zone collectable */ #define Z_EXPAND 3 /* Make zone expandable */ #define Z_FOREIGN 4 /* Allow collectable zone to contain foreign elements */ +#define Z_CALLERACCT 5 /* Account alloc/free against the caller */ #define Z_NOENCRYPT 6 /* Don't encrypt zone during hibernation */ /* Preallocate space for zone from zone map */ @@ -230,6 +262,48 @@ extern void zprealloc( extern integer_t zone_free_count( zone_t zone); +/* + * MAX_ZTRACE_DEPTH configures how deep of a stack trace is taken on each zalloc in the zone of interest. 15 + * levels is usually enough to get past all the layers of code in kalloc and IOKit and see who the actual + * caller is up above these lower levels. + * + * This is used both for the zone leak detector and the zone corruption log. + */ + +#define MAX_ZTRACE_DEPTH 15 + +/* + * Structure for keeping track of a backtrace, used for leak detection. 
+ * This is in the .h file because it is used during panic, see kern/debug.c + * A non-zero size indicates that the trace is in use. + */ +struct ztrace { + vm_size_t zt_size; /* How much memory are all the allocations referring to this trace taking up? */ + uint32_t zt_depth; /* depth of stack (0 to MAX_ZTRACE_DEPTH) */ + void* zt_stack[MAX_ZTRACE_DEPTH]; /* series of return addresses from OSBacktrace */ + uint32_t zt_collisions; /* How many times did a different stack land here while it was occupied? */ + uint32_t zt_hit_count; /* for determining effectiveness of hash function */ +}; + +#if CONFIG_ZLEAKS + +/* support for the kern.zleak.* sysctls */ + +extern kern_return_t zleak_activate(void); +extern vm_size_t zleak_max_zonemap_size; +extern vm_size_t zleak_global_tracking_threshold; +extern vm_size_t zleak_per_zone_tracking_threshold; + +extern int get_zleak_state(void); + +#endif /* CONFIG_ZLEAKS */ + +/* These functions used for leak detection both in zalloc.c and mbuf.c */ +extern uint32_t fastbacktrace(uintptr_t* bt, uint32_t max_frames); +extern uintptr_t hash_mix(uintptr_t x); +extern uint32_t hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size); +extern uint32_t hashaddr(uintptr_t pt, uint32_t max_size); + #endif /* XNU_KERNEL_PRIVATE */ __END_DECLS diff --git a/osfmk/kextd/Makefile b/osfmk/kextd/Makefile index d3a065420..771b0cd26 100644 --- a/osfmk/kextd/Makefile +++ b/osfmk/kextd/Makefile @@ -8,14 +8,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = -INSTINC_SUBDIRS_PPC = - INSTINC_SUBDIRS_I386 = EXPINC_SUBDIRS = -EXPINC_SUBDIRS_PPC = - EXPINC_SUBDIRS_I386 = MIG_DEFS = kextd_mach.defs diff --git a/osfmk/libsa/machine/types.h b/osfmk/libsa/machine/types.h index f79adbe87..0a6f4bb69 100644 --- a/osfmk/libsa/machine/types.h +++ b/osfmk/libsa/machine/types.h @@ -28,9 +28,7 @@ #ifndef _MACH_MACHINE_TYPES_H #define _MACH_MACHINE_TYPES_H -#if defined (__ppc__) -#include "libsa/ppc/types.h" -#elif defined (__i386__) || defined (__x86_64__) +#if 
defined (__i386__) || defined (__x86_64__) #include "libsa/i386/types.h" #else #error architecture not supported diff --git a/osfmk/libsa/ppc/types.h b/osfmk/libsa/ppc/types.h deleted file mode 100644 index 859f94b92..000000000 --- a/osfmk/libsa/ppc/types.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - * - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:51 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.1.1.1 1998/03/07 02:25:36 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.1.2.1 1996/12/09 16:59:05 stephen - * nmklinux_1.0b3_shared into pmk1.1 - * [1996/12/09 11:18:55 stephen] - * - * Revision 1.1.2.1 1996/09/17 16:56:35 bruel - * created from standalone mach servers - * [1996/09/17 16:16:17 bruel] - * - * $EndLog$ - */ - -#ifndef _MACH_MACHINE_TYPES_H_ -#define _MACH_MACHINE_TYPES_H_ 1 - -typedef long dev_t; /* device number (major+minor) */ - -typedef signed char bit8_t; /* signed 8-bit quantity */ -typedef unsigned char u_bit8_t; /* unsigned 8-bit quantity */ - -typedef short bit16_t; /* signed 16-bit quantity */ -typedef unsigned short u_bit16_t; /* unsigned 16-bit quantity */ - -typedef int bit32_t; /* signed 32-bit quantity */ -typedef unsigned int u_bit32_t; /* unsigned 32-bit quantity */ - -/* Only 32 bits of the "bit64_t" are significant on this 32-bit machine */ -typedef struct { int __val[2]; } bit64_t; /* signed 64-bit quantity */ -typedef struct { unsigned int __val[2]; } u_bit64_t;/* unsigned 64-bit quantity */ -#define _SIG64_BITS __val[1] /* bits of interest (32) */ - -#endif /* _MACH_MACHINE_TYPES_H_ */ diff --git a/osfmk/libsa/types.h b/osfmk/libsa/types.h index ca12b7efb..341e42b0e 100644 --- a/osfmk/libsa/types.h +++ b/osfmk/libsa/types.h @@ -69,7 +69,6 @@ typedef struct _quad_ { typedef char * caddr_t; /* address of a (signed) char */ -typedef int time_t; /* a signed 32 */ typedef unsigned int daddr_t; /* an unsigned 32 */ #if 0 /* off_t should be 64-bit ! 
*/ typedef unsigned int off_t; /* another unsigned 32 */ diff --git a/osfmk/lockd/Makefile b/osfmk/lockd/Makefile index 45820f497..2975dc2dd 100644 --- a/osfmk/lockd/Makefile +++ b/osfmk/lockd/Makefile @@ -8,14 +8,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = -INSTINC_SUBDIRS_PPC = - INSTINC_SUBDIRS_I386 = EXPINC_SUBDIRS = -EXPINC_SUBDIRS_PPC = - EXPINC_SUBDIRS_I386 = MIG_DEFS = lockd_mach.defs diff --git a/osfmk/mach/Makefile b/osfmk/mach/Makefile index eaadb00bd..770208eaa 100644 --- a/osfmk/mach/Makefile +++ b/osfmk/mach/Makefile @@ -3,39 +3,18 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) -INSTINC_SUBDIRS = \ - machine - -INSTINC_SUBDIRS_PPC = \ - ppc - -INSTINC_SUBDIRS_ARM = \ - arm - -INSTINC_SUBDIRS_I386 = \ - i386 - -INSTINC_SUBDIRS_X86_64 = \ - i386 - -EXPINC_SUBDIRS = \ - machine - -EXPINC_SUBDIRS_PPC = \ - ppc - -EXPINC_SUBDIRS_I386 = \ - i386 - -EXPINC_SUBDIRS_X86_64 = \ - i386 +INSTINC_SUBDIRS = machine +INSTINC_SUBDIRS_ARM = arm +INSTINC_SUBDIRS_I386 = i386 +INSTINC_SUBDIRS_X86_64 = i386 -EXPINC_SUBDIRS_ARM = \ - arm +EXPINC_SUBDIRS = machine +EXPINC_SUBDIRS_I386 = i386 +EXPINC_SUBDIRS_X86_64 = i386 +EXPINC_SUBDIRS_ARM = arm MIG_TYPES = \ clock_types.defs \ @@ -188,6 +167,7 @@ INSTALL_MI_GEN_LIST = INSTALL_MI_DIR = mach EXPORT_MI_LIST = \ + branch_predicates.h \ mach_interface.h \ ${DATAFILES} diff --git a/osfmk/mach/branch_predicates.h b/osfmk/mach/branch_predicates.h new file mode 100644 index 000000000..8d16db0fa --- /dev/null +++ b/osfmk/mach/branch_predicates.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). 
You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + + +#ifndef _MACH_BRANCH_PREDICATES_H +#define _MACH_BRANCH_PREDICATES_H + +#define __probable(x) __builtin_expect((x), 1) +#define __improbable(x) __builtin_expect((x), 0) +#endif /* _MACH_BRANCH_PREDICATES_H */ diff --git a/osfmk/mach/clock_types.h b/osfmk/mach/clock_types.h index 12035da3c..e020066d1 100644 --- a/osfmk/mach/clock_types.h +++ b/osfmk/mach/clock_types.h @@ -84,6 +84,7 @@ typedef struct mach_timespec mach_timespec_t; #define NSEC_PER_USEC 1000 /* nanoseconds per microsecond */ #define USEC_PER_SEC 1000000 /* microseconds per second */ #define NSEC_PER_SEC 1000000000 /* nanoseconds per second */ +#define NSEC_PER_MSEC 1000000ull /* nanoseconds per millisecond */ #define BAD_MACH_TIMESPEC(t) \ ((t)->tv_nsec < 0 || (t)->tv_nsec >= NSEC_PER_SEC) diff --git a/osfmk/mach/host_info.h b/osfmk/mach/host_info.h index 3bd96bf53..9ad10eaa7 100644 --- a/osfmk/mach/host_info.h +++ b/osfmk/mach/host_info.h @@ -184,6 +184,7 @@ 
typedef struct host_priority_info *host_priority_info_t; /* host_statistics64() */ #define HOST_VM_INFO64 4 /* 64-bit virtual memory stats */ +#define HOST_EXTMOD_INFO64 5 /* External modification stats */ struct host_load_info { @@ -204,6 +205,13 @@ typedef struct host_load_info *host_load_info_t; /* size of the latest version of the structure */ #define HOST_VM_INFO64_LATEST_COUNT HOST_VM_INFO64_COUNT +/* in <mach/vm_statistics.h> */ +/* vm_extmod_statistics */ +#define HOST_EXTMOD_INFO64_COUNT ((mach_msg_type_number_t) \ + (sizeof(vm_extmod_statistics_data_t)/sizeof(integer_t))) + +/* size of the latest version of the structure */ +#define HOST_EXTMOD_INFO64_LATEST_COUNT HOST_EXTMOD_INFO64_COUNT /* vm_statistics */ #define HOST_VM_INFO_COUNT ((mach_msg_type_number_t) \ @@ -229,4 +237,41 @@ typedef struct host_cpu_load_info *host_cpu_load_info_t; #define HOST_CPU_LOAD_INFO_COUNT ((mach_msg_type_number_t) \ (sizeof (host_cpu_load_info_data_t) / sizeof (integer_t))) +#ifdef PRIVATE +/* + * CPU Statistics information + */ +struct _processor_statistics_np { + int32_t ps_cpuid; + + uint32_t ps_csw_count; + uint32_t ps_preempt_count; + uint32_t ps_preempted_rt_count; + uint32_t ps_preempted_by_rt_count; + + uint32_t ps_rt_sched_count; + + uint32_t ps_interrupt_count; + uint32_t ps_ipi_count; + uint32_t ps_timer_pop_count; + + uint64_t ps_runq_count_sum __attribute((aligned(8))); + + uint32_t ps_idle_transitions; + +}; + +#endif /* PRIVATE */ + +#ifdef KERNEL_PRIVATE + +extern kern_return_t set_sched_stats_active( + boolean_t active); + +extern kern_return_t get_sched_statistics( + struct _processor_statistics_np *out, + uint32_t *count); +#endif /* KERNEL_PRIVATE */ + + #endif /* _MACH_HOST_INFO_H_ */ diff --git a/osfmk/mach/i386/_structs.h b/osfmk/mach/i386/_structs.h index bcac16be3..9dd3f4416 100644 --- a/osfmk/mach/i386/_structs.h +++ b/osfmk/mach/i386/_structs.h @@ -401,17 +401,19 @@ _STRUCT_X86_AVX_STATE32 #define _STRUCT_X86_EXCEPTION_STATE32 struct __darwin_i386_exception_state 
_STRUCT_X86_EXCEPTION_STATE32 { - unsigned int __trapno; - unsigned int __err; - unsigned int __faultvaddr; + __uint16_t __trapno; + __uint16_t __cpu; + __uint32_t __err; + __uint32_t __faultvaddr; }; #else /* !__DARWIN_UNIX03 */ #define _STRUCT_X86_EXCEPTION_STATE32 struct i386_exception_state _STRUCT_X86_EXCEPTION_STATE32 { - unsigned int trapno; - unsigned int err; - unsigned int faultvaddr; + __uint16_t trapno; + __uint16_t cpu; + __uint32_t err; + __uint32_t faultvaddr; }; #endif /* !__DARWIN_UNIX03 */ @@ -748,17 +750,19 @@ _STRUCT_X86_AVX_STATE64 #define _STRUCT_X86_EXCEPTION_STATE64 struct __darwin_x86_exception_state64 _STRUCT_X86_EXCEPTION_STATE64 { - unsigned int __trapno; - unsigned int __err; - __uint64_t __faultvaddr; + __uint16_t __trapno; + __uint16_t __cpu; + __uint32_t __err; + __uint64_t __faultvaddr; }; #else /* !__DARWIN_UNIX03 */ #define _STRUCT_X86_EXCEPTION_STATE64 struct x86_exception_state64 _STRUCT_X86_EXCEPTION_STATE64 { - unsigned int trapno; - unsigned int err; - __uint64_t faultvaddr; + __uint16_t trapno; + __uint16_t cpu; + __uint32_t err; + __uint64_t faultvaddr; }; #endif /* !__DARWIN_UNIX03 */ diff --git a/osfmk/mach/i386/_types.h b/osfmk/mach/i386/_types.h deleted file mode 100644 index 5679b84e2..000000000 --- a/osfmk/mach/i386/_types.h +++ /dev/null @@ -1,221 +0,0 @@ -/* - * Copyright (c) 2004-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -#ifndef _MACH_I386__TYPES_H_ -#define _MACH_I386__TYPES_H_ - -/* - * i386_thread_state is the structure that is exported to user threads for - * use in status/mutate calls. This structure should never change. 
- * - */ - -#if !__DARWIN_UNIX03 -struct i386_thread_state -#else /* __DARWIN_UNIX03 */ -struct __darwin_i386_thread_state -#endif /* __DARWIN_UNIX03 */ -{ - unsigned int eax; - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - unsigned int edi; - unsigned int esi; - unsigned int ebp; - unsigned int esp; - unsigned int ss; - unsigned int eflags; - unsigned int eip; - unsigned int cs; - unsigned int ds; - unsigned int es; - unsigned int fs; - unsigned int gs; -}; - -#if !__DARWIN_UNIX03 -struct x86_thread_state64 -#else /* __DARWIN_UNIX03 */ -struct __darwin_x86_thread_state64 -#endif /* __DARWIN_UNIX03 */ -{ - uint64_t rax; - uint64_t rbx; - uint64_t rcx; - uint64_t rdx; - uint64_t rdi; - uint64_t rsi; - uint64_t rbp; - uint64_t rsp; - uint64_t r8; - uint64_t r9; - uint64_t r10; - uint64_t r11; - uint64_t r12; - uint64_t r13; - uint64_t r14; - uint64_t r15; - uint64_t rip; - uint64_t rflags; - uint64_t cs; - uint64_t fs; - uint64_t gs; -}; - - -typedef struct fp_control { - unsigned short invalid :1, - denorm :1, - zdiv :1, - ovrfl :1, - undfl :1, - precis :1, - :2, - pc :2, -#define FP_PREC_24B 0 -#define FP_PREC_53B 2 -#define FP_PREC_64B 3 - rc :2, -#define FP_RND_NEAR 0 -#define FP_RND_DOWN 1 -#define FP_RND_UP 2 -#define FP_CHOP 3 - /*inf*/ :1, - :3; -} fp_control_t; -/* - * Status word. - */ - -typedef struct fp_status { - unsigned short invalid :1, - denorm :1, - zdiv :1, - ovrfl :1, - undfl :1, - precis :1, - stkflt :1, - errsumm :1, - c0 :1, - c1 :1, - c2 :1, - tos :3, - c3 :1, - busy :1; -} fp_status_t; - -/* defn of 80bit x87 FPU or MMX register */ -struct mmst_reg { - char mmst_reg[10]; - char mmst_rsrv[6]; -}; - - -/* defn of 128 bit XMM regs */ -struct xmm_reg { - char xmm_reg[16]; -}; - -/* - * Floating point state. 
- */ - -#define FP_STATE_BYTES 512 /* number of chars worth of data from fpu_fcw */ -#if !__DARWIN_UNIX03 -struct i386_float_state -#else /* __DARWIN_UNIX03 */ -struct __darwin_i386_float_state -#endif /* __DARWIN_UNIX03 */ -{ - int fpu_reserved[2]; - fp_control_t fpu_fcw; /* x87 FPU control word */ - fp_status_t fpu_fsw; /* x87 FPU status word */ - uint8_t fpu_ftw; /* x87 FPU tag word */ - uint8_t fpu_rsrv1; /* reserved */ - uint16_t fpu_fop; /* x87 FPU Opcode */ - uint32_t fpu_ip; /* x87 FPU Instruction Pointer offset */ - uint16_t fpu_cs; /* x87 FPU Instruction Pointer Selector */ - uint16_t fpu_rsrv2; /* reserved */ - uint32_t fpu_dp; /* x87 FPU Instruction Operand(Data) Pointer offset */ - uint16_t fpu_ds; /* x87 FPU Instruction Operand(Data) Pointer Selector */ - uint16_t fpu_rsrv3; /* reserved */ - uint32_t fpu_mxcsr; /* MXCSR Register state */ - uint32_t fpu_mxcsrmask; /* MXCSR mask */ - struct mmst_reg fpu_stmm0; /* ST0/MM0 */ - struct mmst_reg fpu_stmm1; /* ST1/MM1 */ - struct mmst_reg fpu_stmm2; /* ST2/MM2 */ - struct mmst_reg fpu_stmm3; /* ST3/MM3 */ - struct mmst_reg fpu_stmm4; /* ST4/MM4 */ - struct mmst_reg fpu_stmm5; /* ST5/MM5 */ - struct mmst_reg fpu_stmm6; /* ST6/MM6 */ - struct mmst_reg fpu_stmm7; /* ST7/MM7 */ - struct xmm_reg fpu_xmm0; /* XMM 0 */ - struct xmm_reg fpu_xmm1; /* XMM 1 */ - struct xmm_reg fpu_xmm2; /* XMM 2 */ - struct xmm_reg fpu_xmm3; /* XMM 3 */ - struct xmm_reg fpu_xmm4; /* XMM 4 */ - struct xmm_reg fpu_xmm5; /* XMM 5 */ - struct xmm_reg fpu_xmm6; /* XMM 6 */ - struct xmm_reg fpu_xmm7; /* XMM 7 */ - char fpu_rsrv4[14*16]; /* reserved */ - int fpu_reserved1; -}; - - -#if !__DARWIN_UNIX03 -struct i386_exception_state -#else /* __DARWIN_UNIX03 */ -struct __darwin_i386_exception_state -#endif /* __DARWIN_UNIX03 */ -{ - unsigned int trapno; - unsigned int err; - unsigned int faultvaddr; -}; - -#if !__DARWIN_UNIX03 -struct x86_debug_state -#else /* __DARWIN_UNIX03 */ -struct __darwin_x86_debug_state -#endif /* __DARWIN_UNIX03 */ -{ 
- unsigned int dr0; - unsigned int dr1; - unsigned int dr2; - unsigned int dr3; - unsigned int dr4; - unsigned int dr5; - unsigned int dr6; - unsigned int dr7; -}; - -#endif /* _MACH_I386__TYPES_H_ */ diff --git a/osfmk/mach/i386/sdt_isa.h b/osfmk/mach/i386/sdt_isa.h index c32239162..503f5ce63 100644 --- a/osfmk/mach/i386/sdt_isa.h +++ b/osfmk/mach/i386/sdt_isa.h @@ -41,7 +41,7 @@ */ #ifdef __x86_64__ #define DTRACE_LAB(p, n) \ - "__dtrace_probeDOLLAR" DTRACE_TOSTRING(__LINE__) DTRACE_STRINGIFY(_##p##___##n) + "__dtrace_probeDOLLAR" DTRACE_TOSTRING(%=__LINE__) DTRACE_STRINGIFY(_##p##___##n) #define DTRACE_LABEL(p, n) \ ".section __DATA, __data\n\t" \ @@ -51,7 +51,7 @@ "1:" #else #define DTRACE_LAB(p, n) \ - "__dtrace_probe$" DTRACE_TOSTRING(__LINE__) DTRACE_STRINGIFY(_##p##___##n) + "__dtrace_probe$" DTRACE_TOSTRING(%=__LINE__) DTRACE_STRINGIFY(_##p##___##n) #define DTRACE_LABEL(p, n) \ ".section __DATA, __data\n\t" \ @@ -62,7 +62,7 @@ #endif #else /* !KERNEL */ #define DTRACE_LABEL(p, n) \ - "__dtrace_probe$" DTRACE_TOSTRING(__LINE__) DTRACE_STRINGIFY(_##p##___##n) ":" "\n\t" + "__dtrace_probe$" DTRACE_TOSTRING(%=__LINE__) DTRACE_STRINGIFY(_##p##___##n) ":" "\n\t" #endif /* !KERNEL */ #ifdef DTRACE_CALL_TEST @@ -103,6 +103,8 @@ #define DTRACE_CALL0ARGS(provider, name) \ asm volatile ( \ DTRACE_CALL(provider, name) \ + : \ + : \ ); #define DTRACE_CALL1ARG(provider, name) \ diff --git a/osfmk/mach/i386/thread_status.h b/osfmk/mach/i386/thread_status.h index 501fc8df0..715422ac8 100644 --- a/osfmk/mach/i386/thread_status.h +++ b/osfmk/mach/i386/thread_status.h @@ -300,11 +300,12 @@ typedef struct x86_debug_state x86_debug_state_t; * enough stack */ struct x86_seg_load_fault32 { - unsigned int trapno; - unsigned int err; - unsigned int eip; - unsigned int cs; - unsigned int efl; + uint16_t trapno; + uint16_t cpu; + uint32_t err; + uint32_t eip; + uint32_t cs; + uint32_t efl; }; #ifdef XNU_KERNEL_PRIVATE @@ -318,23 +319,24 @@ struct x86_seg_load_fault32 { * on all 
traps into debugger.) */ struct x86_saved_state32_from_kernel { - unsigned int gs; - unsigned int fs; - unsigned int es; - unsigned int ds; - unsigned int edi; - unsigned int esi; - unsigned int ebp; - unsigned int cr2; /* kernel esp stored by pusha - we save cr2 here later */ - unsigned int ebx; - unsigned int edx; - unsigned int ecx; - unsigned int eax; - unsigned int trapno; - unsigned int err; - unsigned int eip; - unsigned int cs; - unsigned int efl; + uint32_t gs; + uint32_t fs; + uint32_t es; + uint32_t ds; + uint32_t edi; + uint32_t esi; + uint32_t ebp; + uint32_t cr2; /* kernel esp stored by pusha - we save cr2 here later */ + uint32_t ebx; + uint32_t edx; + uint32_t ecx; + uint32_t eax; + uint16_t trapno; + uint16_t cpu; + uint32_t err; + uint32_t eip; + uint32_t cs; + uint32_t efl; }; /* @@ -343,25 +345,26 @@ struct x86_saved_state32_from_kernel { * servers, because copying can be avoided: */ struct x86_saved_state32 { - unsigned int gs; - unsigned int fs; - unsigned int es; - unsigned int ds; - unsigned int edi; - unsigned int esi; - unsigned int ebp; - unsigned int cr2; /* kernel esp stored by pusha - we save cr2 here later */ - unsigned int ebx; - unsigned int edx; - unsigned int ecx; - unsigned int eax; - unsigned int trapno; - unsigned int err; - unsigned int eip; - unsigned int cs; - unsigned int efl; - unsigned int uesp; - unsigned int ss; + uint32_t gs; + uint32_t fs; + uint32_t es; + uint32_t ds; + uint32_t edi; + uint32_t esi; + uint32_t ebp; + uint32_t cr2; /* kernel esp stored by pusha - we save cr2 here later */ + uint32_t ebx; + uint32_t edx; + uint32_t ecx; + uint32_t eax; + uint16_t trapno; + uint16_t cpu; + uint32_t err; + uint32_t eip; + uint32_t cs; + uint32_t efl; + uint32_t uesp; + uint32_t ss; }; typedef struct x86_saved_state32 x86_saved_state32_t; @@ -374,6 +377,7 @@ struct x86_saved_state32_tagged { struct x86_saved_state32 state; }; typedef struct x86_saved_state32_tagged x86_saved_state32_tagged_t; +/* Note: 
sizeof(x86_saved_state32_tagged_t) is a multiple of 16 bytes */ struct x86_sframe32 { /* @@ -395,13 +399,10 @@ typedef struct x86_sframe32 x86_sframe32_t; * on any exception/trap/interrupt. */ struct x86_64_intr_stack_frame { - uint32_t trapno; -#if defined(__LP64__) && defined(KERNEL) + uint16_t trapno; + uint16_t cpu; uint32_t _pad; uint64_t trapfn; -#else - uint32_t trapfn; -#endif uint64_t err; uint64_t rip; uint64_t cs; @@ -410,6 +411,7 @@ struct x86_64_intr_stack_frame { uint64_t ss; }; typedef struct x86_64_intr_stack_frame x86_64_intr_stack_frame_t; +/* Note: sizeof(x86_64_intr_stack_frame_t) must be a multiple of 16 bytes */ /* * This defines the state saved before entry into compatibility mode. @@ -418,24 +420,18 @@ typedef struct x86_64_intr_stack_frame x86_64_intr_stack_frame_t; */ struct x86_saved_state_compat32 { struct x86_saved_state32_tagged iss32; -#if defined(__LP64__) && defined(KERNEL) -#else - uint32_t pad_for_16byte_alignment[2]; -#endif - struct x86_64_intr_stack_frame isf64; + struct x86_64_intr_stack_frame isf64; }; typedef struct x86_saved_state_compat32 x86_saved_state_compat32_t; struct x86_sframe_compat32 { + uint32_t pad_for_16byte_alignment[2]; + uint64_t _register_save_slot; struct x86_64_intr_stack_frame slf; -#if defined(__LP64__) && defined(KERNEL) -#else - uint32_t pad_for_16byte_alignment[2]; -#endif struct x86_saved_state_compat32 ssf; - uint32_t empty[4]; }; typedef struct x86_sframe_compat32 x86_sframe_compat32_t; +/* Note: sizeof(x86_sframe_compat32_t) must be a multiple of 16 bytes */ /* * thread state format for task running in 64bit long mode @@ -480,9 +476,9 @@ struct x86_saved_state64 { uint32_t gs; uint32_t fs; -#ifdef __x86_64__ - uint32_t _pad_for_alignment[3]; -#endif + + uint32_t _pad_for_tagged_alignment[3]; + struct x86_64_intr_stack_frame isf; }; typedef struct x86_saved_state64 x86_saved_state64_t; @@ -496,13 +492,12 @@ struct x86_saved_state64_tagged { typedef struct x86_saved_state64_tagged 
x86_saved_state64_tagged_t; struct x86_sframe64 { - struct x86_64_intr_stack_frame slf; -#ifdef __i386__ - uint32_t _pad_for_alignment[3]; -#endif - struct x86_saved_state64_tagged ssf; + uint64_t _register_save_slot[2]; + struct x86_64_intr_stack_frame slf; + x86_saved_state64_tagged_t ssf; }; typedef struct x86_sframe64 x86_sframe64_t; +/* Note: sizeof(x86_sframe64_t) is a multiple of 16 bytes */ extern uint32_t get_eflags_exportmask(void); diff --git a/osfmk/mach/i386/vm_param.h b/osfmk/mach/i386/vm_param.h index 9487ff7ef..fb2ca164f 100644 --- a/osfmk/mach/i386/vm_param.h +++ b/osfmk/mach/i386/vm_param.h @@ -182,24 +182,29 @@ #if defined(__i386__) -#define VM_MIN_KERNEL_ADDRESS ((vm_offset_t) 0x00001000U) -#define VM_MIN_KERNEL_AND_KEXT_ADDRESS VM_MIN_KERNEL_ADDRESS - -#define VM_MAX_KERNEL_ADDRESS ((vm_offset_t) 0xFE7FFFFFU) -#define KERNEL_STACK_SIZE (I386_PGBYTES*4) +#define KERNEL_IMAGE_TO_PHYS(x) (x) +#define VM_MIN_KERNEL_ADDRESS ((vm_offset_t) 0x00001000U) +#define VM_MIN_KERNEL_AND_KEXT_ADDRESS VM_MIN_KERNEL_ADDRESS +#define VM_MAX_KERNEL_ADDRESS ((vm_offset_t) 0xFE7FFFFFU) #elif defined(__x86_64__) -#define VM_MIN_KERNEL_ADDRESS ((vm_offset_t) 0xFFFFFF8000000000UL) -#define VM_MIN_KERNEL_AND_KEXT_ADDRESS (VM_MIN_KERNEL_ADDRESS - 0x80000000ULL) - -#define VM_MAX_KERNEL_ADDRESS ((vm_offset_t) 0xFFFFFFFFFFFFEFFFUL) -#define KERNEL_STACK_SIZE (I386_PGBYTES*4) +#define KERNEL_IMAGE_TO_PHYS(x) (x) +#define VM_MIN_KERNEL_ADDRESS ((vm_offset_t) 0xFFFFFF8000000000UL) +#define VM_MIN_KERNEL_PAGE ((ppnum_t)0) +#define VM_MIN_KERNEL_AND_KEXT_ADDRESS (VM_MIN_KERNEL_ADDRESS - 0x80000000ULL) +#define VM_MAX_KERNEL_ADDRESS ((vm_offset_t) 0xFFFFFFFFFFFFEFFFUL) +#define VM_MAX_KERNEL_ADDRESS_EFI32 ((vm_offset_t) 0xFFFFFF80FFFFEFFFUL) +#define KEXT_ALLOC_MAX_OFFSET (2 * 1024 * 1024 * 1024UL) +#define KEXT_ALLOC_BASE(x) ((x) - KEXT_ALLOC_MAX_OFFSET) +#define KEXT_ALLOC_SIZE(x) (KEXT_ALLOC_MAX_OFFSET - (x)) #else #error unsupported architecture #endif +#define 
KERNEL_STACK_SIZE (I386_PGBYTES*4) + #define VM_MAP_MIN_ADDRESS MACH_VM_MIN_ADDRESS #define VM_MAP_MAX_ADDRESS MACH_VM_MAX_ADDRESS diff --git a/osfmk/mach/mach_host.defs b/osfmk/mach/mach_host.defs index df309d936..536cdce83 100644 --- a/osfmk/mach/mach_host.defs +++ b/osfmk/mach/mach_host.defs @@ -168,6 +168,8 @@ routine kmod_get_info( /* * Returns information about the memory allocation zones. * Supported in all kernels.. + * + * DEPRECATED! Use mach_zone_info() instead. */ routine host_zone_info( host : host_t; @@ -257,5 +259,17 @@ routine host_statistics64( flavor : host_flavor_t; out host_info64_out : host_info64_t, CountInOut); +/* + * Returns information about the memory allocation zones. + * Data returned is compatible with various caller and kernel + * address space sizes (unlike host_zone_info()). + */ +routine mach_zone_info( + host : host_t; + out names : mach_zone_name_array_t, + Dealloc; + out info : mach_zone_info_array_t, + Dealloc); + /* vim: set ft=c : */ diff --git a/osfmk/mach/mach_port.defs b/osfmk/mach/mach_port.defs index 5801ee42c..6c612758d 100644 --- a/osfmk/mach/mach_port.defs +++ b/osfmk/mach/mach_port.defs @@ -239,9 +239,10 @@ routine mach_port_move_member( * Requests a notification from the kernel. The request * must supply the send-once right which is used for * the notification. If a send-once right was previously - * registered, it is returned. The msg_id must be one of + * registered, it is returned. The msgid must be one of: * MACH_NOTIFY_PORT_DESTROYED (receive rights) * MACH_NOTIFY_DEAD_NAME (send/receive/send-once rights) + * MACH_NOTIFY_SEND_POSSIBLE (send/receive/send-once rights) * MACH_NOTIFY_NO_SENDERS (receive rights) * * The sync value specifies whether a notification should @@ -251,10 +252,20 @@ routine mach_port_move_member( * MACH_NOTIFY_DEAD_NAME: if non-zero, then name can be dead, * and the notification gets sent immediately. * If zero, then name can't be dead. 
+ * MACH_NOTIFY_SEND_POSSIBLE: if non-zero, will generate a send- + * possible notification as soon as it is possible to send + * to the port. If zero, will generate a send-possible + * notification only after a subsequent failed send + * (with MACH_SEND_NOTIFY option to mach_msg call). Can + * generate a dead-name notification if name is already dead + * or becomes dead before a send-possible notification fires. * MACH_NOTIFY_NO_SENDERS: the notification gets sent * immediately if the current mscount is greater * than or equal to the sync value and there are no * extant send rights. + * + * If the name is deleted before a successfully registered notification + * is delivered, it is replaced with a port-deleted notification. */ routine mach_port_request_notification( diff --git a/osfmk/mach/mach_traps.h b/osfmk/mach/mach_traps.h index 38298e27e..37ab3277f 100644 --- a/osfmk/mach/mach_traps.h +++ b/osfmk/mach/mach_traps.h @@ -199,7 +199,7 @@ extern kern_return_t pid_for_task( mach_port_name_t t, int *x); -#if !defined(__LP64__) +#if !defined(__LP64__) && !defined(__arm__) /* these should go away altogether - so no 64 legacy please */ extern kern_return_t map_fd( @@ -209,13 +209,17 @@ extern kern_return_t map_fd( boolean_t findspace, vm_size_t size); -#endif /* !defined(__LP64__) */ +#endif /* !defined(__LP64__) && !defined(__arm__) */ #else /* KERNEL */ #ifdef XNU_KERNEL_PRIVATE -/* Syscall data translations routines */ +/* Syscall data translations routines + * + * The kernel may support multiple userspace ABIs, and must use + * argument structures with elements large enough for any of them. + */ #define PAD_(t) (sizeof(uint64_t) <= sizeof(t) \ ? 
0 : sizeof(uint64_t) - sizeof(t)) #define PAD_ARG_8 @@ -231,9 +235,14 @@ extern kern_return_t map_fd( #define PAD_ARG_(arg_type, arg_name) \ char arg_name##_l_[PADL_(arg_type)]; arg_type arg_name; char arg_name##_r_[PADR_(arg_type)]; -#ifndef __MUNGE_ONCE -#define __MUNGE_ONCE -#ifdef __ppc__ +/* + * To support 32-bit clients as well as 64-bit clients, argument + * structures may need to be munged to repack the arguments. All + * active architectures do this inline in the code to dispatch Mach + * traps, without calling out to the BSD system call mungers. + */ + +#if 0 /* no active architectures use this */ void munge_w(const void *, void *); void munge_ww(const void *, void *); void munge_www(const void *, void *); @@ -258,33 +267,7 @@ void munge_wlw(const void *, void *); void munge_wwwl(const void *, void *); void munge_wwwwl(const void *, void *); void munge_wwwwwl(const void *, void *); -#else -#define munge_w NULL -#define munge_ww NULL -#define munge_www NULL -#define munge_wwww NULL -#define munge_wwwww NULL -#define munge_wwwwww NULL -#define munge_wwwwwww NULL -#define munge_wwwwwwww NULL -#define munge_d NULL -#define munge_dd NULL -#define munge_ddd NULL -#define munge_dddd NULL -#define munge_ddddd NULL -#define munge_dddddd NULL -#define munge_ddddddd NULL -#define munge_dddddddd NULL -#define munge_l NULL -#define munge_lw NULL -#define munge_lwww NULL -#define munge_wl NULL -#define munge_wlw NULL -#define munge_wwwl NULL -#define munge_wwwwl NULL -#define munge_wwwwwl NULL -#endif /* __ppc__ */ -#endif /* !__MUNGE_ONCE */ +#endif /* 0 */ struct kern_invalid_args { int32_t dummy; @@ -381,6 +364,7 @@ struct semaphore_timedwait_signal_trap_args { extern kern_return_t semaphore_timedwait_signal_trap( struct semaphore_timedwait_signal_trap_args *args); +#if !defined(CONFIG_EMBEDDED) struct map_fd_args { PAD_ARG_(int, fd); PAD_ARG_(vm_offset_t, offset); @@ -390,6 +374,7 @@ struct map_fd_args { }; extern kern_return_t map_fd( struct map_fd_args *args); 
+#endif /* !defined(CONFIG_EMBEDDED) */ struct task_for_pid_args { PAD_ARG_(mach_port_name_t, target_tport); diff --git a/osfmk/mach/mach_types.defs b/osfmk/mach/mach_types.defs index c4479bd51..0f36eeec5 100644 --- a/osfmk/mach/mach_types.defs +++ b/osfmk/mach/mach_types.defs @@ -213,11 +213,13 @@ type thread_policy_t = array[*:16] of integer_t; * policy_rr_info_t (5 ints) * task security token (2 ints) * task audit token (8 ints) + * dyld info (2 64-bit ints and 1 int) + * task_extmod_info_t (8 64-bit ints) * If other task_info flavors are added, this * definition may need to be changed. (See * mach/task_info.h and mach/policy.h) */ type task_flavor_t = int; -type task_info_t = array[*:10] of integer_t; +type task_info_t = array[*:32] of integer_t; type task_policy_flavor_t = natural_t; type task_policy_t = array[*:16] of integer_t; @@ -311,6 +313,7 @@ type host_info_t = array[*:15] of integer_t; * host_info64_t: variable-sized inline array that can contain: * * vm_statistics_t (6 ints and 9 longs) + * vm_extmod_statistics_t (6 64-bit ints) */ type host_info64_t = array[*:256] of integer_t; diff --git a/osfmk/mach/mach_types.h b/osfmk/mach/mach_types.h index 5f9ddf14f..e4e47f63a 100644 --- a/osfmk/mach/mach_types.h +++ b/osfmk/mach/mach_types.h @@ -245,6 +245,7 @@ typedef exception_handler_array_t exception_port_arrary_t; #define TASK_NULL ((task_t) 0) #define TASK_NAME_NULL ((task_name_t) 0) #define THREAD_NULL ((thread_t) 0) +#define TID_NULL ((uint64_t) 0) #define THR_ACT_NULL ((thread_act_t) 0) #define IPC_SPACE_NULL ((ipc_space_t) 0) #define HOST_NULL ((host_t) 0) diff --git a/osfmk/mach/mach_vm.defs b/osfmk/mach/mach_vm.defs index 0cd136c69..ade3eaa61 100644 --- a/osfmk/mach/mach_vm.defs +++ b/osfmk/mach/mach_vm.defs @@ -291,7 +291,11 @@ routine vm_behavior_set( #if !defined(_MACH_VM_PUBLISH_AS_LOCAL_) routine mach_vm_map( #else +#if defined(__arm__) && !LIBSYSCALL_INTERFACE +routine _vm_map_arm( +#else routine vm_map( +#endif #endif target_task : 
vm_task_entry_t; inout address : mach_vm_address_t; diff --git a/osfmk/mach/machine.h b/osfmk/mach/machine.h index d209fa278..9ebf5532c 100644 --- a/osfmk/mach/machine.h +++ b/osfmk/mach/machine.h @@ -370,10 +370,12 @@ __END_DECLS #define CPUFAMILY_INTEL_PENRYN 0x78ea4fbc #define CPUFAMILY_INTEL_NEHALEM 0x6b5a4cd2 #define CPUFAMILY_INTEL_WESTMERE 0x573b5eec +#define CPUFAMILY_INTEL_SANDYBRIDGE 0x5490b78c #define CPUFAMILY_ARM_9 0xe73283ae #define CPUFAMILY_ARM_11 0x8ff620d8 #define CPUFAMILY_ARM_XSCALE 0x53b005f5 #define CPUFAMILY_ARM_13 0x0cc90e64 +#define CPUFAMILY_ARM_14 0x96077ef1 /* The following synonyms are deprecated: */ #define CPUFAMILY_INTEL_6_14 CPUFAMILY_INTEL_YONAH diff --git a/osfmk/mach/machine/asm.h b/osfmk/mach/machine/asm.h index ba98269a5..1cdbb8109 100644 --- a/osfmk/mach/machine/asm.h +++ b/osfmk/mach/machine/asm.h @@ -29,9 +29,7 @@ #ifndef _MACH_MACHINE_ASM_H #define _MACH_MACHINE_ASM_H -#if defined (__ppc__) || defined (__ppc64__) -#include "mach/ppc/asm.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "mach/i386/asm.h" #else #error architecture not supported diff --git a/osfmk/mach/machine/boolean.h b/osfmk/mach/machine/boolean.h index 97ffd0766..521033b72 100644 --- a/osfmk/mach/machine/boolean.h +++ b/osfmk/mach/machine/boolean.h @@ -29,9 +29,7 @@ #ifndef _MACH_MACHINE_BOOLEAN_H_ #define _MACH_MACHINE_BOOLEAN_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "mach/ppc/boolean.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "mach/i386/boolean.h" #else #error architecture not supported diff --git a/osfmk/mach/machine/exception.h b/osfmk/mach/machine/exception.h index 5fc148663..5fce0e919 100644 --- a/osfmk/mach/machine/exception.h +++ b/osfmk/mach/machine/exception.h @@ -29,9 +29,7 @@ #ifndef _MACH_MACHINE_EXCEPTION_H_ #define _MACH_MACHINE_EXCEPTION_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include 
"mach/ppc/exception.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "mach/i386/exception.h" #else #error architecture not supported diff --git a/osfmk/mach/machine/kern_return.h b/osfmk/mach/machine/kern_return.h index 82c0adf0c..e2b5bc677 100644 --- a/osfmk/mach/machine/kern_return.h +++ b/osfmk/mach/machine/kern_return.h @@ -29,9 +29,7 @@ #ifndef _MACH_MACHINE_KERN_RETURN_H_ #define _MACH_MACHINE_KERN_RETURN_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "mach/ppc/kern_return.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "mach/i386/kern_return.h" #else #error architecture not supported diff --git a/osfmk/mach/machine/machine_types.defs b/osfmk/mach/machine/machine_types.defs index 418d16bcc..2ed0d52fd 100644 --- a/osfmk/mach/machine/machine_types.defs +++ b/osfmk/mach/machine/machine_types.defs @@ -29,9 +29,7 @@ #ifndef _MACH_MACHINE_MACHINE_TYPES_DEFS #define _MACH_MACHINE_MACHINE_TYPES_DEFS -#if defined (__ppc__) || defined (__ppc64__) -#include "mach/ppc/machine_types.defs" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "mach/i386/machine_types.defs" #elif defined (__arm__) #include "mach/arm/machine_types.defs" diff --git a/osfmk/mach/machine/ndr_def.h b/osfmk/mach/machine/ndr_def.h index 10e8e3e2f..2d3451472 100644 --- a/osfmk/mach/machine/ndr_def.h +++ b/osfmk/mach/machine/ndr_def.h @@ -29,9 +29,7 @@ #ifndef _MACH_MACHINE_NDR_DEF_H #define _MACH_MACHINE_NDR_DEF_H -#if defined (__ppc__) || defined (__ppc64__) -#include "mach/ppc/ndr_def.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "mach/i386/ndr_def.h" #else #error architecture not supported diff --git a/osfmk/mach/machine/processor_info.h b/osfmk/mach/machine/processor_info.h index a4c6d639e..c7ddb5b01 100644 --- a/osfmk/mach/machine/processor_info.h +++ 
b/osfmk/mach/machine/processor_info.h @@ -29,9 +29,7 @@ #ifndef _MACH_MACHINE_PROCESSOR_INFO_H_ #define _MACH_MACHINE_PROCESSOR_INFO_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "mach/ppc/processor_info.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "mach/i386/processor_info.h" #else #error architecture not supported diff --git a/osfmk/mach/machine/rpc.h b/osfmk/mach/machine/rpc.h index 849260ae4..3e543a88f 100644 --- a/osfmk/mach/machine/rpc.h +++ b/osfmk/mach/machine/rpc.h @@ -29,9 +29,7 @@ #ifndef _MACH_MACHINE_RPC_H_ #define _MACH_MACHINE_RPC_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "mach/ppc/rpc.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "mach/i386/rpc.h" #else #error architecture not supported diff --git a/osfmk/mach/machine/sdt.h b/osfmk/mach/machine/sdt.h index af2b59b1e..551f2b0fc 100644 --- a/osfmk/mach/machine/sdt.h +++ b/osfmk/mach/machine/sdt.h @@ -224,6 +224,54 @@ type3, arg3, type4, arg4) \ DTRACE_PROBE4(__vminfo_, name, arg1, arg2, arg3, arg4) +#define DTRACE_IP(name) \ + DTRACE_PROBE(__ip_, name) + +#define DTRACE_IP1(name, type1, arg1) \ + DTRACE_PROBE1(__ip_, name, arg1) + +#define DTRACE_IP2(name, type1, arg1, type2, arg2) \ + DTRACE_PROBE2(__ip_, name, arg1, arg2) + +#define DTRACE_IP3(name, type1, arg1, type2, arg2, type3, arg3) \ + DTRACE_PROBE3(__ip_, name, arg1, arg2, arg3) + +#define DTRACE_IP4(name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4) \ + DTRACE_PROBE4(__ip_, name, arg1, arg2, arg3, arg4) + +#define DTRACE_IP5(name, typ1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5) \ + DTRACE_PROBE5(__ip_, name, arg1, arg2, arg3, arg4, arg5) + +#define DTRACE_IP6(name, type1, arg1, type2, arg2, type3, arg3, \ + type4,arg4, type5, arg5, type6, arg6) \ + DTRACE_PROBE6(__ip_, name, arg1, arg2, arg3, arg4, arg5, arg6) + +#define DTRACE_IP7(name, type1, arg1, type2, 
arg2, type3, arg3, \ + type4, arg4, type5, arg5, type6, arg6, type7, arg7) \ + DTRACE_PROBE7(__ip_, name, arg1, arg2, arg3, arg4, arg5, arg6, arg7) + +#define DTRACE_TCP(name) \ + DTRACE_PROBE(__tcp_, name) + +#define DTRACE_TCP1(name, type1, arg1) \ + DTRACE_PROBE1(__tcp_, name, arg1) + +#define DTRACE_TCP2(name, type1, arg1, type2, arg2) \ + DTRACE_PROBE2(__tcp_, name, arg1, arg2) + +#define DTRACE_TCP3(name, type1, arg1, type2, arg2, type3, arg3) \ + DTRACE_PROBE3(__tcp_, name, arg1, arg2, arg3) + +#define DTRACE_TCP4(name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4) \ + DTRACE_PROBE4(__tcp_, name, arg1, arg2, arg3, arg4) + +#define DTRACE_TCP5(name, typ1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5) \ + DTRACE_PROBE5(__tcp_, name, arg1, arg2, arg3, arg4, arg5) + #else /* CONFIG_DTRACE */ #define DTRACE_SCHED(name) do {} while (0) @@ -250,6 +298,22 @@ #define DTRACE_VM2(name, type1, arg1, type2, arg2) do {} while(0) #define DTRACE_VM3(name, type1, arg1, type2, arg2, type3, arg3) do {} while(0) #define DTRACE_VM4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) do {} while(0) +#define DTRACE_IP(name) do {} while(0) +#define DTRACE_IP1(name, type1, arg1) do {} while(0) +#define DTRACE_IP2(name, type1, arg1, type2, arg2) do {} while(0) +#define DTRACE_IP3(name, type1, arg1, type2, arg2, type3, arg3) do {} while(0) +#define DTRACE_IP4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) do {} while(0) +#define DTRACE_IP5(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4, type5, arg5) do {} while(0) +#define DTRACE_IP6(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4, type5, arg5, type6, arg6) do {} while(0) +#define DTRACE_IP7(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4, type5, arg5, \ + type6, arg6, type7, arg7) do {} while(0) + +#define DTRACE_TCP(name) do {} while(0) +#define DTRACE_TCP1(name, type1, arg1) do {} while(0) +#define DTRACE_TCP2(name, type1, arg1, type2, arg2) do {} while(0) 
+#define DTRACE_TCP3(name, type1, arg1, type2, arg2, type3, arg3) do {} while(0) +#define DTRACE_TCP4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) do {} while(0) +#define DTRACE_TCP5(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4, type5, arg5) do {} while(0) #endif /* CONFIG_DTRACE */ diff --git a/osfmk/mach/machine/sdt_isa.h b/osfmk/mach/machine/sdt_isa.h index 000690744..edd26dcc2 100644 --- a/osfmk/mach/machine/sdt_isa.h +++ b/osfmk/mach/machine/sdt_isa.h @@ -28,9 +28,7 @@ #ifndef _MACH_MACHINE_SDT_ISA_H_ #define _MACH_MACHINE_SDT_ISA_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include #else #error architecture not supported diff --git a/osfmk/mach/machine/syscall_sw.h b/osfmk/mach/machine/syscall_sw.h index 972331769..902b6815e 100644 --- a/osfmk/mach/machine/syscall_sw.h +++ b/osfmk/mach/machine/syscall_sw.h @@ -31,9 +31,7 @@ #ifndef _MACH_MACHINE_SYSCALL_SW_H_ #define _MACH_MACHINE_SYSCALL_SW_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "mach/ppc/syscall_sw.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "mach/i386/syscall_sw.h" #else #error architecture not supported diff --git a/osfmk/mach/machine/thread_state.h b/osfmk/mach/machine/thread_state.h index bf9a155d9..061477698 100644 --- a/osfmk/mach/machine/thread_state.h +++ b/osfmk/mach/machine/thread_state.h @@ -29,9 +29,7 @@ #ifndef _MACH_MACHINE_THREAD_STATE_H_ #define _MACH_MACHINE_THREAD_STATE_H_ -#if defined (__ppc__) || defined(__ppc64__) -#include "mach/ppc/thread_state.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "mach/i386/thread_state.h" #else #error architecture not supported diff --git a/osfmk/mach/machine/thread_status.h b/osfmk/mach/machine/thread_status.h index 10ed68996..74cda9596 100644 --- 
a/osfmk/mach/machine/thread_status.h +++ b/osfmk/mach/machine/thread_status.h @@ -29,9 +29,7 @@ #ifndef _MACH_MACHINE_THREAD_STATUS_H_ #define _MACH_MACHINE_THREAD_STATUS_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "mach/ppc/thread_status.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "mach/i386/thread_status.h" #else #error architecture not supported diff --git a/osfmk/mach/machine/vm_param.h b/osfmk/mach/machine/vm_param.h index 685342999..5898fdba1 100644 --- a/osfmk/mach/machine/vm_param.h +++ b/osfmk/mach/machine/vm_param.h @@ -29,9 +29,7 @@ #ifndef _MACH_MACHINE_VM_PARAM_H_ #define _MACH_MACHINE_VM_PARAM_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "mach/ppc/vm_param.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "mach/i386/vm_param.h" #else #error architecture not supported diff --git a/osfmk/mach/machine/vm_types.h b/osfmk/mach/machine/vm_types.h index a5c4c8ba1..2b7526570 100644 --- a/osfmk/mach/machine/vm_types.h +++ b/osfmk/mach/machine/vm_types.h @@ -29,9 +29,7 @@ #ifndef _MACH_MACHINE_VM_TYPES_H_ #define _MACH_MACHINE_VM_TYPES_H_ -#if defined (__ppc__) || defined(__ppc64__) -#include "mach/ppc/vm_types.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "mach/i386/vm_types.h" #else #error architecture not supported diff --git a/osfmk/mach/memory_object.defs b/osfmk/mach/memory_object.defs index 436e9b290..01afb30a0 100644 --- a/osfmk/mach/memory_object.defs +++ b/osfmk/mach/memory_object.defs @@ -203,4 +203,8 @@ routine memory_object_map( routine memory_object_last_unmap( memory_object : memory_object_t); +routine memory_object_data_reclaim( + memory_object : memory_object_t; + reclaim_backing_store : boolean_t); + /* vim: set ft=c : */ diff --git a/osfmk/mach/memory_object_types.h b/osfmk/mach/memory_object_types.h index c4a5df888..846987cfd 100644 
--- a/osfmk/mach/memory_object_types.h +++ b/osfmk/mach/memory_object_types.h @@ -158,6 +158,9 @@ typedef const struct memory_object_pager_ops { vm_prot_t prot); kern_return_t (*memory_object_last_unmap)( memory_object_t mem_obj); + kern_return_t (*memory_object_data_reclaim)( + memory_object_t mem_obj, + boolean_t reclaim_backing_store); const char *memory_object_pager_name; } * memory_object_pager_ops_t; @@ -376,10 +379,10 @@ typedef struct memory_object_attr_info memory_object_attr_info_data_t; & 0xFF000000) | ((flags) & 0xFFFFFF)); /* leave room for vm_prot bits */ -#define MAP_MEM_ONLY 0x10000 /* change processor caching */ -#define MAP_MEM_NAMED_CREATE 0x20000 /* create extant object */ -#define MAP_MEM_PURGABLE 0x40000 /* create a purgable VM object */ -#define MAP_MEM_NAMED_REUSE 0x80000 /* reuse provided entry if identical */ +#define MAP_MEM_ONLY 0x010000 /* change processor caching */ +#define MAP_MEM_NAMED_CREATE 0x020000 /* create extant object */ +#define MAP_MEM_PURGABLE 0x040000 /* create a purgable VM object */ +#define MAP_MEM_NAMED_REUSE 0x080000 /* reuse provided entry if identical */ #ifdef KERNEL @@ -463,9 +466,10 @@ typedef uint32_t upl_size_t; /* page-aligned byte size */ #define UPL_UBC_MSYNC 0x02000000 #define UPL_UBC_PAGEOUT 0x04000000 #define UPL_UBC_PAGEIN 0x08000000 +#define UPL_REQUEST_SET_DIRTY 0x10000000 /* UPL flags known by this kernel */ -#define UPL_VALID_FLAGS 0x0FFFFFFF +#define UPL_VALID_FLAGS 0x1FFFFFFF /* upl abort error flags */ @@ -518,9 +522,7 @@ typedef uint32_t upl_size_t; /* page-aligned byte size */ /* * */ -#ifdef MACH_KERNEL_PRIVATE #define UPL_PAGING_ENCRYPTED 0x20 -#endif /* MACH_KERNEL_PRIVATE */ /* * this pageout is being originated as part of an explicit @@ -682,6 +684,7 @@ extern ppnum_t upl_phys_page(upl_page_info_t *upl, int index); extern boolean_t upl_device_page(upl_page_info_t *upl); extern boolean_t upl_speculative_page(upl_page_info_t *upl, int index); extern void upl_clear_dirty(upl_t upl, boolean_t 
value); +extern void upl_set_referenced(upl_t upl, boolean_t value); __END_DECLS diff --git a/osfmk/mach/message.h b/osfmk/mach/message.h index 9be5f5e90..195607585 100644 --- a/osfmk/mach/message.h +++ b/osfmk/mach/message.h @@ -581,12 +581,12 @@ typedef integer_t mach_msg_option_t; #define MACH_SEND_TIMEOUT 0x00000010 #define MACH_SEND_INTERRUPT 0x00000040 /* libmach implements */ -#define MACH_SEND_CANCEL 0x00000080 +#define MACH_SEND_NOTIFY 0x00000080 /* arm send-possible notify */ #define MACH_SEND_ALWAYS 0x00010000 /* internal use only */ #define MACH_SEND_TRAILER 0x00020000 #define MACH_RCV_TIMEOUT 0x00000100 -#define MACH_RCV_NOTIFY 0x00000200 +#define MACH_RCV_NOTIFY 0x00000200 /* reserved - legacy */ #define MACH_RCV_INTERRUPT 0x00000400 /* libmach implements */ #define MACH_RCV_OVERWRITE 0x00001000 diff --git a/osfmk/mach/notify.defs b/osfmk/mach/notify.defs index 6f7f81d2d..4aece97bf 100644 --- a/osfmk/mach/notify.defs +++ b/osfmk/mach/notify.defs @@ -83,7 +83,17 @@ simpleroutine mach_notify_port_deleted( #endif /* SEQNOS */ name : mach_port_name_t); -skip; /* was MACH_NOTIFY_MSG_ACCEPTED: 0102 */ +#if (KERNEL_USER | MACH_NOTIFY_SEND_POSSIBLE_EXPECTED) +/* MACH_NOTIFY_SEND_POSSIBLE: 0102 */ +simpleroutine mach_notify_send_possible( + notify : mach_port_move_send_once_t; +#if SEQNOS + msgseqno seqno : mach_port_seqno_t; +#endif /* SEQNOS */ + name : mach_port_name_t); +#else +skip; +#endif skip; /* was NOTIFY_OWNERSHIP_RIGHTS: 0103 */ diff --git a/osfmk/mach/notify.h b/osfmk/mach/notify.h index 768a865cd..845646c5c 100644 --- a/osfmk/mach/notify.h +++ b/osfmk/mach/notify.h @@ -74,8 +74,10 @@ */ #define MACH_NOTIFY_FIRST 0100 -#define MACH_NOTIFY_PORT_DELETED (MACH_NOTIFY_FIRST + 001 ) +#define MACH_NOTIFY_PORT_DELETED (MACH_NOTIFY_FIRST + 001) /* A send or send-once right was deleted. 
*/ +#define MACH_NOTIFY_SEND_POSSIBLE (MACH_NOTIFY_FIRST + 002) + /* Now possible to send using specified right */ #define MACH_NOTIFY_PORT_DESTROYED (MACH_NOTIFY_FIRST + 005) /* A receive right was (would have been) deallocated */ #define MACH_NOTIFY_NO_SENDERS (MACH_NOTIFY_FIRST + 006) @@ -103,6 +105,13 @@ typedef struct { mach_msg_format_0_trailer_t trailer; } mach_port_deleted_notification_t; +typedef struct { + mach_msg_header_t not_header; + NDR_record_t NDR; + mach_port_name_t not_port;/* MACH_MSG_TYPE_PORT_NAME */ + mach_msg_format_0_trailer_t trailer; +} mach_send_possible_notification_t; + typedef struct { mach_msg_header_t not_header; mach_msg_body_t not_body; diff --git a/osfmk/mach/port.h b/osfmk/mach/port.h index 9db876f17..b09673aba 100644 --- a/osfmk/mach/port.h +++ b/osfmk/mach/port.h @@ -277,7 +277,9 @@ typedef mach_port_type_t *mach_port_type_array_t; /* Dummy type bits that mach_port_type/mach_port_names can return. */ -#define MACH_PORT_TYPE_DNREQUEST 0x80000000 +#define MACH_PORT_TYPE_DNREQUEST 0x80000000 +#define MACH_PORT_TYPE_SPREQUEST 0x40000000 +#define MACH_PORT_TYPE_SPREQUEST_DELAYED 0x20000000 /* User-references for capabilities. 
*/ diff --git a/osfmk/mach/ppc/Makefile b/osfmk/mach/ppc/Makefile deleted file mode 100644 index 83f21cec6..000000000 --- a/osfmk/mach/ppc/Makefile +++ /dev/null @@ -1,35 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -VPATH+=$(SOURCE)/../../ppc: - -DATAFILES = \ - boolean.h exception.h kern_return.h ndr_def.h \ - processor_info.h rpc.h thread_state.h thread_status.h \ - vm_param.h vm_types.h machine_types.defs \ - syscall_sw.h _structs.h sdt_isa.h - -INSTALL_MD_LIST = ${DATAFILES} - -INSTALL_MD_GEN_LIST = \ - asm.h - -INSTALL_MD_DIR = mach/ppc - -EXPORT_MD_LIST = ${DATAFILES} - -EXPORT_MD_GEN_LIST = \ - asm.h - -EXPORT_MD_DIR = mach/ppc - -include $(MakeInc_rule) -include $(MakeInc_dir) - - diff --git a/osfmk/mach/ppc/_structs.h b/osfmk/mach/ppc/_structs.h deleted file mode 100644 index f2c78cda1..000000000 --- a/osfmk/mach/ppc/_structs.h +++ /dev/null @@ -1,392 +0,0 @@ -/* - * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -#ifndef _MACH_PPC__STRUCTS_H_ -#define _MACH_PPC__STRUCTS_H_ - -#include - -/* - * ppc_thread_state is the structure that is exported to user threads for - * use in status/mutate calls. This structure should never change. - * - */ - -#if __DARWIN_UNIX03 -#define _STRUCT_PPC_THREAD_STATE struct __darwin_ppc_thread_state -_STRUCT_PPC_THREAD_STATE -{ - unsigned int __srr0; /* Instruction address register (PC) */ - unsigned int __srr1; /* Machine state register (supervisor) */ - unsigned int __r0; - unsigned int __r1; - unsigned int __r2; - unsigned int __r3; - unsigned int __r4; - unsigned int __r5; - unsigned int __r6; - unsigned int __r7; - unsigned int __r8; - unsigned int __r9; - unsigned int __r10; - unsigned int __r11; - unsigned int __r12; - unsigned int __r13; - unsigned int __r14; - unsigned int __r15; - unsigned int __r16; - unsigned int __r17; - unsigned int __r18; - unsigned int __r19; - unsigned int __r20; - unsigned int __r21; - unsigned int __r22; - unsigned int __r23; - unsigned int __r24; - unsigned int __r25; - unsigned int __r26; - unsigned int __r27; - unsigned int __r28; - unsigned int __r29; - unsigned int __r30; - unsigned int __r31; - - unsigned int __cr; /* Condition register */ - unsigned int __xer; /* User's integer exception register */ - unsigned int __lr; /* Link register */ - unsigned int __ctr; /* Count register */ - unsigned int __mq; /* MQ register (601 only) */ - - unsigned int 
__vrsave; /* Vector Save Register */ -}; -#else /* !__DARWIN_UNIX03 */ -#define _STRUCT_PPC_THREAD_STATE struct ppc_thread_state -_STRUCT_PPC_THREAD_STATE -{ - unsigned int srr0; /* Instruction address register (PC) */ - unsigned int srr1; /* Machine state register (supervisor) */ - unsigned int r0; - unsigned int r1; - unsigned int r2; - unsigned int r3; - unsigned int r4; - unsigned int r5; - unsigned int r6; - unsigned int r7; - unsigned int r8; - unsigned int r9; - unsigned int r10; - unsigned int r11; - unsigned int r12; - unsigned int r13; - unsigned int r14; - unsigned int r15; - unsigned int r16; - unsigned int r17; - unsigned int r18; - unsigned int r19; - unsigned int r20; - unsigned int r21; - unsigned int r22; - unsigned int r23; - unsigned int r24; - unsigned int r25; - unsigned int r26; - unsigned int r27; - unsigned int r28; - unsigned int r29; - unsigned int r30; - unsigned int r31; - - unsigned int cr; /* Condition register */ - unsigned int xer; /* User's integer exception register */ - unsigned int lr; /* Link register */ - unsigned int ctr; /* Count register */ - unsigned int mq; /* MQ register (601 only) */ - - unsigned int vrsave; /* Vector Save Register */ -}; -#endif /* __DARWIN_UNIX03 */ - -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) - -#pragma pack(4) /* Make sure the structure stays as we defined it */ - -#if __DARWIN_UNIX03 -#define _STRUCT_PPC_THREAD_STATE64 struct __darwin_ppc_thread_state64 -_STRUCT_PPC_THREAD_STATE64 -{ - unsigned long long __srr0; /* Instruction address register (PC) */ - unsigned long long __srr1; /* Machine state register (supervisor) */ - unsigned long long __r0; - unsigned long long __r1; - unsigned long long __r2; - unsigned long long __r3; - unsigned long long __r4; - unsigned long long __r5; - unsigned long long __r6; - unsigned long long __r7; - unsigned long long __r8; - unsigned long long __r9; - unsigned long long __r10; - unsigned long long __r11; - unsigned long long __r12; - unsigned 
long long __r13; - unsigned long long __r14; - unsigned long long __r15; - unsigned long long __r16; - unsigned long long __r17; - unsigned long long __r18; - unsigned long long __r19; - unsigned long long __r20; - unsigned long long __r21; - unsigned long long __r22; - unsigned long long __r23; - unsigned long long __r24; - unsigned long long __r25; - unsigned long long __r26; - unsigned long long __r27; - unsigned long long __r28; - unsigned long long __r29; - unsigned long long __r30; - unsigned long long __r31; - - unsigned int __cr; /* Condition register */ - unsigned long long __xer; /* User's integer exception register */ - unsigned long long __lr; /* Link register */ - unsigned long long __ctr; /* Count register */ - - unsigned int __vrsave; /* Vector Save Register */ -}; -#else /* !__DARWIN_UNIX03 */ -#define _STRUCT_PPC_THREAD_STATE64 struct ppc_thread_state64 -_STRUCT_PPC_THREAD_STATE64 -{ - unsigned long long srr0; /* Instruction address register (PC) */ - unsigned long long srr1; /* Machine state register (supervisor) */ - unsigned long long r0; - unsigned long long r1; - unsigned long long r2; - unsigned long long r3; - unsigned long long r4; - unsigned long long r5; - unsigned long long r6; - unsigned long long r7; - unsigned long long r8; - unsigned long long r9; - unsigned long long r10; - unsigned long long r11; - unsigned long long r12; - unsigned long long r13; - unsigned long long r14; - unsigned long long r15; - unsigned long long r16; - unsigned long long r17; - unsigned long long r18; - unsigned long long r19; - unsigned long long r20; - unsigned long long r21; - unsigned long long r22; - unsigned long long r23; - unsigned long long r24; - unsigned long long r25; - unsigned long long r26; - unsigned long long r27; - unsigned long long r28; - unsigned long long r29; - unsigned long long r30; - unsigned long long r31; - - unsigned int cr; /* Condition register */ - unsigned long long xer; /* User's integer exception register */ - unsigned long 
long lr; /* Link register */ - unsigned long long ctr; /* Count register */ - - unsigned int vrsave; /* Vector Save Register */ -}; -#endif /* __DARWIN_UNIX03 */ - -#pragma pack() - -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ - -/* This structure should be double-word aligned for performance */ - -#if __DARWIN_UNIX03 -#define _STRUCT_PPC_FLOAT_STATE struct __darwin_ppc_float_state -_STRUCT_PPC_FLOAT_STATE -{ - double __fpregs[32]; - - unsigned int __fpscr_pad; /* fpscr is 64 bits, 32 bits of rubbish */ - unsigned int __fpscr; /* floating point status register */ -}; -#else /* !__DARWIN_UNIX03 */ -#define _STRUCT_PPC_FLOAT_STATE struct ppc_float_state -_STRUCT_PPC_FLOAT_STATE -{ - double fpregs[32]; - - unsigned int fpscr_pad; /* fpscr is 64 bits, 32 bits of rubbish */ - unsigned int fpscr; /* floating point status register */ -}; -#endif /* __DARWIN_UNIX03 */ - -#pragma pack(4) /* Make sure the structure stays as we defined it */ - -#if __DARWIN_UNIX03 -#define _STRUCT_PPC_VECTOR_STATE struct __darwin_ppc_vector_state -_STRUCT_PPC_VECTOR_STATE -{ -#if defined(__LP64__) - unsigned int __save_vr[32][4]; - unsigned int __save_vscr[4]; -#else - unsigned long __save_vr[32][4]; - unsigned long __save_vscr[4]; -#endif - unsigned int __save_pad5[4]; - unsigned int __save_vrvalid; /* VRs that have been saved */ - unsigned int __save_pad6[7]; -}; -#else /* !__DARWIN_UNIX03 */ -#define _STRUCT_PPC_VECTOR_STATE struct ppc_vector_state -_STRUCT_PPC_VECTOR_STATE -{ -#if defined(__LP64__) - unsigned int save_vr[32][4]; - unsigned int save_vscr[4]; -#else - unsigned long save_vr[32][4]; - unsigned long save_vscr[4]; -#endif - unsigned int save_pad5[4]; - unsigned int save_vrvalid; /* VRs that have been saved */ - unsigned int save_pad6[7]; -}; -#endif /* __DARWIN_UNIX03 */ - -#pragma pack() - -/* - * ppc_exception_state - * - * This structure corresponds to some additional state of the user - * registers as saved in the PCB upon kernel entry. 
They are only - * available if an exception is passed out of the kernel, and even - * then not all are guaranteed to be updated. - * - * Some padding is included in this structure which allows space for - * servers to store temporary values if need be, to maintain binary - * compatiblity. - */ - -/* Exception state for 32-bit thread (on 32-bit processor) */ -/* Still available on 64-bit processors, but may fall short */ -/* of covering the full potential state (hi half available). */ - -#pragma pack(4) /* Make sure the structure stays as we defined it */ - -#if __DARWIN_UNIX03 -#define _STRUCT_PPC_EXCEPTION_STATE struct __darwin_ppc_exception_state -_STRUCT_PPC_EXCEPTION_STATE -{ -#if defined(__LP64__) - unsigned int __dar; /* Fault registers for coredump */ - unsigned int __dsisr; - unsigned int __exception; /* number of powerpc exception taken */ - unsigned int __pad0; /* align to 16 bytes */ - unsigned int __pad1[4]; /* space in PCB "just in case" */ -#else - unsigned long __dar; /* Fault registers for coredump */ - unsigned long __dsisr; - unsigned long __exception; /* number of powerpc exception taken */ - unsigned long __pad0; /* align to 16 bytes */ - unsigned long __pad1[4]; /* space in PCB "just in case" */ -#endif -}; -#else /* !__DARWIN_UNIX03 */ -#define _STRUCT_PPC_EXCEPTION_STATE struct ppc_exception_state -_STRUCT_PPC_EXCEPTION_STATE -{ -#if defined(__LP64__) - unsigned int dar; /* Fault registers for coredump */ - unsigned int dsisr; - unsigned int exception; /* number of powerpc exception taken */ - unsigned int pad0; /* align to 16 bytes */ - unsigned int pad1[4]; /* space in PCB "just in case" */ -#else - unsigned long dar; /* Fault registers for coredump */ - unsigned long dsisr; - unsigned long exception; /* number of powerpc exception taken */ - unsigned long pad0; /* align to 16 bytes */ - unsigned long pad1[4]; /* space in PCB "just in case" */ -#endif -}; -#endif /* __DARWIN_UNIX03 */ - -#if !defined(_POSIX_C_SOURCE) || 
defined(_DARWIN_C_SOURCE) -#if __DARWIN_UNIX03 -#define _STRUCT_PPC_EXCEPTION_STATE64 struct __darwin_ppc_exception_state64 -_STRUCT_PPC_EXCEPTION_STATE64 -{ - unsigned long long __dar; /* Fault registers for coredump */ -#if defined(__LP64__) - unsigned int __dsisr; - unsigned int __exception; /* number of powerpc exception taken */ - unsigned int __pad1[4]; /* space in PCB "just in case" */ -#else - unsigned long __dsisr; - unsigned long __exception; /* number of powerpc exception taken */ - unsigned long __pad1[4]; /* space in PCB "just in case" */ -#endif -}; -#else /* !__DARWIN_UNIX03 */ -#define _STRUCT_PPC_EXCEPTION_STATE64 struct ppc_exception_state64 -_STRUCT_PPC_EXCEPTION_STATE64 -{ - unsigned long long dar; /* Fault registers for coredump */ -#if defined(__LP64__) - unsigned int dsisr; - unsigned int exception; /* number of powerpc exception taken */ - unsigned int pad1[4]; /* space in PCB "just in case" */ -#else - unsigned long dsisr; - unsigned long exception; /* number of powerpc exception taken */ - unsigned long pad1[4]; /* space in PCB "just in case" */ -#endif -}; -#endif /* __DARWIN_UNIX03 */ -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ - -#pragma pack() - -#endif /* _MACH_PPC__STRUCTS_H_ */ diff --git a/osfmk/mach/ppc/_types.h b/osfmk/mach/ppc/_types.h deleted file mode 100644 index fd3cb8f19..000000000 --- a/osfmk/mach/ppc/_types.h +++ /dev/null @@ -1,234 +0,0 @@ -/* - * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -#ifndef _MACH_PPC__TYPES_H_ -#define _MACH_PPC__TYPES_H_ - -#include - -/* - * ppc_thread_state is the structure that is exported to user threads for - * use in status/mutate calls. This structure should never change. 
- * - */ - -#if !__DARWIN_UNIX03 -struct ppc_thread_state -#else /* __DARWIN_UNIX03 */ -struct __darwin_ppc_thread_state -#endif /* __DARWIN_UNIX03 */ -{ - unsigned int srr0; /* Instruction address register (PC) */ - unsigned int srr1; /* Machine state register (supervisor) */ - unsigned int r0; - unsigned int r1; - unsigned int r2; - unsigned int r3; - unsigned int r4; - unsigned int r5; - unsigned int r6; - unsigned int r7; - unsigned int r8; - unsigned int r9; - unsigned int r10; - unsigned int r11; - unsigned int r12; - unsigned int r13; - unsigned int r14; - unsigned int r15; - unsigned int r16; - unsigned int r17; - unsigned int r18; - unsigned int r19; - unsigned int r20; - unsigned int r21; - unsigned int r22; - unsigned int r23; - unsigned int r24; - unsigned int r25; - unsigned int r26; - unsigned int r27; - unsigned int r28; - unsigned int r29; - unsigned int r30; - unsigned int r31; - - unsigned int cr; /* Condition register */ - unsigned int xer; /* User's integer exception register */ - unsigned int lr; /* Link register */ - unsigned int ctr; /* Count register */ - unsigned int mq; /* MQ register (601 only) */ - - unsigned int vrsave; /* Vector Save Register */ -}; - -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -#pragma pack(4) /* Make sure the structure stays as we defined it */ -struct ppc_thread_state64 { - unsigned long long srr0; /* Instruction address register (PC) */ - unsigned long long srr1; /* Machine state register (supervisor) */ - unsigned long long r0; - unsigned long long r1; - unsigned long long r2; - unsigned long long r3; - unsigned long long r4; - unsigned long long r5; - unsigned long long r6; - unsigned long long r7; - unsigned long long r8; - unsigned long long r9; - unsigned long long r10; - unsigned long long r11; - unsigned long long r12; - unsigned long long r13; - unsigned long long r14; - unsigned long long r15; - unsigned long long r16; - unsigned long long r17; - unsigned long long r18; - unsigned long long 
r19; - unsigned long long r20; - unsigned long long r21; - unsigned long long r22; - unsigned long long r23; - unsigned long long r24; - unsigned long long r25; - unsigned long long r26; - unsigned long long r27; - unsigned long long r28; - unsigned long long r29; - unsigned long long r30; - unsigned long long r31; - - unsigned int cr; /* Condition register */ - unsigned long long xer; /* User's integer exception register */ - unsigned long long lr; /* Link register */ - unsigned long long ctr; /* Count register */ - - unsigned int vrsave; /* Vector Save Register */ -}; - -#pragma pack() -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ - -/* This structure should be double-word aligned for performance */ - -#if !__DARWIN_UNIX03 -struct ppc_float_state -#else /* __DARWIN_UNIX03 */ -struct __darwin_ppc_float_state -#endif /* __DARWIN_UNIX03 */ -{ - double fpregs[32]; - - unsigned int fpscr_pad; /* fpscr is 64 bits, 32 bits of rubbish */ - unsigned int fpscr; /* floating point status register */ -}; - -#pragma pack(4) /* Make sure the structure stays as we defined it */ - -#if !__DARWIN_UNIX03 -struct ppc_vector_state -#else /* __DARWIN_UNIX03 */ -struct __darwin_ppc_vector_state -#endif /* __DARWIN_UNIX03 */ -{ -#if defined(__LP64__) - unsigned int save_vr[32][4]; - unsigned int save_vscr[4]; -#else - unsigned long save_vr[32][4]; - unsigned long save_vscr[4]; -#endif - unsigned int save_pad5[4]; - unsigned int save_vrvalid; /* VRs that have been saved */ - unsigned int save_pad6[7]; -}; -#pragma pack() - -/* - * ppc_exception_state - * - * This structure corresponds to some additional state of the user - * registers as saved in the PCB upon kernel entry. They are only - * available if an exception is passed out of the kernel, and even - * then not all are guaranteed to be updated. - * - * Some padding is included in this structure which allows space for - * servers to store temporary values if need be, to maintain binary - * compatiblity. 
- */ - -/* Exception state for 32-bit thread (on 32-bit processor) */ -/* Still available on 64-bit processors, but may fall short */ -/* of covering the full potential state (hi half available). */ - -#pragma pack(4) /* Make sure the structure stays as we defined it */ - -#if !__DARWIN_UNIX03 -struct ppc_exception_state -#else /* __DARWIN_UNIX03 */ -struct __darwin_ppc_exception_state -#endif /* __DARWIN_UNIX03 */ -{ -#if defined(__LP64__) - unsigned int dar; /* Fault registers for coredump */ - unsigned int dsisr; - unsigned int exception; /* number of powerpc exception taken */ - unsigned int pad0; /* align to 16 bytes */ - unsigned int pad1[4]; /* space in PCB "just in case" */ -#else - unsigned long dar; /* Fault registers for coredump */ - unsigned long dsisr; - unsigned long exception; /* number of powerpc exception taken */ - unsigned long pad0; /* align to 16 bytes */ - unsigned long pad1[4]; /* space in PCB "just in case" */ -#endif -}; - -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -struct ppc_exception_state64 { - unsigned long long dar; /* Fault registers for coredump */ -#if defined(__LP64__) - unsigned int dsisr; - unsigned int exception; /* number of powerpc exception taken */ - unsigned int pad1[4]; /* space in PCB "just in case" */ -#else - unsigned long dsisr; - unsigned long exception; /* number of powerpc exception taken */ - unsigned long pad1[4]; /* space in PCB "just in case" */ -#endif -}; -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ - -#pragma pack() - -#endif /* _MACH_PPC__TYPES_H_ */ diff --git a/osfmk/mach/ppc/boolean.h b/osfmk/mach/ppc/boolean.h deleted file mode 100644 index aa3769c9f..000000000 --- a/osfmk/mach/ppc/boolean.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
- * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ - -/* - * File: boolean.h - * - * Boolean type, for ppc. - */ - -#ifndef _MACH_PPC_BOOLEAN_H_ -#define _MACH_PPC_BOOLEAN_H_ - -#if defined(__ppc64__) -typedef unsigned int boolean_t; -#else -typedef int boolean_t; -#endif - -#endif /* _MACH_PPC_BOOLEAN_H_ */ diff --git a/osfmk/mach/ppc/exception.h b/osfmk/mach/ppc/exception.h deleted file mode 100644 index da4e7cb6b..000000000 --- a/osfmk/mach/ppc/exception.h +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Copyright (c) 1990, 1991, 1992, The University of Utah and - * the Center for Software Science at the University of Utah (CSS). - * All rights reserved. - * - * Permission to use, copy, modify and distribute this software is hereby - * granted provided that (1) source code retains these copyright, permission, - * and disclaimer notices, and (2) redistributions including binaries - * reproduce the notices in supporting documentation, and (3) all advertising - * materials mentioning features or use of this software display the following - * acknowledgement: ``This product includes software developed by the Center - * for Software Science at the University of Utah.'' - * - * THE UNIVERSITY OF UTAH AND CSS ALLOW FREE USE OF THIS SOFTWARE IN ITS "AS - * IS" CONDITION. THE UNIVERSITY OF UTAH AND CSS DISCLAIM ANY LIABILITY OF - * ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * CSS requests users of this software to return to css-dist@cs.utah.edu any - * improvements that they make and grant CSS redistribution rights. - * - * Utah $Hdr: $ - */ - -#ifndef _MACH_PPC_EXCEPTION_H_ -#define _MACH_PPC_EXCEPTION_H_ - -#define EXC_TYPES_COUNT 11 /* incl. 
illegal exception 0 */ - -#define EXCEPTION_CODE_MAX 2 /* elements in vector (code+subcode) */ -/* - * EXC_BAD_INSTRUCTION - */ - -#define EXC_PPC_INVALID_SYSCALL 1 /* invalid syscall number */ -#define EXC_PPC_UNIPL_INST 2 /* unimplemented instruction */ -#define EXC_PPC_PRIVINST 3 /* priviledged instruction */ -#define EXC_PPC_PRIVREG 4 /* priviledged register */ -#define EXC_PPC_TRACE 5 /* trace/single-step */ -#define EXC_PPC_PERFMON 6 /* performance monitor */ - -/* - * EXC_BAD_ACCESS - * Note: do not conflict with kern_return_t values returned by vm_fault - */ - -#define EXC_PPC_VM_PROT_READ 0x101 /* error reading syscall args */ -#define EXC_PPC_BADSPACE 0x102 /* bad space referenced */ -#define EXC_PPC_UNALIGNED 0x103 /* unaligned data reference */ - -/* - * EXC_ARITHMETIC - */ - -#define EXC_PPC_OVERFLOW 1 /* integer overflow */ -#define EXC_PPC_ZERO_DIVIDE 2 /* integer divide by zero */ -#define EXC_PPC_FLT_INEXACT 3 /* IEEE inexact exception */ -#define EXC_PPC_FLT_ZERO_DIVIDE 4 /* IEEE zero divide */ -#define EXC_PPC_FLT_UNDERFLOW 5 /* IEEE floating underflow */ -#define EXC_PPC_FLT_OVERFLOW 6 /* IEEE floating overflow */ -#define EXC_PPC_FLT_NOT_A_NUMBER 7 /* IEEE not a number */ - -/* - * EXC_PPC_NOEMULATION should go away when we add software emulation - * for floating point. Right now we don't support this. 
- */ - -#define EXC_PPC_NOEMULATION 8 /* no floating point emulation */ -#define EXC_PPC_ALTIVECASSIST 9 /* Altivec Denorm Assist */ - -/* - * EXC_SOFTWARE - * Note: 0x10000-0x10003 in use for unix signal - */ -#define EXC_PPC_TRAP 1 /* Program trap */ -#define EXC_PPC_MIGRATE 0x10100 /* Time to bolt */ - - -/* - * EXC_BREAKPOINT - */ - -#define EXC_PPC_BREAKPOINT EXC_PPC_TRAP /* breakpoint trap */ - -/* - * machine dependent exception masks - */ -#define EXC_MASK_MACHINE 0 - -#endif /* _MACH_PPC_EXCEPTION_H_ */ diff --git a/osfmk/mach/ppc/kern_return.h b/osfmk/mach/ppc/kern_return.h deleted file mode 100644 index 2c79023f2..000000000 --- a/osfmk/mach/ppc/kern_return.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ - -/* - * File: kern_return.h - * Author: Avadis Tevanian, Jr., Michael Wayne Young - * Date: 1985 - * - * Machine-dependent kernel return definitions. - */ - -#ifndef _MACH_PPC_KERN_RETURN_H_ -#define _MACH_PPC_KERN_RETURN_H_ - -#ifndef ASSEMBLER -typedef int kern_return_t; -#endif /* ASSEMBLER */ - -#endif /* _MACH_PPC_KERN_RETURN_H_ */ diff --git a/osfmk/mach/ppc/machine_types.defs b/osfmk/mach/ppc/machine_types.defs deleted file mode 100644 index f0d5c41f3..000000000 --- a/osfmk/mach/ppc/machine_types.defs +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -/* - * Header file for basic, machine-dependent data types. - */ - -#ifndef _PPC_VM_TYPES_DEFS_ -#define _PPC_VM_TYPES_DEFS_ - -type short = int16_t; -type int = int32_t; -type unsigned = uint32_t; - -type float = MACH_MSG_TYPE_REAL_32; -type double = MACH_MSG_TYPE_REAL_64; - -/* from ISO/IEC 988:1999 spec */ -/* 7.18.1.4 Integer types capable of holding object pointers */ -/* - * The [u]intptr_t types for the native - * integer type, e.g. 32 or 64 or.. whatever - * register size the machine has. They are - * used for entities that might be either - * [unsigned] integers or pointers, and for - * type-casting between the two. 
- * - * For instance, the IPC system represents - * a port in user space as an integer and - * in kernel space as a pointer. - */ -#if defined(__ppc64__) -type uintptr_t = uint64_t; -type intptr_t = int64_t; -#else -type uintptr_t = uint32_t; -type intptr_t = int32_t; -#endif - -/* - * These are the legacy Mach types that are - * the [rough] equivalents of the standards above. - * They were defined in terms of int, not - * long int, so they remain separate. - */ -#if defined(__ppc64__) -type register_t = int64_t; -#else -type register_t = int32_t; -#endif -type integer_t = int32_t; -type natural_t = uint32_t; - -/* - * These are the VM types that scale with the address - * space size of a given process. - */ - -#if defined(__ppc64__) -type vm_address_t = uint64_t; -type vm_offset_t = uint64_t; -type vm_size_t = uint64_t; -#else -type vm_address_t = natural_t; -type vm_offset_t = natural_t; -type vm_size_t = natural_t; -#endif - -/* - * The mach_vm_xxx_t types are sized to hold the - * maximum pointer, offset, etc... supported on the - * platform. - */ -type mach_vm_address_t = uint64_t; -type mach_vm_offset_t = uint64_t; -type mach_vm_size_t = uint64_t; - -#if MACH_IPC_COMPAT -/* - * For the old IPC interface - */ -#define MSG_TYPE_PORT_NAME uint32_t - -#endif /* MACH_IPC_COMPAT */ - -/* - * These are types used internal to Mach to implement the - * legacy 32-bit VM APIs published by the kernel. - */ -#define VM32_SUPPORT 1 - -type vm32_address_t = uint32_t; -type vm32_offset_t = uint32_t; -type vm32_size_t = uint32_t; - -#endif /* _PPC_VM_TYPES_DEFS_ */ - -/* vim: set ft=c : */ diff --git a/osfmk/mach/ppc/ndr_def.h b/osfmk/mach/ppc/ndr_def.h deleted file mode 100644 index cb012b2f4..000000000 --- a/osfmk/mach/ppc/ndr_def.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -#include - -NDR_record_t NDR_record = { - 0, /* mig_reserved */ - 0, /* mig_reserved */ - 0, /* mig_reserved */ - NDR_PROTOCOL_2_0, - NDR_INT_BIG_ENDIAN, - NDR_CHAR_ASCII, - NDR_FLOAT_IEEE, - 0, -}; diff --git a/osfmk/mach/ppc/processor_info.h b/osfmk/mach/ppc/processor_info.h deleted file mode 100644 index 168cb195d..000000000 --- a/osfmk/mach/ppc/processor_info.h +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - * File: mach/ppc/processor_info.h - * - * Data structure definitions for ppc specific processor control - */ - -#ifndef _MACH_PPC_PROCESSOR_INFO_H_ -#define _MACH_PPC_PROCESSOR_INFO_H_ - -#include -#include - -#ifdef PRIVATE - -/* processor_control command operations */ -#define PROCESSOR_PM_SET_REGS 1 /* Set Performance Monitor Registers */ -#define PROCESSOR_PM_SET_MMCR 2 /* Set Monitor Mode Controls Registers */ -#define PROCESSOR_PM_CLR_PMC 3 /* Clear Performance Monitor Counter Registers */ - -/* - * Performance Monitor Register structures - * - * XXX - These have not been updated for ppc64. 
- */ - -typedef union { - unsigned int word; - struct { - unsigned int dis : 1; - unsigned int dp : 1; - unsigned int du : 1; - unsigned int dms : 1; - unsigned int dmr : 1; - unsigned int reserved3 : 1; /* enint */ - unsigned int reserved4 : 1; /* discount */ - unsigned int reserved5 : 2; /* rtcselect */ - unsigned int reserved6 : 1; /* intonbittrans */ - unsigned int threshold : 6; - unsigned int reserved7 : 1; /* pmc1intcontrol */ - unsigned int reserved8 : 1; /* pmcintcontrol */ - unsigned int reserved9 : 1; /* pmctrigger */ - unsigned int pmc1select : 7; - unsigned int pmc2select : 6; - }bits; -}mmcr0_t; - -typedef union { - unsigned int word; - struct { - unsigned int pmc3select : 5; - unsigned int pmc4select : 5; - unsigned int reserved : 22; - }bits; -}mmcr1_t; - -typedef union { - unsigned int word; - struct { - unsigned int threshmult : 1; - unsigned int reserved : 31; - }bits; -}mmcr2_t; - -typedef union { - unsigned int word; - struct { - unsigned int ov : 1; /* overflow value */ - unsigned int cv : 31; /* countervalue */ - }bits; -}pmcn_t; - - - -/* Processor Performance Monitor Registers definitions */ - -struct processor_pm_regs { - union { - mmcr0_t mmcr0; - mmcr1_t mmcr1; - mmcr2_t mmcr2; - }u; - pmcn_t pmc[2]; -}; - -typedef struct processor_pm_regs processor_pm_regs_data_t; -typedef struct processor_pm_regs *processor_pm_regs_t; -#define PROCESSOR_PM_REGS_COUNT ((mach_msg_type_number_t) \ - (sizeof(processor_pm_regs_data_t) / sizeof (unsigned int))) - -#define PROCESSOR_PM_REGS_COUNT_POWERPC_750 \ - (PROCESSOR_PM_REGS_COUNT * 2 ) - -#define PROCESSOR_PM_REGS_COUNT_POWERPC_7400 \ - (PROCESSOR_PM_REGS_COUNT * 3 ) - -union processor_control_data { - processor_pm_regs_data_t cmd_pm_regs[3]; -}; - -struct processor_control_cmd { - integer_t cmd_op; - cpu_type_t cmd_cpu_type; - cpu_subtype_t cmd_cpu_subtype; - union processor_control_data u; -}; - -typedef struct processor_control_cmd processor_control_cmd_data_t; -typedef struct processor_control_cmd 
*processor_control_cmd_t; -#define cmd_pm_regs u.cmd_pm_regs; -#define cmd_pm_ctls u.cmd_pm_ctls; - -#define PROCESSOR_CONTROL_CMD_COUNT ((mach_msg_type_number_t) \ - (((sizeof(processor_control_cmd_data_t)) - \ - (sizeof(union processor_control_data))) / sizeof (integer_t))) - - /* x should be a processor_pm_regs_t */ -#define PERFMON_MMCR0(x) ((x)[0].u.mmcr0.word) -#define PERFMON_PMC1(x) ((x)[0].pmc[0].word) -#define PERFMON_PMC2(x) ((x)[0].pmc[1].word) -#define PERFMON_MMCR1(x) ((x)[1].u.mmcr1.word) -#define PERFMON_PMC3(x) ((x)[1].pmc[0].word) -#define PERFMON_PMC4(x) ((x)[1].pmc[1].word) -#define PERFMON_MMCR2(x) ((x)[2].u.mmcr2.word) - -#define PERFMON_DIS(x) ((x)[0].u.mmcr0.bits.dis) -#define PERFMON_DP(x) ((x)[0].u.mmcr0.bits.dp) -#define PERFMON_DU(x) ((x)[0].u.mmcr0.bits.du) -#define PERFMON_DMS(x) ((x)[0].u.mmcr0.bits.dms) -#define PERFMON_DMR(x) ((x)[0].u.mmcr0.bits.dmr) -#define PERFMON_THRESHOLD(x) ((x)[0].u.mmcr0.bits.threshold) -#define PERFMON_PMC1SELECT(x) ((x)[0].u.mmcr0.bits.pmc1select) -#define PERFMON_PMC2SELECT(x) ((x)[0].u.mmcr0.bits.pmc2select) -#define PERFMON_PMC3SELECT(x) ((x)[1].u.mmcr1.bits.pmc3select) -#define PERFMON_PMC4SELECT(x) ((x)[1].u.mmcr1.bits.pmc4select) -#define PERFMON_THRESHMULT(x) ((x)[2].u.mmcr2.bits.threshmult) -#define PERFMON_PMC1_CV(x) ((x)[0].u.pmc[0].bits.cv) -#define PERFMON_PMC2_CV(x) ((x)[0].u.pmc[1].bits.cv) -#define PERFMON_PMC3_CV(x) ((x)[1].u.pmc[0].bits.cv) -#define PERFMON_PMC4_CV(x) ((x)[1].u.pmc[1].bits.cv) - -typedef unsigned int processor_temperature_data_t; -typedef unsigned int *processor_temperature_t; - -#define PROCESSOR_TEMPERATURE_COUNT 1 - -#endif /* PRIVATE */ - -#endif /* _MACH_PPC_PROCESSOR_INFO_H_ */ diff --git a/osfmk/mach/ppc/sdt_isa.h b/osfmk/mach/ppc/sdt_isa.h deleted file mode 100644 index 558a12406..000000000 --- a/osfmk/mach/ppc/sdt_isa.h +++ /dev/null @@ -1,427 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common 
Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _MACH_PPC_SDT_ISA_H -#define _MACH_PPC_SDT_ISA_H - -/* #pragma ident "@(#)sdt.h 1.7 05/06/08 SMI" */ - -/* - * Only define when testing. This makes the calls into actual calls to - * test functions. 
- */ -/* #define DTRACE_CALL_TEST */ - -#define DTRACE_STRINGIFY(s) #s -#define DTRACE_TOSTRING(s) DTRACE_STRINGIFY(s) - -#if defined(KERNEL) -/* - * For the kernel, set an explicit global label so the symbol can be located - */ -#define DTRACE_LAB(p, n) \ - "__dtrace_probe$" DTRACE_TOSTRING(__LINE__) DTRACE_STRINGIFY(_##p##___##n) -#define DTRACE_LABEL(p, n) \ - ".section __DATA, __data\n\t" \ - ".globl " DTRACE_LAB(p, n) "\n\t" \ - DTRACE_LAB(p, n) ":" ".long 1f""\n\t" \ - ".text" "\n\t" \ - "1:" -#else /* !KERNEL */ -#define DTRACE_LABEL(p, n) \ - "__dtrace_probe$" DTRACE_TOSTRING(__LINE__) DTRACE_STRINGIFY(_##p##___##n) ":" "\n\t" -#endif /* !KERNEL */ - -#ifdef DTRACE_CALL_TEST - -#define DTRACE_CALL(p,n) \ - DTRACE_LABEL(p,n) \ - DTRACE_CALL_INSN(p,n) - -#else /* !DTRACE_CALL_TEST */ - -#define DTRACE_CALL(p,n) \ - DTRACE_LABEL(p,n) \ - DTRACE_NOPS - -#endif /* !DTRACE_CALL_TEST */ - -#ifdef __ppc__ - -#define DTRACE_NOPS \ - "nop" "\n\t" - -#define DTRACE_CALL_INSN(p,n) \ - "bl _dtracetest" DTRACE_STRINGIFY(_##p##_##n) "\n\t" - -#define ARG1_EXTENT 1 -#define ARGS2_EXTENT 2 -#define ARGS3_EXTENT 3 -#define ARGS4_EXTENT 4 -#define ARGS5_EXTENT 5 -#define ARGS6_EXTENT 6 -#define ARGS7_EXTENT 7 -#define ARGS8_EXTENT 8 -#define ARGS9_EXTENT 9 -#define ARGS10_EXTENT 10 - -#define DTRACE_CALL0ARGS(provider, name) \ - asm volatile ( \ - DTRACE_CALL(provider, name) \ - "# eat trailing nl+tab from DTRACE_CALL" \ - : \ - : \ - ); - -#define DTRACE_CALL1ARG(provider, name) \ - asm volatile ("subi r1,r1,0x20" "\n\t" \ - "lwz r3,0x0(%0)" "\n\t" \ - DTRACE_CALL(provider, name) \ - "addi r1,r1,0x20" \ - : \ - : "b" (__dtrace_args) \ - : "memory", "r3" \ - ); - -#define DTRACE_CALL2ARGS(provider, name) \ - asm volatile ("subi r1,r1,0x20" "\n\t" \ - "lwz r3,0x0(%0)" "\n\t" \ - "lwz r4,0x4(%0)" "\n\t" \ - DTRACE_CALL(provider, name) \ - "addi r1,r1,0x20" \ - : \ - : "b" (__dtrace_args) \ - : "memory", "r3", "r4" \ - ); - -#define DTRACE_CALL3ARGS(provider, name) \ - asm 
volatile ("subi r1,r1,0x30" "\n\t" \ - "lwz r3,0x0(%0)" "\n\t" \ - "lwz r4,0x4(%0)" "\n\t" \ - "lwz r5,0x8(%0)" "\n\t" \ - DTRACE_CALL(provider, name) \ - "addi r1,r1,0x30" \ - : \ - : "b" (__dtrace_args) \ - : "memory", "r3", "r4", "r5" \ - ); - -#define DTRACE_CALL4ARGS(provider, name) \ - asm volatile ("subi r1,r1,0x30" "\n\t" \ - "lwz r3,0x0(%0)" "\n\t" \ - "lwz r4,0x4(%0)" "\n\t" \ - "lwz r5,0x8(%0)" "\n\t" \ - "lwz r6,0xc(%0)" "\n\t" \ - DTRACE_CALL(provider, name) \ - "addi r1,r1,0x30" \ - : \ - : "b" (__dtrace_args) \ - : "memory", "r3", "r4", "r5", "r6" \ - ); - -#define DTRACE_CALL5ARGS(provider, name) \ - asm volatile ("subi r1,r1,0x30" "\n\t" \ - "lwz r3,0x0(%0)" "\n\t" \ - "lwz r4,0x4(%0)" "\n\t" \ - "lwz r5,0x8(%0)" "\n\t" \ - "lwz r6,0xc(%0)" "\n\t" \ - "lwz r7,0x10(%0)" "\n\t" \ - DTRACE_CALL(provider, name) \ - "addi r1,r1,0x30" \ - : \ - : "b" (__dtrace_args) \ - : "memory", "r3", "r4", "r5", "r6", "r7" \ - ); - -#define DTRACE_CALL6ARGS(provider, name) \ - asm volatile ("subi r1,r1,0x30" "\n\t" \ - "lwz r3,0x0(%0)" "\n\t" \ - "lwz r4,0x4(%0)" "\n\t" \ - "lwz r5,0x8(%0)" "\n\t" \ - "lwz r6,0xc(%0)" "\n\t" \ - "lwz r7,0x10(%0)" "\n\t" \ - "lwz r8,0x14(%0)" "\n\t" \ - DTRACE_CALL(provider, name) \ - "addi r1,r1,0x30" \ - : \ - : "b" (__dtrace_args) \ - : "memory", "r3", "r4", "r5", "r6", "r7", "r8" \ - ); - -#define DTRACE_CALL7ARGS(provider, name) \ - asm volatile ("subi r1,r1,0x40" "\n\t" \ - "lwz r3,0x0(%0)" "\n\t" \ - "lwz r4,0x4(%0)" "\n\t" \ - "lwz r5,0x8(%0)" "\n\t" \ - "lwz r6,0xc(%0)" "\n\t" \ - "lwz r7,0x10(%0)" "\n\t" \ - "lwz r8,0x14(%0)" "\n\t" \ - "lwz r9,0x18(%0)" "\n\t" \ - DTRACE_CALL(provider, name) \ - "addi r1,r1,0x40" \ - : \ - : "b" (__dtrace_args) \ - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9" \ - ); - -#define DTRACE_CALL8ARGS(provider, name) \ - asm volatile ("subi r1,r1,0x40" "\n\t" \ - "lwz r3,0x0(%0)" "\n\t" \ - "lwz r4,0x4(%0)" "\n\t" \ - "lwz r5,0x8(%0)" "\n\t" \ - "lwz r6,0xc(%0)" "\n\t" \ - "lwz r7,0x10(%0)" 
"\n\t" \ - "lwz r8,0x14(%0)" "\n\t" \ - "lwz r9,0x18(%0)" "\n\t" \ - "lwz r10,0x1c(%0)" "\n\t" \ - DTRACE_CALL(provider, name) \ - "addi r1,r1,0x40" \ - : \ - : "b" (__dtrace_args) \ - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" \ - ); - -#define DTRACE_CALL9ARGS(provider, name) \ - asm volatile ("subi r1,r1,0x40" "\n\t" \ - "lwz r3,0x0(%0)" "\n\t" \ - "lwz r4,0x4(%0)" "\n\t" \ - "lwz r5,0x8(%0)" "\n\t" \ - "lwz r6,0xc(%0)" "\n\t" \ - "lwz r7,0x10(%0)" "\n\t" \ - "lwz r8,0x14(%0)" "\n\t" \ - "lwz r9,0x18(%0)" "\n\t" \ - "lwz r10,0x1c(%0)" "\n\t" \ - "lwz r11,0x20(%0)" "\n\t" \ - "stw r11,0x38(r1)" "\n\t" \ - DTRACE_CALL(provider, name) \ - "addi r1,r1,0x40" \ - : \ - : "b" (__dtrace_args) \ - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" \ - ); - -#define DTRACE_CALL10ARGS(provider, name) \ - asm volatile ("subi r1,r1,0x40" "\n\t" \ - "lwz r3,0x0(%0)" "\n\t" \ - "lwz r4,0x4(%0)" "\n\t" \ - "lwz r5,0x8(%0)" "\n\t" \ - "lwz r6,0xc(%0)" "\n\t" \ - "lwz r7,0x10(%0)" "\n\t" \ - "lwz r8,0x14(%0)" "\n\t" \ - "lwz r9,0x18(%0)" "\n\t" \ - "lwz r10,0x1c(%0)" "\n\t" \ - "lwz r11,0x20(%0)" "\n\t" \ - "lwz r12,0x24(%0)" "\n\t" \ - "stw r11,0x38(r1)" "\n\t" \ - "stw r12,0x3c(r1)" "\n\t" \ - DTRACE_CALL(provider, name) \ - "addi r1,r1,0x40" \ - : \ - : "b" (__dtrace_args) \ - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" \ - ); - -#endif // __ppc__ - -#ifdef __ppc64__ - -#define DTRACE_NOPS \ - "nop" "\n\t" - -#define DTRACE_CALL_INSN(p,n) \ - "bl _dtracetest" DTRACE_STRINGIFY(_##p##_##n) "\n\t" - -#define ARG1_EXTENT 1 -#define ARGS2_EXTENT 2 -#define ARGS3_EXTENT 3 -#define ARGS4_EXTENT 4 -#define ARGS5_EXTENT 5 -#define ARGS6_EXTENT 6 -#define ARGS7_EXTENT 7 -#define ARGS8_EXTENT 8 -#define ARGS9_EXTENT 9 -#define ARGS10_EXTENT 10 - -#define DTRACE_CALL0ARGS(provider, name) \ - asm volatile ("subi r1,r1,0x30" "\n\t" \ - DTRACE_CALL(provider, name) \ - "addi r1,r1,0x30" \ - : \ - : \ - : \ - ); - -#define 
DTRACE_CALL1ARG(provider, name) \ - asm volatile ("ld r3,0x0(%0)" "\n\t" \ - "subi r1,r1,0x38" "\n\t" \ - DTRACE_CALL(provider, name) \ - "addi r1,r1,0x38" \ - : \ - : "b" (__dtrace_args) \ - : "memory", "r3" \ - ); - -#define DTRACE_CALL2ARGS(provider, name) \ - asm volatile ("subi r1,r1,0x40" "\n\t" \ - "ld r3,0x0(%0)" "\n\t" \ - "ld r4,0x8(%0)" "\n\t" \ - DTRACE_CALL(provider, name) \ - "addi r1,r1,0x40" \ - : \ - : "b" (__dtrace_args) \ - : "memory", "r3", "r4" \ - ); - -#define DTRACE_CALL3ARGS(provider, name) \ - asm volatile ("subi r1,r1,0x48" "\n\t" \ - "ld r3,0x0(%0)" "\n\t" \ - "ld r4,0x8(%0)" "\n\t" \ - "ld r5,0x10(%0)" "\n\t" \ - DTRACE_CALL(provider, name) \ - "addi r1,r1,0x48" \ - : \ - : "b" (__dtrace_args) \ - : "memory", "r3", "r4", "r5" \ - ); - -#define DTRACE_CALL4ARGS(provider, name) \ - asm volatile ("subi r1,r1,0x50" "\n\t" \ - "ld r3,0x0(%0)" "\n\t" \ - "ld r4,0x8(%0)" "\n\t" \ - "ld r5,0x10(%0)" "\n\t" \ - "ld r6,0x18(%0)" "\n\t" \ - DTRACE_CALL(provider, name) \ - "addi r1,r1,0x50" \ - : \ - : "b" (__dtrace_args) \ - : "memory", "r3", "r4", "r5", "r6" \ - ); - -#define DTRACE_CALL5ARGS(provider, name) \ - asm volatile ("subi r1,r1,0x58" "\n\t" \ - "ld r3,0x0(%0)" "\n\t" \ - "ld r4,0x8(%0)" "\n\t" \ - "ld r5,0x10(%0)" "\n\t" \ - "ld r6,0x18(%0)" "\n\t" \ - "ld r7,0x20(%0)" "\n\t" \ - DTRACE_CALL(provider, name) \ - "addi r1,r1,0x58" \ - : \ - : "b" (__dtrace_args) \ - : "memory", "r3", "r4", "r5", "r6", "r7" \ - ); - -#define DTRACE_CALL6ARGS(provider, name) \ - asm volatile ("subi r1,r1,0x60" "\n\t" \ - "ld r3,0x0(%0)" "\n\t" \ - "ld r4,0x8(%0)" "\n\t" \ - "ld r5,0x10(%0)" "\n\t" \ - "ld r6,0x18(%0)" "\n\t" \ - "ld r7,0x20(%0)" "\n\t" \ - "ld r8,0x28(%0)" "\n\t" \ - DTRACE_CALL(provider, name) \ - "addi r1,r1,0x60" \ - : \ - : "b" (__dtrace_args) \ - : "memory", "r3", "r4", "r5", "r6", "r7", "r8" \ - ); - -#define DTRACE_CALL7ARGS(provider, name) \ - asm volatile ("subi r1,r1,0x68" "\n\t" \ - "ld r3,0x0(%0)" "\n\t" \ - "ld r4,0x8(%0)" 
"\n\t" \ - "ld r5,0x10(%0)" "\n\t" \ - "ld r6,0x18(%0)" "\n\t" \ - "ld r7,0x20(%0)" "\n\t" \ - "ld r8,0x28(%0)" "\n\t" \ - "ld r9,0x30(%0)" "\n\t" \ - DTRACE_CALL(provider, name) \ - "addi r1,r1,0x68" \ - : \ - : "b" (__dtrace_args) \ - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9" \ - ); - -#define DTRACE_CALL8ARGS(provider, name) \ - asm volatile ("subi r1,r1,0x70" "\n\t" \ - "ld r3,0x0(%0)" "\n\t" \ - "ld r4,0x8(%0)" "\n\t" \ - "ld r5,0x10(%0)" "\n\t" \ - "ld r6,0x18(%0)" "\n\t" \ - "ld r7,0x20(%0)" "\n\t" \ - "ld r8,0x28(%0)" "\n\t" \ - "ld r9,0x30(%0)" "\n\t" \ - "ld r10,0x38(%0)" "\n\t" \ - DTRACE_CALL(provider, name) \ - "addi r1,r1,0x70" \ - : \ - : "b" (__dtrace_args) \ - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" \ - ); - -#define DTRACE_CALL9ARGS(provider, name) \ - asm volatile ("subi r1,r1,0x78" "\n\t" \ - "ld r3,0x0(%0)" "\n\t" \ - "ld r4,0x8(%0)" "\n\t" \ - "ld r5,0x10(%0)" "\n\t" \ - "ld r6,0x18(%0)" "\n\t" \ - "ld r7,0x20(%0)" "\n\t" \ - "ld r8,0x28(%0)" "\n\t" \ - "ld r9,0x30(%0)" "\n\t" \ - "ld r10,0x38(%0)" "\n\t" \ - "ld r11,0x40(%0)" "\n\t" \ - "std r11,0x70(r1)" "\n\t" \ - DTRACE_CALL(provider, name) \ - "addi r1,r1,0x78" \ - : \ - : "b" (__dtrace_args) \ - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" \ - ); - -#define DTRACE_CALL10ARGS(provider, name) \ - asm volatile ("subi r1,r1,0x80" "\n\t" \ - "ld r3,0x0(%0)" "\n\t" \ - "ld r4,0x8(%0)" "\n\t" \ - "ld r5,0x10(%0)" "\n\t" \ - "ld r6,0x18(%0)" "\n\t" \ - "ld r7,0x20(%0)" "\n\t" \ - "ld r8,0x28(%0)" "\n\t" \ - "ld r9,0x30(%0)" "\n\t" \ - "ld r10,0x38(%0)" "\n\t" \ - "ld r11,0x40(%0)" "\n\t" \ - "ld r12,0x48(%0)" "\n\t" \ - "std r11,0x70(r1)" "\n\t" \ - "std r12,0x78(r1)" "\n\t" \ - DTRACE_CALL(provider, name) \ - "addi r1,r1,0x80" \ - : \ - : "b" (__dtrace_args) \ - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" \ - ); - -#endif // __ppc64__ - -#endif /* _MACH_PPC_SDT_ISA_H */ diff --git a/osfmk/mach/ppc/syscall_sw.h 
b/osfmk/mach/ppc/syscall_sw.h deleted file mode 100644 index 335ff9e21..000000000 --- a/osfmk/mach/ppc/syscall_sw.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -#ifdef PRIVATE - -#ifndef _MACH_PPC_SYSCALL_SW_H_ -#define _MACH_PPC_SYSCALL_SW_H_ - -#include - -#define kernel_trap(trap_name,trap_number,number_args) \ -ENTRY(trap_name, TAG_NO_FRAME_USED) @\ - li r0, trap_number @\ - sc @\ - blr - -#define ppc_trap(trap_name,trap_number) \ -ENTRY(trap_name, TAG_NO_FRAME_USED) @\ - li r0, trap_number @\ - sc @\ - blr - -/* - * Put any definitions for PPC-only system calls in here (only if - * this file is being included from the one that instantiates the - * mach system calls). - * - * Note: PPC-only system calls are in the 0x6000 to 0x6FFF range - */ - -#ifdef _MACH_SYSCALL_SW_H_ - -ppc_trap(diagCall,0x6000) -ppc_trap(vmm_get_version,0x6001) -ppc_trap(vmm_get_features,0x6002) -ppc_trap(vmm_init_context,0x6003) -ppc_trap(vmm_dispatch,0x6004) -ppc_trap(bb_enable_bluebox,0x6005) -ppc_trap(bb_disable_bluebox,0x6006) -ppc_trap(bb_settaskenv,0x6007) -ppc_trap(vmm_stop_vm,0x6008) -ppc_trap(CHUDCall,0x6009) -ppc_trap(ppcNull,0x600A) -ppc_trap(perfmon_control,0x600B) -ppc_trap(ppcNullinst,0x600C) -ppc_trap(pmsCPUCntrl,0x600D) -#endif /* _MACH_SYSCALL_SW_H_ */ - -#endif /* _MACH_PPC_SYSCALL_SW_H_ */ - -#endif /* PRIVATE */ diff --git a/osfmk/mach/ppc/thread_status.h b/osfmk/mach/ppc/thread_status.h deleted file mode 100644 index ba077f74d..000000000 --- a/osfmk/mach/ppc/thread_status.h +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -#ifndef _MACH_PPC_THREAD_STATUS_H_ -#define _MACH_PPC_THREAD_STATUS_H_ - -#include -#include - -/* - * ppc_thread_state is the structure that is exported to user threads for - * use in status/mutate calls. This structure should never change. - * - */ - -#define PPC_THREAD_STATE 1 -#define PPC_FLOAT_STATE 2 -#define PPC_EXCEPTION_STATE 3 -#define PPC_VECTOR_STATE 4 -#define PPC_THREAD_STATE64 5 -#define PPC_EXCEPTION_STATE64 6 -#define THREAD_STATE_NONE 7 - -/* - * VALID_THREAD_STATE_FLAVOR is a platform specific macro that when passed - * an exception flavor will return whether that is a defined flavor for - * that platform. - * The macro must be manually updated to include all of the valid exception - * flavors as defined above. 
- */ -#define VALID_THREAD_STATE_FLAVOR(x) \ - ((x == PPC_THREAD_STATE) || \ - (x == PPC_FLOAT_STATE) || \ - (x == PPC_EXCEPTION_STATE) || \ - (x == PPC_VECTOR_STATE) || \ - (x == PPC_THREAD_STATE64) || \ - (x == PPC_EXCEPTION_STATE64) || \ - (x == THREAD_STATE_NONE)) - -typedef _STRUCT_PPC_THREAD_STATE ppc_thread_state_t; -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -typedef _STRUCT_PPC_THREAD_STATE64 ppc_thread_state64_t; -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ -typedef _STRUCT_PPC_FLOAT_STATE ppc_float_state_t; -typedef _STRUCT_PPC_VECTOR_STATE ppc_vector_state_t; - -/* - * saved state structure - * - * This structure corresponds to the saved state. - * - */ - -#ifdef MACH__POSIX_C_SOURCE_PRIVATE - -#include - -typedef struct savearea ppc_saved_state_t; - -#else /* MACH__POSIX_C_SOURCE_PRIVATE */ - -typedef struct ppc_thread_state ppc_saved_state_t; - -#endif /* MACH__POSIX_C_SOURCE_PRIVATE */ - -/* - * ppc_exception_state - * - * This structure corresponds to some additional state of the user - * registers as saved in the PCB upon kernel entry. They are only - * available if an exception is passed out of the kernel, and even - * then not all are guaranteed to be updated. - * - * Some padding is included in this structure which allows space for - * servers to store temporary values if need be, to maintain binary - * compatiblity. - */ - -/* Exception state for 32-bit thread (on 32-bit processor) */ -/* Still available on 64-bit processors, but may fall short */ -/* of covering the full potential state (hi half available). 
*/ - -typedef _STRUCT_PPC_EXCEPTION_STATE ppc_exception_state_t; -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -typedef _STRUCT_PPC_EXCEPTION_STATE64 ppc_exception_state64_t; -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ - -/* - * Save State Flags - */ - -#define PPC_THREAD_STATE_COUNT ((mach_msg_type_number_t) \ - (sizeof(ppc_thread_state_t) / sizeof(int))) - -#define PPC_THREAD_STATE64_COUNT ((mach_msg_type_number_t) \ - (sizeof(ppc_thread_state64_t) / sizeof(int))) - -#define PPC_EXCEPTION_STATE_COUNT ((mach_msg_type_number_t) \ - (sizeof(ppc_exception_state_t) / sizeof(int))) - -#define PPC_EXCEPTION_STATE64_COUNT ((mach_msg_type_number_t) \ - (sizeof(ppc_exception_state64_t) / sizeof(int))) - -#define PPC_FLOAT_STATE_COUNT ((mach_msg_type_number_t) \ - (sizeof(ppc_float_state_t) / sizeof(int))) - -#define PPC_VECTOR_STATE_COUNT ((mach_msg_type_number_t) \ - (sizeof(ppc_vector_state_t) / sizeof(int))) - -/* - * Machine-independent way for servers and Mach's exception mechanism to - * choose the most efficient state flavor for exception RPC's: - */ -#define MACHINE_THREAD_STATE PPC_THREAD_STATE -#define MACHINE_THREAD_STATE_COUNT PPC_THREAD_STATE_COUNT - -/* - * Largest state on this machine: - */ -#define THREAD_MACHINE_STATE_MAX THREAD_STATE_MAX - -#endif /* _MACH_PPC_THREAD_STATUS_H_ */ diff --git a/osfmk/mach/ppc/vm_param.h b/osfmk/mach/ppc/vm_param.h deleted file mode 100644 index af3a94262..000000000 --- a/osfmk/mach/ppc/vm_param.h +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -#ifndef _MACH_PPC_VM_PARAM_H_ -#define _MACH_PPC_VM_PARAM_H_ - -/* - * These are the global definitions - */ - -#define BYTE_SIZE 8 /* byte size in bits */ - -#define PPC_PGBYTES 4096 /* bytes per ppc page */ -#define PPC_PGSHIFT 12 /* number of bits to shift for pages */ - -#define PAGE_SIZE PPC_PGBYTES -#define PAGE_SHIFT PPC_PGSHIFT -#define PAGE_MASK (PAGE_SIZE - 1) - -#if 0 -#define VM_MAX_PAGE_ADDRESS 0xFFFFFFFFFFFFF000ULL -#else -/* - * LP64todo - For now, we are limited to 51-bits of user addressing - */ -#define VM_MAX_PAGE_ADDRESS 0x0007FFFFFFFFF000ULL -#endif - -#define MACH_VM_MIN_ADDRESS ((mach_vm_offset_t) 0) -#define MACH_VM_MAX_ADDRESS ((mach_vm_offset_t) VM_MAX_PAGE_ADDRESS) - -/* - * These are the values relative to the local process. - */ -#if defined (__ppc64__) -/* - * LP64todo - We don't have the 64-bit address space layout yet. - * Use the 32-bit stack layout for now. 
- */ -#define VM_MIN_ADDRESS ((vm_offset_t) MACH_VM_MIN_ADDRESS) -#define VM_MAX_ADDRESS ((vm_offset_t) MACH_VM_MAX_ADDRESS) -#define USER_STACK_END ((vm_offset_t) 0x00000000ffff0000ULL) -#else -#define VM_MIN_ADDRESS ((vm_offset_t) 0) -#define VM_MAX_ADDRESS ((vm_offset_t) (VM_MAX_PAGE_ADDRESS & 0xFFFFFFFF)) -#define USER_STACK_END ((vm_offset_t) 0xffff0000U) -#endif /* defined(__ppc64__) */ - -#ifdef KERNEL_PRIVATE - -/* Kernel-wide values */ -#define VM_MIN_KERNEL_ADDRESS ((vm_offset_t) 0x00001000U) -#define VM_MIN_KERNEL_AND_KEXT_ADDRESS VM_MIN_KERNEL_ADDRESS -#define VM_MAX_KERNEL_ADDRESS ((vm_offset_t) 0xDFFFFFFFU) -#define KERNEL_STACK_SIZE (4 * PPC_PGBYTES) -#define INTSTACK_SIZE (5 * PPC_PGBYTES) - -#define VM_MAP_MIN_ADDRESS MACH_VM_MIN_ADDRESS -#define VM_MAP_MAX_ADDRESS MACH_VM_MAX_ADDRESS - -#ifdef MACH_KERNEL_PRIVATE - -/* For implementing legacy 32-bit interfaces */ -#define VM32_SUPPORT 1 -#define VM32_MIN_ADDRESS ((vm32_offset_t) 0) -#define VM32_MAX_ADDRESS ((vm32_offset_t) (VM_MAX_PAGE_ADDRESS & 0xFFFFFFFF)) - - -#define PMAP_ENTER_OPTIONS(pmap, virtual_address, page, protection, \ - flags, wired, options, result) \ - MACRO_BEGIN \ - result=KERN_SUCCESS; \ - PMAP_ENTER(pmap, virtual_address, page, protection, \ - flags, wired); \ - MACRO_END - - -#endif /* MACH_KERNEL_PRIVATE */ - -#endif /* KERNEL_PRIVATE */ - -#endif /* _MACH_PPC_VM_PARAM_H_ */ diff --git a/osfmk/mach/ppc/vm_types.h b/osfmk/mach/ppc/vm_types.h deleted file mode 100644 index 0b3d39485..000000000 --- a/osfmk/mach/ppc/vm_types.h +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
- * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ - -/* - * File: vm_types.h - * Author: Avadis Tevanian, Jr. - * Date: 1985 - * - * Header file for VM data types. PPC version. - */ - -#ifndef _MACH_PPC_VM_TYPES_H_ -#define _MACH_PPC_VM_TYPES_H_ - -#ifndef ASSEMBLER - -#include -#include -#include - -/* - * natural_t and integer_t are Mach's legacy types for machine- - * independent integer types (unsigned, and signed, respectively). - * Their original purpose was to define other types in a machine/ - * compiler independent way. - * - * They also had an implicit "same size as pointer" characteristic - * to them (i.e. Mach's traditional types are very ILP32 or ILP64 - * centric). We support PowerPC ABIs that do not follow either of - * these models (specifically LP64). Therefore, we had to make a - * choice between making these types scale with pointers or stay - * tied to integers. Because their use is predominantly tied to - * to the size of an integer, we are keeping that association and - * breaking free from pointer size guarantees. - * - * New use of these types is discouraged. - */ -typedef __darwin_natural_t natural_t; -typedef int integer_t; - -#if defined(__ppc__) - -/* - * For 32-bit PowerPC ABIs, the scalable types were - * always based upon natural_t (unsigned int). - * Because of potential legacy issues with name mangling, - * we cannot use the stdint uintptr_t type. - */ -typedef natural_t vm_offset_t; -typedef natural_t vm_size_t; - -#else /* __ppc64__ */ - -/* - * For 64-bit PowerPC ABIs, we have no legacy name mangling - * issues, so we use the stdint types for scaling these - * types to the same size as a pointer. 
- */ -typedef uintptr_t vm_offset_t; -typedef uintptr_t vm_size_t; - -#endif - -/* - * This new type is independent of a particular vm map's - * implementation size - and represents appropriate types - * for all possible maps. This is used for interfaces - * where the size of the map is not known - or we don't - * want to have to distinguish. - */ -typedef uint64_t mach_vm_address_t; -typedef uint64_t mach_vm_offset_t; -typedef uint64_t mach_vm_size_t; - -typedef uint64_t vm_map_offset_t; -typedef uint64_t vm_map_address_t; -typedef uint64_t vm_map_size_t; - -#ifdef MACH_KERNEL_PRIVATE - -#if VM32_SUPPORT - -/* - * These are types used internal to Mach to implement the - * legacy 32-bit VM APIs published by the kernel. - */ -typedef uint32_t vm32_address_t; -typedef uint32_t vm32_offset_t; -typedef uint32_t vm32_size_t; - -#endif /* VM32_SUPPORT */ - -#endif /* MACH_KERNEL_PRIVATE */ - -#endif /* ASSEMBLER */ - -/* - * If composing messages by hand (please do not) - */ -#define MACH_MSG_TYPE_INTEGER_T MACH_MSG_TYPE_INTEGER_32 - -#endif /* _MACH_PPC_VM_TYPES_H_ */ diff --git a/osfmk/mach/processor.defs b/osfmk/mach/processor.defs index f590633f1..99ea969a7 100644 --- a/osfmk/mach/processor.defs +++ b/osfmk/mach/processor.defs @@ -74,7 +74,7 @@ subsystem /* * References to processor objects are returned by: * host_processors(host_priv_t,...); - * + */ /* * Start processor. 
*/ diff --git a/osfmk/mach/security.defs b/osfmk/mach/security.defs index 734aa90eb..8d27ae1a8 100644 --- a/osfmk/mach/security.defs +++ b/osfmk/mach/security.defs @@ -4,7 +4,7 @@ subsystem #if KERNEL_SERVER KernelServer -#endif KERNEL_SERVER +#endif /* KERNEL_SERVER */ security 5200; #include diff --git a/osfmk/mach/shared_region.h b/osfmk/mach/shared_region.h index 1e2143e1a..29ced2a40 100644 --- a/osfmk/mach/shared_region.h +++ b/osfmk/mach/shared_region.h @@ -66,7 +66,7 @@ #define SHARED_REGION_BASE_ARM 0x30000000ULL #define SHARED_REGION_SIZE_ARM 0x10000000ULL #define SHARED_REGION_NESTING_BASE_ARM 0x30000000ULL -#define SHARED_REGION_NESTING_SIZE_ARM 0x08000000ULL +#define SHARED_REGION_NESTING_SIZE_ARM 0x10000000ULL #define SHARED_REGION_NESTING_MIN_ARM ? #define SHARED_REGION_NESTING_MAX_ARM ? @@ -84,20 +84,6 @@ #define SHARED_REGION_NESTING_SIZE SHARED_REGION_NESTING_SIZE_X86_64 #define SHARED_REGION_NESTING_MIN SHARED_REGION_NESTING_MIN_X86_64 #define SHARED_REGION_NESTING_MAX SHARED_REGION_NESTING_MAX_X86_64 -#elif defined(__ppc__) -#define SHARED_REGION_BASE SHARED_REGION_BASE_PPC -#define SHARED_REGION_SIZE SHARED_REGION_SIZE_PPC -#define SHARED_REGION_NESTING_BASE SHARED_REGION_NESTING_BASE_PPC -#define SHARED_REGION_NESTING_SIZE SHARED_REGION_NESTING_SIZE_PPC -#define SHARED_REGION_NESTING_MIN SHARED_REGION_NESTING_MIN_PPC -#define SHARED_REGION_NESTING_MAX SHARED_REGION_NESTING_MAX_PPC -#elif defined(__ppc64__) -#define SHARED_REGION_BASE SHARED_REGION_BASE_PPC64 -#define SHARED_REGION_SIZE SHARED_REGION_SIZE_PPC64 -#define SHARED_REGION_NESTING_BASE SHARED_REGION_NESTING_BASE_PPC64 -#define SHARED_REGION_NESTING_SIZE SHARED_REGION_NESTING_SIZE_PPC64 -#define SHARED_REGION_NESTING_MIN SHARED_REGION_NESTING_MIN_PPC64 -#define SHARED_REGION_NESTING_MAX SHARED_REGION_NESTING_MAX_PPC64 #endif #ifdef KERNEL_PRIVATE @@ -126,6 +112,7 @@ struct shared_file_mapping_np { }; #define VM_PROT_COW 0x8 /* must not interfere with normal prot assignments */ #define 
VM_PROT_ZF 0x10 /* must not interfere with normal prot assignments */ +#define VM_PROT_SLIDE 0x20 /* must not interfere with normal prot assignments */ #ifndef KERNEL @@ -134,6 +121,7 @@ int shared_region_check_np(uint64_t *startaddress); int shared_region_map_np(int fd, uint32_t mappingCount, const struct shared_file_mapping_np *mappings); +int shared_region_slide_np(void); __END_DECLS #endif /* !KERNEL */ diff --git a/osfmk/mach/syscall_sw.h b/osfmk/mach/syscall_sw.h index 11e9211f8..bac3552d3 100644 --- a/osfmk/mach/syscall_sw.h +++ b/osfmk/mach/syscall_sw.h @@ -91,9 +91,9 @@ kernel_trap(semaphore_wait_signal_trap,-37,2) kernel_trap(semaphore_timedwait_trap,-38,3) kernel_trap(semaphore_timedwait_signal_trap,-39,4) -#if !defined(__LP64__) +#if !defined(__LP64__) && !defined(__arm__) kernel_trap(map_fd,-43,5) -#endif /* __LP64__ */ +#endif /*!defined(__LP64__) && !defined(__arm__) */ kernel_trap(task_name_for_pid,-44,3) kernel_trap(task_for_pid,-45,3) diff --git a/osfmk/mach/task.defs b/osfmk/mach/task.defs index ceebc9529..0c70e9aef 100644 --- a/osfmk/mach/task.defs +++ b/osfmk/mach/task.defs @@ -70,6 +70,8 @@ subsystem #include #include +#include + /* * Create a new task with an empty set of IPC rights, * and having an address space constructed from the @@ -170,7 +172,12 @@ routine task_set_special_port( * the port representing the first thr_act in that new thread. The * initial execution state of the thread is undefined. */ -routine thread_create( +routine +#ifdef KERNEL_SERVER +thread_create_from_user( +#else +thread_create( +#endif parent_task : task_t; out child_act : thread_act_t); @@ -181,7 +188,12 @@ routine thread_create( * by flavor and new_state. Returns the port representing * the new thread. 
*/ -routine thread_create_running( +routine +#ifdef KERNEL_SERVER +thread_create_running_from_user( +#else +thread_create_running( +#endif parent_task : task_t; flavor : thread_state_flavor_t; new_state : thread_state_t; @@ -332,7 +344,16 @@ routine task_set_ras_pc( boundspc : vm_address_t); -skip; /* was kernel_task_create() */ +/* + * Return zone info as seen/used by this task. + */ +routine task_zone_info( + target_task : task_t; + out names : mach_zone_name_array_t, + Dealloc; + out info : task_zone_info_array_t, + Dealloc); + /* * JMM - Want to eliminate processor_set so keep them at the end. @@ -389,5 +410,4 @@ routine task_set_state( flavor : thread_state_flavor_t; new_state : thread_state_t); - /* vim: set ft=c : */ diff --git a/osfmk/mach/task_info.h b/osfmk/mach/task_info.h index cab9c1757..a43dc6cb9 100644 --- a/osfmk/mach/task_info.h +++ b/osfmk/mach/task_info.h @@ -195,6 +195,20 @@ typedef struct task_absolutetime_info *task_absolutetime_info_t; #define TASK_ABSOLUTETIME_INFO_COUNT ((mach_msg_type_number_t) \ (sizeof (task_absolutetime_info_data_t) / sizeof (natural_t))) +#define TASK_KERNELMEMORY_INFO 7 + +struct task_kernelmemory_info { + uint64_t total_palloc; /* private kernel mem alloc'ed */ + uint64_t total_pfree; /* private kernel mem freed */ + uint64_t total_salloc; /* shared kernel mem alloc'ed */ + uint64_t total_sfree; /* shared kernel mem freed */ +}; + +typedef struct task_kernelmemory_info task_kernelmemory_info_data_t; +typedef struct task_kernelmemory_info *task_kernelmemory_info_t; +#define TASK_KERNELMEMORY_INFO_COUNT ((mach_msg_type_number_t) \ + (sizeof (task_kernelmemory_info_data_t) / sizeof (natural_t))) + #define TASK_SECURITY_TOKEN 13 #define TASK_SECURITY_TOKEN_COUNT ((mach_msg_type_number_t) \ (sizeof(security_token_t) / sizeof(natural_t))) @@ -217,16 +231,30 @@ typedef struct task_affinity_tag_info *task_affinity_tag_info_t; #define TASK_AFFINITY_TAG_INFO_COUNT \ (sizeof(task_affinity_tag_info_data_t) / sizeof(natural_t)) 
-#define TASK_DYLD_INFO 17 /* This is experimental. */ +#define TASK_DYLD_INFO 17 struct task_dyld_info { mach_vm_address_t all_image_info_addr; mach_vm_size_t all_image_info_size; + integer_t all_image_info_format; }; typedef struct task_dyld_info task_dyld_info_data_t; typedef struct task_dyld_info *task_dyld_info_t; #define TASK_DYLD_INFO_COUNT \ (sizeof(task_dyld_info_data_t) / sizeof(natural_t)) +#define TASK_DYLD_ALL_IMAGE_INFO_32 0 /* format value */ +#define TASK_DYLD_ALL_IMAGE_INFO_64 1 /* format value */ + +#define TASK_EXTMOD_INFO 18 + +struct task_extmod_info { + unsigned char task_uuid[16]; + vm_extmod_statistics_data_t extmod_statistics; +}; +typedef struct task_extmod_info task_extmod_info_data_t; +typedef struct task_extmod_info *task_extmod_info_t; +#define TASK_EXTMOD_INFO_COUNT \ + (sizeof(task_extmod_info_data_t) / sizeof(natural_t)) #pragma pack() diff --git a/osfmk/mach/task_policy.h b/osfmk/mach/task_policy.h index 3a2fb39c4..71d70526f 100644 --- a/osfmk/mach/task_policy.h +++ b/osfmk/mach/task_policy.h @@ -111,6 +111,7 @@ enum task_role { TASK_CONTROL_APPLICATION, TASK_GRAPHICS_SERVER, TASK_THROTTLE_APPLICATION, + TASK_NONUI_APPLICATION, TASK_DEFAULT_APPLICATION }; diff --git a/osfmk/mach/thread_act.defs b/osfmk/mach/thread_act.defs index 47a21a9e6..9754acb63 100644 --- a/osfmk/mach/thread_act.defs +++ b/osfmk/mach/thread_act.defs @@ -100,7 +100,12 @@ routine act_get_state( * If the thread is currently executing, the state change * may be ill-defined. */ -routine act_set_state( +routine +#ifdef KERNEL_SERVER +act_set_state_from_user( +#else +act_set_state( +#endif target_act : thread_act_t; flavor : int; new_state : thread_state_t); @@ -124,7 +129,12 @@ routine thread_get_state( * If the thread is currently executing, the state change * may be ill-defined. 
*/ -routine thread_set_state( +routine +#ifdef KERNEL_SERVER +thread_set_state_from_user( +#else +thread_set_state( +#endif target_act : thread_act_t; flavor : thread_state_flavor_t; new_state : thread_state_t); diff --git a/osfmk/mach/thread_policy.h b/osfmk/mach/thread_policy.h index d9530b776..607028837 100644 --- a/osfmk/mach/thread_policy.h +++ b/osfmk/mach/thread_policy.h @@ -215,4 +215,20 @@ typedef struct thread_affinity_policy *thread_affinity_policy_t; #define THREAD_AFFINITY_POLICY_COUNT ((mach_msg_type_number_t) \ (sizeof (thread_affinity_policy_data_t) / sizeof (integer_t))) +/* + * THREAD_BACKGROUND_POLICY: + */ + +#define THREAD_BACKGROUND_POLICY 5 + +struct thread_background_policy { + integer_t priority; +}; + +typedef struct thread_background_policy thread_background_policy_data_t; +typedef struct thread_background_policy *thread_background_policy_t; + +#define THREAD_BACKGROUND_POLICY_COUNT ((mach_msg_type_number_t) \ + (sizeof (thread_background_policy_data_t) / sizeof (integer_t))) + #endif /* _MACH_THREAD_POLICY_H_ */ diff --git a/osfmk/mach/vm_prot.h b/osfmk/mach/vm_prot.h index 6fe17d43c..ae2d67584 100644 --- a/osfmk/mach/vm_prot.h +++ b/osfmk/mach/vm_prot.h @@ -129,13 +129,20 @@ typedef int vm_prot_t; #define VM_PROT_WANTS_COPY ((vm_prot_t) 0x10) - +#ifdef PRIVATE /* * The caller wants this memory region treated as if it had a valid * code signature. */ #define VM_PROT_TRUSTED ((vm_prot_t) 0x20) +#endif /* PRIVATE */ +/* + * Another invalid protection value. + * Indicates that the other protection bits are to be applied as a mask + * against the actual protection bits of the map entry. 
+ */ +#define VM_PROT_IS_MASK ((vm_prot_t) 0x40) #endif /* _MACH_VM_PROT_H_ */ diff --git a/osfmk/mach/vm_region.h b/osfmk/mach/vm_region.h index ceb42b7b5..ebc0e8d54 100644 --- a/osfmk/mach/vm_region.h +++ b/osfmk/mach/vm_region.h @@ -126,6 +126,7 @@ typedef struct vm_region_basic_info vm_region_basic_info_data_t; #define SM_TRUESHARED 5 #define SM_PRIVATE_ALIASED 6 #define SM_SHARED_ALIASED 7 +#define SM_LARGE_PAGE 8 /* * For submap info, the SM flags above are overlayed when a submap @@ -309,6 +310,7 @@ struct vm_page_info_basic { vm_object_id_t object_id; memory_object_offset_t offset; int depth; + int __pad; /* pad to 64-bit boundary */ }; typedef struct vm_page_info_basic *vm_page_info_basic_t; typedef struct vm_page_info_basic vm_page_info_basic_data_t; diff --git a/osfmk/mach/vm_statistics.h b/osfmk/mach/vm_statistics.h index 89ca4351e..4d1b13a56 100644 --- a/osfmk/mach/vm_statistics.h +++ b/osfmk/mach/vm_statistics.h @@ -113,15 +113,6 @@ struct vm_statistics { typedef struct vm_statistics *vm_statistics_t; typedef struct vm_statistics vm_statistics_data_t; -#if defined(__ppc__) /* On ppc, vm statistics are still 32-bit */ - -typedef struct vm_statistics *vm_statistics64_t; -typedef struct vm_statistics vm_statistics64_data_t; - -#define VM_STATISTICS_TRUNCATE_TO_32_BIT(value) value - -#else /* !(defined(__ppc__)) */ - /* * vm_statistics64 * @@ -133,6 +124,8 @@ typedef struct vm_statistics vm_statistics64_data_t; * rev3 - changed name to vm_statistics64. * changed some fields in structure to 64-bit on * arm, i386 and x86_64 architectures. + * rev4 - require 64-bit alignment for efficient access + * in the kernel. No change to reported data. 
* */ @@ -163,8 +156,7 @@ struct vm_statistics64 { */ natural_t speculative_count; /* # of pages speculative */ -} -; +} __attribute__((aligned(8))); typedef struct vm_statistics64 *vm_statistics64_t; typedef struct vm_statistics64 vm_statistics64_data_t; @@ -177,7 +169,27 @@ typedef struct vm_statistics64 vm_statistics64_data_t; */ #define VM_STATISTICS_TRUNCATE_TO_32_BIT(value) ((uint32_t)(((value) > UINT32_MAX ) ? UINT32_MAX : (value))) -#endif /* !(defined(__ppc__)) */ +/* + * vm_extmod_statistics + * + * Structure to record modifications to a task by an + * external agent. + * + * History: + * rev0 - original structure. + */ + +struct vm_extmod_statistics { + int64_t task_for_pid_count; /* # of times task port was looked up */ + int64_t task_for_pid_caller_count; /* # of times this task called task_for_pid */ + int64_t thread_creation_count; /* # of threads created in task */ + int64_t thread_creation_caller_count; /* # of threads created by task */ + int64_t thread_set_state_count; /* # of register state sets in task */ + int64_t thread_set_state_caller_count; /* # of register state sets by task */ +} __attribute__((aligned(8))); + +typedef struct vm_extmod_statistics *vm_extmod_statistics_t; +typedef struct vm_extmod_statistics vm_extmod_statistics_data_t; /* included for the vm_map_page_query call */ @@ -245,7 +257,6 @@ typedef struct pmap_statistics *pmap_statistics_t; #define VM_FLAGS_PURGABLE 0x0002 #define VM_FLAGS_NO_CACHE 0x0010 #ifdef KERNEL_PRIVATE -#define VM_FLAGS_BELOW_MIN 0x0080 /* map below the map's min offset */ #define VM_FLAGS_PERMANENT 0x0100 /* mapping can NEVER be unmapped */ #define VM_FLAGS_GUARD_AFTER 0x0200 /* guard page after the mapping */ #define VM_FLAGS_GUARD_BEFORE 0x0400 /* guard page before the mapping */ @@ -256,6 +267,7 @@ typedef struct pmap_statistics *pmap_statistics_t; #define VM_FLAGS_OVERWRITE 0x4000 /* delete any existing mappings first */ #ifdef KERNEL_PRIVATE #define VM_FLAGS_NO_PMAP_CHECK 0x8000 /* do not check 
that pmap is empty */ +#define VM_FLAGS_MAP_JIT 0x80000 /* Used to mark an entry as describing a JIT region */ #endif /* KERNEL_PRIVATE */ /* @@ -267,10 +279,12 @@ typedef struct pmap_statistics *pmap_statistics_t; #define VM_FLAGS_SUPERPAGE_SHIFT 16 #define SUPERPAGE_NONE 0 /* no superpages, if all bits are 0 */ -#define VM_FLAGS_SUPERPAGE_NONE (SUPERPAGE_NONE< -type zone_name_t = struct[80] of char; -type zone_name_array_t = array[] of zone_name_t; +type zone_name_t = struct[80] of char; /* deprecated */ +type zone_name_array_t = array[] of zone_name_t; /* deprecated */ -type zone_info_t = struct[9] of integer_t; -type zone_info_array_t = array[] of zone_info_t; +type zone_info_t = struct[9] of integer_t; /* deprecated */ +type zone_info_array_t = array[] of zone_info_t; /* deprecated */ + +type mach_zone_name_t = struct[80] of char; +type mach_zone_name_array_t = array[] of mach_zone_name_t; + +type mach_zone_info_t = struct[8] of uint64_t; +type mach_zone_info_array_t = array[] of mach_zone_info_t; + +type task_zone_info_t = struct[11] of uint64_t; +type task_zone_info_array_t = array[] of task_zone_info_t; type hash_info_bucket_t = struct[1] of natural_t; type hash_info_bucket_array_t = array[] of hash_info_bucket_t; diff --git a/osfmk/mach_debug/zone_info.h b/osfmk/mach_debug/zone_info.h index 9d2182a29..277801d5d 100644 --- a/osfmk/mach_debug/zone_info.h +++ b/osfmk/mach_debug/zone_info.h @@ -63,8 +63,9 @@ #include /* - * Remember to update the mig type definitions - * in mach_debug_types.defs when adding/removing fields. + * Legacy definitions for host_zone_info(). This interface, and + * these definitions have been deprecated in favor of the new + * mach_zone_info() interface and types below. */ #define ZONE_NAME_MAX_LEN 80 @@ -90,4 +91,46 @@ typedef struct zone_info { typedef zone_info_t *zone_info_array_t; + +/* + * Remember to update the mig type definitions + * in mach_debug_types.defs when adding/removing fields. 
+ */ + +#define MACH_ZONE_NAME_MAX_LEN 80 + +typedef struct mach_zone_name { + char mzn_name[ZONE_NAME_MAX_LEN]; +} mach_zone_name_t; + +typedef mach_zone_name_t *mach_zone_name_array_t; + +typedef struct mach_zone_info_data { + uint64_t mzi_count; /* count of elements in use */ + uint64_t mzi_cur_size; /* current memory utilization */ + uint64_t mzi_max_size; /* how large can this zone grow */ + uint64_t mzi_elem_size; /* size of an element */ + uint64_t mzi_alloc_size; /* size used for more memory */ + uint64_t mzi_sum_size; /* sum of all allocs (life of zone) */ + uint64_t mzi_exhaustible; /* merely return if empty? */ + uint64_t mzi_collectable; /* garbage collect elements? */ +} mach_zone_info_t; + +typedef mach_zone_info_t *mach_zone_info_array_t; + +typedef struct task_zone_info_data { + uint64_t tzi_count; /* count of elements in use */ + uint64_t tzi_cur_size; /* current memory utilization */ + uint64_t tzi_max_size; /* how large can this zone grow */ + uint64_t tzi_elem_size; /* size of an element */ + uint64_t tzi_alloc_size; /* size used for more memory */ + uint64_t tzi_sum_size; /* sum of all allocs (life of zone) */ + uint64_t tzi_exhaustible; /* merely return if empty? */ + uint64_t tzi_collectable; /* garbage collect elements? 
*/ + uint64_t tzi_caller_acct; /* charged to caller (or kernel) */ + uint64_t tzi_task_alloc; /* sum of all allocs by this task */ + uint64_t tzi_task_free; /* sum of all frees by this task */ +} task_zone_info_t; + +typedef task_zone_info_t *task_zone_info_array_t; #endif /* _MACH_DEBUG_ZONE_INFO_H_ */ diff --git a/osfmk/machine/Makefile b/osfmk/machine/Makefile index d68ef2fbf..2170671dd 100644 --- a/osfmk/machine/Makefile +++ b/osfmk/machine/Makefile @@ -15,6 +15,8 @@ DATAFILES = \ lock.h \ locks.h \ machine_routines.h \ + pal_routines.h \ + pal_hibernate.h \ simple_lock.h INSTALL_MI_LCL_LIST = cpu_capabilities.h diff --git a/osfmk/machine/asm.h b/osfmk/machine/asm.h index 70c246cb8..c43a64523 100644 --- a/osfmk/machine/asm.h +++ b/osfmk/machine/asm.h @@ -28,9 +28,7 @@ #ifndef _MACHINE_ASM_H #define _MACHINE_ASM_H -#if defined (__ppc__) -#include "ppc/asm.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "i386/asm.h" #else #error architecture not supported diff --git a/osfmk/machine/ast.h b/osfmk/machine/ast.h index 0c01fc7bc..b4880d25a 100644 --- a/osfmk/machine/ast.h +++ b/osfmk/machine/ast.h @@ -28,9 +28,7 @@ #ifndef _MACHINE_AST_H #define _MACHINE_AST_H -#if defined (__ppc__) -#include "ppc/ast.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "i386/ast.h" #else #error architecture not supported diff --git a/osfmk/machine/ast_types.h b/osfmk/machine/ast_types.h index fc7d1d229..57ae58bff 100644 --- a/osfmk/machine/ast_types.h +++ b/osfmk/machine/ast_types.h @@ -28,9 +28,7 @@ #ifndef _MACHINE_AST_TYPES_H #define _MACHINE_AST_TYPES_H -#if defined (__ppc__) -#include "ppc/ast_types.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "i386/ast_types.h" #else #error architecture not supported diff --git a/osfmk/machine/commpage.h b/osfmk/machine/commpage.h index c3d3a9989..d11521702 
100644 --- a/osfmk/machine/commpage.h +++ b/osfmk/machine/commpage.h @@ -29,9 +29,7 @@ #ifndef _MACHINE_COMMPAGE_H #define _MACHINE_COMMPAGE_H -#if defined (__ppc__) -#include "ppc/commpage/commpage.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "i386/commpage/commpage.h" #else #error architecture not supported diff --git a/osfmk/machine/cpu_affinity.h b/osfmk/machine/cpu_affinity.h index 990c856bc..5b3e47ac0 100644 --- a/osfmk/machine/cpu_affinity.h +++ b/osfmk/machine/cpu_affinity.h @@ -30,9 +30,7 @@ #ifndef _MACHINE_CPU_AFFINITY_H #define _MACHINE_CPU_AFFINITY_H -#if defined (__ppc__) -#include "ppc/cpu_affinity.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "i386/cpu_affinity.h" #else #error architecture not supported diff --git a/osfmk/machine/cpu_capabilities.h b/osfmk/machine/cpu_capabilities.h index 606ec2898..a722dc93f 100644 --- a/osfmk/machine/cpu_capabilities.h +++ b/osfmk/machine/cpu_capabilities.h @@ -31,18 +31,14 @@ #define _MACHINE_CPU_CAPABILITIES_H #ifdef KERNEL_PRIVATE -#if defined (__ppc__) -#include "ppc/cpu_capabilities.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "i386/cpu_capabilities.h" #else #error architecture not supported #endif #else /* !KERNEL_PRIVATE -- System Framework header */ -#if defined (__ppc__) || defined(__ppc64__) -#include -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include #else #error architecture not supported diff --git a/osfmk/machine/cpu_data.h b/osfmk/machine/cpu_data.h index 0a047481f..347235ec9 100644 --- a/osfmk/machine/cpu_data.h +++ b/osfmk/machine/cpu_data.h @@ -28,9 +28,7 @@ #ifndef _MACHINE_CPU_DATA_H #define _MACHINE_CPU_DATA_H -#if defined (__ppc__) -#include "ppc/cpu_data.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined 
(__x86_64__) #include "i386/cpu_data.h" #else #error architecture not supported diff --git a/osfmk/machine/cpu_number.h b/osfmk/machine/cpu_number.h index 47e71ba57..45c4b2b4d 100644 --- a/osfmk/machine/cpu_number.h +++ b/osfmk/machine/cpu_number.h @@ -30,9 +30,7 @@ #ifndef _MACHINE_CPU_NUMBER_H #define _MACHINE_CPU_NUMBER_H -#if defined (__ppc__) -#include "ppc/cpu_number.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "i386/cpu_number.h" #else #error architecture not supported diff --git a/osfmk/machine/db_machdep.h b/osfmk/machine/db_machdep.h index ae38b4451..76ce9b313 100644 --- a/osfmk/machine/db_machdep.h +++ b/osfmk/machine/db_machdep.h @@ -28,9 +28,7 @@ #ifndef _MACHINE_DB_MACHDEP_H #define _MACHINE_DB_MACHDEP_H -#if defined (__ppc__) -#include "ppc/db_machdep.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "i386/db_machdep.h" #else #error architecture not supported diff --git a/osfmk/machine/endian.h b/osfmk/machine/endian.h index 5f9c0b9d8..5078c0fd7 100644 --- a/osfmk/machine/endian.h +++ b/osfmk/machine/endian.h @@ -28,9 +28,7 @@ #ifndef _MACHINE_ENDIAN_H #define _MACHINE_ENDIAN_H -#if defined (__ppc__) -#include "ppc/endian.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "i386/endian.h" #else #error architecture not supported diff --git a/osfmk/machine/io_map_entries.h b/osfmk/machine/io_map_entries.h index 8e9e9e456..49306bc16 100644 --- a/osfmk/machine/io_map_entries.h +++ b/osfmk/machine/io_map_entries.h @@ -30,9 +30,7 @@ #ifndef _MACHINE_IO_MAP_ENTRIES_H_ #define _MACHINE_IO_MAP_ENTRIES_H_ -#if defined (__ppc__) -#include "ppc/io_map_entries.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "i386/io_map_entries.h" #else #error architecture not supported diff --git a/osfmk/machine/lock.h 
b/osfmk/machine/lock.h index 558e780d4..a870743a5 100644 --- a/osfmk/machine/lock.h +++ b/osfmk/machine/lock.h @@ -30,9 +30,7 @@ #ifndef _MACHINE_LOCK_H_ #define _MACHINE_LOCK_H_ -#if defined (__ppc__) -#include "ppc/lock.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "i386/lock.h" #else #error architecture not supported diff --git a/osfmk/machine/locks.h b/osfmk/machine/locks.h index 786419581..ad7dcdcbe 100644 --- a/osfmk/machine/locks.h +++ b/osfmk/machine/locks.h @@ -28,9 +28,7 @@ #ifndef _MACHINE_LOCKS_H_ #define _MACHINE_LOCKS_H_ -#if defined (__ppc__) -#include "ppc/locks.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "i386/locks.h" #else #error architecture not supported diff --git a/osfmk/machine/machine_cpu.h b/osfmk/machine/machine_cpu.h index fdc556a16..734cf8f30 100644 --- a/osfmk/machine/machine_cpu.h +++ b/osfmk/machine/machine_cpu.h @@ -28,9 +28,7 @@ #ifndef _MACHINE_MACHINE_CPU_H #define _MACHINE_MACHINE_CPU_H -#if defined (__ppc__) -#include "ppc/machine_cpu.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "i386/machine_cpu.h" #else #error architecture not supported diff --git a/osfmk/machine/machine_routines.h b/osfmk/machine/machine_routines.h index a92705fed..361dee046 100644 --- a/osfmk/machine/machine_routines.h +++ b/osfmk/machine/machine_routines.h @@ -28,9 +28,7 @@ #ifndef _MACHINE_MACHINE_ROUTINES_H #define _MACHINE_MACHINE_ROUTINES_H -#if defined (__ppc__) -#include "ppc/machine_routines.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/machine_routines.h" #else #error architecture not supported diff --git a/osfmk/machine/machine_rpc.h b/osfmk/machine/machine_rpc.h index 0fe29c9d7..c158a0c19 100644 --- a/osfmk/machine/machine_rpc.h +++ b/osfmk/machine/machine_rpc.h @@ -28,9 +28,7 @@ #ifndef 
_MACHINE_MACHINE_RPC_H #define _MACHINE_MACHINE_RPC_H -#if defined (__ppc__) -#include "ppc/machine_rpc.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "i386/machine_rpc.h" #else #error architecture not supported diff --git a/osfmk/machine/machlimits.h b/osfmk/machine/machlimits.h index f9d468434..fee4ddf0c 100644 --- a/osfmk/machine/machlimits.h +++ b/osfmk/machine/machlimits.h @@ -28,9 +28,7 @@ #ifndef _MACHINE_MACHLIMITS_H #define _MACHINE_MACHLIMITS_H -#if defined (__ppc__) -#include "ppc/machlimits.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "i386/machlimits.h" #else #error architecture not supported diff --git a/osfmk/machine/machparam.h b/osfmk/machine/machparam.h index 3e8325307..9657b8cce 100644 --- a/osfmk/machine/machparam.h +++ b/osfmk/machine/machparam.h @@ -28,9 +28,7 @@ #ifndef _MACHINE_MACHPARAM_H #define _MACHINE_MACHPARAM_H -#if defined (__ppc__) -#include "ppc/machparam.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "i386/machparam.h" #else #error architecture not supported diff --git a/libsyscall/mach/ppc/mach_absolute_time.s b/osfmk/machine/pal_hibernate.h similarity index 81% rename from libsyscall/mach/ppc/mach_absolute_time.s rename to osfmk/machine/pal_hibernate.h index 2f4da835f..238896dc5 100644 --- a/libsyscall/mach/ppc/mach_absolute_time.s +++ b/osfmk/machine/pal_hibernate.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,13 +25,13 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +#ifndef _MACHINE_PAL_HIBERNATE_H +#define _MACHINE_PAL_HIBERNATE_H -#define __APPLE_API_PRIVATE -#include -#undef __APPLE_API_PRIVATE +#if defined (__i386__) || defined(__x86_64__) +#include "i386/pal_hibernate.h" +#else +#error architecture not supported +#endif -.text -.align 4 -.globl _mach_absolute_time -_mach_absolute_time: - ba _COMM_PAGE_ABSOLUTE_TIME +#endif /* _MACHINE_PAL_HIBERNATE_H */ diff --git a/iokit/IOKit/machine/IOSharedLockImp.h b/osfmk/machine/pal_routines.h similarity index 85% rename from iokit/IOKit/machine/IOSharedLockImp.h rename to osfmk/machine/pal_routines.h index ec0c90f2c..755b532e9 100644 --- a/iokit/IOKit/machine/IOSharedLockImp.h +++ b/osfmk/machine/pal_routines.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,12 +25,13 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +#ifndef _MACHINE_PAL_ROUTINES_H +#define _MACHINE_PAL_ROUTINES_H -#if defined (__ppc__) -#include "IOKit/ppc/IOSharedLockImp.h" -#elif defined (__i386__) || defined (__x86_64__) -#include "IOKit/i386/IOSharedLockImp.h" +#if defined (__i386__) || defined(__x86_64__) +#include "i386/pal_routines.h" #else #error architecture not supported #endif +#endif /* _MACHINE_PAL_ROUTINES_H */ diff --git a/osfmk/machine/pmap.h b/osfmk/machine/pmap.h index b6290032a..78bef764e 100644 --- a/osfmk/machine/pmap.h +++ b/osfmk/machine/pmap.h @@ -28,9 +28,7 @@ #ifndef _MACHINE_PMAP_H #define _MACHINE_PMAP_H -#if defined (__ppc__) -#include "ppc/pmap.h" -#elif defined (__x86_64__) || defined (__i386__) +#if defined (__x86_64__) || defined (__i386__) #include "i386/pmap.h" #else #error architecture not supported diff --git a/osfmk/machine/sched_param.h b/osfmk/machine/sched_param.h index 04c23c73c..2c5dc0d0d 100644 --- 
a/osfmk/machine/sched_param.h +++ b/osfmk/machine/sched_param.h @@ -28,9 +28,7 @@ #ifndef _MACHINE_SCHED_PARAM_H #define _MACHINE_SCHED_PARAM_H -#if defined (__ppc__) -#include "ppc/sched_param.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "i386/sched_param.h" #else #error architecture not supported diff --git a/osfmk/machine/setjmp.h b/osfmk/machine/setjmp.h index c59703092..142e4f677 100644 --- a/osfmk/machine/setjmp.h +++ b/osfmk/machine/setjmp.h @@ -28,9 +28,7 @@ #ifndef _MACHINE_SETJMP_H #define _MACHINE_SETJMP_H -#if defined (__ppc__) -#include "ppc/setjmp.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "i386/setjmp.h" #else #error architecture not supported diff --git a/osfmk/machine/simple_lock.h b/osfmk/machine/simple_lock.h index 799b74c9b..30e2b44c5 100644 --- a/osfmk/machine/simple_lock.h +++ b/osfmk/machine/simple_lock.h @@ -30,9 +30,7 @@ #ifndef _MACHINE_SIMPLE_LOCK_H_ #define _MACHINE_SIMPLE_LOCK_H_ -#if defined (__ppc__) -#include "ppc/simple_lock.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "i386/simple_lock.h" #else #error architecture not supported diff --git a/osfmk/machine/task.h b/osfmk/machine/task.h index faf4ba5ac..3e9fc821a 100644 --- a/osfmk/machine/task.h +++ b/osfmk/machine/task.h @@ -28,9 +28,7 @@ #ifndef _MACHINE_TASK_H #define _MACHINE_TASK_H -#if defined (__ppc__) -#include "ppc/task.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "i386/task.h" #else #error architecture not supported diff --git a/osfmk/machine/thread.h b/osfmk/machine/thread.h index 5eeccbd33..840d103b7 100644 --- a/osfmk/machine/thread.h +++ b/osfmk/machine/thread.h @@ -28,9 +28,7 @@ #ifndef _MACHINE_THREAD_H #define _MACHINE_THREAD_H -#if defined (__ppc__) -#include "ppc/thread.h" -#elif defined (__i386__) || 
defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "i386/thread.h" #else #error architecture not supported diff --git a/osfmk/machine/timer.h b/osfmk/machine/timer.h index a80a74b72..ba8e5b5ff 100644 --- a/osfmk/machine/timer.h +++ b/osfmk/machine/timer.h @@ -28,9 +28,7 @@ #ifndef _MACHINE_TIMER_H #define _MACHINE_TIMER_H -#if defined (__ppc__) -#include "ppc/timer.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "i386/timer.h" #else #error architecture not supported diff --git a/osfmk/machine/trap.h b/osfmk/machine/trap.h index 54298707b..5fb2aa18d 100644 --- a/osfmk/machine/trap.h +++ b/osfmk/machine/trap.h @@ -28,9 +28,7 @@ #ifndef _MACHINE_TRAP_H #define _MACHINE_TRAP_H -#if defined (__ppc__) -#include "ppc/trap.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "i386/trap.h" #else #error architecture not supported diff --git a/osfmk/machine/vm_tuning.h b/osfmk/machine/vm_tuning.h index a5906bb68..324d9d25a 100644 --- a/osfmk/machine/vm_tuning.h +++ b/osfmk/machine/vm_tuning.h @@ -28,9 +28,7 @@ #ifndef _MACHINE_VM_TUNING_H #define _MACHINE_VM_TUNING_H -#if defined (__ppc__) -#include "ppc/vm_tuning.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "i386/vm_tuning.h" #else #error architecture not supported diff --git a/osfmk/machine/xpr.h b/osfmk/machine/xpr.h index 089a5cc65..ad747c34f 100644 --- a/osfmk/machine/xpr.h +++ b/osfmk/machine/xpr.h @@ -28,9 +28,7 @@ #ifndef _MACHINE_XPR_H #define _MACHINE_XPR_H -#if defined (__ppc__) -#include "ppc/xpr.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "i386/xpr.h" #else #error architecture not supported diff --git a/osfmk/pmc/pmc.c b/osfmk/pmc/pmc.c index 43da760a1..f5a894823 100644 --- a/osfmk/pmc/pmc.c +++ b/osfmk/pmc/pmc.c @@ -38,11 +38,6 @@ #include 
#endif -#if defined(__ppc__) -#include -#include -#endif - #if CONFIG_COUNTERS /* various debug logging enable */ @@ -224,13 +219,13 @@ static volatile uint32_t perf_counters_count = 0U; * constitute a conflict. */ static queue_t system_reservations = NULL; -static volatile uint32_t system_reservation_count __attribute__((aligned(4))) = 0U; +static volatile uint32_t system_reservation_count = 0U; static queue_t task_reservations = NULL; -static volatile uint32_t task_reservation_count __attribute__((aligned(4))) = 0U; +static volatile uint32_t task_reservation_count = 0U; static queue_t thread_reservations = NULL; -static volatile uint32_t thread_reservation_count __attribute__((aligned(4))) = 0U; +static volatile uint32_t thread_reservation_count = 0U; #if XNU_KERNEL_PRIVATE @@ -928,6 +923,7 @@ static boolean_t pmc_internal_reservation_add(pmc_reservation_t resv) { case PMC_FLAG_SCOPE_SYSTEM: /* Simply add it to the system queue */ pmc_internal_reservation_enqueue(system_reservations, resv); + system_reservation_count++; lck_spin_unlock(&reservations_spin); @@ -939,6 +935,7 @@ static boolean_t pmc_internal_reservation_add(pmc_reservation_t resv) { /* Not only do we enqueue it in our local queue for tracking */ pmc_internal_reservation_enqueue(task_reservations, resv); + task_reservation_count++; lck_spin_unlock(&reservations_spin); @@ -956,6 +953,7 @@ static boolean_t pmc_internal_reservation_add(pmc_reservation_t resv) { */ pmc_internal_reservation_enqueue(thread_reservations, resv); + thread_reservation_count++; lck_spin_unlock(&reservations_spin); @@ -998,22 +996,6 @@ static void pmc_internal_reservation_broadcast(pmc_reservation_t reservation, vo /* Have each core run pmc_internal_reservation_stop_cpu asynchronously. 
*/ mp_cpus_call(mask, ASYNC, action_func, reservation); -#elif defined(__ppc__) - size_t ii; - - if (core_cnt > 0) { - for (ii = 0; ii < core_cnt; ii++) { - if (cores[ii] == (uint32_t)cpu_number()) { - action_func(reservation); - } else { - cpu_signal(cores[ii], SIGPcall, (uint32_t)action_func, (uint32_t)reservation); - } - } - } else { - uint32_t sync; - cpu_broadcast(&sync, (void (*)(uint32_t))action_func, (uint32_t)reservation); - action_func(reservation); - } #else #error pmc_reservation_interrupt needs an inter-processor method invocation mechanism for this architecture #endif @@ -1044,6 +1026,7 @@ static void pmc_internal_reservation_remove(pmc_reservation_t resv) { case PMC_FLAG_SCOPE_SYSTEM: lck_spin_lock(&reservations_spin); pmc_internal_reservation_dequeue(system_reservations, resv); + system_reservation_count--; lck_spin_unlock(&reservations_spin); break; @@ -1054,6 +1037,7 @@ static void pmc_internal_reservation_remove(pmc_reservation_t resv) { /* remove from the global queue */ pmc_internal_reservation_dequeue(task_reservations, resv); + task_reservation_count--; /* unlock the global */ lck_spin_unlock(&reservations_spin); @@ -1066,6 +1050,7 @@ static void pmc_internal_reservation_remove(pmc_reservation_t resv) { lck_spin_lock(&reservations_spin); pmc_internal_reservation_dequeue(thread_reservations, resv); + thread_reservation_count--; lck_spin_unlock(&reservations_spin); diff --git a/osfmk/pmc/pmc.h b/osfmk/pmc/pmc.h index ab396a9c6..72692fa54 100644 --- a/osfmk/pmc/pmc.h +++ b/osfmk/pmc/pmc.h @@ -34,6 +34,8 @@ extern "C" { #include #include +#include + /**************************************************************************** * The four main object types * @@ -336,6 +338,14 @@ typedef struct pmc_methods { * KERN_RESOURCE_SHORTAGE if the kernel lacks the resources to register another performance monitor * driver, KERN_INVALID_ARGUMENT if one or both of the arguments is null */ + +/* Prevent older AppleProfileFamily kexts from loading on newer 
kernels. + * Alas, C doesn't necessarily have a cleaner way to do the version number concatenation + */ +#define PERF_REG_NAME1(a, b) a ## b +#define PERF_REG_NAME(a, b) PERF_REG_NAME1(a, b) +#define perf_monitor_register PERF_REG_NAME(perf_monitor_register_, VERSION_MAJOR) + kern_return_t perf_monitor_register(perf_monitor_object_t monitor, perf_monitor_methods_t *methods); /*!fn diff --git a/osfmk/ppc/AltiAssist.s b/osfmk/ppc/AltiAssist.s deleted file mode 100644 index 6ff23acba..000000000 --- a/osfmk/ppc/AltiAssist.s +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - AltiAssist.s - - Do the VMX assists - - Lovingly crafted by Bill Angell using traditional methods and only natural or recycled materials. - No animal products are used other than rendered otter bile and deep fried pork lard. - -*/ - -#include -#include -#include -#include -#include - -; -; -; General stuff what happens here: -; 1) All general context saved, interrupts off, translation off -; 2) Vector and floating point disabled, but there may be live context. -; This code is responsible for saving and restoring what is used. This -; includes exception states, java mode, etc. -; 3) No attempt is made to resolve page faults. PTE misses are handled -; automatically, but actual faults (ala copyin/copyout) are not. If -; a fault does occur, the exception that caused entry to the emulation -; routine is remapped to either an instruction or data miss (depending -; upon the stage detected) and redriven through the exception handler. -; The only time that an instruction fault can happen is when a different -; processor removes a mapping between our original fault and when we -; fetch the assisted instruction. For an assisted instruction, data -; faults should not occur (except in the MP case). For a purely -; emulated instruction, faults can occur. -; -; Emulation algorithms cloned from MacOS 9 code. 
-; -; Assumes that R2 = per_proc_area -; -; - - - .align 5 - .globl EXT(AltivecAssist) - -LEXT(AltivecAssist) - - li r10,emvr0 ; Point to the vector savearea - - li r11,emvr1 ; Another savearea - stvxl v0,r10,r2 ; Save V0 - stvxl v1,r11,r2 ; Save V1 - vspltisw v0,1 ; Set a 1 in V0 - vspltisw v1,8 ; Get half of the shift - vslw v0,v0,v1 ; Shift half way - vslw v0,v0,v1 ; Shift the rest of the way (we now have 0x00010000) - mfvscr v1 ; Get the VSCR - vor v1,v1,v0 ; Turn off Java mode - lvxl v0,r10,r2 ; Restore V0 - mtvscr v1 ; Set Java mode off - lvxl v1,r11,r2 ; Restore V1 - - li r11,T_IN_VAIN ; We are all done - b EXT(EmulExit) ; We are done, no tracing on... - diff --git a/osfmk/ppc/Diagnostics.c b/osfmk/ppc/Diagnostics.c deleted file mode 100644 index d6aa269c8..000000000 --- a/osfmk/ppc/Diagnostics.c +++ /dev/null @@ -1,571 +0,0 @@ -/* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_FREE_COPYRIGHT@ - */ -/* - * @APPLE_FREE_COPYRIGHT@ - */ - -/* - * Author: Bill Angell, Apple - * Date: 9/auht-aught - * - * Random diagnostics - */ - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -extern struct vc_info vinfo; -extern uint32_t warFlags; -#define warDisMBpoff 0x80000000 - -kern_return_t testPerfTrap(int trapno, struct savearea *ss, - unsigned int dsisr, addr64_t dar); - - -int diagCall(struct savearea *save) { - - union { - unsigned long long tbase; - unsigned int tb[2]; - } ttt, adj; - natural_t tbu, tbu2, tbl; - struct per_proc_info *per_proc; /* Area for my per_proc address */ - int cpu, ret, subc; - unsigned int temp, temp2, *baddr, oldwar; - addr64_t src, snk; - uint64_t srrwrk; - scomcomm sarea; - ipc_port_t port; - ipc_entry_t ientry; - processor_t prssr; - vm_address_t addrs; - - - if(!(dgWork.dgFlags & enaDiagSCs)) return 0; /* If not enabled, cause an exception */ - - switch(save->save_r3) { /* Select the routine */ - -/* - * Adjust the timebase for drift recovery testing - */ - case dgAdjTB: /* Adjust the timebase */ - - adj.tb[0] = 0; /* Clear high part */ - adj.tb[1] = save->save_r4; /* Set low order */ - if(adj.tb[1] & 
0x80000000) adj.tb[0] = 0xFFFFFFFF; /* Propagate sign bit */ - - do { /* Read current time */ - asm volatile(" mftbu %0" : "=r" (tbu)); - asm volatile(" mftb %0" : "=r" (tbl)); - asm volatile(" mftbu %0" : "=r" (tbu2)); - } while (tbu != tbu2); - - ttt.tb[0] = tbu; /* Set high */ - ttt.tb[1] = tbl; /* Set low */ - - ttt.tbase = ttt.tbase + adj.tbase; /* Increment or decrement the TB */ - - tbu = ttt.tb[0]; /* Save in regular variable */ - tbl = ttt.tb[1]; /* Save in regular variable */ - - mttb(0); /* Set low to keep from ticking */ - mttbu(tbu); /* Set adjusted high */ - mttb(tbl); /* Set adjusted low */ - - return -1; /* Return no AST checking... */ - -/* - * Return physical address of a page - */ - case dgLRA: - - save->save_r3 = pmap_find_phys(current_thread()->map->pmap, save->save_r4); /* Get read address */ - - return -1; /* Return no AST checking... */ - -/* - * Copy physical to virtual - */ - case dgpcpy: - - -#if 1 - src = (save->save_r4 << 32) | (0x00000000FFFFFFFFULL & save->save_r5); /* Merge into 64-bit */ - snk = (save->save_r6 << 32) | (0x00000000FFFFFFFFULL & save->save_r7); /* Merge into 64-bit */ - save->save_r3 = copypv(src, snk, save->save_r8, save->save_r9); /* Copy the physical page */ -#endif - return 1; /* Return and check for ASTs... 
*/ - -/* - * Read/Write physical memory - */ - case dgprw: - - src = (save->save_r5 << 32) | (0x00000000FFFFFFFFULL & save->save_r6); /* Merge into 64-bit */ - - switch(save->save_r4) { /* Select the actual function */ - - case 0: - save->save_r3 = (uint64_t)ml_phys_read_byte((unsigned int)src); - break; - - case 1: - save->save_r3 = (uint64_t)ml_phys_read_byte_64(src); - break; - - case 2: - save->save_r3 = (uint64_t)ml_phys_read((unsigned int)src); - break; - - case 3: - save->save_r3 = (uint64_t)ml_phys_read_64(src); - break; - - case 4: - ml_phys_write_byte((unsigned int)src, (unsigned int)save->save_r7); - break; - - case 5: - ml_phys_write_byte_64(src, (unsigned int)save->save_r7); - break; - - case 6: - ml_phys_write((unsigned int)src, (unsigned int)save->save_r7); - break; - - case 7: - ml_phys_write_64(src, (unsigned int)save->save_r7); - break; - } - - return 1; /* Return and check for ASTs... */ - - -/* - * Soft reset processor - */ - case dgreset: - - cpu = save->save_r4; /* Get the requested CPU number */ - - if(cpu >= MAX_CPUS) { /* Check for bogus cpu */ - save->save_r3 = KERN_FAILURE; /* Set failure */ - return 1; - } - - per_proc = PerProcTable[cpu].ppe_vaddr; /* Point to the processor */ - if(!per_proc->running) return KERN_FAILURE; /* It is not running */ - - - (void)PE_cpu_start(per_proc->cpu_id, - per_proc->start_paddr, (vm_offset_t)per_proc); - - save->save_r3 = KERN_SUCCESS; /* Set scuuess */ - - return 1; /* Return and check for ASTs... */ - -/* - * Force cache flush - */ - case dgFlush: - - cacheInit(); /* Blow cache */ - return 1; /* Return and check for ASTs... */ - -/* - * various hack tests - */ - case dgtest: - - kprintf("Trying to hang\n"); - baddr = (unsigned *)((unsigned)&baddr | 1); /* Make an odd address */ - __asm__ volatile("lwarx r2,0,%0" : : "r" (baddr)); - kprintf("Didn't hang\n"); - - return 1; /* Return and check for ASTs... 
*/ - - - -/* - * Create a physical block map into the current task - * Don't bother to check for any errors. - * parms - vaddr, paddr, size, prot, attributes - */ - case dgBMphys: - - pmap_map_block(current_thread()->map->pmap, (addr64_t)save->save_r4, /* Map in the block */ - save->save_r5, save->save_r6, save->save_r7, save->save_r8, 0); - - return 1; /* Return and check for ASTs... */ - - -/* - * Remove any mapping from the current task - * Don't bother to check for any errors. - * parms - vaddr - */ - case dgUnMap: - - (void)mapping_remove(current_thread()->map->pmap, save->save_r4); /* Remove mapping */ - return 1; /* Return and check for ASTs... */ - - -/* - * Allows direct control of alignment handling. - * - * The bottom bit of the parameter is used to set the control bit, enaNotifyEM. - */ - case dgAlign: - - temp = dgWork.dgFlags; /* Save the old values */ - - temp2 = (save->save_r4 & 1) << (31 - enaNotifyEMb); /* Move parms into flag format */ - dgWork.dgFlags = (temp & ~enaNotifyEM) | temp2; /* Set the flag */ - - save->save_r3 = (temp >> (31 - enaNotifyEMb)) & 1; /* Return the original */ - - return 1; /* Return and check for ASTs... */ - -/* - * Return info for boot screen - */ - case dgBootScreen: - - ml_set_interrupts_enabled(1); - (void)copyout((char *)&vinfo, save->save_r4, sizeof(struct vc_info)); /* Copy out the video info */ - ml_set_interrupts_enabled(0); - return 1; /* Return and check for ASTs... */ - -/* - * Don't return info for boot screen - */ - case dgCPNull: - - ml_set_interrupts_enabled(1); - (void)copyout((char *)&vinfo, save->save_r4, 0); /* Copy out nothing */ - ml_set_interrupts_enabled(0); - return 1; /* Return and check for ASTs... 
*/ - -/* - * Test machine check handler - only on 64-bit machines - */ - case dgmck: - if(!(PerProcTable[0].ppe_vaddr->pf.Available & pf64Bit)) return 0; /* Leave if not correct machine */ - - fwEmMck(save->save_r4, save->save_r5, save->save_r6, save->save_r7, save->save_r8, save->save_r9); /* Start injecting */ - - return -1; /* Return and don't check for ASTs... */ - -/* - * Set 64-bit on or off - only on 64-bit machines - */ - case dg64: - if(!(PerProcTable[0].ppe_vaddr->pf.Available & pf64Bit)) return 0; /* Leave if not correct machine */ - - srrwrk = save->save_srr1 >> 63; /* Save the old 64-bit bit */ - - save->save_srr1 = (save->save_srr1 & 0x7FFFFFFFFFFFFFFFULL) | (save->save_r4 << 63); /* Set the requested mode */ - save->save_r3 = srrwrk; /* Return the old value */ - - task_clear_64BitAddr(current_thread()->task); - if((save->save_r4 & 1)) task_set_64BitAddr(current_thread()->task); - - return -1; /* Return and don't check for ASTs... */ - -/* - * Test the probe read function - */ - - case dgProbeRead: - - src = (save->save_r4 << 32) | (0x00000000FFFFFFFFULL & save->save_r5); /* Merge into 64-bit */ - save->save_r3 = ml_probe_read_64(src, &temp); /* Try the address */ - save->save_r4 = temp; /* Return the data */ - return -1; /* Regurn and don't check for ASTs */ - -/* - * Do perf monitor stuff - */ - - case dgPerfMon: - - setPmon(save->save_r4, save->save_r5); /* Go load up MMCR0 and MMCR1 */ - return -1; /* Regurn and don't check for ASTs */ - -/* - * Map a page - * Don't bother to check for any errors. - * parms - vaddr, paddr, prot, attributes - */ - case dgMapPage: - - (void)mapping_make(current_thread()->map->pmap, /* Map in the page */ - (addr64_t)(((save->save_r5 & 0xFFFFFFFF) << 32) | (save->save_r5 & 0xFFFFFFFF)), save->save_r6, 0, 1, VM_PROT_READ|VM_PROT_WRITE); - - return -1; /* Return and check for ASTs... 
*/ - -/* - * SCOM interface - * parms - pointer to scomcomm - */ - case dgScom: - - ret = copyin(save->save_r4, (void *)&sarea, sizeof(scomcomm)); /* Get the data */ - if(ret) return 0; /* Copyin failed - return an exception */ - - sarea.scomstat = 0xFFFFFFFFFFFFFFFFULL; /* Clear status */ - cpu = cpu_number(); /* Get us */ - - if((sarea.scomcpu < real_ncpus) && PerProcTable[sarea.scomcpu].ppe_vaddr->running) { - if(sarea.scomcpu == cpu) { /* Is it us? */ - if(sarea.scomfunc) { /* Are we writing */ - sarea.scomstat = ml_scom_write(sarea.scomreg, sarea.scomdata); /* Write scom */ - } - else { - sarea.scomstat = ml_scom_read(sarea.scomreg, &sarea.scomdata); /* Read scom */ - } - } - else { /* Otherwise, tell the other processor */ - (void)cpu_signal(sarea.scomcpu, SIGPcpureq, CPRQscom ,(unsigned int)&sarea); /* Ask him to do this */ - (void)hw_cpu_sync((unsigned int*)&sarea.scomstat, LockTimeOut); /* Wait for the other processor to get its temperature */ - } - } - - ret = copyout((void *)&sarea, save->save_r4, sizeof(scomcomm)); /* Get the data */ - if(ret) return 0; /* Copyin failed - return an exception */ - - return -1; /* Return and check for ASTs... */ - -/* - * Bind current thread to a processor. Parm is processor port. If port is 0, unbind. - */ - - case dgBind: - - if(save->save_r4 == 0) { /* Are we unbinding? */ - thread_bind(PROCESSOR_NULL); /* Unbind us */ - save->save_r3 = KERN_SUCCESS; /* Set success */ - return -1; /* Return and check asts */ - } - - ret = ipc_right_lookup_write(current_space(), (mach_port_name_t)save->save_r4, - &ientry); /* Look up the IPC entry */ - - if(ret != KERN_SUCCESS) { /* Couldn't find it */ - save->save_r3 = ret; /* Pass back return */ - return -1; /* Return and check asts */ - } - - port = (ipc_port_t)ientry->ie_object; /* Get the actual port */ - - if (!ip_active(port) || (ip_kotype(port) != IKOT_PROCESSOR)) { /* Active and a processor? 
*/ - is_write_unlock(current_space()); /* Unlock the space */ - save->save_r3 = KERN_INVALID_ARGUMENT; /* This port is not a processor */ - return -1; /* Return and check asts */ - } - - prssr = (processor_t)port->ip_kobject; /* Extract the processor */ - is_write_unlock(current_space()); /* All done with the space now, unlock it */ - -/* - * The following probably isn't valid if a processor is in the processor going offline, - * but who cares, this is a diagnostic interface... - */ - - if(prssr->state == PROCESSOR_SHUTDOWN) { /* Are we trying to bind to an offline processor? */ - save->save_r3 = KERN_INVALID_ARGUMENT; /* This processor is offline */ - return -1; /* Return and check asts */ - } - - thread_bind(prssr); /* Bind us to the processor */ - thread_block(THREAD_CONTINUE_NULL); /* Make it so */ - - save->save_r3 = KERN_SUCCESS; /* Set success */ - return -1; /* Return and check asts */ - -/* - * Return per_proc for the named processor. Pass in a port. Returns per_proc or 0 if failure - */ - - case dgPproc: - - ret = ipc_right_lookup_write(current_space(), (mach_port_name_t)save->save_r4, - &ientry); /* Look up the IPC entry */ - - if(ret != KERN_SUCCESS) { /* Couldn't find it */ - save->save_r3 = 0; /* Pass back return */ - return -1; /* Return and check asts */ - } - - port = (ipc_port_t)ientry->ie_object; /* Get the actualy port */ - - if (!ip_active(port) || (ip_kotype(port) != IKOT_PROCESSOR)) { /* Active and a processor? */ - is_write_unlock(current_space()); /* Unlock the space */ - save->save_r3 = 0; /* This port is not a processor */ - return -1; /* Return and check asts */ - } - - prssr = (processor_t)port->ip_kobject; /* Extract the processor */ - is_write_unlock(current_space()); /* All done with the space now, unlock it */ - - save->save_r3 = (uint64_t)(uint32_t)PerProcTable[prssr->cpu_id].ppe_vaddr; /* Pass back ther per proc */ - return -1; /* Return and check asts */ - -/* - * Allocate contiguous memory in the kernel. 
Pass in size, pass back vaddr or 0 for error - * Note that this must be explicitly released by the user. There is an "issue" - * if we try to allocate directly into the user: the contiguous area has a kernel wire - * on it. If we terminate, we will hang waiting for wire to be released. Ain't no - * way that will happen, so we do it in the kernel and make them release it. That way - * we will leak rather than hang. - * - */ - case dgAcntg: - - addrs = 0; /* Clear just in case */ - - ret = kmem_alloc_contig(kernel_map, &addrs, (vm_size_t)save->save_r4, - PAGE_MASK, 0, 0, FALSE); /* That which does not make us stronger, kills us... */ - if(ret != KERN_SUCCESS) addrs = 0; /* Pass 0 if error */ - - save->save_r3 = (uint64_t)addrs; /* Pass back whatever */ - return -1; /* Return and check for ASTs... */ - - -/* - * Return physical address of a page in the kernel - */ - case dgKlra: - - save->save_r3 = pmap_find_phys(kernel_pmap, save->save_r4); /* Get read address */ - return -1; /* Return no AST checking... */ - -/* - * Release kernel memory - intent is to release congiguous memory - */ - case dgKfree: - - kmem_free( kernel_map, (vm_address_t) save->save_r4, (vm_size_t)save->save_r5); - return -1; /* Return no AST checking... 
*/ - - - case dgWar: /* Set or reset workaround flags */ - - save->save_r3 = (uint32_t)warFlags; /* Get the old flags */ - oldwar = warFlags; /* Remember the old war flags */ - - subc = (int32_t)save->save_r4; /* Extract the subcommand */ - switch(subc) { /* Do what we need */ - case 1: /* Replace all */ - warFlags = (uint32_t)save->save_r5; /* Do them all */ - break; - - case 2: /* Turn on selected workarounds */ - warFlags = warFlags | (uint32_t)save->save_r5; - break; - - case 3: /* Turn off selected workarounds */ - warFlags = warFlags & ~((uint32_t)save->save_r5); - break; - - case 4: /* Start up selected workaround */ - break; - - case 5: /* Stop selected workaround */ - break; - - case 6: /* Reset specific workaround parameters to default */ - break; - - case 7: /* Set workaround parameters */ - break; - - default: - - break; - - } - - save->save_r3 = oldwar; /* Pass back original */ - return -1; - - - default: /* Handle invalid ones */ - return 0; /* Return an exception */ - - } - -}; - -kern_return_t -testPerfTrap(int trapno, struct savearea *ss, unsigned int dsisr, addr64_t dar) -{ - - if(trapno != T_ALIGNMENT) return KERN_FAILURE; - - kprintf("alignment exception at %08llX, srr1 = %08llX, dsisr = %08X, dar = %08llX\n", - ss->save_srr0, ss->save_srr1, dsisr, dar); - - return KERN_SUCCESS; -} - diff --git a/osfmk/ppc/Diagnostics.h b/osfmk/ppc/Diagnostics.h deleted file mode 100644 index 17e31a323..000000000 --- a/osfmk/ppc/Diagnostics.h +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_FREE_COPYRIGHT@ - */ -/* - * @APPLE_FREE_COPYRIGHT@ - */ - -/* - * Here are the Diagnostic interface interfaces - * Lovingly crafted by Bill Angell using traditional methods - * Keep selectors in sync with the x86 version where possible. -*/ -#ifdef KERNEL_PRIVATE - -#ifndef _DIAGNOSTICS_H_ -#define _DIAGNOSTICS_H_ - -#ifndef __ppc__ -#error This file is only useful on PowerPC. 
-#endif -#include - -int diagCall(struct savearea *save); - -#define diagSCnum 0x00006000 - -#define dgAdjTB 0 -#define dgLRA 1 -#define dgpcpy 2 -#define dgreset 3 -#define dgtest 4 -#define dgBMphys 5 -#define dgUnMap 6 -#define dgBootScreen 7 -#define dgFlush 8 -#define dgAlign 9 -#define dgprw 10 -#define dgmck 11 -#define dg64 12 -#define dgProbeRead 13 -#define dgCPNull 14 -#define dgPerfMon 15 -#define dgMapPage 16 -#define dgScom 17 -#define dgBind 18 -#define dgPproc 19 -#define dgAcntg 20 -#define dgKlra 21 -#define dgKfree 22 -#define dgWar 23 - - -typedef struct diagWork { /* Diagnostic work area */ - - unsigned int dgLock; /* Lock if needed */ - unsigned int dgFlags; /* Flags */ -#define enaExpTrace 0x00000001 -#define enaExpTraceb 31 -#define enaUsrFCall 0x00000002 -#define enaUsrFCallb 30 -#define enaUsrPhyMp 0x00000004 -#define enaUsrPhyMpb 29 -#define enaDiagSCs 0x00000008 -#define enaDiagSCsb 28 -#define enaDiagDM 0x00000010 -#define enaDiagSDMb 27 -#define enaDiagEM 0x00000020 -#define enaDiagEMb 26 -#define enaDiagTrap 0x00000040 -#define enaDiagTrapb 25 -#define enaNotifyEM 0x00000080 -#define enaNotifyEMb 24 - - unsigned int dgMisc0; - unsigned int dgMisc1; - unsigned int dgMisc2; - unsigned int dgMisc3; - unsigned int dgMisc4; - unsigned int dgMisc5; - -} diagWork; - -typedef struct scomcomm { - uint16_t scomcpu; /* CPU number */ - uint16_t scomfunc; /* 0 = read; 1 = write */ - uint32_t scomreg; /* SCOM register */ - uint64_t scomstat; /* returned status */ - uint64_t scomdata; /* input for write, output for read */ -} scomcomm; - -extern diagWork dgWork; -extern int diagTrap(struct savearea *, unsigned int); - - -#endif /* _DIAGNOSTICS_H_ */ - -#endif /* KERNEL_PRIVATE */ diff --git a/osfmk/ppc/Emulate.s b/osfmk/ppc/Emulate.s deleted file mode 100644 index 76ea4eb1c..000000000 --- a/osfmk/ppc/Emulate.s +++ /dev/null @@ -1,1445 +0,0 @@ -/* - * Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - Emulate.s - - Emulate instructions and traps. - - Lovingly crafted by Bill Angell using traditional methods and only natural or recycled materials. - No animal products are used other than rendered otter bile and deep fried pork lard. - -*/ - -#include -#include -#include -#include -#include -#include - -#define traceInst 30 -#define dssAllDone 29 - -; General stuff what happens here: -; 1) All general context saved, interrupts off, translation off -; 2) Vector and floating point disabled, but there may be live context. -; This code is responsible for saving and restoring what is used. This -; includes exception states, java mode, etc. -; 3) No attempt is made to resolve page faults. 
PTE misses are handled -; automatically, but actual faults (ala copyin/copyout) are not. If -; a fault does occur, the exception that caused entry to the emulation -; routine is remapped to either an instruction or data miss (depending -; upon the stage detected) and redrived through the exception handler. -; The only time that an instruction fault can happen is when a different -; processor removes a mapping between our original fault and when we -; fetch the assisted instruction. For an assisted instruction, data -; faults should not occur (except in the MP case). For a purely -; emulated instruction, faults can occur. -; -; - - - .align 5 - .globl EXT(Emulate) - -LEXT(Emulate) - - bf-- pf64Bitb,emn64 ; Skip if not 64-bit - b EXT(Emulate64) ; Jump to the 64-bit code... - -emn64: mfsprg r31,0 ; Get the per_proc - lwz r12,savesrr1+4(r13) ; Get the exception info - rlwinm. r0,r12,0,SRR1_PRG_ILL_INS_BIT,SRR1_PRG_ILL_INS_BIT ; Emulation candidate? - lwz r30,dgFlags(0) ; Get the flags - beq+ eExit ; Nope, do not try to emulate... - - rlwinm. r0,r30,0,enaDiagEMb,enaDiagEMb ; Do we want to try to emulate something? - mfsprg r28,2 ; Get the processor features - beq+ eExit ; No emulation allowed... - - rlwinm. r28,r28,0,pfAltivecb,pfAltivecb ; Do we have Altivec on this machine? - beq eNoVect ; Nope, no Altivec... - - dssall ; We need to kill streams because we are going to flip to problem state - sync - -eNoVect: bl eIFetch ; Get the instruction image - bne- eRedriveAsISI ; Go redrive this as an ISI... - - rlwinm. r0,r10,0,0,5 ; See if we have the "special" op code here - rlwinm r20,r10,16,22,31 ; Set rS/rD and rA - bne+ eExit ; Not special op, ignore... - - rlwinm r0,r10,31,22,31 ; Extract the sub op code - crclr cr1_eq ; Clear - rlwimi r20,r10,14,15,16 ; Move bits 29 and 30 of instruction to 15 and 16 of DSISR - cmplwi r0,790 ; lhbrx? - rlwimi r20,r10,8,17,17 ; Move bit 25 to bit 17 - cror cr1_eq,cr1_eq,cr0_eq ; Remember - cmplwi r0,534 ; lwbrx? 
- rlwimi r20,r10,3,18,21 ; Move bit 21-24 to bit 18-21 - cror cr1_eq,cr1_eq,cr0_eq ; Remember - cmplwi r0,918 ; sthbrx? - cror cr1_eq,cr1_eq,cr0_eq ; Remember - cmplwi r0,662 ; stwbrx? - cror cr1_eq,cr1_eq,cr0_eq ; Remember - cmplwi r0,1014 ; dcbz? - cror cr1_eq,cr1_eq,cr0_eq ; Remember - cmplwi r0,533 ; lswx? - cror cr1_eq,cr1_eq,cr0_eq ; Remember - cmplwi r0,661 ; stswx? - cror cr1_eq,cr1_eq,cr0_eq ; Remember - bne cr1_eq,eNotIndex ; Go check non-index forms... - - rlwinm. r21,r10,19,24,28 ; Extract index to rA to build EA - rlwinm r22,r10,24,24,28 ; Extract index to rB - addi r24,r13,saver0+4 ; Point to the start of registers - li r19,0 ; Assume 0 base - beq eZeroBase ; Yes... - lwzx r19,r24,r21 ; Get the base register value - -eZeroBase: lwzx r22,r24,r22 ; Get the index value - add r22,r22,r19 ; Get DAR - b eFinishUp ; Done, go finish up... - -eNotIndex: cmplwi r0,725 ; stswi? - cror cr1_eq,cr1_eq,cr0_eq ; Remember - cmplwi r0,597 ; lswi? - cror cr1_eq,cr1_eq,cr0_eq ; Remember - bne cr1,eExit ; Not one we handle... - - rlwinm. r21,r10,19,24,28 ; Extract index to rA to build EA - addi r24,r13,saver0+4 ; Point to the start of registers - li r22,0 ; Assume 0 base - beq eFinishUp ; Yes, it is... - lwzx r22,r24,r21 ; Get the base register value - -eFinishUp: stw r20,savedsisr(r13) ; Set the DSISR - li r11,T_ALIGNMENT ; Get the exception code - stw r22,savedar+4(r13) ; Save the DAR - stw r11,saveexception(r13) ; Set the exception code - b EXT(AlignAssist) ; Go emulate the handler... - - -eExit: b EXT(EmulExit) ; Just return for now... - - -; -; Fetch the failing instruction. -; Image returned in R10 if CR0_EQ is false, otherwise, an ISI should be generated. -; R1 has the DSISR if access failed. 
-; - - .align 5 - -eIFetch: lwz r23,savesrr1+4(r13) ; Get old MSR - mflr r28 ; Save return - - rlwinm r3,r23,32-MSR_DR_BIT+MSR_IR_BIT,MSR_DR_BIT,MSR_DR_BIT ; Move IR to DR for ifetch - mfmsr r30 ; Save the MSR for now - rlwimi r3,r23,32-MSR_RI_BIT+MSR_DR_BIT,MSR_RI_BIT,MSR_RI_BIT ; Move DR to RI for ifetch - - lwz r23,savesrr0+4(r13) ; Get instruction address - or r3,r23,r3 ; Turn on the DR and RI bit if translation was on - - crset cr0_eq ; Set this to see if we failed - mtmsr r3 ; Flip RI and, if IR was set, DR - isync - - lwz r10,0(r23) ; Fetch the instruction - - mtmsr r30 ; Trans and RI off - isync - - mtlr r28 ; Restore the LR - blr ; Return with instruction image in R10 - - -; -; Redrive as an ISI -; - -eRedriveAsISI: - lwz r6,savesrr1+4(r13) ; Get the srr1 value - lwz r4,SAVflags(r13) ; Pick up the flags - li r11,T_INSTRUCTION_ACCESS ; Set failing instruction fetch code - rlwimi r6,r1,0,1,4 ; Move the DSISR bits to the SRR1 - oris r4,r4,hi16(SAVredrive) ; Set the redrive bit - stw r11,saveexception(r13) ; Set the replacement code - stw r4,SAVflags(r13) ; Set redrive request - stw r6,savesrr1+4(r13) ; Set the srr1 value - b EXT(EmulExit) ; Bail out to handle ISI... - - -; -; This code emulates instructions that have failed because of operand -; alignment. We decode the DSISR to figure out what we need to do. -; -; DSISR: -; 0001FC00 - Instruction designation -#define iFloat 12 -#define iOptype1 15 -#define iOptype2 16 -#define iOptype3 18 -#define iOptype4 19 -#define iUpdate 17 -#define iStore 20 -#define iDouble 21 -#define iNotify 22 -; 000003E0 - Target/Source register -; 0000001F - Register to update if update form -; - - .align 5 - .globl EXT(AlignAssist) - -LEXT(AlignAssist) - bf-- pf64Bitb,aan64 ; Skip if not 64-bit - b EXT(AlignAssist64) ; Jump to the 64-bit code... 
- -aan64: lwz r20,savedsisr(r13) ; Get the DSISR - li r0,0 ; Assume we emulate - mfsprg r31,0 ; Get the per_proc - mtcrf 0x10,r20 ; Put instruction ID in CR for later - lwz r21,spcFlags(r31) ; Grab the special flags - stw r0,savemisc3(r13) ; Assume that we emulate ok - mtcrf 0x08,r20 ; Put instruction ID in CR for later - rlwinm. r0,r21,0,runningVMbit,runningVMbit ; Are we running a VM? - mtcrf 0x04,r20 ; Put instruction ID in CR for later - lwz r22,savesrr1+4(r13) ; Get the SRR1 - bne- aaPassAlong ; We are in a VM, no emulation for alignment exceptions... - lwz r19,dgFlags(0) ; Get the diagnostics flags - crxor iFloat,iOptype1,iOptype2 ; Set this to 0 if both bits are either 0 or 1 - mr r26,r20 ; Save the DSISR - rlwinm. r0,r22,0,MSR_SE_BIT,MSR_SE_BIT ; Were we single stepping? - lwz r23,savedar+4(r13) ; Pick up the address that we want to access - crnot traceInst,cr0_eq ; Remember if trace is on - - rlwinm. r0,r19,0,enaNotifyEMb,enaNotifyEMb ; Should we notify that an alignment exception happened? - mfmsr r30 ; Save the MSR for now - crnot iNotify,cr0_eq ; Remember to tell someone we did this - li r29,emfp0 ; Point to work area - crxor iFloat,iFloat,iOptype3 ; Set true if we have a floating point instruction - dcbz r29,r31 ; Clear and allocate a cache line for us to work in - rlwinm r24,r20,3,24,28 ; Get displacement to register to update if update form - rlwimi r20,r20,24,28,28 ; Move load/store indication to the bottom of index - rlwinm r22,r22,0,MSR_DR_BIT,MSR_DR_BIT ; Move rupt DR to DR for ifetch - rlwimi r20,r20,26,27,27 ; Move single/double indication to just above the bottom - rlwimi r22,r22,32-MSR_RI_BIT+MSR_DR_BIT,MSR_RI_BIT,MSR_RI_BIT ; Move DR to RI for i-fetch - lis r29,hi16(EXT(aaFPopTable)) ; High part of FP branch table - or r22,r30,r22 ; Set the DR and RI bits if translation was on - bf- iFloat,aaNotFloat ; This is not a floating point instruction... 
- ori r29,r29,lo16(EXT(aaFPopTable)) ; Low part of FP branch table - - rlwimi r29,r20,0,22,28 ; Index into table based upon register||iDouble||iStore - mtctr r29 ; Get set to call the function - bt iStore,aaFPstore ; This is an FP store... - -; -; Here we handle floating point loads -; - -aaFPload: crset cr0_eq ; Set this to see if we failed - mtmsr r22 ; Flip DR, RI - isync - - lwz r10,0(r23) ; Get the first word - bf- cr0_eq,aaLdNotDbl ; Jump out if we DSIed... - bf iDouble,aaLdNotDbl ; this is not a double... - lwz r11,4(r23) ; Get the second half - -aaLdNotDbl: mr r4,r0 ; Save the DAR if we failed the access - - mtmsr r30 ; Turn off translation again - isync - - bf- cr0_eq,aaRedriveAsDSI ; Go redrive this as a DSI... - - stw r10,emfp0(r31) ; Save the first half - stw r11,emfp0+4(r31) ; Save the second half, just in case we need it - - bctrl ; Go set the target FP register - - b aaComExit ; All done, go exit... - -; -; Here we handle floating point stores -; - - .align 5 - -aaFPstore: bctrl ; Go save the source FP register - - lwz r10,emfp0(r31) ; Get first word - crandc iDouble,iDouble,iOptype4 ; Change to 4-byte access if stfiwx - lwz r11,emfp0+4(r31) ; and the second - bf+ iOptype4,aaNotstfiwx ; This is not a stfiwx... - mr r10,r11 ; The stfiwx wants to store the second half - -aaNotstfiwx: - crset cr0_eq ; Set this to see if we failed - mtmsr r22 ; Flip DR, RI - isync - - stw r10,0(r23) ; Save the first word - bf- cr0_eq,aaStNotDbl ; Jump out if we DSIed... - bf iDouble,aaStNotDbl ; this is not a double... - stw r11,4(r23) ; Save the second half - -aaStNotDbl: mr r4,r0 ; Save the DAR if we failed the access - mtmsr r30 ; Turn off - isync - - bf- cr0_eq,aaRedriveAsDSI ; Go redrive this as a DSI... 
- -; -; Common exit routines -; - -aaComExit: lwz r10,savesrr0+4(r13) ; Get the failing instruction address - add r24,r24,r13 ; Offset to update register - li r11,T_IN_VAIN ; Assume we are all done - addi r10,r10,4 ; Step to the next instruction - bf iUpdate,aaComExNU ; Skip if not an update form... - stw r23,saver0+4(r24) ; Update the target - -aaComExNU: lwz r9,SAVflags(r13) ; Get the flags - stw r10,savesrr0+4(r13) ; Set new PC - bt- traceInst,aaComExitrd ; We are tracing, go emulate trace... - bf+ iNotify,aaComExGo ; Nothing special here, go... - - li r11,T_ALIGNMENT ; Set the we just did an alignment exception.... - -aaComExGo: b EXT(EmulExit) ; We are done, no tracing on... - - -; -; This is not a floating point operation -; -; The table of these emulation routines is indexed by taking the low order 4 bits of -; the instruction code in the DSISR and subtracting 7. If this comes up negative, -; the instruction is not to be emulated. Then we add bit 0 of the code * 4. This -; gives us a fairly compact and almost unique index. Both lwm and stmw map to 0 so -; that one needs to be further reduced, and we end up with holes at a few indexes. -; - - .align 5 - -aaNotFloat: - lis r19,hi16(aaEmTable) ; Point to high part of table address - rlwinm r3,r26,24,26,29 ; Isolate last 4 bits of op type * 4 - rlwimi r19,r26,20,27,27 ; Get bit 0 of instruction code * 4 into bottom of table base - addic. r3,r3,-28 ; Subtract 7*4 to adjust index - ori r19,r19,lo16(aaEmTable) ; Low part of table address - blt- aaPassAlong ; We do not handle any of these (lwarx, stwcx., eciwx, ecowx)... - add r19,r19,r3 ; Point to emulation routine - rlwinm r18,r26,30,24,28 ; Get the target/source register displacement - - mtctr r19 ; Set the routine address - - bctr ; Go emulate the instruction... - -; -; This is the table of non-floating point emulation routines. -; It is indexed by the code immediately above. 
- - .align 5 - -aaEmTable: - b aaLmwStmw ; This for lmw/stmw - b aaLswx ; This for lwwx - b aaLswi ; This for lswi - b aaStswx ; This for stswx - b aaStswi ; This for stswi - b aaLwbrx ; This for lwbrx - b aaPassAlong ; This an invalid index (6) - b aaStwbrx ; This for stwbrx - b aaPassAlong ; This an invalid index (8) - b aaLhbrx ; This for lhbrx - b aaPassAlong ; This an invalid index (A) - b aaSthbrx ; This for sthbrx - b aaDcbz ; This for dcbz - b aaPassAlong ; This an invalid index (D) - b aaPassAlong ; This an invalid index (E) - b aaPassAlong ; This an invalid index (F) - - -; -; Here we handle the set up for the lmw and stmw. After that, we split off to the -; individual routines. -; -; Note also that after some set up, all of the string instructions come through here as well. -; - .align 5 - -aaLmwStmw: - rlwinm r17,r18,31,1,29 ; Convert doublword based index to words - li r28,0 ; Set no extra bytes to move (used for string instructions) - subfic r17,r17,32*4 ; Calculate the length of the transfer - -aaLSComm: addi r19,r13,saver0+4 ; Offset to registers in savearea - mr r16,r23 ; Make a hunk pointer - - bt iUpdate,aaStmw ; This is the stmw... - -; -; Load multiple word -; - -aaLmwNxt: cmplwi cr1,r17,8*4 ; Is there enough to move 8? - blt- cr1,aaLmwNxtH ; Not enough for a full hunk... - subi r17,r17,8*4 ; Back off for another hunk - - crset cr0_eq ; Set this to see if we failed - mtmsr r22 ; Flip DR, RI - isync - - lwz r2,0(r16) ; Load word 0 - bf- cr0_eq,aaLmwB1 ; Error, bail... - lwz r15,4(r16) ; Load word 1 - bf- cr0_eq,aaLmwB1 ; Error, bail... - lwz r14,8(r16) ; Load word 2 - bf- cr0_eq,aaLmwB1 ; Error, bail... - lwz r5,12(r16) ; Load word 3 - bf- cr0_eq,aaLmwB1 ; Error, bail... - lwz r6,16(r16) ; Load word 4 - bf- cr0_eq,aaLmwB1 ; Error, bail... - lwz r7,20(r16) ; Load word 5 - bf- cr0_eq,aaLmwB1 ; Error, bail... - lwz r8,24(r16) ; Load word 6 - bf- cr0_eq,aaLmwB1 ; Error, bail... 
- lwz r9,28(r16) ; Load word 7 - -aaLmwB1: mr r4,r0 ; Remember DAR, jus in case we failed the access - mtmsr r30 ; Turn off DR, RI - isync - - bf- cr0_eq,aaRedriveAsDSI ; We failed, go redrive this as a DSI... - - addi r16,r16,8*4 ; Point up to next input aread - - stwx r2,r19,r18 ; Store register - addi r18,r18,8 ; Next register - rlwinm r18,r18,0,24,28 ; Wrap back to 0 if needed - stwx r15,r19,r18 ; Store register - addi r18,r18,8 ; Next register - rlwinm r18,r18,0,24,28 ; Wrap back to 0 if needed - stwx r14,r19,r18 ; Store register - addi r18,r18,8 ; Next register - rlwinm r18,r18,0,24,28 ; Wrap back to 0 if needed - stwx r5,r19,r18 ; Store register - addi r18,r18,8 ; Next register - rlwinm r18,r18,0,24,28 ; Wrap back to 0 if needed - stwx r6,r19,r18 ; Store register - addi r18,r18,8 ; Next register - rlwinm r18,r18,0,24,28 ; Wrap back to 0 if needed - stwx r7,r19,r18 ; Store register - addi r18,r18,8 ; Next register - rlwinm r18,r18,0,24,28 ; Wrap back to 0 if needed - stwx r8,r19,r18 ; Store register - addi r18,r18,8 ; Next register - rlwinm r18,r18,0,24,28 ; Wrap back to 0 if needed - stwx r9,r19,r18 ; Store register - addi r18,r18,8 ; Next register - rlwinm r18,r18,0,24,28 ; Wrap back to 0 if needed - - b aaLmwNxt ; Do the next hunk... - - .align 5 - -aaLmwNxtH: cmplwi cr1,r17,4*4 ; Do we have 4 left? - blt cr1,aaLmwL4 ; Nope... - - subi r17,r17,4*4 ; Set count properly - - crset cr0_eq ; Set this to see if we failed - mtmsr r22 ; Flip DR, RI, and maybe PR on - isync - - lwz r2,0(r16) ; Load word 0 - bf- cr0_eq,aaLmwB2 ; Error, bail... - lwz r15,4(r16) ; Load word 1 - bf- cr0_eq,aaLmwB2 ; Error, bail... - lwz r14,8(r16) ; Load word 2 - bf- cr0_eq,aaLmwB2 ; Error, bail... - lwz r5,12(r16) ; Load word 3 - -aaLmwB2: mr r4,r0 ; Remember DAR, jus in case we failed the access - mtmsr r30 ; Turn off DR, RI - isync - - bf- cr0_eq,aaRedriveAsDSI ; We failed, go redrive this as a DSI... 
- - addi r16,r16,4*4 ; Point up to next input aread - - stwx r2,r19,r18 ; Store register - addi r18,r18,8 ; Next register - rlwinm r18,r18,0,24,28 ; Wrap back to 0 if needed - stwx r15,r19,r18 ; Store register - addi r18,r18,8 ; Next register - rlwinm r18,r18,0,24,28 ; Wrap back to 0 if needed - stwx r14,r19,r18 ; Store register - addi r18,r18,8 ; Next register - rlwinm r18,r18,0,24,28 ; Wrap back to 0 if needed - stwx r5,r19,r18 ; Store register - addi r18,r18,8 ; Next register - rlwinm r18,r18,0,24,28 ; Wrap back to 0 if needed - -aaLmwL4: or. r5,r17,r28 ; Do we have anything left? - cmplwi cr1,r17,(2*4) ; Do we have one, two, or three full words left? - cmplwi cr2,r17,0 ; Do we have no full words left? - beq aaComExit ; Nothing left... - - crset cr0_eq ; Set this to see if we failed - mtmsr r22 ; Flip DR, RI, and maybe PR on - isync - - beq- cr2,aaLmwBy ; No full words, get bytes... - - lwz r2,0(r16) ; Pick up first word - bf- cr0_eq,aaLmwDn ; Read failed, escape... - addi r16,r16,4 ; Next input location - blt cr1,aaLmwBy ; We only had one, we are done... - - lwz r15,0(r16) ; Pick up second word - bf- cr0_eq,aaLmwDn ; Read failed, escape... - addi r16,r16,4 ; Next input location - beq cr1,aaLmwBy ; We had two, we are done... - - lwz r14,0(r16) ; Load word 3 - addi r16,r16,4 ; Next input location - -aaLmwBy: cmplwi cr2,r28,0 ; Any trailing bytes to do? - li r8,0 ; Clear second trailing byte - cmplwi cr1,r28,2 ; Check for 1, 2, or 3 - li r9,0 ; Clear third trailing byte - beq+ cr2,aaLmwDn ; No trailing bytes... - - lbz r5,0(r16) ; Pick up first trailing byte - bf- cr0_eq,aaLmwDn ; Read failed, escape... - blt cr1,aaLmwDn ; We only had one, we are done... - - lbz r8,1(r16) ; Pick up second trailing byte - bf- cr0_eq,aaLmwDn ; Read failed, escape... - beq cr1,aaLmwDn ; We had two, we are done... - - lbz r9,2(r16) ; Get last trailing byte - - -aaLmwDn: rlwinm r5,r5,24,0,7 ; Move first byte to top - cmplwi cr2,r17,0 ; Any full words to do? 
- mr r4,r0 ; Remember DAR, just in case we failed the access - rlwimi r9,r8,8,16,23 ; Move second byte above third byte - cmplwi cr1,r17,(2*4) ; Do we have one, two, or three full words left? - mr r3,r30 ; Set the normal MSR - rlwimi r5,r9,8,8,23 ; Move bytes 1 and 2 after 0 - - mtmsr r30 ; Turn off DR, RI - isync - - bf- cr0_eq,aaRedriveAsDSI ; We failed, go redrive this as a DSI... - - beq- cr2,aaLmwCb ; No full words, copy bytes... - - stwx r2,r19,r18 ; Store register - addi r18,r18,8 ; Next register - rlwinm r18,r18,0,24,28 ; Wrap back to 0 if needed - blt cr1,aaLmwCb ; We only had one, we are done... - - stwx r15,r19,r18 ; Store register - addi r18,r18,8 ; Next register - rlwinm r18,r18,0,24,28 ; Wrap back to 0 if needed - beq cr1,aaLmwCb ; We had two, we are done... - - stwx r14,r19,r18 ; Store register - addi r18,r18,8 ; Next register - rlwinm r18,r18,0,24,28 ; Wrap back to 0 if needed - -aaLmwCb: mr. r28,r28 ; Any trailing bytes to do? - beq+ aaComExit ; Nope, leave... - - stwx r5,r19,r18 ; Store register - - b aaComExit ; We are done.... - -; -; Store multiple word -; - - .align 5 - -aaStmw: - crclr iUpdate ; Make sure we do not think this is an update form - -aaStmwNxt: cmplwi cr1,r17,8*4 ; Is there enough to move 8? - blt- cr1,aaStmwNxtH ; Not enough for a full hunk... 
- subi r17,r17,8*4 ; Back off for another hunk - - lwzx r2,r19,r18 ; Store register - addi r18,r18,8 ; Next register - rlwinm r18,r18,0,24,28 ; Wrap back to 0 if needed - lwzx r15,r19,r18 ; Store register - addi r18,r18,8 ; Next register - rlwinm r18,r18,0,24,28 ; Wrap back to 0 if needed - lwzx r14,r19,r18 ; Store register - addi r18,r18,8 ; Next register - rlwinm r18,r18,0,24,28 ; Wrap back to 0 if needed - lwzx r5,r19,r18 ; Store register - addi r18,r18,8 ; Next register - rlwinm r18,r18,0,24,28 ; Wrap back to 0 if needed - lwzx r6,r19,r18 ; Store register - addi r18,r18,8 ; Next register - rlwinm r18,r18,0,24,28 ; Wrap back to 0 if needed - lwzx r7,r19,r18 ; Store register - addi r18,r18,8 ; Next register - rlwinm r18,r18,0,24,28 ; Wrap back to 0 if needed - lwzx r8,r19,r18 ; Store register - addi r18,r18,8 ; Next register - rlwinm r18,r18,0,24,28 ; Wrap back to 0 if needed - lwzx r9,r19,r18 ; Store register - addi r18,r18,8 ; Next register - rlwinm r18,r18,0,24,28 ; Wrap back to 0 if needed - - crset cr0_eq ; Set this to see if we failed - mtmsr r22 ; Flip DR, RI, and maybe PR on - isync - - stw r2,0(r16) ; Store word 0 - bf- cr0_eq,aaStmwB1 ; Error, bail... - stw r15,4(r16) ; Store word 1 - bf- cr0_eq,aaStmwB1 ; Error, bail... - stw r14,8(r16) ; Store word 2 - bf- cr0_eq,aaStmwB1 ; Error, bail... - stw r5,12(r16) ; Store word 3 - bf- cr0_eq,aaStmwB1 ; Error, bail... - stw r6,16(r16) ; Store word 4 - bf- cr0_eq,aaStmwB1 ; Error, bail... - stw r7,20(r16) ; Store word 5 - bf- cr0_eq,aaStmwB1 ; Error, bail... - stw r8,24(r16) ; Store word 6 - bf- cr0_eq,aaStmwB1 ; Error, bail... - stw r9,28(r16) ; Store word 7 - - addi r16,r16,8*4 ; Point up to next output aread - - -aaStmwB1: mr r4,r0 ; Remember DAR, jus in case we failed the access - mtmsr r30 ; Normal MSR - isync - - bt- cr0_eq,aaStmwNxt ; We have more to do and no failed access... - b aaRedriveAsDSI ; We failed, go redrive this as a DSI... 
- - .align 5 - -aaStmwNxtH: cmplwi cr1,r17,(4*4) ; Do we have at least 4 left? - blt cr1,aaStmwL4 ; Nope... - subi r17,r17,4*4 ; Set count properly - - lwzx r2,r19,r18 ; Store register - addi r18,r18,8 ; Next register - rlwinm r18,r18,0,24,28 ; Wrap back to 0 if needed - lwzx r15,r19,r18 ; Store register - addi r18,r18,8 ; Next register - rlwinm r18,r18,0,24,28 ; Wrap back to 0 if needed - lwzx r14,r19,r18 ; Store register - addi r18,r18,8 ; Next register - rlwinm r18,r18,0,24,28 ; Wrap back to 0 if needed - lwzx r5,r19,r18 ; Store register - addi r18,r18,8 ; Next register - rlwinm r18,r18,0,24,28 ; Wrap back to 0 if needed - - crset cr0_eq ; Set this to see if we failed - mtmsr r22 ; Flip DR, RI - isync - - stw r2,0(r16) ; Store word 0 - bf- cr0_eq,aaStmwB2 ; Error, bail... - stw r15,4(r16) ; Store word 1 - bf- cr0_eq,aaStmwB2 ; Error, bail... - stw r14,8(r16) ; Store word 2 - bf- cr0_eq,aaStmwB2 ; Error, bail... - stw r5,12(r16) ; Store word 3 - - addi r16,r16,4*4 ; Point up to next input aread - -aaStmwB2: mr r4,r0 ; Remember DAR, jus in case we failed the access - mtmsr r30 ; Normal MSR - isync - - bf- cr0_eq,aaRedriveAsDSI ; We failed, go redrive this as a DSI... - -aaStmwL4: or. r5,r17,r28 ; Do we have anything left to do? - cmplwi cr1,r17,(2*4) ; Do we have one, two, or three left? - cmplwi cr2,r17,0 ; Do we have no full words left? - beq aaComExit ; Nothing left... - - beq- cr2,aaStmwBy1 ; No full words, check out bytes - - lwzx r2,r19,r18 ; Store register - addi r18,r18,8 ; Next register - rlwinm r18,r18,0,24,28 ; Wrap back to 0 if needed - blt cr1,aaStmwBy1 ; We only had one, go save it... - - lwzx r15,r19,r18 ; Store register - addi r18,r18,8 ; Next register - rlwinm r18,r18,0,24,28 ; Wrap back to 0 if needed - beq cr1,aaStmwBy1 ; We had two, go save it... - - lwzx r14,r19,r18 ; Store register - addi r18,r18,8 ; Next register - rlwinm r18,r18,0,24,28 ; Wrap back to 0 if needed - -aaStmwBy1: mr. r28,r28 ; Do we have any trailing bytes? 
- beq+ aaStmwSt ; Nope... - - lwzx r5,r19,r18 ; Yes, pick up one extra register - -aaStmwSt: crset cr0_eq ; Set this to see if we failed - mtmsr r22 ; Flip DR, RI - isync - - beq- cr2,aaStmwBy2 ; No words, check trailing bytes... - - stw r2,0(r16) ; Save first word - bf- cr0_eq,aaStmwDn ; Store failed, escape... - addi r16,r16,4 ; Bump sink - blt cr1,aaStmwBy2 ; We only had one, we are done... - - stw r15,0(r16) ; Save second word - bf- cr0_eq,aaStmwDn ; Store failed, escape... - addi r16,r16,4 ; Bump sink - beq cr1,aaStmwBy2 ; We had two, we are done... - - stw r14,0(r16) ; Save third word - bf- cr0_eq,aaStmwDn ; Store failed, escape... - addi r16,r16,4 ; Bump sink - -aaStmwBy2: rlwinm r2,r5,8,24,31 ; Get byte 0 - cmplwi cr2,r28,0 ; Any trailing bytes to do? - rlwinm r14,r5,24,24,31 ; Get byte 3 - li r8,0 ; Clear second trailing byte - cmplwi cr1,r28,2 ; Check for 1, 2, or 3 - li r9,0 ; Clear third trailing byte - beq+ cr2,aaStmwDn ; No trailing bytes... - rlwinm r15,r5,16,24,31 ; Get byte 1 - - stb r2,0(r16) ; Save first byte - bf- cr0_eq,aaStmwDn ; Read failed, escape... - blt cr1,aaStmwDn ; We only had one, we are done... - - stb r15,1(r16) ; Save second byte - bf- cr0_eq,aaStmwDn ; Read failed, escape... - beq cr1,aaStmwDn ; We had two, we are done... - - stb r14,2(r16) ; Save third byte - -aaStmwDn: mr r4,r0 ; Remember DAR, jus in case we failed the access - mtmsr r30 ; Normal MSR - isync - - bf- cr0_eq,aaRedriveAsDSI ; We failed, go redrive this as a DSI... - - b aaComExit ; We are done.... - - -; -; Load String Indexed -; - - .align 5 - -aaLswx: lwz r17,savexer+4(r13) ; Pick up the XER - crclr iUpdate ; Make sure we think this the load form - rlwinm. r25,r17,0,25,31 ; Get the number of bytes to load - rlwinm r28,r17,0,30,31 ; Get the number of bytes past an even word - beq- aaComExit ; Do nothing if 0 length... - xor r17,r25,r28 ; Round down to an even word boundary - b aaLSComm ; Join up with common load/store code... 
- - -; -; Load String Immediate -; - - .align 5 - -aaLswi: mr r9,r23 ; Save the DAR - bl eIFetch ; Get the instruction image - bne- eRedriveAsISI ; Go redrive this as an ISI... - rlwinm r25,r10,21,27,31 ; Get the number of bytes to load - crclr iUpdate ; Make sure we think this the load form - subi r25,r25,1 ; Back off by 1 - rlwinm r25,r25,0,27,31 ; Clear back down - addi r25,r25,1 ; Add back the 1 to convert 0 to 32 - rlwinm r28,r25,0,30,31 ; Get the number of bytes past an even word - xor r17,r25,r28 ; Round down to an even word boundary - mr r23,r9 ; Move back the DAR - b aaLSComm ; Join up with common load/store code... - -; -; Store String Indexed -; - - .align 5 - -aaStswx: lwz r17,savexer+4(r13) ; Pick up the XER - crclr iUpdate ; Make sure this is clear in case we have 0 length - rlwinm. r25,r17,0,25,31 ; Get the number of bytes to load - rlwinm r28,r17,0,30,31 ; Get the number of bytes past an even word - beq- aaComExit ; Do nothing if 0 length... - xor r17,r25,r28 ; Round down to an even word boundary - crset iUpdate ; Make sure we think this the store form - b aaLSComm ; Join up with common load/store code... - - -; -; Store String Immediate -; - - .align 5 - -aaStswi: mr r9,r23 ; Save the DAR - bl eIFetch ; Get the instruction image - bne- eRedriveAsISI ; Go redrive this as an ISI... - rlwinm r25,r10,21,27,31 ; Get the number of bytes to load - crclr iUpdate ; Make sure we think this the load form - subi r25,r25,1 ; Back off by 1 - rlwinm r25,r25,0,27,31 ; Clear back down - addi r25,r25,1 ; Add back the 1 to convert 0 to 32 - rlwinm r28,r25,21,30,31 ; Get the number of bytes past an even word - xor r17,r25,r28 ; Round down to an even word boundary - mr r23,r9 ; Move back the DAR - b aaLSComm ; Join up with common load/store code... 
- - -; -; Load byte-reversed word -; - - .align 5 - -aaLwbrx: - add r18,r18,r13 ; Index to source register - - crset cr0_eq ; Set this to see if we failed - mtmsr r22 ; Flip DR, RI, and maybe PR on - isync - - lwz r11,0(r23) ; Load the word - - mr r4,r0 ; Save the DAR if we failed the access - mtmsr r30 ; Restore normal MSR - isync - - bf- cr0_eq,aaRedriveAsDSI ; We failed, go redrive this as a DSI... - - rlwinm r10,r11,8,0,31 ; Get byte 0 to 3 and byte 2 to 1 - rlwimi r10,r11,24,16,23 ; Move byte 1 to byte 2 - rlwimi r10,r11,24,0,7 ; Move byte 3 to byte 0 - - stw r10,saver0+4(r18) ; Set the register - - b aaComExit ; All done, go exit... - - - -; -; Store byte-reversed word -; - - .align 5 - -aaStwbrx: - add r18,r18,r13 ; Index to source register - lwz r11,saver0+4(r18) ; Get the register to store - - rlwinm r10,r11,8,0,31 ; Get byte 0 to 3 and byte 2 to 1 - rlwimi r10,r11,24,16,23 ; Move byte 1 to byte 2 - rlwimi r10,r11,24,0,7 ; Move byte 3 to byte 0 - - crset cr0_eq ; Set this to see if we failed - mtmsr r22 ; Flip DR, RI, and maybe PR on - isync - - stw r10,0(r23) ; Store the reversed halfword - - mr r4,r0 ; Save the DAR if we failed the access - mtmsr r30 ; Restore normal MSR - isync - - bt+ cr0_eq,aaComExit ; All done, go exit... - b aaRedriveAsDSI ; We failed, go redrive this as a DSI... - - - -; -; Load byte-reversed halfword -; - - .align 5 - -aaLhbrx: - add r18,r18,r13 ; Index to source register - - crset cr0_eq ; Set this to see if we failed - mtmsr r22 ; Flip DR, RI, and maybe PR on - isync - - lhz r11,0(r23) ; Load the halfword - - mr r4,r0 ; Save the DAR if we failed the access - mtmsr r30 ; Restore normal MSR - isync - - bf- cr0_eq,aaRedriveAsDSI ; We failed, go redrive this as a DSI... - - rlwinm r10,r11,8,16,23 ; Rotate bottom byte up one and clear everything else - rlwimi r10,r11,24,24,31 ; Put old second from bottom into bottom - - stw r10,saver0+4(r18) ; Set the register - - b aaComExit ; All done, go exit... 
- - -; -; Store byte-reversed halfword -; - - .align 5 - -aaSthbrx: - add r18,r18,r13 ; Index to source register - lwz r10,saver0+4(r18) ; Get the register to store - rlwinm r10,r10,8,0,31 ; Rotate bottom byte up one - rlwimi r10,r10,16,24,31 ; Put old second from bottom into bottom - - crset cr0_eq ; Set this to see if we failed - mtmsr r22 ; Flip DR, RI, and maybe PR on - isync - - sth r10,0(r23) ; Store the reversed halfword - - mr r4,r0 ; Save the DAR if we failed the access - mtmsr r30 ; Restore normal MSR - isync - - bt+ cr0_eq,aaComExit ; All done, go exit... - b aaRedriveAsDSI ; We failed, go redrive this as a DSI... - -; -; Data cache block zero -; - - .align 5 - -aaDcbz: - lwz r0,savesrr0+4(r13) ; get instruction address - li r4,_COMM_PAGE_BASE_ADDRESS - rlwinm r23,r23,0,0,26 ; Round EA back to a 32-byte boundary - sub r4,r0,r4 ; compute instruction offset from base of commpage - cmplwi r4,_COMM_PAGE_AREA_USED ; did fault occur in commpage? - bge+ aaDcbz1 ; skip if not in commpage - lwz r4,savecr(r13) ; if we take a dcbz in the commpage... - rlwinm r4,r4,0,0,27 ; ...clear users cr7 as a flag for commpage code - stw r4,savecr(r13) -aaDcbz1: - crset cr0_eq ; Set this to see if we failed - li r0,0 ; Clear this out - mtmsr r22 ; Flip DR, RI, and maybe PR on - isync - - stw r0,0(r23) ; Clear word - bne- aaDcbzXit ; Got DSI, we are stopping... - stw r0,4(r23) ; Clear word - bne- aaDcbzXit ; Got DSI, we are stopping... - stw r0,8(r23) ; Clear word - bne- aaDcbzXit ; Got DSI, we are stopping... - stw r0,12(r23) ; Clear word - bne- aaDcbzXit ; Got DSI, we are stopping... - stw r0,16(r23) ; Clear word - bne- aaDcbzXit ; Got DSI, we are stopping... - stw r0,20(r23) ; Clear word - bne- aaDcbzXit ; Got DSI, we are stopping... - stw r0,24(r23) ; Clear word - bne- aaDcbzXit ; Got DSI, we are stopping... 
- stw r0,28(r23) ; Clear word - -aaDcbzXit: mr r4,r0 ; Save the DAR if we failed the access - mtmsr r30 ; Restore normal MSR - isync - - crclr iUpdate ; Make sure we do not think this is an update form - - bt+ cr0_eq,aaComExit ; All done, go exit... - b aaRedriveAsDSI ; We failed, go redrive this as a DSI... - - -; -; Unhandled alignment exception, pass it along -; - -aaPassAlong: - li r0,1 ; Indicate that we failed to emulate - stw r0,savemisc3(r13) ; Assume that we emulate ok - b EXT(EmulExit) - - - - -; -; We go here to emulate a trace exception after we have handled alignment error -; - - .align 5 - -aaComExitrd: - lis r11,hi16(srr1clr) ; Get the bits we need to clear - oris r9,r9,hi16(SAVredrive) ; Set the redrive bit - andc r12,r12,r11 ; Clear what needs to be cleared - li r11,T_TRACE ; Set trace interrupt - stw r9,SAVflags(r13) ; Set the flags - stw r11,saveexception(r13) ; Set the exception code - b EXT(EmulExit) ; Exit and do trace interrupt... - - - -; -; Redrive as a DSI - -aaRedriveAsDSI: - mr r20,r1 ; Save the DSISR - mr r21,r4 - lwz r4,SAVflags(r13) ; Pick up the flags - li r11,T_DATA_ACCESS ; Set failing data access code - oris r4,r4,hi16(SAVredrive) ; Set the redrive bit - stw r20,savedsisr(r13) ; Set the DSISR of failed access - stw r21,savedar+4(r13) ; Set the address of the failed access - stw r11,saveexception(r13) ; Set the replacement code - stw r4,SAVflags(r13) ; Set redrive request - b EXT(EmulExit) ; Bail out to handle ISI... - - - -; -; Table of functions to load or store floating point registers -; This table is indexed reg||size||dir. That means that each -; like load/store pair (e.g., lfd f31/stfd f31) are within the same -; quadword, which is the current ifetch size. We expect most of the -; unaligned accesses to be part of copies, therefore, with this -; organization, we will save the ifetch of the store after the load. 
-; - - .align 10 ; Make sure we are on a 1k boundary - .globl EXT(aaFPopTable) - -LEXT(aaFPopTable) - lfs f0,emfp0(r31) ; Load single variant - blr - - stfs f0,emfp0(r31) ; Store single variant - blr - - lfd f0,emfp0(r31) ; Load double variant - blr - - stfd f0,emfp0(r31) ; Store double variant - blr - - lfs f1,emfp0(r31) ; Load single variant - blr - - stfs f1,emfp0(r31) ; Store single variant - blr - - lfd f1,emfp0(r31) ; Load double variant - blr - - stfd f1,emfp0(r31) ; Store double variant - blr - - lfs f2,emfp0(r31) ; Load single variant - blr - - stfs f2,emfp0(r31) ; Store single variant - blr - - lfd f2,emfp0(r31) ; Load double variant - blr - - stfd f2,emfp0(r31) ; Store double variant - blr - - lfs f3,emfp0(r31) ; Load single variant - blr - - stfs f3,emfp0(r31) ; Store single variant - blr - - lfd f3,emfp0(r31) ; Load double variant - blr - - stfd f3,emfp0(r31) ; Store double variant - blr - - lfs f4,emfp0(r31) ; Load single variant - blr - - stfs f4,emfp0(r31) ; Store single variant - blr - - lfd f4,emfp0(r31) ; Load double variant - blr - - stfd f4,emfp0(r31) ; Store double variant - blr - - lfs f5,emfp0(r31) ; Load single variant - blr - - stfs f5,emfp0(r31) ; Store single variant - blr - - lfd f5,emfp0(r31) ; Load double variant - blr - - stfd f5,emfp0(r31) ; Store double variant - blr - - lfs f6,emfp0(r31) ; Load single variant - blr - - stfs f6,emfp0(r31) ; Store single variant - blr - - lfd f6,emfp0(r31) ; Load double variant - blr - - stfd f6,emfp0(r31) ; Store double variant - blr - - lfs f7,emfp0(r31) ; Load single variant - blr - - stfs f7,emfp0(r31) ; Store single variant - blr - - lfd f7,emfp0(r31) ; Load double variant - blr - - stfd f7,emfp0(r31) ; Store double variant - blr - - lfs f8,emfp0(r31) ; Load single variant - blr - - stfs f8,emfp0(r31) ; Store single variant - blr - - lfd f8,emfp0(r31) ; Load double variant - blr - - stfd f8,emfp0(r31) ; Store double variant - blr - - lfs f9,emfp0(r31) ; Load single variant - blr - - stfs 
f9,emfp0(r31) ; Store single variant - blr - - lfd f9,emfp0(r31) ; Load double variant - blr - - stfd f9,emfp0(r31) ; Store double variant - blr - - lfs f10,emfp0(r31) ; Load single variant - blr - - stfs f10,emfp0(r31) ; Store single variant - blr - - lfd f10,emfp0(r31) ; Load double variant - blr - - stfd f10,emfp0(r31) ; Store double variant - blr - - lfs f11,emfp0(r31) ; Load single variant - blr - - stfs f11,emfp0(r31) ; Store single variant - blr - - lfd f11,emfp0(r31) ; Load double variant - blr - - stfd f11,emfp0(r31) ; Store double variant - blr - - lfs f12,emfp0(r31) ; Load single variant - blr - - stfs f12,emfp0(r31) ; Store single variant - blr - - lfd f12,emfp0(r31) ; Load double variant - blr - - stfd f12,emfp0(r31) ; Store double variant - blr - - lfs f13,emfp0(r31) ; Load single variant - blr - - stfs f13,emfp0(r31) ; Store single variant - blr - - lfd f13,emfp0(r31) ; Load double variant - blr - - stfd f13,emfp0(r31) ; Store double variant - blr - - lfs f14,emfp0(r31) ; Load single variant - blr - - stfs f14,emfp0(r31) ; Store single variant - blr - - lfd f14,emfp0(r31) ; Load double variant - blr - - stfd f14,emfp0(r31) ; Store double variant - blr - - lfs f15,emfp0(r31) ; Load single variant - blr - - stfs f15,emfp0(r31) ; Store single variant - blr - - lfd f15,emfp0(r31) ; Load double variant - blr - - stfd f15,emfp0(r31) ; Store double variant - blr - - lfs f16,emfp0(r31) ; Load single variant - blr - - stfs f16,emfp0(r31) ; Store single variant - blr - - lfd f16,emfp0(r31) ; Load double variant - blr - - stfd f16,emfp0(r31) ; Store double variant - blr - - lfs f17,emfp0(r31) ; Load single variant - blr - - stfs f17,emfp0(r31) ; Store single variant - blr - - lfd f17,emfp0(r31) ; Load double variant - blr - - stfd f17,emfp0(r31) ; Store double variant - blr - - lfs f18,emfp0(r31) ; Load single variant - blr - - stfs f18,emfp0(r31) ; Store single variant - blr - - lfd f18,emfp0(r31) ; Load double variant - blr - - stfd f18,emfp0(r31) ; Store 
double variant - blr - - lfs f19,emfp0(r31) ; Load single variant - blr - - stfs f19,emfp0(r31) ; Store single variant - blr - - lfd f19,emfp0(r31) ; Load double variant - blr - - stfd f19,emfp0(r31) ; Store double variant - blr - - lfs f20,emfp0(r31) ; Load single variant - blr - - stfs f20,emfp0(r31) ; Store single variant - blr - - lfd f20,emfp0(r31) ; Load double variant - blr - - stfd f20,emfp0(r31) ; Store double variant - blr - - lfs f21,emfp0(r31) ; Load single variant - blr - - stfs f21,emfp0(r31) ; Store single variant - blr - - lfd f21,emfp0(r31) ; Load double variant - blr - - stfd f21,emfp0(r31) ; Store double variant - blr - - lfs f22,emfp0(r31) ; Load single variant - blr - - stfs f22,emfp0(r31) ; Store single variant - blr - - lfd f22,emfp0(r31) ; Load double variant - blr - - stfd f22,emfp0(r31) ; Store double variant - blr - - lfs f23,emfp0(r31) ; Load single variant - blr - - stfs f23,emfp0(r31) ; Store single variant - blr - - lfd f23,emfp0(r31) ; Load double variant - blr - - stfd f23,emfp0(r31) ; Store double variant - blr - - lfs f24,emfp0(r31) ; Load single variant - blr - - stfs f24,emfp0(r31) ; Store single variant - blr - - lfd f24,emfp0(r31) ; Load double variant - blr - - stfd f24,emfp0(r31) ; Store double variant - blr - - lfs f25,emfp0(r31) ; Load single variant - blr - - stfs f25,emfp0(r31) ; Store single variant - blr - - lfd f25,emfp0(r31) ; Load double variant - blr - - stfd f25,emfp0(r31) ; Store double variant - blr - - lfs f26,emfp0(r31) ; Load single variant - blr - - stfs f26,emfp0(r31) ; Store single variant - blr - - lfd f26,emfp0(r31) ; Load double variant - blr - - stfd f26,emfp0(r31) ; Store double variant - blr - - lfs f27,emfp0(r31) ; Load single variant - blr - - stfs f27,emfp0(r31) ; Store single variant - blr - - lfd f27,emfp0(r31) ; Load double variant - blr - - stfd f27,emfp0(r31) ; Store double variant - blr - - lfs f28,emfp0(r31) ; Load single variant - blr - - stfs f28,emfp0(r31) ; Store single variant - blr - 
- lfd f28,emfp0(r31) ; Load double variant - blr - - stfd f28,emfp0(r31) ; Store double variant - blr - - lfs f29,emfp0(r31) ; Load single variant - blr - - stfs f29,emfp0(r31) ; Store single variant - blr - - lfd f29,emfp0(r31) ; Load double variant - blr - - stfd f29,emfp0(r31) ; Store double variant - blr - - lfs f30,emfp0(r31) ; Load single variant - blr - - stfs f30,emfp0(r31) ; Store single variant - blr - - lfd f30,emfp0(r31) ; Load double variant - blr - - stfd f30,emfp0(r31) ; Store double variant - blr - - lfs f31,emfp0(r31) ; Load single variant - blr - - stfs f31,emfp0(r31) ; Store single variant - blr - - lfd f31,emfp0(r31) ; Load double variant - blr - - stfd f31,emfp0(r31) ; Store double variant - blr - diff --git a/osfmk/ppc/Emulate64.s b/osfmk/ppc/Emulate64.s deleted file mode 100644 index 2e7854d3f..000000000 --- a/osfmk/ppc/Emulate64.s +++ /dev/null @@ -1,957 +0,0 @@ -/* - * Copyright (c) 2002-2007 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* Emulate64.s - * - * Software emulation of instructions not handled in hw, on 64-bit machines. - */ - -#include -#include -#include -#include -#include -#include -#include - -// CR bit set if the instruction is an "update" form (LFDU, STWU, etc): -#define kUpdate 25 - -// CR bit set if interrupt occured in trace mode (ie, MSR_SE_BIT): -#define kTrace 8 - -// CR bit set if notification on alignment interrupts is requested (notifyUnalignbit in spcFlags): -#define kNotify 9 - -// CR bit distinguishes between alignment and program exceptions: -#define kAlignment 10 - - - -// ************************************* -// * P R O G R A M I N T E R R U P T * -// ************************************* -// -// These are floating pt exceptions, illegal instructions, privileged mode violations, -// and traps. All we're interested in at this low level is illegal instructions. -// The ones we "emulate" are: -// DCBA, which is not implemented in the IBM 970. The emulation is to ignore it, -// as it is just a hint. -// MCRXR, which is not implemented on the IBM 970, but is in the PPC ISA. -// -// Additionally, to facilitate debugging the alignment handler, we recognize a special -// diagnostic mode that is used to simulate alignment exceptions. 
When in this mode, -// if the instruction has opcode==0 and the extended opcode is one of the X-form -// instructions that can take an alignment interrupt, then we change the opcode to -// 31 and pretend it got an alignment interrupt. This exercises paths that -// are hard to drive or perhaps never driven on this particular CPU. - - .text - .globl EXT(Emulate64) - .align 5 -LEXT(Emulate64) - crclr kAlignment // not an alignment exception - b a64AlignAssistJoin // join alignment handler - - -// Return from alignment handler with all the regs loaded for opcode emulation. - -a64HandleProgramInt: - rlwinm. r0,r29,0,SRR1_PRG_ILL_INS_BIT,SRR1_PRG_ILL_INS_BIT // illegal opcode? - beq a64PassAlong // No, must have been trap or priv violation etc - rlwinm r3,r20,6,26,31 // right justify opcode field (bits 0-5) - rlwinm r4,r20,31,22,31 // right justify extended opcode field (bits 21-30) - cmpwi cr0,r3,31 // X-form? - cmpwi cr1,r4,758 // DCBA? - cmpwi cr4,r4,512 // MCRXR? - crand cr1_eq,cr0_eq,cr1_eq // merge the two tests for DCBA - crand cr4_eq,cr0_eq,cr4_eq // and for MCRXR - beq++ cr1_eq,a64ExitEm // was DCBA, so ignore - bne-- cr4_eq,a64NotEmulated // skip if not MCRXR - -// Was MCRXR, so emulate. - - ld r3,savexer(r13) // get the XER - lwz r4,savecr(r13) // and the CR - rlwinm r5,r20,11,27,29 // get (CR# * 4) from instruction - rlwinm r6,r3,0,4,31 // zero XER[32-35] (also XER[0-31]) - sld r4,r4,r5 // move target CR field to bits 32-35 - rlwimi r4,r3,0,0,3 // move XER[32-35] into CR field - stw r6,savexer+4(r13) // update XER - srd r4,r4,r5 // re-position CR - stw r4,savecr(r13) // update CR - b a64ExitEm // done - -// Not an opcode we normally emulate. If in special diagnostic mode and opcode=0, -// emulate as an alignment exception. This special case is for test software. - -a64NotEmulated: - lwz r30,dgFlags(0) // Get the flags - rlwinm. r0,r30,0,enaDiagEMb,enaDiagEMb // Do we want to try to emulate something? 
- beq++ a64PassAlong // No emulation allowed - cmpwi r3,0 // opcode==0 ? - bne a64PassAlong // not the special case - oris r20,r20,0x7C00 // change opcode to 31 - crset kAlignment // say we took alignment exception - rlwinm r5,r4,0,26+1,26-1 // mask Update bit (32) out of extended opcode - rlwinm r5,r5,0,0,31 // Clean out leftover junk from rlwinm - - cmpwi r4,1014 // dcbz/dcbz128 ? - crmove cr1_eq,cr0_eq - cmpwi r5,21 // ldx/ldux ? - cror cr1_eq,cr0_eq,cr1_eq - cmpwi r5,599 // lfdx/lfdux ? - cror cr1_eq,cr0_eq,cr1_eq - cmpwi r5,535 // lfsx/lfsux ? - cror cr1_eq,cr0_eq,cr1_eq - cmpwi r5,343 // lhax/lhaux ? - cror cr1_eq,cr0_eq,cr1_eq - cmpwi r4,790 // lhbrx ? - cror cr1_eq,cr0_eq,cr1_eq - cmpwi r5,279 // lhzx/lhzux ? - cror cr1_eq,cr0_eq,cr1_eq - cmpwi r4,597 // lswi ? - cror cr1_eq,cr0_eq,cr1_eq - cmpwi r4,533 // lswx ? - cror cr1_eq,cr0_eq,cr1_eq - cmpwi r5,341 // lwax/lwaux ? - cror cr1_eq,cr0_eq,cr1_eq - cmpwi r4,534 // lwbrx ? - cror cr1_eq,cr0_eq,cr1_eq - cmpwi r5,23 // lwz/lwzx ? - cror cr1_eq,cr0_eq,cr1_eq - cmpwi r5,149 // stdx/stdux ? - cror cr1_eq,cr0_eq,cr1_eq - cmpwi r5,727 // stfdx/stfdux ? - cror cr1_eq,cr0_eq,cr1_eq - cmpwi r4,983 // stfiwx ? - cror cr1_eq,cr0_eq,cr1_eq - cmpwi r5,663 // stfsx/stfsux ? - cror cr1_eq,cr0_eq,cr1_eq - cmpwi r4,918 // sthbrx ? - cror cr1_eq,cr0_eq,cr1_eq - cmpwi r5,407 // sthx/sthux ? - cror cr1_eq,cr0_eq,cr1_eq - cmpwi r4,725 // stswi ? - cror cr1_eq,cr0_eq,cr1_eq - cmpwi r4,661 // stswx ? - cror cr1_eq,cr0_eq,cr1_eq - cmpwi r4,662 // stwbrx ? - cror cr1_eq,cr0_eq,cr1_eq - cmpwi r5,151 // stwx/stwux ? 
- cror cr1_eq,cr0_eq,cr1_eq - - beq++ cr1,a64GotInstruction // it was one of the X-forms we handle - crclr kAlignment // revert to program interrupt - b a64PassAlong // not recognized extended opcode - - -// ***************************************** -// * A L I G N M E N T I N T E R R U P T * -// ***************************************** -// -// We get here in exception context, ie with interrupts disabled, translation off, and -// in 64-bit mode, with: -// r13 = save-area pointer, with general context already saved in it -// cr6 = feature flags -// We preserve r13 and cr6. Other GPRs and CRs, the LR and CTR are used. -// -// Current 64-bit processors (GPUL) handle almost all misaligned operations in hardware, -// so this routine usually isn't called very often. Only floating pt ops that cross a page -// boundary and are not word aligned, and LMW/STMW can take exceptions to cacheable memory. -// However, in contrast to G3 and G4, any misaligned load/store will get an alignment -// interrupt on uncached memory. -// -// We always emulate scalar ops with a series of byte load/stores. Doing so is no slower -// than LWZ/STW in cases where a scalar op gets an alignment exception. -// -// This routine supports all legal permutations of alignment interrupts occuring in user or -// supervisor mode, 32 or 64-bit addressing, and translation on or off. We do not emulate -// instructions that go past the end of an address space, such as "LHZ -1(0)"; we just pass -// along the alignment exception rather than wrap around to byte 0. -// -// First, check for a few special cases such as virtual machines, etc. - - .globl EXT(AlignAssist64) - .align 5 -LEXT(AlignAssist64) - crset kAlignment // mark as alignment interrupt - -a64AlignAssistJoin: // join here from program interrupt handler - li r0,0 // Get a 0 - mfsprg r31,0 // get the per_proc data ptr - mcrf cr3,cr6 // save feature flags here... 
- lwz r21,spcFlags(r31) // grab the special flags - ld r29,savesrr1(r13) // get the MSR etc at the fault - ld r28,savesrr0(r13) // get the EA of faulting instruction - stw r0,savemisc3(r13) // Assume we will handle this ok - mfmsr r26 // save MSR at entry - rlwinm. r0,r21,0,runningVMbit,runningVMbit // Are we running a VM? - lwz r19,dgFlags(0) // Get the diagnostics flags - bne-- a64PassAlong // yes, let the virtual machine monitor handle - - -// Set up the MSR shadow regs. We turn on FP in this routine, and usually set DR and RI -// when accessing user space (the SLB is still set up with all the user space translations.) -// However, if the interrupt occured in the kernel with DR off, we keep it off while -// accessing the "target" address space. If we set DR to access the target space, we also -// set RI. The RI bit tells the exception handlers to clear cr0 beq and return if we get an -// exception accessing the user address space. We are careful to test cr0 beq after every such -// access. We keep the following "shadows" of the MSR in global regs across this code: -// r25 = MSR at entry, plus FP and probably DR and RI (used to access target space) -// r26 = MSR at entry -// r27 = free -// r29 = SRR1 (ie, MSR at interrupt) -// Note that EE and IR are always off, and SF is always on in this code. - - rlwinm r3,r29,31,MSR_DR_BIT,MSR_DR_BIT // Move instruction translate bit to DR - rlwimi r3,r3,32-MSR_RI_BIT+MSR_DR_BIT,MSR_RI_BIT,MSR_RI_BIT // if DR is now set, set RI too - or r25,r26,r3 // assemble MSR to use accessing target space - - -// Because the DSISR and DAR are either not set or are not to be trusted on some 64-bit -// processors on an alignment interrupt, we must fetch the faulting instruction ourselves, -// then decode/hash the opcode and reconstruct the EA manually. 
- - mtmsr r25 // turn on FP and (if it was on at fault) DR and RI - isync // wait for it to happen - cmpw r0,r0 // turn on beq so we can check for DSIs - lwz r20,0(r28) // fetch faulting instruction, probably with DR on - bne-- a64RedriveAsISI // got a DSI trying to fetch it, pretend it was an ISI - mtmsr r26 // turn DR back off - isync // wait for it to happen - - -// Set a few flags while we wait for the faulting instruction to arrive from cache. - - rlwinm. r0,r29,0,MSR_SE_BIT,MSR_SE_BIT // Were we single stepping? - stw r20,savemisc2(r13) // Save the instruction image in case we notify - crnot kTrace,cr0_eq - rlwinm. r0,r19,0,enaNotifyEMb,enaNotifyEMb // Should we notify? - crnot kNotify,cr0_eq - - rlwinm r3,r29,0,MSR_DR_BIT,MSR_DR_BIT // was data translation on at fault? - rlwimi r3,r3,32-MSR_RI_BIT+MSR_DR_BIT,MSR_RI_BIT,MSR_RI_BIT // if DR is now set, set RI too - or r25,r26,r3 // assemble MSR to use accessing target space - - -// Hash the intruction into a 5-bit value "AAAAB" used to index the branch table, and a -// 1-bit kUpdate flag, as follows: -// � for X-form instructions (with primary opcode 31): -// the "AAAA" bits are bits 21-24 of the instruction -// the "B" bit is the XOR of bits 29 and 30 -// the update bit is instruction bit 25 -// � for D and DS-form instructions (actually, any primary opcode except 31): -// the "AAAA" bits are bits 1-4 of the instruction -// the "B" bit is 0 -// the update bit is instruction bit 5 -// -// Just for fun (and perhaps a little speed on deep-pipe machines), we compute the hash, -// update flag, and EA without branches and with ipc >= 2. 
-// -// When we "bctr" to the opcode-specific reoutine, the following are all set up: -// MSR = EE and IR off, SF and FP on -// r12 = full 64-bit EA (r17 is clamped EA) -// r13 = save-area pointer (physical) -// r14 = ptr to saver0 in save-area (ie, to base of GPRs) -// r15 = 0x00000000FFFFFFFF if 32-bit mode fault, 0xFFFFFFFFFFFFFFFF if 64 -// r16 = RA * 8 (ie, reg# not reg value) -// r17 = EA, clamped to 32 bits if 32-bit mode fault (see also r12) -// r18 = (RA|0) (reg value) -// r19 = -1 if X-form, 0 if D-form -// r20 = faulting instruction -// r21 = RT * 8 (ie, reg# not reg value) -// r22 = addr(aaFPopTable)+(RT*32), ie ptr to floating pt table for target register -// r25 = MSR at entrance, probably with DR and RI set (for access to target space) -// r26 = MSR at entrance -// r27 = free -// r28 = SRR0 (ie, EA of faulting instruction) -// r29 = SRR1 (ie, MSR at fault) -// r30 = scratch, usually user data -// r31 = per-proc pointer -// cr2 = kTrace, kNotify, and kAlignment flags -// cr3 = saved copy of feature flags used in lowmem vector code -// cr6 = bits 24-27 of CR are bits 24-27 of opcode if X-form, or bits 4-5 and 00 if D-form -// bit 25 is the kUpdate flag, set for update form instructions -// cr7 = bits 28-31 of CR are bits 28-31 of opcode if X-form, or 0 if D-form - -a64GotInstruction: // here from program interrupt with instruction in r20 - rlwinm r21,r20,6+6,20,25 // move the primary opcode (bits 0-6) to bits 20-25 - la r14,saver0(r13) // r14 <- base address of GPR registers - xori r19,r21,0x07C0 // iff primary opcode is 31, set r19 to 0 - rlwinm r16,r20,16+3,24,28 // r16 <- RA*8 - subi r19,r19,1 // set bit 0 iff X-form (ie, if primary opcode is 31) - rlwinm r17,r20,21+3,24,28 // r17 <- RB*8 (if X-form) - sradi r19,r19,63 // r19 <- -1 if X-form, 0 if D-form - extsh r22,r20 // r22 <- displacement (if D-form) - - ldx r23,r14,r17 // get (RB), if any - and r15,r20,r19 // instruction if X, 0 if D - andc r17,r21,r19 // primary opcode in bits 20-25 if D, 0 if 
X - ldx r18,r14,r16 // get (RA) - subi r24,r16,1 // set bit 0 iff RA==0 - or r21,r15,r17 // r21 <- instruction if X, or bits 0-5 in bits 20-25 if D - sradi r24,r24,63 // r24 <- -1 if RA==0, 0 otherwise - rlwinm r17,r21,32-4,25,28 // shift opcode bits 21-24 to 25-28 (hash "AAAA" bits) - lis r10,ha16(a64BranchTable) // start to build up branch table address - rlwimi r17,r21,0,29,29 // move opcode bit 29 into hash as start of "B" bit - rlwinm r30,r21,1,29,29 // position opcode bit 30 in position 29 - and r12,r23,r19 // RB if X-form, 0 if D-form - andc r11,r22,r19 // 0 if X-form, sign extended displacement if D-form - xor r17,r17,r30 // bit 29 ("B") of hash is xor(bit29,bit30) - addi r10,r10,lo16(a64BranchTable) - or r12,r12,r11 // r12 <- (RB) or displacement, as appropriate - lwzx r30,r10,r17 // get address from branch table - mtcrf 0x01,r21 // move opcode bits 28-31 to CR7 - sradi r15,r29,32 // propogate SF bit from SRR1 (MSR_SF, which is bit 0) - andc r18,r18,r24 // r18 <- (RA|0) - mtcrf 0x02,r21 // move opcode bits 24-27 to CR6 (kUpdate is bit 25) - add r12,r18,r12 // r12 <- 64-bit EA - mtctr r30 // set up branch address - - oris r15,r15,0xFFFF // start to fill low word of r15 with 1s - rlwinm r21,r20,11+3,24,28 // r21 <- RT * 8 - lis r22,ha16(EXT(aaFPopTable)) // start to compute address of floating pt table - ori r15,r15,0xFFFF // now bits 32-63 of r15 are 1s - addi r22,r22,lo16(EXT(aaFPopTable)) - and r17,r12,r15 // clamp EA to 32 bits if fault occured in 32-bit mode - rlwimi r22,r21,2,22,26 // move RT into aaFPopTable address (which is 1KB aligned) - - bf-- kAlignment,a64HandleProgramInt // return to Program Interrupt handler - bctr // if alignment interrupt, jump to opcode-specific routine - - -// Floating-pt load single (lfs[u], lfsx[u]) - -a64LfsLfsx: - bl a64Load4Bytes // get data in r30 - mtctr r22 // set up address of "lfs fRT,emfp0(r31)" - stw r30,emfp0(r31) // put word here for aaFPopTable routine - bctrl // do the lfs - b a64UpdateCheck // update RA if 
necessary and exit - - -// Floating-pt store single (stfs[u], stfsx[u]) - -a64StfsStfsx: - ori r22,r22,8 // set dir==1 (ie, single store) in aaFPopTable - mtctr r22 // set up address of "stfs fRT,emfp0(r31)" - bctrl // execute the store into emfp0 - lwz r30,emfp0(r31) // get the word - bl a64Store4Bytes // store r30 into user space - b a64UpdateCheck // update RA if necessary and exit - - -// Floating-pt store as integer word (stfiwx) - -a64Stfiwx: - ori r22,r22,16+8 // set size=1, dir==1 (ie, double store) in aaFPopTable - mtctr r22 // set up FP register table address - bctrl // double precision store into emfp0 - lwz r30,emfp0+4(r31) // get the low-order word - bl a64Store4Bytes // store r30 into user space - b a64Exit // successfully emulated - - -// Floating-pt load double (lfd[u], lfdx[u]) - -a64LfdLfdx: - ori r22,r22,16 // set Double bit in aaFPopTable address - bl a64Load8Bytes // get data in r30 - mtctr r22 // set up address of "lfd fRT,emfp0(r31)" - std r30,emfp0(r31) // put doubleword here for aaFPopTable routine - bctrl // execute the load - b a64UpdateCheck // update RA if necessary and exit - - -// Floating-pt store double (stfd[u], stfdx[u]) - -a64StfdStfdx: - ori r22,r22,16+8 // set size=1, dir==1 (ie, double store) in aaFPopTable address - mtctr r22 // address of routine to stfd RT - bctrl // store into emfp0 - ld r30,emfp0(r31) // get the doubleword - bl a64Store8Bytes // store r30 into user space - b a64UpdateCheck // update RA if necessary and exit - - -// Load halfword w 0-fill (lhz[u], lhzx[u]) - -a64LhzLhzx: - bl a64Load2Bytes // load into r30 from user space (w 0-fill) - stdx r30,r14,r21 // store into RT slot in register file - b a64UpdateCheck // update RA if necessary and exit - - -// Load halfword w sign fill (lha[u], lhax[u]) - -a64LhaLhax: - bl a64Load2Bytes // load into r30 from user space (w 0-fill) - extsh r30,r30 // sign-extend - stdx r30,r14,r21 // store into RT slot in register file - b a64UpdateCheck // update RA if necessary and 
exit - - -// Load halfword byte reversed (lhbrx) - -a64Lhbrx: - bl a64Load2Bytes // load into r30 from user space (w 0-fill) - rlwinm r3,r30,8,16,23 // reverse bytes into r3 - rlwimi r3,r30,24,24,31 - stdx r3,r14,r21 // store into RT slot in register file - b a64Exit // successfully emulated - - -// Store halfword (sth[u], sthx[u]) - -a64SthSthx: - ldx r30,r14,r21 // get RT - bl a64Store2Bytes // store r30 into user space - b a64UpdateCheck // update RA if necessary and exit - - -// Store halfword byte reversed (sthbrx) - -a64Sthbrx: - addi r21,r21,6 // point to low two bytes of RT - lhbrx r30,r14,r21 // load and reverse - bl a64Store2Bytes // store r30 into user space - b a64Exit // successfully emulated - - -// Load word w 0-fill (lwz[u], lwzx[u]), also lwarx. - -a64LwzLwzxLwarx: - andc r3,r19,r20 // light bit 30 of r3 iff lwarx - andi. r0,r3,2 // is it lwarx? - bne-- a64PassAlong // yes, never try to emulate a lwarx - bl a64Load4Bytes // load 4 bytes from user space into r30 (0-filled) - stdx r30,r14,r21 // update register file - b a64UpdateCheck // update RA if necessary and exit - - -// Load word w sign fill (lwa, lwax[u]) - -a64Lwa: - crclr kUpdate // no update form of lwa (its a reserved encoding) -a64Lwax: - bl a64Load4Bytes // load 4 bytes from user space into r30 (0-filled) - extsw r30,r30 // sign extend - stdx r30,r14,r21 // update register file - b a64UpdateCheck // update RA if necessary and exit - - -// Load word byte reversed (lwbrx) - -a64Lwbrx: - bl a64Load4Bytes // load 4 bytes from user space into r30 (0-filled) - rlwinm r3,r30,24,0,31 // flip bytes 1234 to 4123 - rlwimi r3,r30,8,8,15 // r3 is now 4323 - rlwimi r3,r30,8,24,31 // r3 is now 4321 - stdx r3,r14,r21 // update register file - b a64Exit // successfully emulated - - -// Store word (stw[u], stwx[u]) - -a64StwStwx: - ldx r30,r14,r21 // get RT - bl a64Store4Bytes // store r30 into user space - b a64UpdateCheck // update RA if necessary and exit - - -// Store word byte reversed (stwbrx) - 
-a64Stwbrx: - addi r21,r21,4 // point to low word of RT - lwbrx r30,r14,r21 // load and reverse - bl a64Store4Bytes // store r30 into user space - b a64Exit // successfully emulated - - -// Load doubleword (ld[u], ldx[u]), also lwa. - -a64LdLwa: // these are DS form: ld=0, ldu=1, and lwa=2 - mtcrf 0x01,r20 // move DS field to cr7 - rlwinm r3,r20,0,30,31 // must adjust EA by subtracting DS field - sub r12,r12,r3 // subtract from full 64-bit EA - and r17,r12,r15 // then re-clamp to 32 bits if necessary - bt 30,a64Lwa // handle lwa - crmove kUpdate,31 // if opcode bit 31 is set, it is ldu so set update flag -a64Ldx: - bl a64Load8Bytes // load 8 bytes from user space into r30 - stdx r30,r14,r21 // update register file - b a64UpdateCheck // update RA if necessary and exit - - -// Store doubleword (stdx[u], std[u], stwcx) - -a64StdxStwcx: - bf-- 30,a64PassAlong // stwcx, so pass along alignment exception - b a64Stdx // was stdx -a64StdStfiwx: // if DS form: 0=std, 1=stdu, 2-3=undefined - bt 30,a64Stfiwx // handle stfiwx - rlwinm r3,r20,0,30,31 // must adjust EA by subtracting DS field - mtcrf 0x01,r20 // move DS field to cr7 - sub r12,r12,r3 // subtract from full 64-bit EA - and r17,r12,r15 // then re-clamp to 32 bits if necessary - crmove kUpdate,31 // if DS==1, then it is update form -a64Stdx: - ldx r30,r14,r21 // get RT - bl a64Store8Bytes // store RT into user space - b a64UpdateCheck // update RA if necessary and exit - - -// Dcbz and Dcbz128 (bit 10 distinguishes the two forms) - -a64DcbzDcbz128: - andis. r0,r20,0x0020 // bit 10 set? 
- li r3,0 // get a 0 to store - li r0,4 // assume 32-bit version, store 8 bytes 4x - rldicr r17,r17,0,63-5 // 32-byte align EA - li r4,_COMM_PAGE_BASE_ADDRESS - beq a64DcbzSetup // it was the 32-byte version - rldicr r17,r17,0,63-7 // zero low 7 bits of EA - li r0,16 // store 8 bytes 16x -a64DcbzSetup: - sub r4,r28,r4 // get instruction offset from start of commpage - and r4,r4,r15 // mask off high-order bits if 32-bit mode - cmpldi r4,_COMM_PAGE_AREA_USED // did fault occur in commpage area? - bge a64NotCommpage // not in commpage - rlwinm. r4,r29,0,MSR_PR_BIT,MSR_PR_BIT // did fault occur in user mode? - beq-- a64NotCommpage // do not zero cr7 if kernel got alignment exception - lwz r4,savecr(r13) // if we take a dcbz{128} in the commpage... - rlwinm r4,r4,0,0,27 // ...clear user's cr7... - stw r4,savecr(r13) // ...as a flag for commpage code -a64NotCommpage: - mtctr r0 - cmpw r0,r0 // turn cr0 beq on so we can check for DSIs - mtmsr r25 // turn on DR and RI so we can address user space - isync // wait for it to happen -a64DcbzLoop: - std r3,0(r17) // store into user space - bne-- a64RedriveAsDSI - addi r17,r17,8 - bdnz a64DcbzLoop - - mtmsr r26 // restore MSR - isync // wait for it to happen - b a64Exit - - -// Load and store multiple (lmw, stmw), distinguished by bit 25 - -a64LmwStmw: - subfic r22,r21,32*8 // how many regs to load or store? - srwi r22,r22,1 // get bytes to load/store - bf 25,a64LoadMultiple // handle lmw - b a64StoreMultiple // it was stmw - - -// Load string word immediate (lswi) - -a64Lswi: - rlwinm r22,r20,21,27,31 // get #bytes in r22 - and r17,r18,r15 // recompute EA as (RA|0), and clamp - subi r3,r22,1 // r22==0? - rlwimi r22,r3,6,26,26 // map count of 0 to 32 - b a64LoadMultiple - - -// Store string word immediate (stswi) - -a64Stswi: - rlwinm r22,r20,21,27,31 // get #bytes in r22 - and r17,r18,r15 // recompute EA as (RA|0), and clamp - subi r3,r22,1 // r22==0? 
- rlwimi r22,r3,6,26,26 // map count of 0 to 32 - b a64StoreMultiple - - -// Load string word indexed (lswx), also lwbrx - -a64LswxLwbrx: - bf 30,a64Lwbrx // was lwbrx - ld r22,savexer(r13) // get the xer - rlwinm r22,r22,0,25,31 // isolate the byte count - b a64LoadMultiple // join common code - - -// Store string word indexed (stswx), also stwbrx - -a64StswxStwbrx: - bf 30,a64Stwbrx // was stwbrx - ld r22,savexer(r13) // get the xer - rlwinm r22,r22,0,25,31 // isolate the byte count - b a64StoreMultiple // join common code - - -// Load multiple words. This handles lmw, lswi, and lswx. - -a64LoadMultiple: // r22 = byte count, may be 0 - subic. r3,r22,1 // get (#bytes-1) - blt a64Exit // done if 0 - add r4,r17,r3 // get EA of last operand byte - and r4,r4,r15 // clamp - cmpld r4,r17 // address space wrap? - blt-- a64PassAlong // pass along exception if so - srwi. r4,r22,2 // get # full words to load - rlwinm r22,r22,0,30,31 // r22 <- leftover byte count - cmpwi cr1,r22,0 // leftover bytes? 
- beq a64Lm3 // no words - mtctr r4 // set up word count - cmpw r0,r0 // set beq for DSI test -a64Lm2: - mtmsr r25 // turn on DR and RI - isync // wait for it to happen - lbz r3,0(r17) - bne-- a64RedriveAsDSI // got a DSI - lbz r4,1(r17) - bne-- a64RedriveAsDSI // got a DSI - lbz r5,2(r17) - bne-- a64RedriveAsDSI // got a DSI - lbz r6,3(r17) - bne-- a64RedriveAsDSI // got a DSI - rlwinm r30,r3,24,0,7 // pack bytes into r30 - rldimi r30,r4,16,40 - rldimi r30,r5,8,48 - rldimi r30,r6,0,56 - mtmsr r26 // turn DR back off so we can store into register file - isync - addi r17,r17,4 // bump EA - stdx r30,r14,r21 // pack into register file - addi r21,r21,8 // bump register file offset - rlwinm r21,r21,0,24,28 // wrap around to 0 - bdnz a64Lm2 -a64Lm3: // cr1/r22 = leftover bytes (0-3), cr0 beq set - beq cr1,a64Exit // no leftover bytes - mtctr r22 - mtmsr r25 // turn on DR so we can access user space - isync - lbz r3,0(r17) // get 1st leftover byte - bne-- a64RedriveAsDSI // got a DSI - rlwinm r30,r3,24,0,7 // position in byte 4 of r30 (and clear rest of r30) - bdz a64Lm4 // only 1 byte leftover - lbz r3,1(r17) // get 2nd byte - bne-- a64RedriveAsDSI // got a DSI - rldimi r30,r3,16,40 // insert into byte 5 of r30 - bdz a64Lm4 // only 2 bytes leftover - lbz r3,2(r17) // get 3rd byte - bne-- a64RedriveAsDSI // got a DSI - rldimi r30,r3,8,48 // insert into byte 6 -a64Lm4: - mtmsr r26 // turn DR back off so we can store into register file - isync - stdx r30,r14,r21 // pack partially-filled word into register file - b a64Exit - - -// Store multiple words. This handles stmw, stswi, and stswx. - -a64StoreMultiple: // r22 = byte count, may be 0 - subic. r3,r22,1 // get (#bytes-1) - blt a64Exit // done if 0 - add r4,r17,r3 // get EA of last operand byte - and r4,r4,r15 // clamp - cmpld r4,r17 // address space wrap? - blt-- a64PassAlong // pass along exception if so - srwi. 
r4,r22,2 // get # full words to load - rlwinm r22,r22,0,30,31 // r22 <- leftover byte count - cmpwi cr1,r22,0 // leftover bytes? - beq a64Sm3 // no words - mtctr r4 // set up word count - cmpw r0,r0 // turn on beq so we can check for DSIs -a64Sm2: - ldx r30,r14,r21 // get next register - addi r21,r21,8 // bump register file offset - rlwinm r21,r21,0,24,28 // wrap around to 0 - srwi r3,r30,24 // shift the four bytes into position - srwi r4,r30,16 - srwi r5,r30,8 - mtmsr r25 // turn on DR so we can access user space - isync // wait for it to happen - stb r3,0(r17) - bne-- a64RedriveAsDSI // got a DSI - stb r4,1(r17) - bne-- a64RedriveAsDSI // got a DSI - stb r5,2(r17) - bne-- a64RedriveAsDSI // got a DSI - stb r30,3(r17) - bne-- a64RedriveAsDSI // got a DSI - mtmsr r26 // turn DR back off - isync - addi r17,r17,4 // bump EA - bdnz a64Sm2 -a64Sm3: // r22 = 0-3, cr1 set on r22, cr0 beq set - beq cr1,a64Exit // no leftover bytes - ldx r30,r14,r21 // get last register - mtctr r22 - mtmsr r25 // turn on DR so we can access user space - isync // wait for it to happen -a64Sm4: - rlwinm r30,r30,8,0,31 // position next byte - stb r30,0(r17) // pack into user space - addi r17,r17,1 // bump user space ptr - bne-- a64RedriveAsDSI // got a DSI - bdnz a64Sm4 - mtmsr r26 // turn DR back off - isync - b a64Exit - - -// Subroutines to load bytes from user space. - -a64Load2Bytes: // load 2 bytes right-justified into r30 - addi r7,r17,1 // get EA of last byte - and r7,r7,r15 // clamp - cmpld r7,r17 // address wrap? - blt-- a64PassAlong // yes - mtmsr r25 // turn on DR so we can access user space - isync // wait for it to happen - sub. r30,r30,r30 // 0-fill dest and set beq - b a64Load2 // jump into routine -a64Load4Bytes: // load 4 bytes right-justified into r30 (ie, low order word) - addi r7,r17,3 // get EA of last byte - and r7,r7,r15 // clamp - cmpld r7,r17 // address wrap? 
- blt-- a64PassAlong // yes - mtmsr r25 // turn on DR so we can access user space - isync // wait for it to happen - sub. r30,r30,r30 // 0-fill dest and set beq - b a64Load4 // jump into routine -a64Load8Bytes: // load 8 bytes into r30 - addi r7,r17,7 // get EA of last byte - and r7,r7,r15 // clamp - cmpld r7,r17 // address wrap? - blt-- a64PassAlong // yes - mtmsr r25 // turn on DR so we can access user space - isync // wait for it to happen - sub. r30,r30,r30 // 0-fill dest and set beq - lbz r3,-7(r7) // get byte 0 - bne-- a64RedriveAsDSI // got a DSI - lbz r4,-6(r7) // and byte 1, etc - bne-- a64RedriveAsDSI // got a DSI - lbz r5,-5(r7) - bne-- a64RedriveAsDSI // got a DSI - lbz r6,-4(r7) - bne-- a64RedriveAsDSI // got a DSI - rldimi r30,r3,56,0 // position bytes in upper word - rldimi r30,r4,48,8 - rldimi r30,r5,40,16 - rldimi r30,r6,32,24 -a64Load4: - lbz r3,-3(r7) - bne-- a64RedriveAsDSI // got a DSI - lbz r4,-2(r7) - bne-- a64RedriveAsDSI // got a DSI - rldimi r30,r3,24,32 // insert bytes 4 and 5 into r30 - rldimi r30,r4,16,40 -a64Load2: - lbz r3,-1(r7) - bne-- a64RedriveAsDSI // got a DSI - lbz r4,0(r7) - bne-- a64RedriveAsDSI // got a DSI - mtmsr r26 // turn DR back off - isync - rldimi r30,r3,8,48 // insert bytes 6 and 7 into r30 - rldimi r30,r4,0,56 - blr - - -// Subroutines to store bytes into user space. - -a64Store2Bytes: // store bytes 6 and 7 of r30 - addi r7,r17,1 // get EA of last byte - and r7,r7,r15 // clamp - cmpld r7,r17 // address wrap? - blt-- a64PassAlong // yes - mtmsr r25 // turn on DR so we can access user space - isync // wait for it to happen - cmpw r0,r0 // set beq so we can check for DSI - b a64Store2 // jump into routine -a64Store4Bytes: // store bytes 4-7 of r30 (ie, low order word) - addi r7,r17,3 // get EA of last byte - and r7,r7,r15 // clamp - cmpld r7,r17 // address wrap? 
- blt-- a64PassAlong // yes - mtmsr r25 // turn on DR so we can access user space - isync // wait for it to happen - cmpw r0,r0 // set beq so we can check for DSI - b a64Store4 // jump into routine -a64Store8Bytes: // r30 = bytes - addi r7,r17,7 // get EA of last byte - and r7,r7,r15 // clamp - cmpld r7,r17 // address wrap? - blt-- a64PassAlong // yes - mtmsr r25 // turn on DR so we can access user space - isync // wait for it to happen - cmpw r0,r0 // set beq so we can check for DSI - rotldi r3,r30,8 // shift byte 0 into position - rotldi r4,r30,16 // and byte 1 - rotldi r5,r30,24 // and byte 2 - rotldi r6,r30,32 // and byte 3 - stb r3,-7(r7) // store byte 0 - bne-- a64RedriveAsDSI // got a DSI - stb r4,-6(r7) // and byte 1 etc... - bne-- a64RedriveAsDSI // got a DSI - stb r5,-5(r7) - bne-- a64RedriveAsDSI // got a DSI - stb r6,-4(r7) - bne-- a64RedriveAsDSI // got a DSI -a64Store4: - rotldi r3,r30,40 // shift byte 4 into position - rotldi r4,r30,48 // and byte 5 - stb r3,-3(r7) - bne-- a64RedriveAsDSI // got a DSI - stb r4,-2(r7) - bne-- a64RedriveAsDSI // got a DSI -a64Store2: - rotldi r3,r30,56 // shift byte 6 into position - stb r3,-1(r7) // store byte 6 - bne-- a64RedriveAsDSI // got a DSI - stb r30,0(r7) // store byte 7, which is already positioned - bne-- a64RedriveAsDSI // got a DSI - mtmsr r26 // turn off DR - isync - blr - - -// Exit routines. - -a64ExitEm: - li r30,T_EMULATE // Change exception code to emulate - stw r30,saveexception(r13) // Save it - b a64Exit // Join standard exit routine... - -a64PassAlong: // unhandled exception, just pass it along - li r0,1 // Set that the alignment/program exception was not emulated - crset kNotify // return T_ALIGNMENT or T_PROGRAM - stw r0,savemisc3(r13) // Set that emulation was not done - crclr kTrace // not a trace interrupt - b a64Exit1 -a64UpdateCheck: // successfully emulated, may be update form - bf kUpdate,a64Exit // update? 
- stdx r12,r14,r16 // yes, store 64-bit EA into RA -a64Exit: // instruction successfully emulated - addi r28,r28,4 // bump SRR0 past the emulated instruction - li r30,T_IN_VAIN // eat the interrupt since we emulated it - and r28,r28,r15 // clamp to address space size (32 vs 64) - std r28,savesrr0(r13) // save, so we return to next instruction -a64Exit1: - bt-- kTrace,a64Trace // were we in single-step at fault? - bt-- kNotify,a64Notify // should we say T_ALIGNMENT anyway? -a64Exit2: - mcrf cr6,cr3 // restore feature flags - mr r11,r30 // pass back exception code (T_IN_VAIN etc) in r11 - b EXT(EmulExit) // return to exception processing - - -// Notification requested: pass exception upstairs even though it might have been emulated. - -a64Notify: - li r30,T_ALIGNMENT // somebody wants to know about it (but don't redrive) - bt kAlignment,a64Exit2 // was an alignment exception - li r30,T_PROGRAM // was an emulated instruction - b a64Exit2 - - -// Emulate a trace interrupt after handling alignment interrupt. - -a64Trace: - lwz r9,SAVflags(r13) // get the save-area flags - li r30,T_TRACE - oris r9,r9,hi16(SAVredrive) // Set the redrive bit - stw r30,saveexception(r13) // Set the exception code - stw r9,SAVflags(r13) // Set the flags - b a64Exit2 // Exit and do trace interrupt... - - -// Got a DSI accessing user space. Redrive. One way this can happen is if another -// processor removes a mapping while we are emulating. 
- -a64RedriveAsISI: // this DSI happened fetching the opcode (r1==DSISR r4==DAR) - mtmsr r26 // turn DR back off - isync // wait for it to happen - li r30,T_INSTRUCTION_ACCESS - rlwimi r29,r1,0,1,4 // insert the fault type from DSI's DSISR - std r29,savesrr1(r13) // update SRR1 to look like an ISI - b a64Redrive - -a64RedriveAsDSI: // r0==DAR r1==DSISR - mtmsr r26 // turn DR back off - isync // wait for it to happen - stw r1,savedsisr(r13) // Set the DSISR of failed access - std r0,savedar(r13) // Set the address of the failed access - li r30,T_DATA_ACCESS // Set failing data access code -a64Redrive: - lwz r9,SAVflags(r13) // Pick up the flags - stw r30,saveexception(r13) // Set the replacement code - oris r9,r9,hi16(SAVredrive) // Set the redrive bit - stw r9,SAVflags(r13) // Set redrive request - crclr kTrace // don't take a trace interrupt - crclr kNotify // don't pass alignment exception - b a64Exit2 // done - - -// This is the branch table, indexed by the "AAAAB" opcode hash. - -a64BranchTable: - .long a64LwzLwzxLwarx // 00000 lwz[u], lwzx[u], lwarx - .long a64Ldx // 00001 ldx[u] - .long a64PassAlong // 00010 ldarx (never emulate these) - .long a64PassAlong // 00011 - .long a64StwStwx // 00100 stw[u], stwx[u] - .long a64StdxStwcx // 00101 stdx[u], stwcx - .long a64PassAlong // 00110 - .long a64PassAlong // 00111 stdcx (never emulate these) - .long a64LhzLhzx // 01000 lhz[u], lhzx[u] - .long a64PassAlong // 01001 - .long a64LhaLhax // 01010 lha[u], lhax[u] - .long a64Lwax // 01011 lwax[u] - .long a64SthSthx // 01100 sth[u], sthx[u] - .long a64PassAlong // 01101 - .long a64LmwStmw // 01110 lmw, stmw - .long a64PassAlong // 01111 - .long a64LfsLfsx // 10000 lfs[u], lfsx[u] - .long a64LswxLwbrx // 10001 lswx, lwbrx - .long a64LfdLfdx // 10010 lfd[u], lfdx[u] - .long a64Lswi // 10011 lswi - .long a64StfsStfsx // 10100 stfs[u], stfsx[u] - .long a64StswxStwbrx // 10101 stswx, stwbrx - .long a64StfdStfdx // 10110 stfd[u], stfdx[u] - .long a64Stswi // 10111 stswi - 
.long a64PassAlong // 11000 - .long a64Lhbrx // 11001 lhbrx - .long a64LdLwa // 11010 ld[u], lwa - .long a64PassAlong // 11011 - .long a64PassAlong // 11100 - .long a64Sthbrx // 11101 sthbrx - .long a64StdStfiwx // 11110 std[u], stfiwx - .long a64DcbzDcbz128 // 11111 dcbz, dcbz128 - - diff --git a/osfmk/ppc/Firmware.h b/osfmk/ppc/Firmware.h deleted file mode 100644 index c0a57f6f8..000000000 --- a/osfmk/ppc/Firmware.h +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_FREE_COPYRIGHT@ - */ -/* - * @APPLE_FREE_COPYRIGHT@ - */ - -/* - * Here be the firmware's public interfaces - * Lovingly crafted by Bill Angell using traditional methods -*/ - -#ifndef _FIRMWARE_H_ -#define _FIRMWARE_H_ - -#ifndef __ppc__ -#error This file is only useful on PowerPC. -#endif - -#include -#include - -/* - * This routine is used to write debug output to either the modem or printer port. - * parm 1 is printer (0) or modem (1); parm 2 is ID (printed directly); parm 3 converted to hex - */ - -void dbgDisp(unsigned int port, unsigned int id, unsigned int data); -void dbgLog(unsigned int d0, unsigned int d1, unsigned int d2, unsigned int d3); -void dbgLog2(unsigned int type, unsigned int p1, unsigned int p2); -void dbgDispLL(unsigned int port, unsigned int id, unsigned int data); -void fwSCCinit(unsigned int port); -void fwEmMck(unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int); /* Start injecting */ -void fwSCOM(scomcomm *); /* Read/Write SCOM */ -void setPmon(unsigned int, unsigned int); /* Set perf mon stuff */ - -extern void dbgTrace(unsigned int id, unsigned int item1, unsigned int item2, unsigned int item3, unsigned int item4); -#if 0 /* (TEST/DEBUG) - eliminate inline */ -extern __inline__ void dbgTrace(unsigned int id, unsigned int item1, unsigned int item2, unsigned int item3, unsigned int item4) { - - __asm__ volatile("mr r2,%0" : : "r" (id) : "r2"); - __asm__ volatile("mr r3,%0" : : "r" (item1) : "r3"); - __asm__ volatile("mr r4,%0" : : "r" (item2) : "r4"); - __asm__ volatile("mr r5,%0" : : "r" (item3) : "r5"); - __asm__ volatile("mr r6,%0" : : "r" (item3) : "r6"); - __asm__ volatile("lis r0,hi16(CutTrace)" : : : "r0"); - __asm__ volatile("ori r0,r0,lo16(CutTrace)" : : : "r0"); - __asm__ volatile("sc"); - return; -} -#endif - -extern void DoPreempt(void); -extern __inline__ void DoPreempt(void) { - __asm__ volatile("lis r0,hi16(DoPreemptCall)" : : 
: "r0"); - __asm__ volatile("ori r0,r0,lo16(DoPreemptCall)" : : : "r0"); - __asm__ volatile("sc"); - return; -} - -extern void CreateFakeIO(void); -extern __inline__ void CreateFakeIO(void) { - __asm__ volatile("lis r0,hi16(CreateFakeIOCall)" : : : "r0"); - __asm__ volatile("ori r0,r0,lo16(CreateFakeIOCall)" : : : "r0"); - __asm__ volatile("sc"); - return; -} - -extern void CreateFakeDEC(void); -extern __inline__ void CreateFakeDEC(void) { - __asm__ volatile("lis r0,hi16(CreateFakeDECCall)" : : : "r0"); - __asm__ volatile("ori r0,r0,lo16(CreateFakeDECCall)" : : : "r0"); - __asm__ volatile("sc"); - return; -} - -extern void CreateShutdownCTX(void); -extern __inline__ void CreateShutdownCTX(void) { - __asm__ volatile("lis r0,hi16(CreateShutdownCTXCall)" : : : "r0"); - __asm__ volatile("ori r0,r0,lo16(CreateShutdownCTXCall)" : : : "r0"); - __asm__ volatile("sc"); - return; -} - -extern void ChokeSys(unsigned int ercd); -extern __inline__ void ChokeSys(unsigned int ercd) { - __asm__ volatile("mr r3,%0" : : "r" (ercd) : "r3"); - __asm__ volatile("lis r0,hi16(Choke)" : : : "r0"); - __asm__ volatile("ori r0,r0,lo16(Choke)" : : : "r0"); - __asm__ volatile("sc"); - return; -} - -typedef struct Boot_Video bootBumbleC; - -extern void StoreReal(unsigned int val, unsigned int addr); -extern void ReadReal(addr64_t raddr, unsigned int *vaddr); -extern void ClearReal(unsigned int addr, unsigned int lgn); -extern void LoadDBATs(unsigned int *bat); -extern void LoadIBATs(unsigned int *bat); -extern void stFloat(unsigned int *addr); -extern int stVectors(unsigned int *addr); -extern int stSpecrs(unsigned int *addr); -extern unsigned int LLTraceSet(unsigned int tflags); -extern void GratefulDebInit(bootBumbleC *boot_video_info); -extern void GratefulDebDisp(unsigned int coord, unsigned int data); -extern void checkNMI(void); - -#pragma pack(4) /* Make sure the structure stays as we defined it */ -typedef struct GDWorkArea { /* Grateful Deb work area one per processor */ - -/* Note 
that a lot of info is duplicated for each processor */ - - unsigned int GDsave[32]; /* Save area for registers */ - - unsigned int GDfp0[2]; - unsigned int GDfp1[2]; - unsigned int GDfp2[2]; - unsigned int GDfp3[2]; - - unsigned int GDtop; /* Top pixel of CPU's window */ - unsigned int GDleft; /* Left pixel of CPU's window */ - unsigned int GDtopleft; /* Physical address of top left in frame buffer */ - unsigned int GDrowbytes; /* Bytes per row */ - unsigned int GDrowchar; /* Bytes per row of characters plus leading */ - unsigned int GDdepth; /* Bits per pixel */ - unsigned int GDcollgn; /* Column width in bytes */ - unsigned int GDready; /* We are ready to go */ - unsigned int GDfiller[16]; /* Fill it up to a 256 byte boundary */ - - unsigned int GDrowbuf1[128]; /* Buffer to an 8 character row */ - unsigned int GDrowbuf2[128]; /* Buffer to an 8 character row */ - -} GDWorkArea; -#pragma pack() -#define GDfontsize 16 -#define GDdispcols 2 - -#endif /* _FIRMWARE_H_ */ diff --git a/osfmk/ppc/Firmware.s b/osfmk/ppc/Firmware.s deleted file mode 100644 index d5f687f34..000000000 --- a/osfmk/ppc/Firmware.s +++ /dev/null @@ -1,2517 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_FREE_COPYRIGHT@ - */ -/* - * @APPLE_FREE_COPYRIGHT@ - */ - -/* - Firmware.s - - Handle things that should be treated as an extension of the hardware - - Lovingly crafted by Bill Angell using traditional methods and only natural or recycled materials. - No animal products are used other than rendered otter bile and deep fried pork lard. - -*/ - -#include -#include -#include -#include -#include -#include - - -/* - * Here we generate the table of supported firmware calls - */ - - - - .data - .align 5 /* Line up on cache line */ - - .globl EXT(FWtable) - -EXT(FWtable): - - .globl CutTrace /* Let everyone know 'bout it */ - .set CutTrace,(.-EXT(FWtable))/4|0x80000000 /* Call number for CutTrace */ - .long callUnimp /* This was already handled in lowmem_vectors */ - -#include - - .set EXT(FirmwareCnt), (.-EXT(FWtable))/4 /* Get the top number */ - - .text - -#define SIMPLESCC 1 -#define NOTQUITEASSIMPLE 1 -/* - * This routine handles the firmware call routine. It must be entered with IR and DR off, - * interruptions disabled, and in supervisor state. - * - * When we enter, we expect R0 to have call number, and LR - * to point to the return. Also, all registers saved in savearea in R13. - * R3 is as passed in by the user. 
All others must be gotten from the save area - */ - - - .align 5 - .globl EXT(FirmwareCall) - -LEXT(FirmwareCall) - - rlwinm r1,r0,2,1,29 /* Clear out bit 0 and multiply by 4 */ - lis r12,HIGH_ADDR(EXT(FWtable)) /* Get the high part of the firmware call table */ - cmplwi r1,EXT(FirmwareCnt)*4 /* Is it a valid firmware call number */ - ori r12,r12,LOW_ADDR(EXT(FWtable)) /* Now the low part */ - ble+ goodCall /* Yeah, it is... */ - - li r3,T_SYSTEM_CALL /* Tell the vector handler that we know nothing */ - b EXT(FCReturn) ; Bye dudes... - -goodCall: mfsprg r10,0 /* Make sure about the per_proc block */ - lwzx r1,r1,r12 /* Pick up the address of the routine */ - lwz r4,saver4+4(r13) /* Pass in caller's R4 */ - lwz r5,saver5+4(r13) /* Pass in caller's R5 */ - rlwinm. r1,r1,0,0,29 /* Make sure the flag bits are clear */ - - mtlr r1 /* Put it in the LR */ - beq- callUnimp /* This one was unimplimented... */ - - blrl /* Call the routine... */ - - stw r3,saver3+4(r13) /* Pass back the return code to caller */ - li r3,T_IN_VAIN /* Tell the vector handler that we took care of it */ - b EXT(FCReturn) ; Bye dudes... - -callUnimp: li r3,T_SYSTEM_CALL /* Tell the vector handler that we know nothing */ - b EXT(FCReturn) ; Bye dudes... - -/* - * This routine is used to store using a real address. It stores parmeter1 at parameter2. - */ - - .align 5 - .globl EXT(StoreReal) - -LEXT(StoreReal) - - lis r0,HIGH_ADDR(StoreRealCall) /* Get the top part of the SC number */ - ori r0,r0,LOW_ADDR(StoreRealCall) /* and the bottom part */ - sc /* Do it to it */ - blr /* Bye bye, Birdie... */ - - .align 5 - .globl EXT(StoreRealLL) - -LEXT(StoreRealLL) - - stw r3,0(r4) /* Store the word */ - blr /* Leave... */ - -/* - * This routine is used to clear a range of physical pages. 
- */ - - .align 5 - .globl EXT(ClearReal) - -LEXT(ClearReal) - - lis r0,HIGH_ADDR(ClearRealCall) /* Get the top part of the SC number */ - ori r0,r0,LOW_ADDR(ClearRealCall) /* and the bottom part */ - sc /* Do it to it */ - blr /* Bye bye, Birdie... */ - - - .align 5 - .globl EXT(ClearRealLL) - -LEXT(ClearRealLL) - -/* - * We take the first parameter as a physical address. The second is the length in bytes. - * Being crazy, I'll round the address down, and the length up. We could end up clearing - * an extra page at the start and one at the end, but we don't really care. If someone - * is stupid enough to give me unaligned addresses and lengths, I am just arrogant enough - * to take them at their word and to hell with them. - */ - - neg r5,r3 /* Negate the address */ - addi r4,r4,4095 /* Round length up */ - rlwinm r5,r5,0,20,31 /* Save extra length */ - rlwinm r3,r3,0,0,19 /* Round the page on down */ - add r4,r4,r5 /* Add up all extra lengths */ - li r6,32 /* Get a displacement */ - rlwinm r4,r4,0,0,19 /* Round the length back down */ - -clrloop: subi r4,r4,32 /* Back off a cache line */ - dcbz 0,r3 /* Do the even line */ - sub. r4,r4,r6 /* Back off a second time (we only do this to generate a CR */ - dcbz r6,r3 /* Clear the even line */ - addi r3,r3,64 /* Move up to every other line */ - bgt+ clrloop /* Go until we've done it all... */ - - blr /* Leave... */ -/* - * This routine will read in 32 byte of real storage. - */ - - .align 5 - .globl EXT(ReadReal) - -LEXT(ReadReal) - - mfsprg r9,2 ; Get the features - mfmsr r0 ; Get the MSR - li r8,lo16(MASK(MSR_DR)) ; Get the DR bit - rlwinm. r9,r9,0,pf64Bitb,pf64Bitb ; Are we 64-bit? - ori r8,r8,lo16(MASK(MSR_EE)) ; Add in the EE bit - li r7,1 ; Get set for it - andc r8,r0,r8 ; Turn off EE and DR - bt-- cr0_eq,rr32a ; Yes, we are... 
- - rldimi r8,r7,63,MSR_SF_BIT ; Set SF bit (bit 0) - sldi r3,r3,32 ; Slide on over for true 64-bit address - mtmsrd r8 - isync - or r3,r3,r4 ; Join top and bottom of address - mr r4,r5 ; Set destination address - b rrJoina ; Join on up... - -rr32a: mr r3,r4 ; Position bottom of long long - mr r4,r5 ; Set destination address - mtmsr r8 /* Disable EE and DR */ - isync /* Just make sure about it */ - -rrJoina: lwz r5,0(r3) /* Get word 0 */ - lwz r6,4(r3) /* Get word 1 */ - lwz r7,8(r3) /* Get word 2 */ - lwz r8,12(r3) /* Get word 3 */ - lis r2,hi16(MASK(MSR_VEC)) ; Get the vector enable - lwz r9,16(r3) /* Get word 4 */ - ori r2,r2,lo16(MASK(MSR_FP)) ; Get the FP enable - lwz r10,20(r3) /* Get word 5 */ - andc r0,r0,r2 ; Clear VEC and FP enables - lwz r11,24(r3) /* Get word 6 */ - lwz r12,28(r3) /* Get word 7 */ - - bt-- cr0_eq,rr32b ; We are not 64-bit... - - mtmsrd r0 - isync - b rrJoinb ; Join on up... - -rr32b: mtmsr r0 /* Restore original machine state */ - isync /* Insure goodness */ - -rrJoinb: stw r5,0(r4) /* Set word 0 */ - stw r6,4(r4) /* Set word 1 */ - stw r7,8(r4) /* Set word 2 */ - stw r8,12(r4) /* Set word 3 */ - stw r9,16(r4) /* Set word 4 */ - stw r10,20(r4) /* Set word 5 */ - stw r11,24(r4) /* Set word 6 */ - stw r12,28(r4) /* Set word 7 */ - - blr - - -/* - * This routine is used to load all 4 DBATs. - */ - - .align 5 - .globl EXT(LoadDBATs) - -LEXT(LoadDBATs) - - - lis r0,HIGH_ADDR(LoadDBATsCall) /* Top half of LoadDBATsCall firmware call number */ - ori r0,r0,LOW_ADDR(LoadDBATsCall) /* Bottom half */ - sc /* Do it to it */ - - blr /* Bye bye, Birdie... 
*/ - - - .align 5 - .globl EXT(xLoadDBATsLL) - -LEXT(xLoadDBATsLL) - - lwz r4,0(r3) /* Get DBAT 0 high */ - lwz r5,4(r3) /* Get DBAT 0 low */ - lwz r6,8(r3) /* Get DBAT 1 high */ - lwz r7,12(r3) /* Get DBAT 1 low */ - lwz r8,16(r3) /* Get DBAT 2 high */ - lwz r9,20(r3) /* Get DBAT 2 low */ - lwz r10,24(r3) /* Get DBAT 3 high */ - lwz r11,28(r3) /* Get DBAT 3 low */ - - sync /* Common decency and the state law require that you wash your hands */ - mtdbatu 0,r4 /* Load DBAT 0 high */ - mtdbatl 0,r5 /* Load DBAT 0 low */ - mtdbatu 1,r6 /* Load DBAT 1 high */ - mtdbatl 1,r7 /* Load DBAT 1 low */ - mtdbatu 2,r8 /* Load DBAT 2 high */ - mtdbatl 2,r9 /* Load DBAT 2 low */ - mtdbatu 3,r10 /* Load DBAT 3 high */ - mtdbatl 3,r11 /* Load DBAT 3 low */ - sync /* Make sure it's done */ - isync /* Toss out anything new */ - - blr /* Leave... */ - -/* - * This routine is used to load all 4 IBATs. - */ - - .align 5 - .globl EXT(LoadIBATs) - -LEXT(LoadIBATs) - - - lis r0,HIGH_ADDR(LoadIBATsCall) /* Top half of LoadIBATsCall firmware call number */ - ori r0,r0,LOW_ADDR(LoadIBATsCall) /* Bottom half */ - sc /* Do it to it */ - blr /* Bye bye, Birdie... */ - - .align 5 - .globl EXT(xLoadIBATsLL) - -LEXT(xLoadIBATsLL) - - lwz r4,0(r3) /* Get IBAT 0 high */ - lwz r5,4(r3) /* Get IBAT 0 low */ - lwz r6,8(r3) /* Get IBAT 1 high */ - lwz r7,12(r3) /* Get IBAT 1 low */ - lwz r8,16(r3) /* Get IBAT 2 high */ - lwz r9,20(r3) /* Get IBAT 2 low */ - lwz r10,24(r3) /* Get IBAT 3 high */ - lwz r11,28(r3) /* Get IBAT 3 low */ - - sync /* Common decency and the state law require that you wash your hands */ - mtibatu 0,r4 /* Load IBAT 0 high */ - mtibatl 0,r5 /* Load IBAT 0 low */ - mtibatu 1,r6 /* Load IBAT 1 high */ - mtibatl 1,r7 /* Load IBAT 1 low */ - mtibatu 2,r8 /* Load IBAT 2 high */ - mtibatl 2,r9 /* Load IBAT 2 low */ - mtibatu 3,r10 /* Load IBAT 3 high */ - mtibatl 3,r11 /* Load IBAT 3 low */ - sync /* Make sure it's done */ - isync /* Toss out anything new */ - - blr /* Leave... 
*/ - - -/* - * This is the glue to call the CutTrace firmware call - * dbgTrace(id, p1, p2, p3, p4) - */ - - .align 5 - .globl EXT(dbgTrace) - -LEXT(dbgTrace) - - mr r2,r3 - mr r3,r4 - lis r0,HIGH_ADDR(CutTrace) /* Top half of CreateFakeIO firmware call number */ - mr r4,r5 - mr r5,r6 - ori r0,r0,LOW_ADDR(CutTrace) /* Bottom half */ - mr r6,r7 - sc /* Do it to it */ - blr /* Bye bye, Birdie... */ - -/* - * This is the glue to create a fake I/O interruption - */ - - .align 5 - .globl EXT(CreateFakeIO) - -LEXT(CreateFakeIO) - - lis r0,HIGH_ADDR(CreateFakeIOCall) /* Top half of CreateFakeIO firmware call number */ - ori r0,r0,LOW_ADDR(CreateFakeIOCall) /* Bottom half */ - sc /* Do it to it */ - blr /* Bye bye, Birdie... */ - -/* - * This is the glue to create a fake Dec interruption - */ - - .align 5 - .globl EXT(CreateFakeDEC) - -LEXT(CreateFakeDEC) - -#if 0 - mflr r4 ; (TEST/DEBUG) - bl EXT(ml_sense_nmi) ; (TEST/DEBUG) - mtlr r4 ; (TEST/DEBUG) -#endif - - lis r0,HIGH_ADDR(CreateFakeDECCall) /* Top half of CreateFakeDEC firmware call number */ - ori r0,r0,LOW_ADDR(CreateFakeDECCall) /* Bottom half */ - sc /* Do it to it */ - blr /* Bye bye, Birdie... */ - - -/* - * This is the glue to create a shutdown context - */ - - .align 5 - .globl EXT(CreateShutdownCTX) - -LEXT(CreateShutdownCTX) - - lis r0,HIGH_ADDR(CreateShutdownCTXCall) /* Top half of CreateFakeIO firmware call number */ - ori r0,r0,LOW_ADDR(CreateShutdownCTXCall) /* Bottom half */ - sc /* Do it to it */ - blr /* Bye bye, Birdie... */ - -/* - * This is the glue to choke system - */ - - .align 5 - .globl EXT(ChokeSys) - -LEXT(ChokeSys) - - lis r0,HIGH_ADDR(Choke) /* Top half of Choke firmware call number */ - ori r0,r0,LOW_ADDR(Choke) /* Bottom half */ - sc /* Do it to it */ - blr /* Bye bye, Birdie... */ - -/* - * Used to initialize the SCC for debugging output - */ - - - .align 5 - .globl EXT(fwSCCinit) - -LEXT(fwSCCinit) - - mfmsr r8 /* Save the MSR */ - mr. 
r3,r3 /* See if printer or modem */ - rlwinm r12,r8,0,28,25 /* Turn off translation */ - lis r10,0xF301 /* Set the top part */ - rlwinm r12,r12,0,17,15 /* Turn off interruptions */ -#if 0 - mtmsr r12 /* Smash the MSR */ - isync /* Make it clean */ -#endif - - ori r10,r10,0x2000 /* Assume the printer (this is the normal one) */ - beq+ fwSCCprnt /* It sure are... */ - ori r10,r10,0x0002 /* Move it over to the modem port */ - -fwSCCprnt: dcbf 0,r10 /* Insure it is out */ - sync - eieio - dcbi 0,r10 /* Toss it */ - sync - - - li r7,0x09 /* Set the register */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0x80 /* Reset channel A */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0x04 /* Set the register */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0x44 /* x16 clock, 1 stop bit */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0x03 /* Set the register */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0xC0 /* 8 bits per char */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0x05 /* Set the register */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0xE2 /* DTR mode, 8bit/char */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0x02 /* Set the register */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Make 
sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0x00 /* Vector 0 */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0x0A /* Set the register */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0x00 /* Clear misc controls */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0x0B /* Set the register */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0x50 /* B/R gen T/R */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0x0C /* Set the register */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0x0A /* 9600 baud low */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0x0D /* Set the register */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0x00 /* 9600 baud high */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0x03 /* Set the register */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0xC1 /* 8 bits/char, Rx enable */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0x05 /* Set the register */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync 
/* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0xEA /* 8 bits/char, Tx enable */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0x0E /* Set the register */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0x01 /* BR rate gen enable */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0x0F /* Set the register */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0x00 /* ints off */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0x10 /* Reset ext/stat ints */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0x10 /* Reset ext/stat ints */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0x01 /* Set the register */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0x10 /* int on Rx, no Tx int enable */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0x09 /* Set the register */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0x0A /* int on Rx, Tx int enable */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Master enable, no vector */ - dcbi 0,r10 - eieio - - li r7,0x09 /* Set the register */ - stb r7,0(r10) /* Set the register 
*/ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - li r7,0x02 /* No vector */ - stb r7,0(r10) /* Set the register */ - dcbf 0,r10 /* Force it out */ - sync /* Master enable, no vector */ - dcbi 0,r10 - eieio - - lbz r7,0(r10) /* Clear interrupts */ - sync /* Master enable, no vector */ - dcbi 0,r10 - eieio - -wSCCrdy: eieio /* Barricade it */ - lbz r7,0(r10) /* Get current status */ - dcbi 0,r10 - sync - andi. r7,r7,0x04 /* Is transmitter empty? */ - beq wSCCrdy /* Nope... */ - - eieio - -#if 0 - mtmsr r8 /* Restore 'rupts and TR */ - isync -#endif - blr /* Leave... */ - -/* - * This routine is used to write debug output to either the modem or printer port. - * parm 1 is printer (0) or modem (1); parm 2 is ID (printed directly); parm 3 converted to hex - */ - - .align 5 - .globl EXT(dbgDisp) - -LEXT(dbgDisp) - - mr r12,r0 /* Keep R0 pristene */ - lis r0,HIGH_ADDR(dbgDispCall) /* Top half of dbgDispCall firmware call number */ - ori r0,r0,LOW_ADDR(dbgDispCall) /* Bottom half */ - - sc /* Go display the stuff */ - - mr r0,r12 /* Restore R0 */ - blr /* Return... */ - -/* Here's the low-level part of dbgDisp */ - - .align 5 - .globl EXT(dbgDispLL) - -LEXT(dbgDispLL) - -dbgDispInt: mfmsr r8 /* Save the MSR */ - -#if 0 - lis r10,0xF301 /* (TEST/DEBUG) */ - ori r10,r10,0x2002 /* (TEST/DEBUG) */ - dcbf 0,r10 /* (TEST/DEBUG) */ - sync /* (TEST/DEBUG) */ - dcbi 0,r10 /* (TEST/DEBUG) */ - eieio /* (TEST/DEBUG) */ - li r7,0x35 /* (TEST/DEBUG) */ - stb r7,4(r10) /* (TEST/DEBUG) */ - - lis r7,10 /* (TEST/DEBUG) */ -spw6: addi r7,r7,-1 /* (TEST/DEBUG) */ - mr. 
r7,r7 /* (TEST/DEBUG) */ - bne- spw6 /* (TEST/DEBUG) */ - dcbf 0,r10 /* (TEST/DEBUG) */ - sync /* (TEST/DEBUG) */ - dcbi 0,r10 /* (TEST/DEBUG) */ - eieio /* (TEST/DEBUG) */ -#endif - - rlwinm r12,r8,0,28,25 /* Turn off translation */ - rlwinm r12,r12,0,17,15 /* Turn off interruptions */ - - mflr r11 /* Save the link register */ - -#if 0 - mr r7,r12 /* (TEST/DEBUG) */ - bl dumpr7 /* (TEST/DEBUG) */ -#endif - - mr. r3,r3 /* See if printer or modem */ - lis r10,0xF301 /* Set the top part */ - mr r3,r4 /* Copy the ID parameter */ - -#if 0 - mr r9,r12 /* (TEST/DEBUG) */ - - mtmsr r12 /* (TEST/DEBUG) */ - isync /* (TEST/DEBUG) */ - -#if 0 - mtmsr r8 /* (TEST/DEBUG) */ - isync /* (TEST/DEBUG) */ -#endif - - lis r12,0xF301 /* (TEST/DEBUG) */ - ori r12,r12,0x2002 /* (TEST/DEBUG) */ -#if 1 - dcbf 0,r12 /* (TEST/DEBUG) */ - sync /* (TEST/DEBUG) */ - dcbi 0,r12 /* (TEST/DEBUG) */ -#endif - -xqrw1: eieio /* (TEST/DEBUG) */ - lbz r7,0(r12) /* (TEST/DEBUG) */ - dcbi 0,r12 /* (TEST/DEBUG) */ - sync /* (TEST/DEBUG) */ - andi. r7,r7,0x04 /* (TEST/DEBUG) */ - beq xqrw1 /* (TEST/DEBUG) */ - - eieio /* (TEST/DEBUG) */ - li r7,0x36 /* (TEST/DEBUG) */ - stb r7,4(r12) /* (TEST/DEBUG) */ - eieio - dcbf 0,r12 /* (TEST/DEBUG) */ - sync /* (TEST/DEBUG) */ - dcbi 0,r12 /* (TEST/DEBUG) */ - eieio /* (TEST/DEBUG) */ - - - lis r7,10 /* (TEST/DEBUG) */ -spw7: addi r7,r7,-1 /* (TEST/DEBUG) */ - mr. r7,r7 /* (TEST/DEBUG) */ - bne- spw7 /* (TEST/DEBUG) */ - dcbf 0,r12 /* (TEST/DEBUG) */ - sync /* (TEST/DEBUG) */ - dcbi 0,r12 /* (TEST/DEBUG) */ - eieio /* (TEST/DEBUG) */ - mr r12,r9 /* (TEST/DEBUG) */ -#endif - - mtmsr r12 /* Smash the MSR */ - isync /* Make it clean */ - - -#if SIMPLESCC && !NOTQUITEASSIMPLE - ori r10,r10,0x3010 /* Assume the printer (this is the normal one) */ -#else - ori r10,r10,0x2000 /* Assume the printer (this is the normal one) */ -#endif - beq+ dbgDprintr /* It sure are... 
*/ -#if SIMPLESCC && !NOTQUITEASSIMPLE - ori r10,r10,0x0020 /* Move it over to the modem port */ -#else - ori r10,r10,0x0002 /* Move it over to the modem port */ - -#if !NOTQUITEASSIMPLE - lis r7,0xF300 /* Address of SCC rounded to 128k */ - ori r7,r7,0x0032 /* Make it cache inhibited */ - mtdbatl 3,r7 /* Load DBAT 3 low */ - lis r7,0xF300 /* Address of SCC rounded to 128k */ - ori r7,r7,0x0002 /* Make it supervisor only */ - mtdbatu 3,r7 /* Load DBAT 3 high */ - ori r12,r12,0x0010 /* Turn on DR */ - mtmsr r12 /* Smash the MSR */ - isync /* Make it clean */ - -#endif -#endif - -dbgDprintr: sync -#if 0 - mr r7,r10 /* (TEST/DEBUG) */ - bl dumpr7 /* (TEST/DEBUG) */ -#endif - - dcbi 0,r10 /* Toss it */ - eieio - -#if 0 - lis r12,0xF301 /* (TEST/DEBUG) */ - ori r12,r12,0x2002 /* (TEST/DEBUG) */ - dcbf 0,r12 /* (TEST/DEBUG) */ - sync /* (TEST/DEBUG) */ - dcbi 0,r12 /* (TEST/DEBUG) */ - eieio /* (TEST/DEBUG) */ - li r7,0x37 /* (TEST/DEBUG) */ - stb r7,4(r12) /* (TEST/DEBUG) */ - - lis r7,12 /* (TEST/DEBUG) */ -spw8: addi r7,r7,-1 /* (TEST/DEBUG) */ - mr. r7,r7 /* (TEST/DEBUG) */ - bne- spw8 /* (TEST/DEBUG) */ - dcbf 0,r12 /* (TEST/DEBUG) */ - sync /* (TEST/DEBUG) */ - dcbi 0,r12 /* (TEST/DEBUG) */ - eieio /* (TEST/DEBUG) */ -#endif - - -/* Print the ID parameter */ - - lis r12,HIGH_ADDR(fwdisplock) /* Get the display locker outer */ - ori r12,r12,LOW_ADDR(fwdisplock) /* Last part */ - - lwarx r7,0,r12 ; ? - -ddwait0: lwarx r7,0,r12 /* Get the lock */ - mr. r7,r7 /* Is it locked? */ - bne- ddwait0 /* Yup... */ - stwcx. r12,0,r12 /* Try to get it */ - bne- ddwait0 /* Nope, start all over... */ - -#if 0 - dcbf 0,r10 /* (TEST/DEBUG) */ - sync /* (TEST/DEBUG) */ - dcbi 0,r10 /* (TEST/DEBUG) */ - eieio /* (TEST/DEBUG) */ - li r7,0x38 /* (TEST/DEBUG) */ - stb r7,6(r10) /* (TEST/DEBUG) */ - - lis r7,10 /* (TEST/DEBUG) */ -spwa: addi r7,r7,-1 /* (TEST/DEBUG) */ - mr. 
r7,r7 /* (TEST/DEBUG) */ - bne- spwa /* (TEST/DEBUG) */ - dcbf 0,r10 /* (TEST/DEBUG) */ - sync /* (TEST/DEBUG) */ - dcbi 0,r10 /* (TEST/DEBUG) */ - eieio /* (TEST/DEBUG) */ -#endif - - rlwinm r3,r3,8,0,31 /* Get the first character */ - bl dbgDchar /* Print it */ - rlwinm r3,r3,8,0,31 /* Get the second character */ - bl dbgDchar /* Print it */ - rlwinm r3,r3,8,0,31 /* Get the third character */ - bl dbgDchar /* Print it */ - rlwinm r3,r3,8,0,31 /* Get the fourth character */ - bl dbgDchar /* Print it */ - - li r3,0x20 /* Get a space for a separator */ - bl dbgDchar /* Print it */ - bl dbg4byte /* Print register 5 in hex */ - - li r3,0x0A /* Linefeed */ - bl dbgDchar /* Send it */ - li r3,0x0D /* Carriage return */ - bl dbgDchar /* Send it */ - - mtlr r11 /* Get back the return */ -#if !SIMPLESCC && !NOTQUITEASSIMPLE - li r7,0 /* Get a zero */ - mtdbatu 3,r7 /* Invalidate DBAT 3 upper */ - mtdbatl 3,r7 /* Invalidate DBAT 3 lower */ -#endif - lis r12,HIGH_ADDR(fwdisplock) /* Get the display locker outer */ - li r7,0 /* Get a zero */ - ori r12,r12,LOW_ADDR(fwdisplock) /* Last part */ - dcbi 0,r10 /* ? */ - stw r7,0(r12) /* Release the display lock */ - mtmsr r8 /* Restore the MSR */ - isync /* Wait for it */ - blr /* Leave... */ - - -dbg4byte: mflr r12 /* Save the return */ - - lis r4,HIGH_ADDR(hexTab) /* Point to the top of table */ - li r6,8 /* Set number of hex digits to dump */ - ori r4,r4,LOW_ADDR(hexTab) /* Point to the bottom of table */ - -dbgDnext: rlwinm r5,r5,4,0,31 /* Rotate a nybble */ - subi r6,r6,1 /* Back down the count */ - rlwinm r3,r5,0,28,31 /* Isolate the last nybble */ - lbzx r3,r4,r3 /* Convert to ascii */ - bl dbgDchar /* Print it */ - mr. r6,r6 /* Any more? */ - bne+ dbgDnext /* Convert 'em all... */ - - li r3,0x20 /* Space */ - bl dbgDchar /* Send it */ - mtlr r12 /* Restore LR */ - blr /* Return... */ - -/* Write to whichever serial port. 
Try to leave it clean, but not too hard (this is a hack) */ - -dbgDchar: -#if SIMPLESCC && !NOTQUITEASSIMPLE - stb r3,0(r10) /* ? */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - - lis r7,3 /* Get enough for about 1ms */ - -dbgDchar0: addi r7,r7,-1 /* Count down */ - mr. r7,r7 /* Waited long enough? */ - bgt+ dbgDchar0 /* Nope... */ -#endif -#if NOTQUITEASSIMPLE -#if 0 - li r7,0x01 /* ? */ - stb r7,0(r10) /* ? */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - lbz r7,0(r10) /* ? */ - dcbi 0,r10 /* Force it out */ - sync /* kill it off */ - eieio - - li r7,0x00 /* ? */ - stb r7,0(r10) /* ? */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - lbz r7,0(r10) /* ? */ - dcbi 0,r10 /* Force it out */ - sync /* kill it off */ - eieio -#endif - -qrw1: eieio /* Barricade it */ - lbz r7,0(r10) /* ? */ - dcbi 0,r10 - sync - andi. r7,r7,0x04 /* ? */ - beq qrw1 /* Nope... */ - - eieio - - stb r3,4(r10) /* ? */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - -qrw2: eieio /* Barricade it */ - lbz r7,0(r10) /* ? */ - dcbi 0,r10 - sync - andi. r7,r7,0x04 /* ? */ - beq qrw2 /* Nope... */ - -#if 0 - eieio - li r7,0x10 /* ? */ - stb r7,0(r10) /* ? */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - - lbz r7,0(r10) /* ? */ - dcbi 0,r10 /* Force it out */ - sync /* kill it off */ - eieio -#endif - - lis r7,0x0080 /* ? */ - lis r9,0xF300 /* ? */ - ori r7,r7,0x010F /* ? */ - stw r7,0x28(r9) /* ? */ - dcbf 0,r10 /* Force it out */ - sync /* Make sure it's out there */ - dcbi 0,r10 - eieio - -#endif -#if !SIMPLESCC && !NOTQUITEASSIMPLE - rlwinm r9,r10,0,0,29 /* Get channel a */ - eieio /* Barricade it */ - - li r7,0x03 /* ? */ - stb r7,0(r9) /* ? */ - eieio /* Barricade it */ - - lbz r7,0(r9) /* ? */ - - eieio /* Barricade it */ - lbz r7,0(r9) /* ? 
*/ - -dchrw1: eieio /* Barricade it */ - lbz r7,0(r10) /* ? */ - andi. r7,r7,0x04 /* ? */ - beq dchrw1 /* Nope... */ - - stb r3,4(r10) /* ? */ - sync /* Make sure it's there */ - eieio /* Don't get confused */ - -dchrw2: eieio /* Barricade it */ - lbz r7,0(r10) /* ? */ - andi. r7,r7,0x04 /* ? */ - beq dchrw2 /* Nope... */ - - eieio /* Avoid confusion */ - lbz r7,0(r10) /* ? */ - andi. r7,r7,0x40 /* ? */ - beq+ nounder /* Nope... */ - - eieio /* Avoid confusion */ - li r7,0xC0 /* ? */ - stb r7,0(r10) /* ? */ - -nounder: eieio /* Avoid confusion */ - li r7,0x10 /* ? */ - stb r7,0(r10) /* ? */ - - eieio /* Avoid confusion */ - li r7,0x38 /* ? */ - stb r7,0(r9) /* ? */ - - eieio /* Avoid confusion */ - li r7,0x30 /* ? */ - stb r7,0(r10) /* ? */ - - eieio /* Avoid confusion */ - li r7,0x20 /* ? */ - stb r7,0(r10) /* ? */ - eieio /* Avoid confusion */ - sync - -#endif - blr /* Return */ - - .globl hexTab - -hexTab: STRINGD "0123456789ABCDEF" /* Convert hex numbers to printable hex */ - - -/* - * Dumps all the registers in the savearea in R13 - */ - - - .align 5 - .globl EXT(dbgRegsLL) - -LEXT(dbgRegsLL) - - b EXT(FCReturn) ; Bye dudes... -#if 0 - li r3,0 /* ? */ - bl dbgRegsCm /* Join on up... */ - b EXT(FCReturn) ; Bye dudes... - - - .align 5 - .globl EXT(dbgRegs) - -LEXT(dbgRegs) - -dbgRegsCm: mfmsr r8 /* Save the MSR */ - mr. r3,r3 /* ? */ - rlwinm r12,r8,0,28,25 /* Turn off translation */ - lis r10,0xF301 /* Set the top part */ - rlwinm r12,r12,0,17,15 /* Turn off interruptions */ - mtmsr r12 /* Smash the MSR */ - isync /* Make it clean */ -#if SIMPLESCC && !NOTQUITEASSIMPLE - ori r10,r10,0x3010 /* ? */ -#else - ori r10,r10,0x2000 /* ? */ -#endif - mflr r11 /* Save the link register */ - beq+ dbgDprints /* It sure are... */ -#if SIMPLESCC && !NOTQUITEASSIMPLE - ori r10,r10,0x0020 /* ? */ -#else - ori r10,r10,0x0002 /* ? */ - - dcbf 0,r10 /* Insure it is out */ - sync - dcbi 0,r10 /* Toss it */ -#if !NOTQUITEASSIMPLE - lis r7,0xF300 /* ? */ - ori r7,r7,0x0032 /* ? 
*/ - mtdbatl 3,r7 /* ? */ - lis r7,0xF300 /* ? */ - ori r7,r7,0x0002 /* ? */ - mtdbatu 3,r7 /* ? */ - ori r12,r12,0x0010 /* ? */ - mtmsr r12 /* ? */ - isync /* ? */ -#endif -#endif - -dbgDprints: - lis r3,HIGH_ADDR(fwdisplock) /* Get the display locker outer */ - ori r3,r3,LOW_ADDR(fwdisplock) /* Last part */ - - lwarx r5,0,r3 ; ? -ddwait1: lwarx r5,0,r3 /* Get the lock */ - mr. r5,r5 /* Is it locked? */ - bne- ddwait1 /* Yup... */ - stwcx. r3,0,r3 /* Try to get it */ - bne- ddwait1 /* Nope, start all over... */ - - li r3,0x52 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x65 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x67 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x73 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - - lwz r5,saver0(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,saver1(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,saver2(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,saver3(r13) /* Do register */ - bl dbg4byte /* Print */ - li r3,0x0A /* Linefeed */ - bl dbgDchar /* Send it */ - li r3,0x0D /* Carriage return */ - bl dbgDchar /* Send it */ - - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - lwz r5,saver4(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,saver5(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,saver6(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,saver7(r13) /* Do register */ - bl dbg4byte /* Print */ - li r3,0x0A /* Linefeed */ - bl dbgDchar /* Send it */ - li r3,0x0D /* Carriage return */ - bl dbgDchar /* Send it */ - - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li 
r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - lwz r5,saver8(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,saver9(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,saver10(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,saver11(r13) /* Do register */ - bl dbg4byte /* Print */ - li r3,0x0A /* Linefeed */ - bl dbgDchar /* Send it */ - li r3,0x0D /* Carriage return */ - bl dbgDchar /* Send it */ - - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - lwz r5,saver12(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,saver13(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,saver14(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,saver15(r13) /* Do register */ - bl dbg4byte /* Print */ - li r3,0x0A /* Linefeed */ - bl dbgDchar /* Send it */ - li r3,0x0D /* Carriage return */ - bl dbgDchar /* Send it */ - - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - lwz r5,saver16(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,saver17(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,saver18(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,saver19(r13) /* Do register */ - bl dbg4byte /* Print */ - li r3,0x0A /* Linefeed */ - bl dbgDchar /* Send it */ - li r3,0x0D /* 
Carriage return */ - bl dbgDchar /* Send it */ - - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - lwz r5,saver20(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,saver21(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,saver22(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,saver23(r13) /* Do register */ - bl dbg4byte /* Print */ - li r3,0x0A /* Linefeed */ - bl dbgDchar /* Send it */ - li r3,0x0D /* Carriage return */ - bl dbgDchar /* Send it */ - - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - lwz r5,saver24(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,saver25(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,saver26(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,saver27(r13) /* Do register */ - bl dbg4byte /* Print */ - li r3,0x0A /* Linefeed */ - bl dbgDchar /* Send it */ - li r3,0x0D /* Carriage return */ - bl dbgDchar /* Send it */ - - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - lwz r5,saver28(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,saver29(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,saver30(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,saver31(r13) 
/* Do register */ - bl dbg4byte /* Print */ - li r3,0x0A /* Linefeed */ - bl dbgDchar /* Send it */ - li r3,0x0D /* Carriage return */ - bl dbgDchar /* Send it */ - -/* Segment registers */ - - li r3,0x53 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x65 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x67 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x73 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - - lwz r5,savesr0(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,savesr1(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,savesr2(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,savesr3(r13) /* Do register */ - bl dbg4byte /* Print */ - li r3,0x0A /* Linefeed */ - bl dbgDchar /* Send it */ - li r3,0x0D /* Carriage return */ - bl dbgDchar /* Send it */ - - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - lwz r5,savesr4(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,savesr5(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,savesr6(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,savesr7(r13) /* Do register */ - bl dbg4byte /* Print */ - li r3,0x0A /* Linefeed */ - bl dbgDchar /* Send it */ - li r3,0x0D /* Carriage return */ - bl dbgDchar /* Send it */ - - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - lwz r5,savesr8(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz 
r5,savesr9(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,savesr10(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,savesr11(r13) /* Do register */ - bl dbg4byte /* Print */ - li r3,0x0A /* Linefeed */ - bl dbgDchar /* Send it */ - li r3,0x0D /* Carriage return */ - bl dbgDchar /* Send it */ - - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - lwz r5,savesr12(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,savesr13(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,savesr14(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,savesr15(r13) /* Do register */ - bl dbg4byte /* Print */ - li r3,0x0A /* Linefeed */ - bl dbgDchar /* Send it */ - li r3,0x0D /* Carriage return */ - bl dbgDchar /* Send it */ - - li r3,0x30 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x31 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x64 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x64 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - lwz r5,savesrr0(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,savesrr1(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,savedar(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,savedsisr(r13) /* Do register */ - bl dbg4byte /* Print */ - li r3,0x0A /* Linefeed */ - bl dbgDchar /* Send it */ - li r3,0x0D /* Carriage return */ - bl dbgDchar /* Send it */ - - li r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x6C /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x63 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li r3,0x63 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - li 
r3,0x20 /* Print eyecatcher */ - bl dbgDchar /* Send it */ - lwz r5,savelr(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,savecr(r13) /* Do register */ - bl dbg4byte /* Print */ - lwz r5,savectr(r13) /* Do register */ - bl dbg4byte /* Print */ - li r3,0x0A /* Linefeed */ - bl dbgDchar /* Send it */ - li r3,0x0D /* Carriage return */ - bl dbgDchar /* Send it */ - mtlr r11 /* Get back the return */ - dcbi 0,r10 /* ? */ -#if !SIMPLESCC && !NOTQUITEASSIMPLE - li r7,0 /* Get a zero */ - mtdbatu 3,r7 /* Invalidate DBAT 3 upper */ - mtdbatl 3,r7 /* Invalidate DBAT 3 lower */ -#endif - lis r3,HIGH_ADDR(fwdisplock) /* Get the display locker outer */ - li r7,0 /* Get a zero */ - ori r3,r3,LOW_ADDR(fwdisplock) /* Last part */ - stw r7,0(r3) /* Clear display lock */ - mtmsr r8 /* Restore the MSR */ - isync /* Wait for it */ - blr /* Leave... */ -#endif - -/* - * Used for debugging to leave stuff in 0x380-0x3FF (128 bytes). - * Mapping is V=R. Stores and loads are real. - */ - - .align 5 - .globl EXT(dbgCkpt) - -LEXT(dbgCkpt) - - mr r12,r0 /* Keep R0 pristene */ - lis r0,HIGH_ADDR(dbgCkptCall) /* Top half of dbgCkptCall firmware call number */ - ori r0,r0,LOW_ADDR(dbgCkptCall) /* Bottom half */ - - sc /* Go stash the stuff */ - - mr r0,r12 /* Restore R0 */ - blr /* Return... 
*/ - -/* Here's the low-level part of dbgCkpt */ - - .align 5 - .globl EXT(dbgCkptLL) - -LEXT(dbgCkptLL) - - - li r12,0x380 /* Point to output area */ - li r1,32 /* Get line size */ - dcbz 0,r12 /* Make sure we don't fetch a cache line */ - - lwz r4,0x00(r3) /* Load up storage to checkpoint */ - - dcbt r1,r3 /* Start in the next line */ - - lwz r5,0x04(r3) /* Load up storage to checkpoint */ - lwz r6,0x08(r3) /* Load up storage to checkpoint */ - lwz r7,0x0C(r3) /* Load up storage to checkpoint */ - lwz r8,0x10(r3) /* Load up storage to checkpoint */ - lwz r9,0x14(r3) /* Load up storage to checkpoint */ - lwz r10,0x18(r3) /* Load up storage to checkpoint */ - lwz r11,0x1C(r3) /* Load up storage to checkpoint */ - - add r3,r3,r1 /* Bump input */ - - stw r4,0x00(r12) /* Store it */ - stw r5,0x04(r12) /* Store it */ - stw r6,0x08(r12) /* Store it */ - stw r7,0x0C(r12) /* Store it */ - stw r8,0x10(r12) /* Store it */ - stw r9,0x14(r12) /* Store it */ - stw r10,0x18(r12) /* Store it */ - stw r11,0x1C(r12) /* Store it */ - - dcbz r1,r12 /* Clear the next line */ - add r12,r12,r1 /* Point to next output line */ - - lwz r4,0x00(r3) /* Load up storage to checkpoint */ - lwz r5,0x04(r3) /* Load up storage to checkpoint */ - lwz r6,0x08(r3) /* Load up storage to checkpoint */ - lwz r7,0x0C(r3) /* Load up storage to checkpoint */ - lwz r8,0x10(r3) /* Load up storage to checkpoint */ - lwz r9,0x14(r3) /* Load up storage to checkpoint */ - lwz r10,0x18(r3) /* Load up storage to checkpoint */ - lwz r11,0x1C(r3) /* Load up storage to checkpoint */ - - dcbt r1,r3 /* Touch the next line */ - add r3,r3,r1 /* Point to next input line */ - - stw r4,0x00(r12) /* Store it */ - stw r5,0x04(r12) /* Store it */ - stw r6,0x08(r12) /* Store it */ - stw r7,0x0C(r12) /* Store it */ - stw r8,0x10(r12) /* Store it */ - stw r9,0x14(r12) /* Store it */ - stw r10,0x18(r12) /* Store it */ - stw r11,0x1C(r12) /* Store it */ - - dcbz r1,r12 /* Clear the next line */ - add r12,r12,r1 /* Point to next 
output line */ - - lwz r4,0x00(r3) /* Load up storage to checkpoint */ - lwz r5,0x04(r3) /* Load up storage to checkpoint */ - lwz r6,0x08(r3) /* Load up storage to checkpoint */ - lwz r7,0x0C(r3) /* Load up storage to checkpoint */ - lwz r8,0x10(r3) /* Load up storage to checkpoint */ - lwz r9,0x14(r3) /* Load up storage to checkpoint */ - lwz r10,0x18(r3) /* Load up storage to checkpoint */ - lwz r11,0x1C(r3) /* Load up storage to checkpoint */ - - dcbt r1,r3 /* Touch the next line */ - add r3,r3,r1 /* Point to next input line */ - - stw r4,0x00(r12) /* Store it */ - stw r5,0x04(r12) /* Store it */ - stw r6,0x08(r12) /* Store it */ - stw r7,0x0C(r12) /* Store it */ - stw r8,0x10(r12) /* Store it */ - stw r9,0x14(r12) /* Store it */ - stw r10,0x18(r12) /* Store it */ - stw r11,0x1C(r12) /* Store it */ - - dcbz r1,r12 /* Clear the next line */ - add r12,r12,r1 /* Point to next output line */ - - lwz r4,0x00(r3) /* Load up storage to checkpoint */ - lwz r5,0x04(r3) /* Load up storage to checkpoint */ - lwz r6,0x08(r3) /* Load up storage to checkpoint */ - lwz r7,0x0C(r3) /* Load up storage to checkpoint */ - lwz r8,0x10(r3) /* Load up storage to checkpoint */ - lwz r9,0x14(r3) /* Load up storage to checkpoint */ - lwz r10,0x18(r3) /* Load up storage to checkpoint */ - lwz r11,0x1C(r3) /* Load up storage to checkpoint */ - - stw r4,0x00(r12) /* Store it */ - stw r5,0x04(r12) /* Store it */ - stw r6,0x08(r12) /* Store it */ - stw r7,0x0C(r12) /* Store it */ - stw r8,0x10(r12) /* Store it */ - stw r9,0x14(r12) /* Store it */ - stw r10,0x18(r12) /* Store it */ - stw r11,0x1C(r12) /* Store it */ - - blr - - -/* - * Do Preemption. Forces a T_PREEMPT trap to allow a preemption to occur. - */ - - .align 5 - .globl EXT(DoPreemptLL) - -LEXT(DoPreemptLL) - - li r3,T_PREEMPT /* Set preemption interrupt value */ - stw r3,saveexception(r13) /* Modify the exception type to preemption */ - b EXT(FCReturn) ; Bye dudes... 
- - -/* - * Force 'rupt handler to dispatch with new context - * R3 at the call contains the new savearea. - * R4 at the call contains a return code to pass back in R3. - * Forces a T_CSWITCH - */ - - .align 5 - .globl EXT(SwitchContextLL) - -LEXT(SwitchContextLL) - - li r3,T_CSWITCH /* Set context switch value */ - stw r3,saveexception(r13) /* Modify the exception type to switch context */ - b EXT(FCReturn) ; Bye dudes... - - -/* - * Create a fake I/O 'rupt. - * Forces a T_INTERRUPT trap to pretend that an actual I/O interrupt occurred. - */ - - .align 5 - .globl EXT(CreateFakeIOLL) - -LEXT(CreateFakeIOLL) - - li r3,T_INTERRUPT /* Set external interrupt value */ - stw r3,saveexception(r13) /* Modify the exception type to external */ - b EXT(FCReturn) ; Bye dudes... - -/* - * Create a shutdown context - * Forces a T_SHUTDOWN trap. - */ - - .align 5 - .globl EXT(CreateShutdownCTXLL) - -LEXT(CreateShutdownCTXLL) - - li r3,T_SHUTDOWN /* Set external interrupt value */ - stw r3,saveexception(r13) /* Modify the exception type to external */ - b EXT(FCReturn) ; Bye dudes... - -/* - * Create a fake decrementer 'rupt. - * Forces a T_DECREMENTER trap to pretend that an actual decrementer interrupt occurred. - */ - - .align 5 - .globl EXT(CreateFakeDECLL) - -LEXT(CreateFakeDECLL) - - li r3,T_DECREMENTER /* Set decrementer interrupt value */ - stw r3,saveexception(r13) /* Modify the exception type to external */ - b EXT(FCReturn) ; Bye dudes... - -/* - * Choke the system. - */ - - .align 5 - .globl EXT(DoChokeLL) - -LEXT(DoChokeLL) - - li r3,T_CHOKE ; Set external interrupt value - stw r3,saveexception(r13) ; Modify the exception type to external - b EXT(FCReturn) ; Bye dudes... - -/* - * Null firmware call - */ - - .align 5 - .globl EXT(NullLL) - -LEXT(NullLL) - - li r3,T_IN_VAIN ; Set to just ignore this one - b EXT(FCReturn) ; Bye dudes... 
- -; -; Null firmware call -; - - .align 5 - .globl EXT(iNullLL) - -LEXT(iNullLL) - - mfspr r4,pmc1 ; Get stamp - stw r4,0x6100+(9*16)+0x0(0) ; Save it -#if 1 - mfspr r4,pmc2 ; Get stamp - stw r4,0x6100+(9*16)+0x4(0) ; Save it - mfspr r4,pmc3 ; Get stamp - stw r4,0x6100+(9*16)+0x8(0) ; Save it - mfspr r4,pmc4 ; Get stamp - stw r4,0x6100+(9*16)+0xC(0) ; Save it -#endif - li r3,T_IN_VAIN ; Set to just ignore this one - b EXT(FCReturn) ; Bye dudes... - -; -; Set the low level trace flags -; - - .align 5 - .globl EXT(LLTraceSet) - -LEXT(LLTraceSet) - - mr r4,r3 ; Save the new value - - lwz r3,traceMask(0) ; Get the old trace flags to pass back - stw r4,traceMask(0) ; Replace with the new ones - blr ; Leave... - -#if 0 - -/* -; *************************************************************************** -; -; ----------------- Grateful Deb ---------------- -; -; Debugging: direct draw into main screen menu bar -; -; Takes R4 value, converts it to hex characters and displays it. -; -; Gotta make sure the DCBST is done to force the pixels from the cache. -; -; Position is taken as column, row (0 based) from R3. -; Characters are from hexfont, and are 16x16 pixels. 
-; -; Only works with two processors so far -; -; -; *************************************************************************** -*/ - -#define GDfromright 20 -#define GDfontsize 16 - - .align 5 - .globl EXT(GratefulDeb) - -LEXT(GratefulDeb) - - mfspr r6,pir /* Get the PIR */ - lis r5,HIGH_ADDR(EXT(GratefulDebWork)) /* Point to our work area */ - rlwinm r6,r6,8,23,23 /* Get part of the offset to our processors area */ - ori r5,r5,LOW_ADDR(EXT(GratefulDebWork)) /* Start building the address */ - rlwimi r6,r6,2,21,21 /* Get the rest of the offset to our processors area */ - add r6,r6,r5 /* Point at our CPU's work area */ - mfmsr r5 /* Get that MSR */ - stmw r0,GDsave(r6) /* Save all registers */ - lwz r10,GDready(r6) /* See if we're all ready to go */ - ori r0,r5,0x2000 /* Turn on the floating point */ - mr r31,r6 /* Get a more sane base register */ - mr. r10,r10 /* Are we all set? */ - mtmsr r0 /* Enable floating point */ - isync - - stfd f0,GDfp0(r31) /* Save FP */ - stfd f1,GDfp1(r31) /* Save FP */ - stfd f2,GDfp2(r31) /* Save FP */ - stfd f3,GDfp3(r31) /* Save FP */ - - beq- GDbailout /* Go and bail... 
*/ - - rlwinm r25,r3,0,16,31 /* Isolate just the row number */ - lwz r28,GDtopleft(r31) /* Get the physical address of our line 0 */ - rlwinm r3,r3,16,16,31 /* Isolate the column number */ - lwz r27,GDrowbytes(r31) /* Get the number of bytes per row */ - lwz r9,GDrowchar(r31) /* Get the number of bytes per row of full leaded charactrers */ - lwz r26,GDdepth(r31) /* Get the bit depth */ - mullw r25,r25,r9 /* get offset to the row to write in bytes */ - lwz r24,GDcollgn(r31) /* Get the size of columns in bytes */ - add r25,r28,r25 /* Physical address of row */ - mullw r3,r3,r24 /* Get byte offset to first output column */ - - li r9,32 /* Get the initial shift calc */ - - lis r20,HIGH_ADDR(hexfont) /* Point to the font */ - - li r18,GDfontsize /* Get the number of rows in the font */ - ori r20,r20,LOW_ADDR(hexfont) /* Point to the low part */ - add r21,r25,r3 /* Physical address of top left output pixel */ - sub r9,r9,r26 /* Get right shift justifier for pixel size */ - li r7,32 /* Number of bits per word */ - -startNybble: - la r6,GDrowbuf1(r31) /* Point to the row buffer */ - li r19,8 /* Get the number of characters in a row */ - -getNybble: rlwinm r10,r4,9,23,26 /* Get the top nybble * 32 */ - rlwinm r4,r4,4,0,31 /* Rotate a nybble */ - add r10,r20,r10 /* Point to the character in the font */ - - rlwinm r16,r26,4,0,27 /* Width of row in actual bits */ - lhz r15,0(r10) /* Get the next row of the font */ - -rendrow: rlwinm r17,r15,16,0,0 /* Get the next font pixel in the row */ - rlwinm r15,r15,1,16,31 /* Move in the next font pixel */ - srawi r17,r17,31 /* Fill with 1s if black and 0s if white (reversed) */ - - slw r14,r14,r26 /* Make room for our pixel in a register */ - srw r17,r17,r9 /* Isolate one pixels worth of black or white */ - sub. 
r7,r7,r26 /* See how may bits are left */ - sub r16,r16,r26 /* Count how many bits are left to store for this row */ - or r14,r14,r17 /* Put in the pixel */ - bne+ notfull /* Finish rendering this word */ - - not r14,r14 /* Invert to black on white */ - stw r14,0(r6) /* Write out the word */ - li r7,32 /* Bit per word count */ - addi r6,r6,4 /* Point to the next word */ - -notfull: mr. r16,r16 /* Have we finished the whole character row? */ - bne+ rendrow /* Finish rendering the row */ - - addic. r19,r19,-1 /* Are we finished with a whole display row yet? */ - bne+ getNybble /* Not yet... */ - - la r6,GDrowbuf1(r31) /* Point to the row buffer */ - rlwinm r19,r26,31,0,29 /* Number of cache lines (depth/2) */ - mr r14,r21 /* Get the frame buffer address */ - -// BREAKPOINT_TRAP - -blitrow: lfd f0,0(r6) /* Load a line */ - lfd f1,8(r6) - lfd f2,16(r6) - lfd f3,24(r6) - - stfd f0,0(r14) /* Blit a line */ - stfd f1,8(r14) - stfd f2,16(r14) - stfd f3,24(r14) - - addi r6,r6,32 /* Next buffered line */ - - dcbst 0,r14 /* Force the line to the screen */ - sync /* Make sure the line is on it's way */ - eieio /* Make sure we beat the invalidate */ - dcbi 0,r14 /* Make sure we leave no paradox */ - - addic. r19,r19,-1 /* Done all lines yet? */ - addi r14,r14,32 /* Point to the next output */ - bne+ blitrow /* Nope, do it some more... */ - - addic. r18,r18,-1 /* Have we done all the rows in character yet? */ - addi r20,r20,2 /* Offset the font to the next row */ - add r21,r21,r27 /* Point to start of next row */ - bne+ startNybble /* Nope, go through the word one more time... */ - -GDbailout: mr r1,r31 /* Move the workarea base */ - - lfd f0,GDfp0(r31) /* Restore FP */ - lfd f1,GDfp1(r31) /* Restore FP */ - lfd f2,GDfp2(r31) /* Restore FP */ - lfd f3,GDfp3(r31) /* Restore FP */ - - mtmsr r5 /* Disable floating point */ - isync - - lmw r3,GDsave+12(r1) /* Restore most registers */ - lwz r0,GDsave(r1) /* Restore R0 */ - lwz r1,GDsave+4(r1) /* Finally, R1 */ - blr /* Leave... 
*/ - - -/* - * void GratefulDebDisp(unsigned int coord, unsigned int data); - */ - - - .align 5 - .globl EXT(GratefulDebDisp) - -LEXT(GratefulDebDisp) - - mfmsr r9 /* Save the current MSR */ - mflr r7 /* Save the return */ - andi. r8,r9,0x7FCF /* Clear interrupt and translation */ - mtmsr r8 /* Turn 'em really off */ - isync /* Make sure about the translation part */ - bl EXT(GratefulDeb) /* Display it */ - mtmsr r9 /* Restore interrupt and translation */ - mtlr r7 /* Restore return */ - isync /* Make sure */ - blr - - -#endif - -/* - * void checkNMI(void); - */ - - - .align 5 - .globl EXT(checkNMI) - -LEXT(checkNMI) - - mfmsr r9 /* Save it */ - andi. r8,r9,0x7FCF /* Clear it */ - mtmsr r8 /* Disable it */ - isync /* Fence it */ - lis r7,0xF300 /* Find it */ - lis r2,hi16(MASK(MSR_VEC)) ; Get the vector enable - ori r7,r7,0x0020 /* Find it */ - ori r2,r2,lo16(MASK(MSR_FP)) ; Get the FP enable - dcbi 0,r7 /* Toss it */ - sync /* Sync it */ - andc r9,r9,r2 ; Clear VEC and FP enables - eieio /* Get it */ - lwz r6,0x000C(r7) /* Check it */ - eieio /* Fence it */ - dcbi 0,r7 /* Toss it */ - rlwinm. 
r4,r6,0,19,19 /* Check it */ - rlwinm r6,r6,0,20,18 /* Clear it */ - sync /* Sync it */ - eieio /* Fence it */ - beq+ xnonmi /* Branch on it */ - - stw r6,0x0008(r7) /* Reset it */ - sync /* Sync it */ - dcbi 0,r6 /* Toss it */ - eieio /* Fence it */ - - mtmsr r9 /* Restore it */ - isync /* Hold it */ - - BREAKPOINT_TRAP /* Kill it */ - blr /* Return from it */ - -xnonmi: /* Label it */ - mtmsr r9 /* Restore it */ - isync /* Hold it */ - blr /* Return from it */ - -; -; Saves floating point registers -; - - .align 5 - .globl EXT(stFloat) - -LEXT(stFloat) - - lis r2,hi16(MASK(MSR_VEC)) ; Get the vector enable - li r4,0 - ori r2,r2,lo16(MASK(MSR_FP)) ; Get the FP enable - ori r4,r4,lo16(MASK(MSR_EE)) ; Get the EE bit - - mfmsr r0 ; Save the MSR - - andc r4,r0,r4 ; Clear EE - ori r4,r4,lo16(MASK(MSR_FP)) ; Enable floating point - mtmsr r4 - isync - - andc r0,r0,r2 ; Clear VEC and FP enables - - stfd f0,0x00(r3) - stfd f1,0x08(r3) - stfd f2,0x10(r3) - stfd f3,0x18(r3) - stfd f4,0x20(r3) - stfd f5,0x28(r3) - stfd f6,0x30(r3) - stfd f7,0x38(r3) - stfd f8,0x40(r3) - stfd f9,0x48(r3) - stfd f10,0x50(r3) - stfd f11,0x58(r3) - stfd f12,0x60(r3) - stfd f13,0x68(r3) - stfd f14,0x70(r3) - stfd f15,0x78(r3) - stfd f16,0x80(r3) - stfd f17,0x88(r3) - stfd f18,0x90(r3) - stfd f19,0x98(r3) - stfd f20,0xA0(r3) - stfd f21,0xA8(r3) - stfd f22,0xB0(r3) - stfd f23,0xB8(r3) - stfd f24,0xC0(r3) - stfd f25,0xC8(r3) - stfd f26,0xD0(r3) - stfd f27,0xD8(r3) - stfd f28,0xE0(r3) - stfd f29,0xE8(r3) - stfd f30,0xF0(r3) - stfd f31,0xF8(r3) - mffs f0 - stfd f0,0x100(r3) - lfd f0,0x00(r3) - mtmsr r0 - isync - blr - - -; -; Saves vector registers. Returns 0 if non-Altivec machine. -; - - .align 5 - .globl EXT(stVectors) - -LEXT(stVectors) - - lis r2,hi16(MASK(MSR_VEC)) ; Get the vector enable - li r4,0 - ori r2,r2,lo16(MASK(MSR_FP)) ; Get the FP enable - ori r4,r4,lo16(MASK(MSR_EE)) ; Get the EE bit - - mfsprg r6,2 ; Get features - mr r5,r3 ; Save area address - rlwinm. 
r6,r6,0,pfAltivecb,pfAltivecb ; Do we have Altivec? - li r3,0 ; Assume failure - beqlr- ; No... - - mfmsr r0 ; Save the MSR - - andc r4,r0,r4 ; Clear EE - - oris r4,r4,hi16(MASK(MSR_VEC)) ; Enable vectors - mtmsr r4 - isync - - andc r0,r0,r2 ; Clear FP and VEC - - stvxl v0,0,r5 - addi r5,r5,16 - stvxl v1,0,r5 - addi r5,r5,16 - stvxl v2,0,r5 - addi r5,r5,16 - stvxl v3,0,r5 - addi r5,r5,16 - stvxl v4,0,r5 - addi r5,r5,16 - stvxl v5,0,r5 - addi r5,r5,16 - stvxl v6,0,r5 - addi r5,r5,16 - stvxl v7,0,r5 - addi r5,r5,16 - stvxl v8,0,r5 - addi r5,r5,16 - stvxl v9,0,r5 - addi r5,r5,16 - stvxl v10,0,r5 - addi r5,r5,16 - stvxl v11,0,r5 - addi r5,r5,16 - stvxl v12,0,r5 - addi r5,r5,16 - stvxl v13,0,r5 - addi r5,r5,16 - stvxl v14,0,r5 - addi r5,r5,16 - stvxl v15,0,r5 - addi r5,r5,16 - stvxl v16,0,r5 - addi r5,r5,16 - stvxl v17,0,r5 - addi r5,r5,16 - stvxl v18,0,r5 - addi r5,r5,16 - stvxl v19,0,r5 - addi r5,r5,16 - stvxl v20,0,r5 - addi r5,r5,16 - stvxl v21,0,r5 - addi r5,r5,16 - stvxl v22,0,r5 - addi r5,r5,16 - stvxl v23,0,r5 - addi r5,r5,16 - stvxl v24,0,r5 - addi r5,r5,16 - stvxl v25,0,r5 - addi r5,r5,16 - stvxl v26,0,r5 - addi r5,r5,16 - stvxl v27,0,r5 - addi r5,r5,16 - stvxl v28,0,r5 - addi r5,r5,16 - stvxl v29,0,r5 - addi r5,r5,16 - stvxl v30,0,r5 - addi r5,r5,16 - stvxl v31,0,r5 - mfvscr v31 - addi r6,r5,16 - stvxl v31,0,r6 - li r3,1 - lvxl v31,0,r5 - mtmsr r0 - isync - - blr - - -; -; Saves yet more registers -; - - .align 5 - .globl EXT(stSpecrs) - -LEXT(stSpecrs) - - - lis r2,hi16(MASK(MSR_VEC)) ; Get the vector enable - li r4,0 - ori r2,r2,lo16(MASK(MSR_FP)) ; Get the FP enable - ori r4,r4,lo16(MASK(MSR_EE)) ; Get the EE bit - - mfsprg r9,2 ; Get feature flags - mtcrf 0x02,r9 ; move pf64Bit cr6 - - mfmsr r0 ; Save the MSR - andc r0,r0,r2 ; Turn off VEC and FP - andc r4,r0,r4 ; And EE - mtmsr r4 - isync - - mfpvr r12 - stw r12,4(r3) - rlwinm r12,r12,16,16,31 - - bt++ pf64Bitb,stsSF1 ; skip if 64-bit (only they take the hint) - - mfdbatu r4,0 - mfdbatl r5,0 - mfdbatu 
r6,1 - mfdbatl r7,1 - mfdbatu r8,2 - mfdbatl r9,2 - mfdbatu r10,3 - mfdbatl r11,3 - stw r4,8(r3) - stw r5,12(r3) - stw r6,16(r3) - stw r7,20(r3) - stw r8,24(r3) - stw r9,28(r3) - stw r10,32(r3) - stw r11,36(r3) - - mfibatu r4,0 - mfibatl r5,0 - mfibatu r6,1 - mfibatl r7,1 - mfibatu r8,2 - mfibatl r9,2 - mfibatu r10,3 - mfibatl r11,3 - stw r4,40(r3) - stw r5,44(r3) - stw r6,48(r3) - stw r7,52(r3) - stw r8,56(r3) - stw r9,60(r3) - stw r10,64(r3) - stw r11,68(r3) - - mfsprg r4,0 - mfsprg r5,1 - mfsprg r6,2 - mfsprg r7,3 - stw r4,72(r3) - stw r5,76(r3) - stw r6,80(r3) - stw r7,84(r3) - - mfsdr1 r4 - stw r4,88(r3) - - la r4,92(r3) - li r5,0 - -stSnsr: mfsrin r6,r5 - addis r5,r5,0x1000 - stw r6,0(r4) - mr. r5,r5 - addi r4,r4,4 - bne+ stSnsr - - cmplwi r12,PROCESSOR_VERSION_750 - mfspr r4,hid0 - stw r4,(39*4)(r3) - - li r4,0 - li r5,0 - li r6,0 - li r7,0 - - mfspr r4,hid1 - mfspr r5,l2cr - mfspr r6,msscr0 - mfspr r7,msscr1 - - stw r4,(40*4)(r3) - stw r6,(42*4)(r3) - stw r5,(41*4)(r3) - stw r7,(43*4)(r3) - - li r4,0 - beq isis750 - - mfspr r4,pir -isis750: stw r4,0(r3) - - li r4,0 - li r5,0 - li r6,0 - li r7,0 - blt- b4750 - - mfspr r4,thrm1 - mfspr r5,thrm2 - mfspr r6,thrm3 - mfspr r7,ictc - -b4750: stw r4,(44*4)(r3) - stw r5,(45*4)(r3) - stw r6,(46*4)(r3) - stw r7,(47*4)(r3) - - li r4,0 - li r6,0 - cmplwi r12,PROCESSOR_VERSION_7400 - bne nnmax - - mfspr r6,dabr - mfpvr r5 - rlwinm r5,r5,0,16,31 - cmplwi r5,0x1101 - beq gnmax - cmplwi r5,0x1102 - bne nnmax - -gnmax: mfspr r4,1016 - -nnmax: stw r4,(48*4)(r3) - stw r6,(49*4)(r3) - - mtmsr r0 - isync - - blr - -stsSF1: mfsprg r4,0 - mfsprg r5,1 - mfsprg r6,2 - mfsprg r7,3 - std r4,(18*4)(r3) - std r5,(20*4)(r3) - std r6,(22*4)(r3) - std r7,(24*4)(r3) - - mfsdr1 r4 - std r4,(26*4)(r3) - - mfspr r4,hid0 - std r4,(28*4)(r3) - mfspr r4,hid1 - std r4,(30*4)(r3) - mfspr r4,hid4 - std r4,(32*4)(r3) - mfspr r4,hid5 - std r4,(34*4)(r3) - - -stsSF2: li r5,0 - la r4,(80*4)(r3) - -stsslbm: slbmfee r6,r5 - slbmfev r7,r5 - std r6,0(r4) - 
std r7,8(r4) - addi r5,r5,1 - cmplwi r5,64 - addi r4,r4,16 - blt stsslbm - - mtmsr r0 - isync - - blr - -; -; fwEmMck - this forces the hardware to emulate machine checks -; Only valid on 64-bit machines -; Note: we want interruptions disabled here -; - - .globl EXT(fwEmMck) - - .align 5 - -LEXT(fwEmMck) - - - rlwinm r3,r3,0,1,0 ; Copy low of high high - scomd - rlwinm r5,r5,0,1,0 ; Copy low of high high - hid1 - rlwinm r7,r7,0,1,0 ; Copy low of high high - hid4 - rlwimi r3,r4,0,0,31 ; Copy low of low low - rlwimi r5,r6,0,0,31 ; Copy low of low low - rlwimi r7,r8,0,0,31 ; Copy low of low low - - lis r9,3 ; Start forming hid1 error inject mask - lis r10,hi16(0x01084083) ; Start formaing hid4 error inject mask - ori r9,r9,0xC000 ; Next bit - ori r10,r10,lo16(0x01084083) ; Next part - sldi r9,r9,32 ; Shift up high - sldi r10,r10,8 ; Shift into position - - mfspr r0,hid1 ; Get hid1 - mfspr r2,hid4 ; and hid4 - - and r5,r5,r9 ; Keep only error inject controls - hid1 - and r7,r7,r10 ; Keep only error inject controls - hid4 - - andc r0,r0,r9 ; Clear error inject controls hid1 - andc r2,r2,r10 ; Clear error inject controls hid4 - - or r0,r0,r5 ; Add in the new controls hid1 - or r2,r2,r7 ; Add in the new controls hid4 - -/* ? */ -#if 0 - lis r12,CoreErrI ; Get the error inject controls - sync - - mtspr scomd,r3 ; Set the error inject controls - mtspr scomc,r12 ; Request error inject - mfspr r11,scomc ; Get back the status (we just ignore it) -#endif - sync - isync - - mtspr hid1,r0 ; Move in hid1 controls - mtspr hid1,r0 ; We need to do it twice - isync - - sync - mtspr hid4,r2 ; Move in hid4 controls - isync - - blr ; Leave... - -; -; fwSCOMrd - read/write SCOM -; - .align 5 - .globl EXT(fwSCOM) - -LEXT(fwSCOM) - - lhz r12,scomfunc(r3) ; Get the function - lwz r4,scomreg(r3) ; Get the register - rldicr r4,r4,8,47 ; Position for SCOM - - mr. 
r12,r12 ; See if read or write - bne fwSCwrite ; Go do a write - - mfsprg r0,2 ; Get the feature flags - ori r4,r4,0x8000 ; Set to read data - rlwinm. r0,r0,pfSCOMFixUpb+1,31,31 ; Set shift if we need a fix me up - sync - - mtspr scomc,r4 ; Request the register - mfspr r11,scomd ; Get the register contents - mfspr r10,scomc ; Get back the status - sync - isync - - sld r11,r11,r0 ; Fix up if needed - - std r11,scomdata(r3) ; Save result - eieio - std r10,scomstat(r3) ; Save status - - blr - -fwSCwrite: ld r5,scomdata(r3) ; Get the data - - sync - - mtspr scomd,r5 ; Set the data - mtspr scomc,r4 ; Set it - mfspr r10,scomc ; Get back the status - sync - isync - - std r10,scomstat(r3) ; Save status - - blr - -; -; diagTrap - this is used to trigger checks from user space -; any "twi 31,r31,0xFFFx" will come here (x = 0 to F). -; On entry R3 points to savearea. -; R4 is the "x" from instruction; -; Pass back 1 to no-op twi and return to user -; Pass back 0 to treat as normal twi. -; - - .globl EXT(diagTrap) - - .align 5 - -LEXT(diagTrap) - - li r3,1 ; Ignore TWI - blr ; Leave... - - - - -; -; setPmon - this is used to manipulate MMCR0 and MMCR1 - - .globl EXT(setPmon) - - .align 5 - -LEXT(setPmon) - - li r0,0 - isync - mtspr mmcr0,r0 ; Clear MMCR0 - mtspr mmcr1,r0 ; Clear MMCR1 - mtspr pmc1,r0 - mtspr pmc2,r0 - mtspr pmc3,r0 - mtspr pmc4,r0 - - isync - - mtspr mmcr0,r3 ; Set MMCR0 - mtspr mmcr1,r4 ; Set MMCR1 - isync - blr ; Leave... - - diff --git a/osfmk/ppc/FirmwareC.c b/osfmk/ppc/FirmwareC.c deleted file mode 100644 index 5ddc41c71..000000000 --- a/osfmk/ppc/FirmwareC.c +++ /dev/null @@ -1,338 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). 
You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * This file contains firmware code. 
- * - */ - -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -Boot_Video dgVideo; -extern GDWorkArea GratefulDebWork[]; - -struct RuptCtr { /* Counts hardware interrupts */ - struct GDpos { /* Screen position for Grateful Deb display */ - unsigned short col; /* Column (-1 means no display) */ - unsigned short row; /* Row */ - } GDpos; - unsigned int count; /* Count of interrupt */ - unsigned int timed; /* If set, count updates at timed rate */ - unsigned int lasttime; /* Low of timebase when last updated */ -}; - -/* Window layout for Grateful Deb: - * - * 0 9 - * - * 0 Total Decrimenter - * 1 DSI ISI - * 2 System call External - * 3 SIGP Floating point - * 4 Program Alignment - */ - -struct RuptCtr RuptCtrs[96] = { - { /* Total interruptions */ - .GDpos = { - .col = 0, - .row = 0, - }, - .count = 0, - .timed = 1, - }, - { /* Reset */ - .GDpos = { - .col = -1, - .row = -1, - }, - .count = 0, - .timed = 0, - }, - { /* Machine check */ - .GDpos = { - .col = -1, - .row = -1, - }, - .count = 0, - .timed = 0, - }, - { /* DSIs */ - .GDpos = { - .col = 0, - .row = 1, - }, - .count = 0, - .timed = 1}, - { /* ISIs */ - .GDpos = { - .col = 1, - .row = 1, - }, - .count = 0, - .timed = 1, - }, - { /* Externals */ - .GDpos = { - .col = 1, - .row = 2, - }, - .count = 0, - .timed = 1, - }, - { /* Alignment */ - .GDpos = { - .col = 1, - .row = 4, - }, - .count = 0, - .timed = 0, - }, - {.GDpos = {.col = 0,.row = 4},.count = 0,.timed = 0}, /* Program */ - {.GDpos = {.col = 1,.row = 3},.count = 0,.timed = 0}, /* Floating point */ - {.GDpos = {.col = 1,.row = 0},.count = 0,.timed = 1}, /* Decrementer */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* I/O error */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = 0,.row = 2},.count = 0,.timed = 1}, /* System call */ - {.GDpos = {.col = -1,.row = -1},.count 
= 0,.timed = 0}, /* Trace */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Floating point assist */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Performance monitor */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* VMX */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Instruction breakpoint */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* System management */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Trace */ - {.GDpos = {.col = 0,.row = 3},.count = 0,.timed = 0}, /* SIGP */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Preemption */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Context switch */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = 
-1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Special, update frequency controls */ - - /*Start of second processor counts */ - - {.GDpos = {.col = 0,.row = 0},.count = 0,.timed = 1}, /* Total interruptions */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reset */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Machine check */ - {.GDpos = {.col = 0,.row = 1},.count = 0,.timed = 1}, /* DSIs */ - {.GDpos = {.col = 1,.row = 1},.count = 0,.timed = 1}, /* ISIs */ - {.GDpos = {.col = 1,.row = 2},.count = 0,.timed = 1}, /* Externals */ - {.GDpos = {.col = 1,.row = 4},.count = 0,.timed = 0}, /* Alignment */ - {.GDpos = {.col = 0,.row = 4},.count = 0,.timed = 0}, /* Program */ - {.GDpos = {.col = 1,.row = 3},.count = 0,.timed = 0}, /* Floating point */ - {.GDpos = {.col = 1,.row = 0},.count = 0,.timed = 1}, /* Decrementer */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* I/O error */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = 0,.row = 2},.count = 0,.timed = 1}, /* System call */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Trace */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Floating point assist */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Performance monitor */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* VMX */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 
0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Instruction breakpoint */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* System management */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Trace */ - {.GDpos = {.col = 0,.row = 3},.count = 0,.timed = 0}, /* SIGP */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Preemption */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Context switch */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 
0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Reserved */ - {.GDpos = {.col = -1,.row = -1},.count = 0,.timed = 0}, /* Special, update frequency controls */ -}; - -void -GratefulDebInit(bootBumbleC *boot_video_info) -{ /* Initialize the video debugger */ - - unsigned int fillframe[256]; - unsigned int startpos, startbyte, windowleft, newwidth, i, j, startword, - oldwidth, nrmlgn; - unsigned int nwords, *byteleft, lstlgn, pixlgn, bytelgn; - - if (!boot_video_info) { /* Are we disabling it? */ - GratefulDebWork[0].GDready = 0; /* Disable output */ - return; - } - - nrmlgn = (9 * GDfontsize) * (boot_video_info->v_depth / 8); /* Get the normal column size in bytes */ - lstlgn = (((8 * GDfontsize) + (GDfontsize >> 1)) * boot_video_info->v_depth) / 8; /* Same as normal, but with 1/2 character space */ - nrmlgn = (nrmlgn + 31) & -32; /* Round to a line */ - - bytelgn = (nrmlgn * (GDdispcols - 1)) + lstlgn; /* Length in bytes */ - pixlgn = bytelgn / (boot_video_info->v_depth / 8); /* Number of pixels wide */ - - startbyte = (boot_video_info->v_width * (boot_video_info->v_depth / 8)) - bytelgn; /* Get the starting byte unaligned */ - startpos = boot_video_info->v_width - pixlgn; /* Starting pixel position */ - - startbyte += (unsigned int)boot_video_info->v_baseAddr & 31; /* Add the extra to cache boundary in frame buffer */ - startbyte &= -32; /* Make sure it's on a cache line for speed */ - startbyte += (unsigned int)boot_video_info->v_baseAddr & 31; /* Subtract the extra to cache boundary in frame buffer */ - - windowleft = startbyte - (((GDfontsize / 2) * boot_video_info->v_depth) / 8); /* Back up a half character */ - windowleft &= -4; /* Make sure it is on a word boundary */ - newwidth = windowleft / (boot_video_info->v_depth / 8); /* Get the new pixel width of screen */ - - oldwidth = boot_video_info->v_width; /* Save the old width */ -// boot_video_info->v_width = newwidth; /* Set the new width */ - - nwords = oldwidth - 
newwidth; /* See how much to fill in pixels */ - nwords = nwords / (32 / boot_video_info->v_depth); /* Get that in bytes */ - - startword = (newwidth + 3) / 4; /* Where does it start? */ - - byteleft = (unsigned int *)(boot_video_info->v_baseAddr + windowleft); /* Starting place */ - for (i = 0; i < nwords; i++) - byteleft[i] = 0; /* Set the row to all black */ - - byteleft = (unsigned int *)(boot_video_info->v_baseAddr + windowleft + (boot_video_info->v_rowBytes * 1)); /* Starting place */ - for (i = 0; i < nwords; i++) - byteleft[i] = 0; /* Set the row to all black */ - - byteleft = (unsigned int *)(boot_video_info->v_baseAddr + windowleft + (boot_video_info->v_rowBytes * (boot_video_info->v_height - 2))); /* Starting place */ - for (i = 0; i < nwords; i++) - byteleft[i] = 0; /* Set the row to all black */ - - byteleft = (unsigned int *)(boot_video_info->v_baseAddr + windowleft + (boot_video_info->v_rowBytes * (boot_video_info->v_height - 1))); /* Starting place */ - for (i = 0; i < nwords; i++) - byteleft[i] = 0; /* Set the row to all black */ - - for (i = 0; i < nwords; i++) - fillframe[i] = 0xFFFFFFFF; /* Set the row to all white */ - - if (boot_video_info->v_depth == 8) { /* See if 8 bits a pixel */ - fillframe[0] = 0x0000FFFF; /* Make left border */ - fillframe[nwords - 1] = 0xFFFF0000; /* Make right border */ - } else if (boot_video_info->v_depth == 16) { /* See if 16 bits a pixel */ - fillframe[0] = 0x00000000; /* Make left border */ - fillframe[nwords - 1] = 0x00000000; /* Make right border */ - } else { - fillframe[0] = 0x00000000; /* Make left border */ - fillframe[1] = 0x00000000; /* Make left border */ - fillframe[nwords - 1] = 0x00000000; /* Make right border */ - fillframe[nwords - 2] = 0x00000000; /* Make right border */ - } - - byteleft = (unsigned int *)(boot_video_info->v_baseAddr + windowleft + (boot_video_info->v_rowBytes * 2)); /* Place to start filling */ - - for (i = 2; i < (boot_video_info->v_height - 2); i++) { /* Fill the rest */ - for 
(j = 0; j < nwords; j++) - byteleft[j] = fillframe[j]; /* Fill the row */ - byteleft = (unsigned int *)((unsigned int)byteleft + boot_video_info->v_rowBytes); /* Next row */ - } - - for (i = 0; i < 2; i++) { /* Initialize both (for now) processor areas */ - - GratefulDebWork[i].GDtop = - 2 + (GDfontsize / 2) + (i * 18 * GDfontsize); - GratefulDebWork[i].GDleft = 2 + startpos + (GDfontsize / 2); - GratefulDebWork[i].GDtopleft = - boot_video_info->v_baseAddr + startbyte + - (GratefulDebWork[i].GDtop * boot_video_info->v_rowBytes); - GratefulDebWork[i].GDrowbytes = boot_video_info->v_rowBytes; - GratefulDebWork[i].GDrowchar = - boot_video_info->v_rowBytes * (GDfontsize + - (GDfontsize / 4)); - GratefulDebWork[i].GDdepth = boot_video_info->v_depth; - GratefulDebWork[i].GDcollgn = nrmlgn; - -// RuptCtrs[(48*i)+47].timed = gPEClockFrequencyInfo.timebase_frequency_hz >> 4; /* (Update every 16th of a second (16 fps) */ - RuptCtrs[(48 * i) + 47].timed = gPEClockFrequencyInfo.timebase_frequency_hz >> 3; /* (Update every 8th of a second (8 fps) */ -// RuptCtrs[(48*i)+47].timed = gPEClockFrequencyInfo.timebase_frequency_hz >> 2; /* (Update every 4th of a second (4 fps) */ -// RuptCtrs[(48*i)+47].timed = gPEClockFrequencyInfo.timebase_frequency_hz >> 1; /* (Update every 2th of a second (2 fps) */ -// RuptCtrs[(48*i)+47].timed = gPEClockFrequencyInfo.timebase_frequency_hz >> 0; /* (Update every 1 second (1 fps) */ - - sync(); - - GratefulDebWork[i].GDready = 1; /* This one's all ready */ - } -} - -void debugNoop(void); -void -debugNoop(void) -{ /* This does absolutely nothing */ -} diff --git a/osfmk/ppc/FirmwareCalls.h b/osfmk/ppc/FirmwareCalls.h deleted file mode 100644 index ec25ea30c..000000000 --- a/osfmk/ppc/FirmwareCalls.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_FREE_COPYRIGHT@ - */ -/* - * @APPLE_FREE_COPYRIGHT@ - */ - -#ifdef ASSEMBLER - -#ifdef _FIRMWARECALLS_H_ -#error Hey! You can only include FirmwareCalls.h in one assembler file, dude. And it should be Firmware.s! 
-#else /* _FIRMWARECALLS_H_ */ - -/* - * Entries for all firmware calls are in here (except for call 0x80000000 - CutTrace - */ - -#define _FIRMWARECALLS_H_ - -#define fwCallEnt(name, entrypt) \ - .globl name __ASMNL__ \ - .set name,(.-EXT(FWtable))/4|0x80000000 __ASMNL__ \ - .long EXT(entrypt) __ASMNL__ - -/* - * - */ - - fwCallEnt(dbgDispCall, dbgDispLL) /* Write stuff to printer or modem port */ - fwCallEnt(dbgCkptCall, dbgCkptLL) /* Save 128 bytes from r3 to 0x380 V=R mapping */ - fwCallEnt(StoreRealCall, StoreRealLL) /* Save one word in real storage */ - fwCallEnt(ClearRealCall, ClearRealLL) /* Clear physical pages */ - fwCallEnt(LoadDBATsCall, xLoadDBATsLL) /* Load all DBATs */ - fwCallEnt(LoadIBATsCall, xLoadIBATsLL) /* Load all IBATs */ - fwCallEnt(DoPreemptCall, DoPreemptLL) /* Preempt if need be */ - fwCallEnt(CreateFakeIOCall, CreateFakeIOLL) /* Make a fake I/O interruption */ - fwCallEnt(SwitchContextCall, SwitchContextLL) /* Switch context */ - fwCallEnt(Choke, DoChokeLL) /* Choke (system crash) */ - fwCallEnt(dbgRegsCall, dbgRegsLL) /* Dumps all registers */ - fwCallEnt(CreateFakeDECCall, CreateFakeDECLL) /* Make a fake decrementer interruption */ - fwCallEnt(CreateShutdownCTXCall, CreateShutdownCTXLL) /* create a shutdown context */ - fwCallEnt(NullCall, NullLL) /* Null Firmware call */ - fwCallEnt(iNullCall, iNullLL) /* Instrumented null Firmware call */ - -#endif /* _FIRMWARECALLS_H_ */ - -#else /* ASSEMBLER */ - -/* - * The firmware function headers - */ -extern void CutTrace (unsigned int item1, ...); - -#endif /* ASSEMBLER */ diff --git a/osfmk/ppc/Makefile b/osfmk/ppc/Makefile deleted file mode 100644 index b978cc676..000000000 --- a/osfmk/ppc/Makefile +++ /dev/null @@ -1,36 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - - 
-EXPORT_ONLY_FILES = \ - asm.h \ - cpu_number.h \ - cpu_capabilities.h \ - Diagnostics.h \ - io_map_entries.h \ - lock.h \ - locks.h \ - proc_reg.h \ - machine_routines.h \ - mappings.h \ - savearea.h \ - simple_lock.h - -INSTALL_MD_DIR = ppc - -INSTALL_MD_LCL_LIST = cpu_capabilities.h - -EXPORT_MD_LIST = ${EXPORT_ONLY_FILES} - -EXPORT_MD_DIR = ppc - -include $(MakeInc_rule) -include $(MakeInc_dir) - - diff --git a/osfmk/ppc/PPCcalls.h b/osfmk/ppc/PPCcalls.h deleted file mode 100644 index 262fe2e91..000000000 --- a/osfmk/ppc/PPCcalls.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - * To add a new entry: - * Add an "PPCTRAP(routine)" to the table below - * - * Add trap definition to mach/ppc/syscall_sw.h and - * recompile user library. - * - * Note: - * The maximum number of calls is 0x1000 (4096 for the hexually challanged) - * - */ - -typedef int (*PPCcallEnt)(struct savearea *save); - -#define PPCcall(rout) rout -#define dis (PPCcallEnt)0 - -PPCcallEnt PPCcalls[] = { - - PPCcall(diagCall), /* 0x6000 Call diagnostics routines */ - PPCcall(vmm_get_version), /* 0x6001 Get Virtual Machine Monitor version */ - PPCcall(vmm_get_features), /* 0x6002 Get Virtual Machine Monitor supported features */ - PPCcall(vmm_init_context), /* 0x6003 Initialize a VMM context */ - PPCcall(vmm_dispatch), /* 0x6004 Dispatch a Virtual Machine Monitor call */ - PPCcall(bb_enable_bluebox), /* 0x6005 Enable this thread for use in the blue box virtual machine */ - PPCcall(bb_disable_bluebox), /* 0x6006 Disable this thread for use in the blue box virtual machine */ - PPCcall(bb_settaskenv), /* 0x6007 Set the BlueBox per thread task environment data */ - PPCcall(vmm_stop_vm), /* 0x6008 Stop a running VM */ - - PPCcall(dis), /* 0x6009 disabled */ - - PPCcall(ppcNull), /* 0x600A Null PPC syscall */ - PPCcall(perfmon_control), /* 0x600B performance monitor */ - PPCcall(ppcNullinst), /* 0x600C Instrumented Null PPC syscall */ - PPCcall(pmsCntrl), /* 0x600D Power Management Stepper */ - PPCcall(dis), /* 0x600E disabled */ - PPCcall(dis), /* 0x600F disabled */ - PPCcall(dis), /* 0x6010 disabled */ - PPCcall(dis), /* 0x6011 disabled */ - PPCcall(dis), /* 0x6012 disabled */ - PPCcall(dis), /* 0x6013 disabled */ - PPCcall(dis), /* 0x6014 disabled */ - PPCcall(dis), /* 0x6015 disabled */ - PPCcall(dis), /* 0x6016 disabled */ - PPCcall(dis), /* 0x6017 disabled */ - PPCcall(dis), /* 0x6018 disabled */ - PPCcall(dis), /* 0x6019 disabled */ - PPCcall(dis), /* 0x601A disabled */ - PPCcall(dis), /* 0x601B disabled */ - 
PPCcall(dis), /* 0x601C disabled */ - PPCcall(dis), /* 0x601D disabled */ - PPCcall(dis), /* 0x601E disabled */ - PPCcall(dis), /* 0x601F disabled */ -}; - -#undef dis diff --git a/osfmk/ppc/Performance.s b/osfmk/ppc/Performance.s deleted file mode 100644 index 440c39678..000000000 --- a/osfmk/ppc/Performance.s +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT_INTERNAL_USE_ONLY@ - */ - -/* - Performance.s - - Handle things that should are related to the hardware performance monitor - - Lovingly crafted by Bill Angell using traditional methods and only natural or recycled materials. 
- No more than 7500 chinchillas were killed in the production of the code. - -*/ - -#include -#include -#include -#include -#include -#include - -#if PERF_HIST -/* - * This routine is used to interface to the performance monitor - */ - -ENTRY(PerfCtl, TAG_NO_FRAME_USED) - - lis r0,PerfCtlCall@h /* Get the top part of the SC number */ - ori r0,r0,PerfCtlCall@l /* and the bottom part */ - sc /* Do it to it */ - blr /* Bye bye, Birdie... */ - - -ENTRY(PerfCtlLL, TAG_NO_FRAME_USED) - - cmplwi r3,maxPerf /* See if we are within range */ - mflr r11 /* Get the return point */ - li r3,0 /* Show failure */ - bgelrl- /* Load up current address and, also, leave if out of range */ -prfBase: mflr r12 /* Get our address */ - rlwinm r10,r3,2,0,31 /* Get displacement into branch table */ - addi r12,r12,prfBrnch-prfBase /* Point to the branch address */ - add r12,r12,r10 /* Point to the branch */ - mtlr r12 /* Get it in the link register */ - blr /* Vector to the specific performance command... */ - -prfBrnch: b prfClear /* Clear the histogram table */ - b prfStart /* Start the performance monitor */ - b prfStop /* Stop the performance monitor */ - b prfMap /* Map the histogram into an address space */ - .equ maxPerf, (.-prfBrnch)/4 /* Set the highest valid address */ - -/* - * Clear the monitor histogram - */ -prfClear: - li r4,PMIhist@l /* We know this to be in page 0, so no need for the high part */ - lis r8,PMIHIST_SIZE@h /* Get high half of the table size */ - lwz r4,0(r4) /* Get the real address of the histgram */ - ori r8,r8,PMIHIST_SIZE@l /* Get the low half of the table size */ - li r6,32 /* Get a displacement */ - li r3,1 /* Set up a good return code */ - mtlr r11 /* Restore the return address */ - -clrloop: subi r8,r8,32 /* Back off a cache line */ - dcbz 0,r4 /* Do the even line */ - sub. 
r8,r8,r6 /* Back off a second time (we only do this to generate a CR */ - dcbz r6,r4 /* Clear the even line */ - addi r4,r4,64 /* Move up to every other line */ - bgt+ clrloop /* Go until we've done it all... */ - - blr /* Leave... */ - -/* - * Start the monitor histogram - */ - prfStart: - mtlr r11 /* Restore the return address */ - blr /* Return... */ - -/* - * Stop the monitor histogram - */ - prfStop: - mtlr r11 /* Restore the return address */ - blr /* Return... */ - -/* - * Maps the monitor histogram into another address space - */ - prfMap: - mtlr r11 /* Restore the return address */ - blr /* Return... */ - -#endif - diff --git a/osfmk/ppc/PseudoKernel.c b/osfmk/ppc/PseudoKernel.c deleted file mode 100644 index fc2a10ecc..000000000 --- a/osfmk/ppc/PseudoKernel.c +++ /dev/null @@ -1,450 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - File: PseudoKernel.c - - Contains: BlueBox PseudoKernel calls - Written by: Mark Gorlinsky - Bill Angell - - Copyright: 1997 by Apple Computer, Inc., all rights reserved - -*/ - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -extern int is_suser(void); -extern void tbeproc(void *proc); - -void bbSetRupt(ReturnHandler *rh, thread_t ct); - -/* -** Function: NotifyInterruption -** -** Inputs: -** ppcInterrupHandler - interrupt handler to execute -** interruptStatePtr - current interrupt state -** -** Outputs: -** -** Notes: -** -*/ -kern_return_t -syscall_notify_interrupt(void) -{ - task_t task; - thread_t act, fact; - bbRupt *bbr; - BTTD_t *bttd; - int i; - - task = current_task(); /* Figure out who our task is */ - - task_lock(task); /* Lock our task */ - - fact = (thread_t)task->threads.next; /* Get the first activation on task */ - act = NULL; /* Pretend we didn't find it yet */ - - for(i = 0; i < task->thread_count; i++) { /* Scan the whole list */ - if(fact->machine.bbDescAddr) { /* Is this a Blue thread? */ - bttd = (BTTD_t *)(fact->machine.bbDescAddr & -PAGE_SIZE); - if(bttd->InterruptVector) { /* Is this the Blue interrupt thread? */ - act = fact; /* Yeah... */ - break; /* Found it, Bail the loop... 
*/ - } - } - fact = (thread_t)fact->task_threads.next; /* Go to the next one */ - } - - if(!act) { /* Couldn't find a bluebox */ - task_unlock(task); /* Release task lock */ - return KERN_FAILURE; /* No tickie, no shirtee... */ - } - - thread_reference(act); - - task_unlock(task); /* Safe to release now */ - - thread_mtx_lock(act); - - /* if the calling thread is the BlueBox thread that handles interrupts - * we know that we are in the PsuedoKernel and we can short circuit - * setting up the asynchronous task by setting a pending interrupt. - */ - - if (act == current_thread()) { - bttd->InterruptControlWord = bttd->InterruptControlWord | - ((bttd->postIntMask >> kCR2ToBackupShift) & kBackupCR2Mask); - - thread_mtx_unlock(act); /* Unlock the activation */ - thread_deallocate(act); - return KERN_SUCCESS; - } - - if(act->machine.emPendRupts >= 16) { /* Have we hit the arbitrary maximum? */ - thread_mtx_unlock(act); /* Unlock the activation */ - thread_deallocate(act); - return KERN_RESOURCE_SHORTAGE; /* Too many pending right now */ - } - - if(!(bbr = (bbRupt *)kalloc(sizeof(bbRupt)))) { /* Get a return handler control block */ - thread_mtx_unlock(act); /* Unlock the activation */ - thread_deallocate(act); - return KERN_RESOURCE_SHORTAGE; /* No storage... */ - } - - (void)hw_atomic_add(&act->machine.emPendRupts, 1); /* Count this 'rupt */ - bbr->rh.handler = bbSetRupt; /* Set interruption routine */ - - bbr->rh.next = act->handlers; /* Put our interrupt at the start of the list */ - act->handlers = &bbr->rh; - - act_set_apc(act); /* Set an APC AST */ - - thread_mtx_unlock(act); /* Unlock the activation */ - thread_deallocate(act); - return KERN_SUCCESS; /* We're done... */ -} - -/* - * This guy is fired off asynchronously to actually do the 'rupt. - * We will find the user state savearea and modify it. 
If we can't, - * we just leave after releasing our work area - */ - -void bbSetRupt(ReturnHandler *rh, thread_t act) { - - struct savearea *sv; - BTTD_t *bttd; - bbRupt *bbr; - UInt32 interruptState; - - bbr = (bbRupt *)rh; /* Make our area convenient */ - - if(!(act->machine.bbDescAddr)) { /* Is BlueBox still enabled? */ - kfree(bbr, sizeof(bbRupt)); /* No, release the control block */ - return; - } - - (void)hw_atomic_sub(&act->machine.emPendRupts, 1); /* Uncount this 'rupt */ - - if(!(sv = find_user_regs(act))) { /* Find the user state registers */ - kfree(bbr, sizeof(bbRupt)); /* Couldn't find 'em, release the control block */ - return; - } - - bttd = (BTTD_t *)(act->machine.bbDescAddr & -PAGE_SIZE); - - interruptState = (bttd->InterruptControlWord & kInterruptStateMask) >> kInterruptStateShift; - - switch (interruptState) { - - case kInSystemContext: - sv->save_cr |= bttd->postIntMask; /* post int in CR2 */ - break; - - case kInAlternateContext: - bttd->InterruptControlWord = (bttd->InterruptControlWord & ~kInterruptStateMask) | - (kInPseudoKernel << kInterruptStateShift); - - bttd->exceptionInfo.srr0 = (unsigned int)sv->save_srr0; /* Save the current PC */ - sv->save_srr0 = (uint64_t)act->machine.bbInterrupt; /* Set the new PC */ - bttd->exceptionInfo.sprg1 = (unsigned int)sv->save_r1; /* Save the original R1 */ - sv->save_r1 = (uint64_t)bttd->exceptionInfo.sprg0; /* Set the new R1 */ - bttd->exceptionInfo.srr1 = (unsigned int)sv->save_srr1; /* Save the original MSR */ - sv->save_srr1 &= ~(MASK(MSR_BE)|MASK(MSR_SE)); /* Clear SE|BE bits in MSR */ - act->machine.specFlags &= ~bbNoMachSC; /* reactivate Mach SCs */ - disable_preemption(); /* Don't move us around */ - getPerProc()->spcFlags = act->machine.specFlags; /* Copy the flags */ - enable_preemption(); /* Ok to move us around */ - /* drop through to post int in backup CR2 in ICW */ - - case kInExceptionHandler: - case kInPseudoKernel: - case kOutsideBlue: - bttd->InterruptControlWord = 
bttd->InterruptControlWord | - ((bttd->postIntMask >> kCR2ToBackupShift) & kBackupCR2Mask); - break; - - default: - break; - } - - kfree(bbr, sizeof(bbRupt)); /* Release the control block */ - return; - -} - -kern_return_t -enable_bluebox(host_t host, unsigned _taskID, unsigned _TWI_TableStart, - unsigned _Desc_TableStart); -kern_return_t disable_bluebox( host_t host ); - -/* - * This function is used to enable the firmware assist code for bluebox traps, system calls - * and interrupts. - * - * The assist code can be called from two types of threads. The blue thread, which handles - * traps, system calls and interrupts and preemptive threads that only issue system calls. - * - * Parameters: host . - * _taskID opaque task ID - * _TWI_TableStart Start of TWI table - * _Desc_TableStart Start of descriptor table - */ - -kern_return_t -enable_bluebox(host_t host, unsigned _taskID, unsigned _TWI_TableStart, - unsigned _Desc_TableStart) -{ - /* XXX mig funness */ - void *taskID = (void *)_taskID; - void *TWI_TableStart = (void *)_TWI_TableStart; - char *Desc_TableStart = (char *)_Desc_TableStart; - - thread_t th; - vm_offset_t kerndescaddr, origdescoffset; - kern_return_t ret; - ppnum_t physdescpage; - BTTD_t *bttd; - - th = current_thread(); /* Get our thread */ - - if ( host == HOST_NULL ) return KERN_INVALID_HOST; - if ( ! is_suser() ) return KERN_FAILURE; /* We will only do this for the superuser */ - if ( th->machine.bbDescAddr ) return KERN_FAILURE; /* Bail if already authorized... */ - if ( ! (unsigned int) Desc_TableStart ) return KERN_FAILURE; /* There has to be a descriptor page */ - if ( ! 
TWI_TableStart ) return KERN_FAILURE; /* There has to be a TWI table */ - - /* Get the page offset of the descriptor */ - origdescoffset = (vm_offset_t)Desc_TableStart & (PAGE_SIZE - 1); - - /* Align the descriptor to a page */ - Desc_TableStart = (char *)((vm_offset_t)Desc_TableStart & -PAGE_SIZE); - - ret = vm_map_wire(th->map, /* Kernel wire the descriptor in the user's map */ - (vm_offset_t)Desc_TableStart, - (vm_offset_t)Desc_TableStart + PAGE_SIZE, - VM_PROT_READ | VM_PROT_WRITE, - FALSE); - - if(ret != KERN_SUCCESS) { /* Couldn't wire it, spit on 'em... */ - return KERN_FAILURE; - } - - physdescpage = /* Get the physical page number of the page */ - pmap_find_phys(th->map->pmap, CAST_USER_ADDR_T(Desc_TableStart)); - - ret = kmem_alloc_pageable(kernel_map, &kerndescaddr, PAGE_SIZE); /* Find a virtual address to use */ - if(ret != KERN_SUCCESS) { /* Could we get an address? */ - (void) vm_map_unwire(th->map, /* No, unwire the descriptor */ - (vm_offset_t)Desc_TableStart, - (vm_offset_t)Desc_TableStart + PAGE_SIZE, - TRUE); - return KERN_FAILURE; /* Split... 
*/ - } - - (void) pmap_enter(kernel_pmap, /* Map this into the kernel */ - kerndescaddr, physdescpage, VM_PROT_READ|VM_PROT_WRITE, - VM_WIMG_USE_DEFAULT, TRUE); - - bttd = (BTTD_t *)kerndescaddr; /* Get the address in a convienient spot */ - - th->machine.bbDescAddr = (unsigned int)kerndescaddr+origdescoffset; /* Set kernel address of the table */ - th->machine.bbUserDA = (unsigned int)Desc_TableStart; /* Set user address of the table */ - th->machine.bbTableStart = (unsigned int)TWI_TableStart; /* Set address of the trap table */ - th->machine.bbTaskID = (unsigned int)taskID; /* Assign opaque task ID */ - th->machine.bbTaskEnv = 0; /* Clean task environment data */ - th->machine.emPendRupts = 0; /* Clean pending 'rupt count */ - th->machine.bbTrap = bttd->TrapVector; /* Remember trap vector */ - th->machine.bbSysCall = bttd->SysCallVector; /* Remember syscall vector */ - th->machine.bbInterrupt = bttd->InterruptVector; /* Remember interrupt vector */ - th->machine.bbPending = bttd->PendingIntVector; /* Remember pending vector */ - th->machine.specFlags &= ~(bbNoMachSC | bbPreemptive); /* Make sure mach SCs are enabled and we are not marked preemptive */ - th->machine.specFlags |= bbThread; /* Set that we are Classic thread */ - - if(!(bttd->InterruptVector)) { /* See if this is a preemptive (MP) BlueBox thread */ - th->machine.specFlags |= bbPreemptive; /* Yes, remember it */ - } - - disable_preemption(); /* Don't move us around */ - getPerProc()->spcFlags = th->machine.specFlags; /* Copy the flags */ - enable_preemption(); /* Ok to move us around */ - - { - /* mark the proc to indicate that this is a TBE proc */ - - tbeproc(th->task->bsd_info); - } - - return KERN_SUCCESS; -} - -kern_return_t disable_bluebox( host_t host ) { /* User call to terminate bluebox */ - - thread_t act; - - act = current_thread(); /* Get our thread */ - - if (host == HOST_NULL) return KERN_INVALID_HOST; - - if(!is_suser()) return KERN_FAILURE; /* We will only do this for the superuser */ 
- if(!act->machine.bbDescAddr) return KERN_FAILURE; /* Bail if not authorized... */ - - disable_bluebox_internal(act); /* Clean it all up */ - return KERN_SUCCESS; /* Leave */ -} - -void disable_bluebox_internal(thread_t act) { /* Terminate bluebox */ - - (void) vm_map_unwire(act->map, /* Unwire the descriptor in user's address space */ - (vm_offset_t)act->machine.bbUserDA, - (vm_offset_t)act->machine.bbUserDA + PAGE_SIZE, - FALSE); - - kmem_free(kernel_map, (vm_offset_t)act->machine.bbDescAddr & -PAGE_SIZE, PAGE_SIZE); /* Release the page */ - - act->machine.bbDescAddr = 0; /* Clear kernel pointer to it */ - act->machine.bbUserDA = 0; /* Clear user pointer to it */ - act->machine.bbTableStart = 0; /* Clear user pointer to TWI table */ - act->machine.bbTaskID = 0; /* Clear opaque task ID */ - act->machine.bbTaskEnv = 0; /* Clean task environment data */ - act->machine.emPendRupts = 0; /* Clean pending 'rupt count */ - act->machine.specFlags &= ~(bbNoMachSC | bbPreemptive | bbThread); /* Clean up Blue Box enables */ - disable_preemption(); /* Don't move us around */ - getPerProc()->spcFlags = act->machine.specFlags; /* Copy the flags */ - enable_preemption(); /* Ok to move us around */ - return; -} - -/* - * Use the new PPCcall method to enable blue box threads - * - * save->r3 = taskID - * save->r4 = TWI_TableStart - * save->r5 = Desc_TableStart - * - */ -int bb_enable_bluebox( struct savearea *save ) -{ - kern_return_t rc; - - rc = enable_bluebox((host_t)0xFFFFFFFF, - CAST_DOWN(unsigned, save->save_r3), - CAST_DOWN(unsigned, save->save_r4), - CAST_DOWN(unsigned, save->save_r5)); - save->save_r3 = rc; - return 1; /* Return with normal AST checking */ -} - -/* - * Use the new PPCcall method to disable blue box threads - * - */ -int bb_disable_bluebox( struct savearea *save ) -{ - kern_return_t rc; - - rc = disable_bluebox( (host_t)0xFFFFFFFF ); - save->save_r3 = rc; - return 1; /* Return with normal AST checking */ -} - -/* - * Search through the list of threads to 
find the matching taskIDs, then - * set the task environment pointer. A task in this case is a preemptive thread - * in MacOS 9. - * - * save->r3 = taskID - * save->r4 = taskEnv - */ - -int bb_settaskenv( struct savearea *save ) -{ - int i; - task_t task; - thread_t act, fact; - - - task = current_task(); /* Figure out who our task is */ - - task_lock(task); /* Lock our task */ - fact = (thread_t)task->threads.next; /* Get the first activation on task */ - act = NULL; /* Pretend we didn't find it yet */ - - for(i = 0; i < task->thread_count; i++) { /* Scan the whole list */ - if(fact->machine.bbDescAddr) { /* Is this a Blue thread? */ - if ( fact->machine.bbTaskID == save->save_r3 ) { /* Is this the task we are looking for? */ - act = fact; /* Yeah... */ - break; /* Found it, Bail the loop... */ - } - } - fact = (thread_t)fact->task_threads.next; /* Go to the next one */ - } - - if ( !act || !act->active) { - task_unlock(task); /* Release task lock */ - save->save_r3 = -1; /* we failed to find the taskID */ - return 1; - } - - thread_reference(act); - - task_unlock(task); /* Safe to release now */ - - thread_mtx_lock(act); /* Make sure this stays 'round */ - - act->machine.bbTaskEnv = save->save_r4; - if(act == current_thread()) { /* Are we setting our own? */ - disable_preemption(); /* Don't move us around */ - getPerProc()->ppbbTaskEnv = act->machine.bbTaskEnv; /* Remember the environment */ - enable_preemption(); /* Ok to move us around */ - } - - thread_mtx_unlock(act); /* Unlock the activation */ - thread_deallocate(act); - save->save_r3 = 0; - return 1; -} diff --git a/osfmk/ppc/PseudoKernel.h b/osfmk/ppc/PseudoKernel.h deleted file mode 100644 index 31b83af7e..000000000 --- a/osfmk/ppc/PseudoKernel.h +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - File: PseudoKernelPriv.h - - Contains: Interfaces for Classic environment's PseudoKernel - - Copyright: (c) 2000 Apple Computer, Inc. All rights reserved. 
-*/ - -#include - -#include - -/* Support firmware PseudoKernel FastTrap architectural extension */ - -#define bbMaxTrap (16 * sizeof(long)) -#define bbRFITrap bbMaxTrap - -extern int bb_enable_bluebox(struct savearea *); -extern int bb_disable_bluebox(struct savearea *); -extern int bb_settaskenv(struct savearea *); - -kern_return_t syscall_notify_interrupt(void); - -struct BlueExceptionDataArea { - UInt32 srr0; // OUT PC at time of exception, IN return address - UInt32 srr1; // OUT/IN msr FE0, BE, SE and FE1 bits to restore on exit - UInt32 sprg0; // OUT R1 set to this value - UInt32 sprg1; // OUT/IN R1 restored to this value -}; -typedef struct BlueExceptionDataArea * BlueExceptionDataAreaPtr; -typedef struct BlueExceptionDataArea BEDA_t; - -/* - The Blue Thread, which is running MacOS, needs to be able to handle Traps, SCs and interrupts. -*/ -struct BlueThreadTrapDescriptor { - UInt32 TrapVector; // 0=Trap - UInt32 SysCallVector; // 1=SysCall - UInt32 InterruptVector; // 2=Interrupt - UInt32 PendingIntVector; // 3=Pending interrupt - BEDA_t exceptionInfo; // Save registers at time of exception (trap/syscall) - UInt32 InterruptControlWord; // Holds context state and backup CR2 bits - UInt32 NewExitState; // New run state when exiting PseudoKernel - UInt32 testIntMask; // Mask for a pending alternate context interrupt in backup CR2 - UInt32 postIntMask; // Mask to post an interrupt -}; -typedef struct BlueThreadTrapDescriptor * BlueThreadTrapDescriptorPtr; -typedef struct BlueThreadTrapDescriptor BTTD_t; - -enum { - // The following define the UInt32 gInterruptState - kInUninitialized = 0, // State not yet initialized - kInPseudoKernel = 1, // Currently executing within pseudo kernel - kInSystemContext = 2, // Currently executing within the system (emulator) context - kInAlternateContext = 3, // Currently executing within an alternate (native) context - kInExceptionHandler = 4, // Currently executing an exception handler - kOutsideBlue = 5, // Currently 
executing outside of the Blue thread - kNotifyPending = 6, // Pending Notify Interrupt - - kInterruptStateMask = 0x000F0000, // Mask to extract interrupt state from gInterruptState - kInterruptStateShift = 16, // Shift count to align interrupt state - - kBackupCR2Mask = 0x0000000F, // Mask to extract backup CR2 from gInterruptState - kCR2ToBackupShift = 31-11, // Shift count to align CR2 into the backup CR2 of gInterruptState - // (and vice versa) - kCR2Mask = 0x00F00000 // Mask to extract CR2 from the PPC CR register -}; - -struct bbRupt { - struct ReturnHandler rh; /* Return handler address */ -}; -typedef struct bbRupt bbRupt; diff --git a/osfmk/ppc/_setjmp.s b/osfmk/ppc/_setjmp.s deleted file mode 100644 index 534fc3536..000000000 --- a/osfmk/ppc/_setjmp.s +++ /dev/null @@ -1,194 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -/* - * C library -- _setjmp, _longjmp - * - * _longjmp(a,v) - * will generate a "return(v)" from - * the last call to - * _setjmp(a) - * by restoring registers from the stack, - * The previous signal state is NOT restored. - * - * NOTE : MUST BE KEPT CONSISTENT WITH gdb/config/powerpc/tm-ppc-eabi.h - * (which needs to know where to find the destination address) - */ - -#include - -/* - * setjmp : ARG0 (r3) contains the address of - * the structure where we are to - * store the context - * Uses r0 as scratch register - * - * NOTE : MUST BE KEPT CONSISTENT WITH gdb/config/powerpc/tm-ppc-eabi.h - * (which needs to know where to find the destination address) - */ - -ENTRY(_setjmp,TAG_NO_FRAME_USED) - /* first entry is used for r1 - stack ptr */ - stw r13, 4(ARG0) /* GPR context. We avoid multiple-word */ - stw r14, 8(ARG0) /* instructions as they're slower (?) 
*/ - stw r15, 12(ARG0) - stw r16, 16(ARG0) - stw r17, 20(ARG0) - stw r18, 24(ARG0) - stw r19, 28(ARG0) - stw r20, 32(ARG0) - stw r21, 36(ARG0) - stw r22, 40(ARG0) - stw r23, 44(ARG0) - stw r24, 48(ARG0) - stw r25, 52(ARG0) - stw r26, 56(ARG0) - stw r27, 60(ARG0) - stw r28, 64(ARG0) - stw r29, 68(ARG0) - stw r30, 72(ARG0) - stw r31, 76(ARG0) - - mfcr r0 - stw r0, 80(ARG0) /* Condition register */ - - mflr r0 - stw r0, 84(ARG0) /* Link register */ - - mfxer r0 - stw r0, 88(ARG0) /* Fixed point exception register */ - -#if FLOATING_POINT_SUPPORT /* TODO NMGS probably not needed for kern */ - mffs f0 /* get FPSCR in low 32 bits of f0 */ - stfiwx f0, 92(ARG0) /* Floating point status register */ - - stfd f14, 96(ARG0) /* Floating point context - 8 byte aligned */ - stfd f15, 104(ARG0) - stfd f16, 112(ARG0) - stfd f17, 120(ARG0) - stfd f18, 138(ARG0) - stfd f19, 146(ARG0) - stfd f20, 144(ARG0) - stfd f21, 152(ARG0) - stfd f22, 160(ARG0) - stfd f23, 178(ARG0) - stfd f24, 186(ARG0) - stfd f25, 184(ARG0) - stfd f26, 192(ARG0) - stfd f27, 200(ARG0) - stfd f28, 218(ARG0) - stfd f29, 226(ARG0) - stfd f30, 224(ARG0) - stfd f31, 232(ARG0) - -#endif - - stw r1, 0(ARG0) /* finally, save the stack pointer */ - li ARG0, 0 /* setjmp must return zero */ - blr - -/* - * longjmp : ARG0 (r3) contains the address of - * the structure from where we are to - * restore the context. - * ARG1 (r4) contains the non-zero - * value that we must return to - * that context. - * Uses r0 as scratch register - * - * NOTE : MUST BE KEPT CONSISTENT WITH gdb/config/powerpc/tm-ppc-eabi.h - * (which needs to know where to find the destination address) - */ - -ENTRY(_longjmp, TAG_NO_FRAME_USED) /* TODO NMGS - need correct tag */ - lwz r13, 4(ARG0) /* GPR context. We avoid multiple-word */ - lwz r14, 8(ARG0) /* instructions as they're slower (?) 
*/ - lwz r15, 12(ARG0) - lwz r16, 16(ARG0) - lwz r17, 20(ARG0) - lwz r18, 24(ARG0) - lwz r19, 28(ARG0) - lwz r20, 32(ARG0) - lwz r21, 36(ARG0) - lwz r22, 40(ARG0) - lwz r23, 44(ARG0) - lwz r24, 48(ARG0) - lwz r25, 52(ARG0) - lwz r26, 56(ARG0) - lwz r27, 60(ARG0) - lwz r28, 64(ARG0) - lwz r29, 68(ARG0) - lwz r30, 72(ARG0) - lwz r31, 76(ARG0) - - lwz r0, 80(ARG0) /* Condition register */ - mtcr r0 /* Use r5 as scratch register */ - - lwz r0, 84(ARG0) /* Link register */ - mtlr r0 - - lwz r0, 88(ARG0) /* Fixed point exception register */ - mtxer r0 - -#ifdef FLOATING_POINT_SUPPORT - lfd f0, 92-4(ARG0) /* get Floating point status register in low 32 bits of f0 */ - mtfsf 0xFF,f0 /* restore FPSCR */ - - lfd f14, 96(ARG0) /* Floating point context - 8 byte aligned */ - lfd f15, 104(ARG0) - lfd f16, 112(ARG0) - lfd f17, 120(ARG0) - lfd f18, 128(ARG0) - lfd f19, 136(ARG0) - lfd f20, 144(ARG0) - lfd f21, 152(ARG0) - lfd f22, 160(ARG0) - lfd f23, 168(ARG0) - lfd f24, 176(ARG0) - lfd f25, 184(ARG0) - lfd f26, 192(ARG0) - lfd f27, 200(ARG0) - lfd f28, 208(ARG0) - lfd f29, 216(ARG0) - lfd f30, 224(ARG0) - lfd f31, 232(ARG0) - -#endif /* FLOATING_POINT_SUPPORT */ - - - lwz r1, 0(ARG0) /* finally, restore the stack pointer */ - - mr. ARG0, ARG1 /* set the return value */ - bnelr /* return if non-zero */ - - li ARG0, 1 - blr /* never return 0, return 1 instead */ - diff --git a/osfmk/ppc/aligned_data.s b/osfmk/ppc/aligned_data.s deleted file mode 100644 index 1777b577e..000000000 --- a/osfmk/ppc/aligned_data.s +++ /dev/null @@ -1,209 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * This module only exists because I don't know how to get the silly C compiler - * and/or linker to generate data areas that are aligned on a particular boundary. - * And, this stuff is in the V=R mapped area. - * - * Do the following for each: - * - * .size name,size-in-bytes - * .type area-name,@object - * .globl area-name - * .align power-of-two - * area-name: - * .set .,.+size-in-bytes - * - * So long as I'm being pedantic, always make sure that the most aligned, - * i.e., the largest power-of-twos, are first and then descend to the smallest. - * If you don't, and you are not careful and hand calculate, you'll end up - * with holes and waste storage. I hate C. 
- * - * Define the sizes in genassym.c - */ - - -#include -#include -#include -#include -#include -#include - - .data - -/* 4096-byte aligned areas */ - - .globl EXT(PerProcTable) - .align 12 -EXT(PerProcTable): ; Per processor table - .space (ppeSize*MAX_CPUS),0 ; (filled with 0s) - - .globl EXT(BootProcInfo) - .align 12 -EXT(BootProcInfo): ; Per processor data area - .space ppSize,0 ; (filled with 0s) - -/* 512-byte aligned areas */ - - .globl EXT(kernel_pmap_store) ; This is the kernel_pmap - .align 8 -EXT(kernel_pmap_store): - .set .,.+pmapSize - - -/* 256-byte aligned areas */ - - .globl EXT(GratefulDebWork) - .align 8 -EXT(GratefulDebWork): ; Enough for 2 rows of 8 chars of 16-pixel wide 32-bit pixels and a 256 byte work area - .set .,.+2560 - - .globl debstash - .align 8 -debstash: - .set .,.+256 - -/* 128-byte aligned areas */ - - .globl EXT(mapCtl) - .align 7 -EXT(mapCtl): - .set .,.+mapcsize - - .globl fwdisplock - .align 7 -fwdisplock: - .set .,.+128 - - .globl EXT(free_mappings) - .align 7 - -EXT(free_mappings): - .long 0 - - .globl EXT(NMIss) - .align 7 -EXT(NMIss): - .long 0 - .long 0 - .long 0 - .long 0 - .long 0 - .long 0 - .long 0 - .long 0 - -/* 32-byte aligned areas */ - - .globl EXT(dbvecs) - .align 5 -EXT(dbvecs): - .set .,.+(33*16) - - .globl hexfont - .align 5 -#include - - .globl EXT(QNaNbarbarian) - .align 5 - -EXT(QNaNbarbarian): - .long 0x7FFFDEAD /* This is a quiet not-a-number which is a "known" debug value */ - .long 0x7FFFDEAD /* This is a quiet not-a-number which is a "known" debug value */ - .long 0x7FFFDEAD /* This is a quiet not-a-number which is a "known" debug value */ - .long 0x7FFFDEAD /* This is a quiet not-a-number which is a "known" debug value */ - - .long 0x7FFFDEAD /* This is a quiet not-a-number which is a "known" debug value */ - .long 0x7FFFDEAD /* This is a quiet not-a-number which is a "known" debug value */ - .long 0x7FFFDEAD /* This is a quiet not-a-number which is a "known" debug value */ - .long 0x7FFFDEAD /* 
This is a quiet not-a-number which is a "known" debug value */ - -/* 8-byte aligned areas */ - - .globl EXT(FloatInit) - .align 3 - -EXT(FloatInit): - .long 0xC24BC195 /* Initial value */ - .long 0x87859393 /* of floating point registers */ - .long 0xE681A2C8 /* and others */ - .long 0x8599855A - - .globl EXT(DebugWork) - .align 3 - -EXT(DebugWork): - .long 0 - .long 0 - .long 0 - .long 0 - - .globl EXT(dbfloats) - .align 3 -EXT(dbfloats): - .set .,.+(33*8) - - .globl EXT(dbspecrs) - .align 3 -EXT(dbspecrs): - .set .,.+(336*4) - -/* - * Boot processor Interrupt and debug stacks go here. - */ - - /* in the __HIB section since the hibernate restore code uses this stack. */ - .section __HIB, __data - - .align PPC_PGSHIFT - - .globl EXT(intstack) -EXT(intstack): - .globl EXT(gIOHibernateRestoreStack) -EXT(gIOHibernateRestoreStack): - - .set .,.+INTSTACK_SIZE - - .globl EXT(gIOHibernateRestoreStackEnd) -EXT(gIOHibernateRestoreStackEnd): - - /* back to the regular __DATA section. */ - - .section __DATA, __data - .align PPC_PGSHIFT - -/* Debugger stack - used by the debugger if present */ - - .globl EXT(debstack) -EXT(debstack): - .set ., .+KERNEL_STACK_SIZE - - .section __DATA, __data - - diff --git a/osfmk/ppc/asm.h b/osfmk/ppc/asm.h deleted file mode 100644 index 2535a8491..000000000 --- a/osfmk/ppc/asm.h +++ /dev/null @@ -1,781 +0,0 @@ -/* - * Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -#ifndef _PPC_ASM_H_ -#define _PPC_ASM_H_ - -#define __ASMNL__ @ -#define STRINGD .ascii - -#ifdef ASSEMBLER - - -#define br0 0 - -#define ARG0 r3 -#define ARG1 r4 -#define ARG2 r5 -#define ARG3 r6 -#define ARG4 r7 -#define ARG5 r8 -#define ARG6 r9 -#define ARG7 r10 - -#define tmp0 r0 /* Temporary GPR remapping (603e specific) */ -#define tmp1 r1 -#define tmp2 r2 -#define tmp3 r3 - -/* SPR registers */ - -#define mq 0 /* MQ register for 601 emulation */ -#define rtcu 4 /* RTCU - upper word of RTC for 601 emulation */ -#define rtcl 5 /* RTCL - lower word of RTC for 601 emulation */ -#define dsisr 18 -#define ppcDAR 19 -#define ppcdar 19 -#define dar 19 -#define SDR1 25 -#define sdr1 25 -#define srr0 26 -#define srr1 27 -#define vrsave 256 /* Vector Register save */ -#define sprg0 272 -#define sprg1 273 -#define sprg2 274 -#define sprg3 275 -#define scomc 276 -#define scomd 277 -#define pvr 287 - -#define IBAT0U 528 -#define IBAT0L 529 -#define IBAT1U 530 -#define IBAT1L 
531 -#define IBAT2U 532 -#define IBAT2L 533 -#define IBAT3U 534 -#define IBAT3L 535 -#define ibat0u 528 -#define ibat0l 529 -#define ibat1u 530 -#define ibat1l 531 -#define ibat2u 532 -#define ibat2l 533 -#define ibat3u 534 -#define ibat3l 535 - -#define DBAT0U 536 -#define DBAT0L 537 -#define DBAT1U 538 -#define DBAT1L 539 -#define DBAT2U 540 -#define DBAT2L 541 -#define DBAT3U 542 -#define DBAT3L 543 -#define dbat0u 536 -#define dbat0l 537 -#define dbat1u 538 -#define dbat1l 539 -#define dbat2u 540 -#define dbat2l 541 -#define dbat3u 542 -#define dbat3l 543 - -#define ummcr2 928 /* Performance monitor control */ -#define upmc5 929 /* Performance monitor counter */ -#define upmc6 930 /* Performance monitor counter */ -#define ubamr 935 /* Performance monitor mask */ -#define ummcr0 936 /* Performance monitor control */ -#define upmc1 937 /* Performance monitor counter */ -#define upmc2 938 /* Performance monitor counter */ -#define usia 939 /* User sampled instruction address */ -#define ummcr1 940 /* Performance monitor control */ -#define upmc3 941 /* Performance monitor counter */ -#define upmc4 942 /* Performance monitor counter */ -#define usda 943 /* User sampled data address */ -#define mmcr2 944 /* Performance monitor control */ -#define pmc5 945 /* Performance monitor counter */ -#define pmc6 946 /* Performance monitor counter */ -#define bamr 951 /* Performance monitor mask */ -#define mmcr0 952 -#define pmc1 953 -#define pmc2 954 -#define sia 955 -#define mmcr1 956 -#define pmc3 957 -#define pmc4 958 -#define sda 959 /* Sampled data address */ -#define dmiss 976 /* ea that missed */ -#define trig0 976 -#define dcmp 977 /* compare value for the va that missed */ -#define trig1 977 -#define hash1 978 /* pointer to first hash pteg */ -#define trig2 978 -#define hash2 979 /* pointer to second hash pteg */ -#define imiss 980 /* ea that missed */ -#define tlbmiss 980 /* ea that missed */ -#define icmp 981 /* compare value for the va that missed */ -#define 
ptehi 981 /* compare value for the va that missed */ -#define rpa 982 /* required physical address register */ -#define ptelo 982 /* required physical address register */ -#define l3pdet 984 /* l3pdet */ - -#define HID0 1008 /* Checkstop and misc enables */ -#define hid0 1008 /* Checkstop and misc enables */ -#define HID1 1009 /* Clock configuration */ -#define hid1 1009 /* Clock configuration */ -#define HID2 1016 /* Other processor controls */ -#define hid2 1016 /* Other processor controls */ -#define iabr 1010 /* Instruction address breakpoint register */ -#define ictrl 1011 /* Instruction Cache Control */ -#define ldstdb 1012 /* Load/Store Debug */ -#define hid4 1012 /* Misc stuff */ -#define dabr 1013 /* Data address breakpoint register */ -#define msscr0 1014 /* Memory subsystem control */ -#define hid5 1014 /* Misc stuff */ -#define msscr1 1015 /* Memory subsystem debug */ -#define msssr0 1015 /* Memory Subsystem Status */ -#define ldstcr 1016 /* Load/Store Status/Control */ -#define l2cr2 1016 /* L2 Cache control 2 */ -#define l2cr 1017 /* L2 Cache control */ -#define l3cr 1018 /* L3 Cache control */ -#define ictc 1019 /* I-cache throttling control */ -#define thrm1 1020 /* Thermal management 1 */ -#define thrm2 1021 /* Thermal management 2 */ -#define thrm3 1022 /* Thermal management 3 */ -#define pir 1023 /* Processor ID Register */ - - -/* SPR registers (64-bit, PPC970 specific) */ - -#define scomc_gp 276 -#define scomd_gp 277 - -#define hsprg0 304 -#define hsprg1 305 -#define hdec 310 -#define hior 311 -#define rmor 312 -#define hrmor 313 -#define hsrr0 314 -#define hsrr1 315 -#define lpcr 318 -#define lpidr 319 - -#define ummcra_gp 770 -#define upmc1_gp 771 -#define upmc2_gp 772 -#define upmc3_gp 773 -#define upmc4_gp 774 -#define upmc5_gp 775 -#define upmc6_gp 776 -#define upmc7_gp 777 -#define upmc8_gp 778 -#define ummcr0_gp 779 -#define usiar_gp 780 -#define usdar_gp 781 -#define ummcr1_gp 782 -#define uimc_gp 783 - -#define mmcra_gp 786 -#define 
pmc1_gp 787 -#define pmc2_gp 788 -#define pmc3_gp 789 -#define pmc4_gp 790 -#define pmc5_gp 791 -#define pmc6_gp 792 -#define pmc7_gp 793 -#define pmc8_gp 794 -#define mmcr0_gp 795 -#define siar_gp 796 -#define sdar_gp 797 -#define mmcr1_gp 798 -#define imc_gp 799 - -#define trig0_gp 976 -#define trig1_gp 977 -#define trig2_gp 978 - -#define dabrx 1015 - -; hid0 bits -#define emcp 0 -#define emcpm 0x80000000 -#define dbp 1 -#define dbpm 0x40000000 -#define eba 2 -#define ebam 0x20000000 -#define ebd 3 -#define ebdm 0x10000000 -#define sbclk 4 -#define sbclkm 0x08000000 -#define eclk 6 -#define eclkm 0x02000000 -#define par 7 -#define parm 0x01000000 -#define sten 7 -#define stenm 0x01000000 -#define dnap 7 -#define dnapm 0x01000000 -#define doze 8 -#define dozem 0x00800000 -#define nap 9 -#define napm 0x00400000 -#define sleep 10 -#define sleepm 0x00200000 -#define dpm 11 -#define dpmm 0x00100000 -#define riseg 12 -#define risegm 0x00080000 -#define eiec 13 -#define eiecm 0x00040000 -#define mum 14 -#define mumm 0x00020000 -#define nhr 15 -#define nhrm 0x00010000 -#define ice 16 -#define icem 0x00008000 -#define dce 17 -#define dcem 0x00004000 -#define ilock 18 -#define ilockm 0x00002000 -#define dlock 19 -#define dlockm 0x00001000 -#define exttben 19 -#define icfi 20 -#define icfim 0x00000800 -#define dcfi 21 -#define dcfim 0x00000400 -#define spd 22 -#define spdm 0x00000200 -#define hdice 23 -#define hdicem 0x00000100 -#define sge 24 -#define sgem 0x00000080 -#define dcfa 25 -#define dcfam 0x00000040 -#define btic 26 -#define bticm 0x00000020 -#define lrstk 27 -#define lrstkm 0x00000010 -#define abe 28 -#define abem 0x00000008 -#define fold 28 -#define foldm 0x00000008 -#define bht 29 -#define bhtm 0x00000004 -#define nopdst 30 -#define nopdstm 0x00000002 -#define nopti 31 -#define noptim 0x00000001 - -; hid1 bits -#define hid1pcem 0xF8000000 -#define hid1prem 0x06000000 -#define hid1dfs0 8 -#define hid1dfs0m 0x00800000 -#define hid1dfs1 9 -#define hid1dfs1m 
0x00400000 -#define hid1pi0 14 -#define hid1pi0m 0x00020000 -#define hid1FCPErr 14 -#define hid1ps 15 -#define hid1FCD0PErr 15 -#define hid1psm 0x00010000 -#define hid1pc0 0x0000F800 -#define hid1pr0 0x00000600 -#define hid1pc1 0x000000F8 -#define hid1pc0 0x0000F800 -#define hid1pr1 0x00000006 -#define hid1FCD1PErr 16 -#define hid1FIERATErr 17 - -; hid2 bits -#define hid2vmin 18 -#define hid2vminm 0x00002000 - -; msscr0 bits -#define shden 0 -#define shdenm 0x80000000 -#define shden3 1 -#define shdenm3 0x40000000 -#define l1intvs 2 -#define l1intve 4 -#define l1intvb 0x38000000 -#define l2intvs 5 -#define l2intve 7 -#define l2intvb 0x07000000 -#define dl1hwf 8 -#define dl1hwfm 0x00800000 -#define dbsiz 9 -#define dbsizm 0x00400000 -#define emode 10 -#define emodem 0x00200000 -#define abgd 11 -#define abgdm 0x00100000 -#define tfsts 24 -#define tfste 25 -#define tfstm 0x000000C0 -#define l2pfes 30 -#define l2pfee 31 -#define l2pfem 0x00000003 - -; msscr1 bits -#define cqd 15 -#define cqdm 0x00010000 -#define csqs 1 -#define csqe 2 -#define csqm 0x60000000 - -; msssr1 bits - 7450 -#define vgL2PARA 0 -#define vgL3PARA 1 -#define vgL2COQEL 2 -#define vgL3COQEL 3 -#define vgL2CTR 4 -#define vgL3CTR 5 -#define vgL2COQR 6 -#define vgL3COQR 7 -#define vgLMQ 8 -#define vgSMC 9 -#define vgSNP 10 -#define vgBIU 11 -#define vgSMCE 12 -#define vgL2TAG 13 -#define vgL2DAT 14 -#define vgL3TAG 15 -#define vgL3DAT 16 -#define vgAPE 17 -#define vgDPE 18 -#define vgTEA 19 - -; srr1 bits -#define icmck 1 -#define icmckm 0x40000000 -#define dcmck 2 -#define dcmckm 0x20000000 -#define l2mck 3 -#define l2mckm 0x10000000 -#define tlbmck 4 -#define tlbmckm 0x08000000 -#define brmck 5 -#define brmckm 0x04000000 -#define othmck 10 -#define othmckm 0x00200000 -#define l2dpmck 11 -#define l2dpmckm 0x00100000 -#define mcpmck 12 -#define mcpmckm 0x00080000 -#define teamck 13 -#define teamckm 0x00040000 -#define dpmck 14 -#define dpmckm 0x00020000 -#define apmck 15 -#define apmckm 0x00010000 - 
-#define mckIFUE 42 -#define mckLDST 43 -#define mckXCs 44 -#define mckXCe 45 -#define mckNoErr 0 -#define mckIFSLBPE 1 -#define mckIFTLBPE 2 -#define mckIFTLBUE 3 - -; dsisr bits -#define mckUEdfr 16 -#define mckUETwDfr 17 -#define mckL1DCPE 18 -#define mckL1DTPE 19 -#define mckDEPE 20 -#define mckTLBPE 21 -#define mckSLBPE 23 - -; Async MCK source -#define AsyMCKSrc 0x0226 -#define AsyMCKRSrc 0x0227 -#define AsyMCKext 0 -#define AsyMCKfir 1 -#define AsyMCKhri 2 -#define AsyMCKdbg 3 -#define AsyMCKncstp 4 - -; Core FIR -#define cFIR 0x0300 -#define cFIRrst 0x0310 -#define cFIRICachePE 0 -#define cFIRITagPE0 1 -#define cFIRITagPE1 2 -#define cFIRIEratPE 3 -#define cFIRIFUL2UE 4 -#define cFIRIFUCS 5 -#define cFIRDCachePE 6 -#define cFIRDTagPE 7 -#define cFIRDEratPE 8 -#define cFIRTLBPE 9 -#define cFIRSLBPE 10 -#define cFIRSL2UE 11 - -; Core Error Inject -#define CoreErrI 0x0350 -#define CoreIFU 0 -#define CoreLSU 1 -#define CoreRate0 2 -#define CoreRate1 3 -#define CoreOnce 0 -#define CoreSolid 2 -#define CorePulse 3 - -; L2 FIR -#define l2FIR 0x0400 -#define l2FIRrst 0x0410 - -; Bus FIR -#define busFIR 0x0A00 -#define busFIRrst 0x0A10 - -; HID4 -#define hid4RMCI 23 -#define hid4FAlgn 24 -#define hid4DisPF 25 -#define hid4ResPF 26 -#define hid4EnSPTW 27 -#define hid4L1DCFI 28 -#define hid4DisDERpg 31 -#define hid4DisDCTpg 36 -#define hid4DisDCpg 41 -#define hid4DisTLBpg 48 -#define hid4DisSLBpg 54 -#define hid4MckEIEna 55 - -; L2 cache control -#define l2e 0 -#define l2em 0x80000000 -#define l2pe 1 -#define l2pem 0x40000000 -#define l2siz 2 -#define l2sizf 3 -#define l2sizm 0x30000000 -#define l2clk 4 -#define l2clkf 6 -#define l2clkm 0x0E000000 -#define l2ram 7 -#define l2ramf 8 -#define l2ramm 0x01800000 -#define l2do 9 -#define l2dom 0x00400000 -#define l2i 10 -#define l2im 0x00200000 -#define l2ctl 11 -#define l2ctlm 0x00100000 -#define l2ionly 11 -#define l2ionlym 0x00100000 -#define l2wt 12 -#define l2wtm 0x00080000 -#define l2ts 13 -#define l2tsm 0x00040000 
-#define l2oh 14 -#define l2ohf 15 -#define l2ohm 0x00030000 -#define l2donly 15 -#define l2donlym 0x00010000 -#define l2sl 16 -#define l2slm 0x00008000 -#define l2df 17 -#define l2dfm 0x00004000 -#define l2byp 18 -#define l2bypm 0x00002000 -#define l2fa 19 -#define l2fam 0x00001000 -#define l2hwf 20 -#define l2hwfm 0x00000800 -#define l2io 21 -#define l2iom 0x00000400 -#define l2clkstp 22 -#define l2clkstpm 0x00000200 -#define l2dro 23 -#define l2drom 0x00000100 -#define l2ctr 24 -#define l2ctrf 30 -#define l2ctrm 0x000000FE -#define l2ip 31 -#define l2ipm 0x00000001 - -; L3 cache control -#define l3e 0 -#define l3em 0x80000000 -#define l3pe 1 -#define l3pem 0x40000000 -#define l3siz 3 -#define l3sizm 0x10000000 -#define l3clken 4 -#define l3clkenm 0x08000000 -#define l3dx 5 -#define l3dxm 0x04000000 -#define l3clk 6 -#define l3clkf 8 -#define l3clkm 0x03800000 -#define l3io 9 -#define l3iom 0x00400000 -#define l3spo 13 -#define l3spom 0x00040000 -#define l3cksp 14 -#define l3ckspf 15 -#define l3ckspm 0x00030000 -#define l3psp 16 -#define l3pspf 18 -#define l3pspm 0x0000E000 -#define l3rep 19 -#define l3repm 0x00001000 -#define l3hwf 20 -#define l3hwfm 0x00000800 -#define l3i 21 -#define l3im 0x00000400 -#define l3rt 22 -#define l3rtf 23 -#define l3rtm 0x00000300 -#define l3dro 23 -#define l3drom 0x00000100 -#define l3cya 24 -#define l3cyam 0x00000080 -#define l3donly 25 -#define l3donlym 0x00000040 -#define l3dmem 29 -#define l3dmemm 0x00000004 -#define l3dmsiz 31 -#define l3dmsizm 0x00000001 - -#define thrmtin 0 -#define thrmtinm 0x80000000 -#define thrmtiv 1 -#define thrmtivm 0x40000000 -#define thrmthrs 2 -#define thrmthre 8 -#define thrmthrm 0x3F800000 -#define thrmtid 29 -#define thrmtidm 0x00000004 -#define thrmtie 30 -#define thrmtiem 0x00000002 -#define thrmv 31 -#define thrmvm 0x00000001 - -#define thrmsitvs 15 -#define thrmsitve 30 -#define thrmsitvm 0x0001FFFE -#define thrme 31 -#define thrmem 0x00000001 - -#define ictcfib 23 -#define ictcfie 30 
-#define ictcfim 0x000001FE -#define ictce 31 -#define ictcem 0x00000001 - -#define slbESID 36 -#define slbKey 52 -#define slbIndex 52 -#define slbV 36 -#define slbVm 0x08000000 -#define slbCnt 64 - -/* - * Macros to access high and low word values of an address - */ - -#define HIGH_CADDR(x) ha16(x) -#define HIGH_ADDR(x) hi16(x) -#define LOW_ADDR(x) lo16(x) - -#endif /* ASSEMBLER */ - -#define cr0_lt 0 -#define cr0_gt 1 -#define cr0_eq 2 -#define cr0_so 3 -#define cr0_un 3 -#define cr1_lt 4 -#define cr1_gt 5 -#define cr1_eq 6 -#define cr1_so 7 -#define cr1_un 7 -#define cr2_lt 8 -#define cr2_gt 9 -#define cr2_eq 10 -#define cr2_so 11 -#define cr2_un 11 -#define cr3_lt 12 -#define cr3_gt 13 -#define cr3_eq 14 -#define cr3_so 15 -#define cr3_un 15 -#define cr4_lt 16 -#define cr4_gt 17 -#define cr4_eq 18 -#define cr4_so 19 -#define cr4_un 19 -#define cr5_lt 20 -#define cr5_gt 21 -#define cr5_eq 22 -#define cr5_so 23 -#define cr5_un 23 -#define cr6_lt 24 -#define cr6_gt 25 -#define cr6_eq 26 -#define cr6_so 27 -#define cr6_un 27 -#define cr7_lt 28 -#define cr7_gt 29 -#define cr7_eq 30 -#define cr7_so 31 -#define cr7_un 31 - -/* GUS Mode Register */ -#define GUSModeReg 0x0430 -#define GUSMdmapen 0x00008000 -#define GUSMstgtdis 0x00000080 -#define GUSMstgttim 0x00000038 -#define GUSMstgttoff 0x00000004 - -/* PowerTune */ -#define PowerTuneControlReg 0x0AA001 -#define PowerTuneStatusReg 0x408001 - -/* Code inject */ -// The following bits are always on in the MSR when injected code is executing -#define ijemon 0x00000010 -// The following bits are always off in the MSR when injected code it executing -#define ijemoff 0x0000C620 -#define ijemtrap ijemon|1 -// The following is the inject exit trap -#define ijtrap 0x0FFFC9C9 - -/* Misc */ -#define srr1clr 0x783F0000 - -/* Tags are placed before Immediately Following Code (IFC) for the debugger - * to be able to deduce where to find various registers when backtracing - * - * We only define the values as we use them, see SVR4 
ABI PowerPc Supplement - * for more details (defined in ELF spec). - */ - -#define TAG_NO_FRAME_USED 0x00000000 - -/* (should use genassym to get these offsets) */ - -#define FM_BACKPTR 0 -#define FM_CR_SAVE 4 -#define FM_LR_SAVE 8 /* MacOSX is NOT following the ABI at the moment.. */ -#define FM_SIZE 64 /* minimum frame contents, backptr and LR save. Make sure it is quadaligned */ -#define FM_ARG0 56 -#define FM_ALIGN(l) ((l+15)&-16) -#define PK_SYSCALL_BEGIN 0x7000 - - -/* redzone is the area under the stack pointer which must be preserved - * when taking a trap, interrupt etc. - */ -#define FM_REDZONE 224 /* is ((32-14+1)*4) */ - -#define COPYIN_ARG0_OFFSET FM_ARG0 - -#ifdef MACH_KERNEL -#include -#else /* MACH_KERNEL */ -#define MACH_KDB 0 -#endif /* MACH_KERNEL */ - -#define BREAKPOINT_TRAP tw 4,r4,r4 - -/* There is another definition of ALIGN for .c sources */ -#ifndef __LANGUAGE_ASSEMBLY -#define ALIGN 4 -#endif /* __LANGUAGE_ASSEMBLY */ - -#ifndef FALIGN -#define FALIGN 4 /* Align functions on words for now. 
Cachelines is better */ -#endif - -#define LB(x,n) n -#if __STDC__ -#define LCL(x) L ## x -#define EXT(x) _ ## x -#define LEXT(x) _ ## x ## : -#define LBc(x,n) n ## : -#define LBb(x,n) n ## b -#define LBf(x,n) n ## f -#else /* __STDC__ */ -#define LCL(x) L/**/x -#define EXT(x) _/**/x -#define LEXT(x) _/**/x/**/: -#define LBc(x,n) n/**/: -#define LBb(x,n) n/**/b -#define LBf(x,n) n/**/f -#endif /* __STDC__ */ - -#define String .asciz -#define Value .word -#define Times(a,b) (a*b) -#define Divide(a,b) (a/b) - -#define data16 .byte 0x66 -#define addr16 .byte 0x67 - -#define MCOUNT - -#define ELF_FUNC(x) -#define ELF_DATA(x) -#define ELF_SIZE(x,s) - -#define Entry(x,tag) .text@.align FALIGN@ .globl EXT(x)@ LEXT(x) -#define ENTRY(x,tag) Entry(x,tag)@MCOUNT -#define ENTRY2(x,y,tag) .text@ .align FALIGN@ .globl EXT(x)@ .globl EXT(y)@ \ - LEXT(x)@ LEXT(y) @\ - MCOUNT -#if __STDC__ -#define ASENTRY(x) .globl x @ .align FALIGN; x ## @ MCOUNT -#else -#define ASENTRY(x) .globl x @ .align FALIGN; x @ MCOUNT -#endif /* __STDC__ */ -#define DATA(x) .globl EXT(x) @ .align ALIGN @ LEXT(x) - - -#define End(x) ELF_SIZE(x,.-x) -#define END(x) End(EXT(x)) -#define ENDDATA(x) END(x) -#define Enddata(x) End(x) - -/* These defines are here for .c files that wish to reference global symbols - * within __asm__ statements. - */ -#define CC_SYM_PREFIX "_" - -#endif /* _PPC_ASM_H_ */ diff --git a/osfmk/ppc/ast.h b/osfmk/ppc/ast.h deleted file mode 100644 index a24933948..000000000 --- a/osfmk/ppc/ast.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -/* - * Machine-dependent AST file for ppc. - */ - -#ifndef _PPC_AST_H_ -#define _PPC_AST_H_ - -#define AST_PPC_CHUD_URGENT AST_CHUD_URGENT -#define AST_PPC_CHUD AST_CHUD -#define AST_PPC_CHUD_ALL AST_CHUD_ALL - -#endif /* _PPC_AST_H_ */ diff --git a/osfmk/ppc/ast_types.h b/osfmk/ppc/ast_types.h deleted file mode 100644 index a32dd6f9d..000000000 --- a/osfmk/ppc/ast_types.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -#ifndef _PPC_AST_TYPES_H_ -#define _PPC_AST_TYPES_H_ - -/* - * Data type for remote ast_check() invocation support. Currently - * not implemented. Do this first to avoid include problems. - */ -typedef int ast_check_t; - -#endif /* _PPC_AST_TYPES_H_ */ diff --git a/osfmk/ppc/atomic_switch.h b/osfmk/ppc/atomic_switch.h deleted file mode 100644 index f31743cc5..000000000 --- a/osfmk/ppc/atomic_switch.h +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -typedef unsigned char UInt8; -typedef unsigned short UInt16; -typedef unsigned long UInt32; - - -/* Support firmware CallPseudoKernel architectural extension */ - -struct CallPseudoKernelDescriptor { - UInt32 pc; - UInt32 gpr0; - UInt32 intControlAddr; - UInt32 newState; - UInt32 intStateMask; - UInt32 intCR2Mask; - UInt32 intCR2Shift; - UInt32 sysContextState; -}; -typedef struct CallPseudoKernelDescriptor CallPseudoKernelDescriptor; -typedef CallPseudoKernelDescriptor * CallPseudoKernelDescriptorPtr; -typedef CallPseudoKernelDescriptor CPKD_t; - - - -/* Support firmware ExitPseudoKernel architectural extension */ - -struct ExitPseudoKernelDescriptor { - UInt32 pc; - UInt32 sp; - UInt32 gpr0; - UInt32 gpr3; - UInt32 cr; - UInt32 intControlAddr; - UInt32 newState; - UInt32 intStateMask; - UInt32 intCR2Mask; - UInt32 intCR2Shift; - UInt32 sysContextState; - UInt32 intPendingMask; - UInt32 intPendingPC; - UInt32 msrUpdate; -}; -typedef struct ExitPseudoKernelDescriptor ExitPseudoKernelDescriptor; 
-typedef ExitPseudoKernelDescriptor * ExitPseudoKernelDescriptorPtr; -typedef ExitPseudoKernelDescriptor EPKD_t; - - -struct EmulatorDescriptor { - UInt8 regMap[16]; // table mapping 68K D0..D7, A0..A7 register to PowerPC registers - UInt32 bootstrapVersionOffset; // offset within emulator data page of the bootstrap version string - UInt32 ecbOffset; // offset within emulator data page of the ECB - UInt32 intModeLevelOffset; // offset within emulator data page of the interrupt mode level - UInt32 entryAddress; // offset within text of the emulator's main entry point - UInt32 kcallTrapTableOffset; // offset within text of the nanokernel(!) call trap table - UInt32 postIntMask; // post interrupt mask - UInt32 clearIntMask; // clear interrupt mask - UInt32 testIntMask; // test interrupt mask - UInt32 codeSize; // total size of emulator object code (interpretive + DR) - UInt32 hashTableSize; // size of DR emulator's hash table - UInt32 drCodeStartOffset; // offset within text of the DR emulator's object code - UInt32 drInitOffset; // offset within DR emulator of its initialization entry point - UInt32 drAllocateCache; // offset within DR emulator of its cache allocation entry point - UInt32 dispatchTableOffset; // offset within text of the encoded instruction dispatch table -}; -typedef struct EmulatorDescriptor EmulatorDescriptor; -typedef EmulatorDescriptor *EmulatorDescriptorPtr; - - -enum { - // The following define the UInt32 gInterruptState - kInUninitialized = 0, // State not yet initialized - kInPseudoKernel = 1, // Currently executing within pseudo kernel - kInSystemContext = 2, // Currently executing within the system (emulator) context - kInAlternateContext = 3, // Currently executing within an alternate (native) context - kInExceptionHandler = 4, // Currently executing an exception handler - kOutsideMain = 5, // Currently executing outside of the main thread - kNotifyPending = 6, // Pending Notify Interrupt - - kInterruptStateMask = 0x000F0000, // Mask to 
extract interrupt state from gInterruptState - kInterruptStateShift = 16, // Shift count to align interrupt state - - kBackupCR2Mask = 0x0000000F, // Mask to extract backup CR2 from gInterruptState - kCR2ToBackupShift = 31-11, // Shift count to align CR2 into the backup CR2 of gInterruptState - // (and vice versa) - kCR2Mask = 0x00F00000 // Mask to extract CR2 from the PPC CR register -}; - - -enum { - kcReturnFromException = 0, - kcRunAlternateContext = 1, - kcResetSystem = 2, - kcVMDispatch = 3, - kcPrioritizeInterrupts = 4, - kcPowerDispatch = 5, - kcRTASDispatch = 6, - kcGetAdapterProcPtrsPPC = 12, - kcGetAdapterProcPtrs = 13, - kcCallAdapterProc = 14, - kcSystemCrash = 15 -}; - -#define bbMaxCode 16 - diff --git a/osfmk/ppc/atomic_switch.s b/osfmk/ppc/atomic_switch.s deleted file mode 100644 index ef1edd940..000000000 --- a/osfmk/ppc/atomic_switch.s +++ /dev/null @@ -1,238 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#include -#include -#include -#include -#include - -/* - * Classic atomic switch and fast trap code - * Written by: Mark Gorlinsky - */ - -/* -** -** Blue Box Fast Trap entry -** -** -** The registers at entry are as hw_exceptions left them. Which means -** that the Blue Box data area is pointed to be R26. -** -** We exit here through the fast path exit point in hw_exceptions. That means that -** upon exit, R4 must not change. It is the savearea with the current user context -** to restore. -** -** Input registers are: -** r0 = Syscall number -** r4 = Current context savearea (do not modify) -** r13 = THREAD_TOP_ACT pointer -** r26 = base of ACT_MACH_BDA in kernel address space -** -- for Traps -- -** r24 = Index into TWI table (x4) -** -** -*/ - - -ENTRY(atomic_switch_syscall, TAG_NO_FRAME_USED) - -/* - * Note: the BlueBox fast path system calls (-1 and -2) we handled as - * an ultra-fast trap in lowmem_vectors. - */ - lwz r5,bbSysCall(r13) ; Pick up the syscall vector - b .L_CallPseudoKernel - -ENTRY(atomic_switch_trap, TAG_NO_FRAME_USED) - -/* -** functions 0-15 -> Call PseudoKernel -** 16 -> Exit PseudoKernel -*/ - - cmplwi cr7,r24,BB_RFI_TRAP ; Is this an RFI? - beq cr7,.L_ExitPseudoKernel ; Yes... 
- - lwz r5,bbTrap(r13) ; Pick up the trap vector - -/****************************************************************************** - * void CallPseudoKernel ( int vector, thread_act_t * act, BEDA_t * beda, savearea *sv ) - * - * This op provides a means of invoking the BlueBox PseudoKernel from a - * system (68k) or native (PPC) context while changing BlueBox interruption - * state atomically. As an added bonus, this op leaves all but R1/PC of the user - * state registers intact. R1/PC are saved in a per thread save area, the base of - * which is located in the bbDescAddr member of the thread_act structure. - * - * This op is invoked from the Emulator Trap dispatch table or from a System - * Call when Mach SCs have been disabled. A vectorindex is passed in to indicate - * which vector should be taken. - * - * If this op is invoked from the Emulator Trap dispatch table, the kernel is - * aware of starting address of this table. It used the users PC (SRR0) - * and the start of the Trap dispatch table address to verify the trap exception - * as a atomic_switch trap. If a trap exception is verified as a atomic_switch - * trap we enter here with the following registers loaded. - * - * Input registers are: - * r5 = Vector to take - * r13 = Current thread context data - * r26 = Base address of BlueBox exception data area in kernel address space - * r4 = Current context savearea (do not modify) - * - ******************************************************************************/ - -.L_CallPseudoKernel: - - mfsprg r2,1 ; Get the current activation - lwz r2,ACT_PER_PROC(r2) ; Get the per_proc block - rlwinm r6,r26,0,0,19 ; Start of page is bttd - lwz r7,ACT_MACT_SPF(r13) ; Get special flags - lwz r1,BTTD_INTERRUPT_VECTOR(r6) ; Get interrupt vector - rlwinm r7,r7,0,bbNoMachSCbit+1,bbNoMachSCbit-1 - ; Reactivate Mach SCs - lwz r8,BTTD_INTCONTROLWORD(r6) ; Get Interrupt Control Word - cmpwi r1,0 ; Is this a preemptive thread ? 
- stw r7,ACT_MACT_SPF(r13) ; Update special flags - stw r7,spcFlags(r2) ; Update per_proc version - beq .L_CallFromPreemptiveThread ; No int vector means preemptive thread - - rlwinm r1,r8,0,INTSTATEMASK_B,INTSTATEMASK_E - ; Extract current Interrupt state - rlwinm r8,r8,0,INTSTATEMASK_E+1,INTSTATEMASK_B-1 - ; Clear current interrupt state - xoris r2,r1,SYSCONTEXTSTATE ; Setup for System Context check - lwz r1,savecr(r4) ; Load current CR bits - cmpwi r2,0 ; Check if state is System Context? - oris r8,r8,PSEUDOKERNELSTATE ; Update state for entering the PK - bne .L_CallFromAlternateContext ; No, then do not save CR2 bits - - rlwimi r8,r1,32-INTCR2TOBACKUPSHIFT,INTBACKUPCR2MASK_B,INTBACKUPCR2MASK_E - ; Insert live CR2 in ICW BackupCR2 -.L_CallFromAlternateContext: - - stw r8,BTTD_INTCONTROLWORD(r6) ; Update ICW - -.L_CallFromPreemptiveThread: - - lwz r1,savesrr0+4(r4) ; Get current PC - lwz r2,saver1+4(r4) ; Get current R1 - lwz r3,savesrr1+4(r4) ; Get current MSR - stw r1,BEDA_SRR0(r26) ; Save current PC - rlwinm r3,r3,0,MSR_BE_BIT+1,MSR_SE_BIT-1 - ; Clear SE|BE bits in MSR - stw r2,BEDA_SPRG1(r26) ; Save current R1 - stw r3,savesrr1+4(r4) ; Load new MSR - - lwz r1,BEDA_SPRG0(r26) ; Get replacement R1 - stw r5,savesrr0+4(r4) ; Save vector as PC - stw r3,BEDA_SRR1(r26) ; Update saved MSR - stw r1,saver1+4(r4) ; Load up new R1 - - b EXT(fastexit) ; Go back and take the fast path exit... - -/****************************************************************************** - * void ExitPseudoKernel ( thread_act_t * act, BEDA_t * beda, savearea * sv ) - * - * This op provides a means of exiting from the BlueBox PseudoKernel to a - * user context. This op attempts to simulate an RFI for the returning - * Traps (atomic_switch_trap) and SysCalls (atomic_switch_syscall). Only the - * Blue Thread handling interrupts is allowed to atomically change - * interruption state and handle pending interrupts. 
- * - * If an interrupt is pending and we are returning to the alternate context, - * the exit is aborted and we return to an pending interrupt handler in the - * Blue Box pseudokernel. - * - * It also allows the MSR's FE0, FE1, BE and SE bits to updated for the user - * and completes the PPC register loading. - * - * Input registers are: - * r4 = Current context savearea (do not modify) - * r13 = Pointer to the current active thread's data - * r26 = Base address of BlueBox Data in kernel address space - * - ******************************************************************************/ - -.L_ExitPseudoKernel: - - rlwinm r6,r26,0,0,19 ; Start of page is bttd - lwz r7,ACT_MACT_SPF(r13) ; Get special flags - lwz r2,BTTD_INTERRUPT_VECTOR(r6) ; Get the interrupt vector - lwz r1,BEDA_SPRG1(r26) ; Get saved CTR - ori r7,r7,(0x8000 >> (bbNoMachSCbit - 16)) ; Disable Mach SCs for Blue Box - - cmpwi r2,0 ; Is this a preemptive thread - stw r1,savectr+4(r4) ; Update CTR - beq .L_ExitFromPreemptiveThread - - lwz r8,BTTD_INTCONTROLWORD(r6) ; Get ICW - lwz r1,BTTD_NEWEXITSTATE(r6) ; New interrupt state - lwz r2,BTTD_TESTINTMASK(r6) ; Get pending interrupt mask - lis r3,SYSCONTEXTSTATE ; Setup for check in system context - rlwimi r8,r1,0,INTSTATEMASK_B,INTSTATEMASK_E - ; Insert new state - cmplw cr1,r1,r3 ; System context ? - and. r2,r8,r2 ; Any pending interrupt? 
- lwz r1,savecr(r4) ; Get current CR - - beq cr1,.L_ExitToSystemContext ; We are in system context - beq .L_ExitUpdateRuptControlWord ; We do not have a pending interrupt - - lwz r2,saver1+4(r4) ; Get current R1 - lwz r1,BEDA_SPRG0(r26) ; Get replacement R1 - stw r2,BEDA_SPRG1(r26) ; Save current R1 - stw r1,saver1+4(r4) ; Load up new R1 - lwz r3,bbPending(r13) ; Get pending interrupt PC - b .L_ExitAbortExit ; Abort and Exit - -.L_ExitToSystemContext: - rlwimi r1,r8,INTCR2TOBACKUPSHIFT,INTCR2MASK_B,INTCR2MASK_E - ; Insert live CR2 into backup CR2 -.L_ExitUpdateRuptControlWord: - stw r8,BTTD_INTCONTROLWORD(r6) ; Update ICW - stw r1,savecr(r4) ; Update CR - -.L_ExitFromPreemptiveThread: - mfsprg r3,1 ; Get the current activation - lwz r3,ACT_PER_PROC(r3) ; Get the per_proc block - lwz r2,savesrr1+4(r4) ; Get current MSR - lwz r1,BEDA_SRR1(r26) ; Get new MSR - stw r7,ACT_MACT_SPF(r13) ; Update special flags - stw r7,spcFlags(r3) ; Update per_proc version - rlwimi r2,r1,0,MSR_FE0_BIT,MSR_FE1_BIT - ; Insert FE0,FE1,SE,BE bits - lwz r3,BEDA_SRR0(r26) ; Get new PC - stw r2,savesrr1+4(r4) ; Update MSR - -.L_ExitAbortExit: - stw r3,savesrr0+4(r4) ; Update PC - - b EXT(fastexit) ; Go back and take the fast path exit... - diff --git a/osfmk/ppc/bat_init.c b/osfmk/ppc/bat_init.c deleted file mode 100644 index 7434a4f02..000000000 --- a/osfmk/ppc/bat_init.c +++ /dev/null @@ -1,301 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#include -#include -#include -#include - -// The sophisticated BAT manager - -unsigned int mappedSegments = 0; -unsigned int availableBATs = 0xE; // BAT0 used, 1-3 available - -vm_offset_t -PEResidentAddress( vm_offset_t address, vm_size_t length ) -{ - if( mappedSegments & (1 << (15 & (address >> 28)))) - return( address); - else - return( 0); -} - -vm_offset_t -PEMapSegment( vm_offset_t address, vm_size_t length ) -{ - vm_offset_t retAddress; - bat_t bat; - int batNum; - - retAddress = PEResidentAddress( address, length ); - if( retAddress) - return( retAddress); - - if( length < (256 * 1024)) - return( 0); - if( availableBATs == 0) - return( 0); - - for( batNum = 0; - (0 == (availableBATs & (1 << batNum))); - batNum++); - - bat.upper.word = address & 0xf0000000; - bat.lower.word = bat.upper.word; - - bat.upper.bits.bl = 0x7ff; /* size = 256M */ - bat.upper.bits.vs = 1; - bat.upper.bits.vp = 0; /* user disabled */ - - bat.lower.bits.wimg = PTE_WIMG_IO; - bat.lower.bits.pp = 2; /* read/write access 
*/ - - // Update the shadow bats. - shadow_BAT.DBATs[batNum].upper = bat.upper.word; - shadow_BAT.DBATs[batNum].lower = bat.lower.word; - - sync();isync(); - switch( batNum) { // !%$@!! mtdbat needs literal - case 0: - mtdbatu( 0, BAT_INVALID); /* invalidate old mapping */ - mtdbatl( 0, bat.lower.word); - mtdbatu( 0, bat.upper.word); - break; - case 1: - mtdbatu( 1, BAT_INVALID); - mtdbatl( 1, bat.lower.word); - mtdbatu( 1, bat.upper.word); - break; - case 2: - mtdbatu( 2, BAT_INVALID); - mtdbatl( 2, bat.lower.word); - mtdbatu( 2, bat.upper.word); - break; - case 3: - mtdbatu( 3, BAT_INVALID); - mtdbatl( 3, bat.lower.word); - mtdbatu( 3, bat.upper.word); - break; - } - sync();isync(); - - availableBATs &= ~(1 << batNum); - mappedSegments |= (1 << (15 & (address >> 28))); - - return( address); -} - -void initialize_bats(boot_args *args) -{ - int i; - - /* Give ourselves the virtual map that we would like */ - bat_t bat; - - /* Make sure that the BATs map what we expect. Note - * that we assume BAT0 maps kernel text & data. - * - * Except, oops, none of the BATs have ever been set. - * Developer worked only by fluke. - */ - - bat.upper.word = 0; - bat.upper.bits.bepi = 0x0; /* start at logical addr 0M */ - /* - * We should be smarter here about picking an - * amount to map - */ - bat.upper.bits.bl = 0x7ff; /* size = 256M */ - bat.upper.bits.vs = 1; - bat.upper.bits.vp = 0; - - bat.lower.word = 0; - bat.lower.bits.brpn = 0x0; /* start at physical addr 0 */ - bat.lower.bits.wimg = PTE_WIMG_DEFAULT; - bat.lower.bits.pp = 2; /* read/write access */ - - /* Mustn't cause any data traffic here, - * we're modifying our data BAT register! 
- */ - - sync(); - mtdbatu(0, BAT_INVALID); /* invalidate old mapping */ - isync(); - mtdbatl(0, bat.lower.word); - isync(); - mtdbatu(0, bat.upper.word); /* update with new mapping */ - isync(); - mtibatl(0, bat.lower.word); - isync(); - mtibatu(0, bat.upper.word); /* update with new mapping */ - isync(); - - sync();isync(); - mtdbatu(1,BAT_INVALID); mtdbatl(1,BAT_INVALID); - mtibatu(1,BAT_INVALID); mtibatl(1,BAT_INVALID); - mtdbatu(2,BAT_INVALID); mtdbatl(2,BAT_INVALID); - mtibatu(2,BAT_INVALID); mtibatl(2,BAT_INVALID); - mtdbatu(3,BAT_INVALID); mtdbatl(3,BAT_INVALID); - mtibatu(3,BAT_INVALID); mtibatl(3,BAT_INVALID); - sync();isync(); - - PEMapSegment( 0xf0000000, 0x10000000); - if( args->Video.v_baseAddr) - PEMapSegment( args->Video.v_baseAddr, 0x10000000); - - /* Set up segment registers as VM through space 0 */ - isync(); - for (i=0; i<=15; i++) { - mtsrin(KERNEL_SEG_REG0_VALUE | i, i * 0x10000000); - } - isync(); -} - -/* - * Adjust the size of the region mapped by a BAT - * to to be just large enough to include the specified - * offset, and return the offset of the new end of the region. - * Note that both 'offsets' are really *lengths*, i.e. the - * offset of the end of the mapped region from the beginning. - * Either the instruction or data BATs (or both) can be specified. - * If the new length is greater than the size mappable by a BAT, - * then that value is just returned and no changes are made. 
- */ -vm_offset_t -adjust_bat_limit( - vm_offset_t new_minimum, - int batn, - boolean_t ibat, - boolean_t dbat -) -{ - vm_offset_t new_limit; - - if (new_minimum <= 256*1024*1024) { - unsigned int bl = 0; - - new_limit = 128*1024; - while (new_limit < new_minimum) { - new_limit *= 2; - bl = (bl << 1) | 1; - } - - { - batu_t batu; - - if (dbat) switch (batn) { - - case 0: - mfdbatu(batu, 0 ); - batu.bits.bl = bl; - - sync(); isync(); - mtdbatu( 0, batu); - sync(); isync(); - - break; - - case 1: - mfdbatu(batu, 1 ); - batu.bits.bl = bl; - - sync(); isync(); - mtdbatu( 1, batu); - sync(); isync(); - - break; - - case 2: - mfdbatu(batu, 2 ); - batu.bits.bl = bl; - - sync(); isync(); - mtdbatu( 2, batu); - sync(); isync(); - - break; - - case 3: - mfdbatu(batu, 3 ); - batu.bits.bl = bl; - - sync(); isync(); - mtdbatu( 3, batu); - sync(); isync(); - - break; - } - - if (ibat) switch (batn) { - - case 0: - mfibatu(batu, 0 ); - batu.bits.bl = bl; - - sync(); isync(); - mtibatu( 0, batu); - sync(); isync(); - - break; - - case 1: - mfibatu(batu, 1 ); - batu.bits.bl = bl; - - sync(); isync(); - mtibatu( 1, batu); - sync(); isync(); - - break; - - case 2: - mfibatu(batu, 2 ); - batu.bits.bl = bl; - - sync(); isync(); - mtibatu( 2, batu); - sync(); isync(); - - break; - - case 3: - mfibatu(batu, 3 ); - batu.bits.bl = bl; - - sync(); isync(); - mtibatu( 3, batu); - sync(); isync(); - - break; - } - } - } - else - new_limit = new_minimum; - - return (new_limit); -} diff --git a/osfmk/ppc/bcopy.s b/osfmk/ppc/bcopy.s deleted file mode 100644 index bc05940f2..000000000 --- a/osfmk/ppc/bcopy.s +++ /dev/null @@ -1,981 +0,0 @@ -/* - * Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). 
You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -; -; Copy bytes of data around. Handles overlapped data. -; -; -#include -#include -#include - -; These routines use CR5 for certain flags: -; Use CR5_lt to indicate non-cached (in bcopy and memcpy) -#define noncache 20 - - -; The bcopy_phys variants use a stack frame so they can call bcopy as a subroutine. -#define BCOPY_SF_SIZE 32 // total size -#define BCOPY_SF_MSR 16 // we save caller's MSR here (possibly minus VEC and FP) - - -#define kShort 32 // short operands are special cased - - -; void bcopy_physvir_32(from, to, nbytes) -; -; Attempt to copy physically addressed memory with translation on if conditions are met. -; Otherwise do a normal bcopy_phys. This routine is used because some 32-bit processors -; are very slow doing real-mode (translation off) copies, so we set up temporary BATs -; for the passed phys addrs and do the copy with translation on. -; -; Rules are: - neither source nor destination can cross a page. 
-; - Interrupts must be disabled when this routine is called. -; - Translation must be on when called. -; -; To do the copy, we build a 128 DBAT for both the source and sink. If both are the same, only one -; is loaded. We do not touch the IBATs, so there is no issue if either physical page -; address is the same as the virtual address of the instructions we are executing. -; -; At the end, we invalidate the used DBATs. -; -; Note that the address parameters are long longs. We will transform these to 64-bit -; values. Note that on 32-bit architectures that this will ignore the high half of the -; passed in value. This should be ok since we can not have any bigger than 32 bit addresses -; there anyhow. -; -; Note also that this routine is used only on 32-bit machines. If you're contemplating use -; on a 64-bit processor, use the physical memory window instead; please refer to copypv() -; for an example of how this is done. - - .align 5 - .globl EXT(bcopy_physvir_32) - -LEXT(bcopy_physvir_32) - mflr r0 ; get return address - rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg - mfsprg r8,2 ; get processor feature flags - stw r0,8(r1) ; save return address - rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits - stwu r1,-BCOPY_SF_SIZE(r1) ; push on a stack frame so we can call bcopy - mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test - subi r0,r7,1 ; get length - 1 - rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg - add r11,r3,r0 ; Point to last byte of sink - mr r5,r7 ; Get the length into the right register - rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits - -; This test for page overflow may not work if the length is negative. Negative lengths are invalid input -; to bcopy_physvir() on 32-bit machines, and will result in a panic. 
- - add r12,r4,r0 ; Point to last byte of source - xor r7,r11,r3 ; See if we went to next page - xor r8,r12,r4 ; See if we went to next page - or r0,r7,r8 ; Combine wrap - -// li r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2) ; Set default attributes - li r9,((2<<3)|2) ; Set default attributes - rlwinm. r0,r0,0,0,19 ; Did we overflow a page? - li r7,2 ; Set validity flags - li r8,2 ; Set validity flags - bne- bcopy_phys1 ; Overflowed page, do normal physical copy... - - rlwimi r11,r9,0,15,31 ; Set sink lower DBAT value - rlwimi r12,r9,0,15,31 ; Set source lower DBAT value - rlwimi r7,r11,0,0,14 ; Set sink upper DBAT value - rlwimi r8,r12,0,0,14 ; Set source upper DBAT value - cmplw cr1,r11,r12 ; See if sink and source are same block - - sync - - mtdbatl 0,r11 ; Set sink lower DBAT - mtdbatu 0,r7 ; Set sink upper DBAT - - beq- cr1,bcpvsame ; Source and sink are in same block - - mtdbatl 1,r12 ; Set source lower DBAT - mtdbatu 1,r8 ; Set source upper DBAT - -bcpvsame: - sync ; wait for the BATs to stabilize - isync - - bl EXT(bcopy) ; BATs set up, args in r3-r5, so do the copy with DR on - - li r0,0 ; Get set to invalidate upper half of BATs - sync ; Make sure all is well - mtdbatu 0,r0 ; Clear sink upper DBAT - mtdbatu 1,r0 ; Clear source upper DBAT - sync - isync - - lwz r0,BCOPY_SF_SIZE+8(r1) ; get return address - addi r1,r1,BCOPY_SF_SIZE ; pop off stack frame - mtlr r0 - blr - - -; void bcopy_phys(from, to, nbytes) -; -; Turns off data translation before the copy. This one will not work in user state. -; This routine is used on 32 and 64-bit machines. -; -; Note that the address parameters are long longs. We will transform these to 64-bit -; values. Note that on 32-bit architectures that this will ignore the high half of the -; passed in value. This should be ok since we can not have any bigger than 32 bit addresses -; there anyhow. -; -; Also note that you probably will not be happy if either the sink or source spans across the -; boundary between RAM and I/O space. 
Good chance of hanging the machine and this code -; will not check, so be careful. -; -; NOTE: when called, translation must be on, and we must be in 32-bit mode. -; Interrupts may or may not be disabled. - - .align 5 - .globl EXT(bcopy_phys) - -LEXT(bcopy_phys) - mflr r0 ; get return address - rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg - stw r0,8(r1) ; save - mfsprg r8,2 ; get processor feature flags - stwu r1,-BCOPY_SF_SIZE(r1) ; push on a stack frame so we can call bcopy - rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits - rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg - mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test - rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits - mr r5,r7 ; Get the length into the right register - -bcopy_phys1: ; enter from bcopy_physvir with pf64Bit in cr6 and parms in r3-r5 - mfmsr r9 ; Get the MSR - lis r6,hi16(MASK(MSR_VEC)) ; Get vector enable - ori r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR)) ; Add in FP and DR - andc r9,r9,r6 ; unconditionally turn DR, VEC, and FP off - bt++ pf64Bitb,bcopy_phys64 ; skip if 64-bit (only they take hint) - -; 32-bit CPUs - - mtmsr r9 ; turn DR, FP, and VEC off - isync ; Wait for it - - bl EXT(bcopy) ; do the copy with translation off and caching on - - mfmsr r9 ; Get the MSR - ori r9,r9,lo16(MASK(MSR_DR)) ; turn translation back on (but leave VEC and FP off) - mtmsr r9 ; restore msr - isync ; wait for it to happen - lwz r0,BCOPY_SF_SIZE+8(r1) ; get return address once translation is back on - mtlr r0 - addi r1,r1,BCOPY_SF_SIZE ; pop off stack frame - blr - - -; 64-bit: turn DR off and SF on. - -bcopy_phys64: ; r9 = MSR with DP, VEC, and FP off - ori r8,r9,lo16(MASK(MSR_DR)) ; make a copy with DR back on... 
this is what we return to caller - srdi r2,r3,31 ; Get a 1 if source is in I/O memory - li r0,1 ; Note - we use this in a couple places below - srdi r10,r4,31 ; Get a 1 if sink is in I/O memory - std r8,BCOPY_SF_MSR(r1) ; save caller's MSR so we remember whether EE was on - rldimi r9,r0,63,MSR_SF_BIT ; set SF on in MSR we will copy with - cmpldi cr0,r2,1 ; Is source in I/O memory? - cmpldi cr7,r10,1 ; Is sink in I/O memory? - mtmsrd r9 ; turn 64-bit addressing on, data translation off - isync ; wait for it to happen - cror cr7_eq,cr0_eq,cr7_eq ; See if either source or sink is in I/O area - beq-- cr7,io_space_real_mode_copy ; an operand is in I/O space - - bl EXT(bcopy) ; do copy with DR off and SF on, cache enabled - -bcopy_phys64x: - mfmsr r9 ; Get the MSR we used to copy - rldicl r9,r9,0,MSR_SF_BIT+1 ; clear SF - ori r9,r9,lo16(MASK(MSR_DR)) ; turn translation back on - mtmsrd r9 ; turn 64-bit mode off, translation back on - isync ; wait for it to happen - lwz r0,BCOPY_SF_SIZE+8(r1) ; get return address once translation is back on - ld r8,BCOPY_SF_MSR(r1) ; get caller's MSR once translation is back on - mtlr r0 - mtmsrd r8,1 ; turn EE back on if necessary - addi r1,r1,BCOPY_SF_SIZE ; pop off stack frame - blr - -; We need to copy with DR off, but one of the operands is in I/O space. To avoid wedging U3, -; which cannot handle a cache burst in I/O space, we must turn caching off for the real memory access. -; This can only be done by setting bits in HID4. We cannot lose control and execute random code in -; this state, so we have to disable interrupts as well. This is an unpleasant hack. 
- -io_space_real_mode_copy: ; r0=1, r9=MSR we want to copy with - sldi r11,r0,31-MSR_EE_BIT ; Get a mask for the EE bit - sldi r0,r0,32+8 ; Get the right bit to turn off caching - andc r9,r9,r11 ; Turn off EE bit - mfspr r2,hid4 ; Get HID4 - mtmsrd r9,1 ; Force off EE - or r2,r2,r0 ; Set bit to make real accesses cache-inhibited - sync ; Sync up - mtspr hid4,r2 ; Make real accesses cache-inhibited - isync ; Toss prefetches - - lis r12,0xE000 ; Get the unlikeliest ESID possible - srdi r12,r12,1 ; Make 0x7FFFFFFFF0000000 - slbie r12 ; Make sure the ERAT is cleared - - sync - isync - - bl EXT(bcopy_nc) ; copy with SF on and EE, DR, VEC, and FP off, cache inhibited - - li r0,1 ; Get a 1 - sldi r0,r0,32+8 ; Get the right bit to turn off caching - mfspr r2,hid4 ; Get HID4 - andc r2,r2,r0 ; Clear bit to make real accesses cache-inhibited - sync ; Sync up - mtspr hid4,r2 ; Make real accesses not cache-inhibited - isync ; Toss prefetches - - lis r12,0xE000 ; Get the unlikeliest ESID possible - srdi r12,r12,1 ; Make 0x7FFFFFFFF0000000 - slbie r12 ; Make sure the ERAT is cleared - b bcopy_phys64x - - -; -; shortcopy -; -; Special case short operands (<32 bytes), which are very common. Note that the check for -; reverse vs normal moves isn't quite correct in 64-bit mode; in rare cases we will move in -; reverse when it wasn't necessary to do so. This is OK, since performance of the two cases -; is similar. We do get the direction right when it counts (ie, when the operands overlap.) -; Also note that we use the G3/G4 "backend" code, even on G5. This is OK too, since G5 has -; plenty of load/store dispatch bandwidth in this case, the extra ops are hidden by latency, -; and using word instead of doubleword moves reduces the possibility of unaligned accesses, -; which cost about 20 cycles if they cross a 32-byte boundary on G5. Finally, because we -; might do unaligned accesses this code cannot be called from bcopy_nc(). 
-; r4 = destination -; r5 = length (<32) -; r6 = source -; r12 = (dest - source) - - .align 5 -shortcopy: - cmplw r12,r5 ; must move reverse if (dest-source)0) -; r6 = source -; r12 = (dest - source) -; cr5 = noncache flag - -copyit32: ; WARNING! can drop down to this label - cmplw cr1,r12,r5 ; must move reverse if (dest-source)0) -; r6 = source -; r8 = inverse of largest mask smaller than operand length -; r9 = neg(dest), used to compute alignment -; cr5 = noncache flag - -forward32bit: ; enter from 64-bit CPUs with word aligned uncached operands - rlwinm r7,r9,0,0x1F ; get bytes to 32-byte-align destination - andc. r0,r7,r8 ; limit to the maximum front end move - mtcrf 0x01,r0 ; move length to cr6 and cr7 one cr at a time... - beq alline ; Already on a line... - - mtcrf 0x02,r0 ; ...since moving more than one is slower on G4 and G5 - sub r5,r5,r0 ; Set the length left to move - - bf 31,alhalf ; No single byte to do... - lbz r7,0(r6) ; Get the byte - addi r6,r6,1 ; Point to the next - stb r7,0(r4) ; Save the single - addi r4,r4,1 ; Bump sink - -; Sink is halfword aligned here - -alhalf: bf 30,alword ; No halfword to do... - lhz r7,0(r6) ; Get the halfword - addi r6,r6,2 ; Point to the next - sth r7,0(r4) ; Save the halfword - addi r4,r4,2 ; Bump sink - -; Sink is word aligned here - -alword: bf 29,aldouble ; No word to do... - lwz r7,0(r6) ; Get the word - addi r6,r6,4 ; Point to the next - stw r7,0(r4) ; Save the word - addi r4,r4,4 ; Bump sink - -; Sink is double aligned here - -aldouble: bf 28,alquad ; No double to do... - lwz r7,0(r6) ; Get the first word - lwz r8,4(r6) ; Get the second word - addi r6,r6,8 ; Point to the next - stw r7,0(r4) ; Save the first word - stw r8,4(r4) ; Save the second word - addi r4,r4,8 ; Bump sink - -; Sink is quadword aligned here - -alquad: bf 27,alline ; No quad to do... 
- lwz r7,0(r6) ; Get the first word - lwz r8,4(r6) ; Get the second word - lwz r9,8(r6) ; Get the third word - stw r7,0(r4) ; Save the first word - lwz r11,12(r6) ; Get the fourth word - addi r6,r6,16 ; Point to the next - stw r8,4(r4) ; Save the second word - stw r9,8(r4) ; Save the third word - stw r11,12(r4) ; Save the fourth word - addi r4,r4,16 ; Bump sink - -; Sink is line aligned here - -alline: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move - mtcrf 0x02,r5 ; move length to cr6 and cr7 one cr at a time... - mtcrf 0x01,r5 ; ...since moving more than one is slower on G4 and G5 - beq- backend ; No full lines to move - - mtctr r0 ; set up loop count - li r0,96 ; Stride for touch ahead - b nxtline - - .align 4 -nxtline: - lwz r2,0(r6) ; Get the first word - lwz r5,4(r6) ; Get the second word - lwz r7,8(r6) ; Get the third word - lwz r8,12(r6) ; Get the fourth word - lwz r9,16(r6) ; Get the fifth word - lwz r10,20(r6) ; Get the sixth word - lwz r11,24(r6) ; Get the seventh word - lwz r12,28(r6) ; Get the eighth word - bt- noncache,skipz ; Skip if we are not cached... - dcbz 0,r4 ; Blow away the whole line because we are replacing it - dcbt r6,r0 ; Touch ahead a bit -skipz: - addi r6,r6,32 ; Point to the next - stw r2,0(r4) ; Save the first word - stw r5,4(r4) ; Save the second word - stw r7,8(r4) ; Save the third word - stw r8,12(r4) ; Save the fourth word - stw r9,16(r4) ; Save the fifth word - stw r10,20(r4) ; Save the sixth word - stw r11,24(r4) ; Save the seventh word - stw r12,28(r4) ; Save the eighth word - addi r4,r4,32 ; Bump sink - bdnz+ nxtline ; Do the next line, if any... - - -; Move backend quadword - -backend: ; Join here from "shortcopy" for forward moves <32 bytes - bf 27,noquad ; No quad to do... 
- lwz r7,0(r6) ; Get the first word - lwz r8,4(r6) ; Get the second word - lwz r9,8(r6) ; Get the third word - lwz r11,12(r6) ; Get the fourth word - stw r7,0(r4) ; Save the first word - addi r6,r6,16 ; Point to the next - stw r8,4(r4) ; Save the second word - stw r9,8(r4) ; Save the third word - stw r11,12(r4) ; Save the fourth word - addi r4,r4,16 ; Bump sink - -; Move backend double - -noquad: bf 28,nodouble ; No double to do... - lwz r7,0(r6) ; Get the first word - lwz r8,4(r6) ; Get the second word - addi r6,r6,8 ; Point to the next - stw r7,0(r4) ; Save the first word - stw r8,4(r4) ; Save the second word - addi r4,r4,8 ; Bump sink - -; Move backend word - -nodouble: bf 29,noword ; No word to do... - lwz r7,0(r6) ; Get the word - addi r6,r6,4 ; Point to the next - stw r7,0(r4) ; Save the word - addi r4,r4,4 ; Bump sink - -; Move backend halfword - -noword: bf 30,nohalf ; No halfword to do... - lhz r7,0(r6) ; Get the halfword - addi r6,r6,2 ; Point to the next - sth r7,0(r4) ; Save the halfword - addi r4,r4,2 ; Bump sink - -; Move backend byte - -nohalf: bflr 31 ; Leave cuz we are all done... - lbz r7,0(r6) ; Get the byte - stb r7,0(r4) ; Save the single - blr - - -; Reverse moves on 32-bit machines, also reverse word aligned uncached moves on 64-bit machines. -; NOTE: we never do an unaligned access if the source and destination are "relatively" -; word aligned. We depend on this in the uncached case on 64-bit processors. -; These are slower because we don't bother with dcbz. Fortunately, reverse moves are uncommon. 
-; r4 = destination -; r5 = length (>0) -; r6 = source -; r8 = inverse of largest mask smaller than operand length -; cr5 = noncache flag (but we don't dcbz anyway) - -reverse32bit: ; here from 64-bit code with word aligned uncached operands - add r4,r5,r4 ; Point past the last sink byte - add r6,r5,r6 ; Point past the last source byte - rlwinm r7,r4,0,0x1F ; Calculate the length to align dest on cache boundary - li r12,-1 ; Make sure we touch in the actual line - andc. r0,r7,r8 ; Apply movement limit - dcbt r12,r6 ; Touch in the last line of source - mtcrf 0x01,r0 ; move length to cr6 and cr7 one cr at a time... - dcbtst r12,r4 ; Touch in the last line of the sink - mtcrf 0x02,r0 ; ...since moving more than one is slower on G4 and G5 - beq- balline ; Aready on cache line boundary (or too short to bother) - - sub r5,r5,r0 ; Precaculate move length left after alignment - - bf 31,balhalf ; No single byte to do... - lbz r7,-1(r6) ; Get the byte - subi r6,r6,1 ; Point to the next - stb r7,-1(r4) ; Save the single - subi r4,r4,1 ; Bump sink - -; Sink is halfword aligned here - -balhalf: bf 30,balword ; No halfword to do... - lhz r7,-2(r6) ; Get the halfword - subi r6,r6,2 ; Point to the next - sth r7,-2(r4) ; Save the halfword - subi r4,r4,2 ; Bump sink - -; Sink is word aligned here - -balword: bf 29,baldouble ; No word to do... - lwz r7,-4(r6) ; Get the word - subi r6,r6,4 ; Point to the next - stw r7,-4(r4) ; Save the word - subi r4,r4,4 ; Bump sink - -; Sink is double aligned here - -baldouble: bf 28,balquad ; No double to do... - lwz r7,-8(r6) ; Get the first word - lwz r8,-4(r6) ; Get the second word - subi r6,r6,8 ; Point to the next - stw r7,-8(r4) ; Save the first word - stw r8,-4(r4) ; Save the second word - subi r4,r4,8 ; Bump sink - -; Sink is quadword aligned here - -balquad: bf 27,balline ; No quad to do... 
- lwz r7,-16(r6) ; Get the first word - lwz r8,-12(r6) ; Get the second word - lwz r9,-8(r6) ; Get the third word - lwz r11,-4(r6) ; Get the fourth word - stw r7,-16(r4) ; Save the first word - subi r6,r6,16 ; Point to the next - stw r8,-12(r4) ; Save the second word - stw r9,-8(r4) ; Save the third word - stw r11,-4(r4) ; Save the fourth word - subi r4,r4,16 ; Bump sink - -; Sink is line aligned here - -balline: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move - mtcrf 0x02,r5 ; move length to cr6 and cr7 one cr at a time... - mtcrf 0x01,r5 ; ...since moving more than one is slower on G4 and G5 - beq- bbackend ; No full lines to move - mtctr r0 ; set up loop count - b bnxtline - - .align 4 -bnxtline: - lwz r7,-32(r6) ; Get the first word - lwz r5,-28(r6) ; Get the second word - lwz r2,-24(r6) ; Get the third word - lwz r12,-20(r6) ; Get the third word - lwz r11,-16(r6) ; Get the fifth word - lwz r10,-12(r6) ; Get the sixth word - lwz r9,-8(r6) ; Get the seventh word - lwz r8,-4(r6) ; Get the eighth word - subi r6,r6,32 ; Point to the next - - stw r7,-32(r4) ; Get the first word - stw r5,-28(r4) ; Get the second word - stw r2,-24(r4) ; Get the third word - stw r12,-20(r4) ; Get the third word - stw r11,-16(r4) ; Get the fifth word - stw r10,-12(r4) ; Get the sixth word - stw r9,-8(r4) ; Get the seventh word - stw r8,-4(r4) ; Get the eighth word - subi r4,r4,32 ; Bump sink - - bdnz+ bnxtline ; Do the next line, if any... - -; -; Note: We touched these lines in at the beginning -; - -; Move backend quadword - -bbackend: ; Join here from "shortcopy" for reverse moves of <32 bytes - bf 27,bnoquad ; No quad to do... 
- lwz r7,-16(r6) ; Get the first word - lwz r8,-12(r6) ; Get the second word - lwz r9,-8(r6) ; Get the third word - lwz r11,-4(r6) ; Get the fourth word - stw r7,-16(r4) ; Save the first word - subi r6,r6,16 ; Point to the next - stw r8,-12(r4) ; Save the second word - stw r9,-8(r4) ; Save the third word - stw r11,-4(r4) ; Save the fourth word - subi r4,r4,16 ; Bump sink - -; Move backend double - -bnoquad: bf 28,bnodouble ; No double to do... - lwz r7,-8(r6) ; Get the first word - lwz r8,-4(r6) ; Get the second word - subi r6,r6,8 ; Point to the next - stw r7,-8(r4) ; Save the first word - stw r8,-4(r4) ; Save the second word - subi r4,r4,8 ; Bump sink - -; Move backend word - -bnodouble: bf 29,bnoword ; No word to do... - lwz r7,-4(r6) ; Get the word - subi r6,r6,4 ; Point to the next - stw r7,-4(r4) ; Save the word - subi r4,r4,4 ; Bump sink - -; Move backend halfword - -bnoword: bf 30,bnohalf ; No halfword to do... - lhz r7,-2(r6) ; Get the halfword - subi r6,r6,2 ; Point to the next - sth r7,-2(r4) ; Save the halfword - subi r4,r4,2 ; Bump sink - -; Move backend byte - -bnohalf: bflr 31 ; Leave cuz we are all done... - lbz r7,-1(r6) ; Get the byte - stb r7,-1(r4) ; Save the single - blr - - -// Here on 64-bit processors, which have a 128-byte cache line. This can be -// called either in 32 or 64-bit mode, which makes the test for reverse moves -// a little tricky. We've already filtered out the (sou==dest) and (len==0) -// special cases. -// -// When entered: -// r4 = destination (32 or 64-bit ptr) -// r5 = length (always 32 bits) -// r6 = source (32 or 64-bit ptr) -// r12 = (dest - source), reverse move required if (dest-source)=length, in mode-independent way - li r0,0 // get a 0 - lis r10,hi16(0x80000000)// get 0x80000000 - addze. 
r0,r0 // set cr0 on carry bit (beq if reverse move required) - neg r9,r4 // start to get alignment for destination - sraw r8,r10,r11 // get mask based on operand length, to limit alignment - bt-- noncache,c64uncached// skip if uncached - beq-- c64rdouble // handle cached reverse moves - - -// Forward, cached or doubleword aligned uncached. This is the common case. -// NOTE: we never do an unaligned access if the source and destination are "relatively" -// doubleword aligned. We depend on this in the uncached case. -// r4 = destination -// r5 = length (>0) -// r6 = source -// r8 = inverse of largest mask smaller than operand length -// r9 = neg(dest), used to compute alignment -// cr5 = noncache flag - -c64double: - rlwinm r7,r9,0,0x7F // get #bytes to 128-byte align destination - andc r7,r7,r8 // limit by operand length - andi. r8,r7,7 // r8 <- #bytes to doubleword align - srwi r9,r7,3 // r9 <- #doublewords to 128-byte align - sub r5,r5,r7 // adjust length remaining - cmpwi cr1,r9,0 // any doublewords to move to cache align? - srwi r10,r5,7 // r10 <- 128-byte chunks to xfer after aligning dest - cmpwi cr7,r10,0 // set cr7 on chunk count - beq c64double2 // dest already doubleword aligned - mtctr r8 - b c64double1 - - .align 5 // align inner loops -c64double1: // copy bytes until dest is doubleword aligned - lbz r0,0(r6) - addi r6,r6,1 - stb r0,0(r4) - addi r4,r4,1 - bdnz c64double1 - -c64double2: // r9/cr1=doublewords, r10/cr7=128-byte chunks - beq cr1,c64double4 // no doublewords to xfer in order to cache align - mtctr r9 - b c64double3 - - .align 5 // align inner loops -c64double3: // copy doublewords until dest is 128-byte aligned - ld r7,0(r6) - addi r6,r6,8 - std r7,0(r4) - addi r4,r4,8 - bdnz c64double3 - -// Here to xfer 128-byte chunks, if any. Since we only have 8 GPRs for -// data (64 bytes), we load/store each twice per 128-byte chunk. 
- -c64double4: // r10/cr7=128-byte chunks - rlwinm r0,r5,29,28,31 // r0 <- count of leftover doublewords, after moving chunks - cmpwi cr1,r0,0 // set cr1 on leftover doublewords - beq cr7,c64double7 // no 128-byte chunks - - ; We must check for (source-dest)<128 in a mode-independent way. If within 128 bytes, - ; turn on "noncache" because we cannot use dcbz128 even if operands are cacheable. - - sub r8,r6,r4 // r8 <- (source - dest) - rldicr. r0,r8,0,63-7 // zero low 7 bits and check for 0, mode independent - cror noncache,cr0_eq,noncache // turn on "noncache" flag if (source-dest)<128 - mtctr r10 - b c64InnerLoop - - .align 5 // align inner loop -c64InnerLoop: // loop copying 128-byte cache lines to 128-aligned destination - ld r0,0(r6) // start pipe: load 1st half-line - ld r2,8(r6) - ld r7,16(r6) - ld r8,24(r6) - ld r9,32(r6) - ld r10,40(r6) - ld r11,48(r6) - ld r12,56(r6) - bt noncache,c64InnerLoop1 // skip if uncached or overlap - dcbz128 0,r4 // avoid prefetch of next cache line -c64InnerLoop1: - - std r0,0(r4) - std r2,8(r4) - std r7,16(r4) - std r8,24(r4) - std r9,32(r4) - std r10,40(r4) - std r11,48(r4) - std r12,56(r4) - - ld r0,64(r6) // load 2nd half of chunk - ld r2,72(r6) - ld r7,80(r6) - ld r8,88(r6) - ld r9,96(r6) - ld r10,104(r6) - ld r11,112(r6) - ld r12,120(r6) - addi r6,r6,128 - - std r0,64(r4) - std r2,72(r4) - std r7,80(r4) - std r8,88(r4) - std r9,96(r4) - std r10,104(r4) - std r11,112(r4) - std r12,120(r4) - addi r4,r4,128 // advance to next dest chunk - - bdnz c64InnerLoop // loop if more chunks - - -c64double7: // r5 <- leftover bytes, cr1 set on doubleword count - rlwinm r0,r5,29,28,31 // r0 <- count of leftover doublewords (0-15) - andi. 
r5,r5,7 // r5/cr0 <- count of leftover bytes (0-7) - beq cr1,c64byte // no leftover doublewords - mtctr r0 - b c64double8 - - .align 5 // align inner loop -c64double8: // loop copying leftover doublewords - ld r0,0(r6) - addi r6,r6,8 - std r0,0(r4) - addi r4,r4,8 - bdnz c64double8 - - -// Forward byte loop. - -c64byte: // r5/cr0 <- byte count (can be big if unaligned uncached) - beqlr // done if no leftover bytes - mtctr r5 - b c64byte1 - - .align 5 // align inner loop -c64byte1: - lbz r0,0(r6) - addi r6,r6,1 - stb r0,0(r4) - addi r4,r4,1 - bdnz c64byte1 - - blr - - -// Uncached copies. We must avoid unaligned accesses, since they always take alignment -// exceptions on uncached memory on 64-bit processors. This may mean we copy long operands -// a byte at a time, but that is still much faster than alignment exceptions. -// r4 = destination -// r5 = length (>0) -// r6 = source -// r8 = inverse of largest mask smaller than operand length -// r9 = neg(dest), used to compute alignment -// r12 = (dest-source), used to test relative alignment -// cr0 = beq if reverse move required -// cr5 = noncache flag - -c64uncached: - rlwinm r10,r12,0,29,31 // relatively doubleword aligned? - rlwinm r11,r12,0,30,31 // relatively word aligned? - cmpwi cr7,r10,0 // set cr7 beq if doubleword aligned - cmpwi cr1,r11,0 // set cr1 beq if word aligned - beq-- c64reverseUncached - - beq cr7,c64double // doubleword aligned - beq cr1,forward32bit // word aligned, use G3/G4 code - cmpwi r5,0 // set cr0 on byte count - b c64byte // unaligned operands - -c64reverseUncached: - beq cr7,c64rdouble // doubleword aligned so can use LD/STD - beq cr1,reverse32bit // word aligned, use G3/G4 code - add r6,r6,r5 // point to (end+1) of source and dest - add r4,r4,r5 - cmpwi r5,0 // set cr0 on length - b c64rbyte // copy a byte at a time - - - -// Reverse doubleword copies. This is used for all cached copies, and doubleword -// aligned uncached copies. 
-// r4 = destination -// r5 = length (>0) -// r6 = source -// r8 = inverse of largest mask of low-order 1s smaller than operand length -// cr5 = noncache flag - -c64rdouble: - add r6,r6,r5 // point to (end+1) of source and dest - add r4,r4,r5 - rlwinm r7,r4,0,29,31 // r7 <- #bytes to doubleword align dest - andc. r7,r7,r8 // limit by operand length - sub r5,r5,r7 // adjust length - srwi r8,r5,6 // r8 <- 64-byte chunks to xfer - cmpwi cr1,r8,0 // any chunks? - beq c64rd2 // source already doubleword aligned - mtctr r7 - -c64rd1: // copy bytes until source doublword aligned - lbzu r0,-1(r6) - stbu r0,-1(r4) - bdnz c64rd1 - -c64rd2: // r8/cr1 <- count of 64-byte chunks - rlwinm r0,r5,29,29,31 // r0 <- count of leftover doublewords - andi. r5,r5,7 // r5/cr0 <- count of leftover bytes - cmpwi cr7,r0,0 // leftover doublewords? - beq cr1,c64rd4 // no chunks to xfer - mtctr r8 - b c64rd3 - - .align 5 // align inner loop -c64rd3: // loop copying 64-byte chunks - ld r7,-8(r6) - ld r8,-16(r6) - ld r9,-24(r6) - ld r10,-32(r6) - ld r11,-40(r6) - ld r12,-48(r6) - std r7,-8(r4) - std r8,-16(r4) - ld r7,-56(r6) - ldu r8,-64(r6) - std r9,-24(r4) - std r10,-32(r4) - std r11,-40(r4) - std r12,-48(r4) - std r7,-56(r4) - stdu r8,-64(r4) - bdnz c64rd3 - -c64rd4: // r0/cr7 = leftover doublewords r5/cr0 = leftover bytes - beq cr7,c64rbyte // no leftover doublewords - mtctr r0 - -c64rd5: // loop copying leftover doublewords - ldu r0,-8(r6) - stdu r0,-8(r4) - bdnz c64rd5 - - -// Reverse byte loop. 
- -c64rbyte: // r5/cr0 <- byte count (can be big if unaligned uncached) - beqlr // done if no leftover bytes - mtctr r5 - -c64rbyte1: - lbzu r0,-1(r6) - stbu r0,-1(r4) - bdnz c64rbyte1 - - blr - diff --git a/osfmk/ppc/bcopytest.c b/osfmk/ppc/bcopytest.c deleted file mode 100644 index bcc86bfb4..000000000 --- a/osfmk/ppc/bcopytest.c +++ /dev/null @@ -1,621 +0,0 @@ -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - -#include /* (TEST/DEBUG) */ - -#define patper 253 - - -int main(void); -void clrarea(unsigned int *source, unsigned int *sink); -int tstcopy(void *src, void *snk, unsigned int lgn); -void clrarea2(unsigned int *source, unsigned int *sink); -int tstcopy2(void *src, void *snk, unsigned int lgn); -int tstcopy3(void *src, void *snk, unsigned int lgn); -int tstcopy4(void *src, void *snk, unsigned int lgn); -int tstcopy5(void *src, void *snk, unsigned int lgn); -int dumbcopy(void *src, void *snk, unsigned int lgn); - - -unsigned int gtick(void); - - -void bcopytest(void); -void bcopytest(void) { - - void *srcptr, *snkptr, *asrc, *asnk; - int bsrc, bsnk, size, i, ret, n; - volatile int dbg = 0; - unsigned int *sink, *source; - - kern_return_t retr; - - db_printf("bcopy test\n"); - - retr = kmem_alloc_kobject(kernel_map, (vm_offset_t *)&sink, (1024*1024)+4096); /* Get sink area */ - if(retr != KERN_SUCCESS) { /* Did we find any memory at all? */ - panic("bcopytest: Whoops... no memory for sink\n"); - } - - retr = kmem_alloc_kobject(kernel_map, (vm_offset_t *)&source, (1024*1024)+4096); /* Get source area */ - if(retr != KERN_SUCCESS) { /* Did we find any memory at all? */ - panic("bcopytest: Whoops... 
no memory for source\n"); - } - - db_printf("Source at %08X; Sink at %08X\n", source, sink); - - srcptr = (void *)&source[0]; - snkptr = (void *)&sink[0]; - -#if 1 - db_printf("Testing non-overlap case; source bndry = 0 to 7F; sink bndry = 0 - 7F; lgn = 1 to 256\n"); - for(bsrc = 0; bsrc < 128; bsrc++) { /* Step the source by 1 */ - for(bsnk = 0; bsnk < 128; bsnk++) { /* Step the sink by 1 */ - for(size = 1; size <= 256; size++) { /* Step the size by 1 */ - - clrarea(source, sink); /* Reset source and clear sink */ - if(size == 255) { - dbg = 99; - } - if(tstcopy((void *)((unsigned int)srcptr + bsrc), (void *)((unsigned int)snkptr + bsnk), size)) { - db_printf("Test failed; source = %02X; sink = %02X; length = %d\n", bsrc, bsnk, size); - db_printf("failed\n"); - } - } - } - } - db_printf("Non-overlap test complete\n"); -#endif - - -#if 1 - db_printf("Testing overlap\n"); - for(bsrc = 1; bsrc < 128; bsrc++) { /* Step the source by 1 */ - for(bsnk = 0; bsnk < 128; bsnk++) { /* Step the sink by 1 */ - for(size = 1; size <= 256; size++) { /* Step the size by 1 */ - - clrarea2(source, sink); /* Reset source and clear sink */ - if(bsrc < bsnk) { - dbg = 88; - } - else { - dbg = 99; - } - if(tstcopy2((void *)((unsigned int)srcptr + bsrc), (void *)((unsigned int)srcptr + bsnk), size)) { - db_printf("Test failed; source = %02X; sink = %02X; length = %d\n", bsrc, bsnk, size); - db_printf("failed\n"); - } - } - } - } - db_printf("Overlap test complete\n"); -#endif - -#if 1 - db_printf("Starting exhaustive tests\n"); - for(i = 0; i < 262144 * 4; i++) { /* Set all 1MB of source and dest to known pattern */ - ((unsigned char *)srcptr)[i] = i % patper; /* Make a non-power-of-two length pattern */ - ((unsigned char *)snkptr)[i] = i % patper; /* Make a non-power-of-two length pattern */ - } - - db_printf("No overlap; source < sink, length = 0 to 1023\nSource ="); - -#if 1 - for(bsrc = 0; bsrc < 128; bsrc++) { /* Step source by 1 */ - db_printf(" %3d", bsrc); /* Show where we're at 
*/ - for(bsnk = 0; bsnk < 128; bsnk++) { /* Step sink by 1 */ - for(size = 0; size < 1025; size++) { /* Step size from 0 to 1023 */ - asrc = (void *)((unsigned int)srcptr + bsrc); /* Start byte address */ - asnk = (void *)((unsigned int)srcptr + bsnk + 2048); /* End byte address */ - ret = tstcopy5(asrc, asnk, size); /* Copy and validate */ - if(ret) { - db_printf("\nTest failed - source = %3d, sink = %3d size = %d\n", bsrc, bsnk, size); - db_printf("failed\n"); - } - } - } - } -#endif - - db_printf("\n"); - db_printf("No overlap; source > sink, length = 0 to 1023\nSource ="); - -#if 1 - for(bsrc = 0; bsrc < 128; bsrc++) { /* Step source by 1 */ - db_printf(" %3d", bsrc); /* Show where we're at */ - for(bsnk = 0; bsnk < 128; bsnk++) { /* Step sink by 1 */ - for(size = 0; size < 1025; size++) { /* Step size from 0 to 1023 */ - asrc = (void *)((unsigned int)srcptr + bsrc + 2048); /* Start byte address */ - asnk = (void *)((unsigned int)srcptr + bsnk); /* End byte address */ - ret = tstcopy5(asrc, asnk, size); /* Copy and validate */ - if(ret) { - db_printf("\nTest failed - source = %3d, sink = %3d size = %d\n", bsrc, bsnk, size); - db_printf("failed\n"); - } - } - } - } -#endif - - db_printf("\n"); - db_printf("Overlap; source = sink + N (N = 0 to 127), length = 0 to 1023\nN ="); - -#if 1 - for(n = 0; n < 128; n++) { /* Step n by 1 */ - db_printf(" %3d", n); /* Show where we're at */ - for(bsnk = 0; bsnk < 128; bsnk++) { /* Step sink by 1 */ - for(size = 0; size < 1025; size++) { /* Step size from 0 to 1023 */ - asrc = (void *)((unsigned int)srcptr + bsnk + n); /* Start byte address */ - asnk = (void *)((unsigned int)srcptr + bsnk); /* End byte address */ - ret = tstcopy5(asrc, asnk, size); /* Copy and validate */ - if(ret) { - db_printf("\nTest failed - source = %3d, sink = %3d size = %d\n", bsrc, bsnk, size); - db_printf("failed\n"); - } - } - } - } -#endif - - db_printf("\n"); - db_printf("Overlap; source + N = sink (N = 0 to 127), length = 0 to 1023\nSource ="); 
- -#if 1 - for(bsrc = 0; bsrc < 128; bsrc++) { /* Step source by 1 */ - db_printf(" %3d", bsrc); /* Show where we're at */ - for(n = 0; n < 128; n++) { /* Step N by 1 */ - for(size = 0; size < 1025; size++) { /* Step size from 0 to 1023 */ - asrc = (void *)((unsigned int)srcptr + bsnk); /* Start byte address */ - asnk = (void *)((unsigned int)srcptr + bsnk + n); /* End byte address */ - ret = tstcopy5(asrc, asnk, size); /* Copy and validate */ - if(ret) { - db_printf("\nTest failed - source = %3d, n = %3d size = %d\n", bsrc, n, size); - db_printf("failed\n"); - } - } - } - } -#endif - - - db_printf("\n"); - db_printf("Overlap; source = sink + N + 128 (N = 0 to 127), length = 0 to 1023\nN ="); - -#if 1 - for(n = 0; n < 128; n++) { /* Step n by 1 */ - db_printf(" %3d", n); /* Show where we're at */ - for(bsnk = 0; bsnk < 128; bsnk++) { /* Step sink by 1 */ - for(size = 0; size < 1025; size++) { /* Step size from 0 to 1023 */ - asrc = (void *)((unsigned int)srcptr + bsnk + n + 128); /* Start byte address */ - asnk = (void *)((unsigned int)srcptr + bsnk); /* End byte address */ - ret = tstcopy5(asrc, asnk, size); /* Copy and validate */ - if(ret) { - db_printf("\nTest failed - source = %3d, sink = %3d size = %d\n", bsrc, bsnk, size); - db_printf("failed\n"); - } - } - } - } -#endif - - db_printf("\n"); - db_printf("Overlap; source + N + 128 = sink (N = 0 to 127), length = 0 to 1023\nSource ="); - -#if 1 - for(bsrc = 0; bsrc < 128; bsrc++) { /* Step source by 1 */ - db_printf(" %3d", bsrc); /* Show where we're at */ - for(n = 0; n < 128; n++) { /* Step N by 1 */ - for(size = 0; size < 1025; size++) { /* Step size from 0 to 1023 */ - asrc = (void *)((unsigned int)srcptr + bsnk); /* Start byte address */ - asnk = (void *)((unsigned int)srcptr + bsnk + n + 128); /* End byte address */ - ret = tstcopy5(asrc, asnk, size); /* Copy and validate */ - if(ret) { - db_printf("\nTest failed - source = %3d, n = %3d size = %d\n", bsrc, n, size); - db_printf("failed\n"); - } - } - } - 
} -#endif - - db_printf("\n"); - db_printf("Overlap; source = sink + N + 256 (N = 0 to 127), length = 0 to 1023\nSource ="); - -#if 1 - for(n = 0; n < 128; n++) { /* Step n by 1 */ - db_printf(" %3d", n); /* Show where we're at */ - for(bsnk = 0; bsnk < 128; bsnk++) { /* Step sink by 1 */ - for(size = 0; size < 1025; size++) { /* Step size from 0 to 1023 */ - asrc = (void *)((unsigned int)srcptr + bsnk + n + 256); /* Start byte address */ - asnk = (void *)((unsigned int)srcptr + bsnk); /* End byte address */ - ret = tstcopy5(asrc, asnk, size); /* Copy and validate */ - if(ret) { - db_printf("\nTest failed - source = %3d, sink = %3d size = %d\n", bsrc, bsnk, size); - db_printf("failed\n"); - } - } - } - } -#endif - - db_printf("\n"); - db_printf("Overlap; source + N + 256 = sink (N = 0 to 127), length = 0 to 1023\nSource ="); -#if 1 - for(bsrc = 0; bsrc < 128; bsrc++) { /* Step source by 1 */ - db_printf(" %3d", bsrc); /* Show where we're at */ - for(n = 0; n < 128; n++) { /* Step N by 1 */ - for(size = 0; size < 1025; size++) { /* Step size from 0 to 1023 */ - asrc = (void *)((unsigned int)srcptr + bsnk); /* Start byte address */ - asnk = (void *)((unsigned int)srcptr + bsnk + n + 256); /* End byte address */ - ret = tstcopy5(asrc, asnk, size); /* Copy and validate */ - if(ret) { - db_printf("\nTest failed - source = %3d, n = %3d size = %d\n", bsrc, n, size); - db_printf("failed\n"); - } - } - } - } -#endif - - - - - - -#endif - -#if 0 - iterations = 1000; - tottime = 0; - totbytes = 0; - - db_printf("Random test starting; iterations = %d\n", iterations); - for(i = 0; i < 262144 * 4; i++) { /* Clear all 2MB of source (and dest for this test) */ - ((unsigned char *)srcptr)[i] = i & 255; - } - - for(i = 0; i < iterations; i++) { /* Test until we are done */ - makerand = rand() << 16 | (rand() & 0x0000FFFF); - bsrc = makerand & 0x0007FFFF; /* Generate source */ - makerand = rand() << 16 | (rand() & 0x0000FFFF); - bsnk = makerand & 0x0007FFFF; /* Generate sink */ - 
makerand = rand() << 16 | (rand() & 0x0000FFFF); - size = makerand & 0x0007FFFF; /* Generate length */ -#if 1 - db_printf("rt %7d: src = %08X; sink = %08X; length = %7d\n", i, ((unsigned int)srcptr + bsrc), - ((unsigned int)srcptr + bsnk), size); -#endif - - asrc = (void *)((unsigned int)srcptr + bsrc); - asnk = (void *)((unsigned int)srcptr + bsnk); - timein = gtick(); - ret = tstcopy3(asrc, asnk, size); - timeout = gtick(); - if(ret) { - db_printf("Test failed; source = %02X; sink = %02X; length = %d\n", bsrc, bsnk, size); - db_printf("failed\n"); - - } - ticks = timeout - timein; /* Get time in ticks for copy */ - tottime += ticks; - totbytes += size; - - rate = (double) totbytes / (double)tottime; /* Get bytes per tick */ -// rate = rate * (double)11250000.0; /* Bytes per second */ -// rate = rate * (double)16500000.0; /* Bytes per second */ - rate = rate * (double)tbfreq; /* Bytes per second */ - rate = rate / (double)1000000.0; /* Get number of MBs */ - - db_printf("Total bytes = %lld; total time = %lld; rate = %f10\n", totbytes, tottime, rate); - - } -#endif - - - -#if 0 - iterations = 100; - tottime = 0; - totbytes = 0; - - db_printf("Random test starting; iterations = %d\n", iterations); - for(i = 0; i < 262144 * 4; i++) { /* Clear all 2MB of source (and dest for this test) */ - ((unsigned char *)srcptr)[i] = i & 255; - } - - for(i = 0; i < iterations; i++) { /* Test until we are done */ - makerand = rand() << 16 | (rand() & 0x0000FFFF); - bsrc = makerand & 0x0007FFFF; /* Generate source */ - makerand = rand() << 16 | (rand() & 0x0000FFFF); - bsnk = makerand & 0x0007FFFF; /* Generate sink */ - makerand = rand() << 16 | (rand() & 0x0000FFFF); - size = makerand & 0x0007FFFF; /* Generate length */ -#if 1 - db_printf("rt %7d: src = %08X; sink = %08X; length = %7d\n", i, ((unsigned int)srcptr + bsrc), - ((unsigned int)srcptr + bsnk), size); -#endif - - asrc = (void *)((unsigned int)srcptr + bsrc); - asnk = (void *)((unsigned int)srcptr + bsnk); - timein = 
gtick(); - ret = tstcopy4(asrc, asnk, size); - timeout = gtick(); - if(ret) { - db_printf("Test failed; source = %02X; sink = %02X; length = %d\n", bsrc, bsnk, size); - db_printf("failed\n"); - - } - ticks = timeout - timein; /* Get time in ticks for copy */ - tottime += ticks; - totbytes += size; - - rate = (double) totbytes / (double)tottime; /* Get bytes per tick */ -// rate = rate * (double)11250000.0; /* Bytes per second */ -// rate = rate * (double)16500000.0; /* Bytes per second */ - rate = rate * (double)tbfreq; /* Bytes per second */ - rate = rate / (double)1000000.0; /* Get number of MBs */ - - db_printf("Total bytes = %lld; total time = %lld; rate = %f10\n", totbytes, tottime, rate); - - } -#endif - -#if 0 - iterations = 100; - tottime = 0; - totbytes = 0; - - db_printf("Random test starting; iterations = %d\n", iterations); - for(i = 0; i < 262144 * 4; i++) { /* Clear all 2MB of source (and dest for this test) */ - ((unsigned char *)srcptr)[i] = i & 255; - } - - for(i = 0; i < iterations; i++) { /* Test until we are done */ - makerand = rand() << 16 | (rand() & 0x0000FFFF); - bsrc = makerand & 0x0007FFFF; /* Generate source */ - makerand = rand() << 16 | (rand() & 0x0000FFFF); - bsnk = makerand & 0x0007FFFF; /* Generate sink */ - makerand = rand() << 16 | (rand() & 0x0000FFFF); - size = makerand & 0x0007FFFF; /* Generate length */ -#if 1 - db_printf("rt %7d: src = %08X; sink = %08X; length = %7d\n", i, ((unsigned int)srcptr + bsrc), - ((unsigned int)srcptr + bsnk), size); -#endif - - asrc = (void *)((unsigned int)srcptr + bsrc); - asnk = (void *)((unsigned int)srcptr + bsnk); - timein = gtick(); - ret = dumbcopy(asrc, asnk, size); - timeout = gtick(); - if(ret) { - db_printf("Test failed; source = %02X; sink = %02X; length = %d\n", bsrc, bsnk, size); - db_printf("failed\n"); - - } - ticks = timeout - timein; /* Get time in ticks for copy */ - tottime += ticks; - totbytes += size; - - rate = (double) totbytes / (double)tottime; /* Get bytes per tick */ - 
rate = rate * (double)tbfreq; /* Bytes per second */ - rate = rate / (double)1000000.0; /* Get number of MBs */ - - db_printf("Total bytes = %lld; total time = %lld; rate = %f10\n", totbytes, tottime, rate); - - } -#endif - - kmem_free(kernel_map, (vm_offset_t) sink, (1024*1024)+4096); /* Release this mapping block */ - kmem_free(kernel_map, (vm_offset_t) source, (1024*1024)+4096); /* Release this mapping block */ - - if(dbg == 22) db_printf("Gabbagoogoo\n"); - return; -} - -void clrarea(unsigned int *source, unsigned int *sink) { - - unsigned int i; - - for(i=0; i < 1024; i++) { /* Init source & sink */ - source[i] = 0x55555555; /* Known pattern */ - sink[i] = 0xAAAAAAAA; /* Known pattern */ - } - return; -} - -void -clrarea2(unsigned int *source, __unused unsigned int *sink) -{ - unsigned int i; - unsigned char *ss; - - ss = (unsigned char *)&source[0]; - - for(i=0; i < 1024 * 4; i++) { /* Init source/sink */ - ss[i] = i & 0xFF; /* Known pattern */ - } - return; -} - -int tstcopy(void *src, void *snk, unsigned int lgn) { - - unsigned int i, crap; - - bcopy(src, snk, lgn); - - for(i = 0; i < lgn; i++) { - if(((unsigned char *)snk)[i] != 0x55) { - crap = (unsigned int)&((unsigned char *)snk)[i]; - db_printf("bad copy at sink[%d] (%08X) it is %02X\n", i,crap, ((unsigned char *)snk)[i]); - return 1; - } - } - if(((unsigned char *)snk)[lgn] != 0xAA) { /* Is it right? 
*/ - crap = (unsigned int)&((unsigned char *)snk)[i]; - db_printf("Copied too far at sink[%d] (%08X) it is %02X\n", i, crap, ((unsigned char *)snk)[lgn]); - return 1; - } - return 0; - -} - -int tstcopy2(void *src, void *snk, unsigned int lgn) { - - unsigned int i, crap; - unsigned char ic, ec; - - ic = ((unsigned char *)src)[0]; - ec = ((unsigned char *)snk)[lgn]; - - bcopy(src, snk, lgn); - - for(i = 0; i < lgn; i++) { - if(((unsigned char *)snk)[i] != ic) { - crap = (unsigned int)&((unsigned char *)snk)[i]; - db_printf("bad copy at sink[%d] (%08X) it is %02X\n", i,crap, ((unsigned char *)snk)[i]); - return 1; - } - ic = (ic + 1) & 0xFF; - } - - if(((unsigned char *)snk)[lgn] != ec) { /* Is it right? */ - crap = (unsigned int)&((unsigned char *)snk)[i]; - db_printf("Copied too far at sink[%d] (%08X) it is %02X\n", i, crap, ((unsigned char *)snk)[lgn]); - return 1; - } - return 0; - -} - -int tstcopy3(void *src, void *snk, unsigned int lgn) { - - unsigned int i, crap; - unsigned char ic, ec, oic; - - oic = ((unsigned char *)snk)[0]; - ic = ((unsigned char *)src)[0]; - ec = ((unsigned char *)snk)[lgn]; - - bcopy(src, snk, lgn); - - for(i = 0; i < lgn; i++) { - if(((unsigned char *)snk)[i] != ic) { - crap = (unsigned int)&((unsigned char *)snk)[i]; - db_printf("bad copy at sink[%d] (%08X) it is %02X\n", i ,crap, ((unsigned char *)snk)[i]); - return 1; - } - ic = (ic + 1) & 0xFF; - } - - if(((unsigned char *)snk)[lgn] != ec) { /* Is it right? 
*/ - crap = (unsigned int)&((unsigned char *)snk)[i]; - db_printf("Copied too far at sink[%d] (%08X) it is %02X\n", i, crap, ((unsigned char *)snk)[lgn]); - return 1; - } - - for(i=0; i < lgn; i++) { /* Restore pattern */ - ((unsigned char *)snk)[i] = oic; - oic = (oic + 1) & 0xFF; - } - - return 0; - -} - -int tstcopy4(void *src, void *snk, unsigned int lgn) { - - bcopy(src, snk, lgn); - return 0; - -} - -int tstcopy5(void *src, void *snk, unsigned int lgn) { - - unsigned int i = 0, crap; - unsigned char ic, ec, oic, pc; - - oic = ((unsigned char *)snk)[0]; /* Original first sink character */ - ic = ((unsigned char *)src)[0]; /* Original first source character */ - ec = ((unsigned char *)snk)[lgn]; /* Original character just after last sink character */ - pc = ((unsigned char *)snk)[-1]; /* Original character just before sink */ - - bcopy(src, snk, lgn); - - if(((unsigned char *)snk)[lgn] != ec) { /* Did we copy too far forward? */ - crap = (unsigned int)&((unsigned char *)snk)[i]; - db_printf("Copied too far at sink[%d] (%08X) it is %02X\n", i, crap, ((unsigned char *)snk)[lgn]); - return 1; - } - - if(((unsigned char *)snk)[-1] != pc) { /* Did we copy too far backward? 
*/ - crap = (unsigned int)&((unsigned char *)snk)[i]; - db_printf("Copied too far at sink[%d] (%08X) it is %02X\n", i, crap, ((unsigned char *)snk)[lgn]); - return 1; - } - - for(i = 0; i < lgn; i++) { /* Check sink byte sequence */ - if(((unsigned char *)snk)[i] != ic) { - crap = (unsigned int)&((unsigned char *)snk)[i]; - db_printf("bad copy at sink[%d] (%08X) it is %02X\n", i ,crap, ((unsigned char *)snk)[i]); - return 1; - } - ic = (ic + 1) % patper; - } - - for(i=0; i < lgn; i++) { /* Restore pattern */ - ((unsigned char *)snk)[i] = oic; - oic = (oic + 1) % patper; - } - - return 0; - -} - -int dumbcopy(void *src, void *snk, unsigned int lgn) { - unsigned int i; - char *p = (char *)snk; - char *q = (char *)src; - - for(i = 0; i < lgn; i++) { - *p++ = *q++; - } - return 0; - -} - - - - - - - - - - - - - diff --git a/osfmk/ppc/bits.s b/osfmk/ppc/bits.s deleted file mode 100644 index d8d5960d5..000000000 --- a/osfmk/ppc/bits.s +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - * - */ - -#include -#include - -# -# void setbit(int bitno, int *s) -# -# Set indicated bit in bit string. -# Note: being big-endian, bit 0 is 0x80000000. - -ENTRY(setbit,TAG_NO_FRAME_USED) - - rlwinm r8,r3,29,3,31 /* Get byte displacement */ - rlwinm r9,r3,0,29,31 /* Get bit within byte */ - li r6,0x80 /* Start with bit 0 */ - lbzx r5,r4,r8 /* Grab target byte */ - srw r6,r6,r9 /* Get the right bit (fits right into the load cycle) */ - or r5,r5,r6 /* Turn on the right bit */ - stbx r5,r4,r8 /* Save the byte back */ - blr - -# -# void clrbit(int bitno, int *s) -# -# Clear indicated bit in bit string. -# Note: being big-endian, bit 0 is 0x80000000. - -ENTRY(clrbit,TAG_NO_FRAME_USED) - - rlwinm r8,r3,29,3,31 /* Get byte displacement */ - rlwinm r9,r3,0,29,31 /* Get bit within byte */ - li r6,0x80 /* Start with bit 0 */ - lbzx r5,r4,r8 /* Grab target byte */ - srw r6,r6,r9 /* Get the right bit (fits right into the load cycle) */ - andc r5,r5,r6 /* Turn off the right bit */ - stbx r5,r4,r8 /* Save the byte back */ - blr - - -# /* -# * Find first bit set in bit string. -# */ -# int -# ffsbit(int *s) -# -# Returns the bit index of the first bit set (starting from 0) -# Assumes pointer is word-aligned - -ENTRY(ffsbit, TAG_NO_FRAME_USED) - lwz r0, 0(ARG0) - mr ARG1, ARG0 /* Free up ARG0 for result */ - - cmpwi r0, 0 /* Check against zero... */ - cntlzw ARG0, r0 /* Free inst... find the set bit... 
*/ - bnelr+ /* Return if bit in first word */ - -.L_ffsbit_lp: - lwz r0, 4(ARG1) - addi ARG1, ARG1, 4 - cmpwi r0, 0 /* Check against zero... */ - cntlzw r12, r0 - add ARG0, ARG0, r12 /* ARG0 keeps bit count */ - beq+ .L_ffsbit_lp - blr - -/* - * int tstbit(int bitno, int *s) - * - * Test indicated bit in bit string. - * Note: being big-endian, bit 0 is 0x80000000. - */ - -ENTRY2(tstbit, testbit, TAG_NO_FRAME_USED) - - rlwinm r8,r3,29,3,31 /* Get byte displacement */ - rlwinm r9,r3,0,29,31 /* Get bit within byte */ - lbzx r5,r4,r8 /* Grab target byte */ - addi r9,r9,25 /* Get actual shift value */ - rlwnm r3,r5,r9,31,31 /* Pass the bit back */ - blr diff --git a/osfmk/ppc/boot.h b/osfmk/ppc/boot.h deleted file mode 100644 index 9d3e885ee..000000000 --- a/osfmk/ppc/boot.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#include diff --git a/osfmk/ppc/bzero.s b/osfmk/ppc/bzero.s deleted file mode 100644 index 0dbd810b4..000000000 --- a/osfmk/ppc/bzero.s +++ /dev/null @@ -1,331 +0,0 @@ -/* - * Copyright (c) 2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include - - .text - .align 2 - .globl _memset - .globl _bzero - .globl _bzero_nc - .globl _bzero_phys - .globl _bzero_phys_nc - - -// ***************************** -// * B Z E R O _ P H Y S _ N C * -// ***************************** -// -// void bzero_phys_nc(addr64_t phys_addr, uint32_t length); -// -// Takes a phys addr in (r3,r4), and length in r5. NO CACHING - - .align 5 -LEXT(bzero_phys_nc) - mflr r12 // save return address - rlwinm r3,r3,0,1,0 // coallesce long-long in (r3,r4) into reg64_t in r3 - rlwimi r3,r4,0,0,31 - mr r4,r5 // put length where bzero() expects it - bl EXT(ml_set_physical_get_ffs) // turn DR off, SF on, features in cr6, old MSR in r11 - bl EXT(bzero_nc) // use normal bzero() routine - mtlr r12 // restore return - b EXT(ml_restore) // restore MSR, turning DR on and SF off - - -// *********************** -// * B Z E R O _ P H Y S * -// *********************** -// -// void bzero_phys(addr64_t phys_addr, uint32_t length); -// -// Takes a phys addr in (r3,r4), and length in r5. We leave cache on. - - .align 5 -LEXT(bzero_phys) - mflr r12 // save return address - rlwinm r3,r3,0,1,0 // coallesce long-long in (r3,r4) into reg64_t in r3 - rlwimi r3,r4,0,0,31 - mr r4,r5 // put length where bzero() expects it - bl EXT(ml_set_physical_get_ffs) // turn DR off, SF on, features in cr6, old MSR in r11 - bl EXT(bzero) // use normal bzero() routine - mtlr r12 // restore return - b EXT(ml_restore) // restore MSR, turning DR on and SF off - - -// ******************* -// * B Z E R O _ N C * -// ******************* -// -// void bzero_nc(char *addr, unsigned int length); -// -// For use with uncached memory. Doesn't seem to be used at all, so probably not -// performance critical. 
NB: we must avoid unaligned stores, because some -// machines (eg, 970) take alignment exceptions on _any_ unaligned op to uncached -// memory. Of course, we must also avoid dcbz. - -LEXT(bzero_nc) - cmplwi cr1,r4,20 // too short to bother with 16-byte loops? - cmplwi cr7,r4,0 // check for (len==0) - li r6,0 // get a 0 - bge cr1,bznc1 // skip if length >=20 - mtctr r4 // set up byte loop - beqlr-- cr7 // done if len=0 - -// Short operands, loop over bytes. - -bznc0: - stb r6,0(r3) - addi r3,r3,1 - bdnz bznc0 - blr - -// Handle operands long enough to do doubleword stores; we must doubleword -// align, to avoid alignment exceptions. - -bznc1: - neg r7,r3 // start to compute #bytes to align - mfsprg r10,2 // get feature flags - andi. r0,r7,7 // get #bytes to doubleword align - mr r5,r3 // make copy of operand ptr as bcopy expects - mtcrf 0x02,r10 // put pf64Bitb etc in cr6 - beq bzero_tail // already doubleword aligned - sub r4,r4,r0 // adjust count - mtctr r0 // set up loop -bznc2: // zero bytes until doubleword aligned - stb r6,0(r5) - addi r5,r5,1 - bdnz bznc2 - b bzero_tail // join bzero, now that r5 is aligned - - -// ************* *************** -// * B Z E R O * and * M E M S E T * -// ************* *************** -// -// void * memset(void *b, int c, size_t len); -// void bzero(void *b, size_t len); -// -// These routines support G3, G4, and the 970, and run in both 32 and -// 64-bit mode. Lengths (size_t) are always 32 bits. -// -// Register use: -// r0 = temp -// r2 = temp -// r3 = original ptr, not changed since memset returns it -// r4 = count of bytes to set -// r5 = working operand ptr ("rp") -// r6 = value to store (usually 0) -// r7-r9 = temps -// r10 = feature flags -// r11 = old MSR (if bzero_phys) -// r12 = return address (if bzero_phys) -// cr6 = feature flags (pf64Bit, pf128Byte, and pf32Byte) - - .align 5 -LEXT(memset) // void * memset(void *b, int c, size_t len); - andi. 
r6,r4,0xFF // copy value to working register, test for 0 - mr r4,r5 // move length to working register - bne-- memset1 // skip if nonzero -LEXT(bzero) // void bzero(void *b, size_t len); - dcbtst 0,r3 // touch in 1st cache block - mfsprg r10,2 // get features - li r6,0 // get a 0 - neg r7,r3 // start to compute #bytes to align - andi. r0,r10,pf128Byte+pf32Byte // get cache line size - mtcrf 0x02,r10 // put pf128Byte etc in cr6 - cmplw r4,r0 // operand length >= cache line size? - mr r5,r3 // make copy of operand ptr (can't change r3) - blt bzero_tail // too short for dcbz (or dcbz128) - rlwinm r0,r7,0,0x1F // get #bytes to 32-byte align - rlwinm r9,r7,0,0x7F // get #bytes to 128-byte align - bt++ pf128Byteb,bzero_128 // skip if 128-byte processor - -// Operand length >=32 and cache line size is 32. -// r0 = #bytes to 32-byte align -// r4 = length -// r5 = ptr to operand -// r6 = 0 - - sub r2,r4,r0 // adjust length - cmpwi cr1,r0,0 // already 32-byte aligned? - srwi. r8,r2,5 // get #32-byte chunks - beq bzero_tail // not long enough to dcbz - mtctr r8 // set up loop count - rlwinm r4,r2,0,27,31 // mask down to leftover byte count - beq cr1,bz_dcbz32 // skip if already 32-byte aligned - -// 32-byte align. We just store 32 0s, rather than test and use conditional -// branches. This is usually faster, because there are no mispredicts. - - stw r6,0(r5) // zero next 32 bytes - stw r6,4(r5) - stw r6,8(r5) - stw r6,12(r5) - stw r6,16(r5) - stw r6,20(r5) - stw r6,24(r5) - stw r6,28(r5) - add r5,r5,r0 // now r5 is 32-byte aligned - b bz_dcbz32 - -// Loop doing 32-byte version of DCBZ instruction. - - .align 4 // align the inner loop -bz_dcbz32: - dcbz 0,r5 // zero another 32 bytes - addi r5,r5,32 - bdnz bz_dcbz32 - -// Store trailing bytes. This routine is used both by bzero and memset. -// r4 = #bytes to store (may be large if memset) -// r5 = address -// r6 = value to store (in all 8 bytes) -// cr6 = pf64Bit etc flags - -bzero_tail: - srwi. 
r0,r4,4 // get #(16-byte-chunks) - mtcrf 0x01,r4 // remaining byte count to cr7 - beq bzt3 // no 16-byte chunks - mtctr r0 // set up loop count - bt++ pf64Bitb,bzt2 // skip if 64-bit processor - b bzt1 - .align 5 -bzt1: // loop over 16-byte chunks on 32-bit processor - stw r6,0(r5) - stw r6,4(r5) - stw r6,8(r5) - stw r6,12(r5) - addi r5,r5,16 - bdnz bzt1 - b bzt3 - .align 5 -bzt2: // loop over 16-byte chunks on 64-bit processor - std r6,0(r5) - std r6,8(r5) - addi r5,r5,16 - bdnz bzt2 - bf 28,bzt4 // 8-byte chunk? - std r6,0(r5) - addi r5,r5,8 - b bzt4 -bzt3: - bf 28,bzt4 // 8-byte chunk? - stw r6,0(r5) - stw r6,4(r5) - addi r5,r5,8 -bzt4: - bf 29,bzt5 // word? - stw r6,0(r5) - addi r5,r5,4 -bzt5: - bf 30,bzt6 // halfword? - sth r6,0(r5) - addi r5,r5,2 -bzt6: - bflr 31 // byte? - stb r6,0(r5) - blr - -// Operand length is >=128 and cache line size is 128. We assume that -// because the linesize is 128 bytes, this is a 64-bit processor. -// r4 = length -// r5 = ptr to operand -// r6 = 0 -// r7 = neg(r5) -// r9 = #bytes to 128-byte align - - .align 5 -bzero_128: - sub r2,r4,r9 // r2 <- length remaining after cache-line aligning - rlwinm r0,r7,0,0xF // r0 <- #bytes to 16-byte align - srwi. r8,r2,7 // r8 <- number of cache lines to 0 - std r6,0(r5) // always store 16 bytes to 16-byte align... - std r6,8(r5) // ...even if too short for dcbz128 - add r5,r5,r0 // 16-byte align ptr - sub r4,r4,r0 // adjust count - beq bzero_tail // r8==0, not long enough to dcbz128 - sub. r7,r9,r0 // get #bytes remaining to 128-byte align - rlwinm r4,r2,0,0x7F // r4 <- length remaining after dcbz128'ing - mtctr r8 // set up dcbz128 loop - beq bz_dcbz128 // already 128-byte aligned - b bz_align // enter loop over 16-byte chunks - -// 128-byte align by looping over 16-byte chunks. - - .align 5 -bz_align: // loop over 16-byte chunks - subic. r7,r7,16 // more to go? 
- std r6,0(r5) - std r6,8(r5) - addi r5,r5,16 - bgt bz_align - - b bz_dcbz128 // enter dcbz128 loop - -// Loop over 128-byte cache lines. -// r4 = length remaining after cache lines (0..127) -// r5 = ptr (128-byte aligned) -// r6 = 0 -// ctr = count of cache lines to 0 - - .align 5 -bz_dcbz128: - dcbz128 0,r5 // zero a 128-byte cache line - addi r5,r5,128 - bdnz bz_dcbz128 - - b bzero_tail // handle leftovers - - -// Handle memset() for nonzero values. This case is relatively infrequent; -// the large majority of memset() calls are for 0. -// r3 = ptr -// r4 = count -// r6 = value in lower byte (nonzero) - -memset1: - cmplwi r4,16 // too short to bother aligning? - rlwimi r6,r6,8,16,23 // replicate value to low 2 bytes - mr r5,r3 // make working copy of operand ptr - rlwimi r6,r6,16,0,15 // value now in all 4 bytes - blt bzero_tail // length<16, we won't be using "std" - mfsprg r10,2 // get feature flags - neg r7,r5 // start to compute #bytes to align - rlwinm r6,r6,0,1,0 // value now in all 8 bytes (if 64-bit) - andi. r0,r7,7 // r6 <- #bytes to doubleword align - stw r6,0(r5) // store 8 bytes to avoid a loop - stw r6,4(r5) - mtcrf 0x02,r10 // get pf64Bit flag etc in cr6 - sub r4,r4,r0 // adjust count - add r5,r5,r0 // doubleword align ptr - b bzero_tail - - - diff --git a/osfmk/ppc/cache.s b/osfmk/ppc/cache.s deleted file mode 100644 index 94aa0aeeb..000000000 --- a/osfmk/ppc/cache.s +++ /dev/null @@ -1,389 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -#include -#include -#include -#include - -/* These routines run in 32 or 64-bit addressing, and handle - * 32 and 128 byte caches. They do not use compare instructions - * on addresses, since compares are 32/64-bit-mode-specific. - */ - -#define kDcbf 0x1 -#define kDcbfb 31 -#define kDcbi 0x2 -#define kDcbib 30 -#define kIcbi 0x4 -#define kIcbib 29 - - -/* - * extern void flush_dcache(vm_offset_t addr, unsigned count, boolean phys); - * extern void flush_dcache64(addr64_t addr, unsigned count, boolean phys); - * - * flush_dcache takes a virtual or physical address and count to flush - * and (can be called for multiple virtual pages). 
- * - * it flushes the data cache - * cache for the address range in question - * - * if 'phys' is non-zero then physical addresses will be used - */ - - - - .text - .align 5 - .globl _flush_dcache -_flush_dcache: - li r0,kDcbf // use DCBF instruction - rlwinm r3,r3,0,0,31 // truncate address in case this is a 64-bit machine - b cache_op_join // join common code - - .align 5 - .globl _flush_dcache64 -_flush_dcache64: - rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg - li r0,kDcbf // use DCBF instruction - rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits - mr r4,r5 ; Move count - mr r5,r6 ; Move physical flag - b cache_op_join // join common code - - -/* - * extern void invalidate_dcache(vm_offset_t va, unsigned count, boolean phys); - * extern void invalidate_dcache64(addr64_t va, unsigned count, boolean phys); - * - * invalidate_dcache takes a virtual or physical address and count to - * invalidate and (can be called for multiple virtual pages). - * - * it invalidates the data cache for the address range in question - */ - - .globl _invalidate_dcache -_invalidate_dcache: - li r0,kDcbi // use DCBI instruction - rlwinm r3,r3,0,0,31 // truncate address in case this is a 64-bit machine - b cache_op_join // join common code - - - .align 5 - .globl _invalidate_dcache64 -_invalidate_dcache64: - rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg - li r0,kDcbi // use DCBI instruction - rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits - mr r4,r5 ; Move count - mr r5,r6 ; Move physical flag - b cache_op_join // join common code - -/* - * extern void invalidate_icache(vm_offset_t addr, unsigned cnt, boolean phys); - * extern void invalidate_icache64(addr64_t addr, unsigned cnt, boolean phys); - * - * invalidate_icache takes a virtual or physical address and - * count to invalidate, (can be called for multiple virtual pages). 
- * - * it invalidates the instruction cache for the address range in question. - */ - - .globl _invalidate_icache -_invalidate_icache: - li r0,kIcbi // use ICBI instruction - rlwinm r3,r3,0,0,31 // truncate address in case this is a 64-bit machine - b cache_op_join // join common code - - - .align 5 - .globl _invalidate_icache64 -_invalidate_icache64: - rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg - li r0,kIcbi // use ICBI instruction - rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits - mr r4,r5 ; Move count - mr r5,r6 ; Move physical flag - b cache_op_join // join common code - -/* - * extern void sync_ppage(ppnum_t pa); - * - * sync_ppage takes a physical page number - * - * it writes out the data cache and invalidates the instruction - * cache for the address range in question - */ - - .globl _sync_ppage - .align 5 -_sync_ppage: // Should be the most commonly called routine, by far - mfsprg r2,2 - li r0,kDcbf+kIcbi // we need to dcbf and then icbi - mtcrf 0x02,r2 ; Move pf64Bit to cr6 - li r5,1 // set flag for physical addresses - li r4,4096 ; Set page size - bt++ pf64Bitb,spp64 ; Skip if 64-bit (only they take the hint) - rlwinm r3,r3,12,0,19 ; Convert to physical address - 32-bit - b cache_op_join ; Join up.... - -spp64: sldi r3,r3,12 ; Convert to physical address - 64-bit - b cache_op_join ; Join up.... - - - -/* - * extern void sync_cache_virtual(vm_offset_t addr, unsigned count); - * - * Like "sync_cache", except it takes a virtual address and byte count. - * It flushes the data cache, invalidates the I cache, and sync's. 
- */ - - .globl _sync_cache_virtual - .align 5 -_sync_cache_virtual: - li r0,kDcbf+kIcbi // we need to dcbf and then icbi - li r5,0 // set flag for virtual addresses - b cache_op_join // join common code - - -/* - * extern void sync_cache(vm_offset_t pa, unsigned count); - * extern void sync_cache64(addr64_t pa, unsigned count); - * - * sync_cache takes a physical address and count to sync, thus - * must not be called for multiple virtual pages. - * - * it writes out the data cache and invalidates the instruction - * cache for the address range in question - */ - - .globl _sync_cache - .align 5 -_sync_cache: - li r0,kDcbf+kIcbi // we need to dcbf and then icbi - li r5,1 // set flag for physical addresses - rlwinm r3,r3,0,0,31 // truncate address in case this is a 64-bit machine - b cache_op_join // join common code - - .globl _sync_cache64 - .align 5 -_sync_cache64: - rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg - li r0,kDcbf+kIcbi // we need to dcbf and then icbi - rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits - mr r4,r5 ; Copy over the length - li r5,1 // set flag for physical addresses - - - // Common code to handle the cache operations. - -cache_op_join: // here with r3=addr, r4=count, r5=phys flag, r0=bits - mfsprg r10,2 // r10 <- processor feature flags - cmpwi cr5,r5,0 // using physical addresses? - mtcrf 0x01,r0 // move kDcbf, kDcbi, and kIcbi bits to CR7 - andi. r9,r10,pf32Byte+pf128Byte // r9 <- cache line size - mtcrf 0x02,r10 // move pf64Bit bit to CR6 - subi r8,r9,1 // r8 <- (linesize-1) - beq-- cr5,cache_op_2 // skip if using virtual addresses - - bf-- pf64Bitb,cache_op_not64 // This is not a 64-bit machine - - srdi r12,r3,31 // Slide bit 32 to bit 63 - cmpldi r12,1 // Are we in the I/O mapped area? - beqlr-- // No cache ops allowed here... 
- -cache_op_not64: - mflr r12 // save return address - bl EXT(ml_set_physical) // turn on physical addressing - mtlr r12 // restore return address - - // get r3=first cache line, r4=first line not in set, r6=byte count - -cache_op_2: - add r7,r3,r4 // point to 1st byte not to operate on - andc r3,r3,r8 // r3 <- 1st line to operate on - add r4,r7,r8 // round up - andc r4,r4,r8 // r4 <- 1st line not to operate on - sub. r6,r4,r3 // r6 <- byte count to operate on - beq-- cache_op_exit // nothing to do - bf-- kDcbfb,cache_op_6 // no need to dcbf - - - // DCBF loop - -cache_op_5: - sub. r6,r6,r9 // more to go? - dcbf r6,r3 // flush next line to RAM - bne cache_op_5 // loop if more to go - sync // make sure the data reaches RAM - sub r6,r4,r3 // reset count - - - // ICBI loop - -cache_op_6: - bf-- kIcbib,cache_op_8 // no need to icbi -cache_op_7: - sub. r6,r6,r9 // more to go? - icbi r6,r3 // invalidate next line - bne cache_op_7 - sub r6,r4,r3 // reset count - isync - sync - - - // DCBI loop - -cache_op_8: - bf++ kDcbib,cache_op_exit // no need to dcbi -cache_op_9: - sub. r6,r6,r9 // more to go? - dcbi r6,r3 // invalidate next line - bne cache_op_9 - sync - - - // restore MSR iff necessary and done - -cache_op_exit: - beqlr-- cr5 // if using virtual addresses, no need to restore MSR - b EXT(ml_restore) // restore MSR and return - - -//////////////////////////////////////////////////// - - .align 5 - .globl _dcache_incoherent_io_store64 -_dcache_incoherent_io_store64: - rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg - rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits - mr r4,r5 ; Move count - - // here with r3=addr, r4=count - mfsprg r10,2 // r10 <- processor feature flags - andi. 
r9,r10,pf32Byte+pf128Byte // r9 <- cache line size - mtcrf 0x02,r10 // move pf64Bit bit to CR6 - subi r8,r9,1 // r8 <- (linesize-1) - - bf-- pf64Bitb,cache_ios_not64 // This is not a 64-bit machine - - srdi r12,r3,31 // Slide bit 32 to bit 63 - cmpldi r12,1 // Are we in the I/O mapped area? - beqlr-- // No cache ops allowed here... - -cache_ios_not64: - mflr r12 // save return address - bl EXT(ml_set_physical) // turn on physical addressing - mtlr r12 // restore return address - - // get r3=first cache line, r4=first line not in set, r6=byte count - add r7,r3,r4 // point to 1st byte not to operate on - andc r3,r3,r8 // r3 <- 1st line to operate on - add r4,r7,r8 // round up - andc r4,r4,r8 // r4 <- 1st line not to operate on - sub. r6,r4,r3 // r6 <- byte count to operate on - beq-- cache_ios_exit // nothing to do - - sub. r6,r6,r9 // >1 line? - beq cache_ios_last_line // use dcbst on all lines but last - - // DCBST loop -cache_ios_5: - sub. r6,r6,r9 // more to go? - dcbst r6,r3 // store next line - bne cache_ios_5 // loop if more to go - -cache_ios_last_line: - sync // flush last line - isync - dcbf r6,r3 - sync - isync - add r6,r6,r3 - lwz r0,0(r6) // make sure the data reaches RAM (not just the memory controller) - isync - - // restore MSR -cache_ios_exit: - b EXT(ml_restore) // restore MSR and return - - -//////////////////////////////////////////////////// - - .align 5 - .globl _dcache_incoherent_io_flush64 -_dcache_incoherent_io_flush64: - rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg - rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits - mr r4,r5 ; Move count - - // here with r3=addr, r4=count - mfsprg r10,2 // r10 <- processor feature flags - andi. 
r9,r10,pf32Byte+pf128Byte // r9 <- cache line size - mtcrf 0x02,r10 // move pf64Bit bit to CR6 - subi r8,r9,1 // r8 <- (linesize-1) - - bf-- pf64Bitb,cache_iof_not64 // This is not a 64-bit machine - - srdi r12,r3,31 // Slide bit 32 to bit 63 - cmpldi r12,1 // Are we in the I/O mapped area? - beqlr-- // No cache ops allowed here... - -cache_iof_not64: - mflr r12 // save return address - bl EXT(ml_set_physical) // turn on physical addressing - mtlr r12 // restore return address - - // get r3=first cache line, r4=first line not in set, r6=byte count - add r7,r3,r4 // point to 1st byte not to operate on - andc r3,r3,r8 // r3 <- 1st line to operate on - add r4,r7,r8 // round up - andc r4,r4,r8 // r4 <- 1st line not to operate on - sub. r6,r4,r3 // r6 <- byte count to operate on - beq-- cache_iof_exit // nothing to do - - // DCBF loop -cache_iof_5: - sub. r6,r6,r9 // more to go? - dcbf r6,r3 // store next line - bne cache_iof_5 // loop if more to go - -cache_iof_last_line: - sync // flush last line - isync - - // restore MSR -cache_iof_exit: - b EXT(ml_restore) // restore MSR and return - - diff --git a/osfmk/ppc/commpage/atomic.s b/osfmk/ppc/commpage/atomic.s deleted file mode 100644 index a53e61fe3..000000000 --- a/osfmk/ppc/commpage/atomic.s +++ /dev/null @@ -1,280 +0,0 @@ -/* - * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. 
- * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include // EXT, LEXT -#include -#include - - -/* OSAtomic.h library native implementations. */ - - .text - .align 2 - -atomic_add32: // int32_t OSAtomicAdd32( int32_t amt, int32_t *value ); -1: - lwarx r5,0,r4 - add r6,r3,r5 - stwcx. r6,0,r4 - bne-- 1b - mr r3,r6 - blr - - COMMPAGE_DESCRIPTOR(atomic_add32,_COMM_PAGE_ATOMIC_ADD32,0,0,kCommPageBoth) - - -atomic_add64: // int64_t OSAtomicAdd64( int64_t amt, int64_t *value ); -1: - ldarx r5,0,r4 - add r6,r3,r5 - stdcx. r6,0,r4 - bne-- 1b - mr r3,r6 - blr - - COMMPAGE_DESCRIPTOR(atomic_add64,_COMM_PAGE_ATOMIC_ADD64,k64Bit,0,kCommPage64) - -/* WARNING: Libc clients assume compare-and-swap preserves r4, r5, and r9-r12! */ -/* This is the no-barrier version */ -compare_and_swap32_on32: // bool OSAtomicCompareAndSwap32( int32_t old, int32_t new, int32_t *value); -1: - lwarx r7,0,r5 - cmplw r7,r3 - bne- 2f - stwcx. r4,0,r5 - bne- 1b - li r3,1 - blr -2: - li r3,0 // return failure - blr - - COMMPAGE_DESCRIPTOR(compare_and_swap32_on32,_COMM_PAGE_COMPARE_AND_SWAP32,0,k64Bit,kCommPageBoth) - - -/* WARNING: Libc clients assume compare-and-swap preserves r4, r5, and r9-r12! */ -/* This is the no-barrier version */ -compare_and_swap32_on64: // bool OSAtomicCompareAndSwap32( int32_t old, int32_t new, int32_t *value); -1: - lwarx r7,0,r5 - cmplw r7,r3 - bne-- 2f - stwcx. 
r4,0,r5 - bne-- 1b - li r3,1 - blr -2: - li r8,-8 // on 970, must release reservation - li r3,0 // return failure - stwcx. r4,r8,r1 // store into red zone to release - blr - - COMMPAGE_DESCRIPTOR(compare_and_swap32_on64,_COMM_PAGE_COMPARE_AND_SWAP32,k64Bit,0,kCommPageBoth) - - -/* WARNING: Libc clients assume compare-and-swap preserves r4, r5, and r9-r12! */ -/* This is the no-barrier version */ -compare_and_swap64: // bool OSAtomicCompareAndSwap64( int64_t old, int64_t new, int64_t *value); -1: - ldarx r7,0,r5 - cmpld r7,r3 - bne-- 2f - stdcx. r4,0,r5 - bne-- 1b - li r3,1 - blr -2: - li r8,-8 // on 970, must release reservation - li r3,0 // return failure - stdcx. r4,r8,r1 // store into red zone to release - blr - - COMMPAGE_DESCRIPTOR(compare_and_swap64,_COMM_PAGE_COMPARE_AND_SWAP64,k64Bit,0,kCommPage64) - -/* WARNING: Libc clients assume compare-and-swap preserves r4, r5, and r9-r12! */ -/* This version of compare-and-swap incorporates a memory barrier. */ -compare_and_swap32_on32b: // bool OSAtomicCompareAndSwapBarrier32( int32_t old, int32_t new, int32_t *value); - eieio // write barrier, NOP'd on a UP -1: - lwarx r7,0,r5 - cmplw r7,r3 - bne- 2f - stwcx. r4,0,r5 - bne- 1b - isync // read barrier, NOP'd on a UP - li r3,1 - blr -2: - li r3,0 // return failure - blr - - COMMPAGE_DESCRIPTOR(compare_and_swap32_on32b,_COMM_PAGE_COMPARE_AND_SWAP32B,0,k64Bit,kCommPageBoth+kCommPageSYNC+kCommPageISYNC) - - -/* WARNING: Libc clients assume compare-and-swap preserves r4, r5, and r9-r12! */ -/* This version of compare-and-swap incorporates a memory barrier. */ -compare_and_swap32_on64b: // bool OSAtomicCompareAndSwapBarrier32( int32_t old, int32_t new, int32_t *value); - lwsync // write barrier, NOP'd on a UP -1: - lwarx r7,0,r5 - cmplw r7,r3 - bne-- 2f - stwcx. r4,0,r5 - bne-- 1b - isync // read barrier, NOP'd on a UP - li r3,1 - blr -2: - li r8,-8 // on 970, must release reservation - li r3,0 // return failure - stwcx. 
r4,r8,r1 // store into red zone to release - blr - - COMMPAGE_DESCRIPTOR(compare_and_swap32_on64b,_COMM_PAGE_COMPARE_AND_SWAP32B,k64Bit,0,kCommPageBoth+kCommPageSYNC+kCommPageISYNC) - - -/* WARNING: Libc clients assume compare-and-swap preserves r4, r5, and r9-r12! */ -/* This version of compare-and-swap incorporates a memory barrier. */ -compare_and_swap64b: // bool OSAtomicCompareAndSwapBarrier64( int64_t old, int64_t new, int64_t *value); - lwsync // write barrier, NOP'd on a UP -1: - ldarx r7,0,r5 - cmpld r7,r3 - bne-- 2f - stdcx. r4,0,r5 - bne-- 1b - isync // read barrier, NOP'd on a UP - li r3,1 - blr -2: - li r8,-8 // on 970, must release reservation - li r3,0 // return failure - stdcx. r4,r8,r1 // store into red zone to release - blr - - COMMPAGE_DESCRIPTOR(compare_and_swap64b,_COMM_PAGE_COMPARE_AND_SWAP64B,k64Bit,0,kCommPage64+kCommPageSYNC+kCommPageISYNC) - - -atomic_enqueue32: // void OSAtomicEnqueue( void **list, void *new, size_t offset); -1: - lwarx r6,0,r3 // get link to 1st on list - stwx r6,r4,r5 // hang list off new node - eieio // make sure the "stwx" comes before "stwcx." (nop'd on UP) - stwcx. r4,0,r3 // make new 1st on list - beqlr++ - b 1b - - COMMPAGE_DESCRIPTOR(atomic_enqueue32,_COMM_PAGE_ENQUEUE,0,0,kCommPageSYNC+kCommPage32) - - -atomic_enqueue64: // void OSAtomicEnqueue( void **list, void *new, size_t offset); -1: - ldarx r6,0,r3 // get link to 1st on list - stdx r6,r4,r5 // hang list off new node - lwsync // make sure the "stdx" comes before the "stdcx." (nop'd on UP) - stdcx. r4,0,r3 // make new 1st on list - beqlr++ - b 1b - - COMMPAGE_DESCRIPTOR(atomic_enqueue64,_COMM_PAGE_ENQUEUE,k64Bit,0,kCommPageSYNC+kCommPage64) - - -atomic_dequeue32_on32: // void* OSAtomicDequeue( void **list, size_t offset); - mr r5,r3 -1: - lwarx r3,0,r5 // get 1st in list - cmpwi r3,0 // null? - beqlr // yes, list empty - lwzx r6,r3,r4 // get 2nd - stwcx. 
r6,0,r5 // make 2nd first - bne-- 1b - isync // cancel read-aheads (nop'd on UP) - blr - - COMMPAGE_DESCRIPTOR(atomic_dequeue32_on32,_COMM_PAGE_DEQUEUE,0,k64Bit,kCommPageISYNC+kCommPage32) - - -atomic_dequeue32_on64: // void* OSAtomicDequeue( void **list, size_t offset); - mr r5,r3 - li r7,-8 // use red zone to release reservation if necessary -1: - lwarx r3,0,r5 // get 1st in list - cmpwi r3,0 // null? - beq 2f - lwzx r6,r3,r4 // get 2nd - stwcx. r6,0,r5 // make 2nd first - isync // cancel read-aheads (nop'd on UP) - beqlr++ // return next element in r2 - b 1b // retry (lost reservation) -2: - stwcx. r0,r7,r1 // on 970, release reservation using red zone - blr // return null - - COMMPAGE_DESCRIPTOR(atomic_dequeue32_on64,_COMM_PAGE_DEQUEUE,k64Bit,0,kCommPageISYNC+kCommPage32) - - -atomic_dequeue64: // void* OSAtomicDequeue( void **list, size_t offset); - mr r5,r3 - li r7,-8 // use red zone to release reservation if necessary -1: - ldarx r3,0,r5 // get 1st in list - cmpdi r3,0 // null? - beq 2f - ldx r6,r3,r4 // get 2nd - stdcx. r6,0,r5 // make 2nd first - isync // cancel read-aheads (nop'd on UP) - beqlr++ // return next element in r2 - b 1b // retry (lost reservation) -2: - stdcx. r0,r7,r1 // on 970, release reservation using red zone - blr // return null - - COMMPAGE_DESCRIPTOR(atomic_dequeue64,_COMM_PAGE_DEQUEUE,k64Bit,0,kCommPageISYNC+kCommPage64) - - -memory_barrier_up: // void OSMemoryBarrier( void ) - blr // nothing to do on UP - - COMMPAGE_DESCRIPTOR(memory_barrier_up,_COMM_PAGE_MEMORY_BARRIER,kUP,0,kCommPageBoth) - - -memory_barrier_mp32: // void OSMemoryBarrier( void ) - isync // we use eieio in preference to sync... 
- eieio // ...because it is faster - blr - - COMMPAGE_DESCRIPTOR(memory_barrier_mp32,_COMM_PAGE_MEMORY_BARRIER,0,kUP+k64Bit,kCommPage32) - - -memory_barrier_mp64: // void OSMemoryBarrier( void ) - isync - lwsync // on 970, lwsync is faster than eieio - blr - - COMMPAGE_DESCRIPTOR(memory_barrier_mp64,_COMM_PAGE_MEMORY_BARRIER,k64Bit,kUP,kCommPageBoth) diff --git a/osfmk/ppc/commpage/bcopy_64.s b/osfmk/ppc/commpage/bcopy_64.s deleted file mode 100644 index 4d0b2c9bd..000000000 --- a/osfmk/ppc/commpage/bcopy_64.s +++ /dev/null @@ -1,306 +0,0 @@ -/* - * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* ======================================= - * BCOPY, MEMCPY, and MEMMOVE for Mac OS X - * ======================================= - * - * Version of 2/20/2003, for a hypothetic 64-bit processor without Altivec. - * This version might be used bringing up new processors, with known - * Altivec bugs that need to be worked around. It is not particularly well - * optimized. - * - * For 64-bit processors with a 128-byte cache line, running in either - * 32- or 64-bit mode. This is written for 32-bit execution, the kernel - * will translate to 64-bit code when it compiles the 64-bit commpage. - * - * Register usage. Note we use R2, so this code will not run in a PEF/CFM - * environment. - * r0 = "w7" or temp - * r2 = "w8" - * r3 = not used, as memcpy and memmove return 1st parameter as a value - * r4 = source ptr ("rs") - * r5 = count of bytes to move ("rc") - * r6 = "w1" - * r7 = "w2" - * r8 = "w3" - * r9 = "w4" - * r10 = "w5" - * r11 = "w6" - * r12 = destination ptr ("rd") - */ -#define rs r4 -#define rd r12 -#define rc r5 -#define rv r2 - -#define w1 r6 -#define w2 r7 -#define w3 r8 -#define w4 r9 -#define w5 r10 -#define w6 r11 -#define w7 r0 -#define w8 r2 - -#include -#include -#include -#include - - .text - -#define kLong 64 // too long for inline loopless code - - -// Main entry points. - - .align 5 -bcopy_64: // void bcopy(const void *src, void *dst, size_t len) - cmplwi rc,kLong // short or long? - sub w1,r4,r3 // must move in reverse if (rd-rs) -#include -#include -#include - - .text -/* - * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary - * to 64-bit mode for use in the 64-bit commpage. This "port" consists of the following - * simple transformations: - * - all word compares are changed to doubleword - * - all "srwi[.]" opcodes are changed to "srdi[.]" - * Nothing else is done. 
For this to work, the following rules must be - * carefully followed: - * - do not use carry or overflow - * - only use record mode if you are sure the results are mode-invariant - * for example, all "andi." and almost all "rlwinm." are fine - * - do not use "slwi", "slw", or "srw" - * An imaginative programmer could break the porting model in other ways, but the above - * are the most likely problem areas. It is perhaps surprising how well in practice - * this simple method works. - */ - -#define kShort 64 -#define kVeryLong (128*1024) - - -// Main entry points. - - .align 5 -bcopy_970: // void bcopy(const void *src, void *dst, size_t len) - cmplwi rc,kShort // short or long? - sub w1,r4,r3 // must move in reverse if (rd-rs)=kVeryLong (ie, several pages), then use the -// "bigcopy" path that pulls all the punches. This is the fastest -// case for cold-cache operands, as any this long will likely be. -// 2. If length>=128 and source is 16-byte aligned, then use the -// lvx/stvx loop over 128-byte chunks. This is the fastest -// case for hot-cache operands, 2nd fastest for cold. -// 3. If length>=128 and source is not 16-byte aligned, then use the -// lvx/vperm/stvx loop over 128-byte chunks. -// 4. If length<128 and source is 8-byte aligned, then use the -// ld/std loop over 32-byte chunks. -// 5. If length<128 and source is not 8-byte aligned, then use the -// lvx/vperm/stvx loop over 32-byte chunks. This is the slowest case. -// Registers at this point: -// r0/cr1 = count of cache lines ("chunks") that we'll cover (may be 0) -// rs = alignment unknown -// rd = 16-byte aligned -// rc = bytes remaining -// w2 = low 4 bits of (rd-rs), used to check alignment -// cr5 = beq if source is also 16-byte aligned - -LFwdAligned: - andi. w3,w2,7 // is source at least 8-byte aligned? 
- mtcrf 0x01,rc // move leftover count to cr7 for LShort16 - bne cr1,LFwdLongVectors // at least one 128-byte chunk, so use vectors - srwi w1,rc,5 // get 32-byte chunk count - mtcrf 0x02,rc // move bit 27 of length to cr6 for LShort32 - mtctr w1 // set up 32-byte loop (w1!=0) - beq LFwdMedAligned // source is 8-byte aligned, so use ld/std loop - mfspr rv,vrsave // get bitmap of live vector registers - oris w4,rv,0xFFF8 // we use v0-v12 - li c16,16 // get constant used in lvx - li c32,32 - mtspr vrsave,w4 // update mask - lvx v1,0,rs // prefetch 1st source quadword - lvsl vp,0,rs // get permute vector to shift left - - -// Fewer than 128 bytes but not doubleword aligned: use lvx/vperm/stvx. - -1: // loop over 32-byte chunks - lvx v2,c16,rs - lvx v3,c32,rs - addi rs,rs,32 - vperm vx,v1,v2,vp - vperm vy,v2,v3,vp - vor v1,v3,v3 // v1 <- v3 - stvx vx,0,rd - stvx vy,c16,rd - addi rd,rd,32 - bdnz 1b - - mtspr vrsave,rv // restore bitmap of live vr's - b LShort32 - - -// Fewer than 128 bytes and doubleword aligned: use ld/std. - - .align 5 -LFwdMedAligned: // loop over 32-byte chunks - ld w1,0(rs) - ld w2,8(rs) - ld w3,16(rs) - ld w4,24(rs) - addi rs,rs,32 - std w1,0(rd) - std w2,8(rd) - std w3,16(rd) - std w4,24(rd) - addi rd,rd,32 - bdnz LFwdMedAligned - - b LShort32 - - -// Forward, 128 bytes or more: use vectors. When entered: -// r0 = 128-byte chunks to move (>0) -// rd = 16-byte aligned -// cr5 = beq if source is 16-byte aligned -// cr7 = low 4 bits of rc (ie, leftover byte count 0-15) -// We set up many registers: -// ctr = number of 128-byte chunks to move -// r0/cr0 = leftover QWs to move -// cr7 = low 4 bits of rc (ie, leftover byte count 0-15) -// cr6 = beq if leftover byte count is 0 -// rv = original value of VRSave -// c16,c32,c48 = loaded - -LFwdLongVectors: - mfspr rv,vrsave // get bitmap of live vector registers - lis w3,kVeryLong>>16 // cutoff for very-long-operand special case path - cmplw cr1,rc,w3 // very long operand? 
- rlwinm w3,rc,0,28,31 // move last 0-15 byte count to w3 - bge-- cr1,LBigCopy // handle big copies separately - mtctr r0 // set up loop count - cmpwi cr6,w3,0 // set cr6 on leftover byte count - oris w4,rv,0xFFF8 // we use v0-v12 - rlwinm. r0,rc,28,29,31 // get number of quadword leftovers (0-7) and set cr0 - li c16,16 // get constants used in ldvx/stvx - mtspr vrsave,w4 // update mask - li c32,32 - li c48,48 - beq cr5,LFwdLongAligned // source is also 16-byte aligned, no need for vperm - lvsl vp,0,rs // get permute vector to shift left - lvx v1,0,rs // prefetch 1st source quadword - b LFwdLongUnaligned - - -// Forward, long, unaligned vector loop. - - .align 5 // align inner loops -LFwdLongUnaligned: // loop over 128-byte chunks - addi w4,rs,64 - lvx v2,c16,rs - lvx v3,c32,rs - lvx v4,c48,rs - lvx v5,0,w4 - lvx v6,c16,w4 - vperm vw,v1,v2,vp - lvx v7,c32,w4 - lvx v8,c48,w4 - addi rs,rs,128 - vperm vx,v2,v3,vp - addi w4,rd,64 - lvx v1,0,rs - stvx vw,0,rd - vperm vy,v3,v4,vp - stvx vx,c16,rd - vperm vz,v4,v5,vp - stvx vy,c32,rd - vperm vw,v5,v6,vp - stvx vz,c48,rd - vperm vx,v6,v7,vp - addi rd,rd,128 - stvx vw,0,w4 - vperm vy,v7,v8,vp - stvx vx,c16,w4 - vperm vz,v8,v1,vp - stvx vy,c32,w4 - stvx vz,c48,w4 - bdnz LFwdLongUnaligned - - beq 4f // no leftover quadwords - mtctr r0 -3: // loop over remaining quadwords - lvx v2,c16,rs - addi rs,rs,16 - vperm vx,v1,v2,vp - vor v1,v2,v2 // v1 <- v2 - stvx vx,0,rd - addi rd,rd,16 - bdnz 3b -4: - mtspr vrsave,rv // restore bitmap of live vr's - bne cr6,LShort16 // handle last 0-15 bytes if any - blr - - -// Forward, long, 16-byte aligned vector loop. 
- - .align 5 -LFwdLongAligned: // loop over 128-byte chunks - addi w4,rs,64 - lvx v1,0,rs - lvx v2,c16,rs - lvx v3,c32,rs - lvx v4,c48,rs - lvx v5,0,w4 - lvx v6,c16,w4 - lvx v7,c32,w4 - lvx v8,c48,w4 - addi rs,rs,128 - addi w4,rd,64 - stvx v1,0,rd - stvx v2,c16,rd - stvx v3,c32,rd - stvx v4,c48,rd - stvx v5,0,w4 - stvx v6,c16,w4 - stvx v7,c32,w4 - stvx v8,c48,w4 - addi rd,rd,128 - bdnz LFwdLongAligned - - beq 4f // no leftover quadwords - mtctr r0 -3: // loop over remaining quadwords (1-7) - lvx v1,0,rs - addi rs,rs,16 - stvx v1,0,rd - addi rd,rd,16 - bdnz 3b -4: - mtspr vrsave,rv // restore bitmap of live vr's - bne cr6,LShort16 // handle last 0-15 bytes if any - blr - - -// Long, reverse moves. -// rs = source -// rd = destination -// rc = count -// cr5 = beq if relatively 16-byte aligned - -LLongReverse: - add rd,rd,rc // point to end of operands - add rs,rs,rc - andi. r0,rd,0xF // #bytes to 16-byte align destination - beq 2f // already aligned - -// 16-byte align destination. - - mtctr r0 // set up for loop - sub rc,rc,r0 -1: - lbzu w1,-1(rs) - stbu w1,-1(rd) - bdnz 1b - -// Prepare for reverse vector loop. When entered: -// rd = 16-byte aligned -// cr5 = beq if source also 16-byte aligned -// We set up many registers: -// ctr/cr1 = number of 64-byte chunks to move (may be 0) -// r0/cr0 = leftover QWs to move -// cr7 = low 4 bits of rc (ie, leftover byte count 0-15) -// cr6 = beq if leftover byte count is 0 -// cm1 = -1 -// rv = original value of vrsave - -2: - mfspr rv,vrsave // get bitmap of live vector registers - srwi r0,rc,6 // get count of 64-byte chunks to move (may be 0) - oris w1,rv,0xFFF8 // we use v0-v12 - mtcrf 0x01,rc // prepare for moving last 0-15 bytes in LShortReverse16 - rlwinm w3,rc,0,28,31 // move last 0-15 byte count to w3 too - cmpwi cr1,r0,0 // set cr1 on chunk count - mtspr vrsave,w1 // update mask - mtctr r0 // set up loop count - cmpwi cr6,w3,0 // set cr6 on leftover byte count - rlwinm. 
r0,rc,28,30,31 // get number of quadword leftovers (0-3) and set cr0 - li cm1,-1 // get constants used in ldvx/stvx - - bne cr5,LReverseVecUnal // handle unaligned operands - beq cr1,2f // no chunks (if no chunks, must be leftover QWs) - li cm17,-17 - li cm33,-33 - li cm49,-49 - b 1f - -// Long, reverse 16-byte-aligned vector loop. - - .align 5 // align inner loops -1: // loop over 64-byte chunks - lvx v1,cm1,rs - lvx v2,cm17,rs - lvx v3,cm33,rs - lvx v4,cm49,rs - subi rs,rs,64 - stvx v1,cm1,rd - stvx v2,cm17,rd - stvx v3,cm33,rd - stvx v4,cm49,rd - subi rd,rd,64 - bdnz 1b - - beq 4f // no leftover quadwords -2: // r0=#QWs, rv=vrsave, cr7=(rc & F), cr6 set on cr7 - mtctr r0 -3: // loop over remaining quadwords (1-7) - lvx v1,cm1,rs - subi rs,rs,16 - stvx v1,cm1,rd - subi rd,rd,16 - bdnz 3b -4: - mtspr vrsave,rv // restore bitmap of live vr's - bne cr6,LShortReverse16 // handle last 0-15 bytes if any - blr - - -// Long, reverse, unaligned vector loop. -// ctr/cr1 = number of 64-byte chunks to move (may be 0) -// r0/cr0 = leftover QWs to move -// cr7 = low 4 bits of rc (ie, leftover byte count 0-15) -// cr6 = beq if leftover byte count is 0 -// rv = original value of vrsave -// cm1 = -1 - -LReverseVecUnal: - lvsl vp,0,rs // get permute vector to shift left - lvx v1,cm1,rs // v1 always looks ahead - li cm17,-17 - beq cr1,2f // no chunks (if no chunks, must be leftover QWs) - li cm33,-33 - li cm49,-49 - b 1f - - .align 5 // align the inner loops -1: // loop over 64-byte chunks - lvx v2,cm17,rs - lvx v3,cm33,rs - lvx v4,cm49,rs - subi rs,rs,64 - vperm vx,v2,v1,vp - lvx v1,cm1,rs - vperm vy,v3,v2,vp - stvx vx,cm1,rd - vperm vz,v4,v3,vp - stvx vy,cm17,rd - vperm vx,v1,v4,vp - stvx vz,cm33,rd - stvx vx,cm49,rd - subi rd,rd,64 - bdnz 1b - - beq 4f // no leftover quadwords -2: // r0=#QWs, rv=vrsave, v1=next QW, cr7=(rc & F), cr6 set on cr7 - mtctr r0 -3: // loop over 1-3 quadwords - lvx v2,cm17,rs - subi rs,rs,16 - vperm vx,v2,v1,vp - vor v1,v2,v2 // v1 <- v2 - stvx 
vx,cm1,rd - subi rd,rd,16 - bdnz 3b -4: - mtspr vrsave,rv // restore bitmap of live vr's - bne cr6,LShortReverse16 // handle last 0-15 bytes iff any - blr - - -// Very Big Copy Path. Save our return address in the stack for help decoding backtraces. -// The conditions bigcopy expects are: -// r0 = return address (also stored in caller's SF) -// r4 = source ptr -// r5 = length (at least several pages) -// r12 = dest ptr - -LBigCopy: - lis r2,0x4000 // r2 <- 0x40000000 - mflr r0 // get our return address - add. r2,r2,r2 // set cr0_lt if running in 32-bit mode - stw r0,8(r1) // save return, assuming 32-bit mode ("crsave" if 64-bit mode) - blta _COMM_PAGE_BIGCOPY // 32-bit mode, join big operand copy - std r0,16(r1) // save return in correct spot for 64-bit mode - ba _COMM_PAGE_BIGCOPY // then join big operand code - - - COMMPAGE_DESCRIPTOR(bcopy_970,_COMM_PAGE_BCOPY,k64Bit+kHasAltivec,0, \ - kCommPageMTCRF+kCommPageBoth+kPort32to64) diff --git a/osfmk/ppc/commpage/bcopy_g3.s b/osfmk/ppc/commpage/bcopy_g3.s deleted file mode 100644 index f0900963e..000000000 --- a/osfmk/ppc/commpage/bcopy_g3.s +++ /dev/null @@ -1,275 +0,0 @@ -/* - * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* ======================================= - * BCOPY, MEMCPY, and MEMMOVE for Mac OS X - * ======================================= - * - * Version of 2/20/2003, tuned for G3. - * - * Register usage. Note we use R2, so this code will not run in a PEF/CFM - * environment. - * - * r0 = "w7" or temp - * r2 = "w8" - * r3 = not used, as memcpy and memmove return 1st parameter as a value - * r4 = source ptr ("rs") - * r5 = count of bytes to move ("rc") - * r6 = "w1" - * r7 = "w2" - * r8 = "w3" - * r9 = "w4" - * r10 = "w5" - * r11 = "w6" - * r12 = destination ptr ("rd") - * f0-f3 = used for moving 8-byte aligned data - */ -#define rs r4 // NB: we depend on rs==r4 in "lswx" instructions -#define rd r12 -#define rc r5 - -#define w1 r6 -#define w2 r7 -#define w3 r8 -#define w4 r9 -#define w5 r10 -#define w6 r11 -#define w7 r0 -#define w8 r2 - -#include -#include -#include -#include - - .text - - -#define kLong 33 // too long for string ops - - -// Main entry points. - - .align 5 -bcopy_g3: // void bcopy(const void *src, void *dst, size_t len) - cmplwi rc,kLong // length > 32 bytes? - sub w1,r4,r3 // must move in reverse if (rd-rs) 32 bytes? - sub w1,r3,rs // must move in reverse if (rd-rs)=1) - rlwinm rc,rc,0,0x1F // mask down to leftover bytes - mtctr r0 // set up loop count - beq 1f // dest already word aligned - -// Word align the destination. - - mtxer w4 // byte count to xer - cmpwi r0,0 // any chunks to xfer? 
- lswx w1,0,rs // move w4 bytes to align dest - add rs,rs,w4 - stswx w1,0,rd - add rd,rd,w4 - beq- 2f // pathologic case, no chunks to xfer - -// Forward, unaligned loop. - -1: - lwz w1,0(rs) - lwz w2,4(rs) - lwz w3,8(rs) - lwz w4,12(rs) - lwz w5,16(rs) - lwz w6,20(rs) - lwz w7,24(rs) - lwz w8,28(rs) - addi rs,rs,32 - stw w1,0(rd) - stw w2,4(rd) - stw w3,8(rd) - stw w4,12(rd) - stw w5,16(rd) - stw w6,20(rd) - stw w7,24(rd) - stw w8,28(rd) - addi rd,rd,32 - bdnz 1b -2: // rc = remaining bytes (0-31) - mtxer rc // set up count for string ops - mr r0,rd // move dest ptr out of the way - lswx r5,0,rs // load xer bytes into r5-r12 (rs==r4) - stswx r5,0,r0 // store them - blr - - - -// Forward, aligned loop. We use FPRs. - -LLongFloat: - andi. w4,w2,7 // W4 <- #bytes to doubleword-align destination - sub rc,rc,w4 // adjust count for alignment - srwi r0,rc,5 // number of 32-byte chunks to xfer - rlwinm rc,rc,0,0x1F // mask down to leftover bytes - mtctr r0 // set up loop count - beq 1f // dest already doubleword aligned - -// Doubleword align the destination. - - mtxer w4 // byte count to xer - cmpwi r0,0 // any chunks to xfer? - lswx w1,0,rs // move w4 bytes to align dest - add rs,rs,w4 - stswx w1,0,rd - add rd,rd,w4 - beq- 2f // pathologic case, no chunks to xfer -1: // loop over 32-byte chunks - lfd f0,0(rs) - lfd f1,8(rs) - lfd f2,16(rs) - lfd f3,24(rs) - addi rs,rs,32 - stfd f0,0(rd) - stfd f1,8(rd) - stfd f2,16(rd) - stfd f3,24(rd) - addi rd,rd,32 - bdnz 1b -2: // rc = remaining bytes (0-31) - mtxer rc // set up count for string ops - mr r0,rd // move dest ptr out of the way - lswx r5,0,rs // load xer bytes into r5-r12 (rs==r4) - stswx r5,0,r0 // store them - blr - - -// Long, reverse moves. 
-// cr5 = beq if relatively word aligned - -LLongReverse: - add rd,rd,rc // point to end of operands + 1 - add rs,rs,rc - beq cr5,LReverseFloat // aligned operands so can use FPRs - srwi r0,rc,5 // get chunk count - rlwinm rc,rc,0,0x1F // mask down to leftover bytes - mtctr r0 // set up loop count - mtxer rc // set up for trailing bytes -1: - lwz w1,-4(rs) - lwz w2,-8(rs) - lwz w3,-12(rs) - lwz w4,-16(rs) - stw w1,-4(rd) - lwz w5,-20(rs) - stw w2,-8(rd) - lwz w6,-24(rs) - stw w3,-12(rd) - lwz w7,-28(rs) - stw w4,-16(rd) - lwzu w8,-32(rs) - stw w5,-20(rd) - stw w6,-24(rd) - stw w7,-28(rd) - stwu w8,-32(rd) - bdnz 1b - - sub r4,rs,rc // point to 1st (leftmost) leftover byte (0..31) - sub r0,rd,rc // move dest ptr out of way - lswx r5,0,r4 // load xer bytes into r5-r12 - stswx r5,0,r0 // store them - blr - - -// Long, reverse aligned moves. We use FPRs. - -LReverseFloat: - andi. w4,rd,7 // W3 <- #bytes to doubleword-align destination - sub rc,rc,w4 // adjust count for alignment - srwi r0,rc,5 // number of 32-byte chunks to xfer - rlwinm rc,rc,0,0x1F // mask down to leftover bytes - mtctr r0 // set up loop count - beq 1f // dest already doubleword aligned - -// Doubleword align the destination. - - mtxer w4 // byte count to xer - cmpwi r0,0 // any chunks to xfer? 
- sub rs,rs,w4 // point to 1st bytes to xfer - sub rd,rd,w4 - lswx w1,0,rs // move w3 bytes to align dest - stswx w1,0,rd - beq- 2f // pathologic case, no chunks to xfer -1: - lfd f0,-8(rs) - lfd f1,-16(rs) - lfd f2,-24(rs) - lfdu f3,-32(rs) - stfd f0,-8(rd) - stfd f1,-16(rd) - stfd f2,-24(rd) - stfdu f3,-32(rd) - bdnz 1b -2: // rc = remaining bytes (0-31) - mtxer rc // set up count for string ops - sub r4,rs,rc // point to 1st (leftmost) leftover byte (0..31) - sub r0,rd,rc // move dest ptr out of way - lswx r5,0,r4 // load xer bytes into r5-r12 - stswx r5,0,r0 // store them - blr - - COMMPAGE_DESCRIPTOR(bcopy_g3,_COMM_PAGE_BCOPY,0,k64Bit+kHasAltivec,kCommPage32) diff --git a/osfmk/ppc/commpage/bcopy_g4.s b/osfmk/ppc/commpage/bcopy_g4.s deleted file mode 100644 index 0d901ab20..000000000 --- a/osfmk/ppc/commpage/bcopy_g4.s +++ /dev/null @@ -1,622 +0,0 @@ -/* - * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* ======================================= - * BCOPY, MEMCPY, and MEMMOVE for Mac OS X - * ======================================= - * - * Version of 2/20/2003, tuned for G4. The inner loops use DCBA to avoid - * reading destination cache lines. Only the 7450 actually benefits from - * this, and then only in the cold-cache case. On 7400s and 7455s, we - * patch the DCBAs into NOPs. - * - * Register usage. Note we use R2, so this code will not run in a PEF/CFM - * environment. Note also the rather delicate way we assign multiple uses - * to the same register. Beware. 
- * - * r0 = "w7" or temp (NB: cannot use r0 for any constant such as "c16") - * r2 = "w8" or vrsave ("rv") - * r3 = not used, as memcpy and memmove return 1st parameter as a value - * r4 = source ptr ("rs") - * r5 = count of bytes to move ("rc") - * r6 = "w1", "c16", or "cm17" - * r7 = "w2", "c32", or "cm33" - * r8 = "w3", "c48", or "cm49" - * r9 = "w4", or "cm1" - * r10 = "w5", "c96", or "cm97" - * r11 = "w6", "c128", or "cm129" - * r12 = destination ptr ("rd") - * v0 = permute vector ("vp") - * v1-v4 = qw's loaded from source - * v5-v7 = permuted qw's ("vw", "vx", "vy") - */ -#define rs r4 -#define rd r12 -#define rc r5 -#define rv r2 - -#define w1 r6 -#define w2 r7 -#define w3 r8 -#define w4 r9 -#define w5 r10 -#define w6 r11 -#define w7 r0 -#define w8 r2 - -#define c16 r6 -#define cm17 r6 -#define c32 r7 -#define cm33 r7 -#define c48 r8 -#define cm49 r8 -#define cm1 r9 -#define c96 r10 -#define cm97 r10 -#define c128 r11 -#define cm129 r11 - -#define vp v0 -#define vw v5 -#define vx v6 -#define vy v7 - -#include -#include -#include -#include - - .text - -#define kMedium 32 // too long for inline loopless code -#define kLong 96 // long enough to justify use of Altivec - - -// Main entry points. - - .align 5 -bcopy_g4: // void bcopy(const void *src, void *dst, size_t len) - cmplwi rc,kMedium // short or long? - sub w1,r4,r3 // must move in reverse if (rd-rs)=1) - mtcrf 0x01,rc // save remaining byte count here for LShort16 - mtctr r0 // set up 16-byte loop - bne cr6,3f // source not 4-byte aligned - b 2f - - .align 4 -2: // loop over 16-byte aligned chunks - lfd f0,0(rs) - lfd f1,8(rs) - addi rs,rs,16 - stfd f0,0(rd) - stfd f1,8(rd) - addi rd,rd,16 - bdnz 2b - - b LShort16 - - .align 4 -3: // loop over 16-byte unaligned chunks - lwz w1,0(rs) - lwz w2,4(rs) - lwz w3,8(rs) - lwz w4,12(rs) - addi rs,rs,16 - stw w1,0(rd) - stw w2,4(rd) - stw w3,8(rd) - stw w4,12(rd) - addi rd,rd,16 - bdnz 3b - - b LShort16 - - -// Vector loops. 
First, we must 32-byte align the destination. -// w1 = (rd-rs), used to check for reverse and alignment -// w4 = #bytes to 32-byte align destination -// rc = long enough for at least one vector loop - -LFwdLong: - cmpwi w4,0 // dest already aligned? - sub rc,rc,w4 // adjust length - mtcrf 0x01,w4 // cr7 <- #bytes to align dest - rlwinm w2,w1,0,0xF // relatively 16-byte aligned? - mtcrf 0x02,w4 // finish moving #bytes to align to cr6 and cr7 - srwi r0,rc,6 // get # 64-byte chunks to xfer (>=1) - cmpwi cr5,w2,0 // set cr5 beq if relatively 16-byte aligned - beq LFwdAligned // dest is already aligned - -// 32-byte align destination. - - bf 31,1f // byte to move? - lbz w1,0(rs) - addi rs,rs,1 - stb w1,0(rd) - addi rd,rd,1 -1: - bf 30,2f // halfword? - lhz w1,0(rs) - addi rs,rs,2 - sth w1,0(rd) - addi rd,rd,2 -2: - bf 29,3f // word? - lwz w1,0(rs) - addi rs,rs,4 - stw w1,0(rd) - addi rd,rd,4 -3: - bf 28,4f // doubleword? - lwz w1,0(rs) - lwz w2,4(rs) - addi rs,rs,8 - stw w1,0(rd) - stw w2,4(rd) - addi rd,rd,8 -4: - bf 27,LFwdAligned // quadword? - lwz w1,0(rs) - lwz w2,4(rs) - lwz w3,8(rs) - lwz w4,12(rs) - addi rs,rs,16 - stw w1,0(rd) - stw w2,4(rd) - stw w3,8(rd) - stw w4,12(rd) - addi rd,rd,16 - - -// Destination is 32-byte aligned. -// r0 = count of 64-byte chunks to move (not 0) -// rd = 32-byte aligned -// rc = bytes remaining -// cr5 = beq if source is 16-byte aligned -// We set up many registers: -// ctr = number of 64-byte chunks to move -// r0/cr0 = leftover QWs to move -// cr7 = low 4 bits of rc (ie, leftover byte count 0-15) -// cr6 = beq if leftover byte count is 0 -// rv = original value of vrsave -// c16 etc = loaded - -LFwdAligned: - mfspr rv,vrsave // get bitmap of live vector registers - mtcrf 0x01,rc // move leftover count to cr7 for LShort16 - rlwinm w3,rc,0,28,31 // move last 0-15 byte count to w3 - mtctr r0 // set up loop count - cmpwi cr6,w3,0 // set cr6 on leftover byte count - oris w1,rv,0xFF00 // we use v0-v7 - rlwinm. 
r0,rc,28,30,31 // get number of quadword leftovers (0-3) and set cr0 - mtspr vrsave,w1 // update mask - li c16,16 // get constants used in ldvx/stvx - li c32,32 - li c48,48 - li c96,96 - li c128,128 - bne cr5,LForwardVecUnal // handle unaligned operands - b 1f - - .align 4 -1: // loop over 64-byte chunks - dcbt c96,rs - dcbt c128,rs - lvx v1,0,rs - lvx v2,c16,rs - lvx v3,c32,rs - lvx v4,c48,rs - addi rs,rs,64 - dcba 0,rd // patched to NOP on some machines - stvx v1,0,rd - stvx v2,c16,rd - dcba c32,rd // patched to NOP on some machines - stvx v3,c32,rd - stvx v4,c48,rd - addi rd,rd,64 - bdnz 1b - - beq 4f // no leftover quadwords - mtctr r0 -3: // loop over remaining quadwords (1-3) - lvx v1,0,rs - addi rs,rs,16 - stvx v1,0,rd - addi rd,rd,16 - bdnz 3b -4: - mtspr vrsave,rv // restore bitmap of live vr's - bne cr6,LShort16 // handle last 0-15 bytes if any - blr - - -// Long, forward, unaligned vector loop. - -LForwardVecUnal: - lvsl vp,0,rs // get permute vector to shift left - lvx v1,0,rs // prefetch 1st source quadword - b 1f - - .align 4 // align inner loops -1: // loop over 64-byte chunks - lvx v2,c16,rs - dcbt c96,rs - lvx v3,c32,rs - dcbt c128,rs - lvx v4,c48,rs - addi rs,rs,64 - vperm vw,v1,v2,vp - lvx v1,0,rs - vperm vx,v2,v3,vp - dcba 0,rd // patched to NOP on some machines - stvx vw,0,rd - vperm vy,v3,v4,vp - stvx vx,c16,rd - vperm vw,v4,v1,vp - dcba c32,rd // patched to NOP on some machines - stvx vy,c32,rd - stvx vw,c48,rd - addi rd,rd,64 - bdnz 1b - - beq- 4f // no leftover quadwords - mtctr r0 -3: // loop over remaining quadwords - lvx v2,c16,rs - addi rs,rs,16 - vperm vx,v1,v2,vp - vor v1,v2,v2 // v1 <- v2 - stvx vx,0,rd - addi rd,rd,16 - bdnz 3b -4: - mtspr vrsave,rv // restore bitmap of live vr's - bne cr6,LShort16 // handle last 0-15 bytes if any - blr - - -// Medium and long, reverse moves. We use altivec if the operands are long enough, -// else a lwz/stx loop. 
-// w1 = (rd-rs), used to check for reverse and alignment -// cr7 = bge if long - -LMediumReverse: - add rd,rd,rc // point to end of operands - add rs,rs,rc - andi. w4,rd,0x1F // w4 <- #bytes to 32-byte align destination - rlwinm w6,rd,0,0x3 // w6 <- #bytes to 4-byte align destination - bge cr7,LLongReverse // long enough for vectors - -// Scalar loop. -// w6 = #bytes to 4-byte align destination - - sub rc,rc,w6 // decrement length remaining - mtxer w6 // set up count for move - sub rs,rs,w6 // back up ptrs - sub rd,rd,w6 - srwi r0,rc,4 // get # 16-byte chunks (>=1) - mtcrf 0x01,rc // set remaining byte count here for LShortReverse16 - lswx w1,0,rs // move w6 bytes to align destination - stswx w1,0,rd - mtctr r0 // set up 16-byte loop - b 1f - - .align 4 -1: // loop over 16-byte aligned chunks - lwz w1,-4(rs) - lwz w2,-8(rs) - lwz w3,-12(rs) - lwzu w4,-16(rs) - stw w1,-4(rd) - stw w2,-8(rd) - stw w3,-12(rd) - stwu w4,-16(rd) - bdnz 1b - - b LShortReverse16 - - -// Reverse vector loops. First, we must 32-byte align the destination. -// w1 = (rd-rs), used to check for reverse and alignment -// w4/cr0 = #bytes to 32-byte align destination -// rc = long enough for at least one vector loop - -LLongReverse: - sub rc,rc,w4 // adjust length - mtcrf 0x01,w4 // cr7 <- #bytes to align dest - rlwinm w2,w1,0,0xF // relatively 16-byte aligned? - mtcrf 0x02,w4 // finish moving #bytes to align to cr6 and cr7 - srwi r0,rc,6 // get # 64-byte chunks to xfer (>=1) - cmpwi cr5,w2,0 // set cr5 beq if relatively 16-byte aligned - beq LReverseAligned // dest is already aligned - -// 32-byte align destination. - - bf 31,1f // byte to move? - lbzu w1,-1(rs) - stbu w1,-1(rd) -1: - bf 30,2f // halfword? - lhzu w1,-2(rs) - sthu w1,-2(rd) -2: - bf 29,3f // word? - lwzu w1,-4(rs) - stwu w1,-4(rd) -3: - bf 28,4f // doubleword? - lwz w1,-4(rs) - lwzu w2,-8(rs) - stw w1,-4(rd) - stwu w2,-8(rd) -4: - bf 27,LReverseAligned // quadword? 
- lwz w1,-4(rs) - lwz w2,-8(rs) - lwz w3,-12(rs) - lwzu w4,-16(rs) - stw w1,-4(rd) - stw w2,-8(rd) - stw w3,-12(rd) - stwu w4,-16(rd) - -// Destination is 32-byte aligned. -// r0 = count of 64-byte chunks to move (not 0) -// rd = 32-byte aligned -// rc = bytes remaining -// cr5 = beq if source is 16-byte aligned -// We set up many registers: -// ctr = number of 64-byte chunks to move -// r0/cr0 = leftover QWs to move -// cr7 = low 4 bits of rc (ie, leftover byte count 0-15) -// cr6 = beq if leftover byte count is 0 -// rv = original value of vrsave -// cm1 etc = loaded - -LReverseAligned: - mfspr rv,vrsave // get bitmap of live vector registers - mtcrf 0x01,rc // move leftover count to cr7 for LShort16 - rlwinm w3,rc,0,28,31 // move last 0-15 byte count to w3 - mtctr r0 // set up loop count - cmpwi cr6,w3,0 // set cr6 on leftover byte count - oris w1,rv,0xFF00 // we use v0-v7 - rlwinm. r0,rc,28,30,31 // get number of quadword leftovers (0-3) and set cr0 - mtspr vrsave,w1 // update mask - li cm1,-1 // get constants used in ldvx/stvx - li cm17,-17 - li cm33,-33 - li cm49,-49 - li cm97,-97 - li cm129,-129 - bne cr5,LReverseVecUnal // handle unaligned operands - b 1f - - .align 4 // align inner loops -1: // loop over 64-byte chunks - dcbt cm97,rs - dcbt cm129,rs - lvx v1,cm1,rs - lvx v2,cm17,rs - lvx v3,cm33,rs - lvx v4,cm49,rs - subi rs,rs,64 - stvx v1,cm1,rd - stvx v2,cm17,rd - stvx v3,cm33,rd - stvx v4,cm49,rd - subi rd,rd,64 - bdnz 1b - - beq 4f // no leftover quadwords - mtctr r0 -3: // loop over remaining quadwords (1-7) - lvx v1,cm1,rs - subi rs,rs,16 - stvx v1,cm1,rd - subi rd,rd,16 - bdnz 3b -4: - mtspr vrsave,rv // restore bitmap of live vr's - bne cr6,LShortReverse16 // handle last 0-15 bytes if any - blr - - -// Long, reverse, unaligned vector loop. 
- -LReverseVecUnal: - lvsl vp,0,rs // get permute vector to shift left - lvx v1,cm1,rs // v1 always looks ahead - b 1f - - .align 4 // align the inner loops -1: // loop over 64-byte chunks - lvx v2,cm17,rs - dcbt cm97,rs - lvx v3,cm33,rs - dcbt cm129,rs - lvx v4,cm49,rs - subi rs,rs,64 - vperm vw,v2,v1,vp - lvx v1,cm1,rs - vperm vx,v3,v2,vp - stvx vw,cm1,rd - vperm vy,v4,v3,vp - stvx vx,cm17,rd - vperm vw,v1,v4,vp - stvx vy,cm33,rd - stvx vw,cm49,rd - subi rd,rd,64 - bdnz 1b - - beq 3f // no leftover quadwords - mtctr r0 -2: // loop over 1-3 quadwords - lvx v2,cm17,rs - subi rs,rs,16 - vperm vx,v2,v1,vp - vor v1,v2,v2 // v1 <- v2 - stvx vx,cm1,rd - subi rd,rd,16 - bdnz 2b -3: - mtspr vrsave,rv // restore bitmap of live vr's - bne cr6,LShortReverse16 // handle last 0-15 bytes iff any - blr - - COMMPAGE_DESCRIPTOR(bcopy_g4,_COMM_PAGE_BCOPY,kHasAltivec,k64Bit,kCommPageDCBA+kCommPage32) diff --git a/osfmk/ppc/commpage/bigcopy_970.s b/osfmk/ppc/commpage/bigcopy_970.s deleted file mode 100644 index add093ea3..000000000 --- a/osfmk/ppc/commpage/bigcopy_970.s +++ /dev/null @@ -1,331 +0,0 @@ -/* - * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* ==================================== - * Very Long Operand BCOPY for Mac OS X - * ==================================== - * - * Version of 2/21/2004, tuned for the IBM 970. This is for operands at - * least several pages long. It is called from bcopy()/memcpy()/memmove(), - * and runs both in 32 and 64-bit mode. - * - * We use the following additional strategies not used by the shorter - * operand paths. Mostly, we try to optimize for memory bandwidth: - * 1. Use DCBZ128 to avoid reading destination lines. Because this code - * resides on the commmpage, it can use a private interface with the - * kernel to minimize alignment exceptions if the destination is - * uncached. The kernel will clear cr7 whenever it emulates a DCBZ or - * DCBZ128 on the commpage. Thus we take at most one exception per call, - * which is amortized across the very long operand. - * 2. Copy larger chunks per iteration to minimize R/W bus turnaround - * and maximize DRAM page locality (opening a new page is expensive.) - * We use 256-byte chunks. - * 3. Touch in one source chunk ahead with DCBT. This is probably the - * least important change, and probably only helps restart the - * hardware stream at the start of each source page. 
- */ - -#define rs r13 -#define rd r14 -#define rc r15 -#define rx r16 - -#define c16 r3 -#define c32 r4 -#define c48 r5 -#define c64 r6 -#define c80 r7 -#define c96 r8 -#define c112 r9 -#define c256 r10 -#define c384 r11 -#define rv r12 // vrsave - -// Offsets within the "red zone" (which is 224 bytes long): - -#define rzR3 -8 -#define rzR13 -16 -#define rzR14 -24 -#define rzR15 -32 -#define rzR16 -40 - -#define rzV20 -64 -#define rzV21 -80 -#define rzV22 -96 -#define rzV23 -112 - - -#include -#include -#include -#include - - .text -/* - * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary - * to 64-bit mode for use in the 64-bit commpage. This "port" consists of the following - * simple transformations: - * - all word compares are changed to doubleword - * - all "srwi[.]" opcodes are changed to "srdi[.]" - * Nothing else is done. For this to work, the following rules must be - * carefully followed: - * - do not use carry or overflow - * - only use record mode if you are sure the results are mode-invariant - * for example, all "andi." and almost all "rlwinm." are fine - * - do not use "slwi", "slw", or "srw" - * An imaginative programmer could break the porting model in other ways, but the above - * are the most likely problem areas. It is perhaps surprising how well in practice - * this simple method works. - */ - -// Entry point. This is a subroutine of bcopy(). When called: -// r0 = return address (also stored in caller's SF) -// r4 = source ptr -// r5 = length (at least several pages) -// r12 = dest ptr -// -// We only do "forward" moves, ie non-overlapping or toward 0. We return with non-volatiles -// and r3 preserved. - - .align 5 -bigcopy_970: - neg r2,r12 // is destination cache-line-aligned? - std r3,rzR3(r1) // save caller's r3, which must be preserved for memcpy() - std r13,rzR13(r1) // spill non-volatile regs we use to redzone - std r14,rzR14(r1) - std r15,rzR15(r1) - andi. 
r2,r2,0x7F // #bytes to align - std r16,rzR16(r1) - mr rs,r4 // copy parameters into nonvolatile registers - mr rd,r12 - mr rc,r5 - mr rx,r0 // also save return address - beq 1f // skip if already aligned - -// Cache-line-align destination. - - mr r3,rd // set up dest ptr for memcpy() - mr r5,r2 // number of bytes to copy - add rs,rs,r2 // then bump our parameters past initial copy - add rd,rd,r2 - sub rc,rc,r2 - bla _COMM_PAGE_MEMCPY // 128-byte-align destination - - -// Load constant offsets and check whether source is 16-byte aligned. -// NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage, -// and we dcbz only if cr7 beq is set. - -1: - dcbt 0,rs // touch in 1st line of source - andi. r0,rs,15 // check source alignment - mfspr rv,vrsave // save caller's bitmask - li c16,16 // load the constant offsets for x-form ops - li c32,32 - srwi r2,rc,8 // get number of 256-byte chunks to xfer - li r0,-256 // we use 24 VRs (ie, 0-23) - li c48,48 - li c64,64 - li c80,80 - or r0,r0,rv // add our bits to caller's - li c96,96 - mtctr r2 // set up loop count - li c112,112 - cmpd cr7,r2,r2 // initialize cr7_eq to "on", so we dcbz128 - mtspr vrsave,r0 // say we use vr0..vr23 - li c256,256 - li c384,384 - beq LalignedLoop // handle aligned sources - - -// Set up for unaligned loop. - - lvsl v0,0,rs // get permute vector for left shift - lvxl v1,0,rs // prime the loop - li r0,rzV20 // save non-volatile VRs in redzone - stvx v20,r1,r0 - li r0,rzV21 - stvx v21,r1,r0 - li r0,rzV22 - stvx v22,r1,r0 - li r0,rzV23 - stvx v23,r1,r0 - b LunalignedLoop // enter unaligned loop - - -// Main loop for unaligned operands. We loop over 256-byte chunks (2 cache lines). -// Destination is 128-byte aligned, source is unaligned. 
- - .align 5 -LunalignedLoop: - dcbt c256,rs // touch in next chunk - dcbt c384,rs - addi r2,rs,128 // point to 2nd 128 bytes of source - lvxl v2,c16,rs - lvxl v3,c32,rs - lvxl v4,c48,rs - lvxl v5,c64,rs - lvxl v6,c80,rs - lvxl v7,c96,rs - lvxl v8,c112,rs - lvxl v9,0,r2 - addi rs,rs,256 // point to next source chunk - lvxl v10,c16,r2 - lvxl v11,c32,r2 - vperm v17,v1,v2,v0 - lvxl v12,c48,r2 - lvxl v13,c64,r2 - vperm v18,v2,v3,v0 - lvxl v14,c80,r2 - lvxl v15,c96,r2 - vperm v19,v3,v4,v0 - lvxl v16,c112,r2 - lvxl v1,0,rs // peek ahead at first source quad in next chunk - vperm v20,v4,v5,v0 - addi r2,rd,128 // point to 2nd 128 bytes of dest - bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel - dcbz128 0,rd - dcbz128 0,r2 -1: - vperm v21,v5,v6,v0 - stvxl v17,0,rd - vperm v22,v6,v7,v0 - stvxl v18,c16,rd - vperm v23,v7,v8,v0 - stvxl v19,c32,rd - vperm v17,v8,v9,v0 - stvxl v20,c48,rd - vperm v18,v9,v10,v0 - stvxl v21,c64,rd - vperm v19,v10,v11,v0 - stvxl v22,c80,rd - vperm v20,v11,v12,v0 - stvxl v23,c96,rd - vperm v21,v12,v13,v0 - stvxl v17,c112,rd - vperm v22,v13,v14,v0 - addi rd,rd,256 // point to next dest chunk - stvxl v18,0,r2 - vperm v23,v14,v15,v0 - stvxl v19,c16,r2 - vperm v17,v15,v16,v0 - stvxl v20,c32,r2 - vperm v18,v16,v1,v0 - stvxl v21,c48,r2 - stvxl v22,c64,r2 - stvxl v23,c80,r2 - stvxl v17,c96,r2 - stvxl v18,c112,r2 - bdnz++ LunalignedLoop // loop if another 256 bytes to go - - li r6,rzV20 // restore non-volatile VRs - li r7,rzV21 - li r8,rzV22 - li r9,rzV23 - lvx v20,r1,r6 - lvx v21,r1,r7 - lvx v22,r1,r8 - lvx v23,r1,r9 - b Ldone - - -// Aligned loop. Destination is 128-byte aligned, and source is 16-byte -// aligned. Loop over 256-byte chunks (2 cache lines.) 
- - .align 5 -LalignedLoop: - dcbt c256,rs // touch in next chunk - dcbt c384,rs - addi r2,rs,128 // point to 2nd 128 bytes of source - lvxl v1,0,rs - lvxl v2,c16,rs - lvxl v3,c32,rs - lvxl v4,c48,rs - lvxl v5,c64,rs - lvxl v6,c80,rs - lvxl v7,c96,rs - lvxl v8,c112,rs - lvxl v9,0,r2 - lvxl v10,c16,r2 - lvxl v11,c32,r2 - lvxl v12,c48,r2 - lvxl v13,c64,r2 - lvxl v14,c80,r2 - lvxl v15,c96,r2 - lvxl v16,c112,r2 - addi r2,rd,128 // point to 2nd 128 bytes of dest - bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel - dcbz128 0,rd - dcbz128 0,r2 -1: - addi rs,rs,256 // point to next source chunk - stvxl v1,0,rd - stvxl v2,c16,rd - stvxl v3,c32,rd - stvxl v4,c48,rd - stvxl v5,c64,rd - stvxl v6,c80,rd - stvxl v7,c96,rd - stvxl v8,c112,rd - addi rd,rd,256 // point to next dest chunk - stvxl v9,0,r2 - stvxl v10,c16,r2 - stvxl v11,c32,r2 - stvxl v12,c48,r2 - stvxl v13,c64,r2 - stvxl v14,c80,r2 - stvxl v15,c96,r2 - stvxl v16,c112,r2 - bdnz++ LalignedLoop // loop if another 256 bytes to go - - -// Done, except for 0..255 leftover bytes at end. -// rs = source ptr -// rd = dest ptr -// rc = remaining count in low 7 bits -// rv = caller's vrsave -// rx = caller's return address - -Ldone: - andi. r5,rc,0xFF // any leftover bytes? (0..255) - mtspr vrsave,rv // restore bitmap of live vr's - - mr r3,rd - mr r4,rs - bnela _COMM_PAGE_MEMCPY // copy leftover bytes - - mtlr rx // restore return address - ld r3,rzR3(r1) // restore non-volatile GPRs from redzone - ld r13,rzR13(r1) - ld r14,rzR14(r1) - ld r15,rzR15(r1) - ld r16,rzR16(r1) - blr - - - COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,kPort32to64+kCommPageBoth) - diff --git a/osfmk/ppc/commpage/bzero_128.s b/osfmk/ppc/commpage/bzero_128.s deleted file mode 100644 index f22198478..000000000 --- a/osfmk/ppc/commpage/bzero_128.s +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include -#include - - .text - .align 2 -/* - * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary - * to 64-bit mode for use in the 64-bit commpage. This "port" consists of the following - * simple transformations: - * - all word compares are changed to doubleword - * - all "srwi[.]" opcodes are changed to "srdi[.]" - * Nothing else is done. For this to work, the following rules must be - * carefully followed: - * - do not use carry or overflow - * - only use record mode if you are sure the results are mode-invariant - * for example, all "andi." and almost all "rlwinm." 
are fine - * - do not use "slwi", "slw", or "srw" - * An imaginative programmer could break the porting model in other ways, but the above - * are the most likely problem areas. It is perhaps surprising how well in practice - * this simple method works. - */ - -// ********************** -// * B Z E R O _ 1 2 8 * -// ********************** -// -// For 64-bit processors with a 128-byte cache line. -// -// Register use: -// r0 = zero -// r3 = original ptr, not changed since memset returns it -// r4 = count of bytes to set -// r9 = working operand ptr -// WARNING: We do not touch r2 and r10-r12, which some callers depend on. - - .align 5 -bzero_128: // void bzero(void *b, size_t len); - cmplwi cr7,r4,128 // too short for DCBZ128? - li r0,0 // get a 0 - neg r5,r3 // start to compute #bytes to align - mr r9,r3 // make copy of operand ptr (can't change r3) - blt cr7,Ltail // length < 128, too short for DCBZ - -// At least 128 bytes long, so compute alignment and #cache blocks. - - andi. r5,r5,0x7F // r5 <- #bytes to 128-byte align - sub r4,r4,r5 // adjust length - srwi r8,r4,7 // r8 <- 128-byte chunks - rlwinm r4,r4,0,0x7F // mask length down to remaining bytes - mtctr r8 // set up loop count - beq Ldcbz // skip if already aligned (r8!=0) - -// 128-byte align - - mtcrf 0x01,r5 // start to move #bytes to align to cr6 and cr7 - cmpwi cr1,r8,0 // any 128-byte cache lines to 0? - mtcrf 0x02,r5 - - bf 31,1f // byte? - stb r0,0(r9) - addi r9,r9,1 -1: - bf 30,2f // halfword? - sth r0,0(r9) - addi r9,r9,2 -2: - bf 29,3f // word? - stw r0,0(r9) - addi r9,r9,4 -3: - bf 28,4f // doubleword? - std r0,0(r9) - addi r9,r9,8 -4: - bf 27,5f // quadword? - std r0,0(r9) - std r0,8(r9) - addi r9,r9,16 -5: - bf 26,6f // 32-byte chunk? - std r0,0(r9) - std r0,8(r9) - std r0,16(r9) - std r0,24(r9) - addi r9,r9,32 -6: - bf 25,7f // 64-byte chunk? 
- std r0,0(r9) - std r0,8(r9) - std r0,16(r9) - std r0,24(r9) - std r0,32(r9) - std r0,40(r9) - std r0,48(r9) - std r0,56(r9) - addi r9,r9,64 -7: - beq cr1,Ltail // no chunks to dcbz128 - -// Loop doing 128-byte version of DCBZ instruction. -// NB: if the memory is cache-inhibited, the kernel will clear cr7 -// when it emulates the alignment exception. Eventually, we may want -// to check for this case. - -Ldcbz: - dcbz128 0,r9 // zero another 32 bytes - addi r9,r9,128 - bdnz Ldcbz - -// Store trailing bytes. -// r0 = 0 -// r4 = count -// r9 = ptr - -Ltail: - srwi. r5,r4,4 // r5 <- 16-byte chunks to 0 - mtcrf 0x01,r4 // remaining byte count to cr7 - mtctr r5 - beq 2f // skip if no 16-byte chunks -1: // loop over 16-byte chunks - std r0,0(r9) - std r0,8(r9) - addi r9,r9,16 - bdnz 1b -2: - bf 28,4f // 8-byte chunk? - std r0,0(r9) - addi r9,r9,8 -4: - bf 29,5f // word? - stw r0,0(r9) - addi r9,r9,4 -5: - bf 30,6f // halfword? - sth r0,0(r9) - addi r9,r9,2 -6: - bflr 31 // byte? - stb r0,0(r9) - blr - - COMMPAGE_DESCRIPTOR(bzero_128,_COMM_PAGE_BZERO,kCache128+k64Bit,0, \ - kCommPageMTCRF+kCommPageBoth+kPort32to64) diff --git a/osfmk/ppc/commpage/bzero_32.s b/osfmk/ppc/commpage/bzero_32.s deleted file mode 100644 index fe7653d6d..000000000 --- a/osfmk/ppc/commpage/bzero_32.s +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include -#include - - .text - .align 2 - - -// ******************* -// * B Z E R O _ 3 2 * -// ******************* -// -// For 32-bit processors with a 32-byte cache line. -// -// Register use: -// r0 = zero -// r3 = original ptr, not changed since memset returns it -// r4 = count of bytes to set -// r9 = working operand ptr -// We do not touch r2 and r10-r12, which some callers depend on. - - .align 5 -bzero_32: // void bzero(void *b, size_t len); - cmplwi cr7,r4,32 // too short for DCBZ? - li r0,0 // get a 0 - neg r5,r3 // start to compute #bytes to align - mr r9,r3 // make copy of operand ptr (can't change r3) - blt cr7,Ltail // length < 32, too short for DCBZ - -// At least 32 bytes long, so compute alignment and #cache blocks. - - andi. r5,r5,0x1F // r5 <- #bytes to 32-byte align - sub r4,r4,r5 // adjust length - srwi r8,r4,5 // r8 <- #32-byte chunks - cmpwi cr1,r8,0 // any chunks? 
- mtctr r8 // set up loop count - beq 1f // skip if already 32-byte aligned (r8!=0) - -// 32-byte align. We just store 32 0s, rather than test and use conditional -// branches. We've already stored the first few bytes above. - - stw r0,0(r9) - stw r0,4(r9) - stw r0,8(r9) - stw r0,12(r9) - stw r0,16(r9) - stw r0,20(r9) - stw r0,24(r9) - stw r0,28(r9) - add r9,r9,r5 // now rp is 32-byte aligned - beq cr1,Ltail // skip if no 32-byte chunks - -// Loop doing 32-byte version of DCBZ instruction. -// NB: we take alignment exceptions on cache-inhibited memory. -// The kernel could be changed to zero cr7 when emulating a -// dcbz (as it does on 64-bit processors), so we could avoid all -// but the first. - -1: - andi. r5,r4,0x1F // will there be trailing bytes? - b 2f - .align 4 -2: - dcbz 0,r9 // zero another 32 bytes - addi r9,r9,32 - bdnz 2b - - beqlr // no trailing bytes - -// Store trailing bytes. - -Ltail: - andi. r5,r4,0x10 // test bit 27 separately - mtcrf 0x01,r4 // remaining byte count to cr7 - - beq 2f // no 16-byte chunks - stw r0,0(r9) - stw r0,4(r9) - stw r0,8(r9) - stw r0,12(r9) - addi r9,r9,16 -2: - bf 28,4f // 8-byte chunk? - stw r0,0(r9) - stw r0,4(r9) - addi r9,r9,8 -4: - bf 29,5f // word? - stw r0,0(r9) - addi r9,r9,4 -5: - bf 30,6f // halfword? - sth r0,0(r9) - addi r9,r9,2 -6: - bflr 31 // byte? - stb r0,0(r9) - blr - - COMMPAGE_DESCRIPTOR(bzero_32,_COMM_PAGE_BZERO,kCache32,0,kCommPage32) diff --git a/osfmk/ppc/commpage/cacheflush.s b/osfmk/ppc/commpage/cacheflush.s deleted file mode 100644 index 43d7452ea..000000000 --- a/osfmk/ppc/commpage/cacheflush.s +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include // EXT, LEXT -#include -#include - - .text - .align 2 - - -// ********************************************* -// * C O M M P A G E _ F L U S H _ D C A C H E * -// ********************************************* -// -// Note that this routine is called both in 32 and 64-bit mode. -// -// r3 = ptr to 1st byte to flush -// r4 = length to flush (may be 0) - -commpage_flush_dcache: - mr. r4,r4 // test length for 0 in mode-independent way - lhz r5,_COMM_PAGE_CACHE_LINESIZE(0) - subi r9,r5,1 // get (linesize-1) - and r0,r3,r9 // get offset within line of 1st byte - add r4,r4,r0 // adjust length so we flush them all - add r4,r4,r9 // round length up... - andc r4,r4,r9 // ...to multiple of cache lines - beqlr-- // length was 0, so exit -1: - sub. r4,r4,r5 // more to go? 
- dcbf 0,r3 // flush another line - add r3,r3,r5 - bne 1b - sync // make sure lines are flushed before we return - blr - - COMMPAGE_DESCRIPTOR(commpage_flush_dcache,_COMM_PAGE_FLUSH_DCACHE,0,0,kCommPageBoth) - - -// ********************************************* -// * C O M M P A G E _ F L U S H _ I C A C H E * -// ********************************************* -// -// Note that this routine is called both in 32 and 64-bit mode. -// -// r3 = ptr to 1st byte to flush -// r4 = length to flush (may be 0) - -commpage_flush_icache: - mr. r4,r4 // test length for 0 in mode-independent way - lhz r5,_COMM_PAGE_CACHE_LINESIZE(0) - subi r9,r5,1 // get (linesize-1) - and r0,r3,r9 // get offset within line of 1st byte - add r4,r4,r0 // adjust length so we flush them all - mr r7,r3 // copy ptr - add r4,r4,r9 // round length up... - andc r4,r4,r9 // ...to multiple of cache lines - mr r6,r4 // copy length - beqlr-- // length was 0, so exit -1: - sub. r4,r4,r5 // more to go? - dcbf 0,r3 // flush another line - add r3,r3,r5 - bne 1b - sync // make sure lines are flushed -2: - sub. r6,r6,r5 // more to go? - icbi 0,r7 - add r7,r7,r5 - bne 2b - - // The following sync is only needed on MP machines, probably only on - // 7400-family MP machines. But because we're not certain of this, and - // this isn't a speed critical routine, we are conservative and always sync. - - sync // wait until other processors see the icbi's - isync // make sure we haven't prefetched old instructions - - blr - - COMMPAGE_DESCRIPTOR(commpage_flush_icache,_COMM_PAGE_FLUSH_ICACHE,0,0,kCommPageBoth) - - diff --git a/osfmk/ppc/commpage/commpage.c b/osfmk/ppc/commpage/commpage.c deleted file mode 100644 index 6b0227322..000000000 --- a/osfmk/ppc/commpage/commpage.c +++ /dev/null @@ -1,679 +0,0 @@ -/* - * Copyright (c) 2003-2007 Apple Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - * Here's what to do if you want to add a new routine to the comm page: - * - * 1. Add a definition for it's address in osfmk/ppc/cpu_capabilities.h, - * being careful to reserve room for future expansion. - * - * 2. Write one or more versions of the routine, each with it's own - * commpage_descriptor. The tricky part is getting the "special", - * "musthave", and "canthave" fields right, so that exactly one - * version of the routine is selected for every machine. - * The source files should be in osfmk/ppc/commpage/. - * - * 3. Add a ptr to your new commpage_descriptor(s) in the "routines" - * static array below. Of course, you'll also have to declare them - * "extern". - * - * 4. 
Write the code in Libc to use the new routine. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -extern vm_map_t commpage32_map; // the 32-bit shared submap, set up in vm init -extern vm_map_t commpage64_map; // the 64-bit shared submap - -char *commPagePtr32 = NULL; // virtual address of 32-bit comm page in kernel map -char *commPagePtr64 = NULL; // and 64-bit commpage -int _cpu_capabilities = 0; // define the capability vector - -static char *next; // next available byte in comm page -static int cur_routine; // comm page address of "current" routine -static int matched; // true if we've found a match for "current" routine -static char *commPagePtr; // virtual address in kernel of commpage we are working on - -extern commpage_descriptor compare_and_swap32_on32; -extern commpage_descriptor compare_and_swap32_on64; -extern commpage_descriptor compare_and_swap64; -extern commpage_descriptor atomic_enqueue32; -extern commpage_descriptor atomic_enqueue64; -extern commpage_descriptor atomic_dequeue32_on32; -extern commpage_descriptor atomic_dequeue32_on64; -extern commpage_descriptor atomic_dequeue64; -extern commpage_descriptor memory_barrier_up; -extern commpage_descriptor memory_barrier_mp32; -extern commpage_descriptor memory_barrier_mp64; -extern commpage_descriptor atomic_add32; -extern commpage_descriptor atomic_add64; -extern commpage_descriptor mach_absolute_time_32; -extern commpage_descriptor mach_absolute_time_64; -extern commpage_descriptor mach_absolute_time_lp64; -extern commpage_descriptor spinlock_32_try_mp; -extern commpage_descriptor spinlock_32_try_up; -extern commpage_descriptor spinlock_64_try_mp; -extern commpage_descriptor spinlock_64_try_up; -extern commpage_descriptor spinlock_32_lock_mp; -extern commpage_descriptor spinlock_32_lock_up; -extern commpage_descriptor spinlock_64_lock_mp; -extern commpage_descriptor spinlock_64_lock_up; -extern commpage_descriptor 
spinlock_32_unlock_mp; -extern commpage_descriptor spinlock_32_unlock_up; -extern commpage_descriptor spinlock_64_unlock_mp; -extern commpage_descriptor spinlock_64_unlock_up; -extern commpage_descriptor pthread_getspecific_sprg3_32; -extern commpage_descriptor pthread_getspecific_sprg3_64; -extern commpage_descriptor pthread_getspecific_uftrap; -extern commpage_descriptor gettimeofday_32; -extern commpage_descriptor gettimeofday_g5_32; -extern commpage_descriptor gettimeofday_g5_64; -extern commpage_descriptor commpage_flush_dcache; -extern commpage_descriptor commpage_flush_icache; -extern commpage_descriptor pthread_self_sprg3; -extern commpage_descriptor pthread_self_uftrap; -extern commpage_descriptor spinlock_relinquish; -extern commpage_descriptor bzero_32; -extern commpage_descriptor bzero_128; -extern commpage_descriptor bcopy_g3; -extern commpage_descriptor bcopy_g4; -extern commpage_descriptor bcopy_970; -extern commpage_descriptor bcopy_64; -extern commpage_descriptor compare_and_swap32_on32b; -extern commpage_descriptor compare_and_swap32_on64b; -extern commpage_descriptor compare_and_swap64b; -extern commpage_descriptor memset_64; -extern commpage_descriptor memset_g3; -extern commpage_descriptor memset_g4; -extern commpage_descriptor memset_g5; -extern commpage_descriptor bigcopy_970; - -/* The list of all possible commpage routines. WARNING: the check for overlap - * assumes that these routines are in strictly ascending order, sorted by address - * in the commpage. We panic if not. 
- */ -static commpage_descriptor *routines[] = { - &compare_and_swap32_on32, - &compare_and_swap32_on64, - &compare_and_swap64, - &atomic_enqueue32, - &atomic_enqueue64, - &atomic_dequeue32_on32, - &atomic_dequeue32_on64, - &atomic_dequeue64, - &memory_barrier_up, - &memory_barrier_mp32, - &memory_barrier_mp64, - &atomic_add32, - &atomic_add64, - &mach_absolute_time_32, - &mach_absolute_time_64, - &mach_absolute_time_lp64, - &spinlock_32_try_mp, - &spinlock_32_try_up, - &spinlock_64_try_mp, - &spinlock_64_try_up, - &spinlock_32_lock_mp, - &spinlock_32_lock_up, - &spinlock_64_lock_mp, - &spinlock_64_lock_up, - &spinlock_32_unlock_mp, - &spinlock_32_unlock_up, - &spinlock_64_unlock_mp, - &spinlock_64_unlock_up, - &pthread_getspecific_sprg3_32, - &pthread_getspecific_sprg3_64, - &pthread_getspecific_uftrap, - &gettimeofday_32, - &gettimeofday_g5_32, - &gettimeofday_g5_64, - &commpage_flush_dcache, - &commpage_flush_icache, - &pthread_self_sprg3, - &pthread_self_uftrap, - &spinlock_relinquish, - &bzero_32, - &bzero_128, - &bcopy_g3, - &bcopy_g4, - &bcopy_970, - &bcopy_64, - &compare_and_swap32_on32b, - &compare_and_swap32_on64b, - &compare_and_swap64b, - &memset_64, - &memset_g3, - &memset_g4, - &memset_g5, - &bigcopy_970, - NULL }; - - -/* Allocate the commpages and add to one of the shared submaps created by vm. - * Called once each for the 32 and 64-bit submaps. - * 1. allocate pages in the kernel map (RW) - * 2. wire them down - * 3. make a memory entry out of them - * 4. 
map that entry into the shared comm region map (R-only) - */ -static void* -commpage_allocate( - vm_map_t submap ) // commpage32_map or commpage64_map -{ - vm_offset_t kernel_addr = 0; // address of commpage in kernel map - vm_offset_t zero = 0; - vm_size_t size = _COMM_PAGE_AREA_USED; // size actually populated - vm_map_entry_t entry; - ipc_port_t handle; - - if (submap == NULL) - panic("commpage submap is null"); - - if (vm_map(kernel_map,&kernel_addr,_COMM_PAGE_AREA_USED,0,VM_FLAGS_ANYWHERE,NULL,0,FALSE,VM_PROT_ALL,VM_PROT_ALL,VM_INHERIT_NONE)) - panic("cannot allocate commpage"); - - if (vm_map_wire(kernel_map,kernel_addr,kernel_addr+_COMM_PAGE_AREA_USED,VM_PROT_DEFAULT,FALSE)) - panic("cannot wire commpage"); - - /* - * Now that the object is created and wired into the kernel map, mark it so that no delay - * copy-on-write will ever be performed on it as a result of mapping it into user-space. - * If such a delayed copy ever occurred, we could remove the kernel's wired mapping - and - * that would be a real disaster. - * - * JMM - What we really need is a way to create it like this in the first place. 
- */ - if (!vm_map_lookup_entry( kernel_map, vm_map_trunc_page(kernel_addr), &entry) || entry->is_sub_map) - panic("cannot find commpage entry"); - entry->object.vm_object->copy_strategy = MEMORY_OBJECT_COPY_NONE; - - if (mach_make_memory_entry( kernel_map, // target map - &size, // size - kernel_addr, // offset (address in kernel map) - VM_PROT_ALL, // map it RWX - &handle, // this is the object handle we get - NULL )) // parent_entry - panic("cannot make entry for commpage"); - - if (vm_map_64( submap, // target map (shared submap) - &zero, // address (map into 1st page in submap) - _COMM_PAGE_AREA_USED, // size - 0, // mask - VM_FLAGS_FIXED, // flags (it must be 1st page in submap) - handle, // port is the memory entry we just made - 0, // offset (map 1st page in memory entry) - FALSE, // copy - VM_PROT_READ|VM_PROT_EXECUTE, // cur_protection (R-only in user map) - VM_PROT_READ|VM_PROT_EXECUTE, // max_protection - VM_INHERIT_SHARE )) // inheritance - panic("cannot map commpage"); - - ipc_port_release(handle); - - return (void*) kernel_addr; // return address in kernel map -} - - -/* Get address (in kernel map) of a commpage field. */ - -static void* -commpage_addr_of( - int addr_at_runtime ) -{ - return (void*) (commPagePtr + addr_at_runtime - _COMM_PAGE_BASE_ADDRESS); -} - - -/* Determine number of CPUs on this system. We cannot rely on - * machine_info.max_cpus this early in the boot. - */ -static int -commpage_cpus( void ) -{ - int cpus; - - cpus = ml_get_max_cpus(); // NB: this call can block - - if (cpus == 0) - panic("commpage cpus==0"); - if (cpus > 0xFF) - cpus = 0xFF; - - return cpus; -} - - -/* Initialize kernel version of _cpu_capabilities vector (used by KEXTs.) */ - -static void -commpage_init_cpu_capabilities( void ) -{ - procFeatures *pfp; - int cpus; - int available; - - pfp = &(PerProcTable[0].ppe_vaddr->pf); // point to features in per-proc - available = pfp->Available; - - // If AltiVec is disabled make sure it is not reported as available. 
- if ((available & pfAltivec) == 0) { - _cpu_capabilities &= ~kHasAltivec; - } - - if (_cpu_capabilities & kDcbaAvailable) { // if this processor has DCBA, time it... - _cpu_capabilities |= commpage_time_dcba(); // ...and set kDcbaRecomended if it helps. - } - - cpus = commpage_cpus(); // how many CPUs do we have - if (cpus == 1) _cpu_capabilities |= kUP; - _cpu_capabilities |= (cpus << kNumCPUsShift); - - if (_cpu_capabilities & k64Bit) // 64-bit processors use SPRG3 for TLS - _cpu_capabilities |= kFastThreadLocalStorage; -} - - -/* Copy data into commpage. */ - -static void -commpage_stuff( - int address, - const void *source, - int length ) -{ - char *dest = commpage_addr_of(address); - - if (dest < next) - panic("commpage overlap: %p - %p", dest, next); - - bcopy((const char*)source,dest,length); - - next = (dest + length); -} - - -/* Modify commpage code in-place for this specific platform. */ - -static void -commpage_change( - uint32_t *ptr, - int bytes, - uint32_t search_mask, - uint32_t search_pattern, - uint32_t new_mask, - uint32_t new_pattern, - int (*check)(uint32_t instruction) ) -{ - int words = bytes >> 2; - uint32_t word; - - while( (--words) >= 0 ) { - word = *ptr; - if ((word & search_mask)==search_pattern) { - if ((check==NULL) || (check(word))) { // check instruction if necessary - word &= ~new_mask; - word |= new_pattern; - *ptr = word; - } - } - ptr++; - } -} - - -/* Check to see if exactly one bit is set in a MTCRF instruction's FXM field. - */ -static int -commpage_onebit( - uint32_t mtcrf ) -{ - int x = (mtcrf >> 12) & 0xFF; // isolate the FXM field of the MTCRF - - if (x==0) - panic("commpage bad mtcrf"); - - return (x & (x-1))==0 ? 1 : 0; // return 1 iff exactly 1 bit set in FXM field -} - - -/* Check to see if a RLWINM (whose ME is 31) is a SRWI. Since to shift right n bits - * you must "RLWINM ra,rs,32-n,n,31", if (SH+MB)==32 then we have a SRWI. 
- */ -static int -commpage_srwi( - uint32_t rlwinm ) -{ - int sh = (rlwinm >> 11) & 0x1F; // extract SH field of RLWINM, ie bits 16-20 - int mb = (rlwinm >> 6 ) & 0x1F; // extract MB field of RLWINM, ie bits 21-25 - - return (sh + mb) == 32; // it is a SRWI if (SH+MB)==32 -} - - -/* Handle kCommPageDCBA bit: the commpage routine uses DCBA. If the machine we're - * running on doesn't benefit from use of that instruction, map them to NOPs - * in the commpage. - */ -static void -commpage_handle_dcbas( - int address, - int length ) -{ - uint32_t *ptr, search_mask, search, replace_mask, replace; - - if ( (_cpu_capabilities & kDcbaRecommended) == 0 ) { - ptr = commpage_addr_of(address); - - search_mask = 0xFC0007FE; // search x-form opcode bits - search = 0x7C0005EC; // for a DCBA - replace_mask = 0xFFFFFFFF; // replace all bits... - replace = 0x60000000; // ...with a NOP - - commpage_change(ptr,length,search_mask,search,replace_mask,replace,NULL); - } -} - - -/* Handle kCommPageSYNC bit: this routine uses SYNC, LWSYNC, or EIEIO. If we're - * running on a UP machine, map them to NOPs. - */ -static void -commpage_handle_syncs( - int address, - int length ) -{ - uint32_t *ptr, search_mask, search, replace_mask, replace; - - if (_NumCPUs() == 1) { - ptr = commpage_addr_of(address); - - search_mask = 0xFC0005FE; // search x-form opcode bits (but ignore bit 0x00000200) - search = 0x7C0004AC; // for a SYNC, LWSYNC, or EIEIO - replace_mask = 0xFFFFFFFF; // replace all bits... - replace = 0x60000000; // ...with a NOP - - commpage_change(ptr,length,search_mask,search,replace_mask,replace,NULL); - } -} - - -/* Handle kCommPageISYNC bit: this routine uses ISYNCs. If we're running on a UP machine, - * map them to NOPs. 
- */ -static void -commpage_handle_isyncs( - int address, - int length ) -{ - uint32_t *ptr, search_mask, search, replace_mask, replace; - - if (_NumCPUs() == 1) { - ptr = commpage_addr_of(address); - - search_mask = 0xFC0007FE; // search xl-form opcode bits - search = 0x4C00012C; // for an ISYNC - replace_mask = 0xFFFFFFFF; // replace all bits... - replace = 0x60000000; // ...with a NOP - - commpage_change(ptr,length,search_mask,search,replace_mask,replace,NULL); - } -} - - -/* Handle kCommPageMTCRF bit. When this was written (3/03), the assembler did not - * recognize the special form of MTCRF instructions, in which exactly one bit is set - * in the 8-bit mask field. Bit 11 of the instruction should be set in this case, - * since the 970 and probably other 64-bit processors optimize it. Once the assembler - * has been updated this code can be removed, though it need not be. - */ -static void -commpage_handle_mtcrfs( - int address, - int length ) -{ - uint32_t *ptr, search_mask, search, replace_mask, replace; - - if (_cpu_capabilities & k64Bit) { - ptr = commpage_addr_of(address); - - search_mask = 0xFC0007FE; // search x-form opcode bits - search = 0x7C000120; // for a MTCRF - replace_mask = 0x00100000; // replace bit 11... - replace = 0x00100000; // ...with a 1-bit - - commpage_change(ptr,length,search_mask,search,replace_mask,replace,commpage_onebit); - } -} - - -/* Port 32-bit code to 64-bit for use in the 64-bit commpage. This sounds fancier than - * it is. We do the following: - * - map "cmpw*" into "cmpd*" - * - map "srwi" into "srdi" - * Perhaps surprisingly, this is enough to permit lots of code to run in 64-bit mode, as - * long as it is written with this in mind. 
- */ -static void -commpage_port_32_to_64( - int address, - int length ) -{ - uint32_t *ptr, search_mask, search, replace_mask, replace; - - ptr = commpage_addr_of(address); - - search_mask = 0xFC2007FE; // search x-form opcode bits (and L bit) - search = 0x7C000000; // for a CMPW - replace_mask = 0x00200000; // replace bit 10 (L)... - replace = 0x00200000; // ...with a 1-bit, converting word to doubleword compares - commpage_change(ptr,length,search_mask,search,replace_mask,replace,NULL); - - search_mask = 0xFC2007FE; // search x-form opcode bits (and L bit) - search = 0x7C000040; // for a CMPLW - replace_mask = 0x00200000; // replace bit 10 (L)... - replace = 0x00200000; // ...with a 1-bit, converting word to doubleword compares - commpage_change(ptr,length,search_mask,search,replace_mask,replace,NULL); - - search_mask = 0xFC200000; // search d-form opcode bits (and L bit) - search = 0x28000000; // for a CMPLWI - replace_mask = 0x00200000; // replace bit 10 (L)... - replace = 0x00200000; // ...with a 1-bit, converting word to doubleword compares - commpage_change(ptr,length,search_mask,search,replace_mask,replace,NULL); - - search_mask = 0xFC200000; // search d-form opcode bits (and L bit) - search = 0x2C000000; // for a CMPWI - replace_mask = 0x00200000; // replace bit 10 (L)... - replace = 0x00200000; // ...with a 1-bit, converting word to doubleword compares - commpage_change(ptr,length,search_mask,search,replace_mask,replace,NULL); - - search_mask = 0xFC00003E; // search d-form opcode bits and ME (mask end) field - search = 0x5400003E; // for an RLWINM with ME=31 (which might be a "srwi") - replace_mask = 0xFC00003E; // then replace RLWINM's opcode and ME field to make a RLDICL - replace = 0x78000002; // opcode is 30, ME is 0, except we add 32 to SH amount - commpage_change(ptr,length,search_mask,search,replace_mask,replace,commpage_srwi); -} - - -/* Copy a routine into comm page if it matches running machine. 
- */ -static void -commpage_stuff_routine( - commpage_descriptor *rd, - int mode ) // kCommPage32 or kCommPage64 -{ - char *routine_code; - int must,cant; - - if ( (rd->special & mode) == 0 ) // is this routine useable in this mode? - return; - - if (rd->commpage_address != cur_routine) { - if ((cur_routine!=0) && (matched==0)) - panic("commpage no match for last, next address %08x", rd->commpage_address); - cur_routine = rd->commpage_address; - matched = 0; - } - - must = _cpu_capabilities & rd->musthave; - cant = _cpu_capabilities & rd->canthave; - - if ((must == rd->musthave) && (cant == 0)) { - if (matched) - panic("commpage multiple matches for address %08x", rd->commpage_address); - matched = 1; - routine_code = ((char*)rd) + rd->code_offset; - - commpage_stuff(rd->commpage_address,routine_code,rd->code_length); - - if (rd->special & kCommPageDCBA) - commpage_handle_dcbas(rd->commpage_address,rd->code_length); - - if (rd->special & kCommPageSYNC) - commpage_handle_syncs(rd->commpage_address,rd->code_length); - - if (rd->special & kCommPageISYNC) - commpage_handle_isyncs(rd->commpage_address,rd->code_length); - - if (rd->special & kCommPageMTCRF) - commpage_handle_mtcrfs(rd->commpage_address,rd->code_length); - - if ((mode == kCommPage64) && (rd->special & kPort32to64)) - commpage_port_32_to_64(rd->commpage_address,rd->code_length); - } -} - - -/* Fill in the 32- or 64-bit commpage. Called once for each. 
*/ - -static void -commpage_populate_one( - vm_map_t submap, // the map to populate - char ** kernAddressPtr, // address within kernel of this commpage - int mode, // either kCommPage32 or kCommPage64 - const char* signature ) // "commpage 32-bit" or "commpage 64-bit" -{ - char c1; - short c2; - addr64_t c8; - static double two52 = 1048576.0 * 1048576.0 * 4096.0; // 2**52 - static double ten6 = 1000000.0; // 10**6 - static uint64_t magicFE = 0xFEFEFEFEFEFEFEFFLL; // used to find 0s in strings - static uint64_t magic80 = 0x8080808080808080LL; // also used to find 0s - commpage_descriptor **rd; - short version = _COMM_PAGE_THIS_VERSION; - - next = NULL; // initialize next available byte in the commpage - cur_routine = 0; // initialize comm page address of "current" routine - - commPagePtr = (char*) commpage_allocate( submap ); - *kernAddressPtr = commPagePtr; // save address either in commPagePtr32 or 64 - - /* Stuff in the constants. We move things into the comm page in strictly - * ascending order, so we can check for overlap and panic if so. - */ - - commpage_stuff(_COMM_PAGE_SIGNATURE,signature,strlen(signature)); - - commpage_stuff(_COMM_PAGE_VERSION,&version,2); - - commpage_stuff(_COMM_PAGE_CPU_CAPABILITIES,&_cpu_capabilities,sizeof(int)); - - c1 = (_cpu_capabilities & kHasAltivec) ? -1 : 0; - commpage_stuff(_COMM_PAGE_ALTIVEC,&c1,1); - - c1 = (_cpu_capabilities & k64Bit) ? 
-1 : 0; - commpage_stuff(_COMM_PAGE_64_BIT,&c1,1); - - if (_cpu_capabilities & kCache32) - c2 = 32; - else if (_cpu_capabilities & kCache64) - c2 = 64; - else if (_cpu_capabilities & kCache128) - c2 = 128; - commpage_stuff(_COMM_PAGE_CACHE_LINESIZE,&c2,2); - - commpage_stuff(_COMM_PAGE_2_TO_52,&two52,8); - commpage_stuff(_COMM_PAGE_10_TO_6,&ten6,8); - commpage_stuff(_COMM_PAGE_MAGIC_FE,&magicFE,8); - commpage_stuff(_COMM_PAGE_MAGIC_80,&magic80,8); - - c8 = 0; // 0 timestamp means "disabled" - commpage_stuff(_COMM_PAGE_TIMEBASE,&c8,8); - commpage_stuff(_COMM_PAGE_TIMESTAMP,&c8,8); - commpage_stuff(_COMM_PAGE_SEC_PER_TICK,&c8,8); - - /* Now the routines. We try each potential routine in turn, - * and copy in any that "match" the platform we are running on. - * We require that exactly one routine match for each slot in the - * comm page, and panic if not. - */ - - for( rd = routines; *rd != NULL ; rd++ ) - commpage_stuff_routine(*rd,mode); - - if (!matched) - panic("commpage no match on last routine"); - - if (next > (commPagePtr + _COMM_PAGE_AREA_USED)) - panic("commpage overflow"); - - - // make all that new code executable - - sync_cache_virtual((vm_offset_t) commPagePtr,_COMM_PAGE_AREA_USED); -} - - -/* Fill in commpage: called once, during kernel initialization, from the - * startup thread before user-mode code is running. - * - * See the top of this file for a list of what you have to do to add - * a new routine to the commpage. 
- */ - -void -commpage_populate( void ) -{ - commpage_init_cpu_capabilities(); - commpage_populate_one( commpage32_map, &commPagePtr32, kCommPage32, "commpage 32-bit"); - if (_cpu_capabilities & k64Bit) { - commpage_populate_one( commpage64_map, &commPagePtr64, kCommPage64, "commpage 64-bit"); - pmap_init_sharedpage((vm_offset_t)commPagePtr64); // Do the 64-bit version - } - -} diff --git a/osfmk/ppc/commpage/commpage.h b/osfmk/ppc/commpage/commpage.h deleted file mode 100644 index 64a139faf..000000000 --- a/osfmk/ppc/commpage/commpage.h +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) 2003-2008 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef _PPC_COMMPAGE_H -#define _PPC_COMMPAGE_H - -#ifndef __ASSEMBLER__ -#include -#endif /* __ASSEMBLER__ */ - - -/* Special check bits for the compage_descriptor "special" field. */ - -#define kCommPageDCBA 0x0001 // this routine uses DCBA, map to NOP if not appropriate -#define kCommPageSYNC 0x0002 // this routine uses SYNC, LWSYNC, or EIEIO, map to NOP if UP -#define kCommPageISYNC 0x0004 // this routine uses ISYNC, map to NOP if UP -#define kCommPageMTCRF 0x0008 // set bit 11 in MTCRF if only 1 cr specified - -#define kPort32to64 0x1000 // written for 32-bit, must port to 64-bit -#define kCommPage64 0x2000 // this routine is useable in 64-bit mode -#define kCommPage32 0x4000 // this routine is useable in 32-bit mode -#define kCommPageBoth (kCommPage32+kCommPage64) - - -#ifdef __ASSEMBLER__ - -#define COMMPAGE_DESCRIPTOR(label,address,must,cant,special) \ - .globl EXT(label) @\ -LEXT(label) @\ - .short label-. @\ - .short .-label-2 @\ - .short address @\ - .short special @\ - .long must @\ - .long cant - - -#else /* __ASSEMBLER__ */ - -/* Each potential commpage routine is described by one of these. - * Note that the COMMPAGE_DESCRIPTOR macro (above), used in - * assembly language, must agree with this. 
- */ - -typedef struct commpage_descriptor { - short code_offset; // offset to code from this descriptor - short code_length; // length in bytes - short commpage_address; // put at this address (_COMM_PAGE_BCOPY etc) - short special; // special handling bits for DCBA and SYNC etc - long musthave; // _cpu_capability bits we must have - long canthave; // _cpu_capability bits we can't have -} commpage_descriptor; - - -extern char *commPagePtr32; // virt address of 32-bit commpage in kernel map -extern char *commPagePtr64; // virt address of 64-bit commpage in kernel map - -extern void commpage_set_timestamp(uint64_t tbr, uint64_t secs, uint32_t ticks_per_sec); - -#define commpage_disable_timestamp() commpage_set_timestamp( 0, 0, 0 ) -#define commpage_set_memory_pressure( pressure ) - -extern int commpage_time_dcba( void ); - -#endif /* __ASSEMBLER__ */ - -#endif /* _PPC_COMMPAGE_H */ diff --git a/osfmk/ppc/commpage/commpage_asm.s b/osfmk/ppc/commpage/commpage_asm.s deleted file mode 100644 index d3ea83c24..000000000 --- a/osfmk/ppc/commpage/commpage_asm.s +++ /dev/null @@ -1,272 +0,0 @@ -/* - * Copyright (c) 2003-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include -#include -#include - - -// commpage_time_dcba() uses a stack frame as follows: - -#define kBufSiz 1024 // Size of the buffer we use to do DCBA timing on G4 -#define kSFSize (kBufSiz+128+16) // Stack frame size, which contains the 128-byte-aligned buffer -#define kLoopCnt 5 // Iterations of the timing loop -#define kDCBA 22 // Bit in cr5 used as a flag in timing loop - - -// commpage_set_timestamp() uses the red zone for temporary storage: - -#define rzSaveF1 -8 // caller's FPR1 -#define rzSaveF2 -16 // caller's FPR2 -#define rzSaveF3 -24 // caller's FPR3 -#define rzSaveF4 -32 // caller's FPR4 -#define rzSaveF5 -40 // caller's FPR5 -#define rzNewTimeBase -48 // used to load 64-bit TBR into a FPR - - -// commpage_set_timestamp() uses the following data. kkTicksPerSec remembers -// the number used to compute _COMM_PAGE_SEC_PER_TICK. Since this constant -// rarely changes, we use it to avoid needless recomputation. It is a double -// value, pre-initialize with an exponent of 2**52. 
- -#define kkBinary0 0 // offset in data to long long 0 (a constant) -#define kkDouble1 8 // offset in data to double 1.0 (a constant) -#define kkTicksPerSec 16 // offset in data to double(ticks_per_sec) - - .data - .align 3 // three doubleword fields -Ldata: - .long 0 // kkBinary0 - .long 0 - .double 1.0e0 // kkDouble1 - .long 0x43300000 // kkTicksPerSec (plus 2**52) - .long 0 // this is where we store ticks_per_sec, to float - - .text - .align 2 - .globl EXT(commpage_time_dcba) - .globl EXT(commpage_set_timestamp) - - -/* *********************************************** - * * C O M M P A G E _ S E T _ T I M E S T A M P * - * *********************************************** - * - * Update the gettimeofday() shared data on the commpages, as follows: - * _COMM_PAGE_TIMESTAMP = the clock offset at timebase (seconds) - * _COMM_PAGE_TIMEBASE = the timebase at which the timestamp was valid - * _COMM_PAGE_SEC_PER_TICK = multiply timebase ticks by this to get seconds (double) - * The convention is that if the timebase is 0, the data is invalid. Because other - * CPUs are reading the three values asynchronously and must get a consistent set, - * it is critical that we update them with the following protocol: - * 1. set timebase to 0 (atomically), to invalidate all three values - * 2. eieio (to create a barrier in stores to cacheable memory) - * 3. change timestamp and "secs per tick" - * 4. eieio - * 5. set timebase nonzero (atomically) - * This works because readers read the timebase, then the timestamp and divisor, sync - * if MP, then read the timebase a second time and check to be sure it is equal to the first. - * - * We could save a few cycles on 64-bit machines by special casing them, but it probably - * isn't necessary because this routine shouldn't be called very often. 
- * - * When called: - * r3 = upper half of timebase (timebase is disabled if 0) - * r4 = lower half of timebase - * r5 = upper half of timestamp - * r6 = lower half of timestamp - * r7 = divisor (ie, timebase ticks per sec) - * We set up: - * r8 = ptr to our static data (kkBinary0, kkDouble1, kkTicksPerSec) - * r9 = ptr to 32-bit commpage in kernel map - * r10 = ptr to 64-bit commpage in kernel map - * - * --> Interrupts must be disabled and rtclock locked when called. <-- - */ - - .align 5 -LEXT(commpage_set_timestamp) // void commpage_set_timestamp(tbr,secs,divisor) - mfmsr r11 // get MSR - ori r2,r11,MASK(MSR_FP) // turn FP on - mtmsr r2 - isync // wait until MSR changes take effect - - or. r0,r3,r4 // is timebase 0? (thus disabled) - lis r8,hi16(Ldata) // point to our data - lis r9,ha16(EXT(commPagePtr32)) // get ptrs to address of commpages in kernel map - lis r10,ha16(EXT(commPagePtr64)) - stfd f1,rzSaveF1(r1) // save a FPR in the red zone - ori r8,r8,lo16(Ldata) - lwz r9,lo16(EXT(commPagePtr32))(r9) // r9 <- 32-bit commpage ptr - lwz r10,lo16(EXT(commPagePtr64))(r10) // r10 <- 64-bit commpage ptr - lfd f1,kkBinary0(r8) // get fixed 0s - li r0,_COMM_PAGE_BASE_ADDRESS // get va in user space of commpage - cmpwi cr1,r9,0 // is 32-bit commpage allocated yet? - cmpwi cr6,r10,0 // is 64-bit commpage allocated yet? 
- sub r9,r9,r0 // r9 <- 32-bit commpage address, biased by user va - sub r10,r10,r0 // r10<- 64-bit commpage address - beq-- cr1,3f // skip if 32-bit commpage not allocated (64-bit won't be either) - bne++ cr6,1f // skip if 64-bit commpage is allocated - mr r10,r9 // if no 64-bit commpage, point to 32-bit version with r10 too -1: - stfd f1,_COMM_PAGE_TIMEBASE(r9) // turn off the 32-bit-commpage timestamp (atomically) - stfd f1,_COMM_PAGE_TIMEBASE(r10) // and the 64-bit one too - eieio // make sure all CPUs see it is off - beq 3f // all we had to do is turn off timestamp - - lwz r0,kkTicksPerSec+4(r8) // get last ticks_per_sec (or 0 if first) - stw r3,rzNewTimeBase(r1) // store new timebase so we can lfd - stw r4,rzNewTimeBase+4(r1) - cmpw r0,r7 // do we need to recompute _COMM_PAGE_SEC_PER_TICK? - stw r5,_COMM_PAGE_TIMESTAMP(r9) // store the new timestamp in the 32-bit page - stw r6,_COMM_PAGE_TIMESTAMP+4(r9) - stw r5,_COMM_PAGE_TIMESTAMP(r10)// and the 64-bit commpage - stw r6,_COMM_PAGE_TIMESTAMP+4(r10) - lfd f1,rzNewTimeBase(r1) // get timebase in a FPR so we can store atomically - beq++ 2f // same ticks_per_sec, no need to recompute - - stw r7,kkTicksPerSec+4(r8) // must recompute SEC_PER_TICK - stfd f2,rzSaveF2(r1) // we'll need a few more temp FPRs - stfd f3,rzSaveF3(r1) - stfd f4,rzSaveF4(r1) - stfd f5,rzSaveF5(r1) - lfd f2,_COMM_PAGE_2_TO_52(r9) // f2 <- double(2**52) - lfd f3,kkTicksPerSec(r8) // float new ticks_per_sec + 2**52 - lfd f4,kkDouble1(r8) // f4 <- double(1.0) - mffs f5 // save caller's FPSCR - mtfsfi 7,1 // clear Inexeact Exception bit, set round-to-zero - fsub f3,f3,f2 // get ticks_per_sec - fdiv f3,f4,f3 // divide 1 by ticks_per_sec to get SEC_PER_TICK - stfd f3,_COMM_PAGE_SEC_PER_TICK(r9) - stfd f3,_COMM_PAGE_SEC_PER_TICK(r10) - mtfsf 0xFF,f5 // restore FPSCR - lfd f2,rzSaveF2(r1) // restore FPRs - lfd f3,rzSaveF3(r1) - lfd f4,rzSaveF4(r1) - lfd f5,rzSaveF5(r1) -2: // f1 == new timestamp - eieio // wait until the stores take - stfd 
f1,_COMM_PAGE_TIMEBASE(r9) // then turn the timestamp back on (atomically) - stfd f1,_COMM_PAGE_TIMEBASE(r10) // both -3: // here once all fields updated - lfd f1,rzSaveF1(r1) // restore last FPR - mtmsr r11 // turn FP back off - isync - blr - - -/* *************************************** - * * C O M M P A G E _ T I M E _ D C B A * - * *************************************** - * - * Not all processors that support the DCBA opcode actually benefit from it. - * Some store-gather and read-cancel well enough that there is no need to use - * DCBA to avoid fetching cache lines that will be completely overwritten, while - * others have this feature disabled (to work around errata etc), and so benefit - * from DCBA. Since it is hard to tell the one group from the other, we just - * time loops with and without DCBA, and pick the fastest. Thus we avoid - * delicate dependence on processor and/or platform revisions. - * - * We return either kDcbaRecommended or zero. - * - * int commpage_time_dcba( void ); - */ - -LEXT(commpage_time_dcba) - mflr r12 // get return - stw r12,8(r1) // save - stwu r1,-kSFSize(r1) // carve our temp buffer from the stack - addi r11,r1,127+16 // get base address... - rlwinm r11,r11,0,0,24 // ...of our buffer, 128-byte aligned - crset kDCBA // first, use DCBA - bl LTest // time it with DCBA - srwi r0,r3,3 // bias 12 pct in favor of not using DCBA... - add r10,r3,r0 // ...because DCBA is always slower with warm cache - crclr kDCBA - bl LTest // time without DCBA - cmplw r10,r3 // which is better? - mtlr r12 // restore return - lwz r1,0(r1) // pop off our stack frame - li r3,kDcbaRecommended // assume using DCBA is faster - bltlr - li r3,0 // no DCBA is faster - blr - - -// Subroutine to time a loop with or without DCBA. -// kDCBA = set if we should use DCBA -// r11 = base of buffer to use for test (kBufSiz bytes) -// -// We return TBR ticks in r3. -// We use r0,r3-r9. 
- -LTest: - li r4,kLoopCnt // number of times to loop - li r3,-1 // initialize fastest time -1: - mr r6,r11 // initialize buffer ptr - li r0,kBufSiz/32 // r0 <- cache blocks to test - mtctr r0 -2: - dcbf 0,r6 // first, force the blocks out of the cache - addi r6,r6,32 - bdnz 2b - sync // make sure all the flushes take - mr r6,r11 // re-initialize buffer ptr - mtctr r0 // reset cache-block count - mftbu r7 // remember upper half so we can check for carry - mftb r8 // start the timer -3: // loop over cache blocks - bf kDCBA,4f // should we DCBA? - dcba 0,r6 -4: - stw r0,0(r6) // store the entire cache block - stw r0,4(r6) - stw r0,8(r6) - stw r0,12(r6) - stw r0,16(r6) - stw r0,20(r6) - stw r0,24(r6) - stw r0,28(r6) - addi r6,r6,32 - bdnz 3b - mftb r9 - mftbu r0 - cmpw r0,r7 // did timebase carry? - bne 1b // yes, retest rather than fuss - sub r9,r9,r8 // r9 <- time for this loop - cmplw r9,r3 // faster than current best? - bge 5f // no - mr r3,r9 // remember fastest time through loop -5: - subi r4,r4,1 // decrement outer loop count - cmpwi r4,0 // more to go? - bne 1b // loop if so - blr // return fastest time in r3 diff --git a/osfmk/ppc/commpage/gettimeofday.s b/osfmk/ppc/commpage/gettimeofday.s deleted file mode 100644 index e9645ee37..000000000 --- a/osfmk/ppc/commpage/gettimeofday.s +++ /dev/null @@ -1,255 +0,0 @@ -/* - * Copyright (c) 2003-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include // EXT, LEXT -#include -#include - -/* The red zone is used to move data between GPRs and FPRs: */ - -#define rzTicks -8 // elapsed ticks since timestamp (double) -#define rzSeconds -16 // seconds since timestamp (double) -#define rzUSeconds -24 // useconds since timestamp (double) - - - .text - .align 2 - - -// ********************************* -// * G E T T I M E O F D A Y _ 3 2 * -// ********************************* -// -// This is a subroutine of gettimeofday.c that gets the seconds and microseconds -// in user mode, usually without having to make a system call. We do not deal with -// the timezone. 
The kernel maintains the following values in the comm page: -// -// _COMM_PAGE_TIMESTAMP = 64 bit seconds timestamp -// -// _COMM_PAGE_TIMEBASE = the timebase at which the timestamp was valid -// -// _COMM_PAGE_SEC_PER_TICK = multiply timebase ticks by this to get seconds (double) -// -// _COMM_PAGE_2_TO_52 = double precision constant 2**52 -// -// _COMM_PAGE_10_TO_6 = double precision constant 10**6 -// -// We have to be careful to read these values atomically. The kernel updates them -// asynchronously to account for drift or time changes (eg, ntp.) We adopt the -// convention that (timebase==0) means the timestamp is invalid, in which case we -// return a bad status so our caller can make the system call. -// -// r3 = ptr to user's timeval structure (should not be null) - -gettimeofday_32: // int gettimeofday(timeval *tp); -0: - lwz r5,_COMM_PAGE_TIMEBASE+0(0) // r5,r6 = TBR at timestamp - lwz r6,_COMM_PAGE_TIMEBASE+4(0) - lwz r8,_COMM_PAGE_TIMESTAMP+4(0) // r8 = timestamp 32 bit seconds - lfd f1,_COMM_PAGE_SEC_PER_TICK(0) -1: - mftbu r10 // r10,r11 = current timebase - mftb r11 - mftbu r12 - cmplw r10,r12 - bne- 1b - or. r0,r5,r6 // timebase 0? (ie, is timestamp invalid?) - - sync // create a barrier (patched to NOP if UP) - - lwz r0,_COMM_PAGE_TIMEBASE+0(0) // then load data a 2nd time - lwz r12,_COMM_PAGE_TIMEBASE+4(0) - lwz r9,_COMM_PAGE_TIMESTAMP+4(0) - cmplw cr6,r5,r0 // did we read a consistent set? - cmplw cr7,r6,r12 - beq- 3f // timestamp is disabled so return bad status - cmplw cr5,r9,r8 - crand cr0_eq,cr6_eq,cr7_eq - crand cr0_eq,cr0_eq,cr5_eq - bne- 0b // loop until we have a consistent set of data - - subfc r11,r6,r11 // compute ticks since timestamp - lwz r9,_COMM_PAGE_2_TO_52(0) // get exponent for (2**52) - subfe r10,r5,r10 // complete 64-bit subtract - lfd f2,_COMM_PAGE_2_TO_52(0) // f2 <- (2**52) - srwi. r0,r10,2 // if more than 2**34 ticks have elapsed... 
- stw r11,rzTicks+4(r1) // store elapsed ticks into red zone - or r10,r10,r9 // convert long-long in (r10,r11) into double - bne- 3f // ...call kernel to reprime timestamp - - stw r10,rzTicks(r1) // complete double - - mffs f7 - mtfsfi 7,1 - lfd f3,rzTicks(r1) // get elapsed ticks since timestamp + 2**52 - fsub f4,f3,f2 // subtract 2**52 and normalize - fmul f5,f4,f1 // f5 <- elapsed seconds since timestamp - lfd f3,_COMM_PAGE_10_TO_6(0) // get 10**6 - fctiwz f6,f5 // convert to integer - stfd f6,rzSeconds(r1) // store integer seconds into red zone - stw r9,rzSeconds(r1) // prepare to reload as floating pt - lfd f6,rzSeconds(r1) // get seconds + 2**52 - fsub f6,f6,f2 // f6 <- integral seconds - fsub f6,f5,f6 // f6 <- fractional part of elapsed seconds - fmul f6,f6,f3 // f6 <- fractional elapsed useconds - fctiwz f6,f6 // convert useconds to integer - stfd f6,rzUSeconds(r1) // store useconds into red zone - mtfsf 0xff,f7 - - lwz r5,rzSeconds+4(r1) // r5 <- seconds since timestamp - lwz r7,rzUSeconds+4(r1) // r7 <- useconds since timestamp - add r6,r8,r5 // add elapsed seconds to timestamp seconds - - stw r6,0(r3) // store secs//usecs into user's timeval - stw r7,4(r3) - li r3,0 // return success - blr -3: // too long since last timestamp or this code is disabled - li r3,1 // return bad status so our caller will make syscall - blr - - COMMPAGE_DESCRIPTOR(gettimeofday_32,_COMM_PAGE_GETTIMEOFDAY,0,k64Bit,kCommPageSYNC+kCommPage32) - - -// *************************************** -// * G E T T I M E O F D A Y _ G 5 _ 3 2 * -// *************************************** -// -// This routine is called in 32-bit mode on 64-bit processors. A timeval is a struct of -// a long seconds and int useconds, so its size depends on mode. 
- -gettimeofday_g5_32: // int gettimeofday(timeval *tp); -0: - ld r6,_COMM_PAGE_TIMEBASE(0) // r6 = TBR at timestamp - ld r8,_COMM_PAGE_TIMESTAMP(0) // r8 = timestamp (seconds) - lfd f1,_COMM_PAGE_SEC_PER_TICK(0) - mftb r10 // r10 = get current timebase - lwsync // create a barrier if MP (patched to NOP if UP) - ld r11,_COMM_PAGE_TIMEBASE(0) // then get data a 2nd time - ld r12,_COMM_PAGE_TIMESTAMP(0) - cmpdi cr1,r6,0 // is the timestamp disabled? - cmpld cr6,r6,r11 // did we read a consistent set? - cmpld cr7,r8,r12 - beq-- cr1,3f // exit if timestamp disabled - crand cr6_eq,cr7_eq,cr6_eq - sub r11,r10,r6 // compute elapsed ticks from timestamp - bne-- cr6,0b // loop until we have a consistent set of data - - srdi. r0,r11,35 // has it been more than 2**35 ticks since last timestamp? - std r11,rzTicks(r1) // put ticks in redzone where we can "lfd" it - bne-- 3f // timestamp too old, so reprime - - mffs f7 - mtfsfi 7,1 - lfd f3,rzTicks(r1) // get elapsed ticks since timestamp (fixed pt) - fcfid f4,f3 // float the tick count - fmul f5,f4,f1 // f5 <- elapsed seconds since timestamp - lfd f3,_COMM_PAGE_10_TO_6(0) // get 10**6 - fctidz f6,f5 // convert integer seconds to fixed pt - stfd f6,rzSeconds(r1) // save fixed pt integer seconds in red zone - fcfid f6,f6 // float the integer seconds - fsub f6,f5,f6 // f6 <- fractional part of elapsed seconds - fmul f6,f6,f3 // f6 <- fractional elapsed useconds - fctidz f6,f6 // convert useconds to fixed pt integer - stfd f6,rzUSeconds(r1) // store useconds into red zone - mtfsf 0xff,f7 - - lwz r5,rzSeconds+4(r1) // r5 <- seconds since timestamp - lwz r7,rzUSeconds+4(r1) // r7 <- useconds since timestamp - add r6,r8,r5 // add elapsed seconds to timestamp seconds - - stw r6,0(r3) // store secs//usecs into user's timeval - stw r7,4(r3) - li r3,0 // return success - blr -3: // too long since last timestamp or this code is disabled - li r3,1 // return bad status so our caller will make syscall - blr - - 
COMMPAGE_DESCRIPTOR(gettimeofday_g5_32,_COMM_PAGE_GETTIMEOFDAY,k64Bit,0,kCommPageSYNC+kCommPage32) - - -// *************************************** -// * G E T T I M E O F D A Y _ G 5 _ 6 4 * -// *************************************** -// -// This routine is called in 64-bit mode on 64-bit processors. A timeval is a struct of -// a long seconds and int useconds, so its size depends on mode. - -gettimeofday_g5_64: // int gettimeofday(timeval *tp); -0: - ld r6,_COMM_PAGE_TIMEBASE(0) // r6 = TBR at timestamp - ld r8,_COMM_PAGE_TIMESTAMP(0) // r8 = timestamp (seconds) - lfd f1,_COMM_PAGE_SEC_PER_TICK(0) - mftb r10 // r10 = get current timebase - lwsync // create a barrier if MP (patched to NOP if UP) - ld r11,_COMM_PAGE_TIMEBASE(0) // then get data a 2nd time - ld r12,_COMM_PAGE_TIMESTAMP(0) - cmpdi cr1,r6,0 // is the timestamp disabled? - cmpld cr6,r6,r11 // did we read a consistent set? - cmpld cr7,r8,r12 - beq-- cr1,3f // exit if timestamp disabled - crand cr6_eq,cr7_eq,cr6_eq - sub r11,r10,r6 // compute elapsed ticks from timestamp - bne-- cr6,0b // loop until we have a consistent set of data - - srdi. r0,r11,35 // has it been more than 2**35 ticks since last timestamp? 
- std r11,rzTicks(r1) // put ticks in redzone where we can "lfd" it - bne-- 3f // timestamp too old, so reprime - - mffs f7 - mtfsfi 7,1 - lfd f3,rzTicks(r1) // get elapsed ticks since timestamp (fixed pt) - fcfid f4,f3 // float the tick count - fmul f5,f4,f1 // f5 <- elapsed seconds since timestamp - lfd f3,_COMM_PAGE_10_TO_6(0) // get 10**6 - fctidz f6,f5 // convert integer seconds to fixed pt - stfd f6,rzSeconds(r1) // save fixed pt integer seconds in red zone - fcfid f6,f6 // float the integer seconds - fsub f6,f5,f6 // f6 <- fractional part of elapsed seconds - fmul f6,f6,f3 // f6 <- fractional elapsed useconds - fctidz f6,f6 // convert useconds to fixed pt integer - stfd f6,rzUSeconds(r1) // store useconds into red zone - mtfsf 0xff,f7 - - lwz r5,rzSeconds+4(r1) // r5 <- seconds since timestamp - lwz r7,rzUSeconds+4(r1) // r7 <- useconds since timestamp - add r6,r8,r5 // add elapsed seconds to timestamp seconds - - std r6,0(r3) // store secs//usecs into user's timeval - stw r7,8(r3) - li r3,0 // return success - blr -3: // too long since last timestamp or this code is disabled - li r3,1 // return bad status so our caller will make syscall - blr - - COMMPAGE_DESCRIPTOR(gettimeofday_g5_64,_COMM_PAGE_GETTIMEOFDAY,k64Bit,0,kCommPageSYNC+kCommPage64) - - diff --git a/osfmk/ppc/commpage/mach_absolute_time.s b/osfmk/ppc/commpage/mach_absolute_time.s deleted file mode 100644 index be9345dad..000000000 --- a/osfmk/ppc/commpage/mach_absolute_time.s +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include // EXT, LEXT -#include -#include - - .text - .align 2 - - -// ********************************************* -// * M A C H _ A B S O L U T E _ T I M E _ 3 2 * -// ********************************************* - -mach_absolute_time_32: -1: - mftbu r3 - mftb r4 - mftbu r5 - cmplw r3,r5 - beqlr+ - b 1b - - COMMPAGE_DESCRIPTOR(mach_absolute_time_32,_COMM_PAGE_ABSOLUTE_TIME,0,k64Bit,kCommPage32) - - -// ********************************************* -// * M A C H _ A B S O L U T E _ T I M E _ 6 4 * -// ********************************************* -// -// This is the version that is called in 32-bit mode, so we return the TBR in r3 and r4. 
- -mach_absolute_time_64: - mftb r4 - srdi r3,r4,32 - blr - - COMMPAGE_DESCRIPTOR(mach_absolute_time_64,_COMM_PAGE_ABSOLUTE_TIME,k64Bit,0,kCommPage32) - - -// ************************************************* -// * M A C H _ A B S O L U T E _ T I M E _ L P 6 4 * -// ************************************************* -// -// This is the version that is called in 64-bit mode, so we return the TBR in r3. - -mach_absolute_time_lp64: - mftb r3 - blr - - COMMPAGE_DESCRIPTOR(mach_absolute_time_lp64,_COMM_PAGE_ABSOLUTE_TIME,k64Bit,0,kCommPage64) - - diff --git a/osfmk/ppc/commpage/memset_64.s b/osfmk/ppc/commpage/memset_64.s deleted file mode 100644 index 187e742b6..000000000 --- a/osfmk/ppc/commpage/memset_64.s +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include -#include - -/* - * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary - * to 64-bit mode for use in the 64-bit commpage. This "port" consists of the following - * simple transformations: - * - all word compares are changed to doubleword - * - all "srwi[.]" opcodes are changed to "srdi[.]" - * Nothing else is done. For this to work, the following rules must be - * carefully followed: - * - do not use carry or overflow - * - only use record mode if you are sure the results are mode-invariant - * for example, all "andi." and almost all "rlwinm." are fine - * - do not use "slwi", "slw", or "srw" - * An imaginative programmer could break the porting model in other ways, but the above - * are the most likely problem areas. It is perhaps surprising how well in practice - * this simple method works. - */ - - .text - .align 2 - - -/* ********************* - * * M E M S E T _ 6 4 * - * ********************* - * - * This is a subroutine called by Libc memset and _memset_pattern for large nonzero - * operands (zero operands are funneled into bzero.) This version is for a - * hypothetic processor that is 64-bit but not Altivec. - * It is not optimized, since it would only be used during bringup. 
- * - * Registers at entry: - * r4 = count of bytes to store (must be >= 32) - * r8 = ptr to the 1st byte to store (16-byte aligned) - * r9 = ptr to 16-byte pattern to store (16-byte aligned) - * When we return: - * r3 = not changed, since memset returns it - * r4 = bytes remaining to store (will be <32) - * r7 = not changed - * r8 = ptr to next byte to store (still 16-byte aligned) - * r12 = not changed (holds return value for memset) - */ - -memset_64: - srwi r0,r4,5 // get number of 32-byte chunks (>0) - ld r10,0(r9) // load pattern - ld r11,8(r9) - rlwinm r4,r4,0,0x1F // mask down count - mtctr r0 // set up loop count - - // Loop over 32-byte chunks. -1: - std r10,0(r8) - std r11,8(r8) - std r10,16(r8) - std r11,24(r8) - addi r8,r8,32 - bdnz++ 1b - - blr - - - COMMPAGE_DESCRIPTOR(memset_64,_COMM_PAGE_MEMSET_PATTERN,k64Bit,kHasAltivec, \ - kCommPageBoth+kPort32to64) diff --git a/osfmk/ppc/commpage/memset_g3.s b/osfmk/ppc/commpage/memset_g3.s deleted file mode 100644 index 469627f85..000000000 --- a/osfmk/ppc/commpage/memset_g3.s +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include -#include - - .text - .align 2 - -/* ********************* - * * M E M S E T _ G 3 * - * ********************* - * - * This is a subroutine called by Libc memset and _memset_pattern for large nonzero - * operands (zero operands are funneled into bzero.) This version is for - * 32-bit processors with a 32-byte cache line and no Altivec. - * - * Registers at entry: - * r4 = count of bytes to store (must be >= 32) - * r8 = ptr to the 1st byte to store (16-byte aligned) - * r9 = ptr to 16-byte pattern to store (16-byte aligned) - * When we return: - * r3 = not changed, since memset returns it - * r4 = bytes remaining to store (will be <32) - * r7 = not changed - * r8 = ptr to next byte to store (still 16-byte aligned) - * r12 = not changed (holds return value for memset) - */ - - .align 4 -memset_g3: - andi. r0,r8,16 // cache line aligned? - lfd f0,0(r9) // pick up the pattern in two FPRs - lfd f1,8(r9) - beq 1f // skip if already aligned - - // cache line align - - stfd f0,0(r8) // no, store another 16 bytes to align - stfd f1,8(r8) - subi r4,r4,16 // skip past the 16 bytes we just stored - addi r8,r8,16 - - // Loop over cache lines. This code uses a private protocol with the kernel: - // when the kernel emulates an alignment exception on a DCBZ that occurs in the - // commpage, it zeroes CR7. We use this to detect the case where we are operating on - // uncached memory, and do not use DCBZ again in this code. 
We assume that either - // all the operand is cacheable or none of it is, so we only check the first DCBZ. -1: - srwi. r0,r4,6 // get count of 64-byte chunks - cmpw cr7,r0,r0 // set cr7_eq (kernel turns off on alignment exception) - rlwinm r4,r4,0,0x3F // mask down to residual count (0..63) - beq Lleftover // no chunks - dcbz 0,r8 // zero first cache line (clearing cr7 if alignment exception) - mtctr r0 - li r6,32 // get an offset for DCBZ - beq+ cr7,LDcbzEnter // enter DCBZ loop (we didn't get an alignment exception) - - // Loop over 64-byte chunks without DCBZ. -LNoDcbz: - stfd f0,0(r8) - stfd f1,8(r8) - stfd f0,16(r8) - stfd f1,24(r8) - stfd f0,32(r8) - stfd f1,40(r8) - stfd f0,48(r8) - stfd f1,56(r8) - addi r8,r8,64 - bdnz LNoDcbz - - b Lleftover - - // Loop over 64-byte chunks using DCBZ. -LDcbz: - dcbz 0,r8 -LDcbzEnter: - dcbz r6,r8 - stfd f0,0(r8) - stfd f1,8(r8) - stfd f0,16(r8) - stfd f1,24(r8) - stfd f0,32(r8) - stfd f1,40(r8) - stfd f0,48(r8) - stfd f1,56(r8) - addi r8,r8,64 - bdnz LDcbz - - // Handle leftovers (0..63 bytes) -Lleftover: - srwi. r0,r4,4 // get count of 16-byte chunks - rlwinm r4,r4,0,0xF // mask down to residuals - beqlr // no 16-byte chunks so done - mtctr r0 -2: - stfd f0,0(r8) - stfd f1,8(r8) - addi r8,r8,16 - bdnz 2b - - blr - - COMMPAGE_DESCRIPTOR(memset_g3,_COMM_PAGE_MEMSET_PATTERN,kCache32,kHasAltivec, \ - kCommPage32) diff --git a/osfmk/ppc/commpage/memset_g4.s b/osfmk/ppc/commpage/memset_g4.s deleted file mode 100644 index 9e33f45f2..000000000 --- a/osfmk/ppc/commpage/memset_g4.s +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include -#include - - .text - .align 2 - - -/* ********************* - * * M E M S E T _ G 4 * - * ********************* - * - * This is a subroutine called by Libc memset and memset_pattern for large nonzero - * operands (zero operands are funneled into bzero.) This version is for - * 32-bit processors with a 32-byte cache line and Altivec. - * - * Registers at entry: - * r4 = count of bytes to store (must be >= 32) - * r8 = ptr to the 1st byte to store (16-byte aligned) - * r9 = ptr to 16-byte pattern to store (16-byte aligned) - * When we return: - * r3 = not changed, since memset returns it - * r4 = bytes remaining to store (will be <32) - * r7 = not changed - * r8 = ptr to next byte to store (still 16-byte aligned) - * r12 = not changed (holds return value for memset) - */ - -#define kBig (3*64) // big enough to warrant using dcba (NB: must be >= 3*64) - - .align 4 -memset_g4: - cmplwi cr1,r4,kBig // big enough to warrant using dcbz? 
- mfspr r2,vrsave // we'll be using VRs - oris r0,r2,0x8000 // we use vr0 - andi. r5,r8,0x10 // is ptr 32-byte aligned? - mtspr vrsave,r0 - li r5,16 // get offsets for "stvx" - lvx v0,0,r9 // load the pattern into v0 - li r6,32 - blt cr1,LShort // not big enough to bother with dcba - li r9,48 - - // cache line align - - beq 2f // already aligned - stvx v0,0,r8 // store another 16 bytes to align - addi r8,r8,16 - subi r4,r4,16 - - // Set up for inner loop. -2: - srwi r0,r4,6 // get count of 64-byte chunks (>=2) - dcba 0,r8 // pre-allocate first cache line (possibly nop'd) - rlwinm r4,r4,0,0x3F // mask down to residual count (0..63) - subic r0,r0,1 // loop 1-too-few times - li r10,64 // get offsets to DCBA one chunk ahead - li r11,64+32 - mtctr r0 - dcba r6,r8 // zero 2nd cache line (possibly nop'd) - b 3f // enter DCBA loop - - // Loop over 64-byte chunks. We DCBA one chunk ahead, which is a little faster. - // Note that some G4s do not benefit from the DCBAs. We nop them in that case. - - .align 4 -3: - dcba r10,r8 // zero one 64-byte chunk ahead (possibly nop'd) - dcba r11,r8 - stvx v0,0,r8 - stvx v0,r5,r8 - stvx v0,r6,r8 - stvx v0,r9,r8 - addi r8,r8,64 - bdnz+ 3b - - // Last chunk, which we've already DCBAd. - - stvx v0,0,r8 - stvx v0,r5,r8 - stvx v0,r6,r8 - stvx v0,r9,r8 - addi r8,r8,64 - - // loop over 32-byte chunks at end -LShort: - srwi. r0,r4,5 // get count of 32-byte chunks - rlwinm r4,r4,0,0x1F // mask down to residual count (0..31) - beq 7f // no chunks so done - mtctr r0 -6: - stvx v0,0,r8 - stvx v0,r5,r8 - addi r8,r8,32 - bdnz 6b -7: - mtspr vrsave,r2 // restore caller's vrsave - blr - - - COMMPAGE_DESCRIPTOR(memset_g4,_COMM_PAGE_MEMSET_PATTERN,kCache32+kHasAltivec,0, \ - kCommPageDCBA+kCommPage32) diff --git a/osfmk/ppc/commpage/memset_g5.s b/osfmk/ppc/commpage/memset_g5.s deleted file mode 100644 index 6acf98579..000000000 --- a/osfmk/ppc/commpage/memset_g5.s +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright (c) 2003 Apple Computer, Inc. 
All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include -#include - - .text - .align 2 -/* - * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary - * to 64-bit mode for use in the 64-bit commpage. This "port" consists of the following - * simple transformations: - * - all word compares are changed to doubleword - * - all "srwi[.]" opcodes are changed to "srdi[.]" - * Nothing else is done. For this to work, the following rules must be - * carefully followed: - * - do not use carry or overflow - * - only use record mode if you are sure the results are mode-invariant - * for example, all "andi." and almost all "rlwinm." 
are fine - * - do not use "slwi", "slw", or "srw" - * An imaginative programmer could break the porting model in other ways, but the above - * are the most likely problem areas. It is perhaps surprising how well in practice - * this simple method works. - */ - -/* ********************* - * * M E M S E T _ G 5 * - * ********************* - * - * This is a subroutine called by Libc memset and memset_pattern for large nonzero - * operands (zero operands are funneled into bzero.) This version is for - * 64-bit processors with a 128-byte cache line and Altivec. - * - * Registers at entry: - * r4 = count of bytes to store (must be >= 32) - * r8 = ptr to the 1st byte to store (16-byte aligned) - * r9 = ptr to 16-byte pattern to store (16-byte aligned) - * When we return: - * r3 = not changed, since memset returns it - * r4 = bytes remaining to store (will be <32) - * r7 = not changed - * r8 = ptr to next byte to store (still 16-byte aligned) - * r12 = not changed (holds return value for memset) - */ - -#define kBig (3*128) // big enough to warrant using dcbz (NB: must be >= 3*128) - - .align 5 -memset_g5: - cmplwi cr1,r4,kBig // big enough to warrant using dcbz? - neg r10,r8 // start to align ptr - mfspr r2,vrsave // we'll be using VRs - andi. r10,r10,0x70 // get #bytes to cache line align - oris r0,r2,0x8000 // we use vr0 - mtspr vrsave,r0 - li r5,16 // get offsets for "stvx" - lvx v0,0,r9 // load the pattern into v0 - li r6,32 - blt cr1,LShort // not big enough to bother with dcbz - li r9,48 - - // cache line align - - beq 2f // already aligned -1: - subic. r10,r10,16 // more to go? - stvx v0,0,r8 - addi r8,r8,16 - subi r4,r4,16 - bne 1b - - // Loop over cache lines. This code uses a private protocol with the kernel: - // when the kernel emulates an alignment exception on a DCBZ that occurs in the - // commpage, it zeroes CR7. We use this to detect the case where we are operating on - // uncached memory, and do not use DCBZ again in this code. 
We assume that either - // all the operand is cacheable or none of it is, so we only check the first DCBZ. -2: - cmpw cr7,r3,r3 // set cr7_eq (kernel will clear if DCBZ faults) - dcbzl 0,r8 // zero first cache line (clearing cr7 if alignment exception) - srwi r0,r4,7 // get #cache lines (>=2) - rlwinm r4,r4,0,0x7F // mask down to residual count (0..127) - bne-- cr7,LNoDcbz // exit if we took alignment exception on the first DCBZ - subic r0,r0,1 // loop 1-too-few times - li r11,128 // set DCBZ look-ahead - mtctr r0 - b 3f // use loop that DCBZs - - // Loop over cache lines. We DCBZ one line ahead, which is a little faster. - - .align 5 -3: - dcbzl r11,r8 // zero one line ahead - addi r10,r8,64 - stvx v0,0,r8 - stvx v0,r5,r8 - stvx v0,r6,r8 - stvx v0,r9,r8 - addi r8,r8,128 - stvx v0,0,r10 - stvx v0,r5,r10 - stvx v0,r6,r10 - stvx v0,r9,r10 - bdnz++ 3b - - li r0,1 // we've already DCBZ'd the last line -LNoDcbz: // r0: loop count - mtctr r0 - - // Loop which does not DCBZ. Normally this is only used for last cache line, - // because we've already zeroed it. -4: - addi r10,r8,64 - stvx v0,0,r8 - stvx v0,r5,r8 - stvx v0,r6,r8 - stvx v0,r9,r8 - addi r8,r8,128 - stvx v0,0,r10 - stvx v0,r5,r10 - stvx v0,r6,r10 - stvx v0,r9,r10 - bdnz-- 4b // optimize for the cacheable case - - // loop over 32-byte chunks -LShort: - srwi. r0,r4,5 // get count of 32-byte chunks - rlwinm r4,r4,0,0x1F // mask down to residual count (0..31) - beq 7f // no chunks so done - mtctr r0 -6: - stvx v0,0,r8 - stvx v0,r5,r8 - addi r8,r8,32 - bdnz++ 6b -7: - mtspr vrsave,r2 // restore caller's vrsave - blr - - - COMMPAGE_DESCRIPTOR(memset_g5,_COMM_PAGE_MEMSET_PATTERN,kCache128+k64Bit+kHasAltivec,0, \ - kCommPageBoth+kPort32to64) diff --git a/osfmk/ppc/commpage/pthread.s b/osfmk/ppc/commpage/pthread.s deleted file mode 100644 index 58dd6c4aa..000000000 --- a/osfmk/ppc/commpage/pthread.s +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include // EXT, LEXT -#include -#include - - .text - .align 2 - -#define USER_SPRG3 259 // user-mode-readable encoding for SPRG3 - - -// *********************************************************** -// * P T H R E A D _ G E T S P E C I F I C _ S P R G 3 _ 3 2 * -// *********************************************************** -// -// For processors with user-readable SPRG3, in 32-bit mode. 
Called with: -// r3 = word number -// r4 = offset to thread specific data (_PTHREAD_TSD_OFFSET) - -pthread_getspecific_sprg3_32: - slwi r5,r3,2 // convert word# to byte offset - mfspr r3,USER_SPRG3 // get per-thread cookie - add r5,r5,r4 // add in offset to first word - lwzx r3,r3,r5 // get the thread-specific word - blr - - COMMPAGE_DESCRIPTOR(pthread_getspecific_sprg3_32,_COMM_PAGE_PTHREAD_GETSPECIFIC,k64Bit,0,kCommPage32) - - -// *********************************************************** -// * P T H R E A D _ G E T S P E C I F I C _ S P R G 3 _ 6 4 * -// *********************************************************** -// -// For processors with user-readable SPRG3, in 64-bit mode. This may not be used -// because the 64-bit ABI uses r13 for the thread-local-data pointer. Called with: -// r3 = word number -// r4 = offset to thread specific data (_PTHREAD_TSD_OFFSET) - -pthread_getspecific_sprg3_64: - sldi r5,r3,3 // convert double-word# to byte offset - mfspr r3,USER_SPRG3 // get per-thread cookie - add r5,r5,r4 // add in offset to first word - ldx r3,r3,r5 // get the thread-specific doubleword - blr - - COMMPAGE_DESCRIPTOR(pthread_getspecific_sprg3_64,_COMM_PAGE_PTHREAD_GETSPECIFIC,k64Bit,0,kCommPage64) - - -// *************************************** -// * P T H R E A D _ S E L F _ S P R G 3 * -// *************************************** -// -// For processors with user-readable SPRG3. Useable both in 32 and 64-bit modes. - -pthread_self_sprg3: - mfspr r3,USER_SPRG3 // get per-thread cookie - blr - - COMMPAGE_DESCRIPTOR(pthread_self_sprg3,_COMM_PAGE_PTHREAD_SELF,k64Bit,0,kCommPageBoth) - - -// ******************************************************* -// * P T H R E A D _ G E T S P E C I F I C _ U F T R A P * -// ******************************************************* -// -// For processors that use the Ultra-Fast-Trap to get the thread-specific ptr. 
-// Called with: -// r3 = word number -// r4 = offset to thread specific data (_PTHREAD_TSD_OFFSET) - -pthread_getspecific_uftrap: - slwi r5,r3,2 // convert word# to byte offset - li r0,0x7FF2 // magic "pthread_self" ultra-fast trap code - sc - add r5,r5,r4 // add in offset to first word - lwzx r3,r3,r5 // get the thread-specific word - blr - - COMMPAGE_DESCRIPTOR(pthread_getspecific_uftrap,_COMM_PAGE_PTHREAD_GETSPECIFIC,0,k64Bit,kCommPage32) - - -// ***************************************** -// * P T H R E A D _ S E L F _ U F T R A P * -// ***************************************** -// -// For processors that use the Ultra-Fast-Trap to get the thread-specific ptr. - -pthread_self_uftrap: - li r0,0x7FF2 // magic "pthread_self" ultra-fast trap code - sc // get r3==TLDP - blr - - COMMPAGE_DESCRIPTOR(pthread_self_uftrap,_COMM_PAGE_PTHREAD_SELF,0,k64Bit,kCommPage32) diff --git a/osfmk/ppc/commpage/spinlocks.s b/osfmk/ppc/commpage/spinlocks.s deleted file mode 100644 index 480f49050..000000000 --- a/osfmk/ppc/commpage/spinlocks.s +++ /dev/null @@ -1,247 +0,0 @@ -/* - * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include // EXT, LEXT -#include -#include - - .text - .align 2 - -#define MP_SPIN_TRIES 1000 - - -/* The user mode spinlock library. There are many versions, - * in order to take advantage of a few special cases: - * - no barrier instructions (SYNC,ISYNC) are needed if UP - * - 64-bit processors can use LWSYNC instead of SYNC (if MP) - * - 32-bit processors can use ISYNC/EIEIO instead of SYNC (if MP) - * - branch hints appropriate to the processor (+ vs ++ etc) - * - potentially custom relinquish strategies (not used at present) - * - fixes for errata as necessary - * - * The convention for lockwords is that 0==free and -1==locked. - */ - - -spinlock_32_try_mp: - mr r5, r3 - li r3, 1 -1: - lwarx r4,0,r5 - li r6,-1 // locked == -1 - cmpwi r4,0 - bne- 2f - stwcx. r6,0,r5 - isync // cancel speculative execution - beqlr+ - b 1b -2: - li r3,0 // we did not get the lock - blr - - COMMPAGE_DESCRIPTOR(spinlock_32_try_mp,_COMM_PAGE_SPINLOCK_TRY,0,k64Bit+kUP,kCommPage32) - - -spinlock_32_try_up: - mr r5, r3 - li r3, 1 -1: - lwarx r4,0,r5 - li r6,-1 // locked == -1 - cmpwi r4,0 - bne- 2f - stwcx. r6,0,r5 - beqlr+ - b 1b -2: - li r3,0 // we did not get the lock - blr - - COMMPAGE_DESCRIPTOR(spinlock_32_try_up,_COMM_PAGE_SPINLOCK_TRY,kUP,k64Bit,kCommPage32) - - -spinlock_32_lock_mp: - li r5,MP_SPIN_TRIES -1: - lwarx r4,0,r3 - li r6,-1 // locked == -1 - cmpwi r4,0 - bne- 2f - stwcx. 
r6,0,r3 - isync // cancel speculative execution - beqlr+ // we return void - b 1b -2: - subic. r5,r5,1 // try again before relinquish? - bne 1b - ba _COMM_PAGE_RELINQUISH - - COMMPAGE_DESCRIPTOR(spinlock_32_lock_mp,_COMM_PAGE_SPINLOCK_LOCK,0,k64Bit+kUP,kCommPage32) - - -spinlock_32_lock_up: -1: - lwarx r4,0,r3 - li r6,-1 // locked == -1 - cmpwi r4,0 - bnea- _COMM_PAGE_RELINQUISH // always depress on UP (let lock owner run) - stwcx. r6,0,r3 - beqlr+ // we return void - b 1b - - COMMPAGE_DESCRIPTOR(spinlock_32_lock_up,_COMM_PAGE_SPINLOCK_LOCK,kUP,k64Bit,kCommPage32) - - -spinlock_32_unlock_mp: - li r4,0 - isync // complete prior stores before unlock - eieio // (using isync/eieio is faster than a sync) - stw r4,0(r3) - blr - - COMMPAGE_DESCRIPTOR(spinlock_32_unlock_mp,_COMM_PAGE_SPINLOCK_UNLOCK,0,k64Bit+kUP,kCommPage32) - - -spinlock_32_unlock_up: - li r4,0 - stw r4,0(r3) - blr - - COMMPAGE_DESCRIPTOR(spinlock_32_unlock_up,_COMM_PAGE_SPINLOCK_UNLOCK,kUP,k64Bit,kCommPage32) - - -spinlock_64_try_mp: - mr r5, r3 - li r3, 1 -1: - lwarx r4,0,r5 - li r6,-1 // locked == -1 - cmpwi r4,0 - bne-- 2f - stwcx. r6,0,r5 - isync // cancel speculative execution - beqlr++ - b 1b -2: - li r6,-4 - stwcx. r5,r6,r1 // clear the pending reservation (using red zone) - li r3,0 // we did not get the lock - blr - - COMMPAGE_DESCRIPTOR(spinlock_64_try_mp,_COMM_PAGE_SPINLOCK_TRY,k64Bit,kUP,kCommPageBoth) - - -spinlock_64_try_up: - mr r5, r3 - li r3, 1 -1: - lwarx r4,0,r5 - li r6,-1 // locked == -1 - cmpwi r4,0 - bne-- 2f - stwcx. r6,0,r5 - beqlr++ - b 1b -2: - li r6,-4 - stwcx. r5,r6,r1 // clear the pending reservation (using red zone) - li r3,0 // we did not get the lock - blr - - COMMPAGE_DESCRIPTOR(spinlock_64_try_up,_COMM_PAGE_SPINLOCK_TRY,k64Bit+kUP,0,kCommPageBoth) - - -spinlock_64_lock_mp: - li r5,MP_SPIN_TRIES -1: - lwarx r4,0,r3 - li r6,-1 // locked == -1 - cmpwi r4,0 - bne-- 2f - stwcx. 
r6,0,r3 - isync // cancel speculative execution - beqlr++ // we return void - b 1b -2: - li r6,-4 - stwcx. r3,r6,r1 // clear the pending reservation (using red zone) - subic. r5,r5,1 // try again before relinquish? - bne-- 1b // mispredict this one (a cheap back-off) - ba _COMM_PAGE_RELINQUISH - - COMMPAGE_DESCRIPTOR(spinlock_64_lock_mp,_COMM_PAGE_SPINLOCK_LOCK,k64Bit,kUP,kCommPageBoth) - - -spinlock_64_lock_up: -1: - lwarx r4,0,r3 - li r6,-1 // locked == -1 - cmpwi r4,0 - bne-- 2f - stwcx. r6,0,r3 - beqlr++ // we return void - b 1b -2: // always relinquish on UP (let lock owner run) - li r6,-4 - stwcx. r3,r6,r1 // clear the pending reservation (using red zone) - ba _COMM_PAGE_RELINQUISH - - COMMPAGE_DESCRIPTOR(spinlock_64_lock_up,_COMM_PAGE_SPINLOCK_LOCK,k64Bit+kUP,0,kCommPageBoth) - - -spinlock_64_unlock_mp: - lwsync // complete prior stores before unlock - li r4,0 - stw r4,0(r3) - blr - - COMMPAGE_DESCRIPTOR(spinlock_64_unlock_mp,_COMM_PAGE_SPINLOCK_UNLOCK,k64Bit,kUP,kCommPageBoth) - - -spinlock_64_unlock_up: - li r4,0 - stw r4,0(r3) - blr - - COMMPAGE_DESCRIPTOR(spinlock_64_unlock_up,_COMM_PAGE_SPINLOCK_UNLOCK,k64Bit+kUP,0,kCommPageBoth) - - -spinlock_relinquish: - mr r12,r3 // preserve lockword ptr across relinquish - li r3,0 // THREAD_NULL - li r4,1 // SWITCH_OPTION_DEPRESS - li r5,1 // timeout (ms) - li r0,-61 // SYSCALL_THREAD_SWITCH - sc // relinquish - mr r3,r12 - ba _COMM_PAGE_SPINLOCK_LOCK - - COMMPAGE_DESCRIPTOR(spinlock_relinquish,_COMM_PAGE_RELINQUISH,0,0,kCommPageBoth) - diff --git a/osfmk/ppc/conf.c b/osfmk/ppc/conf.c deleted file mode 100644 index adeb60ea7..000000000 --- a/osfmk/ppc/conf.c +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). 
You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * @APPLE_FREE_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
- * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ - -#include -#include -#include - -/* - * Clock device subsystem configuration. The clock_list[] - * table contains the clock structures for all clocks in - * the system. - */ - -extern struct clock_ops sysclk_ops, calend_ops; - -/* - * List of clock devices. - */ -struct clock clock_list[] = { - - /* SYSTEM_CLOCK */ - { &sysclk_ops, NULL, NULL }, - - /* CALENDAR_CLOCK */ - { &calend_ops, NULL, NULL }, -}; -int clock_count = sizeof(clock_list) / sizeof(clock_list[0]); - - diff --git a/osfmk/ppc/console_feed.c b/osfmk/ppc/console_feed.c deleted file mode 100644 index 8f029d49d..000000000 --- a/osfmk/ppc/console_feed.c +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_FREE_COPYRIGHT@ - * - */ - -/* Intercept mach console output and supply it to a user application */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if MACH_KDB -#include -#endif /* MACH_KDB */ - -static struct cirbuf cons_feed_cb; -static int cons_feed_count = 0; -io_req_t cons_feed_queued = 0; - -/* console feed lock should be taken at splhigh */ -decl_simple_lock_data(,cons_feed_lock) - -boolean_t cons_feed_read_done(io_req_t ior); - -io_return_t -console_feed_open( - dev_t dev, - dev_mode_t flag, - io_req_t ior) -{ - spl_t s; - - simple_lock_init(&cons_feed_lock, 0); -#if MACH_KDB - if (console_is_serial()) { - return D_DEVICE_DOWN; - } -#endif /* MACH_KDB */ - cb_alloc(&cons_feed_cb, CONSOLE_FEED_BUFSIZE); - s = splhigh(); - simple_lock(&cons_feed_lock); - cons_feed_count++; - simple_unlock(&cons_feed_lock); - splx(s); - return D_SUCCESS; -} - -void -console_feed_close( - dev_t dev) -{ - spl_t s; - - s = splhigh(); - simple_lock(&cons_feed_lock); - cons_feed_count--; - simple_unlock(&cons_feed_lock); - splx(s); - - console_feed_cancel_and_flush(); - cb_free(&cons_feed_cb); - - return; -} - -/* A routine that can be called from a panic or other problem - * situation. It switches off the console feed and dumps any - * remaining buffered information to the original console - * (usually the screen). 
It doesn't free up the buffer, since - * it tries to be as minimal as possible - */ - -void console_feed_cancel_and_flush(void) -{ - int c; - spl_t s; - -#if NCONSFEED > 0 -#if MACH_KDB - if (console_is_serial()) { - return; - } -#endif /* MACH_KDB */ - - s = splhigh(); - simple_lock(&cons_feed_lock); - if (cons_feed_count == 0) { - simple_unlock(&cons_feed_lock); - splx(s); - return; - } - cons_feed_count = 0; - simple_unlock(&cons_feed_lock); - splx(s); - - do { - c = getc(&cons_feed_cb); - if (c == -1) - break; - cnputc(c); - } while (1); -#endif /* NCONSFEED > 0 */ -} - -io_return_t -console_feed_read( - dev_t dev, - io_req_t ior) -{ - spl_t s; - kern_return_t rc; - int count; - - rc = device_read_alloc(ior, (vm_size_t) ior->io_count); - if (rc != KERN_SUCCESS) - return rc; - - s = splhigh(); - simple_lock(&cons_feed_lock); - - ior->io_residual = ior->io_count; - - count = q_to_b(&cons_feed_cb, (char *) ior->io_data, ior->io_count); - if (count == 0) { - if (ior->io_mode & D_NOWAIT) { - rc = D_WOULD_BLOCK; - } - if (cons_feed_queued == NULL) { - ior->io_done = cons_feed_read_done; - cons_feed_queued = ior; - rc = D_IO_QUEUED; - } else { - /* Can't queue multiple read requests yet */ - rc = D_INVALID_OPERATION; - } - simple_unlock(&cons_feed_lock); - splx(s); - return rc; - } - - simple_unlock(&cons_feed_lock); - splx(s); - - ior->io_residual -= count; - - iodone(ior); - - if (ior->io_op & IO_SYNC) { - iowait(ior); - } - - return D_SUCCESS; -} - -/* Called when data is ready and there's a queued-up read waiting */ -boolean_t cons_feed_read_done(io_req_t ior) -{ - spl_t s; - int count; - - s = splhigh(); - simple_lock(&cons_feed_lock); - - count = q_to_b(&cons_feed_cb, (char *) ior->io_data, ior->io_count); - if (count == 0) { - if (cons_feed_queued == NULL) { - ior->io_done = cons_feed_read_done; - cons_feed_queued = ior; - } - simple_unlock(&cons_feed_lock); - splx(s); - return FALSE; - } - - simple_unlock(&cons_feed_lock); - splx(s); - - ior->io_residual -= 
count; - ds_read_done(ior); - - return TRUE; -} - -/* This routine is called from putc() - it should return TRUE if - * the character should be passed on to a physical console, FALSE - * if the feed has intercepted the character. It may be called from - * under interrupt (even splhigh) - */ - -boolean_t console_feed_putc(char c) -{ - spl_t s; - io_req_t ior; - boolean_t retval; - -#if MACH_KDB - if (db_active) { - return TRUE; - } -#endif /* MACH_KDB */ - - retval=TRUE; /* TRUE : character should be displayed now */ - if (!cons_feed_count) { - return TRUE; - } - s = splhigh(); - simple_lock(&cons_feed_lock); - if (!cons_feed_count) { - simple_unlock(&cons_feed_lock); - splx(s); - return TRUE; - } - /* queue up the data if we can */ - if (!putc(c, &cons_feed_cb)) { - /* able to stock the character */ - retval = FALSE; - } - if (cons_feed_queued != NULL) { - /* Queued up request - service it */ - ior = cons_feed_queued; - cons_feed_queued = NULL; - simple_unlock(&cons_feed_lock); - splx(s); - iodone(ior); - retval=FALSE; - } else { - simple_unlock(&cons_feed_lock); - splx(s); - } - return retval; -} diff --git a/osfmk/ppc/console_feed_entries.h b/osfmk/ppc/console_feed_entries.h deleted file mode 100644 index 729955043..000000000 --- a/osfmk/ppc/console_feed_entries.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_FREE_COPYRIGHT@ - * - */ - -extern io_return_t console_feed_open( - dev_t dev, - dev_mode_t flag, - io_req_t ior); - -extern void console_feed_close( - dev_t dev); - -extern io_return_t console_feed_read( - dev_t dev, - io_req_t ior); - -extern boolean_t console_feed_putc(char c); -extern void console_feed_cancel_and_flush(void); - -#define CONSOLE_FEED_BUFSIZE 4096 diff --git a/osfmk/ppc/cpu.c b/osfmk/ppc/cpu.c deleted file mode 100644 index 774b94bbd..000000000 --- a/osfmk/ppc/cpu.c +++ /dev/null @@ -1,1184 +0,0 @@ -/* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -unsigned int real_ncpus = 1; -unsigned int max_ncpus = MAX_CPUS; - -decl_simple_lock_data(static,rht_lock); - -static unsigned int rht_state = 0; -#define RHT_WAIT 0x01 -#define RHT_BUSY 0x02 - -decl_simple_lock_data(static,SignalReadyLock); - -struct SIGtimebase { - volatile boolean_t avail; - volatile boolean_t ready; - volatile boolean_t done; - uint64_t abstime; -}; - -perfCallback perfCpuSigHook; /* Pointer to CHUD cpu signal hook routine */ - -extern uint32_t debugger_sync; - -/* - * Forward definitions - */ - -void cpu_sync_timebase( - void); - -void cpu_timebase_signal_handler( - struct per_proc_info *proc_info, - struct SIGtimebase *timebaseAddr); - -/* - * Routine: cpu_bootstrap - * 
Function: - */ -void -cpu_bootstrap( - void) -{ - simple_lock_init(&rht_lock,0); - simple_lock_init(&SignalReadyLock,0); -} - - -/* - * Routine: cpu_init - * Function: - */ -void -cpu_init( - void) -{ - struct per_proc_info *proc_info; - - proc_info = getPerProc(); - - /* - * Restore the TBR. - */ - if (proc_info->save_tbu != 0 || proc_info->save_tbl != 0) { - mttb(0); - mttbu(proc_info->save_tbu); - mttb(proc_info->save_tbl); - } - - proc_info->rtcPop = EndOfAllTime; /* forget any existing decrementer setting */ - etimer_resync_deadlines(); /* Now that the time base is sort of correct, request the next timer pop */ - - proc_info->cpu_type = CPU_TYPE_POWERPC; - proc_info->cpu_subtype = (cpu_subtype_t)proc_info->pf.rptdProc; - proc_info->cpu_threadtype = CPU_THREADTYPE_NONE; - proc_info->running = TRUE; - -} - -/* - * Routine: cpu_machine_init - * Function: - */ -void -cpu_machine_init( - void) -{ - struct per_proc_info *proc_info; - volatile struct per_proc_info *mproc_info; - - - proc_info = getPerProc(); - mproc_info = PerProcTable[master_cpu].ppe_vaddr; - - if (proc_info != mproc_info) { - simple_lock(&rht_lock); - if (rht_state & RHT_WAIT) - thread_wakeup(&rht_state); - rht_state &= ~(RHT_BUSY|RHT_WAIT); - simple_unlock(&rht_lock); - } - - PE_cpu_machine_init(proc_info->cpu_id, !(proc_info->cpu_flags & BootDone)); - - if (proc_info->hibernate) { - uint32_t tbu, tbl; - - do { - tbu = mftbu(); - tbl = mftb(); - } while (mftbu() != tbu); - - proc_info->hibernate = 0; - hibernate_machine_init(); - - // hibernate_machine_init() could take minutes and we don't want timeouts - // to fire as soon as scheduling starts. Reset timebase so it appears - // no time has elapsed, as it would for regular sleep. 
- mttb(0); - mttbu(tbu); - mttb(tbl); - } - - if (proc_info != mproc_info) { - while (!((mproc_info->cpu_flags) & SignalReady)) - continue; - cpu_sync_timebase(); - } - - ml_init_interrupt(); - if (proc_info != mproc_info) - simple_lock(&SignalReadyLock); - proc_info->cpu_flags |= BootDone|SignalReady; - if (proc_info != mproc_info) { - if (proc_info->ppXFlags & SignalReadyWait) { - hw_atomic_and_noret(&proc_info->ppXFlags, ~SignalReadyWait); - thread_wakeup(&proc_info->cpu_flags); - } - simple_unlock(&SignalReadyLock); - pmsPark(); /* Timers should be cool now, park the power management stepper */ - } -} - - -/* - * Routine: cpu_per_proc_alloc - * Function: - */ -struct per_proc_info * -cpu_per_proc_alloc( - void) -{ - struct per_proc_info *proc_info = NULL; - void *interrupt_stack = NULL; - void *debugger_stack = NULL; - - if ((proc_info = (struct per_proc_info*)kalloc(sizeof(struct per_proc_info))) == (struct per_proc_info*)0) - return (struct per_proc_info *)NULL; - if ((interrupt_stack = kalloc(INTSTACK_SIZE)) == 0) { - kfree(proc_info, sizeof(struct per_proc_info)); - return (struct per_proc_info *)NULL; - } - - if ((debugger_stack = kalloc(kernel_stack_size)) == 0) { - kfree(proc_info, sizeof(struct per_proc_info)); - kfree(interrupt_stack, INTSTACK_SIZE); - return (struct per_proc_info *)NULL; - } - - bzero((void *)proc_info, sizeof(struct per_proc_info)); - - /* Set physical address of the second page */ - proc_info->pp2ndPage = (addr64_t)pmap_find_phys(kernel_pmap, - ((addr64_t)(unsigned int)proc_info) + 0x1000) - << PAGE_SHIFT; - proc_info->next_savearea = (uint64_t)save_get_init(); - proc_info->pf = BootProcInfo.pf; - proc_info->istackptr = (vm_offset_t)interrupt_stack + INTSTACK_SIZE - FM_SIZE; - proc_info->intstack_top_ss = proc_info->istackptr; - proc_info->debstackptr = (vm_offset_t)debugger_stack + kernel_stack_size - FM_SIZE; - proc_info->debstack_top_ss = proc_info->debstackptr; - - queue_init(&proc_info->rtclock_timer.queue); - 
proc_info->rtclock_timer.deadline = EndOfAllTime; - - return proc_info; - -} - - -/* - * Routine: cpu_per_proc_free - * Function: - */ -void -cpu_per_proc_free( - struct per_proc_info *proc_info -) -{ - if (proc_info->cpu_number == master_cpu) - return; - kfree((void *)(proc_info->intstack_top_ss - INTSTACK_SIZE + FM_SIZE), INTSTACK_SIZE); - kfree((void *)(proc_info->debstack_top_ss - kernel_stack_size + FM_SIZE), kernel_stack_size); - kfree((void *)proc_info, sizeof(struct per_proc_info)); /* Release the per_proc */ -} - - -/* - * Routine: cpu_per_proc_register - * Function: - */ -kern_return_t -cpu_per_proc_register( - struct per_proc_info *proc_info -) -{ - int cpu; - - cpu = OSIncrementAtomic(&real_ncpus); - - if (real_ncpus > max_ncpus) { - return KERN_FAILURE; - } - - proc_info->cpu_number = cpu; - PerProcTable[cpu].ppe_vaddr = proc_info; - PerProcTable[cpu].ppe_paddr = (addr64_t)pmap_find_phys(kernel_pmap, (addr64_t)(unsigned int)proc_info) << PAGE_SHIFT; - eieio(); - return KERN_SUCCESS; -} - - -/* - * Routine: cpu_start - * Function: - */ -kern_return_t -cpu_start( - int cpu) -{ - struct per_proc_info *proc_info; - kern_return_t ret; - mapping_t *mp; - - proc_info = PerProcTable[cpu].ppe_vaddr; - - if (cpu == cpu_number()) { - PE_cpu_machine_init(proc_info->cpu_id, !(proc_info->cpu_flags & BootDone)); - ml_init_interrupt(); - proc_info->cpu_flags |= BootDone|SignalReady; - - return KERN_SUCCESS; - } else { - proc_info->cpu_flags &= BootDone; - proc_info->interrupts_enabled = 0; - proc_info->pending_ast = AST_NONE; - proc_info->istackptr = proc_info->intstack_top_ss; - proc_info->rtcPop = EndOfAllTime; - proc_info->FPU_owner = NULL; - proc_info->VMX_owner = NULL; - proc_info->pms.pmsStamp = 0; /* Dummy transition time */ - proc_info->pms.pmsPop = EndOfAllTime; /* Set the pop way into the future */ - proc_info->pms.pmsState = pmsParked; /* Park the stepper */ - proc_info->pms.pmsCSetCmd = pmsCInit; /* Set dummy initial hardware state */ - mp = (mapping_t 
*)(&proc_info->ppUMWmp); - mp->mpFlags = 0x01000000 | mpLinkage | mpPerm | 1; - mp->mpSpace = invalSpace; - - if (proc_info->start_paddr == EXCEPTION_VECTOR(T_RESET)) { - - simple_lock(&rht_lock); - while (rht_state & RHT_BUSY) { - rht_state |= RHT_WAIT; - thread_sleep_usimple_lock((event_t)&rht_state, - &rht_lock, THREAD_UNINT); - } - rht_state |= RHT_BUSY; - simple_unlock(&rht_lock); - - ml_phys_write((vm_offset_t)&ResetHandler + 0, - RESET_HANDLER_START); - ml_phys_write((vm_offset_t)&ResetHandler + 4, - (vm_offset_t)_start_cpu); - ml_phys_write((vm_offset_t)&ResetHandler + 8, - (vm_offset_t)&PerProcTable[cpu]); - } -/* - * Note: we pass the current time to the other processor here. He will load it - * as early as possible so that there is a chance that it is close to accurate. - * After the machine is up a while, we will officially resync the clocks so - * that all processors are the same. This is just to get close. - */ - - ml_get_timebase((unsigned long long *)&proc_info->ruptStamp); - - __asm__ volatile("sync"); /* Commit to storage */ - __asm__ volatile("isync"); /* Wait a second */ - ret = PE_cpu_start(proc_info->cpu_id, - proc_info->start_paddr, (vm_offset_t)proc_info); - - if (ret != KERN_SUCCESS) { - if (proc_info->start_paddr == EXCEPTION_VECTOR(T_RESET)) { - simple_lock(&rht_lock); - if (rht_state & RHT_WAIT) - thread_wakeup(&rht_state); - rht_state &= ~(RHT_BUSY|RHT_WAIT); - simple_unlock(&rht_lock); - }; - } else { - simple_lock(&SignalReadyLock); - if (!((*(volatile short *)&proc_info->cpu_flags) & SignalReady)) { - hw_atomic_or_noret(&proc_info->ppXFlags, SignalReadyWait); - thread_sleep_simple_lock((event_t)&proc_info->cpu_flags, - &SignalReadyLock, THREAD_UNINT); - } - simple_unlock(&SignalReadyLock); - - } - return(ret); - } -} - -/* - * Routine: cpu_exit_wait - * Function: - */ -void -cpu_exit_wait( - int cpu) -{ - struct per_proc_info *tpproc; - - if ( cpu != master_cpu) { - tpproc = PerProcTable[cpu].ppe_vaddr; - while (!((*(volatile short 
*)&tpproc->cpu_flags) & SleepState)) {}; - } -} - - -/* - * Routine: cpu_doshutdown - * Function: - */ -void -cpu_doshutdown( - void) -{ - enable_preemption(); - processor_offline(current_processor()); -} - - -/* - * Routine: cpu_sleep - * Function: - */ -void -cpu_sleep( - void) -{ - struct per_proc_info *proc_info; - unsigned int i; - unsigned int wait_ncpus_sleep, ncpus_sleep; - facility_context *fowner; - - proc_info = getPerProc(); - - proc_info->running = FALSE; - - timer_queue_shutdown(&proc_info->rtclock_timer.queue); - proc_info->rtclock_timer.deadline = EndOfAllTime; - - fowner = proc_info->FPU_owner; /* Cache this */ - if(fowner) /* If anyone owns FPU, save it */ - fpu_save(fowner); - proc_info->FPU_owner = NULL; /* Set no fpu owner now */ - - fowner = proc_info->VMX_owner; /* Cache this */ - if(fowner) vec_save(fowner); /* If anyone owns vectors, save it */ - proc_info->VMX_owner = NULL; /* Set no vector owner now */ - - if (proc_info->cpu_number == master_cpu) { - proc_info->cpu_flags &= BootDone; - proc_info->interrupts_enabled = 0; - proc_info->pending_ast = AST_NONE; - - if (proc_info->start_paddr == EXCEPTION_VECTOR(T_RESET)) { - ml_phys_write((vm_offset_t)&ResetHandler + 0, - RESET_HANDLER_START); - ml_phys_write((vm_offset_t)&ResetHandler + 4, - (vm_offset_t)_start_cpu); - ml_phys_write((vm_offset_t)&ResetHandler + 8, - (vm_offset_t)&PerProcTable[master_cpu]); - - __asm__ volatile("sync"); - __asm__ volatile("isync"); - } - - wait_ncpus_sleep = real_ncpus-1; - ncpus_sleep = 0; - while (wait_ncpus_sleep != ncpus_sleep) { - ncpus_sleep = 0; - for(i=1; i < real_ncpus ; i++) { - if ((*(volatile short *)&(PerProcTable[i].ppe_vaddr->cpu_flags)) & SleepState) - ncpus_sleep++; - } - } - - } - - /* - * Save the TBR before stopping. 
- */ - do { - proc_info->save_tbu = mftbu(); - proc_info->save_tbl = mftb(); - } while (mftbu() != proc_info->save_tbu); - - PE_cpu_machine_quiesce(proc_info->cpu_id); -} - - -/* - * Routine: cpu_signal - * Function: - * Here is where we send a message to another processor. So far we only have two: - * SIGPast and SIGPdebug. SIGPast is used to preempt and kick off threads (this is - * currently disabled). SIGPdebug is used to enter the debugger. - * - * We set up the SIGP function to indicate that this is a simple message and set the - * order code (MPsigpParm0) to SIGPast or SIGPdebug). After finding the per_processor - * block for the target, we lock the message block. Then we set the parameter(s). - * Next we change the lock (also called "busy") to "passing" and finally signal - * the other processor. Note that we only wait about 1ms to get the message lock. - * If we time out, we return failure to our caller. It is their responsibility to - * recover. - */ -kern_return_t -cpu_signal( - int target, - int signal, - unsigned int p1, - unsigned int p2) -{ - - unsigned int holdStat; - struct per_proc_info *tpproc, *mpproc; - int busybitset=0; - -#if DEBUG - if(((unsigned int)target) >= MAX_CPUS) panic("cpu_signal: invalid target CPU - %08X\n", target); -#endif - - mpproc = getPerProc(); /* Point to our block */ - tpproc = PerProcTable[target].ppe_vaddr; /* Point to the target's block */ - if(mpproc == tpproc) return KERN_FAILURE; /* Cannot signal ourselves */ - - if(!tpproc->running) return KERN_FAILURE; - - if (!(tpproc->cpu_flags & SignalReady)) return KERN_FAILURE; - - if((tpproc->MPsigpStat & MPsigpMsgp) == MPsigpMsgp) { /* Is there an unreceived message already pending? */ - - if(signal == SIGPwake) { /* SIGPwake can merge into all others... 
*/ - mpproc->hwCtr.numSIGPmwake++; /* Account for merged wakes */ - return KERN_SUCCESS; - } - - if((signal == SIGPast) && (tpproc->MPsigpParm0 == SIGPast)) { /* We can merge ASTs */ - mpproc->hwCtr.numSIGPmast++; /* Account for merged ASTs */ - return KERN_SUCCESS; /* Don't bother to send this one... */ - } - - if (tpproc->MPsigpParm0 == SIGPwake) { - if (hw_lock_mbits(&tpproc->MPsigpStat, (MPsigpMsgp | MPsigpAck), - (MPsigpBusy | MPsigpPass ), MPsigpBusy, 0)) { - busybitset = 1; - mpproc->hwCtr.numSIGPmwake++; - } - } - } - - if((busybitset == 0) && - (!hw_lock_mbits(&tpproc->MPsigpStat, MPsigpMsgp, 0, MPsigpBusy, - (gPEClockFrequencyInfo.timebase_frequency_hz >> 11)))) { /* Try to lock the message block with a .5ms timeout */ - mpproc->hwCtr.numSIGPtimo++; /* Account for timeouts */ - return KERN_FAILURE; /* Timed out, take your ball and go home... */ - } - - holdStat = MPsigpBusy | MPsigpPass | (MPsigpSigp << 8) | mpproc->cpu_number; /* Set up the signal status word */ - tpproc->MPsigpParm0 = signal; /* Set message order */ - tpproc->MPsigpParm1 = p1; /* Set additional parm */ - tpproc->MPsigpParm2 = p2; /* Set additional parm */ - - __asm__ volatile("sync"); /* Make sure it's all there */ - - tpproc->MPsigpStat = holdStat; /* Set status and pass the lock */ - __asm__ volatile("eieio"); /* I'm a paraniod freak */ - - if (busybitset == 0) - PE_cpu_signal(mpproc->cpu_id, tpproc->cpu_id); /* Kick the other processor */ - - return KERN_SUCCESS; /* All is goodness and rainbows... */ -} - - -/* - * Routine: cpu_signal_handler - * Function: - * Here is where we implement the receiver of the signaling protocol. - * We wait for the signal status area to be passed to us. Then we snarf - * up the status, the sender, and the 3 potential parms. Next we release - * the lock and signal the other guy. 
- */ -void -cpu_signal_handler(void) -{ - unsigned int holdStat, holdParm0, holdParm1, holdParm2; - unsigned int *parmAddr; - struct per_proc_info *proc_info; - int cpu; - broadcastFunc xfunc; - cpu = cpu_number(); /* Get the CPU number */ - - proc_info = getPerProc(); - -/* - * Since we've been signaled, wait about 31 ms for the signal lock to pass - */ - if(!hw_lock_mbits(&proc_info->MPsigpStat, (MPsigpMsgp | MPsigpAck), (MPsigpBusy | MPsigpPass), - (MPsigpBusy | MPsigpPass | MPsigpAck), (gPEClockFrequencyInfo.timebase_frequency_hz >> 5))) { - panic("cpu_signal_handler: Lock pass timed out\n"); - } - - holdStat = proc_info->MPsigpStat; /* Snarf stat word */ - holdParm0 = proc_info->MPsigpParm0; /* Snarf parameter */ - holdParm1 = proc_info->MPsigpParm1; /* Snarf parameter */ - holdParm2 = proc_info->MPsigpParm2; /* Snarf parameter */ - - __asm__ volatile("isync"); /* Make sure we don't unlock until memory is in */ - - proc_info->MPsigpStat = holdStat & ~(MPsigpMsgp | MPsigpAck | MPsigpFunc); /* Release lock */ - - switch ((holdStat & MPsigpFunc) >> 8) { /* Decode function code */ - - case MPsigpIdle: /* Was function cancelled? */ - return; /* Yup... */ - - case MPsigpSigp: /* Signal Processor message? */ - - switch (holdParm0) { /* Decode SIGP message order */ - - case SIGPast: /* Should we do an AST? */ - proc_info->hwCtr.numSIGPast++; /* Count this one */ -#if 0 - kprintf("cpu_signal_handler: AST check on cpu %x\n", cpu_number()); -#endif - ast_check((processor_t)proc_info->processor); - return; /* All done... */ - - case SIGPcpureq: /* CPU specific function? 
*/ - - proc_info->hwCtr.numSIGPcpureq++; /* Count this one */ - switch (holdParm1) { /* Select specific function */ - - case CPRQtimebase: - - cpu_timebase_signal_handler(proc_info, (struct SIGtimebase *)holdParm2); - return; - - case CPRQsegload: - return; - - case CPRQchud: - parmAddr = (unsigned int *)holdParm2; /* Get the destination address */ - if(perfCpuSigHook) { - struct savearea *ssp = current_thread()->machine.pcb; - if(ssp) { - (perfCpuSigHook)(parmAddr[1] /* request */, ssp, 0, 0); - } - } - parmAddr[1] = 0; - parmAddr[0] = 0; /* Show we're done */ - return; - - case CPRQscom: - if(((scomcomm *)holdParm2)->scomfunc) { /* Are we writing */ - ((scomcomm *)holdParm2)->scomstat = ml_scom_write(((scomcomm *)holdParm2)->scomreg, ((scomcomm *)holdParm2)->scomdata); /* Write scom */ - } - else { /* No, reading... */ - ((scomcomm *)holdParm2)->scomstat = ml_scom_read(((scomcomm *)holdParm2)->scomreg, &((scomcomm *)holdParm2)->scomdata); /* Read scom */ - } - return; - - case CPRQsps: - { - ml_set_processor_speed_slave(holdParm2); - return; - } - default: - panic("cpu_signal_handler: unknown CPU request - %08X\n", holdParm1); - return; - } - - - case SIGPdebug: /* Enter the debugger? */ - - proc_info->hwCtr.numSIGPdebug++; /* Count this one */ - proc_info->debugger_is_slave++; /* Bump up the count to show we're here */ - (void)hw_atomic_sub(&debugger_sync, 1); /* Show we've received the 'rupt */ - __asm__ volatile("tw 4,r3,r3"); /* Enter the debugger */ - return; /* All done now... */ - - case SIGPwake: /* Wake up CPU */ - proc_info->hwCtr.numSIGPwake++; /* Count this one */ - return; /* No need to do anything, the interrupt does it all... */ - - case SIGPcall: /* Call function on CPU */ - proc_info->hwCtr.numSIGPcall++; /* Count this one */ - xfunc = (broadcastFunc)holdParm1; /* Do this since I can't seem to figure C out */ - xfunc(holdParm2); /* Call the passed function */ - return; /* Done... 
*/ - - default: - panic("cpu_signal_handler: unknown SIGP message order - %08X\n", holdParm0); - return; - - } - - default: - panic("cpu_signal_handler: unknown SIGP function - %08X\n", (holdStat & MPsigpFunc) >> 8); - return; - - } - panic("cpu_signal_handler: we should never get here\n"); -} - - -/* - * Routine: cpu_sync_timebase - * Function: - */ -void -cpu_sync_timebase( - void) -{ - natural_t tbu, tbl; - boolean_t intr; - struct SIGtimebase syncClkSpot; - - intr = ml_set_interrupts_enabled(FALSE); /* No interruptions in here */ - - syncClkSpot.avail = FALSE; - syncClkSpot.ready = FALSE; - syncClkSpot.done = FALSE; - - while (cpu_signal(master_cpu, SIGPcpureq, CPRQtimebase, - (unsigned int)&syncClkSpot) != KERN_SUCCESS) - continue; - - while (syncClkSpot.avail == FALSE) - continue; - - isync(); - - /* - * We do the following to keep the compiler from generating extra stuff - * in tb set part - */ - tbu = syncClkSpot.abstime >> 32; - tbl = (uint32_t)syncClkSpot.abstime; - - mttb(0); - mttbu(tbu); - mttb(tbl); - - syncClkSpot.ready = TRUE; - - while (syncClkSpot.done == FALSE) - continue; - - etimer_resync_deadlines(); /* Start the timer */ - (void)ml_set_interrupts_enabled(intr); -} - - -/* - * Routine: cpu_timebase_signal_handler - * Function: - */ -void -cpu_timebase_signal_handler( - struct per_proc_info *proc_info, - struct SIGtimebase *timebaseAddr) -{ - unsigned int tbu, tbu2, tbl; - - if(proc_info->time_base_enable != (void(*)(cpu_id_t, boolean_t ))NULL) - proc_info->time_base_enable(proc_info->cpu_id, FALSE); - - timebaseAddr->abstime = 0; /* Touch to force into cache */ - sync(); - - do { - asm volatile(" mftbu %0" : "=r" (tbu)); - asm volatile(" mftb %0" : "=r" (tbl)); - asm volatile(" mftbu %0" : "=r" (tbu2)); - } while (tbu != tbu2); - - timebaseAddr->abstime = ((uint64_t)tbu << 32) | tbl; - sync(); /* Force order */ - - timebaseAddr->avail = TRUE; - - while (timebaseAddr->ready == FALSE) - continue; - - if(proc_info->time_base_enable != 
(void(*)(cpu_id_t, boolean_t ))NULL) - proc_info->time_base_enable(proc_info->cpu_id, TRUE); - - timebaseAddr->done = TRUE; -} - - -/* - * Routine: cpu_control - * Function: - */ -kern_return_t -cpu_control( - int slot_num, - processor_info_t info, - unsigned int count) -{ - struct per_proc_info *proc_info; - cpu_type_t tcpu_type; - cpu_subtype_t tcpu_subtype; - processor_pm_regs_t perf_regs; - processor_control_cmd_t cmd; - boolean_t oldlevel; -#define MMCR0_SUPPORT_MASK 0xf83f1fff -#define MMCR1_SUPPORT_MASK 0xffc00000 -#define MMCR2_SUPPORT_MASK 0x80000000 - - proc_info = PerProcTable[slot_num].ppe_vaddr; - tcpu_type = proc_info->cpu_type; - tcpu_subtype = proc_info->cpu_subtype; - cmd = (processor_control_cmd_t) info; - - if (count < PROCESSOR_CONTROL_CMD_COUNT) - return(KERN_FAILURE); - - if ( tcpu_type != cmd->cmd_cpu_type || - tcpu_subtype != cmd->cmd_cpu_subtype) - return(KERN_FAILURE); - - if (perfmon_acquire_facility(current_task()) != KERN_SUCCESS) { - return(KERN_RESOURCE_SHORTAGE); /* cpu performance facility in use by another task */ - } - - switch (cmd->cmd_op) - { - case PROCESSOR_PM_CLR_PMC: /* Clear Performance Monitor Counters */ - switch (tcpu_subtype) - { - case CPU_SUBTYPE_POWERPC_750: - case CPU_SUBTYPE_POWERPC_7400: - case CPU_SUBTYPE_POWERPC_7450: - { - oldlevel = ml_set_interrupts_enabled(FALSE); /* disable interrupts */ - mtpmc1(0x0); - mtpmc2(0x0); - mtpmc3(0x0); - mtpmc4(0x0); - ml_set_interrupts_enabled(oldlevel); /* enable interrupts */ - return(KERN_SUCCESS); - } - default: - return(KERN_FAILURE); - } /* tcpu_subtype */ - case PROCESSOR_PM_SET_REGS: /* Set Performance Monitor Registors */ - switch (tcpu_subtype) - { - case CPU_SUBTYPE_POWERPC_750: - if (count < (PROCESSOR_CONTROL_CMD_COUNT + - PROCESSOR_PM_REGS_COUNT_POWERPC_750)) - return(KERN_FAILURE); - else - { - perf_regs = (processor_pm_regs_t)cmd->cmd_pm_regs; - oldlevel = ml_set_interrupts_enabled(FALSE); /* disable interrupts */ - mtmmcr0(PERFMON_MMCR0(perf_regs) & 
MMCR0_SUPPORT_MASK); - mtpmc1(PERFMON_PMC1(perf_regs)); - mtpmc2(PERFMON_PMC2(perf_regs)); - mtmmcr1(PERFMON_MMCR1(perf_regs) & MMCR1_SUPPORT_MASK); - mtpmc3(PERFMON_PMC3(perf_regs)); - mtpmc4(PERFMON_PMC4(perf_regs)); - ml_set_interrupts_enabled(oldlevel); /* enable interrupts */ - return(KERN_SUCCESS); - } - case CPU_SUBTYPE_POWERPC_7400: - case CPU_SUBTYPE_POWERPC_7450: - if (count < (PROCESSOR_CONTROL_CMD_COUNT + - PROCESSOR_PM_REGS_COUNT_POWERPC_7400)) - return(KERN_FAILURE); - else - { - perf_regs = (processor_pm_regs_t)cmd->cmd_pm_regs; - oldlevel = ml_set_interrupts_enabled(FALSE); /* disable interrupts */ - mtmmcr0(PERFMON_MMCR0(perf_regs) & MMCR0_SUPPORT_MASK); - mtpmc1(PERFMON_PMC1(perf_regs)); - mtpmc2(PERFMON_PMC2(perf_regs)); - mtmmcr1(PERFMON_MMCR1(perf_regs) & MMCR1_SUPPORT_MASK); - mtpmc3(PERFMON_PMC3(perf_regs)); - mtpmc4(PERFMON_PMC4(perf_regs)); - mtmmcr2(PERFMON_MMCR2(perf_regs) & MMCR2_SUPPORT_MASK); - ml_set_interrupts_enabled(oldlevel); /* enable interrupts */ - return(KERN_SUCCESS); - } - default: - return(KERN_FAILURE); - } /* switch tcpu_subtype */ - case PROCESSOR_PM_SET_MMCR: - switch (tcpu_subtype) - { - case CPU_SUBTYPE_POWERPC_750: - if (count < (PROCESSOR_CONTROL_CMD_COUNT + - PROCESSOR_PM_REGS_COUNT_POWERPC_750)) - return(KERN_FAILURE); - else - { - perf_regs = (processor_pm_regs_t)cmd->cmd_pm_regs; - oldlevel = ml_set_interrupts_enabled(FALSE); /* disable interrupts */ - mtmmcr0(PERFMON_MMCR0(perf_regs) & MMCR0_SUPPORT_MASK); - mtmmcr1(PERFMON_MMCR1(perf_regs) & MMCR1_SUPPORT_MASK); - ml_set_interrupts_enabled(oldlevel); /* enable interrupts */ - return(KERN_SUCCESS); - } - case CPU_SUBTYPE_POWERPC_7400: - case CPU_SUBTYPE_POWERPC_7450: - if (count < (PROCESSOR_CONTROL_CMD_COUNT + - PROCESSOR_PM_REGS_COUNT_POWERPC_7400)) - return(KERN_FAILURE); - else - { - perf_regs = (processor_pm_regs_t)cmd->cmd_pm_regs; - oldlevel = ml_set_interrupts_enabled(FALSE); /* disable interrupts */ - mtmmcr0(PERFMON_MMCR0(perf_regs) & 
MMCR0_SUPPORT_MASK); - mtmmcr1(PERFMON_MMCR1(perf_regs) & MMCR1_SUPPORT_MASK); - mtmmcr2(PERFMON_MMCR2(perf_regs) & MMCR2_SUPPORT_MASK); - ml_set_interrupts_enabled(oldlevel); /* enable interrupts */ - return(KERN_SUCCESS); - } - default: - return(KERN_FAILURE); - } /* tcpu_subtype */ - default: - return(KERN_FAILURE); - } /* switch cmd_op */ -} - - -/* - * Routine: cpu_info_count - * Function: - */ -kern_return_t -cpu_info_count( - processor_flavor_t flavor, - unsigned int *count) -{ - cpu_subtype_t tcpu_subtype; - - /* - * For now, we just assume that all CPUs are of the same type - */ - tcpu_subtype = PerProcTable[master_cpu].ppe_vaddr->cpu_subtype; - switch (flavor) { - case PROCESSOR_PM_REGS_INFO: - switch (tcpu_subtype) { - case CPU_SUBTYPE_POWERPC_750: - - *count = PROCESSOR_PM_REGS_COUNT_POWERPC_750; - return(KERN_SUCCESS); - - case CPU_SUBTYPE_POWERPC_7400: - case CPU_SUBTYPE_POWERPC_7450: - - *count = PROCESSOR_PM_REGS_COUNT_POWERPC_7400; - return(KERN_SUCCESS); - - default: - *count = 0; - return(KERN_INVALID_ARGUMENT); - } /* switch tcpu_subtype */ - - case PROCESSOR_TEMPERATURE: - *count = PROCESSOR_TEMPERATURE_COUNT; - return (KERN_SUCCESS); - - default: - *count = 0; - return(KERN_INVALID_ARGUMENT); - - } -} - - -/* - * Routine: cpu_info - * Function: - */ -kern_return_t -cpu_info( - processor_flavor_t flavor, - int slot_num, - processor_info_t info, - unsigned int *count) -{ - cpu_subtype_t tcpu_subtype; - processor_pm_regs_t perf_regs; - boolean_t oldlevel; - - tcpu_subtype = PerProcTable[slot_num].ppe_vaddr->cpu_subtype; - - switch (flavor) { - case PROCESSOR_PM_REGS_INFO: - - perf_regs = (processor_pm_regs_t) info; - - switch (tcpu_subtype) { - case CPU_SUBTYPE_POWERPC_750: - - if (*count < PROCESSOR_PM_REGS_COUNT_POWERPC_750) - return(KERN_FAILURE); - - oldlevel = ml_set_interrupts_enabled(FALSE); /* disable interrupts */ - PERFMON_MMCR0(perf_regs) = mfmmcr0(); - PERFMON_PMC1(perf_regs) = mfpmc1(); - PERFMON_PMC2(perf_regs) = mfpmc2(); - 
PERFMON_MMCR1(perf_regs) = mfmmcr1(); - PERFMON_PMC3(perf_regs) = mfpmc3(); - PERFMON_PMC4(perf_regs) = mfpmc4(); - ml_set_interrupts_enabled(oldlevel); /* enable interrupts */ - - *count = PROCESSOR_PM_REGS_COUNT_POWERPC_750; - return(KERN_SUCCESS); - - case CPU_SUBTYPE_POWERPC_7400: - case CPU_SUBTYPE_POWERPC_7450: - - if (*count < PROCESSOR_PM_REGS_COUNT_POWERPC_7400) - return(KERN_FAILURE); - - oldlevel = ml_set_interrupts_enabled(FALSE); /* disable interrupts */ - PERFMON_MMCR0(perf_regs) = mfmmcr0(); - PERFMON_PMC1(perf_regs) = mfpmc1(); - PERFMON_PMC2(perf_regs) = mfpmc2(); - PERFMON_MMCR1(perf_regs) = mfmmcr1(); - PERFMON_PMC3(perf_regs) = mfpmc3(); - PERFMON_PMC4(perf_regs) = mfpmc4(); - PERFMON_MMCR2(perf_regs) = mfmmcr2(); - ml_set_interrupts_enabled(oldlevel); /* enable interrupts */ - - *count = PROCESSOR_PM_REGS_COUNT_POWERPC_7400; - return(KERN_SUCCESS); - - default: - return(KERN_FAILURE); - } /* switch tcpu_subtype */ - - case PROCESSOR_TEMPERATURE: /* Get the temperature of a processor */ - - *info = -1; /* Get the temperature */ - return(KERN_FAILURE); - - default: - return(KERN_INVALID_ARGUMENT); - - } /* flavor */ -} - - -/* - * Routine: cpu_to_processor - * Function: - */ -processor_t -cpu_to_processor( - int cpu) -{ - return ((processor_t)PerProcTable[cpu].ppe_vaddr->processor); -} - - -/* - * Routine: slot_type - * Function: - */ -cpu_type_t -slot_type( - int slot_num) -{ - return (PerProcTable[slot_num].ppe_vaddr->cpu_type); -} - - -/* - * Routine: slot_subtype - * Function: - */ -cpu_subtype_t -slot_subtype( - int slot_num) -{ - return (PerProcTable[slot_num].ppe_vaddr->cpu_subtype); -} - - -/* - * Routine: slot_threadtype - * Function: - */ -cpu_threadtype_t -slot_threadtype( - int slot_num) -{ - return (PerProcTable[slot_num].ppe_vaddr->cpu_threadtype); -} - - -/* - * Routine: cpu_type - * Function: - */ -cpu_type_t -cpu_type(void) -{ - return (getPerProc()->cpu_type); -} - - -/* - * Routine: cpu_subtype - * Function: - */ -cpu_subtype_t 
-cpu_subtype(void) -{ - return (getPerProc()->cpu_subtype); -} - - -/* - * Routine: cpu_threadtype - * Function: - */ -cpu_threadtype_t -cpu_threadtype(void) -{ - return (getPerProc()->cpu_threadtype); -} - -/* - * Call a function on all running processors - * - * Note that the synch paramter is used to wait until all functions are complete. - * It is not passed to the other processor and must be known by the called function. - * The called function must do a thread_wakeup on the synch if it decrements the - * synch count to 0. - * - * We start by initializing the synchronizer to the number of possible cpus. - * The we signal each popssible processor. - * If the signal fails, we count it. We also skip our own. - * When we are finished signaling, we adjust the syncronizer count down buy the number of failed signals. - * Because the signaled processors are also decrementing the synchronizer count, the adjustment may result in a 0 - * If this happens, all other processors are finished with the function. - * If so, we clear the wait and continue - * Otherwise, we block waiting for the other processor(s) to finish. - * - * Meanwhile, the other processors are decrementing the synchronizer when they are done - * If it goes to zero, thread_wakeup is called to run the broadcaster - * - * Note that because we account for the broadcaster in the synchronization count, we will not get any - * premature wakeup calls. - * - * Also note that when we do the adjustment of the synchronization count, it the result is 0, it means that - * all of the other processors are finished. Otherwise, we know that there is at least one more. - * When that thread decrements the synchronizer to zero, it will do a thread_wake. - * - */ - -int32_t -cpu_broadcast(uint32_t *synch, broadcastFunc func, uint32_t parm) -{ - int failsig; - unsigned int cpu, ocpu; - - cpu = cpu_number(); /* Who are we? */ - failsig = 0; /* Clear called processor count */ - - if(real_ncpus > 1) { /* Are we just a uni? 
*/ - - *synch = real_ncpus; /* Set how many we are going to try */ - assert_wait((event_t)synch, THREAD_UNINT); /* If more than one processor, we may have to wait */ - - for(ocpu = 0; ocpu < real_ncpus; ocpu++) { /* Tell everyone to call */ - - if(ocpu == cpu) continue; /* If we talk to ourselves, people will wonder... */ - - if(KERN_SUCCESS != cpu_signal(ocpu, SIGPcall, (uint32_t)func, parm)) { /* Call the function on the other processor */ - failsig++; /* Count failed signals */ - } - } - - if (hw_atomic_sub(synch, failsig + 1) == 0) - clear_wait(current_thread(), THREAD_AWAKENED); /* Clear wait if we never signalled or all of the others finished */ - else - thread_block(THREAD_CONTINUE_NULL); /* Wait for everyone to get into step... */ - } - - return (real_ncpus - failsig - 1); /* Return the number of guys actually signalled... */ -} diff --git a/osfmk/ppc/cpu_capabilities.h b/osfmk/ppc/cpu_capabilities.h deleted file mode 100644 index 268666cd2..000000000 --- a/osfmk/ppc/cpu_capabilities.h +++ /dev/null @@ -1,254 +0,0 @@ -/* - * Copyright (c) 2003-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#ifdef PRIVATE - -#ifndef _PPC_CPU_CAPABILITIES_H -#define _PPC_CPU_CAPABILITIES_H - -/* _cpu_capabilities - * - * This is the authoritative way to determine from user mode what - * implementation-specific processor features are available. - * This API is only supported for Apple internal use. - */ - -#ifndef __ASSEMBLER__ - -extern int _cpu_capabilities; - -#endif /* __ASSEMBLER__ */ - -/* Bit definitions for _cpu_capabilities: */ - -#define kHasAltivec 0x00000001 -#define k64Bit 0x00000002 // 64-bit GPRs -#define kCache32 0x00000004 // cache line size is 32 bytes -#define kCache64 0x00000008 -#define kCache128 0x00000010 -#define kDcbaRecommended 0x00000020 // PPC: dcba is available and recommended -#define kDcbaAvailable 0x00000040 // PPC: dcba is available (but may or may not be recommended) -#define kDataStreamsRecommended 0x00000080 // PPC: dst, dstt, dstst, dss, and dssall instructions available and recommended -#define kDataStreamsAvailable 0x00000100 // PPC: dst, dstt, dstst, dss, and dssall instructions available (may or may not be rec'd) -#define kDcbtStreamsRecommended 0x00000200 // PPC: enhanced dcbt instruction available and recommended -#define kDcbtStreamsAvailable 0x00000400 // PPC: enhanced dcbt instruction available (but may or may not be recommended) -#define kFastThreadLocalStorage 0x00000800 // TLS ptr is kept in a user-mode-readable register - -#define kUP 0x00008000 // set if (kNumCPUs == 1) -#define kNumCPUs 0x00FF0000 // 
number of CPUs (see _NumCPUs() below) - -#define kNumCPUsShift 16 // see _NumCPUs() below - -#define kHasGraphicsOps 0x08000000 // PPC: has fres, frsqrte, and fsel instructions -#define kHasStfiwx 0x10000000 // PPC: has stfiwx instruction -#define kHasFsqrt 0x20000000 // PPC: has fsqrt and fsqrts instructions - -#ifndef __ASSEMBLER__ - -static __inline__ int _NumCPUs( void ) { return (_cpu_capabilities & kNumCPUs) >> kNumCPUsShift; } - -#endif /* __ASSEMBLER__ */ - - -/* - * The shared kernel/user "comm page(s)": - * - * The last eight pages of every address space are reserved for the kernel/user - * "comm area". Because they can be addressed via a sign-extended 16-bit field, - * it is particularly efficient to access code or data in the comm area with - * absolute branches (ba, bla, bca) or absolute load/stores ("lwz r0,-4096(0)"). - * Because the comm area can be reached from anywhere, dyld is not needed. - * Although eight pages are reserved, presently only two are populated and mapped. - * - * Routines on the comm page(s) can be thought of as the firmware for extended processor - * instructions, whose opcodes are special forms of "bla". Ie, they are cpu - * capabilities. During system initialization, the kernel populates the comm page with - * code customized for the particular processor and platform. - * - * Because Mach VM cannot map the last page of an address space, the max length of - * the comm area is seven pages. - */ - -#define _COMM_PAGE_BASE_ADDRESS (-8*4096) // start at page -8, ie 0xFFFF8000 -#define _COMM_PAGE_AREA_LENGTH ( 7*4096) // reserved length of entire comm area -#define _COMM_PAGE_AREA_USED ( 2*4096) // we use two pages so far - -/* The following set of definitions are used in the kernel, which needs to distinguish between - * the 32 and 64-bit commpage addresses and lengths. On PPC they are the same, but on Intel - * they are not.
- */ -#define _COMM_PAGE32_BASE_ADDRESS ( _COMM_PAGE_BASE_ADDRESS ) -#define _COMM_PAGE64_BASE_ADDRESS ( _COMM_PAGE_BASE_ADDRESS ) -#define _COMM_PAGE32_AREA_LENGTH ( _COMM_PAGE_AREA_LENGTH ) -#define _COMM_PAGE64_AREA_LENGTH ( _COMM_PAGE_AREA_LENGTH ) -#define _COMM_PAGE32_AREA_USED ( _COMM_PAGE_AREA_USED ) -#define _COMM_PAGE64_AREA_USED ( _COMM_PAGE_AREA_USED ) - -/* The Objective-C runtime fixed address page to optimize message dispatch */ -#define _OBJC_PAGE_BASE_ADDRESS (-20*4096) // start at page -20, ie 0xFFFEC000 - -/* - * Objective-C needs an "absolute" area all the way up to the top of the - * address space. - * For a ppc32 task, that area gets allocated at runtime from user space. - * For a ppc64 task, that area is not within the user-accessible address range, - * so we pre-allocate it at exec time (see vm_map_exec()) along with the - * comm page. - * - * NOTE: that means we can't "nest" the 64-bit comm page... - */ -#define _COMM_PAGE32_OBJC_SIZE 0ULL -#define _COMM_PAGE32_OBJC_BASE 0ULL -#if 0 -#define _COMM_PAGE64_OBJC_SIZE (4 * 4096) -#define _COMM_PAGE64_OBJC_BASE (_OBJC_PAGE_BASE_ADDRESS) -#else -/* - * PPC51: ppc64 is limited to 51-bit addresses. - * PPC64 has a 51-bit address space limit, so we can't just go and - * map the Obj-C area up there. We would have to create a nested pmap - * and make a special mapping that redirects the large virtual addresses to - * that other address space with lower addresses that fit within the 51-bit - * limit. 
- * VM would then have to handle this redirection when we fault one - * of these pages in but it doesn't do that at this point, so no - * Obj-C area for ppc64 for now :-( - */ -#define _COMM_PAGE64_OBJC_SIZE 0ULL -#define _COMM_PAGE64_OBJC_BASE 0ULL -#endif - -/* data in the comm page */ - -#define _COMM_PAGE_SIGNATURE (_COMM_PAGE_BASE_ADDRESS+0x000) // first few bytes are a signature -#define _COMM_PAGE_VERSION (_COMM_PAGE_BASE_ADDRESS+0x01E) // 16-bit version# -#define _COMM_PAGE_THIS_VERSION 2 // this is version 2 of the commarea format - -#define _COMM_PAGE_CPU_CAPABILITIES (_COMM_PAGE_BASE_ADDRESS+0x020) // mirror of extern int _cpu_capabilities -#define _COMM_PAGE_NCPUS (_COMM_PAGE_BASE_ADDRESS+0x021) // number of configured CPUs -#define _COMM_PAGE_ALTIVEC (_COMM_PAGE_BASE_ADDRESS+0x024) // nonzero if Altivec available -#define _COMM_PAGE_64_BIT (_COMM_PAGE_BASE_ADDRESS+0x025) // nonzero if 64-bit processor -#define _COMM_PAGE_CACHE_LINESIZE (_COMM_PAGE_BASE_ADDRESS+0x026) // cache line size (16-bit field) - -#define _COMM_PAGE_UNUSED1 (_COMM_PAGE_BASE_ADDRESS+0x028) // 24 unused bytes - -#define _COMM_PAGE_2_TO_52 (_COMM_PAGE_BASE_ADDRESS+0x040) // double float constant 2**52 -#define _COMM_PAGE_10_TO_6 (_COMM_PAGE_BASE_ADDRESS+0x048) // double float constant 10**6 -#define _COMM_PAGE_MAGIC_FE (_COMM_PAGE_BASE_ADDRESS+0x050) // magic constant 0xFEFEFEFEFEFEFEFF (to find 0s) -#define _COMM_PAGE_MAGIC_80 (_COMM_PAGE_BASE_ADDRESS+0x058) // magic constant 0x8080808080808080 (to find 0s) - -#define _COMM_PAGE_TIMEBASE (_COMM_PAGE_BASE_ADDRESS+0x060) // used by gettimeofday() -#define _COMM_PAGE_TIMESTAMP (_COMM_PAGE_BASE_ADDRESS+0x068) // used by gettimeofday() -#define _COMM_PAGE_SEC_PER_TICK (_COMM_PAGE_BASE_ADDRESS+0x070) // used by gettimeofday() - - /* jump table (bla to this address, which may be a branch to the actual code somewhere else) */ - /* When new jump table entries are added, corresponding symbols should be added below */ - -#define 
_COMM_PAGE_COMPARE_AND_SWAP32 (_COMM_PAGE_BASE_ADDRESS+0x080) // compare-and-swap word, no barrier -#define _COMM_PAGE_COMPARE_AND_SWAP64 (_COMM_PAGE_BASE_ADDRESS+0x0c0) // compare-and-swap doubleword, no barrier -#define _COMM_PAGE_ENQUEUE (_COMM_PAGE_BASE_ADDRESS+0x100) // enqueue -#define _COMM_PAGE_DEQUEUE (_COMM_PAGE_BASE_ADDRESS+0x140) // dequeue -#define _COMM_PAGE_MEMORY_BARRIER (_COMM_PAGE_BASE_ADDRESS+0x180) // memory barrier -#define _COMM_PAGE_ATOMIC_ADD32 (_COMM_PAGE_BASE_ADDRESS+0x1a0) // add atomic word -#define _COMM_PAGE_ATOMIC_ADD64 (_COMM_PAGE_BASE_ADDRESS+0x1c0) // add atomic doubleword - -#define _COMM_PAGE_UNUSED3 (_COMM_PAGE_BASE_ADDRESS+0x1e0) // 32 unused bytes - -#define _COMM_PAGE_ABSOLUTE_TIME (_COMM_PAGE_BASE_ADDRESS+0x200) // mach_absolute_time() -#define _COMM_PAGE_SPINLOCK_TRY (_COMM_PAGE_BASE_ADDRESS+0x220) // spinlock_try() -#define _COMM_PAGE_SPINLOCK_LOCK (_COMM_PAGE_BASE_ADDRESS+0x260) // spinlock_lock() -#define _COMM_PAGE_SPINLOCK_UNLOCK (_COMM_PAGE_BASE_ADDRESS+0x2a0) // spinlock_unlock() -#define _COMM_PAGE_PTHREAD_GETSPECIFIC (_COMM_PAGE_BASE_ADDRESS+0x2c0) // pthread_getspecific() -#define _COMM_PAGE_GETTIMEOFDAY (_COMM_PAGE_BASE_ADDRESS+0x2e0) // used by gettimeofday() -#define _COMM_PAGE_FLUSH_DCACHE (_COMM_PAGE_BASE_ADDRESS+0x4e0) // sys_dcache_flush() -#define _COMM_PAGE_FLUSH_ICACHE (_COMM_PAGE_BASE_ADDRESS+0x520) // sys_icache_invalidate() -#define _COMM_PAGE_PTHREAD_SELF (_COMM_PAGE_BASE_ADDRESS+0x580) // pthread_self() - -#define _COMM_PAGE_UNUSED4 (_COMM_PAGE_BASE_ADDRESS+0x5a0) // 32 unused bytes - -#define _COMM_PAGE_RELINQUISH (_COMM_PAGE_BASE_ADDRESS+0x5c0) // used by spinlocks - -#define _COMM_PAGE_UNUSED5 (_COMM_PAGE_BASE_ADDRESS+0x5e0) // 32 unused bytes - -#define _COMM_PAGE_BZERO (_COMM_PAGE_BASE_ADDRESS+0x600) // bzero() -#define _COMM_PAGE_BCOPY (_COMM_PAGE_BASE_ADDRESS+0x780) // bcopy() -#define _COMM_PAGE_MEMCPY (_COMM_PAGE_BASE_ADDRESS+0x7a0) // memcpy() -#define _COMM_PAGE_MEMMOVE 
(_COMM_PAGE_BASE_ADDRESS+0x7a0) // memmove() - -#define _COMM_PAGE_COMPARE_AND_SWAP32B (_COMM_PAGE_BASE_ADDRESS+0xf80) // compare-and-swap word w barrier -#define _COMM_PAGE_COMPARE_AND_SWAP64B (_COMM_PAGE_BASE_ADDRESS+0xfc0) // compare-and-swap doubleword w barrier - -#define _COMM_PAGE_MEMSET_PATTERN (_COMM_PAGE_BASE_ADDRESS+0x1000)// used by nonzero memset() -#define _COMM_PAGE_BIGCOPY (_COMM_PAGE_BASE_ADDRESS+0x1140)// very-long-operand copies - -#define _COMM_PAGE_END (_COMM_PAGE_BASE_ADDRESS+0x1700)// end of commpage area - -#ifdef __ASSEMBLER__ -#ifdef __COMM_PAGE_SYMBOLS - -#define CREATE_COMM_PAGE_SYMBOL(symbol_name, symbol_address) \ - .org (symbol_address - _COMM_PAGE_BASE_ADDRESS) @\ -symbol_name: nop - - .text // Required to make a well behaved symbol file - - CREATE_COMM_PAGE_SYMBOL(___compare_and_swap32, _COMM_PAGE_COMPARE_AND_SWAP32) - CREATE_COMM_PAGE_SYMBOL(___compare_and_swap64, _COMM_PAGE_COMPARE_AND_SWAP64) - CREATE_COMM_PAGE_SYMBOL(___atomic_enqueue, _COMM_PAGE_ENQUEUE) - CREATE_COMM_PAGE_SYMBOL(___atomic_dequeue, _COMM_PAGE_DEQUEUE) - CREATE_COMM_PAGE_SYMBOL(___memory_barrier, _COMM_PAGE_MEMORY_BARRIER) - CREATE_COMM_PAGE_SYMBOL(___atomic_add32, _COMM_PAGE_ATOMIC_ADD32) - CREATE_COMM_PAGE_SYMBOL(___atomic_add64, _COMM_PAGE_ATOMIC_ADD64) - CREATE_COMM_PAGE_SYMBOL(___mach_absolute_time, _COMM_PAGE_ABSOLUTE_TIME) - CREATE_COMM_PAGE_SYMBOL(___spin_lock_try, _COMM_PAGE_SPINLOCK_TRY) - CREATE_COMM_PAGE_SYMBOL(___spin_lock, _COMM_PAGE_SPINLOCK_LOCK) - CREATE_COMM_PAGE_SYMBOL(___spin_unlock, _COMM_PAGE_SPINLOCK_UNLOCK) - CREATE_COMM_PAGE_SYMBOL(___pthread_getspecific, _COMM_PAGE_PTHREAD_GETSPECIFIC) - CREATE_COMM_PAGE_SYMBOL(___gettimeofday, _COMM_PAGE_GETTIMEOFDAY) - CREATE_COMM_PAGE_SYMBOL(___sys_dcache_flush, _COMM_PAGE_FLUSH_DCACHE) - CREATE_COMM_PAGE_SYMBOL(___sys_icache_invalidate, _COMM_PAGE_FLUSH_ICACHE) - CREATE_COMM_PAGE_SYMBOL(___pthread_self, _COMM_PAGE_PTHREAD_SELF) - CREATE_COMM_PAGE_SYMBOL(___spin_lock_relinquish, 
_COMM_PAGE_RELINQUISH) - CREATE_COMM_PAGE_SYMBOL(___bzero, _COMM_PAGE_BZERO) - CREATE_COMM_PAGE_SYMBOL(___bcopy, _COMM_PAGE_BCOPY) - CREATE_COMM_PAGE_SYMBOL(___memcpy, _COMM_PAGE_MEMCPY) -// CREATE_COMM_PAGE_SYMBOL(___memmove, _COMM_PAGE_MEMMOVE) - CREATE_COMM_PAGE_SYMBOL(___compare_and_swap32b, _COMM_PAGE_COMPARE_AND_SWAP32B) - CREATE_COMM_PAGE_SYMBOL(___compare_and_swap64b, _COMM_PAGE_COMPARE_AND_SWAP64B) - CREATE_COMM_PAGE_SYMBOL(___memset_pattern, _COMM_PAGE_MEMSET_PATTERN) - CREATE_COMM_PAGE_SYMBOL(___bigcopy, _COMM_PAGE_BIGCOPY) - - CREATE_COMM_PAGE_SYMBOL(___end_comm_page, _COMM_PAGE_END) - - .data // Required to make a well behaved symbol file - .long 0 // Required to make a well behaved symbol file - -#endif /* __COMM_PAGE_SYMBOLS */ -#endif /* __ASSEMBLER__ */ - -#endif /* _PPC_CPU_CAPABILITIES_H */ -#endif /* PRIVATE */ diff --git a/osfmk/ppc/cpu_data.h b/osfmk/ppc/cpu_data.h deleted file mode 100644 index 3a5fa9190..000000000 --- a/osfmk/ppc/cpu_data.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - * - */ - -#ifndef PPC_CPU_DATA -#define PPC_CPU_DATA - -#ifdef MACH_KERNEL_PRIVATE - -#include -#include - -extern thread_t current_thread(void); -extern __inline__ thread_t current_thread(void) -{ - thread_t result; - - __asm__ volatile("mfsprg %0,1" : "=r" (result)); - - return (result); -} - -#define getPerProc() current_thread()->machine.PerProc - -extern int get_preemption_level(void); -extern void _enable_preemption_no_check(void); - -#define enable_preemption_no_check() _enable_preemption_no_check() -#define mp_disable_preemption() _disable_preemption() -#define mp_enable_preemption() _enable_preemption() -#define mp_enable_preemption_no_check() _enable_preemption_no_check() - -#endif /* MACH_KERNEL_PRIVATE */ - -#endif /* PPC_CPU_DATA */ diff --git a/osfmk/ppc/cpu_internal.h b/osfmk/ppc/cpu_internal.h deleted file mode 100644 index 0c876a1e9..000000000 --- a/osfmk/ppc/cpu_internal.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -#ifndef _PPC_CPU_INTERNAL_H_ -#define _PPC_CPU_INTERNAL_H_ - -#include -#include - -extern void _start_cpu( - void); - -extern void cpu_bootstrap( - void); - -extern void cpu_init( - void); - -extern kern_return_t cpu_signal( - int target, - int signal, - unsigned int p1, - unsigned int p2); - -#define SIGPast 0 /* Requests an ast on target processor */ -#define SIGPcpureq 1 /* Requests CPU specific function */ -#define SIGPdebug 2 /* Requests a debugger entry */ -#define SIGPwake 3 /* Wake up a sleeping processor */ -#define SIGPcall 4 /* Call a function on a processor */ - -#define CPRQtimebase 1 /* Get timebase of processor */ -#define CPRQsegload 2 /* Segment registers reload */ -#define CPRQscom 3 /* SCOM */ -#define CPRQchud 4 /* CHUD perfmon */ -#define CPRQsps 5 /* Set Processor Speed */ - - -extern struct per_proc_info * cpu_per_proc_alloc( - void); - -extern void cpu_per_proc_free( - struct per_proc_info *per_proc); - -extern void * console_per_proc_alloc( - 
boolean_t boot_processor); - -extern void console_per_proc_free( - void *per_proc_cbfr); - -extern void * chudxnu_per_proc_alloc( - boolean_t boot_processor); - -extern void chudxnu_per_proc_free( - void *per_proc_chud); - -extern kern_return_t cpu_per_proc_register( - struct per_proc_info *proc_info); - -extern unsigned int real_ncpus; -extern unsigned int max_ncpus; - -#endif /* _PPC_CPU_INTERNAL_H_ */ diff --git a/osfmk/ppc/cswtch.s b/osfmk/ppc/cswtch.s deleted file mode 100644 index 54a17af4e..000000000 --- a/osfmk/ppc/cswtch.s +++ /dev/null @@ -1,2486 +0,0 @@ -/* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -#include -#include -#include -#include -#include -#include -#include - -#define FPVECDBG 0 - - .text - -/* - * void machine_load_context(thread_t thread) - * - * Load the context for the first thread to run on a - * cpu, and go. - */ - - .align 5 - .globl EXT(machine_load_context) - -LEXT(machine_load_context) - mfsprg r6,1 ; Get the current activation - lwz r6,ACT_PER_PROC(r6) ; Get the per_proc block - lwz r0,PP_INTSTACK_TOP_SS(r6) - stw r0,PP_ISTACKPTR(r6) - mr r9,r3 /* Set up the current thread */ - mtsprg 1,r9 - li r0,0 /* Clear a register */ - lwz r3,ACT_MACT_PCB(r9) /* Get the savearea used */ - mfmsr r5 /* Since we are passing control, get our MSR values */ - lwz r11,SAVprev+4(r3) /* Get the previous savearea */ - lwz r1,saver1+4(r3) /* Load new stack pointer */ - lwz r10,ACT_MACT_SPF(r9) /* Get the special flags */ - stw r0,saver3+4(r3) /* Make sure we pass in a 0 for the continuation */ - stw r0,FM_BACKPTR(r1) /* zero backptr */ - stw r5,savesrr1+4(r3) /* Pass our MSR to the new guy */ - stw r11,ACT_MACT_PCB(r9) /* Unstack our savearea */ - oris r10,r10,hi16(OnProc) /* Set OnProc bit */ - stw r0,ACT_PREEMPT_CNT(r9) /* Enable preemption */ - stw r10,ACT_MACT_SPF(r9) /* Update the special flags */ - stw r10,spcFlags(r6) /* Set per_proc copy of the special flags */ - b EXT(exception_exit) /* Go for it */ - -/* thread_t Switch_context(thread_t old, - * void (*cont)(void), - * thread_t new) - * - * Switch from one thread to another. If a continuation is supplied, then - * we do not need to save callee save registers. 
- * - */ - -/* void Call_continuation( void (*continuation)(void), void *param, wait_result_t wresult, vm_offset_t stack_ptr) - */ - - .align 5 - .globl EXT(Call_continuation) - -LEXT(Call_continuation) - mtlr r3 /* continuation */ - mr r3,r4 /* parameter */ - mr r4,r5 /* wait result */ - mr r1,r6 /* Load new stack pointer */ - blrl /* Jump to the continuation */ - mfsprg r3,1 - b EXT(thread_terminate) - -/* - * Get the old kernel stack, and store into the thread structure. - * See if a continuation is supplied, and skip state save if so. - * - * Note that interrupts must be disabled before we get here (i.e., splsched) - */ - -/* - * Switch_context(old, continuation, new) - * - * Context switches are double jumps. We pass the following to the - * context switch firmware call: - * - * R3 = switchee's savearea, virtual if continuation, low order physical for full switch - * R4 = old thread - * R5 = new SRR0 - * R6 = new SRR1 - * R7 = high order physical address of savearea for full switch - * - * savesrr0 is set to go to switch_in - * savesrr1 is set to uninterruptible with translation on - */ - - - .align 5 - .globl EXT(Switch_context) - -LEXT(Switch_context) - - lwz r12,ACT_PER_PROC(r3) ; Get the per_proc block -#if DEBUG - lwz r0,PP_ISTACKPTR(r12) ; (DEBUG/TRACE) make sure we are not - mr. 
r0,r0 ; (DEBUG/TRACE) on the interrupt - bne++ notonintstack ; (DEBUG/TRACE) stack - BREAKPOINT_TRAP -notonintstack: -#endif - lwz r8,ACT_MACT_PCB(r5) ; Get the PCB for the new guy - lwz r9,umwSpace(r5) ; Get user memory window address space - cmpwi cr1,r4,0 ; Remeber if there is a continuation - used waaaay down below - lwz r0,CTHREAD_SELF+0(r5) ; Pick up the user assist "word" (actually a double) - lwz r7,CTHREAD_SELF+4(r5) ; both halves - lwz r11,ACT_MACT_BTE(r5) ; Get BlueBox Task Environment - lwz r6,umwRelo(r5) ; Get user memory window relocation top - stw r12,ACT_PER_PROC(r5) ; Set per_proc in new activation - mtsprg 1,r5 - lwz r2,umwRelo+4(r5) ; Get user memory window relocation bottom - - stw r0,UAW+0(r12) ; Save the assist word for the "ultra fast path" - stw r7,UAW+4(r12) - - lwz r7,ACT_MACT_SPF(r5) ; Get the special flags - - sth r9,ppUMWmp+mpSpace(r12) ; Save the space - stw r6,ppUMWmp+mpNestReloc(r12) ; Save top part of physical address - stw r2,ppUMWmp+mpNestReloc+4(r12) ; Save bottom part of physical address - stw r11,ppbbTaskEnv(r12) ; Save the bb task env - lwz r2,traceMask(0) ; Get the enabled traces - stw r7,spcFlags(r12) ; Set per_proc copy of the special flags - lis r0,hi16(CutTrace) ; Trace FW call - mr. r2,r2 ; Any tracing going on? - lwz r11,SAVprev+4(r8) ; Get the previous of the switchee savearea - ori r0,r0,lo16(CutTrace) ; Trace FW call - beq++ cswNoTrc ; No trace today, dude... 
- - li r2,0x4400 ; Trace ID - mr r6,r11 ; Trace prev savearea - sc ; Cut trace entry of context switch - -cswNoTrc: lwz r2,curctx(r5) ; Grab our current context pointer - lwz r10,FPUowner(r12) ; Grab the owner of the FPU - lwz r9,VMXowner(r12) ; Grab the owner of the vector - mfmsr r6 ; Get the MSR because the switched to thread should inherit it - stw r11,ACT_MACT_PCB(r5) ; Dequeue the savearea we are switching to - li r0,1 ; Get set to hold off quickfret - - rlwinm r6,r6,0,MSR_FP_BIT+1,MSR_FP_BIT-1 ; Turn off the FP - cmplw r10,r2 ; Do we have the live float context? - lwz r10,FPUlevel(r2) ; Get the live level - mr r4,r3 ; Save our old thread to pass back - cmplw cr5,r9,r2 ; Do we have the live vector context? - rlwinm r6,r6,0,MSR_VEC_BIT+1,MSR_VEC_BIT-1 ; Turn off the vector - stw r0,holdQFret(r12) ; Make sure we hold off releasing quickfret - bne++ cswnofloat ; Float is not ours... - - cmplw r10,r11 ; Is the level the same? - lhz r0,PP_CPU_NUMBER(r12) ; Get our CPU number - lwz r5,FPUcpu(r2) ; Get the owning cpu - bne++ cswnofloat ; Level not the same, this is not live... - - cmplw r5,r0 ; Still owned by this cpu? - lwz r10,FPUsave(r2) ; Get the pointer to next saved context - bne++ cswnofloat ; CPU claimed by someone else... - - mr. r10,r10 ; Is there a savearea here? - ori r6,r6,lo16(MASK(MSR_FP)) ; Enable floating point - - beq-- cswnofloat ; No savearea to check... - - lwz r3,SAVlevel(r10) ; Get the level - lwz r5,SAVprev+4(r10) ; Get the previous of this savearea - cmplw r3,r11 ; Is it for the current level? - - bne++ cswnofloat ; Nope... 
- - stw r5,FPUsave(r2) ; Pop off this savearea - - rlwinm r3,r10,0,0,19 ; Move back to start of page - - lwz r5,quickfret(r12) ; Get the first in quickfret list (top) - lwz r9,quickfret+4(r12) ; Get the first in quickfret list (bottom) - lwz r7,SACvrswap(r3) ; Get the virtual to real conversion (top) - lwz r3,SACvrswap+4(r3) ; Get the virtual to real conversion (bottom) - stw r5,SAVprev(r10) ; Link the old in (top) - stw r9,SAVprev+4(r10) ; Link the old in (bottom) - xor r3,r10,r3 ; Convert to physical - stw r7,quickfret(r12) ; Set the first in quickfret list (top) - stw r3,quickfret+4(r12) ; Set the first in quickfret list (bottom) - -#if FPVECDBG - lis r0,hi16(CutTrace) ; (TEST/DEBUG) - mr r7,r2 ; (TEST/DEBUG) - li r2,0x4401 ; (TEST/DEBUG) - oris r0,r0,lo16(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) - lhz r0,PP_CPU_NUMBER(r12) ; (TEST/DEBUG) - mr r2,r7 ; (TEST/DEBUG) -#endif - -cswnofloat: bne++ cr5,cswnovect ; Vector is not ours... - - lwz r10,VMXlevel(r2) ; Get the live level - - cmplw r10,r11 ; Is the level the same? - lhz r0,PP_CPU_NUMBER(r12) ; Get our CPU number - lwz r5,VMXcpu(r2) ; Get the owning cpu - bne++ cswnovect ; Level not the same, this is not live... - - cmplw r5,r0 ; Still owned by this cpu? - lwz r10,VMXsave(r2) ; Get the level - bne++ cswnovect ; CPU claimed by someone else... - - mr. r10,r10 ; Is there a savearea here? - oris r6,r6,hi16(MASK(MSR_VEC)) ; Enable vector - - beq-- cswnovect ; No savearea to check... - - lwz r3,SAVlevel(r10) ; Get the level - lwz r5,SAVprev+4(r10) ; Get the previous of this savearea - cmplw r3,r11 ; Is it for the current level? - - bne++ cswnovect ; Nope... 
- - stw r5,VMXsave(r2) ; Pop off this savearea - rlwinm r3,r10,0,0,19 ; Move back to start of page - - lwz r5,quickfret(r12) ; Get the first in quickfret list (top) - lwz r9,quickfret+4(r12) ; Get the first in quickfret list (bottom) - lwz r2,SACvrswap(r3) ; Get the virtual to real conversion (top) - lwz r3,SACvrswap+4(r3) ; Get the virtual to real conversion (bottom) - stw r5,SAVprev(r10) ; Link the old in (top) - stw r9,SAVprev+4(r10) ; Link the old in (bottom) - xor r3,r10,r3 ; Convert to physical - stw r2,quickfret(r12) ; Set the first in quickfret list (top) - stw r3,quickfret+4(r12) ; Set the first in quickfret list (bottom) - -#if FPVECDBG - lis r0,hi16(CutTrace) ; (TEST/DEBUG) - li r2,0x4501 ; (TEST/DEBUG) - oris r0,r0,lo16(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) -#endif - -cswnovect: li r0,0 ; Get set to release quickfret holdoff - rlwinm r11,r8,0,0,19 ; Switch to savearea base - lis r9,hi16(EXT(switch_in)) ; Get top of switch in routine - lwz r5,savesrr0+4(r8) ; Set up the new SRR0 -; -; Note that the low-level code requires the R7 contain the high order half of the savearea's -; physical address. This is hack city, but it is the way it is. 
-; - lwz r7,SACvrswap(r11) ; Get the high order V to R translation - lwz r11,SACvrswap+4(r11) ; Get the low order V to R translation - ori r9,r9,lo16(EXT(switch_in)) ; Bottom half of switch in - stw r0,holdQFret(r12) ; Make sure we release quickfret holdoff - stw r9,savesrr0+4(r8) ; Make us jump to the switch in routine - - lwz r9,SAVflags(r8) /* Get the flags */ - lis r0,hi16(SwitchContextCall) /* Top part of switch context */ - li r10,(MASK(MSR_ME)|MASK(MSR_DR)) /* Get the switcher's MSR */ - ori r0,r0,lo16(SwitchContextCall) /* Bottom part of switch context */ - stw r10,savesrr1+4(r8) /* Set up for switch in */ - rlwinm r9,r9,0,15,13 /* Reset the syscall flag */ - xor r3,r11,r8 /* Get the physical address of the new context save area */ - stw r9,SAVflags(r8) /* Set the flags */ - - bne cr1,swtchtocont ; Switch to the continuation - sc /* Switch to the new context */ - -/* We come back here in the new thread context - * R4 was set to hold the old thread pointer, but switch_in will put it into - * R3 where it belongs. - */ - blr /* Jump into the new thread */ - -; -; This is where we go when a continuation is set. We are actually -; killing off the old context of the new guy so we need to pop off -; any float or vector states for the ditched level. -; -; Note that we do the same kind of thing a chkfac in hw_exceptions.s -; - - -swtchtocont: - - stw r5,savesrr0+4(r8) ; Set the pc - stw r6,savesrr1+4(r8) ; Set the next MSR to use - stw r4,saver3+4(r8) ; Make sure we pass back the old thread - mr r3,r8 ; Pass in the virtual address of savearea - - b EXT(exception_exit) ; Blocking on continuation, toss old context... - - - -/* - * All switched to threads come here first to clean up the old thread. - * We need to do the following contortions because we need to keep - * the LR clean. And because we need to manipulate the savearea chain - * with translation on. If we could, this should be done in lowmem_vectors - * before translation is turned on. But we can't, dang it! 
- * - * switch_in() runs with DR on and IR off - * - * R3 = switcher's savearea (32-bit virtual) - * saver4 = old thread in switcher's save - * saver5 = new SRR0 in switcher's save - * saver6 = new SRR1 in switcher's save - - - */ - - - .align 5 - .globl EXT(switch_in) - -LEXT(switch_in) - - lwz r4,saver4+4(r3) ; Get the old thread - lwz r5,saver5+4(r3) ; Get the srr0 value - - mfsprg r0,2 ; Get feature flags - mr r9,r4 ; Get the switched from ACT - lwz r6,saver6+4(r3) ; Get the srr1 value - rlwinm. r0,r0,0,pf64Bitb,pf64Bitb ; Check for 64-bit - lwz r10,ACT_MACT_PCB(r9) ; Get the top PCB on the old thread - - stw r3,ACT_MACT_PCB(r9) ; Put the new one on top - stw r10,SAVprev+4(r3) ; Chain on the old one - - mr r3,r4 ; Pass back the old thread - - mtsrr0 r5 ; Set return point - mtsrr1 r6 ; Set return MSR - - bne++ siSixtyFour ; Go do 64-bit... - - rfi ; Jam... - -siSixtyFour: - rfid ; Jam... - -/* - * void fpu_save(facility_context ctx) - * - * Note that there are some oddities here when we save a context we are using. - * It is really not too cool to do this, but what the hey... Anyway, - * we turn fpus and vecs off before we leave. The oddity is that if you use fpus after this, the - * savearea containing the context just saved will go away. So, bottom line is: - * do not use fpus until after you are done with the saved context.
- */ - .align 5 - .globl EXT(fpu_save) - -LEXT(fpu_save) - - lis r2,hi16(MASK(MSR_VEC)) ; Get the vector enable - li r12,lo16(MASK(MSR_EE)) ; Get the EE bit - ori r2,r2,lo16(MASK(MSR_FP)) ; Get FP - - mfmsr r0 ; Get the MSR - andc r0,r0,r2 ; Clear FP, VEC - andc r2,r0,r12 ; Clear EE - ori r2,r2,MASK(MSR_FP) ; Enable the floating point feature for now also - mtmsr r2 ; Set the MSR - isync - - mfsprg r6,1 ; Get the current activation - lwz r6,ACT_PER_PROC(r6) ; Get the per_proc block - lwz r12,FPUowner(r6) ; Get the context ID for owner - -#if FPVECDBG - mr r7,r0 ; (TEST/DEBUG) - li r4,0 ; (TEST/DEBUG) - mr r10,r3 ; (TEST/DEBUG) - lis r0,hi16(CutTrace) ; (TEST/DEBUG) - mr. r3,r12 ; (TEST/DEBUG) - li r2,0x6F00 ; (TEST/DEBUG) - li r5,0 ; (TEST/DEBUG) - beq-- noowneryet ; (TEST/DEBUG) - lwz r4,FPUlevel(r12) ; (TEST/DEBUG) - lwz r5,FPUsave(r12) ; (TEST/DEBUG) - -noowneryet: oris r0,r0,lo16(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) - mr r0,r7 ; (TEST/DEBUG) - mr r3,r10 ; (TEST/DEBUG) -#endif - mflr r2 ; Save the return address - - cmplw r3,r12 ; Is the specified context live? - lhz r11,PP_CPU_NUMBER(r6) ; Get our CPU number - lwz r9,FPUcpu(r3) ; Get the cpu that context was last on - bne-- fsret ; Nobody owns the FPU, no save required... - - cmplw r9,r11 ; Was the context for this processor? - la r5,FPUsync(r3) ; Point to the sync word - bne-- fsret ; Facility not last used on this processor... - -; -; It looks like we need to save this one. -; -; First, make sure that the live context block is not mucked with while -; we are trying to save it on out. Then we will give it the final check. -; - - lis r9,ha16(EXT(LockTimeOut)) ; Get the high part - mftb r8 ; Get the time now - lwz r9,lo16(EXT(LockTimeOut))(r9) ; Get the timeout value - b fssync0a ; Jump to the lock... - - .align 5 - -fssync0: li r7,lgKillResv ; Get killing field - stwcx. r7,0,r7 ; Kill reservation - -fssync0a: lwz r7,0(r5) ; Sniff the lock - mftb r10 ; Is it time yet? - cmplwi cr1,r7,0 ; Is it locked? 
- sub r10,r10,r8 ; How long have we been spinning? - cmplw r10,r9 ; Has it been too long? - bgt-- fstimeout ; Way too long, panic... - bne-- cr1,fssync0a ; Yea, still locked so sniff harder... - -fssync1: lwarx r7,0,r5 ; Get the sync word - li r12,1 ; Get the lock - mr. r7,r7 ; Is it unlocked? - bne-- fssync0 - stwcx. r12,0,r5 ; Store lock and test reservation - bne-- fssync1 ; Try again if lost reservation... - - isync ; Toss speculation - - lwz r12,FPUowner(r6) ; Get the context ID for owner - cmplw r3,r12 ; Check again if we own the FPU? - bne-- fsretlk ; Go unlock and return since we no longer own context - - lwz r5,FPUcpu(r12) ; Get the cpu that context was last on - lwz r7,FPUsave(r12) ; Get the current FPU savearea for the thread - cmplw r5,r11 ; Is this for the same processor? - lwz r9,FPUlevel(r12) ; Get our current level indicator - bne-- fsretlk ; Not the same processor, skip any save... - - cmplwi r7,0 ; Have we ever saved this facility context? - beq-- fsneedone ; Never saved it, so go do it... - - lwz r8,SAVlevel(r7) ; Get the level of this savearea - cmplw r9,r8 ; Correct level? - beq-- fsretlk ; The current level is already saved, bail out... 
- -fsneedone: bl EXT(save_get) ; Get a savearea for the context - - mfsprg r6,1 ; Get the current activation - lwz r6,ACT_PER_PROC(r6) ; Get the per_proc block - li r4,SAVfloat ; Get floating point tag - lwz r12,FPUowner(r6) ; Get back our thread - stb r4,SAVflags+2(r3) ; Mark this savearea as a float - lwz r4,facAct(r12) ; Get the activation associated with live context - lwz r8,FPUsave(r12) ; Get the current top floating point savearea - stw r4,SAVact(r3) ; Indicate the right activation for this context - lwz r9,FPUlevel(r12) ; Get our current level indicator again - stw r3,FPUsave(r12) ; Set this as the most current floating point context - stw r8,SAVprev+4(r3) ; And then chain this in front - - stw r9,SAVlevel(r3) ; Show level in savearea - - bl fp_store ; save all 32 FPRs in the save area at r3 - mtlr r2 ; Restore return - -fsretlk: li r7,0 ; Get the unlock value - eieio ; Make sure that these updates make it out - stw r7,FPUsync(r12) ; Unlock it - -fsret: mtmsr r0 ; Put interrupts on if they were and floating point off - isync - - blr - -fstimeout: mr r4,r5 ; Set the lock address - mr r5,r7 ; Set the lock word data - lis r3,hi16(fstimeout_str) ; Get the failed lck message - ori r3,r3,lo16(fstimeout_str) ; Get the failed lck message - bl EXT(panic) - BREAKPOINT_TRAP ; We die here anyway - - .data -fstimeout_str: - STRINGD "fpu_save: timeout on sync lock (0x%08X), value = 0x%08X\n\000" - .text - - -/* - * fpu_switch() - * - * Entered to handle the floating-point unavailable exception and - * switch fpu context - * - * This code is run in virtual address mode on with interrupts off. - * - * Upon exit, the code returns to the users context with the floating - * point facility turned on. - * - * ENTRY: VM switched ON - * Interrupts OFF - * State is saved in savearea pointed to by R4. - * All other registers are free. 
- * - */ - - .align 5 - .globl EXT(fpu_switch) - -LEXT(fpu_switch) - -#if DEBUG - lis r3,hi16(EXT(fpu_trap_count)) ; Get address of FP trap counter - ori r3,r3,lo16(EXT(fpu_trap_count)) ; Get address of FP trap counter - lwz r1,0(r3) - addi r1,r1,1 - stw r1,0(r3) -#endif /* DEBUG */ - - mfsprg r17,1 ; Get the current activation - lwz r26,ACT_PER_PROC(r17) ; Get the per_proc block - mfmsr r19 ; Get the current MSR - - mr r25,r4 ; Save the entry savearea - lwz r22,FPUowner(r26) ; Get the thread that owns the FPU - ori r19,r19,lo16(MASK(MSR_FP)) ; Enable the floating point feature - - mtmsr r19 ; Enable floating point instructions - isync - - lwz r27,ACT_MACT_PCB(r17) ; Get the current level - lwz r29,curctx(r17) ; Grab the current context anchor of the current thread - -; R22 has the "old" context anchor -; R29 has the "new" context anchor - -#if FPVECDBG - lis r0,hi16(CutTrace) ; (TEST/DEBUG) - li r2,0x7F01 ; (TEST/DEBUG) - mr r3,r22 ; (TEST/DEBUG) - mr r5,r29 ; (TEST/DEBUG) - oris r0,r0,lo16(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) -#endif - - lhz r16,PP_CPU_NUMBER(r26) ; Get the current CPU number - - mr. r22,r22 ; See if there is any live FP status - la r15,FPUsync(r22) ; Point to the sync word - - beq-- fsnosave ; No live context, so nothing to save... - - lwz r18,FPUcpu(r22) ; Get the last CPU we ran on - cmplw cr2,r22,r29 ; Are both old and new the same context? - lwz r30,FPUsave(r22) ; Get the top savearea - cmplw r18,r16 ; Make sure we are on the right processor - lwz r31,FPUlevel(r22) ; Get the context level - cmplwi cr1,r30,0 ; Anything saved yet? - - bne-- fsnosave ; No, not on the same processor... - -; -; Check to see if the live context has already been saved. -; Also check to see if all we are here just to re-enable the MSR -; and handle specially if so. 
-; - - cmplw r31,r27 ; See if the current and active levels are the same - crand cr0_eq,cr2_eq,cr0_eq ; Remember if both the levels and contexts are the same - - beq-- fsthesame ; New and old are the same, just go enable... - - -; -; Note it turns out that on a G5, the following load has about a 50-50 chance of -; taking a segment exception in a system that is doing heavy file I/O. We -; make a dummy access right now in order to get that resolved before we take the lock. -; We do not use the data returned because it may change over the lock -; - - beq-- cr1,fswsync ; Nothing saved, skip the probe attempt... - lwz r11,SAVlevel(r30) ; Touch the context in order to fault in the segment - -; -; Make sure that the live context block is not mucked with while -; we are trying to save it on out -; - -fswsync: lis r11,ha16(EXT(LockTimeOut)) ; Get the high part - mftb r3 ; Get the time now - lwz r11,lo16(EXT(LockTimeOut))(r11) ; Get the timeout value - b fswsync0a ; Jump to the lock... - - .align 5 - -fswsync0: li r19,lgKillResv ; Get killing field - stwcx. r19,0,r19 ; Kill reservation - -fswsync0a: lwz r19,0(r15) ; Sniff the lock - mftb r18 ; Is it time yet? - cmplwi cr1,r19,0 ; Is it locked? - sub r18,r18,r3 ; How long have we been spinning? - cmplw r18,r11 ; Has it been too long? - bgt-- fswtimeout ; Way too long, panic... - bne-- cr1,fswsync0a ; Yea, still locked so sniff harder... - -fswsync1: lwarx r19,0,r15 ; Get the sync word - li r0,1 ; Get the lock - mr. r19,r19 ; Is it unlocked? - bne-- fswsync0 - stwcx. r0,0,r15 ; Store lock and test reservation - bne-- fswsync1 ; Try again if lost reservation... - - isync ; Toss speculation - -; -; Note that now that we have the lock, we need to check if anything changed. -; Also note that the possible changes are limited. The context owner can -; never change to a different thread or level although it can be invalidated. -; A new context can not be pushed on top of us, but it can be popped. 
The -; cpu indicator will always change if another processor mucked with any -; contexts. -; -; It should be very rare that any of the context stuff changes across the lock. -; - - lwz r0,FPUowner(r26) ; Get the thread that owns the FPU again - lwz r11,FPUsave(r22) ; Get the top savearea again - lwz r18,FPUcpu(r22) ; Get the last CPU we ran on again - sub r0,r0,r22 ; Non-zero if we lost ownership, 0 if not - xor r11,r11,r30 ; Non-zero if saved context changed, 0 if not - xor r18,r18,r16 ; Non-zero if cpu changed, 0 if not - cmplwi cr1,r30,0 ; Is anything saved? - or r0,r0,r11 ; Zero only if both owner and context are unchanged - or. r0,r0,r18 ; Zero only if nothing has changed - li r3,0 ; Clear this - - bne-- fsnosavelk ; Something has changed, so this is not ours to save... - beq-- cr1,fsmstsave ; There is no context saved yet... - - lwz r11,SAVlevel(r30) ; Get the level of top saved context - - cmplw r31,r11 ; Are live and saved the same? - -#if FPVECDBG - lis r0,hi16(CutTrace) ; (TEST/DEBUG) - li r2,0x7F02 ; (TEST/DEBUG) - mr r3,r11 ; (TEST/DEBUG) - mr r5,r31 ; (TEST/DEBUG) - oris r0,r0,lo16(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) - li r3,0 ; (TEST/DEBUG) -#endif - - beq++ fsnosavelk ; Same level, so already saved... 
- -fsmstsave: stw r3,FPUowner(r26) ; Kill the context now - eieio ; Make sure everyone sees it - bl EXT(save_get) ; Go get a savearea - - lwz r12,facAct(r22) ; Get the activation associated with the context - stw r30,SAVprev+4(r3) ; Point us to the old context - stw r31,SAVlevel(r3) ; Tag our level - li r7,SAVfloat ; Get the floating point ID - stw r12,SAVact(r3) ; Make sure we point to the right guy - stb r7,SAVflags+2(r3) ; Set that we have a floating point save area - stw r3,FPUsave(r22) ; Set this as the latest context savearea for the thread - -#if FPVECDBG - lis r0,hi16(CutTrace) ; (TEST/DEBUG) - li r2,0x7F03 ; (TEST/DEBUG) - oris r0,r0,lo16(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) -#endif - - bl fp_store ; store all 32 FPRs - -fsnosavelk: li r7,0 ; Get the unlock value - eieio ; Make sure that these updates make it out - stw r7,FPUsync(r22) ; Unlock it. - -; -; The context is all saved now and the facility is free. -; -; Check if we need to fill the registers with junk, because this level has -; never used them before and some thieving bastard could hack the old values -; of some thread! Just imagine what would happen if they could! Why, nothing -; would be safe! My God! It is terrifying! -; -; Make sure that the live context block is not mucked with while -; we are trying to load it up -; - -fsnosave: la r15,FPUsync(r29) ; Point to the sync word - lis r11,ha16(EXT(LockTimeOut)) ; Get the high part - mftb r3 ; Get the time now - lwz r11,lo16(EXT(LockTimeOut))(r11) ; Get the timeout value - b fsnsync0a ; Jump to the lock... - - .align 5 - -fsnsync0: li r19,lgKillResv ; Get killing field - stwcx. r19,0,r19 ; Kill reservation - -fsnsync0a: lwz r19,0(r15) ; Sniff the lock - mftb r18 ; Is it time yet? - cmplwi cr1,r19,0 ; Is it locked? - sub r18,r18,r3 ; How long have we been spinning? - cmplw r18,r11 ; Has it been too long? - bgt-- fsntimeout ; Way too long, panic... - bne-- cr1,fsnsync0a ; Yea, still locked so sniff harder... 
- -fsnsync1: lwarx r19,0,r15 ; Get the sync word - li r0,1 ; Get the lock - mr. r19,r19 ; Is it unlocked? - bne-- fsnsync0 ; Unfortunately, it is locked... - stwcx. r0,0,r15 ; Store lock and test reservation - bne-- fsnsync1 ; Try again if lost reservation... - - isync ; Toss speculation - - lwz r15,ACT_MACT_PCB(r17) ; Get the current level of the "new" one - lwz r19,FPUcpu(r29) ; Get the last CPU we ran on - lwz r14,FPUsave(r29) ; Point to the top of the "new" context stack - - stw r16,FPUcpu(r29) ; Claim context for us - eieio - -#if FPVECDBG - lwz r13,FPUlevel(r29) ; (TEST/DEBUG) - lis r0,hi16(CutTrace) ; (TEST/DEBUG) - li r2,0x7F04 ; (TEST/DEBUG) - mr r1,r15 ; (TEST/DEBUG) - mr r3,r14 ; (TEST/DEBUG) - mr r5,r13 ; (TEST/DEBUG) - oris r0,r0,lo16(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) -#endif - - lis r18,hi16(EXT(PerProcTable)) ; Set base PerProcTable - mulli r19,r19,ppeSize ; Find offset to the owner per_proc_entry - ori r18,r18,lo16(EXT(PerProcTable)) ; Set base PerProcTable - li r16,FPUowner ; Displacement to float owner - add r19,r18,r19 ; Point to the owner per_proc_entry - lwz r19,ppe_vaddr(r19) ; Point to the owner per_proc - -fsinvothr: lwarx r18,r16,r19 ; Get the owner - sub r0,r18,r29 ; Subtract one from the other - sub r11,r29,r18 ; Subtract the other from the one - or r11,r11,r0 ; Combine them - srawi r11,r11,31 ; Get a 0 if equal or -1 of not - and r18,r18,r11 ; Make 0 if same, unchanged if not - stwcx. r18,r16,r19 ; Try to invalidate it - bne-- fsinvothr ; Try again if there was a collision... - - cmplwi cr1,r14,0 ; Do we possibly have some context to load? - la r11,savefp0(r14) ; Point to first line to bring in - stw r15,FPUlevel(r29) ; Set the "new" active level - eieio - stw r29,FPUowner(r26) ; Mark us as having the live context - - beq++ cr1,MakeSureThatNoTerroristsCanHurtUsByGod ; No "new" context to load... 
- - dcbt 0,r11 ; Touch line in - - lwz r0,SAVlevel(r14) ; Get the level of first facility savearea - lwz r3,SAVprev+4(r14) ; Get the previous context - cmplw r0,r15 ; Top level correct to load? - li r7,0 ; Get the unlock value - bne-- MakeSureThatNoTerroristsCanHurtUsByGod ; No, go initialize... - - stw r3,FPUsave(r29) ; Pop the context (we will toss the savearea later) - -#if FPVECDBG - lis r0,hi16(CutTrace) ; (TEST/DEBUG) - li r2,0x7F05 ; (TEST/DEBUG) - oris r0,r0,lo16(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) -#endif - - eieio ; Make sure that these updates make it out - stw r7,FPUsync(r29) ; Unlock context now that the context save has been removed - -// Note this code is used both by 32- and 128-byte processors. This means six extra DCBTs -// are executed on a 128-byte machine, but that is better than a mispredicted branch. - - la r11,savefp4(r14) ; Point to next line - dcbt 0,r11 ; Touch line in - lfd f0, savefp0(r14) - lfd f1,savefp1(r14) - lfd f2,savefp2(r14) - la r11,savefp8(r14) ; Point to next line - lfd f3,savefp3(r14) - dcbt 0,r11 ; Touch line in - lfd f4,savefp4(r14) - lfd f5,savefp5(r14) - lfd f6,savefp6(r14) - la r11,savefp12(r14) ; Point to next line - lfd f7,savefp7(r14) - dcbt 0,r11 ; Touch line in - lfd f8,savefp8(r14) - lfd f9,savefp9(r14) - lfd f10,savefp10(r14) - la r11,savefp16(r14) ; Point to next line - lfd f11,savefp11(r14) - dcbt 0,r11 ; Touch line in - lfd f12,savefp12(r14) - lfd f13,savefp13(r14) - lfd f14,savefp14(r14) - la r11,savefp20(r14) ; Point to next line - lfd f15,savefp15(r14) - dcbt 0,r11 ; Touch line in - lfd f16,savefp16(r14) - lfd f17,savefp17(r14) - lfd f18,savefp18(r14) - la r11,savefp24(r14) ; Point to next line - lfd f19,savefp19(r14) - dcbt 0,r11 ; Touch line in - lfd f20,savefp20(r14) - lfd f21,savefp21(r14) - la r11,savefp28(r14) ; Point to next line - lfd f22,savefp22(r14) - lfd f23,savefp23(r14) - dcbt 0,r11 ; Touch line in - lfd f24,savefp24(r14) - lfd f25,savefp25(r14) - lfd f26,savefp26(r14) - lfd 
f27,savefp27(r14) - lfd f28,savefp28(r14) - lfd f29,savefp29(r14) - lfd f30,savefp30(r14) - lfd f31,savefp31(r14) - - mr r3,r14 ; Get the old savearea (we popped it before) - bl EXT(save_ret) ; Toss it - -fsenable: lwz r8,savesrr1+4(r25) ; Get the msr of the interrupted guy - ori r8,r8,MASK(MSR_FP) ; Enable the floating point feature - lwz r10,ACT_MACT_SPF(r17) ; Get the act special flags - lwz r11,spcFlags(r26) ; Get per_proc spec flags cause not in sync with act - oris r10,r10,hi16(floatUsed|floatCng) ; Set that we used floating point - oris r11,r11,hi16(floatUsed|floatCng) ; Set that we used floating point - rlwinm. r0,r8,0,MSR_PR_BIT,MSR_PR_BIT ; See if we are doing this for user state - stw r8,savesrr1+4(r25) ; Set the msr of the interrupted guy - mr r3,r25 ; Pass the virtual addres of savearea - beq- fsnuser ; We are not user state... - stw r10,ACT_MACT_SPF(r17) ; Set the activation copy - stw r11,spcFlags(r26) ; Set per_proc copy - -fsnuser: -#if FPVECDBG - lis r0,hi16(CutTrace) ; (TEST/DEBUG) - li r2,0x7F07 ; (TEST/DEBUG) - oris r0,r0,lo16(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) -#endif - - b EXT(exception_exit) ; Exit to the fray... 
- -/* - * Initialize the registers to some bogus value - */ - -MakeSureThatNoTerroristsCanHurtUsByGod: - -#if FPVECDBG - lis r0,hi16(CutTrace) ; (TEST/DEBUG) - li r2,0x7F06 ; (TEST/DEBUG) - oris r0,r0,lo16(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) -#endif - lis r5,hi16(EXT(FloatInit)) ; Get top secret floating point init value address - li r7,0 ; Get the unlock value - ori r5,r5,lo16(EXT(FloatInit)) ; Slam bottom - eieio ; Make sure that these updates make it out - stw r7,FPUsync(r29) ; Unlock it now that the context has been removed - - lfd f0,0(r5) ; Initialize FP0 - fmr f1,f0 ; Do them all - fmr f2,f0 - fmr f3,f0 - fmr f4,f0 - fmr f5,f0 - fmr f6,f0 - fmr f7,f0 - fmr f8,f0 - fmr f9,f0 - fmr f10,f0 - fmr f11,f0 - fmr f12,f0 - fmr f13,f0 - fmr f14,f0 - fmr f15,f0 - fmr f16,f0 - fmr f17,f0 - fmr f18,f0 - fmr f19,f0 - fmr f20,f0 - fmr f21,f0 - fmr f22,f0 - fmr f23,f0 - fmr f24,f0 - fmr f25,f0 - fmr f26,f0 - fmr f27,f0 - fmr f28,f0 - fmr f29,f0 - fmr f30,f0 - fmr f31,f0 - b fsenable ; Finish setting it all up... - - -; -; We get here when we are switching to the same context at the same level and the context -; is still live. Essentially, all we are doing is turning on the facility. It may have -; gotten turned off due to doing a context save for the current level or a context switch -; back to the live guy. -; - - .align 5 - - -fsthesamel: li r7,0 ; Get the unlock value - eieio ; Make sure that these updates make it out - stw r7,FPUsync(r22) ; Unlock it. - -fsthesame: - -#if FPVECDBG - lis r0,hi16(CutTrace) ; (TEST/DEBUG) - li r2,0x7F0A ; (TEST/DEBUG) - oris r0,r0,lo16(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) -#endif - beq- cr1,fsenable ; Not saved yet, nothing to pop, go enable and exit... - - lwz r11,SAVlevel(r30) ; Get the level of top saved context - lwz r14,SAVprev+4(r30) ; Get the previous savearea - - cmplw r11,r31 ; Are live and saved the same? - - bne++ fsenable ; Level not the same, nothing to pop, go enable and exit... 
- - mr r3,r30 ; Get the old savearea (we popped it before) - stw r14,FPUsave(r22) ; Pop the savearea from the stack - bl EXT(save_ret) ; Toss it - b fsenable ; Go enable and exit... - -; -; Note that we need to choke in this code rather than panic because there is no -; stack. -; - -fswtimeout: lis r0,hi16(Choke) ; Choke code - ori r0,r0,lo16(Choke) ; and the rest - li r3,failTimeout ; Timeout code - sc ; System ABEND - -fsntimeout: lis r0,hi16(Choke) ; Choke code - ori r0,r0,lo16(Choke) ; and the rest - li r3,failTimeout ; Timeout code - sc ; System ABEND - -vswtimeout0: - lis r0,hi16(Choke) ; Choke code - ori r0,r0,lo16(Choke) ; and the rest - li r3,failTimeout ; Timeout code - sc ; System ABEND - -vswtimeout1: - lis r0,hi16(Choke) ; Choke code - ori r0,r0,lo16(Choke) ; and the rest - li r3,failTimeout ; Timeout code - sc ; System ABEND - -; -; This function invalidates any live floating point context for the passed in facility_context. -; This is intended to be called just before act_machine_sv_free tosses saveareas. -; - - .align 5 - .globl EXT(toss_live_fpu) - -LEXT(toss_live_fpu) - - lis r0,hi16(MASK(MSR_VEC)) ; Get VEC - mfmsr r9 ; Get the MSR - ori r0,r0,lo16(MASK(MSR_FP)) ; Add in FP - rlwinm. r8,r9,0,MSR_FP_BIT,MSR_FP_BIT ; Are floats on right now? - andc r9,r9,r0 ; Force off VEC and FP - ori r0,r0,lo16(MASK(MSR_EE)) ; Turn off EE - andc r0,r9,r0 ; Turn off EE now - mtmsr r0 ; No interruptions - isync - beq+ tlfnotours ; Floats off, can not be live here... - - mfsprg r8,1 ; Get the current activation - lwz r8,ACT_PER_PROC(r8) ; Get the per_proc block - -; -; Note that at this point, since floats are on, we are the owner -; of live state on this processor -; - - lwz r6,FPUowner(r8) ; Get the thread that owns the floats - li r0,0 ; Clear this just in case we need it - cmplw r6,r3 ; Are we tossing our own context? - bne-- tlfnotours ; Nope... 
- - lfd f1,Zero(0) ; Make a 0 - mtfsf 0xFF,f1 ; Clear it - -tlfnotours: lwz r11,FPUcpu(r3) ; Get the cpu on which we last loaded context - lis r12,hi16(EXT(PerProcTable)) ; Set base PerProcTable - mulli r11,r11,ppeSize ; Find offset to the owner per_proc_entry - ori r12,r12,lo16(EXT(PerProcTable)) ; Set base PerProcTable - li r10,FPUowner ; Displacement to float owner - add r11,r12,r11 ; Point to the owner per_proc_entry - lwz r11,ppe_vaddr(r11) ; Point to the owner per_proc - -tlfinvothr: lwarx r12,r10,r11 ; Get the owner - - sub r0,r12,r3 ; Subtract one from the other - sub r8,r3,r12 ; Subtract the other from the one - or r8,r8,r0 ; Combine them - srawi r8,r8,31 ; Get a 0 if equal or -1 of not - and r12,r12,r8 ; Make 0 if same, unchanged if not - stwcx. r12,r10,r11 ; Try to invalidate it - bne-- tlfinvothr ; Try again if there was a collision... - - mtmsr r9 ; Restore interruptions - isync ; Could be turning off floats here - blr ; Leave... - - -/* - * Altivec stuff is here. The techniques used are pretty identical to - * the floating point. Except that we will honor the VRSAVE register - * settings when loading and restoring registers. - * - * There are two indications of saved VRs: the VRSAVE register and the vrvalid - * mask. VRSAVE is set by the vector user and represents the VRs that they - * say that they are using. The vrvalid mask indicates which vector registers - * are saved in the savearea. Whenever context is saved, it is saved according - * to the VRSAVE register. It is loaded based on VRSAVE anded with - * vrvalid (all other registers are splatted with 0s). This is done because we - * don't want to load any registers we don't have a copy of, we want to set them - * to zero instead. - * - * Note that there are some oddities here when we save a context we are using. - * It is really not too cool to do this, but what the hey... Anyway, - * we turn vectors and fpu off before we leave. 
- * The oddity is that if you use vectors after this, the - * savearea containing the context just saved will go away. So, bottom line is - * that don't use vectors until after you are done with the saved context. - * - */ - - .align 5 - .globl EXT(vec_save) - -LEXT(vec_save) - - - lis r2,hi16(MASK(MSR_VEC)) ; Get VEC - mfmsr r0 ; Get the MSR - ori r2,r2,lo16(MASK(MSR_FP)) ; Add in FP - andc r0,r0,r2 ; Force off VEC and FP - ori r2,r2,lo16(MASK(MSR_EE)) ; Clear EE - andc r2,r0,r2 ; Clear EE for now - oris r2,r2,hi16(MASK(MSR_VEC)) ; Enable the vector facility for now also - mtmsr r2 ; Set the MSR - isync - - mfsprg r6,1 ; Get the current activation - lwz r6,ACT_PER_PROC(r6) ; Get the per_proc block - lwz r12,VMXowner(r6) ; Get the context ID for owner - -#if FPVECDBG - mr r11,r6 ; (TEST/DEBUG) - mr r7,r0 ; (TEST/DEBUG) - li r4,0 ; (TEST/DEBUG) - mr r10,r3 ; (TEST/DEBUG) - lis r0,hi16(CutTrace) ; (TEST/DEBUG) - mr. r3,r12 ; (TEST/DEBUG) - li r2,0x5F00 ; (TEST/DEBUG) - li r5,0 ; (TEST/DEBUG) - lwz r6,liveVRS(r6) ; (TEST/DEBUG) - beq-- noowneryeu ; (TEST/DEBUG) - lwz r4,VMXlevel(r12) ; (TEST/DEBUG) - lwz r5,VMXsave(r12) ; (TEST/DEBUG) - -noowneryeu: oris r0,r0,lo16(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) - mr r0,r7 ; (TEST/DEBUG) - mr r3,r10 ; (TEST/DEBUG) - mr r6,r11 ; (TEST/DEBUG) -#endif - mflr r2 ; Save the return address - - cmplw r3,r12 ; Is the specified context live? - lhz r11,PP_CPU_NUMBER(r6) ; Get our CPU number - bne-- vsret ; We do not own the vector, no save required... - lwz r9,VMXcpu(r12) ; Get the cpu that context was last on - - cmplw r9,r11 ; Was the context for this processor? - la r5,VMXsync(r3) ; Point to the sync word - bne-- vsret ; Specified context is not live - -; -; It looks like we need to save this one. Or possibly toss a saved one if -; the VRSAVE is 0. -; -; First, make sure that the live context block is not mucked with while -; we are trying to save it on out. Then we will give it the final check. 
-; - - lis r9,ha16(EXT(LockTimeOut)) ; Get the high part - mftb r8 ; Get the time now - lwz r9,lo16(EXT(LockTimeOut))(r9) ; Get the timeout value - b vssync0a ; Jump to the lock... - - .align 5 - -vssync0: li r7,lgKillResv ; Get killing field - stwcx. r7,0,r7 ; Kill reservation - -vssync0a: lwz r7,0(r5) ; Sniff the lock - mftb r10 ; Is it time yet? - cmplwi cr1,r7,0 ; Is it locked? - sub r10,r10,r8 ; How long have we been spinning? - cmplw r10,r9 ; Has it been too long? - bgt-- vswtimeout0 ; Way too long, panic... - bne-- cr1,vssync0a ; Yea, still locked so sniff harder... - -vssync1: lwarx r7,0,r5 ; Get the sync word - li r12,1 ; Get the lock - mr. r7,r7 ; Is it unlocked? - bne-- vssync0 ; No, it is unlocked... - stwcx. r12,0,r5 ; Store lock and test reservation - bne-- vssync1 ; Try again if lost reservation... - - isync ; Toss speculation - - lwz r12,VMXowner(r6) ; Get the context ID for owner - cmplw r3,r12 ; Check again if we own VMX? - lwz r10,liveVRS(r6) ; Get the right VRSave register - bne-- vsretlk ; Go unlock and return since we no longer own context - - lwz r5,VMXcpu(r12) ; Get the cpu that context was last on - lwz r7,VMXsave(r12) ; Get the current vector savearea for the thread - cmplwi cr1,r10,0 ; Is VRsave set to 0? - cmplw r5,r11 ; Is this for the same processor? - lwz r9,VMXlevel(r12) ; Get our current level indicator - bne-- vsretlk ; Not the same processor, skip any save... - - cmplwi r7,0 ; Have we ever saved this facility context? - beq-- vsneedone ; Never saved it, so we need an area... - - lwz r8,SAVlevel(r7) ; Get the level this savearea is for - cmplw r9,r8 ; Correct level? - bne-- vsneedone ; Different level, so we need to save... - - bne++ cr1,vsretlk ; VRsave is non-zero so we need to keep what is saved... - - lwz r4,SAVprev+4(r7) ; Pick up the previous area - li r5,0 ; Assume we just dumped the last - mr. r4,r4 ; Is there one? - stw r4,VMXsave(r12) ; Dequeue this savearea - beq-- vsnomore ; We do not have another... 
- - lwz r5,SAVlevel(r4) ; Get the level associated with save - -vsnomore: stw r5,VMXlevel(r12) ; Save the level - li r7,0 ; Clear - stw r7,VMXowner(r6) ; Show no live context here - -vsbackout: mr r4,r0 ; restore the saved MSR - eieio - stw r7,VMXsync(r12) ; Unlock the context - - b EXT(save_ret_wMSR) ; Toss the savearea and return from there... - - .align 5 - -vsneedone: beq-- cr1,vsclrlive ; VRSave is zero, go blow away the context... - - bl EXT(save_get) ; Get a savearea for the context - - mfsprg r6,1 ; Get the current activation - lwz r6,ACT_PER_PROC(r6) ; Get the per_proc block - li r4,SAVvector ; Get vector tag - lwz r12,VMXowner(r6) ; Get back our context ID - stb r4,SAVflags+2(r3) ; Mark this savearea as a vector - mr. r12,r12 ; See if we were disowned while away. Very, very small chance of it... - li r7,0 ; Clear - beq-- vsbackout ; If disowned, just toss savearea... - lwz r4,facAct(r12) ; Get the activation associated with live context - lwz r8,VMXsave(r12) ; Get the current top vector savearea - stw r4,SAVact(r3) ; Indicate the right activation for this context - lwz r9,VMXlevel(r12) ; Get our current level indicator again - stw r3,VMXsave(r12) ; Set this as the most current floating point context - stw r8,SAVprev+4(r3) ; And then chain this in front - - stw r9,SAVlevel(r3) ; Set level in savearea - mfcr r12 ; save CRs across call to vr_store - lwz r10,liveVRS(r6) ; Get the right VRSave register - - bl vr_store ; store live VRs into savearea as required (uses r4-r11) - - mfsprg r6,1 ; Get the current activation - mtcrf 255,r12 ; Restore the non-volatile CRs - lwz r6,ACT_PER_PROC(r6) ; Get the per_proc block - mtlr r2 ; Restore return address - lwz r12,VMXowner(r6) ; Get back our context ID - -vsretlk: li r7,0 ; Get the unlock value - eieio ; Make sure that these updates make it out - stw r7,VMXsync(r12) ; Unlock it - -vsret: mtmsr r0 ; Put interrupts on if they were and vector off - isync - - blr - -vsclrlive: li r7,0 ; Clear - stw r7,VMXowner(r6) ; 
Show no live context here - b vsretlk ; Go unlock and leave... - -/* - * vec_switch() - * - * Entered to handle the vector unavailable exception and - * switch vector context - * - * This code is run with virtual address mode on and interrupts off. - * - * Upon exit, the code returns to the users context with the vector - * facility turned on. - * - * ENTRY: VM switched ON - * Interrupts OFF - * State is saved in savearea pointed to by R4. - * All other registers are free. - * - */ - - .align 5 - .globl EXT(vec_switch) - -LEXT(vec_switch) - -#if DEBUG - lis r3,hi16(EXT(vec_trap_count)) ; Get address of vector trap counter - ori r3,r3,lo16(EXT(vec_trap_count)) ; Get address of vector trap counter - lwz r1,0(r3) - addi r1,r1,1 - stw r1,0(r3) -#endif /* DEBUG */ - - mfsprg r17,1 ; Get the current activation - lwz r26,ACT_PER_PROC(r17) ; Get the per_proc block - mfmsr r19 ; Get the current MSR - - mr r25,r4 ; Save the entry savearea - oris r19,r19,hi16(MASK(MSR_VEC)) ; Enable the vector feature - lwz r22,VMXowner(r26) ; Get the thread that owns the vector - - mtmsr r19 ; Enable vector instructions - isync - - lwz r27,ACT_MACT_PCB(r17) ; Get the current level - lwz r29,curctx(r17) ; Grab the current context anchor of the current thread - -; R22 has the "old" context anchor -; R29 has the "new" context anchor - -#if FPVECDBG - lis r0,HIGH_ADDR(CutTrace) ; (TEST/DEBUG) - li r2,0x5F01 ; (TEST/DEBUG) - mr r3,r22 ; (TEST/DEBUG) - mr r5,r29 ; (TEST/DEBUG) - lwz r6,liveVRS(r26) ; (TEST/DEBUG) - oris r0,r0,LOW_ADDR(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) -#endif - - lhz r16,PP_CPU_NUMBER(r26) ; Get the current CPU number - - mr. r22,r22 ; See if there is any live vector status - la r15,VMXsync(r22) ; Point to the sync word - - beq-- vswnosave ; No live context, so nothing to save... - - lwz r18,VMXcpu(r22) ; Get the last CPU we ran on - cmplw cr2,r22,r29 ; Are both old and new the same context? 
- lwz r30,VMXsave(r22) ; Get the top savearea - cmplwi cr1,r30,0 ; Anything saved yet? - lwz r31,VMXlevel(r22) ; Get the context level - cmplw r18,r16 ; Make sure we are on the right processor - - lwz r10,liveVRS(r26) ; Get the right VRSave register - - bne-- vswnosave ; No, not on the same processor... - -; -; Check to see if the live context has already been saved. -; Also check to see if all we are here just to re-enable the MSR -; and handle specially if so. -; - - cmplw r31,r27 ; See if the current and active levels are the same - crand cr0_eq,cr2_eq,cr0_eq ; Remember if both the levels and contexts are the same - - beq-- vswthesame ; New and old are the same, just go enable... - -; -; Make sure that the live context block is not mucked with while -; we are trying to save it on out -; - - lis r11,ha16(EXT(LockTimeOut)) ; Get the high part - mftb r3 ; Get the time now - lwz r11,lo16(EXT(LockTimeOut))(r11) ; Get the timeout value - b vswsync0a ; Jump to the lock... - - .align 5 - -vswsync0: li r19,lgKillResv ; Get killing field - stwcx. r19,0,r19 ; Kill reservation - -vswsync0a: lwz r19,0(r15) ; Sniff the lock - mftb r18 ; Is it time yet? - cmplwi cr1,r19,0 ; Is it locked? - sub r18,r18,r3 ; How long have we been spinning? - cmplw r18,r11 ; Has it been too long? - bgt-- vswtimeout0 ; Way too long, panic... - bne-- cr1,vswsync0a ; Yea, still locked so sniff harder... - -vswsync1: lwarx r19,0,r15 ; Get the sync word - li r0,1 ; Get the lock - mr. r19,r19 ; Is it unlocked? - bne-- vswsync0 - stwcx. r0,0,r15 ; Store lock and test reservation - bne-- vswsync1 ; Try again if lost reservation... - - isync ; Toss speculation - -; -; Note that now that we have the lock, we need to check if anything changed. -; Also note that the possible changes are limited. The context owner can -; never change to a different thread or level although it can be invalidated. -; A new context can not be pushed on top of us, but it can be popped. 
The -; cpu indicator will always change if another processor mucked with any -; contexts. -; -; It should be very rare that any of the context stuff changes across the lock. -; - - lwz r0,VMXowner(r26) ; Get the thread that owns the vectors again - lwz r11,VMXsave(r22) ; Get the top savearea again - lwz r18,VMXcpu(r22) ; Get the last CPU we ran on again - sub r0,r0,r22 ; Non-zero if we lost ownership, 0 if not - xor r11,r11,r30 ; Non-zero if saved context changed, 0 if not - xor r18,r18,r16 ; Non-zero if cpu changed, 0 if not - cmplwi cr1,r30,0 ; Is anything saved? - or r0,r0,r11 ; Zero only if both owner and context are unchanged - or. r0,r0,r18 ; Zero only if nothing has changed - cmplwi cr2,r10,0 ; Check VRSave to see if we really need to save anything... - li r8,0 ; Clear - - bne-- vswnosavelk ; Something has changed, so this is not ours to save... - beq-- cr1,vswmstsave ; There is no context saved yet... - - lwz r11,SAVlevel(r30) ; Get the level of top saved context - - cmplw r31,r11 ; Are live and saved the same? - -#if FPVECDBG - lis r0,hi16(CutTrace) ; (TEST/DEBUG) - li r2,0x5F02 ; (TEST/DEBUG) - mr r3,r30 ; (TEST/DEBUG) - mr r5,r31 ; (TEST/DEBUG) - oris r0,r0,lo16(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) -#endif - - beq++ vswnosavelk ; Same level, already saved... - bne-- cr2,vswnosavelk ; Live context saved and VRSave not 0, no save and keep context... - - lwz r4,SAVprev+4(r30) ; Pick up the previous area - li r5,0 ; Assume this is the only one (which should be the ususal case) - mr. r4,r4 ; Was this the only one? - stw r4,VMXsave(r22) ; Dequeue this savearea - beq++ vswonlyone ; This was the only one... - lwz r5,SAVlevel(r4) ; Get the level associated with previous save - -vswonlyone: stw r5,VMXlevel(r22) ; Save the level - stw r8,VMXowner(r26) ; Clear owner - - mr r3,r30 ; Copy the savearea we are tossing - bl EXT(save_ret) ; Toss the savearea - b vswnosavelk ; Go load up the context... 
- - .align 5 - -vswmstsave: stw r8,VMXowner(r26) ; Clear owner - beq-- cr2,vswnosavelk ; The VRSave was 0, so there is nothing to save... - - bl EXT(save_get) ; Go get a savearea - - lwz r12,facAct(r22) ; Get the activation associated with the context - stw r3,VMXsave(r22) ; Set this as the latest context savearea for the thread - stw r30,SAVprev+4(r3) ; Point us to the old context - stw r31,SAVlevel(r3) ; Tag our level - li r7,SAVvector ; Get the vector ID - stw r12,SAVact(r3) ; Make sure we point to the right guy - stb r7,SAVflags+2(r3) ; Set that we have a vector save area - -#if FPVECDBG - lis r0,hi16(CutTrace) ; (TEST/DEBUG) - li r2,0x5F03 ; (TEST/DEBUG) - oris r0,r0,lo16(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) -#endif - - lwz r10,liveVRS(r26) ; Get the right VRSave register - bl vr_store ; store VRs into savearea according to vrsave (uses r4-r11) - -; -; The context is all saved now and the facility is free. -; -; Check if we need to fill the registers with junk, because this level has -; never used them before and some thieving bastard could hack the old values -; of some thread! Just imagine what would happen if they could! Why, nothing -; would be safe! My God! It is terrifying! -; -; Also, along the way, thanks to Ian Ollmann, we generate the 0x7FFFDEAD (QNaNbarbarian) -; constant that we may need to fill unused vector registers. -; -; Make sure that the live context block is not mucked with while -; we are trying to load it up -; - -vswnosavelk: - li r7,0 ; Get the unlock value - eieio ; Make sure that these updates make it out - stw r7,VMXsync(r22) ; Unlock the old context - -vswnosave: la r15,VMXsync(r29) ; Point to the sync word - lis r11,ha16(EXT(LockTimeOut)) ; Get the high part - mftb r3 ; Get the time now - lwz r11,lo16(EXT(LockTimeOut))(r11) ; Get the timeout value - b vswnsync0a ; Jump to the lock... - - .align 5 - -vswnsync0: li r19,lgKillResv ; Get killing field - stwcx. 
r19,0,r19 ; Kill reservation - -vswnsync0a: lwz r19,0(r15) ; Sniff the lock - mftb r18 ; Is it time yet? - cmplwi cr1,r19,0 ; Is it locked? - sub r18,r18,r3 ; How long have we been spinning? - cmplw r18,r11 ; Has it been too long? - bgt-- vswtimeout1 ; Way too long, panic... - bne-- cr1,vswnsync0a ; Yea, still locked so sniff harder... - -vswnsync1: lwarx r19,0,r15 ; Get the sync word - li r0,1 ; Get the lock - mr. r19,r19 ; Is it unlocked? - bne-- vswnsync0 ; Unfortunately, it is locked... - stwcx. r0,0,r15 ; Store lock and test reservation - bne-- vswnsync1 ; Try again if lost reservation... - - isync ; Toss speculation - - vspltisb v31,-10 ; Get 0xF6F6F6F6 - lwz r15,ACT_MACT_PCB(r17) ; Get the current level of the "new" one - vspltisb v30,5 ; Get 0x05050505 - lwz r19,VMXcpu(r29) ; Get the last CPU we ran on - vspltish v29,4 ; Get 0x00040004 - lwz r14,VMXsave(r29) ; Point to the top of the "new" context stack - vrlb v31,v31,v30 ; Get 0xDEDEDEDE - - stw r16,VMXcpu(r29) ; Claim context for us - eieio - -#if FPVECDBG - lwz r13,VMXlevel(r29) ; (TEST/DEBUG) - lis r0,hi16(CutTrace) ; (TEST/DEBUG) - li r2,0x5F04 ; (TEST/DEBUG) - mr r1,r15 ; (TEST/DEBUG) - mr r3,r14 ; (TEST/DEBUG) - mr r5,r13 ; (TEST/DEBUG) - oris r0,r0,lo16(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) -#endif - - lis r18,hi16(EXT(PerProcTable)) ; Set base PerProcTable - vspltisb v28,-2 ; Get 0xFEFEFEFE - mulli r19,r19,ppeSize ; Find offset to the owner per_proc_entry - vsubuhm v31,v31,v29 ; Get 0xDEDADEDA - ori r18,r18,lo16(EXT(PerProcTable)) ; Set base PerProcTable - vpkpx v30,v28,v3 ; Get 0x7FFF7FFF - li r16,VMXowner ; Displacement to vector owner - add r19,r18,r19 ; Point to the owner per_proc_entry - lwz r19,ppe_vaddr(r19) ; Point to the owner per_proc - vrlb v31,v31,v29 ; Get 0xDEADDEAD - -vswinvothr: lwarx r18,r16,r19 ; Get the owner - - sub r0,r18,r29 ; Subtract one from the other - sub r11,r29,r18 ; Subtract the other from the one - or r11,r11,r0 ; Combine them - srawi r11,r11,31 ; Get a 0 if 
equal or -1 of not - and r18,r18,r11 ; Make 0 if same, unchanged if not - stwcx. r18,r16,r19 ; Try to invalidate it - bne-- vswinvothr ; Try again if there was a collision... - - cmplwi cr1,r14,0 ; Do we possibly have some context to load? - vmrghh v31,v30,v31 ; Get 0x7FFFDEAD. V31 keeps this value until the bitter end - stw r15,VMXlevel(r29) ; Set the "new" active level - eieio - stw r29,VMXowner(r26) ; Mark us as having the live context - - beq-- cr1,ProtectTheAmericanWay ; Nothing to restore, first time use... - - lwz r3,SAVprev+4(r14) ; Get the previous context - lwz r0,SAVlevel(r14) ; Get the level of first facility savearea - cmplw r0,r15 ; Top level correct to load? - bne-- ProtectTheAmericanWay ; No, go initialize... - - stw r3,VMXsave(r29) ; Pop the context (we will toss the savearea later) - -#if FPVECDBG - lis r0,hi16(CutTrace) ; (TEST/DEBUG) - li r2,0x5F05 ; (TEST/DEBUG) - oris r0,r0,lo16(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) -#endif - - lwz r10,savevrvalid(r14) ; Get the valid VRs in the savearea - lwz r22,savevrsave(r25) ; Get the most current VRSAVE - and r10,r10,r22 ; Figure out just what registers need to be loaded - mr r3,r14 ; r3 <- ptr to savearea with VRs - bl vr_load ; load VRs from save area based on vrsave in r10 - - bl EXT(save_ret) ; Toss the save area after loading VRs - -vrenablelk: li r7,0 ; Get the unlock value - eieio ; Make sure that these updates make it out - stw r7,VMXsync(r29) ; Unlock the new context - -vrenable: lwz r8,savesrr1+4(r25) ; Get the msr of the interrupted guy - oris r8,r8,hi16(MASK(MSR_VEC)) ; Enable the vector facility - lwz r10,ACT_MACT_SPF(r17) ; Get the act special flags - lwz r11,spcFlags(r26) ; Get per_proc spec flags cause not in sync with act - oris r10,r10,hi16(vectorUsed|vectorCng) ; Set that we used vectors - oris r11,r11,hi16(vectorUsed|vectorCng) ; Set that we used vectors - rlwinm. 
r0,r8,0,MSR_PR_BIT,MSR_PR_BIT ; See if we are doing this for user state - stw r8,savesrr1+4(r25) ; Set the msr of the interrupted guy - mr r3,r25 ; Pass virtual address of the savearea - beq- vrnuser ; We are not user state... - stw r10,ACT_MACT_SPF(r17) ; Set the activation copy - stw r11,spcFlags(r26) ; Set per_proc copy - -vrnuser: -#if FPVECDBG - lis r0,hi16(CutTrace) ; (TEST/DEBUG) - li r2,0x5F07 ; (TEST/DEBUG) - oris r0,r0,lo16(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) -#endif - b EXT(exception_exit) ; Exit to the fray... - -/* - * Initialize the registers to some bogus value - */ - -ProtectTheAmericanWay: - -#if FPVECDBG - lis r0,hi16(CutTrace) ; (TEST/DEBUG) - li r2,0x5F06 ; (TEST/DEBUG) - oris r0,r0,lo16(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) -#endif - - vor v0,v31,v31 ; Copy into the next register - vor v1,v31,v31 ; Copy into the next register - vor v2,v31,v31 ; Copy into the next register - vor v3,v31,v31 ; Copy into the next register - vor v4,v31,v31 ; Copy into the next register - vor v5,v31,v31 ; Copy into the next register - vor v6,v31,v31 ; Copy into the next register - vor v7,v31,v31 ; Copy into the next register - vor v8,v31,v31 ; Copy into the next register - vor v9,v31,v31 ; Copy into the next register - vor v10,v31,v31 ; Copy into the next register - vor v11,v31,v31 ; Copy into the next register - vor v12,v31,v31 ; Copy into the next register - vor v13,v31,v31 ; Copy into the next register - vor v14,v31,v31 ; Copy into the next register - vor v15,v31,v31 ; Copy into the next register - vor v16,v31,v31 ; Copy into the next register - vor v17,v31,v31 ; Copy into the next register - vor v18,v31,v31 ; Copy into the next register - vor v19,v31,v31 ; Copy into the next register - vor v20,v31,v31 ; Copy into the next register - vor v21,v31,v31 ; Copy into the next register - vor v22,v31,v31 ; Copy into the next register - vor v23,v31,v31 ; Copy into the next register - vor v24,v31,v31 ; Copy into the next register - vor v25,v31,v31 ; Copy 
into the next register - vor v26,v31,v31 ; Copy into the next register - vor v27,v31,v31 ; Copy into the next register - vor v28,v31,v31 ; Copy into the next register - vor v29,v31,v31 ; Copy into the next register - vor v30,v31,v31 ; Copy into the next register - b vrenablelk ; Finish setting it all up... - - - -; -; We get here when we are switching to the same context at the same level and the context -; is still live. Essentially, all we are doing is turning on the faility. It may have -; gotten turned off due to doing a context save for the current level or a context switch -; back to the live guy. -; - - .align 5 - -vswthesame: - -#if FPVECDBG - lis r0,hi16(CutTrace) ; (TEST/DEBUG) - li r2,0x5F0A ; (TEST/DEBUG) - oris r0,r0,lo16(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) -#endif - beq- cr1,vrenable ; Not saved yet, nothing to pop, go enable and exit... - - lwz r11,SAVlevel(r30) ; Get the level of top saved context - lwz r14,SAVprev+4(r30) ; Get the previous savearea - - cmplw r11,r31 ; Are live and saved the same? - - bne+ vrenable ; Level not the same, nothing to pop, go enable and exit... - - mr r3,r30 ; Get the old savearea (we popped it before) - stw r11,VMXsave(r22) ; Pop the vector stack - bl EXT(save_ret) ; Toss it - b vrenable ; Go enable and exit... - - -; -; This function invalidates any live vector context for the passed in facility_context. -; This is intended to be called just before act_machine_sv_free tosses saveareas. -; - - .align 5 - .globl EXT(toss_live_vec) - -LEXT(toss_live_vec) - - lis r0,hi16(MASK(MSR_VEC)) ; Get VEC - mfmsr r9 ; Get the MSR - ori r0,r0,lo16(MASK(MSR_FP)) ; Add in FP - rlwinm. r8,r9,0,MSR_VEC_BIT,MSR_VEC_BIT ; Are vectors on right now? - andc r9,r9,r0 ; Force off VEC and FP - ori r0,r0,lo16(MASK(MSR_EE)) ; Turn off EE - andc r0,r9,r0 ; Turn off EE now - mtmsr r0 ; No interruptions - isync - beq+ tlvnotours ; Vector off, can not be live here... 
- - mfsprg r8,1 ; Get the current activation - lwz r8,ACT_PER_PROC(r8) ; Get the per_proc block - -; -; Note that at this point, since vecs are on, we are the owner -; of live state on this processor -; - - lwz r6,VMXowner(r8) ; Get the thread that owns the vector - li r0,0 ; Clear this just in case we need it - cmplw r6,r3 ; Are we tossing our own context? - bne- tlvnotours ; Nope... - - vspltish v1,1 ; Turn on the non-Java bit and saturate - vspltisw v0,1 ; Turn on the saturate bit - vxor v1,v1,v0 ; Turn off saturate - mtspr vrsave,r0 ; Clear VRSAVE - mtvscr v1 ; Set the non-java, no saturate status - -tlvnotours: lwz r11,VMXcpu(r3) ; Get the cpu on which we last loaded context - lis r12,hi16(EXT(PerProcTable)) ; Set base PerProcTable - mulli r11,r11,ppeSize ; Find offset to the owner per_proc_entry - ori r12,r12,lo16(EXT(PerProcTable)) ; Set base PerProcTable - li r10,VMXowner ; Displacement to vector owner - add r11,r12,r11 ; Point to the owner per_proc_entry - lwz r11,ppe_vaddr(r11) ; Point to the owner per_proc - li r0,0 ; Set a 0 to invalidate context - -tlvinvothr: lwarx r12,r10,r11 ; Get the owner - - sub r0,r12,r3 ; Subtract one from the other - sub r8,r3,r12 ; Subtract the other from the one - or r8,r8,r0 ; Combine them - srawi r8,r8,31 ; Get a 0 if equal or -1 of not - and r12,r12,r8 ; Make 0 if same, unchanged if not - stwcx. r12,r10,r11 ; Try to invalidate it - bne-- tlvinvothr ; Try again if there was a collision... - - mtmsr r9 ; Restore interruptions - isync ; Could be turning off vectors here - blr ; Leave.... - -#if 0 -; -; This function invalidates any live vector context for the passed in facility_context -; if the level is current. It also tosses the corresponding savearea if there is one. -; This function is primarily used whenever we detect a VRSave that is all zeros. 
-; - - .align 5 - .globl EXT(vec_trash) - -LEXT(vec_trash) - - lwz r12,facAct(r3) ; Get the activation - lwz r11,VMXlevel(r3) ; Get the context level - lwz r10,ACT_MACT_PCB(r12) ; Grab the current level for the thread - lwz r9,VMXsave(r3) ; Get the savearea, if any - cmplw r10,r11 ; Are we at the right level? - cmplwi cr1,r9,0 ; Remember if there is a savearea - bnelr+ ; No, we do nothing... - - lwz r11,VMXcpu(r3) ; Get the cpu on which we last loaded context - lis r12,hi16(EXT(PerProcTable)) ; Set base PerProcTable - mulli r11,r11,ppeSize ; Find offset to the owner per_proc_entry - ori r12,r12,lo16(EXT(PerProcTable)) ; Set base PerProcTable - li r10,VMXowner ; Displacement to vector owner - add r11,r12,r11 ; Point to the owner per_proc_entry - lwz r11,ppe_vaddr(r11) ; Point to the owner per_proc - -vtinvothr: lwarx r12,r10,r11 ; Get the owner - - sub r0,r12,r3 ; Subtract one from the other - sub r8,r3,r12 ; Subtract the other from the one - or r8,r8,r0 ; Combine them - srawi r8,r8,31 ; Get a 0 if equal or -1 of not - and r12,r12,r8 ; Make 0 if same, unchanged if not - stwcx. r12,r10,r11 ; Try to invalidate it - bne-- vtinvothr ; Try again if there was a collision... - - - beqlr++ cr1 ; Leave if there is no savearea - lwz r8,SAVlevel(r9) ; Get the level of the savearea - cmplw r8,r11 ; Savearea for the current level? - bnelr++ ; No, nothing to release... - - lwz r8,SAVprev+4(r9) ; Pick up the previous area - mr. r8,r8 ; Is there a previous? - beq-- vtnoprev ; Nope... - lwz r7,SAVlevel(r8) ; Get the level associated with save - -vtnoprev: stw r8,VMXsave(r3) ; Dequeue this savearea - stw r7,VMXlevel(r3) ; Pop the level - - mr r3,r9 ; Get the savearea to release - b EXT(save_ret) ; Go and toss the save area (note, we will return from there)... -#endif - -; -; Just some test code to force vector and/or floating point in the kernel -; - - .align 5 - .globl EXT(fctx_test) - -LEXT(fctx_test) - - mfsprg r3,1 ; Get the current thread - mr. 
r3,r3 ; Are we actually up and running? - beqlr- ; No... - - fmr f0,f0 ; Use floating point - mftb r4 ; Get time base for a random number - li r5,1 ; Get a potential vrsave to use - andi. r4,r4,0x3F ; Get a number from 0 - 63 - slw r5,r5,r4 ; Choose a register to save (should be 0 half the time) - mtspr vrsave,r5 ; Set VRSave - vor v0,v0,v0 ; Use vectors - blr - - -// ******************* -// * f p _ s t o r e * -// ******************* -// -// Store FPRs into a save area. Called by fpu_save and fpu_switch. -// -// When called: -// floating pt is enabled -// r3 = ptr to save area -// -// We destroy: -// r11. - -fp_store: - mfsprg r11,2 ; get feature flags - mtcrf 0x02,r11 ; put cache line size bits in cr6 - la r11,savefp0(r3) ; point to 1st line - dcbz128 0,r11 ; establish 1st line no matter what linesize is - bt-- pf32Byteb,fp_st32 ; skip if a 32-byte machine - -// Store the FPRs on a 128-byte machine. - - stfd f0,savefp0(r3) - stfd f1,savefp1(r3) - la r11,savefp16(r3) ; Point to the 2nd cache line - stfd f2,savefp2(r3) - stfd f3,savefp3(r3) - dcbz128 0,r11 ; establish 2nd line - stfd f4,savefp4(r3) - stfd f5,savefp5(r3) - stfd f6,savefp6(r3) - stfd f7,savefp7(r3) - stfd f8,savefp8(r3) - stfd f9,savefp9(r3) - stfd f10,savefp10(r3) - stfd f11,savefp11(r3) - stfd f12,savefp12(r3) - stfd f13,savefp13(r3) - stfd f14,savefp14(r3) - stfd f15,savefp15(r3) - stfd f16,savefp16(r3) - stfd f17,savefp17(r3) - stfd f18,savefp18(r3) - stfd f19,savefp19(r3) - stfd f20,savefp20(r3) - stfd f21,savefp21(r3) - stfd f22,savefp22(r3) - stfd f23,savefp23(r3) - stfd f24,savefp24(r3) - stfd f25,savefp25(r3) - stfd f26,savefp26(r3) - stfd f27,savefp27(r3) - stfd f28,savefp28(r3) - stfd f29,savefp29(r3) - stfd f30,savefp30(r3) - stfd f31,savefp31(r3) - blr - -// Store FPRs on a 32-byte machine. 
- -fp_st32: - la r11,savefp4(r3) ; Point to the 2nd line - stfd f0,savefp0(r3) - dcbz 0,r11 ; Allocate cache - stfd f1,savefp1(r3) - stfd f2,savefp2(r3) - la r11,savefp8(r3) ; Point to the 3rd line - stfd f3,savefp3(r3) - dcbz 0,r11 ; Allocate cache - stfd f4,savefp4(r3) - stfd f5,savefp5(r3) - stfd f6,savefp6(r3) - la r11,savefp12(r3) ; Point to the 4th line - stfd f7,savefp7(r3) - dcbz 0,r11 ; Allocate cache - stfd f8,savefp8(r3) - stfd f9,savefp9(r3) - stfd f10,savefp10(r3) - la r11,savefp16(r3) ; Point to the 5th line - stfd f11,savefp11(r3) - dcbz 0,r11 ; Allocate cache - stfd f12,savefp12(r3) - stfd f13,savefp13(r3) - stfd f14,savefp14(r3) - la r11,savefp20(r3) ; Point to the 6th line - stfd f15,savefp15(r3) - dcbz 0,r11 ; Allocate cache - stfd f16,savefp16(r3) - stfd f17,savefp17(r3) - stfd f18,savefp18(r3) - la r11,savefp24(r3) ; Point to the 7th line - stfd f19,savefp19(r3) - dcbz 0,r11 ; Allocate cache - stfd f20,savefp20(r3) - - stfd f21,savefp21(r3) - stfd f22,savefp22(r3) - la r11,savefp28(r3) ; Point to the 8th line - stfd f23,savefp23(r3) - dcbz 0,r11 ; allocate it - stfd f24,savefp24(r3) - stfd f25,savefp25(r3) - stfd f26,savefp26(r3) - stfd f27,savefp27(r3) - - stfd f28,savefp28(r3) - stfd f29,savefp29(r3) - stfd f30,savefp30(r3) - stfd f31,savefp31(r3) - blr - - -// ******************* -// * v r _ s t o r e * -// ******************* -// -// Store VRs into savearea, according to bits set in passed vrsave bitfield. This routine is used -// both by vec_save and vec_switch. In order to minimize conditional branches and touching in -// unnecessary cache blocks, we either save all or none of the VRs in a block. We have separate paths -// for each cache block size. -// -// When called: -// interrupts are off, vectors are enabled -// r3 = ptr to save area -// r10 = vrsave (not 0) -// -// We destroy: -// r4 - r11, all CRs. 
- -vr_store: - mfsprg r9,2 ; get feature flags - stw r10,savevrvalid(r3) ; Save the validity information in savearea - slwi r8,r10,1 ; Shift over 1 - mtcrf 0x02,r9 ; put cache line size bits in cr6 where we can test - or r8,r10,r8 ; r8 <- even bits show which pairs are in use - bt-- pf32Byteb,vr_st32 ; skip if 32-byte cacheline processor - - -; Save vectors on a 128-byte linesize processor. We save all or none of the 8 registers in each of -; the four cache lines. This minimizes mispredicted branches yet handles cache lines optimally. - - slwi r7,r8,2 ; shift groups-of-2 over by 2 - li r4,16 ; load offsets for X-form stores - or r8,r7,r8 ; show if any in group of 4 are in use - li r5,32 - slwi r7,r8,4 ; shift groups-of-4 over by 4 - li r6,48 - or r11,r7,r8 ; show if any in group of 8 are in use - li r7,64 - mtcrf 0x80,r11 ; set CRs one at a time (faster) - li r8,80 - mtcrf 0x20,r11 - li r9,96 - mtcrf 0x08,r11 - li r10,112 - mtcrf 0x02,r11 - - bf 0,vr_st64b ; skip if none of vr0-vr7 are in use - la r11,savevr0(r3) ; get address of this group of registers in save area - dcbz128 0,r11 ; zero the line - stvxl v0,0,r11 ; save 8 VRs in the line - stvxl v1,r4,r11 - stvxl v2,r5,r11 - stvxl v3,r6,r11 - stvxl v4,r7,r11 - stvxl v5,r8,r11 - stvxl v6,r9,r11 - stvxl v7,r10,r11 - -vr_st64b: - bf 8,vr_st64c ; skip if none of vr8-vr15 are in use - la r11,savevr8(r3) ; get address of this group of registers in save area - dcbz128 0,r11 ; zero the line - stvxl v8,0,r11 ; save 8 VRs in the line - stvxl v9,r4,r11 - stvxl v10,r5,r11 - stvxl v11,r6,r11 - stvxl v12,r7,r11 - stvxl v13,r8,r11 - stvxl v14,r9,r11 - stvxl v15,r10,r11 - -vr_st64c: - bf 16,vr_st64d ; skip if none of vr16-vr23 are in use - la r11,savevr16(r3) ; get address of this group of registers in save area - dcbz128 0,r11 ; zero the line - stvxl v16,0,r11 ; save 8 VRs in the line - stvxl v17,r4,r11 - stvxl v18,r5,r11 - stvxl v19,r6,r11 - stvxl v20,r7,r11 - stvxl v21,r8,r11 - stvxl v22,r9,r11 - stvxl v23,r10,r11 - -vr_st64d: 
- bflr 24 ; done if none of vr24-vr31 are in use - la r11,savevr24(r3) ; get address of this group of registers in save area - dcbz128 0,r11 ; zero the line - stvxl v24,0,r11 ; save 8 VRs in the line - stvxl v25,r4,r11 - stvxl v26,r5,r11 - stvxl v27,r6,r11 - stvxl v28,r7,r11 - stvxl v29,r8,r11 - stvxl v30,r9,r11 - stvxl v31,r10,r11 - blr - -; Save vectors on a 32-byte linesize processor. We save in 16 groups of 2: we either save both -; or neither in each group. This cuts down on conditional branches. -; r8 = bitmask with bit n set (for even n) if either of that pair of VRs is in use -; r3 = savearea - -vr_st32: - mtcrf 0xFF,r8 ; set CR bits so we can branch on them - li r4,16 ; load offset for X-form stores - - bf 0,vr_st32b ; skip if neither VR in this pair is in use - la r11,savevr0(r3) ; get address of this group of registers in save area - dcba 0,r11 ; establish the line wo reading it - stvxl v0,0,r11 ; save the two VRs in the line - stvxl v1,r4,r11 - -vr_st32b: - bf 2,vr_st32c ; skip if neither VR in this pair is in use - la r11,savevr2(r3) ; get address of this group of registers in save area - dcba 0,r11 ; establish the line wo reading it - stvxl v2,0,r11 ; save the two VRs in the line - stvxl v3,r4,r11 - -vr_st32c: - bf 4,vr_st32d ; skip if neither VR in this pair is in use - la r11,savevr4(r3) ; get address of this group of registers in save area - dcba 0,r11 ; establish the line wo reading it - stvxl v4,0,r11 ; save the two VRs in the line - stvxl v5,r4,r11 - -vr_st32d: - bf 6,vr_st32e ; skip if neither VR in this pair is in use - la r11,savevr6(r3) ; get address of this group of registers in save area - dcba 0,r11 ; establish the line wo reading it - stvxl v6,0,r11 ; save the two VRs in the line - stvxl v7,r4,r11 - -vr_st32e: - bf 8,vr_st32f ; skip if neither VR in this pair is in use - la r11,savevr8(r3) ; get address of this group of registers in save area - dcba 0,r11 ; establish the line wo reading it - stvxl v8,0,r11 ; save the two VRs in the line 
- stvxl v9,r4,r11 - -vr_st32f: - bf 10,vr_st32g ; skip if neither VR in this pair is in use - la r11,savevr10(r3) ; get address of this group of registers in save area - dcba 0,r11 ; establish the line wo reading it - stvxl v10,0,r11 ; save the two VRs in the line - stvxl v11,r4,r11 - -vr_st32g: - bf 12,vr_st32h ; skip if neither VR in this pair is in use - la r11,savevr12(r3) ; get address of this group of registers in save area - dcba 0,r11 ; establish the line wo reading it - stvxl v12,0,r11 ; save the two VRs in the line - stvxl v13,r4,r11 - -vr_st32h: - bf 14,vr_st32i ; skip if neither VR in this pair is in use - la r11,savevr14(r3) ; get address of this group of registers in save area - dcba 0,r11 ; establish the line wo reading it - stvxl v14,0,r11 ; save the two VRs in the line - stvxl v15,r4,r11 - -vr_st32i: - bf 16,vr_st32j ; skip if neither VR in this pair is in use - la r11,savevr16(r3) ; get address of this group of registers in save area - dcba 0,r11 ; establish the line wo reading it - stvxl v16,0,r11 ; save the two VRs in the line - stvxl v17,r4,r11 - -vr_st32j: - bf 18,vr_st32k ; skip if neither VR in this pair is in use - la r11,savevr18(r3) ; get address of this group of registers in save area - dcba 0,r11 ; establish the line wo reading it - stvxl v18,0,r11 ; save the two VRs in the line - stvxl v19,r4,r11 - -vr_st32k: - bf 20,vr_st32l ; skip if neither VR in this pair is in use - la r11,savevr20(r3) ; get address of this group of registers in save area - dcba 0,r11 ; establish the line wo reading it - stvxl v20,0,r11 ; save the two VRs in the line - stvxl v21,r4,r11 - -vr_st32l: - bf 22,vr_st32m ; skip if neither VR in this pair is in use - la r11,savevr22(r3) ; get address of this group of registers in save area - dcba 0,r11 ; establish the line wo reading it - stvxl v22,0,r11 ; save the two VRs in the line - stvxl v23,r4,r11 - -vr_st32m: - bf 24,vr_st32n ; skip if neither VR in this pair is in use - la r11,savevr24(r3) ; get address of this 
group of registers in save area - dcba 0,r11 ; establish the line wo reading it - stvxl v24,0,r11 ; save the two VRs in the line - stvxl v25,r4,r11 - -vr_st32n: - bf 26,vr_st32o ; skip if neither VR in this pair is in use - la r11,savevr26(r3) ; get address of this group of registers in save area - dcba 0,r11 ; establish the line wo reading it - stvxl v26,0,r11 ; save the two VRs in the line - stvxl v27,r4,r11 - -vr_st32o: - bf 28,vr_st32p ; skip if neither VR in this pair is in use - la r11,savevr28(r3) ; get address of this group of registers in save area - dcba 0,r11 ; establish the line wo reading it - stvxl v28,0,r11 ; save the two VRs in the line - stvxl v29,r4,r11 - -vr_st32p: - bflr 30 ; done if neither VR in this pair is in use - la r11,savevr30(r3) ; get address of this group of registers in save area - dcba 0,r11 ; establish the line wo reading it - stvxl v30,0,r11 ; save the two VRs in the line - stvxl v31,r4,r11 - blr - - -// ***************** -// * v r _ l o a d * -// ***************** -// -// Load live VRs from a savearea, according to bits set in a passed vector. This is the reverse -// of "vr_store". Like it, we avoid touching unnecessary cache blocks and minimize conditional -// branches by loading all VRs from a cache line, if we have to load any. If we don't load the VRs -// in a cache line, we bug them. Note that this behavior is slightly different from earlier kernels, -// which would bug all VRs that aren't live. -// -// When called: -// interrupts are off, vectors are enabled -// r3 = ptr to save area -// r10 = vector of live regs to load (ie, savevrsave & savevrvalid, may be 0) -// v31 = bugbug constant (0x7FFFDEAD7FFFDEAD7FFFDEAD7FFFDEAD) -// -// We destroy: -// r4 - r11, all CRs. 
- -vr_load: - mfsprg r9,2 ; get feature flags - li r6,1 ; assuming 32-byte, get (#VRs)-1 in a cacheline - mtcrf 0x02,r9 ; set cache line size bits in cr6 - lis r7,0xC000 ; assuming 32-byte, set bits 0-1 - bt-- pf32Byteb,vr_ld0 ; skip if 32-bit processor - li r6,7 ; 128-byte machines have 8 VRs in a cacheline - lis r7,0xFF00 ; so set bits 0-7 - -// Loop touching in cache blocks we will load from. -// r3 = savearea ptr -// r5 = we light bits for the VRs we will be loading -// r6 = 1 if 32-byte, 7 if 128-byte -// r7 = 0xC0000000 if 32-byte, 0xFF000000 if 128-byte -// r10 = live VR bits -// v31 = bugbug constant - -vr_ld0: - li r5,0 ; initialize set of VRs to load - la r11,savevr0(r3) ; get address of register file - b vr_ld2 ; enter loop in middle - - .align 5 -vr_ld1: ; loop over each cache line we will load - dcbt r4,r11 ; start prefetch of the line - andc r10,r10,r9 ; turn off the bits in this line - or r5,r5,r9 ; we will load all these -vr_ld2: ; initial entry pt - cntlzw r4,r10 ; get offset to next live VR - andc r4,r4,r6 ; cacheline align it - srw. r9,r7,r4 ; position bits for VRs in that cache line - slwi r4,r4,4 ; get byte offset within register file to that line - bne vr_ld1 ; loop if more bits in r10 - - bf-- pf128Byteb,vr_ld32 ; skip if not 128-byte lines - -// Handle a processor with 128-byte cache lines. Four groups of 8 VRs. 
-// r3 = savearea ptr -// r5 = 1st bit in each cacheline is 1 iff any reg in that line must be loaded -// r11 = addr(savevr0) -// v31 = bugbug constant - - mtcrf 0x80,r5 ; set up bits for conditional branches - li r4,16 ; load offsets for X-form stores - li r6,48 - mtcrf 0x20,r5 ; load CRs ona at a time, which is faster - li r7,64 - li r8,80 - mtcrf 0x08,r5 - li r9,96 - li r10,112 - mtcrf 0x02,r5 - li r5,32 - - bt 0,vr_ld128a ; skip if this line must be loaded - vor v0,v31,v31 ; no VR must be loaded, so bug them all - vor v1,v31,v31 - vor v2,v31,v31 - vor v3,v31,v31 - vor v4,v31,v31 - vor v5,v31,v31 - vor v6,v31,v31 - vor v7,v31,v31 - b vr_ld128b -vr_ld128a: ; must load from this line - lvxl v0,0,r11 - lvxl v1,r4,r11 - lvxl v2,r5,r11 - lvxl v3,r6,r11 - lvxl v4,r7,r11 - lvxl v5,r8,r11 - lvxl v6,r9,r11 - lvxl v7,r10,r11 - -vr_ld128b: ; here to handle next cache line - la r11,savevr8(r3) ; load offset to it - bt 8,vr_ld128c ; skip if this line must be loaded - vor v8,v31,v31 ; no VR must be loaded, so bug them all - vor v9,v31,v31 - vor v10,v31,v31 - vor v11,v31,v31 - vor v12,v31,v31 - vor v13,v31,v31 - vor v14,v31,v31 - vor v15,v31,v31 - b vr_ld128d -vr_ld128c: ; must load from this line - lvxl v8,0,r11 - lvxl v9,r4,r11 - lvxl v10,r5,r11 - lvxl v11,r6,r11 - lvxl v12,r7,r11 - lvxl v13,r8,r11 - lvxl v14,r9,r11 - lvxl v15,r10,r11 - -vr_ld128d: ; here to handle next cache line - la r11,savevr16(r3) ; load offset to it - bt 16,vr_ld128e ; skip if this line must be loaded - vor v16,v31,v31 ; no VR must be loaded, so bug them all - vor v17,v31,v31 - vor v18,v31,v31 - vor v19,v31,v31 - vor v20,v31,v31 - vor v21,v31,v31 - vor v22,v31,v31 - vor v23,v31,v31 - b vr_ld128f -vr_ld128e: ; must load from this line - lvxl v16,0,r11 - lvxl v17,r4,r11 - lvxl v18,r5,r11 - lvxl v19,r6,r11 - lvxl v20,r7,r11 - lvxl v21,r8,r11 - lvxl v22,r9,r11 - lvxl v23,r10,r11 - -vr_ld128f: ; here to handle next cache line - la r11,savevr24(r3) ; load offset to it - bt 24,vr_ld128g ; skip if this line 
must be loaded - vor v24,v31,v31 ; no VR must be loaded, so bug them all - vor v25,v31,v31 - vor v26,v31,v31 - vor v27,v31,v31 - vor v28,v31,v31 - vor v29,v31,v31 - vor v30,v31,v31 - blr -vr_ld128g: ; must load from this line - lvxl v24,0,r11 - lvxl v25,r4,r11 - lvxl v26,r5,r11 - lvxl v27,r6,r11 - lvxl v28,r7,r11 - lvxl v29,r8,r11 - lvxl v30,r9,r11 - lvxl v31,r10,r11 - blr - -// Handle a processor with 32-byte cache lines. Sixteen groups of two VRs. -// r5 = 1st bit in each cacheline is 1 iff any reg in that line must be loaded -// r11 = addr(savevr0) - -vr_ld32: - mtcrf 0xFF,r5 ; set up bits for conditional branches - li r4,16 ; load offset for X-form stores - - bt 0,vr_ld32load0 ; skip if we must load this line - vor v0,v31,v31 ; neither VR is live, so bug them both - vor v1,v31,v31 - b vr_ld32test2 -vr_ld32load0: ; must load VRs in this line - lvxl v0,0,r11 - lvxl v1,r4,r11 - -vr_ld32test2: ; here to handle next cache line - la r11,savevr2(r3) ; get offset to next cache line - bt 2,vr_ld32load2 ; skip if we must load this line - vor v2,v31,v31 ; neither VR is live, so bug them both - vor v3,v31,v31 - b vr_ld32test4 -vr_ld32load2: ; must load VRs in this line - lvxl v2,0,r11 - lvxl v3,r4,r11 - -vr_ld32test4: ; here to handle next cache line - la r11,savevr4(r3) ; get offset to next cache line - bt 4,vr_ld32load4 ; skip if we must load this line - vor v4,v31,v31 ; neither VR is live, so bug them both - vor v5,v31,v31 - b vr_ld32test6 -vr_ld32load4: ; must load VRs in this line - lvxl v4,0,r11 - lvxl v5,r4,r11 - -vr_ld32test6: ; here to handle next cache line - la r11,savevr6(r3) ; get offset to next cache line - bt 6,vr_ld32load6 ; skip if we must load this line - vor v6,v31,v31 ; neither VR is live, so bug them both - vor v7,v31,v31 - b vr_ld32test8 -vr_ld32load6: ; must load VRs in this line - lvxl v6,0,r11 - lvxl v7,r4,r11 - -vr_ld32test8: ; here to handle next cache line - la r11,savevr8(r3) ; get offset to next cache line - bt 8,vr_ld32load8 ; skip if we must 
load this line - vor v8,v31,v31 ; neither VR is live, so bug them both - vor v9,v31,v31 - b vr_ld32test10 -vr_ld32load8: ; must load VRs in this line - lvxl v8,0,r11 - lvxl v9,r4,r11 - -vr_ld32test10: ; here to handle next cache line - la r11,savevr10(r3) ; get offset to next cache line - bt 10,vr_ld32load10 ; skip if we must load this line - vor v10,v31,v31 ; neither VR is live, so bug them both - vor v11,v31,v31 - b vr_ld32test12 -vr_ld32load10: ; must load VRs in this line - lvxl v10,0,r11 - lvxl v11,r4,r11 - -vr_ld32test12: ; here to handle next cache line - la r11,savevr12(r3) ; get offset to next cache line - bt 12,vr_ld32load12 ; skip if we must load this line - vor v12,v31,v31 ; neither VR is live, so bug them both - vor v13,v31,v31 - b vr_ld32test14 -vr_ld32load12: ; must load VRs in this line - lvxl v12,0,r11 - lvxl v13,r4,r11 - -vr_ld32test14: ; here to handle next cache line - la r11,savevr14(r3) ; get offset to next cache line - bt 14,vr_ld32load14 ; skip if we must load this line - vor v14,v31,v31 ; neither VR is live, so bug them both - vor v15,v31,v31 - b vr_ld32test16 -vr_ld32load14: ; must load VRs in this line - lvxl v14,0,r11 - lvxl v15,r4,r11 - -vr_ld32test16: ; here to handle next cache line - la r11,savevr16(r3) ; get offset to next cache line - bt 16,vr_ld32load16 ; skip if we must load this line - vor v16,v31,v31 ; neither VR is live, so bug them both - vor v17,v31,v31 - b vr_ld32test18 -vr_ld32load16: ; must load VRs in this line - lvxl v16,0,r11 - lvxl v17,r4,r11 - -vr_ld32test18: ; here to handle next cache line - la r11,savevr18(r3) ; get offset to next cache line - bt 18,vr_ld32load18 ; skip if we must load this line - vor v18,v31,v31 ; neither VR is live, so bug them both - vor v19,v31,v31 - b vr_ld32test20 -vr_ld32load18: ; must load VRs in this line - lvxl v18,0,r11 - lvxl v19,r4,r11 - -vr_ld32test20: ; here to handle next cache line - la r11,savevr20(r3) ; get offset to next cache line - bt 20,vr_ld32load20 ; skip if we must load 
this line - vor v20,v31,v31 ; neither VR is live, so bug them both - vor v21,v31,v31 - b vr_ld32test22 -vr_ld32load20: ; must load VRs in this line - lvxl v20,0,r11 - lvxl v21,r4,r11 - -vr_ld32test22: ; here to handle next cache line - la r11,savevr22(r3) ; get offset to next cache line - bt 22,vr_ld32load22 ; skip if we must load this line - vor v22,v31,v31 ; neither VR is live, so bug them both - vor v23,v31,v31 - b vr_ld32test24 -vr_ld32load22: ; must load VRs in this line - lvxl v22,0,r11 - lvxl v23,r4,r11 - -vr_ld32test24: ; here to handle next cache line - la r11,savevr24(r3) ; get offset to next cache line - bt 24,vr_ld32load24 ; skip if we must load this line - vor v24,v31,v31 ; neither VR is live, so bug them both - vor v25,v31,v31 - b vr_ld32test26 -vr_ld32load24: ; must load VRs in this line - lvxl v24,0,r11 - lvxl v25,r4,r11 - -vr_ld32test26: ; here to handle next cache line - la r11,savevr26(r3) ; get offset to next cache line - bt 26,vr_ld32load26 ; skip if we must load this line - vor v26,v31,v31 ; neither VR is live, so bug them both - vor v27,v31,v31 - b vr_ld32test28 -vr_ld32load26: ; must load VRs in this line - lvxl v26,0,r11 - lvxl v27,r4,r11 - -vr_ld32test28: ; here to handle next cache line - la r11,savevr28(r3) ; get offset to next cache line - bt 28,vr_ld32load28 ; skip if we must load this line - vor v28,v31,v31 ; neither VR is live, so bug them both - vor v29,v31,v31 - b vr_ld32test30 -vr_ld32load28: ; must load VRs in this line - lvxl v28,0,r11 - lvxl v29,r4,r11 - -vr_ld32test30: ; here to handle next cache line - la r11,savevr30(r3) ; get offset to next cache line - bt 30,vr_ld32load30 ; skip if we must load this line - vor v30,v31,v31 ; neither VR is live, so bug them both - blr -vr_ld32load30: ; must load VRs in this line - lvxl v30,0,r11 - lvxl v31,r4,r11 - blr diff --git a/osfmk/ppc/db_asm.s b/osfmk/ppc/db_asm.s deleted file mode 100644 index 626fa1822..000000000 --- a/osfmk/ppc/db_asm.s +++ /dev/null @@ -1,107 +0,0 @@ -/* - * 
Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -#include -#include -#include -#include -#include - - -/* void - * db_phys_cmp(src_a, src_b, bytecount) - * vm_offset_t src_a; - * vm_offset_t src_b; - * int bytecount - * - * This routine will compare bytecount bytes from physical address src_a and physical - * address src_b. 
- */ - -#warning THIS IS BROKEN FOR 64-BIT - - /* Switch off data translations */ - lis r7,hi16(MASK(MSR_VEC)) - ori r7,r7,lo16(MASK(MSR_FP)) - mfmsr r6 - andc r6,r6,r7 ; Force FP and vec off - ori r7,r7,lo16(MASK(MSR_DR)) ; Set the DR bit - andc r7,r6,r7 ; Force DR off - mtmsr r7 - isync /* Ensure data translations are off */ - - subi r3, r3, 4 - subi r4, r4, 4 - - cmpwi r5, 3 - ble- .L_db_phys_cmp_bytes -.L_db_phys_cmp_loop: - lwz r0, 4(r3) - lwz r7, 4(r4) - addi r3, r3, 4 - addi r4, r4, 4 - subi r5, r5, 4 - cmpw r0, r7 - bne .L_db_phys_cmp_false - cmpwi r5, 3 - bgt+ .L_db_phys_cmp_loop - - /* If no leftover bytes, we're done now */ - cmpwi r5, 0 - beq+ .L_db_phys_cmp_true - -.L_db_phys_cmp_bytes: - addi r3, r3, 3 - addi r4, r4, 3 -.L_db_phys_cmp_byte_loop: - lbz r0, 1(r3) - lbz r7, 1(r4) - addi r3, r3, 1 - addi r4, r4, 1 - subi r5, r5, 1 - cmpw r0, r7 - bne .L_db_phys_cmp_false - cmpwi r5, 0 - bne+ .L_db_phys_cmp_loop - -.L_db_phys_cmp_true: - li r3, 1 - b .L_db_phys_cmp_done - -.L_db_phys_cmp_false: - li r3, 0 - -.L_db_phys_cmp_done: - mtmsr r6 /* Restore original translations */ - isync /* Ensure data translations are off */ - - blr - diff --git a/osfmk/ppc/db_disasm.c b/osfmk/ppc/db_disasm.c deleted file mode 100644 index 6410471ef..000000000 --- a/osfmk/ppc/db_disasm.c +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -/* - * Instruction disassembler. - */ - -#include -#include - -#include -#include -#include - -#include -#include - -#include "ppc_disasm.h" - -db_addr_t db_disasm_pc, db_disasm_symaddr; -boolean_t db_disasm_print_symaddr; - -/* - * Disassemble instruction at 'loc'. 'altfmt' specifies an - * (optional) alternate format. Return address of start of - * next instruction. 
- */ -db_addr_t -db_disasm(db_addr_t loc, __unused boolean_t altfmt, task_t task) -{ - int inst; - char *p; - - inst = db_get_task_value(loc, 4, FALSE, task); - db_disasm_pc = loc; - db_disasm_print_symaddr = FALSE; - p = in(inst); - db_printf("%s", p); - if (db_disasm_print_symaddr) { - db_printf(" <"); - db_task_printsym(db_disasm_symaddr, DB_STGY_ANY, task); - db_printf(">"); - } - db_printf("\n"); /* Make sure we have a new line for multiline displays */ - dis_done(); - return (loc+4); -} - -/* - * Given four bytes of instruction (stored as an int, not an - * array of characters), compute if the instruction reads - * memory. - */ -int -db_inst_load(__unused unsigned long insw) -{ -#if 1 - db_printf("db_inst_load: coming soon in a debugger near you!\n"); - return 0; -#else - unsigned char insb, bits; - - insb = insw & 0xff; - insw >>= 8; - bits = db_ldstrtab[insb]; - if (!(bits & DBLS_LOAD)) - return (0); - while (1) { - switch (bits & DBLS_MODS) { - case 0: - return (1); - case DBLS_MODRM: - insb = insw & 0xff; - return ((insb & 0xc0) != 0xc0); - case DBLS_SECOND|DBLS_MODRM: - insb = insw & 0xff; - return ((insb & 0xc0) != 0xc0 ? 2 : 0); - case DBLS_SECOND: - return (2); - case DBLS_ESCAPE: - insb = insw & 0xff; - insw >>= 8; - bits = db_ldstrtab0f[insb]; - break; - case DBLS_SWREG: - return (db_inst_swreg(TRUE, insw, insb)); - default: - panic ("db_inst_load: unknown mod bits"); - } - } -#endif -} - -/* - * Given four bytes of instruction (stored as an int, not an - * array of characters), compute if the instruction writes - * memory. 
- */ -int -db_inst_store(__unused unsigned long insw) -{ -#if 1 - db_printf("db_inst_store: coming soon in a debugger near you!\n"); - return 0; -#else - unsigned char insb, bits; - - insb = insw & 0xff; - insw >>= 8; - bits = db_ldstrtab[insb]; - if (!(bits & DBLS_STORE)) - return (0); - while (1) { - switch (bits & DBLS_MODS) { - case 0: - return (1); - case DBLS_MODRM: - insb = insw & 0xff; - return ((insb & 0xc0) != 0xc0); - case DBLS_SECOND|DBLS_MODRM: - insb = insw & 0xff; - return ((insb & 0xc0) != 0xc0 ? 2 : 0); - case DBLS_SECOND: - return (2); - case DBLS_ESCAPE: - insb = insw & 0xff; - insw >>= 8; - bits = db_ldstrtab0f[insb]; - break; - case DBLS_SWREG: - return (db_inst_swreg(FALSE, insw, insb)); - default: - panic ("db_inst_store: unknown mod bits"); - } - } -#endif -} - -/* - * Extra routines for the automatically generated disassembler - */ -char * -hex( - bits n) -{ - char *p; - - if (n < 10) - return dec(n); - p = dis_alloc(11); - sprintf(p, "0x%lx", n); - return p; -} - -char * -dec( - bits n) -{ - char *p = dis_alloc(11); - sprintf(p, "%lu", n); - return p; -} - -char * -brdispl( - bits displ, - bits nbits) -{ - int sign, extended; - - sign = 1 << (nbits - 1); - extended = (displ & sign ? displ - (sign << 1) : displ); - db_disasm_symaddr = db_disasm_pc + (extended << 2); - db_disasm_print_symaddr = TRUE; - return hex(extended << 2); -} - -char * -mbz(bits n) -{ - return n ? 
"[reserved bits not zero]" : ""; -} - -size_t db_disasm_string_size = 0; -#define DB_DISASM_STRING_MAXSIZE 4096 -char db_disasm_string[DB_DISASM_STRING_MAXSIZE]; - -void *db_disasm_malloc(size_t size); /* forward */ -void * -db_disasm_malloc( - size_t size) -{ - void * new_buf; - - if (db_disasm_string_size + size <= DB_DISASM_STRING_MAXSIZE) { - new_buf = (void *) (db_disasm_string + db_disasm_string_size); - db_disasm_string_size += size; - return new_buf; - } - db_printf("db_disasm_malloc(size=%d) failed: %d left !\n", - size, - DB_DISASM_STRING_MAXSIZE - db_disasm_string_size); - return (void *) 0; -} diff --git a/osfmk/ppc/db_interface.c b/osfmk/ppc/db_interface.c deleted file mode 100644 index 3109d1b5e..000000000 --- a/osfmk/ppc/db_interface.c +++ /dev/null @@ -1,592 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* for switch_to_serial_console */ - -#include -#include -#include -#include -#include -#include -#include /* for halt_all_cpus() */ -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct savearea *ppc_last_saved_statep; -struct savearea ppc_nested_saved_state; -unsigned ppc_last_kdb_sp; -db_regs_t ddb_regs; /* register state */ - -extern int debugger_cpu; /* Current cpu running debugger */ - -int db_all_set_up = 0; - - -#if !MACH_KDP -void kdp_register_send_receive(void); -#endif - -/* - * Enter KDB through a keyboard trap. - * We show the registers as of the keyboard interrupt - * instead of those at its call to KDB. - */ -struct int_regs { - /* XXX more registers ? */ - struct ppc_interrupt_state *is; -}; - -extern int TRAP_TYPES; - -/* - * Code used to synchronize kdb among all cpus, one active at a time, switch - * from on to another using kdb_on! 
#cpu or cpu #cpu - */ - -decl_simple_lock_data(, kdb_lock) /* kdb lock */ - -#define db_simple_lock_init(l, e) hw_lock_init(&((l)->interlock)) -#define db_simple_lock_try(l) hw_lock_try(&((l)->interlock)) -#define db_simple_unlock(l) hw_lock_unlock(&((l)->interlock)) - -extern volatile unsigned int cpus_holding_bkpts; /* counter for number of cpus holding - breakpoints (ie: cpus that did not - insert back breakpoints) */ -extern boolean_t db_breakpoints_inserted; - -/* Forward */ - -extern void kdbprinttrap( - int type, - int code, - int *pc, - int sp); -extern void db_write_bytes_user_space( - vm_offset_t addr, - int size, - char *data, - task_t task); -extern int db_search_null( - task_t task, - unsigned *svaddr, - unsigned evaddr, - unsigned *skaddr, - int flag); -extern int kdb_enter(int); -extern void kdb_leave(void); -extern void lock_kdb(void); -extern void unlock_kdb(void); - -#if DB_MACHINE_COMMANDS -struct db_command ppc_db_commands[] = { - { "lt", db_low_trace, CS_MORE|CS_SET_DOT, 0 }, - { (char *)0, 0, 0, 0 } -}; -#endif /* DB_MACHINE_COMMANDS */ - -#if !MACH_KDP -void kdp_register_send_receive(void) {} -#endif - -extern jmp_buf_t *db_recover; - -/* - * kdb_trap - field a TRACE or BPT trap - */ -void -kdb_trap( - int type, - struct savearea *regs) -{ - boolean_t trap_from_user; - int previous_console_device; - int code=0; - - previous_console_device=switch_to_serial_console(); - - switch (type) { - case T_TRACE: /* single_step */ - case T_PROGRAM: /* breakpoint */ -#if 0 - case T_WATCHPOINT: /* watchpoint */ -#endif - case -1: /* keyboard interrupt */ - break; - - default: - if (db_recover) { - ppc_nested_saved_state = *regs; - db_printf("Caught "); - if (type > TRAP_TYPES) - db_printf("type %d", type); - else - db_printf("%s", trap_type[type]); - db_printf(" trap, pc = %llx\n", - regs->save_srr0); - db_error(""); - /*NOTREACHED*/ - } - kdbprinttrap(type, code, (int *)®s->save_srr0, regs->save_r1); - } - - getPerProc()->db_saved_state = regs; - - 
ppc_last_saved_statep = regs; - ppc_last_kdb_sp = (unsigned) &type; - - if (!IS_USER_TRAP(regs)) { - bzero((char *)&ddb_regs, sizeof (ddb_regs)); - ddb_regs = *regs; - trap_from_user = FALSE; - - } - else { - ddb_regs = *regs; - trap_from_user = TRUE; - } - - db_task_trap(type, code, trap_from_user); - - *regs = ddb_regs; - - if ((type == T_PROGRAM) && - (db_get_task_value(regs->save_srr0, - BKPT_SIZE, - FALSE, - db_target_space(current_thread(), - trap_from_user)) - == BKPT_INST)) - regs->save_srr0 += BKPT_SIZE; - - getPerProc()->db_saved_state = 0; - switch_to_old_console(previous_console_device); - -} - - -/* - * Print trap reason. - */ - -void -kdbprinttrap( - int type, - int code, - int *pc, - int sp) -{ - printf("kernel: "); - if (type > TRAP_TYPES) - db_printf("type %d", type); - else - db_printf("%s", trap_type[type]); - db_printf(" trap, code=%x pc@%x = %x sp=%x\n", - code, pc, *(int *)pc, sp); - db_run_mode = STEP_CONTINUE; -} - -/* - * - */ -static addr64_t -db_vtophys(pmap_t pmap, vm_offset_t va) -{ - ppnum_t pp; - addr64_t pa; - - pp = pmap_find_phys(pmap, (addr64_t)va); - - if (pp == 0) return(0); /* Couldn't find it */ - - pa = ((addr64_t)pp << 12) | (addr64_t)(va & 0xFFF); /* Get physical address */ - - return(pa); -} - -/* - * Read bytes from task address space for debugger. 
- */ -void -db_read_bytes( - vm_offset_t addr, - int size, - char *data, - task_t task) -{ - int n,max; - addr64_t phys_dst; - addr64_t phys_src; - pmap_t pmap; - - while (size > 0) { - if (task != NULL) - pmap = task->map->pmap; - else - pmap = kernel_pmap; - - phys_src = db_vtophys(pmap, (vm_offset_t)addr); - if (phys_src == 0) { - db_printf("\nno memory is assigned to src address %08x\n", - addr); - db_error(0); - /* NOTREACHED */ - } - - phys_dst = db_vtophys(kernel_pmap, (vm_offset_t)data); - if (phys_dst == 0) { - db_printf("\nno memory is assigned to dst address %08x\n", - data); - db_error(0); - /* NOTREACHED */ - } - - /* don't over-run any page boundaries - check src range */ - max = round_page_64(phys_src + 1) - phys_src; - if (max > size) - max = size; - /* Check destination won't run over boundary either */ - n = round_page_64(phys_dst + 1) - phys_dst; - - if (n < max) max = n; - size -= max; - addr += max; - phys_copy(phys_src, phys_dst, max); - - /* resync I+D caches */ - sync_cache64(phys_dst, max); - - phys_src += max; - phys_dst += max; - } -} - -/* - * Write bytes to task address space for debugger. 
- */ -void -db_write_bytes( - vm_offset_t addr, - int size, - char *data, - task_t task) -{ - int n,max; - addr64_t phys_dst; - addr64_t phys_src; - pmap_t pmap; - - while (size > 0) { - - phys_src = db_vtophys(kernel_pmap, (vm_offset_t)data); - if (phys_src == 0) { - db_printf("\nno memory is assigned to src address %08x\n", - data); - db_error(0); - /* NOTREACHED */ - } - - /* space stays as kernel space unless in another task */ - if (task == NULL) pmap = kernel_pmap; - else pmap = task->map->pmap; - - phys_dst = db_vtophys(pmap, (vm_offset_t)addr); - if (phys_dst == 0) { - db_printf("\nno memory is assigned to dst address %08x\n", - addr); - db_error(0); - /* NOTREACHED */ - } - - /* don't over-run any page boundaries - check src range */ - max = round_page_64(phys_src + 1) - phys_src; - if (max > size) - max = size; - /* Check destination won't run over boundary either */ - n = round_page_64(phys_dst + 1) - phys_dst; - if (n < max) - max = n; - size -= max; - addr += max; - phys_copy(phys_src, phys_dst, max); - - /* resync I+D caches */ - sync_cache64(phys_dst, max); - - phys_src += max; - phys_dst += max; - } -} - -boolean_t -db_check_access( - vm_offset_t addr, - int size, - task_t task) -{ - register int n; - - if (task == kernel_task || task == TASK_NULL) { - if (kernel_task == TASK_NULL) return(TRUE); - task = kernel_task; - } else if (task == TASK_NULL) { - if (current_thread() == THR_ACT_NULL) return(FALSE); - task = current_thread()->task; - } - - while (size > 0) { - if(!pmap_find_phys(task->map->pmap, (addr64_t)addr)) return (FALSE); /* Fail if page not mapped */ - n = trunc_page_32(addr+PPC_PGBYTES) - addr; - if (n > size) - n = size; - size -= n; - addr += n; - } - return(TRUE); -} - -boolean_t -db_phys_eq( - task_t task1, - vm_offset_t addr1, - task_t task2, - vm_offset_t addr2) -{ - addr64_t physa, physb; - - if ((addr1 & (PPC_PGBYTES-1)) != (addr2 & (PPC_PGBYTES-1))) /* Is byte displacement the same? 
*/ - return FALSE; - - if (task1 == TASK_NULL) { /* See if there is a task active */ - if (current_thread() == THR_ACT_NULL) /* See if there is a current task */ - return FALSE; - task1 = current_thread()->task; /* If so, use that one */ - } - - if(!(physa = db_vtophys(task1->map->pmap, (vm_offset_t)trunc_page_32(addr1)))) return FALSE; /* Get real address of the first */ - if(!(physb = db_vtophys(task2->map->pmap, (vm_offset_t)trunc_page_32(addr2)))) return FALSE; /* Get real address of the second */ - - return (physa == physb); /* Check if they are equal, then return... */ -} - -#define DB_USER_STACK_ADDR (0xc0000000) -#define DB_NAME_SEARCH_LIMIT (DB_USER_STACK_ADDR-(PPC_PGBYTES*3)) - -boolean_t -db_phys_cmp(__unused vm_offset_t a1, __unused vm_offset_t a2, - __unused vm_size_t s1) -{ - db_printf("db_phys_cmp: not implemented\n"); - return 0; -} - - -int -db_search_null(__unused task_t task, __unused unsigned *svaddr, - __unused unsigned evaddr, __unused unsigned *skaddr, - __unused int flag) -{ - db_printf("db_search_null: not implemented\n"); - return(-1); -} - -struct proc; -unsigned char *getProcName(struct proc *proc); - -void -db_task_name( - task_t task) -{ - register unsigned char *p; - unsigned char tname[33]; - int i; - - p = 0; - tname[0] = 0; - - if(task->bsd_info) p = getProcName((struct proc *)(task->bsd_info)); /* Point to task name */ - - if(p) { - for(i = 0; i < 32; i++) { /* Move no more than 32 bytes */ - tname[i] = p[i]; - if(p[i] == 0) break; - } - tname[i] = 0; - db_printf("%s", tname); - } - else db_printf("no name"); -} - -extern int kdb_flag; -void -db_machdep_init(void) -{ -#define KDB_READY 0x1 - kdb_flag |= KDB_READY; -} - - -#ifdef __STDC__ -//#define KDB_SAVE(type, name) extern type name; type name##_save = name -#define KDB_SAVE(type, name) type name##_save = name -#define KDB_RESTORE(name) name = name##_save -#else /* __STDC__ */ -#define KDB_SAVE(type, name) type name/**/_save = name -//#define KDB_SAVE(type, name) extern type 
name; type name/**/_save = name -#define KDB_RESTORE(name) name = name/**/_save -#endif /* __STDC__ */ - -#define KDB_SAVE_CTXT() \ - KDB_SAVE(int, db_run_mode); \ - KDB_SAVE(boolean_t, db_sstep_print); \ - KDB_SAVE(int, db_loop_count); \ - KDB_SAVE(int, db_call_depth); \ - KDB_SAVE(int, db_inst_count); \ - KDB_SAVE(int, db_last_inst_count); \ - KDB_SAVE(int, db_load_count); \ - KDB_SAVE(int, db_store_count); \ - KDB_SAVE(boolean_t, db_cmd_loop_done); \ - KDB_SAVE(jmp_buf_t *, db_recover); \ - KDB_SAVE(db_addr_t, db_dot); \ - KDB_SAVE(db_addr_t, db_last_addr); \ - KDB_SAVE(db_addr_t, db_prev); \ - KDB_SAVE(db_addr_t, db_next); \ - KDB_SAVE(db_regs_t, ddb_regs); - -#define KDB_RESTORE_CTXT() \ - KDB_RESTORE(db_run_mode); \ - KDB_RESTORE(db_sstep_print); \ - KDB_RESTORE(db_loop_count); \ - KDB_RESTORE(db_call_depth); \ - KDB_RESTORE(db_inst_count); \ - KDB_RESTORE(db_last_inst_count); \ - KDB_RESTORE(db_load_count); \ - KDB_RESTORE(db_store_count); \ - KDB_RESTORE(db_cmd_loop_done); \ - KDB_RESTORE(db_recover); \ - KDB_RESTORE(db_dot); \ - KDB_RESTORE(db_last_addr); \ - KDB_RESTORE(db_prev); \ - KDB_RESTORE(db_next); \ - KDB_RESTORE(ddb_regs); - -extern boolean_t db_sstep_print; -extern int db_loop_count; -extern int db_call_depth; -extern int db_inst_count; -extern int db_last_inst_count; -extern int db_load_count; -extern int db_store_count; -extern boolean_t db_cmd_loop_done; -extern void unlock_debugger(void); -extern void lock_debugger(void); -/* - * switch to another cpu - */ -void -kdb_on( - int cpu) -{ - KDB_SAVE_CTXT(); - if (cpu < 0 || cpu >= (int)real_ncpus || !PerProcTable[cpu].ppe_vaddr->debugger_active) - return; - db_set_breakpoints(); - db_set_watchpoints(); - debugger_cpu = cpu; - unlock_debugger(); - lock_debugger(); - db_clear_breakpoints(); - db_clear_watchpoints(); - KDB_RESTORE_CTXT(); - if (debugger_cpu == -1) {/* someone continued */ - debugger_cpu = cpu_number(); - db_continue_cmd(0, 0, 0, NULL); - } -} - -/* - * system reboot - */ - -void 
-db_reboot(__unused db_expr_t addr, __unused boolean_t have_addr, - __unused db_expr_t count, char *modif) -{ - boolean_t reboot = TRUE; - char *cp, c; - - cp = modif; - while ((c = *cp++) != 0) { - if (c == 'r') /* reboot */ - reboot = TRUE; - if (c == 'h') /* halt */ - reboot = FALSE; - } - if(!reboot) halt_all_cpus(FALSE); /* If no reboot, try to be clean about it */ - - if (PE_halt_restart) - (*PE_halt_restart)(kPERestartCPU); - db_printf("Sorry, system can't reboot automatically yet... You need to do it by hand...\n"); - -} diff --git a/osfmk/ppc/db_low_trace.c b/osfmk/ppc/db_low_trace.c deleted file mode 100644 index e081b2643..000000000 --- a/osfmk/ppc/db_low_trace.c +++ /dev/null @@ -1,1106 +0,0 @@ -/* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_FREE_COPYRIGHT@ - */ -/* - * @APPLE_FREE_COPYRIGHT@ - */ - -/* - * Author: Bill Angell, Apple - * Date: 6/97 - * - * exceptions and certain C functions write into a trace table which - * can be examined via the machine 'lt' command under kdb - */ - - -#include /* For strcpy() */ -#include -#include - -#include -#include -#include -#include -#include -#include -#include /* For db_option() */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -void db_dumppca(unsigned int ptegindex); -void db_dumpmapping(struct mapping *mp); /* Dump out a mapping */ -extern kmod_info_t *kmod; /* Find the kmods */ - -db_addr_t db_low_trace_prev = 0; - -/* - * Print out the low level trace table: - * - * Displays the entry and 15 before it in newest to oldest order - * - * lt [entaddr] - - * If entaddr is omitted, it starts with the most current - * If entaddr = 0, it starts with the most current and does the whole table - */ -void -db_low_trace(db_expr_t addr, boolean_t have_addr, db_expr_t count, char *modif) -{ - int c, i; - unsigned int tempx, cnt; - unsigned int xTraceCurr, xTraceStart, xTraceEnd, cxltr; - db_addr_t next_addr; - LowTraceRecord xltr; - unsigned char cmark; - addr64_t xxltr; - - cnt = 16; /* Default to 16 entries */ - - xTraceCurr = trcWork.traceCurr; /* Transfer current pointer */ - xTraceStart = trcWork.traceStart; /* Transfer start of table */ - xTraceEnd = trcWork.traceEnd; /* Transfer end of table */ - - if(addr == -1) cnt = 0x7FFFFFFF; /* Max the count */ - - if(!addr || (addr == -1)) { - addr=xTraceCurr-sizeof(LowTraceRecord); /* Start at the newest */ - if((unsigned int)addr=xTraceEnd) { /* In the table? 
*/ - db_printf("address not in low memory trace table\n"); /* Tell the fool */ - return; /* Leave... */ - } - - if((unsigned int)addr&0x0000007F) { /* Proper alignment? */ - db_printf("address not aligned on trace entry boundary (0x80)\n"); /* Tell 'em */ - return; /* Leave... */ - } - - xxltr = addr; /* Set the start */ - cxltr = ((xTraceCurr == xTraceStart ? xTraceEnd : xTraceCurr) - sizeof(LowTraceRecord)); /* Get address of newest entry */ - - db_low_trace_prev = addr; /* Starting point */ - - for(i=0; i < cnt; i++) { /* Dump the 16 (or all) entries */ - - ReadReal((addr64_t)xxltr, (unsigned int *)&xltr); /* Get the first half */ - ReadReal((addr64_t)xxltr + 32, &(((unsigned int *)&xltr)[8])); /* Get the second half */ - ReadReal((addr64_t)xxltr + 64, &(((unsigned int *)&xltr)[16])); /* Get the second half */ - ReadReal((addr64_t)xxltr + 96, &(((unsigned int *)&xltr)[24])); /* Get the second half */ - - db_printf("\n%s%08llX %1X %08X %08X - %04X", (xxltr != cxltr ? " " : "*"), - xxltr, - (xltr.LTR_cpu & 0xFF), xltr.LTR_timeHi, xltr.LTR_timeLo, - (xltr.LTR_excpt & 0x8000 ? 0xFFFF : xltr.LTR_excpt * 64)); /* Print the first line */ - - if(xltr.LTR_cpu & 0xFF00) db_printf(", sflgs = %02X\n", ((xltr.LTR_cpu >> 8) & 0xFF)); - else db_printf("\n"); - - db_printf(" DAR/DSR/CR: %016llX %08X %08X\n", xltr.LTR_dar, xltr.LTR_dsisr, xltr.LTR_cr); - - db_printf(" SRR0/SRR1 %016llX %016llX\n", xltr.LTR_srr0, xltr.LTR_srr1); - db_printf(" LR/CTR %016llX %016llX\n", xltr.LTR_lr, xltr.LTR_ctr); - - db_printf(" R0/R1/R2 %016llX %016llX %016llX\n", xltr.LTR_r0, xltr.LTR_r1, xltr.LTR_r2); - db_printf(" R3/R4/R5 %016llX %016llX %016llX\n", xltr.LTR_r3, xltr.LTR_r4, xltr.LTR_r5); - db_printf(" R6/sv/rsv %016llX %016llX %08X\n", xltr.LTR_r6, xltr.LTR_save, xltr.LTR_rsvd0); - - if((cnt != 16) && (xxltr == xTraceCurr)) break; /* If whole table dump, exit when we hit start again... 
*/ - - xxltr-=sizeof(LowTraceRecord); /* Back it on up */ - if(xxltr', '?', /* 3x */ - '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', /* 4x */ - 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[',0x5C, ']', '^', '_', /* 5x */ - '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', /* 6x */ - 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '.', /* 7x */ - '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', /* 8x */ - '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', /* 9x */ - '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', /* Ax */ - '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', /* Bx */ - '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', /* Cx */ - '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', /* Dx */ - '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', /* Ex */ - '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', /* Fx */ -}; - -/* - * Print out 256 bytes in characters - * - * - * dc [entaddr] - */ -void -db_display_char(db_expr_t addr, boolean_t have_addr, db_expr_t count, - char * modif) -{ - - int i, j, k; - unsigned char xlt[256], *xaddr; - - xaddr = (unsigned char *)addr; - - - for(i = 0; i < 8; i++) { /* Print 256 bytes */ - j = 0; - for(k = 0; k < 32; k++) { - xlt[j] = xtran[*xaddr]; - xaddr++; - j++; - if((k & 3) == 3) { - xlt[j] = ' '; - j++; - } - } - xlt[j] = 0; - - db_printf("%016llX %s\n", (addr64_t)(xaddr - 32), xlt); /* Print a line */ - } - - db_next = (db_expr_t)xaddr; - - -} - -/* - * Print out 256 bytes of real storage - * - * Displays the entry and 15 before it in newest to oldest order - * - * dr [entaddr] - */ -void -db_display_real(db_expr_t addr, boolean_t have_addr, db_expr_t count, - char *modif) -{ - 
int i; - unsigned int xbuf[8]; - - for(i=0; i<8; i++) { /* Print 256 bytes */ - ReadReal(addr, &xbuf[0]); /* Get the real storage data */ - db_printf("%016llX %08X %08X %08X %08X %08X %08X %08X %08X\n", addr, /* Print a line */ - xbuf[0], xbuf[1], xbuf[2], xbuf[3], - xbuf[4], xbuf[5], xbuf[6], xbuf[7]); - addr = addr + 0x00000020; /* Point to next address */ - } - db_next = addr; -} - -unsigned int dvspace = 0; - -/* - * Print out virtual to real translation information - * - * - * dm vaddr [space] (defaults to last entered) - */ -void -db_display_mappings(db_expr_t addr, boolean_t have_addr, db_expr_t count, - char *modif) -{ - db_expr_t xspace; - pmap_t pmap; - addr64_t lnextva; - - mapping_t *mp; - - if (db_expression(&xspace)) { /* Get the address space requested */ - if(xspace >= maxAdrSp) { - db_printf("requested address space (%llX) larger than max (%X)\n", xspace, maxAdrSp - 1); - return; - } - dvspace = xspace; /* Get the space or set default */ - } - - db_printf("mapping information for %016llX in space %8X:\n", addr, dvspace); - - pmap = pmapTrans[dvspace].pmapVAddr; /* Find the pmap address */ - if(!pmap) { /* The pmap is not in use */ - db_printf("The space %X is not assigned to a pmap\n", dvspace); /* Say we are wrong */ - return; - } - - mp = hw_find_map(pmap, (addr64_t)addr, &lnextva); /* Try to find the mapping for this address */ - if((unsigned int)mp == mapRtBadLk) { /* Did we lock up ok? */ - db_printf("Timeout locking physical entry for virtual address %016ll8X\n", addr); - return; - } - - if(!mp) { /* Did we find one? */ - db_printf("Not mapped\n"); - return; /* Didn't find any, return FALSE... 
*/ - } - - mapping_drop_busy(mp); /* The mapping shouldn't be changing */ - - db_dumpmapping(mp); /* Dump it all out */ - - /* Tell them we did it */ -} - -/* - * Print out hash table data - * - * - * dh vaddr [space] (defaults to last entered) - */ -void -db_display_hash(db_expr_t addr, boolean_t have_addr, db_expr_t count, - char *modif) -{ - db_expr_t xspace; - unsigned int seg, vsid, ptegindex, htsize; - pmap_t pmap; - addr64_t lnextva, llva, vpn, esid; - uint64_t hash; - int s4bit; - - llva = (addr64_t)((unsigned int)addr); /* Make sure we are 64-bit now */ - - s4bit = !((PerProcTable[0].ppe_vaddr->pf.Available & pf64Bit) == 0); /* Are we a big guy? */ - if (db_expression(&xspace)) { /* Get the address space requested */ - if(xspace >= maxAdrSp) { - db_printf("requested address space (%llX) larger than max (%X)\n", xspace, maxAdrSp - 1); - return; - } - dvspace = xspace; /* Get the space or set default */ - } - - pmap = pmapTrans[dvspace].pmapVAddr; /* Find the pmap address */ - if(!pmap) { /* The pmap is not in use */ - db_printf("The space %X is not assigned to a pmap\n", dvspace); /* Say we are wrong */ - return; - } - - hash = (uint64_t)pmap->space | ((uint64_t)pmap->space << maxAdrSpb) | ((uint64_t)pmap->space << (2 * maxAdrSpb)); /* Get hash value */ - hash = hash & 0x0000001FFFFFFFFF; /* Make sure we stay within supported ranges */ - - esid = ((llva >> 14) & -maxAdrSp) ^ hash; /* Get ESID */ - llva = ((llva >> 12) & 0xFFFF) ^ esid; /* Get index into hash table */ - - if(s4bit) htsize = hash_table_size >> 7; /* Get number of entries in hash table for 64-bit */ - else htsize = hash_table_size >> 6; /* get number of entries in hash table for 32-bit */ - - ptegindex = llva & (htsize - 1); /* Get the index to the pteg and pca */ - db_dumppca(ptegindex); /* dump the info */ - - /* Tell them we did it */ -} - -/* - * Displays all of the in-use pmaps in the system. 
- * - * dp - */ -void -db_display_pmap(db_expr_t addr, boolean_t have_addr, db_expr_t count, - char *modif) -{ - pmap_t pmap; - int i; - unsigned int v0, v1, st0, st1; - - pmap = (pmap_t)addr; - if(!have_addr) pmap = kernel_pmap; /* Start at the beginning */ - - db_printf("PMAP (real) Next Prev Space Flags Ref spaceNum Resident Wired\n"); -// xxxxxxxx rrrrrrrrrrrrrrrr xxxxxxxx pppppppp ssssssss cccccccc vvvvvvvv nnnnnnnn rrrrrrrr wwwwwwwww - while(1) { /* Do them all */ - db_printf("%08X %016llX %08X %08X %08X %08X %08X %08X %08X %08X\n", - pmap, (addr64_t)pmap ^ pmap->pmapvr, - pmap->pmap_link.next, pmap->pmap_link.prev, - pmap->space, pmap->pmapFlags, pmap->ref_count, pmap->spaceNum, - pmap->stats.resident_count, - pmap->stats.wired_count); - - db_printf("lists = %d, rand = %08X, visits = %016llX, searches = %08X\n", - pmap->pmapCurLists, pmap->pmapRandNum, - pmap->pmapSearchVisits, pmap->pmapSearchCnt); - - db_printf("cctl = %08X, SCSubTag = %016llX\n", - pmap->pmapCCtl, pmap->pmapSCSubTag); - - for(i = 0; i < 16; i +=2) { - v0 = (pmap->pmapCCtl >> (31 - i) & 1); /* Get high order bit */ - v1 = (pmap->pmapCCtl >> (30 - i) & 1); /* Get high order bit */ - st0 = (pmap->pmapSCSubTag >> (60 - (4 * i))) & 0xF; /* Get the sub-tag */ - st1 = (pmap->pmapSCSubTag >> (56 - (4 * i))) & 0xF; /* Get the sub-tag */ - - db_printf(" %01X %01X %016llX/%016llX %01X %01X %016llX/%016llX\n", - v0, st0, pmap->pmapSegCache[i].sgcESID, pmap->pmapSegCache[i].sgcVSID, - v1, st1, pmap->pmapSegCache[i+1].sgcESID, pmap->pmapSegCache[i+1].sgcVSID); - } - - db_printf("\n"); - if(have_addr) break; /* Do only one if address supplied */ - pmap = (pmap_t)pmap->pmap_link.next; /* Skip to the next */ - if(pmap == kernel_pmap) break; /* We've wrapped, we're done */ - } -} - - -/* - * Checks the pmap skip lists - * - * - * cp pmap - */ -void -db_check_pmaps(db_expr_t addr, boolean_t have_addr, db_expr_t count, - char *modif) -{ - int i; - unsigned int ret; - uint64_t dumpa[32]; - pmap_t pmap; - - 
pmap = (pmap_t)addr; - if(!have_addr) pmap = kernel_pmap; /* If no map supplied, start with kernel */ - - while(1) { /* Do them all */ - ret = mapSkipListVerifyC(pmap, &dumpa); /* Check out the map */ - if(!ret) db_printf("Skiplists verified ok, pmap = %08X\n", pmap); - else { - db_printf("Verification failure at %08X, pmap = %08X\n", ret, pmap); - for(i = 0; i < 32; i += 4) { - db_printf("R%02d %016llX %016llX %016llX %016llX\n", i, - dumpa[i], dumpa[i + 1], dumpa[i + 2], dumpa[i + 3]); - } - } - if(have_addr) break; /* Do only one if address supplied */ - pmap = (pmap_t)pmap->pmap_link.next; /* Skip to the next */ - if(pmap == kernel_pmap) break; /* We've wrapped, we're done */ - } -} - - -/* - * Displays iokit junk - * - * di - */ - -void db_piokjunk(void); - -void -db_display_iokit(__unused db_expr_t addr, __unused boolean_t have_addr, - __unused db_expr_t count, __unused char *modif) -{ - db_piokjunk(); -} - -/* - * Prints out a mapping control block - * - */ - -void db_dumpmapping(struct mapping *mp) { /* Dump out a mapping */ - - pmap_t pmap; - int i; - - db_printf("Dump of mapping block: %08X, pmap: %08X (%016llX)\n", mp, pmapTrans[mp->mpSpace].pmapVAddr, - pmapTrans[mp->mpSpace].pmapPAddr); /* Header */ - db_printf(" mpFlags: %08X\n", mp->mpFlags); - db_printf(" mpSpace: %04X\n", mp->mpSpace); - db_printf(" mpBSize: %04X\n", mp->u.mpBSize); - db_printf(" mpPte: %08X\n", mp->mpPte); - db_printf(" mpPAddr: %08X\n", mp->mpPAddr); - db_printf(" mpVAddr: %016llX\n", mp->mpVAddr); - db_printf(" mpAlias: %016llX\n", mp->mpAlias); - db_printf(" mpList00: %016llX\n", mp->mpList0); - - for(i = 1; i < (mp->mpFlags & mpLists); i++) { /* Dump out secondary physical skip lists */ - db_printf(" mpList%02d: %016llX\n", i, mp->mpList[i - 1]); - } -} - -/* - * Prints out a PTEG and PCA - * - */ - -void db_dumppca(unsigned int ptegindex) { - - addr64_t pteg, pca, llva; - unsigned int xpteg[32], xpca[8], space, hash, pva, seg, api, va; - int i, s4bit; - unsigned long long 
llslot, llseg, llhash; - - s4bit = !((PerProcTable[0].ppe_vaddr->pf.Available & pf64Bit) == 0); /* Are we a big guy? */ - - pteg = hash_table_base + (ptegindex << 6); /* Point to the PTEG */ - if(s4bit) pteg = hash_table_base + (ptegindex << 7); /* Point to the PTEG */ - pca = hash_table_base - ((ptegindex + 1) * 4); /* Point to the PCA */ - db_printf("PTEG = %016llX, PCA = %016llX (index = %08X)\n", pteg, pca, ptegindex); - - ReadReal(pteg, &xpteg[0]); /* Get first half of the pteg */ - ReadReal(pteg + 0x20, &xpteg[8]); /* Get second half of the pteg */ - ReadReal(pca, &xpca[0]); /* Get pca */ - - db_printf("PCA: free = %02X, steal = %02X, auto = %02X, misc = %02X\n", - ((xpca[0] >> 24) & 255), ((xpca[0] >> 16) & 255), ((xpca[0] >> 8) & 255), xpca[0] & 255); - - if(!s4bit) { /* Little guy? */ - - for(i = 0; i < 16; i += 2) { /* Step through pteg */ - db_printf("%08X %08X - ", xpteg[i], xpteg[i + 1]); /* Dump the pteg slot */ - - if(xpteg[i] & 0x80000000) db_printf(" valid - "); /* Is it valid? */ - else db_printf("invalid - "); /* Nope, invalid */ - - space = (xpteg[i] >> 7) & (maxAdrSp - 1); /* Extract the space */ - hash = space | (space << maxAdrSpb) | (space << (2 * maxAdrSpb)); /* Get the hash */ - pva = ptegindex ^ hash; /* Get part of the vaddr */ - seg = (xpteg[i] >> 7) ^ hash; /* Get the segment number */ - api = (xpteg[i] & 0x3F); /* Get the API */ - va = ((seg << (28 - maxAdrSpb)) & 0xF0000000) | (api << 22) | ((pva << 12) & 0x003FF000); /* Get the vaddr */ - db_printf("va = %08X\n", va); - } - } - else { - ReadReal(pteg + 0x40, &xpteg[16]); /* Get third half of the pteg */ - ReadReal(pteg + 0x60, &xpteg[24]); /* Get fourth half of the pteg */ - - for(i = 0; i < 32; i += 4) { /* Step through pteg */ - db_printf("%08X%08X %08X%08X - ", xpteg[i], xpteg[i + 1], xpteg[i + 2], xpteg[i + 3]); /* Dump the pteg slot */ - - if(xpteg[i + 1] & 1) db_printf(" valid - "); /* Is it valid? 
*/ - else db_printf("invalid - "); /* Nope, invalid */ - - llslot = ((long long)xpteg[i] << 32) | (long long)xpteg[i + 1]; /* Make a long long version of this */ - space = (llslot >> 12) & (maxAdrSp - 1); /* Extract the space */ - llhash = (unsigned long long)space | ((unsigned long long)space << maxAdrSpb) | ((unsigned long long)space << (2 * maxAdrSpb)); /* Get the hash */ - llhash = llhash & 0x0000001FFFFFFFFFULL; /* Make sure we stay within supported ranges */ - pva = (unsigned long long)ptegindex ^ llhash; /* Get part of the vaddr */ - llseg = (llslot >> 12) ^ llhash; /* Get the segment number */ - api = (llslot >> 7) & 0x1F; /* Get the API */ - llva = ((llseg << (28 - maxAdrSpb)) & 0xFFFFFFFFF0000000ULL) | (api << 23) | ((pva << 12) & 0x007FF000); /* Get the vaddr */ - db_printf("va = %016llX\n", llva); - } - } -} - - -/* - * Print out 256 bytes of virtual storage - * - * - * dv [entaddr] [space] - * address must be on 32-byte boundary. It will be rounded down if not - */ -void -db_display_virtual(db_expr_t addr, boolean_t have_addr, db_expr_t count, - char *modif) -{ - - int i, size, lines, rlines; - unsigned int xbuf[8]; - db_expr_t xspace; - pmap_t pmap; - - mapping_t *mp, *mpv; - addr64_t pa; - ppnum_t pnum; - - if (db_expression(&xspace)) { /* Parse the space ID */ - if(xspace >= (1 << maxAdrSpb)) { /* Check if they gave us a sane space number */ - db_printf("Invalid space ID: %llX - max is %X\n", xspace, (1 << maxAdrSpb) - 1); - return; - } - dvspace = xspace; /* Get the space or set default */ - } - - pmap = (pmap_t)pmapTrans[dvspace].pmapVAddr; /* Find the pmap address */ - if((unsigned int)pmap == 0) { /* Is there actually a pmap here? 
*/ - db_printf("Address space not found: %X\n", dvspace); /* Complain */ - return; - } - - addr &= -32; - - size = 4096 - (addr & 0x00000FFF); /* Bytes left on page */ - lines = size / 32; /* Number of lines in first or only part */ - if(lines > 8) lines = 8; - rlines = 8 - lines; - if(rlines < 0) lines = 0; - - db_printf("Dumping %016llX (pmap = %08X, space = %X); ", addr, pmap, dvspace); - - pnum = pmap_find_phys(pmap, (addr64_t)addr); /* Phynd the Physical */ - if(!pnum) { /* Did we find one? */ - db_printf("Not mapped\n"); - return; /* Didn't find any, return FALSE... */ - } - - pa = (addr64_t)(pnum << 12) | (addr64_t)(addr & 0xFFF); /* Get the physical address */ - db_printf("phys = %016llX\n", pa); - - for(i=0; itasks.next) { /* Go through the tasks */ - taskact = 0; /* Reset activation count */ - db_printf("\nTask %4d @%08X:\n", tottasks, task); /* Show where we're at */ - for(act = (thread_act_t)task->threads.next; act != (thread_act_t)&task->threads; act = (thread_act_t)act->task_threads.next) { /* Go through activations */ - db_printf(" Act %4d @%08X - p: %08X current context: %08X\n", - taskact, act, act->machine.pcb, act->machine.curctx); - - save = (struct savearea *)act->machine.pcb; /* Set the start of the normal chain */ - chainsize = 0; - - db_printf(" General context - fp: %08X fl: %08X fc: %d vp: %08X vl: %08X vp: %d\n", - act->machine.facctx.FPUsave, act->machine.facctx.FPUlevel, act->machine.facctx.FPUcpu, - act->machine.facctx.VMXsave, act->machine.facctx.VMXlevel, act->machine.facctx.VMXcpu); - - while(save) { /* Do them all */ - totsaves++; /* Count savearea */ - db_printf(" Norm %08X: %016llX %016llX - tot = %d\n", save, save->save_srr0, save->save_srr1, totsaves); - save = (struct savearea *)save->save_hdr.save_prev; /* Next one */ - if(chainsize++ > chainmax) { /* See if we might be in a loop */ - db_printf(" Chain terminated by count (%d) before %08X\n", chainmax, save); - break; - } - } - - save = (struct savearea 
*)act->machine.facctx.FPUsave; /* Set the start of the floating point chain */ - chainsize = 0; - while(save) { /* Do them all */ - totsaves++; /* Count savearea */ - db_printf(" FPU %08X: %08X - tot = %d\n", save, save->save_hdr.save_level, totsaves); - save = (struct savearea *)save->save_hdr.save_prev; /* Next one */ - if(chainsize++ > chainmax) { /* See if we might be in a loop */ - db_printf(" Chain terminated by count (%d) before %08X\n", chainmax, save); - break; - } - } - - save = (struct savearea *)act->machine.facctx.VMXsave; /* Set the start of the floating point chain */ - chainsize = 0; - while(save) { /* Do them all */ - totsaves++; /* Count savearea */ - db_printf(" Vec %08X: %08X - tot = %d\n", save, save->save_hdr.save_level, totsaves); - save = (struct savearea *)save->save_hdr.save_prev; /* Next one */ - if(chainsize++ > chainmax) { /* See if we might be in a loop */ - db_printf(" Chain terminated by count (%d) before %08X\n", chainmax, save); - break; - } - } - - if(CTable = act->machine.vmmControl) { /* Are there virtual machines? 
*/ - - for(vmid = 0; vmid < kVmmMaxContexts; vmid++) { - - if(!(CTable->vmmc[vmid].vmmFlags & vmmInUse)) continue; /* Skip if vm is not in use */ - - if(!CTable->vmmc[vmid].vmmFacCtx.FPUsave && !CTable->vmmc[vmid].vmmFacCtx.VMXsave) continue; /* If neither types, skip this vm */ - - db_printf(" VMachine ID %3d - fp: %08X fl: %08X fc: %d vp: %08X vl: %08X vp: %d\n", vmid, /* Title it */ - CTable->vmmc[vmid].vmmFacCtx.FPUsave, CTable->vmmc[vmid].vmmFacCtx.FPUlevel, CTable->vmmc[vmid].vmmFacCtx.FPUcpu, - CTable->vmmc[vmid].vmmFacCtx.VMXsave, CTable->vmmc[vmid].vmmFacCtx.VMXlevel, CTable->vmmc[vmid].vmmFacCtx.VMXcpu - ); - - save = (struct savearea *)CTable->vmmc[vmid].vmmFacCtx.FPUsave; /* Set the start of the floating point chain */ - chainsize = 0; - while(save) { /* Do them all */ - totsaves++; /* Count savearea */ - db_printf(" FPU %08X: %08X - tot = %d\n", save, save->save_hdr.save_level, totsaves); - save = (struct savearea *)save->save_hdr.save_prev; /* Next one */ - if(chainsize++ > chainmax) { /* See if we might be in a loop */ - db_printf(" Chain terminated by count (%d) before %08X\n", chainmax, save); - break; - } - } - - save = (struct savearea *)CTable->vmmc[vmid].vmmFacCtx.VMXsave; /* Set the start of the floating point chain */ - chainsize = 0; - while(save) { /* Do them all */ - totsaves++; /* Count savearea */ - db_printf(" Vec %08X: %08X - tot = %d\n", save, save->save_hdr.save_level, totsaves); - save = (struct savearea *)save->save_hdr.save_prev; /* Next one */ - if(chainsize++ > chainmax) { /* See if we might be in a loop */ - db_printf(" Chain terminated by count (%d) before %08X\n", chainmax, save); - break; - } - } - } - } - taskact++; - } - tottasks++; - } - - db_printf("Total saveareas accounted for: %d\n", totsaves); -} - -/* - * Print out extra registers - * - * - * dx - */ - -extern unsigned int dbfloats[33][2]; -extern unsigned int dbvecs[33][4]; -extern unsigned int dbspecrs[336]; - -void -db_display_xregs(db_expr_t addr, boolean_t 
have_addr, db_expr_t count, - char *modif) -{ - int i, j, pents; - - stSpecrs(dbspecrs); /* Save special registers */ - if(PerProcTable[0].ppe_vaddr->pf.Available & pf64Bit) { - db_printf("PIR: %08X\n", dbspecrs[0]); - db_printf("PVR: %08X\n", dbspecrs[1]); - db_printf("SDR1: %08X.%08X\n", dbspecrs[26], dbspecrs[27]); - db_printf("HID0: %08X.%08X\n", dbspecrs[28], dbspecrs[29]); - db_printf("HID1: %08X.%08X\n", dbspecrs[30], dbspecrs[31]); - db_printf("HID4: %08X.%08X\n", dbspecrs[32], dbspecrs[33]); - db_printf("HID5: %08X.%08X\n", dbspecrs[34], dbspecrs[35]); - db_printf("SPRG0: %08X.%08X %08X.%08X\n", dbspecrs[18], dbspecrs[19], dbspecrs[20], dbspecrs[21]); - db_printf("SPRG2: %08X.%08X %08X.%08X\n", dbspecrs[22], dbspecrs[23], dbspecrs[24], dbspecrs[25]); - db_printf("\n"); - for(i = 0; i < (64 * 4); i += 4) { - db_printf("SLB %02d: %08X.%08X %08X.%08X\n", i / 4, dbspecrs[80 + i], dbspecrs[81 + i], dbspecrs[82 + i], dbspecrs[83 + i]); - } - } - else { - db_printf("PIR: %08X\n", dbspecrs[0]); - db_printf("PVR: %08X\n", dbspecrs[1]); - db_printf("SDR1: %08X\n", dbspecrs[22]); - db_printf("HID0: %08X\n", dbspecrs[39]); - db_printf("HID1: %08X\n", dbspecrs[40]); - db_printf("L2CR: %08X\n", dbspecrs[41]); - db_printf("MSSCR0: %08X\n", dbspecrs[42]); - db_printf("MSSCR1: %08X\n", dbspecrs[43]); - db_printf("THRM1: %08X\n", dbspecrs[44]); - db_printf("THRM2: %08X\n", dbspecrs[45]); - db_printf("THRM3: %08X\n", dbspecrs[46]); - db_printf("ICTC: %08X\n", dbspecrs[47]); - db_printf("L2CR2: %08X\n", dbspecrs[48]); - db_printf("DABR: %08X\n", dbspecrs[49]); - - db_printf("DBAT: %08X %08X %08X %08X\n", dbspecrs[2], dbspecrs[3], dbspecrs[4], dbspecrs[5]); - db_printf(" %08X %08X %08X %08X\n", dbspecrs[6], dbspecrs[7], dbspecrs[8], dbspecrs[9]); - db_printf("IBAT: %08X %08X %08X %08X\n", dbspecrs[10], dbspecrs[11], dbspecrs[12], dbspecrs[13]); - db_printf(" %08X %08X %08X %08X\n", dbspecrs[14], dbspecrs[15], dbspecrs[16], dbspecrs[17]); - db_printf("SPRG: %08X %08X %08X 
%08X\n", dbspecrs[18], dbspecrs[19], dbspecrs[20], dbspecrs[21]); - db_printf("\n"); - for(i = 0; i < 16; i += 8) { /* Print 8 at a time */ - db_printf("SR%02d: %08X %08X %08X %08X %08X %08X %08X %08X\n", i, - dbspecrs[23+i], dbspecrs[24+i], dbspecrs[25+i], dbspecrs[26+i], - dbspecrs[27+i], dbspecrs[28+i], dbspecrs[29+i], dbspecrs[30+i]); - } - } - - db_printf("\n"); - - stFloat(dbfloats); /* Save floating point registers */ - for(i = 0; i < 32; i += 4) { /* Print 4 at a time */ - db_printf("F%02d: %08X %08X %08X %08X %08X %08X %08X %08X\n", i, - dbfloats[i][0], dbfloats[i][1], dbfloats[i+1][0], dbfloats[i+1][1], - dbfloats[i+2][0], dbfloats[i+2][1], dbfloats[i+3][0], dbfloats[i+3][1]); - } - db_printf("FCR: %08X %08X\n", dbfloats[32][0], dbfloats[32][1]); /* Print FSCR */ - - if(!stVectors(dbvecs)) return; /* Return if not Altivec capable */ - - db_printf("\n"); - - for(i = 0; i < 32; i += 2) { /* Print 2 at a time */ - db_printf("V%02d: %08X %08X %08X %08X %08X %08X %08X %08X\n", i, - dbvecs[i][0], dbvecs[i][1], dbvecs[i][2], dbvecs[i][3], - dbvecs[i+1][0], dbvecs[i+1][1], dbvecs[i+1][2], dbvecs[i+1][3]); - } - db_printf("VCR: %08X %08X %08X %08X\n", dbvecs[32][0], dbvecs[32][1], dbvecs[32][2], dbvecs[32][3]); /* Print VSCR */ - - /* Tell them we did it */ -} - -/* - * Check check mappings and hash table for consistency - * - * cm - */ -void -db_check_mappings(db_expr_t addr, boolean_t have_addr, db_expr_t count, - char *modif) -{ - addr64_t pteg, pca, llva, lnextva; - unsigned int xpteg[32], xpca[8], space, hash, pva, seg, api, va, free, free2, xauto, PTEGcnt, wimgkk, wimgxx, slotoff; - int i, j, fnderr, slot, slot2, k, s4bit; - pmap_t pmap; - mapping_t *mp; - ppnum_t ppn, pa, aoff; - unsigned long long llslot, llseg, llhash; - - s4bit = 0; /* Assume dinky? */ - if(PerProcTable[0].ppe_vaddr->pf.Available & pf64Bit) s4bit = 1; /* Are we a big guy? 
*/ - - PTEGcnt = hash_table_size / 64; /* Get the number of PTEGS */ - if(s4bit) PTEGcnt = PTEGcnt / 2; /* PTEGs are twice as big */ - - pteg = hash_table_base; /* Start of hash table */ - pca = hash_table_base - 4; /* Start of PCA */ - - for(i = 0; i < PTEGcnt; i++) { /* Step through them all */ - - fnderr = 0; - - ReadReal(pteg, &xpteg[0]); /* Get first half of the pteg */ - ReadReal(pteg + 0x20, &xpteg[8]); /* Get second half of the pteg */ - if(s4bit) { /* See if we need the other half */ - ReadReal(pteg + 0x40, &xpteg[16]); /* Get third half of the pteg */ - ReadReal(pteg + 0x60, &xpteg[24]); /* Get fourth half of the pteg */ - } - ReadReal(pca, &xpca[0]); /* Get pca */ - - if(xpca[0] & 0x00000001) { /* Is PCA locked? */ - db_printf("Unexpected locked PCA\n"); /* Yeah, this may be bad */ - fnderr = 1; /* Remember to print the pca/pteg pair later */ - } - - free = 0x80000000; - - for(j = 0; j < 7; j++) { /* Search for duplicates */ - slot = j * 2; /* Point to the slot */ - if(s4bit) slot = slot * 2; /* Adjust for bigger slots */ - if(!(xpca[0] & free)) { /* Check more if slot is allocated */ - for(k = j + 1; k < 8; k++) { /* Search remaining slots */ - slot2 = k * 2; /* Point to the slot */ - if(s4bit) slot2 = slot2 * 2; /* Adjust for bigger slots */ - if((xpteg[slot] == xpteg[slot2]) - && (!s4bit || (xpteg[slot + 1] == xpteg[slot2 + 1]))) { /* Do we have duplicates? */ - db_printf("Duplicate tags in pteg, slot %d and slot %d\n", j, k); - fnderr = 1; - } - } - } - free = free >> 1; /* Move slot over */ - } - - free = 0x80000000; - xauto = 0x00008000; - - for(j = 0; j < 8; j++) { /* Step through the slots */ - - slot = j * 2; /* Point to the slot */ - if(s4bit) slot = slot * 2; /* Hagfish? */ - if(xpca[0] & free) { /* Check if marked free */ - if((!s4bit && (xpteg[slot] & 0x80000000)) /* Is a supposedly free slot valid? 
*/ - || (s4bit && (xpteg[slot + 1] & 1))) { - db_printf("Free slot still valid - %d\n", j); - fnderr = 1; - } - } - else { /* We have an in use slot here */ - - if(!(!s4bit && (xpteg[slot] & 0x80000000)) /* Is a supposedly in use slot valid? */ - && !(s4bit && (xpteg[slot + 1] & 1))) { - db_printf("Inuse slot not valid - %d\n", j); - fnderr = 1; - } - else { /* Slot is valid, check mapping */ - if(!s4bit) { /* Not Hagfish? */ - space = (xpteg[slot] >> 7) & (maxAdrSp - 1); /* Extract the space */ - hash = space | (space << maxAdrSpb) | (space << (2 * maxAdrSpb)); /* Get the hash */ - pva = i ^ hash; /* Get part of the vaddr */ - seg = (xpteg[slot] >> 7) ^ hash; /* Get the segment number */ - api = (xpteg[slot] & 0x3F); /* Get the API */ - va = ((seg << (28 - maxAdrSpb)) & 0xF0000000) | (api << 22) | ((pva << 12) & 0x003FF000); /* Get the vaddr */ - llva = (addr64_t)va; /* Make this a long long */ - wimgxx = xpteg[slot + 1] & 0x7F; /* Get the wimg and pp */ - ppn = xpteg[slot + 1] >> 12; /* Get physical page number */ - slotoff = (i * 64) + (j * 8) | 1; /* Get offset to slot and valid bit */ - } - else { /* Yes, Hagfish */ - llslot = ((long long)xpteg[slot] << 32) | (long long)xpteg[slot + 1]; /* Make a long long version of this */ - space = (llslot >> 12) & (maxAdrSp - 1); /* Extract the space */ - llhash = (unsigned long long)space | ((unsigned long long)space << maxAdrSpb) | ((unsigned long long)space << (2 * maxAdrSpb)); /* Get the hash */ - llhash = llhash & 0x0000001FFFFFFFFFULL; /* Make sure we stay within supported ranges */ - pva = i ^ llhash; /* Get part of the vaddr */ - llseg = ((llslot >> 12) ^ llhash); /* Get the segment number */ - api = (llslot >> 7) & 0x1F; /* Get the API */ - llva = ((llseg << (28 - maxAdrSpb)) & 0xFFFFFFFFF0000000ULL) | (api << 23) | ((pva << 12) & 0x007FF000); /* Get the vaddr */ - wimgxx = xpteg[slot + 3] & 0x7F; /* Get the wimg and pp */ - ppn = (xpteg[slot + 2] << 20) | (xpteg[slot + 3] >> 12); /* Get physical page number */ - 
slotoff = (i * 128) + (j * 16) | 1; /* Get offset to slot and valid bit */ - } - - pmap = pmapTrans[space].pmapVAddr; /* Find the pmap address */ - if(!pmap) { /* The pmap is not in use */ - db_printf("The space %08X is not assigned to a pmap, slot = %d\n", space, slot); /* Say we are wrong */ - fnderr = 1; - goto dcmout; - } - - if (pmap->pmapFlags & pmapVMgsaa) { - unsigned int ret; - mapping_t mpcopy; - ret = hw_find_map_gv(pmap, llva, &mpcopy); - } else { - mp = hw_find_map(pmap, llva, &lnextva); /* Try to find the mapping for this address */ - // db_printf("%08X - %017llX\n", mp, llva); - if((unsigned int)mp == mapRtBadLk) { /* Did we lock up ok? */ - db_printf("Timeout locking mapping for for virtual address %016ll8X, slot = %d\n", llva, j); - return; - } - - if(!mp) { /* Did we find one? */ - db_printf("Not mapped, slot = %d, va = %08X\n", j, (unsigned int)llva); - fnderr = 1; - goto dcmout; - } - - if((mp->mpFlags & 0xFF000000) > 0x01000000) { /* Is busy count too high? */ - db_printf("Busy count too high, slot = %d\n", j); - fnderr = 1; - } - - if((mp->mpFlags & mpType) == mpBlock) { /* Is this a block map? */ - if(!(xpca[0] & xauto)) { /* Is it marked as such? */ - db_printf("mapping marked as block, PCA is not, slot = %d\n", j); - fnderr = 1; - } - } - else { /* Is a block */ - if(xpca[0] & xauto) { /* Is it marked as such? 
*/ - db_printf("mapping not marked as block, PCA is, slot = %d\n", j); - fnderr = 1; - } - if(mp->mpPte != slotoff) { /* See if mapping PTEG offset is us */ - db_printf("mapping does not point to PTE, slot = %d\n", j); - fnderr = 1; - } - } - - wimgkk = (unsigned int)mp->mpVAddr; /* Get last half of vaddr where keys, etc are */ - wimgkk = (wimgkk ^ wimgxx) & 0x7F; /* XOR to find differences from PTE */ - if(wimgkk) { /* See if key in PTE is what we want */ - db_printf("key or WIMG does not match, slot = %d\n", j); - fnderr = 1; - } - - aoff = (ppnum_t)((llva >> 12) - (mp->mpVAddr >> 12)); /* Get the offset from vaddr */ - pa = aoff + mp->mpPAddr; /* Get the physical page number we expect */ - if(pa != ppn) { /* Is physical address expected? */ - db_printf("Physical address does not match, slot = %d\n", j); - fnderr = 1; - } - - mapping_drop_busy(mp); /* We're done with the mapping */ - } - } - - } -dcmout: - free = free >> 1; - xauto = xauto >> 1; - } - - - if(fnderr)db_dumppca(i); /* Print if error */ - - pteg = pteg + 64; /* Go to the next one */ - if(s4bit) pteg = pteg + 64; /* Hagfish? */ - pca = pca - 4; /* Go to the next one */ - - - } -} - -/* - * Displays all of the kmods in the system. 
- * - * dp - */ -void -db_display_kmod(db_expr_t addr, boolean_t have_addr, db_expr_t count, - char *modif) -{ - kmod_info_t *kmd; - unsigned int strt, end; - - kmd = kmod; /* Start at the start */ - - db_printf("info addr start - end name ver\n"); - - while(kmd) { /* Dump 'em all */ - strt = (unsigned int)kmd->address + kmd->hdr_size; /* Get start of kmod text */ - end = (unsigned int)kmd->address + kmd->size; /* Get end of kmod */ - db_printf("%08X %08X %08X - %08X: %s, %s\n", kmd, kmd->address, strt, end, - kmd->name, kmd->version); - kmd = kmd->next; /* Step to it */ - } -} - -/* - * Displays stuff - * - * gs - */ -unsigned char xxgpo[36] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; - -void -db_gsnoop(db_expr_t addr, boolean_t have_addr, db_expr_t count, char *modif) -{ - int i, j; - unsigned char *gp, gpn[36]; -#define ngpr 34 - - gp = (unsigned char *)0x8000005C; - - for(i = 0; i < ngpr; i++) gpn[i] = gp[i]; /* Copy 'em */ - - for(i = 0; i < ngpr; i++) { - db_printf("%02X ", gpn[i]); - } - db_printf("\n"); - - for(i = 0; i < ngpr; i++) { - if(gpn[i] != xxgpo[i]) db_printf("^^ "); - else db_printf(" "); - } - db_printf("\n"); - - for(i = 0; i < ngpr; i++) xxgpo[i] = gpn[i]; /* Save 'em */ -} - - -void Dumbo(void); -void Dumbo(void){ -} diff --git a/osfmk/ppc/db_low_trace.h b/osfmk/ppc/db_low_trace.h deleted file mode 100644 index efc3faedb..000000000 --- a/osfmk/ppc/db_low_trace.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_FREE_COPYRIGHT@ - */ -/* - * @APPLE_FREE_COPYRIGHT@ - */ - -#ifndef _DDB_DB_LTR_H_ -#define _DDB_DB_LTR_H_ - -#include -#include - -/* - * Prototypes for functions exported by this module. 
- */ - -void db_list_pmap(db_expr_t, boolean_t, db_expr_t, char *); -void db_low_trace(db_expr_t, boolean_t, db_expr_t, char *); -void db_display_long(db_expr_t, boolean_t, db_expr_t, char *); -void db_display_char(db_expr_t, boolean_t, db_expr_t, char *); -void db_display_real(db_expr_t, boolean_t, db_expr_t, char *); -void db_display_virtual(db_expr_t, boolean_t, db_expr_t, char *); -void db_display_mappings(db_expr_t, boolean_t, db_expr_t, char *); -void db_display_hash(db_expr_t, boolean_t, db_expr_t, char *); -void db_display_pmap(db_expr_t, boolean_t, db_expr_t, char *); -void db_display_iokit(db_expr_t, boolean_t, db_expr_t, char *); -void db_display_save(db_expr_t, boolean_t, db_expr_t, char *); -void db_display_xregs(db_expr_t, boolean_t, db_expr_t, char *); -void db_display_kmod(db_expr_t, boolean_t, db_expr_t, char *); -void db_gsnoop(db_expr_t, boolean_t, db_expr_t count, char *); -void db_check_mappings(db_expr_t, boolean_t, db_expr_t, char *); -void db_check_pmaps(db_expr_t, boolean_t, db_expr_t, char *); - -#endif /* !_DDB_DB_LTR_H_ */ diff --git a/osfmk/ppc/db_machdep.h b/osfmk/ppc/db_machdep.h deleted file mode 100644 index cb9162c4e..000000000 --- a/osfmk/ppc/db_machdep.h +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. 
- * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ - -#ifndef _PPC_DB_MACHDEP_H_ -#define _PPC_DB_MACHDEP_H_ - -/* - * Machine-dependent defines for new kernel debugger. 
- */ - -#include -#include -#include -#include -#include -#include -#include - -typedef addr64_t db_addr_t; /* address - unsigned */ -typedef uint64_t db_expr_t; /* expression - signed??? try unsigned */ - -typedef struct savearea db_regs_t; -extern db_regs_t ddb_regs; /* register state */ -#define DDB_REGS (&ddb_regs) -extern int db_active; /* ddb is active */ - -#define PC_REGS(regs) ((db_addr_t)(regs)->save_srr0) - -#define BKPT_INST 0x7c810808 /* breakpoint instruction */ -#define BKPT_SIZE (4) /* size of breakpoint inst */ -#define BKPT_SET(inst) (BKPT_INST) - -#define db_clear_single_step(regs) ((regs)->save_srr1 &= ~MASK(MSR_SE)) -#define db_set_single_step(regs) ((regs)->save_srr1 |= MASK(MSR_SE)) - -#define IS_BREAKPOINT_TRAP(type, code) (FALSE) -#define IS_WATCHPOINT_TRAP(type, code) (FALSE) - -#define inst_trap_return(ins) (FALSE) -#define inst_return(ins) (FALSE) -#define inst_call(ins) (FALSE) - -int db_inst_load(unsigned long); -int db_inst_store(unsigned long); - -/* access capability and access macros */ - -#define DB_ACCESS_LEVEL DB_ACCESS_ANY /* any space */ -#define DB_CHECK_ACCESS(addr,size,task) \ - db_check_access(addr,size,task) -#define DB_PHYS_EQ(task1,addr1,task2,addr2) \ - db_phys_eq(task1,addr1,task2,addr2) -#define DB_VALID_KERN_ADDR(addr) \ - ((addr) >= VM_MIN_KERNEL_ADDRESS && \ - (addr) < vm_last_addr) -#define DB_VALID_ADDRESS(addr,user) \ - ((!(user) && DB_VALID_KERN_ADDR(addr)) || \ - ((user) && (addr) < VM_MAX_ADDRESS)) - -/* - * Given pointer to savearea, determine if it represents - * a thread executing a) in user space, b) in the kernel, or c) - * in a kernel-loaded task. Return true for cases a) and c). 
- */ -#define IS_USER_TRAP(regs) \ - (USER_MODE(regs->save_srr1)) - -extern boolean_t db_check_access( - vm_offset_t addr, - int size, - task_t task); -extern boolean_t db_phys_eq( - task_t task1, - vm_offset_t addr1, - task_t task2, - vm_offset_t addr2); -extern db_addr_t db_disasm( - db_addr_t loc, - boolean_t altfmt, - task_t task); -extern void db_read_bytes( - vm_offset_t addr, - int size, - char *data, - task_t task); -extern void db_write_bytes( - vm_offset_t addr, - int size, - char *data, - task_t task); -extern void db_stack_trace_cmd( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char *modif); -extern void db_reboot( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char *modif); - -/* macros for printing OS server dependent task name */ - -#define DB_TASK_NAME(task) db_task_name(task) -#define DB_TASK_NAME_TITLE "COMMAND " -#define DB_TASK_NAME_LEN 39 -#define DB_NULL_TASK_NAME "? " - -extern void db_task_name( - task_t task); - -/* macro for checking if a thread has used floating-point */ - -#define db_act_fp_used(act) (FALSE) - -extern void kdb_trap( - int type, - struct savearea *regs); -extern boolean_t db_trap_from_asm( - struct savearea *regs); -extern void kdb_on( - int cpu); -extern void cnpollc( - boolean_t on); - -extern boolean_t db_phys_cmp( - vm_offset_t, - vm_offset_t, - vm_size_t); - -#endif /* _PPC_DB_MACHDEP_H_ */ diff --git a/osfmk/ppc/db_trace.c b/osfmk/ppc/db_trace.c deleted file mode 100644 index 601378162..000000000 --- a/osfmk/ppc/db_trace.c +++ /dev/null @@ -1,1122 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -#include - -#include -#include - -#include - -#include -#include -#include - -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -extern jmp_buf_t *db_recover; - -struct savearea ddb_null_kregs; - -extern vm_offset_t vm_min_inks_addr; /* set by db_clone_symtabXXX */ - -#define DB_NUMARGS_MAX 5 - -#define INFIXEDSTACK(va) 0 \ - -#define INKERNELSTACK(va, th) 1 - -struct db_ppc_frame { - struct db_ppc_frame *f_frame; - int pad1; - uint32_t f_retaddr; - int pad3; - int pad4; - int pad5; - uint32_t f_arg[DB_NUMARGS_MAX]; -}; - -#define TRAP 1 -#define INTERRUPT 2 -#define SYSCALL 3 - -db_addr_t db_user_trap_symbol_value = 0; -db_addr_t db_kernel_trap_symbol_value = 0; -db_addr_t db_interrupt_symbol_value = 0; -db_addr_t db_return_to_iret_symbol_value = 0; -db_addr_t db_syscall_symbol_value = 0; -boolean_t db_trace_symbols_found = FALSE; - -static int db_ppc_reg_value( - struct db_variable * vp, - db_expr_t * 
val, - int flag, - db_var_aux_param_t ap); -static void db_find_trace_symbols(void); -static int db_numargs( - struct db_ppc_frame *fp, - task_t task); -static boolean_t db_find_arg( - struct db_ppc_frame *frame, - db_addr_t calleepc, - task_t task, - int narg, - db_addr_t *arg); -static void db_nextframe( - struct db_ppc_frame **lfp, - struct db_ppc_frame **fp, - db_addr_t *ip, - int frame_type, - thread_act_t thr_act, - db_addr_t linkpc); - -/* - * Machine register set. - */ -struct db_variable db_regs[] = { - /* XXX "pc" is an alias to "srr0"... */ - { - .name = "pc", - .valuep = &ddb_regs.save_srr0, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "srr0", - .valuep = &ddb_regs.save_srr0, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "srr1", - .valuep = &ddb_regs.save_srr1, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r0", - .valuep = &ddb_regs.save_r0, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r1", - .valuep = &ddb_regs.save_r1, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r2", - .valuep = &ddb_regs.save_r2, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r3", - .valuep = &ddb_regs.save_r3, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r4", - .valuep = &ddb_regs.save_r4, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r5", - .valuep = &ddb_regs.save_r5, - .fcn = db_ppc_reg_value, - 
.min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r6", - .valuep = &ddb_regs.save_r6, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r7", - .valuep = &ddb_regs.save_r7, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r8", - .valuep = &ddb_regs.save_r8, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r9", - .valuep = &ddb_regs.save_r9, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r10", - .valuep = &ddb_regs.save_r10, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r11", - .valuep = &ddb_regs.save_r11, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r12", - .valuep = &ddb_regs.save_r12, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r13", - .valuep = &ddb_regs.save_r13, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r14", - .valuep = &ddb_regs.save_r14, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r15", - .valuep = &ddb_regs.save_r15, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r16", - .valuep = &ddb_regs.save_r16, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r17", - .valuep = 
&ddb_regs.save_r17, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r18", - .valuep = &ddb_regs.save_r18, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r19", - .valuep = &ddb_regs.save_r19, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r20", - .valuep = &ddb_regs.save_r20, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r21", - .valuep = &ddb_regs.save_r21, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r22", - .valuep = &ddb_regs.save_r22, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r23", - .valuep = &ddb_regs.save_r23, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r24", - .valuep = &ddb_regs.save_r24, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r25", - .valuep = &ddb_regs.save_r25, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r26", - .valuep = &ddb_regs.save_r26, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r27", - .valuep = &ddb_regs.save_r27, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r28", - .valuep = &ddb_regs.save_r28, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = 
TRUE, - }, - { - .name = "r29", - .valuep = &ddb_regs.save_r29, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r30", - .valuep = &ddb_regs.save_r30, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "r31", - .valuep = &ddb_regs.save_r31, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "cr", - .valuep = (db_expr_t *)&ddb_regs.save_cr, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "xer", - .valuep = &ddb_regs.save_xer, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "lr", - .valuep = &ddb_regs.save_lr, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, - { - .name = "ctr", - .valuep = &ddb_regs.save_ctr, - .fcn = db_ppc_reg_value, - .min_level = 0, - .max_level = 0, - .low = 0, - .high = 0, - .hidden_level = TRUE, - }, -}; -struct db_variable *db_eregs = db_regs + sizeof(db_regs)/sizeof(db_regs[0]); - -int -db_ppc_reg_value( - struct db_variable *vp, - db_expr_t *valuep, - int flag, - db_var_aux_param_t ap) -{ - db_expr_t *dp = 0; - db_expr_t null_reg = 0; - uint32_t *dp32; - thread_act_t thr_act = ap->thr_act; - unsigned int cpu; - - if (db_option(ap->modif, 'u')) { - if (thr_act == THR_ACT_NULL) { - if ((thr_act = current_thread()) == THR_ACT_NULL) - db_error("no user registers\n"); - } - if (thr_act == current_thread()) { - if (IS_USER_TRAP((&ddb_regs))) dp = vp->valuep; - else if (INFIXEDSTACK(ddb_regs.save_r1)) - db_error("cannot get/set user registers in nested interrupt\n"); - } - } - else { - if (thr_act == THR_ACT_NULL || thr_act == current_thread()) { - dp = vp->valuep; - } - else { 
- if (thr_act->kernel_stack) { - for (cpu = 0; cpu < real_ncpus; cpu++) { - if (cpu_to_processor(cpu)->state == PROCESSOR_RUNNING && - cpu_to_processor(cpu)->active_thread == thr_act && - PerProcTable[cpu].ppe_vaddr->db_saved_state) { - - dp = (db_expr_t)(((uint32_t)(PerProcTable[cpu].ppe_vaddr->db_saved_state)) + - (((uint32_t) vp->valuep) - - (uint32_t) &ddb_regs)); - break; - } - } - - if (dp == 0) - dp = &null_reg; - } - else { - /* only PC is valid */ - if (vp->valuep == &ddb_regs.save_srr0) - dp = (db_expr_t *)&thr_act->continuation; - else - dp = &null_reg; - } - } - } - if (dp == 0) { - if (!db_option(ap->modif, 'u')) { - for (cpu = 0; cpu < real_ncpus; cpu++) { - if (cpu_to_processor(cpu)->state == PROCESSOR_RUNNING && - cpu_to_processor(cpu)->active_thread == thr_act && - PerProcTable[cpu].ppe_vaddr->db_saved_state) { - dp = (int *) (((int)(PerProcTable[cpu].ppe_vaddr->db_saved_state)) + - (((int) vp->valuep) - (int) &ddb_regs)); - break; - } - } - } - if (dp == 0) { - if (!thr_act || thr_act->machine.pcb == 0) - db_error("no pcb\n"); - dp = (int *)((int)thr_act->machine.pcb + ((int)vp->valuep - (int)&ddb_regs)); - } - } - - if(vp->valuep == (db_expr_t *)&ddb_regs.save_cr) { /* Is this the CR we are doing? 
*/ - dp32 = (uint32_t *)dp; /* Make this easier */ - if (flag == DB_VAR_SET) - *dp32 = *valuep; - else - *valuep = *dp32; - } - else { /* Normal 64-bit registers */ - if (flag == DB_VAR_SET) - *dp = *valuep; - else - *valuep = *(unsigned long long *)dp; - } - - return 0; -} - - -void -db_find_trace_symbols(void) -{ - db_expr_t value; - boolean_t found_some; - - found_some = FALSE; - if (db_value_of_name(CC_SYM_PREFIX "thandler", &value)) { - db_user_trap_symbol_value = (db_addr_t) value; - found_some = TRUE; - } - if (db_value_of_name(CC_SYM_PREFIX "thandler", &value)) { - db_kernel_trap_symbol_value = (db_addr_t) value; - found_some = TRUE; - } - if (db_value_of_name(CC_SYM_PREFIX "ihandler", &value)) { - db_interrupt_symbol_value = (db_addr_t) value; - found_some = TRUE; - } -#if 0 - if (db_value_of_name(CC_SYM_PREFIX "return_to_iret", &value)) { - db_return_to_iret_symbol_value = (db_addr_t) value; - found_some = TRUE; - } -#endif - if (db_value_of_name(CC_SYM_PREFIX "thandler", &value)) { - db_syscall_symbol_value = (db_addr_t) value; - found_some = TRUE; - } - if (found_some) - db_trace_symbols_found = TRUE; -} - -int -db_numargs( - struct db_ppc_frame *fp, - task_t task) -{ - return DB_NUMARGS_MAX; -} - -boolean_t -db_find_arg( - struct db_ppc_frame *fp, - db_addr_t calleepc, - task_t task, - int narg, - db_addr_t *arg) -{ - db_addr_t argp; - db_addr_t calleep; - db_addr_t offset; - int i; - int inst; - char *name; - -#if 0 - db_find_task_sym_and_offset(calleepc, &name, &offset, task); - calleep = calleepc-offset; - - for (i = 0; calleep < calleepc; i++, calleep++) { - if (!DB_CHECK_ACCESS((int) calleep, 4, task)) { - continue; - } - inst = db_get_task_value(calleep, 4, FALSE, task); - if ((inst & 0xffff0000) == (0x907f0000 + (narg << 21)) || - (inst & 0xffff0000) == (0x90610000 + (narg << 21))) { - argp = (db_addr_t) &(fp->f_arg[narg]); - *arg = argp; - return TRUE; - } - } -#endif - return FALSE; -} - -extern int TRAP_TYPES; -/* - * Figure out the next 
frame up in the call stack. - * For trap(), we print the address of the faulting instruction and - * proceed with the calling frame. We return the ip that faulted. - * If the trap was caused by jumping through a bogus pointer, then - * the next line in the backtrace will list some random function as - * being called. It should get the argument list correct, though. - * It might be possible to dig out from the next frame up the name - * of the function that faulted, but that could get hairy. - */ -void -db_nextframe( - struct db_ppc_frame **lfp, /* in/out */ - struct db_ppc_frame **fp, /* in/out */ - db_addr_t *ip, /* out */ - int frame_type, /* in */ - thread_act_t thr_act, - db_addr_t linkpc) /* in */ -{ - struct savearea *saved_regs; - - task_t task = (thr_act != THR_ACT_NULL)? thr_act->task: TASK_NULL; - - switch(frame_type) { - case TRAP: - db_printf(">>>>> trap <<<<<\n"); - goto miss_frame; - break; - case INTERRUPT: - if (*lfp == 0) { - db_printf(">>>>> interrupt <<<<<\n"); - goto miss_frame; - } - db_printf(">>>>> interrupt <<<<<\n"); - goto miss_frame; - break; - case SYSCALL: - if (thr_act != THR_ACT_NULL && thr_act->machine.pcb) { - *ip = (db_addr_t) thr_act->machine.pcb->save_srr0; - *fp = (struct db_ppc_frame *) (thr_act->machine.pcb->save_r1); - break; - } - /* falling down for unknown case */ - default: -miss_frame: - if(!pmap_find_phys(kernel_pmap, (addr64_t)*fp)) { /* Check if this is valid */ - db_printf("Frame not mapped %08X\n",*fp); /* Say not found */ - *fp = 0; /* Show not found */ - break; /* Out of here */ - } - - if ((*fp)->f_frame) - *ip = (db_addr_t) - db_get_task_value((int)&(*fp)->f_frame->f_retaddr, - 4, FALSE, task); - else - *ip = (db_addr_t) - db_get_task_value((int)&(*fp)->f_retaddr, - 4, FALSE, task); - - *lfp = *fp; - *fp = (struct db_ppc_frame *) - db_get_task_value((int)&(*fp)->f_frame, 4, FALSE, task); - break; - } -} - -void -db_stack_trace_cmd( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char *modif) -{ - 
struct db_ppc_frame *frame, *lastframe; - db_addr_t callpc, linkpc, lastcallpc; - int frame_type; - boolean_t kernel_only = TRUE; - boolean_t trace_thread = FALSE; - boolean_t trace_all_threads = FALSE; - int thcount = 0; - char *filename; - int linenum; - task_t task; - thread_act_t th, top_act; - int user_frame; - int frame_count; - jmp_buf_t *prev; - jmp_buf_t db_jmp_buf; - queue_entry_t act_list; - - if (!db_trace_symbols_found) - db_find_trace_symbols(); - { - char *cp = modif; - char c; - - while ((c = *cp++) != 0) { - if (c == 't') - trace_thread = TRUE; - if (c == 'T') { - trace_all_threads = TRUE; - trace_thread = TRUE; - } - if (c == 'u') - kernel_only = FALSE; - } - } - - if (trace_all_threads) { - if (!have_addr && !trace_thread) { - have_addr = TRUE; - trace_thread = TRUE; - act_list = &(current_task()->threads); - addr = (db_expr_t) queue_first(act_list); - } - else if (trace_thread) { - if (have_addr) { - if (!db_check_act_address_valid((thread_act_t)addr)) { - if (db_lookup_task((task_t)addr) == -1) - return; - act_list = &(((task_t)addr)->threads); - addr = (db_expr_t) queue_first(act_list); - } - else { - act_list = &(((thread_act_t)addr)->task->threads); - thcount = db_lookup_task_act(((thread_act_t)addr)->task, - (thread_act_t)addr); - } - } - else { - th = db_default_act; - if (th == THR_ACT_NULL) - th = current_thread(); - if (th == THR_ACT_NULL) { - db_printf("no active thr_act\n"); - return; - } - have_addr = TRUE; - act_list = &th->task->threads; - addr = (db_expr_t) queue_first(act_list); - } - } - } - - if (count == -1) - count = 65535; - -next_thread: - top_act = THR_ACT_NULL; - - user_frame = 0; - frame_count = count; - - if (!have_addr && !trace_thread) { - frame = (struct db_ppc_frame *)(ddb_regs.save_r1); - callpc = (db_addr_t)ddb_regs.save_srr0; - linkpc = (db_addr_t)ddb_regs.save_lr; - th = current_thread(); - task = (th != THR_ACT_NULL)? 
th->task: TASK_NULL; - } - else if (trace_thread) { - if (have_addr) { - th = (thread_act_t) addr; - if (!db_check_act_address_valid(th)) - return; - } - else { - th = db_default_act; - if (th == THR_ACT_NULL) - th = current_thread(); - if (th == THR_ACT_NULL) { - db_printf("no active thread\n"); - return; - } - } - if (trace_all_threads) - db_printf("---------- Thread 0x%x (#%d of %d) ----------\n", - addr, thcount, th->task->thread_count); - -next_activation: - user_frame = 0; - - task = th->task; - if (th == current_thread()) { - frame = (struct db_ppc_frame *)(ddb_regs.save_r1); - callpc = (db_addr_t)ddb_regs.save_srr0; - linkpc = (db_addr_t)ddb_regs.save_lr; - } - else { - if (th->machine.pcb == 0) { - db_printf("thread has no pcb\n"); - goto thread_done; - } - if (th->kernel_stack == 0) { - struct savearea *pss = th->machine.pcb; - - db_printf("Continuation "); - db_task_printsym((db_expr_t)th->continuation, - DB_STGY_PROC, task); - db_printf("\n"); - frame = (struct db_ppc_frame *) (pss->save_r1); - callpc = (db_addr_t) (pss->save_srr0); - linkpc = (db_addr_t) (pss->save_lr); - } - else { - int cpu; - - for (cpu = 0; cpu < real_ncpus; cpu++) { - if (cpu_to_processor(cpu)->state == PROCESSOR_RUNNING && - cpu_to_processor(cpu)->active_thread == th && - PerProcTable[cpu].ppe_vaddr->db_saved_state) { - break; - } - } - if (top_act != THR_ACT_NULL) { - /* - * Trying to get the backtrace of an activation - * which is not the top_most one in the RPC chain: - * use the activation's pcb. 
- */ - struct savearea *pss; - - pss = th->machine.pcb; - frame = (struct db_ppc_frame *) (pss->save_r1); - callpc = (db_addr_t) (pss->save_srr0); - linkpc = (db_addr_t) (pss->save_lr); - } else { - if (cpu == real_ncpus) { - struct savearea *iks; - int r; - - iks = th->machine.pcb; - prev = db_recover; - if ((r = _setjmp(db_recover = &db_jmp_buf)) == 0) { - frame = (struct db_ppc_frame *) (iks->save_r1); - callpc = (db_addr_t) (iks->save_lr); - linkpc = 0; - } else { - /* - * The kernel stack has probably been - * paged out (swapped out activation). - */ - db_recover = prev; - if (r == 2) /* 'q' from db_more() */ - db_error(0); - db_printf("\n", - iks); - goto next_act; - } - db_recover = prev; - } else { - db_printf(">>>>> active on cpu %d <<<<<\n", - cpu); - frame = (struct db_ppc_frame *) - (PerProcTable[cpu].ppe_vaddr->db_saved_state->save_r1); - callpc = (db_addr_t) PerProcTable[cpu].ppe_vaddr->db_saved_state->save_srr0; - linkpc = (db_addr_t) PerProcTable[cpu].ppe_vaddr->db_saved_state->save_lr; - } - } - } - } - } else { - frame = (struct db_ppc_frame *)addr; - th = (db_default_act)? db_default_act: current_thread(); - task = (th != THR_ACT_NULL)? th->task: TASK_NULL; - if (frame->f_frame) { - callpc = (db_addr_t)db_get_task_value - ((int)&frame->f_frame->f_retaddr, - 4, FALSE, (user_frame) ? task : 0); - callpc = callpc-sizeof(callpc); - } else - callpc =0; - linkpc = 0; - } - - if (!INKERNELSTACK((unsigned)frame, th)) { - db_printf(">>>>> user space <<<<<\n"); - if (kernel_only) - goto thread_done; - user_frame++; - } - - lastframe = 0; - lastcallpc = (db_addr_t) 0; - while (frame_count-- && frame != 0) { - int narg = DB_NUMARGS_MAX; - int arg; - char * name; - db_expr_t offset; - db_addr_t call_func = 0; - int r; - db_addr_t off; - - db_symbol_values(NULL, - db_search_task_symbol_and_line( - callpc, DB_STGY_XTRN, &offset, &filename, - &linenum, (user_frame) ? 
task : 0, &narg), - &name, (db_expr_t *)&call_func); - if ( name == NULL) { - db_find_task_sym_and_offset(callpc, - &name, &off, (user_frame) ? task : 0); - offset = (db_expr_t) off; - } - - if (user_frame == 0) { - if (call_func && - (call_func == db_user_trap_symbol_value || - call_func == db_kernel_trap_symbol_value)) { - frame_type = TRAP; - narg = 1; - } else if (call_func && - call_func == db_interrupt_symbol_value) { - frame_type = INTERRUPT; - goto next_frame; - } else if (call_func && - call_func == db_syscall_symbol_value) { - frame_type = SYSCALL; - goto next_frame; - } else { - frame_type = 0; - prev = db_recover; - if ((r = _setjmp(db_recover = &db_jmp_buf)) - == 0) { - if (narg < 0) - narg = db_numargs(frame, - (user_frame) ? task : 0); - db_recover = prev; - } else { - db_recover = prev; - goto next_act; - } - } - } else { - frame_type = 0; - prev = db_recover; - if ((r = _setjmp(db_recover = &db_jmp_buf)) == 0) { - if (narg < 0) - narg = db_numargs(frame, - (user_frame) ? task : 0); - db_recover = prev; - } else { - db_recover = prev; - goto next_act; - } - } - - if (name == 0 || offset > db_maxoff) { - db_printf("[%08X]0x%08X(", frame, callpc); - } else { - db_printf("[%08X]%s", frame, name); - if (offset) - db_printf("+%llx", offset); - db_printf("("); - }; - - narg = db_numargs(frame, (user_frame) ? task : 0); - - for (arg = 0; arg < narg; arg++) { - db_addr_t argp; - int value; - boolean_t found; - - prev = db_recover; - if ((r = _setjmp(db_recover = &db_jmp_buf)) == 0) { - found = FALSE; - if (lastframe) - found = db_find_arg(frame, lastframe->f_retaddr, - (user_frame) ? task : 0, arg, &argp); - if (found) - value = db_get_task_value(argp, 4, FALSE, - (user_frame) ? task : 0); - } else { - db_recover = prev; - if (r == 2) /* 'q' from db_more() */ - db_error(0); - db_printf("... 
)"); - db_printf("\n"); - goto next_act; - } - db_recover = prev; - if (found) - db_printf("%08X", value); - else - db_printf("??"); - argp = argp + sizeof(argp); - if (arg < narg-1) - db_printf(","); - } - if (arg != narg) - db_printf("..."); - db_printf(")"); - db_printf("\n"); - -next_frame: - lastcallpc = callpc; - prev = db_recover; - if ((r = _setjmp(db_recover = &db_jmp_buf)) == 0) { - db_nextframe(&lastframe, &frame, &callpc, frame_type, - (user_frame) ? th : THR_ACT_NULL, linkpc); - callpc = callpc-sizeof(callpc); - db_recover = prev; - } else { - db_recover = prev; - frame = 0; - } - linkpc = 0; - - if (frame == 0) { -next_act: - /* end of chain */ - break; - } - if (!INKERNELSTACK(lastframe, th) || - !INKERNELSTACK((unsigned)frame, th)) - user_frame++; - if (user_frame == 1) { - db_printf(">>>>> user space <<<<<\n"); - if (kernel_only) - break; - } - - if (frame <= lastframe) { - if ((INKERNELSTACK(lastframe, th) && !INKERNELSTACK(frame, th))) - continue; - db_printf("Bad frame pointer: 0x%x\n", frame); - break; - } - } - -thread_done: - if (trace_all_threads) { - if (top_act != THR_ACT_NULL) - th = top_act; - th = (thread_act_t) queue_next(&th->task_threads); - if (! queue_end(act_list, (queue_entry_t) th)) { - db_printf("\n"); - addr = (db_expr_t) th; - thcount++; - goto next_thread; - } - } -} diff --git a/osfmk/ppc/endian.h b/osfmk/ppc/endian.h deleted file mode 100644 index 397b09de8..000000000 --- a/osfmk/ppc/endian.h +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - * - */ - -#ifndef _MACHINE_ENDIAN_H_ -#define _MACHINE_ENDIAN_H_ - -/* - * Definitions for byte order, - * according to byte significance from low address to high. - */ -#define LITTLE_ENDIAN 1234 /* least-significant byte first (vax) */ -#define BIG_ENDIAN 4321 /* most-significant byte first (IBM, net) */ -#define PDP_ENDIAN 3412 /* LSB first in word, MSW first in long (pdp) */ - -#ifdef __BIG_ENDIAN__ /* Predefined by compiler */ -#define BYTE_ORDER BIG_ENDIAN /* byte order we use on ppc */ -#define ENDIAN BIG -#else -#error code has not been ported to little endian targets yet -#endif - -/* - * Macros for network/external number representation conversion. 
- */ -#if BYTE_ORDER == BIG_ENDIAN && !defined(lint) -#define ntohl(x) (x) -#define ntohs(x) (x) -#define htonl(x) (x) -#define htons(x) (x) - -static __inline__ unsigned int byte_reverse_word(unsigned int word); -static __inline__ unsigned int byte_reverse_word(unsigned int word) { - unsigned int result; - __asm__ volatile("lwbrx %0, 0, %1" : "=r" (result) : "r" (&word)); - return result; -} - -/* The above function is commutative, so we can use it for - * translations in both directions (to/from little endianness) - * Note that htolx and ltohx are probably identical, they are - * included for completeness. - */ -#define htoll(x) byte_reverse_word(x) -#define htols(x) (byte_reverse_word(x) >> 16) -#define ltohl(x) htoll(x) -#define ltohs(x) htols(x) - -#define htobl(x) (x) -#define htobs(x) (x) -#define btohl(x) (x) -#define btohs(x) (x) - -#else -unsigned short ntohs(), htons(); -unsigned long ntohl(), htonl(); -#endif - -/* This defines the order of elements in a bitfield, - * it is principally used by the SCSI subsystem in - * the definitions of mapped registers - */ -#define BYTE_MSF 1 - -#endif /* _MACHINE_ENDIAN_H_ */ diff --git a/osfmk/ppc/etimer.c b/osfmk/ppc/etimer.c deleted file mode 100644 index dca034b91..000000000 --- a/osfmk/ppc/etimer.c +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * @APPLE_FREE_COPYRIGHT@ - */ -/* - * File: etimer.c - * Purpose: Routines for handling the machine independent - * event timer. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include - -/* - * Event timer interrupt. - * - * XXX a drawback of this implementation is that events serviced earlier must not set deadlines - * that occur before the entire chain completes. - * - * XXX a better implementation would use a set of generic callouts and iterate over them - */ -void -etimer_intr( -__unused int inuser, -__unused uint64_t iaddr) -{ - uint64_t abstime; - rtclock_timer_t *mytimer; - struct per_proc_info *pp; - - pp = getPerProc(); - - mytimer = &pp->rtclock_timer; /* Point to the event timer */ - - abstime = mach_absolute_time(); /* Get the time now */ - - /* is it time for power management state change? 
*/ - if (pp->pms.pmsPop <= abstime) { - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_EXCP_DECI, 3) | DBG_FUNC_START, 0, 0, 0, 0, 0); - pmsStep(1); /* Yes, advance step */ - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_EXCP_DECI, 3) | DBG_FUNC_END, 0, 0, 0, 0, 0); - - abstime = mach_absolute_time(); /* Get the time again since we ran a bit */ - } - - /* has a pending clock timer expired? */ - if (mytimer->deadline <= abstime) { /* Have we expired the deadline? */ - mytimer->has_expired = TRUE; /* Remember that we popped */ - mytimer->deadline = timer_queue_expire(&mytimer->queue, abstime); - mytimer->has_expired = FALSE; - } - - /* schedule our next deadline */ - pp->rtcPop = EndOfAllTime; /* any real deadline will be earlier */ - etimer_resync_deadlines(); -} - -/* - * Set the clock deadline. - */ -void etimer_set_deadline(uint64_t deadline) -{ - rtclock_timer_t *mytimer; - spl_t s; - struct per_proc_info *pp; - - s = splclock(); /* no interruptions */ - pp = getPerProc(); - - mytimer = &pp->rtclock_timer; /* Point to the timer itself */ - mytimer->deadline = deadline; /* Set the new expiration time */ - - etimer_resync_deadlines(); - - splx(s); -} - - -/* - * Re-evaluate the outstanding deadlines and select the most proximate. - * - * Should be called at splclock. - */ -void -etimer_resync_deadlines(void) -{ - uint64_t deadline; - rtclock_timer_t *mytimer; - spl_t s = splclock(); /* No interruptions please */ - struct per_proc_info *pp; - - pp = getPerProc(); - - deadline = ~0ULL; - - /* if we have a clock timer set sooner, pop on that */ - mytimer = &pp->rtclock_timer; /* Point to the timer itself */ - if (!mytimer->has_expired && mytimer->deadline > 0) - deadline = mytimer->deadline; - - /* if we have a power management event coming up, how about that? 
*/ - if (pp->pms.pmsPop > 0 && pp->pms.pmsPop < deadline) - deadline = pp->pms.pmsPop; - - - if (deadline > 0 && deadline <= pp->rtcPop) { - int decr; - uint64_t now; - - now = mach_absolute_time(); - decr = setPop(deadline); - - if (deadline < now) - pp->rtcPop = now + decr; - else - pp->rtcPop = deadline; - - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_EXCP_DECI, 1) | DBG_FUNC_NONE, decr, 2, 0, 0, 0); - } - splx(s); -} - -queue_t -timer_queue_assign( - uint64_t deadline) -{ - struct per_proc_info *pp = getPerProc(); - rtclock_timer_t *timer; - - if (pp->running) { - timer = &pp->rtclock_timer; - - if (deadline < timer->deadline) - etimer_set_deadline(deadline); - } - else - timer = &PerProcTable[master_cpu].ppe_vaddr->rtclock_timer; - - return (&timer->queue); -} - -void -timer_queue_cancel( - queue_t queue, - uint64_t deadline, - uint64_t new_deadline) -{ - if (queue == &getPerProc()->rtclock_timer.queue) { - if (deadline < new_deadline) - etimer_set_deadline(new_deadline); - } -} diff --git a/osfmk/ppc/exception.h b/osfmk/ppc/exception.h deleted file mode 100644 index 394b884e4..000000000 --- a/osfmk/ppc/exception.h +++ /dev/null @@ -1,693 +0,0 @@ -/* - * Copyright (c) 2000-2008 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. 
- * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -/* Miscellaneous constants and structures used by the exception - * handlers - */ - -#ifndef _PPC_EXCEPTION_H_ -#define _PPC_EXCEPTION_H_ - -#include - -#ifndef ASSEMBLER - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* Per processor CPU features */ -#pragma pack(4) /* Make sure the structure stays as we defined it */ -struct procFeatures { - unsigned int Available; /* 0x000 */ -#define pfFloat 0x80000000 -#define pfFloatb 0 -#define pfAltivec 0x40000000 -#define pfAltivecb 1 -#define pfAvJava 0x20000000 -#define pfAvJavab 2 -#define pfSMPcap 0x10000000 -#define pfSMPcapb 3 -#define pfCanSleep 0x08000000 -#define pfCanSleepb 4 -#define pfCanNap 0x04000000 -#define pfCanNapb 5 -#define pfCanDoze 0x02000000 -#define pfCanDozeb 6 -#define pfSlowNap 0x00400000 -#define pfSlowNapb 9 -#define pfNoMuMMCK 0x00200000 -#define pfNoMuMMCKb 10 -#define pfNoL2PFNap 0x00100000 -#define pfNoL2PFNapb 11 -#define pfSCOMFixUp 0x00080000 -#define pfSCOMFixUpb 12 -#define pfHasDcba 0x00040000 -#define pfHasDcbab 13 -#define pfL1fa 0x00010000 -#define pfL1fab 15 -#define pfL2 0x00008000 -#define pfL2b 16 -#define pfL2fa 0x00004000 -#define pfL2fab 17 -#define pfL2i 0x00002000 -#define pfL2ib 18 -#define pfLClck 0x00001000 -#define pfLClckb 19 
-#define pfWillNap 0x00000800 -#define pfWillNapb 20 -#define pfNoMSRir 0x00000400 -#define pfNoMSRirb 21 -#define pfL3pdet 0x00000200 -#define pfL3pdetb 22 -#define pf128Byte 0x00000080 -#define pf128Byteb 24 -#define pf32Byte 0x00000020 -#define pf32Byteb 26 -#define pf64Bit 0x00000010 -#define pf64Bitb 27 -#define pfL3 0x00000004 -#define pfL3b 29 -#define pfL3fa 0x00000002 -#define pfL3fab 30 -#define pfValid 0x00000001 -#define pfValidb 31 - unsigned short rptdProc; /* 0x004 */ - unsigned short lineSize; /* 0x006 */ - unsigned int l1iSize; /* 0x008 */ - unsigned int l1dSize; /* 0x00C */ - unsigned int l2cr; /* 0x010 */ - unsigned int l2Size; /* 0x014 */ - unsigned int l3cr; /* 0x018 */ - unsigned int l3Size; /* 0x01C */ - unsigned int pfMSSCR0; /* 0x020 */ - unsigned int pfMSSCR1; /* 0x024 */ - unsigned int pfICTRL; /* 0x028 */ - unsigned int pfLDSTCR; /* 0x02C */ - unsigned int pfLDSTDB; /* 0x030 */ - unsigned int pfMaxVAddr; /* 0x034 */ - unsigned int pfMaxPAddr; /* 0x038 */ - unsigned int pfPTEG; /* 0x03C */ - uint64_t pfHID0; /* 0x040 */ - uint64_t pfHID1; /* 0x048 */ - uint64_t pfHID2; /* 0x050 */ - uint64_t pfHID3; /* 0x058 */ - uint64_t pfHID4; /* 0x060 */ - uint64_t pfHID5; /* 0x068 */ - unsigned int l2crOriginal; /* 0x070 */ - unsigned int l3crOriginal; /* 0x074 */ - unsigned int pfBootConfig; /* 0x078 */ - unsigned int pfPowerModes; /* 0x07C */ -#define pmDPLLVmin 0x00010000 -#define pmDPLLVminb 15 -#define pmType 0x000000FF -#define pmPowerTune 0x00000003 -#define pmDFS 0x00000002 -#define pmDualPLL 0x00000001 - unsigned int pfPowerTune0; /* 0x080 */ - unsigned int pfPowerTune1; /* 0x084 */ - unsigned int rsrvd88[6]; /* 0x088 */ -}; -#pragma pack() - -typedef struct procFeatures procFeatures; - - -/* - * - * Various performance counters - */ -#pragma pack(4) /* Make sure the structure stays as we defined it */ -struct hwCtrs { - - unsigned int hwInVains; /* In vain */ - unsigned int hwResets; /* Reset */ - unsigned int hwMachineChecks; /* Machine 
check */ - unsigned int hwDSIs; /* DSIs */ - unsigned int hwISIs; /* ISIs */ - unsigned int hwExternals; /* Externals */ - unsigned int hwAlignments; /* Alignment */ - unsigned int hwPrograms; /* Program */ - unsigned int hwFloatPointUnavailable; /* Floating point */ - unsigned int hwDecrementers; /* Decrementer */ - unsigned int hwIOErrors; /* I/O error */ - unsigned int hwrsvd0; /* Reserved */ - unsigned int hwSystemCalls; /* System call */ - unsigned int hwTraces; /* Trace */ - unsigned int hwFloatingPointAssists; /* Floating point assist */ - unsigned int hwPerformanceMonitors; /* Performance monitor */ - unsigned int hwAltivecs; /* VMX */ - unsigned int hwrsvd1; /* Reserved */ - unsigned int hwrsvd2; /* Reserved */ - unsigned int hwrsvd3; /* Reserved */ - unsigned int hwInstBreakpoints; /* Instruction breakpoint */ - unsigned int hwSystemManagements; /* System management */ - unsigned int hwAltivecAssists; /* Altivec Assist */ - unsigned int hwThermal; /* Thermals */ - unsigned int hwrsvd5; /* Reserved */ - unsigned int hwrsvd6; /* Reserved */ - unsigned int hwrsvd7; /* Reserved */ - unsigned int hwrsvd8; /* Reserved */ - unsigned int hwrsvd9; /* Reserved */ - unsigned int hwrsvd10; /* Reserved */ - unsigned int hwrsvd11; /* Reserved */ - unsigned int hwrsvd12; /* Reserved */ - unsigned int hwrsvd13; /* Reserved */ - unsigned int hwTrace601; /* Trace */ - unsigned int hwSIGPs; /* SIGP */ - unsigned int hwPreemptions; /* Preemption */ - unsigned int hwContextSwitchs; /* Context switch */ - unsigned int hwShutdowns; /* Shutdowns */ - unsigned int hwChokes; /* System ABENDs */ - unsigned int hwDataSegments; /* Data Segment Interruptions */ - unsigned int hwInstructionSegments; /* Instruction Segment Interruptions */ - unsigned int hwSoftPatches; /* Soft Patch interruptions */ - unsigned int hwMaintenances; /* Maintenance interruptions */ - unsigned int hwInstrumentations; /* Instrumentation interruptions */ - unsigned int hwrsvd14; /* Reserved */ - unsigned int 
hwhdec; /* 0B4 Hypervisor decrementer */ - - unsigned int hwspare0[11]; /* 0B8 Reserved */ - unsigned int hwspare0a; /* 0E4 Reserved */ - unsigned int hwspare0b; /* 0E8 Reserved */ - unsigned int hwspare0c; /* 0EC Reserved */ - unsigned int hwspare0d; /* 0F0 Reserved */ - unsigned int hwIgnored; /* 0F4 Interruptions ignored */ - unsigned int hwRedrives; /* 0F8 Number of redriven interrupts */ - unsigned int hwSteals; /* 0FC Steals */ -/* 100 */ - - unsigned int hwMckHang; /* ? */ - unsigned int hwMckSLBPE; /* ? */ - unsigned int hwMckTLBPE; /* ? */ - unsigned int hwMckERCPE; /* ? */ - unsigned int hwMckL1DPE; /* ? */ - unsigned int hwMckL1TPE; /* ? */ - unsigned int hwMckUE; /* ? */ - unsigned int hwMckIUE; /* ? */ - unsigned int hwMckIUEr; /* ? */ - unsigned int hwMckDUE; /* ? */ - unsigned int hwMckDTW; /* ? */ - unsigned int hwMckUnk; /* ? */ - unsigned int hwMckExt; /* ? */ - unsigned int hwMckICachePE; /* ? */ - unsigned int hwMckITagPE; /* ? */ - unsigned int hwMckIEratPE; /* ? */ - unsigned int hwMckDEratPE; /* ? 
*/ - unsigned int hwspare2[15]; /* Pad to next 128 bndry */ -/* 0x180 */ - - unsigned int napStamp[2]; /* Time base when we napped */ - unsigned int napTotal[2]; /* Total nap time in ticks */ - unsigned int numSIGPast; /* Number of SIGP asts recieved */ - unsigned int numSIGPcpureq; /* Number of SIGP cpu requests recieved */ - unsigned int numSIGPdebug; /* Number of SIGP debugs recieved */ - unsigned int numSIGPwake; /* Number of SIGP wakes recieved */ - unsigned int numSIGPtimo; /* Number of SIGP send timeouts */ - unsigned int numSIGPmast; /* Number of SIGPast messages merged */ - unsigned int numSIGPmwake; /* Number of SIGPwake messages merged */ - - unsigned int hwWalkPhys; /* Number of entries to hw_walk_phys */ - unsigned int hwWalkFull; /* Full purge of connected PTE's */ - unsigned int hwWalkMerge; /* RC merge of connected PTE's */ - unsigned int hwWalkQuick; /* Quick scan of connected PTE's */ - unsigned int numSIGPcall; /* Number of SIGPcall messages received */ - - unsigned int hwspare3[16]; /* Pad to 512 */ - -}; -#pragma pack() - -typedef struct hwCtrs hwCtrs; - -struct patch_entry { - unsigned int *addr; - unsigned int data; - unsigned int type; - unsigned int value; -}; - -typedef struct patch_entry patch_entry_t; - -#define PATCH_INVALID 0 -#define PATCH_PROCESSOR 1 -#define PATCH_FEATURE 2 -#define PATCH_END_OF_TABLE 3 - -#define PatchExt32 0x80000000 -#define PatchExt32b 0 -#define PatchLwsync 0x40000000 -#define PatchLwsyncb 1 - -/* When an exception is taken, this info is accessed via sprg0 */ -/* We should always have this one on a cache line boundary */ - -#pragma pack(4) /* Make sure the structure stays as we defined it */ -struct per_proc_info { - unsigned short cpu_number; - unsigned short cpu_flags; /* Various low-level flags */ - vm_offset_t istackptr; - vm_offset_t intstack_top_ss; - - vm_offset_t debstackptr; - vm_offset_t debstack_top_ss; - - unsigned int spcFlags; /* Special thread flags */ - unsigned int old_thread; - ast_t 
pending_ast; /* mask of pending ast(s) */ - - /* PPC cache line boundary here - 020 */ - - int cpu_type; - int cpu_subtype; - int cpu_threadtype; -/* - * Note: the following two pairs of words need to stay in order and each pair must - * be in the same reservation (line) granule - */ - struct facility_context *FPU_owner; /* Owner of the FPU on this cpu */ - unsigned int liveVRSave; /* VRSave assiciated with live vector registers */ - struct facility_context *VMX_owner; /* Owner of the VMX on this cpu */ - unsigned int spcTRc; /* Special trace count */ - unsigned int spcTRp; /* Special trace buffer pointer */ - - /* PPC cache line boundary here - 040 */ - addr64_t quickfret; /* List of saveareas to release */ - addr64_t lclfree; /* Pointer to local savearea list */ - unsigned int lclfreecnt; /* Entries in local savearea list */ - unsigned int holdQFret; /* Hold off releasing quickfret list */ - uint64_t rtcPop; /* Real Time Clock pop */ - - /* PPC cache line boundary here - 060 */ - boolean_t interrupts_enabled; - IOInterruptHandler interrupt_handler; - void * interrupt_nub; - unsigned int interrupt_source; - void * interrupt_target; - void * interrupt_refCon; - uint64_t next_savearea; /* pointer to the next savearea */ - - /* PPC cache line boundary here - 080 */ - unsigned int MPsigpStat; /* Signal Processor status (interlocked update for this one) */ -#define MPsigpMsgp 0xC0000000 /* Message pending (busy + pass ) */ -#define MPsigpBusy 0x80000000 /* Processor area busy, i.e., locked */ -#define MPsigpPass 0x40000000 /* Busy lock passed to receiving processor */ -#define MPsigpAck 0x20000000 /* Ack Busy lock passed to receiving processor */ -#define MPsigpSrc 0x000000FF /* Processor that owns busy, i.e., the ID of */ - /* whomever set busy. When a busy is passed, */ - /* this is the requestor of the function. 
*/ -#define MPsigpFunc 0x0000FF00 /* Current function */ -#define MPsigpIdle 0x00 /* No function pending */ -#define MPsigpSigp 0x04 /* Signal a processor */ - unsigned int MPsigpParm0; /* SIGP parm 0 */ - unsigned int MPsigpParm1; /* SIGP parm 1 */ - unsigned int MPsigpParm2; /* SIGP parm 2 */ - cpu_id_t cpu_id; - vm_offset_t start_paddr; - unsigned int ruptStamp[2]; /* Timebase at last interruption */ - - /* PPC cache line boundary here - 0A0 */ - procFeatures pf; /* Processor features */ - - /* PPC cache line boundary here - 140 */ - void * pp_cbfr; - void * pp_chud; - rtclock_timer_t rtclock_timer; - unsigned int ppbbTaskEnv; /* BlueBox Task Environment */ - - /* PPC cache line boundary here - 160 */ - struct savearea * db_saved_state; - time_base_enable_t time_base_enable; - uint32_t ppXFlags; - int running; - int debugger_is_slave; - int debugger_active; - int debugger_pending; - uint32_t debugger_holdoff; - - /* PPC cache line boundary here - 180 */ - uint64_t Uassist; /* User Assist DoubleWord */ - uint64_t validSegs; /* Valid SR/STB slots */ - addr64_t ppUserPmap; /* Current user state pmap (physical address) */ - unsigned int ppUserPmapVirt; /* Current user state pmap (virtual address) */ - unsigned int ppMapFlags; /* Mapping flags */ - - /* PPC cache line boundary here - 1A0 */ - unsigned short ppInvSeg; /* Forces complete invalidate of SRs/SLB (this must stay with ppInvSeg) */ - unsigned short ppCurSeg; /* Set to 1 if user segments, 0 if kernel (this must stay with ppInvSeg) */ - unsigned int ppSegSteal; /* Count of segment slot steals */ - ppnum_t VMMareaPhys; /* vmm state page physical addr */ - unsigned int VMMXAFlgs; /* vmm extended flags */ - unsigned int FAMintercept; /* vmm FAM Exceptions to intercept */ - unsigned int hibernate; /* wake from hibernate */ - uint32_t save_tbl; - uint32_t save_tbu; - - /* PPC cache line boundary here - 1C0 */ - unsigned int ppUMWmp[16]; /* Linkage mapping for user memory window - 64 bytes */ - - /* PPC cache line 
boundary here - 200 */ - uint64_t tempr0; /* temporary savearea */ - uint64_t tempr1; - uint64_t tempr2; - uint64_t tempr3; - - uint64_t tempr4; - uint64_t tempr5; - uint64_t tempr6; - uint64_t tempr7; - - uint64_t tempr8; - uint64_t tempr9; - uint64_t tempr10; - uint64_t tempr11; - - uint64_t tempr12; - uint64_t tempr13; - uint64_t tempr14; - uint64_t tempr15; - - uint64_t tempr16; - uint64_t tempr17; - uint64_t tempr18; - uint64_t tempr19; - - uint64_t tempr20; - uint64_t tempr21; - uint64_t tempr22; - uint64_t tempr23; - - uint64_t tempr24; - uint64_t tempr25; - uint64_t tempr26; - uint64_t tempr27; - - uint64_t tempr28; - uint64_t tempr29; - uint64_t tempr30; - uint64_t tempr31; - - - /* PPC cache line boundary here - 300 */ - double emfp0; /* Copies of floating point registers */ - double emfp1; /* Used for emulation purposes */ - double emfp2; - double emfp3; - - double emfp4; - double emfp5; - double emfp6; - double emfp7; - - double emfp8; - double emfp9; - double emfp10; - double emfp11; - - double emfp12; - double emfp13; - double emfp14; - double emfp15; - - double emfp16; - double emfp17; - double emfp18; - double emfp19; - - double emfp20; - double emfp21; - double emfp22; - double emfp23; - - double emfp24; - double emfp25; - double emfp26; - double emfp27; - - double emfp28; - double emfp29; - double emfp30; - double emfp31; - -/* - 400 */ - unsigned int emfpscr_pad; - unsigned int emfpscr; - unsigned int empadfp[6]; - -/* - 420 */ - unsigned int emvr0[4]; /* Copies of vector registers used both */ - unsigned int emvr1[4]; /* for full vector emulation or */ - unsigned int emvr2[4]; /* as saveareas while assisting denorms */ - unsigned int emvr3[4]; - unsigned int emvr4[4]; - unsigned int emvr5[4]; - unsigned int emvr6[4]; - unsigned int emvr7[4]; - unsigned int emvr8[4]; - unsigned int emvr9[4]; - unsigned int emvr10[4]; - unsigned int emvr11[4]; - unsigned int emvr12[4]; - unsigned int emvr13[4]; - unsigned int emvr14[4]; - unsigned int emvr15[4]; - 
unsigned int emvr16[4]; - unsigned int emvr17[4]; - unsigned int emvr18[4]; - unsigned int emvr19[4]; - unsigned int emvr20[4]; - unsigned int emvr21[4]; - unsigned int emvr22[4]; - unsigned int emvr23[4]; - unsigned int emvr24[4]; - unsigned int emvr25[4]; - unsigned int emvr26[4]; - unsigned int emvr27[4]; - unsigned int emvr28[4]; - unsigned int emvr29[4]; - unsigned int emvr30[4]; - unsigned int emvr31[4]; - unsigned int emvscr[4]; - unsigned int empadvr[4]; -/* - 640 */ -/* note implicit dependence on kSkipListMaxLists, which must be <= 28 */ - addr64_t skipListPrev[28]; /* prev ptrs saved as side effect of calling mapSearchFull() */ - -/* - 720 */ - - unsigned int patcharea[56]; -/* - 800 */ - - hwCtrs hwCtr; /* Hardware exception counters */ -/* - A00 */ - addr64_t pp2ndPage; /* Physical address of the second page of the per_proc */ - addr64_t ijsave; /* Pointer to original savearea for injected code */ - uint32_t pprsvd0A10[4]; -/* - A20 */ - pmsd pms; /* Power Management Stepper control */ - unsigned int pprsvd0A40[16]; /* Reserved */ -/* - A80 */ - uint32_t pprsvd0A80[16]; /* Reserved */ - - unsigned int pprsvd0AC0[336]; /* Reserved out to next page boundary */ -/* - 1000 */ - -/* - * This is the start of the second page of the per_proc block. Because we do not - * allocate physically contiguous memory, it may be physically discontiguous from the - * first page. Currently there isn't anything here that is accessed translation off, - * but if we need it, pp2ndPage contains the physical address. - * - * Note that the boot processor's per_proc is statically allocated, so it will be a - * V=R contiguous area. That allows access during early boot before we turn translation on - * for the first time. 
- */ - - unsigned int processor[384]; /* processor structure */ - - unsigned int pprsvd1[640]; /* Reserved out to next page boundary */ -/* - 2000 */ - -}; - -#pragma pack() - - -/* - * Macro to convert a processor_t processor to its attached per_proc_info_t per_proc - */ -#define PROCESSOR_TO_PER_PROC(x) \ - ((struct per_proc_info*)((unsigned int)(x) \ - - (unsigned int)(((struct per_proc_info *)0)->processor))) - -extern struct per_proc_info BootProcInfo; - -#define MAX_CPUS 256 - -struct per_proc_entry { - addr64_t ppe_paddr; /* Physical address of the first page of per_proc, 2nd is in pp2ndPage. */ - unsigned int ppe_pad4[1]; - struct per_proc_info *ppe_vaddr; /* Virtual address of the per_proc */ -}; - -extern struct per_proc_entry PerProcTable[MAX_CPUS-1]; - - -extern const char *trap_type[]; - -#endif /* ndef ASSEMBLER */ /* with this savearea should be redriven */ - -/* cpu_flags defs */ -#define SIGPactive 0x8000 -#define needSRload 0x4000 -#define turnEEon 0x2000 -#define SleepState 0x0800 -#define SleepStateb 4 -#define mcountOff 0x0400 -#define SignalReady 0x0200 -#define BootDone 0x0100 -#define loadMSR 0x7FF4 - -/* ppXFlags defs */ -#define SignalReadyWait 0x00000001 - -#define T_VECTOR_SIZE 4 /* function pointer size */ - -/* Hardware exceptions */ - -#define T_IN_VAIN (0x00 * T_VECTOR_SIZE) -#define T_RESET (0x01 * T_VECTOR_SIZE) -#define T_MACHINE_CHECK (0x02 * T_VECTOR_SIZE) -#define T_DATA_ACCESS (0x03 * T_VECTOR_SIZE) -#define T_INSTRUCTION_ACCESS (0x04 * T_VECTOR_SIZE) -#define T_INTERRUPT (0x05 * T_VECTOR_SIZE) -#define T_ALIGNMENT (0x06 * T_VECTOR_SIZE) -#define T_PROGRAM (0x07 * T_VECTOR_SIZE) -#define T_FP_UNAVAILABLE (0x08 * T_VECTOR_SIZE) -#define T_DECREMENTER (0x09 * T_VECTOR_SIZE) -#define T_IO_ERROR (0x0a * T_VECTOR_SIZE) -#define T_RESERVED (0x0b * T_VECTOR_SIZE) -#define T_SYSTEM_CALL (0x0c * T_VECTOR_SIZE) -#define T_TRACE (0x0d * T_VECTOR_SIZE) -#define T_FP_ASSIST (0x0e * T_VECTOR_SIZE) -#define T_PERF_MON (0x0f * T_VECTOR_SIZE) 
-#define T_VMX (0x10 * T_VECTOR_SIZE) -#define T_INVALID_EXCP0 (0x11 * T_VECTOR_SIZE) -#define T_INVALID_EXCP1 (0x12 * T_VECTOR_SIZE) -#define T_INVALID_EXCP2 (0x13 * T_VECTOR_SIZE) -#define T_INSTRUCTION_BKPT (0x14 * T_VECTOR_SIZE) -#define T_SYSTEM_MANAGEMENT (0x15 * T_VECTOR_SIZE) -#define T_ALTIVEC_ASSIST (0x16 * T_VECTOR_SIZE) -#define T_THERMAL (0x17 * T_VECTOR_SIZE) -#define T_INVALID_EXCP5 (0x18 * T_VECTOR_SIZE) -#define T_INVALID_EXCP6 (0x19 * T_VECTOR_SIZE) -#define T_INVALID_EXCP7 (0x1A * T_VECTOR_SIZE) -#define T_INVALID_EXCP8 (0x1B * T_VECTOR_SIZE) -#define T_INVALID_EXCP9 (0x1C * T_VECTOR_SIZE) -#define T_INVALID_EXCP10 (0x1D * T_VECTOR_SIZE) -#define T_INVALID_EXCP11 (0x1E * T_VECTOR_SIZE) -#define T_INVALID_EXCP12 (0x1F * T_VECTOR_SIZE) -#define T_EMULATE (0x20 * T_VECTOR_SIZE) - -#define T_RUNMODE_TRACE (0x21 * T_VECTOR_SIZE) /* 601 only */ - -#define T_SIGP (0x22 * T_VECTOR_SIZE) -#define T_PREEMPT (0x23 * T_VECTOR_SIZE) -#define T_CSWITCH (0x24 * T_VECTOR_SIZE) -#define T_SHUTDOWN (0x25 * T_VECTOR_SIZE) -#define T_CHOKE (0x26 * T_VECTOR_SIZE) - -#define T_DATA_SEGMENT (0x27 * T_VECTOR_SIZE) -#define T_INSTRUCTION_SEGMENT (0x28 * T_VECTOR_SIZE) - -#define T_SOFT_PATCH (0x29 * T_VECTOR_SIZE) -#define T_MAINTENANCE (0x2A * T_VECTOR_SIZE) -#define T_INSTRUMENTATION (0x2B * T_VECTOR_SIZE) -#define T_ARCHDEP0 (0x2C * T_VECTOR_SIZE) -#define T_HDEC (0x2D * T_VECTOR_SIZE) -#define T_INJECT_EXIT (0x2E * T_VECTOR_SIZE) -#define T_DTRACE_RET T_INJECT_EXIT - -#define T_AST (0x100 * T_VECTOR_SIZE) -#define T_MAX T_CHOKE /* Maximum exception no */ - -#define T_FAM 0x00004000 - -#define EXCEPTION_VECTOR(exception) (exception * 0x100 / T_VECTOR_SIZE ) - -/* - * System choke (failure) codes - */ - -#define failDebug 0 -#define failStack 1 -#define failMapping 2 -#define failContext 3 -#define failNoSavearea 4 -#define failSaveareaCorr 5 -#define failBadLiveContext 6 -#define failSkipLists 7 -#define failUnalignedStk 8 -#define failPmap 9 -#define failTimeout 10 - 
-/* Always must be last - update failNames table in model_dep.c as well */ -#define failUnknown 11 - -#ifndef ASSEMBLER - -#pragma pack(4) /* Make sure the structure stays as we defined it */ -typedef struct resethandler { - unsigned int type; - vm_offset_t call_paddr; - vm_offset_t arg__paddr; -} resethandler_t; -#pragma pack() - -extern resethandler_t ResetHandler; - -#endif - -#define RESET_HANDLER_NULL 0x0 -#define RESET_HANDLER_START 0x1 -#define RESET_HANDLER_BUPOR 0x2 -#define RESET_HANDLER_IGNORE 0x3 - -#endif /* _PPC_EXCEPTION_H_ */ diff --git a/osfmk/ppc/fpu_protos.h b/osfmk/ppc/fpu_protos.h deleted file mode 100644 index 7ceed096a..000000000 --- a/osfmk/ppc/fpu_protos.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - * - */ - -#ifndef _PPC_FPU_PROTOS_H_ -#define _PPC_FPU_PROTOS_H_ - -#include - -extern void fpu_save(struct facility_context *); -extern void fpu_disable(void); - -#endif /* _PPC_FPU_PROTOS_H_ */ diff --git a/osfmk/ppc/genassym.c b/osfmk/ppc/genassym.c deleted file mode 100644 index 8207aeb55..000000000 --- a/osfmk/ppc/genassym.c +++ /dev/null @@ -1,1438 +0,0 @@ -/* - * Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - * - */ - -/* - * genassym.c is used to produce an - * assembly file which, intermingled with unuseful assembly code, - * has all the necessary definitions emitted. This assembly file is - * then postprocessed with sed to extract only these definitions - * and thus the final assyms.s is created. - * - * This convoluted means is necessary since the structure alignment - * and packing may be different between the host machine and the - * target so we are forced into using the cross compiler to generate - * the values, but we cannot run anything on the target machine. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if CONFIG_DTRACE -#define NEED_DTRACE_DEFS -#include <../bsd/sys/lockstat.h> -#endif - -/* Undefine standard offsetof because it is different than the one here */ -#undef offsetof -#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE)0)->MEMBER) - -#define DECLARE(SYM,VAL) \ - __asm("#DEFINITION##define\t" SYM "\t%0" : : "n" ((u_int)(VAL))) - -int main(int argc, char *argv[]) -{ - /* Process Control Block */ - DECLARE("ACT_MACT_KSP", offsetof(thread_t, machine.ksp)); - DECLARE("ACT_MACT_BEDA", offsetof(thread_t, machine.bbDescAddr)); - DECLARE("ACT_MACT_BTS", offsetof(thread_t, machine.bbTableStart)); - DECLARE("ACT_MACT_BTE", offsetof(thread_t, machine.bbTaskEnv)); - DECLARE("ACT_MACT_SPF", offsetof(thread_t, machine.specFlags)); - DECLARE("ACT_PREEMPT_CNT", offsetof(thread_t, machine.preemption_count)); - DECLARE("ACT_PER_PROC", offsetof(thread_t, machine.PerProc)); - DECLARE("qactTimer", offsetof(thread_t, machine.qactTimer)); - DECLARE("umwSpace", offsetof(thread_t, machine.umwSpace)); - DECLARE("umwRelo", offsetof(thread_t, 
machine.umwRelo)); - DECLARE("umwSwitchAway", umwSwitchAway); - DECLARE("umwSwitchAwayb", umwSwitchAwayb); - DECLARE("bbTrap", offsetof(thread_t, machine.bbTrap)); - DECLARE("bbSysCall", offsetof(thread_t, machine.bbSysCall)); - DECLARE("bbInterrupt", offsetof(thread_t, machine.bbInterrupt)); - DECLARE("bbPending", offsetof(thread_t, machine.bbPending)); - - DECLARE("floatUsed", floatUsed); - DECLARE("vectorUsed", vectorUsed); - DECLARE("runningVM", runningVM); - DECLARE("runningVMbit", runningVMbit); - DECLARE("floatCng", floatCng); - DECLARE("floatCngbit", floatCngbit); - DECLARE("vectorCng", vectorCng); - DECLARE("vectorCngbit", vectorCngbit); - DECLARE("userProtKey", userProtKey); - DECLARE("userProtKeybit", userProtKeybit); - - DECLARE("bbThread", bbThread); - DECLARE("bbThreadbit", bbThreadbit); - DECLARE("bbNoMachSC", bbNoMachSC); - DECLARE("bbNoMachSCbit",bbNoMachSCbit); - DECLARE("bbPreemptive", bbPreemptive); - DECLARE("bbPreemptivebit", bbPreemptivebit); - - DECLARE("fvChkb", fvChkb); - DECLARE("fvChk", fvChk); - DECLARE("FamVMena", FamVMena); - DECLARE("FamVMenabit", FamVMenabit); - DECLARE("FamVMmode", FamVMmode); - DECLARE("FamVMmodebit", FamVMmodebit); - DECLARE("perfMonitor", perfMonitor); - DECLARE("perfMonitorbit", perfMonitorbit); - DECLARE("OnProc", OnProc); - DECLARE("OnProcbit", OnProcbit); - - /* Per Proc info structure */ - DECLARE("PP_CPU_NUMBER", offsetof(struct per_proc_info *, cpu_number)); - DECLARE("PP_CPU_FLAGS", offsetof(struct per_proc_info *, cpu_flags)); - DECLARE("PP_ISTACKPTR", offsetof(struct per_proc_info *, istackptr)); - DECLARE("PP_INTSTACK_TOP_SS", offsetof(struct per_proc_info *, intstack_top_ss)); - DECLARE("PP_DEBSTACKPTR", offsetof(struct per_proc_info *, debstackptr)); - DECLARE("PP_DEBSTACK_TOP_SS", offsetof(struct per_proc_info *, debstack_top_ss)); - DECLARE("PP_HIBERNATE", offsetof(struct per_proc_info *, hibernate)); - DECLARE("FPUowner", offsetof(struct per_proc_info *, FPU_owner)); - DECLARE("VMXowner", 
offsetof(struct per_proc_info *, VMX_owner)); - DECLARE("holdQFret", offsetof(struct per_proc_info *, holdQFret)); - DECLARE("rtcPop", offsetof(struct per_proc_info *, rtcPop)); - - DECLARE("PP_PENDING_AST", offsetof(struct per_proc_info *, pending_ast)); - DECLARE("quickfret", offsetof(struct per_proc_info *, quickfret)); - DECLARE("lclfree", offsetof(struct per_proc_info *, lclfree)); - DECLARE("lclfreecnt", offsetof(struct per_proc_info *, lclfreecnt)); - DECLARE("PP_INTS_ENABLED", offsetof(struct per_proc_info *, interrupts_enabled)); - DECLARE("UAW", offsetof(struct per_proc_info *, Uassist)); - DECLARE("next_savearea", offsetof(struct per_proc_info *, next_savearea)); - DECLARE("ppbbTaskEnv", offsetof(struct per_proc_info *, ppbbTaskEnv)); - DECLARE("liveVRS", offsetof(struct per_proc_info *, liveVRSave)); - DECLARE("spcFlags", offsetof(struct per_proc_info *, spcFlags)); - DECLARE("spcTRc", offsetof(struct per_proc_info *, spcTRc)); - DECLARE("spcTRp", offsetof(struct per_proc_info *, spcTRp)); - DECLARE("ruptStamp", offsetof(struct per_proc_info *, ruptStamp)); - DECLARE("pfAvailable", offsetof(struct per_proc_info *, pf.Available)); - DECLARE("pfFloat", pfFloat); - DECLARE("pfFloatb", pfFloatb); - DECLARE("pfAltivec", pfAltivec); - DECLARE("pfAltivecb", pfAltivecb); - DECLARE("pfAvJava", pfAvJava); - DECLARE("pfAvJavab", pfAvJavab); - DECLARE("pfSMPcap", pfSMPcap); - DECLARE("pfSMPcapb", pfSMPcapb); - DECLARE("pfCanSleep", pfCanSleep); - DECLARE("pfCanSleepb", pfCanSleepb); - DECLARE("pfCanNap", pfCanNap); - DECLARE("pfCanNapb", pfCanNapb); - DECLARE("pfCanDoze", pfCanDoze); - DECLARE("pfCanDozeb", pfCanDozeb); - DECLARE("pfSlowNap", pfSlowNap); - DECLARE("pfSlowNapb", pfSlowNapb); - DECLARE("pfNoMuMMCK", pfNoMuMMCK); - DECLARE("pfNoMuMMCKb", pfNoMuMMCKb); - DECLARE("pfNoL2PFNap", pfNoL2PFNap); - DECLARE("pfNoL2PFNapb", pfNoL2PFNapb); - DECLARE("pfSCOMFixUp", pfSCOMFixUp); - DECLARE("pfSCOMFixUpb", pfSCOMFixUpb); - DECLARE("pfHasDcba", pfHasDcba); - 
DECLARE("pfHasDcbab", pfHasDcbab); - DECLARE("pfL1fa", pfL1fa); - DECLARE("pfL1fab", pfL1fab); - DECLARE("pfL2", pfL2); - DECLARE("pfL2b", pfL2b); - DECLARE("pfL2fa", pfL2fa); - DECLARE("pfL2fab", pfL2fab); - DECLARE("pfL2i", pfL2i); - DECLARE("pfL2ib", pfL2ib); - DECLARE("pfLClck", pfLClck); - DECLARE("pfLClckb", pfLClckb); - DECLARE("pfWillNap", pfWillNap); - DECLARE("pfWillNapb", pfWillNapb); - DECLARE("pfNoMSRir", pfNoMSRir); - DECLARE("pfNoMSRirb", pfNoMSRirb); - DECLARE("pfL3pdet", pfL3pdet); - DECLARE("pfL3pdetb", pfL3pdetb); - DECLARE("pf128Byte", pf128Byte); - DECLARE("pf128Byteb", pf128Byteb); - DECLARE("pf32Byte", pf32Byte); - DECLARE("pf32Byteb", pf32Byteb); - DECLARE("pf64Bit", pf64Bit); - DECLARE("pf64Bitb", pf64Bitb); - DECLARE("pfL3", pfL3); - DECLARE("pfL3b", pfL3b); - DECLARE("pfL3fa", pfL3fa); - DECLARE("pfL3fab", pfL3fab); - DECLARE("pfValid", pfValid); - DECLARE("pfValidb", pfValidb); - DECLARE("pfrptdProc", offsetof(struct per_proc_info *, pf.rptdProc)); - DECLARE("pflineSize", offsetof(struct per_proc_info *, pf.lineSize)); - DECLARE("pfl1iSize", offsetof(struct per_proc_info *, pf.l1iSize)); - DECLARE("pfl1dSize", offsetof(struct per_proc_info *, pf.l1dSize)); - DECLARE("pfl2cr", offsetof(struct per_proc_info *, pf.l2cr)); - DECLARE("pfl2Size", offsetof(struct per_proc_info *, pf.l2Size)); - DECLARE("pfl3cr", offsetof(struct per_proc_info *, pf.l3cr)); - DECLARE("pfl3Size", offsetof(struct per_proc_info *, pf.l3Size)); - DECLARE("pfHID0", offsetof(struct per_proc_info *, pf.pfHID0)); - DECLARE("pfHID1", offsetof(struct per_proc_info *, pf.pfHID1)); - DECLARE("pfHID2", offsetof(struct per_proc_info *, pf.pfHID2)); - DECLARE("pfHID3", offsetof(struct per_proc_info *, pf.pfHID3)); - DECLARE("pfHID4", offsetof(struct per_proc_info *, pf.pfHID4)); - DECLARE("pfHID5", offsetof(struct per_proc_info *, pf.pfHID5)); - DECLARE("pfMSSCR0", offsetof(struct per_proc_info *, pf.pfMSSCR0)); - DECLARE("pfMSSCR1", offsetof(struct per_proc_info *, 
pf.pfMSSCR1)); - DECLARE("pfICTRL", offsetof(struct per_proc_info *, pf.pfICTRL)); - DECLARE("pfLDSTCR", offsetof(struct per_proc_info *, pf.pfLDSTCR)); - DECLARE("pfLDSTDB", offsetof(struct per_proc_info *, pf.pfLDSTDB)); - DECLARE("pfl2crOriginal", offsetof(struct per_proc_info *, pf.l2crOriginal)); - DECLARE("pfl3crOriginal", offsetof(struct per_proc_info *, pf.l3crOriginal)); - DECLARE("pfBootConfig", offsetof(struct per_proc_info *, pf.pfBootConfig)); - DECLARE("pfPowerModes", offsetof(struct per_proc_info *, pf.pfPowerModes)); - DECLARE("pfPowerTune0", offsetof(struct per_proc_info *, pf.pfPowerTune0)); - DECLARE("pfPowerTune1", offsetof(struct per_proc_info *, pf.pfPowerTune1)); - DECLARE("pmType", pmType); - DECLARE("pmDPLLVmin", pmDPLLVmin); - DECLARE("pmDPLLVminb", pmDPLLVminb); - DECLARE("pmPowerTune", pmPowerTune); - DECLARE("pmDFS", pmDFS); - DECLARE("pmDualPLL", pmDualPLL); - DECLARE("pfPTEG", offsetof(struct per_proc_info *, pf.pfPTEG)); - DECLARE("pfMaxVAddr", offsetof(struct per_proc_info *, pf.pfMaxVAddr)); - DECLARE("pfMaxPAddr", offsetof(struct per_proc_info *, pf.pfMaxPAddr)); - DECLARE("pfSize", sizeof(procFeatures)); - - DECLARE("validSegs", offsetof(struct per_proc_info *, validSegs)); - DECLARE("ppUserPmapVirt", offsetof(struct per_proc_info *, ppUserPmapVirt)); - DECLARE("ppUserPmap", offsetof(struct per_proc_info *, ppUserPmap)); - DECLARE("ppMapFlags", offsetof(struct per_proc_info *, ppMapFlags)); - DECLARE("ppInvSeg", offsetof(struct per_proc_info *, ppInvSeg)); - DECLARE("ppCurSeg", offsetof(struct per_proc_info *, ppCurSeg)); - DECLARE("ppSegSteal", offsetof(struct per_proc_info *, ppSegSteal)); - - DECLARE("VMMareaPhys", offsetof(struct per_proc_info *, VMMareaPhys)); - DECLARE("VMMXAFlgs", offsetof(struct per_proc_info *, VMMXAFlgs)); - DECLARE("FAMintercept", offsetof(struct per_proc_info *, FAMintercept)); - - DECLARE("ppUMWmp", offsetof(struct per_proc_info *, ppUMWmp)); - - DECLARE("tempr0", offsetof(struct per_proc_info *, 
tempr0)); - DECLARE("tempr1", offsetof(struct per_proc_info *, tempr1)); - DECLARE("tempr2", offsetof(struct per_proc_info *, tempr2)); - DECLARE("tempr3", offsetof(struct per_proc_info *, tempr3)); - DECLARE("tempr4", offsetof(struct per_proc_info *, tempr4)); - DECLARE("tempr5", offsetof(struct per_proc_info *, tempr5)); - DECLARE("tempr6", offsetof(struct per_proc_info *, tempr6)); - DECLARE("tempr7", offsetof(struct per_proc_info *, tempr7)); - DECLARE("tempr8", offsetof(struct per_proc_info *, tempr8)); - DECLARE("tempr9", offsetof(struct per_proc_info *, tempr9)); - DECLARE("tempr10", offsetof(struct per_proc_info *, tempr10)); - DECLARE("tempr11", offsetof(struct per_proc_info *, tempr11)); - DECLARE("tempr12", offsetof(struct per_proc_info *, tempr12)); - DECLARE("tempr13", offsetof(struct per_proc_info *, tempr13)); - DECLARE("tempr14", offsetof(struct per_proc_info *, tempr14)); - DECLARE("tempr15", offsetof(struct per_proc_info *, tempr15)); - DECLARE("tempr16", offsetof(struct per_proc_info *, tempr16)); - DECLARE("tempr17", offsetof(struct per_proc_info *, tempr17)); - DECLARE("tempr18", offsetof(struct per_proc_info *, tempr18)); - DECLARE("tempr19", offsetof(struct per_proc_info *, tempr19)); - DECLARE("tempr20", offsetof(struct per_proc_info *, tempr20)); - DECLARE("tempr21", offsetof(struct per_proc_info *, tempr21)); - DECLARE("tempr22", offsetof(struct per_proc_info *, tempr22)); - DECLARE("tempr23", offsetof(struct per_proc_info *, tempr23)); - DECLARE("tempr24", offsetof(struct per_proc_info *, tempr24)); - DECLARE("tempr25", offsetof(struct per_proc_info *, tempr25)); - DECLARE("tempr26", offsetof(struct per_proc_info *, tempr26)); - DECLARE("tempr27", offsetof(struct per_proc_info *, tempr27)); - DECLARE("tempr28", offsetof(struct per_proc_info *, tempr28)); - DECLARE("tempr29", offsetof(struct per_proc_info *, tempr29)); - DECLARE("tempr30", offsetof(struct per_proc_info *, tempr30)); - DECLARE("tempr31", offsetof(struct per_proc_info *, 
tempr31)); - - DECLARE("emfp0", offsetof(struct per_proc_info *, emfp0)); - DECLARE("emfp1", offsetof(struct per_proc_info *, emfp1)); - DECLARE("emfp2", offsetof(struct per_proc_info *, emfp2)); - DECLARE("emfp3", offsetof(struct per_proc_info *, emfp3)); - DECLARE("emfp4", offsetof(struct per_proc_info *, emfp4)); - DECLARE("emfp5", offsetof(struct per_proc_info *, emfp5)); - DECLARE("emfp6", offsetof(struct per_proc_info *, emfp6)); - DECLARE("emfp7", offsetof(struct per_proc_info *, emfp7)); - DECLARE("emfp8", offsetof(struct per_proc_info *, emfp8)); - DECLARE("emfp9", offsetof(struct per_proc_info *, emfp9)); - DECLARE("emfp10", offsetof(struct per_proc_info *, emfp10)); - DECLARE("emfp11", offsetof(struct per_proc_info *, emfp11)); - DECLARE("emfp12", offsetof(struct per_proc_info *, emfp12)); - DECLARE("emfp13", offsetof(struct per_proc_info *, emfp13)); - DECLARE("emfp14", offsetof(struct per_proc_info *, emfp14)); - DECLARE("emfp15", offsetof(struct per_proc_info *, emfp15)); - DECLARE("emfp16", offsetof(struct per_proc_info *, emfp16)); - DECLARE("emfp17", offsetof(struct per_proc_info *, emfp17)); - DECLARE("emfp18", offsetof(struct per_proc_info *, emfp18)); - DECLARE("emfp19", offsetof(struct per_proc_info *, emfp19)); - DECLARE("emfp20", offsetof(struct per_proc_info *, emfp20)); - DECLARE("emfp21", offsetof(struct per_proc_info *, emfp21)); - DECLARE("emfp22", offsetof(struct per_proc_info *, emfp22)); - DECLARE("emfp23", offsetof(struct per_proc_info *, emfp23)); - DECLARE("emfp24", offsetof(struct per_proc_info *, emfp24)); - DECLARE("emfp25", offsetof(struct per_proc_info *, emfp25)); - DECLARE("emfp26", offsetof(struct per_proc_info *, emfp26)); - DECLARE("emfp27", offsetof(struct per_proc_info *, emfp27)); - DECLARE("emfp28", offsetof(struct per_proc_info *, emfp28)); - DECLARE("emfp29", offsetof(struct per_proc_info *, emfp29)); - DECLARE("emfp30", offsetof(struct per_proc_info *, emfp30)); - DECLARE("emfp31", offsetof(struct per_proc_info *, 
emfp31)); - DECLARE("emfpscr_pad", offsetof(struct per_proc_info *, emfpscr_pad)); - DECLARE("emfpscr", offsetof(struct per_proc_info *, emfpscr)); - - DECLARE("emvr0", offsetof(struct per_proc_info *, emvr0)); - DECLARE("emvr1", offsetof(struct per_proc_info *, emvr1)); - DECLARE("emvr2", offsetof(struct per_proc_info *, emvr2)); - DECLARE("emvr3", offsetof(struct per_proc_info *, emvr3)); - DECLARE("emvr4", offsetof(struct per_proc_info *, emvr4)); - DECLARE("emvr5", offsetof(struct per_proc_info *, emvr5)); - DECLARE("emvr6", offsetof(struct per_proc_info *, emvr6)); - DECLARE("emvr7", offsetof(struct per_proc_info *, emvr7)); - DECLARE("emvr8", offsetof(struct per_proc_info *, emvr8)); - DECLARE("emvr9", offsetof(struct per_proc_info *, emvr9)); - DECLARE("emvr10", offsetof(struct per_proc_info *, emvr10)); - DECLARE("emvr11", offsetof(struct per_proc_info *, emvr11)); - DECLARE("emvr12", offsetof(struct per_proc_info *, emvr12)); - DECLARE("emvr13", offsetof(struct per_proc_info *, emvr13)); - DECLARE("emvr14", offsetof(struct per_proc_info *, emvr14)); - DECLARE("emvr15", offsetof(struct per_proc_info *, emvr15)); - DECLARE("emvr16", offsetof(struct per_proc_info *, emvr16)); - DECLARE("emvr17", offsetof(struct per_proc_info *, emvr17)); - DECLARE("emvr18", offsetof(struct per_proc_info *, emvr18)); - DECLARE("emvr19", offsetof(struct per_proc_info *, emvr19)); - DECLARE("emvr20", offsetof(struct per_proc_info *, emvr20)); - DECLARE("emvr21", offsetof(struct per_proc_info *, emvr21)); - DECLARE("emvr22", offsetof(struct per_proc_info *, emvr22)); - DECLARE("emvr23", offsetof(struct per_proc_info *, emvr23)); - DECLARE("emvr24", offsetof(struct per_proc_info *, emvr24)); - DECLARE("emvr25", offsetof(struct per_proc_info *, emvr25)); - DECLARE("emvr26", offsetof(struct per_proc_info *, emvr26)); - DECLARE("emvr27", offsetof(struct per_proc_info *, emvr27)); - DECLARE("emvr28", offsetof(struct per_proc_info *, emvr28)); - DECLARE("emvr29", offsetof(struct 
per_proc_info *, emvr29)); - DECLARE("emvr30", offsetof(struct per_proc_info *, emvr30)); - DECLARE("emvr31", offsetof(struct per_proc_info *, emvr31)); - DECLARE("empadvr", offsetof(struct per_proc_info *, empadvr)); - DECLARE("skipListPrev", offsetof(struct per_proc_info *, skipListPrev)); - DECLARE("ppSize", sizeof(struct per_proc_info)); - DECLARE("ppe_paddr", offsetof(struct per_proc_entry *, ppe_paddr)); - DECLARE("ppe_vaddr", offsetof(struct per_proc_entry *, ppe_vaddr)); - DECLARE("ppeSize", sizeof(struct per_proc_entry)); - DECLARE("MAX_CPUS", MAX_CPUS); - DECLARE("patcharea", offsetof(struct per_proc_info *, patcharea)); - - DECLARE("hwCounts", offsetof(struct per_proc_info *, hwCtr)); - DECLARE("hwInVains", offsetof(struct per_proc_info *, hwCtr.hwInVains)); - DECLARE("hwResets", offsetof(struct per_proc_info *, hwCtr.hwResets)); - DECLARE("hwMachineChecks", offsetof(struct per_proc_info *, hwCtr.hwMachineChecks)); - DECLARE("hwDSIs", offsetof(struct per_proc_info *, hwCtr.hwDSIs)); - DECLARE("hwISIs", offsetof(struct per_proc_info *, hwCtr.hwISIs)); - DECLARE("hwExternals", offsetof(struct per_proc_info *, hwCtr.hwExternals)); - DECLARE("hwAlignments", offsetof(struct per_proc_info *, hwCtr.hwAlignments)); - DECLARE("hwPrograms", offsetof(struct per_proc_info *, hwCtr.hwPrograms)); - DECLARE("hwFloatPointUnavailable", offsetof(struct per_proc_info *, hwCtr.hwFloatPointUnavailable)); - DECLARE("hwDecrementers", offsetof(struct per_proc_info *, hwCtr.hwDecrementers)); - DECLARE("hwIOErrors", offsetof(struct per_proc_info *, hwCtr.hwIOErrors)); - DECLARE("hwrsvd0", offsetof(struct per_proc_info *, hwCtr.hwrsvd0)); - DECLARE("hwSystemCalls", offsetof(struct per_proc_info *, hwCtr.hwSystemCalls)); - DECLARE("hwTraces", offsetof(struct per_proc_info *, hwCtr.hwTraces)); - DECLARE("hwFloatingPointAssists", offsetof(struct per_proc_info *, hwCtr.hwFloatingPointAssists)); - DECLARE("hwPerformanceMonitors", offsetof(struct per_proc_info *, 
hwCtr.hwPerformanceMonitors)); - DECLARE("hwAltivecs", offsetof(struct per_proc_info *, hwCtr.hwAltivecs)); - DECLARE("hwrsvd1", offsetof(struct per_proc_info *, hwCtr.hwrsvd1)); - DECLARE("hwrsvd2", offsetof(struct per_proc_info *, hwCtr.hwrsvd2)); - DECLARE("hwrsvd3", offsetof(struct per_proc_info *, hwCtr.hwrsvd3)); - DECLARE("hwInstBreakpoints", offsetof(struct per_proc_info *, hwCtr.hwInstBreakpoints)); - DECLARE("hwSystemManagements", offsetof(struct per_proc_info *, hwCtr.hwSystemManagements)); - DECLARE("hwAltivecAssists", offsetof(struct per_proc_info *, hwCtr.hwAltivecAssists)); - DECLARE("hwThermal", offsetof(struct per_proc_info *, hwCtr.hwThermal)); - DECLARE("hwrsvd5", offsetof(struct per_proc_info *, hwCtr.hwrsvd5)); - DECLARE("hwrsvd6", offsetof(struct per_proc_info *, hwCtr.hwrsvd6)); - DECLARE("hwrsvd7", offsetof(struct per_proc_info *, hwCtr.hwrsvd7)); - DECLARE("hwrsvd8", offsetof(struct per_proc_info *, hwCtr.hwrsvd8)); - DECLARE("hwrsvd9", offsetof(struct per_proc_info *, hwCtr.hwrsvd9)); - DECLARE("hwrsvd10", offsetof(struct per_proc_info *, hwCtr.hwrsvd10)); - DECLARE("hwrsvd11", offsetof(struct per_proc_info *, hwCtr.hwrsvd11)); - DECLARE("hwrsvd12", offsetof(struct per_proc_info *, hwCtr.hwrsvd12)); - DECLARE("hwrsvd13", offsetof(struct per_proc_info *, hwCtr.hwrsvd13)); - DECLARE("hwTrace601", offsetof(struct per_proc_info *, hwCtr.hwTrace601)); - DECLARE("hwSIGPs", offsetof(struct per_proc_info *, hwCtr.hwSIGPs)); - DECLARE("hwPreemptions", offsetof(struct per_proc_info *, hwCtr.hwPreemptions)); - DECLARE("hwContextSwitchs", offsetof(struct per_proc_info *, hwCtr.hwContextSwitchs)); - DECLARE("hwShutdowns", offsetof(struct per_proc_info *, hwCtr.hwShutdowns)); - DECLARE("hwChokes", offsetof(struct per_proc_info *, hwCtr.hwChokes)); - DECLARE("hwDataSegments", offsetof(struct per_proc_info *, hwCtr.hwDataSegments)); - DECLARE("hwInstructionSegments", offsetof(struct per_proc_info *, hwCtr.hwInstructionSegments)); - 
DECLARE("hwSoftPatches", offsetof(struct per_proc_info *, hwCtr.hwSoftPatches)); - DECLARE("hwMaintenances", offsetof(struct per_proc_info *, hwCtr.hwMaintenances)); - DECLARE("hwInstrumentations", offsetof(struct per_proc_info *, hwCtr.hwInstrumentations)); - DECLARE("hwRedrives", offsetof(struct per_proc_info *, hwCtr.hwRedrives)); - DECLARE("hwIgnored", offsetof(struct per_proc_info *, hwCtr.hwIgnored)); - DECLARE("hwhdec", offsetof(struct per_proc_info *, hwCtr.hwhdec)); - DECLARE("hwSteals", offsetof(struct per_proc_info *, hwCtr.hwSteals)); - - DECLARE("hwWalkPhys", offsetof(struct per_proc_info *, hwCtr.hwWalkPhys)); - DECLARE("hwWalkFull", offsetof(struct per_proc_info *, hwCtr.hwWalkFull)); - DECLARE("hwWalkMerge", offsetof(struct per_proc_info *, hwCtr.hwWalkMerge)); - DECLARE("hwWalkQuick", offsetof(struct per_proc_info *, hwCtr.hwWalkQuick)); - - DECLARE("hwMckHang", offsetof(struct per_proc_info *, hwCtr.hwMckHang)); - DECLARE("hwMckSLBPE", offsetof(struct per_proc_info *, hwCtr.hwMckSLBPE)); - DECLARE("hwMckTLBPE", offsetof(struct per_proc_info *, hwCtr.hwMckTLBPE)); - DECLARE("hwMckERCPE", offsetof(struct per_proc_info *, hwCtr.hwMckERCPE)); - DECLARE("hwMckL1DPE", offsetof(struct per_proc_info *, hwCtr.hwMckL1DPE)); - DECLARE("hwMckL1TPE", offsetof(struct per_proc_info *, hwCtr.hwMckL1TPE)); - DECLARE("hwMckUE", offsetof(struct per_proc_info *, hwCtr.hwMckUE)); - DECLARE("hwMckIUE", offsetof(struct per_proc_info *, hwCtr.hwMckIUE)); - DECLARE("hwMckIUEr", offsetof(struct per_proc_info *, hwCtr.hwMckIUEr)); - DECLARE("hwMckDUE", offsetof(struct per_proc_info *, hwCtr.hwMckDUE)); - DECLARE("hwMckDTW", offsetof(struct per_proc_info *, hwCtr.hwMckDTW)); - DECLARE("hwMckUnk", offsetof(struct per_proc_info *, hwCtr.hwMckUnk)); - DECLARE("hwMckExt", offsetof(struct per_proc_info *, hwCtr.hwMckExt)); - DECLARE("hwMckICachePE", offsetof(struct per_proc_info *, hwCtr.hwMckICachePE)); - DECLARE("hwMckITagPE", offsetof(struct per_proc_info *, 
hwCtr.hwMckITagPE)); - DECLARE("hwMckIEratPE", offsetof(struct per_proc_info *, hwCtr.hwMckIEratPE)); - DECLARE("hwMckDEratPE", offsetof(struct per_proc_info *, hwCtr.hwMckDEratPE)); - - DECLARE("ijsave", offsetof(struct per_proc_info *, ijsave)); - - DECLARE("napStamp", offsetof(struct per_proc_info *, hwCtr.napStamp)); - DECLARE("napTotal", offsetof(struct per_proc_info *, hwCtr.napTotal)); - DECLARE("PP_PROCESSOR", offsetof(struct per_proc_info *, processor[0])); - DECLARE("PP_PROCESSOR_SIZE", sizeof(((struct per_proc_info *)0)->processor)); - DECLARE("PROCESSOR_SIZE", sizeof (struct processor)); - - DECLARE("patchAddr", offsetof(struct patch_entry *, addr)); - DECLARE("patchData", offsetof(struct patch_entry *, data)); - DECLARE("patchType", offsetof(struct patch_entry *, type)); - DECLARE("patchValue", offsetof(struct patch_entry *, value)); - DECLARE("peSize", sizeof(patch_entry_t)); - DECLARE("PATCH_PROCESSOR", PATCH_PROCESSOR); - DECLARE("PATCH_FEATURE", PATCH_FEATURE); - DECLARE("PATCH_END_OF_TABLE", PATCH_END_OF_TABLE); - DECLARE("PatchExt32", PatchExt32); - DECLARE("PatchExt32b", PatchExt32b); - DECLARE("PatchLwsync", PatchLwsync); - DECLARE("PatchLwsyncb", PatchLwsyncb); - - DECLARE("RESETHANDLER_TYPE", offsetof(struct resethandler *, type)); - DECLARE("RESETHANDLER_CALL", offsetof(struct resethandler *, call_paddr)); - DECLARE("RESETHANDLER_ARG", offsetof(struct resethandler *, arg__paddr)); - - /* we want offset from - * bottom of kernel stack, not offset into structure - */ -#define IKSBASE (u_int)STACK_IKS(0) - - /* values from kern/thread.h */ - DECLARE("THREAD_STATE", offsetof(thread_t, state)); - DECLARE("TH_IDLE", TH_IDLE); - DECLARE("THREAD_KERNEL_STACK", offsetof(thread_t, kernel_stack)); - DECLARE("THREAD_RECOVER", offsetof(thread_t, recover)); - DECLARE("THREAD_FUNNEL_LOCK", - offsetof(thread_t, funnel_lock)); - DECLARE("THREAD_FUNNEL_STATE", - offsetof(thread_t, funnel_state)); - DECLARE("LOCK_FNL_MUTEX", - offsetof(struct funnel_lock *, 
fnl_mutex)); - - DECLARE("ACT_TASK", offsetof(thread_t, task)); - DECLARE("ACT_MACT_PCB", offsetof(thread_t, machine.pcb)); - DECLARE("ACT_MACT_UPCB", offsetof(thread_t, machine.upcb)); - DECLARE("ACT_AST", offsetof(thread_t, ast)); - DECLARE("ACT_VMMAP", offsetof(thread_t, map)); - DECLARE("vmmCEntry", offsetof(thread_t, machine.vmmCEntry)); - DECLARE("vmmControl", offsetof(thread_t, machine.vmmControl)); - DECLARE("curctx", offsetof(thread_t, machine.curctx)); - DECLARE("deferctx", offsetof(thread_t, machine.deferctx)); - DECLARE("facctx", offsetof(thread_t, machine.facctx)); -#ifdef MACH_BSD - DECLARE("CTHREAD_SELF", offsetof(thread_t, machine.cthread_self)); -#endif - - DECLARE("FPUsave", offsetof(struct facility_context *,FPUsave)); - DECLARE("FPUlevel", offsetof(struct facility_context *,FPUlevel)); - DECLARE("FPUcpu", offsetof(struct facility_context *,FPUcpu)); - DECLARE("FPUsync", offsetof(struct facility_context *,FPUsync)); - DECLARE("VMXsave", offsetof(struct facility_context *,VMXsave)); - DECLARE("VMXlevel", offsetof(struct facility_context *,VMXlevel)); - DECLARE("VMXcpu", offsetof(struct facility_context *,VMXcpu)); - DECLARE("VMXsync", offsetof(struct facility_context *,VMXsync)); - DECLARE("facAct", offsetof(struct facility_context *,facAct)); - - /* Values from vmachmon.h */ - - DECLARE("kVmmGetVersion", kVmmGetVersion); - DECLARE("kVmmvGetFeatures", kVmmvGetFeatures); - DECLARE("kVmmInitContext", kVmmInitContext); - DECLARE("kVmmTearDownContext", kVmmTearDownContext); - DECLARE("kVmmTearDownAll", kVmmTearDownAll); - DECLARE("kVmmMapPage", kVmmMapPage); - DECLARE("kVmmGetPageMapping", kVmmGetPageMapping); - DECLARE("kVmmUnmapPage", kVmmUnmapPage); - DECLARE("kVmmUnmapAllPages", kVmmUnmapAllPages); - DECLARE("kVmmGetPageDirtyFlag", kVmmGetPageDirtyFlag); - DECLARE("kVmmGetFloatState", kVmmGetFloatState); - DECLARE("kVmmGetVectorState", kVmmGetVectorState); - DECLARE("kVmmSetTimer", kVmmSetTimer); - DECLARE("kVmmGetTimer", kVmmGetTimer); - 
DECLARE("kVmmExecuteVM", kVmmExecuteVM); - DECLARE("kVmmProtectPage", kVmmProtectPage); - DECLARE("kVmmMapList", kVmmMapList); - DECLARE("kVmmUnmapList", kVmmUnmapList); - DECLARE("kVmmActivateXA", kVmmActivateXA); - DECLARE("kVmmDeactivateXA", kVmmDeactivateXA); - DECLARE("kVmmGetXA", kVmmGetXA); - DECLARE("kVmmMapPage64", kVmmMapPage64); - DECLARE("kVmmGetPageMapping64", kVmmGetPageMapping64); - DECLARE("kVmmUnmapPage64", kVmmUnmapPage64); - DECLARE("kVmmGetPageDirtyFlag64", kVmmGetPageDirtyFlag64); - DECLARE("kVmmMapExecute64", kVmmMapExecute64); - DECLARE("kVmmProtectExecute64", kVmmProtectExecute64); - DECLARE("kVmmMapList64", kVmmMapList64); - DECLARE("kVmmUnmapList64", kVmmUnmapList64); - DECLARE("kvmmExitToHost", kvmmExitToHost); - DECLARE("kvmmResumeGuest", kvmmResumeGuest); - DECLARE("kvmmGetGuestRegister", kvmmGetGuestRegister); - DECLARE("kvmmSetGuestRegister", kvmmSetGuestRegister); - - DECLARE("kVmmReturnNull", kVmmReturnNull); - DECLARE("kVmmStopped", kVmmStopped); - DECLARE("kVmmBogusContext", kVmmBogusContext); - DECLARE("kVmmReturnDataPageFault", kVmmReturnDataPageFault); - DECLARE("kVmmReturnInstrPageFault", kVmmReturnInstrPageFault); - DECLARE("kVmmReturnAlignmentFault", kVmmReturnAlignmentFault); - DECLARE("kVmmReturnProgramException", kVmmReturnProgramException); - DECLARE("kVmmReturnSystemCall", kVmmReturnSystemCall); - DECLARE("kVmmReturnTraceException", kVmmReturnTraceException); - DECLARE("kVmmInvalidAdSpace", kVmmInvalidAdSpace); - - DECLARE("kVmmProtXtnd", kVmmProtXtnd); - DECLARE("kVmmProtNARW", kVmmProtNARW); - DECLARE("kVmmProtRORW", kVmmProtRORW); - DECLARE("kVmmProtRWRW", kVmmProtRWRW); - DECLARE("kVmmProtRORO", kVmmProtRORO); - - DECLARE("vmmFlags", offsetof(struct vmmCntrlEntry *, vmmFlags)); - DECLARE("vmmXAFlgs", offsetof(struct vmmCntrlEntry *, vmmXAFlgs)); - DECLARE("vmmPmap", offsetof(struct vmmCntrlEntry *, vmmPmap)); - DECLARE("vmmInUseb", vmmInUseb); - DECLARE("vmmInUse", vmmInUse); - DECLARE("vmmContextKern", 
offsetof(struct vmmCntrlEntry *, vmmContextKern)); - DECLARE("vmmContextPhys", offsetof(struct vmmCntrlEntry *, vmmContextPhys)); - DECLARE("vmmContextUser", offsetof(struct vmmCntrlEntry *, vmmContextUser)); - DECLARE("vmmFacCtx", offsetof(struct vmmCntrlEntry *, vmmFacCtx)); - DECLARE("vmmLastMap", offsetof(struct vmmCntrlTable *, vmmLastMap)); - DECLARE("vmmGFlags", offsetof(struct vmmCntrlTable *, vmmGFlags)); - DECLARE("vmmc", offsetof(struct vmmCntrlTable *, vmmc)); - DECLARE("vmmAdsp", offsetof(struct vmmCntrlTable *, vmmAdsp)); - DECLARE("vmmLastAdSp", vmmLastAdSp); - DECLARE("vmmFAMintercept", offsetof(struct vmmCntrlEntry *, vmmFAMintercept)); - DECLARE("vmmCEntrySize", sizeof(struct vmmCntrlEntry)); - DECLARE("kVmmMaxContexts", kVmmMaxContexts); - - DECLARE("interface_version", offsetof(struct vmm_state_page_t *, interface_version)); - DECLARE("thread_index", offsetof(struct vmm_state_page_t *, thread_index)); - DECLARE("vmmStat", offsetof(struct vmm_state_page_t *, vmmStat)); - DECLARE("vmmCntrl", offsetof(struct vmm_state_page_t *, vmmCntrl)); - DECLARE("vmm_proc_state", offsetof(struct vmm_state_page_t *, vmm_proc_state)); - - DECLARE("return_code", offsetof(struct vmm_state_page_t *, return_code)); - - DECLARE("return_params", offsetof(struct vmm_state_page_t *, vmmRet.vmmrp32.return_params)); - DECLARE("return_paramsX", offsetof(struct vmm_state_page_t *, vmmRet.vmmrp64.return_params)); - -#if 0 - DECLARE("return_params", offsetof(struct vmm_state_page_t *, return_params)); - DECLARE("vmm_proc_state", offsetof(struct vmm_state_page_t *, vmm_proc_state)); -#endif - DECLARE("vmmppcVRs", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcVRs)); - DECLARE("vmmppcVSCR", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcVSCR)); - DECLARE("vmmppcFPRs", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcFPRs)); - DECLARE("vmmppcFPSCR", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcFPSCR)); - - DECLARE("vmmppcpc", offsetof(struct 
vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcPC)); - DECLARE("vmmppcmsr", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcMSR)); - DECLARE("vmmppcr0", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x00)); - DECLARE("vmmppcr1", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x04)); - DECLARE("vmmppcr2", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x08)); - DECLARE("vmmppcr3", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x0C)); - DECLARE("vmmppcr4", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x10)); - DECLARE("vmmppcr5", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x14)); - - DECLARE("vmmppcr6", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x18)); - DECLARE("vmmppcr7", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x1C)); - DECLARE("vmmppcr8", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x20)); - DECLARE("vmmppcr9", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x24)); - DECLARE("vmmppcr10", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x28)); - DECLARE("vmmppcr11", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x2C)); - DECLARE("vmmppcr12", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x30)); - DECLARE("vmmppcr13", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x34)); - - DECLARE("vmmppcr14", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x38)); - DECLARE("vmmppcr15", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x3C)); - DECLARE("vmmppcr16", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x40)); - 
DECLARE("vmmppcr17", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x44)); - DECLARE("vmmppcr18", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x48)); - DECLARE("vmmppcr19", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x4C)); - DECLARE("vmmppcr20", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x50)); - DECLARE("vmmppcr21", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x54)); - - DECLARE("vmmppcr22", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x58)); - DECLARE("vmmppcr23", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x5C)); - DECLARE("vmmppcr24", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x60)); - DECLARE("vmmppcr25", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x64)); - DECLARE("vmmppcr26", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x68)); - DECLARE("vmmppcr27", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x6C)); - DECLARE("vmmppcr28", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x70)); - DECLARE("vmmppcr29", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x74)); - - DECLARE("vmmppcr30", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x78)); - DECLARE("vmmppcr31", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcGPRs+0x7C)); - DECLARE("vmmppccr", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcCR)); - DECLARE("vmmppcxer", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcXER)); - DECLARE("vmmppclr", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcLR)); - DECLARE("vmmppcctr", offsetof(struct vmm_state_page_t *, 
vmm_proc_state.ppcRegs.ppcRegs32.ppcCTR)); - DECLARE("vmmppcmq", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcMQ)); - DECLARE("vmmppcvrsave", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs32.ppcVRSave)); - - DECLARE("vmmppcXpc", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcPC)); - DECLARE("vmmppcXmsr", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcMSR)); - DECLARE("vmmppcXr0", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0x00)); - DECLARE("vmmppcXr1", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0x08)); - DECLARE("vmmppcXr2", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0x10)); - DECLARE("vmmppcXr3", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0x18)); - DECLARE("vmmppcXr4", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0x20)); - DECLARE("vmmppcXr5", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0x28)); - - DECLARE("vmmppcXr6", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0x30)); - DECLARE("vmmppcXr7", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0x38)); - DECLARE("vmmppcXr8", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0x40)); - DECLARE("vmmppcXr9", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0x48)); - DECLARE("vmmppcXr10", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0x50)); - DECLARE("vmmppcXr11", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0x58)); - DECLARE("vmmppcXr12", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0x60)); - DECLARE("vmmppcXr13", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0x68)); - - 
DECLARE("vmmppcXr14", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0x70)); - DECLARE("vmmppcXr15", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0x78)); - DECLARE("vmmppcXr16", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0x80)); - DECLARE("vmmppcXr17", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0x88)); - DECLARE("vmmppcXr18", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0x90)); - DECLARE("vmmppcXr19", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0x98)); - DECLARE("vmmppcXr20", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0xA0)); - DECLARE("vmmppcXr21", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0xA8)); - - DECLARE("vmmppcXr22", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0xB0)); - DECLARE("vmmppcXr23", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0xB8)); - DECLARE("vmmppcXr24", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0xC0)); - DECLARE("vmmppcXr25", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0xC8)); - DECLARE("vmmppcXr26", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0xD0)); - DECLARE("vmmppcXr27", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0xD8)); - DECLARE("vmmppcXr28", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0xE0)); - DECLARE("vmmppcXr29", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0xE8)); - - DECLARE("vmmppcXr30", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0xF0)); - DECLARE("vmmppcXr31", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcGPRs+0xF8)); - DECLARE("vmmppcXcr", 
offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcCR)); - DECLARE("vmmppcXxer", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcXER)); - DECLARE("vmmppcXlr", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcLR)); - DECLARE("vmmppcXctr", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcCTR)); - DECLARE("vmmppcXvrsave", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcRegs.ppcRegs64.ppcVRSave)); - - DECLARE("vmmppcvscr", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcVSCR+0x00)); - DECLARE("vmmppcfpscrpad", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcFPSCR)); - DECLARE("vmmppcfpscr", offsetof(struct vmm_state_page_t *, vmm_proc_state.ppcFPSCR+4)); - - DECLARE("famguestr0", offsetof(struct vmm_state_page_t *, vmm_fastassist_state.vmmfs32.guest_register)); - DECLARE("famguestr1", offsetof(struct vmm_state_page_t *, vmm_fastassist_state.vmmfs32.guest_register+0x4)); - DECLARE("famguestr2", offsetof(struct vmm_state_page_t *, vmm_fastassist_state.vmmfs32.guest_register+0x8)); - DECLARE("famguestr3", offsetof(struct vmm_state_page_t *, vmm_fastassist_state.vmmfs32.guest_register+0xC)); - DECLARE("famguestr4", offsetof(struct vmm_state_page_t *, vmm_fastassist_state.vmmfs32.guest_register+0x10)); - DECLARE("famguestr5", offsetof(struct vmm_state_page_t *, vmm_fastassist_state.vmmfs32.guest_register+0x14)); - DECLARE("famguestr6", offsetof(struct vmm_state_page_t *, vmm_fastassist_state.vmmfs32.guest_register+0x18)); - DECLARE("famguestr7", offsetof(struct vmm_state_page_t *, vmm_fastassist_state.vmmfs32.guest_register+0x1C)); - DECLARE("famguestpc", offsetof(struct vmm_state_page_t *, vmm_fastassist_state.vmmfs32.guest_pc)); - DECLARE("famguestmsr", offsetof(struct vmm_state_page_t *, vmm_fastassist_state.vmmfs32.guest_msr)); - DECLARE("famdispcode", offsetof(struct vmm_state_page_t *, vmm_fastassist_state.vmmfs32.fastassist_dispatch_code)); - 
DECLARE("famrefcon", offsetof(struct vmm_state_page_t *, vmm_fastassist_state.vmmfs32.fastassist_refcon)); - DECLARE("famparam", offsetof(struct vmm_state_page_t *, vmm_fastassist_state.vmmfs32.fastassist_parameter)); - DECLARE("famhandler", offsetof(struct vmm_state_page_t *, vmm_fastassist_state.vmmfs32.fastassist_dispatch)); - DECLARE("famintercepts", offsetof(struct vmm_state_page_t *, vmm_fastassist_state.vmmfs32.fastassist_intercepts)); - - DECLARE("famguestXr0", offsetof(struct vmm_state_page_t *, vmm_fastassist_state.vmmfs64.guest_register)); - DECLARE("famguestXr1", offsetof(struct vmm_state_page_t *, vmm_fastassist_state.vmmfs64.guest_register+0x8)); - DECLARE("famguestXr2", offsetof(struct vmm_state_page_t *, vmm_fastassist_state.vmmfs64.guest_register+0x10)); - DECLARE("famguestXr3", offsetof(struct vmm_state_page_t *, vmm_fastassist_state.vmmfs64.guest_register+0x18)); - DECLARE("famguestXr4", offsetof(struct vmm_state_page_t *, vmm_fastassist_state.vmmfs64.guest_register+0x20)); - DECLARE("famguestXr5", offsetof(struct vmm_state_page_t *, vmm_fastassist_state.vmmfs64.guest_register+0x28)); - DECLARE("famguestXr6", offsetof(struct vmm_state_page_t *, vmm_fastassist_state.vmmfs64.guest_register+0x30)); - DECLARE("famguestXr7", offsetof(struct vmm_state_page_t *, vmm_fastassist_state.vmmfs64.guest_register+0x38)); - DECLARE("famguestXpc", offsetof(struct vmm_state_page_t *, vmm_fastassist_state.vmmfs64.guest_pc)); - DECLARE("famguestXmsr", offsetof(struct vmm_state_page_t *, vmm_fastassist_state.vmmfs64.guest_msr)); - DECLARE("famdispcodeX", offsetof(struct vmm_state_page_t *, vmm_fastassist_state.vmmfs64.fastassist_dispatch_code)); - DECLARE("famrefconX", offsetof(struct vmm_state_page_t *, vmm_fastassist_state.vmmfs64.fastassist_refcon)); - DECLARE("famparamX", offsetof(struct vmm_state_page_t *, vmm_fastassist_state.vmmfs64.fastassist_parameter)); - DECLARE("famhandlerX", offsetof(struct vmm_state_page_t *, 
vmm_fastassist_state.vmmfs64.fastassist_dispatch)); - DECLARE("faminterceptsX", offsetof(struct vmm_state_page_t *, vmm_fastassist_state.vmmfs64.fastassist_intercepts)); - - DECLARE("vmmFloatCngd", vmmFloatCngd); - DECLARE("vmmFloatCngdb", vmmFloatCngdb); - DECLARE("vmmVectCngd", vmmVectCngd); - DECLARE("vmmVectCngdb", vmmVectCngdb); - DECLARE("vmmTimerPop", vmmTimerPop); - DECLARE("vmmTimerPopb", vmmTimerPopb); - DECLARE("vmmFAMmode", vmmFAMmode); - DECLARE("vmmFAMmodeb", vmmFAMmodeb); - DECLARE("vmmSpfSave", vmmSpfSave); - DECLARE("vmmSpfSaveb", vmmSpfSaveb); - DECLARE("vmmFloatLoad", vmmFloatLoad); - DECLARE("vmmFloatLoadb", vmmFloatLoadb); - DECLARE("vmmVectLoad", vmmVectLoad); - DECLARE("vmmVectLoadb", vmmVectLoadb); - DECLARE("vmmVectVRall", vmmVectVRall); - DECLARE("vmmVectVRallb", vmmVectVRallb); - DECLARE("vmmVectVAss", vmmVectVAss); - DECLARE("vmmVectVAssb", vmmVectVAssb); - DECLARE("vmmXStart", vmmXStart); - DECLARE("vmmXStartb", vmmXStartb); - DECLARE("vmmXStop", vmmXStop); - DECLARE("vmmXStopb", vmmXStopb); - DECLARE("vmmKey", vmmKey); - DECLARE("vmmKeyb", vmmKeyb); - DECLARE("vmmFamSet", vmmFamSet); - DECLARE("vmmFamSetb", vmmFamSetb); - DECLARE("vmmFamEna", vmmFamEna); - DECLARE("vmmFamEnab", vmmFamEnab); - DECLARE("vmm64Bit", vmm64Bit); - - /* values from kern/task.h */ - DECLARE("TASK_SYSCALLS_MACH", offsetof(struct task *, syscalls_mach)); - DECLARE("TASK_SYSCALLS_UNIX", offsetof(struct task *, syscalls_unix)); - - DECLARE("TASK_VTIMERS", offsetof(struct task *, vtimers)); - - /* values from vm/vm_map.h */ - DECLARE("VMMAP_PMAP", offsetof(struct _vm_map *, pmap)); - - /* values from machine/pmap.h */ - DECLARE("pmapSpace", offsetof(struct pmap *, space)); - DECLARE("spaceNum", offsetof(struct pmap *, spaceNum)); - DECLARE("pmapSXlk", offsetof(struct pmap *, pmapSXlk)); - DECLARE("pmapCCtl", offsetof(struct pmap *, pmapCCtl)); - DECLARE("pmapCCtlVal", pmapCCtlVal); - DECLARE("pmapCCtlLck", pmapCCtlLck); - DECLARE("pmapCCtlLckb", pmapCCtlLckb); - 
DECLARE("pmapCCtlGen", pmapCCtlGen); - DECLARE("pmapSegCacheCnt", pmapSegCacheCnt); - DECLARE("pmapSegCacheUse", pmapSegCacheUse); - DECLARE("pmapvr", offsetof(struct pmap *, pmapvr)); - DECLARE("pmapFlags", offsetof(struct pmap *, pmapFlags)); - DECLARE("pmapKeys", pmapKeys); - DECLARE("pmapKeyDef", pmapKeyDef); - DECLARE("pmapSCSubTag", offsetof(struct pmap *, pmapSCSubTag)); - DECLARE("pmapVmmExt", offsetof(struct pmap *, pmapVmmExt)); - DECLARE("pmapVmmExtPhys", offsetof(struct pmap *, pmapVmmExtPhys)); - DECLARE("pmapVMhost", pmapVMhost); - DECLARE("pmapVMgsaa", pmapVMgsaa); - DECLARE("pmapSegCache", offsetof(struct pmap *, pmapSegCache)); - DECLARE("pmapCurLists", offsetof(struct pmap *, pmapCurLists)); - DECLARE("pmapRandNum", offsetof(struct pmap *, pmapRandNum)); - DECLARE("pmapSkipLists", offsetof(struct pmap *, pmapSkipLists)); - DECLARE("pmapSearchVisits", offsetof(struct pmap *, pmapSearchVisits)); - DECLARE("pmapSearchCnt", offsetof(struct pmap *, pmapSearchCnt)); - DECLARE("pmapSize", pmapSize); - DECLARE("kSkipListFanoutShift", kSkipListFanoutShift); - DECLARE("kSkipListMaxLists", kSkipListMaxLists); - DECLARE("invalSpace", invalSpace); - - DECLARE("sgcESID", offsetof(struct sgc *, sgcESID)); - DECLARE("sgcESmsk", sgcESmsk); - DECLARE("sgcVSID", offsetof(struct sgc *, sgcVSID)); - DECLARE("sgcVSmsk", sgcVSmsk); - DECLARE("sgcVSKeys", sgcVSKeys); - DECLARE("sgcVSKeyUsr", sgcVSKeyUsr); - DECLARE("sgcVSNoEx", sgcVSNoEx); - DECLARE("pmapPAddr", offsetof(struct pmapTransTab *, pmapPAddr)); - DECLARE("pmapVAddr", offsetof(struct pmapTransTab *, pmapVAddr)); - DECLARE("pmapTransSize", sizeof(pmapTransTab)); - DECLARE("pmapResidentCnt", offsetof(struct pmap *, stats.resident_count)); - DECLARE("pmapResidentMax", offsetof(struct pmap *, stats.resident_max)); - - DECLARE("maxAdrSp", maxAdrSp); - DECLARE("maxAdrSpb", maxAdrSpb); - - DECLARE("cppvPsnkb", cppvPsnkb); - DECLARE("cppvPsrcb", cppvPsrcb); - DECLARE("cppvFsnkb", cppvFsnkb); - DECLARE("cppvFsrcb", 
cppvFsrcb); - DECLARE("cppvNoModSnkb", cppvNoModSnkb); - DECLARE("cppvNoRefSrcb", cppvNoRefSrcb); - DECLARE("cppvKmapb", cppvKmapb); - - DECLARE("vmxSalt", offsetof(struct pmap_vmm_ext *, vmxSalt)); - DECLARE("vmxHostPmapPhys", offsetof(struct pmap_vmm_ext *, vmxHostPmapPhys)); - DECLARE("vmxHostPmap", offsetof(struct pmap_vmm_ext *, vmxHostPmap)); - DECLARE("vmxHashPgIdx", offsetof(struct pmap_vmm_ext *, vmxHashPgIdx)); - DECLARE("vmxHashPgList", offsetof(struct pmap_vmm_ext *, vmxHashPgList)); - DECLARE("vmxStats", offsetof(struct pmap_vmm_ext *, vmxStats)); - DECLARE("vmxSize", sizeof(struct pmap_vmm_ext)); - DECLARE("VMX_HPIDX_OFFSET", VMX_HPIDX_OFFSET); - DECLARE("VMX_HPLIST_OFFSET", VMX_HPLIST_OFFSET); - DECLARE("VMX_ACTMAP_OFFSET", VMX_ACTMAP_OFFSET); - DECLARE("vxsGpf", offsetof(struct pmap_vmm_ext *, vmxStats.vxsGpf)); - DECLARE("vxsGpfMiss", offsetof(struct pmap_vmm_ext *, vmxStats.vxsGpfMiss)); - DECLARE("vxsGrm", offsetof(struct pmap_vmm_ext *, vmxStats.vxsGrm)); - DECLARE("vxsGrmMiss", offsetof(struct pmap_vmm_ext *, vmxStats.vxsGrmMiss)); - DECLARE("vxsGrmActive", offsetof(struct pmap_vmm_ext *, vmxStats.vxsGrmActive)); - DECLARE("vxsGra", offsetof(struct pmap_vmm_ext *, vmxStats.vxsGra)); - DECLARE("vxsGraHits", offsetof(struct pmap_vmm_ext *, vmxStats.vxsGraHits)); - DECLARE("vxsGraActive", offsetof(struct pmap_vmm_ext *, vmxStats.vxsGraActive)); - DECLARE("vxsGrl", offsetof(struct pmap_vmm_ext *, vmxStats.vxsGrl)); - DECLARE("vxsGrlActive", offsetof(struct pmap_vmm_ext *, vmxStats.vxsGrlActive)); - DECLARE("vxsGrs", offsetof(struct pmap_vmm_ext *, vmxStats.vxsGrs)); - DECLARE("vxsGrsHitAct", offsetof(struct pmap_vmm_ext *, vmxStats.vxsGrsHitAct)); - DECLARE("vxsGrsHitSusp", offsetof(struct pmap_vmm_ext *, vmxStats.vxsGrsHitSusp)); - DECLARE("vxsGrsMissGV", offsetof(struct pmap_vmm_ext *, vmxStats.vxsGrsMissGV)); - DECLARE("vxsGrsHitPE", offsetof(struct pmap_vmm_ext *, vmxStats.vxsGrsHitPE)); - DECLARE("vxsGrsMissPE", offsetof(struct pmap_vmm_ext *, 
vmxStats.vxsGrsMissPE)); - DECLARE("vxsGad", offsetof(struct pmap_vmm_ext *, vmxStats.vxsGad)); - DECLARE("vxsGadHit", offsetof(struct pmap_vmm_ext *, vmxStats.vxsGadHit)); - DECLARE("vxsGadFree", offsetof(struct pmap_vmm_ext *, vmxStats.vxsGadFree)); - DECLARE("vxsGadDormant", offsetof(struct pmap_vmm_ext *, vmxStats.vxsGadDormant)); - DECLARE("vxsGadSteal", offsetof(struct pmap_vmm_ext *, vmxStats.vxsGadSteal)); - DECLARE("vxsGsu", offsetof(struct pmap_vmm_ext *, vmxStats.vxsGsu)); - DECLARE("vxsGsuHit", offsetof(struct pmap_vmm_ext *, vmxStats.vxsGsuHit)); - DECLARE("vxsGsuMiss", offsetof(struct pmap_vmm_ext *, vmxStats.vxsGsuMiss)); - DECLARE("vxsGtd", offsetof(struct pmap_vmm_ext *, vmxStats.vxsGtd)); - DECLARE("vxsGtdHit", offsetof(struct pmap_vmm_ext *, vmxStats.vxsGtdHit)); - DECLARE("vxsGtdMiss", offsetof(struct pmap_vmm_ext *, vmxStats.vxsGtdMiss)); - - /* values from kern/timer.h */ - DECLARE("TIMER_LOW", offsetof(struct timer *, low_bits)); - DECLARE("TIMER_HIGH", offsetof(struct timer *, high_bits)); - DECLARE("TIMER_HIGHCHK", offsetof(struct timer *, high_bits_check)); - DECLARE("TIMER_TSTAMP", offsetof(struct timer *, tstamp)); - - DECLARE("THREAD_TIMER", offsetof(struct processor *, processor_data.thread_timer)); - DECLARE("KERNEL_TIMER", offsetof(struct processor *, processor_data.kernel_timer)); - DECLARE("SYSTEM_TIMER", offsetof(struct thread *, system_timer)); - DECLARE("USER_TIMER", offsetof(struct thread *, user_timer)); - DECLARE("SYSTEM_STATE", offsetof(struct processor *, processor_data.system_state)); - DECLARE("USER_STATE", offsetof(struct processor *, processor_data.user_state)); - DECLARE("CURRENT_STATE", offsetof(struct processor *, processor_data.current_state)); - - /* Constants from pmap.h */ - DECLARE("PPC_SID_KERNEL", PPC_SID_KERNEL); - - /* values for accessing mach_trap table */ - DECLARE("MACH_TRAP_ARG_MUNGE32", - offsetof(mach_trap_t *, mach_trap_arg_munge32)); - DECLARE("MACH_TRAP_ARG_MUNGE64", - offsetof(mach_trap_t *, 
mach_trap_arg_munge64)); - DECLARE("MACH_TRAP_ARGC", - offsetof(mach_trap_t *, mach_trap_arg_count)); - DECLARE("MACH_TRAP_FUNCTION", - offsetof(mach_trap_t *, mach_trap_function)); - - DECLARE("MACH_TRAP_TABLE_COUNT", MACH_TRAP_TABLE_COUNT); - - DECLARE("PPCcallmax", sizeof(PPCcalls)); - - /* Misc values used by assembler */ - DECLARE("AST_ALL", AST_ALL); - DECLARE("AST_URGENT", AST_URGENT); - DECLARE("AST_BSD", AST_BSD); - - /* Spin Lock structure */ - DECLARE("SLOCK_ILK", offsetof(lck_spin_t *, interlock)); - - /* Mutex structure */ - DECLARE("MUTEX_DATA", offsetof(lck_mtx_t *, lck_mtx_data)); - DECLARE("MUTEX_WAITERS",offsetof(lck_mtx_t *, lck_mtx_waiters)); - DECLARE("MUTEX_PROMOTED_PRI",offsetof(lck_mtx_t *, lck_mtx_pri)); - DECLARE("MUTEX_TYPE", offsetof(lck_mtx_ext_t *, lck_mtx_deb.type)); - DECLARE("MUTEX_STACK", offsetof(lck_mtx_ext_t *, lck_mtx_deb.stack)); - DECLARE("MUTEX_FRAMES", LCK_FRAMES_MAX); - DECLARE("MUTEX_THREAD", offsetof(lck_mtx_ext_t *, lck_mtx_deb.thread)); - DECLARE("MUTEX_ATTR", offsetof(lck_mtx_ext_t *, lck_mtx_attr)); - DECLARE("MUTEX_ATTR_DEBUG", LCK_MTX_ATTR_DEBUG); - DECLARE("MUTEX_ATTR_DEBUGb", LCK_MTX_ATTR_DEBUGb); - DECLARE("MUTEX_ATTR_STAT", LCK_MTX_ATTR_STAT); - DECLARE("MUTEX_ATTR_STATb", LCK_MTX_ATTR_STATb); - DECLARE("MUTEX_GRP", offsetof(lck_mtx_ext_t *, lck_mtx_grp)); - DECLARE("MUTEX_TAG", MUTEX_TAG); - DECLARE("MUTEX_IND", LCK_MTX_TAG_INDIRECT); - DECLARE("MUTEX_ITAG",offsetof(lck_mtx_t *, lck_mtx_tag)); - DECLARE("MUTEX_PTR",offsetof(lck_mtx_t *, lck_mtx_ptr)); - DECLARE("MUTEX_ASSERT_OWNED", LCK_MTX_ASSERT_OWNED); - DECLARE("MUTEX_ASSERT_NOTOWNED",LCK_MTX_ASSERT_NOTOWNED); - DECLARE("GRP_MTX_STAT_UTIL", offsetof(lck_grp_t *, lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_util_cnt)); - DECLARE("GRP_MTX_STAT_MISS", offsetof(lck_grp_t *, lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_miss_cnt)); - DECLARE("GRP_MTX_STAT_WAIT", offsetof(lck_grp_t *, lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_wait_cnt)); - - /* RW lock structure */ - 
DECLARE("RW_IND", LCK_RW_TAG_INDIRECT); - DECLARE("RW_PTR", offsetof(lck_rw_t *, lck_rw_ptr)); - DECLARE("RW_SHARED", LCK_RW_TYPE_SHARED); - DECLARE("RW_EXCL", LCK_RW_TYPE_EXCLUSIVE); - DECLARE("RW_EVENT", (((sizeof(lck_rw_t)-1))/sizeof(unsigned int))*sizeof(unsigned int)); - - /* values from low_trace.h */ - DECLARE("LTR_cpu", offsetof(struct LowTraceRecord *, LTR_cpu)); - DECLARE("LTR_excpt", offsetof(struct LowTraceRecord *, LTR_excpt)); - DECLARE("LTR_timeHi", offsetof(struct LowTraceRecord *, LTR_timeHi)); - DECLARE("LTR_timeLo", offsetof(struct LowTraceRecord *, LTR_timeLo)); - DECLARE("LTR_cr", offsetof(struct LowTraceRecord *, LTR_cr)); - DECLARE("LTR_srr0", offsetof(struct LowTraceRecord *, LTR_srr0)); - DECLARE("LTR_srr1", offsetof(struct LowTraceRecord *, LTR_srr1)); - DECLARE("LTR_dar", offsetof(struct LowTraceRecord *, LTR_dar)); - DECLARE("LTR_dsisr", offsetof(struct LowTraceRecord *, LTR_dsisr)); - DECLARE("LTR_rsvd0", offsetof(struct LowTraceRecord *, LTR_rsvd0)); - DECLARE("LTR_save", offsetof(struct LowTraceRecord *, LTR_save)); - DECLARE("LTR_lr", offsetof(struct LowTraceRecord *, LTR_lr)); - DECLARE("LTR_ctr", offsetof(struct LowTraceRecord *, LTR_ctr)); - DECLARE("LTR_r0", offsetof(struct LowTraceRecord *, LTR_r0)); - DECLARE("LTR_r1", offsetof(struct LowTraceRecord *, LTR_r1)); - DECLARE("LTR_r2", offsetof(struct LowTraceRecord *, LTR_r2)); - DECLARE("LTR_r3", offsetof(struct LowTraceRecord *, LTR_r3)); - DECLARE("LTR_r4", offsetof(struct LowTraceRecord *, LTR_r4)); - DECLARE("LTR_r5", offsetof(struct LowTraceRecord *, LTR_r5)); - DECLARE("LTR_r6", offsetof(struct LowTraceRecord *, LTR_r6)); - DECLARE("LTR_size", sizeof(struct LowTraceRecord)); - -/* Values from pexpert.h */ - DECLARE("PECFIcpurate", offsetof(struct clock_frequency_info_t *, cpu_clock_rate_hz)); - DECLARE("PECFIbusrate", offsetof(struct clock_frequency_info_t *, bus_clock_rate_hz)); - -/* Values from pmap_internals.h and mappings.h */ - - DECLARE("mpFlags", offsetof(struct 
mapping *, mpFlags)); - DECLARE("mpBusy", mpBusy); - DECLARE("mpPrevious", mpPrevious); - DECLARE("mpNext", mpNext); - DECLARE("mpPIndex", mpPIndex); - DECLARE("mpType", mpType); - DECLARE("mpNormal", mpNormal); - DECLARE("mpBlock", mpBlock); - DECLARE("mpMinSpecial", mpMinSpecial); - DECLARE("mpNest", mpNest); - DECLARE("mpLinkage", mpLinkage); - DECLARE("mpACID", mpACID); - DECLARE("mpGuest", mpGuest); - DECLARE("mpFIP", mpFIP); - DECLARE("mpFIPb", mpFIPb); - DECLARE("mpPcfg", mpPcfg); - DECLARE("mpPcfgb", mpPcfgb); - DECLARE("mpRIP", mpRIP); - DECLARE("mpRIPb", mpRIPb); - DECLARE("mpPerm", mpPerm); - DECLARE("mpPermb", mpPermb); - DECLARE("mpBSu", mpBSu); - DECLARE("mpBSub", mpBSub); - DECLARE("mpLists", mpLists); - DECLARE("mpListsb", mpListsb); - DECLARE("mpgFlags", mpgFlags); - DECLARE("mpgFree", mpgFree); - DECLARE("mpgGlobal", mpgGlobal); - DECLARE("mpgDormant", mpgDormant); - - DECLARE("mpSpace", offsetof(struct mapping *, mpSpace)); - DECLARE("mpBSize", offsetof(struct mapping *, u.mpBSize)); - DECLARE("mpgCursor", offsetof(struct mapping *, u.mpgCursor)); - DECLARE("mpPte", offsetof(struct mapping *, mpPte)); - DECLARE("mpHValid", mpHValid); - DECLARE("mpHValidb", mpHValidb); - - DECLARE("mpPAddr", offsetof(struct mapping *, mpPAddr)); - DECLARE("mpVAddr", offsetof(struct mapping *, mpVAddr)); - DECLARE("mpHWFlags", mpHWFlags); - DECLARE("mpHWFlagsb", mpHWFlagsb); - DECLARE("mpN", mpN); - DECLARE("mpNb", mpNb); - DECLARE("mpPP", mpPP); - DECLARE("mpPPb", mpPPb); - DECLARE("mpPPe", mpPPe); - DECLARE("mpKKN", mpKKN); - DECLARE("mpKKNb", mpKKNb); - DECLARE("mpWIMG", mpWIMG); - DECLARE("mpWIMGb", mpWIMGb); - DECLARE("mpW", mpW); - DECLARE("mpWb", mpWb); - DECLARE("mpI", mpI); - DECLARE("mpIb", mpIb); - DECLARE("mpM", mpM); - DECLARE("mpMb", mpMb); - DECLARE("mpG", mpG); - DECLARE("mpGb", mpGb); - DECLARE("mpWIMGe", mpWIMGe); - DECLARE("mpC", mpC); - DECLARE("mpCb", mpCb); - DECLARE("mpR", mpR); - DECLARE("mpRb", mpRb); - DECLARE("mpAlias", offsetof(struct 
mapping *, mpAlias)); - DECLARE("mpNestReloc", offsetof(struct mapping *, mpNestReloc)); - DECLARE("mpBlkRemCur", offsetof(struct mapping *, mpBlkRemCur)); - DECLARE("mpList0", offsetof(struct mapping *, mpList0)); - DECLARE("mpList ", offsetof(struct mapping *, mpList)); - DECLARE("mpBasicSize", mpBasicSize); - DECLARE("mpBasicLists", mpBasicLists); - - DECLARE("mbvrswap", offsetof(struct mappingblok *, mapblokvrswap)); - DECLARE("mbfree", offsetof(struct mappingblok *, mapblokfree)); - DECLARE("mapcsize", sizeof(struct mappingctl)); - - DECLARE("hwpPurgePTE", hwpPurgePTE); - DECLARE("hwpMergePTE", hwpMergePTE); - DECLARE("hwpNoopPTE", hwpNoopPTE); - -// DANGER WIL ROBINSON!!! This wonderfully magical tool doesn't seem to handle 64-bit constants, -// leaving us with only the cold ash of a zero. ppI, ppG, and who knows what else is affected. - DECLARE("ppLink", offsetof(struct phys_entry *, ppLink)); - DECLARE("ppLock", ppLock); - DECLARE("ppFlags", ppFlags); -// DECLARE("ppI", ppI); - DECLARE("ppIb", ppIb); -// DECLARE("ppG", ppG); - DECLARE("ppGb", ppGb); - DECLARE("ppR", ppR); - DECLARE("ppRb", ppRb); - DECLARE("ppC", ppC); - DECLARE("ppCb", ppCb); - DECLARE("physEntrySize",physEntrySize); - DECLARE("ppLFAmask", ppLFAmask); - DECLARE("ppLFArrot", ppLFArrot); - - DECLARE("pcfFlags", offsetof(struct pcfg *, pcfFlags)); - DECLARE("pcfEncode", offsetof(struct pcfg *, pcfEncode)); - DECLARE("pcfPSize", offsetof(struct pcfg *, pcfPSize)); - DECLARE("pcfShift", offsetof(struct pcfg *, pcfShift)); - DECLARE("pcfValid", pcfValid); - DECLARE("pcfLarge", pcfLarge); - DECLARE("pcfDedSeg", pcfDedSeg); - DECLARE("pcfSize", sizeof(struct pcfg)); - DECLARE("pcfDefPcfg", pcfDefPcfg); - DECLARE("pcfLargePcfg", pcfLargePcfg); - - DECLARE("PCAallo", offsetof(struct PCA *, flgs.PCAallo)); - DECLARE("PCAfree", offsetof(struct PCA *, flgs.PCAalflgs.PCAfree)); - DECLARE("PCAauto", offsetof(struct PCA *, flgs.PCAalflgs.PCAauto)); - DECLARE("PCAmisc", offsetof(struct PCA *, 
flgs.PCAalflgs.PCAmisc)); - DECLARE("PCAlock", PCAlock); - DECLARE("PCAlockb", PCAlockb); - DECLARE("PCAsteal", offsetof(struct PCA *, flgs.PCAalflgs.PCAsteal)); - - DECLARE("mrPhysTab", offsetof(struct mem_region *, mrPhysTab)); - DECLARE("mrStart", offsetof(struct mem_region *, mrStart)); - DECLARE("mrEnd", offsetof(struct mem_region *, mrEnd)); - DECLARE("mrAStart", offsetof(struct mem_region *, mrAStart)); - DECLARE("mrAEnd", offsetof(struct mem_region *, mrAEnd)); - DECLARE("mrSize", sizeof(struct mem_region)); - - DECLARE("mapRemChunk", mapRemChunk); - - DECLARE("mapRetCode", mapRetCode); - DECLARE("mapRtOK", mapRtOK); - DECLARE("mapRtBadLk", mapRtBadLk); - DECLARE("mapRtPerm", mapRtPerm); - DECLARE("mapRtNotFnd", mapRtNotFnd); - DECLARE("mapRtBlock", mapRtBlock); - DECLARE("mapRtNest", mapRtNest); - DECLARE("mapRtRemove", mapRtRemove); - DECLARE("mapRtMapDup", mapRtMapDup); - DECLARE("mapRtGuest", mapRtGuest); - DECLARE("mapRtEmpty", mapRtEmpty); - DECLARE("mapRtSmash", mapRtSmash); - -#if 0 - DECLARE("MFpcaptr", offsetof(struct mappingflush *, pcaptr)); - DECLARE("MFmappingcnt", offsetof(struct mappingflush *, mappingcnt)); - DECLARE("MFmapping", offsetof(struct mappingflush *, mapping)); - DECLARE("MFmappingSize", sizeof(struct mfmapping)); -#endif - - DECLARE("GV_GROUPS_LG2", GV_GROUPS_LG2); - DECLARE("GV_GROUPS", GV_GROUPS); - DECLARE("GV_SLOT_SZ_LG2", GV_SLOT_SZ_LG2); - DECLARE("GV_SLOT_SZ", GV_SLOT_SZ); - DECLARE("GV_SLOTS_LG2", GV_SLOTS_LG2); - DECLARE("GV_SLOTS", GV_SLOTS); - DECLARE("GV_PGIDX_SZ_LG2", GV_PGIDX_SZ_LG2); - DECLARE("GV_PAGE_SZ_LG2", GV_PAGE_SZ_LG2); - DECLARE("GV_PAGE_SZ", GV_PAGE_SZ); - DECLARE("GV_PAGE_MASK", GV_PAGE_MASK); - DECLARE("GV_HPAGES", GV_HPAGES); - DECLARE("GV_GRPS_PPG_LG2", GV_GRPS_PPG_LG2); - DECLARE("GV_GRPS_PPG", GV_GRPS_PPG); - DECLARE("GV_GRP_MASK", GV_GRP_MASK); - DECLARE("GV_SLOT_MASK", GV_SLOT_MASK); - DECLARE("GV_HPAGE_SHIFT", GV_HPAGE_SHIFT); - DECLARE("GV_HPAGE_MASK", GV_HPAGE_MASK); - DECLARE("GV_HGRP_SHIFT", 
GV_HGRP_SHIFT); - DECLARE("GV_HGRP_MASK", GV_HGRP_MASK); - DECLARE("GV_MAPWD_BITS_LG2",GV_MAPWD_BITS_LG2); - DECLARE("GV_MAPWD_SZ_LG2", GV_MAPWD_SZ_LG2); - DECLARE("GV_MAP_WORDS", GV_MAP_WORDS); - DECLARE("GV_MAP_MASK", GV_MAP_MASK); - DECLARE("GV_MAP_SHIFT", GV_MAP_SHIFT); - DECLARE("GV_BAND_SHIFT", GV_BAND_SHIFT); - DECLARE("GV_BAND_SZ_LG2", GV_BAND_SZ_LG2); - DECLARE("GV_BAND_MASK", GV_BAND_MASK); - -#if 1 - DECLARE("GDsave", offsetof(struct GDWorkArea *, GDsave)); - DECLARE("GDfp0", offsetof(struct GDWorkArea *, GDfp0)); - DECLARE("GDfp1", offsetof(struct GDWorkArea *, GDfp1)); - DECLARE("GDfp2", offsetof(struct GDWorkArea *, GDfp2)); - DECLARE("GDfp3", offsetof(struct GDWorkArea *, GDfp3)); - DECLARE("GDtop", offsetof(struct GDWorkArea *, GDtop)); - DECLARE("GDleft", offsetof(struct GDWorkArea *, GDleft)); - DECLARE("GDtopleft", offsetof(struct GDWorkArea *, GDtopleft)); - DECLARE("GDrowbytes", offsetof(struct GDWorkArea *, GDrowbytes)); - DECLARE("GDrowchar", offsetof(struct GDWorkArea *, GDrowchar)); - DECLARE("GDdepth", offsetof(struct GDWorkArea *, GDdepth)); - DECLARE("GDcollgn", offsetof(struct GDWorkArea *, GDcollgn)); - DECLARE("GDready", offsetof(struct GDWorkArea *, GDready)); - DECLARE("GDrowbuf1", offsetof(struct GDWorkArea *, GDrowbuf1)); - DECLARE("GDrowbuf2", offsetof(struct GDWorkArea *, GDrowbuf2)); -#endif - - DECLARE("enaExpTrace", enaExpTrace); - DECLARE("enaExpTraceb", enaExpTraceb); - DECLARE("enaUsrFCall", enaUsrFCall); - DECLARE("enaUsrFCallb", enaUsrFCallb); - DECLARE("enaUsrPhyMp", enaUsrPhyMp); - DECLARE("enaUsrPhyMpb", enaUsrPhyMpb); - DECLARE("enaDiagSCs", enaDiagSCs); - DECLARE("enaDiagSCsb", enaDiagSCsb); - DECLARE("enaDiagEM", enaDiagEM); - DECLARE("enaDiagEMb", enaDiagEMb); - DECLARE("enaNotifyEM", enaNotifyEM); - DECLARE("enaNotifyEMb", enaNotifyEMb); - DECLARE("disLkType", disLkType); - DECLARE("disLktypeb", disLktypeb); - DECLARE("disLkThread", disLkThread); - DECLARE("disLkThreadb", disLkThreadb); - DECLARE("enaLkExtStck", 
enaLkExtStck); - DECLARE("enaLkExtStckb",enaLkExtStckb); - DECLARE("disLkMyLck", disLkMyLck); - DECLARE("disLkMyLckb", disLkMyLckb); - DECLARE("dgMisc1", offsetof(struct diagWork *, dgMisc1)); - DECLARE("dgMisc2", offsetof(struct diagWork *, dgMisc2)); - DECLARE("dgMisc3", offsetof(struct diagWork *, dgMisc3)); - DECLARE("dgMisc4", offsetof(struct diagWork *, dgMisc4)); - DECLARE("dgMisc5", offsetof(struct diagWork *, dgMisc5)); - - DECLARE("SACnext", offsetof(struct savearea_comm *, sac_next)); - DECLARE("SACprev", offsetof(struct savearea_comm *, sac_prev)); - DECLARE("SACvrswap", offsetof(struct savearea_comm *, sac_vrswap)); - DECLARE("SACalloc", offsetof(struct savearea_comm *, sac_alloc)); - DECLARE("SACflags", offsetof(struct savearea_comm *, sac_flags)); - DECLARE("sac_cnt", sac_cnt); - DECLARE("sac_empty", sac_empty); - DECLARE("sac_perm", sac_perm); - DECLARE("sac_permb", sac_permb); - - DECLARE("LocalSaveTarget", LocalSaveTarget); - DECLARE("LocalSaveMin", LocalSaveMin); - DECLARE("LocalSaveMax", LocalSaveMax); - DECLARE("FreeListMin", FreeListMin); - DECLARE("SaveLowHysteresis", SaveLowHysteresis); - DECLARE("SaveHighHysteresis", SaveHighHysteresis); - DECLARE("InitialSaveAreas", InitialSaveAreas); - DECLARE("InitialSaveTarget", InitialSaveTarget); - DECLARE("InitialSaveBloks", InitialSaveBloks); - - DECLARE("SAVprev", offsetof(struct savearea_comm *, save_prev)); - DECLARE("SAVact", offsetof(struct savearea_comm *, save_act)); - DECLARE("SAVflags", offsetof(struct savearea_comm *, save_flags)); - DECLARE("SAVlevel", offsetof(struct savearea_comm *, save_level)); - DECLARE("SAVtime", offsetof(struct savearea_comm *, save_time)); - DECLARE("savemisc0", offsetof(struct savearea_comm *, save_misc0)); - DECLARE("savemisc1", offsetof(struct savearea_comm *, save_misc1)); - DECLARE("savemisc2", offsetof(struct savearea_comm *, save_misc2)); - DECLARE("savemisc3", offsetof(struct savearea_comm *, save_misc3)); - - DECLARE("SAVsize", sizeof(struct savearea)); - 
DECLARE("SAVsizefpu", sizeof(struct savearea_vec)); - DECLARE("SAVsizevec", sizeof(struct savearea_fpu)); - DECLARE("SAVcommsize", sizeof(struct savearea_comm)); - - DECLARE("savesrr0", offsetof(struct savearea *, save_srr0)); - DECLARE("savesrr1", offsetof(struct savearea *, save_srr1)); - DECLARE("savecr", offsetof(struct savearea *, save_cr)); - DECLARE("savexer", offsetof(struct savearea *, save_xer)); - DECLARE("savelr", offsetof(struct savearea *, save_lr)); - DECLARE("savectr", offsetof(struct savearea *, save_ctr)); - DECLARE("savedar", offsetof(struct savearea *, save_dar)); - DECLARE("savedsisr", offsetof(struct savearea *, save_dsisr)); - DECLARE("saveexception", offsetof(struct savearea *, save_exception)); - DECLARE("savefpscrpad", offsetof(struct savearea *, save_fpscrpad)); - DECLARE("savefpscr", offsetof(struct savearea *, save_fpscr)); - DECLARE("savevrsave", offsetof(struct savearea *, save_vrsave)); - DECLARE("savevscr", offsetof(struct savearea *, save_vscr)); - - DECLARE("savemmcr0", offsetof(struct savearea *, save_mmcr0)); - DECLARE("savemmcr1", offsetof(struct savearea *, save_mmcr1)); - DECLARE("savemmcr2", offsetof(struct savearea *, save_mmcr2)); - DECLARE("savepmc", offsetof(struct savearea *, save_pmc)); - - DECLARE("saveinstr", offsetof(struct savearea *, save_instr)); - - DECLARE("savexdat0", offsetof(struct savearea *, save_xdat0)); - DECLARE("savexdat1", offsetof(struct savearea *, save_xdat1)); - DECLARE("savexdat2", offsetof(struct savearea *, save_xdat2)); - DECLARE("savexdat3", offsetof(struct savearea *, save_xdat3)); - - DECLARE("saver0", offsetof(struct savearea *, save_r0)); - DECLARE("saver1", offsetof(struct savearea *, save_r1)); - DECLARE("saver2", offsetof(struct savearea *, save_r2)); - DECLARE("saver3", offsetof(struct savearea *, save_r3)); - DECLARE("saver4", offsetof(struct savearea *, save_r4)); - DECLARE("saver5", offsetof(struct savearea *, save_r5)); - DECLARE("saver6", offsetof(struct savearea *, save_r6)); - 
DECLARE("saver7", offsetof(struct savearea *, save_r7)); - DECLARE("saver8", offsetof(struct savearea *, save_r8)); - DECLARE("saver9", offsetof(struct savearea *, save_r9)); - DECLARE("saver10", offsetof(struct savearea *, save_r10)); - DECLARE("saver11", offsetof(struct savearea *, save_r11)); - DECLARE("saver12", offsetof(struct savearea *, save_r12)); - DECLARE("saver13", offsetof(struct savearea *, save_r13)); - DECLARE("saver14", offsetof(struct savearea *, save_r14)); - DECLARE("saver15", offsetof(struct savearea *, save_r15)); - DECLARE("saver16", offsetof(struct savearea *, save_r16)); - DECLARE("saver17", offsetof(struct savearea *, save_r17)); - DECLARE("saver18", offsetof(struct savearea *, save_r18)); - DECLARE("saver19", offsetof(struct savearea *, save_r19)); - DECLARE("saver20", offsetof(struct savearea *, save_r20)); - DECLARE("saver21", offsetof(struct savearea *, save_r21)); - DECLARE("saver22", offsetof(struct savearea *, save_r22)); - DECLARE("saver23", offsetof(struct savearea *, save_r23)); - DECLARE("saver24", offsetof(struct savearea *, save_r24)); - DECLARE("saver25", offsetof(struct savearea *, save_r25)); - DECLARE("saver26", offsetof(struct savearea *, save_r26)); - DECLARE("saver27", offsetof(struct savearea *, save_r27)); - DECLARE("saver28", offsetof(struct savearea *, save_r28)); - DECLARE("saver29", offsetof(struct savearea *, save_r29)); - DECLARE("saver30", offsetof(struct savearea *, save_r30)); - DECLARE("saver31", offsetof(struct savearea *, save_r31)); - - DECLARE("savefp0", offsetof(struct savearea_fpu *, save_fp0)); - DECLARE("savefp1", offsetof(struct savearea_fpu *, save_fp1)); - DECLARE("savefp2", offsetof(struct savearea_fpu *, save_fp2)); - DECLARE("savefp3", offsetof(struct savearea_fpu *, save_fp3)); - DECLARE("savefp4", offsetof(struct savearea_fpu *, save_fp4)); - DECLARE("savefp5", offsetof(struct savearea_fpu *, save_fp5)); - DECLARE("savefp6", offsetof(struct savearea_fpu *, save_fp6)); - DECLARE("savefp7", 
offsetof(struct savearea_fpu *, save_fp7)); - DECLARE("savefp8", offsetof(struct savearea_fpu *, save_fp8)); - DECLARE("savefp9", offsetof(struct savearea_fpu *, save_fp9)); - DECLARE("savefp10", offsetof(struct savearea_fpu *, save_fp10)); - DECLARE("savefp11", offsetof(struct savearea_fpu *, save_fp11)); - DECLARE("savefp12", offsetof(struct savearea_fpu *, save_fp12)); - DECLARE("savefp13", offsetof(struct savearea_fpu *, save_fp13)); - DECLARE("savefp14", offsetof(struct savearea_fpu *, save_fp14)); - DECLARE("savefp15", offsetof(struct savearea_fpu *, save_fp15)); - DECLARE("savefp16", offsetof(struct savearea_fpu *, save_fp16)); - DECLARE("savefp17", offsetof(struct savearea_fpu *, save_fp17)); - DECLARE("savefp18", offsetof(struct savearea_fpu *, save_fp18)); - DECLARE("savefp19", offsetof(struct savearea_fpu *, save_fp19)); - DECLARE("savefp20", offsetof(struct savearea_fpu *, save_fp20)); - DECLARE("savefp21", offsetof(struct savearea_fpu *, save_fp21)); - DECLARE("savefp22", offsetof(struct savearea_fpu *, save_fp22)); - DECLARE("savefp23", offsetof(struct savearea_fpu *, save_fp23)); - DECLARE("savefp24", offsetof(struct savearea_fpu *, save_fp24)); - DECLARE("savefp25", offsetof(struct savearea_fpu *, save_fp25)); - DECLARE("savefp26", offsetof(struct savearea_fpu *, save_fp26)); - DECLARE("savefp27", offsetof(struct savearea_fpu *, save_fp27)); - DECLARE("savefp28", offsetof(struct savearea_fpu *, save_fp28)); - DECLARE("savefp29", offsetof(struct savearea_fpu *, save_fp29)); - DECLARE("savefp30", offsetof(struct savearea_fpu *, save_fp30)); - DECLARE("savefp31", offsetof(struct savearea_fpu *, save_fp31)); - - DECLARE("savevr0", offsetof(struct savearea_vec *, save_vr0)); - DECLARE("savevr1", offsetof(struct savearea_vec *, save_vr1)); - DECLARE("savevr2", offsetof(struct savearea_vec *, save_vr2)); - DECLARE("savevr3", offsetof(struct savearea_vec *, save_vr3)); - DECLARE("savevr4", offsetof(struct savearea_vec *, save_vr4)); - DECLARE("savevr5", 
offsetof(struct savearea_vec *, save_vr5)); - DECLARE("savevr6", offsetof(struct savearea_vec *, save_vr6)); - DECLARE("savevr7", offsetof(struct savearea_vec *, save_vr7)); - DECLARE("savevr8", offsetof(struct savearea_vec *, save_vr8)); - DECLARE("savevr9", offsetof(struct savearea_vec *, save_vr9)); - DECLARE("savevr10", offsetof(struct savearea_vec *, save_vr10)); - DECLARE("savevr11", offsetof(struct savearea_vec *, save_vr11)); - DECLARE("savevr12", offsetof(struct savearea_vec *, save_vr12)); - DECLARE("savevr13", offsetof(struct savearea_vec *, save_vr13)); - DECLARE("savevr14", offsetof(struct savearea_vec *, save_vr14)); - DECLARE("savevr15", offsetof(struct savearea_vec *, save_vr15)); - DECLARE("savevr16", offsetof(struct savearea_vec *, save_vr16)); - DECLARE("savevr17", offsetof(struct savearea_vec *, save_vr17)); - DECLARE("savevr18", offsetof(struct savearea_vec *, save_vr18)); - DECLARE("savevr19", offsetof(struct savearea_vec *, save_vr19)); - DECLARE("savevr20", offsetof(struct savearea_vec *, save_vr20)); - DECLARE("savevr21", offsetof(struct savearea_vec *, save_vr21)); - DECLARE("savevr22", offsetof(struct savearea_vec *, save_vr22)); - DECLARE("savevr23", offsetof(struct savearea_vec *, save_vr23)); - DECLARE("savevr24", offsetof(struct savearea_vec *, save_vr24)); - DECLARE("savevr25", offsetof(struct savearea_vec *, save_vr25)); - DECLARE("savevr26", offsetof(struct savearea_vec *, save_vr26)); - DECLARE("savevr27", offsetof(struct savearea_vec *, save_vr27)); - DECLARE("savevr28", offsetof(struct savearea_vec *, save_vr28)); - DECLARE("savevr29", offsetof(struct savearea_vec *, save_vr29)); - DECLARE("savevr30", offsetof(struct savearea_vec *, save_vr30)); - DECLARE("savevr31", offsetof(struct savearea_vec *, save_vr31)); - DECLARE("savevrvalid", offsetof(struct savearea_vec *, save_vrvalid)); - - /* PseudoKernel Exception Descriptor info */ - DECLARE("BEDA_SRR0", offsetof(BEDA_t *, srr0)); - DECLARE("BEDA_SRR1", offsetof(BEDA_t *, srr1)); 
- DECLARE("BEDA_SPRG0", offsetof(BEDA_t *, sprg0)); - DECLARE("BEDA_SPRG1", offsetof(BEDA_t *, sprg1)); - - /* PseudoKernel Interrupt Control Word */ - DECLARE("BTTD_INTCONTROLWORD", offsetof(BTTD_t *, InterruptControlWord)); - - /* New state when exiting the pseudokernel */ - DECLARE("BTTD_NEWEXITSTATE", offsetof(BTTD_t *, NewExitState)); - - /* PseudoKernel Test/Post Interrupt */ - DECLARE("BTTD_TESTINTMASK", offsetof(BTTD_t *, testIntMask)); - DECLARE("BTTD_POSTINTMASK", offsetof(BTTD_t *, postIntMask)); - - /* PseudoKernel Vectors */ - DECLARE("BTTD_TRAP_VECTOR", offsetof(BTTD_t *, TrapVector)); - DECLARE("BTTD_SYSCALL_VECTOR", offsetof(BTTD_t *, SysCallVector)); - DECLARE("BTTD_INTERRUPT_VECTOR", offsetof(BTTD_t *, InterruptVector)); - DECLARE("BTTD_PENDINGINT_VECTOR", offsetof(BTTD_t *, PendingIntVector)); - - /* PseudoKernel Bits, Masks and misc */ - DECLARE("SYSCONTEXTSTATE", kInSystemContext); - DECLARE("PSEUDOKERNELSTATE", kInPseudoKernel); - DECLARE("INTSTATEMASK_B", 12); - DECLARE("INTSTATEMASK_E", 15); - DECLARE("INTCR2MASK_B", 8); - DECLARE("INTCR2MASK_E", 11); - DECLARE("INTBACKUPCR2MASK_B", 28); - DECLARE("INTBACKUPCR2MASK_E", 31); - DECLARE("INTCR2TOBACKUPSHIFT", kCR2ToBackupShift); - DECLARE("BB_MAX_TRAP", bbMaxTrap); - DECLARE("BB_RFI_TRAP", bbRFITrap); - - /* Various hackery */ - DECLARE("procState", offsetof(struct processor *, state)); - - DECLARE("CPU_SUBTYPE_POWERPC_ALL", CPU_SUBTYPE_POWERPC_ALL); - DECLARE("CPU_SUBTYPE_POWERPC_750", CPU_SUBTYPE_POWERPC_750); - DECLARE("CPU_SUBTYPE_POWERPC_7400", CPU_SUBTYPE_POWERPC_7400); - DECLARE("CPU_SUBTYPE_POWERPC_7450", CPU_SUBTYPE_POWERPC_7450); - DECLARE("CPU_SUBTYPE_POWERPC_970", CPU_SUBTYPE_POWERPC_970); - - DECLARE("shdIBAT", offsetof(struct shadowBAT *, IBATs)); - DECLARE("shdDBAT", offsetof(struct shadowBAT *, DBATs)); - - /* Low Memory Globals */ - - DECLARE("lgVerCode", offsetof(struct lowglo *, lgVerCode)); - DECLARE("lgPPStart", offsetof(struct lowglo *, lgPPStart)); - DECLARE("maxDec", 
offsetof(struct lowglo *, lgMaxDec)); - DECLARE("mckFlags", offsetof(struct lowglo *, lgMckFlags)); - DECLARE("lgPMWvaddr", offsetof(struct lowglo *, lgPMWvaddr)); - DECLARE("lgUMWvaddr", offsetof(struct lowglo *, lgUMWvaddr)); - DECLARE("trcWork", offsetof(struct lowglo *, lgTrcWork)); - DECLARE("traceMask", offsetof(struct lowglo *, lgTrcWork.traceMask)); - DECLARE("traceCurr", offsetof(struct lowglo *, lgTrcWork.traceCurr)); - DECLARE("traceStart", offsetof(struct lowglo *, lgTrcWork.traceStart)); - DECLARE("traceEnd", offsetof(struct lowglo *, lgTrcWork.traceEnd)); - DECLARE("traceMsnd", offsetof(struct lowglo *, lgTrcWork.traceMsnd)); - - DECLARE("Zero", offsetof(struct lowglo *, lgZero)); - DECLARE("saveanchor", offsetof(struct lowglo *, lgSaveanchor)); - - DECLARE("SVlock", offsetof(struct lowglo *, lgSaveanchor.savelock)); - DECLARE("SVpoolfwd", offsetof(struct lowglo *, lgSaveanchor.savepoolfwd)); - DECLARE("SVpoolbwd", offsetof(struct lowglo *, lgSaveanchor.savepoolbwd)); - DECLARE("SVfree", offsetof(struct lowglo *, lgSaveanchor.savefree)); - DECLARE("SVfreecnt", offsetof(struct lowglo *, lgSaveanchor.savefreecnt)); - DECLARE("SVadjust", offsetof(struct lowglo *, lgSaveanchor.saveadjust)); - DECLARE("SVinuse", offsetof(struct lowglo *, lgSaveanchor.saveinuse)); - DECLARE("SVtarget", offsetof(struct lowglo *, lgSaveanchor.savetarget)); - DECLARE("SVsaveinusesnapshot", offsetof(struct lowglo *, lgSaveanchor.saveinusesnapshot)); - DECLARE("SVsavefreesnapshot", offsetof(struct lowglo *, lgSaveanchor.savefreesnapshot)); - DECLARE("SVsize", sizeof(struct Saveanchor)); - - DECLARE("tlbieLock", offsetof(struct lowglo *, lgTlbieLck)); - - DECLARE("dgFlags", offsetof(struct lowglo *, lgdgWork.dgFlags)); - DECLARE("dgLock", offsetof(struct lowglo *, lgdgWork.dgLock)); - DECLARE("dgMisc0", offsetof(struct lowglo *, lgdgWork.dgMisc0)); - - DECLARE("lglcksWork", offsetof(struct lowglo *, lglcksWork)); - DECLARE("lgKillResv", offsetof(struct lowglo *, lgKillResv)); - 
DECLARE("lgpPcfg", offsetof(struct lowglo *, lgpPcfg)); - - - DECLARE("scomcpu", offsetof(struct scomcomm *, scomcpu)); - DECLARE("scomfunc", offsetof(struct scomcomm *, scomfunc)); - DECLARE("scomreg", offsetof(struct scomcomm *, scomreg)); - DECLARE("scomstat", offsetof(struct scomcomm *, scomstat)); - DECLARE("scomdata", offsetof(struct scomcomm *, scomdata)); - -#if CONFIG_DTRACE - DECLARE("LS_LCK_MTX_UNLOCK_RELEASE", LS_LCK_MTX_UNLOCK_RELEASE); - DECLARE("LS_LCK_MTX_LOCK_ACQUIRE", LS_LCK_MTX_LOCK_ACQUIRE); -#endif - - return(0); /* For ANSI C :-) */ -} diff --git a/osfmk/ppc/hexfont.h b/osfmk/ppc/hexfont.h deleted file mode 100644 index 38035a0c9..000000000 --- a/osfmk/ppc/hexfont.h +++ /dev/null @@ -1,301 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* 0123456789ABCDEF */ - -hexfont: .short 0x0000 /* 0b0000000000000000 */ - .short 0x07C0 /* 0b0000011111000000 */ - .short 0x1FF0 /* 0b0001111111110000 */ - .short 0x3C78 /* 0b0011110001111000 */ - .short 0x3838 /* 0b0011100000111000 */ - .short 0x701C /* 0b0111000000011100 */ - .short 0x701C /* 0b0111000000011100 */ - .short 0x701C /* 0b0111000000011100 */ - .short 0x701C /* 0b0111000000011100 */ - .short 0x701C /* 0b0111000000011100 */ - .short 0x701C /* 0b0111000000011100 */ - .short 0x3838 /* 0b0011100000111000 */ - .short 0x3C78 /* 0b0011110001111000 */ - .short 0x1FF0 /* 0b0001111111110000 */ - .short 0x07C0 /* 0b0000011111000000 */ - .short 0x0000 /* 0b0000000000000000 */ - - .short 0x0000 /* 0b0000000000000000 */ - .short 0x0080 /* 0b0000000010000000 */ - .short 0x0180 /* 0b0000000110000000 */ - .short 0x0380 /* 0b0000001110000000 */ - .short 0x0780 /* 0b0000011110000000 */ - .short 0x0F80 /* 0b0000111110000000 */ - .short 0x1F80 /* 0b0001111110000000 */ - .short 0x0380 /* 0b0000001110000000 */ - .short 0x0380 /* 0b0000001110000000 */ - .short 0x0380 /* 0b0000001110000000 */ - .short 0x0380 /* 0b0000001110000000 */ - .short 0x0380 /* 0b0000001110000000 */ - .short 0x0380 /* 0b0000001110000000 */ - .short 0x1FF0 /* 0b0001111111110000 */ - .short 0x1FF0 /* 0b0001111111110000 */ - .short 0x0000 /* 0b0000000000000000 */ - - .short 0x0000 /* 0b0000000000000000 */ - .short 0x0FE0 /* 0b0000111111100000 */ - .short 0x1FF0 /* 0b0001111111110000 */ - .short 0x3838 /* 0b0011100000111000 */ - .short 0x7038 /* 0b0111000000111000 */ - .short 0x0038 /* 0b0000000000111000 */ - .short 0x0038 /* 0b0000000000111000 */ - .short 0x00F0 /* 0b0000000011110000 */ - .short 0x01E0 /* 0b0000000111100000 */ - .short 0x0380 /* 0b0000001110000000 */ - .short 0x0F00 /* 0b0000111100000000 */ - .short 0x1C00 /* 0b0001110000000000 
*/ - .short 0x3800 /* 0b0011100000000000 */ - .short 0x7FFC /* 0b0111111111111100 */ - .short 0x7FFC /* 0b0111111111111100 */ - .short 0x0000 /* 0b0000000000000000 */ - - .short 0x0000 /* 0b0000000000000000 */ - .short 0x0FE0 /* 0b0000111111100000 */ - .short 0x1FF0 /* 0b0001111111110000 */ - .short 0x3838 /* 0b0011100000111000 */ - .short 0x3838 /* 0b0011100000111000 */ - .short 0x0038 /* 0b0000000000111000 */ - .short 0x0038 /* 0b0000000000111000 */ - .short 0x00F0 /* 0b0000000011110000 */ - .short 0x00F0 /* 0b0000000011110000 */ - .short 0x0038 /* 0b0000000000111000 */ - .short 0x0038 /* 0b0000000000111000 */ - .short 0x3838 /* 0b0011100000111000 */ - .short 0x3838 /* 0b0011100000111000 */ - .short 0x1FF0 /* 0b0001111111110000 */ - .short 0x0FE0 /* 0b0000111111100000 */ - .short 0x0000 /* 0b0000000000000000 */ - - .short 0x0000 /* 0b0000000000000000 */ - .short 0x0020 /* 0b0000000000100000 */ - .short 0x0060 /* 0b0000000001100000 */ - .short 0x00E0 /* 0b0000000011100000 */ - .short 0x01E0 /* 0b0000000111100000 */ - .short 0x03E0 /* 0b0000001111100000 */ - .short 0x07E0 /* 0b0000011111100000 */ - .short 0x0EE0 /* 0b0000111011100000 */ - .short 0x1CE0 /* 0b0001110011100000 */ - .short 0x3FF8 /* 0b0011111111111000 */ - .short 0x7FF8 /* 0b0111111111111000 */ - .short 0x00E0 /* 0b0000000011100000 */ - .short 0x00E0 /* 0b0000000011100000 */ - .short 0x00E0 /* 0b0000000011100000 */ - .short 0x00E0 /* 0b0000000011100000 */ - .short 0x0000 /* 0b0000000000000000 */ - - .short 0x0000 /* 0b0000000000000000 */ - .short 0x3FF8 /* 0b0011111111111000 */ - .short 0x3FF8 /* 0b0011111111111000 */ - .short 0x3800 /* 0b0011100000000000 */ - .short 0x3800 /* 0b0011100000000000 */ - .short 0x3FC0 /* 0b0011111111000000 */ - .short 0x3FF0 /* 0b0011111111110000 */ - .short 0x3870 /* 0b0011100001110000 */ - .short 0x3038 /* 0b0011000000111000 */ - .short 0x0038 /* 0b0000000000111000 */ - .short 0x0038 /* 0b0000000000111000 */ - .short 0x0038 /* 0b0000000000111000 */ - .short 0x3838 /* 
0b0011100000111000 */ - .short 0x3FF0 /* 0b0011111111110000 */ - .short 0x0FC0 /* 0b0000111111000000 */ - .short 0x0000 /* 0b0000000000000000 */ - - .short 0x0000 /* 0b0000000000000000 */ - .short 0x0070 /* 0b0000000001110000 */ - .short 0x00E0 /* 0b0000000011100000 */ - .short 0x01C0 /* 0b0000000111000000 */ - .short 0x0380 /* 0b0000001110000000 */ - .short 0x0700 /* 0b0000011100000000 */ - .short 0x0E00 /* 0b0000111000000000 */ - .short 0x1C00 /* 0b0001110000000000 */ - .short 0x3FE0 /* 0b0011111111100000 */ - .short 0x3FF8 /* 0b0011111111111000 */ - .short 0x3838 /* 0b0011100000111000 */ - .short 0x3838 /* 0b0011100000111000 */ - .short 0x3838 /* 0b0011100000111000 */ - .short 0x1FF0 /* 0b0001111111110000 */ - .short 0x0FE0 /* 0b0000111111100000 */ - .short 0x0000 /* 0b0000000000000000 */ - - .short 0x0000 /* 0b0000000000000000 */ - .short 0x3FFC /* 0b0011111111111100 */ - .short 0x3FFC /* 0b0011111111111100 */ - .short 0x003C /* 0b0000000000111100 */ - .short 0x0038 /* 0b0000000000111000 */ - .short 0x0070 /* 0b0000000001110000 */ - .short 0x0070 /* 0b0000000001110000 */ - .short 0x00E0 /* 0b0000000011100000 */ - .short 0x00E0 /* 0b0000000011100000 */ - .short 0x01C0 /* 0b0000000111000000 */ - .short 0x01C0 /* 0b0000000111000000 */ - .short 0x0380 /* 0b0000001110000000 */ - .short 0x0380 /* 0b0000001110000000 */ - .short 0x0700 /* 0b0000011100000000 */ - .short 0x0700 /* 0b0000011100000000 */ - .short 0x0000 /* 0b0000000000000000 */ - - .short 0x0000 /* 0b0000000000000000 */ - .short 0x0FE0 /* 0b0000111111100000 */ - .short 0x1FF0 /* 0b0001111111110000 */ - .short 0x3838 /* 0b0011100000111000 */ - .short 0x3838 /* 0b0011100000111000 */ - .short 0x3838 /* 0b0011100000111000 */ - .short 0x1FF0 /* 0b0001111111110000 */ - .short 0x0FE0 /* 0b0000111111100000 */ - .short 0x1FF0 /* 0b0001111111110000 */ - .short 0x3838 /* 0b0011100000111000 */ - .short 0x3838 /* 0b0011100000111000 */ - .short 0x3838 /* 0b0011100000111000 */ - .short 0x3838 /* 0b0011100000111000 */ -
.short 0x1FF0 /* 0b0001111111110000 */ - .short 0x0FE0 /* 0b0000111111100000 */ - .short 0x0000 /* 0b0000000000000000 */ - - .short 0x0000 /* 0b0000000000000000 */ - .short 0x0FE0 /* 0b0000111111100000 */ - .short 0x1FF8 /* 0b0001111111111000 */ - .short 0x3838 /* 0b0011100000111000 */ - .short 0x383C /* 0b0011100000111100 */ - .short 0x383C /* 0b0011100000111100 */ - .short 0x1FFC /* 0b0001111111111100 */ - .short 0x0FF8 /* 0b0000111111111000 */ - .short 0x0078 /* 0b0000000001111000 */ - .short 0x0070 /* 0b0000000001110000 */ - .short 0x00E0 /* 0b0000000011100000 */ - .short 0x01C0 /* 0b0000000111000000 */ - .short 0x0380 /* 0b0000001110000000 */ - .short 0x0700 /* 0b0000011100000000 */ - .short 0x0E00 /* 0b0000111000000000 */ - .short 0x0000 /* 0b0000000000000000 */ - - .short 0x0000 /* 0b0000000000000000 */ - .short 0x07E0 /* 0b0000011111100000 */ - .short 0x0FF0 /* 0b0000111111110000 */ - .short 0x1C38 /* 0b0001110000111000 */ - .short 0x1C38 /* 0b0001110000111000 */ - .short 0x381C /* 0b0011100000011100 */ - .short 0x381C /* 0b0011100000011100 */ - .short 0x3FFC /* 0b0011111111111100 */ - .short 0x3FFC /* 0b0011111111111100 */ - .short 0x381C /* 0b0011100000011100 */ - .short 0x381C /* 0b0011100000011100 */ - .short 0x381C /* 0b0011100000011100 */ - .short 0x381C /* 0b0011100000011100 */ - .short 0x381C /* 0b0011100000011100 */ - .short 0x381C /* 0b0011100000011100 */ - .short 0x0000 /* 0b0000000000000000 */ - - .short 0x0000 /* 0b0000000000000000 */ - .short 0x3FE0 /* 0b0011111111100000 */ - .short 0x3FF0 /* 0b0011111111110000 */ - .short 0x3838 /* 0b0011100000111000 */ - .short 0x3838 /* 0b0011100000111000 */ - .short 0x3838 /* 0b0011100000111000 */ - .short 0x3FF8 /* 0b0011111111111000 */ - .short 0x3FF0 /* 0b0011111111110000 */ - .short 0x3838 /* 0b0011100000111000 */ - .short 0x3838 /* 0b0011100000111000 */ - .short 0x381C /* 0b0011100000011100 */ - .short 0x381C /* 0b0011100000011100 */ - .short 0x383C /* 0b0011100000111100 */ - .short 0x3FF8 /*
0b0011111111111000 */ - .short 0x3FE0 /* 0b0011111111100000 */ - .short 0x0000 /* 0b0000000000000000 */ - - .short 0x0000 /* 0b0000000000000000 */ - .short 0x0FE0 /* 0b0000111111100000 */ - .short 0x1FF0 /* 0b0001111111110000 */ - .short 0x3838 /* 0b0011100000111000 */ - .short 0x3800 /* 0b0011100000000000 */ - .short 0x7000 /* 0b0111000000000000 */ - .short 0x7000 /* 0b0111000000000000 */ - .short 0x7000 /* 0b0111000000000000 */ - .short 0x7000 /* 0b0111000000000000 */ - .short 0x7000 /* 0b0111000000000000 */ - .short 0x7000 /* 0b0111000000000000 */ - .short 0x3800 /* 0b0011100000000000 */ - .short 0x3838 /* 0b0011100000111000 */ - .short 0x1FF0 /* 0b0001111111110000 */ - .short 0x0FE0 /* 0b0000111111100000 */ - .short 0x0000 /* 0b0000000000000000 */ - - .short 0x0000 /* 0b0000000000000000 */ - .short 0x3FE0 /* 0b0011111111100000 */ - .short 0x3FF0 /* 0b0011111111110000 */ - .short 0x3838 /* 0b0011100000111000 */ - .short 0x381C /* 0b0011100000011100 */ - .short 0x381C /* 0b0011100000011100 */ - .short 0x381C /* 0b0011100000011100 */ - .short 0x381C /* 0b0011100000011100 */ - .short 0x381C /* 0b0011100000011100 */ - .short 0x381C /* 0b0011100000011100 */ - .short 0x381C /* 0b0011100000011100 */ - .short 0x381C /* 0b0011100000011100 */ - .short 0x3838 /* 0b0011100000111000 */ - .short 0x3FF0 /* 0b0011111111110000 */ - .short 0x3FE0 /* 0b0011111111100000 */ - .short 0x0000 /* 0b0000000000000000 */ - - .short 0x0000 /* 0b0000000000000000 */ - .short 0x3FFC /* 0b0011111111111100 */ - .short 0x3FFC /* 0b0011111111111100 */ - .short 0x3800 /* 0b0011100000000000 */ - .short 0x3800 /* 0b0011100000000000 */ - .short 0x3800 /* 0b0011100000000000 */ - .short 0x3800 /* 0b0011100000000000 */ - .short 0x3FE0 /* 0b0011111111100000 */ - .short 0x3FE0 /* 0b0011111111100000 */ - .short 0x3800 /* 0b0011100000000000 */ - .short 0x3800 /* 0b0011100000000000 */ - .short 0x3800 /* 0b0011100000000000 */ - .short 0x3800 /* 0b0011100000000000 */ - .short 0x3FFC /* 0b0011111111111100 */ - 
.short 0x3FFC /* 0b0011111111111100 */ - .short 0x0000 /* 0b0000000000000000 */ - - .short 0x0000 /* 0b0000000000000000 */ - .short 0x3FFC /* 0b0011111111111100 */ - .short 0x3FFC /* 0b0011111111111100 */ - .short 0x3800 /* 0b0011100000000000 */ - .short 0x3800 /* 0b0011100000000000 */ - .short 0x3800 /* 0b0011100000000000 */ - .short 0x3FE0 /* 0b0011111111100000 */ - .short 0x3FE0 /* 0b0011111111100000 */ - .short 0x3800 /* 0b0011100000000000 */ - .short 0x3800 /* 0b0011100000000000 */ - .short 0x3800 /* 0b0011100000000000 */ - .short 0x3800 /* 0b0011100000000000 */ - .short 0x3800 /* 0b0011100000000000 */ - .short 0x3800 /* 0b0011100000000000 */ - .short 0x3800 /* 0b0011100000000000 */ - .short 0x0000 /* 0b0000000000000000 */ - diff --git a/osfmk/ppc/hibernate_ppc.c b/osfmk/ppc/hibernate_ppc.c deleted file mode 100644 index 2bd051994..000000000 --- a/osfmk/ppc/hibernate_ppc.c +++ /dev/null @@ -1,213 +0,0 @@ -/* - * Copyright (c) 2004-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
-
-unsigned int save_kdebug_enable = 0;
-
-/*
- * hibernate_page_list_allocate
- *
- * Build the page list that describes every physical memory bank listed in
- * pmap_mem_regions[]: one hibernate_page_list_t header followed, per bank,
- * by a hibernate_bitmap_t header and one bit per page (rounded up to whole
- * 32-bit words).
- *
- * Returns the kalloc()ed list, or the NULL result of kalloc() on failure.
- * Only the headers are filled in here; the bitmap words themselves are not
- * written by this routine.
- */
-hibernate_page_list_t *
-hibernate_page_list_allocate(void)
-{
-    vm_size_t size;
-    uint32_t bank;
-    uint32_t pages, page_count;
-    hibernate_page_list_t * list;
-    hibernate_bitmap_t * bitmap;
-
-    page_count = 0;
-    size = sizeof(hibernate_page_list_t);
-    /* Pass 1: add up the bytes needed for every bank and count the pages. */
-    for (bank = 0; bank < (uint32_t) pmap_mem_regions_count; bank++)
-    {
-        size += sizeof(hibernate_bitmap_t);
-        pages = pmap_mem_regions[bank].mrEnd + 1 - pmap_mem_regions[bank].mrStart;	/* mrEnd is inclusive */
-        page_count += pages;
-        size += ((pages + 31) >> 5) * sizeof(uint32_t);	/* one bit per page, in 32-bit words */
-    }
-
-    list = kalloc(size);
-    if (!list)
-        return (list);
-    /* Record the totals computed above in the list header. */
-    list->list_size = size;
-    list->page_count = page_count;
-    list->bank_count = pmap_mem_regions_count;
-    /* Pass 2: lay the per-bank bitmap headers out back to back. */
-    bitmap = &list->bank_bitmap[0];
-    for (bank = 0; bank < list->bank_count; bank++)
-    {
-        bitmap->first_page = pmap_mem_regions[bank].mrStart;
-        bitmap->last_page = pmap_mem_regions[bank].mrEnd;
-        bitmap->bitmapwords = (pmap_mem_regions[bank].mrEnd + 1
-                             - pmap_mem_regions[bank].mrStart + 31) >> 5;
-        /* The next bank's header starts right after this bank's bitmap words. */
-        bitmap = (hibernate_bitmap_t *) &bitmap->bitmap[bitmap->bitmapwords];
-    }
-    return (list);
-}
-
-void -hibernate_page_list_setall_machine(hibernate_page_list_t * page_list, - hibernate_page_list_t * page_list_wired, - uint32_t * pagesOut) -{ - uint32_t page, count, PCAsize; - - /* Get total size of PCA table */ - PCAsize = round_page((hash_table_size / PerProcTable[0].ppe_vaddr->pf.pfPTEG) - * sizeof(PCA_t)); - - page = atop_64(hash_table_base - PCAsize); - count = atop_64(hash_table_size + PCAsize); - - hibernate_set_page_state(page_list, page_list_wired, page, count, 0); - pagesOut -= count; - - HIBLOG("removed hash, pca: %d pages\n", count); - - save_snapshot(); -} - -// mark pages not to be saved and not for scratch usage during restore -void -hibernate_page_list_set_volatile(__unused hibernate_page_list_t *page_list, - __unused hibernate_page_list_t *page_list_wired, - __unused uint32_t *pagesOut) -{ -} - -kern_return_t -hibernate_processor_setup(IOHibernateImageHeader * header) -{ - header->processorFlags = PerProcTable[0].ppe_vaddr->pf.Available; - - PerProcTable[0].ppe_vaddr->hibernate = 1; - - return (KERN_SUCCESS); -} - -void -hibernate_vm_lock(void) -{ - if (getPerProc()->hibernate) - { - vm_page_lock_queues(); - lck_mtx_lock(&vm_page_queue_free_lock); - } -} - -void -hibernate_vm_unlock(void) -{ - if (getPerProc()->hibernate) - { - lck_mtx_unlock(&vm_page_queue_free_lock); - vm_page_unlock_queues(); - } -} - -void ml_ppc_sleep(void) -{ - struct per_proc_info *proc_info; - boolean_t dohalt; - - proc_info = getPerProc(); - if (!proc_info->hibernate) - { - ml_ppc_do_sleep(); - return; - } - - { - uint64_t start, end, nsec; - - HIBLOG("mapping_hibernate_flush start\n"); - clock_get_uptime(&start); - - mapping_hibernate_flush(); - - clock_get_uptime(&end); - absolutetime_to_nanoseconds(end - start, &nsec); - HIBLOG("mapping_hibernate_flush time: %qd ms\n", nsec / 1000000ULL); - } - - dohalt = hibernate_write_image(); - - if (dohalt) - { - // off - HIBLOG("power off\n"); - if (PE_halt_restart) - (*PE_halt_restart)(kPEHaltCPU); - } - else - { - // sleep 
- HIBLOG("sleep\n"); - - // should we come back via regular wake, set the state in memory. - PerProcTable[0].ppe_vaddr->hibernate = 0; - - PE_cpu_machine_quiesce(proc_info->cpu_id); - return; - } -} - -void -hibernate_newruntime_map(__unused void * map, - __unused vm_size_t map_size, - __unused uint32_t runtime_offset) -{ -} diff --git a/osfmk/ppc/hibernate_restore.s b/osfmk/ppc/hibernate_restore.s deleted file mode 100644 index 9025e6589..000000000 --- a/osfmk/ppc/hibernate_restore.s +++ /dev/null @@ -1,192 +0,0 @@ -/* - * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-#include
-#include
-#include
-
-/*
-This code is linked into the kernel but part of the "__HIB" section, which means
-it's used by code running in the special context of restoring the kernel text and data
-from the hibernation image read by the booter. hibernate_kernel_entrypoint() and everything
-it calls or references (ie. hibernate_restore_phys_page())
-needs to be careful to only touch memory also in the "__HIB" section.
-*/
-
-/*
-void
-hibernate_restore_phys_page(uint64_t src, uint64_t dst, uint32_t len, uint32_t procFlags);
-
-As used below: src arrives in r3 (high word) / r4 (low word), dst in r5 / r6,
-len in r7 and procFlags in r8. When procFlags has pf64Bit set, the 64-bit path
-is taken and data moves in 128-byte chunks (len >> 7 iterations); otherwise only
-the low words r4 / r6 are used as addresses and data moves in 32-byte chunks
-(len >> 5 iterations). A zero source (r4 on the 32-bit path, the combined 64-bit
-value on the 64-bit path) skips the copy and only flushes the destination range.
-NOTE(review): the chunk count goes straight into CTR for a bdnz loop, so this
-presumably expects len to be a nonzero multiple of the chunk size; confirm
-against the callers.
-*/
-
-	.align	5
-	.globl	EXT(hibernate_restore_phys_page)
-	.globl	EXT(hibernate_machine_entrypoint)
-
-LEXT(hibernate_restore_phys_page)
-	/* Pick the 32-bit or the 64-bit implementation from procFlags. */
-	andi.	r0, r8, pf64Bit
-	bne	hibernate_restore_phys_page64
-	/* 32-bit path: CTR = number of 32-byte chunks; src == 0 means flush only. */
-	srwi	r10,r7,5	; r10 <- 32-byte chunks to xfer
-	mtctr	r10
-	cmpwi	r4, 0
-	beq	hibernate_restore_phys_pageFlush
-
-hibernate_restore_phys_pageCopy:
-	lwz	r0,0(r4)
-	lwz	r2,4(r4)
-	lwz	r7,8(r4)
-	lwz	r8,12(r4)
-	lwz	r9,16(r4)
-	lwz	r10,20(r4)
-	lwz	r11,24(r4)
-	lwz	r12,28(r4)
-
-	dcbz	0,r6	; avoid prefetch of next cache line
-	stw	r0,0(r6)
-	stw	r2,4(r6)
-	stw	r7,8(r6)
-	stw	r8,12(r6)
-	stw	r9,16(r6)
-	stw	r10,20(r6)
-	stw	r11,24(r6)
-	stw	r12,28(r6)
-	/* Flush the destination line from the data cache and invalidate it in the instruction cache. */
-	dcbf	0, r6
-	sync
-	icbi	0, r6
-	isync
-	sync
-
-	addi	r4,r4,32
-	addi	r6,r6,32
-
-	bdnz	hibernate_restore_phys_pageCopy	; loop if more chunks
-	blr
-
-hibernate_restore_phys_pageFlush:
-	dcbf	0, r6
-	sync
-	icbi	0, r6
-	isync
-	sync
-
-	addi	r6,r6,32
-	bdnz	hibernate_restore_phys_pageFlush	; loop if more chunks
-	blr
-
-	/* 64-bit path: r3 becomes the full source address, r4 the full destination address. */
-hibernate_restore_phys_page64:
-	rlwinm	r3,r3,0,1,0	; Duplicate high half of long long paddr into top of reg
-	rlwimi	r3,r4,0,0,31	; Combine bottom of long long to full 64-bits
-	rlwinm	r4,r5,0,1,0	; Duplicate high half of long long paddr into top of reg
-	rlwimi	r4,r6,0,0,31	; Combine bottom of long long to full 64-bits
-
-	mfmsr	r9	; Get the MSR
-	li	r0,1	; Note - we use this in a couple places below
-	rldimi	r9,r0,63,MSR_SF_BIT	; set SF on in MSR we will copy with
-	mtmsrd	r9	; turn 64-bit addressing on
-	isync	; wait for it to happen
-
-	srwi	r10,r7,7	; r10 <- 128-byte chunks to xfer
-	mtctr	r10
-	cmpdi	r3, 0
-	beq	hibernate_restore_phys_page64Flush
-
-hibernate_restore_phys_page64Copy:
-	ld	r0,0(r3)
-	ld	r2,8(r3)
-	ld	r7,16(r3)
-	ld	r8,24(r3)
-	ld	r9,32(r3)
-	ld	r10,40(r3)
-	ld	r11,48(r3)
-	ld	r12,56(r3)
-
-	dcbz128	0,r4	; avoid prefetch of next cache line
-	std	r0,0(r4)
-	std	r2,8(r4)
-	std	r7,16(r4)
-	std	r8,24(r4)
-	std	r9,32(r4)
-	std	r10,40(r4)
-	std	r11,48(r4)
-	std	r12,56(r4)
-
-	ld	r0,64(r3)	; load 2nd half of chunk
-	ld	r2,72(r3)
-	ld	r7,80(r3)
-	ld	r8,88(r3)
-	ld	r9,96(r3)
-	ld	r10,104(r3)
-	ld	r11,112(r3)
-	ld	r12,120(r3)
-
-	std	r0,64(r4)
-	std	r2,72(r4)
-	std	r7,80(r4)
-	std	r8,88(r4)
-	std	r9,96(r4)
-	std	r10,104(r4)
-	std	r11,112(r4)
-	std	r12,120(r4)
-
-	dcbf	0, r4
-	sync
-	icbi	0, r4
-	isync
-	sync
-
-	addi	r3,r3,128
-	addi	r4,r4,128
-
-	bdnz	hibernate_restore_phys_page64Copy	; loop if more chunks
-
-	/* Common exit for both 64-bit loops: clear SF again before returning. */
-hibernate_restore_phys_page64Done:
-	mfmsr	r9	; Get the MSR we used to copy
-	rldicl	r9,r9,0,MSR_SF_BIT+1	; clear SF
-	mtmsrd	r9	; turn 64-bit mode off
-	isync	; wait for it to happen
-	blr
-
-hibernate_restore_phys_page64Flush:
-	dcbf	0, r4
-	sync
-	icbi	0, r4
-	isync
-	sync
-
-	addi	r4,r4,128
-
-	bdnz	hibernate_restore_phys_page64Flush	; loop if more chunks
-	b	hibernate_restore_phys_page64Done
-
-LEXT(hibernate_machine_entrypoint)
-	b	EXT(hibernate_kernel_entrypoint)
-
diff --git a/osfmk/ppc/hw_exception.s b/osfmk/ppc/hw_exception.s
deleted file mode 100644
index ab77e1774..000000000
--- a/osfmk/ppc/hw_exception.s
+++ /dev/null
@@ -1,1832 +0,0 @@
-/*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License').
You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -/* Low level routines dealing with exception entry and exit. - * There are various types of exception: - * - * Interrupt, trap, system call and debugger entry. Each has it's own - * handler since the state save routine is different for each. The - * code is very similar (a lot of cut and paste). 
- * - * The code for the FPU disabled handler (lazy fpu) is in cswtch.s - */ - -#include -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include - - -#define VERIFYSAVE 0 -#define FPVECDBG 0 -#define FPFLOOD 0 -#define INSTRUMENT 0 - -/* - * thandler(type) - * - * ENTRY: VM switched ON - * Interrupts OFF - * R3 contains exception code - * R4 points to the saved context (virtual address) - * Everything is saved in savearea - */ - -/* - * If pcb.ksp == 0 then the kernel stack is already busy, - * we make a stack frame - * leaving enough space for the 'red zone' in case the - * trapped thread was in the middle of saving state below - * its stack pointer. - * - * otherwise we make a stack frame and - * the kernel stack (setting pcb.ksp to 0) - * - * on return, we do the reverse, the last state is popped from the pcb - * and pcb.ksp is set to the top of stack - */ - -/* TRAP_SPACE_NEEDED is the space assumed free on the kernel stack when - * another trap is taken. We need at least enough space for a saved state - * structure plus two small backpointer frames, and we add a few - * hundred bytes for the space needed by the C (which may be less but - * may be much more). We're trying to catch kernel stack overflows :-) - */ - -#define TRAP_SPACE_NEEDED FM_REDZONE+(2*FM_SIZE)+256 - - .text - - .align 5 - .globl EXT(thandler) -LEXT(thandler) ; Trap handler - - mfsprg r13,1 ; Get the current activation - lwz r25,ACT_PER_PROC(r13) ; Get the per_proc block - - lwz r1,PP_ISTACKPTR(r25) ; Get interrupt stack pointer - - cmpwi cr0,r1,0 ; Are we on interrupt stack? - mr r6,r13 - beq- cr0,EXT(ihandler) ; If on interrupt stack, treat this as interrupt... - lwz r26,ACT_MACT_SPF(r13) ; Get special flags - lwz r8,ACT_MACT_PCB(r13) ; Get the last savearea used - rlwinm. r26,r26,0,bbThreadbit,bbThreadbit ; Do we have Blue Box Assist active? 
- lwz r1,ACT_MACT_KSP(r13) ; Get the top of kernel stack - bnel- checkassist ; See if we should assist this - stw r4,ACT_MACT_PCB(r13) ; Point to our savearea - stw r8,SAVprev+4(r4) ; Queue the new save area in the front - -#if VERIFYSAVE - bl versave ; (TEST/DEBUG) -#endif - - lwz r9,THREAD_KERNEL_STACK(r6) ; Get our kernel stack start - cmpwi cr1,r1,0 ; Are we already on kernel stack? - stw r13,SAVact(r4) ; Mark the savearea as belonging to this activation - lwz r26,saver1+4(r4) ; Get the stack at interrupt time - - bne+ cr1,.L_kstackfree ; We are not on kernel stack yet... - - subi r1,r26,FM_REDZONE ; Make a red zone on interrupt time kernel stack - -.L_kstackfree: - lwz r31,savesrr1+4(r4) ; Pick up the entry MSR - sub r9,r1,r9 ; Get displacment into the kernel stack - li r0,0 ; Make this 0 - rlwinm. r0,r9,0,28,31 ; Verify that we have a 16-byte aligned stack (and get a 0) - cmplwi cr2,r9,KERNEL_STACK_SIZE ; Do we still have room on the stack? - beq cr1,.L_state_on_kstack ; using above test for pcb/stack - - stw r0,ACT_MACT_KSP(r13) ; Show that we have taken the stack - -.L_state_on_kstack: - lwz r9,savevrsave(r4) ; Get the VRSAVE register - bne-- kernelStackUnaligned ; Stack is unaligned... - rlwinm. r6,r31,0,MSR_VEC_BIT,MSR_VEC_BIT ; Was vector on? - subi r1,r1,FM_SIZE ; Push a header onto the current stack - bgt-- cr2,kernelStackBad ; Kernel stack is bogus... - -kernelStackNotBad: ; Vector was off - beq++ tvecoff ; Vector off, do not save vrsave... - stw r9,liveVRS(r25) ; Set the live value - -tvecoff: stw r26,FM_BACKPTR(r1) ; Link back to the previous frame - -#if DEBUG -/* If debugging, we need two frames, the first being a dummy - * which links back to the trapped routine. 
The second is - * that which the C routine below will need - */ - lwz r3,savesrr0+4(r4) ; Get the point of interruption - stw r3,FM_LR_SAVE(r1) ; save old instr ptr as LR value - stwu r1, -FM_SIZE(r1) ; and make new frame -#endif /* DEBUG */ - - mr r30,r4 - lwz r3,SAVtime(r4) - lwz r4,SAVtime+4(r4) - addi r5,r13,SYSTEM_TIMER - bl EXT(thread_timer_event) - addi r5,r25,SYSTEM_STATE - bl EXT(state_event) - - lwz r7,ACT_TASK(r13) - lwz r8,TASK_VTIMERS(r7) - cmpwi r8,0 - beq++ 0f - - lwz r7,ACT_PER_PROC(r13) - li r4,AST_BSD - lwz r8,PP_PENDING_AST(r7) - or r8,r8,r4 - stw r8,PP_PENDING_AST(r7) - addi r3,r13,ACT_AST - bl EXT(hw_atomic_or) -0: - -/* call trap handler proper, with - * ARG0 = type - * ARG1 = saved_state ptr - * ARG2 = dsisr - * ARG3 = dar - */ - - mr r4,r30 - lwz r3,saveexception(r30) ; Get the exception code - lwz r0,ACT_MACT_SPF(r13) ; Get the special flags - - addi r5,r3,-T_DATA_ACCESS ; Adjust to start of range - rlwinm. r0,r0,0,runningVMbit,runningVMbit ; Are we in VM state? (cr0_eq == 0 if yes) - cmplwi cr2,r5,T_TRACE-T_DATA_ACCESS ; Are we still in range? (cr_gt if not) - - lwz r5,savedsisr(r4) ; Get the saved DSISR - - crnor cr7_eq,cr0_eq,cr2_gt ; We should intercept if in VM and is a true trap (cr7_eq == 1 if yes) - rlwinm. r0,r31,0,MSR_PR_BIT,MSR_PR_BIT ; Are we trapping from supervisor state? (cr0_eq == 1 if yes) - - cmpi cr2,r3,T_PREEMPT ; Is this a preemption? - - beq-- .L_check_VM - stw r4,ACT_MACT_UPCB(r13) ; Store user savearea -.L_check_VM: - - crandc cr0_eq,cr7_eq,cr0_eq ; Do not intercept if we are in the kernel (cr0_eq == 1 if yes) - - lwz r6,savedar(r4) ; Get the DAR (top) - lwz r7,savedar+4(r4) ; Get the DAR (bottom) - - beq- cr2,.L_call_trap ; Do not turn on interrupts for T_PREEMPT - beq- exitFromVM ; Any true trap but T_MACHINE_CHECK exits us from the VM... 
- -/* syscall exception might warp here if there's nothing left - * to do except generate a trap - */ - -.L_call_trap: - -#if FPFLOOD - stfd f31,emfp31(r25) ; (TEST/DEBUG) -#endif - - bl EXT(trap) - - lis r10,hi16(MASK(MSR_VEC)) ; Get the vector enable - mfmsr r7 ; Get the MSR - ori r10,r10,lo16(MASK(MSR_FP)|MASK(MSR_EE)) ; Add in FP and EE - andc r7,r7,r10 ; Turn off VEC, FP, and EE - mtmsr r7 ; Disable for interrupts - mfsprg r8,1 ; Get the current activation - lwz r10,ACT_PER_PROC(r8) ; Get the per_proc block -/* - * This is also the point where new threads come when they are created. - * The new thread is setup to look like a thread that took an - * interrupt and went immediatly into trap. - */ - -thread_return: - lwz r11,SAVflags(r3) ; Get the flags of the current savearea - lwz r0,savesrr1+4(r3) ; Get the MSR we are going to - lwz r4,SAVprev+4(r3) ; Pick up the previous savearea - mfsprg r8,1 ; Get the current thread - rlwinm r11,r11,0,15,13 ; Clear the syscall flag - rlwinm. r0,r0,0,MSR_PR_BIT,MSR_PR_BIT ; Are we going to the user? - mr r1,r8 - stw r11,SAVflags(r3) ; Save back the flags (with reset stack cleared) - - lwz r5,THREAD_KERNEL_STACK(r1) ; Get the base pointer to the stack - stw r4,ACT_MACT_PCB(r8) ; Point to the previous savearea (or 0 if none) - addi r5,r5,KERNEL_STACK_SIZE-FM_SIZE ; Reset to empty - - beq-- chkfac ; We are not leaving the kernel yet... - - stw r5,ACT_MACT_KSP(r8) ; Save the empty stack pointer - b chkfac ; Go end it all... - - -; -; Here is where we go when we detect that the kernel stack is all messed up. -; We just try to dump some info and get into the debugger. -; - -kernelStackBad: - - lwz r3,PP_DEBSTACK_TOP_SS(r25) ; Pick up debug stack top - subi r3,r3,KERNEL_STACK_SIZE-FM_SIZE ; Adjust to start of stack - sub r3,r1,r3 ; Get displacement into debug stack - cmplwi cr2,r3,KERNEL_STACK_SIZE-FM_SIZE ; Check if we are on debug stack - blt+ cr2,kernelStackNotBad ; Yeah, that is ok too... 
- - lis r0,hi16(Choke) ; Choke code - ori r0,r0,lo16(Choke) ; and the rest - li r3,failStack ; Bad stack code - sc ; System ABEND - -kernelStackUnaligned: - lis r0,hi16(Choke) ; Choke code - ori r0,r0,lo16(Choke) ; and the rest - li r3,failUnalignedStk ; Unaligned stack code - sc ; System ABEND - - -/* - * shandler(type) - * - * ENTRY: VM switched ON - * Interrupts OFF - * R3 contains exception code - * R4 points to the saved context (virtual address) - * Everything is saved in savearea - */ - -/* - * If pcb.ksp == 0 then the kernel stack is already busy, - * this is an error - jump to the debugger entry - * - * otherwise depending upon the type of - * syscall, look it up in the kernel table - * or pass it to the server. - * - * on return, we do the reverse, the state is popped from the pcb - * and pcb.ksp is set to the top of stack. - */ - -/* - * NOTE: - * mach system calls are negative - * BSD system calls are low positive - * PPC-only system calls are in the range 0x6xxx - * PPC-only "fast" traps are in the range 0x7xxx - */ - - .align 5 - .globl EXT(shandler) -LEXT(shandler) ; System call handler - - lwz r7,savesrr1+4(r4) ; Get the SRR1 value - mfsprg r13,1 ; Get the current activation - lwz r25,ACT_PER_PROC(r13) ; Get the per_proc block - lwz r0,saver0+4(r4) ; Get the original syscall number - lwz r17,PP_ISTACKPTR(r25) ; Get interrupt stack pointer - rlwinm r15,r0,0,0,19 ; Clear the bottom of call number for fast check - mr. r17,r17 ; Are we on interrupt stack? - lwz r9,savevrsave(r4) ; Get the VRsave register - beq-- EXT(ihandler) ; On interrupt stack, not allowed... - rlwinm. r6,r7,0,MSR_VEC_BIT,MSR_VEC_BIT ; Was vector on? - mr r16,r13 - - beq++ svecoff ; Vector off, do not save vrsave... 
- stw r9,liveVRS(r25) ; Set the live value -; -; Check if SCs are being redirected for the BlueBox or to VMM -; - -svecoff: lwz r6,ACT_MACT_SPF(r13) ; Pick up activation special flags - mtcrf 0x40,r6 ; Check special flags - mtcrf 0x01,r6 ; Check special flags - crmove cr6_eq,runningVMbit ; Remember if we are in VMM - bne++ cr6,sVMchecked ; Not running VM - lwz r18,spcFlags(r25) ; Load per_proc special flags - rlwinm. r18,r18,0,FamVMmodebit,FamVMmodebit ; Is FamVMmodebit set? - beq sVMchecked ; Not in FAM - cmpwi r0,0x6004 ; Is it vmm_dispatch syscall: - bne sVMchecked - lwz r26,saver3+4(r4) ; Get the original syscall number - cmpwi cr6,r26,kvmmExitToHost ; vmm_exit_to_host request -sVMchecked: - bf++ bbNoMachSCbit,noassist ; Take branch if SCs are not redirected - lwz r26,ACT_MACT_BEDA(r13) ; Pick up the pointer to the blue box exception area - b EXT(atomic_switch_syscall) ; Go to the assist... - -noassist: cmplwi r15,0x7000 ; Do we have a fast path trap? - lwz r14,ACT_MACT_PCB(r13) ; Now point to the PCB - beql fastpath ; We think it is a fastpath... - - lwz r1,ACT_MACT_KSP(r13) ; Get the kernel stack pointer -#if DEBUG - mr. r1,r1 ; Are we already on the kernel stack? - li r3,T_SYSTEM_CALL ; Yup, pretend we had an interrupt... - beq- EXT(ihandler) ; Bad boy, bad boy... What cha gonna do when they come for you? 
-#endif /* DEBUG */ - - stw r4,ACT_MACT_PCB(r13) ; Point to our savearea - stw r4,ACT_MACT_UPCB(r13) ; Store user savearea - li r0,0 ; Clear this out - stw r14,SAVprev+4(r4) ; Queue the new save area in the front - stw r13,SAVact(r4) ; Point the savearea at its activation - -#if VERIFYSAVE - bl versave ; (TEST/DEBUG) -#endif - - lwz r15,saver1+4(r4) ; Grab interrupt time stack - mr r30,r4 ; Save pointer to the new context savearea - stw r0,ACT_MACT_KSP(r13) ; Mark stack as busy with 0 val - stw r15,FM_BACKPTR(r1) ; Link stack frame backwards - - lwz r3,SAVtime(r30) - lwz r4,SAVtime+4(r30) - addi r5,r13,SYSTEM_TIMER - bl EXT(thread_timer_event) - addi r5,r25,SYSTEM_STATE - bl EXT(state_event) - - lwz r7,ACT_TASK(r13) - lwz r8,TASK_VTIMERS(r7) - cmpwi r8,0 - beq++ 0f - - lwz r7,ACT_PER_PROC(r13) - li r4,AST_BSD - lwz r8,PP_PENDING_AST(r7) - or r8,r8,r4 - stw r8,PP_PENDING_AST(r7) - addi r3,r13,ACT_AST - bl EXT(hw_atomic_or) -0: - -#if DEBUG -/* If debugging, we need two frames, the first being a dummy - * which links back to the trapped routine. The second is - * that which the C routine below will need - */ - lwz r8,savesrr0+4(r30) ; Get the point of interruption - stw r8,FM_LR_SAVE(r1) ; Save old instr ptr as LR value - stwu r1, -FM_SIZE(r1) ; and make new frame -#endif /* DEBUG */ - - mr r4,r30 - - lwz r15,SAVflags(r30) ; Get the savearea flags - lwz r0,saver0+4(r30) ; Get R0 back - mfmsr r11 ; Get the MSR - stwu r1,-(FM_SIZE+ARG_SIZE+MUNGE_ARGS_SIZE)(r1) ; Make a stack frame - ori r11,r11,lo16(MASK(MSR_EE)) ; Turn on interruption enabled bit - rlwinm r10,r0,0,0,19 ; Keep only the top part - oris r15,r15,SAVsyscall >> 16 ; Mark that it this is a syscall - cmplwi r10,0x6000 ; Is it the special ppc-only guy? - stw r15,SAVflags(r30) ; Save syscall marker - beq-- cr6,exitFromVM ; It is time to exit from alternate context... - - beq-- ppcscall ; Call the ppc-only system call handler... - - mr. r0,r0 ; What kind is it? 
- mtmsr r11 ; Enable interruptions - - blt-- .L_kernel_syscall ; System call number if negative, this is a mach call... - - lwz r8,ACT_TASK(r13) ; Get our task - cmpwi cr0,r0,0x7FFA ; Special blue box call? - beq-- .L_notify_interrupt_syscall ; Yeah, call it... - - lwz r7,TASK_SYSCALLS_UNIX(r8) ; Get the current count - mr r3,r30 ; Get PCB/savearea - mr r4,r13 ; current activation - addi r7,r7,1 ; Bump it - stw r7,TASK_SYSCALLS_UNIX(r8) ; Save it - -#if FPFLOOD - stfd f31,emfp31(r25) ; (TEST/DEBUG) -#endif - - bl EXT(unix_syscall) ; Check out unix... - -.L_call_server_syscall_exception: - li r3,EXC_SYSCALL ; doexception(EXC_SYSCALL, num, 1) - -.L_call_server_exception: - mr r4,r0 ; Set syscall selector - li r5,1 - b EXT(doexception) ; Go away, never to return... - -.L_notify_interrupt_syscall: - lwz r3,saver3+4(r30) ; Get the new PC address to pass in - bl EXT(syscall_notify_interrupt) -/* - * Ok, return from C function, R3 = return value - * - * saved state is still in R30 and the active thread is in R16 . - */ - mr r31,r16 ; Move the current thread pointer - stw r3,saver3+4(r30) ; Stash the return code - b .L_thread_syscall_ret_check_ast - -; -; Handle PPC-only system call interface -; These are called with interruptions disabled -; and the savearea/pcb as the first parameter. -; It is up to the callee to enable interruptions if -; they should be. We are in a state here where -; both interrupts and preemption are ok, but because we could -; be calling diagnostic code we will not enable. -; -; Also, the callee is responsible for finding any parameters -; in the savearea/pcb. It also must set saver3 with any return -; code before returning. 
-; -; There are 3 possible return codes: -; 0 the call is disabled or something, we treat this like it was bogus -; + the call finished ok, check for AST -; - the call finished ok, do not check for AST -; -; Note: the last option is intended for special diagnostics calls that -; want the thread to return and execute before checking for preemption. -; -; NOTE: Both R16 (thread) and R30 (savearea) need to be preserved over this call!!!! -; - - .align 5 - -ppcscall: rlwinm r11,r0,2,18,29 ; Make an index into the table - lis r10,hi16(EXT(PPCcalls)) ; Get PPC-only system call table - cmplwi r11,PPCcallmax ; See if we are too big - ori r10,r10,lo16(EXT(PPCcalls)) ; Merge in low half - bgt- .L_call_server_syscall_exception ; Bogus call... - lwzx r11,r10,r11 ; Get function address - -; -; Note: make sure we do not change the savearea in R30 to -; a different register without checking. Some of the PPCcalls -; depend upon it being there. -; - - mr r3,r30 ; Pass the savearea - mr r4,r13 ; Pass the activation - mr. r11,r11 ; See if there is a function here - mtctr r11 ; Set the function address - beq- .L_call_server_syscall_exception ; Disabled call... -#if INSTRUMENT - mfspr r4,pmc1 ; Get stamp - stw r4,0x6100+(9*16)+0x0(0) ; Save it - mfspr r4,pmc2 ; Get stamp - stw r4,0x6100+(9*16)+0x4(0) ; Save it - mfspr r4,pmc3 ; Get stamp - stw r4,0x6100+(9*16)+0x8(0) ; Save it - mfspr r4,pmc4 ; Get stamp - stw r4,0x6100+(9*16)+0xC(0) ; Save it -#endif - bctrl ; Call it - - .globl EXT(ppcscret) - -LEXT(ppcscret) - mr. r3,r3 ; See what we should do - mr r31,r16 ; Restore the current thread pointer - bgt+ .L_thread_syscall_ret_check_ast ; Take normal AST checking return.... - mfsprg r10,1 ; Get the current activation - lwz r10,ACT_PER_PROC(r10) ; Get the per_proc block - blt+ .L_thread_syscall_return ; Return, but no ASTs.... - lwz r0,saver0+4(r30) ; Restore the system call number - b .L_call_server_syscall_exception ; Go to common exit... 
- - - -/* - * we get here for mach system calls - * when kdebug tracing is enabled - */ - -ksystrace: - mr r4,r30 ; Pass in saved state - bl EXT(syscall_trace) - - cmplw r31,r29 ; Is this syscall in the table? - add r31,r27,r28 ; Point right to the syscall table entry - - bge- .L_call_server_syscall_exception ; The syscall number is invalid - - lwz r0,savesrr1(r30) ; Get the saved srr1 - rlwinm. r0,r0,0,MSR_SF_BIT,MSR_SF_BIT ; Test for 64 bit caller - lwz r0,MACH_TRAP_ARG_MUNGE32(r31) ; Pick up the 32 bit munge function address - beq-- .L_ksystrace_munge - lwz r0,MACH_TRAP_ARG_MUNGE64(r31) ; Pick up the 64 bit munge function address - -.L_ksystrace_munge: - cmplwi r0,0 ; do we have a munger to call? - mtctr r0 ; Set the function call address - addi r3,r30,saver3 ; Pointer to args from save area - addi r4,r1,FM_ARG0+ARG_SIZE ; Pointer for munged args - beq-- .L_ksystrace_trapcall ; just make the trap call - bctrl ; Call the munge function - -.L_ksystrace_trapcall: - lwz r0,MACH_TRAP_FUNCTION(r31) ; Pick up the function address - mtctr r0 ; Set the function call address - addi r3,r1,FM_ARG0+ARG_SIZE ; Pointer to munged args - bctrl - - mr r4,r30 ; Pass in the savearea - bl EXT(syscall_trace_end) ; Trace the exit of the system call - b .L_mach_return - - - -/* Once here, we know that the syscall was -ve - * we should still have r1=ksp, - * r16 = pointer to current thread, - * r13 = pointer to top activation, - * r0 = syscall number - * r30 = pointer to saved state (in pcb) - */ - - .align 5 - -.L_kernel_syscall: -; -; Call a function that can print out our syscall info -; Note that we don t care about any volatiles yet -; - lwz r10,ACT_TASK(r13) ; Get our task - lwz r0,saver0+4(r30) - lis r8,hi16(EXT(kdebug_enable)) ; Get top of kdebug_enable - lis r28,hi16(EXT(mach_trap_table)) ; Get address of table - ori r8,r8,lo16(EXT(kdebug_enable)) ; Get bottom of kdebug_enable - lwz r8,0(r8) ; Get kdebug_enable - - lwz r7,TASK_SYSCALLS_MACH(r10) ; Get the current count - neg 
r31,r0 ; Make this positive - mr r3,r31 ; save it - slwi r27,r3,4 ; multiply by 16 - slwi r3,r3,2 ; and the original by 4 - ori r28,r28,lo16(EXT(mach_trap_table)) ; Get address of table - add r27,r27,r3 ; for a total of 20x (5 words/entry) - addi r7,r7,1 ; Bump TASK_SYSCALLS_MACH count - cmplwi r8,0 ; Is kdebug_enable non-zero - stw r7,TASK_SYSCALLS_MACH(r10) ; Save count - bne-- ksystrace ; yes, tracing enabled - - cmplwi r31,MACH_TRAP_TABLE_COUNT ; Is this syscall in the table? - add r31,r27,r28 ; Point right to the syscall table entry - - bge-- .L_call_server_syscall_exception ; The syscall number is invalid - - lwz r0,savesrr1(r30) ; Get the saved srr1 - rlwinm. r0,r0,0,MSR_SF_BIT,MSR_SF_BIT ; Test for 64 bit caller - lwz r0,MACH_TRAP_ARG_MUNGE32(r31) ; Pick up the 32 bit munge function address - beq-- .L_kernel_syscall_munge - lwz r0,MACH_TRAP_ARG_MUNGE64(r31) ; Pick up the 64 bit munge function address - -.L_kernel_syscall_munge: - cmplwi r0,0 ; test for null munger - mtctr r0 ; Set the function call address - addi r3,r30,saver3 ; Pointer to args from save area - addi r4,r1,FM_ARG0+ARG_SIZE ; Pointer for munged args - beq-- .L_kernel_syscall_trapcall ; null munger - skip to trap call - bctrl ; Call the munge function - -.L_kernel_syscall_trapcall: - lwz r0,MACH_TRAP_FUNCTION(r31) ; Pick up the function address - mtctr r0 ; Set the function call address - addi r3,r1,FM_ARG0+ARG_SIZE ; Pointer to munged args - -#if FPFLOOD - stfd f31,emfp31(r25) ; (TEST/DEBUG) -#endif - - bctrl - - -/* - * Ok, return from C function, R3 = return value - * - * get the active thread's PCB pointer and thus pointer to user state - * saved state is still in R30 and the active thread is in R16 - */ - -.L_mach_return: - srawi r0,r3,31 ; properly extend the return code - cmpi cr0,r3,KERN_INVALID_ARGUMENT ; deal with invalid system calls - mr r31,r16 ; Move the current thread pointer - stw r0, saver3(r30) ; stash the high part of the return code - stw r3,saver3+4(r30) ; Stash the low 
part of the return code - beq-- cr0,.L_mach_invalid_ret ; otherwise fall through into the normal return path -.L_mach_invalid_arg: - - -/* 'standard' syscall returns here - INTERRUPTS ARE STILL ON - * the syscall may perform a thread_set_syscall_return - * followed by a thread_exception_return, ending up - * at thread_syscall_return below, with SS_R3 having - * been set up already - * - * When we are here, r31 should point to the current thread, - * r30 should point to the current pcb - * r3 contains value that we're going to return to the user - * which has already been stored back into the save area - */ - -.L_thread_syscall_ret_check_ast: - lis r10,hi16(MASK(MSR_VEC)) ; Get the vector enable - mfmsr r12 ; Get the current MSR - ori r10,r10,lo16(MASK(MSR_FP)|MASK(MSR_EE)) ; Add in FP and EE - andc r12,r12,r10 ; Turn off VEC, FP, and EE - mtmsr r12 ; Turn interruptions off - - mfsprg r10,1 ; Get the current activation - lwz r10,ACT_PER_PROC(r10) ; Get the per_proc block - -/* Check to see if there's an outstanding AST */ - - lwz r4,PP_PENDING_AST(r10) - cmpi cr0,r4, 0 ; Any pending asts? - beq++ cr0,.L_syscall_no_ast ; Nope... - -/* Yes there is, call ast_taken - * pretending that the user thread took an AST exception here, - * ast_taken will save all state and bring us back here - */ - -#if DEBUG -/* debug assert - make sure that we're not returning to kernel */ - lwz r3,savesrr1+4(r30) - andi. r3,r3,MASK(MSR_PR) - bne++ scrnotkern ; returning to user level, check - - lis r0,hi16(Choke) ; Choke code - ori r0,r0,lo16(Choke) ; and the rest - li r3,failContext ; Bad state code - sc ; System ABEND - -scrnotkern: -#endif /* DEBUG */ - - lis r3,hi16(AST_ALL) ; Set ast flags - li r4,1 ; Set interrupt allowed - ori r3,r3,lo16(AST_ALL) - bl EXT(ast_taken) ; Process the pending ast - b .L_thread_syscall_ret_check_ast ; Go see if there was another... 
- -.L_mach_invalid_ret: -/* - * need to figure out why we got an KERN_INVALID_ARG - * if it was due to a non-existent system call - * then we want to throw an exception... otherwise - * we want to pass the error code back to the caller - */ - lwz r0,saver0+4(r30) ; reload the original syscall number - neg r28,r0 ; Make this positive - mr r4,r28 ; save a copy - slwi r27,r4,4 ; multiply by 16 - slwi r4,r4,2 ; and another 4 - lis r28,hi16(EXT(mach_trap_table)) ; Get address of table - add r27,r27,r4 ; for a total of 20x (5 words/entry) - ori r28,r28,lo16(EXT(mach_trap_table)) ; Get address of table - add r28,r27,r28 ; Point right to the syscall table entry - lwz r27,MACH_TRAP_FUNCTION(r28) ; Pick up the function address - lis r28,hi16(EXT(kern_invalid)) ; Get high half of invalid syscall function - ori r28,r28,lo16(EXT(kern_invalid)) ; Get low half of invalid syscall function - cmpw cr0,r27,r28 ; Check if this is an invalid system call - beq-- .L_call_server_syscall_exception ; We have a bad system call - b .L_mach_invalid_arg ; a system call returned KERN_INVALID_ARG - - -/* thread_exception_return returns to here, almost all - * registers intact. It expects a full context restore - * of what it hasn't restored itself (ie. what we use). 
- * - * In particular for us, - * we still have r31 points to the current thread, - * r30 points to the current pcb - */ - - .align 5 - -.L_syscall_no_ast: -.L_thread_syscall_return: - - mr r3,r30 ; Get savearea to the correct register for common exit - - lwz r11,SAVflags(r30) ; Get the flags - lwz r5,THREAD_KERNEL_STACK(r31) ; Get the base pointer to the stack - lwz r4,SAVprev+4(r30) ; Get the previous save area - rlwinm r11,r11,0,15,13 ; Clear the syscall flag - mfsprg r8,1 ; Now find the current activation - addi r5,r5,KERNEL_STACK_SIZE-FM_SIZE ; Reset to empty - stw r11,SAVflags(r30) ; Stick back the flags - stw r5,ACT_MACT_KSP(r8) ; Save the empty stack pointer - stw r4,ACT_MACT_PCB(r8) ; Save previous save area - b chkfac ; Go end it all... - -/* - * thread_exception_return() - * - * Return to user mode directly from within a system call. - */ - - .align 5 - .globl EXT(thread_bootstrap_return) -LEXT(thread_bootstrap_return) ; NOTE: THIS IS GOING AWAY IN A FEW DAYS.... - - .globl EXT(thread_exception_return) -LEXT(thread_exception_return) ; Directly return to user mode - -.L_thread_exc_ret_check_ast: - lis r10,hi16(MASK(MSR_VEC)) ; Get the vector enable - mfmsr r3 ; Get the MSR - ori r10,r10,lo16(MASK(MSR_FP)|MASK(MSR_EE)) ; Add in FP and EE - andc r3,r3,r10 ; Turn off VEC, FP, and EE - mtmsr r3 ; Disable interrupts - -/* Check to see if there's an outstanding AST */ -/* We don't bother establishing a call frame even though CHECK_AST - can invoke ast_taken(), because it can just borrow our caller's - frame, given that we're not going to return. 
-*/ - - mfsprg r10,1 ; Get the current activation - lwz r10,ACT_PER_PROC(r10) ; Get the per_proc block - lwz r4,PP_PENDING_AST(r10) - cmpi cr0,r4, 0 - beq+ cr0,.L_exc_ret_no_ast - -/* Yes there is, call ast_taken - * pretending that the user thread took an AST exception here, - * ast_taken will save all state and bring us back here - */ - - lis r3,hi16(AST_ALL) - li r4,1 - ori r3,r3,lo16(AST_ALL) - - bl EXT(ast_taken) - b .L_thread_exc_ret_check_ast ; check for a second AST (rare) - -/* arriving here, interrupts should be disabled */ -/* Get the active thread's PCB pointer to restore regs - */ -.L_exc_ret_no_ast: - - mfsprg r30,1 ; Get the currrent activation - mr r31,r30 - - lwz r30,ACT_MACT_PCB(r30) - mr. r30,r30 ; Is there any context yet? - beq- makeDummyCtx ; No, hack one up... -#if DEBUG -/* - * debug assert - make sure that we're not returning to kernel - * get the active thread's PCB pointer and thus pointer to user state - */ - - lwz r3,savesrr1+4(r30) - andi. r3,r3,MASK(MSR_PR) - bne+ ret_user2 ; We are ok... - - lis r0,hi16(Choke) ; Choke code - ori r0,r0,lo16(Choke) ; and the rest - li r3,failContext ; Bad state code - sc ; System ABEND - -ret_user2: -#endif /* DEBUG */ - -/* If the system call flag isn't set, then we came from a trap, - * so warp into the return_from_trap (thread_return) routine, - * which takes PCB pointer in R3, not in r30! - */ - lwz r0,SAVflags(r30) ; Grab the savearea flags - andis. r0,r0,SAVsyscall>>16 ; Are we returning from a syscall? - mr r3,r30 ; Copy pcb pointer into r3 in case we need it - beq-- cr0,thread_return ; Nope, must be a thread return... - b .L_thread_syscall_return ; Join up with the system call return... - -; -; This is where we handle someone trying who did a thread_create followed -; by a thread_resume with no intervening thread_set_state. Just make an -; empty context, initialize it to trash and let em execute at 0... 
-; - - .align 5 - -makeDummyCtx: - bl EXT(save_get) ; Get a save_area - li r4,SAVgeneral ; Get the general context type - li r0,0 ; Get a 0 - stb r4,SAVflags+2(r3) ; Set type - addi r2,r3,savefpscr+4 ; Point past what we are clearing - mr r4,r3 ; Save the start - -cleardummy: stw r0,0(r4) ; Clear stuff - addi r4,r4,4 ; Next word - cmplw r4,r2 ; Still some more? - blt+ cleardummy ; Yeah... - - lis r2,hi16(MSR_EXPORT_MASK_SET) ; Set the high part of the user MSR - ori r2,r2,lo16(MSR_EXPORT_MASK_SET) ; And the low part - stw r2,savesrr1+4(r3) ; Set the default user MSR - - b thread_return ; Go let em try to execute, hah! - -/* - * ihandler(type) - * - * ENTRY: VM switched ON - * Interrupts OFF - * R3 contains exception code - * R4 points to the saved context (virtual address) - * Everything is saved in savearea - * - */ - - .align 5 - .globl EXT(ihandler) -LEXT(ihandler) ; Interrupt handler */ - -/* - * get the value of istackptr, if it's zero then we're already on the - * interrupt stack. - */ - - lwz r10,savesrr1+4(r4) ; Get SRR1 - lwz r7,savevrsave(r4) ; Get the VRSAVE register - mfsprg r13,1 ; Get the current activation - lwz r25,ACT_PER_PROC(r13) ; Get the per_proc block - li r14,0 ; Zero this for now - rlwinm. r16,r10,0,MSR_VEC_BIT,MSR_VEC_BIT ; Was vector on? - lwz r1,PP_ISTACKPTR(r25) ; Get the interrupt stack - li r16,0 ; Zero this for now - - beq+ ivecoff ; Vector off, do not save vrsave... - stw r7,liveVRS(r25) ; Set the live value - -ivecoff: li r0,0 ; Get a constant 0 - rlwinm r5,r10,0,MSR_PR_BIT,MSR_PR_BIT ; Are we trapping from supervisor state? - mr. r1,r1 ; Is it active? 
- cmplwi cr2,r5,0 ; cr2_eq == 1 if yes - mr r16,r13 - lwz r14,ACT_MACT_PCB(r13) ; Now point to the PCB - lwz r9,saver1+4(r4) ; Pick up the rupt time stack - stw r14,SAVprev+4(r4) ; Queue the new save area in the front - stw r13,SAVact(r4) ; Point the savearea at its activation - stw r4,ACT_MACT_PCB(r13) ; Point to our savearea - beq cr2,ifromk - stw r4,ACT_MACT_UPCB(r13) ; Store user savearea - -ifromk: bne .L_istackfree ; Nope... - -/* We're already on the interrupt stack, get back the old - * stack pointer and make room for a frame - */ - - lwz r10,PP_INTSTACK_TOP_SS(r25) ; Get the top of the interrupt stack - addi r5,r9,INTSTACK_SIZE-FM_SIZE ; Shift stack for bounds check - subi r1,r9,FM_REDZONE ; Back up beyond the red zone - sub r5,r5,r10 ; Get displacement into stack - cmplwi r5,INTSTACK_SIZE-FM_SIZE ; Is the stack actually invalid? - blt+ ihsetback ; The stack is ok... - - lwz r5,PP_DEBSTACK_TOP_SS(r25) ; Pick up debug stack top - subi r5,r5,KERNEL_STACK_SIZE-FM_SIZE ; Adjust to start of stack - sub r5,r1,r5 ; Get displacement into debug stack - cmplwi cr2,r5,KERNEL_STACK_SIZE-FM_SIZE ; Check if we are on debug stack - blt+ cr2,ihsetback ; Yeah, that is ok too... - - lis r0,hi16(Choke) ; Choke code - ori r0,r0,lo16(Choke) ; and the rest - li r3,failStack ; Bad stack code - sc ; System ABEND - -intUnalignedStk: - lis r0,hi16(Choke) ; Choke code - ori r0,r0,lo16(Choke) ; and the rest - li r3,failUnalignedStk ; Unaligned stack code - sc ; System ABEND - - .align 5 - -.L_istackfree: - rlwinm. r0,r1,0,28,31 ; Check if stack is aligned (and get 0) - lwz r10,SAVflags(r4) ; Get savearea flags - bne-- intUnalignedStk ; Stack is unaligned... - stw r0,PP_ISTACKPTR(r25) ; Mark the stack in use - oris r10,r10,hi16(SAVrststk) ; Indicate we reset stack when we return from this one - stw r10,SAVflags(r4) ; Stick it back - -/* - * To summarize, when we reach here, the state has been saved and - * the stack is marked as busy. 
We now generate a small - * stack frame with backpointers to follow the calling - * conventions. We set up the backpointers to the trapped - * routine allowing us to backtrace. - */ - -ihsetback: subi r1,r1,FM_SIZE ; Make a new frame - stw r9,FM_BACKPTR(r1) ; Point back to previous stackptr - -#if VERIFYSAVE - beq- cr1,ihbootnover ; (TEST/DEBUG) - bl versave ; (TEST/DEBUG) -ihbootnover: ; (TEST/DEBUG) -#endif - -#if DEBUG -/* If debugging, we need two frames, the first being a dummy - * which links back to the trapped routine. The second is - * that which the C routine below will need - */ - lwz r5,savesrr0+4(r4) ; Get interrupt address - stw r5,FM_LR_SAVE(r1) ; save old instr ptr as LR value - stwu r1,-FM_SIZE(r1) ; Make another new frame for C routine -#endif /* DEBUG */ - - mr r31,r3 - mr r30,r4 - - lwz r3,SAVtime(r4) - lwz r4,SAVtime+4(r4) - addi r5,r25,PP_PROCESSOR - lwz r5,KERNEL_TIMER(r5) - bl EXT(thread_timer_event) - addi r6,r25,PP_PROCESSOR - lwz r5,CURRENT_STATE(r6) - addi r7,r6,USER_STATE - cmplw r5,r7 - bne 0f - addi r5,r6,SYSTEM_STATE - bl EXT(state_event) -0: - - lwz r7,ACT_TASK(r13) - lwz r8,TASK_VTIMERS(r7) - cmpwi r8,0 - beq++ 0f - - lwz r7,ACT_PER_PROC(r13) - li r4,AST_BSD - lwz r8,PP_PENDING_AST(r7) - or r8,r8,r4 - stw r8,PP_PENDING_AST(r7) - addi r3,r13,ACT_AST - bl EXT(hw_atomic_or) -0: - - mr r3,r31 - mr r4,r30 - lwz r5,savedsisr(r30) ; Get the DSISR - lwz r6,savedar+4(r30) ; Get the DAR - -#if FPFLOOD - stfd f31,emfp31(r25) ; (TEST/DEBUG) -#endif - - bl EXT(interrupt) - -/* interrupt() returns a pointer to the saved state in r3 */ - - lis r10,hi16(MASK(MSR_VEC)) ; Get the vector enable - mfmsr r0 ; Get our MSR - ori r10,r10,lo16(MASK(MSR_FP)|MASK(MSR_EE)) ; Add in FP and EE - andc r0,r0,r10 ; Turn off VEC, FP, and EE - mtmsr r0 ; Make sure interrupts are disabled - mfsprg r8,1 ; Get the current activation - lwz r10,ACT_PER_PROC(r8) ; Get the per_proc block - - lwz r7,SAVflags(r3) ; Pick up the flags - lwz r9,SAVprev+4(r3) ; Get previous save 
area - cmplwi cr1,r8,0 ; Are we still initializing? - lwz r12,savesrr1+4(r3) ; Get the MSR we will load on return - andis. r11,r7,hi16(SAVrststk) ; Is this the first on the stack? - stw r9,ACT_MACT_PCB(r8) ; Point to previous context savearea - mr r4,r3 ; Move the savearea pointer - beq .L_no_int_ast2 ; Get going if not the top-o-stack... - - -/* We're the last frame on the stack. Restore istackptr to empty state. - * - * Check for ASTs if one of the below is true: - * returning to user mode - * returning to a kloaded server - */ - lwz r9,PP_INTSTACK_TOP_SS(r10) ; Get the empty stack value - andc r7,r7,r11 ; Remove the stack reset bit in case we pass this one - stw r9,PP_ISTACKPTR(r10) ; Save that saved state ptr - lwz r3,ACT_PREEMPT_CNT(r8) ; Get preemption level - stw r7,SAVflags(r4) ; Save the flags - cmplwi r3, 0 ; Check for preemption - bne .L_no_int_ast ; Do not preempt if level is not zero - andi. r6,r12,MASK(MSR_PR) ; privilege mode - lwz r11,PP_PENDING_AST(r10) ; Get the pending AST mask - beq- .L_kernel_int_ast ; In kernel space, AST_URGENT check - li r3,T_AST ; Assume the worst - mr. r11,r11 ; Are there any pending? - beq .L_no_int_ast ; Nope... - b .L_call_thandler - -.L_kernel_int_ast: - andi. r11,r11,AST_URGENT ; Do we have AST_URGENT? - li r3,T_PREEMPT ; Assume the worst - beq .L_no_int_ast ; Nope... - -/* - * There is a pending AST. Massage things to make it look like - * we took a trap and jump into the trap handler. To do this - * we essentially pretend to return from the interrupt but - * at the last minute jump into the trap handler with an AST - * trap instead of performing an rfi. - */ - -.L_call_thandler: - stw r3,saveexception(r4) ; Set the exception code to T_AST/T_PREEMPT - b EXT(thandler) ; We need to preempt so treat like a trap... 
- -.L_no_int_ast: - mr r3,r4 ; Get into the right register for common code - -.L_no_int_ast2: - rlwinm r7,r7,0,15,13 ; Clear the syscall flag - li r4,0 ; Assume for a moment that we are in init - stw r7,SAVflags(r3) ; Set the flags with cleared syscall flag - beq-- cr1,chkfac ; Jump away if we are in init... - - lwz r4,ACT_MACT_PCB(r8) ; Get the new level marker - - -; -; This section is common to all exception exits. It throws away vector -; and floating point saveareas as the exception level of a thread is -; exited. -; -; It also enables the facility if its context is live -; Requires: -; R3 = Savearea to be released (virtual) -; R4 = New top of savearea stack (could be 0) -; R8 = pointer to activation -; R10 = per_proc block -; -; Note that barring unforseen crashes, there is no escape from this point -; on. We WILL call exception_exit and launch this context. No worries -; about preemption or interruptions here. -; -; Note that we will set up R26 with whatever context we will be launching, -; so it will indicate the current, or the deferred it it is set and we -; are going to user state. CR2_eq will be set to indicate deferred. -; - -chkfac: lwz r29,savesrr1+4(r3) ; Get the current MSR - mr. r28,r8 ; Are we still in boot? - mr r31,r10 ; Move per_proc address - mr r30,r4 ; Preserve new level - mr r27,r3 ; Save the old level - beq-- chkenax ; Yeah, skip it all... - - rlwinm. r0,r29,0,MSR_PR_BIT,MSR_PR_BIT ; Are we going into user state? - - lwz r20,curctx(r28) ; Get our current context - lwz r26,deferctx(r28) ; Get any deferred context switch - li r0,1 ; Get set to hold off quickfret - rlwinm r29,r29,0,MSR_FP_BIT+1,MSR_FP_BIT-1 ; Turn off floating point for now - lwz r21,FPUlevel(r20) ; Get the facility level - cmplwi cr2,r26,0 ; Are we going into a deferred context later? 
- rlwinm r29,r29,0,MSR_VEC_BIT+1,MSR_VEC_BIT-1 ; Turn off vector for now - crnor cr2_eq,cr0_eq,cr2_eq ; Set cr2_eq if going to user state and there is deferred - lhz r19,PP_CPU_NUMBER(r31) ; Get our CPU number - cmplw r27,r21 ; Are we returning from the active level? - stw r0,holdQFret(r31) ; Make sure we hold off releasing quickfret - bne++ fpuchkena ; Nope... - -; -; First clean up any live context we are returning from -; - - lwz r22,FPUcpu(r20) ; Get CPU this context was last dispatched on - - stw r19,FPUcpu(r20) ; Claim context for us - - eieio ; Make sure this gets out before owner clear - -#if ppeSize != 16 -#error per_proc_entry is not 16bytes in size -#endif - - lis r23,hi16(EXT(PerProcTable)) ; Set base PerProcTable - slwi r22,r22,4 ; Find offset to the owner per_proc_entry - ori r23,r23,lo16(EXT(PerProcTable)) ; Set base PerProcTable - li r24,FPUowner ; Displacement to float owner - add r22,r23,r22 ; Point to the owner per_proc_entry - lwz r22,ppe_vaddr(r22) ; Point to the owner per_proc - -fpuinvothr: lwarx r23,r24,r22 ; Get the owner - - sub r0,r23,r20 ; Subtract one from the other - sub r21,r20,r23 ; Subtract the other from the one - or r21,r21,r0 ; Combine them - srawi r21,r21,31 ; Get a 0 if equal or -1 of not - and r23,r23,r21 ; Make 0 if same, unchanged if not - stwcx. r23,r24,r22 ; Try to invalidate it - bne-- fpuinvothr ; Try again if there was a collision... - - isync - -; -; Now if there is a savearea associated with the popped context, release it. -; Either way, pop the level to the top stacked context. -; - - lwz r22,FPUsave(r20) ; Get pointer to the first savearea - li r21,0 ; Assume we popped all the way out - mr. r22,r22 ; Is there anything there? - beq++ fpusetlvl ; No, see if we need to enable... - - lwz r21,SAVlevel(r22) ; Get the level of that savearea - cmplw r21,r27 ; Is this the saved copy of the live stuff? - bne fpusetlvl ; No, leave as is... 
- - lwz r24,SAVprev+4(r22) ; Pick up the previous area - li r21,0 ; Assume we popped all the way out - mr. r24,r24 ; Any more context stacked? - beq-- fpuonlyone ; Nope... - lwz r21,SAVlevel(r24) ; Get the level associated with save - -fpuonlyone: stw r24,FPUsave(r20) ; Dequeue this savearea - - rlwinm r3,r22,0,0,19 ; Find main savearea header - - lwz r8,quickfret(r31) ; Get the first in quickfret list (top) - lwz r9,quickfret+4(r31) ; Get the first in quickfret list (bottom) - lwz r2,SACvrswap(r3) ; Get the virtual to real conversion (top) - lwz r3,SACvrswap+4(r3) ; Get the virtual to real conversion (bottom) - stw r8,SAVprev(r22) ; Link the old in (top) - stw r9,SAVprev+4(r22) ; Link the old in (bottom) - xor r3,r22,r3 ; Convert to physical - stw r2,quickfret(r31) ; Set the first in quickfret list (top) - stw r3,quickfret+4(r31) ; Set the first in quickfret list (bottom) - -#if FPVECDBG - lis r0,HIGH_ADDR(CutTrace) ; (TEST/DEBUG) - li r2,0x3301 ; (TEST/DEBUG) - oris r0,r0,LOW_ADDR(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) -#endif - -fpusetlvl: stw r21,FPUlevel(r20) ; Save the level - -; -; Here we check if we are at the right level -; We need to check the level we are entering, not the one we are exiting. -; Therefore, we will use the defer level if it is non-zero and we are -; going into user state. -; - -fpuchkena: bt-- cr2_eq,fpuhasdfrd ; Skip if deferred, R26 already set up... - mr r26,r20 ; Use the non-deferred value - -fpuhasdfrd: -#if 0 - rlwinm. r0,r29,0,MSR_PR_BIT,MSR_PR_BIT ; (TEST/DEBUG) Going into user state? - beq fpunusrstt ; (TEST/DEBUG) Nope... - lwz r23,FPUlevel(r26) ; (TEST/DEBUG) Get the level ID - lwz r24,FPUsave(r26) ; (TEST/DEBUG) Get the first savearea - mr. r23,r23 ; (TEST/DEBUG) Should be level 0 - beq++ fpulvl0 ; (TEST/DEBUG) Yes... - - lis r0,hi16(Choke) ; (TEST/DEBUG) Choke code - ori r0,r0,lo16(Choke) ; (TEST/DEBUG) and the rest - sc ; (TEST/DEBUG) System ABEND - -fpulvl0: mr. r24,r24 ; (TEST/DEBUG) Any context? 
- beq fpunusrstt ; (TEST/DEBUG) No... - lwz r23,SAVlevel(r24) ; (TEST/DEBUG) Get level of context - lwz r21,SAVprev+4(r24) ; (TEST/DEBUG) Get previous pointer - mr. r23,r23 ; (TEST/DEBUG) Is this our user context? - beq++ fpulvl0b ; (TEST/DEBUG) Yes... - - lis r0,hi16(Choke) ; (TEST/DEBUG) Choke code - ori r0,r0,lo16(Choke) ; (TEST/DEBUG) and the rest - sc ; (TEST/DEBUG) System ABEND - -fpulvl0b: mr. r21,r21 ; (TEST/DEBUG) Is there a forward chain? - beq++ fpunusrstt ; (TEST/DEBUG) Nope... - - lis r0,hi16(Choke) ; (TEST/DEBUG) Choke code - ori r0,r0,lo16(Choke) ; (TEST/DEBUG) and the rest - sc ; (TEST/DEBUG) System ABEND - -fpunusrstt: ; (TEST/DEBUG) -#endif - - lwz r21,FPUowner(r31) ; Get the ID of the live context - lwz r23,FPUlevel(r26) ; Get the level ID - lwz r24,FPUcpu(r26) ; Get the CPU that the context was last dispatched on - cmplw cr3,r26,r21 ; Do we have the live context? - cmplw r30,r23 ; Are we about to launch the live level? - bne-- cr3,chkvec ; No, can not possibly enable... - cmplw cr1,r19,r24 ; Was facility used on this processor last? - bne-- chkvec ; No, not live... - bne-- cr1,chkvec ; No, wrong cpu, have to enable later.... - - lwz r24,FPUsave(r26) ; Get the first savearea - mr. r24,r24 ; Any savearea? - beq++ fpuena ; Nope... - lwz r25,SAVlevel(r24) ; Get the level of savearea - lwz r0,SAVprev+4(r24) ; Get the previous - - cmplw r30,r25 ; Is savearea for the level we are launching? - bne++ fpuena ; No, just go enable... 
- - stw r0,FPUsave(r26) ; Pop the chain - - rlwinm r3,r24,0,0,19 ; Find main savearea header - - lwz r8,quickfret(r31) ; Get the first in quickfret list (top) - lwz r9,quickfret+4(r31) ; Get the first in quickfret list (bottom) - lwz r2,SACvrswap(r3) ; Get the virtual to real conversion (top) - lwz r3,SACvrswap+4(r3) ; Get the virtual to real conversion (bottom) - stw r8,SAVprev(r24) ; Link the old in (top) - stw r9,SAVprev+4(r24) ; Link the old in (bottom) - xor r3,r24,r3 ; Convert to physical - stw r2,quickfret(r31) ; Set the first in quickfret list (top) - stw r3,quickfret+4(r31) ; Set the first in quickfret list (bottom) - -#if FPVECDBG - lis r0,HIGH_ADDR(CutTrace) ; (TEST/DEBUG) - li r2,0x3302 ; (TEST/DEBUG) - oris r0,r0,LOW_ADDR(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) -#endif - -fpuena: ori r29,r29,lo16(MASK(MSR_FP)) ; Enable facility - -chkvec: - - lwz r21,VMXlevel(r20) ; Get the facility level - - cmplw r27,r21 ; Are we returning from the active level? - bne+ vmxchkena ; Nope... - - -; -; First clean up any live context we are returning from -; - - lwz r22,VMXcpu(r20) ; Get CPU this context was last dispatched on - - stw r19,VMXcpu(r20) ; Claim context for us - - eieio ; Make sure this gets out before owner clear - - lis r23,hi16(EXT(PerProcTable)) ; Set base PerProcTable - slwi r22,r22,4 ; Find offset to the owner per_proc_entry - ori r23,r23,lo16(EXT(PerProcTable)) ; Set base PerProcTable - li r24,VMXowner ; Displacement to float owner - add r22,r23,r22 ; Point to the owner per_proc_entry - lwz r22,ppe_vaddr(r22) ; Point to the owner per_proc - -vmxinvothr: lwarx r23,r24,r22 ; Get the owner - - sub r0,r23,r20 ; Subtract one from the other - sub r21,r20,r23 ; Subtract the other from the one - or r21,r21,r0 ; Combine them - srawi r21,r21,31 ; Get a 0 if equal or -1 of not - and r23,r23,r21 ; Make 0 if same, unchanged if not - stwcx. r23,r24,r22 ; Try to invalidate it - bne-- vmxinvothr ; Try again if there was a collision... 
- - isync - -; -; Now if there is a savearea associated with the popped context, release it. -; Either way, pop the level to the top stacked context. -; - - lwz r22,VMXsave(r20) ; Get pointer to the first savearea - li r21,0 ; Assume we popped all the way out - mr. r22,r22 ; Is there anything there? - beq++ vmxsetlvl ; No, see if we need to enable... - - lwz r21,SAVlevel(r22) ; Get the level of that savearea - cmplw r21,r27 ; Is this the saved copy of the live stuff? - bne vmxsetlvl ; No, leave as is... - - lwz r24,SAVprev+4(r22) ; Pick up the previous area - li r21,0 ; Assume we popped all the way out - mr. r24,r24 ; Any more context? - beq-- vmxonlyone ; Nope... - lwz r21,SAVlevel(r24) ; Get the level associated with save - -vmxonlyone: stw r24,VMXsave(r20) ; Dequeue this savearea - - rlwinm r3,r22,0,0,19 ; Find main savearea header - - lwz r8,quickfret(r31) ; Get the first in quickfret list (top) - lwz r9,quickfret+4(r31) ; Get the first in quickfret list (bottom) - lwz r2,SACvrswap(r3) ; Get the virtual to real conversion (top) - lwz r3,SACvrswap+4(r3) ; Get the virtual to real conversion (bottom) - stw r8,SAVprev(r22) ; Link the old in (top) - stw r9,SAVprev+4(r22) ; Link the old in (bottom) - xor r3,r22,r3 ; Convert to physical - stw r2,quickfret(r31) ; Set the first in quickfret list (top) - stw r3,quickfret+4(r31) ; Set the first in quickfret list (bottom) - -#if FPVECDBG - lis r0,HIGH_ADDR(CutTrace) ; (TEST/DEBUG) - li r2,0x3401 ; (TEST/DEBUG) - oris r0,r0,LOW_ADDR(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) -#endif - -vmxsetlvl: stw r21,VMXlevel(r20) ; Save the level - -; -; Here we check if we are at the right level -; - -vmxchkena: lwz r21,VMXowner(r31) ; Get the ID of the live context - lwz r23,VMXlevel(r26) ; Get the level ID - cmplw r26,r21 ; Do we have the live context? - lwz r24,VMXcpu(r26) ; Get the CPU that the context was last dispatched on - bne-- setena ; No, can not possibly enable... - cmplw r30,r23 ; Are we about to launch the live level? 
- cmplw cr1,r19,r24 ; Was facility used on this processor last? - bne-- setena ; No, not live... - bne-- cr1,setena ; No, wrong cpu, have to enable later.... - - lwz r24,VMXsave(r26) ; Get the first savearea - mr. r24,r24 ; Any savearea? - beq++ vmxena ; Nope... - lwz r25,SAVlevel(r24) ; Get the level of savearea - lwz r0,SAVprev+4(r24) ; Get the previous - cmplw r30,r25 ; Is savearea for the level we are launching? - bne++ vmxena ; No, just go enable... - - stw r0,VMXsave(r26) ; Pop the chain - - rlwinm r3,r24,0,0,19 ; Find main savearea header - - lwz r8,quickfret(r31) ; Get the first in quickfret list (top) - lwz r9,quickfret+4(r31) ; Get the first in quickfret list (bottom) - lwz r2,SACvrswap(r3) ; Get the virtual to real conversion (top) - lwz r3,SACvrswap+4(r3) ; Get the virtual to real conversion (bottom) - stw r8,SAVprev(r24) ; Link the old in (top) - stw r9,SAVprev+4(r24) ; Link the old in (bottom) - xor r3,r24,r3 ; Convert to physical - stw r2,quickfret(r31) ; Set the first in quickfret list (top) - stw r3,quickfret+4(r31) ; Set the first in quickfret list (bottom) - -#if FPVECDBG - lis r0,HIGH_ADDR(CutTrace) ; (TEST/DEBUG) - li r2,0x3402 ; (TEST/DEBUG) - oris r0,r0,LOW_ADDR(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) -#endif - -vmxena: oris r29,r29,hi16(MASK(MSR_VEC)) ; Enable facility - -setena: lwz r18,umwSpace(r28) ; Get the space ID in case we are launching user - rlwinm. r0,r29,0,MSR_PR_BIT,MSR_PR_BIT ; Are we about to launch user state? - li r0,0 ; Get set to release quickfret holdoff - crmove cr7_eq,cr0_eq ; Remember if we are going to user state - rlwimi. 
r20,r29,(((31-floatCngbit)+(MSR_FP_BIT+1))&31),floatCngbit,floatCngbit ; Set flag if we enabled floats - lwz r19,deferctx(r28) ; Get any deferred facility context switch - rlwinm r20,r29,(((31-vectorCngbit)+(MSR_VEC_BIT+1))&31),vectorCngbit,vectorCngbit ; Set flag if we enabled vector - stw r29,savesrr1+4(r27) ; Turn facility on or off - stw r0,holdQFret(r31) ; Release quickfret - oris r18,r18,hi16(umwSwitchAway) ; Set the switch-away bit in case we go to user - - beq setenaa ; Neither float nor vector turned on.... - - lwz r5,ACT_MACT_SPF(r28) ; Get activation copy - lwz r6,spcFlags(r31) ; Get per_proc copy - or r5,r5,r20 ; Set vector/float changed bits in activation - or r6,r6,r20 ; Set vector/float changed bits in per_proc - stw r5,ACT_MACT_SPF(r28) ; Set activation copy - stw r6,spcFlags(r31) ; Set per_proc copy - -setenaa: mfdec r24 ; Get decrementer - bf+ cr2_eq,nodefer ; No deferred to switch to... - - li r20,0 ; Clear this - stw r26,curctx(r28) ; Make the facility context current - stw r20,deferctx(r28) ; Clear deferred context - -nodefer: lwz r22,qactTimer(r28) ; Get high order quick activation timer - mr. r24,r24 ; See if it has popped already... - lwz r23,qactTimer+4(r28) ; Get low order qact timer - ble- chkifuser ; We have popped or are just about to... - -segtb: mftbu r20 ; Get the upper time base - mftb r21 ; Get the low - mftbu r19 ; Get upper again - or. r0,r22,r23 ; Any time set? - cmplw cr1,r20,r19 ; Did they change? - beq++ chkifuser ; No time set.... - bne-- cr1,segtb ; Timebase ticked, get them again... - - subfc r6,r21,r23 ; Subtract current from qact time - li r0,0 ; Make a 0 - subfe r5,r20,r22 ; Finish subtract - subfze r0,r0 ; Get a 0 if qact was bigger than current, -1 otherwise - andc. r12,r5,r0 ; Set 0 if qact has passed - andc r13,r6,r0 ; Set 0 if qact has passed - bne chkifuser ; If high order is non-zero, this is too big for a decrementer - cmplw r13,r24 ; Is this earlier than the decrementer? 
(logical compare takes care of high bit on) - bge++ chkifuser ; No, do not reset decrementer... - - mtdec r13 ; Set our value - -chkifuser: bl EXT(mach_absolute_time) - lwz r5,ACT_PER_PROC(r28) - addi r6,r5,PP_PROCESSOR - lwz r5,KERNEL_TIMER(r6) - lwz r29,CURRENT_STATE(r6) - beq-- cr7,chkifuser1 ; Skip this if we are going to kernel... - stw r18,umwSpace(r28) ; Half-invalidate to force MapUserAddressWindow to reload SRs - addi r5,r28,USER_TIMER - addi r29,r6,USER_STATE - -chkifuser1: bl EXT(thread_timer_event) - mr r5,r29 - bl EXT(state_event) - -chkenax: - -#if DEBUG - lwz r20,SAVact(r27) ; (TEST/DEBUG) Make sure our restore - mfsprg r21, 1 ; (TEST/DEBUG) with the current act. - cmpwi r21,0 ; (TEST/DEBUG) - beq-- yeswereok ; (TEST/DEBUG) - cmplw r21,r20 ; (TEST/DEBUG) - beq++ yeswereok ; (TEST/DEBUG) - - lis r0,hi16(Choke) ; (TEST/DEBUG) Choke code - ori r0,r0,lo16(Choke) ; (TEST/DEBUG) and the rest - mr r21,r27 ; (TEST/DEBUG) Save the savearea address - li r3,failContext ; (TEST/DEBUG) Bad state code - sc ; (TEST/DEBUG) System ABEND - -yeswereok: -#endif - - mr r3,r27 ; Pass savearea back - b EXT(exception_exit) ; We are all done now... - - - -; -; Null PPC call - performance testing, does absolutely nothing -; - - .align 5 - - .globl EXT(ppcNull) - -LEXT(ppcNull) - - li r3,-1 ; Make sure we test no asts - blr - - -; -; Instrumented null PPC call - performance testing, does absolutely nothing -; Forces various timestamps to be returned. -; - - .align 5 - - .globl EXT(ppcNullinst) - -LEXT(ppcNullinst) - - li r3,-1 ; Make sure we test no asts - blr - - -/* - * Here's where we handle the fastpath stuff - * We'll do what we can here because registers are already - * loaded and it will be less confusing that moving them around. - * If we need to though, we'll branch off somewhere's else. 
- * - * Registers when we get here: - * - * r0 = syscall number - * r4 = savearea/pcb - * r13 = activation - * r14 = previous savearea (if any) - * r16 = thread - * r25 = per_proc - */ - - .align 5 - -fastpath: cmplwi cr3,r0,0x7FF5 ; Is this a null fastpath? - beq-- cr3,fastexutl ; Yes, bail fast... - cmplwi cr3,r0,0x7FF1 ; Is it CthreadSetSelfNumber? - bnelr-- cr3 ; Not a fast path... - -/* - * void cthread_set_self(cproc_t p) - * - * Set's thread state "user_value". In practice this is the thread-local-data-pointer (TLDP), - * though we do not interpret it. This call is mostly used by 32-bit tasks, but we save all 64 bits - * in case a 64-bit task wants to use this facility. They normally do not, because the 64-bit - * ABI reserves r13 for the TLDP. - * - * This op is invoked as follows: - * li r0, CthreadSetSelfNumber // load the fast-trap number - * sc // invoke fast-trap - * blr - */ - -CthreadSetSelfNumber: - lwz r3,saver3+0(r4) /* get the TLDP passed in r3 */ - lwz r5,saver3+4(r4) /* (all 64 bits, in case this is a 64-bit task) */ - stw r3,CTHREAD_SELF+0(r13) /* Remember it in the activation... */ - stw r5,CTHREAD_SELF+4(r13) - stw r3,UAW+0(r25) /* ...and in the per-proc */ - stw r5,UAW+4(r25) - - - .globl EXT(fastexit) -EXT(fastexit): -fastexutl: mr r3,r4 ; Pass back savearea - b EXT(exception_exit) ; Go back to the caller... - - -/* - * Here's where we check for a hit on the Blue Box Assist - * Most registers are non-volatile, so be careful here. If we don't - * recognize the trap instruction we go back for regular processing. - * Otherwise we transfer to the assist code. 
- */ - - .align 5 - -checkassist: - lwz r0,saveexception(r4) ; Get the exception code - lwz r23,savesrr1+4(r4) ; Get the interrupted MSR - lwz r26,ACT_MACT_BEDA(r13) ; Get Blue Box Descriptor Area - mtcrf 0x18,r23 ; Check what SRR1 says - lwz r24,ACT_MACT_BTS(r13) ; Get the table start - cmplwi r0,T_AST ; Check for T_AST trap - lwz r27,savesrr0+4(r4) ; Get trapped address - crnand cr1_eq,SRR1_PRG_TRAP_BIT,MSR_PR_BIT ; We need both trap and user state - sub r24,r27,r24 ; See how far into it we are - cror cr0_eq,cr0_eq,cr1_eq ; Need to bail if AST or not trap or not user state - cmplwi cr1,r24,BB_MAX_TRAP ; Do we fit in the list? - cror cr0_eq,cr0_eq,cr1_gt ; Also leave it trap not in range - btlr- cr0_eq ; No assist if AST or not trap or not user state or trap not in range - b EXT(atomic_switch_trap) ; Go to the assist... - -; -; Virtual Machine Monitor -; Here is where we exit from the emulated context -; Note that most registers get trashed here -; R3 and R30 are preserved across the call and hold the activation -; and savearea respectivily. -; - - .align 5 - -exitFromVM: mr r30,r4 ; Get the savearea - mr r3,r13 ; Get the activation - - b EXT(vmm_exit) ; Do it to it - - .align 5 - .globl EXT(retFromVM) - -LEXT(retFromVM) - mfsprg r10,1 ; Get the current activation - lwz r10,ACT_PER_PROC(r10) ; Get the per_proc block - mr r8,r3 ; Get the activation - lwz r4,SAVprev+4(r30) ; Pick up the previous savearea - mr r3,r30 ; Put savearea in proper register for common code - lwz r11,SAVflags(r30) ; Get the flags of the current savearea - rlwinm r11,r11,0,15,13 ; Clear the syscall flag - mr r1,r8 - stw r11,SAVflags(r3) ; Save back the flags (with reset stack cleared) - - stw r4,ACT_MACT_PCB(r8) ; Point to the previous savearea (or 0 if none) - - lwz r5,THREAD_KERNEL_STACK(r1) ; Get the base pointer to the stack - addi r5,r5,KERNEL_STACK_SIZE-FM_SIZE ; Reset to empty - stw r5,ACT_MACT_KSP(r8) ; Save the empty stack pointer - b chkfac ; Go end it all... 
- - -; -; chandler (note: not a candle maker or tallow merchant) -; -; Here is the system choke handler. This is where the system goes -; to die. -; -; We get here as a result of a T_CHOKE exception which is generated -; by the Choke firmware call or by lowmem_vectors when it detects a -; fatal error. Examples of where this may be used is when we detect -; problems in low-level mapping chains, trashed savearea free chains, -; or stack guardpage violations. -; -; Note that we can not set a back chain in the stack when we come -; here because we are probably here because the chain was corrupt. -; - - - .align 5 - .globl EXT(chandler) -LEXT(chandler) ; Choke handler - - li r31,0 ; Get a 0 - mfsprg r25,1 ; Get the current activation - lwz r25,ACT_PER_PROC(r25) ; Get the per_proc block - stw r31,traceMask(0) ; Force tracing off right now - - - - lwz r1,PP_DEBSTACKPTR(r25) ; Get debug stack pointer - cmpwi r1,-1 ; Are we already choking? - bne chokefirst ; Nope... - -chokespin: addi r31,r31,1 ; Spin and hope for an analyzer connection... - addi r31,r31,1 ; Spin and hope for an analyzer connection... - addi r31,r31,1 ; Spin and hope for an analyzer connection... - addi r31,r31,1 ; Spin and hope for an analyzer connection... - addi r31,r31,1 ; Spin and hope for an analyzer connection... - addi r31,r31,1 ; Spin and hope for an analyzer connection... - b chokespin ; Spin and hope for an analyzer connection... - -chokefirst: li r0,-1 ; Set choke value - mr. r1,r1 ; See if we are on debug stack yet - lwz r10,saver1+4(r4) ; - stw r0,PP_DEBSTACKPTR(r25) ; Show we are choking - bne chokestart ; We are not on the debug stack yet... - - lwz r2,PP_DEBSTACK_TOP_SS(r25) ; Get debug stack top - sub r11,r2,r10 ; Get stack depth - - cmplwi r11,KERNEL_STACK_SIZE-FM_SIZE-TRAP_SPACE_NEEDED ; Check if stack pointer is ok - bgt chokespin ; Bad stack pointer or too little left, just die... 
- - subi r1,r10,FM_REDZONE ; Make a red zone - -chokestart: li r0,0 ; Get a zero - stw r0,FM_BACKPTR(r1) ; We now have terminated the back chain - - bl EXT(SysChoked) ; Call the "C" phase of this - b chokespin ; Should not be here so just go spin... - - -#if VERIFYSAVE -; -; Savearea chain verification -; - -versave: -#if 0 - lis r22,hi16(EXT(DebugWork)) ; (TEST/DEBUG) - ori r22,r22,lo16(EXT(DebugWork)) ; (TEST/DEBUG) - lwz r23,0(r22) ; (TEST/DEBUG) - mr. r23,r23 ; (TEST/DEBUG) - beqlr- ; (TEST/DEBUG) - mfsprg r20,1 ; Get the current activation - lwz r20,ACT_PER_PROC(r20) ; Get the per_proc block - lwz r21,pfAvailable(r20) ; (TEST/DEBUG) - mr. r21,r21 ; (TEST/DEBUG) - bnelr+ ; (TEST/DEBUG) - - stw r22,0(r22) ; (TEST/DEBUG) Lock out more checks - BREAKPOINT_TRAP ; (TEST/DEBUG) Get into debugger -#endif - -#if 0 - ;; This code is broken and migration will make the matter even worse -; -; Make sure that all savearea chains have the right type on them -; - - lis r28,hi16(EXT(default_pset)) ; (TEST/DEBUG) - lis r27,hi16(EXT(DebugWork)) ; (TEST/DEBUG) - ori r28,r28,lo16(EXT(default_pset)) ; (TEST/DEBUG) - ori r27,r27,lo16(EXT(DebugWork)) ; (TEST/DEBUG) - li r20,0 ; (TEST/DEBUG) - lwz r26,0(r27) ; (TEST/DEBUG) - lwz r27,psthreadcnt(r28) ; (TEST/DEBUG) - mr. r26,r26 ; (TEST/DEBUG) Have we locked the test out? - lwz r28,psthreads(r28) ; (TEST/DEBUG) - mflr r31 ; (TEST/DEBUG) Save return - bnelr- ; (TEST/DEBUG) Test already triggered, skip... - b fckgo ; (TEST/DEBUG) Join up... - -fcknext: mr. r27,r27 ; (TEST/DEBUG) Any more threads? - bne+ fckxxx ; (TEST/DEBUG) Yes... - - mtlr r31 ; (TEST/DEBUG) Restore return - blr ; (TEST/DEBUG) Leave... 
- -fckxxx: lwz r28,THREAD_PSTHRN(r28) ; (TEST/DEBUG) Get next thread - -fckgo: subi r27,r27,1 ; (TEST/DEBUG) Decrement thread count - lwz r24,THREAD_TOP_ACT(r28) ; (TEST/DEBUG) Get activation for the thread - lwz r20,ACT_MACT_PCB(r24) ; (TEST/DEBUG) Get the normal context - li r21,SAVgeneral ; (TEST/DEBUG) Make sure this is all general context - bl versavetype ; (TEST/DEBUG) Check the chain - - lwz r20,facctx+FPUsave(r24) ; (TEST/DEBUG) Get regular floating point - li r21,SAVfloat ; (TEST/DEBUG) Make sure this is all floating point - bl versavetype ; (TEST/DEBUG) Check the chain - - lwz r20,facctx+VMXsave(r24) ; (TEST/DEBUG) Get regular vector point - li r21,SAVvector ; (TEST/DEBUG) Make sure this is all vector - bl versavetype ; (TEST/DEBUG) Check the chain - - lwz r29,vmmControl(r24) ; (TEST/DEBUG) Get the virtual machine control blocks - mr. r29,r29 ; (TEST/DEBUG) Are there any? - beq+ fcknext ; (TEST/DEBUG) Nope, next thread... - - li r22,kVmmMaxContextsPerThread ; (TEST/DEBUG) Get the number of control blocks - subi r29,r29,vmmCEntrySize ; (TEST/DEBUG) Get running start - -fcknvmm: subi r22,r22,1 ; (TEST/DEBUG) Do all of them - mr. r22,r22 ; (TEST/DEBUG) Are we all done? - addi r29,r29,vmmCEntrySize ; (TEST/DEBUG) Get the next entry - blt- fcknext ; (TEST/DEBUG) Yes, check next thread... - - lwz r23,vmmFlags(r29) ; (TEST/DEBUG) Get entry flags - rlwinm. r23,r23,0,0,0 ; (TEST/DEBUG) Is this in use? - beq+ fcknvmm ; (TEST/DEBUG) Not in use... - - lwz r20,vmmFacCtx+FPUsave(r29) ; (TEST/DEBUG) Get regular floating point - li r21,SAVfloat ; (TEST/DEBUG) Make sure this is all floating point - bl versavetype ; (TEST/DEBUG) Check the chain - - lwz r20,vmmFacCtx+VMXsave(r29) ; (TEST/DEBUG) Get regular vector point - li r21,SAVvector ; (TEST/DEBUG) Make sure this is all vector - bl versavetype ; (TEST/DEBUG) Check the chain - b fcknvmm ; (TEST/DEBUG) Get then vmm block... - -versavetype: - mr. r20,r20 ; (TEST/DEBUG) Chain done? - beqlr- ; (TEST/DEBUG) Yes... 
- - lwz r23,SAVflags(r20) ; (TEST/DEBUG) Get the flags - rlwinm r23,r23,24,24,31 ; (TEST/DEBUG) Position it - cmplw r23,r21 ; (TEST/DEBUG) Are we the correct type? - beq+ versvok ; (TEST/DEBUG) This one is ok... - - lis r22,hi16(EXT(DebugWork)) ; (TEST/DEBUG) - ori r22,r22,lo16(EXT(DebugWork)) ; (TEST/DEBUG) - stw r22,0(r22) ; (TEST/DEBUG) Lock out more checks - BREAKPOINT_TRAP ; (TEST/DEBUG) Get into debugger - -versvok: lwz r20,SAVprev+4(r20) ; (TEST/DEBUG) Get the previous one - b versavetype ; (TEST/DEBUG) Go check its type... -#endif - - -#endif diff --git a/osfmk/ppc/hw_lock.s b/osfmk/ppc/hw_lock.s deleted file mode 100644 index 880bbf6ef..000000000 --- a/osfmk/ppc/hw_lock.s +++ /dev/null @@ -1,2187 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include -#include -#include - - -#include -#if CONFIG_DTRACE - #define LOCKSTAT_LABEL(lab) \ - .data __ASMNL__ \ - .globl lab __ASMNL__ \ - lab: __ASMNL__ \ - .long 9f __ASMNL__ \ - .text __ASMNL__ \ - 9: __ASMNL__ \ - - .globl _dtrace_probe, _lockstat_probemap -#define LOCKSTAT_RECORD(id) \ - lis r6,hi16(_lockstat_probemap) __ASMNL__ \ - ori r6,r6,lo16(_lockstat_probemap) __ASMNL__ \ - lwz r5,4*id(r6) __ASMNL__ \ - mr. r5,r5 __ASMNL__ \ - beqlr-- __ASMNL__ \ - mr r4,r3 __ASMNL__ \ - mr r3,r5 __ASMNL__ \ - li r5,0 __ASMNL__ \ - li r6,0 __ASMNL__ \ - li r7,0 __ASMNL__ \ - li r8,0 __ASMNL__ \ - PROLOG(0) __ASMNL__ \ - bl _dtrace_probe __ASMNL__ \ - EPILOG -#endif - - - -#define STRING ascii - -#define ILK_LOCKED 0x01 -#define WAIT_FLAG 0x02 -#define WANT_UPGRADE 0x04 -#define WANT_EXCL 0x08 -#define PRIV_EXCL 0x8000 - -#define TH_FN_OWNED 0x01 - -# volatile CR bits -#define hwtimeout 20 -#define mlckmiss 21 - -#define RW_DATA 0 - -#define PROLOG(space) \ - stwu r1,-(FM_ALIGN(space)+FM_SIZE)(r1) __ASMNL__ \ - mfcr r2 __ASMNL__ \ - mflr r0 __ASMNL__ \ - stw r3,FM_ARG0(r1) __ASMNL__ \ - stw r11,FM_ARG0+0x04(r1) __ASMNL__ \ - stw r2,(FM_ALIGN(space)+FM_SIZE+FM_CR_SAVE)(r1) __ASMNL__ \ - stw r0,(FM_ALIGN(space)+FM_SIZE+FM_LR_SAVE)(r1) __ASMNL__ - -#define EPILOG \ - lwz r1,0(r1) __ASMNL__ \ - lwz r0,FM_LR_SAVE(r1) __ASMNL__ \ - mtlr r0 __ASMNL__ - -/* - * void hw_lock_init(hw_lock_t) - * - * Initialize a hardware lock. - */ - .align 5 - .globl EXT(hw_lock_init) - -LEXT(hw_lock_init) - - li r0, 0 ; set lock to free == 0 - stw r0, 0(r3) ; Initialize the lock - blr - -/* - * unsigned int hw_lock_bit(hw_lock_t, unsigned int bit, unsigned int timeout) - * - * Try to acquire spin-lock. The second parameter is the bit mask to test and set. - * multiple bits may be set. 
Return success (1) or failure (0). - * Attempt will fail after timeout ticks of the timebase. - */ - .align 5 - .globl EXT(hw_lock_bit) - -LEXT(hw_lock_bit) - - crset hwtimeout ; timeout option - mr r12,r4 ; Load bit mask - mr r4,r5 ; Load timeout value - b lckcomm ; Join on up... - -/* - * void hw_lock_lock(hw_lock_t) - * - * Acquire lock, spinning until it becomes available. - * Return with preemption disabled. - * We will just set a default timeout and jump into the NORMAL timeout lock. - */ - .align 5 - .globl EXT(hw_lock_lock) - -LEXT(hw_lock_lock) - crclr hwtimeout ; no timeout option - li r4,0 ; request default timeout value - li r12,ILK_LOCKED ; Load bit mask - b lckcomm ; Join on up... - -lockDisa: - crset hwtimeout ; timeout option - li r4,0 ; request default timeout value - li r12,ILK_LOCKED ; Load bit mask - b lckcomm ; Join on up... - -/* - * unsigned int hw_lock_to(hw_lock_t, unsigned int timeout) - * - * Try to acquire spin-lock. Return success (1) or failure (0). - * Attempt will fail after timeout ticks of the timebase. - * We try fairly hard to get this lock. We disable for interruptions, but - * reenable after a "short" timeout (128 ticks, we may want to change this). - * After checking to see if the large timeout value (passed in) has expired and a - * sufficient number of cycles have gone by (to insure pending 'rupts are taken), - * we return either in abject failure, or disable and go back to the lock sniff routine. - * If the sniffer finds the lock free, it jumps right up and tries to grab it. - */ - .align 5 - .globl EXT(hw_lock_to) - -LEXT(hw_lock_to) - crset hwtimeout ; timeout option - li r12,ILK_LOCKED ; Load bit mask -lckcomm: - mfsprg r6,1 ; Get the current activation - lwz r5,ACT_PREEMPT_CNT(r6) ; Get the preemption level - addi r5,r5,1 ; Bring up the disable count - stw r5,ACT_PREEMPT_CNT(r6) ; Save it back - mr r5,r3 ; Get the address of the lock - li r8,0 ; Set r8 to zero - -lcktry: lwarx r6,0,r5 ; Grab the lock value - and. 
r3,r6,r12 ; Is it locked? - or r6,r6,r12 ; Set interlock - bne-- lckspin ; Yeah, wait for it to clear... - stwcx. r6,0,r5 ; Try to seize that there durn lock - bne-- lcktry ; Couldn't get it... - li r3,1 ; return true - .globl EXT(hwllckPatch_isync) -LEXT(hwllckPatch_isync) - isync ; Make sure we don't use a speculativily loaded value - blr ; Go on home... - -lckspin: li r6,lgKillResv ; Get killing field - stwcx. r6,0,r6 ; Kill reservation - - mr. r4,r4 ; Test timeout value - bne++ lockspin0 - lis r4,hi16(EXT(LockTimeOut)) ; Get the high part - ori r4,r4,lo16(EXT(LockTimeOut)) ; And the low part - lwz r4,0(r4) ; Get the timeout value -lockspin0: - mr. r8,r8 ; Is r8 set to zero - bne++ lockspin1 ; If yes, first spin attempt - lis r0,hi16(MASK(MSR_VEC)) ; Get vector enable - mfmsr r9 ; Get the MSR value - ori r0,r0,lo16(MASK(MSR_FP)) ; Get FP enable - ori r7,r0,lo16(MASK(MSR_EE)) ; Get EE bit on too - andc r9,r9,r0 ; Clear FP and VEC - andc r7,r9,r7 ; Clear EE as well - mtmsr r7 ; Turn off interruptions - isync ; May have turned off vec and fp here - mftb r8 ; Get timestamp on entry - b lcksniff - -lockspin1: mtmsr r7 ; Turn off interruptions - mftb r8 ; Get timestamp on entry - -lcksniff: lwz r3,0(r5) ; Get that lock in here - and. r3,r3,r12 ; Is it free yet? - beq++ lckretry ; Yeah, try for it again... - - mftb r10 ; Time stamp us now - sub r10,r10,r8 ; Get the elapsed time - cmplwi r10,128 ; Have we been spinning for 128 tb ticks? - blt++ lcksniff ; Not yet... - - mtmsr r9 ; Say, any interrupts pending? - -; The following instructions force the pipeline to be interlocked to that only one -; instruction is issued per cycle. The insures that we stay enabled for a long enough -; time; if it's too short, pending interruptions will not have a chance to be taken - - subi r4,r4,128 ; Back off elapsed time from timeout value - or r4,r4,r4 ; Do nothing here but force a single cycle delay - mr. 
r4,r4 ; See if we used the whole timeout - li r3,0 ; Assume a timeout return code - or r4,r4,r4 ; Do nothing here but force a single cycle delay - - ble-- lckfail ; We failed - b lockspin1 ; Now that we've opened an enable window, keep trying... -lckretry: - mtmsr r9 ; Restore interrupt state - li r8,1 ; Insure that R8 is not 0 - b lcktry -lckfail: ; We couldn't get the lock - bf hwtimeout,lckpanic - li r3,0 ; Set failure return code - blr ; Return, head hanging low... -lckpanic: - mr r4,r5 - mr r5,r3 - lis r3,hi16(lckpanic_str) ; Get the failed lck message - ori r3,r3,lo16(lckpanic_str) ; Get the failed lck message - bl EXT(panic) - BREAKPOINT_TRAP ; We die here anyway - .data -lckpanic_str: - STRINGD "timeout on attempt to acquire lock (0x%08X), value = 0x%08X\n\000" - .text - -/* - * void hw_lock_unlock(hw_lock_t) - * - * Unconditionally release lock. - * Release preemption level. - */ - .align 5 - .globl EXT(hw_lock_unlock) - -LEXT(hw_lock_unlock) - - .globl EXT(hwulckPatch_isync) -LEXT(hwulckPatch_isync) - isync - .globl EXT(hwulckPatch_eieio) -LEXT(hwulckPatch_eieio) - eieio - li r0, 0 ; set lock to free - stw r0, 0(r3) - - b epStart ; Go enable preemption... - -/* - * unsigned int hw_unlock_bit(hw_lock_t, unsigned int bit) - * - * Release bit based spin-lock. The second parameter is the bit mask to clear. - * Multiple bits may be cleared. - * - */ - .align 5 - .globl EXT(hw_unlock_bit) - -LEXT(hw_unlock_bit) - - .globl EXT(hwulckbPatch_isync) -LEXT(hwulckbPatch_isync) - isync - .globl EXT(hwulckbPatch_eieio) -LEXT(hwulckbPatch_eieio) - eieio -ubittry: lwarx r0,0,r3 ; Grab the lock value - andc r0,r0,r4 ; Clear the lock bits - stwcx. r0,0,r3 ; Try to clear that there durn lock - bne- ubittry ; Try again, couldn't save it... - - b epStart ; Go enable preemption... - -/* - * unsigned int hw_lock_mbits(hw_lock_t, unsigned int bits, unsigned int value, - * unsigned int newb, unsigned int timeout) - * - * Try to acquire spin-lock. 
The second parameter is the bit mask to check. - * The third is the value of those bits and the 4th is what to set them to. - * Return success (1) or failure (0). - * Attempt will fail after timeout ticks of the timebase. - * We try fairly hard to get this lock. We disable for interruptions, but - * reenable after a "short" timeout (128 ticks, we may want to shorten this). - * After checking to see if the large timeout value (passed in) has expired and a - * sufficient number of cycles have gone by (to insure pending 'rupts are taken), - * we return either in abject failure, or disable and go back to the lock sniff routine. - * If the sniffer finds the lock free, it jumps right up and tries to grab it. - */ - .align 5 - .globl EXT(hw_lock_mbits) - -LEXT(hw_lock_mbits) - - li r10,0 - -mbittry: lwarx r12,0,r3 ; Grab the lock value - and r0,r12,r4 ; Clear extra bits - andc r12,r12,r4 ; Clear all bits in the bit mask - or r12,r12,r6 ; Turn on the lock bits - cmplw r0,r5 ; Are these the right bits? - bne-- mbitspin ; Nope, wait for it to clear... - stwcx. r12,0,r3 ; Try to seize that there durn lock - beq++ mbitgot ; We got it, yahoo... - b mbittry ; Just start up again if the store failed... - - .align 5 -mbitspin: li r11,lgKillResv ; Point to killing field - stwcx. r11,0,r11 ; Kill it - - mr. 
r10,r10 ; Is r10 set to zero - bne++ mbitspin0 ; If yes, first spin attempt - lis r0,hi16(MASK(MSR_VEC)) ; Get vector enable - mfmsr r9 ; Get the MSR value - ori r0,r0,lo16(MASK(MSR_FP)) ; Get FP enable - ori r8,r0,lo16(MASK(MSR_EE)) ; Get EE bit on too - andc r9,r9,r0 ; Clear FP and VEC - andc r8,r9,r8 ; Clear EE as well - mtmsr r8 ; Turn off interruptions - isync ; May have turned off vectors or float here - mftb r10 ; Get the low part of the time base - b mbitsniff -mbitspin0: - mtmsr r8 ; Turn off interruptions - mftb r10 ; Get the low part of the time base -mbitsniff: - lwz r12,0(r3) ; Get that lock in here - and r0,r12,r4 ; Clear extra bits - cmplw r0,r5 ; Are these the right bits? - beq++ mbitretry ; Yeah, try for it again... - - mftb r11 ; Time stamp us now - sub r11,r11,r10 ; Get the elapsed time - cmplwi r11,128 ; Have we been spinning for 128 tb ticks? - blt++ mbitsniff ; Not yet... - - mtmsr r9 ; Say, any interrupts pending? - -; The following instructions force the pipeline to be interlocked to that only one -; instruction is issued per cycle. The insures that we stay enabled for a long enough -; time. If it is too short, pending interruptions will not have a chance to be taken - - subi r7,r7,128 ; Back off elapsed time from timeout value - or r7,r7,r7 ; Do nothing here but force a single cycle delay - mr. r7,r7 ; See if we used the whole timeout - or r7,r7,r7 ; Do nothing here but force a single cycle delay - - ble-- mbitfail ; We failed - b mbitspin0 ; Now that we have opened an enable window, keep trying... -mbitretry: - mtmsr r9 ; Enable for interruptions - li r10,1 ; Make sure this is non-zero - b mbittry - - .align 5 -mbitgot: - li r3,1 ; Set good return code - .globl EXT(hwlmlckPatch_isync) -LEXT(hwlmlckPatch_isync) - isync ; Make sure we do not use a speculativily loaded value - blr - -mbitfail: li r3,0 ; Set failure return code - blr ; Return, head hanging low... 
- -/* - * unsigned int hw_cpu_sync(unsigned int *, unsigned int timeout) - * - * Spin until word hits 0 or timeout. - * Return success (1) or failure (0). - * Attempt will fail after timeout ticks of the timebase. - * - * The theory is that a processor will bump a counter as it signals - * other processors. Then it will spin untl the counter hits 0 (or - * times out). The other processors, as it receives the signal will - * decrement the counter. - * - * The other processors use interlocked update to decrement, this one - * does not need to interlock. - */ - .align 5 - .globl EXT(hw_cpu_sync) - -LEXT(hw_cpu_sync) - - mftb r10 ; Get the low part of the time base - mr r9,r3 ; Save the sync word address - li r3,1 ; Assume we work - -csynctry: lwz r11,0(r9) ; Grab the sync value - mr. r11,r11 ; Counter hit 0? - beqlr- ; Yeah, we are sunk... - mftb r12 ; Time stamp us now - - sub r12,r12,r10 ; Get the elapsed time - cmplw r4,r12 ; Have we gone too long? - bge+ csynctry ; Not yet... - - li r3,0 ; Set failure... - blr ; Return, head hanging low... - -/* - * unsigned int hw_cpu_wcng(unsigned int *, unsigned int, unsigned int timeout) - * - * Spin until word changes or timeout. - * Return success (1) or failure (0). - * Attempt will fail after timeout ticks of the timebase. - * - * This is used to insure that a processor passes a certain point. - * An example of use is to monitor the last interrupt time in the - * per_proc block. This can be used to insure that the other processor - * has seen at least one interrupt since a specific time. - */ - .align 5 - .globl EXT(hw_cpu_wcng) - -LEXT(hw_cpu_wcng) - - mftb r10 ; Get the low part of the time base - mr r9,r3 ; Save the sync word address - li r3,1 ; Assume we work - -wcngtry: lwz r11,0(r9) ; Grab the value - cmplw r11,r4 ; Do they still match? - bnelr- ; Nope, cool... - mftb r12 ; Time stamp us now - - sub r12,r12,r10 ; Get the elapsed time - cmplw r5,r12 ; Have we gone too long? - bge+ wcngtry ; Not yet... 
- - li r3,0 ; Set failure... - blr ; Return, head hanging low... - - -/* - * unsigned int hw_lock_try(hw_lock_t) - * - * Try to acquire spin-lock. Return success (1) or failure (0) - * Returns with preemption disabled on success. - * - */ - .align 5 - .globl EXT(hw_lock_try) - -LEXT(hw_lock_try) - - lis r0,hi16(MASK(MSR_VEC)) ; Get vector enable - mfmsr r9 ; Get the MSR value - ori r0,r0,lo16(MASK(MSR_FP)) ; Get FP enable - ori r7,r0,lo16(MASK(MSR_EE)) ; Get EE bit on too - andc r9,r9,r0 ; Clear FP and VEC - andc r7,r9,r7 ; Clear EE as well - - mtmsr r7 ; Disable interruptions and thus, preemption - - lwz r5,0(r3) ; Quick load - andi. r6,r5,ILK_LOCKED ; TEST... - bne-- .L_lock_try_failed ; No go... - -.L_lock_try_loop: - lwarx r5,0,r3 ; Ld from addr of arg and reserve - - andi. r6,r5,ILK_LOCKED ; TEST... - ori r5,r5,ILK_LOCKED - bne-- .L_lock_try_failedX ; branch if taken. Predict free - - stwcx. r5,0,r3 ; And SET (if still reserved) - bne-- .L_lock_try_loop ; If set failed, loop back - - .globl EXT(hwltlckPatch_isync) -LEXT(hwltlckPatch_isync) - isync - - mfsprg r6,1 ; Get current activation - lwz r5,ACT_PREEMPT_CNT(r6) ; Get the preemption level - addi r5,r5,1 ; Bring up the disable count - stw r5,ACT_PREEMPT_CNT(r6) ; Save it back - - mtmsr r9 ; Allow interruptions now - li r3,1 ; Set that the lock was free - blr - -.L_lock_try_failedX: - li r6,lgKillResv ; Killing field - stwcx. r6,0,r6 ; Kill reservation - -.L_lock_try_failed: - mtmsr r9 ; Allow interruptions now - li r3,0 ; FAILURE - lock was taken - blr - -/* - * unsigned int hw_lock_held(hw_lock_t) - * - * Return 1 if lock is held - * Doesn't change preemption state. - * N.B. Racy, of course. - */ - .align 5 - .globl EXT(hw_lock_held) - -LEXT(hw_lock_held) - - isync ; Make sure we don't use a speculativily fetched lock - lwz r3, 0(r3) ; Get lock value - andi. 
r6,r3,ILK_LOCKED ; Extract the ILK_LOCKED bit - blr - -/* - * uint32_t hw_compare_and_store(uint32_t oldval, uint32_t newval, uint32_t *dest) - * - * Compare old to area if equal, store new, and return true - * else return false and no store - * This is an atomic operation - */ - .align 5 - .globl EXT(hw_compare_and_store) - .globl EXT(OSCompareAndSwap) - .globl EXT(OSCompareAndSwapPtr) - -LEXT(hw_compare_and_store) -LEXT(OSCompareAndSwap) -LEXT(OSCompareAndSwapPtr) - - mr r6,r3 ; Save the old value - -cstry: lwarx r9,0,r5 ; Grab the area value - li r3,1 ; Assume it works - cmplw cr0,r9,r6 ; Does it match the old value? - bne-- csfail ; No, it must have changed... - stwcx. r4,0,r5 ; Try to save the new value - bne-- cstry ; Didn't get it, try again... - .globl EXT(hwcsatomicPatch_isync) -LEXT(hwcsatomicPatch_isync) - isync ; Just hold up prefetch - blr ; Return... - -csfail: li r3,lgKillResv ; Killing field - stwcx. r3,0,r3 ; Blow reservation - - li r3,0 ; Set failure - blr ; Better luck next time... - - -/* - * uint32_t hw_atomic_add(uint32_t *dest, uint32_t delt) - * - * Atomically add the second parameter to the first. - * Returns the result. - * - */ - .align 5 - .globl EXT(hw_atomic_add) - -LEXT(hw_atomic_add) - - mr r6,r3 ; Save the area - -addtry: lwarx r3,0,r6 ; Grab the area value - add r3,r3,r4 ; Add the value - stwcx. r3,0,r6 ; Try to save the new value - bne-- addtry ; Didn't get it, try again... - blr ; Return... - - -/* - * uint32_t hw_atomic_sub(uint32_t *dest, uint32_t delt) - * - * Atomically subtract the second parameter from the first. - * Returns the result. - * - */ - .align 5 - .globl EXT(hw_atomic_sub) - -LEXT(hw_atomic_sub) - - mr r6,r3 ; Save the area - -subtry: lwarx r3,0,r6 ; Grab the area value - sub r3,r3,r4 ; Subtract the value - stwcx. r3,0,r6 ; Try to save the new value - bne-- subtry ; Didn't get it, try again... - blr ; Return... 
- - -/* - * uint32_t hw_atomic_or(uint32_t *dest, uint32_t mask) - * - * Atomically ORs the second parameter into the first. - * Returns the result. - */ - .align 5 - .globl EXT(hw_atomic_or) -LEXT(hw_atomic_or) - .globl EXT(hw_atomic_or_noret) -LEXT(hw_atomic_or_noret) - mr r6,r3 ; Save the area - -ortry: lwarx r3,0,r6 ; Grab the area value - or r3,r3,r4 ; OR the value - stwcx. r3,0,r6 ; Try to save the new value - bne-- ortry ; Did not get it, try again... - blr ; Return... - - -/* - * uint32_t hw_atomic_and(uint32_t *dest, uint32_t mask) - * - * Atomically ANDs the second parameter with the first. - * Returns the result. - * - */ - .align 5 - .globl EXT(hw_atomic_and) -LEXT(hw_atomic_and) - .globl EXT(hw_atomic_and_noret) -LEXT(hw_atomic_and_noret) - mr r6,r3 ; Save the area - -andtry: lwarx r3,0,r6 ; Grab the area value - and r3,r3,r4 ; AND the value - stwcx. r3,0,r6 ; Try to save the new value - bne-- andtry ; Did not get it, try again... - blr ; Return... - - -/* - * void hw_queue_atomic(unsigned int * anchor, unsigned int * elem, unsigned int disp) - * - * Atomically inserts the element at the head of the list - * anchor is the pointer to the first element - * element is the pointer to the element to insert - * disp is the displacement into the element to the chain pointer - */ - .align 5 - .globl EXT(hw_queue_atomic) - .globl EXT(OSEnqueueAtomic) - -LEXT(hw_queue_atomic) -LEXT(OSEnqueueAtomic) - - mr r7,r4 ; Make end point the same as start - mr r8,r5 ; Copy the displacement also - b hw_queue_comm ; Join common code... 
- -/* - * void hw_queue_atomic_list(unsigned int * anchor, unsigned int * first, unsigned int * last, unsigned int disp) - * - * Atomically inserts the list of elements at the head of the list - * anchor is the pointer to the first element - * first is the pointer to the first element to insert - * last is the pointer to the last element to insert - * disp is the displacement into the element to the chain pointer - */ - .align 5 - .globl EXT(hw_queue_atomic_list) - -LEXT(hw_queue_atomic_list) - - mr r7,r5 ; Make end point the same as start - mr r8,r6 ; Copy the displacement also - -hw_queue_comm: - lwarx r9,0,r3 ; Pick up the anchor - stwx r9,r8,r7 ; Chain that to the end of the new stuff - eieio ; Make sure this store makes it before the anchor update - stwcx. r4,0,r3 ; Try to chain into the front - bne-- hw_queue_comm ; Didn't make it, try again... - - blr ; Return... - -/* - * unsigned int *hw_dequeue_atomic(unsigned int *anchor, unsigned int disp) - * - * Atomically removes the first element in a list and returns it. - * anchor is the pointer to the first element - * disp is the displacement into the element to the chain pointer - * Returns element if found, 0 if empty. - */ - .align 5 - .globl EXT(hw_dequeue_atomic) - .globl EXT(OSDequeueAtomic) - -LEXT(hw_dequeue_atomic) -LEXT(OSDequeueAtomic) - - mr r5,r3 ; Save the anchor - -hw_dequeue_comm: - lwarx r3,0,r5 ; Pick up the anchor - mr. r3,r3 ; Is the list empty? - beq-- hdcFail ; Leave it list empty... - lwzx r9,r4,r3 ; Get the next in line - stwcx. r9,0,r5 ; Try to chain into the front - beqlr++ ; Got the thing, go away with it... - b hw_dequeue_comm ; Did not make it, try again... - -hdcFail: li r4,lgKillResv ; Killing field - stwcx. r4,0,r4 ; Dump reservation - blr ; Leave... - - -/* - * Routines for mutex lock debugging. 
- */ - -/* - * Gets lock check flags in CR6: CR bits 24-27 - */ -#define CHECK_SETUP(rg) \ - lbz rg,lglcksWork(0) __ASMNL__ \ - mtcrf 2,rg __ASMNL__ - - -/* - * Checks for expected lock type. - */ -#define CHECK_MUTEX_TYPE() \ - bf MUTEX_ATTR_DEBUGb,1f __ASMNL__ \ - bt 24+disLktypeb,1f __ASMNL__ \ - lwz r10,MUTEX_TYPE(r3) __ASMNL__ \ - cmpwi r10,MUTEX_TAG __ASMNL__ \ - beq++ 1f __ASMNL__ \ - PROLOG(0) __ASMNL__ \ - mr r4,r11 __ASMNL__ \ - mr r5,r10 __ASMNL__ \ - lis r3,hi16(not_a_mutex) __ASMNL__ \ - ori r3,r3,lo16(not_a_mutex) __ASMNL__ \ - bl EXT(panic) __ASMNL__ \ - BREAKPOINT_TRAP __ASMNL__ \ -1: - - .data -not_a_mutex: - STRINGD "mutex (0x%08X) not a mutex type (0x%08X)\n\000" - .text - -/* - * Verifies return to the correct thread in "unlock" situations. - */ -#define CHECK_THREAD(thread_offset) \ - bf MUTEX_ATTR_DEBUGb,3f __ASMNL__ \ - bt 24+disLkThreadb,3f __ASMNL__ \ - mfsprg r10,1 __ASMNL__ \ - lwz r5,MUTEX_DATA(r3) __ASMNL__ \ - rlwinm. r9,r5,0,0,29 __ASMNL__ \ - bne++ 1f __ASMNL__ \ - lis r3,hi16(not_held) __ASMNL__ \ - ori r3,r3,lo16(not_held) __ASMNL__ \ - b 2f __ASMNL__ \ -1: __ASMNL__ \ - cmpw r9,r10 __ASMNL__ \ - beq++ 3f __ASMNL__ \ - mr r5,r10 __ASMNL__ \ - mr r6,r9 __ASMNL__ \ - lis r3,hi16(wrong_thread) __ASMNL__ \ - ori r3,r3,lo16(wrong_thread) __ASMNL__ \ -2: __ASMNL__ \ - mr r4,r11 __ASMNL__ \ - PROLOG(0) __ASMNL__ \ - bl EXT(panic) __ASMNL__ \ - BREAKPOINT_TRAP __ASMNL__ \ -3: - - .data -not_held: - STRINGD "mutex (0x%08X) not held\n\000" -wrong_thread: - STRINGD "mutex (0x%08X) unlocked by non-owner(0x%08X), current owner(0x%08X)\n\000" - .text - -#define CHECK_MYLOCK() \ - bf MUTEX_ATTR_DEBUGb,1f __ASMNL__ \ - bt 24+disLkMyLckb,1f __ASMNL__ \ - mfsprg r10,1 __ASMNL__ \ - lwz r9,MUTEX_DATA(r3) __ASMNL__ \ - rlwinm r9,r9,0,0,29 __ASMNL__ \ - cmpw r9,r10 __ASMNL__ \ - bne++ 1f __ASMNL__ \ - mr r4,r11 __ASMNL__ \ - lis r3, hi16(mylock_attempt) __ASMNL__ \ - ori r3,r3,lo16(mylock_attempt) __ASMNL__ \ - bl EXT(panic) __ASMNL__ \ - 
BREAKPOINT_TRAP __ASMNL__ \ -1: - - .data -mylock_attempt: - STRINGD "mutex (0x%08X) recursive lock attempt\n\000" - .text - -#define LCK_STACK(lck, stack, lck_stack, frame_cnt, lr_save, tmp) \ - bf 24+enaLkExtStckb,3f __ASMNL__ \ - addi lck_stack,lck,MUTEX_STACK __ASMNL__ \ - li frame_cnt,MUTEX_FRAMES-1 __ASMNL__ \ -1: __ASMNL__ \ - mr tmp,stack __ASMNL__ \ - lwz stack,0(stack) __ASMNL__ \ - xor tmp,stack,tmp __ASMNL__ \ - cmplwi tmp,8192 __ASMNL__ \ - bge-- 2f __ASMNL__ \ - lwz lr_save,FM_LR_SAVE(stack) __ASMNL__ \ - stwu lr_save,4(lck_stack) __ASMNL__ \ - subi frame_cnt,frame_cnt,1 __ASMNL__ \ - cmpi cr0,frame_cnt,0 __ASMNL__ \ - bne 1b __ASMNL__ \ - b 3f __ASMNL__ \ -2: __ASMNL__ \ - li tmp,0 __ASMNL__ \ - stwu tmp,4(lck_stack) __ASMNL__ \ - subi frame_cnt,frame_cnt,1 __ASMNL__ \ - cmpi cr0,frame_cnt,0 __ASMNL__ \ - bne 2b __ASMNL__ \ -3: - - .align 5 - mr r11,r3 ; Save lock addr -mlckeEnter: - lwz r0,MUTEX_ATTR(r3) - mtcrf 1,r0 ; Set cr7 - CHECK_SETUP(r12) - CHECK_MUTEX_TYPE() - - bf MUTEX_ATTR_DEBUGb,L_mtx_lock_assert_wait_2 - PROLOG(0) - bl EXT(assert_wait_possible) - mr. r3,r3 - bne L_mtx_lock_assert_wait_1 - lis r3,hi16(L_mtx_lock_assert_wait_panic_str) - ori r3,r3,lo16(L_mtx_lock_assert_wait_panic_str) - bl EXT(panic) - BREAKPOINT_TRAP ; We die here anyway - - .data -L_mtx_lock_assert_wait_panic_str: - STRINGD "mutex lock attempt with assert_wait_possible false\n\000" - .text - -L_mtx_lock_assert_wait_1: - lwz r3,FM_ARG0(r1) - lwz r11,FM_ARG0+0x04(r1) - lwz r2,(FM_ALIGN(0)+FM_SIZE+FM_CR_SAVE)(r1) - mtcr r2 - EPILOG -L_mtx_lock_assert_wait_2: - - mfsprg r6,1 ; load the current thread - bf MUTEX_ATTR_STATb,mlckestatskip ; Branch if no stat - lwz r5,MUTEX_GRP(r3) ; Load lock group - li r7,GRP_MTX_STAT_UTIL+4 ; Set stat util offset -mlckestatloop: - lwarx r8,r7,r5 ; Load stat util cnt - addi r8,r8,1 ; Increment stat util cnt - stwcx. r8,r7,r5 ; Store stat util cnt - bne-- mlckestatloop ; Retry if failed - mr. 
r8,r8 ; Test for zero - bne++ mlckestatskip ; Did stat util cnt wrapped? - lwz r8,GRP_MTX_STAT_UTIL(r5) ; Load upper stat util cnt - addi r8,r8,1 ; Increment upper stat util cnt - stw r8,GRP_MTX_STAT_UTIL(r5) ; Store upper stat util cnt -mlckestatskip: - lwz r5,MUTEX_DATA(r3) ; Get the lock quickly - li r4,0 - li r8,0 - lis r0,hi16(MASK(MSR_VEC)) ; Get vector enable - mfmsr r9 ; Get the MSR value - ori r0,r0,lo16(MASK(MSR_FP)) ; Get FP enable - ori r7,r0,lo16(MASK(MSR_EE)) ; Get EE bit on too - andc r9,r9,r0 ; Clear FP and VEC - andc r7,r9,r7 ; Clear EE as well - mtmsr r7 ; Turn off interruptions - isync ; May have turned off vec and fp here - mr. r5,r5 ; Quick check - bne-- mlckespin01 ; Can not get it right now... - -mlcketry: - lwarx r5,MUTEX_DATA,r3 ; load the mutex lock - mr. r5,r5 - bne-- mlckespin0 ; Can not get it right now... - stwcx. r6,MUTEX_DATA,r3 ; grab the lock - bne-- mlcketry ; loop back if failed - .globl EXT(mlckePatch_isync) -LEXT(mlckePatch_isync) - isync ; stop prefeteching - mflr r12 - bf MUTEX_ATTR_DEBUGb,mlckedebskip - mr r8,r6 ; Get the active thread - stw r12,MUTEX_STACK(r3) ; Save our caller - stw r8,MUTEX_THREAD(r3) ; Set the mutex's holding thread - mr r5,r1 - LCK_STACK(r3,r5,r6,r7,r8,r10) -mlckedebskip: - mtmsr r9 ; Say, any interrupts pending? - blr - -mlckespin0: - li r5,lgKillResv ; Killing field - stwcx. r5,0,r5 ; Kill reservation -mlckespin01: - mflr r12 - mtmsr r9 ; Say, any interrupts pending? - bl mlckspin1 - mtmsr r7 ; Turn off interruptions, vec and fp off already - mtlr r12 - b mlcketry - -/* - * void lck_mtx_lock(lck_mtx_t*) - * - */ - .align 5 - .globl EXT(lck_mtx_lock) -LEXT(lck_mtx_lock) - - mfsprg r6,1 ; load the current thread - lwz r5,MUTEX_DATA(r3) ; Get the lock quickly - mr r11,r3 ; Save lock addr - li r4,0 - li r8,0 - li r9,0 - mr. r5,r5 ; Quick check - bne-- mlckspin00 ; Indirect or Can not get it right now... - -mlcktry: - lwarx r5,MUTEX_DATA,r3 ; load the mutex lock - mr. 
r5,r5 - bne-- mlckspin01 ; Can not get it right now... - stwcx. r6,MUTEX_DATA,r3 ; grab the lock - bne-- mlcktry ; loop back if failed - .globl EXT(mlckPatch_isync) -LEXT(mlckPatch_isync) - isync ; stop prefeteching - blr -; Need to debug making blr above a patch point and record: -; LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE) - -mlckspin00: - cmpli cr0,r5,MUTEX_IND ; Is it a mutex indirect - bne-- mlckspin02 ; No, go handle contention - lwz r3,MUTEX_PTR(r3) ; load mutex ext pointer - b mlckeEnter -mlckspin01: - li r5,lgKillResv ; Killing field - stwcx. r5,0,r5 ; Kill reservation -mlckspin02: - mflr r12 - li r0,0 - mtcrf 1,r0 ; Set cr7 to zero - bl mlckspin1 - mtlr r12 - b mlcktry - - -mlckspin1: - mr. r4,r4 ; Test timeout value - bne++ mlckspin2 - lis r4,hi16(EXT(MutexSpin)) ; Get the high part - ori r4,r4,lo16(EXT(MutexSpin) ) ; And the low part - lwz r4,0(r4) ; Get spin timerout value - mr. r4,r4 ; Test spin timeout value - bne++ mlckspin2 ; Is spin timeout requested - crclr mlckmiss ; Clear miss test - b mlckslow1 ; Don't try to spin - -mlckspin2: mr. r8,r8 ; Is r8 set to zero - bne++ mlckspin3 ; If yes, first spin attempt - crclr mlckmiss ; Clear miss test - mr. r9,r9 ; Is r9 set to zero - bne++ mlckspin3 ; If yes, r9 set with msr value - lis r0,hi16(MASK(MSR_VEC)) ; Get vector enable - mfmsr r9 ; Get the MSR value - ori r0,r0,lo16(MASK(MSR_FP)) ; Get FP enable - ori r7,r0,lo16(MASK(MSR_EE)) ; Get EE bit on too - andc r9,r9,r0 ; Clear FP and VEC - andc r7,r9,r7 ; Clear EE as well - mtmsr r7 ; Turn off interruptions - isync ; May have turned off vec and fp here - mftb r8 ; Get timestamp on entry - b mlcksniff - -mlckspin3: mtmsr r7 ; Turn off interruptions - mftb r8 ; Get timestamp on entry - -mlcksniff: lwz r5,MUTEX_DATA(r3) ; Get that lock in here - mr. r5,r5 ; Is the lock held - beq++ mlckretry ; No, try for it again... - rlwinm. 
r10,r5,0,0,29 ; Extract the lock owner - beq++ mlckslow0 ; InterLock is held - bf MUTEX_ATTR_STATb,mlStatSkip ; Branch if no stat - andi. r5,r5,ILK_LOCKED ; extract interlocked? - bne mlStatSkip ; yes, skip - bt mlckmiss,mlStatSkip ; miss already counted - crset mlckmiss ; Remember miss recorded - lwz r5,MUTEX_GRP(r3) ; Load lock group - addi r5,r5,GRP_MTX_STAT_MISS+4 ; Add stat miss offset -mlStatLoop: - lwarx r6,0,r5 ; Load stat miss cnt - addi r6,r6,1 ; Increment stat miss cnt - stwcx. r6,0,r5 ; Update stat miss cnt - bne-- mlStatLoop ; Retry if failed - mfsprg r6,1 ; Reload current thread -mlStatSkip: - lwz r2,ACT_MACT_SPF(r10) ; Get the special flags - rlwinm. r2,r2,0,OnProcbit,OnProcbit ; Is OnProcbit set? - beq mlckslow0 ; Lock owner isn't running - lis r2,hi16(TH_IDLE) ; Get thread idle state - ori r2,r2,lo16(TH_IDLE) ; Get thread idle state - lwz r10,THREAD_STATE(r10) ; Get the thread state - and. r10,r10,r2 ; Is idle set? - bne mlckslow0 ; Lock owner is idling - - mftb r10 ; Time stamp us now - sub r10,r10,r8 ; Get the elapsed time - cmplwi r10,128 ; Have we been spinning for 128 tb ticks? - blt++ mlcksniff ; Not yet... - - mtmsr r9 ; Say, any interrupts pending? - -; The following instructions force the pipeline to be interlocked to that only one -; instruction is issued per cycle. The insures that we stay enabled for a long enough -; time; if it's too short, pending interruptions will not have a chance to be taken - - subi r4,r4,128 ; Back off elapsed time from timeout value - or r4,r4,r4 ; Do nothing here but force a single cycle delay - mr. r4,r4 ; See if we used the whole timeout - or r4,r4,r4 ; Do nothing here but force a single cycle delay - - ble-- mlckslow1 ; We failed - b mlckspin3 ; Now that we've opened an enable window, keep trying... 
-mlckretry: - mtmsr r9 ; Restore interrupt state - li r8,1 ; Show already through once - blr - -mlckslow0: ; We couldn't get the lock - mtmsr r9 ; Restore interrupt state - -mlckslow1: - mtlr r12 - - PROLOG(0) -.L_ml_retry: - bl lockDisa ; Go get a lock on the mutex's interlock lock - mr. r4,r3 ; Did we get it? - lwz r3,FM_ARG0(r1) ; Restore the lock address - bne++ mlGotInt ; We got it just fine... - mr r4,r11 ; Saved lock addr - lis r3,hi16(mutex_failed1) ; Get the failed mutex message - ori r3,r3,lo16(mutex_failed1) ; Get the failed mutex message - bl EXT(panic) ; Call panic - BREAKPOINT_TRAP ; We die here anyway, can not get the lock - - .data -mutex_failed1: - STRINGD "attempt to interlock mutex (0x%08X) failed on mutex lock\n\000" - .text - -mlGotInt: - -; Note that there is no reason to do a load and reserve here. We already -; hold the interlock lock and no one can touch this field unless they -; have that, so, we're free to play - - lwz r4,MUTEX_DATA(r3) ; Get the mutex's lock field - rlwinm. r9,r4,30,2,31 ; So, can we have it? - bne- mlInUse ; Nope, sombody's playing already... - - bf++ MUTEX_ATTR_DEBUGb,mlDebSkip - CHECK_SETUP(r5) - mfsprg r9,1 ; Get the current activation - lwz r5,0(r1) ; Get previous save frame - lwz r6,FM_LR_SAVE(r5) ; Get our caller's address - mr r8,r9 ; Get the active thread - stw r6,MUTEX_STACK(r3) ; Save our caller - stw r8,MUTEX_THREAD(r3) ; Set the mutex's holding thread - LCK_STACK(r3,r5,r6,r7,r8,r10) -mlDebSkip: - mr r3,r11 ; Get the based lock address - bl EXT(lck_mtx_lock_acquire) - lwz r2,(FM_ALIGN(0)+FM_SIZE+FM_CR_SAVE)(r1) - mfsprg r5,1 - mtcr r2 - mr. r4,r3 - lwz r3,FM_ARG0(r1) ; restore r3 (saved in prolog) - lwz r11,FM_ARG0+0x04(r1) ; restore r11 (saved in prolog) - beq mlUnlock - ori r5,r5,WAIT_FLAG - -mlUnlock: eieio - stw r5,MUTEX_DATA(r3) ; grab the mutexlock and free the interlock - - EPILOG ; Restore all saved registers - b epStart ; Go enable preemption... - -; We come to here when we have a resource conflict. 
In other words, -; the mutex is held. - -mlInUse: - - CHECK_SETUP(r12) - CHECK_MYLOCK() ; Assert we don't own the lock already */ - -; Note that we come in here with the interlock set. The wait routine -; will unlock it before waiting. - - bf MUTEX_ATTR_STATb,mlStatSkip2 ; Branch if no stat - lwz r5,MUTEX_GRP(r3) ; Load lck group - bt mlckmiss,mlStatSkip1 ; Skip miss already counted - crset mlckmiss ; Remember miss recorded - li r9,GRP_MTX_STAT_MISS+4 ; Get stat miss offset -mlStatLoop1: - lwarx r8,r9,r5 ; Load stat miss cnt - addi r8,r8,1 ; Increment stat miss cnt - stwcx. r8,r9,r5 ; Store stat miss cnt - bne-- mlStatLoop1 ; Retry if failed -mlStatSkip1: - lwz r9,GRP_MTX_STAT_WAIT+4(r5) ; Load wait cnt - addi r9,r9,1 ; Increment wait cnt - stw r9,GRP_MTX_STAT_WAIT+4(r5) ; Update miss cnt -mlStatSkip2: - ori r4,r4,WAIT_FLAG ; Set the wait flag - stw r4,MUTEX_DATA(r3) - rlwinm r4,r4,0,0,29 ; Extract the lock owner - mfcr r2 - stw r2,(FM_ALIGN(0)+FM_SIZE+FM_CR_SAVE)(r1) - mr r3,r11 ; Get the based lock address - bl EXT(lck_mtx_lock_wait) ; Wait for our turn at the lock - - lwz r3,FM_ARG0(r1) ; restore r3 (saved in prolog) - lwz r11,FM_ARG0+0x04(r1) ; restore r11 (saved in prolog) - lwz r2,(FM_ALIGN(0)+FM_SIZE+FM_CR_SAVE)(r1) - mtcr r2 - b .L_ml_retry ; and try again... - - -/* - * void lck_mtx_try_lock(_extlck_mtx_ext_t*) - * - */ - .align 5 - .globl EXT(lck_mtx_try_lock_ext) -LEXT(lck_mtx_try_lock_ext) - mr r11,r3 ; Save lock addr -mlteEnter: - lwz r0,MUTEX_ATTR(r3) - mtcrf 1,r0 ; Set cr7 - CHECK_SETUP(r12) - CHECK_MUTEX_TYPE() - - bf MUTEX_ATTR_STATb,mlteStatSkip ; Branch if no stat - lwz r5,MUTEX_GRP(r3) ; Load lock group - li r7,GRP_MTX_STAT_UTIL+4 ; Set stat util offset -mlteStatLoop: - lwarx r8,r7,r5 ; Load stat util cnt - addi r8,r8,1 ; Increment stat util cnt - stwcx. r8,r7,r5 ; Store stat util cnt - bne-- mlteStatLoop ; Retry if failed - mr. r8,r8 ; Test for zero - bne++ mlteStatSkip ; Did stat util cnt wrapped? 
- lwz r8,GRP_MTX_STAT_UTIL(r5) ; Load upper stat util cnt - addi r8,r8,1 ; Increment upper stat util cnt - stw r8,GRP_MTX_STAT_UTIL(r5) ; Store upper stat util cnt -mlteStatSkip: - mfsprg r6,1 ; load the current thread - lwz r5,MUTEX_DATA(r3) ; Get the lock value - mr. r5,r5 ; Quick check - bne-- L_mtx_try_slow ; Can not get it now... - mfmsr r9 ; Get the MSR value - lis r0,hi16(MASK(MSR_VEC)) ; Get vector enable - ori r0,r0,lo16(MASK(MSR_FP)) ; Get FP enable - ori r7,r0,lo16(MASK(MSR_EE)) ; Get EE bit on too - andc r9,r9,r0 ; Clear FP and VEC - andc r7,r9,r7 ; Clear EE as well - mtmsr r7 ; Turn off interruptions - isync ; May have turned off vec and fp here - -mlteLoopTry: - lwarx r5,MUTEX_DATA,r3 ; load the lock value - mr. r5,r5 - bne-- mlteSlowX ; branch to the slow path - stwcx. r6,MUTEX_DATA,r3 ; grab the lock - bne-- mlteLoopTry ; retry if failed - .globl EXT(mltelckPatch_isync) -LEXT(mltelckPatch_isync) - isync ; stop prefetching - mflr r12 - bf MUTEX_ATTR_DEBUGb,mlteDebSkip - mr r8,r6 ; Get the active thread - stw r12,MUTEX_STACK(r3) ; Save our caller - stw r8,MUTEX_THREAD(r3) ; Set the mutex's holding thread - mr r5,r1 - LCK_STACK(r3,r5,r6,r7,r8,r10) -mlteDebSkip: - li r3, 1 - mtmsr r9 ; Say, any interrupts pending? - blr -mlteSlowX: - li r5,lgKillResv ; Killing field - stwcx. r5,0,r5 ; Kill reservation - mtmsr r9 ; Say, any interrupts pending? - b L_mtx_try_slow - - -/* - * void lck_mtx_try_lock(lck_mtx_t*) - * - */ - .align 5 - .globl EXT(lck_mtx_try_lock) -LEXT(lck_mtx_try_lock) - - mfsprg r6,1 ; load the current thread - lwz r5,MUTEX_DATA(r3) ; Get the lock value - mr r11,r3 ; Save lock addr - mr. r5,r5 ; Quick check - bne-- mltSlow00 ; Indirect or Can not get it now... - -mltLoopTry: - lwarx r5,MUTEX_DATA,r3 ; load the lock value - mr. r5,r5 - bne-- mltSlow01 ; branch to the slow path - stwcx. 
r6,MUTEX_DATA,r3 ; grab the lock - bne-- mltLoopTry ; retry if failed - .globl EXT(mltlckPatch_isync) -LEXT(mltlckPatch_isync) - isync ; stop prefetching - li r3, 1 - blr - -mltSlow00: - cmpli cr0,r5,MUTEX_IND ; Is it a mutex indirect - bne-- mltSlow02 ; No, go handle contention - lwz r3,MUTEX_PTR(r3) ; load mutex ext pointer - b mlteEnter -mltSlow01: - li r5,lgKillResv ; Killing field - stwcx. r5,0,r5 ; Kill reservation - -mltSlow02: - li r0,0 - mtcrf 1,r0 ; Set cr7 to zero - -L_mtx_try_slow: - PROLOG(0) - - lwz r6,MUTEX_DATA(r3) ; Quick check - rlwinm. r6,r6,30,2,31 ; to see if someone has this lock already - bne- mtFail ; Someone's got it already... - - bl lockDisa ; Go get a lock on the mutex's interlock lock - mr. r4,r3 ; Did we get it? - lwz r3,FM_ARG0(r1) ; Restore the lock address - bne++ mtGotInt ; We got it just fine... - mr r4,r11 ; Saved lock addr - lis r3,hi16(mutex_failed2) ; Get the failed mutex message - ori r3,r3,lo16(mutex_failed2) ; Get the failed mutex message - bl EXT(panic) ; Call panic - BREAKPOINT_TRAP ; We die here anyway, can not get the lock - - .data -mutex_failed2: - STRINGD "attempt to interlock mutex (0x%08X) failed on mutex lock try\n\000" - .text - -mtGotInt: - -; Note that there is no reason to do a load and reserve here. We already -; hold the interlock and no one can touch at this field unless they -; have that, so, we're free to play - - lwz r4,MUTEX_DATA(r3) ; Get the mutex's lock field - rlwinm. r9,r4,30,2,31 ; So, can we have it? - bne- mtInUse ; Nope, sombody's playing already... 
- - bf++ MUTEX_ATTR_DEBUGb,mtDebSkip - CHECK_SETUP(r5) - mfsprg r9,1 ; Get the current activation - lwz r5,0(r1) ; Get previous save frame - lwz r6,FM_LR_SAVE(r5) ; Get our caller's address - mr r8,r9 ; Get the active thread - stw r6,MUTEX_STACK(r3) ; Save our caller - stw r8,MUTEX_THREAD(r3) ; Set the mutex's holding thread - LCK_STACK(r3,r5,r6,r7,r8,r10) -mtDebSkip: - mr r3,r11 ; Get the based lock address - bl EXT(lck_mtx_lock_acquire) - mfsprg r5,1 - mr. r4,r3 - lwz r3,FM_ARG0(r1) ; restore r3 (saved in prolog) - lwz r11,FM_ARG0+0x04(r1) ; restore r11 (saved in prolog) - beq mtUnlock - ori r5,r5,WAIT_FLAG - -mtUnlock: eieio - stw r5,MUTEX_DATA(r3) ; grab the mutexlock and free the interlock - - bl epStart ; Go enable preemption... - - li r3, 1 - EPILOG ; Restore all saved registers - blr ; Return... - -; We come to here when we have a resource conflict. In other words, -; the mutex is held. - -mtInUse: - bf++ MUTEX_ATTR_STATb,mtStatSkip ; Branch if no stat - lwz r5,MUTEX_GRP(r3) ; Load lock group - li r9,GRP_MTX_STAT_MISS+4 ; Get stat miss offset -mtStatLoop: - lwarx r8,r9,r5 ; Load stat miss cnt - addi r8,r8,1 ; Increment stat miss cnt - stwcx. r8,r9,r5 ; Store stat miss cnt - bne-- mtStatLoop ; Retry if failed -mtStatSkip: - rlwinm r4,r4,0,0,30 ; Get the unlock value - stw r4,MUTEX_DATA(r3) ; free the interlock - bl epStart ; Go enable preemption... - -mtFail: li r3,0 ; Set failure code - EPILOG ; Restore all saved registers - blr ; Return... - - - -/* - * void lck_mtx_ext_unlock(lck_mtx_ext_t* l) - * - */ - .align 5 - .globl EXT(lck_mtx_ext_unlock) -LEXT(lck_mtx_ext_unlock) -mlueEnter: - .globl EXT(mulckePatch_isync) -LEXT(mulckePatch_isync) - isync - .globl EXT(mulckePatch_eieio) -LEXT(mulckePatch_eieio) - eieio - mr r11,r3 ; Save lock addr -mlueEnter1: - lwz r0,MUTEX_ATTR(r3) - mtcrf 1,r0 ; Set cr7 - CHECK_SETUP(r12) - CHECK_MUTEX_TYPE() - CHECK_THREAD(MUTEX_THREAD) - - lwz r5,MUTEX_DATA(r3) ; Get the lock - rlwinm. 
r4,r5,0,30,31 ; Quick check - bne-- L_mtx_unlock_slow ; Can not get it now... - mfmsr r9 ; Get the MSR value - lis r0,hi16(MASK(MSR_VEC)) ; Get vector enable - ori r0,r0,lo16(MASK(MSR_FP)) ; Get FP enable - ori r7,r0,lo16(MASK(MSR_EE)) ; Get EE bit on too - andc r9,r9,r0 ; Clear FP and VEC - andc r7,r9,r7 ; Clear EE as well - mtmsr r7 ; Turn off interruptions - isync ; May have turned off vec and fp here - -mlueLoop: - lwarx r5,MUTEX_DATA,r3 - rlwinm. r4,r5,0,30,31 ; Bail if pending waiter or interlock set - li r5,0 ; Clear the mutexlock - bne-- mlueSlowX - stwcx. r5,MUTEX_DATA,r3 - bne-- mlueLoop - mtmsr r9 ; Say, any interrupts pending? - blr - -mlueSlowX: - li r5,lgKillResv ; Killing field - stwcx. r5,0,r5 ; Dump reservation - mtmsr r9 ; Say, any interrupts pending? - b L_mtx_unlock_slow ; Join slow path... - -/* - * void lck_mtx_unlock(lck_mtx_t* l) - * - */ - .align 5 - .globl EXT(lck_mtx_unlock) -LEXT(lck_mtx_unlock) -mluEnter: - .globl EXT(mulckPatch_isync) -LEXT(mulckPatch_isync) - isync - .globl EXT(mulckPatch_eieio) -LEXT(mulckPatch_eieio) - eieio - mr r11,r3 ; Save lock addr -mluEnter1: - lwz r5,MUTEX_DATA(r3) ; Get the lock - rlwinm. r4,r5,0,30,31 ; Quick check - bne-- mluSlow0 ; Indirect or Can not get it now... - -mluLoop: - lwarx r5,MUTEX_DATA,r3 - rlwinm. r4,r5,0,30,31 ; Bail if pending waiter or interlock set - li r5,0 ; Clear the mutexlock - bne-- mluSlowX - stwcx. r5,MUTEX_DATA,r3 - bne-- mluLoop -#if CONFIG_DTRACE -/* lock released - LS_LCK_MTX_UNLOCK_RELEASE */ - LOCKSTAT_LABEL(_lck_mtx_unlock_lockstat_patch_point) - blr - - LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE) -#endif - blr - - -mluSlow0: - cmpli cr0,r5,MUTEX_IND ; Is it a mutex indirect - bne-- L_mtx_unlock_slow ; No, go handle contention - lwz r3,MUTEX_PTR(r3) ; load mutex ext pointer - b mlueEnter1 -mluSlowX: - li r5,lgKillResv ; Killing field - stwcx. r5,0,r5 ; Dump reservation - -L_mtx_unlock_slow: - - PROLOG(0) - - bl lockDisa ; Go get a lock on the mutex's interlock lock - mr. 
r4,r3 ; Did we get it? - lwz r3,FM_ARG0(r1) ; Restore the lock address - bne++ muGotInt ; We got it just fine... - mr r4,r11 ; Saved lock addr - lis r3,hi16(mutex_failed3) ; Get the failed mutex message - ori r3,r3,lo16(mutex_failed3) ; Get the failed mutex message - bl EXT(panic) ; Call panic - BREAKPOINT_TRAP ; We die here anyway, can not get the lock - - .data -mutex_failed3: - STRINGD "attempt to interlock mutex (0x%08X) failed on mutex unlock\n\000" - .text - - -muGotInt: - lwz r4,MUTEX_DATA(r3) - andi. r5,r4,WAIT_FLAG ; are there any waiters ? - rlwinm r4,r4,0,0,29 - beq+ muUnlock ; Nope, we're done... - - mr r3,r11 ; Get the based lock address - bl EXT(lck_mtx_unlock_wakeup) ; yes, wake a thread - lwz r3,FM_ARG0(r1) ; restore r3 (saved in prolog) - lwz r11,FM_ARG0+0x04(r1) ; restore r11 (saved in prolog) - lwz r5,MUTEX_DATA(r3) ; load the lock - -muUnlock: - andi. r5,r5,WAIT_FLAG ; Get the unlock value - eieio - stw r5,MUTEX_DATA(r3) ; unlock the interlock and lock - - EPILOG ; Deal with the stack now, enable_preemption doesn't always want one - b epStart ; Go enable preemption... 
- -/* - * void lck_mtx_assert(lck_mtx_t* l, unsigned int) - * - */ - .align 5 - .globl EXT(lck_mtx_assert) -LEXT(lck_mtx_assert) - mr r11,r3 -maEnter: - lwz r5,MUTEX_DATA(r3) - cmpli cr0,r5,MUTEX_IND ; Is it a mutex indirect - bne-- maCheck ; No, go check the assertion - lwz r3,MUTEX_PTR(r3) ; load mutex ext pointer - b maEnter -maCheck: - mfsprg r6,1 ; load the current thread - rlwinm r5,r5,0,0,29 ; Extract the lock owner - cmpwi r4,MUTEX_ASSERT_OWNED - cmplw cr1,r6,r5 ; Is the lock held by current act - crandc cr0_eq,cr0_eq,cr1_eq ; Check owned assertion - bne-- maNext - mr r4,r11 - lis r3,hi16(mutex_assert1) ; Get the failed mutex message - ori r3,r3,lo16(mutex_assert1) ; Get the failed mutex message - b maPanic ; Panic path -maNext: - cmpwi r4,MUTEX_ASSERT_NOTOWNED ; Check not owned assertion - crand cr0_eq,cr0_eq,cr1_eq ; - bnelr++ -maPanic: - PROLOG(0) - mr r4,r11 - lis r3,hi16(mutex_assert2) ; Get the failed mutex message - ori r3,r3,lo16(mutex_assert2) ; Get the failed mutex message - bl EXT(panic) ; Call panic - BREAKPOINT_TRAP ; We die here anyway - - .data -mutex_assert1: - STRINGD "mutex (0x%08X) not owned\n\000" -mutex_assert2: - STRINGD "mutex (0x%08X) owned\n\000" - .text - - -/* - * void lck_mtx_ilk_unlock(lck_mtx *lock) - */ - .globl EXT(lck_mtx_ilk_unlock) -LEXT(lck_mtx_ilk_unlock) - - lwz r10,MUTEX_DATA(r3) - rlwinm r10,r10,0,0,30 - eieio - stw r10,MUTEX_DATA(r3) - - b epStart ; Go enable preemption... - -/* - * void _enable_preemption_no_check(void) - * - * This version does not check if we get preempted or not - */ - .align 4 - .globl EXT(_enable_preemption_no_check) - -LEXT(_enable_preemption_no_check) - - cmplw cr1,r1,r1 ; Force zero cr so we know not to check if preempted - b epCommn ; Join up with the other enable code... - -/* - * void _enable_preemption(void) - * - * This version checks if we get preempted or not - */ - .align 5 - .globl EXT(_enable_preemption) - -LEXT(_enable_preemption) - -; Here is where we enable preemption. 
- -epStart: - cmplwi cr1,r1,0 ; Force non-zero cr so we know to check if preempted - -epCommn: - mfsprg r3,1 ; Get current activation - li r8,-1 ; Get a decrementer - lwz r5,ACT_PREEMPT_CNT(r3) ; Get the preemption level - add. r5,r5,r8 ; Bring down the disable count - blt- epTooFar ; Yeah, we did... - stw r5,ACT_PREEMPT_CNT(r3) ; Save it back - crandc cr0_eq,cr0_eq,cr1_eq - beq+ epCheckPreempt ; Go check if we need to be preempted... - blr ; Leave... -epTooFar: - mr r4,r5 - lis r3,hi16(epTooFarStr) ; First half of panic string - ori r3,r3,lo16(epTooFarStr) ; Second half of panic string - PROLOG(0) - bl EXT(panic) - BREAKPOINT_TRAP ; We die here anyway - - .data -epTooFarStr: - STRINGD "enable_preemption: preemption_level %d\n\000" - - .text - .align 5 -epCheckPreempt: - lis r0,hi16(MASK(MSR_VEC)) ; Get vector enable - mfmsr r9 ; Get the MSR value - ori r0,r0,lo16(MASK(MSR_FP)) ; Get FP enable - andi. r4,r9,lo16(MASK(MSR_EE)) ; We cannot preempt if interruptions are off - beq+ epCPno ; No preemption here... - ori r7,r0,lo16(MASK(MSR_EE)) ; Get EE bit on too - andc r9,r9,r0 ; Clear FP and VEC - andc r7,r9,r7 ; Clear EE as well - mtmsr r7 ; Turn off interruptions - isync ; May have turned off vec and fp here - lwz r3,ACT_PER_PROC(r3) ; Get the per_proc block - lwz r7,PP_PENDING_AST(r3) ; Get pending AST mask - li r5,AST_URGENT ; Get the requests we do honor - lis r0,hi16(DoPreemptCall) ; Just in case, get the top of firmware call - and. r7,r7,r5 ; Should we preempt? - ori r0,r0,lo16(DoPreemptCall) ; Merge in bottom part - mtmsr r9 ; Allow interrupts if we can -epCPno: - beqlr+ ; We probably will not preempt... - sc ; Do the preemption - blr ; Now, go away now... - -/* - * void disable_preemption(void) - * - * Here is where we disable preemption. 
- */ - .align 5 - .globl EXT(_disable_preemption) - -LEXT(_disable_preemption) - - mfsprg r6,1 ; Get the current activation - lwz r5,ACT_PREEMPT_CNT(r6) ; Get the preemption level - addi r5,r5,1 ; Bring up the disable count - stw r5,ACT_PREEMPT_CNT(r6) ; Save it back - blr ; Return... - -/* - * int get_preemption_level(void) - * - * Return the current preemption level - */ - .align 5 - .globl EXT(get_preemption_level) - -LEXT(get_preemption_level) - - mfsprg r6,1 ; Get current activation - lwz r3,ACT_PREEMPT_CNT(r6) ; Get the preemption level - blr ; Return... - -/* - * void ppc_usimple_lock_init(simple_lock_t, etap_event_t) - * - * Initialize a simple lock. - */ - .align 5 - .globl EXT(ppc_usimple_lock_init) - -LEXT(ppc_usimple_lock_init) - - li r0, 0 ; set lock to free == 0 - stw r0, 0(r3) ; Initialize the lock - blr - -/* - * void lck_spin_lock(lck_spin_t *) - * void ppc_usimple_lock(simple_lock_t *) - * - */ - .align 5 - .globl EXT(lck_spin_lock) -LEXT(lck_spin_lock) - .globl EXT(ppc_usimple_lock) -LEXT(ppc_usimple_lock) - - mfsprg r6,1 ; Get the current activation - lwz r5,ACT_PREEMPT_CNT(r6) ; Get the preemption level - addi r5,r5,1 ; Bring up the disable count - stw r5,ACT_PREEMPT_CNT(r6) ; Save it back - mr r5,r3 ; Get the address of the lock - li r8,0 ; Set r8 to zero - li r4,0 ; Set r4 to zero - -slcktry: lwarx r11,SLOCK_ILK,r5 ; Grab the lock value - andi. r3,r11,ILK_LOCKED ; Is it locked? - ori r11,r6,ILK_LOCKED ; Set interlock - bne-- slckspin ; Yeah, wait for it to clear... - stwcx. r11,SLOCK_ILK,r5 ; Try to seize that there durn lock - bne-- slcktry ; Couldn't get it... - .globl EXT(slckPatch_isync) -LEXT(slckPatch_isync) - isync ; Make sure we don't use a speculativily loaded value - blr ; Go on home... - -slckspin: li r11,lgKillResv ; Killing field - stwcx. r11,0,r11 ; Kill reservation - - mr. 
r4,r4 ; Test timeout value - bne++ slockspin0 - lis r4,hi16(EXT(LockTimeOut)) ; Get the high part - ori r4,r4,lo16(EXT(LockTimeOut)) ; And the low part - lwz r4,0(r4) ; Get the timerout value - -slockspin0: mr. r8,r8 ; Is r8 set to zero - bne++ slockspin1 ; If yes, first spin attempt - lis r0,hi16(MASK(MSR_VEC)) ; Get vector enable - mfmsr r9 ; Get the MSR value - ori r0,r0,lo16(MASK(MSR_FP)) ; Get FP enable - ori r7,r0,lo16(MASK(MSR_EE)) ; Get EE bit on too - andc r9,r9,r0 ; Clear FP and VEC - andc r7,r9,r7 ; Clear EE as well - mtmsr r7 ; Turn off interruptions - isync ; May have turned off vec and fp here - mftb r8 ; Get timestamp on entry - b slcksniff - -slockspin1: mtmsr r7 ; Turn off interruptions - mftb r8 ; Get timestamp on entry - -slcksniff: lwz r3,SLOCK_ILK(r5) ; Get that lock in here - andi. r3,r3,ILK_LOCKED ; Is it free yet? - beq++ slckretry ; Yeah, try for it again... - - mftb r10 ; Time stamp us now - sub r10,r10,r8 ; Get the elapsed time - cmplwi r10,128 ; Have we been spinning for 128 tb ticks? - blt++ slcksniff ; Not yet... - - mtmsr r9 ; Say, any interrupts pending? - -; The following instructions force the pipeline to be interlocked to that only one -; instruction is issued per cycle. The insures that we stay enabled for a long enough -; time; if it's too short, pending interruptions will not have a chance to be taken - - subi r4,r4,128 ; Back off elapsed time from timeout value - or r4,r4,r4 ; Do nothing here but force a single cycle delay - mr. r4,r4 ; See if we used the whole timeout - li r3,0 ; Assume a timeout return code - or r4,r4,r4 ; Do nothing here but force a single cycle delay - - ble-- slckfail ; We failed - b slockspin1 ; Now that we've opened an enable window, keep trying... 
-slckretry: - mtmsr r9 ; Restore interrupt state - li r8,1 ; Show already through once - b slcktry -slckfail: ; We couldn't get the lock - lis r3,hi16(slckpanic_str) - ori r3,r3,lo16(slckpanic_str) - mr r4,r5 - mflr r5 - PROLOG(0) - bl EXT(panic) - BREAKPOINT_TRAP ; We die here anyway - - .data -slckpanic_str: - STRINGD "simple lock (0x%08X) deadlock detection, pc=0x%08X\n\000" - .text - -/* - * boolean_t lck_spin_try_lock(lck_spin_t *) - * unsigned int ppc_usimple_lock_try(simple_lock_t *) - * - */ - .align 5 - .globl EXT(lck_spin_try_lock) -LEXT(lck_spin_try_lock) - .globl EXT(ppc_usimple_lock_try) -LEXT(ppc_usimple_lock_try) - - lis r0,hi16(MASK(MSR_VEC)) ; Get vector enable - mfmsr r9 ; Get the MSR value - ori r0,r0,lo16(MASK(MSR_FP)) ; Get FP enable - ori r7,r0,lo16(MASK(MSR_EE)) ; Get EE bit on too - andc r9,r9,r0 ; Clear FP and VEC - andc r7,r9,r7 ; Clear EE as well - mtmsr r7 ; Disable interruptions and thus, preemption - mfsprg r6,1 ; Get current activation - - lwz r11,SLOCK_ILK(r3) ; Get the lock - andi. r5,r11,ILK_LOCKED ; Check it... - bne-- slcktryfail ; Quickly fail... - -slcktryloop: - lwarx r11,SLOCK_ILK,r3 ; Ld from addr of arg and reserve - - andi. r5,r11,ILK_LOCKED ; TEST... - ori r5,r6,ILK_LOCKED - bne-- slcktryfailX ; branch if taken. Predict free - - stwcx. r5,SLOCK_ILK,r3 ; And SET (if still reserved) - bne-- slcktryloop ; If set failed, loop back - - .globl EXT(stlckPatch_isync) -LEXT(stlckPatch_isync) - isync - - lwz r5,ACT_PREEMPT_CNT(r6) ; Get the preemption level - addi r5,r5,1 ; Bring up the disable count - stw r5,ACT_PREEMPT_CNT(r6) ; Save it back - - mtmsr r9 ; Allow interruptions now - li r3,1 ; Set that the lock was free - blr - -slcktryfailX: - li r5,lgKillResv ; Killing field - stwcx. 
r5,0,r5 ; Kill reservation - -slcktryfail: - mtmsr r9 ; Allow interruptions now - li r3,0 ; FAILURE - lock was taken - blr - - -/* - * void lck_spin_unlock(lck_spin_t *) - * void ppc_usimple_unlock_rwcmb(simple_lock_t *) - * - */ - .align 5 - .globl EXT(lck_spin_unlock) -LEXT(lck_spin_unlock) - .globl EXT(ppc_usimple_unlock_rwcmb) -LEXT(ppc_usimple_unlock_rwcmb) - - li r0,0 - .globl EXT(sulckPatch_isync) -LEXT(sulckPatch_isync) - isync - .globl EXT(sulckPatch_eieio) -LEXT(sulckPatch_eieio) - eieio - stw r0, SLOCK_ILK(r3) - - b epStart ; Go enable preemption... - -/* - * void ppc_usimple_unlock_rwmb(simple_lock_t *) - * - */ - .align 5 - .globl EXT(ppc_usimple_unlock_rwmb) - -LEXT(ppc_usimple_unlock_rwmb) - - li r0,0 - sync - stw r0, SLOCK_ILK(r3) - - b epStart ; Go enable preemption... - -/* - * void lck_rw_lock_exclusive(lck_rw_t*) - * - */ - .align 5 - .globl EXT(lck_rw_lock_exclusive) -LEXT(lck_rw_lock_exclusive) -#if !MACH_LDEBUG - .globl EXT(lock_write) -LEXT(lock_write) -#endif - lis r7,0xFFFF - ori r7,r7,(WANT_EXCL|WANT_UPGRADE|ILK_LOCKED) -rwleloop: lwarx r5,RW_DATA,r3 ; Grab the lock value - and. r8,r5,r7 ; Can we have it? - ori r6,r5,WANT_EXCL ; Mark Exclusive - bne-- rwlespin ; Branch if cannot be held - stwcx. r6,RW_DATA,r3 ; Update lock word - bne-- rwleloop - .globl EXT(rwlePatch_isync) -LEXT(rwlePatch_isync) - isync - blr -rwlespin: - li r4,lgKillResv ; Killing field - stwcx. r4,0,r4 ; Kill it - cmpli cr0,r5,RW_IND ; Is it a lock indirect - bne-- rwlespin1 ; No, go handle contention - mr r4,r3 ; pass lock pointer - lwz r3,RW_PTR(r3) ; load lock ext pointer - b EXT(lck_rw_lock_exclusive_ext) -rwlespin1: - b EXT(lck_rw_lock_exclusive_gen) - -/* - * void lck_rw_lock_shared(lck_rw_t*) - * - */ - .align 5 - .globl EXT(lck_rw_lock_shared) -LEXT(lck_rw_lock_shared) -#if !MACH_LDEBUG - .globl EXT(lock_read) -LEXT(lock_read) -#endif -rwlsloop: lwarx r5,RW_DATA,r3 ; Grab the lock value - andi. r7,r5,WANT_EXCL|WANT_UPGRADE|ILK_LOCKED ; Can we have it? 
- bne-- rwlsopt ; Branch if cannot be held -rwlsloopres: - addis r6,r5,1 ; Increment read cnt - stwcx. r6,RW_DATA,r3 ; Update lock word - bne-- rwlsloop - .globl EXT(rwlsPatch_isync) -LEXT(rwlsPatch_isync) - isync - blr -rwlsopt: - andi. r7,r5,PRIV_EXCL|ILK_LOCKED ; Can we have it? - bne-- rwlsspin ; Branch if cannot be held - lis r7,0xFFFF ; Get read cnt mask - and. r8,r5,r7 ; Is it shared - bne rwlsloopres ; Branch if can be held -rwlsspin: - li r4,lgKillResv ; Killing field - stwcx. r4,0,r4 ; Kill it - cmpli cr0,r5,RW_IND ; Is it a lock indirect - bne-- rwlsspin1 ; No, go handle contention - mr r4,r3 ; pass lock pointer - lwz r3,RW_PTR(r3) ; load lock ext pointer - b EXT(lck_rw_lock_shared_ext) -rwlsspin1: - b EXT(lck_rw_lock_shared_gen) - -/* - * boolean_t lck_rw_lock_shared_to_exclusive(lck_rw_t*) - * - */ - .align 5 - .globl EXT(lck_rw_lock_shared_to_exclusive) -LEXT(lck_rw_lock_shared_to_exclusive) -#if !MACH_LDEBUG - .globl EXT(lock_read_to_write) -LEXT(lock_read_to_write) -#endif -rwlseloop: lwarx r5,RW_DATA,r3 ; Grab the lock value - addis r6,r5,0xFFFF ; Decrement read cnt - lis r8,0xFFFF ; Get read count mask - ori r8,r8,WANT_UPGRADE|ILK_LOCKED ; Include Interlock and upgrade flags - and. r7,r6,r8 ; Can we have it? - ori r9,r6,WANT_UPGRADE ; Mark Exclusive - bne-- rwlsespin ; Branch if cannot be held - stwcx. r9,RW_DATA,r3 ; Update lock word - bne-- rwlseloop - .globl EXT(rwlsePatch_isync) -LEXT(rwlsePatch_isync) - isync - li r3,1 ; Succeed, return TRUE... - blr -rwlsespin: - li r4,lgKillResv ; Killing field - stwcx. 
r4,0,r4 ; Kill it - cmpli cr0,r5,RW_IND ; Is it a lock indirect - bne-- rwlsespin1 ; No, go handle contention - mr r4,r3 ; pass lock pointer - lwz r3,RW_PTR(r3) ; load lock ext pointer - b EXT(lck_rw_lock_shared_to_exclusive_ext) -rwlsespin1: - b EXT(lck_rw_lock_shared_to_exclusive_gen) - - - -/* - * void lck_rw_lock_exclusive_to_shared(lck_rw_t*) - * - */ - .align 5 - .globl EXT(lck_rw_lock_exclusive_to_shared) -LEXT(lck_rw_lock_exclusive_to_shared) -#if !MACH_LDEBUG - .globl EXT(lock_write_to_read) -LEXT(lock_write_to_read) -#endif - .globl EXT(rwlesPatch_isync) -LEXT(rwlesPatch_isync) - isync - .globl EXT(rwlesPatch_eieio) -LEXT(rwlesPatch_eieio) - eieio -rwlesloop: lwarx r5,RW_DATA,r3 ; Grab the lock value - andi. r7,r5,ILK_LOCKED ; Test interlock flag - bne-- rwlesspin ; Branch if interlocked - lis r6,1 ; Get 1 for read count - andi. r10,r5,WANT_UPGRADE ; Is it held with upgrade - li r9,WANT_UPGRADE|WAIT_FLAG ; Get upgrade and wait flags mask - bne rwlesexcl1 ; Skip if held with upgrade - li r9,WANT_EXCL|WAIT_FLAG ; Get exclusive and wait flags mask -rwlesexcl1: - andc r7,r5,r9 ; Marked free - rlwimi r6,r7,0,16,31 ; Set shared cnt to one - stwcx. r6,RW_DATA,r3 ; Update lock word - bne-- rwlesloop - andi. r7,r5,WAIT_FLAG ; Test wait flag - beqlr++ ; Return of no waiters - addi r3,r3,RW_EVENT ; Get lock event address - b EXT(thread_wakeup) ; wakeup waiters -rwlesspin: - li r4,lgKillResv ; Killing field - stwcx. 
r4,0,r4 ; Kill it - cmpli cr0,r5,RW_IND ; Is it a lock indirect - bne-- rwlesspin1 ; No, go handle contention - mr r4,r3 ; pass lock pointer - lwz r3,RW_PTR(r3) ; load lock ext pointer - b EXT(lck_rw_lock_exclusive_to_shared_ext) -rwlesspin1: - b EXT(lck_rw_lock_exclusive_to_shared_gen) - - - -/* - * boolean_t lck_rw_try_lock_exclusive(lck_rw_t*) - * - */ - .align 5 - .globl EXT(lck_rw_try_lock_exclusive) -LEXT(lck_rw_try_lock_exclusive) - lis r10,0xFFFF ; Load read count mask - ori r10,r10,WANT_EXCL|WANT_UPGRADE ; Include exclusive and upgrade flags -rwtleloop: lwarx r5,RW_DATA,r3 ; Grab the lock value - andi. r7,r5,ILK_LOCKED ; Test interlock flag - bne-- rwtlespin ; Branch if interlocked - and. r7,r5,r10 ; Can we have it - ori r6,r5,WANT_EXCL ; Mark Exclusive - bne-- rwtlefail ; - stwcx. r6,RW_DATA,r3 ; Update lock word - bne-- rwtleloop - .globl EXT(rwtlePatch_isync) -LEXT(rwtlePatch_isync) - isync - li r3,1 ; Return TRUE - blr -rwtlefail: - li r4,lgKillResv ; Killing field - stwcx. r4,0,r4 ; Kill it - li r3,0 ; Return FALSE - blr -rwtlespin: - li r4,lgKillResv ; Killing field - stwcx. r4,0,r4 ; Kill it - cmpli cr0,r5,RW_IND ; Is it a lock indirect - bne-- rwtlespin1 ; No, go handle contention - mr r4,r3 ; pass lock pointer - lwz r3,RW_PTR(r3) ; load lock ext pointer - b EXT(lck_rw_try_lock_exclusive_ext) -rwtlespin1: - b EXT(lck_rw_try_lock_exclusive_gen) - - -/* - * boolean_t lck_rw_try_lock_shared(lck_rw_t*) - * - */ - .align 5 - .globl EXT(lck_rw_try_lock_shared) -LEXT(lck_rw_try_lock_shared) -rwtlsloop: lwarx r5,RW_DATA,r3 ; Grab the lock value - andi. r7,r5,ILK_LOCKED ; Test interlock flag - bne-- rwtlsspin ; Branch if interlocked - andi. r7,r5,WANT_EXCL|WANT_UPGRADE ; So, can we have it? - bne-- rwtlsopt ; Branch if held exclusive -rwtlsloopres: - addis r6,r5,1 ; Increment read cnt - stwcx. r6,RW_DATA,r3 ; Update lock word - bne-- rwtlsloop - .globl EXT(rwtlsPatch_isync) -LEXT(rwtlsPatch_isync) - isync - li r3,1 ; Return TRUE - blr -rwtlsopt: - andi. 
r7,r5,PRIV_EXCL ; Can we have it? - bne-- rwtlsfail ; Branch if cannot be held - lis r7,0xFFFF ; Get read cnt mask - and. r8,r5,r7 ; Is it shared - bne rwtlsloopres ; Branch if can be held -rwtlsfail: - li r3,0 ; Return FALSE - blr -rwtlsspin: - li r4,lgKillResv ; Killing field - stwcx. r4,0,r4 ; Kill it - cmpli cr0,r5,RW_IND ; Is it a lock indirect - bne-- rwtlsspin1 ; No, go handle contention - mr r4,r3 ; pass lock pointer - lwz r3,RW_PTR(r3) ; load lock ext pointer - b EXT(lck_rw_try_lock_shared_ext) -rwtlsspin1: - b EXT(lck_rw_try_lock_shared_gen) - - - -/* - * lck_rw_type_t lck_rw_done(lck_rw_t*) - * - */ - .align 5 - .globl EXT(lck_rw_done) -LEXT(lck_rw_done) -#if !MACH_LDEBUG - .globl EXT(lock_done) -LEXT(lock_done) -#endif - .globl EXT(rwldPatch_isync) -LEXT(rwldPatch_isync) - isync - .globl EXT(rwldPatch_eieio) -LEXT(rwldPatch_eieio) - eieio - li r10,WAIT_FLAG ; Get wait flag - lis r7,0xFFFF ; Get read cnt mask - mr r12,r3 ; Save lock addr -rwldloop: lwarx r5,RW_DATA,r3 ; Grab the lock value - andi. r8,r5,ILK_LOCKED ; Test interlock flag - bne-- rwldspin ; Branch if interlocked - and. r8,r5,r7 ; Is it shared - cmpi cr1,r8,0 ; Is it shared - beq cr1,rwldexcl ; No, check exclusive - li r11,RW_SHARED ; Set return value - addis r6,r5,0xFFFF ; Decrement read count - and. r8,r6,r7 ; Is it still shared - li r8,0 ; Assume no wakeup - bne rwldshared1 ; Skip if still held shared - and r8,r6,r10 ; Extract wait flag - andc r6,r6,r10 ; Clear wait flag -rwldshared1: - b rwldstore -rwldexcl: - li r11,RW_EXCL ; Set return value - li r9,WANT_UPGRADE ; Get upgrade flag - and. r6,r5,r9 ; Is it held with upgrade - li r9,WANT_UPGRADE|WAIT_FLAG ; Mask upgrade abd wait flags - bne rwldexcl1 ; Skip if held with upgrade - li r9,WANT_EXCL|WAIT_FLAG ; Mask exclusive and wait flags -rwldexcl1: - andc r6,r5,r9 ; Marked free - and r8,r5,r10 ; Null if no waiter -rwldstore: - stwcx. r6,RW_DATA,r3 ; Update lock word - bne-- rwldloop - mr. r8,r8 ; wakeup needed? 
- mr r3,r11 ; Return lock held type - beqlr++ - mr r3,r12 ; Restore lock address - PROLOG(0) - addi r3,r3,RW_EVENT ; Get lock event address - bl EXT(thread_wakeup) ; wakeup threads - lwz r2,(FM_ALIGN(0)+FM_SIZE+FM_CR_SAVE)(r1) - mtcr r2 - EPILOG - li r3,RW_SHARED ; Assume lock type shared - bne cr1,rwldret ; Branch if was held exclusive - li r3,RW_EXCL ; Return lock type exclusive -rwldret: - blr -rwldspin: - li r4,lgKillResv ; Killing field - stwcx. r4,0,r4 ; Kill it - cmpli cr0,r5,RW_IND ; Is it a lock indirect - bne-- rwldspin1 ; No, go handle contention - mr r4,r3 ; pass lock pointer - lwz r3,RW_PTR(r3) ; load lock ext pointer - b EXT(lck_rw_done_ext) -rwldspin1: - b EXT(lck_rw_done_gen) - -/* - * void lck_rw_ilk_lock(lck_rw_t *lock) - */ - .globl EXT(lck_rw_ilk_lock) -LEXT(lck_rw_ilk_lock) - crclr hwtimeout ; no timeout option - li r4,0 ; request default timeout value - li r12,ILK_LOCKED ; Load bit mask - b lckcomm ; Join on up... - -/* - * void lck_rw_ilk_unlock(lck_rw_t *lock) - */ - .globl EXT(lck_rw_ilk_unlock) -LEXT(lck_rw_ilk_unlock) - li r4,1 - b EXT(hw_unlock_bit) diff --git a/osfmk/ppc/hw_lock_types.h b/osfmk/ppc/hw_lock_types.h deleted file mode 100644 index be6e5568c..000000000 --- a/osfmk/ppc/hw_lock_types.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. 
- * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (C) 1998 Apple Computer - * All Rights Reserved - */ -/* - * @OSF_COPYRIGHT@ - */ - -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. 
- */ - -#ifndef _PPC_HW_LOCK_TYPES_H_ -#define _PPC_HW_LOCK_TYPES_H_ - -struct hslock { - int lock_data; -}; - -typedef struct hslock hw_lock_data_t, *hw_lock_t; - -#define hw_lock_addr(hwl) (&((hwl).lock_data)) - - -#endif /* _PPC_HW_LOCK_TYPES_H_ */ diff --git a/osfmk/ppc/hw_perfmon.c b/osfmk/ppc/hw_perfmon.c deleted file mode 100644 index 64d38d1e0..000000000 --- a/osfmk/ppc/hw_perfmon.c +++ /dev/null @@ -1,959 +0,0 @@ -/* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -decl_simple_lock_data(,hw_perfmon_lock) -static task_t hw_perfmon_owner = TASK_NULL; -static int hw_perfmon_thread_count = 0; - -/* Notes: - * -supervisor/user level filtering is unnecessary because of the way PMCs and MMCRs are context switched - * (can only count user events anyway) - * -marked filtering is unnecssary because each thread has its own virtualized set of PMCs and MMCRs - * -virtual counter PMI is passed up as a breakpoint exception - */ - -int perfmon_init(void) -{ - simple_lock_init(&hw_perfmon_lock, FALSE); - return KERN_SUCCESS; -} - -/* PMC Facility Owner: - * TASK_NULL - no one owns it - * kernel_task - owned by hw_perfmon - * other task - owned by another task - */ - -int perfmon_acquire_facility(task_t task) -{ - kern_return_t retval = KERN_SUCCESS; - - simple_lock(&hw_perfmon_lock); - - if(hw_perfmon_owner==task) { -#ifdef HWPERFMON_DEBUG - kprintf("perfmon_acquire_facility - ACQUIRED: already owner\n"); -#endif - retval = KERN_SUCCESS; - /* already own it */ - } else if(hw_perfmon_owner==TASK_NULL) { /* no one owns it */ - hw_perfmon_owner = task; - hw_perfmon_thread_count = 0; -#ifdef HWPERFMON_DEBUG - kprintf("perfmon_acquire_facility - ACQUIRED: no current owner - made new owner\n"); -#endif - retval = KERN_SUCCESS; - } else { /* someone already owns it */ - if(hw_perfmon_owner==kernel_task) { - if(hw_perfmon_thread_count==0) { /* kernel owns it but no threads using it */ - hw_perfmon_owner = task; - hw_perfmon_thread_count = 0; -#ifdef HWPERFMON_DEBUG - kprintf("perfmon_acquire_facility - ACQUIRED: kernel is current owner but no threads using it\n"); -#endif - retval = KERN_SUCCESS; - } else { -#ifdef HWPERFMON_DEBUG - kprintf("perfmon_acquire_facility - DENIED: kernel is current owner and facility in use\n"); -#endif - retval = KERN_RESOURCE_SHORTAGE; - } - } else { /* non-kernel owner */ -#ifdef 
HWPERFMON_DEBUG - kprintf("perfmon_acquire_facility - DENIED: another active task owns the facility\n"); -#endif - retval = KERN_RESOURCE_SHORTAGE; - } - } - - simple_unlock(&hw_perfmon_lock); - return retval; -} - -int perfmon_release_facility(task_t task) -{ - kern_return_t retval = KERN_SUCCESS; - task_t old_perfmon_owner = hw_perfmon_owner; - - simple_lock(&hw_perfmon_lock); - - if(task!=hw_perfmon_owner) { - retval = KERN_NO_ACCESS; - } else { - if(old_perfmon_owner==kernel_task) { - if(hw_perfmon_thread_count>0) { -#ifdef HWPERFMON_DEBUG - kprintf("perfmon_release_facility - NOT RELEASED: kernel task is owner and has active perfmon threads\n"); -#endif - retval = KERN_NO_ACCESS; - } else { -#ifdef HWPERFMON_DEBUG - kprintf("perfmon_release_facility - RELEASED: kernel task was owner\n"); -#endif - hw_perfmon_owner = TASK_NULL; - retval = KERN_SUCCESS; - } - } else { -#ifdef HWPERFMON_DEBUG - kprintf("perfmon_release_facility - RELEASED: user task was owner\n"); -#endif - hw_perfmon_owner = TASK_NULL; - retval = KERN_SUCCESS; - } - } - - simple_unlock(&hw_perfmon_lock); - return retval; -} - -static int -perfmon_enable(thread_t thread) -{ - struct savearea *sv = thread->machine.pcb; - kern_return_t retval = KERN_SUCCESS; - int curPMC; - - if(thread->machine.specFlags & perfMonitor) { - return KERN_SUCCESS; /* already enabled */ - } else if(perfmon_acquire_facility(kernel_task)!=KERN_SUCCESS) { - return KERN_RESOURCE_SHORTAGE; /* facility is in use */ - } else { /* kernel_task owns the faciltity and this thread has not yet been counted */ - simple_lock(&hw_perfmon_lock); - hw_perfmon_thread_count++; - simple_unlock(&hw_perfmon_lock); - } - - sv->save_mmcr1 = 0; - sv->save_mmcr2 = 0; - - switch(PerProcTable[0].ppe_vaddr->cpu_subtype) { - case CPU_SUBTYPE_POWERPC_750: - case CPU_SUBTYPE_POWERPC_7400: - case CPU_SUBTYPE_POWERPC_7450: - { - ppc32_mmcr0_reg_t mmcr0_reg; - - mmcr0_reg.value = 0; - mmcr0_reg.field.disable_counters_always = TRUE; - 
mmcr0_reg.field.disable_counters_supervisor = TRUE; /* no choice */ - sv->save_mmcr0 = mmcr0_reg.value; - } - break; - case CPU_SUBTYPE_POWERPC_970: - { - ppc64_mmcr0_reg_t mmcr0_reg; - - mmcr0_reg.value = 0; - mmcr0_reg.field.disable_counters_always = TRUE; - mmcr0_reg.field.disable_counters_supervisor = TRUE; /* no choice */ - sv->save_mmcr0 = mmcr0_reg.value; - } - break; - default: - retval = KERN_FAILURE; - break; - } - - if(retval==KERN_SUCCESS) { - for(curPMC=0; curPMCsave_pmc[curPMC] = 0; - thread->machine.pmcovfl[curPMC] = 0; - } - thread->machine.perfmonFlags = 0; - thread->machine.specFlags |= perfMonitor; /* enable perf monitor facility for this thread */ - if(thread==current_thread()) { - getPerProc()->spcFlags |= perfMonitor; /* update per_proc */ - } - } - -#ifdef HWPERFMON_DEBUG - kprintf("perfmon_enable - mmcr0=0x%llx mmcr1=0x%llx mmcr2=0x%llx\n", sv->save_mmcr0, sv->save_mmcr1, sv->save_mmcr2); -#endif - - return retval; -} - -int perfmon_disable(thread_t thread) -{ - struct savearea *sv = thread->machine.pcb; - int curPMC; - - if(!(thread->machine.specFlags & perfMonitor)) { - return KERN_NO_ACCESS; /* not enabled */ - } else { - simple_lock(&hw_perfmon_lock); - hw_perfmon_thread_count--; - simple_unlock(&hw_perfmon_lock); - perfmon_release_facility(kernel_task); /* will release if hw_perfmon_thread_count is 0 */ - } - - thread->machine.specFlags &= ~perfMonitor; /* disable perf monitor facility for this thread */ - if(thread==current_thread()) { - PerProcTable[cpu_number()].ppe_vaddr->spcFlags &= ~perfMonitor; /* update per_proc */ - } - sv->save_mmcr0 = 0; - sv->save_mmcr1 = 0; - sv->save_mmcr2 = 0; - - for(curPMC=0; curPMCsave_pmc[curPMC] = 0; - thread->machine.pmcovfl[curPMC] = 0; - thread->machine.perfmonFlags = 0; - } - -#ifdef HWPERFMON_DEBUG - kprintf("perfmon_disable - mmcr0=0x%llx mmcr1=0x%llx mmcr2=0x%llx\n", sv->save_mmcr0, sv->save_mmcr1, sv->save_mmcr2); -#endif - - return KERN_SUCCESS; -} - -static int 
-perfmon_clear_counters(thread_t thread) -{ - struct savearea *sv = thread->machine.pcb; - int curPMC; - -#ifdef HWPERFMON_DEBUG - kprintf("perfmon_clear_counters (CPU%d)\n", cpu_number()); -#endif - - /* clear thread copy */ - for(curPMC=0; curPMCsave_pmc[curPMC] = 0; - thread->machine.pmcovfl[curPMC] = 0; - } - - return KERN_SUCCESS; -} - -static int -perfmon_write_counters(thread_t thread, uint64_t *pmcs) -{ - struct savearea *sv = thread->machine.pcb; - int curPMC; - -#ifdef HWPERFMON_DEBUG - kprintf("perfmon_write_counters (CPU%d): mmcr0 = %016llX, pmc1=%llX pmc2=%llX pmc3=%llX pmc4=%llX pmc5=%llX pmc6=%llX pmc7=%llX pmc8=%llX\n", cpu_number(), sv->save_mmcr0, pmcs[PMC_1], pmcs[PMC_2], pmcs[PMC_3], pmcs[PMC_4], pmcs[PMC_5], pmcs[PMC_6], pmcs[PMC_7], pmcs[PMC_8]); -#endif - - /* update thread copy */ - for(curPMC=0; curPMCsave_pmc[curPMC] = pmcs[curPMC] & 0x7FFFFFFF; - thread->machine.pmcovfl[curPMC] = (pmcs[curPMC]>>31) & 0xFFFFFFFF; - } - - return KERN_SUCCESS; -} - -static int -perfmon_read_counters(thread_t thread, uint64_t *pmcs) -{ - struct savearea *sv = thread->machine.pcb; - int curPMC; - - /* retrieve from thread copy */ - for(curPMC=0; curPMCmachine.pmcovfl[curPMC]; - pmcs[curPMC] = pmcs[curPMC]<<31; - pmcs[curPMC] |= (sv->save_pmc[curPMC] & 0x7FFFFFFF); - } - - /* zero any unused counters on this platform */ - switch(PerProcTable[0].ppe_vaddr->cpu_subtype) { - case CPU_SUBTYPE_POWERPC_750: - case CPU_SUBTYPE_POWERPC_7400: - case CPU_SUBTYPE_POWERPC_7450: - pmcs[PMC_7] = 0; - pmcs[PMC_8] = 0; - break; - default: - break; - } - -#ifdef HWPERFMON_DEBUG - kprintf("perfmon_read_counters (CPU%d): mmcr0 = %016llX pmc1=%llX pmc2=%llX pmc3=%llX pmc4=%llX pmc5=%llX pmc6=%llX pmc7=%llX pmc8=%llX\n", cpu_number(), sv->save_mmcr0, pmcs[PMC_1], pmcs[PMC_2], pmcs[PMC_3], pmcs[PMC_4], pmcs[PMC_5], pmcs[PMC_6], pmcs[PMC_7], pmcs[PMC_8]); -#endif - - return KERN_SUCCESS; -} - -static int -perfmon_start_counters(thread_t thread) -{ - struct savearea *sv = 
thread->machine.pcb; - kern_return_t retval = KERN_SUCCESS; - - switch(PerProcTable[0].ppe_vaddr->cpu_subtype) { - case CPU_SUBTYPE_POWERPC_750: - case CPU_SUBTYPE_POWERPC_7400: - { - ppc32_mmcr0_reg_t mmcr0_reg; - mmcr0_reg.value = sv->save_mmcr0; - mmcr0_reg.field.disable_counters_always = FALSE; - /* XXXXX PMI broken on 750, 750CX, 750FX, 7400 and 7410 v1.2 and earlier XXXXX */ - mmcr0_reg.field.on_pmi_stop_counting = FALSE; - mmcr0_reg.field.enable_pmi = FALSE; - mmcr0_reg.field.enable_pmi_on_pmc1 = FALSE; - mmcr0_reg.field.enable_pmi_on_pmcn = FALSE; - sv->save_mmcr0 = mmcr0_reg.value; - } - break; - case CPU_SUBTYPE_POWERPC_7450: - { - ppc32_mmcr0_reg_t mmcr0_reg; - mmcr0_reg.value = sv->save_mmcr0; - mmcr0_reg.field.disable_counters_always = FALSE; - mmcr0_reg.field.on_pmi_stop_counting = TRUE; - mmcr0_reg.field.enable_pmi = TRUE; - mmcr0_reg.field.enable_pmi_on_pmc1 = TRUE; - mmcr0_reg.field.enable_pmi_on_pmcn = TRUE; - sv->save_mmcr0 = mmcr0_reg.value; - } - break; - case CPU_SUBTYPE_POWERPC_970: - { - ppc64_mmcr0_reg_t mmcr0_reg; - mmcr0_reg.value = sv->save_mmcr0; - mmcr0_reg.field.disable_counters_always = FALSE; - mmcr0_reg.field.on_pmi_stop_counting = TRUE; - mmcr0_reg.field.enable_pmi = TRUE; - mmcr0_reg.field.enable_pmi_on_pmc1 = TRUE; - mmcr0_reg.field.enable_pmi_on_pmcn = TRUE; - sv->save_mmcr0 = mmcr0_reg.value; - } - break; - default: - retval = KERN_FAILURE; - break; - } - -#ifdef HWPERFMON_DEBUG - kprintf("perfmon_start_counters (CPU%d) - mmcr0=0x%llx mmcr1=0x%llx mmcr2=0x%llx\n", cpu_number(), sv->save_mmcr0, sv->save_mmcr1, sv->save_mmcr2); -#endif - - return retval; -} - -static int -perfmon_stop_counters(thread_t thread) -{ - struct savearea *sv = thread->machine.pcb; - kern_return_t retval = KERN_SUCCESS; - - switch(PerProcTable[0].ppe_vaddr->cpu_subtype) { - case CPU_SUBTYPE_POWERPC_750: - case CPU_SUBTYPE_POWERPC_7400: - case CPU_SUBTYPE_POWERPC_7450: - { - ppc32_mmcr0_reg_t mmcr0_reg; - mmcr0_reg.value = sv->save_mmcr0; - 
mmcr0_reg.field.disable_counters_always = TRUE; - sv->save_mmcr0 = mmcr0_reg.value; - } - break; - case CPU_SUBTYPE_POWERPC_970: - { - ppc64_mmcr0_reg_t mmcr0_reg; - mmcr0_reg.value = sv->save_mmcr0; - mmcr0_reg.field.disable_counters_always = TRUE; - sv->save_mmcr0 = mmcr0_reg.value; - } - break; - default: - retval = KERN_FAILURE; - break; - } - -#ifdef HWPERFMON_DEBUG - kprintf("perfmon_stop_counters (CPU%d) - mmcr0=0x%llx mmcr1=0x%llx mmcr2=0x%llx\n", cpu_number(), sv->save_mmcr0, sv->save_mmcr1, sv->save_mmcr2); -#endif - - return retval; -} - -static int -perfmon_set_event(thread_t thread, int pmc, int event) -{ - struct savearea *sv = thread->machine.pcb; - kern_return_t retval = KERN_SUCCESS; - -#ifdef HWPERFMON_DEBUG - kprintf("perfmon_set_event b4 (CPU%d) - pmc=%d, event=%d - mmcr0=0x%llx mmcr1=0x%llx mmcr2=0x%llx\n", cpu_number(), pmc, event, sv->save_mmcr0, sv->save_mmcr1, sv->save_mmcr2); -#endif - - switch(PerProcTable[0].ppe_vaddr->cpu_subtype) { - case CPU_SUBTYPE_POWERPC_750: - case CPU_SUBTYPE_POWERPC_7400: - { - ppc32_mmcr0_reg_t mmcr0_reg; - ppc32_mmcr1_reg_t mmcr1_reg; - - mmcr0_reg.value = sv->save_mmcr0; - mmcr1_reg.value = sv->save_mmcr1; - - switch(pmc) { - case PMC_1: - mmcr0_reg.field.pmc1_event = event; - sv->save_mmcr0 = mmcr0_reg.value; - break; - case PMC_2: - mmcr0_reg.field.pmc2_event = event; - sv->save_mmcr0 = mmcr0_reg.value; - break; - case PMC_3: - mmcr1_reg.field.pmc3_event = event; - sv->save_mmcr1 = mmcr1_reg.value; - break; - case PMC_4: - mmcr1_reg.field.pmc4_event = event; - sv->save_mmcr1 = mmcr1_reg.value; - break; - default: - retval = KERN_FAILURE; - break; - } - } - break; - case CPU_SUBTYPE_POWERPC_7450: - { - ppc32_mmcr0_reg_t mmcr0_reg; - ppc32_mmcr1_reg_t mmcr1_reg; - - mmcr0_reg.value = sv->save_mmcr0; - mmcr1_reg.value = sv->save_mmcr1; - - switch(pmc) { - case PMC_1: - mmcr0_reg.field.pmc1_event = event; - sv->save_mmcr0 = mmcr0_reg.value; - break; - case PMC_2: - mmcr0_reg.field.pmc2_event = event; - 
sv->save_mmcr0 = mmcr0_reg.value; - break; - case PMC_3: - mmcr1_reg.field.pmc3_event = event; - sv->save_mmcr1 = mmcr1_reg.value; - break; - case PMC_4: - mmcr1_reg.field.pmc4_event = event; - sv->save_mmcr1 = mmcr1_reg.value; - break; - case PMC_5: - mmcr1_reg.field.pmc5_event = event; - sv->save_mmcr1 = mmcr1_reg.value; - break; - case PMC_6: - mmcr1_reg.field.pmc6_event = event; - sv->save_mmcr1 = mmcr1_reg.value; - break; - default: - retval = KERN_FAILURE; - break; - } - } - break; - case CPU_SUBTYPE_POWERPC_970: - { - ppc64_mmcr0_reg_t mmcr0_reg; - ppc64_mmcr1_reg_t mmcr1_reg; - - mmcr0_reg.value = sv->save_mmcr0; - mmcr1_reg.value = sv->save_mmcr1; - - switch(pmc) { - case PMC_1: - mmcr0_reg.field.pmc1_event = event; - sv->save_mmcr0 = mmcr0_reg.value; - break; - case PMC_2: - mmcr0_reg.field.pmc2_event = event; - sv->save_mmcr0 = mmcr0_reg.value; - break; - case PMC_3: - mmcr1_reg.field.pmc3_event = event; - sv->save_mmcr1 = mmcr1_reg.value; - break; - case PMC_4: - mmcr1_reg.field.pmc4_event = event; - sv->save_mmcr1 = mmcr1_reg.value; - break; - case PMC_5: - mmcr1_reg.field.pmc5_event = event; - sv->save_mmcr1 = mmcr1_reg.value; - break; - case PMC_6: - mmcr1_reg.field.pmc6_event = event; - sv->save_mmcr1 = mmcr1_reg.value; - break; - case PMC_7: - mmcr1_reg.field.pmc7_event = event; - sv->save_mmcr1 = mmcr1_reg.value; - break; - case PMC_8: - mmcr1_reg.field.pmc8_event = event; - sv->save_mmcr1 = mmcr1_reg.value; - break; - default: - retval = KERN_FAILURE; - break; - } - } - break; - default: - retval = KERN_FAILURE; - break; - } - -#ifdef HWPERFMON_DEBUG - kprintf("perfmon_set_event (CPU%d) - pmc=%d, event=%d - mmcr0=0x%llx mmcr1=0x%llx mmcr2=0x%llx\n", cpu_number(), pmc, event, sv->save_mmcr0, sv->save_mmcr1, sv->save_mmcr2); -#endif - - return retval; -} - -static int -perfmon_set_event_func(thread_t thread, uint32_t f) -{ - struct savearea *sv = thread->machine.pcb; - kern_return_t retval = KERN_SUCCESS; - -#ifdef HWPERFMON_DEBUG - 
kprintf("perfmon_set_event_func - func=%s\n", - f==PPC_PERFMON_FUNC_FPU ? "FUNC" : - f==PPC_PERFMON_FUNC_ISU ? "ISU" : - f==PPC_PERFMON_FUNC_IFU ? "IFU" : - f==PPC_PERFMON_FUNC_VMX ? "VMX" : - f==PPC_PERFMON_FUNC_IDU ? "IDU" : - f==PPC_PERFMON_FUNC_GPS ? "GPS" : - f==PPC_PERFMON_FUNC_LSU0 ? "LSU0" : - f==PPC_PERFMON_FUNC_LSU1A ? "LSU1A" : - f==PPC_PERFMON_FUNC_LSU1B ? "LSU1B" : - f==PPC_PERFMON_FUNC_SPECA ? "SPECA" : - f==PPC_PERFMON_FUNC_SPECB ? "SPECB" : - f==PPC_PERFMON_FUNC_SPECC ? "SPECC" : - "UNKNOWN"); -#endif /* HWPERFMON_DEBUG */ - - switch(PerProcTable[0].ppe_vaddr->cpu_subtype) { - case CPU_SUBTYPE_POWERPC_750: - case CPU_SUBTYPE_POWERPC_7400: - case CPU_SUBTYPE_POWERPC_7450: - retval = KERN_FAILURE; /* event functional unit only applies to 970 */ - break; - case CPU_SUBTYPE_POWERPC_970: - { - ppc64_mmcr1_reg_t mmcr1_reg; - ppc_func_unit_t func_unit; - - func_unit.value = f; - mmcr1_reg.value = sv->save_mmcr1; - - mmcr1_reg.field.ttm0_select = func_unit.field.TTM0SEL; - mmcr1_reg.field.ttm1_select = func_unit.field.TTM1SEL; - mmcr1_reg.field.ttm2_select = 0; /* not used */ - mmcr1_reg.field.ttm3_select = func_unit.field.TTM3SEL; - mmcr1_reg.field.speculative_event = func_unit.field.SPECSEL; - mmcr1_reg.field.lane0_select = func_unit.field.TD_CP_DBGxSEL; - mmcr1_reg.field.lane1_select = func_unit.field.TD_CP_DBGxSEL; - mmcr1_reg.field.lane2_select = func_unit.field.TD_CP_DBGxSEL; - mmcr1_reg.field.lane3_select = func_unit.field.TD_CP_DBGxSEL; - - sv->save_mmcr1 = mmcr1_reg.value; - } - break; - default: - retval = KERN_FAILURE; - break; - } - - return retval; -} - -static int -perfmon_set_threshold(thread_t thread, int threshold) -{ - struct savearea *sv = thread->machine.pcb; - kern_return_t retval = KERN_SUCCESS; - - switch(PerProcTable[0].ppe_vaddr->cpu_subtype) { - case CPU_SUBTYPE_POWERPC_750: - { - ppc32_mmcr0_reg_t mmcr0_reg; - - mmcr0_reg.value = sv->save_mmcr0; - - if(threshold>63) { /* no multiplier on 750 */ - int newThreshold = 63; -#ifdef 
HWPERFMON_DEBUG - kprintf("perfmon_set_threshold - WARNING: supplied threshold (%d) exceeds max threshold value - clamping to %d\n", threshold, newThreshold); -#endif - threshold = newThreshold; - } - mmcr0_reg.field.threshold_value = threshold; - - sv->save_mmcr0 = mmcr0_reg.value; - } - break; - - case CPU_SUBTYPE_POWERPC_7400: - case CPU_SUBTYPE_POWERPC_7450: - { - ppc32_mmcr0_reg_t mmcr0_reg; - ppc32_mmcr2_reg_t mmcr2_reg; - - mmcr0_reg.value = sv->save_mmcr0; - mmcr2_reg.value = sv->save_mmcr2; - - if(threshold<=(2*63)) { /* 2x multiplier */ - if(threshold%2 != 0) { - int newThreshold = 2*(threshold/2); -#ifdef HWPERFMON_DEBUG - kprintf("perfmon_set_threshold - WARNING: supplied threshold (%d) is not evenly divisible by 2x multiplier - using threshold of %d instead\n", threshold, newThreshold); -#endif - threshold = newThreshold; - } - mmcr2_reg.field.threshold_multiplier = 0; - } else if(threshold<=(32*63)) { /* 32x multiplier */ - if(threshold%32 != 0) { - int newThreshold = 32*(threshold/32); -#ifdef HWPERFMON_DEBUG - kprintf("perfmon_set_threshold - WARNING: supplied threshold (%d) is not evenly divisible by 32x multiplier - using threshold of %d instead\n", threshold, newThreshold); -#endif - threshold = newThreshold; - } - mmcr2_reg.field.threshold_multiplier = 1; - } else { - int newThreshold = 32*63; -#ifdef HWPERFMON_DEBUG - kprintf("perfmon_set_threshold - WARNING: supplied threshold (%d) exceeds max threshold value - clamping to %d\n", threshold, newThreshold); -#endif - threshold = newThreshold; - mmcr2_reg.field.threshold_multiplier = 1; - } - mmcr0_reg.field.threshold_value = threshold; - - sv->save_mmcr0 = mmcr0_reg.value; - sv->save_mmcr2 = mmcr2_reg.value; - - } - break; - case CPU_SUBTYPE_POWERPC_970: - { - ppc64_mmcr0_reg_t mmcr0_reg; - - mmcr0_reg.value = sv->save_mmcr0; - - if(threshold>63) { /* multiplier is in HID1 on 970 - not context switching HID1 so always 1x */ - int newThreshold = 63; -#ifdef HWPERFMON_DEBUG - 
kprintf("perfmon_set_threshold - WARNING: supplied threshold (%d) exceeds max threshold value - clamping to %d\n", threshold, newThreshold); -#endif - threshold = newThreshold; - } - mmcr0_reg.field.threshold_value = threshold; - - sv->save_mmcr0 = mmcr0_reg.value; - } - break; - default: - retval = KERN_FAILURE; - break; - } - -#ifdef HWPERFMON_DEBUG - kprintf("perfmon_set_threshold - threshold=%d - mmcr0=0x%llx mmcr1=0x%llx mmcr2=0x%llx\n", threshold, sv->save_mmcr0, sv->save_mmcr1, sv->save_mmcr2); -#endif - - return retval; -} - -static int -perfmon_set_tbsel(thread_t thread, int tbsel) -{ - struct savearea *sv = thread->machine.pcb; - kern_return_t retval = KERN_SUCCESS; - - switch(PerProcTable[0].ppe_vaddr->cpu_subtype) { - case CPU_SUBTYPE_POWERPC_750: - case CPU_SUBTYPE_POWERPC_7400: - case CPU_SUBTYPE_POWERPC_7450: - { - ppc32_mmcr0_reg_t mmcr0_reg; - - mmcr0_reg.value = sv->save_mmcr0; - switch(tbsel) { - case 0x0: - case 0x1: - case 0x2: - case 0x3: - mmcr0_reg.field.timebase_bit_selector = tbsel; - break; - default: - retval = KERN_FAILURE; - } - sv->save_mmcr0 = mmcr0_reg.value; - } - break; - case CPU_SUBTYPE_POWERPC_970: - { - ppc64_mmcr0_reg_t mmcr0_reg; - - mmcr0_reg.value = sv->save_mmcr0; - switch(tbsel) { - case 0x0: - case 0x1: - case 0x2: - case 0x3: - mmcr0_reg.field.timebase_bit_selector = tbsel; - break; - default: - retval = KERN_FAILURE; - } - sv->save_mmcr0 = mmcr0_reg.value; - } - break; - default: - retval = KERN_FAILURE; - break; - } - -#ifdef HWPERFMON_DEBUG - kprintf("perfmon_set_tbsel - tbsel=%d - mmcr0=0x%llx mmcr1=0x%llx mmcr2=0x%llx\n", tbsel, sv->save_mmcr0, sv->save_mmcr1, sv->save_mmcr2); -#endif - - return retval; -} - -int perfmon_control(struct savearea *ssp) -{ - mach_port_name_t thr_port = CAST_DOWN(mach_port_name_t, ssp->save_r3); - int action = (int)ssp->save_r4; - int pmc = (int)ssp->save_r5; - int val = (int)ssp->save_r6; - uint64_t *usr_pmcs_p = CAST_DOWN(uint64_t *, ssp->save_r7); - thread_t thread = THREAD_NULL; - 
uint64_t kern_pmcs[MAX_CPUPMC_COUNT]; - kern_return_t retval = KERN_SUCCESS; - int error; - boolean_t oldlevel; - - thread = port_name_to_thread(thr_port); // convert user space thread port name to a thread_t - if(!thread) { - ssp->save_r3 = KERN_INVALID_ARGUMENT; - return 1; /* Return and check for ASTs... */ - } - - if(thread!=current_thread()) { - thread_suspend(thread); - } - -#ifdef HWPERFMON_DEBUG - // kprintf("perfmon_control: action=0x%x pmc=%d val=%d pmcs=0x%x\n", action, pmc, val, usr_pmcs_p); -#endif - - oldlevel = ml_set_interrupts_enabled(FALSE); - - /* individual actions which do not require perfmon facility to be enabled */ - if(action==PPC_PERFMON_DISABLE) { - retval = perfmon_disable(thread); - } - else if(action==PPC_PERFMON_ENABLE) { - retval = perfmon_enable(thread); - } - - else { /* individual actions which do require perfmon facility to be enabled */ - if(!(thread->machine.specFlags & perfMonitor)) { /* perfmon not enabled */ -#ifdef HWPERFMON_DEBUG - kprintf("perfmon_control: ERROR - perfmon not enabled for this thread\n"); -#endif - retval = KERN_NO_ACCESS; - goto perfmon_return; - } - - if(action==PPC_PERFMON_SET_EVENT) { - retval = perfmon_set_event(thread, pmc, val); - } - else if(action==PPC_PERFMON_SET_THRESHOLD) { - retval = perfmon_set_threshold(thread, val); - } - else if(action==PPC_PERFMON_SET_TBSEL) { - retval = perfmon_set_tbsel(thread, val); - } - else if(action==PPC_PERFMON_SET_EVENT_FUNC) { - retval = perfmon_set_event_func(thread, val); - } - else if(action==PPC_PERFMON_ENABLE_PMI_BRKPT) { - if(val) { - thread->machine.perfmonFlags |= PERFMONFLAG_BREAKPOINT_FOR_PMI; - } else { - thread->machine.perfmonFlags &= ~PERFMONFLAG_BREAKPOINT_FOR_PMI; - } - retval = KERN_SUCCESS; - } - - /* combinable actions */ - else { - if(action & PPC_PERFMON_STOP_COUNTERS) { - error = perfmon_stop_counters(thread); - if(error!=KERN_SUCCESS) { - retval = error; - goto perfmon_return; - } - } - if(action & PPC_PERFMON_CLEAR_COUNTERS) { - error = 
perfmon_clear_counters(thread); - if(error!=KERN_SUCCESS) { - retval = error; - goto perfmon_return; - } - } - if(action & PPC_PERFMON_WRITE_COUNTERS) { - if((error = copyin(CAST_USER_ADDR_T(usr_pmcs_p), (void *)kern_pmcs, MAX_CPUPMC_COUNT*sizeof(uint64_t)))) { - retval = error; - goto perfmon_return; - } - error = perfmon_write_counters(thread, kern_pmcs); - if(error!=KERN_SUCCESS) { - retval = error; - goto perfmon_return; - } - } - if(action & PPC_PERFMON_READ_COUNTERS) { - error = perfmon_read_counters(thread, kern_pmcs); - if(error!=KERN_SUCCESS) { - retval = error; - goto perfmon_return; - } - if((error = copyout((void *)kern_pmcs, CAST_USER_ADDR_T(usr_pmcs_p), MAX_CPUPMC_COUNT*sizeof(uint64_t)))) { - retval = error; - goto perfmon_return; - } - } - if(action & PPC_PERFMON_START_COUNTERS) { - error = perfmon_start_counters(thread); - if(error!=KERN_SUCCESS) { - retval = error; - goto perfmon_return; - } - } - } - } - - perfmon_return: - ml_set_interrupts_enabled(oldlevel); - -#ifdef HWPERFMON_DEBUG - kprintf("perfmon_control (CPU%d): mmcr0 = %016llX, pmc1=%X pmc2=%X pmc3=%X pmc4=%X pmc5=%X pmc6=%X pmc7=%X pmc8=%X\n", cpu_number(), ssp->save_mmcr0, ssp->save_pmc[PMC_1], ssp->save_pmc[PMC_2], ssp->save_pmc[PMC_3], ssp->save_pmc[PMC_4], ssp->save_pmc[PMC_5], ssp->save_pmc[PMC_6], ssp->save_pmc[PMC_7], ssp->save_pmc[PMC_8]); -#endif - - if(thread!=current_thread()) { - thread_resume(thread); - } - -#ifdef HWPERFMON_DEBUG - if(retval!=KERN_SUCCESS) { - kprintf("perfmon_control - ERROR: retval=%d\n", retval); - } -#endif /* HWPERFMON_DEBUG */ - - ssp->save_r3 = retval; - return 1; /* Return and check for ASTs... 
*/ -} - -int perfmon_handle_pmi(struct savearea *ssp) -{ - int curPMC; - kern_return_t retval = KERN_SUCCESS; - thread_t thread = current_thread(); - -#ifdef HWPERFMON_DEBUG - kprintf("perfmon_handle_pmi: got rupt\n"); -#endif - - if(!(thread->machine.specFlags & perfMonitor)) { /* perfmon not enabled */ -#ifdef HWPERFMON_DEBUG - kprintf("perfmon_handle_pmi: ERROR - perfmon not enabled for this thread\n"); -#endif - return KERN_FAILURE; - } - - for(curPMC=0; curPMCmachine.pcb->save_pmc[curPMC] & 0x80000000) { - if(thread->machine.pmcovfl[curPMC]==0xFFFFFFFF && (thread->machine.perfmonFlags & PERFMONFLAG_BREAKPOINT_FOR_PMI)) { - doexception(EXC_BREAKPOINT, EXC_PPC_PERFMON, (unsigned int)ssp->save_srr0); // pass up a breakpoint exception - return KERN_SUCCESS; - } else { - thread->machine.pmcovfl[curPMC]++; - thread->machine.pcb->save_pmc[curPMC] = 0; - } - } - } - - if(retval==KERN_SUCCESS) { - switch(PerProcTable[0].ppe_vaddr->cpu_subtype) { - case CPU_SUBTYPE_POWERPC_7450: - { - ppc32_mmcr0_reg_t mmcr0_reg; - - mmcr0_reg.value = thread->machine.pcb->save_mmcr0; - mmcr0_reg.field.disable_counters_always = FALSE; - mmcr0_reg.field.enable_pmi = TRUE; - thread->machine.pcb->save_mmcr0 = mmcr0_reg.value; - } - retval = KERN_SUCCESS; - break; - case CPU_SUBTYPE_POWERPC_970: - { - ppc64_mmcr0_reg_t mmcr0_reg; - - mmcr0_reg.value = thread->machine.pcb->save_mmcr0; - mmcr0_reg.field.disable_counters_always = FALSE; - mmcr0_reg.field.enable_pmi = TRUE; - thread->machine.pcb->save_mmcr0 = mmcr0_reg.value; - } - retval = KERN_SUCCESS; - break; - default: - retval = KERN_FAILURE; - break; - } - } - - return retval; -} diff --git a/osfmk/ppc/hw_perfmon.h b/osfmk/ppc/hw_perfmon.h deleted file mode 100644 index c091d0b7b..000000000 --- a/osfmk/ppc/hw_perfmon.h +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#ifndef _HW_PERFMON_H_ -#define _HW_PERFMON_H_ - -#ifndef __ppc__ -#error This file is only useful on PowerPC. 
-#endif - -#define MAX_CPUPMC_COUNT 8 - -#define PMC_1 0 -#define PMC_2 1 -#define PMC_3 2 -#define PMC_4 3 -#define PMC_5 4 -#define PMC_6 5 -#define PMC_7 6 -#define PMC_8 7 - -/* these actions can be combined and simultaneously performed with a single call to perfmon_control() */ -typedef enum { - PPC_PERFMON_CLEAR_COUNTERS = 0x0002, - PPC_PERFMON_START_COUNTERS = 0x0004, - PPC_PERFMON_STOP_COUNTERS = 0x0008, - PPC_PERFMON_READ_COUNTERS = 0x0010, - PPC_PERFMON_WRITE_COUNTERS = 0x0020 -} perfmon_multi_action_t; - -/* these actions can not be combined and each requires a separate call to perfmon_control() */ -typedef enum { - PPC_PERFMON_ENABLE = 0x00010000, - PPC_PERFMON_DISABLE = 0x00020000, - PPC_PERFMON_SET_EVENT = 0x00030000, - PPC_PERFMON_SET_THRESHOLD = 0x00040000, - PPC_PERFMON_SET_TBSEL = 0x00050000, - PPC_PERFMON_SET_EVENT_FUNC = 0x00060000, - PPC_PERFMON_ENABLE_PMI_BRKPT = 0x00070000 -} perfmon_single_action_t; - -/* used to select byte lane and speculative events (currently 970 only) */ -typedef enum { /* SPECSEL[0:1] TD_CP_DBGxSEL[0:1] TTM3SEL[0:1] TTM1SEL[0:1] TTM0SEL[0:1] */ - PPC_PERFMON_FUNC_FPU = 0, /* 00 00 00 00 00 */ - PPC_PERFMON_FUNC_ISU = 1, /* 00 00 00 00 01 */ - PPC_PERFMON_FUNC_IFU = 2, /* 00 00 00 00 10 */ - PPC_PERFMON_FUNC_VMX = 3, /* 00 00 00 00 11 */ - PPC_PERFMON_FUNC_IDU = 64, /* 00 01 00 00 00 */ - PPC_PERFMON_FUNC_GPS = 76, /* 00 01 00 11 00 */ - PPC_PERFMON_FUNC_LSU0 = 128, /* 00 10 00 00 00 */ - PPC_PERFMON_FUNC_LSU1A = 192, /* 00 11 00 00 00 */ - PPC_PERFMON_FUNC_LSU1B = 240, /* 00 11 11 00 00 */ - PPC_PERFMON_FUNC_SPECA = 256, /* 01 00 00 00 00 */ - PPC_PERFMON_FUNC_SPECB = 512, /* 10 00 00 00 00 */ - PPC_PERFMON_FUNC_SPECC = 768, /* 11 00 00 00 00 */ -} perfmon_functional_unit_t; - -#ifdef MACH_KERNEL_PRIVATE -int perfmon_acquire_facility(task_t task); -int perfmon_release_facility(task_t task); - -extern int perfmon_disable(thread_t thr_act); -extern int perfmon_init(void); -extern int perfmon_control(struct savearea 
*save); -extern int perfmon_handle_pmi(struct savearea *ssp); - -/* perfmonFlags */ -#define PERFMONFLAG_BREAKPOINT_FOR_PMI 0x1 - -#endif /* MACH_KERNEL_PRIVATE */ - -/* - * From user space: - * - * int perfmon_control(thread_t thread, perfmon_action_t action, int pmc, u_int32_t val, u_int64_t *pmcs); - * - * r3: thread - * r4: action - * r5: pmc - * r6: event/threshold/tbsel/count - * r7: pointer to space for PMC counts: uint64_t[MAX_CPUPMC_COUNT] - * - * perfmon_control(thread, PPC_PERFMON_CLEAR_COUNTERS, 0, 0, NULL); - * perfmon_control(thread, PPC_PERFMON_START_COUNTERS, 0, 0, NULL); - * perfmon_control(thread, PPC_PERFMON_STOP_COUNTERS, 0, 0, NULL); - * perfmon_control(thread, PPC_PERFMON_READ_COUNTERS, 0, 0, uint64_t *pmcs); - * perfmon_control(thread, PPC_PERFMON_WRITE_COUNTERS, 0, 0, uint64_t *pmcs); - * perfmon_control(thread, PPC_PERFMON_ENABLE, 0, 0, NULL); - * perfmon_control(thread, PPC_PERFMON_DISABLE, 0, 0, NULL); - * perfmon_control(thread, PPC_PERFMON_SET_EVENT, int pmc, int event, NULL); - * perfmon_control(thread, PPC_PERFMON_SET_THRESHOLD, 0, int threshold, NULL); - * perfmon_control(thread, PPC_PERFMON_SET_TBSEL, 0, int tbsel, NULL); - * perfmon_control(thread, PPC_PERFMON_SET_EVENT_FUNC, 0, perfmon_functional_unit_t func, NULL); - * perfmon_control(thread, PPC_PERFMON_ENABLE_PMI_BRKPT, 0, boolean_t enable, NULL); - * - */ - -#endif /* _HW_PERFMON_H_ */ diff --git a/osfmk/ppc/hw_perfmon_mmcr.h b/osfmk/ppc/hw_perfmon_mmcr.h deleted file mode 100644 index 6dd894d94..000000000 --- a/osfmk/ppc/hw_perfmon_mmcr.h +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Copyright (c) 2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef _HW_PERFMON_MMCR_H_ -#define _HW_PERFMON_MMCR_H_ - -#ifndef __ppc__ -#error This file is only useful on PowerPC. 
-#endif - -typedef struct { - uint32_t disable_counters_always : 1; /* 0: disable counters */ - uint32_t disable_counters_supervisor : 1; /* 1: disable counters (supervisor) */ - uint32_t disable_counters_user : 1; /* 2: disable counters (user) */ - uint32_t disable_counters_marked : 1; /* 3: disable counters (marked bit == 1) */ - uint32_t disable_counters_unmarked : 1; /* 4: disable counters (marked bit == 0) */ - uint32_t enable_pmi : 1; /* 5: performance monitor interrupt enable */ - uint32_t on_pmi_stop_counting : 1; /* 6: disable counters (pmi) */ - uint32_t timebase_bit_selector : 2; /* 7-8: TBL bit for TB events */ - uint32_t enable_timebase_pmi : 1; /* 9: enable pmi on TBL bit transition */ - uint32_t threshold_value : 6; /* 10-15: threshold value */ - uint32_t enable_pmi_on_pmc1 : 1; /* 16: enable pmi on pmc1 overflow */ - uint32_t enable_pmi_on_pmcn : 1; /* 17: enable pmi on any pmc except pmc1 overflow */ - uint32_t enable_pmi_trigger : 1; /* 18: enable triggering of pmcn by pmc1 overflow */ - uint32_t pmc1_event : 7; /* 19-25: pmc1 event select */ - uint32_t pmc2_event : 6; /* 26-31: pmc2 event select */ -} ppc32_mmcr0_bits_t; - -typedef union { - uint32_t value; - ppc32_mmcr0_bits_t field; -} ppc32_mmcr0_reg_t; - -typedef struct { - uint32_t pmc3_event : 5; - uint32_t pmc4_event : 5; - uint32_t pmc5_event : 5; - uint32_t pmc6_event : 6; - uint32_t /*reserved*/ : 11; -} ppc32_mmcr1_bits_t; - -typedef union { - uint32_t value; - ppc32_mmcr1_bits_t field; -} ppc32_mmcr1_reg_t; - -typedef struct { - uint32_t threshold_multiplier : 1; - uint32_t /*reserved*/ : 31; -} ppc32_mmcr2_bits_t; - -typedef union { - uint32_t value; - ppc32_mmcr2_bits_t field; -} ppc32_mmcr2_reg_t; - -typedef struct { - uint32_t /* reserved */ : 32; /* 0-31: reserved */ - uint32_t disable_counters_always : 1; /* 32: disable counters */ - uint32_t disable_counters_supervisor : 1; /* 33: disable counters (supervisor) */ - uint32_t disable_counters_user : 1; /* 34: disable counters 
(user) */ - uint32_t disable_counters_marked : 1; /* 35: disable counters (marked bit == 1) */ - uint32_t disable_counters_unmarked : 1; /* 36: disable counters (marked bit == 0) */ - uint32_t enable_pmi : 1; /* 37: performance monitor interrupt enable */ - uint32_t on_pmi_stop_counting : 1; /* 38: disable counters (pmi) */ - uint32_t timebase_bit_selector : 2; /* 39-40: TBL bit for timebase events */ - uint32_t enable_timebase_pmi : 1; /* 41: enable pmi on TBL bit transition */ - uint32_t threshold_value : 6; /* 42-47: threshold value */ - uint32_t enable_pmi_on_pmc1 : 1; /* 48: enable pmi on pmc1 overflow */ - uint32_t enable_pmi_on_pmcn : 1; /* 49: enable pmi on any pmc except pmc1 overflow */ - uint32_t enable_pmi_trigger : 1; /* 50: enable triggering of pmcn by pmc1 overflow */ - uint32_t pmc1_event : 5; /* 51-55: pmc1 event select */ - uint32_t perfmon_event_occurred : 1; /* 56: performance monitor event has occurred */ - uint32_t /* reserved */ : 1; /* 57: reserved */ - uint32_t pmc2_event : 5; /* 58-62: pmc2 event select */ - uint32_t disable_counters_hypervisor : 1; /* 63: disable counters (hypervisor) */ -} ppc64_mmcr0_bits_t; - -typedef union { - uint64_t value; - ppc64_mmcr0_bits_t field; -} ppc64_mmcr0_reg_t; - -typedef struct { - uint32_t ttm0_select : 2; /* 0-1: FPU/ISU/IFU/VMX unit select */ - uint32_t /* reserved */ : 1; /* 2: reserved */ - uint32_t ttm1_select : 2; /* 3-4: IDU/ISU/ISU unit select */ - uint32_t /* reserved */ : 1; /* 5: reserved */ - uint32_t ttm2_select : 2; /* 6-7: IFU/LSU0 unit select */ - uint32_t /* reserved */ : 1; /* 8: reserved */ - uint32_t ttm3_select : 2; /* 9-10: LSU1 select */ - uint32_t /* reserved */ : 1; /* 11: reserved */ - uint32_t lane0_select : 2; /* 12-13: Byte lane 0 unit select (TD_CP_DBG0SEL) */ - uint32_t lane1_select : 2; /* 14-15: Byte lane 1 unit select (TD_CP_DBG1SEL) */ - uint32_t lane2_select : 2; /* 16-17: Byte lane 2 unit select (TD_CP_DBG2SEL) */ - uint32_t lane3_select : 2; /* 18-19: Byte lane 3 
unit select (TD_CP_DBG3SEL) */ - uint32_t /* reserved */ : 4; /* 20-23: reserved */ - uint32_t pmc1_adder_lane_select : 1; /* 24: PMC1 Event Adder Lane Select (PMC1_ADDER_SELECT) */ - uint32_t pmc2_adder_lane_select : 1; /* 25: PMC2 Event Adder Lane Select (PMC2_ADDER_SELECT) */ - uint32_t pmc6_adder_lane_select : 1; /* 26: PMC6 Event Adder Lane Select (PMC6_ADDER_SELECT) */ - uint32_t pmc5_adder_lane_select : 1; /* 27: PMC5 Event Adder Lane Select (PMC5_ADDER_SELECT) */ - uint32_t pmc8_adder_lane_select : 1; /* 28: PMC8 Event Adder Lane Select (PMC8_ADDER_SELECT) */ - uint32_t pmc7_adder_lane_select : 1; /* 29: PMC7 Event Adder Lane Select (PMC7_ADDER_SELECT) */ - uint32_t pmc3_adder_lane_select : 1; /* 30: PMC3 Event Adder Lane Select (PMC3_ADDER_SELECT) */ - uint32_t pmc4_adder_lane_select : 1; /* 31: PMC4 Event Adder Lane Select (PMC4_ADDER_SELECT) */ - uint32_t pmc3_event : 5; /* 32-36: pmc3 event select */ - uint32_t pmc4_event : 5; /* 37-41: pmc4 event select */ - uint32_t pmc5_event : 5; /* 42-46: pmc5 event select */ - uint32_t pmc6_event : 5; /* 47-51: pmc6 event select */ - uint32_t pmc7_event : 5; /* 52-56: pmc7 event select */ - uint32_t pmc8_event : 5; /* 57-61: pmc8 event select */ - uint32_t speculative_event : 2; /* 62-63: SPeCulative count event SELector */ -} ppc64_mmcr1_bits_t; - -typedef union { - uint64_t value; - ppc64_mmcr1_bits_t field; -} ppc64_mmcr1_reg_t; - -typedef struct { - uint32_t /* reserved */ : 32; /* 0-31: reserved */ - uint32_t siar_sdar_same_instruction : 1; /* 32: SIAR and SDAR are from same instruction */ - uint32_t disable_counters_pmc1_pmc4 : 1; /* 33: disable counters PMC1-PMC4 */ - uint32_t disable_counters_pmc5_pmc8 : 1; /* 34: disable counters PMC5-PMC8 */ - uint32_t problem_state_siar : 1; /* 35: MSR[PR] bit when SIAR set */ - uint32_t hypervisor_state_siar : 1; /* 36: MSR[HV] bit when SIAR set */ - uint32_t /* reserved */ : 3; /* 37-39: reserved */ - uint32_t threshold_start_event : 3; /* 40-42: threshold start event 
*/ - uint32_t threshold_end_event : 3; /* 43-45: threshold end event */ - uint32_t /* reserved */ : 3; /* 46-48: reserved */ - uint32_t imr_select : 1; /* 49: imr select */ - uint32_t imr_mark : 2; /* 50-51: imr mark */ - uint32_t imr_mask : 4; /* 52-55: imr mask */ - uint32_t imr_match : 4; /* 56-59: imr match */ - uint32_t disable_counters_tags_inactive : 1; /* 60: disable counters in tags inactive mode */ - uint32_t disable_counters_tags_active : 1; /* 61: disable counters in tags active mode */ - uint32_t disable_counters_wait_state : 1; /* 62: freeze counters in wait state (CNTL[31]=0) */ - uint32_t sample_enable : 1; /* 63: sampling enabled */ -} ppc64_mmcra_bits_t; - -typedef union { - uint64_t value; - ppc64_mmcra_bits_t field; -} ppc64_mmcra_reg_t; - -/* PPC_PERFMON_FUNC_* values are taken apart to fill in the appropriate configuration bitfields: */ -typedef struct { - uint32_t /* reserved */ : 22; - uint32_t SPECSEL : 2; - uint32_t TD_CP_DBGxSEL : 2; - uint32_t TTM3SEL : 2; - uint32_t TTM1SEL : 2; - uint32_t TTM0SEL : 2; -} ppc_func_bits_t; - -typedef union { - uint32_t value; - ppc_func_bits_t field; -} ppc_func_unit_t; - -#endif /* _HW_PERFMON_MMCR_H_ */ diff --git a/osfmk/ppc/hw_vm.s b/osfmk/ppc/hw_vm.s deleted file mode 100644 index bcad7dad2..000000000 --- a/osfmk/ppc/hw_vm.s +++ /dev/null @@ -1,8794 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - - .text - -; -; 0 0 1 2 3 4 4 5 6 -; 0 8 6 4 2 0 8 6 3 -; +--------+--------+--------+--------+--------+--------+--------+--------+ -; |00000000|00000SSS|SSSSSSSS|SSSSSSSS|SSSSPPPP|PPPPPPPP|PPPPxxxx|xxxxxxxx| - EA -; +--------+--------+--------+--------+--------+--------+--------+--------+ -; -; 0 0 1 -; 0 8 6 -; +--------+--------+--------+ -; |//////BB|BBBBBBBB|BBBB////| - SID - base -; +--------+--------+--------+ -; -; 0 0 1 -; 0 8 6 -; +--------+--------+--------+ -; |////////|11111111|111111//| - SID - copy 1 -; +--------+--------+--------+ -; -; 0 0 1 -; 0 8 6 -; +--------+--------+--------+ -; |////////|//222222|22222222| - SID - copy 2 -; +--------+--------+--------+ -; -; 0 0 1 -; 0 8 6 -; +--------+--------+--------+ -; |//////33|33333333|33//////| - SID - copy 3 - not needed -; +--------+--------+--------+ for 65 bit VPN -; -; 0 0 1 2 3 4 4 5 5 -; 0 8 6 4 2 0 8 1 5 -; 
+--------+--------+--------+--------+--------+--------+--------+ -; |00000000|00000002|22222222|11111111|111111BB|BBBBBBBB|BBBB////| - SID Hash - this is all -; +--------+--------+--------+--------+--------+--------+--------+ SID copies ORed -; 0 0 1 2 3 4 4 5 5 -; 0 8 6 4 2 0 8 1 5 -; +--------+--------+--------+--------+--------+--------+--------+ -; |00000000|0000000S|SSSSSSSS|SSSSSSSS|SSSSSS00|00000000|0000////| - Shifted high order EA -; +--------+--------+--------+--------+--------+--------+--------+ left shifted "segment" -; part of EA to make -; room for SID base -; -; -; 0 0 1 2 3 4 4 5 5 -; 0 8 6 4 2 0 8 1 5 -; +--------+--------+--------+--------+--------+--------+--------+ -; |00000000|0000000V|VVVVVVVV|VVVVVVVV|VVVVVVVV|VVVVVVVV|VVVV////| - VSID - SID Hash XORed -; +--------+--------+--------+--------+--------+--------+--------+ with shifted EA -; -; 0 0 1 2 3 4 4 5 6 7 7 -; 0 8 6 4 2 0 8 6 4 2 9 -; +--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+ -; |00000000|0000000V|VVVVVVVV|VVVVVVVV|VVVVVVVV|VVVVVVVV|VVVVPPPP|PPPPPPPP|PPPPxxxx|xxxxxxxx| - VPN -; +--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+ -; - - -/* addr64_t hw_add_map(struct pmap *pmap, struct mapping *mp) - Adds a mapping - * - * Maps a page or block into a pmap - * - * Returns 0 if add worked or the vaddr of the first overlap if not - * - * Make mapping - not block or I/O - note: this is low-level, upper should remove duplicates - * - * 1) bump mapping busy count - * 2) lock pmap share - * 3) find mapping full path - finds all possible list previous elements - * 4) upgrade pmap to exclusive - * 5) add mapping to search list - * 6) find physent - * 7) lock physent - * 8) add to physent - * 9) unlock physent - * 10) unlock pmap - * 11) drop mapping busy count - * - * - * Make mapping - block or I/O - note: this is low-level, upper should remove duplicates - * - * 1) bump mapping busy count - * 2) lock pmap 
share - * 3) find mapping full path - finds all possible list previous elements - * 4) upgrade pmap to exclusive - * 5) add mapping to search list - * 6) unlock pmap - * 7) drop mapping busy count - * - */ - - .align 5 - .globl EXT(hw_add_map) - -LEXT(hw_add_map) - - stwu r1,-(FM_ALIGN((31-17+1)*4)+FM_SIZE)(r1) ; Make some space on the stack - mflr r0 ; Save the link register - stw r17,FM_ARG0+0x00(r1) ; Save a register - stw r18,FM_ARG0+0x04(r1) ; Save a register - stw r19,FM_ARG0+0x08(r1) ; Save a register - mfsprg r19,2 ; Get feature flags - stw r20,FM_ARG0+0x0C(r1) ; Save a register - stw r21,FM_ARG0+0x10(r1) ; Save a register - mtcrf 0x02,r19 ; move pf64Bit cr6 - stw r22,FM_ARG0+0x14(r1) ; Save a register - stw r23,FM_ARG0+0x18(r1) ; Save a register - stw r24,FM_ARG0+0x1C(r1) ; Save a register - stw r25,FM_ARG0+0x20(r1) ; Save a register - stw r26,FM_ARG0+0x24(r1) ; Save a register - stw r27,FM_ARG0+0x28(r1) ; Save a register - stw r28,FM_ARG0+0x2C(r1) ; Save a register - stw r29,FM_ARG0+0x30(r1) ; Save a register - stw r30,FM_ARG0+0x34(r1) ; Save a register - stw r31,FM_ARG0+0x38(r1) ; Save a register - stw r0,(FM_ALIGN((31-17+1)*4)+FM_SIZE+FM_LR_SAVE)(r1) ; Save the return - -#if DEBUG - lwz r11,pmapFlags(r3) ; Get pmaps flags - rlwinm. r11,r11,0,pmapVMgsaa ; Is guest shadow assist active? - bne hamPanic ; Call not valid for guest shadow assist pmap -#endif - - rlwinm r11,r4,0,0,19 ; Round down to get mapping block address - mr r28,r3 ; Save the pmap - mr r31,r4 ; Save the mapping - bt++ pf64Bitb,hamSF1 ; skip if 64-bit (only they take the hint) - lwz r20,pmapvr+4(r3) ; Get conversion mask for pmap - lwz r21,mbvrswap+4(r11) ; Get conversion mask for mapping - - b hamSF1x ; Done... 
- -hamSF1: ld r20,pmapvr(r3) ; Get conversion mask for pmap - ld r21,mbvrswap(r11) ; Get conversion mask for mapping - -hamSF1x: bl EXT(mapSetUp) ; Turn off interrupts, translation, and possibly enter 64-bit - - mr r17,r11 ; Save the MSR - xor r28,r28,r20 ; Convert the pmap to physical addressing - xor r31,r31,r21 ; Convert the mapping to physical addressing - - la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkShared ; Go get a shared lock on the mapping lists - mr. r3,r3 ; Did we get the lock? - lwz r24,mpFlags(r31) ; Pick up the flags - bne-- hamBadLock ; Nope... - - li r21,0 ; Remember that we have the shared lock - -; -; Note that we do a full search (i.e., no shortcut level skips, etc.) -; here so that we will know the previous elements so we can dequeue them -; later. -; - -hamRescan: lwz r4,mpVAddr(r31) ; Get the new vaddr top half - lwz r5,mpVAddr+4(r31) ; Get the new vaddr bottom half - mr r3,r28 ; Pass in pmap to search - lhz r23,mpBSize(r31) ; Get the block size for later - mr r29,r4 ; Save top half of vaddr for later - mr r30,r5 ; Save bottom half of vaddr for later - - bl EXT(mapSearchFull) ; Go see if we can find it - - li r22,lo16(0x800C) ; Get 0xFFFF800C - rlwinm r0,r24,mpBSub+1,31,31 ; Rotate to get 0 if 4K bsu or 1 if 32MB bsu - addi r23,r23,1 ; Get actual length - rlwnm r22,r22,r0,27,31 ; Rotate to get 12 or 25 - lis r0,0x8000 ; Get 0xFFFFFFFF80000000 - slw r9,r23,r22 ; Isolate the low part - rlwnm r22,r23,r22,22,31 ; Extract the high order - addic r23,r9,-4096 ; Get the length to the last page - add r0,r0,r0 ; Get 0xFFFFFFFF00000000 for 64-bit or 0 for 32-bit - addme r22,r22 ; Do high order as well... - mr. r3,r3 ; Did we find a mapping here? - or r0,r30,r0 ; Fill high word of 64-bit with 1s so we will properly carry - bne-- hamOverlay ; We found a mapping, this is no good, can not double map... - - addc r9,r0,r23 ; Add size to get last page in new range - or. r0,r4,r5 ; Are we beyond the end? 
- adde r8,r29,r22 ; Add the rest of the length on - rlwinm r9,r9,0,0,31 ; Clean top half of sum - beq++ hamFits ; We are at the end... - - cmplw cr1,r9,r5 ; Is the bottom part of our end less? - cmplw r8,r4 ; Is our end before the next (top part) - crand cr0_eq,cr0_eq,cr1_lt ; Is the second half less and the first half equal? - cror cr0_eq,cr0_eq,cr0_lt ; Or is the top half less - - bf-- cr0_eq,hamOverlay ; No, we do fit, there is an overlay... - -; -; Here we try to convert to an exclusive lock. This will fail if someone else -; has it shared. -; -hamFits: mr. r21,r21 ; Do we already have the exclusive lock? - la r3,pmapSXlk(r28) ; Point to the pmap search lock - - bne-- hamGotX ; We already have the exclusive... - - bl sxlkPromote ; Try to promote shared to exclusive - mr. r3,r3 ; Could we? - beq++ hamGotX ; Yeah... - -; -; Since we could not promote our lock, we need to convert to it. -; That means that we drop the shared lock and wait to get it -; exclusive. Since we release the lock, we need to do the look up -; again. -; - - la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkConvert ; Convert shared to exclusive - mr. r3,r3 ; Could we? - bne-- hamBadLock ; Nope, we must have timed out... - - li r21,1 ; Remember that we have the exclusive lock - b hamRescan ; Go look again... 
- - .align 5 - -hamGotX: mr r3,r28 ; Get the pmap to insert into - mr r4,r31 ; Point to the mapping - bl EXT(mapInsert) ; Insert the mapping into the list - - rlwinm r11,r24,mpPcfgb+2,mpPcfg>>6 ; Get the index into the page config table - lhz r8,mpSpace(r31) ; Get the address space - lwz r11,lgpPcfg(r11) ; Get the page config - mfsdr1 r7 ; Get the hash table base/bounds - - lwz r4,pmapResidentCnt(r28) ; Get the mapped page count - lwz r12,pmapResidentMax(r28) ; r12 = pmap->stats.resident_max - addi r4,r4,1 ; Bump up the mapped page count - stw r4,pmapResidentCnt(r28) ; Set the mapped page count - cmplw r12,r4 ; if pmap->stats.resident_max >= pmap->stats.resident_count - bge+ hamSkipMax ; goto hamSkipResMax - stw r4,pmapResidentMax(r28) ; pmap->stats.resident_max = pmap->stats.resident_count - -hamSkipMax: andi. r0,r24,mpType ; Is this a normal mapping? - - rlwimi r8,r8,14,4,17 ; Double address space - rlwinm r9,r30,0,4,31 ; Clear segment - rlwinm r10,r30,18,14,17 ; Shift EA[32:35] down to correct spot in VSID (actually shift up 14) - rlwimi r8,r8,28,0,3 ; Get the last nybble of the hash - rlwimi r10,r29,18,0,13 ; Shift EA[18:31] down to VSID (31-bit math works because of max hash table size) - rlwinm r7,r7,0,16,31 ; Isolate length mask (or count) - srw r9,r9,r11 ; Isolate just the page index - xor r10,r10,r8 ; Calculate the low 32 bits of the VSID - - xor r9,r9,r10 ; Get the hash to the PTEG - - bne-- hamDoneNP ; Not a normal mapping, therefore, no physent... - - bl mapPhysFindLock ; Go find and lock the physent - - bt++ pf64Bitb,ham64 ; This is 64-bit... 
- - lwz r11,ppLink+4(r3) ; Get the alias chain pointer - rlwinm r7,r7,16,0,15 ; Get the PTEG wrap size - slwi r9,r9,6 ; Make PTEG offset - ori r7,r7,0xFFC0 ; Stick in the bottom part - rlwinm r12,r11,0,~ppFlags ; Clean it up - and r9,r9,r7 ; Wrap offset into table - mr r4,r31 ; Set the link to install - stw r9,mpPte(r31) ; Point the mapping at the PTEG (exact offset is invalid) - stw r12,mpAlias+4(r31) ; Move to the mapping - bl mapPhyCSet32 ; Install the link - b hamDone ; Go finish up... - - .align 5 - -ham64: li r0,ppLFAmask ; Get mask to clean up alias pointer - subfic r7,r7,46 ; Get number of leading zeros - eqv r4,r4,r4 ; Get all ones - ld r11,ppLink(r3) ; Get the alias chain pointer - rotrdi r0,r0,ppLFArrot ; Rotate clean up mask to get 0xF0000000000000000F - srd r4,r4,r7 ; Get the wrap mask - sldi r9,r9,7 ; Change hash to PTEG offset - andc r11,r11,r0 ; Clean out the lock and flags - and r9,r9,r4 ; Wrap to PTEG - mr r4,r31 - stw r9,mpPte(r31) ; Point the mapping at the PTEG (exact offset is invalid) - std r11,mpAlias(r31) ; Set the alias pointer in the mapping - - bl mapPhyCSet64 ; Install the link - -hamDone: bl mapPhysUnlock ; Unlock the physent chain - -hamDoneNP: la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkUnlock ; Unlock the search list - - mr r3,r31 ; Get the mapping pointer - bl mapDropBusy ; Drop the busy count - - li r3,0 ; Set successful return - li r4,0 ; Set successful return - -hamReturn: bt++ pf64Bitb,hamR64 ; Yes... - - mtmsr r17 ; Restore enables/translation/etc. - isync - b hamReturnC ; Join common... - -hamR64: mtmsrd r17 ; Restore enables/translation/etc. 
- isync - -hamReturnC: lwz r0,(FM_ALIGN((31-17+1)*4)+FM_SIZE+FM_LR_SAVE)(r1) ; Get the return - lwz r17,FM_ARG0+0x00(r1) ; Save a register - lwz r18,FM_ARG0+0x04(r1) ; Save a register - lwz r19,FM_ARG0+0x08(r1) ; Save a register - lwz r20,FM_ARG0+0x0C(r1) ; Save a register - mtlr r0 ; Restore the return - lwz r21,FM_ARG0+0x10(r1) ; Save a register - lwz r22,FM_ARG0+0x14(r1) ; Save a register - lwz r23,FM_ARG0+0x18(r1) ; Save a register - lwz r24,FM_ARG0+0x1C(r1) ; Save a register - lwz r25,FM_ARG0+0x20(r1) ; Save a register - lwz r26,FM_ARG0+0x24(r1) ; Save a register - lwz r27,FM_ARG0+0x28(r1) ; Save a register - lwz r28,FM_ARG0+0x2C(r1) ; Save a register - lwz r29,FM_ARG0+0x30(r1) ; Save a register - lwz r30,FM_ARG0+0x34(r1) ; Save a register - lwz r31,FM_ARG0+0x38(r1) ; Save a register - lwz r1,0(r1) ; Pop the stack - - blr ; Leave... - - - .align 5 - -hamOverlay: lwz r22,mpFlags(r3) ; Get the overlay flags - li r0,mpC|mpR ; Get a mask to turn off RC bits - lwz r23,mpFlags(r31) ; Get the requested flags - lwz r20,mpVAddr(r3) ; Get the overlay address - lwz r8,mpVAddr(r31) ; Get the requested address - lwz r21,mpVAddr+4(r3) ; Get the overlay address - lwz r9,mpVAddr+4(r31) ; Get the requested address - lhz r10,mpBSize(r3) ; Get the overlay length - lhz r11,mpBSize(r31) ; Get the requested length - lwz r24,mpPAddr(r3) ; Get the overlay physical address - lwz r25,mpPAddr(r31) ; Get the requested physical address - andc r21,r21,r0 ; Clear RC bits - andc r9,r9,r0 ; Clear RC bits - - la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkUnlock ; Unlock the search list - - rlwinm. r0,r22,0,mpRIPb,mpRIPb ; Are we in the process of removing this one? - mr r3,r20 ; Save the top of the colliding address - rlwinm r4,r21,0,0,19 ; Save the bottom of the colliding address - - bne++ hamRemv ; Removing, go say so so we help... - - cmplw r20,r8 ; High part of vaddr the same? - cmplw cr1,r21,r9 ; Low part? 
- crand cr5_eq,cr0_eq,cr1_eq ; Remember if same - - cmplw r10,r11 ; Size the same? - cmplw cr1,r24,r25 ; Physical address? - crand cr5_eq,cr5_eq,cr0_eq ; Remember - crand cr5_eq,cr5_eq,cr1_eq ; Remember if same - - xor r23,r23,r22 ; Compare mapping flag words - andi. r23,r23,mpType|mpPerm ; Are mapping types and attributes the same? - crand cr5_eq,cr5_eq,cr0_eq ; Merge in final check - bf-- cr5_eq,hamSmash ; This is not the same, so we return a smash... - - ori r4,r4,mapRtMapDup ; Set duplicate - b hamReturn ; And leave... - -hamRemv: ori r4,r4,mapRtRemove ; We are in the process of removing the collision - b hamReturn ; Come back yall... - -hamSmash: ori r4,r4,mapRtSmash ; Tell caller that it has some clean up to do - b hamReturn ; Join common epilog code - - .align 5 - -hamBadLock: li r3,0 ; Set lock time out error code - li r4,mapRtBadLk ; Set lock time out error code - b hamReturn ; Leave.... - -hamPanic: lis r0,hi16(Choke) ; System abend - ori r0,r0,lo16(Choke) ; System abend - li r3,failMapping ; Show that we failed some kind of mapping thing - sc - - - - -/* - * mapping *hw_rem_map(pmap, vaddr, addr64_t *next) - remove a mapping from the system. - * - * Upon entry, R3 contains a pointer to a pmap. Since vaddr is - * a 64-bit quantity, it is a long long so it is in R4 and R5. - * - * We return the virtual address of the removed mapping as a - * R3. - * - * Note that this is designed to be called from 32-bit mode with a stack. - * - * We disable translation and all interruptions here. This keeps is - * from having to worry about a deadlock due to having anything locked - * and needing it to process a fault. 
- * - * Note that this must be done with both interruptions off and VM off - * - * Remove mapping via pmap, regular page, no pte - * - * 1) lock pmap share - * 2) find mapping full path - finds all possible list previous elements - * 4) upgrade pmap to exclusive - * 3) bump mapping busy count - * 5) remove mapping from search list - * 6) unlock pmap - * 7) lock physent - * 8) remove from physent - * 9) unlock physent - * 10) drop mapping busy count - * 11) drain mapping busy count - * - * - * Remove mapping via pmap, regular page, with pte - * - * 1) lock pmap share - * 2) find mapping full path - finds all possible list previous elements - * 3) upgrade lock to exclusive - * 4) bump mapping busy count - * 5) lock PTEG - * 6) invalidate pte and tlbie - * 7) atomic merge rc into physent - * 8) unlock PTEG - * 9) remove mapping from search list - * 10) unlock pmap - * 11) lock physent - * 12) remove from physent - * 13) unlock physent - * 14) drop mapping busy count - * 15) drain mapping busy count - * - * - * Remove mapping via pmap, I/O or block - * - * 1) lock pmap share - * 2) find mapping full path - finds all possible list previous elements - * 3) upgrade lock to exclusive - * 4) bump mapping busy count - * 5) mark remove-in-progress - * 6) check and bump remove chunk cursor if needed - * 7) unlock pmap - * 8) if something to invalidate, go to step 11 - - * 9) drop busy - * 10) return with mapRtRemove to force higher level to call again - - * 11) Lock PTEG - * 12) invalidate ptes, no tlbie - * 13) unlock PTEG - * 14) repeat 11 - 13 for all pages in chunk - * 15) if not final chunk, go to step 9 - * 16) invalidate tlb entries for the whole block map but no more than the full tlb - * 17) lock pmap share - * 18) find mapping full path - finds all possible list previous elements - * 19) upgrade lock to exclusive - * 20) remove mapping from search list - * 21) drop mapping busy count - * 22) drain mapping busy count - * - */ - - .align 5 - .globl EXT(hw_rem_map) - 
-LEXT(hw_rem_map) - -; -; NOTE NOTE NOTE - IF WE CHANGE THIS STACK FRAME STUFF WE NEED TO CHANGE -; THE HW_PURGE_* ROUTINES ALSO -; - -#define hrmStackSize ((31-15+1)*4)+4 - stwu r1,-(FM_ALIGN(hrmStackSize)+FM_SIZE)(r1) ; Make some space on the stack - mflr r0 ; Save the link register - stw r15,FM_ARG0+0x00(r1) ; Save a register - stw r16,FM_ARG0+0x04(r1) ; Save a register - stw r17,FM_ARG0+0x08(r1) ; Save a register - stw r18,FM_ARG0+0x0C(r1) ; Save a register - stw r19,FM_ARG0+0x10(r1) ; Save a register - mfsprg r19,2 ; Get feature flags - stw r20,FM_ARG0+0x14(r1) ; Save a register - stw r21,FM_ARG0+0x18(r1) ; Save a register - mtcrf 0x02,r19 ; move pf64Bit cr6 - stw r22,FM_ARG0+0x1C(r1) ; Save a register - stw r23,FM_ARG0+0x20(r1) ; Save a register - stw r24,FM_ARG0+0x24(r1) ; Save a register - stw r25,FM_ARG0+0x28(r1) ; Save a register - stw r26,FM_ARG0+0x2C(r1) ; Save a register - stw r27,FM_ARG0+0x30(r1) ; Save a register - stw r28,FM_ARG0+0x34(r1) ; Save a register - stw r29,FM_ARG0+0x38(r1) ; Save a register - stw r30,FM_ARG0+0x3C(r1) ; Save a register - stw r31,FM_ARG0+0x40(r1) ; Save a register - stw r6,FM_ARG0+0x44(r1) ; Save address to save next mapped vaddr - stw r0,(FM_ALIGN(hrmStackSize)+FM_SIZE+FM_LR_SAVE)(r1) ; Save the return - -#if DEBUG - lwz r11,pmapFlags(r3) ; Get pmaps flags - rlwinm. r11,r11,0,pmapVMgsaa ; Is guest shadow assist active? - bne hrmPanic ; Call not valid for guest shadow assist pmap -#endif - - bt++ pf64Bitb,hrmSF1 ; skip if 64-bit (only they take the hint) - lwz r9,pmapvr+4(r3) ; Get conversion mask - b hrmSF1x ; Done... 
- -hrmSF1: ld r9,pmapvr(r3) ; Get conversion mask - -hrmSF1x: - bl EXT(mapSetUp) ; Turn off interrupts, translation, and possibly enter 64-bit - - xor r28,r3,r9 ; Convert the pmap to physical addressing - -; -; Here is where we join in from the hw_purge_* routines -; - -hrmJoin: lwz r3,pmapFlags(r28) ; Get pmap's flags - mfsprg r19,2 ; Get feature flags again (for alternate entries) - - mr r17,r11 ; Save the MSR - mr r29,r4 ; Top half of vaddr - mr r30,r5 ; Bottom half of vaddr - - rlwinm. r3,r3,0,pmapVMgsaa ; Is guest shadow assist active? - bne-- hrmGuest ; Yes, handle specially - - la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkShared ; Go get a shared lock on the mapping lists - mr. r3,r3 ; Did we get the lock? - bne-- hrmBadLock ; Nope... - -; -; Note that we do a full search (i.e., no shortcut level skips, etc.) -; here so that we will know the previous elements so we can dequeue them -; later. Note: we get back mpFlags in R7. -; - - mr r3,r28 ; Pass in pmap to search - mr r4,r29 ; High order of address - mr r5,r30 ; Low order of address - bl EXT(mapSearchFull) ; Go see if we can find it - - andi. r0,r7,mpPerm ; Mapping marked permanent? - crmove cr5_eq,cr0_eq ; Remember permanent marking - mr r20,r7 ; Remember mpFlags - mr. r31,r3 ; Did we? (And remember mapping address for later) - mr r15,r4 ; Save top of next vaddr - mr r16,r5 ; Save bottom of next vaddr - beq-- hrmNotFound ; Nope, not found... - - bf-- cr5_eq,hrmPerm ; This one can't be removed... -; -; Here we try to promote to an exclusive lock. This will fail if someone else -; has it shared. -; - - la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkPromote ; Try to promote shared to exclusive - mr. r3,r3 ; Could we? - beq++ hrmGotX ; Yeah... - -; -; Since we could not promote our lock, we need to convert to it. -; That means that we drop the shared lock and wait to get it -; exclusive. Since we release the lock, we need to do the look up -; again. 
-; - - la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkConvert ; Convert shared to exclusive - mr. r3,r3 ; Could we? - bne-- hrmBadLock ; Nope, we must have timed out... - - mr r3,r28 ; Pass in pmap to search - mr r4,r29 ; High order of address - mr r5,r30 ; Low order of address - bl EXT(mapSearchFull) ; Rescan the list - - andi. r0,r7,mpPerm ; Mapping marked permanent? - crmove cr5_eq,cr0_eq ; Remember permanent marking - mr. r31,r3 ; Did we lose it when we converted? - mr r20,r7 ; Remember mpFlags - mr r15,r4 ; Save top of next vaddr - mr r16,r5 ; Save bottom of next vaddr - beq-- hrmNotFound ; Yeah, we did, someone tossed it for us... - - bf-- cr5_eq,hrmPerm ; This one can't be removed... - -; -; We have an exclusive lock on the mapping chain. And we -; also have the busy count bumped in the mapping so it can -; not vanish on us. -; - -hrmGotX: mr r3,r31 ; Get the mapping - bl mapBumpBusy ; Bump up the busy count - -; -; Invalidate any PTEs associated with this -; mapping (more than one if a block) and accumulate the reference -; and change bits. -; -; Here is also where we need to split 32- and 64-bit processing -; - - lwz r21,mpPte(r31) ; Grab the offset to the PTE - rlwinm r23,r29,0,1,0 ; Copy high order vaddr to high if 64-bit machine - mfsdr1 r29 ; Get the hash table base and size - - rlwinm r0,r20,0,mpType ; Isolate mapping type - cmplwi cr5,r0,mpBlock ; Remember whether this is a block mapping - cmplwi r0,mpMinSpecial ; cr0_lt <- not a special mapping type - - rlwinm r0,r21,0,mpHValidb,mpHValidb ; See if we actually have a PTE - ori r2,r2,0xFFFF ; Get mask to clean out hash table base (works for both 32- and 64-bit) - cmpwi cr1,r0,0 ; Have we made a PTE for this yet? 
- rlwinm r21,r21,0,~mpHValid ; Clear out valid bit - crorc cr0_eq,cr1_eq,cr0_lt ; No need to look at PTE if none or a special mapping - rlwimi r23,r30,0,0,31 ; Insert low under high part of address - andc r29,r29,r2 ; Clean up hash table base - li r22,0 ; Clear this on out (also sets RC to 0 if we bail) - mr r30,r23 ; Move the now merged vaddr to the correct register - add r26,r29,r21 ; Point to the PTEG slot - - bt++ pf64Bitb,hrmSplit64 ; Go do 64-bit version... - - rlwinm r9,r21,28,4,29 ; Convert PTEG to PCA entry - beq- cr5,hrmBlock32 ; Go treat block specially... - subfic r9,r9,-4 ; Get the PCA entry offset - bt- cr0_eq,hrmPysDQ32 ; Skip next if no possible PTE... - add r7,r9,r29 ; Point to the PCA slot - - bl mapLockPteg ; Go lock up the PTEG (Note: we need to save R6 to set PCA) - - lwz r21,mpPte(r31) ; Get the quick pointer again - lwz r5,0(r26) ; Get the top of PTE - - rlwinm. r0,r21,0,mpHValidb,mpHValidb ; See if we actually have a PTE - rlwinm r21,r21,0,~mpHValid ; Clear out valid bit - rlwinm r5,r5,0,1,31 ; Turn off valid bit in PTE - stw r21,mpPte(r31) ; Make sure we invalidate mpPte, still pointing to PTEG (keep walk_page from making a mistake) - beq- hrmUlckPCA32 ; Pte is gone, no need to invalidate... - - stw r5,0(r26) ; Invalidate the PTE - - li r9,tlbieLock ; Get the TLBIE lock - - sync ; Make sure the invalid PTE is actually in memory - -hrmPtlb32: lwarx r5,0,r9 ; Get the TLBIE lock - mr. r5,r5 ; Is it locked? - li r5,1 ; Get locked indicator - bne- hrmPtlb32 ; It is locked, go spin... - stwcx. r5,0,r9 ; Try to get it - bne- hrmPtlb32 ; We was beat... - - rlwinm. r0,r19,0,pfSMPcapb,pfSMPcapb ; Can this processor do SMP? - - tlbie r30 ; Invalidate it all corresponding TLB entries - - beq- hrmNTlbs ; Jump if we can not do a TLBSYNC.... 
- - eieio ; Make sure that the tlbie happens first - tlbsync ; Wait for everyone to catch up - sync ; Make sure of it all - -hrmNTlbs: li r0,0 ; Clear this - rlwinm r2,r21,29,29,31 ; Get slot number (8 byte entries) - stw r0,tlbieLock(0) ; Clear the tlbie lock - lis r0,0x8000 ; Get bit for slot 0 - eieio ; Make sure those RC bit have been stashed in PTE - - srw r0,r0,r2 ; Get the allocation hash mask - lwz r22,4(r26) ; Get the latest reference and change bits - or r6,r6,r0 ; Show that this slot is free - -hrmUlckPCA32: - eieio ; Make sure all updates come first - stw r6,0(r7) ; Unlock the PTEG - -; -; Now, it is time to remove the mapping and unlock the chain. -; But first, we need to make sure no one else is using this -; mapping so we drain the busy now -; - -hrmPysDQ32: mr r3,r31 ; Point to the mapping - bl mapDrainBusy ; Go wait until mapping is unused - - mr r3,r28 ; Get the pmap to remove from - mr r4,r31 ; Point to the mapping - bl EXT(mapRemove) ; Remove the mapping from the list - - lwz r4,pmapResidentCnt(r28) ; Get the mapped page count - rlwinm r0,r20,0,mpType ; Isolate mapping type - cmplwi cr1,r0,mpMinSpecial ; cr1_lt <- not a special mapping type - la r3,pmapSXlk(r28) ; Point to the pmap search lock - subi r4,r4,1 ; Drop down the mapped page count - stw r4,pmapResidentCnt(r28) ; Set the mapped page count - bl sxlkUnlock ; Unlock the search list - - bf-- cr1_lt,hrmRetn32 ; This one has no real memory associated with it so we are done... - - bl mapPhysFindLock ; Go find and lock the physent - - lwz r9,ppLink+4(r3) ; Get first mapping - - mr r4,r22 ; Get the RC bits we just got - bl mapPhysMerge ; Go merge the RC bits - - rlwinm r9,r9,0,~ppFlags ; Clear the flags from the mapping pointer - - cmplw r9,r31 ; Are we the first on the list? - bne- hrmNot1st ; Nope... 
- - li r9,0 ; Get a 0 - lwz r4,mpAlias+4(r31) ; Get our new forward pointer - stw r9,mpAlias+4(r31) ; Make sure we are off the chain - bl mapPhyCSet32 ; Go set the physent link and preserve flags - - b hrmPhyDQd ; Join up and unlock it all... - - .align 5 - -hrmPerm: li r8,-4096 ; Get the value we need to round down to a page - and r8,r8,r31 ; Get back to a page - lwz r8,mbvrswap+4(r8) ; Get last half of virtual to real swap - - la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkUnlock ; Unlock the search list - - xor r3,r31,r8 ; Flip mapping address to virtual - ori r3,r3,mapRtPerm ; Set permanent mapping error - b hrmErRtn - -hrmBadLock: li r3,mapRtBadLk ; Set bad lock - b hrmErRtn - -hrmEndInSight: - la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkUnlock ; Unlock the search list - -hrmDoneChunk: - mr r3,r31 ; Point to the mapping - bl mapDropBusy ; Drop the busy here since we need to come back - li r3,mapRtRemove ; Say we are still removing this - b hrmErRtn - - .align 5 - -hrmNotFound: - la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkUnlock ; Unlock the search list - li r3,mapRtNotFnd ; No mapping found - -hrmErRtn: bt++ pf64Bitb,hrmSF1z ; skip if 64-bit (only they take the hint) - - mtmsr r17 ; Restore enables/translation/etc. - isync - b hrmRetnCmn ; Join the common return code... - -hrmSF1z: mtmsrd r17 ; Restore enables/translation/etc. - isync - b hrmRetnCmn ; Join the common return code... - - .align 5 - -hrmNot1st: mr. r8,r9 ; Remember and test current node - beq- hrmPhyDQd ; Could not find our node, someone must have unmapped us... - lwz r9,mpAlias+4(r9) ; Chain to the next - cmplw r9,r31 ; Is this us? - bne- hrmNot1st ; Not us... 
- - lwz r9,mpAlias+4(r9) ; Get our forward pointer - stw r9,mpAlias+4(r8) ; Unchain us - - nop ; For alignment - -hrmPhyDQd: bl mapPhysUnlock ; Unlock the physent chain - -hrmRetn32: rlwinm r8,r31,0,0,19 ; Find start of page - mr r3,r31 ; Copy the pointer to the mapping - lwz r8,mbvrswap+4(r8) ; Get last half of virtual to real swap - bl mapDrainBusy ; Go wait until mapping is unused - - xor r3,r31,r8 ; Flip mapping address to virtual - - mtmsr r17 ; Restore enables/translation/etc. - isync - -hrmRetnCmn: lwz r6,FM_ARG0+0x44(r1) ; Get address to save next mapped vaddr - lwz r0,(FM_ALIGN(hrmStackSize)+FM_SIZE+FM_LR_SAVE)(r1) ; Restore the return - lwz r17,FM_ARG0+0x08(r1) ; Restore a register - lwz r18,FM_ARG0+0x0C(r1) ; Restore a register - mr. r6,r6 ; Should we pass back the "next" vaddr? - lwz r19,FM_ARG0+0x10(r1) ; Restore a register - lwz r20,FM_ARG0+0x14(r1) ; Restore a register - mtlr r0 ; Restore the return - - rlwinm r16,r16,0,0,19 ; Clean to a page boundary - beq hrmNoNextAdr ; Do not pass back the next vaddr... - stw r15,0(r6) ; Pass back the top of the next vaddr - stw r16,4(r6) ; Pass back the bottom of the next vaddr - -hrmNoNextAdr: - lwz r15,FM_ARG0+0x00(r1) ; Restore a register - lwz r16,FM_ARG0+0x04(r1) ; Restore a register - lwz r21,FM_ARG0+0x18(r1) ; Restore a register - rlwinm r3,r3,0,0,31 ; Clear top of register if 64-bit - lwz r22,FM_ARG0+0x1C(r1) ; Restore a register - lwz r23,FM_ARG0+0x20(r1) ; Restore a register - lwz r24,FM_ARG0+0x24(r1) ; Restore a register - lwz r25,FM_ARG0+0x28(r1) ; Restore a register - lwz r26,FM_ARG0+0x2C(r1) ; Restore a register - lwz r27,FM_ARG0+0x30(r1) ; Restore a register - lwz r28,FM_ARG0+0x34(r1) ; Restore a register - lwz r29,FM_ARG0+0x38(r1) ; Restore a register - lwz r30,FM_ARG0+0x3C(r1) ; Restore a register - lwz r31,FM_ARG0+0x40(r1) ; Restore a register - lwz r1,0(r1) ; Pop the stack - blr ; Leave... - -; -; Here is where we come when all is lost. Somehow, we failed a mapping function -; that must work... 
All hope is gone. Alas, we die....... -; - -hrmPanic: lis r0,hi16(Choke) ; System abend - ori r0,r0,lo16(Choke) ; System abend - li r3,failMapping ; Show that we failed some kind of mapping thing - sc - - -; -; Invalidate block mappings by invalidating a chunk of autogen PTEs in PTEGs hashed -; in the range. Then, if we did not finish, return a code indicating that we need to -; be called again. Eventually, we will finish and then, we will do a TLBIE for each -; PTEG up to the point where we have cleared it all (64 for 32-bit architecture) -; -; A potential speed up is that we stop the invalidate loop once we have walked through -; the hash table once. This really is not worth the trouble because we need to have -; mapped 1/2 of physical RAM in an individual block. Way unlikely. -; -; We should rethink this and see if we think it will be faster to check PTE and -; only invalidate the specific PTE rather than all block map PTEs in the PTEG. -; - - .align 5 - -hrmBlock32: lis r29,0xD000 ; Get shift to 32MB bsu - rlwinm r24,r20,mpBSub+1+2,29,29 ; Rotate to get 0 if 4K bsu or 13 if 32MB bsu - lhz r25,mpBSize(r31) ; Get the number of pages in block - lhz r23,mpSpace(r31) ; Get the address space hash - lwz r9,mpBlkRemCur(r31) ; Get our current remove position - rlwnm r29,r29,r24,28,31 ; Rotate to get 0 or 13 - addi r25,r25,1 ; Account for zero-based counting - ori r0,r20,mpRIP ; Turn on the remove in progress flag - slw r25,r25,r29 ; Adjust for 32MB if needed - mfsdr1 r29 ; Get the hash table base and size - rlwinm r24,r23,maxAdrSpb,32-maxAdrSpb-maxAdrSpb,31-maxAdrSpb ; Get high order of hash - subi r25,r25,1 ; Convert back to zero-based counting - lwz r27,mpVAddr+4(r31) ; Get the base vaddr - sub r4,r25,r9 ; Get number of pages left - cmplw cr1,r9,r25 ; Have we already hit the end? 
- addi r10,r9,mapRemChunk ; Point to the start of the next chunk - addi r2,r4,-mapRemChunk ; See if mapRemChunk or more - rlwinm r26,r29,16,7,15 ; Get the hash table size - srawi r2,r2,31 ; We have -1 if less than mapRemChunk or 0 if equal or more - stb r0,mpFlags+3(r31) ; Save the flags with the mpRIP bit on - subi r4,r4,mapRemChunk-1 ; Back off for a running start (will be negative for more than mapRemChunk) - cmpwi cr7,r2,0 ; Remember if we have finished - slwi r0,r9,12 ; Make cursor into page offset - or r24,r24,r23 ; Get full hash - and r4,r4,r2 ; If more than a chunk, bring this back to 0 - rlwinm r29,r29,0,0,15 ; Isolate the hash table base - add r27,r27,r0 ; Adjust vaddr to start of current chunk - addi r4,r4,mapRemChunk-1 ; Add mapRemChunk-1 to get max(num left, chunksize) - - bgt- cr1,hrmEndInSight ; Someone is already doing the last hunk... - - la r3,pmapSXlk(r28) ; Point to the pmap search lock - stw r10,mpBlkRemCur(r31) ; Set next chunk to do (note: this may indicate after end) - bl sxlkUnlock ; Unlock the search list while we are invalidating - - rlwinm r8,r27,4+maxAdrSpb,31-maxAdrSpb-3,31-maxAdrSpb ; Isolate the segment - rlwinm r30,r27,26,6,25 ; Shift vaddr to PTEG offset (and remember VADDR in R27) - xor r24,r24,r8 ; Get the proper VSID - rlwinm r21,r27,26,10,25 ; Shift page index to PTEG offset (and remember VADDR in R27) - ori r26,r26,lo16(0xFFC0) ; Stick in the rest of the length - rlwinm r22,r4,6,10,25 ; Shift size to PTEG offset - rlwinm r24,r24,6,0,25 ; Shift hash to PTEG units - add r22,r22,r30 ; Get end address (in PTEG units) - -hrmBInv32: rlwinm r23,r30,0,10,25 ; Isolate just the page index - xor r23,r23,r24 ; Hash it - and r23,r23,r26 ; Wrap it into the table - rlwinm r3,r23,28,4,29 ; Change to PCA offset - subfic r3,r3,-4 ; Get the PCA entry offset - add r7,r3,r29 ; Point to the PCA slot - cmplw cr5,r30,r22 ; Check if we reached the end of the range - addi r30,r30,64 ; bump to the next vaddr - - bl mapLockPteg ; Lock the PTEG - - 
rlwinm. r4,r6,16,0,7 ; Position, save, and test block mappings in PCA - add r5,r23,r29 ; Point to the PTEG - li r0,0 ; Set an invalid PTE value - beq+ hrmBNone32 ; No block map PTEs in this PTEG... - mtcrf 0x80,r4 ; Set CRs to select PTE slots - mtcrf 0x40,r4 ; Set CRs to select PTE slots - - bf 0,hrmSlot0 ; No autogen here - stw r0,0x00(r5) ; Invalidate PTE - -hrmSlot0: bf 1,hrmSlot1 ; No autogen here - stw r0,0x08(r5) ; Invalidate PTE - -hrmSlot1: bf 2,hrmSlot2 ; No autogen here - stw r0,0x10(r5) ; Invalidate PTE - -hrmSlot2: bf 3,hrmSlot3 ; No autogen here - stw r0,0x18(r5) ; Invalidate PTE - -hrmSlot3: bf 4,hrmSlot4 ; No autogen here - stw r0,0x20(r5) ; Invalidate PTE - -hrmSlot4: bf 5,hrmSlot5 ; No autogen here - stw r0,0x28(r5) ; Invalidate PTE - -hrmSlot5: bf 6,hrmSlot6 ; No autogen here - stw r0,0x30(r5) ; Invalidate PTE - -hrmSlot6: bf 7,hrmSlot7 ; No autogen here - stw r0,0x38(r5) ; Invalidate PTE - -hrmSlot7: rlwinm r0,r4,16,16,23 ; Move in use to autogen - or r6,r6,r4 ; Flip on the free bits that corrospond to the autogens we cleared - andc r6,r6,r0 ; Turn off all the old autogen bits - -hrmBNone32: eieio ; Make sure all updates come first - - stw r6,0(r7) ; Unlock and set the PCA - - bne+ cr5,hrmBInv32 ; Go invalidate the next... - - bge+ cr7,hrmDoneChunk ; We have not as yet done the last chunk, go tell our caller to call again... 
- - mr r3,r31 ; Copy the pointer to the mapping - bl mapDrainBusy ; Go wait until we are sure all other removers are done with this one - - sync ; Make sure memory is consistent - - subi r5,r25,63 ; Subtract TLB size from page count (note we are 0 based here) - li r6,63 ; Assume full invalidate for now - srawi r5,r5,31 ; Make 0 if we need a full purge, -1 otherwise - andc r6,r6,r5 ; Clear max if we have less to do - and r5,r25,r5 ; Clear count if we have more than max - lwz r27,mpVAddr+4(r31) ; Get the base vaddr again - li r7,tlbieLock ; Get the TLBIE lock - or r5,r5,r6 ; Get number of TLBIEs needed - -hrmBTLBlck: lwarx r2,0,r7 ; Get the TLBIE lock - mr. r2,r2 ; Is it locked? - li r2,1 ; Get our lock value - bne- hrmBTLBlck ; It is locked, go wait... - stwcx. r2,0,r7 ; Try to get it - bne- hrmBTLBlck ; We was beat... - -hrmBTLBi: addic. r5,r5,-1 ; See if we did them all - tlbie r27 ; Invalidate it everywhere - addi r27,r27,0x1000 ; Up to the next page - bge+ hrmBTLBi ; Make sure we have done it all... - - rlwinm. r0,r19,0,pfSMPcapb,pfSMPcapb ; Can this processor do SMP? - li r2,0 ; Lock clear value - - sync ; Make sure all is quiet - beq- hrmBNTlbs ; Jump if we can not do a TLBSYNC.... - - eieio ; Make sure that the tlbie happens first - tlbsync ; Wait for everyone to catch up - sync ; Wait for quiet again - -hrmBNTlbs: stw r2,tlbieLock(0) ; Clear the tlbie lock - - la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkShared ; Go get a shared lock on the mapping lists - mr. r3,r3 ; Did we get the lock? - bne- hrmPanic ; Nope... - - lwz r4,mpVAddr(r31) ; High order of address - lwz r5,mpVAddr+4(r31) ; Low order of address - mr r3,r28 ; Pass in pmap to search - mr r29,r4 ; Save this in case we need it (only promote fails) - mr r30,r5 ; Save this in case we need it (only promote fails) - bl EXT(mapSearchFull) ; Go see if we can find it - - mr. r3,r3 ; Did we? 
(And remember mapping address for later) - mr r15,r4 ; Save top of next vaddr - mr r16,r5 ; Save bottom of next vaddr - beq- hrmPanic ; Nope, not found... - - cmplw r3,r31 ; Same mapping? - bne- hrmPanic ; Not good... - - la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkPromote ; Try to promote shared to exclusive - mr. r3,r3 ; Could we? - mr r3,r31 ; Restore the mapping pointer - beq+ hrmBDone1 ; Yeah... - - la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkConvert ; Convert shared to exclusive - mr. r3,r3 ; Could we? - bne-- hrmPanic ; Nope, we must have timed out... - - mr r3,r28 ; Pass in pmap to search - mr r4,r29 ; High order of address - mr r5,r30 ; Low order of address - bl EXT(mapSearchFull) ; Rescan the list - - mr. r3,r3 ; Did we lose it when we converted? - mr r15,r4 ; Save top of next vaddr - mr r16,r5 ; Save bottom of next vaddr - beq-- hrmPanic ; Yeah, we did, someone tossed it for us... - -hrmBDone1: bl mapDrainBusy ; Go wait until mapping is unused - - mr r3,r28 ; Get the pmap to remove from - mr r4,r31 ; Point to the mapping - bl EXT(mapRemove) ; Remove the mapping from the list - - lwz r4,pmapResidentCnt(r28) ; Get the mapped page count - la r3,pmapSXlk(r28) ; Point to the pmap search lock - subi r4,r4,1 ; Drop down the mapped page count - stw r4,pmapResidentCnt(r28) ; Set the mapped page count - bl sxlkUnlock ; Unlock the search list - - b hrmRetn32 ; We are all done, get out... - -; -; Here we handle the 64-bit version of hw_rem_map -; - - .align 5 - -hrmSplit64: rlwinm r9,r21,27,5,29 ; Convert PTEG to PCA entry - beq-- cr5,hrmBlock64 ; Go treat block specially... - subfic r9,r9,-4 ; Get the PCA entry offset - bt-- cr0_eq,hrmPysDQ64 ; Skip next if no possible PTE... - add r7,r9,r29 ; Point to the PCA slot - - bl mapLockPteg ; Go lock up the PTEG - - lwz r21,mpPte(r31) ; Get the quick pointer again - ld r5,0(r26) ; Get the top of PTE - - rlwinm. 
r0,r21,0,mpHValidb,mpHValidb ; See if we actually have a PTE - rlwinm r21,r21,0,~mpHValid ; Clear out valid bit - sldi r23,r5,16 ; Shift AVPN up to EA format -// **** Need to adjust above shift based on the page size - large pages need to shift a bit more - rldicr r5,r5,0,62 ; Clear the valid bit - rldimi r23,r30,0,36 ; Insert the page portion of the VPN - stw r21,mpPte(r31) ; Make sure we invalidate mpPte but keep pointing to PTEG (keep walk_page from making a mistake) - beq-- hrmUlckPCA64 ; Pte is gone, no need to invalidate... - - std r5,0(r26) ; Invalidate the PTE - - li r9,tlbieLock ; Get the TLBIE lock - - sync ; Make sure the invalid PTE is actually in memory - -hrmPtlb64: lwarx r5,0,r9 ; Get the TLBIE lock - rldicl r23,r23,0,16 ; Clear bits 0:15 cause they say to - mr. r5,r5 ; Is it locked? - li r5,1 ; Get locked indicator - bne-- hrmPtlb64w ; It is locked, go spin... - stwcx. r5,0,r9 ; Try to get it - bne-- hrmPtlb64 ; We was beat... - - tlbie r23 ; Invalidate all corresponding TLB entries - - eieio ; Make sure that the tlbie happens first - tlbsync ; Wait for everyone to catch up - - ptesync ; Make sure of it all - li r0,0 ; Clear this - rlwinm r2,r21,28,29,31 ; Get slot number (16 byte entries) - stw r0,tlbieLock(0) ; Clear the tlbie lock - oris r0,r0,0x8000 ; Assume slot 0 - - srw r0,r0,r2 ; Get slot mask to deallocate - - lwz r22,12(r26) ; Get the latest reference and change bits - or r6,r6,r0 ; Make the guy we killed free - -hrmUlckPCA64: - eieio ; Make sure all updates come first - - stw r6,0(r7) ; Unlock and change the PCA - -hrmPysDQ64: mr r3,r31 ; Point to the mapping - bl mapDrainBusy ; Go wait until mapping is unused - - mr r3,r28 ; Get the pmap to remove from - mr r4,r31 ; Point to the mapping - bl EXT(mapRemove) ; Remove the mapping from the list - - rlwinm r0,r20,0,mpType ; Isolate mapping type - cmplwi cr1,r0,mpMinSpecial ; cr1_lt <- not a special mapping type - lwz r4,pmapResidentCnt(r28) ; Get the mapped page count - la r3,pmapSXlk(r28) ; 
Point to the pmap search lock - subi r4,r4,1 ; Drop down the mapped page count - stw r4,pmapResidentCnt(r28) ; Set the mapped page count - bl sxlkUnlock ; Unlock the search list - - bf-- cr1_lt,hrmRetn64 ; This one has no real memory associated with it so we are done... - - bl mapPhysFindLock ; Go find and lock the physent - - li r0,ppLFAmask ; Get mask to clean up mapping pointer - ld r9,ppLink(r3) ; Get first mapping - rotrdi r0,r0,ppLFArrot ; Rotate clean up mask to get 0xF0000000000000000F - mr r4,r22 ; Get the RC bits we just got - - bl mapPhysMerge ; Go merge the RC bits - - andc r9,r9,r0 ; Clean up the mapping pointer - - cmpld r9,r31 ; Are we the first on the list? - bne-- hrmNot1st64 ; Nope... - - li r9,0 ; Get a 0 - ld r4,mpAlias(r31) ; Get our forward pointer - - std r9,mpAlias(r31) ; Make sure we are off the chain - bl mapPhyCSet64 ; Go set the physent link and preserve flags - - b hrmPhyDQd64 ; Join up and unlock it all... - -hrmPtlb64w: li r5,lgKillResv ; Point to some spare memory - stwcx. r5,0,r5 ; Clear the pending reservation - - -hrmPtlb64x: lwz r5,0(r9) ; Do a regular load to avoid taking reservation - mr. r5,r5 ; is it locked? - beq++ hrmPtlb64 ; Nope... - b hrmPtlb64x ; Sniff some more... - - .align 5 - -hrmNot1st64: - mr. r8,r9 ; Remember and test current node - beq-- hrmPhyDQd64 ; Could not find our node... - ld r9,mpAlias(r9) ; Chain to the next - cmpld r9,r31 ; Is this us? - bne-- hrmNot1st64 ; Not us... - - ld r9,mpAlias(r9) ; Get our forward pointer - std r9,mpAlias(r8) ; Unchain us - - nop ; For alignment - -hrmPhyDQd64: - bl mapPhysUnlock ; Unlock the physent chain - -hrmRetn64: rldicr r8,r31,0,51 ; Find start of page - mr r3,r31 ; Copy the pointer to the mapping - lwz r8,mbvrswap+4(r8) ; Get last half of virtual to real swap - bl mapDrainBusy ; Go wait until mapping is unused - - xor r3,r31,r8 ; Flip mapping address to virtual - - mtmsrd r17 ; Restore enables/translation/etc. - isync - - b hrmRetnCmn ; Join the common return path... 
- - -; -; Check hrmBlock32 for comments. -; - - .align 5 - -hrmBlock64: lis r29,0xD000 ; Get shift to 32MB bsu - rlwinm r10,r20,mpBSub+1+2,29,29 ; Rotate to get 0 if 4K bsu or 13 if 32MB bsu - lhz r24,mpSpace(r31) ; Get the address space hash - lhz r25,mpBSize(r31) ; Get the number of pages in block - lwz r9,mpBlkRemCur(r31) ; Get our current remove position - rlwnm r29,r29,r10,28,31 ; Rotate to get 0 or 13 - addi r25,r25,1 ; Account for zero-based counting - ori r0,r20,mpRIP ; Turn on the remove in progress flag - slw r25,r25,r29 ; Adjust for 32MB if needed - mfsdr1 r29 ; Get the hash table base and size - ld r27,mpVAddr(r31) ; Get the base vaddr - subi r25,r25,1 ; Convert back to zero-based counting - rlwinm r5,r29,0,27,31 ; Isolate the size - sub r4,r25,r9 ; Get number of pages left - cmplw cr1,r9,r25 ; Have we already hit the end? - addi r10,r9,mapRemChunk ; Point to the start of the next chunk - addi r2,r4,-mapRemChunk ; See if mapRemChunk or more - stb r0,mpFlags+3(r31) ; Save the flags with the mpRIP bit on - srawi r2,r2,31 ; We have -1 if less than mapRemChunk or 0 if equal or more - subi r4,r4,mapRemChunk-1 ; Back off for a running start (will be negative for more than mapRemChunk) - cmpwi cr7,r2,0 ; Remember if we are doing the last chunk - and r4,r4,r2 ; If more than a chunk, bring this back to 0 - srdi r27,r27,12 ; Change address into page index - addi r4,r4,mapRemChunk-1 ; Add mapRemChunk-1 to get max(num left, chunksize) - add r27,r27,r9 ; Adjust vaddr to start of current chunk - - bgt-- cr1,hrmEndInSight ; Someone is already doing the last hunk... 
- - la r3,pmapSXlk(r28) ; Point to the pmap search lock - stw r10,mpBlkRemCur(r31) ; Set next chunk to do (note: this may indicate after end) - bl sxlkUnlock ; Unlock the search list while we are invalidating - - rlwimi r24,r24,14,4,17 ; Insert a copy of space hash - eqv r26,r26,r26 ; Get all foxes here - rldimi r24,r24,28,8 ; Make a couple copies up higher - rldicr r29,r29,0,47 ; Isolate just the hash table base - subfic r5,r5,46 ; Get number of leading zeros - srd r26,r26,r5 ; Shift the size bits over - mr r30,r27 ; Get start of chunk to invalidate - rldicr r26,r26,0,56 ; Make length in PTEG units - add r22,r4,r30 ; Get end page number - -hrmBInv64: srdi r0,r30,2 ; Shift page index over to form ESID - rldicr r0,r0,0,49 ; Clean all but segment portion - rlwinm r2,r30,0,16,31 ; Get the current page index - xor r0,r0,r24 ; Form VSID - xor r8,r2,r0 ; Hash the vaddr - sldi r8,r8,7 ; Make into PTEG offset - and r23,r8,r26 ; Wrap into the hash table - rlwinm r3,r23,27,5,29 ; Change to PCA offset (table is always 2GB or less so 32-bit instructions work here) - subfic r3,r3,-4 ; Get the PCA entry offset - add r7,r3,r29 ; Point to the PCA slot - - cmplw cr5,r30,r22 ; Have we reached the end of the range? - - bl mapLockPteg ; Lock the PTEG - - rlwinm. r4,r6,16,0,7 ; Extract the block mappings in this here PTEG and see if there are any - add r5,r23,r29 ; Point to the PTEG - li r0,0 ; Set an invalid PTE value - beq++ hrmBNone64 ; No block map PTEs in this PTEG... 
- mtcrf 0x80,r4 ; Set CRs to select PTE slots - mtcrf 0x40,r4 ; Set CRs to select PTE slots - - - bf 0,hrmSlot0s ; No autogen here - std r0,0x00(r5) ; Invalidate PTE - -hrmSlot0s: bf 1,hrmSlot1s ; No autogen here - std r0,0x10(r5) ; Invalidate PTE - -hrmSlot1s: bf 2,hrmSlot2s ; No autogen here - std r0,0x20(r5) ; Invalidate PTE - -hrmSlot2s: bf 3,hrmSlot3s ; No autogen here - std r0,0x30(r5) ; Invalidate PTE - -hrmSlot3s: bf 4,hrmSlot4s ; No autogen here - std r0,0x40(r5) ; Invalidate PTE - -hrmSlot4s: bf 5,hrmSlot5s ; No autogen here - std r0,0x50(r5) ; Invalidate PTE - -hrmSlot5s: bf 6,hrmSlot6s ; No autogen here - std r0,0x60(r5) ; Invalidate PTE - -hrmSlot6s: bf 7,hrmSlot7s ; No autogen here - std r0,0x70(r5) ; Invalidate PTE - -hrmSlot7s: rlwinm r0,r4,16,16,23 ; Move in use to autogen - or r6,r6,r4 ; Flip on the free bits that corrospond to the autogens we cleared - andc r6,r6,r0 ; Turn off all the old autogen bits - -hrmBNone64: eieio ; Make sure all updates come first - stw r6,0(r7) ; Unlock and set the PCA - - addi r30,r30,1 ; bump to the next PTEG - bne++ cr5,hrmBInv64 ; Go invalidate the next... - - bge+ cr7,hrmDoneChunk ; We have not as yet done the last chunk, go tell our caller to call again... - - mr r3,r31 ; Copy the pointer to the mapping - bl mapDrainBusy ; Go wait until we are sure all other removers are done with this one - - sync ; Make sure memory is consistent - - subi r5,r25,255 ; Subtract TLB size from page count (note we are 0 based here) - li r6,255 ; Assume full invalidate for now - srawi r5,r5,31 ; Make 0 if we need a full purge, -1 otherwise - andc r6,r6,r5 ; Clear max if we have less to do - and r5,r25,r5 ; Clear count if we have more than max - sldi r24,r24,28 ; Get the full XOR value over to segment position - ld r27,mpVAddr(r31) ; Get the base vaddr - li r7,tlbieLock ; Get the TLBIE lock - or r5,r5,r6 ; Get number of TLBIEs needed - -hrmBTLBlcl: lwarx r2,0,r7 ; Get the TLBIE lock - mr. r2,r2 ; Is it locked? 
- li r2,1 ; Get our lock value - bne-- hrmBTLBlcm ; It is locked, go wait... - stwcx. r2,0,r7 ; Try to get it - bne-- hrmBTLBlcl ; We was beat... - -hrmBTLBj: sldi r2,r27,maxAdrSpb ; Move to make room for address space ID - rldicr r2,r2,0,35-maxAdrSpb ; Clear out the extra - addic. r5,r5,-1 ; See if we did them all - xor r2,r2,r24 ; Make the VSID - rldimi r2,r27,0,36 ; Insert the page portion of the VPN - rldicl r2,r2,0,16 ; Clear bits 0:15 cause they say we gotta - - tlbie r2 ; Invalidate it everywhere - addi r27,r27,0x1000 ; Up to the next page - bge++ hrmBTLBj ; Make sure we have done it all... - - eieio ; Make sure that the tlbie happens first - tlbsync ; wait for everyone to catch up - - li r2,0 ; Lock clear value - - ptesync ; Wait for quiet again - - stw r2,tlbieLock(0) ; Clear the tlbie lock - - la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkShared ; Go get a shared lock on the mapping lists - mr. r3,r3 ; Did we get the lock? - bne- hrmPanic ; Nope... - - lwz r4,mpVAddr(r31) ; High order of address - lwz r5,mpVAddr+4(r31) ; Low order of address - mr r3,r28 ; Pass in pmap to search - mr r29,r4 ; Save this in case we need it (only promote fails) - mr r30,r5 ; Save this in case we need it (only promote fails) - bl EXT(mapSearchFull) ; Go see if we can find it - - mr. r3,r3 ; Did we? (And remember mapping address for later) - mr r15,r4 ; Save top of next vaddr - mr r16,r5 ; Save bottom of next vaddr - beq- hrmPanic ; Nope, not found... - - cmpld r3,r31 ; Same mapping? - bne- hrmPanic ; Not good... - - la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkPromote ; Try to promote shared to exclusive - mr. r3,r3 ; Could we? - mr r3,r31 ; Restore the mapping pointer - beq+ hrmBDone2 ; Yeah... - - la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkConvert ; Convert shared to exclusive - mr. r3,r3 ; Could we? - bne-- hrmPanic ; Nope, we must have timed out... 
- - mr r3,r28 ; Pass in pmap to search - mr r4,r29 ; High order of address - mr r5,r30 ; Low order of address - bl EXT(mapSearchFull) ; Rescan the list - - mr. r3,r3 ; Did we lose it when we converted? - mr r15,r4 ; Save top of next vaddr - mr r16,r5 ; Save bottom of next vaddr - beq-- hrmPanic ; Yeah, we did, someone tossed it for us... - -hrmBDone2: bl mapDrainBusy ; Go wait until mapping is unused - - mr r3,r28 ; Get the pmap to remove from - mr r4,r31 ; Point to the mapping - bl EXT(mapRemove) ; Remove the mapping from the list - - lwz r4,pmapResidentCnt(r28) ; Get the mapped page count - la r3,pmapSXlk(r28) ; Point to the pmap search lock - subi r4,r4,1 ; Drop down the mapped page count - stw r4,pmapResidentCnt(r28) ; Set the mapped page count - bl sxlkUnlock ; Unlock the search list - - b hrmRetn64 ; We are all done, get out... - -hrmBTLBlcm: li r2,lgKillResv ; Get space unreserve line - stwcx. r2,0,r2 ; Unreserve it - -hrmBTLBlcn: lwz r2,0(r7) ; Get the TLBIE lock - mr. r2,r2 ; Is it held? - beq++ hrmBTLBlcl ; Nope... - b hrmBTLBlcn ; Yeah... 
- -; -; Guest shadow assist -- mapping remove -; -; Method of operation: -; o Locate the VMM extension block and the host pmap -; o Obtain the host pmap's search lock exclusively -; o Locate the requested mapping in the shadow hash table, -; exit if not found -; o If connected, disconnect the PTE and gather R&C to physent -; o Locate and lock the physent -; o Remove mapping from physent's chain -; o Unlock physent -; o Unlock pmap's search lock -; -; Non-volatile registers on entry: -; r17: caller's msr image -; r19: sprg2 (feature flags) -; r28: guest pmap's physical address -; r29: high-order 32 bits of guest virtual address -; r30: low-order 32 bits of guest virtual address -; -; Non-volatile register usage: -; r26: VMM extension block's physical address -; r27: host pmap's physical address -; r28: guest pmap's physical address -; r29: physent's physical address -; r30: guest virtual address -; r31: guest mapping's physical address -; - .align 5 -hrmGuest: - rlwinm r30,r30,0,0xFFFFF000 ; Clean up low-order bits of 32-bit guest vaddr - bt++ pf64Bitb,hrmG64 ; Test for 64-bit machine - lwz r26,pmapVmmExtPhys+4(r28) ; r26 <- VMM pmap extension block paddr - lwz r27,vmxHostPmapPhys+4(r26) ; r27 <- host pmap's paddr - b hrmGStart ; Join common code - -hrmG64: ld r26,pmapVmmExtPhys(r28) ; r26 <- VMM pmap extension block paddr - ld r27,vmxHostPmapPhys(r26) ; r27 <- host pmap's paddr - rldimi r30,r29,32,0 ; Insert high-order 32 bits of 64-bit guest vaddr - -hrmGStart: la r3,pmapSXlk(r27) ; r3 <- host pmap's search lock address - bl sxlkExclusive ; Get lock exclusive - - lwz r3,vxsGrm(r26) ; Get mapping remove request count - - lwz r9,pmapSpace(r28) ; r9 <- guest space ID number - la r31,VMX_HPIDX_OFFSET(r26) ; r31 <- base of hash page physical index - srwi r11,r30,12 ; Form shadow hash: - xor r11,r9,r11 ; spaceID ^ (vaddr >> 12) - rlwinm r12,r11,GV_HPAGE_SHIFT,GV_HPAGE_MASK - ; Form index offset from hash page number - add r31,r31,r12 ; r31 <- hash page index entry - li 
r0,(GV_SLOTS - 1) ; Prepare to iterate over mapping slots - mtctr r0 ; in this group - bt++ pf64Bitb,hrmG64Search ; Separate handling for 64-bit search - lwz r31,4(r31) ; r31 <- hash page paddr - rlwimi r31,r11,GV_HGRP_SHIFT,GV_HGRP_MASK - ; r31 <- hash group paddr - - addi r3,r3,1 ; Increment remove request count - stw r3,vxsGrm(r26) ; Update remove request count - - lwz r3,mpFlags(r31) ; r3 <- 1st mapping slot's flags - lhz r4,mpSpace(r31) ; r4 <- 1st mapping slot's space ID - lwz r5,mpVAddr+4(r31) ; r5 <- 1st mapping slot's virtual address - b hrmG32SrchLp ; Let the search begin! - - .align 5 -hrmG32SrchLp: - mr r6,r3 ; r6 <- current mapping slot's flags - lwz r3,mpFlags+GV_SLOT_SZ(r31) ; r3 <- next mapping slot's flags - mr r7,r4 ; r7 <- current mapping slot's space ID - lhz r4,mpSpace+GV_SLOT_SZ(r31) ; r4 <- next mapping slot's space ID - clrrwi r8,r5,12 ; r8 <- current mapping slot's virtual addr w/o flags - lwz r5,mpVAddr+4+GV_SLOT_SZ(r31); r5 <- next mapping slot's virtual addr - rlwinm r11,r6,0,mpgFree ; Isolate guest free mapping flag - xor r7,r7,r9 ; Compare space ID - or r0,r11,r7 ; r0 <- !(free && space match) - xor r8,r8,r30 ; Compare virtual address - or. r0,r0,r8 ; cr0_eq <- !free && space match && virtual addr match - beq hrmGSrchHit ; Join common path on hit (r31 points to guest mapping) - - addi r31,r31,GV_SLOT_SZ ; r31 <- next mapping slot - bdnz hrmG32SrchLp ; Iterate - - mr r6,r3 ; r6 <- current mapping slot's flags - clrrwi r5,r5,12 ; Remove flags from virtual address - rlwinm r11,r6,0,mpgFree ; Isolate guest free mapping flag - xor r4,r4,r9 ; Compare space ID - or r0,r11,r4 ; r0 <- !(free && space match) - xor r5,r5,r30 ; Compare virtual address - or. 
r0,r0,r5 ; cr0_eq <- !free && space match && virtual addr match - beq hrmGSrchHit ; Join common path on hit (r31 points to guest mapping) - b hrmGSrchMiss ; No joy in our hash group - -hrmG64Search: - ld r31,0(r31) ; r31 <- hash page paddr - insrdi r31,r11,GV_GRPS_PPG_LG2,64-(GV_HGRP_SHIFT+GV_GRPS_PPG_LG2) - ; r31 <- hash group paddr - lwz r3,mpFlags(r31) ; r3 <- 1st mapping slot's flags - lhz r4,mpSpace(r31) ; r4 <- 1st mapping slot's space ID - ld r5,mpVAddr(r31) ; r5 <- 1st mapping slot's virtual address - b hrmG64SrchLp ; Let the search begin! - - .align 5 -hrmG64SrchLp: - mr r6,r3 ; r6 <- current mapping slot's flags - lwz r3,mpFlags+GV_SLOT_SZ(r31) ; r3 <- next mapping slot's flags - mr r7,r4 ; r7 <- current mapping slot's space ID - lhz r4,mpSpace+GV_SLOT_SZ(r31) ; r4 <- next mapping slot's space ID - clrrdi r8,r5,12 ; r8 <- current mapping slot's virtual addr w/o flags - ld r5,mpVAddr+GV_SLOT_SZ(r31) ; r5 <- next mapping slot's virtual addr - rlwinm r11,r6,0,mpgFree ; Isolate guest free mapping flag - xor r7,r7,r9 ; Compare space ID - or r0,r11,r7 ; r0 <- !(free && space match) - xor r8,r8,r30 ; Compare virtual address - or. r0,r0,r8 ; cr0_eq <- !free && space match && virtual addr match - beq hrmGSrchHit ; Join common path on hit (r31 points to guest mapping) - - addi r31,r31,GV_SLOT_SZ ; r31 <- next mapping slot - bdnz hrmG64SrchLp ; Iterate - - mr r6,r3 ; r6 <- current mapping slot's flags - clrrdi r5,r5,12 ; Remove flags from virtual address - rlwinm r11,r6,0,mpgFree ; Isolate guest free mapping flag - xor r4,r4,r9 ; Compare space ID - or r0,r11,r4 ; r0 <- !(free && space match) - xor r5,r5,r30 ; Compare virtual address - or. 
r0,r0,r5 ; cr0_eq <- !free && space match && virtual addr match - beq hrmGSrchHit ; Join common path on hit (r31 points to guest mapping) -hrmGSrchMiss: - lwz r3,vxsGrmMiss(r26) ; Get remove miss count - li r25,mapRtNotFnd ; Return not found - addi r3,r3,1 ; Increment miss count - stw r3,vxsGrmMiss(r26) ; Update miss count - b hrmGReturn ; Join guest return - - .align 5 -hrmGSrchHit: - rlwinm. r0,r6,0,mpgDormant ; Is this entry dormant? - bne hrmGDormant ; Yes, nothing to disconnect - - lwz r3,vxsGrmActive(r26) ; Get active hit count - addi r3,r3,1 ; Increment active hit count - stw r3,vxsGrmActive(r26) ; Update hit count - - bt++ pf64Bitb,hrmGDscon64 ; Handle 64-bit disconnect separately - bl mapInvPte32 ; Disconnect PTE, invalidate, gather ref and change - ; r31 <- mapping's physical address - ; r3 -> PTE slot physical address - ; r4 -> High-order 32 bits of PTE - ; r5 -> Low-order 32 bits of PTE - ; r6 -> PCA - ; r7 -> PCA physical address - rlwinm r2,r3,29,29,31 ; Get PTE's slot number in the PTEG (8-byte PTEs) - b hrmGFreePTE ; Join 64-bit path to release the PTE -hrmGDscon64: - bl mapInvPte64 ; Disconnect PTE, invalidate, gather ref and change - rlwinm r2,r3,28,29,31 ; Get PTE's slot number in the PTEG (16-byte PTEs) -hrmGFreePTE: - mr. r3,r3 ; Was there a valid PTE? - beq hrmGDormant ; No valid PTE, we're almost done - lis r0,0x8000 ; Prepare free bit for this slot - srw r0,r0,r2 ; Position free bit - or r6,r6,r0 ; Set it in our PCA image - lwz r8,mpPte(r31) ; Get PTE offset - rlwinm r8,r8,0,~mpHValid ; Make the offset invalid - stw r8,mpPte(r31) ; Save invalidated PTE offset - eieio ; Synchronize all previous updates (mapInvPtexx didn't) - stw r6,0(r7) ; Update PCA and unlock the PTEG - -hrmGDormant: - lwz r3,mpPAddr(r31) ; r3 <- physical 4K-page number - bl mapFindLockPN ; Find 'n' lock this page's physent - mr. r29,r3 ; Got lock on our physent? 
- beq-- hrmGBadPLock ; No, time to bail out - - crset cr1_eq ; cr1_eq <- previous link is the anchor - bt++ pf64Bitb,hrmGRemove64 ; Use 64-bit version on 64-bit machine - la r11,ppLink+4(r29) ; Point to chain anchor - lwz r9,ppLink+4(r29) ; Get chain anchor - rlwinm. r9,r9,0,~ppFlags ; Remove flags, yielding 32-bit physical chain pointer -hrmGRemLoop: - beq- hrmGPEMissMiss ; End of chain, this is not good - cmplw r9,r31 ; Is this the mapping to remove? - lwz r8,mpAlias+4(r9) ; Get forward chain pointer - bne hrmGRemNext ; No, chain onward - bt cr1_eq,hrmGRemRetry ; Mapping to remove is chained from anchor - stw r8,0(r11) ; Unchain gpv->phys mapping - b hrmGDelete ; Finish deleting mapping -hrmGRemRetry: - lwarx r0,0,r11 ; Get previous link - rlwimi r0,r8,0,~ppFlags ; Insert new forward pointer whilst preserving flags - stwcx. r0,0,r11 ; Update previous link - bne- hrmGRemRetry ; Lost reservation, retry - b hrmGDelete ; Finish deleting mapping - -hrmGRemNext: - la r11,mpAlias+4(r9) ; Point to (soon to be) previous link - crclr cr1_eq ; ~cr1_eq <- Previous link is not the anchor - mr. r9,r8 ; Does next entry exist? - b hrmGRemLoop ; Carry on - -hrmGRemove64: - li r7,ppLFAmask ; Get mask to clean up mapping pointer - rotrdi r7,r7,ppLFArrot ; Rotate clean up mask to get 0xF0000000000000000F - la r11,ppLink(r29) ; Point to chain anchor - ld r9,ppLink(r29) ; Get chain anchor - andc. r9,r9,r7 ; Remove flags, yielding 64-bit physical chain pointer -hrmGRem64Lp: - beq-- hrmGPEMissMiss ; End of chain, this is not good - cmpld r9,r31 ; Is this the mapping to remove? - ld r8,mpAlias(r9) ; Get forward chain pinter - bne hrmGRem64Nxt ; No mapping to remove, chain on, dude - bt cr1_eq,hrmGRem64Rt ; Mapping to remove is chained from anchor - std r8,0(r11) ; Unchain gpv->phys mapping - b hrmGDelete ; Finish deleting mapping -hrmGRem64Rt: - ldarx r0,0,r11 ; Get previous link - and r0,r0,r7 ; Get flags - or r0,r0,r8 ; Insert new forward pointer - stdcx. 
r0,0,r11 ; Slam it back in - bne-- hrmGRem64Rt ; Lost reservation, retry - b hrmGDelete ; Finish deleting mapping - - .align 5 -hrmGRem64Nxt: - la r11,mpAlias(r9) ; Point to (soon to be) previous link - crclr cr1_eq ; ~cr1_eq <- Previous link is not the anchor - mr. r9,r8 ; Does next entry exist? - b hrmGRem64Lp ; Carry on - -hrmGDelete: - mr r3,r29 ; r3 <- physent addr - bl mapPhysUnlock ; Unlock physent chain - lwz r3,mpFlags(r31) ; Get mapping's flags - rlwinm r3,r3,0,~mpgFlags ; Clear all guest flags - ori r3,r3,mpgFree ; Mark mapping free - stw r3,mpFlags(r31) ; Update flags - li r25,mapRtGuest ; Set return code to 'found guest mapping' - -hrmGReturn: - la r3,pmapSXlk(r27) ; r3 <- host pmap search lock phys addr - bl sxlkUnlock ; Release host pmap search lock - - mr r3,r25 ; r3 <- return code - bt++ pf64Bitb,hrmGRtn64 ; Handle 64-bit separately - mtmsr r17 ; Restore 'rupts, translation - isync ; Throw a small wrench into the pipeline - b hrmRetnCmn ; Nothing to do now but pop a frame and return -hrmGRtn64: mtmsrd r17 ; Restore 'rupts, translation, 32-bit mode - b hrmRetnCmn ; Join common return - -hrmGBadPLock: -hrmGPEMissMiss: - lis r0,hi16(Choke) ; Seen the arrow on the doorpost - ori r0,r0,lo16(Choke) ; Sayin' "THIS LAND IS CONDEMNED" - li r3,failMapping ; All the way from New Orleans - sc ; To Jeruselem - - -/* - * mapping *hw_purge_phys(physent) - remove a mapping from the system - * - * Upon entry, R3 contains a pointer to a physent. - * - * This function removes the first mapping from a physical entry - * alias list. It locks the list, extracts the vaddr and pmap from - * the first entry. It then jumps into the hw_rem_map function. - * NOTE: since we jump into rem_map, we need to set up the stack - * identically. Also, we set the next parm to 0 so we do not - * try to save a next vaddr. - * - * We return the virtual address of the removed mapping as a - * R3. - * - * Note that this is designed to be called from 32-bit mode with a stack. 
- * - * We disable translation and all interruptions here. This keeps is - * from having to worry about a deadlock due to having anything locked - * and needing it to process a fault. - * - * Note that this must be done with both interruptions off and VM off - * - * - * Remove mapping via physical page (mapping_purge) - * - * 1) lock physent - * 2) extract vaddr and pmap - * 3) unlock physent - * 4) do "remove mapping via pmap" - * - * - */ - - .align 5 - .globl EXT(hw_purge_phys) - -LEXT(hw_purge_phys) - stwu r1,-(FM_ALIGN(hrmStackSize)+FM_SIZE)(r1) ; Make some space on the stack - mflr r0 ; Save the link register - stw r15,FM_ARG0+0x00(r1) ; Save a register - stw r16,FM_ARG0+0x04(r1) ; Save a register - stw r17,FM_ARG0+0x08(r1) ; Save a register - stw r18,FM_ARG0+0x0C(r1) ; Save a register - stw r19,FM_ARG0+0x10(r1) ; Save a register - stw r20,FM_ARG0+0x14(r1) ; Save a register - stw r21,FM_ARG0+0x18(r1) ; Save a register - stw r22,FM_ARG0+0x1C(r1) ; Save a register - stw r23,FM_ARG0+0x20(r1) ; Save a register - stw r24,FM_ARG0+0x24(r1) ; Save a register - stw r25,FM_ARG0+0x28(r1) ; Save a register - li r6,0 ; Set no next address return - stw r26,FM_ARG0+0x2C(r1) ; Save a register - stw r27,FM_ARG0+0x30(r1) ; Save a register - stw r28,FM_ARG0+0x34(r1) ; Save a register - stw r29,FM_ARG0+0x38(r1) ; Save a register - stw r30,FM_ARG0+0x3C(r1) ; Save a register - stw r31,FM_ARG0+0x40(r1) ; Save a register - stw r6,FM_ARG0+0x44(r1) ; Save address to save next mapped vaddr - stw r0,(FM_ALIGN(hrmStackSize)+FM_SIZE+FM_LR_SAVE)(r1) ; Save the return - - bl EXT(mapSetUp) ; Turn off interrupts, translation, and possibly enter 64-bit - - bl mapPhysLock ; Lock the physent - - bt++ pf64Bitb,hppSF ; skip if 64-bit (only they take the hint) - - lwz r12,ppLink+4(r3) ; Grab the pointer to the first mapping - li r0,ppFlags ; Set the bottom stuff to clear - b hppJoin ; Join the common... 
- -hppSF: li r0,ppLFAmask - ld r12,ppLink(r3) ; Get the pointer to the first mapping - rotrdi r0,r0,ppLFArrot ; Rotate clean up mask to get 0xF0000000000000000F - -hppJoin: andc. r12,r12,r0 ; Clean and test link - beq-- hppNone ; There are no more mappings on physical page - - lis r28,hi16(EXT(pmapTrans)) ; Get the top of the start of the pmap hash to pmap translate table - lhz r7,mpSpace(r12) ; Get the address space hash - ori r28,r28,lo16(EXT(pmapTrans)) ; Get the top of the start of the pmap hash to pmap translate table - slwi r0,r7,2 ; Multiply space by 4 - lwz r4,mpVAddr(r12) ; Get the top of the vaddr - slwi r7,r7,3 ; Multiply space by 8 - lwz r5,mpVAddr+4(r12) ; and the bottom - add r7,r7,r0 ; Get correct displacement into translate table - lwz r28,0(r28) ; Get the actual translation map - - add r28,r28,r7 ; Point to the pmap translation - - bl mapPhysUnlock ; Time to unlock the physical entry - - bt++ pf64Bitb,hppSF2 ; skip if 64-bit (only they take the hint) - - lwz r28,pmapPAddr+4(r28) ; Get the physical address of the pmap - b hrmJoin ; Go remove the mapping... - -hppSF2: ld r28,pmapPAddr(r28) ; Get the physical address of the pmap - b hrmJoin ; Go remove the mapping... - - .align 5 - -hppNone: bl mapPhysUnlock ; Time to unlock the physical entry - - bt++ pf64Bitb,hppSF3 ; skip if 64-bit (only they take the hint)... - - mtmsr r11 ; Restore enables/translation/etc. - isync - b hppRetnCmn ; Join the common return code... - -hppSF3: mtmsrd r11 ; Restore enables/translation/etc. - isync - -; -; NOTE: we have not used any registers other than the volatiles to this point -; - -hppRetnCmn: lwz r12,(FM_ALIGN(hrmStackSize)+FM_SIZE+FM_LR_SAVE)(r1) ; Restore the return - - li r3,mapRtEmpty ; Physent chain is empty - mtlr r12 ; Restore the return - lwz r1,0(r1) ; Pop the stack - blr ; Leave... - -/* - * mapping *hw_purge_map(pmap, vaddr, addr64_t *next) - remove a mapping from the system. - * - * Upon entry, R3 contains a pointer to a pmap. 
Since vaddr is - * a 64-bit quantity, it is a long long so it is in R4 and R5. - * - * We return the virtual address of the removed mapping as a - * R3. - * - * Note that this is designed to be called from 32-bit mode with a stack. - * - * We disable translation and all interruptions here. This keeps is - * from having to worry about a deadlock due to having anything locked - * and needing it to process a fault. - * - * Note that this must be done with both interruptions off and VM off - * - * Remove a mapping which can be reestablished by VM - * - */ - - .align 5 - .globl EXT(hw_purge_map) - -LEXT(hw_purge_map) - stwu r1,-(FM_ALIGN(hrmStackSize)+FM_SIZE)(r1) ; Make some space on the stack - mflr r0 ; Save the link register - stw r15,FM_ARG0+0x00(r1) ; Save a register - stw r16,FM_ARG0+0x04(r1) ; Save a register - stw r17,FM_ARG0+0x08(r1) ; Save a register - stw r18,FM_ARG0+0x0C(r1) ; Save a register - stw r19,FM_ARG0+0x10(r1) ; Save a register - mfsprg r19,2 ; Get feature flags - stw r20,FM_ARG0+0x14(r1) ; Save a register - stw r21,FM_ARG0+0x18(r1) ; Save a register - mtcrf 0x02,r19 ; move pf64Bit cr6 - stw r22,FM_ARG0+0x1C(r1) ; Save a register - stw r23,FM_ARG0+0x20(r1) ; Save a register - stw r24,FM_ARG0+0x24(r1) ; Save a register - stw r25,FM_ARG0+0x28(r1) ; Save a register - stw r26,FM_ARG0+0x2C(r1) ; Save a register - stw r27,FM_ARG0+0x30(r1) ; Save a register - stw r28,FM_ARG0+0x34(r1) ; Save a register - stw r29,FM_ARG0+0x38(r1) ; Save a register - stw r30,FM_ARG0+0x3C(r1) ; Save a register - stw r31,FM_ARG0+0x40(r1) ; Save a register - stw r6,FM_ARG0+0x44(r1) ; Save address to save next mapped vaddr - stw r0,(FM_ALIGN(hrmStackSize)+FM_SIZE+FM_LR_SAVE)(r1) ; Save the return - -#if DEBUG - lwz r11,pmapFlags(r3) ; Get pmaps flags - rlwinm. r11,r11,0,pmapVMgsaa ; Is guest shadow assist active? 
- bne hpmPanic ; Call not valid for guest shadow assist pmap -#endif - - bt++ pf64Bitb,hpmSF1 ; skip if 64-bit (only they take the hint) - lwz r9,pmapvr+4(r3) ; Get conversion mask - b hpmSF1x ; Done... - -hpmSF1: ld r9,pmapvr(r3) ; Get conversion mask - -hpmSF1x: - bl EXT(mapSetUp) ; Turn off interrupts, translation, and possibly enter 64-bit - - xor r28,r3,r9 ; Convert the pmap to physical addressing - - mr r17,r11 ; Save the MSR - - la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkExclusive ; Go get an exclusive lock on the mapping lists - mr. r3,r3 ; Did we get the lock? - bne-- hrmBadLock ; Nope... -; -; Note that we do a full search (i.e., no shortcut level skips, etc.) -; here so that we will know the previous elements so we can dequeue them -; later. -; -hpmSearch: - mr r3,r28 ; Pass in pmap to search - mr r29,r4 ; Top half of vaddr - mr r30,r5 ; Bottom half of vaddr - bl EXT(mapSearchFull) ; Rescan the list - mr. r31,r3 ; Did we? (And remember mapping address for later) - or r0,r4,r5 ; Are we beyond the end? - mr r15,r4 ; Save top of next vaddr - cmplwi cr1,r0,0 ; See if there is another - mr r16,r5 ; Save bottom of next vaddr - bne-- hpmGotOne ; We found one, go check it out... - -hpmCNext: bne++ cr1,hpmSearch ; There is another to check... - b hrmNotFound ; No more in pmap to check... - -hpmGotOne: lwz r20,mpFlags(r3) ; Get the flags - andi. r0,r20,lo16(mpType|mpPerm) ; cr0_eq <- normal mapping && !permanent - rlwinm r21,r20,8,24,31 ; Extract the busy count - cmplwi cr2,r21,0 ; Is it busy? - crand cr0_eq,cr2_eq,cr0_eq ; not busy and can be removed? - beq++ hrmGotX ; Found, branch to remove the mapping... - b hpmCNext ; Nope... - -hpmPanic: lis r0,hi16(Choke) ; System abend - ori r0,r0,lo16(Choke) ; System abend - li r3,failMapping ; Show that we failed some kind of mapping thing - sc - -/* - * mapping *hw_purge_space(physent, pmap) - remove a mapping from the system based upon address space - * - * Upon entry, R3 contains a pointer to a pmap. 
- * pa is a pointer to the physent - * - * This function removes the first mapping for a specific pmap from a physical entry - * alias list. It locks the list, extracts the vaddr and pmap from - * the first apporpriate entry. It then jumps into the hw_rem_map function. - * NOTE: since we jump into rem_map, we need to set up the stack - * identically. Also, we set the next parm to 0 so we do not - * try to save a next vaddr. - * - * We return the virtual address of the removed mapping as a - * R3. - * - * Note that this is designed to be called from 32-bit mode with a stack. - * - * We disable translation and all interruptions here. This keeps is - * from having to worry about a deadlock due to having anything locked - * and needing it to process a fault. - * - * Note that this must be done with both interruptions off and VM off - * - * - * Remove mapping via physical page (mapping_purge) - * - * 1) lock physent - * 2) extract vaddr and pmap - * 3) unlock physent - * 4) do "remove mapping via pmap" - * - * - */ - - .align 5 - .globl EXT(hw_purge_space) - -LEXT(hw_purge_space) - stwu r1,-(FM_ALIGN(hrmStackSize)+FM_SIZE)(r1) ; Make some space on the stack - mflr r0 ; Save the link register - stw r15,FM_ARG0+0x00(r1) ; Save a register - stw r16,FM_ARG0+0x04(r1) ; Save a register - stw r17,FM_ARG0+0x08(r1) ; Save a register - mfsprg r2,2 ; Get feature flags - stw r18,FM_ARG0+0x0C(r1) ; Save a register - stw r19,FM_ARG0+0x10(r1) ; Save a register - stw r20,FM_ARG0+0x14(r1) ; Save a register - stw r21,FM_ARG0+0x18(r1) ; Save a register - stw r22,FM_ARG0+0x1C(r1) ; Save a register - mtcrf 0x02,r2 ; move pf64Bit cr6 - stw r23,FM_ARG0+0x20(r1) ; Save a register - stw r24,FM_ARG0+0x24(r1) ; Save a register - stw r25,FM_ARG0+0x28(r1) ; Save a register - stw r26,FM_ARG0+0x2C(r1) ; Save a register - stw r27,FM_ARG0+0x30(r1) ; Save a register - li r6,0 ; Set no next address return - stw r28,FM_ARG0+0x34(r1) ; Save a register - stw r29,FM_ARG0+0x38(r1) ; Save a register - stw 
r30,FM_ARG0+0x3C(r1) ; Save a register - stw r31,FM_ARG0+0x40(r1) ; Save a register - stw r6,FM_ARG0+0x44(r1) ; Save address to save next mapped vaddr - stw r0,(FM_ALIGN(hrmStackSize)+FM_SIZE+FM_LR_SAVE)(r1) ; Save the return - -#if DEBUG - lwz r11,pmapFlags(r4) ; Get pmaps flags - rlwinm. r11,r11,0,pmapVMgsaa ; Is guest shadow assist active? - bne hpsPanic ; Call not valid for guest shadow assist pmap -#endif - - bt++ pf64Bitb,hpsSF1 ; skip if 64-bit (only they take the hint) - - lwz r9,pmapvr+4(r4) ; Get conversion mask for pmap - - b hpsSF1x ; Done... - -hpsSF1: ld r9,pmapvr(r4) ; Get conversion mask for pmap - -hpsSF1x: bl EXT(mapSetUp) ; Turn off interrupts, translation, and possibly enter 64-bit - - xor r4,r4,r9 ; Convert the pmap to physical addressing - - bl mapPhysLock ; Lock the physent - - lwz r8,pmapSpace(r4) ; Get the space hash - - bt++ pf64Bitb,hpsSF ; skip if 64-bit (only they take the hint) - - lwz r12,ppLink+4(r3) ; Grab the pointer to the first mapping - -hpsSrc32: rlwinm. r12,r12,0,~ppFlags ; Clean and test mapping address - beq hpsNone ; Did not find one... - - lhz r10,mpSpace(r12) ; Get the space - - cmplw r10,r8 ; Is this one of ours? - beq hpsFnd ; Yes... - - lwz r12,mpAlias+4(r12) ; Chain on to the next - b hpsSrc32 ; Check it out... - - .align 5 - -hpsSF: li r0,ppLFAmask - ld r12,ppLink(r3) ; Get the pointer to the first mapping - rotrdi r0,r0,ppLFArrot ; Rotate clean up mask to get 0xF0000000000000000F - -hpsSrc64: andc. r12,r12,r0 ; Clean and test mapping address - beq hpsNone ; Did not find one... - - lhz r10,mpSpace(r12) ; Get the space - - cmplw r10,r8 ; Is this one of ours? - beq hpsFnd ; Yes... - - ld r12,mpAlias(r12) ; Chain on to the next - b hpsSrc64 ; Check it out... - - .align 5 - -hpsFnd: mr r28,r4 ; Set the pmap physical address - lwz r4,mpVAddr(r12) ; Get the top of the vaddr - lwz r5,mpVAddr+4(r12) ; and the bottom - - bl mapPhysUnlock ; Time to unlock the physical entry - b hrmJoin ; Go remove the mapping... 
- - .align 5 - -hpsNone: bl mapPhysUnlock ; Time to unlock the physical entry - - bt++ pf64Bitb,hpsSF3 ; skip if 64-bit (only they take the hint)... - - mtmsr r11 ; Restore enables/translation/etc. - isync - b hpsRetnCmn ; Join the common return code... - -hpsSF3: mtmsrd r11 ; Restore enables/translation/etc. - isync - -; -; NOTE: we have not used any registers other than the volatiles to this point -; - -hpsRetnCmn: lwz r12,(FM_ALIGN(hrmStackSize)+FM_SIZE+FM_LR_SAVE)(r1) ; Restore the return - - li r3,mapRtEmpty ; No mappings for specified pmap on physent chain - mtlr r12 ; Restore the return - lwz r1,0(r1) ; Pop the stack - blr ; Leave... - -hpsPanic: lis r0,hi16(Choke) ; System abend - ori r0,r0,lo16(Choke) ; System abend - li r3,failMapping ; Show that we failed some kind of mapping thing - sc - -/* - * mapping *hw_scrub_guest(physent, pmap) - remove first guest mapping associated with host - * on this physent chain - * - * Locates the first guest mapping on the physent chain that is associated with the - * specified host pmap. If this succeeds, the mapping is removed by joining the general - * remove path; otherwise, we return NULL. The caller is expected to invoke this entry - * repeatedly until no additional guest mappings that match our criteria are removed. - * - * Because this entry point exits through hw_rem_map, our prolog pushes its frame. 
- * - * Parameters: - * r3 : physent, 32-bit kernel virtual address - * r4 : host pmap, 32-bit kernel virtual address - * - * Volatile register usage (for linkage through hrmJoin): - * r4 : high-order 32 bits of guest virtual address - * r5 : low-order 32 bits of guest virtual address - * r11: saved MSR image - * - * Non-volatile register usage: - * r26: VMM extension block's physical address - * r27: host pmap's physical address - * r28: guest pmap's physical address - * - */ - - .align 5 - .globl EXT(hw_scrub_guest) - -LEXT(hw_scrub_guest) - stwu r1,-(FM_ALIGN(hrmStackSize)+FM_SIZE)(r1) ; Make some space on the stack - mflr r0 ; Save the link register - stw r15,FM_ARG0+0x00(r1) ; Save a register - stw r16,FM_ARG0+0x04(r1) ; Save a register - stw r17,FM_ARG0+0x08(r1) ; Save a register - mfsprg r2,2 ; Get feature flags - stw r18,FM_ARG0+0x0C(r1) ; Save a register - stw r19,FM_ARG0+0x10(r1) ; Save a register - stw r20,FM_ARG0+0x14(r1) ; Save a register - stw r21,FM_ARG0+0x18(r1) ; Save a register - stw r22,FM_ARG0+0x1C(r1) ; Save a register - mtcrf 0x02,r2 ; move pf64Bit cr6 - stw r23,FM_ARG0+0x20(r1) ; Save a register - stw r24,FM_ARG0+0x24(r1) ; Save a register - stw r25,FM_ARG0+0x28(r1) ; Save a register - stw r26,FM_ARG0+0x2C(r1) ; Save a register - stw r27,FM_ARG0+0x30(r1) ; Save a register - li r6,0 ; Set no next address return - stw r28,FM_ARG0+0x34(r1) ; Save a register - stw r29,FM_ARG0+0x38(r1) ; Save a register - stw r30,FM_ARG0+0x3C(r1) ; Save a register - stw r31,FM_ARG0+0x40(r1) ; Save a register - stw r6,FM_ARG0+0x44(r1) ; Save address to save next mapped vaddr - stw r0,(FM_ALIGN(hrmStackSize)+FM_SIZE+FM_LR_SAVE)(r1) ; Save the return - - lwz r11,pmapVmmExt(r4) ; get VMM pmap extension block vaddr - - bt++ pf64Bitb,hsg64Salt ; Test for 64-bit machine - lwz r26,pmapVmmExtPhys+4(r4) ; Get VMM pmap extension block paddr - lwz r9,pmapvr+4(r4) ; Get 32-bit virt<->real conversion salt - b hsgStart ; Get to work - -hsg64Salt: ld r26,pmapVmmExtPhys(r4) ; Get 
VMM pmap extension block paddr - ld r9,pmapvr+4(r4) ; Get 64-bit virt<->real conversion salt - -hsgStart: bl EXT(mapSetUp) ; Disable 'rupts, translation, enter 64-bit mode - xor r27,r4,r9 ; Convert host pmap_t virt->real - bl mapPhysLock ; Lock the physent - - bt++ pf64Bitb,hsg64Scan ; Test for 64-bit machine - - lwz r12,ppLink+4(r3) ; Grab the pointer to the first mapping -hsg32Loop: rlwinm. r12,r12,0,~ppFlags ; Clean and test mapping address - beq hsg32Miss ; Did not find one... - lwz r8,mpFlags(r12) ; Get mapping's flags - lhz r7,mpSpace(r12) ; Get mapping's space id - rlwinm r8,r8,0,mpType ; Extract mapping's type code - lis r28,hi16(EXT(pmapTrans)) ; Get the top of the start of the pmap hash to pmap translate table - xori r8,r8,mpGuest ; Is it a guest mapping? - ori r28,r28,lo16(EXT(pmapTrans)) ; Get the top of the start of the pmap hash to pmap translate table - slwi r9,r7,2 ; Multiply space by 4 - lwz r28,0(r28) ; Get the actual translation map - lwz r4,mpVAddr(r12) ; Get the top of the vaddr - slwi r7,r7,3 ; Multiply space by 8 - lwz r5,mpVAddr+4(r12) ; Get the bottom of the vaddr - add r7,r7,r9 ; Get correct displacement into translate table - add r28,r28,r7 ; Point to the pmap translation - lwz r28,pmapPAddr+4(r28) ; Get guest pmap paddr - lwz r7,pmapVmmExtPhys+4(r28) ; Get VMM extension block paddr - xor r7,r7,r26 ; Is guest associated with specified host? - or. r7,r7,r8 ; Guest mapping && associated with host? 
- lwz r12,mpAlias+4(r12) ; Chain on to the next - bne hsg32Loop ; Try next mapping on alias chain - -hsg32Hit: bl mapPhysUnlock ; Unlock physent chain - b hrmJoin ; Join common path for mapping removal - - .align 5 -hsg32Miss: bl mapPhysUnlock ; Unlock physent chain - mtmsr r11 ; Restore 'rupts, translation - isync ; Throw a small wrench into the pipeline - li r3,mapRtEmpty ; No mappings found matching specified criteria - b hrmRetnCmn ; Exit through common epilog - - .align 5 -hsg64Scan: li r6,ppLFAmask ; Get lock, flag, attribute mask seed - ld r12,ppLink(r3) ; Grab the pointer to the first mapping - rotrdi r6,r6,ppLFArrot ; Rotate clean up mask to get 0xF0000000000000000F -hsg64Loop: andc. r12,r12,r6 ; Clean and test mapping address - beq hsg64Miss ; Did not find one... - lwz r8,mpFlags(r12) ; Get mapping's flags - lhz r7,mpSpace(r12) ; Get mapping's space id - rlwinm r8,r8,0,mpType ; Extract mapping's type code - lis r28,hi16(EXT(pmapTrans)) ; Get the top of the start of the pmap hash to pmap translate table - xori r8,r8,mpGuest ; Is it a guest mapping? - ori r28,r28,lo16(EXT(pmapTrans)) ; Get the top of the start of the pmap hash to pmap translate table - slwi r9,r7,2 ; Multiply space by 4 - lwz r28,0(r28) ; Get the actual translation map - lwz r4,mpVAddr(r12) ; Get the top of the vaddr - slwi r7,r7,3 ; Multiply space by 8 - lwz r5,mpVAddr+4(r12) ; Get the bottom of the vaddr - add r7,r7,r9 ; Get correct displacement into translate table - add r28,r28,r7 ; Point to the pmap translation - ld r28,pmapPAddr(r28) ; Get guest pmap paddr - ld r7,pmapVmmExtPhys(r28) ; Get VMM extension block paddr - xor r7,r7,r26 ; Is guest associated with specified host? - or. r7,r7,r8 ; Guest mapping && associated with host? 
- ld r12,mpAlias(r12) ; Chain on to the next - bne hsg64Loop ; Try next mapping on alias chain - -hsg64Hit: bl mapPhysUnlock ; Unlock physent chain - b hrmJoin ; Join common path for mapping removal - - .align 5 -hsg64Miss: bl mapPhysUnlock ; Unlock physent chain - mtmsrd r11 ; Restore 'rupts, translation - li r3,mapRtEmpty ; No mappings found matching specified criteria - b hrmRetnCmn ; Exit through common epilog - - -/* - * mapping *hw_find_space(physent, space) - finds the first mapping on physent for specified space - * - * Upon entry, R3 contains a pointer to a physent. - * space is the space ID from the pmap in question - * - * We return the virtual address of the found mapping in - * R3. Note that the mapping busy is bumped. - * - * Note that this is designed to be called from 32-bit mode with a stack. - * - * We disable translation and all interruptions here. This keeps is - * from having to worry about a deadlock due to having anything locked - * and needing it to process a fault. - * - */ - - .align 5 - .globl EXT(hw_find_space) - -LEXT(hw_find_space) - stwu r1,-(FM_SIZE)(r1) ; Make some space on the stack - mflr r0 ; Save the link register - mr r8,r4 ; Remember the space - stw r0,(FM_SIZE+FM_LR_SAVE)(r1) ; Save the return - - bl EXT(mapSetUp) ; Turn off interrupts, translation, and possibly enter 64-bit - - bl mapPhysLock ; Lock the physent - - bt++ pf64Bitb,hfsSF ; skip if 64-bit (only they take the hint) - - lwz r12,ppLink+4(r3) ; Grab the pointer to the first mapping - -hfsSrc32: rlwinm. r12,r12,0,~ppFlags ; Clean and test mapping address - beq hfsNone ; Did not find one... - - lhz r10,mpSpace(r12) ; Get the space - - cmplw r10,r8 ; Is this one of ours? - beq hfsFnd ; Yes... - - lwz r12,mpAlias+4(r12) ; Chain on to the next - b hfsSrc32 ; Check it out... - - .align 5 - -hfsSF: li r0,ppLFAmask - ld r12,ppLink(r3) ; Get the pointer to the first mapping - rotrdi r0,r0,ppLFArrot ; Rotate clean up mask to get 0xF0000000000000000F - -hfsSrc64: andc. 
r12,r12,r0 ; Clean and test mapping address - beq hfsNone ; Did not find one... - - lhz r10,mpSpace(r12) ; Get the space - - cmplw r10,r8 ; Is this one of ours? - beq hfsFnd ; Yes... - - ld r12,mpAlias(r12) ; Chain on to the next - b hfsSrc64 ; Check it out... - - .align 5 - -hfsFnd: mr r8,r3 ; Save the physent - mr r3,r12 ; Point to the mapping - bl mapBumpBusy ; If we found it, bump up the busy count so the mapping does not disapear - - mr r3,r8 ; Get back the physical entry - li r7,0xFFF ; Get a page size mask - bl mapPhysUnlock ; Time to unlock the physical entry - - andc r3,r12,r7 ; Move the mapping back down to a page - lwz r3,mbvrswap+4(r3) ; Get last half of virtual to real swap - xor r12,r3,r12 ; Convert to virtual - b hfsRet ; Time to return - - .align 5 - -hfsNone: bl mapPhysUnlock ; Time to unlock the physical entry - -hfsRet: bt++ pf64Bitb,hfsSF3 ; skip if 64-bit (only they take the hint)... - - mtmsr r11 ; Restore enables/translation/etc. - isync - b hfsRetnCmn ; Join the common return code... - -hfsSF3: mtmsrd r11 ; Restore enables/translation/etc. - isync - -; -; NOTE: we have not used any registers other than the volatiles to this point -; - -hfsRetnCmn: mr r3,r12 ; Get the mapping or a 0 if we failed - -#if DEBUG - mr. r3,r3 ; Anything to return? - beq hfsRetnNull ; Nope - lwz r11,mpFlags(r3) ; Get mapping flags - rlwinm r0,r11,0,mpType ; Isolate the mapping type - cmplwi r0,mpGuest ; Shadow guest mapping? - beq hfsPanic ; Yup, kick the bucket -hfsRetnNull: -#endif - - lwz r12,(FM_SIZE+FM_LR_SAVE)(r1) ; Restore the return - - mtlr r12 ; Restore the return - lwz r1,0(r1) ; Pop the stack - blr ; Leave... - -hfsPanic: lis r0,hi16(Choke) ; System abend - ori r0,r0,lo16(Choke) ; System abend - li r3,failMapping ; Show that we failed some kind of mapping thing - sc - -; -; mapping *hw_find_map(pmap, va, *nextva) - Looks up a vaddr in a pmap -; Returns 0 if not found or the virtual address of the mapping if -; if is. 
Also, the mapping has the busy count bumped. -; - .align 5 - .globl EXT(hw_find_map) - -LEXT(hw_find_map) - stwu r1,-(FM_ALIGN((31-25+1)*4)+FM_SIZE)(r1) ; Make some space on the stack - mflr r0 ; Save the link register - stw r25,FM_ARG0+0x00(r1) ; Save a register - stw r26,FM_ARG0+0x04(r1) ; Save a register - mr r25,r6 ; Remember address of next va - stw r27,FM_ARG0+0x08(r1) ; Save a register - stw r28,FM_ARG0+0x0C(r1) ; Save a register - stw r29,FM_ARG0+0x10(r1) ; Save a register - stw r30,FM_ARG0+0x14(r1) ; Save a register - stw r31,FM_ARG0+0x18(r1) ; Save a register - stw r0,(FM_ALIGN((31-26+1)*4)+FM_SIZE+FM_LR_SAVE)(r1) ; Save the return - -#if DEBUG - lwz r11,pmapFlags(r3) ; Get pmaps flags - rlwinm. r11,r11,0,pmapVMgsaa ; Is guest shadow assist active? - bne hfmPanic ; Call not valid for guest shadow assist pmap -#endif - - lwz r6,pmapvr(r3) ; Get the first part of the VR translation for pmap - lwz r7,pmapvr+4(r3) ; Get the second part - - - bl EXT(mapSetUp) ; Turn off interrupts, translation, and possibly enter 64-bit - - mr r27,r11 ; Remember the old MSR - mr r26,r12 ; Remember the feature bits - - xor r28,r3,r7 ; Change the common 32- and 64-bit half - - bf-- pf64Bitb,hfmSF1 ; skip if 32-bit... - - rldimi r28,r6,32,0 ; Shift the fixed upper part of the physical over and cram in top - -hfmSF1: mr r29,r4 ; Save top half of vaddr - mr r30,r5 ; Save the bottom half - - la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkShared ; Go get a shared lock on the mapping lists - mr. r3,r3 ; Did we get the lock? - bne-- hfmBadLock ; Nope... - - mr r3,r28 ; get the pmap address - mr r4,r29 ; Get bits 0:31 to look for - mr r5,r30 ; Get bits 32:64 - - bl EXT(mapSearch) ; Go see if we can find it (note: R7 comes back with mpFlags) - - rlwinm r0,r7,0,mpRIPb,mpRIPb ; Find remove in progress bit - mr. r31,r3 ; Save the mapping if we found it - cmplwi cr1,r0,0 ; Are we removing? 
- mr r29,r4 ; Save next va high half - crorc cr0_eq,cr0_eq,cr1_eq ; Not found or removing - mr r30,r5 ; Save next va low half - li r6,0 ; Assume we did not find it - li r26,0xFFF ; Get a mask to relocate to start of mapping page - - bt-- cr0_eq,hfmNotFnd ; We did not find it... - - bl mapBumpBusy ; If we found it, bump up the busy count so the mapping does not disapear - - andc r4,r31,r26 ; Get back to the mapping page start - -; Note: we can treat 32- and 64-bit the same here. Because we are going from -; physical to virtual and we only do 32-bit virtual, we only need the low order -; word of the xor. - - lwz r4,mbvrswap+4(r4) ; Get last half of virtual to real swap - li r6,-1 ; Indicate we found it and it is not being removed - xor r31,r31,r4 ; Flip to virtual - -hfmNotFnd: la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkUnlock ; Unlock the search list - - rlwinm r3,r31,0,0,31 ; Move mapping to return register and clear top of register if 64-bit - and r3,r3,r6 ; Clear if not found or removing - -hfmReturn: bt++ pf64Bitb,hfmR64 ; Yes... - - mtmsr r27 ; Restore enables/translation/etc. - isync - b hfmReturnC ; Join common... - -hfmR64: mtmsrd r27 ; Restore enables/translation/etc. - isync - -hfmReturnC: stw r29,0(r25) ; Save the top of the next va - stw r30,4(r25) ; Save the bottom of the next va - lwz r0,(FM_ALIGN((31-25+1)*4)+FM_SIZE+FM_LR_SAVE)(r1) ; Save the return - lwz r25,FM_ARG0+0x00(r1) ; Restore a register - lwz r26,FM_ARG0+0x04(r1) ; Restore a register - and r3,r3,r6 ; Clear return if the mapping is being removed - lwz r27,FM_ARG0+0x08(r1) ; Restore a register - mtlr r0 ; Restore the return - lwz r28,FM_ARG0+0x0C(r1) ; Restore a register - lwz r29,FM_ARG0+0x10(r1) ; Restore a register - lwz r30,FM_ARG0+0x14(r1) ; Restore a register - lwz r31,FM_ARG0+0x18(r1) ; Restore a register - lwz r1,0(r1) ; Pop the stack - blr ; Leave... - - .align 5 - -hfmBadLock: li r3,1 ; Set lock time out error code - b hfmReturn ; Leave.... 
- -hfmPanic: lis r0,hi16(Choke) ; System abend - ori r0,r0,lo16(Choke) ; System abend - li r3,failMapping ; Show that we failed some kind of mapping thing - sc - - -/* - * void hw_clear_maps(void) - * - * Remove all mappings for all phys entries. - * - * - */ - - .align 5 - .globl EXT(hw_clear_maps) - -LEXT(hw_clear_maps) - mflr r10 ; Save the link register - mfcr r9 ; Save the condition register - bl EXT(mapSetUp) ; Turn off interrupts, translation, and possibly enter 64-bit - - lis r5,hi16(EXT(pmap_mem_regions)) ; Point to the start of the region table - ori r5,r5,lo16(EXT(pmap_mem_regions)) ; Point to the start of the region table - -hcmNextRegion: - lwz r3,mrPhysTab(r5) ; Get the actual table address - lwz r0,mrStart(r5) ; Get start of table entry - lwz r4,mrEnd(r5) ; Get end of table entry - addi r5,r5,mrSize ; Point to the next regions - - cmplwi r3,0 ; No more regions? - beq-- hcmDone ; Leave... - - sub r4,r4,r0 ; Calculate physical entry count - addi r4,r4,1 - mtctr r4 - - bt++ pf64Bitb,hcmNextPhys64 ; 64-bit version - - -hcmNextPhys32: - lwz r4,ppLink+4(r3) ; Grab the pointer to the first mapping - addi r3,r3,physEntrySize ; Next phys_entry - -hcmNextMap32: - rlwinm. r4,r4,0,~ppFlags ; Clean and test mapping address - beq hcmNoMap32 ; Did not find one... - - lwz r0,mpPte(r4) ; Grab the offset to the PTE - rlwinm r0,r0,0,~mpHValid ; Clear out valid bit - stw r0,mpPte(r4) ; Get the quick pointer again - - lwz r4,mpAlias+4(r4) ; Chain on to the next - b hcmNextMap32 ; Check it out... -hcmNoMap32: - bdnz hcmNextPhys32 - b hcmNextRegion - - - .align 5 -hcmNextPhys64: - li r0,ppLFAmask ; Get mask to clean up mapping pointer - ld r4,ppLink(r3) ; Get the pointer to the first mapping - rotrdi r0,r0,ppLFArrot ; Rotate clean up mask to get 0xF0000000000000000F - addi r3,r3,physEntrySize ; Next phys_entry - -hcmNextMap64: - andc. r4,r4,r0 ; Clean and test mapping address - beq hcmNoMap64 ; Did not find one... 
- - lwz r0,mpPte(r4) ; Grab the offset to the PTE - rlwinm r0,r0,0,~mpHValid ; Clear out valid bit - stw r0,mpPte(r4) ; Get the quick pointer again - - ld r4,mpAlias(r4) ; Chain on to the next - li r0,ppLFAmask ; Get mask to clean up mapping pointer - rotrdi r0,r0,ppLFArrot ; Rotate clean up mask to get 0xF0000000000000000F - b hcmNextMap64 ; Check it out... -hcmNoMap64: - bdnz hcmNextPhys64 - b hcmNextRegion - - - .align 5 -hcmDone: - mtlr r10 ; Restore the return - mtcr r9 ; Restore the condition register - bt++ pf64Bitb,hcmDone64 ; 64-bit version -hcmDone32: - mtmsr r11 ; Restore translation/mode/etc. - isync - blr ; Leave... - -hcmDone64: - mtmsrd r11 ; Restore translation/mode/etc. - isync - blr ; Leave... - - - -/* - * unsigned int hw_walk_phys(pp, preop, op, postop, parm, opmod) - * walks all mapping for a physical page and performs - * specified operations on each. - * - * pp is unlocked physent - * preop is operation to perform on physent before walk. This would be - * used to set cache attribute or protection - * op is the operation to perform on each mapping during walk - * postop is operation to perform in the phsyent after walk. this would be - * used to set or reset the RC bits. - * opmod modifies the action taken on any connected PTEs visited during - * the mapping walk. - * - * We return the RC bits from before postop is run. - * - * Note that this is designed to be called from 32-bit mode with a stack. - * - * We disable translation and all interruptions here. This keeps is - * from having to worry about a deadlock due to having anything locked - * and needing it to process a fault. - * - * We lock the physent, execute preop, and then walk each mapping in turn. - * If there is a PTE, it is invalidated and the RC merged into the physent. - * Then we call the op function. - * Then we revalidate the PTE. - * Once all all mappings are finished, we save the physent RC and call the - * postop routine. Then we unlock the physent and return the RC. 
- * - * - */ - - .align 5 - .globl EXT(hw_walk_phys) - -LEXT(hw_walk_phys) - stwu r1,-(FM_ALIGN((31-24+1)*4)+FM_SIZE)(r1) ; Make some space on the stack - mflr r0 ; Save the link register - stw r24,FM_ARG0+0x00(r1) ; Save a register - stw r25,FM_ARG0+0x04(r1) ; Save a register - stw r26,FM_ARG0+0x08(r1) ; Save a register - stw r27,FM_ARG0+0x0C(r1) ; Save a register - mr r24,r8 ; Save the parm - mr r25,r7 ; Save the parm - stw r28,FM_ARG0+0x10(r1) ; Save a register - stw r29,FM_ARG0+0x14(r1) ; Save a register - stw r30,FM_ARG0+0x18(r1) ; Save a register - stw r31,FM_ARG0+0x1C(r1) ; Save a register - stw r0,(FM_ALIGN((31-24+1)*4)+FM_SIZE+FM_LR_SAVE)(r1) ; Save the return - - bl EXT(mapSetUp) ; Turn off interrupts, translation, and possibly enter 64-bit - - mfsprg r26,0 ; (INSTRUMENTATION) - lwz r27,hwWalkPhys(r26) ; (INSTRUMENTATION) - addi r27,r27,1 ; (INSTRUMENTATION) - stw r27,hwWalkPhys(r26) ; (INSTRUMENTATION) - la r26,hwWalkFull(r26) ; (INSTRUMENTATION) - slwi r12,r24,2 ; (INSTRUMENTATION) - lwzx r27,r26,r12 ; (INSTRUMENTATION) - addi r27,r27,1 ; (INSTRUMENTATION) - stwx r27,r26,r12 ; (INSTRUMENTATION) - - mr r26,r11 ; Save the old MSR - lis r27,hi16(hwpOpBase) ; Get high order of op base - slwi r4,r4,7 ; Convert preop to displacement - ori r27,r27,lo16(hwpOpBase) ; Get low order of op base - slwi r5,r5,7 ; Convert op to displacement - add r12,r4,r27 ; Point to the preop routine - slwi r28,r6,7 ; Convert postop to displacement - mtctr r12 ; Set preop routine - add r28,r28,r27 ; Get the address of the postop routine - add r27,r5,r27 ; Get the address of the op routine - - bl mapPhysLock ; Lock the physent - - mr r29,r3 ; Save the physent address - - bt++ pf64Bitb,hwp64 ; skip if 64-bit (only they take the hint) - - bctrl ; Call preop routine - bne- hwpEarly32 ; preop says to bail now... 
- - cmplwi r24,hwpMergePTE ; Classify operation modifier - mtctr r27 ; Set up the op function address - lwz r31,ppLink+4(r3) ; Grab the pointer to the first mapping - blt hwpSrc32 ; Do TLB invalidate/purge/merge/reload for each mapping - beq hwpMSrc32 ; Do TLB merge for each mapping - -hwpQSrc32: rlwinm. r31,r31,0,~ppFlags ; Clean and test mapping address - beq hwpNone32 ; Did not find one... - - bctrl ; Call the op function - - bne- hwpEarly32 ; op says to bail now... - lwz r31,mpAlias+4(r31) ; Chain on to the next - b hwpQSrc32 ; Check it out... - - .align 5 -hwpMSrc32: rlwinm. r31,r31,0,~ppFlags ; Clean and test mapping address - beq hwpNone32 ; Did not find one... - - bl mapMergeRC32 ; Merge reference and change into mapping and physent - bctrl ; Call the op function - - bne- hwpEarly32 ; op says to bail now... - lwz r31,mpAlias+4(r31) ; Chain on to the next - b hwpMSrc32 ; Check it out... - - .align 5 -hwpSrc32: rlwinm. r31,r31,0,~ppFlags ; Clean and test mapping address - beq hwpNone32 ; Did not find one... - -; -; Note: mapInvPte32 returns the PTE in R3 (or 0 if none), PTE high in R4, -; PTE low in R5. The PCA address is in R7. The PTEG come back locked. -; If there is no PTE, PTE low is obtained from mapping -; - bl mapInvPte32 ; Invalidate and lock PTE, also merge into physent - - bctrl ; Call the op function - - crmove cr1_eq,cr0_eq ; Save the return code - - mr. r3,r3 ; Was there a previously valid PTE? - beq- hwpNxt32 ; Nope... - - stw r5,4(r3) ; Store second half of PTE - eieio ; Make sure we do not reorder - stw r4,0(r3) ; Revalidate the PTE - - eieio ; Make sure all updates come first - stw r6,0(r7) ; Unlock the PCA - -hwpNxt32: bne- cr1,hwpEarly32 ; op says to bail now... - lwz r31,mpAlias+4(r31) ; Chain on to the next - b hwpSrc32 ; Check it out... 
- - .align 5 - -hwpNone32: mtctr r28 ; Get the post routine address - - lwz r30,ppLink+4(r29) ; Save the old RC - mr r3,r29 ; Get the physent address - bctrl ; Call post routine - - bl mapPhysUnlock ; Unlock the physent - - mtmsr r26 ; Restore translation/mode/etc. - isync - - b hwpReturn ; Go restore registers and return... - - .align 5 - -hwpEarly32: lwz r30,ppLink+4(r29) ; Save the old RC - mr r3,r29 ; Get the physent address - bl mapPhysUnlock ; Unlock the physent - - mtmsr r26 ; Restore translation/mode/etc. - isync - - b hwpReturn ; Go restore registers and return... - - .align 5 - -hwp64: bctrl ; Call preop routine - bne-- hwpEarly64 ; preop says to bail now... - - cmplwi r24,hwpMergePTE ; Classify operation modifier - mtctr r27 ; Set up the op function address - - li r24,ppLFAmask - ld r31,ppLink(r3) ; Get the pointer to the first mapping - rotrdi r24,r24,ppLFArrot ; Rotate clean up mask to get 0xF0000000000000000F - blt hwpSrc64 ; Do TLB invalidate/purge/merge/reload for each mapping - beq hwpMSrc64 ; Do TLB merge for each mapping - -hwpQSrc64: andc. r31,r31,r24 ; Clean and test mapping address - beq hwpNone64 ; Did not find one... - - bctrl ; Call the op function - - bne-- hwpEarly64 ; op says to bail now... - ld r31,mpAlias(r31) ; Chain on to the next - b hwpQSrc64 ; Check it out... - - .align 5 -hwpMSrc64: andc. r31,r31,r24 ; Clean and test mapping address - beq hwpNone64 ; Did not find one... - - bl mapMergeRC64 ; Merge reference and change into mapping and physent - bctrl ; Call the op function - - bne-- hwpEarly64 ; op says to bail now... - ld r31,mpAlias(r31) ; Chain on to the next - b hwpMSrc64 ; Check it out... - - .align 5 -hwpSrc64: andc. r31,r31,r24 ; Clean and test mapping address - beq hwpNone64 ; Did not find one... -; -; Note: mapInvPte64 returns the PTE in R3 (or 0 if none), PTE high in R4, -; PTE low in R5. 
PTEG comes back locked if there is one -; - bl mapInvPte64 ; Invalidate and lock PTEG, also merge into physent - - bctrl ; Call the op function - - crmove cr1_eq,cr0_eq ; Save the return code - - mr. r3,r3 ; Was there a previously valid PTE? - beq-- hwpNxt64 ; Nope... - - std r5,8(r3) ; Save bottom of PTE - eieio ; Make sure we do not reorder - std r4,0(r3) ; Revalidate the PTE - - eieio ; Make sure all updates come first - stw r6,0(r7) ; Unlock the PCA - -hwpNxt64: bne-- cr1,hwpEarly64 ; op says to bail now... - ld r31,mpAlias(r31) ; Chain on to the next - b hwpSrc64 ; Check it out... - - .align 5 - -hwpNone64: mtctr r28 ; Get the post routine address - - lwz r30,ppLink+4(r29) ; Save the old RC - mr r3,r29 ; Get the physent address - bctrl ; Call post routine - - bl mapPhysUnlock ; Unlock the physent - - mtmsrd r26 ; Restore translation/mode/etc. - isync - b hwpReturn ; Go restore registers and return... - - .align 5 - -hwpEarly64: lwz r30,ppLink+4(r29) ; Save the old RC - mr r3,r29 ; Get the physent address - bl mapPhysUnlock ; Unlock the physent - - mtmsrd r26 ; Restore translation/mode/etc. - isync - -hwpReturn: lwz r0,(FM_ALIGN((31-24+1)*4)+FM_SIZE+FM_LR_SAVE)(r1) ; Restore the return - lwz r24,FM_ARG0+0x00(r1) ; Restore a register - lwz r25,FM_ARG0+0x04(r1) ; Restore a register - lwz r26,FM_ARG0+0x08(r1) ; Restore a register - mr r3,r30 ; Pass back the RC - lwz r27,FM_ARG0+0x0C(r1) ; Restore a register - lwz r28,FM_ARG0+0x10(r1) ; Restore a register - mtlr r0 ; Restore the return - lwz r29,FM_ARG0+0x14(r1) ; Restore a register - lwz r30,FM_ARG0+0x18(r1) ; Restore a register - lwz r31,FM_ARG0+0x1C(r1) ; Restore a register - lwz r1,0(r1) ; Pop the stack - blr ; Leave... - - -; -; The preop/op/postop function table. -; Each function must be 64-byte aligned and be no more than -; 16 instructions. If more than 16, we must fix address calculations -; at the start of hwpOpBase -; -; The routine must set CR0_EQ in order to continue scan. 
-; If CR0_EQ is not set, an early return from the function is made. -; - - .align 7 - -hwpOpBase: - -; Function 0 - No operation - -hwpNoop: cmplw r0,r0 ; Make sure CR0_EQ is set - blr ; Just return... - - .align 5 - -; This is the continuation of function 4 - Set attributes in mapping - -; We changed the attributes of a mapped page. Make sure there are no cache paradoxes. -; NOTE: Do we have to deal with i-cache here? - -hwpSAM: li r11,4096 ; Get page size - -hwpSAMinvd: sub. r11,r11,r9 ; Back off a line - dcbf r11,r5 ; Flush the line in the data cache - bgt++ hwpSAMinvd ; Go do the rest of it... - - sync ; Make sure it is done - - li r11,4096 ; Get page size - -hwpSAMinvi: sub. r11,r11,r9 ; Back off a line - icbi r11,r5 ; Flush the line in the icache - bgt++ hwpSAMinvi ; Go do the rest of it... - - sync ; Make sure it is done - - cmpw r0,r0 ; Make sure we return CR0_EQ - blr ; Return... - - -; Function 1 - Set protection in physent (obsolete) - - .set .,hwpOpBase+(1*128) ; Generate error if previous function too long - -hwpSPrtPhy: cmplw r0,r0 ; Make sure we return CR0_EQ - blr ; Return... - - -; Function 2 - Set protection in mapping - -; NOTE: Changes to no-execute permission are ignored - - .set .,hwpOpBase+(2*128) ; Generate error if previous function too long - -hwpSPrtMap: lwz r9,mpFlags(r31) ; Get the mapping flags - lwz r8,mpVAddr+4(r31) ; Get the protection part of mapping - rlwinm. r9,r9,0,mpPermb,mpPermb ; Is the mapping permanent? - li r0,lo16(mpPP) ; Get protection bits - crnot cr0_eq,cr0_eq ; Change CR0_EQ to true if mapping is permanent - rlwinm r2,r25,0,mpPP ; Isolate new protection bits - beqlr-- ; Leave if permanent mapping (before we trash R5)... - andc r5,r5,r0 ; Clear the old prot bits - or r5,r5,r2 ; Move in the new prot bits - rlwimi r8,r5,0,20,31 ; Copy into the mapping copy - cmpw r0,r0 ; Make sure we return CR0_EQ - stw r8,mpVAddr+4(r31) ; Set the flag part of mapping - blr ; Leave... 
- -; Function 3 - Set attributes in physent - - .set .,hwpOpBase+(3*128) ; Generate error if previous function too long - -hwpSAtrPhy: li r5,ppLink ; Get offset for flag part of physent - -hwpSAtrPhX: lwarx r4,r5,r29 ; Get the old flags - rlwimi r4,r25,0,ppIb,ppGb ; Stick in the new attributes - stwcx. r4,r5,r29 ; Try to stuff it - bne-- hwpSAtrPhX ; Try again... -; Note: CR0_EQ is set because of stwcx. - blr ; Return... - -; Function 4 - Set attributes in mapping - - .set .,hwpOpBase+(4*128) ; Generate error if previous function too long - -hwpSAtrMap: lwz r9,mpFlags(r31) ; Get the mapping flags - lwz r8,mpVAddr+4(r31) ; Get the attribute part of mapping - li r2,mpM ; Force on coherent - rlwinm. r9,r9,0,mpPermb,mpPermb ; Is the mapping permanent? - li r0,lo16(mpWIMG) ; Get wimg mask - crnot cr0_eq,cr0_eq ; Change CR0_EQ to true if mapping is permanent - rlwimi r2,r25,32-(mpIb-32-ppIb),mpIb-32,mpIb-32 - ; Copy in the cache inhibited bit - beqlr-- ; Leave if permanent mapping (before we trash R5)... - andc r5,r5,r0 ; Clear the old wimg - rlwimi r2,r25,32-(mpGb-32-ppGb),mpGb-32,mpGb-32 - ; Copy in the guarded bit - mfsprg r9,2 ; Feature flags - or r5,r5,r2 ; Move in the new wimg - rlwimi r8,r5,0,20,31 ; Copy into the mapping copy - lwz r2,mpPAddr(r31) ; Get the physical address - li r0,0xFFF ; Start a mask - andi. r9,r9,pf32Byte+pf128Byte ; Get cache line size - rlwinm r5,r0,0,1,0 ; Copy to top half - stw r8,mpVAddr+4(r31) ; Set the flag part of mapping - rlwinm r2,r2,12,1,0 ; Copy to top and rotate to make physical address with junk left - and r5,r5,r2 ; Clean stuff in top 32 bits - andc r2,r2,r0 ; Clean bottom too - rlwimi r5,r2,0,0,31 ; Insert low 23 to make full physical address - b hwpSAM ; Join common - -; NOTE: we moved the remainder of the code out of here because it -; did not fit in the 128 bytes allotted. It got stuck into the free space -; at the end of the no-op function. 
- - - - -; Function 5 - Clear reference in physent - - .set .,hwpOpBase+(5*128) ; Generate error if previous function too long - -hwpCRefPhy: li r5,ppLink+4 ; Get offset for flag part of physent - -hwpCRefPhX: lwarx r4,r5,r29 ; Get the old flags - rlwinm r4,r4,0,ppRb+1-32,ppRb-1-32 ; Clear R - stwcx. r4,r5,r29 ; Try to stuff it - bne-- hwpCRefPhX ; Try again... -; Note: CR0_EQ is set because of stwcx. - blr ; Return... - - -; Function 6 - Clear reference in mapping - - .set .,hwpOpBase+(6*128) ; Generate error if previous function too long - -hwpCRefMap: li r0,lo16(mpR) ; Get reference bit - lwz r8,mpVAddr+4(r31) ; Get the flag part of mapping - andc r5,r5,r0 ; Clear in PTE copy - andc r8,r8,r0 ; and in the mapping - cmpw r0,r0 ; Make sure we return CR0_EQ - stw r8,mpVAddr+4(r31) ; Set the flag part of mapping - blr ; Return... - - -; Function 7 - Clear change in physent - - .set .,hwpOpBase+(7*128) ; Generate error if previous function too long - -hwpCCngPhy: li r5,ppLink+4 ; Get offset for flag part of physent - -hwpCCngPhX: lwarx r4,r5,r29 ; Get the old flags - rlwinm r4,r4,0,ppCb+1-32,ppCb-1-32 ; Clear C - stwcx. r4,r5,r29 ; Try to stuff it - bne-- hwpCCngPhX ; Try again... -; Note: CR0_EQ is set because of stwcx. - blr ; Return... - - -; Function 8 - Clear change in mapping - - .set .,hwpOpBase+(8*128) ; Generate error if previous function too long - -hwpCCngMap: li r0,lo16(mpC) ; Get change bit - lwz r8,mpVAddr+4(r31) ; Get the flag part of mapping - andc r5,r5,r0 ; Clear in PTE copy - andc r8,r8,r0 ; and in the mapping - cmpw r0,r0 ; Make sure we return CR0_EQ - stw r8,mpVAddr+4(r31) ; Set the flag part of mapping - blr ; Return... - - -; Function 9 - Set reference in physent - - .set .,hwpOpBase+(9*128) ; Generate error if previous function too long - -hwpSRefPhy: li r5,ppLink+4 ; Get offset for flag part of physent - -hwpSRefPhX: lwarx r4,r5,r29 ; Get the old flags - ori r4,r4,lo16(ppR) ; Set the reference - stwcx. 
r4,r5,r29 ; Try to stuff it - bne-- hwpSRefPhX ; Try again... -; Note: CR0_EQ is set because of stwcx. - blr ; Return... - - -; Function 10 - Set reference in mapping - - .set .,hwpOpBase+(10*128) ; Generate error if previous function too long - -hwpSRefMap: lwz r8,mpVAddr+4(r31) ; Get the flag part of mapping - ori r8,r8,lo16(mpR) ; Set reference in mapping - cmpw r0,r0 ; Make sure we return CR0_EQ - stw r8,mpVAddr+4(r31) ; Set the flag part of mapping - blr ; Return... - -; Function 11 - Set change in physent - - .set .,hwpOpBase+(11*128) ; Generate error if previous function too long - -hwpSCngPhy: li r5,ppLink+4 ; Get offset for flag part of physent - -hwpSCngPhX: lwarx r4,r5,r29 ; Get the old flags - ori r4,r4,lo16(ppC) ; Set the change bit - stwcx. r4,r5,r29 ; Try to stuff it - bne-- hwpSCngPhX ; Try again... -; Note: CR0_EQ is set because of stwcx. - blr ; Return... - -; Function 12 - Set change in mapping - - .set .,hwpOpBase+(12*128) ; Generate error if previous function too long - -hwpSCngMap: lwz r8,mpVAddr+4(r31) ; Get the flag part of mapping - ori r8,r8,lo16(mpC) ; Set chage in mapping - cmpw r0,r0 ; Make sure we return CR0_EQ - stw r8,mpVAddr+4(r31) ; Set the flag part of mapping - blr ; Return... - -; Function 13 - Test reference in physent - - .set .,hwpOpBase+(13*128) ; Generate error if previous function too long - -hwpTRefPhy: lwz r0,ppLink+4(r29) ; Get the flags from physent - rlwinm. r0,r0,0,ppRb-32,ppRb-32 ; Isolate reference bit and see if 0 - blr ; Return (CR0_EQ set to continue if reference is off)... - - -; Function 14 - Test reference in mapping - - .set .,hwpOpBase+(14*128) ; Generate error if previous function too long - -hwpTRefMap: rlwinm. r0,r5,0,mpRb-32,mpRb-32 ; Isolate reference bit and see if 0 - blr ; Return (CR0_EQ set to continue if reference is off)... 
- - -; Function 15 - Test change in physent - - .set .,hwpOpBase+(15*128) ; Generate error if previous function too long - -hwpTCngPhy: lwz r0,ppLink+4(r29) ; Get the flags from physent - rlwinm. r0,r0,0,ppCb-32,ppCb-32 ; Isolate change bit and see if 0 - blr ; Return (CR0_EQ set to continue if change is off)... - - -; Function 16 - Test change in mapping - - .set .,hwpOpBase+(16*128) ; Generate error if previous function too long - -hwpTCngMap: rlwinm. r0,r5,0,mpCb-32,mpCb-32 ; Isolate change bit and see if 0 - blr ; Return (CR0_EQ set to continue if change is off)... - - -; Function 17 - Test reference and change in physent - - .set .,hwpOpBase+(17*128) ; Generate error if previous function too long - -hwpTRefCngPhy: - lwz r0,ppLink+4(r29) ; Get the flags from physent - rlwinm r0,r0,0,ppRb-32,ppCb-32 ; Isolate reference and change bits - cmplwi r0,lo16(ppR|ppC) ; cr0_eq <- ((R == 1) && (C == 1)) - crnot cr0_eq,cr0_eq ; cr0_eq <- ((R == 0) || (C == 0)) - blr ; Return (CR0_EQ set to continue if either R or C is off)... - - -; Function 18 - Test reference and change in mapping - - .set .,hwpOpBase+(18*128) ; Generate error if previous function too long -hwpTRefCngMap: - rlwinm r0,r5,0,mpRb-32,mpCb-32 ; Isolate reference and change bits from mapping - cmplwi r0,lo16(mpR|mpC) ; cr0_eq <- ((R == 1) && (C == 1)) - crnot cr0_eq,cr0_eq ; cr0_eq <- ((R == 0) || (C == 0)) - blr ; Return (CR0_EQ set to continue if either R or C is off)... - - -; Function 19 - Clear reference and change in physent - - .set .,hwpOpBase+(19*128) ; Generate error if previous function too long -hwpCRefCngPhy: - li r5,ppLink+4 ; Get offset for flag part of physent - -hwpCRefCngPhX: - lwarx r4,r5,r29 ; Get the old flags - andc r4,r4,r25 ; Clear R and C as specified by mask - stwcx. r4,r5,r29 ; Try to stuff it - bne-- hwpCRefCngPhX ; Try again... -; Note: CR0_EQ is set because of stwcx. - blr ; Return... 
- - -; Function 20 - Clear reference and change in mapping - - .set .,hwpOpBase+(20*128) ; Generate error if previous function too long -hwpCRefCngMap: - srwi r0,r25,(ppRb - mpRb) ; Align reference/change clear mask (phys->map) - lwz r8,mpVAddr+4(r31) ; Get the flag part of mapping - andc r5,r5,r0 ; Clear in PTE copy - andc r8,r8,r0 ; and in the mapping - cmpw r0,r0 ; Make sure we return CR0_EQ - stw r8,mpVAddr+4(r31) ; Set the flag part of mapping - blr ; Return... - - - .set .,hwpOpBase+(21*128) ; Generate error if previous function too long - -; -; unsigned int hw_protect(pmap, va, prot, *nextva) - Changes protection on a specific mapping. -; -; Returns: -; mapRtOK - if all is ok -; mapRtBadLk - if mapping lock fails -; mapRtPerm - if mapping is permanent -; mapRtNotFnd - if mapping is not found -; mapRtBlock - if mapping is a block -; - .align 5 - .globl EXT(hw_protect) - -LEXT(hw_protect) - stwu r1,-(FM_ALIGN((31-24+1)*4)+FM_SIZE)(r1) ; Make some space on the stack - mflr r0 ; Save the link register - stw r24,FM_ARG0+0x00(r1) ; Save a register - stw r25,FM_ARG0+0x04(r1) ; Save a register - mr r25,r7 ; Remember address of next va - stw r26,FM_ARG0+0x08(r1) ; Save a register - stw r27,FM_ARG0+0x0C(r1) ; Save a register - stw r28,FM_ARG0+0x10(r1) ; Save a register - mr r24,r6 ; Save the new protection flags - stw r29,FM_ARG0+0x14(r1) ; Save a register - stw r30,FM_ARG0+0x18(r1) ; Save a register - stw r31,FM_ARG0+0x1C(r1) ; Save a register - stw r0,(FM_ALIGN((31-24+1)*4)+FM_SIZE+FM_LR_SAVE)(r1) ; Save the return - -#if DEBUG - lwz r11,pmapFlags(r3) ; Get pmaps flags - rlwinm. r11,r11,0,pmapVMgsaa ; Is guest shadow assist active? 
- bne hpPanic ; Call not valid for guest shadow assist pmap -#endif - - lwz r6,pmapvr(r3) ; Get the first part of the VR translation for pmap - lwz r7,pmapvr+4(r3) ; Get the second part - - - bl EXT(mapSetUp) ; Turn off interrupts, translation, and possibly enter 64-bit - - mr r27,r11 ; Remember the old MSR - mr r26,r12 ; Remember the feature bits - - xor r28,r3,r7 ; Change the common 32- and 64-bit half - - bf-- pf64Bitb,hpSF1 ; skip if 32-bit... - - rldimi r28,r6,32,0 ; Shift the fixed upper part of the physical over and cram in top - -hpSF1: mr r29,r4 ; Save top half of vaddr - mr r30,r5 ; Save the bottom half - - la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkShared ; Go get a shared lock on the mapping lists - mr. r3,r3 ; Did we get the lock? - bne-- hpBadLock ; Nope... - - mr r3,r28 ; get the pmap address - mr r4,r29 ; Get bits 0:31 to look for - mr r5,r30 ; Get bits 32:64 - - bl EXT(mapSearch) ; Go see if we can find it (note: R7 comes back with mpFlags) - - rlwinm. r0,r7,0,mpType ; Is this a normal mapping? - crmove cr1_eq,cr0_eq ; cr1_eq <- this is a normal mapping - andi. r0,r7,mpPerm|mpRIP ; Is it permanent or being removed? - cror cr1_eq,cr0_eq,cr1_eq ; cr1_eq <- normal mapping and not permanent and not being removed - mr. r31,r3 ; Save the mapping if we found it - mr r29,r4 ; Save next va high half - mr r30,r5 ; Save next va low half - - beq-- hpNotFound ; Not found... - - bf-- cr1_eq,hpNotAllowed ; Something special is happening... - - bt++ pf64Bitb,hpDo64 ; Split for 64 bit - - bl mapInvPte32 ; Invalidate and lock PTEG, also merge into physent - - rlwimi r5,r24,0,mpPPb-32,mpPPe-32 ; Stick in the new pp (note that we ignore no-execute for 32-bit) - mr. r3,r3 ; Was there a previously valid PTE? - - stb r5,mpVAddr+7(r31) ; Set the new pp field (do not muck with the rest) - - beq-- hpNoOld32 ; Nope... 
- - stw r5,4(r3) ; Store second half of PTE - eieio ; Make sure we do not reorder - stw r4,0(r3) ; Revalidate the PTE - - eieio ; Make sure all updates come first - stw r6,0(r7) ; Unlock PCA - -hpNoOld32: la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkUnlock ; Unlock the search list - - li r3,mapRtOK ; Set normal return - b hpR32 ; Join common... - - .align 5 - - -hpDo64: bl mapInvPte64 ; Invalidate and lock PTEG, also merge into physent - - rldimi r5,r24,0,mpNb ; Stick in the new no-exectue and pp bits - mr. r3,r3 ; Was there a previously valid PTE? - - stb r5,mpVAddr+7(r31) ; Set the new pp field (do not muck with the rest) - - beq-- hpNoOld64 ; Nope... - - std r5,8(r3) ; Store second half of PTE - eieio ; Make sure we do not reorder - std r4,0(r3) ; Revalidate the PTE - - eieio ; Make sure all updates come first - stw r6,0(r7) ; Unlock PCA - -hpNoOld64: la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkUnlock ; Unlock the search list - - li r3,mapRtOK ; Set normal return - b hpR64 ; Join common... - - .align 5 - -hpReturn: bt++ pf64Bitb,hpR64 ; Yes... - -hpR32: mtmsr r27 ; Restore enables/translation/etc. - isync - b hpReturnC ; Join common... - -hpR64: mtmsrd r27 ; Restore enables/translation/etc. - isync - -hpReturnC: stw r29,0(r25) ; Save the top of the next va - stw r30,4(r25) ; Save the bottom of the next va - lwz r0,(FM_ALIGN((31-24+1)*4)+FM_SIZE+FM_LR_SAVE)(r1) ; Save the return - lwz r24,FM_ARG0+0x00(r1) ; Save a register - lwz r25,FM_ARG0+0x04(r1) ; Save a register - lwz r26,FM_ARG0+0x08(r1) ; Save a register - mtlr r0 ; Restore the return - lwz r27,FM_ARG0+0x0C(r1) ; Save a register - lwz r28,FM_ARG0+0x10(r1) ; Save a register - lwz r29,FM_ARG0+0x14(r1) ; Save a register - lwz r30,FM_ARG0+0x18(r1) ; Save a register - lwz r31,FM_ARG0+0x1C(r1) ; Save a register - lwz r1,0(r1) ; Pop the stack - blr ; Leave... - - .align 5 - -hpBadLock: li r3,mapRtBadLk ; Set lock time out error code - b hpReturn ; Leave.... 
- -hpNotFound: la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkUnlock ; Unlock the search list - - li r3,mapRtNotFnd ; Set that we did not find the requested page - b hpReturn ; Leave.... - -hpNotAllowed: - rlwinm. r0,r7,0,mpRIPb,mpRIPb ; Is it actually being removed? - la r3,pmapSXlk(r28) ; Point to the pmap search lock - bne-- hpNotFound ; Yeah... - bl sxlkUnlock ; Unlock the search list - - li r3,mapRtBlock ; Assume it was a block - rlwinm r0,r7,0,mpType ; Isolate mapping type - cmplwi r0,mpBlock ; Is this a block mapping? - beq++ hpReturn ; Yes, leave... - - li r3,mapRtPerm ; Set that we hit a permanent page - b hpReturn ; Leave.... - -hpPanic: lis r0,hi16(Choke) ; System abend - ori r0,r0,lo16(Choke) ; System abend - li r3,failMapping ; Show that we failed some kind of mapping thing - sc - - -; -; int hw_test_rc(pmap, va, reset) - tests RC on a specific va -; -; Returns following code ORed with RC from mapping -; mapRtOK - if all is ok -; mapRtBadLk - if mapping lock fails -; mapRtNotFnd - if mapping is not found -; - .align 5 - .globl EXT(hw_test_rc) - -LEXT(hw_test_rc) - stwu r1,-(FM_ALIGN((31-24+1)*4)+FM_SIZE)(r1) ; Make some space on the stack - mflr r0 ; Save the link register - stw r24,FM_ARG0+0x00(r1) ; Save a register - stw r25,FM_ARG0+0x04(r1) ; Save a register - stw r26,FM_ARG0+0x08(r1) ; Save a register - stw r27,FM_ARG0+0x0C(r1) ; Save a register - stw r28,FM_ARG0+0x10(r1) ; Save a register - mr r24,r6 ; Save the reset request - stw r29,FM_ARG0+0x14(r1) ; Save a register - stw r30,FM_ARG0+0x18(r1) ; Save a register - stw r31,FM_ARG0+0x1C(r1) ; Save a register - stw r0,(FM_ALIGN((31-24+1)*4)+FM_SIZE+FM_LR_SAVE)(r1) ; Save the return - -#if DEBUG - lwz r11,pmapFlags(r3) ; Get pmaps flags - rlwinm. r11,r11,0,pmapVMgsaa ; Is guest shadow assist active? 
- bne htrPanic ; Call not valid for guest shadow assist pmap -#endif - - lwz r6,pmapvr(r3) ; Get the first part of the VR translation for pmap - lwz r7,pmapvr+4(r3) ; Get the second part - - - bl EXT(mapSetUp) ; Turn off interrupts, translation, and possibly enter 64-bit - - mr r27,r11 ; Remember the old MSR - mr r26,r12 ; Remember the feature bits - - xor r28,r3,r7 ; Change the common 32- and 64-bit half - - bf-- pf64Bitb,htrSF1 ; skip if 32-bit... - - rldimi r28,r6,32,0 ; Shift the fixed upper part of the physical over and cram in top - -htrSF1: mr r29,r4 ; Save top half of vaddr - mr r30,r5 ; Save the bottom half - - la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkShared ; Go get a shared lock on the mapping lists - mr. r3,r3 ; Did we get the lock? - li r25,0 ; Clear RC - bne-- htrBadLock ; Nope... - - mr r3,r28 ; get the pmap address - mr r4,r29 ; Get bits 0:31 to look for - mr r5,r30 ; Get bits 32:64 - - bl EXT(mapSearch) ; Go see if we can find it (R7 comes back with mpFlags) - - rlwinm. r0,r7,0,mpType ; Is this a normal mapping? - crmove cr1_eq,cr0_eq ; cr1_eq <- this is a normal mapping - andi. r0,r7,mpPerm|mpRIP ; Is it permanent or being removed? - crand cr1_eq,cr0_eq,cr1_eq ; cr1_eq <- normal mapping and not permanent and not being removed - mr. r31,r3 ; Save the mapping if we found it - crandc cr1_eq,cr1_eq,cr0_eq ; cr1_eq <- found & normal & not permanent & not being removed - - bf-- cr1_eq,htrNotFound ; Not found, something special, or being removed... - - bt++ pf64Bitb,htrDo64 ; Split for 64 bit - - bl mapInvPte32 ; Invalidate and lock PTEG, also merge into physent - - cmplwi cr1,r24,0 ; Do we want to clear RC? - lwz r12,mpVAddr+4(r31) ; Get the bottom of the mapping vaddr field - mr. r3,r3 ; Was there a previously valid PTE? - li r0,lo16(mpR|mpC) ; Get bits to clear - - and r25,r5,r0 ; Save the RC bits - beq++ cr1,htrNoClr32 ; Nope... 
- - andc r12,r12,r0 ; Clear mapping copy of RC - andc r5,r5,r0 ; Clear PTE copy of RC - sth r12,mpVAddr+6(r31) ; Set the new RC - -htrNoClr32: beq-- htrNoOld32 ; No previously valid PTE... - - sth r5,6(r3) ; Store updated RC - eieio ; Make sure we do not reorder - stw r4,0(r3) ; Revalidate the PTE - - eieio ; Make sure all updates come first - stw r6,0(r7) ; Unlock PCA - -htrNoOld32: la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkUnlock ; Unlock the search list - li r3,mapRtOK ; Set normal return - b htrR32 ; Join common... - - .align 5 - - -htrDo64: bl mapInvPte64 ; Invalidate and lock PTEG, also merge into physent - - cmplwi cr1,r24,0 ; Do we want to clear RC? - lwz r12,mpVAddr+4(r31) ; Get the bottom of the mapping vaddr field - mr. r3,r3 ; Was there a previously valid PTE? - li r0,lo16(mpR|mpC) ; Get bits to clear - - and r25,r5,r0 ; Save the RC bits - beq++ cr1,htrNoClr64 ; Nope... - - andc r12,r12,r0 ; Clear mapping copy of RC - andc r5,r5,r0 ; Clear PTE copy of RC - sth r12,mpVAddr+6(r31) ; Set the new RC - -htrNoClr64: beq-- htrNoOld64 ; Nope, no pevious pte... - - sth r5,14(r3) ; Store updated RC - eieio ; Make sure we do not reorder - std r4,0(r3) ; Revalidate the PTE - - eieio ; Make sure all updates come first - stw r6,0(r7) ; Unlock PCA - -htrNoOld64: la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkUnlock ; Unlock the search list - li r3,mapRtOK ; Set normal return - b htrR64 ; Join common... - - .align 5 - -htrReturn: bt++ pf64Bitb,htrR64 ; Yes... - -htrR32: mtmsr r27 ; Restore enables/translation/etc. - isync - b htrReturnC ; Join common... - -htrR64: mtmsrd r27 ; Restore enables/translation/etc. 
- isync - -htrReturnC: lwz r0,(FM_ALIGN((31-24+1)*4)+FM_SIZE+FM_LR_SAVE)(r1) ; Save the return - or r3,r3,r25 ; Send the RC bits back - lwz r24,FM_ARG0+0x00(r1) ; Save a register - lwz r25,FM_ARG0+0x04(r1) ; Save a register - lwz r26,FM_ARG0+0x08(r1) ; Save a register - mtlr r0 ; Restore the return - lwz r27,FM_ARG0+0x0C(r1) ; Save a register - lwz r28,FM_ARG0+0x10(r1) ; Save a register - lwz r29,FM_ARG0+0x14(r1) ; Save a register - lwz r30,FM_ARG0+0x18(r1) ; Save a register - lwz r31,FM_ARG0+0x1C(r1) ; Save a register - lwz r1,0(r1) ; Pop the stack - blr ; Leave... - - .align 5 - -htrBadLock: li r3,mapRtBadLk ; Set lock time out error code - b htrReturn ; Leave.... - -htrNotFound: - la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkUnlock ; Unlock the search list - - li r3,mapRtNotFnd ; Set that we did not find the requested page - b htrReturn ; Leave.... - -htrPanic: lis r0,hi16(Choke) ; System abend - ori r0,r0,lo16(Choke) ; System abend - li r3,failMapping ; Show that we failed some kind of mapping thing - sc - - -; -; -; mapFindLockPN - find and lock physent for a given page number -; -; - .align 5 -mapFindLockPN: - lis r9,hi16(EXT(pmap_mem_regions)) ; Point to the start of the region table - mr r2,r3 ; Save our target - ori r9,r9,lo16(EXT(pmap_mem_regions)) ; Point to the start of the region table - -mapFLPNitr: lwz r3,mrPhysTab(r9) ; Get the actual table address - lwz r5,mrStart(r9) ; Get start of table entry - lwz r0,mrEnd(r9) ; Get end of table entry - addi r9,r9,mrSize ; Point to the next slot - cmplwi cr7,r3,0 ; Are we at the end of the table? - cmplw r2,r5 ; See if we are in this table - cmplw cr1,r2,r0 ; Check end also - sub r4,r2,r5 ; Calculate index to physical entry - beq-- cr7,mapFLPNmiss ; Leave if we did not find an entry... - cror cr0_lt,cr0_lt,cr1_gt ; Set CR0_LT if it is NOT this entry - slwi r4,r4,3 ; Get offset to physical entry - - blt-- mapFLPNitr ; Did not find it... 
- - add r3,r3,r4 ; Point right to the slot - b mapPhysLock ; Join common lock code - -mapFLPNmiss: - li r3,0 ; Show that we did not find it - blr ; Leave... - - -; -; mapPhysFindLock - find physent list and lock it -; R31 points to mapping -; - .align 5 - -mapPhysFindLock: - lbz r4,mpFlags+1(r31) ; Get the index into the physent bank table - lis r3,ha16(EXT(pmap_mem_regions)) ; Get high order of physent table (note use of ha16 to get value appropriate for an addi of low part) - rlwinm r4,r4,2,24,29 ; Mask index bits and convert to byte offset - addi r4,r4,lo16(EXT(pmap_mem_regions)) ; Get low part of address of entry - add r3,r3,r4 ; Point to table entry - lwz r5,mpPAddr(r31) ; Get physical page number - lwz r7,mrStart(r3) ; Get the start of range - lwz r3,mrPhysTab(r3) ; Get the start of the entries for this bank - sub r6,r5,r7 ; Get index to physent - rlwinm r6,r6,3,0,28 ; Get offset to physent - add r3,r3,r6 ; Point right to the physent - b mapPhysLock ; Join in the lock... - -; -; mapPhysLock - lock a physent list -; R3 contains list header -; - .align 5 - -mapPhysLockS: - li r2,lgKillResv ; Get a spot to kill reservation - stwcx. r2,0,r2 ; Kill it... - -mapPhysLockT: - lwz r2,ppLink(r3) ; Get physent chain header - rlwinm. r2,r2,0,0,0 ; Is lock clear? - bne-- mapPhysLockT ; Nope, still locked... - -mapPhysLock: - lwarx r2,0,r3 ; Get the lock - rlwinm. r0,r2,0,0,0 ; Is it locked? - oris r0,r2,0x8000 ; Set the lock bit - bne-- mapPhysLockS ; It is locked, spin on it... - stwcx. r0,0,r3 ; Try to stuff it back... - bne-- mapPhysLock ; Collision, try again... - isync ; Clear any speculations - blr ; Leave... 
- - -; -; mapPhysUnlock - unlock a physent list -; R3 contains list header -; - .align 5 - -mapPhysUnlock: - lwz r0,ppLink(r3) ; Get physent chain header - rlwinm r0,r0,0,1,31 ; Clear the lock bit - eieio ; Make sure unlock comes last - stw r0,ppLink(r3) ; Unlock the list - blr - -; -; mapPhysMerge - merge the RC bits into the master copy -; R3 points to the physent -; R4 contains the RC bits -; -; Note: we just return if RC is 0 -; - .align 5 - -mapPhysMerge: - rlwinm. r4,r4,PTE1_REFERENCED_BIT+(64-ppRb),ppRb-32,ppCb-32 ; Isolate RC bits - la r5,ppLink+4(r3) ; Point to the RC field - beqlr-- ; Leave if RC is 0... - -mapPhysMergeT: - lwarx r6,0,r5 ; Get the RC part - or r6,r6,r4 ; Merge in the RC - stwcx. r6,0,r5 ; Try to stuff it back... - bne-- mapPhysMergeT ; Collision, try again... - blr ; Leave... - -; -; Sets the physent link pointer and preserves all flags -; The list is locked -; R3 points to physent -; R4 has link to set -; - - .align 5 - -mapPhyCSet32: - la r5,ppLink+4(r3) ; Point to the link word - -mapPhyCSetR: - lwarx r2,0,r5 ; Get the link and flags - rlwimi r4,r2,0,ppFlags ; Insert the flags - stwcx. r4,0,r5 ; Stick them back - bne-- mapPhyCSetR ; Someone else did something, try again... - blr ; Return... - - .align 5 - -mapPhyCSet64: - li r0,ppLFAmask ; Get mask to clean up mapping pointer - rotrdi r0,r0,ppLFArrot ; Rotate clean up mask to get 0xF0000000000000000F - -mapPhyCSet64x: - ldarx r2,0,r3 ; Get the link and flags - and r5,r2,r0 ; Isolate the flags - or r6,r4,r5 ; Add them to the link - stdcx. r6,0,r3 ; Stick them back - bne-- mapPhyCSet64x ; Someone else did something, try again... - blr ; Return... - -; -; mapBumpBusy - increment the busy count on a mapping -; R3 points to mapping -; - - .align 5 - -mapBumpBusy: - lwarx r4,0,r3 ; Get mpBusy - addis r4,r4,0x0100 ; Bump the busy count - stwcx. r4,0,r3 ; Save it back - bne-- mapBumpBusy ; This did not work, try again... - blr ; Leave... 
- -; -; mapDropBusy - increment the busy count on a mapping -; R3 points to mapping -; - - .globl EXT(mapping_drop_busy) - .align 5 - -LEXT(mapping_drop_busy) -mapDropBusy: - lwarx r4,0,r3 ; Get mpBusy - addis r4,r4,0xFF00 ; Drop the busy count - stwcx. r4,0,r3 ; Save it back - bne-- mapDropBusy ; This did not work, try again... - blr ; Leave... - -; -; mapDrainBusy - drain the busy count on a mapping -; R3 points to mapping -; Note: we already have a busy for ourselves. Only one -; busy per processor is allowed, so we just spin here -; waiting for the count to drop to 1. -; Also, the mapping can not be on any lists when we do this -; so all we are doing is waiting until it can be released. -; - - .align 5 - -mapDrainBusy: - lwz r4,mpFlags(r3) ; Get mpBusy - rlwinm r4,r4,8,24,31 ; Clean it up - cmplwi r4,1 ; Is is just our busy? - beqlr++ ; Yeah, it is clear... - b mapDrainBusy ; Try again... - - - -; -; handleDSeg - handle a data segment fault -; handleISeg - handle an instruction segment fault -; -; All that we do here is to map these to DSI or ISI and insure -; that the hash bit is not set. This forces the fault code -; to also handle the missing segment. -; -; At entry R2 contains per_proc, R13 contains savarea pointer, -; and R11 is the exception code. -; - - .align 5 - .globl EXT(handleDSeg) - -LEXT(handleDSeg) - - li r11,T_DATA_ACCESS ; Change fault to DSI - stw r11,saveexception(r13) ; Change the exception code from seg fault to PTE miss - b EXT(handlePF) ; Join common... - - .align 5 - .globl EXT(handleISeg) - -LEXT(handleISeg) - - li r11,T_INSTRUCTION_ACCESS ; Change fault to ISI - stw r11,saveexception(r13) ; Change the exception code from seg fault to PTE miss - b EXT(handlePF) ; Join common... - - -/* - * handlePF - handle a page fault interruption - * - * At entry R2 contains per_proc, R13 contains savarea pointer, - * and R11 is the exception code. - * - * This first part does a quick check to see if we can handle the fault. 
- * We canot handle any kind of protection exceptions here, so we pass - * them up to the next level. - * - * NOTE: In order for a page-fault redrive to work, the translation miss - * bit must be set in the DSISR (or SRR1 for IFETCH). That must occur - * before we come here. - */ - - .align 5 - .globl EXT(handlePF) - -LEXT(handlePF) - - mfsprg r12,2 ; Get feature flags - cmplwi r11,T_INSTRUCTION_ACCESS ; See if this is for the instruction - lwz r8,savesrr1+4(r13) ; Get the MSR to determine mode - mtcrf 0x02,r12 ; move pf64Bit to cr6 - lis r0,hi16(dsiNoEx|dsiProt|dsiInvMode|dsiAC) ; Get the types that we cannot handle here - lwz r18,SAVflags(r13) ; Get the flags - - beq-- gotIfetch ; We have an IFETCH here... - - lwz r27,savedsisr(r13) ; Get the DSISR - lwz r29,savedar(r13) ; Get the first half of the DAR - lwz r30,savedar+4(r13) ; And second half - - b ckIfProt ; Go check if this is a protection fault... - -gotIfetch: andis. r27,r8,hi16(dsiValid) ; Clean this up to construct a DSISR value - lwz r29,savesrr0(r13) ; Get the first half of the instruction address - lwz r30,savesrr0+4(r13) ; And second half - stw r27,savedsisr(r13) ; Save the "constructed" DSISR - -ckIfProt: and. r4,r27,r0 ; Is this a non-handlable exception? - li r20,64 ; Set a limit of 64 nests for sanity check - bne-- hpfExit ; Yes... (probably not though) - -; -; Note: if the RI is on, we are accessing user space from the kernel, therefore we -; should be loading the user pmap here. -; - - andi. r0,r8,lo16(MASK(MSR_PR)|MASK(MSR_RI)) ; Are we addressing user or kernel space? 
- lis r8,hi16(EXT(kernel_pmap_phys)) ; Assume kernel - mr r19,r2 ; Remember the per_proc - ori r8,r8,lo16(EXT(kernel_pmap_phys)) ; Assume kernel (bottom of address) - mr r23,r30 ; Save the low part of faulting address - beq-- hpfInKern ; Skip if we are in the kernel - la r8,ppUserPmap(r19) ; Point to the current user pmap - -hpfInKern: mr r22,r29 ; Save the high part of faulting address - - bt-- pf64Bitb,hpf64a ; If 64-bit, skip the next bit... - -; -; On 32-bit machines we emulate a segment exception by loading unused SRs with a -; predefined value that corresponds to no address space. When we see that value -; we turn off the PTE miss bit in the DSISR to drive the code later on that will -; cause the proper SR to be loaded. -; - - lwz r28,4(r8) ; Pick up the pmap - rlwinm. r18,r18,0,SAVredriveb,SAVredriveb ; Was this a redrive? - mr r25,r28 ; Save the original pmap (in case we nest) - lwz r0,pmapFlags(r28) ; Get pmap's flags - bne hpfGVtest ; Segs are not ours if so... - mfsrin r4,r30 ; Get the SR that was used for translation - cmplwi r4,invalSpace ; Is this a simulated segment fault? - bne++ hpfGVtest ; No... - - rlwinm r27,r27,0,dsiMissb+1,dsiMissb-1 ; Clear the PTE miss bit in DSISR - b hpfGVtest ; Join on up... - - .align 5 - - nop ; Push hpfNest to a 32-byte boundary - nop ; Push hpfNest to a 32-byte boundary - nop ; Push hpfNest to a 32-byte boundary - -hpf64a: ld r28,0(r8) ; Get the pmap pointer (64-bit) - mr r25,r28 ; Save the original pmap (in case we nest) - lwz r0,pmapFlags(r28) ; Get pmap's flags - -hpfGVtest: rlwinm. r0,r0,0,pmapVMgsaa ; Using guest shadow mapping assist? - bne hpfGVxlate ; Yup, do accelerated shadow stuff - -; -; This is where we loop descending nested pmaps -; - -hpfNest: la r3,pmapSXlk(r28) ; Point to the pmap search lock - addi r20,r20,-1 ; Count nest try - bl sxlkShared ; Go get a shared lock on the mapping lists - mr. r3,r3 ; Did we get the lock? - bne-- hpfBadLock ; Nope... 
- - mr r3,r28 ; Get the pmap pointer - mr r4,r22 ; Get top of faulting vaddr - mr r5,r23 ; Get bottom of faulting vaddr - bl EXT(mapSearch) ; Go see if we can find it (R7 gets mpFlags) - - rlwinm r0,r7,0,mpRIPb,mpRIPb ; Are we removing this one? - mr. r31,r3 ; Save the mapping if we found it - cmplwi cr1,r0,0 ; Check for removal - crorc cr0_eq,cr0_eq,cr1_eq ; Merge not found and removing - - bt-- cr0_eq,hpfNotFound ; Not found or removing... - - rlwinm r0,r7,0,mpType ; Isolate mapping type - cmplwi r0,mpNest ; Are we again nested? - cmplwi cr1,r0,mpLinkage ; Are we a linkage type? - cror cr0_eq,cr1_eq,cr0_eq ; cr0_eq <- nested or linkage type? - mr r26,r7 ; Get the flags for this mapping (passed back from search call) - - lhz r21,mpSpace(r31) ; Get the space - - bne++ hpfFoundIt ; No, we found our guy... - - -#if pmapTransSize != 12 -#error pmapTrans entry size is not 12 bytes!!!!!!!!!!!! It is pmapTransSize -#endif - cmplwi r0,mpLinkage ; Linkage mapping? - cmplwi cr1,r20,0 ; Too many nestings? - beq-- hpfSpclNest ; Do we need to do special handling? - -hpfCSrch: lhz r21,mpSpace(r31) ; Get the space - lwz r8,mpNestReloc(r31) ; Get the vaddr relocation - lwz r9,mpNestReloc+4(r31) ; Get the vaddr relocation bottom half - la r3,pmapSXlk(r28) ; Point to the old pmap search lock - lis r0,0x8000 ; Get 0xFFFFFFFF80000000 - lis r10,hi16(EXT(pmapTrans)) ; Get the translate table - add r0,r0,r0 ; Get 0xFFFFFFFF00000000 for 64-bit or 0 for 32-bit - blt-- cr1,hpfNestTooMuch ; Too many nestings, must be a loop... 
- or r23,r23,r0 ; Make sure a carry will propagate all the way in 64-bit - slwi r11,r21,3 ; Multiply space by 8 - ori r10,r10,lo16(EXT(pmapTrans)) ; Get the translate table low part - addc r23,r23,r9 ; Relocate bottom half of vaddr - lwz r10,0(r10) ; Get the actual translation map - slwi r12,r21,2 ; Multiply space by 4 - add r10,r10,r11 ; Add in the higher part of the index - rlwinm r23,r23,0,0,31 ; Clean up the relocated address (does nothing in 32-bit) - adde r22,r22,r8 ; Relocate the top half of the vaddr - add r12,r12,r10 ; Now we are pointing at the space to pmap translation entry - bl sxlkUnlock ; Unlock the search list - - bt++ pf64Bitb,hpfGetPmap64 ; Separate handling for 64-bit machines - lwz r28,pmapPAddr+4(r12) ; Get the physical address of the new pmap - cmplwi r28,0 ; Is the pmap paddr valid? - bne+ hpfNest ; Nest into new pmap... - b hpfBadPmap ; Handle bad pmap - -hpfGetPmap64: - ld r28,pmapPAddr(r12) ; Get the physical address of the new pmap - cmpldi r28,0 ; Is the pmap paddr valid? - bne++ hpfNest ; Nest into new pmap... - b hpfBadPmap ; Handle bad pmap - - -; -; Error condition. We only allow 64 nestings. This keeps us from having to -; check for recusive nests when we install them. -; - - .align 5 - -hpfNestTooMuch: - lwz r20,savedsisr(r13) ; Get the DSISR - la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkUnlock ; Unlock the search list (R3 good from above) - ori r20,r20,1 ; Indicate that there was a nesting problem - stw r20,savedsisr(r13) ; Stash it - lwz r11,saveexception(r13) ; Restore the exception code - b EXT(PFSExit) ; Yes... 
(probably not though) - -; -; Error condition - lock failed - this is fatal -; - - .align 5 - -hpfBadLock: - lis r0,hi16(Choke) ; System abend - ori r0,r0,lo16(Choke) ; System abend - li r3,failMapping ; Show mapping failure - sc - -; -; Error condition - space id selected an invalid pmap - fatal -; - - .align 5 - -hpfBadPmap: - lis r0,hi16(Choke) ; System abend - ori r0,r0,lo16(Choke) ; System abend - li r3,failPmap ; Show invalid pmap - sc - -; -; Did not find any kind of mapping -; - - .align 5 - -hpfNotFound: - la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkUnlock ; Unlock it - lwz r11,saveexception(r13) ; Restore the exception code - -hpfExit: ; We need this because we can not do a relative branch - b EXT(PFSExit) ; Yes... (probably not though) - - -; -; Here is where we handle special mappings. So far, the only use is to load a -; processor specific segment register for copy in/out handling. -; -; The only (so far implemented) special map is used for copyin/copyout. -; We keep a mapping of a "linkage" mapping in the per_proc. -; The linkage mapping is basically a nested pmap that is switched in -; as part of context switch. It relocates the appropriate user address -; space slice into the right place in the kernel. -; - - .align 5 - -hpfSpclNest: - la r31,ppUMWmp(r19) ; Just point to the mapping - oris r27,r27,hi16(dsiLinkage) ; Show that we had a linkage mapping here - b hpfCSrch ; Go continue search... - - -; -; We have now found a mapping for the address we faulted on. -; - -; -; Here we go about calculating what the VSID should be. We concatanate -; the space ID (14 bits wide) 3 times. We then slide the vaddr over -; so that bits 0:35 are in 14:49 (leaves a hole for one copy of the space ID). -; Then we XOR and expanded space ID and the shifted vaddr. This gives us -; the VSID. -; -; This is used both for segment handling and PTE handling -; - - -#if maxAdrSpb != 14 -#error maxAdrSpb (address space id size) is not 14 bits!!!!!!!!!!!! 
-#endif - -; Important non-volatile registers at this point ('home' means the final pmap/mapping found -; when a multi-level mapping has been successfully searched): -; r21: home space id number -; r22: relocated high-order 32 bits of vaddr -; r23: relocated low-order 32 bits of vaddr -; r25: pmap physical address -; r27: dsisr -; r28: home pmap physical address -; r29: high-order 32 bits of faulting vaddr -; r30: low-order 32 bits of faulting vaddr -; r31: mapping's physical address - - .align 5 - -hpfFoundIt: lwz r12,pmapFlags(r28) ; Get the pmap flags so we can find the keys for this segment -hpfGVfound: rlwinm. r0,r27,0,dsiMissb,dsiMissb ; Did we actually miss the segment? - rlwinm r15,r23,18,14,17 ; Shift 32:35 (0:3) of vaddr just above space ID - rlwinm r20,r21,28,22,31 ; Shift upper 10 bits of space into high order - rlwinm r14,r22,18,14,31 ; Shift 0:17 of vaddr over - rlwinm r0,r27,0,dsiLinkageb,dsiLinkageb ; Isolate linkage mapping flag - rlwimi r21,r21,14,4,17 ; Make a second copy of space above first - cmplwi cr5,r0,0 ; Did we just do a special nesting? - rlwimi r15,r22,18,0,13 ; Shift 18:31 of vaddr just above shifted 32:35 - crorc cr0_eq,cr0_eq,cr5_eq ; Force outselves through the seg load code if special nest - rlwimi r21,r21,28,0,3 ; Get low order of 3rd copy of space at top of register - xor r14,r14,r20 ; Calculate the top half of VSID - xor r15,r15,r21 ; Calculate the bottom half of the VSID - rlwinm r14,r14,12,15,19 ; Slide the top of the VSID over to correct position (trim for 65 bit addressing) - rlwinm r12,r12,9,20,22 ; Isolate and position key for cache entry - rlwimi r14,r15,12,20,31 ; Slide top of bottom of VSID over into the top - rlwinm r15,r15,12,0,19 ; Slide the last nybble into the low order segment position - or r12,r12,r15 ; Add key into the bottom of VSID -; -; Note: ESID is in R22:R23 pair; VSID is in R14:R15; cache form VSID is R14:R12 - - bne++ hpfPteMiss ; Nope, normal PTE miss... 
- -; -; Here is the only place that we make an entry in the pmap segment cache. -; -; Note that we do not make an entry in the segment cache for special -; nested mappings. This makes the copy in/out segment get refreshed -; when switching threads. -; -; The first thing that we do is to look up the ESID we are going to load -; into a segment in the pmap cache. If it is already there, this is -; a segment that appeared since the last time we switched address spaces. -; If all is correct, then it was another processors that made the cache -; entry. If not, well, it is an error that we should die on, but I have -; not figured a good way to trap it yet. -; -; If we get a hit, we just bail, otherwise, lock the pmap cache, select -; an entry based on the generation number, update the cache entry, and -; also update the pmap sub-tag as well. The sub-tag is a table of 4 bit -; entries that correspond to the last 4 bits (32:35 for 64-bit and -; 0:3 for 32-bit) of the ESID. -; -; Then we unlock and bail. -; -; First lock it. Then select a free slot or steal one based on the generation -; number. Then store it, update the allocation flags, and unlock. -; -; The cache entry contains an image of the ESID/VSID pair we would load for -; 64-bit architecture. For 32-bit, it is a simple transform to an SR image. -; -; Remember, this cache entry goes in the ORIGINAL pmap (saved in R25), not -; the current one, which may have changed because we nested. -; -; Also remember that we do not store the valid bit in the ESID. If we -; od, this will break some other stuff. -; - - bne-- cr5,hpfNoCacheEnt2 ; Skip the cache entry if this is a "special nest" fault.... - - mr r3,r25 ; Point to the pmap - mr r4,r29 ; ESID high half - mr r5,r30 ; ESID low half - bl pmapCacheLookup ; Go see if this is in the cache already - - mr. r3,r3 ; Did we find it? - mr r4,r11 ; Copy this to a different register - - bne-- hpfNoCacheEnt ; Yes, we found it, no need to make another entry... 
- - lwz r10,pmapSCSubTag(r25) ; Get the first part of the sub-tag lookup table - lwz r11,pmapSCSubTag+4(r25) ; Get the second part of the sub-tag lookup table - - cntlzw r7,r4 ; Find a free slot - - subi r6,r7,pmapSegCacheUse ; We end up with a negative if we find one - rlwinm r30,r30,0,0,3 ; Clean up the ESID - srawi r6,r6,31 ; Get 0xFFFFFFFF if we have one, 0 if not - addi r5,r4,1 ; Bump the generation number - and r7,r7,r6 ; Clear bit number if none empty - andc r8,r4,r6 ; Clear generation count if we found an empty - rlwimi r4,r5,0,17,31 ; Insert the new generation number into the control word - or r7,r7,r8 ; Select a slot number - li r8,0 ; Clear - andi. r7,r7,pmapSegCacheUse-1 ; Wrap into the number we are using - oris r8,r8,0x8000 ; Get the high bit on - la r9,pmapSegCache(r25) ; Point to the segment cache - slwi r6,r7,4 ; Get index into the segment cache - slwi r2,r7,2 ; Get index into the segment cache sub-tag index - srw r8,r8,r7 ; Get the mask - cmplwi r2,32 ; See if we are in the first or second half of sub-tag - li r0,0 ; Clear - rlwinm r2,r2,0,27,31 ; Wrap shift so we do not shift cache entries 8-F out - oris r0,r0,0xF000 ; Get the sub-tag mask - add r9,r9,r6 ; Point to the cache slot - srw r0,r0,r2 ; Slide sub-tag mask to right slot (shift work for either half) - srw r5,r30,r2 ; Slide sub-tag to right slot (shift work for either half) - - stw r29,sgcESID(r9) ; Save the top of the ESID - andc r10,r10,r0 ; Clear sub-tag slot in case we are in top - andc r11,r11,r0 ; Clear sub-tag slot in case we are in bottom - stw r30,sgcESID+4(r9) ; Save the bottom of the ESID - or r10,r10,r5 ; Stick in subtag in case top half - or r11,r11,r5 ; Stick in subtag in case bottom half - stw r14,sgcVSID(r9) ; Save the top of the VSID - andc r4,r4,r8 ; Clear the invalid bit for the slot we just allocated - stw r12,sgcVSID+4(r9) ; Save the bottom of the VSID and the key - bge hpfSCSTbottom ; Go save the bottom part of sub-tag - - stw r10,pmapSCSubTag(r25) ; Save the top of 
the sub-tag - b hpfNoCacheEnt ; Go finish up... - -hpfSCSTbottom: - stw r11,pmapSCSubTag+4(r25) ; Save the bottom of the sub-tag - - -hpfNoCacheEnt: - eieio ; Make sure cache is updated before lock - stw r4,pmapCCtl(r25) ; Unlock, allocate, and bump generation number - - -hpfNoCacheEnt2: - lwz r4,ppMapFlags(r19) ; Get the protection key modifier - bt++ pf64Bitb,hpfLoadSeg64 ; If 64-bit, go load the segment... - -; -; Make and enter 32-bit segment register -; - - lwz r16,validSegs(r19) ; Get the valid SR flags - xor r12,r12,r4 ; Alter the storage key before loading segment register - rlwinm r2,r30,4,28,31 ; Isolate the segment we are setting - rlwinm r6,r12,19,1,3 ; Insert the keys and N bit - lis r0,0x8000 ; Set bit 0 - rlwimi r6,r12,20,12,31 ; Insert 4:23 the VSID - srw r0,r0,r2 ; Get bit corresponding to SR - rlwimi r6,r14,20,8,11 ; Get the last nybble of the SR contents - or r16,r16,r0 ; Show that SR is valid - - mtsrin r6,r30 ; Set the actual SR - - stw r16,validSegs(r19) ; Set the valid SR flags - - b hpfPteMiss ; SR loaded, go do a PTE... - -; -; Make and enter 64-bit segment look-aside buffer entry. -; Note that the cache entry is the right format except for valid bit. -; We also need to convert from long long to 64-bit register values. -; - - - .align 5 - -hpfLoadSeg64: - ld r16,validSegs(r19) ; Get the valid SLB entry flags - sldi r8,r29,32 ; Move high order address over - sldi r10,r14,32 ; Move high part of VSID over - - not r3,r16 ; Make valids be 0s - li r0,1 ; Prepare to set bit 0 - - cntlzd r17,r3 ; Find a free SLB - xor r12,r12,r4 ; Alter the storage key before loading segment table entry - or r9,r8,r30 ; Form full 64-bit address - cmplwi r17,63 ; Did we find a free SLB entry? - sldi r0,r0,63 ; Get bit 0 set - or r10,r10,r12 ; Move in low part and keys - addi r17,r17,1 ; Skip SLB 0 always - blt++ hpfFreeSeg ; Yes, go load it... 
- -; -; No free SLB entries, select one that is in use and invalidate it -; - lwz r4,ppSegSteal(r19) ; Get the next slot to steal - addi r17,r4,pmapSegCacheUse+1 ; Select stealee from non-cached slots only - addi r4,r4,1 ; Set next slot to steal - slbmfee r7,r17 ; Get the entry that is in the selected spot - subi r2,r4,63-pmapSegCacheUse ; Force steal to wrap - rldicr r7,r7,0,35 ; Clear the valid bit and the rest - srawi r2,r2,31 ; Get -1 if steal index still in range - slbie r7 ; Invalidate the in-use SLB entry - and r4,r4,r2 ; Reset steal index when it should wrap - isync ; - - stw r4,ppSegSteal(r19) ; Set the next slot to steal -; -; We are now ready to stick the SLB entry in the SLB and mark it in use -; - -hpfFreeSeg: - subi r4,r17,1 ; Adjust shift to account for skipping slb 0 - mr r7,r9 ; Get a copy of the ESID with bits 36:63 clear - srd r0,r0,r4 ; Set bit mask for allocation - oris r9,r9,0x0800 ; Turn on the valid bit - or r16,r16,r0 ; Turn on the allocation flag - rldimi r9,r17,0,58 ; Copy in the SLB entry selector - - beq++ cr5,hpfNoBlow ; Skip blowing away the SLBE if this is not a special nest... - slbie r7 ; Blow away a potential duplicate - -hpfNoBlow: slbmte r10,r9 ; Make that SLB entry - - std r16,validSegs(r19) ; Mark as valid - b hpfPteMiss ; STE loaded, go do a PTE... - -; -; The segment has been set up and loaded if need be. Now we are ready to build the -; PTE and get it into the hash table. -; -; Note that there is actually a race here. If we start fault processing on -; a different pmap, i.e., we have descended into a nested pmap, it is possible -; that the nest could have been removed from the original pmap. We would -; succeed with this translation anyway. I do not think we need to worry -; about this (famous last words) because nobody should be unnesting anything -; if there are still people activily using them. It should be up to the -; higher level VM system to put the kibosh on this. 
-; -; There is also another race here: if we fault on the same mapping on more than -; one processor at the same time, we could end up with multiple PTEs for the same -; mapping. This is not a good thing.... We really only need one of the -; fault handlers to finish, so what we do is to set a "fault in progress" flag in -; the mapping. If we see that set, we just abandon the handler and hope that by -; the time we restore context and restart the interrupted code, the fault has -; been resolved by the other guy. If not, we will take another fault. -; - -; -; NOTE: IMPORTANT - CR7 contains a flag indicating if we have a block mapping or not. -; It is required to stay there until after we call mapSelSlot!!!! -; - - .align 5 - -hpfPteMiss: lwarx r0,0,r31 ; Load the mapping flag field - lwz r12,mpPte(r31) ; Get the quick pointer to PTE - li r3,mpHValid ; Get the PTE valid bit - andi. r2,r0,lo16(mpFIP) ; Are we handling a fault on the other side? - ori r2,r0,lo16(mpFIP) ; Set the fault in progress flag - crnot cr1_eq,cr0_eq ; Remember if FIP was on - and. r12,r12,r3 ; Isolate the valid bit - crorc cr0_eq,cr1_eq,cr0_eq ; Bail if FIP is on. Then, if already have PTE, bail... - beq-- hpfAbandon ; Yes, other processor is or already has handled this... - rlwinm r0,r2,0,mpType ; Isolate mapping type - cmplwi r0,mpBlock ; Is this a block mapping? - crnot cr7_eq,cr0_eq ; Remember if we have a block mapping - stwcx. r2,0,r31 ; Store the flags - bne-- hpfPteMiss ; Collision, try again... - - bt++ pf64Bitb,hpfBldPTE64 ; Skip down to the 64 bit stuff... - -; -; At this point we are about to do the 32-bit PTE generation. 
-; -; The following is the R14:R15 pair that contains the "shifted" VSID: -; -; 1 2 3 4 4 5 6 -; 0 8 6 4 2 0 8 6 3 -; +--------+--------+--------+--------+--------+--------+--------+--------+ -; |00000000|0000000V|VVVVVVVV|VVVVVVVV|VVVVVVVV|VVVVVVVV|VVVV////|////////| -; +--------+--------+--------+--------+--------+--------+--------+--------+ -; -; The 24 bits of the 32-bit architecture VSID is in the following: -; -; 1 2 3 4 4 5 6 -; 0 8 6 4 2 0 8 6 3 -; +--------+--------+--------+--------+--------+--------+--------+--------+ -; |////////|////////|////////|////VVVV|VVVVVVVV|VVVVVVVV|VVVV////|////////| -; +--------+--------+--------+--------+--------+--------+--------+--------+ -; - - -hpfBldPTE32: - lwz r25,mpVAddr+4(r31) ; Grab the base virtual address for the mapping (32-bit portion) - lwz r24,mpPAddr(r31) ; Grab the base physical page number for the mapping - - mfsdr1 r27 ; Get the hash table base address - - rlwinm r0,r23,0,4,19 ; Isolate just the page index - rlwinm r18,r23,10,26,31 ; Extract the API - xor r19,r15,r0 ; Calculate hash << 12 - mr r2,r25 ; Save the flag part of the mapping - rlwimi r18,r14,27,1,4 ; Move bits 28:31 of the "shifted" VSID into the PTE image - rlwinm r16,r27,16,7,15 ; Extract the hash table size - rlwinm r25,r25,0,0,19 ; Clear out the flags - slwi r24,r24,12 ; Change ppnum to physical address (note: 36-bit addressing no supported) - sub r25,r23,r25 ; Get offset in mapping to page (0 unless block map) - ori r16,r16,lo16(0xFFC0) ; Slap in the bottom of the mask - rlwinm r27,r27,0,0,15 ; Extract the hash table base - rlwinm r19,r19,26,6,25 ; Shift hash over to make offset into hash table - add r24,r24,r25 ; Adjust to true physical address - rlwimi r18,r15,27,5,24 ; Move bits 32:31 of the "shifted" VSID into the PTE image - rlwimi r24,r2,0,20,31 ; Slap in the WIMG and prot - and r19,r19,r16 ; Wrap hash table offset into the hash table - ori r24,r24,lo16(mpR) ; Turn on the reference bit right now - rlwinm r20,r19,28,10,29 ; Shift hash 
over to make offset into PCA - add r19,r19,r27 ; Point to the PTEG - subfic r20,r20,-4 ; Get negative offset to PCA - oris r18,r18,lo16(0x8000) ; Make sure the valid bit is on - add r20,r20,r27 ; Point to the PCA slot - -; -; We now have a valid PTE pair in R18/R24. R18 is PTE upper and R24 is PTE lower. -; R19 contains the offset of the PTEG in the hash table. R20 has offset into the PCA. -; -; We need to check PTE pointer (mpPte) again after we lock the PTEG. It is possible -; that some other processor beat us and stuck in a PTE or that -; all we had was a simple segment exception and the PTE was there the whole time. -; If we find one a pointer, we are done. -; - - mr r7,r20 ; Copy the PCA pointer - bl mapLockPteg ; Lock the PTEG - - lwz r12,mpPte(r31) ; Get the offset to the PTE - mr r17,r6 ; Remember the PCA image - mr r16,r6 ; Prime the post-select PCA image - andi. r0,r12,mpHValid ; Is there a PTE here already? - li r21,8 ; Get the number of slots - - bne- cr7,hpfNoPte32 ; Skip this for a block mapping... - - bne- hpfBailOut ; Someone already did this for us... - -; -; The mapSelSlot function selects a PTEG slot to use. As input, it uses R6 as a -; pointer to the PCA. When it returns, R3 contains 0 if an unoccupied slot was -; selected, 1 if it stole a non-block PTE, or 2 if it stole a block mapped PTE. -; R4 returns the slot index. -; -; REMEMBER: CR7 indicates that we are building a block mapping. -; - -hpfNoPte32: subic. r21,r21,1 ; See if we have tried all slots - mr r6,r17 ; Get back the original PCA - rlwimi r6,r16,0,8,15 ; Insert the updated steal slot - blt- hpfBailOut ; Holy Cow, all slots are locked... - - bl mapSelSlot ; Go select a slot (note that the PCA image is already set up) - - cmplwi cr5,r3,1 ; Did we steal a slot? - rlwimi r19,r4,3,26,28 ; Insert PTE index into PTEG address yielding PTE address - mr r16,r6 ; Remember the PCA image after selection - blt+ cr5,hpfInser32 ; Nope, no steal... 
- - lwz r6,0(r19) ; Get the old PTE - lwz r7,4(r19) ; Get the real part of the stealee - rlwinm r6,r6,0,1,31 ; Clear the valid bit - bgt cr5,hpfNipBM ; Do not try to lock a non-existant physent for a block mapping... - srwi r3,r7,12 ; Change phys address to a ppnum - bl mapFindPhyTry ; Go find and try to lock physent (note: if R3 is 0, there is no physent for this page) - cmplwi cr1,r3,0 ; Check if this is in RAM - bne- hpfNoPte32 ; Could not get it, try for another... - - crmove cr5_gt,cr1_eq ; If we did not find a physent, pretend that this is a block map - -hpfNipBM: stw r6,0(r19) ; Set the invalid PTE - - sync ; Make sure the invalid is stored - li r9,tlbieLock ; Get the TLBIE lock - rlwinm r10,r6,21,0,3 ; Shift last 4 bits of space to segment part - -hpfTLBIE32: lwarx r0,0,r9 ; Get the TLBIE lock - mfsprg r4,0 ; Get the per_proc - rlwinm r8,r6,25,18,31 ; Extract the space ID - rlwinm r11,r6,25,18,31 ; Extract the space ID - lwz r7,hwSteals(r4) ; Get the steal count - srwi r2,r6,7 ; Align segment number with hash - rlwimi r11,r11,14,4,17 ; Get copy above ourselves - mr. r0,r0 ; Is it locked? - srwi r0,r19,6 ; Align PTEG offset for back hash - xor r2,r2,r11 ; Get the segment number (plus a whole bunch of extra bits) - xor r11,r11,r0 ; Hash backwards to partial vaddr - rlwinm r12,r2,14,0,3 ; Shift segment up - mfsprg r2,2 ; Get feature flags - li r0,1 ; Get our lock word - rlwimi r12,r6,22,4,9 ; Move up the API - bne- hpfTLBIE32 ; It is locked, go wait... - rlwimi r12,r11,12,10,19 ; Move in the rest of the vaddr - - stwcx. r0,0,r9 ; Try to get it - bne- hpfTLBIE32 ; We was beat... - addi r7,r7,1 ; Bump the steal count - - rlwinm. r0,r2,0,pfSMPcapb,pfSMPcapb ; Can this be an MP box? - li r0,0 ; Lock clear value - - tlbie r12 ; Invalidate it everywhere - - - beq- hpfNoTS32 ; Can not have MP on this machine... 
- - eieio ; Make sure that the tlbie happens first - tlbsync ; Wait for everyone to catch up - sync ; Make sure of it all - -hpfNoTS32: stw r0,tlbieLock(0) ; Clear the tlbie lock - - stw r7,hwSteals(r4) ; Save the steal count - bgt cr5,hpfInser32 ; We just stole a block mapping... - - lwz r4,4(r19) ; Get the RC of the just invalidated PTE - - la r11,ppLink+4(r3) ; Point to the master RC copy - lwz r7,ppLink+4(r3) ; Grab the pointer to the first mapping - rlwinm r2,r4,27,ppRb-32,ppCb-32 ; Position the new RC - -hpfMrgRC32: lwarx r0,0,r11 ; Get the master RC - or r0,r0,r2 ; Merge in the new RC - stwcx. r0,0,r11 ; Try to stick it back - bne- hpfMrgRC32 ; Try again if we collided... - - -hpfFPnch: rlwinm. r7,r7,0,~ppFlags ; Clean and test mapping address - beq- hpfLostPhys ; We could not find our mapping. Kick the bucket... - - lhz r10,mpSpace(r7) ; Get the space - lwz r9,mpVAddr+4(r7) ; And the vaddr - cmplw cr1,r10,r8 ; Is this one of ours? - xor r9,r12,r9 ; Compare virtual address - cmplwi r9,0x1000 ; See if we really match - crand cr0_eq,cr1_eq,cr0_lt ; See if both space and vaddr match - beq+ hpfFPnch2 ; Yes, found ours... - - lwz r7,mpAlias+4(r7) ; Chain on to the next - b hpfFPnch ; Check it out... - -hpfFPnch2: sub r0,r19,r27 ; Get offset to the PTEG - stw r0,mpPte(r7) ; Invalidate the quick pointer (keep quick pointer pointing to PTEG) - bl mapPhysUnlock ; Unlock the physent now - -hpfInser32: oris r18,r18,lo16(0x8000) ; Make sure the valid bit is on - - stw r24,4(r19) ; Stuff in the real part of the PTE - eieio ; Make sure this gets there first - - stw r18,0(r19) ; Stuff the virtual part of the PTE and make it valid - mr r17,r16 ; Get the PCA image to save - b hpfFinish ; Go join the common exit code... - - -; -; At this point we are about to do the 64-bit PTE generation. 
-; -; The following is the R14:R15 pair that contains the "shifted" VSID: -; -; 1 2 3 4 4 5 6 -; 0 8 6 4 2 0 8 6 3 -; +--------+--------+--------+--------+--------+--------+--------+--------+ -; |00000000|0000000V|VVVVVVVV|VVVVVVVV|VVVVVVVV|VVVVVVVV|VVVV////|////////| -; +--------+--------+--------+--------+--------+--------+--------+--------+ -; -; - - .align 5 - -hpfBldPTE64: - ld r10,mpVAddr(r31) ; Grab the base virtual address for the mapping - lwz r24,mpPAddr(r31) ; Grab the base physical page number for the mapping - - mfsdr1 r27 ; Get the hash table base address - - sldi r11,r22,32 ; Slide top of adjusted EA over - sldi r14,r14,32 ; Slide top of VSID over - rlwinm r5,r27,0,27,31 ; Isolate the size - eqv r16,r16,r16 ; Get all foxes here - rlwimi r15,r23,16,20,24 ; Stick in EA[36:40] to make AVPN - mr r2,r10 ; Save the flag part of the mapping - or r11,r11,r23 ; Stick in bottom of adjusted EA for full 64-bit value - rldicr r27,r27,0,45 ; Clean up the hash table base - or r15,r15,r14 ; Stick in bottom of AVPN for full 64-bit value - rlwinm r0,r11,0,4,19 ; Clear out everything but the page - subfic r5,r5,46 ; Get number of leading zeros - xor r19,r0,r15 ; Calculate hash - ori r15,r15,1 ; Turn on valid bit in AVPN to make top of PTE - srd r16,r16,r5 ; Shift over to get length of table - srdi r19,r19,5 ; Convert page offset to hash table offset - rldicr r16,r16,0,56 ; Clean up lower bits in hash table size - rldicr r10,r10,0,51 ; Clear out flags - sldi r24,r24,12 ; Change ppnum to physical address - sub r11,r11,r10 ; Get the offset from the base mapping - and r19,r19,r16 ; Wrap into hash table - add r24,r24,r11 ; Get actual physical address of this page - srdi r20,r19,5 ; Convert PTEG offset to PCA offset - rldimi r24,r2,0,52 ; Insert the keys, WIMG, RC, etc. 
- subfic r20,r20,-4 ; Get negative offset to PCA - ori r24,r24,lo16(mpR) ; Force on the reference bit - add r20,r20,r27 ; Point to the PCA slot - add r19,r19,r27 ; Point to the PTEG - -; -; We now have a valid PTE pair in R15/R24. R15 is PTE upper and R24 is PTE lower. -; R19 contains the offset of the PTEG in the hash table. R20 has offset into the PCA. -; -; We need to check PTE pointer (mpPte) again after we lock the PTEG. It is possible -; that some other processor beat us and stuck in a PTE or that -; all we had was a simple segment exception and the PTE was there the whole time. -; If we find one a pointer, we are done. -; - - mr r7,r20 ; Copy the PCA pointer - bl mapLockPteg ; Lock the PTEG - - lwz r12,mpPte(r31) ; Get the offset to the PTE - mr r17,r6 ; Remember the PCA image - mr r18,r6 ; Prime post-selection PCA image - andi. r0,r12,mpHValid ; See if we have a PTE now - li r21,8 ; Get the number of slots - - bne-- cr7,hpfNoPte64 ; Skip this for a block mapping... - - bne-- hpfBailOut ; Someone already did this for us... - -; -; The mapSelSlot function selects a PTEG slot to use. As input, it uses R3 as a -; pointer to the PCA. When it returns, R3 contains 0 if an unoccupied slot was -; selected, 1 if it stole a non-block PTE, or 2 if it stole a block mapped PTE. -; R4 returns the slot index. -; -; REMEMBER: CR7 indicates that we are building a block mapping. -; - -hpfNoPte64: subic. r21,r21,1 ; See if we have tried all slots - mr r6,r17 ; Restore original state of PCA - rlwimi r6,r18,0,8,15 ; Insert the updated steal slot - blt- hpfBailOut ; Holy Cow, all slots are locked... - - bl mapSelSlot ; Go select a slot - - cmplwi cr5,r3,1 ; Did we steal a slot? - mr r18,r6 ; Remember the PCA image after selection - insrdi r19,r4,3,57 ; Insert slot index into PTEG address bits 57:59, forming the PTE address - lwz r10,hwSteals(r2) ; Get the steal count - blt++ cr5,hpfInser64 ; Nope, no steal... 
- - ld r6,0(r19) ; Get the old PTE - ld r7,8(r19) ; Get the real part of the stealee - rldicr r6,r6,0,62 ; Clear the valid bit - bgt cr5,hpfNipBMx ; Do not try to lock a non-existant physent for a block mapping... - srdi r3,r7,12 ; Change page address to a page address - bl mapFindPhyTry ; Go find and try to lock physent (note: if R3 is 0, there is no physent for this page) - cmplwi cr1,r3,0 ; Check if this is in RAM - bne-- hpfNoPte64 ; Could not get it, try for another... - - crmove cr5_gt,cr1_eq ; If we did not find a physent, pretend that this is a block map - -hpfNipBMx: std r6,0(r19) ; Set the invalid PTE - li r9,tlbieLock ; Get the TLBIE lock - - srdi r11,r6,5 ; Shift VSID over for back hash - mfsprg r4,0 ; Get the per_proc - xor r11,r11,r19 ; Hash backwards to get low bits of VPN - sync ; Make sure the invalid is stored - - sldi r12,r6,16 ; Move AVPN to EA position - sldi r11,r11,5 ; Move this to the page position - -hpfTLBIE64: lwarx r0,0,r9 ; Get the TLBIE lock - mr. r0,r0 ; Is it locked? - li r0,1 ; Get our lock word - bne-- hpfTLBIE65 ; It is locked, go wait... - - stwcx. r0,0,r9 ; Try to get it - rldimi r12,r11,0,41 ; Stick the low part of the page number into the AVPN - rldicl r8,r6,52,50 ; Isolate the address space ID - bne-- hpfTLBIE64 ; We was beat... - addi r10,r10,1 ; Bump the steal count - - rldicl r11,r12,0,16 ; Clear cause the book says so - li r0,0 ; Lock clear value - - tlbie r11 ; Invalidate it everywhere - - mr r7,r8 ; Get a copy of the space ID - eieio ; Make sure that the tlbie happens first - rldimi r7,r7,14,36 ; Copy address space to make hash value - tlbsync ; Wait for everyone to catch up - rldimi r7,r7,28,22 ; Add in a 3rd copy of the hash up top - srdi r2,r6,26 ; Shift original segment down to bottom - - ptesync ; Make sure of it all - xor r7,r7,r2 ; Compute original segment - stw r0,tlbieLock(0) ; Clear the tlbie lock - - stw r10,hwSteals(r4) ; Save the steal count - bgt cr5,hpfInser64 ; We just stole a block mapping... 
- - rldimi r12,r7,28,0 ; Insert decoded segment - rldicl r4,r12,0,13 ; Trim to max supported address - - ld r12,8(r19) ; Get the RC of the just invalidated PTE - - la r11,ppLink+4(r3) ; Point to the master RC copy - ld r7,ppLink(r3) ; Grab the pointer to the first mapping - rlwinm r2,r12,27,ppRb-32,ppCb-32 ; Position the new RC - -hpfMrgRC64: lwarx r0,0,r11 ; Get the master RC - li r12,ppLFAmask ; Get mask to clean up alias pointer - or r0,r0,r2 ; Merge in the new RC - rotrdi r12,r12,ppLFArrot ; Rotate clean up mask to get 0xF0000000000000000F - stwcx. r0,0,r11 ; Try to stick it back - bne-- hpfMrgRC64 ; Try again if we collided... - -hpfFPnchx: andc. r7,r7,r12 ; Clean and test mapping address - beq-- hpfLostPhys ; We could not find our mapping. Kick the bucket... - - lhz r10,mpSpace(r7) ; Get the space - ld r9,mpVAddr(r7) ; And the vaddr - cmplw cr1,r10,r8 ; Is this one of ours? - xor r9,r4,r9 ; Compare virtual address - cmpldi r9,0x1000 ; See if we really match - crand cr0_eq,cr1_eq,cr0_lt ; See if both space and vaddr match - beq++ hpfFPnch2x ; Yes, found ours... - - ld r7,mpAlias(r7) ; Chain on to the next - b hpfFPnchx ; Check it out... - - .align 5 - -hpfTLBIE65: li r7,lgKillResv ; Point to the reservatio kill area - stwcx. r7,0,r7 ; Kill reservation - -hpfTLBIE63: lwz r0,0(r9) ; Get the TLBIE lock - mr. r0,r0 ; Is it locked? - beq++ hpfTLBIE64 ; Yup, wait for it... - b hpfTLBIE63 ; Nope, try again.. - - - -hpfFPnch2x: sub r0,r19,r27 ; Get offset to PTEG - stw r0,mpPte(r7) ; Invalidate the quick pointer (keep pointing at PTEG though) - bl mapPhysUnlock ; Unlock the physent now - - -hpfInser64: std r24,8(r19) ; Stuff in the real part of the PTE - eieio ; Make sure this gets there first - std r15,0(r19) ; Stuff the virtual part of the PTE and make it valid - mr r17,r18 ; Get the PCA image to set - b hpfFinish ; Go join the common exit code... 
- -hpfLostPhys: - lis r0,hi16(Choke) ; System abend - we must find the stolen mapping or we are dead - ori r0,r0,lo16(Choke) ; System abend - sc - -; -; This is the common code we execute when we are finished setting up the PTE. -; - - .align 5 - -hpfFinish: sub r4,r19,r27 ; Get offset of PTE - ori r4,r4,lo16(mpHValid) ; Add valid bit to PTE offset - bne cr7,hpfBailOut ; Do not set the PTE pointer for a block map - stw r4,mpPte(r31) ; Remember our PTE - -hpfBailOut: eieio ; Make sure all updates come first - stw r17,0(r20) ; Unlock and set the final PCA - -; -; This is where we go if we have started processing the fault, but find that someone -; else has taken care of it. -; - -hpfIgnore: lwz r2,mpFlags(r31) ; Get the mapping flags - rlwinm r2,r2,0,mpFIPb+1,mpFIPb-1 ; Clear the "fault in progress" flag - sth r2,mpFlags+2(r31) ; Set it - - la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkUnlock ; Unlock the search list - - li r11,T_IN_VAIN ; Say that it was handled - b EXT(PFSExit) ; Leave... - -; -; This is where we go when we find that someone else -; is in the process of handling the fault. -; - -hpfAbandon: li r3,lgKillResv ; Kill off any reservation - stwcx. r3,0,r3 ; Do it - - la r3,pmapSXlk(r28) ; Point to the pmap search lock - bl sxlkUnlock ; Unlock the search list - - li r11,T_IN_VAIN ; Say that it was handled - b EXT(PFSExit) ; Leave... - -; -; Guest shadow assist -- page fault handler -; -; Here we handle a fault in a guest pmap that has the guest shadow mapping -; assist active. We locate the VMM pmap extension block, which contains an -; index over the discontiguous multi-page shadow hash table. The index -; corresponding to our vaddr is selected, and the selected group within -; that page is searched for a valid and active entry that contains -; our vaddr and space id. The search is pipelined, so that we may fetch -; the next slot while examining the current slot for a hit. 
The final -; search iteration is unrolled so that we don't fetch beyond the end of -; our group, which could have dire consequences depending upon where the -; physical hash page is located. -; -; The VMM pmap extension block occupies a page. Begining at offset 0, we -; have the pmap_vmm_ext proper. Aligned at the first 128-byte boundary -; after the pmap_vmm_ext is the hash table physical address index, a -; linear list of 64-bit physical addresses of the pages that comprise -; the hash table. -; -; In the event that we succesfully locate a guest mapping, we re-join -; the page fault path at hpfGVfound with the mapping's address in r31; -; otherwise, we re-join at hpfNotFound. In either case, we re-join holding -; a share of the pmap search lock for the host pmap with the host pmap's -; address in r28, the guest pmap's space id in r21, and the guest pmap's -; flags in r12. -; - - .align 5 -hpfGVxlate: - bt pf64Bitb,hpfGV64 ; Take 64-bit path for 64-bit machine - - lwz r11,pmapVmmExtPhys+4(r28) ; r11 <- VMM pmap extension block paddr - lwz r12,pmapFlags(r28) ; r12 <- guest pmap's flags - lwz r21,pmapSpace(r28) ; r21 <- guest space ID number - lwz r28,vmxHostPmapPhys+4(r11) ; r28 <- host pmap's paddr - la r31,VMX_HPIDX_OFFSET(r11) ; r31 <- base of hash page physical index - rlwinm r10,r30,0,0xFFFFF000 ; r10 <- page-aligned guest vaddr - lwz r6,vxsGpf(r11) ; Get guest fault count - - srwi r3,r10,12 ; Form shadow hash: - xor r3,r3,r21 ; spaceID ^ (vaddr >> 12) - rlwinm r4,r3,GV_HPAGE_SHIFT,GV_HPAGE_MASK - ; Form index offset from hash page number - add r31,r31,r4 ; r31 <- hash page index entry - lwz r31,4(r31) ; r31 <- hash page paddr - rlwimi r31,r3,GV_HGRP_SHIFT,GV_HGRP_MASK - ; r31 <- hash group paddr - - la r3,pmapSXlk(r28) ; Point to the host pmap's search lock - bl sxlkShared ; Go get a shared lock on the mapping lists - mr. r3,r3 ; Did we get the lock? - bne- hpfBadLock ; Nope... 
- - lwz r3,mpFlags(r31) ; r3 <- 1st mapping slot's flags - lhz r4,mpSpace(r31) ; r4 <- 1st mapping slot's space ID - lwz r5,mpVAddr+4(r31) ; r5 <- 1st mapping slot's virtual address - addi r6,r6,1 ; Increment guest fault count - li r0,(GV_SLOTS - 1) ; Prepare to iterate over mapping slots - mtctr r0 ; in this group - stw r6,vxsGpf(r11) ; Update guest fault count - b hpfGVlp32 - - .align 5 -hpfGVlp32: - mr r6,r3 ; r6 <- current mapping slot's flags - lwz r3,mpFlags+GV_SLOT_SZ(r31) ; r3 <- next mapping slot's flags - mr r7,r4 ; r7 <- current mapping slot's space ID - lhz r4,mpSpace+GV_SLOT_SZ(r31) ; r4 <- next mapping slot's space ID - clrrwi r8,r5,12 ; r8 <- current mapping slot's virtual addr w/o flags - lwz r5,mpVAddr+4+GV_SLOT_SZ(r31); r5 <- next mapping slot's virtual addr - andi. r6,r6,mpgFree+mpgDormant ; Isolate guest free and dormant flags - xor r7,r7,r21 ; Compare space ID - or r0,r6,r7 ; r0 <- !(!free && !dormant && space match) - xor r8,r8,r10 ; Compare virtual address - or. r0,r0,r8 ; cr0_eq <- !free && !dormant && space match && virtual addr match - beq hpfGVfound ; Join common patch on hit (r31 points to mapping) - - addi r31,r31,GV_SLOT_SZ ; r31 <- next mapping slot - bdnz hpfGVlp32 ; Iterate - - clrrwi r5,r5,12 ; Remove flags from virtual address - andi. r3,r3,mpgFree+mpgDormant ; Isolate guest free and dormant flag - xor r4,r4,r21 ; Compare space ID - or r0,r3,r4 ; r0 <- !(!free && !dormant && space match) - xor r5,r5,r10 ; Compare virtual address - or. 
r0,r0,r5 ; cr0_eq <- !free && !dormant && space match && virtual addr match - beq hpfGVfound ; Join common patch on hit (r31 points to mapping) - - b hpfGVmiss - - .align 5 -hpfGV64: - ld r11,pmapVmmExtPhys(r28) ; r11 <- VMM pmap extension block paddr - lwz r12,pmapFlags(r28) ; r12 <- guest pmap's flags - lwz r21,pmapSpace(r28) ; r21 <- guest space ID number - ld r28,vmxHostPmapPhys(r11) ; r28 <- host pmap's paddr - la r31,VMX_HPIDX_OFFSET(r11) ; r31 <- base of hash page physical index - rlwinm r10,r30,0,0xFFFFF000 ; Form 64-bit guest vaddr - rldimi r10,r29,32,0 ; cleaning up low-order 12 bits - lwz r6,vxsGpf(r11) ; Get guest fault count - - srwi r3,r10,12 ; Form shadow hash: - xor r3,r3,r21 ; spaceID ^ (vaddr >> 12) - rlwinm r4,r3,GV_HPAGE_SHIFT,GV_HPAGE_MASK - ; Form index offset from hash page number - add r31,r31,r4 ; r31 <- hash page index entry - ld r31,0(r31) ; r31 <- hash page paddr - insrdi r31,r3,GV_GRPS_PPG_LG2,64-(GV_HGRP_SHIFT+GV_GRPS_PPG_LG2) - ; r31 <- hash group paddr - - la r3,pmapSXlk(r28) ; Point to the host pmap's search lock - bl sxlkShared ; Go get a shared lock on the mapping lists - mr. r3,r3 ; Did we get the lock? - bne-- hpfBadLock ; Nope... - - lwz r3,mpFlags(r31) ; r3 <- 1st mapping slot's flags - lhz r4,mpSpace(r31) ; r4 <- 1st mapping slot's space ID - ld r5,mpVAddr(r31) ; r5 <- 1st mapping slot's virtual address - addi r6,r6,1 ; Increment guest fault count - li r0,(GV_SLOTS - 1) ; Prepare to iterate over mapping slots - mtctr r0 ; in this group - stw r6,vxsGpf(r11) ; Update guest fault count - b hpfGVlp64 - - .align 5 -hpfGVlp64: - mr r6,r3 ; r6 <- current mapping slot's flags - lwz r3,mpFlags+GV_SLOT_SZ(r31) ; r3 <- next mapping slot's flags - mr r7,r4 ; r7 <- current mapping slot's space ID - lhz r4,mpSpace+GV_SLOT_SZ(r31) ; r4 <- next mapping slot's space ID - clrrdi r8,r5,12 ; r8 <- current mapping slot's virtual addr w/o flags - ld r5,mpVAddr+GV_SLOT_SZ(r31) ; r5 <- next mapping slot's virtual addr - andi. 
r6,r6,mpgFree+mpgDormant ; Isolate guest free and dormant flag - xor r7,r7,r21 ; Compare space ID - or r0,r6,r7 ; r0 <- !(!free && !dormant && space match) - xor r8,r8,r10 ; Compare virtual address - or. r0,r0,r8 ; cr0_eq <- !free && !dormant && space match && virtual addr match - beq hpfGVfound ; Join common path on hit (r31 points to mapping) - - addi r31,r31,GV_SLOT_SZ ; r31 <- next mapping slot - bdnz hpfGVlp64 ; Iterate - - clrrdi r5,r5,12 ; Remove flags from virtual address - andi. r3,r3,mpgFree+mpgDormant ; Isolate guest free and dormant flag - xor r4,r4,r21 ; Compare space ID - or r0,r3,r4 ; r0 <- !(!free && !dormant && space match) - xor r5,r5,r10 ; Compare virtual address - or. r0,r0,r5 ; cr0_eq <- !free && !dormant && space match && virtual addr match - beq hpfGVfound ; Join common path on hit (r31 points to mapping) - -hpfGVmiss: - lwz r6,vxsGpfMiss(r11) ; Guest guest fault miss count - addi r6,r6,1 ; Increment miss count - stw r6,vxsGpfMiss(r11) ; Update guest fault miss count - b hpfNotFound - -/* - * hw_set_user_space(pmap) - * hw_set_user_space_dis(pmap) - * - * Indicate whether memory space needs to be switched. - * We really need to turn off interrupts here, because we need to be non-preemptable - * - * hw_set_user_space_dis is used when interruptions are already disabled. Mind the - * register usage here. The VMM switch code in vmachmon.s that calls this - * know what registers are in use. Check that if these change. 
- */ - - - - .align 5 - .globl EXT(hw_set_user_space) - -LEXT(hw_set_user_space) - - lis r8,hi16(MASK(MSR_VEC)) ; Get the vector enable - mfmsr r10 ; Get the current MSR - ori r8,r8,lo16(MASK(MSR_FP)) ; Add in FP - ori r9,r8,lo16(MASK(MSR_EE)) ; Add in the EE - andc r10,r10,r8 ; Turn off VEC, FP for good - andc r9,r10,r9 ; Turn off EE also - mtmsr r9 ; Disable them - isync ; Make sure FP and vec are off - mfsprg r6,1 ; Get the current activation - lwz r6,ACT_PER_PROC(r6) ; Get the per_proc block - lwz r2,ppUserPmapVirt(r6) ; Get our virtual pmap address - mfsprg r4,2 ; The the feature flags - lwz r7,pmapvr(r3) ; Get the v to r translation - lwz r8,pmapvr+4(r3) ; Get the v to r translation - mtcrf 0x80,r4 ; Get the Altivec flag - xor r4,r3,r8 ; Get bottom of the real address of bmap anchor - cmplw cr1,r3,r2 ; Same address space as before? - stw r7,ppUserPmap(r6) ; Show our real pmap address - crorc cr1_eq,cr1_eq,pfAltivecb ; See if same address space or not altivec machine - stw r4,ppUserPmap+4(r6) ; Show our real pmap address - stw r3,ppUserPmapVirt(r6) ; Show our virtual pmap address - mtmsr r10 ; Restore interruptions - beqlr-- cr1 ; Leave if the same address space or not Altivec - - dssall ; Need to kill all data streams if adrsp changed - sync - blr ; Return... - - .align 5 - .globl EXT(hw_set_user_space_dis) - -LEXT(hw_set_user_space_dis) - - lwz r7,pmapvr(r3) ; Get the v to r translation - mfsprg r4,2 ; The the feature flags - lwz r8,pmapvr+4(r3) ; Get the v to r translation - mfsprg r6,1 ; Get the current activation - lwz r6,ACT_PER_PROC(r6) ; Get the per_proc block - lwz r2,ppUserPmapVirt(r6) ; Get our virtual pmap address - mtcrf 0x80,r4 ; Get the Altivec flag - xor r4,r3,r8 ; Get bottom of the real address of bmap anchor - cmplw cr1,r3,r2 ; Same address space as before? 
- stw r7,ppUserPmap(r6) ; Show our real pmap address - crorc cr1_eq,cr1_eq,pfAltivecb ; See if same address space or not altivec machine - stw r4,ppUserPmap+4(r6) ; Show our real pmap address - stw r3,ppUserPmapVirt(r6) ; Show our virtual pmap address - beqlr-- cr1 ; Leave if the same - - dssall ; Need to kill all data streams if adrsp changed - sync - blr ; Return... - -/* int mapalc1(struct mappingblok *mb) - Finds, allocates, and zeros a free 1-bit mapping entry - * - * Lock must already be held on mapping block list - * returns 0 if all slots filled. - * returns n if a slot is found and it is not the last - * returns -n if a slot is found and it is the last - * when n and -n are returned, the corresponding bit is cleared - * the mapping is zeroed out before return - * - */ - - .align 5 - .globl EXT(mapalc1) - -LEXT(mapalc1) - lwz r4,mbfree(r3) ; Get the 1st mask - lis r0,0x8000 ; Get the mask to clear the first free bit - lwz r5,mbfree+4(r3) ; Get the 2nd mask - mr r12,r3 ; Save the block ptr - cntlzw r3,r4 ; Get first 1-bit in 1st word - srw. r9,r0,r3 ; Get bit corresponding to first free one - cntlzw r10,r5 ; Get first free field in second word - andc r4,r4,r9 ; Turn 1-bit off in 1st word - bne mapalc1f ; Found one in 1st word - - srw. r9,r0,r10 ; Get bit corresponding to first free one in 2nd word - li r3,0 ; assume failure return - andc r5,r5,r9 ; Turn it off - beqlr-- ; There are no 1 bits left... - addi r3,r10,32 ; set the correct number - -mapalc1f: - or. r0,r4,r5 ; any more bits set? 
- stw r4,mbfree(r12) ; update bitmasks - stw r5,mbfree+4(r12) - - slwi r6,r3,6 ; get (n * mpBasicSize), ie offset of mapping in block - addi r7,r6,32 - dcbz r6,r12 ; clear the 64-byte mapping - dcbz r7,r12 - - bnelr++ ; return if another bit remains set - - neg r3,r3 ; indicate we just returned the last bit - blr - - -/* int mapalc2(struct mappingblok *mb) - Finds, allocates, and zero's a free 2-bit mapping entry - * - * Lock must already be held on mapping block list - * returns 0 if all slots filled. - * returns n if a slot is found and it is not the last - * returns -n if a slot is found and it is the last - * when n and -n are returned, the corresponding bits are cleared - * We find runs of 2 consecutive 1 bits by cntlzw(n & (n<<1)). - * the mapping is zero'd out before return - */ - - .align 5 - .globl EXT(mapalc2) -LEXT(mapalc2) - lwz r4,mbfree(r3) ; Get the first mask - lis r0,0x8000 ; Get the mask to clear the first free bit - lwz r5,mbfree+4(r3) ; Get the second mask - mr r12,r3 ; Save the block ptr - slwi r6,r4,1 ; shift first word over - and r6,r4,r6 ; lite start of double bit runs in 1st word - slwi r7,r5,1 ; shift 2nd word over - cntlzw r3,r6 ; Get first free 2-bit run in 1st word - and r7,r5,r7 ; lite start of double bit runs in 2nd word - srw. r9,r0,r3 ; Get bit corresponding to first run in 1st word - cntlzw r10,r7 ; Get first free field in second word - srwi r11,r9,1 ; shift over for 2nd bit in 1st word - andc r4,r4,r9 ; Turn off 1st bit in 1st word - andc r4,r4,r11 ; turn off 2nd bit in 1st word - bne mapalc2a ; Found two consecutive free bits in 1st word - - srw. r9,r0,r10 ; Get bit corresponding to first free one in second word - li r3,0 ; assume failure - srwi r11,r9,1 ; get mask for 2nd bit - andc r5,r5,r9 ; Turn off 1st bit in 2nd word - andc r5,r5,r11 ; turn off 2nd bit in 2nd word - beq-- mapalc2c ; There are no runs of 2 bits in 2nd word either - addi r3,r10,32 ; set the correct number - -mapalc2a: - or. r0,r4,r5 ; any more bits set? 
- stw r4,mbfree(r12) ; update bitmasks - stw r5,mbfree+4(r12) - slwi r6,r3,6 ; get (n * mpBasicSize), ie offset of mapping in block - addi r7,r6,32 - addi r8,r6,64 - addi r9,r6,96 - dcbz r6,r12 ; zero out the 128-byte mapping - dcbz r7,r12 ; we use the slow 32-byte dcbz even on 64-bit machines - dcbz r8,r12 ; because the mapping may not be 128-byte aligned - dcbz r9,r12 - - bnelr++ ; return if another bit remains set - - neg r3,r3 ; indicate we just returned the last bit - blr - -mapalc2c: - rlwinm r7,r5,1,31,31 ; move bit 0 of 2nd word to bit 31 - and. r0,r4,r7 ; is the 2-bit field that spans the 2 words free? - beqlr ; no, we failed - rlwinm r4,r4,0,0,30 ; yes, turn off bit 31 of 1st word - rlwinm r5,r5,0,1,31 ; turn off bit 0 of 2nd word - li r3,31 ; get index of this field - b mapalc2a - - -; -; This routine initialzes the hash table and PCA. -; It is done here because we may need to be 64-bit to do it. -; - - .align 5 - .globl EXT(hw_hash_init) - -LEXT(hw_hash_init) - - mfsprg r10,2 ; Get feature flags - lis r12,hi16(EXT(hash_table_size)) ; Get hash table size address - mtcrf 0x02,r10 ; move pf64Bit to cr6 - lis r11,hi16(EXT(hash_table_base)) ; Get hash table base address - lis r4,0xFF01 ; Set all slots free and start steal at end - ori r12,r12,lo16(EXT(hash_table_size)) ; Get hash table size address - ori r11,r11,lo16(EXT(hash_table_base)) ; Get hash table base address - - lwz r12,0(r12) ; Get hash table size - li r3,0 ; Get start - bt++ pf64Bitb,hhiSF ; skip if 64-bit (only they take the hint) - - lwz r11,4(r11) ; Get hash table base - -hhiNext32: cmplw r3,r12 ; Have we reached the end? - bge- hhiCPCA32 ; Yes... - dcbz r3,r11 ; Clear the line - addi r3,r3,32 ; Next one... - b hhiNext32 ; Go on... - -hhiCPCA32: rlwinm r12,r12,28,4,29 ; Get number of slots * 4 - li r3,-4 ; Displacement to first PCA entry - neg r12,r12 ; Get negative end of PCA - -hhiNPCA32: stwx r4,r3,r11 ; Initialize the PCA entry - subi r3,r3,4 ; Next slot - cmpw r3,r12 ; Have we finished? 
- bge+ hhiNPCA32 ; Not yet... - blr ; Leave... - -hhiSF: mfmsr r9 ; Save the MSR - li r8,1 ; Get a 1 - mr r0,r9 ; Get a copy of the MSR - ld r11,0(r11) ; Get hash table base - rldimi r0,r8,63,MSR_SF_BIT ; Set SF bit (bit 0) - mtmsrd r0 ; Turn on SF - isync - - -hhiNext64: cmpld r3,r12 ; Have we reached the end? - bge-- hhiCPCA64 ; Yes... - dcbz128 r3,r11 ; Clear the line - addi r3,r3,128 ; Next one... - b hhiNext64 ; Go on... - -hhiCPCA64: rlwinm r12,r12,27,5,29 ; Get number of slots * 4 - li r3,-4 ; Displacement to first PCA entry - neg r12,r12 ; Get negative end of PCA - -hhiNPCA64: stwx r4,r3,r11 ; Initialize the PCA entry - subi r3,r3,4 ; Next slot - cmpd r3,r12 ; Have we finished? - bge++ hhiNPCA64 ; Not yet... - - mtmsrd r9 ; Turn off SF if it was off - isync - blr ; Leave... - - -; -; This routine sets up the hardware to start translation. -; Note that we do NOT start translation. -; - - .align 5 - .globl EXT(hw_setup_trans) - -LEXT(hw_setup_trans) - - mfsprg r11,0 ; Get the per_proc block - mfsprg r12,2 ; Get feature flags - li r0,0 ; Get a 0 - li r2,1 ; And a 1 - mtcrf 0x02,r12 ; Move pf64Bit to cr6 - stw r0,validSegs(r11) ; Make sure we think all SR/STEs are invalid - stw r0,validSegs+4(r11) ; Make sure we think all SR/STEs are invalid, part deux - sth r2,ppInvSeg(r11) ; Force a reload of the SRs - sth r0,ppCurSeg(r11) ; Set that we are starting out in kernel - - bt++ pf64Bitb,hstSF ; skip if 64-bit (only they take the hint) - - li r9,0 ; Clear out a register - sync - isync - mtdbatu 0,r9 ; Invalidate maps - mtdbatl 0,r9 ; Invalidate maps - mtdbatu 1,r9 ; Invalidate maps - mtdbatl 1,r9 ; Invalidate maps - mtdbatu 2,r9 ; Invalidate maps - mtdbatl 2,r9 ; Invalidate maps - mtdbatu 3,r9 ; Invalidate maps - mtdbatl 3,r9 ; Invalidate maps - - mtibatu 0,r9 ; Invalidate maps - mtibatl 0,r9 ; Invalidate maps - mtibatu 1,r9 ; Invalidate maps - mtibatl 1,r9 ; Invalidate maps - mtibatu 2,r9 ; Invalidate maps - mtibatl 2,r9 ; Invalidate maps - mtibatu 3,r9 ; 
Invalidate maps - mtibatl 3,r9 ; Invalidate maps - - lis r11,hi16(EXT(hash_table_base)) ; Get hash table base address - lis r12,hi16(EXT(hash_table_size)) ; Get hash table size address - ori r11,r11,lo16(EXT(hash_table_base)) ; Get hash table base address - ori r12,r12,lo16(EXT(hash_table_size)) ; Get hash table size address - lwz r11,4(r11) ; Get hash table base - lwz r12,0(r12) ; Get hash table size - subi r12,r12,1 ; Back off by 1 - rlwimi r11,r12,16,23,31 ; Stick the size into the sdr1 image - - mtsdr1 r11 ; Ok, we now have the hash table set up - sync - - li r12,invalSpace ; Get the invalid segment value - li r10,0 ; Start low - -hstsetsr: mtsrin r12,r10 ; Set the SR - addis r10,r10,0x1000 ; Bump the segment - mr. r10,r10 ; Are we finished? - bne+ hstsetsr ; Nope... - sync - blr ; Return... - -; -; 64-bit version -; - -hstSF: lis r11,hi16(EXT(hash_table_base)) ; Get hash table base address - lis r12,hi16(EXT(hash_table_size)) ; Get hash table size address - ori r11,r11,lo16(EXT(hash_table_base)) ; Get hash table base address - ori r12,r12,lo16(EXT(hash_table_size)) ; Get hash table size address - ld r11,0(r11) ; Get hash table base - lwz r12,0(r12) ; Get hash table size - cntlzw r10,r12 ; Get the number of bits - subfic r10,r10,13 ; Get the extra bits we need - or r11,r11,r10 ; Add the size field to SDR1 - - mtsdr1 r11 ; Ok, we now have the hash table set up - sync - - li r0,0 ; Set an SLB slot index of 0 - slbia ; Trash all SLB entries (except for entry 0 that is) - slbmfee r7,r0 ; Get the entry that is in SLB index 0 - rldicr r7,r7,0,35 ; Clear the valid bit and the rest - slbie r7 ; Invalidate it - - blr ; Return... - - -; -; This routine turns on translation for the first time on a processor -; - - .align 5 - .globl EXT(hw_start_trans) - -LEXT(hw_start_trans) - - - mfmsr r10 ; Get the msr - ori r10,r10,lo16(MASK(MSR_IR) | MASK(MSR_DR)) ; Turn on translation - - mtmsr r10 ; Everything falls apart here - isync - - blr ; Back to it. 
- - - -; -; This routine validates a segment register. -; hw_map_seg(pmap_t pmap, addr64_t seg, addr64_t va) -; -; r3 = virtual pmap -; r4 = segment[0:31] -; r5 = segment[32:63] -; r6 = va[0:31] -; r7 = va[32:63] -; -; Note that we transform the addr64_t (long long) parameters into single 64-bit values. -; Note that there is no reason to apply the key modifier here because this is only -; used for kernel accesses. -; - - .align 5 - .globl EXT(hw_map_seg) - -LEXT(hw_map_seg) - - lwz r0,pmapSpace(r3) ; Get the space, we will need it soon - lwz r9,pmapFlags(r3) ; Get the flags for the keys now - mfsprg r10,2 ; Get feature flags - -; -; Note: the following code would problably be easier to follow if I split it, -; but I just wanted to see if I could write this to work on both 32- and 64-bit -; machines combined. -; - -; -; Here we enter with va[0:31] in r6[0:31] (or r6[32:63] on 64-bit machines) -; and va[32:63] in r7[0:31] (or r7[32:63] on 64-bit machines) - - rlwinm r4,r4,0,1,0 ; Copy seg[0:31] into r4[0;31] - no-op for 32-bit - rlwinm r7,r7,18,14,17 ; Slide va[32:35] east to just west of space ID - mtcrf 0x02,r10 ; Move pf64Bit and pfNoMSRirb to cr5 and 6 - srwi r8,r6,14 ; Slide va[0:17] east to just west of the rest - rlwimi r7,r6,18,0,13 ; Slide va[18:31] east to just west of slid va[32:25] - rlwimi r0,r0,14,4,17 ; Dup address space ID above itself - rlwinm r8,r8,0,1,0 ; Dup low part into high (does nothing on 32-bit machines) - rlwinm r2,r0,28,0,31 ; Rotate rotate low nybble to top of low half - rlwimi r2,r2,0,1,0 ; Replicate bottom 32 into top 32 - rlwimi r8,r7,0,0,31 ; Join va[0:17] with va[18:35] (just like mr on 32-bit machines) - - rlwimi r2,r0,0,4,31 ; We should now have 4 copies of the space - ; concatenated together. There is garbage - ; at the top for 64-bit but we will clean - ; that out later. 
- rlwimi r4,r5,0,0,31 ; Copy seg[32:63] into r4[32:63] - just like mr for 32-bit - - -; -; Here we exit with va[0:35] shifted into r8[14:51], zeros elsewhere, or -; va[18:35] shifted into r8[0:17], zeros elsewhere on 32-bit machines -; - -; -; What we have now is: -; -; 0 0 1 2 3 4 4 5 6 -; 0 8 6 4 2 0 8 6 3 - for 64-bit machines -; +--------+--------+--------+--------+--------+--------+--------+--------+ -; r2 = |xxxx0000|AAAAAAAA|AAAAAABB|BBBBBBBB|BBBBCCCC|CCCCCCCC|CCDDDDDD|DDDDDDDD| - hash value -; +--------+--------+--------+--------+--------+--------+--------+--------+ -; 0 0 1 2 3 - for 32-bit machines -; 0 8 6 4 1 -; -; 0 0 1 2 3 4 4 5 6 -; 0 8 6 4 2 0 8 6 3 - for 64-bit machines -; +--------+--------+--------+--------+--------+--------+--------+--------+ -; r8 = |00000000|000000SS|SSSSSSSS|SSSSSSSS|SSSSSSSS|SSSSSSSS|SS000000|00000000| - shifted and cleaned EA -; +--------+--------+--------+--------+--------+--------+--------+--------+ -; 0 0 1 2 3 - for 32-bit machines -; 0 8 6 4 1 -; -; 0 0 1 2 3 4 4 5 6 -; 0 8 6 4 2 0 8 6 3 - for 64-bit machines -; +--------+--------+--------+--------+--------+--------+--------+--------+ -; r4 = |SSSSSSSS|SSSSSSSS|SSSSSSSS|SSSSSSSS|SSSS0000|00000000|00000000|00000000| - Segment -; +--------+--------+--------+--------+--------+--------+--------+--------+ -; 0 0 1 2 3 - for 32-bit machines -; 0 8 6 4 1 - - - xor r8,r8,r2 ; Calculate VSID - - bf-- pf64Bitb,hms32bit ; Skip out if 32-bit... 
- mfsprg r12,0 ; Get the per_proc - li r0,1 ; Prepare to set bit 0 (also to clear EE) - mfmsr r6 ; Get current MSR - li r2,MASK(MSR_IR)|MASK(MSR_DR) ; Get the translation bits - mtmsrd r0,1 ; Set only the EE bit to 0 - rlwinm r6,r6,0,MSR_EE_BIT,MSR_EE_BIT ; See if EE bit is on - mfmsr r11 ; Get the MSR right now, after disabling EE - andc r2,r11,r2 ; Turn off translation now - rldimi r2,r0,63,0 ; Get bit 64-bit turned on - or r11,r11,r6 ; Turn on the EE bit if it was on - mtmsrd r2 ; Make sure translation and EE are off and 64-bit is on - isync ; Hang out a bit - - ld r6,validSegs(r12) ; Get the valid SLB entry flags - sldi r9,r9,9 ; Position the key and noex bit - - rldimi r5,r8,12,0 ; Form the VSID/key - - not r3,r6 ; Make valids be 0s - - cntlzd r7,r3 ; Find a free SLB - cmplwi r7,63 ; Did we find a free SLB entry? - - slbie r4 ; Since this ESID may still be in an SLBE, kill it - - oris r4,r4,0x0800 ; Turn on the valid bit in ESID - addi r7,r7,1 ; Make sure we skip slb 0 - blt++ hmsFreeSeg ; Yes, go load it... 
- -; -; No free SLB entries, select one that is in use and invalidate it -; - lwz r2,ppSegSteal(r12) ; Get the next slot to steal - addi r7,r2,pmapSegCacheUse+1 ; Select stealee from non-cached slots only - addi r2,r2,1 ; Set next slot to steal - slbmfee r3,r7 ; Get the entry that is in the selected spot - subi r8,r2,64-(pmapSegCacheUse+1) ; Force steal to wrap - rldicr r3,r3,0,35 ; Clear the valid bit and the rest - srawi r8,r8,31 ; Get -1 if steal index still in range - slbie r3 ; Invalidate the in-use SLB entry - and r2,r2,r8 ; Reset steal index when it should wrap - isync ; - - stw r2,ppSegSteal(r12) ; Set the next slot to steal -; -; We are now ready to stick the SLB entry in the SLB and mark it in use -; - -hmsFreeSeg: subi r2,r7,1 ; Adjust for skipped slb 0 - rldimi r4,r7,0,58 ; Copy in the SLB entry selector - srd r0,r0,r2 ; Set bit mask for allocation - rldicl r5,r5,0,15 ; Clean out the unsupported bits - or r6,r6,r0 ; Turn on the allocation flag - - slbmte r5,r4 ; Make that SLB entry - - std r6,validSegs(r12) ; Mark as valid - mtmsrd r11 ; Restore the MSR - isync - blr ; Back to it... - - .align 5 - -hms32bit: - mfsprg r12,1 ; Get the current activation - lwz r12,ACT_PER_PROC(r12) ; Get the per_proc block - rlwinm r8,r8,0,8,31 ; Clean up the VSID - rlwinm r2,r4,4,28,31 ; Isolate the segment we are setting - lis r0,0x8000 ; Set bit 0 - rlwimi r8,r9,28,1,3 ; Insert the keys and N bit - srw r0,r0,r2 ; Get bit corresponding to SR - addi r7,r12,validSegs ; Point to the valid segment flags directly - - mtsrin r8,r4 ; Set the actual SR - isync ; Need to make sure this is done - -hmsrupt: lwarx r6,0,r7 ; Get and reserve the valid segment flags - or r6,r6,r0 ; Show that SR is valid - stwcx. r6,0,r7 ; Set the valid SR flags - bne-- hmsrupt ; Had an interrupt, need to get flags again... - - blr ; Back to it... - - -; -; This routine invalidates a segment register. 
-; - - .align 5 - .globl EXT(hw_blow_seg) - -LEXT(hw_blow_seg) - - mfsprg r10,2 ; Get feature flags - mtcrf 0x02,r10 ; move pf64Bit and pfNoMSRirb to cr5 and 6 - - rlwinm r9,r4,0,0,3 ; Save low segment address and make sure it is clean - - bf-- pf64Bitb,hbs32bit ; Skip out if 32-bit... - - li r0,1 ; Prepare to set bit 0 (also to clear EE) - mfmsr r6 ; Get current MSR - li r2,MASK(MSR_IR)|MASK(MSR_DR) ; Get the translation bits - mtmsrd r0,1 ; Set only the EE bit to 0 - rlwinm r6,r6,0,MSR_EE_BIT,MSR_EE_BIT ; See if EE bit is on - mfmsr r11 ; Get the MSR right now, after disabling EE - andc r2,r11,r2 ; Turn off translation now - rldimi r2,r0,63,0 ; Get bit 64-bit turned on - or r11,r11,r6 ; Turn on the EE bit if it was on - mtmsrd r2 ; Make sure translation and EE are off and 64-bit is on - isync ; Hang out a bit - - rldimi r9,r3,32,0 ; Insert the top part of the ESID - - slbie r9 ; Invalidate the associated SLB entry - - mtmsrd r11 ; Restore the MSR - isync - blr ; Back to it. - - .align 5 - -hbs32bit: - mfsprg r12,1 ; Get the current activation - lwz r12,ACT_PER_PROC(r12) ; Get the per_proc block - addi r7,r12,validSegs ; Point to the valid segment flags directly - lwarx r4,0,r7 ; Get and reserve the valid segment flags - rlwinm r6,r9,4,28,31 ; Convert segment to number - lis r2,0x8000 ; Set up a mask - srw r2,r2,r6 ; Make a mask - and. r0,r4,r2 ; See if this is even valid - li r5,invalSpace ; Set the invalid address space VSID - beqlr ; Leave if already invalid... - - mtsrin r5,r9 ; Slam the segment register - isync ; Need to make sure this is done - -hbsrupt: andc r4,r4,r2 ; Clear the valid bit for this segment - stwcx. r4,0,r7 ; Set the valid SR flags - beqlr++ ; Stored ok, no interrupt, time to leave... - - lwarx r4,0,r7 ; Get and reserve the valid segment flags again - b hbsrupt ; Try again... - -; -; This routine invadates the entire pmap segment cache -; -; Translation is on, interrupts may or may not be enabled. 
-; - - .align 5 - .globl EXT(invalidateSegs) - -LEXT(invalidateSegs) - - la r10,pmapCCtl(r3) ; Point to the segment cache control - eqv r2,r2,r2 ; Get all foxes - -isInv: lwarx r4,0,r10 ; Get the segment cache control value - rlwimi r4,r2,0,0,15 ; Slam in all invalid bits - rlwinm. r0,r4,0,pmapCCtlLckb,pmapCCtlLckb ; Is it already locked? - bne-- isInv0 ; Yes, try again... - - stwcx. r4,0,r10 ; Try to invalidate it - bne-- isInv ; Someone else just stuffed it... - blr ; Leave... - - -isInv0: li r4,lgKillResv ; Get reservation kill zone - stwcx. r4,0,r4 ; Kill reservation - -isInv1: lwz r4,pmapCCtl(r3) ; Get the segment cache control - rlwinm. r0,r4,0,pmapCCtlLckb,pmapCCtlLckb ; Is it already locked? - bne-- isInv ; Nope... - b isInv1 ; Still locked do it again... - -; -; This routine switches segment registers between kernel and user. -; We have some assumptions and rules: -; We are in the exception vectors -; pf64Bitb is set up -; R3 contains the MSR we going to -; We can not use R4, R13, R20, R21, R25, R26, R29 -; R13 is the savearea -; R29 has the per_proc -; -; We return R3 as 0 if we did not switch between kernel and user -; We also maintain and apply the user state key modifier used by VMM support; -; If we go to the kernel it is set to 0, otherwise it follows the bit -; in spcFlags. 
-; - - .align 5 - .globl EXT(switchSegs) - -LEXT(switchSegs) - - lwz r22,ppInvSeg(r29) ; Get the ppInvSeg (force invalidate) and ppCurSeg (user or kernel segments indicator) - lwz r9,spcFlags(r29) ; Pick up the special user state flags - rlwinm r2,r3,MSR_PR_BIT+1,31,31 ; Isolate the problem mode bit - rlwinm r3,r3,MSR_RI_BIT+1,31,31 ; Isolate the recoverable interrupt bit - lis r8,hi16(EXT(kernel_pmap_phys)) ; Assume kernel - or r2,r2,r3 ; This will 1 if we will be using user segments - li r3,0 ; Get a selection mask - cmplw r2,r22 ; This will be EQ if same state and not ppInvSeg - ori r8,r8,lo16(EXT(kernel_pmap_phys)) ; Assume kernel (bottom of address) - sub r3,r3,r2 ; Form select mask - 0 if kernel, -1 if user - la r19,ppUserPmap(r29) ; Point to the current user pmap - -; The following line is an exercise of a generally unreadable but recompile-friendly programing practice - rlwinm r30,r9,userProtKeybit+1+(63-sgcVSKeyUsr),sgcVSKeyUsr-32,sgcVSKeyUsr-32 ; Isolate the user state protection key - - andc r8,r8,r3 ; Zero kernel pmap ptr if user, untouched otherwise - and r19,r19,r3 ; Zero user pmap ptr if kernel, untouched otherwise - and r30,r30,r3 ; Clear key modifier if kernel, leave otherwise - or r8,r8,r19 ; Get the pointer to the pmap we are using - - beqlr ; We are staying in the same mode, do not touch segs... - - lwz r28,0(r8) ; Get top half of pmap address - lwz r10,4(r8) ; Get bottom half - - stw r2,ppInvSeg(r29) ; Clear request for invalidate and save ppCurSeg - rlwinm r28,r28,0,1,0 ; Copy top to top - stw r30,ppMapFlags(r29) ; Set the key modifier - rlwimi r28,r10,0,0,31 ; Insert bottom - - la r10,pmapCCtl(r28) ; Point to the segment cache control - la r9,pmapSegCache(r28) ; Point to the segment cache - -ssgLock: lwarx r15,0,r10 ; Get and reserve the segment cache control - rlwinm. r0,r15,0,pmapCCtlLckb,pmapCCtlLckb ; Someone have the lock? - ori r16,r15,lo16(pmapCCtlLck) ; Set lock bit - bne-- ssgLock0 ; Yup, this is in use... - - stwcx. 
r16,0,r10 ; Try to set the lock - bne-- ssgLock ; Did we get contention? - - not r11,r15 ; Invert the invalids to valids - li r17,0 ; Set a mask for the SRs we are loading - isync ; Make sure we are all caught up - - bf-- pf64Bitb,ssg32Enter ; If 32-bit, jump into it... - - li r0,0 ; Clear - slbia ; Trash all SLB entries (except for entry 0 that is) - li r17,1 ; Get SLB index to load (skip slb 0) - oris r0,r0,0x8000 ; Get set for a mask - b ssg64Enter ; Start on a cache line... - - .align 5 - -ssgLock0: li r15,lgKillResv ; Killing field - stwcx. r15,0,r15 ; Kill reservation - -ssgLock1: lwz r15,pmapCCtl(r28) ; Get the segment cache controls - rlwinm. r15,r15,0,pmapCCtlLckb,pmapCCtlLckb ; Someone have the lock? - beq++ ssgLock ; Yup, this is in use... - b ssgLock1 ; Nope, try again... -; -; This is the 32-bit address space switch code. -; We take a reservation on the segment cache and walk through. -; For each entry, we load the specified entries and remember which -; we did with a mask. Then, we figure out which segments should be -; invalid and then see which actually are. Then we load those with the -; defined invalid VSID. -; Afterwards, we unlock the segment cache. -; - - .align 5 - -ssg32Enter: cntlzw r12,r11 ; Find the next slot in use - cmplwi r12,pmapSegCacheUse ; See if we are done - slwi r14,r12,4 ; Index to the cache slot - lis r0,0x8000 ; Get set for a mask - add r14,r14,r9 ; Point to the entry - - bge- ssg32Done ; All done... 
- - lwz r5,sgcESID+4(r14) ; Get the ESID part - srw r2,r0,r12 ; Form a mask for the one we are loading - lwz r7,sgcVSID+4(r14) ; And get the VSID bottom - - andc r11,r11,r2 ; Clear the bit - lwz r6,sgcVSID(r14) ; And get the VSID top - - rlwinm r2,r5,4,28,31 ; Change the segment number to a number - - xor r7,r7,r30 ; Modify the key before we actually set it - srw r0,r0,r2 ; Get a mask for the SR we are loading - rlwinm r8,r7,19,1,3 ; Insert the keys and N bit - or r17,r17,r0 ; Remember the segment - rlwimi r8,r7,20,12,31 ; Insert 4:23 the VSID - rlwimi r8,r6,20,8,11 ; Get the last nybble of the SR contents - - mtsrin r8,r5 ; Load the segment - b ssg32Enter ; Go enter the next... - - .align 5 - -ssg32Done: lwz r16,validSegs(r29) ; Get the valid SRs flags - stw r15,pmapCCtl(r28) ; Unlock the segment cache controls - - lis r0,0x8000 ; Get set for a mask - li r2,invalSpace ; Set the invalid address space VSID - - nop ; Align loop - nop ; Align loop - andc r16,r16,r17 ; Get list of SRs that were valid before but not now - nop ; Align loop - -ssg32Inval: cntlzw r18,r16 ; Get the first one to invalidate - cmplwi r18,16 ; Have we finished? - srw r22,r0,r18 ; Get the mask bit - rlwinm r23,r18,28,0,3 ; Get the segment register we need - andc r16,r16,r22 ; Get rid of the guy we just did - bge ssg32Really ; Yes, we are really done now... - - mtsrin r2,r23 ; Invalidate the SR - b ssg32Inval ; Do the next... - - .align 5 - -ssg32Really: - stw r17,validSegs(r29) ; Set the valid SR flags - li r3,1 ; Set kernel/user transition - blr - -; -; This is the 64-bit address space switch code. -; First we blow away all of the SLB entries. -; Walk through, -; loading the SLB. Afterwards, we release the cache lock -; -; Note that because we have to treat SLBE 0 specially, we do not ever use it... -; Its a performance thing... 
-; - - .align 5 - -ssg64Enter: cntlzw r12,r11 ; Find the next slot in use - cmplwi r12,pmapSegCacheUse ; See if we are done - slwi r14,r12,4 ; Index to the cache slot - srw r16,r0,r12 ; Form a mask for the one we are loading - add r14,r14,r9 ; Point to the entry - andc r11,r11,r16 ; Clear the bit - bge-- ssg64Done ; All done... - - ld r5,sgcESID(r14) ; Get the ESID part - ld r6,sgcVSID(r14) ; And get the VSID part - oris r5,r5,0x0800 ; Turn on the valid bit - or r5,r5,r17 ; Insert the SLB slot - xor r6,r6,r30 ; Modify the key before we actually set it - addi r17,r17,1 ; Bump to the next slot - slbmte r6,r5 ; Make that SLB entry - b ssg64Enter ; Go enter the next... - - .align 5 - -ssg64Done: stw r15,pmapCCtl(r28) ; Unlock the segment cache controls - - eqv r16,r16,r16 ; Load up with all foxes - subfic r17,r17,64 ; Get the number of 1 bits we need - - sld r16,r16,r17 ; Get a mask for the used SLB entries - li r3,1 ; Set kernel/user transition - std r16,validSegs(r29) ; Set the valid SR flags - blr - -; -; mapSetUp - this function sets initial state for all mapping functions. -; We turn off all translations (physical), disable interruptions, and -; enter 64-bit mode if applicable. -; -; We also return the original MSR in r11, the feature flags in R12, -; and CR6 set up so we can do easy branches for 64-bit -; hw_clear_maps assumes r10, r9 will not be trashed. -; - - .align 5 - .globl EXT(mapSetUp) - -LEXT(mapSetUp) - - lis r0,hi16(MASK(MSR_VEC)) ; Get the vector mask - mfsprg r12,2 ; Get feature flags - ori r0,r0,lo16(MASK(MSR_FP)) ; Get the FP as well - mtcrf 0x04,r12 ; move pf64Bit and pfNoMSRirb to cr5 and 6 - mfmsr r11 ; Save the MSR - mtcrf 0x02,r12 ; move pf64Bit and pfNoMSRirb to cr5 and 6 - andc r11,r11,r0 ; Clear VEC and FP for good - ori r0,r0,lo16(MASK(MSR_EE)|MASK(MSR_DR)|MASK(MSR_IR)) ; Get rid of EE, IR, and DR - li r2,1 ; Prepare for 64 bit - andc r0,r11,r0 ; Clear the rest - bt pfNoMSRirb,msuNoMSR ; No MSR... 
- bt++ pf64Bitb,msuSF ; skip if 64-bit (only they take the hint) - - mtmsr r0 ; Translation and all off - isync ; Toss prefetch - blr ; Return... - - .align 5 - -msuSF: rldimi r0,r2,63,MSR_SF_BIT ; set SF bit (bit 0) - mtmsrd r0 ; set 64-bit mode, turn off EE, DR, and IR - isync ; synchronize - blr ; Return... - - .align 5 - -msuNoMSR: mr r2,r3 ; Save R3 across call - mr r3,r0 ; Get the new MSR value - li r0,loadMSR ; Get the MSR setter SC - sc ; Set it - mr r3,r2 ; Restore R3 - blr ; Go back all set up... - - -; -; Guest shadow assist -- remove all guest mappings -; -; Remove all mappings for a guest pmap from the shadow hash table. -; -; Parameters: -; r3 : address of pmap, 32-bit kernel virtual address -; -; Non-volatile register usage: -; r24 : host pmap's physical address -; r25 : VMM extension block's physical address -; r26 : physent address -; r27 : guest pmap's space ID number -; r28 : current hash table page index -; r29 : guest pmap's physical address -; r30 : saved msr image -; r31 : current mapping -; - .align 5 - .globl EXT(hw_rem_all_gv) - -LEXT(hw_rem_all_gv) - -#define graStackSize ((31-24+1)*4)+4 - stwu r1,-(FM_ALIGN(graStackSize)+FM_SIZE)(r1) - ; Mint a new stack frame - mflr r0 ; Get caller's return address - mfsprg r11,2 ; Get feature flags - mtcrf 0x02,r11 ; Insert feature flags into cr6 - stw r0,(FM_ALIGN(graStackSize)+FM_SIZE+FM_LR_SAVE)(r1) - ; Save caller's return address - stw r31,FM_ARG0+0x00(r1) ; Save non-volatile r31 - stw r30,FM_ARG0+0x04(r1) ; Save non-volatile r30 - stw r29,FM_ARG0+0x08(r1) ; Save non-volatile r29 - stw r28,FM_ARG0+0x0C(r1) ; Save non-volatile r28 - stw r27,FM_ARG0+0x10(r1) ; Save non-volatile r27 - stw r26,FM_ARG0+0x14(r1) ; Save non-volatile r26 - stw r25,FM_ARG0+0x18(r1) ; Save non-volatile r25 - stw r24,FM_ARG0+0x1C(r1) ; Save non-volatile r24 - - lwz r11,pmapVmmExt(r3) ; r11 <- VMM pmap extension block vaddr - - bt++ pf64Bitb,gra64Salt ; Test for 64-bit machine - lwz r25,pmapVmmExtPhys+4(r3) ; r25 <- VMM pmap 
extension block paddr - lwz r9,pmapvr+4(r3) ; Get 32-bit virt<->real conversion salt - lwz r24,vmxHostPmapPhys+4(r11) ; r24 <- host pmap's paddr - b graStart ; Get to it -gra64Salt: ld r25,pmapVmmExtPhys(r3) ; r25 <- VMM pmap extension block paddr - ld r9,pmapvr(r3) ; Get 64-bit virt<->real conversion salt - ld r24,vmxHostPmapPhys(r11) ; r24 <- host pmap's paddr -graStart: bl EXT(mapSetUp) ; Disable 'rupts, translation, enter 64-bit mode - xor r29,r3,r9 ; Convert pmap_t virt->real - mr r30,r11 ; Save caller's msr image - - la r3,pmapSXlk(r24) ; r3 <- host pmap's search lock - bl sxlkExclusive ; Get lock exclusive - - lwz r3,vxsGra(r25) ; Get remove all count - addi r3,r3,1 ; Increment remove all count - stw r3,vxsGra(r25) ; Update remove all count - - li r28,0 ; r28 <- first hash page table index to search - lwz r27,pmapSpace(r29) ; r27 <- guest pmap's space ID number -graPgLoop: - la r31,VMX_HPIDX_OFFSET(r25) ; Get base of hash page physical index - rlwinm r11,r28,GV_PGIDX_SZ_LG2,GV_HPAGE_MASK - ; Convert page index into page physical index offset - add r31,r31,r11 ; Calculate page physical index entry address - bt++ pf64Bitb,gra64Page ; Separate handling for 64-bit - lwz r31,4(r31) ; r31 <- first slot in hash table page to examine - b graLoop ; Examine all slots in this page -gra64Page: ld r31,0(r31) ; r31 <- first slot in hash table page to examine - b graLoop ; Examine all slots in this page - - .align 5 -graLoop: lwz r3,mpFlags(r31) ; Get mapping's flags - lhz r4,mpSpace(r31) ; Get mapping's space ID number - rlwinm r6,r3,0,mpgFree ; Isolate guest free mapping flag - xor r4,r4,r27 ; Compare space ID number - or. r0,r6,r4 ; cr0_eq <- !free && space id match - bne graMiss ; Not one of ours, skip it - - lwz r11,vxsGraHits(r25) ; Get remove hit count - addi r11,r11,1 ; Increment remove hit count - stw r11,vxsGraHits(r25) ; Update remove hit count - - rlwinm. r0,r3,0,mpgDormant ; Is this entry dormant? 
- bne graRemPhys ; Yes, nothing to disconnect - - lwz r11,vxsGraActive(r25) ; Get remove active count - addi r11,r11,1 ; Increment remove hit count - stw r11,vxsGraActive(r25) ; Update remove hit count - - bt++ pf64Bitb,graDscon64 ; Handle 64-bit disconnect separately - bl mapInvPte32 ; Disconnect PTE, invalidate, gather ref and change - ; r31 <- mapping's physical address - ; r3 -> PTE slot physical address - ; r4 -> High-order 32 bits of PTE - ; r5 -> Low-order 32 bits of PTE - ; r6 -> PCA - ; r7 -> PCA physical address - rlwinm r2,r3,29,29,31 ; Get PTE's slot number in the PTEG (8-byte PTEs) - b graFreePTE ; Join 64-bit path to release the PTE -graDscon64: bl mapInvPte64 ; Disconnect PTE, invalidate, gather ref and change - rlwinm r2,r3,28,29,31 ; Get PTE's slot number in the PTEG (16-byte PTEs) -graFreePTE: mr. r3,r3 ; Was there a valid PTE? - beq- graRemPhys ; No valid PTE, we're almost done - lis r0,0x8000 ; Prepare free bit for this slot - srw r0,r0,r2 ; Position free bit - or r6,r6,r0 ; Set it in our PCA image - lwz r8,mpPte(r31) ; Get PTE pointer - rlwinm r8,r8,0,~mpHValid ; Make the pointer invalid - stw r8,mpPte(r31) ; Save invalidated PTE pointer - eieio ; Synchronize all previous updates (mapInvPtexx doesn't) - stw r6,0(r7) ; Update PCA and unlock the PTEG - -graRemPhys: - lwz r3,mpPAddr(r31) ; r3 <- physical 4K-page number - bl mapFindLockPN ; Find 'n' lock this page's physent - mr. r26,r3 ; Got lock on our physent? - beq-- graBadPLock ; No, time to bail out - - crset cr1_eq ; cr1_eq <- previous link is the anchor - bt++ pf64Bitb,graRemove64 ; Use 64-bit version on 64-bit machine - la r11,ppLink+4(r26) ; Point to chain anchor - lwz r9,ppLink+4(r26) ; Get chain anchor - rlwinm. r9,r9,0,~ppFlags ; Remove flags, yielding 32-bit physical chain pointer - -graRemLoop: beq- graRemoveMiss ; End of chain, this is not good - cmplw r9,r31 ; Is this the mapping to remove? 
- lwz r8,mpAlias+4(r9) ; Get forward chain pointer - bne graRemNext ; No, chain onward - bt cr1_eq,graRemRetry ; Mapping to remove is chained from anchor - stw r8,0(r11) ; Unchain gpv->phys mapping - b graRemoved ; Exit loop -graRemRetry: - lwarx r0,0,r11 ; Get previous link - rlwimi r0,r8,0,~ppFlags ; Insert new forward pointer whilst preserving flags - stwcx. r0,0,r11 ; Update previous link - bne- graRemRetry ; Lost reservation, retry - b graRemoved ; Good work, let's get outta here - -graRemNext: la r11,mpAlias+4(r9) ; Point to (soon to be) previous link - crclr cr1_eq ; ~cr1_eq <- Previous link is not the anchor - mr. r9,r8 ; Does next entry exist? - b graRemLoop ; Carry on - -graRemove64: - li r7,ppLFAmask ; Get mask to clean up mapping pointer - rotrdi r7,r7,ppLFArrot ; Rotate clean up mask to get 0xF0000000000000000F - la r11,ppLink(r26) ; Point to chain anchor - ld r9,ppLink(r26) ; Get chain anchor - andc. r9,r9,r7 ; Remove flags, yielding 64-bit physical chain pointer -graRem64Lp: beq-- graRemoveMiss ; End of chain, this is not good - cmpld r9,r31 ; Is this the mapping to remove? - ld r8,mpAlias(r9) ; Get forward chain pinter - bne graRem64Nxt ; Not mapping to remove, chain on, dude - bt cr1_eq,graRem64Rt ; Mapping to remove is chained from anchor - std r8,0(r11) ; Unchain gpv->phys mapping - b graRemoved ; Exit loop -graRem64Rt: ldarx r0,0,r11 ; Get previous link - and r0,r0,r7 ; Get flags - or r0,r0,r8 ; Insert new forward pointer - stdcx. r0,0,r11 ; Slam it back in - bne-- graRem64Rt ; Lost reservation, retry - b graRemoved ; Good work, let's go home - -graRem64Nxt: - la r11,mpAlias(r9) ; Point to (soon to be) previous link - crclr cr1_eq ; ~cr1_eq <- Previous link is not the anchor - mr. r9,r8 ; Does next entry exist? 
- b graRem64Lp ; Carry on - -graRemoved: - mr r3,r26 ; r3 <- physent's address - bl mapPhysUnlock ; Unlock the physent (and its chain of mappings) - - lwz r3,mpFlags(r31) ; Get mapping's flags - rlwinm r3,r3,0,~mpgFlags ; Clear all guest flags - ori r3,r3,mpgFree ; Mark mapping free - stw r3,mpFlags(r31) ; Update flags - -graMiss: addi r31,r31,GV_SLOT_SZ ; r31 <- next mapping - rlwinm. r0,r31,0,GV_PAGE_MASK ; End of hash table page? - bne graLoop ; No, examine next slot - addi r28,r28,1 ; Increment hash table page index - cmplwi r28,GV_HPAGES ; End of hash table? - bne graPgLoop ; Examine next hash table page - - la r3,pmapSXlk(r24) ; r3 <- host pmap's search lock - bl sxlkUnlock ; Release host pmap's search lock - - bt++ pf64Bitb,graRtn64 ; Handle 64-bit separately - mtmsr r30 ; Restore 'rupts, translation - isync ; Throw a small wrench into the pipeline - b graPopFrame ; Nothing to do now but pop a frame and return -graRtn64: mtmsrd r30 ; Restore 'rupts, translation, 32-bit mode -graPopFrame: - lwz r0,(FM_ALIGN(graStackSize)+FM_SIZE+FM_LR_SAVE)(r1) - ; Get caller's return address - lwz r31,FM_ARG0+0x00(r1) ; Restore non-volatile r31 - lwz r30,FM_ARG0+0x04(r1) ; Restore non-volatile r30 - lwz r29,FM_ARG0+0x08(r1) ; Restore non-volatile r29 - lwz r28,FM_ARG0+0x0C(r1) ; Restore non-volatile r28 - mtlr r0 ; Prepare return address - lwz r27,FM_ARG0+0x10(r1) ; Restore non-volatile r27 - lwz r26,FM_ARG0+0x14(r1) ; Restore non-volatile r26 - lwz r25,FM_ARG0+0x18(r1) ; Restore non-volatile r25 - lwz r24,FM_ARG0+0x1C(r1) ; Restore non-volatile r24 - lwz r1,0(r1) ; Pop stack frame - blr ; Return to caller - -graBadPLock: -graRemoveMiss: - lis r0,hi16(Choke) ; Dmitri, you know how we've always talked about the - ori r0,r0,lo16(Choke) ; possibility of something going wrong with the bomb? - li r3,failMapping ; The BOMB, Dmitri. - sc ; The hydrogen bomb. 
- - -; -; Guest shadow assist -- remove local guest mappings -; -; Remove local mappings for a guest pmap from the shadow hash table. -; -; Parameters: -; r3 : address of guest pmap, 32-bit kernel virtual address -; -; Non-volatile register usage: -; r20 : current active map word's physical address -; r21 : current hash table page address -; r22 : updated active map word in process -; r23 : active map word in process -; r24 : host pmap's physical address -; r25 : VMM extension block's physical address -; r26 : physent address -; r27 : guest pmap's space ID number -; r28 : current active map index -; r29 : guest pmap's physical address -; r30 : saved msr image -; r31 : current mapping -; - .align 5 - .globl EXT(hw_rem_local_gv) - -LEXT(hw_rem_local_gv) - -#define grlStackSize ((31-20+1)*4)+4 - stwu r1,-(FM_ALIGN(grlStackSize)+FM_SIZE)(r1) - ; Mint a new stack frame - mflr r0 ; Get caller's return address - mfsprg r11,2 ; Get feature flags - mtcrf 0x02,r11 ; Insert feature flags into cr6 - stw r0,(FM_ALIGN(grlStackSize)+FM_SIZE+FM_LR_SAVE)(r1) - ; Save caller's return address - stw r31,FM_ARG0+0x00(r1) ; Save non-volatile r31 - stw r30,FM_ARG0+0x04(r1) ; Save non-volatile r30 - stw r29,FM_ARG0+0x08(r1) ; Save non-volatile r29 - stw r28,FM_ARG0+0x0C(r1) ; Save non-volatile r28 - stw r27,FM_ARG0+0x10(r1) ; Save non-volatile r27 - stw r26,FM_ARG0+0x14(r1) ; Save non-volatile r26 - stw r25,FM_ARG0+0x18(r1) ; Save non-volatile r25 - stw r24,FM_ARG0+0x1C(r1) ; Save non-volatile r24 - stw r23,FM_ARG0+0x20(r1) ; Save non-volatile r23 - stw r22,FM_ARG0+0x24(r1) ; Save non-volatile r22 - stw r21,FM_ARG0+0x28(r1) ; Save non-volatile r21 - stw r20,FM_ARG0+0x2C(r1) ; Save non-volatile r20 - - lwz r11,pmapVmmExt(r3) ; r11 <- VMM pmap extension block vaddr - - bt++ pf64Bitb,grl64Salt ; Test for 64-bit machine - lwz r25,pmapVmmExtPhys+4(r3) ; r25 <- VMM pmap extension block paddr - lwz r9,pmapvr+4(r3) ; Get 32-bit virt<->real conversion salt - lwz r24,vmxHostPmapPhys+4(r11) ; r24 <- 
host pmap's paddr - b grlStart ; Get to it -grl64Salt: ld r25,pmapVmmExtPhys(r3) ; r25 <- VMM pmap extension block paddr - ld r9,pmapvr(r3) ; Get 64-bit virt<->real conversion salt - ld r24,vmxHostPmapPhys(r11) ; r24 <- host pmap's paddr - -grlStart: bl EXT(mapSetUp) ; Disable 'rupts, translation, enter 64-bit mode - xor r29,r3,r9 ; Convert pmap_t virt->real - mr r30,r11 ; Save caller's msr image - - la r3,pmapSXlk(r24) ; r3 <- host pmap's search lock - bl sxlkExclusive ; Get lock exclusive - - li r28,0 ; r28 <- index of first active map word to search - lwz r27,pmapSpace(r29) ; r27 <- guest pmap's space ID number - b grlMap1st ; Examine first map word - - .align 5 -grlNextMap: stw r22,0(r21) ; Save updated map word - addi r28,r28,1 ; Increment map word index - cmplwi r28,GV_MAP_WORDS ; See if we're done - beq grlDone ; Yup, let's get outta here - -grlMap1st: la r20,VMX_ACTMAP_OFFSET(r25) ; Get base of active map word array - rlwinm r11,r28,GV_MAPWD_SZ_LG2,GV_MAP_MASK - ; Convert map index into map index offset - add r20,r20,r11 ; Calculate map array element address - lwz r22,0(r20) ; Get active map word at index - mr. r23,r22 ; Any active mappings indicated? - beq grlNextMap ; Nope, check next word - - la r21,VMX_HPIDX_OFFSET(r25) ; Get base of hash page physical index - rlwinm r11,r28,GV_MAP_SHIFT,GV_HPAGE_MASK - ; Extract page index from map word index and convert - ; into page physical index offset - add r21,r21,r11 ; Calculate page physical index entry address - bt++ pf64Bitb,grl64Page ; Separate handling for 64-bit - lwz r21,4(r21) ; Get selected hash table page's address - b grlLoop ; Examine all slots in this page -grl64Page: ld r21,0(r21) ; Get selected hash table page's address - b grlLoop ; Examine all slots in this page - - .align 5 -grlLoop: cntlzw r11,r23 ; Get next active bit lit in map word - cmplwi r11,32 ; Any active mappings left in this word? 
- lis r12,0x8000 ; Prepare mask to reset bit - srw r12,r12,r11 ; Position mask bit - andc r23,r23,r12 ; Reset lit bit - beq grlNextMap ; No bits lit, examine next map word - - slwi r31,r11,GV_SLOT_SZ_LG2 ; Get slot offset in slot band from lit bit number - rlwinm r31,r28,GV_BAND_SHIFT,GV_BAND_MASK - ; Extract slot band number from index and insert - add r31,r31,r21 ; Add hash page address yielding mapping slot address - - lwz r3,mpFlags(r31) ; Get mapping's flags - lhz r4,mpSpace(r31) ; Get mapping's space ID number - rlwinm r5,r3,0,mpgGlobal ; Extract global bit - xor r4,r4,r27 ; Compare space ID number - or. r4,r4,r5 ; (space id miss || global) - bne grlLoop ; Not one of ours, skip it - andc r22,r22,r12 ; Reset active bit corresponding to this mapping - ori r3,r3,mpgDormant ; Mark entry dormant - stw r3,mpFlags(r31) ; Update mapping's flags - - bt++ pf64Bitb,grlDscon64 ; Handle 64-bit disconnect separately - bl mapInvPte32 ; Disconnect PTE, invalidate, gather ref and change - ; r31 <- mapping's physical address - ; r3 -> PTE slot physical address - ; r4 -> High-order 32 bits of PTE - ; r5 -> Low-order 32 bits of PTE - ; r6 -> PCA - ; r7 -> PCA physical address - rlwinm r2,r3,29,29,31 ; Get PTE's slot number in the PTEG (8-byte PTEs) - b grlFreePTE ; Join 64-bit path to release the PTE -grlDscon64: bl mapInvPte64 ; Disconnect PTE, invalidate, gather ref and change - rlwinm r2,r3,28,29,31 ; Get PTE's slot number in the PTEG (16-byte PTEs) -grlFreePTE: mr. r3,r3 ; Was there a valid PTE? 
- beq- grlLoop ; No valid PTE, we're done with this mapping - lis r0,0x8000 ; Prepare free bit for this slot - srw r0,r0,r2 ; Position free bit - or r6,r6,r0 ; Set it in our PCA image - lwz r8,mpPte(r31) ; Get PTE pointer - rlwinm r8,r8,0,~mpHValid ; Make the pointer invalid - stw r8,mpPte(r31) ; Save invalidated PTE pointer - eieio ; Synchronize all previous updates (mapInvPtexx doesn't) - stw r6,0(r7) ; Update PCA and unlock the PTEG - b grlLoop ; On to next active mapping in this map word - -grlDone: la r3,pmapSXlk(r24) ; r3 <- host pmap's search lock - bl sxlkUnlock ; Release host pmap's search lock - - bt++ pf64Bitb,grlRtn64 ; Handle 64-bit separately - mtmsr r30 ; Restore 'rupts, translation - isync ; Throw a small wrench into the pipeline - b grlPopFrame ; Nothing to do now but pop a frame and return -grlRtn64: mtmsrd r30 ; Restore 'rupts, translation, 32-bit mode -grlPopFrame: - lwz r0,(FM_ALIGN(grlStackSize)+FM_SIZE+FM_LR_SAVE)(r1) - ; Get caller's return address - lwz r31,FM_ARG0+0x00(r1) ; Restore non-volatile r31 - lwz r30,FM_ARG0+0x04(r1) ; Restore non-volatile r30 - lwz r29,FM_ARG0+0x08(r1) ; Restore non-volatile r29 - lwz r28,FM_ARG0+0x0C(r1) ; Restore non-volatile r28 - mtlr r0 ; Prepare return address - lwz r27,FM_ARG0+0x10(r1) ; Restore non-volatile r27 - lwz r26,FM_ARG0+0x14(r1) ; Restore non-volatile r26 - lwz r25,FM_ARG0+0x18(r1) ; Restore non-volatile r25 - lwz r24,FM_ARG0+0x1C(r1) ; Restore non-volatile r24 - lwz r23,FM_ARG0+0x20(r1) ; Restore non-volatile r23 - lwz r22,FM_ARG0+0x24(r1) ; Restore non-volatile r22 - lwz r21,FM_ARG0+0x28(r1) ; Restore non-volatile r21 - lwz r20,FM_ARG0+0x2C(r1) ; Restore non-volatile r20 - lwz r1,0(r1) ; Pop stack frame - blr ; Return to caller - - -; -; Guest shadow assist -- resume a guest mapping -; -; Locates the specified dormant mapping, and if it exists validates it and makes it -; active. 
-; -; Parameters: -; r3 : address of host pmap, 32-bit kernel virtual address -; r4 : address of guest pmap, 32-bit kernel virtual address -; r5 : host virtual address, high-order 32 bits -; r6 : host virtual address, low-order 32 bits -; r7 : guest virtual address, high-order 32 bits -; r8 : guest virtual address, low-order 32 bits -; r9 : guest mapping protection code -; -; Non-volatile register usage: -; r23 : VMM extension block's physical address -; r24 : physent physical address -; r25 : caller's msr image from mapSetUp -; r26 : guest mapping protection code -; r27 : host pmap physical address -; r28 : guest pmap physical address -; r29 : host virtual address -; r30 : guest virtual address -; r31 : gva->phys mapping's physical address -; - .align 5 - .globl EXT(hw_res_map_gv) - -LEXT(hw_res_map_gv) - -#define grsStackSize ((31-23+1)*4)+4 - - stwu r1,-(FM_ALIGN(grsStackSize)+FM_SIZE)(r1) - ; Mint a new stack frame - mflr r0 ; Get caller's return address - mfsprg r11,2 ; Get feature flags - mtcrf 0x02,r11 ; Insert feature flags into cr6 - stw r0,(FM_ALIGN(grsStackSize)+FM_SIZE+FM_LR_SAVE)(r1) - ; Save caller's return address - stw r31,FM_ARG0+0x00(r1) ; Save non-volatile r31 - stw r30,FM_ARG0+0x04(r1) ; Save non-volatile r30 - stw r29,FM_ARG0+0x08(r1) ; Save non-volatile r29 - stw r28,FM_ARG0+0x0C(r1) ; Save non-volatile r28 - stw r27,FM_ARG0+0x10(r1) ; Save non-volatile r27 - stw r26,FM_ARG0+0x14(r1) ; Save non-volatile r26 - stw r25,FM_ARG0+0x18(r1) ; Save non-volatile r25 - stw r24,FM_ARG0+0x1C(r1) ; Save non-volatile r24 - stw r23,FM_ARG0+0x20(r1) ; Save non-volatile r23 - - rlwinm r29,r6,0,0xFFFFF000 ; Clean up low-order 32 bits of host vaddr - rlwinm r30,r8,0,0xFFFFF000 ; Clean up low-order 32 bits of guest vaddr - mr r26,r9 ; Copy guest mapping protection code - - lwz r11,pmapVmmExt(r3) ; r11 <- VMM pmap extension block vaddr - lwz r9,pmapSpace(r4) ; r9 <- guest space ID number - bt++ pf64Bitb,grs64Salt ; Handle 64-bit machine separately - lwz 
r23,pmapVmmExtPhys+4(r3) ; r23 <- VMM pmap extension block paddr - lwz r27,pmapvr+4(r3) ; Get 32-bit virt<->real host pmap conversion salt - lwz r28,pmapvr+4(r4) ; Get 32-bit virt<->real guest pmap conversion salt - la r31,VMX_HPIDX_OFFSET(r11) ; r31 <- base of hash page physical index - srwi r11,r30,12 ; Form shadow hash: - xor r11,r11,r9 ; spaceID ^ (vaddr >> 12) - rlwinm r10,r11,GV_HPAGE_SHIFT,GV_HPAGE_MASK - ; Form index offset from hash page number - add r31,r31,r10 ; r31 <- hash page index entry - lwz r31,4(r31) ; r31 <- hash page paddr - rlwimi r31,r11,GV_HGRP_SHIFT,GV_HGRP_MASK - ; r31 <- hash group paddr - b grsStart ; Get to it - -grs64Salt: rldimi r29,r5,32,0 ; Insert high-order 32 bits of 64-bit host vaddr - rldimi r30,r7,32,0 ; Insert high-order 32 bits of 64-bit guest vaddr - ld r23,pmapVmmExtPhys(r3) ; r23 <- VMM pmap extension block paddr - ld r27,pmapvr(r3) ; Get 64-bit virt<->real host pmap conversion salt - ld r28,pmapvr(r4) ; Get 64-bit virt<->real guest pmap conversion salt - la r31,VMX_HPIDX_OFFSET(r11) ; r31 <- base of hash page physical index - srwi r11,r30,12 ; Form shadow hash: - xor r11,r11,r9 ; spaceID ^ (vaddr >> 12) - rlwinm r10,r11,GV_HPAGE_SHIFT,GV_HPAGE_MASK - ; Form index offset from hash page number - add r31,r31,r10 ; r31 <- hash page index entry - ld r31,0(r31) ; r31 <- hash page paddr - insrdi r31,r11,GV_GRPS_PPG_LG2,64-(GV_HGRP_SHIFT+GV_GRPS_PPG_LG2) - ; r31 <- hash group paddr - -grsStart: xor r27,r3,r27 ; Convert host pmap_t virt->real - xor r28,r4,r28 ; Convert guest pmap_t virt->real - bl EXT(mapSetUp) ; Disable 'rupts, translation, maybe enter 64-bit mode - mr r25,r11 ; Save caller's msr image - - la r3,pmapSXlk(r27) ; r3 <- host pmap's search lock address - bl sxlkExclusive ; Get lock exclusive - - li r0,(GV_SLOTS - 1) ; Prepare to iterate over mapping slots - mtctr r0 ; in this group - bt++ pf64Bitb,grs64Search ; Test for 64-bit machine - - lwz r3,mpFlags(r31) ; r3 <- 1st mapping slot's flags - lhz r4,mpSpace(r31) ; r4 
<- 1st mapping slot's space ID - lwz r5,mpVAddr+4(r31) ; r5 <- 1st mapping slot's virtual address - b grs32SrchLp ; Let the search begin! - - .align 5 -grs32SrchLp: - mr r6,r3 ; r6 <- current mapping slot's flags - lwz r3,mpFlags+GV_SLOT_SZ(r31) ; r3 <- next mapping slot's flags - mr r7,r4 ; r7 <- current mapping slot's space ID - lhz r4,mpSpace+GV_SLOT_SZ(r31) ; r4 <- next mapping slot's space ID - clrrwi r8,r5,12 ; r8 <- current mapping slot's virtual addr w/o flags - lwz r5,mpVAddr+4+GV_SLOT_SZ(r31); r5 <- next mapping slot's virtual addr - rlwinm r11,r6,0,mpgFree ; Isolate guest free flag - xor r7,r7,r9 ; Compare space ID - or r0,r11,r7 ; r0 <- !(!free && space match) - xor r8,r8,r30 ; Compare virtual address - or. r0,r0,r8 ; cr0_eq <- !free && space match && virtual addr match - beq grsSrchHit ; Join common path on hit (r31 points to guest mapping) - - addi r31,r31,GV_SLOT_SZ ; r31 <- next mapping slot - bdnz grs32SrchLp ; Iterate - - mr r6,r3 ; r6 <- current mapping slot's flags - clrrwi r5,r5,12 ; Remove flags from virtual address - rlwinm r11,r6,0,mpgFree ; Isolate guest free flag - xor r4,r4,r9 ; Compare space ID - or r0,r11,r4 ; r0 <- !(!free && space match) - xor r5,r5,r30 ; Compare virtual address - or. r0,r0,r5 ; cr0_eq <- !free && space match && virtual addr match - beq grsSrchHit ; Join common path on hit (r31 points to guest mapping) - b grsSrchMiss ; No joy in our hash group - -grs64Search: - lwz r3,mpFlags(r31) ; r3 <- 1st mapping slot's flags - lhz r4,mpSpace(r31) ; r4 <- 1st mapping slot's space ID - ld r5,mpVAddr(r31) ; r5 <- 1st mapping slot's virtual address - b grs64SrchLp ; Let the search begin! 
- - .align 5 -grs64SrchLp: - mr r6,r3 ; r6 <- current mapping slot's flags - lwz r3,mpFlags+GV_SLOT_SZ(r31) ; r3 <- next mapping slot's flags - mr r7,r4 ; r7 <- current mapping slot's space ID - lhz r4,mpSpace+GV_SLOT_SZ(r31) ; r4 <- next mapping slot's space ID - clrrdi r8,r5,12 ; r8 <- current mapping slot's virtual addr w/o flags - ld r5,mpVAddr+GV_SLOT_SZ(r31) ; r5 <- next mapping slot's virtual addr - rlwinm r11,r6,0,mpgFree ; Isolate guest free flag - xor r7,r7,r9 ; Compare space ID - or r0,r11,r7 ; r0 <- !(!free && space match) - xor r8,r8,r30 ; Compare virtual address - or. r0,r0,r8 ; cr0_eq <- !free && space match && virtual addr match - beq grsSrchHit ; Join common path on hit (r31 points to guest mapping) - - addi r31,r31,GV_SLOT_SZ ; r31 <- next mapping slot - bdnz grs64SrchLp ; Iterate - - mr r6,r3 ; r6 <- current mapping slot's flags - clrrdi r5,r5,12 ; Remove flags from virtual address - rlwinm r11,r6,0,mpgFree ; Isolate guest free flag - xor r4,r4,r9 ; Compare space ID - or r0,r11,r4 ; r0 <- !(!free && space match) - xor r5,r5,r30 ; Compare virtual address - or. r0,r0,r5 ; cr0_eq <- !free && space match && virtual addr match - bne grsSrchMiss ; No joy in our hash group - -grsSrchHit: - rlwinm. r0,r6,0,mpgDormant ; Is the mapping dormant? - bne grsFindHost ; Yes, nothing to disconnect - - bt++ pf64Bitb,grsDscon64 ; Handle 64-bit disconnect separately - bl mapInvPte32 ; Disconnect PTE, invalidate, gather ref and change - ; r31 <- mapping's physical address - ; r3 -> PTE slot physical address - ; r4 -> High-order 32 bits of PTE - ; r5 -> Low-order 32 bits of PTE - ; r6 -> PCA - ; r7 -> PCA physical address - rlwinm r2,r3,29,29,31 ; Get PTE's slot number in the PTEG (8-byte PTEs) - b grsFreePTE ; Join 64-bit path to release the PTE -grsDscon64: bl mapInvPte64 ; Disconnect PTE, invalidate, gather ref and change - rlwinm r2,r3,28,29,31 ; Get PTE's slot number in the PTEG (16-byte PTEs) -grsFreePTE: mr. r3,r3 ; Was there a valid PTE? 
- beq- grsFindHost ; No valid PTE, we're almost done - lis r0,0x8000 ; Prepare free bit for this slot - srw r0,r0,r2 ; Position free bit - or r6,r6,r0 ; Set it in our PCA image - lwz r8,mpPte(r31) ; Get PTE pointer - rlwinm r8,r8,0,~mpHValid ; Make the pointer invalid - stw r8,mpPte(r31) ; Save invalidated PTE pointer - eieio ; Synchronize all previous updates (mapInvPtexx didn't) - stw r6,0(r7) ; Update PCA and unlock the PTEG - -grsFindHost: - -// We now have a dormant guest mapping that matches our space id and virtual address. Our next -// step is to locate the host mapping that completes the guest mapping's connection to a physical -// frame. The guest and host mappings must connect to the same physical frame, so they must both -// be chained on the same physent. We search the physent chain for a host mapping matching our -// host's space id and the host virtual address. If we succeed, we know that the entire chain -// of mappings (guest virtual->host virtual->physical) is valid, so the dormant mapping can be -// resumed. If we fail to find the specified host virtual->physical mapping, it is because the -// host virtual or physical address has changed since the guest mapping was suspended, so it -// is no longer valid and cannot be resumed -- we therefore delete the guest mappping and tell -// our caller that it will have to take its long path, translating the host virtual address -// through the host's skiplist and installing a new guest mapping. - - lwz r3,mpPAddr(r31) ; r3 <- physical 4K-page number - bl mapFindLockPN ; Find 'n' lock this page's physent - mr. r24,r3 ; Got lock on our physent? - beq-- grsBadPLock ; No, time to bail out - - bt++ pf64Bitb,grsPFnd64 ; 64-bit version of physent chain search - - lwz r9,ppLink+4(r24) ; Get first mapping on physent - lwz r6,pmapSpace(r27) ; Get host pmap's space id number - rlwinm r9,r9,0,~ppFlags ; Be-gone, unsightly flags -grsPELoop: mr. r12,r9 ; Got a mapping to look at? 
- beq- grsPEMiss ; Nope, we've missed hva->phys mapping - lwz r7,mpFlags(r12) ; Get mapping's flags - lhz r4,mpSpace(r12) ; Get mapping's space id number - lwz r5,mpVAddr+4(r12) ; Get mapping's virtual address - lwz r9,mpAlias+4(r12) ; Next mapping in physent alias chain - - rlwinm r0,r7,0,mpType ; Isolate mapping's type - rlwinm r5,r5,0,~mpHWFlags ; Bye-bye unsightly flags - xori r0,r0,mpNormal ; Normal mapping? - xor r4,r4,r6 ; Compare w/ host space id number - xor r5,r5,r29 ; Compare w/ host virtual address - or r0,r0,r4 ; r0 <- (wrong type || !space id) - or. r0,r0,r5 ; cr0_eq <- (right type && space id hit && hva hit) - beq grsPEHit ; Hit - b grsPELoop ; Iterate - -grsPFnd64: li r0,ppLFAmask ; Get mask to clean up mapping pointer - rotrdi r0,r0,ppLFArrot ; Rotate clean up mask to get 0xF0000000000000000F - ld r9,ppLink(r24) ; Get first mapping on physent - lwz r6,pmapSpace(r27) ; Get pmap's space id number - andc r9,r9,r0 ; Cleanup mapping pointer -grsPELp64: mr. r12,r9 ; Got a mapping to look at? - beq-- grsPEMiss ; Nope, we've missed hva->phys mapping - lwz r7,mpFlags(r12) ; Get mapping's flags - lhz r4,mpSpace(r12) ; Get mapping's space id number - ld r5,mpVAddr(r12) ; Get mapping's virtual address - ld r9,mpAlias(r12) ; Next mapping physent alias chain - rlwinm r0,r7,0,mpType ; Isolate mapping's type - rldicr r5,r5,0,mpHWFlagsb-1 ; Bye-bye unsightly flags - xori r0,r0,mpNormal ; Normal mapping? - xor r4,r4,r6 ; Compare w/ host space id number - xor r5,r5,r29 ; Compare w/ host virtual address - or r0,r0,r4 ; r0 <- (wrong type || !space id) - or. 
r0,r0,r5 ; cr0_eq <- (right type && space id hit && hva hit) - beq grsPEHit ; Hit - b grsPELp64 ; Iterate - -grsPEHit: lwz r0,mpVAddr+4(r31) ; Get va byte containing protection bits - rlwimi r0,r26,0,mpPP ; Insert new protection bits - stw r0,mpVAddr+4(r31) ; Write 'em back - - eieio ; Ensure previous mapping updates are visible - lwz r0,mpFlags(r31) ; Get flags - rlwinm r0,r0,0,~mpgDormant ; Turn off dormant flag - stw r0,mpFlags(r31) ; Set updated flags, entry is now valid - - li r31,mapRtOK ; Indicate success - b grsRelPhy ; Exit through physent lock release - -grsPEMiss: crset cr1_eq ; cr1_eq <- previous link is the anchor - bt++ pf64Bitb,grsRemove64 ; Use 64-bit version on 64-bit machine - la r11,ppLink+4(r24) ; Point to chain anchor - lwz r9,ppLink+4(r24) ; Get chain anchor - rlwinm. r9,r9,0,~ppFlags ; Remove flags, yielding 32-bit physical chain pointer -grsRemLoop: beq- grsPEMissMiss ; End of chain, this is not good - cmplw r9,r31 ; Is this the mapping to remove? - lwz r8,mpAlias+4(r9) ; Get forward chain pointer - bne grsRemNext ; No, chain onward - bt cr1_eq,grsRemRetry ; Mapping to remove is chained from anchor - stw r8,0(r11) ; Unchain gpv->phys mapping - b grsDelete ; Finish deleting mapping -grsRemRetry: - lwarx r0,0,r11 ; Get previous link - rlwimi r0,r8,0,~ppFlags ; Insert new forward pointer whilst preserving flags - stwcx. r0,0,r11 ; Update previous link - bne- grsRemRetry ; Lost reservation, retry - b grsDelete ; Finish deleting mapping - - .align 5 -grsRemNext: la r11,mpAlias+4(r9) ; Point to (soon to be) previous link - crclr cr1_eq ; ~cr1_eq <- Previous link is not the anchor - mr. r9,r8 ; Does next entry exist? - b grsRemLoop ; Carry on - -grsRemove64: - li r7,ppLFAmask ; Get mask to clean up mapping pointer - rotrdi r7,r7,ppLFArrot ; Rotate clean up mask to get 0xF0000000000000000F - la r11,ppLink(r24) ; Point to chain anchor - ld r9,ppLink(r24) ; Get chain anchor - andc. 
r9,r9,r7 ; Remove flags, yielding 64-bit physical chain pointer -grsRem64Lp: beq-- grsPEMissMiss ; End of chain, this is not good - cmpld r9,r31 ; Is this the mapping to remove? - ld r8,mpAlias(r9) ; Get forward chain pinter - bne grsRem64Nxt ; Not mapping to remove, chain on, dude - bt cr1_eq,grsRem64Rt ; Mapping to remove is chained from anchor - std r8,0(r11) ; Unchain gpv->phys mapping - b grsDelete ; Finish deleting mapping -grsRem64Rt: ldarx r0,0,r11 ; Get previous link - and r0,r0,r7 ; Get flags - or r0,r0,r8 ; Insert new forward pointer - stdcx. r0,0,r11 ; Slam it back in - bne-- grsRem64Rt ; Lost reservation, retry - b grsDelete ; Finish deleting mapping - - .align 5 -grsRem64Nxt: - la r11,mpAlias(r9) ; Point to (soon to be) previous link - crclr cr1_eq ; ~cr1_eq <- Previous link is not the anchor - mr. r9,r8 ; Does next entry exist? - b grsRem64Lp ; Carry on - -grsDelete: - lwz r3,mpFlags(r31) ; Get mapping's flags - rlwinm r3,r3,0,~mpgFlags ; Clear all guest flags - ori r3,r3,mpgFree ; Mark mapping free - stw r3,mpFlags(r31) ; Update flags - - li r31,mapRtNotFnd ; Didn't succeed - -grsRelPhy: mr r3,r24 ; r3 <- physent addr - bl mapPhysUnlock ; Unlock physent chain - -grsRelPmap: la r3,pmapSXlk(r27) ; r3 <- host pmap search lock phys addr - bl sxlkUnlock ; Release host pmap search lock - -grsRtn: mr r3,r31 ; r3 <- result code - bt++ pf64Bitb,grsRtn64 ; Handle 64-bit separately - mtmsr r25 ; Restore 'rupts, translation - isync ; Throw a small wrench into the pipeline - b grsPopFrame ; Nothing to do now but pop a frame and return -grsRtn64: mtmsrd r25 ; Restore 'rupts, translation, 32-bit mode -grsPopFrame: - lwz r0,(FM_ALIGN(grsStackSize)+FM_SIZE+FM_LR_SAVE)(r1) - ; Get caller's return address - lwz r31,FM_ARG0+0x00(r1) ; Restore non-volatile r31 - lwz r30,FM_ARG0+0x04(r1) ; Restore non-volatile r30 - lwz r29,FM_ARG0+0x08(r1) ; Restore non-volatile r29 - lwz r28,FM_ARG0+0x0C(r1) ; Restore non-volatile r28 - mtlr r0 ; Prepare return address - lwz 
r27,FM_ARG0+0x10(r1) ; Restore non-volatile r27 - lwz r26,FM_ARG0+0x14(r1) ; Restore non-volatile r26 - lwz r25,FM_ARG0+0x18(r1) ; Restore non-volatile r25 - lwz r24,FM_ARG0+0x1C(r1) ; Restore non-volatile r24 - lwz r23,FM_ARG0+0x20(r1) ; Restore non-volatile r23 - lwz r1,0(r1) ; Pop stack frame - blr ; Return to caller - - .align 5 -grsSrchMiss: - li r31,mapRtNotFnd ; Could not locate requested mapping - b grsRelPmap ; Exit through host pmap search lock release - -grsBadPLock: -grsPEMissMiss: - lis r0,hi16(Choke) ; Dmitri, you know how we've always talked about the - ori r0,r0,lo16(Choke) ; possibility of something going wrong with the bomb? - li r3,failMapping ; The BOMB, Dmitri. - sc ; The hydrogen bomb. - - -; -; Guest shadow assist -- add a guest mapping -; -; Adds a guest mapping. -; -; Parameters: -; r3 : address of host pmap, 32-bit kernel virtual address -; r4 : address of guest pmap, 32-bit kernel virtual address -; r5 : guest virtual address, high-order 32 bits -; r6 : guest virtual address, low-order 32 bits (with mpHWFlags) -; r7 : new mapping's flags -; r8 : physical address, 32-bit page number -; -; Non-volatile register usage: -; r22 : hash group's physical address -; r23 : VMM extension block's physical address -; r24 : mapping's flags -; r25 : caller's msr image from mapSetUp -; r26 : physent physical address -; r27 : host pmap physical address -; r28 : guest pmap physical address -; r29 : physical address, 32-bit 4k-page number -; r30 : guest virtual address -; r31 : gva->phys mapping's physical address -; - - .align 5 - .globl EXT(hw_add_map_gv) - - -LEXT(hw_add_map_gv) - -#define gadStackSize ((31-22+1)*4)+4 - - stwu r1,-(FM_ALIGN(gadStackSize)+FM_SIZE)(r1) - ; Mint a new stack frame - mflr r0 ; Get caller's return address - mfsprg r11,2 ; Get feature flags - mtcrf 0x02,r11 ; Insert feature flags into cr6 - stw r0,(FM_ALIGN(gadStackSize)+FM_SIZE+FM_LR_SAVE)(r1) - ; Save caller's return address - stw r31,FM_ARG0+0x00(r1) ; Save non-volatile r31 
- stw r30,FM_ARG0+0x04(r1) ; Save non-volatile r30 - stw r29,FM_ARG0+0x08(r1) ; Save non-volatile r29 - stw r28,FM_ARG0+0x0C(r1) ; Save non-volatile r28 - stw r27,FM_ARG0+0x10(r1) ; Save non-volatile r27 - stw r26,FM_ARG0+0x14(r1) ; Save non-volatile r26 - stw r25,FM_ARG0+0x18(r1) ; Save non-volatile r25 - stw r24,FM_ARG0+0x1C(r1) ; Save non-volatile r24 - stw r23,FM_ARG0+0x20(r1) ; Save non-volatile r23 - stw r22,FM_ARG0+0x24(r1) ; Save non-volatile r22 - - rlwinm r30,r5,0,1,0 ; Get high-order 32 bits of guest vaddr - rlwimi r30,r6,0,0,31 ; Get low-order 32 bits of guest vaddr - mr r24,r7 ; Copy guest mapping's flags - mr r29,r8 ; Copy target frame's physical address - - lwz r11,pmapVmmExt(r3) ; r11 <- VMM pmap extension block vaddr - lwz r9,pmapSpace(r4) ; r9 <- guest space ID number - bt++ pf64Bitb,gad64Salt ; Test for 64-bit machine - lwz r23,pmapVmmExtPhys+4(r3) ; r23 <- VMM pmap extension block paddr - lwz r27,pmapvr+4(r3) ; Get 32-bit virt<->real host pmap conversion salt - lwz r28,pmapvr+4(r4) ; Get 32-bit virt<->real guest pmap conversion salt - la r22,VMX_HPIDX_OFFSET(r11) ; r22 <- base of hash page physical index - srwi r11,r30,12 ; Form shadow hash: - xor r11,r11,r9 ; spaceID ^ (vaddr >> 12) - rlwinm r10,r11,GV_HPAGE_SHIFT,GV_HPAGE_MASK - ; Form index offset from hash page number - add r22,r22,r10 ; r22 <- hash page index entry - lwz r22,4(r22) ; r22 <- hash page paddr - rlwimi r22,r11,GV_HGRP_SHIFT,GV_HGRP_MASK - ; r22 <- hash group paddr - b gadStart ; Get to it - -gad64Salt: ld r23,pmapVmmExtPhys(r3) ; r23 <- VMM pmap extension block paddr - ld r27,pmapvr(r3) ; Get 64-bit virt<->real host pmap conversion salt - ld r28,pmapvr(r4) ; Get 64-bit virt<->real guest pmap conversion salt - la r22,VMX_HPIDX_OFFSET(r11) ; r22 <- base of hash page physical index - srwi r11,r30,12 ; Form shadow hash: - xor r11,r11,r9 ; spaceID ^ (vaddr >> 12) - rlwinm r10,r11,GV_HPAGE_SHIFT,GV_HPAGE_MASK - ; Form index offset from hash page number - add r22,r22,r10 ; r22 <- hash 
page index entry - ld r22,0(r22) ; r22 <- hash page paddr - insrdi r22,r11,GV_GRPS_PPG_LG2,64-(GV_HGRP_SHIFT+GV_GRPS_PPG_LG2) - ; r22 <- hash group paddr - -gadStart: xor r27,r3,r27 ; Convert host pmap_t virt->real - xor r28,r4,r28 ; Convert guest pmap_t virt->real - bl EXT(mapSetUp) ; Disable 'rupts, translation, maybe enter 64-bit mode - mr r25,r11 ; Save caller's msr image - - la r3,pmapSXlk(r27) ; r3 <- host pmap's search lock address - bl sxlkExclusive ; Get lock exlusive - - mr r31,r22 ; Prepare to search this group - li r0,(GV_SLOTS - 1) ; Prepare to iterate over mapping slots - mtctr r0 ; in this group - bt++ pf64Bitb,gad64Search ; Test for 64-bit machine - - lwz r3,mpFlags(r31) ; r3 <- 1st mapping slot's flags - lhz r4,mpSpace(r31) ; r4 <- 1st mapping slot's space ID - lwz r5,mpVAddr+4(r31) ; r5 <- 1st mapping slot's virtual address - clrrwi r12,r30,12 ; r12 <- virtual address we're searching for - b gad32SrchLp ; Let the search begin! - - .align 5 -gad32SrchLp: - mr r6,r3 ; r6 <- current mapping slot's flags - lwz r3,mpFlags+GV_SLOT_SZ(r31) ; r3 <- next mapping slot's flags - mr r7,r4 ; r7 <- current mapping slot's space ID - lhz r4,mpSpace+GV_SLOT_SZ(r31) ; r4 <- next mapping slot's space ID - clrrwi r8,r5,12 ; r8 <- current mapping slot's virtual addr w/o flags - lwz r5,mpVAddr+4+GV_SLOT_SZ(r31); r5 <- next mapping slot's virtual addr - rlwinm r11,r6,0,mpgFree ; Isolate guest free flag - xor r7,r7,r9 ; Compare space ID - or r0,r11,r7 ; r0 <- !(!free && space match) - xor r8,r8,r12 ; Compare virtual address - or. 
r0,r0,r8 ; cr0_eq <- !free && space match && virtual addr match - beq gadRelPmap ; Join common path on hit (r31 points to guest mapping) - - addi r31,r31,GV_SLOT_SZ ; r31 <- next mapping slot - bdnz gad32SrchLp ; Iterate - - mr r6,r3 ; r6 <- current mapping slot's flags - clrrwi r5,r5,12 ; Remove flags from virtual address - rlwinm r11,r6,0,mpgFree ; Isolate guest free flag - xor r4,r4,r9 ; Compare space ID - or r0,r11,r4 ; r0 <- !(!free && && space match) - xor r5,r5,r12 ; Compare virtual address - or. r0,r0,r5 ; cr0_eq <- free && space match && virtual addr match - beq gadRelPmap ; Join common path on hit (r31 points to guest mapping) - b gadScan ; No joy in our hash group - -gad64Search: - lwz r3,mpFlags(r31) ; r3 <- 1st mapping slot's flags - lhz r4,mpSpace(r31) ; r4 <- 1st mapping slot's space ID - ld r5,mpVAddr(r31) ; r5 <- 1st mapping slot's virtual address - clrrdi r12,r30,12 ; r12 <- virtual address we're searching for - b gad64SrchLp ; Let the search begin! - - .align 5 -gad64SrchLp: - mr r6,r3 ; r6 <- current mapping slot's flags - lwz r3,mpFlags+GV_SLOT_SZ(r31) ; r3 <- next mapping slot's flags - mr r7,r4 ; r7 <- current mapping slot's space ID - lhz r4,mpSpace+GV_SLOT_SZ(r31) ; r4 <- next mapping slot's space ID - clrrdi r8,r5,12 ; r8 <- current mapping slot's virtual addr w/o flags - ld r5,mpVAddr+GV_SLOT_SZ(r31) ; r5 <- next mapping slot's virtual addr - rlwinm r11,r6,0,mpgFree ; Isolate guest free flag - xor r7,r7,r9 ; Compare space ID - or r0,r11,r7 ; r0 <- !(!free && space match) - xor r8,r8,r12 ; Compare virtual address - or. 
r0,r0,r8 ; cr0_eq <- !free && space match && virtual addr match - beq gadRelPmap ; Hit, let upper-level redrive sort it out - - addi r31,r31,GV_SLOT_SZ ; r31 <- next mapping slot - bdnz gad64SrchLp ; Iterate - - mr r6,r3 ; r6 <- current mapping slot's flags - clrrdi r5,r5,12 ; Remove flags from virtual address - rlwinm r11,r6,0,mpgFree ; Isolate guest free flag - xor r4,r4,r9 ; Compare space ID - or r0,r11,r4 ; r0 <- !(!free && && space match) - xor r5,r5,r12 ; Compare virtual address - or. r0,r0,r5 ; cr0_eq <- !free && space match && virtual addr match - bne gadScan ; No joy in our hash group - b gadRelPmap ; Hit, let upper-level redrive sort it out - -gadScan: lbz r12,mpgCursor(r22) ; Get group's cursor - rlwinm r12,r12,GV_SLOT_SZ_LG2,(GV_SLOT_MASK << GV_SLOT_SZ_LG2) - ; Prepare to address slot at cursor - li r0,(GV_SLOTS - 1) ; Prepare to iterate over mapping slots - mtctr r0 ; in this group - or r2,r22,r12 ; r2 <- 1st mapping to search - lwz r3,mpFlags(r2) ; r3 <- 1st mapping slot's flags - li r11,0 ; No dormant entries found yet - b gadScanLoop ; Let the search begin! - - .align 5 -gadScanLoop: - addi r12,r12,GV_SLOT_SZ ; Calculate next slot number to search - rlwinm r12,r12,0,(GV_SLOT_MASK << GV_SLOT_SZ_LG2) - ; Trim off any carry, wrapping into slot number range - mr r31,r2 ; r31 <- current mapping's address - or r2,r22,r12 ; r2 <- next mapping to search - mr r6,r3 ; r6 <- current mapping slot's flags - lwz r3,mpFlags(r2) ; r3 <- next mapping slot's flags - rlwinm. r0,r6,0,mpgFree ; Test free flag - bne gadFillMap ; Join common path on hit (r31 points to free mapping) - rlwinm r0,r6,0,mpgDormant ; Dormant entry? - xori r0,r0,mpgDormant ; Invert dormant flag - or. r0,r0,r11 ; Skip all but the first dormant entry we see - bne gadNotDorm ; Not dormant or we've already seen one - mr r11,r31 ; We'll use this dormant entry if we don't find a free one first -gadNotDorm: bdnz gadScanLoop ; Iterate - - mr r31,r2 ; r31 <- final mapping's address - rlwinm. 
r0,r6,0,mpgFree ; Test free flag in final mapping - bne gadFillMap ; Join common path on hit (r31 points to dormant mapping) - rlwinm r0,r6,0,mpgDormant ; Dormant entry? - xori r0,r0,mpgDormant ; Invert dormant flag - or. r0,r0,r11 ; Skip all but the first dormant entry we see - bne gadCkDormant ; Not dormant or we've already seen one - mr r11,r31 ; We'll use this dormant entry if we don't find a free one first - -gadCkDormant: - mr. r31,r11 ; Get dormant mapping, if any, and test - bne gadUpCursor ; Go update the cursor, we'll take the dormant entry - -gadSteal: - lbz r12,mpgCursor(r22) ; Get group's cursor - rlwinm r12,r12,GV_SLOT_SZ_LG2,(GV_SLOT_MASK << GV_SLOT_SZ_LG2) - ; Prepare to address slot at cursor - or r31,r22,r12 ; r31 <- address of mapping to steal - - bt++ pf64Bitb,gadDscon64 ; Handle 64-bit disconnect separately - bl mapInvPte32 ; Disconnect PTE, invalidate, gather ref and change - ; r31 <- mapping's physical address - ; r3 -> PTE slot physical address - ; r4 -> High-order 32 bits of PTE - ; r5 -> Low-order 32 bits of PTE - ; r6 -> PCA - ; r7 -> PCA physical address - rlwinm r2,r3,29,29,31 ; Get PTE's slot number in the PTEG (8-byte PTEs) - b gadFreePTE ; Join 64-bit path to release the PTE -gadDscon64: bl mapInvPte64 ; Disconnect PTE, invalidate, gather ref and change - rlwinm r2,r3,28,29,31 ; Get PTE's slot number in the PTEG (16-byte PTEs) -gadFreePTE: mr. r3,r3 ; Was there a valid PTE? 
- beq- gadUpCursor ; No valid PTE, we're almost done - lis r0,0x8000 ; Prepare free bit for this slot - srw r0,r0,r2 ; Position free bit - or r6,r6,r0 ; Set it in our PCA image - lwz r8,mpPte(r31) ; Get PTE pointer - rlwinm r8,r8,0,~mpHValid ; Make the pointer invalid - stw r8,mpPte(r31) ; Save invalidated PTE pointer - eieio ; Synchronize all previous updates (mapInvPtexx didn't) - stw r6,0(r7) ; Update PCA and unlock the PTEG - -gadUpCursor: - rlwinm r12,r31,(32-GV_SLOT_SZ_LG2),GV_SLOT_MASK - ; Recover slot number from stolen mapping's address - addi r12,r12,1 ; Increment slot number - rlwinm r12,r12,0,GV_SLOT_MASK ; Clip to slot number range - stb r12,mpgCursor(r22) ; Update group's cursor - - lwz r3,mpPAddr(r31) ; r3 <- physical 4K-page number - bl mapFindLockPN ; Find 'n' lock this page's physent - mr. r26,r3 ; Got lock on our physent? - beq-- gadBadPLock ; No, time to bail out - - crset cr1_eq ; cr1_eq <- previous link is the anchor - bt++ pf64Bitb,gadRemove64 ; Use 64-bit version on 64-bit machine - la r11,ppLink+4(r26) ; Point to chain anchor - lwz r9,ppLink+4(r26) ; Get chain anchor - rlwinm. r9,r9,0,~ppFlags ; Remove flags, yielding 32-bit physical chain pointer -gadRemLoop: beq- gadPEMissMiss ; End of chain, this is not good - cmplw r9,r31 ; Is this the mapping to remove? - lwz r8,mpAlias+4(r9) ; Get forward chain pointer - bne gadRemNext ; No, chain onward - bt cr1_eq,gadRemRetry ; Mapping to remove is chained from anchor - stw r8,0(r11) ; Unchain gpv->phys mapping - b gadDelDone ; Finish deleting mapping -gadRemRetry: - lwarx r0,0,r11 ; Get previous link - rlwimi r0,r8,0,~ppFlags ; Insert new forward pointer whilst preserving flags - stwcx. r0,0,r11 ; Update previous link - bne- gadRemRetry ; Lost reservation, retry - b gadDelDone ; Finish deleting mapping - -gadRemNext: la r11,mpAlias+4(r9) ; Point to (soon to be) previous link - crclr cr1_eq ; ~cr1_eq <- Previous link is not the anchor - mr. r9,r8 ; Does next entry exist? 
- b gadRemLoop ; Carry on - -gadRemove64: - li r7,ppLFAmask ; Get mask to clean up mapping pointer - rotrdi r7,r7,ppLFArrot ; Rotate clean up mask to get 0xF0000000000000000F - la r11,ppLink(r26) ; Point to chain anchor - ld r9,ppLink(r26) ; Get chain anchor - andc. r9,r9,r7 ; Remove flags, yielding 64-bit physical chain pointer -gadRem64Lp: beq-- gadPEMissMiss ; End of chain, this is not good - cmpld r9,r31 ; Is this the mapping to remove? - ld r8,mpAlias(r9) ; Get forward chain pinter - bne gadRem64Nxt ; Not mapping to remove, chain on, dude - bt cr1_eq,gadRem64Rt ; Mapping to remove is chained from anchor - std r8,0(r11) ; Unchain gpv->phys mapping - b gadDelDone ; Finish deleting mapping -gadRem64Rt: ldarx r0,0,r11 ; Get previous link - and r0,r0,r7 ; Get flags - or r0,r0,r8 ; Insert new forward pointer - stdcx. r0,0,r11 ; Slam it back in - bne-- gadRem64Rt ; Lost reservation, retry - b gadDelDone ; Finish deleting mapping - - .align 5 -gadRem64Nxt: - la r11,mpAlias(r9) ; Point to (soon to be) previous link - crclr cr1_eq ; ~cr1_eq <- Previous link is not the anchor - mr. r9,r8 ; Does next entry exist? - b gadRem64Lp ; Carry on - -gadDelDone: - mr r3,r26 ; Get physent address - bl mapPhysUnlock ; Unlock physent chain - -gadFillMap: - lwz r12,pmapSpace(r28) ; Get guest space id number - li r2,0 ; Get a zero - stw r24,mpFlags(r31) ; Set mapping's flags - sth r12,mpSpace(r31) ; Set mapping's space id number - stw r2,mpPte(r31) ; Set mapping's pte pointer invalid - stw r29,mpPAddr(r31) ; Set mapping's physical address - bt++ pf64Bitb,gadVA64 ; Use 64-bit version on 64-bit machine - stw r30,mpVAddr+4(r31) ; Set mapping's virtual address (w/flags) - b gadChain ; Continue with chaining mapping to physent -gadVA64: std r30,mpVAddr(r31) ; Set mapping's virtual address (w/flags) - -gadChain: mr r3,r29 ; r3 <- physical frame address - bl mapFindLockPN ; Find 'n' lock this page's physent - mr. r26,r3 ; Got lock on our physent? 
- beq-- gadBadPLock ; No, time to bail out - - bt++ pf64Bitb,gadChain64 ; Use 64-bit version on 64-bit machine - lwz r12,ppLink+4(r26) ; Get forward chain - rlwinm r11,r12,0,~ppFlags ; Get physent's forward pointer sans flags - rlwimi r12,r31,0,~ppFlags ; Insert new mapping, preserve physent flags - stw r11,mpAlias+4(r31) ; New mapping will head chain - stw r12,ppLink+4(r26) ; Point physent to new mapping - b gadFinish ; All over now... - -gadChain64: li r7,ppLFAmask ; Get mask to clean up mapping pointer - rotrdi r7,r7,ppLFArrot ; Rotate clean up mask to get 0xF0000000000000000F - ld r12,ppLink(r26) ; Get forward chain - andc r11,r12,r7 ; Get physent's forward chain pointer sans flags - and r12,r12,r7 ; Isolate pointer's flags - or r12,r12,r31 ; Insert new mapping's address forming pointer - std r11,mpAlias(r31) ; New mapping will head chain - std r12,ppLink(r26) ; Point physent to new mapping - -gadFinish: eieio ; Ensure new mapping is completely visible - -gadRelPhy: mr r3,r26 ; r3 <- physent addr - bl mapPhysUnlock ; Unlock physent chain - -gadRelPmap: la r3,pmapSXlk(r27) ; r3 <- host pmap search lock phys addr - bl sxlkUnlock ; Release host pmap search lock - - bt++ pf64Bitb,gadRtn64 ; Handle 64-bit separately - mtmsr r25 ; Restore 'rupts, translation - isync ; Throw a small wrench into the pipeline - b gadPopFrame ; Nothing to do now but pop a frame and return -gadRtn64: mtmsrd r25 ; Restore 'rupts, translation, 32-bit mode -gadPopFrame: - lwz r0,(FM_ALIGN(gadStackSize)+FM_SIZE+FM_LR_SAVE)(r1) - ; Get caller's return address - lwz r31,FM_ARG0+0x00(r1) ; Restore non-volatile r31 - lwz r30,FM_ARG0+0x04(r1) ; Restore non-volatile r30 - lwz r29,FM_ARG0+0x08(r1) ; Restore non-volatile r29 - lwz r28,FM_ARG0+0x0C(r1) ; Restore non-volatile r28 - mtlr r0 ; Prepare return address - lwz r27,FM_ARG0+0x10(r1) ; Restore non-volatile r27 - lwz r26,FM_ARG0+0x14(r1) ; Restore non-volatile r26 - lwz r25,FM_ARG0+0x18(r1) ; Restore non-volatile r25 - lwz r24,FM_ARG0+0x1C(r1) ; 
Restore non-volatile r24 - lwz r23,FM_ARG0+0x20(r1) ; Restore non-volatile r23 - lwz r22,FM_ARG0+0x24(r1) ; Restore non-volatile r22 - lwz r1,0(r1) ; Pop stack frame - blr ; Return to caller - -gadPEMissMiss: -gadBadPLock: - lis r0,hi16(Choke) ; Dmitri, you know how we've always talked about the - ori r0,r0,lo16(Choke) ; possibility of something going wrong with the bomb? - li r3,failMapping ; The BOMB, Dmitri. - sc ; The hydrogen bomb. - - -; -; Guest shadow assist -- supend a guest mapping -; -; Suspends a guest mapping. -; -; Parameters: -; r3 : address of host pmap, 32-bit kernel virtual address -; r4 : address of guest pmap, 32-bit kernel virtual address -; r5 : guest virtual address, high-order 32 bits -; r6 : guest virtual address, low-order 32 bits -; -; Non-volatile register usage: -; r26 : VMM extension block's physical address -; r27 : host pmap physical address -; r28 : guest pmap physical address -; r29 : caller's msr image from mapSetUp -; r30 : guest virtual address -; r31 : gva->phys mapping's physical address -; - - .align 5 - .globl EXT(hw_susp_map_gv) - -LEXT(hw_susp_map_gv) - -#define gsuStackSize ((31-26+1)*4)+4 - - stwu r1,-(FM_ALIGN(gsuStackSize)+FM_SIZE)(r1) - ; Mint a new stack frame - mflr r0 ; Get caller's return address - mfsprg r11,2 ; Get feature flags - mtcrf 0x02,r11 ; Insert feature flags into cr6 - stw r0,(FM_ALIGN(gsuStackSize)+FM_SIZE+FM_LR_SAVE)(r1) - ; Save caller's return address - stw r31,FM_ARG0+0x00(r1) ; Save non-volatile r31 - stw r30,FM_ARG0+0x04(r1) ; Save non-volatile r30 - stw r29,FM_ARG0+0x08(r1) ; Save non-volatile r29 - stw r28,FM_ARG0+0x0C(r1) ; Save non-volatile r28 - stw r27,FM_ARG0+0x10(r1) ; Save non-volatile r27 - stw r26,FM_ARG0+0x14(r1) ; Save non-volatile r26 - - rlwinm r30,r6,0,0xFFFFF000 ; Clean up low-order 32 bits of guest vaddr - - lwz r11,pmapVmmExt(r3) ; r11 <- VMM pmap extension block vaddr - lwz r9,pmapSpace(r4) ; r9 <- guest space ID number - bt++ pf64Bitb,gsu64Salt ; Test for 64-bit machine - - 
lwz r26,pmapVmmExtPhys+4(r3) ; r26 <- VMM pmap extension block paddr - lwz r27,pmapvr+4(r3) ; Get 32-bit virt<->real host pmap conversion salt - lwz r28,pmapvr+4(r4) ; Get 32-bit virt<->real guest pmap conversion salt - la r31,VMX_HPIDX_OFFSET(r11) ; r31 <- base of hash page physical index - srwi r11,r30,12 ; Form shadow hash: - xor r11,r11,r9 ; spaceID ^ (vaddr >> 12) - rlwinm r10,r11,GV_HPAGE_SHIFT,GV_HPAGE_MASK - ; Form index offset from hash page number - add r31,r31,r10 ; r31 <- hash page index entry - lwz r31,4(r31) ; r31 <- hash page paddr - rlwimi r31,r11,GV_HGRP_SHIFT,GV_HGRP_MASK - ; r31 <- hash group paddr - b gsuStart ; Get to it -gsu64Salt: rldimi r30,r5,32,0 ; Insert high-order 32 bits of 64-bit guest vaddr - ld r26,pmapVmmExtPhys(r3) ; r26 <- VMM pmap extension block paddr - ld r27,pmapvr(r3) ; Get 64-bit virt<->real host pmap conversion salt - ld r28,pmapvr(r4) ; Get 64-bit virt<->real guest pmap conversion salt - la r31,VMX_HPIDX_OFFSET(r11) ; r31 <- base of hash page physical index - srwi r11,r30,12 ; Form shadow hash: - xor r11,r11,r9 ; spaceID ^ (vaddr >> 12) - rlwinm r10,r11,GV_HPAGE_SHIFT,GV_HPAGE_MASK - ; Form index offset from hash page number - add r31,r31,r10 ; r31 <- hash page index entry - ld r31,0(r31) ; r31 <- hash page paddr - insrdi r31,r11,GV_GRPS_PPG_LG2,64-(GV_HGRP_SHIFT+GV_GRPS_PPG_LG2) - ; r31 <- hash group paddr - -gsuStart: xor r27,r3,r27 ; Convert host pmap_t virt->real - xor r28,r4,r28 ; Convert guest pmap_t virt->real - bl EXT(mapSetUp) ; Disable 'rupts, translation, maybe enter 64-bit mode - mr r29,r11 ; Save caller's msr image - - la r3,pmapSXlk(r27) ; r3 <- host pmap's search lock address - bl sxlkExclusive ; Get lock exclusive - - li r0,(GV_SLOTS - 1) ; Prepare to iterate over mapping slots - mtctr r0 ; in this group - bt++ pf64Bitb,gsu64Search ; Test for 64-bit machine - - lwz r3,mpFlags(r31) ; r3 <- 1st mapping slot's flags - lhz r4,mpSpace(r31) ; r4 <- 1st mapping slot's space ID - lwz r5,mpVAddr+4(r31) ; r5 <- 1st 
mapping slot's virtual address - b gsu32SrchLp ; Let the search begin! - - .align 5 -gsu32SrchLp: - mr r6,r3 ; r6 <- current mapping slot's flags - lwz r3,mpFlags+GV_SLOT_SZ(r31) ; r3 <- next mapping slot's flags - mr r7,r4 ; r7 <- current mapping slot's space ID - lhz r4,mpSpace+GV_SLOT_SZ(r31) ; r4 <- next mapping slot's space ID - clrrwi r8,r5,12 ; r8 <- current mapping slot's virtual addr w/o flags - lwz r5,mpVAddr+4+GV_SLOT_SZ(r31); r5 <- next mapping slot's virtual addr - andi. r11,r6,mpgFree+mpgDormant ; Isolate guest free and dormant flags - xor r7,r7,r9 ; Compare space ID - or r0,r11,r7 ; r0 <- !(!free && !dormant && space match) - xor r8,r8,r30 ; Compare virtual address - or. r0,r0,r8 ; cr0_eq <- !free && !dormant && space match && virtual addr match - beq gsuSrchHit ; Join common path on hit (r31 points to guest mapping) - - addi r31,r31,GV_SLOT_SZ ; r31 <- next mapping slot - bdnz gsu32SrchLp ; Iterate - - mr r6,r3 ; r6 <- current mapping slot's flags - clrrwi r5,r5,12 ; Remove flags from virtual address - andi. r11,r6,mpgFree+mpgDormant ; Isolate guest free and dormant flags - xor r4,r4,r9 ; Compare space ID - or r0,r11,r4 ; r0 <- !(!free && !dormant && space match) - xor r5,r5,r30 ; Compare virtual address - or. r0,r0,r5 ; cr0_eq <- !free && !dormant && space match && virtual addr match - beq gsuSrchHit ; Join common path on hit (r31 points to guest mapping) - b gsuSrchMiss ; No joy in our hash group - -gsu64Search: - lwz r3,mpFlags(r31) ; r3 <- 1st mapping slot's flags - lhz r4,mpSpace(r31) ; r4 <- 1st mapping slot's space ID - ld r5,mpVAddr(r31) ; r5 <- 1st mapping slot's virtual address - b gsu64SrchLp ; Let the search begin! 
- - .align 5 -gsu64SrchLp: - mr r6,r3 ; r6 <- current mapping slot's flags - lwz r3,mpFlags+GV_SLOT_SZ(r31) ; r3 <- next mapping slot's flags - mr r7,r4 ; r7 <- current mapping slot's space ID - lhz r4,mpSpace+GV_SLOT_SZ(r31) ; r4 <- next mapping slot's space ID - clrrdi r8,r5,12 ; r8 <- current mapping slot's virtual addr w/o flags - ld r5,mpVAddr+GV_SLOT_SZ(r31) ; r5 <- next mapping slot's virtual addr - andi. r11,r6,mpgFree+mpgDormant ; Isolate guest free and dormant flags - xor r7,r7,r9 ; Compare space ID - or r0,r11,r7 ; r0 <- !(!free && !dormant && space match) - xor r8,r8,r30 ; Compare virtual address - or. r0,r0,r8 ; cr0_eq <- !free && !dormant && space match && virtual addr match - beq gsuSrchHit ; Join common path on hit (r31 points to guest mapping) - - addi r31,r31,GV_SLOT_SZ ; r31 <- next mapping slot - bdnz gsu64SrchLp ; Iterate - - mr r6,r3 ; r6 <- current mapping slot's flags - clrrdi r5,r5,12 ; Remove flags from virtual address - andi. r11,r6,mpgFree+mpgDormant ; Isolate guest free and dormant flags - xor r4,r4,r9 ; Compare space ID - or r0,r11,r4 ; r0 <- !(!free && !dormant && space match) - xor r5,r5,r30 ; Compare virtual address - or. r0,r0,r5 ; cr0_eq <- !free && !dormant && space match && virtual addr match - bne gsuSrchMiss ; No joy in our hash group - -gsuSrchHit: - bt++ pf64Bitb,gsuDscon64 ; Handle 64-bit disconnect separately - bl mapInvPte32 ; Disconnect PTE, invalidate, gather ref and change - ; r31 <- mapping's physical address - ; r3 -> PTE slot physical address - ; r4 -> High-order 32 bits of PTE - ; r5 -> Low-order 32 bits of PTE - ; r6 -> PCA - ; r7 -> PCA physical address - rlwinm r2,r3,29,29,31 ; Get PTE's slot number in the PTEG (8-byte PTEs) - b gsuFreePTE ; Join 64-bit path to release the PTE -gsuDscon64: bl mapInvPte64 ; Disconnect PTE, invalidate, gather ref and change - rlwinm r2,r3,28,29,31 ; Get PTE's slot number in the PTEG (16-byte PTEs) -gsuFreePTE: mr. r3,r3 ; Was there a valid PTE? 
- beq- gsuNoPTE ; No valid PTE, we're almost done - lis r0,0x8000 ; Prepare free bit for this slot - srw r0,r0,r2 ; Position free bit - or r6,r6,r0 ; Set it in our PCA image - lwz r8,mpPte(r31) ; Get PTE pointer - rlwinm r8,r8,0,~mpHValid ; Make the pointer invalid - stw r8,mpPte(r31) ; Save invalidated PTE pointer - eieio ; Synchronize all previous updates (mapInvPtexx didn't) - stw r6,0(r7) ; Update PCA and unlock the PTEG - -gsuNoPTE: lwz r3,mpFlags(r31) ; Get mapping's flags - ori r3,r3,mpgDormant ; Mark entry dormant - stw r3,mpFlags(r31) ; Save updated flags - eieio ; Ensure update is visible when we unlock - -gsuSrchMiss: - la r3,pmapSXlk(r27) ; r3 <- host pmap search lock phys addr - bl sxlkUnlock ; Release host pmap search lock - - bt++ pf64Bitb,gsuRtn64 ; Handle 64-bit separately - mtmsr r29 ; Restore 'rupts, translation - isync ; Throw a small wrench into the pipeline - b gsuPopFrame ; Nothing to do now but pop a frame and return -gsuRtn64: mtmsrd r29 ; Restore 'rupts, translation, 32-bit mode -gsuPopFrame: - lwz r0,(FM_ALIGN(gsuStackSize)+FM_SIZE+FM_LR_SAVE)(r1) - ; Get caller's return address - lwz r31,FM_ARG0+0x00(r1) ; Restore non-volatile r31 - lwz r30,FM_ARG0+0x04(r1) ; Restore non-volatile r30 - lwz r29,FM_ARG0+0x08(r1) ; Restore non-volatile r29 - lwz r28,FM_ARG0+0x0C(r1) ; Restore non-volatile r28 - mtlr r0 ; Prepare return address - lwz r27,FM_ARG0+0x10(r1) ; Restore non-volatile r27 - lwz r26,FM_ARG0+0x14(r1) ; Restore non-volatile r26 - lwz r1,0(r1) ; Pop stack frame - blr ; Return to caller - -; -; Guest shadow assist -- test guest mapping reference and change bits -; -; Locates the specified guest mapping, and if it exists gathers its reference -; and change bit, optionally�resetting them. 
-; -; Parameters: -; r3 : address of host pmap, 32-bit kernel virtual address -; r4 : address of guest pmap, 32-bit kernel virtual address -; r5 : guest virtual address, high-order 32 bits -; r6 : guest virtual address, low-order 32 bits -; r7 : reset boolean -; -; Non-volatile register usage: -; r24 : VMM extension block's physical address -; r25 : return code (w/reference and change bits) -; r26 : reset boolean -; r27 : host pmap physical address -; r28 : guest pmap physical address -; r29 : caller's msr image from mapSetUp -; r30 : guest virtual address -; r31 : gva->phys mapping's physical address -; - - .align 5 - .globl EXT(hw_test_rc_gv) - -LEXT(hw_test_rc_gv) - -#define gtdStackSize ((31-24+1)*4)+4 - - stwu r1,-(FM_ALIGN(gtdStackSize)+FM_SIZE)(r1) - ; Mint a new stack frame - mflr r0 ; Get caller's return address - mfsprg r11,2 ; Get feature flags - mtcrf 0x02,r11 ; Insert feature flags into cr6 - stw r0,(FM_ALIGN(gtdStackSize)+FM_SIZE+FM_LR_SAVE)(r1) - ; Save caller's return address - stw r31,FM_ARG0+0x00(r1) ; Save non-volatile r31 - stw r30,FM_ARG0+0x04(r1) ; Save non-volatile r30 - stw r29,FM_ARG0+0x08(r1) ; Save non-volatile r29 - stw r28,FM_ARG0+0x0C(r1) ; Save non-volatile r28 - stw r27,FM_ARG0+0x10(r1) ; Save non-volatile r27 - stw r26,FM_ARG0+0x14(r1) ; Save non-volatile r26 - stw r25,FM_ARG0+0x18(r1) ; Save non-volatile r25 - stw r24,FM_ARG0+0x1C(r1) ; Save non-volatile r24 - - rlwinm r30,r6,0,0xFFFFF000 ; Clean up low-order 20 bits of guest vaddr - - lwz r11,pmapVmmExt(r3) ; r11 <- VMM pmap extension block vaddr - lwz r9,pmapSpace(r4) ; r9 <- guest space ID number - - bt++ pf64Bitb,gtd64Salt ; Test for 64-bit machine - - lwz r24,pmapVmmExtPhys+4(r3) ; r24 <- VMM pmap extension block paddr - lwz r27,pmapvr+4(r3) ; Get 32-bit virt<->real host pmap conversion salt - lwz r28,pmapvr+4(r4) ; Get 32-bit virt<->real guest pmap conversion salt - la r31,VMX_HPIDX_OFFSET(r11) ; r31 <- base of hash page physical index - srwi r11,r30,12 ; Form shadow hash: - 
xor r11,r11,r9 ; spaceID ^ (vaddr >> 12) - rlwinm r10,r11,GV_HPAGE_SHIFT,GV_HPAGE_MASK - ; Form index offset from hash page number - add r31,r31,r10 ; r31 <- hash page index entry - lwz r31,4(r31) ; r31 <- hash page paddr - rlwimi r31,r11,GV_HGRP_SHIFT,GV_HGRP_MASK - ; r31 <- hash group paddr - b gtdStart ; Get to it - -gtd64Salt: rldimi r30,r5,32,0 ; Insert high-order 32 bits of 64-bit guest vaddr - ld r24,pmapVmmExtPhys(r3) ; r24 <- VMM pmap extension block paddr - ld r27,pmapvr(r3) ; Get 64-bit virt<->real host pmap conversion salt - ld r28,pmapvr(r4) ; Get 64-bit virt<->real guest pmap conversion salt - la r31,VMX_HPIDX_OFFSET(r11) ; r31 <- base of hash page physical index - srwi r11,r30,12 ; Form shadow hash: - xor r11,r11,r9 ; spaceID ^ (vaddr >> 12) - rlwinm r10,r11,GV_HPAGE_SHIFT,GV_HPAGE_MASK - ; Form index offset from hash page number - add r31,r31,r10 ; r31 <- hash page index entry - ld r31,0(r31) ; r31 <- hash page paddr - insrdi r31,r11,GV_GRPS_PPG_LG2,64-(GV_HGRP_SHIFT+GV_GRPS_PPG_LG2) - ; r31 <- hash group paddr - -gtdStart: xor r27,r3,r27 ; Convert host pmap_t virt->real - xor r28,r4,r28 ; Convert guest pmap_t virt->real - mr r26,r7 ; Save reset boolean - bl EXT(mapSetUp) ; Disable 'rupts, translation, maybe enter 64-bit mode - mr r29,r11 ; Save caller's msr image - - la r3,pmapSXlk(r27) ; r3 <- host pmap's search lock address - bl sxlkExclusive ; Get lock exclusive - - li r0,(GV_SLOTS - 1) ; Prepare to iterate over mapping slots - mtctr r0 ; in this group - bt++ pf64Bitb,gtd64Search ; Test for 64-bit machine - - lwz r3,mpFlags(r31) ; r3 <- 1st mapping slot's flags - lhz r4,mpSpace(r31) ; r4 <- 1st mapping slot's space ID - lwz r5,mpVAddr+4(r31) ; r5 <- 1st mapping slot's virtual address - b gtd32SrchLp ; Let the search begin! 
- - .align 5 -gtd32SrchLp: - mr r6,r3 ; r6 <- current mapping slot's flags - lwz r3,mpFlags+GV_SLOT_SZ(r31) ; r3 <- next mapping slot's flags - mr r7,r4 ; r7 <- current mapping slot's space ID - lhz r4,mpSpace+GV_SLOT_SZ(r31) ; r4 <- next mapping slot's space ID - clrrwi r8,r5,12 ; r8 <- current mapping slot's virtual addr w/o flags - lwz r5,mpVAddr+4+GV_SLOT_SZ(r31); r5 <- next mapping slot's virtual addr - andi. r11,r6,mpgFree+mpgDormant ; Isolate guest free and dormant flags - xor r7,r7,r9 ; Compare space ID - or r0,r11,r7 ; r0 <- !(!free && !dormant && space match) - xor r8,r8,r30 ; Compare virtual address - or. r0,r0,r8 ; cr0_eq <- !free && !dormant && space match && virtual addr match - beq gtdSrchHit ; Join common path on hit (r31 points to guest mapping) - - addi r31,r31,GV_SLOT_SZ ; r31 <- next mapping slot - bdnz gtd32SrchLp ; Iterate - - mr r6,r3 ; r6 <- current mapping slot's flags - clrrwi r5,r5,12 ; Remove flags from virtual address - andi. r11,r6,mpgFree+mpgDormant ; Isolate guest free and dormant flags - xor r4,r4,r9 ; Compare space ID - or r0,r11,r4 ; r0 <- !(!free && !dormant && space match) - xor r5,r5,r30 ; Compare virtual address - or. r0,r0,r5 ; cr0_eq <- !free && !dormant && space match && virtual addr match - beq gtdSrchHit ; Join common path on hit (r31 points to guest mapping) - b gtdSrchMiss ; No joy in our hash group - -gtd64Search: - lwz r3,mpFlags(r31) ; r3 <- 1st mapping slot's flags - lhz r4,mpSpace(r31) ; r4 <- 1st mapping slot's space ID - ld r5,mpVAddr(r31) ; r5 <- 1st mapping slot's virtual address - b gtd64SrchLp ; Let the search begin! 
- - .align 5 -gtd64SrchLp: - mr r6,r3 ; r6 <- current mapping slot's flags - lwz r3,mpFlags+GV_SLOT_SZ(r31) ; r3 <- next mapping slot's flags - mr r7,r4 ; r7 <- current mapping slot's space ID - lhz r4,mpSpace+GV_SLOT_SZ(r31) ; r4 <- next mapping slot's space ID - clrrdi r8,r5,12 ; r8 <- current mapping slot's virtual addr w/o flags - ld r5,mpVAddr+GV_SLOT_SZ(r31) ; r5 <- next mapping slot's virtual addr - andi. r11,r6,mpgFree+mpgDormant ; Isolate guest free and dormant flags - xor r7,r7,r9 ; Compare space ID - or r0,r11,r7 ; r0 <- !(!free && !dormant && space match) - xor r8,r8,r30 ; Compare virtual address - or. r0,r0,r8 ; cr0_eq <- !free && !dormant && space match && virtual addr match - beq gtdSrchHit ; Join common path on hit (r31 points to guest mapping) - - addi r31,r31,GV_SLOT_SZ ; r31 <- next mapping slot - bdnz gtd64SrchLp ; Iterate - - mr r6,r3 ; r6 <- current mapping slot's flags - clrrdi r5,r5,12 ; Remove flags from virtual address - andi. r11,r6,mpgFree+mpgDormant ; Isolate guest free and dormant flags - xor r4,r4,r9 ; Compare space ID - or r0,r11,r4 ; r0 <- !(!free && !dormant && space match) - xor r5,r5,r30 ; Compare virtual address - or. r0,r0,r5 ; cr0_eq <- !free && !dormant && space match && virtual addr match - bne gtdSrchMiss ; No joy in our hash group - -gtdSrchHit: - bt++ pf64Bitb,gtdDo64 ; Split for 64 bit - - bl mapInvPte32 ; Invalidate and lock PTEG, also merge into physent - - cmplwi cr1,r26,0 ; Do we want to clear RC? - lwz r12,mpVAddr+4(r31) ; Get the bottom of the mapping vaddr field - mr. r3,r3 ; Was there a previously valid PTE? - li r0,lo16(mpR|mpC) ; Get bits to clear - - and r25,r5,r0 ; Copy RC bits into result - beq++ cr1,gtdNoClr32 ; Nope... - - andc r12,r12,r0 ; Clear mapping copy of RC - andc r5,r5,r0 ; Clear PTE copy of RC - sth r12,mpVAddr+6(r31) ; Set the new RC in mapping - -gtdNoClr32: beq-- gtdNoOld32 ; No previously valid PTE... 
- - sth r5,6(r3) ; Store updated RC in PTE - eieio ; Make sure we do not reorder - stw r4,0(r3) ; Revalidate the PTE - - eieio ; Make sure all updates come first - stw r6,0(r7) ; Unlock PCA - -gtdNoOld32: la r3,pmapSXlk(r27) ; Point to the pmap search lock - bl sxlkUnlock ; Unlock the search list - b gtdR32 ; Join common... - - .align 5 - - -gtdDo64: bl mapInvPte64 ; Invalidate and lock PTEG, also merge into physent - - cmplwi cr1,r26,0 ; Do we want to clear RC? - lwz r12,mpVAddr+4(r31) ; Get the bottom of the mapping vaddr field - mr. r3,r3 ; Was there a previously valid PTE? - li r0,lo16(mpR|mpC) ; Get bits to clear - - and r25,r5,r0 ; Copy RC bits into result - beq++ cr1,gtdNoClr64 ; Nope... - - andc r12,r12,r0 ; Clear mapping copy of RC - andc r5,r5,r0 ; Clear PTE copy of RC - sth r12,mpVAddr+6(r31) ; Set the new RC - -gtdNoClr64: beq-- gtdNoOld64 ; Nope, no pevious pte... - - sth r5,14(r3) ; Store updated RC - eieio ; Make sure we do not reorder - std r4,0(r3) ; Revalidate the PTE - - eieio ; Make sure all updates come first - stw r6,0(r7) ; Unlock PCA - -gtdNoOld64: la r3,pmapSXlk(r27) ; Point to the pmap search lock - bl sxlkUnlock ; Unlock the search list - b gtdR64 ; Join common... 
- -gtdSrchMiss: - la r3,pmapSXlk(r27) ; Point to the pmap search lock - bl sxlkUnlock ; Unlock the search list - li r25,mapRtNotFnd ; Get ready to return not found - bt++ pf64Bitb,gtdR64 ; Test for 64-bit machine - -gtdR32: mtmsr r29 ; Restore caller's msr image - isync - b gtdEpilog - -gtdR64: mtmsrd r29 ; Restore caller's msr image - -gtdEpilog: lwz r0,(FM_ALIGN(gtdStackSize)+FM_SIZE+FM_LR_SAVE)(r1) - ; Get caller's return address - mr r3,r25 ; Get return code - lwz r31,FM_ARG0+0x00(r1) ; Restore non-volatile r31 - lwz r30,FM_ARG0+0x04(r1) ; Restore non-volatile r30 - lwz r29,FM_ARG0+0x08(r1) ; Restore non-volatile r29 - lwz r28,FM_ARG0+0x0C(r1) ; Restore non-volatile r28 - mtlr r0 ; Prepare return address - lwz r27,FM_ARG0+0x10(r1) ; Restore non-volatile r27 - lwz r26,FM_ARG0+0x14(r1) ; Restore non-volatile r26 - lwz r25,FM_ARG0+0x18(r1) ; Restore non-volatile r25 - lwz r24,FM_ARG0+0x1C(r1) ; Restore non-volatile r24 - lwz r1,0(r1) ; Pop stack frame - blr ; Return to caller - -; -; Guest shadow assist -- convert guest to host virtual address -; -; Locates the specified guest mapping, and if it exists locates the -; first mapping belonging to its host on the physical chain and returns -; its virtual address. -; -; Note that if there are multiple mappings belonging to this host -; chained to the physent to which the guest mapping is chained, then -; host virtual aliases exist for this physical address. If host aliases -; exist, then we select the first on the physent chain, making it -; unpredictable which of the two or more possible host virtual addresses -; will be returned. 
-; -; Parameters: -; r3 : address of guest pmap, 32-bit kernel virtual address -; r4 : guest virtual address, high-order 32 bits -; r5 : guest virtual address, low-order 32 bits -; -; Non-volatile register usage: -; r24 : physent physical address -; r25 : VMM extension block's physical address -; r26 : host virtual address -; r27 : host pmap physical address -; r28 : guest pmap physical address -; r29 : caller's msr image from mapSetUp -; r30 : guest virtual address -; r31 : gva->phys mapping's physical address -; - - .align 5 - .globl EXT(hw_gva_to_hva) - -LEXT(hw_gva_to_hva) - -#define gthStackSize ((31-24+1)*4)+4 - - stwu r1,-(FM_ALIGN(gtdStackSize)+FM_SIZE)(r1) - ; Mint a new stack frame - mflr r0 ; Get caller's return address - mfsprg r11,2 ; Get feature flags - mtcrf 0x02,r11 ; Insert feature flags into cr6 - stw r0,(FM_ALIGN(gtdStackSize)+FM_SIZE+FM_LR_SAVE)(r1) - ; Save caller's return address - stw r31,FM_ARG0+0x00(r1) ; Save non-volatile r31 - stw r30,FM_ARG0+0x04(r1) ; Save non-volatile r30 - stw r29,FM_ARG0+0x08(r1) ; Save non-volatile r29 - stw r28,FM_ARG0+0x0C(r1) ; Save non-volatile r28 - stw r27,FM_ARG0+0x10(r1) ; Save non-volatile r27 - stw r26,FM_ARG0+0x14(r1) ; Save non-volatile r26 - stw r25,FM_ARG0+0x18(r1) ; Save non-volatile r25 - stw r24,FM_ARG0+0x1C(r1) ; Save non-volatile r24 - - rlwinm r30,r5,0,0xFFFFF000 ; Clean up low-order 32 bits of guest vaddr - - lwz r11,pmapVmmExt(r3) ; r11 <- VMM pmap extension block vaddr - lwz r9,pmapSpace(r3) ; r9 <- guest space ID number - - bt++ pf64Bitb,gth64Salt ; Test for 64-bit machine - - lwz r25,pmapVmmExtPhys+4(r3) ; r25 <- VMM pmap extension block paddr - lwz r28,pmapvr+4(r3) ; Get 32-bit virt<->real guest pmap conversion salt - lwz r27,vmxHostPmapPhys+4(r11) ; Get host pmap physical address - la r31,VMX_HPIDX_OFFSET(r11) ; r31 <- base of hash page physical index - srwi r11,r30,12 ; Form shadow hash: - xor r11,r11,r9 ; spaceID ^ (vaddr >> 12) - rlwinm r10,r11,GV_HPAGE_SHIFT,GV_HPAGE_MASK - ; Form 
index offset from hash page number - add r31,r31,r10 ; r31 <- hash page index entry - lwz r31,4(r31) ; r31 <- hash page paddr - rlwimi r31,r11,GV_HGRP_SHIFT,GV_HGRP_MASK - ; r31 <- hash group paddr - b gthStart ; Get to it - -gth64Salt: rldimi r30,r4,32,0 ; Insert high-order 32 bits of 64-bit guest vaddr - ld r25,pmapVmmExtPhys(r3) ; r24 <- VMM pmap extension block paddr - ld r28,pmapvr(r3) ; Get 64-bit virt<->real guest pmap conversion salt - ld r27,vmxHostPmapPhys(r11) ; Get host pmap physical address - la r31,VMX_HPIDX_OFFSET(r11) ; r31 <- base of hash page physical index - srwi r11,r30,12 ; Form shadow hash: - xor r11,r11,r9 ; spaceID ^ (vaddr >> 12) - rlwinm r10,r11,GV_HPAGE_SHIFT,GV_HPAGE_MASK - ; Form index offset from hash page number - add r31,r31,r10 ; r31 <- hash page index entry - ld r31,0(r31) ; r31 <- hash page paddr - insrdi r31,r11,GV_GRPS_PPG_LG2,64-(GV_HGRP_SHIFT+GV_GRPS_PPG_LG2) - ; r31 <- hash group paddr - -gthStart: xor r28,r3,r28 ; Convert guest pmap_t virt->real - bl EXT(mapSetUp) ; Disable 'rupts, translation, maybe enter 64-bit mode - mr r29,r11 ; Save caller's msr image - - la r3,pmapSXlk(r27) ; r3 <- host pmap's search lock address - bl sxlkExclusive ; Get lock exclusive - - li r0,(GV_SLOTS - 1) ; Prepare to iterate over mapping slots - mtctr r0 ; in this group - bt++ pf64Bitb,gth64Search ; Test for 64-bit machine - - lwz r3,mpFlags(r31) ; r3 <- 1st mapping slot's flags - lhz r4,mpSpace(r31) ; r4 <- 1st mapping slot's space ID - lwz r5,mpVAddr+4(r31) ; r5 <- 1st mapping slot's virtual address - b gth32SrchLp ; Let the search begin! 
- - .align 5 -gth32SrchLp: - mr r6,r3 ; r6 <- current mapping slot's flags - lwz r3,mpFlags+GV_SLOT_SZ(r31) ; r3 <- next mapping slot's flags - mr r7,r4 ; r7 <- current mapping slot's space ID - lhz r4,mpSpace+GV_SLOT_SZ(r31) ; r4 <- next mapping slot's space ID - clrrwi r8,r5,12 ; r8 <- current mapping slot's virtual addr w/o flags - lwz r5,mpVAddr+4+GV_SLOT_SZ(r31); r5 <- next mapping slot's virtual addr - andi. r11,r6,mpgFree+mpgDormant ; Isolate guest free and dormant flags - xor r7,r7,r9 ; Compare space ID - or r0,r11,r7 ; r0 <- !(!free && !dormant && space match) - xor r8,r8,r30 ; Compare virtual address - or. r0,r0,r8 ; cr0_eq <- !free && !dormant && space match && virtual addr match - beq gthSrchHit ; Join common path on hit (r31 points to guest mapping) - - addi r31,r31,GV_SLOT_SZ ; r31 <- next mapping slot - bdnz gth32SrchLp ; Iterate - - mr r6,r3 ; r6 <- current mapping slot's flags - clrrwi r5,r5,12 ; Remove flags from virtual address - andi. r11,r6,mpgFree+mpgDormant ; Isolate guest free and dormant flags - xor r4,r4,r9 ; Compare space ID - or r0,r11,r4 ; r0 <- !(!free && !dormant && space match) - xor r5,r5,r30 ; Compare virtual address - or. r0,r0,r5 ; cr0_eq <- !free && !dormant && space match && virtual addr match - beq gthSrchHit ; Join common path on hit (r31 points to guest mapping) - b gthSrchMiss ; No joy in our hash group - -gth64Search: - lwz r3,mpFlags(r31) ; r3 <- 1st mapping slot's flags - lhz r4,mpSpace(r31) ; r4 <- 1st mapping slot's space ID - ld r5,mpVAddr(r31) ; r5 <- 1st mapping slot's virtual address - b gth64SrchLp ; Let the search begin! 
- - .align 5 -gth64SrchLp: - mr r6,r3 ; r6 <- current mapping slot's flags - lwz r3,mpFlags+GV_SLOT_SZ(r31) ; r3 <- next mapping slot's flags - mr r7,r4 ; r7 <- current mapping slot's space ID - lhz r4,mpSpace+GV_SLOT_SZ(r31) ; r4 <- next mapping slot's space ID - clrrdi r8,r5,12 ; r8 <- current mapping slot's virtual addr w/o flags - ld r5,mpVAddr+GV_SLOT_SZ(r31) ; r5 <- next mapping slot's virtual addr - andi. r11,r6,mpgFree+mpgDormant ; Isolate guest free and dormant flags - xor r7,r7,r9 ; Compare space ID - or r0,r11,r7 ; r0 <- !(!free && !dormant && space match) - xor r8,r8,r30 ; Compare virtual address - or. r0,r0,r8 ; cr0_eq <- !free && !dormant && space match && virtual addr match - beq gthSrchHit ; Join common path on hit (r31 points to guest mapping) - - addi r31,r31,GV_SLOT_SZ ; r31 <- next mapping slot - bdnz gth64SrchLp ; Iterate - - mr r6,r3 ; r6 <- current mapping slot's flags - clrrdi r5,r5,12 ; Remove flags from virtual address - andi. r11,r6,mpgFree+mpgDormant ; Isolate guest free and dormant flags - xor r4,r4,r9 ; Compare space ID - or r0,r11,r4 ; r0 <- !(!free && !dormant && space match) - xor r5,r5,r30 ; Compare virtual address - or. r0,r0,r5 ; cr0_eq <- !free && !dormant && space match && virtual addr match - bne gthSrchMiss ; No joy in our hash group - -gthSrchHit: lwz r3,mpPAddr(r31) ; r3 <- physical 4K-page number - bl mapFindLockPN ; Find 'n' lock this page's physent - mr. r24,r3 ; Got lock on our physent? - beq-- gthBadPLock ; No, time to bail out - - bt++ pf64Bitb,gthPFnd64 ; 64-bit version of physent chain search - - lwz r9,ppLink+4(r24) ; Get first mapping on physent - lwz r6,pmapSpace(r27) ; Get host pmap's space id number - rlwinm r9,r9,0,~ppFlags ; Be-gone, unsightly flags -gthPELoop: mr. r12,r9 ; Got a mapping to look at? 
- beq- gthPEMiss ; Nope, we've missed hva->phys mapping - lwz r7,mpFlags(r12) ; Get mapping's flags - lhz r4,mpSpace(r12) ; Get mapping's space id number - lwz r26,mpVAddr+4(r12) ; Get mapping's virtual address - lwz r9,mpAlias+4(r12) ; Next mapping in physent alias chain - - rlwinm r0,r7,0,mpType ; Isolate mapping's type - rlwinm r26,r26,0,~mpHWFlags ; Bye-bye unsightly flags - xori r0,r0,mpNormal ; Normal mapping? - xor r4,r4,r6 ; Compare w/ host space id number - or. r0,r0,r4 ; cr0_eq <- (normal && space id hit) - beq gthPEHit ; Hit - b gthPELoop ; Iterate - -gthPFnd64: li r0,ppLFAmask ; Get mask to clean up mapping pointer - rotrdi r0,r0,ppLFArrot ; Rotate clean up mask to get 0xF0000000000000000F - ld r9,ppLink(r24) ; Get first mapping on physent - lwz r6,pmapSpace(r27) ; Get host pmap's space id number - andc r9,r9,r0 ; Cleanup mapping pointer -gthPELp64: mr. r12,r9 ; Got a mapping to look at? - beq-- gthPEMiss ; Nope, we've missed hva->phys mapping - lwz r7,mpFlags(r12) ; Get mapping's flags - lhz r4,mpSpace(r12) ; Get mapping's space id number - ld r26,mpVAddr(r12) ; Get mapping's virtual address - ld r9,mpAlias(r12) ; Next mapping physent alias chain - rlwinm r0,r7,0,mpType ; Isolate mapping's type - rldicr r26,r26,0,mpHWFlagsb-1 ; Bye-bye unsightly flags - xori r0,r0,mpNormal ; Normal mapping? - xor r4,r4,r6 ; Compare w/ host space id number - or. 
r0,r0,r4 ; cr0_eq <- (normal && space id hit) - beq gthPEHit ; Hit - b gthPELp64 ; Iterate - - .align 5 -gthPEMiss: mr r3,r24 ; Get physent's address - bl mapPhysUnlock ; Unlock physent chain -gthSrchMiss: - la r3,pmapSXlk(r27) ; Get host pmap search lock address - bl sxlkUnlock ; Release host pmap search lock - li r3,-1 ; Return 64-bit -1 - li r4,-1 - bt++ pf64Bitb,gthEpi64 ; Take 64-bit exit - b gthEpi32 ; Take 32-bit exit - - .align 5 -gthPEHit: mr r3,r24 ; Get physent's address - bl mapPhysUnlock ; Unlock physent chain - la r3,pmapSXlk(r27) ; Get host pmap search lock address - bl sxlkUnlock ; Release host pmap search lock - - bt++ pf64Bitb,gthR64 ; Test for 64-bit machine - -gthR32: li r3,0 ; High-order 32 bits host virtual address - mr r4,r26 ; Low-order 32 bits host virtual address -gthEpi32: mtmsr r29 ; Restore caller's msr image - isync - b gthEpilog - - .align 5 -gthR64: srdi r3,r26,32 ; High-order 32 bits host virtual address - clrldi r4,r26,32 ; Low-order 32 bits host virtual address -gthEpi64: mtmsrd r29 ; Restore caller's msr image - -gthEpilog: lwz r0,(FM_ALIGN(gthStackSize)+FM_SIZE+FM_LR_SAVE)(r1) - ; Get caller's return address - lwz r31,FM_ARG0+0x00(r1) ; Restore non-volatile r31 - lwz r30,FM_ARG0+0x04(r1) ; Restore non-volatile r30 - lwz r29,FM_ARG0+0x08(r1) ; Restore non-volatile r29 - lwz r28,FM_ARG0+0x0C(r1) ; Restore non-volatile r28 - mtlr r0 ; Prepare return address - lwz r27,FM_ARG0+0x10(r1) ; Restore non-volatile r27 - lwz r26,FM_ARG0+0x14(r1) ; Restore non-volatile r26 - lwz r25,FM_ARG0+0x18(r1) ; Restore non-volatile r25 - lwz r24,FM_ARG0+0x1C(r1) ; Restore non-volatile r24 - lwz r1,0(r1) ; Pop stack frame - blr ; Return to caller - -gthBadPLock: - lis r0,hi16(Choke) ; Dmitri, you know how we've always talked about the - ori r0,r0,lo16(Choke) ; possibility of something going wrong with the bomb? - li r3,failMapping ; The BOMB, Dmitri. - sc ; The hydrogen bomb. 
- - -; -; Guest shadow assist -- find a guest mapping -; -; Locates the specified guest mapping, and if it exists returns a copy -; of it. -; -; Parameters: -; r3 : address of guest pmap, 32-bit kernel virtual address -; r4 : guest virtual address, high-order 32 bits -; r5 : guest virtual address, low-order 32 bits -; r6 : 32 byte copy area, 32-bit kernel virtual address -; -; Non-volatile register usage: -; r25 : VMM extension block's physical address -; r26 : copy area virtual address -; r27 : host pmap physical address -; r28 : guest pmap physical address -; r29 : caller's msr image from mapSetUp -; r30 : guest virtual address -; r31 : gva->phys mapping's physical address -; - - .align 5 - .globl EXT(hw_find_map_gv) - -LEXT(hw_find_map_gv) - -#define gfmStackSize ((31-25+1)*4)+4 - - stwu r1,-(FM_ALIGN(gfmStackSize)+FM_SIZE)(r1) - ; Mint a new stack frame - mflr r0 ; Get caller's return address - mfsprg r11,2 ; Get feature flags - mtcrf 0x02,r11 ; Insert feature flags into cr6 - stw r0,(FM_ALIGN(gfmStackSize)+FM_SIZE+FM_LR_SAVE)(r1) - ; Save caller's return address - stw r31,FM_ARG0+0x00(r1) ; Save non-volatile r31 - stw r30,FM_ARG0+0x04(r1) ; Save non-volatile r30 - stw r29,FM_ARG0+0x08(r1) ; Save non-volatile r29 - stw r28,FM_ARG0+0x0C(r1) ; Save non-volatile r28 - stw r27,FM_ARG0+0x10(r1) ; Save non-volatile r27 - stw r26,FM_ARG0+0x14(r1) ; Save non-volatile r26 - stw r25,FM_ARG0+0x18(r1) ; Save non-volatile r25 - - rlwinm r30,r5,0,0xFFFFF000 ; Clean up low-order 32 bits of guest vaddr - mr r26,r6 ; Copy copy buffer vaddr - - lwz r11,pmapVmmExt(r3) ; r11 <- VMM pmap extension block vaddr - lwz r9,pmapSpace(r3) ; r9 <- guest space ID number - - bt++ pf64Bitb,gfm64Salt ; Test for 64-bit machine - - lwz r25,pmapVmmExtPhys+4(r3) ; r25 <- VMM pmap extension block paddr - lwz r28,pmapvr+4(r3) ; Get 32-bit virt<->real guest pmap conversion salt - lwz r27,vmxHostPmapPhys+4(r11) ; Get host pmap physical address - la r31,VMX_HPIDX_OFFSET(r11) ; r31 <- base of hash page 
physical index - srwi r11,r30,12 ; Form shadow hash: - xor r11,r11,r9 ; spaceID ^ (vaddr >> 12) - rlwinm r10,r11,GV_HPAGE_SHIFT,GV_HPAGE_MASK - ; Form index offset from hash page number - add r31,r31,r10 ; r31 <- hash page index entry - lwz r31,4(r31) ; r31 <- hash page paddr - rlwimi r31,r11,GV_HGRP_SHIFT,GV_HGRP_MASK - ; r31 <- hash group paddr - b gfmStart ; Get to it - -gfm64Salt: rldimi r30,r4,32,0 ; Insert high-order 32 bits of 64-bit guest vaddr - ld r25,pmapVmmExtPhys(r3) ; r24 <- VMM pmap extension block paddr - ld r28,pmapvr(r3) ; Get 64-bit virt<->real guest pmap conversion salt - ld r27,vmxHostPmapPhys(r11) ; Get host pmap physical address - la r31,VMX_HPIDX_OFFSET(r11) ; r31 <- base of hash page physical index - srwi r11,r30,12 ; Form shadow hash: - xor r11,r11,r9 ; spaceID ^ (vaddr >> 12) - rlwinm r10,r11,GV_HPAGE_SHIFT,GV_HPAGE_MASK - ; Form index offset from hash page number - add r31,r31,r10 ; r31 <- hash page index entry - ld r31,0(r31) ; r31 <- hash page paddr - insrdi r31,r11,GV_GRPS_PPG_LG2,64-(GV_HGRP_SHIFT+GV_GRPS_PPG_LG2) - ; r31 <- hash group paddr - -gfmStart: xor r28,r3,r28 ; Convert guest pmap_t virt->real - bl EXT(mapSetUp) ; Disable 'rupts, translation, maybe enter 64-bit mode - mr r29,r11 ; Save caller's msr image - - la r3,pmapSXlk(r27) ; r3 <- host pmap's search lock address - bl sxlkExclusive ; Get lock exclusive - - li r0,(GV_SLOTS - 1) ; Prepare to iterate over mapping slots - mtctr r0 ; in this group - bt++ pf64Bitb,gfm64Search ; Test for 64-bit machine - - lwz r3,mpFlags(r31) ; r3 <- 1st mapping slot's flags - lhz r4,mpSpace(r31) ; r4 <- 1st mapping slot's space ID - lwz r5,mpVAddr+4(r31) ; r5 <- 1st mapping slot's virtual address - b gfm32SrchLp ; Let the search begin! 
- - .align 5 -gfm32SrchLp: - mr r6,r3 ; r6 <- current mapping slot's flags - lwz r3,mpFlags+GV_SLOT_SZ(r31) ; r3 <- next mapping slot's flags - mr r7,r4 ; r7 <- current mapping slot's space ID - lhz r4,mpSpace+GV_SLOT_SZ(r31) ; r4 <- next mapping slot's space ID - clrrwi r8,r5,12 ; r8 <- current mapping slot's virtual addr w/o flags - lwz r5,mpVAddr+4+GV_SLOT_SZ(r31); r5 <- next mapping slot's virtual addr - andi. r11,r6,mpgFree+mpgDormant ; Isolate guest free and dormant flags - xor r7,r7,r9 ; Compare space ID - or r0,r11,r7 ; r0 <- !(!free && !dormant && space match) - xor r8,r8,r30 ; Compare virtual address - or. r0,r0,r8 ; cr0_eq <- !free && !dormant && space match && virtual addr match - beq gfmSrchHit ; Join common path on hit (r31 points to guest mapping) - - addi r31,r31,GV_SLOT_SZ ; r31 <- next mapping slot - bdnz gfm32SrchLp ; Iterate - - mr r6,r3 ; r6 <- current mapping slot's flags - clrrwi r5,r5,12 ; Remove flags from virtual address - andi. r11,r6,mpgFree+mpgDormant ; Isolate guest free and dormant flags - xor r4,r4,r9 ; Compare space ID - or r0,r11,r4 ; r0 <- !(!free && !dormant && space match) - xor r5,r5,r30 ; Compare virtual address - or. r0,r0,r5 ; cr0_eq <- !free && !dormant && space match && virtual addr match - beq gfmSrchHit ; Join common path on hit (r31 points to guest mapping) - b gfmSrchMiss ; No joy in our hash group - -gfm64Search: - lwz r3,mpFlags(r31) ; r3 <- 1st mapping slot's flags - lhz r4,mpSpace(r31) ; r4 <- 1st mapping slot's space ID - ld r5,mpVAddr(r31) ; r5 <- 1st mapping slot's virtual address - b gfm64SrchLp ; Let the search begin! 
- - .align 5 -gfm64SrchLp: - mr r6,r3 ; r6 <- current mapping slot's flags - lwz r3,mpFlags+GV_SLOT_SZ(r31) ; r3 <- next mapping slot's flags - mr r7,r4 ; r7 <- current mapping slot's space ID - lhz r4,mpSpace+GV_SLOT_SZ(r31) ; r4 <- next mapping slot's space ID - clrrdi r8,r5,12 ; r8 <- current mapping slot's virtual addr w/o flags - ld r5,mpVAddr+GV_SLOT_SZ(r31) ; r5 <- next mapping slot's virtual addr - andi. r11,r6,mpgFree+mpgDormant ; Isolate guest free and dormant flags - xor r7,r7,r9 ; Compare space ID - or r0,r11,r7 ; r0 <- !(!free && !dormant && space match) - xor r8,r8,r30 ; Compare virtual address - or. r0,r0,r8 ; cr0_eq <- !free && !dormant && space match && virtual addr match - beq gfmSrchHit ; Join common path on hit (r31 points to guest mapping) - - addi r31,r31,GV_SLOT_SZ ; r31 <- next mapping slot - bdnz gfm64SrchLp ; Iterate - - mr r6,r3 ; r6 <- current mapping slot's flags - clrrdi r5,r5,12 ; Remove flags from virtual address - andi. r11,r6,mpgFree+mpgDormant ; Isolate guest free and dormant flags - xor r4,r4,r9 ; Compare space ID - or r0,r11,r4 ; r0 <- !(!free && !dormant && space match) - xor r5,r5,r30 ; Compare virtual address - or. r0,r0,r5 ; cr0_eq <- !free && !dormant && space match && virtual addr match - bne gfmSrchMiss ; No joy in our hash group - -gfmSrchHit: lwz r5,0(r31) ; Fetch 32 bytes of mapping from physical - lwz r6,4(r31) ; +4 - lwz r7,8(r31) ; +8 - lwz r8,12(r31) ; +12 - lwz r9,16(r31) ; +16 - lwz r10,20(r31) ; +20 - lwz r11,24(r31) ; +24 - lwz r12,28(r31) ; +28 - - li r31,mapRtOK ; Return found mapping - - la r3,pmapSXlk(r27) ; Get host pmap search lock address - bl sxlkUnlock ; Release host pmap search lock - - bt++ pf64Bitb,gfmEpi64 ; Test for 64-bit machine - -gfmEpi32: mtmsr r29 ; Restore caller's msr image - isync ; A small wrench - b gfmEpilog ; and a larger bubble - - .align 5 -gfmEpi64: mtmsrd r29 ; Restore caller's msr image - -gfmEpilog: mr. 
r3,r31 ; Copy/test mapping address - beq gfmNotFound ; Skip copy if no mapping found - - stw r5,0(r26) ; Store 32 bytes of mapping into virtual - stw r6,4(r26) ; +4 - stw r7,8(r26) ; +8 - stw r8,12(r26) ; +12 - stw r9,16(r26) ; +16 - stw r10,20(r26) ; +20 - stw r11,24(r26) ; +24 - stw r12,28(r26) ; +28 - -gfmNotFound: - lwz r0,(FM_ALIGN(gfmStackSize)+FM_SIZE+FM_LR_SAVE)(r1) - ; Get caller's return address - lwz r31,FM_ARG0+0x00(r1) ; Restore non-volatile r31 - lwz r30,FM_ARG0+0x04(r1) ; Restore non-volatile r30 - lwz r29,FM_ARG0+0x08(r1) ; Restore non-volatile r29 - lwz r28,FM_ARG0+0x0C(r1) ; Restore non-volatile r28 - mtlr r0 ; Prepare return address - lwz r27,FM_ARG0+0x10(r1) ; Restore non-volatile r27 - lwz r26,FM_ARG0+0x14(r1) ; Restore non-volatile r26 - lwz r25,FM_ARG0+0x18(r1) ; Restore non-volatile r25 - lwz r1,0(r1) ; Pop stack frame - blr ; Return to caller - - .align 5 -gfmSrchMiss: - li r31,mapRtNotFnd ; Indicate mapping not found - la r3,pmapSXlk(r27) ; Get host pmap search lock address - bl sxlkUnlock ; Release host pmap search lock - bt++ pf64Bitb,gfmEpi64 ; Take 64-bit exit - b gfmEpi32 ; Take 32-bit exit - - -; -; Guest shadow assist -- change guest page protection -; -; Locates the specified dormant mapping, and if it is active, changes its -; protection. 
-; -; Parameters: -; r3 : address of guest pmap, 32-bit kernel virtual address -; r4 : guest virtual address, high-order 32 bits -; r5 : guest virtual address, low-order 32 bits -; r6 : guest mapping protection code -; -; Non-volatile register usage: -; r25 : caller's msr image from mapSetUp -; r26 : guest mapping protection code -; r27 : host pmap physical address -; r28 : guest pmap physical address -; r29 : VMM extension block's physical address -; r30 : guest virtual address -; r31 : gva->phys mapping's physical address -; - .align 5 - .globl EXT(hw_protect_gv) - -LEXT(hw_protect_gv) - -#define gcpStackSize ((31-24+1)*4)+4 - - stwu r1,-(FM_ALIGN(gcpStackSize)+FM_SIZE)(r1) - ; Mint a new stack frame - mflr r0 ; Get caller's return address - mfsprg r11,2 ; Get feature flags - mtcrf 0x02,r11 ; Insert feature flags into cr6 - stw r0,(FM_ALIGN(gcpStackSize)+FM_SIZE+FM_LR_SAVE)(r1) - ; Save caller's return address - stw r31,FM_ARG0+0x00(r1) ; Save non-volatile r31 - stw r30,FM_ARG0+0x04(r1) ; Save non-volatile r30 - stw r29,FM_ARG0+0x08(r1) ; Save non-volatile r29 - stw r28,FM_ARG0+0x0C(r1) ; Save non-volatile r28 - stw r27,FM_ARG0+0x10(r1) ; Save non-volatile r27 - stw r26,FM_ARG0+0x14(r1) ; Save non-volatile r26 - stw r25,FM_ARG0+0x18(r1) ; Save non-volatile r25 - - rlwinm r30,r5,0,0xFFFFF000 ; Clean up low-order 32 bits of guest vaddr - mr r26,r6 ; Copy guest mapping protection code - - lwz r11,pmapVmmExt(r3) ; r11 <- VMM pmap extension block vaddr - lwz r9,pmapSpace(r3) ; r9 <- guest space ID number - bt++ pf64Bitb,gcp64Salt ; Handle 64-bit machine separately - lwz r29,pmapVmmExtPhys+4(r3) ; r29 <- VMM pmap extension block paddr - lwz r27,vmxHostPmapPhys+4(r11) ; r27 <- host pmap paddr - lwz r28,pmapvr+4(r3) ; Get 32-bit virt<->real guest pmap conversion salt - la r31,VMX_HPIDX_OFFSET(r11) ; r31 <- base of hash page physical index - srwi r11,r30,12 ; Form shadow hash: - xor r11,r11,r9 ; spaceID ^ (vaddr >> 12) - rlwinm r10,r11,GV_HPAGE_SHIFT,GV_HPAGE_MASK - ; 
Form index offset from hash page number - add r31,r31,r10 ; r31 <- hash page index entry - lwz r31,4(r31) ; r31 <- hash page paddr - rlwimi r31,r11,GV_HGRP_SHIFT,GV_HGRP_MASK - ; r31 <- hash group paddr - b gcpStart ; Get to it - -gcp64Salt: rldimi r30,r4,32,0 ; Insert high-order 32 bits of 64-bit guest vaddr - ld r29,pmapVmmExtPhys(r3) ; r29 <- VMM pmap extension block paddr - ld r27,vmxHostPmapPhys(r11) ; r27 <- host pmap paddr - ld r28,pmapvr(r3) ; Get 64-bit virt<->real guest pmap conversion salt - la r31,VMX_HPIDX_OFFSET(r11) ; r31 <- base of hash page physical index - srwi r11,r30,12 ; Form shadow hash: - xor r11,r11,r9 ; spaceID ^ (vaddr >> 12) - rlwinm r10,r11,GV_HPAGE_SHIFT,GV_HPAGE_MASK - ; Form index offset from hash page number - add r31,r31,r10 ; r31 <- hash page index entry - ld r31,0(r31) ; r31 <- hash page paddr - insrdi r31,r11,GV_GRPS_PPG_LG2,64-(GV_HGRP_SHIFT+GV_GRPS_PPG_LG2) - ; r31 <- hash group paddr - -gcpStart: xor r28,r4,r28 ; Convert guest pmap_t virt->real - bl EXT(mapSetUp) ; Disable 'rupts, translation, maybe enter 64-bit mode - mr r25,r11 ; Save caller's msr image - - la r3,pmapSXlk(r27) ; r3 <- host pmap's search lock address - bl sxlkExclusive ; Get lock exclusive - - li r0,(GV_SLOTS - 1) ; Prepare to iterate over mapping slots - mtctr r0 ; in this group - bt++ pf64Bitb,gcp64Search ; Test for 64-bit machine - - lwz r3,mpFlags(r31) ; r3 <- 1st mapping slot's flags - lhz r4,mpSpace(r31) ; r4 <- 1st mapping slot's space ID - lwz r5,mpVAddr+4(r31) ; r5 <- 1st mapping slot's virtual address - b gcp32SrchLp ; Let the search begin! 
- - .align 5 -gcp32SrchLp: - mr r6,r3 ; r6 <- current mapping slot's flags - lwz r3,mpFlags+GV_SLOT_SZ(r31) ; r3 <- next mapping slot's flags - mr r7,r4 ; r7 <- current mapping slot's space ID - lhz r4,mpSpace+GV_SLOT_SZ(r31) ; r4 <- next mapping slot's space ID - clrrwi r8,r5,12 ; r8 <- current mapping slot's virtual addr w/o flags - lwz r5,mpVAddr+4+GV_SLOT_SZ(r31); r5 <- next mapping slot's virtual addr - andi. r11,r6,mpgFree+mpgDormant ; Isolate guest free flag - xor r7,r7,r9 ; Compare space ID - or r0,r11,r7 ; r0 <- free || dormant || !space match - xor r8,r8,r30 ; Compare virtual address - or. r0,r0,r8 ; cr0_eq <- !free && !dormant && space match && virtual addr match - beq gcpSrchHit ; Join common path on hit (r31 points to guest mapping) - - addi r31,r31,GV_SLOT_SZ ; r31 <- next mapping slot - bdnz gcp32SrchLp ; Iterate - - mr r6,r3 ; r6 <- current mapping slot's flags - clrrwi r5,r5,12 ; Remove flags from virtual address - andi. r11,r6,mpgFree+mpgDormant ; Isolate guest free flag - xor r4,r4,r9 ; Compare space ID - or r0,r11,r4 ; r0 <- free || dormant || !space match - xor r5,r5,r30 ; Compare virtual address - or. r0,r0,r5 ; cr0_eq <- !free && !dormant && space match && virtual addr match - beq gcpSrchHit ; Join common path on hit (r31 points to guest mapping) - b gcpSrchMiss ; No joy in our hash group - -gcp64Search: - lwz r3,mpFlags(r31) ; r3 <- 1st mapping slot's flags - lhz r4,mpSpace(r31) ; r4 <- 1st mapping slot's space ID - ld r5,mpVAddr(r31) ; r5 <- 1st mapping slot's virtual address - b gcp64SrchLp ; Let the search begin! - - .align 5 -gcp64SrchLp: - mr r6,r3 ; r6 <- current mapping slot's flags - lwz r3,mpFlags+GV_SLOT_SZ(r31) ; r3 <- next mapping slot's flags - mr r7,r4 ; r7 <- current mapping slot's space ID - lhz r4,mpSpace+GV_SLOT_SZ(r31) ; r4 <- next mapping slot's space ID - clrrdi r8,r5,12 ; r8 <- current mapping slot's virtual addr w/o flags - ld r5,mpVAddr+GV_SLOT_SZ(r31) ; r5 <- next mapping slot's virtual addr - andi. 
r11,r6,mpgFree+mpgDormant ; Isolate guest free flag - xor r7,r7,r9 ; Compare space ID - or r0,r11,r7 ; r0 <- free || dormant || !space match - xor r8,r8,r30 ; Compare virtual address - or. r0,r0,r8 ; cr0_eq <- !free && !dormant && space match && virtual addr match - beq gcpSrchHit ; Join common path on hit (r31 points to guest mapping) - - addi r31,r31,GV_SLOT_SZ ; r31 <- next mapping slot - bdnz gcp64SrchLp ; Iterate - - mr r6,r3 ; r6 <- current mapping slot's flags - clrrdi r5,r5,12 ; Remove flags from virtual address - andi. r11,r6,mpgFree+mpgDormant ; Isolate guest free flag - xor r4,r4,r9 ; Compare space ID - or r0,r11,r4 ; r0 <- free || dormant || !space match - xor r5,r5,r30 ; Compare virtual address - or. r0,r0,r5 ; cr0_eq <- !free && !dormant && space match && virtual addr match - bne gcpSrchMiss ; No joy in our hash group - -gcpSrchHit: - bt++ pf64Bitb,gcpDscon64 ; Handle 64-bit disconnect separately - bl mapInvPte32 ; Disconnect PTE, invalidate, gather ref and change - ; r31 <- mapping's physical address - ; r3 -> PTE slot physical address - ; r4 -> High-order 32 bits of PTE - ; r5 -> Low-order 32 bits of PTE - ; r6 -> PCA - ; r7 -> PCA physical address - rlwinm r2,r3,29,29,31 ; Get PTE's slot number in the PTEG (8-byte PTEs) - b gcpFreePTE ; Join 64-bit path to release the PTE -gcpDscon64: bl mapInvPte64 ; Disconnect PTE, invalidate, gather ref and change - rlwinm r2,r3,28,29,31 ; Get PTE's slot number in the PTEG (16-byte PTEs) -gcpFreePTE: mr. r3,r3 ; Was there a valid PTE? 
- beq- gcpSetKey ; No valid PTE, we're almost done - lis r0,0x8000 ; Prepare free bit for this slot - srw r0,r0,r2 ; Position free bit - or r6,r6,r0 ; Set it in our PCA image - lwz r8,mpPte(r31) ; Get PTE pointer - rlwinm r8,r8,0,~mpHValid ; Make the pointer invalid - stw r8,mpPte(r31) ; Save invalidated PTE pointer - eieio ; Synchronize all previous updates (mapInvPtexx didn't) - stw r6,0(r7) ; Update PCA and unlock the PTEG - -gcpSetKey: lwz r0,mpVAddr+4(r31) ; Get va word containing protection bits - rlwimi r0,r26,0,mpPP ; Insert new protection bits - stw r0,mpVAddr+4(r31) ; Write 'em back - eieio ; Ensure previous mapping updates are visible - li r31,mapRtOK ; I'm a success - -gcpRelPmap: la r3,pmapSXlk(r27) ; r3 <- host pmap search lock phys addr - bl sxlkUnlock ; Release host pmap search lock - - mr r3,r31 ; r3 <- result code - bt++ pf64Bitb,gcpRtn64 ; Handle 64-bit separately - mtmsr r25 ; Restore 'rupts, translation - isync ; Throw a small wrench into the pipeline - b gcpPopFrame ; Nothing to do now but pop a frame and return -gcpRtn64: mtmsrd r25 ; Restore 'rupts, translation, 32-bit mode -gcpPopFrame: - lwz r0,(FM_ALIGN(gcpStackSize)+FM_SIZE+FM_LR_SAVE)(r1) - ; Get caller's return address - lwz r31,FM_ARG0+0x00(r1) ; Restore non-volatile r31 - lwz r30,FM_ARG0+0x04(r1) ; Restore non-volatile r30 - lwz r29,FM_ARG0+0x08(r1) ; Restore non-volatile r29 - lwz r28,FM_ARG0+0x0C(r1) ; Restore non-volatile r28 - mtlr r0 ; Prepare return address - lwz r27,FM_ARG0+0x10(r1) ; Restore non-volatile r27 - lwz r26,FM_ARG0+0x14(r1) ; Restore non-volatile r26 - lwz r25,FM_ARG0+0x18(r1) ; Restore non-volatile r25 - lwz r1,0(r1) ; Pop stack frame - blr ; Return to caller - - .align 5 -gcpSrchMiss: - li r31,mapRtNotFnd ; Could not locate requested mapping - b gcpRelPmap ; Exit through host pmap search lock release - - -; -; Find the physent based on a physical page and try to lock it (but not too hard) -; Note that this table always has an entry that with a 0 table pointer at 
the end -; -; R3 contains ppnum on entry -; R3 is 0 if no entry was found -; R3 is physent if found -; cr0_eq is true if lock was obtained or there was no entry to lock -; cr0_eq is false of there was an entry and it was locked -; - - .align 5 - -mapFindPhyTry: - lis r9,hi16(EXT(pmap_mem_regions)) ; Point to the start of the region table - mr r2,r3 ; Save our target - ori r9,r9,lo16(EXT(pmap_mem_regions)) ; Point to the start of the region table - -mapFindPhz: lwz r3,mrPhysTab(r9) ; Get the actual table address - lwz r5,mrStart(r9) ; Get start of table entry - lwz r0,mrEnd(r9) ; Get end of table entry - addi r9,r9,mrSize ; Point to the next slot - cmplwi cr2,r3,0 ; Are we at the end of the table? - cmplw r2,r5 ; See if we are in this table - cmplw cr1,r2,r0 ; Check end also - sub r4,r2,r5 ; Calculate index to physical entry - beq-- cr2,mapFindNo ; Leave if we did not find an entry... - cror cr0_lt,cr0_lt,cr1_gt ; Set CR0_LT if it is NOT this entry - slwi r4,r4,3 ; Get offset to physical entry - - blt-- mapFindPhz ; Did not find it... - - add r3,r3,r4 ; Point right to the slot - -mapFindOv: lwz r2,0(r3) ; Get the lock contents right now - rlwinm. r0,r2,0,0,0 ; Is it locked? - bnelr-- ; Yes it is... - - lwarx r2,0,r3 ; Get the lock - rlwinm. r0,r2,0,0,0 ; Is it locked? - oris r0,r2,0x8000 ; Set the lock bit - bne-- mapFindKl ; It is locked, go get rid of reservation and leave... - stwcx. r0,0,r3 ; Try to stuff it back... - bne-- mapFindOv ; Collision, try again... - isync ; Clear any speculations - blr ; Leave... - -mapFindKl: li r2,lgKillResv ; Killing field - stwcx. r2,0,r2 ; Trash reservation... - crclr cr0_eq ; Make sure we do not think we got the lock - blr ; Leave... - -mapFindNo: crset cr0_eq ; Make sure that we set this - li r3,0 ; Show that we did not find it - blr ; Leave... -; -; pmapCacheLookup - This function will look up an entry in the pmap segment cache. 
-; -; How the pmap cache lookup works: -; -; We use a combination of three things: a mask of valid entries, a sub-tag, and the -; ESID (aka the "tag"). The mask indicates which of the cache slots actually contain -; an entry. The sub-tag is a 16 entry 4 bit array that contains the low order 4 bits -; of the ESID, bits 32:36 of the effective for 64-bit and 0:3 for 32-bit. The cache -; entry contains the full 36 bit ESID. -; -; The purpose of the sub-tag is to limit the number of searches necessary when looking -; for an existing cache entry. Because there are 16 slots in the cache, we could end up -; searching all 16 if an match is not found. -; -; Essentially, we will search only the slots that have a valid entry and whose sub-tag -; matches. More than likely, we will eliminate almost all of the searches. -; -; Inputs: -; R3 = pmap -; R4 = ESID high half -; R5 = ESID low half -; -; Outputs: -; R3 = pmap cache slot if found, 0 if not -; R10 = pmapCCtl address -; R11 = pmapCCtl image -; pmapCCtl locked on exit -; - - .align 5 - -pmapCacheLookup: - la r10,pmapCCtl(r3) ; Point to the segment cache control - -pmapCacheLookuq: - lwarx r11,0,r10 ; Get the segment cache control value - rlwinm. r0,r11,0,pmapCCtlLckb,pmapCCtlLckb ; Is it already locked? - ori r0,r11,lo16(pmapCCtlLck) ; Turn on the lock bit - bne-- pmapCacheLookur ; Nope... - stwcx. r0,0,r10 ; Try to take the lock - bne-- pmapCacheLookuq ; Someone else just stuffed it, try again... 
- - isync ; Make sure we get reservation first - lwz r9,pmapSCSubTag(r3) ; Get the high part of the sub-tag - rlwimi r5,r5,28,4,7 ; Copy sub-tag just to right of itself (XX------) - lwz r10,pmapSCSubTag+4(r3) ; And the bottom half - rlwimi r5,r5,24,8,15 ; Copy doubled sub-tag to right of itself (XXXX----) - lis r8,0x8888 ; Get some eights - rlwimi r5,r5,16,16,31 ; Copy quadrupled sub-tags to the right - ori r8,r8,0x8888 ; Fill the rest with eights - - eqv r10,r10,r5 ; Get 0xF where we hit in bottom half - eqv r9,r9,r5 ; Get 0xF where we hit in top half - - rlwinm r2,r10,1,0,30 ; Shift over 1 - rlwinm r0,r9,1,0,30 ; Shift over 1 - and r2,r2,r10 ; AND the even/odd pair into the even - and r0,r0,r9 ; AND the even/odd pair into the even - rlwinm r10,r2,2,0,28 ; Shift over 2 - rlwinm r9,r0,2,0,28 ; Shift over 2 - and r10,r2,r10 ; AND the even of the ANDed pairs giving the AND of all 4 bits in 0, 4, ... - and r9,r0,r9 ; AND the even of the ANDed pairs giving the AND of all 4 bits in 0, 4, ... - - and r10,r10,r8 ; Clear out extras - and r9,r9,r8 ; Clear out extras - - rlwinm r0,r10,3,1,28 ; Slide adjacent next to each other - rlwinm r2,r9,3,1,28 ; Slide adjacent next to each other - or r10,r0,r10 ; Merge them - or r9,r2,r9 ; Merge them - rlwinm r0,r10,6,2,26 ; Slide adjacent pairs next to each other - rlwinm r2,r9,6,2,26 ; Slide adjacent pairs next to each other - or r10,r0,r10 ; Merge them - or r9,r2,r9 ; Merge them - rlwimi r10,r10,12,4,7 ; Stick in the low-order adjacent quad - rlwimi r9,r9,12,4,7 ; Stick in the low-order adjacent quad - not r6,r11 ; Turn invalid into valid - rlwimi r9,r10,24,8,15 ; Merge in the adjacent octs giving a hit mask - - la r10,pmapSegCache(r3) ; Point at the cache slots - and. r6,r9,r6 ; Get mask of valid and hit - li r0,0 ; Clear - li r3,0 ; Assume not found - oris r0,r0,0x8000 ; Start a mask - beqlr++ ; Leave, should usually be no hits... - -pclNextEnt: cntlzw r5,r6 ; Find an in use one - cmplwi cr1,r5,pmapSegCacheUse ; Did we find one? 
- rlwinm r7,r5,4,0,27 ; Index to the cache entry - srw r2,r0,r5 ; Get validity mask bit - add r7,r7,r10 ; Point to the cache slot - andc r6,r6,r2 ; Clear the validity bit we just tried - bgelr-- cr1 ; Leave if there are no more to check... - - lwz r5,sgcESID(r7) ; Get the top half - - cmplw r5,r4 ; Only need to check top because sub-tag is the entire other half - - bne++ pclNextEnt ; Nope, try again... - - mr r3,r7 ; Point to the slot - blr ; Leave.... - - .align 5 - -pmapCacheLookur: - li r11,lgKillResv ; The killing spot - stwcx. r11,0,r11 ; Kill the reservation - -pmapCacheLookus: - lwz r11,pmapCCtl(r3) ; Get the segment cache control - rlwinm. r0,r11,0,pmapCCtlLckb,pmapCCtlLckb ; Is it already locked? - beq++ pmapCacheLookup ; Nope... - b pmapCacheLookus ; Yup, keep waiting... - - -; -; mapMergeRC -- Given a physical mapping address in R31, locate its -; connected PTE (if any) and merge the PTE referenced and changed bits -; into the mapping and physent. -; - - .align 5 - -mapMergeRC32: - lwz r0,mpPte(r31) ; Grab the PTE offset - mfsdr1 r7 ; Get the pointer to the hash table - lwz r5,mpVAddr+4(r31) ; Grab the virtual address - rlwinm r10,r7,0,0,15 ; Clean up the hash table base - andi. r3,r0,mpHValid ; Is there a possible PTE? - srwi r7,r0,4 ; Convert to PCA units - rlwinm r7,r7,0,0,29 ; Clean up PCA offset - mflr r2 ; Save the return - subfic r7,r7,-4 ; Convert to -4 based negative index - add r7,r10,r7 ; Point to the PCA directly - beqlr-- ; There was no PTE to start with... - - bl mapLockPteg ; Lock the PTEG - - lwz r0,mpPte(r31) ; Grab the PTE offset - mtlr r2 ; Restore the LR - andi. r3,r0,mpHValid ; Is there a possible PTE? - beq- mMPUnlock ; There is no PTE, someone took it so just unlock and leave... 
- - rlwinm r3,r0,0,0,30 ; Clear the valid bit - add r3,r3,r10 ; Point to actual PTE - lwz r5,4(r3) ; Get the real part of the PTE - srwi r10,r5,12 ; Change physical address to a ppnum - -mMNmerge: lbz r11,mpFlags+1(r31) ; Get the offset to the physical entry table - lwz r0,mpVAddr+4(r31) ; Get the flags part of the field - lis r8,hi16(EXT(pmap_mem_regions)) ; Get the top of the region table - ori r8,r8,lo16(EXT(pmap_mem_regions)) ; Get the bottom of the region table - rlwinm r11,r11,2,24,29 ; Mask index bits and convert to byte offset - add r11,r11,r8 ; Point to the bank table - lwz r2,mrPhysTab(r11) ; Get the physical table bank pointer - lwz r11,mrStart(r11) ; Get the start of bank - rlwimi r0,r5,0,mpRb-32,mpCb-32 ; Copy in the RC - addi r2,r2,4 ; Offset to last half of field - stw r0,mpVAddr+4(r31) ; Set the new RC into the field - sub r11,r10,r11 ; Get the index into the table - rlwinm r11,r11,3,0,28 ; Get offset to the physent - -mMmrgRC: lwarx r10,r11,r2 ; Get the master RC - rlwinm r0,r5,27,ppRb-32,ppCb-32 ; Position the new RC - or r0,r0,r10 ; Merge in the new RC - stwcx. r0,r11,r2 ; Try to stick it back - bne-- mMmrgRC ; Try again if we collided... - eieio ; Commit all updates - -mMPUnlock: - stw r6,0(r7) ; Unlock PTEG - blr ; Return - -; -; 64-bit version of mapMergeRC -; - .align 5 - -mapMergeRC64: - lwz r0,mpPte(r31) ; Grab the PTE offset - ld r5,mpVAddr(r31) ; Grab the virtual address - mfsdr1 r7 ; Get the pointer to the hash table - rldicr r10,r7,0,45 ; Clean up the hash table base - andi. r3,r0,mpHValid ; Is there a possible PTE? - srdi r7,r0,5 ; Convert to PCA units - rldicr r7,r7,0,61 ; Clean up PCA - subfic r7,r7,-4 ; Convert to -4 based negative index - mflr r2 ; Save the return - add r7,r10,r7 ; Point to the PCA directly - beqlr-- ; There was no PTE to start with... - - bl mapLockPteg ; Lock the PTEG - - lwz r0,mpPte(r31) ; Grab the PTE offset again - mtlr r2 ; Restore the LR - andi. r3,r0,mpHValid ; Is there a possible PTE? 
- beq-- mMPUnlock ; There is no PTE, someone took it so just unlock and leave... - - rlwinm r3,r0,0,0,30 ; Clear the valid bit - add r3,r3,r10 ; Point to the actual PTE - ld r5,8(r3) ; Get the real part - srdi r10,r5,12 ; Change physical address to a ppnum - b mMNmerge ; Join the common 32-64-bit code... - - -; -; This routine, given a mapping, will find and lock the PTEG -; If mpPte does not point to a PTE (checked before and after lock), it will unlock the -; PTEG and return. In this case we will have undefined in R4 -; and the low 12 bits of mpVAddr valid in R5. R3 will contain 0. -; -; If the mapping is still valid, we will invalidate the PTE and merge -; the RC bits into the physent and also save them into the mapping. -; -; We then return with R3 pointing to the PTE slot, R4 is the -; top of the PTE and R5 is the bottom. R6 contains the PCA. -; R7 points to the PCA entry. -; -; Note that we should NEVER be called on a block or special mapping. -; We could do many bad things. -; - - .align 5 - -mapInvPte32: - lwz r0,mpPte(r31) ; Grab the PTE offset - mfsdr1 r7 ; Get the pointer to the hash table - lwz r5,mpVAddr+4(r31) ; Grab the virtual address - rlwinm r10,r7,0,0,15 ; Clean up the hash table base - andi. r3,r0,mpHValid ; Is there a possible PTE? - srwi r7,r0,4 ; Convert to PCA units - rlwinm r7,r7,0,0,29 ; Clean up PCA offset - mflr r2 ; Save the return - subfic r7,r7,-4 ; Convert to -4 based negative index - add r7,r10,r7 ; Point to the PCA directly - beqlr-- ; There was no PTE to start with... - - bl mapLockPteg ; Lock the PTEG - - lwz r0,mpPte(r31) ; Grab the PTE offset - mtlr r2 ; Restore the LR - andi. r3,r0,mpHValid ; Is there a possible PTE? - beq- mIPUnlock ; There is no PTE, someone took it so just unlock and leave... 
- - rlwinm r3,r0,0,0,30 ; Clear the valid bit - add r3,r3,r10 ; Point to actual PTE - lwz r4,0(r3) ; Get the top of the PTE - - li r8,tlbieLock ; Get the TLBIE lock - rlwinm r0,r4,0,1,31 ; Clear the valid bit - stw r0,0(r3) ; Invalidate the PTE - - sync ; Make sure everyone sees the invalidate - -mITLBIE32: lwarx r0,0,r8 ; Get the TLBIE lock - mfsprg r2,2 ; Get feature flags - mr. r0,r0 ; Is it locked? - li r0,1 ; Get our lock word - bne- mITLBIE32 ; It is locked, go wait... - - stwcx. r0,0,r8 ; Try to get it - bne- mITLBIE32 ; We was beat... - - rlwinm. r0,r2,0,pfSMPcapb,pfSMPcapb ; Can this be an MP box? - li r0,0 ; Lock clear value - - tlbie r5 ; Invalidate it everywhere - - beq- mINoTS32 ; Can not have MP on this machine... - - eieio ; Make sure that the tlbie happens first - tlbsync ; Wait for everyone to catch up - sync ; Make sure of it all - -mINoTS32: stw r0,tlbieLock(0) ; Clear the tlbie lock - lwz r5,4(r3) ; Get the real part - srwi r10,r5,12 ; Change physical address to a ppnum - -mINmerge: lbz r11,mpFlags+1(r31) ; Get the offset to the physical entry table - lwz r0,mpVAddr+4(r31) ; Get the flags part of the field - lis r8,hi16(EXT(pmap_mem_regions)) ; Get the top of the region table - ori r8,r8,lo16(EXT(pmap_mem_regions)) ; Get the bottom of the region table - rlwinm r11,r11,2,24,29 ; Mask index bits and convert to byte offset - add r11,r11,r8 ; Point to the bank table - lwz r2,mrPhysTab(r11) ; Get the physical table bank pointer - lwz r11,mrStart(r11) ; Get the start of bank - rlwimi r0,r5,0,mpRb-32,mpCb-32 ; Copy in the RC - addi r2,r2,4 ; Offset to last half of field - stw r0,mpVAddr+4(r31) ; Set the new RC into the field - sub r11,r10,r11 ; Get the index into the table - rlwinm r11,r11,3,0,28 ; Get offset to the physent - - -mImrgRC: lwarx r10,r11,r2 ; Get the master RC - rlwinm r0,r5,27,ppRb-32,ppCb-32 ; Position the new RC - or r0,r0,r10 ; Merge in the new RC - stwcx. r0,r11,r2 ; Try to stick it back - bne-- mImrgRC ; Try again if we collided... 
- - blr ; Leave with the PCA still locked up... - -mIPUnlock: eieio ; Make sure all updates come first - - stw r6,0(r7) ; Unlock - blr - -; -; 64-bit version -; - .align 5 - -mapInvPte64: - lwz r0,mpPte(r31) ; Grab the PTE offset - ld r5,mpVAddr(r31) ; Grab the virtual address - mfsdr1 r7 ; Get the pointer to the hash table - rldicr r10,r7,0,45 ; Clean up the hash table base - andi. r3,r0,mpHValid ; Is there a possible PTE? - srdi r7,r0,5 ; Convert to PCA units - rldicr r7,r7,0,61 ; Clean up PCA - subfic r7,r7,-4 ; Convert to -4 based negative index - mflr r2 ; Save the return - add r7,r10,r7 ; Point to the PCA directly - beqlr-- ; There was no PTE to start with... - - bl mapLockPteg ; Lock the PTEG - - lwz r0,mpPte(r31) ; Grab the PTE offset again - mtlr r2 ; Restore the LR - andi. r3,r0,mpHValid ; Is there a possible PTE? - beq-- mIPUnlock ; There is no PTE, someone took it so just unlock and leave... - - rlwinm r3,r0,0,0,30 ; Clear the valid bit - add r3,r3,r10 ; Point to the actual PTE - ld r4,0(r3) ; Get the top of the PTE - - li r8,tlbieLock ; Get the TLBIE lock - rldicr r0,r4,0,62 ; Clear the valid bit - std r0,0(r3) ; Invalidate the PTE - - rldicr r2,r4,16,35 ; Shift the AVPN over to match VPN - sync ; Make sure everyone sees the invalidate - rldimi r2,r5,0,36 ; Cram in the page portion of the EA - -mITLBIE64: lwarx r0,0,r8 ; Get the TLBIE lock - mr. r0,r0 ; Is it locked? - li r0,1 ; Get our lock word - bne-- mITLBIE64a ; It is locked, toss reservation and wait... - - stwcx. r0,0,r8 ; Try to get it - bne-- mITLBIE64 ; We was beat... 
- - rldicl r2,r2,0,16 ; Clear bits 0:15 because we are under orders - - li r0,0 ; Lock clear value - - tlbie r2 ; Invalidate it everywhere - - eieio ; Make sure that the tlbie happens first - tlbsync ; Wait for everyone to catch up - ptesync ; Wait for quiet again - - stw r0,tlbieLock(0) ; Clear the tlbie lock - - ld r5,8(r3) ; Get the real part - srdi r10,r5,12 ; Change physical address to a ppnum - b mINmerge ; Join the common 32-64-bit code... - -mITLBIE64a: li r5,lgKillResv ; Killing field - stwcx. r5,0,r5 ; Kill reservation - -mITLBIE64b: lwz r0,0(r8) ; Get the TLBIE lock - mr. r0,r0 ; Is it locked? - beq++ mITLBIE64 ; Nope, try again... - b mITLBIE64b ; Yup, wait for it... - -; -; mapLockPteg - Locks a PTEG -; R7 points to PCA entry -; R6 contains PCA on return -; -; - - .align 5 - -mapLockPteg: - lwarx r6,0,r7 ; Pick up the PCA - rlwinm. r0,r6,0,PCAlockb,PCAlockb ; Is the PTEG locked? - ori r0,r6,PCAlock ; Set the lock bit - bne-- mLSkill ; It is locked... - - stwcx. r0,0,r7 ; Try to lock the PTEG - bne-- mapLockPteg ; We collided... - - isync ; Nostradamus lied - blr ; Leave... - -mLSkill: li r6,lgKillResv ; Get killing field - stwcx. r6,0,r6 ; Kill it - -mapLockPteh: - lwz r6,0(r7) ; Pick up the PCA - rlwinm. r0,r6,0,PCAlockb,PCAlockb ; Is the PTEG locked? - beq++ mapLockPteg ; Nope, try again... - b mapLockPteh ; Yes, wait for it... - - -; -; The mapSelSlot function selects a PTEG slot to use. As input, it expects R6 -; to contain the PCA. When it returns, R3 contains 0 if an unoccupied slot was -; selected, 1 if it stole a non-block PTE, or 2 if it stole a block mapped PTE. -; R4 returns the slot index. -; -; CR7 also indicates that we have a block mapping -; -; The PTEG allocation controls are a bit map of the state of the PTEG. -; PCAfree indicates that the PTE slot is empty. -; PCAauto means that it comes from an autogen area. These -; guys do not keep track of reference and change and are actually "wired". -; They are easy to maintain. 
PCAsteal -; is a sliding position mask used to "randomize" PTE slot stealing. All 4 of these -; fields fit in a single word and are loaded and stored under control of the -; PTEG control area lock (PCAlock). -; -; Note that PCAauto does not contribute to the steal calculations at all. Originally -; it did, autogens were second in priority. This can result in a pathalogical -; case where an instruction can not make forward progress, or one PTE slot -; thrashes. -; -; Note that the PCA must be locked when we get here. -; -; Physically, the fields are arranged: -; 0: PCAfree -; 1: PCAsteal -; 2: PCAauto -; 3: PCAmisc -; -; -; At entry, R6 contains new unlocked PCA image (real PCA is locked and untouched) -; -; At exit: -; -; R3 = 0 - no steal -; R3 = 1 - steal regular -; R3 = 2 - steal autogen -; R4 contains slot number -; R6 contains updated PCA image -; - - .align 5 - -mapSelSlot: lis r10,0 ; Clear autogen mask - li r9,0 ; Start a mask - beq cr7,mSSnotblk ; Skip if this is not a block mapping - ori r10,r10,lo16(0xFFFF) ; Make sure we mark a block mapping (autogen) - -mSSnotblk: rlwinm r11,r6,16,24,31 ; Isolate just the steal mask - oris r9,r9,0x8000 ; Get a mask - cntlzw r4,r6 ; Find a slot or steal one - ori r9,r9,lo16(0x8000) ; Insure that we have 0x80008000 - rlwinm r4,r4,0,29,31 ; Isolate bit position - rlwimi r11,r11,8,16,23 ; Get set to march a 1 back into top of 8 bit rotate - srw r2,r9,r4 ; Get mask to isolate selected inuse and autogen flags - srwi r11,r11,1 ; Slide steal mask right - and r8,r6,r2 ; Isolate the old in use and autogen bits - andc r6,r6,r2 ; Allocate the slot and also clear autogen flag - addi r0,r8,0x7F00 ; Push autogen flag to bit 16 - and r2,r2,r10 ; Keep the autogen part if autogen - addis r8,r8,0xFF00 ; Push in use to bit 0 and invert - or r6,r6,r2 ; Add in the new autogen bit - rlwinm r0,r0,17,31,31 ; Get a 1 if the old was autogenned (always 0 if not in use) - rlwinm r8,r8,1,31,31 ; Isolate old in use - rlwimi r6,r11,16,8,15 ; Stick 
the new steal slot in - - add r3,r0,r8 ; Get 0 if no steal, 1 if steal normal, 2 if steal autogen - blr ; Leave... - -; -; Shared/Exclusive locks -; -; A shared/exclusive lock allows multiple shares of a lock to be taken -; but only one exclusive. A shared lock can be "promoted" to exclusive -; when it is the only share. If there are multiple sharers, the lock -; must be "converted". A promotion drops the share and gains exclusive as -; an atomic operation. If anyone else has a share, the operation fails. -; A conversion first drops the share and then takes an exclusive lock. -; -; We will want to add a timeout to this eventually. -; -; R3 is set to 0 for success, non-zero for failure -; - -; -; Convert a share into an exclusive -; - - .align 5 - -sxlkConvert: - - lis r0,0x8000 ; Get the locked lock image -#if 0 - mflr r0 ; (TEST/DEBUG) - oris r0,r0,0x8000 ; (TEST/DEBUG) -#endif - -sxlkCTry: lwarx r2,0,r3 ; Get the lock word - cmplwi r2,1 ; Does it just have our share? - subi r2,r2,1 ; Drop our share in case we do not get it - bne-- sxlkCnotfree ; No, we need to unlock... - stwcx. r0,0,r3 ; Try to take it exclusively - bne-- sxlkCTry ; Collision, try again... - - isync - li r3,0 ; Set RC - blr ; Leave... - -sxlkCnotfree: - stwcx. r2,0,r3 ; Try to drop our share... - bne-- sxlkCTry ; Try again if we collided... - b sxlkExclusive ; Go take it exclusively... - -; -; Promote shared to exclusive -; - - .align 5 - -sxlkPromote: - lis r0,0x8000 ; Get the locked lock image -#if 0 - mflr r0 ; (TEST/DEBUG) - oris r0,r0,0x8000 ; (TEST/DEBUG) -#endif - -sxlkPTry: lwarx r2,0,r3 ; Get the lock word - cmplwi r2,1 ; Does it just have our share? - bne-- sxlkPkill ; No, just fail (R3 is non-zero)... - stwcx. r0,0,r3 ; Try to take it exclusively - bne-- sxlkPTry ; Collision, try again... - - isync - li r3,0 ; Set RC - blr ; Leave... - -sxlkPkill: li r2,lgKillResv ; Point to killing field - stwcx. 
r2,0,r2 ; Kill reservation - blr ; Leave - - - -; -; Take lock exclusivily -; - - .align 5 - -sxlkExclusive: - lis r0,0x8000 ; Get the locked lock image -#if 0 - mflr r0 ; (TEST/DEBUG) - oris r0,r0,0x8000 ; (TEST/DEBUG) -#endif - -sxlkXTry: lwarx r2,0,r3 ; Get the lock word - mr. r2,r2 ; Is it locked? - bne-- sxlkXWait ; Yes... - stwcx. r0,0,r3 ; Try to take it - bne-- sxlkXTry ; Collision, try again... - - isync ; Toss anything younger than us - li r3,0 ; Set RC - blr ; Leave... - - .align 5 - -sxlkXWait: li r2,lgKillResv ; Point to killing field - stwcx. r2,0,r2 ; Kill reservation - -sxlkXWaiu: lwz r2,0(r3) ; Get the lock again - mr. r2,r2 ; Is it free yet? - beq++ sxlkXTry ; Yup... - b sxlkXWaiu ; Hang around a bit more... - -; -; Take a share of the lock -; - - .align 5 - -sxlkShared: lwarx r2,0,r3 ; Get the lock word - rlwinm. r0,r2,0,0,0 ; Is it locked exclusively? - addi r2,r2,1 ; Up the share count - bne-- sxlkSWait ; Yes... - stwcx. r2,0,r3 ; Try to take it - bne-- sxlkShared ; Collision, try again... - - isync ; Toss anything younger than us - li r3,0 ; Set RC - blr ; Leave... - - .align 5 - -sxlkSWait: li r2,lgKillResv ; Point to killing field - stwcx. r2,0,r2 ; Kill reservation - -sxlkSWaiu: lwz r2,0(r3) ; Get the lock again - rlwinm. r0,r2,0,0,0 ; Is it locked exclusively? - beq++ sxlkShared ; Nope... - b sxlkSWaiu ; Hang around a bit more... - -; -; Unlock either exclusive or shared. -; - - .align 5 - -sxlkUnlock: eieio ; Make sure we order our stores out - -sxlkUnTry: lwarx r2,0,r3 ; Get the lock - rlwinm. r0,r2,0,0,0 ; Do we hold it exclusively? - subi r2,r2,1 ; Remove our share if we have one - li r0,0 ; Clear this - bne-- sxlkUExclu ; We hold exclusive... - - stwcx. r2,0,r3 ; Try to lose our share - bne-- sxlkUnTry ; Collision... - blr ; Leave... - -sxlkUExclu: stwcx. r0,0,r3 ; Unlock and release reservation - beqlr++ ; Leave if ok... - b sxlkUnTry ; Could not store, try over... 
- - - .align 5 - .globl EXT(fillPage) - -LEXT(fillPage) - - mfsprg r0,2 ; Get feature flags - mtcrf 0x02,r0 ; move pf64Bit to cr - - rlwinm r4,r4,0,1,0 ; Copy fill to top of 64-bit register - lis r2,0x0200 ; Get vec - mr r6,r4 ; Copy - ori r2,r2,0x2000 ; Get FP - mr r7,r4 ; Copy - mfmsr r5 ; Get MSR - mr r8,r4 ; Copy - andc r5,r5,r2 ; Clear out permanent turn-offs - mr r9,r4 ; Copy - ori r2,r2,0x8030 ; Clear IR, DR and EE - mr r10,r4 ; Copy - andc r0,r5,r2 ; Kill them - mr r11,r4 ; Copy - mr r12,r4 ; Copy - bt++ pf64Bitb,fpSF1 ; skip if 64-bit (only they take the hint) - - slwi r3,r3,12 ; Make into a physical address - mtmsr r2 ; Interrupts and translation off - isync - - li r2,4096/32 ; Get number of cache lines - -fp32again: dcbz 0,r3 ; Clear - addic. r2,r2,-1 ; Count down - stw r4,0(r3) ; Fill - stw r6,4(r3) ; Fill - stw r7,8(r3) ; Fill - stw r8,12(r3) ; Fill - stw r9,16(r3) ; Fill - stw r10,20(r3) ; Fill - stw r11,24(r3) ; Fill - stw r12,28(r3) ; Fill - addi r3,r3,32 ; Point next - bgt+ fp32again ; Keep going - - mtmsr r5 ; Restore all - isync - blr ; Return... - - .align 5 - -fpSF1: li r2,1 - sldi r2,r2,63 ; Get 64-bit bit - or r0,r0,r2 ; Turn on 64-bit - sldi r3,r3,12 ; Make into a physical address - - mtmsrd r0 ; Interrupts and translation off - isync - - li r2,4096/128 ; Get number of cache lines - -fp64again: dcbz128 0,r3 ; Clear - addic. r2,r2,-1 ; Count down - std r4,0(r3) ; Fill - std r6,8(r3) ; Fill - std r7,16(r3) ; Fill - std r8,24(r3) ; Fill - std r9,32(r3) ; Fill - std r10,40(r3) ; Fill - std r11,48(r3) ; Fill - std r12,56(r3) ; Fill - std r4,64+0(r3) ; Fill - std r6,64+8(r3) ; Fill - std r7,64+16(r3) ; Fill - std r8,64+24(r3) ; Fill - std r9,64+32(r3) ; Fill - std r10,64+40(r3) ; Fill - std r11,64+48(r3) ; Fill - std r12,64+56(r3) ; Fill - addi r3,r3,128 ; Point next - bgt+ fp64again ; Keep going - - mtmsrd r5 ; Restore all - isync - blr ; Return... 
- - .align 5 - .globl EXT(mapLog) - -LEXT(mapLog) - - mfmsr r12 - lis r11,hi16(EXT(mapdebug)) - ori r11,r11,lo16(EXT(mapdebug)) - lwz r10,0(r11) - mr. r10,r10 - bne++ mLxx - mr r10,r3 -mLxx: rlwinm r0,r12,0,MSR_DR_BIT+1,MSR_DR_BIT-1 - mtmsr r0 - isync - stw r4,0(r10) - stw r4,4(r10) - stw r5,8(r10) - stw r6,12(r10) - mtmsr r12 - isync - addi r10,r10,16 - stw r10,0(r11) - blr - -#if 1 - .align 5 - .globl EXT(checkBogus) - -LEXT(checkBogus) - - BREAKPOINT_TRAP - blr ; No-op normally - -#endif - - - - diff --git a/osfmk/ppc/instrumentation.h b/osfmk/ppc/instrumentation.h deleted file mode 100644 index 536d8aa59..000000000 --- a/osfmk/ppc/instrumentation.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @APPLE_FREE_COPYRIGHT@ - */ - -/* - * Here be the instrumentaion page layout - * Lovingly crafted by Bill Angell using traditional methods -*/ - -#ifndef _INSTRUMENTATION_H_ -#define _INSTRUMENTATION_H_ - -#define INTRUMENTATION 1 - - -#define inBase 0x6000 - -#define inEntry 0 -#define inAtGetTb 1 -#define inBeforeTrace 2 -#define inAfterSAAlloc 3 -#define inBeforeFilter 4 -#define inEatRuptQfret 5 -#define inEatRuptSAfree 6 -#define inPassupSwtchSeg 7 -#define inExceptionExit 8 -#define inMiddleOfSC 9 -#define inEatRuptSwtchSeg 10 -#define inPassup 11 -#define inCopyout 12 -#define inMUASbefore 13 -#define inMUAS - -#endif /* _INSTRUMENTATION_H_ */ diff --git a/osfmk/ppc/interrupt.c b/osfmk/ppc/interrupt.c deleted file mode 100644 index e1be2769d..000000000 --- a/osfmk/ppc/interrupt.c +++ /dev/null @@ -1,187 +0,0 @@ -/* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * @APPLE_FREE_COPYRIGHT@ - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -volatile perfCallback perfIntHook; /* Pointer to CHUD trap hook routine */ - -#if CONFIG_DTRACE -#if (DEVELOPMENT || DEBUG ) -#include -#endif - -extern vm_offset_t dtrace_get_cpu_int_stack_top(void); - -vm_offset_t dtrace_get_cpu_int_stack_top(void) -{ - return getPerProc()->intstack_top_ss; -} - -/* See */ -perfCallback tempDTraceIntHook; /* Pointer to DTrace fbt int handler */ -#endif - -void unresolved_kernel_trap(int trapno, - struct savearea *ssp, - unsigned int dsisr, - addr64_t dar, - const char *message); - -unsigned int isync_mfdec(void); - -struct savearea * interrupt( - int type, - struct savearea *ssp, - unsigned int dsisr, - unsigned int dar) -{ - int current_cpu; - struct per_proc_info *proc_info; - uint64_t now; - thread_t thread; - - disable_preemption(); - - perfCallback fn = perfIntHook; - if(fn) { /* Is there a hook? */ - if(fn(type, ssp, dsisr, dar) == KERN_SUCCESS) return ssp; /* If it succeeds, we are done... */ - } - -#if CONFIG_DTRACE - if(tempDTraceIntHook) { /* Is there a hook? */ - if(tempDTraceIntHook(type, ssp, dsisr, dar) == KERN_SUCCESS) return ssp; /* If it succeeds, we are done... 
*/ - } -#endif - -#if 0 - { - extern void fctx_text(void); - fctx_test(); - } -#endif - - - current_cpu = cpu_number(); - proc_info = getPerProc(); - - switch (type) { - - case T_DECREMENTER: - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_EXCP_DECI, 0) | DBG_FUNC_NONE, - isync_mfdec(), (unsigned int)ssp->save_srr0, 0, 0, 0); - - now = mach_absolute_time(); /* Find out what time it is */ - - if(now >= proc_info->pms.pmsPop) { /* Is it time for power management state change? */ - pmsStep(1); /* Yes, advance step */ - now = mach_absolute_time(); /* Get the time again since we ran a bit */ - } - - thread = current_thread(); /* Find ourselves */ - if(thread->machine.qactTimer != 0) { /* Is the timer set? */ - if (thread->machine.qactTimer <= now) { /* It is set, has it popped? */ - thread->machine.qactTimer = 0; /* Clear single shot timer */ - if((unsigned int)thread->machine.vmmControl & 0xFFFFFFFE) { /* Are there any virtual machines? */ - vmm_timer_pop(thread); /* Yes, check out them out... 
*/ - } - } - } - - etimer_intr(USER_MODE(ssp->save_srr1), ssp->save_srr0); /* Handle event timer */ - break; - - case T_INTERRUPT: - /* Call the platform interrupt routine */ - counter(c_incoming_interrupts++); - - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_EXCP_INTR, 0) | DBG_FUNC_START, - current_cpu, (unsigned int)ssp->save_srr0, 0, 0, 0); - -#if CONFIG_DTRACE && (DEVELOPMENT || DEBUG ) - DTRACE_INT5(interrupt__start, void *, proc_info->interrupt_nub, int, proc_info->interrupt_source, - void *, proc_info->interrupt_target, IOInterruptHandler, proc_info->interrupt_handler, - void *, proc_info->interrupt_refCon); -#endif - - proc_info->interrupt_handler( - proc_info->interrupt_target, - proc_info->interrupt_refCon, - proc_info->interrupt_nub, - proc_info->interrupt_source); - -#if CONFIG_DTRACE && (DEVELOPMENT || DEBUG ) - DTRACE_INT5(interrupt__complete, void *, proc_info->interrupt_nub, int, proc_info->interrupt_source, - void *, proc_info->interrupt_target, IOInterruptHandler, proc_info->interrupt_handler, - void *, proc_info->interrupt_refCon); -#endif - - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_EXCP_INTR, 0) | DBG_FUNC_END, - 0, 0, 0, 0, 0); - - break; - - case T_SIGP: - /* Did the other processor signal us? */ - cpu_signal_handler(); - break; - - case T_SHUTDOWN: - cpu_doshutdown(); - panic("returning from cpu_doshutdown()\n"); - break; - - - default: - if (!Call_Debugger(type, ssp)) - unresolved_kernel_trap(type, ssp, dsisr, dar, NULL); - break; - } - - enable_preemption(); - return ssp; -} diff --git a/osfmk/ppc/io_map.c b/osfmk/ppc/io_map.c deleted file mode 100644 index e30d357ce..000000000 --- a/osfmk/ppc/io_map.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). 
You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -extern vm_offset_t virtual_avail; - -/* - * Allocate and map memory for devices that may need to be mapped - * outside the usual physical memory. If phys_addr is NULL then - * steal the appropriate number of physical pages from the vm - * system and map them. 
- * - * Note, this will onl - */ -vm_offset_t -io_map(vm_map_offset_t phys_addr, vm_size_t size, unsigned int flags) -{ - vm_offset_t start; - vm_size_t i; - unsigned int mflags; - vm_page_t m; - - mflags = mmFlgBlock | mmFlgUseAttr | (flags & VM_MEM_GUARDED) | ((flags & VM_MEM_NOT_CACHEABLE) >> 1); /* Convert to our mapping_make flags */ - -#if DEBUG - assert (kernel_map != VM_MAP_NULL); /* VM must be initialised */ -#endif - - if (phys_addr != 0) { /* If they supplied a physical address, use it */ - - size = round_page(size + (phys_addr & PAGE_MASK)); /* Make sure we map all of it */ - - (void) kmem_alloc_pageable(kernel_map, &start, size); /* Get some virtual addresses to use */ - - (void)mapping_make(kernel_pmap, (addr64_t)start, (ppnum_t)(phys_addr >> 12), - mflags, /* Map with requested cache mode */ - (size >> 12), VM_PROT_READ|VM_PROT_WRITE); - - return (start + (phys_addr & PAGE_MASK)); /* Pass back the virtual address */ - - } else { - - (void) kmem_alloc_pageable(kernel_map, &start, size); /* Get some virtual addresses */ - - mapping_prealloc(size); /* Make sure there are enough free mappings */ - - for (i = 0; i < size ; i += PAGE_SIZE) { - m = VM_PAGE_NULL; - while ((m = vm_page_grab()) == VM_PAGE_NULL) { /* Get a physical page */ - VM_PAGE_WAIT(); /* Wait if we didn't have one */ - } - vm_page_gobble(m); - - (void)mapping_make(kernel_pmap, - (addr64_t)(start + i), m->phys_page, - mflags, /* Map with requested cache mode */ - 1, VM_PROT_READ|VM_PROT_WRITE); - - } - - mapping_relpre(); /* Allow mapping release */ - return start; - } -} - - -/* - * Allocate and map memory for devices before the VM system comes alive. 
- */ - -vm_offset_t io_map_spec(vm_map_offset_t phys_addr, vm_size_t size, unsigned int flags) -{ - vm_offset_t start; - unsigned int mflags; - - if(kernel_map != VM_MAP_NULL) { /* If VM system is up, redirect to normal routine */ - - return io_map(phys_addr, size, flags); /* Map the address */ - - } - - mflags = mmFlgBlock | mmFlgUseAttr | (flags & VM_MEM_GUARDED) | ((flags & VM_MEM_NOT_CACHEABLE) >> 1); /* Convert to our mapping_make flags */ - - size = round_page(size + (phys_addr - (phys_addr & -PAGE_SIZE))); /* Extend the length to include it all */ - start = pmap_boot_map(size); /* Get me some virtual address */ - - (void)mapping_make(kernel_pmap, (addr64_t)start, (ppnum_t)(phys_addr >> 12), - mflags, /* Map with requested cache mode */ - (size >> 12), VM_PROT_READ|VM_PROT_WRITE); - - return (start + (phys_addr & PAGE_MASK)); -} diff --git a/osfmk/ppc/io_map_entries.h b/osfmk/ppc/io_map_entries.h deleted file mode 100644 index 8fceaaf86..000000000 --- a/osfmk/ppc/io_map_entries.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - * - */ -#ifdef KERNEL_PRIVATE - -#ifndef _PPC_IO_MAP_ENTRIES_H_ -#define _PPC_IO_MAP_ENTRIES_H_ - -extern vm_offset_t io_map( - vm_map_offset_t phys_addr, - vm_size_t size, - unsigned int flags); -extern vm_offset_t io_map_spec(vm_map_offset_t phys_addr, vm_size_t size, unsigned int flags); - -#endif /* _PPC_IO_MAP_ENTRIES_H_ */ - -#endif /* KERNEL_PRIVATE */ diff --git a/osfmk/ppc/lock.h b/osfmk/ppc/lock.h deleted file mode 100644 index 0628f554f..000000000 --- a/osfmk/ppc/lock.h +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (C) 1998 Apple Computer - * All Rights Reserved - */ -/* - * @OSF_COPYRIGHT@ - */ - -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. 
- */ - -#ifdef KERNEL_PRIVATE - -#ifndef _PPC_LOCK_H_ -#define _PPC_LOCK_H_ - -#ifdef MACH_KERNEL_PRIVATE - -#include -#include -#include -#include - -#if !MACH_LDEBUG -typedef lck_rw_t lock_t; -#else -typedef lck_rw_ext_t lock_t; -#endif /* !MACH_LDEBUG */ - -extern unsigned int LockTimeOut; /* Number of hardware ticks of a lock timeout */ - -#endif /* MACH_KERNEL_PRIVATE */ - -#endif /* _PPC_LOCK_H_ */ - -#endif /* KERNEL_PRIVATE */ diff --git a/osfmk/ppc/locks.h b/osfmk/ppc/locks.h deleted file mode 100644 index 639a820a8..000000000 --- a/osfmk/ppc/locks.h +++ /dev/null @@ -1,220 +0,0 @@ -/* - * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef _PPC_LOCKS_H_ -#define _PPC_LOCKS_H_ - -#include -#ifdef MACH_KERNEL_PRIVATE -#include -#endif - - -#ifdef MACH_KERNEL_PRIVATE - -extern unsigned int LcksOpts; - -#define enaLkDeb 0x00000001 /* Request debug in default attribute */ -#define enaLkStat 0x00000002 /* Request statistic in default attribute */ - -#define disLkType 0x80000000 /* Disable type checking */ -#define disLktypeb 0 -#define disLkThread 0x40000000 /* Disable ownership checking */ -#define disLkThreadb 1 -#define enaLkExtStck 0x20000000 /* Enable extended backtrace */ -#define enaLkExtStckb 2 -#define disLkMyLck 0x10000000 /* Disable recursive lock dectection */ -#define disLkMyLckb 3 - -#endif - -#ifdef MACH_KERNEL_PRIVATE -typedef struct { - unsigned int interlock; - unsigned int lck_spin_pad4[2]; -} lck_spin_t; - -#define LCK_SPIN_TAG_DESTROYED 0x00002007 /* lock marked as Destroyed */ - -#else -#ifdef KERNEL_PRIVATE -typedef struct { - unsigned int opaque[3]; -} lck_spin_t; -#else -typedef struct __lck_spin_t__ lck_spin_t; -#endif -#endif - -#ifdef MACH_KERNEL_PRIVATE -typedef struct _lck_mtx_ { - union { - struct { - unsigned int lck_mtxd_data; - unsigned short lck_mtxd_waiters; - unsigned short lck_mtxd_pri; - unsigned int lck_mtxd_pad8; - } lck_mtxd; - struct { - unsigned int lck_mtxi_tag; - struct _lck_mtx_ext_ *lck_mtxi_ptr; - unsigned int lck_mtxi_pad8; - } lck_mtxi; - } lck_mtx_sw; -} lck_mtx_t; - -#define lck_mtx_data lck_mtx_sw.lck_mtxd.lck_mtxd_data -#define lck_mtx_waiters lck_mtx_sw.lck_mtxd.lck_mtxd_waiters -#define lck_mtx_pri lck_mtx_sw.lck_mtxd.lck_mtxd_pri - -#define lck_mtx_tag lck_mtx_sw.lck_mtxi.lck_mtxi_tag -#define lck_mtx_ptr lck_mtx_sw.lck_mtxi.lck_mtxi_ptr - -#define LCK_MTX_TAG_INDIRECT 0x00001007 /* lock marked as Indirect */ -#define LCK_MTX_TAG_DESTROYED 0x00002007 /* lock marked as Destroyed */ - -#define LCK_FRAMES_MAX 8 - -typedef struct { - unsigned int type; - vm_offset_t stack[LCK_FRAMES_MAX]; - 
vm_offset_t thread; -} lck_mtx_deb_t; - -#define MUTEX_TAG 0x4d4d - -typedef struct { - unsigned int lck_mtx_stat_data; -} lck_mtx_stat_t; - -typedef struct _lck_mtx_ext_ { - lck_mtx_t lck_mtx; - struct _lck_grp_ *lck_mtx_grp; - unsigned int lck_mtx_attr; - lck_mtx_deb_t lck_mtx_deb; - /* Unused on PowerPC */ - lck_mtx_stat_t lck_mtx_stat; -} lck_mtx_ext_t; - -#define LCK_MTX_ATTR_DEBUG 0x1 -#define LCK_MTX_ATTR_DEBUGb 31 -#define LCK_MTX_ATTR_STAT 0x2 -#define LCK_MTX_ATTR_STATb 30 - -#else -#ifdef KERNEL_PRIVATE -typedef struct { - unsigned int opaque[3]; -} lck_mtx_t; - -typedef struct { - unsigned int opaque[16]; -} lck_mtx_ext_t; -#else -typedef struct __lck_mtx_t__ lck_mtx_t; -typedef struct __lck_mtx_ext_t__ lck_mtx_ext_t; -#endif -#endif - -#ifdef MACH_KERNEL_PRIVATE -typedef struct { - union { - struct { - unsigned int lck_rwd_shared_cnt:16, /* No. of shared granted request */ - lck_rwd_priv_excl:1, /* priority for Writer */ - lck_rwd_pad17:11, /* padding */ - lck_rwd_want_excl:1, /* Writer is waiting, or locked for write */ - lck_rwd_want_upgrade:1, /* Read-to-write upgrade waiting */ - lck_rwd_waiting:1, /* Someone is sleeping on lock */ - lck_rwd_interlock:1; /* Read-to-write upgrade waiting */ - unsigned int lck_rwd_pad4; - unsigned int lck_rwd_pad8; - } lck_rwd; - struct { - unsigned int lck_rwi_tag; - struct _lck_rw_ext_ *lck_rwi_ptr; - unsigned int lck_rwi_pad8; - } lck_rwi; - } lck_rw_sw; -} lck_rw_t; - -#define lck_rw_interlock lck_rw_sw.lck_rwd.lck_rwd_interlock -#define lck_rw_want_upgrade lck_rw_sw.lck_rwd.lck_rwd_want_upgrade -#define lck_rw_want_excl lck_rw_sw.lck_rwd.lck_rwd_want_excl -#define lck_rw_waiting lck_rw_sw.lck_rwd.lck_rwd_waiting -#define lck_rw_priv_excl lck_rw_sw.lck_rwd.lck_rwd_priv_excl -#define lck_rw_shared_cnt lck_rw_sw.lck_rwd.lck_rwd_shared_cnt - -#define lck_rw_tag lck_rw_sw.lck_rwi.lck_rwi_tag -#define lck_rw_ptr lck_rw_sw.lck_rwi.lck_rwi_ptr - -typedef struct { - unsigned int type; - vm_offset_t stack[LCK_FRAMES_MAX]; 
- thread_t thread; - void (*pc_excl)(void); - void (*pc_done)(void); -} lck_rw_deb_t; - -#define RW_TAG 0x5d5d - -typedef struct { - unsigned int lck_rw_stat_data; -} lck_rw_stat_t; - -typedef struct _lck_rw_ext_ { - lck_rw_t lck_rw; - struct _lck_grp_ *lck_rw_grp; - unsigned int lck_rw_attr; - lck_rw_deb_t lck_rw_deb; - lck_rw_stat_t lck_rw_stat; -} lck_rw_ext_t; - -#define LCK_RW_ATTR_DEBUG 0x1 -#define LCK_RW_ATTR_DEBUGb 31 -#define LCK_RW_ATTR_STAT 0x2 -#define LCK_RW_ATTR_STATb 30 -#define LCK_RW_ATTR_DIS_THREAD 0x40000000 -#define LCK_RW_ATTR_DIS_THREADb 1 -#define LCK_RW_ATTR_DIS_MYLOCK 0x10000000 -#define LCK_RW_ATTR_DIS_MYLOCKb 3 - -#define LCK_RW_TAG_INDIRECT 0x00001107 /* lock marked as Indirect */ -#define LCK_RW_TAG_DESTROYED 0x00002107 /* lock marked as Destroyed */ - -#else -#ifdef KERNEL_PRIVATE -typedef struct { - unsigned int opaque[3]; -} lck_rw_t; -#else -typedef struct __lck_rw_t__ lck_rw_t; -#endif -#endif - -#endif /* _PPC_LOCKS_H_ */ diff --git a/osfmk/ppc/locks_ppc.c b/osfmk/ppc/locks_ppc.c deleted file mode 100644 index c734043f5..000000000 --- a/osfmk/ppc/locks_ppc.c +++ /dev/null @@ -1,2360 +0,0 @@ -/* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. 
- */ -/* - * File: kern/lock.c - * Author: Avadis Tevanian, Jr., Michael Wayne Young - * Date: 1985 - * - * Locking primitives implementation - */ - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if MACH_KDB -#include -#include -#include -#include -#endif /* MACH_KDB */ - -#ifdef __ppc__ -#include -#endif - -#include - -/* - * We need only enough declarations from the BSD-side to be able to - * test if our probe is active, and to call __dtrace_probe(). Setting - * NEED_DTRACE_DEFS gets a local copy of those definitions pulled in. - * - * Note that if CONFIG_DTRACE is off, the include file below stubs out - * the code hooks here. - */ -#if CONFIG_DTRACE -#define NEED_DTRACE_DEFS -#include <../bsd/sys/lockstat.h> -#endif - -#define LCK_RW_LCK_EXCLUSIVE_CODE 0x100 -#define LCK_RW_LCK_EXCLUSIVE1_CODE 0x101 -#define LCK_RW_LCK_SHARED_CODE 0x102 -#define LCK_RW_LCK_SH_TO_EX_CODE 0x103 -#define LCK_RW_LCK_SH_TO_EX1_CODE 0x104 -#define LCK_RW_LCK_EX_TO_SH_CODE 0x105 - - -#define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG) - -unsigned int lock_wait_time[2] = { (unsigned int)-1, 0 } ; - -/* Forwards */ - - -#if USLOCK_DEBUG -/* - * Perform simple lock checks. - */ -int uslock_check = 1; -int max_lock_loops = 100000000; -decl_simple_lock_data(extern , printf_lock) -decl_simple_lock_data(extern , panic_lock) -#if MACH_KDB -decl_simple_lock_data(extern , kdb_lock) -#endif /* MACH_KDB */ -#endif /* USLOCK_DEBUG */ - - -/* - * We often want to know the addresses of the callers - * of the various lock routines. However, this information - * is only used for debugging and statistics. 
- */ -typedef void *pc_t; -#define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS) -#define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS) -#if ANY_LOCK_DEBUG -#define OBTAIN_PC(pc,l) ((pc) = (void *) GET_RETURN_PC(&(l))) -#else /* ANY_LOCK_DEBUG */ -#ifdef lint -/* - * Eliminate lint complaints about unused local pc variables. - */ -#define OBTAIN_PC(pc,l) ++pc -#else /* lint */ -#define OBTAIN_PC(pc,l) -#endif /* lint */ -#endif /* USLOCK_DEBUG */ - - -/* - * Portable lock package implementation of usimple_locks. - */ - -#if USLOCK_DEBUG -#define USLDBG(stmt) stmt -void usld_lock_init(usimple_lock_t, unsigned short); -void usld_lock_pre(usimple_lock_t, pc_t); -void usld_lock_post(usimple_lock_t, pc_t); -void usld_unlock(usimple_lock_t, pc_t); -void usld_lock_try_pre(usimple_lock_t, pc_t); -void usld_lock_try_post(usimple_lock_t, pc_t); -int usld_lock_common_checks(usimple_lock_t, const char *); -#else /* USLOCK_DEBUG */ -#define USLDBG(stmt) -#endif /* USLOCK_DEBUG */ - -/* - * Routine: lck_spin_alloc_init - */ -lck_spin_t * -lck_spin_alloc_init( - lck_grp_t *grp, - lck_attr_t *attr) { - lck_spin_t *lck; - - if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0) - lck_spin_init(lck, grp, attr); - - return(lck); -} - -/* - * Routine: lck_spin_free - */ -void -lck_spin_free( - lck_spin_t *lck, - lck_grp_t *grp) { - lck_spin_destroy(lck, grp); - kfree((void *)lck, sizeof(lck_spin_t)); -} - -/* - * Routine: lck_spin_init - */ -void -lck_spin_init( - lck_spin_t *lck, - lck_grp_t *grp, - __unused lck_attr_t *attr) { - - lck->interlock = 0; - lck_grp_reference(grp); - lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN); -} - -/* - * Routine: lck_spin_destroy - */ -void -lck_spin_destroy( - lck_spin_t *lck, - lck_grp_t *grp) { - if (lck->interlock == LCK_SPIN_TAG_DESTROYED) - return; - lck->interlock = LCK_SPIN_TAG_DESTROYED; - lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN); - lck_grp_deallocate(grp); -} - -/* - * Initialize a usimple_lock. - * - * No change in preemption state. 
- */ -void -usimple_lock_init( - usimple_lock_t l, - unsigned short tag) -{ -#ifndef MACHINE_SIMPLE_LOCK - USLDBG(usld_lock_init(l, tag)); - hw_lock_init(&l->interlock); -#else - simple_lock_init((simple_lock_t)l,tag); -#endif -} - - -/* - * Acquire a usimple_lock. - * - * Returns with preemption disabled. Note - * that the hw_lock routines are responsible for - * maintaining preemption state. - */ -void -usimple_lock( - usimple_lock_t l) -{ -#ifndef MACHINE_SIMPLE_LOCK - pc_t pc; - - OBTAIN_PC(pc, l); - USLDBG(usld_lock_pre(l, pc)); - - if(!hw_lock_to(&l->interlock, LockTimeOut)) /* Try to get the lock with a timeout */ - panic("simple lock deadlock detection - l=%p, cpu=%d, ret=%p", l, cpu_number(), pc); - - USLDBG(usld_lock_post(l, pc)); -#else - simple_lock((simple_lock_t)l); -#endif -} - - -/* - * Release a usimple_lock. - * - * Returns with preemption enabled. Note - * that the hw_lock routines are responsible for - * maintaining preemption state. - */ -void -usimple_unlock( - usimple_lock_t l) -{ -#ifndef MACHINE_SIMPLE_LOCK - pc_t pc; - - OBTAIN_PC(pc, l); - USLDBG(usld_unlock(l, pc)); - sync(); - hw_lock_unlock(&l->interlock); -#else - simple_unlock_rwmb((simple_lock_t)l); -#endif -} - - -/* - * Conditionally acquire a usimple_lock. - * - * On success, returns with preemption disabled. - * On failure, returns with preemption in the same state - * as when first invoked. Note that the hw_lock routines - * are responsible for maintaining preemption state. - * - * XXX No stats are gathered on a miss; I preserved this - * behavior from the original assembly-language code, but - * doesn't it make sense to log misses? 
XXX - */ -unsigned int -usimple_lock_try( - usimple_lock_t l) -{ -#ifndef MACHINE_SIMPLE_LOCK - pc_t pc; - unsigned int success; - - OBTAIN_PC(pc, l); - USLDBG(usld_lock_try_pre(l, pc)); - success = hw_lock_try(&l->interlock); - if (success) - USLDBG(usld_lock_try_post(l, pc)); - return success; -#else - return(simple_lock_try((simple_lock_t)l)); -#endif -} - -#if USLOCK_DEBUG -/* - * States of a usimple_lock. The default when initializing - * a usimple_lock is setting it up for debug checking. - */ -#define USLOCK_CHECKED 0x0001 /* lock is being checked */ -#define USLOCK_TAKEN 0x0002 /* lock has been taken */ -#define USLOCK_INIT 0xBAA0 /* lock has been initialized */ -#define USLOCK_INITIALIZED (USLOCK_INIT|USLOCK_CHECKED) -#define USLOCK_CHECKING(l) (uslock_check && \ - ((l)->debug.state & USLOCK_CHECKED)) - -/* - * Trace activities of a particularly interesting lock. - */ -void usl_trace(usimple_lock_t, int, pc_t, const char *); - - -/* - * Initialize the debugging information contained - * in a usimple_lock. - */ -void -usld_lock_init( - usimple_lock_t l, - __unused unsigned short tag) -{ - if (l == USIMPLE_LOCK_NULL) - panic("lock initialization: null lock pointer"); - l->lock_type = USLOCK_TAG; - l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0; - l->debug.lock_cpu = l->debug.unlock_cpu = 0; - l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC; - l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD; - l->debug.duration[0] = l->debug.duration[1] = 0; - l->debug.unlock_cpu = l->debug.unlock_cpu = 0; - l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC; - l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD; -} - - -/* - * These checks apply to all usimple_locks, not just - * those with USLOCK_CHECKED turned on. 
- */ -int -usld_lock_common_checks(usimple_lock_t l, const char *caller) -{ - if (l == USIMPLE_LOCK_NULL) - panic("%s: null lock pointer", caller); - if (l->lock_type != USLOCK_TAG) - panic("%s: 0x%x is not a usimple lock", caller, (integer_t) l); - if (!(l->debug.state & USLOCK_INIT)) - panic("%s: 0x%x is not an initialized lock", - caller, (integer_t) l); - return USLOCK_CHECKING(l); -} - - -/* - * Debug checks on a usimple_lock just before attempting - * to acquire it. - */ -/* ARGSUSED */ -void -usld_lock_pre( - usimple_lock_t l, - pc_t pc) -{ - const char *caller = "usimple_lock"; - - if (!usld_lock_common_checks(l, caller)) - return; - -/* - * Note that we have a weird case where we are getting a lock when we are] - * in the process of putting the system to sleep. We are running with no - * current threads, therefore we can't tell if we are trying to retake a lock - * we have or someone on the other processor has it. Therefore we just - * ignore this test if the locking thread is 0. - */ - - if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread && - l->debug.lock_thread == (void *) current_thread()) { - printf("%s: lock 0x%x already locked (at %p) by", - caller, (integer_t) l, l->debug.lock_pc); - printf(" current thread %p (new attempt at pc %p)\n", - l->debug.lock_thread, pc); - panic("%s", caller); - } - mp_disable_preemption(); - usl_trace(l, cpu_number(), pc, caller); - mp_enable_preemption(); -} - - -/* - * Debug checks on a usimple_lock just after acquiring it. - * - * Pre-emption has been disabled at this point, - * so we are safe in using cpu_number. 
- */ -void -usld_lock_post( - usimple_lock_t l, - pc_t pc) -{ - int mycpu; - const char *caller = "successful usimple_lock"; - - - if (!usld_lock_common_checks(l, caller)) - return; - - if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) - panic("%s: lock 0x%x became uninitialized", - caller, (integer_t) l); - if ((l->debug.state & USLOCK_TAKEN)) - panic("%s: lock 0x%x became TAKEN by someone else", - caller, (integer_t) l); - - mycpu = cpu_number(); - l->debug.lock_thread = (void *)current_thread(); - l->debug.state |= USLOCK_TAKEN; - l->debug.lock_pc = pc; - l->debug.lock_cpu = mycpu; - - usl_trace(l, mycpu, pc, caller); -} - - -/* - * Debug checks on a usimple_lock just before - * releasing it. Note that the caller has not - * yet released the hardware lock. - * - * Preemption is still disabled, so there's - * no problem using cpu_number. - */ -void -usld_unlock( - usimple_lock_t l, - pc_t pc) -{ - int mycpu; - const char *caller = "usimple_unlock"; - - - if (!usld_lock_common_checks(l, caller)) - return; - - mycpu = cpu_number(); - - if (!(l->debug.state & USLOCK_TAKEN)) - panic("%s: lock 0x%x hasn't been taken", - caller, (integer_t) l); - if (l->debug.lock_thread != (void *) current_thread()) - panic("%s: unlocking lock 0x%x, owned by thread %p", - caller, (integer_t) l, l->debug.lock_thread); - if (l->debug.lock_cpu != mycpu) { - printf("%s: unlocking lock 0x%x on cpu 0x%x", - caller, (integer_t) l, mycpu); - printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu); - panic("%s", caller); - } - usl_trace(l, mycpu, pc, caller); - - l->debug.unlock_thread = l->debug.lock_thread; - l->debug.lock_thread = INVALID_PC; - l->debug.state &= ~USLOCK_TAKEN; - l->debug.unlock_pc = pc; - l->debug.unlock_cpu = mycpu; -} - - -/* - * Debug checks on a usimple_lock just before - * attempting to acquire it. - * - * Preemption isn't guaranteed to be disabled. 
- */ -void -usld_lock_try_pre( - usimple_lock_t l, - pc_t pc) -{ - const char *caller = "usimple_lock_try"; - - if (!usld_lock_common_checks(l, caller)) - return; - mp_disable_preemption(); - usl_trace(l, cpu_number(), pc, caller); - mp_enable_preemption(); -} - - -/* - * Debug checks on a usimple_lock just after - * successfully attempting to acquire it. - * - * Preemption has been disabled by the - * lock acquisition attempt, so it's safe - * to use cpu_number. - */ -void -usld_lock_try_post( - usimple_lock_t l, - pc_t pc) -{ - int mycpu; - const char *caller = "successful usimple_lock_try"; - - if (!usld_lock_common_checks(l, caller)) - return; - - if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) - panic("%s: lock 0x%x became uninitialized", - caller, (integer_t) l); - if ((l->debug.state & USLOCK_TAKEN)) - panic("%s: lock 0x%x became TAKEN by someone else", - caller, (integer_t) l); - - mycpu = cpu_number(); - l->debug.lock_thread = (void *) current_thread(); - l->debug.state |= USLOCK_TAKEN; - l->debug.lock_pc = pc; - l->debug.lock_cpu = mycpu; - - usl_trace(l, mycpu, pc, caller); -} - - -/* - * For very special cases, set traced_lock to point to a - * specific lock of interest. The result is a series of - * XPRs showing lock operations on that lock. The lock_seq - * value is used to show the order of those operations. - */ -usimple_lock_t traced_lock; -unsigned int lock_seq; - -void -usl_trace( - usimple_lock_t l, - int mycpu, - pc_t pc, - const char * op_name) -{ - if (traced_lock == l) { - XPR(XPR_SLOCK, - "seq %d, cpu %d, %s @ %x\n", - (integer_t) lock_seq, (integer_t) mycpu, - (integer_t) op_name, (integer_t) pc, 0); - lock_seq++; - } -} - - -#endif /* USLOCK_DEBUG */ - -/* - * The C portion of the shared/exclusive locks package. 
- */ - -/* - * Forward definition - */ - -void lck_rw_lock_exclusive_gen( - lck_rw_t *lck); - -lck_rw_type_t lck_rw_done_gen( - lck_rw_t *lck); - -void -lck_rw_lock_shared_gen( - lck_rw_t *lck); - -boolean_t -lck_rw_lock_shared_to_exclusive_gen( - lck_rw_t *lck); - -void -lck_rw_lock_exclusive_to_shared_gen( - lck_rw_t *lck); - -boolean_t -lck_rw_try_lock_exclusive_gen( - lck_rw_t *lck); - -boolean_t -lck_rw_try_lock_shared_gen( - lck_rw_t *lck); - -void lck_rw_ext_init( - lck_rw_ext_t *lck, - lck_grp_t *grp, - lck_attr_t *attr); - -void lck_rw_ext_backtrace( - lck_rw_ext_t *lck); - -void lck_rw_lock_exclusive_ext( - lck_rw_ext_t *lck, - lck_rw_t *rlck); - -lck_rw_type_t lck_rw_done_ext( - lck_rw_ext_t *lck, - lck_rw_t *rlck); - -void -lck_rw_lock_shared_ext( - lck_rw_ext_t *lck, - lck_rw_t *rlck); - -boolean_t -lck_rw_lock_shared_to_exclusive_ext( - lck_rw_ext_t *lck, - lck_rw_t *rlck); - -void -lck_rw_lock_exclusive_to_shared_ext( - lck_rw_ext_t *lck, - lck_rw_t *rlck); - -boolean_t -lck_rw_try_lock_exclusive_ext( - lck_rw_ext_t *lck, - lck_rw_t *rlck); - -boolean_t -lck_rw_try_lock_shared_ext( - lck_rw_ext_t *lck, - lck_rw_t *rlck); - -void -lck_rw_ilk_lock( - lck_rw_t *lck); - -void -lck_rw_ilk_unlock( - lck_rw_t *lck); - -void -lck_rw_check_type( - lck_rw_ext_t *lck, - lck_rw_t *rlck); - -void -lck_rw_assert_ext( - lck_rw_ext_t *lck, - lck_rw_t *rlck, - unsigned int type); - -/* - * Routine: lock_alloc - * Function: - * Allocate a lock for external users who cannot - * hard-code the structure definition into their - * objects. - * For now just use kalloc, but a zone is probably - * warranted. - */ -lock_t * -lock_alloc( - boolean_t can_sleep, - __unused unsigned short tag, - __unused unsigned short tag1) -{ - lock_t *lck; - - if ((lck = (lock_t *)kalloc(sizeof(lock_t))) != 0) - lock_init(lck, can_sleep, tag, tag1); - return(lck); -} - -/* - * Routine: lock_init - * Function: - * Initialize a lock; required before use. 
- * Note that clients declare the "struct lock" - * variables and then initialize them, rather - * than getting a new one from this module. - */ -void -lock_init( - lock_t *lck, - boolean_t can_sleep, - __unused unsigned short tag, - __unused unsigned short tag1) -{ - if (!can_sleep) - panic("lock_init: sleep mode must be set to TRUE\n"); - - (void) memset((void *) lck, 0, sizeof(lock_t)); -#if MACH_LDEBUG - lck->lck_rw_deb.type = RW_TAG; - lck->lck_rw_attr |= (LCK_RW_ATTR_DEBUG|LCK_RW_ATTR_DIS_THREAD|LCK_RW_ATTR_DIS_MYLOCK); - lck->lck_rw.lck_rw_priv_excl = TRUE; -#else - lck->lck_rw_priv_excl = TRUE; -#endif - -} - - -/* - * Routine: lock_free - * Function: - * Free a lock allocated for external users. - * For now just use kfree, but a zone is probably - * warranted. - */ -void -lock_free( - lock_t *lck) -{ - kfree((void *)lck, sizeof(lock_t)); -} - -#if MACH_LDEBUG -void -lock_write( - lock_t *lck) -{ - lck_rw_lock_exclusive_ext((lck_rw_ext_t *)lck, (lck_rw_t *)lck); -} - -void -lock_done( - lock_t *lck) -{ - (void)lck_rw_done_ext((lck_rw_ext_t *)lck, (lck_rw_t *)lck); -} - -void -lock_read( - lock_t *lck) -{ - lck_rw_lock_shared_ext((lck_rw_ext_t *)lck, (lck_rw_t *)lck); -} - -boolean_t -lock_read_to_write( - lock_t *lck) -{ - return(lck_rw_lock_shared_to_exclusive_ext((lck_rw_ext_t *)lck, (lck_rw_t *)lck)); -} - -void -lock_write_to_read( - register lock_t *lck) -{ - lck_rw_lock_exclusive_to_shared_ext((lck_rw_ext_t *)lck, (lck_rw_t *)lck); -} -#endif - -/* - * Routine: lck_rw_alloc_init - */ -lck_rw_t * -lck_rw_alloc_init( - lck_grp_t *grp, - lck_attr_t *attr) { - lck_rw_t *lck; - - if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) - lck_rw_init(lck, grp, attr); - - return(lck); -} - -/* - * Routine: lck_rw_free - */ -void -lck_rw_free( - lck_rw_t *lck, - lck_grp_t *grp) { - lck_rw_destroy(lck, grp); - kfree((void *)lck, sizeof(lck_rw_t)); -} - -/* - * Routine: lck_rw_init - */ -void -lck_rw_init( - lck_rw_t *lck, - lck_grp_t *grp, - lck_attr_t *attr) { 
- lck_rw_ext_t *lck_ext; - lck_attr_t *lck_attr; - - if (attr != LCK_ATTR_NULL) - lck_attr = attr; - else - lck_attr = &LockDefaultLckAttr; - - if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) { - if ((lck_ext = (lck_rw_ext_t *)kalloc(sizeof(lck_rw_ext_t))) != 0) { - lck_rw_ext_init(lck_ext, grp, lck_attr); - lck->lck_rw_tag = LCK_RW_TAG_INDIRECT; - lck->lck_rw_ptr = lck_ext; - } - } else { - (void) memset((void *) lck, 0, sizeof(lck_rw_t)); - if ((lck_attr->lck_attr_val) & LCK_ATTR_RW_SHARED_PRIORITY) - lck->lck_rw_priv_excl = FALSE; - else - lck->lck_rw_priv_excl = TRUE; - } - - lck_grp_reference(grp); - lck_grp_lckcnt_incr(grp, LCK_TYPE_RW); -} - -/* - * Routine: lck_rw_ext_init - */ -void -lck_rw_ext_init( - lck_rw_ext_t *lck, - lck_grp_t *grp, - lck_attr_t *attr) { - - bzero((void *)lck, sizeof(lck_rw_ext_t)); - if ((attr->lck_attr_val) & LCK_ATTR_RW_SHARED_PRIORITY) - lck->lck_rw.lck_rw_priv_excl = FALSE; - else - lck->lck_rw.lck_rw_priv_excl = TRUE; - - if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) { - lck->lck_rw_deb.type = RW_TAG; - lck->lck_rw_attr |= LCK_RW_ATTR_DEBUG; - } - - lck->lck_rw_grp = grp; - - if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT) - lck->lck_rw_attr |= LCK_RW_ATTR_STAT; -} - -/* - * Routine: lck_rw_destroy - */ -void -lck_rw_destroy( - lck_rw_t *lck, - lck_grp_t *grp) { - boolean_t lck_is_indirect; - - if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) - return; - lck_is_indirect = (lck->lck_rw_tag == LCK_RW_TAG_INDIRECT); - lck->lck_rw_tag = LCK_RW_TAG_DESTROYED; - if (lck_is_indirect) - kfree((void *)lck->lck_rw_ptr, sizeof(lck_rw_ext_t)); - - lck_grp_lckcnt_decr(grp, LCK_TYPE_RW); - lck_grp_deallocate(grp); - return; -} - -/* - * Routine: lck_rw_lock - */ -void -lck_rw_lock( - lck_rw_t *lck, - lck_rw_type_t lck_rw_type) -{ - if (lck_rw_type == LCK_RW_TYPE_SHARED) - lck_rw_lock_shared(lck); - else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) - lck_rw_lock_exclusive(lck); - else - panic("lck_rw_lock(): Invalid RW lock type: %d\n", lck_rw_type); -} - - 
-/* - * Routine: lck_rw_unlock - */ -void -lck_rw_unlock( - lck_rw_t *lck, - lck_rw_type_t lck_rw_type) -{ - if (lck_rw_type == LCK_RW_TYPE_SHARED) - lck_rw_unlock_shared(lck); - else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) - lck_rw_unlock_exclusive(lck); - else - panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type); -} - - -/* - * Routine: lck_rw_unlock_shared - */ -void -lck_rw_unlock_shared( - lck_rw_t *lck) -{ - lck_rw_type_t ret; - - ret = lck_rw_done(lck); - - if (ret != LCK_RW_TYPE_SHARED) - panic("lck_rw_unlock(): lock held in mode: %d\n", ret); -} - - -/* - * Routine: lck_rw_unlock_exclusive - */ -void -lck_rw_unlock_exclusive( - lck_rw_t *lck) -{ - lck_rw_type_t ret; - - ret = lck_rw_done(lck); - - if (ret != LCK_RW_TYPE_EXCLUSIVE) - panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret); -} - - -/* - * Routine: lck_rw_try_lock - */ -boolean_t -lck_rw_try_lock( - lck_rw_t *lck, - lck_rw_type_t lck_rw_type) -{ - if (lck_rw_type == LCK_RW_TYPE_SHARED) - return(lck_rw_try_lock_shared(lck)); - else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) - return(lck_rw_try_lock_exclusive(lck)); - else - panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type); - return(FALSE); -} - - - -/* - * Routine: lck_rw_lock_exclusive_gen - */ -void -lck_rw_lock_exclusive_gen( - lck_rw_t *lck) -{ - int i; - wait_result_t res; -#if CONFIG_DTRACE - uint64_t wait_interval = 0; - int slept = 0; - int readers_at_sleep; -#endif - - lck_rw_ilk_lock(lck); -#if CONFIG_DTRACE - readers_at_sleep = lck->lck_rw_shared_cnt; -#endif - - /* - * Try to acquire the lck_rw_want_excl bit. 
- */ - while (lck->lck_rw_want_excl) { - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0); - -#if CONFIG_DTRACE - if ((lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] || lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK]) && wait_interval == 0) { - wait_interval = mach_absolute_time(); - } else { - wait_interval = -1; - } -#endif - - i = lock_wait_time[1]; - if (i != 0) { - lck_rw_ilk_unlock(lck); - while (--i != 0 && lck->lck_rw_want_excl) - continue; - lck_rw_ilk_lock(lck); - } - - if (lck->lck_rw_want_excl) { - lck->lck_rw_waiting = TRUE; - res = assert_wait((event_t)(((unsigned int*)lck)+((sizeof(lck_rw_t)-1)/sizeof(unsigned int))), THREAD_UNINT); - if (res == THREAD_WAITING) { - lck_rw_ilk_unlock(lck); - res = thread_block(THREAD_CONTINUE_NULL); -#if CONFIG_DTRACE - slept = 1; -#endif - lck_rw_ilk_lock(lck); - } - } - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE_CODE) | DBG_FUNC_END, (int)lck, res, 0, 0, 0); - } - lck->lck_rw_want_excl = TRUE; - - /* Wait for readers (and upgrades) to finish */ - - while ((lck->lck_rw_shared_cnt != 0) || lck->lck_rw_want_upgrade) { - - i = lock_wait_time[1]; - - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE1_CODE) | DBG_FUNC_START, - (int)lck, lck->lck_rw_shared_cnt, lck->lck_rw_want_upgrade, i, 0); -#if CONFIG_DTRACE - if ((lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] || lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK]) && wait_interval == 0) { - wait_interval = mach_absolute_time(); - } else { - wait_interval = (unsigned) -1; - } -#endif - - if (i != 0) { - lck_rw_ilk_unlock(lck); - while (--i != 0 && (lck->lck_rw_shared_cnt != 0 || - lck->lck_rw_want_upgrade)) - continue; - lck_rw_ilk_lock(lck); - } - - if (lck->lck_rw_shared_cnt != 0 || lck->lck_rw_want_upgrade) { - lck->lck_rw_waiting = TRUE; - res = assert_wait((event_t)(((unsigned int*)lck)+((sizeof(lck_rw_t)-1)/sizeof(unsigned int))), THREAD_UNINT); - if (res == THREAD_WAITING) { - 
lck_rw_ilk_unlock(lck); - res = thread_block(THREAD_CONTINUE_NULL); -#if CONFIG_DTRACE - slept = 1; -#endif - lck_rw_ilk_lock(lck); - } - } - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE1_CODE) | DBG_FUNC_END, - (int)lck, lck->lck_rw_shared_cnt, lck->lck_rw_want_upgrade, res, 0); - } - - lck_rw_ilk_unlock(lck); -#if CONFIG_DTRACE - /* - * Decide what latencies we suffered that are Dtrace events. - * If we have set wait_interval, then we either spun or slept. - * At least we get out from under the interlock before we record - * which is the best we can do here to minimize the impact - * of the tracing. - */ - if (wait_interval != 0 && wait_interval != (unsigned) -1) { - if (slept == 0) { - LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_EXCL_SPIN, lck, - mach_absolute_time() - wait_interval, 1); - } else { - /* - * For the blocking case, we also record if when we blocked - * it was held for read or write, and how many readers. - * Notice that above we recorded this before we dropped - * the interlock so the count is accurate. - */ - LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_EXCL_BLOCK, lck, - mach_absolute_time() - wait_interval, 1, - (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep); - } - } - LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1); -#endif -} - - -/* - * Routine: lck_rw_done_gen - */ -lck_rw_type_t -lck_rw_done_gen( - lck_rw_t *lck) -{ - boolean_t do_wakeup = FALSE; - lck_rw_type_t lck_rw_type; - - - lck_rw_ilk_lock(lck); - - if (lck->lck_rw_shared_cnt != 0) { - lck_rw_type = LCK_RW_TYPE_SHARED; - lck->lck_rw_shared_cnt--; - } - else { - lck_rw_type = LCK_RW_TYPE_EXCLUSIVE; - if (lck->lck_rw_want_upgrade) - lck->lck_rw_want_upgrade = FALSE; - else - lck->lck_rw_want_excl = FALSE; - } - - /* - * There is no reason to wakeup a lck_rw_waiting thread - * if the read-count is non-zero. 
Consider: - * we must be dropping a read lock - * threads are waiting only if one wants a write lock - * if there are still readers, they can't proceed - */ - - if (lck->lck_rw_waiting && (lck->lck_rw_shared_cnt == 0)) { - lck->lck_rw_waiting = FALSE; - do_wakeup = TRUE; - } - - lck_rw_ilk_unlock(lck); - - if (do_wakeup) - thread_wakeup((event_t)(((unsigned int*)lck)+((sizeof(lck_rw_t)-1)/sizeof(unsigned int)))); - LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lck_rw_type); - return(lck_rw_type); -} - - -/* - * Routine: lck_rw_lock_shared_gen - */ -void -lck_rw_lock_shared_gen( - lck_rw_t *lck) -{ - int i; - wait_result_t res; -#if CONFIG_DTRACE - uint64_t wait_interval = 0; - int slept = 0; - int readers_at_sleep; -#endif - - lck_rw_ilk_lock(lck); -#if CONFIG_DTRACE - readers_at_sleep = lck->lck_rw_shared_cnt; -#endif - - while ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) && - ((lck->lck_rw_shared_cnt == 0) || (lck->lck_rw_priv_excl))) { - i = lock_wait_time[1]; - - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_CODE) | DBG_FUNC_START, - (int)lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, i, 0); -#if CONFIG_DTRACE - if ((lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] || lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK]) && wait_interval == 0) { - wait_interval = mach_absolute_time(); - } else { - wait_interval = (unsigned) -1; - } -#endif - - if (i != 0) { - lck_rw_ilk_unlock(lck); - while (--i != 0 && - (lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) && - ((lck->lck_rw_shared_cnt == 0) || (lck->lck_rw_priv_excl))) - continue; - lck_rw_ilk_lock(lck); - } - - if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) && - ((lck->lck_rw_shared_cnt == 0) || (lck->lck_rw_priv_excl))) { - lck->lck_rw_waiting = TRUE; - res = assert_wait((event_t)(((unsigned int*)lck)+((sizeof(lck_rw_t)-1)/sizeof(unsigned int))), THREAD_UNINT); - if (res == THREAD_WAITING) { - lck_rw_ilk_unlock(lck); - res = thread_block(THREAD_CONTINUE_NULL); -#if CONFIG_DTRACE - 
slept = 1; -#endif - lck_rw_ilk_lock(lck); - } - } - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_CODE) | DBG_FUNC_END, - (int)lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, res, 0); - } - - lck->lck_rw_shared_cnt++; - - lck_rw_ilk_unlock(lck); -#if CONFIG_DTRACE - if (wait_interval != 0 && wait_interval != (unsigned) -1) { - if (slept == 0) { - LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0); - } else { - LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_BLOCK, lck, - mach_absolute_time() - wait_interval, 0, - (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep); - } - } - LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0); -#endif -} - - -/* - * Routine: lck_rw_lock_shared_to_exclusive_gen - * Function: - * Improves a read-only lock to one with - * write permission. If another reader has - * already requested an upgrade to a write lock, - * no lock is held upon return. - * - * Returns FALSE if the upgrade *failed*. - */ - -boolean_t -lck_rw_lock_shared_to_exclusive_gen( - lck_rw_t *lck) -{ - int i; - boolean_t do_wakeup = FALSE; - wait_result_t res; -#if CONFIG_DTRACE - uint64_t wait_interval = 0; - int slept = 0; - int readers_at_sleep = 0; -#endif - - lck_rw_ilk_lock(lck); - - lck->lck_rw_shared_cnt--; - - if (lck->lck_rw_want_upgrade) { - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_START, - (int)lck, lck->lck_rw_shared_cnt, lck->lck_rw_want_upgrade, 0, 0); - - /* - * Someone else has requested upgrade. - * Since we've released a read lock, wake - * him up. 
- */ - if (lck->lck_rw_waiting && (lck->lck_rw_shared_cnt == 0)) { - lck->lck_rw_waiting = FALSE; - do_wakeup = TRUE; - } - - lck_rw_ilk_unlock(lck); - - if (do_wakeup) - thread_wakeup((event_t)(((unsigned int*)lck)+((sizeof(lck_rw_t)-1)/sizeof(unsigned int)))); - - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_END, - (int)lck, lck->lck_rw_shared_cnt, lck->lck_rw_want_upgrade, 0, 0); - - return (FALSE); - } - - lck->lck_rw_want_upgrade = TRUE; - - while (lck->lck_rw_shared_cnt != 0) { - i = lock_wait_time[1]; - - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX1_CODE) | DBG_FUNC_START, - (int)lck, lck->lck_rw_shared_cnt, i, 0, 0); - -#if CONFIG_DTRACE - readers_at_sleep = lck->lck_rw_shared_cnt; - if ((lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] || lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK]) && wait_interval == 0) { - wait_interval = mach_absolute_time(); - } else { - wait_interval = (unsigned) -1; - } -#endif - if (i != 0) { - lck_rw_ilk_unlock(lck); - while (--i != 0 && lck->lck_rw_shared_cnt != 0) - continue; - lck_rw_ilk_lock(lck); - } - - if (lck->lck_rw_shared_cnt != 0) { - lck->lck_rw_waiting = TRUE; - res = assert_wait((event_t)(((unsigned int*)lck)+((sizeof(lck_rw_t)-1)/sizeof(unsigned int))), THREAD_UNINT); - if (res == THREAD_WAITING) { - lck_rw_ilk_unlock(lck); - res = thread_block(THREAD_CONTINUE_NULL); -#if CONFIG_DTRACE - slept = 1; -#endif - lck_rw_ilk_lock(lck); - } - } - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX1_CODE) | DBG_FUNC_END, - (int)lck, lck->lck_rw_shared_cnt, 0, 0, 0); - } - - lck_rw_ilk_unlock(lck); - -#if CONFIG_DTRACE - /* - * We infer if we took a sleep or spin path by whether readers_at_sleep - * was set. 
- */ - if (wait_interval != 0 && wait_interval != (unsigned) -1 && readers_at_sleep) { - if (slept == 0) { - LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0); - } else { - LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck, - mach_absolute_time() - wait_interval, 1, - (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep); - } - } -#endif - - LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1); - return (TRUE); -} - -/* - * Routine: lck_rw_lock_exclusive_to_shared_gen - */ -void -lck_rw_lock_exclusive_to_shared_gen( - lck_rw_t *lck) -{ - boolean_t do_wakeup = FALSE; - - lck_rw_ilk_lock(lck); - - lck->lck_rw_shared_cnt++; - if (lck->lck_rw_want_upgrade) - lck->lck_rw_want_upgrade = FALSE; - else - lck->lck_rw_want_excl = FALSE; - - if (lck->lck_rw_waiting) { - lck->lck_rw_waiting = FALSE; - do_wakeup = TRUE; - } - - lck_rw_ilk_unlock(lck); - - if (do_wakeup) - thread_wakeup((event_t)(((unsigned int*)lck)+((sizeof(lck_rw_t)-1)/sizeof(unsigned int)))); - - LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0); -} - - -/* - * Routine: lck_rw_try_lock_exclusive_gen - * Function: - * Tries to get a write lock. - * - * Returns FALSE if the lock is not held on return. - */ - -boolean_t -lck_rw_try_lock_exclusive_gen( - lck_rw_t *lck) -{ - lck_rw_ilk_lock(lck); - - if (lck->lck_rw_want_excl || lck->lck_rw_want_upgrade || lck->lck_rw_shared_cnt) { - /* - * Can't get lock. - */ - lck_rw_ilk_unlock(lck); - return(FALSE); - } - - /* - * Have lock. - */ - - lck->lck_rw_want_excl = TRUE; - - lck_rw_ilk_unlock(lck); - - LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lck, 1); - return(TRUE); -} - -/* - * Routine: lck_rw_try_lock_shared_gen - * Function: - * Tries to get a read lock. - * - * Returns FALSE if the lock is not held on return. 
- */ - -boolean_t -lck_rw_try_lock_shared_gen( - lck_rw_t *lck) -{ - lck_rw_ilk_lock(lck); - - if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) && - ((lck->lck_rw_shared_cnt == 0) || (lck->lck_rw_priv_excl))) { - lck_rw_ilk_unlock(lck); - return(FALSE); - } - - lck->lck_rw_shared_cnt++; - - lck_rw_ilk_unlock(lck); - - LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lck, 0); - return(TRUE); -} - - -/* - * Routine: lck_rw_ext_backtrace - */ -void -lck_rw_ext_backtrace( - lck_rw_ext_t *lck) -{ - unsigned int *stackptr, *stackptr_prev; - unsigned int frame; - - __asm__ volatile("mr %0,r1" : "=r" (stackptr)); - frame = 0; - while (frame < LCK_FRAMES_MAX) { - stackptr_prev = stackptr; - stackptr = ( unsigned int *)*stackptr; - if ( (((unsigned int)stackptr_prev) - ((unsigned int)stackptr)) > 8192) - break; - lck->lck_rw_deb.stack[frame] = *(stackptr+2); - frame++; - } - while (frame < LCK_FRAMES_MAX) { - lck->lck_rw_deb.stack[frame] = 0; - frame++; - } -} - - -/* - * Routine: lck_rw_lock_exclusive_ext - */ -void -lck_rw_lock_exclusive_ext( - lck_rw_ext_t *lck, - lck_rw_t *rlck) -{ - int i; - wait_result_t res; - boolean_t lock_miss = FALSE; - boolean_t lock_wait = FALSE; - boolean_t lock_stat; -#if CONFIG_DTRACE - uint64_t wait_interval = 0; - int slept = 0; - int readers_at_sleep; -#endif - - lck_rw_check_type(lck, rlck); - - if ( ((lck->lck_rw_attr & (LCK_RW_ATTR_DEBUG|LCK_RW_ATTR_DIS_MYLOCK)) == LCK_RW_ATTR_DEBUG) - && (lck->lck_rw_deb.thread == current_thread())) - panic("rw lock (%p) recursive lock attempt\n", rlck); - - lck_rw_ilk_lock(&lck->lck_rw); -#if CONFIG_DTRACE - readers_at_sleep = lck->lck_rw.lck_rw_shared_cnt; -#endif - - lock_stat = (lck->lck_rw_attr & LCK_RW_ATTR_STAT) ? TRUE : FALSE; - - if (lock_stat) - lck->lck_rw_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_util_cnt++; - - /* - * Try to acquire the lck_rw.lck_rw_want_excl bit. 
- */ - while (lck->lck_rw.lck_rw_want_excl) { - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE_CODE) | DBG_FUNC_START, (int)rlck, 0, 0, 0, 0); - - if (lock_stat && !lock_miss) { - lock_miss = TRUE; - lck->lck_rw_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_miss_cnt++; - } -#if CONFIG_DTRACE - if ((lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] || lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK]) && wait_interval == 0) { - wait_interval = mach_absolute_time(); - } else { - wait_interval = (unsigned) -1; - } -#endif - - i = lock_wait_time[1]; - if (i != 0) { - lck_rw_ilk_unlock(&lck->lck_rw); - while (--i != 0 && lck->lck_rw.lck_rw_want_excl) - continue; - lck_rw_ilk_lock(&lck->lck_rw); - } - - if (lck->lck_rw.lck_rw_want_excl) { - lck->lck_rw.lck_rw_waiting = TRUE; - res = assert_wait((event_t)(((unsigned int*)rlck)+((sizeof(lck_rw_t)-1)/sizeof(unsigned int))), THREAD_UNINT); - if (res == THREAD_WAITING) { - if (lock_stat && !lock_wait) { - lock_wait = TRUE; - lck->lck_rw_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_wait_cnt++; - } - lck_rw_ilk_unlock(&lck->lck_rw); - res = thread_block(THREAD_CONTINUE_NULL); -#if CONFIG_DTRACE - slept = 1; -#endif - lck_rw_ilk_lock(&lck->lck_rw); - } - } - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE_CODE) | DBG_FUNC_END, (int)rlck, res, 0, 0, 0); - } - lck->lck_rw.lck_rw_want_excl = TRUE; - - /* Wait for readers (and upgrades) to finish */ - - while ((lck->lck_rw.lck_rw_shared_cnt != 0) || lck->lck_rw.lck_rw_want_upgrade) { - i = lock_wait_time[1]; - - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE1_CODE) | DBG_FUNC_START, - (int)rlck, lck->lck_rw.lck_rw_shared_cnt, lck->lck_rw.lck_rw_want_upgrade, i, 0); -#if CONFIG_DTRACE - if ((lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] || lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK]) && wait_interval == 0) { - wait_interval = mach_absolute_time(); - } else { - wait_interval = (unsigned) -1; - } -#endif - - if (lock_stat && !lock_miss) { - lock_miss = 
TRUE; - lck->lck_rw_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_miss_cnt++; - } - - if (i != 0) { - lck_rw_ilk_unlock(&lck->lck_rw); - while (--i != 0 && (lck->lck_rw.lck_rw_shared_cnt != 0 || - lck->lck_rw.lck_rw_want_upgrade)) - continue; - lck_rw_ilk_lock(&lck->lck_rw); - } - - if (lck->lck_rw.lck_rw_shared_cnt != 0 || lck->lck_rw.lck_rw_want_upgrade) { - lck->lck_rw.lck_rw_waiting = TRUE; - res = assert_wait((event_t)(((unsigned int*)rlck)+((sizeof(lck_rw_t)-1)/sizeof(unsigned int))), THREAD_UNINT); - if (res == THREAD_WAITING) { - if (lock_stat && !lock_wait) { - lock_wait = TRUE; - lck->lck_rw_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_wait_cnt++; - } - lck_rw_ilk_unlock(&lck->lck_rw); - res = thread_block(THREAD_CONTINUE_NULL); -#if CONFIG_DTRACE - slept = 1; -#endif - lck_rw_ilk_lock(&lck->lck_rw); - } - } - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE1_CODE) | DBG_FUNC_END, - (int)rlck, lck->lck_rw.lck_rw_shared_cnt, lck->lck_rw.lck_rw_want_upgrade, res, 0); - } - - lck->lck_rw_deb.pc_excl = __builtin_return_address(0); - if (LcksOpts & enaLkExtStck) - lck_rw_ext_backtrace(lck); - lck->lck_rw_deb.thread = current_thread(); - - lck_rw_ilk_unlock(&lck->lck_rw); -#if CONFIG_DTRACE - /* - * Decide what latencies we suffered that are Dtrace events. - * If we have set wait_interval, then we either spun or slept. - * At least we get out from under the interlock before we record - * which is the best we can do here to minimize the impact - * of the tracing. - */ - if (wait_interval != 0 && wait_interval != (unsigned) -1) { - if (slept == 0) { - LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_EXCL_SPIN, lck, - mach_absolute_time() - wait_interval, 1); - } else { - /* - * For the blocking case, we also record if when we blocked - * it was held for read or write, and how many readers. - * Notice that above we recorded this before we dropped - * the interlock so the count is accurate. 
- */ - LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_EXCL_BLOCK, lck, - mach_absolute_time() - wait_interval, 1, - (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep); - } - } - LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1); -#endif -} - - -/* - * Routine: lck_rw_done_ext - */ -lck_rw_type_t -lck_rw_done_ext( - lck_rw_ext_t *lck, - lck_rw_t *rlck) -{ - boolean_t do_wakeup = FALSE; - lck_rw_type_t lck_rw_type; - - - lck_rw_check_type(lck, rlck); - - lck_rw_ilk_lock(&lck->lck_rw); - - if (lck->lck_rw.lck_rw_shared_cnt != 0) { - lck_rw_type = LCK_RW_TYPE_SHARED; - lck->lck_rw.lck_rw_shared_cnt--; - } - else { - lck_rw_type = LCK_RW_TYPE_EXCLUSIVE; - if (lck->lck_rw.lck_rw_want_upgrade) - lck->lck_rw.lck_rw_want_upgrade = FALSE; - else if (lck->lck_rw.lck_rw_want_excl) - lck->lck_rw.lck_rw_want_excl = FALSE; - else - panic("rw lock (%p) bad state (0x%08X) on attempt to release a shared or exlusive right\n", - rlck, lck->lck_rw.lck_rw_tag); - if (lck->lck_rw_deb.thread == THREAD_NULL) - panic("rw lock (%p) not held\n", - rlck); - else if ( ((lck->lck_rw_attr & (LCK_RW_ATTR_DEBUG|LCK_RW_ATTR_DIS_THREAD)) == LCK_RW_ATTR_DEBUG) - && (lck->lck_rw_deb.thread != current_thread())) - panic("rw lock (%p) unlocked by non-owner(%p), current owner(%p)\n", - rlck, current_thread(), lck->lck_rw_deb.thread); - lck->lck_rw_deb.thread = THREAD_NULL; - } - - if (lck->lck_rw_attr & LCK_RW_ATTR_DEBUG) - lck->lck_rw_deb.pc_done = __builtin_return_address(0); - - /* - * There is no reason to wakeup a waiting thread - * if the read-count is non-zero. 
Consider: - * we must be dropping a read lock - * threads are waiting only if one wants a write lock - * if there are still readers, they can't proceed - */ - - if (lck->lck_rw.lck_rw_waiting && (lck->lck_rw.lck_rw_shared_cnt == 0)) { - lck->lck_rw.lck_rw_waiting = FALSE; - do_wakeup = TRUE; - } - - lck_rw_ilk_unlock(&lck->lck_rw); - - if (do_wakeup) - thread_wakeup((event_t)(((unsigned int*)rlck)+((sizeof(lck_rw_t)-1)/sizeof(unsigned int)))); - LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lck_rw_type); - return(lck_rw_type); -} - - -/* - * Routine: lck_rw_lock_shared_ext - */ -void -lck_rw_lock_shared_ext( - lck_rw_ext_t *lck, - lck_rw_t *rlck) -{ - int i; - wait_result_t res; - boolean_t lock_miss = FALSE; - boolean_t lock_wait = FALSE; - boolean_t lock_stat; -#if CONFIG_DTRACE - uint64_t wait_interval = 0; - int slept = 0; - int readers_at_sleep; -#endif - - lck_rw_check_type(lck, rlck); - - lck_rw_ilk_lock(&lck->lck_rw); -#if CONFIG_DTRACE - readers_at_sleep = lck->lck_rw.lck_rw_shared_cnt; -#endif - - lock_stat = (lck->lck_rw_attr & LCK_RW_ATTR_STAT) ? 
TRUE : FALSE; - - if (lock_stat) - lck->lck_rw_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_util_cnt++; - - while ((lck->lck_rw.lck_rw_want_excl || lck->lck_rw.lck_rw_want_upgrade) && - ((lck->lck_rw.lck_rw_shared_cnt == 0) || (lck->lck_rw.lck_rw_priv_excl))) { - i = lock_wait_time[1]; - - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_CODE) | DBG_FUNC_START, - (int)rlck, lck->lck_rw.lck_rw_want_excl, lck->lck_rw.lck_rw_want_upgrade, i, 0); -#if CONFIG_DTRACE - if ((lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] || lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK]) && wait_interval == 0) { - wait_interval = mach_absolute_time(); - } else { - wait_interval = (unsigned) -1; - } -#endif - - if (lock_stat && !lock_miss) { - lock_miss = TRUE; - lck->lck_rw_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_miss_cnt++; - } - - if (i != 0) { - lck_rw_ilk_unlock(&lck->lck_rw); - while (--i != 0 && - (lck->lck_rw.lck_rw_want_excl || lck->lck_rw.lck_rw_want_upgrade) && - ((lck->lck_rw.lck_rw_shared_cnt == 0) || (lck->lck_rw.lck_rw_priv_excl))) - continue; - lck_rw_ilk_lock(&lck->lck_rw); - } - - if ((lck->lck_rw.lck_rw_want_excl || lck->lck_rw.lck_rw_want_upgrade) && - ((lck->lck_rw.lck_rw_shared_cnt == 0) || (lck->lck_rw.lck_rw_priv_excl))) { - lck->lck_rw.lck_rw_waiting = TRUE; - res = assert_wait((event_t)(((unsigned int*)rlck)+((sizeof(lck_rw_t)-1)/sizeof(unsigned int))), THREAD_UNINT); - if (res == THREAD_WAITING) { - if (lock_stat && !lock_wait) { - lock_wait = TRUE; - lck->lck_rw_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_wait_cnt++; - } - lck_rw_ilk_unlock(&lck->lck_rw); - res = thread_block(THREAD_CONTINUE_NULL); -#if CONFIG_DTRACE - slept = 1; -#endif - lck_rw_ilk_lock(&lck->lck_rw); - } - } - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_CODE) | DBG_FUNC_END, - (int)rlck, lck->lck_rw.lck_rw_want_excl, lck->lck_rw.lck_rw_want_upgrade, res, 0); - } - - lck->lck_rw.lck_rw_shared_cnt++; - - lck_rw_ilk_unlock(&lck->lck_rw); -#if CONFIG_DTRACE - if 
(wait_interval != 0 && wait_interval != (unsigned) -1) { - if (slept == 0) { - LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0); - } else { - LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_BLOCK, lck, - mach_absolute_time() - wait_interval, 0, - (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep); - } - } - LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0); -#endif -} - - -/* - * Routine: lck_rw_lock_shared_to_exclusive_ext - * Function: - * Improves a read-only lock to one with - * write permission. If another reader has - * already requested an upgrade to a write lock, - * no lock is held upon return. - * - * Returns FALSE if the upgrade *failed*. - */ - -boolean_t -lck_rw_lock_shared_to_exclusive_ext( - lck_rw_ext_t *lck, - lck_rw_t *rlck) -{ - int i; - boolean_t do_wakeup = FALSE; - wait_result_t res; - boolean_t lock_miss = FALSE; - boolean_t lock_wait = FALSE; - boolean_t lock_stat; -#if CONFIG_DTRACE - uint64_t wait_interval = 0; - int slept = 0; -#endif - - lck_rw_check_type(lck, rlck); - - if (lck->lck_rw_deb.thread == current_thread()) - panic("rw lock (%p) recursive lock attempt\n", rlck); - - lck_rw_ilk_lock(&lck->lck_rw); - - lock_stat = (lck->lck_rw_attr & LCK_RW_ATTR_STAT) ? TRUE : FALSE; - - if (lock_stat) - lck->lck_rw_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_util_cnt++; - - lck->lck_rw.lck_rw_shared_cnt--; - - if (lck->lck_rw.lck_rw_want_upgrade) { - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_START, - (int)rlck, lck->lck_rw.lck_rw_shared_cnt, lck->lck_rw.lck_rw_want_upgrade, 0, 0); - - /* - * Someone else has requested upgrade. - * Since we've released a read lock, wake - * him up. 
- */ - if (lck->lck_rw.lck_rw_waiting && (lck->lck_rw.lck_rw_shared_cnt == 0)) { - lck->lck_rw.lck_rw_waiting = FALSE; - do_wakeup = TRUE; - } - - lck_rw_ilk_unlock(&lck->lck_rw); - - if (do_wakeup) - thread_wakeup((event_t)(((unsigned int*)rlck)+((sizeof(lck_rw_t)-1)/sizeof(unsigned int)))); - - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_END, - (int)rlck, lck->lck_rw.lck_rw_shared_cnt, lck->lck_rw.lck_rw_want_upgrade, 0, 0); - - return (FALSE); - } - - lck->lck_rw.lck_rw_want_upgrade = TRUE; - - while (lck->lck_rw.lck_rw_shared_cnt != 0) { - i = lock_wait_time[1]; - - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX1_CODE) | DBG_FUNC_START, - (int)rlck, lck->lck_rw.lck_rw_shared_cnt, i, 0, 0); - - if (lock_stat && !lock_miss) { - lock_miss = TRUE; - lck->lck_rw_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_miss_cnt++; - } -#if CONFIG_DTRACE - if ((lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] || lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK]) && wait_interval == 0) { - wait_interval = mach_absolute_time(); - } else { - wait_interval = (unsigned) -1; - } -#endif - - if (i != 0) { - lck_rw_ilk_unlock(&lck->lck_rw); - while (--i != 0 && lck->lck_rw.lck_rw_shared_cnt != 0) - continue; - lck_rw_ilk_lock(&lck->lck_rw); - } - - if (lck->lck_rw.lck_rw_shared_cnt != 0) { - lck->lck_rw.lck_rw_waiting = TRUE; - res = assert_wait((event_t)(((unsigned int*)rlck)+((sizeof(lck_rw_t)-1)/sizeof(unsigned int))), THREAD_UNINT); - if (res == THREAD_WAITING) { - if (lock_stat && !lock_wait) { - lock_wait = TRUE; - lck->lck_rw_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_wait_cnt++; - } - lck_rw_ilk_unlock(&lck->lck_rw); - res = thread_block(THREAD_CONTINUE_NULL); -#if CONFIG_DTRACE - slept = 1; -#endif - lck_rw_ilk_lock(&lck->lck_rw); - } - } - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX1_CODE) | DBG_FUNC_END, - (int)rlck, lck->lck_rw.lck_rw_shared_cnt, 0, 0, 0); - } - - lck->lck_rw_deb.pc_excl = 
__builtin_return_address(0); - if (LcksOpts & enaLkExtStck) - lck_rw_ext_backtrace(lck); - lck->lck_rw_deb.thread = current_thread(); - - lck_rw_ilk_unlock(&lck->lck_rw); - -#if CONFIG_DTRACE - /* - * If we've travelled a path with no spin or sleep, then wait_interval - * is still zero. - */ - if (wait_interval != 0 && wait_interval != (unsigned) -1) { - if (slept == 0) { - LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0); - } else { - LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck, mach_absolute_time() - wait_interval, 0); - } - } -#endif - - LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1); - - return (TRUE); -} - -/* - * Routine: lck_rw_lock_exclusive_to_shared_ext - */ -void -lck_rw_lock_exclusive_to_shared_ext( - lck_rw_ext_t *lck, - lck_rw_t *rlck) -{ - boolean_t do_wakeup = FALSE; - - lck_rw_check_type(lck, rlck); - - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START, - (int)rlck, lck->lck_rw.lck_rw_want_excl, lck->lck_rw.lck_rw_want_upgrade, 0, 0); - - lck_rw_ilk_lock(&lck->lck_rw); - - lck->lck_rw.lck_rw_shared_cnt++; - if (lck->lck_rw.lck_rw_want_upgrade) - lck->lck_rw.lck_rw_want_upgrade = FALSE; - else if (lck->lck_rw.lck_rw_want_excl) - lck->lck_rw.lck_rw_want_excl = FALSE; - else - panic("rw lock (%p) bad state (0x%08X) on attempt to release a shared or exlusive right\n", - rlck, lck->lck_rw.lck_rw_tag); - if (lck->lck_rw_deb.thread == THREAD_NULL) - panic("rw lock (%p) not held\n", - rlck); - else if ( ((lck->lck_rw_attr & (LCK_RW_ATTR_DEBUG|LCK_RW_ATTR_DIS_THREAD)) == LCK_RW_ATTR_DEBUG) - && (lck->lck_rw_deb.thread != current_thread())) - panic("rw lock (%p) unlocked by non-owner(%p), current owner(%p)\n", - rlck, current_thread(), lck->lck_rw_deb.thread); - - lck->lck_rw_deb.thread = THREAD_NULL; - - if (lck->lck_rw.lck_rw_waiting) { - lck->lck_rw.lck_rw_waiting = FALSE; - do_wakeup = TRUE; - } - - lck_rw_ilk_unlock(&lck->lck_rw); - - if 
(do_wakeup) - thread_wakeup((event_t)(((unsigned int*)rlck)+((sizeof(lck_rw_t)-1)/sizeof(unsigned int)))); - - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END, - (int)rlck, lck->lck_rw.lck_rw_want_excl, lck->lck_rw.lck_rw_want_upgrade, lck->lck_rw.lck_rw_shared_cnt, 0); - - LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0); -} - - -/* - * Routine: lck_rw_try_lock_exclusive_ext - * Function: - * Tries to get a write lock. - * - * Returns FALSE if the lock is not held on return. - */ - -boolean_t -lck_rw_try_lock_exclusive_ext( - lck_rw_ext_t *lck, - lck_rw_t *rlck) -{ - boolean_t lock_stat; - - lck_rw_check_type(lck, rlck); - - lck_rw_ilk_lock(&lck->lck_rw); - - lock_stat = (lck->lck_rw_attr & LCK_RW_ATTR_STAT) ? TRUE : FALSE; - - if (lock_stat) - lck->lck_rw_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_util_cnt++; - - if (lck->lck_rw.lck_rw_want_excl || lck->lck_rw.lck_rw_want_upgrade || lck->lck_rw.lck_rw_shared_cnt) { - /* - * Can't get lock. - */ - if (lock_stat) { - lck->lck_rw_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_miss_cnt++; - } - lck_rw_ilk_unlock(&lck->lck_rw); - return(FALSE); - } - - /* - * Have lock. - */ - - lck->lck_rw.lck_rw_want_excl = TRUE; - lck->lck_rw_deb.pc_excl = __builtin_return_address(0); - if (LcksOpts & enaLkExtStck) - lck_rw_ext_backtrace(lck); - lck->lck_rw_deb.thread = current_thread(); - - lck_rw_ilk_unlock(&lck->lck_rw); - - LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lck, 1); - - return(TRUE); -} - -/* - * Routine: lck_rw_try_lock_shared_ext - * Function: - * Tries to get a read lock. - * - * Returns FALSE if the lock is not held on return. - */ - -boolean_t -lck_rw_try_lock_shared_ext( - lck_rw_ext_t *lck, - lck_rw_t *rlck) -{ - boolean_t lock_stat; - - lck_rw_check_type(lck, rlck); - - lck_rw_ilk_lock(&lck->lck_rw); - - lock_stat = (lck->lck_rw_attr & LCK_RW_ATTR_STAT) ? 
TRUE : FALSE; - - if (lock_stat) - lck->lck_rw_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_util_cnt++; - - if ((lck->lck_rw.lck_rw_want_excl || lck->lck_rw.lck_rw_want_upgrade) && - ((lck->lck_rw.lck_rw_shared_cnt == 0) || (lck->lck_rw.lck_rw_priv_excl))) { - if (lock_stat) { - lck->lck_rw_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_miss_cnt++; - } - lck_rw_ilk_unlock(&lck->lck_rw); - return(FALSE); - } - - lck->lck_rw.lck_rw_shared_cnt++; - - lck_rw_ilk_unlock(&lck->lck_rw); - - LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lck, 0); - - return(TRUE); -} - -void -lck_rw_check_type( - lck_rw_ext_t *lck, - lck_rw_t *rlck) -{ - if (lck->lck_rw_deb.type != RW_TAG) - panic("rw lock (%p) not a rw lock type (0x%08X)\n",rlck, lck->lck_rw_deb.type); -} - -void -lck_rw_assert_ext( - lck_rw_ext_t *lck, - lck_rw_t *rlck, - unsigned int type) -{ - lck_rw_check_type(lck, rlck); - - switch (type) { - case LCK_RW_ASSERT_SHARED: - if (lck->lck_rw.lck_rw_shared_cnt != 0) { - return; - } - break; - case LCK_RW_ASSERT_EXCLUSIVE: - if ((lck->lck_rw.lck_rw_want_excl || - lck->lck_rw.lck_rw_want_upgrade) && - lck->lck_rw.lck_rw_shared_cnt == 0) { - return; - } - break; - case LCK_RW_ASSERT_HELD: - if (lck->lck_rw.lck_rw_want_excl || - lck->lck_rw.lck_rw_want_upgrade || - lck->lck_rw.lck_rw_shared_cnt != 0) { - return; - } - break; - default: - break; - } - - panic("rw lock (%p -> %p) not held (mode=%u)\n", rlck, lck, type); -} - -void -lck_rw_assert( - lck_rw_t *lck, - unsigned int type) -{ - if (lck->lck_rw_tag != LCK_RW_TAG_INDIRECT) { - switch (type) { - case LCK_RW_ASSERT_SHARED: - if (lck->lck_rw_shared_cnt != 0) { - return; - } - break; - case LCK_RW_ASSERT_EXCLUSIVE: - if (lck->lck_rw_shared_cnt == 0 && - (lck->lck_rw_want_excl || - lck->lck_rw_want_upgrade)) { - return; - } - break; - case LCK_RW_ASSERT_HELD: - if (lck->lck_rw_shared_cnt != 0 || - lck->lck_rw_want_excl || - lck->lck_rw_want_upgrade) { - return; - } - break; - default: - break; - } - panic("rw lock (%p) not 
held (mode=%u)\n", lck, type); - } else { - lck_rw_assert_ext((lck_rw_ext_t *)lck->lck_rw_ptr, - (lck_rw_t *)lck, - type); - } -} - -/* - * The C portion of the mutex package. These routines are only invoked - * if the optimized assembler routines can't do the work. - */ - -/* - * Forward definition - */ - -void lck_mtx_ext_init( - lck_mtx_ext_t *lck, - lck_grp_t *grp, - lck_attr_t *attr); - -/* - * Routine: lck_mtx_alloc_init - */ -lck_mtx_t * -lck_mtx_alloc_init( - lck_grp_t *grp, - lck_attr_t *attr) { - lck_mtx_t *lck; - - if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0) - lck_mtx_init(lck, grp, attr); - - return(lck); -} - -/* - * Routine: lck_mtx_free - */ -void -lck_mtx_free( - lck_mtx_t *lck, - lck_grp_t *grp) { - lck_mtx_destroy(lck, grp); - kfree((void *)lck, sizeof(lck_mtx_t)); -} - -/* - * Routine: lck_mtx_init - */ -void -lck_mtx_init( - lck_mtx_t *lck, - lck_grp_t *grp, - lck_attr_t *attr) { - lck_mtx_ext_t *lck_ext; - lck_attr_t *lck_attr; - - if (attr != LCK_ATTR_NULL) - lck_attr = attr; - else - lck_attr = &LockDefaultLckAttr; - - if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) { - if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) { - lck_mtx_ext_init(lck_ext, grp, lck_attr); - lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT; - lck->lck_mtx_ptr = lck_ext; - } - } else { - lck->lck_mtx_data = 0; - lck->lck_mtx_waiters = 0; - lck->lck_mtx_pri = 0; - } - lck_grp_reference(grp); - lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX); -} - -/* - * Routine: lck_mtx_init_ext - */ -void -lck_mtx_init_ext( - lck_mtx_t *lck, - lck_mtx_ext_t *lck_ext, - lck_grp_t *grp, - lck_attr_t *attr) -{ - lck_attr_t *lck_attr; - - if (attr != LCK_ATTR_NULL) - lck_attr = attr; - else - lck_attr = &LockDefaultLckAttr; - - if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) { - lck_mtx_ext_init(lck_ext, grp, lck_attr); - lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT; - lck->lck_mtx_ptr = lck_ext; - } else { - lck->lck_mtx_data = 0; - lck->lck_mtx_waiters = 0; - lck->lck_mtx_pri = 
0; - } - lck_grp_reference(grp); - lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX); -} - -/* - * Routine: lck_mtx_ext_init - */ -void -lck_mtx_ext_init( - lck_mtx_ext_t *lck, - lck_grp_t *grp, - lck_attr_t *attr) { - - bzero((void *)lck, sizeof(lck_mtx_ext_t)); - - if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) { - lck->lck_mtx_deb.type = MUTEX_TAG; - lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG; - } - - lck->lck_mtx_grp = grp; - - if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT) - lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT; -} - -/* - * Routine: lck_mtx_destroy - */ -void -lck_mtx_destroy( - lck_mtx_t *lck, - lck_grp_t *grp) { - boolean_t lck_is_indirect; - - if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) - return; - lck_is_indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT); - lck->lck_mtx_tag = LCK_MTX_TAG_DESTROYED; - if (lck_is_indirect) - kfree((void *)lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t)); - - lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX); - lck_grp_deallocate(grp); - return; -} - - -#if MACH_KDB -/* - * Routines to print out simple_locks and mutexes in a nicely-formatted - * fashion. 
- */ - -const char *simple_lock_labels = "ENTRY ILK THREAD DURATION CALLER"; - -void db_print_simple_lock( - simple_lock_t addr); - -void -db_show_one_simple_lock (db_expr_t addr, boolean_t have_addr, - __unused db_expr_t count, - __unused char *modif) -{ - simple_lock_t saddr = (simple_lock_t)(unsigned long)addr; - - if (saddr == (simple_lock_t)0 || !have_addr) { - db_error ("No simple_lock\n"); - } -#if USLOCK_DEBUG - else if (saddr->lock_type != USLOCK_TAG) - db_error ("Not a simple_lock\n"); -#endif /* USLOCK_DEBUG */ - - db_printf ("%s\n", simple_lock_labels); - db_print_simple_lock (saddr); -} - -void -db_print_simple_lock ( - simple_lock_t addr) -{ - - db_printf ("%08x %3d", addr, *hw_lock_addr(addr->interlock)); -#if USLOCK_DEBUG - db_printf (" %08x", addr->debug.lock_thread); - db_printf (" %08x ", addr->debug.duration[1]); - db_printsym ((int)addr->debug.lock_pc, DB_STGY_ANY); -#endif /* USLOCK_DEBUG */ - db_printf ("\n"); -} - -void -db_show_one_lock( - lock_t *lock) -{ - db_printf("shared_count = 0x%x, %swant_upgrade, %swant_exclusive, ", - lock->lck_rw.lck_rw_shared_cnt, - lock->lck_rw.lck_rw_want_upgrade ? "" : "!", - lock->lck_rw.lck_rw_want_excl ? "" : "!"); - db_printf("%swaiting\n", - lock->lck_rw.lck_rw_waiting ? "" : "!"); - db_printf("%sInterlock\n", - lock->lck_rw.lck_rw_interlock ? "" : "!"); -} - -#endif /* MACH_KDB */ - diff --git a/osfmk/ppc/low_trace.h b/osfmk/ppc/low_trace.h deleted file mode 100644 index e5a81d7a9..000000000 --- a/osfmk/ppc/low_trace.h +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -/* - * - * These are the structures and constants used for the low-level trace - */ - - - - - - -#ifndef _LOW_TRACE_H_ -#define _LOW_TRACE_H_ - -#pragma pack(4) /* Make sure the structure stays as we defined it */ -typedef struct LowTraceRecord { - - unsigned short LTR_cpu; /* 0000 - CPU address */ - unsigned short LTR_excpt; /* 0002 - Exception code */ - unsigned int LTR_timeHi; /* 0004 - High order time */ - unsigned int LTR_timeLo; /* 0008 - Low order time */ - unsigned int LTR_cr; /* 000C - CR */ - unsigned int LTR_dsisr; /* 0010 - DSISR */ - unsigned int LTR_rsvd0; /* 0014 - reserved */ - uint64_t LTR_srr0; /* 0018 - SRR0 */ - - uint64_t LTR_srr1; /* 0020 - SRR1 */ - uint64_t LTR_dar; /* 0028 - DAR */ - uint64_t LTR_save; /* 0030 - savearea */ - uint64_t LTR_lr; /* 0038 - LR */ - - uint64_t LTR_ctr; /* 0040 - CTR */ - uint64_t LTR_r0; /* 0048 - R0 */ - uint64_t LTR_r1; /* 0050 - R1 */ - uint64_t LTR_r2; /* 0058 - R2 */ - - uint64_t LTR_r3; /* 0060 - R3 */ - 
uint64_t LTR_r4; /* 0068 - R4 */ - uint64_t LTR_r5; /* 0070 - R5 */ - uint64_t LTR_r6; /* 0078 - R6 */ - -} LowTraceRecord; -#pragma pack() - -#pragma pack(4) /* Make sure the structure stays as we defined it */ -typedef struct traceWork { - - unsigned int traceCurr; /* Address of next slot */ - unsigned int traceMask; /* Types to be traced */ - unsigned int traceStart; /* Start of trace table */ - unsigned int traceEnd; /* End of trace table */ - unsigned int traceMsnd; /* Saved trace mask */ - unsigned int traceSize; /* Size of trace table. Min 1 page */ - unsigned int traceGas[2]; -} traceWork; -#pragma pack() - -extern traceWork trcWork; -extern unsigned int lastTrace; /* Value of low-level exception trace controls */ - - -#endif /* ifndef _LOW_TRACE_H_ */ diff --git a/osfmk/ppc/lowglobals.h b/osfmk/ppc/lowglobals.h deleted file mode 100644 index 20503a89b..000000000 --- a/osfmk/ppc/lowglobals.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Header files for the Low Memory Globals (lg) - */ -#ifndef _LOW_MEMORY_GLOBALS_H_ -#define _LOW_MEMORY_GLOBALS_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Don't change these structures unless you change the corresponding assembly code - * which is in lowmem_vectors.s - */ - -/* - * This is where we put constants, pointers, and data areas that must be accessed - * quickly through assembler. They are designed to be accessed directly with - * absolute addresses, not via a base register. This is a global area, and not - * per processor. 
- */ - -#pragma pack(4) /* Make sure the structure stays as we defined it */ -typedef struct lowglo { - - unsigned long lgForceAddr[5*1024]; /* 0000 Force to page 5 */ - unsigned char lgVerCode[8]; /* 5000 System verification code */ - unsigned long long lgZero; /* 5008 Double constant 0 */ - unsigned int lgPPStart; /* 5010 Start of per_proc blocks */ - unsigned int lgCHUDXNUfnStart; /* 5014 CHUD XNU function glue table */ - unsigned int lgMckFlags; /* 5018 Machine check flags */ - unsigned int lgVersion; /* 501C Pointer to kernel version string */ - uint64_t lgPMWvaddr; /* 5020 physical memory window virtual address */ - uint64_t lgUMWvaddr; /* 5028 user memory window virtual address */ - unsigned int lgVMMforcedFeats; /* 5030 VMM boot-args forced feature flags */ - unsigned int lgMaxDec; /* 5034 Maximum decrementer we can set */ - unsigned int lgPmsCtlp; /* 5038 Pointer to power management stepper control */ - unsigned int lgRsv03C[17]; /* 503C reserved */ - traceWork lgTrcWork; /* 5080 Tracing control block - trcWork */ - unsigned int lgRsv0A0[24]; /* 50A0 reserved */ - struct Saveanchor lgSaveanchor; /* 5100 Savearea anchor - saveanchor */ - unsigned int lgRsv140[16]; /* 5140 reserved */ - unsigned int lgTlbieLck; /* 5180 TLBIE lock */ - unsigned int lgRsv184[31]; /* 5184 reserved - push to next line */ - struct diagWork lgdgWork; /* 5200 Start of diagnostic work area */ - unsigned int lglcksWork; /* 5220 lcks option */ - unsigned int lgRsv224[23]; /* 5224 reserved */ - pcfg lgpPcfg[8]; /* 5280 Page configurations */ - unsigned int lgRst2A0[24]; /* 52A0 reserved */ - unsigned int lgKillResv; /* 5300 line used to kill reservations */ - unsigned int lgKillResvpad[31]; /* 5304 pad reservation kill line */ - - unsigned int lgRsv380[32]; /* 5380 - 5400 reserved */ - - unsigned int lgRsv400[32]; /* 5400 - 5480 reserved */ - uint32_t lgKmodptr; /* 0x5480 Pointer to kmod, debugging aid */ - uint32_t lgTransOff; /* 0x5484 Pointer to kdp_trans_off, debugging aid */ - 
uint32_t lgReadIO; /* 0x5488 Pointer to kdp_read_io, debugging aid */ - uint32_t lgDevSlot1; /* 0x548C For developer use */ - uint32_t lgDevSlot2; /* 0x5490 For developer use */ - uint32_t lgOSVersion; /* 0x5494 Pointer to OS version string */ - uint32_t lgRebootFlag; /* 0x5498 Pointer to debugger reboot trigger */ - uint32_t lgManualPktAddr; /* 0x549C Pointer to manual packet structure */ - uint32_t lgRsv49C[728]; /* 0x54A0 Reserved - push to 1 page */ -} lowglo; - -extern lowglo lowGlo; - -#endif /* _LOW_MEMORY_GLOBALS_H_ */ diff --git a/osfmk/ppc/lowmem_vectors.s b/osfmk/ppc/lowmem_vectors.s deleted file mode 100644 index 5e2893d36..000000000 --- a/osfmk/ppc/lowmem_vectors.s +++ /dev/null @@ -1,4010 +0,0 @@ -/* - * Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#define ESPDEBUG 0 -#define INSTRUMENT 0 - -#define featAltivec 29 -#define wasNapping 30 - -#define VECTOR_SEGMENT .section __VECTORS, __interrupts - - VECTOR_SEGMENT - - .globl EXT(lowGlo) -EXT(lowGlo): - - .globl EXT(ExceptionVectorsStart) - -EXT(ExceptionVectorsStart): /* Used if relocating the exception vectors */ -baseR: /* Used so we have more readable code */ - -; -; Handle system reset. -; We do not ever expect a hard reset so we do not actually check. -; When we come here, we check for a RESET_HANDLER_START (which means we are -; waking up from sleep), a RESET_HANDLER_BUPOR (which is using for bring up -; when starting directly from a POR), and RESET_HANDLER_IGNORE (which means -; ignore the interrupt). -; -; Some machines (so far, 32-bit guys) will always ignore a non-START interrupt. -; The ones who do take it, check if the interrupt is too be ignored. This is -; always the case until the previous reset is handled (i.e., we have exited -; from the debugger). -; - . = 0xf0 - .globl EXT(ResetHandler) -EXT(ResetHandler): - .long 0x0 - .long 0x0 - .long 0x0 - - . 
= 0x100 -.L_handler100: - mtsprg 2,r13 /* Save R13 */ - mtsprg 3,r11 /* Save R11 */ - lwz r13,lo16(EXT(ResetHandler)-EXT(ExceptionVectorsStart)+RESETHANDLER_TYPE)(br0) ; Get reset type - mfcr r11 - cmpi cr0,r13,RESET_HANDLER_START - bne resetexc - - li r11,RESET_HANDLER_NULL - stw r11,lo16(EXT(ResetHandler)-EXT(ExceptionVectorsStart)+RESETHANDLER_TYPE)(br0) ; Clear reset type - - lwz r4,lo16(EXT(ResetHandler)-EXT(ExceptionVectorsStart)+RESETHANDLER_CALL)(br0) - lwz r3,lo16(EXT(ResetHandler)-EXT(ExceptionVectorsStart)+RESETHANDLER_ARG)(br0) - mtlr r4 - blr - -resetexc: cmplwi r13,RESET_HANDLER_BUPOR ; Special bring up POR sequence? - bne resetexc2 ; No... - lis r4,hi16(EXT(resetPOR)) ; Get POR code - ori r4,r4,lo16(EXT(resetPOR)) ; The rest - mtlr r4 ; Set it - blr ; Jump to it.... - -resetexc2: cmplwi cr1,r13,RESET_HANDLER_IGNORE ; Are we ignoring these? (Software debounce) - - mfsprg r13,0 ; Get per_proc - lwz r13,pfAvailable(r13) ; Get the features - rlwinm. r13,r13,0,pf64Bitb,pf64Bitb ; Is this a 64-bit machine? - cror cr1_eq,cr0_eq,cr1_eq ; See if we want to take this - bne-- cr1,rxCont ; Yes, continue... - bne-- rxIg64 ; 64-bit path... 
- - mtcr r11 ; Restore the CR - mfsprg r13,2 ; Restore R13 - mfsprg r11,0 ; Get per_proc - lwz r11,pfAvailable(r11) ; Get the features - mtsprg 2,r11 ; Restore sprg2 - mfsprg r11,3 ; Restore R11 - rfi ; Return and ignore the reset - -rxIg64: mtcr r11 ; Restore the CR - mfsprg r11,0 ; Get per_proc - mtspr hsprg0,r14 ; Save a register - ld r14,UAW(r11) ; Get the User Assist DoubleWord - mfsprg r13,2 ; Restore R13 - lwz r11,pfAvailable(r11) ; Get the features - mtsprg 2,r11 ; Restore sprg2 - mfsprg r11,3 ; Restore R11 - mtsprg 3,r14 ; Set the UAW in sprg3 - mfspr r14,hsprg0 ; Restore R14 - rfid ; Return and ignore the reset - -rxCont: mtcr r11 - li r11,RESET_HANDLER_IGNORE ; Get set to ignore - stw r11,lo16(EXT(ResetHandler)-EXT(ExceptionVectorsStart)+RESETHANDLER_TYPE)(br0) ; Start ignoring these - mfsprg r13,1 /* Get the exception save area */ - li r11,T_RESET /* Set 'rupt code */ - b .L_exception_entry /* Join common... */ - -/* - * Machine check - */ - - . = 0x200 -.L_handler200: - mtsprg 2,r13 ; Save R13 - mtsprg 3,r11 ; Save R11 - - .globl EXT(extPatchMCK) -LEXT(extPatchMCK) ; This is patched to a nop for 64-bit - b h200aaa ; Skip 64-bit code... - -; -; Fall through here for 970 MCKs. -; - - li r11,1 ; ? - sldi r11,r11,32+3 ; ? - mfspr r13,hid4 ; ? - or r11,r11,r13 ; ? - sync - mtspr hid4,r11 ; ? - isync - li r11,1 ; ? - sldi r11,r11,32+8 ; ? - andc r13,r13,r11 ; ? - lis r11,0xE000 ; Get the unlikeliest ESID possible - sync - mtspr hid4,r13 ; ? - isync ; ? - - srdi r11,r11,1 ; ? - slbie r11 ; ? - sync - isync - - li r11,T_MACHINE_CHECK ; Set rupt code - b .L_exception_entry ; Join common... - -; -; Preliminary checking of other MCKs -; - -h200aaa: mfsrr1 r11 ; Get the SRR1 - mfcr r13 ; Save the CR - - rlwinm. r11,r11,0,dcmck,dcmck ; ? - beq+ notDCache ; ? - - sync - mfspr r11,msscr0 ; ? - dssall ; ? - sync - isync - - oris r11,r11,hi16(dl1hwfm) ; ? - mtspr msscr0,r11 ; ? - -rstbsy: mfspr r11,msscr0 ; ? - - rlwinm. r11,r11,0,dl1hwf,dl1hwf ; ? - bne rstbsy ; ? 
- - sync ; ? - - mfsprg r11,0 ; Get the per_proc - mtcrf 255,r13 ; Restore CRs - lwz r13,hwMachineChecks(r11) ; Get old count - addi r13,r13,1 ; Count this one - stw r13,hwMachineChecks(r11) ; Set new count - lwz r11,pfAvailable(r11) ; Get the feature flags - mfsprg r13,2 ; Restore R13 - mtsprg 2,r11 ; Set the feature flags - mfsprg r11,3 ; Restore R11 - rfi ; Return - -notDCache: mtcrf 255,r13 ; Restore CRs - li r11,T_MACHINE_CHECK ; Set rupt code - b .L_exception_entry ; Join common... - - -/* - * Data access - page fault, invalid memory rights for operation - */ - - . = 0x300 -.L_handler300: - mtsprg 2,r13 /* Save R13 */ - mtsprg 3,r11 /* Save R11 */ - li r11,T_DATA_ACCESS /* Set 'rupt code */ - b .L_exception_entry /* Join common... */ - - -/* - * Data segment - */ - - . = 0x380 -.L_handler380: - mtsprg 2,r13 ; Save R13 - mtsprg 3,r11 ; Save R11 - li r11,T_DATA_SEGMENT ; Set rupt code - b .L_exception_entry ; Join common... - -/* - * Instruction access - as for data access - */ - - . = 0x400 -.L_handler400: - mtsprg 2,r13 ; Save R13 - mtsprg 3,r11 ; Save R11 - li r11,T_INSTRUCTION_ACCESS ; Set rupt code - b .L_exception_entry ; Join common... - -/* - * Instruction segment - */ - - . = 0x480 -.L_handler480: - mtsprg 2,r13 ; Save R13 - mtsprg 3,r11 ; Save R11 - li r11,T_INSTRUCTION_SEGMENT ; Set rupt code - b .L_exception_entry ; Join common... - -/* - * External interrupt - */ - - . = 0x500 -.L_handler500: - mtsprg 2,r13 ; Save R13 - mtsprg 3,r11 ; Save R11 - li r11,T_INTERRUPT ; Set rupt code - b .L_exception_entry ; Join common... - -/* - * Alignment - many reasons - */ - - . = 0x600 -.L_handler600: - mtsprg 2,r13 /* Save R13 */ - mtsprg 3,r11 /* Save R11 */ - li r11,T_ALIGNMENT|T_FAM /* Set 'rupt code */ - b .L_exception_entry /* Join common... */ - -/* - * Program - floating point exception, illegal inst, priv inst, user trap - */ - - . 
= 0x700 -.L_handler700: - mtsprg 2,r13 ; Save R13 - mtsprg 3,r11 ; Save R11 - li r11,T_PROGRAM|T_FAM ; Set program interruption code - b .L_exception_entry ; Join common... - -/* - * Floating point disabled - */ - - . = 0x800 -.L_handler800: - mtsprg 2,r13 /* Save R13 */ - mtsprg 3,r11 /* Save R11 */ - li r11,T_FP_UNAVAILABLE /* Set 'rupt code */ - b .L_exception_entry /* Join common... */ - - -/* - * Decrementer - DEC register has passed zero. - */ - - . = 0x900 -.L_handler900: - mtsprg 2,r13 /* Save R13 */ - mtsprg 3,r11 /* Save R11 */ - li r11,T_DECREMENTER /* Set 'rupt code */ - b .L_exception_entry /* Join common... */ - -/* - * I/O controller interface error - MACH does not use this - */ - - . = 0xA00 -.L_handlerA00: - mtsprg 2,r13 /* Save R13 */ - mtsprg 3,r11 /* Save R11 */ - li r11,T_IO_ERROR /* Set 'rupt code */ - b .L_exception_entry /* Join common... */ - -/* - * Reserved - */ - - . = 0xB00 -.L_handlerB00: - mtsprg 2,r13 /* Save R13 */ - mtsprg 3,r11 /* Save R11 */ - li r11,T_RESERVED /* Set 'rupt code */ - b .L_exception_entry /* Join common... */ - - -; System Calls (sc instruction) -; -; The syscall number is in r0. All we do here is munge the number into an -; 8-bit index into the "scTable", and dispatch on it to handle the Ultra -; Fast Traps (UFTs.) The index is: -; -; 0x80 - set if syscall number is 0x80000000 (CutTrace) -; 0x40 - set if syscall number is 0x00006004 -; 0x20 - set if upper 29 bits of syscall number are 0xFFFFFFF8 -; 0x10 - set if upper 29 bits of syscall number are 0x00007FF0 -; 0x0E - low three bits of syscall number -; 0x01 - zero, as scTable is an array of shorts - - . 
= 0xC00 -.L_handlerC00: - mtsprg 3,r11 ; Save R11 - mtsprg 2,r13 ; Save R13 - rlwinm r11,r0,0,0xFFFFFFF8 ; mask off low 3 bits of syscall number - xori r13,r11,0x7FF0 ; start to check for the 0x7FFx traps - addi r11,r11,8 ; make a 0 iff this is a 0xFFFFFFF8 trap - cntlzw r13,r13 ; set bit 0x20 iff a 0x7FFx trap - cntlzw r11,r11 ; set bit 0x20 iff a 0xFFFFFFF8 trap - xoris r0,r0,0x8000 ; Flip bit to make 0 iff 0x80000000 - rlwimi r11,r13,31,0x10 ; move 0x7FFx bit into position - cntlzw r13,r0 ; Set bit 0x20 iff 0x80000000 - xoris r0,r0,0x8000 ; Flip bit to restore R0 - rlwimi r11,r13,2,0x80 ; Set bit 0x80 iff CutTrace - xori r13,r0,0x6004 ; start to check for 0x6004 - rlwimi r11,r0,1,0xE ; move in low 3 bits of syscall number - cntlzw r13,r13 ; set bit 0x20 iff 0x6004 - rlwinm r11,r11,0,0,30 ; clear out bit 31 - rlwimi r11,r13,1,0x40 ; move 0x6004 bit into position - lhz r11,lo16(scTable)(r11) ; get branch address from sc table - mfctr r13 ; save callers ctr in r13 - mtctr r11 ; set up branch to syscall handler - mfsprg r11,0 ; get per_proc, which most UFTs use - bctr ; dispatch (r11 in sprg3, r13 in sprg2, ctr in r13, per_proc in r11) - -/* - * Trace - generated by single stepping - * performance monitor BE branch enable tracing/logging - * is also done here now. while this is permanently in the - * system the impact is completely unnoticable as this code is - * only executed when (a) a single step or branch exception is - * hit, (b) in the single step debugger case there is so much - * overhead already the few extra instructions for testing for BE - * are not even noticable - * - * Note that this trace is available only to user state so we do not - * need to set sprg2 before returning. - */ - - . = 0xD00 -.L_handlerD00: - mtsprg 3,r11 ; Save R11 - mfsprg r11,2 ; Get the feature flags - mtsprg 2,r13 ; Save R13 - - li r11,T_TRACE|T_FAM ; Set interrupt code - b .L_exception_entry ; Join common... - -/* - * Floating point assist - */ - - . 
= 0xE00 -.L_handlerE00: - mtsprg 2,r13 /* Save R13 */ - mtsprg 3,r11 /* Save R11 */ - li r11,T_FP_ASSIST /* Set 'rupt code */ - b .L_exception_entry /* Join common... */ - - -/* - * Performance monitor interruption - */ - - . = 0xF00 -PMIhandler: - mtsprg 2,r13 /* Save R13 */ - mtsprg 3,r11 /* Save R11 */ - li r11,T_PERF_MON /* Set 'rupt code */ - b .L_exception_entry /* Join common... */ - - -/* - * VMX exception - */ - - . = 0xF20 -VMXhandler: - mtsprg 2,r13 /* Save R13 */ - mtsprg 3,r11 /* Save R11 */ - li r11,T_VMX /* Set 'rupt code */ - b .L_exception_entry /* Join common... */ - - - -; -; Instruction translation miss exception - not supported -; - - . = 0x1000 -.L_handler1000: - mtsprg 2,r13 ; Save R13 - mtsprg 3,r11 ; Save R11 - li r11,T_INVALID_EXCP0 ; Set rupt code - b .L_exception_entry ; Join common... - - - -; -; Data load translation miss exception - not supported -; - - . = 0x1100 -.L_handler1100: - mtsprg 2,r13 ; Save R13 - mtsprg 3,r11 ; Save R11 - li r11,T_INVALID_EXCP1 ; Set rupt code - b .L_exception_entry ; Join common... - - - -; -; Data store translation miss exception - not supported -; - - . = 0x1200 -.L_handler1200: - mtsprg 2,r13 ; Save R13 - mtsprg 3,r11 ; Save R11 - li r11,T_INVALID_EXCP2 ; Set rupt code - b .L_exception_entry ; Join common... - - -/* - * Instruction address breakpoint - */ - - . = 0x1300 -.L_handler1300: - mtsprg 2,r13 /* Save R13 */ - mtsprg 3,r11 /* Save R11 */ - li r11,T_INSTRUCTION_BKPT /* Set 'rupt code */ - b .L_exception_entry /* Join common... */ - -/* - * System management interrupt - */ - - . = 0x1400 -.L_handler1400: - mtsprg 2,r13 /* Save R13 */ - mtsprg 3,r11 /* Save R11 */ - li r11,T_SYSTEM_MANAGEMENT /* Set 'rupt code */ - b .L_exception_entry /* Join common... */ - - -/* - * Soft Patch - */ - - . = 0x1500 -.L_handler1500: - mtsprg 2,r13 /* Save R13 */ - mtsprg 3,r11 /* Save R11 */ - li r11,T_SOFT_PATCH /* Set 'rupt code */ - b .L_exception_entry /* Join common... 
*/ - -; -; Altivec Java Mode Assist interrupt or Maintenace interrupt -; - - . = 0x1600 -.L_handler1600: - mtsprg 2,r13 /* Save R13 */ - mtsprg 3,r11 /* Save R11 */ - li r11,T_ALTIVEC_ASSIST /* Set 'rupt code */ - b .L_exception_entry /* Join common... */ - -; -; Altivec Java Mode Assist interrupt or Thermal interruption -; - - . = 0x1700 -.L_handler1700: - mtsprg 2,r13 /* Save R13 */ - mtsprg 3,r11 /* Save R11 */ - li r11,T_THERMAL /* Set 'rupt code */ - b .L_exception_entry /* Join common... */ - -; -; Thermal interruption - 64-bit -; - - . = 0x1800 -.L_handler1800: - mtsprg 2,r13 /* Save R13 */ - mtsprg 3,r11 /* Save R11 */ - li r11,T_ARCHDEP0 /* Set 'rupt code */ - b .L_exception_entry /* Join common... */ - -/* - * There is now a large gap of reserved traps - */ - -/* - * Instrumentation interruption - */ - - . = 0x2000 -.L_handler2000: - mtsprg 2,r13 /* Save R13 */ - mtsprg 3,r11 /* Save R11 */ - li r11,T_INSTRUMENTATION /* Set 'rupt code */ - b .L_exception_entry /* Join common... */ - - - - .data - .align ALIGN - .globl EXT(exception_entry) -EXT(exception_entry): - .long .L_exception_entry-EXT(ExceptionVectorsStart) /* phys addr of fn */ - - VECTOR_SEGMENT - -/*<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> - * - * First-level syscall dispatch. The syscall vector maps r0 (the syscall number) into an - * index into the "scTable" (below), and then branches to one of these routines. The PPC - * syscalls come in several varieties, as follows: - * - * 1. If (syscall & 0xFFFFF000) == 0x00007000, then it is a PPC Fast Trap or UFT. - * The UFTs are dispatched here, the Fast Traps are dispatched in hw_exceptions.s. - * - * 2. If (syscall & 0xFFFFF000) == 0x00006000, then it is a PPC-only trap. - * One of these (0x6004) is a UFT, but most are dispatched in hw_exceptions.s. These - * are mostly Blue Box or VMM (Virtual Machine) calls. - * - * 3. 
If (syscall & 0xFFFFFFF0) == 0xFFFFFFF0, then it is also a UFT and is dispatched here. - * - * 4. If (syscall & 0xFFFFF000) == 0x80000000, then it is a "firmware" call and is dispatched in - * Firmware.s, though the special "Cut Trace" trap (0x80000000) is handled here as an ultra - * fast trap. - * - * 5. If (syscall & 0xFFFFF000) == 0xFFFFF000, and it is not one of the above, then it is a Mach - * syscall, which are dispatched in hw_exceptions.s via "mach_trap_table". - * - * 6. If (syscall & 0xFFFFF000) == 0x00000000, then it is a BSD syscall, which are dispatched - * by "unix_syscall" using the "sysent" table. - * - * What distinguishes the UFTs, aside from being ultra fast, is that they cannot rely on translation - * being on, and so cannot look at the activation or task control block, etc. We handle them right - * here, and return to the caller without turning interrupts or translation on. The UFTs are: - * - * 0xFFFFFFFF - BlueBox only - MKIsPreemptiveTask - * 0xFFFFFFFE - BlueBox only - MKIsPreemptiveTaskEnv - * 0x00007FF2 - User state only - thread info (32-bit mode) - * 0x00007FF3 - User state only - floating point / vector facility status - * 0x00007FF4 - Kernel only - loadMSR - not used on 64-bit machines - * 0x00006004 - vmm_dispatch (only some of which are UFTs) - * - * "scTable" is an array of 2-byte addresses, accessed using a 7-bit index derived from the syscall - * number as follows: - * - * 0x80 (A) - set if syscall number is 0x80000000 - * 0x40 (B) - set if syscall number is 0x00006004 - * 0x20 (C) - set if upper 29 bits of syscall number are 0xFFFFFFF8 - * 0x10 (D) - set if upper 29 bits of syscall number are 0x00007FF0 - * 0x0E (E) - low three bits of syscall number - * - * If you define another UFT, try to use a number in one of the currently decoded ranges, ie one marked - * "unassigned" below. The dispatch table and the UFT handlers must reside in the first 32KB of - * physical memory. 
- */ - - .align 8 ; start this table on a 256-byte boundry -scTable: ; ABCD E - .short uftNormalSyscall-baseR ; 0000 0 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 0000 1 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 0000 2 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 0000 3 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 0000 4 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 0000 5 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 0000 6 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 0000 7 these syscalls are not in a reserved range - - .short uftNormalSyscall-baseR ; 0001 0 0x7FF0 is unassigned - .short uftNormalSyscall-baseR ; 0001 1 0x7FF1 is Set Thread Info Fast Trap (pass up) - .short uftThreadInfo-baseR ; 0001 2 0x7FF2 is Thread Info - .short uftFacilityStatus-baseR ; 0001 3 0x7FF3 is Facility Status - .short uftLoadMSR-baseR ; 0001 4 0x7FF4 is Load MSR - .short uftNormalSyscall-baseR ; 0001 5 0x7FF5 is the Null FastPath Trap (pass up) - .short uftNormalSyscall-baseR ; 0001 6 0x7FF6 is unassigned - .short uftNormalSyscall-baseR ; 0001 7 0x7FF7 is unassigned - - .short uftNormalSyscall-baseR ; 0010 0 0xFFFFFFF0 is unassigned - .short uftNormalSyscall-baseR ; 0010 1 0xFFFFFFF1 is unassigned - .short uftNormalSyscall-baseR ; 0010 2 0xFFFFFFF2 is unassigned - .short uftNormalSyscall-baseR ; 0010 3 0xFFFFFFF3 is unassigned - .short uftNormalSyscall-baseR ; 0010 4 0xFFFFFFF4 is unassigned - .short uftNormalSyscall-baseR ; 0010 5 0xFFFFFFF5 is unassigned - .short uftIsPreemptiveTaskEnv-baseR ; 0010 6 0xFFFFFFFE is Blue Box uftIsPreemptiveTaskEnv - .short uftIsPreemptiveTask-baseR ; 0010 7 0xFFFFFFFF is Blue Box IsPreemptiveTask - - .short WhoaBaby-baseR ; 0011 0 impossible combination - .short WhoaBaby-baseR ; 0011 1 impossible combination - .short 
WhoaBaby-baseR ; 0011 2 impossible combination - .short WhoaBaby-baseR ; 0011 3 impossible combination - .short WhoaBaby-baseR ; 0011 4 impossible combination - .short WhoaBaby-baseR ; 0011 5 impossible combination - .short WhoaBaby-baseR ; 0011 6 impossible combination - .short WhoaBaby-baseR ; 0011 7 impossible combination - - .short WhoaBaby-baseR ; 0100 0 0x6000 is an impossible index (diagCall) - .short WhoaBaby-baseR ; 0100 1 0x6001 is an impossible index (vmm_get_version) - .short WhoaBaby-baseR ; 0100 2 0x6002 is an impossible index (vmm_get_features) - .short WhoaBaby-baseR ; 0100 3 0x6003 is an impossible index (vmm_init_context) - .short uftVMM-baseR ; 0100 4 0x6004 is vmm_dispatch (only some of which are UFTs) - .short WhoaBaby-baseR ; 0100 5 0x6005 is an impossible index (bb_enable_bluebox) - .short WhoaBaby-baseR ; 0100 6 0x6006 is an impossible index (bb_disable_bluebox) - .short WhoaBaby-baseR ; 0100 7 0x6007 is an impossible index (bb_settaskenv) - - .short uftNormalSyscall-baseR ; 0101 0 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 0101 1 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 0101 2 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 0101 3 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 0101 4 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 0101 5 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 0101 6 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 0101 7 these syscalls are not in a reserved range - - .short uftNormalSyscall-baseR ; 0110 0 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 0110 1 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 0110 2 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 0110 3 these syscalls are not in a reserved range - .short 
uftNormalSyscall-baseR ; 0110 4 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 0110 5 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 0110 6 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 0110 7 these syscalls are not in a reserved range - - .short uftNormalSyscall-baseR ; 0111 0 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 0111 1 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 0111 2 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 0111 3 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 0111 4 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 0111 5 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 0111 6 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 0111 7 these syscalls are not in a reserved range - - .short uftCutTrace-baseR ; 1000 0 CutTrace - .short uftNormalSyscall-baseR ; 1000 1 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1000 2 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1000 3 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1000 4 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1000 5 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1000 6 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1000 7 these syscalls are not in a reserved range - - .short uftNormalSyscall-baseR ; 1001 0 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1001 1 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1001 2 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1001 3 these syscalls are not in a reserved range - .short 
uftNormalSyscall-baseR ; 1001 4 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1001 5 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1001 6 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1001 7 these syscalls are not in a reserved range - - .short uftNormalSyscall-baseR ; 1010 0 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1010 1 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1010 2 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1010 3 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1010 4 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1010 5 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1010 6 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1010 7 these syscalls are not in a reserved range - - .short uftNormalSyscall-baseR ; 1011 0 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1011 1 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1011 2 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1011 3 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1011 4 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1011 5 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1011 6 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1011 7 these syscalls are not in a reserved range - - .short uftNormalSyscall-baseR ; 1100 0 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1100 1 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1100 2 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1100 3 these syscalls are not in a 
reserved range - .short uftNormalSyscall-baseR ; 1100 4 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1100 5 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1100 6 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1100 7 these syscalls are not in a reserved range - - .short uftNormalSyscall-baseR ; 1101 0 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1101 1 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1101 2 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1101 3 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1101 4 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1101 5 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1101 6 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1101 7 these syscalls are not in a reserved range - - .short uftNormalSyscall-baseR ; 1110 0 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1110 1 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1110 2 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1110 3 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1110 4 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1110 5 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1110 6 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1110 7 these syscalls are not in a reserved range - - .short uftNormalSyscall-baseR ; 1111 0 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1111 1 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1111 2 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1111 3 these 
syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1111 4 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1111 5 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1111 6 these syscalls are not in a reserved range - .short uftNormalSyscall-baseR ; 1111 7 these syscalls are not in a reserved range - - .align 2 ; prepare for code - - -/* Ultra Fast Trap (UFT) Handlers: - * - * We get here directly from the hw syscall vector via the "scTable" vector (above), - * with interrupts and VM off, in 64-bit mode if supported, and with all registers live - * except the following: - * - * r11 = per_proc ptr (ie, sprg0) - * r13 = holds caller's ctr register - * sprg2 = holds caller's r13 - * sprg3 = holds caller's r11 - */ - -; Handle "vmm_dispatch" (0x6004), of which only some selectors are UFTs. - -uftVMM: - mtctr r13 ; restore callers ctr - lwz r11,spcFlags(r11) ; get the special flags word from per_proc - mfcr r13 ; save callers entire cr (we use all fields below) - rlwinm r11,r11,16,16,31 ; Extract spcFlags upper bits - andi. r11,r11,hi16(runningVM|FamVMena|FamVMmode) - cmpwi cr0,r11,hi16(runningVM|FamVMena|FamVMmode) ; Test in VM FAM - bne-- uftNormal80 ; not eligible for FAM UFTs - cmpwi cr5,r3,kvmmResumeGuest ; Compare r3 with kvmmResumeGuest - cmpwi cr2,r3,kvmmSetGuestRegister ; Compare r3 with kvmmSetGuestRegister - cror cr1_eq,cr5_lt,cr2_gt ; Set true if out of VMM Fast syscall range - bt-- cr1_eq,uftNormalFF ; Exit if out of range (the others are not UFTs) - b EXT(vmm_ufp) ; handle UFT range of vmm_dispatch syscall - - -; Handle blue box UFTs (syscalls -1 and -2). - -uftIsPreemptiveTask: -uftIsPreemptiveTaskEnv: - mtctr r13 ; restore callers ctr - lwz r11,spcFlags(r11) ; get the special flags word from per_proc - mfcr r13,0x80 ; save callers cr0 so we can use it - andi. 
r11,r11,bbNoMachSC|bbPreemptive ; Clear what we do not need - cmplwi r11,bbNoMachSC ; See if we are trapping syscalls - blt-- uftNormal80 ; No... - cmpwi r0,-2 ; is this call IsPreemptiveTaskEnv? - rlwimi r13,r11,bbPreemptivebit-cr0_eq,cr0_eq,cr0_eq ; Copy preemptive task flag into user cr0_eq - mfsprg r11,0 ; Get the per proc once more - bne++ uftRestoreThenRFI ; do not load r0 if IsPreemptiveTask - lwz r0,ppbbTaskEnv(r11) ; Get the shadowed taskEnv (only difference) - b uftRestoreThenRFI ; restore modified cr0 and return - - -; Handle "Thread Info" UFT (0x7FF2) - - .globl EXT(uft_uaw_nop_if_32bit) -uftThreadInfo: - lwz r3,UAW+4(r11) ; get user assist word, assuming a 32-bit processor -LEXT(uft_uaw_nop_if_32bit) - ld r3,UAW(r11) ; get the whole doubleword if 64-bit (patched to nop if 32-bit) - mtctr r13 ; restore callers ctr - b uftRFI ; done - - -; Handle "Facility Status" UFT (0x7FF3) - -uftFacilityStatus: - lwz r3,spcFlags(r11) ; get "special flags" word from per_proc - mtctr r13 ; restore callers ctr - b uftRFI ; done - - -; Handle "Load MSR" UFT (0x7FF4). This is not used on 64-bit processors, though it would work. - -uftLoadMSR: - mfsrr1 r11 ; get callers MSR - mtctr r13 ; restore callers ctr - mfcr r13,0x80 ; save callers cr0 so we can test PR - rlwinm. r11,r11,0,MSR_PR_BIT,MSR_PR_BIT ; really in the kernel? - bne- uftNormal80 ; do not permit from user mode - mfsprg r11,0 ; restore per_proc - mtsrr1 r3 ; Set new MSR - - -; Return to caller after UFT. 
When called: -; r11 = per_proc ptr -; r13 = callers cr0 in upper nibble (if uftRestoreThenRFI called) -; sprg2 = callers r13 -; sprg3 = callers r11 - -uftRestoreThenRFI: ; WARNING: can drop down to here - mtcrf 0x80,r13 ; restore callers cr0 -uftRFI: - .globl EXT(uft_nop_if_32bit) -LEXT(uft_nop_if_32bit) - b uftX64 ; patched to NOP if 32-bit processor - -uftX32: lwz r11,pfAvailable(r11) ; Get the feature flags - mfsprg r13,2 ; Restore R13 - mtsprg 2,r11 ; Set the feature flags - mfsprg r11,3 ; Restore R11 - rfi ; Back to our guy... - -uftX64: mtspr hsprg0,r14 ; Save a register in a Hypervisor SPRG - ld r14,UAW(r11) ; Get the User Assist DoubleWord - lwz r11,pfAvailable(r11) ; Get the feature flags - mfsprg r13,2 ; Restore R13 - mtsprg 2,r11 ; Set the feature flags - mfsprg r11,3 ; Restore R11 - mtsprg 3,r14 ; Set the UAW in sprg3 - mfspr r14,hsprg0 ; Restore R14 - rfid ; Back to our guy... - -; -; Quickly cut a trace table entry for the CutTrace firmware call. -; -; All registers except R11 and R13 are unchanged. -; -; Note that this code cuts a trace table entry for the CutTrace call only. -; An identical entry is made during normal interrupt processing. Any entry -; format entry changes made must be done in both places. -; - - .align 5 - - .globl EXT(uft_cuttrace) -LEXT(uft_cuttrace) -uftCutTrace: - b uftct64 ; patched to NOP if 32-bit processor - - stw r20,tempr0(r11) ; Save some work registers - lwz r20,dgFlags(0) ; Get the flags - stw r21,tempr1(r11) ; Save some work registers - mfsrr1 r21 ; Get the SRR1 - rlwinm r20,r20,MSR_PR_BIT-enaUsrFCallb,MASK(MSR_PR) ; Shift the validity bit over to pr bit spot - stw r25,tempr2(r11) ; Save some work registers - orc r20,r20,r21 ; Get ~PR | FC - mfcr r25 ; Save the CR - stw r22,tempr3(r11) ; Save some work registers - lhz r22,PP_CPU_NUMBER(r11) ; Get the logical processor number - andi. 
r20,r20,MASK(MSR_PR) ; Set cr0_eq is we are in problem state and the validity bit is not set - stw r23,tempr4(r11) ; Save some work registers - lwz r23,traceMask(0) ; Get the trace mask - stw r24,tempr5(r11) ; Save some work registers - beq- ctbail32 ; Can not issue from user... - - - addi r24,r22,16 ; Get shift to move cpu mask to syscall mask - rlwnm r24,r23,r24,12,12 ; Shift cpu mask bit to rupt type mask - and. r24,r24,r23 ; See if both are on - -; -; We select a trace entry using a compare and swap on the next entry field. -; Since we do not lock the actual trace buffer, there is a potential that -; another processor could wrap an trash our entry. Who cares? -; - - li r23,trcWork ; Get the trace work area address - lwz r21,traceStart(0) ; Get the start of trace table - lwz r22,traceEnd(0) ; Get end of trace table - - beq-- ctdisa32 ; Leave because tracing is disabled... - -ctgte32: lwarx r20,0,r23 ; Get and reserve the next slot to allocate - addi r24,r20,LTR_size ; Point to the next trace entry - cmplw r24,r22 ; Do we need to wrap the trace table? - bne+ ctgte32s ; No wrap, we got us a trace entry... - - mr r24,r21 ; Wrap back to start - -ctgte32s: stwcx. r24,0,r23 ; Try to update the current pointer - bne- ctgte32 ; Collision, try again... - -#if ESPDEBUG - dcbf 0,r23 ; Force to memory - sync -#endif - - dcbz 0,r20 ; Clear and allocate first trace line - li r24,32 ; Offset to next line - -ctgte32tb: mftbu r21 ; Get the upper time now - mftb r22 ; Get the lower time now - mftbu r23 ; Get upper again - cmplw r21,r23 ; Has it ticked? - bne- ctgte32tb ; Yes, start again... - - dcbz r24,r20 ; Clean second line - -; -; Let us cut that trace entry now. -; -; Note that this code cuts a trace table entry for the CutTrace call only. -; An identical entry is made during normal interrupt processing. Any entry -; format entry changes made must be done in both places. 
-; - - lhz r24,PP_CPU_NUMBER(r11) ; Get the logical processor number - li r23,T_SYSTEM_CALL ; Get the system call id - mtctr r13 ; Restore the callers CTR - sth r24,LTR_cpu(r20) ; Save processor number - li r24,64 ; Offset to third line - sth r23,LTR_excpt(r20) ; Set the exception code - dcbz r24,r20 ; Clean 3rd line - mfspr r23,dsisr ; Get the DSISR - stw r21,LTR_timeHi(r20) ; Save top of time stamp - li r24,96 ; Offset to fourth line - mflr r21 ; Get the LR - dcbz r24,r20 ; Clean 4th line - stw r22,LTR_timeLo(r20) ; Save bottom of time stamp - mfsrr0 r22 ; Get SRR0 - stw r25,LTR_cr(r20) ; Save CR - mfsrr1 r24 ; Get the SRR1 - stw r23,LTR_dsisr(r20) ; Save DSISR - stw r22,LTR_srr0+4(r20) ; Save SRR0 - mfdar r23 ; Get DAR - stw r24,LTR_srr1+4(r20) ; Save SRR1 - stw r23,LTR_dar+4(r20) ; Save DAR - stw r21,LTR_lr+4(r20) ; Save LR - - stw r13,LTR_ctr+4(r20) ; Save CTR - stw r0,LTR_r0+4(r20) ; Save register - stw r1,LTR_r1+4(r20) ; Save register - stw r2,LTR_r2+4(r20) ; Save register - stw r3,LTR_r3+4(r20) ; Save register - stw r4,LTR_r4+4(r20) ; Save register - stw r5,LTR_r5+4(r20) ; Save register - stw r6,LTR_r6+4(r20) ; Save register - -#if 0 - lwz r21,FPUowner(r11) ; (TEST/DEBUG) Get the current floating point owner - stw r21,LTR_rsvd0(r20) ; (TEST/DEBUG) Record the owner -#endif - -#if ESPDEBUG - addi r21,r20,32 ; Second line - addi r22,r20,64 ; Third line - dcbst 0,r20 ; Force to memory - dcbst 0,r21 ; Force to memory - addi r21,r22,32 ; Fourth line - dcbst 0,r22 ; Force to memory - dcbst 0,r21 ; Force to memory - sync ; Make sure it all goes -#endif - -ctdisa32: mtcrf 0x80,r25 ; Restore the used condition register field - lwz r20,tempr0(r11) ; Restore work register - lwz r21,tempr1(r11) ; Restore work register - lwz r25,tempr2(r11) ; Restore work register - mtctr r13 ; Restore the callers CTR - lwz r22,tempr3(r11) ; Restore work register - lwz r23,tempr4(r11) ; Restore work register - lwz r24,tempr5(r11) ; Restore work register - b uftX32 ; Go restore the rest 
and go... - -ctbail32: mtcrf 0x80,r25 ; Restore the used condition register field - lwz r20,tempr0(r11) ; Restore work register - lwz r21,tempr1(r11) ; Restore work register - lwz r25,tempr2(r11) ; Restore work register - mtctr r13 ; Restore the callers CTR - lwz r22,tempr3(r11) ; Restore work register - lwz r23,tempr4(r11) ; Restore work register - b uftNormalSyscall ; Go pass it on along... - -; -; This is the 64-bit version. -; - -uftct64: std r20,tempr0(r11) ; Save some work registers - lwz r20,dgFlags(0) ; Get the flags - std r21,tempr1(r11) ; Save some work registers - mfsrr1 r21 ; Get the SRR1 - rlwinm r20,r20,MSR_PR_BIT-enaUsrFCallb,MASK(MSR_PR) ; Shift the validity bit over to pr bit spot - std r25,tempr2(r11) ; Save some work registers - orc r20,r20,r21 ; Get ~PR | FC - mfcr r25 ; Save the CR - std r22,tempr3(r11) ; Save some work registers - lhz r22,PP_CPU_NUMBER(r11) ; Get the logical processor number - andi. r20,r20,MASK(MSR_PR) ; Set cr0_eq when we are in problem state and the validity bit is not set - std r23,tempr4(r11) ; Save some work registers - lwz r23,traceMask(0) ; Get the trace mask - std r24,tempr5(r11) ; Save some work registers - beq-- ctbail64 ; Can not issue from user... - - addi r24,r22,16 ; Get shift to move cpu mask to syscall mask - rlwnm r24,r23,r24,12,12 ; Shift cpu mask bit to rupt type mask - and. r24,r24,r23 ; See if both are on - -; -; We select a trace entry using a compare and swap on the next entry field. -; Since we do not lock the actual trace buffer, there is a potential that -; another processor could wrap an trash our entry. Who cares? -; - - li r23,trcWork ; Get the trace work area address - lwz r21,traceStart(0) ; Get the start of trace table - lwz r22,traceEnd(0) ; Get end of trace table - - beq-- ctdisa64 ; Leave because tracing is disabled... 
- -ctgte64: lwarx r20,0,r23 ; Get and reserve the next slot to allocate - addi r24,r20,LTR_size ; Point to the next trace entry - cmplw r24,r22 ; Do we need to wrap the trace table? - bne++ ctgte64s ; No wrap, we got us a trace entry... - - mr r24,r21 ; Wrap back to start - -ctgte64s: stwcx. r24,0,r23 ; Try to update the current pointer - bne-- ctgte64 ; Collision, try again... - -#if ESPDEBUG - dcbf 0,r23 ; Force to memory - sync -#endif - - dcbz128 0,r20 ; Zap the trace entry - - mftb r21 ; Get the time - -; -; Let us cut that trace entry now. -; -; Note that this code cuts a trace table entry for the CutTrace call only. -; An identical entry is made during normal interrupt processing. Any entry -; format entry changes made must be done in both places. -; - - lhz r24,PP_CPU_NUMBER(r11) ; Get the logical processor number - li r23,T_SYSTEM_CALL ; Get the system call id - sth r24,LTR_cpu(r20) ; Save processor number - sth r23,LTR_excpt(r20) ; Set the exception code - mfspr r23,dsisr ; Get the DSISR - std r21,LTR_timeHi(r20) ; Save top of time stamp - mflr r21 ; Get the LR - mfsrr0 r22 ; Get SRR0 - stw r25,LTR_cr(r20) ; Save CR - mfsrr1 r24 ; Get the SRR1 - stw r23,LTR_dsisr(r20) ; Save DSISR - std r22,LTR_srr0(r20) ; Save SRR0 - mfdar r23 ; Get DAR - std r24,LTR_srr1(r20) ; Save SRR1 - std r23,LTR_dar(r20) ; Save DAR - std r21,LTR_lr(r20) ; Save LR - - std r13,LTR_ctr(r20) ; Save CTR - std r0,LTR_r0(r20) ; Save register - std r1,LTR_r1(r20) ; Save register - std r2,LTR_r2(r20) ; Save register - std r3,LTR_r3(r20) ; Save register - std r4,LTR_r4(r20) ; Save register - std r5,LTR_r5(r20) ; Save register - std r6,LTR_r6(r20) ; Save register - -#if 0 - lwz r21,FPUowner(r11) ; (TEST/DEBUG) Get the current floating point owner - stw r21,LTR_rsvd0(r20) ; (TEST/DEBUG) Record the owner -#endif - -#if ESPDEBUG - dcbf 0,r20 ; Force to memory - sync ; Make sure it all goes -#endif - -ctdisa64: mtcrf 0x80,r25 ; Restore the used condition register field - ld r20,tempr0(r11) ; 
Restore work register - ld r21,tempr1(r11) ; Restore work register - ld r25,tempr2(r11) ; Restore work register - mtctr r13 ; Restore the callers CTR - ld r22,tempr3(r11) ; Restore work register - ld r23,tempr4(r11) ; Restore work register - ld r24,tempr5(r11) ; Restore work register - b uftX64 ; Go restore the rest and go... - -ctbail64: mtcrf 0x80,r25 ; Restore the used condition register field - ld r20,tempr0(r11) ; Restore work register - ld r21,tempr1(r11) ; Restore work register - ld r25,tempr2(r11) ; Restore work register - mtctr r13 ; Restore the callers CTR - ld r22,tempr3(r11) ; Restore work register - ld r23,tempr4(r11) ; Restore work register - li r11,T_SYSTEM_CALL|T_FAM ; Set system code call - b extEntry64 ; Go straight to the 64-bit code... - - - -; Handle a system call that is not a UFT and which thus goes upstairs. - -uftNormalFF: ; here with entire cr in r13 - mtcr r13 ; restore all 8 fields - b uftNormalSyscall1 ; Join common... - -uftNormal80: ; here with callers cr0 in r13 - mtcrf 0x80,r13 ; restore cr0 - b uftNormalSyscall1 ; Join common... - -uftNormalSyscall: ; r13 = callers ctr - mtctr r13 ; restore ctr -uftNormalSyscall1: - li r11,T_SYSTEM_CALL|T_FAM ; this is a system call (and fall through) - - -/*<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>*/ -/* - * .L_exception_entry(type) - * - * Come here via branch directly from the vector, or falling down from above, with the following - * set up: - * - * ENTRY: interrupts off, VM off, in 64-bit mode if supported - * Caller's r13 saved in sprg2. - * Caller's r11 saved in sprg3. - * Exception code (ie, T_SYSTEM_CALL etc) in r11. - * All other registers are live. - * - */ - -.L_exception_entry: ; WARNING: can fall through from UFT handler - -/* - * - * Here we will save off a mess of registers, the special ones and R0-R12. We use the DCBZ - * instruction to clear and allcoate a line in the cache. 
This way we won't take any cache - * misses, so these stores won't take all that long. Except the first line that is because - * we can't do a DCBZ if the L1 D-cache is off. The rest we will skip if they are - * off also. - * - * Note that if we are attempting to sleep (as opposed to nap or doze) all interruptions - * are ignored. - */ - - - .globl EXT(extPatch32) - - -LEXT(extPatch32) - b extEntry64 ; Go do 64-bit (patched to a nop if 32-bit) - mfsprg r13,0 ; Load per_proc - lwz r13,next_savearea+4(r13) ; Get the exception save area - stw r0,saver0+4(r13) ; Save register 0 - stw r1,saver1+4(r13) ; Save register 1 - - mfspr r1,hid0 ; Get HID0 - mfcr r0 ; Save the whole CR - - mtcrf 0x20,r1 ; Get set to test for sleep - cror doze,doze,nap ; Remember if we are napping - bf sleep,notsleep ; Skip if we are not trying to sleep - - mtcrf 0x20,r0 ; Restore the CR - lwz r0,saver0+4(r13) ; Restore R0 - lwz r1,saver1+4(r13) ; Restore R1 - mfsprg r13,0 ; Get the per_proc - lwz r11,pfAvailable(r13) ; Get back the feature flags - mfsprg r13,2 ; Restore R13 - mtsprg 2,r11 ; Set sprg2 to the features - mfsprg r11,3 ; Restore R11 - rfi ; Jump back into sleep code... - .long 0 ; Leave these here please... - .long 0 - .long 0 - .long 0 - .long 0 - .long 0 - .long 0 - .long 0 - - -; -; This is the 32-bit context saving stuff -; - - .align 5 - -notsleep: stw r2,saver2+4(r13) ; Save this one - bf doze,notspdo ; Skip the next if we are not napping/dozing... - rlwinm r2,r1,0,nap+1,doze-1 ; Clear any possible nap and doze bits - mtspr hid0,r2 ; Clear the nap/doze bits - -notspdo: - la r1,saver4(r13) ; Point to the next line in case we need it - crmove wasNapping,doze ; Remember if we were napping - mfsprg r2,0 ; Get the per_proc area - dcbz 0,r1 ; allocate r4-r7 32-byte line in cache - -; -; Remember, we are setting up CR6 with feature flags -; - andi. 
r1,r11,T_FAM ; Check FAM bit - - stw r3,saver3+4(r13) ; Save this one - stw r4,saver4+4(r13) ; Save this one - andc r11,r11,r1 ; Clear FAM bit - beq+ noFAM ; Is it FAM intercept - mfsrr1 r3 ; Load srr1 - rlwinm. r3,r3,0,MSR_PR_BIT,MSR_PR_BIT ; Are we trapping from supervisor state? - beq+ noFAM ; From supervisor state - lwz r1,spcFlags(r2) ; Load spcFlags - rlwinm r1,r1,1+FamVMmodebit,30,31 ; Extract FamVMenabit and FamVMmodebit - cmpwi cr0,r1,2 ; Check FamVMena set without FamVMmode - bne+ noFAM ; Can this context be FAM intercept - lwz r4,FAMintercept(r2) ; Load exceptions mask to intercept - srwi r1,r11,2 ; Divide r11 by 4 - lis r3,0x8000 ; Set r3 to 0x80000000 - srw r1,r3,r1 ; Set bit for current exception - and. r1,r1,r4 ; And current exception with the intercept mask - beq+ noFAM ; Is it FAM intercept - b EXT(vmm_fam_exc) -noFAM: - lwz r1,pfAvailable(r2) ; Get the CPU features flags - la r3,saver8(r13) ; Point to line with r8-r11 - mtcrf 0xE2,r1 ; Put the features flags (that we care about) in the CR - dcbz 0,r3 ; allocate r8-r11 32-byte line in cache - la r3,saver12(r13) ; point to r12-r15 line - lis r4,hi16(MASK(MSR_VEC)|MASK(MSR_FP)|MASK(MSR_ME)) ; Set up the MSR we will use throughout. 
Note that ME come on here if MCK - stw r6,saver6+4(r13) ; Save this one - ori r4,r4,lo16(MASK(MSR_VEC)|MASK(MSR_FP)|MASK(MSR_ME)) ; Rest of MSR - stw r8,saver8+4(r13) ; Save this one - crmove featAltivec,pfAltivecb ; Set the Altivec flag - mtmsr r4 ; Set MSR - isync - mfsrr0 r6 ; Get the interruption SRR0 - la r8,savesrr0(r13) ; point to line with SRR0, SRR1, CR, XER, and LR - dcbz 0,r3 ; allocate r12-r15 32-byte line in cache - la r3,saver16(r13) ; point to next line - dcbz 0,r8 ; allocate 32-byte line with SRR0, SRR1, CR, XER, and LR - stw r7,saver7+4(r13) ; Save this one - mfsrr1 r7 ; Get the interrupt SRR1 - stw r6,savesrr0+4(r13) ; Save the SRR0 - stw r5,saver5+4(r13) ; Save this one - mfsprg r6,2 ; Get interrupt time R13 - mtsprg 2,r1 ; Set the feature flags - mfsprg r8,3 ; Get rupt time R11 - stw r7,savesrr1+4(r13) ; Save SRR1 - stw r8,saver11+4(r13) ; Save rupt time R11 - stw r6,saver13+4(r13) ; Save rupt R13 - dcbz 0,r3 ; allocate 32-byte line with r16-r19 - la r3,saver20(r13) ; point to next line - -getTB: mftbu r6 ; Get the upper timebase - mftb r7 ; Get the lower timebase - mftbu r8 ; Get the upper one again - cmplw r6,r8 ; Did the top tick? - bne- getTB ; Yeah, need to get it again... - - stw r8,ruptStamp(r2) ; Save the top of time stamp - stw r8,SAVtime(r13) ; Save the top of time stamp - stw r7,ruptStamp+4(r2) ; Save the bottom of time stamp - stw r7,SAVtime+4(r13) ; Save the bottom of time stamp - - dcbz 0,r3 ; allocate 32-byte line with r20-r23 - stw r9,saver9+4(r13) ; Save this one - - stw r10,saver10+4(r13) ; Save this one - mflr r4 ; Get the LR - mfxer r10 ; Get the XER - - bf+ wasNapping,notNapping ; Skip if not waking up from nap... 
- - lwz r6,napStamp+4(r2) ; Pick up low order nap stamp - lis r3,hi16(EXT(machine_idle_ret)) ; Get high part of nap/doze return - lwz r5,napStamp(r2) ; and high order - subfc r7,r6,r7 ; Subtract low stamp from now - lwz r6,napTotal+4(r2) ; Pick up low total - subfe r5,r5,r8 ; Subtract high stamp and borrow from now - lwz r8,napTotal(r2) ; Pick up the high total - addc r6,r6,r7 ; Add low to total - ori r3,r3,lo16(EXT(machine_idle_ret)) ; Get low part of nap/doze return - adde r8,r8,r5 ; Add high and carry to total - stw r6,napTotal+4(r2) ; Save the low total - stw r8,napTotal(r2) ; Save the high total - stw r3,savesrr0+4(r13) ; Modify to return to nap/doze exit - - rlwinm. r3,r1,0,pfSlowNapb,pfSlowNapb ; Should HID1 be restored? - beq notInSlowNap - - lwz r3,pfHID1(r2) ; Get saved HID1 value - mtspr hid1,r3 ; Restore HID1 - -notInSlowNap: - rlwinm. r3,r1,0,pfNoL2PFNapb,pfNoL2PFNapb ; Should MSSCR0 be restored? - beq notNapping - - lwz r3,pfMSSCR0(r2) ; Get saved MSSCR0 value - mtspr msscr0,r3 ; Restore MSSCR0 - sync - isync - -notNapping: stw r12,saver12+4(r13) ; Save this one - - stw r14,saver14+4(r13) ; Save this one - stw r15,saver15+4(r13) ; Save this one - la r14,saver24(r13) ; Point to the next block to save into - mfctr r6 ; Get the CTR - stw r16,saver16+4(r13) ; Save this one - la r15,savectr(r13) ; point to line with CTR, DAR, DSISR, Exception code, and VRSAVE - stw r4,savelr+4(r13) ; Save rupt LR - - dcbz 0,r14 ; allocate 32-byte line with r24-r27 - la r16,saver28(r13) ; point to line with r28-r31 - dcbz 0,r15 ; allocate line with CTR, DAR, DSISR, Exception code, and VRSAVE - stw r17,saver17+4(r13) ; Save this one - stw r18,saver18+4(r13) ; Save this one - stw r6,savectr+4(r13) ; Save rupt CTR - stw r0,savecr(r13) ; Save rupt CR - stw r19,saver19+4(r13) ; Save this one - mfdar r6 ; Get the rupt DAR - stw r20,saver20+4(r13) ; Save this one - dcbz 0,r16 ; allocate 32-byte line with r28-r31 - - stw r21,saver21+4(r13) ; Save this one - lwz r21,spcFlags(r2) ; 
Get the special flags from per_proc - stw r10,savexer+4(r13) ; Save the rupt XER - stw r30,saver30+4(r13) ; Save this one - lhz r30,pfrptdProc(r2) ; Get the reported processor type - stw r31,saver31+4(r13) ; Save this one - stw r22,saver22+4(r13) ; Save this one - stw r23,saver23+4(r13) ; Save this one - stw r24,saver24+4(r13) ; Save this one - stw r25,saver25+4(r13) ; Save this one - mfdsisr r7 ; Get the rupt DSISR - stw r26,saver26+4(r13) ; Save this one - stw r27,saver27+4(r13) ; Save this one - andis. r21,r21,hi16(perfMonitor) ; Is the performance monitor enabled? - stw r28,saver28+4(r13) ; Save this one - cmpwi cr1, r30,CPU_SUBTYPE_POWERPC_750 ; G3? - la r27,savevscr(r13) ; point to 32-byte line with VSCR and FPSCR - cmpwi cr2,r30,CPU_SUBTYPE_POWERPC_7400 ; This guy? - stw r29,saver29+4(r13) ; Save R29 - stw r6,savedar+4(r13) ; Save the rupt DAR - li r10,savepmc ; Point to pmc savearea - - beq+ noPerfMonSave32 ; No perfmon on here... - - dcbz r10,r13 ; Clear first part of pmc area - li r10,savepmc+0x20 ; Point to pmc savearea second part - li r22,0 ; r22: zero - dcbz r10,r13 ; Clear second part of pmc area - - beq cr1,perfMonSave32_750 ; This is a G3... - - beq cr2,perfMonSave32_7400 ; Regular olde G4... 
- - mfspr r24,pmc5 ; Here for a 7450 - mfspr r25,pmc6 - stw r24,savepmc+16(r13) ; Save PMC5 - stw r25,savepmc+20(r13) ; Save PMC6 - mtspr pmc5,r22 ; Leave PMC5 clear - mtspr pmc6,r22 ; Leave PMC6 clear - -perfMonSave32_7400: - mfspr r25,mmcr2 - stw r25,savemmcr2+4(r13) ; Save MMCR2 - mtspr mmcr2,r22 ; Leave MMCR2 clear - -perfMonSave32_750: - mfspr r23,mmcr0 - mfspr r24,mmcr1 - stw r23,savemmcr0+4(r13) ; Save MMCR0 - stw r24,savemmcr1+4(r13) ; Save MMCR1 - mtspr mmcr0,r22 ; Leave MMCR0 clear - mtspr mmcr1,r22 ; Leave MMCR1 clear - mfspr r23,pmc1 - mfspr r24,pmc2 - mfspr r25,pmc3 - mfspr r26,pmc4 - stw r23,savepmc+0(r13) ; Save PMC1 - stw r24,savepmc+4(r13) ; Save PMC2 - stw r25,savepmc+8(r13) ; Save PMC3 - stw r26,savepmc+12(r13) ; Save PMC4 - mtspr pmc1,r22 ; Leave PMC1 clear - mtspr pmc2,r22 ; Leave PMC2 clear - mtspr pmc3,r22 ; Leave PMC3 clear - mtspr pmc4,r22 ; Leave PMC4 clear - -noPerfMonSave32: - dcbz 0,r27 ; allocate line with VSCR and FPSCR - - stw r7,savedsisr(r13) ; Save the rupt code DSISR - stw r11,saveexception(r13) ; Save the exception code - - -; -; Everything is saved at this point, except for FPRs, and VMX registers. -; Time for us to get a new savearea and then trace interrupt if it is enabled. -; - - lwz r25,traceMask(0) ; Get the trace mask - li r0,SAVgeneral ; Get the savearea type value - lhz r19,PP_CPU_NUMBER(r2) ; Get the logical processor number - rlwinm r22,r11,30,0,31 ; Divide interrupt code by 4 - stb r0,SAVflags+2(r13) ; Mark valid context - addi r22,r22,10 ; Adjust code so we shift into CR5 - li r23,trcWork ; Get the trace work area address - rlwnm r7,r25,r22,22,22 ; Set CR5_EQ bit position to 0 if tracing allowed - li r26,0x8 ; Get start of cpu mask - srw r26,r26,r19 ; Get bit position of cpu number - mtcrf 0x04,r7 ; Set CR5 to show trace or not - and. r26,r26,r25 ; See if we trace this cpu - crandc cr5_eq,cr5_eq,cr0_eq ; Turn off tracing if cpu is disabled -; -; At this point, we can take another exception and lose nothing. 
-; - - bne+ cr5,xcp32xit ; Skip all of this if no tracing here... - -; -; We select a trace entry using a compare and swap on the next entry field. -; Since we do not lock the actual trace buffer, there is a potential that -; another processor could wrap an trash our entry. Who cares? -; - - lwz r25,traceStart(0) ; Get the start of trace table - lwz r26,traceEnd(0) ; Get end of trace table - -trcsel: lwarx r20,0,r23 ; Get and reserve the next slot to allocate - - addi r22,r20,LTR_size ; Point to the next trace entry - cmplw r22,r26 ; Do we need to wrap the trace table? - bne+ gotTrcEnt ; No wrap, we got us a trace entry... - - mr r22,r25 ; Wrap back to start - -gotTrcEnt: stwcx. r22,0,r23 ; Try to update the current pointer - bne- trcsel ; Collision, try again... - -#if ESPDEBUG - dcbf 0,r23 ; Force to memory - sync -#endif - - dcbz 0,r20 ; Clear and allocate first trace line - -; -; Let us cut that trace entry now. -; -; Note that this code cuts a trace table entry for everything but the CutTrace call. -; An identical entry is made during normal CutTrace processing. Any entry -; format changes made must be done in both places. 
-; - - lwz r16,ruptStamp(r2) ; Get top of time base - lwz r17,ruptStamp+4(r2) ; Get the bottom of time stamp - - li r14,32 ; Offset to second line - - lwz r0,saver0+4(r13) ; Get back interrupt time R0 - lwz r1,saver1+4(r13) ; Get back interrupt time R1 - lwz r8,savecr(r13) ; Get the CR value - - dcbz r14,r20 ; Zap the second line - - sth r19,LTR_cpu(r20) ; Stash the cpu number - li r14,64 ; Offset to third line - sth r11,LTR_excpt(r20) ; Save the exception type - lwz r7,saver2+4(r13) ; Get back interrupt time R2 - lwz r3,saver3+4(r13) ; Restore this one - - dcbz r14,r20 ; Zap the third half - - mfdsisr r9 ; Get the DSISR - li r14,96 ; Offset to forth line - stw r16,LTR_timeHi(r20) ; Set the upper part of TB - stw r17,LTR_timeLo(r20) ; Set the lower part of TB - lwz r10,savelr+4(r13) ; Get the LR - mfsrr0 r17 ; Get SRR0 back, it is still good - - dcbz r14,r20 ; Zap the forth half - lwz r4,saver4+4(r13) ; Restore this one - lwz r5,saver5+4(r13) ; Restore this one - mfsrr1 r18 ; SRR1 is still good in here - - stw r8,LTR_cr(r20) ; Save the CR - lwz r6,saver6+4(r13) ; Get R6 - mfdar r16 ; Get this back - stw r9,LTR_dsisr(r20) ; Save the DSISR - stw r17,LTR_srr0+4(r20) ; Save the SSR0 - - stw r18,LTR_srr1+4(r20) ; Save the SRR1 - stw r16,LTR_dar+4(r20) ; Save the DAR - mfctr r17 ; Get the CTR (still good in register) - stw r13,LTR_save+4(r20) ; Save the savearea - stw r10,LTR_lr+4(r20) ; Save the LR - - stw r17,LTR_ctr+4(r20) ; Save off the CTR - stw r0,LTR_r0+4(r20) ; Save off register 0 - stw r1,LTR_r1+4(r20) ; Save off register 1 - stw r7,LTR_r2+4(r20) ; Save off register 2 - - - stw r3,LTR_r3+4(r20) ; Save off register 3 - stw r4,LTR_r4+4(r20) ; Save off register 4 - stw r5,LTR_r5+4(r20) ; Save off register 5 - stw r6,LTR_r6+4(r20) ; Save off register 6 - -#if ESPDEBUG - addi r17,r20,32 ; Second line - addi r16,r20,64 ; Third line - dcbst br0,r20 ; Force to memory - dcbst br0,r17 ; Force to memory - addi r17,r17,32 ; Fourth line - dcbst br0,r16 ; Force to memory - 
dcbst br0,r17 ; Force to memory - - sync ; Make sure it all goes -#endif -xcp32xit: mr r14,r11 ; Save the interrupt code across the call - bl EXT(save_get_phys_32) ; Grab a savearea - mfsprg r2,0 ; Get the per_proc info - li r10,emfp0 ; Point to floating point save - mr r11,r14 ; Get the exception code back - dcbz r10,r2 ; Clear for speed - stw r3,next_savearea+4(r2) ; Store the savearea for the next rupt - - b xcpCommon ; Go join the common interrupt processing... - -; -; -; This is the 64-bit context saving stuff -; - - .align 5 - -extEntry64: mfsprg r13,0 ; Load per_proc - ld r13,next_savearea(r13) ; Get the exception save area - std r0,saver0(r13) ; Save register 0 - lis r0,hi16(MASK(MSR_VEC)|MASK(MSR_FP)|MASK(MSR_ME)) ; Set up the MSR we will use throughout. Note that ME come on here if MCK - std r1,saver1(r13) ; Save register 1 - ori r1,r0,lo16(MASK(MSR_VEC)|MASK(MSR_FP)|MASK(MSR_ME)) ; Rest of MSR - lis r0,0x0010 ; Get rupt code transform validity mask - mtmsr r1 ; Set MSR - isync - - ori r0,r0,0x0200 ; Get rupt code transform validity mask - std r2,saver2(r13) ; Save this one - lis r1,0x00F0 ; Top half of xform XOR - rlwinm r2,r11,29,27,31 ; Get high 5 bits of rupt code - std r3,saver3(r13) ; Save this one - slw r0,r0,r2 ; Move transform validity bit to bit 0 - std r4,saver4(r13) ; Save this one - std r5,saver5(r13) ; Save this one - ori r1,r1,0x04EC ; Bottom half of xform XOR - mfxer r5 ; Save the XER because we are about to muck with it - rlwinm r4,r11,1,27,28 ; Get bottom of interrupt code * 8 - lis r3,hi16(dozem|napm) ; Get the nap and doze bits - srawi r0,r0,31 ; Get 0xFFFFFFFF of xform valid, 0 otherwise - rlwnm r4,r1,r4,24,31 ; Extract the xform XOR - li r1,saver16 ; Point to the next line - and r4,r4,r0 ; Only keep transform if we are to use it - li r2,lgKillResv ; Point to the killing field - mfcr r0 ; Save the CR - stwcx. 
r2,0,r2 ; Kill any pending reservation - dcbz128 r1,r13 ; Blow away the line - sldi r3,r3,32 ; Position it - mfspr r1,hid0 ; Get HID0 - andc r3,r1,r3 ; Clear nap and doze - xor r11,r11,r4 ; Transform 970 rupt code to standard keeping FAM bit - cmpld r3,r1 ; See if nap and/or doze was on - std r6,saver6(r13) ; Save this one - mfsprg r2,0 ; Get the per_proc area - la r6,savesrr0(r13) ; point to line with SRR0, SRR1, CR, XER, and LR - beq++ eE64NoNap ; No nap here, skip all this... - - sync ; Make sure we are clean - mtspr hid0,r3 ; Set the updated hid0 - mfspr r1,hid0 ; Yes, this is silly, keep it here - mfspr r1,hid0 ; Yes, this is a duplicate, keep it here - mfspr r1,hid0 ; Yes, this is a duplicate, keep it here - mfspr r1,hid0 ; Yes, this is a duplicate, keep it here - mfspr r1,hid0 ; Yes, this is a duplicate, keep it here - mfspr r1,hid0 ; Yes, this is a duplicate, keep it here - -eE64NoNap: crnot wasNapping,cr0_eq ; Remember if we were napping - andi. r1,r11,T_FAM ; Check FAM bit - beq++ eEnoFAM ; Is it FAM intercept - mfsrr1 r3 ; Load srr1 - andc r11,r11,r1 ; Clear FAM bit - rlwinm. r3,r3,0,MSR_PR_BIT,MSR_PR_BIT ; Are we trapping from supervisor state? - beq++ eEnoFAM ; From supervisor state - lwz r1,spcFlags(r2) ; Load spcFlags - rlwinm r1,r1,1+FamVMmodebit,30,31 ; Extract FamVMenabit and FamVMmodebit - cmpwi cr0,r1,2 ; Check FamVMena set without FamVMmode - bne++ eEnoFAM ; Can this context be FAM intercept - lwz r4,FAMintercept(r2) ; Load exceptions mask to intercept - li r3,0 ; Clear - srwi r1,r11,2 ; divide r11 by 4 - oris r3,r3,0x8000 ; Set r3 to 0x80000000 - srw r1,r3,r1 ; Set bit for current exception - and. 
r1,r1,r4 ; And current exception with the intercept mask - beq++ eEnoFAM ; Is it FAM intercept - b EXT(vmm_fam_exc) - - .align 5 - -eEnoFAM: lwz r1,pfAvailable(r2) ; Get the CPU features flags - dcbz128 0,r6 ; allocate 128-byte line with SRR0, SRR1, CR, XER, and LR - -; -; Remember, we are setting up CR6 with feature flags -; - std r7,saver7(r13) ; Save this one - mtcrf 0x80,r1 ; Put the features flags (that we care about) in the CR - std r8,saver8(r13) ; Save this one - mtcrf 0x40,r1 ; Put the features flags (that we care about) in the CR - mfsrr0 r6 ; Get the interruption SRR0 - mtcrf 0x20,r1 ; Put the features flags (that we care about) in the CR - mfsrr1 r7 ; Get the interrupt SRR1 - std r6,savesrr0(r13) ; Save the SRR0 - mtcrf 0x02,r1 ; Put the features flags (that we care about) in the CR - std r9,saver9(r13) ; Save this one - crmove featAltivec,pfAltivecb ; Set the Altivec flag - std r7,savesrr1(r13) ; Save SRR1 - mfsprg r9,3 ; Get rupt time R11 - std r10,saver10(r13) ; Save this one - mfsprg r6,2 ; Get interrupt time R13 - std r9,saver11(r13) ; Save rupt time R11 - mtsprg 2,r1 ; Set the feature flags - std r12,saver12(r13) ; Save this one - mflr r4 ; Get the LR - mftb r7 ; Get the timebase - std r6,saver13(r13) ; Save rupt R13 - std r7,ruptStamp(r2) ; Save the time stamp - std r7,SAVtime(r13) ; Save the time stamp - - bf++ wasNapping,notNappingSF ; Skip if not waking up from nap... 
- - ld r6,napStamp(r2) ; Pick up nap stamp - lis r3,hi16(EXT(machine_idle_ret)) ; Get high part of nap/doze return - sub r7,r7,r6 ; Subtract stamp from now - ld r6,napTotal(r2) ; Pick up total - add r6,r6,r7 ; Add low to total - ori r3,r3,lo16(EXT(machine_idle_ret)) ; Get low part of nap/doze return - std r6,napTotal(r2) ; Save the high total - std r3,savesrr0(r13) ; Modify to return to nap/doze exit - -notNappingSF: - std r14,saver14(r13) ; Save this one - std r15,saver15(r13) ; Save this one - stw r0,savecr(r13) ; Save rupt CR - mfctr r6 ; Get the CTR - std r16,saver16(r13) ; Save this one - std r4,savelr(r13) ; Save rupt LR - - std r17,saver17(r13) ; Save this one - li r7,savepmc ; Point to pmc area - std r18,saver18(r13) ; Save this one - lwz r17,spcFlags(r2) ; Get the special flags from per_proc - std r6,savectr(r13) ; Save rupt CTR - std r19,saver19(r13) ; Save this one - mfdar r6 ; Get the rupt DAR - std r20,saver20(r13) ; Save this one - - dcbz128 r7,r13 ; Clear out the pmc spot - - std r21,saver21(r13) ; Save this one - std r5,savexer(r13) ; Save the rupt XER - std r22,saver22(r13) ; Save this one - std r23,saver23(r13) ; Save this one - std r24,saver24(r13) ; Save this one - std r25,saver25(r13) ; Save this one - mfdsisr r7 ; Get the rupt DSISR - std r26,saver26(r13) ; Save this one - andis. r17,r17,hi16(perfMonitor) ; Is the performance monitor enabled? - std r27,saver27(r13) ; Save this one - li r10,emfp0 ; Point to floating point save - std r28,saver28(r13) ; Save this one - la r27,savevscr(r13) ; point to 32-byte line with VSCR and FPSCR - std r29,saver29(r13) ; Save R29 - std r30,saver30(r13) ; Save this one - std r31,saver31(r13) ; Save this one - std r6,savedar(r13) ; Save the rupt DAR - stw r7,savedsisr(r13) ; Save the rupt code DSISR - stw r11,saveexception(r13) ; Save the exception code - - beq++ noPerfMonSave64 ; Performance monitor not on... 
- - li r22,0 ; r22: zero - - mfspr r23,mmcr0_gp - mfspr r24,mmcr1_gp - mfspr r25,mmcra_gp - std r23,savemmcr0(r13) ; Save MMCR0 - std r24,savemmcr1(r13) ; Save MMCR1 - std r25,savemmcr2(r13) ; Save MMCRA - mtspr mmcr0_gp,r22 ; Leave MMCR0 clear - mtspr mmcr1_gp,r22 ; Leave MMCR1 clear - mtspr mmcra_gp,r22 ; Leave MMCRA clear - mfspr r23,pmc1_gp - mfspr r24,pmc2_gp - mfspr r25,pmc3_gp - mfspr r26,pmc4_gp - stw r23,savepmc+0(r13) ; Save PMC1 - stw r24,savepmc+4(r13) ; Save PMC2 - stw r25,savepmc+8(r13) ; Save PMC3 - stw r26,savepmc+12(r13) ; Save PMC4 - mfspr r23,pmc5_gp - mfspr r24,pmc6_gp - mfspr r25,pmc7_gp - mfspr r26,pmc8_gp - stw r23,savepmc+16(r13) ; Save PMC5 - stw r24,savepmc+20(r13) ; Save PMC6 - stw r25,savepmc+24(r13) ; Save PMC7 - stw r26,savepmc+28(r13) ; Save PMC8 - mtspr pmc1_gp,r22 ; Leave PMC1 clear - mtspr pmc2_gp,r22 ; Leave PMC2 clear - mtspr pmc3_gp,r22 ; Leave PMC3 clear - mtspr pmc4_gp,r22 ; Leave PMC4 clear - mtspr pmc5_gp,r22 ; Leave PMC5 clear - mtspr pmc6_gp,r22 ; Leave PMC6 clear - mtspr pmc7_gp,r22 ; Leave PMC7 clear - mtspr pmc8_gp,r22 ; Leave PMC8 clear - -noPerfMonSave64: - -; -; Everything is saved at this point, except for FPRs, and VMX registers. -; Time for us to get a new savearea and then trace interrupt if it is enabled. -; - - lwz r25,traceMask(0) ; Get the trace mask - li r0,SAVgeneral ; Get the savearea type value - lhz r19,PP_CPU_NUMBER(r2) ; Get the logical processor number - stb r0,SAVflags+2(r13) ; Mark valid context - rlwinm r22,r11,30,0,31 ; Divide interrupt code by 2 - li r23,trcWork ; Get the trace work area address - addi r22,r22,10 ; Adjust code so we shift into CR5 - li r26,0x8 ; Get start of cpu mask - rlwnm r7,r25,r22,22,22 ; Set CR5_EQ bit position to 0 if tracing allowed - srw r26,r26,r19 ; Get bit position of cpu number - mtcrf 0x04,r7 ; Set CR5 to show trace or not - and. 
r26,r26,r25 ; See if we trace this cpu - crandc cr5_eq,cr5_eq,cr0_eq ; Turn off tracing if cpu is disabled - - bne++ cr5,xcp64xit ; Skip all of this if no tracing here... - -; -; We select a trace entry using a compare and swap on the next entry field. -; Since we do not lock the actual trace buffer, there is a potential that -; another processor could wrap an trash our entry. Who cares? -; - - lwz r25,traceStart(0) ; Get the start of trace table - lwz r26,traceEnd(0) ; Get end of trace table - -trcselSF: lwarx r20,0,r23 ; Get and reserve the next slot to allocate - - addi r22,r20,LTR_size ; Point to the next trace entry - cmplw r22,r26 ; Do we need to wrap the trace table? - bne++ gotTrcEntSF ; No wrap, we got us a trace entry... - - mr r22,r25 ; Wrap back to start - -gotTrcEntSF: - stwcx. r22,0,r23 ; Try to update the current pointer - bne- trcselSF ; Collision, try again... - -#if ESPDEBUG - dcbf 0,r23 ; Force to memory - sync -#endif - -; -; Let us cut that trace entry now. -; -; Note that this code cuts a trace table entry for everything but the CutTrace call. -; An identical entry is made during normal CutTrace processing. Any entry -; format changes made must be done in both places. 
-; - - dcbz128 0,r20 ; Zap the trace entry - - lwz r9,SAVflags(r13) ; Get savearea flags - - ld r16,ruptStamp(r2) ; Get top of time base - ld r0,saver0(r13) ; Get back interrupt time R0 (we need this whether we trace or not) - std r16,LTR_timeHi(r20) ; Set the upper part of TB - ld r1,saver1(r13) ; Get back interrupt time R1 - rlwinm r9,r9,20,16,23 ; Isolate the special flags - ld r18,saver2(r13) ; Get back interrupt time R2 - std r0,LTR_r0(r20) ; Save off register 0 - rlwimi r9,r19,0,24,31 ; Slide in the cpu number - ld r3,saver3(r13) ; Restore this one - sth r9,LTR_cpu(r20) ; Stash the cpu number and special flags - std r1,LTR_r1(r20) ; Save off register 1 - ld r4,saver4(r13) ; Restore this one - std r18,LTR_r2(r20) ; Save off register 2 - ld r5,saver5(r13) ; Restore this one - ld r6,saver6(r13) ; Get R6 - std r3,LTR_r3(r20) ; Save off register 3 - lwz r16,savecr(r13) ; Get the CR value - std r4,LTR_r4(r20) ; Save off register 4 - mfsrr0 r17 ; Get SRR0 back, it is still good - std r5,LTR_r5(r20) ; Save off register 5 - std r6,LTR_r6(r20) ; Save off register 6 - mfsrr1 r18 ; SRR1 is still good in here - stw r16,LTR_cr(r20) ; Save the CR - std r17,LTR_srr0(r20) ; Save the SSR0 - std r18,LTR_srr1(r20) ; Save the SRR1 - - mfdar r17 ; Get this back - ld r16,savelr(r13) ; Get the LR - std r17,LTR_dar(r20) ; Save the DAR - mfctr r17 ; Get the CTR (still good in register) - std r16,LTR_lr(r20) ; Save the LR - std r17,LTR_ctr(r20) ; Save off the CTR - mfdsisr r17 ; Get the DSISR - std r13,LTR_save(r20) ; Save the savearea - stw r17,LTR_dsisr(r20) ; Save the DSISR - sth r11,LTR_excpt(r20) ; Save the exception type -#if 0 - lwz r17,FPUowner(r2) ; (TEST/DEBUG) Get the current floating point owner - stw r17,LTR_rsvd0(r20) ; (TEST/DEBUG) Record the owner -#endif - -#if ESPDEBUG - dcbf 0,r20 ; Force to memory - sync ; Make sure it all goes -#endif -xcp64xit: mr r14,r11 ; Save the interrupt code across the call - bl EXT(save_get_phys_64) ; Grab a savearea - mfsprg r2,0 ; Get the 
per_proc info - li r10,emfp0 ; Point to floating point save - mr r11,r14 ; Get the exception code back - dcbz128 r10,r2 ; Clear for speed - std r3,next_savearea(r2) ; Store the savearea for the next rupt - b xcpCommon ; Go join the common interrupt processing... - -; -; All of the context is saved. Now we will get a -; fresh savearea. After this we can take an interrupt. -; - - .align 5 - -xcpCommon: - -; -; Here we will save some floating point and vector status -; and we also set a clean default status for a new interrupt level. -; Note that we assume that emfp0 is on an altivec boundary -; and that R10 points to it (as a displacemnt from R2). -; -; We need to save the FPSCR as if it is normal context. -; This is because pending exceptions will cause an exception even if -; FP is disabled. We need to clear the FPSCR when we first start running in the -; kernel. -; - - stfd f0,emfp0(r2) ; Save FPR0 - stfd f1,emfp1(r2) ; Save FPR1 - li r19,0 ; Assume no Altivec - mffs f0 ; Get the FPSCR - lfd f1,Zero(0) ; Make a 0 - stfd f0,savefpscrpad(r13) ; Save the FPSCR - li r9,0 ; Get set to clear VRSAVE - mtfsf 0xFF,f1 ; Clear it - addi r14,r10,16 ; Displacement to second vector register - lfd f0,emfp0(r2) ; Restore FPR0 - la r28,savevscr(r13) ; Point to the status area - lfd f1,emfp1(r2) ; Restore FPR1 - - bf featAltivec,noavec ; No Altivec on this CPU... 
- - stvxl v0,r10,r2 ; Save a register - stvxl v1,r14,r2 ; Save a second register - mfspr r19,vrsave ; Get the VRSAVE register - mfvscr v0 ; Get the vector status register - vspltish v1,1 ; Turn on the non-Java bit and saturate - stvxl v0,0,r28 ; Save the vector status - vspltisw v0,1 ; Turn on the saturate bit - vxor v1,v1,v0 ; Turn off saturate - mtvscr v1 ; Set the non-java, no saturate status for new level - mtspr vrsave,r9 ; Clear VRSAVE for each interrupt level - - lvxl v0,r10,r2 ; Restore first work register - lvxl v1,r14,r2 ; Restore second work register - -noavec: stw r19,savevrsave(r13) ; Save the vector register usage flags - -; -; We are now done saving all of the context. Start filtering the interrupts. -; Note that a Redrive will count as an actual interrupt. -; Note also that we take a lot of system calls so we will start decode here. -; - -Redrive: - lwz r22,SAVflags(r13) ; Pick up the flags - lwz r0,saver0+4(r13) ; Get back interrupt time syscall number - mfsprg r2,0 ; Restore per_proc - - lwz r20,lo16(xcpTable)(r11) ; Get the interrupt handler (note: xcpTable must be in 1st 32k of physical memory) - la r12,hwCounts(r2) ; Point to the exception count area - andis. r24,r22,hi16(SAVeat) ; Should we eat this one? - rlwinm r22,r22,SAVredriveb+1,31,31 ; Get a 1 if we are redriving - add r12,r12,r11 ; Point to the count - lwz r25,0(r12) ; Get the old value - lwz r23,hwRedrives(r2) ; Get the redrive count - crmove cr3_eq,cr0_eq ; Remember if we are ignoring - xori r24,r22,1 ; Get the NOT of the redrive - mtctr r20 ; Point to the interrupt handler - mtcrf 0x80,r0 ; Set our CR0 to the high nybble of possible syscall code - add r25,r25,r24 ; Count this one if not a redrive - add r23,r23,r22 ; Count this one if if is a redrive - crandc cr0_lt,cr0_lt,cr0_gt ; See if we have R0 equal to 0b10xx...x - stw r25,0(r12) ; Store it back - stw r23,hwRedrives(r2) ; Save the redrive count - bne-- cr3,IgnoreRupt ; Interruption is being ignored... 
- bctr ; Go process the exception... - - -; -; Exception vector filter table (like everything in this file, must be in 1st 32KB of physical memory) -; - - .align 7 - -xcpTable: - .long EatRupt ; T_IN_VAIN - .long PassUpTrap ; T_RESET - .long MachineCheck ; T_MACHINE_CHECK - .long EXT(handlePF) ; T_DATA_ACCESS - .long EXT(handlePF) ; T_INSTRUCTION_ACCESS - .long PassUpRupt ; T_INTERRUPT - .long EXT(AlignAssist) ; T_ALIGNMENT - .long ProgramChk ; T_PROGRAM - .long PassUpFPU ; T_FP_UNAVAILABLE - .long PassUpRupt ; T_DECREMENTER - .long PassUpTrap ; T_IO_ERROR - .long PassUpTrap ; T_RESERVED - .long xcpSyscall ; T_SYSTEM_CALL - .long PassUpTrap ; T_TRACE - .long PassUpTrap ; T_FP_ASSIST - .long PassUpTrap ; T_PERF_MON - .long PassUpVMX ; T_VMX - .long PassUpTrap ; T_INVALID_EXCP0 - .long PassUpTrap ; T_INVALID_EXCP1 - .long PassUpTrap ; T_INVALID_EXCP2 - .long PassUpTrap ; T_INSTRUCTION_BKPT - .long PassUpRupt ; T_SYSTEM_MANAGEMENT - .long EXT(AltivecAssist) ; T_ALTIVEC_ASSIST - .long PassUpRupt ; T_THERMAL - .long PassUpTrap ; T_INVALID_EXCP5 - .long PassUpTrap ; T_INVALID_EXCP6 - .long PassUpTrap ; T_INVALID_EXCP7 - .long PassUpTrap ; T_INVALID_EXCP8 - .long PassUpTrap ; T_INVALID_EXCP9 - .long PassUpTrap ; T_INVALID_EXCP10 - .long PassUpTrap ; T_INVALID_EXCP11 - .long PassUpTrap ; T_INVALID_EXCP12 - .long PassUpTrap ; T_INVALID_EXCP13 - - .long PassUpTrap ; T_RUNMODE_TRACE - - .long PassUpRupt ; T_SIGP - .long PassUpTrap ; T_PREEMPT - .long conswtch ; T_CSWITCH - .long PassUpRupt ; T_SHUTDOWN - .long PassUpAbend ; T_CHOKE - - .long EXT(handleDSeg) ; T_DATA_SEGMENT - .long EXT(handleISeg) ; T_INSTRUCTION_SEGMENT - - .long WhoaBaby ; T_SOFT_PATCH - .long WhoaBaby ; T_MAINTENANCE - .long WhoaBaby ; T_INSTRUMENTATION - .long WhoaBaby ; T_ARCHDEP0 - .long EatRupt ; T_HDEC -; -; Just what the heck happened here???? -; NB: also get here from UFT dispatch table, on bogus index -; - -WhoaBaby: b . 
; Open the hood and wait for help - - .align 5 - -IgnoreRupt: - lwz r20,hwIgnored(r2) ; Grab the ignored interruption count - addi r20,r20,1 ; Count this one - stw r20,hwIgnored(r2) ; Save the ignored count - b EatRupt ; Ignore it... - - - -; -; System call -; - - .align 5 - -xcpSyscall: lis r20,hi16(EXT(shandler)) ; Assume this is a normal one, get handler address - rlwinm r6,r0,1,0,31 ; Move sign bit to the end - ori r20,r20,lo16(EXT(shandler)) ; Assume this is a normal one, get handler address - bnl++ cr0,PassUp ; R0 not 0b10xxx...x, can not be any kind of magical system call, just pass it up... - lwz r7,savesrr1+4(r13) ; Get the entering MSR (low half) - lwz r1,dgFlags(0) ; Get the flags - cmplwi cr2,r6,1 ; See if original R0 had the CutTrace request code in it - - rlwinm. r7,r7,0,MSR_PR_BIT,MSR_PR_BIT ; Did we come from user state? - beq++ FCisok ; From supervisor state... - - rlwinm. r1,r1,0,enaUsrFCallb,enaUsrFCallb ; Are they valid? - beq++ PassUp ; No, treat as a normal one... - -FCisok: beq++ cr2,EatRupt ; This is a CutTrace system call, we are done with it... - -; -; Here is where we call the firmware. If it returns T_IN_VAIN, that means -; that it has handled the interruption. Remember: thou shalt not trash R13 -; while you are away. Anything else is ok. -; - - lwz r3,saver3+4(r13) ; Restore the first parameter - b EXT(FirmwareCall) ; Go handle the firmware call.... - -; -; Here is where we return from the firmware call -; - - .align 5 - .globl EXT(FCReturn) - -LEXT(FCReturn) - cmplwi r3,T_IN_VAIN ; Was it handled? - beq++ EatRupt ; Interrupt was handled... - mr r11,r3 ; Put the rupt code into the right register - b Redrive ; Go through the filter again... - - -; -; Here is where we return from the PTE miss and segment exception handler -; - - .align 5 - .globl EXT(PFSExit) - -LEXT(PFSExit) - -#if 0 - mfsprg r2,0 ; (BRINGUP) - lwz r0,savedsisr(r13) ; (BRINGUP) - andis. 
r0,r0,hi16(dsiAC) ; (BRINGUP) - beq++ didnthit ; (BRINGUP) - lwz r0,20(0) ; (BRINGUP) - mr. r0,r0 ; (BRINGUP) - bne-- didnthit ; (BRINGUP) -#if 0 - li r0,1 ; (BRINGUP) - stw r0,20(0) ; (BRINGUP) - lis r0,hi16(Choke) ; (BRINGUP) - ori r0,r0,lo16(Choke) ; (BRINGUP) - sc ; (BRINGUP) -#endif - - lwz r4,savesrr0+4(r13) ; (BRINGUP) - lwz r8,savesrr1+4(r13) ; (BRINGUP) - lwz r6,savedar+4(r13) ; (BRINGUP) - rlwinm. r0,r8,0,MSR_IR_BIT,MSR_IR_BIT ; (BRINGUP) - mfmsr r9 ; (BRINGUP) - ori r0,r9,lo16(MASK(MSR_DR)) ; (BRINGUP) - beq-- hghg ; (BRINGUP) - mtmsr r0 ; (BRINGUP) - isync ; (BRINGUP) - -hghg: lwz r5,0(r4) ; (BRINGUP) - beq-- hghg1 ; (BRINGUP) - mtmsr r9 ; (BRINGUP) - isync ; (BRINGUP) - -hghg1: rlwinm r7,r5,6,26,31 ; (BRINGUP) - rlwinm r27,r5,14,24,28 ; (BRINGUP) - addi r3,r13,saver0+4 ; (BRINGUP) - lwzx r3,r3,r27 ; (BRINGUP) - -#if 0 - lwz r27,patcharea+4(r2) ; (BRINGUP) - mr. r3,r3 ; (BRINGUP) - bne++ nbnbnb ; (BRINGUP) - addi r27,r27,1 ; (BRINGUP) - stw r27,patcharea+4(r2) ; (BRINGUP) -nbnbnb: -#endif - - rlwinm. 
r28,r8,0,MSR_DR_BIT,MSR_DR_BIT ; (BRINGUP) - rlwinm r27,r6,0,0,29 ; (BRINGUP) - ori r28,r9,lo16(MASK(MSR_DR)) ; (BRINGUP) - mfspr r10,dabr ; (BRINGUP) - li r0,0 ; (BRINGUP) - mtspr dabr,r0 ; (BRINGUP) - cmplwi cr1,r7,31 ; (BRINGUP) - beq-- qqq0 ; (BRINGUP) - mtmsr r28 ; (BRINGUP) -qqq0: - isync ; (BRINGUP) - - lwz r27,0(r27) ; (BRINGUP) - Get original value - - bne cr1,qqq1 ; (BRINGUP) - - rlwinm r5,r5,31,22,31 ; (BRINGUP) - cmplwi cr1,r5,151 ; (BRINGUP) - beq cr1,qqq3 ; (BRINGUP) - cmplwi cr1,r5,407 ; (BRINGUP) - beq cr1,qqq2 ; (BRINGUP) - cmplwi cr1,r5,215 ; (BRINGUP) - beq cr1,qqq0q ; (BRINGUP) - cmplwi cr1,r5,1014 ; (BRINGUP) - beq cr1,qqqm1 ; (BRINGUP) - - lis r0,hi16(Choke) ; (BRINGUP) - ori r0,r0,lo16(Choke) ; (BRINGUP) - sc ; (BRINGUP) - -qqqm1: rlwinm r7,r6,0,0,26 ; (BRINGUP) - stw r0,0(r7) ; (BRINGUP) - stw r0,4(r7) ; (BRINGUP) - stw r0,8(r7) ; (BRINGUP) - stw r0,12(r7) ; (BRINGUP) - stw r0,16(r7) ; (BRINGUP) - stw r0,20(r7) ; (BRINGUP) - stw r0,24(r7) ; (BRINGUP) - stw r0,28(r7) ; (BRINGUP) - b qqq9 - -qqq1: cmplwi r7,38 ; (BRINGUP) - bgt qqq2 ; (BRINGUP) - blt qqq3 ; (BRINGUP) - -qqq0q: stb r3,0(r6) ; (BRINGUP) - b qqq9 ; (BRINGUP) - -qqq2: sth r3,0(r6) ; (BRINGUP) - b qqq9 ; (BRINGUP) - -qqq3: stw r3,0(r6) ; (BRINGUP) - -qqq9: -#if 0 - rlwinm r7,r6,0,0,29 ; (BRINGUP) - lwz r0,0(r7) ; (BRINGUP) - Get newest value -#else - lis r7,hi16(0x000792B8) ; (BRINGUP) - ori r7,r7,lo16(0x000792B8) ; (BRINGUP) - lwz r0,0(r7) ; (BRINGUP) - Get newest value -#endif - mtmsr r9 ; (BRINGUP) - mtspr dabr,r10 ; (BRINGUP) - isync ; (BRINGUP) - -#if 0 - lwz r28,patcharea+12(r2) ; (BRINGUP) - mr. r28,r28 ; (BRINGUP) - bne++ qqq12 ; (BRINGUP) - lis r28,0x4000 ; (BRINGUP) - -qqq12: stw r27,0(r28) ; (BRINGUP) - lwz r6,savedar+4(r13) ; (BRINGUP) - stw r0,4(r28) ; (BRINGUP) - stw r4,8(r28) ; (BRINGUP) - stw r6,12(r28) ; (BRINGUP) - addi r28,r28,16 ; (BRINGUP) - mr. 
r3,r3 ; (BRINGUP) - stw r28,patcharea+12(r2) ; (BRINGUP) - lwz r10,patcharea+8(r2) ; (BRINGUP) - lwz r0,patcharea+4(r2) ; (BRINGUP) -#endif - -#if 1 - stw r0,patcharea(r2) ; (BRINGUP) -#endif - -#if 0 - xor r28,r0,r27 ; (BRINGUP) - See how much it changed - rlwinm r28,r28,24,24,31 ; (BRINGUP) - cmplwi r28,1 ; (BRINGUP) - - ble++ qqq10 ; (BRINGUP) - - mr r7,r0 ; (BRINGUP) - li r0,1 ; (BRINGUP) - stw r0,20(0) ; (BRINGUP) - lis r0,hi16(Choke) ; (BRINGUP) - ori r0,r0,lo16(Choke) ; (BRINGUP) - sc ; (BRINGUP) -#endif - - -qqq10: addi r4,r4,4 ; (BRINGUP) - stw r4,savesrr0+4(r13) ; (BRINGUP) - - li r11,T_IN_VAIN ; (BRINGUP) - b EatRupt ; (BRINGUP) - -didnthit: ; (BRINGUP) -#endif -#if 0 - lwz r0,20(0) ; (BRINGUP) - mr. r0,r0 ; (BRINGUP) - beq++ opopop ; (BRINGUP) - li r0,0 ; (BRINGUP) - stw r0,20(0) ; (BRINGUP) - lis r0,hi16(Choke) ; (BRINGUP) - ori r0,r0,lo16(Choke) ; (BRINGUP) - sc ; (BRINGUP) -opopop: -#endif - lwz r0,savesrr1+4(r13) ; Get the MSR in use at exception time - cmplwi cr1,r11,T_IN_VAIN ; Was it handled? - rlwinm. r4,r0,0,MSR_PR_BIT,MSR_PR_BIT ; Are we trapping from supervisor state? - beq++ cr1,EatRupt ; Yeah, just blast back to the user... - beq-- NoFamPf - mfsprg r2,0 ; Get back per_proc - lwz r1,spcFlags(r2) ; Load spcFlags - rlwinm r1,r1,1+FamVMmodebit,30,31 ; Extract FamVMenabit and FamVMmodebit - cmpi cr0,r1,2 ; Check FamVMena set without FamVMmode - bne-- cr0,NoFamPf - lwz r6,FAMintercept(r2) ; Load exceptions mask to intercept - li r5,0 ; Clear - srwi r1,r11,2 ; divide r11 by 4 - oris r5,r5,0x8000 ; Set r5 to 0x80000000 - srw r1,r5,r1 ; Set bit for current exception - and. r1,r1,r6 ; And current exception with the intercept mask - beq++ NoFamPf ; Is it FAM intercept - bl EXT(vmm_fam_pf) - b EatRupt - -NoFamPf: andi. r4,r0,lo16(MASK(MSR_RI)) ; See if the recover bit is on - lis r0,0x8000 ; Get 0xFFFFFFFF80000000 - add r0,r0,r0 ; Get 0xFFFFFFFF00000000 - beq++ PassUpTrap ; Not on, normal case... -; -; Here is where we handle the "recovery mode" stuff. 
-; This is set by an emulation routine to trap any faults when it is fetching data or -; instructions. -; -; If we get a fault, we turn off RI, set CR0_EQ to false, bump the PC, and set R0 -; and R1 to the DAR and DSISR, respectively. -; - lwz r3,savesrr0(r13) ; Get the failing instruction address - lwz r4,savesrr0+4(r13) ; Get the failing instruction address - lwz r5,savecr(r13) ; Get the condition register - or r4,r4,r0 ; Fill the high part with foxes - lwz r0,savedar(r13) ; Get the DAR - addic r4,r4,4 ; Skip failing instruction - lwz r6,savedar+4(r13) ; Get the DAR - addze r3,r3 ; Propagate carry - rlwinm r5,r5,0,3,1 ; Clear CR0_EQ to let emulation code know we failed - lwz r7,savedsisr(r13) ; Grab the DSISR - stw r3,savesrr0(r13) ; Save resume address - stw r4,savesrr0+4(r13) ; Save resume address - stw r5,savecr(r13) ; And the resume CR - stw r0,saver0(r13) ; Pass back the DAR - stw r6,saver0+4(r13) ; Pass back the DAR - stw r7,saver1+4(r13) ; Pass back the DSISR - b EatRupt ; Resume emulated code - -; -; Here is where we handle the context switch firmware call. The old -; context has been saved. The new savearea is in kind of hokey, the high order -; half is stored in saver7 and the low half is in saver3. We will just -; muck around with the savearea pointers, and then join the exit routine -; - - .align 5 - -conswtch: - li r0,0xFFF ; Get page boundary - mr r29,r13 ; Save the save - andc r30,r13,r0 ; Round down to page boundary (64-bit safe) - lwz r5,saver3+4(r13) ; Switch to the new savearea - bf-- pf64Bitb,xcswNo64 ; Not 64-bit... 
- lwz r6,saver7+4(r13) ; Get the high order half - sldi r6,r6,32 ; Position high half - or r5,r5,r6 ; Merge them - -xcswNo64: lwz r30,SACvrswap+4(r30) ; get real to virtual translation - mr r13,r5 ; Switch saveareas - li r0,0 ; Clear this - xor r27,r29,r30 ; Flip to virtual - stw r0,saver3(r5) ; Push the new virtual savearea to the switch to routine - stw r27,saver3+4(r5) ; Push the new virtual savearea to the switch to routine - b EatRupt ; Start it up... - -; -; Handle machine check here. -; -; ? -; - - .align 5 - -MachineCheck: - - bt++ pf64Bitb,mck64 ; ? - - lwz r27,savesrr1+4(r13) ; Pick up srr1 - -; -; Check if the failure was in -; ml_probe_read. If so, this is expected, so modify the PC to -; ml_proble_read_mck and then eat the exception. -; - lwz r30,savesrr0+4(r13) ; Get the failing PC - lis r28,hi16(EXT(ml_probe_read_mck)) ; High order part - lis r27,hi16(EXT(ml_probe_read)) ; High order part - ori r28,r28,lo16(EXT(ml_probe_read_mck)) ; Get the low part - ori r27,r27,lo16(EXT(ml_probe_read)) ; Get the low part - cmplw r30,r28 ; Check highest possible - cmplw cr1,r30,r27 ; Check lowest - bge- PassUpTrap ; Outside of range - blt- cr1,PassUpTrap ; Outside of range -; -; We need to fix up the BATs here because the probe -; routine messed them all up... As long as we are at it, -; fix up to return directly to caller of probe. 
-; - - lis r11,hi16(EXT(shadow_BAT)+shdDBAT) ; Get shadow address - ori r11,r11,lo16(EXT(shadow_BAT)+shdDBAT) ; Get shadow address - - lwz r30,0(r11) ; Pick up DBAT 0 high - lwz r28,4(r11) ; Pick up DBAT 0 low - lwz r27,8(r11) ; Pick up DBAT 1 high - lwz r18,16(r11) ; Pick up DBAT 2 high - lwz r11,24(r11) ; Pick up DBAT 3 high - - sync - mtdbatu 0,r30 ; Restore DBAT 0 high - mtdbatl 0,r28 ; Restore DBAT 0 low - mtdbatu 1,r27 ; Restore DBAT 1 high - mtdbatu 2,r18 ; Restore DBAT 2 high - mtdbatu 3,r11 ; Restore DBAT 3 high - sync - - lwz r28,savelr+4(r13) ; Get return point - lwz r27,saver0+4(r13) ; Get the saved MSR - li r30,0 ; Get a failure RC - stw r28,savesrr0+4(r13) ; Set the return point - stw r27,savesrr1+4(r13) ; Set the continued MSR - stw r30,saver3+4(r13) ; Set return code - b EatRupt ; Yum, yum, eat it all up... - -; -; 64-bit machine checks -; - -mck64: - -; -; NOTE: WE NEED TO RETHINK RECOVERABILITY A BIT - radar 3167190 -; - - ld r23,savesrr0(r13) ; Grab the SRR0 in case we need bad instruction - ld r20,savesrr1(r13) ; Grab the SRR1 so we can decode the thing - lwz r21,savedsisr(r13) ; We might need this in a bit - ld r22,savedar(r13) ; We might need this in a bit - - lis r8,AsyMCKSrc ; Get the Async MCK Source register address - mfsprg r19,2 ; Get the feature flags - ori r8,r8,0x8000 ; Set to read data - rlwinm. r0,r19,0,pfSCOMFixUpb,pfSCOMFixUpb ; Do we need to fix the SCOM data? 
- - sync - - mtspr scomc,r8 ; Request the MCK source - mfspr r24,scomd ; Get the source - mfspr r8,scomc ; Get back the status (we just ignore it) - sync - isync - - lis r8,AsyMCKRSrc ; Get the Async MCK Source AND mask address - li r9,0 ; Get and AND mask of 0 - - sync - - mtspr scomd,r9 ; Set the AND mask to 0 - mtspr scomc,r8 ; Write the AND mask and clear conditions - mfspr r8,scomc ; Get back the status (we just ignore it) - sync - isync - - lis r8,cFIR ; Get the Core FIR register address - ori r8,r8,0x8000 ; Set to read data - - sync - - mtspr scomc,r8 ; Request the Core FIR - mfspr r25,scomd ; Get the source - mfspr r8,scomc ; Get back the status (we just ignore it) - sync - isync - - lis r8,cFIRrst ; Get the Core FIR AND mask address - - sync - - mtspr scomd,r9 ; Set the AND mask to 0 - mtspr scomc,r8 ; Write the AND mask and clear conditions - mfspr r8,scomc ; Get back the status (we just ignore it) - sync - isync - - lis r8,l2FIR ; Get the L2 FIR register address - ori r8,r8,0x8000 ; Set to read data - - sync - - mtspr scomc,r8 ; Request the L2 FIR - mfspr r26,scomd ; Get the source - mfspr r8,scomc ; Get back the status (we just ignore it) - sync - isync - - lis r8,l2FIRrst ; Get the L2 FIR AND mask address - - sync - - mtspr scomd,r9 ; Set the AND mask to 0 - mtspr scomc,r8 ; Write the AND mask and clear conditions - mfspr r8,scomc ; Get back the status (we just ignore it) - sync - isync - - lis r8,busFIR ; Get the Bus FIR register address - ori r8,r8,0x8000 ; Set to read data - - sync - - mtspr scomc,r8 ; Request the Bus FIR - mfspr r27,scomd ; Get the source - mfspr r8,scomc ; Get back the status (we just ignore it) - sync - isync - - lis r8,busFIRrst ; Get the Bus FIR AND mask address - - sync - - mtspr scomd,r9 ; Set the AND mask to 0 - mtspr scomc,r8 ; Write the AND mask and clear conditions - mfspr r8,scomc ; Get back the status (we just ignore it) - sync - isync - -; Note: bug in early chips where scom reads are shifted right by 1. 
We fix that here. -; Also note that we will lose bit 63 - - beq++ mckNoFix ; No fix up is needed - sldi r24,r24,1 ; Shift left 1 - sldi r25,r25,1 ; Shift left 1 - sldi r26,r26,1 ; Shift left 1 - sldi r27,r27,1 ; Shift left 1 - -mckNoFix: std r24,savexdat0(r13) ; Save the MCK source in case we pass the error - std r25,savexdat1(r13) ; Save the Core FIR in case we pass the error - std r26,savexdat2(r13) ; Save the L2 FIR in case we pass the error - std r27,savexdat3(r13) ; Save the BUS FIR in case we pass the error - - rlwinm. r0,r20,0,mckIFUE-32,mckIFUE-32 ; Is this some kind of uncorrectable? - bne mckUE ; Yeah... - - rlwinm. r0,r20,0,mckLDST-32,mckLDST-32 ; Some kind of load/store error? - bne mckHandleLDST ; Yes... - - rldicl. r0,r20,46,62 ; Get the error cause code - beq mckNotSure ; We need some more checks for this one... - - cmplwi r0,2 ; Check for TLB parity error - blt mckSLBparity ; This is an SLB parity error... - bgt mckhIFUE ; This is an IFetch tablewalk reload UE... - -; IFetch TLB parity error - - isync - tlbiel r23 ; Locally invalidate TLB entry for iaddr - sync ; Wait for it - b ceMck ; All recovered... - -; SLB parity error. This could be software caused. We get one if there is -; more than 1 valid SLBE with a matching ESID. That one we do not want to -; try to recover from. Search for it and if we get it, panic. - -mckSLBparity: - crclr cr0_eq ; Make sure we are not equal so we take correct exit - - la r3,emvr0(r2) ; Use this to keep track of valid ESIDs we find - li r5,0 ; Start with index 0 - -mckSLBck: la r4,emvr0(r2) ; Use this to keep track of valid ESIDs we find - slbmfee r6,r5 ; Get the next SLBE - andis. r0,r6,0x0800 ; See if valid bit is on - beq mckSLBnx ; Skip invalid and go to next - -mckSLBck2: cmpld r4,r3 ; Have we reached the end of the table? - beq mckSLBne ; Yes, go enter this one... - ld r7,0(r4) ; Pick up the saved ESID - cmpld r6,r7 ; Is this a match? - beq mckSLBrec ; Whoops, I did bad, recover and pass up... 
- addi r4,r4,8 ; Next table entry - b mckSLBck2 ; Check the next... - -mckSLBnx: addi r5,r5,1 ; Point to next SLBE - cmplwi r5,64 ; Have we checked all of them? - bne++ mckSLBck ; Not yet, check again... - b mckSLBrec ; We looked at them all, go recover... - -mckSLBne: std r6,0(r3) ; Save this ESID - addi r3,r3,8 ; Point to the new slot - b mckSLBnx ; Go do the next SLBE... - -; Recover an SLB error - -mckSLBrec: li r0,0 ; Set an SLB slot index of 0 - slbia ; Trash all SLB entries (except for entry 0 that is) - slbmfee r7,r0 ; Get the entry that is in SLB index 0 - rldicr r7,r7,0,35 ; Clear the valid bit and the rest - slbie r7 ; Invalidate it - - li r3,0 ; Set the first SLBE - -mckSLBclr: slbmte r0,r3 ; Clear the whole entry to 0s - addi r3,r3,1 ; Bump index - cmplwi cr1,r3,64 ; Have we done them all? - bne++ cr1,mckSLBclr ; Yup.... - - sth r3,ppInvSeg(r2) ; Store non-zero to trigger SLB reload - bne++ ceMck ; This was not a programming error, all recovered... - b ueMck ; Pass the software error up... - -; -; Handle a load/store unit error. We need to decode the DSISR -; - -mckHandleLDST: - rlwinm. r0,r21,0,mckL1DCPE,mckL1DCPE ; An L1 data cache parity error? - bne++ mckL1D ; Yeah, we dealt with this back in the vector... - - rlwinm. r0,r21,0,mckL1DTPE,mckL1DTPE ; An L1 tag error? - bne++ mckL1T ; Yeah, we dealt with this back in the vector... - - rlwinm. r0,r21,0,mckUEdfr,mckUEdfr ; Is the a "deferred" UE? - bne mckDUE ; Yeah, go see if expected... - - rlwinm. r0,r21,0,mckUETwDfr,mckUETwDfr ; Is the a "deferred" tablewalk UE? - bne mckDTW ; Yeah, no recovery... - - rlwinm. r0,r21,0,mckSLBPE,mckSLBPE ; SLB parity error? - bne mckSLBparity ; Yeah, go attempt recovery.... 
- -; This is a recoverable D-ERAT or TLB error - - la r9,hwMckERCPE(r2) ; Get DERAT parity error count - -mckInvDAR: isync - tlbiel r22 ; Locally invalidate the TLB entry - sync - - lwz r21,0(r9) ; Get count - addi r21,r21,1 ; Count this one - stw r21,0(r9) ; Stick it back - - b ceMck ; All recovered... - -; -; When we come here, we are not quite sure what the error is. We need to -; dig a bit further. -; -; R24 is interrupt source -; R25 is Core FIR -; -; Note that both have been cleared already. -; - -mckNotSure: - rldicl. r0,r24,AsyMCKfir+1,63 ; Something in the FIR? - bne-- mckFIR ; Yup, go check some more... - - rldicl. r0,r24,AsyMCKhri+1,63 ; Hang recovery? - bne-- mckHangRcvr ; Yup... - - rldicl. r0,r24,AsyMCKext+1,63 ; External signal? - bne-- mckExtMck ; Yup... - -; -; We really do not know what this one is or what to do with it... -; - -mckUnk: lwz r21,hwMckUnk(r2) ; Get unknown error count - addi r21,r21,1 ; Count it - stw r21,hwMckUnk(r2) ; Stuff it - b ueMck ; Go south, young man... - -; -; Hang recovery. This is just a notification so we only count. -; - -mckHangRcrvr: - lwz r21,hwMckHang(r2) ; Get hang recovery count - addi r21,r21,1 ; Count this one - stw r21,hwMckHang(r2) ; Stick it back - b ceMck ; All recovered... - -; -; Externally signaled MCK. No recovery for the moment, but we this may be -; where we handle ml_probe_read problems eventually. -; -mckExtMck: - lwz r21,hwMckHang(r2) ; Get hang recovery count - addi r21,r21,1 ; Count this one - stw r21,hwMckHang(r2) ; Stick it back - b ceMck ; All recovered... - -; -; Machine check cause is in a FIR. Suss it out here. -; Core FIR is in R25 and has been cleared in HW. -; - -mckFIR: rldicl. r0,r25,cFIRICachePE+1,63 ; I-Cache parity error? - la r19,hwMckICachePE(r2) ; Point to counter - bne mckInvICache ; Go invalidate I-Cache... - - rldicl. r0,r25,cFIRITagPE0+1,63 ; I-Cache tag parity error? - la r19,hwMckITagPE(r2) ; Point to counter - bne mckInvICache ; Go invalidate I-Cache... - - rldicl. 
r0,r25,cFIRITagPE1+1,63 ; I-Cache tag parity error? - la r19,hwMckITagPE(r2) ; Point to counter - bne mckInvICache ; Go invalidate I-Cache... - - rldicl. r0,r25,cFIRIEratPE+1,63 ; IERAT parity error? - la r19,hwMckIEratPE(r2) ; Point to counter - bne mckInvERAT ; Go invalidate ERATs... - - rldicl. r0,r25,cFIRIFUL2UE+1,63 ; IFetch got L2 UE? - bne mckhIFUE ; Go count and pass up... - - rldicl. r0,r25,cFIRDCachePE+1,63 ; D-Cache PE? - bne mckL1D ; Handled, just go count... - - rldicl. r0,r25,cFIRDTagPE+1,63 ; D-Cache tag PE? - bne mckL1T ; Handled, just go count... - - rldicl. r0,r25,cFIRDEratPE+1,63 ; DERAT PE? - la r19,hwMckDEratPE(r2) ; Point to counter - bne mckInvERAT ; Go invalidate ERATs... - - rldicl. r0,r25,cFIRTLBPE+1,63 ; TLB PE? - la r9,hwMckTLBPE(r2) ; Get TLB parity error count - bne mckInvDAR ; Go recover... - - rldicl. r0,r25,cFIRSLBPE+1,63 ; SLB PE? - bne mckSLBparity ; Cope with it... - - b mckUnk ; Have not a clue... - -; -; General recovery for I-Cache errors. Just flush it completely. -; - - .align 7 ; Force into cache line - -mckInvICache: - lis r0,0x0080 ; Get a 0x0080 (bit 9 >> 32) - mfspr r21,hid1 ; Get the current HID1 - sldi r0,r0,32 ; Get the "forced ICBI match" bit - or r0,r0,r21 ; Set forced match - - isync - mtspr hid1,r0 ; Stick it - mtspr hid1,r0 ; Stick it again - isync - - li r6,0 ; Start at 0 - -mckIcbi: icbi 0,r6 ; Kill I$ - addi r6,r6,128 ; Next line - andis. r5,r6,1 ; Have we done them all? - beq++ mckIcbi ; Not yet... - - isync - mtspr hid1,r21 ; Restore original HID1 - mtspr hid1,r21 ; Stick it again - isync - - lwz r5,0(r19) ; Get the counter - addi r5,r5,1 ; Count it - stw r5,0(r19) ; Stuff it back - b ceMck ; All recovered... - - -; General recovery for ERAT problems - handled in exception vector already - -mckInvERAT: lwz r21,0(r19) ; Get the exception count spot - addi r21,r21,1 ; Count this one - stw r21,0(r19) ; Save count - b ceMck ; All recovered... - -; General hang recovery - this is a notification only, just count. 
- -mckHangRcvr: - lwz r21,hwMckHang(r2) ; Get hang recovery count - addi r21,r21,1 ; Count this one - stw r21,hwMckHang(r2) ; Stick it back - b ceMck ; All recovered... - - -; -; These are the uncorrectable errors, just count them then pass it along. -; - -mckUE: lwz r21,hwMckUE(r2) ; Get general uncorrectable error count - addi r21,r21,1 ; Count it - stw r21,hwMckUE(r2) ; Stuff it - b ueMck ; Go south, young man... - -mckhIFUE: lwz r21,hwMckIUEr(r2) ; Get I-Fetch TLB reload uncorrectable error count - addi r21,r21,1 ; Count it - stw r21,hwMckIUEr(r2) ; Stuff it - b ueMck ; Go south, young man... - -mckDUE: lwz r21,hwMckDUE(r2) ; Get deferred uncorrectable error count - addi r21,r21,1 ; Count it - stw r21,hwMckDUE(r2) ; Stuff it - -; -; Right here is where we end up after a failure on a ml_probe_read_64. -; We will check if that is the case, and if so, fix everything up and -; return from it. - - lis r8,hi16(EXT(ml_probe_read_64)) ; High of start - lis r9,hi16(EXT(ml_probe_read_mck_64)) ; High of end - ori r8,r8,lo16(EXT(ml_probe_read_64)) ; Low of start - ori r9,r9,lo16(EXT(ml_probe_read_mck_64)) ; Low of end - cmpld r23,r8 ; Too soon? - cmpld cr1,r23,r9 ; Too late? - - cror cr0_lt,cr0_lt,cr1_gt ; Too soon or too late? - ld r3,saver12(r13) ; Get the original MSR - ld r5,savelr(r13) ; Get the return address - li r4,0 ; Get fail code - blt-- ueMck ; This is a normal machine check, just pass up... - std r5,savesrr0(r13) ; Set the return MSR - - std r3,savesrr1(r13) ; Set the return address - std r4,saver3(r13) ; Set failure return code - b ceMck ; All recovered... - -mckDTW: lwz r21,hwMckDTW(r2) ; Get deferred tablewalk uncorrectable error count - addi r21,r21,1 ; Count it - stw r21,hwMckDTW(r2) ; Stuff it - b ueMck ; Go south, young man... - -mckL1D: lwz r21,hwMckL1DPE(r2) ; Get data cache parity error count - addi r21,r21,1 ; Count it - stw r21,hwMckL1DPE(r2) ; Stuff it - b ceMck ; All recovered... 
- -mckL1T: lwz r21,hwMckL1TPE(r2) ; Get TLB parity error count - addi r21,r21,1 ; Count it - stw r21,hwMckL1TPE(r2) ; Stuff it - -ceMck: lwz r21,mckFlags(0) ; Get the flags - li r0,1 ; Set the recovered flag before passing up - rlwinm. r21,r21,0,31,31 ; Check if we want to log recoverables - stw r0,savemisc3(r13) ; Set it - beq++ EatRupt ; No log of recoverables wanted... - b PassUpTrap ; Go up and log error... - -ueMck: li r0,0 ; Set the unrecovered flag before passing up - stw r0,savemisc3(r13) ; Set it - b PassUpTrap ; Go up and log error and probably panic - -; -; We come here to handle program exceptions -; -; When the program check is a trap instruction and it happens when -; we are executing injected code, we need to check if it is an exit trap. -; If it is, we need to populate the current savearea with some of the context from -; the saved pre-inject savearea. This is needed because the current savearea will be -; tossed as part of the pass up code. Additionally, because we will not be nullifying -; the emulated instruction as we do with any other exception. -; - - .align 5 - -ProgramChk: lwz r5,savesrr1+4(r13) ; Get the interrupt SRR1 - lwz r3,ijsave(r2) ; Get the inject savearea top - lwz r4,ijsave+4(r2) ; And get the bottom of the inject savearea pointer - rlwimi r5,r5,15,31,31 ; Scoot trap flag down to a spare bit - rlwinm r3,r3,0,1,0 ; Copy low 32 bits of to top 32 - li r0,0x0023 ; Get bits that match scooted trap flag, IR, and RI - and r0,r5,r0 ; Clear any extra SRR1 bits - rlwimi. r3,r4,0,0,31 ; Insert low part of 64-bit address in bottom 32 bits and see if ijsave is 0 - cmplwi cr1,r0,1 ; Make sure we were IR off, RI off, and got a trap exception - crandc cr0_eq,cr1_eq,cr0_eq ; If we are injecting, ijsave will be non-zero and we had the trap bit set - mfsrr0 r4 ; Get the PC - bne++ cr0,mustem ; This is not an injection exit... 
- - lwz r4,0(r4) ; Get the trap instruction - lis r5,hi16(ijtrap) ; Get high half of inject exit trap - ori r5,r5,lo16(ijtrap) ; And the low half - cmplw r4,r5 ; Correct trap instruction? - bne mustem ; No, not inject exit... - - lwz r4,savesrr0(r3) ; Get the original SRR0 - lwz r5,savesrr0+4(r3) ; And the rest of it - lwz r6,savesrr1(r3) ; Get the original SRR1 - stw r4,savesrr0(r13) ; Set the new SRR0 to the original - lwz r4,savesrr1+4(r13) ; Get the bottom of the new SRR1 - lwz r7,savesrr1+4(r3) ; Get the bottom of the original SRR1 - li r11,T_INJECT_EXIT ; Set an inject exit exception - stw r5,savesrr0+4(r13) ; Set the new bottom of SRR0 to the original - rlwimi r7,r4,0,MSR_FP_BIT,MSR_FP_BIT ; Make sure we retain the current floating point enable bit - stw r6,savesrr1(r13) ; Save the top half of the original SRR1 - sth r7,savesrr1+6(r13) ; And the last bottom - stw r11,saveexception(r13) ; Set the new the exception code - b PassUpTrap ; Go pass it on up... - -mustem: b EXT(Emulate) ; Go try to emulate this one... - - -/* - * Here's where we come back from some instruction emulator. If we come back with - * T_IN_VAIN, the emulation is done and we should just reload state and directly - * go back to the interrupted code. Otherwise, we'll check to see if - * we need to redrive with a different interrupt, i.e., DSI. - * Note that this we are actually not redriving the rupt, rather changing it - * into a different one. Thus we clear the redrive bit. - */ - - .align 5 - .globl EXT(EmulExit) - -LEXT(EmulExit) - - cmplwi cr1,r11,T_IN_VAIN ; Was it emulated? - lis r1,hi16(SAVredrive) ; Get redrive request - beq++ cr1,EatRupt ; Yeah, just blast back to the user... - lwz r4,SAVflags(r13) ; Pick up the flags - - and. r0,r4,r1 ; Check if redrive requested - - beq++ PassUpTrap ; No redrive, just keep on going... - - b Redrive ; Redrive the exception... - -; -; Jump into main handler code switching on VM at the same time. 
-; -; We assume kernel data is mapped contiguously in physical -; memory, otherwise we would need to switch on (at least) virtual data. -; SRs are already set up. -; - - .align 5 - -PassUpTrap: lis r20,hi16(EXT(thandler)) ; Get thandler address - ori r20,r20,lo16(EXT(thandler)) ; Get thandler address - b PassUp ; Go pass it up... - -PassUpRupt: lis r20,hi16(EXT(ihandler)) ; Get ihandler address - ori r20,r20,lo16(EXT(ihandler)) ; Get ihandler address - b PassUp ; Go pass it up... - - .align 5 - -PassUpFPU: lis r20,hi16(EXT(fpu_switch)) ; Get FPU switcher address - ori r20,r20,lo16(EXT(fpu_switch)) ; Get FPU switcher address - b PassUp ; Go pass it up... - - .align 5 - -PassUpVMX: lis r20,hi16(EXT(vec_switch)) ; Get VMX switcher address - ori r20,r20,lo16(EXT(vec_switch)) ; Get VMX switcher address - bt++ featAltivec,PassUp ; We have VMX on this CPU... - li r11,T_PROGRAM ; Say that it is a program exception - li r20,8 ; Set invalid instruction - stw r11,saveexception(r13) ; Set the new the exception code - sth r20,savesrr1+4(r13) ; Set the invalid instruction SRR code - - b PassUpTrap ; Go pass it up... - - .align 5 - -PassUpAbend: - lis r20,hi16(EXT(chandler)) ; Get choke handler address - ori r20,r20,lo16(EXT(chandler)) ; Get choke handler address - b PassUp ; Go pass it up... - - .align 5 - -PassUp: - mfsprg r29,0 ; Get the per_proc block back - - cmplwi cr1,r11,T_INJECT_EXIT ; Are we exiting from an injection? - lwz r3,ijsave(r29) ; Get the inject savearea top - lwz r4,ijsave+4(r29) ; And get the bottom of the inject savearea pointer - rlwinm r3,r3,0,1,0 ; Copy low 32 bits to top 32 - rlwimi. r3,r4,0,0,31 ; Insert low part of 64-bit address in bottom 32 bits and see if ijsave is 0 - beq++ notaninjct ; Skip tossing savearea if no injection... - - beq-- cr1,nonullify ; Have not finished the instruction, go nullify it... 
- - lwz r4,savesrr1+4(r3) ; Get the interrupt modifiers from the original SRR1 - lwz r5,savesrr1+4(r13) ; Get the interrupt modifiers from the new SRR1 - lwz r6,savedar(r13) ; Get the top of the DAR - rlwimi r4,r5,0,0,15 ; copy the new top to the original SRR1 - lwz r7,savedar+4(r13) ; Get the bottom of the DAR - rlwimi r4,r5,0,MSR_FP_BIT,MSR_FP_BIT ; Copy the new FP enable bit into the old SRR1 - stw r4,savesrr1+4(r3) ; Save the updated SRR1 - lwz r5,savedsisr(r13) ; Grab the new DSISR - - mr r4,r13 ; Save the new savearea pointer - mr r13,r3 ; Point to the old savearea we are keeping - stw r6,savedar(r13) ; Save top of new DAR - stw r7,savedar+4(r13) ; Save bottom of new DAR - stw r5,savedsisr(r13) ; Set the new DSISR - stw r11,saveexception(r13) ; Set the new exception code - mr r3,r4 ; Point to the new savearea in order to toss it - -nonullify: li r0,0 ; Get a zero - stw r0,ijsave(r29) ; Clear the pointer to the saved savearea - stw r0,ijsave+4(r29) ; Clear the pointer to the saved savearea - - bl EXT(save_ret_phys) ; Dump that pesky extra savearea - -notaninjct: lwz r10,SAVflags(r13) ; Pick up the flags - - li r0,0xFFF ; Get a page mask - li r2,MASK(MSR_BE)|MASK(MSR_SE) ; Get the mask to save trace bits - andc r5,r13,r0 ; Back off to the start of savearea block - mfmsr r3 ; Get our MSR - rlwinm r10,r10,0,SAVredriveb+1,SAVredriveb-1 ; Clear the redrive before we pass it up - li r21,MSR_SUPERVISOR_INT_OFF ; Get our normal MSR value - and r3,r3,r2 ; Clear all but trace - lwz r5,SACvrswap+4(r5) ; Get real to virtual conversion - or r21,r21,r3 ; Keep the trace bits if they are on - stw r10,SAVflags(r13) ; Set the flags with the cleared redrive flag - - xor r4,r13,r5 ; Pass up the virtual address of context savearea - rlwinm r4,r4,0,0,31 ; Clean top half of virtual savearea if 64-bit - - mr r3,r21 ; Pass in the MSR we will go to - bl EXT(switchSegs) ; Go handle the segment registers/STB - - lwz r3,saveexception(r13) ; Recall the exception code - - mtsrr0 r20 ; Set 
up the handler address - mtsrr1 r21 ; Set up our normal MSR value - - bt++ pf64Bitb,puLaunch ; Handle 64-bit machine... - - rfi ; Launch the exception handler - -puLaunch: rfid ; Launch the exception handler - -/* - * This routine is the main place where we return from an interruption. - * - * This is also where we release the quickfret list. These are saveareas - * that were released as part of the exception exit path in hw_exceptions. - * In order to save an atomic operation (which actually will not work - * properly on a 64-bit machine) we use holdQFret to indicate that the list - * is in flux and should not be looked at here. This comes into play only - * when we take a PTE miss when we are queuing a savearea onto qfret. - * Quite rare but could happen. If the flag is set, this code does not - * release the list and waits until next time. - * - * All we need to remember here is that R13 must point to the savearea - * that has the context we need to load up. Translation and interruptions - * must be disabled. - * - * This code always loads the context in the savearea pointed to - * by R13. In the process, it throws away the savearea. If there - * is any tomfoolery with savearea stacks, it must be taken care of - * before we get here. - * - */ - - .align 5 - -EatRupt: mfsprg r29,0 ; Get the per_proc block back - mr r31,r13 ; Move the savearea pointer to the far end of the register set - mfsprg r27,2 ; Get the processor features - - lwz r3,holdQFret(r29) ; Get the release hold off flag - - bt++ pf64Bitb,eat64a ; Skip down to the 64-bit version of this - -; -; This starts the 32-bit version -; - - mr. r3,r3 ; Should we hold off the quick release? - lwz r30,quickfret+4(r29) ; Pick up the quick fret list, if any - la r21,saver0(r31) ; Point to the first thing we restore - bne- ernoqfret ; Hold off set, do not release just now... - -erchkfret: mr. r3,r30 ; Any savearea to quickly release? - beq+ ernoqfret ; No quickfrets... 
- lwz r30,SAVprev+4(r30) ; Chain back now - - bl EXT(save_ret_phys) ; Put it on the free list - stw r30,quickfret+4(r29) ; Dequeue previous guy (really, it is ok to wait until after the release) - b erchkfret ; Try the next one... - - .align 5 - -ernoqfret: - lwz r30,SAVflags(r31) ; Pick up the flags - lis r0,hi16(SAVinject) ; Get inject flag - dcbt 0,r21 ; Touch in the first thing we need - -; -; Here we release the savearea. -; -; Important!!!! The savearea is released before we are done with it. When the -; local free savearea list (anchored at lclfree) gets too long, save_ret_phys -; will trim the list, making the extra saveareas allocatable by another processor -; The code in there must ALWAYS leave our savearea on the local list, otherwise -; we could be very, very unhappy. The code there always queues the "just released" -; savearea to the head of the local list. Then, if it needs to trim, it will -; start with the SECOND savearea, leaving ours intact. -; -; If we are going to inject code here, we must not toss the savearea because -; we will continue to use it. The code stream to inject is in it and we -; use it to hold the pre-inject context so that we can merge that with the -; post-inject context. The field ijsave in the per-proc is used to point to the savearea. -; -; Note that we will NEVER pass an interrupt up without first dealing with this savearea. -; -; All permanent interruptions (i.e., not denorm, alignment, or handled page and segment faults) -; will nullify any injected code and pass the interrupt up in the original savearea. A normal -; inject completion will merge the original context into the new savearea and pass that up. -; -; Note that the following code which sets up the injection will only be executed when -; SAVinject is set. That means that if will not run if we are returning from an alignment -; or denorm exception, or from a handled page or segment fault. 
-; - - andc r0,r30,r0 ; Clear the inject flag - cmplw cr4,r0,r30 ; Remember if we need to inject - mr r3,r31 ; Get the exiting savearea in parm register - beq+ cr4,noinject ; No, we are not going to inject instructions... - - stw r0,SAVflags(r31) ; Yes we are, clear the request... - - lhz r26,PP_CPU_NUMBER(r29) ; Get the cpu number - lwz r25,saveinstr(r31) ; Get the instruction count - la r3,saveinstr+4(r31) ; Point to the instruction stream - slwi r26,r26,6 ; Get offset to the inject code stream for this processor - li r5,0 ; Get the current instruction offset - ori r26,r26,lo16(EXT(ijcode)) ; Get the base of the inject buffer for this processor (always < 64K) - slwi r25,r25,2 ; Multiply by 4 - -injctit: lwzx r6,r5,r3 ; Pick up the instruction - stwx r6,r5,r26 ; Inject into code buffer - addi r5,r5,4 ; Bump offset - cmplw r5,r25 ; Have we hit the end? - blt- injctit ; Continue until we have copied all... - - lis r3,0x0FFF ; Build our magic trap - ori r3,r3,0xC9C9 ; Build our magic trap - stw r31,ijsave+4(r29) ; Save the original savearea for injection - stwx r3,r5,r26 ; Save the magic trap - - li r3,32 ; Get cache line size - dcbf 0,r26 ; Flush first line - dcbf r3,r26 ; And the second - sync ; Hang on until it's done - - icbi 0,r26 ; Flush instructions in the first line - icbi r3,r26 ; And the second - isync ; Throw anything stale away - sync ; Hang on until it's done - b injected ; Skip the savearea release... - -noinject: bl EXT(save_ret_phys) ; Put old savearea on the free list - -injected: lwz r3,savesrr1+4(r31) ; Pass in the MSR we are going to - bl EXT(switchSegs) ; Go handle the segment registers/STB - - li r3,savesrr1+4 ; Get offset to the srr1 value - lwarx r8,r3,r31 ; Get destination MSR and take reservation along the way (just so we can blow it away) - cmplw cr3,r14,r14 ; Set that we do not need to stop streams - - li r21,emfp0 ; Point to the fp savearea - stwcx. 
r8,r3,r31 ; Blow away any reservations we hold - - lwz r25,savesrr0+4(r31) ; Get the SRR0 to use - - la r28,saver4(r31) ; Point to the 32-byte line with r4-r7 - dcbz r21,r29 ; Clear a work area - lwz r0,saver0+4(r31) ; Restore R0 - dcbt 0,r28 ; Touch in r4-r7 - lwz r1,saver1+4(r31) ; Restore R1 - - beq+ cr4,noinject2 ; No code injection here... - -; -; If we are injecting, we need to stay in supervisor state with instruction -; address translation off. We also need to have as few potential interruptions as -; possible. Therefore, we turn off external interruptions and tracing (which doesn't -; make much sense anyway). -; - ori r8,r8,lo16(ijemoff) ; Force the need-to-be-off bits on - mr r25,r26 ; Get the injected code address - xori r8,r8,lo16(ijemoff) ; Turn off all of the need-to-be-off bits - -noinject2: lwz r2,saver2+4(r31) ; Restore R2 - la r28,saver8(r31) ; Point to the 32-byte line with r8-r11 - lwz r3,saver3+4(r31) ; Restore R3 - andis. r6,r27,hi16(pfAltivec) ; Do we have altivec on the machine? 
- dcbt 0,r28 ; touch in r8-r11 - lwz r4,saver4+4(r31) ; Restore R4 - la r28,saver12(r31) ; Point to the 32-byte line with r12-r15 - mtsrr0 r25 ; Restore the SRR0 now - lwz r5,saver5+4(r31) ; Restore R5 - mtsrr1 r8 ; Restore the SRR1 now - lwz r6,saver6+4(r31) ; Restore R6 - - dcbt 0,r28 ; touch in r12-r15 - la r28,saver16(r31) - - lwz r7,saver7+4(r31) ; Restore R7 - lwz r8,saver8+4(r31) ; Restore R8 - lwz r9,saver9+4(r31) ; Restore R9 - - dcbt 0,r28 ; touch in r16-r19 - la r28,saver20(r31) - - lwz r10,saver10+4(r31) ; Restore R10 - lwz r11,saver11+4(r31) ; Restore R11 - - dcbt 0,r28 ; touch in r20-r23 - la r28,savevscr(r31) ; Point to the status area - - lwz r12,saver12+4(r31) ; Restore R12 - lwz r13,saver13+4(r31) ; Restore R13 - - la r14,savectr+4(r31) - dcbt 0,r28 ; Touch in VSCR and FPSCR - dcbt 0,r14 ; touch in CTR, DAR, DSISR, VRSAVE, and Exception code - - lwz r26,next_savearea+4(r29) ; Get the exception save area - la r28,saver24(r31) - - lwz r14,saver14+4(r31) ; Restore R14 - lwz r15,saver15+4(r31) ; Restore R15 - - - stfd f0,emfp0(r29) ; Save FP0 - lwz r27,savevrsave(r31) ; Get the vrsave - dcbt 0,r28 ; touch in r24-r27 - la r28,savevscr(r31) ; Point to the status area - lfd f0,savefpscrpad(r31) ; Get the fpscr - la r22,saver28(r31) - mtfsf 0xFF,f0 ; Restore fpscr - lfd f0,emfp0(r29) ; Restore the used register - - beq noavec3 ; No Altivec on this CPU... 
- - stvxl v0,r21,r29 ; Save a vector register - lvxl v0,0,r28 ; Get the vector status - mtspr vrsave,r27 ; Set the vrsave - mtvscr v0 ; Set the vector status - lvxl v0,r21,r29 ; Restore work vector register - -noavec3: dcbt 0,r22 ; touch in r28-r31 - - lwz r23,spcFlags(r29) ; Get the special flags from per_proc - la r17,savesrr0(r31) - la r26,saver0(r26) ; Point to the first part of the next savearea - dcbt 0,r17 ; touch in SRR0, SRR1, CR, XER, LR - lhz r28,pfrptdProc(r29) ; Get the reported processor type - - lwz r16,saver16+4(r31) ; Restore R16 - lwz r17,saver17+4(r31) ; Restore R17 - lwz r18,saver18+4(r31) ; Restore R18 - lwz r19,saver19+4(r31) ; Restore R19 - lwz r20,saver20+4(r31) ; Restore R20 - lwz r21,saver21+4(r31) ; Restore R21 - lwz r22,saver22+4(r31) ; Restore R22 - - cmpwi cr1,r28,CPU_SUBTYPE_POWERPC_750 ; G3? - - dcbz 0,r26 ; Clear and allocate next savearea we use in the off chance it is still in when we next interrupt - - andis. r23,r23,hi16(perfMonitor) ; Is the performance monitor enabled? - lwz r23,saver23+4(r31) ; Restore R23 - cmpwi cr2,r28,CPU_SUBTYPE_POWERPC_7400 ; Yer standard G4? - lwz r24,saver24+4(r31) ; Restore R24 - lwz r25,saver25+4(r31) ; Restore R25 - lwz r26,saver26+4(r31) ; Restore R26 - lwz r27,saver27+4(r31) ; Restore R27 - - beq+ noPerfMonRestore32 ; No perf monitor... - - beq- cr1,perfMonRestore32_750 ; This is a G3... - beq- cr2,perfMonRestore32_7400 ; Standard G4... 
- - lwz r28,savepmc+16(r31) - lwz r29,savepmc+20(r31) - mtspr pmc5,r28 ; Restore PMC5 - mtspr pmc6,r29 ; Restore PMC6 - -perfMonRestore32_7400: - lwz r28,savemmcr2+4(r31) - mtspr mmcr2,r28 ; Restore MMCR2 - -perfMonRestore32_750: - lwz r28,savepmc+0(r31) - lwz r29,savepmc+4(r31) - mtspr pmc1,r28 ; Restore PMC1 - mtspr pmc2,r29 ; Restore PMC2 - lwz r28,savepmc+8(r31) - lwz r29,savepmc+12(r31) - mtspr pmc3,r28 ; Restore PMC3 - mtspr pmc4,r29 ; Restore PMC4 - lwz r28,savemmcr1+4(r31) - lwz r29,savemmcr0+4(r31) - mtspr mmcr1,r28 ; Restore MMCR1 - mtspr mmcr0,r29 ; Restore MMCR0 - -noPerfMonRestore32: - lwz r28,savecr(r31) ; Get CR to restore - lwz r29,savexer+4(r31) ; Get XER to restore - mtcr r28 ; Restore the CR - lwz r28,savelr+4(r31) ; Get LR to restore - mtxer r29 ; Restore the XER - lwz r29,savectr+4(r31) ; Get the CTR to restore - mtlr r28 ; Restore the LR - lwz r28,saver30+4(r31) ; Get R30 - mtctr r29 ; Restore the CTR - lwz r29,saver31+4(r31) ; Get R31 - mtsprg 2,r28 ; Save R30 for later - lwz r28,saver28+4(r31) ; Restore R28 - mtsprg 3,r29 ; Save R31 for later - lwz r29,saver29+4(r31) ; Restore R29 - - mfsprg r31,0 ; Get per_proc - mfsprg r30,2 ; Restore R30 - lwz r31,pfAvailable(r31) ; Get the feature flags - mtsprg 2,r31 ; Set the feature flags - mfsprg r31,3 ; Restore R31 - - rfi ; Click heels three times and think very hard that there is no place like home... - - .long 0 ; Leave this here - .long 0 - .long 0 - .long 0 - .long 0 - .long 0 - .long 0 - .long 0 - - -; -; This starts the 64-bit version -; - - .align 7 - -eat64a: ld r30,quickfret(r29) ; Pick up the quick fret list, if any - - mr. r3,r3 ; Should we hold off the quick release? - la r21,saver0(r31) ; Point to the first thing we restore - bne-- ernoqfre64 ; Hold off set, do not release just now... - -erchkfre64: mr. r3,r30 ; Any savearea to quickly release? - beq+ ernoqfre64 ; No quickfrets... 
- ld r30,SAVprev(r30) ; Chain back now - - bl EXT(save_ret_phys) ; Put it on the free list - - std r30,quickfret(r29) ; Dequeue previous guy (really, it is ok to wait until after the release) - b erchkfre64 ; Try the next one... - - .align 7 - -ernoqfre64: lwz r30,SAVflags(r31) ; Pick up the flags - lis r0,hi16(SAVinject) ; Get inject flag - dcbt 0,r21 ; Touch in the first thing we need - -; -; Here we release the savearea. -; -; Important!!!! The savearea is released before we are done with it. When the -; local free savearea list (anchored at lclfree) gets too long, save_ret_phys -; will trim the list, making the extra saveareas allocatable by another processor -; The code in there must ALWAYS leave our savearea on the local list, otherwise -; we could be very, very unhappy. The code there always queues the "just released" -; savearea to the head of the local list. Then, if it needs to trim, it will -; start with the SECOND savearea, leaving ours intact. -; -; If we are going to inject code here, we must not toss the savearea because -; we will continue to use it. The code stream to inject is in it and we -; use it to hold the pre-inject context so that we can merge that with the -; post-inject context. The field ijsave in the per-proc is used to point to the savearea. -; -; Note that we will NEVER pass an interrupt up without first dealing with this savearea. -; -; All permanent interruptions (i.e., not denorm, alignment, or handled page and segment faults) -; will nullify any injected code and pass the interrupt up in the original savearea. A normal -; inject completion will merge the original context into the new savearea and pass that up. -; -; Note that the following code which sets up the injection will only be executed when -; SAVinject is set. That means that if will not run if we are returning from an alignment -; or denorm exception, or from a handled page or segment fault. 
-; - - - li r3,lgKillResv ; Get spot to kill reservation - andc r0,r30,r0 ; Clear the inject flag - stdcx. r3,0,r3 ; Blow away any reservations we hold - cmplw cr4,r0,r30 ; Remember if we need to inject - mr r3,r31 ; Get the exiting savearea in parm register - beq++ cr4,noinject3 ; No, we are not going to inject instructions... - - stw r0,SAVflags(r31) ; Yes we are, clear the request... - - lhz r26,PP_CPU_NUMBER(r29) ; Get the cpu number - lwz r25,saveinstr(r31) ; Get the instruction count - la r3,saveinstr+4(r31) ; Point to the instruction stream - slwi r26,r26,6 ; Get offset to the inject code stream for this processor - li r5,0 ; Get the current instruction offset - ori r26,r26,lo16(EXT(ijcode)) ; Get the base of the inject buffer for this processor (always < 64K) - slwi r25,r25,2 ; Multiply by 4 - -injctit2: lwzx r6,r5,r3 ; Pick up the instruction - stwx r6,r5,r26 ; Inject into code buffer - addi r5,r5,4 ; Bump offset - cmplw r5,r25 ; Have we hit the end? - blt-- injctit2 ; Continue until we have copied all... - - lis r3,0x0FFF ; Build our magic trap - ori r3,r3,0xC9C9 ; Build our magic trap - std r31,ijsave(r29) ; Save the original savearea for injection - stwx r3,r5,r26 ; Save the magic trap - - dcbf 0,r26 ; Flush the line - sync ; Hang on until it's done - - icbi 0,r26 ; Flush instructions in the line - isync ; Throw anything stale away - sync ; Hang on until it's done - b injected2 ; Skip the savearea release... 
- -noinject3: bl EXT(save_ret_phys) ; Put it on the free list - -injected2: lwz r3,savesrr1+4(r31) ; Pass in the MSR we will be going to - bl EXT(switchSegs) ; Go handle the segment registers/STB - - ld r8,savesrr1(r31) ; Get destination MSR - cmplw cr3,r14,r14 ; Set that we do not need to stop streams - li r21,emfp0 ; Point to a workarea - - ld r25,savesrr0(r31) ; Get the SRR0 to use - la r28,saver16(r31) ; Point to the 128-byte line with r16-r31 - dcbz128 r21,r29 ; Clear a work area - ld r0,saver0(r31) ; Restore R0 - dcbt 0,r28 ; Touch in r16-r31 - ld r1,saver1(r31) ; Restore R1 - - beq++ cr4,noinject4 ; No code injection here... - -; -; If we are injecting, we need to stay in supervisor state with instruction -; address translation off. We also need to have as few potential interruptions as -; possible. Therefore, we turn off external interruptions and tracing (which doesn't -; make much sense anyway). -; - ori r8,r8,lo16(ijemoff) ; Force the need-to-be-off bits on - mr r25,r26 ; Point pc to injection code buffer - xori r8,r8,lo16(ijemoff) ; Turn off all of the need-to-be-off bits - -noinject4: ld r2,saver2(r31) ; Restore R2 - ld r3,saver3(r31) ; Restore R3 - mtcrf 0x80,r27 ; Get facility availability flags (do not touch CR1-7) - ld r4,saver4(r31) ; Restore R4 - mtsrr0 r25 ; Restore the SRR0 now - ld r5,saver5(r31) ; Restore R5 - mtsrr1 r8 ; Restore the SRR1 now - ld r6,saver6(r31) ; Restore R6 - - ld r7,saver7(r31) ; Restore R7 - ld r8,saver8(r31) ; Restore R8 - ld r9,saver9(r31) ; Restore R9 - - la r28,savevscr(r31) ; Point to the status area - - ld r10,saver10(r31) ; Restore R10 - ld r11,saver11(r31) ; Restore R11 - ld r12,saver12(r31) ; Restore R12 - ld r13,saver13(r31) ; Restore R13 - - ld r26,next_savearea(r29) ; Get the exception save area - - ld r14,saver14(r31) ; Restore R14 - ld r15,saver15(r31) ; Restore R15 - lwz r27,savevrsave(r31) ; Get the vrsave - - bf-- pfAltivecb,noavec2s ; Skip if no VMX... 
- - stvxl v0,r21,r29 ; Save a vector register - lvxl v0,0,r28 ; Get the vector status - mtvscr v0 ; Set the vector status - - lvxl v0,r21,r29 ; Restore work vector register - -noavec2s: mtspr vrsave,r27 ; Set the vrsave - - lwz r28,saveexception(r31) ; Get exception type - stfd f0,emfp0(r29) ; Save FP0 - lfd f0,savefpscrpad(r31) ; Get the fpscr - mtfsf 0xFF,f0 ; Restore fpscr - lfd f0,emfp0(r29) ; Restore the used register - ld r16,saver16(r31) ; Restore R16 - lwz r30,spcFlags(r29) ; Get the special flags from per_proc - ld r17,saver17(r31) ; Restore R17 - ld r18,saver18(r31) ; Restore R18 - cmplwi cr1,r28,T_RESET ; Are we returning from a reset? - ld r19,saver19(r31) ; Restore R19 - ld r20,saver20(r31) ; Restore R20 - li r27,0 ; Get a zero - ld r21,saver21(r31) ; Restore R21 - la r26,saver0(r26) ; Point to the first part of the next savearea - andis. r30,r30,hi16(perfMonitor) ; Is the performance monitor enabled? - ld r22,saver22(r31) ; Restore R22 - ld r23,saver23(r31) ; Restore R23 - bne++ cr1,er64rrst ; We are not returning from a reset... - stw r27,lo16(EXT(ResetHandler)-EXT(ExceptionVectorsStart)+RESETHANDLER_TYPE)(br0) ; Allow resets again - -er64rrst: ld r24,saver24(r31) ; Restore R24 - - dcbz128 0,r26 ; Clear and allocate next savearea we use in the off chance it is still in when we next interrupt - - ld r25,saver25(r31) ; Restore R25 - ld r26,saver26(r31) ; Restore R26 - ld r27,saver27(r31) ; Restore R27 - - beq++ noPerfMonRestore64 ; Nope... 
- - lwz r28,savepmc+0(r31) - lwz r29,savepmc+4(r31) - mtspr pmc1_gp,r28 ; Restore PMC1 - mtspr pmc2_gp,r29 ; Restore PMC2 - lwz r28,savepmc+8(r31) - lwz r29,savepmc+12(r31) - mtspr pmc3_gp,r28 ; Restore PMC3 - mtspr pmc4_gp,r29 ; Restore PMC4 - lwz r28,savepmc+16(r31) - lwz r29,savepmc+20(r31) - mtspr pmc5_gp,r28 ; Restore PMC5 - mtspr pmc6_gp,r29 ; Restore PMC6 - lwz r28,savepmc+24(r31) - lwz r29,savepmc+28(r31) - mtspr pmc7_gp,r28 ; Restore PMC7 - mtspr pmc8_gp,r29 ; Restore PMC8 - ld r28,savemmcr1(r31) - ld r29,savemmcr2(r31) - mtspr mmcr1_gp,r28 ; Restore MMCR1 - mtspr mmcra_gp,r29 ; Restore MMCRA - ld r28,savemmcr0(r31) - - mtspr mmcr0_gp,r28 ; Restore MMCR0 - -noPerfMonRestore64: - mfsprg r30,0 ; Get per_proc - lwz r28,savecr(r31) ; Get CR to restore - ld r29,savexer(r31) ; Get XER to restore - mtcr r28 ; Restore the CR - ld r28,savelr(r31) ; Get LR to restore - mtxer r29 ; Restore the XER - ld r29,savectr(r31) ; Get the CTR to restore - mtlr r28 ; Restore the LR - ld r28,saver30(r31) ; Get R30 - mtctr r29 ; Restore the CTR - ld r29,saver31(r31) ; Get R31 - mtspr hsprg0,r28 ; Save R30 for later - ld r28,saver28(r31) ; Restore R28 - mtsprg 3,r29 ; Save R31 for later - ld r29,saver29(r31) ; Restore R29 - - lwz r31,pfAvailable(r30) ; Get the feature flags - ld r30,UAW(r30) ; Get the User Assist DoubleWord - mtsprg 2,r31 ; Set the feature flags - mfsprg r31,3 ; Restore R31 - mtsprg 3,r30 ; Set the UAW - mfspr r30,hsprg0 ; Restore R30 - - rfid ; Click heels three times and think very hard that there is no place like home... 
- - - -/* - * exception_exit(savearea *) - * - * - * ENTRY : IR and/or DR and/or interruptions can be on - * R3 points to the virtual address of a savearea - * - * Loads an MSR that has only VEC, FP and ME set (plus SF on 64-bit - * processors; processors flagged pfNoMSRir go through the loadMSR - * system call instead of mtmsr), converts the savearea address to - * physical by XORing it with SACvrswap taken from the savearea block - * base, and then tests SAVredrive in SAVflags. - * Without a redrive request it branches to EatRupt. Otherwise it - * stores a freshly read timebase in SAVtime, reloads the exception - * code into R11 and branches to Redrive. - */ - - .align 5 - .globl EXT(exception_exit) - -LEXT(exception_exit) - - mfsprg r29,2 ; Get feature flags - mr r31,r3 ; Get the savearea in the right register - mtcrf 0x04,r29 ; Set the features - li r0,1 ; Get this just in case - mtcrf 0x02,r29 ; Set the features - lis r30,hi16(MASK(MSR_VEC)|MASK(MSR_FP)|MASK(MSR_ME)) ; Set up the MSR we will use throughout. Note that ME come on here if MCK - rlwinm r4,r3,0,0,19 ; Round down to savearea block base - lis r1,hi16(SAVredrive) ; Get redrive request - mfsprg r2,0 ; Get the per_proc block - ori r30,r30,lo16(MASK(MSR_VEC)|MASK(MSR_FP)|MASK(MSR_ME)) ; Rest of MSR - bt++ pf64Bitb,eeSixtyFour ; We are 64-bit... - - lwz r4,SACvrswap+4(r4) ; Get the virtual to real translation - - bt pfNoMSRirb,eeNoMSR ; No MSR... - - mtmsr r30 ; Translation and all off - isync ; Toss prefetch - b eeNoMSRx - - .align 5 - -eeSixtyFour: - ld r4,SACvrswap(r4) ; Get the virtual to real translation - rldimi r30,r0,63,MSR_SF_BIT ; Set SF bit (bit 0) - mtmsrd r30 ; Set 64-bit mode, turn off EE, DR, and IR - isync ; Toss prefetch - b eeNoMSRx - - .align 5 - -eeNoMSR: li r0,loadMSR ; Get the MSR setter SC - mr r3,r30 ; Get new MSR - sc ; Set it - -eeNoMSRx: xor r31,r31,r4 ; Convert the savearea to physical addressing - lwz r4,SAVflags(r31) ; Pick up the flags - mr r13,r31 ; Put savearea here also - - and. r0,r4,r1 ; Check if redrive requested - - dcbt br0,r2 ; We will need this in just a sec - - beq+ EatRupt ; No redrive, just exit... - -0: mftbu r2 ; Avoid using an obsolete timestamp for the redrive - mftb r4 - mftbu r0 - cmplw r0,r2 - bne-- 0b - - stw r2,SAVtime(r13) - stw r4,SAVtime+4(r13) - - lwz r11,saveexception(r13) ; Restore exception code - b Redrive ; Redrive the exception... 
- - - - .align 12 ; Force page alignment - - .globl EXT(ExceptionVectorsEnd) -EXT(ExceptionVectorsEnd): /* Used if relocating the exception vectors */ - - - - -; -; Here is where we keep the low memory globals -; - - . = 0x5000 - - .ascii "Hagfish " ; 5000 Unique eyecatcher - .long 0 ; 5008 Zero - .long 0 ; 500C Zero cont... - .long EXT(PerProcTable) ; 5010 pointer to per_proc_entry table - .long 0 ; 5014 Zero - - .globl EXT(mckFlags) -EXT(mckFlags): - .long 0 ; 5018 Machine check flags - - .long EXT(version) ; 501C Pointer to kernel version string - .long 0 ; 5020 physical memory window virtual address - .long 0 ; 5024 physical memory window virtual address - .long 0 ; 5028 user memory window virtual address - .long 0 ; 502C user memory window virtual address - .long 0 ; 5030 VMM boot-args forced feature flags - - .globl EXT(maxDec) -EXT(maxDec): - .long 0x7FFFFFFF ; 5034 maximum decrementer value - - - .globl EXT(pmsCtlp) -EXT(pmsCtlp): - .long 0 ; 5038 Pointer to power management stepper control - - .long 0 ; 503C reserved - .long 0 ; 5040 reserved - .long 0 ; 5044 reserved - .long 0 ; 5048 reserved - .long 0 ; 504C reserved - .long 0 ; 5050 reserved - .long 0 ; 5054 reserved - .long 0 ; 5058 reserved - .long 0 ; 505C reserved - .long 0 ; 5060 reserved - .long 0 ; 5064 reserved - .long 0 ; 5068 reserved - .long 0 ; 506C reserved - .long 0 ; 5070 reserved - .long 0 ; 5074 reserved - .long 0 ; 5078 reserved - .long 0 ; 507C reserved - - .globl EXT(trcWork) -EXT(trcWork): - .long 0 ; 5080 The next trace entry to use -#if DEBUG - .long 0xFFFFFFFF ; 5084 All enabled -#else - .long 0x00000000 ; 5084 All disabled on non-debug systems -#endif - .long 0 ; 5088 Start of the trace table - .long 0 ; 508C End (wrap point) of the trace - .long 0 ; 5090 Saved mask while in debugger - .long 0 ; 5094 Size of trace table (1 - 256 pages) - .long 0 ; 5098 traceGas[0] - .long 0 ; 509C traceGas[1] - - .long 0 ; 50A0 reserved - .long 0 ; 50A4 reserved - .long 0 ; 50A8 reserved - .long 
0 ; 50AC reserved - .long 0 ; 50B0 reserved - .long 0 ; 50B4 reserved - .long 0 ; 50B8 reserved - .long 0 ; 50BC reserved - .long 0 ; 50C0 reserved - .long 0 ; 50C4 reserved - .long 0 ; 50C8 reserved - .long 0 ; 50CC reserved - .long 0 ; 50D0 reserved - .long 0 ; 50D4 reserved - .long 0 ; 50D8 reserved - .long 0 ; 50DC reserved - .long 0 ; 50E0 reserved - .long 0 ; 50E4 reserved - .long 0 ; 50E8 reserved - .long 0 ; 50EC reserved - .long 0 ; 50F0 reserved - .long 0 ; 50F4 reserved - .long 0 ; 50F8 reserved - .long 0 ; 50FC reserved - - .globl EXT(saveanchor) - -EXT(saveanchor): ; 5100 saveanchor - .set .,.+SVsize - - .long 0 ; 5140 reserved - .long 0 ; 5144 reserved - .long 0 ; 5148 reserved - .long 0 ; 514C reserved - .long 0 ; 5150 reserved - .long 0 ; 5154 reserved - .long 0 ; 5158 reserved - .long 0 ; 515C reserved - .long 0 ; 5160 reserved - .long 0 ; 5164 reserved - .long 0 ; 5168 reserved - .long 0 ; 516C reserved - .long 0 ; 5170 reserved - .long 0 ; 5174 reserved - .long 0 ; 5178 reserved - .long 0 ; 517C reserved - - .long 0 ; 5180 tlbieLock - - .long 0 ; 5184 reserved - .long 0 ; 5188 reserved - .long 0 ; 518C reserved - .long 0 ; 5190 reserved - .long 0 ; 5194 reserved - .long 0 ; 5198 reserved - .long 0 ; 519C reserved - .long 0 ; 51A0 reserved - .long 0 ; 51A4 reserved - .long 0 ; 51A8 reserved - .long 0 ; 51AC reserved - .long 0 ; 51B0 reserved - .long 0 ; 51B4 reserved - .long 0 ; 51B8 reserved - .long 0 ; 51BC reserved - .long 0 ; 51C0 reserved - .long 0 ; 51C4 reserved - .long 0 ; 51C8 reserved - .long 0 ; 51CC reserved - .long 0 ; 51D0 reserved - .long 0 ; 51D4 reserved - .long 0 ; 51D8 reserved - .long 0 ; 51DC reserved - .long 0 ; 51E0 reserved - .long 0 ; 51E4 reserved - .long 0 ; 51E8 reserved - .long 0 ; 51EC reserved - .long 0 ; 51F0 reserved - .long 0 ; 51F4 reserved - .long 0 ; 51F8 reserved - .long 0 ; 51FC reserved - - .globl EXT(dgWork) - -EXT(dgWork): - .long 0 ; 5200 dgLock - .long 0 ; 5204 dgFlags - .long 0 ; 5208 dgMisc0 - .long 0 
; 520C dgMisc1 - .long 0 ; 5210 dgMisc2 - .long 0 ; 5214 dgMisc3 - .long 0 ; 5218 dgMisc4 - .long 0 ; 521C dgMisc5 - - .globl EXT(LcksOpts) -EXT(LcksOpts): - .long 0 ; 5220 lcksWork - .long 0 ; 5224 reserved - .long 0 ; 5228 reserved - .long 0 ; 522C reserved - .long 0 ; 5230 reserved - .long 0 ; 5234 reserved - .long 0 ; 5238 reserved - .long 0 ; 523C reserved - .long 0 ; 5240 reserved - .long 0 ; 5244 reserved - .long 0 ; 5248 reserved - .long 0 ; 524C reserved - .long 0 ; 5250 reserved - .long 0 ; 5254 reserved - .long 0 ; 5258 reserved - .long 0 ; 525C reserved - .long 0 ; 5260 reserved - .long 0 ; 5264 reserved - .long 0 ; 5268 reserved - .long 0 ; 526C reserved - .long 0 ; 5270 reserved - .long 0 ; 5274 reserved - .long 0 ; 5278 reserved - .long 0 ; 527C reserved - - .globl EXT(pPcfg) -EXT(pPcfg): - .long 0x80000000 | (12 << 8) | 12 ; 5280 pcfDefPcfg - 4k - .long 0 ; 5284 pcfLargePcfg - .long 0 ; 5288 Non-primary page configurations - .long 0 ; 528C Non-primary page configurations - .long 0 ; 5290 Non-primary page configurations - .long 0 ; 5294 Non-primary page configurations - .long 0 ; 5298 Non-primary page configurations - .long 0 ; 529C Non-primary page configurations - - .long 0 ; 52A0 reserved - .long 0 ; 52A4 reserved - .long 0 ; 52A8 reserved - .long 0 ; 52AC reserved - .long 0 ; 52B0 reserved - .long 0 ; 52B4 reserved - .long 0 ; 52B8 reserved - .long 0 ; 52BC reserved - .long 0 ; 52C0 reserved - .long 0 ; 52C4 reserved - .long 0 ; 52C8 reserved - .long 0 ; 52CC reserved - .long 0 ; 52D0 reserved - .long 0 ; 52D4 reserved - .long 0 ; 52D8 reserved - .long 0 ; 52DC reserved - .long 0 ; 52E0 reserved - .long 0 ; 52E4 reserved - .long 0 ; 52E8 reserved - .long 0 ; 52EC reserved - .long 0 ; 52F0 reserved - .long 0 ; 52F4 reserved - .long 0 ; 52F8 reserved - .long 0 ; 52FC reserved - - .globl EXT(killresv) -EXT(killresv): - - .long 0 ; 5300 Used to kill reservations - .long 0 ; 5304 Used to kill reservations - .long 0 ; 5308 Used to kill reservations - 
.long 0 ; 530C Used to kill reservations - .long 0 ; 5310 Used to kill reservations - .long 0 ; 5314 Used to kill reservations - .long 0 ; 5318 Used to kill reservations - .long 0 ; 531C Used to kill reservations - .long 0 ; 5320 Used to kill reservations - .long 0 ; 5324 Used to kill reservations - .long 0 ; 5328 Used to kill reservations - .long 0 ; 532C Used to kill reservations - .long 0 ; 5330 Used to kill reservations - .long 0 ; 5334 Used to kill reservations - .long 0 ; 5338 Used to kill reservations - .long 0 ; 533C Used to kill reservations - .long 0 ; 5340 Used to kill reservations - .long 0 ; 5344 Used to kill reservations - .long 0 ; 5348 Used to kill reservations - .long 0 ; 534C Used to kill reservations - .long 0 ; 5350 Used to kill reservations - .long 0 ; 5354 Used to kill reservations - .long 0 ; 5358 Used to kill reservations - .long 0 ; 535C Used to kill reservations - .long 0 ; 5360 Used to kill reservations - .long 0 ; 5364 Used to kill reservations - .long 0 ; 5368 Used to kill reservations - .long 0 ; 536C Used to kill reservations - .long 0 ; 5370 Used to kill reservations - .long 0 ; 5374 Used to kill reservations - .long 0 ; 5378 Used to kill reservations - .long 0 ; 537C Used to kill reservations - - .long 0 ; 5380 reserved - .long 0 ; 5384 reserved - .long 0 ; 5388 reserved - .long 0 ; 538C reserved - .long 0 ; 5390 reserved - .long 0 ; 5394 reserved - .long 0 ; 5398 reserved - .long 0 ; 539C reserved - .long 0 ; 53A0 reserved - .long 0 ; 53A4 reserved - .long 0 ; 53A8 reserved - .long 0 ; 53AC reserved - .long 0 ; 53B0 reserved - .long 0 ; 53B4 reserved - .long 0 ; 53B8 reserved - .long 0 ; 53BC reserved - .long 0 ; 53C0 reserved - .long 0 ; 53C4 reserved - .long 0 ; 53C8 reserved - .long 0 ; 53CC reserved - .long 0 ; 53D0 reserved - .long 0 ; 53D4 reserved - .long 0 ; 53D8 reserved - .long 0 ; 53DC reserved - .long 0 ; 53E0 reserved - .long 0 ; 53E4 reserved - .long 0 ; 53E8 reserved - .long 0 ; 53EC reserved - .long 0 ; 53F0 
reserved - .long 0 ; 53F4 reserved - .long 0 ; 53F8 reserved - .long 0 ; 53FC reserved - .long 0 ; 5400 reserved - .long 0 ; 5404 reserved - .long 0 ; 5408 reserved - .long 0 ; 540C reserved - .long 0 ; 5410 reserved - .long 0 ; 5414 reserved - .long 0 ; 5418 reserved - .long 0 ; 541C reserved - .long 0 ; 5420 reserved - .long 0 ; 5424 reserved - .long 0 ; 5428 reserved - .long 0 ; 542C reserved - .long 0 ; 5430 reserved - .long 0 ; 5434 reserved - .long 0 ; 5438 reserved - .long 0 ; 543C reserved - .long 0 ; 5440 reserved - .long 0 ; 5444 reserved - .long 0 ; 5448 reserved - .long 0 ; 544C reserved - .long 0 ; 5450 reserved - .long 0 ; 5454 reserved - .long 0 ; 5458 reserved - .long 0 ; 545C reserved - .long 0 ; 5460 reserved - .long 0 ; 5464 reserved - .long 0 ; 5468 reserved - .long 0 ; 546C reserved - .long 0 ; 5470 reserved - .long 0 ; 5474 reserved - .long 0 ; 5478 reserved - .long 0 ; 547C reserved - .long EXT(kmod) ; 5480 Pointer to kmod, debugging aid - .long EXT(kdp_trans_off) ; 5484 Pointer to kdp_trans_off, debugging aid - .long EXT(kdp_read_io) ; 5488 Pointer to kdp_read_io, debugging aid - .long 0 ; 548C Reserved for developer use - .long 0 ; 5490 Reserved for developer use - .long EXT(osversion) ; 5494 Pointer to osversion string, debugging aid - .long EXT(flag_kdp_trigger_reboot) ; 5498 Pointer to KDP reboot trigger, debugging aid - .long EXT(manual_pkt) ; 549C Pointer to KDP manual packet, debugging aid - -; -; The "shared page" is used for low-level debugging and is actually 1/2 page long -; - - . 
= 0x6000 - .globl EXT(sharedPage) - -EXT(sharedPage): ; This is a debugging page shared by all processors - .long 0xC24BC195 ; Comm Area validity value - .long 0x87859393 ; Comm Area validity value - .long 0xE681A2C8 ; Comm Area validity value - .long 0x8599855A ; Comm Area validity value - .long 0xD74BD296 ; Comm Area validity value - .long 0x8388E681 ; Comm Area validity value - .long 0xA2C88599 ; Comm Area validity value - .short 0x855A ; Comm Area validity value - .short 1 ; Comm Area version number - .fill 504*4,1,0 ; (filled with 0s) - -; -; The ijcode area is used for code injection. It is 1/2 page long and will allow 32 processors to inject -; 16 instructions each concurrently. -; - - .globl EXT(ijcode) - -EXT(ijcode): ; Code injection area - .fill 512*4,1,0 ; 6800 32x64 slots for code injection streams - - .data - .align ALIGN - .globl EXT(exception_end) -EXT(exception_end): - .long EXT(ExceptionVectorsEnd) -EXT(ExceptionVectorsStart) /* phys fn */ - - - diff --git a/osfmk/ppc/machine_routines.c b/osfmk/ppc/machine_routines.c deleted file mode 100644 index d4fb8e1ca..000000000 --- a/osfmk/ppc/machine_routines.c +++ /dev/null @@ -1,847 +0,0 @@ -/* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* for cpu_signal_handler() */ -#include -#include -#include -#include - -#include - -unsigned int LockTimeOut = 1250000000; -unsigned int MutexSpin = 0; - -static int max_cpus_initialized = 0; - -uint32_t warFlags = 0; -#define warDisMBpoff 0x80000000 -#define MAX_CPUS_SET 0x01 -#define MAX_CPUS_WAIT 0x02 - -decl_simple_lock_data(, spsLock); -unsigned int spsLockInit = 0; - -extern unsigned int hwllckPatch_isync; -extern unsigned int hwulckPatch_isync; -extern unsigned int hwulckbPatch_isync; -extern unsigned int hwlmlckPatch_isync; -extern unsigned int hwltlckPatch_isync; -extern unsigned int hwcsatomicPatch_isync; -extern unsigned int mlckePatch_isync; -extern unsigned int mlckPatch_isync; -extern unsigned int mltelckPatch_isync; -extern unsigned int mltlckPatch_isync; -extern unsigned int mulckePatch_isync; -extern unsigned int mulckPatch_isync; -extern unsigned int slckPatch_isync; -extern unsigned int stlckPatch_isync; -extern unsigned int sulckPatch_isync; -extern unsigned int rwlePatch_isync; -extern unsigned int rwlsPatch_isync; -extern unsigned int rwlsePatch_isync; -extern unsigned int rwlesPatch_isync; -extern unsigned int rwtlePatch_isync; -extern unsigned int rwtlsPatch_isync; -extern unsigned int rwldPatch_isync; -extern unsigned int hwulckPatch_eieio; -extern unsigned int mulckPatch_eieio; -extern 
unsigned int mulckePatch_eieio; -extern unsigned int sulckPatch_eieio; -extern unsigned int rwlesPatch_eieio; -extern unsigned int rwldPatch_eieio; - -struct patch_up { - unsigned int *addr; - unsigned int data; -}; - -typedef struct patch_up patch_up_t; - -patch_up_t patch_up_table[] = { - {&hwllckPatch_isync, 0x60000000}, - {&hwulckPatch_isync, 0x60000000}, - {&hwulckbPatch_isync, 0x60000000}, - {&hwlmlckPatch_isync, 0x60000000}, - {&hwltlckPatch_isync, 0x60000000}, - {&hwcsatomicPatch_isync, 0x60000000}, - {&mlckePatch_isync, 0x60000000}, - {&mlckPatch_isync, 0x60000000}, - {&mltelckPatch_isync, 0x60000000}, - {&mltlckPatch_isync, 0x60000000}, - {&mulckePatch_isync, 0x60000000}, - {&mulckPatch_isync, 0x60000000}, - {&slckPatch_isync, 0x60000000}, - {&stlckPatch_isync, 0x60000000}, - {&sulckPatch_isync, 0x60000000}, - {&rwlePatch_isync, 0x60000000}, - {&rwlsPatch_isync, 0x60000000}, - {&rwlsePatch_isync, 0x60000000}, - {&rwlesPatch_isync, 0x60000000}, - {&rwtlePatch_isync, 0x60000000}, - {&rwtlsPatch_isync, 0x60000000}, - {&rwldPatch_isync, 0x60000000}, - {&hwulckPatch_eieio, 0x60000000}, - {&hwulckPatch_eieio, 0x60000000}, - {&mulckPatch_eieio, 0x60000000}, - {&mulckePatch_eieio, 0x60000000}, - {&sulckPatch_eieio, 0x60000000}, - {&rwlesPatch_eieio, 0x60000000}, - {&rwldPatch_eieio, 0x60000000}, - {NULL, 0x00000000} -}; - -extern int forcenap; -extern boolean_t pmap_initialized; - -/* Map memory map IO space */ -vm_offset_t -ml_io_map( - vm_offset_t phys_addr, - vm_size_t size) -{ - return(io_map(phys_addr,size,VM_WIMG_IO)); -} - - -void ml_get_bouncepool_info(vm_offset_t *phys_addr, vm_size_t *size) -{ - *phys_addr = 0; - *size = 0; -} - - -/* - * Routine: ml_static_malloc - * Function: static memory allocation - */ -vm_offset_t -ml_static_malloc( - vm_size_t size) -{ - vm_offset_t vaddr; - - if (pmap_initialized) - return((vm_offset_t)NULL); - else { - vaddr = static_memory_end; - static_memory_end = round_page(vaddr+size); - return(vaddr); - } -} - -/* - * 
Routine: ml_static_ptovirt - * Function: - */ -vm_offset_t -ml_static_ptovirt( - vm_offset_t paddr) -{ - vm_offset_t vaddr; - - /* Static memory is map V=R */ - vaddr = paddr; - if ( (vaddr < static_memory_end) && (pmap_extract(kernel_pmap, vaddr)==paddr) ) - return(vaddr); - else - return((vm_offset_t)NULL); -} - -/* - * Routine: ml_static_mfree - * Function: - */ -void -ml_static_mfree( - vm_offset_t vaddr, - vm_size_t size) -{ - vm_offset_t paddr_cur, vaddr_cur; - - for (vaddr_cur = round_page_32(vaddr); - vaddr_cur < trunc_page_32(vaddr+size); - vaddr_cur += PAGE_SIZE) { - paddr_cur = pmap_extract(kernel_pmap, vaddr_cur); - if (paddr_cur != (vm_offset_t)NULL) { - vm_page_wire_count--; - pmap_remove(kernel_pmap, (addr64_t)vaddr_cur, (addr64_t)(vaddr_cur+PAGE_SIZE)); - vm_page_create(paddr_cur>>12,(paddr_cur+PAGE_SIZE)>>12); - } - } -} - -/* - * Routine: ml_vtophys - * Function: virtual to physical on static pages - */ -vm_offset_t ml_vtophys( - vm_offset_t vaddr) -{ - return(pmap_extract(kernel_pmap, vaddr)); -} - -/* - * Routine: ml_install_interrupt_handler - * Function: Initialize Interrupt Handler - */ -void ml_install_interrupt_handler( - void *nub, - int source, - void *target, - IOInterruptHandler handler, - void *refCon) -{ - struct per_proc_info *proc_info; - boolean_t current_state; - - current_state = ml_get_interrupts_enabled(); - proc_info = getPerProc(); - - proc_info->interrupt_nub = nub; - proc_info->interrupt_source = source; - proc_info->interrupt_target = target; - proc_info->interrupt_handler = handler; - proc_info->interrupt_refCon = refCon; - - proc_info->interrupts_enabled = TRUE; - (void) ml_set_interrupts_enabled(current_state); - - initialize_screen(NULL, kPEAcquireScreen); -} - -/* - * Routine: ml_nofault_copy - * Function: Perform a physical mode copy if the source and - * destination have valid translations in the kernel pmap. - * If translations are present, they are assumed to - * be wired; i.e. 
no attempt is made to guarantee that the - * translations obtained remained valid for - * the duration of their use. - */ - -vm_size_t ml_nofault_copy( - vm_offset_t virtsrc, vm_offset_t virtdst, vm_size_t size) -{ - addr64_t cur_phys_dst, cur_phys_src; - uint32_t count, pindex, nbytes = 0; - - while (size > 0) { - if (!(cur_phys_src = kvtophys(virtsrc))) - break; - if (!(cur_phys_dst = kvtophys(virtdst))) - break; - if (!mapping_phys_lookup((cur_phys_src>>12), &pindex) || - !mapping_phys_lookup((cur_phys_dst>>12), &pindex)) - break; - count = PAGE_SIZE - (cur_phys_src & PAGE_MASK); - if (count > (PAGE_SIZE - (cur_phys_dst & PAGE_MASK))) - count = PAGE_SIZE - (cur_phys_dst & PAGE_MASK); - if (count > size) - count = size; - - bcopy_phys(cur_phys_src, cur_phys_dst, count); - - nbytes += count; - virtsrc += count; - virtdst += count; - size -= count; - } - - return nbytes; -} - -/* - * Routine: ml_init_interrupt - * Function: Initialize Interrupts - */ -void ml_init_interrupt(void) -{ - boolean_t current_state; - - current_state = ml_get_interrupts_enabled(); - - getPerProc()->interrupts_enabled = TRUE; - (void) ml_set_interrupts_enabled(current_state); -} - -/* - * Routine: ml_get_interrupts_enabled - * Function: Get Interrupts Enabled - */ -boolean_t ml_get_interrupts_enabled(void) -{ - return((mfmsr() & MASK(MSR_EE)) != 0); -} - -/* - * Routine: ml_at_interrupt_context - * Function: Check if running at interrupt context - */ -boolean_t ml_at_interrupt_context(void) -{ - boolean_t ret; - boolean_t current_state; - - current_state = ml_set_interrupts_enabled(FALSE); - ret = (getPerProc()->istackptr == 0); - ml_set_interrupts_enabled(current_state); - return(ret); -} - -/* - * Routine: ml_cause_interrupt - * Function: Generate a fake interrupt - */ -void ml_cause_interrupt(void) -{ - CreateFakeIO(); -} - -/* - * Routine: ml_thread_policy - * Function: - */ -void ml_thread_policy( - thread_t thread, -__unused unsigned policy_id, - unsigned policy_info) -{ - if 
(policy_info & MACHINE_NETWORK_WORKLOOP) { - spl_t s = splsched(); - - thread_lock(thread); - - set_priority(thread, thread->priority + 1); - - thread_unlock(thread); - splx(s); - } -} - -/* - * Routine: machine_signal_idle - * Function: - */ -void -machine_signal_idle( - processor_t processor) -{ - struct per_proc_info *proc_info; - - proc_info = PROCESSOR_TO_PER_PROC(processor); - - if (proc_info->pf.Available & (pfCanDoze|pfWillNap)) - (void)cpu_signal(proc_info->cpu_number, SIGPwake, 0, 0); -} - -/* - * Routine: ml_processor_register - * Function: - */ -kern_return_t -ml_processor_register( - ml_processor_info_t *in_processor_info, - processor_t *processor_out, - ipi_handler_t *ipi_handler) -{ - struct per_proc_info *proc_info; - int donap; - boolean_t current_state; - boolean_t boot_processor; - - if (in_processor_info->boot_cpu == FALSE) { - if (spsLockInit == 0) { - spsLockInit = 1; - simple_lock_init(&spsLock, 0); - } - boot_processor = FALSE; - proc_info = cpu_per_proc_alloc(); - if (proc_info == (struct per_proc_info *)NULL) - return KERN_FAILURE; - proc_info->pp_cbfr = console_per_proc_alloc(FALSE); - if (proc_info->pp_cbfr == (void *)NULL) - goto processor_register_error; - } else { - boot_processor = TRUE; - proc_info = PerProcTable[master_cpu].ppe_vaddr; - } - - proc_info->pp_chud = chudxnu_per_proc_alloc(boot_processor); - if (proc_info->pp_chud == (void *)NULL) - goto processor_register_error; - - if (!boot_processor) - if (cpu_per_proc_register(proc_info) != KERN_SUCCESS) - goto processor_register_error; - - proc_info->cpu_id = in_processor_info->cpu_id; - proc_info->start_paddr = in_processor_info->start_paddr; - if(in_processor_info->time_base_enable != (void(*)(cpu_id_t, boolean_t ))NULL) - proc_info->time_base_enable = in_processor_info->time_base_enable; - else - proc_info->time_base_enable = (void(*)(cpu_id_t, boolean_t ))NULL; - - if((proc_info->pf.pfPowerModes & pmType) == pmPowerTune) { - proc_info->pf.pfPowerTune0 = 
in_processor_info->power_mode_0; - proc_info->pf.pfPowerTune1 = in_processor_info->power_mode_1; - } - - donap = in_processor_info->supports_nap; /* Assume we use requested nap */ - if(forcenap) donap = forcenap - 1; /* If there was an override, use that */ - - if((proc_info->pf.Available & pfCanNap) - && (donap)) { - proc_info->pf.Available |= pfWillNap; - current_state = ml_set_interrupts_enabled(FALSE); - if(proc_info == getPerProc()) - __asm__ volatile("mtsprg 2,%0" : : "r" (proc_info->pf.Available)); /* Set live value */ - (void) ml_set_interrupts_enabled(current_state); - } - - if (!boot_processor) { - (void)hw_atomic_add(&saveanchor.savetarget, FreeListMin); /* saveareas for this processor */ - processor_init((struct processor *)proc_info->processor, - proc_info->cpu_number, processor_pset(master_processor)); - } - - *processor_out = (struct processor *)proc_info->processor; - *ipi_handler = cpu_signal_handler; - - return KERN_SUCCESS; - -processor_register_error: - if (proc_info->pp_cbfr != (void *)NULL) - console_per_proc_free(proc_info->pp_cbfr); - if (proc_info->pp_chud != (void *)NULL) - chudxnu_per_proc_free(proc_info->pp_chud); - if (!boot_processor) - cpu_per_proc_free(proc_info); - return KERN_FAILURE; -} - -/* - * Routine: ml_enable_nap - * Function: - */ -boolean_t -ml_enable_nap(int target_cpu, boolean_t nap_enabled) -{ - struct per_proc_info *proc_info; - boolean_t prev_value; - boolean_t current_state; - - proc_info = PerProcTable[target_cpu].ppe_vaddr; - - prev_value = (proc_info->pf.Available & pfCanNap) && (proc_info->pf.Available & pfWillNap); - - if(forcenap) nap_enabled = forcenap - 1; /* If we are to force nap on or off, do it */ - - if(proc_info->pf.Available & pfCanNap) { /* Can the processor nap? */ - if (nap_enabled) proc_info->pf.Available |= pfWillNap; /* Is nap supported on this machine? 
*/ - else proc_info->pf.Available &= ~pfWillNap; /* Clear if not */ - } - - current_state = ml_set_interrupts_enabled(FALSE); - if(proc_info == getPerProc()) - __asm__ volatile("mtsprg 2,%0" : : "r" (proc_info->pf.Available)); /* Set live value */ - (void) ml_set_interrupts_enabled(current_state); - - return (prev_value); -} - -/* - * Routine: ml_init_max_cpus - * Function: - */ -void -ml_init_max_cpus(unsigned int max_cpus) -{ - boolean_t current_state; - - current_state = ml_set_interrupts_enabled(FALSE); - if (max_cpus_initialized != MAX_CPUS_SET) { - if (max_cpus > 0 && max_cpus <= MAX_CPUS) { - /* - * Note: max_ncpus is the maximum number - * that the kernel supports or that the "cpus=" - * boot-arg has set. Here we take int minimum. - */ - machine_info.max_cpus = MIN(max_cpus, max_ncpus); - machine_info.physical_cpu_max = max_cpus; - machine_info.logical_cpu_max = max_cpus; - } - if (max_cpus_initialized == MAX_CPUS_WAIT) - wakeup((event_t)&max_cpus_initialized); - max_cpus_initialized = MAX_CPUS_SET; - } - - if (machine_info.logical_cpu_max == 1) { - struct patch_up *patch_up_ptr = &patch_up_table[0]; - - while (patch_up_ptr->addr != NULL) { - /* - * Patch for V=R kernel text section - */ - bcopy_phys((addr64_t)((unsigned int)(&patch_up_ptr->data)), - (addr64_t)((unsigned int)(patch_up_ptr->addr)), 4); - sync_cache64((addr64_t)((unsigned int)(patch_up_ptr->addr)),4); - patch_up_ptr++; - } - } - - (void) ml_set_interrupts_enabled(current_state); -} - -/* - * Routine: ml_get_max_cpus - * Function: - */ -unsigned int -ml_get_max_cpus(void) -{ - boolean_t current_state; - - current_state = ml_set_interrupts_enabled(FALSE); - if (max_cpus_initialized != MAX_CPUS_SET) { - max_cpus_initialized = MAX_CPUS_WAIT; - assert_wait((event_t)&max_cpus_initialized, THREAD_UNINT); - (void)thread_block(THREAD_CONTINUE_NULL); - } - (void) ml_set_interrupts_enabled(current_state); - return(machine_info.max_cpus); -} - -/* - * This is called from the machine-independent routine 
cpu_up() - * to perform machine-dependent info updates. - */ -void -ml_cpu_up(void) -{ - (void)hw_atomic_add(&machine_info.physical_cpu, 1); - (void)hw_atomic_add(&machine_info.logical_cpu, 1); -} - -/* - * This is called from the machine-independent routine cpu_down() - * to perform machine-dependent info updates. - */ -void -ml_cpu_down(void) -{ - (void)hw_atomic_sub(&machine_info.physical_cpu, 1); - (void)hw_atomic_sub(&machine_info.logical_cpu, 1); -} - -/* - * Routine: ml_cpu_get_info - * Function: - */ -void -ml_cpu_get_info(ml_cpu_info_t *ml_cpu_info) -{ - struct per_proc_info *proc_info; - - if (ml_cpu_info == 0) return; - - proc_info = PerProcTable[master_cpu].ppe_vaddr; - ml_cpu_info->vector_unit = (proc_info->pf.Available & pfAltivec) != 0; - ml_cpu_info->cache_line_size = proc_info->pf.lineSize; - ml_cpu_info->l1_icache_size = proc_info->pf.l1iSize; - ml_cpu_info->l1_dcache_size = proc_info->pf.l1dSize; - - if (proc_info->pf.Available & pfL2) { - ml_cpu_info->l2_settings = proc_info->pf.l2cr; - ml_cpu_info->l2_cache_size = proc_info->pf.l2Size; - } else { - ml_cpu_info->l2_settings = 0; - ml_cpu_info->l2_cache_size = 0xFFFFFFFF; - } - if (proc_info->pf.Available & pfL3) { - ml_cpu_info->l3_settings = proc_info->pf.l3cr; - ml_cpu_info->l3_cache_size = proc_info->pf.l3Size; - } else { - ml_cpu_info->l3_settings = 0; - ml_cpu_info->l3_cache_size = 0xFFFFFFFF; - } -} - -/* - * Routine: ml_enable_cache_level - * Function: - */ -#define l2em 0x80000000 -#define l3em 0x80000000 -int -ml_enable_cache_level(int cache_level, int enable) -{ - int old_mode; - unsigned long available, ccr; - struct per_proc_info *proc_info; - - if (real_ncpus != 1) return -1; /* XXX: This test is not safe */ - - proc_info = PerProcTable[master_cpu].ppe_vaddr; - available = proc_info->pf.Available; - - if ((cache_level == 2) && (available & pfL2)) { - ccr = proc_info->pf.l2cr; - old_mode = (ccr & l2em) ? 
TRUE : FALSE; - if (old_mode != enable) { - if (enable) ccr = proc_info->pf.l2crOriginal; - else ccr = 0; - proc_info->pf.l2cr = ccr; - cacheInit(); - } - - return old_mode; - } - - if ((cache_level == 3) && (available & pfL3)) { - ccr = proc_info->pf.l3cr; - old_mode = (ccr & l3em) ? TRUE : FALSE; - if (old_mode != enable) { - if (enable) ccr = proc_info->pf.l3crOriginal; - else ccr = 0; - proc_info->pf.l3cr = ccr; - cacheInit(); - } - - return old_mode; - } - - return -1; -} - - -/* - * Routine: ml_set_processor_speed - * Function: - */ -void -ml_set_processor_speed(unsigned long speed) -{ - struct per_proc_info *proc_info; - uint32_t cpu; - kern_return_t result; - boolean_t current_state; - unsigned int i; - - proc_info = PerProcTable[master_cpu].ppe_vaddr; - - switch (proc_info->pf.pfPowerModes & pmType) { /* Figure specific type */ - case pmDualPLL: - - ml_set_processor_speed_dpll(speed); - break; - - case pmDFS: - - for (cpu = 0; cpu < real_ncpus; cpu++) { - /* - * cpu_signal() returns after .5ms if it fails to signal a running cpu - * retry cpu_signal() for .1s to deal with long interrupt latency at boot - */ - for (i=200; i>0; i--) { - current_state = ml_set_interrupts_enabled(FALSE); - if (cpu != (unsigned)cpu_number()) { - if (PerProcTable[cpu].ppe_vaddr->cpu_flags & SignalReady) - /* - * Target cpu is off-line, skip - */ - result = KERN_SUCCESS; - else { - simple_lock(&spsLock); - result = cpu_signal(cpu, SIGPcpureq, CPRQsps, speed); - if (result == KERN_SUCCESS) - thread_sleep_simple_lock(&spsLock, &spsLock, THREAD_UNINT); - simple_unlock(&spsLock); - } - } else { - ml_set_processor_speed_dfs(speed); - result = KERN_SUCCESS; - } - (void) ml_set_interrupts_enabled(current_state); - if (result == KERN_SUCCESS) - break; - } - if (result != KERN_SUCCESS) - panic("ml_set_processor_speed(): Fail to set cpu%d speed\n", cpu); - } - break; - - case pmPowerTune: - - ml_set_processor_speed_powertune(speed); - break; - - default: - break; - - } - return; -} - -/* - 
* Routine: ml_set_processor_speed_slave - * Function: - */ -void -ml_set_processor_speed_slave(unsigned long speed) -{ - ml_set_processor_speed_dfs(speed); - - simple_lock(&spsLock); - thread_wakeup(&spsLock); - simple_unlock(&spsLock); -} - -/* - * Routine: ml_init_lock_timeout - * Function: - */ -void -ml_init_lock_timeout(void) -{ - uint64_t abstime; - uint32_t mtxspin; - - nanoseconds_to_absolutetime(NSEC_PER_SEC>>2, &abstime); - LockTimeOut = (unsigned int)abstime; - - if (PE_parse_boot_argn("mtxspin", &mtxspin, sizeof (mtxspin))) { - if (mtxspin > USEC_PER_SEC>>4) - mtxspin = USEC_PER_SEC>>4; - nanoseconds_to_absolutetime(mtxspin*NSEC_PER_USEC, &abstime); - } else { - nanoseconds_to_absolutetime(10*NSEC_PER_USEC, &abstime); - } - MutexSpin = (unsigned int)abstime; -} - -/* - * Routine: init_ast_check - * Function: - */ -void -init_ast_check( - __unused processor_t processor) -{} - -/* - * Routine: cause_ast_check - * Function: - */ -void -cause_ast_check( - processor_t processor) -{ - struct per_proc_info *proc_info; - - proc_info = PROCESSOR_TO_PER_PROC(processor); - - if (proc_info != getPerProc() - && proc_info->interrupts_enabled == TRUE) - cpu_signal(proc_info->cpu_number, SIGPast, (unsigned int)NULL, (unsigned int)NULL); -} - -/* - * Routine: machine_processor_shutdown - * Function: - */ -thread_t -machine_processor_shutdown( - __unused thread_t thread, - __unused void (*doshutdown)(processor_t), - __unused processor_t processor) -{ - CreateShutdownCTX(); - return((thread_t)(getPerProc()->old_thread)); -} - - -void ml_mem_backoff(void) { - - if(warFlags & warDisMBpoff) return; /* If backoff disabled, exit */ - - __asm__ volatile("sync"); - __asm__ volatile("isync"); - - return; -} - - - -/* - * Stubs for CPU Stepper - */ -void -machine_run_count(__unused uint32_t count) -{ -} - -boolean_t -machine_processor_is_inactive(__unused processor_t processor) -{ - return(FALSE); -} - -processor_t -machine_choose_processor(__unused processor_set_t pset, 
processor_t processor) -{ - return (processor); -} - -vm_offset_t ml_stack_remaining(void) -{ - uintptr_t local = (uintptr_t) &local; - - if (ml_at_interrupt_context()) { - return (local - (getPerProc()->intstack_top_ss - INTSTACK_SIZE)); - } else { - return (local - current_thread()->kernel_stack); - } -} - -boolean_t machine_timeout_suspended(void) { - return FALSE; -} diff --git a/osfmk/ppc/machine_routines.h b/osfmk/ppc/machine_routines.h deleted file mode 100644 index 47b12432d..000000000 --- a/osfmk/ppc/machine_routines.h +++ /dev/null @@ -1,338 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -#ifndef _PPC_MACHINE_ROUTINES_H_ -#define _PPC_MACHINE_ROUTINES_H_ - -#include -#include -#include -#include - -#include -#include - -__BEGIN_DECLS - -/* Get Interrupts Enabled */ -extern boolean_t ml_get_interrupts_enabled( - void); - -/* Set Interrupts Enabled */ -extern boolean_t ml_set_interrupts_enabled( - boolean_t enable); - -/* Check if running at interrupt context */ -extern boolean_t ml_at_interrupt_context( - void); - -#ifdef KERNEL_PRIVATE - -/* Generate a fake interrupt */ -extern void ml_cause_interrupt( - void); - -/* Type for the IPI Hander */ -typedef void (*ipi_handler_t)(void); - -/* Type for the Time Base Enable function */ -typedef void (*time_base_enable_t)(cpu_id_t cpu_id, boolean_t enable); - -/* enables (or disables) the processor nap mode the function returns the previous value*/ -extern boolean_t ml_enable_nap( - int target_cpu, - boolean_t nap_enabled); - -/* Put the processor to sleep */ -extern void ml_ppc_sleep( - void); - -extern void ml_get_timebase( - unsigned long long *timstamp); - -extern int ml_enable_cache_level( - int cache_level, - int enable); - -extern void ml_static_mfree( - vm_offset_t vaddr, - vm_size_t size); - -/* Init Interrupts */ -extern void ml_install_interrupt_handler( - void *nub, - int source, - void *target, - IOInterruptHandler handler, - void *refCon); - -extern vm_offset_t ml_static_ptovirt( - vm_offset_t paddr); - -/* virtual to physical on wired pages */ -extern vm_offset_t ml_vtophys( - vm_offset_t vaddr); - -vm_size_t ml_nofault_copy( - vm_offset_t virtsrc, vm_offset_t virtdst, vm_size_t size); - -/* PCI config cycle probing */ -extern boolean_t ml_probe_read( - vm_offset_t paddr, - unsigned int *val); - -extern boolean_t ml_probe_read_64( - addr64_t paddr, - unsigned int *val); - -/* Read physical address byte */ -extern unsigned int ml_phys_read_byte( - vm_offset_t paddr); - -extern unsigned int ml_phys_read_byte_64( 
- addr64_t paddr); - -/* Read physical address half word */ -extern unsigned int ml_phys_read_half( - vm_offset_t paddr); - -extern unsigned int ml_phys_read_half_64( - addr64_t paddr); - -/* Read physical address word*/ -extern unsigned int ml_phys_read( - vm_offset_t paddr); - -extern unsigned int ml_phys_read_64( - addr64_t paddr); - -extern unsigned int ml_phys_read_word( - vm_offset_t paddr); - -extern unsigned int ml_phys_read_word_64( - addr64_t paddr); - -/* Read physical address double word */ -extern unsigned long long ml_phys_read_double( - vm_offset_t paddr); - -extern unsigned long long ml_phys_read_double_64( - addr64_t paddr); - -/* Write physical address byte */ -extern void ml_phys_write_byte( - vm_offset_t paddr, - unsigned int data); - -extern void ml_phys_write_byte_64( - addr64_t paddr, - unsigned int data); - -/* Write physical address half word */ -extern void ml_phys_write_half( - vm_offset_t paddr, - unsigned int data); - -extern void ml_phys_write_half_64( - addr64_t paddr, - unsigned int data); - -/* Write physical address word */ -extern void ml_phys_write( - vm_offset_t paddr, - unsigned int data); - -extern void ml_phys_write_64( - addr64_t paddr, - unsigned int data); - -extern void ml_phys_write_word( - vm_offset_t paddr, - unsigned int data); - -extern void ml_phys_write_word_64( - addr64_t paddr, - unsigned int data); - -/* Write physical address double word */ -extern void ml_phys_write_double( - vm_offset_t paddr, - unsigned long long data); - -extern void ml_phys_write_double_64( - addr64_t paddr, - unsigned long long data); - -/* Struct for ml_processor_register */ -struct ml_processor_info { - cpu_id_t cpu_id; - boolean_t boot_cpu; - vm_offset_t start_paddr; - boolean_t supports_nap; - unsigned long l2cr_value; - time_base_enable_t time_base_enable; - uint32_t power_mode_0; - uint32_t power_mode_1; -}; - -typedef struct ml_processor_info ml_processor_info_t; - -/* Register a processor */ -extern kern_return_t 
ml_processor_register( - ml_processor_info_t *ml_processor_info, - processor_t *processor, - ipi_handler_t *ipi_handler); - -/* Zero bytes starting at a physical address */ -extern void bzero_phys( - addr64_t phys_address, - uint32_t length); - -/* Zero bytes starting at a physical address that's uncacheable */ -extern void bzero_phys_nc( - addr64_t phys_address, - uint32_t length); - -/* Bytes available on current stack */ -vm_offset_t ml_stack_remaining(void); - -#endif /* KERNEL_PRIVATE */ - -#ifdef XNU_KERNEL_PRIVATE -#if defined(PEXPERT_KERNEL_PRIVATE) || defined(MACH_KERNEL_PRIVATE) - -/* Map memory map IO space */ -extern vm_offset_t ml_io_map( - vm_offset_t phys_addr, - vm_size_t size); - -void ml_get_bouncepool_info( - vm_offset_t *phys_addr, - vm_size_t *size); - - -/* boot memory allocation */ -extern vm_offset_t ml_static_malloc( - vm_size_t size); - -#endif /* PEXPERT_KERNEL_PRIVATE || MACH_KERNEL_PRIVATE */ - - -#ifdef MACH_KERNEL_PRIVATE -extern void ml_init_interrupt( - void); - -extern void cacheInit( - void); - -extern void cacheDisable( - void); - -extern void ml_init_lock_timeout( - void); - -void ml_ppc_do_sleep(void); - -boolean_t machine_timeout_suspended(void); -#endif /* MACH_KERNEL_PRIVATE */ -#endif /* XNU_KERNEL_PRIVATE */ - -#ifdef KERNEL_PRIVATE -extern void ml_thread_policy( - thread_t thread, - unsigned policy_id, - unsigned policy_info); - -#define MACHINE_GROUP 0x00000001 -#define MACHINE_NETWORK_GROUP 0x10000000 -#define MACHINE_NETWORK_WORKLOOP 0x00000001 -#define MACHINE_NETWORK_NETISR 0x00000002 - -/* Initialize the maximum number of CPUs */ -extern void ml_init_max_cpus( - unsigned int max_cpus); - -/* Return the maximum number of CPUs set by ml_init_max_cpus() */ -extern unsigned int ml_get_max_cpus( - void); - -extern void ml_cpu_up(void); -extern void ml_cpu_down(void); - -/* Struct for ml_cpu_get_info */ -struct ml_cpu_info { - unsigned long vector_unit; - unsigned long cache_line_size; - unsigned long l1_icache_size; - 
unsigned long l1_dcache_size; - unsigned long l2_settings; - unsigned long l2_cache_size; - unsigned long l3_settings; - unsigned long l3_cache_size; -}; - -typedef struct ml_cpu_info ml_cpu_info_t; - -/* Get processor info */ -extern void ml_cpu_get_info( - ml_cpu_info_t *ml_cpu_info); - -extern void ml_set_processor_speed( - unsigned long speed); -extern void ml_set_processor_speed_slave( - unsigned long speed); -extern void ml_set_processor_speed_dpll( - unsigned long speed); -extern void ml_set_processor_speed_dfs( - unsigned long speed); -extern void ml_set_processor_speed_powertune( - unsigned long speed); - -extern void ml_set_processor_voltage( - unsigned long voltage); - -extern unsigned int ml_scom_write( - uint32_t reg, - uint64_t data); - -extern unsigned int ml_scom_read( - uint32_t reg, - uint64_t *data); - -extern uint32_t ml_hdec_ratio(void); - -extern int boffSettingsInit; - -#endif /* KERNEL_PRIVATE */ - -__END_DECLS - -#endif /* _PPC_MACHINE_ROUTINES_H_ */ diff --git a/osfmk/ppc/machine_routines_asm.s b/osfmk/ppc/machine_routines_asm.s deleted file mode 100644 index 45bfb5b5e..000000000 --- a/osfmk/ppc/machine_routines_asm.s +++ /dev/null @@ -1,2345 +0,0 @@ -/* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. 
- * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#include -#include -#include -#include -#include -#include - - -/* - * ml_set_physical() -- turn off DR and (if 64-bit) turn SF on - * it is assumed that pf64Bit is already in cr6 - * ml_set_physical_get_ffs() -- turn DR off, SF on, and get feature flags - * ml_set_physical_disabled() -- turn DR and EE off, SF on, get feature flags - * ml_set_translation_off() -- turn DR, IR, and EE off, SF on, get feature flags - * - * Callable only from assembler, these return: - * r2 -- new MSR - * r11 -- old MSR - * r10 -- feature flags (pf64Bit etc, ie SPRG 2) - * cr6 -- feature flags 24-27, ie pf64Bit, pf128Byte, and pf32Byte - * - * Uses r0 and r2. ml_set_translation_off also uses r3 and cr5. 
- */ - - .align 4 - .globl EXT(ml_set_translation_off) -LEXT(ml_set_translation_off) - mfsprg r10,2 // get feature flags - li r0,0 ; Clear this - mtcrf 0x02,r10 // move pf64Bit etc to cr6 - ori r0,r0,lo16(MASK(MSR_EE)+MASK(MSR_FP)+MASK(MSR_IR)+MASK(MSR_DR)) // turn off all 4 - mfmsr r11 // get MSR - oris r0,r0,hi16(MASK(MSR_VEC)) // Turn off vector too - mtcrf 0x04,r10 // move pfNoMSRir etc to cr5 - andc r2,r11,r0 // turn off EE, IR, and DR - bt++ pf64Bitb,ml_set_physical_64 // skip if 64-bit (only they take the hint) - bf pfNoMSRirb,ml_set_physical_32 // skip if we can load MSR directly - li r0,loadMSR // Get the MSR setter SC - mr r3,r2 // copy new MSR to r2 - sc // Set it - blr - - .align 4 - .globl EXT(ml_set_physical_disabled) - -LEXT(ml_set_physical_disabled) - li r0,0 ; Clear - mfsprg r10,2 // get feature flags - ori r0,r0,lo16(MASK(MSR_EE)) // turn EE and fp off - mtcrf 0x02,r10 // move pf64Bit etc to cr6 - b ml_set_physical_join - - .align 5 - .globl EXT(ml_set_physical_get_ffs) - -LEXT(ml_set_physical_get_ffs) - mfsprg r10,2 // get feature flags - mtcrf 0x02,r10 // move pf64Bit etc to cr6 - - .globl EXT(ml_set_physical) -LEXT(ml_set_physical) - - li r0,0 // do not turn off interrupts - -ml_set_physical_join: - oris r0,r0,hi16(MASK(MSR_VEC)) // Always gonna turn of vectors - mfmsr r11 // get MSR - ori r0,r0,lo16(MASK(MSR_DR)+MASK(MSR_FP)) // always turn off DR and FP bit - andc r2,r11,r0 // turn off DR and maybe EE - bt++ pf64Bitb,ml_set_physical_64 // skip if 64-bit (only they take the hint) -ml_set_physical_32: - mtmsr r2 // turn off translation - isync - blr - -ml_set_physical_64: - li r0,1 // get a 1 to slam into SF - rldimi r2,r0,63,MSR_SF_BIT // set SF bit (bit 0) - mtmsrd r2 // set 64-bit mode, turn off data relocation - isync // synchronize - blr - - -/* - * ml_restore(old_MSR) - * - * Callable only from assembler, restores the MSR in r11 saved by ml_set_physical. 
- * We assume cr6 and r11 are as set by ml_set_physical, ie: - * cr6 - pf64Bit flag (feature flags 24-27) - * r11 - old MSR - */ - - .align 5 - .globl EXT(ml_restore) - -LEXT(ml_restore) - bt++ pf64Bitb,ml_restore_64 // handle 64-bit cpus (only they take the hint) - mtmsr r11 // restore a 32-bit MSR - isync - blr - -ml_restore_64: - mtmsrd r11 // restore a 64-bit MSR - isync - blr - - -/* PCI config cycle probing - * - * boolean_t ml_probe_read(vm_offset_t paddr, unsigned int *val) - * - * Read the memory location at physical address paddr. - * This is a part of a device probe, so there is a good chance we will - * have a machine check here. So we have to be able to handle that. - * We assume that machine checks are enabled both in MSR and HIDs - */ - -; Force a line boundry here - .align 5 - .globl EXT(ml_probe_read) - -LEXT(ml_probe_read) - - mfsprg r9,2 ; Get feature flags - - rlwinm. r0,r9,0,pf64Bitb,pf64Bitb ; Are we on a 64-bit machine? - rlwinm r3,r3,0,0,31 ; Clean up for 64-bit machines - bne++ mpr64bit ; Go do this the 64-bit way... - -mpr32bit: lis r8,hi16(MASK(MSR_VEC)) ; Get the vector flag - mfmsr r0 ; Save the current MSR - ori r8,r8,lo16(MASK(MSR_FP)) ; Add the FP flag - - neg r10,r3 ; Number of bytes to end of page - andc r0,r0,r8 ; Clear VEC and FP - rlwinm. r10,r10,0,20,31 ; Clear excess junk and test for page bndry - ori r8,r8,lo16(MASK(MSR_EE)|MASK(MSR_IR)|MASK(MSR_DR)) ; Drop EE, IR, and DR - mr r12,r3 ; Save the load address - andc r2,r0,r8 ; Clear VEC, FP, and EE - mtcrf 0x04,r9 ; Set the features - cmplwi cr1,r10,4 ; At least 4 bytes left in page? - beq- mprdoit ; We are right on the boundary... - li r3,0 - bltlr- cr1 ; No, just return failure... - -mprdoit: - - bt pfNoMSRirb,mprNoMSR ; No MSR... 
- - mtmsr r2 ; Translation and all off - isync ; Toss prefetch - b mprNoMSRx - -mprNoMSR: - mr r5,r0 - li r0,loadMSR ; Get the MSR setter SC - mr r3,r2 ; Get new MSR - sc ; Set it - mr r0,r5 - li r3,0 -mprNoMSRx: - - mfspr r6, hid0 ; Get a copy of hid0 - - rlwinm. r5, r9, 0, pfNoMuMMCKb, pfNoMuMMCKb ; Check for NoMuMMCK - bne mprNoMuM - - rlwinm r5, r6, 0, ice+1, ice-1 ; Turn off L1 I-Cache - mtspr hid0, r5 - isync ; Wait for I-Cache off - rlwinm r5, r6, 0, mum+1, mum-1 ; Turn off MuM w/ I-Cache on - mtspr hid0, r5 -mprNoMuM: - -; -; We need to insure that there is no more than 1 BAT register that -; can get a hit. There could be repercussions beyond the ken -; of mortal man. It is best not to tempt fate. -; - -; Note: we will reload these from the shadow BATs later - - li r10,0 ; Clear a register - - sync ; Make sure all is well - - mtdbatu 1,r10 ; Invalidate DBAT 1 - mtdbatu 2,r10 ; Invalidate DBAT 2 - mtdbatu 3,r10 ; Invalidate DBAT 3 - - rlwinm r10,r12,0,0,14 ; Round down to a 128k boundary - ori r11,r10,0x32 ; Set uncached, coherent, R/W - ori r10,r10,2 ; Make the upper half (128k, valid supervisor) - mtdbatl 0,r11 ; Set lower BAT first - mtdbatu 0,r10 ; Now the upper - sync ; Just make sure - - dcbf 0,r12 ; Make sure we kill the cache to avoid paradoxes - sync - - ori r11,r2,lo16(MASK(MSR_DR)) ; Turn on data translation - mtmsr r11 ; Do it for real - isync ; Make sure of it - - eieio ; Make sure of all previous accesses - sync ; Make sure it is all caught up - - lwz r11,0(r12) ; Get it and maybe machine check here - - eieio ; Make sure of ordering again - sync ; Get caught up yet again - isync ; Do not go further till we are here - - mtmsr r2 ; Turn translation back off - isync - - lis r10,hi16(EXT(shadow_BAT)+shdDBAT) ; Get shadow address - ori r10,r10,lo16(EXT(shadow_BAT)+shdDBAT) ; Get shadow address - - lwz r5,0(r10) ; Pick up DBAT 0 high - lwz r6,4(r10) ; Pick up DBAT 0 low - lwz r7,8(r10) ; Pick up DBAT 1 high - lwz r8,16(r10) ; Pick up DBAT 2 high - 
lwz r9,24(r10) ; Pick up DBAT 3 high - - mtdbatu 0,r5 ; Restore DBAT 0 high - mtdbatl 0,r6 ; Restore DBAT 0 low - mtdbatu 1,r7 ; Restore DBAT 1 high - mtdbatu 2,r8 ; Restore DBAT 2 high - mtdbatu 3,r9 ; Restore DBAT 3 high - sync - - li r3,1 ; We made it - - mtmsr r0 ; Restore translation and exceptions - isync ; Toss speculations - - stw r11,0(r4) ; Save the loaded value - blr ; Return... - -; Force a line boundry here. This means we will be able to check addresses better - .align 5 - .globl EXT(ml_probe_read_mck) -LEXT(ml_probe_read_mck) - - -/* PCI config cycle probing - 64-bit - * - * boolean_t ml_probe_read_64(addr64_t paddr, unsigned int *val) - * - * Read the memory location at physical address paddr. - * This is a part of a device probe, so there is a good chance we will - * have a machine check here. So we have to be able to handle that. - * We assume that machine checks are enabled both in MSR and HIDs - */ - -; Force a line boundry here - .align 6 - .globl EXT(ml_probe_read_64) - -LEXT(ml_probe_read_64) - - mfsprg r9,2 ; Get feature flags - rlwinm r3,r3,0,1,0 ; Copy low 32 bits to top 32 - rlwinm. r0,r9,0,pf64Bitb,pf64Bitb ; Are we on a 64-bit machine? - rlwimi r3,r4,0,0,31 ; Insert low part of 64-bit address in bottom 32 bits - - mr r4,r5 ; Move result to common register - beq-- mpr32bit ; Go do this the 32-bit way... - -mpr64bit: andi. r0,r3,3 ; Check if we are on a word boundary - li r0,0 ; Clear the EE bit (and everything else for that matter) - bne-- mprFail ; Boundary not good... 
- mfmsr r11 ; Get the MSR - mtmsrd r0,1 ; Set the EE bit only (do not care about RI) - rlwinm r11,r11,0,MSR_EE_BIT,MSR_EE_BIT ; Isolate just the EE bit - mfmsr r10 ; Refresh our view of the MSR (VMX/FP may have changed) - or r12,r10,r11 ; Turn on EE if on before we turned it off - ori r0,r0,lo16(MASK(MSR_IR)|MASK(MSR_DR)) ; Get the IR and DR bits - li r2,1 ; Get a 1 - sldi r2,r2,63 ; Get the 64-bit bit - andc r10,r10,r0 ; Clear IR and DR - or r10,r10,r2 ; Set 64-bit - - li r0,1 ; Get a 1 - mtmsrd r10 ; Translation and EE off, 64-bit on - isync - - sldi r0,r0,32+8 ; Get the right bit to inhibit caching - - mfspr r8,hid4 ; Get HID4 - or r2,r8,r0 ; Set bit to make real accesses cache-inhibited - sync ; Sync up - mtspr hid4,r2 ; Make real accesses cache-inhibited - isync ; Toss prefetches - - lis r7,0xE000 ; Get the unlikeliest ESID possible - srdi r7,r7,1 ; Make 0x7FFFFFFFF0000000 - slbie r7 ; Make sure the ERAT is cleared - - sync - isync - - eieio ; Make sure of all previous accesses - - lwz r11,0(r3) ; Get it and maybe machine check here - - eieio ; Make sure of ordering again - sync ; Get caught up yet again - isync ; Do not go further till we are here - - sync ; Sync up - mtspr hid4,r8 ; Make real accesses not cache-inhibited - isync ; Toss prefetches - - lis r7,0xE000 ; Get the unlikeliest ESID possible - srdi r7,r7,1 ; Make 0x7FFFFFFFF0000000 - slbie r7 ; Make sure the ERAT is cleared - - mtmsrd r12 ; Restore entry MSR - isync - - stw r11,0(r4) ; Pass back the result - li r3,1 ; Indicate success - blr ; Leave... - -mprFail: li r3,0 ; Set failure - blr ; Leave... - -; Force a line boundry here. This means we will be able to check addresses better - .align 6 - .globl EXT(ml_probe_read_mck_64) -LEXT(ml_probe_read_mck_64) - - -/* Read physical address byte - * - * unsigned int ml_phys_read_byte(vm_offset_t paddr) - * unsigned int ml_phys_read_byte_64(addr64_t paddr) - * - * Read the byte at physical address paddr. Memory should not be cache inhibited. 
- */ - -; Force a line boundry here - - .align 5 - .globl EXT(ml_phys_read_byte_64) - -LEXT(ml_phys_read_byte_64) - - rlwinm r3,r3,0,1,0 ; Copy low 32 bits to top 32 - rlwimi r3,r4,0,0,31 ; Insert low part of 64-bit address in bottom 32 bits - b ml_phys_read_byte_join - - .globl EXT(ml_phys_read_byte) - -LEXT(ml_phys_read_byte) - rlwinm r3,r3,0,0,31 ; truncate address to 32-bits -ml_phys_read_byte_join: ; r3 = address to read (reg64_t) - mflr r11 ; Save the return - bl rdwrpre ; Get set up, translation/interrupts off, 64-bit on, etc. - - lbz r3,0(r3) ; Get the byte - b rdwrpost ; Clean up and leave... - - -/* Read physical address half word - * - * unsigned int ml_phys_read_half(vm_offset_t paddr) - * unsigned int ml_phys_read_half_64(addr64_t paddr) - * - * Read the half word at physical address paddr. Memory should not be cache inhibited. - */ - -; Force a line boundry here - - .align 5 - .globl EXT(ml_phys_read_half_64) - -LEXT(ml_phys_read_half_64) - - rlwinm r3,r3,0,1,0 ; Copy low 32 bits to top 32 - rlwimi r3,r4,0,0,31 ; Insert low part of 64-bit address in bottom 32 bits - b ml_phys_read_half_join - - .globl EXT(ml_phys_read_half) - -LEXT(ml_phys_read_half) - rlwinm r3,r3,0,0,31 ; truncate address to 32-bits -ml_phys_read_half_join: ; r3 = address to read (reg64_t) - mflr r11 ; Save the return - bl rdwrpre ; Get set up, translation/interrupts off, 64-bit on, etc. - - lhz r3,0(r3) ; Get the half word - b rdwrpost ; Clean up and leave... - - -/* Read physical address word - * - * unsigned int ml_phys_read(vm_offset_t paddr) - * unsigned int ml_phys_read_64(addr64_t paddr) - * unsigned int ml_phys_read_word(vm_offset_t paddr) - * unsigned int ml_phys_read_word_64(addr64_t paddr) - * - * Read the word at physical address paddr. Memory should not be cache inhibited. 
- */ - -; Force a line boundry here - - .align 5 - .globl EXT(ml_phys_read_64) - .globl EXT(ml_phys_read_word_64) - -LEXT(ml_phys_read_64) -LEXT(ml_phys_read_word_64) - - rlwinm r3,r3,0,1,0 ; Copy low 32 bits to top 32 - rlwimi r3,r4,0,0,31 ; Insert low part of 64-bit address in bottom 32 bits - b ml_phys_read_word_join - - .globl EXT(ml_phys_read) - .globl EXT(ml_phys_read_word) - -LEXT(ml_phys_read) -LEXT(ml_phys_read_word) - rlwinm r3,r3,0,0,31 ; truncate address to 32-bits -ml_phys_read_word_join: ; r3 = address to read (reg64_t) - mflr r11 ; Save the return - bl rdwrpre ; Get set up, translation/interrupts off, 64-bit on, etc. - - lwz r3,0(r3) ; Get the word - b rdwrpost ; Clean up and leave... - - -/* Read physical address double word - * - * unsigned long long ml_phys_read_double(vm_offset_t paddr) - * unsigned long long ml_phys_read_double_64(addr64_t paddr) - * - * Read the double word at physical address paddr. Memory should not be cache inhibited. - */ - -; Force a line boundry here - - .align 5 - .globl EXT(ml_phys_read_double_64) - -LEXT(ml_phys_read_double_64) - - rlwinm r3,r3,0,1,0 ; Copy low 32 bits to top 32 - rlwimi r3,r4,0,0,31 ; Insert low part of 64-bit address in bottom 32 bits - b ml_phys_read_double_join - - .globl EXT(ml_phys_read_double) - -LEXT(ml_phys_read_double) - rlwinm r3,r3,0,0,31 ; truncate address to 32-bits -ml_phys_read_double_join: ; r3 = address to read (reg64_t) - mflr r11 ; Save the return - bl rdwrpre ; Get set up, translation/interrupts off, 64-bit on, etc. - - lwz r4,4(r3) ; Get the low word - lwz r3,0(r3) ; Get the high word - b rdwrpost ; Clean up and leave... - - -/* Write physical address byte - * - * void ml_phys_write_byte(vm_offset_t paddr, unsigned int data) - * void ml_phys_write_byte_64(addr64_t paddr, unsigned int data) - * - * Write the byte at physical address paddr. Memory should not be cache inhibited. 
- */ - - .align 5 - .globl EXT(ml_phys_write_byte_64) - -LEXT(ml_phys_write_byte_64) - - rlwinm r3,r3,0,1,0 ; Copy low 32 bits to top 32 - rlwimi r3,r4,0,0,31 ; Insert low part of 64-bit address in bottom 32 bits - mr r4,r5 ; Copy over the data - b ml_phys_write_byte_join - - .globl EXT(ml_phys_write_byte) - -LEXT(ml_phys_write_byte) - rlwinm r3,r3,0,0,31 ; truncate address to 32-bits -ml_phys_write_byte_join: ; r3 = address to write (reg64_t), r4 = data - mflr r11 ; Save the return - bl rdwrpre ; Get set up, translation/interrupts off, 64-bit on, etc. - - stb r4,0(r3) ; Set the byte - b rdwrpost ; Clean up and leave... - - -/* Write physical address half word - * - * void ml_phys_write_half(vm_offset_t paddr, unsigned int data) - * void ml_phys_write_half_64(addr64_t paddr, unsigned int data) - * - * Write the half word at physical address paddr. Memory should not be cache inhibited. - */ - - .align 5 - .globl EXT(ml_phys_write_half_64) - -LEXT(ml_phys_write_half_64) - - rlwinm r3,r3,0,1,0 ; Copy low 32 bits to top 32 - rlwimi r3,r4,0,0,31 ; Insert low part of 64-bit address in bottom 32 bits - mr r4,r5 ; Copy over the data - b ml_phys_write_half_join - - .globl EXT(ml_phys_write_half) - -LEXT(ml_phys_write_half) - rlwinm r3,r3,0,0,31 ; truncate address to 32-bits -ml_phys_write_half_join: ; r3 = address to write (reg64_t), r4 = data - mflr r11 ; Save the return - bl rdwrpre ; Get set up, translation/interrupts off, 64-bit on, etc. - - sth r4,0(r3) ; Set the half word - b rdwrpost ; Clean up and leave... - - -/* Write physical address word - * - * void ml_phys_write(vm_offset_t paddr, unsigned int data) - * void ml_phys_write_64(addr64_t paddr, unsigned int data) - * void ml_phys_write_word(vm_offset_t paddr, unsigned int data) - * void ml_phys_write_word_64(addr64_t paddr, unsigned int data) - * - * Write the word at physical address paddr. Memory should not be cache inhibited. 
- */ - - .align 5 - .globl EXT(ml_phys_write_64) - .globl EXT(ml_phys_write_word_64) - -LEXT(ml_phys_write_64) -LEXT(ml_phys_write_word_64) - - rlwinm r3,r3,0,1,0 ; Copy low 32 bits to top 32 - rlwimi r3,r4,0,0,31 ; Insert low part of 64-bit address in bottom 32 bits - mr r4,r5 ; Copy over the data - b ml_phys_write_word_join - - .globl EXT(ml_phys_write) - .globl EXT(ml_phys_write_word) - -LEXT(ml_phys_write) -LEXT(ml_phys_write_word) - rlwinm r3,r3,0,0,31 ; truncate address to 32-bits -ml_phys_write_word_join: ; r3 = address to write (reg64_t), r4 = data - mflr r11 ; Save the return - bl rdwrpre ; Get set up, translation/interrupts off, 64-bit on, etc. - - stw r4,0(r3) ; Set the word - b rdwrpost ; Clean up and leave... - - -/* Write physical address double word - * - * void ml_phys_write_double(vm_offset_t paddr, unsigned long long data) - * void ml_phys_write_double_64(addr64_t paddr, unsigned long long data) - * - * Write the double word at physical address paddr. Memory should not be cache inhibited. - */ - - .align 5 - .globl EXT(ml_phys_write_double_64) - -LEXT(ml_phys_write_double_64) - - rlwinm r3,r3,0,1,0 ; Copy low 32 bits to top 32 - rlwimi r3,r4,0,0,31 ; Insert low part of 64-bit address in bottom 32 bits - mr r4,r5 ; Copy over the high data - mr r5,r6 ; Copy over the low data - b ml_phys_write_double_join - - .globl EXT(ml_phys_write_double) - -LEXT(ml_phys_write_double) - rlwinm r3,r3,0,0,31 ; truncate address to 32-bits -ml_phys_write_double_join: ; r3 = address to write (reg64_t), r4,r5 = data (long long) - mflr r11 ; Save the return - bl rdwrpre ; Get set up, translation/interrupts off, 64-bit on, etc. - - stw r4,0(r3) ; Set the high word - stw r5,4(r3) ; Set the low word - b rdwrpost ; Clean up and leave... 
- - - .align 5 - -rdwrpre: mfsprg r12,2 ; Get feature flags - lis r8,hi16(MASK(MSR_VEC)) ; Get the vector flag - mfmsr r10 ; Save the MSR - ori r8,r8,lo16(MASK(MSR_FP)) ; Add the FP flag - mtcrf 0x02,r12 ; move pf64Bit - andc r10,r10,r8 ; Clear VEC and FP - ori r9,r8,lo16(MASK(MSR_EE)|MASK(MSR_IR)|MASK(MSR_DR)) ; Drop EE, DR, and IR - li r2,1 ; Prepare for 64 bit - andc r9,r10,r9 ; Clear VEC, FP, DR, and EE - bf-- pf64Bitb,rdwrpre32 ; Join 32-bit code... - - srdi r7,r3,31 ; Get a 1 if address is in I/O memory - rldimi r9,r2,63,MSR_SF_BIT ; set SF bit (bit 0) - cmpldi cr7,r7,1 ; Is source in I/O memory? - mtmsrd r9 ; set 64-bit mode, turn off EE, DR, and IR - isync ; synchronize - - sldi r0,r2,32+8 ; Get the right bit to turn off caching - - bnelr++ cr7 ; We are not in the I/O area, all ready... - - mfspr r8,hid4 ; Get HID4 - or r2,r8,r0 ; Set bit to make real accesses cache-inhibited - sync ; Sync up - mtspr hid4,r2 ; Make real accesses cache-inhibited - isync ; Toss prefetches - - lis r7,0xE000 ; Get the unlikeliest ESID possible - srdi r7,r7,1 ; Make 0x7FFFFFFFF0000000 - slbie r7 ; Make sure the ERAT is cleared - - sync - isync - blr ; Finally, all ready... - - .align 5 - -rdwrpre32: rlwimi r9,r10,0,MSR_IR_BIT,MSR_IR_BIT ; Leave the IR bit unchanged - mtmsr r9 ; Drop EE, DR, and leave IR unchanged - isync - blr ; All set up, leave... - - .align 5 - -rdwrpost: mtlr r11 ; Restore the return - bt++ pf64Bitb,rdwrpost64 ; Join 64-bit code... - - mtmsr r10 ; Restore entry MSR (sans FP and VEC) - isync - blr ; Leave... - -rdwrpost64: bne++ cr7,rdwrpcok ; Skip enabling real mode caching if we did not change it... - - sync ; Sync up - mtspr hid4,r8 ; Make real accesses not cache-inhibited - isync ; Toss prefetches - - lis r7,0xE000 ; Get the unlikeliest ESID possible - srdi r7,r7,1 ; Make 0x7FFFFFFFF0000000 - slbie r7 ; Make sure the ERAT is cleared - -rdwrpcok: mtmsrd r10 ; Restore entry MSR (sans FP and VEC) - isync - blr ; Leave... 
- - -/* set interrupts enabled or disabled - * - * boolean_t set_interrupts_enabled(boolean_t enable) - * - * Set EE bit to "enable" and return old value as boolean - */ - -; Force a line boundry here - .align 5 - .globl EXT(ml_set_interrupts_enabled) - -LEXT(ml_set_interrupts_enabled) - - andi. r4,r3,1 ; Are we turning interruptions on? - lis r0,hi16(MASK(MSR_VEC)) ; Get vector enable - mfmsr r5 ; Get the current MSR - ori r0,r0,lo16(MASK(MSR_EE)|MASK(MSR_FP)) ; Get float enable and EE enable - rlwinm r3,r5,17,31,31 ; Set return value - andc r5,r5,r0 ; Force VEC and FP off - bne CheckPreemption ; Interrupts going on, check ASTs... - - mtmsr r5 ; Slam diable (always going disabled here) - isync ; Need this because FP/Vec might go off - blr - - .align 5 - -CheckPreemption: - mfsprg r9,1 ; Get current activation - lwz r7,ACT_PER_PROC(r9) ; Get the per_proc block - ori r5,r5,lo16(MASK(MSR_EE)) ; Turn on the enable - lwz r8,PP_PENDING_AST(r7) ; Get pending AST mask - li r6,AST_URGENT ; Get the type we will preempt for - lwz r7,ACT_PREEMPT_CNT(r9) ; Get preemption count - lis r0,hi16(DoPreemptCall) ; High part of Preempt FW call - cmpwi cr1,r7,0 ; Are preemptions masked off? - and. r8,r8,r6 ; Are we urgent? - crorc cr1_eq,cr0_eq,cr1_eq ; Remember if preemptions are masked or not urgent - ori r0,r0,lo16(DoPreemptCall) ; Bottome of FW call - - mtmsr r5 ; Restore the MSR now, before we can preempt - isync ; Need this because FP/Vec might go off - - beqlr++ cr1 ; Return if no premption... 
- sc ; Preempt - blr - -; Force a line boundry here - .align 5 - .globl EXT(timer_update) - -LEXT(timer_update) - stw r4,TIMER_HIGHCHK(r3) - eieio - stw r5,TIMER_LOW(r3) - eieio - stw r4,TIMER_HIGH(r3) - blr - -; Force a line boundry here - .align 5 - .globl EXT(timer_grab) - -LEXT(timer_grab) -0: lwz r11,TIMER_HIGH(r3) - lwz r4,TIMER_LOW(r3) - isync - lwz r9,TIMER_HIGHCHK(r3) - cmpw r11,r9 - bne-- 0b - mr r3,r11 - blr - -; Force a line boundry here - .align 5 - .globl EXT(thread_timer_event) - -LEXT(thread_timer_event) - mfsprg r10,1 ; Get the current activation - lwz r10,ACT_PER_PROC(r10) ; Get the per_proc block - addi r10,r10,PP_PROCESSOR - lwz r11,THREAD_TIMER(r10) - - lwz r9,TIMER_LOW(r11) - lwz r7,TIMER_TSTAMP(r11) - lwz r8,TIMER_TSTAMP+4(r11) - subfc r8,r8,r4 - subfe r7,r7,r3 - addc r8,r8,r9 - addze. r7,r7 - beq++ 0f - - lwz r6,TIMER_HIGH(r11) - add r7,r7,r6 - stw r7,TIMER_HIGHCHK(r11) - eieio - stw r8,TIMER_LOW(r11) - eieio - stw r7,TIMER_HIGH(r11) - b 1f - -0: stw r8,TIMER_LOW(r11) - -1: stw r5,THREAD_TIMER(r10) - stw r3,TIMER_TSTAMP(r5) - stw r4,TIMER_TSTAMP+4(r5) - blr - -; Force a line boundry here - .align 5 - .globl EXT(state_event) - -LEXT(state_event) - mfsprg r10,1 ; Get the current activation - lwz r10,ACT_PER_PROC(r10) ; Get the per_proc block - addi r10,r10,PP_PROCESSOR - lwz r11,CURRENT_STATE(r10) - - lwz r9,TIMER_LOW(r11) - lwz r7,TIMER_TSTAMP(r11) - lwz r8,TIMER_TSTAMP+4(r11) - subfc r8,r8,r4 - subfe r7,r7,r3 - addc r8,r8,r9 - addze. r7,r7 - beq++ 0f - - lwz r6,TIMER_HIGH(r11) - add r7,r7,r6 - stw r7,TIMER_HIGHCHK(r11) - eieio - stw r8,TIMER_LOW(r11) - eieio - stw r7,TIMER_HIGH(r11) - b 1f - -0: stw r8,TIMER_LOW(r11) - -1: stw r5,CURRENT_STATE(r10) - stw r3,TIMER_TSTAMP(r5) - stw r4,TIMER_TSTAMP+4(r5) - blr - -/* Set machine into idle power-saving mode. - * - * void machine_idle(void) - * - * We will use the PPC NAP or DOZE for this. - * This call always returns. Must be called with spllo (i.e., interruptions - * enabled). 
- * - */ - -; Force a line boundry here - .align 5 - .globl EXT(machine_idle) - -LEXT(machine_idle) - - mfsprg r12,1 ; Get the current activation - lwz r12,ACT_PER_PROC(r12) ; Get the per_proc block - lhz r10,PP_CPU_FLAGS(r12) ; Get the flags - lwz r11,PP_INTS_ENABLED(r12) ; Get interrupt enabled state - andi. r10,r10,SignalReady ; Are Signal ready? - cmpwi cr1,r11,0 ; Are interrupt disabled? - cror cr0_eq, cr1_eq, cr0_eq ; Interrupt disabled or Signal not ready? - mfmsr r3 ; Save the MSR - - beq-- nonap ; Yes, return after re-enabling interrupts - lis r0,hi16(MASK(MSR_VEC)) ; Get the vector flag - ori r0,r0,lo16(MASK(MSR_FP)) ; Add the FP flag - andc r3,r3,r0 ; Clear VEC and FP - ori r0,r0,lo16(MASK(MSR_EE)) ; Drop EE also - andc r5,r3,r0 ; Clear VEC, FP, DR, and EE - - mtmsr r5 ; Hold up interruptions for now - isync ; May have messed with fp/vec - mfsprg r11,2 ; Get CPU specific features - mfspr r6,hid0 ; Get the current power-saving mode - mtcrf 0xC7,r11 ; Get the facility flags - - lis r4,hi16(napm) ; Assume we can nap - bt pfWillNapb,yesnap ; Yeah, nap is ok... - - lis r4,hi16(dozem) ; Assume we can doze - bt pfCanDozeb,yesnap ; We can sleep or doze one this machine... - -nonap: ori r3,r3,lo16(MASK(MSR_EE)) ; Flip on EE - - mtmsr r3 ; Turn interruptions back on - blr ; Leave... - -yesnap: mftbu r9 ; Get the upper timebase - mftb r7 ; Get the lower timebase - mftbu r8 ; Get the upper one again - cmplw r9,r8 ; Did the top tick? - bne-- yesnap ; Yeah, need to get it again... - stw r8,napStamp(r12) ; Set high order time stamp - stw r7,napStamp+4(r12) ; Set low order nap stamp - - rlwinm. r0,r11,0,pfAltivecb,pfAltivecb ; Do we have altivec? - beq-- minovec ; No... - dssall ; Stop the streams before we nap/doze - sync - lwz r8,napStamp(r12) ; Reload high order time stamp -clearpipe: - cmplw r8,r8 - bne- clearpipe - isync - -minovec: rlwinm. r7,r11,0,pfNoL2PFNapb,pfNoL2PFNapb ; Turn off L2 Prefetch before nap? 
- beq++ miL2PFok - - mfspr r7,msscr0 ; Get currect MSSCR0 value - rlwinm r7,r7,0,0,l2pfes-1 ; Disable L2 Prefetch - mtspr msscr0,r7 ; Updates MSSCR0 value - sync - isync - -miL2PFok: - rlwinm. r7,r11,0,pfSlowNapb,pfSlowNapb ; Should nap at slow speed? - beq minoslownap - - mfspr r7,hid1 ; Get current HID1 value - oris r7,r7,hi16(hid1psm) ; Select PLL1 - mtspr hid1,r7 ; Update HID1 value - - -; -; We have to open up interruptions here because book 4 says that we should -; turn on only the POW bit and that we should have interrupts enabled. -; The interrupt handler will detect that nap or doze is set if an interrupt -; is taken and set everything up to return directly to machine_idle_ret. -; So, make sure everything we need there is already set up... -; - -minoslownap: - lis r10,hi16(dozem|napm|sleepm) ; Mask of power management bits - - bf-- pf64Bitb,mipNSF1 ; skip if 32-bit... - - sldi r4,r4,32 ; Position the flags - sldi r10,r10,32 ; Position the masks - -mipNSF1: li r2,lo16(MASK(MSR_DR)|MASK(MSR_IR)) ; Get the translation mask - andc r6,r6,r10 ; Clean up the old power bits - ori r7,r5,lo16(MASK(MSR_EE)) ; Flip on EE to make exit msr - andc r5,r5,r2 ; Clear IR and DR from current MSR - or r6,r6,r4 ; Set nap or doze - ori r5,r5,lo16(MASK(MSR_EE)) ; Flip on EE to make nap msr - oris r2,r5,hi16(MASK(MSR_POW)) ; Turn on power management in next MSR - - sync - mtspr hid0,r6 ; Set up the HID for nap/doze - mfspr r6,hid0 ; Yes, this is silly, keep it here - mfspr r6,hid0 ; Yes, this is a duplicate, keep it here - mfspr r6,hid0 ; Yes, this is a duplicate, keep it here - mfspr r6,hid0 ; Yes, this is a duplicate, keep it here - mfspr r6,hid0 ; Yes, this is a duplicate, keep it here - mfspr r6,hid0 ; Yes, this is a duplicate, keep it here - isync ; Make sure it is set - - -; -; Turn translation off to nap -; - - bt pfNoMSRirb,miNoMSR ; Jump if we need to use SC for this... 
- mtmsr r5 ; Turn translation off, interrupts on - isync ; Wait for it - b miNoMSRx ; Jump back in line... - -miNoMSR: mr r3,r5 ; Pass in the new MSR value - li r0,loadMSR ; MSR setter ultrafast - sc ; Do it to it like you never done before... - -miNoMSRx: bf-- pf64Bitb,mipowloop ; skip if 32-bit... - - li r3,0x10 ; Fancy nap threshold is 0x10 ticks - mftb r8 ; Get the low half of the time base - mfdec r4 ; Get the decrementer ticks - cmplw r4,r3 ; Less than threshold? - blt mipowloop - - mtdec r3 ; Load decrementer with threshold - isync ; and make sure, - mfdec r3 ; really sure, it gets there - - rlwinm r6,r2,0,MSR_EE_BIT+1,MSR_EE_BIT-1 ; Clear out the EE bit - sync ; Make sure queues are clear - mtmsr r6 ; Set MSR with EE off but POW on - isync ; Make sure this takes before we proceed - - mftb r9 ; Get the low half of the time base - sub r9,r9,r8 ; Get the number of ticks spent waiting - sub r4,r4,r9 ; Adjust the decrementer value - - mtdec r4 ; Load decrementer with the rest of the timeout - isync ; and make sure, - mfdec r4 ; really sure, it gets there - -mipowloop: - sync ; Make sure queues are clear - mtmsr r2 ; Nap or doze, MSR with POW, EE set, translation off - isync ; Make sure this takes before we proceed - b mipowloop ; loop if POW does not take - -; -; Note that the interrupt handler will turn off the nap/doze bits in the hid. -; Also remember that the interrupt handler will force return to here whenever -; the nap/doze bits are set. -; - .globl EXT(machine_idle_ret) -LEXT(machine_idle_ret) - mtmsr r7 ; Make sure the MSR is what we want - isync ; In case we turn on translation -; -; Protect against a lost decrementer trap if the current decrementer value is negative -; by more than 10 ticks, re-arm it since it is unlikely to fire at this point... 
-; A hardware interrupt got us out of machine_idle and may also be contributing to this state -; - mfdec r6 ; Get decrementer - cmpwi cr0,r6,-10 ; Compare decrementer with -10 - bgelr++ ; Return if greater - li r0,1 ; Load 1 - mtdec r0 ; Set decrementer to 1 - blr ; Return... - -/* Put machine to sleep. - * This call never returns. We always exit sleep via a soft reset. - * All external interruptions must be drained at this point and disabled. - * - * void ml_ppc_do_sleep(void) - * - * We will use the PPC SLEEP for this. - * - * There is one bit of hackery in here: we need to enable for - * interruptions when we go to sleep and there may be a pending - * decrimenter rupt. So we make the decrimenter 0x7FFFFFFF and enable for - * interruptions. The decrimenter rupt vector recognizes this and returns - * directly back here. - * - */ - -; Force a line boundry here - .align 5 - .globl EXT(ml_ppc_do_sleep) - -LEXT(ml_ppc_do_sleep) - -#if 0 - mfmsr r5 ; Hack to spin instead of sleep - rlwinm r5,r5,0,MSR_DR_BIT+1,MSR_IR_BIT-1 ; Turn off translation - rlwinm r5,r5,0,MSR_EE_BIT+1,MSR_EE_BIT-1 ; Turn off interruptions - mtmsr r5 ; No talking - isync - -deadsleep: addi r3,r3,1 ; Make analyzer happy - addi r3,r3,1 - addi r3,r3,1 - b deadsleep ; Die the death of 1000 joys... -#endif - - mfsprg r12,1 ; Get the current activation - lwz r12,ACT_PER_PROC(r12) ; Get the per_proc block - mfsprg r11,2 ; Get CPU specific features - eqv r10,r10,r10 ; Get all foxes - mtcrf 0x04,r11 ; move pfNoMSRirb to cr5 - mfspr r4,hid0 ; Get the current power-saving mode - mtcrf 0x02,r11 ; move pf64Bit to cr6 - - rlwinm. r5,r11,0,pfNoL2PFNapb,pfNoL2PFNapb ; Turn off L2 Prefetch before sleep? - beq mpsL2PFok - - mfspr r5,msscr0 ; Get currect MSSCR0 value - rlwinm r5,r5,0,0,l2pfes-1 ; Disable L2 Prefetch - mtspr msscr0,r5 ; Updates MSSCR0 value - sync - isync - -mpsL2PFok: - bt++ pf64Bitb,mpsPF64bit ; PM bits are shifted on 64bit systems. 
- - rlwinm r4,r4,0,sleep+1,doze-1 ; Clear all possible power-saving modes (not DPM though) - oris r4,r4,hi16(sleepm) ; Set sleep - b mpsClearDEC - -mpsPF64bit: - lis r5, hi16(dozem|napm|sleepm) ; Clear all possible power-saving modes (not DPM though) - sldi r5, r5, 32 - andc r4, r4, r5 - lis r5, hi16(napm) ; Set sleep - sldi r5, r5, 32 - or r4, r4, r5 - -mpsClearDEC: - mfmsr r5 ; Get the current MSR - rlwinm r10,r10,0,1,31 ; Make 0x7FFFFFFF - mtdec r10 ; Load decrimenter with 0x7FFFFFFF - isync ; and make sure, - mfdec r9 ; really sure, it gets there - - li r2,1 ; Prepare for 64 bit - rlwinm r5,r5,0,MSR_DR_BIT+1,MSR_IR_BIT-1 ; Turn off translation -; -; Note that we need translation off before we set the HID to sleep. Otherwise -; we will ignore any PTE misses that occur and cause an infinite loop. -; - bf++ pf64Bitb,mpsCheckMSR ; check 64-bit processor - rldimi r5,r2,63,MSR_SF_BIT ; set SF bit (bit 0) - mtmsrd r5 ; set 64-bit mode, turn off EE, DR, and IR - isync ; Toss prefetch - b mpsNoMSRx - -mpsCheckMSR: - bt pfNoMSRirb,mpsNoMSR ; No MSR... 
- - mtmsr r5 ; Translation off - isync ; Toss prefetch - b mpsNoMSRx - -mpsNoMSR: - li r0,loadMSR ; Get the MSR setter SC - mr r3,r5 ; Get new MSR - sc ; Set it -mpsNoMSRx: - - ori r3,r5,lo16(MASK(MSR_EE)) ; Flip on EE - sync - mtspr hid0,r4 ; Set up the HID to sleep - mfspr r4,hid0 ; Yes, this is silly, keep it here - mfspr r4,hid0 ; Yes, this is a duplicate, keep it here - mfspr r4,hid0 ; Yes, this is a duplicate, keep it here - mfspr r4,hid0 ; Yes, this is a duplicate, keep it here - mfspr r4,hid0 ; Yes, this is a duplicate, keep it here - mfspr r4,hid0 ; Yes, this is a duplicate, keep it here - - mtmsr r3 ; Enable for interrupts to drain decrimenter - - add r6,r4,r5 ; Just waste time - add r6,r6,r4 ; A bit more - add r6,r6,r5 ; A bit more - - mtmsr r5 ; Interruptions back off - isync ; Toss prefetch - -; -; We are here with translation off, interrupts off, all possible -; interruptions drained off, and a decrimenter that will not pop. -; - - bl EXT(cacheInit) ; Clear out the caches. This will leave them on - bl EXT(cacheDisable) ; Turn off all caches - - mfmsr r5 ; Get the current MSR - oris r5,r5,hi16(MASK(MSR_POW)) ; Turn on power management in next MSR - ; Leave EE off because power goes off shortly - mfsprg r12,0 ; Get the per_proc_info - li r10,PP_CPU_FLAGS - lhz r11,PP_CPU_FLAGS(r12) ; Get the flags - ori r11,r11,SleepState ; Marked SleepState - sth r11,PP_CPU_FLAGS(r12) ; Set the flags - dcbf r10,r12 - - mfsprg r11,2 ; Get CPU specific features - rlwinm. r0,r11,0,pf64Bitb,pf64Bitb ; Test for 64 bit processor - eqv r4,r4,r4 ; Get all foxes - rlwinm r4,r4,0,1,31 ; Make 0x7FFFFFFF - beq slSleepNow ; skip if 32-bit... 
- li r3, 0x4000 ; Cause decrimenter to roll over soon - mtdec r3 ; Load decrimenter with 0x00004000 - isync ; and make sure, - mfdec r3 ; really sure, it gets there - -slSleepNow: - sync ; Sync it all up - mtmsr r5 ; Do sleep with interruptions enabled - isync ; Take a pill - mtdec r4 ; Load decrimenter with 0x7FFFFFFF - isync ; and make sure, - mfdec r3 ; really sure, it gets there - b slSleepNow ; Go back to sleep if we wake up... - - - -/* Initialize all caches including the TLBs - * - * void cacheInit(void) - * - * This is used to force the caches to an initial clean state. First, we - * check if the cache is on, if so, we need to flush the contents to memory. - * Then we invalidate the L1. Next, we configure and invalidate the L2 etc. - * Finally we turn on all of the caches - * - * Note that if translation is not disabled when this is called, the TLB will not - * be completely clear after return. - * - */ - -; Force a line boundry here - .align 5 - .globl EXT(cacheInit) - -LEXT(cacheInit) - - mfsprg r12,0 ; Get the per_proc_info - mfspr r9,hid0 ; Get the current power-saving mode - - mfsprg r11,2 ; Get CPU specific features - mfmsr r7 ; Get the current MSR - rlwinm r7,r7,0,MSR_FP_BIT+1,MSR_FP_BIT-1 ; Force floating point off - rlwinm r7,r7,0,MSR_VEC_BIT+1,MSR_VEC_BIT-1 ; Force vectors off - rlwimi r11,r11,pfLClckb+1,31,31 ; Move pfLClck to another position (to keep from using non-volatile CRs) - rlwinm r5,r7,0,MSR_DR_BIT+1,MSR_IR_BIT-1 ; Turn off translation - rlwinm r5,r5,0,MSR_EE_BIT+1,MSR_EE_BIT-1 ; Turn off interruptions - mtcrf 0x87,r11 ; Get the feature flags - lis r10,hi16(dozem|napm|sleepm|dpmm) ; Mask of power management bits - bf-- pf64Bitb,cIniNSF1 ; Skip if 32-bit... 
- - sldi r10,r10,32 ; Position the masks - -cIniNSF1: andc r4,r9,r10 ; Clean up the old power bits - mtspr hid0,r4 ; Set up the HID - mfspr r4,hid0 ; Yes, this is silly, keep it here - mfspr r4,hid0 ; Yes, this is a duplicate, keep it here - mfspr r4,hid0 ; Yes, this is a duplicate, keep it here - mfspr r4,hid0 ; Yes, this is a duplicate, keep it here - mfspr r4,hid0 ; Yes, this is a duplicate, keep it here - mfspr r4,hid0 ; Yes, this is a duplicate, keep it here - - bt pfNoMSRirb,ciNoMSR ; No MSR... - - mtmsr r5 ; Translation and all off - isync ; Toss prefetch - b ciNoMSRx - -ciNoMSR: - li r0,loadMSR ; Get the MSR setter SC - mr r3,r5 ; Get new MSR - sc ; Set it -ciNoMSRx: - - bf pfAltivecb,cinoDSS ; No Altivec here... - - dssall ; Stop streams - sync - -cinoDSS: li r5,tlbieLock ; Get the TLBIE lock - li r0,128 ; Get number of TLB entries - - li r6,0 ; Start at 0 - bf-- pf64Bitb,citlbhang ; Skip if 32-bit... - li r0,1024 ; Get the number of TLB entries - -citlbhang: lwarx r2,0,r5 ; Get the TLBIE lock - mr. r2,r2 ; Is it locked? - bne- citlbhang ; It is locked, go wait... - stwcx. r0,0,r5 ; Try to get it - bne- citlbhang ; We was beat... - - mtctr r0 ; Set the CTR - -cipurgeTLB: tlbie r6 ; Purge this entry - addi r6,r6,4096 ; Next page - bdnz cipurgeTLB ; Do them all... - - mtcrf 0x80,r11 ; Set SMP capability - sync ; Make sure all TLB purges are done - eieio ; Order, order in the court - - bf pfSMPcapb,cinoSMP ; SMP incapable... - - tlbsync ; Sync all TLBs - sync - isync - - bf-- pf64Bitb,cinoSMP ; Skip if 32-bit... - ptesync ; Wait for quiet again - sync - -cinoSMP: stw r2,tlbieLock(0) ; Unlock TLBIE lock - - bt++ pf64Bitb,cin64 ; Skip if 64-bit... - - rlwinm. r0,r9,0,ice,dce ; Were either of the level 1s on? - beq- cinoL1 ; No, no need to flush... - - rlwinm. r0,r11,0,pfL1fab,pfL1fab ; do we have L1 flush assist? - beq ciswdl1 ; If no hw flush assist, go do by software... 
- - mfspr r8,msscr0 ; Get the memory system control register - oris r8,r8,hi16(dl1hwfm) ; Turn on the hardware flush request - - mtspr msscr0,r8 ; Start the flush operation - -ciwdl1f: mfspr r8,msscr0 ; Get the control register again - - rlwinm. r8,r8,0,dl1hwf,dl1hwf ; Has the flush request been reset yet? - bne ciwdl1f ; No, flush is still in progress... - b ciinvdl1 ; Go invalidate l1... - -; -; We need to either make this very complicated or to use ROM for -; the flush. The problem is that if during the following sequence a -; snoop occurs that invalidates one of the lines in the cache, the -; PLRU sequence will be altered making it possible to miss lines -; during the flush. So, we either need to dedicate an area of RAM -; to each processor, lock use of a RAM area, or use ROM. ROM is -; by far the easiest. Note that this is not an issue for machines -; that have harware flush assists. -; - -ciswdl1: lwz r0,pfl1dSize(r12) ; Get the level 1 cache size - - bf 31,cisnlck ; Skip if pfLClck not set... - - mfspr r4,msscr0 ; ? - rlwinm r6,r4,0,0,l2pfes-1 ; ? - mtspr msscr0,r6 ; Set it - sync - isync - - mfspr r8,ldstcr ; Save the LDSTCR - li r2,1 ; Get a mask of 0x01 - lis r3,0xFFF0 ; Point to ROM - rlwinm r11,r0,29,3,31 ; Get the amount of memory to handle all indexes - - li r6,0 ; Start here - -cisiniflsh: dcbf r6,r3 ; Flush each line of the range we use - addi r6,r6,32 ; Bump to the next - cmplw r6,r0 ; Have we reached the end? - blt+ cisiniflsh ; Nope, continue initial flush... - - sync ; Make sure it is done - - addi r11,r11,-1 ; Get mask for index wrap - li r6,0 ; Get starting offset - -cislckit: not r5,r2 ; Lock all but 1 way - rlwimi r5,r8,0,0,23 ; Build LDSTCR - mtspr ldstcr,r5 ; Lock a way - sync ; Clear out memory accesses - isync ; Wait for all - - -cistouch: lwzx r10,r3,r6 ; Pick up some trash - addi r6,r6,32 ; Go to the next index - and. r0,r6,r11 ; See if we are about to do next index - bne+ cistouch ; Nope, do more... 
- - sync ; Make sure it is all done - isync - - sub r6,r6,r11 ; Back up to start + 1 - addi r6,r6,-1 ; Get it right - -cisflush: dcbf r3,r6 ; Flush everything out - addi r6,r6,32 ; Go to the next index - and. r0,r6,r11 ; See if we are about to do next index - bne+ cisflush ; Nope, do more... - - sync ; Make sure it is all done - isync - - - rlwinm. r2,r2,1,24,31 ; Shift to next way - bne+ cislckit ; Do this for all ways... - - mtspr ldstcr,r8 ; Slam back to original - sync - isync - - mtspr msscr0,r4 ; ? - sync - isync - - b cinoL1 ; Go on to level 2... - - -cisnlck: rlwinm r2,r0,0,1,30 ; Double cache size - add r0,r0,r2 ; Get 3 times cache size - rlwinm r0,r0,26,6,31 ; Get 3/2 number of cache lines - lis r3,0xFFF0 ; Dead recon ROM address for now - mtctr r0 ; Number of lines to flush - -ciswfldl1a: lwz r2,0(r3) ; Flush anything else - addi r3,r3,32 ; Next line - bdnz ciswfldl1a ; Flush the lot... - -ciinvdl1: sync ; Make sure all flushes have been committed - - mfspr r8,hid0 ; Get the HID0 bits - rlwinm r8,r8,0,dce+1,ice-1 ; Clear cache enables - mtspr hid0,r8 ; and turn off L1 cache - sync ; Make sure all is done - isync - - ori r8,r8,lo16(icem|dcem|icfim|dcfim) ; Set the HID0 bits for enable, and invalidate - sync - isync - - mtspr hid0,r8 ; Start the invalidate and turn on cache - rlwinm r8,r8,0,dcfi+1,icfi-1 ; Turn off the invalidate bits - mtspr hid0,r8 ; Turn off the invalidate (needed for some older machines) - sync - - -cinoL1: -; -; Flush and disable the level 2 -; - mfsprg r10,2 ; need to check 2 features we did not put in CR - rlwinm. r0,r10,0,pfL2b,pfL2b ; do we have L2? - beq cinol2 ; No level 2 cache to flush - - mfspr r8,l2cr ; Get the L2CR - lwz r3,pfl2cr(r12) ; Get the L2CR value - rlwinm. r0,r8,0,l2e,l2e ; Was the L2 enabled? - bne ciflushl2 ; Yes, force flush - cmplwi r8, 0 ; Was the L2 all the way off? - beq ciinvdl2 ; Yes, force invalidate - lis r0,hi16(l2sizm|l2clkm|l2ramm|l2ohm) ; Get confiuration bits - xor r2,r8,r3 ; Get changing bits? 
- ori r0,r0,lo16(l2slm|l2dfm|l2bypm) ; More config bits - and. r0,r0,r2 ; Did any change? - bne- ciinvdl2 ; Yes, just invalidate and get PLL synced... - -ciflushl2: - rlwinm. r0,r10,0,pfL2fab,pfL2fab ; hardware-assisted L2 flush? - beq ciswfl2 ; Flush not in hardware... - - mr r10,r8 ; Take a copy now - - bf 31,cinol2lck ; Skip if pfLClck not set... - - oris r10,r10,hi16(l2ionlym|l2donlym) ; Set both instruction- and data-only - sync - mtspr l2cr,r10 ; Lock out the cache - sync - isync - -cinol2lck: ori r10,r10,lo16(l2hwfm) ; Request flush - sync ; Make sure everything is done - - mtspr l2cr,r10 ; Request flush - -cihwfl2: mfspr r10,l2cr ; Get back the L2CR - rlwinm. r10,r10,0,l2hwf,l2hwf ; Is the flush over? - bne+ cihwfl2 ; Nope, keep going... - b ciinvdl2 ; Flush done, go invalidate L2... - -ciswfl2: - lwz r0,pfl2Size(r12) ; Get the L2 size - oris r2,r8,hi16(l2dom) ; Set L2 to data only mode - - b ciswfl2doa ; Branch to next line... - - .align 5 -ciswfl2doc: - mtspr l2cr,r2 ; Disable L2 - sync - isync - b ciswfl2dod ; It is off, go invalidate it... - -ciswfl2doa: - b ciswfl2dob ; Branch to next... - -ciswfl2dob: - sync ; Finish memory stuff - isync ; Stop speculation - b ciswfl2doc ; Jump back up and turn on data only... -ciswfl2dod: - rlwinm r0,r0,27,5,31 ; Get the number of lines - lis r10,0xFFF0 ; Dead recon ROM for now - mtctr r0 ; Set the number of lines - -ciswfldl2a: lwz r0,0(r10) ; Load something to flush something - addi r10,r10,32 ; Next line - bdnz ciswfldl2a ; Do the lot... - -ciinvdl2: rlwinm r8,r3,0,l2e+1,31 ; Clear the enable bit - b cinla ; Branch to next line... - - .align 5 -cinlc: mtspr l2cr,r8 ; Disable L2 - sync - isync - b ciinvl2 ; It is off, go invalidate it... - -cinla: b cinlb ; Branch to next... - -cinlb: sync ; Finish memory stuff - isync ; Stop speculation - b cinlc ; Jump back up and turn off cache... - -ciinvl2: sync - isync - - cmplwi r3, 0 ; Should the L2 be all the way off? 
- beq cinol2 ; Yes, done with L2 - - oris r2,r8,hi16(l2im) ; Get the invalidate flag set - - mtspr l2cr,r2 ; Start the invalidate - sync - isync -ciinvdl2a: mfspr r2,l2cr ; Get the L2CR - mfsprg r0,2 ; need to check a feature in "non-volatile" set - rlwinm. r0,r0,0,pfL2ib,pfL2ib ; flush in HW? - beq ciinvdl2b ; Flush not in hardware... - rlwinm. r2,r2,0,l2i,l2i ; Is the invalidate still going? - bne+ ciinvdl2a ; Assume so, this will take a looong time... - sync - b cinol2 ; No level 2 cache to flush -ciinvdl2b: - rlwinm. r2,r2,0,l2ip,l2ip ; Is the invalidate still going? - bne+ ciinvdl2a ; Assume so, this will take a looong time... - sync - mtspr l2cr,r8 ; Turn off the invalidate request - -cinol2: - -; -; Flush and enable the level 3 -; - bf pfL3b,cinol3 ; No level 3 cache to flush - - mfspr r8,l3cr ; Get the L3CR - lwz r3,pfl3cr(r12) ; Get the L3CR value - rlwinm. r0,r8,0,l3e,l3e ; Was the L3 enabled? - bne ciflushl3 ; Yes, force flush - cmplwi r8, 0 ; Was the L3 all the way off? - beq ciinvdl3 ; Yes, force invalidate - lis r0,hi16(l3pem|l3sizm|l3dxm|l3clkm|l3spom|l3ckspm) ; Get configuration bits - xor r2,r8,r3 ; Get changing bits? - ori r0,r0,lo16(l3pspm|l3repm|l3rtm|l3cyam|l3dmemm|l3dmsizm) ; More config bits - and. r0,r0,r2 ; Did any change? - bne- ciinvdl3 ; Yes, just invalidate and get PLL synced... - -ciflushl3: - sync ; 7450 book says do this even though not needed - mr r10,r8 ; Take a copy now - - bf 31,cinol3lck ; Skip if pfL23lck not set... - - oris r10,r10,hi16(l3iom) ; Set instruction-only - ori r10,r10,lo16(l3donlym) ; Set data-only - sync - mtspr l3cr,r10 ; Lock out the cache - sync - isync - -cinol3lck: ori r10,r10,lo16(l3hwfm) ; Request flush - sync ; Make sure everything is done - - mtspr l3cr,r10 ; Request flush - -cihwfl3: mfspr r10,l3cr ; Get back the L3CR - rlwinm. r10,r10,0,l3hwf,l3hwf ; Is the flush over? - bne+ cihwfl3 ; Nope, keep going... 
- -ciinvdl3: rlwinm r8,r3,0,l3e+1,31 ; Clear the enable bit - sync ; Make sure of life, liberty, and justice - mtspr l3cr,r8 ; Disable L3 - sync - - cmplwi r3, 0 ; Should the L3 be all the way off? - beq cinol3 ; Yes, done with L3 - - ori r8,r8,lo16(l3im) ; Get the invalidate flag set - - mtspr l3cr,r8 ; Start the invalidate - -ciinvdl3b: mfspr r8,l3cr ; Get the L3CR - rlwinm. r8,r8,0,l3i,l3i ; Is the invalidate still going? - bne+ ciinvdl3b ; Assume so... - sync - - lwz r10, pfBootConfig(r12) ; ? - rlwinm. r10, r10, 24, 28, 31 ; ? - beq ciinvdl3nopdet ; ? - - mfspr r8,l3pdet ; ? - srw r2, r8, r10 ; ? - rlwimi r2, r8, 0, 24, 31 ; ? - subfic r10, r10, 32 ; ? - li r8, -1 ; ? - ori r2, r2, 0x0080 ; ? - slw r8, r8, r10 ; ? - or r8, r2, r8 ; ? - mtspr l3pdet, r8 ; ? - isync - -ciinvdl3nopdet: - mfspr r8,l3cr ; Get the L3CR - rlwinm r8,r8,0,l3clken+1,l3clken-1 ; Clear the clock enable bit - mtspr l3cr,r8 ; Disable the clock - - li r2,128 ; ? -ciinvdl3c: addi r2,r2,-1 ; ? - cmplwi r2,0 ; ? - bne+ ciinvdl3c - - mfspr r10,msssr0 ; ? - rlwinm r10,r10,0,vgL3TAG+1,vgL3TAG-1 ; ? - mtspr msssr0,r10 ; ? - sync - - mtspr l3cr,r3 ; Enable it as desired - sync -cinol3: - mfsprg r0,2 ; need to check a feature in "non-volatile" set - rlwinm. r0,r0,0,pfL2b,pfL2b ; is there an L2 cache? - beq cinol2a ; No level 2 cache to enable - - lwz r3,pfl2cr(r12) ; Get the L2CR value - cmplwi r3, 0 ; Should the L2 be all the way off? - beq cinol2a : Yes, done with L2 - mtspr l2cr,r3 ; Enable it as desired - sync - -; -; Invalidate and turn on L1s -; - -cinol2a: - bt 31,cinoexit ; Skip if pfLClck set... 
- - rlwinm r8,r9,0,dce+1,ice-1 ; Clear the I- and D- cache enables - mtspr hid0,r8 ; Turn off dem caches - sync - - ori r8,r9,lo16(icem|dcem|icfim|dcfim) ; Set the HID0 bits for enable, and invalidate - rlwinm r9,r8,0,dcfi+1,icfi-1 ; Turn off the invalidate bits - sync - isync - - mtspr hid0,r8 ; Start the invalidate and turn on L1 cache - -cinoexit: mtspr hid0,r9 ; Turn off the invalidate (needed for some older machines) and restore entry conditions - sync - mtmsr r7 ; Restore MSR to entry - isync - blr ; Return... - - -; -; Handle 64-bit architecture -; This processor can not run without caches, so we just push everything out -; and flush. It will be relativily clean afterwards -; - - .align 5 - -cin64: - mfspr r10,hid1 ; Save hid1 - mfspr r4,hid4 ; Save hid4 - mr r12,r10 ; Really save hid1 - mr r11,r4 ; Get a working copy of hid4 - - li r0,0 ; Get a 0 - eqv r2,r2,r2 ; Get all foxes - - rldimi r10,r0,55,7 ; Clear I$ prefetch bits (7:8) - - isync - mtspr hid1,r10 ; Stick it - mtspr hid1,r10 ; Stick it again - isync - - rldimi r11,r2,38,25 ; Disable D$ prefetch (25:25) - - sync - mtspr hid4,r11 ; Stick it - isync - - li r3,8 ; Set bit 28+32 - sldi r3,r3,32 ; Make it bit 28 - or r3,r3,r11 ; Turn on the flash invalidate L1D$ - - oris r5,r11,0x0600 ; Set disable L1D$ bits - sync - mtspr hid4,r3 ; Invalidate - isync - - mtspr hid4,r5 ; Un-invalidate and disable L1D$ - isync - - lis r8,GUSModeReg ; Get the GUS mode ring address - mfsprg r0,2 ; Get the feature flags - ori r8,r8,0x8000 ; Set to read data - rlwinm. 
r0,r0,pfSCOMFixUpb+1,31,31 ; Set shift if we need a fix me up - - sync - - mtspr scomc,r8 ; Request the GUS mode - mfspr r11,scomd ; Get the GUS mode - mfspr r8,scomc ; Get back the status (we just ignore it) - sync - isync - - sld r11,r11,r0 ; Fix up if needed - - ori r6,r11,lo16(GUSMdmapen) ; Set the bit that means direct L2 cache address - lis r8,GUSModeReg ; Get GUS mode register address - - sync - - mtspr scomd,r6 ; Set that we want direct L2 mode - mtspr scomc,r8 ; Tell GUS we want direct L2 mode - mfspr r3,scomc ; Get back the status - sync - isync - - li r3,0 ; Clear start point - -cflushlp: lis r6,0x0040 ; Pick 4MB line as our target - or r6,r6,r3 ; Put in the line offset - lwz r5,0(r6) ; Load a line - addis r6,r6,8 ; Roll bit 42:44 - lwz r5,0(r6) ; Load a line - addis r6,r6,8 ; Roll bit 42:44 - lwz r5,0(r6) ; Load a line - addis r6,r6,8 ; Roll bit 42:44 - lwz r5,0(r6) ; Load a line - addis r6,r6,8 ; Roll bit 42:44 - lwz r5,0(r6) ; Load a line - addis r6,r6,8 ; Roll bit 42:44 - lwz r5,0(r6) ; Load a line - addis r6,r6,8 ; Roll bit 42:44 - lwz r5,0(r6) ; Load a line - addis r6,r6,8 ; Roll bit 42:44 - lwz r5,0(r6) ; Load a line - - addi r3,r3,128 ; Next line - andis. r5,r3,8 ; Have we done enough? - beq++ cflushlp ; Not yet... - - sync - - lis r6,0x0040 ; Pick 4MB line as our target - -cflushx: dcbf 0,r6 ; Flush line and invalidate - addi r6,r6,128 ; Next line - andis. r5,r6,0x0080 ; Have we done enough? - beq++ cflushx ; Keep on flushing... - - mr r3,r10 ; Copy current hid1 - rldimi r3,r2,54,9 ; Set force icbi match mode - - li r6,0 ; Set start if ICBI range - isync - mtspr hid1,r3 ; Stick it - mtspr hid1,r3 ; Stick it again - isync - -cflicbi: icbi 0,r6 ; Kill I$ - addi r6,r6,128 ; Next line - andis. r5,r6,1 ; Have we done them all? - beq++ cflicbi ; Not yet... 
- - lis r8,GUSModeReg ; Get GUS mode register address - - sync - - mtspr scomd,r11 ; Set that we do not want direct mode - mtspr scomc,r8 ; Tell GUS we do not want direct mode - mfspr r3,scomc ; Get back the status - sync - isync - - isync - mtspr hid0,r9 ; Restore entry hid0 - mfspr r9,hid0 ; Yes, this is silly, keep it here - mfspr r9,hid0 ; Yes, this is a duplicate, keep it here - mfspr r9,hid0 ; Yes, this is a duplicate, keep it here - mfspr r9,hid0 ; Yes, this is a duplicate, keep it here - mfspr r9,hid0 ; Yes, this is a duplicate, keep it here - mfspr r9,hid0 ; Yes, this is a duplicate, keep it here - isync - - isync - mtspr hid1,r12 ; Restore entry hid1 - mtspr hid1,r12 ; Stick it again - isync - - sync - mtspr hid4,r4 ; Restore entry hid4 - isync - - sync - mtmsr r7 ; Restore MSR to entry - isync - blr ; Return... - - - -/* Disables all caches - * - * void cacheDisable(void) - * - * Turns off all caches on the processor. They are not flushed. - * - */ - -; Force a line boundry here - .align 5 - .globl EXT(cacheDisable) - -LEXT(cacheDisable) - - mfsprg r11,2 ; Get CPU specific features - mtcrf 0x83,r11 ; Set feature flags - - bf pfAltivecb,cdNoAlt ; No vectors... - - dssall ; Stop streams - -cdNoAlt: sync - - btlr pf64Bitb ; No way to disable a 64-bit machine... - - mfspr r5,hid0 ; Get the hid - rlwinm r5,r5,0,dce+1,ice-1 ; Clear the I- and D- cache enables - mtspr hid0,r5 ; Turn off dem caches - sync - - rlwinm. r0,r11,0,pfL2b,pfL2b ; is there an L2? - beq cdNoL2 ; Skip if no L2... - - mfspr r5,l2cr ; Get the L2 - rlwinm r5,r5,0,l2e+1,31 ; Turn off enable bit - - b cinlaa ; Branch to next line... - - .align 5 -cinlcc: mtspr l2cr,r5 ; Disable L2 - sync - isync - b cdNoL2 ; It is off, we are done... - -cinlaa: b cinlbb ; Branch to next... - -cinlbb: sync ; Finish memory stuff - isync ; Stop speculation - b cinlcc ; Jump back up and turn off cache... - -cdNoL2: - - bf pfL3b,cdNoL3 ; Skip down if no L3... 
- - mfspr r5,l3cr ; Get the L3 - rlwinm r5,r5,0,l3e+1,31 ; Turn off enable bit - rlwinm r5,r5,0,l3clken+1,l3clken-1 ; Turn off cache enable bit - mtspr l3cr,r5 ; Disable the caches - sync - -cdNoL3: - blr ; Leave... - - -/* Initialize processor thermal monitoring - * void ml_thrm_init(void) - * - * Obsolete, deprecated and will be removed. - */ - -; Force a line boundry here - .align 5 - .globl EXT(ml_thrm_init) - -LEXT(ml_thrm_init) - blr - -/* Set thermal monitor bounds - * void ml_thrm_set(unsigned int low, unsigned int high) - * - * Obsolete, deprecated and will be removed. - */ - -; Force a line boundry here - .align 5 - .globl EXT(ml_thrm_set) - -LEXT(ml_thrm_set) - blr - -/* Read processor temprature - * unsigned int ml_read_temp(void) - * - * Obsolete, deprecated and will be removed. - */ - -; Force a line boundry here - .align 5 - .globl EXT(ml_read_temp) - -LEXT(ml_read_temp) - li r3,-1 - blr - -/* Throttle processor speed up or down - * unsigned int ml_throttle(unsigned int step) - * - * Returns old speed and sets new. Both step and return are values from 0 to - * 255 that define number of throttle steps, 0 being off and "ictcfim" is max * 2. - * - * Obsolete, deprecated and will be removed. - */ - -; Force a line boundry here - .align 5 - .globl EXT(ml_throttle) - -LEXT(ml_throttle) - li r3,0 - blr - -/* -** ml_get_timebase() -** -** Entry - R3 contains pointer to 64 bit structure. -** -** Exit - 64 bit structure filled in. -** -*/ -; Force a line boundry here - .align 5 - .globl EXT(ml_get_timebase) - -LEXT(ml_get_timebase) - -loop: - mftbu r4 - mftb r5 - mftbu r6 - cmpw r6, r4 - bne- loop - - stw r4, 0(r3) - stw r5, 4(r3) - - blr - -/* - * unsigned int cpu_number(void) - * - * Returns the current cpu number. - */ - - .align 5 - .globl EXT(cpu_number) - -LEXT(cpu_number) - mfsprg r4,1 ; Get the current activation - lwz r4,ACT_PER_PROC(r4) ; Get the per_proc block - lhz r3,PP_CPU_NUMBER(r4) ; Get CPU number - blr ; Return... 
- -/* - * processor_t current_processor(void) - * - * Returns the current processor. - */ - - .align 5 - .globl EXT(current_processor) - -LEXT(current_processor) - mfsprg r3,1 ; Get the current activation - lwz r3,ACT_PER_PROC(r3) ; Get the per_proc block - addi r3,r3,PP_PROCESSOR - blr - -#if PROCESSOR_SIZE > PP_PROCESSOR_SIZE -#error processor overflows per_proc -#endif - -/* - * ast_t *ast_pending(void) - * - * Returns the address of the pending AST mask for the current processor. - */ - - .align 5 - .globl EXT(ast_pending) - -LEXT(ast_pending) - mfsprg r3,1 ; Get the current activation - lwz r3,ACT_PER_PROC(r3) ; Get the per_proc block - addi r3,r3,PP_PENDING_AST - blr ; Return... - -/* - * void machine_set_current_thread(thread_t) - * - * Set the current thread - */ - .align 5 - .globl EXT(machine_set_current_thread) - -LEXT(machine_set_current_thread) - - mfsprg r4,1 ; Get spr1 - lwz r5,ACT_PER_PROC(r4) ; Get the PerProc from the previous active thread - stw r5,ACT_PER_PROC(r3) ; Set the PerProc in the active thread - mtsprg 1,r3 ; Set spr1 with the active thread - blr ; Return... - -/* - * thread_t current_thread(void) - * thread_t current_act(void) - * - * - * Return the current thread for outside components. - */ - .align 5 - .globl EXT(current_thread) - .globl EXT(current_act) - -LEXT(current_thread) -LEXT(current_act) - - mfsprg r3,1 - blr - - .align 5 - .globl EXT(mach_absolute_time) -LEXT(mach_absolute_time) -1: mftbu r3 - mftb r4 - mftbu r0 - cmpw r0,r3 - bne-- 1b - blr - -/* -** ml_sense_nmi() -** -*/ -; Force a line boundry here - .align 5 - .globl EXT(ml_sense_nmi) - -LEXT(ml_sense_nmi) - - blr ; Leave... 
- -/* -** ml_set_processor_speed_powertune() -** -*/ -; Force a line boundry here - .align 5 - .globl EXT(ml_set_processor_speed_powertune) - -LEXT(ml_set_processor_speed_powertune) - mflr r0 ; Save the link register - stwu r1, -(FM_ALIGN(4*4)+FM_SIZE)(r1) ; Make some space on the stack - stw r28, FM_ARG0+0x00(r1) ; Save a register - stw r29, FM_ARG0+0x04(r1) ; Save a register - stw r30, FM_ARG0+0x08(r1) ; Save a register - stw r31, FM_ARG0+0x0C(r1) ; Save a register - stw r0, (FM_ALIGN(4*4)+FM_SIZE+FM_LR_SAVE)(r1) ; Save the return - - mfsprg r31,1 ; Get the current activation - lwz r31,ACT_PER_PROC(r31) ; Get the per_proc block - - rlwinm r28, r3, 31-dnap, dnap, dnap ; Shift the 1 bit to the dnap+32 bit - rlwinm r3, r3, 2, 29, 29 ; Shift the 1 to a 4 and mask - addi r3, r3, pfPowerTune0 ; Add in the pfPowerTune0 offset - lwzx r29, r31, r3 ; Load the PowerTune number 0 or 1 - - sldi r28, r28, 32 ; Shift to the top half - ld r3, pfHID0(r31) ; Load the saved hid0 value - and r28, r28, r3 ; Save the dnap bit - lis r4, hi16(dnapm) ; Make a mask for the dnap bit - sldi r4, r4, 32 ; Shift to the top half - andc r3, r3, r4 ; Clear the dnap bit - or r28, r28, r3 ; Insert the dnap bit as needed for later - - sync - mtspr hid0, r3 ; Turn off dnap in hid0 - mfspr r3, hid0 ; Yes, this is silly, keep it here - mfspr r3, hid0 ; Yes, this is a duplicate, keep it here - mfspr r3, hid0 ; Yes, this is a duplicate, keep it here - mfspr r3, hid0 ; Yes, this is a duplicate, keep it here - mfspr r3, hid0 ; Yes, this is a duplicate, keep it here - mfspr r3, hid0 ; Yes, this is a duplicate, keep it here - isync ; Make sure it is set - - lis r3, hi16(PowerTuneControlReg) ; Write zero to the PCR - ori r3, r3, lo16(PowerTuneControlReg) - li r4, 0 - li r5, 0 - bl _ml_scom_write - - lis r3, hi16(PowerTuneControlReg) ; Write the PowerTune value to the PCR - ori r3, r3, lo16(PowerTuneControlReg) - li r4, 0 - mr r5, r29 - bl _ml_scom_write - - rlwinm r29, r29, 13-6, 6, 7 ; Move to PSR speed 
location and isolate the requested speed -spsPowerTuneLoop: - lis r3, hi16(PowerTuneStatusReg) ; Read the status from the PSR - ori r3, r3, lo16(PowerTuneStatusReg) - li r4, 0 - bl _ml_scom_read - srdi r5, r5, 32 - rlwinm r0, r5, 0, 6, 7 ; Isolate the current speed - rlwimi r0, r5, 0, 2, 2 ; Copy in the change in progress bit - cmpw r0, r29 ; Compare the requested and current speeds - beq spsPowerTuneDone - rlwinm. r0, r5, 0, 3, 3 - beq spsPowerTuneLoop - -spsPowerTuneDone: - sync - mtspr hid0, r28 ; Turn on dnap in hid0 if needed - mfspr r28, hid0 ; Yes, this is silly, keep it here - mfspr r28, hid0 ; Yes, this is a duplicate, keep it here - mfspr r28, hid0 ; Yes, this is a duplicate, keep it here - mfspr r28, hid0 ; Yes, this is a duplicate, keep it here - mfspr r28, hid0 ; Yes, this is a duplicate, keep it here - mfspr r28, hid0 ; Yes, this is a duplicate, keep it here - isync ; Make sure it is set - - lwz r0, (FM_ALIGN(4*4)+FM_SIZE+FM_LR_SAVE)(r1) ; Get the return - lwz r28, FM_ARG0+0x00(r1) ; Restore a register - lwz r29, FM_ARG0+0x04(r1) ; Restore a register - lwz r30, FM_ARG0+0x08(r1) ; Restore a register - lwz r31, FM_ARG0+0x0C(r1) ; Restore a register - lwz r1, FM_BACKPTR(r1) ; Pop the stack - mtlr r0 - blr - -/* -** ml_set_processor_speed_dpll() -** -*/ -; Force a line boundry here - .align 5 - .globl EXT(ml_set_processor_speed_dpll) - -LEXT(ml_set_processor_speed_dpll) - mfsprg r5,1 ; Get the current activation - lwz r5,ACT_PER_PROC(r5) ; Get the per_proc block - - cmplwi r3, 0 ; Turn off BTIC before low speed - beq spsDPLL1 - mfspr r4, hid0 ; Get the current hid0 value - rlwinm r4, r4, 0, btic+1, btic-1 ; Clear the BTIC bit - sync - mtspr hid0, r4 ; Set the new hid0 value - isync - sync - -spsDPLL1: - mfspr r4, hid1 ; Get the current PLL settings - rlwimi r4, r3, 31-hid1ps, hid1ps, hid1ps ; Copy the PLL Select bit - stw r4, pfHID1(r5) ; Save the new hid1 value - mtspr hid1, r4 ; Select desired PLL - - cmplwi r3, 0 ; Restore BTIC after high speed - bne 
spsDPLL2 - lwz r4, pfHID0(r5) ; Load the hid0 value - sync - mtspr hid0, r4 ; Set the hid0 value - isync - sync -spsDPLL2: - blr - - -/* -** ml_set_processor_speed_dfs(divideby) -** divideby == 0 then divide by 1 (full speed) -** divideby == 1 then divide by 2 (half speed) -** divideby == 2 then divide by 4 (quarter speed) -** divideby == 3 then divide by 4 (quarter speed) - preferred -** -*/ -; Force a line boundry here - .align 5 - .globl EXT(ml_set_processor_speed_dfs) - -LEXT(ml_set_processor_speed_dfs) - - mfspr r4,hid1 ; Get the current HID1 - mfsprg r5,0 ; Get the per_proc_info - rlwimi r4,r3,31-hid1dfs1,hid1dfs0,hid1dfs1 ; Stick the new divider bits in - stw r4,pfHID1(r5) ; Save the new hid1 value - sync - mtspr hid1,r4 ; Set the new HID1 - sync - isync - blr - - -/* -** ml_set_processor_voltage() -** -*/ -; Force a line boundry here - .align 5 - .globl EXT(ml_set_processor_voltage) - -LEXT(ml_set_processor_voltage) - mfsprg r5,1 ; Get the current activation - lwz r5,ACT_PER_PROC(r5) ; Get the per_proc block - - lwz r6, pfPowerModes(r5) ; Get the supported power modes - - rlwinm. r0, r6, 0, pmDPLLVminb, pmDPLLVminb ; Is DPLL Vmin supported - beq spvDone - - mfspr r4, hid2 ; Get HID2 value - rlwimi r4, r3, 31-hid2vmin, hid2vmin, hid2vmin ; Insert the voltage mode bit - mtspr hid2, r4 ; Set the voltage mode - sync ; Make sure it is done - -spvDone: - blr - - -; -; unsigned int ml_scom_write(unsigned int reg, unsigned long long data) -; 64-bit machines only -; returns status -; - - .align 5 - .globl EXT(ml_scom_write) - -LEXT(ml_scom_write) - - rldicr r3,r3,8,47 ; Align register it correctly - rldimi r5,r4,32,0 ; Merge the high part of data - sync ; Clean up everything - - mtspr scomd,r5 ; Stick in the data - mtspr scomc,r3 ; Set write to register - sync - isync - - mfspr r3,scomc ; Read back status - blr ; leave.... 
- -; -; unsigned int ml_read_scom(unsigned int reg, unsigned long long *data) -; 64-bit machines only -; returns status -; ASM Callers: data (r4) can be zero and the 64 bit data will be returned in r5 -; - - .align 5 - .globl EXT(ml_scom_read) - -LEXT(ml_scom_read) - - mfsprg r0,2 ; Get the feature flags - rldicr r3,r3,8,47 ; Align register it correctly - rlwinm r0,r0,pfSCOMFixUpb+1,31,31 ; Set shift if we need a fix me up - - ori r3,r3,0x8000 ; Set to read data - sync - - mtspr scomc,r3 ; Request the register - mfspr r5,scomd ; Get the register contents - mfspr r3,scomc ; Get back the status - sync - isync - - sld r5,r5,r0 ; Fix up if needed - - cmplwi r4, 0 ; If data pointer is null, just return - beqlr ; the received data in r5 - std r5,0(r4) ; Pass back the received data - blr ; Leave... - -; -; Calculates the hdec to dec ratio -; - - .align 5 - .globl EXT(ml_hdec_ratio) - -LEXT(ml_hdec_ratio) - - li r0,0 ; Clear the EE bit (and everything else for that matter) - mfmsr r11 ; Get the MSR - mtmsrd r0,1 ; Set the EE bit only (do not care about RI) - rlwinm r11,r11,0,MSR_EE_BIT,MSR_EE_BIT ; Isolate just the EE bit - mfmsr r10 ; Refresh our view of the MSR (VMX/FP may have changed) - or r12,r10,r11 ; Turn on EE if on before we turned it off - - mftb r9 ; Get time now - mfspr r2,hdec ; Save hdec - -mhrcalc: mftb r8 ; Get time now - sub r8,r8,r9 ; How many ticks? - cmplwi r8,10000 ; 10000 yet? - blt mhrcalc ; Nope... - - mfspr r9,hdec ; Get hdec now - sub r3,r2,r9 ; How many ticks? - mtmsrd r12,1 ; Flip EE on if needed - blr ; Leave... - - -; -; int setPop(time) -; -; Calculates the number of ticks to the supplied event and -; sets the decrementer. Never set the time for less that the -; minimum, which is 10, nor more than maxDec, which is usually 0x7FFFFFFF -; and never more than that but can be set by root. 
-; -; - - .align 7 - .globl EXT(setPop) - -#define kMin 10 - -LEXT(setPop) - -spOver: mftbu r8 ; Get upper time - addic r2,r4,-kMin ; Subtract minimum from target - mftb r9 ; Get lower - addme r11,r3 ; Do you have any bits I could borrow? - mftbu r10 ; Get upper again - subfe r0,r0,r0 ; Get -1 if we went negative 0 otherwise - subc r7,r2,r9 ; Subtract bottom and get carry - cmplw r8,r10 ; Did timebase upper tick? - subfe r6,r8,r11 ; Get the upper difference accounting for borrow - lwz r12,maxDec(0) ; Get the maximum decrementer size - addme r0,r0 ; Get -1 or -2 if anything negative, 0 otherwise - addic r2,r6,-1 ; Set carry if diff < 2**32 - srawi r0,r0,1 ; Make all foxes - subi r10,r12,kMin ; Adjust maximum for minimum adjust - andc r7,r7,r0 ; Pin time at 0 if under minimum - subfe r2,r2,r2 ; 0 if diff > 2**32, -1 otherwise - sub r7,r7,r10 ; Negative if duration is less than (max - min) - or r2,r2,r0 ; If the duration is negative, it is not too big - srawi r0,r7,31 ; -1 if duration is too small - and r7,r7,r2 ; Clear duration if high part too big - and r7,r7,r0 ; Clear duration if low part too big - bne-- spOver ; Timer ticked... - add r3,r7,r12 ; Add back the max for total - mtdec r3 ; Set the decrementer - blr ; Leave... - - diff --git a/osfmk/ppc/machine_task.c b/osfmk/ppc/machine_task.c deleted file mode 100644 index 5decd0ce2..000000000 --- a/osfmk/ppc/machine_task.c +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
- * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ - -#include -#include - -kern_return_t -machine_task_set_state( - __unused task_t task, - __unused int flavor, - __unused thread_state_t state, - __unused mach_msg_type_number_t state_count) -{ - return KERN_FAILURE; -} - -kern_return_t -machine_task_get_state(__unused task_t task, - __unused int flavor, - __unused thread_state_t state, - __unused mach_msg_type_number_t *state_count) -{ - return KERN_FAILURE; -} - -kern_return_t -machine_thread_inherit_taskwide( - __unused thread_t thread, - __unused task_t parent_task) -{ - return KERN_FAILURE; -} diff --git a/osfmk/ppc/machlimits.h b/osfmk/ppc/machlimits.h deleted file mode 100644 index b43f64958..000000000 --- a/osfmk/ppc/machlimits.h +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:41 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.1.1.1 1998/03/07 02:26:02 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.1.2.1 1996/12/09 16:55:05 stephen - * nmklinux_1.0b3_shared into pmk1.1 - * New file based on hp_pa - * [1996/12/09 11:09:22 stephen] - * - * $EndLog$ - */ -/* - * Copyright (c) 1988 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms are permitted - * provided that the above copyright notice and this paragraph are - * duplicated in all such forms and that any documentation, - * advertising materials, and other materials related to such - * distribution and use acknowledge that the software was developed - * by the University of California, Berkeley. The name of the - * University may not be used to endorse or promote products derived - * from this software without specific prior written permission. - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
- * - * @(#)machlimits.h 7.1 (Berkeley) 2/15/89 - */ -#ifndef _MACH_MACHLIMITS_H_ -#define _MACH_MACHLIMITS_H_ - -#define CHAR_BIT 8 /* number of bits in a char */ - -#define SCHAR_MAX 127 /* max value for a signed char */ -#define SCHAR_MIN (-128) /* min value for a signed char */ - -#define UCHAR_MAX 255U /* max value for an unsigned char */ -#define CHAR_MAX 127 /* max value for a char */ -#define CHAR_MIN (-128) /* min value for a char */ - -#define USHRT_MAX 65535U /* max value for an unsigned short */ -#define SHRT_MAX 32767 /* max value for a short */ -#define SHRT_MIN (-32768) /* min value for a short */ - -#define UINT_MAX 0xFFFFFFFFU /* max value for an unsigned int */ -#define INT_MAX 2147483647 /* max value for an int */ -#define INT_MIN (-2147483647-1) /* min value for an int */ - -#define ULONG_MAX UINT_MAX /* max value for an unsigned long */ -#define LONG_MAX INT_MAX /* max value for a long */ -#define LONG_MIN INT_MIN /* min value for a long */ - -/* Must be at least two, for internationalization (NLS/KJI) */ -#define MB_LEN_MAX 4 /* multibyte characters */ - -#endif /* _MACH_MACHLIMITS_H_ */ diff --git a/osfmk/ppc/machparam.h b/osfmk/ppc/machparam.h deleted file mode 100644 index b5f5374cd..000000000 --- a/osfmk/ppc/machparam.h +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Copyright (c) 1990, 1991 The University of Utah and - * the Center for Software Science at the University of Utah (CSS). - * All rights reserved. - * - * Permission to use, copy, modify and distribute this software is hereby - * granted provided that (1) source code retains these copyright, permission, - * and disclaimer notices, and (2) redistributions including binaries - * reproduce the notices in supporting documentation, and (3) all advertising - * materials mentioning features or use of this software display the following - * acknowledgement: ``This product includes software developed by the Center - * for Software Science at the University of Utah.'' - * - * THE UNIVERSITY OF UTAH AND CSS ALLOW FREE USE OF THIS SOFTWARE IN ITS "AS - * IS" CONDITION. THE UNIVERSITY OF UTAH AND CSS DISCLAIM ANY LIABILITY OF - * ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
- * - * CSS requests users of this software to return to css-dist@cs.utah.edu any - * improvements that they make and grant CSS redistribution rights. - * - * Utah $Hdr: machparam.h 1.7 92/05/22$ - */ - -#ifndef _PPC_MACHPARAM_H_ -#define _PPC_MACHPARAM_H_ - -/* - * Machine dependent constants for ppc. - * Added as needed (for device drivers). - */ -#define NBPG 4096 /* bytes/page */ -#define PGSHIFT 12 /* LOG2(NBPG) */ - -#define DEV_BSHIFT 10 /* log2(DEV_BSIZE) */ - -/* - * Disk devices do all IO in 1024-byte blocks. - */ -#define DEV_BSIZE 1024 - -#define btop(x) ((x)>>PGSHIFT) -#define ptob(x) ((x)<>(PGSHIFT-DEV_BSHIFT)) - -/* clicks to bytes */ -#define ctob(x) ((x)<>PGSHIFT) - -#endif /* _PPC_MACHPARAM_H_ */ diff --git a/osfmk/ppc/mappings.c b/osfmk/ppc/mappings.c deleted file mode 100644 index 5da3b85d7..000000000 --- a/osfmk/ppc/mappings.c +++ /dev/null @@ -1,1805 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * This file is used to maintain the virtual to real mappings for a PowerPC machine. - * The code herein is primarily used to bridge between the pmap layer and the hardware layer. - * Currently, some of the function of this module is contained within pmap.c. We may want to move - * all of this into it (or most anyway) for the sake of performance. We shall see as we write it. - * - * We also depend upon the structure of the phys_entry control block. We do put some processor - * specific stuff in there. 
- * - */ - -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include /* (TEST/DEBUG) */ - -#define PERFTIMES 0 - -vm_map_t mapping_map = VM_MAP_NULL; - -unsigned int incrVSID = 0; /* VSID increment value */ -unsigned int mappingdeb0 = 0; -unsigned int mappingdeb1 = 0; -int ppc_max_adrsp; /* Maximum address spaces */ - -addr64_t *mapdebug; /* (BRINGUP) */ -extern unsigned int DebugWork; /* (BRINGUP) */ - -void mapping_verify(void); -void mapping_phys_unused(ppnum_t pa); - -int nx_enabled = 1; /* enable no-execute protection */ -int allow_data_exec = VM_ABI_32; /* 32-bit apps may execute data by default, 64-bit apps may not */ -int allow_stack_exec = VM_ABI_32; /* 32-bit apps may execute from the stack by default, 64-bit apps may not */ - -/* - * ppc_prot translates Mach's representation of protections to that of the PPC hardware. - * For Virtual Machines (VMM), we also provide translation entries where the output is - * the same as the input, allowing direct specification of PPC protections. Mach's - * representations are always in the range 0..7, so they always fall into the first - * 8 table entries; direct translations are placed in the range 8..16, so they fall into - * the second half of the table. - * - */ - -unsigned char ppc_prot[16] = { 4, 7, 6, 6, 3, 3, 2, 2, /* Mach -> PPC translations */ - 0, 1, 2, 3, 4, 5, 6, 7 }; /* VMM direct translations */ - - - -vm_prot_t getProtPPC(int key, boolean_t disable_NX) { - vm_prot_t prot; - - prot = ppc_prot[key & 0xF]; - - if (key <= 7 && disable_NX == TRUE) - prot &= ~mpN; - - return (prot); -} - - -/* - * About PPC VSID generation: - * - * This function is called to generate an address space ID. This space ID must be unique within - * the system. For the PowerPC, it is used to build the VSID. 
We build a VSID in the following - * way: space ID << 4 | segment. Since a VSID is 24 bits, and out of that, we reserve the last - * 4, so, we can have 2^20 (2M) unique IDs. Each pmap has a unique space ID, so we should be able - * to have 2M pmaps at a time, which we couldn't, we'd run out of memory way before then. The - * problem is that only a certain number of pmaps are kept in a free list and if that is full, - * they are release. This causes us to lose track of what space IDs are free to be reused. - * We can do 4 things: 1) not worry about it, 2) keep all free pmaps, 3) rebuild all mappings - * when the space ID wraps, or 4) scan the list of pmaps and find a free one. - * - * Yet another consideration is the hardware use of the VSID. It is used as part of the hash - * calculation for virtual address lookup. An improperly chosen value could potentially cause - * too many hashes to hit the same bucket, causing PTEG overflows. The actual hash function - * is (page index XOR vsid) mod number of ptegs. For a 32MB machine, using the suggested - * hash table size, there are 2^12 (8192) PTEGs. Remember, though, that the bottom 4 bits - * are reserved for the segment number, which means that we really have 2^(12-4) 512 space IDs - * before we start hashing to the same buckets with the same vaddrs. Also, within a space ID, - * every 8192 pages (32MB) within a segment will hash to the same bucket. That's 8 collisions - * per segment. So, a scan of every page for 256MB would fill 32 PTEGs completely, but - * with no overflow. I don't think that this is a problem. - * - * There may be a problem with the space ID, though. A new space ID is generate (mainly) - * whenever there is a fork. There shouldn't really be any problem because (for a 32MB - * machine) we can have 512 pmaps and still not have hash collisions for the same address. - * The potential problem, though, is if we get long-term pmaps that have space IDs that are - * the same modulo 512. 
We can reduce this problem by having the segment number be bits - * 0-3 of the space ID rather than 20-23. Doing this means that, in effect, corresponding - * vaddrs in different segments hash to the same PTEG. While this is somewhat of a problem, - * I don't think that it is as signifigant as the other, so, I'll make the space ID - * with segment first. - * - * The final, and biggest problem is the wrap, which will happen every 2^20 space IDs. - * While this is a problem that should only happen in periods counted in weeks, it can and - * will happen. This is assuming a monotonically increasing space ID. If we were to search - * for an inactive space ID, there could not be a wrap until there was 2^20 concurrent space IDs. - * That's pretty unlikely to happen. There couldn't be enough storage to support a million tasks. - * - * So, what we do is to keep all active pmaps in a chain (anchored from kernel_pmap and - * locked by free_pmap_lock) that is sorted in VSID sequence order. - * - * Whenever we need a VSID, we walk the list looking for the next in the sequence from - * the last that was freed. The we allocate that. - * - * NOTE: We must be called with interruptions off and free_pmap_lock held. - * - */ - -/* - * mapping_init(); - * Do anything that needs to be done before the mapping system can be used. - * Hash table must be initialized before we call this. - * - * Calculate the SID increment. Currently we use size^(1/2) + size^(1/4) + 1; - */ - -void mapping_init(void) { - - unsigned int tmp, maxeff, rwidth; - - ppc_max_adrsp = maxAdrSp; /* Set maximum address spaces */ - - maxeff = 32; /* Assume 32-bit */ - if(PerProcTable[0].ppe_vaddr->pf.Available & pf64Bit) maxeff = 64; /* Is this a 64-bit machine? 
*/ - - rwidth = PerProcTable[0].ppe_vaddr->pf.pfMaxVAddr - maxAdrSpb; /* Reduce address width by width of address space ID */ - if(rwidth > maxeff) rwidth = maxeff; /* If we still have more virtual than effective, clamp at effective */ - - vm_max_address = 0xFFFFFFFFFFFFFFFFULL >> (64 - rwidth); /* Get maximum effective address supported */ - vm_max_physical = 0xFFFFFFFFFFFFFFFFULL >> (64 - PerProcTable[0].ppe_vaddr->pf.pfMaxPAddr); /* Get maximum physical address supported */ - - if(PerProcTable[0].ppe_vaddr->pf.Available & pf64Bit) { /* Are we 64 bit? */ - tmp = 12; /* Size of hash space */ - } - else { - __asm__ volatile("cntlzw %0, %1" : "=r" (tmp) : "r" (hash_table_size)); /* Get number of leading 0s */ - tmp = 32 - tmp; /* Size of hash space */ - } - - incrVSID = 1 << ((tmp + 1) >> 1); /* Get ceiling of sqrt of table size */ - incrVSID |= 1 << ((tmp + 1) >> 2); /* Get ceiling of quadroot of table size */ - incrVSID |= 1; /* Set bit and add 1 */ - - return; - -} - - -/* - * mapping_remove(pmap_t pmap, addr64_t va); - * Given a pmap and virtual address, this routine finds the mapping and unmaps it. - * The mapping block will be added to - * the free list. If the free list threshold is reached, garbage collection will happen. - * - * We also pass back the next higher mapped address. This is done so that the higher level - * pmap_remove function can release a range of addresses simply by calling mapping_remove - * in a loop until it finishes the range or is returned a vaddr of 0. 
- * - * Note that if the mapping is not found, we return the next VA ORed with 1 - * - */ - -addr64_t mapping_remove(pmap_t pmap, addr64_t va) { /* Remove a single mapping for this VADDR - Returns TRUE if a mapping was found to remove */ - - mapping_t *mp; - addr64_t nextva; - ppnum_t pgaddr; - - va &= ~PAGE_MASK; /* Scrub noise bits */ - - do { /* Keep trying until we truely fail */ - mp = hw_rem_map(pmap, va, &nextva); /* Remove a mapping from this pmap */ - } while (mapRtRemove == ((unsigned int)mp & mapRetCode)); - - switch ((unsigned int)mp & mapRetCode) { - case mapRtOK: - break; /* Mapping removed */ - case mapRtNotFnd: - return (nextva | 1); /* Nothing found to unmap */ - default: - panic("mapping_remove: hw_rem_map failed - pmap = %p, va = %016llX, code = %p\n", - pmap, va, mp); - break; - } - - pgaddr = mp->mpPAddr; /* Get page number from mapping */ - - mapping_free(mp); /* Add mapping to the free list */ - - if ((pmap->pmapFlags & pmapVMhost) && pmap->pmapVmmExt) { - /* If this is an assisted host, scrub any guest mappings */ - unsigned int idx; - phys_entry_t *physent = mapping_phys_lookup(pgaddr, &idx); - /* Get physent for our physical page */ - if (!physent) { /* No physent, could be in I/O area, so exit */ - return (nextva); - } - - do { /* Iterate 'till all guest mappings are gone */ - mp = hw_scrub_guest(physent, pmap); /* Attempt to scrub a guest mapping */ - switch ((unsigned int)mp & mapRetCode) { - case mapRtGuest: /* Found a guest mapping */ - case mapRtNotFnd: /* Mapping was there, but disappeared, must retry */ - case mapRtEmpty: /* No guest mappings left to scrub */ - break; - default: - panic("mapping_remove: hw_scrub_guest failed - physent = %p, code = %p\n", - physent, mp); /* Cry havoc, cry wrack, - at least we die with harness on our backs */ - break; - } - } while (mapRtEmpty != ((unsigned int)mp & mapRetCode)); - } - - return nextva; /* Tell them we did it */ -} - -/* - * mapping_make(pmap, va, pa, flags, size, prot) - map a 
virtual address to a real one - * - * This routine takes the given parameters, builds a mapping block, and queues it into the - * correct lists. - * - * pmap (virtual address) is the pmap to map into - * va (virtual address) is the 64-bit virtual address that is being mapped - * pa (physical page number) is the physical page number (i.e., physcial address >> 12). This is - * a 32-bit quantity. - * Flags: - * block if 1, mapping is a block, size parameter is used. Note: we do not keep - * reference and change information or allow protection changes of blocks. - * any changes must first unmap and then remap the area. - * use attribute Use specified attributes for map, not defaults for physical page - * perm Mapping is permanent - * cache inhibited Cache inhibited (used if use attribute or block set ) - * guarded Guarded access (used if use attribute or block set ) - * size size of block in pages - 1 (not used if not block) - * prot VM protection bits - * attr Cachability/Guardedness - * - * Returns 0 if mapping was successful. Returns vaddr that overlaps/collides. - * Returns 1 for any other failure. - * - * Note that we make an assumption that all memory in the range 0f 0x0000000080000000 to 0x00000000FFFFFFFF is reserved - * for I/O and default the cache attrubutes appropriately. The caller is free to set whatever they want however. - * - * If there is any physical page that is not found in the physent table, the mapping is forced to be a - * block mapping of length 1. This keeps us from trying to update a physent during later mapping use, - * e.g., fault handling. 
- * - * - */ - -addr64_t mapping_make(pmap_t pmap, addr64_t va, ppnum_t pa, unsigned int flags, unsigned int size, vm_prot_t prot) { /* Make an address mapping */ - - register mapping_t *mp; - addr64_t colladdr, psmask; - unsigned int pindex, mflags, pattr, wimg, rc; - phys_entry_t *physent; - int nlists, pcf; - boolean_t disable_NX = FALSE; - - pindex = 0; - - mflags = 0x01000000; /* Start building mpFlags field (busy count = 1) */ - - pcf = (flags & mmFlgPcfg) >> 24; /* Get the physical page config index */ - if(!(pPcfg[pcf].pcfFlags)) { /* Validate requested physical page configuration */ - panic("mapping_make: invalid physical page configuration request - pmap = %p, va = %016llX, cfg = %d\n", - pmap, va, pcf); - } - - psmask = (1ULL << pPcfg[pcf].pcfPSize) - 1; /* Mask to isolate any offset into a page */ - if(va & psmask) { /* Make sure we are page aligned on virtual */ - panic("mapping_make: attempt to map unaligned vaddr - pmap = %p, va = %016llX, cfg = %d\n", - pmap, va, pcf); - } - if(((addr64_t)pa << 12) & psmask) { /* Make sure we are page aligned on physical */ - panic("mapping_make: attempt to map unaligned paddr - pmap = %p, pa = %08X, cfg = %d\n", - pmap, pa, pcf); - } - - mflags |= (pcf << (31-mpPcfgb)); /* Insert physical page configuration index */ - - if(!(flags & mmFlgBlock)) { /* Is this a block map? */ - - size = 1; /* Set size to 1 page if not block */ - - physent = mapping_phys_lookup(pa, &pindex); /* Get physical entry */ - if(!physent) { /* Did we find the physical page? 
*/ - mflags |= mpBlock; /* Force this to a block if no physent */ - pattr = 0; /* Assume normal, non-I/O memory */ - if((pa & 0xFFF80000) == 0x00080000) pattr = mmFlgCInhib | mmFlgGuarded; /* If this page is in I/O range, set I/O attributes */ - } - else pattr = ((physent->ppLink & (ppI | ppG)) >> 60); /* Get the default attributes from physent */ - - if(flags & mmFlgUseAttr) pattr = flags & (mmFlgCInhib | mmFlgGuarded); /* Use requested attributes */ - } - else { /* This is a block */ - - pattr = flags & (mmFlgCInhib | mmFlgGuarded); /* Use requested attributes */ - mflags |= mpBlock; /* Show that this is a block */ - - if(size > pmapSmallBlock) { /* Is it one? */ - if(size & 0x00001FFF) return mapRtBadSz; /* Fail if bigger than 256MB and not a 32MB multiple */ - size = size >> 13; /* Convert to 32MB chunks */ - mflags = mflags | mpBSu; /* Show 32MB basic size unit */ - } - } - - wimg = 0x2; /* Set basic PPC wimg to 0b0010 - Coherent */ - if(pattr & mmFlgCInhib) wimg |= 0x4; /* Add cache inhibited if we need to */ - if(pattr & mmFlgGuarded) wimg |= 0x1; /* Add guarded if we need to */ - - mflags = mflags | (pindex << 16); /* Stick in the physical entry table index */ - - if(flags & mmFlgPerm) mflags |= mpPerm; /* Set permanent mapping */ - - size = size - 1; /* Change size to offset */ - if(size > 0xFFFF) return mapRtBadSz; /* Leave if size is too big */ - - nlists = mapSetLists(pmap); /* Set number of lists this will be on */ - - mp = mapping_alloc(nlists); /* Get a spare mapping block with this many lists */ - - /* the mapping is zero except that the mpLists field is set */ - mp->mpFlags |= mflags; /* Add in the rest of the flags to mpLists */ - mp->mpSpace = pmap->space; /* Set the address space/pmap lookup ID */ - mp->u.mpBSize = size; /* Set the size */ - mp->mpPte = 0; /* Set the PTE invalid */ - mp->mpPAddr = pa; /* Set the physical page number */ - - if ( !nx_enabled || (pmap->pmapFlags & pmapNXdisabled) ) - disable_NX = TRUE; - - mp->mpVAddr = (va & 
~mpHWFlags) | (wimg << 3) | getProtPPC(prot, disable_NX); /* Add the protection and attributes to the field */ - - while(1) { /* Keep trying... */ - colladdr = hw_add_map(pmap, mp); /* Go add the mapping to the pmap */ - rc = colladdr & mapRetCode; /* Separate return code */ - colladdr &= ~mapRetCode; /* Clean up collision effective address */ - - switch (rc) { - case mapRtOK: - return mapRtOK; /* Mapping added successfully */ - - case mapRtRemove: /* Remove in progress */ - (void)mapping_remove(pmap, colladdr); /* Lend a helping hand to another CPU doing block removal */ - continue; /* Retry mapping add */ - - case mapRtMapDup: /* Identical mapping already present */ - mapping_free(mp); /* Free duplicate mapping */ - return mapRtOK; /* Return success */ - - case mapRtSmash: /* Mapping already present but does not match new mapping */ - mapping_free(mp); /* Free duplicate mapping */ - return (colladdr | mapRtSmash); /* Return colliding address, with some dirt added to avoid - confusion if effective address is 0 */ - default: - panic("mapping_make: hw_add_map failed - collision addr = %016llX, code = %02X, pmap = %p, va = %016llX, mapping = %p\n", - colladdr, rc, pmap, va, mp); /* Die dead */ - } - - } - - return 1; /* Unreachable, but pleases compiler */ -} - - -/* - * mapping *mapping_find(pmap, va, *nextva, full) - Finds a mapping - * - * Looks up the vaddr and returns the mapping and the next mapped va - * If full is true, it will descend through all nested pmaps to find actual mapping - * - * Must be called with interruptions disabled or we can hang trying to remove found mapping. - * - * Returns 0 if not found and the virtual address of the mapping if it is - * Note that the mappings busy count is bumped. It is the responsibility of the caller - * to drop the count. If this is not done, any attempt to remove the mapping will hang. - * - * NOTE: The nextva field is not valid when full is TRUE. 
- * - * - */ - -mapping_t *mapping_find(pmap_t pmap, addr64_t va, addr64_t *nextva, int full) { /* Make an address mapping */ - - register mapping_t *mp; - addr64_t curva; - pmap_t curpmap; - int nestdepth; - - curpmap = pmap; /* Remember entry */ - nestdepth = 0; /* Set nest depth */ - curva = (addr64_t)va; /* Set current va */ - - while(1) { - - mp = hw_find_map(curpmap, curva, nextva); /* Find the mapping for this address */ - if((unsigned int)mp == mapRtBadLk) { /* Did we lock up ok? */ - panic("mapping_find: pmap lock failure - rc = %p, pmap = %p\n", mp, curpmap); /* Die... */ - } - - if(!mp || ((mp->mpFlags & mpType) < mpMinSpecial) || !full) break; /* Are we done looking? */ - - if((mp->mpFlags & mpType) != mpNest) { /* Don't chain through anything other than a nested pmap */ - mapping_drop_busy(mp); /* We have everything we need from the mapping */ - mp = NULL; /* Set not found */ - break; - } - - if(nestdepth++ > 64) { /* Have we nested too far down? */ - panic("mapping_find: too many nested pmaps - va = %016llX, curva = %016llX, pmap = %p, curpmap = %p\n", - va, curva, pmap, curpmap); - } - - curva = curva + mp->mpNestReloc; /* Relocate va to new pmap */ - curpmap = (pmap_t) pmapTrans[mp->mpSpace].pmapVAddr; /* Get the address of the nested pmap */ - mapping_drop_busy(mp); /* We have everything we need from the mapping */ - - } - - return mp; /* Return the mapping if we found one */ -} - -/* - * void mapping_protect(pmap_t pmap, addt_t va, vm_prot_t prot, addr64_t *nextva) - change the protection of a virtual page - * - * This routine takes a pmap and virtual address and changes - * the protection. If there are PTEs associated with the mappings, they will be invalidated before - * the protection is changed. - * - * We return success if we change the protection or if there is no page mapped at va. We return failure if - * the va corresponds to a block mapped area or the mapping is permanant. 
- * - * - */ - -void -mapping_protect(pmap_t pmap, addr64_t va, vm_prot_t prot, addr64_t *nextva) { /* Change protection of a virtual page */ - - int ret; - boolean_t disable_NX = FALSE; - - if ( !nx_enabled || (pmap->pmapFlags & pmapNXdisabled) ) - disable_NX = TRUE; - - ret = hw_protect(pmap, va, getProtPPC(prot, disable_NX), nextva); /* Try to change the protect here */ - - switch (ret) { /* Decode return code */ - - case mapRtOK: /* Changed */ - case mapRtNotFnd: /* Didn't find it */ - case mapRtBlock: /* Block map, just ignore request */ - case mapRtNest: /* Nested pmap, just ignore request */ - break; - - default: - panic("mapping_protect: hw_protect failed - rc = %d, pmap = %p, va = %016llX\n", ret, pmap, va); - - } - -} - -/* - * void mapping_protect_phys(ppnum_t pa, vm_prot_t prot) - change the protection of a physical page - * - * This routine takes a physical entry and runs through all mappings attached to it and changes - * the protection. If there are PTEs associated with the mappings, they will be invalidated before - * the protection is changed. There is no limitation on changes, e.g., higher to lower, lower to - * higher; however, changes to execute protection are ignored. - * - * Any mapping that is marked permanent is not changed - * - * Phys_entry is unlocked. - */ - -void mapping_protect_phys(ppnum_t pa, vm_prot_t prot) { /* Change protection of all mappings to page */ - - unsigned int pindex; - phys_entry_t *physent; - - physent = mapping_phys_lookup(pa, &pindex); /* Get physical entry */ - if(!physent) { /* Did we find the physical page? */ - panic("mapping_protect_phys: invalid physical page %08X\n", pa); - } - - hw_walk_phys(physent, hwpNoop, hwpSPrtMap, hwpNoop, - getProtPPC(prot, FALSE), hwpPurgePTE); /* Set the new protection for page and mappings */ - - return; /* Leave... 
*/ -} - - -/* - * void mapping_clr_mod(ppnum_t pa) - clears the change bit of a physical page - * - * This routine takes a physical entry and runs through all mappings attached to it and turns - * off the change bit. - */ - -void mapping_clr_mod(ppnum_t pa) { /* Clears the change bit of a physical page */ - - unsigned int pindex; - phys_entry_t *physent; - - physent = mapping_phys_lookup(pa, &pindex); /* Get physical entry */ - if(!physent) { /* Did we find the physical page? */ - panic("mapping_clr_mod: invalid physical page %08X\n", pa); - } - - hw_walk_phys(physent, hwpNoop, hwpCCngMap, hwpCCngPhy, - 0, hwpPurgePTE); /* Clear change for page and mappings */ - return; /* Leave... */ -} - - -/* - * void mapping_set_mod(ppnum_t pa) - set the change bit of a physical page - * - * This routine takes a physical entry and runs through all mappings attached to it and turns - * on the change bit. - */ - -void mapping_set_mod(ppnum_t pa) { /* Sets the change bit of a physical page */ - - unsigned int pindex; - phys_entry_t *physent; - - physent = mapping_phys_lookup(pa, &pindex); /* Get physical entry */ - if(!physent) { /* Did we find the physical page? */ - panic("mapping_set_mod: invalid physical page %08X\n", pa); - } - - hw_walk_phys(physent, hwpNoop, hwpSCngMap, hwpSCngPhy, - 0, hwpNoopPTE); /* Set change for page and mappings */ - return; /* Leave... */ -} - - -/* - * void mapping_clr_ref(ppnum_t pa) - clears the reference bit of a physical page - * - * This routine takes a physical entry and runs through all mappings attached to it and turns - * off the reference bit. - */ - -void mapping_clr_ref(ppnum_t pa) { /* Clears the reference bit of a physical page */ - - unsigned int pindex; - phys_entry_t *physent; - - physent = mapping_phys_lookup(pa, &pindex); /* Get physical entry */ - if(!physent) { /* Did we find the physical page? 
*/ - panic("mapping_clr_ref: invalid physical page %08X\n", pa); - } - - hw_walk_phys(physent, hwpNoop, hwpCRefMap, hwpCRefPhy, - 0, hwpPurgePTE); /* Clear reference for page and mappings */ - return; /* Leave... */ -} - - -/* - * void mapping_set_ref(ppnum_t pa) - set the reference bit of a physical page - * - * This routine takes a physical entry and runs through all mappings attached to it and turns - * on the reference bit. - */ - -void mapping_set_ref(ppnum_t pa) { /* Sets the reference bit of a physical page */ - - unsigned int pindex; - phys_entry_t *physent; - - physent = mapping_phys_lookup(pa, &pindex); /* Get physical entry */ - if(!physent) { /* Did we find the physical page? */ - panic("mapping_set_ref: invalid physical page %08X\n", pa); - } - - hw_walk_phys(physent, hwpNoop, hwpSRefMap, hwpSRefPhy, - 0, hwpNoopPTE); /* Set reference for page and mappings */ - return; /* Leave... */ -} - - -/* - * boolean_t mapping_tst_mod(ppnum_t pa) - test the change bit of a physical page - * - * This routine takes a physical entry and runs through all mappings attached to it and tests - * the changed bit. - */ - -boolean_t mapping_tst_mod(ppnum_t pa) { /* Tests the change bit of a physical page */ - - unsigned int pindex, rc; - phys_entry_t *physent; - - physent = mapping_phys_lookup(pa, &pindex); /* Get physical entry */ - if(!physent) { /* Did we find the physical page? */ - panic("mapping_tst_mod: invalid physical page %08X\n", pa); - } - - rc = hw_walk_phys(physent, hwpTCngPhy, hwpTCngMap, hwpNoop, - 0, hwpMergePTE); /* Set change for page and mappings */ - return ((rc & (unsigned long)ppC) != 0); /* Leave with change bit */ -} - - -/* - * boolean_t mapping_tst_ref(ppnum_t pa) - tests the reference bit of a physical page - * - * This routine takes a physical entry and runs through all mappings attached to it and tests - * the reference bit. 
- */ - -boolean_t mapping_tst_ref(ppnum_t pa) { /* Tests the reference bit of a physical page */ - - unsigned int pindex, rc; - phys_entry_t *physent; - - physent = mapping_phys_lookup(pa, &pindex); /* Get physical entry */ - if(!physent) { /* Did we find the physical page? */ - panic("mapping_tst_ref: invalid physical page %08X\n", pa); - } - - rc = hw_walk_phys(physent, hwpTRefPhy, hwpTRefMap, hwpNoop, - 0, hwpMergePTE); /* Test reference for page and mappings */ - return ((rc & (unsigned long)ppR) != 0); /* Leave with reference bit */ -} - - -/* - * unsigned int mapping_tst_refmod(ppnum_t pa) - tests the reference and change bits of a physical page - * - * This routine takes a physical entry and runs through all mappings attached to it and tests - * their reference and changed bits. - */ - -unsigned int mapping_tst_refmod(ppnum_t pa) { /* Tests the reference and change bits of a physical page */ - - unsigned int pindex, rc; - phys_entry_t *physent; - - physent = mapping_phys_lookup(pa, &pindex); /* Get physical entry */ - if (!physent) { /* Did we find the physical page? */ - panic("mapping_tst_refmod: invalid physical page %08X\n", pa); - } - - rc = hw_walk_phys(physent, hwpTRefCngPhy, hwpTRefCngMap, hwpNoop, - 0, hwpMergePTE); /* Test reference and change bits in page and mappings */ - return (((rc & ppC)? VM_MEM_MODIFIED : 0) | ((rc & ppR)? VM_MEM_REFERENCED : 0)); - /* Convert bits to generic format and return */ - -} - - -/* - * void mapping_clr_refmod(ppnum_t pa, unsigned int mask) - clears the reference and change bits specified - * by mask of a physical page - * - * This routine takes a physical entry and runs through all mappings attached to it and turns - * off all the reference and change bits. 
- */ - -void mapping_clr_refmod(ppnum_t pa, unsigned int mask) { /* Clears the reference and change bits of a physical page */ - - unsigned int pindex; - phys_entry_t *physent; - unsigned int ppcMask; - - physent = mapping_phys_lookup(pa, &pindex); /* Get physical entry */ - if(!physent) { /* Did we find the physical page? */ - panic("mapping_clr_refmod: invalid physical page %08X\n", pa); - } - - ppcMask = (((mask & VM_MEM_MODIFIED)? ppC : 0) | ((mask & VM_MEM_REFERENCED)? ppR : 0)); - /* Convert mask bits to PPC-specific format */ - hw_walk_phys(physent, hwpNoop, hwpCRefCngMap, hwpCRefCngPhy, - ppcMask, hwpPurgePTE); /* Clear reference and change bits for page and mappings */ - return; /* Leave... */ -} - - - -/* - * phys_ent *mapping_phys_lookup(ppnum_t pp, unsigned int *pindex) - tests the reference bit of a physical page - * - * This routine takes a physical page number and returns the phys_entry associated with it. It also - * calculates the bank address associated with the entry - * the reference bit. - */ - -phys_entry_t * -mapping_phys_lookup(ppnum_t pp, unsigned int *pindex) -{ /* Finds the physical entry for the page */ - unsigned int i; - - for(i = 0; i < pmap_mem_regions_count; i++) { /* Walk through the list */ - if(!(unsigned int)pmap_mem_regions[i].mrPhysTab) continue; /* Skip any empty lists */ - if((pp < pmap_mem_regions[i].mrStart) || (pp > pmap_mem_regions[i].mrEnd)) continue; /* This isn't ours */ - - *pindex = (i * sizeof(mem_region_t)) / 4; /* Make the word index to this list */ - - return &pmap_mem_regions[i].mrPhysTab[pp - pmap_mem_regions[i].mrStart]; /* Return the physent pointer */ - } - - return (phys_entry_t *)0; /* Shucks, can't find it... */ - -} - -boolean_t -pmap_valid_page(ppnum_t pn) { - unsigned int tmp; - - return (mapping_phys_lookup(pn, &tmp) != 0); -} - - -/* - * mapping_adjust(void) - Releases free mapping blocks and/or allocates new ones - * - * This routine frees any mapping blocks queued to mapCtl.mapcrel. 
It also checks - * the number of free mappings remaining, and if below a threshold, replenishes them. - * The list will be replenshed from mapCtl.mapcrel if there are enough. Otherwise, - * a new one is allocated. - * - * This routine allocates and/or frees memory and must be called from a safe place. - * Currently, vm_pageout_scan is the safest place. - */ - -thread_call_t mapping_adjust_call; -static thread_call_data_t mapping_adjust_call_data; - -void mapping_adjust(void) { /* Adjust free mappings */ - - kern_return_t retr = KERN_SUCCESS; - mappingblok_t *mb, *mbn; - spl_t s; - int allocsize; - - if(mapCtl.mapcmin <= MAPPERBLOK) { - mapCtl.mapcmin = (sane_size / PAGE_SIZE) / 16; - -#if DEBUG - kprintf("mapping_adjust: minimum entries rqrd = %08X\n", mapCtl.mapcmin); - kprintf("mapping_adjust: free = %08X; in use = %08X; release = %08X\n", - mapCtl.mapcfree, mapCtl.mapcinuse, mapCtl.mapcreln); -#endif - } - - s = splhigh(); /* Don't bother from now on */ - if(!hw_lock_to((hw_lock_t)&mapCtl.mapclock, LockTimeOut)) { /* Lock the control header */ - panic("mapping_adjust - timeout getting control lock (1)\n"); /* Tell all and die */ - } - - if (mapping_adjust_call == NULL) { - thread_call_setup(&mapping_adjust_call_data, - (thread_call_func_t)mapping_adjust, - (thread_call_param_t)NULL); - mapping_adjust_call = &mapping_adjust_call_data; - } - - while(1) { /* Keep going until we've got enough */ - - allocsize = mapCtl.mapcmin - mapCtl.mapcfree; /* Figure out how much we need */ - if(allocsize < 1) break; /* Leave if we have all we need */ - - if((unsigned int)(mbn = mapCtl.mapcrel)) { /* Can we rescue a free one? 
*/ - mapCtl.mapcrel = mbn->nextblok; /* Dequeue it */ - mapCtl.mapcreln--; /* Back off the count */ - allocsize = MAPPERBLOK; /* Show we allocated one block */ - } - else { /* No free ones, try to get it */ - - allocsize = (allocsize + MAPPERBLOK - 1) / MAPPERBLOK; /* Get the number of pages we need */ - - hw_lock_unlock((hw_lock_t)&mapCtl.mapclock); /* Unlock our stuff */ - splx(s); /* Restore 'rupts */ - - for(; allocsize > 0; allocsize >>= 1) { /* Try allocating in descending halves */ - retr = kmem_alloc_kobject(mapping_map, (vm_offset_t *)&mbn, PAGE_SIZE * allocsize); /* Find a virtual address to use */ - if((retr != KERN_SUCCESS) && (allocsize == 1)) { /* Did we find any memory at all? */ - break; - } - if(retr == KERN_SUCCESS) break; /* We got some memory, bail out... */ - } - - allocsize = allocsize * MAPPERBLOK; /* Convert pages to number of maps allocated */ - s = splhigh(); /* Don't bother from now on */ - if(!hw_lock_to((hw_lock_t)&mapCtl.mapclock, LockTimeOut)) { /* Lock the control header */ - panic("mapping_adjust - timeout getting control lock (2)\n"); /* Tell all and die */ - } - } - - if (retr != KERN_SUCCESS) - break; /* Fail to alocate, bail out... */ - for(; allocsize > 0; allocsize -= MAPPERBLOK) { /* Release one block at a time */ - mapping_free_init((vm_offset_t)mbn, 0, 1); /* Initialize a non-permanent block */ - mbn = (mappingblok_t *)((unsigned int)mbn + PAGE_SIZE); /* Point to the next slot */ - } - - if ((mapCtl.mapcinuse + mapCtl.mapcfree + (mapCtl.mapcreln * (MAPPERBLOK + 1))) > mapCtl.mapcmaxalloc) - mapCtl.mapcmaxalloc = mapCtl.mapcinuse + mapCtl.mapcfree + (mapCtl.mapcreln * (MAPPERBLOK + 1)); - } - - if(mapCtl.mapcholdoff) { /* Should we hold off this release? */ - mapCtl.mapcrecurse = 0; /* We are done now */ - hw_lock_unlock((hw_lock_t)&mapCtl.mapclock); /* Unlock our stuff */ - splx(s); /* Restore 'rupts */ - return; /* Return... 
*/ - } - - mbn = mapCtl.mapcrel; /* Get first pending release block */ - mapCtl.mapcrel = NULL; /* Dequeue them */ - mapCtl.mapcreln = 0; /* Set count to 0 */ - - hw_lock_unlock((hw_lock_t)&mapCtl.mapclock); /* Unlock our stuff */ - splx(s); /* Restore 'rupts */ - - while((unsigned int)mbn) { /* Toss 'em all */ - mb = mbn->nextblok; /* Get the next */ - - kmem_free(mapping_map, (vm_offset_t) mbn, PAGE_SIZE); /* Release this mapping block */ - - mbn = mb; /* Chain to the next */ - } - - __asm__ volatile("eieio"); /* Make sure all is well */ - mapCtl.mapcrecurse = 0; /* We are done now */ - return; -} - -/* - * mapping_free(mapping *mp) - release a mapping to the free list - * - * This routine takes a mapping and adds it to the free list. - * If this mapping make the block non-empty, we queue it to the free block list. - * NOTE: we might want to queue it to the end to keep quelch the pathalogical - * case when we get a mapping and free it repeatedly causing the block to chain and unchain. 
- * If this release fills a block and we are above the threshold, we release the block - */ - -void mapping_free(struct mapping *mp) { /* Release a mapping */ - - mappingblok_t *mb, *mbn; - spl_t s; - unsigned int full, mindx, lists; - - mindx = ((unsigned int)mp & (PAGE_SIZE - 1)) >> 6; /* Get index to mapping */ - mb = (mappingblok_t *)((unsigned int)mp & -PAGE_SIZE); /* Point to the mapping block */ - lists = (mp->mpFlags & mpLists); /* get #lists */ - if ((lists == 0) || (lists > kSkipListMaxLists)) /* panic if out of range */ - panic("mapping_free: mpLists invalid\n"); - -#if 0 - mp->mpFlags = 0x99999999; /* (BRINGUP) */ - mp->mpSpace = 0x9999; /* (BRINGUP) */ - mp->u.mpBSize = 0x9999; /* (BRINGUP) */ - mp->mpPte = 0x99999998; /* (BRINGUP) */ - mp->mpPAddr = 0x99999999; /* (BRINGUP) */ - mp->mpVAddr = 0x9999999999999999ULL; /* (BRINGUP) */ - mp->mpAlias = 0x9999999999999999ULL; /* (BRINGUP) */ - mp->mpList0 = 0x9999999999999999ULL; /* (BRINGUP) */ - mp->mpList[0] = 0x9999999999999999ULL; /* (BRINGUP) */ - mp->mpList[1] = 0x9999999999999999ULL; /* (BRINGUP) */ - mp->mpList[2] = 0x9999999999999999ULL; /* (BRINGUP) */ - - if(lists > mpBasicLists) { /* (BRINGUP) */ - mp->mpList[3] = 0x9999999999999999ULL; /* (BRINGUP) */ - mp->mpList[4] = 0x9999999999999999ULL; /* (BRINGUP) */ - mp->mpList[5] = 0x9999999999999999ULL; /* (BRINGUP) */ - mp->mpList[6] = 0x9999999999999999ULL; /* (BRINGUP) */ - mp->mpList[7] = 0x9999999999999999ULL; /* (BRINGUP) */ - mp->mpList[8] = 0x9999999999999999ULL; /* (BRINGUP) */ - mp->mpList[9] = 0x9999999999999999ULL; /* (BRINGUP) */ - mp->mpList[10] = 0x9999999999999999ULL; /* (BRINGUP) */ - } -#endif - - - s = splhigh(); /* Don't bother from now on */ - if(!hw_lock_to((hw_lock_t)&mapCtl.mapclock, LockTimeOut)) { /* Lock the control header */ - panic("mapping_free - timeout getting control lock\n"); /* Tell all and die */ - } - - full = !(mb->mapblokfree[0] | mb->mapblokfree[1]); /* See if full now */ - mb->mapblokfree[mindx >> 5] |= 
(0x80000000 >> (mindx & 31)); /* Flip on the free bit */ - if ( lists > mpBasicLists ) { /* if big block, lite the 2nd bit too */ - mindx++; - mb->mapblokfree[mindx >> 5] |= (0x80000000 >> (mindx & 31)); - mapCtl.mapcfree++; - mapCtl.mapcinuse--; - } - - if(full) { /* If it was full before this: */ - mb->nextblok = mapCtl.mapcnext; /* Move head of list to us */ - mapCtl.mapcnext = mb; /* Chain us to the head of the list */ - if(!((unsigned int)mapCtl.mapclast)) - mapCtl.mapclast = mb; - } - - mapCtl.mapcfree++; /* Bump free count */ - mapCtl.mapcinuse--; /* Decriment in use count */ - - mapCtl.mapcfreec++; /* Count total calls */ - - if(mapCtl.mapcfree > mapCtl.mapcmin) { /* Should we consider releasing this? */ - if(((mb->mapblokfree[0] | 0x80000000) & mb->mapblokfree[1]) == 0xFFFFFFFF) { /* See if empty now */ - - if(mapCtl.mapcnext == mb) { /* Are we first on the list? */ - mapCtl.mapcnext = mb->nextblok; /* Unchain us */ - if(!((unsigned int)mapCtl.mapcnext)) mapCtl.mapclast = NULL; /* If last, remove last */ - } - else { /* We're not first */ - for(mbn = mapCtl.mapcnext; mbn != 0; mbn = mbn->nextblok) { /* Search for our block */ - if(mbn->nextblok == mb) break; /* Is the next one our's? */ - } - if(!mbn) panic("mapping_free: attempt to release mapping block (%p) not on list\n", mp); - mbn->nextblok = mb->nextblok; /* Dequeue us */ - if(mapCtl.mapclast == mb) mapCtl.mapclast = mbn; /* If last, make our predecessor last */ - } - - if(mb->mapblokflags & mbPerm) { /* Is this permanently assigned? 
*/ - mb->nextblok = mapCtl.mapcnext; /* Move chain head to us */ - mapCtl.mapcnext = mb; /* Chain us to the head */ - if(!((unsigned int)mb->nextblok)) mapCtl.mapclast = mb; /* If last, make us so */ - } - else { - mapCtl.mapcfree -= MAPPERBLOK; /* Remove the block from the free count */ - mapCtl.mapcreln++; /* Count on release list */ - mb->nextblok = mapCtl.mapcrel; /* Move pointer */ - mapCtl.mapcrel = mb; /* Chain us in front */ - } - } - } - - if(mapCtl.mapcreln > MAPFRTHRSH) { /* Do we have way too many releasable mappings? */ - if(hw_compare_and_store(0, 1, &mapCtl.mapcrecurse)) { /* Make sure we aren't recursing */ - thread_call_enter(mapping_adjust_call); /* Go toss some */ - } - } - hw_lock_unlock((hw_lock_t)&mapCtl.mapclock); /* Unlock our stuff */ - splx(s); /* Restore 'rupts */ - - return; /* Bye, dude... */ -} - - -/* - * mapping_alloc(lists) - obtain a mapping from the free list - * - * This routine takes a mapping off of the free list and returns its address. - * The mapping is zeroed, and its mpLists count is set. The caller passes in - * the number of skiplists it would prefer; if this number is greater than - * mpBasicLists (ie, 4) then we need to allocate a 128-byte mapping, which is - * just two consequtive free entries coallesced into one. If we cannot find - * two consequtive free entries, we clamp the list count down to mpBasicLists - * and return a basic 64-byte node. Our caller never knows the difference. - * - * If this allocation empties a block, we remove it from the free list. - * If this allocation drops the total number of free entries below a threshold, - * we allocate a new block. 
- * - */ -decl_simple_lock_data(extern,free_pmap_lock) - -mapping_t * -mapping_alloc(int lists) { /* Obtain a mapping */ - - register mapping_t *mp; - mappingblok_t *mb, *mbn; - spl_t s; - int mindx; - int big = (lists > mpBasicLists); /* set flag if big block req'd */ - pmap_t refpmap, ckpmap; - unsigned int space, i; - addr64_t va, nextva; - boolean_t found_mapping; - boolean_t do_rescan; - - s = splhigh(); /* Don't bother from now on */ - if(!hw_lock_to((hw_lock_t)&mapCtl.mapclock, LockTimeOut)) { /* Lock the control header */ - panic("mapping_alloc - timeout getting control lock\n"); /* Tell all and die */ - } - - if(!((unsigned int)mapCtl.mapcnext)) { /* Are there any free mappings? */ - -/* - * No free mappings. First, there may be some mapping blocks on the "to be released" - * list. If so, rescue one. Otherwise, try to steal a couple blocks worth. - */ - - if((mbn = mapCtl.mapcrel) != 0) { /* Try to rescue a block from impending doom */ - mapCtl.mapcrel = mbn->nextblok; /* Pop the queue */ - mapCtl.mapcreln--; /* Back off the count */ - mapping_free_init((vm_offset_t)mbn, 0, 1); /* Initialize a non-permanent block */ - goto rescued; - } - - hw_lock_unlock((hw_lock_t)&mapCtl.mapclock); - - simple_lock(&free_pmap_lock); - - if(!hw_lock_to((hw_lock_t)&mapCtl.mapclock, LockTimeOut)) { /* Lock the control header */ - panic("mapping_alloc - timeout getting control lock\n"); /* Tell all and die */ - } - - if (!((unsigned int)mapCtl.mapcnext)) { - - refpmap = (pmap_t)cursor_pmap->pmap_link.next; - space = mapCtl.mapcflush.spacenum; - while (refpmap != cursor_pmap) { - if(((pmap_t)(refpmap->pmap_link.next))->spaceNum > space) break; - refpmap = (pmap_t)refpmap->pmap_link.next; - } - - ckpmap = refpmap; - va = mapCtl.mapcflush.addr; - found_mapping = FALSE; - - while (mapCtl.mapcfree <= (MAPPERBLOK*2)) { - - hw_lock_unlock((hw_lock_t)&mapCtl.mapclock); - - ckpmap = (pmap_t)ckpmap->pmap_link.next; - - /* We don't steal mappings from the kernel pmap, a VMM host pmap, 
or a VMM guest pmap with guest - shadow assist active. - */ - if ((ckpmap->stats.resident_count != 0) && (ckpmap != kernel_pmap) - && !(ckpmap->pmapFlags & (pmapVMgsaa|pmapVMhost))) { - do_rescan = TRUE; - for (i=0;i<8;i++) { - mp = hw_purge_map(ckpmap, va, &nextva); - - switch ((unsigned int)mp & mapRetCode) { - case mapRtOK: - mapping_free(mp); - found_mapping = TRUE; - break; - case mapRtNotFnd: - break; - default: - panic("mapping_alloc: hw_purge_map failed - pmap = %p, va = %16llX, code = %p\n", ckpmap, va, mp); - break; - } - - if (mapRtNotFnd == ((unsigned int)mp & mapRetCode)) { - if (do_rescan) - do_rescan = FALSE; - else - break; - } - - va = nextva; - } - } - - if (ckpmap == refpmap) { - if (found_mapping == FALSE) - panic("no valid pmap to purge mappings\n"); - else - found_mapping = FALSE; - } - - if(!hw_lock_to((hw_lock_t)&mapCtl.mapclock, LockTimeOut)) { /* Lock the control header */ - panic("mapping_alloc - timeout getting control lock\n"); /* Tell all and die */ - } - - } - - mapCtl.mapcflush.spacenum = ckpmap->spaceNum; - mapCtl.mapcflush.addr = nextva; - } - - simple_unlock(&free_pmap_lock); - } - -rescued: - - mb = mapCtl.mapcnext; - - if ( big ) { /* if we need a big (128-byte) mapping */ - mapCtl.mapcbig++; /* count attempts to allocate a big mapping */ - mbn = NULL; /* this will be prev ptr */ - mindx = 0; - while( mb ) { /* loop over mapping blocks with free entries */ - mindx = mapalc2(mb); /* try for 2 consequtive free bits in this block */ - - if ( mindx ) break; /* exit loop if we found them */ - mbn = mb; /* remember previous block */ - mb = mb->nextblok; /* move on to next block */ - } - if ( mindx == 0 ) { /* if we couldn't find 2 consequtive bits... 
*/ - mapCtl.mapcbigfails++; /* count failures */ - big = 0; /* forget that we needed a big mapping */ - lists = mpBasicLists; /* clamp list count down to the max in a 64-byte mapping */ - mb = mapCtl.mapcnext; /* back to the first block with a free entry */ - } - else { /* if we did find a big mapping */ - mapCtl.mapcfree--; /* Decrement free count twice */ - mapCtl.mapcinuse++; /* Bump in use count twice */ - if ( mindx < 0 ) { /* if we just used the last 2 free bits in this block */ - if (mbn) { /* if this wasn't the first block */ - mindx = -mindx; /* make positive */ - mbn->nextblok = mb->nextblok; /* unlink this one from the middle of block list */ - if (mb == mapCtl.mapclast) { /* if we emptied last block */ - mapCtl.mapclast = mbn; /* then prev block is now last */ - } - } - } - } - } - - if ( !big ) { /* if we need a small (64-byte) mapping */ - if(!(mindx = mapalc1(mb))) /* Allocate a 1-bit slot */ - panic("mapping_alloc - empty mapping block detected at %p\n", mb); - } - - if(mindx < 0) { /* Did we just take the last one */ - mindx = -mindx; /* Make positive */ - mapCtl.mapcnext = mb->nextblok; /* Remove us from the list */ - if(!((unsigned int)mapCtl.mapcnext)) mapCtl.mapclast = NULL; /* Removed the last one */ - } - - mapCtl.mapcfree--; /* Decrement free count */ - mapCtl.mapcinuse++; /* Bump in use count */ - - mapCtl.mapcallocc++; /* Count total calls */ - -/* - * Note: in the following code, we will attempt to rescue blocks only one at a time. - * Eventually, after a few more mapping_alloc calls, we will catch up. If there are none - * rescueable, we will kick the misc scan who will allocate some for us. We only do this - * if we haven't already done it. - * For early boot, we are set up to only rescue one block at a time. This is because we prime - * the release list with as much as we need until threads start. 
- */ - - if(mapCtl.mapcfree < mapCtl.mapcmin) { /* See if we need to replenish */ - if((mbn = mapCtl.mapcrel) != 0) { /* Try to rescue a block from impending doom */ - mapCtl.mapcrel = mbn->nextblok; /* Pop the queue */ - mapCtl.mapcreln--; /* Back off the count */ - mapping_free_init((vm_offset_t)mbn, 0, 1); /* Initialize a non-permanent block */ - } - else { /* We need to replenish */ - if (mapCtl.mapcfree < (mapCtl.mapcmin / 4)) { - if(hw_compare_and_store(0, 1, &mapCtl.mapcrecurse)) { /* Make sure we aren't recursing */ - thread_call_enter(mapping_adjust_call); /* Go allocate some more */ - } - } - } - } - - hw_lock_unlock((hw_lock_t)&mapCtl.mapclock); /* Unlock our stuff */ - splx(s); /* Restore 'rupts */ - - mp = &((mapping_t *)mb)[mindx]; /* Point to the allocated mapping */ - mp->mpFlags = lists; /* set the list count */ - - - return mp; /* Send it back... */ -} - - -void -consider_mapping_adjust(void) -{ - spl_t s; - - s = splhigh(); /* Don't bother from now on */ - if(!hw_lock_to((hw_lock_t)&mapCtl.mapclock, LockTimeOut)) { /* Lock the control header */ - panic("consider_mapping_adjust -- lock timeout\n"); - } - - if (mapCtl.mapcfree < (mapCtl.mapcmin / 4)) { - if(hw_compare_and_store(0, 1, &mapCtl.mapcrecurse)) { /* Make sure we aren't recursing */ - thread_call_enter(mapping_adjust_call); /* Go allocate some more */ - } - } - - hw_lock_unlock((hw_lock_t)&mapCtl.mapclock); /* Unlock our stuff */ - splx(s); /* Restore 'rupts */ - -} - - - -/* - * void mapping_free_init(mb, perm) - Adds a block of storage to the free mapping list - * - * The mapping block is a page size area on a page boundary. It contains 1 header and 63 - * mappings. This call adds and initializes a block for use. Mappings come in two sizes, - * 64 and 128 bytes (the only difference is the number of skip-lists.) When we allocate a - * 128-byte mapping we just look for two consequtive free 64-byte mappings, so most of the - * code only deals with "basic" 64-byte mappings. 
This works for two reasons: - * - Only one in 256 mappings is big, so they are rare. - * - If we cannot find two consequtive free mappings, we just return a small one. - * There is no problem with doing this, except a minor performance degredation. - * Therefore, all counts etc in the mapping control structure are in units of small blocks. - * - * The header contains a chain link, bit maps, a virtual to real translation mask, and - * some statistics. Bit maps map each slot on the page (bit 0 is not used because it - * corresponds to the header). The translation mask is the XOR of the virtual and real - * addresses (needless to say, the block must be wired). - * - * We handle these mappings the same way as saveareas: the block is only on the chain so - * long as there are free entries in it. - * - * Empty blocks are garbage collected when there are at least mapCtl.mapcmin pages worth of free - * mappings. Blocks marked PERM won't ever be released. - * - * If perm is negative, the mapping is initialized, but immediately queued to the mapCtl.mapcrel - * list. We do this only at start up time. This is done because we only allocate blocks - * in the pageout scan and it doesn't start up until after we run out of the initial mappings. - * Therefore, we need to preallocate a bunch, but we don't want them to be permanent. If we put - * them on the release queue, the allocate routine will rescue them. Then when the - * pageout scan starts, all extra ones will be released. - * - */ - - -void mapping_free_init(vm_offset_t mbl, int perm, boolean_t locked) { - /* Set's start and end of a block of mappings - perm indicates if the block can be released - or goes straight to the release queue . 
- locked indicates if the lock is held already */ - - mappingblok_t *mb; - spl_t s; - addr64_t raddr; - ppnum_t pp; - - mb = (mappingblok_t *)mbl; /* Start of area */ - - if(perm >= 0) { /* See if we need to initialize the block */ - if(perm) { - raddr = (addr64_t)((unsigned int)mbl); /* Perm means V=R */ - mb->mapblokflags = mbPerm; /* Set perm */ -// mb->mapblokflags |= (unsigned int)mb; /* (BRINGUP) */ - } - else { - pp = pmap_find_phys(kernel_pmap, (addr64_t)mbl); /* Get the physical page */ - if(!pp) { /* What gives? Where's the page? */ - panic("mapping_free_init: could not find translation for vaddr %016llX\n", (addr64_t)mbl); - } - - raddr = (addr64_t)pp << 12; /* Convert physical page to physical address */ - mb->mapblokflags = 0; /* Set not perm */ -// mb->mapblokflags |= (unsigned int)mb; /* (BRINGUP) */ - } - - mb->mapblokvrswap = raddr ^ (addr64_t)((unsigned int)mbl); /* Form translation mask */ - - mb->mapblokfree[0] = 0x7FFFFFFF; /* Set first 32 (minus 1) free */ - mb->mapblokfree[1] = 0xFFFFFFFF; /* Set next 32 free */ - } - - s = splhigh(); /* Don't bother from now on */ - if(!locked) { /* Do we need the lock? */ - if(!hw_lock_to((hw_lock_t)&mapCtl.mapclock, LockTimeOut)) { /* Lock the control header */ - panic("mapping_free_init: timeout getting control lock\n"); /* Tell all and die */ - } - } - - if(perm < 0) { /* Direct to release queue? */ - mb->nextblok = mapCtl.mapcrel; /* Move forward pointer */ - mapCtl.mapcrel = mb; /* Queue us on in */ - mapCtl.mapcreln++; /* Count the free block */ - } - else { /* Add to the free list */ - - mb->nextblok = NULL; /* We always add to the end */ - mapCtl.mapcfree += MAPPERBLOK; /* Bump count */ - - if(!((unsigned int)mapCtl.mapcnext)) { /* First entry on list? 
*/ - mapCtl.mapcnext = mapCtl.mapclast = mb; /* Chain to us */ - } - else { /* We are not the first */ - mapCtl.mapclast->nextblok = mb; /* Point the last to us */ - mapCtl.mapclast = mb; /* We are now last */ - } - } - - if(!locked) { /* Do we need to unlock? */ - hw_lock_unlock((hw_lock_t)&mapCtl.mapclock); /* Unlock our stuff */ - } - - splx(s); /* Restore 'rupts */ - return; /* All done, leave... */ -} - - -/* - * void mapping_prealloc(unsigned int) - Preallocates mapppings for large request - * - * No locks can be held, because we allocate memory here. - * This routine needs a corresponding mapping_relpre call to remove the - * hold off flag so that the adjust routine will free the extra mapping - * blocks on the release list. I don't like this, but I don't know - * how else to do this for now... - * - */ - -void mapping_prealloc(unsigned int size) { /* Preallocates mapppings for large request */ - - int nmapb, i; - kern_return_t retr; - mappingblok_t *mbn; - spl_t s; - - s = splhigh(); /* Don't bother from now on */ - if(!hw_lock_to((hw_lock_t)&mapCtl.mapclock, LockTimeOut)) { /* Lock the control header */ - panic("mapping_prealloc - timeout getting control lock\n"); /* Tell all and die */ - } - - nmapb = (size >> 12) + mapCtl.mapcmin; /* Get number of entries needed for this and the minimum */ - - mapCtl.mapcholdoff++; /* Bump the hold off count */ - - if((nmapb = (nmapb - mapCtl.mapcfree)) <= 0) { /* Do we already have enough? 
*/ - hw_lock_unlock((hw_lock_t)&mapCtl.mapclock); /* Unlock our stuff */ - splx(s); /* Restore 'rupts */ - return; - } - if (!hw_compare_and_store(0, 1, &mapCtl.mapcrecurse)) { /* Make sure we aren't recursing */ - hw_lock_unlock((hw_lock_t)&mapCtl.mapclock); /* Unlock our stuff */ - splx(s); /* Restore 'rupts */ - return; - } - nmapb = (nmapb + MAPPERBLOK - 1) / MAPPERBLOK; /* Get number of blocks to get */ - - hw_lock_unlock((hw_lock_t)&mapCtl.mapclock); /* Unlock our stuff */ - splx(s); /* Restore 'rupts */ - - for(i = 0; i < nmapb; i++) { /* Allocate 'em all */ - retr = kmem_alloc_kobject(mapping_map, (vm_offset_t *)&mbn, PAGE_SIZE); /* Find a virtual address to use */ - if(retr != KERN_SUCCESS) /* Did we get some memory? */ - break; - mapping_free_init((vm_offset_t)mbn, -1, 0); /* Initialize on to the release queue */ - } - if ((mapCtl.mapcinuse + mapCtl.mapcfree + (mapCtl.mapcreln * (MAPPERBLOK + 1))) > mapCtl.mapcmaxalloc) - mapCtl.mapcmaxalloc = mapCtl.mapcinuse + mapCtl.mapcfree + (mapCtl.mapcreln * (MAPPERBLOK + 1)); - - mapCtl.mapcrecurse = 0; /* We are done now */ -} - -/* - * void mapping_relpre(void) - Releases preallocation release hold off - * - * This routine removes the - * hold off flag so that the adjust routine will free the extra mapping - * blocks on the release list. I don't like this, but I don't know - * how else to do this for now... 
- * - */ - -void mapping_relpre(void) { /* Releases release hold off */ - - spl_t s; - - s = splhigh(); /* Don't bother from now on */ - if(!hw_lock_to((hw_lock_t)&mapCtl.mapclock, LockTimeOut)) { /* Lock the control header */ - panic("mapping_relpre - timeout getting control lock\n"); /* Tell all and die */ - } - if(--mapCtl.mapcholdoff < 0) { /* Back down the hold off count */ - panic("mapping_relpre: hold-off count went negative\n"); - } - - hw_lock_unlock((hw_lock_t)&mapCtl.mapclock); /* Unlock our stuff */ - splx(s); /* Restore 'rupts */ -} - -/* - * void mapping_free_prime(void) - Primes the mapping block release list - * - * See mapping_free_init. - * No locks can be held, because we allocate memory here. - * One processor running only. - * - */ - -void mapping_free_prime(void) { /* Primes the mapping block release list */ - - int nmapb, i; - kern_return_t retr; - mappingblok_t *mbn; - vm_offset_t mapping_min; - - retr = kmem_suballoc(kernel_map, &mapping_min, sane_size / 16, - FALSE, VM_FLAGS_ANYWHERE, &mapping_map); - - if (retr != KERN_SUCCESS) - panic("mapping_free_prime: kmem_suballoc failed"); - - - nmapb = (mapCtl.mapcfree + mapCtl.mapcinuse + MAPPERBLOK - 1) / MAPPERBLOK; /* Get permanent allocation */ - nmapb = nmapb * 4; /* Get 4 times our initial allocation */ - -#if DEBUG - kprintf("mapping_free_prime: free = %08X; in use = %08X; priming = %08X\n", - mapCtl.mapcfree, mapCtl.mapcinuse, nmapb); -#endif - - for(i = 0; i < nmapb; i++) { /* Allocate 'em all */ - retr = kmem_alloc_kobject(mapping_map, (vm_offset_t *)&mbn, PAGE_SIZE); /* Find a virtual address to use */ - if(retr != KERN_SUCCESS) { /* Did we get some memory? */ - panic("Whoops... 
Not a bit of wired memory left for anyone\n"); - } - mapping_free_init((vm_offset_t)mbn, -1, 0); /* Initialize onto release queue */ - } - if ((mapCtl.mapcinuse + mapCtl.mapcfree + (mapCtl.mapcreln * (MAPPERBLOK + 1))) > mapCtl.mapcmaxalloc) - mapCtl.mapcmaxalloc = mapCtl.mapcinuse + mapCtl.mapcfree + (mapCtl.mapcreln * (MAPPERBLOK + 1)); -} - - -void -mapping_fake_zone_info(int *count, vm_size_t *cur_size, vm_size_t *max_size, vm_size_t *elem_size, - vm_size_t *alloc_size, int *collectable, int *exhaustable) -{ - *count = mapCtl.mapcinuse; - *cur_size = ((PAGE_SIZE / (MAPPERBLOK + 1)) * (mapCtl.mapcinuse + mapCtl.mapcfree)) + (PAGE_SIZE * mapCtl.mapcreln); - *max_size = (PAGE_SIZE / (MAPPERBLOK + 1)) * mapCtl.mapcmaxalloc; - *elem_size = (PAGE_SIZE / (MAPPERBLOK + 1)); - *alloc_size = PAGE_SIZE; - - *collectable = 1; - *exhaustable = 0; -} - - -/* - * addr64_t mapping_p2v(pmap_t pmap, ppnum_t pa) - Finds first virtual mapping of a physical page in a space - * - * First looks up the physical entry associated witht the physical page. Then searches the alias - * list for a matching pmap. It grabs the virtual address from the mapping, drops busy, and returns - * that. - * - */ - -addr64_t mapping_p2v(pmap_t pmap, ppnum_t pa) { /* Finds first virtual mapping of a physical page in a space */ - - spl_t s; - mapping_t *mp; - unsigned int pindex; - phys_entry_t *physent; - addr64_t va; - - physent = mapping_phys_lookup(pa, &pindex); /* Get physical entry */ - if(!physent) { /* Did we find the physical page? */ - panic("mapping_p2v: invalid physical page %08X\n", pa); - } - - s = splhigh(); /* Make sure interruptions are disabled */ - - mp = hw_find_space(physent, pmap->space); /* Go find the first mapping to the page from the requested pmap */ - - if(mp) { /* Did we find one? 
*/ - va = mp->mpVAddr & -4096; /* If so, get the cleaned up vaddr */ - mapping_drop_busy(mp); /* Go ahead and relase the mapping now */ - } - else va = 0; /* Return failure */ - - splx(s); /* Restore 'rupts */ - - return va; /* Bye, bye... */ - -} - - -/* - * kvtophys(addr) - * - * Convert a kernel virtual address to a physical address - */ -addr64_t kvtophys(vm_offset_t va) { - - return pmap_extract(kernel_pmap, va); /* Find mapping and lock the physical entry for this mapping */ - -} - -/* - * kvtophys64(addr) - * - * Convert a kernel virtual address to a 64-bit physical address - */ -vm_map_offset_t kvtophys64(vm_map_offset_t va) { - - ppnum_t pa = pmap_find_phys(kernel_pmap, (addr64_t)va); - - if (!pa) - return 0; - return (((vm_map_offset_t)pa) << 12) | (va & 0xfff); - -} - -/* - * void ignore_zero_fault(boolean_t) - Sets up to ignore or honor any fault on - * page 0 access for the current thread. - * - * If parameter is TRUE, faults are ignored - * If parameter is FALSE, faults are honored - * - */ - -void ignore_zero_fault(boolean_t type) { /* Sets up to ignore or honor any fault on page 0 access for the current thread */ - - if(type) current_thread()->machine.specFlags |= ignoreZeroFault; /* Ignore faults on page 0 */ - else current_thread()->machine.specFlags &= ~ignoreZeroFault; /* Honor faults on page 0 */ - - return; /* Return the result or 0... */ -} - -/* - * no-op in current ppc implementation - */ -void inval_copy_windows(__unused thread_t th) -{ -} - - -/* - * Copies data between a physical page and a virtual page, or 2 physical. This is used to - * move data from the kernel to user state. Note that the "which" parm - * says which of the parameters is physical and if we need to flush sink/source. - * Note that both addresses may be physical, but only one may be virtual. - * - * The rules are that the size can be anything. Either address can be on any boundary - * and span pages. The physical data must be contiguous as must the virtual. 
- * - * We can block when we try to resolve the virtual address at each page boundary. - * We don't check protection on the physical page. - * - * Note that we will not check the entire range and if a page translation fails, - * we will stop with partial contents copied. - * - */ - -kern_return_t -hw_copypv_32(addr64_t source, addr64_t sink, unsigned int size, int which) -{ - vm_map_t map; - kern_return_t ret; - addr64_t nextva, vaddr = 0, paddr; - mapping_t *mp = NULL; - spl_t s; - unsigned int lop, csize; - int needtran, bothphys; - unsigned int pindex; - phys_entry_t *physent; - vm_prot_t prot = 0; - int orig_which; - - orig_which = which; - - map = (which & cppvKmap) ? kernel_map : current_map_fast(); - - if((which & (cppvPsrc | cppvPsnk)) == 0 ) { /* Make sure that only one is virtual */ - panic("copypv: no more than 1 parameter may be virtual\n"); /* Not allowed */ - } - - bothphys = 1; /* Assume both are physical */ - - if(!(which & cppvPsnk)) { /* Is sink page virtual? */ - vaddr = sink; /* Sink side is virtual */ - bothphys = 0; /* Show both aren't physical */ - prot = VM_PROT_READ | VM_PROT_WRITE; /* Sink always must be read/write */ - } else if (!(which & cppvPsrc)) { /* Is source page virtual? */ - vaddr = source; /* Source side is virtual */ - bothphys = 0; /* Show both aren't physical */ - prot = VM_PROT_READ; /* Virtual source is always read only */ - } - - needtran = 1; /* Show we need to map the virtual the first time */ - s = splhigh(); /* Don't bother me */ - - while(size) { - - if(!bothphys && (needtran || !(vaddr & 4095LL))) { /* If first time or we stepped onto a new page, we need to translate */ - if(!needtran) { /* If this is not the first translation, we need to drop the old busy */ - mapping_drop_busy(mp); /* Release the old mapping now */ - } - needtran = 0; - - while(1) { - mp = mapping_find(map->pmap, vaddr, &nextva, 1); /* Find and busy the mapping */ - if(!mp) { /* Was it there? 
*/ - if(getPerProc()->istackptr == 0) - panic("copypv: No vaild mapping on memory %s %16llx", "RD", vaddr); - - splx(s); /* Restore the interrupt level */ - ret = vm_fault(map, vm_map_trunc_page(vaddr), prot, FALSE, THREAD_UNINT, NULL, 0); /* Didn't find it, try to fault it in... */ - - if(ret != KERN_SUCCESS)return KERN_FAILURE; /* Didn't find any, return no good... */ - - s = splhigh(); /* Don't bother me */ - continue; /* Go try for the map again... */ - - } - if (mp->mpVAddr & mpI) { /* cache inhibited, so force the appropriate page to be flushed before */ - if (which & cppvPsrc) /* and after the copy to avoid cache paradoxes */ - which |= cppvFsnk; - else - which |= cppvFsrc; - } else - which = orig_which; - - /* Note that we have to have the destination writable. So, if we already have it, or we are mapping the source, - we can just leave. - */ - if((which & cppvPsnk) || !(mp->mpVAddr & 1)) break; /* We got it mapped R/W or the source is not virtual, leave... */ - - mapping_drop_busy(mp); /* Go ahead and release the mapping for now */ - if(getPerProc()->istackptr == 0) - panic("copypv: No vaild mapping on memory %s %16llx", "RDWR", vaddr); - splx(s); /* Restore the interrupt level */ - - ret = vm_fault(map, vm_map_trunc_page(vaddr), VM_PROT_READ | VM_PROT_WRITE, FALSE, THREAD_UNINT, NULL, 0); /* check for a COW area */ - if (ret != KERN_SUCCESS) return KERN_FAILURE; /* We couldn't get it R/W, leave in disgrace... */ - s = splhigh(); /* Don't bother me */ - } - paddr = ((addr64_t)mp->mpPAddr << 12) + (vaddr - (mp->mpVAddr & -4096LL)); /* construct the physical address... 
this calculation works */ - /* properly on both single page and block mappings */ - if(which & cppvPsrc) sink = paddr; /* If source is physical, then the sink is virtual */ - else source = paddr; /* Otherwise the source is */ - } - - lop = (unsigned int)(4096LL - (sink & 4095LL)); /* Assume sink smallest */ - if(lop > (unsigned int)(4096LL - (source & 4095LL))) lop = (unsigned int)(4096LL - (source & 4095LL)); /* No, source is smaller */ - - csize = size; /* Assume we can copy it all */ - if(lop < size) csize = lop; /* Nope, we can't do it all */ - - if(which & cppvFsrc) flush_dcache64(source, csize, 1); /* If requested, flush source before move */ - if(which & cppvFsnk) flush_dcache64(sink, csize, 1); /* If requested, flush sink before move */ - - bcopy_physvir_32(source, sink, csize); /* Do a physical copy, virtually */ - - if(which & cppvFsrc) flush_dcache64(source, csize, 1); /* If requested, flush source after move */ - if(which & cppvFsnk) flush_dcache64(sink, csize, 1); /* If requested, flush sink after move */ - -/* - * Note that for certain ram disk flavors, we may be copying outside of known memory. - * Therefore, before we try to mark it modifed, we check if it exists. 
- */ - - if( !(which & cppvNoModSnk)) { - physent = mapping_phys_lookup(sink >> 12, &pindex); /* Get physical entry for sink */ - if(physent) mapping_set_mod((ppnum_t)(sink >> 12)); /* Make sure we know that it is modified */ - } - if( !(which & cppvNoRefSrc)) { - physent = mapping_phys_lookup(source >> 12, &pindex); /* Get physical entry for source */ - if(physent) mapping_set_ref((ppnum_t)(source >> 12)); /* Make sure we know that it is modified */ - } - size = size - csize; /* Calculate what is left */ - vaddr = vaddr + csize; /* Move to next sink address */ - source = source + csize; /* Bump source to next physical address */ - sink = sink + csize; /* Bump sink to next physical address */ - } - - if(!bothphys) mapping_drop_busy(mp); /* Go ahead and release the mapping of the virtual page if any */ - splx(s); /* Open up for interrupts */ - - return KERN_SUCCESS; -} - - -/* - * Debug code - */ - -void mapping_verify(void) { - - spl_t s; - mappingblok_t *mb, *mbn; - unsigned int relncnt; - unsigned int dumbodude; - - dumbodude = 0; - - s = splhigh(); /* Don't bother from now on */ - - mbn = NULL; /* Start with none */ - for(mb = mapCtl.mapcnext; mb; mb = mb->nextblok) { /* Walk the free chain */ - if((mappingblok_t *)(mb->mapblokflags & 0x7FFFFFFF) != mb) { /* Is tag ok? */ - panic("mapping_verify: flags tag bad, free chain; mb = %p, tag = %08X\n", mb, mb->mapblokflags); - } - mbn = mb; /* Remember the last one */ - } - - if(mapCtl.mapcnext && (mapCtl.mapclast != mbn)) { /* Do we point to the last one? */ - panic("mapping_verify: last pointer bad; mb = %p, mapclast = %p\n", mb, mapCtl.mapclast); - } - - relncnt = 0; /* Clear count */ - for(mb = mapCtl.mapcrel; mb; mb = mb->nextblok) { /* Walk the release chain */ - dumbodude |= mb->mapblokflags; /* Just touch it to make sure it is mapped */ - relncnt++; /* Count this one */ - } - - if(mapCtl.mapcreln != relncnt) { /* Is the count on release queue ok? 
*/ - panic("mapping_verify: bad release queue count; mapcreln = %d, cnt = %d, ignore this = %08X\n", mapCtl.mapcreln, relncnt, dumbodude); - } - - splx(s); /* Restore 'rupts */ - - return; -} - -void mapping_phys_unused(ppnum_t pa) { - - unsigned int pindex; - phys_entry_t *physent; - - physent = mapping_phys_lookup(pa, &pindex); /* Get physical entry */ - if(!physent) return; /* Did we find the physical page? */ - - if(!(physent->ppLink & ~(ppLock | ppFlags))) return; /* No one else is here */ - - panic("mapping_phys_unused: physical page (%08X) in use, physent = %p\n", pa, physent); - -} - -void -mapping_hibernate_flush(void) -{ - unsigned int page, bank; - struct phys_entry * entry; - - for (bank = 0; bank < pmap_mem_regions_count; bank++) - { - entry = (struct phys_entry *) pmap_mem_regions[bank].mrPhysTab; - for (page = pmap_mem_regions[bank].mrStart; page <= pmap_mem_regions[bank].mrEnd; page++) - { - hw_walk_phys(entry, hwpNoop, hwpNoop, hwpNoop, 0, hwpPurgePTE); - entry++; - } - } -} - - - - - - diff --git a/osfmk/ppc/mappings.h b/osfmk/ppc/mappings.h deleted file mode 100644 index 0777f1b6f..000000000 --- a/osfmk/ppc/mappings.h +++ /dev/null @@ -1,499 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. 
- * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Header files for the hardware virtual memory mapping stuff - */ -#ifdef XNU_KERNEL_PRIVATE - -#ifndef _PPC_MAPPINGS_H_ -#define _PPC_MAPPINGS_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Don't change these structures unless you change the assembly code - */ - -/* - * This control block serves as anchor for all virtual mappings of the same physical - * page, i.e., aliases. There is a table for each bank (mem_region). All tables - * must reside in V=R storage and within the first 2GB of memory. Also, the - * mappings to which it points must be on at least a 64-byte boundary. These - * requirements allow a total of 2 bits for status and flags, and allow all address - * calculations to be 32-bit. 
- */ - -#pragma pack(4) /* Make sure the structure stays as we defined it */ -typedef struct phys_entry { - addr64_t ppLink; /* Physical pointer to aliased mappings and flags */ -#define ppLock 0x8000000000000000LL /* Lock for alias chain */ -#define ppFlags 0x700000000000000FLL /* Status and flags */ -#define ppI 0x2000000000000000LL /* Cache inhibited */ -#define ppIb 2 /* Cache inhibited */ -#define ppG 0x1000000000000000LL /* Guarded */ -#define ppGb 3 /* Guarded */ -#define ppR 0x0000000000000008LL /* Referenced */ -#define ppRb 60 /* Referenced */ -#define ppC 0x0000000000000004LL /* Changed */ -#define ppCb 61 /* Changed */ - -/* The lock, attribute, and flag bits are arranged so that their positions may be - * described by a contiguous mask of one bits wrapping from bit postion 63 to 0. - * In assembly language, we can then rapidly produce this mask with: - * li r0,ppLFAmask ; r0 <- 0x00000000000000FF - * rotrdi r0,r0,ppLFArrot ; r0 <- 0xF00000000000000F - */ -#define ppLFAmask 0x00FF /* One bit for each lock, attr, or flag bit */ -#define ppLFArrot 4 /* Right-rotate count to obtain 64-bit mask */ -} phys_entry_t; -#pragma pack() -#define physEntrySize sizeof(phys_entry_t) - -/* Memory may be non-contiguous. This data structure contains info - * for mapping this non-contiguous space into the contiguous - * physical->virtual mapping tables. An array of this type is - * provided to the pmap system at bootstrap by ppc_vm_init. 
- * - */ - -#pragma pack(4) /* Make sure the structure stays as we defined it */ -typedef struct mem_region { - phys_entry_t *mrPhysTab; /* Base of region table */ - ppnum_t mrStart; /* Start of region */ - ppnum_t mrEnd; /* Last page in region */ - ppnum_t mrAStart; /* Next page in region to allocate */ - ppnum_t mrAEnd; /* Last page in region to allocate */ -} mem_region_t; -#pragma pack() - -#define mrSize sizeof(mem_region_t) -#define PMAP_MEM_REGION_MAX 11 - -extern mem_region_t pmap_mem_regions[PMAP_MEM_REGION_MAX + 1]; -extern unsigned int pmap_mem_regions_count; - -/* Prototypes */ - - -#pragma pack(4) /* Make sure the structure stays as we defined it */ -typedef struct PCA { /* PTEG Control Area */ - union flgs { - unsigned int PCAallo; /* Allocation controls */ - struct PCAalflgs { /* Keep these in order!!! */ - unsigned char PCAfree; /* Indicates the slot is free */ - unsigned char PCAsteal; /* Steal scan start position */ - unsigned char PCAauto; /* Indicates that the PTE was autogenned */ - unsigned char PCAmisc; /* Misc. flags */ -#define PCAlock 1 /* This locks up the associated PTEG */ -#define PCAlockb 31 - } PCAalflgs; - } flgs; -} PCA_t; -#pragma pack() - -/* The hash table is composed of mappings organized into G groups of S slots - * each. In the macros below, by GV_GROUPS_LG2, GV_SLOT_SZ_LG2, and GV_SLOTS_LG2, the number - * of groups, the size (in bytes) of a slot, and the number of slots in a group are given. - * Since these values are given as log2, they're restricted to powers of two. Fast operation - * and all that. - * - * This patch of macros define all of the hash table's metrics and handy masks. It's a - * build-time thing because it's faster that way. Only the first group of values may - * be adjusted. - */ -#define GV_GROUPS_LG2 10 /* 1024 groups per hash table (log2(max) is 14, viz. 16K groups) */ -#define GV_SLOTS_LG2 3 /* 8 slots per group (log2(max) is 8, viz. 
256 slots) */ - -#define GV_SLOT_SZ_LG2 5 /* 32 bytes per slot (mapping size) */ -#define GV_PGIDX_SZ_LG2 3 /* 64-bit Hash-table-page physical-addrress index entry size */ -#define GV_PAGE_SZ_LG2 12 /* 4k-byte hash-table-page size */ - -#define GV_GROUPS (1 << GV_GROUPS_LG2) -#define GV_SLOT_SZ (1 << GV_SLOT_SZ_LG2) -#define GV_SLOTS (1 << GV_SLOTS_LG2) -#define GV_PAGE_SZ (1 << GV_PAGE_SZ_LG2) -#define GV_GRP_MASK (GV_GROUPS - 1) -#define GV_SLOT_MASK (GV_SLOTS - 1) -#define GV_PAGE_MASK (GV_PAGE_SZ - 1) -#define GV_HPAGES (1 << (GV_GROUPS_LG2 + GV_SLOT_SZ_LG2 + GV_SLOTS_LG2 - GV_PAGE_SZ_LG2)) -#define GV_GRPS_PPG_LG2 (GV_PAGE_SZ_LG2 - (GV_SLOT_SZ_LG2 + GV_SLOTS_LG2)) -#define GV_GRPS_PPG (1 << GV_GRPS_PPG_LG2) -#define GV_SLTS_PPG_LG2 (GV_PAGE_SZ_LG2 - GV_SLOT_SZ_LG2) -#define GV_SLTS_PPG (1 << GV_SLTS_PPG_LG2) - -#define GV_HPAGE_SHIFT (GV_PGIDX_SZ_LG2 - GV_GRPS_PPG_LG2) -#define GV_HPAGE_MASK ((GV_HPAGES - 1) << GV_PGIDX_SZ_LG2) -#define GV_HGRP_SHIFT (GV_SLOT_SZ_LG2 + GV_SLOTS_LG2) -#define GV_HGRP_MASK ((GV_GRPS_PPG - 1) << GV_HGRP_SHIFT) - -#define GV_MAPWD_BITS_LG2 5 /* 32-bit active map word size */ -#define GV_MAPWD_SZ_LG2 (GV_MAPWD_BITS_LG2 - 3) -#define GV_BAND_SHIFT (GV_MAPWD_BITS_LG2 + GV_SLOT_SZ_LG2) -#define GV_BAND_SZ_LG2 (GV_PAGE_SZ_LG2 - GV_SLOT_SZ_LG2 - GV_MAPWD_BITS_LG2) -#define GV_BAND_MASK (((1 << GV_BAND_SZ_LG2) - 1) << GV_BAND_SHIFT) -#define GV_MAP_WORDS (1 << (GV_GROUPS_LG2 + GV_SLOTS_LG2 - GV_MAPWD_BITS_LG2)) -#define GV_MAP_MASK ((GV_MAP_WORDS - 1) << GV_MAPWD_SZ_LG2) -#define GV_MAP_SHIFT (GV_PGIDX_SZ_LG2 - GV_BAND_SZ_LG2) - - -/* Mappings currently come in two sizes: 64 and 128 bytes. The only difference is the - * number of skiplists (ie, mpLists): 64-byte mappings have 1-4 lists and 128-byte mappings - * have from 5-12. Only 1 in 256 mappings is large, so an average mapping is 64.25 bytes. - * All mappings are 64-byte aligned. - * - * Special note on mpFIP and mpRIP: - * These flags are manipulated under various locks. 
RIP is always set under an - * exclusive lock while FIP is shared. The only worry is that there is a possibility that - * FIP could be attempted by more than 1 processor at a time. Obviously, one will win. - * The other(s) bail all the way to user state and may refault (or not). There are only - * a few things in mpFlags that are not static, mpFIP, mpRIP, and mpBusy. - * - * We organize these so that mpFIP is in a byte with static data and mpRIP is in another. - * That means that we can use a store byte to update the guys without worrying about load - * and reserve. Note that mpFIP must be set atomically because it is under a share lock; - * but, it may be cleared with a simple store byte. Because mpRip is set once and then never - * cleared, we can get away with setting it by means of a simple store byte. - * - */ -#pragma pack(4) /* Make sure the structure stays as we defined it */ -typedef struct mapping { - unsigned int mpFlags; /* 0x000 - Various flags, lock bit. These are static except for lock */ -#define mpBusy 0xFF000000 /* Busy count */ -#define mpPrevious 0x00800000 /* A previous mapping exists in a composite */ -#define mpNext 0x00400000 /* A next mapping exist in a composite */ -#define mpPIndex 0x003F0000 /* Index into physical table (in words) */ -#define mpType 0x0000F000 /* Mapping type: */ -#define mpNormal 0x00000000 /* Normal logical page - backed by RAM, RC maintained, logical page size == physical page size */ - /* DO NOT CHANGE THIS CODE */ -#define mpBlock 0x00001000 /* Block mapping - used for I/O memory or non-RC maintained RAM, logical page size is independent from physical */ -#define mpMinSpecial 0x00002000 /* Any mapping with this type or above has extra special handling */ -#define mpNest 0x00002000 /* Forces transtion to an alternate address space after applying relocation */ -#define mpLinkage 0x00003000 /* Transition to current user address space with relocation - used for copyin/out/pv */ -#define mpACID 0x00004000 /* Address 
Chunk ID - provides the address space ID for VSID calculation. Normally mapped at chunk size - 2KB */ -#define mpGuest 0x00005000 /* Guest->physical shadow mapping */ -/* 0x00006000 - 0x0000F000 Reserved */ -#define mpFIP 0x00000800 /* Fault in progress */ -#define mpFIPb 20 /* Fault in progress */ -#define mpPcfg 0x00000700 /* Physical Page configuration */ -#define mpPcfgb 23 /* Physical Page configuration index bit */ -#define mpRIP 0x00000080 /* Remove in progress - DO NOT MOVE */ -#define mpRIPb 24 /* Remove in progress */ -#define mpPerm 0x00000040 /* Mapping is permanent - DO NOT MOVE */ -#define mpPermb 25 /* Mapping is permanent */ -#define mpBSu 0x00000020 /* Basic Size unit - 0 = 4KB, 1 = 32MB */ -#define mpBSub 26 /* Basic Size unit - 0 = 4KB, 1 = 32MB */ -#define mpLists 0x0000001F /* Number of skip lists mapping is on, max of 27 */ -#define mpListsb 27 /* Number of skip lists mapping is on, max of 27 */ -#define mpgFlags 0x0000001F /* Shadow cache mappings re-use mpLists for flags: */ -#define mpgGlobal 0x00000004 /* Mapping is global (1) or local (0) */ -#define mpgFree 0x00000002 /* Mapping is free */ -#define mpgDormant 0x00000001 /* Mapping is dormant */ - - unsigned short mpSpace; /* 0x004 - Address space hash */ - union { - unsigned short mpBSize; /* 0x006 - Block size - 1 in pages - max block size 256MB */ - unsigned char mpgCursor; /* 0x006 - Shadow-cache group allocation cursor (first mapping in the group) */ - } u; - - unsigned int mpPte; /* 0x008 - Offset to PTEG in hash table. 
Offset to exact PTE if mpHValid set - NOTE: this MUST be 0 for block mappings */ -#define mpHValid 0x00000001 /* PTE is entered in hash table */ -#define mpHValidb 31 /* PTE is entered in hash table */ - ppnum_t mpPAddr; /* 0x00C - Physical page number */ - addr64_t mpVAddr; /* 0x010 - Starting virtual address */ -#define mpHWFlags 0x0000000000000FFFULL /* Reference/Change, WIMG, AC, N, protection flags from PTE */ -#define mpHWFlagsb 52 -#define mpN 0x0000000000000004ULL /* Page-level no-execute (PowerAS machines) */ -#define mpNb 61 -#define mpPP 0x0000000000000003ULL /* Protection flags */ -#define mpPPb 62 -#define mpPPe 63 -#define mpKKN 0x0000000000000007ULL /* Segment key and no execute flag (nested pmap) */ -#define mpKKNb 61 -#define mpWIMG 0x0000000000000078ULL /* Attribute bits */ -#define mpWIMGb 57 -#define mpW 0x0000000000000040ULL -#define mpWb 57 -#define mpI 0x0000000000000020ULL -#define mpIb 58 -#define mpM 0x0000000000000010ULL -#define mpMb 59 -#define mpG 0x0000000000000008ULL -#define mpGb 60 -#define mpWIMGe 60 -#define mpC 0x0000000000000080ULL /* Change bit */ -#define mpCb 56 -#define mpR 0x0000000000000100ULL /* Reference bit */ -#define mpRb 55 - addr64_t mpAlias; /* 0x018 - Pointer to alias mappings of physical page */ -#define mpNestReloc mpAlias /* 0x018 - Redefines mpAlias relocation value of vaddr to nested pmap value */ -#define mpBlkRemCur mpAlias /* 0x018 - Next offset in block map to remove (this is 4 bytes) */ - addr64_t mpList0; /* 0x020 - Forward chain of mappings. This one is always used */ - addr64_t mpList[3]; /* 0x028 - Forward chain of mappings. 
Next higher order */ -/* 0x040 - End of basic mapping */ -#define mpBasicSize 64 -#define mpBasicLists 4 -/* note the dependence on kSkipListMaxLists, which must be <= #lists in a 256-byte mapping (ie, <=28) */ -/* addr64_t mpList4[8]; 0x040 - First extended list entries */ -/* 0x080 - End of first extended mapping */ -/* addr64_t mpList12[8]; 0x080 - Second extended list entries */ -/* 0x0C0 - End of second extended mapping */ -/* addr64_t mpList20[8]; 0x0C0 - Third extended list entries */ -/* 0x100 - End of third extended mapping */ - -} mapping_t; -#pragma pack() - -#define MAPPING_NULL ((struct mapping *) 0) - -#define mapDirect 0x08 -#define mapRWNA 0x00000000 -#define mapRWRO 0x00000001 -#define mapRWRW 0x00000002 -#define mapRORO 0x00000003 - -/* All counts are in units of basic 64-byte mappings. A 128-byte mapping is - * just two adjacent 64-byte entries. - */ -#pragma pack(4) /* Make sure the structure stays as we defined it */ - -typedef struct mappingflush { - addr64_t addr; /* Start address to search mapping */ - unsigned int spacenum; /* Last space num to search pmap */ - unsigned int mapfgas[1]; /* Pad to 64 bytes */ -} mappingflush_t; - -typedef struct mappingctl { - unsigned int mapclock; /* Mapping allocation lock */ - unsigned int mapcrecurse; /* Mapping allocation recursion control */ - struct mappingblok *mapcnext; /* First mapping block with free entries */ - struct mappingblok *mapclast; /* Last mapping block with free entries */ - struct mappingblok *mapcrel; /* List of deferred block releases */ - unsigned int mapcfree; /* Total free entries on list */ - unsigned int mapcinuse; /* Total entries in use */ - unsigned int mapcreln; /* Total blocks on pending release list */ - int mapcholdoff; /* Hold off clearing release list */ - unsigned int mapcfreec; /* Total calls to mapping free */ - unsigned int mapcallocc; /* Total calls to mapping alloc */ - unsigned int mapcbig; /* Count times a big mapping was requested of mapping_alloc */ - 
unsigned int mapcbigfails; /* Times caller asked for a big one but we gave 'em a small one */ - unsigned int mapcmin; /* Minimum free mappings to keep */ - unsigned int mapcmaxalloc; /* Maximum number of mappings allocated at one time */ - unsigned int mapcgas[1]; /* Pad to 64 bytes */ - struct mappingflush mapcflush; -} mappingctl_t; -#pragma pack() - -/* MAPPERBLOK is the number of basic 64-byte mappings per block (ie, per page.) */ -#define MAPPERBLOK 63 -#define MAPALTHRSH (4*MAPPERBLOK) -#define MAPFRTHRSH (2 * ((MAPALTHRSH + MAPPERBLOK - 1) / MAPPERBLOK)) -typedef struct mappingblok { - unsigned int mapblokfree[2]; /* Bit map of free mapping entrys */ - addr64_t mapblokvrswap; /* Virtual address XORed with physical address */ - unsigned int mapblokflags; /* Various flags */ -#define mbPerm 0x80000000 /* Block is permanent */ - struct mappingblok *nextblok; /* Pointer to the next mapping block */ -} mappingblok_t; - -#define mapRemChunk 128 - -#define mapRetCode 0xF -#define mapRtOK 0 -#define mapRtBadLk 1 -#define mapRtPerm 2 -#define mapRtNotFnd 3 -#define mapRtBlock 4 -#define mapRtNest 5 -#define mapRtRemove 6 -#define mapRtMapDup 7 -#define mapRtGuest 8 -#define mapRtEmpty 9 -#define mapRtSmash 10 /* Mapping already exists and doesn't match new mapping */ -#define mapRtBadSz 11 /* Requested size too big or more than 256MB and not mult of 32MB */ - -/* - * This struct describes available physical page configurations - * Note: - * Index 0 is required and is the primary page configuration (4K, non-large) - * Index 1 is the primary large page config if supported by hw (16M, large page) - */ - -typedef struct pcfg { - uint8_t pcfFlags; /* Flags */ -#define pcfValid 0x80 /* Configuration is valid */ -#define pcfLarge 0x40 /* Large page */ -#define pcfDedSeg 0x20 /* Requires dedicated segment */ - uint8_t pcfEncode; /* Implementation specific PTE encoding */ - uint8_t pcfPSize; /* Page size in powers of 2 */ - uint8_t pcfShift; /* Shift for PTE construction */ 
-} pcfg; - -#define pcfDefPcfg 0 /* Primary page configuration */ -#define pcfLargePcfg 1 /* Primary large page configuration */ - -extern pcfg pPcfg[8]; /* Supported page configurations */ - -extern mappingctl_t mapCtl; /* Mapping allocation control */ - -extern unsigned char ppc_prot[]; /* Mach -> PPC protection translation table */ - -vm_prot_t getProtPPC(int, boolean_t); - /* Safe Mach -> PPC protection key conversion */ - -extern addr64_t mapping_remove(pmap_t pmap, addr64_t va); /* Remove a single mapping for this VADDR */ -extern mapping_t *mapping_find(pmap_t pmap, addr64_t va, addr64_t *nextva, int full); /* Finds a mapping */ -extern void mapping_free_init(vm_offset_t mbl, int perm, boolean_t locked); /* Sets start and end of a block of mappings */ -extern void mapping_prealloc(unsigned int); /* Preallocate mappings for large use */ -extern void mapping_relpre(void); /* Releases preallocate request */ -extern void mapping_init(void); /* Do initial stuff */ -extern mapping_t *mapping_alloc(int lists); /* Obtain a mapping */ -extern void mapping_free(struct mapping *mp); /* Release a mapping */ -extern boolean_t mapping_tst_ref(ppnum_t pa); /* Tests the reference bit of a physical page */ -extern boolean_t mapping_tst_mod(ppnum_t pa); /* Tests the change bit of a physical page */ -extern void mapping_set_ref(ppnum_t pa); /* Sets the reference bit of a physical page */ -extern void mapping_clr_ref(ppnum_t pa); /* Clears the reference bit of a physical page */ -extern void mapping_set_mod(ppnum_t pa); /* Sets the change bit of a physical page */ -extern void mapping_clr_mod(ppnum_t pa); /* Clears the change bit of a physical page */ -extern unsigned int mapping_tst_refmod(ppnum_t pa); /* Tests the reference and change bits of a physical page */ -extern void mapping_clr_refmod(ppnum_t pa, unsigned int mask); /* Clears the reference and change bits of a physical page */ -extern void mapping_protect_phys(ppnum_t pa, vm_prot_t prot); /* Change protection of all 
mappings to page */ -extern void mapping_protect(pmap_t pmap, addr64_t va, vm_prot_t prot, addr64_t *nextva); /* Change protection of a single mapping to page */ -extern addr64_t mapping_make(pmap_t pmap, addr64_t va, ppnum_t pa, unsigned int flags, unsigned int size, vm_prot_t prot); /* Make a mapping */ -/* Flags for mapping_make */ -#define mmFlgBlock 0x80000000 /* This is a block map, use size for number of pages covered */ -#define mmFlgUseAttr 0x40000000 /* Use specified attributes */ -#define mmFlgPerm 0x20000000 /* Mapping is permanant */ -#define mmFlgPcfg 0x07000000 /* Physical page configuration index */ -#define mmFlgCInhib 0x00000002 /* Cahching inhibited - use if mapFlgUseAttr set or block */ -#define mmFlgGuarded 0x00000001 /* Access guarded - use if mapFlgUseAttr set or block */ -extern void mapping_purge(ppnum_t pa); /* Remove all mappings for this physent */ -extern addr64_t mapping_p2v(pmap_t pmap, ppnum_t pa); /* Finds first virtual mapping of a physical page in a space */ -extern void mapping_drop_busy(struct mapping *mapping); /* Drops busy count on mapping */ -extern phys_entry_t *mapping_phys_lookup(ppnum_t pp, unsigned int *pindex); /* Finds the physical entry for the page */ -extern int mapalc1(struct mappingblok *mb); /* Finds and allcates a 1-bit mapping entry */ -extern int mapalc2(struct mappingblok *mb); /* Finds and allcates a 2-bit mapping entry */ -extern void ignore_zero_fault(boolean_t type); /* Sets up to ignore or honor any fault on page 0 access for the current thread */ -extern void mapping_hibernate_flush(void); - -extern void mapping_fake_zone_info( /* return mapping usage stats as a fake zone info */ - int *count, - vm_size_t *cur_size, - vm_size_t *max_size, - vm_size_t *elem_size, - vm_size_t *alloc_size, - int *collectable, - int *exhaustable); - -extern mapping_t *hw_rem_map(pmap_t pmap, addr64_t va, addr64_t *next); /* Remove a mapping from the system */ -extern mapping_t *hw_purge_map(pmap_t pmap, addr64_t va, 
addr64_t *next); /* Remove a regular mapping from the system */ -extern mapping_t *hw_purge_space(struct phys_entry *pp, pmap_t pmap); /* Remove the first mapping for a specific pmap from physentry */ -extern mapping_t *hw_purge_phys(struct phys_entry *pp); /* Remove the first mapping for a physentry */ -extern mapping_t *hw_scrub_guest(struct phys_entry *pp, pmap_t pmap); /* Scrub first guest mapping belonging to this host */ -extern mapping_t *hw_find_map(pmap_t pmap, addr64_t va, addr64_t *nextva); /* Finds a mapping */ -extern mapping_t *hw_find_space(struct phys_entry *pp, unsigned int space); /* Given a phys_entry, find its first mapping in the specified space */ -extern addr64_t hw_add_map(pmap_t pmap, struct mapping *mp); /* Add a mapping to a pmap */ -extern unsigned int hw_protect(pmap_t pmap, addr64_t va, vm_prot_t prot, addr64_t *nextva); /* Change the protection of a virtual page */ -extern unsigned int hw_test_rc(pmap_t pmap, addr64_t va, boolean_t reset); /* Test and optionally reset the RC bit of specific mapping */ - -extern unsigned int hw_clear_maps(void); - -extern unsigned int hw_walk_phys(struct phys_entry *pp, unsigned int preop, unsigned int op, /* Perform function on all mappings on a physical page */ - unsigned int postop, unsigned int parm, unsigned int opmod); -/* Opcodes for hw_walk_phys */ -#define hwpNoop 0 /* No operation */ -#define hwpSPrtPhy 1 /* Sets protection in physent (obsolete) */ -#define hwpSPrtMap 2 /* Sets protection in mapping */ -#define hwpSAtrPhy 3 /* Sets attributes in physent */ -#define hwpSAtrMap 4 /* Sets attributes in mapping */ -#define hwpCRefPhy 5 /* Clears reference in physent */ -#define hwpCRefMap 6 /* Clears reference in mapping */ -#define hwpCCngPhy 7 /* Clears change in physent */ -#define hwpCCngMap 8 /* Clears change in mapping */ -#define hwpSRefPhy 9 /* Sets reference in physent */ -#define hwpSRefMap 10 /* Sets reference in mapping */ -#define hwpSCngPhy 11 /* Sets change in physent */ -#define 
hwpSCngMap 12 /* Sets change in mapping */ -#define hwpTRefPhy 13 /* Tests reference in physent */ -#define hwpTRefMap 14 /* Tests reference in mapping */ -#define hwpTCngPhy 15 /* Tests change in physent */ -#define hwpTCngMap 16 /* Tests change in mapping */ -#define hwpTRefCngPhy 17 /* Tests reference and change in physent */ -#define hwpTRefCngMap 18 /* Tests reference and change in mapping */ -#define hwpCRefCngPhy 19 /* Clears reference and change in physent */ -#define hwpCRefCngMap 20 /* Clears reference and change in mapping */ -/* Operation modifiers for connected PTE visits for hw_walk_phys */ -#define hwpPurgePTE 0 /* Invalidate/purge PTE and merge RC bits for each connected mapping */ -#define hwpMergePTE 1 /* Merge RC bits for each connected mapping */ -#define hwpNoopPTE 2 /* Take no additional action for each connected mapping */ - -extern void hw_set_user_space(pmap_t pmap); /* Indicate we need a space switch */ -extern void hw_set_user_space_dis(pmap_t pmap); /* Indicate we need a space switch (already disabled) */ -extern void hw_setup_trans(void); /* Setup hardware for translation */ -extern void hw_start_trans(void); /* Start translation for the first time */ -extern void hw_map_seg(pmap_t pmap, addr64_t seg, addr64_t va); /* Validate a segment */ -extern void hw_blow_seg(addr64_t seg); /* Invalidate a segment */ -extern void invalidateSegs(pmap_t pmap); /* Invalidate the segment cache */ -extern struct phys_entry *pmap_find_physentry(ppnum_t pa); -extern void mapLog(unsigned int laddr, unsigned int type, addr64_t va); -extern unsigned int mapSkipListVerifyC(pmap_t pmap, unsigned long long *dumpa); -extern kern_return_t hw_copypv_32(addr64_t source, addr64_t sink, unsigned int size, int which); - -extern void hw_rem_all_gv(pmap_t pmap); /* Remove all of a guest's mappings */ -extern void hw_rem_local_gv(pmap_t gpmap); /* Remove guest local mappings */ -extern unsigned int hw_res_map_gv(pmap_t hpmap, pmap_t gpmap, addr64_t hva, addr64_t gva, 
vm_prot_t prot); - /* Resume a guest mapping */ -extern void hw_add_map_gv(pmap_t hpmap, pmap_t gpmap, addr64_t gva, unsigned int mflags, ppnum_t pa); - /* Add a guest mapping */ -extern void hw_susp_map_gv(pmap_t hpmap, pmap_t gpmap, addr64_t gva); - /* Suspend a guest mapping */ -extern unsigned int hw_test_rc_gv(pmap_t hpmap, pmap_t gpmap, addr64_t gva, unsigned int reset); - /* Test/reset mapping ref and chg */ -extern unsigned int hw_protect_gv(pmap_t gpmap, addr64_t va, vm_prot_t prot); - /* Change the protection of a guest page */ -extern addr64_t hw_gva_to_hva(pmap_t gpmap, addr64_t gva); /* Convert guest to host virtual address */ -extern unsigned int hw_find_map_gv(pmap_t gpmap, addr64_t gva, void *mpbuf); - /* Find and copy guest mapping into buffer */ - -extern unsigned int mappingdeb0; /* (TEST/DEBUG) */ -extern unsigned int incrVSID; /* VSID increment value */ - -extern int mapSetLists(pmap_t); -extern void consider_mapping_adjust(void); - -#endif /* _PPC_MAPPINGS_H_ */ - -#endif /* XNU_KERNEL_PRIVATE */ diff --git a/osfmk/ppc/mcount.s b/osfmk/ppc/mcount.s deleted file mode 100644 index fd2518567..000000000 --- a/osfmk/ppc/mcount.s +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. 
- * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#include -#include -#include -#include -#include -#include - - -/* - * The compiler generates calls to this function and passes address - * of caller of the function [ from which mcount is called ] as the - * first parameter. - * mcount disables interrupts prior to call mcount() and restores - * interrupt upon return. - * To prevent recursive calls to mcount(), a flag, mcountOff, is set - * in cpu_flags per_proc. 
- */ - - .align 4 - .globl mcount -mcount: - mflr r0 ; Load lr - stw r0,8(r1) ; Save lr on the stack - stwu r1,-64(r1) ; Get a stack frame - mfmsr r9 ; Get msr - rlwinm r9,r9,0,MSR_FP_BIT+1,MSR_FP_BIT-1 ; Force floating point off - rlwinm r9,r9,0,MSR_VEC_BIT+1,MSR_VEC_BIT-1 ; Force vectors off - rlwinm r8,r9,0,MSR_EE_BIT+1,MSR_EE_BIT-1 ; Turn off interruptions - mtmsr r8 ; Update msr - isync - mfsprg r7,1 ; Get the current activation - lwz r7,ACT_PER_PROC(r7) ; Get the per_proc block - lhz r6,PP_CPU_FLAGS(r7) ; Get cpu flags - ori r5,r6,mcountOff ; - cmplw r5,r6 ; is mount off - beq mcount_ret ; return if off - sth r5,PP_CPU_FLAGS(r7) ; Update cpu_flags - stw r9,FM_ARG0(r1) ; Save MSR - mr r4, r0 - bl _mcount ; Call the C routine - lwz r9,FM_ARG0(r1) - mfsprg r7,1 ; Get the current activation - lwz r7,ACT_PER_PROC(r7) ; Get the per_proc block - lhz r6,PP_CPU_FLAGS(r7) ; Get CPU number - li r5,mcountOff ; - andc r6,r6,r5 ; Clear mcount_off - sth r6,PP_CPU_FLAGS(r7) ; Save cpu_flags -mcount_ret: - addi r1,r1,64 - mtmsr r9 ; Restore MSR - lwz r0,8(r1) - mtlr r0 - blr - diff --git a/osfmk/ppc/mem.h b/osfmk/ppc/mem.h deleted file mode 100644 index e4ee80c0e..000000000 --- a/osfmk/ppc/mem.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. 
- * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -#ifndef _PPC_MEM_H_ -#define _PPC_MEM_H_ - -#include -#include - -#include -#include -#include - -extern vm_offset_t static_memory_end; - -extern addr64_t hash_table_base; -extern unsigned int hash_table_size; -extern int hash_table_shift; /* size adjustment: bigger if >0, smaller if <0 */ - -void hash_table_init(vm_offset_t base, vm_offset_t size); - -#define MAX_BAT 4 - -#pragma pack(4) /* Make sure the structure stays as we defined it */ -typedef struct ppcBAT { - unsigned int upper; /* Upper half of BAT */ - unsigned int lower; /* Lower half of BAT */ -} ppcBAT; -#pragma pack() - -#pragma pack(4) /* Make sure the structure stays as we defined it */ -struct shadowBAT { - ppcBAT IBATs[MAX_BAT]; /* Instruction BATs */ - ppcBAT DBATs[MAX_BAT]; /* Data BAT */ -}; -#pragma pack() - -extern struct shadowBAT shadow_BAT; - -#endif /* _PPC_MEM_H_ */ diff --git a/osfmk/ppc/misc.c b/osfmk/ppc/misc.c deleted file mode 100644 index 807c03512..000000000 --- a/osfmk/ppc/misc.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -#if 0 // dead code -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * copyin/out_multiple - the assembler copyin/out functions jump to C for - * help when the copyin lies over a segment boundary. The C breaks - * down the copy into two sub-copies and re-calls the assembler with - * these sub-copies. Very rare occurrance. Warning: These functions are - * called whilst active_thread->thread_recover is still set. 
- */ - -extern boolean_t copyin_multiple(const char *src, - char *dst, - vm_size_t count); - -boolean_t copyin_multiple(const char *src, - char *dst, - vm_size_t count) -{ - const char *midpoint; - vm_size_t first_count; - boolean_t first_result; - - /* Assert that we've been called because of a segment boundary, - * this function is more expensive than the assembler, and should - * only be called in this difficult case. - */ - assert(((vm_offset_t)src & 0xF0000000) != - ((vm_offset_t)(src + count -1) & 0xF0000000)); - - /* TODO NMGS define sensible constants for segments, and apply - * to C and assembler (assembler is much harder) - */ - midpoint = (const char*) ((vm_offset_t)(src + count) & 0xF0000000); - first_count = (midpoint - src); - - first_result = copyin(CAST_USER_ADDR_T(src), dst, first_count); - - /* If there was an error, stop now and return error */ - if (first_result != 0) - return first_result; - - /* otherwise finish the job and return result */ - return copyin(CAST_USER_ADDR_T(midpoint), dst + first_count, count-first_count); -} - -extern int copyout_multiple(const char *src, char *dst, vm_size_t count); - -int copyout_multiple(const char *src, char *dst, vm_size_t count) -{ - char *midpoint; - vm_size_t first_count; - boolean_t first_result; - - /* Assert that we've been called because of a segment boundary, - * this function is more expensive than the assembler, and should - * only be called in this difficult case. 
For copyout, the - * segment boundary is on the dst - */ - assert(((vm_offset_t)dst & 0xF0000000) != - ((vm_offset_t)(dst + count - 1) & 0xF0000000)); - - /* TODO NMGS define sensible constants for segments, and apply - * to C and assembler (assembler is much harder) - */ - midpoint = (char *) ((vm_offset_t)(dst + count) & 0xF0000000); - first_count = (midpoint - dst); - - first_result = copyout(src, CAST_USER_ADDR_T(dst), first_count); - - /* If there was an error, stop now and return error */ - if (first_result != 0) - return first_result; - - /* otherwise finish the job and return result */ - - return copyout(src + first_count, CAST_USER_ADDR_T(midpoint), count-first_count); -} -#endif // dead code - diff --git a/osfmk/ppc/misc_asm.s b/osfmk/ppc/misc_asm.s deleted file mode 100644 index 32d4b58df..000000000 --- a/osfmk/ppc/misc_asm.s +++ /dev/null @@ -1,287 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -#include -#include -#include -#include -#include -#include -#include - -/* - * vm_offset_t getrpc(void) - Return address of the function - * that called the current function - */ - -/* By using this function, we force the caller to save its LR in a known - * location, which we can pick up and return. See PowerPC ELF specs. - */ -ENTRY(getrpc, TAG_NO_FRAME_USED) - lwz ARG0, FM_BACKPTR(r1) /* Load our backchain ptr */ - lwz ARG0, FM_LR_SAVE(ARG0) /* Load previously saved LR */ - blr /* And return */ - - -/* - * General entry for all debuggers. This gets us onto the debug stack and - * then back off at exit. We need to pass back R3 to caller. - */ - -ENTRY(Call_Debugger, TAG_NO_FRAME_USED) - - - lis r8,hi16(MASK(MSR_VEC)) ; Get the vector flag - mfmsr r7 ; Get the current MSR - ori r8,r8,lo16(MASK(MSR_EE)|MASK(MSR_FP)) ; Add the FP flag - mflr r0 ; Save the return - andc r7,r7,r8 ; Clear VEC and FP - mtmsr r7 ; Do it - isync - mfsprg r8,1 ; Get the current activation - lwz r8,ACT_PER_PROC(r8) ; Get the per_proc block - stw r0,FM_LR_SAVE(r1) ; Save return on current stack - - lwz r9,PP_DEBSTACKPTR(r8) ; Get the debug stack - cmpwi r9,0 ; Are we already on it? - bne cdNewDeb ; No... 
- - mr r9,r1 ; We are already on the stack, so use the current value - subi r9,r9,FM_REDZONE+FM_SIZE ; Carve some extra space here - -cdNewDeb: li r0,0 ; Clear this out - stw r1,FM_ARG0(r9) ; Save the old stack pointer as if it were the first arg - - stw r0,PP_DEBSTACKPTR(r8) ; Mark debug stack as busy - - subi r1,r9,FM_SIZE ; Carve a new frame - stw r0,FM_BACKPTR(r1) ; Chain back - - bl EXT(Call_DebuggerC) ; Call the "C" phase of this - - lis r8,hi16(MASK(MSR_VEC)) ; Get the vector flag - mfmsr r0 ; Get the current MSR - ori r8,r8,lo16(MASK(MSR_EE)|MASK(MSR_FP)) ; Add the FP flag - addi r1,r1,FM_SIZE ; Pop off first stack frame - andc r0,r0,r8 ; Turn off all the interesting stuff - mtmsr r0 - - mfsprg r8,1 ; Get the current activation - lwz r8,ACT_PER_PROC(r8) ; Get the per_proc block - - lwz r9,PP_DEBSTACK_TOP_SS(r8) ; Get the top of the stack - cmplw r1,r9 ; Have we hit the bottom of the debug stack? - lwz r1,FM_ARG0(r1) ; Get previous stack frame - lwz r0,FM_LR_SAVE(r1) ; Get return address - mtlr r0 ; Set the return point - bnelr ; Return if still on debug stack - - stw r9,PP_DEBSTACKPTR(r8) ; Mark debug stack as free - blr - - -/* The following routines are for C-support. 
They are usually - * inlined into the C using the specifications in proc_reg.h, - * but if optimisation is switched off, the inlining doesn't work - */ - -ENTRY(get_got, TAG_NO_FRAME_USED) - mr ARG0, r2 - blr - -ENTRY(mflr, TAG_NO_FRAME_USED) - mflr ARG0 - blr - -ENTRY(mfpvr, TAG_NO_FRAME_USED) - mfpvr ARG0 - blr - -ENTRY(mtmsr, TAG_NO_FRAME_USED) - mtmsr ARG0 - isync - blr - -ENTRY(mfmsr, TAG_NO_FRAME_USED) - mfmsr ARG0 - blr - -ENTRY(mtsrin, TAG_NO_FRAME_USED) - isync - mtsrin ARG0, ARG1 - isync - blr - -ENTRY(mfsrin, TAG_NO_FRAME_USED) - mfsrin ARG0, ARG0 - blr - -ENTRY(mtsdr1, TAG_NO_FRAME_USED) - mtsdr1 ARG0 - blr - -ENTRY(mtdar, TAG_NO_FRAME_USED) - mtdar ARG0 - blr - -ENTRY(mfdar, TAG_NO_FRAME_USED) - mfdar ARG0 - blr - -ENTRY(mtdec, TAG_NO_FRAME_USED) - mtdec ARG0 - blr - -ENTRY(cntlzw, TAG_NO_FRAME_USED) - cntlzw r3,r3 - blr - -/* Decrementer frequency and realtime|timebase processor registers - * are different between ppc601 and ppc603/4, we define them all. - */ - -ENTRY(isync_mfdec, TAG_NO_FRAME_USED) - isync - mfdec ARG0 - blr - - -ENTRY(mftb, TAG_NO_FRAME_USED) - mftb ARG0 - blr - -ENTRY(mftbu, TAG_NO_FRAME_USED) - mftbu ARG0 - blr - -ENTRY(mfrtcl, TAG_NO_FRAME_USED) - mfspr ARG0, 5 - blr - -ENTRY(mfrtcu, TAG_NO_FRAME_USED) - mfspr ARG0, 4 - blr - -ENTRY(tlbie, TAG_NO_FRAME_USED) - tlbie ARG0 - blr - - -/* - * Performance Monitor Register Support - */ - -ENTRY(mfmmcr0, TAG_NO_FRAME_USED) - mfspr r3,mmcr0 - blr - -ENTRY(mtmmcr0, TAG_NO_FRAME_USED) - mtspr mmcr0,r3 - blr - -ENTRY(mfmmcr1, TAG_NO_FRAME_USED) - mfspr r3,mmcr1 - blr - -ENTRY(mtmmcr1, TAG_NO_FRAME_USED) - mtspr mmcr1,r3 - blr - -ENTRY(mfmmcr2, TAG_NO_FRAME_USED) - mfspr r3,mmcr2 - blr - -ENTRY(mtmmcr2, TAG_NO_FRAME_USED) - mtspr mmcr2,r3 - blr - -ENTRY(mfpmc1, TAG_NO_FRAME_USED) - mfspr r3,pmc1 - blr - -ENTRY(mtpmc1, TAG_NO_FRAME_USED) - mtspr pmc1,r3 - blr - -ENTRY(mfpmc2, TAG_NO_FRAME_USED) - mfspr r3,pmc2 - blr - -ENTRY(mtpmc2, TAG_NO_FRAME_USED) - mtspr pmc2,r3 - blr - -ENTRY(mfpmc3, 
TAG_NO_FRAME_USED) - mfspr r3,pmc3 - blr - -ENTRY(mtpmc3, TAG_NO_FRAME_USED) - mtspr pmc3,r3 - blr - -ENTRY(mfpmc4, TAG_NO_FRAME_USED) - mfspr r3,pmc4 - blr - -ENTRY(mtpmc4, TAG_NO_FRAME_USED) - mtspr pmc4,r3 - blr - -ENTRY(mfsia, TAG_NO_FRAME_USED) - mfspr r3,sia - blr - -ENTRY(mfsda, TAG_NO_FRAME_USED) - mfspr r3,sda - blr - - .globl EXT(hid1get) -LEXT(hid1get) - - mfspr r3,hid1 ; Get the HID1 - blr - - .globl EXT(hid0get64) -LEXT(hid0get64) - - mfspr r4,hid0 ; Get the HID0 - srdi r3,r4,32 ; Move top down - rlwinm r4,r4,0,0,31 ; Clean top - blr - - .globl EXT(hid5set64) -LEXT(hid5set64) - - rlwinm r3,r3,0,1,0 ; Copy low 32 int high 32 - rlwimi r3,r4,0,0,31 ; Inser the low part behind top - mtspr hid5,r3 ; Set it - isync ; Wait for it - blr diff --git a/osfmk/ppc/misc_protos.h b/osfmk/ppc/misc_protos.h deleted file mode 100644 index d3eddc42a..000000000 --- a/osfmk/ppc/misc_protos.h +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -#ifndef _PPC_MISC_PROTOS_H_ -#define _PPC_MISC_PROTOS_H_ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -/* uncached-safe */ -extern void bzero_nc( - char *buf, - int size); - -/* uncached-safe */ -void bcopy_nc(const char *, char *, int); - -/* Physical to physical copy (ints must be disabled) */ -extern void bcopy_phys( - addr64_t from, - addr64_t to, - int size); - -/* Physical to physical copy virtually (ints must be disabled) */ -extern void bcopy_physvir_32( - addr64_t from, - addr64_t to, - int size); - -extern void phys_copy( - addr64_t from, - addr64_t to, - vm_size_t size); - -extern void machine_conf( - void); - -extern void machine_startup(void); - -extern void ppc_vm_init( - uint64_t ppc_mem_size, - boot_args *args); - -extern int ppcNull( - struct savearea *asavearea); - -extern int ppcNullinst( - struct savearea *asavearea); - -extern void disable_bluebox_internal( - thread_t act); - -extern uint64_t hid0get64( - void); - -extern void hid5set64( - uint64_t); - -extern void Load_context( - thread_t th); - -thread_t Switch_context(thread_t, thread_continue_t, thread_t); - -extern void vec_save( - struct facility_context *vec_fc); - -extern void toss_live_fpu( - struct facility_context *fpu_fc); - -extern void toss_live_vec( - struct facility_context *vec_fc); - -extern struct savearea *enterDebugger(unsigned int, 
struct savearea *, - unsigned int); - -extern void draw_panic_dialog( - void); - -extern void commit_paniclog( - void); -#if DEBUG -#define DPRINTF(x) { printf("%s : ",__FUNCTION__);printf x; } -#endif /* DEBUG */ - -#if MACH_ASSERT -extern void dump_thread( - thread_t th); -#endif /* MACH_ASSERT */ - -/* XXX move to osfmk/ppc/debug.h or some other debug header */ -void print_backtrace(struct savearea *); -int Call_Debugger(int, struct savearea *); -int kdp_dump_trap(int, struct savearea *); -void SysChoked(unsigned int, struct savearea *); -int Call_DebuggerC(unsigned int, struct savearea *); -void kdp_trap(unsigned int, struct savearea *); - -#endif /* _PPC_MISC_PROTOS_H_ */ diff --git a/osfmk/ppc/model_dep.c b/osfmk/ppc/model_dep.c deleted file mode 100644 index 9eff5b0bb..000000000 --- a/osfmk/ppc/model_dep.c +++ /dev/null @@ -1,1045 +0,0 @@ -/* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * @APPLE_FREE_COPYRIGHT@ - */ -/* - * (c) Copyright 1988 HEWLETT-PACKARD COMPANY - * - * To anyone who acknowledges that this file is provided "AS IS" - * without any express or implied warranty: - * permission to use, copy, modify, and distribute this file - * for any purpose is hereby granted without fee, provided that - * the above copyright notice and this notice appears in all - * copies, and that the name of Hewlett-Packard Company not be - * used in advertising or publicity pertaining to distribution - * of the software without specific, written prior permission. - * Hewlett-Packard Company makes no representations about the - * suitability of this software for any purpose. - */ -/* - * Copyright (c) 1990,1991,1992,1994 The University of Utah and - * the Computer Systems Laboratory (CSL). All rights reserved. - * - * THE UNIVERSITY OF UTAH AND CSL PROVIDE THIS SOFTWARE IN ITS "AS IS" - * CONDITION, AND DISCLAIM ANY LIABILITY OF ANY KIND FOR ANY DAMAGES - * WHATSOEVER RESULTING FROM ITS USE. - * - * CSL requests users of this software to return to csl-dist@cs.utah.edu any - * improvements that they make and grant CSL redistribution rights. - * - * Utah $Hdr: model_dep.c 1.34 94/12/14$ - */ -/* - * NOTICE: This file was modified by McAfee Research in 2004 to introduce - * support for mandatory and extensible security protections. 
This notice - * is included in support of clause 2.2 (b) of the Apple Public License, - * Version 2.0. - */ - -#include -#include -#include -#include - -#include -#include -#include - -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include /* for btop */ - -#if MACH_KDB -#include -#include -#include -#include - -extern struct db_command ppc_db_commands[]; -#endif /* MACH_KDB */ - -char kernel_args_buf[256] = "/mach_kernel"; -char boot_args_buf[256] = "/mach_servers/bootstrap"; -char env_buf[256]; - -#define TRAP_DEBUGGER __asm__ volatile("tw 4,r3,r3"); -#define TRAP_DEBUGGER_INST 0x7c831808 -#define TRAP_DIRECT __asm__ volatile("tw 4,r4,r4"); -#define TRAP_DIRECT_INST 0x7c842008 -#define TRAP_INST_SIZE 4 -#define BREAK_TO_KDP0 0x7fe00008 -#define BREAK_TO_KDP1 0x7c800008 -#define BREAK_TO_KDB0 0x7c810808 - -/* - * Code used to synchronize debuggers among all cpus, one active at a time, switch - * from on to another using kdb_on! 
#cpu or cpu #cpu - */ - -hw_lock_data_t debugger_lock; /* debugger lock */ -hw_lock_data_t pbtlock; /* backtrace print lock */ - -unsigned int debugger_cpu = (unsigned)-1; /* current cpu running debugger */ -int debugger_debug = 0; /* Debug debugger */ -int db_run_mode; /* Debugger run mode */ -unsigned int debugger_sync = 0; /* Cross processor debugger entry sync */ -extern unsigned int NMIss; /* NMI debounce switch */ - -extern volatile int panicwait; -volatile unsigned int pbtcnt = 0; -volatile unsigned int pbtcpu = -1; - -unsigned int lastTrace; /* Value of low-level exception trace controls */ - - -volatile unsigned int cpus_holding_bkpts; /* counter for number of cpus holding - breakpoints (ie: cpus that did not - insert back breakpoints) */ -void unlock_debugger(void); -void lock_debugger(void); -void dump_backtrace(struct savearea *sv, - unsigned int stackptr, - unsigned int fence); -void dump_savearea(struct savearea *sv, - unsigned int fence); - -#if !MACH_KDB -boolean_t db_breakpoints_inserted = TRUE; -jmp_buf_t *db_recover; -#endif - -#if MACH_KDB -#include -int kdb_flag=0; -extern boolean_t db_breakpoints_inserted; -extern jmp_buf_t *db_recover; -#define KDB_READY 0x1 -#endif - -#if MACH_KDP -extern int kdp_flag; -#define KDP_READY 0x1 -#endif - -unsigned int db_im_stepping = 0xFFFFFFFF; /* Remember if we were stepping */ - - -const char *failNames[] = { - "Debugging trap", /* failDebug */ - "Corrupt stack", /* failStack */ - "Corrupt mapping tables", /* failMapping */ - "Corrupt context", /* failContext */ - "No saveareas", /* failNoSavearea */ - "Savearea corruption", /* failSaveareaCorr */ - "Invalid live context", /* failBadLiveContext */ - "Corrupt skip lists", /* failSkipLists */ - "Unaligned stack", /* failUnalignedStk */ - "Invalid pmap", /* failPmap */ - "Lock timeout", /* failTimeout */ - "Unknown failure code" /* Unknown failure code - must always be last */ -}; - -const char *invxcption = "Unknown code"; - -static unsigned 
commit_paniclog_to_nvram; - -#if !MACH_KDB -void kdb_trap(__unused int type, __unused struct savearea *regs) {} -#endif /* !MACH_KDB */ - -#if !MACH_KDP -void kdp_trap(__unused int type, __unused struct savearea *regs) {} -#endif /* !MACH_KDP */ - -extern int default_preemption_rate; -extern int max_unsafe_quanta; -extern int max_poll_quanta; - -void -machine_startup(void) -{ - int boot_arg; - unsigned int wncpu; - - if (PE_parse_boot_argn("cpus", &wncpu, sizeof (wncpu))) { - if ((wncpu > 0) && (wncpu < MAX_CPUS)) - max_ncpus = wncpu; - } - - if( PE_get_hotkey( kPEControlKey )) - halt_in_debugger = halt_in_debugger ? 0 : 1; - - if (PE_parse_boot_argn("debug", &boot_arg, sizeof (boot_arg))) { - if (boot_arg & DB_HALT) halt_in_debugger=1; - if (boot_arg & DB_PRT) disable_debug_output=FALSE; - if (boot_arg & DB_SLOG) systemLogDiags=TRUE; - if (boot_arg & DB_NMI) panicDebugging=TRUE; - if (boot_arg & DB_LOG_PI_SCRN) logPanicDataToScreen=TRUE; - } - - if (!PE_parse_boot_argn("nvram_paniclog", &commit_paniclog_to_nvram, sizeof (commit_paniclog_to_nvram))) - commit_paniclog_to_nvram = 1; - - PE_parse_boot_argn("vmmforce", &lowGlo.lgVMMforcedFeats, sizeof (lowGlo.lgVMMforcedFeats)); - - hw_lock_init(&debugger_lock); /* initialize debugger lock */ - hw_lock_init(&pbtlock); /* initialize print backtrace lock */ - -#if MACH_KDB - /* - * Initialize KDB - */ -#if DB_MACHINE_COMMANDS - db_machine_commands_install(ppc_db_commands); -#endif /* DB_MACHINE_COMMANDS */ - ddb_init(); - - if (boot_arg & DB_KDB) - current_debugger = KDB_CUR_DB; - - /* - * Cause a breakpoint trap to the debugger before proceeding - * any further if the proper option bit was specified in - * the boot flags. 
- */ - if (halt_in_debugger && (current_debugger == KDB_CUR_DB)) { - Debugger("inline call to debugger(machine_startup)"); - halt_in_debugger = 0; - active_debugger =1; - } -#endif /* MACH_KDB */ - if (PE_parse_boot_argn("preempt", &boot_arg, sizeof (boot_arg))) { - default_preemption_rate = boot_arg; - } - if (PE_parse_boot_argn("unsafe", &boot_arg, sizeof (boot_arg))) { - max_unsafe_quanta = boot_arg; - } - if (PE_parse_boot_argn("poll", &boot_arg, sizeof (boot_arg))) { - max_poll_quanta = boot_arg; - } - if (PE_parse_boot_argn("yield", &boot_arg, sizeof (boot_arg))) { - sched_poll_yield_shift = boot_arg; - } - - machine_conf(); - - /* - * Kick off the kernel bootstrap. - */ - kernel_bootstrap(); - /*NOTREACHED*/ -} - -char * -machine_boot_info(__unused char *buf, __unused vm_size_t size) -{ - return(PE_boot_args()); -} - -void -machine_conf(void) -{ - machine_info.memory_size = mem_size; /* Note that this will be 2 GB for >= 2 GB machines */ -} - -void -machine_init(void) -{ - debug_log_init(); - clock_config(); -/* Note that we must initialize the stepper tables AFTER the clock is configured!!!!! */ - if(pmsExperimental & 1) pmsCPUConf(); /* (EXPERIMENTAL) Initialize the stepper tables */ - perfmon_init(); - return; - -} - -void -slave_machine_init(__unused void *param) -{ - cpu_machine_init(); /* Initialize the processor */ - clock_init(); /* Init the clock */ -} - -void -halt_all_cpus(boolean_t reboot) -{ - if(reboot) - { - printf("MACH Reboot\n"); - PEHaltRestart(kPERestartCPU); - } - else - { - printf("CPU halted\n"); - PEHaltRestart(kPEHaltCPU); - } - while(1); -} - -void -halt_cpu(void) -{ - halt_all_cpus(FALSE); -} - -#if MACH_ASSERT -/* - * Machine-dependent routine to fill in an array with up to callstack_max - * levels of return pc information. 
- */ -void -machine_callstack(__unused natural_t *buf, __unused vm_size_t callstack_max) -{ -} -#endif /* MACH_ASSERT */ - -void -print_backtrace(struct savearea *ssp) -{ - unsigned int stackptr, fence; - struct savearea *sv, *svssp, *psv; - unsigned int cpu; - -/* - * We need this lock to make sure we don't hang up when we double panic on an MP. - */ - - cpu = cpu_number(); /* Just who are we anyways? */ - if(pbtcpu != cpu) { /* Allow recursion */ - (void)hw_atomic_add(&pbtcnt, 1); /* Remember we are trying */ - while(!hw_lock_try(&pbtlock)); /* Spin here until we can get in. If we never do, well, we're crashing anyhow... */ - pbtcpu = cpu; /* Mark it as us */ - } - - svssp = (struct savearea *)ssp; /* Make this easier */ - sv = NULL; - if(current_thread()) - sv = (struct savearea *)current_thread()->machine.pcb; /* Find most current savearea if system has started */ - - fence = 0xFFFFFFFF; /* Show we go all the way */ - if(sv) fence = (unsigned int)sv->save_r1; /* Stop at previous exception point */ - - if(!svssp) { /* Should we start from stack? */ - kdb_printf("Latest stack backtrace for cpu %d:\n", cpu_number()); - __asm__ volatile("mr %0,r1" : "=r" (stackptr)); /* Get current stack */ - dump_backtrace((struct savearea *)0,stackptr, fence); /* Dump the backtrace */ - if(!sv) { /* Leave if no saveareas */ - hw_lock_unlock(&pbtlock); /* Allow another back trace to happen */ - goto pbt_exit; - } - } - else { /* Were we passed an exception? */ - fence = 0xFFFFFFFF; /* Show we go all the way */ - if(svssp->save_hdr.save_prev) { - if((svssp->save_hdr.save_prev <= vm_last_addr) && ((unsigned int)pmap_find_phys(kernel_pmap, (addr64_t)svssp->save_hdr.save_prev))) { /* Valid address? 
*/ - psv = (struct savearea *)((unsigned int)svssp->save_hdr.save_prev); /* Get the 64-bit back chain converted to a regualr pointer */ - fence = (unsigned int)psv->save_r1; /* Stop at previous exception point */ - } - } - - kdb_printf("Latest crash info for cpu %d:\n", cpu_number()); - kdb_printf(" Exception state (sv=%p)\n", svssp); - dump_savearea(svssp, fence); /* Dump this savearea */ - } - - if(!sv) { /* Leave if no saveareas */ - hw_lock_unlock(&pbtlock); /* Allow another back trace to happen */ - goto pbt_exit; - } - - kdb_printf("Proceeding back via exception chain:\n"); - - while(sv) { /* Do them all... */ - if(!(((addr64_t)((uintptr_t)sv) <= vm_last_addr) && - (unsigned int)pmap_find_phys(kernel_pmap, (addr64_t)((uintptr_t)sv)))) { /* Valid address? */ - kdb_printf(" Exception state (sv=%p) Not mapped or invalid. stopping...\n", sv); - break; - } - - kdb_printf(" Exception state (sv=%p)\n", sv); - if(sv == svssp) { /* Did we dump it already? */ - kdb_printf(" previously dumped as \"Latest\" state. skipping...\n"); - } - else { - fence = 0xFFFFFFFF; /* Show we go all the way */ - if(sv->save_hdr.save_prev) { - if((sv->save_hdr.save_prev <= vm_last_addr) && ((unsigned int)pmap_find_phys(kernel_pmap, (addr64_t)sv->save_hdr.save_prev))) { /* Valid address? 
*/ - psv = (struct savearea *)((unsigned int)sv->save_hdr.save_prev); /* Get the 64-bit back chain converted to a regualr pointer */ - fence = (unsigned int)psv->save_r1; /* Stop at previous exception point */ - } - } - dump_savearea(sv, fence); /* Dump this savearea */ - } - - sv = CAST_DOWN(struct savearea *, sv->save_hdr.save_prev); /* Back chain */ - } - - - pbtcpu = -1; /* Mark as unowned */ - hw_lock_unlock(&pbtlock); /* Allow another back trace to happen */ - (void)hw_atomic_sub(&pbtcnt, 1); /* Show we are done */ - - while(pbtcnt); /* Wait for completion */ -pbt_exit: - panic_display_system_configuration(); - return; -} - -void -dump_savearea(struct savearea *sv, unsigned int fence) -{ - const char *xcode; - - if(sv->save_exception > T_MAX) - xcode = invxcption; /* Too big for table */ - else - xcode = trap_type[sv->save_exception / 4]; /* Point to the type */ - - kdb_printf(" PC=0x%08X; MSR=0x%08X; DAR=0x%08X; DSISR=0x%08X; LR=0x%08X; R1=0x%08X; XCP=0x%08X (%s)\n", - (unsigned int)sv->save_srr0, (unsigned int)sv->save_srr1, (unsigned int)sv->save_dar, sv->save_dsisr, - (unsigned int)sv->save_lr, (unsigned int)sv->save_r1, sv->save_exception, xcode); - - if(!(sv->save_srr1 & MASK(MSR_PR))) { /* Are we in the kernel? */ - dump_backtrace(sv, (unsigned int)sv->save_r1, fence); /* Dump the stack back trace from here if not user state */ - } - - return; -} - -#define DUMPFRAMES 34 -#define LRindex 2 - -void dump_backtrace(struct savearea *sv, unsigned int stackptr, unsigned int fence) { - - unsigned int bframes[DUMPFRAMES]; - unsigned int sframe[8], raddr, dumbo; - int i, index=0; -// char syminfo[80]; - - kdb_printf(" Backtrace:\n"); - if (sv != (struct savearea *)0) { - bframes[0] = (unsigned int)sv->save_srr0; - bframes[1] = (unsigned int)sv->save_lr; - index = 2; - } - for(i = index; i < DUMPFRAMES; i++) { /* Dump up to max frames */ - - if(!stackptr || (stackptr == fence)) break; /* Hit stop point or end... 
*/ - - if(stackptr & 0x0000000F) { /* Is stack pointer valid? */ - kdb_printf("\n backtrace terminated - unaligned frame address: 0x%08X\n", stackptr); /* No, tell 'em */ - break; - } - - raddr = (unsigned int)pmap_find_phys(kernel_pmap, (addr64_t)stackptr); /* Get physical frame address */ - if(!raddr || (stackptr > vm_last_addr)) { /* Is it mapped? */ - kdb_printf("\n backtrace terminated - frame not mapped or invalid: 0x%08X\n", stackptr); /* No, tell 'em */ - break; - } - - if(!mapping_phys_lookup(raddr, &dumbo)) { /* Is it within physical RAM? */ - kdb_printf("\n backtrace terminated - frame outside of RAM: v=0x%08X, p=%08X\n", stackptr, raddr); /* No, tell 'em */ - break; - } - - ReadReal((addr64_t)((raddr << 12) | (stackptr & 4095)), &sframe[0]); /* Fetch the stack frame */ - - bframes[i] = sframe[LRindex]; /* Save the link register */ - -// syms_formataddr((vm_offset_t)bframes[i], syminfo, sizeof (syminfo)); -// kdb_printf(" %s\n", syminfo); - if(!i) kdb_printf(" "); /* Indent first time */ - else if(!(i & 7)) kdb_printf("\n "); /* Skip to new line every 8 */ - kdb_printf("0x%08X ", bframes[i]); /* Dump the link register */ - - stackptr = sframe[0]; /* Chain back */ - } - kdb_printf("\n"); - if(i >= DUMPFRAMES) kdb_printf(" backtrace continues...\n"); /* Say we terminated early */ - if(i) kmod_panic_dump((vm_offset_t *)&bframes[0], i); /* Show what kmods are in trace */ - -} - -void commit_paniclog(void) { - unsigned long pi_size = 0; - - if (debug_buf_size > 0) { - if (commit_paniclog_to_nvram) { - unsigned int bufpos; - - /* XXX Consider using the WKdm compressor in the - * future, rather than just packing - would need to - * be co-ordinated with crashreporter, which decodes - * this post-restart. The compressor should be - * capable of in-place compression. - */ - bufpos = packA(debug_buf, (unsigned) (debug_buf_ptr - debug_buf), debug_buf_size); - /* If compression was successful, - * use the compressed length - */ - pi_size = bufpos ? 
bufpos : (unsigned) (debug_buf_ptr - debug_buf); - - /* Truncate if the buffer is larger than a - * certain magic size - this really ought to - * be some appropriate fraction of the NVRAM - * image buffer, and is best done in the - * savePanicInfo() or PESavePanicInfo() calls - * This call must save data synchronously, - * since we can subsequently halt the system. - */ - kprintf("Attempting to commit panic log to NVRAM\n"); - /* N.B.: This routine (currently an IOKit wrapper that - * calls through to the appropriate platform NVRAM - * driver, must be panic context safe, i.e. - * acquire no locks or require kernel services. - * This does not appear to be the case currently - * on some platforms, unfortunately (the driver - * on command gate serialization). - */ - pi_size = PESavePanicInfo((unsigned char *)debug_buf, - ((pi_size > 2040) ? 2040 : pi_size)); - /* Uncompress in-place, to allow debuggers to examine - * the panic log. - */ - if (bufpos) - unpackA(debug_buf, bufpos); - } - } -} - -void -Debugger(const char *message) { - - spl_t spl; - - spl = splhigh(); /* No interruptions from here on */ - -/* - * backtrace for Debugger() call from panic() if no current debugger - * backtrace and return for double panic() call - */ - if ((panicstr != (char *)0) && - (((nestedpanic != 0) && (current_debugger == 1)) || (active_debugger == 0))) { - print_backtrace(NULL); - if (nestedpanic != 0) { - splx(spl); - return; /* Yeah, don't enter again... */ - } - } - - if (debug_mode && getPerProc()->debugger_active) { /* Are we already on debugger on this processor? */ - splx(spl); - return; /* Yeah, don't do it again... */ - } - - -/* - * The above stuff catches the double panic case so we shouldn't have to worry about that here. - */ - if ( panicstr != (char *)0 ) - { - disable_preemption(); - /* Commit the panic log buffer to NVRAM, unless otherwise - * specified via a boot-arg. 
- */ - commit_paniclog(); - if(!panicDebugging) { - unsigned int my_cpu, tcpu; - - my_cpu = cpu_number(); - debugger_cpu = my_cpu; - - (void)hw_atomic_add(&debug_mode, 1); - PerProcTable[my_cpu].ppe_vaddr->debugger_active++; - lock_debugger(); - - for(tcpu = 0; tcpu < real_ncpus; tcpu++) { - if(tcpu == my_cpu) continue; - (void)hw_atomic_add(&debugger_sync, 1); - (void)cpu_signal(tcpu, SIGPdebug, 0 ,0); - } - (void)hw_cpu_sync(&debugger_sync, LockTimeOut); - debugger_sync = 0; - } - - draw_panic_dialog(); - - if(!panicDebugging) { -#if CONFIG_EMBEDDED - PEHaltRestart(kPEPanicRestartCPU); -#else - PEHaltRestart( kPEHangCPU ); -#endif - } - - enable_preemption(); - } - - if ((current_debugger != NO_CUR_DB)) { /* If there is a debugger configured, enter it */ - printf("Debugger(%s)\n", message); - TRAP_DEBUGGER; - splx(spl); - return; /* Done debugging for a while */ - } - - printf("\nNo debugger configured - dumping debug information\n"); - printf("MSR=%08X\n",mfmsr()); - print_backtrace(NULL); - splx(spl); - return; -} - -/* - * Here's where we attempt to get some diagnostic information dumped out - * when the system is really confused. We will try to get into the - * debugger as well. - * - * We are here with interrupts disabled and on the debug stack. The savearea - * that was passed in is NOT chained to the activation. - * - * save_r3 contains the failure reason code. 
- */ - -void -SysChoked(unsigned int type, struct savearea *sv) -{ - unsigned int failcode; - const char * const pmsg = "System Failure: cpu=%d; code=%08X (%s)\n"; - mp_disable_preemption(); - disable_debug_output = FALSE; - debug_mode = TRUE; - - failcode = (unsigned int)sv->save_r3; /* Get the failure code */ - if(failcode > failUnknown) failcode = failUnknown; /* Set unknown code code */ - - kprintf(pmsg, cpu_number(), (unsigned int)sv->save_r3, failNames[failcode]); - kdb_printf(pmsg, cpu_number(), (unsigned int)sv->save_r3, failNames[failcode]); - - print_backtrace(sv); /* Attempt to print backtrace */ - - /* Commit the panic log buffer to NVRAM, unless otherwise - * specified via a boot-arg. For certain types of panics - * which result in a "choke" exception, this may well - * be inadvisable, and setting the nvram_paniclog=0 - * boot-arg may be useful. - */ - - if (panicDebugging) - commit_paniclog(); - - Call_DebuggerC(type, sv); /* Attempt to get into debugger */ - - if ((current_debugger != NO_CUR_DB)) - Call_DebuggerC(type, sv); /* Attempt to get into debugger */ - panic_plain(pmsg, cpu_number(), (unsigned int)sv->save_r3, failNames[failcode]); -} - - - -/* - * When we get here, interruptions are disabled and we are on the debugger stack - * Never, ever, ever, ever enable interruptions from here on - */ - -int -Call_DebuggerC(unsigned int type, struct savearea *saved_state) -{ - int directcall, wait; - addr64_t instr_ptr = 0ULL; - ppnum_t instr_pp; - unsigned int instr, tcpu, my_cpu; - int wasdebugger; - - my_cpu = cpu_number(); /* Get our CPU */ - -#if MACH_KDB - if((debugger_cpu == my_cpu) && /* Do we already own debugger? */ - PerProcTable[my_cpu].ppe_vaddr->debugger_active && /* and are we really active? */ - db_recover && /* and have we set up recovery? */ - (current_debugger == KDB_CUR_DB)) { /* and are we in KDB (only it handles recovery) */ - kdb_trap(type, saved_state); /* Then reenter it... 
*/ - } -#endif - - (void)hw_atomic_add(&debug_mode, 1); /* Indicate we are in debugger */ - PerProcTable[my_cpu].ppe_vaddr->debugger_active++; /* Show active on our CPU */ - - lock_debugger(); /* Insure that only one CPU is in debugger */ - - if(db_im_stepping == my_cpu) { /* Are we just back from a step? */ - enable_preemption_no_check(); /* Enable preemption now */ - db_im_stepping = 0xFFFFFFFF; /* Nobody stepping right now */ - } - - if (debugger_debug) { -#if 0 - kprintf("Call_DebuggerC(%d): %08X %08X, debact = %d\n", my_cpu, type, (uint32_t)saved_state, debug_mode); /* (TEST/DEBUG) */ -#endif - printf("Call_Debugger: enter - cpu %d, is_slave %d, debugger_cpu %d, pc %08llX\n", - my_cpu, PerProcTable[my_cpu].ppe_vaddr->debugger_is_slave, debugger_cpu, saved_state->save_srr0); - } - - instr_pp = (vm_offset_t)pmap_find_phys(kernel_pmap, (addr64_t)(saved_state->save_srr0)); - - if (instr_pp) { - instr_ptr = (addr64_t)(((addr64_t)instr_pp << 12) | (saved_state->save_srr0 & 0xFFF)); /* Make physical address */ - instr = ml_phys_read_64(instr_ptr); /* Get the trap that caused entry */ - } - else instr = 0; - -#if 0 - if (debugger_debug) kprintf("Call_DebuggerC(%d): instr_pp = %08X, instr_ptr = %016llX, instr = %08X\n", my_cpu, instr_pp, instr_ptr, instr); /* (TEST/DEBUG) */ -#endif - - if (db_breakpoints_inserted) cpus_holding_bkpts++; /* Bump up the holding count */ - if ((debugger_cpu == (unsigned)-1) && - !PerProcTable[my_cpu].ppe_vaddr->debugger_is_slave) { -#if 0 - if (debugger_debug) kprintf("Call_DebuggerC(%d): lasttrace = %08X\n", my_cpu, lastTrace); /* (TEST/DEBUG) */ -#endif - debugger_cpu = my_cpu; /* Show that we are debugger */ - - - lastTrace = LLTraceSet(0); /* Disable low-level tracing */ - - for(tcpu = 0; tcpu < real_ncpus; tcpu++) { /* Stop all the other guys */ - if(tcpu == my_cpu) continue; /* Don't diddle ourselves */ - (void)hw_atomic_add(&debugger_sync, 1); /* Count signal sent */ - (void)cpu_signal(tcpu, SIGPdebug, 0 ,0); /* Tell 'em to enter 
debugger */ - } - (void)hw_cpu_sync(&debugger_sync, LockTimeOut); /* Wait for the other processors to enter debug */ - debugger_sync = 0; /* We're done with it */ - } - else if (debugger_cpu != my_cpu) goto debugger_exit; /* We are not debugger, don't continue... */ - - - if (instr == TRAP_DIRECT_INST) { - disable_debug_output = FALSE; - print_backtrace(saved_state); - } - - switch_debugger = 0; /* Make sure switch request is off */ - directcall = 1; /* Assume direct call */ - - if (saved_state->save_srr1 & MASK(SRR1_PRG_TRAP)) { /* Trap instruction? */ - - directcall = 0; /* We had a trap not a direct call */ - - switch (instr) { /* Select trap type */ - -#if MACH_KDP - case BREAK_TO_KDP0: /* Breakpoint into KDP? */ - case BREAK_TO_KDP1: /* Breakpoint into KDP? */ - current_debugger = KDP_CUR_DB; /* Yes, set KDP */ - kdp_trap(type, saved_state); /* Enter it */ - break; -#endif - -#if MACH_KDB - case BREAK_TO_KDB0: /* Breakpoint to KDB (the "good" debugger)? */ - current_debugger = KDB_CUR_DB; /* Yes, set it */ - kdb_trap(type, saved_state); /* Enter it */ - break; -#endif - - case TRAP_DEBUGGER_INST: /* Should we enter the current debugger? */ - case TRAP_DIRECT_INST: /* Should we enter the current debugger? */ - if (current_debugger == KDP_CUR_DB) /* Is current KDP? */ - kdp_trap(type, saved_state); /* Yes, enter it */ - else if (current_debugger == KDB_CUR_DB) /* Is this KDB? */ - kdb_trap(type, saved_state); /* Yes, go ahead and enter */ - else goto debugger_error; /* No debugger active */ - break; - - default: /* Unknown/bogus trap type */ - goto debugger_error; - } - } - - while(1) { /* We are here to handle debugger switches */ - - if(!directcall) { /* Was this a direct call? */ - if(!switch_debugger) break; /* No, then leave if no switch requested... */ - -/* - * Note: we can only switch to a debugger we have. Ignore bogus switch requests. 
- */ -#if 0 - if (debugger_debug) kprintf("Call_DebuggerC(%d): switching debuggers\n", my_cpu); /* (TEST/DEBUG) */ -#endif -#if MACH_KDB - if(current_debugger == KDP_CUR_DB) current_debugger = KDB_CUR_DB; /* Switch to KDB */ -#if MACH_KDP - else -#endif -#endif -#if MACH_KDP - if(current_debugger == KDB_CUR_DB) current_debugger = KDP_CUR_DB; /* Switch to KDP */ -#endif - } - - switch_debugger = 0; /* Clear request */ - directcall = 0; /* Clear first-time direct call indication */ - - switch (current_debugger) { /* Enter correct debugger */ - - case KDP_CUR_DB: /* Enter KDP */ - kdp_trap(type, saved_state); - break; - - case KDB_CUR_DB: /* Enter KDB */ - kdb_trap(type, saved_state); - break; - - default: /* No debugger installed */ - goto debugger_error; - break; - } - } - -debugger_exit: -#if 0 - if (debugger_debug) kprintf("Call_DebuggerC(%d): exit - inst = %08X, cpu=%d(%d), run=%d\n", my_cpu, - instr, my_cpu, debugger_cpu, db_run_mode); /* (TEST/DEBUG) */ -#endif - if ((instr == TRAP_DEBUGGER_INST) || /* Did we trap to enter debugger? */ - (instr == TRAP_DIRECT_INST)) saved_state->save_srr0 += TRAP_INST_SIZE; /* Yes, point past trap */ - - wasdebugger = 0; /* Assume not debugger */ - if(debugger_cpu == my_cpu) { /* Are the debugger processor? */ - wasdebugger = 1; /* Remember that we were the debugger */ - LLTraceSet(lastTrace); /* Enable tracing on the way out if we are debugger */ - } - - wait = FALSE; /* Assume we are not going to wait */ - if (db_run_mode == STEP_CONTINUE) { /* Are we going to run? */ - wait = TRUE; /* Yeah, remember to wait for breakpoints to clear */ - debugger_cpu = -1; /* Release other processor's debuggers */ - for(tcpu = 0; tcpu < real_ncpus; tcpu++) - PerProcTable[tcpu].ppe_vaddr->debugger_pending = 0; /* Release request (this is a HACK) */ - NMIss = 0; /* Let NMI bounce */ - } - - if(db_run_mode == STEP_ONCE) { /* Are we about to step? 
*/ - disable_preemption(); /* Disable preemption for the step */ - db_im_stepping = my_cpu; /* Remember that I am about to step */ - } - - if (db_breakpoints_inserted) cpus_holding_bkpts--; /* If any breakpoints, back off count */ - if (PerProcTable[my_cpu].ppe_vaddr->debugger_is_slave) PerProcTable[my_cpu].ppe_vaddr->debugger_is_slave--; /* If we were a slove, uncount us */ - if (debugger_debug) - printf("Call_Debugger: exit - cpu %d, debugger_cpu %d, run_mode %d holds %d\n", - my_cpu, debugger_cpu, db_run_mode, - cpus_holding_bkpts); - - unlock_debugger(); /* Release the lock */ - PerProcTable[my_cpu].ppe_vaddr->debugger_active--; /* Say we aren't active anymore */ - - if (wait) while(cpus_holding_bkpts); /* Wait for breakpoints to clear */ - - - (void)hw_atomic_sub(&debug_mode, 1); /* Set out of debug now */ - - return(1); /* Exit debugger normally */ - -debugger_error: - if(db_run_mode != STEP_ONCE) enable_preemption_no_check(); /* Enable preemption, but don't preempt here */ - (void)hw_atomic_sub(&debug_mode, 1); /* Set out of debug now */ - return(0); /* Return in shame... */ - -} - -void -lock_debugger(void) -{ - unsigned int my_cpu; - - my_cpu = cpu_number(); /* Get our CPU number */ - - while(1) { /* Check until we get it */ - if (debugger_cpu != (unsigned)-1 && debugger_cpu != my_cpu) - continue; /* Someone, not us, is debugger... */ - if (hw_lock_try(&debugger_lock)) { /* Get the debug lock */ - if (debugger_cpu == (unsigned)-1 || debugger_cpu == my_cpu) - break; /* Is it us? 
*/ - hw_lock_unlock(&debugger_lock); /* Not us, release lock */ - } - } -} - -void unlock_debugger(void) { - - hw_lock_unlock(&debugger_lock); - -} - -int patchInst(task_t task, addr64_t vaddr, uint32_t inst); -int patchInst(task_t task, addr64_t vaddr, uint32_t inst) -{ - vm_map_t map; - addr64_t paddr; - uint32_t instr, nestingDepth; - kern_return_t ret; - vm_region_submap_short_info_data_64_t info; - mach_msg_type_number_t count; - mach_vm_address_t address; - mach_vm_size_t sizeOfRegion; - vm_prot_t reprotect; - - if(task == TASK_NULL) return -1; /* Leave if task is bogus... */ - - task_lock(task); /* Make sure the task doesn't go anywhaere */ - if (!task->active) { /* Is is alive? */ - task_unlock(task); /* Nope, unlock */ - return -1; /* Not a active task, fail... */ - } - map = task->map; /* Get his map */ - vm_map_reference_swap(map); /* Don't let it go away */ - task_unlock(task); /* Unleash the task */ - - /* Find the memory permissions. */ - nestingDepth=999999; /* Limit recursion */ - - count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64; - address = (mach_vm_address_t)vaddr; - sizeOfRegion = (mach_vm_size_t)4; - - ret = mach_vm_region_recurse(map, &address, &sizeOfRegion, &nestingDepth, (vm_region_recurse_info_t)&info, &count); - if (ret != KERN_SUCCESS) { /* Leave if it didn't work */ - vm_map_deallocate(map); /* Drop reference on map */ - return (-1); - } - -/* - * We need to check if there could be a problem if the dtrace probes are being removed and the code is being - * executed at the same time. This sequence may leave us with no-execute turned on temporarily when we execute - * through it. - */ - - if (!(info.protection & VM_PROT_WRITE)) { - /* Save the original protection values for restoration later */ - reprotect = info.protection; - - if (info.max_protection & VM_PROT_WRITE) { - /* The memory is not currently writable, but can be made writable. 
*/ - ret = mach_vm_protect(map, (mach_vm_offset_t)vaddr, (mach_vm_size_t)4, 0, reprotect | VM_PROT_WRITE); - } - else { - /* - * The memory is not currently writable, and cannot be made writable. We need to COW this memory. - * - * Strange, we can't just say "reprotect | VM_PROT_COPY", that fails. - */ - ret = mach_vm_protect(map, (mach_vm_offset_t)vaddr, (mach_vm_size_t)4, 0, VM_PROT_COPY | VM_PROT_READ | VM_PROT_WRITE); - } - - if (ret != KERN_SUCCESS) { - vm_map_deallocate(map); /* Drop reference on map */ - return (-1); - } - - } - else { - /* The memory was already writable. */ - reprotect = VM_PROT_NONE; - } - - instr = inst; /* Place instruction in local memory */ - ret = vm_map_write_user(map, &inst, (vm_map_address_t)vaddr, (vm_size_t)4); /* Write the instruction */ - if (ret != KERN_SUCCESS) { /* Leave if it didn't work */ - - if (reprotect != VM_PROT_NONE) { - ret = mach_vm_protect (map, (mach_vm_offset_t)vaddr, (mach_vm_size_t)4, 0, reprotect); - } - - vm_map_deallocate(map); /* Drop reference on map */ - return (-1); - } - - paddr = (addr64_t)pmap_find_phys(map->pmap, vaddr) << 12; /* Find the physical address of the patched address */ - if(!paddr) { /* Is address mapped now? */ - vm_map_deallocate(map); /* Drop reference on map */ - return 0; /* Leave... */ - } - paddr = paddr | (vaddr & 4095); /* Construct physical address */ - invalidate_icache64(paddr, 4, 1); /* Flush out the instruction cache here */ - - if (reprotect != VM_PROT_NONE) { - ret = mach_vm_protect(map, (mach_vm_offset_t)vaddr, (mach_vm_size_t)4, 0, reprotect); - } - - vm_map_deallocate(map); - - return (0); -} diff --git a/osfmk/ppc/movc.s b/osfmk/ppc/movc.s deleted file mode 100644 index 2e100071b..000000000 --- a/osfmk/ppc/movc.s +++ /dev/null @@ -1,1303 +0,0 @@ -/* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -#include -#include -#include -#include -#include -#include - -#define INSTRUMENT 0 - -//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> -/* - * void pmap_zero_page(vm_offset_t pa) - * - * Zero a page of physical memory. This routine runs in 32 or 64-bit mode, - * and handles 32 and 128-byte cache lines. - */ - - - .align 5 - .globl EXT(pmap_zero_page) - -LEXT(pmap_zero_page) - - mflr r12 // save return address - bl EXT(ml_set_physical_disabled) // turn DR and EE off, SF on, get features in r10 - mtlr r12 // restore return address - andi. 
r9,r10,pf32Byte+pf128Byte // r9 <- cache line size - - subfic r4,r9,PPC_PGBYTES // r4 <- starting offset in page - - bt++ pf64Bitb,page0S4 // Go do the big guys... - - slwi r3,r3,12 // get page address from page num - b page_zero_1 // Jump to line aligned loop... - - .align 5 - - nop - nop - nop - nop - nop - nop - nop - -page0S4: - sldi r3,r3,12 // get page address from page num - -page_zero_1: // loop zeroing cache lines - sub. r5,r4,r9 // more to go? - dcbz128 r3,r4 // zero either 32 or 128 bytes - sub r4,r5,r9 // generate next offset - dcbz128 r3,r5 - bne-- page_zero_1 - - b EXT(ml_restore) // restore MSR and do the isync - - -//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> -/* void - * phys_copy(src, dst, bytecount) - * addr64_t src; - * addr64_t dst; - * int bytecount - * - * This routine will copy bytecount bytes from physical address src to physical - * address dst. It runs in 64-bit mode if necessary, but does not handle - * overlap or make any attempt to be optimal. Length must be a signed word. - * Not performance critical. - */ - - - .align 5 - .globl EXT(phys_copy) - -LEXT(phys_copy) - - rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg - mflr r12 // get return address - rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits - rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg - bl EXT(ml_set_physical_disabled) // turn DR and EE off, SF on, get features in r10 - rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits - mtlr r12 // restore return address - subic. r5,r7,4 // a word to copy? - b phys_copy_2 - - .align 5 - -phys_copy_1: // loop copying words - subic. r5,r5,4 // more to go? - lwz r0,0(r3) - addi r3,r3,4 - stw r0,0(r4) - addi r4,r4,4 -phys_copy_2: - bge phys_copy_1 - addic. r5,r5,4 // restore count - ble phys_copy_4 // no more - - // Loop is aligned here - -phys_copy_3: // loop copying bytes - subic. r5,r5,1 // more to go? 
- lbz r0,0(r3) - addi r3,r3,1 - stb r0,0(r4) - addi r4,r4,1 - bgt phys_copy_3 -phys_copy_4: - b EXT(ml_restore) // restore MSR and do the isync - - -//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> -/* void - * pmap_copy_page(src, dst) - * ppnum_t src; - * ppnum_t dst; - * - * This routine will copy the physical page src to physical page dst - * - * This routine assumes that the src and dst are page numbers and that the - * destination is cached. It runs on 32 and 64 bit processors, with and - * without altivec, and with 32 and 128 byte cache lines. - * We also must assume that no-one will be executing within the destination - * page, and that this will be used for paging. Because this - * is a common routine, we have tuned loops for each processor class. - * - */ -#define kSFSize (FM_SIZE+160) - -ENTRY(pmap_copy_page, TAG_NO_FRAME_USED) - - lis r2,hi16(MASK(MSR_VEC)) ; Get the vector flag - mflr r0 // get return - ori r2,r2,lo16(MASK(MSR_FP)) ; Add the FP flag - stw r0,8(r1) // save - stwu r1,-kSFSize(r1) // set up a stack frame for VRs or FPRs - mfmsr r11 // save MSR at entry - mfsprg r10,2 // get feature flags - andc r11,r11,r2 // Clear out vec and fp - ori r2,r2,lo16(MASK(MSR_EE)) // Get EE on also - andc r2,r11,r2 // Clear out EE as well - mtcrf 0x02,r10 // we need to test pf64Bit - ori r2,r2,MASK(MSR_FP) // must enable FP for G3... 
- mtcrf 0x80,r10 // we need to test pfAltivec too - oris r2,r2,hi16(MASK(MSR_VEC)) // enable altivec for G4 (ignored if G3) - mtmsr r2 // turn EE off, FP and VEC on - isync - bt++ pf64Bitb,pmap_copy_64 // skip if 64-bit processor (only they take hint) - slwi r3,r3,12 // get page address from page num - slwi r4,r4,12 // get page address from page num - rlwinm r12,r2,0,MSR_DR_BIT+1,MSR_DR_BIT-1 // get ready to turn off DR - bt pfAltivecb,pmap_copy_g4 // altivec but not 64-bit means G4 - - - // G3 -- copy using FPRs - - stfd f0,FM_SIZE+0(r1) // save the 4 FPRs we use to copy - stfd f1,FM_SIZE+8(r1) - li r5,PPC_PGBYTES/32 // count of cache lines in a page - stfd f2,FM_SIZE+16(r1) - mtctr r5 - stfd f3,FM_SIZE+24(r1) - mtmsr r12 // turn off DR after saving FPRs on stack - isync - -pmap_g3_copy_loop: // loop over 32-byte cache lines - dcbz 0,r4 // avoid read of dest line - lfd f0,0(r3) - lfd f1,8(r3) - lfd f2,16(r3) - lfd f3,24(r3) - addi r3,r3,32 - stfd f0,0(r4) - stfd f1,8(r4) - stfd f2,16(r4) - stfd f3,24(r4) - dcbst 0,r4 // flush dest line to RAM - addi r4,r4,32 - bdnz pmap_g3_copy_loop - - sync // wait for stores to take - subi r4,r4,PPC_PGBYTES // restore ptr to destintation page - li r6,PPC_PGBYTES-32 // point to last line in page -pmap_g3_icache_flush: - subic. r5,r6,32 // more to go? 
- icbi r4,r6 // flush another line in icache - subi r6,r5,32 // get offset to next line - icbi r4,r5 - bne pmap_g3_icache_flush - - sync - mtmsr r2 // turn DR back on - isync - lfd f0,FM_SIZE+0(r1) // restore the FPRs - lfd f1,FM_SIZE+8(r1) - lfd f2,FM_SIZE+16(r1) - lfd f3,FM_SIZE+24(r1) - - b pmap_g4_restore // restore MSR and done - - - // G4 -- copy using VRs - -pmap_copy_g4: // r2=(MSR-EE), r12=(r2-DR), r10=features, r11=old MSR - la r9,FM_SIZE+16(r1) // place where we save VRs to r9 - li r5,16 // load x-form offsets into r5-r9 - li r6,32 // another offset - stvx v0,0,r9 // save some VRs so we can use to copy - li r7,48 // another offset - stvx v1,r5,r9 - li r0,PPC_PGBYTES/64 // we loop over 64-byte chunks - stvx v2,r6,r9 - mtctr r0 - li r8,96 // get look-ahead for touch - stvx v3,r7,r9 - li r9,128 - mtmsr r12 // now we've saved VRs on stack, turn off DR - isync // wait for it to happen - b pmap_g4_copy_loop - - .align 5 // align inner loops -pmap_g4_copy_loop: // loop over 64-byte chunks - dcbt r3,r8 // touch 3 lines ahead - nop // avoid a 17-word loop... - dcbt r3,r9 // touch 4 lines ahead - nop // more padding - dcba 0,r4 // avoid pre-fetch of 1st dest line - lvx v0,0,r3 // offset 0 - lvx v1,r5,r3 // offset 16 - lvx v2,r6,r3 // offset 32 - lvx v3,r7,r3 // offset 48 - addi r3,r3,64 - dcba r6,r4 // avoid pre-fetch of 2nd line - stvx v0,0,r4 // offset 0 - stvx v1,r5,r4 // offset 16 - stvx v2,r6,r4 // offset 32 - stvx v3,r7,r4 // offset 48 - dcbf 0,r4 // push line 1 - dcbf r6,r4 // and line 2 - addi r4,r4,64 - bdnz pmap_g4_copy_loop - - sync // wait for stores to take - subi r4,r4,PPC_PGBYTES // restore ptr to destintation page - li r8,PPC_PGBYTES-32 // point to last line in page -pmap_g4_icache_flush: - subic. r9,r8,32 // more to go? 
- icbi r4,r8 // flush from icache - subi r8,r9,32 // get offset to next line - icbi r4,r9 - bne pmap_g4_icache_flush - - sync - mtmsr r2 // turn DR back on - isync - la r9,FM_SIZE+16(r1) // get base of VR save area - lvx v0,0,r9 // restore the VRs - lvx v1,r5,r9 - lvx v2,r6,r9 - lvx v3,r7,r9 - -pmap_g4_restore: // r11=MSR - mtmsr r11 // turn EE on, VEC and FR off - isync // wait for it to happen - addi r1,r1,kSFSize // pop off our stack frame - lwz r0,8(r1) // restore return address - mtlr r0 - blr - - - // 64-bit/128-byte processor: copy using VRs - -pmap_copy_64: // r10=features, r11=old MSR - sldi r3,r3,12 // get page address from page num - sldi r4,r4,12 // get page address from page num - la r9,FM_SIZE+16(r1) // get base of VR save area - li r5,16 // load x-form offsets into r5-r9 - li r6,32 // another offset - bf pfAltivecb,pmap_novmx_copy // altivec suppressed... - stvx v0,0,r9 // save 8 VRs so we can copy wo bubbles - stvx v1,r5,r9 - li r7,48 // another offset - li r0,PPC_PGBYTES/128 // we loop over 128-byte chunks - stvx v2,r6,r9 - stvx v3,r7,r9 - addi r9,r9,64 // advance base ptr so we can store another 4 - mtctr r0 - li r0,MASK(MSR_DR) // get DR bit - stvx v4,0,r9 - stvx v5,r5,r9 - andc r12,r2,r0 // turn off DR bit - li r0,1 // get a 1 to slam into SF - stvx v6,r6,r9 - stvx v7,r7,r9 - rldimi r12,r0,63,MSR_SF_BIT // set SF bit (bit 0) - li r8,-128 // offset so we can reach back one line - mtmsrd r12 // now we've saved VRs, turn DR off and SF on - isync // wait for it to happen - dcbt128 0,r3,1 // start a forward stream - b pmap_64_copy_loop - - .align 5 // align inner loops -pmap_64_copy_loop: // loop over 128-byte chunks - dcbz128 0,r4 // avoid read of destination line - lvx v0,0,r3 // offset 0 - lvx v1,r5,r3 // offset 16 - lvx v2,r6,r3 // offset 32 - lvx v3,r7,r3 // offset 48 - addi r3,r3,64 // don't have enough GPRs so add 64 2x - lvx v4,0,r3 // offset 64 - lvx v5,r5,r3 // offset 80 - lvx v6,r6,r3 // offset 96 - lvx v7,r7,r3 // offset 112 - addi 
r3,r3,64 - stvx v0,0,r4 // offset 0 - stvx v1,r5,r4 // offset 16 - stvx v2,r6,r4 // offset 32 - stvx v3,r7,r4 // offset 48 - addi r4,r4,64 - stvx v4,0,r4 // offset 64 - stvx v5,r5,r4 // offset 80 - stvx v6,r6,r4 // offset 96 - stvx v7,r7,r4 // offset 112 - addi r4,r4,64 - dcbf r8,r4 // flush the line we just wrote - bdnz pmap_64_copy_loop - - sync // wait for stores to take - subi r4,r4,PPC_PGBYTES // restore ptr to destintation page - li r8,PPC_PGBYTES-128 // point to last line in page -pmap_64_icache_flush: - subic. r9,r8,128 // more to go? - icbi r4,r8 // flush from icache - subi r8,r9,128 // get offset to next line - icbi r4,r9 - bne pmap_64_icache_flush - - sync - mtmsrd r2 // turn DR back on, SF off - isync - la r9,FM_SIZE+16(r1) // get base address of VR save area on stack - lvx v0,0,r9 // restore the VRs - lvx v1,r5,r9 - lvx v2,r6,r9 - lvx v3,r7,r9 - addi r9,r9,64 - lvx v4,0,r9 - lvx v5,r5,r9 - lvx v6,r6,r9 - lvx v7,r7,r9 - - b pmap_g4_restore // restore lower half of MSR and return - - // - // Copy on 64-bit without VMX - // - -pmap_novmx_copy: - li r0,PPC_PGBYTES/128 // we loop over 128-byte chunks - mtctr r0 - li r0,MASK(MSR_DR) // get DR bit - andc r12,r2,r0 // turn off DR bit - li r0,1 // get a 1 to slam into SF - rldimi r12,r0,63,MSR_SF_BIT // set SF bit (bit 0) - mtmsrd r12 // now we've saved VRs, turn DR off and SF on - isync // wait for it to happen - dcbt128 0,r3,1 // start a forward stream - -pmap_novmx_copy_loop: // loop over 128-byte cache lines - dcbz128 0,r4 // avoid read of dest line - - ld r0,0(r3) // Load half a line - ld r12,8(r3) - ld r5,16(r3) - ld r6,24(r3) - ld r7,32(r3) - ld r8,40(r3) - ld r9,48(r3) - ld r10,56(r3) - - std r0,0(r4) // Store half a line - std r12,8(r4) - std r5,16(r4) - std r6,24(r4) - std r7,32(r4) - std r8,40(r4) - std r9,48(r4) - std r10,56(r4) - - ld r0,64(r3) // Load half a line - ld r12,72(r3) - ld r5,80(r3) - ld r6,88(r3) - ld r7,96(r3) - ld r8,104(r3) - ld r9,112(r3) - ld r10,120(r3) - - addi r3,r3,128 - - std 
r0,64(r4) // Store half a line - std r12,72(r4) - std r5,80(r4) - std r6,88(r4) - std r7,96(r4) - std r8,104(r4) - std r9,112(r4) - std r10,120(r4) - - dcbf 0,r4 // flush the line we just wrote - addi r4,r4,128 - bdnz pmap_novmx_copy_loop - - sync // wait for stores to take - subi r4,r4,PPC_PGBYTES // restore ptr to destintation page - li r8,PPC_PGBYTES-128 // point to last line in page - -pmap_novmx_icache_flush: - subic. r9,r8,128 // more to go? - icbi r4,r8 // flush from icache - subi r8,r9,128 // get offset to next line - icbi r4,r9 - bne pmap_novmx_icache_flush - - sync - mtmsrd r2 // turn DR back on, SF off - isync - - b pmap_g4_restore // restore lower half of MSR and return - - - -//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> - -// Stack frame format used by copyin, copyout, copyinstr and copyoutstr. -// These routines all run both on 32 and 64-bit machines, though because they are called -// by the BSD kernel they are always in 32-bit mode when entered. The mapped ptr returned -// by MapUserMemoryWindow will be 64 bits however on 64-bit machines. Beware to avoid -// using compare instructions on this ptr. This mapped ptr is kept globally in r31, so there -// is no need to store or load it, which are mode-dependent operations since it could be -// 32 or 64 bits. 
- -#define kkFrameSize (FM_SIZE+32) - -#define kkBufSize (FM_SIZE+0) -#define kkCR3 (FM_SIZE+4) -#define kkSource (FM_SIZE+8) -#define kkDest (FM_SIZE+12) -#define kkCountPtr (FM_SIZE+16) -#define kkR31Save (FM_SIZE+20) -#define kkThrErrJmp (FM_SIZE+24) - - -// nonvolatile CR bits we use as flags in cr3 - -#define kk64bit 12 -#define kkNull 13 -#define kkIn 14 -#define kkString 15 -#define kkZero 15 - - -//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> -/* - * int - * copyoutstr(src, dst, maxcount, count) - * vm_offset_t src; // r3 - * addr64_t dst; // r4 and r5 - * vm_size_t maxcount; // r6 - * vm_size_t* count; // r7 - * - * Set *count to the number of bytes copied. - */ - -ENTRY(copyoutstr, TAG_NO_FRAME_USED) - mfcr r2,0x10 // save caller's cr3, which we use for flags - mr r10,r4 // move high word of 64-bit user address to r10 - li r0,0 - crset kkString // flag as a string op - mr r11,r5 // move low word of 64-bit user address to r11 - stw r0,0(r7) // initialize #bytes moved - crclr kkIn // flag as copyout - b copyJoin - - -//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> -/* - * int - * copyinstr(src, dst, maxcount, count) - * addr64_t src; // r3 and r4 - * vm_offset_t dst; // r5 - * vm_size_t maxcount; // r6 - * vm_size_t* count; // r7 - * - * Set *count to the number of bytes copied - * If dst == NULL, don't copy, just count bytes. - * Only currently called from klcopyinstr. - */ - -ENTRY(copyinstr, TAG_NO_FRAME_USED) - mfcr r2,0x10 // save caller's cr3, which we use for flags - cmplwi r5,0 // dst==NULL? 
- mr r10,r3 // move high word of 64-bit user address to r10 - li r0,0 - crset kkString // flag as a string op - mr r11,r4 // move low word of 64-bit user address to r11 - crmove kkNull,cr0_eq // remember if (dst==NULL) - stw r0,0(r7) // initialize #bytes moved - crset kkIn // flag as copyin (rather than copyout) - b copyJoin1 // skip over the "crclr kkNull" - - -//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> -/* - * int - * copyout(src, dst, count) - * vm_offset_t src; // r3 - * addr64_t dst; // r4 and r5 - * size_t count; // r6 - */ - - .align 5 - .globl EXT(copyout) - .globl EXT(copyoutmsg) - -LEXT(copyout) -LEXT(copyoutmsg) - -#if INSTRUMENT - mfspr r12,pmc1 ; INSTRUMENT - saveinstr[12] - Take stamp at copyout - stw r12,0x6100+(12*16)+0x0(0) ; INSTRUMENT - Save it - mfspr r12,pmc2 ; INSTRUMENT - Get stamp - stw r12,0x6100+(12*16)+0x4(0) ; INSTRUMENT - Save it - mfspr r12,pmc3 ; INSTRUMENT - Get stamp - stw r12,0x6100+(12*16)+0x8(0) ; INSTRUMENT - Save it - mfspr r12,pmc4 ; INSTRUMENT - Get stamp - stw r12,0x6100+(12*16)+0xC(0) ; INSTRUMENT - Save it -#endif - mfcr r2,0x10 // save caller's cr3, which we use for flags - mr r10,r4 // move high word of 64-bit user address to r10 - crclr kkString // not a string version - mr r11,r5 // move low word of 64-bit user address to r11 - crclr kkIn // flag as copyout - b copyJoin - - -//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> -/* - * int - * copyin(src, dst, count) - * addr64_t src; // r3 and r4 - * vm_offset_t dst; // r5 - * size_t count; // r6 - */ - - - .align 5 - .globl EXT(copyin) - .globl EXT(copyinmsg) - -LEXT(copyin) -LEXT(copyinmsg) - - mfcr r2,0x10 // save caller's cr3, which we use for flags - mr r10,r3 // move high word of 64-bit user address to r10 - crclr kkString // not a string version - mr r11,r4 // move low word of 64-bit user address to r11 - crset kkIn // flag as copyin - - -// Common code to handle setup for 
all the copy variants: -// r2 = caller's cr3 -// r3 = source if copyout -// r5 = dest if copyin -// r6 = buffer length or count -// r7 = count output ptr (if kkString set) -// r10 = high word of 64-bit user-space address (source if copyin, dest if copyout) -// r11 = low word of 64-bit user-space address -// cr3 = kkIn, kkString, kkNull flags - -copyJoin: - crclr kkNull // (dst==NULL) convention not used with this call -copyJoin1: // enter from copyinstr with kkNull set - mflr r0 // get return address - cmplwi r6,0 // buffer length 0? - lis r9,0x1000 // r9 <- 0x10000000 (256MB) - stw r0,FM_LR_SAVE(r1) // save return - cmplw cr1,r6,r9 // buffer length > 256MB ? - mfsprg r8,2 // get the features - beq-- copyinout_0 // 0 length is degenerate case - stwu r1,-kkFrameSize(r1) // set up stack frame - stw r2,kkCR3(r1) // save caller's cr3, which we use for flags - mtcrf 0x02,r8 // move pf64Bit to cr6 - stw r3,kkSource(r1) // save args across MapUserMemoryWindow - stw r5,kkDest(r1) - stw r6,kkBufSize(r1) - crmove kk64bit,pf64Bitb // remember if this is a 64-bit processor - stw r7,kkCountPtr(r1) - stw r31,kkR31Save(r1) // we use r31 globally for mapped user ptr - - - -// Handle buffer length > 256MB. This is an error (ENAMETOOLONG) on copyin and copyout. -// The string ops are passed -1 lengths by some BSD callers, so for them we silently clamp -// the buffer length to 256MB. This isn't an issue if the string is less than 256MB -// (as most are!), but if they are >256MB we eventually return ENAMETOOLONG. This restriction -// is due to MapUserMemoryWindow; we don't want to consume more than two segments for -// the mapping. - - ble++ cr1,copyin0 // skip if buffer length <= 256MB - bf kkString,copyinout_too_big // error if not string op - mr r6,r9 // silently clamp buffer length to 256MB - stw r9,kkBufSize(r1) // update saved copy too - - -// Set up thread_recover in case we hit an illegal address. 
- -copyin0: - li r31,0 // no mapped ptr yet - mfsprg r8,1 // Get the current thread - lis r2,hi16(copyinout_error) - ori r2,r2,lo16(copyinout_error) - lwz r4,THREAD_RECOVER(r8) - lwz r3,ACT_VMMAP(r8) // r3 <- vm_map virtual address - stw r2,THREAD_RECOVER(r8) - stw r4,kkThrErrJmp(r1) - - -// Map user segment into kernel map, turn on 64-bit mode. At this point: -// r3 = vm map -// r6 = buffer length -// r10/r11 = 64-bit user-space ptr (source if copyin, dest if copyout) -// -// When we call MapUserMemoryWindow, we pass: -// r3 = vm map ptr -// r4/r5 = 64-bit user space address as an addr64_t - - mr r4,r10 // copy user ptr into r4/r5 - mr r5,r11 -#if INSTRUMENT - mfspr r12,pmc1 ; INSTRUMENT - saveinstr[13] - Take stamp before mapuseraddressspace - stw r12,0x6100+(13*16)+0x0(0) ; INSTRUMENT - Save it - mfspr r12,pmc2 ; INSTRUMENT - Get stamp - stw r12,0x6100+(13*16)+0x4(0) ; INSTRUMENT - Save it - mfspr r12,pmc3 ; INSTRUMENT - Get stamp - stw r12,0x6100+(13*16)+0x8(0) ; INSTRUMENT - Save it - mfspr r12,pmc4 ; INSTRUMENT - Get stamp - stw r12,0x6100+(13*16)+0xC(0) ; INSTRUMENT - Save it -#endif - bl EXT(MapUserMemoryWindow) // get r3/r4 <- 64-bit address in kernel map of user operand -#if INSTRUMENT - mfspr r12,pmc1 ; INSTRUMENT - saveinstr[14] - Take stamp after mapuseraddressspace - stw r12,0x6100+(14*16)+0x0(0) ; INSTRUMENT - Save it - mfspr r12,pmc2 ; INSTRUMENT - Get stamp - stw r12,0x6100+(14*16)+0x4(0) ; INSTRUMENT - Save it - mfspr r12,pmc3 ; INSTRUMENT - Get stamp - stw r12,0x6100+(14*16)+0x8(0) ; INSTRUMENT - Save it - mfspr r12,pmc4 ; INSTRUMENT - Get stamp - stw r12,0x6100+(14*16)+0xC(0) ; INSTRUMENT - Save it -#endif - mr r31,r4 // r31 <- mapped ptr into user space (may be 64-bit) - bf-- kk64bit,copyin1 // skip if a 32-bit processor - - rldimi r31,r3,32,0 // slam high-order bits into mapped ptr - mfmsr r4 // if 64-bit, turn on SF so we can use returned ptr - li r0,1 - rldimi r4,r0,63,MSR_SF_BIT // light bit 0 - mtmsrd r4 // turn on 64-bit mode - isync // 
wait for mode to change - - -// Load r3-r5, substituting mapped ptr as appropriate. - -copyin1: - lwz r5,kkBufSize(r1) // restore length to copy - bf kkIn,copyin2 // skip if copyout - lwz r4,kkDest(r1) // copyin: dest is kernel ptr - mr r3,r31 // source is mapped ptr - b copyin3 -copyin2: // handle copyout - lwz r3,kkSource(r1) // source is kernel buffer (r3 at entry) - mr r4,r31 // dest is mapped ptr into user space - - -// Finally, all set up to copy: -// r3 = source ptr (mapped if copyin) -// r4 = dest ptr (mapped if copyout) -// r5 = length -// r31 = mapped ptr returned by MapUserMemoryWindow -// cr3 = kkIn, kkString, kk64bit, and kkNull flags - -copyin3: - bt kkString,copyString // handle copyinstr and copyoutstr - bl EXT(bcopy) // copyin and copyout: let bcopy do the work - li r3,0 // return success - - -// Main exit point for copyin, copyout, copyinstr, and copyoutstr. Also reached -// from error recovery if we get a DSI accessing user space. Clear recovery ptr, -// and pop off frame. -// r3 = 0, EFAULT, or ENAMETOOLONG - -copyinx: - lwz r2,kkCR3(r1) // get callers cr3 - mfsprg r6,1 // Get the current thread - bf-- kk64bit,copyinx1 // skip if 32-bit processor - mfmsr r12 - rldicl r12,r12,0,MSR_SF_BIT+1 // if 64-bit processor, turn 64-bit mode off - mtmsrd r12 // turn SF off - isync // wait for the mode to change -copyinx1: - lwz r0,FM_LR_SAVE+kkFrameSize(r1) // get return address - lwz r31,kkR31Save(r1) // restore callers r31 - lwz r4,kkThrErrJmp(r1) // load saved thread recover - addi r1,r1,kkFrameSize // pop off our stack frame - mtlr r0 - stw r4,THREAD_RECOVER(r6) // restore thread recover - mtcrf 0x10,r2 // restore cr3 - blr - - -/* We get here via the exception handler if an illegal - * user memory reference was made. This error handler is used by - * copyin, copyout, copyinstr, and copyoutstr. Registers are as - * they were at point of fault, so for example cr3 flags are valid. 
- */ - -copyinout_error: - li r3,EFAULT // return error - b copyinx - -copyinout_0: // degenerate case: 0-length copy - mtcrf 0x10,r2 // restore cr3 - li r3,0 // return success - blr - -copyinout_too_big: // degenerate case - mtcrf 0x10,r2 // restore cr3 - lwz r1,0(r1) // pop off stack frame - li r3,ENAMETOOLONG - blr - - -//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> -// Handle copyinstr and copyoutstr. At this point the stack frame is set up, -// the recovery ptr is set, the user's buffer is mapped, we're in 64-bit mode -// if necessary, and: -// r3 = source ptr, mapped if copyinstr -// r4 = dest ptr, mapped if copyoutstr -// r5 = buffer length -// r31 = mapped ptr returned by MapUserMemoryWindow -// cr3 = kkIn, kkString, kkNull, and kk64bit flags -// We do word copies unless the buffer is very short, then use a byte copy loop -// for the leftovers if necessary. The crossover at which the word loop becomes -// faster is about seven bytes, counting the zero. -// -// We first must word-align the source ptr, in order to avoid taking a spurious -// page fault. - -copyString: - cmplwi cr1,r5,15 // is buffer very short? 
- mr r12,r3 // remember ptr to 1st source byte - mtctr r5 // assuming short, set up loop count for bytes - blt-- cr1,copyinstr8 // too short for word loop - rlwinm r2,r3,0,0x3 // get byte offset of 1st byte within word - rlwinm r9,r3,3,0x18 // get bit offset of 1st byte within word - li r7,-1 - sub r3,r3,r2 // word-align source address - add r6,r5,r2 // get length starting at byte 0 in word - srw r7,r7,r9 // get mask for bytes in first word - srwi r0,r6,2 // get #words in buffer - lwz r5,0(r3) // get aligned word with first source byte - lis r10,hi16(0xFEFEFEFF) // load magic constants into r10 and r11 - lis r11,hi16(0x80808080) - mtctr r0 // set up word loop count - addi r3,r3,4 // advance past the source word - ori r10,r10,lo16(0xFEFEFEFF) - ori r11,r11,lo16(0x80808080) - orc r8,r5,r7 // map bytes preceeding first source byte into 0xFF - bt-- kkNull,copyinstr5enter // enter loop that just counts - -// Special case 1st word, which has been 0xFF filled on left. Note that we use -// "and.", even though we execute both in 32 and 64-bit mode. This is OK. - - slw r5,r5,r9 // left justify payload bytes - add r9,r10,r8 // r9 = data + 0xFEFEFEFF - andc r7,r11,r8 // r7 = ~data & 0x80808080 - subfic r0,r2,4 // get r0 <- #payload bytes in 1st word - and. r7,r9,r7 // if r7==0, then all bytes in r8 are nonzero - stw r5,0(r4) // copy payload bytes to dest buffer - add r4,r4,r0 // then point to next byte in dest buffer - bdnzt cr0_eq,copyinstr6 // use loop that copies if 0 not found - - b copyinstr7 // 0 found (buffer can't be full) - - -// Word loop(s). They do a word-parallel search for 0s, using the following -// inobvious but very efficient test: -// y = data + 0xFEFEFEFF -// z = ~data & 0x80808080 -// If (y & z)==0, then all bytes in dataword are nonzero. There are two copies -// of this loop, one that just counts and another that copies. 
-// r3 = ptr to next word of source (word aligned) -// r4 = ptr to next byte in buffer -// r6 = original buffer length (adjusted to be word origin) -// r10 = 0xFEFEFEFE -// r11 = 0x80808080 -// r12 = ptr to 1st source byte (used to determine string length) - - .align 5 // align inner loops for speed -copyinstr5: // version that counts but does not copy - lwz r8,0(r3) // get next word of source - addi r3,r3,4 // advance past it -copyinstr5enter: - add r9,r10,r8 // r9 = data + 0xFEFEFEFF - andc r7,r11,r8 // r7 = ~data & 0x80808080 - and. r7,r9,r7 // r7 = r9 & r7 ("." ok even in 64-bit mode) - bdnzt cr0_eq,copyinstr5 // if r7==0, then all bytes in r8 are nonzero - - b copyinstr7 - - .align 5 // align inner loops for speed -copyinstr6: // version that counts and copies - lwz r8,0(r3) // get next word of source - addi r3,r3,4 // advance past it - addi r4,r4,4 // increment dest ptr while we wait for data - add r9,r10,r8 // r9 = data + 0xFEFEFEFF - andc r7,r11,r8 // r7 = ~data & 0x80808080 - and. r7,r9,r7 // r7 = r9 & r7 ("." ok even in 64-bit mode) - stw r8,-4(r4) // pack all 4 bytes into buffer - bdnzt cr0_eq,copyinstr6 // if r7==0, then all bytes are nonzero - - -// Either 0 found or buffer filled. The above algorithm has mapped nonzero bytes to 0 -// and 0 bytes to 0x80 with one exception: 0x01 bytes preceeding the first 0 are also -// mapped to 0x80. We must mask out these false hits before searching for an 0x80 byte. 
-// r3 = word aligned ptr to next word of source (ie, r8==mem(r3-4)) -// r6 = original buffer length (adjusted to be word origin) -// r7 = computed vector of 0x00 and 0x80 bytes -// r8 = original source word, coming from -4(r3), possibly padded with 0xFFs on left if 1st word -// r12 = ptr to 1st source byte (used to determine string length) -// cr0 = beq set iff 0 not found - -copyinstr7: - rlwinm r2,r8,7,0,31 // move 0x01 bits to 0x80 position - rlwinm r6,r6,0,0x3 // mask down to partial byte count in last word - andc r7,r7,r2 // turn off false hits from 0x0100 worst case - crnot kkZero,cr0_eq // 0 found iff cr0_eq is off - srwi r7,r7,8 // we want to count the 0 as a byte xferred - cmpwi r6,0 // any bytes left over in last word? - cntlzw r7,r7 // now we can find the 0 byte (ie, the 0x80) - subi r3,r3,4 // back up r3 to point to 1st byte in r8 - srwi r7,r7,3 // convert 8,16,24,32 to 1,2,3,4 - add r3,r3,r7 // now r3 points one past 0 byte, or at 1st byte not xferred - bt++ kkZero,copyinstr10 // 0 found, so done - - beq copyinstr10 // r6==0, so buffer truly full - mtctr r6 // 0 not found, loop over r6 bytes - b copyinstr8 // enter byte loop for last 1-3 leftover bytes - - -// Byte loop. This is used for very small buffers and for the odd bytes left over -// after searching and copying words at a time. -// r3 = ptr to next byte of source -// r4 = ptr to next dest byte -// r12 = ptr to first byte of source -// ctr = count of bytes to check - - .align 5 // align inner loops for speed -copyinstr8: // loop over bytes of source - lbz r0,0(r3) // get next byte of source - addi r3,r3,1 - addi r4,r4,1 // increment dest addr whether we store or not - cmpwi r0,0 // the 0? - bt-- kkNull,copyinstr9 // don't store if copyinstr with NULL ptr - stb r0,-1(r4) -copyinstr9: - bdnzf cr0_eq,copyinstr8 // loop if byte not 0 and more room in buffer - - crmove kkZero,cr0_eq // remember if 0 found or buffer filled - - -// Buffer filled or 0 found. Unwind and return. 
-// r3 = ptr to 1st source byte not transferred -// r12 = ptr to 1st source byte -// r31 = mapped ptr returned by MapUserMemoryWindow -// cr3 = kkZero set iff 0 found - -copyinstr10: - lwz r9,kkCountPtr(r1) // get ptr to place to store count of bytes moved - sub r2,r3,r12 // compute #bytes copied (including the 0) - li r3,0 // assume success return status - stw r2,0(r9) // store #bytes moved - bt++ kkZero,copyinx // we did find the 0 so return 0 - li r3,ENAMETOOLONG // buffer filled - b copyinx // join main exit routine - -//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> -/* - * int - * copypv(source, sink, size, which) - * addr64_t src; // r3 and r4 - * addr64_t dst; // r5 and r6 - * size_t size; // r7 - * int which; // r8 - * - * Operand size bytes are copied from operand src into operand dst. The source and - * destination operand addresses are given as addr64_t, and may designate starting - * locations in physical or virtual memory in any combination except where both are - * virtual. Virtual memory locations may be in either the kernel or the current thread's - * address space. Operand size may be up to 256MB. - * - * Operation is controlled by operand which, which offers these options: - * cppvPsrc : source operand is (1) physical or (0) virtual - * cppvPsnk : destination operand is (1) physical or (0) virtual - * cppvKmap : virtual operand is in (1) kernel or (0) current thread - * cppvFsnk : (1) flush destination before and after transfer - * cppvFsrc : (1) flush source before and after transfer - * cppvNoModSnk : (1) don't set source operand's changed bit(s) - * cppvNoRefSrc : (1) don't set destination operand's referenced bit(s) - * - * Implementation is now split into this new 64-bit path and the old path, hw_copypv_32(). - * This section describes the operation of the new 64-bit path. 
- * - * The 64-bit path utilizes the more capacious 64-bit kernel address space to create a - * window in the kernel address space into all of physical RAM plus the I/O hole. Since - * the window's mappings specify the proper access policies for the underlying memory, - * the new path does not have to flush caches to avoid a cache paradox, so cppvFsnk - * and cppvFsrc are ignored. Physical operand adresses are relocated into the physical - * memory window, and are accessed with data relocation on. Virtual addresses are either - * within the kernel, or are mapped into the kernel address space through the user memory - * window. Because accesses to a virtual operand are performed with data relocation on, - * the new path does not have to translate the address, disable/enable interrupts, lock - * the mapping, or update referenced and changed bits. - * - * The IBM 970 (a.k.a. G5) processor treats real-mode accesses as guarded, so there is - * a substantial performance penalty for copypv operating in real mode. Utilizing the - * new 64-bit path, transfer performance increases >100% on the G5. - * - * The attentive reader may notice that mtmsrd ops are not followed by isync ops as - * might be expected. The 970 follows PowerPC architecture version 2.01, which defines - * mtmsrd with L=0 as a context synchronizing op, so a following isync is no longer - * required. - * - * To keep things exciting, we develop 64-bit values in non-volatiles, but we also need - * to call 32-bit functions, which would lead to the high-order 32 bits of our values - * getting clobbered unless we do something special. So, we preserve our 64-bit non-volatiles - * in our own stack frame across calls to 32-bit functions. - * - */ - -// Map operand which bits into non-volatile CR2 and CR3 bits. 
-#define whichAlign ((3+1)*4) -#define whichMask 0x007F0000 -#define pvPsnk (cppvPsnkb - whichAlign) -#define pvPsrc (cppvPsrcb - whichAlign) -#define pvFsnk (cppvFsnkb - whichAlign) -#define pvFsrc (cppvFsrcb - whichAlign) -#define pvNoModSnk (cppvNoModSnkb - whichAlign) -#define pvNoRefSrc (cppvNoRefSrcb - whichAlign) -#define pvKmap (cppvKmapb - whichAlign) -#define pvNoCache cr2_lt - - .align 5 - .globl EXT(copypv) - -LEXT(copypv) - mfsprg r10,2 // get feature flags - mtcrf 0x02,r10 // we need to test pf64Bit - bt++ pf64Bitb,copypv_64 // skip if 64-bit processor (only they take hint) - - b EXT(hw_copypv_32) // carry on with 32-bit copypv - -// Push a 32-bit ABI-compliant stack frame and preserve all non-volatiles that we'll clobber. -copypv_64: - mfsprg r9,1 // get current thread - stwu r1,-(FM_ALIGN((31-26+11)*4)+FM_SIZE)(r1) - // allocate stack frame and link it - mflr r0 // get return address - mfcr r10 // get cr2 and cr3 - lwz r12,THREAD_RECOVER(r9) // get error callback - stw r26,FM_ARG0+0x00(r1) // save non-volatile r26 - stw r27,FM_ARG0+0x04(r1) // save non-volatile r27 - stw r28,FM_ARG0+0x08(r1) // save non-volatile r28 - stw r29,FM_ARG0+0x0C(r1) // save non-volatile r29 - stw r30,FM_ARG0+0x10(r1) // save non-volatile r30 - stw r31,FM_ARG0+0x14(r1) // save non-volatile r31 - stw r12,FM_ARG0+0x20(r1) // save error callback - stw r0,(FM_ALIGN((31-26+11)*4)+FM_SIZE+FM_LR_SAVE)(r1) - // save return address - stw r10,(FM_ALIGN((31-26+11)*4)+FM_SIZE+FM_CR_SAVE)(r1) - // save non-volatile cr2 and cr3 - -// Non-volatile register usage in this routine is: -// r26: saved msr image -// r27: current pmap_t / virtual source address -// r28: destination virtual address -// r29: source address -// r30: destination address -// r31: byte count to copy -// cr2/3: parameter 'which' bits - - rlwinm r8,r8,whichAlign,whichMask // align and mask which bits - mr r31,r7 // copy size to somewhere non-volatile - mtcrf 0x20,r8 // insert which bits into cr2 and cr3 - mtcrf 0x10,r8 
// insert which bits into cr2 and cr3 - rlwinm r29,r3,0,1,0 // form source address high-order bits - rlwinm r30,r5,0,1,0 // form destination address high-order bits - rlwimi r29,r4,0,0,31 // form source address low-order bits - rlwimi r30,r6,0,0,31 // form destination address low-order bits - crand cr7_lt,pvPsnk,pvPsrc // are both operand addresses physical? - cntlzw r0,r31 // count leading zeroes in byte count - cror cr7_eq,pvPsnk,pvPsrc // cr7_eq <- source or destination is physical - bf-- cr7_eq,copypv_einval // both operands may not be virtual - cmplwi r0,4 // byte count greater than or equal 256M (2**28)? - blt-- copypv_einval // byte count too big, give EINVAL - cmplwi r31,0 // byte count zero? - beq-- copypv_zero // early out - bt cr7_lt,copypv_phys // both operand addresses are physical - mr r28,r30 // assume destination is virtual - bf pvPsnk,copypv_dv // is destination virtual? - mr r28,r29 // no, so source must be virtual -copypv_dv: - lis r27,ha16(EXT(kernel_pmap)) // get kernel's pmap_t *, high-order - lwz r27,lo16(EXT(kernel_pmap))(r27) // get kernel's pmap_t - bt pvKmap,copypv_kern // virtual address in kernel map? - lwz r3,ACT_VMMAP(r9) // get user's vm_map * - rldicl r4,r28,32,32 // r4, r5 <- addr64_t virtual address - rldicl r5,r28,0,32 - std r29,FM_ARG0+0x30(r1) // preserve 64-bit r29 across 32-bit call - std r30,FM_ARG0+0x38(r1) // preserve 64-bit r30 across 32-bit call - bl EXT(MapUserMemoryWindow) // map slice of user space into kernel space - ld r29,FM_ARG0+0x30(r1) // restore 64-bit r29 - ld r30,FM_ARG0+0x38(r1) // restore 64-bit r30 - rlwinm r28,r3,0,1,0 // convert relocated addr64_t virtual address - rlwimi r28,r4,0,0,31 // into a single 64-bit scalar -copypv_kern: - -// Since we'll be accessing the virtual operand with data-relocation on, we won't need to -// update the referenced and changed bits manually after the copy. So, force the appropriate -// flag bit on for the virtual operand. 
- crorc pvNoModSnk,pvNoModSnk,pvPsnk // for virtual dest, let hardware do ref/chg bits - crorc pvNoRefSrc,pvNoRefSrc,pvPsrc // for virtual source, let hardware do ref bit - -// We'll be finding a mapping and looking at, so we need to disable 'rupts. - lis r0,hi16(MASK(MSR_VEC)) // get vector mask - ori r0,r0,lo16(MASK(MSR_FP)) // insert fp mask - mfmsr r26 // save current msr - andc r26,r26,r0 // turn off VEC and FP in saved copy - ori r0,r0,lo16(MASK(MSR_EE)) // add EE to our mask - andc r0,r26,r0 // disable EE in our new msr image - mtmsrd r0 // introduce new msr image - -// We're now holding the virtual operand's pmap_t in r27 and its virtual address in r28. We now -// try to find a mapping corresponding to this address in order to determine whether the address -// is cacheable. If we don't find a mapping, we can safely assume that the operand is cacheable -// (a non-cacheable operand must be a block mapping, which will always exist); otherwise, we -// examine the mapping's caching-inhibited bit. - mr r3,r27 // r3 <- pmap_t pmap - rldicl r4,r28,32,32 // r4, r5 <- addr64_t va - rldicl r5,r28,0,32 - la r6,FM_ARG0+0x18(r1) // r6 <- addr64_t *nextva - li r7,1 // r7 <- int full, search nested mappings - std r26,FM_ARG0+0x28(r1) // preserve 64-bit r26 across 32-bit calls - std r28,FM_ARG0+0x30(r1) // preserve 64-bit r28 across 32-bit calls - std r29,FM_ARG0+0x38(r1) // preserve 64-bit r29 across 32-bit calls - std r30,FM_ARG0+0x40(r1) // preserve 64-bit r30 across 32-bit calls - bl EXT(mapping_find) // find mapping for virtual operand - mr. r3,r3 // did we find it? - beq copypv_nomapping // nope, so we'll assume it's cacheable - lwz r4,mpVAddr+4(r3) // get low half of virtual addr for hw flags - rlwinm. r4,r4,0,mpIb-32,mpIb-32 // caching-inhibited bit set? 
- crnot pvNoCache,cr0_eq // if it is, use bcopy_nc - bl EXT(mapping_drop_busy) // drop busy on the mapping -copypv_nomapping: - ld r26,FM_ARG0+0x28(r1) // restore 64-bit r26 - ld r28,FM_ARG0+0x30(r1) // restore 64-bit r28 - ld r29,FM_ARG0+0x38(r1) // restore 64-bit r29 - ld r30,FM_ARG0+0x40(r1) // restore 64-bit r30 - mtmsrd r26 // restore msr to it's previous state - -// Set both the source and destination virtual addresses to the virtual operand's address -- -// we'll overlay one of them with the physical operand's address. - mr r27,r28 // make virtual operand BOTH source AND destination - -// Now we're ready to relocate the physical operand address(es) into the physical memory window. -// Recall that we've mapped physical memory (including the I/O hole) into the kernel's address -// space somewhere at or over the 2**32 line. If one or both of the operands are in the I/O hole, -// we'll set the pvNoCache flag, forcing use of non-caching bcopy_nc() to do the copy. -copypv_phys: - ld r6,lgPMWvaddr(0) // get physical memory window virtual address - bf pvPsnk,copypv_dstvirt // is destination address virtual? - cntlzd r4,r30 // count leading zeros in destination address - cmplwi r4,32 // if it's 32, then it's in the I/O hole (2**30 to 2**31-1) - cror pvNoCache,cr0_eq,pvNoCache // use bcopy_nc for I/O hole locations - add r28,r30,r6 // relocate physical destination into physical window -copypv_dstvirt: - bf pvPsrc,copypv_srcvirt // is source address virtual? - cntlzd r4,r29 // count leading zeros in source address - cmplwi r4,32 // if it's 32, then it's in the I/O hole (2**30 to 2**31-1) - cror pvNoCache,cr0_eq,pvNoCache // use bcopy_nc for I/O hole locations - add r27,r29,r6 // relocate physical source into physical window -copypv_srcvirt: - -// Once the copy is under way (bcopy or bcopy_nc), we will want to get control if anything -// funny happens during the copy. So, we set a pointer to our error handler in the per-thread -// control block. 
- mfsprg r8,1 // get current threads stuff - lis r3,hi16(copypv_error) // get our error callback's address, high - ori r3,r3,lo16(copypv_error) // get our error callback's address, low - stw r3,THREAD_RECOVER(r8) // set our error callback - -// Since our physical operand(s) are relocated at or above the 2**32 line, we must enter -// 64-bit mode. - li r0,1 // get a handy one bit - mfmsr r3 // get current msr - rldimi r3,r0,63,MSR_SF_BIT // set SF bit on in our msr copy - mtmsrd r3 // enter 64-bit mode - -// If requested, flush data cache -// Note that we don't flush, the code is being saved "just in case". -#if 0 - bf pvFsrc,copypv_nfs // do we flush the source? - rldicl r3,r27,32,32 // r3, r4 <- addr64_t source virtual address - rldicl r4,r27,0,32 - mr r5,r31 // r5 <- count (in bytes) - li r6,0 // r6 <- boolean phys (false, not physical) - bl EXT(flush_dcache) // flush the source operand -copypv_nfs: - bf pvFsnk,copypv_nfdx // do we flush the destination? - rldicl r3,r28,32,32 // r3, r4 <- addr64_t destination virtual address - rldicl r4,r28,0,32 - mr r5,r31 // r5 <- count (in bytes) - li r6,0 // r6 <- boolean phys (false, not physical) - bl EXT(flush_dcache) // flush the destination operand -copypv_nfdx: -#endif - -// Call bcopy or bcopy_nc to perform the copy. - mr r3,r27 // r3 <- source virtual address - mr r4,r28 // r4 <- destination virtual address - mr r5,r31 // r5 <- bytes to copy - bt pvNoCache,copypv_nc // take non-caching route - bl EXT(bcopy) // call bcopy to do the copying - b copypv_copydone -copypv_nc: - bl EXT(bcopy_nc) // call bcopy_nc to do the copying -copypv_copydone: - -// If requested, flush data cache -// Note that we don't flush, the code is being saved "just in case". -#if 0 - bf pvFsrc,copypv_nfsx // do we flush the source? 
- rldicl r3,r27,32,32 // r3, r4 <- addr64_t source virtual address - rldicl r4,r27,0,32 - mr r5,r31 // r5 <- count (in bytes) - li r6,0 // r6 <- boolean phys (false, not physical) - bl EXT(flush_dcache) // flush the source operand -copypv_nfsx: - bf pvFsnk,copypv_nfd // do we flush the destination? - rldicl r3,r28,32,32 // r3, r4 <- addr64_t destination virtual address - rldicl r4,r28,0,32 - mr r5,r31 // r5 <- count (in bytes) - li r6,0 // r6 <- boolean phys (false, not physical) - bl EXT(flush_dcache) // flush the destination operand -copypv_nfd: -#endif - -// Leave 64-bit mode. - mfmsr r3 // get current msr - rldicl r3,r3,0,MSR_SF_BIT+1 // clear SF bit in our copy - mtmsrd r3 // leave 64-bit mode - -// If requested, set ref/chg on source/dest physical operand(s). It is possible that copy is -// from/to a RAM disk situated outside of mapped physical RAM, so we check each page by calling -// mapping_phys_lookup() before we try to set its ref/chg bits; otherwise, we might panic. -// Note that this code is page-size sensitive, so it should probably be a part of our low-level -// code in hw_vm.s. - bt pvNoModSnk,copypv_nomod // skip destination update if not requested - std r29,FM_ARG0+0x30(r1) // preserve 64-bit r29 across 32-bit calls - li r26,1 // r26 <- 4K-page count - mr r27,r31 // r27 <- byte count - rlwinm r3,r30,0,20,31 // does destination cross a page boundary? - subfic r3,r3,4096 // - cmplw r3,r27 // - blt copypv_modnox // skip if not crossing case - subf r27,r3,r27 // r27 <- byte count less initial fragment - addi r26,r26,1 // increment page count -copypv_modnox: - srdi r3,r27,12 // pages to update (not including crosser) - add r26,r26,r3 // add in crosser - srdi r27,r30,12 // r27 <- destination page number -copypv_modloop: - mr r3,r27 // r3 <- destination page number - la r4,FM_ARG0+0x18(r1) // r4 <- unsigned int *pindex - bl EXT(mapping_phys_lookup) // see if page is really there - mr. r3,r3 // is it? 
- beq-- copypv_modend // nope, break out of modify loop - mr r3,r27 // r3 <- destination page number - bl EXT(mapping_set_mod) // set page changed status - subi r26,r26,1 // decrement page count - cmpwi r26,0 // done yet? - bgt copypv_modloop // nope, iterate -copypv_modend: - ld r29,FM_ARG0+0x30(r1) // restore 64-bit r29 -copypv_nomod: - bt pvNoRefSrc,copypv_done // skip source update if not requested -copypv_debugref: - li r26,1 // r26 <- 4K-page count - mr r27,r31 // r27 <- byte count - rlwinm r3,r29,0,20,31 // does source cross a page boundary? - subfic r3,r3,4096 // - cmplw r3,r27 // - blt copypv_refnox // skip if not crossing case - subf r27,r3,r27 // r27 <- byte count less initial fragment - addi r26,r26,1 // increment page count -copypv_refnox: - srdi r3,r27,12 // pages to update (not including crosser) - add r26,r26,r3 // add in crosser - srdi r27,r29,12 // r27 <- source page number -copypv_refloop: - mr r3,r27 // r3 <- source page number - la r4,FM_ARG0+0x18(r1) // r4 <- unsigned int *pindex - bl EXT(mapping_phys_lookup) // see if page is really there - mr. r3,r3 // is it? - beq-- copypv_done // nope, break out of modify loop - mr r3,r27 // r3 <- source page number - bl EXT(mapping_set_ref) // set page referenced status - subi r26,r26,1 // decrement page count - cmpwi r26,0 // done yet? - bgt copypv_refloop // nope, iterate - -// Return, indicating success. -copypv_done: -copypv_zero: - li r3,0 // our efforts were crowned with success - -// Pop frame, restore caller's non-volatiles, clear recovery routine pointer. 
-copypv_return: - mfsprg r9,1 // get current threads stuff - lwz r0,(FM_ALIGN((31-26+11)*4)+FM_SIZE+FM_LR_SAVE)(r1) - // get return address - lwz r4,(FM_ALIGN((31-26+11)*4)+FM_SIZE+FM_CR_SAVE)(r1) - // get non-volatile cr2 and cr3 - lwz r26,FM_ARG0+0x00(r1) // restore non-volatile r26 - lwz r27,FM_ARG0+0x04(r1) // restore non-volatile r27 - mtlr r0 // restore return address - lwz r28,FM_ARG0+0x08(r1) // restore non-volatile r28 - mtcrf 0x20,r4 // restore non-volatile cr2 - mtcrf 0x10,r4 // restore non-volatile cr3 - lwz r11,FM_ARG0+0x20(r1) // save error callback - lwz r29,FM_ARG0+0x0C(r1) // restore non-volatile r29 - lwz r30,FM_ARG0+0x10(r1) // restore non-volatile r30 - lwz r31,FM_ARG0+0x14(r1) // restore non-volatile r31 - stw r11,THREAD_RECOVER(r9) // restore our error callback - lwz r1,0(r1) // release stack frame - - blr // y'all come back now - -// Invalid argument handler. -copypv_einval: - li r3,EINVAL // invalid argument - b copypv_return // return - -// Error encountered during bcopy or bcopy_nc. -copypv_error: - mfmsr r3 // get current msr - rldicl r3,r3,0,MSR_SF_BIT+1 // clear SF bit in our copy - mtmsrd r3 // leave 64-bit mode - li r3,EFAULT // it was all his fault - b copypv_return // return diff --git a/osfmk/ppc/new_screen.h b/osfmk/ppc/new_screen.h deleted file mode 100644 index ba84184ef..000000000 --- a/osfmk/ppc/new_screen.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -#ifndef _NEW_SCREEN_H_ -#define _NEW_SCREEN_H_ - -#include - -/* AV and HPV cards */ -#define AV_BUFFER_START 0xE0000000 -#define AV_BUFFER_END 0xE0500000 -#define HPV_BUFFER_START 0xFE000000 -#define HPV_BUFFER_END 0xFF000000 - -extern void clear_RGB16(int color); -extern void adj_position(unsigned char C); -extern void put_cursor(int color); -extern void screen_put_char(unsigned char C); - -#endif /* _NEW_SCREEN_H_ */ diff --git a/osfmk/ppc/pcb.c b/osfmk/ppc/pcb.c deleted file mode 100644 index a38687b14..000000000 --- a/osfmk/ppc/pcb.c +++ /dev/null @@ -1,672 +0,0 @@ -/* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Copyright (c) 1990,1991,1992 The University of Utah and - * the Center for Software Science (CSS). All rights reserved. - * - * Permission to use, copy, modify and distribute this software is hereby - * granted provided that (1) source code retains these copyright, permission, - * and disclaimer notices, and (2) redistributions including binaries - * reproduce the notices in supporting documentation, and (3) all advertising - * materials mentioning features or use of this software display the following - * acknowledgement: ``This product includes software developed by the Center - * for Software Science at the University of Utah.'' - * - * THE UNIVERSITY OF UTAH AND CSS ALLOW FREE USE OF THIS SOFTWARE IN ITS "AS - * IS" CONDITION. THE UNIVERSITY OF UTAH AND CSS DISCLAIM ANY LIABILITY OF - * ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
- * - * CSS requests users of this software to return to css-dist@cs.utah.edu any - * improvements that they make and grant CSS redistribution rights. - * - * Utah $Hdr: pcb.c 1.23 92/06/27$ - */ - -#include - -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -void machine_act_terminate(thread_t); - -/* - * These constants are dumb. They should not be in asm.h! - */ - -#define KF_SIZE (FM_SIZE+ARG_SIZE+FM_REDZONE) - -#if DEBUG -int fpu_trap_count = 0; -int fpu_switch_count = 0; -int vec_trap_count = 0; -int vec_switch_count = 0; -#endif - -/* - * consider_machine_collect: try to collect machine-dependent pages - */ -void -consider_machine_collect(void) -{ - /* - * XXX none currently available - */ -} - -void -consider_machine_adjust(void) -{ - consider_mapping_adjust(); -} - -/* - * switch_context: Switch from one thread to another, needed for - * switching of space - * - */ -thread_t -machine_switch_context( - thread_t old, - thread_continue_t continuation, - thread_t new) -{ - register thread_t retval; - pmap_t new_pmap; - facility_context *fowner; - struct per_proc_info *ppinfo; - - if (old == new) - panic("machine_switch_context"); - - ppinfo = getPerProc(); /* Get our processor block */ - - ppinfo->old_thread = (unsigned int)old; - - /* Our context might wake up on another processor, so we must - * not keep hot state in our FPU, it must go back to the pcb - * so that it can be found by the other if needed - */ - if(real_ncpus > 1) { /* This is potentially slow, so only do when actually SMP */ - fowner = ppinfo->FPU_owner; /* Cache this because it may change */ - if(fowner) { /* Is there any live context? */ - if(fowner->facAct == old) { /* Is it for us? 
*/ - fpu_save(fowner); /* Yes, save it */ - } - } - fowner = ppinfo->VMX_owner; /* Cache this because it may change */ - if(fowner) { /* Is there any live context? */ - if(fowner->facAct == old) { /* Is it for us? */ - vec_save(fowner); /* Yes, save it */ - } - } - } - - /* - * If old thread is running VM, save per proc userProtKey and FamVMmode spcFlags bits in the thread spcFlags - * This bits can be modified in the per proc without updating the thread spcFlags - */ - if(old->machine.specFlags & runningVM) { - old->machine.specFlags &= ~(userProtKey|FamVMmode); - old->machine.specFlags |= (ppinfo->spcFlags) & (userProtKey|FamVMmode); - } - old->machine.specFlags &= ~OnProc; - new->machine.specFlags |= OnProc; - - /* - * We do not have to worry about the PMAP module, so switch. - * - * We must not use thread->map since this may not be the actual - * task map, but the map being used for a klcopyin/out. - */ - - if(new->machine.specFlags & runningVM) { /* Is the new guy running a VM? */ - pmap_switch(new->machine.vmmCEntry->vmmPmap); /* Switch to the VM's pmap */ - ppinfo->VMMareaPhys = new->machine.vmmCEntry->vmmContextPhys; - ppinfo->VMMXAFlgs = new->machine.vmmCEntry->vmmXAFlgs; - ppinfo->FAMintercept = new->machine.vmmCEntry->vmmFAMintercept; - } - else { /* otherwise, we use the task's pmap */ - new_pmap = new->task->map->pmap; - if ((old->task->map->pmap != new_pmap) || (old->machine.specFlags & runningVM)) { - pmap_switch(new_pmap); /* Switch if there is a change */ - } - } - - if(old->machine.umwSpace != invalSpace) { /* Does our old guy have an active window? 
*/ - old->machine.umwSpace |= umwSwitchAway; /* Show we switched away from this guy */ - hw_blow_seg(lowGlo.lgUMWvaddr); /* Blow off the first segment */ - hw_blow_seg(lowGlo.lgUMWvaddr + 0x10000000ULL); /* Blow off the second segment */ - } - - retval = Switch_context(old, continuation, new); - assert(retval != NULL); - - /* We've returned from having switched context, so we should be - * back in the original context. - */ - - return retval; -} - -/* - * Initialize the machine-dependent state for a new thread. - */ -kern_return_t -machine_thread_create( - thread_t thread, - task_t task) -{ - struct savearea *sv; /* Pointer to newly allocated savearea */ - - (void)hw_atomic_add(&saveanchor.savetarget, 4); /* Account for the number of saveareas we think we "need" - for this activation */ - assert(thread->machine.pcb == (struct savearea *)0); /* Make sure there was no previous savearea */ - - sv = save_alloc(); /* Go get us a savearea */ - - bzero((char *)((unsigned int)sv + sizeof(savearea_comm)), (sizeof(struct savearea) - sizeof(savearea_comm))); /* Clear it */ - - sv->save_hdr.save_prev = 0; /* Clear the back pointer */ - sv->save_hdr.save_flags = (sv->save_hdr.save_flags & ~SAVtype) | (SAVgeneral << SAVtypeshft); /* Mark as in use */ - sv->save_hdr.save_act = thread; /* Set who owns it */ - thread->machine.pcb = sv; /* Point to the save area */ - thread->machine.curctx = &thread->machine.facctx; /* Initialize facility context */ - thread->machine.facctx.facAct = thread; /* Initialize facility context pointer to activation */ - thread->machine.umwSpace = invalSpace; /* Initialize user memory window space to invalid */ - thread->machine.preemption_count = 0; /* Initialize preemption counter */ - - /* - * User threads will pull their context from the pcb when first - * returning to user mode, so fill in all the necessary values. - * Kernel threads are initialized from the save state structure - * at the base of the kernel stack (see stack_attach()). 
- */ - - thread->machine.upcb = sv; /* Set user pcb */ - sv->save_srr1 = (uint64_t)MSR_EXPORT_MASK_SET; /* Set the default user MSR */ - if(task_has_64BitAddr(task)) sv->save_srr1 |= (uint64_t)MASK32(MSR_SF) << 32; /* If 64-bit task, force 64-bit mode */ - sv->save_fpscr = 0; /* Clear all floating point exceptions */ - sv->save_vrsave = 0; /* Set the vector save state */ - sv->save_vscr[0] = 0x00000000; - sv->save_vscr[1] = 0x00000000; - sv->save_vscr[2] = 0x00000000; - sv->save_vscr[3] = 0x00010000; /* Disable java mode and clear saturated */ - - return(KERN_SUCCESS); -} - -/* - * Machine-dependent cleanup prior to destroying a thread - */ -void -machine_thread_destroy( - thread_t thread) -{ - struct savearea *local_pcb, *ppsv; - savearea_vec *vsv, *vpsv; - savearea_fpu *fsv, *fpsv; - boolean_t intr; - -/* - * This function will release all context. - */ - - machine_act_terminate(thread); /* Make sure all virtual machines are dead first */ - -/* - * - * Walk through and release all floating point and vector contexts. Also kill live context. - * - */ - - intr = ml_set_interrupts_enabled(FALSE); /* Disable for interruptions */ - - toss_live_vec(thread->machine.curctx); /* Dump live vectors */ - - vsv = thread->machine.curctx->VMXsave; /* Get the top vector savearea */ - - while(vsv) { /* Any VMX saved state? */ - vpsv = vsv; /* Remember so we can toss this */ - /* XXX save_prev should be a void * 4425537 */ - vsv = CAST_DOWN(savearea_vec *, vsv->save_hdr.save_prev); /* Get one underneath our's */ - save_release((struct savearea *)vpsv); /* Release it */ - } - - thread->machine.curctx->VMXsave = NULL; /* Kill chain */ - - toss_live_fpu(thread->machine.curctx); /* Dump live float */ - - fsv = thread->machine.curctx->FPUsave; /* Get the top float savearea */ - - while(fsv) { /* Any float saved state? 
*/ - fpsv = fsv; /* Remember so we can toss this */ - /* XXX save_prev should be a void * 4425537 */ - fsv = CAST_DOWN(savearea_fpu *, fsv->save_hdr.save_prev); /* Get one underneath our's */ - save_release((struct savearea *)fpsv); /* Release it */ - } - - thread->machine.curctx->FPUsave = NULL; /* Kill chain */ - -/* - * free all regular saveareas. - */ - - local_pcb = thread->machine.pcb; /* Get the general savearea */ - - while(local_pcb) { /* Any float saved state? */ - ppsv = local_pcb; /* Remember so we can toss this */ - /* XXX save_prev should be a void * 4425537 */ - local_pcb = CAST_DOWN(struct savearea *, local_pcb->save_hdr.save_prev); /* Get one underneath our's */ - save_release(ppsv); /* Release it */ - } - - (void)hw_atomic_sub(&saveanchor.savetarget, 4); /* Unaccount for the number of saveareas we think we "need" */ - - (void) ml_set_interrupts_enabled(intr); /* Restore interrupts if enabled */ - -} - -/* - * act_machine_sv_free - * release saveareas associated with a thread. if flag is true, release - * user level savearea(s) too, else don't - * - * This code must run with interruptions disabled because an interrupt handler - * could use floating point and/or vectors. If this happens and the thread we - * are blowing off owns the facility, we can deadlock. - */ -void -act_machine_sv_free(thread_t act, __unused int flag) -{ - struct savearea *local_pcb, *userpcb; - register savearea_vec *vsv, *vpst, *vsvt; - register savearea_fpu *fsv, *fpst, *fsvt; - struct savearea *svp; - boolean_t intr; - -/* - * This function will release all non-user state context. - */ - -/* - * - * Walk through and release all floating point and vector contexts that are not - * user state. We will also blow away live context if it belongs to non-user state. - * Note that the level can not change while we are in this code. Nor can another - * context be pushed on the stack. - * - * We do nothing here if the current level is user. Otherwise, - * the live context is cleared. 
Then we find the user saved context. - * Next, we take the sync lock (to keep us from munging things in *_switch). - * The level is set to 0 and all stacked context other than user is dequeued. - * Then we unlock. Next, all of the old kernel contexts are released. - * - */ - - intr = ml_set_interrupts_enabled(FALSE); /* Disable for interruptions */ - - if(act->machine.curctx->VMXlevel) { /* Is the current level user state? */ - - toss_live_vec(act->machine.curctx); /* Dump live vectors if is not user */ - - if(!hw_lock_to((hw_lock_t)&act->machine.curctx->VMXsync, LockTimeOut)) { /* Get the sync lock */ - panic("act_machine_sv_free - timeout getting VMX sync lock\n"); /* Tell all and die */ - } - - vsv = act->machine.curctx->VMXsave; /* Get the top vector savearea */ - while(vsv && vsv->save_hdr.save_level) /* Find user context if any */ - /* XXX save_prev should be a void * 4425537 */ - vsv = CAST_DOWN(savearea_vec *, - vsv->save_hdr.save_prev); - - vsvt = act->machine.curctx->VMXsave; /* Get the top of the chain */ - act->machine.curctx->VMXsave = vsv; /* Point to the user context */ - act->machine.curctx->VMXlevel = NULL; /* Set the level to user */ - hw_lock_unlock((hw_lock_t)&act->machine.curctx->VMXsync); /* Unlock */ - - while(vsvt) { /* Clear any VMX saved state */ - if (vsvt == vsv) break; /* Done when hit user if any */ - vpst = vsvt; /* Remember so we can toss this */ - /* XXX save_prev should be a void * 4425537 */ - vsvt = CAST_DOWN(savearea_vec *, vsvt->save_hdr.save_prev); /* Get one underneath our's */ - save_ret((struct savearea *)vpst); /* Release it */ - } - - } - - if(act->machine.curctx->FPUlevel) { /* Is the current level user state? 
*/ - - toss_live_fpu(act->machine.curctx); /* Dump live floats if is not user */ - - if(!hw_lock_to((hw_lock_t)&act->machine.curctx->FPUsync, LockTimeOut)) { /* Get the sync lock */ - panic("act_machine_sv_free - timeout getting FPU sync lock\n"); /* Tell all and die */ - } - - fsv = act->machine.curctx->FPUsave; /* Get the top floats savearea */ - while(fsv && fsv->save_hdr.save_level) /* Find user context if any */ - /* XXX save_prev should be a void * */ - fsv = CAST_DOWN(savearea_fpu *, fsv->save_hdr.save_prev); - - fsvt = act->machine.curctx->FPUsave; /* Get the top of the chain */ - act->machine.curctx->FPUsave = fsv; /* Point to the user context */ - act->machine.curctx->FPUlevel = NULL; /* Set the level to user */ - hw_lock_unlock((hw_lock_t)&act->machine.curctx->FPUsync); /* Unlock */ - - while(fsvt) { /* Clear any VMX saved state */ - if (fsvt == fsv) break; /* Done when hit user if any */ - fpst = fsvt; /* Remember so we can toss this */ - /* XXX save_prev should be a void * 4425537 */ - fsvt = CAST_DOWN(savearea_fpu *, fsvt->save_hdr.save_prev); /* Get one underneath our's */ - save_ret((struct savearea *)fpst); /* Release it */ - } - - } - -/* - * free all regular saveareas except a user savearea, if any - */ - - local_pcb = act->machine.pcb; /* Get the general savearea */ - userpcb = NULL; /* Assume no user context for now */ - - while(local_pcb) { /* Any float saved state? */ - if (local_pcb->save_srr1 & MASK(MSR_PR)) { /* Is this a user savearea? 
*/ - userpcb = local_pcb; /* Remember so we can toss this */ - break; - } - svp = local_pcb; /* Remember this */ - /* XXX save_prev should be a void * 4425537 */ - local_pcb = CAST_DOWN(struct savearea *, local_pcb->save_hdr.save_prev); /* Get one underneath our's */ - save_ret(svp); /* Release it */ - } - - act->machine.pcb = userpcb; /* Chain in the user if there is one, or 0 if not */ - (void) ml_set_interrupts_enabled(intr); /* Restore interrupts if enabled */ - -} - -void -machine_act_terminate( - thread_t act) -{ - if(act->machine.bbDescAddr) { /* Check if the Blue box assist is active */ - disable_bluebox_internal(act); /* Kill off bluebox */ - } - - if(act->machine.vmmControl) { /* Check if VMM is active */ - vmm_tear_down_all(act); /* Kill off all VMM contexts */ - } -} - -void -machine_thread_terminate_self(void) -{ - machine_act_terminate(current_thread()); -} - -void -machine_thread_init(void) -{ -#ifdef MACHINE_STACK -#if KERNEL_STACK_SIZE > PPC_PGBYTES - panic("KERNEL_STACK_SIZE can't be greater than PPC_PGBYTES\n"); -#endif -#endif -} - -#if MACH_ASSERT -void -dump_thread(thread_t th) -{ - printf(" thread @ %p:\n", th); -} -#endif /* MACH_ASSERT */ - -user_addr_t -get_useraddr(void) -{ - return(current_thread()->machine.upcb->save_srr0); -} - -/* - * detach and return a kernel stack from a thread - */ - -vm_offset_t -machine_stack_detach( - thread_t thread) -{ - vm_offset_t stack; - - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_SCHED,MACH_STACK_DETACH), - (uintptr_t)thread_tid(thread), thread->priority, - thread->sched_pri, 0, 0); - - act_machine_sv_free(thread, 0); /* XXX flag == 0 OK? */ - - stack = thread->kernel_stack; - thread->kernel_stack = 0; - return(stack); -} - -/* - * attach a kernel stack to a thread and initialize it - * - * attaches a stack to a thread. if there is no save - * area we allocate one. the top save area is then - * loaded with the pc (continuation address), the initial - * stack pointer, and a std kernel MSR. 
if the top - * save area is the user save area bad things will - * happen - * - */ - -void -machine_stack_attach( - thread_t thread, - vm_offset_t stack) -{ - unsigned int *kss; - struct savearea *sv; - - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_SCHED,MACH_STACK_ATTACH), - (uintptr_t)thread_tid(thread), thread->priority, - thread->sched_pri, 0, 0); - - assert(stack); - kss = (unsigned int *)STACK_IKS(stack); - thread->kernel_stack = stack; - - /* during initialization we sometimes do not have an - activation. in that case do not do anything */ - sv = save_get(); /* cannot block */ - sv->save_hdr.save_flags = (sv->save_hdr.save_flags & ~SAVtype) | (SAVgeneral << SAVtypeshft); /* Mark as in use */ - sv->save_hdr.save_act = thread; - sv->save_hdr.save_prev = (addr64_t)((uintptr_t)thread->machine.pcb); - thread->machine.pcb = sv; - - sv->save_srr0 = (unsigned int)thread_continue; - /* sv->save_r3 = ARG ? */ - sv->save_r1 = (vm_offset_t)((int)kss - KF_SIZE); - sv->save_srr1 = MSR_SUPERVISOR_INT_OFF; - sv->save_fpscr = 0; /* Clear all floating point exceptions */ - sv->save_vrsave = 0; /* Set the vector save state */ - sv->save_vscr[3] = 0x00010000; /* Supress java mode */ - *(CAST_DOWN(int *, sv->save_r1)) = 0; - - thread->machine.ksp = 0; -} - -/* - * move a stack from old to new thread - */ - -void -machine_stack_handoff( - thread_t old, - thread_t new) -{ - - vm_offset_t stack; - pmap_t new_pmap; - facility_context *fowner; - mapping_t *mp; - struct per_proc_info *ppinfo; - - assert(new); - assert(old); - - if (old == new) - panic("machine_stack_handoff"); - - stack = machine_stack_detach(old); - new->kernel_stack = stack; - if (stack == old->reserved_stack) { - assert(new->reserved_stack); - old->reserved_stack = new->reserved_stack; - new->reserved_stack = stack; - } - - ppinfo = getPerProc(); /* Get our processor block */ - - if(real_ncpus > 1) { /* This is potentially slow, so only do when actually SMP */ - fowner = ppinfo->FPU_owner; /* Cache this because it may 
change */ - if(fowner) { /* Is there any live context? */ - if(fowner->facAct == old) { /* Is it for us? */ - fpu_save(fowner); /* Yes, save it */ - } - } - fowner = ppinfo->VMX_owner; /* Cache this because it may change */ - if(fowner) { /* Is there any live context? */ - if(fowner->facAct == old) { /* Is it for us? */ - vec_save(fowner); /* Yes, save it */ - } - } - } - - /* - * If old thread is running VM, save per proc userProtKey and FamVMmode spcFlags bits in the thread spcFlags - * This bits can be modified in the per proc without updating the thread spcFlags - */ - if(old->machine.specFlags & runningVM) { /* Is the current thread running a VM? */ - old->machine.specFlags &= ~(userProtKey|FamVMmode); - old->machine.specFlags |= (ppinfo->spcFlags) & (userProtKey|FamVMmode); - } - old->machine.specFlags &= ~OnProc; - new->machine.specFlags |= OnProc; - - if(new->machine.specFlags & runningVM) { /* Is the new guy running a VM? */ - pmap_switch(new->machine.vmmCEntry->vmmPmap); /* Switch to the VM's pmap */ - ppinfo->VMMareaPhys = new->machine.vmmCEntry->vmmContextPhys; - ppinfo->VMMXAFlgs = new->machine.vmmCEntry->vmmXAFlgs; - ppinfo->FAMintercept = new->machine.vmmCEntry->vmmFAMintercept; - } - else { /* otherwise, we use the task's pmap */ - new_pmap = new->task->map->pmap; - if ((old->task->map->pmap != new_pmap) || (old->machine.specFlags & runningVM)) { - pmap_switch(new_pmap); - } - } - - machine_set_current_thread(new); - ppinfo->Uassist = new->machine.cthread_self; - - ppinfo->ppbbTaskEnv = new->machine.bbTaskEnv; - ppinfo->spcFlags = new->machine.specFlags; - - old->machine.umwSpace |= umwSwitchAway; /* Show we switched away from this guy */ - mp = (mapping_t *)&ppinfo->ppUMWmp; - mp->mpSpace = invalSpace; /* Since we can't handoff in the middle of copy in/out, just invalidate */ - - if(trcWork.traceMask) dbgTrace(0x9903, (unsigned int)old, (unsigned int)new, 0, 0); /* Cut trace entry if tracing */ - - return; -} - -void 
Call_continuation(thread_continue_t, void *, wait_result_t, vm_offset_t); - -/* - * clean and initialize the current kernel stack and go to - * the given continuation routine - */ - -void -call_continuation( - thread_continue_t continuation, - void *parameter, - wait_result_t wresult) -{ - thread_t self = current_thread(); - unsigned int *kss; - vm_offset_t tsp; - - assert(self->kernel_stack); - kss = (unsigned int *)STACK_IKS(self->kernel_stack); - assert(continuation); - - tsp = (vm_offset_t)((int)kss - KF_SIZE); - assert(tsp); - *((int *)tsp) = 0; - - Call_continuation(continuation, parameter, wresult, tsp); -} diff --git a/osfmk/ppc/pmap.c b/osfmk/ppc/pmap.c deleted file mode 100644 index 90940a9ab..000000000 --- a/osfmk/ppc/pmap.c +++ /dev/null @@ -1,2121 +0,0 @@ -/* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1990,1991,1992 The University of Utah and - * the Center for Software Science (CSS). - * Copyright (c) 1991,1987 Carnegie Mellon University. - * All rights reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation, - * and that all advertising materials mentioning features or use of - * this software display the following acknowledgement: ``This product - * includes software developed by the Center for Software Science at - * the University of Utah.'' - * - * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSS ALLOW FREE USE OF - * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY - * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF - * THIS SOFTWARE. - * - * CSS requests users of this software to return to css-dist@cs.utah.edu any - * improvements that they make and grant CSS redistribution rights. - * - * Carnegie Mellon requests users of this software to return to - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - * - * Utah $Hdr: pmap.c 1.28 92/06/23$ - * Author: Mike Hibler, Bob Wheeler, University of Utah CSS, 10/90 - */ - -/* - * Manages physical address maps for powerpc. 
- * - * In addition to hardware address maps, this - * module is called upon to provide software-use-only - * maps which may or may not be stored in the same - * form as hardware maps. These pseudo-maps are - * used to store intermediate results from copy - * operations to and from address spaces. - * - * Since the information managed by this module is - * also stored by the logical address mapping module, - * this module may throw away valid virtual-to-physical - * mappings at almost any time. However, invalidations - * of virtual-to-physical mappings must be done as - * requested. - * - * In order to cope with hardware architectures which - * make virtual-to-physical map invalidates expensive, - * this module may delay invalidate or reduced protection - * operations until such time as they are actually - * necessary. This module is given full information to - * when physical maps must be made correct. - * - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include /* must be last */ - - -extern unsigned int avail_remaining; -unsigned int debugbackpocket; /* (TEST/DEBUG) */ - -vm_offset_t first_free_virt; -unsigned int current_free_region; /* Used in pmap_next_page */ - -pmapTransTab *pmapTrans; /* Point to the hash to pmap translations */ -struct phys_entry *phys_table; - -/* forward */ -static void pmap_map_physical(void); -static void pmap_map_iohole(addr64_t paddr, addr64_t size); -void pmap_activate(pmap_t pmap, thread_t th, int which_cpu); -void pmap_deactivate(pmap_t pmap, thread_t th, int which_cpu); - -extern void hw_hash_init(void); - -/* NOTE: kernel_pmap_store must be in V=R storage and aligned!!!!!!!!!!!!!! 
*/ - -extern struct pmap kernel_pmap_store; -pmap_t kernel_pmap; /* Pointer to kernel pmap and anchor for in-use pmaps */ -addr64_t kernel_pmap_phys; /* Pointer to kernel pmap and anchor for in-use pmaps, physical address */ -pmap_t cursor_pmap; /* Pointer to last pmap allocated or previous if removed from in-use list */ -pmap_t sharedPmap; /* Pointer to common pmap for 64-bit address spaces */ -struct zone *pmap_zone; /* zone of pmap structures */ -boolean_t pmap_initialized = FALSE; - -int ppc_max_pmaps; /* Maximum number of concurrent address spaces allowed. This is machine dependent */ -addr64_t vm_max_address; /* Maximum effective address supported */ -addr64_t vm_max_physical; /* Maximum physical address supported */ - -/* - * Physical-to-virtual translations are handled by inverted page table - * structures, phys_tables. Multiple mappings of a single page are handled - * by linking the affected mapping structures. We initialise one region - * for phys_tables of the physical memory we know about, but more may be - * added as it is discovered (eg. by drivers). - */ - -/* - * free pmap list. 
caches the first free_pmap_max pmaps that are freed up - */ -int free_pmap_max = 32; -int free_pmap_count; -pmap_t free_pmap_list; -decl_simple_lock_data(,free_pmap_lock) - -/* - * Function to get index into phys_table for a given physical address - */ - -struct phys_entry *pmap_find_physentry(ppnum_t pa) -{ - int i; - unsigned int entry; - - for (i = pmap_mem_regions_count - 1; i >= 0; i--) { - if (pa < pmap_mem_regions[i].mrStart) continue; /* See if we fit in this region */ - if (pa > pmap_mem_regions[i].mrEnd) continue; /* Check the end too */ - - entry = (unsigned int)pmap_mem_regions[i].mrPhysTab + ((pa - pmap_mem_regions[i].mrStart) * sizeof(phys_entry_t)); - return (struct phys_entry *)entry; - } -// kprintf("DEBUG - pmap_find_physentry: page 0x%08X not found\n", pa); - return NULL; -} - -/* - * kern_return_t - * pmap_add_physical_memory(vm_offset_t spa, vm_offset_t epa, - * boolean_t available, unsigned int attr) - * - * THIS IS NOT SUPPORTED - */ -kern_return_t -pmap_add_physical_memory( - __unused vm_offset_t spa, - __unused vm_offset_t epa, - __unused boolean_t available, - __unused unsigned int attr) -{ - - panic("Forget it! You can't map no more memory, you greedy puke!\n"); - return KERN_SUCCESS; -} - -/* - * pmap_map(va, spa, epa, prot) - * is called during boot to map memory in the kernel's address map. - * A virtual address range starting at "va" is mapped to the physical - * address range "spa" to "epa" with machine independent protection - * "prot". - * - * "va", "spa", and "epa" are byte addresses and must be on machine - * independent page boundaries. - * - * Pages with a contiguous virtual address range, the same protection, and attributes. - * therefore, we map it with a single block. 
- * - * Note that this call will only map into 32-bit space - * - */ - -vm_offset_t -pmap_map( - vm_offset_t va, - vm_offset_t spa, - vm_offset_t epa, - vm_prot_t prot, - unsigned int flags) -{ - unsigned int mflags; - addr64_t colladr; - mflags = 0; /* Make sure this is initialized to nothing special */ - if(!(flags & VM_WIMG_USE_DEFAULT)) { /* Are they supplying the attributes? */ - mflags = mmFlgUseAttr | (flags & VM_MEM_GUARDED) | ((flags & VM_MEM_NOT_CACHEABLE) >> 1); /* Convert to our mapping_make flags */ - } - - if (spa == epa) return(va); - - assert(epa > spa); - - colladr = mapping_make(kernel_pmap, (addr64_t)va, (ppnum_t)(spa >> 12), - (mmFlgBlock | mmFlgPerm), (epa - spa) >> 12, (prot & VM_PROT_ALL) ); - - if(colladr) { /* Was something already mapped in the range? */ - panic("pmap_map: attempt to map previously mapped range - va = %08X, pa = %08X, epa = %08X, collision = %016llX\n", - va, spa, epa, colladr); - } - return(va); -} - -/* - * pmap_map_physical() - * Maps physical memory into the kernel's address map beginning at lgPMWvaddr, the - * physical memory window. - * - */ -void -pmap_map_physical(void) -{ - unsigned region; - uint64_t msize, size; - addr64_t paddr, vaddr, colladdr; - - /* Iterate over physical memory regions, block mapping each into the kernel's address map */ - for (region = 0; region < (unsigned)pmap_mem_regions_count; region++) { - paddr = ((addr64_t)pmap_mem_regions[region].mrStart << 12); /* Get starting physical address */ - size = (((addr64_t)pmap_mem_regions[region].mrEnd + 1) << 12) - paddr; - - vaddr = paddr + lowGlo.lgPMWvaddr; /* Get starting virtual address */ - - while (size > 0) { - - msize = ((size > 0x0000020000000000ULL) ? 
0x0000020000000000ULL : size); /* Get size, but no more than 2TBs */ - - colladdr = mapping_make(kernel_pmap, vaddr, (paddr >> 12), - (mmFlgBlock | mmFlgPerm), (msize >> 12), - (VM_PROT_READ | VM_PROT_WRITE)); - if (colladdr) { - panic ("pmap_map_physical: mapping failure - va = %016llX, pa = %016llX, size = %016llX, collision = %016llX\n", - vaddr, (paddr >> 12), (msize >> 12), colladdr); - } - - vaddr = vaddr + (uint64_t)msize; /* Point to the next virtual addr */ - paddr = paddr + (uint64_t)msize; /* Point to the next physical addr */ - size -= msize; - } - } -} - -/* - * pmap_map_iohole(addr64_t paddr, addr64_t size) - * Maps an I/O hole into the kernel's address map at its proper offset in - * the physical memory window. - * - */ -void -pmap_map_iohole(addr64_t paddr, addr64_t size) -{ - - addr64_t vaddr, colladdr, msize; - - vaddr = paddr + lowGlo.lgPMWvaddr; /* Get starting virtual address */ - - while (size > 0) { - - msize = ((size > 0x0000020000000000ULL) ? 0x0000020000000000ULL : size); /* Get size, but no more than 2TBs */ - - colladdr = mapping_make(kernel_pmap, vaddr, (paddr >> 12), - (mmFlgBlock | mmFlgPerm | mmFlgGuarded | mmFlgCInhib), (msize >> 12), - (VM_PROT_READ | VM_PROT_WRITE)); - if (colladdr) { - panic ("pmap_map_iohole: mapping failed - va = %016llX, pa = %016llX, size = %016llX, collision = %016llX\n", - vaddr, (paddr >> 12), (msize >> 12), colladdr); - } - - vaddr = vaddr + (uint64_t)msize; /* Point to the next virtual addr */ - paddr = paddr + (uint64_t)msize; /* Point to the next physical addr */ - size -= msize; - } -} - -/* - * Bootstrap the system enough to run with virtual memory. - * Map the kernel's code and data, and allocate the system page table. - * Called with mapping done by BATs. Page_size must already be set. 
- * - * Parameters: - * msize: Total memory present - * first_avail: First virtual address available - * kmapsize: Size of kernel text and data - */ -void -pmap_bootstrap(uint64_t msize, vm_offset_t *first_avail, unsigned int kmapsize) -{ - vm_offset_t addr; - vm_size_t size; - unsigned int i, num, mapsize, vmpagesz, vmmapsz, nbits; - signed bank; - uint64_t tmemsize; - uint_t htslop; - vm_offset_t first_used_addr, PCAsize; - struct phys_entry *phys_entry; - - *first_avail = round_page(*first_avail); /* Make sure we start out on a page boundary */ - vm_last_addr = VM_MAX_KERNEL_ADDRESS; /* Set the highest address know to VM */ - - /* - * Initialize kernel pmap - */ - kernel_pmap = &kernel_pmap_store; - kernel_pmap_phys = (addr64_t)(uintptr_t)&kernel_pmap_store; - cursor_pmap = &kernel_pmap_store; - - kernel_pmap->pmap_link.next = (queue_t)kernel_pmap; /* Set up anchor forward */ - kernel_pmap->pmap_link.prev = (queue_t)kernel_pmap; /* Set up anchor reverse */ - kernel_pmap->ref_count = 1; - kernel_pmap->pmapFlags = pmapKeyDef; /* Set the default keys */ - kernel_pmap->pmapFlags |= pmapNXdisabled; - kernel_pmap->pmapCCtl = pmapCCtlVal; /* Initialize cache control */ - kernel_pmap->space = PPC_SID_KERNEL; - kernel_pmap->pmapvr = 0; /* Virtual = Real */ - -/* - * IBM's recommended hash table size is one PTEG for every 2 physical pages. - * However, we have found that OSX rarely uses more than 4 PTEs in a PTEG - * with this size table. Therefore, by default we allocate a hash table - * one half IBM's recommended size, ie one PTEG per 4 pages. The "ht_shift" boot-arg - * can be used to override the default hash table size. - * We will allocate the hash table in physical RAM, outside of kernel virtual memory, - * at the top of the highest bank that will contain it. - * Note that "bank" doesn't refer to a physical memory slot here, it is a range of - * physically contiguous memory. - * - * The PCA will go there as well, immediately before the hash table. 
- */ - - nbits = cntlzw(((msize << 1) - 1) >> 32); /* Get first bit in upper half */ - if (nbits == 32) /* If upper half was empty, find bit in bottom half */ - nbits = nbits + cntlzw((uint_t)((msize << 1) - 1)); - tmemsize = 0x8000000000000000ULL >> nbits; /* Get memory size rounded up to power of 2 */ - - /* Calculate hash table size: First, make sure we don't overflow 32-bit arithmetic. */ - if (tmemsize > 0x0000002000000000ULL) - tmemsize = 0x0000002000000000ULL; - - /* Second, calculate IBM recommended hash table size, ie one PTEG per 2 physical pages */ - hash_table_size = (uint_t)(tmemsize >> 13) * PerProcTable[0].ppe_vaddr->pf.pfPTEG; - - /* Third, cut this in half to produce the OSX default, ie one PTEG per 4 physical pages */ - hash_table_size >>= 1; - - /* Fourth, adjust default size per "ht_shift" boot arg */ - if (hash_table_shift >= 0) /* if positive, make size bigger */ - hash_table_size <<= hash_table_shift; - else /* if "ht_shift" is negative, make smaller */ - hash_table_size >>= (-hash_table_shift); - - /* Fifth, make sure we are at least minimum size */ - if (hash_table_size < (256 * 1024)) - hash_table_size = (256 * 1024); - - while(1) { /* Try to fit hash table in PCA into contiguous memory */ - - if(hash_table_size < (256 * 1024)) { /* Have we dropped too short? This should never, ever happen */ - panic("pmap_bootstrap: Can't find space for hash table\n"); /* This will never print, system isn't up far enough... 
*/ - } - - PCAsize = (hash_table_size / PerProcTable[0].ppe_vaddr->pf.pfPTEG) * sizeof(PCA_t); /* Get total size of PCA table */ - PCAsize = round_page(PCAsize); /* Make sure it is at least a page long */ - - for(bank = pmap_mem_regions_count - 1; bank >= 0; bank--) { /* Search backwards through banks */ - - hash_table_base = ((addr64_t)pmap_mem_regions[bank].mrEnd << 12) - hash_table_size + PAGE_SIZE; /* Get tenative address */ - - htslop = hash_table_base & (hash_table_size - 1); /* Get the extra that we will round down when we align */ - hash_table_base = hash_table_base & -(addr64_t)hash_table_size; /* Round down to correct boundary */ - - if((hash_table_base - round_page(PCAsize)) >= ((addr64_t)pmap_mem_regions[bank].mrStart << 12)) break; /* Leave if we fit */ - } - - if(bank >= 0) break; /* We are done if we found a suitable bank */ - - hash_table_size = hash_table_size >> 1; /* Try the next size down */ - } - - if(htslop) { /* If there was slop (i.e., wasted pages for alignment) add a new region */ - for(i = pmap_mem_regions_count - 1; i >= (unsigned)bank; i--) { /* Copy from end to our bank, including our bank */ - pmap_mem_regions[i + 1].mrStart = pmap_mem_regions[i].mrStart; /* Set the start of the bank */ - pmap_mem_regions[i + 1].mrAStart = pmap_mem_regions[i].mrAStart; /* Set the start of allocatable area */ - pmap_mem_regions[i + 1].mrEnd = pmap_mem_regions[i].mrEnd; /* Set the end address of bank */ - pmap_mem_regions[i + 1].mrAEnd = pmap_mem_regions[i].mrAEnd; /* Set the end address of allocatable area */ - } - - pmap_mem_regions[i + 1].mrStart = (hash_table_base + hash_table_size) >> 12; /* Set the start of the next bank to the start of the slop area */ - pmap_mem_regions[i + 1].mrAStart = (hash_table_base + hash_table_size) >> 12; /* Set the start of allocatable area to the start of the slop area */ - pmap_mem_regions[i].mrEnd = (hash_table_base + hash_table_size - 4096) >> 12; /* Set the end of our bank to the end of the hash table */ - - } - - 
pmap_mem_regions[bank].mrAEnd = (hash_table_base - PCAsize - 4096) >> 12; /* Set the maximum allocatable in this bank */ - - hw_hash_init(); /* Initiaize the hash table and PCA */ - hw_setup_trans(); /* Set up hardware registers needed for translation */ - -/* - * The hash table is now all initialized and so is the PCA. Go on to do the rest of it. - * This allocation is from the bottom up. - */ - - num = atop_64(msize); /* Get number of pages in all of memory */ - -/* Figure out how much we need to allocate */ - - size = (vm_size_t) ( - (InitialSaveBloks * PAGE_SIZE) + /* Allow space for the initial context saveareas */ - (BackPocketSaveBloks * PAGE_SIZE) + /* For backpocket saveareas */ - trcWork.traceSize + /* Size of trace table */ - ((((1 << maxAdrSpb) * sizeof(pmapTransTab)) + 4095) & -4096) + /* Size of pmap translate table */ - (((num * sizeof(struct phys_entry)) + 4095) & -4096) /* For the physical entries */ - ); - - mapsize = size = round_page(size); /* Get size of area to map that we just calculated */ - mapsize = mapsize + kmapsize; /* Account for the kernel text size */ - - vmpagesz = round_page(num * sizeof(struct vm_page)); /* Allow for all vm_pages needed to map physical mem */ - vmmapsz = round_page((num / 8) * sizeof(struct vm_map_entry)); /* Allow for vm_maps */ - - mapsize = mapsize + vmpagesz + vmmapsz; /* Add the VM system estimates into the grand total */ - - mapsize = mapsize + (4 * 1024 * 1024); /* Allow for 4 meg of extra mappings */ - mapsize = ((mapsize / PAGE_SIZE) + MAPPERBLOK - 1) / MAPPERBLOK; /* Get number of blocks of mappings we need */ - mapsize = mapsize + ((mapsize + MAPPERBLOK - 1) / MAPPERBLOK); /* Account for the mappings themselves */ - - size = size + (mapsize * PAGE_SIZE); /* Get the true size we need */ - - /* hash table must be aligned to its size */ - - addr = *first_avail; /* Set the address to start allocations */ - first_used_addr = addr; /* Remember where we started */ - - bzero((char *)addr, size); /* Clear 
everything that we are allocating */ - - savearea_init(addr); /* Initialize the savearea chains and data */ - - addr = (vm_offset_t)((unsigned int)addr + ((InitialSaveBloks + BackPocketSaveBloks) * PAGE_SIZE)); /* Point past saveareas */ - - trcWork.traceCurr = (unsigned int)addr; /* Set first trace slot to use */ - trcWork.traceStart = (unsigned int)addr; /* Set start of trace table */ - trcWork.traceEnd = (unsigned int)addr + trcWork.traceSize; /* Set end of trace table */ - - addr = (vm_offset_t)trcWork.traceEnd; /* Set next allocatable location */ - - pmapTrans = (pmapTransTab *)addr; /* Point to the pmap to hash translation table */ - - pmapTrans[PPC_SID_KERNEL].pmapPAddr = (addr64_t)((uintptr_t)kernel_pmap); /* Initialize the kernel pmap in the translate table */ - pmapTrans[PPC_SID_KERNEL].pmapVAddr = CAST_DOWN(unsigned int, kernel_pmap); /* Initialize the kernel pmap in the translate table */ - - addr += ((((1 << maxAdrSpb) * sizeof(pmapTransTab)) + 4095) & -4096); /* Point past pmap translate table */ - -/* NOTE: the phys_table must be within the first 2GB of physical RAM. This makes sure we only need to do 32-bit arithmetic */ - - phys_entry = (struct phys_entry *) addr; /* Get pointer to physical table */ - - for (bank = 0; (unsigned)bank < pmap_mem_regions_count; bank++) { /* Set pointer and initialize all banks of ram */ - - pmap_mem_regions[bank].mrPhysTab = phys_entry; /* Set pointer to the physical table for this bank */ - - phys_entry = phys_entry + (pmap_mem_regions[bank].mrEnd - pmap_mem_regions[bank].mrStart + 1); /* Point to the next */ - } - - addr += (((num * sizeof(struct phys_entry)) + 4095) & -4096); /* Step on past the physical entries */ - -/* - * Remaining space is for mapping entries. 
Tell the initializer routine that - * the mapping system can't release this block because it's permanently assigned - */ - - mapping_init(); /* Initialize the mapping tables */ - - for(i = addr; i < first_used_addr + size; i += PAGE_SIZE) { /* Add initial mapping blocks */ - mapping_free_init(i, 1, 0); /* Pass block address and say that this one is not releasable */ - } - mapCtl.mapcmin = MAPPERBLOK; /* Make sure we only adjust one at a time */ - - /* Map V=R the page tables */ - pmap_map(first_used_addr, first_used_addr, - round_page(first_used_addr + size), VM_PROT_READ | VM_PROT_WRITE, VM_WIMG_USE_DEFAULT); - - *first_avail = round_page(first_used_addr + size); /* Set next available page */ - first_free_virt = *first_avail; /* Ditto */ - - /* For 64-bit machines, block map physical memory and the I/O hole into kernel space */ - if(BootProcInfo.pf.Available & pf64Bit) { /* Are we on a 64-bit machine? */ - lowGlo.lgPMWvaddr = PHYS_MEM_WINDOW_VADDR; /* Initialize the physical memory window's virtual address */ - - pmap_map_physical(); /* Block map physical memory into the window */ - - pmap_map_iohole(IO_MEM_WINDOW_VADDR, IO_MEM_WINDOW_SIZE); - /* Block map the I/O hole */ - } - - /* All the rest of memory is free - add it to the free - * regions so that it can be allocated by pmap_steal - */ - - pmap_mem_regions[0].mrAStart = (*first_avail >> 12); /* Set up the free area to start allocations (always in the first bank) */ - - current_free_region = 0; /* Set that we will start allocating in bank 0 */ - avail_remaining = 0; /* Clear free page count */ - for(bank = 0; (unsigned)bank < pmap_mem_regions_count; bank++) { /* Total up all of the pages in the system that are available */ - avail_remaining += (pmap_mem_regions[bank].mrAEnd - pmap_mem_regions[bank].mrAStart) + 1; /* Add in allocatable pages in this bank */ - } - - -} - -/* - * pmap_init(spa, epa) - * finishes the initialization of the pmap module. 
- * This procedure is called from vm_mem_init() in vm/vm_init.c - * to initialize any remaining data structures that the pmap module - * needs to map virtual memory (VM is already ON). - * - * Note that the pmap needs to be sized and aligned to - * a power of two. This is because it is used both in virtual and - * real so it can't span a page boundary. - */ - -void -pmap_init(void) -{ - - pmap_zone = zinit(pmapSize, 400 * pmapSize, 4096, "pmap"); -#if ZONE_DEBUG - zone_debug_disable(pmap_zone); /* Can't debug this one 'cause it messes with size and alignment */ -#endif /* ZONE_DEBUG */ - - pmap_initialized = TRUE; - - /* - * Initialize list of freed up pmaps - */ - free_pmap_list = NULL; /* Set that there are no free pmaps */ - free_pmap_count = 0; - simple_lock_init(&free_pmap_lock, 0); - -} - -unsigned int pmap_free_pages(void) -{ - return avail_remaining; -} - -/* - * This function allocates physical pages. - */ - -boolean_t -pmap_next_page_hi(ppnum_t * pnum) -{ - return pmap_next_page(pnum); -} - - -/* Non-optimal, but only used for virtual memory startup. - * Allocate memory from a table of free physical addresses - * If there are no more free entries, too bad. - */ - -boolean_t -pmap_next_page(ppnum_t *addrp) -{ - unsigned int i; - - if(current_free_region >= pmap_mem_regions_count) return FALSE; /* Return failure if we have used everything... 
*/ - - for(i = current_free_region; i < pmap_mem_regions_count; i++) { /* Find the next bank with free pages */ - if(pmap_mem_regions[i].mrAStart <= pmap_mem_regions[i].mrAEnd) break; /* Found one */ - } - - current_free_region = i; /* Set our current bank */ - if(i >= pmap_mem_regions_count) return FALSE; /* Couldn't find a free page */ - - *addrp = pmap_mem_regions[i].mrAStart; /* Allocate the page */ - pmap_mem_regions[i].mrAStart = pmap_mem_regions[i].mrAStart + 1; /* Set the next one to go */ - avail_remaining--; /* Drop free count */ - - return TRUE; -} - -void pmap_virtual_space( - vm_offset_t *startp, - vm_offset_t *endp) -{ - *startp = round_page(first_free_virt); - *endp = vm_last_addr; -} - -/* - * pmap_create - * - * Create and return a physical map. - * - * If the size specified for the map is zero, the map is an actual physical - * map, and may be referenced by the hardware. - * - * A pmap is either in the free list or in the in-use list. The only use - * of the in-use list (aside from debugging) is to handle the VSID wrap situation. - * Whenever a new pmap is allocated (i.e., not recovered from the free list). The - * in-use list is matched until a hole in the VSID sequence is found. (Note - * that the in-use pmaps are queued in VSID sequence order.) This is all done - * while free_pmap_lock is held. - * - * If the size specified is non-zero, the map will be used in software - * only, and is bounded by that size. - */ -pmap_t -pmap_create(vm_map_size_t size, __unused boolean_t is_64bit) -{ - pmap_t pmap, ckpmap, fore; - int s; - unsigned int currSID; - addr64_t physpmap; - - /* - * A software use-only map doesn't even need a pmap structure. - */ - if (size) - return(PMAP_NULL); - - /* - * If there is a pmap in the pmap free list, reuse it. - * Note that we use free_pmap_list for all chaining of pmaps, both to - * the free list and the in use chain (anchored from kernel_pmap). 
- */ - s = splhigh(); - simple_lock(&free_pmap_lock); - - if(free_pmap_list) { /* Any free? */ - pmap = free_pmap_list; /* Yes, allocate it */ - free_pmap_list = (pmap_t)pmap->freepmap; /* Dequeue this one (we chain free ones through freepmap) */ - free_pmap_count--; - } - else { - simple_unlock(&free_pmap_lock); /* Unlock just in case */ - splx(s); - - pmap = (pmap_t) zalloc(pmap_zone); /* Get one */ - if (pmap == PMAP_NULL) return(PMAP_NULL); /* Handle out-of-memory condition */ - - bzero((char *)pmap, pmapSize); /* Clean up the pmap */ - - s = splhigh(); - simple_lock(&free_pmap_lock); /* Lock it back up */ - - ckpmap = cursor_pmap; /* Get starting point for free ID search */ - currSID = ckpmap->spaceNum; /* Get the actual space ID number */ - - while(1) { /* Keep trying until something happens */ - - currSID = (currSID + 1) & (maxAdrSp - 1); /* Get the next in the sequence */ - if(((currSID * incrVSID) & (maxAdrSp - 1)) == invalSpace) continue; /* Skip the space we have reserved */ - ckpmap = (pmap_t)ckpmap->pmap_link.next; /* On to the next in-use pmap */ - - if(ckpmap->spaceNum != currSID) break; /* If we are out of sequence, this is free */ - - if(ckpmap == cursor_pmap) { /* See if we have 2^20 already allocated */ - panic("pmap_create: Maximum number (%d) active address spaces reached\n", maxAdrSp); /* Die pig dog */ - } - } - - pmap->space = (currSID * incrVSID) & (maxAdrSp - 1); /* Calculate the actual VSID */ - pmap->spaceNum = currSID; /* Set the space ID number */ -/* - * Now we link into the chain just before the out of sequence guy. 
- */ - - fore = (pmap_t)ckpmap->pmap_link.prev; /* Get the current's previous */ - pmap->pmap_link.next = (queue_t)ckpmap; /* My next points to the current */ - fore->pmap_link.next = (queue_t)pmap; /* Current's previous's next points to me */ - pmap->pmap_link.prev = (queue_t)fore; /* My prev points to what the current pointed to */ - ckpmap->pmap_link.prev = (queue_t)pmap; /* Current's prev points to me */ - - physpmap = ((addr64_t)pmap_find_phys(kernel_pmap, (addr64_t)((uintptr_t)pmap)) << 12) | (addr64_t)((unsigned int)pmap & 0xFFF); /* Get the physical address of the pmap */ - - pmap->pmapvr = (addr64_t)((uintptr_t)pmap) ^ physpmap; /* Make V to R translation mask */ - - pmapTrans[pmap->space].pmapPAddr = physpmap; /* Set translate table physical to point to us */ - pmapTrans[pmap->space].pmapVAddr = CAST_DOWN(unsigned int, pmap); /* Set translate table virtual to point to us */ - } - - pmap->pmapVmmExt = NULL; /* Clear VMM extension block vaddr */ - pmap->pmapVmmExtPhys = 0; /* and the paddr, too */ - pmap->pmapFlags = pmapKeyDef; /* Set default key */ - pmap->pmapCCtl = pmapCCtlVal; /* Initialize cache control */ - pmap->ref_count = 1; - pmap->stats.resident_count = 0; - pmap->stats.wired_count = 0; - pmap->pmapSCSubTag = 0x0000000000000000ULL; /* Make sure this is clean an tidy */ - simple_unlock(&free_pmap_lock); - - splx(s); - return(pmap); -} - -/* - * pmap_destroy - * - * Gives up a reference to the specified pmap. When the reference count - * reaches zero the pmap structure is added to the pmap free list. - * - * Should only be called if the map contains no valid mappings. - */ -void -pmap_destroy(pmap_t pmap) -{ - uint32_t ref_count; - spl_t s; - pmap_t fore, aft; - - if (pmap == PMAP_NULL) - return; - - if ((ref_count = hw_atomic_sub(&pmap->ref_count, 1)) == UINT_MAX) /* underflow */ - panic("pmap_destroy(): ref_count < 0"); - - if (ref_count > 0) - return; /* Still more users, leave now... 
*/ - - if (!(pmap->pmapFlags & pmapVMgsaa)) { /* Don't try this for a shadow assist guest */ - pmap_unmap_sharedpage(pmap); /* Remove any mapping of page -1 */ - } - -#ifdef notdef - if(pmap->stats.resident_count != 0) - panic("PMAP_DESTROY: pmap not empty"); -#else - if(pmap->stats.resident_count != 0) { - pmap_remove(pmap, 0, 0xFFFFFFFFFFFFF000ULL); - } -#endif - - /* - * Add the pmap to the pmap free list. - */ - - s = splhigh(); - /* - * Add the pmap to the pmap free list. - */ - simple_lock(&free_pmap_lock); - - if (free_pmap_count <= free_pmap_max) { /* Do we have enough spares? */ - - pmap->freepmap = free_pmap_list; /* Queue in front */ - free_pmap_list = pmap; - free_pmap_count++; - simple_unlock(&free_pmap_lock); - - } else { - if(cursor_pmap == pmap) cursor_pmap = (pmap_t)pmap->pmap_link.prev; /* If we are releasing the cursor, back up */ - fore = (pmap_t)pmap->pmap_link.prev; - aft = (pmap_t)pmap->pmap_link.next; - fore->pmap_link.next = pmap->pmap_link.next; /* My previous's next is my next */ - aft->pmap_link.prev = pmap->pmap_link.prev; /* My next's previous is my previous */ - simple_unlock(&free_pmap_lock); - pmapTrans[pmap->space].pmapPAddr = -1; /* Invalidate the translate table physical */ - pmapTrans[pmap->space].pmapVAddr = -1; /* Invalidate the translate table virtual */ - zfree(pmap_zone, pmap); - } - splx(s); -} - -/* - * pmap_reference(pmap) - * gains a reference to the specified pmap. 
- */ -void -pmap_reference(pmap_t pmap) -{ - if (pmap != PMAP_NULL) - (void)hw_atomic_add(&pmap->ref_count, 1); /* Bump the count */ -} - -/* - * pmap_remove_some_phys - * - * Removes mappings of the associated page from the specified pmap - * - */ -void pmap_remove_some_phys( - pmap_t pmap, - vm_offset_t pa) -{ - register struct phys_entry *pp; - register struct mapping *mp; - unsigned int pindex; - - if (pmap == PMAP_NULL) { /* This should never be called with a null pmap */ - panic("pmap_remove_some_phys: null pmap\n"); - } - - pp = mapping_phys_lookup(pa, &pindex); /* Get physical entry */ - if (pp == 0) return; /* Leave if not in physical RAM */ - - do { /* Keep going until we toss all pages from this pmap */ - if (pmap->pmapFlags & pmapVMhost) { - mp = hw_purge_phys(pp); /* Toss a map */ - switch ((unsigned int)mp & mapRetCode) { - case mapRtOK: - mapping_free(mp); /* Return mapping to free inventory */ - break; - case mapRtGuest: - break; /* Don't try to return a guest mapping */ - case mapRtEmpty: - break; /* Physent chain empty, we're done */ - case mapRtNotFnd: - break; /* Mapping disappeared on us, retry */ - default: - panic("pmap_remove_some_phys: hw_purge_phys failed - pp = %p, pmap = %p, code = %p\n", - pp, pmap, mp); /* Handle failure with our usual lack of tact */ - } - } else { - mp = hw_purge_space(pp, pmap); /* Toss a map */ - switch ((unsigned int)mp & mapRetCode) { - case mapRtOK: - mapping_free(mp); /* Return mapping to free inventory */ - break; - case mapRtEmpty: - break; /* Physent chain empty, we're done */ - case mapRtNotFnd: - break; /* Mapping disappeared on us, retry */ - default: - panic("pmap_remove_some_phys: hw_purge_phys failed - pp = %p, pmap = %p, code = %p\n", - pp, pmap, mp); /* Handle failure with our usual lack of tact */ - } - } - } while (mapRtEmpty != ((unsigned int)mp & mapRetCode)); - -#if DEBUG - if ((pmap->pmapFlags & pmapVMhost) && !pmap_verify_free(pa)) - panic("pmap_remove_some_phys: cruft left behind - pa = %08X, 
pmap = %p\n", pa, pmap); -#endif - - return; /* Leave... */ -} - -/* - * pmap_remove(pmap, s, e) - * unmaps all virtual addresses v in the virtual address - * range determined by [s, e) and pmap. - * s and e must be on machine independent page boundaries and - * s must be less than or equal to e. - * - * Note that pmap_remove does not remove any mappings in nested pmaps. We just - * skip those segments. - */ -void -pmap_remove( - pmap_t pmap, - addr64_t sva, - addr64_t eva) -{ - addr64_t va, endva; - - if (pmap == PMAP_NULL) return; /* Leave if software pmap */ - - - /* It is just possible that eva might have wrapped around to zero, - * and sometimes we get asked to liberate something of size zero - * even though it's dumb (eg. after zero length read_overwrites) - */ - assert(eva >= sva); - - /* If these are not page aligned the loop might not terminate */ - assert((sva == trunc_page_64(sva)) && (eva == trunc_page_64(eva))); - - va = sva & -4096LL; /* Round start down to a page */ - endva = eva & -4096LL; /* Round end down to a page */ - - while(1) { /* Go until we finish the range */ - va = mapping_remove(pmap, va); /* Remove the mapping and see what's next */ - va = va & -4096LL; /* Make sure the "not found" indication is clear */ - if((va == 0) || (va >= endva)) break; /* End loop if we finish range or run off the end */ - } - -} - -/* - * Routine: - * pmap_page_protect - * - * Function: - * Lower the permission for all mappings to a given page. 
- */ -void -pmap_page_protect( - ppnum_t pa, - vm_prot_t prot) -{ - register struct phys_entry *pp; - boolean_t remove; - unsigned int pindex; - mapping_t *mp; - - - switch (prot & VM_PROT_ALL) { - case VM_PROT_READ: - case VM_PROT_READ|VM_PROT_EXECUTE: - remove = FALSE; - break; - case VM_PROT_ALL: - return; - default: - remove = TRUE; - break; - } - - - pp = mapping_phys_lookup(pa, &pindex); /* Get physical entry */ - if (pp == 0) return; /* Leave if not in physical RAM */ - - if (remove) { /* If the protection was set to none, we'll remove all mappings */ - - do { /* Keep going until we toss all pages from this physical page */ - mp = hw_purge_phys(pp); /* Toss a map */ - switch ((unsigned int)mp & mapRetCode) { - case mapRtOK: - mapping_free(mp); /* Return mapping to free inventory */ - break; - case mapRtGuest: - break; /* Don't try to return a guest mapping */ - case mapRtNotFnd: - break; /* Mapping disappeared on us, retry */ - case mapRtEmpty: - break; /* Physent chain empty, we're done */ - default: panic("pmap_page_protect: hw_purge_phys failed - pp = %p, code = %p\n", - pp, mp); /* Handle failure with our usual lack of tact */ - } - } while (mapRtEmpty != ((unsigned int)mp & mapRetCode)); - -#if DEBUG - if (!pmap_verify_free(pa)) - panic("pmap_page_protect: cruft left behind - pa = %08X\n", pa); -#endif - - return; /* Leave... */ - } - -/* When we get here, it means that we are to change the protection for a - * physical page. - */ - - mapping_protect_phys(pa, (prot & VM_PROT_ALL) ); /* Change protection of all mappings to page. */ - -} - -/* - * Routine: - * pmap_disconnect - * - * Function: - * Disconnect all mappings for this page and return reference and change status - * in generic format. 
- * - */ -unsigned int pmap_disconnect( - ppnum_t pa) -{ - register struct phys_entry *pp; - unsigned int pindex; - mapping_t *mp; - - pp = mapping_phys_lookup(pa, &pindex); /* Get physical entry */ - if (pp == 0) return (0); /* Return null ref and chg if not in physical RAM */ - do { /* Iterate until all mappings are dead and gone */ - mp = hw_purge_phys(pp); /* Disconnect a mapping */ - if (!mp) break; /* All mappings are gone, leave the loop */ - switch ((unsigned int)mp & mapRetCode) { - case mapRtOK: - mapping_free(mp); /* Return mapping to free inventory */ - break; - case mapRtGuest: - break; /* Don't try to return a guest mapping */ - case mapRtNotFnd: - break; /* Mapping disappeared on us, retry */ - case mapRtEmpty: - break; /* Physent chain empty, we're done */ - default: panic("hw_purge_phys: hw_purge_phys failed - pp = %p, code = %p\n", - pp, mp); /* Handle failure with our usual lack of tact */ - } - } while (mapRtEmpty != ((unsigned int)mp & mapRetCode)); - -#if DEBUG - if (!pmap_verify_free(pa)) - panic("pmap_disconnect: cruft left behind - pa = %08X\n", pa); -#endif - - return (mapping_tst_refmod(pa)); /* Return page ref and chg in generic format */ -} - - -boolean_t -pmap_is_noencrypt(__unused ppnum_t pn) -{ - return (FALSE); -} - -void -pmap_set_noencrypt(__unused ppnum_t pn) -{ -} - -void -pmap_clear_noencrypt(__unused ppnum_t pn) -{ -} - - -/* - * pmap_protect(pmap, s, e, prot) - * changes the protection on all virtual addresses v in the - * virtual address range determined by [s, e] and pmap to prot. - * s and e must be on machine independent page boundaries and - * s must be less than or equal to e. - * - * Note that any requests to change the protection of a nested pmap are - * ignored. Those changes MUST be done by calling this with the correct pmap. 
- */ -void pmap_protect( - pmap_t pmap, - vm_map_offset_t sva, - vm_map_offset_t eva, - vm_prot_t prot) -{ - - addr64_t va, endva; - - if (pmap == PMAP_NULL) return; /* Do nothing if no pmap */ - - if (prot == VM_PROT_NONE) { /* Should we kill the address range?? */ - pmap_remove(pmap, (addr64_t)sva, (addr64_t)eva); /* Yeah, dump 'em */ - return; /* Leave... */ - } - - va = sva & -4096LL; /* Round start down to a page */ - endva = eva & -4096LL; /* Round end down to a page */ - - while(1) { /* Go until we finish the range */ - mapping_protect(pmap, va, (prot & VM_PROT_ALL), &va); /* Change the protection and see what's next */ - if((va == 0) || (va >= endva)) break; /* End loop if we finish range or run off the end */ - } - -} - - - -/* - * pmap_enter - * - * Create a translation for the virtual address (virt) to the physical - * address (phys) in the pmap with the protection requested. If the - * translation is wired then we can not allow a full page fault, i.e., - * the mapping control block is not eligible to be stolen in a low memory - * condition. - * - * NB: This is the only routine which MAY NOT lazy-evaluate - * or lose information. That is, this routine must actually - * insert this page into the given map NOW. - */ -void -pmap_enter(pmap_t pmap, vm_map_offset_t va, ppnum_t pa, vm_prot_t prot, - unsigned int flags, __unused boolean_t wired) -{ - unsigned int mflags; - addr64_t colva; - - if (pmap == PMAP_NULL) return; /* Leave if software pmap */ - - mflags = 0; /* Make sure this is initialized to nothing special */ - if(!(flags & VM_WIMG_USE_DEFAULT)) { /* Are they supplying the attributes? 
*/ - mflags = mmFlgUseAttr | (flags & VM_MEM_GUARDED) | ((flags & VM_MEM_NOT_CACHEABLE) >> 1); /* Convert to our mapping_make flags */ - } - -/* - * It is possible to hang here if another processor is remapping any pages we collide with and are removing - */ - - while(1) { /* Keep trying the enter until it goes in */ - - colva = mapping_make(pmap, va, pa, mflags, 1, (prot & VM_PROT_ALL) ); /* Enter the mapping into the pmap */ - - if(!colva) break; /* If there were no collisions, we are done... */ - - mapping_remove(pmap, colva); /* Remove the mapping that collided */ - } -} - -/* - * Enters translations for odd-sized V=F blocks. - * - * The higher level VM map should be locked to insure that we don't have a - * double diddle here. - * - * We panic if we get a block that overlaps with another. We do not merge adjacent - * blocks because removing any address within a block removes the entire block and if - * would really mess things up if we trashed too much. - * - * Once a block is mapped, it is unmutable, that is, protection, catch mode, etc. can - * not be changed. The block must be unmapped and then remapped with the new stuff. - * We also do not keep track of reference or change flags. - * - * Any block that is larger than 256MB must be a multiple of 32MB. We panic if it is not. - * - * Note that pmap_map_block_rc is the same but doesn't panic if collision. - * - */ - -void pmap_map_block(pmap_t pmap, addr64_t va, ppnum_t pa, uint32_t size, vm_prot_t prot, int attr, unsigned int flags) { /* Map an autogenned block */ - - unsigned int mflags; - addr64_t colva; - - - if (pmap == PMAP_NULL) { /* Did they give us a pmap? */ - panic("pmap_map_block: null pmap\n"); /* No, like that's dumb... 
*/ - } - -// kprintf("pmap_map_block: (%08X) va = %016llX, pa = %08X, size = %08X, prot = %08X, attr = %08X, flags = %08X\n", /* (BRINGUP) */ -// current_thread(), va, pa, size, prot, attr, flags); /* (BRINGUP) */ - - mflags = mmFlgBlock | mmFlgUseAttr | (attr & VM_MEM_GUARDED) | ((attr & VM_MEM_NOT_CACHEABLE) >> 1); /* Convert to our mapping_make flags */ - if(flags) mflags |= mmFlgPerm; /* Mark permanent if requested */ - - colva = mapping_make(pmap, va, pa, mflags, size, prot); /* Enter the mapping into the pmap */ - - if(colva) { /* If there was a collision, panic */ - panic("pmap_map_block: mapping error %d, pmap = %p, va = %016llX\n", (uint32_t)(colva & mapRetCode), pmap, va); - } - - return; /* Return */ -} - -int pmap_map_block_rc(pmap_t pmap, addr64_t va, ppnum_t pa, uint32_t size, vm_prot_t prot, int attr, unsigned int flags) { /* Map an autogenned block */ - - unsigned int mflags; - addr64_t colva; - - - if (pmap == PMAP_NULL) { /* Did they give us a pmap? */ - panic("pmap_map_block_rc: null pmap\n"); /* No, like that's dumb... */ - } - - mflags = mmFlgBlock | mmFlgUseAttr | (attr & VM_MEM_GUARDED) | ((attr & VM_MEM_NOT_CACHEABLE) >> 1); /* Convert to our mapping_make flags */ - if(flags) mflags |= mmFlgPerm; /* Mark permanent if requested */ - - colva = mapping_make(pmap, va, pa, mflags, size, prot); /* Enter the mapping into the pmap */ - - if(colva) return 0; /* If there was a collision, fail */ - - return 1; /* Return true of we worked */ -} - -/* - * pmap_extract(pmap, va) - * returns the physical address corrsponding to the - * virtual address specified by pmap and va if the - * virtual address is mapped and 0 if it is not. - * Note: we assume nothing is ever mapped to phys 0. - * - * NOTE: This call always will fail for physical addresses greater than 0xFFFFF000. 
- */ -vm_offset_t pmap_extract(pmap_t pmap, vm_map_offset_t va) { - - spl_t spl; - register struct mapping *mp; - register vm_offset_t pa; - addr64_t nextva; - ppnum_t ppoffset; - unsigned int gva; - -#ifdef BOGUSCOMPAT - panic("pmap_extract: THIS CALL IS BOGUS. NEVER USE IT EVER. So there...\n"); /* Don't use this */ -#else - - gva = (unsigned int)va; /* Make sure we don't have a sign */ - - spl = splhigh(); /* We can't allow any loss of control here */ - - mp = mapping_find(pmap, (addr64_t)gva, &nextva,1); /* Find the mapping for this address */ - - if(!mp) { /* Is the page mapped? */ - splx(spl); /* Enable interrupts */ - return 0; /* Pass back 0 if not found */ - } - - ppoffset = (ppnum_t)(((gva & -4096LL) - (mp->mpVAddr & -4096LL)) >> 12); /* Get offset from va to base va */ - - - pa = mp->mpPAddr + ppoffset; /* Remember ppage because mapping may vanish after drop call */ - - mapping_drop_busy(mp); /* We have everything we need from the mapping */ - splx(spl); /* Restore 'rupts */ - - if(pa > maxPPage32) return 0; /* Force large addresses to fail */ - - pa = (pa << 12) | (va & 0xFFF); /* Convert physical page number to address */ - -#endif - return pa; /* Return physical address or 0 */ -} - -/* - * ppnum_t pmap_find_phys(pmap, addr64_t va) - * returns the physical page corrsponding to the - * virtual address specified by pmap and va if the - * virtual address is mapped and 0 if it is not. - * Note: we assume nothing is ever mapped to phys 0. - * - */ -ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va) { - - spl_t spl; - register struct mapping *mp; - ppnum_t pa, ppoffset; - addr64_t nextva; - - spl = splhigh(); /* We can't allow any loss of control here */ - - mp = mapping_find(pmap, va, &nextva, 1); /* Find the mapping for this address */ - - if(!mp) { /* Is the page mapped? 
*/ - splx(spl); /* Enable interrupts */ - return 0; /* Pass back 0 if not found */ - } - - - ppoffset = (ppnum_t)(((va & -4096LL) - (mp->mpVAddr & -4096LL)) >> 12); /* Get offset from va to base va */ - - pa = mp->mpPAddr + ppoffset; /* Get the actual physical address */ - - mapping_drop_busy(mp); /* We have everything we need from the mapping */ - - splx(spl); /* Restore 'rupts */ - return pa; /* Return physical address or 0 */ -} - - -/* - * pmap_attributes: - * - * Set/Get special memory attributes; not implemented. - * - * Note: 'VAL_GET_INFO' is used to return info about a page. - * If less than 1 page is specified, return the physical page - * mapping and a count of the number of mappings to that page. - * If more than one page is specified, return the number - * of resident pages and the number of shared (more than - * one mapping) pages in the range; - * - * - */ -kern_return_t -pmap_attribute( - __unused pmap_t pmap, - __unused vm_map_offset_t address, - __unused vm_map_size_t size, - __unused vm_machine_attribute_t attribute, - __unused vm_machine_attribute_val_t* value) -{ - - return KERN_INVALID_ARGUMENT; - -} - - - -unsigned int pmap_cache_attributes(ppnum_t pgn) { - - unsigned int flags; - struct phys_entry * pp; - - // Find physical address - if ((pp = pmap_find_physentry(pgn))) { - // Use physical attributes as default - // NOTE: DEVICE_PAGER_FLAGS are made to line up - flags = VM_MEM_COHERENT; /* We only support coherent memory */ - if (pp->ppLink & ppG) flags |= VM_MEM_GUARDED; /* Add in guarded if it is */ - if (pp->ppLink & ppI) flags |= VM_MEM_NOT_CACHEABLE; /* Add in cache inhibited if so */ - } else - // If no physical, just hard code attributes - flags = VM_WIMG_IO; - - return (flags); -} - - - -/* - * pmap_attribute_cache_sync(vm_offset_t pa) - * - * Invalidates all of the instruction cache on a physical page and - * pushes any dirty data from the data cache for the same physical page - */ - -kern_return_t pmap_attribute_cache_sync(ppnum_t 
pp, vm_size_t size, - __unused vm_machine_attribute_t attribute, - __unused vm_machine_attribute_val_t* value) { - - spl_t s; - unsigned int i, npages; - - npages = round_page(size) >> 12; /* Get the number of pages to do */ - - for(i = 0; i < npages; i++) { /* Do all requested pages */ - s = splhigh(); /* No interruptions here */ - sync_ppage(pp + i); /* Go flush data cache and invalidate icache */ - splx(s); /* Allow interruptions */ - } - - return KERN_SUCCESS; -} - -/* - * pmap_sync_page_data_phys(ppnum_t pa) - * - * Invalidates all of the instruction cache on a physical page and - * pushes any dirty data from the data cache for the same physical page - */ - -void pmap_sync_page_data_phys(ppnum_t pa) { - - spl_t s; - - s = splhigh(); /* No interruptions here */ - sync_ppage(pa); /* Sync up dem caches */ - splx(s); /* Allow interruptions */ - return; -} - -void -pmap_sync_page_attributes_phys(ppnum_t pa) -{ - pmap_sync_page_data_phys(pa); -} - -#ifdef CURRENTLY_UNUSED_AND_UNTESTED -/* - * pmap_collect - * - * Garbage collects the physical map system for pages that are no longer used. - * It isn't implemented or needed or wanted. - */ -void -pmap_collect(__unused pmap_t pmap) -{ - return; -} -#endif - -/* - * Routine: pmap_activate - * Function: - * Binds the given physical map to the given - * processor, and returns a hardware map description. - * It isn't implemented or needed or wanted. - */ -void -pmap_activate( - __unused pmap_t pmap, - __unused thread_t th, - __unused int which_cpu) -{ - return; -} -/* - * pmap_deactivate: - * It isn't implemented or needed or wanted. - */ -void -pmap_deactivate( - __unused pmap_t pmap, - __unused thread_t th, - __unused int which_cpu) -{ - return; -} - - -/* - * pmap_pageable(pmap, s, e, pageable) - * Make the specified pages (by pmap, offset) - * pageable (or not) as requested. - * - * A page which is not pageable may not take - * a fault; therefore, its page table entry - * must remain valid for the duration. 
- * - * This routine is merely advisory; pmap_enter() - * will specify that these pages are to be wired - * down (or not) as appropriate. - * - * (called from vm/vm_fault.c). - */ -void -pmap_pageable( - __unused pmap_t pmap, - __unused vm_map_offset_t start, - __unused vm_map_offset_t end, - __unused boolean_t pageable) -{ - - return; /* This is not used... */ - -} -/* - * Routine: pmap_change_wiring - * NOT USED ANYMORE. - */ -void -pmap_change_wiring( - __unused pmap_t pmap, - __unused vm_map_offset_t va, - __unused boolean_t wired) -{ - return; /* This is not used... */ -} - -/* - * pmap_clear_modify(phys) - * clears the hardware modified ("dirty") bit for one - * machine independant page starting at the given - * physical address. phys must be aligned on a machine - * independant page boundary. - */ -void -pmap_clear_modify(ppnum_t pa) -{ - - mapping_clr_mod(pa); /* Clear all change bits for physical page */ - -} - -/* - * pmap_is_modified(phys) - * returns TRUE if the given physical page has been modified - * since the last call to pmap_clear_modify(). - */ -boolean_t -pmap_is_modified(register ppnum_t pa) -{ - return mapping_tst_mod(pa); /* Check for modified */ - -} - -/* - * pmap_clear_reference(phys) - * clears the hardware referenced bit in the given machine - * independant physical page. - * - */ -void -pmap_clear_reference(ppnum_t pa) -{ - mapping_clr_ref(pa); /* Check for modified */ -} - -/* - * pmap_is_referenced(phys) - * returns TRUE if the given physical page has been referenced - * since the last call to pmap_clear_reference(). - */ -boolean_t -pmap_is_referenced(ppnum_t pa) -{ - return mapping_tst_ref(pa); /* Check for referenced */ -} - -/* - * pmap_get_refmod(phys) - * returns the referenced and modified bits of the specified - * physical page. 
- */ -unsigned int -pmap_get_refmod(ppnum_t pa) -{ - return (mapping_tst_refmod(pa)); -} - -/* - * pmap_clear_refmod(phys, mask) - * clears the referenced and modified bits as specified by the mask - * of the specified physical page. - */ -void -pmap_clear_refmod(ppnum_t pa, unsigned int mask) -{ - mapping_clr_refmod(pa, mask); -} - -/* - * pmap_eligible_for_execute(ppnum_t pa) - * return true if physical address is eligible to contain executable code; - * otherwise, return false - */ -boolean_t -pmap_eligible_for_execute(ppnum_t pa) -{ - phys_entry_t *physent; - unsigned int pindex; - - physent = mapping_phys_lookup(pa, &pindex); /* Get physical entry */ - - if((!physent) || (physent->ppLink & ppG)) - return 0; /* If there is no physical entry or marked guarded, - the entry is not eligible for execute */ - - return 1; /* Otherwise, entry is eligible for execute */ -} - -#if MACH_VM_DEBUG -int -pmap_list_resident_pages( - __unused pmap_t pmap, - __unused vm_offset_t *listp, - __unused int space) -{ - return 0; -} -#endif /* MACH_VM_DEBUG */ - -/* - * Locking: - * spl: VM - */ -void -pmap_copy_part_page( - vm_offset_t src, - vm_offset_t src_offset, - vm_offset_t dst, - vm_offset_t dst_offset, - vm_size_t len) -{ - addr64_t fsrc, fdst; - - assert((((dst << 12) & PAGE_MASK) + dst_offset + len) <= PAGE_SIZE); - assert((((src << 12) & PAGE_MASK) + src_offset + len) <= PAGE_SIZE); - - fsrc = ((addr64_t)src << 12) + src_offset; - fdst = ((addr64_t)dst << 12) + dst_offset; - - phys_copy(fsrc, fdst, len); /* Copy the stuff physically */ -} - -void -pmap_zero_part_page( - __unused vm_offset_t p, - __unused vm_offset_t offset, - __unused vm_size_t len) -{ - panic("pmap_zero_part_page"); -} - -boolean_t pmap_verify_free(ppnum_t pa) { - - struct phys_entry *pp; - unsigned int pindex; - - pp = mapping_phys_lookup(pa, &pindex); /* Get physical entry */ - if (pp == 0) return FALSE; /* If there isn't one, show no mapping... 
*/ - - if(pp->ppLink & ~(ppLock | ppFlags)) return FALSE; /* We have at least one mapping */ - return TRUE; /* No mappings */ -} - - -/* Determine if we need to switch space and set up for it if so */ - -void pmap_switch(pmap_t map) -{ - hw_blow_seg(lowGlo.lgUMWvaddr); /* Blow off the first segment */ - hw_blow_seg(lowGlo.lgUMWvaddr + 0x10000000ULL); /* Blow off the second segment */ - -/* when changing to kernel space, don't bother - * doing anything, the kernel is mapped from here already. - */ - if (map->space == PPC_SID_KERNEL) { /* Are we switching into kernel space? */ - return; /* If so, we don't do anything... */ - } - - hw_set_user_space(map); /* Indicate if we need to load the SRs or not */ - return; /* Bye, bye, butterfly... */ -} - - -/* - * The PPC pmap can only nest segments of 256MB, aligned on a 256MB boundary. - */ -uint64_t pmap_nesting_size_min = 0x10000000ULL; -uint64_t pmap_nesting_size_max = 0x10000000ULL; - -/* - * kern_return_t pmap_nest(grand, subord, vstart, size) - * - * grand = the pmap that we will nest subord into - * subord = the pmap that goes into the grand - * vstart = start of range in pmap to be inserted - * nstart = start of range in pmap nested pmap - * size = Size of nest area (up to 2TB) - * - * Inserts a pmap into another. This is used to implement shared segments. - * On the current PPC processors, this is limited to segment (256MB) aligned - * segment sized ranges. - * - * We actually kinda allow recursive nests. The gating factor is that we do not allow - * nesting on top of something that is already mapped, i.e., the range must be empty. - * - * Note that we depend upon higher level VM locks to insure that things don't change while - * we are doing this. For example, VM should not be doing any pmap enters while it is nesting - * or do 2 nests at once. 
- */ - -kern_return_t pmap_nest(pmap_t grand, pmap_t subord, addr64_t vstart, addr64_t nstart, uint64_t size) { - - addr64_t vend, colladdr; - unsigned int msize; - int nlists; - mapping_t *mp; - - if(size & 0x0FFFFFFFULL) return KERN_INVALID_VALUE; /* We can only do this for multiples of 256MB */ - if((size >> 25) > 65536) return KERN_INVALID_VALUE; /* Max size we can nest is 2TB */ - if(vstart & 0x0FFFFFFFULL) return KERN_INVALID_VALUE; /* We can only do this aligned to 256MB */ - if(nstart & 0x0FFFFFFFULL) return KERN_INVALID_VALUE; /* We can only do this aligned to 256MB */ - - if(size == 0) { /* Is the size valid? */ - panic("pmap_nest: size is invalid - %016llX\n", size); - } - - msize = (size >> 25) - 1; /* Change size to blocks of 32MB */ - - nlists = mapSetLists(grand); /* Set number of lists this will be on */ - - mp = mapping_alloc(nlists); /* Get a spare mapping block */ - - mp->mpFlags = 0x01000000 | mpNest | mpPerm | mpBSu | nlists; /* Make this a permanent nested pmap with a 32MB basic size unit */ - /* Set the flags. Make sure busy count is 1 */ - mp->mpSpace = subord->space; /* Set the address space/pmap lookup ID */ - mp->u.mpBSize = msize; /* Set the size */ - mp->mpPte = 0; /* Set the PTE invalid */ - mp->mpPAddr = 0; /* Set the physical page number */ - mp->mpVAddr = vstart; /* Set the address */ - mp->mpNestReloc = nstart - vstart; /* Set grand to nested vaddr relocation value */ - - colladdr = hw_add_map(grand, mp); /* Go add the mapping to the pmap */ - - if(colladdr) { /* Did it collide? 
*/ - vend = vstart + size - 4096; /* Point to the last page we would cover in nest */ - panic("pmap_nest: attempt to nest into a non-empty range - pmap = %p, start = %016llX, end = %016llX\n", - grand, vstart, vend); - } - - return KERN_SUCCESS; -} - -/* - * kern_return_t pmap_unnest(grand, vaddr, size) - * - * grand = the pmap that we will nest subord into - * vaddr = start of range in pmap to be unnested - * size = size of range in pmap to be unnested - * - * Removes a pmap from another. This is used to implement shared segments. - * On the current PPC processors, this is limited to segment (256MB) aligned - * segment sized ranges. - */ - -kern_return_t pmap_unnest(pmap_t grand, addr64_t vaddr, uint64_t size) { - - unsigned int tstamp, i, mycpu; - addr64_t nextva; - spl_t s; - mapping_t *mp; - - if (size != pmap_nesting_size_min || - (vaddr & (pmap_nesting_size_min-1))) { - panic("pmap_unnest(vaddr=0x%016llx, size=0x016%llx): " - "must be 256MB and aligned\n", - vaddr, size); - } - - s = splhigh(); /* Make sure interruptions are disabled */ - - mp = mapping_find(grand, vaddr, &nextva, 0); /* Find the nested map */ - - if(((unsigned int)mp & mapRetCode) != mapRtOK) { /* See if it was even nested */ - panic("pmap_unnest: Attempt to unnest an unnested segment - va = %016llX\n", vaddr); - } - - if((mp->mpFlags & mpType) != mpNest) { /* Did we find something other than a nest? 
*/ - panic("pmap_unnest: Attempt to unnest something that is not a nest - va = %016llX\n", vaddr); - } - - if(mp->mpVAddr != vaddr) { /* Make sure the address is the same */ - panic("pmap_unnest: Attempt to unnest something that is not at start of nest - va = %016llX\n", vaddr); - } - - hw_atomic_and_noret(&mp->mpFlags, ~mpPerm); /* Show that this mapping is now removable */ - - mapping_drop_busy(mp); /* Go ahead and release the mapping now */ - - splx(s); /* Restore 'rupts */ - - (void)mapping_remove(grand, vaddr); /* Toss the nested pmap mapping */ - - invalidateSegs(grand); /* Invalidate the pmap segment cache */ - -/* - * Note that the following will force the segment registers to be reloaded - * on all processors (if they are using the pmap we just changed) before returning. - * - * This is needed. The reason is that until the segment register is - * reloaded, another thread in the same task on a different processor will - * be able to access memory that it isn't allowed to anymore. That can happen - * because access to the subordinate pmap is being removed, but the pmap is still - * valid. - * - * Note that we only kick the other processor if we see that it was using the pmap while we - * were changing it. - */ - - - for(i=0; i < real_ncpus; i++) { /* Cycle through processors */ - disable_preemption(); - mycpu = cpu_number(); /* Who am I? Am I just a dream? */ - if((unsigned int)grand == PerProcTable[i].ppe_vaddr->ppUserPmapVirt) { /* Is this guy using the changed pmap? 
*/ - - PerProcTable[i].ppe_vaddr->ppInvSeg = 1; /* Show that we need to invalidate the segments */ - - if(i != mycpu) { - - tstamp = PerProcTable[i].ppe_vaddr->ruptStamp[1]; /* Save the processor's last interrupt time stamp */ - if(cpu_signal(i, SIGPcpureq, CPRQsegload, 0) == KERN_SUCCESS) { /* Make sure we see the pmap change */ - if(!hw_cpu_wcng(&PerProcTable[i].ppe_vaddr->ruptStamp[1], tstamp, LockTimeOut)) { /* Wait for the other processors to enter debug */ - panic("pmap_unnest: Other processor (%d) did not see interruption request\n", i); - } - } - } - } - enable_preemption(); - } - - return KERN_SUCCESS; /* Bye, bye, butterfly... */ -} - -boolean_t pmap_adjust_unnest_parameters(__unused pmap_t p, __unused vm_map_offset_t *s, __unused vm_map_offset_t *e) { - return FALSE; /* Not implemented on PowerPC */ -} - -/* - * void MapUserMemoryWindowInit(void) - * - * Initialize anything we need to in order to map user address space slices into - * the kernel. Primarily used for copy in/out. - * - * Currently we only support one 512MB slot for this purpose. There are two special - * mappings defined for the purpose: the special pmap nest, and linkage mapping. - * - * The special pmap nest (which is allocated in this function) is used as a place holder - * in the kernel's pmap search list. It is 512MB long and covers the address range - * starting at lgUMWvaddr. It points to no actual memory and when the fault handler - * hits in it, it knows to look in the per_proc and start using the linkage - * mapping contained therin. - * - * The linkage mapping is used to glue the user address space slice into the - * kernel. It contains the relocation information used to transform the faulting - * kernel address into the user address space. It also provides the link to the - * user's pmap. This is pointed to by the per_proc and is switched in and out - * whenever there is a context switch. 
- * - */ - -void MapUserMemoryWindowInit(void) { - - addr64_t colladdr; - int nlists; - mapping_t *mp; - - nlists = mapSetLists(kernel_pmap); /* Set number of lists this will be on */ - - mp = mapping_alloc(nlists); /* Get a spare mapping block */ - - mp->mpFlags = 0x01000000 | mpLinkage | mpPerm | mpBSu | nlists; /* Make this a permanent nested pmap with a 32MB basic size unit */ - /* Set the flags. Make sure busy count is 1 */ - mp->mpSpace = kernel_pmap->space; /* Set the address space/pmap lookup ID */ - mp->u.mpBSize = 15; /* Set the size to 2 segments in 32MB chunks - 1 */ - mp->mpPte = 0; /* Means nothing */ - mp->mpPAddr = 0; /* Means nothing */ - mp->mpVAddr = lowGlo.lgUMWvaddr; /* Set the address range we cover */ - mp->mpNestReloc = 0; /* Means nothing */ - - colladdr = hw_add_map(kernel_pmap, mp); /* Go add the mapping to the pmap */ - - if(colladdr) { /* Did it collide? */ - panic("MapUserMemoryWindowInit: MapUserMemoryWindow range already mapped\n"); - } - - return; -} - -/* - * addr64_t MapUserMemoryWindow(vm_map_t map, vm_offset_t va, size) - * - * map = the vm_map that we are mapping into the kernel - * va = start of the address range we are mapping - * Note that we do not test validty, we chose to trust our fellows... - * - * Maps a 512M slice of a user address space into a predefined kernel range - * on a per-thread basis. We map only the first 256M segment, allowing the - * second 256M segment to fault in as needed. This allows our clients to access - * an arbitrarily aligned operand up to 256M in size. - * - * In the future, the restriction of a predefined range may be loosened. - * - * Builds the proper linkage map to map the user range - * We will round this down to the previous segment boundary and calculate - * the relocation to the kernel slot - * - * We always make a segment table entry here if we need to. This is mainly because of - * copyin/out and if we don't, there will be multiple segment faults for - * each system call. 
I have seen upwards of 30000 per second. - * - * We do check, however, to see if the slice is already mapped and if so, - * we just exit. This is done for performance reasons. It was found that - * there was a considerable boost in copyin/out performance if we did not - * invalidate the segment at ReleaseUserAddressSpace time, so we dumped the - * restriction that you had to bracket MapUserMemoryWindow. Further, there - * is a yet further boost if you didn't need to map it each time. The theory - * behind this is that many times copies are to or from the same segment and - * done multiple times within the same system call. To take advantage of that, - * we check umwSpace and umwRelo to see if we've already got it. - * - * We also need to half-invalidate the slice when we context switch or go - * back to user state. A half-invalidate does not clear the actual mapping, - * but it does force the MapUserMemoryWindow function to reload the segment - * register/SLBE. If this is not done, we can end up some pretty severe - * performance penalties. If we map a slice, and the cached space/relocation is - * the same, we won't reload the segment registers. Howver, since we ran someone else, - * our SR is cleared and we will take a fault. This is reasonable if we block - * while copying (e.g., we took a page fault), but it is not reasonable when we - * just start. For this reason, we half-invalidate to make sure that the SR is - * explicitly reloaded. - * - * Note that we do not go to the trouble of making a pmap segment cache - * entry for these guys because they are very short term -- 99.99% of the time - * they will be unmapped before the next context switch. 
- * - */ - -addr64_t MapUserMemoryWindow( - vm_map_t map, - addr64_t va) { - - addr64_t baddrs, reladd; - thread_t thread; - mapping_t *mp; - - baddrs = va & 0xFFFFFFFFF0000000ULL; /* Isolate the segment */ - thread = current_thread(); /* Remember our activation */ - - reladd = baddrs - lowGlo.lgUMWvaddr; /* Get the relocation from user to kernel */ - - if((thread->machine.umwSpace == map->pmap->space) && (thread->machine.umwRelo == reladd)) { /* Already mapped? */ - return ((va & 0x0FFFFFFFULL) | lowGlo.lgUMWvaddr); /* Pass back the kernel address we are to use */ - } - - disable_preemption(); /* Don't move... */ - - mp = (mapping_t *)&(getPerProc()->ppUMWmp); /* Make up for C */ - thread->machine.umwRelo = reladd; /* Relocation from user to kernel */ - mp->mpNestReloc = reladd; /* Relocation from user to kernel */ - - thread->machine.umwSpace = map->pmap->space; /* Set the address space/pmap lookup ID */ - mp->mpSpace = map->pmap->space; /* Set the address space/pmap lookup ID */ - -/* - * Here we make an assumption that we are going to be using the base pmap's address space. - * If we are wrong, and that would be very, very, very rare, the fault handler will fix us up. 
- */ - - hw_map_seg(map->pmap, lowGlo.lgUMWvaddr, baddrs); /* Make the entry for the first segment */ - - enable_preemption(); /* Let's move */ - return ((va & 0x0FFFFFFFULL) | lowGlo.lgUMWvaddr); /* Pass back the kernel address we are to use */ -} - -#if CONFIG_DTRACE -/* - * Constrain DTrace copyin/copyout actions - */ -extern kern_return_t dtrace_copyio_preflight(addr64_t); -extern kern_return_t dtrace_copyio_postflight(addr64_t); - -kern_return_t dtrace_copyio_preflight(__unused addr64_t va) -{ - if (current_map() == kernel_map) - return KERN_FAILURE; - else - return KERN_SUCCESS; -} - -kern_return_t dtrace_copyio_postflight(__unused addr64_t va) -{ - thread_t thread = current_thread(); - - thread->machine.umwSpace |= umwSwitchAway; - return KERN_SUCCESS; -} -#endif /* CONFIG_DTRACE */ - -/* - * kern_return_t pmap_boot_map(size) - * - * size = size of virtual address range to be mapped - * - * This function is used to assign a range of virtual addresses before VM in - * initialized. It starts at VM_MAX_KERNEL_ADDRESS and works downward. - * The variable vm_last_addr contains the current highest possible VM - * assignable address. It is a panic to attempt to call this after VM has - * started up. The only problem is, is that we may not have the serial or - * framebuffer mapped, so we'll never know we died......... - */ - -vm_offset_t pmap_boot_map(vm_size_t size) { - - if(kernel_map != VM_MAP_NULL) { /* Has VM already started? 
*/ - panic("pmap_boot_map: VM started\n"); - } - - size = round_page(size); /* Make sure this is in pages */ - vm_last_addr = vm_last_addr - size; /* Allocate the memory */ - return (vm_last_addr + 1); /* Return the vaddr we just allocated */ - -} - - -/* - * void pmap_init_sharedpage(void); - * - * Hack map for the 64-bit commpage - */ - -void pmap_init_sharedpage(vm_offset_t cpg){ - - addr64_t cva, cpoff; - ppnum_t cpphys; - - sharedPmap = pmap_create(0, FALSE); /* Get a pmap to hold the common segment */ - if(!sharedPmap) { /* Check for errors */ - panic("pmap_init_sharedpage: couldn't make sharedPmap\n"); - } - - for(cpoff = 0; cpoff < _COMM_PAGE_AREA_USED; cpoff += 4096) { /* Step along now */ - - cpphys = pmap_find_phys(kernel_pmap, (addr64_t)cpg + cpoff); - if(!cpphys) { - panic("pmap_init_sharedpage: compage %016llX not mapped in kernel\n", cpg + cpoff); - } - - cva = mapping_make(sharedPmap, (addr64_t)((uint32_t)_COMM_PAGE_BASE_ADDRESS) + cpoff, - cpphys, mmFlgPerm, 1, VM_PROT_READ | VM_PROT_EXECUTE); /* Map the page read/execute only */ - if(cva) { /* Check for errors */ - panic("pmap_init_sharedpage: couldn't map commpage page - cva = %016llX\n", cva); - } - - } - - return; -} - - -/* - * void pmap_map_sharedpage(pmap_t pmap); - * - * Maps the last segment in a 64-bit address space - * - * - */ - -void pmap_map_sharedpage(task_t task, pmap_t pmap){ - - kern_return_t ret; - - if(task_has_64BitAddr(task) || _cpu_capabilities & k64Bit) { /* Should we map the 64-bit page -1? */ - ret = pmap_nest(pmap, sharedPmap, 0xFFFFFFFFF0000000ULL, 0x00000000F0000000ULL, - 0x0000000010000000ULL); /* Nest the highest possible segment to map comm page */ - if(ret != KERN_SUCCESS) { /* Did it work? 
*/ - panic("pmap_map_sharedpage: couldn't nest shared page - ret = %08X\n", ret); - } - } - - return; -} - - -/* - * void pmap_unmap_sharedpage(pmap_t pmap); - * - * Unmaps the last segment in a 64-bit address space - * - */ - -void pmap_unmap_sharedpage(pmap_t pmap){ - - kern_return_t ret; - mapping_t *mp; - boolean_t inter; - int gotnest; - addr64_t nextva; - - if(BootProcInfo.pf.Available & pf64Bit) { /* Are we on a 64-bit machine? */ - - inter = ml_set_interrupts_enabled(FALSE); /* Disable interruptions for now */ - mp = hw_find_map(pmap, 0xFFFFFFFFF0000000ULL, &nextva); /* Find the mapping for this address */ - if((unsigned int)mp == mapRtBadLk) { /* Did we lock up ok? */ - panic("pmap_unmap_sharedpage: mapping lock failure - rc = %p, pmap = %p\n", mp, pmap); /* Die... */ - } - - gotnest = 0; /* Assume nothing here */ - if(mp) { - gotnest = ((mp->mpFlags & mpType) == mpNest); - /* Remember if we have a nest here */ - mapping_drop_busy(mp); /* We have everything we need from the mapping */ - } - ml_set_interrupts_enabled(inter); /* Put interrupts back to what they were */ - - if(!gotnest) return; /* Leave if there isn't any nesting here */ - - ret = pmap_unnest(pmap, 0xFFFFFFFFF0000000ULL, 0x0000000010000000ULL); /* Unnest the max 64-bit page */ - - if(ret != KERN_SUCCESS) { /* Did it work? */ - panic("pmap_unmap_sharedpage: couldn't unnest shared page - ret = %08X\n", ret); - } - } - - return; -} - - -/* temporary workaround */ -boolean_t -coredumpok( - __unused vm_map_t map, - __unused vm_offset_t va) -{ - return TRUE; -} - - -/* - * disable no-execute capability on - * the specified pmap - */ -void pmap_disable_NX(pmap_t pmap) { - - pmap->pmapFlags |= pmapNXdisabled; -} - diff --git a/osfmk/ppc/pmap.h b/osfmk/ppc/pmap.h deleted file mode 100644 index 24db51ea2..000000000 --- a/osfmk/ppc/pmap.h +++ /dev/null @@ -1,338 +0,0 @@ -/* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Copyright (c) 1990 The University of Utah and - * the Center for Software Science at the University of Utah (CSS). - * All rights reserved. 
- * - * Permission to use, copy, modify and distribute this software is hereby - * granted provided that (1) source code retains these copyright, permission, - * and disclaimer notices, and (2) redistributions including binaries - * reproduce the notices in supporting documentation, and (3) all advertising - * materials mentioning features or use of this software display the following - * acknowledgement: ``This product includes software developed by the Center - * for Software Science at the University of Utah.'' - * - * THE UNIVERSITY OF UTAH AND CSS ALLOW FREE USE OF THIS SOFTWARE IN ITS "AS - * IS" CONDITION. THE UNIVERSITY OF UTAH AND CSS DISCLAIM ANY LIABILITY OF - * ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * CSS requests users of this software to return to css-dist@cs.utah.edu any - * improvements that they make and grant CSS redistribution rights. - * - * Utah $Hdr: pmap.h 1.13 91/09/25$ - * Author: Mike Hibler, Bob Wheeler, University of Utah CSS, 9/90 - */ - -#ifndef _PPC_PMAP_H_ -#define _PPC_PMAP_H_ - -#include -#include -#include -#include -#include -#include -#include - -#define maxPPage32 0x000FFFFF /* Maximum page number in 32-bit machines */ - -typedef uint32_t shexlock; - -#pragma pack(4) /* Make sure the structure stays as we defined it */ - -struct sgc { - uint64_t sgcESID; /* ESID portion of segment cache */ -#define sgcESmsk 0xFFFFFFFFF0000000ULL /* ESID portion of segment register cache */ - uint64_t sgcVSID; /* VSID portion of segment cache */ -#define sgcVSmsk 0xFFFFFFFFFFFFF000ULL /* VSID mask */ -#define sgcVSKeys 0x0000000000000C00ULL /* Protection keys */ -#define sgcVSKeyUsr 53 /* User protection key */ -#define sgcVSNoEx 0x0000000000000200ULL /* No execute */ -}; -#pragma pack() - -typedef struct sgc sgc; - -#pragma pack(4) /* Make sure the structure stays as we defined it */ -struct pmap_vmm_stats { - unsigned int vxsGpf; /* Guest faults */ - unsigned int vxsGpfMiss; /* Faults that miss in 
hash table */ - - unsigned int vxsGrm; /* Guest mapping remove requests */ - unsigned int vxsGrmMiss; /* Remove misses in hash table */ - unsigned int vxsGrmActive; /* Remove hits that are active */ - - unsigned int vxsGra; /* Guest remove all mappings requests */ - unsigned int vxsGraHits; /* Remove hits in hash table */ - unsigned int vxsGraActive; /* Remove hits that are active */ - - unsigned int vxsGrl; /* Guest remove local mappings requests */ - unsigned int vxsGrlActive; /* Active mappings removed */ - - unsigned int vxsGrs; /* Guest mapping resumes */ - unsigned int vxsGrsHitAct; /* Resume hits active entry */ - unsigned int vxsGrsHitSusp; /* Resume hits suspended entry */ - unsigned int vxsGrsMissGV; /* Resume misses on guest virtual */ - unsigned int vxsGrsHitPE; /* Resume hits on host virtual */ - unsigned int vxsGrsMissPE; /* Resume misses on host virtual */ - - unsigned int vxsGad; /* Guest mapping adds */ - unsigned int vxsGadHit; /* Add hits entry (active or dormant) */ - unsigned int vxsGadFree; /* Add takes free entry in group */ - unsigned int vxsGadDormant; /* Add steals dormant entry in group */ - unsigned int vxsGadSteal; /* Add steals active entry in group */ - - unsigned int vxsGsu; /* Guest mapping suspends */ - unsigned int vxsGsuHit; /* Suspend hits entry (active only) */ - unsigned int vxsGsuMiss; /* Suspend misses entry */ - - unsigned int vxsGtd; /* Guest test ref&chg */ - unsigned int vxsGtdHit; /* Test r&c hits entry (active only) */ - unsigned int vxsGtdMiss; /* Test r&c misses entry */ -}; -#pragma pack() -typedef struct pmap_vmm_stats pmap_vmm_stats; - -/* Not wanting to tax all of our customers for the sins of those that use virtual operating - systems, we've built the hash table from its own primitive virtual memory. We first - allocate a pmap_vmm_ext with sufficient space following to accomodate the hash table - index (one 64-bit physical address per 4k-byte page of hash table). 
The allocation - must not cross a 4k-byte page boundary (we'll be accessing the block with relocation - off), so we'll try a couple of times, then just burn a whole page. We stuff the effective - address of the cache-aligned index into hIdxBase; the physical-mode code locates the index - by adding the size of a pmap_vmm_extension to its translated physical address, then rounding - up to the next 32-byte boundary. Now we grab enough virtual pages to contain the hash table, - and fill in the index with the page's physical addresses. For the final touch that's sure - to please, we initialize the hash table. Mmmmm, golden brown perfection. - */ - -#pragma pack(4) -struct pmap_vmm_ext { - addr64_t vmxSalt; /* This block's virt<->real conversion salt */ - addr64_t vmxHostPmapPhys; /* Host pmap physical address */ - struct pmap *vmxHostPmap; /* Host pmap effective address */ - addr64_t *vmxHashPgIdx; /* Hash table physical index base address */ - vm_offset_t *vmxHashPgList; /* List of virtual pages comprising the hash table */ - unsigned int *vmxActiveBitmap; /* Bitmap of active mappings in hash table */ - pmap_vmm_stats vmxStats; /* Stats for VMM assists */ -#define VMX_HPIDX_OFFSET ((sizeof(pmap_vmm_ext) + 127) & ~127) - /* The hash table physical index begins at the first - 128-byte boundary after the pmap_vmm_ext struct */ -#define VMX_HPLIST_OFFSET (VMX_HPIDX_OFFSET + (GV_HPAGES * sizeof(addr64_t))) -#define VMX_ACTMAP_OFFSET (VMX_HPLIST_OFFSET + (GV_HPAGES * sizeof(vm_offset_t))) -}; -#pragma pack() -typedef struct pmap_vmm_ext pmap_vmm_ext; - -#pragma pack(4) /* Make sure the structure stays as we defined it */ -struct pmap { - queue_head_t pmap_link; /* MUST BE FIRST */ - addr64_t pmapvr; /* Virtual to real conversion mask */ - shexlock pmapSXlk; /* Shared/Exclusive lock for mapping changes */ - unsigned int space; /* space for this pmap */ -#define invalSpace 0x00000001 /* Predefined always invalid space */ - uint32_t ref_count; /* reference count */ - unsigned 
int pmapFlags; /* Flags */ -#define pmapKeys 0x00000007 /* Keys and no execute bit to use with this pmap */ -#define pmapKeyDef 0x00000006 /* Default keys - Sup = 1, user = 1, no ex = 0 */ -#define pmapVMhost 0x00000010 /* pmap with Virtual Machines attached to it */ -#define pmapVMgsaa 0x00000020 /* Guest shadow assist active */ -#define pmapNXdisabled 0x00000040 /* no-execute disabled for this pmap */ - unsigned int spaceNum; /* Space number */ - unsigned int pmapCCtl; /* Cache control */ -#define pmapCCtlVal 0xFFFF0000 /* Valid entries */ -#define pmapCCtlLck 0x00008000 /* Lock bit */ -#define pmapCCtlLckb 16 /* Lock bit */ -#define pmapCCtlGen 0x00007FFF /* Generation number */ - -#define pmapSegCacheCnt 16 /* Maximum number of cache entries */ -#define pmapSegCacheUse 16 /* Number of cache entries to use */ - - struct pmap *freepmap; /* Free pmaps */ - pmap_vmm_ext *pmapVmmExt; /* VMM extension block, for VMM host and guest pmaps */ - addr64_t pmapVmmExtPhys; /* VMM extension block physical address */ -/* 0x038 */ - uint64_t pmapSCSubTag; /* Segment cache sub-tags. 
This is a 16 entry 4 bit array */ -/* 0x040 */ - sgc pmapSegCache[pmapSegCacheCnt]; /* SLD values cached for quick load */ - -/* 0x140 */ -/* if fanout is 4, then shift is 1, if fanout is 8 shift is 2, etc */ -#define kSkipListFanoutShift 1 -/* with n lists, we can handle (fanout**n) pages optimally */ -#define kSkipListMaxLists 12 - unsigned char pmapCurLists; /* 0x140 - max #lists any mapping in this pmap currently has */ - unsigned char pmapRsv2[3]; - uint32_t pmapRandNum; /* 0x144 - used by mapSetLists() as a random number generator */ - addr64_t pmapSkipLists[kSkipListMaxLists]; /* 0x148 - the list headers */ -/* following statistics conditionally gathered */ - uint64_t pmapSearchVisits; /* 0x1A8 - nodes visited searching pmaps */ - uint32_t pmapSearchCnt; /* 0x1B0 - number of calls to mapSearch or mapSearchFull */ - - unsigned int pmapRsv3[3]; - -/* 0x1C0 */ - - struct pmap_statistics stats; /* statistics */ - -/* Need to pad out to a power of 2 - right now it is 512 bytes */ -#define pmapSize 512 -}; -#pragma pack() - -#pragma pack(4) -struct pmapTransTab { - addr64_t pmapPAddr; /* Physcial address of pmap */ - unsigned int pmapVAddr; /* Virtual address of pmap */ -}; -#pragma pack() /* Make sure the structure stays as we defined it */ - -typedef struct pmapTransTab pmapTransTab; - -/* - * Address Chunk IDentified Table - */ - -struct acidTabEnt { - unsigned int acidVAddr; /* Virtual address of pmap or pointer to next free entry */ - unsigned int acidGas; /* reserved */ - addr64_t acidPAddr; /* Physcial address of pmap */ -}; - -typedef struct acidTabEnt acidTabEnt; - -extern acidTabEnt *acidTab; /* Pointer to acid table */ -extern acidTabEnt *acidFree; /* List of free acid entries */ - -#define PMAP_NULL ((pmap_t) 0) - -extern pmap_t cursor_pmap; /* The pmap to start allocations with */ -extern pmap_t sharedPmap; -extern unsigned int sharedPage; -extern int ppc_max_adrsp; /* Maximum number of concurrent address spaces allowed. 
*/ -extern addr64_t vm_max_address; /* Maximum effective address supported */ -extern addr64_t vm_max_physical; /* Maximum physical address supported */ -extern pmapTransTab *pmapTrans; /* Space to pmap translate table */ -#define PMAP_SWITCH_USER(th, map, my_cpu) th->map = map; - -#define PMAP_CONTEXT(pmap,th) - -#define pmap_kernel_va(VA) \ - (((VA) >= VM_MIN_KERNEL_ADDRESS) && ((VA) <= vm_last_addr)) - -#define PPC_SID_KERNEL 0 /* Must change KERNEL_SEG_REG0_VALUE if !0 */ - -#define maxAdrSp 16384 -#define maxAdrSpb 14 -#define USER_MEM_WINDOW_VADDR 0x00000000E0000000ULL -#define PHYS_MEM_WINDOW_VADDR 0x0000000100000000ULL -#define IO_MEM_WINDOW_VADDR 0x0000000080000000ULL -#define IO_MEM_WINDOW_SIZE 0x0000000080000000ULL -#define pmapSmallBlock 65536 - -#define pmap_kernel() (kernel_pmap) -#define pmap_resident_count(pmap) ((pmap)->stats.resident_count) -#define pmap_resident_max(pmap) ((pmap)->stats.resident_max) -#define pmap_remove_attributes(pmap,start,end) -#define pmap_copy(dpmap,spmap,da,len,sa) -#define pmap_update() - -#define PMAP_DEFAULT_CACHE 0 -#define PMAP_INHIBIT_CACHE 1 -#define PMAP_GUARDED_CACHE 2 -#define PMAP_ACTIVATE_CACHE 4 -#define PMAP_NO_GUARD_CACHE 8 - -/* corresponds to cached, coherent, not writethru, not guarded */ -#define VM_WIMG_DEFAULT (VM_MEM_COHERENT) -#define VM_WIMG_COPYBACK (VM_MEM_COHERENT) -#define VM_WIMG_IO (VM_MEM_COHERENT | \ - VM_MEM_NOT_CACHEABLE | VM_MEM_GUARDED) -#define VM_WIMG_WTHRU (VM_MEM_WRITE_THROUGH | VM_MEM_COHERENT | VM_MEM_GUARDED) -/* write combining mode, aka store gather */ -#define VM_WIMG_WCOMB (VM_MEM_NOT_CACHEABLE | VM_MEM_COHERENT) - -/* superpages */ -#define SUPERPAGE_NBASEPAGES 1 /* we don't support superpages on PowerPC */ - -/* - * prototypes. 
- */ -extern addr64_t kvtophys(vm_offset_t va); /* Get physical address from kernel virtual */ -extern vm_map_offset_t kvtophys64(vm_map_offset_t va); /* Get 64-bit physical address from kernel virtual */ -extern vm_offset_t pmap_map(vm_offset_t va, - vm_offset_t spa, - vm_offset_t epa, - vm_prot_t prot, - unsigned int flags); -extern kern_return_t pmap_add_physical_memory(vm_offset_t spa, - vm_offset_t epa, - boolean_t available, - unsigned int attr); -extern void pmap_bootstrap(uint64_t msize, - vm_offset_t *first_avail, - unsigned int kmapsize); - -extern vm_offset_t pmap_boot_map(vm_size_t size); - -extern void sync_cache64(addr64_t pa, unsigned length); -extern void sync_ppage(ppnum_t pa); -extern void sync_cache_virtual(vm_offset_t va, unsigned length); -extern void flush_dcache(vm_offset_t va, unsigned length, boolean_t phys); -extern void flush_dcache64(addr64_t va, unsigned length, boolean_t phys); -extern void invalidate_dcache(vm_offset_t va, unsigned length, boolean_t phys); -extern void invalidate_dcache64(addr64_t va, unsigned length, boolean_t phys); -extern void invalidate_icache(vm_offset_t va, unsigned length, boolean_t phys); -extern void invalidate_icache64(addr64_t va, unsigned length, boolean_t phys); -extern void pmap_map_block(pmap_t pmap, addr64_t va, ppnum_t pa, uint32_t size, vm_prot_t prot, int attr, unsigned int flags); -extern int pmap_map_block_rc(pmap_t pmap, addr64_t va, ppnum_t pa, uint32_t size, vm_prot_t prot, int attr, unsigned int flags); - -extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va); -extern void MapUserMemoryWindowInit(void); -extern addr64_t MapUserMemoryWindow(vm_map_t map, addr64_t va); -extern boolean_t pmap_eligible_for_execute(ppnum_t pa); -extern int pmap_list_resident_pages( - struct pmap *pmap, - vm_offset_t *listp, - int space); -extern void pmap_init_sharedpage(vm_offset_t cpg); -extern void pmap_disable_NX(pmap_t pmap); - -extern boolean_t pmap_valid_page( - ppnum_t pn); - -/* Not required for ppc: */ 
-static inline void pmap_set_4GB_pagezero(__unused pmap_t pmap) {} -static inline void pmap_clear_4GB_pagezero(__unused pmap_t pmap) {} - -#endif /* _PPC_PMAP_H_ */ - diff --git a/osfmk/ppc/pms.c b/osfmk/ppc/pms.c deleted file mode 100644 index fb69f7618..000000000 --- a/osfmk/ppc/pms.c +++ /dev/null @@ -1,743 +0,0 @@ -/* - * Copyright (c) 2005-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#include -#include -#ifdef __ppc__ -# include -# include -# include -#else -# include -# include -#endif -#include -#include -#include -#include -#include - -extern int is_suser(void); - -static uint32_t pmsSyncrolator = 0; /* Only one control operation at a time please */ -uint32_t pmsBroadcastWait = 0; /* Number of outstanding broadcasts */ - -int pmsInstalled = 0; /* Power Management Stepper can run and has table installed */ -int pmsExperimental = 0; /* Power Management Stepper in experimental mode */ -decl_simple_lock_data(,pmsBuildLock) /* Make sure only one guy can replace table at the same time */ - -static pmsDef *altDpmsTab; /* Alternate step definition table */ -static uint32_t altDpmsTabSize = 0; /* Size of alternate step definition table */ - -pmsDef pmsDummy = { /* This is the dummy step for initialization. All it does is to park */ - .pmsLimit = 0, /* Time doesn't matter for a park */ - .pmsStepID = pmsMaxStates - 1, /* Use the very last ID number for the dummy */ - .pmsSetCmd = pmsParkIt, /* Force us to be parked */ - .sf.pmsSetFuncInd = 0, /* No platform call for this one */ - .pmsDown = pmsPrepSleep, /* We always park */ - .pmsNext = pmsPrepSleep /* We always park */ -}; - -pmsStat pmsStatsd[4][pmsMaxStates]; /* Generate enough statistics blocks for 4 processors */ - -pmsCtl pmsCtls = { /* Power Management Stepper control */ - .pmsStats = pmsStatsd, -}; - -pmsSetFunc_t pmsFuncTab[pmsSetFuncMax] = {NULL}; /* This is the function index table */ -pmsQueryFunc_t pmsQueryFunc; /* Pointer to pmsQuery function */ -uint32_t pmsPlatformData = 0; /* Data provided by and passed to platform functions */ - -#ifdef __ppc__ -# define PER_PROC_INFO struct per_proc_info -# define GET_PER_PROC_INFO() getPerProc() -#else -# define PER_PROC_INFO cpu_data_t -# define GET_PER_PROC_INFO() current_cpu_datap() -#endif - - - -/* - * Do any initialization needed - */ - -void -pmsInit(void) -{ - int i; - - 
simple_lock_init(&pmsBuildLock, 0); /* Initialize the build lock */ - for(i = 0; i < pmsMaxStates; i++) pmsCtls.pmsDefs[i] = &pmsDummy; /* Initialize the table to dummy steps */ - - pmsCPUMachineInit(); -} - - -/* - * Start the power management stepper on all processors - * - * All processors must be parked. This should be called when the hardware - * is ready to step. Probably only at boot and after wake from sleep. - * - */ - - void - pmsStart(void) -{ - boolean_t intr; - - if(!pmsInstalled) - return; /* We can't do this if no table installed */ - - intr = ml_set_interrupts_enabled(FALSE); /* No interruptions in here */ - pmsRun(pmsStartUp); /* Start running the stepper everywhere */ - (void)ml_set_interrupts_enabled(intr); /* Restore interruptions */ - } - - -/* - * Park the stepper execution. This will force the stepper on this - * processor to abandon its current step and stop. No changes to the - * hardware state is made and any previous step is lost. - * - * This is used as the initial state at startup and when the step table - * is being changed. - * - */ - -void -pmsPark(void) -{ - boolean_t intr; - - if(!pmsInstalled) - return; /* We can't do this if no table installed */ - - intr = ml_set_interrupts_enabled(FALSE); /* No interruptions in here */ - pmsSetStep(pmsParked, 0); /* Park the stepper */ - (void)ml_set_interrupts_enabled(intr); /* Restore interruptions */ -} - - -/* - * Steps down to a lower power. - * Interrupts must be off... - */ - -void -pmsDown(void) -{ - PER_PROC_INFO *pp; - uint32_t nstate; - - pp = GET_PER_PROC_INFO(); /* Get our per_proc */ - - if(!pmsInstalled || pp->pms.pmsState == pmsParked) - return; /* No stepping if parked or not installed */ - - nstate = pmsCtls.pmsDefs[pp->pms.pmsState]->pmsDown; /* Get the downward step */ - pmsSetStep(nstate, 0); /* Step to it */ -} - - -/* - * Steps up to a higher power. The "timer" parameter is true if the - * step was driven due to the pms timer expiring. - * - * Interrupts must be off... 
- */ - -int pmsStepIdleSneaks; -int pmsStepIdleTries; - -void -pmsStep(int timer) -{ - PER_PROC_INFO *pp; - uint32_t nstate; - uint32_t tstate; - uint32_t pkgstate; - int dir; - int i; - - pp = GET_PER_PROC_INFO(); /* Get our per_proc */ - - if(!pmsInstalled || pp->pms.pmsState == pmsParked) - return; /* No stepping if parked or not installed */ - - /* - * Assume a normal step. - */ - nstate = pmsCtls.pmsDefs[pp->pms.pmsState]->pmsNext; - - /* - * If we are idling and being asked to step up, check to see whether - * the package we're in is already at a non-idle power state. If so, - * attempt to work out what state that is, and go there directly to - * avoid wasting time ramping up. - */ - if ((pp->pms.pmsState == pmsIdle) - && ((pkgstate = pmsCPUPackageQuery()) != ~(uint32_t)0)) { - /* - * Search forward through the stepper program, - * avoid looping for too long. - */ - tstate = nstate; - pmsStepIdleTries++; - for (i = 0; i < 32; i++) { - /* - * Compare command with current package state - */ - if ((pmsCtls.pmsDefs[tstate]->pmsSetCmd & pmsCPU) == pkgstate) { - nstate = tstate; - pmsStepIdleSneaks++; - break; - } - - /* - * Advance to the next step in the program. - */ - if (pmsCtls.pmsDefs[tstate]->pmsNext == tstate) - break; /* infinite loop */ - tstate = pmsCtls.pmsDefs[tstate]->pmsNext; - } - } - - /* - * Default to a step up. - */ - dir = 1; - - /* - * If we are stepping as a consequence of timer expiry, select the - * alternate exit path and note this as downward step for accounting - * purposes. - */ - if (timer - && (pmsCtls.pmsDefs[pp->pms.pmsState]->pmsSetCmd == pmsDelay)) { - nstate = pmsCtls.pmsDefs[pp->pms.pmsState]->pmsTDelay; - - /* - * Delayed steps are a step down for accounting purposes. - */ - dir = 0; - } - - pmsSetStep(nstate, dir); -} - - -/* - * Set a specific step - * - * We do not do statistics if exiting park - * Interrupts must be off... 
- * - */ - -void -pmsSetStep(uint32_t nstep, int dir) -{ - PER_PROC_INFO *pp; - uint32_t pstate, nCSetCmd, mCSetCmd; - pmsDef *pnstate, *pcstate; - uint64_t tb, dur; - int cpu; - - pp = GET_PER_PROC_INFO(); /* Get our per_proc */ - cpu = cpu_number(); /* Get our processor */ - - while(1) { /* Keep stepping until we get a delay */ - - if(pp->pms.pmsCSetCmd & pmsMustCmp) { /* Do we have to finish the delay before changing? */ - while(mach_absolute_time() < pp->pms.pmsPop); /* Yes, spin here... */ - } - - if((nstep == pmsParked) || ((uint32_t)pmsCtls.pmsDefs[nstep]->pmsSetCmd == pmsParkIt)) { /* Are we parking? */ - - tb = mach_absolute_time(); /* What time is it? */ - pp->pms.pmsStamp = tb; /* Show transition now */ - pp->pms.pmsPop = HalfwayToForever; /* Set the pop way into the future */ - pp->pms.pmsState = pmsParked; /* Make sure we are parked */ - etimer_resync_deadlines(); /* Cancel our timer if going */ - return; - } - - pnstate = pmsCtls.pmsDefs[nstep]; /* Point to the state definition */ - pstate = pp->pms.pmsState; /* Save the current step */ - pp->pms.pmsState = nstep; /* Set the current to the next step */ - - if(pnstate->pmsSetCmd != pmsDelay) { /* If this is not a delayed state, change the actual hardware now */ - if(pnstate->pmsSetCmd & pmsCngCPU) pmsCPUSet(pnstate->pmsSetCmd); /* We have some CPU work to do... 
*/ - if((uint32_t)pnstate->sf.pmsSetFunc) pnstate->sf.pmsSetFunc(pnstate->pmsSetCmd, cpu, pmsPlatformData); /* Tell the platform to set power mode */ - - mCSetCmd = pnstate->pmsSetCmd & (pmsCngXClk | pmsCngCPU | pmsCngVolt); /* Isolate just the change flags */ - mCSetCmd = (mCSetCmd - (mCSetCmd >> 7)) | pmsSync | pmsMustCmp | pmsPowerID; /* Form mask of bits that come from new command */ - nCSetCmd = pp->pms.pmsCSetCmd & ~mCSetCmd; /* Clear changing bits */ - nCSetCmd = nCSetCmd | (pnstate->pmsSetCmd & mCSetCmd); /* Flip on the changing bits and the always copy bits */ - - pp->pms.pmsCSetCmd = nCSetCmd; /* Set it for real */ - } - - tb = mach_absolute_time(); /* What time is it? */ - pp->pms.pmsPop = tb + pnstate->pmsLimit; /* Set the next pop */ - - if((pnstate->pmsSetCmd != pmsDelay) && (pp->pms.pmsCSetCmd & pmsSync) && (pnstate->pmsLimit != 0)) { /* Is this a synchronous command with a delay? */ - while(mach_absolute_time() < pp->pms.pmsPop); /* Yes, spin here and wait it out... */ - } - -/* - * Gather some statistics - */ - - dur = tb - pp->pms.pmsStamp; /* Get the amount of time we were in the old step */ - pp->pms.pmsStamp = tb; /* Set the new timestamp */ - if(!(pstate == pmsParked)) { /* Only take stats if we were not parked */ - pcstate = pmsCtls.pmsDefs[pstate]; /* Get the previous step */ - pmsCtls.pmsStats[cpu][pcstate->pmsStepID].stTime[dir] += dur; /* Accumulate the total time in the old step */ - pmsCtls.pmsStats[cpu][pcstate->pmsStepID].stCnt[dir] += 1; /* Count transitions */ - } - -/* - * See if we are done chaining steps - */ - - if((pnstate->pmsSetCmd == pmsDelay) - || (!(pp->pms.pmsCSetCmd & pmsSync) && (pnstate->pmsLimit != 0))) { /* Is this not syncronous and a non-zero delay or a delayed step? */ - etimer_resync_deadlines(); /* Start the timers ticking */ - break; /* We've stepped as far as we're going to... 
*/ - } - - nstep = pnstate->pmsNext; /* Chain on to the next */ - } -} - -/* - * Either park the stepper or force the step on a parked stepper for local processor only - * - */ - -void -pmsRunLocal(uint32_t nstep) -{ - PER_PROC_INFO *pp; - uint32_t lastState; - int cpu, i; - boolean_t intr; - - if(!pmsInstalled) /* Ignore this if no step programs installed... */ - return; - - intr = ml_set_interrupts_enabled(FALSE); /* No interruptions in here */ - - pp = GET_PER_PROC_INFO(); /* Get our per_proc */ - - if(nstep == pmsStartUp) { /* Should we start up? */ - pmsCPUInit(); /* Get us up to full with high voltage and park */ - nstep = pmsNormHigh; /* Change request to transition to normal high */ - } - - lastState = pp->pms.pmsState; /* Remember if we are parked now */ - - pmsSetStep(nstep, 1); /* Step to the new state */ - - if((lastState == pmsParked) && (pp->pms.pmsState != pmsParked)) { /* Did we just unpark? */ - cpu = cpu_number(); /* Get our processor */ - for(i = 0; i < pmsMaxStates; i++) { /* Step through the steps and clear the statistics since we were parked */ - pmsCtls.pmsStats[cpu][i].stTime[0] = 0; /* Clear accumulated time - downward */ - pmsCtls.pmsStats[cpu][i].stTime[1] = 0; /* Clear accumulated time - forward */ - pmsCtls.pmsStats[cpu][i].stCnt[0] = 0; /* Clear transition count - downward */ - pmsCtls.pmsStats[cpu][i].stCnt[1] = 0; /* Clear transition count - forward */ - } - } - - (void)ml_set_interrupts_enabled(intr); /* Restore interruptions */ -} - -/* - * Control the Power Management Stepper. - * Called from user state by the superuser. - * Interruptions disabled. 
- * - */ -kern_return_t -pmsControl(uint32_t request, user_addr_t reqaddr, uint32_t reqsize) -{ - uint32_t nstep = 0, result, presult; - int ret, cpu; - kern_return_t kret = KERN_SUCCESS; - pmsDef *ndefs; - PER_PROC_INFO *pp; - - pp = GET_PER_PROC_INFO(); /* Get our per_proc */ - cpu = cpu_number(); /* Get our processor */ - - if(!is_suser()) { /* We are better than most, */ - kret = KERN_FAILURE; - goto out; - } - - if(request >= pmsCFree) { /* Can we understand the request? */ - kret = KERN_INVALID_ARGUMENT; - goto out; - } - - if(request == pmsCQuery) { /* Are we just checking? */ - result = pmsCPUQuery() & pmsCPU; /* Get the processor data and make sure there is no slop */ - presult = 0; /* Assume nothing */ - if((uint32_t)pmsQueryFunc) - presult = pmsQueryFunc(cpu, pmsPlatformData); /* Go get the platform state */ - kret = result | (presult & (pmsXClk | pmsVoltage | pmsPowerID)); /* Merge the platform state with no slop */ - goto out; - } - - if(request == pmsCExperimental) { /* Enter experimental mode? */ - - if(pmsInstalled || (pmsExperimental & 1)) { /* Are we already running or in experimental? 
*/ - kret = KERN_FAILURE; - goto out; - } - - pmsExperimental |= 1; /* Flip us into experimental but don't change other flags */ - - pmsCPUConf(); /* Configure for this machine */ - pmsStart(); /* Start stepping */ - goto out; - } - - if(request == pmsCCnfg) { /* Do some up-front checking before we commit to doing this */ - if((reqsize > (pmsMaxStates * sizeof(pmsDef))) || (reqsize < (pmsFree * sizeof(pmsDef)))) { /* Check that the size is reasonable */ - kret = KERN_NO_SPACE; - goto out; - } - } - - if (request == pmsGCtls) { - if (reqsize != sizeof(pmsCtls)) { - kret = KERN_FAILURE; - goto out; - } - ret = copyout(&pmsCtls, reqaddr, reqsize); - goto out; - } - - if (request == pmsGStats) { - if (reqsize != sizeof(pmsStatsd)) { /* request size is fixed */ - kret = KERN_FAILURE; - goto out; - } - ret = copyout(&pmsStatsd, reqaddr, reqsize); - goto out; - } - -/* - * We are committed after here. If there are any errors detected, we shouldn't die, but we - * will be stuck in park. - * - * Also, we can possibly end up on another processor after the broadcast. - * - */ - - if(!hw_compare_and_store(0, 1, &pmsSyncrolator)) { /* Are we already doing this? */ - /* Tell them that we are already busy and to try again */ - kret = KERN_RESOURCE_SHORTAGE; - goto out; - } - -// NOTE: We will block in the following code until everyone has finished the prepare - - pmsRun(pmsPrepCng); /* Get everyone parked and in a proper state for step table changes, including me */ - - if(request == pmsCPark) { /* Is all we're supposed to do park? 
*/ - pmsSyncrolator = 0; /* Free us up */ - goto out; - } - - switch(request) { /* Select the routine */ - - case pmsCStart: /* Starts normal steppping */ - nstep = pmsNormHigh; /* Set the request */ - break; - - case pmsCFLow: /* Forces low power */ - nstep = pmsLow; /* Set request */ - break; - - case pmsCFHigh: /* Forces high power */ - nstep = pmsHigh; /* Set request */ - break; - - case pmsCCnfg: /* Loads new stepper program */ - - if(!(ndefs = (pmsDef *)kalloc(reqsize))) { /* Get memory for the whole thing */ - pmsSyncrolator = 0; /* Free us up */ - kret = KERN_INVALID_ADDRESS; - goto out; - } - - ret = copyin(reqaddr, (void *)ndefs, reqsize); /* Get the new config table */ - if(ret) { /* Hmmm, something went wrong with the copyin */ - kfree(ndefs, reqsize); /* Free up the copied in data */ - pmsSyncrolator = 0; /* Free us up */ - kret = KERN_INVALID_ADDRESS; - goto out; - } - - kret = pmsBuild(ndefs, reqsize, NULL, 0, NULL); /* Go build and replace the tables. Make sure we keep the old platform stuff */ - if(kret) { /* Hmmm, something went wrong with the compilation */ - kfree(ndefs, reqsize); /* Free up the copied in data */ - pmsSyncrolator = 0; /* Free us up */ - goto out; - } - - nstep = pmsNormHigh; /* Set the request */ - break; - - default: - panic("pmsCntrl: stepper control is so very, very confused = %08X\n", request); - - } - - pmsRun(nstep); /* Get everyone into step */ - pmsSyncrolator = 0; /* Free us up */ -out: - return kret; - -} - -/* - * Broadcast a change to all processors including ourselves. - * - * Interruptions are disabled. - */ - -void -pmsRun(uint32_t nstep) -{ - pmsCPURun(nstep); -} - - -/* - * Build the tables needed for the stepper. This includes both the step definitions and the step control table. - * - * We most absolutely need to be parked before this happens because we're gonna change the table. - * We're going to have to be pretty complete about checking for errors. 
- * Also, a copy is always made because we don't want to be crippled by not being able to change - * the table or description formats. - * - * We pass in a table of external functions and the new stepper def uses the corresponding - * indexes rather than actual function addresses. This is done so that a proper table can be - * built with the control syscall. It can't supply addresses, so the index has to do. We - * internalize the table so our caller does not need to keep it. Note that passing in a 0 - * will use the current function table. Also note that entry 0 is reserved and must be 0, - * we will check and fail the build. - * - * The platformData parameter is a 32-bit word of data that is passed unaltered to the set function. - * - * The queryFunc parameter is the address of a function that will return the current state of the platform. - * The format of the data returned is the same as the platform specific portions of pmsSetCmd, i.e., pmsXClk, - * pmsVoltage, and any part of pmsPowerID that is maintained by the platform hardware (an example would be - * the values of the gpios that correspond to pmsPowerID). The value should be constructed by querying - * hardware rather than returning a value cached by software. One of the intents of this function is to - * help recover lost or determine initial power states. 
- * - */ - -kern_return_t pmsBuild(pmsDef *pd, uint32_t pdsize, pmsSetFunc_t *functab, uint32_t platformData, pmsQueryFunc_t queryFunc) { - - int newsize, cstp, oldAltSize, xdsply; - uint32_t setf, steps, i, nstps; - uint64_t nlimit; - pmsDef *newpd, *oldAlt; - boolean_t intr; - - xdsply = (pmsExperimental & 3) != 0; /* Turn on kprintfs if requested or in experimental mode */ - - if(pdsize % sizeof(pmsDef)) - return KERN_INVALID_ARGUMENT; /* Length not multiple of definition size */ - - steps = pdsize / sizeof(pmsDef); /* Get the number of steps supplied */ - - if((steps >= pmsMaxStates) || (steps < pmsFree)) /* Complain if too big or too small */ - return KERN_INVALID_ARGUMENT; /* Squeak loudly!!! */ - - if((uint32_t)functab && (uint32_t)functab[0]) /* Verify that if they supplied a new function table, entry 0 is 0 */ - return KERN_INVALID_ARGUMENT; /* Fail because they didn't reserve entry 0 */ - - if(xdsply) kprintf("\n StepID Down Next HWSel HWfun Limit\n"); - - for(i = 0; i < steps; i++) { /* Step through and verify the definitions */ - - if(xdsply) kprintf(" %6d %6d %6d %08X %6d %20lld\n", pd[i].pmsStepID, pd[i].pmsDown, - pd[i].pmsNext, pd[i].pmsSetCmd, - pd[i].sf.pmsSetFuncInd, pd[i].pmsLimit); - - if((pd[i].pmsLimit != 0) && (pd[i].pmsLimit < 100ULL)) { - if(xdsply) kprintf("error step %3d: pmsLimit too small/n", i); - return KERN_INVALID_ARGUMENT; /* Has to be 100�S or more */ - } - - if((pd[i].pmsLimit != 0xFFFFFFFFFFFFFFFFULL) && (pd[i].pmsLimit > (HalfwayToForever / 1000ULL))) { - if(xdsply) kprintf("error step %3d: pmsLimit too big\n", i); - return KERN_INVALID_ARGUMENT; /* Can't be too big */ - } - - if(pd[i].pmsStepID != i) { - if(xdsply) kprintf("error step %3d: step ID does not match (%d)\n", i, pd[i].pmsStepID); - return KERN_INVALID_ARGUMENT; /* ID must match */ - } - - if(pd[i].sf.pmsSetFuncInd >= pmsSetFuncMax) { - if(xdsply) kprintf("error step %3d: function invalid (%d)\n", i, pd[i].sf.pmsSetFuncInd); - return KERN_INVALID_ARGUMENT; /* Fail 
if this function is not in the table */ - } - - if((pd[i].pmsDown != pmsParked) && pd[i].pmsDown >= steps) { - if(xdsply) kprintf("error step %3d: pmsDown out of range (%d)\n", i, pd[i].pmsDown); - return KERN_INVALID_ARGUMENT; /* Step down must be in the table or park */ - } - - if((pd[i].pmsNext != pmsParked) && pd[i].pmsNext >= steps) { - if(xdsply) kprintf("error step %3d: pmsNext out of range (%d)\n", i, pd[i].pmsNext); - return KERN_INVALID_ARGUMENT; /* Step up must be in the table or park */ - } - - if((pd[i].pmsSetCmd == pmsDelay) && (pd[i].pmsTDelay >= steps)) { - if(xdsply) kprintf("error step %3d: pmsTDelay out of range (%d)\n", i, pd[i].pmsTDelay); - return KERN_INVALID_ARGUMENT; /* Delayed step must be in the table */ - } - - if((pd[i].pmsSetCmd == pmsDelay) && (pd[i].pmsLimit == 0xFFFFFFFFFFFFFFFFULL)) { - if(xdsply) kprintf("error step %3d: delay time limit must not be infinite\n", i); - return KERN_INVALID_ARGUMENT; /* Delayed step must have a time limit */ - } - - } - -/* - * Verify that there are no infinite synchronous forward loops in the table - */ - - if(xdsply) kprintf("\nInitial scan passed, start in loop check\n"); - for(i = 0; i < steps; i++) { /* Start with each step. 
Inefficient, but who cares */ - - cstp = i; /* Set starting point */ - nstps = 0; /* Initialize chain length counter */ - while(1) { /* Do until we hit the end */ - if(pd[cstp].pmsSetCmd == pmsParkIt) break; /* Parking always terminates a chain so no endless loop here */ - if(pd[cstp].pmsSetCmd == pmsDelay) break; /* Delayed steps always terminate a chain so no endless loop here */ - if((pd[cstp].pmsLimit != 0) && ((pd[cstp].pmsSetCmd & pmsSync) != pmsSync)) break; /* If time limit is not 0 and not synchrouous, no endless loop */ - if(pd[cstp].pmsNext == pmsParked) break; /* If the next step is parked, no endless loop */ - - cstp = pd[cstp].pmsNext; /* Chain to the next */ - nstps = nstps + 1; /* Count this step */ - if(nstps >= steps) { /* We've stepped for more steps than we have, must be an endless loop! */ - if(xdsply) kprintf("error step %3d: infinite pmsNext loop\n", i); - return KERN_INVALID_ARGUMENT; /* Suggest to our caller that they can't program... */ - } - } - } - - if((pmsExperimental & 4) && (pmsInstalled) && ((uint32_t)functab != 0)) { /* If we are already initted and experimental is locked in, and we are doing first */ - if(xdsply) kprintf("Experimental locked, ignoring driver pmsBuild\n"); - return KERN_RESOURCE_SHORTAGE; /* Just ignore the request. */ - } - - - -/* - * Well, things look ok, let's do it to it... - */ - - if(xdsply) kprintf("Loop check passed, building and installing table\n"); - - newsize = steps * sizeof(pmsDef); /* Get the size needed for the definition blocks */ - - if(!(newpd = (pmsDef *)kalloc(newsize))) { /* Get memory for the whole thing */ - return KERN_RESOURCE_SHORTAGE; /* No storage... */ - } - - bzero((void *)newpd, newsize); /* Make it pretty */ - -/* - * Ok, this is it, finish intitializing, switch the tables, and pray... - * We want no interruptions at all and we need to lock the table. Everybody should be parked, - * so no one should ever touch this. The lock is to keep multiple builders safe. 
It probably - * will never ever happen, but paranoia is a good thing... - */ - - intr = ml_set_interrupts_enabled(FALSE); /* No interruptions in here */ - simple_lock(&pmsBuildLock); /* Lock out everyone... */ - - if(platformData) pmsPlatformData = platformData; /* Remember the platform data word passed in if any was... */ - if((uint32_t)queryFunc) pmsQueryFunc = queryFunc; /* Remember the query function passed in, if it was... */ - - oldAlt = altDpmsTab; /* Remember any old alternate we had */ - oldAltSize = altDpmsTabSize; /* Remember its size */ - - altDpmsTab = newpd; /* Point to the new table */ - altDpmsTabSize = newsize; /* Set the size */ - - if((uint32_t)functab) { /* Did we get a new function table? */ - for(i = 0; i < pmsSetFuncMax; i++) pmsFuncTab[i] = functab[i]; /* Copy in the new table */ - } - - for(i = 0; i < pmsMaxStates; i++) pmsCtls.pmsDefs[i] = &pmsDummy; /* Initialize the table to point to the dummy step */ - - for(i = 0; i < steps; i++) { /* Replace the step table entries */ - if(pd[i].pmsLimit == 0xFFFFFFFFFFFFFFFFULL) nlimit = century; /* Default to 100 years */ - else nlimit = pd[i].pmsLimit; /* Otherwise use what was supplied */ - - nanoseconds_to_absolutetime(nlimit * 1000ULL, &newpd[i].pmsLimit); /* Convert microseconds to nanoseconds and then to ticks */ - - setf = pd[i].sf.pmsSetFuncInd; /* Make convienient */ - newpd[i].sf.pmsSetFunc = pmsFuncTab[setf]; /* Replace the index with the function address */ - - newpd[i].pmsStepID = pd[i].pmsStepID; /* Set the step ID */ - newpd[i].pmsSetCmd = pd[i].pmsSetCmd; /* Set the hardware selector ID */ - newpd[i].pmsDown = pd[i].pmsDown; /* Set the downward step */ - newpd[i].pmsNext = pd[i].pmsNext; /* Set the next setp */ - newpd[i].pmsTDelay = pd[i].pmsTDelay; /* Set the delayed setp */ - pmsCtls.pmsDefs[i] = &newpd[i]; /* Copy it in */ - } -#ifdef __ppc__ - pmsCtlp = (uint32_t)&pmsCtls; /* Point to the new pms table */ -#endif - pmsInstalled = 1; /* The stepper has been born or born again... 
*/ - - simple_unlock(&pmsBuildLock); /* Free play! */ - (void)ml_set_interrupts_enabled(intr); /* Interrupts back the way there were */ - - if((uint32_t)oldAlt) /* If we already had an alternate, free it */ - kfree(oldAlt, oldAltSize); - - if(xdsply) kprintf("Stepper table installed\n"); - - return KERN_SUCCESS; /* We're in fate's hands now... */ -} diff --git a/osfmk/ppc/pmsCPU.c b/osfmk/ppc/pmsCPU.c deleted file mode 100644 index 0b12f2d31..000000000 --- a/osfmk/ppc/pmsCPU.c +++ /dev/null @@ -1,313 +0,0 @@ -/* - * Copyright (c) 2004-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -static void pmsCPURemote(uint32_t nstep); - - -pmsDef pmsDefault[] = { - { - .pmsLimit = century, /* We can normally stay here for 100 years */ - .pmsStepID = pmsIdle, /* Unique identifier to this step */ - .pmsSetCmd = 0, /* Dummy platform power level */ - .sf.pmsSetFuncInd = 0, /* Dummy platform set function */ - .pmsDown = pmsIdle, /* We stay here */ - .pmsNext = pmsNorm /* Next step */ - }, - { - .pmsLimit = century, /* We can normally stay here for 100 years */ - .pmsStepID = pmsNorm, /* Unique identifier to this step */ - .pmsSetCmd = 0, /* Dummy platform power level */ - .sf.pmsSetFuncInd = 0, /* Dummy platform set function */ - .pmsDown = pmsIdle, /* Down to idle */ - .pmsNext = pmsNorm /* Next step */ - }, - { - .pmsLimit = century, /* We can normally stay here for 100 years */ - .pmsStepID = pmsNormHigh, /* Unique identifier to this step */ - .pmsSetCmd = 0, /* Dummy platform power level */ - .sf.pmsSetFuncInd = 0, /* Dummy platform set function */ - .pmsDown = pmsIdle, /* Down to idle */ - .pmsNext = pmsNormHigh /* Next step */ - }, - { - .pmsLimit = century, /* We can normally stay here for 100 years */ - .pmsStepID = pmsBoost, /* Unique identifier to this step */ - .pmsSetCmd = 0, /* Dummy platform power level */ - .sf.pmsSetFuncInd = 0, /* Dummy platform set function */ - .pmsDown = pmsIdle, /* Step down */ - .pmsNext = pmsBoost /* Next step */ - }, - { - .pmsLimit = century, /* We can normally stay here for 100 years */ - .pmsStepID = pmsLow, /* Unique identifier to this step */ - .pmsSetCmd = 0, /* Dummy platform power level */ - .sf.pmsSetFuncInd = 0, /* Dummy platform set function */ - .pmsDown = pmsLow, /* We always stay here */ - .pmsNext = pmsLow /* We always stay here */ - }, - { - .pmsLimit = century, /* We can normally stay here for 100 years */ - .pmsStepID = 
pmsHigh, /* Unique identifier to this step */ - .pmsSetCmd = 0, /* Dummy platform power level */ - .sf.pmsSetFuncInd = 0, /* Dummy platform set function */ - .pmsDown = pmsHigh, /* We always stay here */ - .pmsNext = pmsHigh /* We always stay here */ - }, - { - .pmsLimit = 0, /* Time doesn't matter for a prepare for change */ - .pmsStepID = pmsPrepCng, /* Unique identifier to this step */ - .pmsSetCmd = pmsParkIt, /* Force us to be parked */ - .sf.pmsSetFuncInd = 0, /* Dummy platform set function */ - .pmsDown = pmsPrepCng, /* We always stay here */ - .pmsNext = pmsPrepCng /* We always stay here */ - }, - { - .pmsLimit = 0, /* Time doesn't matter for a prepare for sleep */ - .pmsStepID = pmsPrepSleep, /* Unique identifier to this step */ - .pmsSetCmd = pmsParkIt, /* Force us to be parked */ - .sf.pmsSetFuncInd = 0, /* Dummy platform set function */ - .pmsDown = pmsPrepSleep, /* We always stay here */ - .pmsNext = pmsPrepSleep /* We always stay here */ - }, - { - .pmsLimit = 0, /* Time doesn't matter for a prepare for sleep */ - .pmsStepID = pmsOverTemp, /* Unique identifier to this step */ - .pmsSetCmd = 0, /* Dummy platform power level */ - .sf.pmsSetFuncInd = 0, /* Dummy platform set function */ - .pmsDown = pmsOverTemp, /* We always stay here */ - .pmsNext = pmsOverTemp /* We always stay here */ - } -}; - - - -/* - * This is where the CPU part of the stepper code lives. - * - * It also contains the "hacked kext" experimental code. This is/was used for - * experimentation and bringup. It should neither live long nor prosper. - * - */ - -/* - * Set the processor frequency and stuff - */ - -void pmsCPUSet(uint32_t sel) { - int nfreq; - struct per_proc_info *pp; - - pp = getPerProc(); /* Get our per_proc */ - - if(!((sel ^ pp->pms.pmsCSetCmd) & pmsCPU)) return; /* If there aren't any changes, bail now... 
*/ - - nfreq = (sel & pmsCPU) >> 16; /* Isolate the new frequency */ - - switch(pp->pf.pfPowerModes & pmType) { /* Figure out what type to do */ - - case pmDFS: /* This is a DFS machine */ - ml_set_processor_speed_dfs(nfreq); /* Yes, set it */ - break; - - case pmDualPLL: - ml_set_processor_speed_dpll(nfreq); /* THIS IS COMPLETELY UNTESTED!!! */ - break; - - case pmPowerTune: /* This is a PowerTune machine */ - ml_set_processor_speed_powertune(nfreq); /* Diddle the deal */ - break; - - default: /* Not this time dolt!!! */ - panic("pmsCPUSet: unsupported power manager type: %08X\n", pp->pf.pfPowerModes); - break; - - } - -} - -/* - * This code configures the initial step tables. It should be called after the timebase frequency is initialized. - */ - -void pmsCPUConf(void) { - - int i; - kern_return_t ret; - pmsSetFunc_t pmsDfltFunc[pmsSetFuncMax]; /* List of functions for the external power control to use */ - - for(i = 0; i < pmsSetFuncMax; i++) pmsDfltFunc[i] = NULL; /* Clear this */ - - - ret = pmsBuild((pmsDef *)&pmsDefault, sizeof(pmsDefault), pmsDfltFunc, 0, (pmsQueryFunc_t)0); /* Configure the default stepper */ - - if(ret != KERN_SUCCESS) { /* Some screw up? */ - panic("pmsCPUConf: initial stepper table build failed, ret = %08X\n", ret); /* Squeal */ - } - - pmsSetStep(pmsHigh, 1); /* Slew to high speed */ - pmsPark(); /* Then park */ - return; -} - -/* - * Machine-dependent initialization - */ -void -pmsCPUMachineInit(void) -{ - return; -} - -/* - * This function should be called once for each processor to force the - * processor to the correct voltage and frequency. - */ - -void pmsCPUInit(void) { - - int cpu; - - cpu = cpu_number(); /* Who are we? 
*/ - - kprintf("************ Initializing stepper hardware, cpu %d ******************\n", cpu); /* (BRINGUP) */ - - pmsSetStep(pmsHigh, 1); /* Slew to high speed */ - pmsPark(); /* Then park */ - - kprintf("************ Stepper hardware initialized, cpu %d ******************\n", cpu); /* (BRINGUP) */ -} - -extern uint32_t hid1get(void); - -uint32_t -pmsCPUQuery(void) -{ - uint32_t result; - struct per_proc_info *pp; - uint64_t scdata; - - pp = getPerProc(); /* Get our per_proc */ - - switch(pp->pf.pfPowerModes & pmType) { /* Figure out what type to do */ - - case pmDFS: /* This is a DFS machine */ - result = hid1get(); /* Get HID1 */ - result = (result >> 6) & 0x00030000; /* Isolate the DFS bits */ - break; - - case pmPowerTune: /* This is a PowerTune machine */ - (void)ml_scom_read(PowerTuneStatusReg, &scdata); /* Get the current power level */ - result = (scdata >> (32 + 8)) & 0x00030000; /* Shift the data to align with the set command */ - break; - - default: /* Query not supported for this kind */ - result = 0; /* Return highest if not supported */ - break; - - } - - return result; -} - -/* - * These are not implemented for PPC. - */ -void pmsCPUYellowFlag(void) { -} - -void pmsCPUGreenFlag(void) { -} - -uint32_t pmsCPUPackageQuery(void) -{ - /* multi-core CPUs are not supported. */ - return(~(uint32_t)0); -} - -/* - * Broadcast a change to all processors including ourselves. - * This must transition before broadcasting because we may block and end up on a different processor. - * - * This will block until all processors have transitioned, so - * obviously, this can block. - * - * Called with interruptions disabled. - * - */ - -void pmsCPURun(uint32_t nstep) { - - pmsRunLocal(nstep); /* If we aren't parking (we are already parked), transition ourselves */ - (void)cpu_broadcast(&pmsBroadcastWait, pmsCPURemote, nstep); /* Tell everyone else to do it too */ - - return; - -} - -/* - * Receive a broadcast and react. 
- * This is called from the interprocessor signal handler. - * We wake up the initiator after we are finished. - * - */ - -static void pmsCPURemote(uint32_t nstep) { - - pmsRunLocal(nstep); /* Go set the step */ - if(!hw_atomic_sub(&pmsBroadcastWait, 1)) { /* Drop the wait count */ - thread_wakeup((event_t)&pmsBroadcastWait); /* If we were the last, wake up the signaller */ - } - return; -} - -/* - * Control the Power Management Stepper. - * Called from user state by the superuser via a ppc system call. - * Interruptions disabled. - * - */ -int pmsCntrl(struct savearea *save) { - save->save_r3 = pmsControl(save->save_r3, (user_addr_t)(uintptr_t)save->save_r4, save->save_r5); - return 1; -} - - - diff --git a/osfmk/ppc/ppc_disasm.i b/osfmk/ppc/ppc_disasm.i deleted file mode 100644 index 688f81bbc..000000000 --- a/osfmk/ppc/ppc_disasm.i +++ /dev/null @@ -1,234 +0,0 @@ -# @OSF_COPYRIGHT@ -# - -# ppc.i - PowerPC instructions -# , -# By Eamonn McManus , 1995. - -# simplified mnemonics -# ori 0,0,0 -in 01100000000000000000000000000000 nop -# addi[s] rD,0,value -in 00111sddddd00000iiiiiiiiiiiiiiii li{|s}[$s] \ - $reg($d),{$simm16($i)|$shifted16($i)}[$s] -# or rA,rS,rS -in 011111dddddaaaaabbbbb0110111100r {or{|.}[$r] $reg($a),$reg($b),$reg($d)|\ - mr{|.}[$r] $reg($a),$reg($d)}[$b == $d] -in 011111dddddaaaaabbbbb0100111100r xor{|.}[$r] $reg($a),$reg($b),$reg($d) - -# mtcrf 0xFF,rS -in 011111ddddd011111111000100100000 mtcr $reg($d) - -in 00001Dcccccaaaaaiiiiiiiiiiiiiiii t{d|w}[$D]$tcond($c)i $reg($a),$simm16($i) -in 000111dddddaaaaaiiiiiiiiiiiiiiii mulli $reg($d),$reg($a),$simm16($i) -in 001000dddddaaaaaiiiiiiiiiiiiiiii subfic $reg($d),$reg($a),$simm16($i) -in 00101Uddd0laaaaaiiiiiiiiiiiiiiii cmp{l|}[$U]i \ - $crcom($d){|1,}[$l]$reg($a),$simm16($i) -in 00110rdddddaaaaaiiiiiiiiiiiiiiii addic{|.}[$r] $reg($d),$reg0($a),$simm16($i) -in 00111sdddddaaaaaiiiiiiiiiiiiiiii addi{|s}[$s] $reg($d),$reg0($a),\ - {$simm16($i)|$shifted16($i)}[$s] -in 010000cccccccccciiiiiiiiiiiiiial 
$br($c,$a,$l,,1)\ - {$brdispl($i,14)|$brabs($i)}[$a] -in 01000100000000000000000000000010 sc -in 010010iiiiiiiiiiiiiiiiiiiiiiiial b{|l}[$l]{|a}[$a] \ - {$brdispl($i,24)|$brabs($i)}[$a] -in 010011ddd00sss000000000000000000 mcrf $crf($d),$crf($s) -in 010011cccccccccc000000000010000l $br($c,0,$l,lr,0) -in 010011dddddaaaaabbbbb0oooo000010 cr$crop($o) $crb($d),$crb($a),$crb($b) -in 01001100000000000000000001100100 rfi -in 01001100000000000000000000100100 rfid -in 01001100000000000000001000100100 hrfid -in 01001100000000000000000100101100 isync -in 010011cccccccccc000001000010000l $br($c,0,$l,ctr,0) -in 010111dddddaaaaabbbbbffffftttttr rlwnm{|.}[$r] \ - $reg($a),$reg($d),$reg($b),$dec($f),$dec($t) -in 0101xxdddddaaaaasssssffffftttttr rl{wimi|winm|?|?}[$x]{|.}[$r] \ - $reg($a),$reg($d),$dec($s),$dec($f),$dec($t) -in 011110dddddaaaaasssssffffff0xxSr rld{icl|icr|ic|imi}[$x]{|.}[$r] \ - $reg($a),$reg($d),$dec($[Ssssss]),$dec($f) -in 011110dddddaaaaabbbbbffffff100xr rldc{l|r}[$x]{|.}[$r] \ - $reg($a),$reg($d),$reg($b),$dec($f) -in 011111ddd0laaaaabbbbb0000u000000 cmp{|l}[$u] \ - $crcom($d){|1,}[$l]$reg($a),$reg($b) -in 011111cccccaaaaabbbbb000w0001000 t{w|d}[$w]$tcond($c) $reg($a),$reg($b) -in 011111dddddaaaaabbbbbo000C01000r subf{c|}[$C]{|o}[$o]{|.}[$r] \ - $reg($d),$reg($a),$reg($b) -in 011111dddddaaaaabbbbb000u0010w1r mulh{d|w}[$w]{u|}[$u]{|.}[$r] \ - $reg($d),$reg($a),$reg($b) -in 011111dddddaaaaabbbbbott0001010r add{c|e||?}[$t]{|o}[$o]{|.}[$r] \ - $reg($d),$reg($a),$reg($b) -in 011111ddddd0000000000000m0100110 mf{cr|msr}[$m] $reg($d) -in 011111ddddd0ffffffff000000100110 mfcr $hex($f),$reg($d) -in 011111dddddaaaaabbbbb000w0101000 l{w|d}[$w]arx $reg($d),$reg0($a),$reg($b) -in 011111dddddaaaaabbbbb0000u101010 ld{|u}[$u]x $reg($d),$reg0($a),$reg($b) -in 011111dddddaaaaabbbbb0ooou101110 $ldst($o){|u}[$u]x \ - $reg($d),$reg($a),$reg($b) -in 011111dddddaaaaabbbbb0000011A00r {slw|and}[$A]{|.}[$r] \ - $reg($a),$reg($d),$reg($b) -in 011111dddddaaaaa000000000w11010r 
cntlz{w|d}[$w]{|.}[$r] $reg($a),$reg($d) -in 011111dddddaaaaabbbbb0000011011r sld{|.}[$r] $reg($a),$reg($d),$reg($b) -in 01111100000aaaaabbbbb00001101100 dcbst $reg($a),$reg($b) -in 011111dddddaaaaabbbbb0000111100r andc{|.}[$r] $reg($a),$reg($d),$reg($b) -in 01111100000aaaaabbbbb00010101100 dcbf $reg($a),$reg($b) -in 011111dddddaaaaa00000o001101000r neg{|o}[$o]{|.}[$r] $reg($d),$reg($a) -in 011111dddddaaaaabbbbb0001111100r nor{|.}[$r] $reg($a),$reg($d),$reg($b) -in 011111dddddaaaaabbbbbo01z001000r subf{|z}[$z]e{|o}[$o]{|.}[$r] \ - $reg($d),$reg($a) -in 011111ddddd0ffffffff000100100m00 mt{crf $hex($f),|msr}[$m] $reg($d) -in 011111ddddd000000000000101100100 mtmsrd $reg($d) -in 011111sssssaaaaabbbbb0010u101010 std{|u}[$u]x $reg($s),$reg0($a),$reg($b) -in 011111sssssaaaaabbbbb001w0101101 st{w|d}[$w]cx. $reg($s),$reg0($a),$reg($b) -in 011111dddddaaaaa00000o011001010r addze{|o}[$o]{|.}[$r] $reg($d),$reg($a) -in 011111sssss0rrrr0000000110100100 mtsr $dec($r),$reg($s) -in 011111dddddaaaaa00000o0111010x0r {subf|add}[$x]me{|o}[$o]{|.}[$r] \ - $reg($d),$reg($a) -in 011111dddddaaaaabbbbbo0111010w1r mull{w|d}[$w]{|o}[$o]{|.}[$r] \ - $reg($d),$reg($a),$reg($b) -in 011111sssss00000bbbbb00111100100 mtsrin $reg($s),$reg($b) -in 01111100000aaaaabbbbb00111101100 dcbtst $reg0($a),$reg($b) -in 01111100000aaaaabbbbb01000101100 dcbt $reg0($a),$reg($b) -in 011111sssssaaaaabbbbb0100011100r eqv{|.}[$r] $reg($a),$reg($s),$reg($b) -in 0111110000000000bbbbb01001100100 tlbie $reg($b) -in 011111dddddaaaaabbbbb01i01101100 ec{i|o}[$i]wx $reg($d),$reg0($a),$reg($b) -in 011111dddddrrrrrrrrrr01t10100110 m{f|t}[$t]spr $reg($d),$spr($r) -in 011111dddddaaaaabbbbb0101u101010 lwa{|u}[$u]x $reg($d),$reg($a),$reg($b) -in 01111100000000000000001011100100 tlbia -in 011111dddddtttttttttt01011100110 mftb $reg($d),$dec($t) -in 011111sssssaaaaabbbbb0110011100r orc{|.}[$r] $reg($a),$reg($s),$reg($b) -in 0111110000000000bbbbb01101100100 slbie $reg($b) -in 011111dddddaaaaabbbbbo111u010w1r 
div{d|w}[$w]{u|}[$u]{|o}[$o]{|.}[$r] \ - $reg($d),$reg($a),$reg($b) -in 01111100000aaaaabbbbb01110101100 dcbi $reg0($a),$reg($b) -in 011111sssssaaaaabbbbb0111011100r nand{|.}[$r] $reg($a),$reg($s),$reg($b) -in 01111100000000000000001111100100 slbia -in 011111ddddd00000bbbbb01100100100 slbmte $reg($d),$reg($b) -in 011111ddddd00000bbbbb11010100110 slbmfev $reg($d),$reg($b) -in 011111ddddd00000bbbbb11100100110 slbmfee $reg($d),$reg($b) -in 011111ddd00000000000010000000000 mcrxr $crf($d) -in 011111dddddaaaaabbbbb10000101010 lswx $reg($d),$reg0($a),$reg($b) -in 011111dddddaaaaabbbbb1w000101100 l{w|h}[$w]brx $reg($d),$reg0($a),$reg($b) -in 011111dddddaaaaabbbbb100su101110 lf{s|d}[$s]{|u}[$u]x \ - $fr($d),$reg0($a),$reg($b) -in 011111sssssaaaaabbbbb1x000110w0r sr{|a}[$x]{w|d}[$w]{|.}[$r] \ - $reg($a),$reg($s),$reg($b) -in 011111sssssaaaaabbbbb1000011011r srd{|.}[$r] $reg($a),$reg($s),$reg($b) -in 01111100000000000000010001101100 tlbsync -in 011111ddddd0rrrr0000010010101100 mfsr $reg($d),$dec($r) -in 011111dddddaaaaannnnn10010101010 lswi $reg($d),$reg0($a),$dec($n) -in 011111000ll000000000010010101100 {sync|?|ptesync|?}[$l] -in 011111ddddd00000bbbbb10100100110 mfsrin $reg($d),$reg($b) -in 011111sssssaaaaabbbbb10100101010 stswx $reg($s),$reg0($a),$reg($b) -in 011111sssssaaaaabbbbb1w100101100 st{w|h}[$w]brx $reg($s),$reg0($a),$reg($b) -in 011111sssssaaaaabbbbb101du101110 stf{s|d}[$d]{|u}[$u]x \ - $fr($s),{$reg0($a)|$reg($a)}[$u],$reg($b) -in 011111sssssaaaaannnnn10110101010 stswi $reg($s),$reg0($a),$dec($n) -in 011111dddddaaaaasssss1100111000r srawi{|.}[$r] $reg($a),$reg($d),$dec($s) -in 011111dddddaaaaasssss110011101Sr sradi{|.}[$r] $reg($a),$reg($d),$dec($[Ssssss]) -in 01111100000000000000011010101100 eieio -in 00000000000000000000001000000000 attn -in 011111sssssaaaaa00000111xx11010r exts{h|b|w|?}[$x]{|.}[$r] $reg($a),$reg($s) -in 01111100000aaaaabbbbb11110101100 icbi $reg0($a),$reg($b) -in 011111sssssaaaaabbbbb11110101110 stfiwx $fr($s),$reg0($a),$reg($b) -in 
01111100000aaaaabbbbb11111101100 dcbz $reg0($a),$reg($b) -in 011Axsaaaaadddddiiiiiiiiiiiiiiii {{|x}[$x]or|{and|?}[$x]}[$A]i{|s}[$s]\ - {|.}[$A] $reg($d),$reg($a),\ - {$hex($i)|$shifted16($i)}[$s] -# Grouping andi with xori and ori may not be such a brilliant idea, since it -# gets invoked as a catch-all for the 011111 instructions below. But that -# just means that we get a different sort of undefined instruction. -in 10111sdddddaaaaaiiiiiiiiiiiiiiii {l|st}[$s]mw \ - $reg($d),$simm16($i)($reg0($a)) -in 10oooudddddaaaaaiiiiiiiiiiiiiiii $ldst($o){|u}[$u] \ - $reg($d),$simm16($i)($reg0($a)) -in 110sDudddddaaaaaiiiiiiiiiiiiiiii {l|st}[$s]f{s|d}[$D]{|u}[$u] \ - $fr($d),$simm16($i)($reg0($a)) -in 111010dddddaaaaaiiiiiiiiiiiiiixy l{d{|u}[$y]|{|?}[$y]w}[$x] \ - $reg($d),$simm16($i)($reg0($a)) -in 111s11dddddaaaaabbbbb0000010010r fdiv{s|}[$s]{|.}[$r] \ - $fr($d),$fr($a),$fr($b) -in 111s11dddddaaaaabbbbb000001010xr f{sub|add}[$x]{s|}[$s]{|.}[$r] \ - $fr($d),$fr($a),$fr($b) -in 111s11ddddd00000bbbbb0000010110r fsqrt{s|}[$s]{|.}[$r] $fr($d),$fr($b) -in 111011ddddd00000bbbbb0000011000r fress{|.}[$r] $fr($d),$fr($b) -in 111s11dddddaaaaa00000ccccc11001r fmul{s|}[$s]{|.}[$r] \ - $fr($d),$fr($a),$fr($c) -in 111s11dddddaaaaabbbbbccccc111nxr f{|n}[$n]m{sub|add}[$x]{s|}[$s]{|.}[$r] \ - $fr($d),$fr($a),$fr($c),$fr($b) -in 111110sssssaaaaaiiiiiiiiiiiiii0u std{|u}[$u] \ - $reg($s),$simm16($i)({$reg0($a)|$reg($a)}[$u]) -in 111111ccc00aaaaabbbbb0000o000000 fcmp{u|o}[$o] $crf($c),$fr($a),$fr($b) -in 111111ddddd00000bbbbb0000001100r frsp{|.}[$r] $fr($d),$fr($b) -in 111111ddddd00000bbbbb000000111zr fctiw{|z}[$z]{|.}[$r] $fr($d),$fr($b) -in 111111dddddaaaaabbbbbccccc10111r fsel{|.}[$r] \ - $fr($d),$fr($a),$fr($c),$fr($b) -in 111111ddddd00000bbbbb0000011010r frsqrte{|,.}[$r] $fr($d),$fr($b) -in 111111ddddd0000000000000xx00110r mtfsb{?|1|0|?}[$x]{|.}[$r] $fcond($d) -in 111111ddddd00000bbbbb0000101000r fneg{|.}[$r] $fr($d),$fr($b) -in 111111ddd00sss000000000010000000 mcrfs $crf($d),$crf($s) -in 
111111ddddd00000bbbbb0001001000r fmr{|.}[$r] $fr($d),$fr($b) -in 111111ddd0000000iiii00010000110r mtfsfi{|.}[$r] $crf($d),$simm16($i) -in 111111ddddd00000bbbbb0010001000r fnabs{|.}[$r] $fr($d),$fr($b) -in 111111ddddd00000bbbbb0100001000r fabs{|.}[$r] $fr($d),$fr($b) -in 111111ddddd00000000001001000111r mffs{|.}[$r] $fr($d) -in 1111110ffffffff0bbbbb1011000111r mtfsf{|.}[$r] $hex($f),$fr($b) -in 111111ddddd00000bbbbb110010111zr fctid{|z}[$z]{|.}[$r] $fr($d),$fr($b) -in 111111ddddd00000bbbbb1101001110r fcfid{|.}[$r] $fr($d),$fr($b) - -in xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx ? - - -ldst ooo {lwz|lbz|stw|stb|lhz|lha|sth|?}[$o] -br utdzyrrrcc(%a,%l,s,%C) b{d{nz|z}[$z]|{|?}[$z]}[$d]{c|}[$u]\ - {|l}[$l]{|a}[$a]$s \ - {$crcom($r)$cond($[cct]){|,}[$C]|}[$u] -cond ccc {ge|lt|le|gt|ne|eq|ns|so}[$c] -fcond ccc $hex($c) -crb rrrcc $cr($r):$cond($[cc1]) -crop oooo {?|nor|?|?|andc|?|xor|nand|and|eqv|?|?|?|orc|or|?}[$o] -tcond ccccc {?|lgt|llt|?|eq|lge|lle|?|?|?|?|?|ge|?|?|?|lt|?|?|?|le|?|?|?|ne|?|?|?|?|?|?|a}[$c] - -spr 0000000000 mq -spr 0000100000 xer -spr 0010l00000 rtc{u|l}[$l] -spr s011000000 dec{u|s}[$s] -spr 0100000000 lr -spr 0100100000 ctr -spr 1001000000 dsisr -spr 1001100000 dar -spr 1100100000 sdr1 -spr 1101n00000 srr$dec($n) -spr 100nn01000 sprg$dec($n) -spr 1101001000 ear -spr 1101101000 pvr -spr 10nnl10000 ibat$dec($n){u|l}[$l] -spr 1000n11111 hid$dec($n) -spr 1001011111 iabr -spr 1010111111 dabr -spr 1111111111 pir -spr 0000110000 hspr0 -spr 0000110001 hspr1 -spr 0000110110 hdec0 -spr 0000111010 hsrr0 -spr 0000111011 hsrr1 -spr xxxxxxxxxx ? 
- -reg0 00000 0 -reg0 nnnnn $reg($n) - -reg (%n) r$dec($n) -fr (%n) fr$dec($n) -cr (%n) cr$dec($n) -crf (%n) crf$dec($n) -crcom 000 -crcom nnn $cr($n), - -simm16 snnnnnnnnnnnnnnn {$hex($n)|-$hex((1 << 15) - $n)}[$s] - -shifted16 (%n) $hex($n << 16) - -brabs (%n) $hex($n << 2) - -hex (%n) : -dec (%n) : -mbz (%n) : -brdispl (%d,%n) : diff --git a/osfmk/ppc/ppc_init.c b/osfmk/ppc/ppc_init.c deleted file mode 100644 index 9be44aed8..000000000 --- a/osfmk/ppc/ppc_init.c +++ /dev/null @@ -1,302 +0,0 @@ -/* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -extern unsigned int mckFlags; -extern vm_offset_t intstack; -extern vm_offset_t debstack; - -extern unsigned int extPatchMCK; -extern unsigned int extPatch32; -extern unsigned int hwulckPatch_isync; -extern unsigned int hwulckPatch_eieio; -extern unsigned int hwulckbPatch_isync; -extern unsigned int hwulckbPatch_eieio; -extern unsigned int mulckPatch_isync; -extern unsigned int mulckPatch_eieio; -extern unsigned int mulckePatch_isync; -extern unsigned int mulckePatch_eieio; -extern unsigned int sulckPatch_isync; -extern unsigned int sulckPatch_eieio; -extern unsigned int rwlesPatch_isync; -extern unsigned int rwlesPatch_eieio; -extern unsigned int rwldPatch_isync; -extern unsigned int rwldPatch_eieio; -extern unsigned int bcopy_nop_if_32bit; -extern unsigned int bcopy_nc_nop_if_32bit; -extern unsigned int memcpy_nop_if_32bit; -extern unsigned int xsum_nop_if_32bit; -extern unsigned int uft_nop_if_32bit; -extern unsigned int uft_uaw_nop_if_32bit; -extern unsigned int uft_cuttrace; - -int forcenap = 0; -int wcte = 0; /* Non-cache gather timer disabled */ - -int debug_task; - -patch_entry_t patch_table[] = { - {&extPatch32, 0x60000000, PATCH_FEATURE, PatchExt32}, - {&extPatchMCK, 0x60000000, PATCH_PROCESSOR, CPU_SUBTYPE_POWERPC_970}, - {&hwulckPatch_isync, 0x60000000, PATCH_FEATURE, PatchLwsync}, - {&hwulckPatch_eieio, 0x7c2004ac, PATCH_FEATURE, PatchLwsync}, - {&hwulckbPatch_isync, 0x60000000, PATCH_FEATURE, PatchLwsync}, - {&hwulckbPatch_eieio, 0x7c2004ac, PATCH_FEATURE, PatchLwsync}, - {&mulckPatch_isync, 0x60000000, PATCH_FEATURE, PatchLwsync}, - {&mulckPatch_eieio, 0x7c2004ac, PATCH_FEATURE, PatchLwsync}, - 
{&mulckePatch_isync, 0x60000000, PATCH_FEATURE, PatchLwsync}, - {&mulckePatch_eieio, 0x7c2004ac, PATCH_FEATURE, PatchLwsync}, - {&sulckPatch_isync, 0x60000000, PATCH_FEATURE, PatchLwsync}, - {&sulckPatch_eieio, 0x7c2004ac, PATCH_FEATURE, PatchLwsync}, - {&rwlesPatch_isync, 0x60000000, PATCH_FEATURE, PatchLwsync}, - {&rwlesPatch_eieio, 0x7c2004ac, PATCH_FEATURE, PatchLwsync}, - {&rwldPatch_isync, 0x60000000, PATCH_FEATURE, PatchLwsync}, - {&rwldPatch_eieio, 0x7c2004ac, PATCH_FEATURE, PatchLwsync}, - {&bcopy_nop_if_32bit, 0x60000000, PATCH_FEATURE, PatchExt32}, - {&bcopy_nc_nop_if_32bit,0x60000000, PATCH_FEATURE, PatchExt32}, - {&memcpy_nop_if_32bit, 0x60000000, PATCH_FEATURE, PatchExt32}, - {&xsum_nop_if_32bit, 0x60000000, PATCH_FEATURE, PatchExt32}, - {&uft_nop_if_32bit, 0x60000000, PATCH_FEATURE, PatchExt32}, - {&uft_uaw_nop_if_32bit, 0x60000000, PATCH_FEATURE, PatchExt32}, - {&uft_cuttrace, 0x60000000, PATCH_FEATURE, PatchExt32}, - {NULL, 0x00000000, PATCH_END_OF_TABLE, 0} - }; - - -/* - * Forward definition - */ -void ppc_init( - boot_args *args); - -void ppc_init_cpu( - struct per_proc_info *proc_info); - - -/* - * Routine: ppc_init - * Function: - */ -void -ppc_init( - boot_args *args) -{ - unsigned int maxmem; - uint64_t xmaxmem; - uint64_t newhid; - unsigned int cputrace; - unsigned int novmx; - unsigned int mcksoft; - thread_t thread; - mapping_t *mp; - uint64_t scdata; - - - /* - * Setup per_proc info for first cpu. 
- */ - - BootProcInfo.cpu_number = 0; - BootProcInfo.cpu_flags = 0; - BootProcInfo.istackptr = 0; /* we're on the interrupt stack */ - BootProcInfo.intstack_top_ss = (vm_offset_t)&intstack + INTSTACK_SIZE - FM_SIZE; - BootProcInfo.debstack_top_ss = (vm_offset_t)&debstack + kernel_stack_size - FM_SIZE; - BootProcInfo.debstackptr = BootProcInfo.debstack_top_ss; - BootProcInfo.interrupts_enabled = 0; - BootProcInfo.pending_ast = AST_NONE; - BootProcInfo.FPU_owner = NULL; - BootProcInfo.VMX_owner = NULL; - BootProcInfo.pp_cbfr = console_per_proc_alloc(TRUE); - BootProcInfo.rtcPop = EndOfAllTime; - queue_init(&BootProcInfo.rtclock_timer.queue); - BootProcInfo.rtclock_timer.deadline = EndOfAllTime; - BootProcInfo.pp2ndPage = (addr64_t)(uintptr_t)&BootProcInfo; /* Initial physical address of the second page */ - - BootProcInfo.pms.pmsStamp = 0; /* Dummy transition time */ - BootProcInfo.pms.pmsPop = EndOfAllTime; /* Set the pop way into the future */ - - BootProcInfo.pms.pmsState = pmsParked; /* Park the power stepper */ - BootProcInfo.pms.pmsCSetCmd = pmsCInit; /* Set dummy initial hardware state */ - - mp = (mapping_t *)BootProcInfo.ppUMWmp; - mp->mpFlags = 0x01000000 | mpLinkage | mpPerm | 1; - mp->mpSpace = invalSpace; - - pmsInit(); /* Initialize the stepper */ - - thread_bootstrap(); - - thread = current_thread(); - thread->machine.curctx = &thread->machine.facctx; - thread->machine.facctx.facAct = thread; - thread->machine.umwSpace = invalSpace; /* Initialize user memory window space to invalid */ - thread->machine.preemption_count = 1; - - cpu_bootstrap(); - cpu_init(); - - master_cpu = 0; - processor_bootstrap(); - - timer_start(&thread->system_timer, mach_absolute_time()); - PROCESSOR_DATA(master_processor, kernel_timer) = - PROCESSOR_DATA(master_processor, thread_timer) = &thread->system_timer; - - static_memory_end = round_page(args->topOfKernelData);; - - PE_init_platform(FALSE, args); /* Get platform expert set up */ - - if (!PE_parse_boot_argn("novmx", 
&novmx, sizeof (novmx))) novmx=0; /* Special run without VMX? */ - if(novmx) { /* Yeah, turn it off */ - BootProcInfo.pf.Available &= ~pfAltivec; /* Turn off Altivec available */ - __asm__ volatile("mtsprg 2,%0" : : "r" (BootProcInfo.pf.Available)); /* Set live value */ - } - - if (!PE_parse_boot_argn("fn", &forcenap, sizeof (forcenap))) forcenap = 0; /* If force nap not set, make 0 */ - else { - if(forcenap < 2) forcenap = forcenap + 1; /* Else set 1 for off, 2 for on */ - else forcenap = 0; /* Clear for error case */ - } - - if (!PE_parse_boot_argn("pmsx", &pmsExperimental, sizeof (pmsExperimental))) pmsExperimental = 0; /* Check if we should start in experimental power management stepper mode */ - if (!PE_parse_boot_argn("lcks", &LcksOpts, sizeof (LcksOpts))) LcksOpts = 0; /* Set lcks options */ - if (!PE_parse_boot_argn("diag", &dgWork.dgFlags, sizeof (dgWork.dgFlags))) dgWork.dgFlags = 0; /* Set diagnostic flags */ - if(dgWork.dgFlags & enaExpTrace) trcWork.traceMask = 0xFFFFFFFF; /* If tracing requested, enable it */ - - if(PE_parse_boot_argn("ctrc", &cputrace, sizeof (cputrace))) { /* See if tracing is limited to a specific cpu */ - trcWork.traceMask = (trcWork.traceMask & 0xFFFFFFF0) | (cputrace & 0xF); /* Limit to 4 */ - } - - if(!PE_parse_boot_argn("tb", &trcWork.traceSize, sizeof (trcWork.traceSize))) { /* See if non-default trace buffer size */ -#if DEBUG - trcWork.traceSize = 32; /* Default 32 page trace table for DEBUG */ -#else - trcWork.traceSize = 8; /* Default 8 page trace table for RELEASE */ -#endif - } - - if(trcWork.traceSize < 1) trcWork.traceSize = 1; /* Minimum size of 1 page */ - if(trcWork.traceSize > 256) trcWork.traceSize = 256; /* Maximum size of 256 pages */ - trcWork.traceSize = trcWork.traceSize * 4096; /* Change page count to size */ - - if (!PE_parse_boot_argn("maxmem", &maxmem, sizeof (maxmem))) - xmaxmem=0; - else - xmaxmem = (uint64_t)maxmem * (1024 * 1024); - - if (!PE_parse_boot_argn("wcte", &wcte, sizeof (wcte))) wcte = 0; 
/* If write combine timer enable not supplied, make 1 */ - else wcte = (wcte != 0); /* Force to 0 or 1 */ - - if (!PE_parse_boot_argn("mcklog", &mckFlags, sizeof (mckFlags))) mckFlags = 0; /* If machine check flags not specified, clear */ - else if(mckFlags > 1) mckFlags = 0; /* If bogus, clear */ - - if (!PE_parse_boot_argn("ht_shift", &hash_table_shift, sizeof (hash_table_shift))) /* should we use a non-default hash table size? */ - hash_table_shift = 0; /* no, use default size */ - - /* - * VM initialization, after this we're using page tables... - */ - - ppc_vm_init(xmaxmem, args); - - if(BootProcInfo.pf.Available & pf64Bit) { /* Are we on a 64-bit machine */ - - if(!wcte) { - (void)ml_scom_read(GUSModeReg << 8, &scdata); /* Get GUS mode register */ - scdata = scdata | GUSMstgttoff; /* Disable the NCU store gather timer */ - (void)ml_scom_write(GUSModeReg << 8, scdata); /* Get GUS mode register */ - } - - if(PE_parse_boot_argn("mcksoft", &mcksoft, sizeof (mcksoft))) { /* Have they supplied "machine check software recovery? */ - newhid = BootProcInfo.pf.pfHID5; /* Get the old HID5 */ - if(mcksoft < 2) { - newhid &= 0xFFFFFFFFFFFFDFFFULL; /* Clear the old one */ - newhid |= (mcksoft & 1) << 13; /* Set new value to enable machine check recovery */ - BootProcInfo.pf.pfHID5 = newhid; /* Set the new one */ - hid5set64(newhid); /* Set the hid for this processir */ - } - } - } - - machine_startup(); -} - -/* - * Routine: ppc_init_cpu - * Function: - */ -void -ppc_init_cpu( - struct per_proc_info *proc_info) -{ - uint64_t scdata; - - proc_info->cpu_flags &= ~SleepState; - - if((BootProcInfo.pf.Available & pf64Bit) && !wcte) { /* Should we disable the store gather timer? 
*/ - (void)ml_scom_read(GUSModeReg << 8, &scdata); /* Get GUS mode register */ - scdata = scdata | GUSMstgttoff; /* Disable the NCU store gather timer */ - (void)ml_scom_write(GUSModeReg << 8, scdata); /* Get GUS mode register */ - } - - cpu_init(); - - slave_main(NULL); -} diff --git a/osfmk/ppc/ppc_vm_init.c b/osfmk/ppc/ppc_vm_init.c deleted file mode 100644 index e94b6b545..000000000 --- a/osfmk/ppc/ppc_vm_init.c +++ /dev/null @@ -1,427 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * @APPLE_FREE_COPYRIGHT@ - */ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -extern const char version[]; -extern const char version_variant[]; - -addr64_t hash_table_base; /* Hash table base */ -unsigned int hash_table_size; /* Hash table size */ -int hash_table_shift; /* "ht_shift" boot arg, used to scale hash_table_size */ -vm_offset_t taproot_addr; /* (BRINGUP) */ -unsigned int taproot_size; /* (BRINGUP) */ -extern int disableConsoleOutput; - -struct shadowBAT shadow_BAT; - - - -/* - * NOTE: mem_size is bogus on large memory machines. We will pin it to 0x80000000 if there is more than 2 GB - * This is left only for compatibility and max_mem should be used. - */ -vm_offset_t mem_size; /* Size of actual physical memory present - minus any performance buffer and possibly limited - by mem_limit in bytes */ -uint64_t mem_actual; /* The "One True" physical memory size - actually, it's the highest physical address + 1 */ -uint64_t max_mem; /* Size of physical memory (bytes), adjusted by maxmem */ -uint64_t sane_size; /* Memory size to use for defaults calculations */ - - -mem_region_t pmap_mem_regions[PMAP_MEM_REGION_MAX + 1]; -unsigned int pmap_mem_regions_count; /* Assume no non-contiguous memory regions */ - -unsigned int avail_remaining = 0; -vm_offset_t first_avail; -vm_offset_t static_memory_end; -addr64_t vm_last_addr = VM_MAX_KERNEL_ADDRESS; /* Highest kernel virtual address known to the VM system */ - -vm_offset_t sectTEXTB; -unsigned long sectSizeTEXT; -vm_offset_t sectDATAB; -unsigned long sectSizeDATA; -vm_offset_t sectLINKB; -unsigned long sectSizeLINK; -vm_offset_t sectKLDB; -unsigned long sectSizeKLD; -vm_offset_t sectPRELINKB; -unsigned long sectSizePRELINK; -vm_offset_t 
sectHIBB; -unsigned long sectSizeHIB; - -vm_offset_t end, etext, edata; - -extern unsigned long exception_entry; -extern unsigned long exception_end; - - -void ppc_vm_init(uint64_t mem_limit, boot_args *args) -{ - unsigned int i, kmapsize, pvr; - vm_offset_t addr; - unsigned int *xtaproot, bank_shift; - uint64_t cbsize, xhid0; - - -/* - * Invalidate all shadow BATs - */ - - /* Initialize shadow IBATs */ - shadow_BAT.IBATs[0].upper=BAT_INVALID; - shadow_BAT.IBATs[0].lower=BAT_INVALID; - shadow_BAT.IBATs[1].upper=BAT_INVALID; - shadow_BAT.IBATs[1].lower=BAT_INVALID; - shadow_BAT.IBATs[2].upper=BAT_INVALID; - shadow_BAT.IBATs[2].lower=BAT_INVALID; - shadow_BAT.IBATs[3].upper=BAT_INVALID; - shadow_BAT.IBATs[3].lower=BAT_INVALID; - - /* Initialize shadow DBATs */ - shadow_BAT.DBATs[0].upper=BAT_INVALID; - shadow_BAT.DBATs[0].lower=BAT_INVALID; - shadow_BAT.DBATs[1].upper=BAT_INVALID; - shadow_BAT.DBATs[1].lower=BAT_INVALID; - shadow_BAT.DBATs[2].upper=BAT_INVALID; - shadow_BAT.DBATs[2].lower=BAT_INVALID; - shadow_BAT.DBATs[3].upper=BAT_INVALID; - shadow_BAT.DBATs[3].lower=BAT_INVALID; - - - /* - * Go through the list of memory regions passed in via the boot_args - * and copy valid entries into the pmap_mem_regions table, adding - * further calculated entries. - * - * boot_args version 1 has address instead of page numbers - * in the PhysicalDRAM banks, set bank_shift accordingly. 
- */ - - bank_shift = 0; - if (args->Version == kBootArgsVersion1) bank_shift = 12; - - pmap_mem_regions_count = 0; - max_mem = 0; /* Will use to total memory found so far */ - mem_actual = 0; /* Actual size of memory */ - - if (mem_limit == 0) mem_limit = 0xFFFFFFFFFFFFFFFFULL; /* If there is no set limit, use all */ - - for (i = 0; i < kMaxDRAMBanks; i++) { /* Look at all of the banks */ - - cbsize = (uint64_t)args->PhysicalDRAM[i].size << (12 - bank_shift); /* Remember current size */ - - if (!cbsize) continue; /* Skip if the bank is empty */ - - mem_actual = mem_actual + cbsize; /* Get true memory size */ - - if(mem_limit == 0) continue; /* If we hit restriction, just keep counting */ - - if (cbsize > mem_limit) cbsize = mem_limit; /* Trim to max allowed */ - max_mem += cbsize; /* Total up what we have so far */ - mem_limit = mem_limit - cbsize; /* Calculate amount left to do */ - - pmap_mem_regions[pmap_mem_regions_count].mrStart = args->PhysicalDRAM[i].base >> bank_shift; /* Set the start of the bank */ - pmap_mem_regions[pmap_mem_regions_count].mrAStart = pmap_mem_regions[pmap_mem_regions_count].mrStart; /* Set the start of allocatable area */ - pmap_mem_regions[pmap_mem_regions_count].mrEnd = ((uint64_t)args->PhysicalDRAM[i].base >> bank_shift) + (cbsize >> 12) - 1; /* Set the end address of bank */ - pmap_mem_regions[pmap_mem_regions_count].mrAEnd = pmap_mem_regions[pmap_mem_regions_count].mrEnd; /* Set the end address of allocatable area */ - - /* Regions must be provided in ascending order */ - assert ((pmap_mem_regions_count == 0) || - pmap_mem_regions[pmap_mem_regions_count].mrStart > - pmap_mem_regions[pmap_mem_regions_count-1].mrStart); - - pmap_mem_regions_count++; /* Count this region */ - } - - mem_size = (unsigned int)max_mem; /* Get size of memory */ - if(max_mem > 0x0000000080000000ULL) mem_size = 0x80000000; /* Pin at 2 GB */ - - sane_size = max_mem; /* Calculate a sane value to use for init */ - if(sane_size > (addr64_t)(VM_MAX_KERNEL_ADDRESS 
+ 1)) - sane_size = (addr64_t)(VM_MAX_KERNEL_ADDRESS + 1); /* If flush with ram, use addressible portion */ - - -/* - * Initialize the pmap system, using space above `first_avail' - * for the necessary data structures. - * NOTE : assume that we'll have enough space mapped in already - */ - - first_avail = static_memory_end; - - /* - * Now retrieve addresses for end, edata, and etext - * from MACH-O headers for the currently running 32 bit kernel. - */ - /* XXX fix double casts for 64 bit kernel */ - sectTEXTB = (vm_offset_t)(uint32_t *)getsegdatafromheader( - &_mh_execute_header, "__TEXT", §SizeTEXT); - sectDATAB = (vm_offset_t)(uint32_t *)getsegdatafromheader( - &_mh_execute_header, "__DATA", §SizeDATA); - sectLINKB = (vm_offset_t)(uint32_t *)getsegdatafromheader( - &_mh_execute_header, "__LINKEDIT", §SizeLINK); - sectKLDB = (vm_offset_t)(uint32_t *)getsegdatafromheader( - &_mh_execute_header, "__KLD", §SizeKLD); - sectHIBB = (vm_offset_t)(uint32_t *)getsegdatafromheader( - &_mh_execute_header, "__HIB", §SizeHIB); - sectPRELINKB = (vm_offset_t)(uint32_t *)getsegdatafromheader( - &_mh_execute_header, "__PRELINK_TEXT", §SizePRELINK); - - etext = (vm_offset_t) sectTEXTB + sectSizeTEXT; - edata = (vm_offset_t) sectDATAB + sectSizeDATA; - end = round_page(getlastaddr()); /* Force end to next page */ - - kmapsize = (round_page(exception_end) - trunc_page(exception_entry)) + /* Get size we will map later */ - (round_page(sectTEXTB+sectSizeTEXT) - trunc_page(sectTEXTB)) + - (round_page(sectDATAB+sectSizeDATA) - trunc_page(sectDATAB)) + - (round_page(sectLINKB+sectSizeLINK) - trunc_page(sectLINKB)) + - (round_page(sectKLDB+sectSizeKLD) - trunc_page(sectKLDB)) + - (round_page_32(sectKLDB+sectSizeHIB) - trunc_page_32(sectHIBB)) + - (round_page(sectPRELINKB+sectSizePRELINK) - trunc_page(sectPRELINKB)) + - (round_page(static_memory_end) - trunc_page(end)); - - pmap_bootstrap(max_mem, &first_avail, kmapsize); - - pmap_map(trunc_page(exception_entry), 
trunc_page(exception_entry), - round_page(exception_end), VM_PROT_READ|VM_PROT_EXECUTE, VM_WIMG_USE_DEFAULT); - - pmap_map(trunc_page(sectTEXTB), trunc_page(sectTEXTB), - round_page(sectTEXTB+sectSizeTEXT), VM_PROT_READ|VM_PROT_EXECUTE, VM_WIMG_USE_DEFAULT); - - pmap_map(trunc_page(sectDATAB), trunc_page(sectDATAB), - round_page(sectDATAB+sectSizeDATA), VM_PROT_READ|VM_PROT_WRITE, VM_WIMG_USE_DEFAULT); - -/* The KLD and LINKEDIT segments are unloaded in toto after boot completes, -* but via ml_static_mfree(), through IODTFreeLoaderInfo(). Hence, we have -* to map both segments page-by-page. -*/ - - for (addr = trunc_page(sectPRELINKB); - addr < round_page(sectPRELINKB+sectSizePRELINK); - addr += PAGE_SIZE) { - - pmap_enter(kernel_pmap, (vm_map_offset_t)addr, (ppnum_t)(addr>>12), - VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE, - VM_WIMG_USE_DEFAULT, TRUE); - - } - - for (addr = trunc_page(sectKLDB); - addr < round_page(sectKLDB+sectSizeKLD); - addr += PAGE_SIZE) { - - pmap_enter(kernel_pmap, (vm_map_offset_t)addr, (ppnum_t)(addr>>12), - VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE, - VM_WIMG_USE_DEFAULT, TRUE); - - } - - for (addr = trunc_page(sectLINKB); - addr < round_page(sectLINKB+sectSizeLINK); - addr += PAGE_SIZE) { - - pmap_enter(kernel_pmap, (vm_map_offset_t)addr, - (ppnum_t)(addr>>12), - VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE, - VM_WIMG_USE_DEFAULT, TRUE); - - } - - for (addr = trunc_page_32(sectHIBB); - addr < round_page_32(sectHIBB+sectSizeHIB); - addr += PAGE_SIZE) { - - pmap_enter(kernel_pmap, (vm_map_offset_t)addr, (ppnum_t)(addr>>12), - VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE, - VM_WIMG_USE_DEFAULT, TRUE); - - } - - pmap_enter(kernel_pmap, (vm_map_offset_t)(uintptr_t)&sharedPage, - (ppnum_t)&sharedPage >> 12, /* Make sure the sharedPage is mapped */ - VM_PROT_READ|VM_PROT_WRITE, - VM_WIMG_USE_DEFAULT, TRUE); - - pmap_enter(kernel_pmap, (vm_map_offset_t)(uintptr_t)&lowGlo.lgVerCode, - (ppnum_t)&lowGlo.lgVerCode >> 12, /* Make sure the low memory 
globals are mapped */ - VM_PROT_READ|VM_PROT_WRITE, - VM_WIMG_USE_DEFAULT, TRUE); - -/* - * We need to map the remainder page-by-page because some of this will - * be released later, but not all. Ergo, no block mapping here - */ - - for(addr = trunc_page(end); addr < round_page(static_memory_end); addr += PAGE_SIZE) { - - pmap_enter(kernel_pmap, (vm_map_address_t)addr, (ppnum_t)addr>>12, - VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE, - VM_WIMG_USE_DEFAULT, TRUE); - - } - -/* - * Here we map a window into the kernel address space that will be used to - * access a slice of a user address space. Clients for this service include - * copyin/out and copypv. - */ - - lowGlo.lgUMWvaddr = USER_MEM_WINDOW_VADDR; - /* Initialize user memory window base address */ - MapUserMemoryWindowInit(); /* Go initialize user memory window */ - -/* - * At this point, there is enough mapped memory and all hw mapping structures are - * allocated and initialized. Here is where we turn on translation for the - * VERY first time.... - * - * NOTE: Here is where our very first interruption will happen. - * - */ - - hw_start_trans(); /* Start translating */ - PE_init_platform(TRUE, args); /* Initialize this right off the bat */ - - -#if 0 - GratefulDebInit((bootBumbleC *)&(args->Video)); /* Initialize the GratefulDeb debugger */ -#endif - - - printf_init(); /* Init this in case we need debugger */ - panic_init(); /* Init this in case we need debugger */ - PE_init_kprintf(TRUE); /* Note on PPC we only call this after VM is set up */ - - kprintf("kprintf initialized\n"); - - serialmode = 0; /* Assume normal keyboard and console */ - if(PE_parse_boot_argn("serial", &serialmode, sizeof (serialmode))) { /* Do we want a serial keyboard and/or console? 
*/ - kprintf("Serial mode specified: %08X\n", serialmode); - } - if(serialmode & 1) { /* Start serial if requested */ - (void)switch_to_serial_console(); /* Switch into serial mode */ - disableConsoleOutput = FALSE; /* Allow printfs to happen */ - } - - kprintf("max_mem: %ld M\n", (unsigned long)(max_mem >> 20)); - kprintf("version_variant = %s\n", version_variant); - kprintf("version = %s\n\n", version); - __asm__ ("mfpvr %0" : "=r" (pvr)); - kprintf("proc version = %08x\n", pvr); - if(getPerProc()->pf.Available & pf64Bit) { /* 64-bit processor? */ - xhid0 = hid0get64(); /* Get the hid0 */ - if(xhid0 & (1ULL << (63 - 19))) kprintf("Time base is externally clocked\n"); - else kprintf("Time base is internally clocked\n"); - } - - - taproot_size = PE_init_taproot(&taproot_addr); /* (BRINGUP) See if there is a taproot */ - if(taproot_size) { /* (BRINGUP) */ - kprintf("TapRoot card configured to use vaddr = %08X, size = %08X\n", taproot_addr, taproot_size); - bcopy_nc(version, (void *)(taproot_addr + 16), strlen(version)); /* (BRINGUP) Pass it our kernel version */ - __asm__ volatile("eieio"); /* (BRINGUP) */ - xtaproot = (unsigned int *)taproot_addr; /* (BRINGUP) */ - xtaproot[0] = 1; /* (BRINGUP) */ - __asm__ volatile("eieio"); /* (BRINGUP) */ - } - - PE_create_console(); /* create the console for verbose or pretty mode */ - - /* setup console output */ - PE_init_printf(FALSE); - -#if DEBUG - printf("\n\n\nThis program was compiled using gcc %d.%d for powerpc\n", - __GNUC__,__GNUC_MINOR__); - - - /* Processor version information */ - __asm__ ("mfpvr %0" : "=r" (pvr)); - printf("processor version register : %08X\n", pvr); - - kprintf("Args at %p\n", args); - for (i = 0; i < pmap_mem_regions_count; i++) { - printf("DRAM at %08lX size %08lX\n", - args->PhysicalDRAM[i].base, - args->PhysicalDRAM[i].size); - } -#endif /* DEBUG */ - -#if DEBUG - kprintf("Mapped memory:\n"); - kprintf(" exception vector: %08X, %08X - %08X\n", trunc_page(exception_entry), - 
trunc_page(exception_entry), round_page(exception_end)); - kprintf(" sectTEXTB: %08X, %08X - %08X\n", trunc_page(sectTEXTB), - trunc_page(sectTEXTB), round_page(sectTEXTB+sectSizeTEXT)); - kprintf(" sectDATAB: %08X, %08X - %08X\n", trunc_page(sectDATAB), - trunc_page(sectDATAB), round_page(sectDATAB+sectSizeDATA)); - kprintf(" sectLINKB: %08X, %08X - %08X\n", trunc_page(sectLINKB), - trunc_page(sectLINKB), round_page(sectLINKB+sectSizeLINK)); - kprintf(" sectKLDB: %08X, %08X - %08X\n", trunc_page(sectKLDB), - trunc_page(sectKLDB), round_page(sectKLDB+sectSizeKLD)); - kprintf(" end: %08X, %08X - %08X\n", trunc_page(end), - trunc_page(end), static_memory_end); - -#endif - - return; -} - diff --git a/osfmk/ppc/proc_reg.h b/osfmk/ppc/proc_reg.h deleted file mode 100644 index 6fb49c613..000000000 --- a/osfmk/ppc/proc_reg.h +++ /dev/null @@ -1,403 +0,0 @@ -/* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -#ifndef _PPC_PROC_REG_H_ -#define _PPC_PROC_REG_H_ - -#include - -/* Define some useful masks that convert from bit numbers */ - -#if __PPC__ -#ifdef __BIG_ENDIAN__ -#ifndef ENDIAN_MASK -#define ENDIAN_MASK(val,size) (1 << ((size-1) - val)) -#endif -#else -#error code not ported to little endian targets yet -#endif /* __BIG_ENDIAN__ */ -#endif /* __PPC__ */ - -#define MASK32(PART) ENDIAN_MASK(PART ## _BIT, 32) -#define MASK16(PART) ENDIAN_MASK(PART ## _BIT, 16) -#define MASK8(PART) ENDIAN_MASK(PART ## _BIT, 8) - -#undef MASK -#define MASK(PART) MASK32(PART) - -#define BITS_PER_WORD 32 -#define BITS_PER_WORD_POW2 5 - -/* Defines for decoding the MSR bits */ - -#define MSR_SF_BIT 0 -#define MSR_HV_BIT 3 -#define MSR_RES1_BIT 1 -#define MSR_RES2_BIT 2 -#define MSR_RES3_BIT 3 -#define MSR_RES4_BIT 4 -#define MSR_RES5_BIT 5 -#define MSR_VEC_BIT 6 -#define MSR_RES7_BIT 7 -#define MSR_RES8_BIT 8 -#define MSR_RES9_BIT 9 -#define MSR_RES10_BIT 10 -#define MSR_RES11_BIT 11 -#define MSR_KEY_BIT 12 /* Key bit on 603e (not on 603) */ -#define MSR_POW_BIT 13 -#define MSR_TGPR_BIT 14 /* Temporary GPR mappings on 603/603e */ -#define MSR_ILE_BIT 15 -#define MSR_EE_BIT 16 -#define MSR_PR_BIT 17 -#define MSR_FP_BIT 18 -#define MSR_ME_BIT 19 -#define MSR_FE0_BIT 20 -#define MSR_SE_BIT 21 -#define MSR_BE_BIT 22 -#define MSR_FE1_BIT 23 -#define MSR_RES24_BIT 24 /* AL bit in power architectures */ -#define MSR_IP_BIT 25 -#define 
MSR_IR_BIT 26 -#define MSR_DR_BIT 27 -#define MSR_RES28_BIT 28 -#define MSR_PM_BIT 29 -#define MSR_RI_BIT 30 -#define MSR_LE_BIT 31 - -/* MSR for kernel mode, interrupts disabled, running in virtual mode */ -#define MSR_SUPERVISOR_INT_OFF (MASK(MSR_ME) | MASK(MSR_IR) | MASK(MSR_DR)) - -/* MSR for above but with interrupts enabled */ -#define MSR_SUPERVISOR_INT_ON (MSR_SUPERVISOR_INT_OFF | MASK(MSR_EE)) - -/* MSR for physical mode code */ -#define MSR_VM_OFF (MASK(MSR_ME)) - -/* MSR for physical instruction, virtual data */ -#define MSR_PHYS_INST_VIRT_DATA (MASK(MSR_ME) | MASK(MSR_IR)) - -/* MSR mask for user-exported bits - identify bits that must be set/reset */ - -/* SET - external exceptions, machine check, vm on, user-level privs */ -#define MSR_EXPORT_MASK_SET (MASK(MSR_EE)| MASK(MSR_ME)| \ - MASK(MSR_IR)|MASK(MSR_DR)|MASK(MSR_PR)) - -/* only the following bits may be changed by a task */ -#define MSR_IMPORT_BITS (MASK(MSR_FE0)|MASK(MSR_SE)|MASK(MSR_BE)| \ - MASK(MSR_FE1)| MASK(MSR_PM) | MASK(MSR_LE)) - -#define MSR_PREPARE_FOR_IMPORT(origmsr, newmsr) \ - ((origmsr & ~MSR_IMPORT_BITS) | (newmsr & MSR_IMPORT_BITS)) - -#define MSR_VEC_ON (MASK(MSR_VEC)) - -#define USER_MODE(msr) (msr & MASK(MSR_PR) ? TRUE : FALSE) - -/* seg reg values must be simple expressions so that assembler can cope */ -#define SEG_REG_INVALID 0x0000 -#define KERNEL_SEG_REG0_VALUE 0x20000000 /* T=0,Ks=0,Ku=1 PPC_SID_KERNEL=0*/ - -/* For SEG_REG_PROT we have T=0, Ks=0, Ku=1 */ -#define SEG_REG_PROT 0x20000000 /* seg regs should have these bits set */ - -/* SR_COPYIN is used for copyin/copyout+remapping and must be - * saved and restored in the thread context. - */ -/* SR_UNUSED_BY_KERN is unused by the kernel, and thus contains - * the space ID of the currently interrupted user task immediately - * after an exception and before interrupts are reenabled. It's used - * purely for an assert. - */ - -/* SR_KERNEL used for asserts... 
*/ - -#define SR_COPYIN sr14 -#define SR_UNUSED_BY_KERN sr13 -#define SR_KERNEL sr0 - -#define SR_UNUSED_BY_KERN_NUM 13 -#define SR_COPYIN_NAME sr14 -#define SR_COPYIN_NUM 14 -#define BAT_INVALID 0 - - -/* DSISR bits on data access exceptions */ - -#define DSISR_IO_BIT 0 /* NOT USED on 601 */ -#define DSISR_HASH_BIT 1 -#define DSISR_NOEX_BIT 3 -#define DSISR_PROT_BIT 4 -#define DSISR_IO_SPC_BIT 5 -#define DSISR_WRITE_BIT 6 -#define DSISR_WATCH_BIT 9 -#define DSISR_EIO_BIT 11 - -#define dsiMiss 0x40000000 -#define dsiMissb 1 -#define dsiNoEx 0x10000000 -#define dsiProt 0x08000000 -#define dsiInvMode 0x04000000 -#define dsiStore 0x02000000 -#define dsiAC 0x00400000 -#define dsiSeg 0x00200000 -#define dsiValid 0x5E600000 -#define dsiLinkage 0x00010000 /* Linkage mapping type - software flag */ -#define dsiLinkageb 15 /* Linkage mapping type - software flag */ -#define dsiSoftware 0x0000FFFF - -/* SRR1 bits on data/instruction translation exceptions */ - -#define SRR1_TRANS_HASH_BIT 1 -#define SRR1_TRANS_IO_BIT 3 -#define SRR1_TRANS_PROT_BIT 4 -#define SRR1_TRANS_NO_PTE_BIT 10 - -/* SRR1 bits on program exceptions */ - -#define SRR1_PRG_FE_BIT 11 -#define SRR1_PRG_ILL_INS_BIT 12 -#define SRR1_PRG_PRV_INS_BIT 13 -#define SRR1_PRG_TRAP_BIT 14 - -/* - * Virtual to physical mapping macros/structures. - * IMPORTANT NOTE: there is one mapping per HW page, not per MACH page. 
 - */ - -#define PTE1_WIMG_GUARD_BIT 28 /* Needed for assembler */ -#define PTE1_REFERENCED_BIT 23 /* ditto */ -#define PTE1_CHANGED_BIT 24 -#define PTE0_HASH_ID_BIT 25 - -#define PTE_WIMG_CB_CACHED_COHERENT 0 /* cached, writeback, coherent (default) */ -#define PTE_WIMG_CB_CACHED_COHERENT_GUARDED 1 /* cached, writeback, coherent, guarded */ -#define PTE_WIMG_UNCACHED_COHERENT 2 /* uncached, coherent */ -#define PTE_WIMG_UNCACHED_COHERENT_GUARDED 3 /* uncached, coherent, guarded */ - -#define PTE_WIMG_DEFAULT PTE_WIMG_CB_CACHED_COHERENT -#define PTE_WIMG_IO PTE_WIMG_UNCACHED_COHERENT_GUARDED - - - -#ifndef ASSEMBLER -#ifdef __GNUC__ - -/* Structures and types for machine registers */ - - -/* - * C-helper inline functions for accessing machine registers follow. - */ - - -/* - * Various memory/IO synchronisation instructions - */ - - /* Use eieio as a memory barrier to order stores. - * Useful for device control and PTE maintenance. - */ - -#define eieio() \ - __asm__ volatile("eieio") - - /* Use sync to ensure previous stores have completed. - This is required when manipulating locks and/or - maintaining PTEs or other shared structures on SMP - machines. - */ - -#define sync() \ - __asm__ volatile("sync") - - /* Use isync to synchronize context; that is, to ensure - no prefetching of instructions happens before the - instruction. 
- */ - -#define isync() \ - __asm__ volatile("isync") - - -/* - * Access to various system registers - */ - -extern unsigned int mflr(void); - -extern __inline__ unsigned int mflr(void) -{ - unsigned int result; - __asm__ volatile("mflr %0" : "=r" (result)); - return result; -} - -extern unsigned int mfpvr(void); - -extern __inline__ unsigned int mfpvr(void) -{ - unsigned int result; - __asm__ ("mfpvr %0" : "=r" (result)); - return result; -} - -/* mtmsr might need syncs etc around it, don't provide simple - * inline macro - */ - -extern unsigned int mfmsr(void); - -extern __inline__ unsigned int mfmsr(void) -{ - unsigned int result; - __asm__ volatile("mfmsr %0" : "=r" (result)); - return result; -} - - -extern unsigned int mfdar(void); - -extern __inline__ unsigned int mfdar(void) -{ - unsigned int result; - __asm__ volatile("mfdar %0" : "=r" (result)); - return result; -} - -extern void mtdec(unsigned int val); - -extern __inline__ void mtdec(unsigned int val) -{ - __asm__ volatile("mtdec %0" : : "r" (val)); - return; -} - -extern void mttb(unsigned int val); - -extern __inline__ void mttb(unsigned int val) -{ - __asm__ volatile("mtspr tbl, %0" : : "r" (val)); - return; -} - -extern unsigned int mftb(void); - -extern __inline__ unsigned int mftb(void) -{ - unsigned int result; - __asm__ volatile("mftb %0" : "=r" (result)); - return result; -} - -extern void mttbu(unsigned int val); - -extern __inline__ void mttbu(unsigned int val) -{ - __asm__ volatile("mtspr tbu, %0" : : "r" (val)); - return; -} - -extern unsigned int mftbu(void); - -extern __inline__ unsigned int mftbu(void) -{ - unsigned int result; - __asm__ volatile("mftbu %0" : "=r" (result)); - return result; -} - -extern unsigned int mfl2cr(void); - -extern __inline__ unsigned int mfl2cr(void) -{ - unsigned int result; - __asm__ volatile("mfspr %0, l2cr" : "=r" (result)); - return result; -} - -extern unsigned int cntlzw(unsigned int num); - -extern __inline__ unsigned int cntlzw(unsigned int num) -{ - 
unsigned int result; - __asm__ volatile("cntlzw %0, %1" : "=r" (result) : "r" (num)); - return result; -} - - -/* functions for doing byte reversed loads and stores */ - -extern unsigned int lwbrx(unsigned int addr); - -extern __inline__ unsigned int lwbrx(unsigned int addr) -{ - unsigned int result; - __asm__ volatile("lwbrx %0, 0, %1" : "=r" (result) : "r" (addr)); - return result; -} - -extern void stwbrx(unsigned int data, unsigned int addr); - -extern __inline__ void stwbrx(unsigned int data, unsigned int addr) -{ - __asm__ volatile("stwbrx %0, 0, %1" : : "r" (data), "r" (addr)); -} - -/* Performance Monitor Register access routines */ -extern unsigned long mfmmcr0(void); -extern void mtmmcr0(unsigned long); -extern unsigned long mfmmcr1(void); -extern void mtmmcr1(unsigned long); -extern unsigned long mfmmcr2(void); -extern void mtmmcr2(unsigned long); -extern unsigned long mfpmc1(void); -extern void mtpmc1(unsigned long); -extern unsigned long mfpmc2(void); -extern void mtpmc2(unsigned long); -extern unsigned long mfpmc3(void); -extern void mtpmc3(unsigned long); -extern unsigned long mfpmc4(void); -extern void mtpmc4(unsigned long); -extern unsigned long mfsia(void); -extern unsigned long mfsda(void); - -/* macros since the argument n is a hard-coded constant */ - -#define mtsprg(n, reg) __asm__ volatile("mtsprg " # n ", %0" : : "r" (reg)) -#define mfsprg(reg, n) __asm__ volatile("mfsprg %0, " # n : "=r" (reg)) - -#define mtspr(spr, val) __asm__ volatile("mtspr " # spr ", %0" : : "r" (val)) -#define mfspr(reg, spr) __asm__ volatile("mfspr %0, " # spr : "=r" (reg)) - -#endif /* __GNUC__ */ -#endif /* !ASSEMBLER */ - -#endif /* _PPC_PROC_REG_H_ */ diff --git a/osfmk/ppc/rtclock.c b/osfmk/ppc/rtclock.c deleted file mode 100644 index 7c1222bd0..000000000 --- a/osfmk/ppc/rtclock.c +++ /dev/null @@ -1,306 +0,0 @@ -/* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * @APPLE_FREE_COPYRIGHT@ - */ -/* - * File: rtclock.c - * Purpose: Routines for handling the machine dependent - * real-time clock. - */ - -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include - -int rtclock_config(void); - -int rtclock_init(void); - -#define NSEC_PER_HZ (NSEC_PER_SEC / 100) - -static uint32_t rtclock_sec_divisor; - -static mach_timebase_info_data_t rtclock_timebase_const; - -static boolean_t rtclock_timebase_initialized; - -decl_simple_lock_data(static,rtclock_lock) - -/* - * Macros to lock/unlock real-time clock device. 
- */ -#define LOCK_RTC(s) \ -MACRO_BEGIN \ - (s) = splclock(); \ - simple_lock(&rtclock_lock); \ -MACRO_END - -#define UNLOCK_RTC(s) \ -MACRO_BEGIN \ - simple_unlock(&rtclock_lock); \ - splx(s); \ -MACRO_END - -static void -timebase_callback( - struct timebase_freq_t *freq) -{ - uint32_t numer, denom; - spl_t s; - - if ( freq->timebase_den < 1 || freq->timebase_den > 4 || - freq->timebase_num < freq->timebase_den ) - panic("rtclock timebase_callback: invalid constant %lu / %lu", - freq->timebase_num, freq->timebase_den); - - denom = freq->timebase_num; - numer = freq->timebase_den * NSEC_PER_SEC; - - LOCK_RTC(s); - if (!rtclock_timebase_initialized) { - commpage_set_timestamp(0,0,0); - - rtclock_timebase_const.numer = numer; - rtclock_timebase_const.denom = denom; - rtclock_sec_divisor = freq->timebase_num / freq->timebase_den; - - ml_init_lock_timeout(); - } - else { - UNLOCK_RTC(s); - printf("rtclock timebase_callback: late old %d / %d new %d / %d\n", - rtclock_timebase_const.numer, rtclock_timebase_const.denom, - numer, denom); - return; - } - UNLOCK_RTC(s); - - clock_timebase_init(); -} - -/* - * Configure the system clock device. - */ -int -rtclock_config(void) -{ - simple_lock_init(&rtclock_lock, 0); - - PE_register_timebase_callback(timebase_callback); - - return (1); -} - -/* - * Initialize the system clock device. 
- */ -int -rtclock_init(void) -{ - etimer_resync_deadlines(); /* Start the timers going */ - - return (1); -} - -void -clock_get_system_microtime( - uint32_t *secs, - uint32_t *microsecs) -{ - uint64_t now, t64; - uint32_t divisor; - - now = mach_absolute_time(); - - *secs = t64 = now / (divisor = rtclock_sec_divisor); - now -= (t64 * divisor); - *microsecs = (now * USEC_PER_SEC) / divisor; -} - -void -clock_get_system_nanotime( - uint32_t *secs, - uint32_t *nanosecs) -{ - uint64_t now, t64; - uint32_t divisor; - - now = mach_absolute_time(); - - *secs = t64 = now / (divisor = rtclock_sec_divisor); - now -= (t64 * divisor); - *nanosecs = (now * NSEC_PER_SEC) / divisor; -} - -void -clock_gettimeofday_set_commpage( - uint64_t abstime, - uint64_t epoch, - uint64_t offset, - uint32_t *secs, - uint32_t *microsecs) -{ - uint64_t t64, now = abstime; - - simple_lock(&rtclock_lock); - - now += offset; - - *secs = t64 = now / rtclock_sec_divisor; - now -= (t64 * rtclock_sec_divisor); - *microsecs = (now * USEC_PER_SEC) / rtclock_sec_divisor; - - *secs += epoch; - - commpage_set_timestamp(abstime - now, *secs, rtclock_sec_divisor); - - simple_unlock(&rtclock_lock); -} - -void -clock_timebase_info( - mach_timebase_info_t info) -{ - spl_t s; - - LOCK_RTC(s); - *info = rtclock_timebase_const; - rtclock_timebase_initialized = TRUE; - UNLOCK_RTC(s); -} - -void -clock_interval_to_absolutetime_interval( - uint32_t interval, - uint32_t scale_factor, - uint64_t *result) -{ - uint64_t nanosecs = (uint64_t)interval * scale_factor; - uint64_t t64; - uint32_t divisor; - - *result = (t64 = nanosecs / NSEC_PER_SEC) * - (divisor = rtclock_sec_divisor); - nanosecs -= (t64 * NSEC_PER_SEC); - *result += (nanosecs * divisor) / NSEC_PER_SEC; -} - -void -absolutetime_to_microtime( - uint64_t abstime, - uint32_t *secs, - uint32_t *microsecs) -{ - uint64_t t64; - uint32_t divisor; - - *secs = t64 = abstime / (divisor = rtclock_sec_divisor); - abstime -= (t64 * divisor); - *microsecs = (abstime * 
USEC_PER_SEC) / divisor; -} - -void -absolutetime_to_nanotime( - uint64_t abstime, - uint32_t *secs, - uint32_t *nanosecs) -{ - uint64_t t64; - uint32_t divisor; - - *secs = t64 = abstime / (divisor = rtclock_sec_divisor); - abstime -= (t64 * divisor); - *nanosecs = (abstime * NSEC_PER_SEC) / divisor; -} - -void -nanotime_to_absolutetime( - uint32_t secs, - uint32_t nanosecs, - uint64_t *result) -{ - uint32_t divisor = rtclock_sec_divisor; - - *result = ((uint64_t)secs * divisor) + - ((uint64_t)nanosecs * divisor) / NSEC_PER_SEC; -} - -void -absolutetime_to_nanoseconds( - uint64_t abstime, - uint64_t *result) -{ - uint64_t t64; - uint32_t divisor; - - *result = (t64 = abstime / (divisor = rtclock_sec_divisor)) * NSEC_PER_SEC; - abstime -= (t64 * divisor); - *result += (abstime * NSEC_PER_SEC) / divisor; -} - -void -nanoseconds_to_absolutetime( - uint64_t nanosecs, - uint64_t *result) -{ - uint64_t t64; - uint32_t divisor; - - *result = (t64 = nanosecs / NSEC_PER_SEC) * - (divisor = rtclock_sec_divisor); - nanosecs -= (t64 * NSEC_PER_SEC); - *result += (nanosecs * divisor) / NSEC_PER_SEC; -} - -void -machine_delay_until( - uint64_t deadline) -{ - uint64_t now; - - do { - now = mach_absolute_time(); - } while (now < deadline); -} diff --git a/osfmk/ppc/savearea.c b/osfmk/ppc/savearea.c deleted file mode 100644 index 0e95c5ff1..000000000 --- a/osfmk/ppc/savearea.c +++ /dev/null @@ -1,327 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * This file is used to maintain the exception save areas - * - */ - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -struct Saveanchor backpocket; /* Emergency saveareas */ -unsigned int debsave0 = 0; /* Debug flag */ -unsigned int backchain = 0; /* Debug flag */ - -/* - * These routines keep track of exception save areas and keeps the count within specific limits. If there are - * too few, more are allocated, too many, and they are released. This savearea is where the PCBs are - * stored. They never span a page boundary and are referenced by both virtual and real addresses. - * Within the interrupt vectors, the real address is used because at that level, no exceptions - * can be tolerated. Save areas can be dynamic or permanent. 
Permanent saveareas are allocated - * at boot time and must be in place before any type of exception occurs. These are never released, - * and the number is based upon some arbitrary (yet to be determined) amount times the number of - * processors. This represents the minimum number required to process a total system failure without - * destroying valuable and ever-so-handy system debugging information. - * - * We keep two global free lists (the savearea free pool and the savearea free list) and one local - * list per processor. - * - * The local lists are small and require no locked access. They are chained using physical addresses - * and no interruptions are allowed when adding to or removing from the list. Also known as the - * qfret list. This list is local to a processor and is intended for use only by very low level - * context handling code. - * - * The savearea free list is a medium size list that is globally accessible. It is updated - * while holding a simple lock. The length of time that the lock is held is kept short. The - * longest period of time is when the list is trimmed. Like the qfret lists, this is chained physically - * and must be accessed with translation and interruptions disabled. This is where the bulk - * of the free entries are located. - * - * The saveareas are allocated from full pages. A pool element is marked - * with an allocation map that shows which "slots" are free. These pages are allocated via the - * normal kernel memory allocation functions. Queueing is with physical addresses. The enqueue, - * dequeue, and search for free blocks are done under free list lock. - * only if there are empty slots in it. - * - * Saveareas are counted as "in use" once they are removed from the savearea free list. - * This means that all areas on the local qfret list are considered in use. - * - * There are two methods of obtaining a savearea. 
The save_get function (which is also inlined - * in the low-level exception handler) attempts to get an area from the local qfret list. This is - * done completely without locks. If qfret is exhausted (or maybe just too low) an area is allocated - * from the savearea free list. If the free list is empty, we install the back pocket areas and - * panic. - * - * The save_alloc function is designed to be called by high level routines, e.g., thread creation, - * etc. It will allocate from the free list. After allocation, it will compare the free count - * to the target value. If outside of the range, it will adjust the size either upwards or - * downwards. - * - * If we need to shrink the list, it will be trimmed to the target size and unlocked. The code - * will walk the chain and return each savearea to its pool page. If a pool page becomes - * completely empty, it is dequeued from the free pool list and enqueued (atomic queue - * function) to be released. - * - * Once the trim list is finished, the pool release queue is checked to see if there are pages - * waiting to be released. If so, they are released one at a time. - * - * If the free list needed to be grown rather than shrunken, we will first attempt to recover - * a page from the pending release queue (built when we trim the free list). If we find one, - * it is allocated; otherwise, a page of kernel memory is allocated. This loops until there are - * enough free saveareas. - * - */ - - - -/* - * Allocate our initial context save areas. As soon as we do this, - * we can take an interrupt. We do the saveareas here, 'cause they're guaranteed - * to be at least page aligned. - * - * Note: these initial saveareas are all to be allocated from V=R, less than 4GB - * space. 
- */ - - -void savearea_init(vm_offset_t addr) { - - savearea_comm *savec; - vm_offset_t save; - unsigned int i; - - - saveanchor.savetarget = InitialSaveTarget; /* Initial target value */ - saveanchor.saveinuse = 0; /* Number of areas in use */ - - saveanchor.savefree = 0; /* Remember the start of the free chain */ - saveanchor.savefreecnt = 0; /* Remember the length */ - saveanchor.savepoolfwd = (addr64_t)(uintptr_t)&saveanchor; /* Remember pool forward */ - saveanchor.savepoolbwd = (addr64_t)(uintptr_t)&saveanchor; /* Remember pool backward */ - - save = addr; /* Point to the whole block of blocks */ - -/* - * First we allocate the back pocket in case of emergencies - */ - - - for(i=0; i < BackPocketSaveBloks; i++) { /* Initialize the back pocket saveareas */ - - savec = (savearea_comm *)save; /* Get the control area for this one */ - - savec->sac_alloc = 0; /* Mark it allocated */ - savec->sac_vrswap = 0; /* V=R, so the translation factor is 0 */ - savec->sac_flags = sac_perm; /* Mark it permanent */ - savec->sac_flags |= 0x0000EE00; /* Debug eyecatcher */ - save_queue((uint32_t)savec >> 12); /* Add page to savearea lists */ - save += PAGE_SIZE; /* Jump up to the next one now */ - - } - - backpocket = saveanchor; /* Save this for emergencies */ - - -/* - * We've saved away the back pocket savearea info, so reset it all and - * now allocate for real - */ - - - saveanchor.savefree = 0; /* Remember the start of the free chain */ - saveanchor.savefreecnt = 0; /* Remember the length */ - saveanchor.saveadjust = 0; /* Set none needed yet */ - saveanchor.savepoolfwd = (addr64_t)(uintptr_t)&saveanchor; /* Remember pool forward */ - saveanchor.savepoolbwd = (addr64_t)(uintptr_t)&saveanchor; /* Remember pool backward */ - - for(i=0; i < InitialSaveBloks; i++) { /* Initialize the saveareas */ - - savec = (savearea_comm *)save; /* Get the control area for this one */ - - savec->sac_alloc = 0; /* Mark it allocated */ - savec->sac_vrswap = 0; /* V=R, so the translation 
factor is 0 */ - savec->sac_flags = sac_perm; /* Mark it permanent */ - savec->sac_flags |= 0x0000EE00; /* Debug eyecatcher */ - save_queue((uint32_t)savec >> 12); /* Add page to savearea lists */ - save += PAGE_SIZE; /* Jump up to the next one now */ - - } - -/* - * We now have a free list that has our initial number of entries - * The local qfret lists is empty. When we call save_get below it will see that - * the local list is empty and fill it for us. - * - * It is ok to call save_get here because all initial saveareas are V=R in less - * than 4GB space, so 32-bit addressing is ok. - * - */ - -/* - * This will populate the local list and get the first one for the system - */ - /* XXX next_savearea should be a void * 4425541 */ - getPerProc()->next_savearea = (unsigned long)(void *)save_get(); - -/* - * The system is now able to take interruptions - */ -} - - - - -/* - * Obtains a savearea. If the free list needs size adjustment it happens here. - * Don't actually allocate the savearea until after the adjustment is done. - */ - -struct savearea *save_alloc(void) { /* Reserve a save area */ - - - if(saveanchor.saveadjust) save_adjust(); /* If size need adjustment, do it now */ - - return save_get(); /* Pass the baby... */ -} - - -/* - * This routine releases a save area to the free queue. If after that, - * we have more than our maximum target, we start releasing what we can - * until we hit the normal target. - */ - -void -save_release(struct savearea *save) -{ - /* Return a savearea to the free list */ - save_ret(save); - - /* Adjust the savearea free list and pool size if needed */ - if(saveanchor.saveadjust) - save_adjust(); -} - -/* - * Adjusts the size of the free list. Can either release or allocate full pages - * of kernel memory. This can block. - * - * Note that we will only run one adjustment and the amount needed may change - * while we are executing. - * - * Calling this routine is triggered by saveanchor.saveadjust. 
This value is always calculated just before - * we unlock the saveanchor lock (this keeps it pretty accurate). If the total of savefreecnt and saveinuse - * is within the hysteresis range, it is set to 0. If outside, it is set to the number needed to bring - * the total to the target value. Note that there is a minimum size to the free list (FreeListMin) and if - * savefreecnt falls below that, saveadjust is set to the number needed to bring it to that. - */ - - -void save_adjust(void) { - - savearea_comm *sctl, *sctlnext, *freepage; - kern_return_t ret; - ppnum_t physpage; - - if(saveanchor.saveadjust < 0) { /* Do we need to adjust down? */ - - sctl = (savearea_comm *)save_trim_free(); /* Trim list to the need count, return start of trim list */ - - while(sctl) { /* Release the free pages back to the kernel */ - sctlnext = CAST_DOWN(savearea_comm *, sctl->save_prev); /* Get next in list */ - kmem_free(kernel_map, (vm_offset_t) sctl, PAGE_SIZE); /* Release the page */ - sctl = sctlnext; /* Chain onwards */ - } - } - else { /* We need more... */ - - if(save_recover()) return; /* If we can recover enough from the pool, return */ - - while(saveanchor.saveadjust > 0) { /* Keep going until we have enough */ - - ret = kmem_alloc_kobject(kernel_map, (vm_offset_t *)&freepage, PAGE_SIZE); /* Get a page for free pool */ - if(ret != KERN_SUCCESS) { /* Did we get some memory? */ - panic("Whoops... 
Not a bit of wired memory left for saveareas\n"); - } - - physpage = pmap_find_phys(kernel_pmap, (vm_offset_t)freepage); /* Find physical page */ - if(!physpage) { /* See if we actually have this mapped*/ - panic("save_adjust: wired page not mapped - va = %p\n", freepage); /* Die */ - } - - bzero((void *)freepage, PAGE_SIZE); /* Clear it all to zeros */ - freepage->sac_alloc = 0; /* Mark all entries taken */ - freepage->sac_vrswap = ((uint64_t)physpage << 12) ^ (uint64_t)((uintptr_t)freepage); /* XOR to calculate conversion mask */ - - freepage->sac_flags |= 0x0000EE00; /* Set debug eyecatcher */ - - save_queue(physpage); /* Add all saveareas on page to free list */ - } - } -} - -/* - * Fake up information to make the saveareas look like a zone - */ -void -save_fake_zone_info(int *count, vm_size_t *cur_size, vm_size_t *max_size, vm_size_t *elem_size, - vm_size_t *alloc_size, int *collectable, int *exhaustable) -{ - *count = saveanchor.saveinuse; - *cur_size = (saveanchor.savefreecnt + saveanchor.saveinuse) * (PAGE_SIZE / sac_cnt); - *max_size = saveanchor.savemaxcount * (PAGE_SIZE / sac_cnt); - *elem_size = sizeof(struct savearea); - *alloc_size = PAGE_SIZE; - *collectable = 1; - *exhaustable = 0; -} - - diff --git a/osfmk/ppc/savearea.h b/osfmk/ppc/savearea.h deleted file mode 100644 index 496591cb6..000000000 --- a/osfmk/ppc/savearea.h +++ /dev/null @@ -1,393 +0,0 @@ -/* - * Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#ifdef XNU_KERNEL_PRIVATE - -#ifndef _PPC_SAVEAREA_H_ -#define _PPC_SAVEAREA_H_ - -#ifndef ASSEMBLER - -#include - -#ifdef __APPLE_API_PRIVATE - -#if defined(MACH_KERNEL_PRIVATE) || defined(BSD_KERNEL_PRIVATE) -#include -#include - -#pragma pack(4) /* Make sure the structure stays as we defined it */ -typedef struct savearea_comm { - -/* - * The following fields are common to all saveareas and are used to manage individual - * contexts. - * - * Fields that start with "save" are part of the individual saveareas. Those that - * start with "sac" pertain to the free pool stuff and are valid only on the first slot - * in the page. - */ - - -/* Keep the save_prev, sac_next, and sac_prev in these positions, some assembler code depends upon it to - * match up with fields in saveanchor. 
- */ - /* offset 0x000 */ - addr64_t save_prev; /* The address of the previous (or next) savearea */ - addr64_t sac_next; /* Points to next savearea page that has a free slot - real */ - addr64_t sac_prev; /* Points to previous savearea page that has a free slot - real */ - unsigned int save_level; /* Context ID */ - unsigned int save_01C; - - /* 0x20 */ - unsigned int save_time[2]; /* Context save time - for debugging or performance */ - struct thread *save_act; /* Associated thread */ - unsigned int save_02c; - uint64_t sac_vrswap; /* XOR mask to swap V to R or vice versa */ - unsigned int save_flags; /* Various flags */ - unsigned int sac_flags; /* Various flags */ - - /* offset 0x040 */ - uint64_t save_misc0; /* Various stuff */ - uint64_t save_misc1; /* Various stuff - snapshot chain during hibernation */ - unsigned int sac_alloc; /* Bitmap of allocated slots */ - unsigned int save_054; - unsigned int save_misc2; - unsigned int save_misc3; - - /* offset 0x0060 */ -} savearea_comm; -#pragma pack() - -/* - * This type of savearea contains all of the general context. 
- */ - -#pragma pack(4) /* Make sure the structure stays as we defined it */ -typedef struct savearea { - - savearea_comm save_hdr; /* Stuff common to all saveareas */ - - uint64_t save_xdat0; /* Exception data 0 */ - uint64_t save_xdat1; /* Exception data 1 */ - uint64_t save_xdat2; /* Exception data 2 */ - uint64_t save_xdat3; /* Exception data 3 */ - /* offset 0x0080 */ - uint64_t save_r0; - uint64_t save_r1; - uint64_t save_r2; - uint64_t save_r3; - /* offset 0x0A0 */ - uint64_t save_r4; - uint64_t save_r5; - uint64_t save_r6; - uint64_t save_r7; - /* offset 0x0C0 */ - uint64_t save_r8; - uint64_t save_r9; - uint64_t save_r10; - uint64_t save_r11; - /* offset 0x0E0 */ - uint64_t save_r12; - uint64_t save_r13; - uint64_t save_r14; - uint64_t save_r15; - /* offset 0x100 */ - uint64_t save_r16; - uint64_t save_r17; - uint64_t save_r18; - uint64_t save_r19; - /* offset 0x120 */ - uint64_t save_r20; - uint64_t save_r21; - uint64_t save_r22; - uint64_t save_r23; - /* offset 0x140 */ - uint64_t save_r24; - uint64_t save_r25; - uint64_t save_r26; - uint64_t save_r27; - /* offset 0x160 */ - uint64_t save_r28; - uint64_t save_r29; - uint64_t save_r30; - uint64_t save_r31; - /* offset 0x180 */ - uint64_t save_srr0; - uint64_t save_srr1; - uint64_t save_xer; - uint64_t save_lr; - /* offset 0x1A0 */ - uint64_t save_ctr; - uint64_t save_dar; - unsigned int save_cr; - unsigned int save_dsisr; - unsigned int save_exception; - unsigned int save_vrsave; - /* offset 0x1C0 */ - unsigned int save_vscr[4]; - unsigned int save_fpscrpad; - unsigned int save_fpscr; - unsigned int save_1d8[2]; - /* offset 0x1E0 */ - unsigned int save_1E0[8]; - /* offset 0x200 - keep on 128 byte bndry */ - uint32_t save_pmc[8]; - uint64_t save_mmcr0; /* offset 0x220 */ - uint64_t save_mmcr1; - uint64_t save_mmcr2; - - unsigned int save_238[2]; - /* offset 0x240 */ - unsigned int save_instr[16]; /* Instrumentation or emulation. 
Note: save_instr[0] is number of instructions */ - /* offset 0x280 */ -} savearea_t; -#pragma pack() - - -/* - * This type of savearea contains all of the floating point context. - */ - -#pragma pack(4) /* Make sure the structure stays as we defined it */ -typedef struct savearea_fpu { - - savearea_comm save_hdr; /* Stuff common to all saveareas */ - - unsigned int save_060[8]; /* Fill 32 bytes */ - /* offset 0x0080 */ - double save_fp0; - double save_fp1; - double save_fp2; - double save_fp3; - - double save_fp4; - double save_fp5; - double save_fp6; - double save_fp7; - - double save_fp8; - double save_fp9; - double save_fp10; - double save_fp11; - - double save_fp12; - double save_fp13; - double save_fp14; - double save_fp15; - - double save_fp16; - double save_fp17; - double save_fp18; - double save_fp19; - - double save_fp20; - double save_fp21; - double save_fp22; - double save_fp23; - - double save_fp24; - double save_fp25; - double save_fp26; - double save_fp27; - - double save_fp28; - double save_fp29; - double save_fp30; - double save_fp31; - /* offset 0x180 */ - unsigned int save_180[8]; - unsigned int save_1A0[8]; - unsigned int save_1C0[8]; - unsigned int save_1E0[8]; - unsigned int save_200[8]; - unsigned int save_220[8]; - unsigned int save_240[8]; - unsigned int save_260[8]; - - /* offset 0x280 */ -} savearea_fpu; -#pragma pack() - - - -/* - * This type of savearea contains all of the vector context. 
- */ - -#pragma pack(4) /* Make sure the structure stays as we defined it */ -typedef struct savearea_vec { - - savearea_comm save_hdr; /* Stuff common to all saveareas */ - - unsigned int save_060[7]; /* Fill 32 bytes */ - unsigned int save_vrvalid; /* Valid registers in saved context */ - - /* offset 0x0080 */ - unsigned int save_vr0[4]; - unsigned int save_vr1[4]; - unsigned int save_vr2[4]; - unsigned int save_vr3[4]; - unsigned int save_vr4[4]; - unsigned int save_vr5[4]; - unsigned int save_vr6[4]; - unsigned int save_vr7[4]; - unsigned int save_vr8[4]; - unsigned int save_vr9[4]; - unsigned int save_vr10[4]; - unsigned int save_vr11[4]; - unsigned int save_vr12[4]; - unsigned int save_vr13[4]; - unsigned int save_vr14[4]; - unsigned int save_vr15[4]; - unsigned int save_vr16[4]; - unsigned int save_vr17[4]; - unsigned int save_vr18[4]; - unsigned int save_vr19[4]; - unsigned int save_vr20[4]; - unsigned int save_vr21[4]; - unsigned int save_vr22[4]; - unsigned int save_vr23[4]; - unsigned int save_vr24[4]; - unsigned int save_vr25[4]; - unsigned int save_vr26[4]; - unsigned int save_vr27[4]; - unsigned int save_vr28[4]; - unsigned int save_vr29[4]; - unsigned int save_vr30[4]; - unsigned int save_vr31[4]; - - /* offset 0x280 */ -} savearea_vec; -#pragma pack() -#endif /* MACH_KERNEL_PRIVATE || BSD_KERNEL_PRIVATE */ - -#ifdef MACH_KERNEL_PRIVATE - -#pragma pack(4) /* Make sure the structure stays as we defined it */ -struct Saveanchor { - -/* - * Note that this force aligned in aligned_data.s and must be in V=R storage. - * Also, all addresses in chains are physical. This structure can only be - * updated with translation and interrupts disabled. This is because it is - * locked during exception processing and if we were to take a PTE miss while the - * lock were held, well, that would be very bad now wouldn't it? - * Note that the first 24 bytes must be the same format as a savearea header. 
- */ - - unsigned int savelock; /* 000 Lock word for savearea free list manipulation */ - int saveRSVD4; /* 004 reserved */ - addr64_t savepoolfwd; /* 008 Forward anchor for the free pool */ - addr64_t savepoolbwd; /* 010 Backward anchor for the free pool */ - volatile addr64_t savefree; /* 018 Anchor for the global free list */ - volatile unsigned int savefreecnt; /* 020 Number of saveareas on global free list */ - volatile int saveadjust; /* 024 If 0 number of saveareas is ok, otherwise # to change (pos means grow, neg means shrink */ - volatile int saveinuse; /* 028 Number of areas in use counting those on the local free list */ - unsigned int savetarget; /* 02C Number of saveareas needed */ - int savemaxcount; /* 030 Maximum saveareas ever allocated */ - unsigned int saveinusesnapshot; /* 034 snapshot inuse count */ - volatile addr64_t savefreesnapshot; /* 038 snapshot global free list header */ -/* 040 */ -}; -#pragma pack() - -extern struct Saveanchor saveanchor; /* Aliged savearea anchor */ - -#define sac_cnt (4096 / sizeof(struct savearea)) /* Number of saveareas per page */ -#define sac_empty (0xFFFFFFFF << (32 - sac_cnt)) /* Mask with all entries empty */ -#define sac_perm 0x40000000 /* Page permanently assigned */ -#define sac_permb 1 /* Page permanently assigned - bit position */ - -#define LocalSaveTarget (((8 + sac_cnt - 1) / sac_cnt) * sac_cnt) /* Target for size of local savearea free list */ -#define LocalSaveMin (LocalSaveTarget / 2) /* Min size of local savearea free list before we grow */ -#define LocalSaveMax (LocalSaveTarget * 2) /* Max size of local savearea free list before we trim */ - -#define FreeListMin (2 * LocalSaveTarget) /* Always make sure there are enough to fill local list twice per processor */ -#define SaveLowHysteresis LocalSaveTarget /* The number off from target before we adjust upwards */ -#define SaveHighHysteresis (2 * FreeListMin) /* The number off from target before we adjust downwards */ -#define InitialSaveAreas (2 * 
FreeListMin) /* The number of saveareas to make at boot time */ -#define InitialSaveTarget FreeListMin /* The number of saveareas for an initial target. This should be the minimum ever needed. */ -#define InitialSaveBloks (InitialSaveAreas + sac_cnt - 1) / sac_cnt /* The number of savearea blocks to allocate at boot */ -#define BackPocketSaveBloks 8 /* Number of pages of back pocket saveareas */ - -void save_queue(ppnum_t); /* Add a new savearea block to the free list */ -addr64_t save_get_init(void); /* special savearea-get for cpu initialization (returns physical address) */ -struct savearea *save_get(void); /* Obtains a savearea from the free list (returns virtual address) */ -reg64_t save_get_phys_32(void); /* Obtains a savearea from the free list (returns phys addr in r3) */ -reg64_t save_get_phys_64(void); /* Obtains a savearea from the free list (returns phys addr in r3) */ -struct savearea *save_alloc(void); /* Obtains a savearea and allocates blocks if needed */ -struct savearea *save_cpv(addr64_t); /* Converts a physical savearea address to virtual */ -void save_ret(struct savearea *); /* Returns a savearea to the free list by virtual address */ -void save_ret_wMSR(struct savearea *, reg64_t); /* returns a savearea and restores an MSR */ -void save_ret_phys(reg64_t); /* Returns a savearea to the free list by physical address */ -void save_adjust(void); /* Adjust size of the global free list */ -struct savearea_comm *save_trim_free(void); /* Remove free pages from savearea pool */ -int save_recover(void); /* returns nonzero if we can recover enough from the free pool */ -void savearea_init(vm_offset_t addr); /* Boot-time savearea initialization */ - -void save_fake_zone_info( /* report savearea usage statistics as fake zone info */ - int *count, - vm_size_t *cur_size, - vm_size_t *max_size, - vm_size_t *elem_size, - vm_size_t *alloc_size, - int *collectable, - int *exhaustable); - -void save_snapshot(void); -void save_snapshot_restore(void); -void 
save_release(struct savearea *); - -#endif /* MACH_KERNEL_PRIVATE */ -#endif /* __APPLE_API_PRIVATE */ - -#endif /* ndef ASSEMBLER */ - -#define SAVattach 0x80000000 /* Savearea has valid context */ -#define SAVrststk 0x00010000 /* Indicates that the current stack should be reset to empty */ -#define SAVsyscall 0x00020000 /* Indicates that the savearea is associated with a syscall */ -#define SAVredrive 0x00040000 /* Indicates that the low-level fault handler associated */ -#define SAVredriveb 13 /* Indicates that the low-level fault handler associated */ -#define SAVinstrument 0x00080000 /* Indicates that we should return instrumentation data */ -#define SAVinstrumentb 12 /* Indicates that we should return instrumentation data */ -#define SAVeat 0x00100000 /* Indicates that interruption should be ignored */ -#define SAVeatb 11 /* Indicates that interruption should be ignored */ -#define SAVinject 0x00200000 /* Indicates that save_instr contains code to inject */ -#define SAVinjectb 10 /* Indicates that save_instr contains code to inject */ -#define SAVtype 0x0000FF00 /* Shows type of savearea */ -#define SAVtypeshft 8 /* Shift to position type */ -#define SAVempty 0x86 /* Savearea is on free list */ -#define SAVgeneral 0x01 /* Savearea contains general context */ -#define SAVfloat 0x02 /* Savearea contains floating point context */ -#define SAVvector 0x03 /* Savearea contains vector context */ - - - -#endif /* _PPC_SAVEAREA_H_ */ - -#endif /* XNU_KERNEL_PRIVATE */ diff --git a/osfmk/ppc/savearea_asm.s b/osfmk/ppc/savearea_asm.s deleted file mode 100644 index 6b42c7dd3..000000000 --- a/osfmk/ppc/savearea_asm.s +++ /dev/null @@ -1,1621 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). 
You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#define FPVECDBG 0 - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - - .text - -/* Register usage conventions in this code: - * r9 = return address - * r10 = per-proc ptr - * r11 = MSR at entry - * cr6 = feature flags (ie, pf64Bit) - * - * Because much of this code deals with physical addresses, - * there are parallel paths for 32- and 64-bit machines. - */ - - -/* - * ***************************** - * * s a v e _ s n a p s h o t * - * ***************************** - * - * void save_snapshot(); - * - * Link the current free list & processor local list on an independent list. - */ - .align 5 - .globl EXT(save_snapshot) - -LEXT(save_snapshot) - mflr r9 ; get return address - bl saveSetup ; turn translation off, 64-bit on, load many regs - bf-- pf64Bitb,save_snapshot32 ; skip if 32-bit processor - - ; Handle 64-bit processor. 
- -save_snapshot64: - - ld r8,next_savearea(r10) ; Start with the current savearea - std r8,SVsavefreesnapshot(0) ; Make it the restore list anchor - ld r5,SVfree(0) ; Get free save area list anchor - -save_snapshot64nextfree: - mr r7,r5 - std r7,savemisc1(r8) ; Link this one - ld r5,SAVprev(r7) ; Get the next - mr r8,r7 - mr. r0,r5 - bne save_snapshot64nextfree - - lwz r6,SVinuse(0) ; Get inuse count - ld r5,lclfree(r10) ; Get the local savearea list - subi r6,r6,1 ; Count the first as free - -save_snapshot64nextlocalfree: - subi r6,r6,1 ; Count as free - mr r7,r5 - std r7,savemisc1(r8) ; Link this one - ld r5,SAVprev(r7) ; Get the next - mr r8,r7 - mr. r0,r5 - bne save_snapshot64nextlocalfree - - std r5,savemisc1(r8) ; End the list - stw r6,SVsaveinusesnapshot(0) ; Save the new number of inuse saveareas - - mtlr r9 ; Restore the return - b saveRestore64 ; Restore interrupts and translation - - ; Handle 32-bit processor. - -save_snapshot32: - lwz r8,next_savearea+4(r10) ; Start with the current savearea - stw r8,SVsavefreesnapshot+4(0) ; Make it the restore list anchor - lwz r5,SVfree+4(0) ; Get free save area list anchor - -save_snapshot32nextfree: - mr r7,r5 - stw r7,savemisc1+4(r8) ; Link this one - lwz r5,SAVprev+4(r7) ; Get the next - mr r8,r7 - mr. r0,r5 - bne save_snapshot32nextfree - - lwz r6,SVinuse(0) ; Get inuse count - lwz r5,lclfree+4(r10) ; Get the local savearea list - subi r6,r6,1 ; Count the first as free - -save_snapshot32nextlocalfree: - subi r6,r6,1 ; Count as free - mr r7,r5 - stw r7,savemisc1+4(r8) ; Link this one - lwz r5,SAVprev+4(r7) ; Get the next - mr r8,r7 - mr. 
r0,r5 - bne save_snapshot32nextlocalfree - - stw r5,savemisc1+4(r8) ; End the list - stw r6,SVsaveinusesnapshot(0) ; Save the new number of inuse saveareas - - mtlr r9 ; Restore the return - b saveRestore32 ; Restore interrupts and translation - -/* - * ********************************************* - * * s a v e _ s n a p s h o t _ r e s t o r e * - * ********************************************* - * - * void save_snapshot_restore(); - * - * Restore the free list from the snapshot list, and reset the processors next savearea. - */ - .align 5 - .globl EXT(save_snapshot_restore) - -LEXT(save_snapshot_restore) - mflr r9 ; get return address - bl saveSetup ; turn translation off, 64-bit on, load many regs - bf-- pf64Bitb,save_snapshot_restore32 ; skip if 32-bit processor - - ; Handle 64-bit processor. - -save_snapshot_restore64: - lwz r7,SVsaveinusesnapshot(0) - stw r7,SVinuse(0) ; Set the new inuse count - - li r6,0 - stw r6,lclfreecnt(r10) ; None local now - std r6,lclfree(r10) ; None local now - - ld r8,SVsavefreesnapshot(0) ; Get the restore list anchor - std r8,SVfree(0) ; Make it the free list anchor - li r5,SAVempty ; Get marker for free savearea - -save_snapshot_restore64nextfree: - addi r6,r6,1 ; Count as free - stb r5,SAVflags+2(r8) ; Mark savearea free - ld r7,savemisc1(r8) ; Get the next - std r7,SAVprev(r8) ; Set the next in free list - mr. r8,r7 - bne save_snapshot_restore64nextfree - - stw r6,SVfreecnt(0) ; Set the new free count - - bl saveGet64 - std r3,next_savearea(r10) ; Get the next savearea - - mtlr r9 ; Restore the return - b saveRestore64 ; Restore interrupts and translation - - ; Handle 32-bit processor. 
- -save_snapshot_restore32: - lwz r7,SVsaveinusesnapshot(0) - stw r7,SVinuse(0) ; Set the new inuse count - - li r6,0 - stw r6,lclfreecnt(r10) ; None local now - stw r6,lclfree+4(r10) ; None local now - - lwz r8,SVsavefreesnapshot+4(0) ; Get the restore list anchor - stw r8,SVfree+4(0) ; Make it the free list anchor - li r5,SAVempty ; Get marker for free savearea - -save_snapshot_restore32nextfree: - addi r6,r6,1 ; Count as free - stb r5,SAVflags+2(r8) ; Mark savearea free - lwz r7,savemisc1+4(r8) ; Get the next - stw r7,SAVprev+4(r8) ; Set the next in free list - mr. r8,r7 - bne save_snapshot_restore32nextfree - - stw r6,SVfreecnt(0) ; Set the new free count - - bl saveGet32 - stw r3,next_savearea+4(r10) ; Get the next savearea - - mtlr r9 ; Restore the return - b saveRestore32 ; Restore interrupts and translation - -/* - * *********************** - * * s a v e _ q u e u e * - * *********************** - * - * void save_queue(ppnum_t pagenum); - * - * This routine will add a savearea block to the free list. - * We also queue the block to the free pool list. This is a - * circular double linked list. Because this block has no free entries, - * it gets queued to the end of the list - */ - .align 5 - .globl EXT(save_queue) - -LEXT(save_queue) - mflr r9 ; get return address - mr r8,r3 ; move pagenum out of the way - bl saveSetup ; turn translation off, 64-bit on, load many regs - bf-- pf64Bitb,saveQueue32 ; skip if 32-bit processor - - sldi r2,r8,12 ; r2 <-- phys address of page - li r8,sac_cnt ; Get the number of saveareas per page - mr r4,r2 ; Point to start of chain - li r0,SAVempty ; Get empty marker - -saveQueue64a: - addic. r8,r8,-1 ; Keep track of how many we did - stb r0,SAVflags+2(r4) ; Set empty - addi r7,r4,SAVsize ; Point to the next slot - ble- saveQueue64b ; We are done with the chain - std r7,SAVprev(r4) ; Set this chain - mr r4,r7 ; Step to the next - b saveQueue64a ; Fill the whole block... 
- -saveQueue64b: - bl savelock ; Go lock the save anchor - - ld r7,SVfree(0) ; Get the free save area list anchor - lwz r6,SVfreecnt(0) ; Get the number of free saveareas - - std r2,SVfree(0) ; Queue in the new one - addi r6,r6,sac_cnt ; Count the ones we are linking in - std r7,SAVprev(r4) ; Queue the old first one off of us - stw r6,SVfreecnt(0) ; Save the new count - b saveQueueExit - - ; Handle 32-bit processor. - -saveQueue32: - slwi r2,r8,12 ; r2 <-- phys address of page - li r8,sac_cnt ; Get the number of saveareas per page - mr r4,r2 ; Point to start of chain - li r0,SAVempty ; Get empty marker - -saveQueue32a: - addic. r8,r8,-1 ; Keep track of how many we did - stb r0,SAVflags+2(r4) ; Set empty - addi r7,r4,SAVsize ; Point to the next slot - ble- saveQueue32b ; We are done with the chain - stw r7,SAVprev+4(r4) ; Set this chain - mr r4,r7 ; Step to the next - b saveQueue32a ; Fill the whole block... - -saveQueue32b: - bl savelock ; Go lock the save anchor - - lwz r7,SVfree+4(0) ; Get the free save area list anchor - lwz r6,SVfreecnt(0) ; Get the number of free saveareas - - stw r2,SVfree+4(0) ; Queue in the new one - addi r6,r6,sac_cnt ; Count the ones we are linking in - stw r7,SAVprev+4(r4) ; Queue the old first one off of us - stw r6,SVfreecnt(0) ; Save the new count - -saveQueueExit: ; join here from 64-bit path - bl saveunlock ; Unlock the list and set the adjust count - mtlr r9 ; Restore the return - -#if FPVECDBG - mfsprg r2,1 ; (TEST/DEBUG) - mr. r2,r2 ; (TEST/DEBUG) - beq-- saveRestore ; (TEST/DEBUG) - lis r0,hi16(CutTrace) ; (TEST/DEBUG) - li r2,0x2201 ; (TEST/DEBUG) - oris r0,r0,lo16(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) -#endif - b saveRestore ; Restore interrupts and translation - -/* - * ***************************** - * * s a v e _ g e t _ i n i t * - * ***************************** - * - * addr64_t save_get_init(void); - * - * Note that save_get_init is used in initial processor startup only. 
It - * is used because translation is on, but no tables exist yet and we have - * no V=R BAT registers that cover the entire physical memory. - */ - .align 5 - .globl EXT(save_get_init) - -LEXT(save_get_init) - mflr r9 ; get return address - bl saveSetup ; turn translation off, 64-bit on, load many regs - bfl-- pf64Bitb,saveGet32 ; Get r3 <- savearea, r5 <- page address (with SAC) - btl++ pf64Bitb,saveGet64 ; get one on a 64-bit machine - bl saveRestore ; restore translation etc - mtlr r9 - - ; unpack the physaddr in r3 into a long long in (r3,r4) - - mr r4,r3 ; copy low word of phys address to r4 - li r3,0 ; assume upper word was 0 - bflr-- pf64Bitb ; if 32-bit processor, return - srdi r3,r4,32 ; unpack reg64_t to addr64_t on 64-bit machine - rlwinm r4,r4,0,0,31 - blr - - -/* - * ******************* - * * s a v e _ g e t * - * ******************* - * - * savearea *save_get(void); - * - * Allocate a savearea, returning a virtual address. NOTE: we must preserve - * r0, r2, and r12. Our callers in cswtch.s depend on this. 
- */ - .align 5 - .globl EXT(save_get) - -LEXT(save_get) - mflr r9 ; get return address - mr r5,r0 ; copy regs before saveSetup nails them - bl saveSetup ; turn translation off, 64-bit on, load many regs - bf-- pf64Bitb,svgt1 ; skip if 32-bit processor - - std r5,tempr0(r10) ; save r0 in per-proc across call to saveGet64 - std r2,tempr2(r10) ; and r2 - std r12,tempr4(r10) ; and r12 - bl saveGet64 ; get r3 <- savearea, r5 <- page address (with SAC) - ld r0,tempr0(r10) ; restore callers regs - ld r2,tempr2(r10) - ld r12,tempr4(r10) - b svgt2 - -svgt1: ; handle 32-bit processor - stw r5,tempr0+4(r10) ; save r0 in per-proc across call to saveGet32 - stw r2,tempr2+4(r10) ; and r2 - stw r12,tempr4+4(r10) ; and r12 - bl saveGet32 ; get r3 <- savearea, r5 <- page address (with SAC) - lwz r0,tempr0+4(r10) ; restore callers regs - lwz r2,tempr2+4(r10) - lwz r12,tempr4+4(r10) - -svgt2: - lwz r5,SACvrswap+4(r5) ; Get the virtual to real translation (only need low word) - mtlr r9 ; restore return address - xor r3,r3,r5 ; convert physaddr to virtual - rlwinm r3,r3,0,0,31 ; 0 upper word if a 64-bit machine - -#if FPVECDBG - mr r6,r0 ; (TEST/DEBUG) - mr r7,r2 ; (TEST/DEBUG) - mfsprg r2,1 ; (TEST/DEBUG) - mr. r2,r2 ; (TEST/DEBUG) - beq-- svgDBBypass ; (TEST/DEBUG) - lis r0,HIGH_ADDR(CutTrace) ; (TEST/DEBUG) - li r2,0x2203 ; (TEST/DEBUG) - oris r0,r0,LOW_ADDR(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) -svgDBBypass: ; (TEST/DEBUG) - mr r0,r6 ; (TEST/DEBUG) - mr r2,r7 ; (TEST/DEBUG) -#endif - b saveRestore ; restore MSR and return to our caller - - -/* - * *********************************** - * * s a v e _ g e t _ p h y s _ 3 2 * - * *********************************** - * - * reg64_t save_get_phys(void); - * - * This is the entry normally called from lowmem_vectors.s with - * translation and interrupts already off. 
- * MUST NOT TOUCH CR7 - */ - .align 5 - .globl EXT(save_get_phys_32) - -LEXT(save_get_phys_32) - mfsprg r10,0 ; get the per-proc ptr - b saveGet32 ; Get r3 <- savearea, r5 <- page address (with SAC) - - -/* - * *********************************** - * * s a v e _ g e t _ p h y s _ 6 4 * - * *********************************** - * - * reg64_t save_get_phys_64(void); - * - * This is the entry normally called from lowmem_vectors.s with - * translation and interrupts already off, and in 64-bit mode. - * MUST NOT TOUCH CR7 - */ - .align 5 - .globl EXT(save_get_phys_64) - -LEXT(save_get_phys_64) - mfsprg r10,0 ; get the per-proc ptr - b saveGet64 ; Get r3 <- savearea, r5 <- page address (with SAC) - - -/* - * ********************* - * * s a v e G e t 6 4 * - * ********************* - * - * This is the internal routine to allocate a savearea on a 64-bit processor. - * Note that we must not take any exceptions of any kind, including PTE misses, as that - * would deadlock trying to reenter this routine. We pass back the 64-bit physical address. - * First we try the local list. If that is below a threshold, we try the global free list, - * which requires taking a lock, and replenish. If there are no saveareas in either list, - * we will install the backpocket and choke. This routine assumes that the caller has - * turned translation off, masked interrupts, turned on 64-bit mode, and set up: - * r10 = per-proc ptr - * - * We return: - * r3 = 64-bit physical address of the savearea - * r5 = 64-bit physical address of the page the savearea is in, with SAC - * - * We destroy: - * r2-r8. - * - * MUST NOT TOUCH CR7 - */ - -saveGet64: - lwz r8,lclfreecnt(r10) ; Get the count - ld r3,lclfree(r10) ; Get the start of local savearea list - cmplwi r8,LocalSaveMin ; Are we too low? - ble-- saveGet64GetGlobal ; We are too low and need to grow list... - - ; Get it from the per-processor local list. 
- -saveGet64GetLocal: - li r2,0x5555 ; get r2 <-- 0x55555555 55555555, our bugbug constant - ld r4,SAVprev(r3) ; Chain to the next one - oris r2,r2,0x5555 - subi r8,r8,1 ; Back down count - rldimi r2,r2,32,0 - - std r2,SAVprev(r3) ; bug next ptr - stw r2,SAVlevel(r3) ; bug context ID - li r6,0 - std r4,lclfree(r10) ; Unchain first savearea - stw r2,SAVact(r3) ; bug activation ptr - rldicr r5,r3,0,51 ; r5 <-- page ptr, where SAC is kept - stw r8,lclfreecnt(r10) ; Set new count - stw r6,SAVflags(r3) ; clear the flags - - blr - - ; Local list was low so replenish from global list. - ; r7 = return address to caller of saveGet64 - ; r8 = lclfreecnt - ; r10 = per-proc ptr - -saveGet64GetGlobal: - mflr r7 ; save return adress - subfic r5,r8,LocalSaveTarget ; Get the number of saveareas we need to grab to get to target - bl savelock ; Go lock up the anchor - - lwz r2,SVfreecnt(0) ; Get the number on this list - ld r8,SVfree(0) ; Get the head of the save area list - - sub r3,r2,r5 ; Get number left after we swipe enough for local list - sradi r3,r3,63 ; Get 0 if enough or -1 if not - andc r4,r5,r3 ; Get number to get if there are enough, 0 otherwise - and r5,r2,r3 ; Get 0 if there are enough, number on list otherwise - or. r5,r4,r5 ; r5 <- number we will move from global to local list - beq-- saveGet64NoFree ; There are none to get... - - mtctr r5 ; Get loop count - mr r6,r8 ; Remember the first in the list - -saveGet64c: - bdz saveGet64d ; Count down and branch when we hit 0... - ld r8,SAVprev(r8) ; Get the next - b saveGet64c ; Keep going... 
- -saveGet64d: - ld r3,SAVprev(r8) ; Get the next one - lwz r4,SVinuse(0) ; Get the in use count - sub r2,r2,r5 ; Count down what we stole - std r3,SVfree(0) ; Set the new first in list - add r4,r4,r5 ; Count the ones we just put in the local list as "in use" - stw r2,SVfreecnt(0) ; Set the new count - stw r4,SVinuse(0) ; Set the new in use count - - ld r4,lclfree(r10) ; Get the old head of list - lwz r3,lclfreecnt(r10) ; Get the old count - std r6,lclfree(r10) ; Set the new head of the list - add r3,r3,r5 ; Get the new count - std r4,SAVprev(r8) ; Point to the old head - stw r3,lclfreecnt(r10) ; Set the new count - - bl saveunlock ; Update the adjust field and unlock - mtlr r7 ; restore return address - b saveGet64 ; Start over and finally allocate the savearea... - - ; The local list is below the repopulate threshold and the global list is empty. - ; First we check if there are any left in the local list and if so, we allow - ; them to be allocated. If not, we release the backpocket list and choke. - ; There is nothing more that we can do at this point. Hopefully we stay alive - ; long enough to grab some much-needed panic information. - ; r7 = return address to caller of saveGet64 - ; r10 = per-proc ptr - -saveGet64NoFree: - lwz r8,lclfreecnt(r10) ; Get the count - mr. r8,r8 ; Are there any reserve to get? - beq-- saveGet64Choke ; No, go choke and die... - bl saveunlock ; Update the adjust field and unlock - ld r3,lclfree(r10) ; Get the start of local savearea list - lwz r8,lclfreecnt(r10) ; Get the count - mtlr r7 ; restore return address - b saveGet64GetLocal ; We have some left, dip on in... - -; We who are about to die salute you. The savearea chain is messed up or -; empty. Add in a few so we have enough to take down the system. 
- -saveGet64Choke: - lis r9,hi16(EXT(backpocket)) ; Get high order of back pocket - ori r9,r9,lo16(EXT(backpocket)) ; and low part - - lwz r8,SVfreecnt-saveanchor(r9) ; Get the new number of free elements - ld r7,SVfree-saveanchor(r9) ; Get the head of the chain - lwz r6,SVinuse(0) ; Get total in the old list - - stw r8,SVfreecnt(0) ; Set the new number of free elements - add r6,r6,r8 ; Add in the new ones - std r7,SVfree(0) ; Set the new head of the chain - stw r6,SVinuse(0) ; Set total in the new list - -saveGetChokeJoin: ; join in the fun from 32-bit mode - lis r0,hi16(Choke) ; Set choke firmware call - li r7,0 ; Get a clear register to unlock - ori r0,r0,lo16(Choke) ; Set the rest of the choke call - li r3,failNoSavearea ; Set failure code - - eieio ; Make sure all is committed - stw r7,SVlock(0) ; Unlock the free list - sc ; System ABEND - - -/* - * ********************* - * * s a v e G e t 3 2 * - * ********************* - * - * This is the internal routine to allocate a savearea on a 32-bit processor. - * Note that we must not take any exceptions of any kind, including PTE misses, as that - * would deadlock trying to reenter this routine. We pass back the 32-bit physical address. - * First we try the local list. If that is below a threshold, we try the global free list, - * which requires taking a lock, and replenish. If there are no saveareas in either list, - * we will install the backpocket and choke. This routine assumes that the caller has - * turned translation off, masked interrupts, and set up: - * r10 = per-proc ptr - * - * We return: - * r3 = 32-bit physical address of the savearea - * r5 = 32-bit physical address of the page the savearea is in, with SAC - * - * We destroy: - * r2-r8. - */ - -saveGet32: - lwz r8,lclfreecnt(r10) ; Get the count - lwz r3,lclfree+4(r10) ; Get the start of local savearea list - cmplwi r8,LocalSaveMin ; Are we too low? - ble- saveGet32GetGlobal ; We are too low and need to grow list... 
- - ; Get savearea from per-processor local list. - -saveGet32GetLocal: - li r2,0x5555 ; get r2 <-- 0x55555555, our bugbug constant - lwz r4,SAVprev+4(r3) ; Chain to the next one - oris r2,r2,0x5555 - subi r8,r8,1 ; Back down count - - stw r2,SAVprev+4(r3) ; bug next ptr - stw r2,SAVlevel(r3) ; bug context ID - li r6,0 - stw r4,lclfree+4(r10) ; Unchain first savearea - stw r2,SAVact(r3) ; bug activation ptr - rlwinm r5,r3,0,0,19 ; r5 <-- page ptr, where SAC is kept - stw r8,lclfreecnt(r10) ; Set new count - stw r6,SAVflags(r3) ; clear the flags - - blr - - ; Local list was low so replenish from global list. - ; r7 = return address to caller of saveGet32 - ; r8 = lclfreecnt - ; r10 = per-proc ptr - -saveGet32GetGlobal: - mflr r7 ; save return adress - subfic r5,r8,LocalSaveTarget ; Get the number of saveareas we need to grab to get to target - bl savelock ; Go lock up the anchor - - lwz r2,SVfreecnt(0) ; Get the number on this list - lwz r8,SVfree+4(0) ; Get the head of the save area list - - sub r3,r2,r5 ; Get number left after we swipe enough for local list - srawi r3,r3,31 ; Get 0 if enough or -1 if not - andc r4,r5,r3 ; Get number to get if there are enough, 0 otherwise - and r5,r2,r3 ; Get 0 if there are enough, number on list otherwise - or. r5,r4,r5 ; r5 <- number we will move from global to local list - beq- saveGet32NoFree ; There are none to get... - - mtctr r5 ; Get loop count - mr r6,r8 ; Remember the first in the list - -saveGet32c: - bdz saveGet32d ; Count down and branch when we hit 0... - lwz r8,SAVprev+4(r8) ; Get the next - b saveGet32c ; Keep going... 
- -saveGet32d: - lwz r3,SAVprev+4(r8) ; Get the next one - lwz r4,SVinuse(0) ; Get the in use count - sub r2,r2,r5 ; Count down what we stole - stw r3,SVfree+4(0) ; Set the new first in list - add r4,r4,r5 ; Count the ones we just put in the local list as "in use" - stw r2,SVfreecnt(0) ; Set the new count - stw r4,SVinuse(0) ; Set the new in use count - - lwz r4,lclfree+4(r10) ; Get the old head of list - lwz r3,lclfreecnt(r10) ; Get the old count - stw r6,lclfree+4(r10) ; Set the new head of the list - add r3,r3,r5 ; Get the new count - stw r4,SAVprev+4(r8) ; Point to the old head - stw r3,lclfreecnt(r10) ; Set the new count - - bl saveunlock ; Update the adjust field and unlock - mtlr r7 ; restore return address - b saveGet32 ; Start over and finally allocate the savearea... - - ; The local list is below the repopulate threshold and the global list is empty. - ; First we check if there are any left in the local list and if so, we allow - ; them to be allocated. If not, we release the backpocket list and choke. - ; There is nothing more that we can do at this point. Hopefully we stay alive - ; long enough to grab some much-needed panic information. - ; r7 = return address to caller of saveGet32 - ; r10 = per-proc ptr - -saveGet32NoFree: - lwz r8,lclfreecnt(r10) ; Get the count - mr. r8,r8 ; Are there any reserve to get? - beq- saveGet32Choke ; No, go choke and die... - bl saveunlock ; Update the adjust field and unlock - lwz r3,lclfree+4(r10) ; Get the start of local savearea list - lwz r8,lclfreecnt(r10) ; Get the count - mtlr r7 ; restore return address - b saveGet32GetLocal ; We have some left, dip on in... - -; We who are about to die salute you. The savearea chain is messed up or -; empty. Add in a few so we have enough to take down the system. 
- -saveGet32Choke: - lis r9,hi16(EXT(backpocket)) ; Get high order of back pocket - ori r9,r9,lo16(EXT(backpocket)) ; and low part - - lwz r8,SVfreecnt-saveanchor(r9) ; Get the new number of free elements - lwz r7,SVfree+4-saveanchor(r9) ; Get the head of the chain - lwz r6,SVinuse(0) ; Get total in the old list - - stw r8,SVfreecnt(0) ; Set the new number of free elements - add r6,r6,r8 ; Add in the new ones (why?) - stw r7,SVfree+4(0) ; Set the new head of the chain - stw r6,SVinuse(0) ; Set total in the new list - - b saveGetChokeJoin - - -/* - * ******************* - * * s a v e _ r e t * - * ******************* - * - * void save_ret(struct savearea *); // normal call - * void save_ret_wMSR(struct savearea *,reg64_t); // passes MSR to restore as 2nd arg - * - * Return a savearea passed by virtual address to the free list. - * Note really well: we can take NO exceptions of any kind, - * including a PTE miss once the savearea lock is held. That's - * a guaranteed deadlock. That means we must disable for interrutions - * and turn all translation off. 
- */ - .globl EXT(save_ret_wMSR) ; alternate entry pt w MSR to restore in r4 - -LEXT(save_ret_wMSR) - crset 31 ; set flag for save_ret_wMSR - b svrt1 ; join common code - - .align 5 - .globl EXT(save_ret) - -LEXT(save_ret) - crclr 31 ; clear flag for save_ret_wMSR -svrt1: ; join from save_ret_wMSR - mflr r9 ; get return address - rlwinm r7,r3,0,0,19 ; get virtual address of SAC area at start of page - mr r8,r3 ; save virtual address - lwz r5,SACvrswap+0(r7) ; get 64-bit converter from V to R - lwz r6,SACvrswap+4(r7) ; both halves, though only bottom used on 32-bit machine -#if FPVECDBG - lis r0,HIGH_ADDR(CutTrace) ; (TEST/DEBUG) - li r2,0x2204 ; (TEST/DEBUG) - oris r0,r0,LOW_ADDR(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) -#endif - bl saveSetup ; turn translation off, 64-bit on, load many regs - bf++ 31,svrt3 ; skip if not save_ret_wMSR - mr r11,r4 ; was save_ret_wMSR, so overwrite saved MSR -svrt3: - bf-- pf64Bitb,svrt4 ; skip if a 32-bit processor - - ; Handle 64-bit processor. - - rldimi r6,r5,32,0 ; merge upper and lower halves of SACvrswap together - xor r3,r8,r6 ; get r3 <- 64-bit physical address of this savearea - bl saveRet64 ; return it - mtlr r9 ; restore return address - b saveRestore64 ; restore MSR - - ; Handle 32-bit processor. - -svrt4: - xor r3,r8,r6 ; get r3 <- 32-bit physical address of this savearea - bl saveRet32 ; return it - mtlr r9 ; restore return address - b saveRestore32 ; restore MSR - - -/* - * ***************************** - * * s a v e _ r e t _ p h y s * - * ***************************** - * - * void save_ret_phys(reg64_t); - * - * Called from lowmem vectors to return (ie, free) a savearea by physical address. - * Translation and interrupts are already off, and 64-bit mode is set if defined. - * We can take _no_ exceptions of any kind in this code, including PTE miss, since - * that would result in a deadlock. 
We expect: - * r3 = phys addr of savearea - * msr = IR, DR, and EE off, SF on - * cr6 = pf64Bit flag - * We destroy: - * r0,r2-r10. - */ - .align 5 - .globl EXT(save_ret_phys) - -LEXT(save_ret_phys) - mfsprg r10,0 ; get the per-proc ptr - bf-- pf64Bitb,saveRet32 ; handle 32-bit machine - b saveRet64 ; handle 64-bit machine - - -/* - * ********************* - * * s a v e R e t 6 4 * - * ********************* - * - * This is the internal routine to free a savearea, passed by 64-bit physical - * address. We assume that IR, DR, and EE are all off, that SF is on, and: - * r3 = phys address of the savearea - * r10 = per-proc ptr - * We destroy: - * r0,r2-r8. - */ - .align 5 - saveRet64: - li r0,SAVempty ; Get marker for free savearea - lwz r7,lclfreecnt(r10) ; Get the local count - ld r6,lclfree(r10) ; Get the old local header - addi r7,r7,1 ; Pop up the free count - std r6,SAVprev(r3) ; Plant free chain pointer - cmplwi r7,LocalSaveMax ; Has the list gotten too long? - stb r0,SAVflags+2(r3) ; Mark savearea free - std r3,lclfree(r10) ; Chain us on in - stw r7,lclfreecnt(r10) ; Bump up the count - bltlr++ ; List not too long, so done - -/* The local savearea chain has gotten too long. Trim it down to the target. - * Here's a tricky bit, and important: - * - * When we trim the list, we NEVER trim the very first one. This is because that is - * the very last one released and the exception exit code will release the savearea - * BEFORE it is done using it. Wouldn't be too good if another processor started - * using it, eh? So for this case, we are safe so long as the savearea stays on - * the local list. (Note: the exit routine needs to do this because it is in the - * process of restoring all context and it needs to keep it until the last second.) 
- */ - - mflr r0 ; save return to caller of saveRet64 - mr r2,r3 ; r2 <- 1st one on local list, which must not be trimmed - ld r3,SAVprev(r3) ; Skip over the first - subi r7,r7,LocalSaveTarget ; Figure out how much to trim - mr r6,r3 ; r6 <- first one to trim - mr r5,r7 ; Save the number we are trimming - -saveRet64a: - addic. r7,r7,-1 ; Any left to do? - ble-- saveRet64b ; Nope... - ld r3,SAVprev(r3) ; Skip to the next one - b saveRet64a ; Keep going... - -saveRet64b: ; r3 <- last one to trim - ld r7,SAVprev(r3) ; Point to the first one not to trim - li r4,LocalSaveTarget ; Set the target count - std r7,SAVprev(r2) ; Trim stuff leaving the one just released as first - stw r4,lclfreecnt(r10) ; Set the current count - - bl savelock ; Lock up the anchor - - ld r8,SVfree(0) ; Get the old head of the free list - lwz r4,SVfreecnt(0) ; Get the number of free ones - lwz r7,SVinuse(0) ; Get the number that are in use - std r6,SVfree(0) ; Point to the first trimmed savearea - add r4,r4,r5 ; Add number trimmed to free count - std r8,SAVprev(r3) ; Chain the old head to the tail of the trimmed guys - sub r7,r7,r5 ; Remove the trims from the in use count - stw r4,SVfreecnt(0) ; Set new free count - stw r7,SVinuse(0) ; Set new in use count - - mtlr r0 ; Restore the return to our caller - b saveunlock ; Set adjust count, unlock the saveanchor, and return - - -/* - * ********************* - * * s a v e R e t 3 2 * - * ********************* - * - * This is the internal routine to free a savearea, passed by 32-bit physical - * address. We assume that IR, DR, and EE are all off, and: - * r3 = phys address of the savearea - * r10 = per-proc ptr - * We destroy: - * r0,r2-r8. 
- */ - .align 5 - saveRet32: - li r0,SAVempty ; Get marker for free savearea - lwz r7,lclfreecnt(r10) ; Get the local count - lwz r6,lclfree+4(r10) ; Get the old local header - addi r7,r7,1 ; Pop up the free count - stw r6,SAVprev+4(r3) ; Plant free chain pointer - cmplwi r7,LocalSaveMax ; Has the list gotten too long? - stb r0,SAVflags+2(r3) ; Mark savearea free - stw r3,lclfree+4(r10) ; Chain us on in - stw r7,lclfreecnt(r10) ; Bump up the count - bltlr+ ; List not too long, so done - -/* The local savearea chain has gotten too long. Trim it down to the target. - * Here's a tricky bit, and important: - * - * When we trim the list, we NEVER trim the very first one. This is because that is - * the very last one released and the exception exit code will release the savearea - * BEFORE it is done using it. Wouldn't be too good if another processor started - * using it, eh? So for this case, we are safe so long as the savearea stays on - * the local list. (Note: the exit routine needs to do this because it is in the - * process of restoring all context and it needs to keep it until the last second.) - */ - - mflr r0 ; save return to caller of saveRet32 - mr r2,r3 ; r2 <- 1st one on local list, which must not be trimmed - lwz r3,SAVprev+4(r3) ; Skip over the first - subi r7,r7,LocalSaveTarget ; Figure out how much to trim - mr r6,r3 ; r6 <- first one to trim - mr r5,r7 ; Save the number we are trimming - -saveRet32a: - addic. r7,r7,-1 ; Any left to do? - ble- saveRet32b ; Nope... - lwz r3,SAVprev+4(r3) ; Skip to the next one - b saveRet32a ; Keep going... 
- -saveRet32b: ; r3 <- last one to trim - lwz r7,SAVprev+4(r3) ; Point to the first one not to trim - li r4,LocalSaveTarget ; Set the target count - stw r7,SAVprev+4(r2) ; Trim stuff leaving the one just released as first - stw r4,lclfreecnt(r10) ; Set the current count - - bl savelock ; Lock up the anchor - - lwz r8,SVfree+4(0) ; Get the old head of the free list - lwz r4,SVfreecnt(0) ; Get the number of free ones - lwz r7,SVinuse(0) ; Get the number that are in use - stw r6,SVfree+4(0) ; Point to the first trimmed savearea - add r4,r4,r5 ; Add number trimmed to free count - stw r8,SAVprev+4(r3) ; Chain the old head to the tail of the trimmed guys - sub r7,r7,r5 ; Remove the trims from the in use count - stw r4,SVfreecnt(0) ; Set new free count - stw r7,SVinuse(0) ; Set new in use count - - mtlr r0 ; Restore the return to our caller - b saveunlock ; Set adjust count, unlock the saveanchor, and return - - -/* - * ******************************* - * * s a v e _ t r i m _ f r e e * - * ******************************* - * - * struct savearea_comm *save_trim_free(void); - * - * Trim the free list down to the target count, ie by -(SVadjust) save areas. - * It trims the list and, if a pool page was fully allocated, puts that page on - * the start of the pool list. - * - * If the savearea being released is the last on a pool page (i.e., all entries - * are released), the page is dequeued from the pool and queued to any other - * found during this scan. Note that this queue is maintained virtually. - * - * When the scan is done, the saveanchor lock is released and the list of - * freed pool pages is returned to our caller. - * - * For latency sake we may want to revisit this code. If we are trimming a - * large number of saveareas, we could be disabled and holding the savearea lock - * for quite a while. It may be that we want to break the trim down into parts. - * Possibly trimming the free list, then individually pushing them into the free pool. 
- * - * This function expects to be called with translation on and a valid stack. - * It uses the standard ABI, ie we destroy r2 and r3-r11, and return the ptr in r3. - */ - .align 5 - .globl EXT(save_trim_free) - -LEXT(save_trim_free) - - subi r1,r1,(FM_ALIGN(16)+FM_SIZE) ; Make space for 4 registers on stack - mflr r9 ; save our return address - stw r28,FM_SIZE+0(r1) ; Save R28 - stw r29,FM_SIZE+4(r1) ; Save R29 - stw r30,FM_SIZE+8(r1) ; Save R30 - stw r31,FM_SIZE+12(r1) ; Save R31 - - bl saveSetup ; turn off translation and interrupts, load many regs - bl savelock ; Go lock up the anchor - - lwz r8,SVadjust(0) ; How many do we need to clear out? - li r3,0 ; Get a 0 - neg. r8,r8 ; Get the actual we need to toss (adjust is neg if too many) - ble- save_trim_free1 ; skip if no trimming needed anymore - bf-- pf64Bitb,saveTrim32 ; handle 32-bit processors - b saveTrim64 ; handle 64-bit processors - -save_trim_free1: ; by the time we were called, no need to trim anymore - stw r3,SVlock(0) ; Quick unlock (no need for sync or to set adjust, nothing changed) - mtlr r9 ; Restore return - -#if FPVECDBG - lis r0,HIGH_ADDR(CutTrace) ; (TEST/DEBUG) - li r2,0x2206 ; (TEST/DEBUG) - oris r0,r0,LOW_ADDR(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) -#endif - addi r1,r1,(FM_ALIGN(16)+FM_SIZE); Pop stack - have not trashed register so no need to reload - b saveRestore ; restore translation and EE, turn SF off, return to our caller - - -/* - * *********************** - * * s a v e T r i m 3 2 * - * *********************** - * - * Handle "save_trim_free" on 32-bit processors. At this point, translation and interrupts - * are off, the savearea anchor is locked, and: - * r8 = #pages to trim (>0) - * r9 = return address - * r10 = per-proc ptr - * r11 = MSR at entry - */ - -saveTrim32: - lwz r7,SVfree+4(0) ; Get the first on the free list - mr r6,r7 ; Save the first one - mr r5,r8 ; Save the number we are trimming - -sttrimming: addic. r5,r5,-1 ; Any left to do? - ble- sttrimmed ; Nope... 
- lwz r7,SAVprev+4(r7) ; Skip to the next one - b sttrimming ; Keep going... - -sttrimmed: lwz r5,SAVprev+4(r7) ; Get the next one (for new head of free list) - lwz r4,SVfreecnt(0) ; Get the free count - stw r5,SVfree+4(0) ; Set new head - sub r4,r4,r8 ; Calculate the new free count - li r31,0 ; Show we have no free pool blocks yet - crclr cr1_eq ; dont exit loop before 1st iteration - stw r4,SVfreecnt(0) ; Set new free count - lis r30,hi16(sac_empty) ; Get what empty looks like - -; NOTE: The savearea size must be 640 (0x280). We are doing a divide by shifts and stuff -; here. -; -#if SAVsize != 640 -#error Savearea size is not 640!!!!!!!!!!!! -#endif - - ; Loop over each savearea we are trimming. - ; r6 = next savearea to trim - ; r7 = last savearea to trim - ; r8 = #pages to trim (>0) - ; r9 = return address - ; r10 = per-proc ptr - ; r11 = MSR at entry - ; r30 = what SACalloc looks like when all saveareas are free - ; r31 = free pool block list - ; cr1 = beq set if we just trimmed the last, ie if we are done - -sttoss: beq+ cr1,stdone ; All done now... - - cmplw cr1,r6,r7 ; Have we finished the loop? - - lis r0,0x0044 ; Get top of table - rlwinm r2,r6,0,0,19 ; Back down to the savearea control stuff - ori r0,r0,0x2200 ; Finish shift table - rlwinm r4,r6,25,27,30 ; Get (addr >> 7) & 0x1E (same as twice high nybble) - lwz r5,SACalloc(r2) ; Get the allocation bits - addi r4,r4,1 ; Shift 1 extra - rlwinm r3,r6,25,31,31 ; Get (addr >> 7) & 1 - rlwnm r0,r0,r4,29,31 ; Get partial index - lis r4,lo16(0x8000) ; Get the bit mask - add r0,r0,r3 ; Make the real index - srw r4,r4,r0 ; Get the allocation mask - or r5,r5,r4 ; Free this entry - cmplw r5,r4 ; Is this the only free entry? - lwz r6,SAVprev+4(r6) ; Chain to the next trimmed savearea - cmplw cr7,r30,r5 ; Does this look empty? - stw r5,SACalloc(r2) ; Save back the allocation bits - beq- stputpool ; First free entry, go put it into the pool... - bne+ cr7,sttoss ; Not an empty block - -; -; We have an empty block. 
Remove it from the pool list. -; - - lwz r29,SACflags(r2) ; Get the flags - cmplwi cr5,r31,0 ; Is this guy on the release list? - lwz r28,SACnext+4(r2) ; Get the forward chain - - rlwinm. r0,r29,0,sac_permb,sac_permb ; Is this a permanently allocated area? (also sets 0 needed below) - bne- sttoss ; This is permanent entry, do not try to release... - - lwz r29,SACprev+4(r2) ; and the previous - beq- cr5,stnot1st ; Not first - lwz r0,SACvrswap+4(r31) ; Load the previous pool page vr conversion - -stnot1st: stw r28,SACnext+4(r29) ; Previous guy points to my next - xor r0,r0,r31 ; Make the last guy virtual - stw r29,SACprev+4(r28) ; Next guy points back to my previous - stw r0,SAVprev+4(r2) ; Store the old top virtual as my back chain - mr r31,r2 ; My physical is now the head of the chain - b sttoss ; Get the next one... - -; -; A pool block that had no free entries now has one. Stick it on the pool list. -; - -stputpool: lwz r28,SVpoolfwd+4(0) ; Get the first guy on the list - li r0,saveanchor ; Point to the saveanchor - stw r2,SVpoolfwd+4(0) ; Put us on the top of the list - stw r28,SACnext+4(r2) ; We point to the old top - stw r2,SACprev+4(r28) ; Old top guy points back to us - stw r0,SACprev+4(r2) ; Our back points to the anchor - b sttoss ; Go on to the next one... - - -/* - * *********************** - * * s a v e T r i m 6 4 * - * *********************** - * - * Handle "save_trim_free" on 64-bit processors. At this point, translation and interrupts - * are off, SF is on, the savearea anchor is locked, and: - * r8 = #pages to trim (>0) - * r9 = return address - * r10 = per-proc ptr - * r11 = MSR at entry - */ - -saveTrim64: - ld r7,SVfree(0) ; Get the first on the free list - mr r6,r7 ; Save the first one - mr r5,r8 ; Save the number we are trimming - -sttrimming64: - addic. r5,r5,-1 ; Any left to do? - ble-- sttrimmed64 ; Nope... - ld r7,SAVprev(r7) ; Skip to the next one - b sttrimming64 ; Keep going... 
- -sttrimmed64: - ld r5,SAVprev(r7) ; Get the next one (for new head of free list) - lwz r4,SVfreecnt(0) ; Get the free count - std r5,SVfree(0) ; Set new head - sub r4,r4,r8 ; Calculate the new free count - li r31,0 ; Show we have no free pool blocks yet - crclr cr1_eq ; dont exit loop before 1st iteration - stw r4,SVfreecnt(0) ; Set new free count - lis r30,hi16(sac_empty) ; Get what empty looks like - - - ; Loop over each savearea we are trimming. - ; r6 = next savearea to trim - ; r7 = last savearea to trim - ; r8 = #pages to trim (>0) - ; r9 = return address - ; r10 = per-proc ptr - ; r11 = MSR at entry - ; r30 = what SACalloc looks like when all saveareas are free - ; r31 = free pool block list - ; cr1 = beq set if we just trimmed the last, ie if we are done - ; - ; WARNING: as in the 32-bit path, this code is doing a divide by 640 (SAVsize). - -sttoss64: - beq++ cr1,stdone ; All done now... - - cmpld cr1,r6,r7 ; Have we finished the loop? - - lis r0,0x0044 ; Get top of table - rldicr r2,r6,0,51 ; r2 <- phys addr of savearea block (with control area) - ori r0,r0,0x2200 ; Finish shift table - rlwinm r4,r6,25,27,30 ; Get (addr >> 7) & 0x1E (same as twice high nybble) - lwz r5,SACalloc(r2) ; Get the allocation bits - addi r4,r4,1 ; Shift 1 extra - rlwinm r3,r6,25,31,31 ; Get (addr >> 7) & 1 - rlwnm r0,r0,r4,29,31 ; Get partial index - lis r4,lo16(0x8000) ; Get the bit mask - add r0,r0,r3 ; Make the real index - srw r4,r4,r0 ; Get the allocation mask - or r5,r5,r4 ; Free this entry - cmplw r5,r4 ; Is this the only free entry? - ld r6,SAVprev(r6) ; Chain to the next trimmed savearea - cmplw cr7,r30,r5 ; Does this look empty? - stw r5,SACalloc(r2) ; Save back the allocation bits - beq-- stputpool64 ; First free entry, go put it into the pool... - bne++ cr7,sttoss64 ; Not an empty block - -; We have an empty block. Remove it from the pool list. - - lwz r29,SACflags(r2) ; Get the flags - cmpldi cr5,r31,0 ; Is this guy on the release list? 
- ld r28,SACnext(r2) ; Get the forward chain - - rlwinm. r0,r29,0,sac_permb,sac_permb ; Is this a permanently allocated area? (also sets 0 needed below) - bne-- sttoss64 ; This is permanent entry, do not try to release... - - ld r29,SACprev(r2) ; and the previous - beq-- cr5,stnot1st64 ; Not first - ld r0,SACvrswap(r31) ; Load the previous pool page vr conversion - -stnot1st64: - std r28,SACnext(r29) ; Previous guy points to my next - xor r0,r0,r31 ; Make the last guy virtual - std r29,SACprev(r28) ; Next guy points back to my previous - std r0,SAVprev(r2) ; Store the old top virtual as my back chain - mr r31,r2 ; My physical is now the head of the chain - b sttoss64 ; Get the next one... - -; A pool block that had no free entries now has one. Stick it on the pool list. - -stputpool64: - ld r28,SVpoolfwd(0) ; Get the first guy on the list - li r0,saveanchor ; Point to the saveanchor - std r2,SVpoolfwd(0) ; Put us on the top of the list - std r28,SACnext(r2) ; We point to the old top - std r2,SACprev(r28) ; Old top guy points back to us - std r0,SACprev(r2) ; Our back points to the anchor - b sttoss64 ; Go on to the next one... - - -; We are all done. Relocate pool release head, restore all, and go. This code -; is used both by the 32 and 64-bit paths. -; r9 = return address -; r10 = per-proc ptr -; r11 = MSR at entry -; r31 = free pool block list - -stdone: bl saveunlock ; Unlock the saveanchor and set adjust field - - mr. r3,r31 ; Move release chain and see if there are any - li r5,0 ; Assume either V=R or no release chain - beq- stnorel ; Nothing to release... 
- lwz r5,SACvrswap+4(r31) ; Get the vr conversion (only need low half if 64-bit) - -stnorel: - bl saveRestore ; restore translation and exceptions, turn off SF - mtlr r9 ; Restore the return - - lwz r28,FM_SIZE+0(r1) ; Restore R28 - lwz r29,FM_SIZE+4(r1) ; Restore R29 - lwz r30,FM_SIZE+8(r1) ; Restore R30 - lwz r31,FM_SIZE+12(r1) ; Restore R31 - addi r1,r1,(FM_ALIGN(16)+FM_SIZE) ; Pop the stack - xor r3,r3,r5 ; Convert release chain address to virtual - rlwinm r3,r3,0,0,31 ; if 64-bit, clear upper half of virtual address - -#if FPVECDBG - lis r0,HIGH_ADDR(CutTrace) ; (TEST/DEBUG) - li r2,0x2207 ; (TEST/DEBUG) - oris r0,r0,LOW_ADDR(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) -#endif - blr ; Return... - - -/* - * *************************** - * * s a v e _ r e c o v e r * - * *************************** - * - * int save_recover(void); - * - * Returns nonzero if we can get enough saveareas to hit the target. We scan the free - * pool. If we empty a pool block, we remove it from the pool list. - */ - - .align 5 - .globl EXT(save_recover) - -LEXT(save_recover) - mflr r9 ; save return address - bl saveSetup ; turn translation and interrupts off, SF on, load many regs - bl savelock ; lock the savearea anchor - - lwz r8,SVadjust(0) ; How many do we need to clear get? - li r3,0 ; Get a 0 - mr. r8,r8 ; Do we need any? 
- ble-- save_recover1 ; not any more - bf-- pf64Bitb,saveRecover32 ; handle 32-bit processor - b saveRecover64 ; handle 64-bit processor - -save_recover1: ; by the time we locked the anchor, no longer short - mtlr r9 ; Restore return - stw r3,SVlock(0) ; Quick unlock (no need for sync or to set adjust, nothing changed) -#if FPVECDBG - lis r0,HIGH_ADDR(CutTrace) ; (TEST/DEBUG) - li r2,0x2208 ; (TEST/DEBUG) - oris r0,r0,LOW_ADDR(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) -#endif - b saveRestore ; turn translation etc back on, return to our caller - - -/* - * ***************************** - * * s a v e R e c o v e r 3 2 * - * ***************************** - * - * Handle "save_recover" on 32-bit processors. At this point, translation and interrupts - * are off, the savearea anchor is locked, and: - * r8 = #pages to recover - * r9 = return address - * r10 = per-proc ptr - * r11 = MSR at entry - */ - -saveRecover32: - li r6,saveanchor ; Start at pool anchor - crclr cr1_eq ; initialize the loop test - lwz r7,SVfreecnt(0) ; Get the current free count - - -; Loop over next block in free pool. r6 is the ptr to the last block we looked at. - -srcnpool: lwz r6,SACnext+4(r6) ; Point to the next one - cmplwi r6,saveanchor ; Have we wrapped? - beq- srcdone ; Yes, did not have enough... - - lwz r5,SACalloc(r6) ; Pick up the allocation for this pool block - -; -; NOTE: The savearea size must be 640 (0x280). We are doing a multiply by shifts and add. -; offset = (index << 9) + (index << 7) -; -#if SAVsize != 640 -#error Savearea size is not 640!!!!!!!!!!!! -#endif - -; Loop over free savearea in current block. -; r5 = bitmap of free saveareas in block at r6 (ie, SACalloc) -; r6 = ptr to current free pool block -; r7 = free count -; r8 = #pages more we still need to recover -; r9 = return address -; r10 = per-proc ptr -; r11 = MSR at entry -; cr1 = beq if (r8==0) - -srcnext: beq- cr1,srcdone ; We have no more to get... 
- - lis r3,0x8000 ; Get the top bit on - cntlzw r4,r5 ; Find a free slot - addi r7,r7,1 ; Bump up the free count - srw r3,r3,r4 ; Make a mask - slwi r0,r4,7 ; First multiply by 128 - subi r8,r8,1 ; Decrement the need count - slwi r2,r4,9 ; Then multiply by 512 - andc. r5,r5,r3 ; Clear out the "free" bit - add r2,r2,r0 ; Sum to multiply by 640 - - stw r5,SACalloc(r6) ; Set new allocation bits - - add r2,r2,r6 ; Get the actual address of the savearea - lwz r3,SVfree+4(0) ; Get the head of the chain - cmplwi cr1,r8,0 ; Do we actually need any more? - stw r2,SVfree+4(0) ; Push ourselves in the front - stw r3,SAVprev+4(r2) ; Chain the rest of the list behind - - bne+ srcnext ; The pool block is not empty yet, try for another... - - lwz r2,SACnext+4(r6) ; Get the next pointer - lwz r3,SACprev+4(r6) ; Get the previous pointer - stw r3,SACprev+4(r2) ; The previous of my next points to my previous - stw r2,SACnext+4(r3) ; The next of my previous points to my next - bne+ cr1,srcnpool ; We still have more to do... - - -; Join here from 64-bit path when we have recovered all the saveareas we need to. - -srcdone: stw r7,SVfreecnt(0) ; Set the new free count - bl saveunlock ; Unlock the save and set adjust field - - mtlr r9 ; Restore the return -#if FPVECDBG - lis r0,HIGH_ADDR(CutTrace) ; (TEST/DEBUG) - li r2,0x2209 ; (TEST/DEBUG) - oris r0,r0,LOW_ADDR(CutTrace) ; (TEST/DEBUG) - sc ; (TEST/DEBUG) -#endif - b saveRestore ; turn xlate and EE back on, SF off, and return to our caller - - -/* - * ***************************** - * * s a v e R e c o v e r 6 4 * - * ***************************** - * - * Handle "save_recover" on 64-bit processors. 
At this point, translation and interrupts - * are off, the savearea anchor is locked, and: - * r8 = #pages to recover - * r9 = return address - * r10 = per-proc ptr - * r11 = MSR at entry - */ - -saveRecover64: - li r6,saveanchor ; Start at pool anchor - crclr cr1_eq ; initialize the loop test - lwz r7,SVfreecnt(0) ; Get the current free count - - -; Loop over next block in free pool. r6 is the ptr to the last block we looked at. - -srcnpool64: - ld r6,SACnext(r6) ; Point to the next one - cmpldi r6,saveanchor ; Have we wrapped? - beq-- srcdone ; Yes, did not have enough... - - lwz r5,SACalloc(r6) ; Pick up the allocation for this pool block - - -; Loop over free savearea in current block. -; r5 = bitmap of free saveareas in block at r6 (ie, SACalloc) -; r6 = ptr to current free pool block -; r7 = free count -; r8 = #pages more we still need to recover -; r9 = return address -; r10 = per-proc ptr -; r11 = MSR at entry -; cr1 = beq if (r8==0) -; -; WARNING: as in the 32-bit path, we depend on (SAVsize==640) - -srcnext64: - beq-- cr1,srcdone ; We have no more to get... - - lis r3,0x8000 ; Get the top bit on - cntlzw r4,r5 ; Find a free slot - addi r7,r7,1 ; Bump up the free count - srw r3,r3,r4 ; Make a mask - slwi r0,r4,7 ; First multiply by 128 - subi r8,r8,1 ; Decrement the need count - slwi r2,r4,9 ; Then multiply by 512 - andc. r5,r5,r3 ; Clear out the "free" bit - add r2,r2,r0 ; Sum to multiply by 640 - - stw r5,SACalloc(r6) ; Set new allocation bits - - add r2,r2,r6 ; Get the actual address of the savearea - ld r3,SVfree(0) ; Get the head of the chain - cmplwi cr1,r8,0 ; Do we actually need any more? - std r2,SVfree(0) ; Push ourselves in the front - std r3,SAVprev(r2) ; Chain the rest of the list behind - - bne++ srcnext64 ; The pool block is not empty yet, try for another... 
- - ld r2,SACnext(r6) ; Get the next pointer - ld r3,SACprev(r6) ; Get the previous pointer - std r3,SACprev(r2) ; The previous of my next points to my previous - std r2,SACnext(r3) ; The next of my previous points to my next - bne++ cr1,srcnpool64 ; We still have more to do... - - b srcdone - - -/* - * ******************* - * * s a v e l o c k * - * ******************* - * - * Lock the savearea anchor, so we can manipulate the free list. - * msr = interrupts and translation off - * We destroy: - * r8, r3, r12 - */ - .align 5 - -savelock: lwz r8,SVlock(0) ; See if lock is held - cmpwi r8,0 - li r12,saveanchor ; Point to the saveanchor - bne-- savelock ; loop until lock released... - -savelock0: lwarx r8,0,r12 ; Grab the lock value - cmpwi r8,0 ; taken? - li r8,1 ; get nonzero to lock it with - bne-- savelock1 ; already locked, wait for it to clear... - stwcx. r8,0,r12 ; Try to seize that there durn lock - isync ; assume we got it - beqlr++ ; reservation not lost, so we have the lock - b savelock0 ; Try again... - -savelock1: li r8,lgKillResv ; Point to killing field - stwcx. r8,0,r8 ; Kill reservation - b savelock ; Start over.... - - -/* - * *********************** - * * s a v e u n l o c k * - * *********************** - * - * - * This is the common routine that sets the saveadjust field and unlocks the savearea - * anchor. - * msr = interrupts and translation off - * We destroy: - * r2, r5, r6, r8. - */ - .align 5 -saveunlock: - lwz r6,SVfreecnt(0) ; and the number on the free list - lwz r5,SVinuse(0) ; Pick up the in use count - subic. r8,r6,FreeListMin ; do we have at least the minimum? 
- lwz r2,SVtarget(0) ; Get the target - neg r8,r8 ; assuming we are short, get r8 <- shortfall - blt-- saveunlock1 ; skip if fewer than minimum on free list - - add r6,r6,r5 ; Get the total number of saveareas - addi r5,r2,-SaveLowHysteresis ; Find low end of acceptible range - sub r5,r6,r5 ; Make everything below hysteresis negative - sub r2,r2,r6 ; Get the distance from the target - addi r5,r5,-(SaveLowHysteresis + SaveHighHysteresis + 1) ; Subtract full hysteresis range - srawi r5,r5,31 ; Get 0xFFFFFFFF if outside range or 0 if inside - and r8,r2,r5 ; r8 <- 0 if in range or distance to target if not - -saveunlock1: - li r5,0 ; Set a clear value - stw r8,SVadjust(0) ; Set the adjustment value - eieio ; Make sure everything is done - stw r5,SVlock(0) ; Unlock the savearea chain - blr - - -/* - * ******************* - * * s a v e _ c p v * - * ******************* - * - * struct savearea *save_cpv(addr64_t saveAreaPhysAddr); - * - * Converts a physical savearea address to virtual. Called with translation on - * and in 32-bit mode. Note that the argument is passed as a long long in (r3,r4). - */ - - .align 5 - .globl EXT(save_cpv) - -LEXT(save_cpv) - mflr r9 ; save return address - mr r8,r3 ; save upper half of phys address here - bl saveSetup ; turn off translation and interrupts, turn SF on - rlwinm r5,r4,0,0,19 ; Round back to the start of the physical savearea block - bf-- pf64Bitb,save_cpv1 ; skip if 32-bit processor - rldimi r5,r8,32,0 ; r5 <- 64-bit phys address of block -save_cpv1: - lwz r6,SACvrswap+4(r5) ; Get the conversion to virtual (only need low half if 64-bit) - mtlr r9 ; restore return address - xor r3,r4,r6 ; convert phys to virtual - rlwinm r3,r3,0,0,31 ; if 64-bit, zero upper half of virtual address - b saveRestore ; turn translation etc back on, SF off, and return r3 - - -/* - * ********************* - * * s a v e S e t u p * - * ********************* - * - * This routine is called at the start of all the save-area subroutines. 
- * It turns off translation, disabled interrupts, turns on 64-bit mode, - * and sets up cr6 with the feature flags (especially pf64Bit). - * - * Note that most save-area routines cannot take _any_ interrupt (such as a - * PTE miss) once the savearea anchor is locked, since that would result in - * instant deadlock as we need a save-area to process any exception. - * We set up: - * r10 = per-proc ptr - * r11 = old MSR - * cr5 = pfNoMSRir feature flag - * cr6 = pf64Bit feature flag - * - * We use r0, r3, r10, and r11. - */ - -saveSetup: - mfmsr r11 ; get msr - mfsprg r3,2 ; get feature flags - li r0,0 - mtcrf 0x2,r3 ; copy pf64Bit to cr6 - ori r0,r0,lo16(MASK(MSR_IR)+MASK(MSR_DR)+MASK(MSR_EE)) - mtcrf 0x4,r3 ; copy pfNoMSRir to cr5 - andc r3,r11,r0 ; turn off IR, DR, and EE - li r0,1 ; get a 1 in case its a 64-bit machine - bf-- pf64Bitb,saveSetup1 ; skip if not a 64-bit machine - rldimi r3,r0,63,MSR_SF_BIT ; turn SF (bit 0) on - mtmsrd r3 ; turn translation and interrupts off, 64-bit mode on - isync ; wait for it to happen - mfsprg r10,0 ; get per-proc ptr - blr -saveSetup1: ; here on 32-bit machines - bt- pfNoMSRirb,saveSetup2 ; skip if cannot turn off IR with a mtmsr - mtmsr r3 ; turn translation and interrupts off - isync ; wait for it to happen - mfsprg r10,0 ; get per-proc ptr - blr -saveSetup2: ; here if pfNoMSRir set for this machine - li r0,loadMSR ; we will "mtmsr r3" via system call - sc - mfsprg r10,0 ; get per-proc ptr - blr - - -/* - * ************************* - * * s a v e R e s t o r e * - * ************************* - * - * Undoes the effect of calling "saveSetup", ie it turns relocation and interrupts back on, - * and turns 64-bit mode back off. 
- * r11 = old MSR - * cr6 = pf64Bit feature flag - */ - -saveRestore: - bt++ pf64Bitb,saveRestore64 ; handle a 64-bit processor -saveRestore32: - mtmsr r11 ; restore MSR - isync ; wait for translation to start up - blr -saveRestore64: ; 64-bit processor - mtmsrd r11 ; restore MSR - isync ; wait for changes to happen - blr - diff --git a/osfmk/ppc/scc_8530.h b/osfmk/ppc/scc_8530.h deleted file mode 100644 index 2fcdfeb80..000000000 --- a/osfmk/ppc/scc_8530.h +++ /dev/null @@ -1,428 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * @APPLE_FREE_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * File: scc_8530.h - * Author: Alessandro Forin, Carnegie Mellon University - * Date: 6/91 - * - * Definitions for the Zilog Z8530 SCC serial line chip - */ - -#ifndef _SCC_8530_H_ -#define _SCC_8530_H_ - -/* - * Register map, needs definition of the alignment - * used on the specific machine. - * #define the 'scc_register_t' data type before - * including this header file. For restrictions on - * access modes define the set/get_datum macros. - * We provide defaults ifnot. 
- */ - - -#define SCC_CHANNEL_A 1 -#define SCC_CHANNEL_B 0 - -#define SCC_MODEM SCC_CHANNEL_A -#define SCC_PRINTER SCC_CHANNEL_B - -#define SCC_DATA_OFFSET 4 - -typedef unsigned char *scc_regmap_t; - -extern void powermac_scc_set_datum(scc_regmap_t regs, unsigned int offset, unsigned char value); -extern unsigned char powermac_scc_get_datum(scc_regmap_t regs, unsigned int offset); - -#define scc_set_datum(regs, d, v) powermac_scc_set_datum(regs, (d), (v)) -#define scc_get_datum(regs, d,v) (v) = powermac_scc_get_datum(regs, (d)); - -#define scc_init_reg(regs,chan) { \ - char tmp; \ - scc_get_datum(regs, ((chan)<<1),tmp); \ - scc_get_datum(regs, ((chan)<<1),tmp); \ - } - -#define scc_read_reg(regs,chan,reg,val) { \ - scc_set_datum(regs, ((chan)<<1),reg); \ - scc_get_datum(regs, ((chan)<<1),val); \ - } - -#define scc_read_reg_zero(regs,chan,val) { \ - scc_get_datum(regs, ((chan)<<1),val); \ - } - -#define scc_write_reg(regs,chan,reg,val) { \ - scc_set_datum(regs, ((chan)<<1),reg); \ - scc_set_datum(regs, ((chan)<<1),val); \ - } - -#define scc_write_reg_zero(regs,chan,val) { \ - scc_set_datum(regs, ((chan)<<1),val); \ - } - -#define scc_read_data(regs,chan,val) { \ - scc_get_datum(regs, ((chan)<<1)+SCC_DATA_OFFSET,val); \ - } - -#define scc_write_data(regs,chan,val) { \ - scc_set_datum(regs, ((chan)<<1)+SCC_DATA_OFFSET,val); \ - } - - -/* - * Addressable registers - */ - -#define SCC_RR0 0 /* status register */ -#define SCC_RR1 1 /* special receive conditions */ -#define SCC_RR2 2 /* (modified) interrupt vector */ -#define SCC_RR3 3 /* interrupts pending (cha A only) */ -#define SCC_RR8 8 /* recv buffer (alias for data) */ -#define SCC_RR10 10 /* sdlc status */ -#define SCC_RR12 12 /* BRG constant, low part */ -#define SCC_RR13 13 /* BRG constant, high part */ -#define SCC_RR15 15 /* interrupts currently enabled */ - -#define SCC_WR0 0 /* reg select, and commands */ -#define SCC_WR1 1 /* interrupt and DMA enables */ -#define SCC_WR2 2 /* interrupt vector */ -#define 
SCC_WR3 3 /* receiver params and enables */ -#define SCC_WR4 4 /* clock/char/parity params */ -#define SCC_WR5 5 /* xmit params and enables */ -#define SCC_WR6 6 /* synchr SYNCH/address */ -#define SCC_WR7 7 /* synchr SYNCH/flag */ -#define SCC_WR8 8 /* xmit buffer (alias for data) */ -#define SCC_WR9 9 /* vectoring and resets */ -#define SCC_WR10 10 /* synchr params */ -#define SCC_WR11 11 /* clocking definitions */ -#define SCC_WR12 12 /* BRG constant, low part */ -#define SCC_WR13 13 /* BRG constant, high part */ -#define SCC_WR14 14 /* BRG enables and commands */ -#define SCC_WR15 15 /* interrupt enables */ - -/* - * Read registers defines - */ - -#define SCC_RR0_BREAK 0x80 /* break detected (rings twice), or */ -#define SCC_RR0_ABORT 0x80 /* abort (synchr) */ -#define SCC_RR0_TX_UNDERRUN 0x40 /* xmit buffer empty/end of message */ -#define SCC_RR0_CTS 0x20 /* clear-to-send pin active (sampled - only on intr and after RESI cmd */ -#define SCC_RR0_SYNCH 0x10 /* SYNCH found/still hunting */ -#define SCC_RR0_DCD 0x08 /* carrier-detect (same as CTS) */ -#define SCC_RR0_TX_EMPTY 0x04 /* xmit buffer empty */ -#define SCC_RR0_ZERO_COUNT 0x02 /* ? */ -#define SCC_RR0_RX_AVAIL 0x01 /* recv fifo not empty */ - -#define SCC_RR1_EOF 0x80 /* end-of-frame, SDLC mode */ -#define SCC_RR1_CRC_ERR 0x40 /* incorrect CRC or.. 
*/ -#define SCC_RR1_FRAME_ERR 0x40 /* ..bad frame */ -#define SCC_RR1_RX_OVERRUN 0x20 /* rcv fifo overflow */ -#define SCC_RR1_PARITY_ERR 0x10 /* incorrect parity in data */ -#define SCC_RR1_RESIDUE0 0x08 -#define SCC_RR1_RESIDUE1 0x04 -#define SCC_RR1_RESIDUE2 0x02 -#define SCC_RR1_ALL_SENT 0x01 - -/* RR2 contains the interrupt vector unmodified (channel A) or - modified as follows (channel B, if vector-include-status) */ - -#define SCC_RR2_STATUS(val) ((val)&0xe) /* 11/7/95 used to be 0xf */ - -#define SCC_RR2_B_XMIT_DONE 0x0 -#define SCC_RR2_B_EXT_STATUS 0x2 -#define SCC_RR2_B_RECV_DONE 0x4 -#define SCC_RR2_B_RECV_SPECIAL 0x6 -#define SCC_RR2_A_XMIT_DONE 0x8 -#define SCC_RR2_A_EXT_STATUS 0xa -#define SCC_RR2_A_RECV_DONE 0xc -#define SCC_RR2_A_RECV_SPECIAL 0xe - -/* Interrupts pending, to be read from channel A only (B raz) */ -#define SCC_RR3_zero 0xc0 -#define SCC_RR3_RX_IP_A 0x20 -#define SCC_RR3_TX_IP_A 0x10 -#define SCC_RR3_EXT_IP_A 0x08 -#define SCC_RR3_RX_IP_B 0x04 -#define SCC_RR3_TX_IP_B 0x02 -#define SCC_RR3_EXT_IP_B 0x01 - -/* RR8 is the receive data buffer, a 3 deep FIFO */ -#define SCC_RECV_BUFFER SCC_RR8 -#define SCC_RECV_FIFO_DEEP 3 - -#define SCC_RR10_1CLKS 0x80 -#define SCC_RR10_2CLKS 0x40 -#define SCC_RR10_zero 0x2d -#define SCC_RR10_LOOP_SND 0x10 -#define SCC_RR10_ON_LOOP 0x02 - -/* RR12/RR13 hold the timing base, upper byte in RR13 */ - -#define scc_get_timing_base(scc,chan,val) { \ - register char tmp; \ - scc_read_reg(scc,chan,SCC_RR12,val);\ - scc_read_reg(scc,chan,SCC_RR13,tmp);\ - (val) = ((val)<<8)|(tmp&0xff);\ - } - -#define SCC_RR15_BREAK_IE 0x80 -#define SCC_RR15_TX_UNDERRUN_IE 0x40 -#define SCC_RR15_CTS_IE 0x20 -#define SCC_RR15_SYNCH_IE 0x10 -#define SCC_RR15_DCD_IE 0x08 -#define SCC_RR15_zero 0x05 -#define SCC_RR15_ZERO_COUNT_IE 0x02 - - -/* - * Write registers defines - */ - -/* WR0 is used for commands too */ -#define SCC_RESET_TXURUN_LATCH 0xc0 -#define SCC_RESET_TX_CRC 0x80 -#define SCC_RESET_RX_CRC 0x40 -#define 
SCC_RESET_HIGHEST_IUS 0x38 /* channel A only */ -#define SCC_RESET_ERROR 0x30 -#define SCC_RESET_TX_IP 0x28 -#define SCC_IE_NEXT_CHAR 0x20 -#define SCC_SEND_SDLC_ABORT 0x18 -#define SCC_RESET_EXT_IP 0x10 - -#define SCC_WR1_DMA_ENABLE 0x80 /* dma control */ -#define SCC_WR1_DMA_MODE 0x40 /* drive ~req for DMA controller */ -#define SCC_WR1_DMA_RECV_DATA 0x20 /* from wire to host memory */ - /* interrupt enable/conditions */ -#define SCC_WR1_RXI_SPECIAL_O 0x18 /* on special only */ -#define SCC_WR1_RXI_ALL_CHAR 0x10 /* on each char, or special */ -#define SCC_WR1_RXI_FIRST_CHAR 0x08 /* on first char, or special */ -#define SCC_WR1_RXI_DISABLE 0x00 /* never on recv */ -#define SCC_WR1_PARITY_IE 0x04 /* on parity errors */ -#define SCC_WR1_TX_IE 0x02 -#define SCC_WR1_EXT_IE 0x01 - -/* WR2 is common and contains the interrupt vector (high nibble) */ - -#define SCC_WR3_RX_8_BITS 0xc0 -#define SCC_WR3_RX_6_BITS 0x80 -#define SCC_WR3_RX_7_BITS 0x40 -#define SCC_WR3_RX_5_BITS 0x00 -#define SCC_WR3_AUTO_ENABLE 0x20 -#define SCC_WR3_HUNT_MODE 0x10 -#define SCC_WR3_RX_CRC_ENABLE 0x08 -#define SCC_WR3_SDLC_SRCH 0x04 -#define SCC_WR3_INHIBIT_SYNCH 0x02 -#define SCC_WR3_RX_ENABLE 0x01 - -/* Should be re-written after reset */ -#define SCC_WR4_CLK_x64 0xc0 /* clock divide factor */ -#define SCC_WR4_CLK_x32 0x80 -#define SCC_WR4_CLK_x16 0x40 -#define SCC_WR4_CLK_x1 0x00 -#define SCC_WR4_EXT_SYNCH_MODE 0x30 /* synch modes */ -#define SCC_WR4_SDLC_MODE 0x20 -#define SCC_WR4_16BIT_SYNCH 0x10 -#define SCC_WR4_8BIT_SYNCH 0x00 -#define SCC_WR4_2_STOP 0x0c /* asynch modes */ -#define SCC_WR4_1_5_STOP 0x08 -#define SCC_WR4_1_STOP 0x04 -#define SCC_WR4_SYNCH_MODE 0x00 -#define SCC_WR4_EVEN_PARITY 0x02 -#define SCC_WR4_PARITY_ENABLE 0x01 - -#define SCC_WR5_DTR 0x80 /* drive DTR pin */ -#define SCC_WR5_TX_8_BITS 0x60 -#define SCC_WR5_TX_6_BITS 0x40 -#define SCC_WR5_TX_7_BITS 0x20 -#define SCC_WR5_TX_5_BITS 0x00 -#define SCC_WR5_SEND_BREAK 0x10 -#define SCC_WR5_TX_ENABLE 0x08 -#define 
SCC_WR5_CRC_16 0x04 /* CRC if non zero, .. */ -#define SCC_WR5_SDLC 0x00 /* ..SDLC otherwise */ -#define SCC_WR5_RTS 0x02 /* drive RTS pin */ -#define SCC_WR5_TX_CRC_ENABLE 0x01 - -/* Registers WR6 and WR7 are for synch modes data, with among other things: */ - -#define SCC_WR6_BISYNCH_12 0x0f -#define SCC_WR6_SDLC_RANGE_MASK 0x0f -#define SCC_WR7_SDLC_FLAG 0x7e - -/* Register WR7' (prime) controls some ESCC features */ -#define SCC_WR7P_RX_FIFO 0x08 /* Enable interrupt on FIFO 1/2 full */ - -/* WR8 is the transmit data buffer (no FIFO) */ -#define SCC_XMT_BUFFER SCC_WR8 - -#define SCC_WR9_HW_RESET 0xc0 /* force hardware reset */ -#define SCC_WR9_RESET_CHA_A 0x80 -#define SCC_WR9_RESET_CHA_B 0x40 -#define SCC_WR9_NON_VECTORED 0x20 /* mbz for Zilog chip */ -#define SCC_WR9_STATUS_HIGH 0x10 -#define SCC_WR9_MASTER_IE 0x08 -#define SCC_WR9_DLC 0x04 /* disable-lower-chain */ -#define SCC_WR9_NV 0x02 /* no vector */ -#define SCC_WR9_VIS 0x01 /* vector-includes-status */ - -#define SCC_WR10_CRC_PRESET 0x80 -#define SCC_WR10_FM0 0x60 -#define SCC_WR10_FM1 0x40 -#define SCC_WR10_NRZI 0x20 -#define SCC_WR10_NRZ 0x00 -#define SCC_WR10_ACTIVE_ON_POLL 0x10 -#define SCC_WR10_MARK_IDLE 0x08 /* flag if zero */ -#define SCC_WR10_ABORT_ON_URUN 0x04 /* flag if zero */ -#define SCC_WR10_LOOP_MODE 0x02 -#define SCC_WR10_6BIT_SYNCH 0x01 -#define SCC_WR10_8BIT_SYNCH 0x00 - -#define SCC_WR11_RTxC_XTAL 0x80 /* RTxC pin is input (ext oscill) */ -#define SCC_WR11_RCLK_DPLL 0x60 /* clock received data on dpll */ -#define SCC_WR11_RCLK_BAUDR 0x40 /* .. on BRG */ -#define SCC_WR11_RCLK_TRc_PIN 0x20 /* .. on TRxC pin */ -#define SCC_WR11_RCLK_RTc_PIN 0x00 /* .. on RTxC pin */ -#define SCC_WR11_XTLK_DPLL 0x18 -#define SCC_WR11_XTLK_BAUDR 0x10 -#define SCC_WR11_XTLK_TRc_PIN 0x08 -#define SCC_WR11_XTLK_RTc_PIN 0x00 -#define SCC_WR11_TRc_OUT 0x04 /* drive TRxC pin as output from..*/ -#define SCC_WR11_TRcOUT_DPLL 0x03 /* .. the dpll */ -#define SCC_WR11_TRcOUT_BAUDR 0x02 /* .. 
the BRG */ -#define SCC_WR11_TRcOUT_XMTCLK 0x01 /* .. the xmit clock */ -#define SCC_WR11_TRcOUT_XTAL 0x00 /* .. the external oscillator */ - -/* WR12/WR13 are for timing base preset */ -#define scc_set_timing_base(scc,chan,val) { \ - scc_write_reg(scc,chan,SCC_RR12,val);\ - scc_write_reg(scc,chan,SCC_RR13,(val)>>8);\ - } - -/* More commands in this register */ -#define SCC_WR14_NRZI_MODE 0xe0 /* synch modulations */ -#define SCC_WR14_FM_MODE 0xc0 -#define SCC_WR14_RTc_SOURCE 0xa0 /* clock is from pin .. */ -#define SCC_WR14_BAUDR_SOURCE 0x80 /* .. or internal BRG */ -#define SCC_WR14_DISABLE_DPLL 0x60 -#define SCC_WR14_RESET_CLKMISS 0x40 -#define SCC_WR14_SEARCH_MODE 0x20 -/* ..and more bitsy */ -#define SCC_WR14_LOCAL_LOOPB 0x10 -#define SCC_WR14_AUTO_ECHO 0x08 -#define SCC_WR14_DTR_REQUEST 0x04 -#define SCC_WR14_BAUDR_SRC 0x02 -#define SCC_WR14_BAUDR_ENABLE 0x01 - -#define SCC_WR15_BREAK_IE 0x80 -#define SCC_WR15_TX_UNDERRUN_IE 0x40 -#define SCC_WR15_CTS_IE 0x20 -#define SCC_WR15_SYNCHUNT_IE 0x10 -#define SCC_WR15_DCD_IE 0x08 -#define SCC_WR15_zero 0x05 -#define SCC_WR15_ZERO_COUNT_IE 0x02 -#define SCC_WR15_ENABLE_ESCC 0x01 /* Enable some ESCC registers */ - -#define NSCC_LINE 2 /* How many lines are support per 8530 */ -/* - * Driver status - */ - -#define SCC_FLAGS_DMA_PAUSED 0x00001 /* DMA has been paused because of XON/XOFF */ -#define SCC_FLAGS_DMA_TX_BUSY 0x00002 /* On going DMA operation.. */ - -struct scc_softreg { - unsigned char wr1; - unsigned char wr4; - unsigned char wr5; - unsigned char wr14; - - unsigned long speed; - unsigned long flags; - unsigned long dma_flags; -}; - - -struct scc_softc { - scc_regmap_t regs; - struct scc_dma_ops *dma_ops; - - /* software copy of some write regs, for reg |= */ - struct scc_softreg softr[NSCC_LINE]; - - int flags; - int modem[NSCC_LINE]; /* Mach modem bits (TM_DTR etc). 
*/ - int dcd_timer[NSCC_LINE]; - int dma_initted; - - char polling_mode; - char probed_once; - - boolean_t full_modem; -}; - -#define DCD_TIMEOUT 4 - -typedef struct scc_softc *scc_softc_t; -extern struct scc_softc scc_softc[]; - -#endif /*_SCC_8530_H_*/ diff --git a/osfmk/ppc/sched_param.h b/osfmk/ppc/sched_param.h deleted file mode 100644 index eefe3303f..000000000 --- a/osfmk/ppc/sched_param.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991 Carnegie Mellon University - * All Rights Reserved. 
- * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ - -/* - */ - -/* - * Scheduler parameters. - */ - -#ifndef _PPC_SCHED_PARAM_H_ -#define _PPC_SCHED_PARAM_H_ - -#include -#include - -#endif /* _PPC_SCHED_PARAM_H_ */ diff --git a/osfmk/ppc/screen_switch.h b/osfmk/ppc/screen_switch.h deleted file mode 100644 index 956d1ac84..000000000 --- a/osfmk/ppc/screen_switch.h +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. 
- * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * File: screen_switch.h - * Author: Alessandro Forin, Carnegie Mellon University - * Date: 10/90 - * - * Definitions of things that must be tailored to - * specific hardware boards for the Generic Screen Driver. 
- */ - -#ifndef SCREEN_SWITCH_H -#define SCREEN_SWITCH_H 1 - -#include - -/* - * List of probe routines, scanned at cold-boot time - * to see which, if any, graphic display is available. - * This is done before autoconf, so that printing on - * the console works early on. The alloc routine is - * called only on the first device that answers. - * Ditto for the setup routine, called later on. - */ -struct screen_probe_vector { - int (*probe)(void); - unsigned int (*alloc)(void); - int (*setup)(int, user_info_t); -}; - -/* - * Low-level operations on the graphic device, used - * by the otherwise device-independent interface code - */ - -/* Forward declaration of screen_softc_t */ -typedef struct screen_softc *screen_softc_t; - -struct screen_switch { - int (*graphic_open)(void); /* when X11 opens */ - int (*graphic_close)(screen_softc_t); /* .. or closes */ - int (*set_status)(screen_softc_t, - dev_flavor_t, - dev_status_t, - natural_t); /* dev-specific ops */ - int (*get_status)(screen_softc_t, - dev_flavor_t, - dev_status_t, - natural_t*); /* dev-specific ops */ - int (*char_paint)(screen_softc_t, - int, - int, - int); /* blitc */ - int (*pos_cursor)(void*, - int, - int); /* cursor positioning*/ - int (*insert_line)(screen_softc_t, - short); /* ..and scroll down */ - int (*remove_line)(screen_softc_t, - short); /* ..and scroll up */ - int (*clear_bitmap)(screen_softc_t); /* blank screen */ - int (*video_on)(void*, - user_info_t*); /* screen saver */ - int (*video_off)(void*, - user_info_t*); - int (*intr_enable)(void*, - boolean_t); - int (*map_page)(screen_softc_t, - vm_offset_t, - int); /* user-space mapping*/ -}; - -/* - * Each graphic device needs page-aligned memory - * to be mapped in user space later (for events - * and such). Size and content of this memory - * is unfortunately device-dependent, even if - * it did not need to (puns). 
- */ -extern char *screen_data; - -extern struct screen_probe_vector screen_probe_vector[]; - -extern int screen_noop(void), screen_find(void); - -#endif /* SCREEN_SWITCH_H */ diff --git a/osfmk/ppc/serial_defs.h b/osfmk/ppc/serial_defs.h deleted file mode 100644 index e18994c35..000000000 --- a/osfmk/ppc/serial_defs.h +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * @APPLE_FREE_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989 Carnegie Mellon University - * All Rights Reserved. 
- * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * File: serial_defs.h - * Author: Alessandro Forin, Carnegie Mellon University - * Date: 7/91 - * - * Generic console driver for serial-line based consoles. - */ - -#ifndef _PPC_SERIAL_DEFS_ -#define _PPC_SERIAL_DEFS_ - -#include -/* - * Common defs - */ - - -#define CONS_ERR_PARITY 0x1000 -#define CONS_ERR_BREAK 0x2000 -#define CONS_ERR_OVERRUN 0x4000 - - -#endif /* _PPC_SERIAL_DEFS_ */ diff --git a/osfmk/ppc/serial_io.c b/osfmk/ppc/serial_io.c deleted file mode 100644 index 2f03aa110..000000000 --- a/osfmk/ppc/serial_io.c +++ /dev/null @@ -1,659 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * @APPLE_FREE_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
- * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * File: scc_8530_hdw.c - * Author: Alessandro Forin, Carnegie Mellon University - * Date: 6/91 - * - * Hardware-level operations for the SCC Serial Line Driver - */ - -#define NSCC 1 /* Number of serial chips, two ports per chip. */ -#if NSCC > 0 - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if MACH_KDB -#include -#endif /* MACH_KDB */ - -#define kdebug_state() (1) -#define delay(x) { volatile int _d_; for (_d_ = 0; _d_ < (10000*x); _d_++) ; } - -#define NSCC_LINE 2 /* 2 ttys per chip */ - -#define SCC_DMA_TRANSFERS 0 - -struct scc_tty scc_tty[NSCC_LINE]; - -#define scc_tty_for(chan) (&scc_tty[chan]) -/* #define scc_unit(dev_no) (dev_no) */ - -#define scc_dev_no(chan) ((chan)^0x01) -#define scc_chan(dev_no) ((dev_no)^0x01) - -int serial_initted = 0; -unsigned int scc_parm_done = 0; - -static struct scc_byte { - unsigned char reg; - unsigned char val; -} scc_init_hw[] = { - - {9, 0x80}, - {4, 0x44}, - {3, 0xC0}, - {5, 0xE2}, - {2, 0x00}, - {10, 0x00}, - {11, 0x50}, - {12, 0x0A}, - {13, 0x00}, - {3, 0xC1}, - {5, 0xEA}, - {14, 0x01}, - {15, 0x00}, - {0, 0x10}, - {0, 0x10}, -#if 0 - {1, 0x12}, /* int or Rx, Tx int enable */ -#else - {1, 0x10}, /* int or Rx, no Tx int enable */ -#endif - {9, 0x0A} -}; - -static int scc_init_hw_count = sizeof(scc_init_hw)/sizeof(scc_init_hw[0]); - -enum scc_error {SCC_ERR_NONE, SCC_ERR_PARITY, SCC_ERR_BREAK, SCC_ERR_OVERRUN}; - - -/* - * BRG formula is: - * ClockFrequency (115200 for Power Mac) - * BRGconstant = --------------------------- - 2 - * BaudRate - */ - -#define 
SERIAL_CLOCK_FREQUENCY (115200*2) /* Power Mac value */ -#define convert_baud_rate(rate) ((((SERIAL_CLOCK_FREQUENCY) + (rate)) / (2 * (rate))) - 2) - -#define DEFAULT_SPEED 57600 -#define DEFAULT_PORT0_SPEED 1200 -#define DEFAULT_FLAGS (TF_LITOUT|TF_ECHO) - -int scc_param(struct scc_tty *tp); - - -struct scc_softc scc_softc[NSCC]; -caddr_t scc_std[NSCC] = { (caddr_t) 0}; - - -#define SCC_RR1_ERRS (SCC_RR1_FRAME_ERR|SCC_RR1_RX_OVERRUN|SCC_RR1_PARITY_ERR) -#define SCC_RR3_ALL (SCC_RR3_RX_IP_A|SCC_RR3_TX_IP_A|SCC_RR3_EXT_IP_A|\ - SCC_RR3_RX_IP_B|SCC_RR3_TX_IP_B|SCC_RR3_EXT_IP_B) - -#define DEBUG_SCC -#undef DEBUG_SCC - -#ifdef DEBUG_SCC -static int total_chars, total_ints, total_overruns, total_errors, num_ints, max_chars; -static int chars_received[8]; -static int __SCC_STATS = 0; -static int max_in_q = 0; -static int max_out_q = 0; -#endif - -DECL_FUNNEL(, scc_funnel) /* funnel to serialize the SCC driver */ -boolean_t scc_funnel_initted = FALSE; -#define SCC_FUNNEL scc_funnel -#define SCC_FUNNEL_INITTED scc_funnel_initted - - -/* - * Adapt/Probe/Attach functions - */ -boolean_t scc_uses_modem_control = FALSE;/* patch this with adb */ -decl_simple_lock_data(,scc_stomp) - -/* This is called VERY early on in the init and therefore has to have - * hardcoded addresses of the serial hardware control registers. 
The - * serial line may be needed for console and debugging output before - * anything else takes place - */ - -void -initialize_serial( caddr_t scc_phys_base, int32_t serial_baud ) -{ - int i, chan, bits; - scc_regmap_t regs; - DECL_FUNNEL_VARS - - assert( scc_phys_base ); - - if (!SCC_FUNNEL_INITTED) { - FUNNEL_INIT(&SCC_FUNNEL, master_processor); - SCC_FUNNEL_INITTED = TRUE; - } - FUNNEL_ENTER(&SCC_FUNNEL); - - if (serial_initted) { - FUNNEL_EXIT(&SCC_FUNNEL); - return; - } - - simple_lock_init(&scc_stomp, FALSE); - - if (serial_baud == -1) serial_baud = DEFAULT_SPEED; - - scc_softc[0].full_modem = TRUE; - - scc_std[0] = scc_phys_base; - - regs = scc_softc[0].regs = (scc_regmap_t)scc_std[0]; - - for (chan = 0; chan < NSCC_LINE; chan++) { - if (chan == 1) - scc_init_hw[0].val = 0x80; - - for (i = 0; i < scc_init_hw_count; i++) { - scc_write_reg(regs, chan, - scc_init_hw[i].reg, scc_init_hw[i].val); - } - } - - /* Call probe so we are ready very early for remote gdb and for serial - console output if appropriate. */ - if (scc_probe(serial_baud)) { - for (i = 0; i < NSCC_LINE; i++) { - scc_softc[0].softr[i].wr5 = SCC_WR5_DTR | SCC_WR5_RTS; - scc_param(scc_tty_for(i)); - /* Enable SCC interrupts (how many interrupts are to this thing?!?) */ - scc_write_reg(regs, i, 9, SCC_WR9_NV); - - scc_read_reg_zero(regs, 0, bits);/* Clear the status */ - } - scc_parm_done = 1; - } - - serial_initted = TRUE; - - FUNNEL_EXIT(&SCC_FUNNEL); - return; -} - -int -scc_probe(int32_t serial_baud) -{ - scc_softc_t scc; - int i; - scc_regmap_t regs; - spl_t s; - DECL_FUNNEL_VARS - - if (!SCC_FUNNEL_INITTED) { - FUNNEL_INIT(&SCC_FUNNEL, master_processor); - SCC_FUNNEL_INITTED = TRUE; - } - FUNNEL_ENTER(&SCC_FUNNEL); - - /* Readjust the I/O address to handling - * new memory mappings. 
- */ - - regs = (scc_regmap_t)scc_std[0]; - - if (regs == (scc_regmap_t) 0) { - FUNNEL_EXIT(&SCC_FUNNEL); - return 0; - } - - scc = &scc_softc[0]; - scc->regs = regs; - - s = splhigh(); - - for (i = 0; i < NSCC_LINE; i++) { - register struct scc_tty *tp; - tp = scc_tty_for(i); - tp->t_addr = (char*)(0x80000000L + (i&1)); - /* Set default values. These will be overridden on - open but are needed if the port will be used - independently of the Mach interfaces, e.g., for - gdb or for a serial console. */ - if (i == 0) { - tp->t_ispeed = DEFAULT_PORT0_SPEED; - tp->t_ospeed = DEFAULT_PORT0_SPEED; - } else { - tp->t_ispeed = serial_baud; - tp->t_ospeed = serial_baud; - } - tp->t_flags = DEFAULT_FLAGS; - scc->softr[i].speed = -1; - - /* do min buffering */ - tp->t_state |= TS_MIN; - - tp->t_dev = scc_dev_no(i); - } - - splx(s); - - FUNNEL_EXIT(&SCC_FUNNEL); - return 1; -} - -/* - * Get a char from a specific SCC line - * [this is only used for console&screen purposes] - * must be splhigh since it may be called from another routine under spl - */ - -int -scc_getc(__unused int unit, int line, boolean_t wait, __unused boolean_t raw) -{ - scc_regmap_t regs; - unsigned char c, value; - int rcvalue; - spl_t s = splhigh(); - DECL_FUNNEL_VARS - - FUNNEL_ENTER(&SCC_FUNNEL); - - - simple_lock(&scc_stomp); - regs = scc_softc[0].regs; - - /* - * wait till something available - * - */ -again: - rcvalue = 0; - while (1) { - scc_read_reg_zero(regs, line, value); - - if (value & SCC_RR0_RX_AVAIL) - break; - - if (!wait) { - simple_unlock(&scc_stomp); - splx(s); - FUNNEL_EXIT(&SCC_FUNNEL); - return -1; - } - } - - /* - * if nothing found return -1 - */ - - scc_read_reg(regs, line, SCC_RR1, value); - scc_read_data(regs, line, c); - -#if MACH_KDB - if (console_is_serial() && - c == ('_' & 0x1f)) { - /* Drop into the debugger */ - simple_unlock(&scc_stomp); - Debugger("Serial Line Request"); - simple_lock(&scc_stomp); - scc_write_reg(regs, line, SCC_RR0, SCC_RESET_HIGHEST_IUS); - if (wait) { 
- goto again; - } - simple_unlock(&scc_stomp); - splx(s); - FUNNEL_EXIT(&SCC_FUNNEL); - return -1; - } -#endif /* MACH_KDB */ - - /* - * bad chars not ok - */ - if (value&(SCC_RR1_PARITY_ERR | SCC_RR1_RX_OVERRUN | SCC_RR1_FRAME_ERR)) { - scc_write_reg(regs, line, SCC_RR0, SCC_RESET_ERROR); - - if (wait) { - scc_write_reg(regs, line, SCC_RR0, SCC_RESET_HIGHEST_IUS); - goto again; - } - } - - scc_write_reg(regs, line, SCC_RR0, SCC_RESET_HIGHEST_IUS); - - simple_unlock(&scc_stomp); - splx(s); - - FUNNEL_EXIT(&SCC_FUNNEL); - return c; -} - - -/* - * This front-ends scc_getc to make some intel changes easier - */ - -int _serial_getc(int unit, int line, boolean_t wait, boolean_t raw) { - - return(scc_getc(unit, line, wait, raw)); - -} - -/* - * Put a char on a specific SCC line - * use splhigh since we might be doing a printf in high spl'd code - */ - -void -scc_putc(__unused int unit, int line, int c) -{ - scc_regmap_t regs; - spl_t s; - unsigned char value; - DECL_FUNNEL_VARS - - - if (disable_serial_output) - return; - - s = splhigh(); - FUNNEL_ENTER(&SCC_FUNNEL); - simple_lock(&scc_stomp); - - regs = scc_softc[0].regs; - - do { - scc_read_reg(regs, line, SCC_RR0, value); - if (value & SCC_RR0_TX_EMPTY) - break; - delay(1); - } while (1); - - scc_write_data(regs, line, c); -/* wait for it to swallow the char ? 
*/ - - do { - scc_read_reg(regs, line, SCC_RR0, value); - if (value & SCC_RR0_TX_EMPTY) - break; - } while (1); - scc_write_reg(regs, line, SCC_RR0, SCC_RESET_HIGHEST_IUS); - simple_unlock(&scc_stomp); - - splx(s); - - FUNNEL_EXIT(&SCC_FUNNEL); -} - - -void -powermac_scc_set_datum(scc_regmap_t regs, unsigned int offset, unsigned char value) -{ - volatile unsigned char *address = (unsigned char *) regs + offset; - - assert(FUNNEL_IN_USE(&SCC_FUNNEL)); - - *address = value; - eieio(); - - assert(FUNNEL_IN_USE(&SCC_FUNNEL)); -} - -unsigned char -powermac_scc_get_datum(scc_regmap_t regs, unsigned int offset) -{ - volatile unsigned char *address = (unsigned char *) regs + offset; - unsigned char value; - - assert(FUNNEL_IN_USE(&SCC_FUNNEL)); - - value = *address; eieio(); - return value; - - assert(FUNNEL_IN_USE(&SCC_FUNNEL)); -} - -int -scc_param(struct scc_tty *tp) -{ - scc_regmap_t regs; - unsigned char value; - unsigned short speed_value; - int bits, chan; - spl_t s; - struct scc_softreg *sr; - scc_softc_t scc; - - assert(FUNNEL_IN_USE(&SCC_FUNNEL)); - - s = splhigh(); - simple_lock(&scc_stomp); - - chan = scc_chan(tp->t_dev); - scc = &scc_softc[0]; - regs = scc->regs; - - sr = &scc->softr[chan]; - - /* Do a quick check to see if the hardware needs to change */ - if ((sr->flags & (TF_ODDP|TF_EVENP)) == (tp->t_flags & (TF_ODDP|TF_EVENP)) - && sr->speed == (unsigned long)tp->t_ispeed) { - assert(FUNNEL_IN_USE(&SCC_FUNNEL)); - simple_unlock(&scc_stomp); - splx(s); - return 0; - } - - if(scc_parm_done) { - - scc_write_reg(regs, chan, 3, SCC_WR3_RX_8_BITS|SCC_WR3_RX_ENABLE); - sr->wr1 = SCC_WR1_RXI_FIRST_CHAR | SCC_WR1_EXT_IE; - scc_write_reg(regs, chan, 1, sr->wr1); - scc_write_reg(regs, chan, 15, SCC_WR15_ENABLE_ESCC); - scc_write_reg(regs, chan, 7, SCC_WR7P_RX_FIFO); - scc_write_reg(regs, chan, 0, SCC_IE_NEXT_CHAR); - scc_write_reg(regs, chan, 0, SCC_RESET_EXT_IP); - scc_write_reg(regs, chan, 0, SCC_RESET_EXT_IP); - scc_write_reg(regs, chan, 9, 
SCC_WR9_MASTER_IE|SCC_WR9_NV); - scc_read_reg_zero(regs, 0, bits); - sr->wr1 = SCC_WR1_RXI_FIRST_CHAR | SCC_WR1_EXT_IE; - scc_write_reg(regs, chan, 1, sr->wr1); - scc_write_reg(regs, chan, 0, SCC_IE_NEXT_CHAR); - simple_unlock(&scc_stomp); - splx(s); - return 0; - } - - sr->flags = tp->t_flags; - sr->speed = tp->t_ispeed; - - - if (tp->t_ispeed == 0) { - sr->wr5 &= ~SCC_WR5_DTR; - scc_write_reg(regs, chan, 5, sr->wr5); - simple_unlock(&scc_stomp); - splx(s); - - assert(FUNNEL_IN_USE(&SCC_FUNNEL)); - return 0; - } - - -#if SCC_DMA_TRANSFERS - if (scc->dma_initted & (1<dma_ops->scc_dma_reset_rx(chan); -#endif - - value = SCC_WR4_1_STOP; - - /* - * For 115K the clocking divide changes to 64.. to 230K will - * start at the normal clock divide 16. - * - * However, both speeds will pull from a different clocking - * source - */ - - if (tp->t_ispeed == 115200) - value |= SCC_WR4_CLK_x32; - else - value |= SCC_WR4_CLK_x16 ; - - /* .. and parity */ - if ((tp->t_flags & (TF_ODDP | TF_EVENP)) == TF_EVENP) - value |= (SCC_WR4_EVEN_PARITY | SCC_WR4_PARITY_ENABLE); - else if ((tp->t_flags & (TF_ODDP | TF_EVENP)) == TF_ODDP) - value |= SCC_WR4_PARITY_ENABLE; - - /* set it now, remember it must be first after reset */ - sr->wr4 = value; - - /* Program Parity, and Stop bits */ - scc_write_reg(regs, chan, 4, sr->wr4); - - /* Setup for 8 bits */ - scc_write_reg(regs, chan, 3, SCC_WR3_RX_8_BITS); - - // Set DTR, RTS, and transmitter bits/character. - sr->wr5 = SCC_WR5_TX_8_BITS | SCC_WR5_RTS | SCC_WR5_DTR; - - scc_write_reg(regs, chan, 5, sr->wr5); - - scc_write_reg(regs, chan, 14, 0); /* Disable baud rate */ - - /* Setup baud rate 57.6Kbps, 115K, 230K should all yeild - * a converted baud rate of zero - */ - speed_value = convert_baud_rate(tp->t_ispeed); - - if (speed_value == 0xffff) - speed_value = 0; - - scc_set_timing_base(regs, chan, speed_value); - - if (tp->t_ispeed == 115200 || tp->t_ispeed == 230400) { - /* Special case here.. 
change the clock source*/ - scc_write_reg(regs, chan, 11, 0); - /* Baud rate generator is disabled.. */ - } else { - scc_write_reg(regs, chan, 11, SCC_WR11_RCLK_BAUDR|SCC_WR11_XTLK_BAUDR); - /* Enable the baud rate generator */ - scc_write_reg(regs, chan, 14, SCC_WR14_BAUDR_ENABLE); - } - - - scc_write_reg(regs, chan, 3, SCC_WR3_RX_8_BITS|SCC_WR3_RX_ENABLE); - - - sr->wr1 = SCC_WR1_RXI_FIRST_CHAR | SCC_WR1_EXT_IE; - scc_write_reg(regs, chan, 1, sr->wr1); - scc_write_reg(regs, chan, 15, SCC_WR15_ENABLE_ESCC); - scc_write_reg(regs, chan, 7, SCC_WR7P_RX_FIFO); - scc_write_reg(regs, chan, 0, SCC_IE_NEXT_CHAR); - - - /* Clear out any pending external or status interrupts */ - scc_write_reg(regs, chan, 0, SCC_RESET_EXT_IP); - scc_write_reg(regs, chan, 0, SCC_RESET_EXT_IP); - //scc_write_reg(regs, chan, 0, SCC_RESET_ERROR); - - /* Enable SCC interrupts (how many interrupts are to this thing?!?) */ - scc_write_reg(regs, chan, 9, SCC_WR9_MASTER_IE|SCC_WR9_NV); - - scc_read_reg_zero(regs, 0, bits);/* Clear the status */ - -#if SCC_DMA_TRANSFERS - if (scc->dma_initted & (1<dma_ops->scc_dma_start_rx(chan); - scc->dma_ops->scc_dma_setup_8530(chan); - } else -#endif - { - sr->wr1 = SCC_WR1_RXI_FIRST_CHAR | SCC_WR1_EXT_IE; - scc_write_reg(regs, chan, 1, sr->wr1); - scc_write_reg(regs, chan, 0, SCC_IE_NEXT_CHAR); - } - - sr->wr5 |= SCC_WR5_TX_ENABLE; - scc_write_reg(regs, chan, 5, sr->wr5); - - simple_unlock(&scc_stomp); - splx(s); - - assert(FUNNEL_IN_USE(&SCC_FUNNEL)); - return 0; - -} -#endif /* NSCC > 0 */ diff --git a/osfmk/ppc/serial_io.h b/osfmk/ppc/serial_io.h deleted file mode 100644 index a280fa1a4..000000000 --- a/osfmk/ppc/serial_io.h +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). 
You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * @APPLE_FREE_COPYRIGHT@ - */ - -#ifndef _PPC_SERIAL_IO_H_ -#define _PPC_SERIAL_IO_H_ - -#include -#include -#include - -/* - * Console is on the Printer Port (chip channel 0) - * Debugger is on the Modem Port (chip channel 1) - */ - -#define CONSOLE_PORT 1 - -struct scc_tty { - char * t_addr; /* device pointer */ - int t_dev; /* device number */ - int t_ispeed; /* input speed */ - int t_ospeed; /* output speed */ - char t_breakc; /* character to deliver when 'break' - condition received */ - int t_flags; /* mode flags */ - int t_state; /* current state */ - int t_line; /* fake line discipline number, - for old drivers - always 0 */ - int t_outofband; /* current out-of-band events */ - int t_outofbandarg; /* arg to first out-of-band event */ - int t_nquoted; /* number of quoted chars in inq */ - int t_hiwater; /* baud-rate limited high water mark */ - int t_lowater; /* baud-rate limited low water 
mark */ -}; -typedef struct scc_tty *scc_tty_t; - -/* - * function declarations for performing serial i/o - * other functions below are declared in kern/misc_protos.h - * cnputc, cngetc, cnmaygetc - */ - -void initialize_serial(caddr_t scc_phys_base, int32_t serial_baud); - -extern int scc_probe(int32_t serial_baud); - -#if 0 -extern int scc_open( - dev_t dev, - dev_mode_t flag, - io_req_t ior); - -extern void scc_close( - dev_t dev); - -extern int scc_read( - dev_t dev, - io_req_t ior); - -extern io_return_t scc_write( - dev_t dev, - io_req_t ior); - -extern io_return_t scc_get_status( - dev_t dev, - dev_flavor_t flavor, - dev_status_t data, - mach_msg_type_number_t *status_count); - -extern io_return_t scc_set_status( - dev_t dev, - dev_flavor_t flavor, - dev_status_t data, - mach_msg_type_number_t status_count); - -extern boolean_t scc_portdeath( - dev_t dev, - ipc_port_t port); - -#endif /* 0 */ - -extern void scc_putc( - int unit, - int line, - int c); - -extern int scc_getc( - int unit, - int line, - boolean_t wait, - boolean_t raw); - -/* - * JMM - We are not really going to support this driver in SMP (barely - * support it now - so just pick up the stubbed out versions. 
- */ -#define DECL_FUNNEL(class,f) -#define DECL_FUNNEL_VARS -#define FUNNEL_INIT(f,p) -#define FUNNEL_ENTER(f) -#define FUNNEL_EXIT(f) -#define FUNNEL_ESCAPE(f) (1) -#define FUNNEL_REENTER(f,count) -#define FUNNEL_IN_USE(f) (TRUE) - -/* - * Flags - */ -#define TF_ODDP 0x00000002 /* get/send odd parity */ -#define TF_EVENP 0x00000004 /* get/send even parity */ -#define TF_ANYP (TF_ODDP|TF_EVENP) - /* get any parity/send none */ -#define TF_LITOUT 0x00000008 /* output all 8 bits - otherwise, characters >= 0x80 - are time delays XXX */ -#define TF_ECHO 0x00000080 /* device wants user to echo input */ -#define TS_MIN 0x00004000 /* buffer input chars, if possible */ - -#endif /* _PPC_SERIAL_IO_H_ */ diff --git a/osfmk/ppc/setjmp.h b/osfmk/ppc/setjmp.h deleted file mode 100644 index 2c7b1b9fc..000000000 --- a/osfmk/ppc/setjmp.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -#ifndef _PPC_SETJMP_H_ -#define _PPC_SETJMP_H_ - -/* - * We save the following registers (marked as non-volatile in the ELF spec) - * - * r1 - stack pointer - * r13 - small data area pointer - * r14-r30 - local variables - * r31 - local variable/environment pointer - * - * cr - condition register - * lr - link register (to know where to jump back to) - * xer - fixed point exception register - * - * fpscr - floating point status and control - * f14-f31 - local variables - * - * which comes to 57 words. We round up to 64 for good measure. - */ - -typedef struct jmp_buf { - int jmp_buf[64]; -} jmp_buf_t; - -#endif /* _PPC_SETJMP_H_ */ diff --git a/osfmk/ppc/simple_lock.h b/osfmk/ppc/simple_lock.h deleted file mode 100644 index 80be1e6ff..000000000 --- a/osfmk/ppc/simple_lock.h +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
- * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ - -#ifdef KERNEL_PRIVATE - -#ifndef _PPC_SIMPLE_LOCK_TYPES_H_ -#define _PPC_SIMPLE_LOCK_TYPES_H_ - -#ifdef KERNEL_PRIVATE -#include -#include - -#include -#ifdef MACH_KERNEL_PRIVATE -#include -#include -#include -#endif - -#ifdef MACH_KERNEL_PRIVATE - -#if MACH_LDEBUG -#define USLOCK_DEBUG 1 -#else -#define USLOCK_DEBUG 0 -#endif - -#if !USLOCK_DEBUG - -typedef lck_spin_t usimple_lock_data_t, *usimple_lock_t; - -#else - -typedef struct uslock_debug { - void *lock_pc; /* pc where lock operation began */ - void *lock_thread; /* thread that acquired lock */ - unsigned long duration[2]; - unsigned short state; - unsigned char lock_cpu; - void *unlock_thread; /* last thread to release lock */ - unsigned char unlock_cpu; - void *unlock_pc; /* pc where lock operation ended */ -} uslock_debug; - -typedef struct { - hw_lock_data_t interlock; /* must be first... see lock.c */ - unsigned short lock_type; /* must be second... 
see lock.c */ -#define USLOCK_TAG 0x5353 - uslock_debug debug; -} usimple_lock_data_t, *usimple_lock_t; - -#endif /* USLOCK_DEBUG */ - -#else - -typedef struct slock { - unsigned int lock_data[10]; -} usimple_lock_data_t, *usimple_lock_t; - -#endif /* MACH_KERNEL_PRIVATE */ - -#define USIMPLE_LOCK_NULL ((usimple_lock_t) 0) - -#if !defined(decl_simple_lock_data) - -typedef usimple_lock_data_t *simple_lock_t; -typedef usimple_lock_data_t simple_lock_data_t; - -#define decl_simple_lock_data(class,name) \ - class simple_lock_data_t name; - -#endif /* !defined(decl_simple_lock_data) */ - -#ifdef MACH_KERNEL_PRIVATE -#if !MACH_LDEBUG - -#define MACHINE_SIMPLE_LOCK - -extern void ppc_usimple_lock_init(simple_lock_t,unsigned short); -extern void ppc_usimple_lock(simple_lock_t); -extern void ppc_usimple_unlock_rwmb(simple_lock_t); -extern void ppc_usimple_unlock_rwcmb(simple_lock_t); -extern unsigned int ppc_usimple_lock_try(simple_lock_t); - -#define simple_lock_init(l,t) ppc_usimple_lock_init(l,t) -#define simple_lock(l) ppc_usimple_lock(l) -#define simple_unlock(l) ppc_usimple_unlock_rwcmb(l) -#define simple_unlock_rwmb(l) ppc_usimple_unlock_rwmb(l) -#define simple_lock_try(l) ppc_usimple_lock_try(l) -#define simple_lock_addr(l) (&(l)) -#define thread_sleep_simple_lock(l, e, i) \ - thread_sleep_fast_usimple_lock((l), (e), (i)) -#endif /* !MACH_LDEBUG */ - -extern unsigned int hw_lock_bit( - unsigned int *, - unsigned int, - unsigned int); - -extern unsigned int hw_cpu_sync( - unsigned int *, - unsigned int); - -extern unsigned int hw_cpu_wcng( - unsigned int *, - unsigned int, - unsigned int); - -extern unsigned int hw_lock_mbits( - unsigned int *, - unsigned int, - unsigned int, - unsigned int, - unsigned int); - -void hw_unlock_bit( - unsigned int *, - unsigned int); - -#endif /* MACH_KERNEL_PRIVATE */ -#endif /* KERNEL_PRIVATE */ - -#endif /* !_PPC_SIMPLE_LOCK_TYPES_H_ */ - -#endif /* KERNEL_PRIVATE */ diff --git a/osfmk/ppc/skiplists.s b/osfmk/ppc/skiplists.s deleted 
file mode 100644 index 69a9dccbb..000000000 --- a/osfmk/ppc/skiplists.s +++ /dev/null @@ -1,1297 +0,0 @@ -/* - * Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* skiplists.s - * - * These are the subroutines that manage the skip-list data structures used for the - * resident mappings for each pmap. We used to use a much simpler hash-based scheme, - * but it didn't scale well for 64-bit address spaces and multi-GB real memories. - * Here's a brief tutorial on skip-lists: - * - * The basic idea is that each mapping is on one or more singly-linked lists, sorted - * in increasing order by virtual address. 
The number of lists a mapping is on is an - * invariant property determined when the mapping is created, using an exponentially- - * distributed random number. Every mapping is on the first list. Ideally, each - * successive list has only 1/F as many nodes on it as the previous, where F is the - * "fanout." With a max of n lists, up to F**n nodes can be handled optimally. - * - * Searching, adding, and deleting from a skip-list can all be done in O(ln(n)) time. - * Because the first skip-list is just a sorted list of all mappings, it is also - * efficient to purge a sparsely populated pmap of all the mappings in a large range, - * for example when tearing down an address space. Large-range deletes are the - * primary advantage of skip-lists over a hash, btw. - * - * We currently use a fanout of 4 and a maximum of 12 lists (cf kSkipListFanoutShift - * and kSkipListMaxLists.) Thus, we can optimally handle pmaps with as many as 4**12 - * pages, which is 64GB of resident physical memory per pmap. Pmaps can be larger than - * this, albeit with diminishing efficiency. - * - * The major problem with skip-lists is that we could waste a lot of space with 12 - * 64-bit link fields in every mapping. So we currently have two sizes of mappings: - * 64-byte nodes with 4 list links, and 128-byte nodes with 12. Only one in every - * (4**4)==256 mappings requires the larger node, so the average size is 64.25 bytes. - * In practice, the additional complexity of the variable node size is entirely - * contained in the allocate and free routines. - * - * The other, mostly theoretic problem with skip-lists is that they have worst cases - * where performance becomes nearly linear. These worst-cases are quite rare but there - * is no practical way to prevent them. 
- */ - - -; set nonzero to accumulate skip-list stats on a per-map basis: -#define SKIPLISTSTATS 1 - -; cr7 bit set when mapSearchFull() finds a match on a high list: -#define bFullFound 28 - -#include -#include -#include -#include -#include - - -/* - * ********************* - * * m a p S e a r c h * - * ********************* - * - * Given a pmap and a virtual address (VA), find the mapping for that address. - * This is the fast call, that does not set up the previous-ptr vector or make - * consistency checks. When called: - * the pmap is locked (shared or exclusive) - * translation is off, interrupts masked - * 64-bit mode is enabled (if on a 64-bit machine) - * cr6 is loaded with the corresponding feature flags (in particular, pf64Bit) - * r3 = pmap ptr - * r4 = high 32 bits of key to search for (0 if a 32-bit processor) - * r5 = low 32 bits of key (low 12 bits may be nonzero garbage) - * r7 = mpFlags field if found. Undefined if not - * - * We return the mapping ptr (or 0) in r3, and the next VA (or 0 if no more) in r4 and r5. - * Except for cr6 (which is global), we trash nonvolatile regs. Called both on 32- and 64-bit - * machines, though we quickly branch into parallel code paths. - */ - .text - .align 5 - .globl EXT(mapSearch) -LEXT(mapSearch) - lbz r7,pmapCurLists(r3) ; get largest #lists any mapping is on - la r8,pmapSkipLists+4(r3) ; point to lists in pmap, assuming 32-bit machine - rlwinm r5,r5,0,0,19 ; zero low 12 bits of key - mr r6,r3 ; save pmap ptr here so we can accumulate statistics - li r9,0 ; initialize prev ptr - addic. 
r7,r7,-1 ; get base-0 number of last list, and test for 0 - li r2,0 ; initialize count of mappings visited - slwi r7,r7,3 ; get offset of last list in use - blt-- mapSrchPmapEmpty ; pmapCurLists==0 (ie, no mappings) - lwzx r3,r8,r7 ; get 32-bit ptr to 1st mapping in highest list - bf-- pf64Bitb,mapSrch32c ; skip if 32-bit processor - subi r8,r8,4 ; we use all 64 bits of ptrs - rldimi r5,r4,32,0 ; r5 <- 64-bit va - ldx r3,r8,r7 ; get 64-bit ptr to 1st mapping in highest list - b mapSrch64c ; enter 64-bit search loop - - - ; 64-bit processors. Check next mapping. - ; r2 = count of mappings visited so far - ; r3 = current mapping ptr - ; r4 = va of current mapping (ie, of r3) - ; r5 = va to search for (the "key") (low 12 bits are 0) - ; r6 = pmap ptr - ; r7 = current skip list number * 8 - ; r8 = ptr to skip list vector of mapping pointed to by r9 (or pmap, if r9==0) - ; r9 = prev ptr, or 0 if none - - .align 5 -mapSrch64a: ; loop over each mapping - ld r4,mpVAddr(r3) ; get va for this mapping (plus flags in low 12 bits) - addi r2,r2,1 ; count mappings visited - rldicr r4,r4,0,51 ; zero low 12 bits of mapping va - cmpld cr1,r5,r4 ; compare the vas - blt cr1,mapSrch64d ; key is less, try next list - la r8,mpList0(r3) ; point to skip list vector in this mapping - mr r9,r3 ; remember prev ptr - beq-- cr1,mapSrch64Found ; this is the correct mapping - ldx r3,r7,r8 ; get ptr to next mapping in current list -mapSrch64c: - mr. r3,r3 ; was there another mapping on current list? - bne++ mapSrch64a ; was another, so loop -mapSrch64d: - subic. r7,r7,8 ; move on to next list offset - ldx r3,r7,r8 ; get next mapping on next list (if any) - bge++ mapSrch64c ; loop to try next list - - ; Mapping not found, check to see if prev node was a block mapping or nested pmap. - ; If not, or if our address is not covered by the block or nested map, return 0. 
- ; Note the advantage of keeping the check for block mappings (and nested pmaps) - ; out of the inner loop; we do the special case work at most once per search, and - ; never for the most-common case of finding a scalar mapping. The full searches - ; must check _in_ the inner loop, to get the prev ptrs right. - - mr. r9,r9 ; was there a prev ptr? - li r3,0 ; assume we are going to return null - ld r4,pmapSkipLists(r6) ; assume prev ptr null... so next is first - beq-- mapSrch64Exit ; prev ptr was null, search failed - lwz r0,mpFlags(r9) ; get flag bits from prev mapping - lhz r11,mpBSize(r9) ; get #pages/#segments in block/submap mapping - - rlwinm r0,r0,mpBSub+1,31,31 ; 0 if 4K bsu or 1 if 32MB bsu - ld r10,mpVAddr(r9) ; re-fetch base address of prev ptr - ori r0,r0,0x3216 ; OR in 0x00003216 (0x3200 and a base rotate of 22) - addi r11,r11,1 ; Convert 0-based to 1-based - rlwnm r0,r0,r0,27,31 ; Rotate to get 12 or 25 - ld r4,mpList0(r9) ; get 64-bit ptr to next mapping, if any - sld r11,r11,r0 ; Get the length in bytes - rldicr r10,r10,0,51 ; zero low 12 bits of mapping va - subi r0,r11,4096 ; get offset last page in mapping - add r10,r10,r0 ; r10 <- last page in this mapping - cmpld r5,r10 ; does this mapping cover our page? - bgt mapSrch64Exit ; no, search failed - mr r3,r9 ; yes, we found it - - ; found the mapping - ; r2 = count of nodes visited - ; r3 = the mapping - ; r6 = pmap ptr - -mapSrch64Found: ; WARNING: can drop down to here - ld r4,mpList0(r3) ; get ptr to next mapping - lwz r7,mpFlags(r3) ; Get the flags for our caller - - ; r2 = count of nodes visited - ; r3 = return value (ie, found mapping or 0) - ; r4 = next mapping (or 0 if none) - ; r6 = pmap ptr - ; r7 = mpFlags - -mapSrch64Exit: ; WARNING: can drop down to here - mr. r5,r4 ; next ptr null? 
-#if SKIPLISTSTATS - lwz r10,pmapSearchCnt(r6) ; prepare to accumulate statistics - ld r8,pmapSearchVisits(r6) - addi r10,r10,1 ; count searches - add r8,r8,r2 ; count nodes visited - stw r10,pmapSearchCnt(r6) - std r8,pmapSearchVisits(r6) -#endif - beqlr- ; next ptr was null, so return 0 in r4 and r5 - lwz r5,mpVAddr+4(r4) ; get VA of next node - lwz r4,mpVAddr+0(r4) - blr - - - ; 32-bit processors. Check next mapping. - ; r2 = count of mappings visited so far - ; r3 = current mapping ptr - ; r4 = va of current mapping (ie, of r3) - ; r5 = va to search for (the "key") (low 12 bits are 0) - ; r6 = pmap ptr - ; r7 = current skip list number * 8 - ; r8 = ptr to skip list vector of mapping pointed to by r9 (or pmap, if r9==0) - ; r9 = prev ptr, or 0 if none - - .align 4 -mapSrch32a: ; loop over each mapping - lwz r4,mpVAddr+4(r3) ; get va for this mapping (plus flags in low 12 bits) - addi r2,r2,1 ; count mappings visited - rlwinm r4,r4,0,0,19 ; zero low 12 bits of mapping va - cmplw cr1,r5,r4 ; compare the vas - blt cr1,mapSrch32d ; key is less, try next list - la r8,mpList0+4(r3) ; point to skip list vector in this mapping - mr r9,r3 ; remember prev ptr - beq- cr1,mapSrch32Found ; this is the correct mapping - lwzx r3,r7,r8 ; get ptr to next mapping in current list -mapSrch32c: - mr. r3,r3 ; was there another mapping on current list? - bne+ mapSrch32a ; was another, so loop -mapSrch32d: - subic. r7,r7,8 ; move on to next list offset - lwzx r3,r7,r8 ; get next mapping on next list (if any) - bge+ mapSrch32c ; loop to try next list - - ; Mapping not found, check to see if prev node was a block mapping or nested pmap. - ; If not, or if our address is not covered by the block or nested map, return 0. - ; Note the advantage of keeping the check for block mappings (and nested pmaps) - ; out of the inner loop; we do the special case work at most once per search, and - ; never for the most-common case of finding a scalar mapping. 
The full searches - ; must check _in_ the inner loop, to get the prev ptrs right. - - mr. r9,r9 ; was there a prev ptr? - li r3,0 ; assume we are going to return null - lwz r4,pmapSkipLists+4(r6) ; assume prev ptr null... so next is first - beq- mapSrch32Exit ; prev ptr was null, search failed - lwz r0,mpFlags(r9) ; get flag bits from prev mapping - lhz r11,mpBSize(r9) ; get #pages/#segments in block/submap mapping - lwz r10,mpVAddr+4(r9) ; re-fetch base address of prev ptr - - rlwinm r0,r0,mpBSub+1,31,31 ; Rotate to get 0 if 4K bsu or 1 if 32MB bsu - addi r11,r11,1 ; Convert 0-based to 1-based - ori r0,r0,0x3216 ; OR in 0x00003216 (0x3200 and a base rotate of 22) - rlwnm r0,r0,r0,27,31 ; Rotate to get 12 or 25 - lwz r4,mpList0+4(r9) ; get ptr to next mapping, if any - slw r11,r11,r0 ; Get length in bytes - rlwinm r10,r10,0,0,19 ; zero low 12 bits of block mapping va - subi r0,r11,4096 ; get address of last page in submap - add r10,r10,r0 ; r10 <- last page in this mapping - cmplw r5,r10 ; does this mapping cover our page? - bgt mapSrch32Exit ; no, search failed - mr r3,r9 ; yes, we found it - - ; found the mapping - ; r2 = count of nodes visited - ; r3 = the mapping - ; r6 = pmap ptr - -mapSrch32Found: ; WARNING: can drop down to here - lwz r4,mpList0+4(r3) ; get ptr to next mapping - lwz r7,mpFlags(r3) ; Get mpFlags for our caller - ; r2 = count of nodes visited - ; r3 = return value (ie, found mapping or 0) - ; r4 = next mapping (or 0 if none) - ; r6 = pmap ptr - ; r7 = mpFlags - -mapSrch32Exit: - mr. r5,r4 ; next ptr null? 
-#if SKIPLISTSTATS - lwz r10,pmapSearchCnt(r6) ; prepare to accumulate statistics - lwz r8,pmapSearchVisits(r6) - lwz r9,pmapSearchVisits+4(r6) - addi r10,r10,1 ; count searches - addc r9,r9,r2 ; count nodes visited - addze r8,r8 - stw r10,pmapSearchCnt(r6) - stw r8,pmapSearchVisits(r6) - stw r9,pmapSearchVisits+4(r6) -#endif - beqlr- ; next ptr was null, so return 0 in r4 and r5 - lwz r5,mpVAddr+4(r4) ; get VA of next node - lwz r4,mpVAddr+0(r4) - blr - - ; Here when the pmap is empty (ie, pmapCurLists==0), both in 32 and 64-bit mode, - ; and from both mapSearch and mapSearchFull. - ; r6 = pmap ptr - -mapSrchPmapEmpty: - li r3,0 ; return null - li r4,0 ; return 0 as virtual address of next node - li r5,0 -#if SKIPLISTSTATS - lwz r7,pmapSearchCnt(r6) ; prepare to accumulate statistics - addi r7,r7,1 ; count searches - stw r7,pmapSearchCnt(r6) -#endif - blr - - -/* - * ***************************** - * * m a p S e a r c h F u l l * - * ***************************** - * - * Given a pmap and a virtual address (VA), find the mapping for that address. - * This is the "full" call, that sets up a vector of ptrs to the previous node - * (or to the pmap, if there is no previous node) for each list that the mapping - * in on. We also make consistency checks on the skip-lists. When called: - * the pmap is locked (shared or exclusive) - * translation is off, interrupts masked - * 64-bit mode is enabled (if on a 64-bit machine) - * cr6 is loaded with the corresponding feature flags (in particular, pf64Bit) - * r3 = pmap ptr - * r4 = high 32 bits of key to search for (0 if a 32-bit processor) - * r5 = low 32 bits of key (low 12 bits may be nonzero garbage) - * - * We return the mapping ptr (or 0) in r3, and the next VA (or 0 if no more) in r4 and r5. - * Except for cr6 (which is global), we trash nonvolatile regs. Called both on 32- and 64-bit - * machines, though we quickly branch into parallel code paths. 
- */ - .text - .align 5 - .globl EXT(mapSearchFull) -LEXT(mapSearchFull) - lbz r7,pmapCurLists(r3) ; get largest #lists any mapping is on - la r8,pmapSkipLists+4(r3) ; point to lists in pmap, assuming 32-bit machine - rlwinm r5,r5,0,0,19 ; zero low 12 bits of key - mr r6,r3 ; save pmap ptr here so we can accumulate statistics - li r2,0 ; initialize count of mappings visited - mfsprg r12,0 ; get the per-proc data ptr - crclr bFullFound ; we have not found the mapping yet - addic. r7,r7,-1 ; get base-0 number of last list, and test for 0 - subi r9,r8,mpList0+4 ; initialize prev ptr to be a fake mapping - slwi r7,r7,3 ; get (offset*8) of last list - la r12,skipListPrev+4(r12) ; point to vector of prev ptrs, assuming 32-bit machine - blt-- mapSrchPmapEmpty ; pmapCurLists==0 (ie, no mappings) - lwzx r3,r8,r7 ; get 32-bit ptr to 1st mapping in highest list - li r10,0 ; initialize prev ptrs VA to 0 too - bf-- pf64Bitb,mapSrchFull32c ; skip if 32-bit processor - subi r8,r8,4 ; we use all 64 bits of ptrs - subi r12,r12,4 - rldimi r5,r4,32,0 ; r5 <- 64-bit va - ldx r3,r8,r7 ; get 64-bit ptr to 1st mapping in highest list - b mapSrchFull64c ; enter 64-bit search loop - - - ; 64-bit processors. Check next mapping. 
- ; r2 = count of mappings visited so far - ; r3 = current mapping ptr - ; r4 = va of current mapping (ie, of r3) - ; r5 = va to search for (the "key") (low 12 bits are 0) - ; r6 = pmap ptr - ; r7 = current skip list number * 8 - ; r8 = ptr to skip list vector of mapping pointed to by r9 - ; r9 = prev ptr, ie highest mapping that comes before search target (initially the pmap) - ; r10 = lowest expected next va, 0 at the beginning of the search - ; r12 = ptr to the skipListPrev vector in the per-proc - - .align 5 -mapSrchFull64a: ; loop over each mapping - addi r2,r2,1 ; count mappings visited - lwz r0,mpFlags(r3) ; get mapping flag bits - lhz r11,mpBSize(r3) ; get #pages/#segments in block/submap mapping - ld r4,mpVAddr(r3) ; get va for this mapping (plus flags in low 12 bits) - - rlwinm r0,r0,mpBSub+1,31,31 ; Rotate to get 0 if 4K bsu or 1 if 32MB bsu - addi r11,r11,1 ; Convert 0-based to 1-based - ori r0,r0,0x3216 ; OR in 0x00003216 (0x3200 and a base rotate of 22) - rlwnm r0,r0,r0,27,31 ; Rotate to get 12 or 25 - sld r11,r11,r0 ; Get the length in bytes - rldicr r4,r4,0,51 ; zero low 12 bits of mapping va - addic. r0,r11,-4096 ; get offset last page in mapping (set cr0_eq if 1 page) - - cmpld cr5,r10,r4 ; make sure VAs come in strictly ascending order - cmpld cr1,r5,r4 ; compare the vas - bgt-- cr5,mapSkipListPanic ; die if keys are out of order - - blt cr1,mapSrchFull64d ; key is less, try next list - beq cr1,mapSrchFull64Found ; this is the correct mapping - bne-- cr0,mapSrchFull64e ; handle mapping larger than one page -mapSrchFull64b: - la r8,mpList0(r3) ; point to skip list vector in this mapping - mr r9,r3 ; current becomes previous - ldx r3,r7,r8 ; get ptr to next mapping in current list - addi r10,r4,0x1000 ; Get the lowest VA we can get next -mapSrchFull64c: - mr. r3,r3 ; was there another mapping on current list? - bne++ mapSrchFull64a ; was another, so loop -mapSrchFull64d: - stdx r9,r7,r12 ; save prev ptr in per-proc vector - subic. 
r7,r7,8 ; move on to next list offset - ldx r3,r7,r8 ; get next mapping on next list (if any) - bge++ mapSrchFull64c ; loop to try next list - - ; Mapping not found, return 0 and next higher key - - li r3,0 ; return null - bt-- bFullFound,mapSkipListPanic ; panic if it was on earlier list - ld r4,mpList0(r9) ; get 64-bit ptr to next mapping, if any - b mapSrch64Exit - - ; Block mapping or nested pmap, and key > base. We must compute the va of - ; the end of the block to see if key fits within it. - -mapSrchFull64e: - add r4,r4,r0 ; r4 <- last page in this mapping - cmpld r5,r4 ; does this mapping cover our page? - bgt mapSrchFull64b ; no, try next mapping (r4 is advanced to end of range) - - - ; found the mapping - ; r2 = count of nodes visited - ; r3 = the mapping - ; r6 = pmap ptr - ; r7 = current skip list number * 8 - ; r8 = ptr to prev mappings (ie, r9) skip-list vector - ; r9 = prev ptr, ie highest mapping that comes before search target - ; r10 = prev mappings va - ; r12 = ptr to the skipListPrev vector in the per-proc - -mapSrchFull64Found: ; WARNING: can drop down to here - cmpwi r7,0 ; are we in the last skip-list? - crset bFullFound ; remember that we found the mapping - bne mapSrchFull64d ; mapSearchFull must search all lists to get prev ptrs - ld r4,mpList0(r3) ; get ptr to next mapping - stdx r9,r7,r12 ; save prev ptr in last list - lwz r7,mpFlags(r3) ; Get the flags for our caller - b mapSrch64Exit - - - ; 32-bit processors. Check next mapping. 
- ; r2 = count of nodes visited - ; r3 = ptr to next mapping in current list - ; r5 = va to search for (the "key") (low 12 bits are 0) - ; r6 = pmap ptr - ; r7 = current skip list number * 8 - ; r8 = ptr to skip list vector of mapping pointed to by r9 - ; r9 = prev ptr, ie highest mapping that comes before search target (initially the pmap) - ; r10 = lowest expected next va, 0 at the beginning of the search - ; r12 = ptr to the skipListPrev vector in the per-proc - - .align 4 -mapSrchFull32a: ; loop over each mapping - addi r2,r2,1 ; count mappings visited - lwz r0,mpFlags(r3) ; get mapping flag bits - lhz r11,mpBSize(r3) ; get #pages/#segments in block/submap mapping - lwz r4,mpVAddr+4(r3) ; get va for this mapping (plus flags in low 12 bits) - - rlwinm r0,r0,mpBSub+1,31,31 ; Rotate to get 0 if 4K bsu or 1 if 32MB bsu - addi r11,r11,1 ; Convert 0-based to 1-based - ori r0,r0,0x3216 ; OR in 0x00003216 (0x3200 and a base rotate of 22) - rlwnm r0,r0,r0,27,31 ; Rotate to get 12 or 25 - slw r11,r11,r0 ; Get the length in bytes - rlwinm r4,r4,0,0,19 ; zero low 12 bits of mapping va - addic. r0,r11,-4096 ; get offset last page in mapping (set cr0_eq if 1 page) - - cmplw cr0,r10,r4 ; make sure VAs come in strictly ascending order - cmplw cr1,r5,r4 ; compare the vas - bgt- cr0,mapSkipListPanic ; die if keys are out of order - - blt cr1,mapSrchFull32d ; key is less than this va, try next list - beq cr1,mapSrchFull32Found ; this is the correct mapping - bne- cr0,mapSrchFull32e ; handle mapping larger than one page -mapSrchFull32b: - la r8,mpList0+4(r3) ; point to skip list vector in this mapping - mr r9,r3 ; current becomes previous - lwzx r3,r7,r8 ; get ptr to next mapping in current list - addi r10,r4,0x1000 ; Get the lowest VA we can get next -mapSrchFull32c: - mr. r3,r3 ; next becomes current - bne+ mapSrchFull32a ; was another, so loop -mapSrchFull32d: - stwx r9,r7,r12 ; save prev ptr in per-proc vector - subic. 
r7,r7,8 ; move on to next list offset - lwzx r3,r7,r8 ; get next mapping on lower list (if any) - bge+ mapSrchFull32c ; loop to try next list - - ; mapping not found, return 0 and next-key - - li r3,0 ; return null - bt- bFullFound,mapSkipListPanic ; panic if it was on an earlier list - lwz r4,mpList0+4(r9) ; get ptr to next mapping - b mapSrch32Exit - - ; Block mapping or nested pmap, and key > base. We must compute the va of - ; the end of the block to see if our key fits within it. - -mapSrchFull32e: - add r4,r4,r0 ; r4 <- last page in this mapping - cmplw r5,r4 ; does this mapping cover our page? - bgt mapSrchFull32b ; no, try next mapping - - - ; found the mapping - ; r2 = count of nodes visited - ; r3 = the mapping - ; r6 = pmap ptr - ; r7 = current skip list number * 8 - ; r9 = prev ptr, ie highest mapping that comes before search target, or 0 - ; r10 = prev mappings va - ; r12 = ptr to the skipListPrev vector in the per-proc - -mapSrchFull32Found: ; WARNING: can drop down to here - cmpwi r7,0 ; are we in the last skip-list? - crset bFullFound ; remember that we found the mapping - bne mapSrchFull32d ; mapSearchFull must search all lists to get prev ptrs - lwz r4,mpList0+4(r3) ; get ptr to next mapping - stwx r9,r7,r12 ; save prev ptr in last list - lwz r7,mpFlags(r3) ; Get mpFlags for our caller - b mapSrch32Exit - - -/* - * ********************* - * * m a p I n s e r t * - * ********************* - * - * Insert a mapping into pmap skip-lists. The caller has already called mapSearchFull to - * determine that this mapping does not overlap other mappings in the pmap. As a side effect - * of calling mapSearchFull, the per-proc skipListPrev array is set up with a vector of the - * previous ptrs for each skip list. 
When called: - * the pmap is locked (exclusive) - * translation is off, interrupts masked - * 64-bit mode is enabled (if on a 64-bit machine) - * mapSearchFull has just been called for this mappings key - * cr6 is loaded with the corresponding feature flags (in particular, pf64Bit) - * r3 = pmap ptr - * r4 = mapping ptr - * - * There is no return value. Except for cr6 (which is global), we trash nonvolatile regs. - */ - - .align 5 - .globl EXT(mapInsert) -LEXT(mapInsert) - lwz r8,mpFlags(r4) ; get this mappings flags - lbz r7,pmapCurLists(r3) ; get current max# lists any mapping is on - la r10,pmapSkipLists+4(r3) ; r10 <-- base of pmap list headers, assuming 32-bit machine - la r11,mpList0+4(r4) ; r11 <-- base of this mappings list vector - mfsprg r12,0 ; get ptr to our per-proc - andi. r9,r8,mpLists ; get #lists this mapping is on (1<=n<=27) - la r12,skipListPrev+4(r12) ; r12 <-- base of prev ptr vector - sub. r6,r9,r7 ; is this mapping on more lists than any other? - slwi r8,r9,3 ; get #lists * 8 - subi r8,r8,8 ; get offset to topmost (last) list in use - bf-- pf64Bitb,mapIns32 ; handle 32-bit processor - subi r10,r10,4 ; we use all 8 bytes of the ptr fields - subi r11,r11,4 - subi r12,r12,4 - ble++ mapIns64a ; not new max #lists - - ; 64-bit processor: We must increase pmapCurLists. Since mapSearchFull() only - ; sets up the first pmapCurLists prev ptrs, we must initialize the new ones to - ; point to the pmap. While we are at it, we verify that the unused list hdrs in - ; the pmap are 0. - - cmpwi r9,kSkipListMaxLists ; in range? - stb r9,pmapCurLists(r3) ; remember new max - mtctr r6 ; set up count of new lists - mr r5,r8 ; copy offset to last list - subi r0,r10,mpList0 ; r0 <-- fake mapping ptr (to pmap) for null prev ptrs - bgt-- mapSkipListPanic ; choke if this mapping is on too many lists -mapIns64NewList: - ldx r6,r5,r10 ; get pmap list head - stdx r0,r5,r12 ; initialize prev ptr - subi r5,r5,8 ; get next list offset - cmpdi r6,0 ; was list hdr null? 
- bdnzt cr0_eq,mapIns64NewList ; loop if more lists to initialize and list hdr was 0 - bne-- mapSkipListPanic ; die if pmap list hdr was not null - b mapIns64a - - ; 64-bit processor: loop over each list this mapping is on - ; r4 = mapping - ; r8 = next list offset - ; r10 = ptr to base of pmap list header vector - ; r11 = ptr to base of new mappings list vector - ; r12 = ptr to base of prev ptr vector in per-proc - - .align 5 -mapIns64a: - ldx r5,r8,r12 ; get prev ptr from per-proc vector - cmpwi cr1,r8,0 ; more to go? - la r7,mpList0(r5) ; get base of prev mappings list vector - ldx r9,r8,r7 ; *** - stdx r4,r8,r7 ; * insert new mapping in middle of this list - stdx r9,r8,r11 ; *** - subi r8,r8,8 ; get next list offset - bne++ cr1,mapIns64a ; more lists to go - blr ; done - - ; Handle 32-bit processor. First, increase pmapCurLists if necessary; cr0 is bgt - ; iff the new mapping has more lists. Since mapSearchFull() only sets up the first - ; pmapCurLists prev ptrs, we must initialize any new ones to point to the pmap. - ; While we are at it, we verify that the unused list hdrs in the pmap are 0. - -mapIns32: - ble+ mapIns32a ; skip if new mapping does not use extra lists - cmpwi r9,kSkipListMaxLists ; in range? - stb r9,pmapCurLists(r3) ; remember new max - mtctr r6 ; set up count of new lists - mr r5,r8 ; copy offset to last list - subi r0,r10,mpList0+4 ; r0 <-- fake mapping ptr (to pmap) for null prev ptrs - bgt- mapSkipListPanic ; choke if this mapping is on too many lists -mapIns32NewList: - lwzx r6,r5,r10 ; get pmap list head - stwx r0,r5,r12 ; initialize prev ptr - subi r5,r5,8 ; get next list offset - cmpwi r6,0 ; was list hdr null? 
- bdnzt cr0_eq,mapIns32NewList ; loop if more lists to initialize and list hdr was 0 - bne- mapSkipListPanic ; die if pmap list hdr was not null - b mapIns32a - - ; 32-bit processor: loop over each list this mapping is on - ; r4 = mapping - ; r8 = next list offset - ; r10 = ptr to base of pmap list header vector - ; r11 = ptr to base of new mappings list vector - ; r12 = ptr to base of prev ptr vector - - .align 4 -mapIns32a: - lwzx r5,r8,r12 ; get prev ptr from per-proc vector - cmpwi cr1,r8,0 ; more to go? - la r7,mpList0+4(r5) ; get base of prev mappings list vector - lwzx r9,r8,r7 ; *** - stwx r4,r8,r7 ; * insert new mapping in middle of this list - stwx r9,r8,r11 ; *** - subi r8,r8,8 ; get next list offset - bne+ cr1,mapIns32a ; more lists to go - blr ; done - - -/* - * ********************* - * * m a p R e m o v e * - * ********************* - * - * Remove a mapping from pmap skip-lists. The caller has already called mapSearchFull to - * find the mapping, which sets up the skipListPrev array with a vector of the previous - * ptrs for each skip list. When called: - * the pmap is locked (exclusive) - * translation is off, interrupts masked - * 64-bit mode is enabled (if on a 64-bit machine) - * mapSearchFull has just been called for this mappings key - * cr6 is loaded with the corresponding feature flags (in particular, pf64Bit) - * r3 = pmap ptr - * r4 = mapping ptr - * - * There is no return value. Except for cr6 (which is global), we trash nonvolatile regs. - */ - - .align 5 - .globl EXT(mapRemove) -LEXT(mapRemove) - lwz r8,mpFlags(r4) ; get this mappings flags - lbz r10,pmapCurLists(r3) ; get current #lists in use - la r11,mpList0+4(r4) ; r11 <-- base of this mappings list vector - mfsprg r12,0 ; get ptr to our per-proc - andi. 
r9,r8,mpLists ; get #lists this mapping is on (1<=n<=27) - slwi r8,r9,3 ; get #lists * 8 - cmpw cr5,r9,r10 ; compare mpLists to pmapCurLists - la r12,skipListPrev+4(r12) ; r12 <-- base of prev ptr vector - bgt-- cr5,mapSkipListPanic ; die if mpLists > pmapCurLists - subi r8,r8,8 ; get offset to topmast (last) list this mapping is in - bf-- pf64Bitb,mapRem32a ; skip if 32-bit processor - subi r11,r11,4 ; we use all 64 bits of list links on 64-bit machines - subi r12,r12,4 - b mapRem64a - - ; 64-bit processor: loop over each list this mapping is on - ; r3 = pmap - ; r4 = mapping - ; r8 = offset to next list - ; r10 = pmapCurLists - ; r11 = ptr to base of mapping list vector - ; r12 = ptr to base of prev ptr vector in per-proc - ; cr5 = beq if (mpLists == pmapCurLists) - - .align 5 -mapRem64a: - ldx r5,r8,r12 ; get prev ptr from per-proc vector - ldx r9,r8,r11 ; get next ptr from mapping - cmpwi cr1,r8,0 ; more to go? - la r7,mpList0(r5) ; get base of prev mappings list vector - stdx r9,r8,r7 ; point to next from prev - subi r8,r8,8 ; get next list offset - bne++ cr1,mapRem64a ; loop if another list to unlink from - - ; Did we reduce #lists in use by removing last mapping in last list? - - bnelr++ cr5 ; if (mpLists!=pmapCurLists) cannot have removed last map - la r5,pmapSkipLists(r3) ; point to vector of list hdrs -mapRem64b: - subic. r10,r10,1 ; get base-0 list# - slwi r8,r10,3 ; get offset to last list - ldx r0,r8,r5 ; get last list ptr - cmpdi cr1,r0,0 ; null? 
- bnelr cr1 ; not null, so we are done - stb r10,pmapCurLists(r3) ; was null, so decrement pmapCurLists - bgt mapRem64b ; loop to see if more than one list was emptied - blr - - - ; 32-bit processor: loop over each list this mapping is on - ; r3 = pmap - ; r4 = mapping - ; r8 = offset to next list - ; r10 = pmapCurLists - ; r11 = ptr to base of mapping list vector - ; r12 = ptr to base of prev ptr vector in per-proc - ; cr5 = beq if (mpLists == pmapCurLists) - - .align 4 -mapRem32a: - lwzx r5,r8,r12 ; get prev ptr from per-proc vector - lwzx r9,r8,r11 ; get next ptr from mapping - cmpwi cr1,r8,0 ; more to go? - la r7,mpList0+4(r5) ; get base of prev mappings list vector - stwx r9,r8,r7 ; point to next from prev - subi r8,r8,8 ; get next list offset - bne+ cr1,mapRem32a ; loop if another list to unlink from - - ; Did we reduce #lists in use by removing last mapping in last list? - - bnelr+ cr5 ; if (mpLists!=pmapCurLists) cannot have removed last map - la r5,pmapSkipLists+4(r3) ; point to vector of list hdrs -mapRem32b: - subic. r10,r10,1 ; get base-0 list# - slwi r8,r10,3 ; get offset to last list - lwzx r0,r8,r5 ; get last list ptr - cmpwi cr1,r0,0 ; null? - bnelr cr1 ; not null, so we are done - stb r10,pmapCurLists(r3) ; was null, so decrement pmapCurLists - bgt mapRem32b ; loop to see if more than one list was emptied - blr - - -/* - * ************************* - * * m a p S e t L i s t s * - * ************************* - * - * Called to decide how many skip-lists the next mapping will be on. For each pmap, - * we maintain a psuedo-random sequence based on a linear feedback shift register. The - * next number is generated by rotating the old value left by 1 and XORing with a - * polynomial (actually 4 8-bit polynomials concatanated) and adding 1. - * The simple (unclamped) number of lists a mapping is on is the number of trailing 0s - * in the pseudo-random sequence, shifted by the (log2-1) of the fanout F, plus one. 
- * This seems to give us a near perfect distribution, in the sense that about F times more nodes - * are allocated on n lists, as are on (n+1) lists. - * - * At one point we used a simple counter to assign lists. While this gave perfect - * distribution, there were certain access pattern that would drive a worst case - * distribution (e.g., insert low, then high, then low, etc.). Unfortunately, - * these patterns were not too uncommon. We changed to a less-than-perfect assignment, - * but one that works consistently across all known access patterns. - * - * Also, we modify the "simple" trailing-0-based list count, to account for an important - * observation: because VM does a lot of removing and restoring of mappings in the process of - * doing copy-on-write etc, it is common to have the pmap's "random number" (ie, the - * count of created mappings) be much larger than the number of mappings currently in the - * pmap. This means the simple list count will often be larger than justified by the number of - * mappings in the pmap. To avoid this common situation, we clamp the list count to be no more - * than ceil(logBaseF(pmapResidentCnt)). - * - * Finally, we also clamp the list count to kSkipListMaxLists. - * - * We are passed the pmap ptr in r3. Called with translation on, interrupts enabled, - * and in 32-bit mode. - */ - .align 5 - .globl EXT(mapSetLists) -LEXT(mapSetLists) - lwz r5,pmapRandNum(r3) ; get the per-pmap counter of mapping creates - lwz r4,pmapResidentCnt(r3) ; get number of mappings in this pmap - lis r11,hi16(0xA7CBF5B9) ; Get polynomial (I just made this up...) - li r0,-1 ; get a mask of 1s - ori r11,r11,lo16(0xA7CBF5B9) ; Get polynomial (I just made this up...) 
- rlwinm r5,r5,1,0,31 ; Rotate - cntlzw r7,r4 ; get magnitude of pmapResidentCnt - xor r5,r5,r11 ; Munge with poly - srw r7,r0,r7 ; r7 <- mask for magnitude of pmapResidentCnt - addi r6,r5,1 ; increment pmapRandNum non-atomically - andc r8,r5,r6 ; get a mask for trailing zeroes in pmapRandNum - stw r6,pmapRandNum(r3) ; update "random number" - and r8,r8,r7 ; clamp trailing 0s to magnitude of pmapResidentCnt - rlwinm r8,r8,0,32-(kSkipListMaxLists*(kSkipListFanoutShift+1))+1,31 ; clamp to kSkipListMaxLists - cntlzw r9,r8 ; count leading 0s in the mask - subfic r10,r9,32 ; r10 <- trailing zero count - srwi r11,r10,kSkipListFanoutShift ; shift by 1 if fanout is 4, 2 if 8, etc - addi r3,r11,1 ; every mapping is on at least one list - blr - - -/* - * ************************************* - * * m a p S k i p L i s t V e r i f y * - * ************************************* - * - * This does a fairly thorough sweep through a pmaps skip-list data structure, doing - * consistency checks. It is typically called (from hw_exceptions.s) from debug or - * instrumented builds. It is probably not a good idea to call this in production builds, - * as it must run with exceptions disabled and can take a long time to verify a big pmap. - * It runs in O(n*ln(n)). - * - * Called on a bl, with the pmap ptr in r20. We assume the pmap is locked (shared) and - * that EE and DR are off. We check all 64 bits of ptrs even on 32-bit machines. - * We use r20-r31, cr0, cr1, and cr7. If we return, no inconsistencies were found. - * - * You will notice we make little attempt to schedule the code; clarity is deemed more - * important than speed. - */ - - - /* - * mapSkipListVerifyC is a version that is callable from C. - * This should be called only from the debugger, IT DOES NOT LOCK THE PMAP!!!! 
- */ - - .globl EXT(mapSkipListVerifyC) -LEXT(mapSkipListVerifyC) - - stwu r1,-(FM_ALIGN((31-13+1)*4)+FM_SIZE)(r1) ; Make some space on the stack - mflr r0 ; Save the link register - stmw r13,FM_ARG0(r1) ; Save all registers - stw r0,(FM_ALIGN((31-13+1)*4)+FM_SIZE+FM_LR_SAVE)(r1) ; Save the return - - lwz r15,pmapvr(r3) ; Get the V to R translation - lwz r16,pmapvr+4(r3) ; Get the V to R translation - mr r19,r4 ; Save register dump area - - bl EXT(mapSetUp) ; Get set up - - mr r17,r11 - xor r20,r3,r16 ; Translate 32-bit portion - bf-- pf64Bitb,mslvc32a ; Skip if 32-bit... - - rldimi r20,r15,32,0 ; Shift the fixed upper part of the physical over and cram in top - -mslvc32a: lis r18,hi16(EXT(DebugWork)) - ori r18,r18,lo16(EXT(DebugWork)) - li r0,0x4262 - stw r0,4(r18) ; Make sure the test knows to run - - bl EXT(mapSkipListVerify) ; Run the test - - li r0,0 - stw r0,4(r18) ; Remove explicit call flag - - bt++ pf64Bitb,mslvc64a ; This is 64-bit... - - mtmsr r17 ; Restore enables/translation/etc. 
- isync - - li r0,0 - stw r0,0x000+0(r19) - stw r0,0x000+4(r19) - stw r0,0x008+0(r19) - stw r1,0x008+4(r19) - stw r0,0x010+0(r19) - stw r2,0x010+4(r19) - stw r0,0x018+0(r19) - stw r3,0x018+4(r19) - stw r0,0x020+0(r19) - stw r4,0x020+4(r19) - stw r0,0x028+0(r19) - stw r5,0x028+4(r19) - stw r0,0x030+0(r19) - stw r6,0x030+4(r19) - stw r0,0x038+0(r19) - stw r7,0x038+4(r19) - stw r0,0x040+0(r19) - stw r8,0x040+4(r19) - stw r0,0x048+0(r19) - stw r9,0x048+4(r19) - stw r0,0x050+0(r19) - stw r10,0x050+4(r19) - stw r0,0x058+0(r19) - stw r11,0x058+4(r19) - stw r0,0x060+0(r19) - stw r12,0x060+4(r19) - stw r0,0x068+0(r19) - stw r13,0x068+4(r19) - stw r0,0x070+0(r19) - stw r14,0x070+4(r19) - stw r0,0x078+0(r19) - stw r15,0x078+4(r19) - stw r0,0x080+0(r19) - stw r16,0x080+4(r19) - stw r0,0x088+0(r19) - stw r17,0x088+4(r19) - stw r0,0x090+0(r19) - stw r18,0x090+4(r19) - stw r0,0x098+0(r19) - stw r19,0x098+4(r19) - stw r0,0x0A0+0(r19) - stw r20,0x0A0+4(r19) - stw r0,0x0A8+0(r19) - stw r21,0x0A8+4(r19) - stw r0,0x0B0+0(r19) - stw r22,0x0B0+4(r19) - stw r0,0x0B8+0(r19) - stw r23,0x0B8+4(r19) - stw r0,0x0C0+0(r19) - stw r24,0x0C0+4(r19) - stw r0,0x0C8+0(r19) - stw r25,0x0C8+4(r19) - stw r0,0x0D0+0(r19) - stw r26,0x0D0+4(r19) - stw r0,0x0D8+0(r19) - stw r27,0x0D8+4(r19) - stw r0,0x0E0+0(r19) - stw r28,0x0E0+4(r19) - stw r0,0x0E8+0(r19) - stw r29,0x0E8+4(r19) - stw r0,0x0F0+0(r19) - stw r30,0x0F0+4(r19) - stw r0,0x0F8+0(r19) - stw r31,0x0F8+4(r19) - - b mslvcreturn ; Join common... - -mslvc64a: mtmsrd r17 ; Restore enables/translation/etc. 
- isync - - std r0,0x000(r19) - std r1,0x008(r19) - std r2,0x010(r19) - std r3,0x018(r19) - std r4,0x020(r19) - std r5,0x028(r19) - std r6,0x030(r19) - std r7,0x038(r19) - std r8,0x040(r19) - std r9,0x048(r19) - std r10,0x050(r19) - std r11,0x058(r19) - std r12,0x060(r19) - std r13,0x068(r19) - std r14,0x070(r19) - std r15,0x078(r19) - std r16,0x080(r19) - std r17,0x088(r19) - std r18,0x090(r19) - std r19,0x098(r19) - std r20,0x0A0(r19) - std r21,0x0A8(r19) - std r22,0x0B0(r19) - std r23,0x0B8(r19) - std r24,0x0C0(r19) - std r25,0x0C8(r19) - std r26,0x0D0(r19) - std r27,0x0D8(r19) - std r28,0x0E0(r19) - std r29,0x0E8(r19) - std r30,0x0F0(r19) - std r31,0x0F8(r19) - - -mslvcreturn: - lwz r0,(FM_ALIGN((31-13+1)*4)+FM_SIZE+FM_LR_SAVE)(r1) ; Get the return - lmw r13,FM_ARG0(r1) ; Get the registers - mtlr r0 ; Restore the return - lwz r1,0(r1) ; Pop the stack - blr - - - .globl EXT(mapSkipListVerify) -LEXT(mapSkipListVerify) - mflr r31 ; save LR so we can bl to mapVerifyDie - - ; If we have already found an inconsistency and died, don not do so again, to - ; avoid a loop. - - lis r27,hi16(EXT(DebugWork)) - ori r27,r27,lo16(EXT(DebugWork)) - lwz r0,4(r27) ; Get the explicit entry flag - lwz r27,0(r27) ; Get lockout - cmplwi r0,0x4262 ; Should we run anyway? - beq-- mslvAnyway ; Yes... - cmpwi r27,0 ; have we already found an error? - bnelr-- ; yes, just return wo checking again - -mslvAnyway: - ; Not recursive call, so initialize. - - mfsprg r23,2 ; get the feature flags - mtcrf 0x02,r23 ; put pf64Bit where we can test it - lbz r26,pmapCurLists(r20) ; get #lists that are in use - lwz r21,pmapResidentCnt(r20); get #mappings in this pmap - cmpwi r26,kSkipListMaxLists ; in range? - bgtl-- mapVerifyDie ; pmapCurLists is too big - - ; To prevent infinite loops, set limit of (pmapCurLists*pmapResidentCnt) iterations. - ; Since we walk each list this is the max number of mappings we could visit. - - li r23,0 ; initialize count -mapVer0: - subic. 
r26,r26,1 ; loop pmapCurLists times (but at least once) - add r23,r23,r21 ; compute (pmapCurLists*pmapResidentCnt) - bgt mapVer0 ; this will be a 64-bit qty on 64-bit machines - - li r22,kSkipListMaxLists ; initialize list# - bf-- pf64Bitb,mapVer32 ; go handle a 32-bit processor - - ; 64-bit machine. - ; - ; Loop over each list, counting mappings in each. We first check whether or not - ; the list is empty (ie, if the pmapSlipLists ptr is null.) All lists above - ; pmapCurLists should be empty, and no list at or below pmapCurLists should be. - ; r20 = pmap ptr - ; r21 = decrementing counter of mappings in this pmap - ; r22 = next list# (1...kSkipListMaxLists) - ; r23 = decrementing counter for infinite loop check - -mapVer64: - slwi r25,r22,3 ; get offset to next skiplist - la r26,pmapSkipLists(r20) ; get ptr to base of skiplist vector - subi r25,r25,8 - ldx r26,r25,r26 ; get 1st mapping on this list, if any - lbz r28,pmapCurLists(r20) ; get #lists in use - cmpdi cr6,r26,0 ; set cr6_eq if this list is null ("null") - cmpw cr7,r22,r28 ; set cr7_gt if this list is > pmapCurLists ("high") - crxor cr0_eq,cr6_eq,cr7_gt ; cr0_eq <-- (null & !high) | (!null & high) - beql-- mapVerifyDie ; die if this list is null when it should not be, etc - b mapVer64g - - ; Loop over each node in the list. - ; r20 = pmap ptr - ; r21 = decrementing counter of mappings in this pmap - ; r22 = this list# (1...kSkipListMaxLists) - ; r23 = decrementing counter for infinite loop check - ; r25 = offset to this skiplist (ie, ((r22<<3)-8)) - ; r26 = mapping - -mapVer64a: - lwz r29,mpFlags(r26) ; get bits for this mapping - ld r28,mpVAddr(r26) ; get key - subic. r23,r23,1 ; check for loops - bltl-- mapVerifyDie ; we have visited > (pmapCurLists*pmapResidentCnt) nodes - andi. r30,r26,mpBasicSize-1 ; test address for alignment - bnel-- mapVerifyDie ; not aligned - andi. r27,r29,mpLists ; get #lists this mapping is supposed to be on - cmpw cr1,r27,r22 ; is it supposed to be on this list? 
- bltl-- cr1,mapVerifyDie ; mappings mpLists is too low - cmpwi r27,kSkipListMaxLists ; too big? - bgtl-- mapVerifyDie ; mappings mpLists > max - rldicr r28,r28,0,51 ; clear low 12 bits of va - bne++ cr1,mapVer64f ; jump if this is not highest list for this node - - ; This is the "highest" (last) list this mapping is on. - ; Do some additional checks (so we only do them once per mapping.) - ; First, if a block mapping or nested pmap, compute block end. - - lhz r27,mpBSize(r26) ; get #pages or #segments - rlwinm r29,r29,mpBSub+1,31,31 ; Rotate to get 0 if 4K bsu or 1 if 32MB bsu - addi r27,r27,1 ; units of nested pmap are (#segs-1) - ori r29,r29,0x3216 ; OR in 0x00003216 (0x3200 and a base rotate of 22) - rlwnm r29,r29,r29,27,31 ; Rotate to get 12 or 25 - subi r21,r21,1 ; count mappings in this pmap - sld r29,r27,r29 ; Get the length in bytes - subi r29,r29,4096 ; get offset to last byte in nested pmap - - ; Here with r29 = size of block - 4k, or 0 if mapping is a scalar page. - - add r24,r28,r29 ; r24 <- address of last valid page in this mapping - la r28,mpList0(r26) ; get base of this mappings vector - lwz r27,mpFlags(r26) ; Get the number of lists - andi. r27,r27,mpLists ; get #lists this mapping is on (1<=n<=27) - cmplwi r27,mpBasicLists ; Into bigger mapping? - li r27,mpBasicLists*8-8 ; Assume normal - ble+ mapVer64c ; It is... - li r27,kSkipListMaxLists*8-8 ; initialize list offset for inner loop - - ; Inner loop over each list link in this mappingss mpList vector. - ; r24 = address of last valid page in this mapping - ; r27 = offset for next list in inner loop - ; r28 = base of this mappings list links - -mapVer64c: - cmpw cr1,r27,r25 ; higher, lower, or same? - ldx r29,r27,r28 ; get link to next mapping at this level - mr. r29,r29 ; null? 
- beq mapVer64d ; link null, which is always OK - bgtl-- cr1,mapVerifyDie ; a mapping has a non-null list higher than its mpLists - ld r30,mpVAddr(r29) ; get next mappings va - rldicr r30,r30,0,51 ; zero low 12 bits - cmpld r30,r24 ; compare next key with ours - blel-- mapVerifyDie ; a next node has key <= to ours -mapVer64d: - subic. r27,r27,8 ; move on to next list - bne++ mapVer64c ; loop if more to go - - ; Next node on current list, or next list if current done, or return if no more lists. - -mapVer64f: - la r28,mpList0(r26) ; get base of this mappings vector - ldx r26,r25,r28 ; get next mapping on this list -mapVer64g: - mr. r26,r26 ; is there one? - bne++ mapVer64a ; yes, handle - subic. r22,r22,1 ; is there another list? - bgt++ mapVer64 ; loop if so - - cmpwi r21,0 ; did we find all the mappings in the pmap? - bnel-- mapVerifyDie ; no - mtlr r31 ; restore return address - li r3,0 - blr - - - ; Handle 32-bit machine. - -mapVer32: - lwz r24,mpFlags(r20) ; Get number of lists - la r30,pmapSkipLists(r20) ; first, check the pmap list hdrs - andi. r24,r24,mpLists ; Clean the number of lists - bl mapVerUpperWordsAre0 ; are the upper words of each list all 0? - - ; Loop over each list, counting mappings in each. We first check whether or not - ; the list is empty. All lists above pmapCurLists should be empty, and no list - ; at or below pmapCurLists should be. 
- ; - ; r20 = pmap ptr - ; r21 = decrementing counter of mappings in this pmap - ; r22 = next list# (1...kSkipListMaxLists) - ; r23 = decrementing counter for infinite loop check - -mapVer32NextList: - lbz r28,pmapCurLists(r20) ; get #lists in use - slwi r25,r22,3 ; get offset to next skiplist - la r26,pmapSkipLists+4(r20) ; get ptr to base of skiplist vector - subi r25,r25,8 - lwzx r26,r25,r26 ; get the 1st mapping on this list, or 0 - cmpw cr7,r22,r28 ; set cr7_gt if this list is > pmapCurLists ("high") - cmpwi cr6,r26,0 ; set cr6_eq if this list is null ("null") - crxor cr0_eq,cr6_eq,cr7_gt ; cr0_eq <-- (null & !high) | (!null & high) - beql- mapVerifyDie ; die if this list is null when it should not be, etc - b mapVer32g - - ; Loop over each node in the list. - ; r20 = pmap ptr - ; r21 = decrementing counter of mappings in this pmap - ; r22 = this list# (1...kSkipListMaxLists) - ; r23 = decrementing counter for infinite loop check - ; r25 = offset to this skiplist (ie, ((r22<<3)-8)) - ; r26 = mapping - -mapVer32a: - lwz r29,mpFlags(r26) ; get bits for this mapping - andi. r30,r26,mpBasicSize-1 ; test address for alignment - lwz r24,mpVAddr+0(r26) ; get upper word of key - bnel- mapVerifyDie ; mapping address not 64-byte aligned - lwz r28,mpVAddr+4(r26) ; get lower word of key - subic. r23,r23,1 ; check for loops - bltl- mapVerifyDie ; we have visited > (pmapCurLists*pmapResidentCnt) nodes - cmpwi r24,0 ; upper word of key (ie, va) should be 0 - bnel- mapVerifyDie ; was not - andi. r27,r29,mpLists ; get #lists this mapping is supposed to be on - cmpw cr1,r27,r22 ; is it supposed to be on this list? - bltl- cr1,mapVerifyDie ; mappings mpLists is too low - cmpwi r27,kSkipListMaxLists ; too big? - bgtl- mapVerifyDie ; mappings mpLists > max - rlwinm r28,r28,0,0,19 ; clear low 12 bits of va - bne+ cr1,mapVer32f ; jump if this is not highest list for this node - - ; This is the "highest" (last) list this mapping is on. 
- ; Do some additional checks (so we only do them once per mapping.) - ; First, make sure upper words of the mpList vector are 0. - - lhz r27,mpBSize(r26) ; get #blocks - rlwinm r29,r29,mpBSub+1,31,31 ; Rotate to get 0 if 4K bsu or 1 if 32MB bsu - addi r27,r27,1 ; units of nested pmap are (#segs-1) - ori r29,r29,0x3216 ; OR in 0x00003216 (0x3200 and a base rotate of 22) - rlwnm r29,r29,r29,27,31 ; Rotate to get 12 or 25 - subi r21,r21,1 ; count mappings in this pmap - slw r29,r27,r29 ; Get the length in bytes - subi r29,r29,4096 ; get offset to last byte in nested pmap - - lwz r24,mpFlags(r26) ; Get number of lists - la r30,mpList0(r26) ; point to base of skiplist vector - andi. r24,r24,mpLists ; Clean the number of lists - bl mapVerUpperWordsAre0 ; make sure upper words are all 0 (uses r24 and r27) - - ; Here with r29 = size of block - 4k, or 0 if mapping is a scalar page. - - add r24,r28,r29 ; r24 <- address of last valid page in this mapping - la r28,mpList0+4(r26) ; get base of this mappings vector - lwz r27,mpFlags(r26) ; Get the number of lists - andi. r27,r27,mpLists ; get #lists this mapping is on (1<=n<=27) - cmplwi r27,mpBasicLists ; Into bigger mapping? - li r27,mpBasicLists*8-8 ; Assume normal - ble+ mapVer32c ; It is... - li r27,kSkipListMaxLists*8-8 ; initialize list offset for inner loop - - ; Inner loop over each list in this mappings mpList vector. - ; r24 = address of last valid page in this mapping - ; r27 = offset for next list in inner loop - ; r28 = base of this mappings list links - -mapVer32c: - cmpw cr1,r27,r25 ; higher, lower, or same? - lwzx r29,r27,r28 ; get link to next mapping at this level - mr. r29,r29 ; null? 
- beq mapVer32d ; link null, which is always OK - - - bgtl- cr1,mapVerifyDie ; a mapping has a non-null list higher than its mpLists - lwz r30,mpVAddr+4(r29) ; get next mappings va - rlwinm r30,r30,0,0,19 ; zero low 12 bits - cmplw r30,r24 ; compare next key with ours - blel- mapVerifyDie ; a next node has key <= to ours -mapVer32d: - subic. r27,r27,8 ; move on to next list - bne+ mapVer32c ; loop if more to go - - ; Next node on current list, or next list if current done, or return if no more lists. - -mapVer32f: - la r28,mpList0+4(r26) ; get base of this mappings vector again - lwzx r26,r25,r28 ; get next mapping on this list -mapVer32g: - mr. r26,r26 ; is there one? - bne+ mapVer32a ; yes, handle - subic. r22,r22,1 ; is there another list? - bgt+ mapVer32NextList ; loop if so - - cmpwi r21,0 ; did we find all the mappings in the pmap? - bnel- mapVerifyDie ; no - mtlr r31 ; restore return address - li r3,0 - blr - - ; Subroutine to verify that the upper words of a vector of kSkipListMaxLists - ; doublewords are 0. - ; r30 = ptr to base of vector - ; Uses r24 and r27. - -mapVerUpperWordsAre0: - cmplwi r24,mpBasicLists ; Do we have more than basic? - li r24,mpBasicLists*8 ; Assume basic - ble++ mapVerUpper1 ; We have the basic size - li r24,kSkipListMaxLists*8 ; Use max size - -mapVerUpper1: - subic. r24,r24,8 ; get offset to next doubleword - lwzx r27,r24,r30 ; get upper word - cmpwi cr1,r27,0 ; 0 ? - bne- cr1,mapVerifyDie ; die if not, passing callers LR - bgt+ mapVerUpper1 ; loop if more to go - blr - - ; bl here if mapSkipListVerify detects an inconsistency. - -mapVerifyDie: - mflr r3 - mtlr r31 ; Restore return - lis r31,hi16(EXT(DebugWork)) - ori r31,r31,lo16(EXT(DebugWork)) - lwz r0,4(r31) ; Get the explicit entry flag - cmplwi r0,0x4262 ; Should we run anyway? - beqlr-- ; Explicit call, return... 
- - li r0,1 - stw r0,0(r31) ; Lock out further calls - BREAKPOINT_TRAP ; hopefully, enter debugger - b .-4 - - -/* - * Panic (choke, to be exact) because of messed up skip lists. The LR points back - * to the original caller of the skip-list function. - */ - -mapSkipListPanic: ; skip-lists are screwed up - lis r0,hi16(Choke) - ori r0,r0,lo16(Choke) - li r3,failSkipLists ; get choke code - sc ; choke - b .-4 - - diff --git a/osfmk/ppc/spec_reg.h b/osfmk/ppc/spec_reg.h deleted file mode 100644 index 46a13bdf6..000000000 --- a/osfmk/ppc/spec_reg.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -#ifndef _PPC_SPEC_REG_H_ -#define _PPC_SPEC_REG_H_ - -/* Defines for PVRs */ -#define PROCESSOR_VERSION_750 8 /* ? */ -#define PROCESSOR_VERSION_750FX 0x7000 /* ? */ -#define PROCESSOR_VERSION_7400 12 /* ? */ -#define PROCESSOR_VERSION_7410 0x800C /* ? */ -#define PROCESSOR_VERSION_7450 0x8000 /* ? */ -#define PROCESSOR_VERSION_7455 0x8001 /* ? */ -#define PROCESSOR_VERSION_7457 0x8002 /* ? */ -#define PROCESSOR_VERSION_7447A 0x8003 /* ? */ -#define PROCESSOR_VERSION_970 0x0039 /* ? */ -#define PROCESSOR_VERSION_970FX 0x003C /* ? */ - -#endif /* _PPC_SPEC_REG_H_ */ diff --git a/osfmk/ppc/start.s b/osfmk/ppc/start.s deleted file mode 100644 index 34ebea8ac..000000000 --- a/osfmk/ppc/start.s +++ /dev/null @@ -1,1283 +0,0 @@ -/* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -#define __APPLE_API_PRIVATE - -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -; Definitions of the processor type table format, which drives this code. -; The table ("processor_types") is assembled in at the end of this file. - -#define ptFilter 0 -#define ptVersion 4 -#define ptRevision 6 -#define ptFeatures 8 -#define ptCPUCap 12 -#define ptPwrModes 16 -#define ptPatch 20 -#define ptInitRout 24 -#define ptRptdProc 28 -#define ptLineSize 32 -#define ptl1iSize 36 -#define ptl1dSize 40 -#define ptPTEG 44 -#define ptMaxVAddr 48 -#define ptMaxPAddr 52 -#define ptSize 56 - - -; We use cr2 for flags: - -#define bootCPU 10 -#define firstInit 9 -#define firstBoot 8 - -/* - * Interrupt and bootup stack for initial processor - */ - - .file "start.s" - -/* - * All CPUs start here. - * - * This code is called from SecondaryLoader - * - * Various arguments are passed via a table: - * R3 = pointer to other startup parameters - */ - .text - -ENTRY(resetPOR,TAG_NO_FRAME_USED) - - li r12,0 ; Get a 0 - stw r12,0xF0(0) ; Make sure the special flag is clear - mtmsrd r12 ; Make sure we are in 32-bit mode - isync ; Really make sure - lwz r3,0xF4(0) ; Get the boot_args pointer - b startJoin ; Join up... 
- - -ENTRY(_start_cpu,TAG_NO_FRAME_USED) - crclr bootCPU ; Set non-boot processor - crclr firstInit ; Set not first time init - lwz r30,ppe_paddr(r3) ; Set current per_proc - lwz r28,ppe_paddr+4(r3) ; Set current per_proc - rlwinm r30,r30,0,1,0 ; Copy low 32 bits to top 32 - rlwimi r30,r28,0,0,31 ; Insert low part of 64-bit address in bottom 32 bits - subi r29,r3,(ACT_PER_PROC-ppe_vaddr) ; Substract mact.PerProc offset - mr r3,r30 ; Set current per_proc - -; -; Note that we are just trying to get close. The real TB sync will take -; place later. The value we are loading is set in two places. For the -; main processor, it will be the TB at the last interrupt before we went -; to sleep. For the others, it will be the time just before the main -; processor woke us up. -; - - lwz r15,ruptStamp(r3) ; Get the timebase from the other processor - li r17,0 ; Clear this out - lwz r16,ruptStamp+4(r3) ; Get the timebase from the other processor - mtspr tbl,r17 ; Clear bottom so we do not tick - mtspr tbu,r15 ; Set top - mtspr tbl,r16 ; Then bottom again - b allstart - -ENTRY(_start,TAG_NO_FRAME_USED) - -startJoin: - mflr r2 ; Save the return address - lis r28,hi16(EXT(PerProcTable)) ; Set PerProcTable - lis r30,hi16(EXT(BootProcInfo)) ; Set current per_proc - ori r28,r28,lo16(EXT(PerProcTable)) ; Set PerProcTable - ori r30,r30,lo16(EXT(BootProcInfo)) ; Set current per_proc - stw r30,ppe_paddr+4(r28) ; Set per_proc_entry - stw r30,ppe_vaddr(r28) ; Set per_proc_entry - subi r29,r28,(ACT_PER_PROC-ppe_vaddr) ; Substract mact.PerProc offset - crset bootCPU ; Set boot processor - - lwz r17,pfAvailable(r30) ; Get the available bits - rlwinm. r0,r17,0,pfValidb,pfValidb ; Have we initialized the feature flags yet? - crmove firstInit,cr0_eq ; Set if we are doing first time init - bne allstart ; Yeah, we must be waking up from sleep... 
- -; -; Here is where we do any one time general low-level initialization - - lis r20,HIGH_ADDR(fwdisplock) ; Get address of the firmware display lock - li r19,0 ; Zorch a register - ori r20,r20,LOW_ADDR(fwdisplock) ; Get address of the firmware display lock - stw r19,0(r20) ; Make sure the lock is free - -allstart: - mr r31,r3 ; Save away arguments - - crand firstBoot,bootCPU,firstInit ; Indicate if we are on the initial first processor startup - - mtsprg 0,r30 ; Set per_proc paddr - mtsprg 1,r29 ; Set spr1 - - li r9,0 ; Clear out a register - mtsprg 2,r9 - mtsprg 3,r9 - - li r7,MSR_VM_OFF ; Get real mode MSR - mtmsr r7 ; Set the real mode SRR - isync - - lis r26,hi16(processor_types) ; Point to processor table - ori r26,r26,lo16(processor_types) ; Other half - mfpvr r10 ; Get the PVR - -nextPVR: lwz r28,ptFilter(r26) ; Get the filter - lwz r27,ptVersion(r26) ; Get the version and revision codes - and r28,r10,r28 ; Throw away dont care bits - cmplw r27,r28 ; Is this the right set? - beq donePVR ; We have the right one... - addi r26,r26,ptSize ; Point to the next type - b nextPVR ; Check it out... - -donePVR: lwz r20,ptInitRout(r26) ; Grab the special init routine - mtlr r20 ; Setup to call the init - - bf firstBoot,notFirst ; Not first boot, go... - -; -; The following code just does a general initialization of the features just -; after the initial first-time boot. This is not done after waking up or on -; any "secondary" processor. Just after the boot-processor init, we copy the -; features to any possible per_proc. -; -; We are just setting defaults. The specific initialization code will modify these -; if necessary. 
-; - lis r18,hi16(EXT(_cpu_capabilities)) ; Get the address of _cpu_capabilities - ori r18,r18,lo16(EXT(_cpu_capabilities)) - lwz r17,ptCPUCap(r26) ; Get the default cpu capabilities - stw r17, 0(r18) ; Save the default value in _cpu_capabilities - - lwz r17,ptFeatures(r26) ; Pick up the features - - lwz r18,ptRptdProc(r26) ; Get the reported processor - sth r18,pfrptdProc(r30) ; Set the reported processor - - lwz r13,ptPwrModes(r26) ; Get the supported power modes - stw r13,pfPowerModes(r30) ; Set the supported power modes - - lwz r13,ptLineSize(r26) ; Get the cache line size - sth r13,pflineSize(r30) ; Save it - lwz r13,ptl1iSize(r26) ; Get icache size - stw r13,pfl1iSize(r30) ; Save it - lwz r13,ptl1dSize(r26) ; Get dcache size - stw r13,pfl1dSize(r30) ; Save it - lwz r13,ptPTEG(r26) ; Get PTEG size address - stw r13,pfPTEG(r30) ; Save it - lwz r13,ptMaxVAddr(r26) ; Get max virtual address - stw r13,pfMaxVAddr(r30) ; Save it - lwz r13,ptMaxPAddr(r26) ; Get max physical address - stw r13,pfMaxPAddr(r30) ; Save it - - -; Go through the patch table, changing performance sensitive kernel routines based on the -; processor type or other things. - - lis r11,hi16(EXT(patch_table)) - ori r11,r11,lo16(EXT(patch_table)) - lwz r19,ptPatch(r26) ; Get ptPatch field -patch_loop: - lwz r16,patchType(r11) ; Load the patch type - lwz r15,patchValue(r11) ; Load the patch value - cmplwi cr1,r16,PATCH_FEATURE ; Is it a patch feature entry - cmplwi cr7,r16,PATCH_END_OF_TABLE ; end of table? - and. 
r14,r15,r19 ; Is it set in the patch feature - crandc cr0_eq,cr1_eq,cr0_eq ; Do we have a match - beq cr7,doOurInit ; end of table, Go do processor specific initialization - beq patch_apply ; proc feature matches, so patch memory - cmplwi cr1,r16,PATCH_PROCESSOR ; Is it a patch processor entry - cmplw cr0,r15,r18 ; Check matching processor - crand cr0_eq,cr1_eq,cr0_eq ; Do we have a match - bne patch_skip ; No, skip patch memory -patch_apply: - lwz r13,patchAddr(r11) ; Load the address to patch - lwz r14,patchData(r11) ; Load the patch data - stw r14,0(r13) ; Patch the location - dcbf 0,r13 ; Flush the old one - sync ; Make sure we see it all - icbi 0,r13 ; Flush the i-cache - isync ; Hang out - sync ; Hang out some more... -patch_skip: - addi r11,r11,peSize ; Point to the next patch entry - b patch_loop ; handle next - - -; Additional processors join here after skipping above code. - -notFirst: lwz r17,pfAvailable(r30) ; Get our features - -doOurInit: mr. r20,r20 ; See if initialization routine - crand firstBoot,bootCPU,firstInit ; Indicate if we are on the initial first processor startup - bnelrl ; Do the initialization - - ori r17,r17,lo16(pfValid) ; Set the valid bit - stw r17,pfAvailable(r30) ; Set the available features - - rlwinm. r0,r17,0,pf64Bitb,pf64Bitb ; Is this a 64-bit machine? - mtsprg 2,r17 ; Remember the feature flags - - bne++ start64 ; Skip following if 64-bit... 
- - mfspr r6,hid0 ; Get the HID0 - rlwinm r6,r6,0,sleep+1,doze-1 ; Remove any vestiges of sleep - mtspr hid0,r6 ; Set the insominac HID0 - isync - -; Clear the BAT registers - - li r9,0 ; Clear out a register - sync - isync - mtdbatu 0,r9 ; Invalidate maps - mtdbatl 0,r9 ; Invalidate maps - mtdbatu 1,r9 ; Invalidate maps - mtdbatl 1,r9 ; Invalidate maps - mtdbatu 2,r9 ; Invalidate maps - mtdbatl 2,r9 ; Invalidate maps - mtdbatu 3,r9 ; Invalidate maps - mtdbatl 3,r9 ; Invalidate maps - sync - isync - mtibatu 0,r9 ; Invalidate maps - mtibatl 0,r9 ; Invalidate maps - mtibatu 1,r9 ; Invalidate maps - mtibatl 1,r9 ; Invalidate maps - mtibatu 2,r9 ; Invalidate maps - mtibatl 2,r9 ; Invalidate maps - mtibatu 3,r9 ; Invalidate maps - mtibatl 3,r9 ; Invalidate maps - sync - isync - b startcommon ; Go join up the common start routine - -start64: lis r5,hi16(startcommon) ; Get top of address of continue point - mfspr r6,hid0 ; Get the HID0 - ori r5,r5,lo16(startcommon) ; Get low of address of continue point - lis r9,hi16(MASK(MSR_HV)|MASK(MSR_SF)) ; ? - lis r20,hi16(dozem|napm|sleepm) ; Get mask of power saving features - ori r20,r20,lo16(1) ; Disable the attn instruction - li r7,MSR_VM_OFF ; Get real mode MSR - sldi r9,r9,32 ; Slide into position - sldi r20,r20,32 ; Slide power stuff into position - or r9,r9,r7 ; Form initial MSR - andc r6,r6,r20 ; Remove any vestiges of sleep - isync - mtspr hid0,r6 ; Set the insominac HID0 - mfspr r6,hid0 ; Get it - mfspr r6,hid0 ; Get it - mfspr r6,hid0 ; Get it - mfspr r6,hid0 ; Get it - mfspr r6,hid0 ; Get it - mfspr r6,hid0 ; Get it - isync - mtsrr0 r5 ; Set the continue point - mtsrr1 r9 ; Set our normal disabled MSR - rfid ; Tally ho... - - .align 5 - -startcommon: - rlwinm. r0,r17,0,pfFloatb,pfFloatb ; See if there is floating point - beq- noFloat ; Nope, this is a really stupid machine... 
- - li r0,MSR_VM_OFF|MASK(MSR_FP) ; Enable for floating point - mtmsr r0 /* Set the standard MSR values */ - isync - - lis r5,HIGH_ADDR(EXT(FloatInit)) /* Get top of floating point init value */ - ori r5,r5,LOW_ADDR(EXT(FloatInit)) /* Slam bottom */ - lfd f0,0(r5) /* Initialize FP0 */ - fmr f1,f0 /* Ours in not */ - fmr f2,f0 /* to wonder why, */ - fmr f3,f0 /* ours is but to */ - fmr f4,f0 /* do or die! */ - fmr f5,f0 - fmr f6,f0 - fmr f7,f0 - fmr f8,f0 - fmr f9,f0 - fmr f10,f0 - fmr f11,f0 - fmr f12,f0 - fmr f13,f0 - fmr f14,f0 - fmr f15,f0 - fmr f16,f0 - fmr f17,f0 - fmr f18,f0 - fmr f19,f0 - fmr f20,f0 - fmr f21,f0 - fmr f22,f0 - fmr f23,f0 - fmr f24,f0 - fmr f25,f0 - fmr f26,f0 - fmr f27,f0 - fmr f28,f0 - fmr f29,f0 - fmr f30,f0 - fmr f31,f0 - - li r0, MSR_VM_OFF ; Turn off floating point - mtmsr r0 - isync - -noFloat: rlwinm. r0,r17,0,pfAltivecb,pfAltivecb ; See if there is Altivec - beq- noVector ; Nope... - - li r0,0 ; Clear out a register - - lis r7,hi16(MSR_VEC_ON) ; Get real mode MSR + Altivec - ori r7,r7,lo16(MSR_VM_OFF) ; Get real mode MSR + Altivec - mtmsr r7 ; Set the real mode SRR */ - isync ; Make sure it has happened - - lis r5,hi16(EXT(QNaNbarbarian)) ; Altivec initializer - ori r5,r5,lo16(EXT(QNaNbarbarian)) ; Altivec initializer - - mtspr vrsave,r0 ; Set that no VRs are used yet */ - - vspltish v1,1 ; Turn on the non-Java bit and saturate - vspltisw v0,1 ; Turn on the saturate bit - vxor v1,v1,v0 ; Turn off saturate and leave non-Java set - lvx v0,br0,r5 ; Initialize VR0 - mtvscr v1 ; Clear the vector status register - vor v2,v0,v0 ; Copy into the next register - vor v1,v0,v0 ; Copy into the next register - vor v3,v0,v0 ; Copy into the next register - vor v4,v0,v0 ; Copy into the next register - vor v5,v0,v0 ; Copy into the next register - vor v6,v0,v0 ; Copy into the next register - vor v7,v0,v0 ; Copy into the next register - vor v8,v0,v0 ; Copy into the next register - vor v9,v0,v0 ; Copy into the next register - vor v10,v0,v0 ; Copy into 
the next register - vor v11,v0,v0 ; Copy into the next register - vor v12,v0,v0 ; Copy into the next register - vor v13,v0,v0 ; Copy into the next register - vor v14,v0,v0 ; Copy into the next register - vor v15,v0,v0 ; Copy into the next register - vor v16,v0,v0 ; Copy into the next register - vor v17,v0,v0 ; Copy into the next register - vor v18,v0,v0 ; Copy into the next register - vor v19,v0,v0 ; Copy into the next register - vor v20,v0,v0 ; Copy into the next register - vor v21,v0,v0 ; Copy into the next register - vor v22,v0,v0 ; Copy into the next register - vor v23,v0,v0 ; Copy into the next register - vor v24,v0,v0 ; Copy into the next register - vor v25,v0,v0 ; Copy into the next register - vor v26,v0,v0 ; Copy into the next register - vor v27,v0,v0 ; Copy into the next register - vor v28,v0,v0 ; Copy into the next register - vor v29,v0,v0 ; Copy into the next register - vor v30,v0,v0 ; Copy into the next register - vor v31,v0,v0 ; Copy into the next register - - li r0, MSR_VM_OFF ; Turn off vectors - mtmsr r0 - isync - -noVector: - bl EXT(cacheInit) ; Initializes all caches (including the TLB) - - bt bootCPU,run32 - - mfsprg r30,0 ; Phys per proc - lwz r29,PP_HIBERNATE(r30) - andi. r29, r29, 1 - beq noHashTableInit ; Skip following if not waking from from hibernate - bl EXT(hw_clear_maps) ; Mark all maps as absent from hash table - bl EXT(hw_hash_init) ; Clear hash table - bl EXT(save_snapshot_restore) ; Reset save area chains -noHashTableInit: - bl EXT(hw_setup_trans) ; Set up hardware needed for translation - bl EXT(hw_start_trans) ; Start translating - -run32: - rlwinm. r0,r17,0,pf64Bitb,pf64Bitb ; Is this a 64-bit machine? - beq++ isnot64 ; Skip following if not 64-bit... 
- - mfmsr r29 ; Get the MSR - rldicl r29,r29,0,MSR_SF_BIT+1 ; turn 64-bit mode off - mtmsrd r29 ; Set it - isync ; Make sure - -isnot64: bf bootCPU,callcpu - - lis r29,HIGH_ADDR(EXT(intstack)) ; move onto interrupt stack - ori r29,r29,LOW_ADDR(EXT(intstack)) - addi r29,r29,INTSTACK_SIZE-FM_SIZE - - li r28,0 - stw r28,FM_BACKPTR(r29) ; store a null frame backpointer - - mr r1,r29 - mr r3,r31 ; Restore any arguments we may have trashed - -; Note that we exit from here with translation still off - - bl EXT(ppc_init) ; Jump into boot init code - BREAKPOINT_TRAP - -callcpu: - mfsprg r31,1 ; Fake activation pointer - lwz r31,ACT_PER_PROC(r31) ; Load per_proc - lwz r29,PP_INTSTACK_TOP_SS(r31) ; move onto interrupt stack - - li r28,0 - stw r28,FM_BACKPTR(r29) ; store a null frame backpointer - - mr r1,r29 ; move onto new stack - mr r3,r31 ; Restore any arguments we may have trashed - -; Note that we exit from here with translation on - - bl EXT(ppc_init_cpu) ; Jump into cpu init code - BREAKPOINT_TRAP ; Should never return - -; -; Specific processor initialization routines -; - -; 750 - -init750: - bf firstBoot, init750nb ; No init for wakeup.... - - mfspr r13,l2cr ; Get the L2CR - rlwinm. r0,r13,0,l2e,l2e ; Any L2? - bne+ i750hl2 ; Yes... - rlwinm r17,r17,0,pfL2b+1,pfL2b-1 ; No L2, turn off feature - -i750hl2: - lis r14,hi16(256*1024) ; Base L2 size - addis r15,r13,0x3000 ; Hah... Figure this one out... - rlwinm r15,r15,4,30,31 ; Isolate - rlwinm. r8,r13,0,l2siz,l2sizf ; Was size valid? - slw r14,r14,r15 ; Set 256KB, 512KB, or 1MB - beq- init750l2none ; Not a valid setting... - - stw r13,pfl2crOriginal(r30) ; Shadow the L2CR - stw r13,pfl2cr(r30) ; Shadow the L2CR - stw r14,pfl2Size(r30) ; Set the L2 size - b init750l2done ; Done with L2 - -init750l2none: - rlwinm r17,r17,0,pfL2b+1,pfL2b-1 ; No level 2 cache - -init750l2done: - mfspr r11,hid0 ; Get the current HID0 - stw r11,pfHID0(r30) ; Save the HID0 value - blr ; Return... 
- -init750nb: - lwz r11,pfHID0(r30) ; Get HID0 - sync - mtspr hid0,r11 ; Set the HID - isync - sync - blr - -; 750CX - -init750CX: - bf firstBoot, init750 ; No init for wakeup.... - mfspr r13,hid1 ; Get HID1 - li r14,lo16(0xFD5F) ; Get valid - rlwinm r13,r13,4,28,31 ; Isolate - slw r14,r14,r13 ; Position - rlwimi r17,r14,15-pfCanNapb,pfCanNapb,pfCanNapb ; Set it - b init750 ; Join common... - - -; 750FX - -init750FX: - bf firstBoot, init750FXnb - mfspr r11, hid1 - stw r11, pfHID1(r30) ; Save the HID1 value - b init750 - -init750FXnb: - lwz r13, pfHID0(r30) ; Get HID0 - lwz r11, pfHID1(r30) ; Get HID1 - - rlwinm. r0, r11, 0, hid1ps, hid1ps ; Isolate the hid1ps bit - beq init750FXnb2 ; Clear BTIC if hid1ps set - rlwinm r13, r13, 0, btic+1, btic-1 ; Clear the BTIC bit - -init750FXnb2: - sync - mtspr hid0, r13 ; Set the HID - isync - sync - - rlwinm r12, r11, 0, hid1ps+1, hid1ps-1 ; Select PLL0 - mtspr hid1, r12 ; Restore PLL config - mftb r13 ; Wait 5000 ticks (> 200 us) - -init750FXnbloop: - mftb r14 - sub r14, r14, r13 - cmpli cr0, r14, 5000 - ble init750FXnbloop - mtspr hid1, r11 ; Select the desired PLL - blr - -; 750FX vers 2.0 or later -init750FXV2: - bf firstBoot, init750FXV2nb ; Wake from sleep - - mfspr r11, hid2 - stw r11, pfHID2(r30) ; Save the HID2 value - b init750FX ; Continue with 750FX init - -init750FXV2nb: - lwz r13, pfHID2(r30) ; Get HID2 - rlwinm r13, r13, 0, hid2vmin+1, hid2vmin-1 ; Clear the vmin bit - mtspr hid2, r13 ; Restore HID2 value - sync ; Wait for it to be done - b init750FX - -; 7400 - -init7400: bf firstBoot,i7400nb ; Do different if not initial boot... - mfspr r13,l2cr ; Get the L2CR - rlwinm. r0,r13,0,l2e,l2e ; Any L2? - bne+ i7400hl2 ; Yes... - rlwinm r17,r17,0,pfL2b+1,pfL2b-1 ; No L2, turn off feature - -i7400hl2: lis r14,hi16(256*1024) ; Base L2 size - addis r15,r13,0x3000 ; Hah... Figure this one out... 
- rlwinm r15,r15,4,30,31 - slw r14,r14,r15 ; Set 256KB, 512KB, 1MB, or 2MB - - stw r13,pfl2crOriginal(r30) ; Shadow the L2CR - stw r13,pfl2cr(r30) ; Shadow the L2CR - stw r14,pfl2Size(r30) ; Set the L2 size - - mfspr r11,hid0 ; Get the current HID0 - oris r11,r11,hi16(emcpm|eiecm) ; ? - mtspr hid0,r11 ; ? - isync - stw r11,pfHID0(r30) ; Save the HID0 value - - mfspr r11,msscr0 ; Get the msscr0 register - stw r11,pfMSSCR0(r30) ; Save the MSSCR0 value - mfspr r11,msscr1 ; Get the msscr1 register - stw r11,pfMSSCR1(r30) ; Save the MSSCR1 value - blr ; Return... - -i7400nb: - li r11,0 - mtspr l2cr,r11 ; Make sure L2CR is zero - lwz r11,pfHID0(r30) ; Get HID0 - sync - mtspr hid0,r11 ; Set the HID - isync - sync - lwz r11,pfMSSCR0(r30) ; Get MSSCR0 - isync - sync - mtspr msscr0,r11 ; Set the MSSCR0 - lwz r11,pfMSSCR1(r30) ; Get msscr1 - isync - sync - mtspr msscr1,r11 ; Set the msscr1 - isync - sync - blr - -; 7400 (ver 2.0 - ver 2.7) - -init7400v2_7: - bf firstBoot, init7400 - mfspr r13, hid0 ; Get the HID0 - ori r13, r13, nopdstm ; ? - mtspr hid0, r13 ; Set the HID0 - isync - sync - b init7400 - -; 7410 -; Note that this is the same as 7400 except we initialize the l2cr2 register - -init7410: li r13,0 ; Clear - mtspr 1016,r13 ; Turn off direct cache - b init7400 ; Join up with common.... - - -; 745X - Any 7450 family processor - -init745X: - bf firstBoot,init745Xnb ; Do different if not initial boot... - - mfspr r13,l2cr ; Get the L2CR - rlwinm. r0,r13,0,l2e,l2e ; Any L2? - bne+ init745Xhl2 ; Yes... 
- rlwinm r17,r17,0,pfL2b+1,pfL2b-1 ; No L2, turn off feature - -init745Xhl2: - mfpvr r14 ; Get processor version - rlwinm r14,r14,16,16,31 ; Isolate processor version - cmpli cr0, r14, PROCESSOR_VERSION_7457 ; Test for 7457 or - cmpli cr1, r14, PROCESSOR_VERSION_7447A ; 7447A - cror cr0_eq, cr1_eq, cr0_eq - lis r14,hi16(512*1024) ; 512KB L2 - beq init745Xhl2_2 - - lis r14,hi16(256*1024) ; Base L2 size - rlwinm r15,r13,22,12,13 ; Convert to 256k, 512k, or 768k - add r14,r14,r15 ; Add in minimum - -init745Xhl2_2: - stw r13,pfl2crOriginal(r30) ; Shadow the L2CR - stw r13,pfl2cr(r30) ; Shadow the L2CR - stw r14,pfl2Size(r30) ; Set the L2 size - -; Take care of level 3 cache - - mfspr r13,l3cr ; Get the L3CR - rlwinm. r0,r13,0,l3e,l3e ; Any L3? - bne+ init745Xhl3 ; Yes... - rlwinm r17,r17,0,pfL3b+1,pfL3b-1 ; No L3, turn off feature - -init745Xhl3: cmplwi cr0,r13,0 ; No L3 if L3CR is zero - beq- init745Xnone ; Go turn off the features... - lis r14,hi16(1024*1024) ; Base L3 size - rlwinm r15,r13,4,31,31 ; Get size multiplier - slw r14,r14,r15 ; Set 1 or 2MB - - stw r13,pfl3crOriginal(r30) ; Shadow the L3CR - stw r13,pfl3cr(r30) ; Shadow the L3CR - stw r14,pfl3Size(r30) ; Set the L3 size - b init745Xfin ; Return.... 
- -init745Xnone: - rlwinm r17,r17,0,pfL3fab+1,pfL3b-1 ; No 3rd level cache or assist - rlwinm r11,r17,pfWillNapb-pfCanNapb,pfCanNapb,pfCanNapb ; Set pfCanNap if pfWillNap is set - or r17,r17,r11 - -init745Xfin: - rlwinm r17,r17,0,pfWillNapb+1,pfWillNapb-1 ; Make sure pfWillNap is not set - - mfspr r11,hid0 ; Get the current HID0 - stw r11,pfHID0(r30) ; Save the HID0 value - mfspr r11,hid1 ; Get the current HID1 - stw r11,pfHID1(r30) ; Save the HID1 value - mfspr r11,msscr0 ; Get the msscr0 register - stw r11,pfMSSCR0(r30) ; Save the MSSCR0 value - mfspr r11,msscr1 ; Get the msscr1 register - stw r11,pfMSSCR1(r30) ; Save the MSSCR1 value - mfspr r11,ictrl ; Get the ictrl register - stw r11,pfICTRL(r30) ; Save the ICTRL value - mfspr r11,ldstcr ; Get the ldstcr register - stw r11,pfLDSTCR(r30) ; Save the LDSTCR value - mfspr r11,ldstdb ; Get the ldstdb register - stw r11,pfLDSTDB(r30) ; Save the LDSTDB value - mfspr r11,pir ; Get the pir register - stw r11,pfBootConfig(r30) ; Save the BootConfig value - blr ; Return.... - - -init745Xnb: lwz r11,pfHID0(r30) ; Get HID0 - sync - mtspr hid0,r11 ; Set the HID - isync - lwz r11,pfHID1(r30) ; Get HID1 - sync - mtspr hid1,r11 ; Set the HID - isync - lwz r11,pfMSSCR0(r30) ; Get MSSCR0 - sync - mtspr msscr0,r11 ; Set the MSSCR0 - isync - sync - lwz r11,pfICTRL(r30) ; Get ICTRL - sync - mtspr ictrl,r11 ; Set the ICTRL - isync - sync - lwz r11,pfLDSTCR(r30) ; Get LDSTCR - sync - mtspr ldstcr,r11 ; Set the LDSTCR - isync - sync - lwz r11,pfLDSTDB(r30) ; Get LDSTDB - sync - mtspr ldstdb,r11 ; Set the LDSTDB - isync - sync - blr - -; 7450 - Specific - -init7450: - bf firstBoot, init745X ; Not boot, use standard init - - mfspr r13, pir ; Get BootConfig from PIR - rlwinm. 
r14, r13, 0, 20, 23 ; Is the pdet value zero - bne init7450done ; No, done for now - - ori r13, r13, 0x0400 ; Force pdet value to 4 - mtspr pir, r13 ; Write back the BootConfig - -init7450done: - b init745X ; Continue with standard init - - -init970: - lis r20,8 ; Set up for 512K L2 -init970x: - li r0,0 ; Clear this - mtspr hior,r0 ; Make sure that 0 is interrupt prefix - bf firstBoot,init970nb ; No init for wakeup or second processor.... - - -; -; We can not query or change the L2 size. We will just -; phoney up a L2CR to make sysctl "happy" and set the -; L2 size to 512K. -; - - lis r0,0x8000 ; Synthesize a "valid" but non-existant L2CR - stw r0,pfl2crOriginal(r30) ; Set a dummy L2CR - stw r0,pfl2cr(r30) ; Set a dummy L2CR - stw r20,pfl2Size(r30) ; Set the L2 size - - mfspr r11,hid0 ; Get original hid0 - std r11,pfHID0(r30) ; Save original - mfspr r11,hid1 ; Get original hid1 - std r11,pfHID1(r30) ; Save original - mfspr r11,hid4 ; Get original hid4 - std r11,pfHID4(r30) ; Save original - mfspr r11,hid5 ; Get original hid5 - std r11,pfHID5(r30) ; Save original - - lis r0, hi16(dnapm) ; Create a mask for the dnap bit - sldi r0, r0, 32 ; Shift to the top half - ld r11,pfHID0(r30) ; Load the hid0 value - andc r11, r11, r0 ; Clear the dnap bit - isync - mtspr hid0,r11 ; Stuff it - mfspr r11,hid0 ; Get it - mfspr r11,hid0 ; Get it - mfspr r11,hid0 ; Get it - mfspr r11,hid0 ; Get it - mfspr r11,hid0 ; Get it - mfspr r11,hid0 ; Get it - isync - - lis r0,(pcfValid|pcfLarge|pcfDedSeg)<<8 ; Set the valid bit, dedicated segment, and large page flags - ori r0,r0,(24<<8)|24 ; Add in the 16M page size - stw r0,lgpPcfg+(pcfSize*pcfLargePcfg)(0) ; Set the 16M primary large page configuration entry - - blr - -; -; Start up code for second processor or wake up from sleep -; - -init970nb: - lis r0, hi16(dnapm) ; Create a mask for the dnap bit - sldi r0, r0, 32 ; Shift to the top half - ld r11,pfHID0(r30) ; Load the hid0 value - andc r11, r11, r0 ; Clear the dnap bit - isync - mtspr 
hid0,r11 ; Stuff it - mfspr r11,hid0 ; Get it - mfspr r11,hid0 ; Get it - mfspr r11,hid0 ; Get it - mfspr r11,hid0 ; Get it - mfspr r11,hid0 ; Get it - mfspr r11,hid0 ; Get it - isync - - ld r20,pfHID1(r30) ; Get it - isync - mtspr hid1,r20 ; Stick it - mtspr hid1,r20 ; Stick it again - isync - - ld r11,pfHID4(r30) ; Get it - sync - mtspr hid4,r11 ; Stick it - isync - - lis r11,0xE000 ; Get the unlikeliest ESID possible - srdi r11,r11,1 ; Make 0x7FFFFFFFF0000000 - slbie r11 ; Make sure the ERAT is cleared - - ld r11,pfHID5(r30) ; Get it - mtspr hid5,r11 ; Set it - isync -; -; May have changed dcbz mode so kill icache -; - - eqv r13,r13,r13 ; Get a constant -1 - mr r14,r20 ; Save HID1 - rldimi r14,r13,54,9 ; Set force icbi match mode - - li r11,0 ; Set start if ICBI range - isync - mtspr hid1,r14 ; Stick it - mtspr hid1,r14 ; Stick it again - isync - -inin970ki: icbi 0,r11 ; Kill I$ - addi r11,r11,128 ; Next line - andis. r0,r11,1 ; Have we done them all? - beq++ inin970ki ; Not yet... - - isync - mtspr hid1,r20 ; Stick it - mtspr hid1,r20 ; Stick it again - isync - - blr ; Leave... - - - -; Unsupported Processors -initUnsupported: - mtlr r2 ; Restore the return address - blr ; Return to the booter - - -; -; Processor to feature table - -; .align 2 - Always on word boundary -; .long ptFilter - Mask of significant bits in the Version/Revision code -; - NOTE: Always order from most restrictive to least restrictive matching -; .short ptVersion - Version code from PVR. Always start with 0 which is default -; .short ptRevision - Revision code from PVR. A zero value denotes the generic attributes if not specific -; .long ptFeatures - Available features -; .long ptCPUCap - Default value for _cpu_capabilities -; .long ptPwrModes - Available power management features -; .long ptPatch - Patch features -; .long ptInitRout - Initilization routine. Can modify any of the other attributes. 
-; .long ptRptdProc - Processor type reported -; .long ptLineSize - Level 1 cache line size -; .long ptl1iSize - Level 1 instruction cache size -; .long ptl1dSize - Level 1 data cache size -; .long ptPTEG - Size of PTEG -; .long ptMaxVAddr - Maximum effective address -; .long ptMaxPAddr - Maximum physical address -; - - .align 2 -processor_types: - -; 750CX (ver 2.x) - - .align 2 - .long 0xFFFF0F00 ; 2.x vers - .short PROCESSOR_VERSION_750 - .short 0x0200 - .long pfFloat | pfCanSleep | pfCanNap | pfCanDoze | pf32Byte | pfL2 - .long kCache32 | kHasGraphicsOps | kHasStfiwx - .long 0 - .long PatchExt32 - .long init750CX - .long CPU_SUBTYPE_POWERPC_750 - .long 32 - .long 32*1024 - .long 32*1024 - .long 64 - .long 52 - .long 32 - -; 750 (generic) - - .align 2 - .long 0xFFFF0000 ; All revisions - .short PROCESSOR_VERSION_750 - .short 0 - .long pfFloat | pfCanSleep | pfCanNap | pfCanDoze | pf32Byte | pfL2 - .long kCache32 | kHasGraphicsOps | kHasStfiwx - .long 0 - .long PatchExt32 - .long init750 - .long CPU_SUBTYPE_POWERPC_750 - .long 32 - .long 32*1024 - .long 32*1024 - .long 64 - .long 52 - .long 32 - -; 750FX (ver 1.x) - - .align 2 - .long 0xFFFF0F00 ; 1.x vers - .short PROCESSOR_VERSION_750FX - .short 0x0100 - .long pfFloat | pfCanSleep | pfCanNap | pfCanDoze | pfSlowNap | pfNoMuMMCK | pf32Byte | pfL2 - .long kCache32 | kHasGraphicsOps | kHasStfiwx - .long pmDualPLL - .long PatchExt32 - .long init750FX - .long CPU_SUBTYPE_POWERPC_750 - .long 32 - .long 32*1024 - .long 32*1024 - .long 64 - .long 52 - .long 32 - -; 750FX (generic) - - .align 2 - .long 0xFFFF0000 ; All revisions - .short PROCESSOR_VERSION_750FX - .short 0 - .long pfFloat | pfCanSleep | pfCanNap | pfCanDoze | pfSlowNap | pfNoMuMMCK | pf32Byte | pfL2 - .long kCache32 | kHasGraphicsOps | kHasStfiwx - .long pmDualPLL | pmDPLLVmin - .long PatchExt32 - .long init750FXV2 - .long CPU_SUBTYPE_POWERPC_750 - .long 32 - .long 32*1024 - .long 32*1024 - .long 64 - .long 52 - .long 32 - -; 7400 (ver 2.0 - ver 2.7) - - 
.align 2 - .long 0xFFFFFFF8 ; ver 2.0 - 2.7 - .short PROCESSOR_VERSION_7400 - .short 0x0200 - .long pfFloat | pfAltivec | pfSMPcap | pfCanSleep | pfCanNap | pfCanDoze | pf32Byte | pfL1fa | pfL2 | pfL2fa | pfHasDcba - .long kHasAltivec | kCache32 | kDcbaAvailable | kDataStreamsAvailable | kHasGraphicsOps | kHasStfiwx - .long 0 - .long PatchExt32 - .long init7400v2_7 - .long CPU_SUBTYPE_POWERPC_7400 - .long 32 - .long 32*1024 - .long 32*1024 - .long 64 - .long 52 - .long 32 - -; 7400 (generic) - - .align 2 - .long 0xFFFF0000 ; All revisions - .short PROCESSOR_VERSION_7400 - .short 0 - .long pfFloat | pfAltivec | pfSMPcap | pfCanSleep | pfCanNap | pfCanDoze | pf32Byte | pfL1fa | pfL2 | pfL2fa | pfHasDcba - .long kHasAltivec | kCache32 | kDcbaAvailable | kDataStreamsRecommended | kDataStreamsAvailable | kHasGraphicsOps | kHasStfiwx - .long 0 - .long PatchExt32 - .long init7400 - .long CPU_SUBTYPE_POWERPC_7400 - .long 32 - .long 32*1024 - .long 32*1024 - .long 64 - .long 52 - .long 36 - -; 7410 (ver 1.1) - - .align 2 - .long 0xFFFFFFFF ; Exact match - .short PROCESSOR_VERSION_7400 - .short 0x1101 - .long pfFloat | pfAltivec | pfSMPcap | pfCanSleep | pfCanNap | pfCanDoze | pf32Byte | pfL1fa | pfL2 | pfL2fa | pfHasDcba - .long kHasAltivec | kCache32 | kDcbaAvailable | kDataStreamsRecommended | kDataStreamsAvailable | kHasGraphicsOps | kHasStfiwx - .long 0 - .long PatchExt32 - .long init7410 - .long CPU_SUBTYPE_POWERPC_7400 - .long 32 - .long 32*1024 - .long 32*1024 - .long 64 - .long 52 - .long 36 - -; 7410 (generic) - - .align 2 - .long 0xFFFF0000 ; All other revisions - .short PROCESSOR_VERSION_7410 - .short 0 - .long pfFloat | pfAltivec | pfSMPcap | pfCanSleep | pfCanNap | pfCanDoze | pf32Byte | pfL1fa | pfL2 | pfL2fa | pfHasDcba - .long kHasAltivec | kCache32 | kDcbaAvailable | kDataStreamsRecommended | kDataStreamsAvailable | kHasGraphicsOps | kHasStfiwx - .long 0 - .long PatchExt32 - .long init7410 - .long CPU_SUBTYPE_POWERPC_7400 - .long 32 - .long 32*1024 - .long 
32*1024 - .long 64 - .long 52 - .long 36 - -; 7450 (ver 1.xx) - - .align 2 - .long 0xFFFFFF00 ; Just revisions 1.xx - .short PROCESSOR_VERSION_7450 - .short 0x0100 - .long pfFloat | pfAltivec | pfSMPcap | pfCanSleep | pfNoMSRir | pfNoL2PFNap | pfLClck | pf32Byte | pfL2 | pfL2fa | pfL2i | pfL3 | pfL3fa | pfHasDcba - .long kHasAltivec | kCache32 | kDcbaAvailable | kDataStreamsRecommended | kDataStreamsAvailable | kHasGraphicsOps | kHasStfiwx - .long 0 - .long PatchExt32 - .long init7450 - .long CPU_SUBTYPE_POWERPC_7450 - .long 32 - .long 32*1024 - .long 32*1024 - .long 64 - .long 52 - .long 36 - -; 7450 (2.0) - - .align 2 - .long 0xFFFFFFFF ; Just revision 2.0 - .short PROCESSOR_VERSION_7450 - .short 0x0200 - .long pfFloat | pfAltivec | pfSMPcap | pfCanSleep | pfNoMSRir | pfNoL2PFNap | pfLClck | pf32Byte | pfL2 | pfL2fa | pfL2i | pfL3 | pfL3fa | pfHasDcba - .long kHasAltivec | kCache32 | kDcbaAvailable | kDataStreamsRecommended | kDataStreamsAvailable | kHasGraphicsOps | kHasStfiwx - .long 0 - .long PatchExt32 - .long init7450 - .long CPU_SUBTYPE_POWERPC_7450 - .long 32 - .long 32*1024 - .long 32*1024 - .long 64 - .long 52 - .long 36 - -; 7450 (2.1) - - .align 2 - .long 0xFFFF0000 ; All other revisions - .short PROCESSOR_VERSION_7450 - .short 0 - .long pfFloat | pfAltivec | pfSMPcap | pfCanSleep | pfWillNap | pfNoMSRir | pfNoL2PFNap | pfLClck | pf32Byte | pfL2 | pfL2fa | pfL2i | pfL3 | pfL3fa | pfHasDcba - .long kHasAltivec | kCache32 | kDcbaAvailable | kDataStreamsRecommended | kDataStreamsAvailable | kHasGraphicsOps | kHasStfiwx - .long 0 - .long PatchExt32 - .long init7450 - .long CPU_SUBTYPE_POWERPC_7450 - .long 32 - .long 32*1024 - .long 32*1024 - .long 64 - .long 52 - .long 36 - -; 7455 (1.xx) Just like 7450 2.0 - - .align 2 - .long 0xFFFFFF00 ; Just revisions 1.xx - .short PROCESSOR_VERSION_7455 - .short 0x0100 - .long pfFloat | pfAltivec | pfSMPcap | pfCanSleep | pfNoMSRir | pfNoL2PFNap | pfLClck | pf32Byte | pfL2 | pfL2fa | pfL2i | pfL3 | pfL3fa | pfHasDcba 
- .long kHasAltivec | kCache32 | kDcbaAvailable | kDataStreamsRecommended | kDataStreamsAvailable | kHasGraphicsOps | kHasStfiwx - .long 0 - .long PatchExt32 - .long init745X - .long CPU_SUBTYPE_POWERPC_7450 - .long 32 - .long 32*1024 - .long 32*1024 - .long 64 - .long 52 - .long 36 - -; 7455 (2.0) - - .align 2 - .long 0xFFFFFFFF ; Just revision 2.0 - .short PROCESSOR_VERSION_7455 - .short 0x0200 - .long pfFloat | pfAltivec | pfSMPcap | pfCanSleep | pfWillNap | pfNoMSRir | pfNoL2PFNap | pfLClck | pf32Byte | pfL2 | pfL2fa | pfL2i | pfL3 | pfL3fa | pfHasDcba - .long kHasAltivec | kCache32 | kDcbaAvailable | kDataStreamsRecommended | kDataStreamsAvailable | kHasGraphicsOps | kHasStfiwx - .long 0 - .long PatchExt32 - .long init745X - .long CPU_SUBTYPE_POWERPC_7450 - .long 32 - .long 32*1024 - .long 32*1024 - .long 64 - .long 52 - .long 36 - -; 7455 (2.1) - - .align 2 - .long 0xFFFF0000 ; All other revisions - .short PROCESSOR_VERSION_7455 - .short 0 - .long pfFloat | pfAltivec | pfSMPcap | pfCanSleep | pfCanNap | pfNoMSRir | pfNoL2PFNap | pfLClck | pf32Byte | pfL2 | pfL2fa | pfL2i | pfL3 | pfL3fa | pfHasDcba - .long kHasAltivec | kCache32 | kDcbaAvailable | kDataStreamsRecommended | kDataStreamsAvailable | kHasGraphicsOps | kHasStfiwx - .long 0 - .long PatchExt32 - .long init745X - .long CPU_SUBTYPE_POWERPC_7450 - .long 32 - .long 32*1024 - .long 32*1024 - .long 64 - .long 52 - .long 36 - -; 7457 - - .align 2 - .long 0xFFFF0000 ; All revisions - .short PROCESSOR_VERSION_7457 - .short 0 - .long pfFloat | pfAltivec | pfSMPcap | pfCanSleep | pfCanNap | pfNoMSRir | pfNoL2PFNap | pfLClck | pf32Byte | pfL2 | pfL2fa | pfL2i | pfL3 | pfL3fa | pfHasDcba - .long kHasAltivec | kCache32 | kDcbaAvailable | kDataStreamsRecommended | kDataStreamsAvailable | kHasGraphicsOps | kHasStfiwx - .long 0 - .long PatchExt32 - .long init745X - .long CPU_SUBTYPE_POWERPC_7450 - .long 32 - .long 32*1024 - .long 32*1024 - .long 64 - .long 52 - .long 36 - -; 7447A - - .align 2 - .long 0xFFFF0000 ; 
All revisions - .short PROCESSOR_VERSION_7447A - .short 0 - .long pfFloat | pfAltivec | pfSMPcap | pfCanSleep | pfCanNap | pfNoMSRir | pfNoL2PFNap | pfLClck | pf32Byte | pfL2 | pfL2fa | pfL2i | pfL3 | pfL3fa | pfHasDcba - .long kHasAltivec | kCache32 | kDcbaAvailable | kDataStreamsRecommended | kDataStreamsAvailable | kHasGraphicsOps | kHasStfiwx - .long pmDFS - .long PatchExt32 - .long init745X - .long CPU_SUBTYPE_POWERPC_7450 - .long 32 - .long 32*1024 - .long 32*1024 - .long 64 - .long 52 - .long 36 - -; 970 - - .align 2 - .long 0xFFFF0000 ; All versions so far - .short PROCESSOR_VERSION_970 - .short 0 - .long pfFloat | pfAltivec | pfSMPcap | pfCanSleep | pfCanNap | pf128Byte | pf64Bit | pfL2 | pfSCOMFixUp - .long kHasAltivec | k64Bit | kCache128 | kDataStreamsAvailable | kDcbtStreamsRecommended | kDcbtStreamsAvailable | kHasGraphicsOps | kHasStfiwx | kHasFsqrt - .long 0 - .long PatchLwsync - .long init970 - .long CPU_SUBTYPE_POWERPC_970 - .long 128 - .long 64*1024 - .long 32*1024 - .long 128 - .long 65 - .long 42 - -; 970FX - - .align 2 - .long 0xFFFF0000 ; All versions so far - .short PROCESSOR_VERSION_970FX - .short 0 - .long pfFloat | pfAltivec | pfSMPcap | pfCanSleep | pfCanNap | pf128Byte | pf64Bit | pfL2 - .long kHasAltivec | k64Bit | kCache128 | kDataStreamsAvailable | kDcbtStreamsRecommended | kDcbtStreamsAvailable | kHasGraphicsOps | kHasStfiwx | kHasFsqrt - .long pmPowerTune - .long PatchLwsync - .long init970 - .long CPU_SUBTYPE_POWERPC_970 - .long 128 - .long 64*1024 - .long 32*1024 - .long 128 - .long 65 - .long 42 - - -; All other processors are not supported - - .align 2 - .long 0x00000000 ; Matches everything - .short 0 - .short 0 - .long pfFloat | pf32Byte - .long kCache32 | kHasGraphicsOps | kHasStfiwx - .long 0 - .long PatchExt32 - .long initUnsupported - .long CPU_SUBTYPE_POWERPC_ALL - .long 32 - .long 32*1024 - .long 32*1024 - .long 64 - .long 52 - .long 32 - diff --git a/osfmk/ppc/status.c b/osfmk/ppc/status.c deleted file mode 100644 
index 50fee6015..000000000 --- a/osfmk/ppc/status.c +++ /dev/null @@ -1,1820 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -typedef unsigned int fixpt_t; /* XXX not self contained */ -#include /* USRSTACK, etc. 
*/ - -#include - -extern unsigned int killprint; -extern double FloatInit; -extern unsigned long QNaNbarbarian[4]; - -kern_return_t -thread_userstack( - thread_t, - int, - thread_state_t, - unsigned int, - mach_vm_offset_t *, - int * -); - -kern_return_t -thread_entrypoint( - thread_t, - int, - thread_state_t, - unsigned int, - mach_vm_offset_t * -); - -unsigned int get_msr_exportmask(void); -unsigned int get_msr_nbits(void); -unsigned int get_msr_rbits(void); -void ppc_checkthreadstate(void *, int); -void thread_set_child(thread_t child, int pid); -void thread_set_parent(thread_t parent, int pid); -void thread_set_cthreadself(thread_t thread, uint64_t pself, int isLP64); - -/* - * Maps state flavor to number of words in the state: - */ -/* __private_extern__ */ -unsigned int _MachineStateCount[] = { - /* FLAVOR_LIST */ 0, - PPC_THREAD_STATE_COUNT, - PPC_FLOAT_STATE_COUNT, - PPC_EXCEPTION_STATE_COUNT, - PPC_VECTOR_STATE_COUNT, - PPC_THREAD_STATE64_COUNT, - PPC_EXCEPTION_STATE64_COUNT, -}; - -/* - * thread_getstatus: - * - * Get the status of the specified thread. 
- */ - -kern_return_t -machine_thread_get_state( - thread_t thread, - thread_flavor_t flavor, - thread_state_t tstate, - mach_msg_type_number_t *count) -{ - - register struct savearea *sv; /* Pointer to the context savearea */ - register savearea_fpu *fsv; - register savearea_vec *vsv; - struct savearea *genuser; - int i, j; - unsigned int vrvalidwrk; - - register struct ppc_thread_state *ts; - register struct ppc_thread_state64 *xts; - register struct ppc_exception_state *es; - register struct ppc_exception_state64 *xes; - register struct ppc_float_state *fs; - register struct ppc_vector_state *vs; - - genuser = find_user_regs(thread); - - switch (flavor) { - - case THREAD_STATE_FLAVOR_LIST: - - if (*count < 6) { - return (KERN_INVALID_ARGUMENT); - } - - tstate[0] = PPC_THREAD_STATE; - tstate[1] = PPC_FLOAT_STATE; - tstate[2] = PPC_EXCEPTION_STATE; - tstate[3] = PPC_VECTOR_STATE; - tstate[4] = PPC_THREAD_STATE64; - tstate[5] = PPC_EXCEPTION_STATE64; - *count = 6; - - return KERN_SUCCESS; - - case PPC_THREAD_STATE: - - if (*count < PPC_THREAD_STATE_COUNT) { /* Is the count ok? */ - return KERN_INVALID_ARGUMENT; - } - - ts = (struct ppc_thread_state *) tstate; - - sv = genuser; /* Copy this over */ - - if(sv) { /* Is there a save area yet? 
*/ - ts->r0 = (unsigned int)sv->save_r0; - ts->r1 = (unsigned int)sv->save_r1; - ts->r2 = (unsigned int)sv->save_r2; - ts->r3 = (unsigned int)sv->save_r3; - ts->r4 = (unsigned int)sv->save_r4; - ts->r5 = (unsigned int)sv->save_r5; - ts->r6 = (unsigned int)sv->save_r6; - ts->r7 = (unsigned int)sv->save_r7; - ts->r8 = (unsigned int)sv->save_r8; - ts->r9 = (unsigned int)sv->save_r9; - ts->r10 = (unsigned int)sv->save_r10; - ts->r11 = (unsigned int)sv->save_r11; - ts->r12 = (unsigned int)sv->save_r12; - ts->r13 = (unsigned int)sv->save_r13; - ts->r14 = (unsigned int)sv->save_r14; - ts->r15 = (unsigned int)sv->save_r15; - ts->r16 = (unsigned int)sv->save_r16; - ts->r17 = (unsigned int)sv->save_r17; - ts->r18 = (unsigned int)sv->save_r18; - ts->r19 = (unsigned int)sv->save_r19; - ts->r20 = (unsigned int)sv->save_r20; - ts->r21 = (unsigned int)sv->save_r21; - ts->r22 = (unsigned int)sv->save_r22; - ts->r23 = (unsigned int)sv->save_r23; - ts->r24 = (unsigned int)sv->save_r24; - ts->r25 = (unsigned int)sv->save_r25; - ts->r26 = (unsigned int)sv->save_r26; - ts->r27 = (unsigned int)sv->save_r27; - ts->r28 = (unsigned int)sv->save_r28; - ts->r29 = (unsigned int)sv->save_r29; - ts->r30 = (unsigned int)sv->save_r30; - ts->r31 = (unsigned int)sv->save_r31; - ts->cr = (unsigned int)sv->save_cr; - ts->xer = (unsigned int)sv->save_xer; - ts->lr = (unsigned int)sv->save_lr; - ts->ctr = (unsigned int)sv->save_ctr; - ts->srr0 = (unsigned int)sv->save_srr0; - ts->srr1 = (unsigned int)sv->save_srr1; - ts->mq = 0; /* MQ register (601 only) */ - ts->vrsave = (unsigned int)sv->save_vrsave; /* VRSAVE register (Altivec only) */ - } - else { /* No user state yet. Save seemingly random values. 
*/ - - for(i=0; i < 32; i+=2) { /* Fill up with defaults */ - ((unsigned int *)&ts->r0)[i] = ((unsigned int *)&FloatInit)[0]; - ((unsigned int *)&ts->r0)[i+1] = ((unsigned int *)&FloatInit)[1]; - } - ts->cr = 0; - ts->xer = 0; - ts->lr = ((unsigned int *)&FloatInit)[0]; - ts->ctr = ((unsigned int *)&FloatInit)[1]; - ts->srr0 = ((unsigned int *)&FloatInit)[0]; - ts->srr1 = MSR_EXPORT_MASK_SET; - ts->mq = 0; - ts->vrsave = 0; /* VRSAVE register (Altivec only) */ - } - - *count = PPC_THREAD_STATE_COUNT; /* Pass back the amount we actually copied */ - return KERN_SUCCESS; - - - case PPC_THREAD_STATE64: - - if (*count < PPC_THREAD_STATE64_COUNT) { /* Is the count ok? */ - return KERN_INVALID_ARGUMENT; - } - - xts = (struct ppc_thread_state64 *) tstate; - - sv = genuser; /* Copy this over */ - - if(sv) { /* Is there a save area yet? */ - xts->r0 = sv->save_r0; - xts->r1 = sv->save_r1; - xts->r2 = sv->save_r2; - xts->r3 = sv->save_r3; - xts->r4 = sv->save_r4; - xts->r5 = sv->save_r5; - xts->r6 = sv->save_r6; - xts->r7 = sv->save_r7; - xts->r8 = sv->save_r8; - xts->r9 = sv->save_r9; - xts->r10 = sv->save_r10; - xts->r11 = sv->save_r11; - xts->r12 = sv->save_r12; - xts->r13 = sv->save_r13; - xts->r14 = sv->save_r14; - xts->r15 = sv->save_r15; - xts->r16 = sv->save_r16; - xts->r17 = sv->save_r17; - xts->r18 = sv->save_r18; - xts->r19 = sv->save_r19; - xts->r20 = sv->save_r20; - xts->r21 = sv->save_r21; - xts->r22 = sv->save_r22; - xts->r23 = sv->save_r23; - xts->r24 = sv->save_r24; - xts->r25 = sv->save_r25; - xts->r26 = sv->save_r26; - xts->r27 = sv->save_r27; - xts->r28 = sv->save_r28; - xts->r29 = sv->save_r29; - xts->r30 = sv->save_r30; - xts->r31 = sv->save_r31; - xts->cr = sv->save_cr; - xts->xer = sv->save_xer; - xts->lr = sv->save_lr; - xts->ctr = sv->save_ctr; - xts->srr0 = sv->save_srr0; - xts->srr1 = sv->save_srr1; - xts->vrsave = sv->save_vrsave; /* VRSAVE register (Altivec only) */ - } - else { /* No user state yet. Save seemingly random values. 
*/ - - for(i=0; i < 32; i++) { /* Fill up with defaults */ - ((unsigned long long *)&xts->r0)[i] = ((unsigned long long *)&FloatInit)[0]; - } - xts->cr = 0; - xts->xer = 0; - xts->lr = ((unsigned long long *)&FloatInit)[0]; - xts->ctr = ((unsigned long long *)&FloatInit)[0]; - xts->srr0 = ((unsigned long long *)&FloatInit)[0]; - xts->srr1 = MSR_EXPORT_MASK_SET; - if(task_has_64BitAddr(thread->task)) - xts->srr1 |= (uint64_t)MASK32(MSR_SF) << 32; /* If 64-bit task, force 64-bit mode */ - xts->vrsave = 0; /* VRSAVE register (Altivec only) */ - } - - *count = PPC_THREAD_STATE64_COUNT; /* Pass back the amount we actually copied */ - return KERN_SUCCESS; - - case PPC_EXCEPTION_STATE: - - if (*count < PPC_EXCEPTION_STATE_COUNT) { - return KERN_INVALID_ARGUMENT; - } - - es = (struct ppc_exception_state *) tstate; - sv = genuser; /* Copy this over */ - - if(sv) { /* See if valid state yet */ - es->dar = (unsigned int)sv->save_dar; - es->dsisr = sv->save_dsisr; - es->exception = sv->save_exception; - } - else { /* Nope, not yet */ - es->dar = 0; - es->dsisr = 0; - es->exception = ((unsigned int *)&FloatInit)[0]; - } - - *count = PPC_EXCEPTION_STATE_COUNT; - return KERN_SUCCESS; - - case PPC_EXCEPTION_STATE64: - - if (*count < PPC_EXCEPTION_STATE64_COUNT) { - return KERN_INVALID_ARGUMENT; - } - - xes = (struct ppc_exception_state64 *) tstate; - sv = genuser; /* Copy this over */ - - if(sv) { /* See if valid state yet */ - xes->dar = sv->save_dar; - xes->dsisr = sv->save_dsisr; - xes->exception = sv->save_exception; - } - else { /* Nope, not yet */ - xes->dar = 0; - xes->dsisr = 0; - xes->exception = ((unsigned int *)&FloatInit)[0]; - } - - *count = PPC_EXCEPTION_STATE64_COUNT; - return KERN_SUCCESS; - - case PPC_FLOAT_STATE: - - if (*count < PPC_FLOAT_STATE_COUNT) { - return KERN_INVALID_ARGUMENT; - } - - fpu_save(thread->machine.curctx); /* Just in case it's live, save it */ - - fs = (struct ppc_float_state *) tstate; /* Point to destination */ - - fsv = 
find_user_fpu(thread); /* Get the user's fpu savearea */ - - if(fsv) { /* See if we have any */ - bcopy((char *)&fsv->save_fp0, (char *)fs, 32*8); /* 32 registers */ - fs->fpscr_pad = 0; /* Be clean and tidy */ - if(genuser) fs->fpscr = genuser->save_fpscr; /* Set the fpscr value to general */ - else fs->fpscr = 0; /* If no user, initialize this */ - } - else { /* No floating point yet */ - - for(i=0; i < 32; i++) { /* Initialize floating points */ - fs->fpregs[i] = FloatInit; /* Initial value */ - } - fs->fpscr_pad = 0; /* Initial value */ - fs->fpscr = 0; /* Initial value */ - } - - *count = PPC_FLOAT_STATE_COUNT; - - return KERN_SUCCESS; - - case PPC_VECTOR_STATE: - - if (*count < PPC_VECTOR_STATE_COUNT) { - return KERN_INVALID_ARGUMENT; - } - - vec_save(thread->machine.curctx); /* Just in case it's live, save it */ - - vs = (struct ppc_vector_state *) tstate; /* Point to destination */ - - vsv = find_user_vec(thread); /* Find the vector savearea */ - - if(vsv) { /* See if we have any */ - - vrvalidwrk = vsv->save_vrvalid; /* Get the valid flags */ - vs->save_vrvalid = vsv->save_vrvalid; /* Set the valid flags */ - if(genuser) for(j=0; j < 4; j++) vs->save_vscr[j] = genuser->save_vscr[j]; /* Set value for vscr */ - else { - vs->save_vscr[0] = 0; /* Set an initial value if no general user yet */ - vs->save_vscr[1] = 0; - vs->save_vscr[2] = 0; - vs->save_vscr[3] = 0x00010000; /* Always start with Java mode off */ - } - for(i=0; i < 32; i++) { /* Copy the saved registers and invalidate the others */ - for(j=0; j < 4; j++) { - if(vrvalidwrk & 0x80000000) (vs->save_vr)[i][j] = - ((unsigned int *)&(vsv->save_vr0))[(i * 4) + j]; /* We have this register saved */ - else vs->save_vr[i][j] = QNaNbarbarian[j]; /* Set invalid value */ - } - vrvalidwrk = vrvalidwrk << 1; /* Shift over to the next */ - } - } - else { /* No vector yet */ - - for(i=0; i < 32; i++) { /* Initialize vector registers */ - for(j=0; j < 4; j++) vs->save_vr[i][j] = QNaNbarbarian[j]; /* Initial value 
*/ - } - - if(genuser) for(j=0; j < 4; j++) vs->save_vscr[j] = genuser->save_vscr[j]; /* Set value for vscr */ - else { - vs->save_vscr[0] = 0; /* Set an initial value if no general user yet */ - vs->save_vscr[1] = 0; - vs->save_vscr[2] = 0; - vs->save_vscr[3] = 0x00010000; /* Always start with Java mode off */ - } - vs->save_vrvalid = 0; /* Clear the valid flags */ - } - - for (i=0; i < 4; i++) vs->save_pad5[i] = 0; /* Clear cruft */ - for (i=0; i < 7; i++) vs->save_pad6[i] = 0; /* Clear cruft */ - - *count = PPC_VECTOR_STATE_COUNT; - return KERN_SUCCESS; - - default: - return KERN_INVALID_ARGUMENT; - } -} -/* Close cousin of machine_thread_get_state(). - * This function is currently incomplete since we don't really need vector - * or FP for the core dump (the save area can be accessed directly if the - * user is so inclined). Also the function name is something of a misnomer, - * see the comment above find_kern_regs(). - */ - -kern_return_t -machine_thread_get_kern_state( - thread_t thread, - thread_flavor_t flavor, - thread_state_t tstate, - mach_msg_type_number_t *count) -{ - - register struct savearea *sv; /* Pointer to the context savearea */ - struct savearea *genkern; - int i; - - register struct ppc_thread_state *ts; - register struct ppc_thread_state64 *xts; - register struct ppc_exception_state *es; - register struct ppc_exception_state64 *xes; - - genkern = find_kern_regs(thread); - - switch (flavor) { - - case THREAD_STATE_FLAVOR_LIST: - - if (*count < 6) { - return (KERN_INVALID_ARGUMENT); - } - - tstate[0] = PPC_THREAD_STATE; - tstate[1] = PPC_FLOAT_STATE; - tstate[2] = PPC_EXCEPTION_STATE; - tstate[3] = PPC_VECTOR_STATE; - tstate[4] = PPC_THREAD_STATE64; - tstate[5] = PPC_EXCEPTION_STATE64; - *count = 6; - - return KERN_SUCCESS; - - case PPC_THREAD_STATE: - - if (*count < PPC_THREAD_STATE_COUNT) { /* Is the count ok? 
*/ - return KERN_INVALID_ARGUMENT; - } - - ts = (struct ppc_thread_state *) tstate; - - sv = genkern; /* Copy this over */ - - if(sv) { /* Is there a save area yet? */ - ts->r0 = (unsigned int)sv->save_r0; - ts->r1 = (unsigned int)sv->save_r1; - ts->r2 = (unsigned int)sv->save_r2; - ts->r3 = (unsigned int)sv->save_r3; - ts->r4 = (unsigned int)sv->save_r4; - ts->r5 = (unsigned int)sv->save_r5; - ts->r6 = (unsigned int)sv->save_r6; - ts->r7 = (unsigned int)sv->save_r7; - ts->r8 = (unsigned int)sv->save_r8; - ts->r9 = (unsigned int)sv->save_r9; - ts->r10 = (unsigned int)sv->save_r10; - ts->r11 = (unsigned int)sv->save_r11; - ts->r12 = (unsigned int)sv->save_r12; - ts->r13 = (unsigned int)sv->save_r13; - ts->r14 = (unsigned int)sv->save_r14; - ts->r15 = (unsigned int)sv->save_r15; - ts->r16 = (unsigned int)sv->save_r16; - ts->r17 = (unsigned int)sv->save_r17; - ts->r18 = (unsigned int)sv->save_r18; - ts->r19 = (unsigned int)sv->save_r19; - ts->r20 = (unsigned int)sv->save_r20; - ts->r21 = (unsigned int)sv->save_r21; - ts->r22 = (unsigned int)sv->save_r22; - ts->r23 = (unsigned int)sv->save_r23; - ts->r24 = (unsigned int)sv->save_r24; - ts->r25 = (unsigned int)sv->save_r25; - ts->r26 = (unsigned int)sv->save_r26; - ts->r27 = (unsigned int)sv->save_r27; - ts->r28 = (unsigned int)sv->save_r28; - ts->r29 = (unsigned int)sv->save_r29; - ts->r30 = (unsigned int)sv->save_r30; - ts->r31 = (unsigned int)sv->save_r31; - ts->cr = (unsigned int)sv->save_cr; - ts->xer = (unsigned int)sv->save_xer; - ts->lr = (unsigned int)sv->save_lr; - ts->ctr = (unsigned int)sv->save_ctr; - ts->srr0 = (unsigned int)sv->save_srr0; - ts->srr1 = (unsigned int)sv->save_srr1; - ts->mq = 0; /* MQ register (601 only) */ - ts->vrsave = (unsigned int)sv->save_vrsave; /* VRSAVE register (Altivec only) */ - } - else { /* No state yet. Save seemingly random values. 
*/ - - for(i=0; i < 32; i+=2) { /* Fill up with defaults */ - ((unsigned int *)&ts->r0)[i] = ((unsigned int *)&FloatInit)[0]; - ((unsigned int *)&ts->r0)[i+1] = ((unsigned int *)&FloatInit)[1]; - } - ts->cr = 0; - ts->xer = 0; - ts->lr = ((unsigned int *)&FloatInit)[0]; - ts->ctr = ((unsigned int *)&FloatInit)[1]; - ts->srr0 = ((unsigned int *)&FloatInit)[0]; - ts->srr1 = MSR_EXPORT_MASK_SET; - ts->mq = 0; - ts->vrsave = 0; /* VRSAVE register (Altivec only) */ - } - - *count = PPC_THREAD_STATE_COUNT; /* Pass back the amount we actually copied */ - return KERN_SUCCESS; - - - case PPC_THREAD_STATE64: - - if (*count < PPC_THREAD_STATE64_COUNT) { /* Is the count ok? */ - return KERN_INVALID_ARGUMENT; - } - - xts = (struct ppc_thread_state64 *) tstate; - - sv = genkern; /* Copy this over */ - - if(sv) { /* Is there a save area yet? */ - xts->r0 = sv->save_r0; - xts->r1 = sv->save_r1; - xts->r2 = sv->save_r2; - xts->r3 = sv->save_r3; - xts->r4 = sv->save_r4; - xts->r5 = sv->save_r5; - xts->r6 = sv->save_r6; - xts->r7 = sv->save_r7; - xts->r8 = sv->save_r8; - xts->r9 = sv->save_r9; - xts->r10 = sv->save_r10; - xts->r11 = sv->save_r11; - xts->r12 = sv->save_r12; - xts->r13 = sv->save_r13; - xts->r14 = sv->save_r14; - xts->r15 = sv->save_r15; - xts->r16 = sv->save_r16; - xts->r17 = sv->save_r17; - xts->r18 = sv->save_r18; - xts->r19 = sv->save_r19; - xts->r20 = sv->save_r20; - xts->r21 = sv->save_r21; - xts->r22 = sv->save_r22; - xts->r23 = sv->save_r23; - xts->r24 = sv->save_r24; - xts->r25 = sv->save_r25; - xts->r26 = sv->save_r26; - xts->r27 = sv->save_r27; - xts->r28 = sv->save_r28; - xts->r29 = sv->save_r29; - xts->r30 = sv->save_r30; - xts->r31 = sv->save_r31; - xts->cr = sv->save_cr; - xts->xer = sv->save_xer; - xts->lr = sv->save_lr; - xts->ctr = sv->save_ctr; - xts->srr0 = sv->save_srr0; - xts->srr1 = sv->save_srr1; - xts->vrsave = sv->save_vrsave; /* VRSAVE register (Altivec only) */ - } - else { /* No user state yet. Save seemingly random values. 
*/ - - for(i=0; i < 32; i++) { /* Fill up with defaults */ - ((unsigned long long *)&xts->r0)[i] = ((unsigned long long *)&FloatInit)[0]; - } - xts->cr = 0; - xts->xer = 0; - xts->lr = ((unsigned long long *)&FloatInit)[0]; - xts->ctr = ((unsigned long long *)&FloatInit)[0]; - xts->srr0 = ((unsigned long long *)&FloatInit)[0]; - xts->srr1 = MSR_EXPORT_MASK_SET; - xts->vrsave = 0; /* VRSAVE register (Altivec only) */ - } - - *count = PPC_THREAD_STATE64_COUNT; /* Pass back the amount we actually copied */ - return KERN_SUCCESS; - - case PPC_EXCEPTION_STATE: - - if (*count < PPC_EXCEPTION_STATE_COUNT) { - return KERN_INVALID_ARGUMENT; - } - - es = (struct ppc_exception_state *) tstate; - sv = genkern; /* Copy this over */ - - if(sv) { /* See if valid state yet */ - es->dar = (unsigned int)sv->save_dar; - es->dsisr = sv->save_dsisr; - es->exception = sv->save_exception; - } - else { /* Nope, not yet */ - es->dar = 0; - es->dsisr = 0; - es->exception = ((unsigned int *)&FloatInit)[0]; - } - - *count = PPC_EXCEPTION_STATE_COUNT; - return KERN_SUCCESS; - - case PPC_EXCEPTION_STATE64: - - if (*count < PPC_EXCEPTION_STATE64_COUNT) { - return KERN_INVALID_ARGUMENT; - } - - xes = (struct ppc_exception_state64 *) tstate; - sv = genkern; /* Copy this over */ - - if(sv) { /* See if valid state yet */ - xes->dar = sv->save_dar; - xes->dsisr = sv->save_dsisr; - xes->exception = sv->save_exception; - } - else { /* Nope, not yet */ - xes->dar = 0; - xes->dsisr = 0; - xes->exception = ((unsigned int *)&FloatInit)[0]; - } - - *count = PPC_EXCEPTION_STATE64_COUNT; - return KERN_SUCCESS; - - default: - return KERN_INVALID_ARGUMENT; - } -} - - -/* - * thread_setstatus: - * - * Set the status of the specified thread. 
- */ -kern_return_t -machine_thread_set_state( - thread_t thread, - thread_flavor_t flavor, - thread_state_t tstate, - mach_msg_type_number_t count) -{ - - struct savearea *genuser; - savearea_fpu *fsv, *fsvn, *fsvo; - savearea_vec *vsv, *vsvn, *vsvo; - unsigned int i; - unsigned int clgn; - register struct ppc_thread_state *ts; - register struct ppc_thread_state64 *xts; - register struct ppc_exception_state *es; - register struct ppc_exception_state *xes; - register struct ppc_float_state *fs; - register struct ppc_vector_state *vs; - -// dbgTrace((unsigned int)thr_act, (unsigned int)0 /*sv: was never set*/, flavor); /* (TEST/DEBUG) */ - - clgn = count; /* Get the count */ - - switch (flavor) { /* Validate the count before we do anything else */ - case PPC_THREAD_STATE: - - if (clgn < PPC_THREAD_STATE_COUNT) { /* Is it too short? */ - return KERN_INVALID_ARGUMENT; /* Yeah, just leave... */ - } - break; - - case PPC_THREAD_STATE64: - - if (clgn < PPC_THREAD_STATE64_COUNT) { /* Is it too short? */ - return KERN_INVALID_ARGUMENT; /* Yeah, just leave... */ - } - break; - - case PPC_EXCEPTION_STATE: - - if (clgn < PPC_EXCEPTION_STATE_COUNT) { /* Is it too short? */ - return KERN_INVALID_ARGUMENT; /* Yeah, just leave... */ - } - - case PPC_EXCEPTION_STATE64: - - if (clgn < PPC_EXCEPTION_STATE64_COUNT) { /* Is it too short? */ - return KERN_INVALID_ARGUMENT; /* Yeah, just leave... */ - } - - break; - - case PPC_FLOAT_STATE: - - if (clgn < PPC_FLOAT_STATE_COUNT) { /* Is it too short? */ - return KERN_INVALID_ARGUMENT; /* Yeah, just leave... */ - } - - break; - - - case PPC_VECTOR_STATE: - - if (clgn < PPC_VECTOR_STATE_COUNT) { /* Is it too short? */ - return KERN_INVALID_ARGUMENT; /* Yeah, just leave... 
*/ - } - - break; - - default: - return KERN_INVALID_ARGUMENT; - } - - genuser = get_user_regs(thread); /* Find or allocate and initialize one */ - - switch (flavor) { - - case PPC_THREAD_STATE: - - ts = (struct ppc_thread_state *)tstate; - - genuser->save_r0 = (uint64_t)ts->r0; - genuser->save_r1 = (uint64_t)ts->r1; - genuser->save_r2 = (uint64_t)ts->r2; - genuser->save_r3 = (uint64_t)ts->r3; - genuser->save_r4 = (uint64_t)ts->r4; - genuser->save_r5 = (uint64_t)ts->r5; - genuser->save_r6 = (uint64_t)ts->r6; - genuser->save_r7 = (uint64_t)ts->r7; - genuser->save_r8 = (uint64_t)ts->r8; - genuser->save_r9 = (uint64_t)ts->r9; - genuser->save_r10 = (uint64_t)ts->r10; - genuser->save_r11 = (uint64_t)ts->r11; - genuser->save_r12 = (uint64_t)ts->r12; - genuser->save_r13 = (uint64_t)ts->r13; - genuser->save_r14 = (uint64_t)ts->r14; - genuser->save_r15 = (uint64_t)ts->r15; - genuser->save_r16 = (uint64_t)ts->r16; - genuser->save_r17 = (uint64_t)ts->r17; - genuser->save_r18 = (uint64_t)ts->r18; - genuser->save_r19 = (uint64_t)ts->r19; - genuser->save_r20 = (uint64_t)ts->r20; - genuser->save_r21 = (uint64_t)ts->r21; - genuser->save_r22 = (uint64_t)ts->r22; - genuser->save_r23 = (uint64_t)ts->r23; - genuser->save_r24 = (uint64_t)ts->r24; - genuser->save_r25 = (uint64_t)ts->r25; - genuser->save_r26 = (uint64_t)ts->r26; - genuser->save_r27 = (uint64_t)ts->r27; - genuser->save_r28 = (uint64_t)ts->r28; - genuser->save_r29 = (uint64_t)ts->r29; - genuser->save_r30 = (uint64_t)ts->r30; - genuser->save_r31 = (uint64_t)ts->r31; - - genuser->save_cr = ts->cr; - genuser->save_xer = (uint64_t)ts->xer; - genuser->save_lr = (uint64_t)ts->lr; - genuser->save_ctr = (uint64_t)ts->ctr; - genuser->save_srr0 = (uint64_t)ts->srr0; - genuser->save_vrsave = ts->vrsave; /* VRSAVE register (Altivec only) */ - - genuser->save_srr1 = MSR_PREPARE_FOR_IMPORT(genuser->save_srr1, ts->srr1); /* Set the bits we can change */ - - genuser->save_srr1 |= MSR_EXPORT_MASK_SET; - - genuser->save_srr1 &= 
~(MASK(MSR_FP) | MASK(MSR_VEC)); /* Make sure we don't enable the floating point unit */ - - if(task_has_64BitAddr(thread->task)) - genuser->save_srr1 |= (uint64_t)MASK32(MSR_SF) << 32; /* If 64-bit task, force 64-bit mode */ - else - genuser->save_srr1 &= ~((uint64_t)MASK32(MSR_SF) << 32); /* else 32-bit mode */ - - return KERN_SUCCESS; - - - case PPC_THREAD_STATE64: - - xts = (struct ppc_thread_state64 *)tstate; - - genuser->save_r0 = xts->r0; - genuser->save_r1 = xts->r1; - genuser->save_r2 = xts->r2; - genuser->save_r3 = xts->r3; - genuser->save_r4 = xts->r4; - genuser->save_r5 = xts->r5; - genuser->save_r6 = xts->r6; - genuser->save_r7 = xts->r7; - genuser->save_r8 = xts->r8; - genuser->save_r9 = xts->r9; - genuser->save_r10 = xts->r10; - genuser->save_r11 = xts->r11; - genuser->save_r12 = xts->r12; - genuser->save_r13 = xts->r13; - genuser->save_r14 = xts->r14; - genuser->save_r15 = xts->r15; - genuser->save_r16 = xts->r16; - genuser->save_r17 = xts->r17; - genuser->save_r18 = xts->r18; - genuser->save_r19 = xts->r19; - genuser->save_r20 = xts->r20; - genuser->save_r21 = xts->r21; - genuser->save_r22 = xts->r22; - genuser->save_r23 = xts->r23; - genuser->save_r24 = xts->r24; - genuser->save_r25 = xts->r25; - genuser->save_r26 = xts->r26; - genuser->save_r27 = xts->r27; - genuser->save_r28 = xts->r28; - genuser->save_r29 = xts->r29; - genuser->save_r30 = xts->r30; - genuser->save_r31 = xts->r31; - - genuser->save_cr = xts->cr; - genuser->save_xer = xts->xer; - genuser->save_lr = xts->lr; - genuser->save_ctr = xts->ctr; - genuser->save_srr0 = xts->srr0; - genuser->save_vrsave = xts->vrsave; /* VRSAVE register (Altivec only) */ - - genuser->save_srr1 = MSR_PREPARE_FOR_IMPORT(genuser->save_srr1, xts->srr1); /* Set the bits we can change */ - - genuser->save_srr1 |= MSR_EXPORT_MASK_SET; - - genuser->save_srr1 &= ~(MASK(MSR_FP) | MASK(MSR_VEC)); /* Make sure we don't enable the floating point unit */ - - if(task_has_64BitAddr(thread->task)) - genuser->save_srr1 |= 
(uint64_t)MASK32(MSR_SF) << 32; /* If 64-bit task, force 64-bit mode */ - else - genuser->save_srr1 &= ~((uint64_t)MASK32(MSR_SF) << 32); /* else 32-bit mode */ - - return KERN_SUCCESS; - - - case PPC_EXCEPTION_STATE: - - es = (struct ppc_exception_state *) tstate; - - genuser->save_dar = (uint64_t)es->dar; - genuser->save_dsisr = es->dsisr; - genuser->save_exception = es->exception; - - return KERN_SUCCESS; - -/* - * It's pretty worthless to try to change this stuff, but we'll do it anyway. - */ - - case PPC_EXCEPTION_STATE64: - - xes = (struct ppc_exception_state *) tstate; - - genuser->save_dar = xes->dar; - genuser->save_dsisr = xes->dsisr; - genuser->save_exception = xes->exception; - - return KERN_SUCCESS; - - case PPC_FLOAT_STATE: - - toss_live_fpu(thread->machine.curctx); /* Toss my floating point if live anywhere */ - - fsv = find_user_fpu(thread); /* Get the user's floating point context */ - - if(!fsv) { /* Do we have one yet? */ - fsv = (savearea_fpu *)save_alloc(); /* If we still don't have one, get a new one */ - fsv->save_hdr.save_flags = (fsv->save_hdr.save_flags & ~SAVtype) | (SAVfloat << SAVtypeshft); /* Mark as in use as float */ - fsv->save_hdr.save_act = thread; - fsv->save_hdr.save_prev = 0; /* Mark no more */ - fsv->save_hdr.save_level = 0; /* Mark user state */ - - if(!thread->machine.curctx->FPUsave) thread->machine.curctx->FPUsave = fsv; /* If no floating point, chain us first */ - else { - - fsvn = fsvo = thread->machine.curctx->FPUsave; /* Remember first one */ - - while (fsvn) { /* Go until we hit the end */ - fsvo = fsvn; /* Remember the previous one */ - fsvn = CAST_DOWN(savearea_fpu *, fsvo->save_hdr.save_prev); /* Skip on to the next */ - } - - fsvo->save_hdr.save_prev = (addr64_t)((uintptr_t)fsv); /* Queue us on in */ - } - - } - - fs = (struct ppc_float_state *) tstate; /* Point to source */ - - - bcopy((char *)fs, (char *)&fsv->save_fp0, 32*8); /* Move in the 32 registers */ - - genuser->save_fpscr = fs->fpscr; /* Copy the fpscr 
value to normal */ - - return KERN_SUCCESS; - - - case PPC_VECTOR_STATE: - - toss_live_vec(thread->machine.curctx); /* Toss my vector if live anywhere */ - - vsv = find_user_vec(thread); /* Get the user's vector context */ - - if(!vsv) { /* Do we have one yet? */ - vsv = (savearea_vec *)save_alloc(); /* If we still don't have one, get a new one */ - vsv->save_hdr.save_flags = (vsv->save_hdr.save_flags & ~SAVtype) | (SAVvector << SAVtypeshft); /* Mark as in use as vector */ - vsv->save_hdr.save_act = thread; - vsv->save_hdr.save_prev = 0; /* Mark no more */ - vsv->save_hdr.save_level = 0; /* Mark user state */ - - if(!thread->machine.curctx->VMXsave) thread->machine.curctx->VMXsave = vsv; /* If no vector, chain us first */ - else { - - vsvn = vsvo = thread->machine.curctx->VMXsave; /* Remember first one */ - - while (vsvn) { /* Go until we hit the end */ - vsvo = vsvn; /* Remember the previous one */ - vsvn = CAST_DOWN(savearea_vec *, vsvo->save_hdr.save_prev); /* Skip on to the next */ - } - - vsvo->save_hdr.save_prev = (addr64_t)((uintptr_t)vsv); /* Queue us on in */ - } - - } - - vs = (struct ppc_vector_state *) tstate; /* Point to source */ - - bcopy((char *)vs, (char *)&vsv->save_vr0, 32*16); /* 32 registers plus status and validity and pad */ - vsv->save_vrvalid = vs->save_vrvalid; /* Set validity bits */ - - - for(i = 0; i < 4; i++) genuser->save_vscr[i] = vs->save_vscr[i]; /* Set value for vscr */ - - return KERN_SUCCESS; - - - default: - return KERN_INVALID_ARGUMENT; - } -} - - -void -thread_set_wq_state64(thread_t thread, thread_state_t tstate) -{ - struct ppc_thread_state64 *ts; - struct savearea *genuser; - thread_t curth = current_thread(); - - genuser = get_user_regs(thread); /* Find or allocate and initialize one */ - ts = (struct ppc_thread_state64 *)tstate; - - if (curth != thread) - thread_lock(thread); - - genuser->save_r1 = ts->r1; - genuser->save_r3 = ts->r3; - genuser->save_r4 = ts->r4; - genuser->save_r5 = ts->r5; - genuser->save_r6 = ts->r6; 
- genuser->save_r7 = ts->r7; - genuser->save_r8 = ts->r8; - genuser->save_srr0 = ts->srr0; - - genuser->save_srr1 = (uint64_t)MSR_EXPORT_MASK_SET; - - if (task_has_64BitAddr(thread->task)) - genuser->save_srr1 |= (uint64_t)MASK32(MSR_SF) << 32; /* If 64-bit task, force 64-bit mode */ - - if (curth != thread) - thread_unlock(thread); -} - - -/* - * This is where registers that are not normally specified by the mach-o - * file on an execve should be nullified, perhaps to avoid a covert channel. - * We've never bothered to clear FPRs or VRs, but it is important to clear - * the FPSCR, which is kept in the general state but not set by the general - * flavor (ie, PPC_THREAD_STATE or PPC_THREAD_STATE64.) - */ -kern_return_t -machine_thread_state_initialize( - thread_t thread) -{ - struct savearea *sv; - - sv = get_user_regs(thread); /* Find or allocate and initialize one */ - - sv->save_fpscr = 0; /* Clear all floating point exceptions */ - sv->save_vrsave = 0; /* Set the vector save state */ - sv->save_vscr[0] = 0x00000000; - sv->save_vscr[1] = 0x00000000; - sv->save_vscr[2] = 0x00000000; - sv->save_vscr[3] = 0x00010000; /* Disable java mode and clear saturated */ - - return KERN_SUCCESS; -} - - -/* - * Duplicates the context of one thread into a new one. - * The new thread is assumed to be new and have no user state contexts except maybe a general one. - * We also assume that the old thread can't be running anywhere. - * - * We're only going to be duplicating user context here. That means that we will have to - * eliminate any floating point or vector kernel contexts and carry across the user state ones. 
- */ - -kern_return_t -machine_thread_dup( - thread_t self, - thread_t target) -{ - struct savearea *sv, *osv; - savearea_fpu *fsv, *fsvn; - savearea_vec *vsv, *vsvn; - - fpu_save(self->machine.curctx); /* Make certain floating point state is all saved */ - vec_save(self->machine.curctx); /* Make certain the vector state is all saved */ - - sv = get_user_regs(target); /* Allocate and initialze context in the new activation */ - - osv = find_user_regs(self); /* Find the original context */ - if(!osv) - return (KERN_FAILURE); - - bcopy((char *)((unsigned int)osv + sizeof(savearea_comm)), /* Copy everything but the headers */ - (char *)((unsigned int)sv + sizeof(savearea_comm)), - sizeof(struct savearea) - sizeof(savearea_comm)); - - sv->save_srr1 &= (uint64_t)(~(MASK(MSR_FP) | MASK(MSR_VEC))); /* Make certain that floating point and vector are turned off */ - - fsv = find_user_fpu(self); /* Get any user floating point */ - - target->machine.curctx->FPUsave = NULL; /* Assume no floating point */ - - if(fsv) { /* Did we find one? */ - fsvn = (savearea_fpu *)save_alloc(); /* If we still don't have one, get a new one */ - fsvn->save_hdr.save_flags = (fsvn->save_hdr.save_flags & ~SAVtype) | (SAVfloat << SAVtypeshft); /* Mark as in use as float */ - fsvn->save_hdr.save_act = target; - fsvn->save_hdr.save_prev = 0; /* Mark no more */ - fsvn->save_hdr.save_level = 0; /* Mark user state */ - - target->machine.curctx->FPUsave = fsvn; /* Chain in the floating point */ - - bcopy((char *)((unsigned int)fsv + sizeof(savearea_comm)), /* Copy everything but the headers */ - (char *)((unsigned int)fsvn + sizeof(savearea_comm)), - sizeof(struct savearea) - sizeof(savearea_comm)); - } - - vsv = find_user_vec(self); /* Get any user vector */ - - target->machine.curctx->VMXsave = NULL; /* Assume no vector */ - - if(vsv) { /* Did we find one? 
*/ - vsvn = (savearea_vec *)save_alloc(); /* If we still don't have one, get a new one */ - vsvn->save_hdr.save_flags = (vsvn->save_hdr.save_flags & ~SAVtype) | (SAVvector << SAVtypeshft); /* Mark as in use as float */ - vsvn->save_hdr.save_act = target; - vsvn->save_hdr.save_prev = 0; /* Mark no more */ - vsvn->save_hdr.save_level = 0; /* Mark user state */ - - target->machine.curctx->VMXsave = vsvn; /* Chain in the floating point */ - - bcopy((char *)((unsigned int)vsv + sizeof(savearea_comm)), /* Copy everything but the headers */ - (char *)((unsigned int)vsvn + sizeof(savearea_comm)), - sizeof(struct savearea) - sizeof(savearea_comm)); - } - - return (KERN_SUCCESS); -} - -/* - * Initializes a fresh set of user state values. If there is no user state context, - * one is created. Floats and VMX are not created. - * - * We only set initial values if there was no context found. - */ - -struct savearea * -get_user_regs( - thread_t thread) -{ - struct savearea *sv, *osv; - unsigned int i; - - if (thread->machine.upcb) - return thread->machine.upcb; - - sv = thread->machine.pcb; /* Get the top savearea on the stack */ - osv = NULL; /* Set no user savearea yet */ - - while(sv) { /* Find the user context */ - osv = sv; /* Save the last one */ - sv = CAST_DOWN(struct savearea *, sv->save_hdr.save_prev); /* Get the previous context */ - } - - sv = save_alloc(); /* Get one */ - sv->save_hdr.save_flags = (sv->save_hdr.save_flags & ~SAVtype) | (SAVgeneral << SAVtypeshft); /* Mark as in use as general */ - sv->save_hdr.save_act = thread; - sv->save_hdr.save_prev = 0; /* Mark no more */ - sv->save_hdr.save_level = 0; /* Mark user state */ - - if(osv) { /* Did we already have one? 
*/ - osv->save_hdr.save_prev = (addr64_t)((uintptr_t)sv); /* Chain us on the end */ - } - else { /* We are the first */ - thread->machine.pcb = sv; /* Put it there */ - } - thread->machine.upcb = sv; /* Set user pcb */ - - for(i=0; i < 32; i+=2) { /* Fill up with defaults */ - ((unsigned int *)&sv->save_r0)[i] = ((unsigned int *)&FloatInit)[0]; - ((unsigned int *)&sv->save_r0)[i+1] = ((unsigned int *)&FloatInit)[1]; - } - sv->save_cr = 0; - sv->save_xer = 0; - sv->save_lr = (uint64_t)FloatInit; - sv->save_ctr = (uint64_t)FloatInit; - sv->save_srr0 = (uint64_t)FloatInit; - sv->save_srr1 = (uint64_t)MSR_EXPORT_MASK_SET; - if(task_has_64BitAddr(thread->task)) - sv->save_srr1 |= (uint64_t)MASK32(MSR_SF) << 32; /* If 64-bit task, force 64-bit mode */ - - sv->save_fpscr = 0; /* Clear all floating point exceptions */ - - sv->save_vrsave = 0; /* Set the vector save state */ - sv->save_vscr[0] = 0x00000000; - sv->save_vscr[1] = 0x00000000; - sv->save_vscr[2] = 0x00000000; - sv->save_vscr[3] = 0x00010000; /* Disable java mode and clear saturated */ - - return sv; /* Bye bye... */ -} - -/* - * Find the user state context. If there is no user state context, - * we just return a 0. - */ - -struct savearea * -find_user_regs( - thread_t thread) -{ - return thread->machine.upcb; -} - -/* The name of this call is something of a misnomer since the mact.pcb can - * contain chained saveareas, but it will do for now.. - */ -struct savearea * -find_kern_regs( - thread_t thread) -{ - return thread->machine.pcb; -} - -/* - * Find the user state floating point context. If there is no user state context, - * we just return a 0. - */ - -savearea_fpu * -find_user_fpu( - thread_t thread) -{ - savearea_fpu *fsv; - boolean_t intr; - - intr = ml_set_interrupts_enabled(FALSE); - fsv = thread->machine.curctx->FPUsave; /* Get the start of the floating point chain */ - - while(fsv) { /* Look until the end or we find it */ - if(!(fsv->save_hdr.save_level)) break; /* Is the the user state stuff? 
(the level is 0 if so) */ - fsv = CAST_DOWN(savearea_fpu *, fsv->save_hdr.save_prev); /* Try the previous one */ - } - (void) ml_set_interrupts_enabled(intr); - - return fsv; /* Bye bye... */ -} - -/* - * Find the user state vector context. If there is no user state context, - * we just return a 0. - */ - -savearea_vec * -find_user_vec( - thread_t thread) -{ - savearea_vec *vsv; - boolean_t intr; - - intr = ml_set_interrupts_enabled(FALSE); - vsv = thread->machine.curctx->VMXsave; /* Get the start of the vector chain */ - - while(vsv) { /* Look until the end or we find it */ - if(!(vsv->save_hdr.save_level)) break; /* Is the the user state stuff? (the level is 0 if so) */ - vsv = CAST_DOWN(savearea_vec *, vsv->save_hdr.save_prev); /* Try the previous one */ - } - (void) ml_set_interrupts_enabled(intr); - - return vsv; /* Bye bye... */ -} -/* - * Find the user state vector context for the current thread. If there is no user state context, - * we just return a 0. - */ - -savearea_vec *find_user_vec_curr(void) { - - savearea_vec *vsv; - thread_t thread = current_thread(); - boolean_t intr; - - vec_save(thread->machine.curctx); /* Force save if live */ - - intr = ml_set_interrupts_enabled(FALSE); - vsv = thread->machine.curctx->VMXsave; /* Get the start of the vector chain */ - - while(vsv) { /* Look until the end or we find it */ - if(!(vsv->save_hdr.save_level)) break; /* Is the the user state stuff? (the level is 0 if so) */ - vsv = CAST_DOWN(savearea_vec *, vsv->save_hdr.save_prev); /* Try the previous one */ - } - (void) ml_set_interrupts_enabled(intr); - - return vsv; /* Bye bye... */ -} - -/* - * thread_userstack: - * - * Return the user stack pointer from the machine - * dependent thread state info. - */ -kern_return_t -thread_userstack( - __unused thread_t thread, - int flavor, - thread_state_t tstate, - unsigned int count, - mach_vm_offset_t *user_stack, - int *customstack -) -{ - /* - * Set a default. 
- */ - - switch (flavor) { - case PPC_THREAD_STATE: - { - struct ppc_thread_state *state; - - if (count < PPC_THREAD_STATE_COUNT) - return (KERN_INVALID_ARGUMENT); - - state = (struct ppc_thread_state *) tstate; - - /* - * If a valid user stack is specified, use it. - */ - if (state->r1) { - *user_stack = CAST_USER_ADDR_T(state->r1); - if (customstack) - *customstack = 1; - } else { - *user_stack = CAST_USER_ADDR_T(USRSTACK); - if (customstack) - *customstack = 0; - } - } - break; - - case PPC_THREAD_STATE64: - { - struct ppc_thread_state64 *state64; - - if (count < PPC_THREAD_STATE64_COUNT) - return (KERN_INVALID_ARGUMENT); - - state64 = (struct ppc_thread_state64 *)tstate; - - /* - * If a valid user stack is specified, use it. - */ - if (state64->r1 != MACH_VM_MIN_ADDRESS) { - *user_stack = state64->r1; - if (customstack) - *customstack = 1; - } else { - *user_stack = USRSTACK64; - if (customstack) - *customstack = 0; - } - } - break; - - default : - return (KERN_INVALID_ARGUMENT); - } - - return (KERN_SUCCESS); -} - - -/* - * thread_setuserstack: - * - * Sets the user stack pointer into the machine - * dependent thread state info. - */ -void -thread_setuserstack(thread_t thread, mach_vm_address_t user_stack) -{ - struct savearea *sv; - - sv = get_user_regs(thread); /* Get the user state registers */ - - sv->save_r1 = user_stack; - - return; -} - -void -thread_set_cthreadself(thread_t thread, uint64_t pself, int isLP64) -{ - struct savearea *sv; - - if (isLP64 == 0) { - thread->machine.cthread_self = pself; - } else { - sv = get_user_regs(thread); /* Get the user state registers */ - - thread->machine.cthread_self = pself; - sv->save_r13 = pself; - } -} - - -/* - * thread_adjuserstack: - * - * Returns the adjusted user stack pointer from the machine - * dependent thread state info. Usef for small (<2G) deltas. 
- */ -uint64_t -thread_adjuserstack(thread_t thread, int adjust) -{ - struct savearea *sv; - - sv = get_user_regs(thread); /* Get the user state registers */ - - sv->save_r1 += adjust; /* Adjust the stack */ - - return sv->save_r1; /* Return the adjusted stack */ - -} - -kern_return_t -thread_setsinglestep(thread_t thread, int on) -{ - struct savearea *sv; - - sv = get_user_regs(thread); /* Get the user state registers */ - - if (on) - sv->save_srr1 |= MASK(MSR_SE); - else - sv->save_srr1 &= ~MASK(MSR_SE); - - return (KERN_SUCCESS); -} - -/* - * thread_setentrypoint: - * - * Sets the user PC into the machine - * dependent thread state info. - */ - -void -thread_setentrypoint(thread_t thread, uint64_t entry) -{ - struct savearea *sv; - - sv = get_user_regs(thread); /* Get the user state registers */ - - sv->save_srr0 = entry; -} - -kern_return_t -thread_entrypoint( - __unused thread_t thread, - int flavor, - thread_state_t tstate, - unsigned int count, - mach_vm_offset_t *entry_point -) -{ -#if 0 - /* Silly code: "if *entry_point is 0, make it 0" */ - /* - * Set a default. - */ - if (*entry_point == 0ULL) - *entry_point = MACH_VM_MIN_ADDRESS; -#endif - - switch (flavor) { - case PPC_THREAD_STATE: - { - struct ppc_thread_state *state; - - if (count < PPC_THREAD_STATE_COUNT) - return (KERN_INVALID_ARGUMENT); - - state = (struct ppc_thread_state *) tstate; - - /* - * If a valid entry point is specified, use it. - */ - if (state->srr0) { - *entry_point = CAST_USER_ADDR_T(state->srr0); - } else { - *entry_point = CAST_USER_ADDR_T(VM_MIN_ADDRESS); - } - } - break; - - case PPC_THREAD_STATE64: - { - struct ppc_thread_state64 *state64; - - if (count < PPC_THREAD_STATE_COUNT) - return (KERN_INVALID_ARGUMENT); - - state64 = (struct ppc_thread_state64 *)tstate; - - /* - * If a valid entry point is specified, use it. 
- */ - if (state64->srr0) { - *entry_point = state64->srr0; - } else { - *entry_point = MACH_VM_MIN_ADDRESS; - } - } - break; - - default: - return (KERN_INVALID_ARGUMENT); - } - - return (KERN_SUCCESS); -} - -unsigned int get_msr_exportmask(void) -{ - return (MSR_EXPORT_MASK_SET); -} - -unsigned int get_msr_nbits(void) -{ - return (MASK(MSR_POW)|MASK(MSR_ILE)|MASK(MSR_IP)|MASK(MSR_LE)); -} -unsigned int get_msr_rbits(void) -{ - return (MASK(MSR_PR)|MASK(MSR_ME)|MASK(MSR_IR)|MASK(MSR_DR)|MASK(MSR_EE)); -} - -void ppc_checkthreadstate(void * tsptr, int flavor) -{ - if (flavor == PPC_THREAD_STATE64) { - struct ppc_thread_state64 *ts64 =(struct ppc_thread_state64 *)tsptr; - - /* Make sure naughty bits are off and necessary bits are on */ - ts64->srr1 &= ~(MASK(MSR_POW)|MASK(MSR_ILE)|MASK(MSR_IP)|MASK(MSR_LE)); - ts64->srr1 |= (MASK(MSR_PR)|MASK(MSR_ME)|MASK(MSR_IR)|MASK(MSR_DR)|MASK(MSR_EE)); - } else { - struct ppc_thread_state *ts =(struct ppc_thread_state *)tsptr; - - /* Make sure naughty bits are off and necessary bits are on */ - ts->srr1 &= ~(MASK(MSR_POW)|MASK(MSR_ILE)|MASK(MSR_IP)|MASK(MSR_LE)); - ts->srr1 |= (MASK(MSR_PR)|MASK(MSR_ME)|MASK(MSR_IR)|MASK(MSR_DR)|MASK(MSR_EE)); - } - return; -} - -void -thread_set_child( - thread_t child, - int pid) -{ - struct savearea *child_state; - - child_state = get_user_regs(child); - - child_state->save_r3 = (uint_t)pid; - child_state->save_r4 = 1ULL; -} -void -thread_set_parent( - thread_t parent, - int pid) -{ - struct savearea *parent_state; - - parent_state = get_user_regs(parent); - - parent_state->save_r3 = (uint64_t)pid; - parent_state->save_r4 = 0; -} - -/* - * Saves the complete context (general, floating point, and vector) of the current activation. - * We will collect everything into an opaque block of 1 to 3 saveareas and pass back a - * pointer to that. - * - * The savearea is made to look like it belongs to the source activation. 
This needs to - * be adjusted when these contexts are attached to a new activation. - * - */ - -void *act_thread_csave(void) { - - struct savearea *sv, *osv; - savearea_fpu *fsv, *ofsv; - savearea_vec *vsv, *ovsv; - - thread_t thread; - - thread = current_thread(); - - fpu_save(thread->machine.curctx); /* Make certain floating point state is all saved */ - vec_save(thread->machine.curctx); /* Make certain the vector state is all saved */ - - osv = find_user_regs(thread); /* Get our savearea */ - - if(!osv) { - panic("act_thread_csave: attempting to preserve the context of an activation with none (%p)\n", thread); - } - - sv = save_alloc(); /* Get a fresh save area to save into */ - sv->save_hdr.save_flags = (sv->save_hdr.save_flags & ~SAVtype) | (SAVgeneral << SAVtypeshft); /* Mark as in use as general */ - sv->save_hdr.save_act = thread; - sv->save_hdr.save_prev = 0; /* Mark no more */ - sv->save_hdr.save_level = 0; /* Mark user state */ - - - bcopy((char *)((unsigned int)osv + sizeof(savearea_comm)), /* Copy everything but the headers */ - (char *)((unsigned int)sv + sizeof(savearea_comm)), - sizeof(struct savearea) - sizeof(savearea_comm)); - - sv->save_srr1 &= (uint64_t)(~(MASK(MSR_FP) | MASK(MSR_VEC))); /* Make certain that floating point and vector are turned off */ - - sv->save_hdr.save_misc2 = 0xDEBB1ED0; /* Eye catcher for debug */ - sv->save_hdr.save_misc3 = 0xE5DA11A5; /* Eye catcher for debug */ - - - ofsv = find_user_fpu(thread); /* Get any user floating point */ - - sv->save_hdr.save_misc0 = 0; /* Assume no floating point */ - - if(ofsv) { /* Did we find one? 
*/ - fsv = (savearea_fpu *)save_alloc(); /* If we still don't have one, get a new one */ - fsv->save_hdr.save_flags = (fsv->save_hdr.save_flags & ~SAVtype) | (SAVfloat << SAVtypeshft); /* Mark as in use as float */ - fsv->save_hdr.save_act = thread; - fsv->save_hdr.save_prev = 0; /* Mark no more */ - fsv->save_hdr.save_level = 0; /* Mark user state */ - fsv->save_hdr.save_misc2 = 0xDEBB1ED0; /* Eye catcher for debug */ - fsv->save_hdr.save_misc3 = 0xE5DA11A5; /* Eye catcher for debug */ - - sv->save_hdr.save_misc0 = (uint64_t)((uintptr_t)fsv); /* Remember this one */ - - bcopy((char *)((unsigned int)ofsv + sizeof(savearea_comm)), /* Copy everything but the headers */ - (char *)((unsigned int)fsv + sizeof(savearea_comm)), - sizeof(struct savearea) - sizeof(savearea_comm)); - } - - ovsv = find_user_vec(thread); /* Get any user vector */ - - sv->save_hdr.save_misc1 = 0; /* Assume no vector */ - - if(ovsv) { /* Did we find one? */ - vsv = (savearea_vec *)save_alloc(); /* If we still don't have one, get a new one */ - vsv->save_hdr.save_flags = (vsv->save_hdr.save_flags & ~SAVtype) | (SAVvector << SAVtypeshft); /* Mark as in use as float */ - vsv->save_hdr.save_act = thread; - vsv->save_hdr.save_prev = 0; /* Mark no more */ - vsv->save_hdr.save_level = 0; /* Mark user state */ - vsv->save_hdr.save_misc2 = 0xDEBB1ED0; /* Eye catcher for debug */ - vsv->save_hdr.save_misc3 = 0xE5DA11A5; /* Eye catcher for debug */ - - sv->save_hdr.save_misc1 = (uint64_t)((uintptr_t)vsv); /* Chain in the floating point */ - - bcopy((char *)((unsigned int)ovsv + sizeof(savearea_comm)), /* Copy everything but the headers */ - (char *)((unsigned int)vsv + sizeof(savearea_comm)), - sizeof(struct savearea) - sizeof(savearea_comm)); - } - - return (void *)sv; /* Bye bye... */ -} - - - -/* - * Attaches saved user state context to an activation. We will replace any - * user state context with what is passed in. 
The saved context consists of a - * savearea that was setup by - * We will collect everything into one savearea and pass that back. - * - * The savearea is made to look like it belongs to the source activation. This needs to - * be adjusted when these contexts are attached to a new activation. - * - */ - -void act_thread_catt(void *ctx) { - - struct savearea *sv, *osv, *psv; - savearea_fpu *fsv, *ofsv, *pfsv; - savearea_vec *vsv, *ovsv, *pvsv; - unsigned int spc; - thread_t thread; - - sv = (struct savearea *)ctx; /* Make this easier for C */ - - fsv = CAST_DOWN(savearea_fpu *, sv->save_hdr.save_misc0); /* Get a possible floating point savearea */ - vsv = CAST_DOWN(savearea_vec *, sv->save_hdr.save_misc1); /* Get a possible vector savearea */ - - if((sv->save_hdr.save_misc2 != 0xDEBB1ED0) || (sv->save_hdr.save_misc3 != 0xE5DA11A5)) { /* See if valid savearea */ - panic("act_thread_catt: attempt to attach invalid general context savearea - %p\n", sv); /* Die */ - } - - if(fsv && ((fsv->save_hdr.save_misc2 != 0xDEBB1ED0) || (fsv->save_hdr.save_misc3 != 0xE5DA11A5))) { /* See if valid savearea */ - panic("act_thread_catt: attempt to attach invalid float context savearea - %p\n", fsv); /* Die */ - } - - if(vsv && ((vsv->save_hdr.save_misc2 != 0xDEBB1ED0) || (vsv->save_hdr.save_misc3 != 0xE5DA11A5))) { /* See if valid savearea */ - panic("act_thread_catt: attempt to attach invalid vector context savearea - %p\n", vsv); /* Die */ - } - - thread = current_thread(); - - act_machine_sv_free(thread, 0); /* Blow away any current kernel FP or vector. 
- We do not support those across a vfork */ - toss_live_fpu(thread->machine.curctx); /* Toss my floating point if live anywhere */ - toss_live_vec(thread->machine.curctx); /* Toss my vector if live anywhere */ - - sv->save_hdr.save_misc2 = 0; /* Eye catcher for debug */ - sv->save_hdr.save_misc3 = 0; /* Eye catcher for debug */ - sv->save_hdr.save_act = thread; - - spc = (unsigned int)thread->map->pmap->space; /* Get the space we're in */ - - osv = thread->machine.pcb; /* Get the top general savearea */ - psv = NULL; - while(osv) { /* Any saved state? */ - if(osv->save_srr1 & MASK(MSR_PR)) break; /* Leave if this is user state */ - psv = osv; /* Save previous savearea address */ - osv = CAST_DOWN(struct savearea *, osv->save_hdr.save_prev); /* Get one underneath our's */ - } - - if(osv) { /* Did we find one? */ - if(psv) psv->save_hdr.save_prev = 0; /* Yes, clear pointer to it (it should always be last) or */ - else thread->machine.pcb = NULL; /* to the start if the only one */ - - save_release(osv); /* Nope, release it */ - - } - - if(psv) psv->save_hdr.save_prev = (addr64_t)((uintptr_t)sv); /* Chain us to the end or */ - else thread->machine.pcb = (pcb_t)sv; /* to the start if the only one */ - thread->machine.upcb = (pcb_t)sv; /* Set the user pcb */ - - ovsv = thread->machine.curctx->VMXsave; /* Get the top vector savearea */ - - pvsv = NULL; - while(ovsv) { /* Any VMX saved state? */ - if(!(ovsv->save_hdr.save_level)) break; /* Leave if this is user state */ - pvsv = ovsv; /* Save previous savearea address */ - ovsv = CAST_DOWN(savearea_vec *, ovsv->save_hdr.save_prev); /* Get one underneath our's */ - } - - if(ovsv) { /* Did we find one? */ - if(pvsv) pvsv->save_hdr.save_prev = 0; /* Yes, clear pointer to it (it should always be last) or */ - else thread->machine.curctx->VMXsave = NULL; /* to the start if the only one */ - - save_release((struct savearea *)ovsv); /* Nope, release it */ - } - - if(vsv) { /* Are we sticking any vector on this one? 
*/ - if(pvsv) pvsv->save_hdr.save_prev = (addr64_t)((uintptr_t)vsv); /* Yes, chain us to the end or */ - else { - thread->machine.curctx->VMXsave = vsv; /* to the start if the only one */ - thread->machine.curctx->VMXlevel = NULL; /* Insure that we don't have a leftover level */ - } - - vsv->save_hdr.save_misc2 = 0; /* Eye catcher for debug */ - vsv->save_hdr.save_misc3 = 0; /* Eye catcher for debug */ - vsv->save_hdr.save_act = thread; - } - - ofsv = thread->machine.curctx->FPUsave; /* Get the top float savearea */ - - pfsv = NULL; - while(ofsv) { /* Any float saved state? */ - if(!(ofsv->save_hdr.save_level)) break; /* Leave if this is user state */ - pfsv = ofsv; /* Save previous savearea address */ - ofsv = CAST_DOWN(savearea_fpu *, ofsv->save_hdr.save_prev); /* Get one underneath our's */ - } - - if(ofsv) { /* Did we find one? */ - if(pfsv) pfsv->save_hdr.save_prev = 0; /* Yes, clear pointer to it (it should always be last) or */ - else thread->machine.curctx->FPUsave = NULL; /* to the start if the only one */ - - save_release((struct savearea *)ofsv); /* Nope, release it */ - } - - if(fsv) { /* Are we sticking any vector on this one? */ - if(pfsv) pfsv->save_hdr.save_prev = (addr64_t)((uintptr_t)fsv); /* Yes, chain us to the end or */ - else { - thread->machine.curctx->FPUsave = fsv; /* to the start if the only one */ - thread->machine.curctx->FPUlevel = NULL; /* Insure that we don't have a leftover level */ - } - - fsv->save_hdr.save_misc2 = 0; /* Eye catcher for debug */ - fsv->save_hdr.save_misc3 = 0; /* Eye catcher for debug */ - fsv->save_hdr.save_act = thread; - } - -} - - - -/* - * Releases saved context. We need this because the saved context is opague. - * be adjusted when these contexts are attached to a new activation. 
- * - */ - -void -act_thread_cfree(void *ctx) -{ - - struct savearea *sv; - savearea_fpu *fsv; - savearea_vec *vsv; - - sv = (struct savearea *)ctx; /* Make this easier for C */ - - fsv = CAST_DOWN(savearea_fpu *, sv->save_hdr.save_misc0); /* Get a possible floating point savearea */ - vsv = CAST_DOWN(savearea_vec *, sv->save_hdr.save_misc1); /* Get a possible vector savearea */ - - if((sv->save_hdr.save_misc2 != 0xDEBB1ED0) || (sv->save_hdr.save_misc3 != 0xE5DA11A5)) { /* See if valid savearea */ - panic("act_thread_cfree: attempt to detatch invalid general context savearea - %p\n", sv); /* Die */ - } - - save_release(sv); /* Toss the general savearea */ - - if(fsv) { /* See if there is any saved floating point */ - if((fsv->save_hdr.save_misc2 != 0xDEBB1ED0) || (fsv->save_hdr.save_misc3 != 0xE5DA11A5)) { /* See if valid savearea */ - panic("act_thread_cfree: attempt to detatch invalid float context savearea - %p\n", fsv); /* Die */ - } - - save_release((struct savearea *)fsv); /* Toss saved context */ - } - - if(vsv) { /* See if there is any saved floating point */ - if((vsv->save_hdr.save_misc2 != 0xDEBB1ED0) || (vsv->save_hdr.save_misc3 != 0xE5DA11A5)) { /* See if valid savearea */ - panic("act_thread_cfree: attempt to detatch invalid vector context savearea - %p\n", vsv); /* Die */ - } - - save_release((struct savearea *)vsv); /* Toss saved context */ - } - - return; -} - -/* - * thread_enable_fpe: - * - * enables or disables floating point exceptions for the thread. 
- * returns old state - */ -int thread_enable_fpe( - thread_t thread, - int onoff) -{ - struct savearea *sv; - uint64_t oldmsr; - - sv = find_user_regs(thread); /* Find the user registers */ - if(!sv) sv = get_user_regs(thread); /* Didn't find any, allocate and initialize one */ - - oldmsr = sv->save_srr1; /* Get the old msr */ - - if(onoff) sv->save_srr1 = oldmsr | (uint64_t)(MASK(MSR_FE0) | MASK(MSR_FE1)); /* Flip on precise FP exceptions */ - else sv->save_srr1 = oldmsr & (uint64_t)(~(MASK(MSR_FE0) | MASK(MSR_FE1))); /* Flip on precise FP exceptions */ - - return ((oldmsr & (MASK(MSR_FE0) | MASK(MSR_FE1))) != 0); /* Return if it was enabled or not */ -} diff --git a/osfmk/ppc/task.h b/osfmk/ppc/task.h deleted file mode 100644 index 3c9ad4164..000000000 --- a/osfmk/ppc/task.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ - -/* - * No machine dependant task fields - */ - -#define MACHINE_TASK - diff --git a/osfmk/ppc/thread.h b/osfmk/ppc/thread.h deleted file mode 100644 index d3e4b1109..000000000 --- a/osfmk/ppc/thread.h +++ /dev/null @@ -1,212 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -/* - * File: machine/thread.h - * - * This file contains the structure definitions for the thread - * state as applied to PPC processors. - */ - -#ifndef _PPC_THREAD_H_ -#define _PPC_THREAD_H_ - -#include -#include -#include -#include -#include -#include - -/* - * Kernel state structure - * - * This holds the kernel state that is saved and restored across context - * switches. - */ - -/* - * PPC process control block - * - * The PCB holds normal context. It does not contain vector or floating point - * registers. 
- * - */ - -typedef struct savearea pcb; -typedef struct savearea *pcb_t; - -struct facility_context { - - savearea_fpu *FPUsave; /* The floating point savearea */ - struct savearea *FPUlevel; /* The floating point context level */ - unsigned int FPUcpu; /* The last processor to enable floating point */ - unsigned int FPUsync; /* Sync lock */ - savearea_vec *VMXsave; /* The VMX savearea */ - struct savearea *VMXlevel; /* The VMX context level */ - unsigned int VMXcpu; /* The last processor to enable vector */ - unsigned int VMXsync; /* Sync lock */ - struct thread *facAct; -}; - -typedef struct facility_context facility_context; - -/* - * Maps state flavor to number of words in the state: - */ -__private_extern__ unsigned int _MachineStateCount[]; - -#define USER_REGS(ThrAct) ((ThrAct)->machine.pcb) - -#define user_pc(ThrAct) ((ThrAct)->machine.pcb->save_srr0) - -#define act_machine_state_ptr(ThrAct) (thread_state_t)USER_REGS(ThrAct) - -struct machine_thread { - /* - * pointer to process control block control blocks. Potentially - * one for each active facility context. They may point to the - * same saveareas. - */ - struct savearea *pcb; /* The "normal" savearea */ - struct savearea *upcb; /* The "normal" user savearea */ - facility_context *curctx; /* Current facility context */ - facility_context *deferctx; /* Deferred facility context */ - facility_context facctx; /* "Normal" facility context */ - struct vmmCntrlEntry *vmmCEntry; /* Pointer current emulation context or 0 */ - struct vmmCntrlTable *vmmControl; /* Pointer to virtual machine monitor control table */ - uint64_t qactTimer; /* Time thread needs to interrupt. This is a single-shot timer. 
Zero is unset */ - unsigned int umwSpace; /* Address space ID for user memory window */ -#define umwSwitchAway 0x80000000 /* Context switched away from thread since MapUserAddressWindow */ -#define umwSwitchAwayb 0 - addr64_t umwRelo; /* Relocation value for user memory window */ - unsigned int ksp; /* points to TOP OF STACK or zero */ - unsigned int preemption_count; /* preemption count */ - struct per_proc_info *PerProc; /* current per processor data */ - unsigned int bbDescAddr; /* Points to Blue Box Trap descriptor area in kernel (page aligned) */ - unsigned int bbUserDA; /* Points to Blue Box Trap descriptor area in user (page aligned) */ - unsigned int bbTableStart; /* Points to Blue Box Trap dispatch area in user */ - unsigned int emPendRupts; /* Number of pending emulated interruptions */ - unsigned int bbTaskID; /* Opaque task ID for Blue Box threads */ - unsigned int bbTaskEnv; /* Opaque task data reference for Blue Box threads */ - unsigned int specFlags; /* Special flags */ - unsigned int pmcovfl[8]; /* PMC overflow count */ - unsigned int perfmonFlags; /* Perfmon facility flags */ - unsigned int bbTrap; /* Blue Box trap vector */ - unsigned int bbSysCall; /* Blue Box syscall vector */ - unsigned int bbInterrupt; /* Blue Box interrupt vector */ - unsigned int bbPending; /* Blue Box pending interrupt vector */ - -/* special flags bits */ - -#define ignoreZeroFaultbit 0 -#define floatUsedbit 1 -#define vectorUsedbit 2 -#define runningVMbit 4 -#define floatCngbit 5 -#define vectorCngbit 6 -#define timerPopbit 7 -#define userProtKeybit 8 -#define FamVMenabit 11 -#define FamVMmodebit 12 -#define perfMonitorbit 13 -#define OnProcbit 14 -/* NOTE: Do not move or assign bit 31 without changing exception vector ultra fast path code */ -#define bbThreadbit 28 -#define bbNoMachSCbit 29 -#define bbPreemptivebit 30 -#define spfReserved1 31 /* See note above */ - -#define ignoreZeroFault 0x80000000 /* (1<<(31-ignoreZeroFaultbit)) */ -#define floatUsed 0x40000000 /* 
(1<<(31-floatUsedbit)) */ -#define vectorUsed 0x20000000 /* (1<<(31-vectorUsedbit)) */ - -#define runningVM 0x08000000 /* (1<<(31-runningVMbit)) */ -#define floatCng 0x04000000 /* (1<<(31-floatCngbit)) */ -#define vectorCng 0x02000000 /* (1<<(31-vectorCngbit)) */ -#define timerPop 0x01000000 /* (1<<(31-timerPopbit)) */ - -#define userProtKey 0x00800000 /* (1<<(31-userProtKeybit)) */ - -#define FamVMena 0x00100000 /* (1<<(31-FamVMenabit)) */ -#define FamVMmode 0x00080000 /* (1<<(31-FamVMmodebit)) */ -#define perfMonitor 0x00040000 /* (1<<(31-perfMonitorbit)) */ -#define OnProc 0x00020000 /* (1<<(31-OnProcbit)) */ - -#define bbThread 0x00000008 /* (1<<(31-bbThreadbit)) */ -#define bbNoMachSC 0x00000004 /* (1<<(31-bbNoMachSCbit)) */ -#define bbPreemptive 0x00000002 /* (1<<(31-bbPreemptivebit)) */ - -#define fvChkb 0 -#define fvChk 0x80000000 - -#ifdef MACH_BSD - uint64_t cthread_self; /* for use of cthread package */ -#endif - -}; - -extern struct savearea *find_user_regs(thread_t); -extern struct savearea *get_user_regs(thread_t); -extern struct savearea_fpu *find_user_fpu(thread_t); -extern struct savearea_vec *find_user_vec(thread_t); -extern struct savearea_vec *find_user_vec_curr(void); -extern int thread_enable_fpe(thread_t act, int onoff); - -extern struct savearea *find_kern_regs(thread_t); - -extern void *act_thread_csave(void); -extern void act_thread_catt(void *ctx); -extern void act_thread_cfree(void *ctx); - -/* - * Return address of the function that called current function, given - * address of the first parameter of current function. We can't - * do it this way, since parameter was copied from a register - * into a local variable. Call an assembly sub-function to - * return this. 
- */ - -extern vm_offset_t getrpc(void); -#define GET_RETURN_PC(addr) getrpc() - -#define STACK_IKS(stack) \ - ((vm_offset_t)(((vm_offset_t)stack)+KERNEL_STACK_SIZE)-FM_SIZE) - -/* - * Defining this indicates that MD code will supply an exception() - * routine, conformant with kern/exception.c (dependency alert!) - * but which does wonderfully fast, machine-dependent magic. - */ - -#define MACHINE_FAST_EXCEPTION 1 - -#endif /* _PPC_THREAD_H_ */ diff --git a/osfmk/ppc/trap.c b/osfmk/ppc/trap.c deleted file mode 100644 index c30bf7381..000000000 --- a/osfmk/ppc/trap.c +++ /dev/null @@ -1,1012 +0,0 @@ -/* - * Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include /* For kernel_map */ - -#include -#include -#include -#include /* for SR_xxx definitions */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -volatile perfCallback perfTrapHook; /* Pointer to CHUD trap hook routine */ -volatile perfCallback perfASTHook; /* Pointer to CHUD AST hook routine */ - -#if CONFIG_DTRACE -extern kern_return_t dtrace_user_probe(ppc_saved_state_t *sv); - -/* See */ -perfCallback tempDTraceTrapHook = NULL; /* Pointer to DTrace fbt trap hook routine */ - -extern boolean_t dtrace_tally_fault(user_addr_t); -#endif - -#if MACH_KDB -#include -#include -#include -#include - -boolean_t let_ddb_vm_fault = FALSE; -boolean_t debug_all_traps_with_kdb = FALSE; -extern struct db_watchpoint *db_watchpoint_list; -extern boolean_t db_watchpoints_inserted; -extern boolean_t db_breakpoints_inserted; - - - -#endif /* MACH_KDB */ - -extern task_t bsd_init_task; -extern char init_task_failure_data[]; -extern int not_in_kdp; - -#define PROT_EXEC (VM_PROT_EXECUTE) -#define PROT_RO (VM_PROT_READ) -#define PROT_RW (VM_PROT_READ|VM_PROT_WRITE) - - -/* A useful macro to update the ppc_exception_state in the PCB - * before calling doexception - */ -#define UPDATE_PPC_EXCEPTION_STATE { \ - thread_t _thread = current_thread(); \ - _thread->machine.pcb->save_dar = (uint64_t)dar; \ - _thread->machine.pcb->save_dsisr = dsisr; \ - _thread->machine.pcb->save_exception = trapno / T_VECTOR_SIZE; /* back to powerpc */ \ -} - -void unresolved_kernel_trap(int trapno, - struct savearea *ssp, - unsigned int dsisr, - addr64_t dar, - const char *message); - -static void handleMck(struct savearea *ssp); /* Common machine check handler */ - -#ifdef MACH_BSD -extern void get_procrustime(time_value_t *); -extern void 
bsd_uprofil(time_value_t *, user_addr_t); -#endif /* MACH_BSD */ - - -struct savearea *trap(int trapno, - struct savearea *ssp, - unsigned int dsisr, - addr64_t dar) -{ - int exception; - mach_exception_code_t code = 0; - mach_exception_subcode_t subcode = 0; - vm_map_t map; - vm_map_offset_t offset; - thread_t thread = current_thread(); - boolean_t intr; - ast_t *myast; - int ret; - -#ifdef MACH_BSD - time_value_t tv; -#endif /* MACH_BSD */ - - myast = ast_pending(); - perfCallback fn = perfASTHook; - if(fn) { - if(*myast & AST_CHUD_ALL) { - fn(trapno, ssp, dsisr, (unsigned int)dar); - } - } else { - *myast &= ~AST_CHUD_ALL; - } - - fn = perfTrapHook; - if(fn) { /* Is there a hook? */ - if(fn(trapno, ssp, dsisr, (unsigned int)dar) == KERN_SUCCESS) return ssp; /* If it succeeds, we are done... */ - } - -#if CONFIG_DTRACE - if(tempDTraceTrapHook) { /* Is there a hook? */ - if(tempDTraceTrapHook(trapno, ssp, dsisr, (unsigned int)dar) == KERN_SUCCESS) return ssp; /* If it succeeds, we are done... */ - } -#endif - -#if 0 - { - extern void fctx_text(void); - fctx_test(); - } -#endif - - exception = 0; /* Clear exception for now */ - -/* - * Remember that we are disabled for interruptions when we come in here. Because - * of latency concerns, we need to enable interruptions in the interrupted process - * was enabled itself as soon as we can. - */ - - intr = (ssp->save_srr1 & MASK(MSR_EE)) != 0; /* Remember if we were enabled */ - - /* Handle kernel traps first */ - - if (!USER_MODE(ssp->save_srr1)) { - /* - * Trap came from kernel - */ - switch (trapno) { - - case T_PREEMPT: /* Handle a preempt trap */ - ast_taken(AST_PREEMPTION, FALSE); - break; - - case T_PERF_MON: - perfmon_handle_pmi(ssp); - break; - - case T_RESET: /* Reset interruption */ - if (!Call_Debugger(trapno, ssp)) - unresolved_kernel_trap(trapno, ssp, dsisr, dar, NULL); - break; /* We just ignore these */ - - /* - * These trap types should never be seen by trap() - * in kernel mode, anyway. 
- * Some are interrupts that should be seen by - * interrupt() others just don't happen because they - * are handled elsewhere. Some could happen but are - * considered to be fatal in kernel mode. - */ - case T_DECREMENTER: - case T_IN_VAIN: /* Shouldn't ever see this, lowmem_vectors eats it */ - case T_SYSTEM_MANAGEMENT: - case T_ALTIVEC_ASSIST: - case T_INTERRUPT: - case T_FP_UNAVAILABLE: - case T_IO_ERROR: - case T_RESERVED: - default: - unresolved_kernel_trap(trapno, ssp, dsisr, dar, NULL); - break; - - -/* - * Here we handle a machine check in the kernel - */ - - case T_MACHINE_CHECK: - handleMck(ssp); /* Common to both user and kernel */ - break; - - - case T_ALIGNMENT: -/* -* If enaNotifyEMb is set, we get here, and -* we have actually already emulated the unaligned access. -* All that we want to do here is to ignore the interrupt. This is to allow logging or -* tracing of unaligned accesses. -*/ - - if(ssp->save_hdr.save_misc3) { /* Was it a handled exception? */ - unresolved_kernel_trap(trapno, ssp, dsisr, dar, NULL); /* Go panic */ - break; - } - KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_EXCP_ALNG, 0) | DBG_FUNC_NONE, - (int)ssp->save_srr0 - 4, (int)dar, (int)dsisr, (int)ssp->save_lr, 0); - break; - - case T_EMULATE: -/* -* If enaNotifyEMb is set we get here, and -* we have actually already emulated the instruction. -* All that we want to do here is to ignore the interrupt. This is to allow logging or -* tracing of emulated instructions. 
-*/ - - KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_EXCP_EMUL, 0) | DBG_FUNC_NONE, - (int)ssp->save_srr0 - 4, (int)((savearea_comm *)ssp)->save_misc2, (int)dsisr, (int)ssp->save_lr, 0); - break; - - - - - - case T_TRACE: - case T_RUNMODE_TRACE: - case T_INSTRUCTION_BKPT: - if (!Call_Debugger(trapno, ssp)) - unresolved_kernel_trap(trapno, ssp, dsisr, dar, NULL); - break; - - case T_PROGRAM: - if (ssp->save_srr1 & MASK(SRR1_PRG_TRAP)) { - if (!Call_Debugger(trapno, ssp)) - unresolved_kernel_trap(trapno, ssp, dsisr, dar, NULL); - } else { - unresolved_kernel_trap(trapno, ssp, - dsisr, dar, NULL); - } - break; - - case T_DATA_ACCESS: -#if MACH_KDB - mp_disable_preemption(); - if (debug_mode - && getPerProc()->debugger_active - && !let_ddb_vm_fault) { - /* - * Force kdb to handle this one. - */ - kdb_trap(trapno, ssp); - } - mp_enable_preemption(); -#endif /* MACH_KDB */ - /* can we take this during normal panic dump operation? */ - if (debug_mode - && getPerProc()->debugger_active - && !not_in_kdp) { - /* - * Access fault while in kernel core dump. - */ - kdp_dump_trap(trapno, ssp); - } - - - if(ssp->save_dsisr & dsiInvMode) { /* Did someone try to reserve cache inhibited? */ - panic("trap: disallowed access to cache inhibited memory - %016llX\n", dar); - } - - if(intr) ml_set_interrupts_enabled(TRUE); /* Enable if we were */ - - if(((dar >> 28) < 0xE) | ((dar >> 28) > 0xF)) { /* User memory window access? */ - - offset = (vm_map_offset_t)dar; /* Set the failing address */ - map = kernel_map; /* No, this is a normal kernel access */ - -/* - * Note: Some ROM device drivers will access page 0 when they start. The IOKit will - * set a flag to tell us to ignore any access fault on page 0. After the driver is - * opened, it will clear the flag. 
- */ - if((0 == (offset & -PAGE_SIZE)) && /* Check for access of page 0 and */ - ((thread->machine.specFlags) & ignoreZeroFault)) { /* special case of ignoring page zero faults */ - ssp->save_srr0 += 4; /* Point to next instruction */ - break; - } - -#if CONFIG_DTRACE - if (thread->options & TH_OPT_DTRACE) { /* Executing under dtrace_probe? */ - if (dtrace_tally_fault(dar)) { /* Should a fault under dtrace be ignored? */ - ssp->save_srr0 += 4; /* Point to next instruction */ - break; - } else { - unresolved_kernel_trap(trapno, ssp, dsisr, dar, "Unexpected page fault under dtrace_probe"); - } - } -#endif - - code = vm_fault(map, vm_map_trunc_page(offset), - dsisr & MASK(DSISR_WRITE) ? PROT_RW : PROT_RO, - FALSE, THREAD_UNINT, NULL, vm_map_trunc_page(0)); - - if (code != KERN_SUCCESS) { - unresolved_kernel_trap(trapno, ssp, dsisr, dar, NULL); - } else { - ssp->save_hdr.save_flags |= SAVredrive; /* Tell low-level to re-try fault */ - ssp->save_dsisr = (ssp->save_dsisr & - ~((MASK(DSISR_NOEX) | MASK(DSISR_PROT)))) | MASK(DSISR_HASH); /* Make sure this is marked as a miss */ - } - break; - } - - /* If we get here, the fault was due to a user memory window access */ - -#if CONFIG_DTRACE - if (thread->options & TH_OPT_DTRACE) { /* Executing under dtrace_probe? */ - if (dtrace_tally_fault(dar)) { /* Should a user memory window access fault under dtrace be ignored? */ - if (thread->recover) { - ssp->save_srr0 = thread->recover; - thread->recover = (vm_offset_t)NULL; - } else { - unresolved_kernel_trap(trapno, ssp, dsisr, dar, "copyin/out has no recovery point"); - } - break; - } else { - unresolved_kernel_trap(trapno, ssp, dsisr, dar, "Unexpected UMW page fault under dtrace_probe"); - } - } -#endif - - map = thread->map; - - offset = (vm_map_offset_t)(thread->machine.umwRelo + dar); /* Compute the user space address */ - - code = vm_fault(map, vm_map_trunc_page(offset), - dsisr & MASK(DSISR_WRITE) ? 
PROT_RW : PROT_RO, - FALSE, THREAD_UNINT, NULL, vm_map_trunc_page(0)); - - /* If we failed, there should be a recovery - * spot to rfi to. - */ - if (code != KERN_SUCCESS) { - if (thread->recover) { - ssp->save_srr0 = thread->recover; - thread->recover = (vm_offset_t)NULL; - } else { - unresolved_kernel_trap(trapno, ssp, dsisr, dar, "copyin/out has no recovery point"); - } - } - else { - ssp->save_hdr.save_flags |= SAVredrive; /* Tell low-level to re-try fault */ - ssp->save_dsisr = (ssp->save_dsisr & - ~((MASK(DSISR_NOEX) | MASK(DSISR_PROT)))) | MASK(DSISR_HASH); /* Make sure this is marked as a miss */ - } - - break; - - case T_INSTRUCTION_ACCESS: - -#if MACH_KDB - if (debug_mode - && getPerProc()->debugger_active - && !let_ddb_vm_fault) { - /* - * Force kdb to handle this one. - */ - kdb_trap(trapno, ssp); - } -#endif /* MACH_KDB */ - - /* Same as for data access, except fault type - * is PROT_EXEC and addr comes from srr0 - */ - - if(intr) ml_set_interrupts_enabled(TRUE); /* Enable if we were */ - - map = kernel_map; - - code = vm_fault(map, vm_map_trunc_page(ssp->save_srr0), - (PROT_EXEC | PROT_RO), FALSE, THREAD_UNINT, NULL, vm_map_trunc_page(0)); - - if (code != KERN_SUCCESS) { - unresolved_kernel_trap(trapno, ssp, dsisr, dar, NULL); - } else { - ssp->save_hdr.save_flags |= SAVredrive; /* Tell low-level to re-try fault */ - ssp->save_srr1 = (ssp->save_srr1 & - ~((unsigned long long)(MASK(DSISR_NOEX) | MASK(DSISR_PROT)))) | MASK(DSISR_HASH); /* Make sure this is marked as a miss */ - } - break; - - /* Usually shandler handles all the system calls, but the - * atomic thread switcher may throwup (via thandler) and - * have to pass it up to the exception handler. 
- */ - - case T_SYSTEM_CALL: - unresolved_kernel_trap(trapno, ssp, dsisr, dar, NULL); - break; - - case T_AST: - unresolved_kernel_trap(trapno, ssp, dsisr, dar, NULL); - break; - } - } else { - - /* - * Processing for user state traps with interrupt enabled - * For T_AST, interrupts are enabled in the AST delivery - */ - if (trapno != T_AST) - ml_set_interrupts_enabled(TRUE); - -#ifdef MACH_BSD - { - get_procrustime(&tv); - } -#endif /* MACH_BSD */ - - - /* - * Trap came from user task - */ - - switch (trapno) { - - case T_PREEMPT: - unresolved_kernel_trap(trapno, ssp, dsisr, dar, NULL); - break; - - case T_PERF_MON: - perfmon_handle_pmi(ssp); - break; - - /* - * These trap types should never be seen by trap() - * Some are interrupts that should be seen by - * interrupt() others just don't happen because they - * are handled elsewhere. - */ - case T_DECREMENTER: - case T_IN_VAIN: /* Shouldn't ever see this, lowmem_vectors eats it */ - case T_INTERRUPT: - case T_FP_UNAVAILABLE: - case T_SYSTEM_MANAGEMENT: - case T_RESERVED: - case T_IO_ERROR: - - default: - - ml_set_interrupts_enabled(FALSE); /* Turn off interruptions */ - - panic("Unexpected user state trap(cpu %d): 0x%08X DSISR=0x%08X DAR=0x%016llX PC=0x%016llX, MSR=0x%016llX\n", - cpu_number(), trapno, dsisr, dar, ssp->save_srr0, ssp->save_srr1); - break; - - - /* - * Here we handle a machine check in user state - */ - - case T_MACHINE_CHECK: - handleMck(ssp); /* Common to both user and kernel */ - break; - - case T_RESET: - ml_set_interrupts_enabled(FALSE); /* Turn off interruptions */ - if (!Call_Debugger(trapno, ssp)) - panic("Unexpected Reset exception: srr0 = %016llx, srr1 = %016llx\n", - ssp->save_srr0, ssp->save_srr1); - break; /* We just ignore these */ - - case T_ALIGNMENT: - /* - * If enaNotifyEMb is set, we get here, and - * we have actually already emulated the unaligned access. - * All that we want to do here is to ignore the interrupt. This is to allow logging or - * tracing of unaligned accesses. 
- */ - - KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_EXCP_ALNG, 0) | DBG_FUNC_NONE, - (int)ssp->save_srr0 - 4, (int)dar, (int)dsisr, (int)ssp->save_lr, 0); - - if(ssp->save_hdr.save_misc3) { /* Was it a handled exception? */ - exception = EXC_BAD_ACCESS; /* Yes, throw exception */ - code = EXC_PPC_UNALIGNED; - subcode = dar; - } - break; - - case T_EMULATE: - /* - * If enaNotifyEMb is set we get here, and - * we have actually already emulated the instruction. - * All that we want to do here is to ignore the interrupt. This is to allow logging or - * tracing of emulated instructions. - */ - - KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_EXCP_EMUL, 0) | DBG_FUNC_NONE, - (int)ssp->save_srr0 - 4, (int)((savearea_comm *)ssp)->save_misc2, (int)dsisr, (int)ssp->save_lr, 0); - break; - - case T_TRACE: /* Real PPC chips */ - case T_INSTRUCTION_BKPT: - exception = EXC_BREAKPOINT; - code = EXC_PPC_TRACE; - subcode = ssp->save_srr0; - break; - - case T_PROGRAM: - if (ssp->save_srr1 & MASK(SRR1_PRG_FE)) { - fpu_save(thread->machine.curctx); - UPDATE_PPC_EXCEPTION_STATE; - exception = EXC_ARITHMETIC; - code = EXC_ARITHMETIC; - - mp_disable_preemption(); - subcode = ssp->save_fpscr; - mp_enable_preemption(); - } - else if (ssp->save_srr1 & MASK(SRR1_PRG_ILL_INS)) { - - UPDATE_PPC_EXCEPTION_STATE - exception = EXC_BAD_INSTRUCTION; - code = EXC_PPC_UNIPL_INST; - subcode = ssp->save_srr0; - } else if ((unsigned int)ssp->save_srr1 & MASK(SRR1_PRG_PRV_INS)) { - - UPDATE_PPC_EXCEPTION_STATE; - exception = EXC_BAD_INSTRUCTION; - code = EXC_PPC_PRIVINST; - subcode = ssp->save_srr0; - } else if (ssp->save_srr1 & MASK(SRR1_PRG_TRAP)) { - unsigned int inst; - - if (copyin(ssp->save_srr0, (char *) &inst, 4 )) panic("copyin failed\n"); - - if(dgWork.dgFlags & enaDiagTrap) { /* Is the diagnostic trap enabled? */ - if((inst & 0xFFFFFFF0) == 0x0FFFFFF0) { /* Is this a TWI 31,R31,0xFFFx? 
*/ - if(diagTrap(ssp, inst & 0xF)) { /* Call the trap code */ - ssp->save_srr0 += 4ULL; /* If we eat the trap, bump pc */ - exception = 0; /* Clear exception */ - break; /* All done here */ - } - } - } - -#if CONFIG_DTRACE - if(inst == 0x0FFFDDDD) { /* Is this the dtrace trap? */ - ret = dtrace_user_probe((ppc_saved_state_t *)ssp); /* Go check if it is for real and process if so... */ - if(ret == KERN_SUCCESS) { /* Was it really? */ - exception = 0; /* Clear the exception */ - break; /* Go flow through and out... */ - } - } -#endif - - UPDATE_PPC_EXCEPTION_STATE; - - if (inst == 0x7FE00008) { - exception = EXC_BREAKPOINT; - code = EXC_PPC_BREAKPOINT; - } else { - exception = EXC_SOFTWARE; - code = EXC_PPC_TRAP; - } - subcode = ssp->save_srr0; - } - break; - -#if CONFIG_DTRACE - case T_DTRACE_RET: /* Are we returning from a dtrace injection? */ - ret = dtrace_user_probe((ppc_saved_state_t *)ssp); /* Call the probe function if so... */ - if(ret == KERN_SUCCESS) { /* Did this actually work? */ - exception = 0; /* Clear the exception */ - break; /* Go flow through and out... */ - } - break; -#endif - - case T_ALTIVEC_ASSIST: - UPDATE_PPC_EXCEPTION_STATE; - exception = EXC_ARITHMETIC; - code = EXC_PPC_ALTIVECASSIST; - subcode = ssp->save_srr0; - break; - - case T_DATA_ACCESS: - map = thread->map; - - if(ssp->save_dsisr & dsiInvMode) { /* Did someone try to reserve cache inhibited? */ - UPDATE_PPC_EXCEPTION_STATE; /* Don't even bother VM with this one */ - exception = EXC_BAD_ACCESS; - subcode = dar; - break; - } - - code = vm_fault(map, vm_map_trunc_page(dar), - dsisr & MASK(DSISR_WRITE) ? 
PROT_RW : PROT_RO, - FALSE, THREAD_ABORTSAFE, NULL, vm_map_trunc_page(0)); - - if ((code != KERN_SUCCESS) && (code != KERN_ABORTED)) { - UPDATE_PPC_EXCEPTION_STATE; - exception = EXC_BAD_ACCESS; - subcode = dar; - } else { - ssp->save_hdr.save_flags |= SAVredrive; /* Tell low-level to retry fault */ - ssp->save_dsisr = (ssp->save_dsisr & - ~((MASK(DSISR_NOEX) | MASK(DSISR_PROT)))) | MASK(DSISR_HASH); /* Make sure this is marked as a miss */ - } - break; - - case T_INSTRUCTION_ACCESS: - /* Same as for data access, except fault type - * is PROT_EXEC and addr comes from srr0 - */ - map = thread->map; - - code = vm_fault(map, vm_map_trunc_page(ssp->save_srr0), - (PROT_EXEC | PROT_RO), FALSE, THREAD_ABORTSAFE, NULL, vm_map_trunc_page(0)); - - if ((code != KERN_SUCCESS) && (code != KERN_ABORTED)) { - UPDATE_PPC_EXCEPTION_STATE; - exception = EXC_BAD_ACCESS; - subcode = ssp->save_srr0; - } else { - ssp->save_hdr.save_flags |= SAVredrive; /* Tell low-level to re-try fault */ - ssp->save_srr1 = (ssp->save_srr1 & - ~((unsigned long long)(MASK(DSISR_NOEX) | MASK(DSISR_PROT)))) | MASK(DSISR_HASH); /* Make sure this is marked as a miss */ - } - break; - - case T_AST: - /* AST delivery is done below */ - break; - - } - -#ifdef MACH_BSD - { - bsd_uprofil(&tv, ssp->save_srr0); - } -#endif /* MACH_BSD */ - } - - if (exception) { - /* if this is the init task, save the exception information */ - /* this probably is a fatal exception */ -#if 0 - if(bsd_init_task == current_task()) { - char *buf; - int i; - - buf = init_task_failure_data; - - - buf += sprintf(buf, "Exception Code = 0x%x, Subcode = 0x%x\n", code, subcode); - buf += sprintf(buf, "DSISR = 0x%08x, DAR = 0x%016llx\n" - , dsisr, dar); - - for (i=0; i<32; i++) { - if ((i % 8) == 0) { - buf += sprintf(buf, "\n%4d :",i); - } - buf += sprintf(buf, " %08x",*(&ssp->save_r0+i)); - } - - buf += sprintf(buf, "\n\n"); - buf += sprintf(buf, "cr = 0x%08X\t\t",ssp->save_cr); - buf += sprintf(buf, "xer = 0x%08X\n",ssp->save_xer); - buf 
+= sprintf(buf, "lr = 0x%016llX\t\t",ssp->save_lr); - buf += sprintf(buf, "ctr = 0x%016llX\n",ssp->save_ctr); - buf += sprintf(buf, "srr0(iar) = 0x%016llX\t\t",ssp->save_srr0); - buf += sprintf(buf, "srr1(msr) = 0x%016llX\n",ssp->save_srr1, - "\x10\x11""EE\x12PR\x13""FP\x14ME\x15""FE0\x16SE\x18" - "FE1\x19""AL\x1a""EP\x1bIT\x1c""DT"); - buf += sprintf(buf, "\n\n"); - - /* generate some stack trace */ - buf += sprintf(buf, "Application level back trace:\n"); - if (ssp->save_srr1 & MASK(MSR_PR)) { - char *addr = (char*)ssp->save_r1; - unsigned int stack_buf[3]; - for (i = 0; i < 8; i++) { - if (addr == (char*)NULL) - break; - if (!copyin(ssp->save_r1,(char*)stack_buf, - 3 * sizeof(int))) { - buf += sprintf(buf, "0x%08X : 0x%08X\n" - ,addr,stack_buf[2]); - addr = (char*)stack_buf[0]; - } else { - break; - } - } - } - buf[0] = '\0'; - } -#endif - doexception(exception, code, subcode); - } - /* AST delivery - * Check to see if we need an AST, if so take care of it here - */ - ml_set_interrupts_enabled(FALSE); - - if (USER_MODE(ssp->save_srr1)) { - myast = ast_pending(); - while (*myast & AST_ALL) { - ast_taken(AST_ALL, intr); - ml_set_interrupts_enabled(FALSE); - myast = ast_pending(); - } - } - - return ssp; -} - -/* This routine is called from assembly before each and every system call. - * It must preserve r3. 
- */ - -extern int syscall_trace(int, struct savearea *); - - -extern int pmdebug; - -int syscall_trace(int retval, struct savearea *ssp) -{ - int i, argc; - int kdarg[3]; -/* Always prepare to trace mach system calls */ - - kdarg[0]=0; - kdarg[1]=0; - kdarg[2]=0; - - argc = mach_trap_table[-((unsigned int)ssp->save_r0)].mach_trap_arg_count; - - if (argc > 3) - argc = 3; - - for (i=0; i < argc; i++) - kdarg[i] = (int)*(&ssp->save_r3 + i); - - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_EXCP_SC, (-(ssp->save_r0))) | DBG_FUNC_START, - kdarg[0], kdarg[1], kdarg[2], 0, 0); - - return retval; -} - -/* This routine is called from assembly after each mach system call - * It must preserve r3. - */ - -extern int syscall_trace_end(int, struct savearea *); - -int syscall_trace_end(int retval, struct savearea *ssp) -{ - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_EXCP_SC,(-((unsigned int)ssp->save_r0))) | DBG_FUNC_END, - retval, 0, 0, 0, 0); - return retval; -} - -/* - * called from syscall if there is an error - */ - -int syscall_error( - int exception, - mach_exception_code_t code, - mach_exception_subcode_t subcode, - struct savearea *ssp) -{ - register thread_t thread; - - thread = current_thread(); - - if (thread == 0) - panic("syscall error in boot phase"); - - if (!USER_MODE(ssp->save_srr1)) - panic("system call called from kernel"); - - doexception(exception, code, subcode); - - return 0; -} - -/* Pass up a server syscall/exception */ -void -doexception( - int exc, - mach_exception_code_t code, - mach_exception_subcode_t sub) -{ - mach_exception_data_type_t codes[EXCEPTION_CODE_MAX]; - - codes[0] = code; - codes[1] = sub; - exception_triage(exc, codes, 2); -} - -const char *trap_type[] = { - "Unknown", - "0x100 - System reset", - "0x200 - Machine check", - "0x300 - Data access", - "0x400 - Inst access", - "0x500 - Ext int", - "0x600 - Alignment", - "0x700 - Program", - "0x800 - Floating point", - "0x900 - Decrementer", - "0xA00 - n/a", - "0xB00 - n/a", - "0xC00 - 
System call", - "0xD00 - Trace", - "0xE00 - FP assist", - "0xF00 - Perf mon", - "0xF20 - VMX", - "INVALID EXCEPTION", - "INVALID EXCEPTION", - "INVALID EXCEPTION", - "0x1300 - Inst bkpnt", - "0x1400 - Sys mgmt", - "0x1600 - Altivec Assist", - "0x1700 - Thermal", - "INVALID EXCEPTION", - "INVALID EXCEPTION", - "INVALID EXCEPTION", - "INVALID EXCEPTION", - "INVALID EXCEPTION", - "INVALID EXCEPTION", - "INVALID EXCEPTION", - "INVALID EXCEPTION", - "Emulate", - "0x2000 - Run Mode/Trace", - "Signal Processor", - "Preemption", - "Context Switch", - "Shutdown", - "System Failure" -}; -int TRAP_TYPES = sizeof (trap_type) / sizeof (trap_type[0]); - -void unresolved_kernel_trap(int trapno, - struct savearea *ssp, - __unused unsigned int dsisr, - addr64_t dar, - const char *message) -{ - const char *trap_name; - - ml_set_interrupts_enabled(FALSE); /* Turn off interruptions */ - lastTrace = LLTraceSet(0); /* Disable low-level tracing */ - -#if 0 - { - struct per_proc_info *pp; - kprintf(" srr0: %016llX\n", ssp->save_srr0); /* (TEST/DEBUG) */ - kprintf(" srr1: %016llX\n", ssp->save_srr1); /* (TEST/DEBUG) */ - kprintf(" dar: %016llX\n", ssp->save_dar); /* (TEST/DEBUG) */ - kprintf(" xcp: %08X\n", ssp->save_exception); /* (TEST/DEBUG) */ - kprintf(" ins0: %08X\n", ssp->save_instr[0]); /* (TEST/DEBUG) */ - kprintf(" ins1: %08X\n", ssp->save_instr[1]); /* (TEST/DEBUG) */ - kprintf(" ins2: %08X\n", ssp->save_instr[2]); /* (TEST/DEBUG) */ - kprintf(" ins3: %08X\n", ssp->save_instr[3]); /* (TEST/DEBUG) */ - kprintf(" ins4: %08X\n", ssp->save_instr[4]); /* (TEST/DEBUG) */ - kprintf(" ins5: %08X\n", ssp->save_instr[5]); /* (TEST/DEBUG) */ - kprintf(" ins6: %08X\n", ssp->save_instr[6]); /* (TEST/DEBUG) */ - kprintf(" ins7: %08X\n", ssp->save_instr[7]); /* (TEST/DEBUG) */ - pp = getPerProc(); /* (TEST/DEBUG) */ - kprintf("ijsave: %016llX\n", pp->ijsave); /* (TEST/DEBUG) */ - } -#endif - - if( logPanicDataToScreen ) - disable_debug_output = FALSE; - - debug_mode++; - if ((unsigned)trapno 
<= T_MAX) - trap_name = trap_type[trapno / T_VECTOR_SIZE]; - else - trap_name = "???? unrecognized exception"; - if (message == NULL) - message = trap_name; - - kdb_printf("\n\nUnresolved kernel trap(cpu %d): %s DAR=0x%016llX PC=0x%016llX\n", - cpu_number(), trap_name, dar, ssp->save_srr0); - - print_backtrace(ssp); - - panic_caller = (0xFFFF0000 | (trapno / T_VECTOR_SIZE) ); - /* Commit the panic log buffer to NVRAM, unless otherwise - * specified via a boot-arg. - */ - if (panicDebugging) - commit_paniclog(); - - draw_panic_dialog(); - /* XXX: This is yet another codepath into the debugger, which should - * be reworked to enter the primary panic codepath instead. - * The idea appears to be to enter the debugger (performing a - * stack switch) as soon as possible, but we do have a - * savearea encapsulating state (accessible by walking the savearea - * chain), so that's superfluous. - */ - if( panicDebugging ) - (void)Call_Debugger(trapno, ssp); - panic_plain("%s", message); -} - -const char *corr[2] = {"uncorrected", "corrected "}; - -void handleMck(struct savearea *ssp) { /* Common machine check handler */ - - int cpu; - - cpu = cpu_number(); - - printf("Machine check (%d) - %s - pc = %016llX, msr = %016llX, dsisr = %08X, dar = %016llX\n", - cpu, corr[ssp->save_hdr.save_misc3], ssp->save_srr0, ssp->save_srr1, ssp->save_dsisr, ssp->save_dar); /* Tell us about it */ - printf("Machine check (%d) - AsyncSrc = %016llX, CoreFIR = %016llx\n", cpu, ssp->save_xdat0, ssp->save_xdat1); - printf("Machine check (%d) - L2FIR = %016llX, BusFir = %016llx\n", cpu, ssp->save_xdat2, ssp->save_xdat3); - - if(ssp->save_hdr.save_misc3) return; /* Leave the the machine check was recovered */ - - panic("Uncorrectable machine check: pc = %016llX, msr = %016llX, dsisr = %08X, dar = %016llX\n" - " AsyncSrc = %016llX, CoreFIR = %016llx\n" - " L2FIR = %016llX, BusFir = %016llx\n", - ssp->save_srr0, ssp->save_srr1, ssp->save_dsisr, ssp->save_dar, - ssp->save_xdat0, ssp->save_xdat1, 
ssp->save_xdat2, ssp->save_xdat3); - - return; -} - -void -thread_syscall_return( - kern_return_t ret) -{ - register thread_t thread = current_thread(); - register struct savearea *regs = USER_REGS(thread); - - if (kdebug_enable && ((unsigned int)regs->save_r0 & 0x80000000)) { - /* Mach trap */ - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_EXCP_SC,(-(regs->save_r0))) | DBG_FUNC_END, - ret, 0, 0, 0, 0); - } - regs->save_r3 = ret; - - thread_exception_return(); - /*NOTREACHED*/ -} - - -#if MACH_KDB -void -thread_kdb_return(void) -{ - register thread_t thread = current_thread(); - register struct savearea *regs = USER_REGS(thread); - - Call_Debugger(thread->machine.pcb->save_exception, regs); - thread_exception_return(); - /*NOTREACHED*/ -} -#endif /* MACH_KDB */ diff --git a/osfmk/ppc/trap.h b/osfmk/ppc/trap.h deleted file mode 100644 index 2a4a33ca8..000000000 --- a/osfmk/ppc/trap.h +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. 
- */ - -#ifndef _PPC_TRAP_H_ -#define _PPC_TRAP_H_ - -/* maximum number of arguments to a syscall trap */ -#define NARGS 12 -/* Size to reserve in frame for arguments - first 8 are in registers */ -#define ARG_SIZE FM_ALIGN((NARGS-8)*4) -#define MUNGE_ARGS_SIZE FM_ALIGN(8*8) - -/* - * Hardware exception vectors for powerpc are in exception.h - */ - -#ifndef ASSEMBLER - -#include -#include -#include -#include - -extern void doexception(int exc, mach_exception_code_t code, - mach_exception_subcode_t sub); - -extern struct savearea* trap(int trapno, - struct savearea *ss, - unsigned int dsisr, - addr64_t dar); - -typedef kern_return_t (*perfCallback)(int trapno, struct savearea *ss, - unsigned int dsisr, addr64_t dar); - -extern volatile perfCallback perfTrapHook; -extern volatile perfCallback perfASTHook; -extern volatile perfCallback perfIntHook; - -extern struct savearea* interrupt(int intno, - struct savearea *ss, - unsigned int dsisr, - unsigned int dar); - -extern int syscall_error(int exception, - int64_t code, - int64_t subcode, - struct savearea *ss); - - -#endif /* ASSEMBLER */ - -#endif /* _PPC_TRAP_H_ */ diff --git a/osfmk/ppc/vm_tuning.h b/osfmk/ppc/vm_tuning.h deleted file mode 100644 index 6cf00baeb..000000000 --- a/osfmk/ppc/vm_tuning.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ - -#ifndef _MACHINE_VM_TUNING_H_ -#define _MACHINE_VM_TUNING_H_ - -#endif /* _MACHINE_VM_TUNING_H_ */ diff --git a/osfmk/ppc/vmachmon.c b/osfmk/ppc/vmachmon.c deleted file mode 100644 index f8d7caac6..000000000 --- a/osfmk/ppc/vmachmon.c +++ /dev/null @@ -1,2024 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. 
- * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/*----------------------------------------------------------------------- -** vmachmon.c -** -** C routines that we are adding to the MacOS X kernel. -** ------------------------------------------------------------------------*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -extern double FloatInit; -extern unsigned long QNaNbarbarian[4]; - -/************************************************************************************* - Virtual Machine Monitor Internal Routines -**************************************************************************************/ - -/*----------------------------------------------------------------------- -** vmm_get_entry -** -** This function verifies and return a vmm context entry index -** -** Inputs: -** act - pointer to current thread activation -** index - index into vmm control table (this is a "one based" value) -** -** Outputs: -** address of a vmmCntrlEntry or 0 if not found ------------------------------------------------------------------------*/ - -static vmmCntrlEntry *vmm_get_entry( - thread_t act, - vmm_thread_index_t index) -{ - vmmCntrlTable *CTable; - vmmCntrlEntry *CEntry; - - index = index & vmmTInum; /* Clean up the index */ - - if 
(act->machine.vmmControl == 0) return NULL; /* No control table means no vmm */ - if ((index - 1) >= kVmmMaxContexts) return NULL; /* Index not in range */ - - CTable = act->machine.vmmControl; /* Make the address a bit more convienient */ - CEntry = &CTable->vmmc[index - 1]; /* Point to the entry */ - - if (!(CEntry->vmmFlags & vmmInUse)) return NULL; /* See if the slot is actually in use */ - - return CEntry; -} - -/*----------------------------------------------------------------------- -** vmm_get_adsp -** -** This function verifies and returns the pmap for an address space. -** If there is none and the request is valid, a pmap will be created. -** -** Inputs: -** act - pointer to current thread activation -** index - index into vmm control table (this is a "one based" value) -** -** Outputs: -** address of a pmap or 0 if not found or could no be created -** Note that if there is no pmap for the address space it will be created. ------------------------------------------------------------------------*/ - -static pmap_t vmm_get_adsp(thread_t act, vmm_thread_index_t index) -{ - pmap_t pmap; - - if (act->machine.vmmControl == 0) return NULL; /* No control table means no vmm */ - if ((index - 1) >= kVmmMaxContexts) return NULL; /* Index not in range */ - - pmap = act->machine.vmmControl->vmmAdsp[index - 1]; /* Get the pmap */ - return (pmap); /* and return it. */ -} - -/*----------------------------------------------------------------------- -** vmm_build_shadow_hash -** -** Allocate and initialize a shadow hash table. -** -** This function assumes that PAGE_SIZE is 4k-bytes. 
-** ------------------------------------------------------------------------*/ -static pmap_vmm_ext *vmm_build_shadow_hash(pmap_t pmap) -{ - pmap_vmm_ext *ext; /* VMM pmap extension we're building */ - ppnum_t extPP; /* VMM pmap extension physical page number */ - kern_return_t ret; /* Return code from various calls */ - uint32_t pages = GV_HPAGES; /* Number of pages in the hash table */ - vm_offset_t free = VMX_HPIDX_OFFSET; /* Offset into extension page of free area (128-byte aligned) */ - uint32_t freeSize = PAGE_SIZE - free; /* Number of free bytes in the extension page */ - uint32_t idx; - - if ((pages * sizeof(addr64_t)) + (pages * sizeof(vm_offset_t)) > freeSize) { - panic("vmm_build_shadow_hash: too little pmap_vmm_ext free space\n"); - } - - ret = kmem_alloc_kobject(kernel_map, (vm_offset_t *)&ext, PAGE_SIZE); - /* Allocate a page-sized extension block */ - if (ret != KERN_SUCCESS) return (NULL); /* Return NULL for failed allocate */ - bzero((char *)ext, PAGE_SIZE); /* Zero the entire extension block page */ - - extPP = pmap_find_phys(kernel_pmap, (vm_offset_t)ext); - /* Get extension block's physical page number */ - if (!extPP) { /* This should not fail, but then again... 
*/ - panic("vmm_build_shadow_hash: could not translate pmap_vmm_ext vaddr %p\n", ext); - } - - ext->vmxSalt = (addr64_t)(vm_offset_t)ext ^ ptoa_64(extPP); - /* Set effective<->physical conversion salt */ - ext->vmxHostPmapPhys = (addr64_t)(vm_offset_t)pmap ^ pmap->pmapvr; - /* Set host pmap's physical address */ - ext->vmxHostPmap = pmap; /* Set host pmap's effective address */ - ext->vmxHashPgIdx = (addr64_t *)((vm_offset_t)ext + VMX_HPIDX_OFFSET); - /* Allocate physical index */ - ext->vmxHashPgList = (vm_offset_t *)((vm_offset_t)ext + VMX_HPLIST_OFFSET); - /* Allocate page list */ - ext->vmxActiveBitmap = (vm_offset_t *)((vm_offset_t)ext + VMX_ACTMAP_OFFSET); - /* Allocate active mapping bitmap */ - - /* The hash table is typically larger than a single page, but we don't require it to be in a - contiguous virtual or physical chunk. So, we allocate it page by page, noting the effective and - physical address of each page in vmxHashPgList and vmxHashPgIdx, respectively. */ - for (idx = 0; idx < pages; idx++) { - mapping_t *map; - uint32_t mapIdx; - ret = kmem_alloc_kobject(kernel_map, &ext->vmxHashPgList[idx], PAGE_SIZE); - /* Allocate a hash-table page */ - if (ret != KERN_SUCCESS) goto fail; /* Allocation failed, exit through cleanup */ - bzero((char *)ext->vmxHashPgList[idx], PAGE_SIZE); /* Zero the page */ - ext->vmxHashPgIdx[idx] = ptoa_64(pmap_find_phys(kernel_pmap, (addr64_t)ext->vmxHashPgList[idx])); - /* Put page's physical address into index */ - if (!ext->vmxHashPgIdx[idx]) { /* Hash-table page's LRA failed */ - panic("vmm_build_shadow_hash: could not translate hash-table vaddr %08X\n", ext->vmxHashPgList[idx]); - } - map = (mapping_t *)ext->vmxHashPgList[idx]; - for (mapIdx = 0; mapIdx < GV_SLTS_PPG; mapIdx++) { /* Iterate over mappings in this page */ - map->mpFlags = (mpGuest | mpgFree); /* Mark guest type and free */ - map = (mapping_t *)((char *)map + GV_SLOT_SZ); /* Next slot-sized mapping */ - } - } - - return (ext); /* Return newly-minted VMM 
pmap extension */ - -fail: - for (idx = 0; idx < pages; idx++) { /* De-allocate any pages we managed to allocate */ - if (ext->vmxHashPgList[idx]) { - kmem_free(kernel_map, ext->vmxHashPgList[idx], PAGE_SIZE); - } - } - kmem_free(kernel_map, (vm_offset_t)ext, PAGE_SIZE); /* Release the VMM pmap extension page */ - return (NULL); /* Return NULL for failure */ -} - - -/*----------------------------------------------------------------------- -** vmm_release_shadow_hash -** -** Release shadow hash table and VMM extension block -** ------------------------------------------------------------------------*/ -static void vmm_release_shadow_hash(pmap_vmm_ext *ext) -{ - uint32_t idx; - - for (idx = 0; idx < GV_HPAGES; idx++) { /* Release the hash table page by page */ - kmem_free(kernel_map, ext->vmxHashPgList[idx], PAGE_SIZE); - } - - kmem_free(kernel_map, (vm_offset_t)ext, PAGE_SIZE); /* Release the VMM pmap extension page */ -} - -/*----------------------------------------------------------------------- -** vmm_activate_gsa -** -** Activate guest shadow assist -** ------------------------------------------------------------------------*/ -static kern_return_t vmm_activate_gsa( - thread_t act, - vmm_thread_index_t index) -{ - vmmCntrlTable *CTable = act->machine.vmmControl; /* Get VMM control table */ - vmmCntrlEntry *CEntry; - pmap_t hpmap; - pmap_t gpmap; - if (!CTable) { /* Caller guarantees that this will work */ - panic("vmm_activate_gsa: VMM control table not present; act = %p, idx = %lu\n", - act, index); - return KERN_FAILURE; - } - CEntry = vmm_get_entry(act, index); /* Get context from index */ - if (!CEntry) { /* Caller guarantees that this will work */ - panic("vmm_activate_gsa: Unexpected failure of vmm_get_entry; act = %p, idx = %lu\n", - act, index); - return KERN_FAILURE; - } - - hpmap = act->map->pmap; /* Get host pmap */ - gpmap = vmm_get_adsp(act, index); /* Get guest pmap */ - if (!gpmap) { /* Caller guarantees that this will work */ - 
panic("vmm_activate_gsa: Unexpected failure of vmm_get_adsp; act = %p, idx = %lu\n", - act, index); - return KERN_FAILURE; - } - - if (!hpmap->pmapVmmExt) { /* If there's no VMM extension for this host, create one */ - hpmap->pmapVmmExt = vmm_build_shadow_hash(hpmap); /* Build VMM extension plus shadow hash and attach */ - if (hpmap->pmapVmmExt) { /* See if we succeeded */ - hpmap->pmapVmmExtPhys = (addr64_t)(vm_offset_t)hpmap->pmapVmmExt ^ hpmap->pmapVmmExt->vmxSalt; - /* Get VMM extensions block physical address */ - } else { - return KERN_RESOURCE_SHORTAGE; /* Not enough mojo to go */ - } - } - gpmap->pmapVmmExt = hpmap->pmapVmmExt; /* Copy VMM extension block virtual address into guest */ - gpmap->pmapVmmExtPhys = hpmap->pmapVmmExtPhys; /* and its physical address, too */ - gpmap->pmapFlags |= pmapVMgsaa; /* Enable GSA for this guest */ - CEntry->vmmXAFlgs |= vmmGSA; /* Show GSA active here, too */ - - return KERN_SUCCESS; -} - - -/*----------------------------------------------------------------------- -** vmm_deactivate_gsa -** -** Deactivate guest shadow assist -** ------------------------------------------------------------------------*/ -static void -vmm_deactivate_gsa( - thread_t act, - vmm_thread_index_t index) -{ - vmmCntrlEntry *CEntry = vmm_get_entry(act, index); /* Get context from index */ - pmap_t gpmap; - if (!CEntry) { /* Caller guarantees that this will work */ - panic("vmm_deactivate_gsa: Unexpected failure of vmm_get_entry; act = %p, idx = %lu\n", - act, index); - } - - gpmap = vmm_get_adsp(act, index); /* Get guest pmap */ - if (!gpmap) { /* Caller guarantees that this will work */ - panic("vmm_deactivate_gsa: Unexpected failure of vmm_get_adsp; act = %p, idx = %lu\n", - act, index); - } - - gpmap->pmapFlags &= ~pmapVMgsaa; /* Deactivate GSA for this guest */ - CEntry->vmmXAFlgs &= ~vmmGSA; /* Show GSA deactivated here, too */ -} - - -/*----------------------------------------------------------------------- -** vmm_flush_context -** -** Flush 
specified guest context, purging all guest mappings and clearing -** the context page. -** ------------------------------------------------------------------------*/ -static void vmm_flush_context( - thread_t act, - vmm_thread_index_t index) -{ - vmmCntrlEntry *CEntry; - vmmCntrlTable *CTable; - vmm_state_page_t *vks; - vmm_version_t version; - - CEntry = vmm_get_entry(act, index); /* Convert index to entry */ - if (!CEntry) { /* Caller guarantees that this will work */ - panic("vmm_flush_context: Unexpected failure of vmm_get_entry; act = %p, idx = %lu\n", - act, index); - return; - } - - if(CEntry->vmmFacCtx.FPUsave) { /* Is there any floating point context? */ - toss_live_fpu(&CEntry->vmmFacCtx); /* Get rid of any live context here */ - save_release((struct savearea *)CEntry->vmmFacCtx.FPUsave); /* Release it */ - } - - if(CEntry->vmmFacCtx.VMXsave) { /* Is there any vector context? */ - toss_live_vec(&CEntry->vmmFacCtx); /* Get rid of any live context here */ - save_release((struct savearea *)CEntry->vmmFacCtx.VMXsave); /* Release it */ - } - - vmm_unmap_all_pages(act, index); /* Blow away all mappings for this context */ - - CTable = act->machine.vmmControl; /* Get the control table address */ - CTable->vmmGFlags = CTable->vmmGFlags & ~vmmLastAdSp; /* Make sure we don't try to automap into this */ - - CEntry->vmmFlags &= vmmInUse; /* Clear out all of the flags for this entry except in use */ - CEntry->vmmFacCtx.FPUsave = NULL; /* Clear facility context control */ - CEntry->vmmFacCtx.FPUlevel = NULL; /* Clear facility context control */ - CEntry->vmmFacCtx.FPUcpu = 0; /* Clear facility context control */ - CEntry->vmmFacCtx.VMXsave = NULL; /* Clear facility context control */ - CEntry->vmmFacCtx.VMXlevel = NULL; /* Clear facility context control */ - CEntry->vmmFacCtx.VMXcpu = 0; /* Clear facility context control */ - - vks = CEntry->vmmContextKern; /* Get address of the context page */ - version = vks->interface_version; /* Save the version code */ - 
bzero((char *)vks, 4096); /* Clear all */ - - vks->interface_version = version; /* Set our version code */ - vks->thread_index = index % vmmTInum; /* Tell the user the index for this virtual machine */ - - /* Context is now flushed */ -} - - -/************************************************************************************* - Virtual Machine Monitor Exported Functionality - - The following routines are used to implement a quick-switch mechanism for - virtual machines that need to execute within their own processor envinroment - (including register and MMU state). -**************************************************************************************/ - -/*----------------------------------------------------------------------- -** vmm_get_version -** -** This function returns the current version of the virtual machine -** interface. It is divided into two portions. The top 16 bits -** represent the major version number, and the bottom 16 bits -** represent the minor version number. Clients using the Vmm -** functionality should make sure they are using a verison new -** enough for them. -** -** Inputs: -** none -** -** Outputs: -** 32-bit number representing major/minor version of -** the Vmm module ------------------------------------------------------------------------*/ - -int vmm_get_version(struct savearea *save) -{ - save->save_r3 = kVmmCurrentVersion; /* Return the version */ - return 1; -} - - -/*----------------------------------------------------------------------- -** Vmm_get_features -** -** This function returns a set of flags that represents the functionality -** supported by the current verison of the Vmm interface. Clients should -** use this to determine whether they can run on this system. 
-** -** Inputs: -** none -** -** Outputs: -** 32-bit number representing functionality supported by this -** version of the Vmm module ------------------------------------------------------------------------*/ - -int vmm_get_features(struct savearea *save) -{ - save->save_r3 = kVmmCurrentFeatures; /* Return the features */ - if(getPerProc()->pf.Available & pf64Bit) { - save->save_r3 &= ~kVmmFeature_LittleEndian; /* No little endian here */ - save->save_r3 |= kVmmFeature_SixtyFourBit; /* Set that we can do 64-bit */ - } - return 1; -} - - -/*----------------------------------------------------------------------- -** vmm_max_addr -** -** This function returns the maximum addressable virtual address sported -** -** Outputs: -** Returns max address ------------------------------------------------------------------------*/ - -addr64_t -vmm_max_addr(__unused thread_t act) -{ - return vm_max_address; /* Return the maximum address */ -} - -/*----------------------------------------------------------------------- -** vmm_get_XA -** -** This function retrieves the eXtended Architecture flags for the specifed VM. -** -** We need to return the result in the return code rather than in the return parameters -** because we need an architecture independent format so the results are actually -** usable by the host. For example, the return parameters for 64-bit are 8 bytes wide vs. -** 4 for 32-bit. -** -** -** Inputs: -** act - pointer to current thread activation structure -** index - index returned by vmm_init_context -** -** Outputs: -** Return code is set to the XA flags. If the index is invalid or the -** context has not been created, we return 0. 
------------------------------------------------------------------------*/ - -unsigned int vmm_get_XA( - thread_t act, - vmm_thread_index_t index) -{ - vmmCntrlEntry *CEntry; - - CEntry = vmm_get_entry(act, index); /* Convert index to entry */ - if (CEntry == NULL) return 0; /* Either this isn't a vmm or the index is bogus */ - - return CEntry->vmmXAFlgs; /* Return the flags */ -} - -/*----------------------------------------------------------------------- -** vmm_init_context -** -** This function initializes an emulation context. It allocates -** a new pmap (address space) and fills in the initial processor -** state within the specified structure. The structure, mapped -** into the client's logical address space, must be page-aligned. -** -** Inputs: -** act - pointer to current thread activation -** version - requested version of the Vmm interface (allowing -** future versions of the interface to change, but still -** support older clients) -** vmm_user_state - pointer to a logical page within the -** client's address space -** -** Outputs: -** kernel return code indicating success or failure ------------------------------------------------------------------------*/ - -int vmm_init_context(struct savearea *save) -{ - - thread_t act; - vmm_version_t version; - vmm_state_page_t * vmm_user_state; - vmmCntrlTable *CTable; - vm_offset_t conkern; - vmm_state_page_t * vks; - ppnum_t conphys; - kern_return_t ret; - int cvi, i; - task_t task; - thread_t fact, gact; - pmap_t hpmap; - pmap_t gpmap; - - vmm_user_state = CAST_DOWN(vmm_state_page_t *, save->save_r4); /* Get the user address of the comm area */ - if ((unsigned int)vmm_user_state & (PAGE_SIZE - 1)) { /* Make sure the comm area is page aligned */ - save->save_r3 = KERN_FAILURE; /* Return failure */ - return 1; - } - - /* Make sure that the version requested is supported */ - version = save->save_r3; /* Pick up passed in version */ - if (((version >> 16) < kVmmMinMajorVersion) || ((version >> 16) > 
(kVmmCurrentVersion >> 16))) { - save->save_r3 = KERN_FAILURE; /* Return failure */ - return 1; - } - - if((version & 0xFFFF) > kVmmCurMinorVersion) { /* Check for valid minor */ - save->save_r3 = KERN_FAILURE; /* Return failure */ - return 1; - } - - act = current_thread(); /* Pick up our activation */ - - ml_set_interrupts_enabled(TRUE); /* This can take a bit of time so pass interruptions */ - - task = current_task(); /* Figure out who we are */ - - task_lock(task); /* Lock our task */ - - fact = (thread_t)task->threads.next; /* Get the first activation on task */ - gact = NULL; /* Pretend we didn't find it yet */ - - for(i = 0; i < task->thread_count; i++) { /* All of the activations */ - if(fact->machine.vmmControl) { /* Is this a virtual machine monitor? */ - gact = fact; /* Yeah... */ - break; /* Bail the loop... */ - } - fact = (thread_t)fact->task_threads.next; /* Go to the next one */ - } - - -/* - * We only allow one thread per task to be a virtual machine monitor right now. This solves - * a number of potential problems that I can't put my finger on right now. - * - * Utlimately, I think we want to move the controls and make all this task based instead of - * thread based. That would allow an emulator architecture to spawn a kernel thread for each - * VM (if they want) rather than hand dispatch contexts. - */ - - if(gact && (gact != act)) { /* Check if another thread is a vmm or trying to be */ - task_unlock(task); /* Release task lock */ - ml_set_interrupts_enabled(FALSE); /* Set back interruptions */ - save->save_r3 = KERN_FAILURE; /* We must play alone... 
*/ - return 1; - } - - if(!gact) act->machine.vmmControl = (vmmCntrlTable *)1; /* Temporarily mark that we are the vmm thread */ - - task_unlock(task); /* Safe to release now (because we've marked ourselves) */ - - CTable = act->machine.vmmControl; /* Get the control table address */ - if ((unsigned int)CTable == 1) { /* If we are marked, try to allocate a new table, otherwise we have one */ - if(!(CTable = (vmmCntrlTable *)kalloc(sizeof(vmmCntrlTable)))) { /* Get a fresh emulation control table */ - act->machine.vmmControl = NULL; /* Unmark us as vmm 'cause we failed */ - ml_set_interrupts_enabled(FALSE); /* Set back interruptions */ - save->save_r3 = KERN_RESOURCE_SHORTAGE; /* No storage... */ - return 1; - } - - bzero((void *)CTable, sizeof(vmmCntrlTable)); /* Clean it up */ - act->machine.vmmControl = CTable; /* Initialize the table anchor */ - } - - for(cvi = 0; cvi < kVmmMaxContexts; cvi++) { /* Search to find a free slot */ - if(!(CTable->vmmc[cvi].vmmFlags & vmmInUse)) break; /* Bail if we find an unused slot */ - } - - if(cvi >= kVmmMaxContexts) { /* Did we find one? */ - ml_set_interrupts_enabled(FALSE); /* Set back interruptions */ - save->save_r3 = KERN_RESOURCE_SHORTAGE; /* No empty slots... */ - return 1; - } - - ret = vm_map_wire( /* Wire the virtual machine monitor's context area */ - act->map, - (vm_offset_t)vmm_user_state, - (vm_offset_t)vmm_user_state + PAGE_SIZE, - VM_PROT_READ | VM_PROT_WRITE, - FALSE); - - if (ret != KERN_SUCCESS) /* The wire failed, return the code */ - goto return_in_shame; - - /* Map the vmm state into the kernel's address space. */ - conphys = pmap_find_phys(act->map->pmap, (addr64_t)((uintptr_t)vmm_user_state)); - - /* Find a virtual address to use. */ - ret = kmem_alloc_pageable(kernel_map, &conkern, PAGE_SIZE); - if (ret != KERN_SUCCESS) { /* Did we find an address? 
*/ - (void) vm_map_unwire(act->map, /* No, unwire the context area */ - (vm_offset_t)vmm_user_state, - (vm_offset_t)vmm_user_state + PAGE_SIZE, - TRUE); - goto return_in_shame; - } - - /* Map it into the kernel's address space. */ - - pmap_enter(kernel_pmap, conkern, conphys, - VM_PROT_READ | VM_PROT_WRITE, - VM_WIMG_USE_DEFAULT, TRUE); - - /* Clear the vmm state structure. */ - vks = (vmm_state_page_t *)conkern; - bzero((char *)vks, PAGE_SIZE); - - - /* We're home free now. Simply fill in the necessary info and return. */ - - vks->interface_version = version; /* Set our version code */ - vks->thread_index = cvi + 1; /* Tell the user the index for this virtual machine */ - - CTable->vmmc[cvi].vmmFlags = vmmInUse; /* Mark the slot in use and make sure the rest are clear */ - CTable->vmmc[cvi].vmmContextKern = vks; /* Remember the kernel address of comm area */ - CTable->vmmc[cvi].vmmContextPhys = conphys; /* Remember the state page physical addr */ - CTable->vmmc[cvi].vmmContextUser = vmm_user_state; /* Remember user address of comm area */ - - CTable->vmmc[cvi].vmmFacCtx.FPUsave = NULL; /* Clear facility context control */ - CTable->vmmc[cvi].vmmFacCtx.FPUlevel = NULL; /* Clear facility context control */ - CTable->vmmc[cvi].vmmFacCtx.FPUcpu = 0; /* Clear facility context control */ - CTable->vmmc[cvi].vmmFacCtx.VMXsave = NULL; /* Clear facility context control */ - CTable->vmmc[cvi].vmmFacCtx.VMXlevel = NULL; /* Clear facility context control */ - CTable->vmmc[cvi].vmmFacCtx.VMXcpu = 0; /* Clear facility context control */ - CTable->vmmc[cvi].vmmFacCtx.facAct = act; /* Point back to the activation */ - - (void)hw_atomic_add(&saveanchor.savetarget, 2); /* Account for the number of extra saveareas we think we might "need" */ - - hpmap = act->map->pmap; /* Get host pmap */ - gpmap = pmap_create(0, FALSE); /* Make a fresh guest pmap */ - if (gpmap) { /* Did we succeed ? 
*/ - CTable->vmmAdsp[cvi] = gpmap; /* Remember guest pmap for new context */ - if (lowGlo.lgVMMforcedFeats & vmmGSA) { /* Forcing on guest shadow assist ? */ - vmm_activate_gsa(act, cvi+1); /* Activate GSA */ - } - } else { - ret = KERN_RESOURCE_SHORTAGE; /* We've failed to allocate a guest pmap */ - goto return_in_shame; /* Shame on us. */ - } - - if (!(hpmap->pmapFlags & pmapVMhost)) { /* Do this stuff if this is our first time hosting */ - hpmap->pmapFlags |= pmapVMhost; /* We're now hosting */ - } - - ml_set_interrupts_enabled(FALSE); /* Set back interruptions */ - save->save_r3 = KERN_SUCCESS; /* Hip, hip, horay... */ - return 1; - -return_in_shame: - if(!gact) kfree(CTable, sizeof(vmmCntrlTable)); /* Toss the table if we just allocated it */ - act->machine.vmmControl = NULL; /* Unmark us as vmm 'cause we failed */ - ml_set_interrupts_enabled(FALSE); /* Set back interruptions */ - save->save_r3 = ret; /* Pass back return code... */ - return 1; - -} - - -/*----------------------------------------------------------------------- -** vmm_tear_down_context -** -** This function uninitializes an emulation context. It deallocates -** internal resources associated with the context block. -** -** Inputs: -** act - pointer to current thread activation structure -** index - index returned by vmm_init_context -** -** Outputs: -** kernel return code indicating success or failure -** -** Strangeness note: -** This call will also trash the address space with the same ID. While this -** is really not too cool, we have to do it because we need to make -** sure that old VMM users (not that we really have any) who depend upon -** the address space going away with the context still work the same. 
------------------------------------------------------------------------*/ - -kern_return_t vmm_tear_down_context( - thread_t act, - vmm_thread_index_t index) -{ - vmmCntrlEntry *CEntry; - vmmCntrlTable *CTable; - int cvi; - pmap_t gpmap; - pmap_t pmap; - - CEntry = vmm_get_entry(act, index); /* Convert index to entry */ - if (CEntry == NULL) return KERN_FAILURE; /* Either this isn't vmm thread or the index is bogus */ - - ml_set_interrupts_enabled(TRUE); /* This can take a bit of time so pass interruptions */ - - (void)hw_atomic_sub(&saveanchor.savetarget, 2); /* We don't need these extra saveareas anymore */ - - if(CEntry->vmmFacCtx.FPUsave) { /* Is there any floating point context? */ - toss_live_fpu(&CEntry->vmmFacCtx); /* Get rid of any live context here */ - save_release((struct savearea *)CEntry->vmmFacCtx.FPUsave); /* Release it */ - } - - if(CEntry->vmmFacCtx.VMXsave) { /* Is there any vector context? */ - toss_live_vec(&CEntry->vmmFacCtx); /* Get rid of any live context here */ - save_release((struct savearea *)CEntry->vmmFacCtx.VMXsave); /* Release it */ - } - - CEntry->vmmPmap = NULL; /* Remove this trace */ - gpmap = act->machine.vmmControl->vmmAdsp[index - 1]; - /* Get context's guest pmap (if any) */ - if (gpmap) { /* Check if there is an address space assigned here */ - if (gpmap->pmapFlags & pmapVMgsaa) { /* Handle guest shadow assist case specially */ - hw_rem_all_gv(gpmap); /* Remove all guest mappings from shadow hash table */ - } else { - mapping_remove(gpmap, 0xFFFFFFFFFFFFF000LL);/* Remove final page explicitly because we might have mapped it */ - pmap_remove(gpmap, 0, 0xFFFFFFFFFFFFF000LL);/* Remove all entries from this map */ - } - pmap_destroy(gpmap); /* Toss the pmap for this context */ - act->machine.vmmControl->vmmAdsp[index - 1] = NULL; /* Clean it up */ - } - - (void) vm_map_unwire( /* Unwire the user comm page */ - act->map, - (vm_offset_t)CEntry->vmmContextUser, - (vm_offset_t)CEntry->vmmContextUser + PAGE_SIZE, - FALSE); - - 
kmem_free(kernel_map, (vm_offset_t)CEntry->vmmContextKern, PAGE_SIZE); /* Remove kernel's view of the comm page */ - - CTable = act->machine.vmmControl; /* Get the control table address */ - CTable->vmmGFlags = CTable->vmmGFlags & ~vmmLastAdSp; /* Make sure we don't try to automap into this */ - - CEntry->vmmFlags = 0; /* Clear out all of the flags for this entry including in use */ - CEntry->vmmContextKern = NULL; /* Clear the kernel address of comm area */ - CEntry->vmmContextUser = NULL; /* Clear the user address of comm area */ - - CEntry->vmmFacCtx.FPUsave = NULL; /* Clear facility context control */ - CEntry->vmmFacCtx.FPUlevel = NULL; /* Clear facility context control */ - CEntry->vmmFacCtx.FPUcpu = 0; /* Clear facility context control */ - CEntry->vmmFacCtx.VMXsave = NULL; /* Clear facility context control */ - CEntry->vmmFacCtx.VMXlevel = NULL; /* Clear facility context control */ - CEntry->vmmFacCtx.VMXcpu = 0; /* Clear facility context control */ - CEntry->vmmFacCtx.facAct = NULL; /* Clear facility context control */ - - for(cvi = 0; cvi < kVmmMaxContexts; cvi++) { /* Search to find a free slot */ - if(CTable->vmmc[cvi].vmmFlags & vmmInUse) { /* Return if there are still some in use */ - ml_set_interrupts_enabled(FALSE); /* No more interruptions */ - return KERN_SUCCESS; /* Leave... 
*/ - } - } - -/* - * When we have tossed the last context, toss any address spaces left over before releasing - * the VMM control block - */ - - for(cvi = 1; cvi <= kVmmMaxContexts; cvi++) { /* Look at all slots */ - if(!act->machine.vmmControl->vmmAdsp[index - 1]) continue; /* Nothing to remove here */ - mapping_remove(act->machine.vmmControl->vmmAdsp[index - 1], 0xFFFFFFFFFFFFF000LL); /* Remove final page explicitly because we might have mapped it */ - pmap_remove(act->machine.vmmControl->vmmAdsp[index - 1], 0, 0xFFFFFFFFFFFFF000LL); /* Remove all entries from this map */ - pmap_destroy(act->machine.vmmControl->vmmAdsp[index - 1]); /* Toss the pmap for this context */ - act->machine.vmmControl->vmmAdsp[index - 1] = NULL; /* Clear just in case */ - } - - pmap = act->map->pmap; /* Get our pmap */ - if (pmap->pmapVmmExt) { /* Release any VMM pmap extension block and shadow hash table */ - vmm_release_shadow_hash(pmap->pmapVmmExt); /* Release extension block and shadow hash table */ - pmap->pmapVmmExt = NULL; /* Forget extension block */ - pmap->pmapVmmExtPhys = 0; /* Forget extension block's physical address, too */ - } - pmap->pmapFlags &= ~pmapVMhost; /* We're no longer hosting */ - - kfree(CTable, sizeof(vmmCntrlTable)); /* Toss the table because to tossed the last context */ - act->machine.vmmControl = NULL; /* Unmark us as vmm */ - - ml_set_interrupts_enabled(FALSE); /* No more interruptions */ - - return KERN_SUCCESS; -} - - -/*----------------------------------------------------------------------- -** vmm_activate_XA -** -** This function activates the eXtended Architecture flags for the specifed VM. -** -** We need to return the result in the return code rather than in the return parameters -** because we need an architecture independent format so the results are actually -** usable by the host. For example, the return parameters for 64-bit are 8 bytes wide vs. -** 4 for 32-bit. 
-** -** Note that this function does a lot of the same stuff as vmm_tear_down_context -** and vmm_init_context. -** -** Inputs: -** act - pointer to current thread activation structure -** index - index returned by vmm_init_context -** flags - the extended architecture flags -** -** -** Outputs: -** KERN_SUCCESS if vm is valid and initialized. KERN_FAILURE if not. -** Also, the internal flags are set and, additionally, the VM is completely reset. ------------------------------------------------------------------------*/ -kern_return_t vmm_activate_XA( - thread_t act, - vmm_thread_index_t index, - unsigned int xaflags) -{ - vmmCntrlEntry *CEntry; - kern_return_t result = KERN_SUCCESS; /* Assume success */ - - if ((xaflags & ~kVmmSupportedSetXA) || ((xaflags & vmm64Bit) && (!getPerProc()->pf.Available & pf64Bit))) - return (KERN_FAILURE); /* Unknown or unsupported feature requested */ - - CEntry = vmm_get_entry(act, index); /* Convert index to entry */ - if (CEntry == NULL) return KERN_FAILURE; /* Either this isn't a vmm or the index is bogus */ - - ml_set_interrupts_enabled(TRUE); /* This can take a bit of time so pass interruptions */ - - vmm_flush_context(act, index); /* Flush the context */ - - if (xaflags & vmm64Bit) { /* Activating 64-bit mode ? */ - CEntry->vmmXAFlgs |= vmm64Bit; /* Activate 64-bit mode */ - } - - if (xaflags & vmmGSA) { /* Activating guest shadow assist ? 
*/ - result = vmm_activate_gsa(act, index); /* Activate guest shadow assist */ - } - - ml_set_interrupts_enabled(FALSE); /* No more interruptions */ - - return result; /* Return activate result */ -} - -/*----------------------------------------------------------------------- -** vmm_deactivate_XA -** ------------------------------------------------------------------------*/ -kern_return_t vmm_deactivate_XA( - thread_t act, - vmm_thread_index_t index, - unsigned int xaflags) -{ - vmmCntrlEntry *CEntry; - kern_return_t result = KERN_SUCCESS; /* Assume success */ - - if ((xaflags & ~kVmmSupportedSetXA) || ((xaflags & vmm64Bit) && (getPerProc()->pf.Available & pf64Bit))) - return (KERN_FAILURE); /* Unknown or unsupported feature requested */ - - CEntry = vmm_get_entry(act, index); /* Convert index to entry */ - if (CEntry == NULL) return KERN_FAILURE; /* Either this isn't a vmm or the index is bogus */ - - ml_set_interrupts_enabled(TRUE); /* This can take a bit of time so pass interruptions */ - - vmm_flush_context(act, index); /* Flush the context */ - - if (xaflags & vmm64Bit) { /* Deactivating 64-bit mode ? */ - CEntry->vmmXAFlgs &= ~vmm64Bit; /* Deactivate 64-bit mode */ - } - - if (xaflags & vmmGSA) { /* Deactivating guest shadow assist ? */ - vmm_deactivate_gsa(act, index); /* Deactivate guest shadow assist */ - } - - ml_set_interrupts_enabled(FALSE); /* No more interruptions */ - - return result; /* Return deactivate result */ -} - - -/*----------------------------------------------------------------------- -** vmm_tear_down_all -** -** This function uninitializes all emulation contexts. If there are -** any vmm contexts, it calls vmm_tear_down_context for each one. -** -** Note: this can also be called from normal thread termination. Because of -** that, we will context switch out of an alternate if we are currenty in it. -** It will be terminated with no valid return code set because we don't expect -** the activation to ever run again. 
-** -** Inputs: -** activation to tear down -** -** Outputs: -** All vmm contexts released and VMM shut down ------------------------------------------------------------------------*/ -void vmm_tear_down_all(thread_t act) { - - vmmCntrlTable *CTable; - int cvi; - kern_return_t ret; - struct savearea *save; - spl_t s; - - if(act->machine.specFlags & runningVM) { /* Are we actually in a context right now? */ - save = find_user_regs(act); /* Find the user state context */ - if(!save) { /* Did we find it? */ - panic("vmm_tear_down_all: runningVM marked but no user state context\n"); - return; - } - - save->save_exception = kVmmBogusContext*4; /* Indicate that this context is bogus now */ - s = splhigh(); /* Make sure interrupts are off */ - vmm_force_exit(act, save); /* Force and exit from VM state */ - splx(s); /* Restore interrupts */ - } - - if(act->machine.vmmControl) { /* Do we have a vmm control block? */ - CTable = act->machine.vmmControl; - for(cvi = 1; cvi <= kVmmMaxContexts; cvi++) { /* Look at all slots */ - if(CTable->vmmc[cvi - 1].vmmFlags & vmmInUse) { /* Is this one in use */ - ret = vmm_tear_down_context(act, cvi); /* Take down the found context */ - if(ret != KERN_SUCCESS) { /* Did it go away? */ - panic("vmm_tear_down_all: vmm_tear_down_context failed; ret=%08X, act = %p, cvi = %d\n", - ret, act, cvi); - } - } - } - -/* - * Note that all address apces should be gone here. - */ - if(act->machine.vmmControl) { /* Did we find one? */ - panic("vmm_tear_down_all: control table did not get deallocated\n"); /* Table did not go away */ - } - } -} - -/*----------------------------------------------------------------------- -** vmm_map_page -** -** This function maps a page from within the client's logical -** address space into the alternate address space. -** -** The page need not be locked or resident. If not resident, it will be faulted -** in by this code, which may take some time. 
Also, if the page is not locked, -** it, and this mapping may disappear at any time, even before it gets used. Note also -** that reference and change information is NOT preserved when a page is unmapped, either -** explicitly or implicitly (e.g., a pageout, being unmapped in the non-alternate address -** space). This means that if RC is needed, the page MUST be wired. -** -** Note that if there is already a mapping at the address, it is removed and all -** information (including RC) is lost BEFORE an attempt is made to map it. Also, -** if the map call fails, the old address is still unmapped.. -** -** Inputs: -** act - pointer to current thread activation -** index - index of address space to map into -** va - virtual address within the client's address -** space -** ava - virtual address within the alternate address -** space -** prot - protection flags -** -** Note that attempted mapping of areas in nested pmaps (shared libraries) or block mapped -** areas are not allowed and will fail. Same with directly mapped I/O areas. -** -** Input conditions: -** Interrupts disabled (from fast trap) -** -** Outputs: -** kernel return code indicating success or failure -** if success, va resident and alternate mapping made ------------------------------------------------------------------------*/ - -kern_return_t vmm_map_page( - thread_t act, - vmm_adsp_id_t index, - addr64_t cva, - addr64_t ava, - vm_prot_t prot) -{ - kern_return_t ret; - register mapping_t *mp; - vm_map_t map; - addr64_t ova, nextva; - pmap_t pmap; - - pmap = vmm_get_adsp(act, index); /* Get the guest pmap for this address space */ - if(!pmap) return KERN_FAILURE; /* Bogus address space, no VMs, or we can't make a pmap, failure... */ - - if(ava > vm_max_address) return kVmmInvalidAddress; /* Does the machine support an address of this size? */ - - map = current_thread()->map; /* Get the host's map */ - - if (pmap->pmapFlags & pmapVMgsaa) { /* Guest shadow assist active ? 
*/ - ret = hw_res_map_gv(map->pmap, pmap, cva, ava, getProtPPC(prot, TRUE)); - /* Attempt to resume an existing gv->phys mapping */ - if (mapRtOK != ret) { /* Nothing to resume, construct a new mapping */ - unsigned int pindex; - phys_entry_t *physent; - unsigned int pattr; - unsigned int wimg; - unsigned int mflags; - addr64_t gva; - - while (1) { /* Find host mapping or fail */ - mp = mapping_find(map->pmap, cva, &nextva, 0); - /* Attempt to find host mapping and pin it */ - if (mp) break; /* Got it */ - - ml_set_interrupts_enabled(TRUE); - /* Open 'rupt window */ - ret = vm_fault(map, /* Didn't find it, try to fault in host page read/write */ - vm_map_trunc_page(cva), - VM_PROT_READ | VM_PROT_WRITE, - FALSE, /* change wiring */ - THREAD_UNINT, - NULL, - 0); - ml_set_interrupts_enabled(FALSE); - /* Close 'rupt window */ - if (ret != KERN_SUCCESS) - return KERN_FAILURE; /* Fault failed, return failure */ - } - - if (mpNormal != (mp->mpFlags & mpType)) { - /* Host mapping must be a vanilla page */ - mapping_drop_busy(mp); /* Un-pin host mapping */ - return KERN_FAILURE; /* Return failure */ - } - - /* Partially construct gv->phys mapping */ - physent = mapping_phys_lookup(mp->mpPAddr, &pindex); - if (!physent) { - mapping_drop_busy(mp); - return KERN_FAILURE; - } - pattr = ((physent->ppLink & (ppI | ppG)) >> 60); - wimg = 0x2; - if (pattr & mmFlgCInhib) wimg |= 0x4; - if (pattr & mmFlgGuarded) wimg |= 0x1; - mflags = (pindex << 16) | mpGuest; - gva = ((ava & ~mpHWFlags) | (wimg << 3) | getProtPPC(prot, TRUE)); - - hw_add_map_gv(map->pmap, pmap, gva, mflags, mp->mpPAddr); - /* Construct new guest->phys mapping */ - - mapping_drop_busy(mp); /* Un-pin host mapping */ - } - } else { - while(1) { /* Keep trying until we get it or until we fail */ - - mp = mapping_find(map->pmap, cva, &nextva, 0); /* Find the mapping for this address */ - - if(mp) break; /* We found it */ - - ml_set_interrupts_enabled(TRUE); /* Enable interruptions */ - ret = vm_fault(map, /* Didn't find 
it, try to fault it in read/write... */ - vm_map_trunc_page(cva), - VM_PROT_READ | VM_PROT_WRITE, - FALSE, /*change wiring */ - THREAD_UNINT, - NULL, - 0); - ml_set_interrupts_enabled(FALSE); /* Disable interruptions */ - if (ret != KERN_SUCCESS) return KERN_FAILURE; /* There isn't a page there, return... */ - } - - if((mp->mpFlags & mpType) != mpNormal) { /* If this is a block, a nest, or some other special thing, we can't map it */ - mapping_drop_busy(mp); /* We have everything we need from the mapping */ - return KERN_FAILURE; /* Leave in shame */ - } - - while(1) { /* Keep trying the enter until it goes in */ - ova = mapping_make(pmap, ava, mp->mpPAddr, 0, 1, prot); /* Enter the mapping into the pmap */ - if(!ova) break; /* If there were no collisions, we are done... */ - mapping_remove(pmap, ova); /* Remove the mapping that collided */ - } - - mapping_drop_busy(mp); /* We have everything we need from the mapping */ - } - - if (!((getPerProc()->spcFlags) & FamVMmode)) { - act->machine.vmmControl->vmmLastMap = ava & 0xFFFFFFFFFFFFF000ULL; /* Remember the last mapping we made */ - act->machine.vmmControl->vmmGFlags = (act->machine.vmmControl->vmmGFlags & ~vmmLastAdSp) | index; /* Remember last address space */ - } - - return KERN_SUCCESS; -} - - -/*----------------------------------------------------------------------- -** vmm_map_execute -** -** This function maps a page from within the client's logical -** address space into the alternate address space of the -** Virtual Machine Monitor context and then directly starts executing. -** -** See description of vmm_map_page for details. -** -** Inputs: -** Index is used for both the context and the address space ID. -** index[24:31] is the context id and index[16:23] is the address space. -** if the address space ID is 0, the context ID is used for it. -** -** Outputs: -** Normal exit is to run the VM. 
Abnormal exit is triggered via a -** non-KERN_SUCCESS return from vmm_map_page or later during the -** attempt to transition into the VM. ------------------------------------------------------------------------*/ - -vmm_return_code_t vmm_map_execute( - thread_t act, - vmm_thread_index_t index, - addr64_t cva, - addr64_t ava, - vm_prot_t prot) -{ - kern_return_t ret; - vmmCntrlEntry *CEntry; - unsigned int adsp; - vmm_thread_index_t cndx; - - cndx = index & 0xFF; /* Clean it up */ - - CEntry = vmm_get_entry(act, cndx); /* Get and validate the index */ - if (CEntry == NULL) return kVmmBogusContext; /* Return bogus context */ - - if (((getPerProc()->spcFlags) & FamVMmode) && (CEntry != act->machine.vmmCEntry)) - return kVmmBogusContext; /* Yes, invalid index in Fam */ - - adsp = (index >> 8) & 0xFF; /* Get any requested address space */ - if(!adsp) adsp = (index & 0xFF); /* If 0, use context ID as address space ID */ - - ret = vmm_map_page(act, adsp, cva, ava, prot); /* Go try to map the page on in */ - - - if(ret == KERN_SUCCESS) { - act->machine.vmmControl->vmmLastMap = ava & 0xFFFFFFFFFFFFF000ULL; /* Remember the last mapping we made */ - act->machine.vmmControl->vmmGFlags = (act->machine.vmmControl->vmmGFlags & ~vmmLastAdSp) | cndx; /* Remember last address space */ - vmm_execute_vm(act, cndx); /* Return was ok, launch the VM */ - } - - return ret; /* We had trouble mapping in the page */ - -} - -/*----------------------------------------------------------------------- -** vmm_map_list -** -** This function maps a list of pages into various address spaces -** -** Inputs: -** act - pointer to current thread activation -** index - index of default address space (used if not specifed in list entry -** count - number of pages to release -** flavor - 0 if 32-bit version, 1 if 64-bit -** vmcpComm in the comm page contains up to kVmmMaxMapPages to map -** -** Outputs: -** kernel return code indicating success or failure -** KERN_FAILURE is returned if kVmmMaxUnmapPages 
is exceeded -** or the vmm_map_page call fails. -** We return kVmmInvalidAddress if virtual address size is not supported ------------------------------------------------------------------------*/ - -kern_return_t vmm_map_list( - thread_t act, - vmm_adsp_id_t index, - unsigned int cnt, - unsigned int flavor) -{ - vmmCntrlEntry *CEntry; - boolean_t ret; - unsigned int i; - vmmMList *lst; - vmmMList64 *lstx; - addr64_t cva; - addr64_t ava; - vm_prot_t prot; - vmm_adsp_id_t adsp; - - CEntry = vmm_get_entry(act, index); /* Convert index to entry */ - if (CEntry == NULL) return KERN_FAILURE; /* Either this isn't a vmm or the index is bogus */ - - if(cnt > kVmmMaxMapPages) return KERN_FAILURE; /* They tried to map too many */ - if(!cnt) return KERN_SUCCESS; /* If they said none, we're done... */ - - lst = (vmmMList *)&((vmm_comm_page_t *)CEntry->vmmContextKern)->vmcpComm[0]; /* Point to the first entry */ - lstx = (vmmMList64 *)&((vmm_comm_page_t *)CEntry->vmmContextKern)->vmcpComm[0]; /* Point to the first entry */ - - for(i = 0; i < cnt; i++) { /* Step and release all pages in list */ - if(flavor) { /* Check if 32- or 64-bit addresses */ - cva = lstx[i].vmlva; /* Get the 64-bit actual address */ - ava = lstx[i].vmlava; /* Get the 64-bit guest address */ - } - else { - cva = lst[i].vmlva; /* Get the 32-bit actual address */ - ava = lst[i].vmlava; /* Get the 32-bit guest address */ - } - - prot = ava & vmmlProt; /* Extract the protection bits */ - adsp = (ava & vmmlAdID) >> 4; /* Extract an explicit address space request */ - if(!adsp) /* If no explicit, use supplied default */ - adsp = index - 1; - ava &= 0xFFFFFFFFFFFFF000ULL; /* Clean up the address */ - - ret = vmm_map_page(act, index, cva, ava, prot); /* Go try to map the page on in */ - if(ret != KERN_SUCCESS) /* Bail if any error */ - return ret; - } - - return KERN_SUCCESS; -} - -/*----------------------------------------------------------------------- -** vmm_get_page_mapping -** -** Given a context index and a 
guest virtual address, convert the address -** to its corresponding host virtual address. -** -** Inputs: -** act - pointer to current thread activation -** index - context index -** gva - guest virtual address -** -** Outputs: -** Host virtual address (page aligned) or -1 if not mapped or any failure -** -** Note: -** If the host address space contains multiple virtual addresses mapping -** to the physical address corresponding to the specified guest virtual -** address (i.e., host virtual aliases), it is unpredictable which host -** virtual address (alias) will be returned. Moral of the story: No host -** virtual aliases. ------------------------------------------------------------------------*/ - -addr64_t vmm_get_page_mapping( - thread_t act, - vmm_adsp_id_t index, - addr64_t gva) -{ - register mapping_t *mp; - pmap_t pmap; - addr64_t nextva, hva; - ppnum_t pa; - - pmap = vmm_get_adsp(act, index); /* Get and validate the index */ - if (!pmap)return -1; /* No good, failure... */ - - if (pmap->pmapFlags & pmapVMgsaa) { /* Guest shadow assist (GSA) active ? */ - return (hw_gva_to_hva(pmap, gva)); /* Convert guest to host virtual address */ - } else { - mp = mapping_find(pmap, gva, &nextva, 0); /* Find guest mapping for this virtual address */ - - if(!mp) return -1; /* Not mapped, return -1 */ - - pa = mp->mpPAddr; /* Remember the physical page address */ - - mapping_drop_busy(mp); /* Go ahead and relase the mapping now */ - - pmap = current_thread()->map->pmap; /* Get the host pmap */ - hva = mapping_p2v(pmap, pa); /* Now find the source virtual */ - - if(hva != 0) return hva; /* We found it... */ - - panic("vmm_get_page_mapping: could not back-map guest va (%016llX)\n", gva); - /* We are bad wrong if we can't find it */ - - return -1; /* Never executed, prevents compiler warning */ - } -} - -/*----------------------------------------------------------------------- -** vmm_unmap_page -** -** This function unmaps a page from the guest address space. 
-** -** Inputs: -** act - pointer to current thread activation -** index - index of vmm state for this page -** va - virtual address within the vmm's address -** space -** -** Outputs: -** kernel return code indicating success or failure ------------------------------------------------------------------------*/ - -kern_return_t vmm_unmap_page( - thread_t act, - vmm_adsp_id_t index, - addr64_t va) -{ - addr64_t nadd; - pmap_t pmap; - - pmap = vmm_get_adsp(act, index); /* Get and validate the index */ - if (!pmap)return -1; /* No good, failure... */ - - if (pmap->pmapFlags & pmapVMgsaa) { /* Handle guest shadow assist specially */ - hw_susp_map_gv(act->map->pmap, pmap, va); /* Suspend the mapping */ - return (KERN_SUCCESS); /* Always returns success */ - } else { - nadd = mapping_remove(pmap, va); /* Toss the mapping */ - - return ((nadd & 1) ? KERN_FAILURE : KERN_SUCCESS); /* Return... */ - } -} - -/*----------------------------------------------------------------------- -** vmm_unmap_list -** -** This function unmaps a list of pages from the alternate's logical -** address space. 
-** -** Inputs: -** act - pointer to current thread activation -** index - index of vmm state for this page -** count - number of pages to release -** flavor - 0 if 32-bit, 1 if 64-bit -** vmcpComm in the comm page contains up to kVmmMaxUnmapPages to unmap -** -** Outputs: -** kernel return code indicating success or failure -** KERN_FAILURE is returned if kVmmMaxUnmapPages is exceeded ------------------------------------------------------------------------*/ - -kern_return_t vmm_unmap_list( - thread_t act, - vmm_adsp_id_t index, - unsigned int cnt, - unsigned int flavor) -{ - vmmCntrlEntry *CEntry; - kern_return_t kern_result = KERN_SUCCESS; - unsigned int i; - addr64_t gva; - vmmUMList *lst; - vmmUMList64 *lstx; - pmap_t pmap; - int adsp; - - CEntry = vmm_get_entry(act, index); /* Convert index to entry */ - if (CEntry == NULL) { /* Either this isn't a vmm or the index is bogus */ - kern_result = KERN_FAILURE; - goto out; - } - - if(cnt > kVmmMaxUnmapPages) { /* They tried to unmap too many */ - kern_result = KERN_FAILURE; - goto out; - } - if(!cnt) { /* If they said none, we're done... */ - kern_result = KERN_SUCCESS; - goto out; - } - - lstx = (vmmUMList64 *) &((vmm_comm_page_t *)CEntry->vmmContextKern)->vmcpComm[0]; /* Point to the first entry */ - lst = (vmmUMList *)lstx; - - for(i = 0; i < cnt; i++) { /* Step and release all pages in list */ - if(flavor) { /* Check if 32- or 64-bit addresses */ - gva = lstx[i].vmlava; /* Get the 64-bit guest address */ - } - else { - gva = lst[i].vmlava; /* Get the 32-bit guest address */ - } - - adsp = (gva & vmmlAdID) >> 4; /* Extract an explicit address space request */ - if(!adsp) /* If no explicit, use supplied default */ - adsp = index - 1; - pmap = act->machine.vmmControl->vmmAdsp[adsp]; /* Get the pmap for this request */ - if(!pmap) - continue; /* Ain't nuthin' mapped here, no durn map... 
*/ - - gva &= 0xFFFFFFFFFFFFF000ULL; /* Clean up the address */ - if (pmap->pmapFlags & pmapVMgsaa) { /* Handle guest shadow assist specially */ - hw_susp_map_gv(act->map->pmap, pmap, gva); - /* Suspend the mapping */ - } else { - (void)mapping_remove(pmap, gva); /* Toss the mapping */ - } - } - -out: - return kern_result; -} - -/*----------------------------------------------------------------------- -** vmm_unmap_all_pages -** -** This function unmaps all pages from the alternates's logical -** address space. -** -** Inputs: -** act - pointer to current thread activation -** index - index of context state -** -** Outputs: -** none -** -** Note: -** All pages are unmapped, but the address space (i.e., pmap) is still alive ------------------------------------------------------------------------*/ - -void vmm_unmap_all_pages( - thread_t act, - vmm_adsp_id_t index) -{ - pmap_t pmap; - - pmap = vmm_get_adsp(act, index); /* Convert index to entry */ - if (!pmap) return; /* Either this isn't vmm thread or the index is bogus */ - - if (pmap->pmapFlags & pmapVMgsaa) { /* Handle guest shadow assist specially */ - hw_rem_all_gv(pmap); /* Remove all guest's mappings from shadow hash table */ - } else { - /* - * Note: the pmap code won't deal with the last page in the address space, so handle it explicitly - */ - mapping_remove(pmap, 0xFFFFFFFFFFFFF000LL); /* Remove final page explicitly because we might have mapped it */ - pmap_remove(pmap, 0, 0xFFFFFFFFFFFFF000LL); /* Remove all entries from this map */ - } -} - - -/*----------------------------------------------------------------------- -** vmm_get_page_dirty_flag -** -** This function returns the changed flag of the page -** and optionally clears clears the flag. 
-** -** Inputs: -** act - pointer to current thread activation -** index - index of vmm state for this page -** va - virtual address within the vmm's address -** space -** reset - Clears dirty if true, untouched if not -** -** Outputs: -** the dirty bit -** clears the dirty bit in the pte if requested -** -** Note: -** The RC bits are merged into the global physical entry ------------------------------------------------------------------------*/ - -boolean_t vmm_get_page_dirty_flag( - thread_t act, - vmm_adsp_id_t index, - addr64_t va, - unsigned int reset) -{ - unsigned int RC; - pmap_t pmap; - - pmap = vmm_get_adsp(act, index); /* Convert index to entry */ - if (!pmap) return 1; /* Either this isn't vmm thread or the index is bogus */ - - if (pmap->pmapFlags & pmapVMgsaa) { /* Handle guest shadow assist specially */ - RC = hw_test_rc_gv(act->map->pmap, pmap, va, reset);/* Fetch the RC bits and clear if requested */ - } else { - RC = hw_test_rc(pmap, (addr64_t)va, reset); /* Fetch the RC bits and clear if requested */ - } - - switch (RC & mapRetCode) { /* Decode return code */ - - case mapRtOK: /* Changed */ - return ((RC & (unsigned int)mpC) == (unsigned int)mpC); /* Return if dirty or not */ - break; - - case mapRtNotFnd: /* Didn't find it */ - return 1; /* Return dirty */ - break; - - default: - panic("vmm_get_page_dirty_flag: hw_test_rc failed - rc = %d, pmap = %p, va = %016llX\n", RC, pmap, va); - - } - - return 1; /* Return the change bit */ -} - - -/*----------------------------------------------------------------------- -** vmm_protect_page -** -** This function sets the protection bits of a mapped page -** -** Inputs: -** act - pointer to current thread activation -** index - index of vmm state for this page -** va - virtual address within the vmm's address -** space -** prot - Protection flags -** -** Outputs: -** none -** Protection bits of the mapping are modifed -** ------------------------------------------------------------------------*/ - 
-kern_return_t vmm_protect_page( - thread_t act, - vmm_adsp_id_t index, - addr64_t va, - vm_prot_t prot) -{ - addr64_t nextva; - int ret; - pmap_t pmap; - - pmap = vmm_get_adsp(act, index); /* Convert index to entry */ - if (!pmap) return KERN_FAILURE; /* Either this isn't vmm thread or the index is bogus */ - - if (pmap->pmapFlags & pmapVMgsaa) { /* Handle guest shadow assist specially */ - ret = hw_protect_gv(pmap, va, prot); /* Try to change protection, GSA varient */ - } else { - ret = hw_protect(pmap, va, prot, &nextva); /* Try to change protection */ - } - - switch (ret) { /* Decode return code */ - - case mapRtOK: /* All ok... */ - break; /* Outta here */ - - case mapRtNotFnd: /* Didn't find it */ - return KERN_SUCCESS; /* Ok, return... */ - break; - - default: - panic("vmm_protect_page: hw_protect failed - rc = %d, pmap = %p, va = %016llX\n", ret, pmap, (addr64_t)va); - - } - - if (!((getPerProc()->spcFlags) & FamVMmode)) { - act->machine.vmmControl->vmmLastMap = va & 0xFFFFFFFFFFFFF000ULL; /* Remember the last mapping we made */ - act->machine.vmmControl->vmmGFlags = (act->machine.vmmControl->vmmGFlags & ~vmmLastAdSp) | index; /* Remember last address space */ - } - - return KERN_SUCCESS; /* Return */ -} - - -/*----------------------------------------------------------------------- -** vmm_protect_execute -** -** This function sets the protection bits of a mapped page -** and then directly starts executing. -** -** See description of vmm_protect_page for details -** -** Inputs: -** See vmm_protect_page and vmm_map_execute -** -** Outputs: -** Normal exit is to run the VM. Abnormal exit is triggered via a -** non-KERN_SUCCESS return from vmm_map_page or later during the -** attempt to transition into the VM. 
------------------------------------------------------------------------*/ - -vmm_return_code_t vmm_protect_execute( - thread_t act, - vmm_thread_index_t index, - addr64_t va, - vm_prot_t prot) -{ - kern_return_t ret; - vmmCntrlEntry *CEntry; - unsigned int adsp; - vmm_thread_index_t cndx; - - cndx = index & 0xFF; /* Clean it up */ - CEntry = vmm_get_entry(act, cndx); /* Get and validate the index */ - if (CEntry == NULL) return kVmmBogusContext; /* Return bogus context */ - - adsp = (index >> 8) & 0xFF; /* Get any requested address space */ - if(!adsp) adsp = (index & 0xFF); /* If 0, use context ID as address space ID */ - - if (((getPerProc()->spcFlags) & FamVMmode) && (CEntry != act->machine.vmmCEntry)) - return kVmmBogusContext; /* Yes, invalid index in Fam */ - - ret = vmm_protect_page(act, adsp, va, prot); /* Go try to change access */ - - if(ret == KERN_SUCCESS) { - act->machine.vmmControl->vmmLastMap = va & 0xFFFFFFFFFFFFF000ULL; /* Remember the last mapping we made */ - act->machine.vmmControl->vmmGFlags = (act->machine.vmmControl->vmmGFlags & ~vmmLastAdSp) | cndx; /* Remember last address space */ - vmm_execute_vm(act, cndx); /* Return was ok, launch the VM */ - } - - return ret; /* We had trouble of some kind (shouldn't happen) */ - -} - - -/*----------------------------------------------------------------------- -** vmm_get_float_state -** -** This function causes the current floating point state to -** be saved into the shared context area. It also clears the -** vmmFloatCngd changed flag. 
-** -** Inputs: -** act - pointer to current thread activation structure -** index - index returned by vmm_init_context -** -** Outputs: -** context saved ------------------------------------------------------------------------*/ - -kern_return_t vmm_get_float_state( - thread_t act, - vmm_thread_index_t index) -{ - vmmCntrlEntry *CEntry; - int i; - register struct savearea_fpu *sv; - - CEntry = vmm_get_entry(act, index); /* Convert index to entry */ - if (CEntry == NULL) return KERN_FAILURE; /* Either this isn't vmm thread or the index is bogus */ - - act->machine.specFlags &= ~floatCng; /* Clear the special flag */ - CEntry->vmmContextKern->vmmStat &= ~vmmFloatCngd; /* Clear the change indication */ - - fpu_save(&CEntry->vmmFacCtx); /* Save context if live */ - - if(CEntry->vmmFacCtx.FPUsave) { /* Is there context yet? */ - sv = CEntry->vmmFacCtx.FPUsave; - bcopy((char *)&sv->save_fp0, (char *)&(CEntry->vmmContextKern->vmm_proc_state.ppcFPRs), 32 * 8); /* 32 registers */ - return KERN_SUCCESS; - } - - - for(i = 0; i < 32; i++) { /* Initialize floating points */ - CEntry->vmmContextKern->vmm_proc_state.ppcFPRs[i].d = FloatInit; /* Initial value */ - } - - return KERN_SUCCESS; -} - -/*----------------------------------------------------------------------- -** vmm_get_vector_state -** -** This function causes the current vector state to -** be saved into the shared context area. It also clears the -** vmmVectorCngd changed flag. 
-** -** Inputs: -** act - pointer to current thread activation structure -** index - index returned by vmm_init_context -** -** Outputs: -** context saved ------------------------------------------------------------------------*/ - -kern_return_t vmm_get_vector_state( - thread_t act, - vmm_thread_index_t index) -{ - vmmCntrlEntry *CEntry; - int i, j; - unsigned int vrvalidwrk; - register struct savearea_vec *sv; - - CEntry = vmm_get_entry(act, index); /* Convert index to entry */ - if (CEntry == NULL) return KERN_FAILURE; /* Either this isn't vmm thread or the index is bogus */ - - vec_save(&CEntry->vmmFacCtx); /* Save context if live */ - - act->machine.specFlags &= ~vectorCng; /* Clear the special flag */ - CEntry->vmmContextKern->vmmStat &= ~vmmVectCngd; /* Clear the change indication */ - - if(CEntry->vmmFacCtx.VMXsave) { /* Is there context yet? */ - sv = CEntry->vmmFacCtx.VMXsave; - vrvalidwrk = sv->save_vrvalid; /* Get the valid flags */ - - for(i = 0; i < 32; i++) { /* Copy the saved registers and invalidate the others */ - if(vrvalidwrk & 0x80000000) { /* Do we have a valid value here? 
*/ - for(j = 0; j < 4; j++) { /* If so, copy it over */ - CEntry->vmmContextKern->vmm_proc_state.ppcVRs[i].i[j] = ((unsigned int *)&(sv->save_vr0))[(i * 4) + j]; - } - } - else { - for(j = 0; j < 4; j++) { /* Otherwise set to empty value */ - CEntry->vmmContextKern->vmm_proc_state.ppcVRs[i].i[j] = QNaNbarbarian[j]; - } - } - - vrvalidwrk = vrvalidwrk << 1; /* Shift over to the next */ - - } - - return KERN_SUCCESS; - } - - for(i = 0; i < 32; i++) { /* Initialize vector registers */ - for(j=0; j < 4; j++) { /* Do words */ - CEntry->vmmContextKern->vmm_proc_state.ppcVRs[i].i[j] = QNaNbarbarian[j]; /* Initial value */ - } - } - - return KERN_SUCCESS; -} - -/*----------------------------------------------------------------------- -** vmm_set_timer -** -** This function causes a timer (in AbsoluteTime) for a specific time -** to be set It also clears the vmmTimerPop flag if the timer is actually -** set, it is cleared otherwise. -** -** A timer is cleared by setting setting the time to 0. This will clear -** the vmmTimerPop bit. Simply setting the timer to earlier than the -** current time clears the internal timer request, but leaves the -** vmmTimerPop flag set. 
-** -** -** Inputs: -** act - pointer to current thread activation structure -** index - index returned by vmm_init_context -** timerhi - high order word of AbsoluteTime to pop -** timerlo - low order word of AbsoluteTime to pop -** -** Outputs: -** timer set, vmmTimerPop cleared ------------------------------------------------------------------------*/ - -kern_return_t vmm_set_timer( - thread_t act, - vmm_thread_index_t index, - unsigned int timerhi, - unsigned int timerlo) -{ - vmmCntrlEntry *CEntry; - - CEntry = vmm_get_entry(act, index); /* Convert index to entry */ - if (CEntry == NULL) return KERN_FAILURE; /* Either this isn't vmm thread or the index is bogus */ - - CEntry->vmmTimer = ((uint64_t)timerhi << 32) | timerlo; - - vmm_timer_pop(act); /* Go adjust all of the timer stuff */ - return KERN_SUCCESS; /* Leave now... */ -} - - -/*----------------------------------------------------------------------- -** vmm_get_timer -** -** This function causes the timer for a specified VM to be -** returned in return_params[0] and return_params[1]. -** Note that this is kind of funky for 64-bit VMs because we -** split the timer into two parts so that we still set parms 0 and 1. -** Obviously, we don't need to do this because the parms are 8 bytes -** wide. -** -** -** Inputs: -** act - pointer to current thread activation structure -** index - index returned by vmm_init_context -** -** Outputs: -** Timer value set in return_params[0] and return_params[1]. -** Set to 0 if timer is not set. ------------------------------------------------------------------------*/ - -kern_return_t vmm_get_timer( - thread_t act, - vmm_thread_index_t index) -{ - vmmCntrlEntry *CEntry; - - CEntry = vmm_get_entry(act, index); /* Convert index to entry */ - if (CEntry == NULL) return KERN_FAILURE; /* Either this isn't vmm thread or the index is bogus */ - - if(CEntry->vmmXAFlgs & vmm64Bit) { /* A 64-bit virtual machine? 
*/ - CEntry->vmmContextKern->vmmRet.vmmrp64.return_params[0] = (uint32_t)(CEntry->vmmTimer >> 32); /* Return the last timer value */ - CEntry->vmmContextKern->vmmRet.vmmrp64.return_params[1] = (uint32_t)CEntry->vmmTimer; /* Return the last timer value */ - } - else { - CEntry->vmmContextKern->vmmRet.vmmrp32.return_params[0] = (CEntry->vmmTimer >> 32); /* Return the last timer value */ - CEntry->vmmContextKern->vmmRet.vmmrp32.return_params[1] = (uint32_t)CEntry->vmmTimer; /* Return the last timer value */ - } - return KERN_SUCCESS; -} - - -/*----------------------------------------------------------------------- -** vmm_timer_pop -** -** This function causes all timers in the array of VMs to be updated. -** All appropriate flags are set or reset. If a VM is currently -** running and its timer expired, it is intercepted. -** -** The qactTimer value is set to the lowest unexpired timer. It is -** zeroed if all timers are expired or have been reset. -** -** Inputs: -** act - pointer to current thread activation structure -** -** Outputs: -** timers set, vmmTimerPop cleared or set ------------------------------------------------------------------------*/ - -void vmm_timer_pop( - thread_t act) -{ - vmmCntrlTable *CTable; - int cvi, any; - uint64_t now, soonest; - struct savearea *sv; - - if(!((unsigned int)act->machine.vmmControl & 0xFFFFFFFE)) { /* Are there any virtual machines? */ - panic("vmm_timer_pop: No virtual machines defined; act = %p\n", act); - } - - soonest = 0xFFFFFFFFFFFFFFFFULL; /* Max time */ - - clock_get_uptime(&now); /* What time is it? */ - - CTable = act->machine.vmmControl; /* Make this easier */ - any = 0; /* Haven't found a running unexpired timer yet */ - - for(cvi = 0; cvi < kVmmMaxContexts; cvi++) { /* Cycle through all and check time now */ - - if(!(CTable->vmmc[cvi].vmmFlags & vmmInUse)) continue; /* Do not check if the entry is empty */ - - if(CTable->vmmc[cvi].vmmTimer == 0) { /* Is the timer reset? 
*/ - CTable->vmmc[cvi].vmmFlags &= ~vmmTimerPop; /* Clear timer popped */ - CTable->vmmc[cvi].vmmContextKern->vmmStat &= ~vmmTimerPop; /* Clear timer popped */ - continue; /* Check next */ - } - - if (CTable->vmmc[cvi].vmmTimer <= now) { - CTable->vmmc[cvi].vmmFlags |= vmmTimerPop; /* Set timer popped here */ - CTable->vmmc[cvi].vmmContextKern->vmmStat |= vmmTimerPop; /* Set timer popped here */ - if((unsigned int)&CTable->vmmc[cvi] == (unsigned int)act->machine.vmmCEntry) { /* Is this the running VM? */ - sv = find_user_regs(act); /* Get the user state registers */ - if(!sv) { /* Did we find something? */ - panic("vmm_timer_pop: no user context; act = %p\n", act); - } - sv->save_exception = kVmmReturnNull*4; /* Indicate that this is a null exception */ - vmm_force_exit(act, sv); /* Intercept a running VM */ - } - continue; /* Check the rest */ - } - else { /* It hasn't popped yet */ - CTable->vmmc[cvi].vmmFlags &= ~vmmTimerPop; /* Set timer not popped here */ - CTable->vmmc[cvi].vmmContextKern->vmmStat &= ~vmmTimerPop; /* Set timer not popped here */ - } - - any = 1; /* Show we found an active unexpired timer */ - - if (CTable->vmmc[cvi].vmmTimer < soonest) - soonest = CTable->vmmc[cvi].vmmTimer; - } - - if(any) { - if (act->machine.qactTimer == 0 || soonest <= act->machine.qactTimer) - act->machine.qactTimer = soonest; /* Set lowest timer */ - } -} - - - -/*----------------------------------------------------------------------- -** vmm_stop_vm -** -** This function prevents the specified VM(s) to from running. -** If any is currently executing, the execution is intercepted -** with a code of kVmmStopped. Note that execution of the VM is -** blocked until a vmmExecuteVM is called with the start flag set to 1. -** This provides the ability for a thread to stop execution of a VM and -** insure that it will not be run until the emulator has processed the -** "virtual" interruption. 
-** -** Inputs: -** vmmask - 32 bit mask corresponding to the VMs to put in stop state -** NOTE: if this mask is all 0s, any executing VM is intercepted with -* a kVmmStopped (but not marked stopped), otherwise this is a no-op. Also note that there -** note that there is a potential race here and the VM may not stop. -** -** Outputs: -** kernel return code indicating success -** or if no VMs are enabled, an invalid syscall exception. ------------------------------------------------------------------------*/ - -int vmm_stop_vm(struct savearea *save) -{ - - thread_t act; - vmmCntrlTable *CTable; - int cvi, i; - task_t task; - thread_t fact; - unsigned int vmmask; - ReturnHandler *stopapc; - - ml_set_interrupts_enabled(TRUE); /* This can take a bit of time so pass interruptions */ - - task = current_task(); /* Figure out who we are */ - - task_lock(task); /* Lock our task */ - - fact = (thread_t)task->threads.next; /* Get the first activation on task */ - act = NULL; /* Pretend we didn't find it yet */ - - for(i = 0; i < task->thread_count; i++) { /* All of the activations */ - if(fact->machine.vmmControl) { /* Is this a virtual machine monitor? */ - act = fact; /* Yeah... */ - break; /* Bail the loop... */ - } - fact = (thread_t)fact->task_threads.next; /* Go to the next one */ - } - - if(!((unsigned int)act)) { /* See if we have VMMs yet */ - task_unlock(task); /* No, unlock the task */ - ml_set_interrupts_enabled(FALSE); /* Set back interruptions */ - return 0; /* Go generate a syscall exception */ - } - - thread_reference(act); - - task_unlock(task); /* Safe to release now */ - - thread_mtx_lock(act); - - CTable = act->machine.vmmControl; /* Get the pointer to the table */ - - if(!((unsigned int)CTable & -2)) { /* Are there any all the way up yet? 
*/ - thread_mtx_unlock(act); /* Unlock the activation */ - thread_deallocate(act); - ml_set_interrupts_enabled(FALSE); /* Set back interruptions */ - return 0; /* Go generate a syscall exception */ - } - - if(!(vmmask = save->save_r3)) { /* Get the stop mask and check if all zeros */ - thread_mtx_unlock(act); /* Unlock the activation */ - thread_deallocate(act); - ml_set_interrupts_enabled(FALSE); /* Set back interruptions */ - save->save_r3 = KERN_SUCCESS; /* Set success */ - return 1; /* Return... */ - } - - for(cvi = 0; cvi < kVmmMaxContexts; cvi++) { /* Search slots */ - if((0x80000000 & vmmask) && (CTable->vmmc[cvi].vmmFlags & vmmInUse)) { /* See if we need to stop and if it is in use */ - hw_atomic_or_noret(&CTable->vmmc[cvi].vmmFlags, vmmXStop); /* Set this one to stop */ - } - vmmask = vmmask << 1; /* Slide mask over */ - } - - if(hw_compare_and_store(0, 1, &act->machine.emPendRupts)) { /* See if there is already a stop pending and lock out others if not */ - thread_mtx_unlock(act); /* Already one pending, unlock the activation */ - thread_deallocate(act); - ml_set_interrupts_enabled(FALSE); /* Set back interruptions */ - save->save_r3 = KERN_SUCCESS; /* Say we did it... */ - return 1; /* Leave */ - } - - if(!(stopapc = (ReturnHandler *)kalloc(sizeof(ReturnHandler)))) { /* Get a return handler control block */ - act->machine.emPendRupts = 0; /* No memory, say we have given up request */ - thread_mtx_unlock(act); /* Unlock the activation */ - thread_deallocate(act); - ml_set_interrupts_enabled(FALSE); /* Set back interruptions */ - save->save_r3 = KERN_RESOURCE_SHORTAGE; /* No storage... */ - return 1; /* Return... 
*/ - } - - ml_set_interrupts_enabled(FALSE); /* Disable interruptions for now */ - - stopapc->handler = vmm_interrupt; /* Set interruption routine */ - - stopapc->next = act->handlers; /* Put our interrupt at the start of the list */ - act->handlers = stopapc; /* Point to us */ - - act_set_apc(act); /* Set an APC AST */ - ml_set_interrupts_enabled(TRUE); /* Enable interruptions now */ - - thread_mtx_unlock(act); /* Unlock the activation */ - thread_deallocate(act); - - ml_set_interrupts_enabled(FALSE); /* Set back interruptions */ - save->save_r3 = KERN_SUCCESS; /* Hip, hip, horay... */ - return 1; -} - -/*----------------------------------------------------------------------- -** vmm_interrupt -** -** This function is executed asynchronously from an APC AST. -** It is to be used for anything that needs to interrupt a running VM. -** This include any kind of interruption generation (other than timer pop) -** or entering the stopped state. -** -** Inputs: -** ReturnHandler *rh - the return handler control block as required by the APC. -** thread_t act - the activation -** -** Outputs: -** Whatever needed to be done is done. ------------------------------------------------------------------------*/ - -void vmm_interrupt(ReturnHandler *rh, thread_t act) { - - vmmCntrlTable *CTable; - struct savearea *sv; - boolean_t inter; - - - - kfree(rh, sizeof(ReturnHandler)); /* Release the return handler block */ - - inter = ml_set_interrupts_enabled(FALSE); /* Disable interruptions for now */ - - act->machine.emPendRupts = 0; /* Say that there are no more interrupts pending */ - CTable = act->machine.vmmControl; /* Get the pointer to the table */ - - if(!((unsigned int)CTable & -2)) return; /* Leave if we aren't doing VMs any more... */ - - if(act->machine.vmmCEntry && (act->machine.vmmCEntry->vmmFlags & vmmXStop)) { /* Do we need to stop the running guy? */ - sv = find_user_regs(act); /* Get the user state registers */ - if(!sv) { /* Did we find something? 
*/ - panic("vmm_interrupt: no user context; act = %p\n", act); - } - sv->save_exception = kVmmStopped*4; /* Set a "stopped" exception */ - vmm_force_exit(act, sv); /* Intercept a running VM */ - } - ml_set_interrupts_enabled(inter); /* Put interrupts back to what they were */ -} diff --git a/osfmk/ppc/vmachmon.h b/osfmk/ppc/vmachmon.h deleted file mode 100644 index 91626cfa2..000000000 --- a/osfmk/ppc/vmachmon.h +++ /dev/null @@ -1,498 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/*----------------------------------------------------------------------- -** vmachmon.h -** -** C routines that we are adding to the MacOS X kernel. 
-** ------------------------------------------------------------------------*/ - -#include - -#ifndef _VEMULATION_H_ -#define _VEMULATION_H_ - -/************************************************************************************* - External Emulation Types -**************************************************************************************/ - -typedef union vmm_vector_register_t { - unsigned long i[4]; - unsigned short s[8]; - unsigned char b[16]; -} vmm_vector_register_t; - -typedef union vmm_fp_register_t { - double d; - unsigned long i[2]; - unsigned short s[4]; - unsigned char b[8]; -} vmm_fp_register_t; - - -typedef struct vmm_regs32_t { - - unsigned long ppcPC; /* 000 */ - unsigned long ppcMSR; /* 004 */ - - unsigned long ppcGPRs[32]; /* 008 */ - - unsigned long ppcCR; /* 088 */ - unsigned long ppcXER; /* 08C */ - unsigned long ppcLR; /* 090 */ - unsigned long ppcCTR; /* 094 */ - unsigned long ppcMQ; /* 098 - Obsolete */ - unsigned long ppcVRSave; /* 09C */ - unsigned long ppcRsrvd0A0[40]; /* 0A0 */ - /* 140 */ -} vmm_regs32_t; - -#pragma pack(4) /* Make sure the structure stays as we defined it */ -typedef struct vmm_regs64_t { - - unsigned long long ppcPC; /* 000 */ - unsigned long long ppcMSR; /* 008 */ - - unsigned long long ppcGPRs[32]; /* 010 */ - - unsigned long long ppcXER; /* 110 */ - unsigned long long ppcLR; /* 118 */ - unsigned long long ppcCTR; /* 120 */ - unsigned long ppcCR; /* 128 */ - unsigned long ppcVRSave; /* 12C */ - unsigned long ppcRsvd130[4]; /* 130 */ - /* 140 */ -} vmm_regs64_t; -#pragma pack() - - -#pragma pack(4) /* Make sure the structure stays as we defined it */ -typedef union vmm_regs_t { - vmm_regs32_t ppcRegs32; - vmm_regs64_t ppcRegs64; -} vmm_regs_t; -#pragma pack() - -#pragma pack(4) /* Make sure the structure stays as we defined it */ -typedef struct vmm_processor_state_t { - /* 32-byte bndry */ - vmm_regs_t ppcRegs; /* Define registers areas */ - -/* We must be 16-byte aligned here */ - - vmm_vector_register_t 
ppcVRs[32]; /* These are only valid after a kVmmGetVectorState */ - vmm_vector_register_t ppcVSCR; /* This is always loaded/saved at host/guest transition */ - -/* We must be 8-byte aligned here */ - - vmm_fp_register_t ppcFPRs[32]; /* These are only valid after a kVmmGetFloatState */ - vmm_fp_register_t ppcFPSCR; /* This is always loaded/saved at host/guest transition */ - unsigned long ppcReserved2[2]; /* Pad out to multiple of 16 bytes */ -} vmm_processor_state_t; -#pragma pack() - -typedef unsigned long vmm_return_code_t; - -typedef unsigned long vmm_thread_index_t; -#define vmmTInum 0x000000FF -#define vmmTIadsp 0x0000FF00 -typedef unsigned long vmm_adsp_id_t; - -enum { - kVmmCurMajorVersion = 0x0001, - kVmmCurMinorVersion = 0x0007, - kVmmMinMajorVersion = 0x0001, -}; -#define kVmmCurrentVersion ((kVmmCurMajorVersion << 16) | kVmmCurMinorVersion) - -typedef unsigned long vmm_features_t; -enum { - kVmmFeature_LittleEndian = 0x00000001, - kVmmFeature_Stop = 0x00000002, - kVmmFeature_ExtendedMapping = 0x00000004, - kVmmFeature_ListMapping = 0x00000008, - kVmmFeature_FastAssist = 0x00000010, - kVmmFeature_XA = 0x00000020, - kVmmFeature_SixtyFourBit = 0x00000040, - kVmmFeature_MultAddrSpace = 0x00000080, - kVmmFeature_GuestShadowAssist = 0x00000100, /* Guest->physical shadow hash table */ - kVmmFeature_GlobalMappingAssist = 0x00000200, /* Global shadow mapping support */ - kVmmFeature_HostShadowAssist = 0x00000400, /* Linear shadow mapping of an area of - host virtual as guest physical */ - kVmmFeature_MultAddrSpaceAssist = 0x00000800, /* Expanded pool of guest virtual - address spaces */ -}; -#define kVmmCurrentFeatures (kVmmFeature_LittleEndian | kVmmFeature_Stop | kVmmFeature_ExtendedMapping \ - | kVmmFeature_ListMapping | kVmmFeature_FastAssist | kVmmFeature_XA \ - | kVmmFeature_GuestShadowAssist) - -enum { - vmm64Bit = 0x80000000, /* Make guest 64-bit */ - vmmGSA = 0x40000000, /* Enable guest shadow assist (GSA) */ - vmmGMA = 0x20000000, /* Enable global 
shadow mapping assist (GMA) */ -}; - -#define kVmmSupportedSetXA (vmm64Bit | vmmGSA | vmmGMA) - -typedef unsigned long vmm_version_t; - -typedef struct vmm_ret_parms32_t { - unsigned long return_params[4]; -} vmm_ret_parms32_t; - -typedef struct vmm_ret_parms64_t { - unsigned long long return_params[4]; -} vmm_ret_parms64_t; - -#pragma pack(4) /* Make sure the structure stays as we defined it */ -typedef union vmm_ret_parms_t { - vmm_ret_parms64_t vmmrp64; /* 64-bit flavor */ - vmm_ret_parms32_t vmmrp32; /* 32-bit flavor */ - unsigned int retgas[11]; /* Force this to be 11 words long */ -} vmm_ret_parms_t; -#pragma pack() - -#pragma pack(4) /* Make sure the structure stays as we defined it */ -typedef struct vmm_fastassist_state32_t { - unsigned long fastassist_dispatch; - unsigned long fastassist_refcon; - - unsigned long fastassist_dispatch_code; - unsigned long fastassist_parameter[5]; - - unsigned long guest_register[8]; - - unsigned long guest_pc; - unsigned long guest_msr; - - unsigned long fastassist_intercepts; - unsigned long fastassist_reserved1; -} vmm_fastassist_state32_t; - -typedef struct vmm_fastassist_state64_t { - unsigned long long fastassist_dispatch; - unsigned long long fastassist_refcon; - - unsigned long long fastassist_dispatch_code; - unsigned long long fastassist_parameter[5]; - - unsigned long long guest_register[8]; - - unsigned long long guest_pc; - unsigned long long guest_msr; - - unsigned long fastassist_intercepts; - unsigned long fastassist_reserved1; -} vmm_fastassist_state64_t; - -typedef union vmm_fastassist_state_t { - vmm_fastassist_state64_t vmmfs64; /* 64-bit flavor */ - vmm_fastassist_state32_t vmmfs32; /* 32-bit flavor */ -} vmm_fastassist_state_t; -#pragma pack() - -#pragma pack(4) /* Make sure the structure stays as we defined it */ -typedef struct vmm_state_page_t { - /* This structure must remain below 4Kb (one page) in size */ - vmm_version_t interface_version; - vmm_thread_index_t thread_index; - unsigned int 
vmmStat; /* Note: this field is identical to vmmFlags in vmmCntrlEntry */ - unsigned int vmmCntrl; -#define vmmFloatLoad 0x80000000 -#define vmmFloatLoadb 0 -#define vmmVectLoad 0x40000000 -#define vmmVectLoadb 1 -#define vmmVectVRall 0x20000000 -#define vmmVectVRallb 2 -#define vmmVectVAss 0x10000000 -#define vmmVectVAssb 3 -#define vmmXStart 0x08000000 -#define vmmXStartb 4 -#define vmmKey 0x04000000 -#define vmmKeyb 5 -#define vmmFamEna 0x02000000 -#define vmmFamEnab 6 -#define vmmFamSet 0x01000000 -#define vmmFamSetb 7 - - vmm_return_code_t return_code; - vmm_ret_parms_t vmmRet; - - /* The next portion of the structure must remain 32-byte aligned */ - vmm_processor_state_t vmm_proc_state; - - /* The next portion of the structure must remain 16-byte aligned */ - vmm_fastassist_state_t vmm_fastassist_state; - -} vmm_state_page_t; -#pragma pack() - -#pragma pack(4) /* Make sure the structure stays as we defined it */ -typedef struct vmm_comm_page_t { - union { - vmm_state_page_t vmcpState; /* Reserve area for state */ - unsigned int vmcpPad[768]; /* Reserve space for 3/4 page state area */ - } vmcpfirst; - unsigned int vmcpComm[256]; /* Define last 1024 bytes as a communications area - function specific */ -} vmm_comm_page_t; -#pragma pack() - -enum { - /* Function Indices (passed in r3) */ - kVmmGetVersion = 0, /* Get VMM system version */ - kVmmvGetFeatures, /* Get VMM supported features */ - kVmmInitContext, /* Initialize a context */ - kVmmTearDownContext, /* Destroy a context */ - kVmmTearDownAll, /* Destory all contexts */ - kVmmMapPage, /* Map a host to guest address space */ - kVmmGetPageMapping, /* Get host address of a guest page */ - kVmmUnmapPage, /* Unmap a guest page */ - kVmmUnmapAllPages, /* Unmap all pages in a guest address space */ - kVmmGetPageDirtyFlag, /* Check if guest page modified */ - kVmmGetFloatState, /* Retrieve guest floating point context */ - kVmmGetVectorState, /* Retrieve guest vector context */ - kVmmSetTimer, /* Set a guest 
timer */ - kVmmGetTimer, /* Get a guest timer */ - kVmmExecuteVM, /* Launch a guest */ - kVmmProtectPage, /* Set protection attributes for a guest page */ - kVmmMapExecute, /* Map guest page and launch */ - kVmmProtectExecute, /* Set prot attributes and launch */ - kVmmMapList, /* Map a list of pages into guest address spaces */ - kVmmUnmapList, /* Unmap a list of pages from guest address spaces */ - kvmmExitToHost, /* Exit from FAM to host -- fast-path syscall */ - kvmmResumeGuest, /* Resume guest from FAM -- fast-path syscall */ - kvmmGetGuestRegister, /* Get guest register from FAM -- fast-path syscall */ - kvmmSetGuestRegister, /* Set guest register from FAM -- fast-path syscall */ - - kVmmActivateXA, /* Activate extended architecture features for a VM */ - kVmmDeactivateXA, /* Deactivate extended architecture features for a VM */ - kVmmGetXA, /* Get extended architecture features from a VM */ - - kVmmMapPage64, /* Map a host to guest address space - supports 64-bit */ - kVmmGetPageMapping64, /* Get host address of a guest page - supports 64-bit */ - kVmmUnmapPage64, /* Unmap a guest page - supports 64-bit */ - kVmmGetPageDirtyFlag64, /* Check if guest page modified - supports 64-bit */ - kVmmProtectPage64, /* Set protection attributes for a guest page - supports 64-bit */ - kVmmMapExecute64, /* Map guest page and launch - supports 64-bit */ - kVmmProtectExecute64, /* Set prot attributes and launch - supports 64-bit */ - kVmmMapList64, /* Map a list of pages into guest address spaces - supports 64-bit */ - kVmmUnmapList64, /* Unmap a list of pages from guest address spaces - supports 64-bit */ - kVmmMaxAddr, /* Returns the maximum virtual address that is mappable */ - - kVmmSetGuestMemory, /* Sets base and extent of guest physical memory in host address space */ - kVmmPurgeLocal, /* Purges all non-global mappings for a given guest address space */ -}; - -#define kVmmReturnNull 0 -#define kVmmBogusContext 1 -#define kVmmStopped 2 -#define kVmmReturnDataPageFault 
3 -#define kVmmReturnInstrPageFault 4 -#define kVmmReturnAlignmentFault 6 -#define kVmmReturnProgramException 7 -#define kVmmReturnSystemCall 12 -#define kVmmReturnTraceException 13 -#define kVmmAltivecAssist 22 -#define kVmmInvalidAddress 0x1000 -#define kVmmInvalidAdSpace 0x1001 - -/* - * Notes on guest address spaces. - * - * Address spaces are loosely coupled to virtual machines. The default is for - * a guest with an index of 1 to use address space 1, 2 to use 2, etc. However, - * any guest may be launched using any address space and any address space may be the - * target for a map or unmap function. Note that the (un)map list functions may pass in - * an address space ID on a page-by-page basis. - * - * An address space is instantiated either explicitly by mapping something into it, or - * implicitly by launching a guest with it. - * - * An address space is destroyed explicitly by kVmmTearDownAll or kVmmUnmapAllPages. It is - * destroyed implicitly by kVmmTearDownContext. The latter is done in order to remain - * backwards compatible with the previous implementation, which does not have decoupled - * guests and address spaces. - * - * An address space supports the maximum virtual address supported by the processor. - * The 64-bit variant of the mapping functions can be used on non-64-bit machines. If an - * unmappable address (e.g., an address larger than 4GB-1 on a 32-bit machine) is requested, - * the operation fails with a kVmmInvalidAddress return code. - * - * Note that for 64-bit calls, both host and guest are specified at 64-bit values. - * - */ - - - - -/* - * Storage Extended Protection modes - * Notes: - * To keep compatibility, vmmKey and the PPC key have reversed meanings, - * i.e., vmmKey 0 is PPC key 1 and vice versa. 
- * - * vmmKey Notes - * Mode 0 1 - * - * kVmmProtNARW not accessible read/write VM_PROT_NONE (not settable via VM calls) - * kVmmProtRORW read only read/write - * kVmmProtRWRW read/write read/write VM_PROT_WRITE or (VM_PROT_WRITE | VM_PROT_READ) - * kVmmProtRORO read only read only VM_PROT_READ - - */ - -#define kVmmProtXtnd 0x00000008 -#define kVmmProtNARW (kVmmProtXtnd | 0x00000000) -#define kVmmProtRORW (kVmmProtXtnd | 0x00000001) -#define kVmmProtRWRW (kVmmProtXtnd | 0x00000002) -#define kVmmProtRORO (kVmmProtXtnd | 0x00000003) - -/* - * Map list formats - * The last 12 bits in the guest virtual address is used as flags as follows: - * 0x007 - for the map calls, this is the key to set - * 0x3F0 - for both map and unmap, this is the address space ID upon which to operate. - * Note that if 0, the address space ID from the function call is used instead. - */ - -typedef struct vmmMList { - unsigned int vmlva; /* Virtual address in host address space */ - unsigned int vmlava; /* Virtual address in guest address space */ -} vmmMList; - -typedef struct vmmMList64 { - unsigned long long vmlva; /* Virtual address in host address space */ - unsigned long long vmlava; /* Virtual address in guest address space */ -} vmmMList64; - -typedef struct vmmUMList { - unsigned int vmlava; /* Virtual address in guest address space */ -} vmmUMList; - -typedef struct vmmUMList64 { - unsigned long long vmlava; /* Virtual address in guest address space */ -} vmmUMList64; - -#define vmmlFlgs 0x00000FFF /* Flags passed in in vmlava low order 12 bits */ -#define vmmlProt 0x00000007 /* Protection flags for the page */ -#define vmmlAdID 0x000003F0 /* Guest address space ID - used only if non-zero */ -#define vmmlGlob 0x00000400 /* Mapping is global */ -#define vmmlRsvd 0x00000800 /* Reserved for future */ - -/************************************************************************************* - Internal Emulation Types 
-**************************************************************************************/ - -#define kVmmMaxContexts 32 -#define kVmmMaxUnmapPages 64 -#define kVmmMaxMapPages 64 - -#pragma pack(4) /* Make sure the structure stays as we defined it */ -typedef struct vmmCntrlEntry { /* Virtual Machine Monitor control table entry */ - unsigned int vmmFlags; /* Assorted control flags */ -#define vmmInUse 0x80000000 -#define vmmInUseb 0 -#define vmmFloatCngd 0x40000000 -#define vmmFloatCngdb 1 -#define vmmVectCngd 0x20000000 -#define vmmVectCngdb 2 -#define vmmTimerPop 0x10000000 -#define vmmTimerPopb 3 -#define vmmFAMmode 0x04000000 -#define vmmFAMmodeb 5 -#define vmmXStop 0x00800000 -#define vmmXStopb 8 -#define vmmSpfSave 0x000000FF -#define vmmSpfSaveb 24 - unsigned int vmmXAFlgs; /* Extended Architecture flags */ - vmm_state_page_t *vmmContextKern; /* Kernel address of context communications area */ - ppnum_t vmmContextPhys; /* Physical address of context communications area */ - vmm_state_page_t *vmmContextUser; /* User address of context communications area */ - facility_context vmmFacCtx; /* Header for vector and floating point contexts */ - pmap_t vmmPmap; /* Last dispatched pmap */ - uint64_t vmmTimer; /* Last set timer value. Zero means unset */ - unsigned int vmmFAMintercept; /* FAM intercepted exceptions */ -} vmmCntrlEntry; -#pragma pack() - -#pragma pack(4) /* Make sure the structure stays as we defined it */ -typedef struct vmmCntrlTable { /* Virtual Machine Monitor Control table */ - unsigned int vmmGFlags; /* Global flags */ -#define vmmLastAdSp 0xFF /* Remember the address space that was mapped last */ - addr64_t vmmLastMap; /* Last vaddr mapping made */ - vmmCntrlEntry vmmc[kVmmMaxContexts]; /* One entry for each possible Virtual Machine Monitor context */ - pmap_t vmmAdsp[kVmmMaxContexts]; /* Guest address space pmaps */ -} vmmCntrlTable; -#pragma pack() - -/* function decls for kernel level routines... 
*/ -extern void vmm_execute_vm(thread_t act, vmm_thread_index_t index); -extern kern_return_t vmm_tear_down_context(thread_t act, vmm_thread_index_t index); -extern kern_return_t vmm_get_float_state(thread_t act, vmm_thread_index_t index); -extern kern_return_t vmm_get_vector_state(thread_t act, vmm_thread_index_t index); -extern kern_return_t vmm_set_timer(thread_t act, vmm_thread_index_t index, unsigned int timerhi, unsigned int timerlo); -extern kern_return_t vmm_get_timer(thread_t act, vmm_thread_index_t index); -extern void vmm_tear_down_all(thread_t act); -extern kern_return_t vmm_map_page(thread_t act, vmm_thread_index_t hindex, addr64_t cva, - addr64_t ava, vm_prot_t prot); -extern vmm_return_code_t vmm_map_execute(thread_t act, vmm_thread_index_t hindex, addr64_t cva, - addr64_t ava, vm_prot_t prot); -extern kern_return_t vmm_protect_page(thread_t act, vmm_thread_index_t hindex, addr64_t va, - vm_prot_t prot); -extern vmm_return_code_t vmm_protect_execute(thread_t act, vmm_thread_index_t hindex, addr64_t va, - vm_prot_t prot); -extern addr64_t vmm_get_page_mapping(thread_t act, vmm_thread_index_t index, - addr64_t va); -extern kern_return_t vmm_unmap_page(thread_t act, vmm_thread_index_t index, addr64_t va); -extern void vmm_unmap_all_pages(thread_t act, vmm_thread_index_t index); -extern boolean_t vmm_get_page_dirty_flag(thread_t act, vmm_thread_index_t index, - addr64_t va, unsigned int reset); -extern kern_return_t vmm_activate_XA(thread_t act, vmm_thread_index_t index, unsigned int xaflags); -extern kern_return_t vmm_deactivate_XA(thread_t act, vmm_thread_index_t index, unsigned int xaflags); -extern unsigned int vmm_get_XA(thread_t act, vmm_thread_index_t index); -extern int vmm_get_features(struct savearea *); -extern int vmm_get_version(struct savearea *); -extern int vmm_init_context(struct savearea *); -extern int vmm_dispatch(struct savearea *); -extern int vmm_exit(thread_t act, struct savearea *); -extern void vmm_force_exit(thread_t act, 
struct savearea *); -extern int vmm_stop_vm(struct savearea *save); -extern void vmm_timer_pop(thread_t act); -extern void vmm_interrupt(ReturnHandler *rh, thread_t act); -extern kern_return_t vmm_map_list(thread_t act, vmm_thread_index_t index, unsigned int cnt, unsigned int flavor); -extern kern_return_t vmm_unmap_list(thread_t act, vmm_thread_index_t index, unsigned int cnt, unsigned int flavor); -extern vmm_return_code_t vmm_resume_guest(vmm_thread_index_t index, unsigned long pc, - unsigned long vmmCntrl, unsigned long vmmCntrMaskl); -extern vmm_return_code_t vmm_exit_to_host(vmm_thread_index_t index); -extern unsigned long vmm_get_guest_register(vmm_thread_index_t index, unsigned long reg_index); -extern vmm_return_code_t vmm_set_guest_register(vmm_thread_index_t index, unsigned long reg_index, unsigned long reg_value); -extern addr64_t vmm_max_addr(thread_t act); -extern kern_return_t vmm_set_guest_memory(thread_t act, vmm_thread_index_t index, addr64_t base, addr64_t extent); -extern kern_return_t vmm_purge_local(thread_t act, vmm_thread_index_t index); - -#endif diff --git a/osfmk/ppc/vmachmon_asm.s b/osfmk/ppc/vmachmon_asm.s deleted file mode 100644 index 3f8cac654..000000000 --- a/osfmk/ppc/vmachmon_asm.s +++ /dev/null @@ -1,2368 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. 
- * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#include -#include -#include -#include -#include - -/* - * This file contains implementations for the Virtual Machine Monitor - * facility. - */ - -#define vmmMapDone 31 -#define vmmDoing64 30 - - -/* - * int vmm_dispatch(savearea, act); - - * vmm_dispatch is a PPC only system call. It is used with a selector (first - * parameter) to determine what function to enter. This is treated as an extension - * of hw_exceptions. - * - * Inputs: - * R4 = current activation - * R16 = current thread - * R30 = current savearea - */ - - .align 5 ; Line up on cache line - .globl EXT(vmm_dispatch_table) - -LEXT(vmm_dispatch_table) - - /* Don't change the order of these routines in the table. It's */ - /* OK to add new routines, but they must be added at the bottom. 
*/ - - .long EXT(vmm_get_version_sel) ; Get the version of the VMM interface - .long 0 ; Not valid in Fam - .long EXT(vmm_get_features_sel) ; Get the features of the VMM interface - .long 0 ; Not valid in Fam - .long EXT(vmm_init_context_sel) ; Initializes a new VMM context - .long 0 ; Not valid in Fam - .long EXT(vmm_tear_down_context) ; Tears down a previously-allocated VMM context - .long 0 ; Not valid in Fam - .long EXT(vmm_tear_down_all) ; Tears down all VMMs - .long 0 ; Not valid in Fam - .long EXT(vmm_map_page32) ; Maps a page from the main address space into the VM space - supports 32-bit - .long 1 ; Valid in Fam - .long EXT(vmm_get_page_mapping32) ; Returns client va associated with VM va - supports 32-bit - .long 1 ; Valid in Fam - .long EXT(vmm_unmap_page32) ; Unmaps a page from the VM space - supports 32-bit - .long 1 ; Valid in Fam - .long EXT(vmm_unmap_all_pages) ; Unmaps all pages from the VM space - .long 1 ; Valid in Fam - .long EXT(vmm_get_page_dirty_flag32) ; Gets the change bit for a page and optionally clears it - supports 32-bit - .long 1 ; Valid in Fam - .long EXT(vmm_get_float_state) ; Gets current floating point state - .long 0 ; not valid in Fam - .long EXT(vmm_get_vector_state) ; Gets current vector state - .long 0 ; Not valid in Fam - .long EXT(vmm_set_timer) ; Sets a timer value - .long 1 ; Valid in Fam - .long EXT(vmm_get_timer) ; Gets a timer value - .long 1 ; Valid in Fam - .long EXT(switchIntoVM) ; Switches to the VM context - .long 1 ; Valid in Fam - .long EXT(vmm_protect_page32) ; Sets protection values for a page - supports 32-bit - .long 1 ; Valid in Fam - .long EXT(vmm_map_execute32) ; Maps a page an launches VM - supports 32-bit - .long 1 ; Not valid in Fam - .long EXT(vmm_protect_execute32) ; Sets protection values for a page and launches VM - supports 32-bit - .long 1 ; Valid in Fam - .long EXT(vmm_map_list32) ; Maps a list of pages - supports 32-bit - .long 1 ; Valid in Fam - .long EXT(vmm_unmap_list32) ; Unmaps a list of 
pages - supports 32-bit - .long 1 ; Valid in Fam - .long EXT(vmm_fam_reserved) ; exit from Fam to host - .long 1 ; Valid in Fam - .long EXT(vmm_fam_reserved) ; resume guest from Fam - .long 1 ; Valid in Fam - .long EXT(vmm_fam_reserved) ; get guest register from Fam - .long 1 ; Valid in Fam - .long EXT(vmm_fam_reserved) ; Set guest register from Fam - .long 1 ; Valid in Fam - .long EXT(vmm_activate_XA) ; Activate extended architecture features for a VM - .long 0 ; Not valid in Fam - .long EXT(vmm_deactivate_XA) ; Deactivate extended architecture features for a VM - .long 0 ; Not valid in Fam - .long EXT(vmm_get_XA) ; Get extended architecture features from a VM - .long 1 ; Valid in Fam - .long EXT(vmm_map_page) ; Map a host to guest address space - supports 64-bit - .long 1 ; Valid in Fam - .long EXT(vmm_get_page_mapping) ; Get host address of a guest page - supports 64-bit - .long 1 ; Valid in Fam - .long EXT(vmm_unmap_page) ; Unmap a guest page - supports 64-bit - .long 1 ; Valid in Fam - .long EXT(vmm_get_page_dirty_flag) ; Check if guest page modified - supports 64-bit - .long 1 ; Valid in Fam - .long EXT(vmm_protect_page) ; Sets protection values for a page - supports 64-bit - .long 1 ; Valid in Fam - .long EXT(vmm_map_execute) ; Map guest page and launch - supports 64-bit - .long 1 ; Valid in Fam - .long EXT(vmm_protect_execute) ; Set prot attributes and launch - supports 64-bit - .long 1 ; Valid in Fam - .long EXT(vmm_map_list64) ; Map a list of pages into guest address spaces - supports 64-bit - .long 1 ; Valid in Fam - .long EXT(vmm_unmap_list64) ; Unmap a list of pages from guest address spaces - supports 64-bit - .long 1 ; Valid in Fam - .long EXT(vmm_max_addr) ; Returns the maximum virtual address - .long 1 ; Valid in Fam -#if 0 - .long EXT(vmm_set_guest_memory) ; Set guest memory extent - .long 0 ; Not valid in FAM - .long EXT(vmm_purge_local) ; Purge all local guest mappings */ - .long 1 ; Valid in FAM -#endif - .set 
vmm_count,(.-EXT(vmm_dispatch_table))/8 ; Get the top number - - - .align 5 - .globl EXT(vmm_dispatch) - -LEXT(vmm_dispatch) - - lwz r11,saver3+4(r30) ; Get the selector - mr r3,r4 ; All of our functions want the activation as the first parm - lis r10,hi16(EXT(vmm_dispatch_table)) ; Get top half of table - cmplwi r11,kVmmExecuteVM ; Should we switch to the VM now? - cmplwi cr1,r11,vmm_count ; See if we have a valid selector - ori r10,r10,lo16(EXT(vmm_dispatch_table)) ; Get low half of table - lwz r4,saver4+4(r30) ; Get 1st parameter after selector - beq+ EXT(switchIntoVM) ; Yes, go switch to it.... - rlwinm r11,r11,3,0,28 ; Index into table - bge- cr1,vmmBogus ; It is a bogus entry - add r12,r10,r11 ; Get the vmm dispatch syscall entry - mfsprg r10,1 ; Get the current activation - lwz r10,ACT_PER_PROC(r10) ; Get the per_proc block - lwz r13,0(r12) ; Get address of routine - lwz r12,4(r12) ; Get validity flag - lwz r5,spcFlags(r10) ; Get per_proc special flags - cmpwi cr1,r12,0 ; Check Fam valid - rlwinm. r5,r5,0,FamVMmodebit,FamVMmodebit ; Test FamVMmodebit - crand cr0_eq,cr1_eq,cr0_gt ; In Fam and Invalid syscall - beq vmmBogus ; Intercept to host - lwz r5,saver5+4(r30) ; Get 2nd parameter after selector - note that some of these parameters may actually be long longs - lwz r6,saver6+4(r30) ; Get 3rd parameter after selector - mtlr r13 ; Set the routine address - lwz r7,saver7+4(r30) ; Get 4th parameter after selector - lwz r8,saver8+4(r30) ; Get 5th parameter after selector - lwz r9,saver9+4(r30) ; Get 6th parameter after selector -; -; NOTE: some of the above parameters are actually long longs. We have glue code that transforms -; all needed parameters and/or adds 32-/64-bit flavors to the needed functions. 
-; - - blrl ; Call function - -vmmRetPt: li r0,0 ; Clear this out - stw r0,saver3(r30) ; Make sure top of RC is clear - stw r3,saver3+4(r30) ; Pass back the return code - stw r0,saver4(r30) ; Make sure bottom of RC is clear (just in case) - stw r4,saver4+4(r30) ; Pass back the bottom return code (just in case) - li r3,1 ; Set normal return with check for AST - b EXT(ppcscret) ; Go back to handler... - -vmmBogus: - mfsprg r3,1 ; Get the current activation - lwz r10,ACT_PER_PROC(r3) ; Get the per_proc block - lwz r5,spcFlags(r10) ; Get per_proc special flags - rlwinm. r5,r5,0,FamVMmodebit,FamVMmodebit ; Test FamVMmodebit - bne vmmexitcall ; Do it to it - li r3,0 ; Bogus selector, treat like a bogus system call - b EXT(ppcscret) ; Go back to handler... - - - .align 5 - .globl EXT(vmm_get_version_sel) - -LEXT(vmm_get_version_sel) ; Selector based version of get version - - lis r3,hi16(EXT(vmm_get_version)) - ori r3,r3,lo16(EXT(vmm_get_version)) - b selcomm - - - .align 5 - .globl EXT(vmm_get_features_sel) - -LEXT(vmm_get_features_sel) ; Selector based version of get features - - lis r3,hi16(EXT(vmm_get_features)) - ori r3,r3,lo16(EXT(vmm_get_features)) - b selcomm - - - .align 5 - .globl EXT(vmm_init_context_sel) - -LEXT(vmm_init_context_sel) ; Selector based version of init context - - lwz r4,saver4+4(r30) ; Get the passed in version - lwz r5,saver5+4(r30) ; Get the passed in comm area - lis r3,hi16(EXT(vmm_init_context)) - stw r4,saver3+4(r30) ; Cheat and move this parameter over - ori r3,r3,lo16(EXT(vmm_init_context)) - stw r5,saver4+4(r30) ; Cheat and move this parameter over - -selcomm: mtlr r3 ; Set the real routine address - mr r3,r30 ; Pass in the savearea - blrl ; Call the function - b EXT(ppcscret) ; Go back to handler... 
- - .align 5 - .globl EXT(vmm_map_page32) - -LEXT(vmm_map_page32) - mr r9,r7 ; Move prot to correct parm - mr r8,r6 ; Move guest address to low half of long long - li r7,0 ; Clear high half of guest address - mr r6,r5 ; Move host address to low half of long long - li r5,0 ; Clear high half of host address - b EXT(vmm_map_page) ; Transition to real function... - - .align 5 - .globl EXT(vmm_get_page_mapping32) - -LEXT(vmm_get_page_mapping32) - mr r6,r5 ; Move guest address to low half of long long - li r5,0 ; Clear high half of guest address - bl EXT(vmm_get_page_mapping) ; Transition to real function... - mr r3,r4 ; Convert addr64_t to vm_offset_t, dropping top half - b vmmRetPt ; Join normal return... - - .align 5 - .globl EXT(vmm_unmap_page32) - -LEXT(vmm_unmap_page32) - mr r6,r5 ; Move guest address to low half of long long - li r5,0 ; Clear high half of guest address - b EXT(vmm_unmap_page) ; Transition to real function... - - .align 5 - .globl EXT(vmm_get_page_dirty_flag32) - -LEXT(vmm_get_page_dirty_flag32) - mr r7,r6 ; Move reset flag - mr r6,r5 ; Move guest address to low half of long long - li r5,0 ; Clear high half of guest address - b EXT(vmm_get_page_dirty_flag) ; Transition to real function... - - .align 5 - .globl EXT(vmm_protect_page32) - -LEXT(vmm_protect_page32) - mr r7,r6 ; Move protection bits - mr r6,r5 ; Move guest address to low half of long long - li r5,0 ; Clear high half of guest address - b EXT(vmm_protect_page) ; Transition to real function... - - .align 5 - .globl EXT(vmm_map_execute32) - -LEXT(vmm_map_execute32) - mr r9,r7 ; Move prot to correct parm - mr r8,r6 ; Move guest address to low half of long long - li r7,0 ; Clear high half of guest address - mr r6,r5 ; Move host address to low half of long long - li r5,0 ; Clear high half of host address - b EXT(vmm_map_execute) ; Transition to real function... 
- - .align 5 - .globl EXT(vmm_protect_execute32) - -LEXT(vmm_protect_execute32) - mr r7,r6 ; Move protection bits - mr r6,r5 ; Move guest address to low half of long long - li r5,0 ; Clear high half of guest address - b EXT(vmm_protect_execute) ; Transition to real function... - - .align 5 - .globl EXT(vmm_map_list32) - -LEXT(vmm_map_list32) - li r6,0 ; Set 32-bit flavor - b EXT(vmm_map_list) ; Go to common routine... - - .align 5 - .globl EXT(vmm_map_list64) - -LEXT(vmm_map_list64) - li r6,1 ; Set 64-bit flavor - b EXT(vmm_map_list) ; Go to common routine... - - .align 5 - .globl EXT(vmm_map_list32) - -LEXT(vmm_unmap_list32) - li r6,0 ; Set 32-bit flavor - b EXT(vmm_unmap_list) ; Go to common routine... - - .align 5 - .globl EXT(vmm_map_list64) - -LEXT(vmm_unmap_list64) - li r6,1 ; Set 64-bit flavor - b EXT(vmm_unmap_list) ; Go to common routine... - -/* - * Here is where we transition to the virtual machine. - * - * We will swap the register context in the savearea with that which is saved in our shared - * context area. We will validity check a bit and clear any nasty bits in the MSR and force - * the manditory ones on. - * - * Then we will setup the new address space to run with, and anything else that is normally part - * of a context switch. - * - * The vmm_execute_vm entry point is for the fused vmm_map_execute and vmm_protect_execute - * calls. This is called, but never returned from. We always go directly back to the - * user from here. - * - * - */ - - - .align 5 - .globl EXT(vmm_execute_vm) - -LEXT(vmm_execute_vm) - lwz r30,ACT_MACT_PCB(r3) ; Restore the savearea pointer because it could be trash here - b EXT(switchIntoVM) ; Join common... 
- - - .align 5 - .globl EXT(switchIntoVM) - -LEXT(switchIntoVM) - mfsprg r10,1 ; Get the current activation - lwz r10,ACT_PER_PROC(r10) ; Get the per_proc block - rlwinm r31,r4,24,24,31 ; Get the address space - rlwinm r4,r4,0,24,31 ; Isolate the context id - lwz r28,vmmControl(r3) ; Pick up the control table address - subi r4,r4,1 ; Switch to zero offset - rlwinm. r2,r28,0,0,30 ; Is there a context there? (Note: we will ignore bit 31 so that we - ; do not try this while we are transitioning off to on - cmplwi cr1,r4,kVmmMaxContexts ; Is the index valid? - beq- vmmBogus ; Not started, treat like a bogus system call - subic. r31,r31,1 ; Make address space 0 based and test if we use default - mulli r2,r4,vmmCEntrySize ; Get displacement from index - bge- cr1,swvmmBogus ; Index is bogus... - add r2,r2,r28 ; Point to the entry - bge-- swvmmDAdsp ; There was an explicit address space request - mr r31,r4 ; Default the address space to the context ID - -swvmmDAdsp: la r2,vmmc(r2) ; Get the offset to the context array - lwz r8,vmmGFlags(r28) ; Get the general flags - lwz r4,vmmFlags(r2) ; Get the flags for the selected entry - crset vmmMapDone ; Assume we will be mapping something - lwz r5,vmmContextKern(r2) ; Get the context area address - rlwinm. r26,r4,0,vmmInUseb,vmmInUseb ; See if the slot is in use - cmplwi cr1,r31,kVmmMaxContexts ; See if we have a valid address space ID - rlwinm r8,r8,0,24,31 ; Clean up address space - beq-- swvmmBogus ; This context is no good... - - la r26,vmmAdsp(r28) ; Point to the pmaps - sub r8,r8,r31 ; Get diff between launching address space - 1 and last mapped into (should be 1 if the same) - rlwinm r31,r31,2,0,29 ; Index to the pmap - cmplwi r8,1 ; See if we have the same address space - bge-- cr1,swvmmBogAdsp ; Address space is no good... 
- lwzx r31,r26,r31 ; Get the requested address space pmap - li r0,0 ; Get a 0 in case we need to trash redrive - lwz r15,spcFlags(r10) ; Get per_proc special flags - beq swvmmAdspOk ; Do not invalidate address space if we are launching the same - crclr vmmMapDone ; Clear map done flag - stb r0,vmmGFlags+3(r28) ; Clear the last mapped address space ID so we will not redrive later -; -; Here we check for any immediate intercepts. So far, the only -; two of these are a timer pop and and external stop. We will not dispatch if -; either is true. They need to either reset the timer (i.e. set timer -; to 0) or to set a future time, or if it is external stop, set the vmmXStopRst flag. -; - -swvmmAdspOk: - rlwinm. r0,r15,0,FamVMmodebit,FamVMmodebit ; Test FamVMmodebit - stw r31,vmmPmap(r2) ; Save the last dispatched address space - bne vmmFamGuestResume - lwz r6,vmmCntrl(r5) ; Get the control field - rlwinm. r7,r6,0,vmmXStartb,vmmXStartb ; Clear all but start bit - beq+ swvmChkStop ; Do not reset stop - andc r6,r6,r7 ; Clear it - li r8,vmmFlags ; Point to the flags - stw r6,vmmCntrl(r5) ; Set the control field - -swvmtryx: lwarx r4,r8,r2 ; Pick up the flags - rlwinm r4,r4,0,vmmXStopb+1,vmmXStopb-1 ; Clear the stop bit - stwcx. r4,r8,r2 ; Save the updated field - bne- swvmtryx ; Try again... - -swvmChkStop: - rlwinm. r26,r4,0,vmmXStopb,vmmXStopb ; Is this VM stopped? - bne-- swvmSetStop ; Yes... - - rlwinm. r26,r4,0,vmmTimerPopb,vmmTimerPopb ; Did the timer go pop? - cmplwi cr1,r31,0 ; Is there actually an address space defined? - bne-- svvmTimerPop ; Yes... - -; -; Special note: we need to intercept any attempt to launch a guest into a non-existent address space. -; We will just go emulate an ISI if there is not one. -; - - beq-- cr1,swvmEmulateISI ; We are trying to launch into an undefined address space. This is not so good... - -; -; Here is where we actually swap into the VM (alternate) context. 
-; We will bulk do a wholesale swap of the registers in the context area (the VMs) -; with the ones in the savearea (our main code). During the copy, we will fix up the -; MSR, forcing on a few bits and turning off a few others. Then we will deal with the -; PMAP and other per_proc stuff. Finally, we will exit back through the main exception -; handler to deal with unstacking saveareas and ASTs, etc. -; - -swvmDoSwitch: - -; -; First, we save the volatile registers we care about. Remember, all register -; handling here is pretty funky anyway, so we just pick the ones that are ok. -; - mr r26,r3 ; Save the activation pointer - - la r11,vmmFacCtx(r2) ; Point to the virtual machine facility context - mr r27,r2 ; Save the context entry - stw r11,deferctx(r3) ; Start using the virtual machine facility context when we exit - - lwz r11,ACT_MACT_SPF(r26) ; Get the special flags - mr r3,r31 ; Get the pointer to the PMAP - oris r15,r11,hi16(runningVM) ; ; Show that we are swapped to the VM right now - bl EXT(hw_set_user_space_dis) ; Swap the address spaces - lwz r17,vmmFlags(r27) ; Get the status flags - lwz r20,vmmContextKern(r27) ; Get the state page kernel addr - lwz r21,vmmCntrl(r20) ; Get vmmCntrl - rlwinm. r22,r21,0,vmmFamEnab,vmmFamEnab ; Is vmmFamEnab set? - lwz r22,vmmXAFlgs(r27) ; Get the eXtended Architecture flags - stw r22,VMMXAFlgs(r10) ; Store vmmXAFlgs in per_proc VMMXAFlgs - beq swvmNoFam ; No Fam intercept - rlwinm. r22,r22,0,0,0 ; Are we doing a 64-bit virtual machine? 
- rlwimi r15,r21,32+vmmFamSetb-FamVMmodebit,FamVMmodebit,FamVMmodebit ; Set FamVMmode bit - rlwinm r21,r21,0,vmmFamSetb+1,vmmFamSetb-1 ; Clear FamSet bit - bne swvmXfamintercpt - lwz r22,famintercepts(r20) ; Load intercept bit field - b swvmfamintercptres -swvmXfamintercpt: - lwz r22,faminterceptsX(r20) ; Load intercept bit field -swvmfamintercptres: - stw r21,vmmCntrl(r20) ; Update vmmCntrl - lwz r19,vmmContextPhys(r27) ; Get vmmFAMarea address - stw r22,vmmFAMintercept(r27) ; Get vmmFAMintercept - stw r22,FAMintercept(r10) ; Store vmmFAMintercept in per_proc FAMintercept - stw r19,VMMareaPhys(r10) ; Store VMMareaPhys - oris r15,r15,hi16(FamVMena) ; Set FamVMenabit -swvmNoFam: - stw r27,vmmCEntry(r26) ; Remember what context we are running - bf++ vmmMapDone,swvmNoMap ; We have not mapped anything or it was not for this address space - -; -; This little bit of hoopala here (triggered by vmmMapDone) is -; a performance enhancement. This will change the returning savearea -; to look like we had a DSI rather than a system call. Then, setting -; the redrive bit, the exception handler will redrive the exception as -; a DSI, entering the last mapped address into the hash table. This keeps -; double faults from happening. Note that there is only a gain if the VM -; takes a fault, then the emulator resolves it only, and then begins -; the VM execution again. It seems like this should be the normal case. -; -; Note that we need to revisit this when we move the virtual machines to the task because -; then it will be possible for more than one thread to access this stuff at the same time. 
-; - - lwz r3,SAVflags(r30) ; Pick up the savearea flags - lwz r2,vmmLastMap(r28) ; Get the last mapped address - lwz r14,vmmLastMap+4(r28) ; Get the last mapped address low half - li r20,T_DATA_ACCESS ; Change to DSI fault - oris r3,r3,hi16(SAVredrive) ; Set exception redrive - stw r2,savedar(r30) ; Set the DAR to the last thing we mapped - stw r14,savedar+4(r30) ; Set the DAR to the last thing we mapped - stw r3,SAVflags(r30) ; Turn on the redrive request - lis r2,hi16(MASK(DSISR_HASH)) ; Set PTE/DBAT miss - li r0,0 ; Clear - stw r20,saveexception(r30) ; Say we need to emulate a DSI - stw r2,savedsisr(r30) ; Pretend we have a PTE miss - stb r0,vmmGFlags+3(r28) ; Show that the redrive has been taken care of - -swvmNoMap: lwz r20,vmmContextKern(r27) ; Get the comm area - rlwimi r15,r17,32-(floatCngbit-vmmFloatCngdb),floatCngbit,vectorCngbit ; Shift and insert changed bits - lwz r20,vmmCntrl(r20) ; Get the control flags - rlwimi r17,r11,8,24,31 ; Save the old spf flags - rlwimi r15,r20,32+vmmKeyb-userProtKeybit,userProtKeybit,userProtKeybit ; Set the protection key - stw r15,spcFlags(r10) ; Set per_proc copy of the special flags - stw r15,ACT_MACT_SPF(r26) ; Get the special flags - - stw r17,vmmFlags(r27) ; Set the status flags - - bl swapCtxt ; First, swap the general register state - - lwz r17,vmmContextKern(r27) ; Get the comm area back - la r25,vmmFacCtx(r27) ; Point to the facility context - lwz r15,vmmCntrl(r17) ; Get the control flags again - mfsprg r29,1 ; Get the current activation - lwz r29,ACT_PER_PROC(r29) ; Get the per_proc block - -; -; Check if there is new floating point context to load -; - - rlwinm. r0,r15,0,vmmFloatLoadb,vmmFloatLoadb ; Are there new floating point values? - lhz r29,PP_CPU_NUMBER(r29) ; Get our cpu number - li r14,vmmppcFPRs ; Get displacement to the new values - andc r15,r15,r0 ; Clear the bit - beq+ swvmNoNewFloats ; Nope, good... 
- - lwz r19,FPUcpu(r25) ; Get the last CPU we ran on - - stw r29,FPUcpu(r25) ; Claim the context for ourselves - - eieio ; Make sure this stays in order - - lis r18,hi16(EXT(PerProcTable)) ; Set base PerProcTable - mulli r19,r19,ppeSize ; Find offset to the owner per_proc_entry - ori r18,r18,lo16(EXT(PerProcTable)) ; Set base PerProcTable - li r16,FPUowner ; Displacement to float owner - add r19,r18,r19 ; Point to the owner per_proc_entry - lwz r19,ppe_vaddr(r19) ; Point to the owner per_proc - -swvminvfpu: lwarx r18,r16,r19 ; Get the owner - - sub r0,r18,r25 ; Subtract one from the other - sub r3,r25,r18 ; Subtract the other from the one - or r3,r3,r0 ; Combine them - srawi r3,r3,31 ; Get a 0 if equal or -1 of not - and r18,r18,r3 ; Make 0 if same, unchanged if not - stwcx. r18,r16,r19 ; Try to invalidate it - bne-- swvminvfpu ; Try again if there was a collision... - - lwz r3,FPUsave(r25) ; Get the FPU savearea - dcbt r14,r17 ; Touch in first line of new stuff - mr. r3,r3 ; Is there one? - bne+ swvmGotFloat ; Yes... 
- - bl EXT(save_get) ; Get a savearea - - li r7,SAVfloat ; Get floating point flag - stw r26,SAVact(r3) ; Save our activation - li r0,0 ; Get a zero - stb r7,SAVflags+2(r3) ; Set that this is floating point - stw r0,SAVprev+4(r3) ; Clear the back chain - stw r0,SAVlevel(r3) ; We are always at level 0 (user state) - - stw r3,FPUsave(r25) ; Chain us to context - -swvmGotFloat: - la r4,savefp0(r3) ; Point to the destination - mr r21,r3 ; Save the save area - la r3,vmmppcFPRs(r17) ; Point to the source - li r5,32*8 ; Get the size (32 FPRs at 8 bytes each) - - bl EXT(bcopy) ; Copy the new values - - lwz r11,ACT_MACT_SPF(r26) ; Get the special flags - stw r15,vmmCntrl(r17) ; Save the control flags sans vmmFloatLoad - rlwinm r11,r11,0,floatCngbit+1,floatCngbit-1 ; Clear the changed bit here - lwz r14,vmmStat(r17) ; Get the status flags - mfsprg r10,1 ; Get the current activation - lwz r10,ACT_PER_PROC(r10) ; Get the per_proc block - stw r11,ACT_MACT_SPF(r26) ; Get the special flags - rlwinm r14,r14,0,vmmFloatCngdb+1,vmmFloatCngdb-1 ; Clear the changed flag - stw r11,spcFlags(r10) ; Set per_proc copy of the special flags - stw r14,vmmStat(r17) ; Set the status flags sans vmmFloatCngd - -; -; Check if there is new vector context to load -; - -swvmNoNewFloats: - rlwinm. r0,r15,0,vmmVectLoadb,vmmVectLoadb ; Are there new vector values? - li r14,vmmppcVRs ; Get displacement to the new values - andc r15,r15,r0 ; Clear the bit - beq+ swvmNoNewVects ; Nope, good... 
- - lwz r19,VMXcpu(r25) ; Get the last CPU we ran on - - stw r29,VMXcpu(r25) ; Claim the context for ourselves - - eieio ; Make sure this stays in order - - lis r18,hi16(EXT(PerProcTable)) ; Set base PerProcTable - mulli r19,r19,ppeSize ; Find offset to the owner per_proc_entry - ori r18,r18,lo16(EXT(PerProcTable)) ; Set base PerProcTable - li r16,VMXowner ; Displacement to vector owner - add r19,r18,r19 ; Point to the owner per_proc_entry - lwz r19,ppe_vaddr(r19) ; Point to the owner per_proc - -swvminvvec: lwarx r18,r16,r19 ; Get the owner - - sub r0,r18,r25 ; Subtract one from the other - sub r3,r25,r18 ; Subtract the other from the one - or r3,r3,r0 ; Combine them - srawi r3,r3,31 ; Get a 0 if equal or -1 of not - and r18,r18,r3 ; Make 0 if same, unchanged if not - stwcx. r18,r16,r19 ; Try to invalidate it - bne-- swvminvfpu ; Try again if there was a collision... - -swvminvved: lwz r3,VMXsave(r25) ; Get the vector savearea - dcbt r14,r17 ; Touch in first line of new stuff - mr. r3,r3 ; Is there one? - bne+ swvmGotVect ; Yes... 
- - bl EXT(save_get) ; Get a savearea - - li r7,SAVvector ; Get the vector type flag - stw r26,SAVact(r3) ; Save our activation - li r0,0 ; Get a zero - stb r7,SAVflags+2(r3) ; Set that this is vector - stw r0,SAVprev+4(r3) ; Clear the back chain - stw r0,SAVlevel(r3) ; We are always at level 0 (user state) - - stw r3,VMXsave(r25) ; Chain us to context - -swvmGotVect: - mr r21,r3 ; Save the pointer to the savearea - la r4,savevr0(r3) ; Point to the destination - la r3,vmmppcVRs(r17) ; Point to the source - li r5,32*16 ; Get the size (32 vectors at 16 bytes each) - - bl EXT(bcopy) ; Copy the new values - - lwz r8,savevrsave(r30) ; Get the current VRSave - - lwz r11,ACT_MACT_SPF(r26) ; Get the special flags - stw r15,vmmCntrl(r17) ; Save the control flags sans vmmVectLoad - rlwinm r11,r11,0,vectorCngbit+1,vectorCngbit-1 ; Clear the changed bit here - stw r8,savevrvalid(r21) ; Set the current VRSave as valid saved - lwz r14,vmmStat(r17) ; Get the status flags - mfsprg r10,1 ; Get the current activation - lwz r10,ACT_PER_PROC(r10) ; Get the per_proc block - stw r11,ACT_MACT_SPF(r26) ; Get the special flags - rlwinm r14,r14,0,vmmVectCngdb+1,vmmVectCngdb-1 ; Clear the changed flag - stw r11,spcFlags(r10) ; Set per_proc copy of the special flags - stw r14,vmmStat(r17) ; Set the status flags sans vmmVectCngd - -swvmNoNewVects: - li r3,1 ; Show normal exit with check for AST - mr r16,r26 ; Restore the thread pointer - b EXT(ppcscret) ; Go back to handler... - - .align 5 - -swvmmBogus: li r2,kVmmBogusContext ; Set bogus index return - li r0,0 ; Clear - li r3,1 ; Set normal return with check for AST - stw r0,saver3(r30) ; Clear upper half - stw r2,saver3+4(r30) ; Pass back the return code - b EXT(ppcscret) ; Go back to handler... 
- -swvmmBogAdsp: - li r2,kVmmInvalidAdSpace ; Set bogus address space return - li r0,0 ; Clear - li r3,1 ; Set normal return with check for AST - stw r0,saver3(r30) ; Clear upper half - stw r2,saver3+4(r30) ; Pass back the return code - b EXT(ppcscret) ; Go back to handler... - -swvmSetStop: - li r2,kVmmStopped ; Set stopped return - li r0,0 ; Clear - li r3,1 ; Set normal return with check for AST - stw r0,saver3(r30) ; Clear upper half - stw r2,saver3+4(r30) ; Pass back the return code - stw r2,return_code(r5) ; Save the exit code - b EXT(ppcscret) ; Go back to handler... - -svvmTimerPop: - li r2,kVmmReturnNull ; Set null return - li r0,0 ; Clear - li r3,1 ; Set normal return with check for AST - stw r0,saver3(r30) ; Clear upper half - stw r2,saver3+4(r30) ; Pass back the return code - stw r2,return_code(r5) ; Save the exit code - b EXT(ppcscret) ; Go back to handler... - -swvmEmulateISI: - mfsprg r10,2 ; Get feature flags - lwz r11,vmmXAFlgs(r28) ; Get the eXtended Architecture flags - mtcrf 0x02,r10 ; Move pf64Bit to its normal place in CR6 - rlwinm. r11,r11,0,0,0 ; Are we doing a 64-bit virtual machine? - li r2,kVmmReturnInstrPageFault ; Set ISI - crnot vmmDoing64,cr0_eq ; Remember if this is a 64-bit VM - li r0,0 ; Clear - li r3,1 ; Set normal return with check for AST - stw r0,saver3(r30) ; Clear upper half - stw r2,saver3+4(r30) ; Pass back the return code - stw r2,return_code(r5) ; Save the exit code - lis r7,hi16(MASK(DSISR_HASH)) ; Pretend like we got a PTE miss - bt vmmDoing64,vmISI64 ; Go do this for a 64-bit VM... - - lwz r10,vmmppcpc(r5) ; Get the PC as failing address - stw r10,return_params+0(r5) ; Save PC as first return parm - stw r7,return_params+4(r5) ; Save the pseudo-DSISR as second return parm - b EXT(ppcscret) ; Go back to handler... 
- -vmISI64: ld r10,vmmppcXpc(r5) ; Get the PC as failing address - std r10,return_paramsX+0(r5) ; Save PC as first return parm - std r7,return_paramsX+8(r5) ; Save the pseudo-DSISR as second return parm - b EXT(ppcscret) ; Go back to handler... - -; -; These syscalls are invalid, FAM syscall fast path -; - - .align 5 - .globl EXT(vmm_fam_reserved) - -LEXT(vmm_fam_reserved) - li r3,0 ; Force exception - b EXT(ppcscret) ; Go back to handler... - -; -; Here is where we exit from vmm mode. We do this on any kind of exception. -; Interruptions (decrementer, external, etc.) are another story though. -; These we just pass through. We also switch back explicity when requested. -; This will happen in response to a timer pop and some kinds of ASTs. -; -; Inputs: -; R3 = activation -; R4 = savearea -; - - .align 5 - .globl EXT(vmm_exit) - -LEXT(vmm_exit) - -vmmexitcall: - lwz r2,vmmCEntry(r3) ; Get the context that is active - lwz r12,ACT_VMMAP(r3) ; Get the VM_MAP for this guy - lwz r11,ACT_MACT_SPF(r3) ; Get the special flags - lwz r19,vmmFlags(r2) ; Get the status flags - mr r16,r3 ; R16 is safe to use for the activation address - - rlwimi r19,r11,floatCngbit-vmmFloatCngdb,vmmFloatCngdb,vmmVectCngdb ; Shift and insert changed bits - li r0,0 ; Get a zero - rlwimi r11,r19,vmmSpfSaveb,floatCngbit,vectorCngbit ; Restore the saved part of the spf - lwz r3,VMMAP_PMAP(r12) ; Get the pmap for the activation - rlwinm r11,r11,0,runningVMbit+1,runningVMbit-1 ; Clear the "in VM" flag - stw r0,vmmCEntry(r16) ; Clear pointer to active context - stw r19,vmmFlags(r2) ; Set the status flags - rlwinm r11,r11,0,userProtKeybit+1,userProtKeybit-1 ; Set back to normal protection key - mfsprg r10,1 ; Get the current activation - lwz r10,ACT_PER_PROC(r10) ; Get the per_proc block - rlwinm r11,r11,0,FamVMenabit+1,FamVMenabit-1 ; Clear FamVMEnable - lwz r18,spcFlags(r10) ; Get per_proc copy of the special flags - lwz r5,vmmContextKern(r2) ; Get the state page kernel addr - rlwinm 
r11,r11,0,FamVMmodebit+1,FamVMmodebit-1 ; Clear FamVMMode - lwz r6,vmmCntrl(r5) ; Get the control field - rlwimi r19,r18,FamVMmodebit-vmmFAMmodeb,vmmFAMmodeb,vmmFAMmodeb ; Shift and insert changed bits - rlwimi r6,r18,FamVMmodebit-vmmFamSetb,vmmFamSetb,vmmFamSetb ; Shift and insert changed bits - rlwimi r6,r18,userProtKeybit-vmmKeyb,vmmKeyb,vmmKeyb ; Shift and insert changed bits - stw r11,ACT_MACT_SPF(r16) ; Get the special flags - stw r6,vmmCntrl(r5) ; Store the control field - stw r11,spcFlags(r10) ; Set per_proc copy of the special flags - - mr r26,r16 ; Save the activation pointer - mr r27,r2 ; Save the context entry - - bl EXT(hw_set_user_space_dis) ; Swap the address spaces back to the emulator - - la r5,facctx(r16) ; Point to the main facility context - mr r2,r27 ; Restore - stw r5,deferctx(r16) ; Start using the main facility context on the way out - lwz r5,vmmContextKern(r27) ; Get the context area address - mr r3,r16 ; Restore activation address - stw r19,vmmStat(r5) ; Save the changed and popped flags - bl swapCtxt ; Exchange the VM context for the emulator one - stw r8,saver3+4(r30) ; Set the return code as the return value also - b EXT(retFromVM) ; Go back to handler... - - -; -; Here is where we force exit from vmm mode. We do this when as -; part of termination and is used to insure that we are not executing -; in an alternate context. Because this is called from C we need to save -; all non-volatile registers. 
-; -; Inputs: -; R3 = activation -; R4 = user savearea -; Interruptions disabled -; - - .align 5 - .globl EXT(vmm_force_exit) - -LEXT(vmm_force_exit) - - stwu r1,-(FM_ALIGN(20*4)+FM_SIZE)(r1) ; Get enough space for the registers - mflr r0 ; Save the return - stmw r13,FM_ARG0(r1) ; Save all non-volatile registers - stw r0,(FM_ALIGN(20*4)+FM_SIZE+FM_LR_SAVE)(r1) ; Save the return - - lwz r2,vmmCEntry(r3) ; Get the context that is active - lwz r11,ACT_MACT_SPF(r3) ; Get the special flags - lwz r19,vmmFlags(r2) ; Get the status flags - lwz r12,ACT_VMMAP(r3) ; Get the VM_MAP for this guy - - rlwimi r19,r11,floatCngbit-vmmFloatCngdb,vmmFloatCngdb,vmmVectCngdb ; Shift and insert changed bits - mr r26,r3 ; Save the activation pointer - rlwimi r11,r19,vmmSpfSaveb,floatCngbit,vectorCngbit ; Restore the saved part of the spf - li r0,0 ; Get a zero - rlwinm r9,r11,0,runningVMbit+1,runningVMbit-1 ; Clear the "in VM" flag - cmplw r9,r11 ; Check if we were in a vm - lwz r3,VMMAP_PMAP(r12) ; Get the pmap for the activation - beq- vfeNotRun ; We were not in a vm.... 
- rlwinm r9,r9,0,userProtKeybit+1,userProtKeybit-1 ; Set back to normal protection key - stw r0,vmmCEntry(r26) ; Clear pointer to active context - mfsprg r10,1 ; Get the current activation - lwz r10,ACT_PER_PROC(r10) ; Get the per_proc block - lwz r18,spcFlags(r10) ; Get per_proc copy of the special flags - rlwinm r9,r9,0,FamVMenabit+1,FamVMenabit-1 ; Clear Fam Enable - rlwinm r9,r9,0,FamVMmodebit+1,FamVMmodebit-1 ; Clear Fam Enable - lwz r5,vmmContextKern(r2) ; Get the context area address - lwz r6,vmmCntrl(r5) ; Get the control field - rlwimi r19,r18,FamVMmodebit-vmmFAMmodeb,vmmFAMmodeb,vmmFAMmodeb ; Shift and insert changed bits - rlwimi r6,r18,FamVMmodebit-vmmFamSetb,vmmFamSetb,vmmFamSetb ; Shift and insert changed bits - rlwimi r6,r18,userProtKeybit-vmmKeyb,vmmKeyb,vmmKeyb ; Shift and insert changed bits - stw r6,vmmCntrl(r5) ; Store the control field - stw r9,ACT_MACT_SPF(r26) ; Get the special flags - stw r9,spcFlags(r10) ; Set per_proc copy of the special flags - - mr r27,r2 ; Save the context entry - mr r30,r4 ; Save the savearea - - bl EXT(hw_set_user_space_dis) ; Swap the address spaces back to the emulator - - la r7,facctx(r26) ; Point to the main facility context - - lwz r5,vmmContextKern(r27) ; Get the context area address - stw r19,vmmStat(r5) ; Save the changed and popped flags - stw r7,deferctx(r26) ; Tell context launcher to switch facility context - - bl swapCtxt ; Exchange the VM context for the emulator one - - lwz r8,saveexception(r30) ; Pick up the exception code - lwz r7,SAVflags(r30) ; Pick up the savearea flags - lis r9,hi16(SAVredrive) ; Get exception redrive bit - rlwinm r8,r8,30,24,31 ; Convert exception to return code - andc r7,r7,r9 ; Make sure redrive is off because we are intercepting - stw r8,saver3+4(r30) ; Set the return code as the return value also - stw r7,SAVflags(r30) ; Set the savearea flags - - -vfeNotRun: lmw r13,FM_ARG0(r1) ; Restore all non-volatile registers - lwz r1,0(r1) ; Pop the stack - lwz r0,FM_LR_SAVE(r1) ; Get 
the return address - mtlr r0 ; Set return - blr - -; -; Note: we will not do any DCBTs to the savearea. It was just stored to a few cycles ago and should -; still be in the cache. -; -; NOTE NOTE: R16 is important to save!!!! -; - .align 5 - -swapCtxt: - mfsprg r10,2 ; Get feature flags - la r6,vmmppcpc(r5) ; Point to the first line - mtcrf 0x02,r10 ; Move pf64Bit to its normal place in CR6 - - lwz r14,saveexception(r30) ; Get the exception code - dcbt 0,r6 ; Touch in the first line of the context area - bt++ pf64Bitb,swap64 ; Go do this swap on a 64-bit machine... - - lwz r7,savesrr0+4(r30) ; Start moving context - lwz r8,savesrr1+4(r30) - lwz r9,saver0+4(r30) - cmplwi cr1,r14,T_SYSTEM_CALL ; Are we switching because of a system call? - lwz r10,saver1+4(r30) - lwz r11,saver2+4(r30) - lwz r12,saver3+4(r30) - lwz r13,saver4+4(r30) - la r6,vmmppcr6(r5) ; Point to second line - lwz r14,saver5+4(r30) - - dcbt 0,r6 ; Touch second line of context area - - lwz r15,vmmppcpc(r5) ; First line of context - lis r22,hi16(MSR_IMPORT_BITS) ; Get the MSR bits that are controllable by user - lwz r23,vmmppcmsr(r5) - ori r22,r22,lo16(MSR_IMPORT_BITS) ; Get the rest of the MSR bits that are controllable by user - lwz r17,vmmppcr0(r5) - lwz r18,vmmppcr1(r5) - and r23,r23,r22 ; Keep only the controllable bits - lwz r19,vmmppcr2(r5) - oris r23,r23,hi16(MSR_EXPORT_MASK_SET) ; Force on the required bits - lwz r20,vmmppcr3(r5) - ori r23,r23,lo16(MSR_EXPORT_MASK_SET) ; Force on the other required bits - lwz r21,vmmppcr4(r5) - lwz r22,vmmppcr5(r5) - - dcbt 0,r6 ; Touch third line of context area - - stw r7,vmmppcpc(r5) ; Save emulator context into the context area - stw r8,vmmppcmsr(r5) - stw r9,vmmppcr0(r5) - stw r10,vmmppcr1(r5) - stw r11,vmmppcr2(r5) - stw r12,vmmppcr3(r5) - stw r13,vmmppcr4(r5) - stw r14,vmmppcr5(r5) - -; -; Save the first 3 parameters if we are an SC (we will take care of the last later) -; - bne+ cr1,swapnotsc ; Skip next if not an SC exception... 
- stw r12,return_params+0(r5) ; Save the first return - stw r13,return_params+4(r5) ; Save the second return - stw r14,return_params+8(r5) ; Save the third return - -swapnotsc: li r6,0 ; Clear this out - stw r6,savesrr0(r30) ; Insure that high order is clear - stw r15,savesrr0+4(r30) ; Save vm context into the savearea - stw r6,savesrr1(r30) ; Insure that high order is clear - stw r23,savesrr1+4(r30) - stw r17,saver0+4(r30) - stw r18,saver1+4(r30) - stw r19,saver2+4(r30) - stw r20,saver3+4(r30) - stw r21,saver4+4(r30) - la r6,vmmppcr14(r5) ; Point to fourth line - stw r22,saver5+4(r30) - - dcbt 0,r6 ; Touch fourth line - -; Swap 8 registers - - lwz r7,saver6+4(r30) ; Read savearea - lwz r8,saver7+4(r30) - lwz r9,saver8+4(r30) - lwz r10,saver9+4(r30) - lwz r11,saver10+4(r30) - lwz r12,saver11+4(r30) - lwz r13,saver12+4(r30) - lwz r14,saver13+4(r30) - - lwz r15,vmmppcr6(r5) ; Read vm context - lwz r24,vmmppcr7(r5) - lwz r17,vmmppcr8(r5) - lwz r18,vmmppcr9(r5) - lwz r19,vmmppcr10(r5) - lwz r20,vmmppcr11(r5) - lwz r21,vmmppcr12(r5) - lwz r22,vmmppcr13(r5) - - stw r7,vmmppcr6(r5) ; Write context - stw r8,vmmppcr7(r5) - stw r9,vmmppcr8(r5) - stw r10,vmmppcr9(r5) - stw r11,vmmppcr10(r5) - stw r12,vmmppcr11(r5) - stw r13,vmmppcr12(r5) - la r6,vmmppcr22(r5) ; Point to fifth line - stw r14,vmmppcr13(r5) - - dcbt 0,r6 ; Touch fifth line - - stw r15,saver6+4(r30) ; Write vm context - stw r24,saver7+4(r30) - stw r17,saver8+4(r30) - stw r18,saver9+4(r30) - stw r19,saver10+4(r30) - stw r20,saver11+4(r30) - stw r21,saver12+4(r30) - stw r22,saver13+4(r30) - -; Swap 8 registers - - lwz r7,saver14+4(r30) ; Read savearea - lwz r8,saver15+4(r30) - lwz r9,saver16+4(r30) - lwz r10,saver17+4(r30) - lwz r11,saver18+4(r30) - lwz r12,saver19+4(r30) - lwz r13,saver20+4(r30) - lwz r14,saver21+4(r30) - - lwz r15,vmmppcr14(r5) ; Read vm context - lwz r24,vmmppcr15(r5) - lwz r17,vmmppcr16(r5) - lwz r18,vmmppcr17(r5) - lwz r19,vmmppcr18(r5) - lwz r20,vmmppcr19(r5) - lwz r21,vmmppcr20(r5) - lwz 
r22,vmmppcr21(r5) - - stw r7,vmmppcr14(r5) ; Write context - stw r8,vmmppcr15(r5) - stw r9,vmmppcr16(r5) - stw r10,vmmppcr17(r5) - stw r11,vmmppcr18(r5) - stw r12,vmmppcr19(r5) - stw r13,vmmppcr20(r5) - la r6,vmmppcr30(r5) ; Point to sixth line - stw r14,vmmppcr21(r5) - - dcbt 0,r6 ; Touch sixth line - - stw r15,saver14+4(r30) ; Write vm context - stw r24,saver15+4(r30) - stw r17,saver16+4(r30) - stw r18,saver17+4(r30) - stw r19,saver18+4(r30) - stw r20,saver19+4(r30) - stw r21,saver20+4(r30) - stw r22,saver21+4(r30) - -; Swap 8 registers - - lwz r7,saver22+4(r30) ; Read savearea - lwz r8,saver23+4(r30) - lwz r9,saver24+4(r30) - lwz r10,saver25+4(r30) - lwz r11,saver26+4(r30) - lwz r12,saver27+4(r30) - lwz r13,saver28+4(r30) - lwz r14,saver29+4(r30) - - lwz r15,vmmppcr22(r5) ; Read vm context - lwz r24,vmmppcr23(r5) - lwz r17,vmmppcr24(r5) - lwz r18,vmmppcr25(r5) - lwz r19,vmmppcr26(r5) - lwz r20,vmmppcr27(r5) - lwz r21,vmmppcr28(r5) - lwz r22,vmmppcr29(r5) - - stw r7,vmmppcr22(r5) ; Write context - stw r8,vmmppcr23(r5) - stw r9,vmmppcr24(r5) - stw r10,vmmppcr25(r5) - stw r11,vmmppcr26(r5) - stw r12,vmmppcr27(r5) - stw r13,vmmppcr28(r5) - la r6,vmmppcvscr(r5) ; Point to seventh line - stw r14,vmmppcr29(r5) - - dcbt 0,r6 ; Touch seventh line - - stw r15,saver22+4(r30) ; Write vm context - stw r24,saver23+4(r30) - stw r17,saver24+4(r30) - stw r18,saver25+4(r30) - stw r19,saver26+4(r30) - stw r20,saver27+4(r30) - stw r21,saver28+4(r30) - stw r22,saver29+4(r30) - -; Swap 8 registers - - lwz r7,saver30+4(r30) ; Read savearea - lwz r8,saver31+4(r30) - lwz r9,savecr(r30) - lwz r10,savexer+4(r30) - lwz r11,savelr+4(r30) - lwz r12,savectr+4(r30) - lwz r14,savevrsave(r30) - - lwz r15,vmmppcr30(r5) ; Read vm context - lwz r24,vmmppcr31(r5) - lwz r17,vmmppccr(r5) - lwz r18,vmmppcxer(r5) - lwz r19,vmmppclr(r5) - lwz r20,vmmppcctr(r5) - lwz r22,vmmppcvrsave(r5) - - stw r7,vmmppcr30(r5) ; Write context - stw r8,vmmppcr31(r5) - stw r9,vmmppccr(r5) - stw r10,vmmppcxer(r5) - stw 
r11,vmmppclr(r5) - stw r12,vmmppcctr(r5) - stw r14,vmmppcvrsave(r5) - - stw r15,saver30+4(r30) ; Write vm context - stw r24,saver31+4(r30) - stw r17,savecr(r30) - stw r18,savexer+4(r30) - stw r19,savelr+4(r30) - stw r20,savectr+4(r30) - stw r22,savevrsave(r30) - -; Swap 8 registers - - lwz r7,savevscr+0(r30) ; Read savearea - lwz r8,savevscr+4(r30) - lwz r9,savevscr+8(r30) - lwz r10,savevscr+12(r30) - lwz r11,savefpscrpad(r30) - lwz r12,savefpscr(r30) - - lwz r15,vmmppcvscr+0(r5) ; Read vm context - lwz r24,vmmppcvscr+4(r5) - lwz r17,vmmppcvscr+8(r5) - lwz r18,vmmppcvscr+12(r5) - lwz r19,vmmppcfpscrpad(r5) - lwz r20,vmmppcfpscr(r5) - - stw r7,vmmppcvscr+0(r5) ; Write context - stw r8,vmmppcvscr+4(r5) - stw r9,vmmppcvscr+8(r5) - stw r10,vmmppcvscr+12(r5) - stw r11,vmmppcfpscrpad(r5) - stw r12,vmmppcfpscr(r5) - - stw r15,savevscr+0(r30) ; Write vm context - stw r24,savevscr+4(r30) - stw r17,savevscr+8(r30) - stw r18,savevscr+12(r30) - stw r19,savefpscrpad(r30) - stw r20,savefpscr(r30) - - -; -; Cobble up the exception return code and save any specific return values -; - - lwz r7,saveexception(r30) ; Pick up the exception code - rlwinm r8,r7,30,24,31 ; Convert exception to return code - cmplwi r7,T_DATA_ACCESS ; Was this a DSI? - stw r8,return_code(r5) ; Save the exit code - cmplwi cr1,r7,T_INSTRUCTION_ACCESS ; Exiting because of an ISI? - beq+ swapDSI ; Yeah... - cmplwi r7,T_ALIGNMENT ; Alignment exception? - beq+ cr1,swapISI ; We had an ISI... - cmplwi cr1,r7,T_SYSTEM_CALL ; Exiting because of an system call? - beq+ swapDSI ; An alignment exception looks like a DSI... - beq+ cr1,swapSC ; We had a system call... - - blr ; Return... - -; -; Set exit returns for a DSI or alignment exception -; - -swapDSI: lwz r10,savedar+4(r30) ; Get the DAR - lwz r7,savedsisr(r30) ; and the DSISR - stw r10,return_params+0(r5) ; Save DAR as first return parm - stw r7,return_params+4(r5) ; Save DSISR as second return parm - blr ; Return... 
- -; -; Set exit returns for a ISI -; - -swapISI: lwz r7,vmmppcmsr(r5) ; Get the SRR1 value - lwz r10,vmmppcpc(r5) ; Get the PC as failing address - rlwinm r7,r7,0,1,4 ; Save the bits that match the DSISR - stw r10,return_params+0(r5) ; Save PC as first return parm - stw r7,return_params+4(r5) ; Save the pseudo-DSISR as second return parm - blr ; Return... - -; -; Set exit returns for a system call (note: we did the first 3 earlier) -; Do we really need to pass parameters back here???? -; - -swapSC: lwz r10,vmmppcr6(r5) ; Get the fourth paramter - stw r10,return_params+12(r5) ; Save it - blr ; Return... - -; -; Here is the swap for 64-bit machines -; - -swap64: lwz r22,vmmXAFlgs(r27) ; Get the eXtended Architecture flags - ld r7,savesrr0(r30) ; Start moving context - ld r8,savesrr1(r30) - ld r9,saver0(r30) - cmplwi cr1,r14,T_SYSTEM_CALL ; Are we switching because of a system call? - ld r10,saver1(r30) - ld r11,saver2(r30) - rlwinm. r22,r22,0,0,0 ; Are we doing a 64-bit virtual machine? - ld r12,saver3(r30) - crnot vmmDoing64,cr0_eq ; Remember if this is a 64-bit VM - ld r13,saver4(r30) - la r6,vmmppcr6(r5) ; Point to second line - ld r14,saver5(r30) - - dcbt 0,r6 ; Touch second line of context area - - bt vmmDoing64,sw64x1 ; Skip to 64-bit stuff - - lwz r15,vmmppcpc(r5) ; First line of context - lis r22,hi16(MSR_IMPORT_BITS) ; Get the MSR bits that are controllable by user - lwz r23,vmmppcmsr(r5) - ori r22,r22,lo16(MSR_IMPORT_BITS) ; Get the rest of the MSR bits that are controllable by user - lwz r17,vmmppcr0(r5) - lwz r18,vmmppcr1(r5) - and r23,r23,r22 ; Keep only the controllable bits - lwz r19,vmmppcr2(r5) - oris r23,r23,hi16(MSR_EXPORT_MASK_SET) ; Force on the required bits - lwz r20,vmmppcr3(r5) - ori r23,r23,lo16(MSR_EXPORT_MASK_SET) ; Force on the other required bits - lwz r21,vmmppcr4(r5) - lwz r22,vmmppcr5(r5) - - dcbt 0,r6 ; Touch third line of context area - - stw r7,vmmppcpc(r5) ; Save emulator context into the context area - stw r8,vmmppcmsr(r5) - stw 
r9,vmmppcr0(r5) - stw r10,vmmppcr1(r5) - stw r11,vmmppcr2(r5) - stw r12,vmmppcr3(r5) - stw r13,vmmppcr4(r5) - stw r14,vmmppcr5(r5) - -; -; Save the first 3 parameters if we are an SC (we will take care of the last later) -; - bne+ cr1,sw64x1done ; Skip next if not an SC exception... - stw r12,return_params+0(r5) ; Save the first return - stw r13,return_params+4(r5) ; Save the second return - stw r14,return_params+8(r5) ; Save the third return - b sw64x1done ; We are done with this section... - -sw64x1: ld r15,vmmppcXpc(r5) ; First line of context - li r0,1 ; Get a 1 to turn on 64-bit - lis r22,hi16(MSR_IMPORT_BITS) ; Get the MSR bits that are controllable by user (we will also allow 64-bit here) - sldi r0,r0,63 ; Get 64-bit bit - ld r23,vmmppcXmsr(r5) - ori r22,r22,lo16(MSR_IMPORT_BITS) ; Get the rest of the MSR bits that are controllable by user - ld r17,vmmppcXr0(r5) - or r22,r22,r0 ; Add the 64-bit bit - ld r18,vmmppcXr1(r5) - and r23,r23,r22 ; Keep only the controllable bits - ld r19,vmmppcXr2(r5) - oris r23,r23,hi16(MSR_EXPORT_MASK_SET) ; Force on the required bits - ld r20,vmmppcXr3(r5) - ori r23,r23,lo16(MSR_EXPORT_MASK_SET) ; Force on the other required bits - ld r21,vmmppcXr4(r5) - ld r22,vmmppcXr5(r5) - - dcbt 0,r6 ; Touch third line of context area - - std r7,vmmppcXpc(r5) ; Save emulator context into the context area - std r8,vmmppcXmsr(r5) - std r9,vmmppcXr0(r5) - std r10,vmmppcXr1(r5) - std r11,vmmppcXr2(r5) - std r12,vmmppcXr3(r5) - std r13,vmmppcXr4(r5) - std r14,vmmppcXr5(r5) - -; -; Save the first 3 parameters if we are an SC (we will take care of the last later) -; - bne+ cr1,sw64x1done ; Skip next if not an SC exception... 
- std r12,return_paramsX+0(r5) ; Save the first return - std r13,return_paramsX+8(r5) ; Save the second return - std r14,return_paramsX+16(r5) ; Save the third return - -sw64x1done: - std r15,savesrr0(r30) ; Save vm context into the savearea - std r23,savesrr1(r30) - std r17,saver0(r30) - std r18,saver1(r30) - std r19,saver2(r30) - std r20,saver3(r30) - std r21,saver4(r30) - la r6,vmmppcr14(r5) ; Point to fourth line - std r22,saver5(r30) - - dcbt 0,r6 ; Touch fourth line - -; Swap 8 registers - - ld r7,saver6(r30) ; Read savearea - ld r8,saver7(r30) - ld r9,saver8(r30) - ld r10,saver9(r30) - ld r11,saver10(r30) - ld r12,saver11(r30) - ld r13,saver12(r30) - ld r14,saver13(r30) - - bt vmmDoing64,sw64x2 ; Skip to 64-bit stuff - - lwz r15,vmmppcr6(r5) ; Read vm context - lwz r24,vmmppcr7(r5) - lwz r17,vmmppcr8(r5) - lwz r18,vmmppcr9(r5) - lwz r19,vmmppcr10(r5) - lwz r20,vmmppcr11(r5) - lwz r21,vmmppcr12(r5) - lwz r22,vmmppcr13(r5) - - stw r7,vmmppcr6(r5) ; Write context - stw r8,vmmppcr7(r5) - stw r9,vmmppcr8(r5) - stw r10,vmmppcr9(r5) - stw r11,vmmppcr10(r5) - stw r12,vmmppcr11(r5) - stw r13,vmmppcr12(r5) - la r6,vmmppcr22(r5) ; Point to fifth line - stw r14,vmmppcr13(r5) - - dcbt 0,r6 ; Touch fifth line - b sw64x2done ; We are done with this section... 
- -sw64x2: ld r15,vmmppcXr6(r5) ; Read vm context - ld r24,vmmppcXr7(r5) - ld r17,vmmppcXr8(r5) - ld r18,vmmppcXr9(r5) - ld r19,vmmppcXr10(r5) - ld r20,vmmppcXr11(r5) - ld r21,vmmppcXr12(r5) - ld r22,vmmppcXr13(r5) - - std r7,vmmppcXr6(r5) ; Write context - std r8,vmmppcXr7(r5) - std r9,vmmppcXr8(r5) - std r10,vmmppcXr9(r5) - std r11,vmmppcXr10(r5) - std r12,vmmppcXr11(r5) - std r13,vmmppcXr12(r5) - la r6,vmmppcXr22(r5) ; Point to fifth line - std r14,vmmppcXr13(r5) - - dcbt 0,r6 ; Touch fifth line - -sw64x2done: std r15,saver6(r30) ; Write vm context - std r24,saver7(r30) - std r17,saver8(r30) - std r18,saver9(r30) - std r19,saver10(r30) - std r20,saver11(r30) - std r21,saver12(r30) - std r22,saver13(r30) - -; Swap 8 registers - - ld r7,saver14(r30) ; Read savearea - ld r8,saver15(r30) - ld r9,saver16(r30) - ld r10,saver17(r30) - ld r11,saver18(r30) - ld r12,saver19(r30) - ld r13,saver20(r30) - ld r14,saver21(r30) - - bt vmmDoing64,sw64x3 ; Skip to 64-bit stuff - - lwz r15,vmmppcr14(r5) ; Read vm context - lwz r24,vmmppcr15(r5) - lwz r17,vmmppcr16(r5) - lwz r18,vmmppcr17(r5) - lwz r19,vmmppcr18(r5) - lwz r20,vmmppcr19(r5) - lwz r21,vmmppcr20(r5) - lwz r22,vmmppcr21(r5) - - stw r7,vmmppcr14(r5) ; Write context - stw r8,vmmppcr15(r5) - stw r9,vmmppcr16(r5) - stw r10,vmmppcr17(r5) - stw r11,vmmppcr18(r5) - stw r12,vmmppcr19(r5) - stw r13,vmmppcr20(r5) - la r6,vmmppcr30(r5) ; Point to sixth line - stw r14,vmmppcr21(r5) - - dcbt 0,r6 ; Touch sixth line - b sw64x3done ; Done with this section... 
- -sw64x3: ld r15,vmmppcXr14(r5) ; Read vm context - ld r24,vmmppcXr15(r5) - ld r17,vmmppcXr16(r5) - ld r18,vmmppcXr17(r5) - ld r19,vmmppcXr18(r5) - ld r20,vmmppcXr19(r5) - ld r21,vmmppcXr20(r5) - ld r22,vmmppcXr21(r5) - - std r7,vmmppcXr14(r5) ; Write context - std r8,vmmppcXr15(r5) - std r9,vmmppcXr16(r5) - std r10,vmmppcXr17(r5) - std r11,vmmppcXr18(r5) - std r12,vmmppcXr19(r5) - std r13,vmmppcXr20(r5) - la r6,vmmppcXr30(r5) ; Point to sixth line - std r14,vmmppcXr21(r5) - - dcbt 0,r6 ; Touch sixth line - -sw64x3done: std r15,saver14(r30) ; Write vm context - std r24,saver15(r30) - std r17,saver16(r30) - std r18,saver17(r30) - std r19,saver18(r30) - std r20,saver19(r30) - std r21,saver20(r30) - std r22,saver21(r30) - -; Swap 8 registers - - ld r7,saver22(r30) ; Read savearea - ld r8,saver23(r30) - ld r9,saver24(r30) - ld r10,saver25(r30) - ld r11,saver26(r30) - ld r12,saver27(r30) - ld r13,saver28(r30) - ld r14,saver29(r30) - - bt vmmDoing64,sw64x4 ; Skip to 64-bit stuff - - lwz r15,vmmppcr22(r5) ; Read vm context - lwz r24,vmmppcr23(r5) - lwz r17,vmmppcr24(r5) - lwz r18,vmmppcr25(r5) - lwz r19,vmmppcr26(r5) - lwz r20,vmmppcr27(r5) - lwz r21,vmmppcr28(r5) - lwz r22,vmmppcr29(r5) - - stw r7,vmmppcr22(r5) ; Write context - stw r8,vmmppcr23(r5) - stw r9,vmmppcr24(r5) - stw r10,vmmppcr25(r5) - stw r11,vmmppcr26(r5) - stw r12,vmmppcr27(r5) - stw r13,vmmppcr28(r5) - la r6,vmmppcvscr(r5) ; Point to seventh line - stw r14,vmmppcr29(r5) - dcbt 0,r6 ; Touch seventh line - b sw64x4done ; Done with this section... 
- -sw64x4: ld r15,vmmppcXr22(r5) ; Read vm context - ld r24,vmmppcXr23(r5) - ld r17,vmmppcXr24(r5) - ld r18,vmmppcXr25(r5) - ld r19,vmmppcXr26(r5) - ld r20,vmmppcXr27(r5) - ld r21,vmmppcXr28(r5) - ld r22,vmmppcXr29(r5) - - std r7,vmmppcXr22(r5) ; Write context - std r8,vmmppcXr23(r5) - std r9,vmmppcXr24(r5) - std r10,vmmppcXr25(r5) - std r11,vmmppcXr26(r5) - std r12,vmmppcXr27(r5) - std r13,vmmppcXr28(r5) - la r6,vmmppcvscr(r5) ; Point to seventh line - std r14,vmmppcXr29(r5) - - dcbt 0,r6 ; Touch seventh line - -sw64x4done: std r15,saver22(r30) ; Write vm context - std r24,saver23(r30) - std r17,saver24(r30) - std r18,saver25(r30) - std r19,saver26(r30) - std r20,saver27(r30) - std r21,saver28(r30) - std r22,saver29(r30) - -; Swap 8 registers - - ld r7,saver30(r30) ; Read savearea - ld r8,saver31(r30) - lwz r9,savecr(r30) - ld r10,savexer(r30) - ld r11,savelr(r30) - ld r12,savectr(r30) - lwz r14,savevrsave(r30) - - bt vmmDoing64,sw64x5 ; Skip to 64-bit stuff - - lwz r15,vmmppcr30(r5) ; Read vm context - lwz r24,vmmppcr31(r5) - lwz r17,vmmppccr(r5) - lwz r18,vmmppcxer(r5) - lwz r19,vmmppclr(r5) - lwz r20,vmmppcctr(r5) - lwz r22,vmmppcvrsave(r5) - - stw r7,vmmppcr30(r5) ; Write context - stw r8,vmmppcr31(r5) - stw r9,vmmppccr(r5) - stw r10,vmmppcxer(r5) - stw r11,vmmppclr(r5) - stw r12,vmmppcctr(r5) - stw r14,vmmppcvrsave(r5) - b sw64x5done ; Done here... 
- -sw64x5: ld r15,vmmppcXr30(r5) ; Read vm context - ld r24,vmmppcXr31(r5) - lwz r17,vmmppcXcr(r5) - ld r18,vmmppcXxer(r5) - ld r19,vmmppcXlr(r5) - ld r20,vmmppcXctr(r5) - lwz r22,vmmppcXvrsave(r5) - - std r7,vmmppcXr30(r5) ; Write context - std r8,vmmppcXr31(r5) - stw r9,vmmppcXcr(r5) - std r10,vmmppcXxer(r5) - std r11,vmmppcXlr(r5) - std r12,vmmppcXctr(r5) - stw r14,vmmppcXvrsave(r5) - -sw64x5done: std r15,saver30(r30) ; Write vm context - std r24,saver31(r30) - stw r17,savecr(r30) - std r18,savexer(r30) - std r19,savelr(r30) - std r20,savectr(r30) - stw r22,savevrsave(r30) - -; Swap 8 registers - - lwz r7,savevscr+0(r30) ; Read savearea - lwz r8,savevscr+4(r30) - lwz r9,savevscr+8(r30) - lwz r10,savevscr+12(r30) - lwz r11,savefpscrpad(r30) - lwz r12,savefpscr(r30) - - lwz r15,vmmppcvscr+0(r5) ; Read vm context - lwz r24,vmmppcvscr+4(r5) - lwz r17,vmmppcvscr+8(r5) - lwz r18,vmmppcvscr+12(r5) - lwz r19,vmmppcfpscrpad(r5) - lwz r20,vmmppcfpscr(r5) - - stw r7,vmmppcvscr+0(r5) ; Write context - stw r8,vmmppcvscr+4(r5) - stw r9,vmmppcvscr+8(r5) - stw r10,vmmppcvscr+12(r5) - stw r11,vmmppcfpscrpad(r5) - stw r12,vmmppcfpscr(r5) - - stw r15,savevscr+0(r30) ; Write vm context - stw r24,savevscr+4(r30) - stw r17,savevscr+8(r30) - stw r18,savevscr+12(r30) - stw r19,savefpscrpad(r30) - stw r20,savefpscr(r30) - - -; -; Cobble up the exception return code and save any specific return values -; - - lwz r7,saveexception(r30) ; Pick up the exception code - rlwinm r8,r7,30,24,31 ; Convert exception to return code - cmplwi r7,T_DATA_ACCESS ; Was this a DSI? - stw r8,return_code(r5) ; Save the exit code - cmplwi cr1,r7,T_INSTRUCTION_ACCESS ; Exiting because of an ISI? - beq+ swapDSI64 ; Yeah... - cmplwi r7,T_ALIGNMENT ; Alignment exception? - beq+ cr1,swapISI64 ; We had an ISI... - cmplwi cr1,r7,T_SYSTEM_CALL ; Exiting because of an system call? - beq+ swapDSI64 ; An alignment exception looks like a DSI... - beq+ cr1,swapSC64 ; We had a system call... - - blr ; Return... 
- -; -; Set exit returns for a DSI or alignment exception -; - -swapDSI64: ld r10,savedar(r30) ; Get the DAR - lwz r7,savedsisr(r30) ; and the DSISR - bt vmmDoing64,sw64DSI ; Skip to 64-bit stuff... - - - stw r10,return_params+0(r5) ; Save DAR as first return parm - stw r7,return_params+4(r5) ; Save DSISR as second return parm - blr ; Return... - -sw64DSI: std r10,return_paramsX+0(r5) ; Save DAR as first return parm - std r7,return_paramsX+8(r5) ; Save DSISR as second return parm (note that this is expanded to 64 bits) - blr ; Return... - -; -; Set exit returns for a ISI -; - -swapISI64: bt vmmDoing64,sw64ISI ; Skip to 64-bit stuff... - lwz r7,vmmppcmsr(r5) ; Get the SRR1 value - lwz r10,vmmppcpc(r5) ; Get the PC as failing address - rlwinm r7,r7,0,1,4 ; Save the bits that match the DSISR - stw r10,return_params+0(r5) ; Save PC as first return parm - stw r7,return_params+4(r5) ; Save the pseudo-DSISR as second return parm - blr ; Return... - -sw64ISI: ld r7,vmmppcXmsr(r5) ; Get the SRR1 value - ld r10,vmmppcXpc(r5) ; Get the PC as failing address - rlwinm r7,r7,0,1,4 ; Save the bits that match the DSISR - std r10,return_paramsX+0(r5) ; Save PC as first return parm - std r7,return_paramsX+8(r5) ; Save the pseudo-DSISR as second return parm - blr ; Return... - -; -; Set exit returns for a system call (note: we did the first 3 earlier) -; Do we really need to pass parameters back here???? -; - -swapSC64: bt vmmDoing64,sw64SC ; Skip to 64-bit stuff... - lwz r10,vmmppcr6(r5) ; Get the fourth paramter - stw r10,return_params+12(r5) ; Save it - blr ; Return... - -sw64SC: ld r10,vmmppcXr6(r5) ; Get the fourth paramter - std r10,return_paramsX+24(r5) ; Save it - blr ; Return... - -; -; vmmFamGuestResume: -; Restore Guest context from Fam mode. 
-; - -vmmFamGuestResume: - mfsprg r10,1 ; Get the current activation - lwz r10,ACT_PER_PROC(r10) ; Get the per_proc block - lwz r27,vmmCEntry(r3) ; Get the context that is active - lwz r4,VMMXAFlgs(r10) ; Get the eXtended Architecture flags - rlwinm. r4,r4,0,0,0 ; Are we doing a 64-bit virtual machine? - lwz r15,spcFlags(r10) ; Get per_proc special flags - mr r26,r3 ; Save the activation pointer - lwz r20,vmmContextKern(r27) ; Get the comm area - rlwinm r15,r15,0,FamVMmodebit+1,FamVMmodebit-1 ; Clear FamVMmodebit - stw r15,spcFlags(r10) ; Update the special flags - bne fgrX - lwz r7,famguestpc(r20) ; Load famguest ctx pc - bf++ vmmMapDone,fgrNoMap ; No mapping done for this space. - lwz r3,SAVflags(r30) ; Pick up the savearea flags - lwz r2,vmmLastMap(r28) ; Get the last mapped address - lwz r6,vmmLastMap+4(r28) ; Get the last mapped address - li r4,T_DATA_ACCESS ; Change to DSI fault - oris r3,r3,hi16(SAVredrive) ; Set exception redrive - stw r2,savedar(r30) ; Set the DAR to the last thing we mapped - stw r6,savedar+4(r30) ; Set the DAR to the last thing we mapped - stw r3,SAVflags(r30) ; Turn on the redrive request - lis r2,hi16(MASK(DSISR_HASH)) ; Set PTE/DBAT miss - stw r4,saveexception(r30) ; Say we need to emulate a DSI - li r0,0 ; Clear - stw r2,savedsisr(r30) ; Pretend we have a PTE miss - stb r0,vmmGFlags+3(r28) ; Show that the redrive has been taken care of -fgrNoMap: - lwz r4,savesrr1+4(r30) ; Get the saved MSR value - stw r7,savesrr0+4(r30) ; Set savearea pc - lwz r5,famguestmsr(r20) ; Load famguest ctx msr - lis r6,hi16(MSR_IMPORT_BITS) ; Get the MSR bits that are controllable by user - ori r6,r6,lo16(MSR_IMPORT_BITS) ; Get the rest of the MSR bits that are controllable by user - and r5,r5,r6 ; Keep only the controllable bits - oris r5,r5,hi16(MSR_EXPORT_MASK_SET) ; Force on the required bits - ori r5,r5,lo16(MSR_EXPORT_MASK_SET) ; Force on the other required bits - rlwimi r5,r4,0,MSR_FP_BIT,MSR_FP_BIT ; Propagate guest FP - rlwimi 
r5,r4,0,MSR_VEC_BIT,MSR_VEC_BIT ; Propagate guest Vector - stw r5,savesrr1+4(r30) ; Set savearea srr1 - lwz r4,famguestr0(r20) ; Load famguest ctx r0 - lwz r5,famguestr1(r20) ; Load famguest ctx r1 - lwz r6,famguestr2(r20) ; Load famguest ctx r2 - lwz r7,famguestr3(r20) ; Load famguest ctx r3 - stw r4,saver0+4(r30) ; Set savearea r0 - stw r5,saver1+4(r30) ; Set savearea r1 - stw r6,saver2+4(r30) ; Set savearea r2 - stw r7,saver3+4(r30) ; Set savearea r3 - lwz r4,famguestr4(r20) ; Load famguest ctx r4 - lwz r5,famguestr5(r20) ; Load famguest ctx r5 - lwz r6,famguestr6(r20) ; Load famguest ctx r6 - lwz r7,famguestr7(r20) ; Load famguest ctx r7 - stw r4,saver4+4(r30) ; Set savearea r4 - stw r5,saver5+4(r30) ; Set savearea r5 - stw r6,saver6+4(r30) ; Set savearea r6 - stw r7,saver7+4(r30) ; Set savearea r7 - b fgrret -fgrX: - ld r7,famguestXpc(r20) ; Load famguest ctx pc - bf++ vmmMapDone,fgrXNoMap ; No mapping done for this space. - lwz r3,SAVflags(r30) ; Pick up the savearea flags - ld r2,vmmLastMap(r28) ; Get the last mapped address - li r4,T_DATA_ACCESS ; Change to DSI fault - oris r3,r3,hi16(SAVredrive) ; Set exception redrive - std r2,savedar(r30) ; Set the DAR to the last thing we mapped - stw r3,SAVflags(r30) ; Turn on the redrive request - lis r2,hi16(MASK(DSISR_HASH)) ; Set PTE/DBAT miss - stw r4,saveexception(r30) ; Say we need to emulate a DSI - li r0,0 ; Clear - stw r2,savedsisr(r30) ; Pretend we have a PTE miss - stb r0,vmmGFlags+3(r28) ; Show that the redrive has been taken care of -fgrXNoMap: - ld r4,savesrr1(r30) ; Get the saved MSR value - std r7,savesrr0(r30) ; Set savearea pc - ld r5,famguestXmsr(r20) ; Load famguest ctx msr - lis r6,hi16(MSR_IMPORT_BITS) ; Get the MSR bits that are controllable by user - ori r6,r6,lo16(MSR_IMPORT_BITS) ; Get the rest of the MSR bits that are controllable by user - and r5,r5,r6 ; Keep only the controllable bits - oris r5,r5,hi16(MSR_EXPORT_MASK_SET) ; Force on the required bits - ori r5,r5,lo16(MSR_EXPORT_MASK_SET) 
; Force on the other required bits - rlwimi r5,r4,0,MSR_FP_BIT,MSR_FP_BIT ; Propagate guest FP - rlwimi r5,r4,0,MSR_VEC_BIT,MSR_VEC_BIT ; Propagate guest Vector - std r5,savesrr1(r30) ; Set savearea srr1 - ld r4,famguestXr0(r20) ; Load famguest ctx r0 - ld r5,famguestXr1(r20) ; Load famguest ctx r1 - ld r6,famguestXr2(r20) ; Load famguest ctx r2 - ld r7,famguestXr3(r20) ; Load famguest ctx r3 - std r4,saver0(r30) ; Set savearea r0 - std r5,saver1(r30) ; Set savearea r1 - std r6,saver2(r30) ; Set savearea r2 - std r7,saver3(r30) ; Set savearea r3 - ld r4,famguestXr4(r20) ; Load famguest ctx r4 - ld r5,famguestXr5(r20) ; Load famguest ctx r5 - ld r6,famguestXr6(r20) ; Load famguest ctx r6 - ld r7,famguestXr7(r20) ; Load famguest ctx r7 - std r4,saver4(r30) ; Set savearea r4 - std r5,saver5(r30) ; Set savearea r5 - std r6,saver6(r30) ; Set savearea r6 - std r7,saver7(r30) ; Set savearea r7 -fgrret: - li r3,1 ; Show normal exit with check for AST - mr r16,r26 ; Restore the thread pointer - b EXT(ppcscret) ; Go back to handler... - -; -; FAM Intercept exception handler -; - - .align 5 - .globl EXT(vmm_fam_exc) - -LEXT(vmm_fam_exc) - lwz r4,VMMXAFlgs(r2) ; Get the eXtended Architecture flags - lwz r1,pfAvailable(r2) ; Get the CPU features flags - rlwinm. r4,r4,0,0,0 ; Are we doing a 64-bit virtual machine? - bne fexcX - lwz r4,saver4+4(r13) ; Load savearea r4 - cmplwi r11,T_ALIGNMENT ; Alignment exception? - lwz r3,VMMareaPhys(r2) ; Load phys state page addr - mtcrf 0x02,r1 ; Move pf64Bit to its normal place in CR6 - cmplwi cr1,r11,T_PROGRAM ; Exiting because of an PRG? - bt++ pf64Bitb,fexcVMareaPhys64 ; Go do this on a 64-bit machine... 
- slwi r3,r3,12 ; Change ppnum to physical address - b fexcVMareaPhysres -fexcVMareaPhys64: - mtxer r5 ; Restore xer - lwz r5,saver5+4(r13) ; Load savearea r5 - lwz r6,saver6+4(r13) ; Load savearea r6 - sldi r3,r3,12 ; Change ppnum to physical address -fexcVMareaPhysres: - stw r4,famguestr4(r3) ; Save r4 in famguest ctx - stw r5,famguestr5(r3) ; Save r5 in famguest ctx - stw r6,famguestr6(r3) ; Save r6 in famguest ctx - stw r7,famguestr7(r3) ; Save r7 in famguest ctx - lwz r4,saver0+4(r13) ; Load savearea r0 - lwz r5,saver1+4(r13) ; Load savearea r1 - lwz r6,saver2+4(r13) ; Load savearea r2 - lwz r7,saver3+4(r13) ; Load savearea r3 - stw r4,famguestr0(r3) ; Save r0 in famguest ctx - stw r5,famguestr1(r3) ; Save r1 in famguest ctx - stw r6,famguestr2(r3) ; Save r2 in famguest ctx - stw r7,famguestr3(r3) ; Save r3 in famguest ctx - lwz r4,spcFlags(r2) ; Load per_proc spcFlags - oris r4,r4,hi16(FamVMmode) ; Set FAM mode - stw r4,spcFlags(r2) ; Update per_proc spcFlags - mfsrr0 r2 ; Get the interrupt srr0 - mfsrr1 r4 ; Get the interrupt srr1 - stw r2,famguestpc(r3) ; Save srr0 in famguest ctx - stw r4,famguestmsr(r3) ; Save srr1 in famguest ctx - li r6,lo16(MASK(MSR_FE0)|MASK(MSR_SE)|MASK(MSR_BE)|MASK(MSR_FE1)) - andc r6,r4,r6 ; Clear SE BE FE0 FE1 - mtsrr1 r6 ; Set srr1 - mr r6,r3 ; Set r6 with phys state page addr - rlwinm r7,r11,30,24,31 ; Convert exception to return code - beq+ cr1,fexcPRG ; We had a program exception... - bne+ fexcret - ; We had an Alignment... 
- mfdar r3 ; Load dar - mfdsisr r4 ; Load dsisr - stw r3,famparam+0x4(r6) ; Set famparam 1 with dar - stw r4,famparam+0x8(r6) ; Set famparam 2 with dsir - b fexcret ; -fexcPRG: - stw r4,famparam+0x4(r6) ; Set famparam 1 with srr1 - mr r3,r4 ; Set r3 with dsisr - lwz r4,famguestr4(r6) ; Load r4 from famguest context -fexcret: - lwz r5,famguestr5(r6) ; Load r5 from famguest context - lwz r13,famhandler(r6) ; Load user address to resume - stw r2,famparam(r6) ; Set famparam 0 with srr0 - stw r7,famdispcode(r6) ; Save the exit code - lwz r1,famrefcon(r6) ; load refcon - bt++ pf64Bitb,fexcrfi64 ; Go do this on a 64-bit machine... - mtcr r0 ; Restore cr - mtsrr0 r13 ; Load srr0 - mr r0,r7 ; Set dispatch code - lwz r7,famguestr7(r6) ; Load r7 from famguest context - lwz r6,famguestr6(r6) ; Load r6 from famguest context - mfsprg r13,2 ; Restore r13 - mfsprg r11,3 ; Restore r11 - rfi -fexcrfi64: - mtcr r0 ; Restore cr - mtsrr0 r13 ; Load srr0 - mr r0,r7 ; Set dispatch code - lwz r7,famguestr7(r6) ; Load r7 from famguest context - lwz r6,famguestr6(r6) ; Load r6 from famguest context - mfsprg r13,2 ; Restore r13 - mfsprg r11,3 ; Restore r11 - rfid -fexcX: - mtxer r5 ; Restore xer - ld r4,saver4(r13) ; Load savearea r4 - ld r5,saver5(r13) ; Load savearea r5 - ld r6,saver6(r13) ; Load savearea r6 - cmplwi r11,T_ALIGNMENT ; Alignment exception? - lwz r3,VMMareaPhys(r2) ; Load phys state page addr - mtcrf 0x02,r1 ; Move pf64Bit to its normal place in CR6 - cmplwi cr1,r11,T_PROGRAM ; Exiting because of an PRG? 
- sldi r3,r3,12 ; Change ppnum to physical address - std r4,famguestXr4(r3) ; Save r4 in famguest ctx - std r5,famguestXr5(r3) ; Save r5 in famguest ctx - std r6,famguestXr6(r3) ; Save r6 in famguest ctx - std r7,famguestXr7(r3) ; Save r7 in famguest ctx - ld r4,saver0(r13) ; Load savearea r0 - ld r5,saver1(r13) ; Load savearea r1 - ld r6,saver2(r13) ; Load savearea r2 - ld r7,saver3(r13) ; Load savearea r3 - std r4,famguestXr0(r3) ; Save r0 in famguest ctx - std r5,famguestXr1(r3) ; Save r1 in famguest ctx - std r6,famguestXr2(r3) ; Save r2 in famguest ctx - std r7,famguestXr3(r3) ; Save r3 in famguest ctx - lwz r4,spcFlags(r2) ; Load per_proc spcFlags - oris r4,r4,hi16(FamVMmode) ; Set FAM mode - stw r4,spcFlags(r2) ; Update per_proc spcFlags - mfsrr0 r2 ; Get the interrupt srr0 - mfsrr1 r4 ; Get the interrupt srr1 - std r2,famguestXpc(r3) ; Save srr0 in famguest ctx - std r4,famguestXmsr(r3) ; Save srr1 in famguest ctx - li r6,lo16(MASK(MSR_FE0)|MASK(MSR_SE)|MASK(MSR_BE)|MASK(MSR_FE1)) - andc r6,r4,r6 ; Clear SE BE FE0 FE1 - mtsrr1 r6 ; Set srr1 - mr r6,r3 ; Set r6 with phys state page addr - rlwinm r7,r11,30,24,31 ; Convert exception to return code - beq+ cr1,fexcXPRG ; We had a program exception... - bne+ fexcXret - ; We had an Alignment... 
- mfdar r3 ; Load dar - mfdsisr r4 ; Load dsisr - std r3,famparamX+0x8(r6) ; Set famparam 1 with dar - std r4,famparamX+0x10(r6) ; Set famparam 2 with dsir - b fexcXret -fexcXPRG: - std r4,famparamX+0x8(r6) ; Set famparam 1 with srr1 - mr r3,r4 ; Set r3 with dsisr - ld r4,famguestXr4(r6) ; Load r4 from famguest context -fexcXret: - ld r5,famguestXr5(r6) ; Load r5 from famguest context - ld r13,famhandlerX(r6) ; Load user address to resume - std r2,famparamX(r6) ; Set famparam 0 with srr0 - std r7,famdispcodeX(r6) ; Save the exit code - ld r1,famrefconX(r6) ; load refcon - mtcr r0 ; Restore cr - mtsrr0 r13 ; Load srr0 - mr r0,r7 ; Set dispatch code - ld r7,famguestXr7(r6) ; Load r7 from famguest context - ld r6,famguestXr6(r6) ; Load r6 from famguest context - mfsprg r13,2 ; Restore r13 - mfsprg r11,3 ; Restore r11 - rfid - -; -; FAM Intercept DSI ISI fault handler -; - - .align 5 - .globl EXT(vmm_fam_pf) - -LEXT(vmm_fam_pf) - lwz r4,VMMXAFlgs(r2) ; Get the eXtended Architecture flags - lwz r3,VMMareaPhys(r2) ; Load phys state page addr - rlwinm. r4,r4,0,0,0 ; Are we doing a 64-bit virtual machine? - bne fpfX - lwz r4,saver0+4(r13) ; Load savearea r0 - lwz r5,saver1+4(r13) ; Load savearea r1 - lwz r6,saver2+4(r13) ; Load savearea r2 - lwz r7,saver3+4(r13) ; Load savearea r3 - bt++ pf64Bitb,fpfVMareaPhys64 ; Go do this on a 64-bit machine... 
- slwi r3,r3,12 ; Change ppnum to physical address - b fpfVMareaPhysret -fpfVMareaPhys64: - sldi r3,r3,12 ; Change ppnum to physical address -fpfVMareaPhysret: - stw r4,famguestr0(r3) ; Save r0 in famguest - stw r5,famguestr1(r3) ; Save r1 in famguest - stw r6,famguestr2(r3) ; Save r2 in famguest - stw r7,famguestr3(r3) ; Save r3 in famguest - lwz r4,saver4+4(r13) ; Load savearea r0 - lwz r5,saver5+4(r13) ; Load savearea r1 - lwz r6,saver6+4(r13) ; Load savearea r2 - lwz r7,saver7+4(r13) ; Load savearea r3 - stw r4,famguestr4(r3) ; Save r4 in famguest - lwz r4,spcFlags(r2) ; Load spcFlags - stw r5,famguestr5(r3) ; Save r5 in famguest - lwz r5,savesrr0+4(r13) ; Get the interrupt srr0 - stw r6,famguestr6(r3) ; Save r6 in famguest - lwz r6,savesrr1+4(r13) ; Load srr1 - oris r4,r4,hi16(FamVMmode) ; Set FAM mode - stw r7,famguestr7(r3) ; Save r7 in famguest - stw r4,spcFlags(r2) ; Update spcFlags - lwz r1,famrefcon(r3) ; Load refcon - lwz r2,famhandler(r3) ; Load famhandler to resume - stw r5,famguestpc(r3) ; Save srr0 - stw r5,saver2+4(r13) ; Store srr0 in savearea r2 - stw r5,famparam(r3) ; Store srr0 in fam param 0 - stw r6,famguestmsr(r3) ; Save srr1 in famguestmsr - cmplwi cr1,r11,T_INSTRUCTION_ACCESS ; Was this a ISI? - rlwinm r7,r11,30,24,31 ; Convert exception to return code - beq+ cr1,fpfISI ; We had an ISI... 
-; fpfDSI - lwz r6,savedar+4(r13) ; Load dar from savearea - lwz r4,savedsisr(r13) ; Load dsisr from savearea - stw r6,famparam+0x4(r3) ; Store dar in fam param 1 - stw r6,saver3+4(r13) ; Store dar in savearea r3 - stw r4,famparam+0x8(r3) ; Store dsisr in fam param 2 - stw r4,saver4+4(r13) ; Store dsisr in savearea r4 - b fpfret -fpfISI: - rlwinm r6,r6,0,1,4 ; Save the bits that match the DSISR - stw r6,famparam+0x4(r3) ; Store srr1 in fam param 1 - stw r6,saver3+4(r13) ; Store srr1 in savearea r3 -fpfret: - stw r7,saver0+4(r13) ; Set dispatch code - stw r7,famdispcode(r3) ; Set dispatch code - stw r1,saver1+4(r13) ; Store refcon in savearea r1 - stw r2,savesrr0+4(r13) ; Store famhandler in srr0 - blr -fpfX: - ld r4,saver0(r13) ; Load savearea r0 - ld r5,saver1(r13) ; Load savearea r1 - ld r6,saver2(r13) ; Load savearea r2 - ld r7,saver3(r13) ; Load savearea r3 - sldi r3,r3,12 ; Change ppnum to physical address - std r4,famguestXr0(r3) ; Save r0 in famguest - std r5,famguestXr1(r3) ; Save r1 in famguest - std r6,famguestXr2(r3) ; Save r2 in famguest - std r7,famguestXr3(r3) ; Save r3 in famguest - ld r4,saver4(r13) ; Load savearea r0 - ld r5,saver5(r13) ; Load savearea r1 - ld r6,saver6(r13) ; Load savearea r2 - ld r7,saver7(r13) ; Load savearea r3 - std r4,famguestXr4(r3) ; Save r4 in famguest - lwz r4,spcFlags(r2) ; Load spcFlags - std r5,famguestXr5(r3) ; Save r5 in famguest - ld r5,savesrr0(r13) ; Get the interrupt srr0 - std r6,famguestXr6(r3) ; Save r6 in famguest - ld r6,savesrr1(r13) ; Load srr1 - oris r4,r4,hi16(FamVMmode) ; Set FAM mode - std r7,famguestXr7(r3) ; Save r7 in famguest - stw r4,spcFlags(r2) ; Update spcFlags - ld r1,famrefconX(r3) ; Load refcon - ld r2,famhandlerX(r3) ; Load famhandler to resume - std r5,famguestXpc(r3) ; Save srr0 - std r5,saver2(r13) ; Store srr0 in savearea r2 - std r5,famparamX(r3) ; Store srr0 in fam param 0 - std r6,famguestXmsr(r3) ; Save srr1 in famguestmsr - cmplwi cr1,r11,T_INSTRUCTION_ACCESS ; Was this a ISI? 
- rlwinm r7,r11,30,24,31 ; Convert exception to return code - beq+ cr1,fpfXISI ; We had an ISI... -; fpfXDSI - ld r6,savedar(r13) ; Load dar from savearea - lwz r4,savedsisr(r13) ; Load dsisr from savearea - std r6,famparamX+0x8(r3) ; Store dar in fam param 1 - std r6,saver3(r13) ; Store dar in savearea r3 - std r4,famparamX+0x10(r3) ; Store dsisr in fam param 2 - std r4,saver4(r13) ; Store dsisr in savearea r4 - b fpfXret -fpfXISI: - rlwinm r6,r6,0,1,4 ; Save the bits that match the DSISR - std r6,famparamX+0x8(r3) ; Store srr1 in fam param 1 - std r6,saver3(r13) ; Store srr1 in savearea r3 -fpfXret: - std r7,saver0(r13) ; Set dispatch code - std r7,famdispcodeX(r3) ; Set dispatch code - std r1,saver1(r13) ; Store refcon in savearea r1 - std r2,savesrr0(r13) ; Store famhandler in srr0 - blr - -/* - * Ultra Fast Path FAM syscalls - * - * The UFT FAMs are those from kvmmResumeGuest to kvmmSetGuestRegister, inclusive. - * We get here directly from the syscall vector, with interrupts and translation off, - * 64-bit mode on if supported, and all registers live except: - * - * r13 = holds caller's cr - * sprg2 = holds caller's r13 - * sprg3 = holds caller's r11 - * cr2 = set on (r3==kvmmSetGuestRegister) - * cr5 = set on (r3==kvmmResumeGuest) - */ - - .align 5 - .globl EXT(vmm_ufp) - -LEXT(vmm_ufp) - mfsprg r3,0 ; Get the per_proc area - mr r11,r13 ; Move saved cr to r11 - lwz r13,VMMXAFlgs(r3) ; Get the eXtended Architecture flags - rlwinm. r13,r13,0,0,0 ; Are we doing a 64-bit virtual machine? - - lwz r13,pfAvailable(r3) ; Get feature flags - mtcrf 0x02,r13 ; Put pf64Bitb etc in cr6 - lwz r13,VMMareaPhys(r3) ; Load fast assist area - bt++ pf64Bitb,ufpVMareaPhys64 ; Go do this on a 64-bit machine... 
- slwi r13,r13,12 ; Change ppnum to physical address - b ufpVMareaPhysret -ufpVMareaPhys64: - sldi r13,r13,12 ; Change ppnum to physical address -ufpVMareaPhysret: - bne ufpX ; go handle a 64-bit virtual machine - - bt cr5_eq,ufpResumeGuest ; if kvmmResumeGuest, branch to ResumeGuest - cmplwi cr5,r4,7 ; First argument in range? (ie, 0-7) - bgt cr5,ufpVMret ; Return if not in the range - slwi r4,r4,2 ; multiply index by 4 - la r3,famguestr0(r13) ; Load the base address - bt cr2_eq,ufpSetGuestReg ; Set/get selector -; ufpGetGuestReg - lwzx r3,r4,r3 ; Load the guest register - b ufpVMret ; Return -ufpSetGuestReg: - stwx r5,r4,r3 ; Update the guest register - li r3,0 ; Set return value - b ufpVMret ; Return -ufpResumeGuest: - lwz r7,spcFlags(r3) ; Pick up the special flags - mtsrr0 r4 ; Set srr0 - rlwinm. r6,r6,0,vmmKeyb,vmmKeyb ; Check vmmKeyb in maskCntrl - rlwinm r7,r7,0,FamVMmodebit+1,FamVMmodebit-1 ; Clear FamVMmodebit - stw r7,spcFlags(r3) ; Update the special flags - mfsrr1 r6 ; Get the current MSR value - - lwz r4,famguestmsr(r13) ; Load guest srr1 - lis r1,hi16(MSR_IMPORT_BITS) ; Get the MSR bits that are controllable by user - ori r1,r1,lo16(MSR_IMPORT_BITS) ; Get the rest of the MSR bits that are controllable by user - and r4,r4,r1 ; Keep only the controllable bits - oris r4,r4,hi16(MSR_EXPORT_MASK_SET) ; Force on the required bits - ori r4,r4,lo16(MSR_EXPORT_MASK_SET) ; Force on the other required bits - rlwimi r4,r6,0,MSR_FP_BIT,MSR_FP_BIT ; Propagate guest FP - rlwimi r4,r6,0,MSR_VEC_BIT,MSR_VEC_BIT ; Propagate guest Vector - beq ufpnokey ; Branch if not key switch - mr r2,r7 ; Save r7 - rlwimi r7,r5,32+vmmKeyb-userProtKeybit,userProtKeybit,userProtKeybit ; Set the protection key - cmpw cr0,r7,r2 ; Is userProtKeybit changed? - beq ufpnokey ; No, go to ResumeGuest_nokey - mr r5,r3 ; Get the per_proc area - stw r7,spcFlags(r3) ; Update the special flags - - bt++ pf64Bitb,ufpsave64 ; Go do this on a 64-bit machine... 
- - lwz r3,next_savearea+4(r5) ; Get the exception save area - stw r8,saver8+4(r3) ; Save r8 - stw r9,saver9+4(r3) ; Save r9 - stw r10,saver10+4(r3) ; Save r10 - stw r11,saver11+4(r3) ; Save r11 - stw r12,saver12+4(r3) ; Save r12 - stw r13,saver13+4(r3) ; Save r12 - stw r14,saver14+4(r3) ; Save r14 - stw r15,saver15+4(r3) ; Save r15 - stw r16,saver16+4(r3) ; Save r16 - stw r17,saver17+4(r3) ; Save r17 - stw r18,saver18+4(r3) ; Save r18 - stw r19,saver19+4(r3) ; Save r19 - stw r20,saver20+4(r3) ; Save r20 - stw r21,saver21+4(r3) ; Save r21 - stw r22,saver22+4(r3) ; Save r22 - stw r23,saver23+4(r3) ; Save r23 - stw r24,saver24+4(r3) ; Save r24 - stw r25,saver25+4(r3) ; Save r25 - stw r26,saver26+4(r3) ; Save r26 - stw r27,saver27+4(r3) ; Save r27 - stw r28,saver28+4(r3) ; Save r28 - stw r29,saver29+4(r3) ; Save r29 - stw r30,saver30+4(r3) ; Save r30 - stw r31,saver31+4(r3) ; Save r31 - b ufpsaveres ; Continue - -ufpsave64: - ld r3,next_savearea(r5) ; Get the exception save area - std r8,saver8(r3) ; Save r8 - std r9,saver9(r3) ; Save r9 - std r10,saver10(r3) ; Save r10 - std r11,saver11(r3) ; Save r11 - std r12,saver12(r3) ; Save r12 - std r13,saver13(r3) ; Save r12 - std r14,saver14(r3) ; Save r14 - std r15,saver15(r3) ; Save r15 - std r16,saver16(r3) ; Save r16 - std r17,saver17(r3) ; Save r17 - std r18,saver18(r3) ; Save r18 - std r19,saver19(r3) ; Save r19 - std r20,saver20(r3) ; Save r20 - std r21,saver21(r3) ; Save r21 - std r22,saver22(r3) ; Save r22 - std r23,saver23(r3) ; Save r23 - std r24,saver24(r3) ; Save r24 - std r25,saver25(r3) ; Save r25 - std r26,saver26(r3) ; Save r26 - std r27,saver27(r3) ; Save r27 - std r28,saver28(r3) ; Save r28 - std r29,saver29(r3) ; Save r29 - mfxer r2 ; Get xer - std r30,saver30(r3) ; Save r30 - std r31,saver31(r3) ; Save r31 - std r2,savexer(r3) ; Save xer - -ufpsaveres: - mflr r20 ; Get lr - li r2,1 ; Set to 1 - stw r7,spcFlags(r5) ; Update the special flags - mr r13,r3 ; Set current savearea - mr r21,r4 ; Save r4 - sth 
r2,ppInvSeg(r5) ; Force a reload of the SRs - mr r29,r5 ; Get the per_proc area - mr r3,r4 ; Set MSR value we going to - bl EXT(switchSegs) ; Go handle the segment registers/STB - mr r3,r13 ; Set current savearea - mr r4,r21 ; Restore r4 - mtlr r20 ; Set lr - - bt++ pf64Bitb,ufprestore64 ; Go do this on a 64-bit machine... - lwz r8,saver8+4(r3) ; Load r8 - lwz r9,saver9+4(r3) ; Load r9 - lwz r10,saver10+4(r3) ; Load r10 - lwz r11,saver11+4(r3) ; Load r11 - lwz r12,saver12+4(r3) ; Load r12 - lwz r13,saver13+4(r3) ; Load r12 - lwz r14,saver14+4(r3) ; Load r14 - lwz r15,saver15+4(r3) ; Load r15 - lwz r16,saver16+4(r3) ; Load r16 - lwz r17,saver17+4(r3) ; Load r17 - lwz r18,saver18+4(r3) ; Load r18 - lwz r19,saver19+4(r3) ; Load r19 - lwz r20,saver20+4(r3) ; Load r20 - lwz r21,saver21+4(r3) ; Load r21 - lwz r22,saver22+4(r3) ; Load r22 - lwz r23,saver23+4(r3) ; Load r23 - lwz r24,saver24+4(r3) ; Load r24 - lwz r25,saver25+4(r3) ; Load r25 - lwz r26,saver26+4(r3) ; Load r26 - lwz r27,saver27+4(r3) ; Load r27 - lwz r28,saver28+4(r3) ; Load r28 - lwz r29,saver29+4(r3) ; Load r29 - lwz r30,saver30+4(r3) ; Load r30 - lwz r31,saver31+4(r3) ; Load r31 - b ufpnokey ; Continue -ufprestore64: - ld r2,savexer(r3) ; Load xer - ld r8,saver8(r3) ; Load r8 - ld r9,saver9(r3) ; Load r9 - ld r10,saver10(r3) ; Load r10 - mtxer r2 ; Restore xer - ld r11,saver11(r3) ; Load r11 - ld r12,saver12(r3) ; Load r12 - ld r13,saver13(r3) ; Load r12 - ld r14,saver14(r3) ; Load r14 - ld r15,saver15(r3) ; Load r15 - ld r16,saver16(r3) ; Load r16 - ld r17,saver17(r3) ; Load r17 - ld r18,saver18(r3) ; Load r18 - ld r19,saver19(r3) ; Load r19 - ld r20,saver20(r3) ; Load r20 - ld r21,saver21(r3) ; Load r21 - ld r22,saver22(r3) ; Load r22 - ld r23,saver23(r3) ; Load r23 - ld r24,saver24(r3) ; Load r24 - ld r25,saver25(r3) ; Load r25 - ld r26,saver26(r3) ; Load r26 - ld r27,saver27(r3) ; Load r27 - ld r28,saver28(r3) ; Load r28 - ld r29,saver29(r3) ; Load r29 - ld r30,saver30(r3) ; Load r30 - ld 
r31,saver31(r3) ; Load r31 -ufpnokey: - mfsprg r3,0 ; Get the per_proc area - mtsrr1 r4 ; Set srr1 - lwz r0,famguestr0(r13) ; Load r0 - lwz r1,famguestr1(r13) ; Load r1 - lwz r2,famguestr2(r13) ; Load r2 - lwz r3,famguestr3(r13) ; Load r3 - lwz r4,famguestr4(r13) ; Load r4 - lwz r5,famguestr5(r13) ; Load r5 - lwz r6,famguestr6(r13) ; Load r6 - lwz r7,famguestr7(r13) ; Load r7 -ufpVMret: - mfsprg r13,2 ; Restore R13 - bt++ pf64Bitb,ufpVMrfi64 ; Go do this on a 64-bit machine... - mtcrf 0xFF,r11 ; Restore CR - mfsprg r11,3 ; Restore R11 - rfi ; All done, go back... -ufpVMrfi64: - mtcrf 0xFF,r11 ; Restore CR - mfsprg r11,3 ; Restore R11 - rfid - -ufpX: ; here if virtual machine is 64-bit - bt cr5_eq,ufpXResumeGuest ; if kvmmResumeGuest, branch to ResumeGuest - cmplwi cr5,r4,7 ; Is first arg in range 0-7? - bgt cr5,ufpXVMret ; Return if not in the range - slwi r4,r4,3 ; multiply index by 8 - la r3,famguestXr0(r13) ; Load the base address - bt cr2_eq,ufpXSetGuestReg ; Set/get selector -; ufpXGetGuestReg - ldx r3,r4,r3 ; Load the guest register - b ufpXVMret ; Return -ufpXSetGuestReg: - stdx r5,r4,r3 ; Update the guest register - li r3,0 ; Set return value - b ufpXVMret ; Return -ufpXResumeGuest: - lwz r7,spcFlags(r3) ; Pick up the special flags - mtsrr0 r4 ; Set srr0 - rlwinm. 
r6,r6,0,vmmKeyb,vmmKeyb ; Check vmmKeyb in maskCntrl - rlwinm r7,r7,0,FamVMmodebit+1,FamVMmodebit-1 ; Clear FamVMmodebit - stw r7,spcFlags(r3) ; Update the special flags - mfsrr1 r6 ; Get the current MSR value - - ld r4,famguestXmsr(r13) ; Load guest srr1 - lis r1,hi16(MSR_IMPORT_BITS) ; Get the MSR bits that are controllable by user - ori r1,r1,lo16(MSR_IMPORT_BITS) ; Get the rest of the MSR bits that are controllable by user - and r4,r4,r1 ; Keep only the controllable bits - oris r4,r4,hi16(MSR_EXPORT_MASK_SET) ; Force on the required bits - ori r4,r4,lo16(MSR_EXPORT_MASK_SET) ; Force on the other required bits - rlwimi r4,r6,0,MSR_FP_BIT,MSR_FP_BIT ; Propagate guest FP - rlwimi r4,r6,0,MSR_VEC_BIT,MSR_VEC_BIT ; Propagate guest Vector - beq ufpXnokey ; Branch if not key switch - mr r2,r7 ; Save r7 - rlwimi r7,r5,32+vmmKeyb-userProtKeybit,userProtKeybit,userProtKeybit ; Set the protection key - cmpw cr0,r7,r2 ; Is userProtKeybit changed? - beq ufpXnokey ; No, go to ResumeGuest_nokey - mr r5,r3 ; Get the per_proc area - stw r7,spcFlags(r3) ; Update the special flags - - ld r3,next_savearea(r5) ; Get the exception save area - std r8,saver8(r3) ; Save r8 - std r9,saver9(r3) ; Save r9 - std r10,saver10(r3) ; Save r10 - std r11,saver11(r3) ; Save r11 - std r12,saver12(r3) ; Save r12 - std r13,saver13(r3) ; Save r12 - std r14,saver14(r3) ; Save r14 - std r15,saver15(r3) ; Save r15 - std r16,saver16(r3) ; Save r16 - std r17,saver17(r3) ; Save r17 - std r18,saver18(r3) ; Save r18 - std r19,saver19(r3) ; Save r19 - std r20,saver20(r3) ; Save r20 - std r21,saver21(r3) ; Save r21 - std r22,saver22(r3) ; Save r22 - std r23,saver23(r3) ; Save r23 - std r24,saver24(r3) ; Save r24 - std r25,saver25(r3) ; Save r25 - std r26,saver26(r3) ; Save r26 - std r27,saver27(r3) ; Save r27 - std r28,saver28(r3) ; Save r28 - std r29,saver29(r3) ; Save r29 - mfxer r2 ; Get xer - std r30,saver30(r3) ; Save r30 - std r31,saver31(r3) ; Save r31 - std r2,savexer(r3) ; Save xer - - mflr r20 ; Get 
lr - li r2,1 ; Set to 1 - stw r7,spcFlags(r5) ; Update the special flags - mr r13,r3 ; Set current savearea - mr r21,r4 ; Save r4 - sth r2,ppInvSeg(r5) ; Force a reload of the SRs - mr r29,r5 ; Get the per_proc area - mr r3,r4 ; Set MSR value we going to - bl EXT(switchSegs) ; Go handle the segment registers/STB - mr r3,r13 ; Set current savearea - mr r4,r21 ; Restore r4 - mtlr r20 ; Set lr - - ld r2,savexer(r3) ; Load xer - ld r8,saver8(r3) ; Load r8 - ld r9,saver9(r3) ; Load r9 - ld r10,saver10(r3) ; Load r10 - mtxer r2 ; Restore xer - ld r11,saver11(r3) ; Load r11 - ld r12,saver12(r3) ; Load r12 - ld r13,saver13(r3) ; Load r12 - ld r14,saver14(r3) ; Load r14 - ld r15,saver15(r3) ; Load r15 - ld r16,saver16(r3) ; Load r16 - ld r17,saver17(r3) ; Load r17 - ld r18,saver18(r3) ; Load r18 - ld r19,saver19(r3) ; Load r19 - ld r20,saver20(r3) ; Load r20 - ld r21,saver21(r3) ; Load r21 - ld r22,saver22(r3) ; Load r22 - ld r23,saver23(r3) ; Load r23 - ld r24,saver24(r3) ; Load r24 - ld r25,saver25(r3) ; Load r25 - ld r26,saver26(r3) ; Load r26 - ld r27,saver27(r3) ; Load r27 - ld r28,saver28(r3) ; Load r28 - ld r29,saver29(r3) ; Load r29 - ld r30,saver30(r3) ; Load r30 - ld r31,saver31(r3) ; Load r31 -ufpXnokey: - mtsrr1 r4 ; Set srr1 - ld r0,famguestXr0(r13) ; Load r0 - ld r1,famguestXr1(r13) ; Load r1 - ld r2,famguestXr2(r13) ; Load r2 - ld r3,famguestXr3(r13) ; Load r3 - ld r4,famguestXr4(r13) ; Load r4 - ld r5,famguestXr5(r13) ; Load r5 - ld r6,famguestXr6(r13) ; Load r6 - ld r7,famguestXr7(r13) ; Load r7 -ufpXVMret: - mfsprg r13,2 ; Restore R13 - mtcrf 0xFF,r11 ; Restore CR - mfsprg r11,3 ; Restore R11 - rfid - diff --git a/osfmk/profiling/Makefile b/osfmk/profiling/Makefile index e037d5041..3b2b64363 100644 --- a/osfmk/profiling/Makefile +++ b/osfmk/profiling/Makefile @@ -10,9 +10,6 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ machine -INSTINC_SUBDIRS_PPC = \ - ppc - INSTINC_SUBDIRS_I386 = \ i386 @@ -25,9 +22,6 @@ INSTINC_SUBDIRS_ARM = \ EXPINC_SUBDIRS = \ machine 
-EXPINC_SUBDIRS_PPC = \ - ppc - EXPINC_SUBDIRS_I386 = \ i386 diff --git a/osfmk/profiling/machine/profile-md.h b/osfmk/profiling/machine/profile-md.h index 66f783531..028bde46d 100644 --- a/osfmk/profiling/machine/profile-md.h +++ b/osfmk/profiling/machine/profile-md.h @@ -28,9 +28,7 @@ #ifndef _MACH_MACHINE_PROFILE_MD_H #define _MACH_MACHINE_PROFILE_MD_H_ -#if defined (__ppc__) -#include "profiling/ppc/profile-md.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "profiling/i386/profile-md.h" #else #error architecture not supported diff --git a/osfmk/profiling/ppc/profile-md.h b/osfmk/profiling/ppc/profile-md.h deleted file mode 100644 index d8d83698e..000000000 --- a/osfmk/profiling/ppc/profile-md.h +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - * - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:49 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.1.1.1 1998/03/07 02:26:08 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.1.8.1 1996/12/09 16:57:22 stephen - * nmklinux_1.0b3_shared into pmk1.1 - * [1996/12/09 11:13:16 stephen] - * - * Revision 1.1.6.1 1996/04/11 11:20:35 emcmanus - * Copied from mainline.ppc. - * [1996/04/11 08:26:36 emcmanus] - * - * hppa merge - * [1995/03/15 09:47:27 bruel] - * - * Revision 1.1.4.1 1995/11/23 17:37:28 stephen - * first powerpc checkin to mainline.ppc - * [1995/11/23 16:46:29 stephen] - * - * Revision 1.1.2.1 1995/08/25 06:50:17 stephen - * Initial checkin of files for PowerPC port - * [1995/08/23 15:05:31 stephen] - * - * Revision 1.1.2.1 1995/02/14 14:25:16 bruel - * First Revision. - * [95/01/27 bruel] - * - * $EndLog$ - */ - -#ifndef _PROFILE_MD_H -#define _PROFILE_MD_H - -/* - * Define the interfaces between the assembly language profiling support - * that is common between the kernel, mach servers, and user space library. - */ - -/* - * Integer types used. - */ - -typedef long prof_ptrint_t; /* hold either pointer or signed int */ -typedef unsigned long prof_uptrint_t; /* hold either pointer or unsigned int */ -typedef long prof_lock_t; /* lock word type */ -typedef unsigned char prof_flag_t; /* type for boolean flags */ - -/* - * Double precision counter. - */ - -typedef struct prof_cnt_t { - prof_uptrint_t low; /* low 32 bits of counter */ - prof_uptrint_t high; /* high 32 bits of counter */ -} prof_cnt_t; - -#define PROF_CNT_INC(cnt) ((++((cnt).low) == 0) ? ++((cnt).high) : 0) -#define PROF_CNT_ADD(cnt,val) (((((cnt).low + (val)) < (val)) ? 
((cnt).high++) : 0), ((cnt).low += (val))) -#define PROF_CNT_LADD(cnt,val) (PROF_CNT_ADD(cnt,(val).low), (cnt).high += (val).high) -#define PROF_CNT_SUB(cnt,val) (((((cnt).low - (val)) > (cnt).low) ? ((cnt).high--) : 0), ((cnt).low -= (val))) -#define PROF_CNT_LSUB(cnt,val) (PROF_CNT_SUB(cnt,(val).low), (cnt).high -= (val).high) - -#define LPROF_ULONG_TO_CNT(cnt,val) PROF_ULONG_TO_CNT(cnt,val) -#define LPROF_CNT_INC(lp) PROF_CNT_INC(lp) -#define LPROF_CNT_ADD(lp,val) PROF_CNT_ADD(lp,val) -#define LPROF_CNT_LADD(lp,val) PROF_CNT_LADD(lp,val) -#define LPROF_CNT_SUB(lp,val) PROF_CNT_SUB(lp,val) -#define LPROF_CNT_LSUB(lp,val) PROF_CNT_LSUB(lp,val) -#define LPROF_CNT_OVERFLOW(lp,high,low) PROF_CNT_OVERFLOW(lp,high,low) -#define LPROF_CNT_TO_ULONG(lp) PROF_CNT_TO_ULONG(lp) -#define LPROF_CNT_TO_LDOUBLE(lp) PROF_CNT_TO_LDOUBLE(lp) -#define LPROF_CNT_TO_DECIMAL(buf,cnt) PROF_CNT_TO_DECIMAL(buf,cnt) -#define LPROF_CNT_EQ_0(cnt) PROF_CNT_EQ_0(cnt) -#define LPROF_CNT_NE_0(cnt) PROF_CNT_NE_0(cnt) -#define LPROF_CNT_EQ(cnt1,cnt2) PROF_CNT_EQ(cnt1,cnt2) -#define LPROF_CNT_NE(cnt1,cnt2) PROF_CNT_NE(cnt1,cnt2) -#define LPROF_CNT_GT(cnt1,cnt2) PROF_CNT_GT(cnt1,cnt2) -#define LPROF_CNT_LT(cnt1,cnt2) PROF_CNT_LT(cnt1,cnt2) -#define LPROF_CNT_DIGITS PROF_CNT_DIGITS - - -/* - * Types of the profil counter. 
- */ - -typedef unsigned short HISTCOUNTER; /* profil */ -typedef prof_cnt_t LHISTCOUNTER; /* lprofil */ - -struct profile_stats { /* Debugging counters */ - prof_uptrint_t major_version; /* major version number */ - prof_uptrint_t minor_version; /* minor version number */ -}; - -struct profile_md { - int major_version; /* major version number */ - int minor_version; /* minor version number */ -}; - -#define PROFILE_MAJOR_VERSION 1 -#define PROFILE_MINOR_VERSION 1 - -#endif /* _PROFILE_MD_H */ - - - - - - diff --git a/osfmk/vm/bsd_vm.c b/osfmk/vm/bsd_vm.c index 9b583cd0c..cd8dc8317 100644 --- a/osfmk/vm/bsd_vm.c +++ b/osfmk/vm/bsd_vm.c @@ -92,26 +92,6 @@ mach_get_vm_end(vm_map_t map) return( vm_map_last_entry(map)->vme_end); } -/* - * Legacy routines to get the start and end for a vm_map_t. They - * return them in the vm_offset_t format. So, they should only be - * called on maps that are the same size as the kernel map for - * accurate results. - */ -vm_offset_t -get_vm_start( - vm_map_t map) -{ - return(CAST_DOWN(vm_offset_t, vm_map_first_entry(map)->vme_start)); -} - -vm_offset_t -get_vm_end( - vm_map_t map) -{ - return(CAST_DOWN(vm_offset_t, vm_map_last_entry(map)->vme_end)); -} - /* * BSD VNODE PAGER */ @@ -128,6 +108,7 @@ const struct memory_object_pager_ops vnode_pager_ops = { vnode_pager_synchronize, vnode_pager_map, vnode_pager_last_unmap, + NULL, /* data_reclaim */ "vnode pager" }; @@ -602,8 +583,10 @@ vnode_pager_bootstrap(void) size = (vm_size_t) sizeof(struct vnode_pager); vnode_pager_zone = zinit(size, (vm_size_t) MAX_VNODE*size, PAGE_SIZE, "vnode pager structures"); + zone_change(vnode_pager_zone, Z_CALLERACCT, FALSE); zone_change(vnode_pager_zone, Z_NOENCRYPT, TRUE); + #if CONFIG_CODE_DECRYPTION apple_protect_pager_bootstrap(); #endif /* CONFIG_CODE_DECRYPTION */ @@ -749,6 +732,22 @@ vnode_pager_check_hard_throttle( return KERN_SUCCESS; } +kern_return_t +vnode_pager_get_isSSD( + memory_object_t mem_obj, + boolean_t *isSSD) +{ + vnode_pager_t 
vnode_object; + + if (mem_obj->mo_pager_ops != &vnode_pager_ops) + return KERN_INVALID_ARGUMENT; + + vnode_object = vnode_pager_lookup(mem_obj); + + *isSSD = vnode_pager_isSSD(vnode_object->vnode_handle); + return KERN_SUCCESS; +} + kern_return_t vnode_pager_get_object_size( memory_object_t mem_obj, @@ -821,6 +820,25 @@ vnode_pager_get_object_cs_blobs( blobs); } +#if CHECK_CS_VALIDATION_BITMAP +kern_return_t +vnode_pager_cs_check_validation_bitmap( + memory_object_t mem_obj, + memory_object_offset_t offset, + int optype ) +{ + vnode_pager_t vnode_object; + + if (mem_obj == MEMORY_OBJECT_NULL || + mem_obj->mo_pager_ops != &vnode_pager_ops) { + return KERN_INVALID_ARGUMENT; + } + + vnode_object = vnode_pager_lookup(mem_obj); + return ubc_cs_check_validation_bitmap( vnode_object->vnode_handle, offset, optype ); +} +#endif /* CHECK_CS_VALIDATION_BITMAP */ + /* * */ diff --git a/osfmk/vm/default_freezer.c b/osfmk/vm/default_freezer.c new file mode 100644 index 000000000..dd8197d7d --- /dev/null +++ b/osfmk/vm/default_freezer.c @@ -0,0 +1,616 @@ +/* + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#if CONFIG_FREEZE + +#include "default_freezer.h" + +/* + * Indicates that a page has been faulted back in. + */ +#define FREEZER_OFFSET_ABSENT ((vm_object_offset_t)(-1)) + +/* + * Create the mapping table that will + * tell us the object/offset pair that + * corresponds to the page being sent + * out or being brought back in. + */ + +void* +default_freezer_mapping_create(vm_object_t object, vm_offset_t offset) +{ + default_freezer_mapping_table_t table; + + table = kalloc(sizeof(struct default_freezer_mapping_table)); + if (table) { + memset(table, 0, sizeof(*table)); + } else { + panic("Could not allocate mapping table\n"); + } + + table->object = object; + table->offset = offset; + + return (void*)table; +} + +void +default_freezer_mapping_free(void **table, boolean_t all) +{ + default_freezer_mapping_table_t freezer_table = *((default_freezer_mapping_table_t *)table); + assert(freezer_table); + + if (all) { + do { + default_freezer_mapping_table_t next = freezer_table->next; + kfree(freezer_table, sizeof(*freezer_table)); + freezer_table = next; + } while (freezer_table); + } else { + kfree(freezer_table, sizeof(*freezer_table)); + } +} + +kern_return_t +default_freezer_mapping_store( + default_freezer_mapping_table_t *table, + memory_object_offset_t table_offset, + memory_object_t memory_object, + memory_object_offset_t offset) +{ + default_freezer_mapping_table_entry_t entry; + uint32_t index; + + assert(*table); + + if ((*table)->index 
>= MAX_FREEZE_TABLE_ENTRIES) { + vm_object_t compact_object = (*table)->object; + default_freezer_mapping_table_t next; + + next = default_freezer_mapping_create(compact_object, table_offset); + if (!next) { + return KERN_FAILURE; + } + + (*table)->next = next; + *table = next; + } + + index = (*table)->index++; + entry = &(*table)->entry[index]; + + entry->memory_object = memory_object; + entry->offset = offset; + + return KERN_SUCCESS; +} + +kern_return_t +default_freezer_mapping_update( + default_freezer_mapping_table_t table, + memory_object_t memory_object, + memory_object_offset_t offset, + memory_object_offset_t *table_offset, /*OUT: contains the offset into the compact object*/ + boolean_t remove_entry) +{ + + kern_return_t kr = KERN_SUCCESS; + vm_object_offset_t compact_offset; + default_freezer_mapping_table_entry_t entry; + uint32_t index = 0; + + if (table == NULL){ + return KERN_FAILURE; + } + + compact_offset = table->offset; + + while (1) { + if (index >= table->index) { + if (table->next) { + table = table->next; + index = 0; + } else { + /* End of tables and we didn't find our candidate entry */ + kr = KERN_FAILURE; + break; + } + } + + entry = &table->entry[index]; + + if (memory_object == entry->memory_object && offset == entry->offset) { + if (remove_entry == TRUE) { + /* + * Mark the page absent whilst retaining the object + * for cleanup during thaw. + */ + entry->offset = FREEZER_OFFSET_ABSENT; + } + if (table_offset != NULL) { + *table_offset = compact_offset; + } + break; + } + + index++; + compact_offset += PAGE_SIZE; + } + return kr; +} + +/* + * Create a freezer memory object for this + * vm object. 
+ */ +void +default_freezer_memory_object_create( + vm_object_t object, + vm_object_t compact_object, + default_freezer_mapping_table_t table) +{ + + default_freezer_memory_object_t fo = NULL; + + fo = kalloc(sizeof(struct default_freezer_memory_object)); + + if (fo) { + memory_object_control_t control = NULL; + + memset(fo, 0, sizeof(*fo)); + + control = memory_object_control_allocate(object); + assert (control != MEMORY_OBJECT_CONTROL_NULL); + + df_memory_object_init((memory_object_t)fo, control, 0); + fo->fo_compact_object = compact_object; + fo->fo_table = table; + + object->pager = (memory_object_t)fo; + object->pager_created = TRUE; + object->pager_initialized = TRUE; + object->pager_ready = TRUE; + object->pager_trusted = TRUE; + object->pager_control = control; + } else { + panic(" Could not allocate freezer object\n"); + } +} + +void +default_freezer_pack_page( + vm_page_t p, + vm_object_t compact_object, + vm_object_offset_t offset, + void **table) +{ + + default_freezer_mapping_table_t *freeze_table = (default_freezer_mapping_table_t *)table; + memory_object_t memory_object = p->object->pager; + + if (memory_object == NULL) { + default_freezer_memory_object_create(p->object, compact_object, *freeze_table); + memory_object = p->object->pager; + } else { + default_freezer_memory_object_t fo = (default_freezer_memory_object_t)memory_object; + if (fo->fo_compact_object == VM_OBJECT_NULL) { + fo->fo_compact_object = compact_object; + fo->fo_table = *freeze_table; + } + } + + default_freezer_mapping_store(freeze_table, offset, memory_object, p->offset + p->object->paging_offset); + + /* Remove from the original and insert into the compact destination object */ + vm_page_rename(p, compact_object, offset, FALSE); +} + +void +default_freezer_unpack( + vm_object_t object, + void **table) +{ + + vm_page_t p = VM_PAGE_NULL; + uint32_t index = 0; + vm_object_t src_object = VM_OBJECT_NULL; + memory_object_t src_mem_object = MEMORY_OBJECT_NULL; + memory_object_offset_t 
src_offset = 0; + vm_object_offset_t compact_offset = 0; + default_freezer_memory_object_t fo = NULL; + default_freezer_memory_object_t last_memory_object_thawed = NULL; + default_freezer_mapping_table_t freeze_table = *(default_freezer_mapping_table_t *)table; + + assert(freeze_table); + + vm_object_lock(object); + + for (index = 0, compact_offset = 0; ; index++, compact_offset += PAGE_SIZE){ + if (index >= freeze_table->index) { + default_freezer_mapping_table_t table_next; + + table_next = freeze_table->next; + + /* Free the tables as we go along */ + default_freezer_mapping_free((void**)&freeze_table, FALSE); + + if (table_next == NULL){ + break; + } + + freeze_table = table_next; + index = 0; + } + + /* + * Skip slots that represent deallocated memory objects. + */ + src_mem_object = freeze_table->entry[index].memory_object; + if (src_mem_object == MEMORY_OBJECT_NULL) + continue; + + /* + * Skip slots that represent faulted pages. + */ + src_offset = freeze_table->entry[index].offset; + if (src_offset != FREEZER_OFFSET_ABSENT) { + + p = vm_page_lookup(object, compact_offset); + assert(p); + + fo = (default_freezer_memory_object_t)src_mem_object; + + src_object = memory_object_control_to_vm_object(fo->fo_pager_control); + + /* Move back over from the freeze object to the original */ + vm_object_lock(src_object); + vm_page_rename(p, src_object, src_offset - src_object->paging_offset, FALSE); + vm_object_unlock(src_object); + } + + if (src_mem_object != ((memory_object_t)last_memory_object_thawed)){ + if (last_memory_object_thawed != NULL){ + last_memory_object_thawed->fo_compact_object = VM_OBJECT_NULL; + last_memory_object_thawed->fo_table = NULL; + } + last_memory_object_thawed = (default_freezer_memory_object_t)src_mem_object; + } + } + + if (last_memory_object_thawed != NULL){ + last_memory_object_thawed->fo_compact_object = VM_OBJECT_NULL; + last_memory_object_thawed->fo_table = NULL; + } + + vm_object_unlock(object); +} + +vm_object_t 
+default_freezer_get_compact_vm_object(void** table) +{ + default_freezer_mapping_table_t freeze_table = *((default_freezer_mapping_table_t *)table); + assert(freeze_table); + return ((vm_object_t)(freeze_table->object)); +} + +void +df_memory_object_reference(__unused memory_object_t mem_obj) +{ + + /* No-op */ +} + +void +df_memory_object_deallocate(memory_object_t mem_obj) +{ + + default_freezer_memory_object_t fo = (default_freezer_memory_object_t)mem_obj; + vm_object_t compact_object = fo->fo_compact_object; + + assert(fo); + + if (compact_object != VM_OBJECT_NULL) { + + default_freezer_mapping_table_t fo_table = fo->fo_table; + default_freezer_mapping_table_entry_t entry; + boolean_t found = FALSE; + uint32_t index = 0; + + vm_object_lock(compact_object); + + /* Remove from table */ + while (1) { + if (index >= fo_table->index) { + if (fo_table->next) { + fo_table = fo_table->next; + index = 0; + } else { + /* End of tables */ + break; + } + } + + entry = &fo_table->entry[index]; + if (mem_obj == entry->memory_object) { + /* It matches, so clear the entry */ + if (!found) { + found = TRUE; + } + entry->memory_object = MEMORY_OBJECT_NULL; + entry->offset = 0; + } else if (MEMORY_OBJECT_NULL != entry->memory_object) { + /* We have a different valid object; we're done */ + if (found) { + break; + } + } + + index++; + } + + vm_object_unlock(compact_object); + } + + kfree(fo, sizeof(*fo)); +} + +kern_return_t +df_memory_object_init( + memory_object_t mem_obj, + memory_object_control_t control, + __unused memory_object_cluster_size_t pager_page_size) +{ + + default_freezer_memory_object_t fo = (default_freezer_memory_object_t)mem_obj; + assert(fo); + + fo->fo_pager_ops = &default_freezer_ops; + fo->fo_pager_header.io_bits = IKOT_MEMORY_OBJECT; + fo->fo_pager_control = control; + + return KERN_SUCCESS; +} + +kern_return_t +df_memory_object_terminate(memory_object_t mem_obj) +{ + + default_freezer_memory_object_t fo = (default_freezer_memory_object_t)mem_obj; + 
assert(fo); + memory_object_control_deallocate(fo->fo_pager_control); + return KERN_SUCCESS; +} + +kern_return_t +df_memory_object_data_request( + memory_object_t mem_obj, + memory_object_offset_t offset, + memory_object_cluster_size_t length, + vm_prot_t protection_required, + memory_object_fault_info_t fault_info) +{ + + vm_object_t src_object = VM_OBJECT_NULL, compact_object = VM_OBJECT_NULL; + memory_object_offset_t compact_offset = 0; + memory_object_t pager = NULL; + kern_return_t kr = KERN_SUCCESS; + + default_freezer_memory_object_t fo = (default_freezer_memory_object_t)mem_obj; + + src_object = memory_object_control_to_vm_object(fo->fo_pager_control); + compact_object = fo->fo_compact_object; + + if (compact_object != VM_OBJECT_NULL) { + + vm_object_lock(compact_object); + + kr = default_freezer_mapping_update(fo->fo_table, + mem_obj, + offset, + &compact_offset, + FALSE); + + vm_object_unlock(compact_object); + } else { + kr = KERN_FAILURE; + } + + if (length == 0){ + /*Caller is just querying to see if we have the page*/ + return kr; + } + + if (kr != KERN_SUCCESS){ + + unsigned int request_flags; + upl_t upl; + unsigned int page_list_count = 0; + + request_flags = UPL_NO_SYNC | UPL_RET_ONLY_ABSENT | UPL_SET_LITE; + /* + * Should we decide to activate USE_PRECIOUS (from default_pager_internal.h) + * here, then the request_flags will need to add these to the ones above: + * + * request_flags |= UPL_PRECIOUS | UPL_CLEAN_IN_PLACE + */ + request_flags |= UPL_REQUEST_SET_DIRTY; + + memory_object_super_upl_request(fo->fo_pager_control, + (memory_object_offset_t)offset, + PAGE_SIZE, PAGE_SIZE, + &upl, NULL, &page_list_count, + request_flags); + + upl_abort(upl, UPL_ABORT_UNAVAILABLE); + upl_deallocate(upl); + + return KERN_SUCCESS; + } + + vm_object_lock(compact_object); + + pager = (memory_object_t)compact_object->pager; + + if (!compact_object->pager_ready || pager == MEMORY_OBJECT_NULL){ + vm_object_unlock(compact_object); + return KERN_FAILURE; + } + + 
vm_object_paging_wait(compact_object, THREAD_UNINT); + vm_object_paging_begin(compact_object); + + compact_object->blocked_access = TRUE; + vm_object_unlock(compact_object); + + ((vm_object_fault_info_t) fault_info)->io_sync = TRUE; + + kr = dp_memory_object_data_request(pager, + compact_offset, + length, + protection_required, + fault_info); + if (kr == KERN_SUCCESS){ + + vm_page_t src_page = VM_PAGE_NULL, dst_page = VM_PAGE_NULL; + + vm_object_lock(compact_object); + + compact_object->blocked_access = FALSE; + vm_object_paging_end(compact_object); + + vm_object_lock(src_object); + + if ((src_page = vm_page_lookup(compact_object, compact_offset)) != VM_PAGE_NULL){ + + dst_page = vm_page_lookup(src_object, offset - src_object->paging_offset); + + VM_PAGE_FREE(dst_page); + vm_page_rename(src_page, src_object, offset - src_object->paging_offset, FALSE); + + if (default_freezer_mapping_update(fo->fo_table, + mem_obj, + offset, + NULL, + TRUE) != KERN_SUCCESS) { + printf("Page for object: 0x%lx at offset: 0x%lx not found in table\n", (uintptr_t)src_object, (uintptr_t)offset); + } + + PAGE_WAKEUP_DONE(src_page); + } else { + printf("%d: default_freezer: compact_object doesn't have the page for object 0x%lx at offset 0x%lx \n", kr, (uintptr_t)compact_object, (uintptr_t)compact_offset); + kr = KERN_FAILURE; + } + vm_object_unlock(src_object); + vm_object_unlock(compact_object); + } else { + panic("%d: default_freezer TOC pointed us to default_pager incorrectly\n", kr); + } + return kr; +} + +kern_return_t +df_memory_object_data_return( + __unused memory_object_t mem_obj, + __unused memory_object_offset_t offset, + __unused memory_object_cluster_size_t size, + __unused memory_object_offset_t *resid_offset, + __unused int *io_error, + __unused boolean_t dirty, + __unused boolean_t kernel_copy, + __unused int upl_flags) +{ + + panic(" default_freezer: df_memory_object_data_return should not be called\n"); + return KERN_SUCCESS; +} + +kern_return_t 
+df_memory_object_data_initialize( + __unused memory_object_t mem_obj, + __unused memory_object_offset_t offset, + __unused memory_object_cluster_size_t size) +{ + + panic(" default_freezer: df_memory_object_data_initialize should not be called\n"); + return KERN_SUCCESS; +} + +kern_return_t +df_memory_object_data_unlock( + __unused memory_object_t mem_obj, + __unused memory_object_offset_t offset, + __unused memory_object_size_t length, + __unused vm_prot_t prot) +{ + + panic(" default_freezer: df_memory_object_data_unlock should not be called\n"); + return KERN_FAILURE; +} + +kern_return_t +df_memory_object_synchronize( + __unused memory_object_t mem_obj, + __unused memory_object_offset_t offset, + __unused memory_object_size_t length, + __unused vm_sync_t flags) +{ + + panic(" default_freezer: df_memory_object_synchronize should not be called\n"); + return KERN_FAILURE; +} + +kern_return_t +df_memory_object_map( + __unused memory_object_t mem_obj, + __unused vm_prot_t prot) +{ + + panic(" default_freezer: df_memory_object_map should not be called\n"); + return KERN_FAILURE; +} + +kern_return_t +df_memory_object_last_unmap(__unused memory_object_t mem_obj) +{ + + panic(" default_freezer: df_memory_object_last_unmap should not be called\n"); + return KERN_FAILURE; +} + + +kern_return_t +df_memory_object_data_reclaim( + __unused memory_object_t mem_obj, + __unused boolean_t reclaim_backing_store) +{ + + panic("df_memory_object_data_reclaim\n"); + return KERN_SUCCESS; +} +#endif /* CONFIG_FREEZE */ diff --git a/osfmk/vm/default_freezer.h b/osfmk/vm/default_freezer.h new file mode 100644 index 000000000..46730fd71 --- /dev/null +++ b/osfmk/vm/default_freezer.h @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#if CONFIG_FREEZE + +#ifndef _DEFAULT_FREEZER_H_ +#define _DEFAULT_FREEZER_H_ + +#ifdef MACH_KERNEL + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* + * Begin declaration for default_freezer_ops. 
+*/ +extern void df_memory_object_reference(memory_object_t); +extern void df_memory_object_deallocate(memory_object_t); +extern kern_return_t df_memory_object_init(memory_object_t, + memory_object_control_t, + memory_object_cluster_size_t); +extern kern_return_t df_memory_object_terminate(memory_object_t); +extern kern_return_t df_memory_object_data_request(memory_object_t, + memory_object_offset_t, + memory_object_cluster_size_t, + vm_prot_t, + memory_object_fault_info_t); +extern kern_return_t df_memory_object_data_return(memory_object_t, + memory_object_offset_t, + memory_object_cluster_size_t, + memory_object_offset_t *, + int *, + boolean_t, + boolean_t, + int); +extern kern_return_t df_memory_object_data_initialize(memory_object_t, + memory_object_offset_t, + memory_object_cluster_size_t); +extern kern_return_t df_memory_object_data_unlock(memory_object_t, + memory_object_offset_t, + memory_object_size_t, + vm_prot_t); +extern kern_return_t df_memory_object_synchronize(memory_object_t, + memory_object_offset_t, + memory_object_size_t, + vm_sync_t); +extern kern_return_t df_memory_object_map(memory_object_t, + vm_prot_t); +extern kern_return_t df_memory_object_last_unmap(memory_object_t); + +extern kern_return_t df_memory_object_data_reclaim( memory_object_t, + boolean_t); +/* + * End declaration for default_freezer_ops. 
+*/ + +const struct memory_object_pager_ops default_freezer_ops = { + df_memory_object_reference, + df_memory_object_deallocate, + df_memory_object_init, + df_memory_object_terminate, + df_memory_object_data_request, + df_memory_object_data_return, + df_memory_object_data_initialize, + df_memory_object_data_unlock, + df_memory_object_synchronize, + df_memory_object_map, + df_memory_object_last_unmap, + df_memory_object_data_reclaim, + "default freezer" +}; + +#define MAX_FREEZE_TABLE_ENTRIES 128 + +struct default_freezer_mapping_table_entry { + memory_object_t memory_object; /* memory object will lead us to the most current VM object */ + memory_object_offset_t offset; +}; +typedef struct default_freezer_mapping_table *default_freezer_mapping_table_t; + +struct default_freezer_mapping_table { + struct default_freezer_mapping_table *next; + vm_object_t object; /* packed object */ + vm_object_offset_t offset; + unsigned int index; + struct default_freezer_mapping_table_entry entry[MAX_FREEZE_TABLE_ENTRIES]; +}; +typedef struct default_freezer_mapping_table_entry *default_freezer_mapping_table_entry_t; + +struct default_freezer_memory_object{ + struct ipc_object_header fo_pager_header; /* fake ip_kotype() */ + memory_object_pager_ops_t fo_pager_ops; /* == &default_freezer_ops */ + memory_object_control_t fo_pager_control; + vm_object_t fo_compact_object; + default_freezer_mapping_table_t fo_table; +}; +typedef struct default_freezer_memory_object *default_freezer_memory_object_t; + + +__private_extern__ void* default_freezer_mapping_create(vm_object_t, vm_offset_t); + +__private_extern__ void default_freezer_mapping_free(void**, boolean_t all); + +__private_extern__ kern_return_t default_freezer_mapping_store( default_freezer_mapping_table_t *, + memory_object_offset_t, + memory_object_t, + memory_object_offset_t ); + +__private_extern__ kern_return_t default_freezer_mapping_update( default_freezer_mapping_table_t, + memory_object_t, + memory_object_offset_t, + 
memory_object_offset_t *, + boolean_t ); + +__private_extern__ void default_freezer_memory_object_create(vm_object_t, vm_object_t, default_freezer_mapping_table_t); + +__private_extern__ void default_freezer_pack_page(vm_page_t, vm_object_t, vm_object_offset_t, void**); + +__private_extern__ void default_freezer_unpack(vm_object_t, void**); + +__private_extern__ vm_object_t default_freezer_get_compact_vm_object(void**); + +#endif /* MACH_KERNEL */ +#endif /* _DEFAULT_FREEZER_H_ */ +#endif /* CONFIG_FREEZE */ diff --git a/osfmk/vm/device_vm.c b/osfmk/vm/device_vm.c index 575351078..f3df12b70 100644 --- a/osfmk/vm/device_vm.c +++ b/osfmk/vm/device_vm.c @@ -75,6 +75,7 @@ const struct memory_object_pager_ops device_pager_ops = { device_pager_synchronize, device_pager_map, device_pager_last_unmap, + NULL, /* data_reclaim */ "device pager" }; @@ -126,7 +127,7 @@ device_pager_bootstrap(void) size = (vm_size_t) sizeof(struct device_pager); device_pager_zone = zinit(size, (vm_size_t) MAX_DNODE*size, PAGE_SIZE, "device node pager structures"); - + zone_change(device_pager_zone, Z_CALLERACCT, FALSE); return; } diff --git a/osfmk/vm/memory_object.c b/osfmk/vm/memory_object.c index 0fece7fc0..de7baff29 100644 --- a/osfmk/vm/memory_object.c +++ b/osfmk/vm/memory_object.c @@ -101,6 +101,7 @@ #include /* For kernel_map, vm_move */ #include /* For vm_map_pageable */ #include /* Needed by some vm_page.h macros */ +#include #if MACH_PAGEMAP #include @@ -139,10 +140,10 @@ decl_lck_mtx_data(, memory_manager_default_lock) typedef int memory_object_lock_result_t; -#define MEMORY_OBJECT_LOCK_RESULT_DONE 0 -#define MEMORY_OBJECT_LOCK_RESULT_MUST_BLOCK 1 -#define MEMORY_OBJECT_LOCK_RESULT_MUST_CLEAN 2 -#define MEMORY_OBJECT_LOCK_RESULT_MUST_RETURN 3 +#define MEMORY_OBJECT_LOCK_RESULT_DONE 0 +#define MEMORY_OBJECT_LOCK_RESULT_MUST_BLOCK 1 +#define MEMORY_OBJECT_LOCK_RESULT_MUST_RETURN 2 +#define MEMORY_OBJECT_LOCK_RESULT_MUST_FREE 3 memory_object_lock_result_t memory_object_lock_page( 
vm_page_t m, @@ -174,185 +175,149 @@ memory_object_lock_page( "m_o_lock_page, page 0x%X rtn %d flush %d prot %d\n", m, should_return, should_flush, prot, 0); - /* - * If we cannot change access to the page, - * either because a mapping is in progress - * (busy page) or because a mapping has been - * wired, then give up. - */ if (m->busy || m->cleaning) { - if (m->list_req_pending && (m->pageout || m->cleaning) && + if (m->list_req_pending && should_return == MEMORY_OBJECT_RETURN_NONE && should_flush == TRUE) { - /* - * if pageout is set, page was earmarked by vm_pageout_scan - * to be cleaned and stolen... if cleaning is set, we're - * pre-cleaning pages for a hibernate... - * in either case, we're going - * to take it back since we are being asked to - * flush the page w/o cleaning it (i.e. we don't - * care that it's dirty, we want it gone from - * the cache) and we don't want to stall - * waiting for it to be cleaned for 2 reasons... - * 1 - no use paging it out since we're probably - * shrinking the file at this point or we no - * longer care about the data in the page - * 2 - if we stall, we may casue a deadlock in - * the FS trying to acquire its locks - * on the VNOP_PAGEOUT path presuming that - * those locks are already held on the truncate - * path before calling through to this function - * - * so undo all of the state that vm_pageout_scan - * hung on this page - */ - m->busy = FALSE; - vm_pageout_queue_steal(m, FALSE); + if (m->absent) { + /* + * this is the list_req_pending | absent | busy case + * which originates from vm_fault_page. + * Combine that with should_flush == TRUE and we + * have a case where we need to toss the page from + * the object. + */ + if (!VM_PAGE_WIRED(m)) { + return (MEMORY_OBJECT_LOCK_RESULT_MUST_FREE); + } else { + return (MEMORY_OBJECT_LOCK_RESULT_DONE); + } + } + if (m->pageout || m->cleaning) { + /* + * if pageout is set, page was earmarked by vm_pageout_scan + * to be cleaned and stolen... 
if cleaning is set, we're + * pre-cleaning pages for a hibernate... + * in either case, we're going + * to take it back since we are being asked to + * flush the page w/o cleaning it (i.e. we don't + * care that it's dirty, we want it gone from + * the cache) and we don't want to stall + * waiting for it to be cleaned for 2 reasons... + * 1 - no use paging it out since we're probably + * shrinking the file at this point or we no + * longer care about the data in the page + * 2 - if we stall, we may cause a deadlock in + * the FS trying to acquire its locks + * on the VNOP_PAGEOUT path presuming that + * those locks are already held on the truncate + * path before calling through to this function + * + * so undo all of the state that vm_pageout_scan + * hung on this page + */ + + vm_pageout_queue_steal(m, FALSE); + PAGE_WAKEUP_DONE(m); + } else { + panic("list_req_pending on page %p without absent/pageout/cleaning set\n", m); + } } else - return(MEMORY_OBJECT_LOCK_RESULT_MUST_BLOCK); + return (MEMORY_OBJECT_LOCK_RESULT_MUST_BLOCK); } - /* * Don't worry about pages for which the kernel * does not have any data. */ - if (m->absent || m->error || m->restart) { - if(m->error && should_flush) { - /* dump the page, pager wants us to */ - /* clean it up and there is no */ - /* relevant data to return */ - if ( !VM_PAGE_WIRED(m)) { - VM_PAGE_FREE(m); - return(MEMORY_OBJECT_LOCK_RESULT_DONE); - } - } else { - return(MEMORY_OBJECT_LOCK_RESULT_DONE); + if (m->error && should_flush && !VM_PAGE_WIRED(m)) { + /* + * dump the page, pager wants us to + * clean it up and there is no + * relevant data to return + */ + return (MEMORY_OBJECT_LOCK_RESULT_MUST_FREE); } + return (MEMORY_OBJECT_LOCK_RESULT_DONE); } - assert(!m->fictitious); - /* - * If the page is wired, just clean or return the page if needed. - * Wired pages don't get flushed or disconnected from the pmap. 
- */ - if (VM_PAGE_WIRED(m)) { - if (memory_object_should_return_page(m, should_return)) { - if (m->dirty) - return(MEMORY_OBJECT_LOCK_RESULT_MUST_CLEAN); - else - return(MEMORY_OBJECT_LOCK_RESULT_MUST_RETURN); - } - - return(MEMORY_OBJECT_LOCK_RESULT_DONE); - } - - /* - * If the page is to be flushed, allow - * that to be done as part of the protection. - */ - - if (should_flush) - prot = VM_PROT_ALL; - - /* - * Set the page lock. - * - * If we are decreasing permission, do it now; - * let the fault handler take care of increases - * (pmap_page_protect may not increase protection). - */ + /* + * The page is wired... just clean or return the page if needed. + * Wired pages don't get flushed or disconnected from the pmap. + */ + if (memory_object_should_return_page(m, should_return)) + return (MEMORY_OBJECT_LOCK_RESULT_MUST_RETURN); - if (prot != VM_PROT_NO_CHANGE) { - pmap_page_protect(m->phys_page, VM_PROT_ALL & ~prot); + return (MEMORY_OBJECT_LOCK_RESULT_DONE); + } - PAGE_WAKEUP(m); + if (should_flush) { + /* + * must do the pmap_disconnect before determining the + * need to return the page... otherwise it's possible + * for the page to go from the clean to the dirty state + * after we've made our decision + */ + if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED) + m->dirty = TRUE; + } else { + /* + * If we are decreasing permission, do it now; + * let the fault handler take care of increases + * (pmap_page_protect may not increase protection). + */ + if (prot != VM_PROT_NO_CHANGE) + pmap_page_protect(m->phys_page, VM_PROT_ALL & ~prot); } - /* - * Handle page returning. + * Handle returning dirty or precious pages */ if (memory_object_should_return_page(m, should_return)) { - /* - * If we weren't planning - * to flush the page anyway, - * we may need to remove the - * page from the pageout - * system and from physical - * maps now. 
+ * we use to do a pmap_disconnect here in support + * of memory_object_lock_request, but that routine + * no longer requires this... in any event, in + * our world, it would turn into a big noop since + * we don't lock the page in any way and as soon + * as we drop the object lock, the page can be + * faulted back into an address space + * + * if (!should_flush) + * pmap_disconnect(m->phys_page); */ - - vm_page_lockspin_queues(); - VM_PAGE_QUEUES_REMOVE(m); - vm_page_unlock_queues(); - - if (!should_flush) - pmap_disconnect(m->phys_page); - - if (m->dirty) - return(MEMORY_OBJECT_LOCK_RESULT_MUST_CLEAN); - else - return(MEMORY_OBJECT_LOCK_RESULT_MUST_RETURN); + return (MEMORY_OBJECT_LOCK_RESULT_MUST_RETURN); } /* - * Handle flushing + * Handle flushing clean pages */ - if (should_flush) { - VM_PAGE_FREE(m); - } else { - /* - * XXX Make clean but not flush a paging hint, - * and deactivate the pages. This is a hack - * because it overloads flush/clean with - * implementation-dependent meaning. This only - * happens to pages that are already clean. - */ + if (should_flush) + return (MEMORY_OBJECT_LOCK_RESULT_MUST_FREE); - if (vm_page_deactivate_hint && - (should_return != MEMORY_OBJECT_RETURN_NONE)) { - vm_page_lockspin_queues(); - vm_page_deactivate(m); - vm_page_unlock_queues(); - } - } + /* + * we use to deactivate clean pages at this point, + * but we do not believe that an msync should change + * the 'age' of a page in the cache... here is the + * original comment and code concerning this... + * + * XXX Make clean but not flush a paging hint, + * and deactivate the pages. This is a hack + * because it overloads flush/clean with + * implementation-dependent meaning. This only + * happens to pages that are already clean. 
+ * + * if (vm_page_deactivate_hint && (should_return != MEMORY_OBJECT_RETURN_NONE)) + * return (MEMORY_OBJECT_LOCK_RESULT_MUST_DEACTIVATE); + */ - return(MEMORY_OBJECT_LOCK_RESULT_DONE); + return (MEMORY_OBJECT_LOCK_RESULT_DONE); } -#define LIST_REQ_PAGEOUT_PAGES(object, data_cnt, action, po, ro, ioerr, iosync) \ -MACRO_BEGIN \ - \ - register int upl_flags; \ - memory_object_t pager; \ - \ - if ((pager = (object)->pager) != MEMORY_OBJECT_NULL) { \ - vm_object_paging_begin(object); \ - vm_object_unlock(object); \ - \ - if (iosync) \ - upl_flags = UPL_MSYNC | UPL_IOSYNC; \ - else \ - upl_flags = UPL_MSYNC; \ - \ - (void) memory_object_data_return(pager, \ - po, \ - (memory_object_cluster_size_t)data_cnt, \ - ro, \ - ioerr, \ - (action) == MEMORY_OBJECT_LOCK_RESULT_MUST_CLEAN,\ - !should_flush, \ - upl_flags); \ - \ - vm_object_lock(object); \ - vm_object_paging_end(object); \ - } \ -MACRO_END + /* * Routine: memory_object_lock_request [user interface] @@ -556,6 +521,40 @@ vm_object_sync( +#define LIST_REQ_PAGEOUT_PAGES(object, data_cnt, po, ro, ioerr, iosync) \ +MACRO_BEGIN \ + \ + int upl_flags; \ + memory_object_t pager; \ + \ + if (object == slide_info.slide_object) { \ + panic("Objects with slid pages not allowed\n"); \ + } \ + \ + if ((pager = (object)->pager) != MEMORY_OBJECT_NULL) { \ + vm_object_paging_begin(object); \ + vm_object_unlock(object); \ + \ + if (iosync) \ + upl_flags = UPL_MSYNC | UPL_IOSYNC; \ + else \ + upl_flags = UPL_MSYNC; \ + \ + (void) memory_object_data_return(pager, \ + po, \ + (memory_object_cluster_size_t)data_cnt, \ + ro, \ + ioerr, \ + FALSE, \ + FALSE, \ + upl_flags); \ + \ + vm_object_lock(object); \ + vm_object_paging_end(object); \ + } \ +MACRO_END + + static int vm_object_update_extent( @@ -571,13 +570,18 @@ vm_object_update_extent( { vm_page_t m; int retval = 0; - memory_object_cluster_size_t data_cnt = 0; vm_object_offset_t paging_offset = 0; vm_object_offset_t next_offset = offset; memory_object_lock_result_t 
page_lock_result; - memory_object_lock_result_t pageout_action; - - pageout_action = MEMORY_OBJECT_LOCK_RESULT_DONE; + memory_object_cluster_size_t data_cnt = 0; + struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT]; + struct vm_page_delayed_work *dwp; + int dw_count; + int dw_limit; + + dwp = &dw_array[0]; + dw_count = 0; + dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT); for (; offset < offset_end && object->resident_page_count; @@ -589,98 +593,105 @@ vm_object_update_extent( */ if (data_cnt) { if ((data_cnt >= PAGE_SIZE * MAX_UPL_TRANSFER) || (next_offset != offset)) { - LIST_REQ_PAGEOUT_PAGES(object, data_cnt, - pageout_action, paging_offset, offset_resid, io_errno, should_iosync); + + if (dw_count) { + vm_page_do_delayed_work(object, &dw_array[0], dw_count); + dwp = &dw_array[0]; + dw_count = 0; + } + LIST_REQ_PAGEOUT_PAGES(object, data_cnt, + paging_offset, offset_resid, io_errno, should_iosync); data_cnt = 0; } } - while ((m = vm_page_lookup(object, offset)) != VM_PAGE_NULL) { - page_lock_result = memory_object_lock_page(m, should_return, should_flush, prot); - - XPR(XPR_MEMORY_OBJECT, - "m_o_update: lock_page, obj 0x%X offset 0x%X result %d\n", - object, offset, page_lock_result, 0, 0); - - switch (page_lock_result) - { - case MEMORY_OBJECT_LOCK_RESULT_DONE: - /* - * End of a cluster of dirty pages. - */ - if (data_cnt) { - LIST_REQ_PAGEOUT_PAGES(object, - data_cnt, pageout_action, - paging_offset, offset_resid, io_errno, should_iosync); - data_cnt = 0; - continue; - } - break; - - case MEMORY_OBJECT_LOCK_RESULT_MUST_BLOCK: - /* - * Since it is necessary to block, - * clean any dirty pages now. 
- */ - if (data_cnt) { - LIST_REQ_PAGEOUT_PAGES(object, - data_cnt, pageout_action, - paging_offset, offset_resid, io_errno, should_iosync); - data_cnt = 0; - continue; - } - PAGE_SLEEP(object, m, THREAD_UNINT); - continue; - - case MEMORY_OBJECT_LOCK_RESULT_MUST_CLEAN: - case MEMORY_OBJECT_LOCK_RESULT_MUST_RETURN: - /* - * The clean and return cases are similar. - * - * if this would form a discontiguous block, - * clean the old pages and start anew. - */ - if (data_cnt && pageout_action != page_lock_result) { - LIST_REQ_PAGEOUT_PAGES(object, - data_cnt, pageout_action, - paging_offset, offset_resid, io_errno, should_iosync); - data_cnt = 0; - continue; - } - if (m->cleaning) { - PAGE_SLEEP(object, m, THREAD_UNINT); - continue; - } - if (data_cnt == 0) { - pageout_action = page_lock_result; - paging_offset = offset; - } - data_cnt += PAGE_SIZE; - next_offset = offset + PAGE_SIZE_64; - - /* - * Clean - */ - m->list_req_pending = TRUE; - m->cleaning = TRUE; - - if (should_flush && - /* let's not flush a wired page... */ - !VM_PAGE_WIRED(m)) { - /* - * and add additional state - * for the flush - */ - m->busy = TRUE; - m->pageout = TRUE; - - vm_page_lockspin_queues(); - vm_page_wire(m); - vm_page_unlock_queues(); - } - - retval = 1; - break; + + dwp->dw_mask = 0; + + page_lock_result = memory_object_lock_page(m, should_return, should_flush, prot); + + if (data_cnt && page_lock_result != MEMORY_OBJECT_LOCK_RESULT_MUST_RETURN) { + /* + * End of a run of dirty/precious pages. + */ + if (dw_count) { + vm_page_do_delayed_work(object, &dw_array[0], dw_count); + dwp = &dw_array[0]; + dw_count = 0; + } + LIST_REQ_PAGEOUT_PAGES(object, data_cnt, + paging_offset, offset_resid, io_errno, should_iosync); + /* + * LIST_REQ_PAGEOUT_PAGES will drop the object lock which will + * allow the state of page 'm' to change... 
we need to re-lookup + * the current offset + */ + data_cnt = 0; + continue; + } + + switch (page_lock_result) { + + case MEMORY_OBJECT_LOCK_RESULT_DONE: + break; + + case MEMORY_OBJECT_LOCK_RESULT_MUST_FREE: + dwp->dw_mask |= DW_vm_page_free; + break; + + case MEMORY_OBJECT_LOCK_RESULT_MUST_BLOCK: + PAGE_SLEEP(object, m, THREAD_UNINT); + continue; + + case MEMORY_OBJECT_LOCK_RESULT_MUST_RETURN: + if (data_cnt == 0) + paging_offset = offset; + + data_cnt += PAGE_SIZE; + next_offset = offset + PAGE_SIZE_64; + + /* + * Clean + */ + m->list_req_pending = TRUE; + m->cleaning = TRUE; + + /* + * wired pages shouldn't be flushed and + * since they aren't on any queue, + * no need to remove them + */ + if (!VM_PAGE_WIRED(m)) { + + if (should_flush) { + /* + * add additional state for the flush + */ + m->busy = TRUE; + m->pageout = TRUE; + + dwp->dw_mask |= DW_vm_page_wire; + } + /* + * we use to remove the page from the queues at this + * point, but we do not believe that an msync + * should cause the 'age' of a page to be changed + * + * else + * dwp->dw_mask |= DW_VM_PAGE_QUEUES_REMOVE; + */ + } + retval = 1; + break; + } + if (dwp->dw_mask) { + VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count); + + if (dw_count >= dw_limit) { + vm_page_do_delayed_work(object, &dw_array[0], dw_count); + dwp = &dw_array[0]; + dw_count = 0; + } } break; } @@ -689,9 +700,12 @@ vm_object_update_extent( * We have completed the scan for applicable pages. * Clean any pages that have been saved. 
*/ + if (dw_count) + vm_page_do_delayed_work(object, &dw_array[0], dw_count); + if (data_cnt) { - LIST_REQ_PAGEOUT_PAGES(object, - data_cnt, pageout_action, paging_offset, offset_resid, io_errno, should_iosync); + LIST_REQ_PAGEOUT_PAGES(object, data_cnt, + paging_offset, offset_resid, io_errno, should_iosync); } return (retval); } @@ -707,14 +721,14 @@ vm_object_update_extent( */ kern_return_t vm_object_update( - register vm_object_t object, - register vm_object_offset_t offset, - register vm_object_size_t size, - register vm_object_offset_t *resid_offset, - int *io_errno, - memory_object_return_t should_return, - int flags, - vm_prot_t protection) + vm_object_t object, + vm_object_offset_t offset, + vm_object_size_t size, + vm_object_offset_t *resid_offset, + int *io_errno, + memory_object_return_t should_return, + int flags, + vm_prot_t protection) { vm_object_t copy_object = VM_OBJECT_NULL; boolean_t data_returned = FALSE; @@ -801,27 +815,27 @@ vm_object_update( /* * translate offset with respect to shadow's offset */ - copy_offset = (offset >= copy_object->shadow_offset) ? - (vm_map_offset_t)(offset - copy_object->shadow_offset) : + copy_offset = (offset >= copy_object->vo_shadow_offset) ? 
+ (vm_map_offset_t)(offset - copy_object->vo_shadow_offset) : (vm_map_offset_t) 0; - if (copy_offset > copy_object->size) - copy_offset = copy_object->size; + if (copy_offset > copy_object->vo_size) + copy_offset = copy_object->vo_size; /* * clip size with respect to shadow offset */ - if (offset >= copy_object->shadow_offset) { + if (offset >= copy_object->vo_shadow_offset) { copy_size = size; - } else if (size >= copy_object->shadow_offset - offset) { - copy_size = size - (copy_object->shadow_offset - offset); + } else if (size >= copy_object->vo_shadow_offset - offset) { + copy_size = size - (copy_object->vo_shadow_offset - offset); } else { copy_size = 0; } - if (copy_offset + copy_size > copy_object->size) { - if (copy_object->size >= copy_offset) { - copy_size = copy_object->size - copy_offset; + if (copy_offset + copy_size > copy_object->vo_size) { + if (copy_object->vo_size >= copy_offset) { + copy_size = copy_object->vo_size - copy_offset; } else { copy_size = 0; } @@ -841,6 +855,8 @@ vm_object_update( fault_info.hi_offset = copy_size; fault_info.no_cache = FALSE; fault_info.stealth = TRUE; + fault_info.io_sync = FALSE; + fault_info.cs_bypass = FALSE; fault_info.mark_zf_absent = FALSE; vm_object_paging_begin(copy_object); @@ -898,12 +914,6 @@ vm_object_update( vm_object_lock(copy_object); vm_object_paging_begin(copy_object); goto RETRY_COW_OF_LOCK_REQUEST; - case VM_FAULT_FICTITIOUS_SHORTAGE: - vm_page_more_fictitious(); - prot = VM_PROT_WRITE|VM_PROT_READ; - vm_object_lock(copy_object); - vm_object_paging_begin(copy_object); - goto RETRY_COW_OF_LOCK_REQUEST; case VM_FAULT_SUCCESS_NO_VM_PAGE: /* success but no VM page: fail */ vm_object_paging_end(copy_object); @@ -1783,14 +1793,17 @@ host_default_memory_manager( thread_wakeup((event_t) &memory_manager_default); +#ifndef CONFIG_FREEZE /* * Now that we have a default pager for anonymous memory, * reactivate all the throttled pages (i.e. dirty pages with * no pager). 
*/ - if (current_manager == MEMORY_OBJECT_DEFAULT_NULL) { + if (current_manager == MEMORY_OBJECT_DEFAULT_NULL) + { vm_page_reactivate_all_throttled(); } +#endif } out: lck_mtx_unlock(&memory_manager_default_lock); @@ -1924,6 +1937,39 @@ memory_object_range_op( } +void +memory_object_mark_used( + memory_object_control_t control) +{ + vm_object_t object; + + if (control == NULL) + return; + + object = memory_object_control_to_vm_object(control); + + if (object != VM_OBJECT_NULL) + vm_object_cache_remove(object); +} + + +void +memory_object_mark_unused( + memory_object_control_t control, + __unused boolean_t rage) +{ + vm_object_t object; + + if (control == NULL) + return; + + object = memory_object_control_to_vm_object(control); + + if (object != VM_OBJECT_NULL) + vm_object_cache_add(object); +} + + kern_return_t memory_object_pages_resident( memory_object_control_t control, @@ -1961,6 +2007,20 @@ memory_object_signed( return KERN_SUCCESS; } +boolean_t +memory_object_is_slid( + memory_object_control_t control) +{ + vm_object_t object = VM_OBJECT_NULL; + vm_object_t slide_object = slide_info.slide_object; + + object = memory_object_control_to_vm_object(control); + if (object == VM_OBJECT_NULL) + return FALSE; + + return (object == slide_object); +} + static zone_t mem_obj_control_zone; __private_extern__ void @@ -1970,6 +2030,7 @@ memory_object_control_bootstrap(void) i = (vm_size_t) sizeof (struct memory_object_control); mem_obj_control_zone = zinit (i, 8192*i, 4096, "mem_obj_control"); + zone_change(mem_obj_control_zone, Z_CALLERACCT, FALSE); zone_change(mem_obj_control_zone, Z_NOENCRYPT, TRUE); return; } @@ -2251,6 +2312,20 @@ kern_return_t memory_object_last_unmap memory_object); } +/* Routine memory_object_data_reclaim */ +kern_return_t memory_object_data_reclaim +( + memory_object_t memory_object, + boolean_t reclaim_backing_store +) +{ + if (memory_object->mo_pager_ops->memory_object_data_reclaim == NULL) + return KERN_NOT_SUPPORTED; + return 
(memory_object->mo_pager_ops->memory_object_data_reclaim)( + memory_object, + reclaim_backing_store); +} + /* Routine memory_object_create */ kern_return_t memory_object_create ( diff --git a/osfmk/vm/memory_object.h b/osfmk/vm/memory_object.h index a0b6690c1..05f78fcf4 100644 --- a/osfmk/vm/memory_object.h +++ b/osfmk/vm/memory_object.h @@ -141,4 +141,15 @@ extern kern_return_t memory_object_signed( memory_object_control_t control, boolean_t is_signed); +extern boolean_t memory_object_is_slid( + memory_object_control_t control); + +extern void memory_object_mark_used( + memory_object_control_t control); + +extern void memory_object_mark_unused( + memory_object_control_t control, + boolean_t rage); + + #endif /* _VM_MEMORY_OBJECT_H_ */ diff --git a/osfmk/vm/pmap.h b/osfmk/vm/pmap.h index 5d6253718..76d7cb305 100644 --- a/osfmk/vm/pmap.h +++ b/osfmk/vm/pmap.h @@ -274,6 +274,12 @@ extern kern_return_t (pmap_attribute_cache_sync)( /* Flush appropriate extern unsigned int (pmap_cache_attributes)( ppnum_t pn); +/* + * Set (override) cache attributes for the specified physical page + */ +extern void pmap_set_cache_attributes( + ppnum_t, + unsigned int); extern void pmap_sync_page_data_phys(ppnum_t pa); extern void pmap_sync_page_attributes_phys(ppnum_t pa); @@ -453,14 +459,14 @@ extern void (pmap_pageable)( extern uint64_t pmap_nesting_size_min; extern uint64_t pmap_nesting_size_max; -extern kern_return_t pmap_nest(pmap_t grand, - pmap_t subord, - addr64_t vstart, - addr64_t nstart, - uint64_t size); -extern kern_return_t pmap_unnest(pmap_t grand, - addr64_t vaddr, - uint64_t size); +extern kern_return_t pmap_nest(pmap_t, + pmap_t, + addr64_t, + addr64_t, + uint64_t); +extern kern_return_t pmap_unnest(pmap_t, + addr64_t, + uint64_t); extern boolean_t pmap_adjust_unnest_parameters(pmap_t, vm_map_offset_t *, vm_map_offset_t *); #endif /* MACH_KERNEL_PRIVATE */ @@ -484,8 +490,8 @@ extern pmap_t kernel_pmap; /* The kernel's map */ #define VM_MEM_NOT_CACHEABLE 0x4 /* (I) 
Cache Inhibit */ #define VM_MEM_WRITE_THROUGH 0x8 /* (W) Write-Through */ +#define VM_WIMG_USE_DEFAULT 0x80 #define VM_WIMG_MASK 0xFF -#define VM_WIMG_USE_DEFAULT 0x80000000 #define VM_MEM_SUPERPAGE 0x100 /* map a superpage instead of a base page */ diff --git a/osfmk/vm/vm_apple_protect.c b/osfmk/vm/vm_apple_protect.c index 953a4e139..ef46cfeca 100644 --- a/osfmk/vm/vm_apple_protect.c +++ b/osfmk/vm/vm_apple_protect.c @@ -130,6 +130,7 @@ const struct memory_object_pager_ops apple_protect_pager_ops = { apple_protect_pager_synchronize, apple_protect_pager_map, apple_protect_pager_last_unmap, + NULL, /* data_reclaim */ "apple protect pager" }; @@ -354,6 +355,7 @@ apple_protect_pager_data_request( upl_pl = NULL; fault_info = *((struct vm_object_fault_info *) mo_fault_info); fault_info.stealth = TRUE; + fault_info.io_sync = FALSE; fault_info.mark_zf_absent = FALSE; interruptible = fault_info.interruptible; @@ -510,7 +512,7 @@ apple_protect_pager_data_request( kernel_mapping, src_page->phys_page, VM_PROT_READ, - src_object->wimg_bits & VM_WIMG_MASK, + 0, TRUE); /* * Establish an explicit pmap mapping of the destination @@ -525,7 +527,7 @@ apple_protect_pager_data_request( kernel_mapping + PAGE_SIZE_64, dst_pnum, VM_PROT_READ | VM_PROT_WRITE, - dst_object->wimg_bits & VM_WIMG_MASK, + 0, TRUE); /* @@ -725,13 +727,13 @@ apple_protect_pager_terminate_internal( vm_object_deallocate(pager->backing_object); pager->backing_object = VM_OBJECT_NULL; } - - /* trigger the destruction of the memory object */ - memory_object_destroy(pager->pager_control, 0); /* deallocate any crypt module data */ if(pager->crypt.crypt_end) pager->crypt.crypt_end(pager->crypt.crypt_ops); + + /* trigger the destruction of the memory object */ + memory_object_destroy(pager->pager_control, 0); } /* diff --git a/osfmk/vm/vm_debug.c b/osfmk/vm/vm_debug.c index a0712c54d..1dfa947ef 100644 --- a/osfmk/vm/vm_debug.c +++ b/osfmk/vm/vm_debug.c @@ -206,7 +206,7 @@ vm32_region_info( vio->vio_object = 
(natural_t)(uintptr_t) cobject; vio->vio_size = - (natural_t) cobject->size; + (natural_t) cobject->vo_size; vio->vio_ref_count = cobject->ref_count; vio->vio_resident_page_count = @@ -216,7 +216,7 @@ vm32_region_info( vio->vio_shadow = (natural_t)(uintptr_t) cobject->shadow; vio->vio_shadow_offset = - (natural_t) cobject->shadow_offset; + (natural_t) cobject->vo_shadow_offset; vio->vio_paging_offset = (natural_t) cobject->paging_offset; vio->vio_copy_strategy = @@ -408,7 +408,7 @@ vm32_region_info_64( vio->vio_object = (natural_t)(uintptr_t) cobject; vio->vio_size = - (natural_t) cobject->size; + (natural_t) cobject->vo_size; vio->vio_ref_count = cobject->ref_count; vio->vio_resident_page_count = @@ -418,7 +418,7 @@ vm32_region_info_64( vio->vio_shadow = (natural_t)(uintptr_t) cobject->shadow; vio->vio_shadow_offset = - (natural_t) cobject->shadow_offset; + (natural_t) cobject->vo_shadow_offset; vio->vio_paging_offset = (natural_t) cobject->paging_offset; vio->vio_copy_strategy = diff --git a/osfmk/vm/vm_fault.c b/osfmk/vm/vm_fault.c index a36714b57..ab281c928 100644 --- a/osfmk/vm/vm_fault.c +++ b/osfmk/vm/vm_fault.c @@ -89,8 +89,6 @@ #include #include -#include - #include #include #include @@ -102,8 +100,7 @@ #include #include #include /* Needed by some vm_page.h macros */ - -#include +#include #define VM_FAULT_CLASSIFY 0 @@ -133,12 +130,13 @@ extern unsigned int dp_pages_free, dp_pages_reserve; #define NEED_TO_HARD_THROTTLE_THIS_TASK() (((dp_pages_free + dp_pages_reserve < 2000) && \ (get_task_resident_size(current_task()) > vm_hard_throttle_threshold) && \ - (current_task() != kernel_task) && IP_VALID(memory_manager_default)) || \ + (current_task() != kernel_task) && VM_DYNAMIC_PAGING_ENABLED(memory_manager_default)) || \ (vm_page_free_count < vm_page_throttle_limit && thread_is_io_throttled() && \ (get_task_resident_size(current_task()) > vm_hard_throttle_threshold))) -#define HARD_THROTTLE_DELAY 10000 /* 10000 us == 10 ms */ +#define HARD_THROTTLE_DELAY 
20000 /* 20000 us == 20 ms */ +#define SOFT_THROTTLE_DELAY 2000 /* 2000 us == 2 ms */ extern int cs_debug; @@ -180,6 +178,7 @@ unsigned long vm_cs_validates = 0; unsigned long vm_cs_revalidates = 0; unsigned long vm_cs_query_modified = 0; unsigned long vm_cs_validated_dirtied = 0; +unsigned long vm_cs_bitmap_validated = 0; #if CONFIG_ENFORCE_SIGNED_CODE int cs_enforcement_disable=0; #else @@ -521,7 +520,7 @@ vm_fault_deactivate_behind( } -static boolean_t +static int vm_page_throttled(void) { clock_sec_t elapsed_sec; @@ -531,12 +530,12 @@ vm_page_throttled(void) thread_t thread = current_thread(); if (thread->options & TH_OPT_VMPRIV) - return (FALSE); + return (0); thread->t_page_creation_count++; if (NEED_TO_HARD_THROTTLE_THIS_TASK()) - return (TRUE); + return (HARD_THROTTLE_DELAY); if (vm_page_free_count < vm_page_throttle_limit && thread->t_page_creation_count > vm_page_creation_throttle) { @@ -562,12 +561,12 @@ vm_page_throttled(void) } ++vm_page_throttle_count; - return (TRUE); + return (SOFT_THROTTLE_DELAY); } thread->t_page_creation_time = tv_sec; thread->t_page_creation_count = 0; } - return (FALSE); + return (0); } @@ -582,6 +581,8 @@ vm_page_throttled(void) static vm_fault_return_t vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state) { + int throttle_delay; + if (object->shadow_severed || VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) { /* @@ -619,7 +620,7 @@ vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t int return (VM_FAULT_RETRY); } } - if (vm_page_throttled()) { + if ((throttle_delay = vm_page_throttled())) { /* * we're throttling zero-fills... 
* treat this as if we couldn't grab a page @@ -628,15 +629,14 @@ vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t int VM_PAGE_FREE(m); vm_fault_cleanup(object, first_m); - if (NEED_TO_HARD_THROTTLE_THIS_TASK()) { - delay(HARD_THROTTLE_DELAY); + VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0); - if (current_thread_aborted()) { - thread_interrupt_level(interruptible_state); - return VM_FAULT_INTERRUPTED; - } - } + delay(throttle_delay); + if (current_thread_aborted()) { + thread_interrupt_level(interruptible_state); + return VM_FAULT_INTERRUPTED; + } thread_interrupt_level(interruptible_state); return (VM_FAULT_MEMORY_SHORTAGE); @@ -677,9 +677,9 @@ vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill) m->cs_validated = FALSE; m->cs_tainted = FALSE; - if (no_zero_fill == TRUE) - my_fault = DBG_NZF_PAGE_FAULT; - else { + if (no_zero_fill == TRUE) { + my_fault = DBG_NZF_PAGE_FAULT; + } else { vm_page_zero_fill(m); VM_STAT_INCR(zero_fill_count); @@ -689,12 +689,17 @@ vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill) assert(m->object != kernel_object); //assert(m->pageq.next == NULL && m->pageq.prev == NULL); - if (!IP_VALID(memory_manager_default) && + if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) && (m->object->purgable == VM_PURGABLE_DENY || m->object->purgable == VM_PURGABLE_NONVOLATILE || m->object->purgable == VM_PURGABLE_VOLATILE )) { + vm_page_lockspin_queues(); + assert(!VM_PAGE_WIRED(m)); + + VM_PAGE_QUEUES_REMOVE(m); + queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq); m->throttled = TRUE; vm_page_throttled_count++; @@ -990,23 +995,105 @@ vm_fault_page( #if TRACEFAULTPAGE dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */ #endif - wait_result = PAGE_SLEEP(object, m, interruptible); - XPR(XPR_VM_FAULT, - "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n", - object, offset, - m, 0, 0); - counter(c_vm_fault_page_block_busy_kernel++); 
+ if (m->list_req_pending) { + /* + * "list_req_pending" means that the + * page has been marked for a page-in + * or page-out operation but hasn't been + * grabbed yet. + * Since whoever marked it + * "list_req_pending" might now be + * making its way through other layers + * of code and possibly blocked on locks + * that we might be holding, we can't + * just block on a "busy" and + * "list_req_pending" page or we might + * deadlock with that other thread. + * + * [ For pages backed by a file on an + * HFS volume, we might deadlock with + * the HFS truncate lock, for example: + * A: starts a pageout or pagein + * operation and marks a page "busy", + * "list_req_pending" and either + * "pageout", "cleaning" or "absent". + * A: makes its way through the + * memory object (vnode) code. + * B: starts from the memory object + * side, via a write() on a file, for + * example. + * B: grabs some filesystem locks. + * B: attempts to grab the same page for + * its I/O. + * B: blocks here because the page is + * "busy". + * A: attempts to grab the filesystem + * lock we're holding. + * And we have a deadlock... ] + * + * Since the page hasn't been claimed + * by the other thread yet, it's fair + * for us to grab here. + */ + if (m->absent) { + /* + * The page needs to be paged + * in. We can do it here but we + * need to get rid of "m", the + * place holder page inserted by + * another thread who is also + * trying to page it in. When + * that thread resumes, it will + * either wait for our page to + * arrive or it will find it + * already there. + */ + VM_PAGE_FREE(m); - if (wait_result != THREAD_AWAKENED) { - vm_fault_cleanup(object, first_m); - thread_interrupt_level(interruptible_state); + /* + * Retry the fault. We'll find + * that the page is not resident + * and initiate a page-in again. + */ + continue; + } + if (m->pageout || m->cleaning) { + /* + * This page has been selected + * for a page-out but we want + * to bring it in. Let's just + * cancel the page-out... 
+ */ + vm_pageout_queue_steal(m, FALSE); + /* + * ... and clear "busy" and + * wake up any waiters... + */ + PAGE_WAKEUP_DONE(m); + /* + * ... and continue with the + * "fault" handling. + */ + } + } else { + wait_result = PAGE_SLEEP(object, m, interruptible); + XPR(XPR_VM_FAULT, + "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n", + object, offset, + m, 0, 0); + counter(c_vm_fault_page_block_busy_kernel++); - if (wait_result == THREAD_RESTART) - return (VM_FAULT_RETRY); - else - return (VM_FAULT_INTERRUPTED); + if (wait_result != THREAD_AWAKENED) { + vm_fault_cleanup(object, first_m); + thread_interrupt_level(interruptible_state); + + if (wait_result == THREAD_RESTART) + return (VM_FAULT_RETRY); + else + return (VM_FAULT_INTERRUPTED); + } + continue; } - continue; } if (m->phys_page == vm_page_guard_addr) { @@ -1152,6 +1239,7 @@ vm_fault_page( if (fault_info->mark_zf_absent && no_zero_fill == TRUE) m->absent = TRUE; + break; } else { if (must_be_resident) @@ -1172,11 +1260,11 @@ vm_fault_page( "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n", object, offset, next_object, - offset+object->shadow_offset,0); + offset+object->vo_shadow_offset,0); - offset += object->shadow_offset; - fault_info->lo_offset += object->shadow_offset; - fault_info->hi_offset += object->shadow_offset; + offset += object->vo_shadow_offset; + fault_info->lo_offset += object->vo_shadow_offset; + fault_info->hi_offset += object->vo_shadow_offset; access_required = VM_PROT_READ; vm_object_lock(next_object); @@ -1418,8 +1506,8 @@ vm_fault_page( vm_object_lock(object); assert(object->ref_count > 0); - if (object->paging_in_progress > vm_object_pagein_throttle) { - vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_IN_PROGRESS, interruptible); + if (object->paging_in_progress >= vm_object_pagein_throttle) { + vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible); vm_object_unlock(object); wait_result = 
thread_block(THREAD_CONTINUE_NULL); @@ -1520,7 +1608,7 @@ vm_fault_page( current_thread()->t_page_creation_time = tv_sec; current_thread()->t_page_creation_count = 0; } - if ((interruptible != THREAD_UNINT) && (current_thread()->sched_mode & TH_MODE_ABORT)) { + if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) { vm_fault_cleanup(object, first_m); thread_interrupt_level(interruptible_state); @@ -1637,9 +1725,9 @@ vm_fault_page( if ((object != first_object) || must_be_resident) vm_object_paging_end(object); - offset += object->shadow_offset; - fault_info->lo_offset += object->shadow_offset; - fault_info->hi_offset += object->shadow_offset; + offset += object->vo_shadow_offset; + fault_info->lo_offset += object->vo_shadow_offset; + fault_info->hi_offset += object->vo_shadow_offset; access_required = VM_PROT_READ; vm_object_lock(next_object); @@ -1884,9 +1972,9 @@ vm_fault_page( /* * Does the page exist in the copy? */ - copy_offset = first_offset - copy_object->shadow_offset; + copy_offset = first_offset - copy_object->vo_shadow_offset; - if (copy_object->size <= copy_offset) + if (copy_object->vo_size <= copy_offset) /* * Copy object doesn't cover this page -- do nothing. 
*/ @@ -2194,12 +2282,13 @@ vm_fault_enter(vm_page_t m, pmap_t pmap, vm_map_offset_t vaddr, vm_prot_t prot, + vm_prot_t fault_type, boolean_t wired, boolean_t change_wiring, boolean_t no_cache, + boolean_t cs_bypass, int *type_of_fault) { - unsigned int cache_attr; kern_return_t kr, pe_result; boolean_t previously_pmapped = m->pmapped; boolean_t must_disconnect = 0; @@ -2215,19 +2304,25 @@ vm_fault_enter(vm_page_t m, return KERN_SUCCESS; } - cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK; + if (*type_of_fault == DBG_ZERO_FILL_FAULT) { - if (m->pmapped == FALSE) { + vm_object_lock_assert_exclusive(m->object); + + } else if ((fault_type & VM_PROT_WRITE) == 0) { /* - * This is the first time this page is being - * mapped in an address space (pmapped == FALSE). - * - * Part of that page may still be in the data cache - * and not flushed to memory. In case we end up - * accessing that page via the instruction cache, - * we need to ensure that the 2 caches are in sync. + * This is not a "write" fault, so we + * might not have taken the object lock + * exclusively and we might not be able + * to update the "wpmapped" bit in + * vm_fault_enter(). + * Let's just grant read access to + * the page for now and we'll + * soft-fault again if we need write + * access later... 
*/ - pmap_sync_page_data_phys(m->phys_page); + prot &= ~VM_PROT_WRITE; + } + if (m->pmapped == FALSE) { if ((*type_of_fault == DBG_CACHE_HIT_FAULT) && m->clustered) { /* @@ -2251,8 +2346,7 @@ vm_fault_enter(vm_page_t m, } VM_PAGE_CONSUME_CLUSTERED(m); - } else if (cache_attr != VM_WIMG_DEFAULT) - pmap_sync_page_attributes_phys(m->phys_page); + } if (*type_of_fault != DBG_COW_FAULT) { DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL); @@ -2309,7 +2403,7 @@ vm_fault_enter(vm_page_t m, * code can be created */ if (m->cs_tainted || - ( !cs_enforcement_disable && + (( !cs_enforcement_disable && !cs_bypass ) && (/* The page is unsigned and wants to be executable */ (!m->cs_validated && (prot & VM_PROT_EXECUTE)) || /* The page should be immutable, but is in danger of being modified @@ -2387,29 +2481,58 @@ vm_fault_enter(vm_page_t m, * that's needed for an AtomicCompareAndSwap */ m->pmapped = TRUE; - if (prot & VM_PROT_WRITE) { - vm_object_lock_assert_exclusive(m->object); - m->wpmapped = TRUE; - if(must_disconnect) { - /* We can only get here - * because of the CSE logic */ + if(vm_page_is_slideable(m)) { + boolean_t was_busy = m->busy; + m->busy = TRUE; + kr = vm_page_slide(m, 0); + assert(m->busy); + if(!was_busy) { + PAGE_WAKEUP_DONE(m); + } + if (kr != KERN_SUCCESS) { + /* + * This page has not been slid correctly, + * do not do the pmap_enter() ! + * Let vm_fault_enter() return the error + * so the caller can fail the fault. 
+ */ + goto after_the_pmap_enter; + } + } + + if (fault_type & VM_PROT_WRITE) { + + if (m->wpmapped == FALSE) { + vm_object_lock_assert_exclusive(m->object); + + m->wpmapped = TRUE; + } + if (must_disconnect) { + /* + * We can only get here + * because of the CSE logic + */ assert(cs_enforcement_disable == FALSE); pmap_disconnect(m->phys_page); - /* If we are faulting for a write, we can clear + /* + * If we are faulting for a write, we can clear * the execute bit - that will ensure the page is * checked again before being executable, which * protects against a map switch. * This only happens the first time the page * gets tainted, so we won't get stuck here - * to make an already writeable page executable. */ - prot &= ~VM_PROT_EXECUTE; + * to make an already writeable page executable. + */ + if (!cs_bypass){ + prot &= ~VM_PROT_EXECUTE; + } } } /* Prevent a deadlock by not * holding the object lock if we need to wait for a page in * pmap_enter() - */ - PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, cache_attr, + PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, 0, wired, PMAP_OPTIONS_NOWAIT, pe_result); if(pe_result == KERN_RESOURCE_SHORTAGE) { @@ -2420,7 +2543,7 @@ vm_fault_enter(vm_page_t m, m->busy = TRUE; vm_object_unlock(m->object); - PMAP_ENTER(pmap, vaddr, m, prot, cache_attr, wired); + PMAP_ENTER(pmap, vaddr, m, prot, 0, wired); /* Take the object lock again. */ vm_object_lock(m->object); @@ -2435,6 +2558,7 @@ vm_fault_enter(vm_page_t m, } } +after_the_pmap_enter: /* * Hold queues lock to manipulate * the page queues. 
Change wiring @@ -2521,11 +2645,8 @@ vm_fault_enter(vm_page_t m, if (no_cache && (!previously_pmapped || m->no_cache)) { m->no_cache = TRUE; - if (m->active || m->inactive) - VM_PAGE_QUEUES_REMOVE(m); - if (!m->speculative) - vm_page_speculate(m, TRUE); + vm_page_speculate(m, FALSE); } else if (!m->active && !m->inactive) vm_page_activate(m); @@ -2596,12 +2717,13 @@ vm_fault( int object_lock_type = 0; int cur_object_lock_type; vm_object_t top_object = VM_OBJECT_NULL; + int throttle_delay; KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START, (int)((uint64_t)vaddr >> 32), (int)vaddr, - 0, + (map == kernel_map), 0, 0); @@ -2658,6 +2780,7 @@ vm_fault( pmap = real_map->pmap; fault_info.interruptible = interruptible; fault_info.stealth = FALSE; + fault_info.io_sync = FALSE; fault_info.mark_zf_absent = FALSE; /* @@ -2915,7 +3038,35 @@ vm_fault( } ASSERT_PAGE_DECRYPTED(m); + if(vm_page_is_slideable(m)) { + /* + * We might need to slide this page, and so, + * we want to hold the VM object exclusively. + */ + if (object != cur_object) { + if (cur_object_lock_type == OBJECT_LOCK_SHARED) { + vm_object_unlock(object); + vm_object_unlock(cur_object); + + cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE; + + vm_map_unlock_read(map); + if (real_map != map) + vm_map_unlock(real_map); + + goto RetryFault; + } + } else if (object_lock_type == OBJECT_LOCK_SHARED) { + + vm_object_unlock(object); + object_lock_type = OBJECT_LOCK_EXCLUSIVE; + vm_map_unlock_read(map); + goto RetryFault; + } + } + if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m)) { +upgrade_for_validation: /* * We might need to validate this page * against its code signature, so we @@ -2963,27 +3114,12 @@ vm_fault( */ if (object == cur_object && object->copy == VM_OBJECT_NULL) { - if ((fault_type & VM_PROT_WRITE) == 0) { - /* - * This is not a "write" fault, so we - * might not have taken the object lock - * exclusively and we might not be able - * to update the "wpmapped" bit in - * vm_fault_enter(). 
- * Let's just grant read access to - * the page for now and we'll - * soft-fault again if we need write - * access later... - */ - prot &= ~VM_PROT_WRITE; - } + goto FastPmapEnter; } if ((fault_type & VM_PROT_WRITE) == 0) { - prot &= ~VM_PROT_WRITE; - if (object != cur_object) { /* * We still need to hold the top object @@ -3020,27 +3156,27 @@ vm_fault( * cur_object == NULL or it's been unlocked * no paging references on either object or cur_object */ -#if MACH_KDB - if (db_watchpoint_list && (fault_type & VM_PROT_WRITE) == 0) - prot &= ~VM_PROT_WRITE; -#endif if (caller_pmap) { kr = vm_fault_enter(m, caller_pmap, caller_pmap_addr, prot, + fault_type, wired, change_wiring, fault_info.no_cache, + fault_info.cs_bypass, &type_of_fault); } else { kr = vm_fault_enter(m, pmap, vaddr, prot, + fault_type, wired, change_wiring, fault_info.no_cache, + fault_info.cs_bypass, &type_of_fault); } @@ -3060,7 +3196,7 @@ vm_fault( if (need_collapse == TRUE) vm_object_collapse(object, offset, TRUE); - + if (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT) { /* * evaluate access pattern and update state @@ -3090,7 +3226,7 @@ vm_fault( */ assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE); - if (vm_page_throttled()) { + if ((throttle_delay = vm_page_throttled())) { /* * drop all of our locks... * wait until the free queue is @@ -3104,8 +3240,9 @@ vm_fault( if (real_map != map) vm_map_unlock(real_map); - if (NEED_TO_HARD_THROTTLE_THIS_TASK()) - delay(HARD_THROTTLE_DELAY); + VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0); + + delay(throttle_delay); if (!current_thread_aborted() && vm_page_wait((change_wiring) ? THREAD_UNINT : @@ -3128,11 +3265,19 @@ vm_fault( */ break; } + /* * This is now a shadow based copy on write * fault -- it requires a copy up the shadow * chain. 
- * + */ + + if ((cur_object_lock_type == OBJECT_LOCK_SHARED) && + VM_FAULT_NEED_CS_VALIDATION(NULL, m)) { + goto upgrade_for_validation; + } + + /* * Allocate a page in the original top level * object. Give up if allocate fails. Also * need to remember current page, as it's the @@ -3246,7 +3391,7 @@ vm_fault( kr = KERN_MEMORY_ERROR; goto done; } - if (vm_page_throttled()) { + if ((throttle_delay = vm_page_throttled())) { /* * drop all of our locks... * wait until the free queue is @@ -3260,8 +3405,9 @@ vm_fault( if (real_map != map) vm_map_unlock(real_map); - if (NEED_TO_HARD_THROTTLE_THIS_TASK()) - delay(HARD_THROTTLE_DELAY); + VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0); + + delay(throttle_delay); if (!current_thread_aborted() && vm_page_wait((change_wiring) ? THREAD_UNINT : @@ -3329,7 +3475,7 @@ vm_fault( /* * On to the next level in the shadow chain */ - cur_offset += cur_object->shadow_offset; + cur_offset += cur_object->vo_shadow_offset; new_object = cur_object->shadow; /* @@ -3628,18 +3774,22 @@ vm_fault( caller_pmap, caller_pmap_addr, prot, + fault_type, wired, change_wiring, fault_info.no_cache, + fault_info.cs_bypass, &type_of_fault); } else { kr = vm_fault_enter(m, pmap, vaddr, prot, + fault_type, wired, change_wiring, fault_info.no_cache, + fault_info.cs_bypass, &type_of_fault); } if (kr != KERN_SUCCESS) { @@ -3670,7 +3820,7 @@ vm_fault( /* to execute, we return with a protection failure. 
*/ if ((fault_type & VM_PROT_EXECUTE) && - (!pmap_eligible_for_execute((ppnum_t)(object->shadow_offset >> 12)))) { + (!pmap_eligible_for_execute((ppnum_t)(object->vo_shadow_offset >> 12)))) { vm_map_verify_done(map, &version); @@ -3735,7 +3885,7 @@ vm_fault( assert((uint32_t)((ldelta + hdelta) >> 12) == ((ldelta + hdelta) >> 12)); pmap_map_block(caller_pmap, (addr64_t)(caller_pmap_addr - ldelta), - (ppnum_t)((((vm_map_offset_t) (entry->object.vm_object->shadow_offset)) + + (ppnum_t)((((vm_map_offset_t) (entry->object.vm_object->vo_shadow_offset)) + entry->offset + (laddr - entry->vme_start) - ldelta) >> 12), (uint32_t)((ldelta + hdelta) >> 12), prot, (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0); @@ -3746,7 +3896,7 @@ vm_fault( assert((uint32_t)((ldelta + hdelta) >> 12) == ((ldelta + hdelta) >> 12)); pmap_map_block(real_map->pmap, (addr64_t)(vaddr - ldelta), - (ppnum_t)((((vm_map_offset_t)(entry->object.vm_object->shadow_offset)) + + (ppnum_t)((((vm_map_offset_t)(entry->object.vm_object->vo_shadow_offset)) + entry->offset + (laddr - entry->vme_start) - ldelta) >> 12), (uint32_t)((ldelta + hdelta) >> 12), prot, (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0); @@ -3888,6 +4038,8 @@ vm_fault_unwire( fault_info.hi_offset = (entry->vme_end - entry->vme_start) + entry->offset; fault_info.no_cache = entry->no_cache; fault_info.stealth = TRUE; + fault_info.io_sync = FALSE; + fault_info.cs_bypass = FALSE; fault_info.mark_zf_absent = FALSE; /* @@ -3955,16 +4107,17 @@ vm_fault_unwire( result_object = result_page->object; - if ((pmap) && (result_page->phys_page != vm_page_guard_addr)) { - pmap_change_wiring(pmap, - pmap_addr + (va - entry->vme_start), FALSE); - } if (deallocate) { assert(result_page->phys_page != vm_page_fictitious_addr); pmap_disconnect(result_page->phys_page); VM_PAGE_FREE(result_page); } else { + if ((pmap) && (result_page->phys_page != vm_page_guard_addr)) + pmap_change_wiring(pmap, + pmap_addr + (va - entry->vme_start), FALSE); + + if 
(VM_PAGE_WIRED(result_page)) { vm_page_lockspin_queues(); vm_page_unwire(result_page, TRUE); @@ -4160,9 +4313,11 @@ vm_fault_wire_fast( pmap, pmap_addr, prot, + prot, TRUE, FALSE, FALSE, + FALSE, &type_of_fault); done: @@ -4295,6 +4450,8 @@ vm_fault_copy( fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left; fault_info_src.no_cache = FALSE; fault_info_src.stealth = TRUE; + fault_info_src.io_sync = FALSE; + fault_info_src.cs_bypass = FALSE; fault_info_src.mark_zf_absent = FALSE; fault_info_dst.interruptible = interruptible; @@ -4304,6 +4461,8 @@ vm_fault_copy( fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left; fault_info_dst.no_cache = FALSE; fault_info_dst.stealth = TRUE; + fault_info_dst.io_sync = FALSE; + fault_info_dst.cs_bypass = FALSE; fault_info_dst.mark_zf_absent = FALSE; do { /* while (amount_left > 0) */ @@ -4625,7 +4784,7 @@ vm_fault_classify(vm_object_t object, break; } - offset += object->shadow_offset; + offset += object->vo_shadow_offset; object = object->shadow; level++; continue; @@ -4786,6 +4945,14 @@ vm_page_validate_cs( return; } +#if CHECK_CS_VALIDATION_BITMAP + if ( vnode_pager_cs_check_validation_bitmap( page->object->pager, trunc_page(page->offset + page->object->paging_offset), CS_BITMAP_CHECK ) == KERN_SUCCESS) { + page->cs_validated = TRUE; + page->cs_tainted = FALSE; + vm_cs_bitmap_validated++; + return; + } +#endif vm_object_lock_assert_exclusive(page->object); object = page->object; @@ -4823,6 +4990,11 @@ vm_page_validate_cs( /* validate the mapped page */ vm_page_validate_cs_mapped(page, (const void *) kaddr); +#if CHECK_CS_VALIDATION_BITMAP + if ( page->cs_validated == TRUE && page->cs_tainted == FALSE ) { + vnode_pager_cs_check_validation_bitmap( object->pager, trunc_page( offset + object->paging_offset), CS_BITMAP_SET ); + } +#endif assert(page->busy); assert(object == page->object); vm_object_lock_assert_exclusive(object); diff --git a/osfmk/vm/vm_fault.h b/osfmk/vm/vm_fault.h index 855100338..6d90a84b0 
100644 --- a/osfmk/vm/vm_fault.h +++ b/osfmk/vm/vm_fault.h @@ -79,7 +79,6 @@ typedef kern_return_t vm_fault_return_t; #define VM_FAULT_RETRY 1 #define VM_FAULT_INTERRUPTED 2 #define VM_FAULT_MEMORY_SHORTAGE 3 -#define VM_FAULT_FICTITIOUS_SHORTAGE 4 #define VM_FAULT_MEMORY_ERROR 5 #define VM_FAULT_SUCCESS_NO_VM_PAGE 6 /* success but no VM page */ @@ -160,9 +159,11 @@ extern kern_return_t vm_fault_enter( pmap_t pmap, vm_map_offset_t vaddr, vm_prot_t prot, + vm_prot_t fault_type, boolean_t wired, boolean_t change_wiring, boolean_t no_cache, + boolean_t cs_bypass, int *type_of_fault); #endif /* MACH_KERNEL_PRIVATE */ diff --git a/osfmk/vm/vm_init.c b/osfmk/vm/vm_init.c index 8180254b2..cf29c82f6 100644 --- a/osfmk/vm/vm_init.c +++ b/osfmk/vm/vm_init.c @@ -143,6 +143,9 @@ vm_mem_bootstrap(void) if (zsize < ZONE_MAP_MIN) zsize = ZONE_MAP_MIN; /* Clamp to min */ +#if defined(__LP64__) + zsize += zsize >> 1; +#endif /* __LP64__ */ if (zsize > sane_size >> 1) zsize = sane_size >> 1; /* Clamp to half of RAM max */ #if !__LP64__ diff --git a/osfmk/vm/vm_kern.c b/osfmk/vm/vm_kern.c index aa0dbafe2..acd7d2a82 100644 --- a/osfmk/vm/vm_kern.c +++ b/osfmk/vm/vm_kern.c @@ -422,7 +422,7 @@ kernel_memory_allocate( mem->wpmapped = TRUE; PMAP_ENTER(kernel_pmap, map_addr + pg_offset, mem, - VM_PROT_READ | VM_PROT_WRITE, object->wimg_bits & VM_WIMG_MASK, TRUE); + VM_PROT_READ | VM_PROT_WRITE, 0, TRUE); if (flags & KMA_NOENCRYPT) { bzero(CAST_DOWN(void *, (map_addr + pg_offset)), PAGE_SIZE); @@ -550,9 +550,9 @@ kmem_realloc( /* attempt is made to realloc a kmem_alloc'd area */ vm_object_lock(object); vm_map_unlock(map); - if (object->size != oldmapsize) + if (object->vo_size != oldmapsize) panic("kmem_realloc"); - object->size = newmapsize; + object->vo_size = newmapsize; vm_object_unlock(object); /* allocate the new pages while expanded portion of the */ @@ -574,7 +574,7 @@ kmem_realloc( VM_PAGE_FREE(mem); } } - object->size = oldmapsize; + object->vo_size = oldmapsize; 
vm_object_unlock(object); vm_object_deallocate(object); return kr; @@ -598,7 +598,7 @@ kmem_realloc( VM_PAGE_FREE(mem); } } - object->size = oldmapsize; + object->vo_size = oldmapsize; vm_object_unlock(object); vm_object_deallocate(object); return (kr); @@ -812,10 +812,7 @@ kmem_remap_pages( mem->pmapped = TRUE; mem->wpmapped = TRUE; - PMAP_ENTER(kernel_pmap, map_start, mem, protection, - ((unsigned int)(mem->object->wimg_bits)) - & VM_WIMG_MASK, - TRUE); + PMAP_ENTER(kernel_pmap, map_start, mem, protection, 0, TRUE); map_start += PAGE_SIZE; offset += PAGE_SIZE; @@ -892,7 +889,6 @@ kmem_suballoc( return (KERN_SUCCESS); } - /* * kmem_init: * @@ -910,25 +906,35 @@ kmem_init( map_start = vm_map_trunc_page(start); map_end = vm_map_round_page(end); - kernel_map = vm_map_create(pmap_kernel(),VM_MIN_KERNEL_ADDRESS, + kernel_map = vm_map_create(pmap_kernel(),VM_MIN_KERNEL_AND_KEXT_ADDRESS, map_end, FALSE); /* * Reserve virtual memory allocated up to this time. */ - if (start != VM_MIN_KERNEL_ADDRESS) { + if (start != VM_MIN_KERNEL_AND_KEXT_ADDRESS) { vm_map_offset_t map_addr; + kern_return_t kr; - map_addr = VM_MIN_KERNEL_ADDRESS; - (void) vm_map_enter(kernel_map, - &map_addr, - (vm_map_size_t)(map_start - VM_MIN_KERNEL_ADDRESS), - (vm_map_offset_t) 0, - VM_FLAGS_ANYWHERE | VM_FLAGS_NO_PMAP_CHECK, - VM_OBJECT_NULL, - (vm_object_offset_t) 0, FALSE, - VM_PROT_NONE, VM_PROT_NONE, - VM_INHERIT_DEFAULT); + map_addr = VM_MIN_KERNEL_AND_KEXT_ADDRESS; + kr = vm_map_enter(kernel_map, + &map_addr, + (vm_map_size_t)(map_start - VM_MIN_KERNEL_AND_KEXT_ADDRESS), + (vm_map_offset_t) 0, + VM_FLAGS_FIXED | VM_FLAGS_NO_PMAP_CHECK, + VM_OBJECT_NULL, + (vm_object_offset_t) 0, FALSE, + VM_PROT_NONE, VM_PROT_NONE, + VM_INHERIT_DEFAULT); + + if (kr != KERN_SUCCESS) { + panic("kmem_init(0x%llx,0x%llx): vm_map_enter(0x%llx,0x%llx) error 0x%x\n", + (uint64_t) start, (uint64_t) end, + (uint64_t) VM_MIN_KERNEL_AND_KEXT_ADDRESS, + (uint64_t) (map_start - VM_MIN_KERNEL_AND_KEXT_ADDRESS), + kr); + } } 
+ /* * Set the default global user wire limit which limits the amount of * memory that can be locked via mlock(). We set this to the total @@ -1057,7 +1063,7 @@ vm_conflict_check( obj = entry->object.vm_object; obj_off = (off - entry->vme_start) + entry->offset; while(obj->shadow) { - obj_off += obj->shadow_offset; + obj_off += obj->vo_shadow_offset; obj = obj->shadow; } if((obj->pager_created) && (obj->pager == pager)) { diff --git a/osfmk/vm/vm_map.c b/osfmk/vm/vm_map.c index 68fece885..1fe35f53f 100644 --- a/osfmk/vm/vm_map.c +++ b/osfmk/vm/vm_map.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -102,12 +102,9 @@ #include #include -#ifdef ppc -#include -#endif /* ppc */ - #include #include +#include /* Internal prototypes */ @@ -235,7 +232,8 @@ static kern_return_t vm_map_copy_overwrite_nested( vm_map_offset_t dst_addr, vm_map_copy_t copy, boolean_t interruptible, - pmap_t pmap); + pmap_t pmap, + boolean_t discard_on_success); static kern_return_t vm_map_remap_extract( vm_map_t map, @@ -290,6 +288,12 @@ static kern_return_t vm_map_can_reuse( vm_map_offset_t start, vm_map_offset_t end); +#if CONFIG_FREEZE +struct default_freezer_table; +__private_extern__ void* default_freezer_mapping_create(vm_object_t, vm_offset_t); +__private_extern__ void default_freezer_mapping_free(void**, boolean_t all); +#endif + /* * Macros to copy a vm_map_entry. We must be careful to correctly * manage the wired page count. vm_map_entry_copy() creates a new @@ -334,7 +338,12 @@ MACRO_END * execute from a page that lacks execute permission. * * Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the - * default behavior for both 32 and 64 bit apps on a system-wide basis. + * default behavior for both 32 and 64 bit apps on a system-wide basis. 
Furthermore, + * a Mach-O header flag bit (MH_NO_HEAP_EXECUTION) can be used to forcibly disallow + * execution from data areas for a particular binary even if the arch normally permits it. As + * a final wrinkle, a posix_spawn attribute flag can be used to negate this opt-in header bit + * to support some complicated use cases, notably browsers with out-of-process plugins that + * are not all NX-safe. */ extern int allow_data_exec, allow_stack_exec; @@ -361,7 +370,7 @@ override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */ if (user_tag == VM_MEMORY_STACK) return allow_stack_exec & current_abi; - return allow_data_exec & current_abi; + return (allow_data_exec & current_abi) && (map->map_disallow_data_exec == FALSE); } @@ -428,12 +437,57 @@ static void *kentry_data; static vm_size_t kentry_data_size; static int kentry_count = 2048; /* to init kentry_data_size */ +#if CONFIG_EMBEDDED +#define NO_COALESCE_LIMIT 0 +#else #define NO_COALESCE_LIMIT ((1024 * 128) - 1) - +#endif /* Skip acquiring locks if we're in the midst of a kernel core dump */ unsigned int not_in_kdp = 1; +unsigned int vm_map_set_cache_attr_count = 0; + +kern_return_t +vm_map_set_cache_attr( + vm_map_t map, + vm_map_offset_t va) +{ + vm_map_entry_t map_entry; + vm_object_t object; + kern_return_t kr = KERN_SUCCESS; + + vm_map_lock_read(map); + + if (!vm_map_lookup_entry(map, va, &map_entry) || + map_entry->is_sub_map) { + /* + * that memory is not properly mapped + */ + kr = KERN_INVALID_ARGUMENT; + goto done; + } + object = map_entry->object.vm_object; + + if (object == VM_OBJECT_NULL) { + /* + * there should be a VM object here at this point + */ + kr = KERN_INVALID_ARGUMENT; + goto done; + } + vm_object_lock(object); + object->set_cache_attr = TRUE; + vm_object_unlock(object); + + vm_map_set_cache_attr_count++; +done: + vm_map_unlock_read(map); + + return kr; +} + + #if CONFIG_CODE_DECRYPTION /* * vm_map_apple_protected: @@ -565,7 +619,6 @@ vm_map_init( PAGE_SIZE, "maps"); 
zone_change(vm_map_zone, Z_NOENCRYPT, TRUE); - vm_map_entry_zone = zinit((vm_map_size_t) sizeof(struct vm_map_entry), 1024*1024, PAGE_SIZE*5, "non-kernel map entries"); @@ -588,6 +641,9 @@ vm_map_init( zone_change(vm_map_kentry_zone, Z_COLLECT, FALSE); zone_change(vm_map_kentry_zone, Z_EXPAND, FALSE); zone_change(vm_map_kentry_zone, Z_FOREIGN, TRUE); + zone_change(vm_map_kentry_zone, Z_CALLERACCT, FALSE); /* don't charge caller */ + zone_change(vm_map_copy_zone, Z_CALLERACCT, FALSE); /* don't charge caller */ + zcram(vm_map_zone, map_data, map_data_size); zcram(vm_map_kentry_zone, kentry_data, kentry_data_size); @@ -646,6 +702,8 @@ vm_map_create( result->hdr.nentries = 0; result->hdr.entries_pageable = pageable; + vm_map_store_init( &(result->hdr) ); + result->size = 0; result->user_wire_limit = MACH_VM_MAX_ADDRESS; /* default limit is unlimited */ result->user_wire_size = 0; @@ -662,9 +720,16 @@ vm_map_create( result->mapped = FALSE; result->wait_for_space = FALSE; result->switch_protect = FALSE; + result->disable_vmentry_reuse = FALSE; + result->map_disallow_data_exec = FALSE; + result->highest_entry_end = 0; result->first_free = vm_map_to_entry(result); result->hint = vm_map_to_entry(result); result->color_rr = (color_seed++) & vm_color_mask; + result->jit_entry_exists = FALSE; +#if CONFIG_FREEZE + result->default_freezer_toc = NULL; +#endif vm_map_lock_init(result); lck_mtx_init_ext(&result->s_lock, &result->s_lock_ext, &vm_map_lck_grp, &vm_map_lck_attr); @@ -698,6 +763,7 @@ _vm_map_entry_create( entry = (vm_map_entry_t) zalloc(zone); if (entry == VM_MAP_ENTRY_NULL) panic("vm_map_entry_create"); + vm_map_store_update( (vm_map_t) NULL, entry, VM_MAP_ENTRY_CREATE); return(entry); } @@ -712,13 +778,8 @@ _vm_map_entry_create( * of the stores */ #define vm_map_entry_dispose(map, entry) \ - MACRO_BEGIN \ - if((entry) == (map)->first_free) \ - (map)->first_free = vm_map_to_entry(map); \ - if((entry) == (map)->hint) \ - (map)->hint = vm_map_to_entry(map); \ - 
_vm_map_entry_dispose(&(map)->hdr, (entry)); \ - MACRO_END + vm_map_store_update( map, entry, VM_MAP_ENTRY_DELETE); \ + _vm_map_entry_dispose(&(map)->hdr, (entry)) #define vm_map_copy_entry_dispose(map, entry) \ _vm_map_entry_dispose(&(copy)->cpy_hdr, (entry)) @@ -739,116 +800,24 @@ _vm_map_entry_dispose( } #if MACH_ASSERT -static boolean_t first_free_is_valid(vm_map_t map); /* forward */ static boolean_t first_free_check = FALSE; -static boolean_t +boolean_t first_free_is_valid( vm_map_t map) { - vm_map_entry_t entry, next; - if (!first_free_check) return TRUE; - entry = vm_map_to_entry(map); - next = entry->vme_next; - while (vm_map_trunc_page(next->vme_start) == vm_map_trunc_page(entry->vme_end) || - (vm_map_trunc_page(next->vme_start) == vm_map_trunc_page(entry->vme_start) && - next != vm_map_to_entry(map))) { - entry = next; - next = entry->vme_next; - if (entry == vm_map_to_entry(map)) - break; - } - if (map->first_free != entry) { - printf("Bad first_free for map %p: %p should be %p\n", - map, map->first_free, entry); - return FALSE; - } - return TRUE; + return( first_free_is_valid_store( map )); } #endif /* MACH_ASSERT */ -/* - * UPDATE_FIRST_FREE: - * - * Updates the map->first_free pointer to the - * entry immediately before the first hole in the map. - * The map should be locked. 
- */ -#define UPDATE_FIRST_FREE(map, new_first_free) \ - MACRO_BEGIN \ - vm_map_t UFF_map; \ - vm_map_entry_t UFF_first_free; \ - vm_map_entry_t UFF_next_entry; \ - UFF_map = (map); \ - UFF_first_free = (new_first_free); \ - UFF_next_entry = UFF_first_free->vme_next; \ - while (vm_map_trunc_page(UFF_next_entry->vme_start) == \ - vm_map_trunc_page(UFF_first_free->vme_end) || \ - (vm_map_trunc_page(UFF_next_entry->vme_start) == \ - vm_map_trunc_page(UFF_first_free->vme_start) && \ - UFF_next_entry != vm_map_to_entry(UFF_map))) { \ - UFF_first_free = UFF_next_entry; \ - UFF_next_entry = UFF_first_free->vme_next; \ - if (UFF_first_free == vm_map_to_entry(UFF_map)) \ - break; \ - } \ - UFF_map->first_free = UFF_first_free; \ - assert(first_free_is_valid(UFF_map)); \ - MACRO_END - -/* - * vm_map_entry_{un,}link: - * - * Insert/remove entries from maps (or map copies). - */ -#define vm_map_entry_link(map, after_where, entry) \ - MACRO_BEGIN \ - vm_map_t VMEL_map; \ - vm_map_entry_t VMEL_entry; \ - VMEL_map = (map); \ - VMEL_entry = (entry); \ - _vm_map_entry_link(&VMEL_map->hdr, after_where, VMEL_entry); \ - UPDATE_FIRST_FREE(VMEL_map, VMEL_map->first_free); \ - MACRO_END - #define vm_map_copy_entry_link(copy, after_where, entry) \ - _vm_map_entry_link(&(copy)->cpy_hdr, after_where, (entry)) - -#define _vm_map_entry_link(hdr, after_where, entry) \ - MACRO_BEGIN \ - (hdr)->nentries++; \ - (entry)->vme_prev = (after_where); \ - (entry)->vme_next = (after_where)->vme_next; \ - (entry)->vme_prev->vme_next = (entry)->vme_next->vme_prev = (entry); \ - MACRO_END - -#define vm_map_entry_unlink(map, entry) \ - MACRO_BEGIN \ - vm_map_t VMEU_map; \ - vm_map_entry_t VMEU_entry; \ - vm_map_entry_t VMEU_first_free; \ - VMEU_map = (map); \ - VMEU_entry = (entry); \ - if (VMEU_entry->vme_start <= VMEU_map->first_free->vme_start) \ - VMEU_first_free = VMEU_entry->vme_prev; \ - else \ - VMEU_first_free = VMEU_map->first_free; \ - _vm_map_entry_unlink(&VMEU_map->hdr, VMEU_entry); \ - 
UPDATE_FIRST_FREE(VMEU_map, VMEU_first_free); \ - MACRO_END + _vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry)) #define vm_map_copy_entry_unlink(copy, entry) \ - _vm_map_entry_unlink(&(copy)->cpy_hdr, (entry)) - -#define _vm_map_entry_unlink(hdr, entry) \ - MACRO_BEGIN \ - (hdr)->nentries--; \ - (entry)->vme_next->vme_prev = (entry)->vme_prev; \ - (entry)->vme_prev->vme_next = (entry)->vme_next; \ - MACRO_END + _vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry)) #if MACH_ASSERT && TASK_SWAPPER /* @@ -935,22 +904,14 @@ vm_map_destroy( (void) vm_map_delete(map, map->min_offset, map->max_offset, flags, VM_MAP_NULL); /* clean up leftover special mappings (commpage, etc...) */ -#ifdef __ppc__ - /* - * PPC51: ppc64 is limited to 51-bit addresses. - * Memory beyond this 51-bit limit is mapped specially at the - * pmap level, so do not interfere. - * On PPC64, the commpage is mapped beyond the addressable range - * via a special pmap hack, so ask pmap to clean it explicitly... - */ - if (map->pmap) { - pmap_unmap_sharedpage(map->pmap); - } - /* ... and do not let regular pmap cleanup apply here */ - flags |= VM_MAP_REMOVE_NO_PMAP_CLEANUP; -#endif /* __ppc__ */ (void) vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags, VM_MAP_NULL); + +#if CONFIG_FREEZE + if (map->default_freezer_toc){ + default_freezer_mapping_free( &(map->default_freezer_toc), TRUE); + } +#endif vm_map_unlock(map); assert(map->hdr.nentries == 0); @@ -1144,42 +1105,12 @@ void vm_map_swapout(vm_map_t map) #endif /* TASK_SWAPPER */ - -/* - * SAVE_HINT_MAP_READ: - * - * Saves the specified entry as the hint for - * future lookups. only a read lock is held on map, - * so make sure the store is atomic... OSCompareAndSwap - * guarantees this... 
also, we don't care if we collide - * and someone else wins and stores their 'hint' - */ -#define SAVE_HINT_MAP_READ(map,value) \ - MACRO_BEGIN \ - OSCompareAndSwapPtr((map)->hint, value, &(map)->hint); \ - MACRO_END - - -/* - * SAVE_HINT_MAP_WRITE: - * - * Saves the specified entry as the hint for - * future lookups. write lock held on map, - * so no one else can be writing or looking - * until the lock is dropped, so it's safe - * to just do an assignment - */ -#define SAVE_HINT_MAP_WRITE(map,value) \ - MACRO_BEGIN \ - (map)->hint = (value); \ - MACRO_END - /* * vm_map_lookup_entry: [ internal use only ] * - * Finds the map entry containing (or - * immediately preceding) the specified address - * in the given map; the entry is returned + * Calls into the vm map store layer to find the map + * entry containing (or immediately preceding) the + * specified address in the given map; the entry is returned * in the "entry" parameter. The boolean * result indicates whether the address is * actually contained in the map. @@ -1190,69 +1121,7 @@ vm_map_lookup_entry( register vm_map_offset_t address, vm_map_entry_t *entry) /* OUT */ { - register vm_map_entry_t cur; - register vm_map_entry_t last; - - /* - * Start looking either from the head of the - * list, or from the hint. - */ - cur = map->hint; - - if (cur == vm_map_to_entry(map)) - cur = cur->vme_next; - - if (address >= cur->vme_start) { - /* - * Go from hint to end of list. - * - * But first, make a quick check to see if - * we are already looking at the entry we - * want (which is usually the case). - * Note also that we don't need to save the hint - * here... it is the same hint (unless we are - * at the header, in which case the hint didn't - * buy us anything anyway). 
- */ - last = vm_map_to_entry(map); - if ((cur != last) && (cur->vme_end > address)) { - *entry = cur; - return(TRUE); - } - } - else { - /* - * Go from start to hint, *inclusively* - */ - last = cur->vme_next; - cur = vm_map_first_entry(map); - } - - /* - * Search linearly - */ - - while (cur != last) { - if (cur->vme_end > address) { - if (address >= cur->vme_start) { - /* - * Save this lookup for future - * hints, and return - */ - - *entry = cur; - SAVE_HINT_MAP_READ(map, cur); - - return(TRUE); - } - break; - } - cur = cur->vme_next; - } - *entry = cur->vme_prev; - SAVE_HINT_MAP_READ(map, *entry); - - return(FALSE); + return ( vm_map_store_lookup_entry( map, address, entry )); } /* @@ -1300,11 +1169,15 @@ vm_map_find_space( vm_map_lock(map); - assert(first_free_is_valid(map)); - if ((entry = map->first_free) == vm_map_to_entry(map)) - start = map->min_offset; - else - start = entry->vme_end; + if( map->disable_vmentry_reuse == TRUE) { + VM_MAP_HIGHEST_ENTRY(map, entry, start); + } else { + assert(first_free_is_valid(map)); + if ((entry = map->first_free) == vm_map_to_entry(map)) + start = map->min_offset; + else + start = entry->vme_end; + } /* * In any case, the "entry" always precedes @@ -1415,7 +1288,7 @@ vm_map_find_space( * Insert the new entry into the list */ - vm_map_entry_link(map, entry, new_entry); + vm_map_store_entry_link(map, entry, new_entry); map->size += size; @@ -1484,8 +1357,8 @@ vm_map_pmap_enter( map, (unsigned long long)addr, object, (unsigned long long)offset); } type_of_fault = DBG_CACHE_HIT_FAULT; - kr = vm_fault_enter(m, map->pmap, addr, protection, - VM_PAGE_WIRED(m), FALSE, FALSE, + kr = vm_fault_enter(m, map->pmap, addr, protection, protection, + VM_PAGE_WIRED(m), FALSE, FALSE, FALSE, &type_of_fault); vm_object_unlock(object); @@ -1589,6 +1462,9 @@ vm_map_enter( * with a lookup of the size depending on superpage_size. 
*/ #ifdef __x86_64__ + case SUPERPAGE_SIZE_ANY: + /* handle it like 2 MB and round up to page size */ + size = (size + 2*1024*1024 - 1) & ~(2*1024*1024 - 1); case SUPERPAGE_SIZE_2MB: break; #endif @@ -1601,9 +1477,10 @@ vm_map_enter( inheritance = VM_INHERIT_NONE; /* fork() children won't inherit superpages */ } + #if CONFIG_EMBEDDED - if (cur_protection & VM_PROT_WRITE) { - if (cur_protection & VM_PROT_EXECUTE) { + if (cur_protection & VM_PROT_WRITE){ + if ((cur_protection & VM_PROT_EXECUTE) && !(flags & VM_FLAGS_MAP_JIT)){ printf("EMBEDDED: %s curprot cannot be write+execute. turning off execute\n", __PRETTY_FUNCTION__); cur_protection &= ~VM_PROT_EXECUTE; } @@ -1634,14 +1511,7 @@ vm_map_enter( } } - if (flags & VM_FLAGS_BELOW_MIN) { - /* - * Allow an insertion below the map's min offset. - */ - effective_min_offset = 0ULL; - } else { - effective_min_offset = map->min_offset; - } + effective_min_offset = map->min_offset; if (flags & VM_FLAGS_BEYOND_MAX) { /* @@ -1675,7 +1545,7 @@ vm_map_enter( if (purgable && (offset != 0 || (object != VM_OBJECT_NULL && - (object->size != size || + (object->vo_size != size || object->purgable == VM_PURGABLE_DENY)) || size > ANON_MAX_SIZE)) /* LP64todo: remove when dp capable */ return KERN_INVALID_ARGUMENT; @@ -1703,6 +1573,11 @@ StartAgain: ; if (anywhere) { vm_map_lock(map); map_locked = TRUE; + + if ((flags & VM_FLAGS_MAP_JIT) && (map->jit_entry_exists)){ + result = KERN_INVALID_ARGUMENT; + goto BailOut; + } /* * Calculate the first possible address. @@ -1719,15 +1594,39 @@ StartAgain: ; * address, we have to start after it. 
*/ - assert(first_free_is_valid(map)); - if (start == effective_min_offset) { - if ((entry = map->first_free) != vm_map_to_entry(map)) - start = entry->vme_end; + if( map->disable_vmentry_reuse == TRUE) { + VM_MAP_HIGHEST_ENTRY(map, entry, start); } else { - vm_map_entry_t tmp_entry; - if (vm_map_lookup_entry(map, start, &tmp_entry)) - start = tmp_entry->vme_end; - entry = tmp_entry; + assert(first_free_is_valid(map)); + + entry = map->first_free; + + if (entry == vm_map_to_entry(map)) { + entry = NULL; + } else { + if (entry->vme_next == vm_map_to_entry(map)){ + /* + * Hole at the end of the map. + */ + entry = NULL; + } else { + if (start < (entry->vme_next)->vme_start ) { + start = entry->vme_end; + } else { + /* + * Need to do a lookup. + */ + entry = NULL; + } + } + } + + if (entry == NULL) { + vm_map_entry_t tmp_entry; + if (vm_map_lookup_entry(map, start, &tmp_entry)) + start = tmp_entry->vme_end; + entry = tmp_entry; + } } /* @@ -1944,7 +1843,7 @@ StartAgain: ; (entry->vme_end == start) && (!entry->is_shared) && (!entry->is_sub_map) && - (entry->alias == alias) && + ((alias == VM_MEMORY_REALLOC) || (entry->alias == alias)) && (entry->inheritance == inheritance) && (entry->protection == cur_protection) && (entry->max_protection == max_protection) && @@ -1970,7 +1869,7 @@ StartAgain: ; */ map->size += (end - entry->vme_end); entry->vme_end = end; - UPDATE_FIRST_FREE(map, map->first_free); + vm_map_store_update_first_free(map, map->first_free); RETURN(KERN_SUCCESS); } } @@ -2009,9 +1908,17 @@ StartAgain: ; FALSE, FALSE, cur_protection, max_protection, VM_BEHAVIOR_DEFAULT, - inheritance, 0, no_cache, + (flags & VM_FLAGS_MAP_JIT)? 
VM_INHERIT_NONE: inheritance, + 0, no_cache, permanent, superpage_size); new_entry->alias = alias; + if (flags & VM_FLAGS_MAP_JIT){ + if (!(map->jit_entry_exists)){ + new_entry->used_for_jit = TRUE; + map->jit_entry_exists = TRUE; + } + } + if (is_submap) { vm_map_t submap; boolean_t submap_is_64bit; @@ -2068,7 +1975,7 @@ StartAgain: ; /* create one vm_object per superpage */ sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start)); sp_object->phys_contiguous = TRUE; - sp_object->shadow_offset = (vm_object_offset_t)pages->phys_page*PAGE_SIZE; + sp_object->vo_shadow_offset = (vm_object_offset_t)pages->phys_page*PAGE_SIZE; entry->object.vm_object = sp_object; /* enter the base pages into the object */ @@ -2225,10 +2132,10 @@ BailOut: ; entry_size = (entry2->vme_end - entry2->vme_start); - vm_map_entry_unlink(zap_old_map, + vm_map_store_entry_unlink(zap_old_map, entry2); zap_old_map->size -= entry_size; - vm_map_entry_link(map, entry1, entry2); + vm_map_store_entry_link(map, entry1, entry2); map->size += entry_size; entry1 = entry2; } @@ -2284,6 +2191,12 @@ vm_map_enter_mem_object( vm_object_t object; vm_object_size_t size; kern_return_t result; + boolean_t mask_cur_protection, mask_max_protection; + + mask_cur_protection = cur_protection & VM_PROT_IS_MASK; + mask_max_protection = max_protection & VM_PROT_IS_MASK; + cur_protection &= ~VM_PROT_IS_MASK; + max_protection &= ~VM_PROT_IS_MASK; /* * Check arguments for validity @@ -2294,7 +2207,7 @@ vm_map_enter_mem_object( (inheritance > VM_INHERIT_LAST_VALID) || initial_size == 0) return KERN_INVALID_ARGUMENT; - + map_addr = vm_map_trunc_page(*address); map_size = vm_map_round_page(initial_size); size = vm_object_round_page(initial_size); @@ -2316,6 +2229,12 @@ vm_map_enter_mem_object( return KERN_INVALID_RIGHT; size = named_entry->size - offset; } + if (mask_max_protection) { + max_protection &= named_entry->protection; + } + if (mask_cur_protection) { + cur_protection &= 
named_entry->protection; + } if ((named_entry->protection & max_protection) != max_protection) return KERN_INVALID_RIGHT; @@ -2379,7 +2298,6 @@ vm_map_enter_mem_object( unsigned int access; vm_prot_t protections; unsigned int wimg_mode; - boolean_t cache_attr; protections = named_entry->protection & VM_PROT_ALL; access = GET_MAP_MEM(named_entry->protection); @@ -2404,6 +2322,7 @@ vm_map_enter_mem_object( named_entry_unlock(named_entry); wimg_mode = object->wimg_bits; + if (access == MAP_MEM_IO) { wimg_mode = VM_WIMG_IO; } else if (access == MAP_MEM_COPYBACK) { @@ -2413,11 +2332,6 @@ vm_map_enter_mem_object( } else if (access == MAP_MEM_WCOMB) { wimg_mode = VM_WIMG_WCOMB; } - if (wimg_mode == VM_WIMG_IO || - wimg_mode == VM_WIMG_WCOMB) - cache_attr = TRUE; - else - cache_attr = FALSE; /* wait for object (if any) to be ready */ if (!named_entry->internal) { @@ -2430,22 +2344,11 @@ vm_map_enter_mem_object( } } - if (object->wimg_bits != wimg_mode) { - vm_page_t p; + if (object->wimg_bits != wimg_mode) + vm_object_change_wimg_mode(object, wimg_mode); - vm_object_paging_wait(object, THREAD_UNINT); - - object->wimg_bits = wimg_mode; - queue_iterate(&object->memq, p, vm_page_t, listq) { - if (!p->fictitious) { - if (p->pmapped) - pmap_disconnect(p->phys_page); - if (cache_attr) - pmap_sync_page_attributes_phys(p->phys_page); - } - } - } object->true_share = TRUE; + if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) object->copy_strategy = MEMORY_OBJECT_COPY_DELAY; vm_object_unlock(object); @@ -2926,8 +2829,8 @@ vm_map_enter_cpm( type_of_fault = DBG_ZERO_FILL_FAULT; - vm_fault_enter(m, pmap, va, VM_PROT_ALL, - VM_PAGE_WIRED(m), FALSE, FALSE, + vm_fault_enter(m, pmap, va, VM_PROT_ALL, VM_PROT_WRITE, + VM_PAGE_WIRED(m), FALSE, FALSE, FALSE, &type_of_fault); vm_object_unlock(cpm_obj); @@ -3037,13 +2940,13 @@ vm_map_clip_unnest( _vm_map_clip_start(&map->hdr, entry, start_unnest); - UPDATE_FIRST_FREE(map, map->first_free); + vm_map_store_update_first_free(map, 
map->first_free); } if (entry->vme_end > end_unnest) { _vm_map_clip_end(&map->hdr, entry, end_unnest); - UPDATE_FIRST_FREE(map, map->first_free); + vm_map_store_update_first_free(map, map->first_free); } pmap_unnest(map->pmap, @@ -3100,7 +3003,7 @@ vm_map_clip_start( (addr64_t)(entry->vme_end)); } _vm_map_clip_start(&map->hdr, entry, startaddr); - UPDATE_FIRST_FREE(map, map->first_free); + vm_map_store_update_first_free(map, map->first_free); } } @@ -3138,7 +3041,7 @@ _vm_map_clip_start( entry->offset += (start - entry->vme_start); entry->vme_start = start; - _vm_map_entry_link(map_header, entry->vme_prev, new_entry); + _vm_map_store_entry_link(map_header, entry->vme_prev, new_entry); if (entry->is_sub_map) vm_map_reference(new_entry->object.sub_map); @@ -3194,7 +3097,7 @@ vm_map_clip_end( (addr64_t)(entry->vme_end)); } _vm_map_clip_end(&map->hdr, entry, endaddr); - UPDATE_FIRST_FREE(map, map->first_free); + vm_map_store_update_first_free(map, map->first_free); } } @@ -3228,7 +3131,7 @@ _vm_map_clip_end( new_entry->vme_start = entry->vme_end = end; new_entry->offset += (end - entry->vme_start); - _vm_map_entry_link(map_header, entry, new_entry); + _vm_map_store_entry_link(map_header, entry, new_entry); if (entry->is_sub_map) vm_map_reference(new_entry->object.sub_map); @@ -3497,7 +3400,7 @@ vm_map_protect( #if CONFIG_EMBEDDED if (new_prot & VM_PROT_WRITE) { - if (new_prot & VM_PROT_EXECUTE) { + if ((new_prot & VM_PROT_EXECUTE) && !(current->used_for_jit)) { printf("EMBEDDED: %s can't have both write and exec at the same time\n", __FUNCTION__); new_prot &= ~VM_PROT_EXECUTE; } @@ -3541,6 +3444,11 @@ vm_map_protect( /* will include write. 
Caller must be prepared */ /* for loss of shared memory communication in the */ /* target area after taking this step */ + + if (current->is_sub_map == FALSE && current->object.vm_object == VM_OBJECT_NULL){ + current->object.vm_object = vm_object_allocate((vm_map_size_t)(current->vme_end - current->vme_start)); + current->offset = 0; + } current->needs_copy = TRUE; current->max_protection |= VM_PROT_WRITE; } @@ -3676,6 +3584,7 @@ add_wire_counts( vm_map_size_t size; if (user_wire) { + unsigned int total_wire_count = vm_page_wire_count + vm_lopage_free_count; /* * We're wiring memory at the request of the user. Check if this is the first time the user is wiring @@ -3694,8 +3603,8 @@ add_wire_counts( */ if(size + map->user_wire_size > MIN(map->user_wire_limit, vm_user_wire_limit) || - size + ptoa_64(vm_page_wire_count) > vm_global_user_wire_limit || - size + ptoa_64(vm_page_wire_count) > max_mem - vm_global_no_user_wire_amount) + size + ptoa_64(total_wire_count) > vm_global_user_wire_limit || + size + ptoa_64(total_wire_count) > max_mem - vm_global_no_user_wire_amount) return KERN_RESOURCE_SHORTAGE; /* @@ -3907,9 +3816,6 @@ vm_map_wire_nested( * Worse that can happen is, it may not exist anymore. */ if (!vm_map_lookup_entry(map, s, &first_entry)) { - if (!user_wire) - panic("vm_map_wire: re-lookup failed"); - /* * User: undo everything upto the previous * entry. 
let vm_map_unwire worry about @@ -4269,25 +4175,8 @@ vm_map_wire( kern_return_t kret; -#ifdef ppc - /* - * the calls to mapping_prealloc and mapping_relpre - * (along with the VM_MAP_RANGE_CHECK to insure a - * resonable range was passed in) are - * currently necessary because - * we haven't enabled kernel pre-emption - * and/or the pmap_enter cannot purge and re-use - * existing mappings - */ - VM_MAP_RANGE_CHECK(map, start, end); - assert((unsigned int) (end - start) == (end - start)); - mapping_prealloc((unsigned int) (end - start)); -#endif kret = vm_map_wire_nested(map, start, end, access_type, user_wire, (pmap_t)NULL, 0); -#ifdef ppc - mapping_relpre(); -#endif return kret; } @@ -4677,7 +4566,7 @@ vm_map_entry_delete( object = entry->object.vm_object; } - vm_map_entry_unlink(map, entry); + vm_map_store_entry_unlink(map, entry); map->size -= e - s; vm_map_entry_dispose(map, entry); @@ -4735,7 +4624,7 @@ vm_map_submap_pmap_clean( && (entry->object.vm_object != NULL)) { vm_object_pmap_protect( entry->object.vm_object, - entry->offset, + entry->offset+(offset-entry->vme_start), remove_size, PMAP_NULL, entry->vme_start, @@ -5192,9 +5081,9 @@ vm_map_delete( * these entries. */ /* unlink the entry from "map" ... */ - vm_map_entry_unlink(map, entry); + vm_map_store_entry_unlink(map, entry); /* ... 
and add it to the end of the "zap_map" */ - vm_map_entry_link(zap_map, + vm_map_store_entry_link(zap_map, vm_map_last_entry(zap_map), entry); entry_size = entry->vme_end - entry->vme_start; @@ -5569,7 +5458,8 @@ vm_map_copy_overwrite_nested( vm_map_address_t dst_addr, vm_map_copy_t copy, boolean_t interruptible, - pmap_t pmap) + pmap_t pmap, + boolean_t discard_on_success) { vm_map_offset_t dst_end; vm_map_entry_t tmp_entry; @@ -5609,7 +5499,8 @@ vm_map_copy_overwrite_nested( assert(copy->type == VM_MAP_COPY_ENTRY_LIST); if (copy->size == 0) { - vm_map_copy_discard(copy); + if (discard_on_success) + vm_map_copy_discard(copy); return(KERN_SUCCESS); } @@ -5942,20 +5833,23 @@ vm_map_copy_overwrite_nested( sub_start, copy, interruptible, - entry->object.sub_map->pmap); + entry->object.sub_map->pmap, + TRUE); } else if (pmap != NULL) { kr = vm_map_copy_overwrite_nested( entry->object.sub_map, sub_start, copy, - interruptible, pmap); + interruptible, pmap, + TRUE); } else { kr = vm_map_copy_overwrite_nested( entry->object.sub_map, sub_start, copy, interruptible, - dst_map->pmap); + dst_map->pmap, + TRUE); } if(kr != KERN_SUCCESS) { if(next_copy != NULL) { @@ -6148,7 +6042,8 @@ vm_map_copy_overwrite_nested( /* * Throw away the vm_map_copy object */ - vm_map_copy_discard(copy); + if (discard_on_success) + vm_map_copy_discard(copy); return(KERN_SUCCESS); }/* vm_map_copy_overwrite */ @@ -6160,8 +6055,236 @@ vm_map_copy_overwrite( vm_map_copy_t copy, boolean_t interruptible) { - return vm_map_copy_overwrite_nested( - dst_map, dst_addr, copy, interruptible, (pmap_t) NULL); + vm_map_size_t head_size, tail_size; + vm_map_copy_t head_copy, tail_copy; + vm_map_offset_t head_addr, tail_addr; + vm_map_entry_t entry; + kern_return_t kr; + + head_size = 0; + tail_size = 0; + head_copy = NULL; + tail_copy = NULL; + head_addr = 0; + tail_addr = 0; + + if (interruptible || + copy == VM_MAP_COPY_NULL || + copy->type != VM_MAP_COPY_ENTRY_LIST) { + /* + * We can't split the "copy" map if 
we're interruptible + * or if we don't have a "copy" map... + */ + blunt_copy: + return vm_map_copy_overwrite_nested(dst_map, + dst_addr, + copy, + interruptible, + (pmap_t) NULL, + TRUE); + } + + if (copy->size < 3 * PAGE_SIZE) { + /* + * Too small to bother with optimizing... + */ + goto blunt_copy; + } + + if ((dst_addr & PAGE_MASK) != (copy->offset & PAGE_MASK)) { + /* + * Incompatible mis-alignment of source and destination... + */ + goto blunt_copy; + } + + /* + * Proper alignment or identical mis-alignment at the beginning. + * Let's try and do a small unaligned copy first (if needed) + * and then an aligned copy for the rest. + */ + if (!page_aligned(dst_addr)) { + head_addr = dst_addr; + head_size = PAGE_SIZE - (copy->offset & PAGE_MASK); + } + if (!page_aligned(copy->offset + copy->size)) { + /* + * Mis-alignment at the end. + * Do an aligned copy up to the last page and + * then an unaligned copy for the remaining bytes. + */ + tail_size = (copy->offset + copy->size) & PAGE_MASK; + tail_addr = dst_addr + copy->size - tail_size; + } + + if (head_size + tail_size == copy->size) { + /* + * It's all unaligned, no optimization possible... + */ + goto blunt_copy; + } + + /* + * Can't optimize if there are any submaps in the + * destination due to the way we free the "copy" map + * progressively in vm_map_copy_overwrite_nested() + * in that case. + */ + vm_map_lock_read(dst_map); + if (! vm_map_lookup_entry(dst_map, dst_addr, &entry)) { + vm_map_unlock_read(dst_map); + goto blunt_copy; + } + for (; + (entry != vm_map_copy_to_entry(copy) && + entry->vme_start < dst_addr + copy->size); + entry = entry->vme_next) { + if (entry->is_sub_map) { + vm_map_unlock_read(dst_map); + goto blunt_copy; + } + } + vm_map_unlock_read(dst_map); + + if (head_size) { + /* + * Unaligned copy of the first "head_size" bytes, to reach + * a page boundary. + */ + + /* + * Extract "head_copy" out of "copy". 
+ */ + head_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone); + vm_map_copy_first_entry(head_copy) = + vm_map_copy_to_entry(head_copy); + vm_map_copy_last_entry(head_copy) = + vm_map_copy_to_entry(head_copy); + head_copy->type = VM_MAP_COPY_ENTRY_LIST; + head_copy->cpy_hdr.nentries = 0; + head_copy->cpy_hdr.entries_pageable = + copy->cpy_hdr.entries_pageable; + vm_map_store_init(&head_copy->cpy_hdr); + + head_copy->offset = copy->offset; + head_copy->size = head_size; + + copy->offset += head_size; + copy->size -= head_size; + + entry = vm_map_copy_first_entry(copy); + vm_map_copy_clip_end(copy, entry, copy->offset); + vm_map_copy_entry_unlink(copy, entry); + vm_map_copy_entry_link(head_copy, + vm_map_copy_to_entry(head_copy), + entry); + + /* + * Do the unaligned copy. + */ + kr = vm_map_copy_overwrite_nested(dst_map, + head_addr, + head_copy, + interruptible, + (pmap_t) NULL, + FALSE); + if (kr != KERN_SUCCESS) + goto done; + } + + if (tail_size) { + /* + * Extract "tail_copy" out of "copy". + */ + tail_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone); + vm_map_copy_first_entry(tail_copy) = + vm_map_copy_to_entry(tail_copy); + vm_map_copy_last_entry(tail_copy) = + vm_map_copy_to_entry(tail_copy); + tail_copy->type = VM_MAP_COPY_ENTRY_LIST; + tail_copy->cpy_hdr.nentries = 0; + tail_copy->cpy_hdr.entries_pageable = + copy->cpy_hdr.entries_pageable; + vm_map_store_init(&tail_copy->cpy_hdr); + + tail_copy->offset = copy->offset + copy->size - tail_size; + tail_copy->size = tail_size; + + copy->size -= tail_size; + + entry = vm_map_copy_last_entry(copy); + vm_map_copy_clip_start(copy, entry, tail_copy->offset); + entry = vm_map_copy_last_entry(copy); + vm_map_copy_entry_unlink(copy, entry); + vm_map_copy_entry_link(tail_copy, + vm_map_copy_last_entry(tail_copy), + entry); + } + + /* + * Copy most (or possibly all) of the data. 
+ */ + kr = vm_map_copy_overwrite_nested(dst_map, + dst_addr + head_size, + copy, + interruptible, + (pmap_t) NULL, + FALSE); + if (kr != KERN_SUCCESS) { + goto done; + } + + if (tail_size) { + kr = vm_map_copy_overwrite_nested(dst_map, + tail_addr, + tail_copy, + interruptible, + (pmap_t) NULL, + FALSE); + } + +done: + assert(copy->type == VM_MAP_COPY_ENTRY_LIST); + if (kr == KERN_SUCCESS) { + /* + * Discard all the copy maps. + */ + if (head_copy) { + vm_map_copy_discard(head_copy); + head_copy = NULL; + } + vm_map_copy_discard(copy); + if (tail_copy) { + vm_map_copy_discard(tail_copy); + tail_copy = NULL; + } + } else { + /* + * Re-assemble the original copy map. + */ + if (head_copy) { + entry = vm_map_copy_first_entry(head_copy); + vm_map_copy_entry_unlink(head_copy, entry); + vm_map_copy_entry_link(copy, + vm_map_copy_to_entry(copy), + entry); + copy->offset -= head_size; + copy->size += head_size; + vm_map_copy_discard(head_copy); + head_copy = NULL; + } + if (tail_copy) { + entry = vm_map_copy_last_entry(tail_copy); + vm_map_copy_entry_unlink(tail_copy, entry); + vm_map_copy_entry_link(copy, + vm_map_copy_last_entry(copy), + entry); + copy->size += tail_size; + vm_map_copy_discard(tail_copy); + tail_copy = NULL; + } + } + return kr; } @@ -6832,19 +6955,8 @@ vm_map_copyout_kernel_buffer( */ #define vm_map_copy_insert(map, where, copy) \ MACRO_BEGIN \ - vm_map_t VMCI_map; \ - vm_map_entry_t VMCI_where; \ - vm_map_copy_t VMCI_copy; \ - VMCI_map = (map); \ - VMCI_where = (where); \ - VMCI_copy = (copy); \ - ((VMCI_where->vme_next)->vme_prev = vm_map_copy_last_entry(VMCI_copy))\ - ->vme_next = (VMCI_where->vme_next); \ - ((VMCI_where)->vme_next = vm_map_copy_first_entry(VMCI_copy)) \ - ->vme_prev = VMCI_where; \ - VMCI_map->hdr.nentries += VMCI_copy->cpy_hdr.nentries; \ - UPDATE_FIRST_FREE(VMCI_map, VMCI_map->first_free); \ - zfree(vm_map_copy_zone, VMCI_copy); \ + vm_map_store_copy_insert(map, where, copy); \ + zfree(vm_map_copy_zone, copy); \ MACRO_END /* @@ 
-6928,9 +7040,14 @@ vm_map_copyout( StartAgain: ; vm_map_lock(dst_map); - assert(first_free_is_valid(dst_map)); - start = ((last = dst_map->first_free) == vm_map_to_entry(dst_map)) ? + if( dst_map->disable_vmentry_reuse == TRUE) { + VM_MAP_HIGHEST_ENTRY(dst_map, entry, start); + last = entry; + } else { + assert(first_free_is_valid(dst_map)); + start = ((last = dst_map->first_free) == vm_map_to_entry(dst_map)) ? vm_map_min(dst_map) : last->vme_end; + } while (TRUE) { vm_map_entry_t next = last->vme_next; @@ -6984,11 +7101,8 @@ StartAgain: ; * Reinitialize the copy so that vm_map_copy_entry_link * will work. */ - copy->cpy_hdr.nentries = 0; + vm_map_store_copy_reset(copy, entry); copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable; - vm_map_copy_first_entry(copy) = - vm_map_copy_last_entry(copy) = - vm_map_copy_to_entry(copy); /* * Copy each entry. @@ -7087,8 +7201,8 @@ StartAgain: ; type_of_fault = DBG_CACHE_HIT_FAULT; - vm_fault_enter(m, dst_map->pmap, va, prot, - VM_PAGE_WIRED(m), FALSE, FALSE, + vm_fault_enter(m, dst_map->pmap, va, prot, prot, + VM_PAGE_WIRED(m), FALSE, FALSE, FALSE, &type_of_fault); vm_object_unlock(object); @@ -7257,6 +7371,8 @@ vm_map_copyin_common( copy->cpy_hdr.nentries = 0; copy->cpy_hdr.entries_pageable = TRUE; + vm_map_store_init( &(copy->cpy_hdr) ); + copy->offset = src_addr; copy->size = len; @@ -7820,7 +7936,7 @@ vm_map_fork_share( object->shadowed || /* case 2 */ (!object->true_share && /* case 3 */ !old_entry->is_shared && - (object->size > + (object->vo_size > (vm_map_size_t)(old_entry->vme_end - old_entry->vme_start)))) { @@ -7900,7 +8016,6 @@ vm_map_fork_share( * (This is a preemptive version of * case 2.) 
*/ - vm_object_shadow(&old_entry->object.vm_object, &old_entry->offset, (vm_map_size_t) (old_entry->vme_end - @@ -7941,6 +8056,7 @@ vm_map_fork_share( old_entry->needs_copy = FALSE; object = old_entry->object.vm_object; } + /* * If object was using a symmetric copy strategy, @@ -7980,7 +8096,7 @@ vm_map_fork_share( * map. */ - vm_map_entry_link(new_map, vm_map_last_entry(new_map), new_entry); + vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry); /* * Update the physical map @@ -8110,7 +8226,6 @@ vm_map_fork( old_map->min_offset, old_map->max_offset, old_map->hdr.entries_pageable); - for ( old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map); @@ -8192,7 +8307,7 @@ vm_map_fork( * of the map. */ - vm_map_entry_link(new_map, vm_map_last_entry(new_map), + vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry); new_size += entry_size; break; @@ -8284,9 +8399,22 @@ vm_map_lookup_locked( vm_map_offset_t old_start = 0; vm_map_offset_t old_end = 0; register vm_prot_t prot; + boolean_t mask_protections; + vm_prot_t original_fault_type; + + /* + * VM_PROT_IS_MASK means that the caller wants us to use "fault_type" + * as a mask against the mapping's actual protections, not as an + * absolute value. + */ + mask_protections = (fault_type & VM_PROT_IS_MASK) ?
TRUE : FALSE; + fault_type &= ~VM_PROT_IS_MASK; + original_fault_type = fault_type; *real_map = map; -RetryLookup: ; + +RetryLookup: + fault_type = original_fault_type; /* * If the map has an interesting hint, try it before calling @@ -8614,7 +8742,14 @@ RetryLookup: ; prot |= VM_PROT_EXECUTE; } + if (mask_protections) { + fault_type &= prot; + if (fault_type == VM_PROT_NONE) { + goto protection_failure; + } + } if ((fault_type & (prot)) != fault_type) { + protection_failure: if (*real_map != map) { vm_map_unlock(*real_map); } @@ -8716,6 +8851,8 @@ RetryLookup: ; fault_info->hi_offset = (entry->vme_end - entry->vme_start) + entry->offset; fault_info->no_cache = entry->no_cache; fault_info->stealth = FALSE; + fault_info->io_sync = FALSE; + fault_info->cs_bypass = (entry->used_for_jit)? TRUE : FALSE; fault_info->mark_zf_absent = FALSE; } @@ -8797,20 +8934,26 @@ vm_map_region_recurse_64( * "curr_entry" is the VM map entry preceding or including the * address we're looking for. * "curr_map" is the map or sub-map containing "curr_entry". + * "curr_address" is the equivalent of the top map's "user_address" + * in the current map. * "curr_offset" is the cumulated offset of "curr_map" in the * target task's address space. * "curr_depth" is the depth of "curr_map" in the chain of * sub-maps. - * "curr_max_offset" is the maximum offset we should take into - * account in the current map. It may be smaller than the current - * map's "max_offset" because we might not have mapped it all in - * the upper level map. + * + * "curr_max_below" and "curr_max_above" limit the range (around + * "curr_address") we should take into account in the current (sub)map. + * They limit the range to what's visible through the map entries + * we've traversed from the top map to the current map. 
+ */ vm_map_entry_t curr_entry; + vm_map_address_t curr_address; vm_map_offset_t curr_offset; vm_map_t curr_map; unsigned int curr_depth; - vm_map_offset_t curr_max_offset; + vm_map_offset_t curr_max_below, curr_max_above; + vm_map_offset_t curr_skip; /* * "next_" is the same as "curr_" but for the VM region immediately @@ -8820,9 +8963,11 @@ vm_map_region_recurse_64( */ vm_map_entry_t next_entry; vm_map_offset_t next_offset; + vm_map_offset_t next_address; vm_map_t next_map; unsigned int next_depth; - vm_map_offset_t next_max_offset; + vm_map_offset_t next_max_below, next_max_above; + vm_map_offset_t next_skip; boolean_t look_for_pages; vm_region_submap_short_info_64_t short_info; @@ -8857,15 +9002,21 @@ vm_map_region_recurse_64( curr_entry = NULL; curr_map = map; + curr_address = user_address; curr_offset = 0; + curr_skip = 0; curr_depth = 0; - curr_max_offset = curr_map->max_offset; + curr_max_above = ((vm_map_offset_t) -1) - curr_address; + curr_max_below = curr_address; next_entry = NULL; next_map = NULL; + next_address = 0; next_offset = 0; + next_skip = 0; next_depth = 0; - next_max_offset = curr_max_offset; + next_max_above = (vm_map_offset_t) -1; + next_max_below = (vm_map_offset_t) -1; if (not_in_kdp) { vm_map_lock_read(curr_map); @@ -8873,19 +9024,22 @@ vm_map_region_recurse_64( for (;;) { if (vm_map_lookup_entry(curr_map, - user_address - curr_offset, + curr_address, &tmp_entry)) { /* tmp_entry contains the address we're looking for */ curr_entry = tmp_entry; } else { + vm_map_offset_t skip; /* * The address is not mapped. "tmp_entry" is the * map entry preceding the address. We want the next * one, if it exists. 
*/ curr_entry = tmp_entry->vme_next; + if (curr_entry == vm_map_to_entry(curr_map) || - curr_entry->vme_start >= curr_max_offset) { + (curr_entry->vme_start >= + curr_address + curr_max_above)) { /* no next entry at this level: stop looking */ if (not_in_kdp) { vm_map_unlock_read(curr_map); @@ -8894,9 +9048,18 @@ vm_map_region_recurse_64( curr_map = NULL; curr_offset = 0; curr_depth = 0; - curr_max_offset = 0; + curr_max_above = 0; + curr_max_below = 0; break; } + + /* adjust current address and offset */ + skip = curr_entry->vme_start - curr_address; + curr_address = curr_entry->vme_start; + curr_skip = skip; + curr_offset += skip; + curr_max_above -= skip; + curr_max_below = 0; } /* @@ -8907,7 +9070,8 @@ vm_map_region_recurse_64( tmp_entry = curr_entry->vme_next; if (tmp_entry == vm_map_to_entry(curr_map)) { /* no next entry at this level */ - } else if (tmp_entry->vme_start >= curr_max_offset) { + } else if (tmp_entry->vme_start >= + curr_address + curr_max_above) { /* * tmp_entry is beyond the scope of what we mapped of * this submap in the upper level: ignore it. @@ -8928,11 +9092,32 @@ vm_map_region_recurse_64( } next_entry = tmp_entry; next_map = curr_map; - next_offset = curr_offset; next_depth = curr_depth; - next_max_offset = curr_max_offset; + next_address = next_entry->vme_start; + next_skip = curr_skip; + next_offset = curr_offset; + next_offset += (next_address - curr_address); + next_max_above = MIN(next_max_above, curr_max_above); + next_max_above = MIN(next_max_above, + next_entry->vme_end - next_address); + next_max_below = MIN(next_max_below, curr_max_below); + next_max_below = MIN(next_max_below, + next_address - next_entry->vme_start); } + /* + * "curr_max_{above,below}" allow us to keep track of the + * portion of the submap that is actually mapped at this level: + * the rest of that submap is irrelevant to us, since it's not + * mapped here. + * The relevant portion of the map starts at + * "curr_entry->offset" up to the size of "curr_entry". 
+ */ + curr_max_above = MIN(curr_max_above, + curr_entry->vme_end - curr_address); + curr_max_below = MIN(curr_max_below, + curr_address - curr_entry->vme_start); + if (!curr_entry->is_sub_map || curr_depth >= user_max_depth) { /* @@ -8973,21 +9158,11 @@ vm_map_region_recurse_64( * space (i.e. the top-level VM map). */ curr_offset += - (curr_entry->vme_start - curr_entry->offset); + (curr_entry->offset - curr_entry->vme_start); + curr_address = user_address + curr_offset; /* switch to the submap */ curr_map = curr_entry->object.sub_map; curr_depth++; - /* - * "curr_max_offset" allows us to keep track of the - * portion of the submap that is actually mapped at this level: - * the rest of that submap is irrelevant to us, since it's not - * mapped here. - * The relevant portion of the map starts at - * "curr_entry->offset" up to the size of "curr_entry". - */ - curr_max_offset = - curr_entry->vme_end - curr_entry->vme_start + - curr_entry->offset; curr_entry = NULL; } @@ -9000,9 +9175,15 @@ vm_map_region_recurse_64( /* ... gather info about the next VM region */ curr_entry = next_entry; curr_map = next_map; /* still locked ... 
*/ + curr_address = next_address; + curr_skip = next_skip; curr_offset = next_offset; curr_depth = next_depth; - curr_max_offset = next_max_offset; + curr_max_above = next_max_above; + curr_max_below = next_max_below; + if (curr_map == map) { + user_address = curr_address; + } } else { /* we won't need "next_entry" after all */ if (next_entry != NULL) { @@ -9015,12 +9196,14 @@ vm_map_region_recurse_64( next_entry = NULL; next_map = NULL; next_offset = 0; + next_skip = 0; next_depth = 0; - next_max_offset = 0; + next_max_below = -1; + next_max_above = -1; *nesting_depth = curr_depth; - *size = curr_entry->vme_end - curr_entry->vme_start; - *address = curr_entry->vme_start + curr_offset; + *size = curr_max_above + curr_max_below; + *address = user_address + curr_skip - curr_max_below; // LP64todo: all the current tools are 32bit, obviously never worked for 64b // so probably should be a real 32b ID vs. ptr. @@ -9058,12 +9241,18 @@ vm_map_region_recurse_64( if (not_in_kdp) { if (!curr_entry->is_sub_map) { + vm_map_offset_t range_start, range_end; + range_start = MAX((curr_address - curr_max_below), + curr_entry->vme_start); + range_end = MIN((curr_address + curr_max_above), + curr_entry->vme_end); vm_map_region_walk(curr_map, - curr_entry->vme_start, + range_start, curr_entry, - curr_entry->offset, - (curr_entry->vme_end - - curr_entry->vme_start), + (curr_entry->offset + + (range_start - + curr_entry->vme_start)), + range_end - range_start, &extended, look_for_pages); if (extended.external_pager && @@ -9372,7 +9561,11 @@ vm_map_region_top_walk( top->ref_count += ref_count - 1; } } else { - if (entry->needs_copy) { + if (entry->superpage_size) { + top->share_mode = SM_LARGE_PAGE; + top->shared_pages_resident = 0; + top->private_pages_resident = entry_size; + } else if (entry->needs_copy) { top->share_mode = SM_COW; top->shared_pages_resident = OBJ_RESIDENT_COUNT(obj, entry_size); @@ -9418,11 +9611,23 @@ vm_map_region_walk( if ((entry->object.vm_object == 0) || 
(entry->is_sub_map) || - (entry->object.vm_object->phys_contiguous)) { + (entry->object.vm_object->phys_contiguous && + !entry->superpage_size)) { extended->share_mode = SM_EMPTY; extended->ref_count = 0; return; } + + if (entry->superpage_size) { + extended->shadow_depth = 0; + extended->share_mode = SM_LARGE_PAGE; + extended->ref_count = 1; + extended->external_pager = 0; + extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT); + extended->shadow_depth = 0; + return; + } + { obj = entry->object.vm_object; @@ -9619,7 +9824,7 @@ vm_map_region_look_for_page( if(object != caller_object) vm_object_unlock(object); - offset = offset + object->shadow_offset; + offset = offset + object->vo_shadow_offset; object = shadow; shadow = object->shadow; continue; @@ -9723,7 +9928,7 @@ vm_map_simplify_entry( (prev_entry->is_shared == FALSE) && (this_entry->is_shared == FALSE) ) { - _vm_map_entry_unlink(&map->hdr, prev_entry); + _vm_map_store_entry_unlink(&map->hdr, prev_entry); this_entry->vme_start = prev_entry->vme_start; this_entry->offset = prev_entry->offset; if (prev_entry->is_sub_map) { @@ -9891,7 +10096,7 @@ vm_map_machine_attribute( attribute, value); } else if (object->shadow) { - offset = offset + object->shadow_offset; + offset = offset + object->vo_shadow_offset; last_object = object; object = object->shadow; vm_object_lock(last_object->shadow); @@ -9947,6 +10152,12 @@ vm_map_behavior_set( "vm_map_behavior_set, 0x%X start 0x%X end 0x%X behavior %d", map, start, end, new_behavior, 0); + if (start > end || + start < vm_map_min(map) || + end > vm_map_max(map)) { + return KERN_NO_SPACE; + } + switch (new_behavior) { /* @@ -10059,6 +10270,8 @@ vm_map_willneed( fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL; fault_info.no_cache = FALSE; /* ignored value */ fault_info.stealth = TRUE; + fault_info.io_sync = FALSE; + fault_info.cs_bypass = FALSE; fault_info.mark_zf_absent = FALSE; /* @@ -10074,106 +10287,130 @@ vm_map_willneed( * an error. 
*/ - if (vm_map_range_check(map, start, end, &entry)) { + if (! vm_map_range_check(map, start, end, &entry)) { + vm_map_unlock_read(map); + return KERN_INVALID_ADDRESS; + } + /* + * Examine each vm_map_entry_t in the range. + */ + for (; entry != vm_map_to_entry(map) && start < end; ) { + /* - * Examine each vm_map_entry_t in the range. + * The first time through, the start address could be anywhere + * within the vm_map_entry we found. So adjust the offset to + * correspond. After that, the offset will always be zero to + * correspond to the beginning of the current vm_map_entry. */ + offset = (start - entry->vme_start) + entry->offset; - for (; entry->vme_start < end; start += len, entry = entry->vme_next) { - - /* - * The first time through, the start address could be anywhere within the - * vm_map_entry we found. So adjust the offset to correspond. After that, - * the offset will always be zero to correspond to the beginning of the current - * vm_map_entry. - */ - - offset = (start - entry->vme_start) + entry->offset; - - /* - * Set the length so we don't go beyond the end of the map_entry or beyond the - * end of the range we were given. This range could span also multiple map - * entries all of which map different files, so make sure we only do the right - * amount of I/O for each object. Note that it's possible for there to be - * multiple map entries all referring to the same object but with different - * page permissions, but it's not worth trying to optimize that case. - */ - - len = MIN(entry->vme_end - start, end - start); + /* + * Set the length so we don't go beyond the end of the + * map_entry or beyond the end of the range we were given. + * This range could span also multiple map entries all of which + * map different files, so make sure we only do the right amount + * of I/O for each object. 
Note that it's possible for there + * to be multiple map entries all referring to the same object + * but with different page permissions, but it's not worth + * trying to optimize that case. + */ + len = MIN(entry->vme_end - start, end - start); - if ((vm_size_t) len != len) { - /* 32-bit overflow */ - len = (vm_size_t) (0 - PAGE_SIZE); - } - fault_info.cluster_size = (vm_size_t) len; - fault_info.lo_offset = offset; - fault_info.hi_offset = offset + len; - fault_info.user_tag = entry->alias; + if ((vm_size_t) len != len) { + /* 32-bit overflow */ + len = (vm_size_t) (0 - PAGE_SIZE); + } + fault_info.cluster_size = (vm_size_t) len; + fault_info.lo_offset = offset; + fault_info.hi_offset = offset + len; + fault_info.user_tag = entry->alias; - /* - * If there's no read permission to this mapping, then just skip it. - */ + /* + * If there's no read permission to this mapping, then just + * skip it. + */ + if ((entry->protection & VM_PROT_READ) == 0) { + entry = entry->vme_next; + start = entry->vme_start; + continue; + } - if ((entry->protection & VM_PROT_READ) == 0) { - continue; - } + /* + * Find the file object backing this map entry. If there is + * none, then we simply ignore the "will need" advice for this + * entry and go on to the next one. + */ + if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) { + entry = entry->vme_next; + start = entry->vme_start; + continue; + } - /* - * Find the file object backing this map entry. If there is none, - * then we simply ignore the "will need" advice for this entry and - * go on to the next one. - */ + /* + * The data_request() could take a long time, so let's + * release the map lock to avoid blocking other threads. 
+ */ + vm_map_unlock_read(map); - if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) { - continue; - } + vm_object_paging_begin(object); + pager = object->pager; + vm_object_unlock(object); - vm_object_paging_begin(object); - pager = object->pager; - vm_object_unlock(object); + /* + * Get the data from the object asynchronously. + * + * Note that memory_object_data_request() places limits on the + * amount of I/O it will do. Regardless of the len we + * specified, it won't do more than MAX_UPL_TRANSFER and it + * silently truncates the len to that size. This isn't + * necessarily bad since madvise shouldn't really be used to + * page in unlimited amounts of data. Other Unix variants + * limit the willneed case as well. If this turns out to be an + * issue for developers, then we can always adjust the policy + * here and still be backwards compatible since this is all + * just "advice". + */ + kr = memory_object_data_request( + pager, + offset + object->paging_offset, + 0, /* ignored */ + VM_PROT_READ, + (memory_object_fault_info_t)&fault_info); - /* - * Get the data from the object asynchronously. - * - * Note that memory_object_data_request() places limits on the amount - * of I/O it will do. Regardless of the len we specified, it won't do - * more than MAX_UPL_TRANSFER and it silently truncates the len to that - * size. This isn't necessarily bad since madvise shouldn't really be - * used to page in unlimited amounts of data. Other Unix variants limit - * the willneed case as well. If this turns out to be an issue for - * developers, then we can always adjust the policy here and still be - * backwards compatible since this is all just "advice". 
- */ + vm_object_lock(object); + vm_object_paging_end(object); + vm_object_unlock(object); - kr = memory_object_data_request( - pager, - offset + object->paging_offset, - 0, /* ignored */ - VM_PROT_READ, - (memory_object_fault_info_t)&fault_info); + /* + * If we couldn't do the I/O for some reason, just give up on + * the madvise. We still return success to the user since + * madvise isn't supposed to fail when the advice can't be + * taken. + */ + if (kr != KERN_SUCCESS) { + return KERN_SUCCESS; + } - vm_object_lock(object); - vm_object_paging_end(object); - vm_object_unlock(object); + start += len; + if (start >= end) { + /* done */ + return KERN_SUCCESS; + } + /* look up next entry */ + vm_map_lock_read(map); + if (! vm_map_lookup_entry(map, start, &entry)) { /* - * If we couldn't do the I/O for some reason, just give up on the - * madvise. We still return success to the user since madvise isn't - * supposed to fail when the advice can't be taken. + * There's a new hole in the address range. */ - - if (kr != KERN_SUCCESS) { - break; - } + vm_map_unlock_read(map); + return KERN_INVALID_ADDRESS; } - - kr = KERN_SUCCESS; - } else - kr = KERN_INVALID_ADDRESS; + } vm_map_unlock_read(map); - return kr; + return KERN_SUCCESS; } static boolean_t @@ -10208,7 +10445,7 @@ vm_map_entry_is_reusable( object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC && object->internal && !object->true_share && - object->wimg_bits == VM_WIMG_DEFAULT && + object->wimg_bits == VM_WIMG_USE_DEFAULT && !object->code_signed) { return TRUE; } @@ -10798,12 +11035,13 @@ vm_map_entry_insert( new_entry->no_cache = no_cache; new_entry->permanent = permanent; new_entry->superpage_size = superpage_size; + new_entry->used_for_jit = FALSE; /* * Insert the new entry into the list. 
*/ - vm_map_entry_link(map, insp_entry, new_entry); + vm_map_store_entry_link(map, insp_entry, new_entry); map->size += end - start; /* @@ -10866,6 +11104,8 @@ vm_map_remap_extract( map_header->nentries = 0; map_header->entries_pageable = pageable; + vm_map_store_init( map_header ); + *cur_protection = VM_PROT_ALL; *max_protection = VM_PROT_ALL; @@ -10923,7 +11163,7 @@ vm_map_remap_extract( } else if (src_entry->needs_copy || object->shadowed || (object->internal && !object->true_share && !src_entry->is_shared && - object->size > entry_size)) { + object->vo_size > entry_size)) { vm_object_shadow(&src_entry->object.vm_object, &src_entry->offset, @@ -11109,12 +11349,14 @@ vm_map_remap_extract( } } - _vm_map_entry_link(map_header, + _vm_map_store_entry_link(map_header, map_header->links.prev, new_entry); - *cur_protection &= src_entry->protection; - *max_protection &= src_entry->max_protection; - + /*Protections for submap mapping are irrelevant here*/ + if( !src_entry->is_sub_map ) { + *cur_protection &= src_entry->protection; + *max_protection &= src_entry->max_protection; + } map_address += tmp_size; mapped_size += tmp_size; src_start += tmp_size; @@ -11130,7 +11372,7 @@ vm_map_remap_extract( src_entry != (struct vm_map_entry *)&map_header->links; src_entry = new_entry) { new_entry = src_entry->vme_next; - _vm_map_entry_unlink(map_header, src_entry); + _vm_map_store_entry_unlink(map_header, src_entry); vm_object_deallocate(src_entry->object.vm_object); _vm_map_entry_dispose(map_header, src_entry); } @@ -11210,11 +11452,11 @@ vm_map_remap( entry != (struct vm_map_entry *)&map_header.links; entry = new_entry) { new_entry = entry->vme_next; - _vm_map_entry_unlink(&map_header, entry); + _vm_map_store_entry_unlink(&map_header, entry); if (result == KERN_SUCCESS) { entry->vme_start += *address; entry->vme_end += *address; - vm_map_entry_link(target_map, insp_entry, entry); + vm_map_store_entry_link(target_map, insp_entry, entry); insp_entry = entry; } else { if 
(!entry->is_sub_map) { @@ -11226,6 +11468,12 @@ vm_map_remap( } } + if( target_map->disable_vmentry_reuse == TRUE) { + if( target_map->highest_entry_end < insp_entry->vme_end ){ + target_map->highest_entry_end = insp_entry->vme_end; + } + } + if (result == KERN_SUCCESS) { target_map->size += size; SAVE_HINT_MAP_WRITE(target_map, insp_entry); @@ -11284,15 +11532,19 @@ StartAgain: ; * address, we have to start after it. */ - assert(first_free_is_valid(map)); - if (start == map->min_offset) { - if ((entry = map->first_free) != vm_map_to_entry(map)) - start = entry->vme_end; + if( map->disable_vmentry_reuse == TRUE) { + VM_MAP_HIGHEST_ENTRY(map, entry, start); } else { - vm_map_entry_t tmp_entry; - if (vm_map_lookup_entry(map, start, &tmp_entry)) - start = tmp_entry->vme_end; - entry = tmp_entry; + assert(first_free_is_valid(map)); + if (start == map->min_offset) { + if ((entry = map->first_free) != vm_map_to_entry(map)) + start = entry->vme_end; + } else { + vm_map_entry_t tmp_entry; + if (vm_map_lookup_entry(map, start, &tmp_entry)) + start = tmp_entry->vme_end; + entry = tmp_entry; + } } /* @@ -11671,7 +11923,7 @@ vm_map_purgable_control( vm_object_lock(object); if (entry->offset != 0 || - entry->vme_end - entry->vme_start != object->size) { + entry->vme_end - entry->vme_start != object->vo_size) { /* * Can only apply purgable controls to the whole (existing) * object at once. @@ -11737,11 +11989,18 @@ vm_map_page_info( vm_object_id_t object_id; vm_page_info_basic_t basic_info; int depth; + vm_map_offset_t offset_in_page; switch (flavor) { case VM_PAGE_INFO_BASIC: if (*count != VM_PAGE_INFO_BASIC_COUNT) { - return KERN_INVALID_ARGUMENT; + /* + * The "vm_page_info_basic_data" structure was not + * properly padded, so allow the size to be off by + * one to maintain backwards binary compatibility... 
+ */ + if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) + return KERN_INVALID_ARGUMENT; } break; default: @@ -11755,6 +12014,7 @@ vm_map_page_info( depth = 0; retval = KERN_SUCCESS; + offset_in_page = offset & PAGE_MASK; offset = vm_map_trunc_page(offset); vm_map_lock_read(map); @@ -11861,7 +12121,7 @@ vm_map_page_info( if (object->shadow != VM_OBJECT_NULL) { vm_object_t shadow; - offset += object->shadow_offset; + offset += object->vo_shadow_offset; shadow = object->shadow; vm_object_lock(shadow); @@ -11926,7 +12186,8 @@ vm_map_page_info( basic_info->disposition = disposition; basic_info->ref_count = ref_count; basic_info->object_id = (vm_object_id_t) (uintptr_t) object; - basic_info->offset = (memory_object_offset_t) offset; + basic_info->offset = + (memory_object_offset_t) offset + offset_in_page; basic_info->depth = depth; break; } @@ -12457,6 +12718,15 @@ vm_map_disable_NX(vm_map_t map) pmap_disable_NX(map->pmap); } +void +vm_map_disallow_data_exec(vm_map_t map) +{ /* Sets the map_disallow_data_exec flag on the given map; a NULL map is silently ignored. */ + if (map == NULL) + return; + + map->map_disallow_data_exec = TRUE; +} + /* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS) * more descriptive.
*/ @@ -12506,7 +12776,7 @@ vm_map_has_4GB_pagezero( void vm_map_set_4GB_pagezero(vm_map_t map) { -#ifdef __i386__ +#if defined(__i386__) pmap_set_4GB_pagezero(map->pmap); #else #pragma unused(map) @@ -12517,7 +12787,7 @@ vm_map_set_4GB_pagezero(vm_map_t map) void vm_map_clear_4GB_pagezero(vm_map_t map) { -#ifdef __i386__ +#if defined(__i386__) pmap_clear_4GB_pagezero(map->pmap); #else #pragma unused(map) @@ -12681,3 +12951,184 @@ kern_return_t vm_map_sign(vm_map_t map, return KERN_SUCCESS; } #endif + +#if CONFIG_FREEZE +/* vm_map_freeze_walk: counting pass. Under the map read lock, visits every map entry and accumulates into the caller counters the per-category page counts and the shared flag reported by vm_object_pack(); always returns KERN_SUCCESS. */ +kern_return_t vm_map_freeze_walk( + vm_map_t map, + unsigned int *purgeable_count, + unsigned int *wired_count, + unsigned int *clean_count, + unsigned int *dirty_count, + boolean_t *has_shared) +{ + vm_map_entry_t entry; + + vm_map_lock_read(map); + + *purgeable_count = *wired_count = *clean_count = *dirty_count = 0; + *has_shared = FALSE; + + for (entry = vm_map_first_entry(map); + entry != vm_map_to_entry(map); + entry = entry->vme_next) { + unsigned int purgeable, clean, dirty, wired; + boolean_t shared; + /* Skip entries with no object, submap entries, and physically contiguous objects. */ + if ((entry->object.vm_object == 0) || + (entry->is_sub_map) || + (entry->object.vm_object->phys_contiguous)) { + continue; + } + /* No destination object, table or offset is supplied here (VM_OBJECT_NULL, NULL, NULL); presumably vm_object_pack() then only tallies pages - TODO confirm against its definition. */ + vm_object_pack(&purgeable, &wired, &clean, &dirty, &shared, entry->object.vm_object, VM_OBJECT_NULL, NULL, NULL); + + *purgeable_count += purgeable; + *wired_count += wired; + *clean_count += clean; + *dirty_count += dirty; + + if (shared) { + *has_shared = TRUE; + } + } + + vm_map_unlock_read(map); + + return KERN_SUCCESS; +} +/* vm_map_freeze: allocates a compact object sized VM_MAX_ADDRESS - VM_MIN_ADDRESS plus a freezer mapping for it, then under the exclusive map lock stores the mapping in map->default_freezer_toc, runs vm_object_pack() on every eligible entry with the compact object as destination, and finally calls vm_object_pageout() on the compact object. Counters are reset on entry and accumulated per entry. Returns KERN_FAILURE when either allocation fails and KERN_SUCCESS otherwise, including when the map already has a freezer table. */ +kern_return_t vm_map_freeze( + vm_map_t map, + unsigned int *purgeable_count, + unsigned int *wired_count, + unsigned int *clean_count, + unsigned int *dirty_count, + boolean_t *has_shared) +{ + vm_map_entry_t entry2 = VM_MAP_ENTRY_NULL; + vm_object_t compact_object = VM_OBJECT_NULL; + vm_object_offset_t offset = 0x0; + kern_return_t kr = KERN_SUCCESS; + void *default_freezer_toc = NULL; + boolean_t cleanup = FALSE; + + *purgeable_count = *wired_count = *clean_count = *dirty_count = 0;
+ *has_shared = FALSE; + + /* Create our compact object */ + compact_object = vm_object_allocate((vm_map_offset_t)(VM_MAX_ADDRESS) - (vm_map_offset_t)(VM_MIN_ADDRESS)); + if (!compact_object) { + kr = KERN_FAILURE; + goto done; /* NOTE(review): this jump happens before vm_map_lock(map) below, yet done: calls vm_map_unlock(map) - an unlock with no matching lock on this path; needs a fix. */ + } + + default_freezer_toc = default_freezer_mapping_create(compact_object, offset); + if (!default_freezer_toc) { + kr = KERN_FAILURE; + goto done; /* NOTE(review): same unlocked jump to done: as the allocation failure above. */ + } + + /* + * We need the exclusive lock here so that we can + * block any page faults or lookups while we are + * in the middle of freezing this vm map. + */ + vm_map_lock(map); + + if (map->default_freezer_toc != NULL){ + /* + * This map has already been frozen. + */ + cleanup = TRUE; /* forces the unwind at done: even though kr stays KERN_SUCCESS */ + kr = KERN_SUCCESS; + goto done; + } + + /* Get a mapping in place for the freezing about to commence */ + map->default_freezer_toc = default_freezer_toc; + + vm_object_lock(compact_object); + + for (entry2 = vm_map_first_entry(map); + entry2 != vm_map_to_entry(map); + entry2 = entry2->vme_next) { + + vm_object_t src_object = entry2->object.vm_object; + + /* If eligible, scan the entry, moving eligible pages over to our parent object */ + if (entry2->object.vm_object && !entry2->is_sub_map && !entry2->object.vm_object->phys_contiguous) { + unsigned int purgeable, clean, dirty, wired; + boolean_t shared; + /* The local table pointer and running offset are passed by address, so vm_object_pack() may update both between entries. */ + vm_object_pack(&purgeable, &wired, &clean, &dirty, &shared, + src_object, compact_object, &default_freezer_toc, &offset); + + *purgeable_count += purgeable; + *wired_count += wired; + *clean_count += clean; + *dirty_count += dirty; + + if (shared) { + *has_shared = TRUE; + } + } + } + + vm_object_unlock(compact_object); + + /* Finally, throw out the pages to swap */ + vm_object_pageout(compact_object); + +done: + vm_map_unlock(map); + + /* Unwind if there was a failure */ + if ((cleanup) || (KERN_SUCCESS != kr)) { + if (default_freezer_toc){ + default_freezer_mapping_free(&map->default_freezer_toc, TRUE); /* NOTE(review): this passes the map field, not the local default_freezer_toc, and runs after the map lock was dropped. On the already-frozen path the local table was never stored in the map, so the map field still refers to the earlier freeze; on both allocation-failure paths the local is NULL and this call is skipped. Confirm which table is meant to be released. */ + } + if (compact_object){ + vm_object_deallocate(compact_object); + } + } + + return kr; +} +
+__private_extern__ vm_object_t default_freezer_get_compact_vm_object( void** ); +/* vm_map_thaw: under the exclusive map lock, if the map has a default_freezer_toc, looks up its compact object, calls vm_object_pagein() and vm_object_unpack() on it, calls vm_object_deallocate() on it and clears map->default_freezer_toc. A map whose table is NULL is left untouched. */ +void +vm_map_thaw( + vm_map_t map) +{ + void **default_freezer_toc; + vm_object_t compact_object; + + vm_map_lock(map); + + if (map->default_freezer_toc == NULL){ + /* + * This map is not in a frozen state. + */ + goto out; + } + /* Work through the address of the map field so the callee sees the field itself rather than a copy. */ + default_freezer_toc = &(map->default_freezer_toc); + + compact_object = default_freezer_get_compact_vm_object(default_freezer_toc); + + /* Bring the pages back in */ + vm_object_pagein(compact_object); + + /* Shift pages back to their original objects */ + vm_object_unpack(compact_object, default_freezer_toc); + + vm_object_deallocate(compact_object); + + map->default_freezer_toc = NULL; + +out: + vm_map_unlock(map); +} +#endif diff --git a/osfmk/vm/vm_map.h b/osfmk/vm/vm_map.h index dd39abb5c..d27859858 100644 --- a/osfmk/vm/vm_map.h +++ b/osfmk/vm/vm_map.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -114,6 +114,9 @@ __END_DECLS #define current_map_fast() (current_thread()->map) #define current_map() (current_map_fast()) +#include + + /* * Types defined: * @@ -191,6 +194,7 @@ struct vm_named_entry { * Control information for virtual copy operations is also * stored in the address map entry.
*/ + struct vm_map_links { struct vm_map_entry *prev; /* previous entry */ struct vm_map_entry *next; /* next entry */ @@ -204,6 +208,8 @@ struct vm_map_entry { #define vme_next links.next #define vme_start links.start #define vme_end links.end + + struct vm_map_store store; union vm_map_object object; /* object I point to */ vm_object_offset_t offset; /* offset into object */ unsigned int @@ -230,7 +236,8 @@ struct vm_map_entry { /* boolean_t */ permanent:1, /* mapping can not be removed */ /* boolean_t */ superpage_size:3,/* use superpages of a certain size */ /* boolean_t */ zero_wired_pages:1, /* zero out the wired pages of this entry it is being deleted without unwiring them */ - /* unsigned char */ pad:2; /* available bits */ + /* boolean_t */ used_for_jit:1, + /* unsigned char */ pad:1; /* available bits */ unsigned short wired_count; /* can be paged if = 0 */ unsigned short user_wired_count; /* for vm_wire */ }; @@ -258,11 +265,17 @@ struct vm_map_entry { * Description: * Header for a vm_map and a vm_map_copy. */ + + struct vm_map_header { struct vm_map_links links; /* first, last, min, max */ int nentries; /* Number of entries */ boolean_t entries_pageable; /* are map entries pageable? 
*/ + vm_map_offset_t highest_entry_end_addr; /* The ending address of the highest allocated vm_entry_t */ +#ifdef VM_MAP_STORE_USE_RB + struct rb_head rb_head_store; +#endif }; /* @@ -285,6 +298,7 @@ struct _vm_map { struct vm_map_header hdr; /* Map entry header */ #define min_offset hdr.links.start /* start of range */ #define max_offset hdr.links.end /* end of range */ +#define highest_entry_end hdr.highest_entry_end_addr pmap_t pmap; /* Physical map */ vm_map_size_t size; /* virtual size */ vm_map_size_t user_wire_limit;/* rlimit on user locked memory */ @@ -298,14 +312,21 @@ struct _vm_map { lck_mtx_ext_t s_lock_ext; vm_map_entry_t hint; /* hint for quick lookups */ vm_map_entry_t first_free; /* First free space hint */ - boolean_t wait_for_space; /* Should callers wait - for space? */ - boolean_t wiring_required;/* All memory wired? */ - boolean_t no_zero_fill; /* No zero fill absent pages */ - boolean_t mapped; /* has this map been mapped */ - boolean_t switch_protect; /* Protect map from write faults while switched */ + unsigned int + /* boolean_t */ wait_for_space:1, /* Should callers wait for space? */ + /* boolean_t */ wiring_required:1, /* All memory wired? 
*/ + /* boolean_t */ no_zero_fill:1, /*No zero fill absent pages */ + /* boolean_t */ mapped:1, /*has this map been mapped */ + /* boolean_t */ switch_protect:1, /* Protect map from write faults while switched */ + /* boolean_t */ disable_vmentry_reuse:1, /* All vm entries should keep using newer and higher addresses in the map */ + /* boolean_t */ map_disallow_data_exec:1, /* Disallow execution from data pages on exec-permissive architectures */ + /* reserved */ pad:25; unsigned int timestamp; /* Version number */ unsigned int color_rr; /* next color (not protected by a lock) */ +#if CONFIG_FREEZE + void *default_freezer_toc; +#endif + boolean_t jit_entry_exists; } ; #define vm_map_to_entry(map) ((struct vm_map_entry *) &(map)->hdr.links) @@ -800,6 +821,11 @@ extern vm_object_t convert_port_entry_to_object( ipc_port_t port); +extern kern_return_t vm_map_set_cache_attr( + vm_map_t map, + vm_map_offset_t va); + + /* definitions related to overriding the NX behavior */ #define VM_ABI_32 0x1 @@ -931,6 +957,9 @@ extern kern_return_t vm_map_copyin_common( extern void vm_map_disable_NX( vm_map_t map); +extern void vm_map_disallow_data_exec( + vm_map_t map); + extern void vm_map_set_64bit( vm_map_t map); @@ -964,6 +993,8 @@ extern void vm_map_switch_protect( vm_map_t map, boolean_t val); +extern boolean_t first_free_is_valid(vm_map_t); + #ifdef XNU_KERNEL_PRIVATE extern kern_return_t vm_map_page_info( vm_map_t map, @@ -1030,6 +1061,27 @@ extern kern_return_t vm_map_sign(vm_map_t map, vm_map_offset_t end); #endif +#if CONFIG_FREEZE +extern kern_return_t vm_map_freeze_walk( + vm_map_t map, + unsigned int *purgeable_count, + unsigned int *wired_count, + unsigned int *clean_count, + unsigned int *dirty_count, + boolean_t *has_shared); + +extern kern_return_t vm_map_freeze( + vm_map_t map, + unsigned int *purgeable_count, + unsigned int *wired_count, + unsigned int *clean_count, + unsigned int *dirty_count, + boolean_t *has_shared); + +extern void vm_map_thaw( + vm_map_t map); 
+#endif + __END_DECLS #endif /* KERNEL_PRIVATE */ diff --git a/osfmk/vm/vm_map_store.c b/osfmk/vm/vm_map_store.c new file mode 100644 index 000000000..58148a964 --- /dev/null +++ b/osfmk/vm/vm_map_store.c @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +/* first_free_is_valid_store: compiled only under MACH_ASSERT; forwards to the linked-list checker first_free_is_valid_ll(). */ +#if MACH_ASSERT +boolean_t +first_free_is_valid_store( vm_map_t map ) +{ + return(first_free_is_valid_ll( map )); +} +#endif +/* vm_map_store_init: initializes the linked-list store for hdr and, when VM_MAP_STORE_USE_RB is defined, the RB store as well. */ +void +vm_map_store_init( struct vm_map_header *hdr ) +{ + vm_map_store_init_ll( hdr ); +#ifdef VM_MAP_STORE_USE_RB + vm_map_store_init_rb( hdr ); +#endif +} +/* vm_map_store_lookup_entry: forwards the lookup to exactly one backend - the linked-list one when VM_MAP_STORE_USE_LL is defined, otherwise the RB one when VM_MAP_STORE_USE_RB is defined. NOTE(review): if neither macro is defined the body is empty and the function ends without returning a value. */ +boolean_t +vm_map_store_lookup_entry( + register vm_map_t map, + register vm_map_offset_t address, + vm_map_entry_t *entry) /* OUT */ +{ +#ifdef VM_MAP_STORE_USE_LL + return (vm_map_store_lookup_entry_ll( map, address, entry )); +#elif defined VM_MAP_STORE_USE_RB + return (vm_map_store_lookup_entry_rb( map, address, entry )); +#endif +} +/* vm_map_store_update: bookkeeping hook keyed by update_type. On VM_MAP_ENTRY_DELETE it resets map->first_free and map->hint to vm_map_to_entry(map) if either still points at entry; VM_MAP_ENTRY_CREATE and unknown types do nothing. */ +void +vm_map_store_update( vm_map_t map, vm_map_entry_t entry, int update_type ) +{ + switch (update_type) { + case VM_MAP_ENTRY_CREATE: + break; + case VM_MAP_ENTRY_DELETE: + if((entry) == (map)->first_free) { + (map)->first_free = vm_map_to_entry(map); + } + if((entry) == (map)->hint) { + (map)->hint = vm_map_to_entry(map); + } + break; + default: + break; + } +} +/* vm_map_store_copy_insert: inserts the copy chain after after_where in the linked-list store and, when VM_MAP_STORE_USE_RB is defined, in the RB store too. */ +void vm_map_store_copy_insert( vm_map_t map, vm_map_entry_t after_where, vm_map_copy_t copy) +{ + vm_map_store_copy_insert_ll(map, after_where, copy); +#ifdef VM_MAP_STORE_USE_RB + vm_map_store_copy_insert_rb(map, after_where, copy); +#endif +} + +/* + * vm_map_entry_{un,}link: + * + * Insert/remove entries from maps (or map copies). + * The _vm_map_store_entry_{un,}link variants are used at + * some places where updating first_free is not needed & + * copy maps are being modified. Also note the first argument + * is the map header. + * Modifying the vm_map_store_entry_{un,}link functions to + * deal with these call sites made the interface confusing + * and clunky.
+ */ + +void +_vm_map_store_entry_link( struct vm_map_header * mapHdr, vm_map_entry_t after_where, vm_map_entry_t entry) +{ + vm_map_store_entry_link_ll(mapHdr, after_where, entry); +#ifdef VM_MAP_STORE_USE_RB + vm_map_store_entry_link_rb(mapHdr, after_where, entry); +#endif +} +/* vm_map_store_entry_link: links entry after after_where through the header-level helper, then either raises the highest-entry-end mark (when disable_vmentry_reuse is TRUE) or recomputes first_free starting from the current first_free. */ +void +vm_map_store_entry_link( vm_map_t map, vm_map_entry_t after_where, vm_map_entry_t entry) +{ + vm_map_t VMEL_map; + vm_map_entry_t VMEL_entry; + VMEL_map = (map); + VMEL_entry = (entry); + + _vm_map_store_entry_link(&VMEL_map->hdr, after_where, VMEL_entry); + if( VMEL_map->disable_vmentry_reuse == TRUE ) { + UPDATE_HIGHEST_ENTRY_END( VMEL_map, VMEL_entry); + } else { + update_first_free_ll(VMEL_map, VMEL_map->first_free); +#ifdef VM_MAP_STORE_USE_RB + update_first_free_rb(VMEL_map, VMEL_map->first_free); +#endif + } +} +/* _vm_map_store_entry_unlink: header-level unlink; removes entry from the linked-list store and, when enabled, the RB store. It makes no first_free update call. */ +void +_vm_map_store_entry_unlink( struct vm_map_header * mapHdr, vm_map_entry_t entry) +{ + vm_map_store_entry_unlink_ll(mapHdr, entry); +#ifdef VM_MAP_STORE_USE_RB + vm_map_store_entry_unlink_rb(mapHdr, entry); +#endif +} +/* vm_map_store_entry_unlink: picks the first_free search start BEFORE unlinking - the predecessor of entry when entry starts at or below the current first_free, otherwise the current first_free - then unlinks and recomputes first_free from that start. */ +void +vm_map_store_entry_unlink( vm_map_t map, vm_map_entry_t entry) +{ + vm_map_t VMEU_map; + vm_map_entry_t VMEU_entry; + vm_map_entry_t VMEU_first_free; + VMEU_map = (map); + VMEU_entry = (entry); + if (VMEU_entry->vme_start <= VMEU_map->first_free->vme_start){ + VMEU_first_free = VMEU_entry->vme_prev; + } else { + VMEU_first_free = VMEU_map->first_free; + } + + _vm_map_store_entry_unlink(&VMEU_map->hdr, VMEU_entry); + update_first_free_ll(VMEU_map, VMEU_first_free); +#ifdef VM_MAP_STORE_USE_RB + update_first_free_rb(VMEU_map, VMEU_first_free); +#endif +} +/* vm_map_store_copy_reset: nentries is read before the linked-list reset and the same value is handed to both backends - presumably because the linked-list reset clears cpy_hdr.nentries; confirm against vm_map_store_copy_reset_ll(). */ +void +vm_map_store_copy_reset( vm_map_copy_t copy,vm_map_entry_t entry) +{ + int nentries = copy->cpy_hdr.nentries; + vm_map_store_copy_reset_ll(copy, entry, nentries); +#ifdef VM_MAP_STORE_USE_RB + vm_map_store_copy_reset_rb(copy, entry, nentries); +#endif +} +/* vm_map_store_update_first_free: recomputes first_free starting at first_free in the linked-list store and, when enabled, the RB store. */ +void +vm_map_store_update_first_free( vm_map_t map, vm_map_entry_t first_free) +{ + update_first_free_ll(map, first_free);
+#ifdef VM_MAP_STORE_USE_RB + update_first_free_rb(map, first_free); +#endif +} diff --git a/osfmk/vm/vm_map_store.h b/osfmk/vm/vm_map_store.h new file mode 100644 index 000000000..dab7746ed --- /dev/null +++ b/osfmk/vm/vm_map_store.h @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License.
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _VM_VM_MAP_STORE_H +#define _VM_VM_MAP_STORE_H +/* Backend selection: the VM_MAP_STORE_USE_LL define just below is commented out, so this header itself only defines VM_MAP_STORE_USE_RB. */ +/* +#ifndef VM_MAP_STORE_USE_LL +#define VM_MAP_STORE_USE_LL +#endif +*/ +#ifndef VM_MAP_STORE_USE_RB +#define VM_MAP_STORE_USE_RB +#endif + +#include + +struct _vm_map; +struct vm_map_entry; +struct vm_map_copy; +struct vm_map_header; +/* Per-entry store linkage: holds an RB_ENTRY node when the RB backend is enabled and is an empty struct otherwise. */ +struct vm_map_store { +#ifdef VM_MAP_STORE_USE_RB + RB_ENTRY(vm_map_store) entry; +#endif +}; + +#ifdef VM_MAP_STORE_USE_RB + RB_HEAD( rb_head, vm_map_store ); +#endif + +#include +#include +#include +/* UPDATE_HIGHEST_ENTRY_END: raises map->highest_entry_end to the vme_end of highest_entry when that is larger. Both arguments are copied once into UHEE_ locals. */ +#define UPDATE_HIGHEST_ENTRY_END(map, highest_entry) \ + MACRO_BEGIN \ + struct _vm_map* UHEE_map; \ + struct vm_map_entry* UHEE_entry; \ + UHEE_map = (map); \ + UHEE_entry = (highest_entry); \ + if( UHEE_map->highest_entry_end < UHEE_entry->vme_end) { \ + UHEE_map->highest_entry_end = UHEE_entry->vme_end; \ + } \ + MACRO_END +/* VM_MAP_HIGHEST_ENTRY: probes the address PAGE_SIZE_64 past highest_entry_end; while the lookup lands inside an entry, moves highest_entry_end to that entry end and probes again. On exit it assigns the last lookup result to entry and the probe address to start. NOTE(review): tmp_entry carries no VMHE_ prefix, so a caller variable named tmp_entry passed as the entry argument would be shadowed; entry and start are assigned unparenthesized, so callers must pass plain lvalues. */ +#define VM_MAP_HIGHEST_ENTRY(map, entry, start) \ + MACRO_BEGIN \ + struct _vm_map* VMHE_map; \ + struct vm_map_entry* tmp_entry; \ + vm_map_offset_t VMHE_start; \ + VMHE_map = (map); \ + VMHE_start= VMHE_map->highest_entry_end + PAGE_SIZE_64; \ + while(vm_map_lookup_entry(VMHE_map, VMHE_start, &tmp_entry)){ \ + VMHE_map->highest_entry_end = tmp_entry->vme_end; \ + VMHE_start = VMHE_map->highest_entry_end + PAGE_SIZE_64; \ + } \ + entry = tmp_entry; \ + start = VMHE_start; \ + MACRO_END + +/* + * SAVE_HINT_MAP_READ: + * + * Saves the specified entry as the hint for + * future lookups. only a read lock is held on map, + * so make sure the store is atomic... OSCompareAndSwap + * guarantees this... also, we don't care if we collide + * and someone else wins and stores their 'hint' + */ +#define SAVE_HINT_MAP_READ(map,value) \ + MACRO_BEGIN \ + OSCompareAndSwapPtr((map)->hint, value, &(map)->hint); \ + MACRO_END + + +/* + * SAVE_HINT_MAP_WRITE: + * + * Saves the specified entry as the hint for + * future lookups.
write lock held on map, + * so no one else can be writing or looking + * until the lock is dropped, so it's safe + * to just do an assignment + */ +#define SAVE_HINT_MAP_WRITE(map,value) \ + MACRO_BEGIN \ + (map)->hint = (value); \ + MACRO_END +/* Update-type codes passed as the int argument of vm_map_store_update(). */ +#define VM_MAP_ENTRY_CREATE 1 +#define VM_MAP_ENTRY_DELETE 2 +/* Store entry points. The underscore-prefixed link and unlink variants take a vm_map_header, the others take the map (or a map copy). */ +void vm_map_store_init( struct vm_map_header* ); +boolean_t vm_map_store_lookup_entry( struct _vm_map*, vm_map_offset_t, struct vm_map_entry**); +void vm_map_store_update( struct _vm_map*, struct vm_map_entry*, int); +void _vm_map_store_entry_link( struct vm_map_header *, struct vm_map_entry*, struct vm_map_entry*); +void vm_map_store_entry_link( struct _vm_map*, struct vm_map_entry*, struct vm_map_entry*); +void _vm_map_store_entry_unlink( struct vm_map_header *, struct vm_map_entry*); +void vm_map_store_entry_unlink( struct _vm_map*, struct vm_map_entry*); +void vm_map_store_update_first_free( struct _vm_map*, struct vm_map_entry*); +void vm_map_store_copy_insert( struct _vm_map*, struct vm_map_entry*, struct vm_map_copy*); +void vm_map_store_copy_reset( struct vm_map_copy*, struct vm_map_entry*); +#if MACH_ASSERT +boolean_t first_free_is_valid_store( struct _vm_map*); +#endif + +#endif /* _VM_VM_MAP_STORE_H */ + diff --git a/osfmk/vm/vm_map_store_ll.c b/osfmk/vm/vm_map_store_ll.c new file mode 100644 index 000000000..16959bdb6 --- /dev/null +++ b/osfmk/vm/vm_map_store_ll.c @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License.
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +/* first_free_is_valid_ll: starting at the map header, advances while the next entry begins on the page where the current entry ends (or, for a non-header next entry, on the page where the current entry begins), then checks that the entry reached equals map->first_free. Prints a diagnostic and returns FALSE on mismatch, TRUE otherwise. */ +boolean_t +first_free_is_valid_ll( vm_map_t map ) +{ + vm_map_entry_t entry, next; + entry = vm_map_to_entry(map); + next = entry->vme_next; + while (vm_map_trunc_page(next->vme_start) == vm_map_trunc_page(entry->vme_end) || + (vm_map_trunc_page(next->vme_start) == vm_map_trunc_page(entry->vme_start) && + next != vm_map_to_entry(map))) { + entry = next; + next = entry->vme_next; + if (entry == vm_map_to_entry(map)) + break; + } + if (map->first_free != entry) { + printf("Bad first_free for map %p: %p should be %p\n", + map, map->first_free, entry); + return FALSE; + } + return TRUE; +} + +/* + * UPDATE_FIRST_FREE: + * + * Updates the map->first_free pointer to the + * entry immediately before the first hole in the map. + * The map should be locked.
+ */ +#define UPDATE_FIRST_FREE_LL(map, new_first_free) \ + MACRO_BEGIN \ + if( map->disable_vmentry_reuse == FALSE){ \ + vm_map_t UFF_map; \ + vm_map_entry_t UFF_first_free; \ + vm_map_entry_t UFF_next_entry; \ + UFF_map = (map); \ + UFF_first_free = (new_first_free); \ + UFF_next_entry = UFF_first_free->vme_next; \ + while (vm_map_trunc_page(UFF_next_entry->vme_start) == \ + vm_map_trunc_page(UFF_first_free->vme_end) || \ + (vm_map_trunc_page(UFF_next_entry->vme_start) == \ + vm_map_trunc_page(UFF_first_free->vme_start) && \ + UFF_next_entry != vm_map_to_entry(UFF_map))) { \ + UFF_first_free = UFF_next_entry; \ + UFF_next_entry = UFF_first_free->vme_next; \ + if (UFF_first_free == vm_map_to_entry(UFF_map)) \ + break; \ + } \ + UFF_map->first_free = UFF_first_free; \ + assert(first_free_is_valid(UFF_map)); \ + } \ + MACRO_END +/* _vm_map_entry_link_ll: bumps hdr nentries and splices entry into the doubly linked list directly after after_where. NOTE(review): hdr, after_where and entry are each expanded more than once, so arguments must be free of side effects. */ +#define _vm_map_entry_link_ll(hdr, after_where, entry) \ + MACRO_BEGIN \ + (hdr)->nentries++; \ + (entry)->vme_prev = (after_where); \ + (entry)->vme_next = (after_where)->vme_next; \ + (entry)->vme_prev->vme_next = (entry)->vme_next->vme_prev = (entry); \ + MACRO_END +/* _vm_map_entry_unlink_ll: decrements hdr nentries and joins the neighbours of entry to each other; the prev and next fields of entry itself are left as they were. NOTE(review): entry is expanded more than once. */ +#define _vm_map_entry_unlink_ll(hdr, entry) \ + MACRO_BEGIN \ + (hdr)->nentries--; \ + (entry)->vme_next->vme_prev = (entry)->vme_prev; \ + (entry)->vme_prev->vme_next = (entry)->vme_next; \ + MACRO_END +/* + * Macro: vm_map_copy_insert + * + * Description: + * Link a copy chain ("copy") into a map at the + * specified location (after "where"). + * Side effects: + * The copy chain is destroyed. + * Warning: + * The arguments are evaluated multiple times.
+ */ /* NOTE(review): the warning above looks stale for this macro - map, where and copy are each copied into a VMCI_ local exactly once below. */ +#define _vm_map_copy_insert_ll(map, where, copy) \ +MACRO_BEGIN \ + vm_map_t VMCI_map; \ + vm_map_entry_t VMCI_where; \ + vm_map_copy_t VMCI_copy; \ + VMCI_map = (map); \ + VMCI_where = (where); \ + VMCI_copy = (copy); \ + ((VMCI_where->vme_next)->vme_prev = vm_map_copy_last_entry(VMCI_copy))\ + ->vme_next = (VMCI_where->vme_next); \ + ((VMCI_where)->vme_next = vm_map_copy_first_entry(VMCI_copy)) \ + ->vme_prev = VMCI_where; \ + VMCI_map->hdr.nentries += VMCI_copy->cpy_hdr.nentries; \ + update_first_free_ll(VMCI_map, VMCI_map->first_free); \ +MACRO_END + + +/* vm_map_store_init_ll: no-op; hdr is unused. */ +void +vm_map_store_init_ll( __unused struct vm_map_header *hdr) +{ + return; +} + +/* + * vm_map_lookup_entry_ll: [ internal use only ] + * Use the linked list to find the map entry containing (or + * immediately preceding) the specified address + * in the given map; the entry is returned + * in the "entry" parameter. The boolean + * result indicates whether the address is + * actually contained in the map. + */ +boolean_t +vm_map_store_lookup_entry_ll( + register vm_map_t map, + register vm_map_offset_t address, + vm_map_entry_t *entry) /* OUT */ +{ + register vm_map_entry_t cur; + register vm_map_entry_t last; + + /* + * Start looking either from the head of the + * list, or from the hint. + */ + cur = map->hint; + /* A hint that points at the header carries no position, so start from the first real entry instead. */ + if (cur == vm_map_to_entry(map)) + cur = cur->vme_next; + + if (address >= cur->vme_start) { + /* + * Go from hint to end of list. + * + * But first, make a quick check to see if + * we are already looking at the entry we + * want (which is usually the case). + * Note also that we don't need to save the hint + * here... it is the same hint (unless we are + * at the header, in which case the hint didn't + * buy us anything anyway).
+ */ + last = vm_map_to_entry(map); + if ((cur != last) && (cur->vme_end > address)) { + *entry = cur; + return(TRUE); + } + } + else { + /* + * Go from start to hint, *inclusively* + */ + last = cur->vme_next; + cur = vm_map_first_entry(map); + } + + /* + * Search linearly + */ + + while (cur != last) { + if (cur->vme_end > address) { + if (address >= cur->vme_start) { + /* + * Save this lookup for future + * hints, and return + */ + + *entry = cur; + SAVE_HINT_MAP_READ(map, cur); + + return(TRUE); + } + break; + } + cur = cur->vme_next; + } + *entry = cur->vme_prev; + SAVE_HINT_MAP_READ(map, *entry); + /* Miss: *entry is the entry just before the point where the scan stopped, and that same entry was saved as the hint. */ + return(FALSE); +} +/* The functions below are thin function wrappers around the list macros defined earlier in this file. */ +void +vm_map_store_entry_link_ll( struct vm_map_header *mapHdr, vm_map_entry_t after_where, vm_map_entry_t entry) +{ + _vm_map_entry_link_ll( mapHdr, after_where, entry); +} + +void +vm_map_store_entry_unlink_ll( struct vm_map_header *mapHdr, vm_map_entry_t entry) +{ + _vm_map_entry_unlink_ll( mapHdr, entry); +} + +void +vm_map_store_copy_insert_ll( vm_map_t map, vm_map_entry_t after_where, vm_map_copy_t copy) +{ + _vm_map_copy_insert_ll( map, after_where, copy); +} +/* vm_map_store_copy_reset_ll: empties the copy list by zeroing cpy_hdr.nentries and pointing both the first and last entry at the copy header entry; entry and nentries are unused here. */ +void +vm_map_store_copy_reset_ll( vm_map_copy_t copy, __unused vm_map_entry_t entry, __unused int nentries) +{ + copy->cpy_hdr.nentries = 0; + vm_map_copy_first_entry(copy) = + vm_map_copy_last_entry(copy) = + vm_map_copy_to_entry(copy); + +} +/* update_first_free_ll: function form of UPDATE_FIRST_FREE_LL; that macro does nothing when map->disable_vmentry_reuse is not FALSE. */ +void +update_first_free_ll( vm_map_t map, vm_map_entry_t new_first_free) +{ + UPDATE_FIRST_FREE_LL( map, new_first_free); +} + diff --git a/bsd/dev/ppc/machdep.c b/osfmk/vm/vm_map_store_ll.h similarity index 61% rename from bsd/dev/ppc/machdep.c rename to osfmk/vm/vm_map_store_ll.h index bf9f5beff..0bbe00d48 100644 --- a/bsd/dev/ppc/machdep.c +++ b/osfmk/vm/vm_map_store_ll.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2009 Apple Inc. All rights reserved.
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,46 +25,20 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - * Copyright (c) 1997 Apple Computer, Inc. All rights reserved. - * Copyright (c) 1992 NeXT Computer, Inc. All rights reserved. - * - * - * Machine dependent cruft. - * - * 27-Apr-1997 A.Ramesh at Apple - * - * - */ - -#include -#include -#include - -int reboot_how; -extern struct tty cons; -extern struct tty *constty; /* current console device */ - -extern int getchar(void); -extern int cngetc(void); -extern void cnputc(char); -#define putchar cnputc +#ifndef _VM_VM_MAP_STORE_LL_H +#define _VM_VM_MAP_STORE_LL_H -int -getchar(void) -{ - int c; +#include - c = cngetc(); -#if 0 - if (c == 0x1b) /* ESC ? */ - call_kdp(); -#endif +boolean_t first_free_is_valid_ll( struct _vm_map*); - if (c == '\r') - c = '\n'; - cnputc(c); - return c; -} +void vm_map_store_init_ll( struct vm_map_header* ); +boolean_t vm_map_store_lookup_entry_ll( struct _vm_map*, vm_map_offset_t, struct vm_map_entry**); +void vm_map_store_entry_link_ll( struct vm_map_header*, struct vm_map_entry*, struct vm_map_entry*); +void vm_map_store_entry_unlink_ll( struct vm_map_header*, struct vm_map_entry*); +void update_first_free_ll(struct _vm_map*, struct vm_map_entry*); +void vm_map_store_copy_insert_ll( struct _vm_map*, struct vm_map_entry*, struct vm_map_copy*); +void vm_map_store_copy_reset_ll( struct vm_map_copy*, struct vm_map_entry*, int); +#endif /* _VM_VM_MAP_STORE_LL_H */ diff --git a/osfmk/vm/vm_map_store_rb.c b/osfmk/vm/vm_map_store_rb.c new file mode 100644 index 000000000..2e103b0a2 --- /dev/null +++ b/osfmk/vm/vm_map_store_rb.c @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). 
You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include + +RB_GENERATE(rb_head, vm_map_store, entry, rb_node_compare); + +#define VME_FOR_STORE( store) \ + (vm_map_entry_t)(((unsigned long)store) - ((unsigned long)sizeof(struct vm_map_links))) + +void +vm_map_store_init_rb( struct vm_map_header* hdr ) +{ + RB_INIT(&(hdr->rb_head_store)); +} + +int rb_node_compare(struct vm_map_store *node, struct vm_map_store *parent) +{ + vm_map_entry_t vme_c; + vm_map_entry_t vme_p; + + vme_c = VME_FOR_STORE(node); + vme_p = VME_FOR_STORE(parent); + if (vme_c->vme_start < vme_p->vme_start) + return -1; + if (vme_c->vme_start >= vme_p->vme_end) + return 1; + return 0; +} + +void vm_map_store_walk_rb( vm_map_t map, vm_map_entry_t *wrong_vme, vm_map_entry_t *vm_entry) +{ + struct vm_map_header hdr = map->hdr; + struct vm_map_store *rb_entry = RB_ROOT(&(hdr.rb_head_store)); + vm_map_entry_t cur = *vm_entry; + + rb_entry = RB_FIND( rb_head, &(hdr.rb_head_store), &(cur->store)); + if(rb_entry == 
NULL) + panic("NO SUCH ENTRY %p. Gave back %p", *vm_entry, *wrong_vme); + else + panic("Cur: %p, L: %p, R: %p", VME_FOR_STORE(rb_entry), VME_FOR_STORE(RB_LEFT(rb_entry,entry)), VME_FOR_STORE(RB_RIGHT(rb_entry,entry))); +} + + +boolean_t vm_map_store_lookup_entry_rb( vm_map_t map, vm_map_offset_t address, vm_map_entry_t *vm_entry) +{ + struct vm_map_header hdr = map->hdr; + struct vm_map_store *rb_entry = RB_ROOT(&(hdr.rb_head_store)); + vm_map_entry_t cur = vm_map_to_entry(map); + vm_map_entry_t prev = VM_MAP_ENTRY_NULL; + + while (rb_entry != (struct vm_map_store*)NULL) { + cur = VME_FOR_STORE(rb_entry); + if(cur == VM_MAP_ENTRY_NULL) + panic("no entry"); + if (address >= cur->vme_start) { + if (address < cur->vme_end) { + *vm_entry = cur; + return TRUE; + } + rb_entry = RB_RIGHT(rb_entry, entry); + prev = cur; + } else { + rb_entry = RB_LEFT(rb_entry, entry); + } + } + if( prev == VM_MAP_ENTRY_NULL){ + prev = vm_map_to_entry(map); + } + *vm_entry = prev; + return FALSE; +} + +void vm_map_store_entry_link_rb( struct vm_map_header *mapHdr, __unused vm_map_entry_t after_where, vm_map_entry_t entry) +{ + struct rb_head *rbh = &(mapHdr->rb_head_store); + struct vm_map_store *store = &(entry->store); + struct vm_map_store *tmp_store; + if((tmp_store = RB_INSERT( rb_head, rbh, store )) != NULL) { + panic("VMSEL: INSERT FAILED: 0x%lx, 0x%lx, 0x%lx, 0x%lx", (uintptr_t)entry->vme_start, (uintptr_t)entry->vme_end, + (uintptr_t)(VME_FOR_STORE(tmp_store))->vme_start, (uintptr_t)(VME_FOR_STORE(tmp_store))->vme_end); + } +} + +void vm_map_store_entry_unlink_rb( struct vm_map_header *mapHdr, vm_map_entry_t entry) +{ + struct rb_head *rbh = &(mapHdr->rb_head_store); + struct vm_map_store *rb_entry; + struct vm_map_store *store = &(entry->store); + + rb_entry = RB_FIND( rb_head, rbh, store); + if(rb_entry == NULL) + panic("NO ENTRY TO DELETE"); + RB_REMOVE( rb_head, rbh, store ); +} + +void vm_map_store_copy_insert_rb( vm_map_t map, __unused vm_map_entry_t after_where, 
vm_map_copy_t copy) +{ + struct vm_map_header *mapHdr = &(map->hdr); + struct rb_head *rbh = &(mapHdr->rb_head_store); + struct vm_map_store *store; + vm_map_entry_t entry = vm_map_copy_first_entry(copy); + int inserted=0, nentries = copy->cpy_hdr.nentries; + + while (entry != vm_map_copy_to_entry(copy) && nentries > 0) { + vm_map_entry_t prev = entry; + store = &(entry->store); + if( RB_INSERT( rb_head, rbh, store ) != NULL){ + panic("VMSCIR1: INSERT FAILED: %d: %p, %p, %p, 0x%lx, 0x%lx, 0x%lx, 0x%lx, 0x%lx, 0x%lx",inserted, prev, entry, vm_map_copy_to_entry(copy), + (uintptr_t)prev->vme_start, (uintptr_t)prev->vme_end, (uintptr_t)entry->vme_start, (uintptr_t)entry->vme_end, + (uintptr_t)(VME_FOR_STORE(rbh->rbh_root))->vme_start, (uintptr_t)(VME_FOR_STORE(rbh->rbh_root))->vme_end); + } else { + entry = entry->vme_next; + inserted++; + nentries--; + } + } +} + +void +vm_map_store_copy_reset_rb( vm_map_copy_t copy, vm_map_entry_t entry, int nentries ) +{ + struct vm_map_header *mapHdr = &(copy->cpy_hdr); + struct rb_head *rbh = &(mapHdr->rb_head_store); + struct vm_map_store *store; + int deleted=0; + + while (entry != vm_map_copy_to_entry(copy) && nentries > 0) { + store = &(entry->store); + RB_REMOVE( rb_head, rbh, store ); + entry = entry->vme_next; + deleted++; + nentries--; + } +} + +void update_first_free_rb( __unused vm_map_t map, __unused vm_map_entry_t entry) +{ + return ; +} + diff --git a/osfmk/vm/vm_map_store_rb.h b/osfmk/vm/vm_map_store_rb.h new file mode 100644 index 000000000..da6794929 --- /dev/null +++ b/osfmk/vm/vm_map_store_rb.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _VM_VM_MAP_STORE_H_RB +#define _VM_VM_MAP_STORE_H_RB + +#include + +RB_PROTOTYPE_SC(__private_extern__, rb_head, vm_map_store, entry, rb_node_compare); + +void vm_map_store_init_rb( struct vm_map_header* ); +int rb_node_compare(struct vm_map_store *, struct vm_map_store *); +void vm_map_store_walk_rb( struct _vm_map*, struct vm_map_entry**, struct vm_map_entry**); +boolean_t vm_map_store_lookup_entry_rb( struct _vm_map*, vm_map_offset_t, struct vm_map_entry**); +void vm_map_store_entry_link_rb( struct vm_map_header*, struct vm_map_entry*, struct vm_map_entry*); +void vm_map_store_entry_unlink_rb( struct vm_map_header*, struct vm_map_entry*); +void vm_map_store_copy_insert_rb( struct _vm_map*, struct vm_map_entry*, struct vm_map_copy*); +void vm_map_store_copy_reset_rb( struct vm_map_copy*, struct vm_map_entry*, int); +void update_first_free_rb(struct _vm_map*, struct vm_map_entry*); + +#endif /* _VM_VM_MAP_STORE_H_RB */ diff --git a/osfmk/vm/vm_object.c b/osfmk/vm/vm_object.c index
c71627a4f..1c0138d82 100644 --- a/osfmk/vm/vm_object.c +++ b/osfmk/vm/vm_object.c @@ -80,6 +80,7 @@ #include #include #include +#include #include #include #include @@ -261,28 +262,40 @@ unsigned int vm_page_purged_others = 0; static vm_object_t vm_object_cache_trim( boolean_t called_from_vm_object_deallocate); -static queue_head_t vm_object_cached_list; -static int vm_object_cached_count=0; +static void vm_object_deactivate_all_pages( + vm_object_t object); + static int vm_object_cached_high; /* highest # cached objects */ static int vm_object_cached_max = 512; /* may be patched*/ -static lck_mtx_t vm_object_cached_lock_data; -static lck_mtx_ext_t vm_object_cached_lock_data_ext; - #define vm_object_cache_lock() \ lck_mtx_lock(&vm_object_cached_lock_data) #define vm_object_cache_lock_try() \ lck_mtx_try_lock(&vm_object_cached_lock_data) + +#endif /* VM_OBJECT_CACHE */ + +static queue_head_t vm_object_cached_list; +static uint32_t vm_object_cache_pages_freed = 0; +static uint32_t vm_object_cache_pages_moved = 0; +static uint32_t vm_object_cache_pages_skipped = 0; +static uint32_t vm_object_cache_adds = 0; +static uint32_t vm_object_cached_count = 0; +static lck_mtx_t vm_object_cached_lock_data; +static lck_mtx_ext_t vm_object_cached_lock_data_ext; + +static uint32_t vm_object_page_grab_failed = 0; +static uint32_t vm_object_page_grab_skipped = 0; +static uint32_t vm_object_page_grab_returned = 0; +static uint32_t vm_object_page_grab_pmapped = 0; +static uint32_t vm_object_page_grab_reactivations = 0; + #define vm_object_cache_lock_spin() \ lck_mtx_lock_spin(&vm_object_cached_lock_data) #define vm_object_cache_unlock() \ lck_mtx_unlock(&vm_object_cached_lock_data) -#endif /* VM_OBJECT_CACHE */ - - -static void vm_object_deactivate_all_pages( - vm_object_t object); +static void vm_object_cache_remove_locked(vm_object_t); #define VM_OBJECT_HASH_COUNT 1024 @@ -333,6 +346,10 @@ unsigned int vm_object_reap_count_async = 0; #define vm_object_reaper_unlock() \ 
lck_mtx_unlock(&vm_object_reaper_lock_data) +#if 0 +#undef KERNEL_DEBUG +#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT +#endif static lck_mtx_t * @@ -446,7 +463,7 @@ _vm_object_allocate( queue_init(&object->uplq); #endif /* UPL_DEBUG */ vm_object_lock_init(object); - object->size = size; + object->vo_size = size; } __private_extern__ vm_object_t @@ -467,7 +484,8 @@ vm_object_allocate( lck_grp_t vm_object_lck_grp; -lck_grp_attr_t vm_object_lck_grp_attr; +lck_grp_t vm_object_cache_lck_grp; +lck_grp_attr_t vm_object_lck_grp_attr; lck_attr_t vm_object_lck_attr; lck_attr_t kernel_object_lck_attr; @@ -485,18 +503,18 @@ vm_object_bootstrap(void) round_page(512*1024), round_page(12*1024), "vm objects"); + zone_change(vm_object_zone, Z_CALLERACCT, FALSE); /* don't charge caller */ zone_change(vm_object_zone, Z_NOENCRYPT, TRUE); vm_object_init_lck_grp(); -#if VM_OBJECT_CACHE queue_init(&vm_object_cached_list); lck_mtx_init_ext(&vm_object_cached_lock_data, &vm_object_cached_lock_data_ext, - &vm_object_lck_grp, + &vm_object_cache_lck_grp, &vm_object_lck_attr); -#endif + queue_init(&vm_object_reaper_queue); for (i = 0; i < VM_OBJECT_HASH_LOCK_COUNT; i++) { @@ -515,6 +533,7 @@ vm_object_bootstrap(void) round_page(512*1024), round_page(12*1024), "vm object hash entries"); + zone_change(vm_object_hash_zone, Z_CALLERACCT, FALSE); zone_change(vm_object_hash_zone, Z_NOENCRYPT, TRUE); for (i = 0; i < VM_OBJECT_HASH_COUNT; i++) @@ -538,7 +557,7 @@ vm_object_bootstrap(void) */ vm_object_lock_init(&vm_object_template); #endif - vm_object_template.size = 0; + vm_object_template.vo_size = 0; vm_object_template.memq_hint = VM_PAGE_NULL; vm_object_template.ref_count = 1; #if TASK_SWAPPER @@ -549,7 +568,7 @@ vm_object_bootstrap(void) vm_object_template.reusable_page_count = 0; vm_object_template.copy = VM_OBJECT_NULL; vm_object_template.shadow = VM_OBJECT_NULL; - vm_object_template.shadow_offset = (vm_object_offset_t) 0; + vm_object_template.vo_shadow_offset = (vm_object_offset_t) 0; 
vm_object_template.pager = MEMORY_OBJECT_NULL; vm_object_template.paging_offset = 0; vm_object_template.pager_control = MEMORY_OBJECT_CONTROL_NULL; @@ -590,6 +609,7 @@ vm_object_bootstrap(void) vm_object_template.sequential = (vm_object_offset_t) 0; vm_object_template.pages_created = 0; vm_object_template.pages_used = 0; + vm_object_template.scan_collisions = 0; #if MACH_PAGEMAP vm_object_template.existence_map = VM_EXTERNAL_NULL; @@ -600,7 +620,8 @@ vm_object_bootstrap(void) #endif /* MACH_ASSERT */ /* cache bitfields */ - vm_object_template.wimg_bits = VM_WIMG_DEFAULT; + vm_object_template.wimg_bits = VM_WIMG_USE_DEFAULT; + vm_object_template.set_cache_attr = FALSE; vm_object_template.code_signed = FALSE; vm_object_template.hashed = FALSE; vm_object_template.transposed = FALSE; @@ -622,6 +643,7 @@ vm_object_bootstrap(void) vm_object_template.objq.next=NULL; vm_object_template.objq.prev=NULL; + vm_object_template.vo_cache_ts = 0; /* * Initialize the "kernel object" @@ -704,6 +726,7 @@ vm_object_init_lck_grp(void) */ lck_grp_attr_setdefault(&vm_object_lck_grp_attr); lck_grp_init(&vm_object_lck_grp, "vm_object", &vm_object_lck_grp_attr); + lck_grp_init(&vm_object_cache_lck_grp, "vm_object_cache", &vm_object_lck_grp_attr); lck_attr_setdefault(&vm_object_lck_attr); lck_attr_setdefault(&kernel_object_lck_attr); lck_attr_cleardebug(&kernel_object_lck_attr); @@ -838,17 +861,6 @@ vm_object_deallocate( vm_object_lock(object); vm_object_mapping_end(object); } - /* - * recheck the ref_count since we dropped the object lock - * to call 'memory_object_last_unmap'... 
it's possible - * additional references got taken and we only want - * to deactivate the pages if this 'named' object will only - * referenced by the backing pager once we drop our reference - * below - */ - if (!object->terminating && object->ref_count == 2) - vm_object_deactivate_all_pages(object); - assert(object->ref_count > 0); } @@ -1073,6 +1085,360 @@ vm_object_deallocate( } + +vm_page_t +vm_object_page_grab( + vm_object_t object) +{ + vm_page_t p, next_p; + int p_limit = 0; + int p_skipped = 0; + + vm_object_lock_assert_exclusive(object); + + next_p = (vm_page_t)queue_first(&object->memq); + p_limit = MIN(50, object->resident_page_count); + + while (!queue_end(&object->memq, (queue_entry_t)next_p) && --p_limit > 0) { + + p = next_p; + next_p = (vm_page_t)queue_next(&next_p->listq); + + if (VM_PAGE_WIRED(p) || p->busy || p->cleaning || p->fictitious) + goto move_page_in_obj; + + if (p->pmapped || p->dirty || p->precious) { + vm_page_lockspin_queues(); + + if (p->pmapped) { + int refmod_state; + + vm_object_page_grab_pmapped++; + + if (p->reference == FALSE || p->dirty == FALSE) { + + refmod_state = pmap_get_refmod(p->phys_page); + + if (refmod_state & VM_MEM_REFERENCED) + p->reference = TRUE; + if (refmod_state & VM_MEM_MODIFIED) + p->dirty = TRUE; + } + if (p->dirty == FALSE && p->precious == FALSE) { + + refmod_state = pmap_disconnect(p->phys_page); + + if (refmod_state & VM_MEM_REFERENCED) + p->reference = TRUE; + if (refmod_state & VM_MEM_MODIFIED) + p->dirty = TRUE; + + if (p->dirty == FALSE) + goto take_page; + } + } + if (p->inactive && p->reference == TRUE) { + vm_page_activate(p); + + VM_STAT_INCR(reactivations); + vm_object_page_grab_reactivations++; + } + vm_page_unlock_queues(); +move_page_in_obj: + queue_remove(&object->memq, p, vm_page_t, listq); + queue_enter(&object->memq, p, vm_page_t, listq); + + p_skipped++; + continue; + } + vm_page_lockspin_queues(); +take_page: + vm_page_free_prepare_queues(p); + vm_object_page_grab_returned++; + 
vm_object_page_grab_skipped += p_skipped; + + vm_page_unlock_queues(); + + vm_page_free_prepare_object(p, TRUE); + + return (p); + } + vm_object_page_grab_skipped += p_skipped; + vm_object_page_grab_failed++; + + return (NULL); +} + + + +#define EVICT_PREPARE_LIMIT 64 +#define EVICT_AGE 10 + +static clock_sec_t vm_object_cache_aging_ts = 0; + +static void +vm_object_cache_remove_locked( + vm_object_t object) +{ + queue_remove(&vm_object_cached_list, object, vm_object_t, objq); + object->objq.next = NULL; + object->objq.prev = NULL; + + vm_object_cached_count--; +} + +void +vm_object_cache_remove( + vm_object_t object) +{ + vm_object_cache_lock_spin(); + + if (object->objq.next || object->objq.prev) + vm_object_cache_remove_locked(object); + + vm_object_cache_unlock(); +} + +void +vm_object_cache_add( + vm_object_t object) +{ + clock_sec_t sec; + clock_nsec_t nsec; + + if (object->resident_page_count == 0) + return; + clock_get_system_nanotime(&sec, &nsec); + + vm_object_cache_lock_spin(); + + if (object->objq.next == NULL && object->objq.prev == NULL) { + queue_enter(&vm_object_cached_list, object, vm_object_t, objq); + object->vo_cache_ts = sec + EVICT_AGE; + object->vo_cache_pages_to_scan = object->resident_page_count; + + vm_object_cached_count++; + vm_object_cache_adds++; + } + vm_object_cache_unlock(); +} + +int +vm_object_cache_evict( + int num_to_evict, + int max_objects_to_examine) +{ + vm_object_t object = VM_OBJECT_NULL; + vm_object_t next_obj = VM_OBJECT_NULL; + vm_page_t local_free_q = VM_PAGE_NULL; + vm_page_t p; + vm_page_t next_p; + int object_cnt = 0; + vm_page_t ep_array[EVICT_PREPARE_LIMIT]; + int ep_count; + int ep_limit; + int ep_index; + int ep_freed = 0; + int ep_moved = 0; + uint32_t ep_skipped = 0; + clock_sec_t sec; + clock_nsec_t nsec; + + KERNEL_DEBUG(0x13001ec | DBG_FUNC_START, 0, 0, 0, 0, 0); + /* + * do a couple of quick checks to see if it's + * worthwhile grabbing the lock + */ + if (queue_empty(&vm_object_cached_list)) { + 
KERNEL_DEBUG(0x13001ec | DBG_FUNC_END, 0, 0, 0, 0, 0); + return (0); + } + clock_get_system_nanotime(&sec, &nsec); + + /* + * the object on the head of the queue has not + * yet sufficiently aged + */ + if (sec < vm_object_cache_aging_ts) { + KERNEL_DEBUG(0x13001ec | DBG_FUNC_END, 0, 0, 0, 0, 0); + return (0); + } + /* + * don't need the queue lock to find + * and lock an object on the cached list + */ + vm_page_unlock_queues(); + + vm_object_cache_lock_spin(); + + for (;;) { + next_obj = (vm_object_t)queue_first(&vm_object_cached_list); + + while (!queue_end(&vm_object_cached_list, (queue_entry_t)next_obj) && object_cnt++ < max_objects_to_examine) { + + object = next_obj; + next_obj = (vm_object_t)queue_next(&next_obj->objq); + + if (sec < object->vo_cache_ts) { + KERNEL_DEBUG(0x130020c, object, object->resident_page_count, object->vo_cache_ts, sec, 0); + + vm_object_cache_aging_ts = object->vo_cache_ts; + object = VM_OBJECT_NULL; + break; + } + if (!vm_object_lock_try_scan(object)) { + /* + * just skip over this guy for now... if we find + * an object to steal pages from, we'll revisit in a bit... + * hopefully, the lock will have cleared + */ + KERNEL_DEBUG(0x13001f8, object, object->resident_page_count, 0, 0, 0); + + object = VM_OBJECT_NULL; + continue; + } + if (queue_empty(&object->memq) || object->vo_cache_pages_to_scan == 0) { + /* + * this case really shouldn't happen, but it's not fatal + * so deal with it... if we don't remove the object from + * the list, we'll never move past it. + */ + KERNEL_DEBUG(0x13001fc, object, object->resident_page_count, ep_freed, ep_moved, 0); + + vm_object_cache_remove_locked(object); + vm_object_unlock(object); + object = VM_OBJECT_NULL; + continue; + } + /* + * we have a locked object with pages...
+ * time to start harvesting + */ + break; + } + vm_object_cache_unlock(); + + if (object == VM_OBJECT_NULL) + break; + + /* + * object is locked at this point and + * has resident pages + */ + next_p = (vm_page_t)queue_first(&object->memq); + + /* + * break the page scan into 2 pieces to minimize the time spent + * behind the page queue lock... + * the list of pages on these unused objects is likely to be cold + * w/r to the cpu cache which increases the time to scan the list + * tenfold... and we may have a 'run' of pages we can't utilize that + * needs to be skipped over... + */ + if ((ep_limit = num_to_evict - (ep_freed + ep_moved)) > EVICT_PREPARE_LIMIT) + ep_limit = EVICT_PREPARE_LIMIT; + ep_count = 0; + + while (!queue_end(&object->memq, (queue_entry_t)next_p) && object->vo_cache_pages_to_scan && ep_count < ep_limit) { + + p = next_p; + next_p = (vm_page_t)queue_next(&next_p->listq); + + object->vo_cache_pages_to_scan--; + + if (VM_PAGE_WIRED(p) || p->busy || p->cleaning) { + queue_remove(&object->memq, p, vm_page_t, listq); + queue_enter(&object->memq, p, vm_page_t, listq); + + ep_skipped++; + continue; + } + if (p->wpmapped || p->dirty || p->precious) { + queue_remove(&object->memq, p, vm_page_t, listq); + queue_enter(&object->memq, p, vm_page_t, listq); + + pmap_clear_reference(p->phys_page); + } + ep_array[ep_count++] = p; + } + KERNEL_DEBUG(0x13001f4 | DBG_FUNC_START, object, object->resident_page_count, ep_freed, ep_moved, 0); + + vm_page_lockspin_queues(); + + for (ep_index = 0; ep_index < ep_count; ep_index++) { + + p = ep_array[ep_index]; + + if (p->wpmapped || p->dirty || p->precious) { + p->reference = FALSE; + p->no_cache = FALSE; + + VM_PAGE_QUEUES_REMOVE(p); + VM_PAGE_ENQUEUE_INACTIVE(p, TRUE); + + ep_moved++; + } else { + vm_page_free_prepare_queues(p); + + assert(p->pageq.next == NULL && p->pageq.prev == NULL); + /* + * Add this page to our list of reclaimed pages, + * to be freed later. 
+ */ + p->pageq.next = (queue_entry_t) local_free_q; + local_free_q = p; + + ep_freed++; + } + } + vm_page_unlock_queues(); + + KERNEL_DEBUG(0x13001f4 | DBG_FUNC_END, object, object->resident_page_count, ep_freed, ep_moved, 0); + + if (local_free_q) { + vm_page_free_list(local_free_q, TRUE); + local_free_q = VM_PAGE_NULL; + } + if (object->vo_cache_pages_to_scan == 0) { + KERNEL_DEBUG(0x1300208, object, object->resident_page_count, ep_freed, ep_moved, 0); + + vm_object_cache_remove(object); + + KERNEL_DEBUG(0x13001fc, object, object->resident_page_count, ep_freed, ep_moved, 0); + } + /* + * done with this object + */ + vm_object_unlock(object); + object = VM_OBJECT_NULL; + + /* + * at this point, we are not holding any locks + */ + if ((ep_freed + ep_moved) >= num_to_evict) { + /* + * we've reached our target for the + * number of pages to evict + */ + break; + } + vm_object_cache_lock_spin(); + } + /* + * put the page queues lock back to the caller's + * idea of it + */ + vm_page_lock_queues(); + + vm_object_cache_pages_freed += ep_freed; + vm_object_cache_pages_moved += ep_moved; + vm_object_cache_pages_skipped += ep_skipped; + + KERNEL_DEBUG(0x13001ec | DBG_FUNC_END, ep_freed, 0, 0, 0, 0); + return (ep_freed); +} + + #if VM_OBJECT_CACHE /* * Check to see whether we really need to trim @@ -1233,6 +1599,9 @@ vm_object_terminate( object->terminating = TRUE; object->alive = FALSE; + if ( !object->internal && (object->objq.next || object->objq.prev)) + vm_object_cache_remove(object); + if (object->hashed) { lck_mtx_t *lck; @@ -1344,7 +1713,7 @@ vm_object_reap( /* * remove from purgeable queue if it's on */ - if (object->objq.next || object->objq.prev) { + if (object->internal && (object->objq.next || object->objq.prev)) { purgeable_q_t queue = vm_purgeable_object_remove(object); assert(queue); @@ -1393,7 +1762,7 @@ vm_object_reap( vm_object_unlock(object); #if MACH_PAGEMAP - vm_external_destroy(object->existence_map, object->size); + 
vm_external_destroy(object->existence_map, object->vo_size); #endif /* MACH_PAGEMAP */ object->shadow = VM_OBJECT_NULL; @@ -1407,9 +1776,12 @@ vm_object_reap( } +unsigned int vm_max_batch = 256; #define V_O_R_MAX_BATCH 128 +#define BATCH_LIMIT(max) (vm_max_batch >= max ? max : vm_max_batch) + #define VM_OBJ_REAP_FREELIST(_local_free_q, do_disconnect) \ MACRO_BEGIN \ @@ -1461,7 +1833,7 @@ vm_object_reap_pages( restart_after_sleep: if (queue_empty(&object->memq)) return; - loop_count = V_O_R_MAX_BATCH + 1; + loop_count = BATCH_LIMIT(V_O_R_MAX_BATCH) + 1; vm_page_lockspin_queues(); @@ -1487,7 +1859,7 @@ vm_object_reap_pages( } else mutex_pause(0); - loop_count = V_O_R_MAX_BATCH + 1; + loop_count = BATCH_LIMIT(V_O_R_MAX_BATCH) + 1; vm_page_lockspin_queues(); } @@ -1505,12 +1877,8 @@ vm_object_reap_pages( * * the laundry and pageout_queue flags are cleared... */ -#if CONFIG_EMBEDDED - if (p->laundry) - vm_pageout_throttle_up(p); -#else vm_pageout_throttle_up(p); -#endif + if (p->pageout == TRUE) { /* * toss the wire count we picked up @@ -1624,6 +1992,11 @@ vm_object_reap_pages( p->busy = TRUE; VM_PAGE_QUEUES_REMOVE(p); + /* + * flush page... page will be freed + * upon completion of I/O + */ + vm_pageout_cluster(p); vm_page_unlock_queues(); /* @@ -1632,11 +2005,6 @@ vm_object_reap_pages( VM_OBJ_REAP_FREELIST(local_free_q, disconnect_on_release); - /* - * flush page... 
page will be freed - * upon completion of I/O - */ - vm_pageout_cluster(p); vm_object_paging_wait(object, THREAD_UNINT); goto restart_after_sleep; @@ -1885,6 +2253,8 @@ vm_object_destroy( } +#if VM_OBJECT_CACHE + #define VM_OBJ_DEACT_ALL_STATS DEBUG #if VM_OBJ_DEACT_ALL_STATS uint32_t vm_object_deactivate_all_pages_batches = 0; @@ -1909,7 +2279,7 @@ vm_object_deactivate_all_pages( #endif /* VM_OBJ_DEACT_ALL_STATS */ #define V_O_D_A_P_MAX_BATCH 256 - loop_count = V_O_D_A_P_MAX_BATCH; + loop_count = BATCH_LIMIT(V_O_D_A_P_MAX_BATCH); #if VM_OBJ_DEACT_ALL_STATS pages_count = 0; #endif /* VM_OBJ_DEACT_ALL_STATS */ @@ -1924,7 +2294,7 @@ vm_object_deactivate_all_pages( pages_count = 0; #endif /* VM_OBJ_DEACT_ALL_STATS */ lck_mtx_yield(&vm_page_queue_lock); - loop_count = V_O_D_A_P_MAX_BATCH; + loop_count = BATCH_LIMIT(V_O_D_A_P_MAX_BATCH); } if (!p->busy && !p->throttled) { #if VM_OBJ_DEACT_ALL_STATS @@ -1943,133 +2313,7 @@ vm_object_deactivate_all_pages( #endif /* VM_OBJ_DEACT_ALL_STATS */ vm_page_unlock_queues(); } - - - -/* - * when deallocating pages it is necessary to hold - * the vm_page_queue_lock (a hot global lock) for certain operations - * on the page... however, the majority of the work can be done - * while merely holding the object lock... to mitigate the time spent behind the - * global lock, go to a 2 pass algorithm... collect pages up to DELAYED_WORK_LIMIT - * while doing all of the work that doesn't require the vm_page_queue_lock... - * them call dw_do_work to acquire the vm_page_queue_lock and do the - * necessary work for each page... we will grab the busy bit on the page - * so that dw_do_work can drop the object lock if it can't immediately take the - * vm_page_queue_lock in order to compete for the locks in the same order that - * vm_pageout_scan takes them. 
- */ - -#define DELAYED_WORK_LIMIT 32 - -#define DW_clear_reference 0x01 -#define DW_move_page 0x02 -#define DW_clear_busy 0x04 -#define DW_PAGE_WAKEUP 0x08 - - -struct dw { - vm_page_t dw_m; - int dw_mask; -}; - -static void dw_do_work(vm_object_t object, struct dw *dwp, int dw_count); - - -static void -dw_do_work( - vm_object_t object, - struct dw *dwp, - int dw_count) -{ - vm_page_t m; - int j; - - /* - * pageout_scan takes the vm_page_lock_queues first - * then tries for the object lock... to avoid what - * is effectively a lock inversion, we'll go to the - * trouble of taking them in that same order... otherwise - * if this object contains the majority of the pages resident - * in the UBC (or a small set of large objects actively being - * worked on contain the majority of the pages), we could - * cause the pageout_scan thread to 'starve' in its attempt - * to find pages to move to the free queue, since it has to - * successfully acquire the object lock of any candidate page - * before it can steal/clean it. 
- */ - if (!vm_page_trylockspin_queues()) { - vm_object_unlock(object); - - vm_page_lockspin_queues(); - - for (j = 0; ; j++) { - if (!vm_object_lock_avoid(object) && - _vm_object_lock_try(object)) - break; - vm_page_unlock_queues(); - mutex_pause(j); - vm_page_lockspin_queues(); - } - } - for (j = 0; j < dw_count; j++, dwp++) { - - m = dwp->dw_m; - - if (dwp->dw_mask & DW_clear_reference) - m->reference = FALSE; - - if (dwp->dw_mask & DW_move_page) { - VM_PAGE_QUEUES_REMOVE(m); - - assert(!m->laundry); - assert(m->object != kernel_object); - assert(m->pageq.next == NULL && - m->pageq.prev == NULL); - - if (m->zero_fill) { - queue_enter_first(&vm_page_queue_zf, m, vm_page_t, pageq); - vm_zf_queue_count++; - } else { - queue_enter_first(&vm_page_queue_inactive, m, vm_page_t, pageq); - } - m->inactive = TRUE; - - if (!m->fictitious) { - vm_page_inactive_count++; - token_new_pagecount++; - } else { - assert(m->phys_page == vm_page_fictitious_addr); - } - } - if (dwp->dw_mask & DW_clear_busy) - dwp->dw_m->busy = FALSE; - - if (dwp->dw_mask & DW_PAGE_WAKEUP) - PAGE_WAKEUP(dwp->dw_m); - } - vm_page_unlock_queues(); - -#if CONFIG_EMBEDDED - { - int percent_avail; - - /* - * Decide if we need to send a memory status notification. - */ - percent_avail = - (vm_page_active_count + vm_page_inactive_count + - vm_page_speculative_count + vm_page_free_count + - (IP_VALID(memory_manager_default)?0:vm_page_purgeable_count) ) * 100 / - atop_64(max_mem); - if (percent_avail >= (kern_memorystatus_level + 5) || - percent_avail <= (kern_memorystatus_level - 5)) { - kern_memorystatus_level = percent_avail; - thread_wakeup((event_t)&kern_memorystatus_wakeup); - } - } -#endif -} +#endif /* VM_OBJECT_CACHE */ @@ -2121,6 +2365,7 @@ typedef uint64_t chunk_state_t; MARK_PAGE_HANDLED(c, p); \ MACRO_END + /* * Return true if all pages in the chunk have not yet been processed. */ @@ -2211,6 +2456,7 @@ page_is_paged_out( } + /* * Deactivate the pages in the specified object and range. 
If kill_page is set, also discard any * page modified state from the pmap. Update the chunk_state as we go along. The caller must specify @@ -2232,9 +2478,10 @@ deactivate_pages_in_object( { vm_page_t m; int p; - struct dw dw_array[DELAYED_WORK_LIMIT]; - struct dw *dwp; + struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT]; + struct vm_page_delayed_work *dwp; int dw_count; + int dw_limit; unsigned int reusable = 0; @@ -2247,6 +2494,7 @@ deactivate_pages_in_object( dwp = &dw_array[0]; dw_count = 0; + dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT); for(p = 0; size && CHUNK_NOT_COMPLETE(*chunk_state); p++, size -= PAGE_SIZE_64, offset += PAGE_SIZE_64) { @@ -2307,39 +2555,23 @@ deactivate_pages_in_object( object->reusable_page_count++; assert(object->resident_page_count >= object->reusable_page_count); reusable++; -#if CONFIG_EMBEDDED - } else { - if (m->reusable) { - m->reusable = FALSE; - object->reusable_page_count--; - } -#endif } } pmap_clear_refmod(m->phys_page, clear_refmod); if (!m->throttled && !(reusable_page || all_reusable)) dwp->dw_mask |= DW_move_page; - /* - * dw_do_work may need to drop the object lock - * if it does, we need the pages its looking at to - * be held stable via the busy bit. 
- */ - m->busy = TRUE; - dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP); - - dwp->dw_m = m; - dwp++; - dw_count++; + + VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count); - if (dw_count >= DELAYED_WORK_LIMIT) { + if (dw_count >= dw_limit) { if (reusable) { OSAddAtomic(reusable, &vm_page_stats_reusable.reusable_count); vm_page_stats_reusable.reusable += reusable; reusable = 0; } - dw_do_work(object, &dw_array[0], dw_count); + vm_page_do_delayed_work(object, &dw_array[0], dw_count); dwp = &dw_array[0]; dw_count = 0; @@ -2378,7 +2610,7 @@ deactivate_pages_in_object( } if (dw_count) - dw_do_work(object, &dw_array[0], dw_count); + vm_page_do_delayed_work(object, &dw_array[0], dw_count); } @@ -2447,7 +2679,7 @@ deactivate_a_chunk( kill_page = FALSE; reusable_page = FALSE; all_reusable = FALSE; - offset += object->shadow_offset; + offset += object->vo_shadow_offset; vm_object_lock(tmp_object); } @@ -2492,20 +2724,19 @@ vm_object_deactivate_pages( all_reusable = FALSE; if (reusable_page && - object->size != 0 && - object->size == size && + object->internal && + object->vo_size != 0 && + object->vo_size == size && object->reusable_page_count == 0) { all_reusable = TRUE; reusable_page = FALSE; } -#if CONFIG_EMBEDDED if ((reusable_page || all_reusable) && object->all_reusable) { /* This means MADV_FREE_REUSABLE has been called twice, which * is probably illegal. 
*/ return; } -#endif while (size) { length = deactivate_a_chunk(object, offset, size, kill_page, reusable_page, all_reusable); @@ -2564,7 +2795,7 @@ vm_object_reuse_pages( if (object->all_reusable) { assert(object->reusable_page_count == 0); object->all_reusable = FALSE; - if (end_offset - start_offset == object->size || + if (end_offset - start_offset == object->vo_size || !allow_partial_reuse) { vm_page_stats_reusable.all_reuse_calls++; reused = object->resident_page_count; @@ -2662,10 +2893,10 @@ vm_object_pmap_protect( } else { vm_object_offset_t phys_start, phys_end, phys_addr; - phys_start = object->shadow_offset + offset; + phys_start = object->vo_shadow_offset + offset; phys_end = phys_start + size; assert(phys_start <= phys_end); - assert(phys_end <= object->shadow_offset + object->size); + assert(phys_end <= object->vo_shadow_offset + object->vo_size); vm_object_unlock(object); for (phys_addr = phys_start; @@ -2754,7 +2985,7 @@ vm_object_pmap_protect( next_object = object->shadow; if (next_object != VM_OBJECT_NULL) { - offset += object->shadow_offset; + offset += object->vo_shadow_offset; vm_object_lock(next_object); vm_object_unlock(object); object = next_object; @@ -2858,6 +3089,8 @@ vm_object_copy_slowly( fault_info.hi_offset = src_offset + size; fault_info.no_cache = FALSE; fault_info.stealth = TRUE; + fault_info.io_sync = FALSE; + fault_info.cs_bypass = FALSE; fault_info.mark_zf_absent = FALSE; for ( ; @@ -2965,10 +3198,6 @@ vm_object_copy_slowly( case VM_FAULT_RETRY: break; - case VM_FAULT_FICTITIOUS_SHORTAGE: - vm_page_more_fictitious(); - break; - case VM_FAULT_MEMORY_SHORTAGE: if (vm_page_wait(interruptible)) break; @@ -3198,8 +3427,8 @@ vm_object_copy_call( vm_object_lock(src_object); goto Retry; } - if (copy->size < src_offset+size) - copy->size = src_offset+size; + if (copy->vo_size < src_offset+size) + copy->vo_size = src_offset+size; if (!copy->pager_ready) check_ready = TRUE; @@ -3365,7 +3594,7 @@ vm_object_copy_delayed( * needed). 
*/ - if (old_copy->size < copy_size) { + if (old_copy->vo_size < copy_size) { if (src_object_shared == TRUE) { vm_object_unlock(old_copy); vm_object_unlock(src_object); @@ -3385,7 +3614,7 @@ vm_object_copy_delayed( queue_iterate(&src_object->memq, p, vm_page_t, listq) { if (!p->fictitious && - p->offset >= old_copy->size && + p->offset >= old_copy->vo_size && p->offset < copy_size) { if (VM_PAGE_WIRED(p)) { vm_object_unlock(old_copy); @@ -3403,7 +3632,7 @@ vm_object_copy_delayed( } } } - old_copy->size = copy_size; + old_copy->vo_size = copy_size; } if (src_object_shared == TRUE) vm_object_reference_shared(old_copy); @@ -3426,8 +3655,8 @@ vm_object_copy_delayed( * copy object will be large enough to back either the * old copy object or the new mapping. */ - if (old_copy->size > copy_size) - copy_size = old_copy->size; + if (old_copy->vo_size > copy_size) + copy_size = old_copy->vo_size; if (new_copy == VM_OBJECT_NULL) { vm_object_unlock(old_copy); @@ -3439,7 +3668,7 @@ vm_object_copy_delayed( src_object_shared = FALSE; goto Retry; } - new_copy->size = copy_size; + new_copy->vo_size = copy_size; /* * The copy-object is always made large enough to @@ -3449,7 +3678,7 @@ vm_object_copy_delayed( */ assert((old_copy->shadow == src_object) && - (old_copy->shadow_offset == (vm_object_offset_t) 0)); + (old_copy->vo_shadow_offset == (vm_object_offset_t) 0)); } else if (new_copy == VM_OBJECT_NULL) { vm_object_unlock(src_object); @@ -3521,7 +3750,7 @@ vm_object_copy_delayed( */ vm_object_lock_assert_exclusive(new_copy); new_copy->shadow = src_object; - new_copy->shadow_offset = 0; + new_copy->vo_shadow_offset = 0; new_copy->shadowed = TRUE; /* caller must set needs_copy */ vm_object_lock_assert_exclusive(src_object); @@ -3653,7 +3882,7 @@ vm_object_copy_strategically( * The new object and offset into that object * are returned in the source parameters. 
*/ -boolean_t vm_object_shadow_check = FALSE; +boolean_t vm_object_shadow_check = TRUE; __private_extern__ boolean_t vm_object_shadow( @@ -3684,11 +3913,19 @@ vm_object_shadow( /* * Determine if we really need a shadow. + * + * If the source object is larger than what we are trying + * to create, then force the shadow creation even if the + * ref count is 1. This will allow us to [potentially] + * collapse the underlying object away in the future + * (freeing up the extra data it might contain and that + * we don't need). */ - - if (vm_object_shadow_check && source->ref_count == 1 && + if (vm_object_shadow_check && + source->vo_size == length && + source->ref_count == 1 && (source->shadow == VM_OBJECT_NULL || - source->shadow->copy == VM_OBJECT_NULL)) + source->shadow->copy == VM_OBJECT_NULL) ) { source->shadowed = FALSE; return FALSE; @@ -3715,7 +3952,7 @@ vm_object_shadow( * and fix up the offset into the new object. */ - result->shadow_offset = *offset; + result->vo_shadow_offset = *offset; /* * Return the new things @@ -4088,21 +4325,21 @@ vm_object_pager_create( object->paging_offset = 0; #if MACH_PAGEMAP - size = object->size; + size = object->vo_size; #endif /* MACH_PAGEMAP */ vm_object_unlock(object); #if MACH_PAGEMAP map = vm_external_create(size); vm_object_lock(object); - assert(object->size == size); + assert(object->vo_size == size); object->existence_map = map; vm_object_unlock(object); #endif /* MACH_PAGEMAP */ - if ((uint32_t) object->size != object->size) { + if ((uint32_t) object->vo_size != object->vo_size) { panic("vm_object_pager_create(): object size 0x%llx >= 4GB\n", - (uint64_t) object->size); + (uint64_t) object->vo_size); } /* @@ -4121,8 +4358,8 @@ vm_object_pager_create( assert(object->temporary); /* create our new memory object */ - assert((vm_size_t) object->size == object->size); - (void) memory_object_create(dmm, (vm_size_t) object->size, + assert((vm_size_t) object->vo_size == object->vo_size); + (void) memory_object_create(dmm, 
(vm_size_t) object->vo_size, &pager); memory_object_default_deallocate(dmm); @@ -4140,7 +4377,7 @@ vm_object_pager_create( * copied by vm_object_enter(). */ - if (vm_object_enter(pager, object->size, TRUE, TRUE, FALSE) != object) + if (vm_object_enter(pager, object->vo_size, TRUE, TRUE, FALSE) != object) panic("vm_object_pager_create: mismatch"); /* @@ -4222,8 +4459,8 @@ vm_object_do_collapse( vm_object_lock_assert_exclusive(object); vm_object_lock_assert_exclusive(backing_object); - backing_offset = object->shadow_offset; - size = object->size; + backing_offset = object->vo_shadow_offset; + size = object->vo_size; /* * Move all in-memory pages from backing_object @@ -4363,10 +4600,10 @@ vm_object_do_collapse( * this code should be fixed to salvage the map. */ assert(object->existence_map == VM_EXTERNAL_NULL); - if (backing_offset || (size != backing_object->size)) { + if (backing_offset || (size != backing_object->vo_size)) { vm_external_discarded++; vm_external_destroy(backing_object->existence_map, - backing_object->size); + backing_object->vo_size); } else { vm_external_collapsed++; @@ -4385,10 +4622,10 @@ vm_object_do_collapse( assert(!backing_object->phys_contiguous); object->shadow = backing_object->shadow; if (object->shadow) { - object->shadow_offset += backing_object->shadow_offset; + object->vo_shadow_offset += backing_object->vo_shadow_offset; } else { /* no shadow, therefore no shadow offset... */ - object->shadow_offset = 0; + object->vo_shadow_offset = 0; } assert((object->shadow == VM_OBJECT_NULL) || (object->shadow->copy != backing_object)); @@ -4456,10 +4693,10 @@ vm_object_do_bypass( assert(!backing_object->phys_contiguous); object->shadow = backing_object->shadow; if (object->shadow) { - object->shadow_offset += backing_object->shadow_offset; + object->vo_shadow_offset += backing_object->vo_shadow_offset; } else { /* no shadow, therefore no shadow offset... 
*/ - object->shadow_offset = 0; + object->vo_shadow_offset = 0; } /* @@ -4762,7 +4999,7 @@ vm_object_collapse( * we have to make sure no pages in the backing object * "show through" before bypassing it. */ - size = atop(object->size); + size = atop(object->vo_size); rcount = object->resident_page_count; if (rcount != size) { vm_object_offset_t offset; @@ -4821,7 +5058,7 @@ vm_object_collapse( * */ - backing_offset = object->shadow_offset; + backing_offset = object->vo_shadow_offset; backing_rcount = backing_object->resident_page_count; #if MACH_PAGEMAP @@ -4894,7 +5131,7 @@ vm_object_collapse( } offset = (p->offset - backing_offset); - if (offset < object->size && + if (offset < object->vo_size && offset != hint_offset && !EXISTS_IN_OBJECT(object, offset, rc)) { /* found a dependency */ @@ -4928,7 +5165,7 @@ vm_object_collapse( offset = hint_offset; while((offset = - (offset + PAGE_SIZE_64 < object->size) ? + (offset + PAGE_SIZE_64 < object->vo_size) ? (offset + PAGE_SIZE_64) : 0) != hint_offset) { /* Until we get more than one lookup lock */ @@ -5148,7 +5385,7 @@ vm_object_coalesce( * Extend the object if necessary. 
*/ newsize = prev_offset + prev_size + next_size; - if (newsize > prev_object->size) { + if (newsize > prev_object->vo_size) { #if MACH_PAGEMAP /* * We cannot extend an object that has existence info, @@ -5161,7 +5398,7 @@ vm_object_coalesce( */ assert(prev_object->existence_map == VM_EXTERNAL_NULL); #endif /* MACH_PAGEMAP */ - prev_object->size = newsize; + prev_object->vo_size = newsize; } vm_object_unlock(prev_object); @@ -5346,7 +5583,7 @@ vm_object_print(db_expr_t db_addr, __unused boolean_t have_addr, db_indent += 2; - iprintf("size=0x%x", object->size); + iprintf("size=0x%x", object->vo_size); printf(", memq_hint=%p", object->memq_hint); printf(", ref_count=%d\n", object->ref_count); iprintf(""); @@ -5364,7 +5601,7 @@ vm_object_print(db_expr_t db_addr, __unused boolean_t have_addr, printf(" (depth %d)", i); } printf(", copy=0x%x", object->copy); - printf(", shadow_offset=0x%x", object->shadow_offset); + printf(", shadow_offset=0x%x", object->vo_shadow_offset); printf(", last_alloc=0x%x\n", object->last_alloc); iprintf("pager=0x%x", object->pager); @@ -5455,7 +5692,7 @@ vm_object_print(db_expr_t db_addr, __unused boolean_t have_addr, #if MACH_PAGEMAP iprintf("existence_map="); - vm_external_print(object->existence_map, object->size); + vm_external_print(object->existence_map, object->vo_size); #endif /* MACH_PAGEMAP */ #if MACH_ASSERT iprintf("paging_object=0x%x\n", object->paging_object); @@ -5642,8 +5879,8 @@ vm_object_populate_with_private( /* shadows on contiguous memory are not allowed */ /* we therefore can use the offset field */ - object->shadow_offset = (vm_object_offset_t)phys_page << PAGE_SHIFT; - object->size = size; + object->vo_shadow_offset = (vm_object_offset_t)phys_page << PAGE_SHIFT; + object->vo_size = size; } vm_object_unlock(object); return KERN_SUCCESS; @@ -6585,8 +6822,8 @@ vm_object_transpose( vm_object_paging_only_wait(object2, THREAD_UNINT); - if (object1->size != object2->size || - object1->size != transpose_size) { + if 
(object1->vo_size != object2->vo_size || + object1->vo_size != transpose_size) { /* * If the 2 objects don't have the same size, we can't * exchange their backing stores or one would overflow. @@ -6661,7 +6898,7 @@ MACRO_END /* "Lock" refers to the object not its contents */ /* "size" should be identical */ - assert(object1->size == object2->size); + assert(object1->vo_size == object2->vo_size); /* "memq_hint" was updated above when transposing pages */ /* "ref_count" refers to the object not its contents */ #if TASK_SWAPPER @@ -6676,7 +6913,7 @@ MACRO_END /* there should be no "shadow" */ assert(!object1->shadow); assert(!object2->shadow); - __TRANSPOSE_FIELD(shadow_offset); /* used by phys_contiguous objects */ + __TRANSPOSE_FIELD(vo_shadow_offset); /* used by phys_contiguous objects */ __TRANSPOSE_FIELD(pager); __TRANSPOSE_FIELD(paging_offset); __TRANSPOSE_FIELD(pager_control); @@ -6735,6 +6972,7 @@ MACRO_END __TRANSPOSE_FIELD(sequential); __TRANSPOSE_FIELD(pages_created); __TRANSPOSE_FIELD(pages_used); + __TRANSPOSE_FIELD(scan_collisions); #if MACH_PAGEMAP __TRANSPOSE_FIELD(existence_map); #endif @@ -6743,6 +6981,7 @@ MACRO_END __TRANSPOSE_FIELD(paging_object); #endif __TRANSPOSE_FIELD(wimg_bits); + __TRANSPOSE_FIELD(set_cache_attr); __TRANSPOSE_FIELD(code_signed); if (object1->hashed) { hash_lck = vm_object_hash_lock_spin(object2->pager); @@ -6825,14 +7064,14 @@ MACRO_END * */ extern int speculative_reads_disabled; +extern int ignore_is_ssd; + #if CONFIG_EMBEDDED unsigned int preheat_pages_max = MAX_UPL_TRANSFER; unsigned int preheat_pages_min = 8; -unsigned int preheat_pages_mult = 4; #else unsigned int preheat_pages_max = MAX_UPL_TRANSFER; unsigned int preheat_pages_min = 8; -unsigned int preheat_pages_mult = 4; #endif uint32_t pre_heat_scaling[MAX_UPL_TRANSFER + 1]; @@ -6855,26 +7094,21 @@ vm_object_cluster_size(vm_object_t object, vm_object_offset_t *start, vm_behavior_t behavior; boolean_t look_behind = TRUE; boolean_t look_ahead = TRUE; + boolean_t isSSD 
= FALSE; uint32_t throttle_limit; int sequential_run; int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL; unsigned int max_ph_size; unsigned int min_ph_size; - unsigned int ph_mult; + unsigned int min_ph_size_in_bytes; assert( !(*length & PAGE_MASK)); assert( !(*start & PAGE_MASK_64)); - if ( (ph_mult = preheat_pages_mult) < 1 ) - ph_mult = 1; - if ( (min_ph_size = preheat_pages_min) < 1 ) - min_ph_size = 1; - if ( (max_ph_size = preheat_pages_max) > MAX_UPL_TRANSFER ) - max_ph_size = MAX_UPL_TRANSFER; - - if ( (max_length = *length) > (max_ph_size * PAGE_SIZE) ) - max_length = (max_ph_size * PAGE_SIZE); - + /* + * remember maximum length of run requested + */ + max_length = *length; /* * we'll always return a cluster size of at least * 1 page, since the original fault must always @@ -6883,7 +7117,7 @@ vm_object_cluster_size(vm_object_t object, vm_object_offset_t *start, *length = PAGE_SIZE; *io_streaming = 0; - if (speculative_reads_disabled || fault_info == NULL || max_length == 0) { + if (speculative_reads_disabled || fault_info == NULL) { /* * no cluster...
just fault the page in */ @@ -6896,12 +7130,39 @@ vm_object_cluster_size(vm_object_t object, vm_object_offset_t *start, vm_object_lock(object); + if (object->pager == MEMORY_OBJECT_NULL) + goto out; /* pager is gone for this object, nothing more to do */ + + if (!ignore_is_ssd) + vnode_pager_get_isSSD(object->pager, &isSSD); + + min_ph_size = preheat_pages_min; + max_ph_size = preheat_pages_max; + + if (isSSD) { + min_ph_size /= 2; + max_ph_size /= 8; + } + if (min_ph_size < 1) + min_ph_size = 1; + + if (max_ph_size < 1) + max_ph_size = 1; + else if (max_ph_size > MAX_UPL_TRANSFER) + max_ph_size = MAX_UPL_TRANSFER; + + if (max_length > (max_ph_size * PAGE_SIZE)) + max_length = max_ph_size * PAGE_SIZE; + + if (max_length <= PAGE_SIZE) + goto out; + + min_ph_size_in_bytes = min_ph_size * PAGE_SIZE; + if (object->internal) - object_size = object->size; - else if (object->pager != MEMORY_OBJECT_NULL) - vnode_pager_get_object_size(object->pager, &object_size); + object_size = object->vo_size; else - goto out; /* pager is gone for this object, nothing more to do */ + vnode_pager_get_object_size(object->pager, &object_size); object_size = round_page_64(object_size); @@ -6929,7 +7190,7 @@ vm_object_cluster_size(vm_object_t object, vm_object_offset_t *start, } } - switch(behavior) { + switch (behavior) { default: behavior = VM_BEHAVIOR_DEFAULT; @@ -6949,25 +7210,25 @@ vm_object_cluster_size(vm_object_t object, vm_object_offset_t *start, *io_streaming = 1; } else { - if (object->pages_created < 32 * ph_mult) { + if (object->pages_created < (20 * min_ph_size)) { /* * prime the pump */ - pre_heat_size = PAGE_SIZE * 8 * ph_mult; - break; + pre_heat_size = min_ph_size_in_bytes; + } else { + /* + * Linear growth in PH size: The maximum size is max_length... + * this calculation will result in a size that is neither a + * power of 2 nor a multiple of PAGE_SIZE...
so round + * it up to the nearest PAGE_SIZE boundary + */ + pre_heat_size = (max_length * object->pages_used) / object->pages_created; + + if (pre_heat_size < min_ph_size_in_bytes) + pre_heat_size = min_ph_size_in_bytes; + else + pre_heat_size = round_page(pre_heat_size); } - /* - * Linear growth in PH size: The maximum size is max_length... - * this cacluation will result in a size that is neither a - * power of 2 nor a multiple of PAGE_SIZE... so round - * it up to the nearest PAGE_SIZE boundary - */ - pre_heat_size = (ph_mult * (max_length * object->pages_used) / object->pages_created); - - if (pre_heat_size < PAGE_SIZE * min_ph_size) - pre_heat_size = PAGE_SIZE * min_ph_size; - else - pre_heat_size = round_page(pre_heat_size); } break; @@ -7003,14 +7264,14 @@ vm_object_cluster_size(vm_object_t object, vm_object_offset_t *start, if (pre_heat_size > max_length) pre_heat_size = max_length; - if (behavior == VM_BEHAVIOR_DEFAULT) { + if (behavior == VM_BEHAVIOR_DEFAULT && (pre_heat_size > min_ph_size_in_bytes)) { if (vm_page_free_count < vm_page_throttle_limit) - pre_heat_size = trunc_page(pre_heat_size / 8); + pre_heat_size = trunc_page(pre_heat_size / 16); else if (vm_page_free_count < vm_page_free_target) - pre_heat_size = trunc_page(pre_heat_size / 2); + pre_heat_size = trunc_page(pre_heat_size / 4); - if (pre_heat_size <= PAGE_SIZE) - goto out; + if (pre_heat_size < min_ph_size_in_bytes) + pre_heat_size = min_ph_size_in_bytes; } if (look_ahead == TRUE) { if (look_behind == TRUE) { @@ -7047,8 +7308,14 @@ vm_object_cluster_size(vm_object_t object, vm_object_offset_t *start, assert((vm_size_t)(orig_start - target_start) == (orig_start - target_start)); tail_size = pre_heat_size - (vm_size_t)(orig_start - target_start) - PAGE_SIZE; } else { - if (pre_heat_size > target_start) - pre_heat_size = (vm_size_t) target_start; /* XXX: 32-bit vs 64-bit ? Joe ? 
*/ + if (pre_heat_size > target_start) { + /* + * since pre_heat_size is always smaller than 2^32, + * if it is larger than target_start (a 64 bit value) + * it is safe to clip target_start to 32 bits + */ + pre_heat_size = (vm_size_t) target_start; + } tail_size = 0; } assert( !(target_start & PAGE_MASK_64)); @@ -7159,7 +7426,7 @@ vm_object_page_op( if(object->phys_contiguous) { if (phys_entry) { *phys_entry = (ppnum_t) - (object->shadow_offset >> PAGE_SHIFT); + (object->vo_shadow_offset >> PAGE_SHIFT); } vm_object_unlock(object); return KERN_SUCCESS; @@ -7340,8 +7607,13 @@ vm_object_range_op( dst_page = vm_page_lookup(object, offset); if (dst_page != VM_PAGE_NULL) { if (ops & UPL_ROP_DUMP) { - if (dst_page->busy || dst_page->cleaning) { - /* + if (dst_page->list_req_pending) { + /* + * This page isn't on a UPL yet. + * So it's safe to steal it here and dump it. + */ + } else if (dst_page->busy || dst_page->cleaning) { + /* * someone else is playing with the * page, we will have to wait */ @@ -7413,12 +7685,15 @@ _vm_object_lock_try(vm_object_t object) boolean_t vm_object_lock_try(vm_object_t object) { - // called from hibernate path so check before blocking - if (vm_object_lock_avoid(object) && ml_get_interrupts_enabled()) { + /* + * Called from hibernate path so check before blocking.
+ */ + if (vm_object_lock_avoid(object) && ml_get_interrupts_enabled() && get_preemption_level()==0) { mutex_pause(2); } return _vm_object_lock_try(object); } + void vm_object_lock_shared(vm_object_t object) { @@ -7436,3 +7711,264 @@ vm_object_lock_try_shared(vm_object_t object) } return (lck_rw_try_lock_shared(&object->Lock)); } + + +unsigned int vm_object_change_wimg_mode_count = 0; + +/* + * The object must be locked + */ +void +vm_object_change_wimg_mode(vm_object_t object, unsigned int wimg_mode) +{ + vm_page_t p; + + vm_object_lock_assert_exclusive(object); + + vm_object_paging_wait(object, THREAD_UNINT); + + queue_iterate(&object->memq, p, vm_page_t, listq) { + + if (!p->fictitious) + pmap_set_cache_attributes(p->phys_page, wimg_mode); + } + if (wimg_mode == VM_WIMG_USE_DEFAULT) + object->set_cache_attr = FALSE; + else + object->set_cache_attr = TRUE; + + object->wimg_bits = wimg_mode; + + vm_object_change_wimg_mode_count++; +} + +#if CONFIG_FREEZE + +__private_extern__ void default_freezer_pack_page(vm_page_t , vm_object_t , vm_object_offset_t, void**); +__private_extern__ void default_freezer_unpack(vm_object_t , void**); + +kern_return_t vm_object_pack( + unsigned int *purgeable_count, + unsigned int *wired_count, + unsigned int *clean_count, + unsigned int *dirty_count, + boolean_t *shared, + vm_object_t src_object, + vm_object_t compact_object, + void **table, + vm_object_offset_t *offset) +{ + kern_return_t kr = KERN_SUCCESS; + + vm_object_lock(src_object); + + *purgeable_count = *wired_count = *clean_count = *dirty_count = 0; + *shared = FALSE; + + if (!src_object->alive || src_object->terminating){ + kr = KERN_FAILURE; + goto done; + } + + if (src_object->purgable == VM_PURGABLE_VOLATILE) { + *purgeable_count = src_object->resident_page_count; + + /* If the destination object is null, we're just walking the pages to discover how many can be hibernated */ + if (VM_OBJECT_NULL != compact_object) { + purgeable_q_t queue; + /* object should be on a queue 
*/ + assert(src_object->objq.next != NULL && + src_object->objq.prev != NULL); + queue = vm_purgeable_object_remove(src_object); + assert(queue); + vm_page_lock_queues(); + vm_purgeable_token_delete_first(queue); + vm_page_unlock_queues(); + vm_object_purge(src_object); + } + goto done; + } + + if (src_object->ref_count == 1) { + vm_object_pack_pages(wired_count, clean_count, dirty_count, src_object, compact_object, table, offset); + } else { + if (src_object->internal) { + *shared = TRUE; + } + } +done: + vm_object_unlock(src_object); + + return kr; +} + + +void +vm_object_pack_pages( + unsigned int *wired_count, + unsigned int *clean_count, + unsigned int *dirty_count, + vm_object_t src_object, + vm_object_t compact_object, + void **table, + vm_object_offset_t *offset) +{ + vm_page_t p, next; + + next = (vm_page_t)queue_first(&src_object->memq); + + /* Since this function is dual purpose in order that we can count + * the freezable pages as well as prepare them, assert that our + * arguments are sane. Gnarly, but avoids code duplication. 
+ */ + if (VM_OBJECT_NULL == compact_object){ + assert(!table); + assert(!offset); + } else { + assert(table); + assert(offset); + } + + while (!queue_end(&src_object->memq, (queue_entry_t)next)) { + p = next; + next = (vm_page_t)queue_next(&next->listq); + + if (p->fictitious || p->busy ) + continue; + + if (p->absent || p->unusual || p->error) + continue; + + if (VM_PAGE_WIRED(p)) { + (*wired_count)++; + continue; + } + + if (VM_OBJECT_NULL == compact_object) { + if (p->dirty || pmap_is_modified(p->phys_page)) { + (*dirty_count)++; + } else { + (*clean_count)++; + } + continue; + } + + if (p->cleaning) { + p->busy = TRUE; + p->pageout = TRUE; + p->dump_cleaning = TRUE; + + vm_page_lockspin_queues(); + vm_page_wire(p); + vm_page_unlock_queues(); + + continue; + } + + if (p->pmapped == TRUE) { + int refmod_state; + refmod_state = pmap_disconnect(p->phys_page); + if (refmod_state & VM_MEM_MODIFIED) { + p->dirty = TRUE; + } + } + + if (p->dirty) { + p->busy = TRUE; + + default_freezer_pack_page(p, compact_object, *offset, table); + *offset += PAGE_SIZE; + + (*dirty_count)++; + } + else { + VM_PAGE_FREE(p); + (*clean_count)++; + } + } +} + +void +vm_object_pageout( + vm_object_t object) +{ + vm_page_t p, next; + + assert(object != VM_OBJECT_NULL ); + + vm_object_lock(object); + + next = (vm_page_t)queue_first(&object->memq); + + while (!queue_end(&object->memq, (queue_entry_t)next)) { + p = next; + next = (vm_page_t)queue_next(&next->listq); + + /* Throw to the pageout queue */ + vm_page_lockspin_queues(); + + VM_PAGE_QUEUES_REMOVE(p); + vm_pageout_cluster(p); + + vm_page_unlock_queues(); + } + + vm_object_unlock(object); +} + +kern_return_t +vm_object_pagein( + vm_object_t object) +{ + memory_object_t pager; + kern_return_t kr; + + vm_object_lock(object); + + pager = object->pager; + + if (!object->pager_ready || pager == MEMORY_OBJECT_NULL) { + vm_object_unlock(object); + return KERN_FAILURE; + } + + vm_object_paging_wait(object, THREAD_UNINT); + 
vm_object_paging_begin(object); + + object->blocked_access = TRUE; + vm_object_unlock(object); + + kr = memory_object_data_reclaim(pager, TRUE); + + vm_object_lock(object); + + object->blocked_access = FALSE; + vm_object_paging_end(object); + + vm_object_unlock(object); + + return kr; +} + +void +vm_object_unpack( + vm_object_t compact_object, + void **table) +{ + /* + * Future Work: + * Right now we treat the default freezer much like + * the default pager with respect to when it is + * created and terminated. + * But, in the future, we may want to terminate the + * default freezer at the very instant that an object + * has been completely re-filled with all its previously + * paged-out pages. + * At that time we'll need to reset the object fields like + * "pager" and the associated "pager_{created,initialized,trusted}" + * fields right here. + */ + default_freezer_unpack(compact_object, table); +} + +#endif /* CONFIG_FREEZE */ diff --git a/osfmk/vm/vm_object.h b/osfmk/vm/vm_object.h index eacd7d65e..0d21734af 100644 --- a/osfmk/vm/vm_object.h +++ b/osfmk/vm/vm_object.h @@ -66,6 +66,8 @@ #ifndef _VM_VM_OBJECT_H_ #define _VM_VM_OBJECT_H_ +#include +#include #include #include @@ -107,20 +109,32 @@ struct vm_object_fault_info { vm_behavior_t behavior; vm_map_offset_t lo_offset; vm_map_offset_t hi_offset; - boolean_t no_cache; - boolean_t stealth; - boolean_t mark_zf_absent; + unsigned int + /* boolean_t */ no_cache:1, + /* boolean_t */ stealth:1, + /* boolean_t */ io_sync:1, + /* boolean_t */ cs_bypass:1, + /* boolean_t */ mark_zf_absent:1, + __vm_object_fault_info_unused_bits:27; }; +#define vo_size vo_un1.vou_size +#define vo_cache_pages_to_scan vo_un1.vou_cache_pages_to_scan +#define vo_shadow_offset vo_un2.vou_shadow_offset +#define vo_cache_ts vo_un2.vou_cache_ts struct vm_object { queue_head_t memq; /* Resident memory */ lck_rw_t Lock; /* Synchronization */ - vm_object_size_t size; /* Object size (only valid - * if internal) - */ + union { + vm_object_size_t
vou_size; /* Object size (only valid if internal) */ + int vou_cache_pages_to_scan; /* pages yet to be visited in an + * external object in cache + */ + } vo_un1; + struct vm_page *memq_hint; int ref_count; /* Number of references */ #if TASK_SWAPPER @@ -139,7 +153,13 @@ struct vm_object { * copy_call. */ struct vm_object *shadow; /* My shadow */ - vm_object_offset_t shadow_offset; /* Offset into shadow */ + + union { + vm_object_offset_t vou_shadow_offset; /* Offset into shadow */ + clock_sec_t vou_cache_ts; /* age of an external object + * present in cache + */ + } vo_un2; memory_object_t pager; /* Where to get data */ vm_object_offset_t paging_offset; /* Offset into memory object */ @@ -303,7 +323,10 @@ struct vm_object { volatile_fault:1, all_reusable:1, blocked_access:1, - __object2_unused_bits:16; /* for expansion */ + set_cache_attr:1, + __object2_unused_bits:15; /* for expansion */ + + uint32_t scan_collisions; #if UPL_DEBUG queue_head_t uplq; /* List of outstanding upls */ @@ -321,7 +344,7 @@ struct vm_object { } pip_holders[VM_PIP_DEBUG_MAX_REFS]; #endif /* VM_PIP_DEBUG */ - queue_chain_t objq; /* object queue - currently used for purgable queues */ + queue_chain_t objq; /* object queue - currently used for purgable queues */ }; #define VM_OBJECT_PURGEABLE_FAULT_ERROR(object) \ @@ -644,6 +667,10 @@ __private_extern__ kern_return_t vm_object_populate_with_private( ppnum_t phys_page, vm_size_t size); +__private_extern__ void vm_object_change_wimg_mode( + vm_object_t object, + unsigned int wimg_mode); + extern kern_return_t adjust_vm_object_cache( vm_size_t oval, vm_size_t nval); @@ -671,6 +698,41 @@ __private_extern__ void vm_object_reap_pages( #define REAP_PURGEABLE 2 #define REAP_DATA_FLUSH 3 +#if CONFIG_FREEZE + +__private_extern__ kern_return_t +vm_object_pack( + unsigned int *purgeable_count, + unsigned int *wired_count, + unsigned int *clean_count, + unsigned int *dirty_count, + boolean_t *shared, + vm_object_t src_object, + vm_object_t dst_object, + 
void **table, + vm_object_offset_t *offset); + +__private_extern__ void +vm_object_pack_pages( + unsigned int *wired_count, + unsigned int *clean_count, + unsigned int *dirty_count, + vm_object_t src_object, + vm_object_t dst_object, + void **table, + vm_object_offset_t *offset); + +__private_extern__ void vm_object_pageout( + vm_object_t object); + +__private_extern__ kern_return_t vm_object_pagein( + vm_object_t object); + +__private_extern__ void vm_object_unpack( + vm_object_t object, + void **table); + +#endif /* CONFIG_FREEZE */ /* * Event waiting handling @@ -881,20 +943,24 @@ extern boolean_t vm_object_lock_try_shared(vm_object_t); * check if anyone is holding the lock, but the holder may not necessarily * be the caller... */ -#if DEBUG +#if MACH_ASSERT || DEBUG #define vm_object_lock_assert_held(object) \ lck_rw_assert(&(object)->Lock, LCK_RW_ASSERT_HELD) #define vm_object_lock_assert_shared(object) \ lck_rw_assert(&(object)->Lock, LCK_RW_ASSERT_SHARED) #define vm_object_lock_assert_exclusive(object) \ lck_rw_assert(&(object)->Lock, LCK_RW_ASSERT_EXCLUSIVE) -#else /* DEBUG */ +#else /* MACH_ASSERT || DEBUG */ #define vm_object_lock_assert_held(object) #define vm_object_lock_assert_shared(object) #define vm_object_lock_assert_exclusive(object) -#endif /* DEBUG */ +#endif /* MACH_ASSERT || DEBUG */ #define vm_object_round_page(x) (((vm_object_offset_t)(x) + PAGE_MASK) & ~((signed)PAGE_MASK)) #define vm_object_trunc_page(x) ((vm_object_offset_t)(x) & ~((signed)PAGE_MASK)) +extern void vm_object_cache_add(vm_object_t); +extern void vm_object_cache_remove(vm_object_t); +extern int vm_object_cache_evict(int, int); + #endif /* _VM_VM_OBJECT_H_ */ diff --git a/osfmk/vm/vm_page.h b/osfmk/vm/vm_page.h index ecae81c15..543a0c6f5 100644 --- a/osfmk/vm/vm_page.h +++ b/osfmk/vm/vm_page.h @@ -114,7 +114,6 @@ #define VM_PAGE_SPECULATIVE_Q_AGE_MS 500 - struct vm_speculative_age_q { /* * memory queue for speculative pages via clustered pageins @@ -124,11 +123,13 @@ struct 
vm_speculative_age_q { }; + extern struct vm_speculative_age_q vm_page_queue_speculative[]; extern int speculative_steal_index; extern int speculative_age_index; +extern unsigned int vm_page_speculative_q_age_ms; /* @@ -179,6 +180,7 @@ struct vm_page { #define local_id wire_count unsigned int wire_count:16, /* how many wired down maps use me? (O&P) */ /* boolean_t */ inactive:1, /* page is in inactive list (P) */ + zero_fill:1, active:1, /* page is in active list (P) */ pageout_queue:1,/* page is on queue for pageout (P) */ speculative:1, /* page is on speculative list (P) */ @@ -190,7 +192,9 @@ struct vm_page { * the free list (P) */ throttled:1, /* pager is not responding (P) */ local:1, - __unused_pageq_bits:5; /* 5 bits available here */ + no_cache:1, /* page is not to be cached and should + * be reused ahead of other pages (P) */ + __unused_pageq_bits:3; /* 3 bits available here */ ppnum_t phys_page; /* Physical address of page, passed * to pmap_enter (read-only) */ @@ -244,13 +248,10 @@ struct vm_page { /* a pageout candidate */ cs_validated:1, /* code-signing: page was checked */ cs_tainted:1, /* code-signing: page is tainted */ - no_cache:1, /* page is not to be cached and */ - /* should be reused ahead of */ - /* other pages */ - zero_fill:1, reusable:1, lopage:1, - __unused_object_bits:6; /* 6 bits available here */ + slid:1, + __unused_object_bits:7; /* 7 bits available here */ #if __LP64__ unsigned int __unused_padding; /* Pad structure explicitly @@ -400,8 +401,6 @@ queue_head_t vm_page_queue_free[MAX_COLORS]; /* memory free queue */ extern queue_head_t vm_lopage_queue_free; /* low memory free queue */ extern -vm_page_t vm_page_queue_fictitious; /* fictitious free queue */ -extern queue_head_t vm_page_queue_active; /* active memory queue */ extern queue_head_t vm_page_queue_inactive; /* inactive memory queue for normal pages */ @@ -545,7 +544,7 @@ extern vm_page_t vm_page_alloc_guard( extern void vm_page_init( vm_page_t page, ppnum_t phys_page, - 
boolean_t lopage); + boolean_t lopage); extern void vm_page_free( vm_page_t page); @@ -648,6 +647,9 @@ extern void vm_page_free_prepare_object( vm_page_t page, boolean_t remove_from_hash); +extern void vm_check_memorystatus(void); + + /* * Functions implemented as macros. m->wanted and m->busy are * protected by the object lock. @@ -744,6 +746,7 @@ extern void vm_page_queues_assert(vm_page_t mem, int val); assert(mem->object != kernel_object); \ assert(!mem->inactive && !mem->speculative); \ assert(!mem->active && !mem->throttled); \ + assert(!mem->fictitious); \ lq = &vm_page_local_q[mem->local_id].vpl_un.vpl; \ VPL_LOCK(&lq->vpl_lock); \ queue_remove(&lq->vpl_queue, \ @@ -753,25 +756,23 @@ extern void vm_page_queues_assert(vm_page_t mem, int val); lq->vpl_count--; \ VPL_UNLOCK(&lq->vpl_lock); \ } \ - if (mem->active) { \ + \ + else if (mem->active) { \ assert(mem->object != kernel_object); \ assert(!mem->inactive && !mem->speculative); \ assert(!mem->throttled); \ + assert(!mem->fictitious); \ queue_remove(&vm_page_queue_active, \ mem, vm_page_t, pageq); \ mem->active = FALSE; \ - if (!mem->fictitious) { \ - vm_page_active_count--; \ - } else { \ - assert(mem->phys_page == \ - vm_page_fictitious_addr); \ - } \ + vm_page_active_count--; \ } \ \ else if (mem->inactive) { \ assert(mem->object != kernel_object); \ assert(!mem->active && !mem->speculative); \ assert(!mem->throttled); \ + assert(!mem->fictitious); \ if (mem->zero_fill) { \ queue_remove(&vm_page_queue_zf, \ mem, vm_page_t, pageq); \ @@ -781,23 +782,18 @@ extern void vm_page_queues_assert(vm_page_t mem, int val); mem, vm_page_t, pageq); \ } \ mem->inactive = FALSE; \ - if (!mem->fictitious) { \ - vm_page_inactive_count--; \ - vm_purgeable_q_advance_all(); \ - } else { \ - assert(mem->phys_page == \ - vm_page_fictitious_addr); \ - } \ + vm_page_inactive_count--; \ + vm_purgeable_q_advance_all(); \ } \ \ else if (mem->throttled) { \ assert(!mem->active && !mem->inactive); \ assert(!mem->speculative); \ + 
assert(!mem->fictitious); \ queue_remove(&vm_page_queue_throttled, \ mem, vm_page_t, pageq); \ mem->throttled = FALSE; \ - if (!mem->fictitious) \ - vm_page_throttled_count--; \ + vm_page_throttled_count--; \ } \ \ else if (mem->speculative) { \ @@ -808,12 +804,39 @@ extern void vm_page_queues_assert(vm_page_t mem, int val); mem->speculative = FALSE; \ vm_page_speculative_count--; \ } \ + \ + else if (mem->pageq.next || mem->pageq.prev) \ + panic("VM_PAGE_QUEUES_REMOVE: unmarked page on Q"); \ mem->pageq.next = NULL; \ mem->pageq.prev = NULL; \ VM_PAGE_QUEUES_ASSERT(mem, 0); \ MACRO_END +#define VM_PAGE_ENQUEUE_INACTIVE(mem, first) \ + MACRO_BEGIN \ + VM_PAGE_QUEUES_ASSERT(mem, 0); \ + assert(!mem->fictitious); \ + assert(!mem->laundry); \ + assert(!mem->pageout_queue); \ + if (mem->zero_fill) { \ + if (first == TRUE) \ + queue_enter_first(&vm_page_queue_zf, mem, vm_page_t, pageq); \ + else \ + queue_enter(&vm_page_queue_zf, mem, vm_page_t, pageq); \ + vm_zf_queue_count++; \ + } else { \ + if (first == TRUE) \ + queue_enter_first(&vm_page_queue_inactive, mem, vm_page_t, pageq); \ + else \ + queue_enter(&vm_page_queue_inactive, mem, vm_page_t, pageq); \ + } \ + mem->inactive = TRUE; \ + vm_page_inactive_count++; \ + token_new_pagecount++; \ + MACRO_END + + #if DEVELOPMENT || DEBUG #define VM_PAGE_SPECULATIVE_USED_ADD() \ MACRO_BEGIN \ @@ -834,4 +857,71 @@ extern void vm_page_queues_assert(vm_page_t mem, int val); } \ MACRO_END + + +#define DW_vm_page_unwire 0x01 +#define DW_vm_page_wire 0x02 +#define DW_vm_page_free 0x04 +#define DW_vm_page_activate 0x08 +#define DW_vm_page_deactivate_internal 0x10 +#define DW_vm_page_speculate 0x20 +#define DW_vm_page_lru 0x40 +#define DW_vm_pageout_throttle_up 0x80 +#define DW_PAGE_WAKEUP 0x100 +#define DW_clear_busy 0x200 +#define DW_clear_reference 0x400 +#define DW_set_reference 0x800 +#define DW_move_page 0x1000 +#define DW_VM_PAGE_QUEUES_REMOVE 0x2000 +#define DW_set_list_req_pending 0x4000 + +struct vm_page_delayed_work { + 
vm_page_t dw_m; + int dw_mask; +}; + +void vm_page_do_delayed_work(vm_object_t object, struct vm_page_delayed_work *dwp, int dw_count); + +extern unsigned int vm_max_delayed_work_limit; + +#define DEFAULT_DELAYED_WORK_LIMIT 32 + +#define DELAYED_WORK_LIMIT(max) ((vm_max_delayed_work_limit >= max ? max : vm_max_delayed_work_limit)) + +/* + * vm_page_do_delayed_work may need to drop the object lock... + * if it does, we need the pages it's looking at to + * be held stable via the busy bit, so if busy isn't already + * set, we need to set it and ask vm_page_do_delayed_work + * to clear it and wakeup anyone that might have blocked on + * it once we're done processing the page. + * + * additionally, we can't call vm_page_do_delayed_work with + * list_req_pending == TRUE since it may need to + * drop the object lock before dealing + * with this page and because list_req_pending == TRUE, + * busy == TRUE will NOT protect this page from being stolen + * so clear list_req_pending and ask vm_page_do_delayed_work + * to re-set it once it holds both the pageq and object locks + */ + +#define VM_PAGE_ADD_DELAYED_WORK(dwp, mem, dw_cnt) \ + MACRO_BEGIN \ + if (mem->busy == FALSE) { \ + mem->busy = TRUE; \ + if ( !(dwp->dw_mask & DW_vm_page_free)) \ + dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP); \ + } \ + if (mem->list_req_pending) { \ + mem->list_req_pending = FALSE; \ + dwp->dw_mask |= DW_set_list_req_pending; \ + } \ + dwp->dw_m = mem; \ + dwp++; \ + dw_count++; \ + MACRO_END + +extern vm_page_t vm_object_page_grab(vm_object_t); + + #endif /* _VM_VM_PAGE_H_ */ diff --git a/osfmk/vm/vm_pageout.c b/osfmk/vm/vm_pageout.c index 4098fb8bc..acf4d64bd 100644 --- a/osfmk/vm/vm_pageout.c +++ b/osfmk/vm/vm_pageout.c @@ -95,9 +95,7 @@ #include #include -#if CONFIG_EMBEDDED #include -#endif #include #include @@ -108,7 +106,7 @@ #include /* must be last */ #include #include - +#include /* * ENCRYPTED SWAP: */ @@ -119,6 +117,8 @@ extern u_int32_t random(void); /* from */ #include #endif 
+extern void consider_pressure_events(void); + #ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE /* maximum iterations of the active queue to move pages to inactive */ #define VM_PAGEOUT_BURST_ACTIVE_THROTTLE 100 #endif @@ -140,11 +140,11 @@ extern u_int32_t random(void); /* from */ #endif #ifndef VM_PAGE_LAUNDRY_MAX -#define VM_PAGE_LAUNDRY_MAX 16UL /* maximum pageouts on a given pageout queue */ +#define VM_PAGE_LAUNDRY_MAX 128UL /* maximum pageouts on a given pageout queue */ #endif /* VM_PAGEOUT_LAUNDRY_MAX */ #ifndef VM_PAGEOUT_BURST_WAIT -#define VM_PAGEOUT_BURST_WAIT 30 /* milliseconds per page */ +#define VM_PAGEOUT_BURST_WAIT 30 /* milliseconds */ #endif /* VM_PAGEOUT_BURST_WAIT */ #ifndef VM_PAGEOUT_EMPTY_WAIT @@ -159,10 +159,14 @@ extern u_int32_t random(void); /* from */ #define VM_PAGEOUT_IDLE_WAIT 10 /* milliseconds */ #endif /* VM_PAGEOUT_IDLE_WAIT */ +unsigned int vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS; +unsigned int vm_page_speculative_percentage = 5; + #ifndef VM_PAGE_SPECULATIVE_TARGET -#define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / 20) +#define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_page_speculative_percentage)) #endif /* VM_PAGE_SPECULATIVE_TARGET */ + #ifndef VM_PAGE_INACTIVE_HEALTHY_LIMIT #define VM_PAGE_INACTIVE_HEALTHY_LIMIT(total) ((total) * 1 / 200) #endif /* VM_PAGE_INACTIVE_HEALTHY_LIMIT */ @@ -210,6 +214,7 @@ extern u_int32_t random(void); /* from */ #endif #endif /* VM_PAGE_FREE_MIN */ +#define VM_PAGE_FREE_RESERVED_LIMIT 100 #define VM_PAGE_FREE_MIN_LIMIT 1500 #define VM_PAGE_FREE_TARGET_LIMIT 2000 @@ -282,6 +287,8 @@ unsigned int vm_pageout_inactive_relief = 0; unsigned int vm_pageout_burst_active_throttle = 0; unsigned int vm_pageout_burst_inactive_throttle = 0; +int vm_upl_wait_for_pages = 0; + /* * Protection against zero fill flushing live working sets derived * from existing backing store and files @@ -290,11 +297,7 @@ unsigned int vm_accellerate_zf_pageout_trigger = 400; unsigned int 
zf_queue_min_count = 100; unsigned int vm_zf_queue_count = 0; -#if defined(__ppc__) /* On ppc, vm statistics are still 32-bit */ -unsigned int vm_zf_count = 0; -#else uint64_t vm_zf_count __attribute__((aligned(8))) = 0; -#endif /* * These variables record the pageout daemon's actions: @@ -303,16 +306,22 @@ uint64_t vm_zf_count __attribute__((aligned(8))) = 0; */ unsigned int vm_pageout_active = 0; /* debugging */ +unsigned int vm_pageout_active_busy = 0; /* debugging */ unsigned int vm_pageout_inactive = 0; /* debugging */ unsigned int vm_pageout_inactive_throttled = 0; /* debugging */ unsigned int vm_pageout_inactive_forced = 0; /* debugging */ unsigned int vm_pageout_inactive_nolock = 0; /* debugging */ unsigned int vm_pageout_inactive_avoid = 0; /* debugging */ unsigned int vm_pageout_inactive_busy = 0; /* debugging */ +unsigned int vm_pageout_inactive_error = 0; /* debugging */ unsigned int vm_pageout_inactive_absent = 0; /* debugging */ +unsigned int vm_pageout_inactive_notalive = 0; /* debugging */ unsigned int vm_pageout_inactive_used = 0; /* debugging */ +unsigned int vm_pageout_cache_evicted = 0; /* debugging */ unsigned int vm_pageout_inactive_clean = 0; /* debugging */ -unsigned int vm_pageout_inactive_dirty = 0; /* debugging */ +unsigned int vm_pageout_speculative_clean = 0; /* debugging */ +unsigned int vm_pageout_inactive_dirty_internal = 0; /* debugging */ +unsigned int vm_pageout_inactive_dirty_external = 0; /* debugging */ unsigned int vm_pageout_inactive_deactivated = 0; /* debugging */ unsigned int vm_pageout_inactive_zf = 0; /* debugging */ unsigned int vm_pageout_dirty_no_pager = 0; /* debugging */ @@ -325,8 +334,10 @@ unsigned int vm_pageout_reactivation_limit_exceeded = 0; /* debugging */ unsigned int vm_pageout_catch_ups = 0; /* debugging */ unsigned int vm_pageout_inactive_force_reclaim = 0; /* debugging */ +unsigned int vm_pageout_scan_reclaimed_throttled = 0; unsigned int vm_pageout_scan_active_throttled = 0; -unsigned int 
vm_pageout_scan_inactive_throttled = 0; +unsigned int vm_pageout_scan_inactive_throttled_internal = 0; +unsigned int vm_pageout_scan_inactive_throttled_external = 0; unsigned int vm_pageout_scan_throttle = 0; /* debugging */ unsigned int vm_pageout_scan_throttle_aborted = 0; /* debugging */ unsigned int vm_pageout_scan_burst_throttle = 0; /* debugging */ @@ -334,7 +345,7 @@ unsigned int vm_pageout_scan_empty_throttle = 0; /* debugging */ unsigned int vm_pageout_scan_deadlock_detected = 0; /* debugging */ unsigned int vm_pageout_scan_active_throttle_success = 0; /* debugging */ unsigned int vm_pageout_scan_inactive_throttle_success = 0; /* debugging */ - +unsigned int vm_pageout_inactive_external_forced_reactivate_count = 0; /* debugging */ unsigned int vm_page_speculative_count_drifts = 0; unsigned int vm_page_speculative_count_drift_max = 0; @@ -372,6 +383,9 @@ boolean_t (* volatile consider_buffer_cache_collect)(int) = NULL; unsigned long vm_cs_validated_resets = 0; #endif +int vm_debug_events = 0; + + /* * Routine: vm_backing_store_disable * Purpose: @@ -449,7 +463,7 @@ vm_pageout_object_terminate( p = VM_PAGE_NULL; m = vm_page_lookup(shadow_object, - offset + object->shadow_offset); + offset + object->vo_shadow_offset); if(m == VM_PAGE_NULL) continue; @@ -529,39 +543,44 @@ vm_pageout_object_terminate( else vm_page_deactivate(m); } - if((m->busy) && (m->cleaning)) { - - /* the request_page_list case, (COPY_OUT_FROM FALSE) */ - m->busy = FALSE; - - /* We do not re-set m->dirty ! */ - /* The page was busy so no extraneous activity */ - /* could have occurred. COPY_INTO is a read into the */ - /* new pages. CLEAN_IN_PLACE does actually write */ - /* out the pages but handling outside of this code */ - /* will take care of resetting dirty. We clear the */ - /* modify however for the Programmed I/O case. 
*/ - pmap_clear_modify(m->phys_page); + if (m->overwriting) { + /* + * the (COPY_OUT_FROM == FALSE) request_page_list case + */ + if (m->busy) { + /* + * We do not re-set m->dirty ! + * The page was busy so no extraneous activity + * could have occurred. COPY_INTO is a read into the + * new pages. CLEAN_IN_PLACE does actually write + * out the pages but handling outside of this code + * will take care of resetting dirty. We clear the + * modify however for the Programmed I/O case. + */ + pmap_clear_modify(m->phys_page); - m->absent = FALSE; - m->overwriting = FALSE; - } else if (m->overwriting) { - /* alternate request page list, write to page_list */ - /* case. Occurs when the original page was wired */ - /* at the time of the list request */ - assert(VM_PAGE_WIRED(m)); - vm_page_unwire(m, TRUE); /* reactivates */ + m->busy = FALSE; + m->absent = FALSE; + } else { + /* + * alternate (COPY_OUT_FROM == FALSE) request_page_list case + * Occurs when the original page was wired + * at the time of the list request + */ + assert(VM_PAGE_WIRED(m)); + vm_page_unwire(m, TRUE); /* reactivates */ + } m->overwriting = FALSE; } else { - /* - * Set the dirty state according to whether or not the page was - * modified during the pageout. Note that we purposefully do - * NOT call pmap_clear_modify since the page is still mapped. - * If the page were to be dirtied between the 2 calls, this - * this fact would be lost. This code is only necessary to - * maintain statistics, since the pmap module is always - * consulted if m->dirty is false. - */ + /* + * Set the dirty state according to whether or not the page was + * modified during the pageout. Note that we purposefully do + * NOT call pmap_clear_modify since the page is still mapped. + * If the page were to be dirtied between the 2 calls, this + * this fact would be lost. This code is only necessary to + * maintain statistics, since the pmap module is always + * consulted if m->dirty is false. 
+ */ #if MACH_CLUSTER_STATS m->dirty = pmap_is_modified(m->phys_page); @@ -572,8 +591,11 @@ vm_pageout_object_terminate( m->dirty = 0; #endif } + if (m->encrypted_cleaning == TRUE) { + m->encrypted_cleaning = FALSE; + m->busy = FALSE; + } m->cleaning = FALSE; - m->encrypted_cleaning = FALSE; /* * Wakeup any thread waiting for the page to be un-cleaning. @@ -786,13 +808,20 @@ vm_pageout_cluster(vm_page_t m) object, m->offset, m, 0, 0); VM_PAGE_CHECK(m); +#if DEBUG + lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); +#endif + vm_object_lock_assert_exclusive(object); /* * Only a certain kind of page is appreciated here. */ assert(m->busy && (m->dirty || m->precious) && (!VM_PAGE_WIRED(m))); - assert(!m->cleaning && !m->pageout && !m->inactive && !m->active); + assert(!m->cleaning && !m->pageout); +#ifndef CONFIG_FREEZE + assert(!m->inactive && !m->active); assert(!m->throttled); +#endif /* * protect the object from collapse - @@ -817,7 +846,7 @@ vm_pageout_cluster(vm_page_t m) /* * pgo_laundry count is tied to the laundry bit */ - m->laundry = TRUE; + m->laundry = TRUE; q->pgo_laundry++; m->pageout_queue = TRUE; @@ -843,62 +872,46 @@ unsigned long vm_pageout_throttle_up_count = 0; */ void vm_pageout_throttle_up( - vm_page_t m) + vm_page_t m) { - struct vm_pageout_queue *q; + struct vm_pageout_queue *q; - assert(m->object != VM_OBJECT_NULL); - assert(m->object != kernel_object); + assert(m->object != VM_OBJECT_NULL); + assert(m->object != kernel_object); - vm_pageout_throttle_up_count++; + vm_pageout_throttle_up_count++; - if (m->object->internal == TRUE) - q = &vm_pageout_queue_internal; - else - q = &vm_pageout_queue_external; - - if (m->pageout_queue == TRUE) { - - queue_remove(&q->pgo_pending, m, vm_page_t, pageq); - m->pageout_queue = FALSE; + if (m->object->internal == TRUE) + q = &vm_pageout_queue_internal; + else + q = &vm_pageout_queue_external; - m->pageq.next = NULL; - m->pageq.prev = NULL; - - vm_object_paging_end(m->object); - } - if (m->laundry 
== TRUE) { - m->laundry = FALSE; - q->pgo_laundry--; + if (m->pageout_queue == TRUE) { - if (q->pgo_throttled == TRUE) { - q->pgo_throttled = FALSE; - thread_wakeup((event_t) &q->pgo_laundry); - } - if (q->pgo_draining == TRUE && q->pgo_laundry == 0) { - q->pgo_draining = FALSE; - thread_wakeup((event_t) (&q->pgo_laundry+1)); - } - } -} + queue_remove(&q->pgo_pending, m, vm_page_t, pageq); + m->pageout_queue = FALSE; + m->pageq.next = NULL; + m->pageq.prev = NULL; -/* - * vm_pageout_scan does the dirty work for the pageout daemon. - * It returns with vm_page_queue_free_lock held and - * vm_page_free_wanted == 0. - */ + vm_object_paging_end(m->object); + } -#define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT (3 * MAX_UPL_TRANSFER) + if ( m->laundry == TRUE ) { -#define FCS_IDLE 0 -#define FCS_DELAYED 1 -#define FCS_DEADLOCK_DETECTED 2 + m->laundry = FALSE; + q->pgo_laundry--; -struct flow_control { - int state; - mach_timespec_t ts; -}; + if (q->pgo_throttled == TRUE) { + q->pgo_throttled = FALSE; + thread_wakeup((event_t) &q->pgo_laundry); + } + if (q->pgo_draining == TRUE && q->pgo_laundry == 0) { + q->pgo_draining = FALSE; + thread_wakeup((event_t) (&q->pgo_laundry+1)); + } + } +} /* @@ -1041,10 +1054,10 @@ mach_vm_pressure_monitor( helps us do the right accounting in certain cases */ -#define PAGE_STATE_SPECULATIVE 1 -#define PAGE_STATE_THROTTLED 2 -#define PAGE_STATE_ZEROFILL 3 -#define PAGE_STATE_INACTIVE 4 +#define PAGE_STATE_SPECULATIVE 1 +#define PAGE_STATE_ZEROFILL 2 +#define PAGE_STATE_INACTIVE 3 +#define PAGE_STATE_INACTIVE_FIRST 4 #define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m) \ MACRO_BEGIN \ @@ -1065,6 +1078,25 @@ mach_vm_pressure_monitor( } \ MACRO_END + +#define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT 128 +#define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX 1024 + +#define FCS_IDLE 0 +#define FCS_DELAYED 1 +#define FCS_DEADLOCK_DETECTED 2 + +struct flow_control { + int state; + mach_timespec_t ts; +}; + + +/* + * vm_pageout_scan does the dirty work for the pageout daemon. 
+ * It returns with vm_page_queue_free_lock held and + * vm_page_free_wanted == 0. + */ void vm_pageout_scan(void) { @@ -1076,6 +1108,7 @@ vm_pageout_scan(void) vm_page_t local_freeq = NULL; int local_freed = 0; int delayed_unlock; + int delayed_unlock_limit = 0; int refmod_state = 0; int vm_pageout_deadlock_target = 0; struct vm_pageout_queue *iq; @@ -1084,21 +1117,22 @@ vm_pageout_scan(void) struct flow_control flow_control = { 0, { 0, 0 } }; boolean_t inactive_throttled = FALSE; boolean_t try_failed; - mach_timespec_t ts; - unsigned int msecs = 0; + mach_timespec_t ts; + unsigned int msecs = 0; vm_object_t object; vm_object_t last_object_tried; -#if defined(__ppc__) /* On ppc, vm statistics are still 32-bit */ - unsigned int zf_ratio; - unsigned int zf_run_count; -#else uint64_t zf_ratio; uint64_t zf_run_count; -#endif uint32_t catch_up_count = 0; uint32_t inactive_reclaim_run; boolean_t forced_reclaim; int page_prev_state = 0; + int cache_evict_throttle = 0; + uint32_t vm_pageout_inactive_external_forced_reactivate_limit = 0; + + VM_DEBUG_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_START, + vm_pageout_speculative_clean, vm_pageout_inactive_clean, + vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external); flow_control.state = FCS_IDLE; iq = &vm_pageout_queue_internal; @@ -1122,30 +1156,21 @@ vm_pageout_scan(void) inactive_reclaim_run = 0; -/*???*/ /* + /* * We want to gradually dribble pages from the active queue * to the inactive queue. If we let the inactive queue get * very small, and then suddenly dump many pages into it, * those pages won't get a sufficient chance to be referenced * before we start taking them from the inactive queue. * - * We must limit the rate at which we send pages to the pagers. - * data_write messages consume memory, for message buffers and - * for map-copy objects. If we get too far ahead of the pagers, - * we can potentially run out of memory. 
- * - * We can use the laundry count to limit directly the number - * of pages outstanding to the default pager. A similar - * strategy for external pagers doesn't work, because - * external pagers don't have to deallocate the pages sent them, - * and because we might have to send pages to external pagers - * even if they aren't processing writes. So we also - * use a burst count to limit writes to external pagers. - * - * When memory is very tight, we can't rely on external pagers to - * clean pages. They probably aren't running, because they - * aren't vm-privileged. If we kept sending dirty pages to them, - * we could exhaust the free list. + * We must limit the rate at which we send pages to the pagers + * so that we don't tie up too many pages in the I/O queues. + * We implement a throttling mechanism using the laundry count + * to limit the number of pages outstanding to the default + * and external pagers. We can bypass the throttles and look + * for clean pages if the pageout queues don't drain in a timely + * fashion since this may indicate that the pageout paths are + * stalled waiting for memory, which only we can provide. */ @@ -1163,13 +1188,8 @@ vm_pageout_scan(void) * but at the moment mach vm cannot do this. 
*/ { -#if defined(__ppc__) /* On ppc, vm statistics are still 32-bit */ - uint32_t total = vm_page_active_count + vm_page_inactive_count; - uint32_t normal = total - vm_zf_count; -#else uint64_t total = vm_page_active_count + vm_page_inactive_count; uint64_t normal = total - vm_zf_count; -#endif /* zf_ratio is the number of zf pages we victimize per normal page */ @@ -1195,8 +1215,16 @@ vm_pageout_scan(void) */ vm_page_inactive_min = vm_page_inactive_target - (vm_page_inactive_target / 400); + if (vm_page_speculative_percentage > 50) + vm_page_speculative_percentage = 50; + else if (vm_page_speculative_percentage <= 0) + vm_page_speculative_percentage = 1; + vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count + vm_page_inactive_count); + + vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count; + object = NULL; last_object_tried = NULL; try_failed = FALSE; @@ -1215,24 +1243,40 @@ vm_pageout_scan(void) vm_page_lock_queues(); delayed_unlock = 1; } + if (vm_upl_wait_for_pages < 0) + vm_upl_wait_for_pages = 0; - /* - * Don't sweep through active queue more than the throttle - * which should be kept relatively low - */ - active_burst_count = MIN(vm_pageout_burst_active_throttle, - vm_page_active_count); + delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages; + + if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX) + delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX; /* - * Move pages from active to inactive. 
+ * Move pages from active to inactive if we're below the target */ if ((vm_page_inactive_count + vm_page_speculative_count) >= vm_page_inactive_target) goto done_moving_active_pages; - while (!queue_empty(&vm_page_queue_active) && active_burst_count) { + if (object != NULL) { + vm_object_unlock(object); + object = NULL; + vm_pageout_scan_wants_object = VM_OBJECT_NULL; + } + /* + * Don't sweep through active queue more than the throttle + * which should be kept relatively low + */ + active_burst_count = MIN(vm_pageout_burst_active_throttle, + vm_page_active_count); + + VM_DEBUG_EVENT(vm_pageout_balance, VM_PAGEOUT_BALANCE, DBG_FUNC_START, + vm_pageout_inactive, vm_pageout_inactive_used, vm_page_free_count, local_freed); + + VM_DEBUG_EVENT(vm_pageout_balance, VM_PAGEOUT_BALANCE, DBG_FUNC_NONE, + vm_pageout_speculative_clean, vm_pageout_inactive_clean, + vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external); - if (active_burst_count) - active_burst_count--; + while (!queue_empty(&vm_page_queue_active) && active_burst_count--) { vm_pageout_active++; @@ -1246,91 +1290,24 @@ vm_pageout_scan(void) DTRACE_VM2(scan, int, 1, (uint64_t *), NULL); /* - * Try to lock object; since we've already got the - * page queues lock, we can only 'try' for this one. - * if the 'try' fails, we need to do a mutex_pause - * to allow the owner of the object lock a chance to - * run... otherwise, we're likely to trip over this - * object in the same state as we work our way through - * the queue... 
clumps of pages associated with the same - * object are fairly typical on the inactive and active queues - */ - if (m->object != object) { - if (object != NULL) { - vm_object_unlock(object); - object = NULL; - vm_pageout_scan_wants_object = VM_OBJECT_NULL; - } - if (!vm_object_lock_try_scan(m->object)) { - /* - * move page to end of active queue and continue - */ - queue_remove(&vm_page_queue_active, m, - vm_page_t, pageq); - queue_enter(&vm_page_queue_active, m, - vm_page_t, pageq); - - try_failed = TRUE; - - m = (vm_page_t) queue_first(&vm_page_queue_active); - /* - * this is the next object we're going to be interested in - * try to make sure it's available after the mutex_yield - * returns control - */ - vm_pageout_scan_wants_object = m->object; - - goto done_with_activepage; - } - object = m->object; - - try_failed = FALSE; - } - - /* - * if the page is BUSY, then we pull it - * off the active queue and leave it alone. - * when BUSY is cleared, it will get stuck - * back on the appropriate queue - */ - if (m->busy) { - queue_remove(&vm_page_queue_active, m, - vm_page_t, pageq); - m->pageq.next = NULL; - m->pageq.prev = NULL; - - if (!m->fictitious) - vm_page_active_count--; - m->active = FALSE; - - goto done_with_activepage; - } - - /* deal with a rogue "reusable" page */ - VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m); - - /* - * Deactivate the page while holding the object - * locked, so we know the page is still not busy. - * This should prevent races between pmap_enter - * and pmap_clear_reference. The page might be - * absent or fictitious, but vm_page_deactivate - * can handle that. + * The page might be absent or busy, + * but vm_page_deactivate can handle that. 
*/ vm_page_deactivate(m); -done_with_activepage: - if (delayed_unlock++ > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT || try_failed == TRUE) { + if (delayed_unlock++ > delayed_unlock_limit) { - if (object != NULL) { - vm_pageout_scan_wants_object = VM_OBJECT_NULL; - vm_object_unlock(object); - object = NULL; - } if (local_freeq) { vm_page_unlock_queues(); + + VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START, + vm_page_free_count, local_freed, delayed_unlock_limit, 1); + vm_page_free_list(local_freeq, TRUE); + VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END, + vm_page_free_count, 0, 0, 1); + local_freeq = NULL; local_freed = 0; vm_page_lock_queues(); @@ -1347,6 +1324,8 @@ vm_pageout_scan(void) } } + VM_DEBUG_EVENT(vm_pageout_balance, VM_PAGEOUT_BALANCE, DBG_FUNC_END, + vm_page_active_count, vm_page_inactive_count, vm_page_speculative_count, vm_page_inactive_target); /********************************************************************** @@ -1357,10 +1336,6 @@ vm_pageout_scan(void) done_moving_active_pages: - /* - * We are done if we have met our target *and* - * nobody is still waiting for a page. - */ if (vm_page_free_count + local_freed >= vm_page_free_target) { if (object != NULL) { vm_object_unlock(object); @@ -1370,45 +1345,55 @@ vm_pageout_scan(void) if (local_freeq) { vm_page_unlock_queues(); + + VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START, + vm_page_free_count, local_freed, delayed_unlock_limit, 2); + vm_page_free_list(local_freeq, TRUE); + VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END, + vm_page_free_count, local_freed, 0, 2); + local_freeq = NULL; local_freed = 0; vm_page_lock_queues(); } /* - * inactive target still not met... keep going - * until we get the queues balanced - */ - - /* - * Recalculate vm_page_inactivate_target. 
+ * recalculate vm_page_inactivate_target */ vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count + vm_page_inactive_count + vm_page_speculative_count); - #ifndef CONFIG_EMBEDDED - /* - * XXX: if no active pages can be reclaimed, pageout scan can be stuck trying - * to balance the queues - */ if (((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) && - !queue_empty(&vm_page_queue_active)) + !queue_empty(&vm_page_queue_active)) { + /* + * inactive target still not met... keep going + * until we get the queues balanced... + */ continue; + } #endif - lck_mtx_lock(&vm_page_queue_free_lock); if ((vm_page_free_count >= vm_page_free_target) && (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) { - + /* + * done - we have met our target *and* + * there is no one waiting for a page. + */ vm_page_unlock_queues(); thread_wakeup((event_t) &vm_pageout_garbage_collect); assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL); + VM_DEBUG_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_NONE, + vm_pageout_inactive, vm_pageout_inactive_used, 0, 0); + VM_DEBUG_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_END, + vm_pageout_speculative_clean, vm_pageout_inactive_clean, + vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external); + return; } lck_mtx_unlock(&vm_page_queue_free_lock); @@ -1428,14 +1413,20 @@ vm_pageout_scan(void) vm_object_unlock(object); object = NULL; } - if(TRUE == vm_purgeable_object_purge_one()) { + + VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0); + + if (TRUE == vm_purgeable_object_purge_one()) { + + VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0); + continue; } + VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1); } - if (queue_empty(&sq->age_q) && vm_page_speculative_count) { /* - * try to pull pages from the aging bins + * try to pull pages 
from the aging bins... * see vm_page.h for an explanation of how * this mechanism works */ @@ -1458,21 +1449,20 @@ vm_pageout_scan(void) aq = &vm_page_queue_speculative[speculative_steal_index]; } - if (num_scanned_queues == - VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) { + if (num_scanned_queues == VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) { /* * XXX We've scanned all the speculative * queues but still haven't found one * that is not empty, even though * vm_page_speculative_count is not 0. + * + * report the anomaly... */ - /* report the anomaly... */ printf("vm_pageout_scan: " "all speculative queues empty " "but count=%d. Re-adjusting.\n", vm_page_speculative_count); - if (vm_page_speculative_count > - vm_page_speculative_count_drift_max) + if (vm_page_speculative_count > vm_page_speculative_count_drift_max) vm_page_speculative_count_drift_max = vm_page_speculative_count; vm_page_speculative_count_drifts++; #if 6553678 @@ -1487,8 +1477,8 @@ vm_pageout_scan(void) if (vm_page_speculative_count > vm_page_speculative_target) can_steal = TRUE; else { - ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * VM_PAGE_SPECULATIVE_Q_AGE_MS) / 1000; - ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * VM_PAGE_SPECULATIVE_Q_AGE_MS) % 1000) + ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_page_speculative_q_age_ms) / 1000; + ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_page_speculative_q_age_ms) % 1000) * 1000 * NSEC_PER_USEC; ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts); @@ -1505,6 +1495,34 @@ vm_pageout_scan(void) if (can_steal == TRUE) vm_page_speculate_ageit(aq); } + if (queue_empty(&sq->age_q) && cache_evict_throttle == 0) { + int pages_evicted; + + if (object != NULL) { + vm_object_unlock(object); + object = NULL; + } + pages_evicted = vm_object_cache_evict(100, 10); + + if (pages_evicted) { + + vm_pageout_cache_evicted += pages_evicted; + + VM_DEBUG_EVENT(vm_pageout_cache_evict, VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE, + vm_page_free_count, 
pages_evicted, vm_pageout_cache_evicted, 0); + + /* + * we just freed up to 100 pages, + * so go back to the top of the main loop + * and re-evaulate the memory situation + */ + continue; + } else + cache_evict_throttle = 100; + } + if (cache_evict_throttle) + cache_evict_throttle--; + /* * Sometimes we have to pause: @@ -1513,8 +1531,7 @@ vm_pageout_scan(void) * 3) Loop control - no acceptable pages found on the inactive queue * within the last vm_pageout_burst_inactive_throttle iterations */ - if (queue_empty(&vm_page_queue_inactive) && queue_empty(&vm_page_queue_zf) && queue_empty(&sq->age_q) && - (VM_PAGE_Q_THROTTLED(iq) || queue_empty(&vm_page_queue_throttled))) { + if (queue_empty(&vm_page_queue_inactive) && queue_empty(&vm_page_queue_zf) && queue_empty(&sq->age_q)) { vm_pageout_scan_empty_throttle++; msecs = vm_pageout_empty_wait; goto vm_pageout_scan_delay; @@ -1527,7 +1544,8 @@ vm_pageout_scan(void) msecs = vm_pageout_burst_wait; goto vm_pageout_scan_delay; - } else if (VM_PAGE_Q_THROTTLED(iq) && IP_VALID(memory_manager_default)) { + } else if (VM_PAGE_Q_THROTTLED(iq) && + VM_DYNAMIC_PAGING_ENABLED(memory_manager_default)) { clock_sec_t sec; clock_nsec_t nsec; @@ -1602,8 +1620,15 @@ vm_pageout_scan(void) if (local_freeq) { vm_page_unlock_queues(); + + VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START, + vm_page_free_count, local_freed, delayed_unlock_limit, 3); + vm_page_free_list(local_freeq, TRUE); + VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END, + vm_page_free_count, local_freed, 0, 3); + local_freeq = NULL; local_freed = 0; vm_page_lock_queues(); @@ -1615,25 +1640,9 @@ vm_pageout_scan(void) goto consider_inactive; } } -#if CONFIG_EMBEDDED - { - int percent_avail; - /* - * Decide if we need to send a memory status notification. 
- */ - percent_avail = - (vm_page_active_count + vm_page_inactive_count + - vm_page_speculative_count + vm_page_free_count + - (IP_VALID(memory_manager_default)?0:vm_page_purgeable_count) ) * 100 / - atop_64(max_mem); - if (percent_avail >= (kern_memorystatus_level + 5) || - percent_avail <= (kern_memorystatus_level - 5)) { - kern_memorystatus_level = percent_avail; - thread_wakeup((event_t)&kern_memorystatus_wakeup); - } - } -#endif + VM_CHECK_MEMORYSTATUS; + assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC); counter(c_vm_pageout_scan_block++); @@ -1641,8 +1650,14 @@ vm_pageout_scan(void) assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL); + VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START, + iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0); + thread_block(THREAD_CONTINUE_NULL); + VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END, + iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0); + vm_page_lock_queues(); delayed_unlock = 1; @@ -1659,6 +1674,8 @@ vm_pageout_scan(void) flow_control.state = FCS_IDLE; consider_inactive: + vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count), + vm_pageout_inactive_external_forced_reactivate_limit); loop_count++; inactive_burst_count++; vm_pageout_inactive++; @@ -1668,7 +1685,7 @@ vm_pageout_scan(void) while (1) { m = NULL; - if (IP_VALID(memory_manager_default)) { + if (VM_DYNAMIC_PAGING_ENABLED(memory_manager_default)) { assert(vm_page_throttled_count == 0); assert(queue_empty(&vm_page_queue_throttled)); } @@ -1679,6 +1696,8 @@ vm_pageout_scan(void) */ if ( !queue_empty(&sq->age_q) ) { m = (vm_page_t) queue_first(&sq->age_q); + + page_prev_state = PAGE_STATE_SPECULATIVE; break; } /* @@ -1688,6 +1707,8 @@ vm_pageout_scan(void) queue_empty(&vm_page_queue_inactive)) { if ( !queue_empty(&vm_page_queue_zf) ) { m = (vm_page_t) queue_first(&vm_page_queue_zf); + + page_prev_state = 
PAGE_STATE_ZEROFILL; zf_run_count++; break; } @@ -1697,21 +1718,25 @@ vm_pageout_scan(void) */ if ( !queue_empty(&vm_page_queue_inactive) ) { m = (vm_page_t) queue_first(&vm_page_queue_inactive); + + page_prev_state = PAGE_STATE_INACTIVE; zf_run_count = 0; break; } panic("vm_pageout: no victim"); } + VM_PAGE_QUEUES_REMOVE(m); - assert(!m->active && (m->inactive || m->speculative || m->throttled)); assert(!m->laundry); + assert(!m->private); + assert(!m->fictitious); assert(m->object != kernel_object); assert(m->phys_page != vm_page_guard_addr); - if (!m->speculative) { + + if (page_prev_state != PAGE_STATE_SPECULATIVE) vm_pageout_stats[vm_pageout_stat_now].considered++; - } DTRACE_VM2(scan, int, 1, (uint64_t *), NULL); @@ -1742,88 +1767,43 @@ vm_pageout_scan(void) * object are fairly typical on the inactive and active queues */ if (!vm_object_lock_try_scan(m->object)) { + vm_page_t m_want = NULL; + vm_pageout_inactive_nolock++; - requeue_page: - /* - * Move page to end and continue. - * Don't re-issue ticket - */ - if (m->zero_fill) { - if (m->speculative) { - panic("vm_pageout_scan(): page %p speculative and zero-fill !?\n", m); - } - assert(!m->speculative); - queue_remove(&vm_page_queue_zf, m, - vm_page_t, pageq); - queue_enter(&vm_page_queue_zf, m, - vm_page_t, pageq); - } else if (m->speculative) { - remque(&m->pageq); - m->speculative = FALSE; - vm_page_speculative_count--; - - /* - * move to the head of the inactive queue - * to get it out of the way... 
the speculative - * queue is generally too small to depend - * on there being enough pages from other - * objects to make cycling it back on the - * same queue a winning proposition - */ - queue_enter_first(&vm_page_queue_inactive, m, - vm_page_t, pageq); - m->inactive = TRUE; - vm_page_inactive_count++; - token_new_pagecount++; - } else if (m->throttled) { - queue_remove(&vm_page_queue_throttled, m, - vm_page_t, pageq); - m->throttled = FALSE; - vm_page_throttled_count--; - - /* - * not throttled any more, so can stick - * it on the inactive queue. - */ - queue_enter(&vm_page_queue_inactive, m, - vm_page_t, pageq); - m->inactive = TRUE; - vm_page_inactive_count++; - token_new_pagecount++; - } else { - queue_remove(&vm_page_queue_inactive, m, - vm_page_t, pageq); -#if MACH_ASSERT - vm_page_inactive_count--; /* balance for purgeable queue asserts */ -#endif - vm_purgeable_q_advance_all(); + if (page_prev_state == PAGE_STATE_SPECULATIVE) + page_prev_state = PAGE_STATE_INACTIVE_FIRST; - queue_enter(&vm_page_queue_inactive, m, - vm_page_t, pageq); -#if MACH_ASSERT - vm_page_inactive_count++; /* balance for purgeable queue asserts */ -#endif - token_new_pagecount++; - } pmap_clear_reference(m->phys_page); m->reference = FALSE; + /* + * m->object must be stable since we hold the page queues lock... + * we can update the scan_collisions field sans the object lock + * since it is a separate field and this is the only spot that does + * a read-modify-write operation and it is never executed concurrently... 
+ * we can asynchronously set this field to 0 when creating a UPL, so it + * is possible for the value to be a bit non-determistic, but that's ok + * since it's only used as a hint + */ + m->object->scan_collisions++; + if ( !queue_empty(&sq->age_q) ) - m = (vm_page_t) queue_first(&sq->age_q); + m_want = (vm_page_t) queue_first(&sq->age_q); else if ( ((zf_run_count < zf_ratio) && vm_zf_queue_count >= zf_queue_min_count) || queue_empty(&vm_page_queue_inactive)) { if ( !queue_empty(&vm_page_queue_zf) ) - m = (vm_page_t) queue_first(&vm_page_queue_zf); + m_want = (vm_page_t) queue_first(&vm_page_queue_zf); } else if ( !queue_empty(&vm_page_queue_inactive) ) { - m = (vm_page_t) queue_first(&vm_page_queue_inactive); + m_want = (vm_page_t) queue_first(&vm_page_queue_inactive); } /* * this is the next object we're going to be interested in * try to make sure its available after the mutex_yield * returns control */ - vm_pageout_scan_wants_object = m->object; + if (m_want) + vm_pageout_scan_wants_object = m_want->object; /* * force us to dump any collected free pages @@ -1831,99 +1811,85 @@ vm_pageout_scan(void) */ try_failed = TRUE; - goto done_with_inactivepage; + goto requeue_page; } object = m->object; vm_pageout_scan_wants_object = VM_OBJECT_NULL; try_failed = FALSE; } + if (catch_up_count) + catch_up_count--; - /* - * Paging out pages of external objects which - * are currently being created must be avoided. - * The pager may claim for memory, thus leading to a - * possible dead lock between it and the pageout thread, - * if such pages are finally chosen. The remaining assumption - * is that there will finally be enough available pages in the - * inactive pool to page out in order to satisfy all memory - * claimed by the thread which concurrently creates the pager. 
- */ - if (!object->pager_initialized && object->pager_created) { - /* - * Move page to end and continue, hoping that - * there will be enough other inactive pages to - * page out so that the thread which currently - * initializes the pager will succeed. - * Don't re-grant the ticket, the page should - * pulled from the queue and paged out whenever - * one of its logically adjacent fellows is - * targeted. - */ - vm_pageout_inactive_avoid++; - goto requeue_page; - } - /* - * Remove the page from its list. - */ - if (m->speculative) { - remque(&m->pageq); - page_prev_state = PAGE_STATE_SPECULATIVE; - m->speculative = FALSE; - vm_page_speculative_count--; - } else if (m->throttled) { - queue_remove(&vm_page_queue_throttled, m, vm_page_t, pageq); - page_prev_state = PAGE_STATE_THROTTLED; - m->throttled = FALSE; - vm_page_throttled_count--; - } else { - if (m->zero_fill) { - queue_remove(&vm_page_queue_zf, m, vm_page_t, pageq); - page_prev_state = PAGE_STATE_ZEROFILL; - vm_zf_queue_count--; - } else { - page_prev_state = PAGE_STATE_INACTIVE; - queue_remove(&vm_page_queue_inactive, m, vm_page_t, pageq); + if (m->busy) { + if (m->encrypted_cleaning) { + /* + * ENCRYPTED SWAP: + * if this page has already been picked up as + * part of a page-out cluster, it will be busy + * because it is being encrypted (see + * vm_object_upl_request()). But we still + * want to demote it from "clean-in-place" + * (aka "adjacent") to "clean-and-free" (aka + * "target"), so let's ignore its "busy" bit + * here and proceed to check for "cleaning" a + * little bit below... + * + * CAUTION CAUTION: + * A "busy" page should still be left alone for + * most purposes, so we have to be very careful + * not to process that page too much. 
+ */ + assert(m->cleaning); + goto consider_inactive_page; } - m->inactive = FALSE; - if (!m->fictitious) - vm_page_inactive_count--; - vm_purgeable_q_advance_all(); - } - - m->pageq.next = NULL; - m->pageq.prev = NULL; - - if ( !m->fictitious && catch_up_count) - catch_up_count--; - /* - * ENCRYPTED SWAP: - * if this page has already been picked up as part of a - * page-out cluster, it will be busy because it is being - * encrypted (see vm_object_upl_request()). But we still - * want to demote it from "clean-in-place" (aka "adjacent") - * to "clean-and-free" (aka "target"), so let's ignore its - * "busy" bit here and proceed to check for "cleaning" a - * little bit below... - */ - if ( !m->encrypted_cleaning && (m->busy || !object->alive)) { /* * Somebody is already playing with this page. - * Leave it off the pageout queues. + * Put it back on the appropriate queue * */ vm_pageout_inactive_busy++; +requeue_page: + switch (page_prev_state) { + + case PAGE_STATE_SPECULATIVE: + vm_page_speculate(m, FALSE); + break; + case PAGE_STATE_ZEROFILL: + m->zero_fill = TRUE; + /* + * fall through to add in the + * inactive state + */ + case PAGE_STATE_INACTIVE: + VM_PAGE_ENQUEUE_INACTIVE(m, FALSE); + break; + + case PAGE_STATE_INACTIVE_FIRST: + VM_PAGE_ENQUEUE_INACTIVE(m, TRUE); + break; + } goto done_with_inactivepage; } + /* - * If it's absent or in error, we can reclaim the page. + * If it's absent, in error or the object is no longer alive, + * we can reclaim the page... 
in the no longer alive case, + * there are 2 states the page can be in that preclude us + * from reclaiming it - busy or cleaning - that we've already + * dealt with */ + if (m->absent || m->error || !object->alive) { - if (m->absent || m->error) { - vm_pageout_inactive_absent++; + if (m->absent) + vm_pageout_inactive_absent++; + else if (!object->alive) + vm_pageout_inactive_notalive++; + else + vm_pageout_inactive_error++; reclaim_page: if (vm_pageout_deadlock_target) { vm_pageout_scan_inactive_throttle_success++; @@ -1956,37 +1922,11 @@ vm_pageout_scan(void) inactive_burst_count = 0; - if(page_prev_state != PAGE_STATE_SPECULATIVE) { + if (page_prev_state != PAGE_STATE_SPECULATIVE) vm_pageout_stats[vm_pageout_stat_now].reclaimed++; - page_prev_state = 0; - } goto done_with_inactivepage; } - - assert(!m->private); - assert(!m->fictitious); - - /* - * If already cleaning this page in place, convert from - * "adjacent" to "target". We can leave the page mapped, - * and vm_pageout_object_terminate will determine whether - * to free or reactivate. - */ - - if (m->cleaning) { - m->busy = TRUE; - m->pageout = TRUE; - m->dump_cleaning = TRUE; - vm_page_wire(m); - - CLUSTER_STAT(vm_pageout_cluster_conversions++); - - inactive_burst_count = 0; - - goto done_with_inactivepage; - } - /* * If the object is empty, the page must be reclaimed even * if dirty or used. @@ -2012,11 +1952,35 @@ vm_pageout_scan(void) if (object->purgable == VM_PURGABLE_VOLATILE) { /* if it's wired, we can't put it on our queue */ assert(!VM_PAGE_WIRED(m)); + /* just stick it back on! */ + reactivated_this_call++; goto reactivate_page; } } + consider_inactive_page: + if (m->busy) { + /* + * CAUTION CAUTION: + * A "busy" page should always be left alone, except... + */ + if (m->cleaning && m->encrypted_cleaning) { + /* + * ENCRYPTED_SWAP: + * We could get here with a "busy" page + * if it's being encrypted during a + * "clean-in-place" operation. 
We'll deal + * with it right away by testing if it has been + * referenced and either reactivating it or + * promoting it from "clean-in-place" to + * "clean-and-free". + */ + } else { + panic("\"busy\" page considered for pageout\n"); + } + } + /* * If it's being used, reactivate. * (Fictitious pages are either busy or absent.) @@ -2034,6 +1998,35 @@ vm_pageout_scan(void) m->dirty = TRUE; } + /* + * If already cleaning this page in place and it hasn't + * been recently referenced, convert from + * "adjacent" to "target". We can leave the page mapped, + * and upl_commit_range will determine whether + * to free or reactivate. + * + * note: if m->encrypted_cleaning == TRUE, then + * m->cleaning == TRUE + * and we'll handle it here + */ + if (m->cleaning) { + + if (m->reference == TRUE) { + reactivated_this_call++; + goto reactivate_page; + } + m->busy = TRUE; + m->pageout = TRUE; + m->dump_cleaning = TRUE; + vm_page_wire(m); + + CLUSTER_STAT(vm_pageout_cluster_conversions++); + + inactive_burst_count = 0; + + goto done_with_inactivepage; + } + if (m->reference || m->dirty) { /* deal with a rogue "reusable" page */ VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m); @@ -2123,33 +2116,75 @@ vm_pageout_scan(void) inactive_throttled = TRUE; } } - if (inactive_throttled == TRUE) { throttle_inactive: - if (!IP_VALID(memory_manager_default) && - object->internal && m->dirty && - (object->purgable == VM_PURGABLE_DENY || - object->purgable == VM_PURGABLE_NONVOLATILE || - object->purgable == VM_PURGABLE_VOLATILE)) { - queue_enter(&vm_page_queue_throttled, m, - vm_page_t, pageq); - m->throttled = TRUE; - vm_page_throttled_count++; - } else { - if (m->zero_fill) { - queue_enter(&vm_page_queue_zf, m, - vm_page_t, pageq); - vm_zf_queue_count++; - } else - queue_enter(&vm_page_queue_inactive, m, - vm_page_t, pageq); - m->inactive = TRUE; - if (!m->fictitious) { - vm_page_inactive_count++; - token_new_pagecount++; + if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) && + object->internal 
&& m->dirty && + (object->purgable == VM_PURGABLE_DENY || + object->purgable == VM_PURGABLE_NONVOLATILE || + object->purgable == VM_PURGABLE_VOLATILE)) { + queue_enter(&vm_page_queue_throttled, m, + vm_page_t, pageq); + m->throttled = TRUE; + vm_page_throttled_count++; + + vm_pageout_scan_reclaimed_throttled++; + + goto done_with_inactivepage; + } + if (inactive_throttled == TRUE) { + + if (object->internal) + vm_pageout_scan_inactive_throttled_internal++; + else + vm_pageout_scan_inactive_throttled_external++; + + if (page_prev_state == PAGE_STATE_SPECULATIVE) + page_prev_state = PAGE_STATE_INACTIVE; + + if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) && object->internal == FALSE) { + /* + * a) The external pageout queue is throttled + * b) We're done with the active queue and moved on to the inactive queue + * c) We start noticing dirty pages and usually we would put them at the end of the inactive queue, but, + * d) We don't have a default pager, and so, + * e) We push these onto the active queue in an effort to cause a re-evaluation of the active queue + * and get back some, possibly clean, pages. + * + * We also keep a count of the pages of this kind, since, these will be a good indicator of us being in a deadlock + * on systems without a dynamic pager, where: + * a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written. + * b) The thread doing the writing is waiting for pages while holding the truncate lock + * c) Most of the pages in the inactive queue belong to this file. 
+ */ + + vm_page_activate(m); + vm_pageout_inactive_external_forced_reactivate_count++; + vm_pageout_inactive_external_forced_reactivate_limit--; + + if (vm_pageout_inactive_external_forced_reactivate_limit <= 0){ + vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count; +#if CONFIG_EMBEDDED + /* + * Possible deadlock scenario so request jetsam action + */ + assert(object); + vm_object_unlock(object); + object = VM_OBJECT_NULL; + vm_page_unlock_queues(); + + if (jetsam_kill_top_proc(TRUE, kJetsamFlagsKilledVM) < 0){ + panic("vm_pageout_scan: Jetsam request failed\n"); + } + + vm_page_lock_queues(); + delayed_unlock = 1; +#endif } + inactive_burst_count = 0; + goto done_with_inactivepage; + } else { + goto requeue_page; } - vm_pageout_scan_inactive_throttled++; - goto done_with_inactivepage; } /* @@ -2211,10 +2246,14 @@ vm_pageout_scan(void) * If it's clean and not precious, we can free the page. */ if (!m->dirty && !m->precious) { - if (m->zero_fill) - vm_pageout_inactive_zf++; - vm_pageout_inactive_clean++; + if (page_prev_state == PAGE_STATE_SPECULATIVE) + vm_pageout_speculative_clean++; + else { + if (page_prev_state == PAGE_STATE_ZEROFILL) + vm_pageout_inactive_zf++; + vm_pageout_inactive_clean++; + } goto reclaim_page; } @@ -2224,33 +2263,38 @@ vm_pageout_scan(void) * if the page was clean then). With the dirty page * disconnected here, we can make one final check. 
*/ - { - boolean_t disconnect_throttled = FALSE; - if (object->internal) { - if (VM_PAGE_Q_THROTTLED(iq)) - disconnect_throttled = TRUE; - } else if (VM_PAGE_Q_THROTTLED(eq)) { - disconnect_throttled = TRUE; - } + if (object->internal) { + if (VM_PAGE_Q_THROTTLED(iq)) + inactive_throttled = TRUE; + } else if (VM_PAGE_Q_THROTTLED(eq)) { + inactive_throttled = TRUE; + } - if (disconnect_throttled == TRUE) { - PAGE_WAKEUP_DONE(m); - goto throttle_inactive; - } + if (inactive_throttled == TRUE) { + /* + * we set busy before issuing the pmap_disconnect, + * so clear it and wakeup anyone that happened upon + * it in that state + */ + PAGE_WAKEUP_DONE(m); + goto throttle_inactive; } vm_pageout_stats[vm_pageout_stat_now].reclaimed++; vm_pageout_cluster(m); - if (m->zero_fill) + if (page_prev_state == PAGE_STATE_ZEROFILL) vm_pageout_inactive_zf++; - vm_pageout_inactive_dirty++; - + if (object->internal) + vm_pageout_inactive_dirty_internal++; + else + vm_pageout_inactive_dirty_external++; + inactive_burst_count = 0; done_with_inactivepage: - if (delayed_unlock++ > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT || try_failed == TRUE) { + if (delayed_unlock++ > delayed_unlock_limit || try_failed == TRUE) { if (object != NULL) { vm_pageout_scan_wants_object = VM_OBJECT_NULL; @@ -2259,8 +2303,15 @@ vm_pageout_scan(void) } if (local_freeq) { vm_page_unlock_queues(); + + VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START, + vm_page_free_count, local_freed, delayed_unlock_limit, 4); + vm_page_free_list(local_freeq, TRUE); + VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END, + vm_page_free_count, local_freed, 0, 4); + local_freeq = NULL; local_freed = 0; vm_page_lock_queues(); @@ -2286,6 +2337,9 @@ vm_page_free_reserve( vm_page_free_reserved += pages; + if (vm_page_free_reserved > VM_PAGE_FREE_RESERVED_LIMIT) + vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT; + free_after_reserve = vm_page_free_count_init - vm_page_free_reserved; vm_page_free_min = 
vm_page_free_reserved + @@ -2357,6 +2411,9 @@ vm_pageout_iothread_continue(struct vm_pageout_queue *q) q->pgo_busy = TRUE; queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq); + if (m->object == slide_info.slide_object) { + panic("slid page %p not allowed on this path\n", m); + } VM_PAGE_CHECK(m); m->pageout_queue = FALSE; m->pageq.next = NULL; @@ -2410,8 +2467,8 @@ vm_pageout_iothread_continue(struct vm_pageout_queue *q) vm_page_lockspin_queues(); vm_pageout_queue_steal(m, TRUE); - vm_pageout_dirty_no_pager++; vm_page_activate(m); + vm_pageout_dirty_no_pager++; vm_page_unlock_queues(); @@ -2559,6 +2616,8 @@ vm_pageout_garbage_collect(int collect) consider_zone_gc(buf_large_zfree); consider_machine_adjust(); + consider_pressure_events(); + } assert_wait((event_t) &vm_pageout_garbage_collect, THREAD_UNINT); @@ -2720,51 +2779,6 @@ vm_pageout_internal_start(void) } -/* - * when marshalling pages into a UPL and subsequently committing - * or aborting them, it is necessary to hold - * the vm_page_queue_lock (a hot global lock) for certain operations - * on the page... however, the majority of the work can be done - * while merely holding the object lock... in fact there are certain - * collections of pages that don't require any work brokered by the - * vm_page_queue_lock... to mitigate the time spent behind the global - * lock, go to a 2 pass algorithm... collect pages up to DELAYED_WORK_LIMIT - * while doing all of the work that doesn't require the vm_page_queue_lock... - * then call dw_do_work to acquire the vm_page_queue_lock and do the - * necessary work for each page... we will grab the busy bit on the page - * if it's not already held so that dw_do_work can drop the object lock - * if it can't immediately take the vm_page_queue_lock in order to compete - * for the locks in the same order that vm_pageout_scan takes them. 
- * the operation names are modeled after the names of the routines that - * need to be called in order to make the changes very obvious in the - * original loop - */ - -#define DELAYED_WORK_LIMIT 32 - -#define DW_vm_page_unwire 0x01 -#define DW_vm_page_wire 0x02 -#define DW_vm_page_free 0x04 -#define DW_vm_page_activate 0x08 -#define DW_vm_page_deactivate_internal 0x10 -#define DW_vm_page_speculate 0x20 -#define DW_vm_page_lru 0x40 -#define DW_vm_pageout_throttle_up 0x80 -#define DW_PAGE_WAKEUP 0x100 -#define DW_clear_busy 0x200 -#define DW_clear_reference 0x400 -#define DW_set_reference 0x800 - -struct dw { - vm_page_t dw_m; - int dw_mask; -}; - - -static void dw_do_work(vm_object_t object, struct dw *dwp, int dw_count); - - - static upl_t upl_create(int type, int flags, upl_size_t size) { @@ -2797,6 +2811,7 @@ upl_create(int type, int flags, upl_size_t size) upl->size = 0; upl->map_object = NULL; upl->ref_count = 1; + upl->ext_ref_count = 0; upl->highest_page = 0; upl_lock_init(upl); upl->vector_upl = NULL; @@ -2821,6 +2836,10 @@ upl_destroy(upl_t upl) int page_field_size; /* bit field in word size buf */ int size; + if (upl->ext_ref_count) { + panic("upl(%p) ext_ref_count", upl); + } + #if UPL_DEBUG { vm_object_t object; @@ -2864,14 +2883,6 @@ upl_destroy(upl_t upl) } } -void uc_upl_dealloc(upl_t upl); -__private_extern__ void -uc_upl_dealloc(upl_t upl) -{ - if (--upl->ref_count == 0) - upl_destroy(upl); -} - void upl_deallocate(upl_t upl) { @@ -2952,6 +2963,7 @@ vm_object_upl_request( vm_page_t dst_page = VM_PAGE_NULL; vm_object_offset_t dst_offset; upl_size_t xfer_size; + unsigned int size_in_pages; boolean_t dirty; boolean_t hw_dirty; upl_t upl = NULL; @@ -2963,9 +2975,10 @@ vm_object_upl_request( int refmod_state = 0; wpl_array_t lite_list = NULL; vm_object_t last_copy_object; - struct dw dw_array[DELAYED_WORK_LIMIT]; - struct dw *dwp; + struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT]; + struct vm_page_delayed_work *dwp; int dw_count; + int 
dw_limit; if (cntrl_flags & ~UPL_VALID_FLAGS) { /* @@ -3037,7 +3050,7 @@ vm_object_upl_request( upl->map_object->pageout = TRUE; upl->map_object->can_persist = FALSE; upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE; - upl->map_object->shadow_offset = offset; + upl->map_object->vo_shadow_offset = offset; upl->map_object->wimg_bits = object->wimg_bits; VM_PAGE_GRAB_FICTITIOUS(alias_page); @@ -3101,9 +3114,15 @@ vm_object_upl_request( xfer_size = size; dst_offset = offset; + size_in_pages = size / PAGE_SIZE; dwp = &dw_array[0]; dw_count = 0; + dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT); + + if (vm_page_free_count > (vm_page_free_target + size_in_pages) || + object->resident_page_count < (MAX_UPL_SIZE * 2)) + object->scan_collisions = 0; while (xfer_size) { @@ -3174,7 +3193,7 @@ vm_object_upl_request( * currently on the inactive queue or it meets the page * ticket (generation count) check */ - if ( (cntrl_flags & UPL_CLEAN_IN_PLACE || !(refmod_state & VM_MEM_REFERENCED)) && + if ( (cntrl_flags & UPL_CLEAN_IN_PLACE || !(refmod_state & VM_MEM_REFERENCED) || dst_page->throttled) && ((refmod_state & VM_MEM_MODIFIED) || dst_page->dirty || dst_page->precious) ) { goto check_busy; } @@ -3243,12 +3262,7 @@ vm_object_upl_request( vm_page_lockspin_queues(); -#if CONFIG_EMBEDDED - if (dst_page->laundry) -#else - if (dst_page->pageout_queue == TRUE) -#endif - { + if (dst_page->pageout_queue == TRUE) { /* * we've buddied up a page for a clustered pageout * that has already been moved to the pageout @@ -3452,7 +3466,7 @@ vm_object_upl_request( * the default_pager case */ dst_page->list_req_pending = FALSE; - dst_page->busy = FALSE; + PAGE_WAKEUP_DONE(dst_page); } else if (dst_page->pageout || dst_page->cleaning) { /* @@ -3471,9 +3485,8 @@ vm_object_upl_request( * so undo all of the state that vm_pageout_scan * hung on this page */ - dst_page->busy = FALSE; - vm_pageout_queue_steal(dst_page, FALSE); + PAGE_WAKEUP_DONE(dst_page); } } } @@ -3493,11 +3506,31 @@ 
vm_object_upl_request( goto try_next_page; } - /* - * need to allocate a page - */ - dst_page = vm_page_grab(); + if (object->scan_collisions) { + /* + * the pageout_scan thread is trying to steal + * pages from this object, but has run into our + * lock... grab 2 pages from the head of the object... + * the first is freed on behalf of pageout_scan, the + * 2nd is for our own use... we use vm_object_page_grab + * in both cases to avoid taking pages from the free + * list since we are under memory pressure and our + * lock on this object is getting in the way of + * relieving it + */ + dst_page = vm_object_page_grab(object); + + if (dst_page != VM_PAGE_NULL) + vm_page_release(dst_page); + dst_page = vm_object_page_grab(object); + } + if (dst_page == VM_PAGE_NULL) { + /* + * need to allocate a page + */ + dst_page = vm_page_grab(); + } if (dst_page == VM_PAGE_NULL) { if ( (cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) { /* @@ -3516,7 +3549,16 @@ vm_object_upl_request( * offset... */ vm_object_unlock(object); + + OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages); + + VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0); + VM_PAGE_WAIT(); + OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages); + + VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0); + vm_object_lock(object); continue; @@ -3613,7 +3655,12 @@ vm_object_upl_request( alias_page = NULL; } - if (cntrl_flags & UPL_CLEAN_IN_PLACE) { + if (cntrl_flags & UPL_REQUEST_SET_DIRTY) { + upl->flags &= ~UPL_CLEAR_DIRTY; + upl->flags |= UPL_SET_DIRTY; + dirty = TRUE; + upl->flags |= UPL_SET_DIRTY; + } else if (cntrl_flags & UPL_CLEAN_IN_PLACE) { /* * clean in place for read implies * that a write will be done on all @@ -3649,7 +3696,16 @@ vm_object_upl_request( */ dwp->dw_mask |= DW_set_reference; } - dst_page->precious = (cntrl_flags & UPL_PRECIOUS) ? 
TRUE : FALSE; + if (cntrl_flags & UPL_PRECIOUS) { + if (dst_page->object->internal) { + dst_page->dirty = TRUE; + dst_page->precious = FALSE; + } else { + dst_page->precious = TRUE; + } + } else { + dst_page->precious = FALSE; + } } if (dst_page->busy) upl->flags |= UPL_HAS_BUSY; @@ -3690,21 +3746,10 @@ vm_object_upl_request( if (dwp->dw_mask & DW_vm_page_activate) VM_STAT_INCR(reactivations); - if (dst_page->busy == FALSE) { - /* - * dw_do_work may need to drop the object lock - * if it does, we need the pages it's looking at to - * be held stable via the busy bit. - */ - dst_page->busy = TRUE; - dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP); - } - dwp->dw_m = dst_page; - dwp++; - dw_count++; + VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count); - if (dw_count >= DELAYED_WORK_LIMIT) { - dw_do_work(object, &dw_array[0], dw_count); + if (dw_count >= dw_limit) { + vm_page_do_delayed_work(object, &dw_array[0], dw_count); dwp = &dw_array[0]; dw_count = 0; @@ -3715,7 +3760,7 @@ vm_object_upl_request( xfer_size -= PAGE_SIZE; } if (dw_count) - dw_do_work(object, &dw_array[0], dw_count); + vm_page_do_delayed_work(object, &dw_array[0], dw_count); if (alias_page != NULL) { VM_PAGE_FREE(alias_page); @@ -3825,7 +3870,7 @@ vm_object_super_upl_request( base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1)); super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster<<1 : super_cluster; - super_size_64 = ((base_offset + super_size) > object->size) ? (object->size - base_offset) : super_size; + super_size_64 = ((base_offset + super_size) > object->vo_size) ? 
(object->vo_size - base_offset) : super_size; super_size = (upl_size_t) super_size_64; assert(super_size == super_size_64); @@ -3999,7 +4044,7 @@ vm_map_create_upl( (vm_object_offset_t) ((offset - local_start) + local_offset) + - local_object->shadow_offset, + local_object->vo_shadow_offset, *upl_size, FALSE, MEMORY_OBJECT_DATA_SYNC, VM_PROT_NO_CHANGE); @@ -4132,6 +4177,7 @@ vm_map_enter_upl( upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ ); if(upl == NULL) goto process_upl_to_enter; + vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size); *dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset); } else { @@ -4169,9 +4215,9 @@ vm_map_enter_upl( upl->map_object->pageout = TRUE; upl->map_object->can_persist = FALSE; upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE; - upl->map_object->shadow_offset = upl->offset - object->paging_offset; + upl->map_object->vo_shadow_offset = upl->offset - object->paging_offset; upl->map_object->wimg_bits = object->wimg_bits; - offset = upl->map_object->shadow_offset; + offset = upl->map_object->vo_shadow_offset; new_offset = 0; size = upl->size; @@ -4246,6 +4292,7 @@ vm_map_enter_upl( offset = 0; else offset = upl->offset - upl->map_object->paging_offset; + size = upl->size; vm_object_reference(upl->map_object); @@ -4277,9 +4324,6 @@ vm_map_enter_upl( m = vm_page_lookup(upl->map_object, offset); if (m) { - unsigned int cache_attr; - cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK; - m->pmapped = TRUE; /* CODE SIGNING ENFORCEMENT: page has been wpmapped, @@ -4288,7 +4332,7 @@ vm_map_enter_upl( /* m->wpmapped = TRUE; */ assert(map==kernel_map); - PMAP_ENTER(map->pmap, addr, m, VM_PROT_ALL, cache_attr, TRUE); + PMAP_ENTER(map->pmap, addr, m, VM_PROT_ALL, 0, TRUE); } offset += PAGE_SIZE_64; } @@ -4411,88 +4455,6 @@ vm_map_remove_upl( return KERN_FAILURE; } -static void -dw_do_work( - vm_object_t object, - struct dw *dwp, - int dw_count) -{ - int j; - boolean_t 
held_as_spin = TRUE; - - /* - * pageout_scan takes the vm_page_lock_queues first - * then tries for the object lock... to avoid what - * is effectively a lock inversion, we'll go to the - * trouble of taking them in that same order... otherwise - * if this object contains the majority of the pages resident - * in the UBC (or a small set of large objects actively being - * worked on contain the majority of the pages), we could - * cause the pageout_scan thread to 'starve' in its attempt - * to find pages to move to the free queue, since it has to - * successfully acquire the object lock of any candidate page - * before it can steal/clean it. - */ - if (!vm_page_trylockspin_queues()) { - vm_object_unlock(object); - - vm_page_lockspin_queues(); - - for (j = 0; ; j++) { - if (!vm_object_lock_avoid(object) && - _vm_object_lock_try(object)) - break; - vm_page_unlock_queues(); - mutex_pause(j); - vm_page_lockspin_queues(); - } - } - for (j = 0; j < dw_count; j++, dwp++) { - - if (dwp->dw_mask & DW_vm_pageout_throttle_up) - vm_pageout_throttle_up(dwp->dw_m); - - if (dwp->dw_mask & DW_vm_page_wire) - vm_page_wire(dwp->dw_m); - else if (dwp->dw_mask & DW_vm_page_unwire) { - boolean_t queueit; - - queueit = (dwp->dw_mask & DW_vm_page_free) ? 
FALSE : TRUE; - - vm_page_unwire(dwp->dw_m, queueit); - } - if (dwp->dw_mask & DW_vm_page_free) { - if (held_as_spin == TRUE) { - vm_page_lockconvert_queues(); - held_as_spin = FALSE; - } - vm_page_free(dwp->dw_m); - } else { - if (dwp->dw_mask & DW_vm_page_deactivate_internal) - vm_page_deactivate_internal(dwp->dw_m, FALSE); - else if (dwp->dw_mask & DW_vm_page_activate) - vm_page_activate(dwp->dw_m); - else if (dwp->dw_mask & DW_vm_page_speculate) - vm_page_speculate(dwp->dw_m, TRUE); - else if (dwp->dw_mask & DW_vm_page_lru) - vm_page_lru(dwp->dw_m); - - if (dwp->dw_mask & DW_set_reference) - dwp->dw_m->reference = TRUE; - else if (dwp->dw_mask & DW_clear_reference) - dwp->dw_m->reference = FALSE; - - if (dwp->dw_mask & DW_clear_busy) - dwp->dw_m->busy = FALSE; - - if (dwp->dw_mask & DW_PAGE_WAKEUP) - PAGE_WAKEUP(dwp->dw_m); - } - } - vm_page_unlock_queues(); -} - - kern_return_t upl_commit_range( @@ -4514,10 +4476,13 @@ upl_commit_range( int occupied; int clear_refmod = 0; int pgpgout_count = 0; - struct dw dw_array[DELAYED_WORK_LIMIT]; - struct dw *dwp; - int dw_count, isVectorUPL = 0; + struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT]; + struct vm_page_delayed_work *dwp; + int dw_count; + int dw_limit; + int isVectorUPL = 0; upl_t vector_upl = NULL; + boolean_t should_be_throttled = FALSE; *empty = FALSE; @@ -4575,6 +4540,8 @@ upl_commit_range( } return KERN_FAILURE; } + if (upl->flags & UPL_SET_DIRTY) + flags |= UPL_COMMIT_SET_DIRTY; if (upl->flags & UPL_CLEAR_DIRTY) flags |= UPL_COMMIT_CLEAR_DIRTY; @@ -4622,9 +4589,12 @@ upl_commit_range( */ flags &= ~UPL_COMMIT_CS_VALIDATED; } + if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) && shadow_object->internal) + should_be_throttled = TRUE; dwp = &dw_array[0]; dw_count = 0; + dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT); while (xfer_size) { vm_page_t t, m; @@ -4655,7 +4625,7 @@ upl_commit_range( VM_PAGE_FREE(t); if (m == VM_PAGE_NULL) - m = vm_page_lookup(shadow_object, 
target_offset + object->shadow_offset); + m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset); } } if ((upl->flags & UPL_KERNEL_OBJECT) || m == VM_PAGE_NULL) @@ -4675,9 +4645,9 @@ upl_commit_range( if (page_list) page_list[entry].phys_addr = 0; - if (flags & UPL_COMMIT_SET_DIRTY) + if (flags & UPL_COMMIT_SET_DIRTY) { m->dirty = TRUE; - else if (flags & UPL_COMMIT_CLEAR_DIRTY) { + } else if (flags & UPL_COMMIT_CLEAR_DIRTY) { m->dirty = FALSE; if (! (flags & UPL_COMMIT_CS_VALIDATED) && @@ -4869,29 +4839,32 @@ upl_commit_range( pmap_disconnect(m->phys_page); } - if ((m->busy) && (m->cleaning)) { + if (m->overwriting) { /* - * the request_page_list case + * the (COPY_OUT_FROM == FALSE) request_page_list case */ - m->absent = FALSE; - m->overwriting = FALSE; + if (m->busy) { + m->absent = FALSE; - dwp->dw_mask |= DW_clear_busy; + dwp->dw_mask |= DW_clear_busy; + } else { + /* + * alternate (COPY_OUT_FROM == FALSE) page_list case + * Occurs when the original page was wired + * at the time of the list request + */ + assert(VM_PAGE_WIRED(m)); - } else if (m->overwriting) { - /* - * alternate request page list, write to - * page_list case. 
Occurs when the original - * page was wired at the time of the list - * request - */ - assert(VM_PAGE_WIRED(m)); + dwp->dw_mask |= DW_vm_page_unwire; /* reactivates */ + } m->overwriting = FALSE; + } + if (m->encrypted_cleaning == TRUE) { + m->encrypted_cleaning = FALSE; - dwp->dw_mask |= DW_vm_page_unwire; /* reactivates */ + dwp->dw_mask |= DW_clear_busy; } m->cleaning = FALSE; - m->encrypted_cleaning = FALSE; /* * It is a part of the semantic of COPYOUT_FROM @@ -4906,19 +4879,30 @@ upl_commit_range( if (flags & UPL_COMMIT_SET_DIRTY) m->dirty = TRUE; - if ((flags & UPL_COMMIT_INACTIVATE) && !m->clustered && !m->speculative) { - dwp->dw_mask |= DW_vm_page_deactivate_internal; - clear_refmod |= VM_MEM_REFERENCED; - - } else if (!m->active && !m->inactive && !m->speculative) { + if (should_be_throttled == TRUE && !m->active && !m->inactive && !m->speculative && !m->throttled) { + /* + * page coming back in from being 'frozen'... + * it was dirty before it was frozen, so keep it so + * the vm_page_activate will notice that it really belongs + * on the throttle queue and put it there + */ + m->dirty = TRUE; + dwp->dw_mask |= DW_vm_page_activate; - if (m->clustered || (flags & UPL_COMMIT_SPECULATE)) - dwp->dw_mask |= DW_vm_page_speculate; - else if (m->reference) - dwp->dw_mask |= DW_vm_page_activate; - else { + } else { + if ((flags & UPL_COMMIT_INACTIVATE) && !m->clustered && !m->speculative) { dwp->dw_mask |= DW_vm_page_deactivate_internal; clear_refmod |= VM_MEM_REFERENCED; + } else if (!m->active && !m->inactive && !m->speculative) { + + if (m->clustered || (flags & UPL_COMMIT_SPECULATE)) + dwp->dw_mask |= DW_vm_page_speculate; + else if (m->reference) + dwp->dw_mask |= DW_vm_page_activate; + else { + dwp->dw_mask |= DW_vm_page_deactivate_internal; + clear_refmod |= VM_MEM_REFERENCED; + } } } if (upl->flags & UPL_ACCESS_BLOCKED) { @@ -4944,21 +4928,10 @@ upl_commit_range( if (dwp->dw_mask) { if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) { - if (m->busy == 
FALSE) { - /* - * dw_do_work may need to drop the object lock - * if it does, we need the pages it's looking at to - * be held stable via the busy bit. - */ - m->busy = TRUE; - dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP); - } - dwp->dw_m = m; - dwp++; - dw_count++; + VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count); - if (dw_count >= DELAYED_WORK_LIMIT) { - dw_do_work(shadow_object, &dw_array[0], dw_count); + if (dw_count >= dw_limit) { + vm_page_do_delayed_work(shadow_object, &dw_array[0], dw_count); dwp = &dw_array[0]; dw_count = 0; @@ -4973,7 +4946,7 @@ upl_commit_range( } } if (dw_count) - dw_do_work(shadow_object, &dw_array[0], dw_count); + vm_page_do_delayed_work(shadow_object, &dw_array[0], dw_count); occupied = 1; @@ -5071,9 +5044,11 @@ upl_abort_range( int entry; wpl_array_t lite_list; int occupied; - struct dw dw_array[DELAYED_WORK_LIMIT]; - struct dw *dwp; - int dw_count, isVectorUPL = 0; + struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT]; + struct vm_page_delayed_work *dwp; + int dw_count; + int dw_limit; + int isVectorUPL = 0; upl_t vector_upl = NULL; *empty = FALSE; @@ -5166,6 +5141,7 @@ upl_abort_range( dwp = &dw_array[0]; dw_count = 0; + dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT); if ((error & UPL_ABORT_DUMP_PAGES) && (upl->flags & UPL_KERNEL_OBJECT)) panic("upl_abort_range: kernel_object being DUMPED"); @@ -5199,7 +5175,7 @@ upl_abort_range( VM_PAGE_FREE(t); if (m == VM_PAGE_NULL) - m = vm_page_lookup(shadow_object, target_offset + object->shadow_offset); + m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset); } } if ((upl->flags & UPL_KERNEL_OBJECT)) @@ -5210,7 +5186,6 @@ upl_abort_range( if (m->absent) { boolean_t must_free = TRUE; - m->clustered = FALSE; /* * COPYOUT = FALSE case * check for error conditions which must @@ -5232,6 +5207,18 @@ upl_abort_range( m->unusual = TRUE; must_free = FALSE; } + if (m->clustered) { + /* + * This page was a part of a speculative + * read-ahead initiated by 
the kernel + * itself. No one is expecting this + * page and no one will clean up its + * error state if it ever becomes valid + * in the future. + * We have to free it here. + */ + must_free = TRUE; + } /* * ENCRYPTED SWAP: @@ -5244,6 +5231,21 @@ upl_abort_range( m->cleaning = FALSE; m->encrypted_cleaning = FALSE; + + if (m->overwriting && !m->busy) { + /* + * this shouldn't happen since + * this is an 'absent' page, but + * it doesn't hurt to check for + * the 'alternate' method of + * stabilizing the page... + * we will mark 'busy' to be cleared + * in the following code which will + * take care of the primary stabilzation + * method (i.e. setting 'busy' to TRUE) + */ + dwp->dw_mask |= DW_vm_page_unwire; + } m->overwriting = FALSE; dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP); @@ -5259,17 +5261,45 @@ upl_abort_range( if (m->laundry) dwp->dw_mask |= DW_vm_pageout_throttle_up; + if (upl->flags & UPL_ACCESS_BLOCKED) { + /* + * We blocked access to the pages in this UPL. + * Clear the "busy" bit and wake up any waiter + * for this page. + */ + dwp->dw_mask |= DW_clear_busy; + } if (m->pageout) { assert(m->busy); assert(m->wire_count == 1); m->pageout = FALSE; - dwp->dw_mask |= DW_vm_page_unwire; + dwp->dw_mask |= (DW_vm_page_unwire | DW_clear_busy); + } + if (m->overwriting) { + if (m->busy) + dwp->dw_mask |= DW_clear_busy; + else { + /* + * deal with the 'alternate' method + * of stabilizing the page... + * we will either free the page + * or mark 'busy' to be cleared + * in the following code which will + * take care of the primary stabilzation + * method (i.e. 
setting 'busy' to TRUE) + */ + dwp->dw_mask |= DW_vm_page_unwire; + } + m->overwriting = FALSE; + } + if (m->encrypted_cleaning == TRUE) { + m->encrypted_cleaning = FALSE; + + dwp->dw_mask |= DW_clear_busy; } m->dump_cleaning = FALSE; m->cleaning = FALSE; - m->encrypted_cleaning = FALSE; - m->overwriting = FALSE; #if MACH_PAGEMAP vm_external_state_clr(m->object->existence_map, m->offset); #endif /* MACH_PAGEMAP */ @@ -5287,7 +5317,7 @@ upl_abort_range( */ dwp->dw_mask |= DW_vm_page_lru; } - dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP); + dwp->dw_mask |= DW_PAGE_WAKEUP; } } } @@ -5298,21 +5328,10 @@ upl_abort_range( if (dwp->dw_mask) { if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) { - if (m->busy == FALSE) { - /* - * dw_do_work may need to drop the object lock - * if it does, we need the pages it's looking at to - * be held stable via the busy bit. - */ - m->busy = TRUE; - dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP); - } - dwp->dw_m = m; - dwp++; - dw_count++; + VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count); - if (dw_count >= DELAYED_WORK_LIMIT) { - dw_do_work(shadow_object, &dw_array[0], dw_count); + if (dw_count >= dw_limit) { + vm_page_do_delayed_work(shadow_object, &dw_array[0], dw_count); dwp = &dw_array[0]; dw_count = 0; @@ -5327,7 +5346,7 @@ upl_abort_range( } } if (dw_count) - dw_do_work(shadow_object, &dw_array[0], dw_count); + vm_page_do_delayed_work(shadow_object, &dw_array[0], dw_count); occupied = 1; @@ -5449,13 +5468,15 @@ vm_object_iopl_request( unsigned int entry; wpl_array_t lite_list = NULL; int no_zero_fill = FALSE; + unsigned int size_in_pages; u_int32_t psize; kern_return_t ret; vm_prot_t prot; struct vm_object_fault_info fault_info; - struct dw dw_array[DELAYED_WORK_LIMIT]; - struct dw *dwp; + struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT]; + struct vm_page_delayed_work *dwp; int dw_count; + int dw_limit; int dw_index; if (cntrl_flags & ~UPL_VALID_FLAGS) { @@ -5473,10 +5494,10 @@ vm_object_iopl_request( return 
KERN_INVALID_VALUE; if (object->phys_contiguous) { - if ((offset + object->shadow_offset) >= (vm_object_offset_t)max_valid_dma_address) + if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address) return KERN_INVALID_ADDRESS; - if (((offset + object->shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address) + if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address) return KERN_INVALID_ADDRESS; } } @@ -5543,6 +5564,8 @@ vm_object_iopl_request( upl->map_object = object; upl->size = size; + size_in_pages = size / PAGE_SIZE; + if (object == kernel_object && !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) { upl->flags |= UPL_KERNEL_OBJECT; @@ -5586,10 +5609,10 @@ vm_object_iopl_request( */ upl->flags |= UPL_DEVICE_MEMORY; - upl->highest_page = (ppnum_t) ((offset + object->shadow_offset + size - 1)>>PAGE_SHIFT); + upl->highest_page = (ppnum_t) ((offset + object->vo_shadow_offset + size - 1)>>PAGE_SHIFT); if (user_page_list) { - user_page_list[0].phys_addr = (ppnum_t) ((offset + object->shadow_offset)>>PAGE_SHIFT); + user_page_list[0].phys_addr = (ppnum_t) ((offset + object->vo_shadow_offset)>>PAGE_SHIFT); user_page_list[0].device = TRUE; } if (page_list_count != NULL) { @@ -5609,7 +5632,7 @@ vm_object_iopl_request( if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) object->copy_strategy = MEMORY_OBJECT_COPY_DELAY; } - + #if UPL_DEBUG queue_enter(&object->uplq, upl, upl_t, uplq); #endif /* UPL_DEBUG */ @@ -5659,10 +5682,13 @@ vm_object_iopl_request( fault_info.hi_offset = offset + xfer_size; fault_info.no_cache = FALSE; fault_info.stealth = FALSE; + fault_info.io_sync = FALSE; + fault_info.cs_bypass = FALSE; fault_info.mark_zf_absent = TRUE; dwp = &dw_array[0]; dw_count = 0; + dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT); while (xfer_size) { vm_fault_return_t result; @@ -5756,17 +5782,23 @@ vm_object_iopl_request( vm_object_lock(object); break; - case 
VM_FAULT_FICTITIOUS_SHORTAGE: - vm_page_more_fictitious(); + case VM_FAULT_MEMORY_SHORTAGE: + OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages); - vm_object_lock(object); - break; + VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0); - case VM_FAULT_MEMORY_SHORTAGE: if (vm_page_wait(interruptible)) { + OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages); + + VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0); vm_object_lock(object); + break; } + OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages); + + VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1); + /* fall thru */ case VM_FAULT_INTERRUPTED: @@ -5791,7 +5823,6 @@ vm_object_iopl_request( } while (result != VM_FAULT_SUCCESS); } - if (upl->flags & UPL_KERNEL_OBJECT) goto record_phys_addr; @@ -5845,7 +5876,7 @@ vm_object_iopl_request( else refmod = 0; - if ( !dst_page->absent) + if (!dst_page->absent) vm_page_copy(dst_page, low_page); low_page->reference = dst_page->reference; @@ -5874,7 +5905,8 @@ vm_object_iopl_request( if (cntrl_flags & UPL_BLOCK_ACCESS) { /* * Mark the page "busy" to block any future page fault - * on this page. We'll also remove the mapping + * on this page in addition to wiring it. + * We'll also remove the mapping * of all these pages before leaving this routine. */ assert(!dst_page->fictitious); @@ -5926,21 +5958,10 @@ vm_object_iopl_request( xfer_size -= PAGE_SIZE; if (dwp->dw_mask) { - if (dst_page->busy == FALSE) { - /* - * dw_do_work may need to drop the object lock - * if it does, we need the pages it's looking at to - * be held stable via the busy bit. 
- */ - dst_page->busy = TRUE; - dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP); - } - dwp->dw_m = dst_page; - dwp++; - dw_count++; + VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count); - if (dw_count >= DELAYED_WORK_LIMIT) { - dw_do_work(object, &dw_array[0], dw_count); + if (dw_count >= dw_limit) { + vm_page_do_delayed_work(object, &dw_array[0], dw_count); dwp = &dw_array[0]; dw_count = 0; @@ -5948,7 +5969,7 @@ vm_object_iopl_request( } } if (dw_count) - dw_do_work(object, &dw_array[0], dw_count); + vm_page_do_delayed_work(object, &dw_array[0], dw_count); if (page_list_count != NULL) { if (upl->flags & UPL_INTERNAL) @@ -6019,7 +6040,7 @@ vm_object_iopl_request( vm_page_unwire(dst_page, TRUE); PAGE_WAKEUP_DONE(dst_page); - } + } vm_page_unlock_queues(); if (need_unwire == TRUE) @@ -6154,7 +6175,7 @@ upl_transpose( * can call the encryption/decryption routines with a kernel * virtual address. We keep this pool of pre-allocated kernel * virtual addresses so that we don't have to scan the kernel's - * virtaul address space each time we need to encrypt or decrypt + * virtual address space each time we need to encrypt or decrypt * a physical page. * It would be nice to be able to encrypt and decrypt in physical * mode but that might not always be more efficient... 
@@ -6197,6 +6218,9 @@ vm_paging_map_init(void) } map_entry->object.vm_object = kernel_object; map_entry->offset = page_map_offset; + map_entry->protection = VM_PROT_NONE; + map_entry->max_protection = VM_PROT_NONE; + map_entry->permanent = TRUE; vm_object_reference(kernel_object); vm_map_unlock(kernel_map); @@ -6295,9 +6319,6 @@ vm_paging_map_object( vm_paging_page_inuse[i] = TRUE; simple_unlock(&vm_paging_lock); - if (page->pmapped == FALSE) { - pmap_sync_page_data_phys(page->phys_page); - } page->pmapped = TRUE; /* @@ -6310,8 +6331,7 @@ vm_paging_map_object( page_map_offset, page, protection, - ((int) page->object->wimg_bits & - VM_WIMG_MASK), + 0, TRUE); vm_paging_objects_mapped++; vm_paging_pages_mapped++; @@ -6380,7 +6400,6 @@ vm_paging_map_object( for (page_map_offset = 0; map_size != 0; map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) { - unsigned int cache_attr; page = vm_page_lookup(object, offset + page_map_offset); if (page == VM_PAGE_NULL) { @@ -6394,18 +6413,14 @@ vm_paging_map_object( vm_object_lock(object); return KERN_MEMORY_ERROR; } - if (page->pmapped == FALSE) { - pmap_sync_page_data_phys(page->phys_page); - } page->pmapped = TRUE; - cache_attr = ((unsigned int) object->wimg_bits) & VM_WIMG_MASK; //assert(pmap_verify_free(page->phys_page)); PMAP_ENTER(kernel_pmap, *address + page_map_offset, page, protection, - cache_attr, + 0, TRUE); } @@ -6742,10 +6757,13 @@ vm_page_decrypt( vm_object_offset_t paging_offset; } vm; } decrypt_iv; + boolean_t was_dirty; assert(page->busy); assert(page->encrypted); + was_dirty = page->dirty; + /* * Take a paging-in-progress reference to keep the object * alive even if we have to unlock it (in vm_paging_map_object() @@ -6817,16 +6835,24 @@ vm_page_decrypt( kernel_vaddr + PAGE_SIZE); } - /* - * After decryption, the page is actually clean. - * It was encrypted as part of paging, which "cleans" - * the "dirty" pages. - * Noone could access it after it was encrypted - * and the decryption doesn't count. 
- */ - page->dirty = FALSE; - assert (page->cs_validated == FALSE); - pmap_clear_refmod(page->phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED); + if (was_dirty) { + /* + * The pager did not specify that the page would be + * clean when it got paged in, so let's not clean it here + * either. + */ + } else { + /* + * After decryption, the page is actually still clean. + * It was encrypted as part of paging, which "cleans" + * the "dirty" pages. + * Noone could access it after it was encrypted + * and the decryption doesn't count. + */ + page->dirty = FALSE; + assert (page->cs_validated == FALSE); + pmap_clear_refmod(page->phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED); + } page->encrypted = FALSE; /* @@ -6943,7 +6969,7 @@ upl_encrypt( base_offset + offset_in_upl); if (page == VM_PAGE_NULL) { panic("upl_encrypt: " - "no page for (obj=%p,off=%lld+%d)!\n", + "no page for (obj=%p,off=0x%llx+0x%x)!\n", shadow_object, base_offset, offset_in_upl); @@ -7373,6 +7399,126 @@ upl_clear_dirty( } } +void +upl_set_referenced( + upl_t upl, + boolean_t value) +{ + upl_lock(upl); + if (value) { + upl->ext_ref_count++; + } else { + if (!upl->ext_ref_count) { + panic("upl_set_referenced not %p\n", upl); + } + upl->ext_ref_count--; + } + upl_unlock(upl); +} + +boolean_t +vm_page_is_slideable(vm_page_t m) +{ + boolean_t result = FALSE; + vm_object_t slide_object = slide_info.slide_object; + mach_vm_offset_t start = slide_info.start; + mach_vm_offset_t end = slide_info.end; + + /* make sure our page belongs to the one object allowed to do this */ + if (slide_object == VM_OBJECT_NULL) { + return result; + } + + /*Should we traverse down the chain?*/ + if (m->object != slide_object) { + return result; + } + + if(!m->slid && (start <= m->offset && end > m->offset)) { + result = TRUE; + } + return result; +} + +int vm_page_slide_counter = 0; +int vm_page_slide_errors = 0; +kern_return_t +vm_page_slide( + vm_page_t page, + vm_map_offset_t kernel_mapping_offset) +{ + kern_return_t kr; + 
vm_map_size_t kernel_mapping_size; + vm_offset_t kernel_vaddr; + uint32_t pageIndex = 0; + + assert(!page->slid); + + /* + * Take a paging-in-progress reference to keep the object + * alive even if we have to unlock it (in vm_paging_map_object() + * for example)... + */ + vm_object_paging_begin(page->object); + + if (kernel_mapping_offset == 0) { + /* + * The page hasn't already been mapped in kernel space + * by the caller. Map it now, so that we can access + * its contents and decrypt them. + */ + kernel_mapping_size = PAGE_SIZE; + kr = vm_paging_map_object(&kernel_mapping_offset, + page, + page->object, + page->offset, + &kernel_mapping_size, + VM_PROT_READ | VM_PROT_WRITE, + FALSE); + if (kr != KERN_SUCCESS) { + panic("vm_page_slide: " + "could not map page in kernel: 0x%x\n", + kr); + } + } else { + kernel_mapping_size = 0; + } + kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset); + + /* + * Slide the pointers on the page. + */ + + /*assert that slide_file_info.start/end are page-aligned?*/ + + pageIndex = (uint32_t)((page->offset - slide_info.start)/PAGE_SIZE); + kr = vm_shared_region_slide(kernel_vaddr, pageIndex); + vm_page_slide_counter++; + + /* + * Unmap the page from the kernel's address space, + */ + if (kernel_mapping_size != 0) { + vm_paging_unmap_object(page->object, + kernel_vaddr, + kernel_vaddr + PAGE_SIZE); + } + + page->dirty = FALSE; + pmap_clear_refmod(page->phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED); + + if (kr == KERN_SUCCESS) { + page->slid = TRUE; + } else { + page->error = TRUE; + vm_page_slide_errors++; + } + + vm_object_paging_end(page->object); + + return kr; +} + #ifdef MACH_BSD @@ -7568,9 +7714,9 @@ db_pageout(void) iprintf("nolock %5d avoid %5d busy %5d absent %5d\n", vm_pageout_inactive_nolock, vm_pageout_inactive_avoid, vm_pageout_inactive_busy, vm_pageout_inactive_absent); - iprintf("used %5d clean %5d dirty %5d\n", + iprintf("used %5d clean %5d dirty(internal) %5d dirty(external) %5d\n", vm_pageout_inactive_used, 
vm_pageout_inactive_clean, - vm_pageout_inactive_dirty); + vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external); #if MACH_COUNTERS iprintf("laundry_pages_freed %d\n", c_laundry_pages_freed); #endif /* MACH_COUNTERS */ diff --git a/osfmk/vm/vm_pageout.h b/osfmk/vm/vm_pageout.h index 31a8b61df..d8ddac6a7 100644 --- a/osfmk/vm/vm_pageout.h +++ b/osfmk/vm/vm_pageout.h @@ -85,6 +85,40 @@ #include #endif +#include + +#if CONFIG_FREEZE +extern boolean_t vm_freeze_enabled; +#define VM_DYNAMIC_PAGING_ENABLED(port) ((vm_freeze_enabled == FALSE) && IP_VALID(port)) +#else +#define VM_DYNAMIC_PAGING_ENABLED(port) IP_VALID(port) +#endif + + +extern int vm_debug_events; + +#define VMF_CHECK_ZFDELAY 0x100 +#define VMF_COWDELAY 0x101 +#define VMF_ZFDELAY 0x102 + +#define VM_PAGEOUT_SCAN 0x104 +#define VM_PAGEOUT_BALANCE 0x105 +#define VM_PAGEOUT_FREELIST 0x106 +#define VM_PAGEOUT_PURGEONE 0x107 +#define VM_PAGEOUT_CACHE_EVICT 0x108 +#define VM_PAGEOUT_THREAD_BLOCK 0x109 + +#define VM_UPL_PAGE_WAIT 0x120 +#define VM_IOPL_PAGE_WAIT 0x121 + +#define VM_DEBUG_EVENT(name, event, control, arg1, arg2, arg3, arg4) \ + MACRO_BEGIN \ + if (vm_debug_events) { \ + KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, event)) | control, arg1, arg2, arg3, arg4, 0); \ + } \ + MACRO_END + + extern kern_return_t vm_map_create_upl( vm_map_t map, @@ -106,7 +140,6 @@ extern upl_size_t upl_get_size( typedef struct vm_page *vm_page_t; #endif - extern void vm_page_free_list( vm_page_t mem, boolean_t prepare_object); @@ -129,22 +162,6 @@ extern unsigned int vm_pageout_scan_event_counter; extern unsigned int vm_zf_queue_count; -#if defined(__ppc__) /* On ppc, vm statistics are still 32-bit */ - -extern unsigned int vm_zf_count; - -#define VM_ZF_COUNT_INCR() \ - MACRO_BEGIN \ - OSAddAtomic(1, (SInt32 *) &vm_zf_count); \ - MACRO_END \ - -#define VM_ZF_COUNT_DECR() \ - MACRO_BEGIN \ - OSAddAtomic(-1, (SInt32 *) &vm_zf_count); \ - MACRO_END \ - -#else /* !(defined(__ppc__)) */ - extern uint64_t 
vm_zf_count; #define VM_ZF_COUNT_INCR() \ @@ -157,8 +174,6 @@ extern uint64_t vm_zf_count; OSAddAtomic64(-1, (SInt64 *) &vm_zf_count); \ MACRO_END \ -#endif /* !(defined(__ppc__)) */ - /* * must hold the page queues lock to * manipulate this structure @@ -181,6 +196,7 @@ struct vm_pageout_queue { extern struct vm_pageout_queue vm_pageout_queue_internal; extern struct vm_pageout_queue vm_pageout_queue_external; + /* * Routines exported to Mach. */ @@ -252,6 +268,7 @@ struct ucd { struct upl { decl_lck_mtx_data(, Lock) /* Synchronization */ int ref_count; + int ext_ref_count; int flags; vm_object_t src_object; /* object derived from */ vm_object_offset_t offset; @@ -290,7 +307,8 @@ struct upl { #define UPL_SHADOWED 0x1000 #define UPL_KERNEL_OBJECT 0x2000 #define UPL_VECTOR 0x4000 -#define UPL_HAS_BUSY 0x10000 +#define UPL_SET_DIRTY 0x8000 +#define UPL_HAS_BUSY 0x10000 /* flags for upl_create flags parameter */ #define UPL_CREATE_EXTERNAL 0 @@ -385,6 +403,9 @@ extern void vm_pageout_queue_steal( vm_page_t page, boolean_t queues_locked); +extern boolean_t vm_page_is_slideable(vm_page_t m); + +extern kern_return_t vm_page_slide(vm_page_t page, vm_map_offset_t kernel_mapping_offset); #endif /* MACH_KERNEL_PRIVATE */ #if UPL_DEBUG diff --git a/osfmk/vm/vm_protos.h b/osfmk/vm/vm_protos.h index a4562ce8a..53cf9e12a 100644 --- a/osfmk/vm/vm_protos.h +++ b/osfmk/vm/vm_protos.h @@ -135,15 +135,6 @@ extern kern_return_t vm_region_object_create extern mach_vm_offset_t mach_get_vm_start(vm_map_t); extern mach_vm_offset_t mach_get_vm_end(vm_map_t); -/* - * Legacy routines to get the start and end for a vm_map_t. They - * return them in the vm_offset_t format. So, they should only be - * called on maps that are the same size as the kernel map for - * accurate results. 
- */ -extern vm_offset_t get_vm_start(vm_map_t); -extern vm_offset_t get_vm_end(vm_map_t); - #if CONFIG_CODE_DECRYPTION struct pager_crypt_info; extern kern_return_t vm_map_apple_protected( @@ -179,12 +170,17 @@ extern pager_return_t vnode_pageout( struct vnode *, upl_t, upl_offset_t, vm_object_offset_t, upl_size_t, int, int *); +extern uint32_t vnode_trim (struct vnode *, int64_t offset, unsigned long len); extern memory_object_t vnode_pager_setup( struct vnode *, memory_object_t); extern vm_object_offset_t vnode_pager_get_filesize( struct vnode *); extern uint32_t vnode_pager_isinuse( struct vnode *); +extern boolean_t vnode_pager_isSSD( + struct vnode *); +extern void vnode_pager_throttle( + void); extern uint32_t vnode_pager_return_hard_throttle_limit( struct vnode *, uint32_t *, @@ -199,7 +195,14 @@ extern kern_return_t vnode_pager_get_filename( extern kern_return_t vnode_pager_get_cs_blobs( struct vnode *vp, void **blobs); - + +#if CHECK_CS_VALIDATION_BITMAP +/* used by the vnode_pager_cs_validation_bitmap routine*/ +#define CS_BITMAP_SET 1 +#define CS_BITMAP_CLEAR 2 +#define CS_BITMAP_CHECK 3 + +#endif /* CHECK_CS_VALIDATION_BITMAP */ extern void vnode_pager_bootstrap(void) __attribute__((section("__TEXT, initcode"))); extern kern_return_t @@ -218,6 +221,9 @@ extern kern_return_t vnode_pager_get_object_size( extern kern_return_t vnode_pager_get_isinuse( memory_object_t, uint32_t *); +extern kern_return_t vnode_pager_get_isSSD( + memory_object_t, + boolean_t *); extern kern_return_t vnode_pager_check_hard_throttle( memory_object_t, uint32_t *, @@ -232,6 +238,19 @@ extern kern_return_t vnode_pager_get_object_filename( extern kern_return_t vnode_pager_get_object_cs_blobs( memory_object_t mem_obj, void **blobs); + +#if CHECK_CS_VALIDATION_BITMAP +extern kern_return_t vnode_pager_cs_check_validation_bitmap( + memory_object_t mem_obj, + memory_object_offset_t offset, + int optype); +#endif /*CHECK_CS_VALIDATION_BITMAP*/ + +extern kern_return_t 
ubc_cs_check_validation_bitmap ( + struct vnode *vp, + memory_object_offset_t offset, + int optype); + extern kern_return_t vnode_pager_data_request( memory_object_t, memory_object_offset_t, @@ -322,6 +341,10 @@ extern kern_return_t default_pager_memory_object_create( memory_object_t *); #endif /* _memory_object_default_server_ */ +#if CONFIG_FREEZE +extern unsigned int default_pager_swap_pages_free(void); +#endif + extern void device_pager_reference(memory_object_t); extern void device_pager_deallocate(memory_object_t); extern kern_return_t device_pager_init(memory_object_t, @@ -422,6 +445,11 @@ extern int macx_backing_store_compaction(int flags); extern unsigned int mach_vm_ctl_page_free_wanted(void); extern void no_paging_space_action(void); + +#define VM_TOGGLE_CLEAR 0 +#define VM_TOGGLE_SET 1 +#define VM_TOGGLE_GETVALUE 999 +int vm_toggle_entry_reuse(int, int*); #endif /* _VM_VM_PROTOS_H_ */ #endif /* XNU_KERNEL_PRIVATE */ diff --git a/osfmk/vm/vm_purgeable_internal.h b/osfmk/vm/vm_purgeable_internal.h index 5e6d4e4af..4f720eb39 100644 --- a/osfmk/vm/vm_purgeable_internal.h +++ b/osfmk/vm/vm_purgeable_internal.h @@ -46,15 +46,9 @@ enum purgeable_q_type { PURGEABLE_Q_TYPE_MAX }; -#if (CONFIG_TOKEN_QUEUE_SMALL == 1) -typedef uint16_t token_idx_t; -typedef uint16_t token_cnt_t; -#define TOKEN_COUNT_MAX UINT16_MAX -#else typedef uint32_t token_idx_t; typedef uint32_t token_cnt_t; #define TOKEN_COUNT_MAX UINT32_MAX -#endif #define NUM_VOLATILE_GROUPS 8 struct purgeable_q { diff --git a/osfmk/vm/vm_resident.c b/osfmk/vm/vm_resident.c index 979c81624..397914b0c 100644 --- a/osfmk/vm/vm_resident.c +++ b/osfmk/vm/vm_resident.c @@ -85,8 +85,7 @@ #include #include #include -#include /* (BRINGUP) */ -#include /* (BRINGUP) */ +#include #include #include @@ -95,17 +94,15 @@ #include -#if CONFIG_EMBEDDED #include -#endif #include boolean_t vm_page_free_verify = TRUE; -uint_t vm_lopage_free_count = 0; -uint_t vm_lopage_free_limit = 0; -uint_t vm_lopage_lowater = 0; +uint32_t 
vm_lopage_free_count = 0; +uint32_t vm_lopage_free_limit = 0; +uint32_t vm_lopage_lowater = 0; boolean_t vm_lopage_refill = FALSE; boolean_t vm_lopage_needed = FALSE; @@ -120,7 +117,9 @@ struct vm_speculative_age_q vm_page_queue_speculative[VM_PAGE_MAX_SPECULATIVE_AG __private_extern__ void vm_page_init_lck_grp(void); -static void vm_page_free_prepare(vm_page_t page); +static void vm_page_free_prepare(vm_page_t page); +static vm_page_t vm_page_grab_fictitious_common(ppnum_t phys_addr); + @@ -241,7 +240,6 @@ unsigned int vm_colors; unsigned int vm_color_mask; /* mask is == (vm_colors-1) */ unsigned int vm_cache_geometry_colors = 0; /* set by hw dependent code during startup */ queue_head_t vm_page_queue_free[MAX_COLORS]; -vm_page_t vm_page_queue_fictitious; unsigned int vm_page_free_wanted; unsigned int vm_page_free_wanted_privileged; unsigned int vm_page_free_count; @@ -458,11 +456,6 @@ vm_page_init_local_q() } -uint64_t initial_max_mem; -int initial_wire_count; -int initial_free_count; -int initial_lopage_count; - /* * vm_page_bootstrap: * @@ -542,6 +535,7 @@ vm_page_bootstrap( m->no_cache = FALSE; m->zero_fill = FALSE; m->reusable = FALSE; + m->slid = FALSE; m->__unused_object_bits = 0; @@ -572,8 +566,8 @@ vm_page_bootstrap( for (i = 0; i < MAX_COLORS; i++ ) queue_init(&vm_page_queue_free[i]); + queue_init(&vm_lopage_queue_free); - vm_page_queue_fictitious = VM_PAGE_NULL; queue_init(&vm_page_queue_active); queue_init(&vm_page_queue_inactive); queue_init(&vm_page_queue_throttled); @@ -689,11 +683,6 @@ vm_page_bootstrap( vm_page_wire_count_initial = vm_page_wire_count; vm_page_free_count_minimum = vm_page_free_count; - initial_max_mem = max_mem; - initial_wire_count = vm_page_wire_count; - initial_free_count = vm_page_free_count; - initial_lopage_count = vm_lopage_free_count; - printf("vm_page_bootstrap: %d free pages and %d wired pages\n", vm_page_free_count, vm_page_wire_count); @@ -743,7 +732,7 @@ pmap_steal_memory( addr = virtual_space_start; 
virtual_space_start += size; - kprintf("pmap_steal_memory: %08lX - %08lX; size=%08lX\n", (long)addr, (long)virtual_space_start, (long)size); /* (TEST/DEBUG) */ + //kprintf("pmap_steal_memory: %08lX - %08lX; size=%08lX\n", (long)addr, (long)virtual_space_start, (long)size); /* (TEST/DEBUG) */ /* * Allocate and map physical pages to back new virtual pages. @@ -910,6 +899,7 @@ vm_page_module_init(void) zone_debug_disable(vm_page_zone); #endif /* ZONE_DEBUG */ + zone_change(vm_page_zone, Z_CALLERACCT, FALSE); zone_change(vm_page_zone, Z_EXPAND, FALSE); zone_change(vm_page_zone, Z_EXHAUST, TRUE); zone_change(vm_page_zone, Z_FOREIGN, TRUE); @@ -919,6 +909,7 @@ vm_page_module_init(void) * in vm_page_create(). [Q: is this really what we want?] */ vm_page_zone->count += vm_page_pages; + vm_page_zone->sum_count += vm_page_pages; vm_page_zone->cur_size += vm_page_pages * vm_page_zone->elem_size; lck_mtx_init(&vm_page_alloc_lock, &vm_page_lck_grp_alloc, &vm_page_lck_attr); @@ -944,12 +935,13 @@ vm_page_create( for (phys_page = start; phys_page < end; phys_page++) { - while ((m = (vm_page_t) vm_page_grab_fictitious()) + while ((m = (vm_page_t) vm_page_grab_fictitious_common(phys_page)) == VM_PAGE_NULL) vm_page_more_fictitious(); - vm_page_init(m, phys_page, FALSE); + m->fictitious = FALSE; pmap_clear_noencrypt(phys_page); + vm_page_pages++; vm_page_release(m); } @@ -1021,7 +1013,7 @@ vm_page_insert_internal( "already in (obj=%p,off=0x%llx)", mem, object, offset, mem->object, mem->offset); #endif - assert(!object->internal || offset < object->size); + assert(!object->internal || offset < object->vo_size); /* only insert "pageout" pages into "pageout" objects, * and normal pages into normal objects */ @@ -1054,6 +1046,16 @@ vm_page_insert_internal( lck_spin_unlock(bucket_lock); } + + { unsigned int cache_attr; + + cache_attr = object->wimg_bits & VM_WIMG_MASK; + + if (cache_attr != VM_WIMG_USE_DEFAULT) { + pmap_set_cache_attributes(mem->phys_page, cache_attr); + 
object->set_cache_attr = TRUE; + } + } /* * Now link into the object's list of backed pages. */ @@ -1253,6 +1255,12 @@ vm_page_remove( assert(mem->object->resident_page_count > 0); mem->object->resident_page_count--; + + if (!mem->object->internal && (mem->object->objq.next || mem->object->objq.prev)) { + if (mem->object->resident_page_count == 0) + vm_object_cache_remove(mem->object); + } + if (VM_PAGE_WIRED(mem)) { assert(mem->object->wired_page_count > 0); mem->object->wired_page_count--; @@ -1281,6 +1289,9 @@ vm_page_remove( OSAddAtomic(-1, &vm_page_purgeable_count); } } + if (mem->object->set_cache_attr == TRUE) + pmap_set_cache_attributes(mem->phys_page, 0); + mem->tabled = FALSE; mem->object = VM_OBJECT_NULL; mem->offset = (vm_object_offset_t) -1; @@ -1462,9 +1473,27 @@ vm_page_init( boolean_t lopage) { assert(phys_page); - *mem = vm_page_template; mem->phys_page = phys_page; +#if 0 + /* + * we're leaving this turned off for now... currently pages + * come off the free list and are either immediately dirtied/referenced + * due to zero-fill or COW faults, or are used to read or write files... + * in the file I/O case, the UPL mechanism takes care of clearing + * the state of the HW ref/mod bits in a somewhat fragile way. + * Since we may change the way this works in the future (to toughen it up), + * I'm leaving this as a reminder of where these bits could get cleared + */ + + /* + * make sure both the h/w referenced and modified bits are + * clear at this point... we are especially dependent on + * not finding a 'stale' h/w modified in a number of spots + * once this page goes back into use + */ + pmap_clear_refmod(phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED); +#endif mem->lopage = lopage; } @@ -1475,24 +1504,25 @@ vm_page_init( * Returns VM_PAGE_NULL if there are no free pages. 
*/ int c_vm_page_grab_fictitious = 0; +int c_vm_page_grab_fictitious_failed = 0; int c_vm_page_release_fictitious = 0; int c_vm_page_more_fictitious = 0; -extern vm_page_t vm_page_grab_fictitious_common(ppnum_t phys_addr); - vm_page_t vm_page_grab_fictitious_common( ppnum_t phys_addr) { - register vm_page_t m; + vm_page_t m; + + if ((m = (vm_page_t)zget(vm_page_zone))) { - m = (vm_page_t)zget(vm_page_zone); - if (m) { vm_page_init(m, phys_addr, FALSE); m->fictitious = TRUE; - } - c_vm_page_grab_fictitious++; + c_vm_page_grab_fictitious++; + } else + c_vm_page_grab_fictitious_failed++; + return m; } @@ -1508,35 +1538,30 @@ vm_page_grab_guard(void) return vm_page_grab_fictitious_common(vm_page_guard_addr); } + /* * vm_page_release_fictitious: * - * Release a fictitious page to the free list. + * Release a fictitious page to the zone pool */ - void vm_page_release_fictitious( - register vm_page_t m) + vm_page_t m) { assert(!m->free); - assert(m->busy); assert(m->fictitious); assert(m->phys_page == vm_page_fictitious_addr || m->phys_page == vm_page_guard_addr); c_vm_page_release_fictitious++; -#if DEBUG - if (m->free) - panic("vm_page_release_fictitious"); -#endif - m->free = TRUE; + zfree(vm_page_zone, m); } /* * vm_page_more_fictitious: * - * Add more fictitious pages to the free list. + * Add more fictitious pages to the zone. * Allowed to block. This routine is way intimate * with the zones code, for several reasons: * 1. we need to carve some page structures out of physical @@ -1550,23 +1575,13 @@ vm_page_release_fictitious( * permanent allocation of a resource. * 3. To smooth allocation humps, we allocate single pages * with kernel_memory_allocate(), and cram them into the - * zone. This also allows us to initialize the vm_page_t's - * on the way into the zone, so that zget() always returns - * an initialized structure. The zone free element pointer - * and the free page pointer are both the first item in the - * vm_page_t. - * 4. 
By having the pages in the zone pre-initialized, we need - * not keep 2 levels of lists. The garbage collector simply - * scans our list, and reduces physical memory usage as it - * sees fit. + * zone. */ void vm_page_more_fictitious(void) { - register vm_page_t m; - vm_offset_t addr; - kern_return_t retval; - int i; + vm_offset_t addr; + kern_return_t retval; c_vm_page_more_fictitious++; @@ -1605,7 +1620,7 @@ void vm_page_more_fictitious(void) KMA_KOBJECT|KMA_NOPAGEWAIT); if (retval != KERN_SUCCESS) { /* - * No page was available. Tell the pageout daemon, drop the + * No page was available. Drop the * lock to give another thread a chance at it, and * wait for the pageout daemon to make progress. */ @@ -1613,18 +1628,8 @@ void vm_page_more_fictitious(void) vm_page_wait(THREAD_UNINT); return; } - /* - * Initialize as many vm_page_t's as will fit on this page. This - * depends on the zone code disturbing ONLY the first item of - * each zone element. - */ - m = (vm_page_t)addr; - for (i = PAGE_SIZE/sizeof(struct vm_page); i > 0; i--) { - vm_page_init(m, vm_page_fictitious_addr, FALSE); - m->fictitious = TRUE; - m++; - } zcram(vm_page_zone, (void *) addr, PAGE_SIZE); + lck_mtx_unlock(&vm_page_alloc_lock); } @@ -1719,6 +1724,7 @@ vm_page_grablo(void) return (mem); } + /* * vm_page_grab: * @@ -1769,6 +1775,10 @@ vm_page_grab( void ) assert(!mem->encrypted); assert(!mem->pmapped); assert(!mem->wpmapped); + assert(!mem->active); + assert(!mem->inactive); + assert(!mem->throttled); + assert(!mem->speculative); return mem; } @@ -1858,6 +1868,11 @@ vm_page_grab( void ) mem->pageq.next = NULL; mem->pageq.prev = NULL; + assert(!mem->active); + assert(!mem->inactive); + assert(!mem->throttled); + assert(!mem->speculative); + color = (color + 1) & vm_color_mask; if (head == NULL) @@ -1910,25 +1925,8 @@ vm_page_grab( void ) ((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_min))) thread_wakeup((event_t) &vm_page_free_wanted); -#if CONFIG_EMBEDDED - { - int 
percent_avail; - - /* - * Decide if we need to poke the memorystatus notification thread. - */ - percent_avail = - (vm_page_active_count + vm_page_inactive_count + - vm_page_speculative_count + vm_page_free_count + - (IP_VALID(memory_manager_default)?0:vm_page_purgeable_count) ) * 100 / - atop_64(max_mem); - if (percent_avail <= (kern_memorystatus_level - 5)) { - kern_memorystatus_level = percent_avail; - thread_wakeup((event_t)&kern_memorystatus_wakeup); - } - } -#endif - + VM_CHECK_MEMORYSTATUS; + // dbgLog(mem->phys_page, vm_page_free_count, vm_page_wire_count, 4); /* (TEST/DEBUG) */ return mem; @@ -1947,16 +1945,8 @@ vm_page_release( unsigned int color; int need_wakeup = 0; int need_priv_wakeup = 0; -#if 0 - unsigned int pindex; - phys_entry *physent; - physent = mapping_phys_lookup(mem->phys_page, &pindex); /* (BRINGUP) */ - if(physent->ppLink & ppN) { /* (BRINGUP) */ - panic("vm_page_release: already released - %08X %08X\n", mem, mem->phys_page); - } - physent->ppLink = physent->ppLink | ppN; /* (BRINGUP) */ -#endif + assert(!mem->private && !mem->fictitious); if (vm_page_free_verify) { assert(pmap_verify_free(mem->phys_page)); @@ -1969,6 +1959,7 @@ vm_page_release( if (mem->free) panic("vm_page_release"); #endif + assert(mem->busy); assert(!mem->laundry); assert(mem->object == VM_OBJECT_NULL); @@ -1977,7 +1968,7 @@ vm_page_release( assert(mem->listq.next == NULL && mem->listq.prev == NULL); - if ((mem->lopage || vm_lopage_refill == TRUE) && + if ((mem->lopage == TRUE || vm_lopage_refill == TRUE) && vm_lopage_free_count < vm_lopage_free_limit && mem->phys_page < max_valid_low_ppnum) { /* @@ -1996,7 +1987,7 @@ vm_page_release( mem->lopage = TRUE; } else { - mem->lopage = FALSE; + mem->lopage = FALSE; mem->free = TRUE; color = mem->phys_page & vm_color_mask; @@ -2042,25 +2033,7 @@ vm_page_release( else if (need_wakeup) thread_wakeup_one((event_t) &vm_page_free_count); -#if CONFIG_EMBEDDED - { - int percent_avail; - - /* - * Decide if we need to poke the 
memorystatus notification thread. - * Locking is not a big issue, as only a single thread delivers these. - */ - percent_avail = - (vm_page_active_count + vm_page_inactive_count + - vm_page_speculative_count + vm_page_free_count + - (IP_VALID(memory_manager_default)?0:vm_page_purgeable_count) ) * 100 / - atop_64(max_mem); - if (percent_avail >= (kern_memorystatus_level + 5)) { - kern_memorystatus_level = percent_avail; - thread_wakeup((event_t)&kern_memorystatus_wakeup); - } - } -#endif + VM_CHECK_MEMORYSTATUS; } /* @@ -2195,16 +2168,16 @@ vm_page_alloc_guard( counter(unsigned int c_laundry_pages_freed = 0;) /* - * vm_page_free: + * vm_page_free_prepare: * - * Returns the given page to the free list, - * disassociating it with any VM object. + * Removes page from any queue it may be on + * and disassociates it from its VM object. * * Object and page queues must be locked prior to entry. */ static void vm_page_free_prepare( - register vm_page_t mem) + vm_page_t mem) { vm_page_free_prepare_queues(mem); vm_page_free_prepare_object(mem, TRUE); @@ -2247,6 +2220,12 @@ vm_page_free_prepare_queues( mem->object->wired_page_count--; assert(mem->object->resident_page_count >= mem->object->wired_page_count); + + if (mem->object->purgable == VM_PURGABLE_VOLATILE) { + OSAddAtomic(+1, &vm_page_purgeable_count); + assert(vm_page_purgeable_wired_count > 0); + OSAddAtomic(-1, &vm_page_purgeable_wired_count); + } } if (!mem->private && !mem->fictitious) vm_page_wire_count--; @@ -2265,10 +2244,6 @@ vm_page_free_prepare_object( vm_page_t mem, boolean_t remove_from_hash) { - if (mem->object) { - vm_object_lock_assert_exclusive(mem->object); - } - if (mem->tabled) vm_page_remove(mem, remove_from_hash); /* clears tabled, object, offset */ @@ -2279,21 +2254,7 @@ vm_page_free_prepare_object( mem->fictitious = TRUE; mem->phys_page = vm_page_fictitious_addr; } - if (mem->fictitious) { - /* Some of these may be unnecessary */ - mem->gobbled = FALSE; - mem->busy = TRUE; - mem->absent = FALSE; - 
mem->error = FALSE; - mem->dirty = FALSE; - mem->precious = FALSE; - mem->reference = FALSE; - mem->encrypted = FALSE; - mem->encrypted_cleaning = FALSE; - mem->pmapped = FALSE; - mem->wpmapped = FALSE; - mem->reusable = FALSE; - } else { + if ( !mem->fictitious) { if (mem->zero_fill == TRUE) VM_ZF_COUNT_DECR(); vm_page_init(mem, mem->phys_page, mem->lopage); @@ -2301,11 +2262,20 @@ vm_page_free_prepare_object( } +/* + * vm_page_free: + * + * Returns the given page to the free list, + * disassociating it with any VM object. + * + * Object and page queues must be locked prior to entry. + */ void vm_page_free( vm_page_t mem) { vm_page_free_prepare(mem); + if (mem->fictitious) { vm_page_release_fictitious(mem); } else { @@ -2373,9 +2343,9 @@ vm_page_free_list( if (vm_page_free_verify && !mem->fictitious && !mem->private) { assert(pmap_verify_free(mem->phys_page)); } - assert(mem->busy); if (!mem->fictitious) { + assert(mem->busy); if ((mem->lopage == TRUE || vm_lopage_refill == TRUE) && vm_lopage_free_count < vm_lopage_free_limit && mem->phys_page < max_valid_low_ppnum) { @@ -2518,24 +2488,8 @@ vm_page_free_list( */ thread_wakeup_one((event_t) &vm_page_free_count); } -#if CONFIG_EMBEDDED - { - int percent_avail; - /* - * Decide if we need to poke the memorystatus notification thread. - */ - percent_avail = - (vm_page_active_count + vm_page_inactive_count + - vm_page_speculative_count + vm_page_free_count + - (IP_VALID(memory_manager_default)?0:vm_page_purgeable_count) ) * 100 / - atop_64(max_mem); - if (percent_avail >= (kern_memorystatus_level + 5)) { - kern_memorystatus_level = percent_avail; - thread_wakeup((event_t)&kern_memorystatus_wakeup); - } - } -#endif + VM_CHECK_MEMORYSTATUS; } } @@ -2614,24 +2568,9 @@ vm_page_wire( mem->zero_fill = FALSE; VM_ZF_COUNT_DECR(); } -#if CONFIG_EMBEDDED - { - int percent_avail; - /* - * Decide if we need to poke the memorystatus notification thread. 
- */ - percent_avail = - (vm_page_active_count + vm_page_inactive_count + - vm_page_speculative_count + vm_page_free_count + - (IP_VALID(memory_manager_default)?0:vm_page_purgeable_count) ) * 100 / - atop_64(max_mem); - if (percent_avail <= (kern_memorystatus_level - 5)) { - kern_memorystatus_level = percent_avail; - thread_wakeup((event_t)&kern_memorystatus_wakeup); - } - } -#endif + VM_CHECK_MEMORYSTATUS; + /* * ENCRYPTED SWAP: * The page could be encrypted, but @@ -2719,24 +2658,9 @@ vm_page_unwire( vm_page_activate(mem); } } -#if CONFIG_EMBEDDED - { - int percent_avail; - /* - * Decide if we need to poke the memorystatus notification thread. - */ - percent_avail = - (vm_page_active_count + vm_page_inactive_count + - vm_page_speculative_count + vm_page_free_count + - (IP_VALID(memory_manager_default)?0:vm_page_purgeable_count) ) * 100 / - atop_64(max_mem); - if (percent_avail >= (kern_memorystatus_level + 5)) { - kern_memorystatus_level = percent_avail; - thread_wakeup((event_t)&kern_memorystatus_wakeup); - } - } -#endif + VM_CHECK_MEMORYSTATUS; + } VM_PAGE_CHECK(mem); } @@ -2779,9 +2703,7 @@ vm_page_deactivate_internal( * inactive queue. Note wired pages should not have * their reference bit cleared. */ - - if (m->absent && !m->unusual) - panic("vm_page_deactivate: %p absent", m); + assert ( !(m->absent && !m->unusual)); if (m->gobbled) { /* can this happen? 
*/ assert( !VM_PAGE_WIRED(m)); @@ -2791,10 +2713,10 @@ vm_page_deactivate_internal( vm_page_gobble_count--; m->gobbled = FALSE; } - if (m->private || (VM_PAGE_WIRED(m))) + if (m->private || m->fictitious || (VM_PAGE_WIRED(m))) return; - if (!m->fictitious && !m->absent && clear_hw_reference == TRUE) + if (!m->absent && clear_hw_reference == TRUE) pmap_clear_reference(m->phys_page); m->reference = FALSE; @@ -2806,7 +2728,7 @@ vm_page_deactivate_internal( assert(!m->laundry); assert(m->pageq.next == NULL && m->pageq.prev == NULL); - if (!IP_VALID(memory_manager_default) && + if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) && m->dirty && m->object->internal && (m->object->purgable == VM_PURGABLE_DENY || m->object->purgable == VM_PURGABLE_NONVOLATILE || @@ -2815,24 +2737,13 @@ vm_page_deactivate_internal( m->throttled = TRUE; vm_page_throttled_count++; } else { - if (!m->fictitious && m->object->named && m->object->ref_count == 1) { + if (m->object->named && m->object->ref_count == 1) { vm_page_speculate(m, FALSE); #if DEVELOPMENT || DEBUG vm_page_speculative_recreated++; #endif - return; } else { - if (m->zero_fill) { - queue_enter(&vm_page_queue_zf, m, vm_page_t, pageq); - vm_zf_queue_count++; - } else { - queue_enter(&vm_page_queue_inactive, m, vm_page_t, pageq); - } - } - m->inactive = TRUE; - if (!m->fictitious) { - vm_page_inactive_count++; - token_new_pagecount++; + VM_PAGE_ENQUEUE_INACTIVE(m, FALSE); } } } @@ -2858,9 +2769,7 @@ vm_page_activate( #if DEBUG lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); #endif - - if (m->absent && !m->unusual) - panic("vm_page_activate: %p absent", m); + assert( !(m->absent && !m->unusual)); if (m->gobbled) { assert( !VM_PAGE_WIRED(m)); @@ -2869,7 +2778,7 @@ vm_page_activate( vm_page_gobble_count--; m->gobbled = FALSE; } - if (m->private) + if (m->private || m->fictitious) return; #if DEBUG @@ -2887,8 +2796,8 @@ vm_page_activate( if ( !VM_PAGE_WIRED(m)) { assert(!m->laundry); assert(m->pageq.next == NULL && 
m->pageq.prev == NULL); - if (!IP_VALID(memory_manager_default) && - !m->fictitious && m->dirty && m->object->internal && + if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) && + m->dirty && m->object->internal && (m->object->purgable == VM_PURGABLE_DENY || m->object->purgable == VM_PURGABLE_NONVOLATILE || m->object->purgable == VM_PURGABLE_VOLATILE)) { @@ -2898,8 +2807,7 @@ vm_page_activate( } else { queue_enter(&vm_page_queue_active, m, vm_page_t, pageq); m->active = TRUE; - if (!m->fictitious) - vm_page_active_count++; + vm_page_active_count++; } m->reference = TRUE; m->no_cache = FALSE; @@ -2928,9 +2836,10 @@ vm_page_speculate( #if DEBUG lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); #endif + assert( !(m->absent && !m->unusual)); - if (m->absent && !m->unusual) - panic("vm_page_speculate: %p absent", m); + if (m->private || m->fictitious) + return; VM_PAGE_QUEUES_REMOVE(m); @@ -2953,8 +2862,8 @@ vm_page_speculate( /* * set the timer to begin a new group */ - aq->age_ts.tv_sec = VM_PAGE_SPECULATIVE_Q_AGE_MS / 1000; - aq->age_ts.tv_nsec = (VM_PAGE_SPECULATIVE_Q_AGE_MS % 1000) * 1000 * NSEC_PER_USEC; + aq->age_ts.tv_sec = vm_page_speculative_q_age_ms / 1000; + aq->age_ts.tv_nsec = (vm_page_speculative_q_age_ms % 1000) * 1000 * NSEC_PER_USEC; ADD_MACH_TIMESPEC(&aq->age_ts, &ts); } else { @@ -2977,8 +2886,8 @@ vm_page_speculate( if (!queue_empty(&aq->age_q)) vm_page_speculate_ageit(aq); - aq->age_ts.tv_sec = VM_PAGE_SPECULATIVE_Q_AGE_MS / 1000; - aq->age_ts.tv_nsec = (VM_PAGE_SPECULATIVE_Q_AGE_MS % 1000) * 1000 * NSEC_PER_USEC; + aq->age_ts.tv_sec = vm_page_speculative_q_age_ms / 1000; + aq->age_ts.tv_nsec = (vm_page_speculative_q_age_ms % 1000) * 1000 * NSEC_PER_USEC; ADD_MACH_TIMESPEC(&aq->age_ts, &ts); } @@ -2988,6 +2897,8 @@ vm_page_speculate( vm_page_speculative_count++; if (new == TRUE) { + vm_object_lock_assert_exclusive(m->object); + m->object->pages_created++; #if DEVELOPMENT || DEBUG vm_page_speculative_created++; @@ -3061,11 +2972,7 @@ 
vm_page_lru( assert(!m->laundry); assert(m->pageq.next == NULL && m->pageq.prev == NULL); - queue_enter(&vm_page_queue_inactive, m, vm_page_t, pageq); - m->inactive = TRUE; - - vm_page_inactive_count++; - token_new_pagecount++; + VM_PAGE_ENQUEUE_INACTIVE(m, FALSE); } @@ -3077,6 +2984,9 @@ vm_page_reactivate_all_throttled(void) vm_page_t m; int extra_active_count; + if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default)) + return; + extra_active_count = 0; vm_page_lock_queues(); if (! queue_empty(&vm_page_queue_throttled)) { @@ -3090,9 +3000,9 @@ vm_page_reactivate_all_throttled(void) assert(!m->inactive); assert(!m->speculative); assert(!VM_PAGE_WIRED(m)); - if (!m->fictitious) { - extra_active_count++; - } + + extra_active_count++; + m->throttled = FALSE; m->active = TRUE; VM_PAGE_CHECK(m); @@ -3350,6 +3260,17 @@ vm_page_copy( vm_page_copy_cs_validations++; vm_page_validate_cs(src_m); } + + if (vm_page_is_slideable(src_m)) { + boolean_t was_busy = src_m->busy; + src_m->busy = TRUE; + (void) vm_page_slide(src_m, 0); + assert(src_m->busy); + if(!was_busy) { + PAGE_WAKEUP_DONE(src_m); + } + } + /* * Propagate the cs_tainted bit to the copy page. Do not propagate * the cs_validated bit. @@ -3358,7 +3279,8 @@ vm_page_copy( if (dest_m->cs_tainted) { vm_page_copy_cs_tainted++; } - + dest_m->slid = src_m->slid; + dest_m->error = src_m->error; /* sliding src_m might have failed... 
*/ pmap_copy_page(src_m->phys_page, dest_m->phys_page); } @@ -3439,7 +3361,7 @@ vm_page_verify_contiguous( if (m->phys_page != prev_addr + 1) { printf("m %p prev_addr 0x%lx, current addr 0x%x\n", m, (long)prev_addr, m->phys_page); - printf("pages %p page_count %d\n", pages, page_count); + printf("pages %p page_count %d npages %d\n", pages, page_count, npages); panic("vm_page_verify_contiguous: not contiguous!"); } prev_addr = m->phys_page; @@ -3476,21 +3398,24 @@ vm_page_verify_free_list( m, vm_page_t, pageq) { + if (m == look_for_page) { found_page = TRUE; } if ((vm_page_t) m->pageq.prev != prev_m) panic("vm_page_verify_free_list(color=%u, npages=%u): page %p corrupted prev ptr %p instead of %p\n", color, npages, m, m->pageq.prev, prev_m); - if ( ! m->free ) - panic("vm_page_verify_free_list(color=%u, npages=%u): page %p not free\n", - color, npages, m); if ( ! m->busy ) panic("vm_page_verify_free_list(color=%u, npages=%u): page %p not busy\n", color, npages, m); - if ( color != (unsigned int) -1 && (m->phys_page & vm_color_mask) != color) - panic("vm_page_verify_free_list(color=%u, npages=%u): page %p wrong color %u instead of %u\n", - color, npages, m, m->phys_page & vm_color_mask, color); + if (color != (unsigned int) -1) { + if ((m->phys_page & vm_color_mask) != color) + panic("vm_page_verify_free_list(color=%u, npages=%u): page %p wrong color %u instead of %u\n", + color, npages, m, m->phys_page & vm_color_mask, color); + if ( ! 
m->free ) + panic("vm_page_verify_free_list(color=%u, npages=%u): page %p not free\n", + color, npages, m); + } ++npages; prev_m = m; } @@ -3507,13 +3432,12 @@ vm_page_verify_free_list( if (other_color == color) continue; vm_page_verify_free_list(&vm_page_queue_free[other_color], - other_color, look_for_page, FALSE); + other_color, look_for_page, FALSE); } - if (color != (unsigned int) -1) { + if (color == (unsigned int) -1) { vm_page_verify_free_list(&vm_lopage_queue_free, (unsigned int) -1, look_for_page, FALSE); } - panic("vm_page_verify_free_list(color=%u)\n", color); } if (!expect_page && found_page) { @@ -3539,9 +3463,8 @@ vm_page_verify_free_lists( void ) for( color = 0; color < vm_colors; color++ ) { npages += vm_page_verify_free_list(&vm_page_queue_free[color], - color, VM_PAGE_NULL, FALSE); + color, VM_PAGE_NULL, FALSE); } - nlopages = vm_page_verify_free_list(&vm_lopage_queue_free, (unsigned int) -1, VM_PAGE_NULL, FALSE); @@ -3549,6 +3472,7 @@ vm_page_verify_free_lists( void ) panic("vm_page_verify_free_lists: " "npages %u free_count %d nlopages %u lo_free_count %u", npages, vm_page_free_count, nlopages, vm_lopage_free_count); + lck_mtx_unlock(&vm_page_queue_free_lock); } @@ -3717,7 +3641,7 @@ vm_page_find_contiguous( /* no more low pages... 
*/ break; } - if (!npages && ((m->phys_page & pnum_mask) != 0)) { + if (!npages & ((m->phys_page & pnum_mask) != 0)) { /* * not aligned */ @@ -3901,8 +3825,7 @@ vm_page_find_contiguous( color = m1->phys_page & vm_color_mask; #if MACH_ASSERT - vm_page_verify_free_list(&vm_page_queue_free[color], - color, m1, TRUE); + vm_page_verify_free_list(&vm_page_queue_free[color], color, m1, TRUE); #endif queue_remove(&vm_page_queue_free[color], m1, @@ -3911,8 +3834,7 @@ vm_page_find_contiguous( m1->pageq.next = NULL; m1->pageq.prev = NULL; #if MACH_ASSERT - vm_page_verify_free_list(&vm_page_queue_free[color], - color, VM_PAGE_NULL, FALSE); + vm_page_verify_free_list(&vm_page_queue_free[color], color, VM_PAGE_NULL, FALSE); #endif /* * Clear the "free" bit so that this page @@ -4184,7 +4106,7 @@ cpm_allocate( vm_page_t pages; unsigned int npages; - if (size % page_size != 0) + if (size % PAGE_SIZE != 0) return KERN_INVALID_ARGUMENT; npages = (unsigned int) (size / PAGE_SIZE); @@ -4210,24 +4132,8 @@ cpm_allocate( ((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_min))) thread_wakeup((event_t) &vm_page_free_wanted); -#if CONFIG_EMBEDDED - { - int percent_avail; - - /* - * Decide if we need to poke the memorystatus notification thread. - */ - percent_avail = - (vm_page_active_count + vm_page_inactive_count + - vm_page_speculative_count + vm_page_free_count + - (IP_VALID(memory_manager_default)?0:vm_page_purgeable_count) ) * 100 / - atop_64(max_mem); - if (percent_avail <= (kern_memorystatus_level - 5)) { - kern_memorystatus_level = percent_avail; - thread_wakeup((event_t)&kern_memorystatus_wakeup); - } - } -#endif + VM_CHECK_MEMORYSTATUS; + /* * The CPM pages should now be available and * ordered by ascending physical address. 
@@ -4237,8 +4143,188 @@ cpm_allocate( *list = pages; return KERN_SUCCESS; } + + +unsigned int vm_max_delayed_work_limit = DEFAULT_DELAYED_WORK_LIMIT; + +/* + * when working on a 'run' of pages, it is necessary to hold + * the vm_page_queue_lock (a hot global lock) for certain operations + * on the page... however, the majority of the work can be done + * while merely holding the object lock... in fact there are certain + * collections of pages that don't require any work brokered by the + * vm_page_queue_lock... to mitigate the time spent behind the global + * lock, go to a 2 pass algorithm... collect pages up to DELAYED_WORK_LIMIT + * while doing all of the work that doesn't require the vm_page_queue_lock... + * then call vm_page_do_delayed_work to acquire the vm_page_queue_lock and do the + * necessary work for each page... we will grab the busy bit on the page + * if it's not already held so that vm_page_do_delayed_work can drop the object lock + * if it can't immediately take the vm_page_queue_lock in order to compete + * for the locks in the same order that vm_pageout_scan takes them. + * the operation names are modeled after the names of the routines that + * need to be called in order to make the changes very obvious in the + * original loop + */ + +void +vm_page_do_delayed_work( + vm_object_t object, + struct vm_page_delayed_work *dwp, + int dw_count) +{ + int j; + vm_page_t m; + vm_page_t local_free_q = VM_PAGE_NULL; + boolean_t dropped_obj_lock = FALSE; + + /* + * pageout_scan takes the vm_page_lock_queues first + * then tries for the object lock... to avoid what + * is effectively a lock inversion, we'll go to the + * trouble of taking them in that same order... 
otherwise + * if this object contains the majority of the pages resident + * in the UBC (or a small set of large objects actively being + * worked on contain the majority of the pages), we could + * cause the pageout_scan thread to 'starve' in its attempt + * to find pages to move to the free queue, since it has to + * successfully acquire the object lock of any candidate page + * before it can steal/clean it. + */ + if (!vm_page_trylockspin_queues()) { + vm_object_unlock(object); + + vm_page_lockspin_queues(); + + for (j = 0; ; j++) { + if (!vm_object_lock_avoid(object) && + _vm_object_lock_try(object)) + break; + vm_page_unlock_queues(); + mutex_pause(j); + vm_page_lockspin_queues(); + } + dropped_obj_lock = TRUE; + } + for (j = 0; j < dw_count; j++, dwp++) { + + m = dwp->dw_m; + + if (dwp->dw_mask & DW_set_list_req_pending) { + m->list_req_pending = TRUE; + + if (dropped_obj_lock == TRUE) { + /* + * need to make sure anyone that might have + * blocked on busy == TRUE when we dropped + * the object lock gets a chance to re-evaluate + * its state since we have several places + * where we avoid potential deadlocks with + * the filesystem by stealing pages with + * list_req_pending == TRUE and busy == TRUE + */ + dwp->dw_mask |= DW_PAGE_WAKEUP; + } + } + if (dwp->dw_mask & DW_vm_pageout_throttle_up) + vm_pageout_throttle_up(m); + + if (dwp->dw_mask & DW_vm_page_wire) + vm_page_wire(m); + else if (dwp->dw_mask & DW_vm_page_unwire) { + boolean_t queueit; + + queueit = (dwp->dw_mask & DW_vm_page_free) ? FALSE : TRUE; + + vm_page_unwire(m, queueit); + } + if (dwp->dw_mask & DW_vm_page_free) { + vm_page_free_prepare_queues(m); + + assert(m->pageq.next == NULL && m->pageq.prev == NULL); + /* + * Add this page to our list of reclaimed pages, + * to be freed later. 
+ */ + m->pageq.next = (queue_entry_t) local_free_q; + local_free_q = m; + } else { + if (dwp->dw_mask & DW_vm_page_deactivate_internal) + vm_page_deactivate_internal(m, FALSE); + else if (dwp->dw_mask & DW_vm_page_activate) { + if (m->active == FALSE) { + vm_page_activate(m); + } + } + else if (dwp->dw_mask & DW_vm_page_speculate) + vm_page_speculate(m, TRUE); + else if (dwp->dw_mask & DW_vm_page_lru) + vm_page_lru(m); + else if (dwp->dw_mask & DW_VM_PAGE_QUEUES_REMOVE) + VM_PAGE_QUEUES_REMOVE(m); + + if (dwp->dw_mask & DW_set_reference) + m->reference = TRUE; + else if (dwp->dw_mask & DW_clear_reference) + m->reference = FALSE; + + if (dwp->dw_mask & DW_move_page) { + VM_PAGE_QUEUES_REMOVE(m); + + assert(!m->laundry); + assert(m->object != kernel_object); + assert(m->pageq.next == NULL && + m->pageq.prev == NULL); + + VM_PAGE_ENQUEUE_INACTIVE(m, FALSE); + } + if (dwp->dw_mask & DW_clear_busy) + m->busy = FALSE; + + if (dwp->dw_mask & DW_PAGE_WAKEUP) + PAGE_WAKEUP(m); + } + } + vm_page_unlock_queues(); + + if (local_free_q) + vm_page_free_list(local_free_q, TRUE); + + VM_CHECK_MEMORYSTATUS; + +} + + +void vm_check_memorystatus() +{ +#if CONFIG_EMBEDDED + static boolean_t in_critical = FALSE; + static unsigned int last_memorystatus = 0; + unsigned int pages_avail; + + if (!kern_memorystatus_delta) { + return; + } + + pages_avail = (vm_page_active_count + + vm_page_inactive_count + + vm_page_speculative_count + + vm_page_free_count + + (VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) ? 0 : vm_page_purgeable_count)); + if ( (!in_critical && (pages_avail < kern_memorystatus_delta)) || + (pages_avail >= (last_memorystatus + kern_memorystatus_delta)) || + (last_memorystatus >= (pages_avail + kern_memorystatus_delta)) ) { + kern_memorystatus_level = pages_avail * 100 / atop_64(max_mem); + last_memorystatus = pages_avail; + + thread_wakeup((event_t)&kern_memorystatus_wakeup); + + in_critical = (pages_avail < kern_memorystatus_delta) ? 
TRUE : FALSE; + } +#endif +} + kern_return_t vm_page_alloc_list( int page_count, @@ -4716,6 +4802,7 @@ hibernate_flush_memory() return (retval); } + static void hibernate_page_list_zero(hibernate_page_list_t *list) { @@ -4819,8 +4906,8 @@ hibernate_consider_discard(vm_page_t m) /* * Somebody is playing with this page. */ - hibernate_stats.cd_found_busy++; - break; + hibernate_stats.cd_found_busy++; + break; } if (m->absent || m->unusual || m->error) { /* @@ -4911,6 +4998,7 @@ hibernate_discard_page(vm_page_t m) void hibernate_page_list_setall(hibernate_page_list_t * page_list, hibernate_page_list_t * page_list_wired, + hibernate_page_list_t * page_list_pal, uint32_t * pagesOut) { uint64_t start, end, nsec; @@ -4937,6 +5025,7 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, hibernate_page_list_zero(page_list); hibernate_page_list_zero(page_list_wired); + hibernate_page_list_zero(page_list_pal); hibernate_stats.cd_vm_page_wire_count = vm_page_wire_count; hibernate_stats.cd_pages = pages; @@ -4955,7 +5044,7 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, hibernate_page_bitset(page_list_wired, TRUE, m->phys_page); m = (vm_page_t) m->pageq.next; } -#ifndef PPC + for( i = 0; i < real_ncpus; i++ ) { if (cpu_data_ptr[i] && cpu_data_ptr[i]->cpu_processor) @@ -4972,7 +5061,7 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, } } } -#endif + for( i = 0; i < vm_colors; i++ ) { queue_iterate(&vm_page_queue_free[i], diff --git a/osfmk/vm/vm_shared_region.c b/osfmk/vm/vm_shared_region.c index 05fa65614..a5931c998 100644 --- a/osfmk/vm/vm_shared_region.c +++ b/osfmk/vm/vm_shared_region.c @@ -124,6 +124,9 @@ int shared_region_persistence = 0; /* no by default */ /* delay before reclaiming an unused shared region */ int shared_region_destroy_delay = 120; /* in seconds */ +/* indicate if the shared region has been slid. 
Only one region can be slid */ +boolean_t shared_region_completed_slide = FALSE; + /* this lock protects all the shared region data structures */ lck_grp_t *vm_shared_region_lck_grp; lck_mtx_t vm_shared_region_lock; @@ -760,8 +763,24 @@ vm_shared_region_destroy( thread_call_free(shared_region->sr_timer_call); } + if ((slide_info.slide_info_entry != NULL) && (slide_info.sr == shared_region)) { + kmem_free(kernel_map, + (vm_offset_t) slide_info.slide_info_entry, + (vm_size_t) slide_info.slide_info_size); + vm_object_deallocate(slide_info.slide_object); + slide_info.slide_object = NULL; + slide_info.start = 0; + slide_info.end = 0; + slide_info.slide = 0; + slide_info.sr = NULL; + slide_info.slide_info_entry = NULL; + slide_info.slide_info_size = 0; + shared_region_completed_slide = FALSE; + } + /* release the shared region structure... */ kfree(shared_region, sizeof (*shared_region)); + SHARED_REGION_TRACE_DEBUG( ("shared_region: destroy(%p) <-\n", shared_region)); @@ -821,6 +840,106 @@ vm_shared_region_start_address( return kr; } + +void +vm_shared_region_undo_mappings( + vm_map_t sr_map, + mach_vm_offset_t sr_base_address, + struct shared_file_mapping_np *mappings, + unsigned int mappings_count) +{ + unsigned int j = 0; + vm_shared_region_t shared_region = NULL; + boolean_t reset_shared_region_state = FALSE; + + shared_region = vm_shared_region_get(current_task()); + if (shared_region == NULL) { + SHARED_REGION_TRACE_DEBUG(("Failed to undo mappings because of NULL shared region.\n")); + return; + } + + + if (sr_map == NULL) { + ipc_port_t sr_handle; + vm_named_entry_t sr_mem_entry; + + vm_shared_region_lock(); + assert(shared_region->sr_ref_count > 1); + + while (shared_region->sr_mapping_in_progress) { + /* wait for our turn... */ + vm_shared_region_sleep(&shared_region->sr_mapping_in_progress, + THREAD_UNINT); + } + assert(! 
shared_region->sr_mapping_in_progress); + assert(shared_region->sr_ref_count > 1); + /* let others know we're working in this shared region */ + shared_region->sr_mapping_in_progress = TRUE; + + vm_shared_region_unlock(); + + reset_shared_region_state = TRUE; + + /* no need to lock because this data is never modified... */ + sr_handle = shared_region->sr_mem_entry; + sr_mem_entry = (vm_named_entry_t) sr_handle->ip_kobject; + sr_map = sr_mem_entry->backing.map; + sr_base_address = shared_region->sr_base_address; + } + /* + * Undo the mappings we've established so far. + */ + for (j = 0; j < mappings_count; j++) { + kern_return_t kr2; + + if (mappings[j].sfm_size == 0) { + /* + * We didn't establish this + * mapping, so nothing to undo. + */ + continue; + } + SHARED_REGION_TRACE_INFO( + ("shared_region: mapping[%d]: " + "address:0x%016llx " + "size:0x%016llx " + "offset:0x%016llx " + "maxprot:0x%x prot:0x%x: " + "undoing...\n", + j, + (long long)mappings[j].sfm_address, + (long long)mappings[j].sfm_size, + (long long)mappings[j].sfm_file_offset, + mappings[j].sfm_max_prot, + mappings[j].sfm_init_prot)); + kr2 = mach_vm_deallocate( + sr_map, + (mappings[j].sfm_address - + sr_base_address), + mappings[j].sfm_size); + assert(kr2 == KERN_SUCCESS); + } + + /* + * This is how check_np() knows if the shared region + * is mapped. So clear it here. + */ + shared_region->sr_first_mapping = (mach_vm_offset_t) -1; + + if (reset_shared_region_state) { + vm_shared_region_lock(); + assert(shared_region->sr_ref_count > 1); + assert(shared_region->sr_mapping_in_progress); + /* we're done working on that shared region */ + shared_region->sr_mapping_in_progress = FALSE; + thread_wakeup((event_t) &shared_region->sr_mapping_in_progress); + vm_shared_region_unlock(); + reset_shared_region_state = FALSE; + } + + vm_shared_region_deallocate(shared_region); +} + /* * Establish some mappings of a file in the shared region. 
* This is used by "dyld" via the shared_region_map_np() system call @@ -838,7 +957,8 @@ vm_shared_region_map_file( struct shared_file_mapping_np *mappings, memory_object_control_t file_control, memory_object_size_t file_size, - void *root_dir) + void *root_dir, + struct shared_file_mapping_np *mapping_to_slide) { kern_return_t kr; vm_object_t file_object; @@ -851,6 +971,7 @@ vm_shared_region_map_file( mach_vm_offset_t target_address; vm_object_t object; vm_object_size_t obj_size; + boolean_t found_mapping_to_slide = FALSE; kr = KERN_SUCCESS; @@ -921,6 +1042,32 @@ vm_shared_region_map_file( /* file-backed memory */ map_port = (ipc_port_t) file_object->pager; } + + if (mappings[i].sfm_init_prot & VM_PROT_SLIDE) { + /* + * This is the mapping that needs to be slid. + */ + if (found_mapping_to_slide == TRUE) { + SHARED_REGION_TRACE_INFO( + ("shared_region: mapping[%d]: " + "address:0x%016llx size:0x%016llx " + "offset:0x%016llx " + "maxprot:0x%x prot:0x%x " + "will not be slid as only one such mapping is allowed...\n", + i, + (long long)mappings[i].sfm_address, + (long long)mappings[i].sfm_size, + (long long)mappings[i].sfm_file_offset, + mappings[i].sfm_max_prot, + mappings[i].sfm_init_prot)); + } else { + if (mapping_to_slide != NULL) { + mapping_to_slide->sfm_file_offset = mappings[i].sfm_file_offset; + mapping_to_slide->sfm_size = mappings[i].sfm_size; + found_mapping_to_slide = TRUE; + } + } + } /* mapping's address is relative to the shared region base */ target_address = @@ -1002,8 +1149,6 @@ vm_shared_region_map_file( mappings[i].sfm_size = 0; kr = KERN_SUCCESS; } else { - unsigned int j; - /* this mapping failed ! */ SHARED_REGION_TRACE_ERROR( ("shared_region: mapping[%d]: " @@ -1018,40 +1163,7 @@ vm_shared_region_map_file( mappings[i].sfm_init_prot, kr)); - /* - * Undo the mappings we've established so far. 
- */ - for (j = 0; j < i; j++) { - kern_return_t kr2; - - if (mappings[j].sfm_size == 0) { - /* - * We didn't establish this - * mapping, so nothing to undo. - */ - continue; - } - SHARED_REGION_TRACE_INFO( - ("shared_region: mapping[%d]: " - "address:0x%016llx " - "size:0x%016llx " - "offset:0x%016llx " - "maxprot:0x%x prot:0x%x: " - "undoing...\n", - j, - (long long)mappings[j].sfm_address, - (long long)mappings[j].sfm_size, - (long long)mappings[j].sfm_file_offset, - mappings[j].sfm_max_prot, - mappings[j].sfm_init_prot)); - kr2 = mach_vm_deallocate( - sr_map, - (mappings[j].sfm_address - - sr_base_address), - mappings[j].sfm_size); - assert(kr2 == KERN_SUCCESS); - } - + vm_shared_region_undo_mappings(sr_map, sr_base_address, mappings, i); break; } @@ -1262,6 +1374,264 @@ vm_shared_region_enter( return kr; } +#define SANE_SLIDE_INFO_SIZE (1024*1024) /*Can be changed if needed*/ +struct vm_shared_region_slide_info slide_info; + +kern_return_t +vm_shared_region_sliding_valid(uint32_t slide) { + + kern_return_t kr = KERN_SUCCESS; + + if ((shared_region_completed_slide == TRUE) && slide) { + if (slide != slide_info.slide) { + SHARED_REGION_TRACE_DEBUG(("Only one shared region can be slid\n")); + kr = KERN_FAILURE; + } else if (slide == slide_info.slide) { + /* + * Request for sliding when we've + * already done it with exactly the + * same slide value before. + * This isn't wrong technically but + * we don't want to slide again and + * so we return this value. 
+ */ + kr = KERN_INVALID_ARGUMENT; + } + } + return kr; +} + +kern_return_t +vm_shared_region_slide_init( + mach_vm_size_t slide_info_size, + mach_vm_offset_t start, + mach_vm_size_t size, + uint32_t slide, + memory_object_control_t sr_file_control) +{ + kern_return_t kr = KERN_SUCCESS; + vm_object_t object = VM_OBJECT_NULL; + vm_object_offset_t offset = 0; + + vm_map_t map =NULL, cur_map = NULL; + boolean_t is_map_locked = FALSE; + + if ((kr = vm_shared_region_sliding_valid(slide)) != KERN_SUCCESS) { + if (kr == KERN_INVALID_ARGUMENT) { + /* + * This will happen if we request sliding again + * with the same slide value that was used earlier + * for the very first sliding. + */ + kr = KERN_SUCCESS; + } + return kr; + } + + if (slide_info_size > SANE_SLIDE_INFO_SIZE) { + SHARED_REGION_TRACE_DEBUG(("Slide_info_size too large: %lx\n", (uintptr_t)slide_info_size)); + kr = KERN_FAILURE; + return kr; + } + + if (sr_file_control != MEMORY_OBJECT_CONTROL_NULL) { + + object = memory_object_control_to_vm_object(sr_file_control); + vm_object_reference(object); + offset = start; + + vm_object_lock_shared(object); + + } else { + /* + * Remove this entire "else" block and all "map" references + * once we get rid of the shared_region_slide_np() + * system call. 
+ */ + vm_map_entry_t entry = VM_MAP_ENTRY_NULL; + map = current_map(); + vm_map_lock_read(map); + is_map_locked = TRUE; + Retry: + cur_map = map; + if(!vm_map_lookup_entry(map, start, &entry)) { + vm_map_unlock_read(map); return KERN_INVALID_ARGUMENT; /* nothing is mapped at 'start': 'object' is still VM_OBJECT_NULL, so drop the map read lock and bail out before the object is dereferenced and unlocked below */ + } else { + vm_object_t shadow_obj = VM_OBJECT_NULL; + + if (entry->is_sub_map == TRUE) { + map = entry->object.sub_map; + start -= entry->vme_start; + start += entry->offset; + vm_map_lock_read(map); + vm_map_unlock_read(cur_map); + goto Retry; + } else { + object = entry->object.vm_object; + offset = (start - entry->vme_start) + entry->offset; + } + + vm_object_lock_shared(object); + while (object->shadow != VM_OBJECT_NULL) { + shadow_obj = object->shadow; + vm_object_lock_shared(shadow_obj); + vm_object_unlock(object); + object = shadow_obj; + } + } + } + + if (object->internal == TRUE) { + kr = KERN_INVALID_ADDRESS; + } else { + kr = kmem_alloc(kernel_map, + (vm_offset_t *) &slide_info.slide_info_entry, + (vm_size_t) slide_info_size); + if (kr == KERN_SUCCESS) { + slide_info.slide_info_size = slide_info_size; + slide_info.slide_object = object; + slide_info.start = offset; + slide_info.end = slide_info.start + size; + slide_info.slide = slide; + slide_info.sr = vm_shared_region_get(current_task()); + /* + * We want to keep the above reference on the shared region + * because we have a pointer to it in the slide_info. + * + * If we want to have this region get deallocated/freed + * then we will have to make sure that we msync(..MS_INVALIDATE..) + * the pages associated with this shared region. Those pages would + * have been slid with an older slide value. 
+ * + * vm_shared_region_deallocate(slide_info.sr); + */ + shared_region_completed_slide = TRUE; + } else { + kr = KERN_FAILURE; + } + } + vm_object_unlock(object); + + if (is_map_locked == TRUE) { + vm_map_unlock_read(map); + } + return kr; +} + +void* +vm_shared_region_get_slide_info(void) { + return (void*)&slide_info; +} + +void* +vm_shared_region_get_slide_info_entry(void) { + return (void*)slide_info.slide_info_entry; +} + + +kern_return_t +vm_shared_region_slide_sanity_check(void) +{ + uint32_t pageIndex=0; + uint16_t entryIndex=0; + uint16_t *toc = NULL; + vm_shared_region_slide_info_entry_t s_info; + kern_return_t kr; + + s_info = vm_shared_region_get_slide_info_entry(); + toc = (uint16_t*)((uintptr_t)s_info + s_info->toc_offset); + + kr = mach_vm_protect(kernel_map, + (mach_vm_offset_t)(vm_offset_t) slide_info.slide_info_entry, + (mach_vm_size_t) slide_info.slide_info_size, + VM_PROT_READ, TRUE); + if (kr != KERN_SUCCESS) { + panic("vm_shared_region_slide_sanity_check: vm_protect() error 0x%x\n", kr); + } + + for (;pageIndex < s_info->toc_count; pageIndex++) { + + entryIndex = (uint16_t)(toc[pageIndex]); + + if (entryIndex >= s_info->entry_count) { + printf("No sliding bitmap entry for pageIndex: %d at entryIndex: %d amongst %d entries\n", pageIndex, entryIndex, s_info->entry_count); + goto fail; + } + + } + return KERN_SUCCESS; +fail: + if (slide_info.slide_info_entry != NULL) { + kmem_free(kernel_map, + (vm_offset_t) slide_info.slide_info_entry, + (vm_size_t) slide_info.slide_info_size); + vm_object_deallocate(slide_info.slide_object); + slide_info.slide_object = NULL; + slide_info.start = 0; + slide_info.end = 0; + slide_info.slide = 0; + slide_info.slide_info_entry = NULL; + slide_info.slide_info_size = 0; + shared_region_completed_slide = FALSE; + } + return KERN_FAILURE; +} + +kern_return_t +vm_shared_region_slide(vm_offset_t vaddr, uint32_t pageIndex) +{ + uint16_t *toc = NULL; + slide_info_entry_toc_t bitmap = NULL; + uint32_t i=0, j=0; + uint8_t 
b = 0; + uint32_t slide = slide_info.slide; + int is_64 = task_has_64BitAddr(current_task()); + + vm_shared_region_slide_info_entry_t s_info = vm_shared_region_get_slide_info_entry(); + toc = (uint16_t*)((uintptr_t)s_info + s_info->toc_offset); + + if (pageIndex >= s_info->toc_count) { + printf("No slide entry for this page in toc. PageIndex: %d Toc Count: %d\n", pageIndex, s_info->toc_count); + } else { + uint16_t entryIndex = (uint16_t)(toc[pageIndex]); + slide_info_entry_toc_t slide_info_entries = (slide_info_entry_toc_t)((uintptr_t)s_info + s_info->entry_offset); + + if (entryIndex >= s_info->entry_count) { + printf("No sliding bitmap entry for entryIndex: %d amongst %d entries\n", entryIndex, s_info->entry_count); + } else { + bitmap = &slide_info_entries[entryIndex]; + + for(i=0; i < NUM_SLIDING_BITMAPS_PER_PAGE; ++i) { + b = bitmap->entry[i]; + if (b!=0) { + for (j=0; j <8; ++j) { + if (b & (1 <wimg_bits & VM_WIMG_MASK, + 0, TRUE); memset(dst_ptr, '\0', PAGE_SIZE); diff --git a/osfmk/vm/vm_user.c b/osfmk/vm/vm_user.c index 582e51fc0..de18c16a1 100644 --- a/osfmk/vm/vm_user.c +++ b/osfmk/vm/vm_user.c @@ -1245,6 +1245,32 @@ vm_msync( } +int +vm_toggle_entry_reuse(int toggle, int *old_value) +{ + vm_map_t map = current_map(); + + if(toggle == VM_TOGGLE_GETVALUE && old_value != NULL){ + *old_value = map->disable_vmentry_reuse; + } else if(toggle == VM_TOGGLE_SET){ + vm_map_lock(map); + map->disable_vmentry_reuse = TRUE; + if (map->first_free == vm_map_to_entry(map)) { + map->highest_entry_end = vm_map_min(map); + } else { + map->highest_entry_end = map->first_free->vme_end; + } + vm_map_unlock(map); + } else if (toggle == VM_TOGGLE_CLEAR){ + vm_map_lock(map); + map->disable_vmentry_reuse = FALSE; + vm_map_unlock(map); + } else + return KERN_INVALID_ARGUMENT; + + return KERN_SUCCESS; +} + /* * mach_vm_behavior_set * @@ -1804,8 +1830,8 @@ mach_make_memory_entry_64( unsigned int access; vm_prot_t protections; + vm_prot_t original_protections, mask_protections; 
unsigned int wimg_mode; - boolean_t cache_attr = FALSE; if (((permission & 0x00FF0000) & ~(MAP_MEM_ONLY | @@ -1825,7 +1851,9 @@ mach_make_memory_entry_64( parent_entry = NULL; } - protections = permission & VM_PROT_ALL; + original_protections = permission & VM_PROT_ALL; + protections = original_protections; + mask_protections = permission & VM_PROT_IS_MASK; access = GET_MAP_MEM(permission); user_handle = IP_NULL; @@ -1846,7 +1874,7 @@ mach_make_memory_entry_64( if(parent_is_object && object != VM_OBJECT_NULL) wimg_mode = object->wimg_bits; else - wimg_mode = VM_WIMG_DEFAULT; + wimg_mode = VM_WIMG_USE_DEFAULT; if((access != GET_MAP_MEM(parent_entry->protection)) && !(parent_entry->protection & VM_PROT_WRITE)) { return KERN_INVALID_RIGHT; @@ -1856,7 +1884,7 @@ mach_make_memory_entry_64( wimg_mode = VM_WIMG_IO; } else if (access == MAP_MEM_COPYBACK) { SET_MAP_MEM(access, parent_entry->protection); - wimg_mode = VM_WIMG_DEFAULT; + wimg_mode = VM_WIMG_USE_DEFAULT; } else if (access == MAP_MEM_WTHRU) { SET_MAP_MEM(access, parent_entry->protection); wimg_mode = VM_WIMG_WTHRU; @@ -1864,29 +1892,14 @@ mach_make_memory_entry_64( SET_MAP_MEM(access, parent_entry->protection); wimg_mode = VM_WIMG_WCOMB; } - if(parent_is_object && object && + if (parent_is_object && object && (access != MAP_MEM_NOOP) && (!(object->nophyscache))) { - if(object->wimg_bits != wimg_mode) { - vm_page_t p; - if ((wimg_mode == VM_WIMG_IO) - || (wimg_mode == VM_WIMG_WCOMB)) - cache_attr = TRUE; - else - cache_attr = FALSE; - vm_object_lock(object); - vm_object_paging_wait(object, THREAD_UNINT); - object->wimg_bits = wimg_mode; - queue_iterate(&object->memq, - p, vm_page_t, listq) { - if (!p->fictitious) { - if (p->pmapped) - pmap_disconnect(p->phys_page); - if (cache_attr) - pmap_sync_page_attributes_phys(p->phys_page); - } - } - vm_object_unlock(object); + + if (object->wimg_bits != wimg_mode) { + vm_object_lock(object); + vm_object_change_wimg_mode(object, wimg_mode); + vm_object_unlock(object); } } 
if (object_handle) @@ -1935,7 +1948,7 @@ mach_make_memory_entry_64( if (access == MAP_MEM_IO) { wimg_mode = VM_WIMG_IO; } else if (access == MAP_MEM_COPYBACK) { - wimg_mode = VM_WIMG_DEFAULT; + wimg_mode = VM_WIMG_USE_DEFAULT; } else if (access == MAP_MEM_WTHRU) { wimg_mode = VM_WIMG_WTHRU; } else if (access == MAP_MEM_WCOMB) { @@ -1985,6 +1998,7 @@ mach_make_memory_entry_64( } redo_lookup: + protections = original_protections; vm_map_lock_read(target_map); /* get the object associated with the target address */ @@ -1992,14 +2006,23 @@ mach_make_memory_entry_64( /* that requested by the caller */ kr = vm_map_lookup_locked(&target_map, map_offset, - protections, OBJECT_LOCK_EXCLUSIVE, &version, - &object, &obj_off, &prot, &wired, - &fault_info, - &real_map); + protections | mask_protections, + OBJECT_LOCK_EXCLUSIVE, &version, + &object, &obj_off, &prot, &wired, + &fault_info, + &real_map); if (kr != KERN_SUCCESS) { vm_map_unlock_read(target_map); goto make_mem_done; } + if (mask_protections) { + /* + * The caller asked us to use the "protections" as + * a mask, so restrict "protections" to what this + * mapping actually allows. + */ + protections &= prot; + } if (((prot & protections) != protections) || (object == kernel_object)) { kr = KERN_INVALID_RIGHT; @@ -2085,6 +2108,14 @@ mach_make_memory_entry_64( /* JMM - The check below should be reworked instead. */ object->true_share = TRUE; } + if (mask_protections) { + /* + * The caller asked us to use the "protections" as + * a mask, so restrict "protections" to what this + * mapping actually allows. 
+ */ + protections &= map_entry->max_protection; + } if(((map_entry->max_protection) & protections) != protections) { kr = KERN_INVALID_RIGHT; vm_object_unlock(object); @@ -2113,6 +2144,16 @@ mach_make_memory_entry_64( next_entry->vme_prev->offset + (next_entry->vme_prev->vme_end - next_entry->vme_prev->vme_start))) { + if (mask_protections) { + /* + * The caller asked us to use + * the "protections" as a mask, + * so restrict "protections" to + * what this mapping actually + * allows. + */ + protections &= next_entry->max_protection; + } if(((next_entry->max_protection) & protections) != protections) { break; @@ -2140,7 +2181,7 @@ mach_make_memory_entry_64( /* under us. */ if ((map_entry->needs_copy || object->shadowed || - (object->size > total_size)) + (object->vo_size > total_size)) && !object->true_share) { /* * We have to unlock the VM object before @@ -2177,7 +2218,7 @@ mach_make_memory_entry_64( /* create a shadow object */ vm_object_shadow(&map_entry->object.vm_object, - &map_entry->offset, total_size); + &map_entry->offset, total_size); shadow_object = map_entry->object.vm_object; vm_object_unlock(object); @@ -2275,28 +2316,8 @@ mach_make_memory_entry_64( if(real_map != target_map) vm_map_unlock_read(real_map); - if(object->wimg_bits != wimg_mode) { - vm_page_t p; - - vm_object_paging_wait(object, THREAD_UNINT); - - if ((wimg_mode == VM_WIMG_IO) - || (wimg_mode == VM_WIMG_WCOMB)) - cache_attr = TRUE; - else - cache_attr = FALSE; - - queue_iterate(&object->memq, - p, vm_page_t, listq) { - if (!p->fictitious) { - if (p->pmapped) - pmap_disconnect(p->phys_page); - if (cache_attr) - pmap_sync_page_attributes_phys(p->phys_page); - } - } - object->wimg_bits = wimg_mode; - } + if (object->wimg_bits != wimg_mode) + vm_object_change_wimg_mode(object, wimg_mode); /* the size of mapped entry that overlaps with our region */ /* which is targeted for share. 
*/ @@ -2353,7 +2374,8 @@ mach_make_memory_entry_64( user_entry->is_sub_map = FALSE; user_entry->is_pager = FALSE; user_entry->offset = obj_off; - user_entry->protection = permission; + user_entry->protection = protections; + SET_MAP_MEM(GET_MAP_MEM(permission), user_entry->protection); user_entry->size = map_size; /* user_object pager and internal fields are not used */ @@ -2375,6 +2397,14 @@ mach_make_memory_entry_64( goto make_mem_done; } + if (mask_protections) { + /* + * The caller asked us to use the "protections" as + * a mask, so restrict "protections" to what this + * mapping actually allows. + */ + protections &= parent_entry->protection; + } if((protections & parent_entry->protection) != protections) { kr = KERN_PROTECTION_FAILURE; goto make_mem_done; @@ -2654,7 +2684,7 @@ mach_memory_entry_purgable_control( vm_object_lock(object); /* check that named entry covers entire object ? */ - if (mem_entry->offset != 0 || object->size != mem_entry->size) { + if (mem_entry->offset != 0 || object->vo_size != mem_entry->size) { vm_object_unlock(object); named_entry_unlock(mem_entry); return KERN_INVALID_ARGUMENT; @@ -3058,7 +3088,7 @@ vm_map_get_phys_page( /* If they are not present in the object they will */ /* have to be picked up from the pager through the */ /* fault mechanism. 
*/ - if(entry->object.vm_object->shadow_offset == 0) { + if(entry->object.vm_object->vo_shadow_offset == 0) { /* need to call vm_fault */ vm_map_unlock(map); vm_fault(map, map_offset, VM_PROT_NONE, @@ -3068,7 +3098,7 @@ vm_map_get_phys_page( } offset = entry->offset + (map_offset - entry->vme_start); phys_page = (ppnum_t) - ((entry->object.vm_object->shadow_offset + ((entry->object.vm_object->vo_shadow_offset + offset) >> 12); break; @@ -3083,7 +3113,7 @@ vm_map_get_phys_page( vm_object_t old_object; vm_object_lock(object->shadow); old_object = object; - offset = offset + object->shadow_offset; + offset = offset + object->vo_shadow_offset; object = object->shadow; vm_object_unlock(old_object); } else { diff --git a/osfmk/x86_64/bzero.s b/osfmk/x86_64/bzero.s index cb2426300..fcfdf7245 100644 --- a/osfmk/x86_64/bzero.s +++ b/osfmk/x86_64/bzero.s @@ -88,7 +88,7 @@ ENTRY(memset) * void bzero(char * addr, size_t length) */ Entry(blkclr) -ENTRY(bzero) +ENTRY2(bzero,__bzero) movq %rsi,%rcx xorq %rax,%rax shrq $3,%rcx diff --git a/osfmk/x86_64/copyio.c b/osfmk/x86_64/copyio.c new file mode 100644 index 000000000..1adb732e5 --- /dev/null +++ b/osfmk/x86_64/copyio.c @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. 
+ * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static int copyio(int, user_addr_t, char *, vm_size_t, vm_size_t *, int); +static int copyio_phys(addr64_t, addr64_t, vm_size_t, int); + +/* + * The copy engine has the following characteristics + * - copyio() handles copies to/from user or kernel space + * - copypv() deals with physical or virtual addresses + * + * Readers familiar with the 32-bit kernel will expect Joe's thesis at this + * point describing the full glory of the copy window implementation. In K64, + * however, there is no need for windowing. Thanks to the vast shared address + * space, the kernel has direct access to userspace and to physical memory. + * + * User virtual addresses are accessible provided the user's cr3 is loaded. + * Physical addresses are accessible via the direct map and the PHYSMAP_PTOV() + * translation. 
+ * + * Copyin/out variants all boil done to just these 2 routines in locore.s which + * provide fault-recoverable copying: + */ +extern int _bcopy(const void *, void *, vm_size_t); +extern int _bcopystr(const void *, void *, vm_size_t, vm_size_t *); + + +/* + * Types of copies: + */ +#define COPYIN 0 /* from user virtual to kernel virtual */ +#define COPYOUT 1 /* from kernel virtual to user virtual */ +#define COPYINSTR 2 /* string variant of copyin */ +#define COPYINPHYS 3 /* from user virtual to kernel physical */ +#define COPYOUTPHYS 4 /* from kernel physical to user virtual */ + +static int +copyio(int copy_type, user_addr_t user_addr, char *kernel_addr, + vm_size_t nbytes, vm_size_t *lencopied, int use_kernel_map) +{ + thread_t thread; + pmap_t pmap; + vm_size_t bytes_copied; + int error = 0; + boolean_t istate = FALSE; + boolean_t recursive_CopyIOActive; +#if KDEBUG + int debug_type = 0xeff70010; + debug_type += (copy_type << 2); +#endif + + thread = current_thread(); + + KERNEL_DEBUG(debug_type | DBG_FUNC_START, + (unsigned)(user_addr >> 32), (unsigned)user_addr, + nbytes, thread->machine.copyio_state, 0); + + if (nbytes == 0) + goto out; + + pmap = thread->map->pmap; + + if ((copy_type != COPYINPHYS) && (copy_type != COPYOUTPHYS) && ((vm_offset_t)kernel_addr < VM_MIN_KERNEL_AND_KEXT_ADDRESS)) { + panic("Invalid copy parameter, copy type: %d, kernel address: %p", copy_type, kernel_addr); + } + + /* Sanity and security check for addresses to/from a user */ + + if (((pmap != kernel_pmap) && (use_kernel_map == 0)) && + ((nbytes && (user_addr+nbytes <= user_addr)) || ((user_addr + nbytes) > vm_map_max(thread->map)))) { + error = EFAULT; + goto out; + } + + /* + * If the no_shared_cr3 boot-arg is set (true), the kernel runs on + * its own pmap and cr3 rather than the user's -- so that wild accesses + * from kernel or kexts can be trapped. So, during copyin and copyout, + * we need to switch back to the user's map/cr3. 
The thread is flagged + * "CopyIOActive" at this time so that if the thread is pre-empted, + * we will later restore the correct cr3. + */ + recursive_CopyIOActive = thread->machine.specFlags & CopyIOActive; + thread->machine.specFlags |= CopyIOActive; + if (no_shared_cr3) { + istate = ml_set_interrupts_enabled(FALSE); + if (get_cr3_base() != pmap->pm_cr3) + set_cr3_raw(pmap->pm_cr3); + } + + /* + * Ensure that we're running on the target thread's cr3. + */ + if ((pmap != kernel_pmap) && !use_kernel_map && + (get_cr3_base() != pmap->pm_cr3)) { + panic("copyio(%d,%p,%p,%ld,%p,%d) cr3 is %p expects %p", + copy_type, (void *)user_addr, kernel_addr, nbytes, lencopied, use_kernel_map, + (void *) get_cr3_raw(), (void *) pmap->pm_cr3); + } + if (no_shared_cr3) + (void) ml_set_interrupts_enabled(istate); + + KERNEL_DEBUG(0xeff70044 | DBG_FUNC_NONE, (unsigned)user_addr, + (unsigned)kernel_addr, nbytes, 0, 0); + + switch (copy_type) { + + case COPYIN: + error = _bcopy((const void *) user_addr, + kernel_addr, + nbytes); + break; + + case COPYOUT: + error = _bcopy(kernel_addr, + (void *) user_addr, + nbytes); + break; + + case COPYINPHYS: + error = _bcopy((const void *) user_addr, + PHYSMAP_PTOV(kernel_addr), + nbytes); + break; + + case COPYOUTPHYS: + error = _bcopy((const void *) PHYSMAP_PTOV(kernel_addr), + (void *) user_addr, + nbytes); + break; + + case COPYINSTR: + error = _bcopystr((const void *) user_addr, + kernel_addr, + (int) nbytes, + &bytes_copied); + + /* + * lencopied should be updated on success + * or ENAMETOOLONG... but not EFAULT + */ + if (error != EFAULT) + *lencopied = bytes_copied; + + if (error) { +#if KDEBUG + nbytes = *lencopied; +#endif + break; + } + if (*(kernel_addr + bytes_copied - 1) == 0) { + /* + * we found a NULL terminator... 
we're done + */ +#if KDEBUG + nbytes = *lencopied; +#endif + break; + } else { + /* + * no more room in the buffer and we haven't + * yet come across a NULL terminator + */ +#if KDEBUG + nbytes = *lencopied; +#endif + error = ENAMETOOLONG; + break; + } + break; + } + + if (!recursive_CopyIOActive) + thread->machine.specFlags &= ~CopyIOActive; + if (no_shared_cr3) { + istate = ml_set_interrupts_enabled(FALSE); + if (get_cr3_raw() != kernel_pmap->pm_cr3) + set_cr3_raw(kernel_pmap->pm_cr3); + (void) ml_set_interrupts_enabled(istate); + } + +out: + KERNEL_DEBUG(debug_type | DBG_FUNC_END, (unsigned)user_addr, + (unsigned)kernel_addr, (unsigned)nbytes, error, 0); + + return (error); +} + + +static int +copyio_phys(addr64_t source, addr64_t sink, vm_size_t csize, int which) +{ + char *paddr; + user_addr_t vaddr; + int ctype; + + if (which & cppvPsnk) { + paddr = (char *)sink; + vaddr = (user_addr_t)source; + ctype = COPYINPHYS; + } else { + paddr = (char *)source; + vaddr = (user_addr_t)sink; + ctype = COPYOUTPHYS; + } + return copyio(ctype, vaddr, paddr, csize, NULL, which & cppvKmap); +} + +int +copyinmsg(const user_addr_t user_addr, char *kernel_addr, mach_msg_size_t nbytes) +{ + return copyio(COPYIN, user_addr, kernel_addr, nbytes, NULL, 0); +} + +int +copyin(const user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes) +{ + return copyio(COPYIN, user_addr, kernel_addr, nbytes, NULL, 0); +} + +int +copyinstr(const user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes, vm_size_t *lencopied) +{ + *lencopied = 0; + + return copyio(COPYINSTR, user_addr, kernel_addr, nbytes, lencopied, 0); +} + +int +copyoutmsg(const char *kernel_addr, user_addr_t user_addr, mach_msg_size_t nbytes) +{ + return copyio(COPYOUT, user_addr, (char *)(uintptr_t)kernel_addr, nbytes, NULL, 0); +} + +int +copyout(const void *kernel_addr, user_addr_t user_addr, vm_size_t nbytes) +{ + return copyio(COPYOUT, user_addr, (char *)(uintptr_t)kernel_addr, nbytes, NULL, 0); +} + + +kern_return_t 
+copypv(addr64_t src64, addr64_t snk64, unsigned int size, int which) +{ + unsigned int lop, csize; + int bothphys = 0; + + KERNEL_DEBUG(0xeff7004c | DBG_FUNC_START, (unsigned)src64, + (unsigned)snk64, size, which, 0); + + if ((which & (cppvPsrc | cppvPsnk)) == 0 ) /* Make sure that only one is virtual */ + panic("copypv: no more than 1 parameter may be virtual\n"); /* Not allowed */ + + if ((which & (cppvPsrc | cppvPsnk)) == (cppvPsrc | cppvPsnk)) + bothphys = 1; /* both are physical */ + + while (size) { + + if (bothphys) { + lop = (unsigned int)(PAGE_SIZE - (snk64 & (PAGE_SIZE - 1))); /* Assume sink smallest */ + + if (lop > (unsigned int)(PAGE_SIZE - (src64 & (PAGE_SIZE - 1)))) + lop = (unsigned int)(PAGE_SIZE - (src64 & (PAGE_SIZE - 1))); /* No, source is smaller */ + } else { + /* + * only need to compute the resid for the physical page + * address... we don't care about where we start/finish in + * the virtual since we just call the normal copyin/copyout + */ + if (which & cppvPsrc) + lop = (unsigned int)(PAGE_SIZE - (src64 & (PAGE_SIZE - 1))); + else + lop = (unsigned int)(PAGE_SIZE - (snk64 & (PAGE_SIZE - 1))); + } + csize = size; /* Assume we can copy it all */ + if (lop < size) + csize = lop; /* Nope, we can't do it all */ +#if 0 + /* + * flush_dcache64 is currently a nop on the i386... + * it's used when copying to non-system memory such + * as video capture cards... on PPC there was a need + * to flush due to how we mapped this memory... not + * sure if it's needed on i386. 
+ */ + if (which & cppvFsrc) + flush_dcache64(src64, csize, 1); /* If requested, flush source before move */ + if (which & cppvFsnk) + flush_dcache64(snk64, csize, 1); /* If requested, flush sink before move */ +#endif + if (bothphys) + bcopy_phys(src64, snk64, csize); /* Do a physical copy, virtually */ + else { + if (copyio_phys(src64, snk64, csize, which)) + return (KERN_FAILURE); + } +#if 0 + if (which & cppvFsrc) + flush_dcache64(src64, csize, 1); /* If requested, flush source after move */ + if (which & cppvFsnk) + flush_dcache64(snk64, csize, 1); /* If requested, flush sink after move */ +#endif + size -= csize; /* Calculate what is left */ + snk64 += csize; /* Bump sink to next physical address */ + src64 += csize; /* Bump source to next physical address */ + } + KERNEL_DEBUG(0xeff7004c | DBG_FUNC_END, (unsigned)src64, + (unsigned)snk64, size, which, 0); + + return KERN_SUCCESS; +} diff --git a/osfmk/x86_64/cswitch.s b/osfmk/x86_64/cswitch.s index 6abb9a22c..c0f3715c4 100644 --- a/osfmk/x86_64/cswitch.s +++ b/osfmk/x86_64/cswitch.s @@ -60,19 +60,12 @@ #include #include -#include - -#ifdef SYMMETRY -#include -#endif - -#if AT386 #include -#endif /* AT386 */ +#include Entry(Load_context) movq TH_KERNEL_STACK(%rdi),%rcx /* get kernel stack */ - leaq -IKS_SIZE-IEL_SIZE(%rcx),%rdx + leaq -IKS_SIZE(%rcx),%rdx addq EXT(kernel_stack_size)(%rip),%rdx /* point to stack top */ movq %rcx,%gs:CPU_ACTIVE_STACK /* store stack address */ movq %rdx,%gs:CPU_KERNEL_STACK /* store stack top */ @@ -110,7 +103,7 @@ Entry(Switch_context) /* new thread in %rdx */ movq %rdx,%gs:CPU_ACTIVE_THREAD /* new thread is active */ movq TH_KERNEL_STACK(%rdx),%rdx /* get its kernel stack */ - lea -IKS_SIZE-IEL_SIZE(%rdx),%rcx + lea -IKS_SIZE(%rdx),%rcx add EXT(kernel_stack_size)(%rip),%rcx /* point to stack top */ movq %rdx,%gs:CPU_ACTIVE_STACK /* set current stack */ diff --git a/osfmk/x86_64/idt64.s b/osfmk/x86_64/idt64.s index a4ca1cecd..fe6cb1295 100644 --- a/osfmk/x86_64/idt64.s +++ 
b/osfmk/x86_64/idt64.s @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #define _ARCH_I386_ASM_HELP_H_ /* Prevent inclusion of user header */ #include @@ -81,149 +81,6 @@ #define HNDL_DOUBLE_FAULT EXT(hndl_double_fault) #define HNDL_MACHINE_CHECK EXT(hndl_machine_check) -/* - * Nanosecond timing. - */ - -/* - * Nanotime returned in %rax. - * Computed from tsc based on the scale factor and an implicit 32 bit shift. - * This code must match what _rtc_nanotime_read does in - * machine_routines_asm.s. Failure to do so can - * result in "weird" timing results. - * - * Uses: %rsi, %rdi, %rdx, %rcx - */ -#define NANOTIME \ - movq %gs:CPU_NANOTIME,%rdi ; \ - RTC_NANOTIME_READ_FAST() - -/* - * Add 64-bit delta in register reg to timer pointed to by register treg. - */ -#define TIMER_UPDATE(treg,reg,offset) \ - addq reg,(offset)+TIMER_ALL(treg) /* add timer */ - -/* - * Add time delta to old timer and start new. 
- * Uses: %rsi, %rdi, %rdx, %rcx, %rax - */ -#define TIMER_EVENT(old,new) \ - NANOTIME /* %rax := nanosecs */ ; \ - movq %rax,%rsi /* save timestamp */ ; \ - movq %gs:CPU_ACTIVE_THREAD,%rcx /* get thread */ ; \ - subq (old##_TIMER)+TIMER_TSTAMP(%rcx),%rax /* compute elapsed */ ; \ - TIMER_UPDATE(%rcx,%rax,old##_TIMER) /* update timer */ ; \ - leaq (new##_TIMER)(%rcx),%rcx /* point to new timer */; \ - movq %rsi,TIMER_TSTAMP(%rcx) /* set timestamp */ ; \ - movq %gs:CPU_PROCESSOR,%rdx /* get processor */ ; \ - movq %rcx,THREAD_TIMER(%rdx) /* set current timer */ ; \ - movq %rsi,%rax /* restore timestamp */ ; \ - subq (old##_STATE)+TIMER_TSTAMP(%rdx),%rax /* compute elapsed */ ; \ - TIMER_UPDATE(%rdx,%rax,old##_STATE) /* update timer */ ; \ - leaq (new##_STATE)(%rdx),%rcx /* point to new state */; \ - movq %rcx,CURRENT_STATE(%rdx) /* set current state */ ; \ - movq %rsi,TIMER_TSTAMP(%rcx) /* set timestamp */ - -/* - * Update time on user trap entry. - * Uses: %rsi, %rdi, %rdx, %rcx, %rax - */ -#define TIME_TRAP_UENTRY TIMER_EVENT(USER,SYSTEM) - -/* - * update time on user trap exit. - * Uses: %rsi, %rdi, %rdx, %rcx, %rax - */ -#define TIME_TRAP_UEXIT TIMER_EVENT(SYSTEM,USER) - -/* - * update time on interrupt entry. - * Uses: %rsi, %rdi, %rdx, %rcx, %rax - * Saves processor state info on stack. 
- */ -#define TIME_INT_ENTRY \ - NANOTIME /* %rax := nanosecs */ ; \ - movq %rax,%gs:CPU_INT_EVENT_TIME /* save in cpu data */ ; \ - movq %rax,%rsi /* save timestamp */ ; \ - movq %gs:CPU_PROCESSOR,%rdx /* get processor */ ; \ - movq THREAD_TIMER(%rdx),%rcx /* get current timer */ ; \ - subq TIMER_TSTAMP(%rcx),%rax /* compute elapsed */ ; \ - TIMER_UPDATE(%rcx,%rax,0) /* update timer */ ; \ - movq KERNEL_TIMER(%rdx),%rcx /* get kernel timer */ ; \ - movq %rsi,TIMER_TSTAMP(%rcx) /* set timestamp */ ; \ - movq %rsi,%rax /* restore timestamp */ ; \ - movq CURRENT_STATE(%rdx),%rcx /* get current state */ ; \ - pushq %rcx /* save state */ ; \ - subq TIMER_TSTAMP(%rcx),%rax /* compute elapsed */ ; \ - TIMER_UPDATE(%rcx,%rax,0) /* update timer */ ; \ - leaq IDLE_STATE(%rdx),%rax /* get idle state */ ; \ - cmpq %rax,%rcx /* compare current */ ; \ - je 0f /* skip if equal */ ; \ - leaq SYSTEM_STATE(%rdx),%rcx /* get system state */ ; \ - movq %rcx,CURRENT_STATE(%rdx) /* set current state */ ; \ -0: movq %rsi,TIMER_TSTAMP(%rcx) /* set timestamp */ - -/* - * update time on interrupt exit. - * Uses: %rsi, %rdi, %rdx, %rcx, %rax - * Restores processor state info from stack. 
- */ -#define TIME_INT_EXIT \ - NANOTIME /* %rax := nanosecs */ ; \ - movq %rax,%gs:CPU_INT_EVENT_TIME /* save in cpu data */ ; \ - movq %rax,%rsi /* save timestamp */ ; \ - movq %gs:CPU_PROCESSOR,%rdx /* get processor */ ; \ - movq KERNEL_TIMER(%rdx),%rcx /* get kernel timer */ ; \ - subq TIMER_TSTAMP(%rcx),%rax /* compute elapsed */ ; \ - TIMER_UPDATE(%rcx,%rax,0) /* update timer */ ; \ - movq THREAD_TIMER(%rdx),%rcx /* interrupted timer */ ; \ - movq %rsi,TIMER_TSTAMP(%rcx) /* set timestamp */ ; \ - movq %rsi,%rax /* restore timestamp */ ; \ - movq CURRENT_STATE(%rdx),%rcx /* get current state */ ; \ - subq TIMER_TSTAMP(%rcx),%rax /* compute elapsed */ ; \ - TIMER_UPDATE(%rcx,%rax,0) /* update timer */ ; \ - popq %rcx /* restore state */ ; \ - movq %rcx,CURRENT_STATE(%rdx) /* set current state */ ; \ - movq %rsi,TIMER_TSTAMP(%rcx) /* set timestamp */ - -/* - * Check for vtimers for task. - * task_reg is register pointing to current task - * thread_reg is register pointing to current thread - */ -#define TASK_VTIMER_CHECK(task_reg,thread_reg) \ - cmpl $0,TASK_VTIMERS(task_reg) ; \ - jz 1f ; \ - orl $(AST_BSD),%gs:CPU_PENDING_AST /* Set pending AST */ ; \ - lock ; \ - orl $(AST_BSD),ACT_AST(thread_reg) /* Set thread AST */ ; \ -1: ; \ - - -/* - * Macros for calling into C functions. - * The stack is 16-byte aligned by masking. - */ -#define CCALL(fn) \ - mov %rsp, %r12 ;\ - and $0xFFFFFFFFFFFFFFF0, %rsp ;\ - call EXT(fn) ;\ - mov %r12, %rsp - -#define CCALL1(fn, arg1) \ - mov arg1, %rdi ;\ - CCALL(fn) - -#define CCALL2(fn, arg1, arg2) \ - mov arg1, %rdi ;\ - CCALL(fn) - -#define CCALL3(fn, arg1, arg2, arg3) \ - mov arg1, %rdi ;\ - mov arg2, %rsi ;\ - mov arg3, %rdx ;\ - CCALL(fn) #if 1 #define PUSH_FUNCTION(func) \ @@ -287,11 +144,27 @@ * Determine what mode has been interrupted and save state accordingly. 
*/ L_dispatch: - cmpq $(KERNEL64_CS), ISF64_CS(%rsp) + cmpl $(KERNEL64_CS), ISF64_CS(%rsp) je L_64bit_dispatch swapgs + /* + * Check for trap from EFI32, and restore cr3 and rsp if so. + * A trap from EFI32 is fatal. + */ + cmpl $(KERNEL32_CS), ISF64_CS(%rsp) + jne L_dispatch_continue + push %rcx + mov EXT(pal_efi_saved_cr3)(%rip), %rcx + mov %rcx, %cr3 + leaq 0(%rip), %rcx + shr $32, %rcx /* splice the upper 32-bits of rip */ + shl $32, %rsp /* .. and the lower 32-bits of rsp */ + shrd $32, %rcx, %rsp /* to recover the full 64-bits of rsp */ + pop %rcx + +L_dispatch_continue: cmpl $(TASK_MAP_32BIT), %gs:CPU_TASK_MAP je L_32bit_dispatch /* 32-bit user task */ /* fall through to 64bit user dispatch */ @@ -303,6 +176,8 @@ L_64bit_dispatch: subq $(ISS64_OFFSET), %rsp movl $(SS_64), SS_FLAVOR(%rsp) + cld + /* * Save segment regs - for completeness since theyre not used. */ @@ -361,6 +236,7 @@ L_32bit_dispatch: /* 32-bit user task */ subq $(ISC32_OFFSET), %rsp movl $(SS_32), SS_FLAVOR(%rsp) + cld /* * Save segment regs */ @@ -426,22 +302,32 @@ L_common_dispatch: je 1f mov %rcx, %cr3 /* load kernel cr3 */ jmp 2f /* and skip tlb flush test */ -1: - cmpl $0, %gs:CPU_TLB_INVALID /* flush needed? */ - je 2f /* - no */ - movl $0, %gs:CPU_TLB_INVALID - mov %cr3, %rcx +1: + mov %gs:CPU_ACTIVE_CR3+4, %rcx + shr $32, %rcx + testl %ecx, %ecx + jz 2f + movl $0, %gs:CPU_TLB_INVALID + testl $(1<<16), %ecx /* Global? */ + jz 11f + mov %cr4, %rcx /* RMWW CR4, for lack of an alternative*/ + and $(~CR4_PGE), %rcx + mov %rcx, %cr4 + or $(CR4_PGE), %rcx + mov %rcx, %cr4 + jmp 2f + +11: mov %cr3, %rcx mov %rcx, %cr3 2: mov %gs:CPU_ACTIVE_THREAD, %rcx /* Get the active thread */ - cmpq $0, ACT_PCB_IDS(%rcx) /* Is there a debug register state? */ + cmpq $0, TH_PCB_IDS(%rcx) /* Is there a debug register state? 
*/ je 3f mov $0, %rcx /* If so, reset DR7 (the control) */ mov %rcx, %dr7 3: - addl $1,%gs:hwIntCnt(,%ebx,4) // Bump the trap/intr count + incl %gs:hwIntCnt(,%ebx,4) // Bump the trap/intr count /* Dispatch the designated handler */ - mov %rsp, %rdi /* rsp points to saved state */ jmp *%rdx /* @@ -453,7 +339,7 @@ Entry(return_to_user) Entry(ret_to_user) // XXX 'Be nice to tidy up this debug register restore sequence... mov %gs:CPU_ACTIVE_THREAD, %rdx - movq ACT_PCB_IDS(%rdx),%rax /* Obtain this thread's debug state */ + movq TH_PCB_IDS(%rdx),%rax /* Obtain this thread's debug state */ cmpq $0,%rax /* Is there a debug register context? */ je 2f /* branch if not */ @@ -486,21 +372,15 @@ Entry(ret_to_user) * On exiting the kernel there's no need to switch cr3 since we're * already running in the user's address space which includes the * kernel. Nevertheless, we now mark the task's cr3 as active. - * However, there may be a defered tlb flush to deal with. - * This is a case where another cpu modified this task's address - * space while this thread was in the kernel. * But, if no_shared_cr3 is set, we do need to switch cr3 at this point. */ mov %gs:CPU_TASK_CR3, %rcx mov %rcx, %gs:CPU_ACTIVE_CR3 - movl %gs:CPU_TLB_INVALID, %eax - orl EXT(no_shared_cr3)(%rip), %eax - test %eax, %eax /* -no_shered_cr3 or flush required? 
*/ + movl EXT(no_shared_cr3)(%rip), %eax + test %eax, %eax /* -no_shared_cr3 */ jz 3f - movl $0, %gs:CPU_TLB_INVALID mov %rcx, %cr3 3: - mov %gs:CPU_DR7, %rax /* Is there a debug control register?*/ cmp $0, %rax je 4f @@ -586,7 +466,7 @@ ret_to_kernel: CCALL1(panic_idt64, %rsp) hlt 1: - cmpq $(KERNEL64_CS), R64_CS(%rsp) + cmpl $(KERNEL64_CS), R64_CS(%rsp) je 2f CCALL1(panic_idt64, %rsp) hlt @@ -653,7 +533,6 @@ L_sysret: #endif Entry(idt64_unix_scall) swapgs /* switch to kernel gs (cpu_data) */ -L_unix_scall_continue: pushq %rax /* save system call number */ PUSH_FUNCTION(HNDL_UNIX_SCALL) pushq $(UNIX_INT) @@ -662,7 +541,6 @@ L_unix_scall_continue: Entry(idt64_mach_scall) swapgs /* switch to kernel gs (cpu_data) */ -L_mach_scall_continue: pushq %rax /* save system call number */ PUSH_FUNCTION(HNDL_MACH_SCALL) pushq $(MACH_INT) @@ -671,7 +549,6 @@ L_mach_scall_continue: Entry(idt64_mdep_scall) swapgs /* switch to kernel gs (cpu_data) */ -L_mdep_scall_continue: pushq %rax /* save system call number */ PUSH_FUNCTION(HNDL_MDEP_SCALL) pushq $(MACHDEP_INT) @@ -680,7 +557,6 @@ L_mdep_scall_continue: Entry(idt64_diag_scall) swapgs /* switch to kernel gs (cpu_data) */ -L_diag_scall_continue: push %rax /* save system call number */ PUSH_FUNCTION(HNDL_DIAG_SCALL) pushq $(DIAG_INT) @@ -688,8 +564,8 @@ L_diag_scall_continue: Entry(hi64_syscall) Entry(idt64_syscall) - swapgs /* Kapow! get per-cpu data area */ L_syscall_continue: + swapgs /* Kapow! get per-cpu data area */ mov %rsp, %gs:CPU_UBER_TMP /* save user stack */ mov %gs:CPU_UBER_ISF, %rsp /* switch stack to pcb */ @@ -729,9 +605,15 @@ Entry(idt64_sysenter) push $(USER_DS) /* ss */ push %rcx /* uesp */ pushf /* flags */ + /* + * Clear, among others, the Nested Task (NT) flags bit; + * this is zeroed by INT, but not by SYSENTER. 
+ */ + push $0 + popf push $(SYSENTER_CS) /* cs */ - swapgs /* switch to kernel gs (cpu_data) */ L_sysenter_continue: + swapgs /* switch to kernel gs (cpu_data) */ push %rdx /* eip */ push %rax /* err/eax - syscall code */ PUSH_FUNCTION(HNDL_SYSENTER) @@ -742,16 +624,19 @@ L_sysenter_continue: Entry(idt64_page_fault) PUSH_FUNCTION(HNDL_ALLTRAPS) - push %rax /* save %rax temporarily in trap slot */ + push $(T_PAGE_FAULT) + push %rax /* save %rax temporarily */ leaq EXT(idt64_unix_scall_copy_args)(%rip), %rax - cmp %rax, ISF64_RIP(%rsp) - jne 1f - add $(ISF64_SIZE), %rsp /* remove entire intr stack frame */ - jmp L_copy_args_continue /* continue system call entry */ + cmp %rax, 8+ISF64_RIP(%rsp) /* fault during copy args? */ + je 1f /* - yes, handle copy arg fault */ + testb $3, 8+ISF64_CS(%rsp) /* was trap from kernel? */ + jz L_kernel_trap /* - yes, handle with care */ + pop %rax /* restore %rax, swapgs, and continue */ + swapgs + jmp L_dispatch_continue 1: - mov (%rsp), %rax /* restore %rax from trap slot */ - movq $(T_PAGE_FAULT), (%rsp) /* set trap code */ - jne L_dispatch + add $(8+ISF64_SIZE), %rsp /* remove entire intr stack frame */ + jmp L_copy_args_continue /* continue system call entry */ /* @@ -773,50 +658,23 @@ Entry(idt64_debug) */ push %rax /* save %rax temporarily */ - - leaq EXT(idt64_mach_scall)(%rip), %rax - cmp %rax, ISF64_RIP(%rsp) - jne 1f - pop %rax - add $(ISF64_SIZE),%rsp /* remove entire intr stack frame */ - jmp L_mach_scall_continue /* continue system call entry */ -1: - leaq EXT(idt64_mdep_scall)(%rip), %rax - cmp %rax, ISF64_RIP(%rsp) - jne 2f - pop %rax - add $(ISF64_SIZE),%rsp /* remove entire intr stack frame */ - jmp L_mdep_scall_continue /* continue system call entry */ -2: - leaq EXT(idt64_unix_scall)(%rip), %rax - cmp %rax, ISF64_RIP(%rsp) - jne 3f - pop %rax - add $(ISF64_SIZE),%rsp /* remove entire intr stack frame */ - jmp L_unix_scall_continue /* continue system call entry */ -3: lea EXT(idt64_sysenter)(%rip), %rax - cmp 
%rax, ISF64_RIP(%rsp) - je 4f - pop %rax - jmp L_dispatch -4: + cmp %rax, ISF64_RIP+8(%rsp) pop %rax + jne L_dispatch /* * Interrupt stack frame has been pushed on the temporary stack. - * We have to switch to pcb stack and copy eflags. + * We have to switch to pcb stack and patch up the saved state. */ - add $40,%rsp /* remove trapno/trapfn/err/rip/cs */ - push %rcx /* save %rcx - user stack pointer */ - mov 40(%rsp),%rcx /* top of intr stack -> pcb stack */ + mov %rcx, ISF64_ERR(%rsp) /* save %rcx in error slot */ + mov ISF64_SS+8(%rsp), %rcx /* top of temp stack -> pcb stack */ xchg %rcx,%rsp /* switch to pcb stack */ push $(USER_DS) /* ss */ - push (%rcx) /* saved %rcx into rsp slot */ - push 8(%rcx) /* rflags */ - mov (%rcx),%rcx /* restore %rcx */ + push ISF64_ERR(%rcx) /* saved %rcx into rsp slot */ + push ISF64_RFLAGS(%rcx) /* rflags */ push $(SYSENTER_TF_CS) /* cs - not SYSENTER_CS for iret path */ + mov ISF64_ERR(%rcx),%rcx /* restore %rcx */ jmp L_sysenter_continue /* continue sysenter entry */ - Entry(idt64_double_fault) @@ -825,9 +683,9 @@ Entry(idt64_double_fault) push %rax leaq EXT(idt64_syscall)(%rip), %rax - cmp %rax, ISF64_RIP(%rsp) + cmp %rax, ISF64_RIP+8(%rsp) pop %rax - jne L_dispatch + jne L_64bit_dispatch mov ISF64_RSP(%rsp), %rsp jmp L_syscall_continue @@ -838,15 +696,15 @@ Entry(idt64_double_fault) * Check for a GP/NP fault in the kernel_return * sequence; if there, report it as a GP/NP fault on the user's instruction. 
* - * rsp-> 0: trap function - * 8: trap code (NP or GP) - * 16: segment number in error (error code) - * 24: rip - * 32: cs - * 40: rflags - * 48: rsp - * 56: ss - * 64: old registers (trap is from kernel) + * rsp-> 0 ISF64_TRAPNO: trap code (NP or GP) + * 8 ISF64_TRAPFN: trap function + * 16 ISF64_ERR: segment number in error (error code) + * 24 ISF64_RIP: rip + * 32 ISF64_CS: cs + * 40 ISF64_RFLAGS: rflags + * 48 ISF64_RSP: rsp + * 56 ISF64_SS: ss + * 64: old registers (trap is from kernel) */ Entry(idt64_gen_prot) PUSH_FUNCTION(HNDL_ALLTRAPS) @@ -863,7 +721,7 @@ Entry(idt64_segnp) pushq $(T_SEGMENT_NOT_PRESENT) /* indicate fault type */ trap_check_kernel_exit: - testb $3,32(%rsp) + testb $3,ISF64_CS(%rsp) jnz L_dispatch /* * trap was from kernel mode, @@ -872,33 +730,69 @@ trap_check_kernel_exit: push %rax leaq EXT(ret32_iret)(%rip), %rax - cmp %rax, 24+8(%rsp) + cmp %rax, 8+ISF64_RIP(%rsp) je L_fault_iret leaq EXT(ret64_iret)(%rip), %rax - cmp %rax, 24+8(%rsp) + cmp %rax, 8+ISF64_RIP(%rsp) je L_fault_iret leaq EXT(ret32_set_ds)(%rip), %rax - cmp %rax, 24+8(%rsp) + cmp %rax, 8+ISF64_RIP(%rsp) je L_32bit_fault_set_seg leaq EXT(ret32_set_es)(%rip), %rax - cmp %rax, 24+8(%rsp) + cmp %rax, 8+ISF64_RIP(%rsp) je L_32bit_fault_set_seg leaq EXT(ret32_set_fs)(%rip), %rax - cmp %rax, 24+8(%rsp) + cmp %rax, 8+ISF64_RIP(%rsp) je L_32bit_fault_set_seg leaq EXT(ret32_set_gs)(%rip), %rax - cmp %rax, 24+8(%rsp) + cmp %rax, 8+ISF64_RIP(%rsp) je L_32bit_fault_set_seg leaq EXT(idt64_unix_scall_copy_args)(%rip), %rax - cmp %rax, 24+8(%rsp) - add $(ISF64_SIZE)+8, (%rsp) + cmp %rax, 8+ISF64_RIP(%rsp) + cmove 8+ISF64_RSP(%rsp), %rsp je L_copy_args_continue - pop %rax - jmp L_dispatch + /* fall through */ + +L_kernel_trap: + /* + * Here after taking an unexpected trap from kernel mode - perhaps + * while running in the trampolines hereabouts. + * Note: %rax has been pushed on stack. + * Make sure we're not on the PCB stack, if so move to the kernel stack. 
+ * This is likely a fatal condition. + * But first, try to ensure we have the kernel gs base active... + */ + movq %gs:CPU_THIS, %rax /* get gs_base into %rax */ + test %rax, %rax /* test sign bit (MSB) */ + js 1f /* -ve kernel addr, no swap */ + swapgs /* +ve user addr, swap */ +1: + movq %gs:CPU_UBER_ISF, %rax /* PCB stack addr */ + subq %rsp, %rax + cmpq $(PAGE_SIZE), %rax /* current stack in PCB? */ + jb 2f /* - yes, deal with it */ + pop %rax /* - no, restore %rax */ + jmp L_64bit_dispatch +2: + /* + * Here if %rsp is in the PCB + * Copy the interrupt stack frame from PCB stack to kernel stack + */ + movq %gs:CPU_KERNEL_STACK, %rax + xchgq %rax, %rsp + pushq 8+ISF64_SS(%rax) + pushq 8+ISF64_RSP(%rax) + pushq 8+ISF64_RFLAGS(%rax) + pushq 8+ISF64_CS(%rax) + pushq 8+ISF64_RIP(%rax) + pushq 8+ISF64_ERR(%rax) + pushq 8+ISF64_TRAPFN(%rax) + pushq 8+ISF64_TRAPNO(%rax) + movq (%rax), %rax + jmp L_64bit_dispatch - /* * GP/NP fault on IRET: CS or SS is in error. * Note that the user ss is originally 16-byte aligned, we'd popped the @@ -908,32 +802,32 @@ trap_check_kernel_exit: * * on SP is * (- rax saved above, which is immediately popped) - * 0 function - * 8 trap number - * 16 errcode - * 24 rip - * 32 cs - * 40 rflags - * 48 rsp --> new trapfn - * 56 ss --> new trapno - * 64 pad --> new errcode - * 72 user rip - * 80 user cs - * 88 user rflags - * 96 user rsp - * 104 user ss (16-byte aligned) + * 0 ISF64_TRAPNO: trap code (NP or GP) + * 8 ISF64_TRAPFN: trap function + * 16 ISF64_ERR: segment number in error (error code) + * 24 ISF64_RIP: rip + * 32 ISF64_CS: cs + * 40 ISF64_RFLAGS: rflags + * 48 ISF64_RSP: rsp --> new trapno + * 56 ISF64_SS: ss --> new trapfn + * 64 pad --> new errcode + * 72 user rip + * 80 user cs + * 88 user rflags + * 96 user rsp + * 104 user ss (16-byte aligned) */ L_fault_iret: pop %rax /* recover saved %rax */ - mov %rax, 24(%rsp) /* save rax (we don`t need saved rip) */ - mov 0(%rsp), %rax /* get trap func */ - mov %rax, 48(%rsp) /* put in 
user trap func */ - mov 8(%rsp), %rax /* get trap number */ - mov %rax, 56(%rsp) /* put in user trap number */ - mov 16(%rsp), %rax /* get error code */ - mov %rax, 64(%rsp) /* put in user errcode */ - mov 24(%rsp), %rax /* restore rax */ - add $48,%rsp /* reset to new trapfn */ + mov %rax, ISF64_RIP(%rsp) /* save rax (we don`t need saved rip) */ + mov ISF64_TRAPNO(%rsp), %rax /* get trap number */ + mov %rax, ISF64_RSP(%rsp) /* put in user trap number */ + mov ISF64_TRAPFN(%rsp), %rax /* get trap function */ + mov %rax, ISF64_SS(%rsp) /* put in user trap function */ + mov ISF64_ERR(%rsp), %rax /* get error code */ + mov %rax, 8+ISF64_SS(%rsp) /* put in user errcode */ + mov ISF64_RIP(%rsp), %rax /* restore rax */ + add $(ISF64_RSP),%rsp /* reset to new trapno */ /* now treat as fault from user */ jmp L_dispatch @@ -942,13 +836,14 @@ L_fault_iret: * on the stack untouched since we haven't yet moved the stack pointer. */ L_32bit_fault_set_seg: - pop %rax /* recover %rax from stack */ - mov 0(%rsp), %rax /* get trap function */ - mov 8(%rsp), %rcx /* get trap number */ - mov 16(%rsp), %rdx /* get error code */ - mov 48(%rsp), %rsp /* reset stack to saved state */ - mov %rax,ISC32_TRAPFN(%rsp) - mov %rcx,ISC32_TRAPNO(%rsp) + swapgs + pop %rax /* toss saved %rax from stack */ + mov ISF64_TRAPNO(%rsp), %rax + mov ISF64_TRAPFN(%rsp), %rcx + mov ISF64_ERR(%rsp), %rdx + mov ISF64_RSP(%rsp), %rsp /* reset stack to saved state */ + mov %rax,ISC32_TRAPNO(%rsp) + mov %rcx,ISC32_TRAPFN(%rsp) mov %rdx,ISC32_ERR(%rsp) /* now treat as fault from user */ /* except that all the state is */ @@ -993,22 +888,25 @@ Entry(hndl_alltraps) TIME_TRAP_UENTRY - movq %gs:CPU_ACTIVE_THREAD,%rdi - movq %rsp, ACT_PCB_ISS(%rdi) /* stash the PCB stack */ + /* Check for active vtimers in the current task */ + mov %gs:CPU_ACTIVE_THREAD, %rcx + mov TH_TASK(%rcx), %rbx + TASK_VTIMER_CHECK(%rbx, %rcx) + movq %rsp, %rdi /* also pass it as arg0 */ movq %gs:CPU_KERNEL_STACK,%rsp /* switch to kernel stack */ - sti CCALL(user_trap) /* call user trap 
routine */ + /* user_trap() unmasks interrupts */ cli /* hold off intrs - critical section */ - movq %gs:CPU_ACTIVE_THREAD,%rsp - movq ACT_PCB_ISS(%rsp), %rsp /* switch back to PCB stack */ xorl %ecx, %ecx /* don't check if we're in the PFZ */ #define CLI cli #define STI sti Entry(return_from_trap) + movq %gs:CPU_ACTIVE_THREAD,%rsp + movq TH_PCB_ISS(%rsp), %rsp /* switch back to PCB stack */ movl %gs:CPU_PENDING_AST,%eax testl %eax,%eax je EXT(return_to_user) /* branch if no AST */ @@ -1023,6 +921,7 @@ L_return_from_trap_with_ast: je 1f /* no... 32-bit user mode */ movl R32_EIP(%r13), %edi + xorq %rbp, %rbp /* clear framepointer */ CCALL(commpage_is_in_pfz32) testl %eax, %eax je 2f /* not in the PFZ... go service AST */ @@ -1031,6 +930,7 @@ L_return_from_trap_with_ast: jmp EXT(return_to_user) 1: movq R64_RIP(%r13), %rdi + xorq %rbp, %rbp /* clear framepointer */ CCALL(commpage_is_in_pfz64) testl %eax, %eax je 2f /* not in the PFZ... go service AST */ @@ -1040,12 +940,11 @@ L_return_from_trap_with_ast: 2: STI /* interrupts always enabled on return to user mode */ - xor %edi, %edi /* zero %rdi */ - CCALL(i386_astintr) /* take the AST */ + xor %edi, %edi /* zero %rdi */ + xorq %rbp, %rbp /* clear framepointer */ + CCALL(i386_astintr) /* take the AST */ CLI - movq %r13, %rsp /* switch back to PCB stack */ - xorl %ecx, %ecx /* don't check if we're in the PFZ */ jmp EXT(return_from_trap) /* and check again (rare) */ @@ -1061,7 +960,7 @@ trap_from_kernel: pushq R64_RIP(%rsp) /* Simulate a CALL from fault point */ pushq %rbp /* Extend framepointer chain */ movq %rsp, %rbp - CCALL(kernel_trap) /* to kernel trap routine */ + CCALLWITHSP(kernel_trap) /* to kernel trap routine */ popq %rbp addq $8, %rsp cli @@ -1117,6 +1016,11 @@ Entry(hndl_allintrs) TIME_INT_ENTRY /* do timing */ + /* Check for active vtimers in the current task */ + mov %gs:CPU_ACTIVE_THREAD, %rcx + mov TH_TASK(%rcx), %rbx + TASK_VTIMER_CHECK(%rbx, %rcx) + incl %gs:CPU_PREEMPTION_LEVEL incl 
%gs:CPU_INTERRUPT_LEVEL @@ -1137,8 +1041,7 @@ LEXT(return_to_iret) /* (label for kdb_kintr and hardclock) */ TIME_INT_EXIT /* do timing */ movq %gs:CPU_ACTIVE_THREAD,%rax - movq ACT_PCB(%rax),%rax /* get act`s PCB */ - movq PCB_FPS(%rax),%rax /* get pcb's ims.ifps */ + movq TH_PCB_FPS(%rax),%rax /* get pcb's ifps */ cmpq $0,%rax /* Is there a context */ je 1f /* Branch if not */ movl FP_VALID(%rax),%eax /* Load fp_valid */ @@ -1286,9 +1189,8 @@ L_copy_args_continue: movq %gs:CPU_KERNEL_STACK,%rdi xchgq %rdi,%rsp /* switch to kernel stack */ movq %gs:CPU_ACTIVE_THREAD,%rcx /* get current thread */ - movq %rdi,ACT_PCB_ISS(%rcx) - movq ACT_TASK(%rcx),%rbx /* point to current task */ - addl $1,TASK_SYSCALLS_UNIX(%rbx) /* increment call count */ + movq TH_TASK(%rcx),%rbx /* point to current task */ + incl TH_SYSCALLS_UNIX(%rcx) /* increment call count */ /* Check for active vtimers in the current task */ TASK_VTIMER_CHECK(%rbx,%rcx) @@ -1307,9 +1209,8 @@ Entry(hndl_mach_scall) movq %gs:CPU_KERNEL_STACK,%rdi xchgq %rdi,%rsp /* switch to kernel stack */ movq %gs:CPU_ACTIVE_THREAD,%rcx /* get current thread */ - movq %rdi,ACT_PCB_ISS(%rcx) - movq ACT_TASK(%rcx),%rbx /* point to current task */ - addl $1,TASK_SYSCALLS_MACH(%rbx) /* increment call count */ + movq TH_TASK(%rcx),%rbx /* point to current task */ + incl TH_SYSCALLS_MACH(%rcx) /* increment call count */ /* Check for active vtimers in the current task */ TASK_VTIMER_CHECK(%rbx,%rcx) @@ -1330,7 +1231,7 @@ Entry(hndl_mdep_scall) /* Check for active vtimers in the current task */ movq %gs:CPU_ACTIVE_THREAD,%rcx /* get current thread */ - movq ACT_TASK(%rcx),%rbx /* point to current task */ + movq TH_TASK(%rcx),%rbx /* point to current task */ TASK_VTIMER_CHECK(%rbx,%rcx) sti @@ -1349,7 +1250,7 @@ Entry(hndl_diag_scall) /* Check for active vtimers in the current task */ movq %gs:CPU_ACTIVE_THREAD,%rcx /* get current thread */ - movq ACT_TASK(%rcx),%rbx /* point to current task */ + movq TH_TASK(%rcx),%rbx /* point to 
current task */ TASK_VTIMER_CHECK(%rbx,%rcx) pushq %rdi /* push pcb stack */ @@ -1359,14 +1260,13 @@ Entry(hndl_diag_scall) cli // Disable interruptions just in case cmpl $0,%eax // What kind of return is this? je 1f // - branch if bad (zero) - popq %rsp // Get back the original stack + popq %rsp // Get back the pcb stack jmp EXT(return_to_user) // Normal return, do not check asts... 1: CCALL3(i386_exception, $EXC_SYSCALL, $0x6000, $1) // pass what would be the diag syscall // error return - cause an exception /* no return */ - /* @@ -1384,8 +1284,7 @@ Entry(hndl_syscall) movq %gs:CPU_KERNEL_STACK,%rdi xchgq %rdi,%rsp /* switch to kernel stack */ movq %gs:CPU_ACTIVE_THREAD,%rcx /* get current thread */ - movq %rdi, ACT_PCB_ISS(%rcx) - movq ACT_TASK(%rcx),%rbx /* point to current task */ + movq TH_TASK(%rcx),%rbx /* point to current task */ /* Check for active vtimers in the current task */ TASK_VTIMER_CHECK(%rbx,%rcx) @@ -1412,7 +1311,7 @@ Entry(hndl_syscall) Entry(hndl_unix_scall64) - addl $1,TASK_SYSCALLS_UNIX(%rbx) /* increment call count */ + incl TH_SYSCALLS_UNIX(%rcx) /* increment call count */ sti CCALL(unix_syscall64) @@ -1422,7 +1321,7 @@ Entry(hndl_unix_scall64) Entry(hndl_mach_scall64) - addl $1,TASK_SYSCALLS_MACH(%rbx) /* increment call count */ + incl TH_SYSCALLS_MACH(%rcx) /* increment call count */ sti CCALL(mach_call_munger64) @@ -1443,13 +1342,11 @@ Entry(hndl_mdep_scall64) Entry(hndl_diag_scall64) pushq %rdi // Push the previous stack - CCALL(diagCall64) // Call diagnostics - cli // Disable interruptions just in case cmpl $0,%eax // What kind of return is this? je 1f // - branch if bad (zero) - popq %rsp // Get back the original stack + popq %rsp // Get back the pcb stack jmp EXT(return_to_user) // Normal return, do not check asts... 
1: CCALL3(i386_exception, $EXC_SYSCALL, $0x6000, $1) diff --git a/osfmk/x86_64/idt_table.h b/osfmk/x86_64/idt_table.h index 243ca18da..f2f26ce13 100644 --- a/osfmk/x86_64/idt_table.h +++ b/osfmk/x86_64/idt_table.h @@ -1,44 +1,71 @@ - TRAP(0x00,idt64_zero_div) - TRAP_SPC(0x01,idt64_debug) +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +TRAP(0x00,idt64_zero_div) +TRAP_SPC(0x01,idt64_debug) INTERRUPT(0x02) /* NMI */ USER_TRAP(0x03,idt64_int3) USER_TRAP(0x04,idt64_into) USER_TRAP(0x05,idt64_bounds) - TRAP(0x06,idt64_invop) - TRAP(0x07,idt64_nofpu) +TRAP(0x06,idt64_invop) +TRAP(0x07,idt64_nofpu) #if MACH_KDB - TRAP_IST(0x08,idt64_db_task_dbl_fault) +TRAP_IST(0x08,idt64_db_task_dbl_fault) #else - TRAP_IST(0x08,idt64_double_fault) +TRAP_IST(0x08,idt64_double_fault) #endif - TRAP(0x09,idt64_fpu_over) - TRAP(0x0a,idt64_inv_tss) - TRAP_SPC(0x0b,idt64_segnp) +TRAP(0x09,idt64_fpu_over) +TRAP(0x0a,idt64_inv_tss) +TRAP_SPC(0x0b,idt64_segnp) #if MACH_KDB - TRAP_IST(0x0c,idt64_db_task_stk_fault) +TRAP_IST(0x0c,idt64_db_task_stk_fault) #else - TRAP_SPC(0x0c,idt64_stack_fault) +TRAP_SPC(0x0c,idt64_stack_fault) #endif - TRAP_SPC(0x0d,idt64_gen_prot) -// TRAP_ERR(0x0d,idt64_gen_prot_not) - TRAP_SPC(0x0e,idt64_page_fault) - TRAP(0x0f,idt64_trap_0f) - TRAP(0x10,idt64_fpu_err) - TRAP(0x11,idt64_trap_11) - TRAP_IST(0x12,idt64_mc) - TRAP(0x13,idt64_sse_err) - TRAP(0x14,idt64_trap_14) - TRAP(0x15,idt64_trap_15) - TRAP(0x16,idt64_trap_16) - TRAP(0x17,idt64_trap_17) - TRAP(0x18,idt64_trap_18) - TRAP(0x19,idt64_trap_19) - TRAP(0x1a,idt64_trap_1a) - TRAP(0x1b,idt64_trap_1b) - TRAP(0x1c,idt64_trap_1c) - TRAP(0x1d,idt64_trap_1d) - TRAP(0x1e,idt64_trap_1e) - TRAP(0x1f,idt64_trap_1f) +TRAP_SPC(0x0d,idt64_gen_prot) +TRAP_SPC(0x0e,idt64_page_fault) +TRAP(0x0f,idt64_trap_0f) +TRAP(0x10,idt64_fpu_err) +TRAP(0x11,idt64_trap_11) +TRAP_IST(0x12,idt64_mc) +TRAP(0x13,idt64_sse_err) +TRAP(0x14,idt64_trap_14) +TRAP(0x15,idt64_trap_15) +TRAP(0x16,idt64_trap_16) +TRAP(0x17,idt64_trap_17) +TRAP(0x18,idt64_trap_18) +TRAP(0x19,idt64_trap_19) +TRAP(0x1a,idt64_trap_1a) +TRAP(0x1b,idt64_trap_1b) +TRAP(0x1c,idt64_trap_1c) +TRAP(0x1d,idt64_trap_1d) +TRAP(0x1e,idt64_trap_1e) +TRAP(0x1f,idt64_trap_1f) INTERRUPT(0x20) INTERRUPT(0x21) @@ -140,7 +167,7 @@ INTERRUPT(0x7b) INTERRUPT(0x7c) 
INTERRUPT(0x7d) INTERRUPT(0x7e) -INTERRUPT(0x7f) +USER_TRAP(0x7f, idt64_dtrace_ret) /* Required by dtrace "fasttrap" */ USER_TRAP_SPC(0x80,idt64_unix_scall) USER_TRAP_SPC(0x81,idt64_mach_scall) @@ -277,4 +304,4 @@ INTERRUPT(0xfb) INTERRUPT(0xfc) INTERRUPT(0xfd) INTERRUPT(0xfe) - TRAP(0xff,idt64_preempt) +TRAP(0xff,idt64_preempt) diff --git a/osfmk/x86_64/locore.s b/osfmk/x86_64/locore.s index 82d712985..af3bac12a 100644 --- a/osfmk/x86_64/locore.s +++ b/osfmk/x86_64/locore.s @@ -113,16 +113,6 @@ LEXT(recover_table_end) ;\ RECOVERY_SECTION RECOVER_TABLE_START -Entry(call_continuation) - movq %rdi,%rcx /* get continuation */ - movq %rsi,%rdi /* continuation param */ - movq %rdx,%rsi /* wait result */ - movq %gs:CPU_KERNEL_STACK,%rsp /* set the stack */ - xorq %rbp,%rbp /* zero frame pointer */ - call *%rcx /* call continuation */ - movq %gs:CPU_ACTIVE_THREAD,%rdi - call EXT(thread_terminate) - /* * int rdmsr_carefully(uint32_t msr, uint32_t *lo, uint32_t *hi) */ @@ -150,9 +140,7 @@ LEXT(thread_bootstrap_return) LEXT(thread_exception_return) cli - movq %gs:CPU_ACTIVE_THREAD,%rsp - movq ACT_PCB_ISS(%rsp), %rsp - xorl %ecx, %ecx /* don't check if we're in the PFZ */ + xorl %ecx, %ecx /* don't check if we're in the PFZ */ jmp EXT(return_from_trap) /* @@ -187,6 +175,17 @@ _bcopy_fail: movl $(EFAULT),%eax /* return error for failure */ ret +Entry(pmap_safe_read) + RECOVERY_SECTION + RECOVER(_pmap_safe_read_fail) + movq (%rdi), %rcx + mov %rcx, (%rsi) + mov $1, %eax + ret +_pmap_safe_read_fail: + xor %eax, %eax + ret + /* @@ -231,7 +230,6 @@ _bcopystr_fail: movl $(EFAULT),%eax /* return error for failure */ ret - /* * Done with recovery table. 
*/ diff --git a/osfmk/x86_64/loose_ends.c b/osfmk/x86_64/loose_ends.c index e8a1605a7..3d75d8eab 100644 --- a/osfmk/x86_64/loose_ends.c +++ b/osfmk/x86_64/loose_ends.c @@ -193,6 +193,25 @@ bcopy_phys( bcopy(PHYSMAP_PTOV(src64), PHYSMAP_PTOV(dst64), bytes); } +/* + * allow a function to get a quick virtual mapping of a physical page + */ + +int +apply_func_phys( + addr64_t dst64, + vm_size_t bytes, + int (*func)(void * buffer, vm_size_t bytes, void * arg), + void * arg) +{ + /* Not necessary for K64 - but ensure we stay within a page */ + if (((((uint32_t)dst64 & (NBPG-1)) + bytes) > NBPG) ) { + panic("apply_func_phys alignment"); + } + + return func(PHYSMAP_PTOV(dst64), bytes, arg); +} + /* * ovbcopy - like bcopy, but recognizes overlapping ranges and handles * them correctly. @@ -224,7 +243,7 @@ ovbcopy( */ -static unsigned int +static inline unsigned int ml_phys_read_data(pmap_paddr_t paddr, int size) { unsigned int result; @@ -255,8 +274,6 @@ ml_phys_read_long_long(pmap_paddr_t paddr ) return *(unsigned long long *)PHYSMAP_PTOV(paddr); } - - unsigned int ml_phys_read( vm_offset_t paddr) { return ml_phys_read_data((pmap_paddr_t)paddr, 4); @@ -313,7 +330,7 @@ unsigned long long ml_phys_read_double_64(addr64_t paddr64) * Write data to a physical address. Memory should not be cache inhibited. 
*/ -static void +static inline void ml_phys_write_data(pmap_paddr_t paddr, unsigned long data, int size) { switch (size) { @@ -336,8 +353,6 @@ ml_phys_write_long_long(pmap_paddr_t paddr, unsigned long long data) *(unsigned long long *)PHYSMAP_PTOV(paddr) = data; } - - void ml_phys_write_byte(vm_offset_t paddr, unsigned int data) { ml_phys_write_data((pmap_paddr_t)paddr, data, 1); @@ -529,18 +544,15 @@ static inline void __clflush(void *ptr) void dcache_incoherent_io_store64(addr64_t pa, unsigned int count) { - uint32_t linesize = cpuid_info()->cache_linesize; - addr64_t addr; - boolean_t istate; + addr64_t linesize = cpuid_info()->cache_linesize; + addr64_t bound = (pa + count + linesize - 1) & ~(linesize - 1); __mfence(); - istate = ml_set_interrupts_enabled(FALSE); - - for (addr = pa; addr < pa + count; addr += linesize) - __clflush(PHYSMAP_PTOV(addr)); - - (void) ml_set_interrupts_enabled(istate); + while (pa < bound) { + __clflush(PHYSMAP_PTOV(pa)); + pa += linesize; + } __mfence(); } @@ -551,10 +563,21 @@ void dcache_incoherent_io_flush64(addr64_t pa, unsigned int count) } void -flush_dcache64(__unused addr64_t addr, - __unused unsigned count, - __unused int phys) +flush_dcache64(addr64_t addr, unsigned count, int phys) { + if (phys) { + dcache_incoherent_io_flush64(addr, count); + } + else { + addr64_t linesize = cpuid_info()->cache_linesize; + addr64_t bound = (addr + count + linesize -1) & ~(linesize - 1); + __mfence(); + while (addr < bound) { + __clflush((void *) (uintptr_t) addr); + addr += linesize; + } + __mfence(); + } } void @@ -603,316 +626,6 @@ cache_flush_page_phys(ppnum_t pa) } -static int copyio(int, user_addr_t, char *, vm_size_t, vm_size_t *, int); -static int copyio_phys(addr64_t, addr64_t, vm_size_t, int); - -/* - * The copy engine has the following characteristics - * - copyio() handles copies to/from user or kernel space - * - copypv() deals with physical or virtual addresses - * - * Readers familiar with the 32-bit kernel will expect
Joe's thesis at this - * point describing the full glory of the copy window implementation. In K64, - * however, there is no need for windowing. Thanks to the vast shared address - * space, the kernel has direct access to userspace and to physical memory. - * - * User virtual addresses are accessible provided the user's cr3 is loaded. - * Physical addresses are accessible via the direct map and the PHYSMAP_PTOV() - * translation. - * - * Copyin/out variants all boil done to just these 2 routines in locore.s which - * provide fault-recoverable copying: - */ -extern int _bcopy(const void *, void *, vm_size_t); -extern int _bcopystr(const void *, void *, vm_size_t, vm_size_t *); - - -/* - * Types of copies: - */ -#define COPYIN 0 /* from user virtual to kernel virtual */ -#define COPYOUT 1 /* from kernel virtual to user virtual */ -#define COPYINSTR 2 /* string variant of copyout */ -#define COPYINPHYS 3 /* from user virtual to kernel physical */ -#define COPYOUTPHYS 4 /* from kernel physical to user virtual */ - - -static int -copyio(int copy_type, user_addr_t user_addr, char *kernel_addr, - vm_size_t nbytes, vm_size_t *lencopied, int use_kernel_map) -{ - thread_t thread; - pmap_t pmap; - vm_size_t bytes_copied; - int error = 0; - boolean_t istate = FALSE; - boolean_t recursive_CopyIOActive; -#if KDEBUG - int debug_type = 0xeff70010; - debug_type += (copy_type << 2); -#endif - - thread = current_thread(); - - KERNEL_DEBUG(debug_type | DBG_FUNC_START, - (unsigned)(user_addr >> 32), (unsigned)user_addr, - nbytes, thread->machine.copyio_state, 0); - - if (nbytes == 0) - goto out; - - pmap = thread->map->pmap; - - - assert((vm_offset_t)kernel_addr >= VM_MIN_KERNEL_AND_KEXT_ADDRESS || - copy_type == COPYINPHYS || copy_type == COPYOUTPHYS); - - /* Sanity and security check for addresses to/from a user */ - - if (((pmap != kernel_pmap) && (use_kernel_map == 0)) && - ((nbytes && (user_addr+nbytes <= user_addr)) || ((user_addr + nbytes) > vm_map_max(thread->map)))) { - error 
= EFAULT; - goto out; - } - - /* - * If the no_shared_cr3 boot-arg is set (true), the kernel runs on - * its own pmap and cr3 rather than the user's -- so that wild accesses - * from kernel or kexts can be trapped. So, during copyin and copyout, - * we need to switch back to the user's map/cr3. The thread is flagged - * "CopyIOActive" at this time so that if the thread is pre-empted, - * we will later restore the correct cr3. - */ - recursive_CopyIOActive = thread->machine.specFlags & CopyIOActive; - thread->machine.specFlags |= CopyIOActive; - if (no_shared_cr3) { - istate = ml_set_interrupts_enabled(FALSE); - if (get_cr3() != pmap->pm_cr3) - set_cr3(pmap->pm_cr3); - } - - /* - * Ensure that we're running on the target thread's cr3. - */ - if ((pmap != kernel_pmap) && !use_kernel_map && - (get_cr3() != pmap->pm_cr3)) { - panic("copyio(%d,%p,%p,%ld,%p,%d) cr3 is %p expects %p", - copy_type, (void *)user_addr, kernel_addr, nbytes, lencopied, use_kernel_map, - (void *) get_cr3(), (void *) pmap->pm_cr3); - } - if (no_shared_cr3) - (void) ml_set_interrupts_enabled(istate); - - KERNEL_DEBUG(0xeff70044 | DBG_FUNC_NONE, (unsigned)user_addr, - (unsigned)kernel_addr, nbytes, 0, 0); - - switch (copy_type) { - - case COPYIN: - error = _bcopy((const void *) user_addr, - kernel_addr, - nbytes); - break; - - case COPYOUT: - error = _bcopy(kernel_addr, - (void *) user_addr, - nbytes); - break; - - case COPYINPHYS: - error = _bcopy((const void *) user_addr, - PHYSMAP_PTOV(kernel_addr), - nbytes); - break; - - case COPYOUTPHYS: - error = _bcopy((const void *) PHYSMAP_PTOV(kernel_addr), - (void *) user_addr, - nbytes); - break; - - case COPYINSTR: - error = _bcopystr((const void *) user_addr, - kernel_addr, - (int) nbytes, - &bytes_copied); - - /* - * lencopied should be updated on success - * or ENAMETOOLONG... 
but not EFAULT - */ - if (error != EFAULT) - *lencopied = bytes_copied; - - if (error) { -#if KDEBUG - nbytes = *lencopied; -#endif - break; - } - if (*(kernel_addr + bytes_copied - 1) == 0) { - /* - * we found a NULL terminator... we're done - */ -#if KDEBUG - nbytes = *lencopied; -#endif - break; - } else { - /* - * no more room in the buffer and we haven't - * yet come across a NULL terminator - */ -#if KDEBUG - nbytes = *lencopied; -#endif - error = ENAMETOOLONG; - break; - } - break; - } - - if (!recursive_CopyIOActive) - thread->machine.specFlags &= ~CopyIOActive; - if (no_shared_cr3) { - istate = ml_set_interrupts_enabled(FALSE); - if (get_cr3() != kernel_pmap->pm_cr3) - set_cr3(kernel_pmap->pm_cr3); - (void) ml_set_interrupts_enabled(istate); - } - -out: - KERNEL_DEBUG(debug_type | DBG_FUNC_END, (unsigned)user_addr, - (unsigned)kernel_addr, (unsigned)nbytes, error, 0); - - return (error); -} - - -static int -copyio_phys(addr64_t source, addr64_t sink, vm_size_t csize, int which) -{ - char *paddr; - user_addr_t vaddr; - int ctype; - - if (which & cppvPsnk) { - paddr = (char *)sink; - vaddr = (user_addr_t)source; - ctype = COPYINPHYS; - } else { - paddr = (char *)source; - vaddr = (user_addr_t)sink; - ctype = COPYOUTPHYS; - } - return copyio(ctype, vaddr, paddr, csize, NULL, which & cppvKmap); -} - -int -copyinmsg(const user_addr_t user_addr, char *kernel_addr, mach_msg_size_t nbytes) -{ - return copyio(COPYIN, user_addr, kernel_addr, nbytes, NULL, 0); -} - -int -copyin(const user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes) -{ - return copyio(COPYIN, user_addr, kernel_addr, nbytes, NULL, 0); -} - -int -copyinstr(const user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes, vm_size_t *lencopied) -{ - *lencopied = 0; - - return copyio(COPYINSTR, user_addr, kernel_addr, nbytes, lencopied, 0); -} - -int -copyoutmsg(const char *kernel_addr, user_addr_t user_addr, mach_msg_size_t nbytes) -{ - return copyio(COPYOUT, user_addr, (char 
*)(uintptr_t)kernel_addr, nbytes, NULL, 0); -} - -int -copyout(const void *kernel_addr, user_addr_t user_addr, vm_size_t nbytes) -{ - return copyio(COPYOUT, user_addr, (char *)(uintptr_t)kernel_addr, nbytes, NULL, 0); -} - - -kern_return_t -copypv(addr64_t src64, addr64_t snk64, unsigned int size, int which) -{ - unsigned int lop, csize; - int bothphys = 0; - - KERNEL_DEBUG(0xeff7004c | DBG_FUNC_START, (unsigned)src64, - (unsigned)snk64, size, which, 0); - - if ((which & (cppvPsrc | cppvPsnk)) == 0 ) /* Make sure that only one is virtual */ - panic("copypv: no more than 1 parameter may be virtual\n"); /* Not allowed */ - - if ((which & (cppvPsrc | cppvPsnk)) == (cppvPsrc | cppvPsnk)) - bothphys = 1; /* both are physical */ - - while (size) { - - if (bothphys) { - lop = (unsigned int)(PAGE_SIZE - (snk64 & (PAGE_SIZE - 1))); /* Assume sink smallest */ - - if (lop > (unsigned int)(PAGE_SIZE - (src64 & (PAGE_SIZE - 1)))) - lop = (unsigned int)(PAGE_SIZE - (src64 & (PAGE_SIZE - 1))); /* No, source is smaller */ - } else { - /* - * only need to compute the resid for the physical page - * address... we don't care about where we start/finish in - * the virtual since we just call the normal copyin/copyout - */ - if (which & cppvPsrc) - lop = (unsigned int)(PAGE_SIZE - (src64 & (PAGE_SIZE - 1))); - else - lop = (unsigned int)(PAGE_SIZE - (snk64 & (PAGE_SIZE - 1))); - } - csize = size; /* Assume we can copy it all */ - if (lop < size) - csize = lop; /* Nope, we can't do it all */ -#if 0 - /* - * flush_dcache64 is currently a nop on the i386... - * it's used when copying to non-system memory such - * as video capture cards... on PPC there was a need - * to flush due to how we mapped this memory... not - * sure if it's needed on i386. 
- */ - if (which & cppvFsrc) - flush_dcache64(src64, csize, 1); /* If requested, flush source before move */ - if (which & cppvFsnk) - flush_dcache64(snk64, csize, 1); /* If requested, flush sink before move */ -#endif - if (bothphys) - bcopy_phys(src64, snk64, csize); /* Do a physical copy, virtually */ - else { - if (copyio_phys(src64, snk64, csize, which)) - return (KERN_FAILURE); - } -#if 0 - if (which & cppvFsrc) - flush_dcache64(src64, csize, 1); /* If requested, flush source after move */ - if (which & cppvFsnk) - flush_dcache64(snk64, csize, 1); /* If requested, flush sink after move */ -#endif - size -= csize; /* Calculate what is left */ - snk64 += csize; /* Bump sink to next physical address */ - src64 += csize; /* Bump source to next physical address */ - } - KERNEL_DEBUG(0xeff7004c | DBG_FUNC_END, (unsigned)src64, - (unsigned)snk64, size, which, 0); - - return KERN_SUCCESS; -} - #if !MACH_KDP void kdp_register_callout(void) diff --git a/osfmk/x86_64/machine_routines_asm.s b/osfmk/x86_64/machine_routines_asm.s index f8fecaccf..1c74f9fc8 100644 --- a/osfmk/x86_64/machine_routines_asm.s +++ b/osfmk/x86_64/machine_routines_asm.s @@ -27,7 +27,7 @@ */ #include -#include +#include #include #include @@ -85,31 +85,7 @@ ENTRY(tmrCvt) shrdq $32,%rdx,%rax /* %rdx:%rax >>= 32 */ ret - -/* - * void _rtc_nanotime_store( - * uint64_t tsc, // %rdi - * uint64_t nsec, // %rsi - * uint32_t scale, // %rdx - * uint32_t shift, // %rcx - * rtc_nanotime_t *dst); // %r8 - */ -ENTRY(_rtc_nanotime_store) - movl RNT_GENERATION(%r8),%eax /* get current generation */ - movl $0,RNT_GENERATION(%r8) /* flag data as being updated */ - movq %rdi,RNT_TSC_BASE(%r8) - movq %rsi,RNT_NS_BASE(%r8) - movl %edx,RNT_SCALE(%r8) - movl %ecx,RNT_SHIFT(%r8) - - incl %eax /* next generation */ - jnz 1f - incl %eax /* skip 0, which is a flag */ -1: movl %eax,RNT_GENERATION(%r8) /* update generation */ - - ret - -/* + /* * void _rtc_nanotime_adjust( * uint64_t tsc_base_delta, // %rdi * rtc_nanotime_t 
*dst); // %rsi @@ -170,7 +146,7 @@ ENTRY(_rtc_nanotime_read) /* * Processor whose TSC frequency is faster than SLOW_TSC_THRESHOLD */ - RTC_NANOTIME_READ_FAST() + PAL_RTC_NANOTIME_READ_FAST() ret @@ -186,3 +162,14 @@ Lslow: .data 1: String "_rtc_nanotime_read() - slow algorithm not supported" + +Entry(call_continuation) + movq %rdi,%rcx /* get continuation */ + movq %rsi,%rdi /* continuation param */ + movq %rdx,%rsi /* wait result */ + movq %gs:CPU_KERNEL_STACK,%rsp /* set the stack */ + xorq %rbp,%rbp /* zero frame pointer */ + call *%rcx /* call continuation */ + movq %gs:CPU_ACTIVE_THREAD,%rdi + call EXT(thread_terminate) + diff --git a/osfmk/x86_64/pal_routines_asm.s b/osfmk/x86_64/pal_routines_asm.s new file mode 100644 index 000000000..4f14284f6 --- /dev/null +++ b/osfmk/x86_64/pal_routines_asm.s @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include + +#include + +/* + * Copy "count" bytes from "src" to %rsp, using + * "tmpindex" for a scratch counter and %rax + */ +#define COPY_STACK(src, count, tmpindex) \ + mov $0, tmpindex /* initial scratch counter */ ; \ +1: \ + mov 0(src,tmpindex,1), %rax /* copy one 64-bit word from source... */ ; \ + mov %rax, 0(%rsp,tmpindex,1) /* ... to stack */ ; \ + add $8, tmpindex /* increment counter */ ; \ + cmp count, tmpindex /* exit if stack has been copied */ ; \ + jne 1b + +/* + void + pal_efi_call_in_64bit_mode_asm(uint64_t func, + struct pal_efi_registers *efi_reg, + void *stack_contents, + size_t stack_contents_size) + + * Switch from compatibility mode to long mode, and + * then execute the function pointer with the specified + * register and stack contents (based at %rsp). Afterwards, + * collect the return value, restore the original state, + * and return.
+*/ +ENTRY(_pal_efi_call_in_64bit_mode_asm) + FRAME + + /* save non-volatile registers */ + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + /* save parameters that we will need later */ + push %rsi + push %rcx + + sub $8, %rsp /* align to 16-byte boundary */ + /* efi_reg in %rsi */ + /* stack_contents into %rdx */ + /* s_c_s into %rcx */ + sub %rcx, %rsp /* make room for stack contents */ + + COPY_STACK(%rdx, %rcx, %r8) + + /* load efi_reg into real registers */ + mov 0(%rsi), %rcx + mov 8(%rsi), %rdx + mov 16(%rsi), %r8 + mov 24(%rsi), %r9 + mov 32(%rsi), %rax + + /* func pointer in %rdi */ + call *%rdi /* call EFI runtime */ + + mov -48(%rbp), %rsi /* load efi_reg into %rsi */ + mov %rax, 32(%rsi) /* save RAX back */ + + mov -56(%rbp), %rcx /* load s_c_s into %rcx */ + add %rcx, %rsp /* discard stack contents */ + add $8, %rsp /* restore stack pointer */ + + pop %rcx + pop %rsi + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + + EMARF + ret + +/* + void + pal_efi_call_in_32bit_mode_asm(uint32_t func, + struct pal_efi_registers *efi_reg, + void *stack_contents, + size_t stack_contents_size) + +*/ +ENTRY(_pal_efi_call_in_32bit_mode_asm) + FRAME + + /* save non-volatile registers */ + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + /* save parameters that we will need later */ + push %rsi + push %rcx + + push %rbp /* save %rbp and align to 16-byte boundary */ + /* efi_reg in %rsi */ + /* stack_contents into %rdx */ + /* s_c_s into %rcx */ + sub %rcx, %rsp /* make room for stack contents */ + + COPY_STACK(%rdx, %rcx, %r8) + + /* + * Here in long-mode, with high kernel addresses, + * but with the kernel double-mapped in the bottom 4GB. + * We now switch to compat mode and call into EFI.
+ */ + ENTER_COMPAT_MODE() + + call *%edi /* call EFI runtime */ + + ENTER_64BIT_MODE() + + mov -48(%rbp), %rsi /* load efi_reg into %rsi */ + mov %rax, 32(%rsi) /* save RAX back */ + + mov -56(%rbp), %rcx /* load s_c_s into %rcx */ + add %rcx, %rsp /* discard stack contents */ + pop %rbp /* restore full 64-bit frame pointer */ + /* which the 32-bit EFI will have truncated */ + /* our full %rsp will be restored by EMARF */ + pop %rcx + pop %rsi + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + + EMARF + ret + + + +/* + * void _pal_rtc_nanotime_store( + * uint64_t tsc, // %rdi + * uint64_t nsec, // %rsi + * uint32_t scale, // %rdx + * uint32_t shift, // %rcx + * rtc_nanotime_t *dst); // %r8 + */ +ENTRY(_pal_rtc_nanotime_store) + movl RNT_GENERATION(%r8),%eax /* get current generation */ + movl $0,RNT_GENERATION(%r8) /* flag data as being updated */ + movq %rdi,RNT_TSC_BASE(%r8) + movq %rsi,RNT_NS_BASE(%r8) + movl %edx,RNT_SCALE(%r8) + movl %ecx,RNT_SHIFT(%r8) + + incl %eax /* next generation */ + jnz 1f + incl %eax /* skip 0, which is a flag */ +1: movl %eax,RNT_GENERATION(%r8) /* update generation */ + + ret + diff --git a/osfmk/x86_64/pmap.c b/osfmk/x86_64/pmap.c index a8c8cbde4..69a5c542d 100644 --- a/osfmk/x86_64/pmap.c +++ b/osfmk/x86_64/pmap.c @@ -1,6 +1,5 @@ - /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -101,6 +100,7 @@ #include #include #include +#include #include #include @@ -119,6 +119,7 @@ #include /* prototyping */ #include +#include #include #include @@ -132,6 +133,7 @@ #include #include #include +#include #if MACH_KDB #include @@ -146,7 +148,6 @@ #include - #ifdef IWANTTODEBUG #undef DEBUG #define DEBUG 1 @@ -154,39 +155,18 @@ #include #endif /* IWANTTODEBUG */ -boolean_t pmap_trace = FALSE; - -#if PMAP_DBG -#define DBG(x...) kprintf("DBG: " x) +#ifdef PMAP_DEBUG +#define DBG(x...)
kprintf("DBG: " x) #else #define DBG(x...) #endif - -boolean_t no_shared_cr3 = DEBUG; /* TRUE for DEBUG by default */ - -/* - * Forward declarations for internal functions. +/* Compile time assert to ensure adjacency/alignment of per-CPU data fields used + * in the trampolines for kernel/user boundary TLB coherency. */ +char pmap_cpu_data_assert[(((offsetof(cpu_data_t, cpu_tlb_invalid) - offsetof(cpu_data_t, cpu_active_cr3)) == 8) && (offsetof(cpu_data_t, cpu_active_cr3) % 64 == 0)) ? 1 : -1]; +boolean_t pmap_trace = FALSE; - -void phys_attribute_clear( - ppnum_t phys, - int bits); - -int phys_attribute_test( - ppnum_t phys, - int bits); - -void phys_attribute_set( - ppnum_t phys, - int bits); - -void pmap_set_reference( - ppnum_t pn); - -boolean_t phys_page_exists( - ppnum_t pn); - +boolean_t no_shared_cr3 = DEBUG; /* TRUE for DEBUG by default */ int nx_enabled = 1; /* enable no-execute protection */ int allow_data_exec = VM_ABI_32; /* 32-bit apps may execute data by default, 64-bit apps may not */ @@ -206,24 +186,8 @@ decl_simple_lock_data(,pv_hashed_free_list_lock) decl_simple_lock_data(,pv_hashed_kern_free_list_lock) decl_simple_lock_data(,pv_hash_table_lock) -int pv_hashed_free_count = 0; -int pv_hashed_kern_free_count = 0; - - zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry structures */ -/* - * Each entry in the pv_head_table is locked by a bit in the - * pv_lock_table. The lock bits are accessed by the physical - * address of the page they lock. - */ - -char *pv_lock_table; /* pointer to array of bits */ - - -char *pv_hash_lock_table; - - /* * First and last physical addresses that we maintain any information * for. Initialized to zero so that pmap operations done before @@ -236,11 +200,17 @@ static struct vm_object kpml4obj_object_store; static struct vm_object kpdptobj_object_store; /* - * Array of physical page attributes for managed pages. + * Array of physical page attributes for managed pages. * One byte per physical page.
*/ char *pmap_phys_attributes; unsigned int last_managed_page = 0; + +/* + * Amount of virtual memory mapped by one + * page-directory entry. + */ + uint64_t pde_mapped_size = PDE_MAPPED_SIZE; unsigned pmap_memory_region_count; @@ -261,19 +231,18 @@ pd_entry_t commpage64_pde; struct zone *pmap_zone; /* zone of pmap structures */ +struct zone *pmap_anchor_zone; +int pmap_debug = 0; /* flag for debugging prints */ + unsigned int inuse_ptepages_count = 0; +long long alloc_ptepages_count __attribute__((aligned(8))) = 0; /* aligned for atomic access */ +unsigned int bootstrap_wired_pages = 0; +int pt_fake_zone_index = -1; -addr64_t kernel64_cr3; +extern long NMIPI_acks; -/* - * Pmap cache. Cache is threaded through ref_count field of pmap. - * Max will eventually be constant -- variable for experimentation. - */ -int pmap_cache_max = 32; -int pmap_alloc_chunk = 8; -pmap_t pmap_cache_list; -int pmap_cache_count; -decl_simple_lock_data(,pmap_cache_lock) +boolean_t kernel_text_ps_4K = TRUE; +boolean_t wpkernel = TRUE; extern char end; @@ -282,116 +251,13 @@ static int nkpt; pt_entry_t *DMAP1, *DMAP2; caddr_t DADDR1; caddr_t DADDR2; -/* - * for legacy, returns the address of the pde entry. - * for 64 bit, causes the pdpt page containing the pde entry to be mapped, - * then returns the mapped address of the pde entry in that page - */ -pd_entry_t * -pmap_pde(pmap_t m, vm_map_offset_t v) -{ - pd_entry_t *pde; - - assert(m); -#if 0 - if (m == kernel_pmap) - pde = (&((m)->dirbase[(vm_offset_t)(v) >> PDESHIFT])); - else -#endif - pde = pmap64_pde(m, v); - - return pde; -} /* - * the single pml4 page per pmap is allocated at pmap create time and exists - * for the duration of the pmap. we allocate this page in kernel vm. - * this returns the address of the requested pml4 entry in the top level page. + * unlinks the pv_hashed_entry_t pvh from the singly linked hash chain. + * properly deals with the anchor. 
+ * must be called with the hash locked, does not unlock it */ -static inline -pml4_entry_t * -pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr) -{ - return &pmap->pm_pml4[(vaddr >> PML4SHIFT) & (NPML4PG-1)]; -} -/* - * maps in the pml4 page, if any, containing the pdpt entry requested - * and returns the address of the pdpt entry in that mapped page - */ -pdpt_entry_t * -pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr) -{ - pml4_entry_t newpf; - pml4_entry_t *pml4; - - assert(pmap); - if ((vaddr > 0x00007FFFFFFFFFFFULL) && - (vaddr < 0xFFFF800000000000ULL)) { - return (0); - } - - pml4 = pmap64_pml4(pmap, vaddr); - if (pml4 && ((*pml4 & INTEL_PTE_VALID))) { - newpf = *pml4 & PG_FRAME; - return &((pdpt_entry_t *) PHYSMAP_PTOV(newpf)) - [(vaddr >> PDPTSHIFT) & (NPDPTPG-1)]; - } - return (NULL); -} -/* - * maps in the pdpt page, if any, containing the pde entry requested - * and returns the address of the pde entry in that mapped page - */ -pd_entry_t * -pmap64_pde(pmap_t pmap, vm_map_offset_t vaddr) -{ - pdpt_entry_t newpf; - pdpt_entry_t *pdpt; - - assert(pmap); - if ((vaddr > 0x00007FFFFFFFFFFFULL) && - (vaddr < 0xFFFF800000000000ULL)) { - return (0); - } - - pdpt = pmap64_pdpt(pmap, vaddr); - - if (pdpt && ((*pdpt & INTEL_PTE_VALID))) { - newpf = *pdpt & PG_FRAME; - return &((pd_entry_t *) PHYSMAP_PTOV(newpf)) - [(vaddr >> PDSHIFT) & (NPDPG-1)]; - } - return (NULL); -} - -/* - * return address of mapped pte for vaddr va in pmap pmap. - * - * physically maps the pde page, if any, containing the pte in and returns - * the address of the pte in that mapped page - * - * In case the pde maps a superpage, return the pde, which, in this case - * is the actual page table entry. 
- */ -pt_entry_t * -pmap_pte(pmap_t pmap, vm_map_offset_t vaddr) -{ - pd_entry_t *pde; - pd_entry_t newpf; - - assert(pmap); - pde = pmap_pde(pmap, vaddr); - - if (pde && ((*pde & INTEL_PTE_VALID))) { - if (*pde & INTEL_PTE_PS) - return pde; - newpf = *pde & PG_FRAME; - return &((pt_entry_t *)PHYSMAP_PTOV(newpf)) - [i386_btop(vaddr) & (ppnum_t)(NPTEPG-1)]; - } - return (NULL); -} /* * Map memory at initialization. The physical addresses being @@ -437,7 +303,7 @@ pmap_map_bd( pt_entry_t template; pt_entry_t *pte; spl_t spl; - + vm_offset_t base = virt; template = pa_to_pte(start_addr) | INTEL_PTE_REF | INTEL_PTE_MOD @@ -452,7 +318,6 @@ pmap_map_bd( if (prot & VM_PROT_WRITE) template |= INTEL_PTE_WRITE; - while (start_addr < end_addr) { spl = splhigh(); pte = pmap_pte(kernel_pmap, (vm_map_offset_t)virt); @@ -465,9 +330,8 @@ pmap_map_bd( virt += PAGE_SIZE; start_addr += PAGE_SIZE; } - - - flush_tlb(); + (void)base; + PMAP_UPDATE_TLBS(kernel_pmap, base, virt); return(virt); } @@ -480,13 +344,16 @@ extern vm_offset_t stext; extern vm_offset_t etext; extern vm_offset_t sdata; +extern void *KPTphys; + void pmap_cpu_init(void) { /* * Here early in the life of a processor (from cpu_mode_init()). - * Ensure global page feature is disabled. + * Ensure global page feature is disabled at this point.
*/ + set_cr4(get_cr4() &~ CR4_PGE); /* @@ -495,6 +362,8 @@ pmap_cpu_init(void) current_cpu_datap()->cpu_kernel_cr3 = kernel_pmap->pm_cr3; current_cpu_datap()->cpu_active_cr3 = kernel_pmap->pm_cr3; current_cpu_datap()->cpu_tlb_invalid = FALSE; + current_cpu_datap()->cpu_task_map = TASK_MAP_64BIT; + pmap_pcid_configure(); } @@ -514,7 +383,6 @@ pmap_bootstrap( vm_offset_t va; int i; #endif - assert(IA32e); vm_last_addr = VM_MAX_KERNEL_ADDRESS; /* Set the highest address @@ -534,12 +402,16 @@ pmap_bootstrap( kernel_pmap->pm_pdpt = (pd_entry_t *) ((uintptr_t)IdlePDPT); kernel_pmap->pm_pml4 = IdlePML4; kernel_pmap->pm_cr3 = (uintptr_t)ID_MAP_VTOP(IdlePML4); + pmap_pcid_initialize_kernel(kernel_pmap); + current_cpu_datap()->cpu_kernel_cr3 = (addr64_t) kernel_pmap->pm_cr3; nkpt = NKPT; OSAddAtomic(NKPT, &inuse_ptepages_count); + OSAddAtomic64(NKPT, &alloc_ptepages_count); + bootstrap_wired_pages = NKPT; virtual_avail = (vm_offset_t)(VM_MIN_KERNEL_ADDRESS) + (vm_offset_t)first_avail; virtual_end = (vm_offset_t)(VM_MAX_KERNEL_ADDRESS); @@ -590,8 +462,6 @@ pmap_bootstrap( npvhash = NPVHASH; } - printf("npvhash=%d\n", npvhash); - simple_lock_init(&kernel_pmap->lock, 0); simple_lock_init(&pv_hashed_free_list_lock, 0); simple_lock_init(&pv_hashed_kern_free_list_lock, 0); @@ -599,6 +469,14 @@ pmap_bootstrap( pmap_cpu_init(); + if (pmap_pcid_ncpus) + printf("PMAP: PCID enabled\n"); + + boot_args *args = (boot_args *)PE_state.bootArgs; + if (args->efiMode == kBootArgsEfiMode32) { + printf("EFI32: kernel virtual space limited to 4GB\n"); + virtual_end = VM_MAX_KERNEL_ADDRESS_EFI32; + } kprintf("Kernel virtual space from 0x%lx to 0x%lx.\n", (long)KERNEL_BASE, (long)virtual_end); kprintf("Available physical space from 0x%llx to 0x%llx\n", @@ -723,7 +601,6 @@ pmap_init(void) pmap_phys_attributes[pn] |= PHYS_NOENCRYPT; else if (pn >= lowest_hi && pn <= highest_hi) pmap_phys_attributes[pn] |= PHYS_NOENCRYPT; - } } } @@ -743,8 +620,20 @@ pmap_init(void) pmap_zone = zinit(s, 400*s, 4096, 
"pmap"); /* XXX */ zone_change(pmap_zone, Z_NOENCRYPT, TRUE); + pmap_anchor_zone = zinit(PAGE_SIZE, task_max, PAGE_SIZE, "pagetable anchors"); + zone_change(pmap_anchor_zone, Z_NOENCRYPT, TRUE); + +#if ZONE_DEBUG + /* The anchor is required to be page aligned. Zone debugging adds + * padding which may violate that requirement. Disable it + * to avoid assumptions. + */ + zone_debug_disable(pmap_anchor_zone); +#endif + s = (vm_size_t) sizeof(struct pv_hashed_entry); - pv_hashed_list_zone = zinit(s, 10000*s, 4096, "pv_list"); /* XXX */ + pv_hashed_list_zone = zinit(s, 10000*s /* Expandable zone */, + 4096 * 3 /* LCM x86_64*/, "pv_list"); zone_change(pv_hashed_list_zone, Z_NOENCRYPT, TRUE); /* create pv entries for kernel pages mapped by low level @@ -752,7 +641,7 @@ pmap_init(void) e.g. kext pages from the middle of our addr space */ vaddr = (vm_map_offset_t) VM_MIN_KERNEL_ADDRESS; - for (ppn = 0; ppn < i386_btop(avail_start); ppn++) { + for (ppn = VM_MIN_KERNEL_PAGE; ppn < i386_btop(avail_start); ppn++) { pv_rooted_entry_t pv_e; pv_e = pai_to_pvh(ppn); @@ -763,13 +652,6 @@ pmap_init(void) } pmap_initialized = TRUE; - /* - * Initialize pmap cache. - */ - pmap_cache_list = PMAP_NULL; - pmap_cache_count = 0; - simple_lock_init(&pmap_cache_lock, 0); - max_preemption_latency_tsc = tmrCvt((uint64_t)MAX_PREEMPTION_LATENCY_NS, tscFCvtn2t); /* @@ -779,6 +661,210 @@ pmap_init(void) pmap_expand_pml4(kernel_pmap, KERNEL_BASEMENT); } +/* + * Called once VM is fully initialized so that we can release unused + * sections of low memory to the general pool. + * Also complete the set-up of identity-mapped sections of the kernel: + * 1) write-protect kernel text + * 2) map kernel text using large pages if possible + * 3) read and write-protect page zero (for K32) + * 4) map the global page at the appropriate virtual address. 
+ * + * Use of large pages + * ------------------ + * To effectively map and write-protect all kernel text pages, the text + * must be 2M-aligned at the base, and the data section above must also be + * 2M-aligned. That is, there's padding below and above. This is achieved + * through linker directives. Large pages are used only if this alignment + * exists (and not overriden by the -kernel_text_page_4K boot-arg). The + * memory layout is: + * + * : : + * | __DATA | + * sdata: ================== 2Meg + * | | + * | zero-padding | + * | | + * etext: ------------------ + * | | + * : : + * | | + * | __TEXT | + * | | + * : : + * | | + * stext: ================== 2Meg + * | | + * | zero-padding | + * | | + * eHIB: ------------------ + * | __HIB | + * : : + * + * Prior to changing the mapping from 4K to 2M, the zero-padding pages + * [eHIB,stext] and [etext,sdata] are ml_static_mfree()'d. Then all the + * 4K pages covering [stext,etext] are coalesced as 2M large pages. + * The now unused level-1 PTE pages are also freed. + */ +extern uint32_t pmap_reserved_ranges; +void +pmap_lowmem_finalize(void) +{ + spl_t spl; + int i; + + /* Check the kernel is linked at the expected base address */ + if (i386_btop(kvtophys((vm_offset_t) &IdlePML4)) != + I386_KERNEL_IMAGE_BASE_PAGE) + panic("pmap_lowmem_finalize() unexpected kernel base address"); + + /* + * Update wired memory statistics for early boot pages + */ + PMAP_ZINFO_PALLOC(bootstrap_wired_pages * PAGE_SIZE); + + /* + * Free all pages in pmap regions below the base: + * rdar://6332712 + * We can't free all the pages to VM that EFI reports available. + * Pages in the range 0xc0000-0xff000 aren't safe over sleep/wake. + * There's also a size miscalculation here: pend is one page less + * than it should be but this is not fixed to be backwards + * compatible. + * Due to this current EFI limitation, we take only the first + * entry in the memory region table. 
However, the loop is retained + * (with the intended termination criteria commented out) in the + * hope that some day we can free all low-memory ranges. + */ + for (i = 0; +// pmap_memory_regions[i].end <= I386_KERNEL_IMAGE_BASE_PAGE; + i < 1 && (pmap_reserved_ranges == 0); + i++) { + vm_offset_t pbase = (vm_offset_t)i386_ptob(pmap_memory_regions[i].base); + vm_offset_t pend = (vm_offset_t)i386_ptob(pmap_memory_regions[i].end); +// vm_offset_t pend = i386_ptob(pmap_memory_regions[i].end+1); + + DBG("ml_static_mfree(%p,%p) for pmap region %d\n", + (void *) ml_static_ptovirt(pbase), + (void *) (pend - pbase), i); + ml_static_mfree(ml_static_ptovirt(pbase), pend - pbase); + } + + /* + * If text and data are both 2MB-aligned, + * we can map text with large-pages, + * unless the -kernel_text_ps_4K boot-arg overrides. + */ + if ((stext & I386_LPGMASK) == 0 && (sdata & I386_LPGMASK) == 0) { + kprintf("Kernel text is 2MB aligned"); + kernel_text_ps_4K = FALSE; + if (PE_parse_boot_argn("-kernel_text_ps_4K", + &kernel_text_ps_4K, + sizeof (kernel_text_ps_4K))) + kprintf(" but will be mapped with 4K pages\n"); + else + kprintf(" and will be mapped with 2M pages\n"); + } + + (void) PE_parse_boot_argn("wpkernel", &wpkernel, sizeof (wpkernel)); + if (wpkernel) + kprintf("Kernel text %p-%p to be write-protected\n", + (void *) stext, (void *) etext); + + spl = splhigh(); + + /* + * Scan over text if mappings are to be changed: + * - Remap kernel text readonly unless the "wpkernel" boot-arg is 0 + * - Change to large-pages if possible and not overriden. + */ + if (kernel_text_ps_4K && wpkernel) { + vm_offset_t myva; + for (myva = stext; myva < etext; myva += PAGE_SIZE) { + pt_entry_t *ptep; + + ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva); + if (ptep) + pmap_store_pte(ptep, *ptep & ~INTEL_PTE_RW); + } + } + + if (!kernel_text_ps_4K) { + vm_offset_t myva; + + /* + * Release zero-filled page padding used for 2M-alignment. 
+ */ + DBG("ml_static_mfree(%p,%p) for padding below text\n", + (void *) eHIB, (void *) (stext - eHIB)); + ml_static_mfree(eHIB, stext - eHIB); + DBG("ml_static_mfree(%p,%p) for padding above text\n", + (void *) etext, (void *) (sdata - etext)); + ml_static_mfree(etext, sdata - etext); + + /* + * Coalesce text pages into large pages. + */ + for (myva = stext; myva < sdata; myva += I386_LPGBYTES) { + pt_entry_t *ptep; + vm_offset_t pte_phys; + pt_entry_t *pdep; + pt_entry_t pde; + + pdep = pmap_pde(kernel_pmap, (vm_map_offset_t)myva); + ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva); + DBG("myva: %p pdep: %p ptep: %p\n", + (void *) myva, (void *) pdep, (void *) ptep); + if ((*ptep & INTEL_PTE_VALID) == 0) + continue; + pte_phys = (vm_offset_t)(*ptep & PG_FRAME); + pde = *pdep & PTMASK; /* page attributes from pde */ + pde |= INTEL_PTE_PS; /* make it a 2M entry */ + pde |= pte_phys; /* take page frame from pte */ + + if (wpkernel) + pde &= ~INTEL_PTE_RW; + DBG("pmap_store_pte(%p,0x%llx)\n", + (void *)pdep, pde); + pmap_store_pte(pdep, pde); + + /* + * Free the now-unused level-1 pte. + * Note: ptep is a virtual address to the pte in the + * recursive map. We can't use this address to free + * the page. Instead we need to compute its address + * in the Idle PTEs in "low memory". 
+ */ + vm_offset_t vm_ptep = (vm_offset_t) KPTphys + + (pte_phys >> PTPGSHIFT); + DBG("ml_static_mfree(%p,0x%x) for pte\n", + (void *) vm_ptep, PAGE_SIZE); + ml_static_mfree(vm_ptep, PAGE_SIZE); + } + + /* Change variable read by sysctl machdep.pmap */ + pmap_kernel_text_ps = I386_LPGBYTES; + } + + /* map lowmem global page into fixed addr */ + pt_entry_t *pte = NULL; + if (0 == (pte = pmap_pte(kernel_pmap, + VM_MIN_KERNEL_LOADED_ADDRESS + 0x2000))) + panic("lowmem pte"); + /* make sure it is defined on page boundary */ + assert(0 == ((vm_offset_t) &lowGlo & PAGE_MASK)); + pmap_store_pte(pte, kvtophys((vm_offset_t)&lowGlo) + | INTEL_PTE_REF + | INTEL_PTE_MOD + | INTEL_PTE_WIRED + | INTEL_PTE_VALID + | INTEL_PTE_RW); + splx(spl); + if (pmap_pcid_ncpus) + tlb_flush_global(); + else + flush_tlb_raw(); +} /* * this function is only used for debugging fron the vm layer @@ -885,7 +971,8 @@ pmap_create( p = (pmap_t) zalloc(pmap_zone); if (PMAP_NULL == p) panic("pmap_create zalloc"); - + /* Zero all fields */ + bzero(p, sizeof(*p)); /* init counts now since we'll be bumping some */ simple_lock_init(&p->lock, 0); p->stats.resident_count = 0; @@ -896,15 +983,15 @@ pmap_create( p->pm_shared = FALSE; p->pm_task_map = is_64bit ? 
TASK_MAP_64BIT : TASK_MAP_32BIT;; + if (pmap_pcid_ncpus) + pmap_pcid_initialize(p); + p->pm_pml4 = zalloc(pmap_anchor_zone); - /* alloc the pml4 page in kernel vm */ - if (KERN_SUCCESS != kmem_alloc_kobject(kernel_map, (vm_offset_t *)(&p->pm_pml4), PAGE_SIZE)) - panic("pmap_create kmem_alloc_kobject pml4"); + pmap_assert((((uintptr_t)p->pm_pml4) & PAGE_MASK) == 0); - memset((char *)p->pm_pml4, 0, PAGE_SIZE); - p->pm_cr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_pml4); + memset((char *)p->pm_pml4, 0, PAGE_SIZE); - OSAddAtomic(1, &inuse_ptepages_count); + p->pm_cr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_pml4); /* allocate the vm_objs to hold the pdpt, pde and pte pages */ @@ -920,7 +1007,7 @@ pmap_create( if (NULL == p->pm_obj) panic("pmap_create pte obj"); - /* All pmaps share the kennel's pml4 */ + /* All pmaps share the kernel's pml4 */ pml4 = pmap64_pml4(p, 0ULL); kpml4 = kernel_pmap->pm_pml4; pml4[KERNEL_PML4_INDEX] = kpml4[KERNEL_PML4_INDEX]; @@ -940,10 +1027,9 @@ pmap_create( */ void -pmap_destroy( - register pmap_t p) +pmap_destroy(pmap_t p) { - register int c; + int c; if (p == PMAP_NULL) return; @@ -955,6 +1041,8 @@ pmap_destroy( c = --p->ref_count; + pmap_assert((current_thread() && (current_thread()->map)) ? 
(current_thread()->map->pmap != p) : TRUE); + if (c == 0) { /* * If some cpu is not using the physical pmap pointer that it @@ -964,12 +1052,14 @@ pmap_destroy( */ PMAP_UPDATE_TLBS(p, 0x0ULL, 0xFFFFFFFFFFFFF000ULL); } - + if (pmap_pcid_ncpus) + pmap_destroy_pcid_sync(p); PMAP_UNLOCK(p); if (c != 0) { PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END, p, 1, 0, 0, 0); + pmap_assert(p == kernel_pmap); return; /* still in use */ } @@ -979,8 +1069,7 @@ pmap_destroy( */ int inuse_ptepages = 0; - inuse_ptepages++; - kmem_free(kernel_map, (vm_offset_t)p->pm_pml4, PAGE_SIZE); + zfree(pmap_anchor_zone, p->pm_pml4); inuse_ptepages += p->pm_obj_pml4->resident_page_count; vm_object_deallocate(p->pm_obj_pml4); @@ -992,6 +1081,7 @@ pmap_destroy( vm_object_deallocate(p->pm_obj); OSAddAtomic(-inuse_ptepages, &inuse_ptepages_count); + PMAP_ZINFO_PFREE(inuse_ptepages * PAGE_SIZE); zfree(pmap_zone, p); @@ -1028,22 +1118,6 @@ pmap_remove_some_phys( } -/* - * Routine: - * pmap_disconnect - * - * Function: - * Disconnect all mappings for this page and return reference and change status - * in generic format. - * - */ -unsigned int pmap_disconnect( - ppnum_t pa) -{ - pmap_page_protect(pa, 0); /* disconnect the page */ - return (pmap_get_refmod(pa)); /* return ref/chg status */ -} - /* * Set the physical protection on the * specified range of this map as requested. @@ -1160,44 +1234,6 @@ pmap_map_block( } } -/* - * Routine: pmap_change_wiring - * Function: Change the wiring attribute for a map/virtual-address - * pair. - * In/out conditions: - * The mapping must already exist in the pmap. 
- */ -void -pmap_change_wiring( - pmap_t map, - vm_map_offset_t vaddr, - boolean_t wired) -{ - pt_entry_t *pte; - - PMAP_LOCK(map); - - if ((pte = pmap_pte(map, vaddr)) == PT_ENTRY_NULL) - panic("pmap_change_wiring: pte missing"); - - if (wired && !iswired(*pte)) { - /* - * wiring down mapping - */ - OSAddAtomic(+1, &map->stats.wired_count); - pmap_update_pte(pte, *pte, (*pte | INTEL_PTE_WIRED)); - } - else if (!wired && iswired(*pte)) { - /* - * unwiring mapping - */ - assert(map->stats.wired_count >= 1); - OSAddAtomic(-1, &map->stats.wired_count); - pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_WIRED)); - } - - PMAP_UNLOCK(map); -} void pmap_expand_pml4( @@ -1236,6 +1272,8 @@ pmap_expand_pml4( vm_page_unlock_queues(); OSAddAtomic(1, &inuse_ptepages_count); + OSAddAtomic64(1, &alloc_ptepages_count); + PMAP_ZINFO_PALLOC(PAGE_SIZE); /* Take the oject lock (mutex) before the PMAP_LOCK (spinlock) */ vm_object_lock(map->pm_obj_pml4); @@ -1251,6 +1289,7 @@ pmap_expand_pml4( VM_PAGE_FREE(m); OSAddAtomic(-1, &inuse_ptepages_count); + PMAP_ZINFO_PFREE(PAGE_SIZE); return; } @@ -1319,6 +1358,8 @@ pmap_expand_pdpt( vm_page_unlock_queues(); OSAddAtomic(1, &inuse_ptepages_count); + OSAddAtomic64(1, &alloc_ptepages_count); + PMAP_ZINFO_PALLOC(PAGE_SIZE); /* Take the oject lock (mutex) before the PMAP_LOCK (spinlock) */ vm_object_lock(map->pm_obj_pdpt); @@ -1334,6 +1375,7 @@ pmap_expand_pdpt( VM_PAGE_FREE(m); OSAddAtomic(-1, &inuse_ptepages_count); + PMAP_ZINFO_PFREE(PAGE_SIZE); return; } @@ -1430,6 +1472,8 @@ pmap_expand( vm_page_unlock_queues(); OSAddAtomic(1, &inuse_ptepages_count); + OSAddAtomic64(1, &alloc_ptepages_count); + PMAP_ZINFO_PALLOC(PAGE_SIZE); /* Take the oject lock (mutex) before the PMAP_LOCK (spinlock) */ vm_object_lock(map->pm_obj); @@ -1446,6 +1490,7 @@ pmap_expand( VM_PAGE_FREE(m); OSAddAtomic(-1, &inuse_ptepages_count); + PMAP_ZINFO_PFREE(PAGE_SIZE); return; } @@ -1478,7 +1523,8 @@ pmap_expand( * that pmap_steal_memory uses, rather than calling vm_page_grab 
(which * isn't available yet). */ void -pmap_pre_expand(pmap_t pmap, vm_map_offset_t vaddr) { +pmap_pre_expand(pmap_t pmap, vm_map_offset_t vaddr) +{ ppnum_t pn; pt_entry_t *pte; @@ -1645,11 +1691,12 @@ pmap_collect( if (m == VM_PAGE_NULL) panic("pmap_collect: pte page not in object"); + vm_object_unlock(p->pm_obj); + VM_PAGE_FREE(m); OSAddAtomic(-1, &inuse_ptepages_count); - - vm_object_unlock(p->pm_obj); + PMAP_ZINFO_PFREE(PAGE_SIZE); } PMAP_LOCK(p); @@ -1701,301 +1748,6 @@ pmap_pageable( #endif /* lint */ } -/* - * Clear specified attribute bits. - */ -void -phys_attribute_clear( - ppnum_t pn, - int bits) -{ - pv_rooted_entry_t pv_h; - pv_hashed_entry_t pv_e; - pt_entry_t *pte; - int pai; - pmap_t pmap; - - pmap_intr_assert(); - assert(pn != vm_page_fictitious_addr); - if (pn == vm_page_guard_addr) - return; - - pai = ppn_to_pai(pn); - - if (!IS_MANAGED_PAGE(pai)) { - /* - * Not a managed page. - */ - return; - } - - - PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, - pn, bits, 0, 0, 0); - - pv_h = pai_to_pvh(pai); - - LOCK_PVH(pai); - - /* - * Walk down PV list, clearing all modify or reference bits. - * We do not have to lock the pv_list because we have - * the entire pmap system locked. - */ - if (pv_h->pmap != PMAP_NULL) { - /* - * There are some mappings. - */ - - pv_e = (pv_hashed_entry_t)pv_h; - - do { - vm_map_offset_t va; - - pmap = pv_e->pmap; - va = pv_e->va; - - /* - * Clear modify and/or reference bits. - */ - pte = pmap_pte(pmap, va); - pmap_update_pte(pte, *pte, (*pte & ~bits)); - /* Ensure all processors using this translation - * invalidate this TLB entry. The invalidation *must* - * follow the PTE update, to ensure that the TLB - * shadow of the 'D' bit (in particular) is - * synchronized with the updated PTE. 
- */ - PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE); - - pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink); - - } while (pv_e != (pv_hashed_entry_t)pv_h); - } - pmap_phys_attributes[pai] &= ~bits; - - UNLOCK_PVH(pai); - - PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END, - 0, 0, 0, 0, 0); -} - -/* - * Check specified attribute bits. - */ -int -phys_attribute_test( - ppnum_t pn, - int bits) -{ - pv_rooted_entry_t pv_h; - pv_hashed_entry_t pv_e; - pt_entry_t *pte; - int pai; - pmap_t pmap; - int attributes = 0; - - pmap_intr_assert(); - assert(pn != vm_page_fictitious_addr); - if (pn == vm_page_guard_addr) - return 0; - - pai = ppn_to_pai(pn); - - if (!IS_MANAGED_PAGE(pai)) { - /* - * Not a managed page. - */ - return 0; - } - - /* - * super fast check... if bits already collected - * no need to take any locks... - * if not set, we need to recheck after taking - * the lock in case they got pulled in while - * we were waiting for the lock - */ - if ((pmap_phys_attributes[pai] & bits) == bits) - return bits; - - pv_h = pai_to_pvh(pai); - - LOCK_PVH(pai); - - attributes = pmap_phys_attributes[pai] & bits; - - - /* - * Walk down PV list, checking the mappings until we - * reach the end or we've found the attributes we've asked for - * We do not have to lock the pv_list because we have - * the entire pmap system locked. - */ - if (attributes != bits && - pv_h->pmap != PMAP_NULL) { - /* - * There are some mappings. 
- */ - pv_e = (pv_hashed_entry_t)pv_h; - do { - vm_map_offset_t va; - - pmap = pv_e->pmap; - va = pv_e->va; - /* - * first make sure any processor actively - * using this pmap, flushes its TLB state - */ - PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE); - - /* - * pick up modify and/or reference bits from mapping - */ - - pte = pmap_pte(pmap, va); - attributes |= (int)(*pte & bits); - - pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink); - - } while ((attributes != bits) && - (pv_e != (pv_hashed_entry_t)pv_h)); - } - - UNLOCK_PVH(pai); - return (attributes); -} - -/* - * Set specified attribute bits. - */ -void -phys_attribute_set( - ppnum_t pn, - int bits) -{ - int pai; - - pmap_intr_assert(); - assert(pn != vm_page_fictitious_addr); - if (pn == vm_page_guard_addr) - return; - - pai = ppn_to_pai(pn); - - if (!IS_MANAGED_PAGE(pai)) { - /* Not a managed page. */ - return; - } - - LOCK_PVH(pai); - pmap_phys_attributes[pai] |= bits; - UNLOCK_PVH(pai); -} - -/* - * Set the modify bit on the specified physical page. - */ - -void -pmap_set_modify(ppnum_t pn) -{ - phys_attribute_set(pn, PHYS_MODIFIED); -} - -/* - * Clear the modify bits on the specified physical page. - */ - -void -pmap_clear_modify(ppnum_t pn) -{ - phys_attribute_clear(pn, PHYS_MODIFIED); -} - -/* - * pmap_is_modified: - * - * Return whether or not the specified physical page is modified - * by any physical maps. - */ - -boolean_t -pmap_is_modified(ppnum_t pn) -{ - if (phys_attribute_test(pn, PHYS_MODIFIED)) - return TRUE; - return FALSE; -} - -/* - * pmap_clear_reference: - * - * Clear the reference bit on the specified physical page. - */ - -void -pmap_clear_reference(ppnum_t pn) -{ - phys_attribute_clear(pn, PHYS_REFERENCED); -} - -void -pmap_set_reference(ppnum_t pn) -{ - phys_attribute_set(pn, PHYS_REFERENCED); -} - -/* - * pmap_is_referenced: - * - * Return whether or not the specified physical page is referenced - * by any physical maps. 
- */ - -boolean_t -pmap_is_referenced(ppnum_t pn) -{ - if (phys_attribute_test(pn, PHYS_REFERENCED)) - return TRUE; - return FALSE; -} - -/* - * pmap_get_refmod(phys) - * returns the referenced and modified bits of the specified - * physical page. - */ -unsigned int -pmap_get_refmod(ppnum_t pn) -{ - int refmod; - unsigned int retval = 0; - - refmod = phys_attribute_test(pn, PHYS_MODIFIED | PHYS_REFERENCED); - - if (refmod & PHYS_MODIFIED) - retval |= VM_MEM_MODIFIED; - if (refmod & PHYS_REFERENCED) - retval |= VM_MEM_REFERENCED; - - return (retval); -} - -/* - * pmap_clear_refmod(phys, mask) - * clears the referenced and modified bits as specified by the mask - * of the specified physical page. - */ -void -pmap_clear_refmod(ppnum_t pn, unsigned int mask) -{ - unsigned int x86Mask; - - x86Mask = ( ((mask & VM_MEM_MODIFIED)? PHYS_MODIFIED : 0) - | ((mask & VM_MEM_REFERENCED)? PHYS_REFERENCED : 0)); - phys_attribute_clear(pn, x86Mask); -} void invalidate_icache(__unused vm_offset_t addr, @@ -2023,10 +1775,13 @@ extern kern_return_t dtrace_copyio_postflight(addr64_t); kern_return_t dtrace_copyio_preflight(__unused addr64_t va) { thread_t thread = current_thread(); + uint64_t ccr3; if (current_map() == kernel_map) return KERN_FAILURE; - else if (get_cr3() != thread->map->pmap->pm_cr3) + else if (((ccr3 = get_cr3_base()) != thread->map->pmap->pm_cr3) && (no_shared_cr3 == FALSE)) + return KERN_FAILURE; + else if (no_shared_cr3 && (ccr3 != kernel_pmap->pm_cr3)) return KERN_FAILURE; else if (thread->machine.specFlags & CopyIOActive) return KERN_FAILURE; @@ -2090,6 +1845,8 @@ phys_page_exists(ppnum_t pn) return TRUE; } + + void pmap_switch(pmap_t tpmap) { @@ -2111,6 +1868,12 @@ pmap_disable_NX(pmap_t pmap) pmap->nx_enabled = 0; } +void +pt_fake_zone_init(int zone_index) +{ + pt_fake_zone_index = zone_index; +} + void pt_fake_zone_info( int *count, @@ -2118,8 +1881,10 @@ pt_fake_zone_info( vm_size_t *max_size, vm_size_t *elem_size, vm_size_t *alloc_size, + uint64_t *sum_size, 
int *collectable, - int *exhaustable) + int *exhaustable, + int *caller_acct) { *count = inuse_ptepages_count; *cur_size = PAGE_SIZE * inuse_ptepages_count; @@ -2129,13 +1894,13 @@ pt_fake_zone_info( vm_page_free_count); *elem_size = PAGE_SIZE; *alloc_size = PAGE_SIZE; + *sum_size = alloc_ptepages_count * PAGE_SIZE; *collectable = 1; *exhaustable = 0; + *caller_acct = 1; } -extern long NMIPI_acks; - static inline void pmap_cpuset_NMIPI(cpu_set cpu_mask) { unsigned int cpu, cpu_bit; @@ -2159,8 +1924,9 @@ pmap_cpuset_NMIPI(cpu_set cpu_mask) { * - flush the local tlb if active for this pmap * - return ... the caller will unlock the pmap */ + void -pmap_flush_tlbs(pmap_t pmap) +pmap_flush_tlbs(pmap_t pmap, vm_map_offset_t startv, vm_map_offset_t endv) { unsigned int cpu; unsigned int cpu_bit; @@ -2169,6 +1935,7 @@ pmap_flush_tlbs(pmap_t pmap) pmap_paddr_t pmap_cr3 = pmap->pm_cr3; boolean_t flush_self = FALSE; uint64_t deadline; + boolean_t pmap_is_shared = (pmap->pm_shared || (pmap == kernel_pmap)); assert((processor_avail_count < 2) || (ml_get_interrupts_enabled() && get_preemption_level() != 0)); @@ -2179,6 +1946,12 @@ pmap_flush_tlbs(pmap_t pmap) * don't signal -- they'll check as they go busy. 
*/ cpus_to_signal = 0; + + if (pmap_pcid_ncpus) { + pmap_pcid_invalidate_all_cpus(pmap); + __asm__ volatile("mfence":::"memory"); + } + for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) { if (!cpu_datap(cpu)->cpu_running) continue; @@ -2187,14 +1960,16 @@ pmap_flush_tlbs(pmap_t pmap) if ((pmap_cr3 == cpu_task_cr3) || (pmap_cr3 == cpu_active_cr3) || - (pmap->pm_shared) || - (pmap == kernel_pmap)) { + (pmap_is_shared)) { if (cpu == my_cpu) { flush_self = TRUE; continue; } - cpu_datap(cpu)->cpu_tlb_invalid = TRUE; - __asm__ volatile("mfence"); + if (pmap_pcid_ncpus && pmap_is_shared) + cpu_datap(cpu)->cpu_tlb_invalid_global = TRUE; + else + cpu_datap(cpu)->cpu_tlb_invalid_local = TRUE; + __asm__ volatile("mfence":::"memory"); /* * We don't need to signal processors which will flush @@ -2220,15 +1995,24 @@ pmap_flush_tlbs(pmap_t pmap) } } - PMAP_TRACE(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_START, - pmap, cpus_to_signal, flush_self, 0, 0); + PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_START, + pmap, cpus_to_signal, flush_self, startv, endv); /* * Flush local tlb if required. * Do this now to overlap with other processors responding. */ - if (flush_self) - flush_tlb(); + if (flush_self) { + if (pmap_pcid_ncpus) { + pmap_pcid_validate_cpu(pmap, my_cpu); + if (pmap_is_shared) + tlb_flush_global(); + else + flush_tlb_raw(); + } + else + flush_tlb_raw(); + } if (cpus_to_signal) { cpu_set cpus_to_respond = cpus_to_signal; @@ -2241,6 +2025,9 @@ pmap_flush_tlbs(pmap_t pmap) long orig_acks = 0; for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) { + /* Consider checking local/global invalidity + * as appropriate in the PCID case. 
+ */ if ((cpus_to_respond & cpu_bit) != 0) { if (!cpu_datap(cpu)->cpu_running || cpu_datap(cpu)->cpu_tlb_invalid == FALSE || @@ -2252,7 +2039,7 @@ pmap_flush_tlbs(pmap_t pmap) if (cpus_to_respond == 0) break; } - if (mach_absolute_time() > deadline) { + if (cpus_to_respond && (mach_absolute_time() > deadline)) { if (machine_timeout_suspended()) continue; pmap_tlb_flush_timeout = TRUE; @@ -2266,18 +2053,31 @@ pmap_flush_tlbs(pmap_t pmap) } } - PMAP_TRACE(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_END, - pmap, cpus_to_signal, flush_self, 0, 0); + PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_END, + pmap, cpus_to_signal, startv, endv, 0); } void process_pmap_updates(void) { - assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0); - - flush_tlb(); + int ccpu = cpu_number(); + pmap_assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0); + if (pmap_pcid_ncpus) { + pmap_pcid_validate_current(); + if (cpu_datap(ccpu)->cpu_tlb_invalid_global) { + cpu_datap(ccpu)->cpu_tlb_invalid = FALSE; + tlb_flush_global(); + } + else { + cpu_datap(ccpu)->cpu_tlb_invalid_local = FALSE; + flush_tlb_raw(); + } + } + else { + current_cpu_datap()->cpu_tlb_invalid = FALSE; + flush_tlb_raw(); + } - current_cpu_datap()->cpu_tlb_invalid = FALSE; __asm__ volatile("mfence"); } @@ -2292,13 +2092,3 @@ pmap_update_interrupt(void) PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_END, 0, 0, 0, 0, 0); } - - -unsigned int -pmap_cache_attributes(ppnum_t pn) -{ - return IS_MANAGED_PAGE(ppn_to_pai(pn)) ? VM_WIMG_COPYBACK - : VM_WIMG_IO; -} - - diff --git a/osfmk/x86_64/pmap_pcid.c b/osfmk/x86_64/pmap_pcid.c new file mode 100644 index 000000000..c8fef93b4 --- /dev/null +++ b/osfmk/x86_64/pmap_pcid.c @@ -0,0 +1,310 @@ +/* + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * PCID (Process context identifier) aka tagged TLB support. + * On processors with this feature, unless disabled via the -pmap_pcid_disable + * boot-arg, the following algorithm is in effect: + * Each processor maintains an array of tag refcounts indexed by tag. + * Each address space maintains an array of tags indexed by CPU number. + * Each address space maintains a coherency vector, indexed by CPU + * indicating that the TLB state for that address space has a pending + * invalidation. + * On a context switch, a refcounted tag is lazily assigned to the newly + * dispatched (CPU, address space) tuple. 
+ * When an inactive address space is invalidated on a remote CPU, it is marked + * for invalidation upon the next dispatch. Some invalidations are + * also processed at the user/kernel boundary. + * Provisions are made for the case where a CPU is overcommmitted, i.e. + * more active address spaces exist than the number of logical tags + * provided for by the processor architecture (currently 4096). + * The algorithm assumes the processor remaps the logical tags + * to physical TLB context IDs in an LRU fashion for efficiency. (DRK '10) + */ + +uint32_t pmap_pcid_ncpus; +boolean_t pmap_pcid_disabled = FALSE; + +void pmap_pcid_configure(void) { + int ccpu = cpu_number(); + uintptr_t cr4 = get_cr4(); + boolean_t pcid_present = FALSE; + + pmap_pcid_log("PCID configure invoked on CPU %d\n", ccpu); + pmap_assert(ml_get_interrupts_enabled() == FALSE || get_preemption_level() !=0); + pmap_assert(cpu_mode_is64bit()); + + if (PE_parse_boot_argn("-pmap_pcid_disable", &pmap_pcid_disabled, sizeof (pmap_pcid_disabled))) { + pmap_pcid_log("PMAP: PCID feature disabled\n"); + printf("PMAP: PCID feature disabled, %u\n", pmap_pcid_disabled); + kprintf("PMAP: PCID feature disabled %u\n", pmap_pcid_disabled); + } + /* no_shared_cr3+PCID is currently unsupported */ +#if DEBUG + if (pmap_pcid_disabled == FALSE) + no_shared_cr3 = FALSE; + else + no_shared_cr3 = TRUE; +#else + if (no_shared_cr3) + pmap_pcid_disabled = TRUE; +#endif + if (pmap_pcid_disabled || no_shared_cr3) { + unsigned i; + /* Reset PCID status, as we may have picked up + * strays if discovered prior to platform + * expert initialization. + */ + for (i = 0; i < real_ncpus; i++) { + if (cpu_datap(i)) { + cpu_datap(i)->cpu_pmap_pcid_enabled = FALSE; + } + pmap_pcid_ncpus = 0; + } + cpu_datap(ccpu)->cpu_pmap_pcid_enabled = FALSE; + return; + } + /* DRKTODO: assert if features haven't been discovered yet. Redundant + * invocation of cpu_mode_init and descendants masks this for now. 
+ */ + if ((cpuid_features() & CPUID_FEATURE_PCID)) + pcid_present = TRUE; + else { + cpu_datap(ccpu)->cpu_pmap_pcid_enabled = FALSE; + pmap_pcid_log("PMAP: PCID not detected CPU %d\n", ccpu); + return; + } + if ((cr4 & (CR4_PCIDE | CR4_PGE)) == (CR4_PCIDE|CR4_PGE)) { + cpu_datap(ccpu)->cpu_pmap_pcid_enabled = TRUE; + pmap_pcid_log("PMAP: PCID already enabled %d\n", ccpu); + return; + } + if (pcid_present == TRUE) { + pmap_pcid_log("Pre-PCID:CR0: 0x%lx, CR3: 0x%lx, CR4(CPU %d): 0x%lx\n", get_cr0(), get_cr3_raw(), ccpu, cr4); + + if (cpu_number() >= PMAP_PCID_MAX_CPUS) { + panic("PMAP_PCID_MAX_CPUS %d\n", cpu_number()); + } + if ((get_cr4() & CR4_PGE) == 0) { + set_cr4(get_cr4() | CR4_PGE); + pmap_pcid_log("Toggled PGE ON (CPU: %d\n", ccpu); + } + set_cr4(get_cr4() | CR4_PCIDE); + pmap_pcid_log("Post PCID: CR0: 0x%lx, CR3: 0x%lx, CR4(CPU %d): 0x%lx\n", get_cr0(), get_cr3_raw(), ccpu, get_cr4()); + tlb_flush_global(); + cpu_datap(ccpu)->cpu_pmap_pcid_enabled = TRUE; + + if (OSIncrementAtomic(&pmap_pcid_ncpus) == machine_info.max_cpus) { + pmap_pcid_log("All PCIDs enabled: real_ncpus: %d, pmap_pcid_ncpus: %d\n", real_ncpus, pmap_pcid_ncpus); + } + cpu_datap(ccpu)->cpu_pmap_pcid_coherentp = + cpu_datap(ccpu)->cpu_pmap_pcid_coherentp_kernel = + &(kernel_pmap->pmap_pcid_coherency_vector[ccpu]); + cpu_datap(ccpu)->cpu_pcid_refcounts[0] = 1; + } +} + +void pmap_pcid_initialize(pmap_t p) { + unsigned i; + unsigned nc = sizeof(p->pmap_pcid_cpus)/sizeof(pcid_t); + + pmap_assert(nc >= real_ncpus); + for (i = 0; i < nc; i++) { + p->pmap_pcid_cpus[i] = PMAP_PCID_INVALID_PCID; + /* We assume here that the coherency vector is zeroed by + * pmap_create + */ + } +} + +void pmap_pcid_initialize_kernel(pmap_t p) { + unsigned i; + unsigned nc = sizeof(p->pmap_pcid_cpus)/sizeof(pcid_t); + + for (i = 0; i < nc; i++) { + p->pmap_pcid_cpus[i] = 0; + /* We assume here that the coherency vector is zeroed by + * pmap_create + */ + } +} + +pcid_t pmap_pcid_allocate_pcid(int ccpu) { + int i; + 
pcid_ref_t cur_min = 0xFF; + uint32_t cur_min_index = ~1; + pcid_ref_t *cpu_pcid_refcounts = &cpu_datap(ccpu)->cpu_pcid_refcounts[0]; + pcid_ref_t old_count; + + if ((i = cpu_datap(ccpu)->cpu_pcid_free_hint) != 0) { + if (cpu_pcid_refcounts[i] == 0) { + (void)__sync_fetch_and_add(&cpu_pcid_refcounts[i], 1); + cpu_datap(ccpu)->cpu_pcid_free_hint = 0; + return i; + } + } + /* Linear scan to discover free slot, with hint. Room for optimization + * but with intelligent prefetchers this should be + * adequately performant, as it is invoked + * only on first dispatch of a new address space onto + * a given processor. DRKTODO: use larger loads and + * zero byte discovery -- any pattern != ~1 should + * signify a free slot. + */ + for (i = PMAP_PCID_MIN_PCID; i < PMAP_PCID_MAX_PCID; i++) { + pcid_ref_t cur_refcount = cpu_pcid_refcounts[i]; + + pmap_assert(cur_refcount < PMAP_PCID_MAX_REFCOUNT); + + if (cur_refcount == 0) { + (void)__sync_fetch_and_add(&cpu_pcid_refcounts[i], 1); + return i; + } + else { + if (cur_refcount < cur_min) { + cur_min_index = i; + cur_min = cur_refcount; + } + } + } + pmap_assert(cur_min_index > 0 && cur_min_index < PMAP_PCID_MAX_PCID); + /* Consider "rebalancing" tags actively in highly oversubscribed cases + * perhaps selecting tags with lower activity. 
+ */ + + old_count = __sync_fetch_and_add(&cpu_pcid_refcounts[cur_min_index], 1); + pmap_assert(old_count < PMAP_PCID_MAX_REFCOUNT); + return cur_min_index; +} + +void pmap_pcid_deallocate_pcid(int ccpu, pmap_t tpmap) { + pcid_t pcid; + pmap_t lp; + pcid_ref_t prior_count; + + pcid = tpmap->pmap_pcid_cpus[ccpu]; + pmap_assert(pcid != PMAP_PCID_INVALID_PCID); + if (pcid == PMAP_PCID_INVALID_PCID) + return; + + lp = cpu_datap(ccpu)->cpu_pcid_last_pmap_dispatched[pcid]; + pmap_assert(pcid > 0 && pcid < PMAP_PCID_MAX_PCID); + pmap_assert(cpu_datap(ccpu)->cpu_pcid_refcounts[pcid] >= 1); + + if (lp == tpmap) + (void)__sync_bool_compare_and_swap(&cpu_datap(ccpu)->cpu_pcid_last_pmap_dispatched[pcid], tpmap, PMAP_INVALID); + + if ((prior_count = __sync_fetch_and_sub(&cpu_datap(ccpu)->cpu_pcid_refcounts[pcid], 1)) == 1) { + cpu_datap(ccpu)->cpu_pcid_free_hint = pcid; + } + pmap_assert(prior_count <= PMAP_PCID_MAX_REFCOUNT); +} + +void pmap_destroy_pcid_sync(pmap_t p) { + int i; + pmap_assert(ml_get_interrupts_enabled() == FALSE || get_preemption_level() !=0); + for (i = 0; i < PMAP_PCID_MAX_CPUS; i++) + if (p->pmap_pcid_cpus[i] != PMAP_PCID_INVALID_PCID) + pmap_pcid_deallocate_pcid(i, p); +} + +pcid_t pcid_for_pmap_cpu_tuple(pmap_t pmap, int ccpu) { + return pmap->pmap_pcid_cpus[ccpu]; +} +#if PMAP_ASSERT +#define PCID_RECORD_SIZE 128 +uint64_t pcid_record_array[PCID_RECORD_SIZE]; +#endif + +void pmap_pcid_activate(pmap_t tpmap, int ccpu) { + pcid_t new_pcid = tpmap->pmap_pcid_cpus[ccpu]; + pmap_t last_pmap; + boolean_t pcid_conflict = FALSE, pending_flush = FALSE; + + pmap_assert(cpu_datap(ccpu)->cpu_pmap_pcid_enabled); + if (__improbable(new_pcid == PMAP_PCID_INVALID_PCID)) { + new_pcid = tpmap->pmap_pcid_cpus[ccpu] = pmap_pcid_allocate_pcid(ccpu); + } + pmap_assert(new_pcid != PMAP_PCID_INVALID_PCID); +#ifdef PCID_ASSERT + cpu_datap(ccpu)->cpu_last_pcid = cpu_datap(ccpu)->cpu_active_pcid; +#endif + cpu_datap(ccpu)->cpu_active_pcid = new_pcid; + + pending_flush = 
(tpmap->pmap_pcid_coherency_vector[ccpu] != 0); + if (__probable(pending_flush == FALSE)) { + last_pmap = cpu_datap(ccpu)->cpu_pcid_last_pmap_dispatched[new_pcid]; + pcid_conflict = ((last_pmap != NULL) &&(tpmap != last_pmap)); + } + if (__improbable(pending_flush || pcid_conflict)) { + pmap_pcid_validate_cpu(tpmap, ccpu); + } + /* Consider making this a unique id */ + cpu_datap(ccpu)->cpu_pcid_last_pmap_dispatched[new_pcid] = tpmap; + + pmap_assert(new_pcid < PMAP_PCID_MAX_PCID); + pmap_assert(((tpmap == kernel_pmap) && new_pcid == 0) || ((new_pcid != PMAP_PCID_INVALID_PCID) && (new_pcid != 0))); +#if PMAP_ASSERT + pcid_record_array[ccpu % PCID_RECORD_SIZE] = tpmap->pm_cr3 | new_pcid | (((uint64_t)(!(pending_flush || pcid_conflict))) <<63); + pml4_entry_t *pml4 = pmap64_pml4(tpmap, 0ULL); + /* Diagnostic to detect pagetable anchor corruption */ + if (pml4[KERNEL_PML4_INDEX] != kernel_pmap->pm_pml4[KERNEL_PML4_INDEX]) + __asm__ volatile("int3"); +#endif /* PMAP_ASSERT */ + set_cr3_composed(tpmap->pm_cr3, new_pcid, !(pending_flush || pcid_conflict)); + + if (!pending_flush) { + /* We did not previously observe a pending invalidation for this + * ASID. However, the load from the coherency vector + * could've been reordered ahead of the store to the + * active_cr3 field (in the context switch path, our + * caller). Re-consult the pending invalidation vector + * after the CR3 write. We rely on MOV CR3's documented + * serializing property to avoid insertion of an expensive + * barrier. 
(DRK) + */ + pending_flush = (tpmap->pmap_pcid_coherency_vector[ccpu] != 0); + if (__improbable(pending_flush != 0)) { + pmap_pcid_validate_cpu(tpmap, ccpu); + set_cr3_composed(tpmap->pm_cr3, new_pcid, FALSE); + } + } + cpu_datap(ccpu)->cpu_pmap_pcid_coherentp = &(tpmap->pmap_pcid_coherency_vector[ccpu]); +#if DEBUG + KERNEL_DEBUG_CONSTANT(0x9c1d0000, tpmap, new_pcid, pending_flush, pcid_conflict, 0); +#endif +} diff --git a/osfmk/x86_64/start.s b/osfmk/x86_64/start.s index fd0b8491a..8ca246de3 100644 --- a/osfmk/x86_64/start.s +++ b/osfmk/x86_64/start.s @@ -84,7 +84,7 @@ EXT(low_intstack): .globl EXT(gIOHibernateRestoreStack) EXT(gIOHibernateRestoreStack): - .set ., .+INTSTACK_SIZE + .space INTSTACK_SIZE .globl EXT(low_eintstack) EXT(low_eintstack:) @@ -101,7 +101,7 @@ EXT(gIOHibernateRestoreStackEnd): .align 12 .globl EXT(df_task_stack) EXT(df_task_stack): - .set ., .+INTSTACK_SIZE + .space INTSTACK_SIZE .globl EXT(df_task_stack_end) EXT(df_task_stack_end): @@ -112,37 +112,10 @@ EXT(df_task_stack_end): .align 12 .globl EXT(mc_task_stack) EXT(mc_task_stack): - .set ., .+INTSTACK_SIZE + .space INTSTACK_SIZE .globl EXT(mc_task_stack_end) EXT(mc_task_stack_end): - -#if MACH_KDB -/* - * Kernel debugger stack for each processor. - */ - .align 12 - .globl EXT(db_stack_store) -EXT(db_stack_store): - .set ., .+(INTSTACK_SIZE*MAX_CPUS) - -/* - * Stack for last-ditch debugger task for each processor. - */ - .align 12 - .globl EXT(db_task_stack_store) -EXT(db_task_stack_store): - .set ., .+(INTSTACK_SIZE*MAX_CPUS) - -/* - * per-processor kernel debugger stacks - */ - .align ALIGN - .globl EXT(kgdb_stack_store) -EXT(kgdb_stack_store): - .set ., .+(INTSTACK_SIZE*MAX_CPUS) -#endif /* MACH_KDB */ - /* * BSP CPU start here. * eax points to kernbootstruct @@ -176,7 +149,6 @@ EXT(kgdb_stack_store): * This proves that Little Endian is superior to Big Endian. 
*/ - .text .align ALIGN .globl EXT(_start) @@ -222,14 +194,6 @@ LEXT(_pstart) movl $EXT(protected_mode_gdtr), %eax lgdtl (%eax) - mov $(KERNEL_DS), %ax - mov %ax, %ds - mov %ax, %es - mov %ax, %ss - xor %eax, %eax - mov %ax, %fs - mov %ax, %gs - /* the following code is shared by the master CPU and all slave CPUs */ L_pstart_common: /* @@ -237,6 +201,14 @@ L_pstart_common: */ SWITCH_TO_64BIT_MODE + /* Flush data segment selectors */ + xor %eax, %eax + mov %ax, %ss + mov %ax, %ds + mov %ax, %es + mov %ax, %fs + mov %ax, %gs + /* %edi = boot_args_start */ leaq _vstart(%rip), %rcx @@ -441,8 +413,12 @@ ENTRY(acpi_sleep_cpu) movw %gs, saved_gs(%rip) movw %ss, saved_ss(%rip) - /* save the 64bit kernel gs base */ + /* save the 64bit user and kernel gs base */ + /* note: user's currently swapped into kernel base MSR */ mov $MSR_IA32_KERNEL_GS_BASE, %rcx + rdmsr + movl %eax, saved_ugs_base(%rip) + movl %edx, saved_ugs_base+4(%rip) swapgs rdmsr movl %eax, saved_kgs_base(%rip) @@ -519,8 +495,9 @@ Lwake_64: /* protected mode, paging enabled */ POSTCODE(ACPI_WAKE_PAGED_ENTRY) - /* switch to kernel data segment */ - movw $(KERNEL_DS), %ax + /* load null segment selectors */ + xor %eax, %eax + movw %ax, %ss movw %ax, %ds /* restore local and interrupt descriptor tables */ @@ -529,20 +506,20 @@ Lwake_64: /* restore segment registers */ movw saved_es(%rip), %es + movw saved_fs(%rip), %fs + movw saved_gs(%rip), %gs movw saved_ss(%rip), %ss - /* Program FS/GS with a NULL selector, precautionary */ - xor %rax, %rax - movw %ax, %fs - movw %ax, %gs - /* restore the 64bit kernel gs base */ + /* restore the 64bit kernel and user gs base */ mov $MSR_IA32_KERNEL_GS_BASE, %rcx movl saved_kgs_base(%rip), %eax movl saved_kgs_base+4(%rip), %edx wrmsr swapgs + movl saved_ugs_base(%rip), %eax + movl saved_ugs_base+4(%rip), %edx + wrmsr - //K64todo verify this TSS stuff /* * Restore task register. Before doing this, clear the busy flag * in the TSS descriptor set by the CPU.
@@ -663,4 +640,5 @@ saved_idt: .word 0 saved_ldt: .word 0 saved_tr: .word 0 saved_kgs_base: .quad 0 +saved_ugs_base: .quad 0 diff --git a/pexpert/Makefile b/pexpert/Makefile index abccc00b0..6e7b0b31a 100644 --- a/pexpert/Makefile +++ b/pexpert/Makefile @@ -8,36 +8,18 @@ include $(MakeInc_cmd) include $(MakeInc_def) INSTINC_SUBDIRS = pexpert - - -INSTINC_SUBDIRS_PPC = pexpert - INSTINC_SUBDIRS_I386 = pexpert - - INSTINC_SUBDIRS_X86_64 = pexpert - - INSTINC_SUBDIRS_ARM = pexpert EXPINC_SUBDIRS = pexpert - - -EXPINC_SUBDIRS_PPC = pexpert - - EXPINC_SUBDIRS_I386 = pexpert - - EXPINC_SUBDIRS_X86_64 = pexpert - - EXPINC_SUBDIRS_ARM = pexpert -SETUP_SUBDIRS = \ - conf +SETUP_SUBDIRS = COMP_SUBDIRS = \ conf diff --git a/pexpert/conf/MASTER b/pexpert/conf/MASTER index f622474d6..7731f8388 100644 --- a/pexpert/conf/MASTER +++ b/pexpert/conf/MASTER @@ -97,4 +97,3 @@ options CONFIG_NO_KPRINTF_STRINGS # # embedded device # options CONFIG_EMBEDDED # - diff --git a/pexpert/conf/MASTER.i386 b/pexpert/conf/MASTER.i386 index f4e41a8e6..94fb5056c 100644 --- a/pexpert/conf/MASTER.i386 +++ b/pexpert/conf/MASTER.i386 @@ -7,7 +7,6 @@ # PROFILE = [ RELEASE profile ] # DEBUG = [ RELEASE debug ] # -# # EMBEDDED = [ intel mach mach_pe panic_info ] # DEVELOPMENT = [ EMBEDDED config_dtrace ] # diff --git a/pexpert/conf/MASTER.ppc b/pexpert/conf/MASTER.ppc deleted file mode 100644 index f36c6062d..000000000 --- a/pexpert/conf/MASTER.ppc +++ /dev/null @@ -1,18 +0,0 @@ -# -###################################################################### -# -# Standard NeXT Research Configurations: -# -------- ---- -------- --------------- -# -# RELEASE = [ppc mach mach_pe panic_info config_dtrace] -# DEVELOPMENT = [ RELEASE ] -# RELEASE_TRACE = [RELEASE kdebug] -# PROFILE = [RELEASE profile] -# DEBUG = [RELEASE debug] -# DEBUG_TRACE = [DEBUG kdebug] -# -###################################################################### - -machine "ppc" # -cpu "ppc" # - diff --git a/pexpert/conf/MASTER.x86_64 
b/pexpert/conf/MASTER.x86_64 index 536c4eb59..9283af226 100644 --- a/pexpert/conf/MASTER.x86_64 +++ b/pexpert/conf/MASTER.x86_64 @@ -7,7 +7,6 @@ # PROFILE = [ RELEASE profile ] # DEBUG = [ RELEASE debug ] # -# # EMBEDDED = [ intel mach mach_pe panic_info ] # DEVELOPMENT = [ EMBEDDED ] # diff --git a/pexpert/conf/Makefile b/pexpert/conf/Makefile index 93eb84150..06a9defdf 100644 --- a/pexpert/conf/Makefile +++ b/pexpert/conf/Makefile @@ -7,8 +7,7 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir include $(MakeInc_cmd) include $(MakeInc_def) -SETUP_SUBDIRS = \ - tools +SETUP_SUBDIRS = COMP_SUBDIRS = @@ -24,30 +23,24 @@ else export COMPOBJROOT=$(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)/$(COMPONENT) endif -$(COMPOBJROOT)/doconf: - @make build_setup +MASTER_CPU_PER_SOC = $(SOURCE)/MASTER.$(ARCH_CONFIG_LC).$(MACHINE_CONFIG_LC) $(COMPOBJROOT)/$(PEXPERT_KERNEL_CONFIG)/Makefile : $(SOURCE)/MASTER \ $(SOURCE)/MASTER.$(ARCH_CONFIG_LC) \ $(SOURCE)/Makefile.template \ $(SOURCE)/Makefile.$(ARCH_CONFIG_LC) \ $(SOURCE)/files \ - $(SOURCE)/files.$(ARCH_CONFIG_LC) \ - $(COMPOBJROOT)/doconf + $(SOURCE)/files.$(ARCH_CONFIG_LC) $(_v)(doconf_target=$(addsuffix /conf, $(TARGET)); \ $(MKDIR) $${doconf_target}; \ cd $${doconf_target}; \ rm -f $(notdir $?); \ cp $? 
$${doconf_target}; \ - $(COMPOBJROOT)/doconf -c -cpu $(ARCH_CONFIG_LC) -d $(TARGET)/$(PEXPERT_KERNEL_CONFIG) $(PEXPERT_KERNEL_CONFIG); \ + if [ -f $(MASTER_CPU_PER_SOC) ]; then cp $(MASTER_CPU_PER_SOC) $${doconf_target}; fi; \ + $(SRCROOT)/SETUP/config/doconf -c -cpu $(ARCH_CONFIG_LC) -soc $(MACHINE_CONFIG_LC) -d $(TARGET)/$(PEXPERT_KERNEL_CONFIG) $(PEXPERT_KERNEL_CONFIG); \ ); -.ORDER: $(COMPOBJROOT)/$(PEXPERT_KERNEL_CONFIG)/Makefile - -do_setup_conf: $(COMPOBJROOT)/doconf \ - $(COMPOBJROOT)/$(PEXPERT_KERNEL_CONFIG)/Makefile - -do_all: do_setup_conf +do_all: $(COMPOBJROOT)/$(PEXPERT_KERNEL_CONFIG)/Makefile $(_v)next_source=$(subst conf/,,$(SOURCE)); \ ${MAKE} -C $(COMPOBJROOT)/$(PEXPERT_KERNEL_CONFIG) \ MAKEFILES=$(TARGET)/$(PEXPERT_KERNEL_CONFIG)/Makefile \ diff --git a/pexpert/conf/Makefile.ppc b/pexpert/conf/Makefile.ppc deleted file mode 100644 index 4ef7445f0..000000000 --- a/pexpert/conf/Makefile.ppc +++ /dev/null @@ -1,8 +0,0 @@ -###################################################################### -#BEGIN Machine dependent Makefile fragment for ppc -###################################################################### - -###################################################################### -#END Machine dependent Makefile fragment for ppc -###################################################################### - diff --git a/pexpert/conf/Makefile.template b/pexpert/conf/Makefile.template index 1207442f0..0fcca19b1 100644 --- a/pexpert/conf/Makefile.template +++ b/pexpert/conf/Makefile.template @@ -27,8 +27,8 @@ include $(MakeInc_def) # CFLAGS # # -CFLAGS+= -imacros meta_features.h -DPEXPERT_KERNEL_PRIVATE \ - -Werror $(CFLAGS_INLINE_CONFIG) +CFLAGS+= -include meta_features.h -DPEXPERT_KERNEL_PRIVATE \ + $(CFLAGS_INLINE_CONFIG) # # Directories for mig generated files @@ -74,16 +74,16 @@ ${OBJS}: ${OBJSDEPS} LDOBJS = $(OBJS) -$(COMPONENT).o: $(LDOBJS) +$(COMPONENT).filelist: $(LDOBJS) @echo LDFILELIST $(COMPONENT) $(_v)( for obj in ${LDOBJS}; do \ echo 
$(TARGET)$(COMP_OBJ_DIR)/$(KERNEL_CONFIG)/$${obj}; \ - done; ) > $(COMPONENT).o + done; ) > $(COMPONENT).filelist do_depend: do_all $(_v)${MD} -u Makedep -f -d `ls *.d`; -do_all: $(COMPONENT).o +do_all: $(COMPONENT).filelist do_build_all: do_depend diff --git a/pexpert/conf/files.ppc b/pexpert/conf/files.ppc deleted file mode 100644 index ab76f421f..000000000 --- a/pexpert/conf/files.ppc +++ /dev/null @@ -1,7 +0,0 @@ - -pexpert/ppc/pe_init.c standard -pexpert/ppc/pe_bootargs.c standard -pexpert/ppc/pe_identify_machine.c standard -pexpert/ppc/pe_kprintf.c standard -pexpert/ppc/pe_clock_speed.c standard -pexpert/ppc/pe_clock_speed_asm.s standard diff --git a/pexpert/conf/tools/Makefile b/pexpert/conf/tools/Makefile deleted file mode 100644 index 4f9ccd553..000000000 --- a/pexpert/conf/tools/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -SETUP_SUBDIRS = doconf - -COMP_SUBDIRS = doconf - -INST_SUBDIRS = \ - - -setup_build_all: - @echo "[ $(SOURCE) ] make setup_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - -do_build_all: - @echo "[ $(SOURCE) ] make do_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - -setup_build_install: - @echo "[ $(SOURCE) ] make setup_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - -do_build_install: - @echo "[ $(SOURCE) ] make do_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - -include $(MakeInc_rule) -include $(MakeInc_dir) - - diff --git a/pexpert/conf/tools/doconf/Makefile b/pexpert/conf/tools/doconf/Makefile deleted file mode 100644 index aa55a9419..000000000 --- a/pexpert/conf/tools/doconf/Makefile +++ /dev/null @@ -1,47 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export 
MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -COMP_SUBDIRS = \ - -INST_SUBDIRS = \ - - -# -# Who and where -# -BINDIR= -ifneq ($(MACHINE_CONFIG), DEFAULT) -DSTDIR= $(strip $(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)_$(MACHINE_CONFIG)/$(COMPONENT)/) -else -DSTDIR= $(strip $(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)/$(COMPONENT)/) -endif -PROGRAM= $(DSTDIR)doconf - -# -# How to install it -# -IFLAGS= -c -m 555 - -$(PROGRAM): $(DSTDIR)% : $(SOURCE)%.csh - @-$(RM) $(RMFLAGS) $(notdir $(PROGRAM)).VERS - @sed -e "s/#PROGRAM.*/#`vers_string $(notdir $(PROGRAM))`/" \ - < $< >$(notdir $(PROGRAM)).VERS; - @install $(IFLAGS) $(notdir $(PROGRAM)).VERS $(PROGRAM); - @-$(RM) $(RMFLAGS) $(notdir $(PROGRAM)).VERS; - -do_build_setup: $(PROGRAM) - -do_build_all: - -setup_build_install: - -do_build_install: - -include $(MakeInc_rule) -include $(MakeInc_dir) diff --git a/pexpert/conf/tools/doconf/doconf.csh b/pexpert/conf/tools/doconf/doconf.csh deleted file mode 100755 index 6fedb4786..000000000 --- a/pexpert/conf/tools/doconf/doconf.csh +++ /dev/null @@ -1,321 +0,0 @@ -#!/bin/csh -f -set path = ($path .) -###################################################################### -# HISTORY -# 1-Dec-87 Michael Young (mwyoung) at Carnegie-Mellon University -# Added "-verbose" switch, so this script produces no output -# in the normal case. -# -# 10-Oct-87 Mike Accetta (mja) at Carnegie-Mellon University -# Flushed cmu_*.h and spin_locks.h -# [ V5.1(XF18) ] -# -# 6-Apr-87 Avadis Tevanian (avie) at Carnegie-Mellon University -# Use MASTER.local and MASTER..local for generation of -# configuration files in addition to MASTER and MASTER.. -# -# 25-Mar-87 Mike Accetta (mja) at Carnegie-Mellon University -# Removed use of obsolete wb_*.h files when building the feature -# list; modified to save the previous configuration file and -# display the differences between it and the new file. 
-# [ V5.1(F8) ] -# -# 25-Mar-87 Avadis Tevanian (avie) at Carnegie-Mellon University -# If there is no /etc/machine just print out a message telling -# user to use the -cpu option. I thought this script was supposed -# to work even without a /etc/machine, but it doesn't... and this -# is the easiest way out. -# -# 13-Mar-87 Mike Accetta (mja) at Carnegie-Mellon University -# Added "romp_fpa.h" file to extra features for the RT. -# [ V5.1(F7) ] -# -# 11-Mar-87 Mike Accetta (mja) at Carnegie-Mellon University -# Updated to maintain the appropriate configuration features file -# in the "machine" directory whenever the corresponding -# configuration is generated. This replaces the old mechanism of -# storing this directly in the file since it was -# machine dependent and also precluded building programs for more -# than one configuration from the same set of sources. -# [ V5.1(F6) ] -# -# 21-Feb-87 Mike Accetta (mja) at Carnegie-Mellon University -# Fixed to require wired-in cpu type names for only those -# machines where the kernel name differs from that provided by -# /etc/machine (i.e. IBMRT => ca and SUN => sun3); updated to -# permit configuration descriptions in both machine indepedent -# and dependent master configuration files so that attributes can -# be grouped accordingly. -# [ V5.1(F3) ] -# -# 17-Jan-87 Mike Accetta (mja) at Carnegie-Mellon University -# Updated to work from any directory at the same level as -# "conf"; generate configuration from both MASTER and -# MASTER. files; added -cpu switch. -# [ V5.1(F1) ] -# -# 18-Aug-86 Mike Accetta (mja) at Carnegie-Mellon University -# Added -make switch and changed meaning of -config; upgraded to -# allow multiple attributes per configuration and to define -# configurations in terms of these attributes within MASTER. -# -# 14-Apr-83 Mike Accetta (mja) at Carnegie-Mellon University -# Added -config switch to only run /etc/config without -# "make depend" and "make". 
-# -###################################################################### - -set prog=$0 -set prog=$prog:t -set nonomatch -set OBJDIR=../BUILD -if ("`/usr/bin/uname`" == "Rhapsody" ) then -set CONFIG_DIR=/usr/local/bin -else -set CONFIG_DIR=/usr/bin -endif - -unset domake -unset doconfig -unset beverbose -unset MACHINE -unset profile - -while ($#argv >= 1) - if ("$argv[1]" =~ -*) then - switch ("$argv[1]") - case "-c": - case "-config": - set doconfig - breaksw - case "-m": - case "-make": - set domake - breaksw - case "-cpu": - if ($#argv < 2) then - echo "${prog}: missing argument to ${argv[1]}" - exit 1 - endif - set MACHINE="$argv[2]" - shift - breaksw - case "-d": - if ($#argv < 2) then - echo "${prog}: missing argument to ${argv[1]}" - exit 1 - endif - set OBJDIR="$argv[2]" - shift - breaksw - case "-verbose": - set beverbose - breaksw - case "-p": - case "-profile": - set profile - breaksw - default: - echo "${prog}: ${argv[1]}: unknown switch" - exit 1 - breaksw - endsw - shift - else - break - endif -end - -if ($#argv == 0) set argv=(GENERIC) - -if (! $?MACHINE) then - if (-d /NextApps) then - set MACHINE=`hostinfo | awk '/MC680x0/ { printf("m68k") } /MC880x0/ { printf("m88k") }'` - endif -endif - -if (! $?MACHINE) then - if (-f /etc/machine) then - set MACHINE="`/etc/machine`" - else - echo "${prog}: no /etc/machine, specify machine type with -cpu" - echo "${prog}: e.g. 
${prog} -cpu VAX CONFIGURATION" - exit 1 - endif -endif - -set FEATURES_EXTRA= - -switch ("$MACHINE") - case IBMRT: - set cpu=ca - set ID=RT - set FEATURES_EXTRA="romp_dualcall.h romp_fpa.h" - breaksw - case SUN: - set cpu=sun3 - set ID=SUN3 - breaksw - default: - set cpu=`echo $MACHINE | tr A-Z a-z` - set ID=`echo $MACHINE | tr a-z A-Z` - breaksw -endsw -set FEATURES=../h/features.h -set FEATURES_H=(cs_*.h mach_*.h net_*.h\ - cputypes.h cpus.h vice.h\ - $FEATURES_EXTRA) -set MASTER_DIR=../conf -set MASTER = ${MASTER_DIR}/MASTER -set MASTER_CPU=${MASTER}.${cpu} - -set MASTER_LOCAL = ${MASTER}.local -set MASTER_CPU_LOCAL = ${MASTER_CPU}.local -if (! -f $MASTER_LOCAL) set MASTER_LOCAL = "" -if (! -f $MASTER_CPU_LOCAL) set MASTER_CPU_LOCAL = "" - -if (! -d $OBJDIR) then - if ($?beverbose) then - echo "[ creating $OBJDIR ]" - endif - mkdir -p $OBJDIR -endif - -foreach SYS ($argv) - set SYSID=${SYS}_${ID} - set SYSCONF=$OBJDIR/config.$SYSID - set BLDDIR=$OBJDIR - if ($?beverbose) then - echo "[ generating $SYSID from $MASTER_DIR/MASTER{,.$cpu}{,.local} ]" - endif - echo +$SYS \ - | \ - cat $MASTER $MASTER_LOCAL $MASTER_CPU $MASTER_CPU_LOCAL - \ - $MASTER $MASTER_LOCAL $MASTER_CPU $MASTER_CPU_LOCAL \ - | \ - sed -n \ - -e "/^+/{" \ - -e "s;[-+];#&;gp" \ - -e 't loop' \ - -e ': loop' \ - -e 'n' \ - -e '/^#/b loop' \ - -e '/^$/b loop' \ - -e 's;^\([^#]*\).*#[ ]*<\(.*\)>[ ]*$;\2#\1;' \ - -e 't not' \ - -e 's;\([^#]*\).*;#\1;' \ - -e 't not' \ - -e ': not' \ - -e 's;[ ]*$;;' \ - -e 's;^\!\(.*\);\1#\!;' \ - -e 'p' \ - -e 't loop' \ - -e 'b loop' \ - -e '}' \ - -e "/^[^#]/d" \ - -e 's; ; ;g' \ - -e "s;^# *\([^ ]*\)[ ]*=[ ]*\[\(.*\)\].*;\1#\2;p" \ - | \ - awk '-F#' '\ -part == 0 && $1 != "" {\ - m[$1]=m[$1] " " $2;\ - next;\ -}\ -part == 0 && $1 == "" {\ - for (i=NF;i>1;i--){\ - s=substr($i,2);\ - c[++na]=substr($i,1,1);\ - a[na]=s;\ - }\ - while (na > 0){\ - s=a[na];\ - d=c[na--];\ - if (m[s] == "") {\ - f[s]=d;\ - } else {\ - nx=split(m[s],x," ");\ - for (j=nx;j>0;j--) {\ - 
z=x[j];\ - a[++na]=z;\ - c[na]=d;\ - }\ - }\ - }\ - part=1;\ - next;\ -}\ -part != 0 {\ - if ($1 != "") {\ - n=split($1,x,",");\ - ok=0;\ - for (i=1;i<=n;i++) {\ - if (f[x[i]] == "+") {\ - ok=1;\ - }\ - }\ - if (NF > 2 && ok == 0 || NF <= 2 && ok != 0) {\ - print $2; \ - }\ - } else { \ - print $2; \ - }\ -}\ -' >$SYSCONF.new - if (-z $SYSCONF.new) then - echo "${prog}: ${$SYSID}: no such configuration in $MASTER_DIR/MASTER{,.$cpu}" - rm -f $SYSCONF.new - endif - if (! -d $BLDDIR) then - if ($?beverbose) then - echo "[ creating $BLDDIR ]" - endif - mkdir -p $BLDDIR - endif -# -# These paths are used by config. -# -# "builddir" is the name of the directory where kernel binaries -# are put. It is a single path element, never absolute, and is -# always relative to "objectdir". "builddir" is used by config -# solely to determine where to put files created by "config" (e.g. -# the created Makefile and *.h's.) -# -# "objectdir" is the name of the directory which will hold "builddir". -# It is a path; if relative, it is relative to the current directory -# where config is run. It's sole use is to be prepended to "builddir" -# to indicate where config-created files are to be placed (see above). -# -# "sourcedir" is the location of the sources used to build the kernel. -# It is a path; if relative, it is relative to the directory specified -# by the concatenation of "objectdir" and "builddir" (i.e. where the -# kernel binaries are put). 
-# - echo 'builddir "."' >> $SYSCONF.new - set OBJRELDIR=`$RELPATH $OBJROOT $OBJDIR` - echo 'objectdir "'$OBJROOT'/'$OBJRELDIR'"' >> $SYSCONF.new - set SRCDIR=`dirname $SOURCE` - echo 'sourcedir "'$SRCROOT'"' >> $SYSCONF.new - if (-f $SYSCONF) then - diff $SYSCONF $SYSCONF.new - rm -f $SYSCONF.old - mv $SYSCONF $SYSCONF.old - endif - rm -f $SYSCONF - mv $SYSCONF.new $SYSCONF - if ($?doconfig) then - if ($?beverbose) then - echo "[ configuring $SYSID ]" - endif - if ($?profile) then - $CONFIG_DIR/config -c $MASTER_DIR -p $SYSCONF - else - $CONFIG_DIR/config -c $MASTER_DIR $SYSCONF - endif - endif - if ($?domake) then - if ($?beverbose) then - echo "[ making $SYSID ]" - endif - (cd $BLDDIR; make) - endif -end diff --git a/pexpert/gen/bootargs.c b/pexpert/gen/bootargs.c index 0e8749b2b..6ca4fa102 100644 --- a/pexpert/gen/bootargs.c +++ b/pexpert/gen/bootargs.c @@ -26,12 +26,22 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ #include +#include -extern boolean_t isargsep( char c); -extern int argstrcpy(char *from, char *to); -extern int getval(char *s, int *val); - +static boolean_t isargsep( char c); +#if !CONFIG_EMBEDDED +static int argstrcpy(char *from, char *to); +#endif static int argstrcpy2(char *from,char *to, unsigned maxlen); +static int argnumcpy(int val, void *to, unsigned maxlen); +static int getval(char *s, int *val); + +extern int IODTGetDefault(const char *key, void *infoAddr, unsigned int infoSize); + +struct i24 { + int32_t i24 : 24; + int32_t _pad : 8; +}; #define NUM 0 #define STR 1 @@ -69,6 +79,10 @@ PE_parse_boot_argn( args = PE_boot_args(); if (*args == '\0') return FALSE; +#if CONFIG_EMBEDDED + if (max_len == -1) return FALSE; +#endif + arg_found = FALSE; while(*args && isargsep(*args)) args++; @@ -93,7 +107,7 @@ PE_parse_boot_argn( (i!=strlen(arg_string))) goto gotit; if (arg_boolean) { - *(unsigned int *)arg_ptr = TRUE; + argnumcpy(1, arg_ptr, max_len); arg_found = TRUE; break; } else { @@ -113,14 +127,16 @@ PE_parse_boot_argn( switch (getval(cp, 
&val)) { case NUM: - *(unsigned int *)arg_ptr = val; + argnumcpy(val, arg_ptr, max_len); arg_found = TRUE; break; case STR: if(max_len > 0) //max_len of 0 performs no copy at all argstrcpy2(++cp, (char *)arg_ptr, max_len - 1); - else if(max_len == -1) +#if !CONFIG_EMBEDDED + else if(max_len == -1) // unreachable on embedded argstrcpy(++cp, (char *)arg_ptr); +#endif arg_found = TRUE; break; } @@ -137,7 +153,8 @@ PE_parse_boot_argn( return(arg_found); } -boolean_t isargsep( +static boolean_t +isargsep( char c) { if (c == ' ' || c == '\0' || c == '\t') @@ -146,7 +163,8 @@ boolean_t isargsep( return(FALSE); } -int +#if !CONFIG_EMBEDDED +static int argstrcpy( char *from, char *to) @@ -160,6 +178,7 @@ argstrcpy( *to = 0; return(i); } +#endif static int argstrcpy2( @@ -177,7 +196,33 @@ argstrcpy2( return(i); } -int +static int argnumcpy(int val, void *to, unsigned maxlen) +{ + switch (maxlen) { + case 0: + /* No write-back, caller just wants to know if arg was found */ + break; + case 1: + *(int8_t *)to = val; + break; + case 2: + *(int16_t *)to = val; + break; + case 3: + /* Unlikely in practice */ + ((struct i24 *)to)->i24 = val; + break; + case 4: + default: + *(int32_t *)to = val; + maxlen = 4; + break; + } + + return (int)maxlen; +} + +static int getval( char *s, int *val) @@ -266,3 +311,45 @@ PE_imgsrc_mount_supported() { return TRUE; } + +boolean_t +PE_get_default( + const char *property_name, + void *property_ptr, + unsigned int max_property) +{ + DTEntry dte; + void **property_data; + unsigned int property_size; + + /* + * Look for the property using the PE DT support. + */ + if (kSuccess == DTLookupEntry(NULL, "/defaults", &dte)) { + + /* + * We have a /defaults node, look for the named property. 
+ */ + if (kSuccess != DTGetProperty(dte, property_name, (void **)&property_data, &property_size)) + return FALSE; + + /* + * This would be a fine place to do smart argument size management for 32/64 + * translation, but for now we'll insist that callers know how big their + * default values are. + */ + if (property_size > max_property) + return FALSE; + + /* + * Copy back the precisely-sized result. + */ + memcpy(property_ptr, property_data, property_size); + return TRUE; + } + + /* + * Look for the property using I/O Kit's DT support. + */ + return IODTGetDefault(property_name, property_ptr, max_property) ? FALSE : TRUE; +} diff --git a/pexpert/i386/pe_init.c b/pexpert/i386/pe_init.c index 7b37f5eb2..fc29c1a65 100644 --- a/pexpert/i386/pe_init.c +++ b/pexpert/i386/pe_init.c @@ -185,9 +185,7 @@ void PE_init_platform(boolean_t vm_initialized, void * _args) } if (!vm_initialized) { - /* Hack! FIXME.. */ - outb(0x21, 0xff); /* Maskout all interrupts Pic1 */ - outb(0xa1, 0xff); /* Maskout all interrupts Pic2 */ + if (PE_state.deviceTreeHead) { DTInit(PE_state.deviceTreeHead); } diff --git a/pexpert/i386/pe_kprintf.c b/pexpert/i386/pe_kprintf.c index b25bbcf28..6533908eb 100644 --- a/pexpert/i386/pe_kprintf.c +++ b/pexpert/i386/pe_kprintf.c @@ -35,6 +35,7 @@ #include #include #include +#include /* Globals */ void (*PE_kputc)(char c); @@ -66,10 +67,11 @@ void PE_init_kprintf(boolean_t vm_initialized) if (boot_arg & DB_KPRT) new_disable_serial_output = FALSE; - /* If we are newly enabling serial, make sure we only call serial_init() - * if our previous state was not enabled */ - if (!new_disable_serial_output && (!disable_serial_output || serial_init())) - PE_kputc = serial_putc; + /* If we are newly enabling serial, make sure we only + * call pal_serial_init() if our previous state was + * not enabled */ + if (!new_disable_serial_output && (!disable_serial_output || pal_serial_init())) + PE_kputc = pal_serial_putc; else PE_kputc = cnputc; @@ -108,7 +110,7 @@ void 
kprintf(const char *fmt, ...) * take any locks, just dump to serial */ if (!PE_kputc) { va_start(listp, fmt); - _doprnt(fmt, &listp, serial_putc, 16); + _doprnt(fmt, &listp, pal_serial_putc, 16); va_end(listp); return; } @@ -120,6 +122,9 @@ void kprintf(const char *fmt, ...) * interrupts are disabled once we have the lock. */ state = ml_set_interrupts_enabled(FALSE); + + pal_preemption_assert(); + while (!simple_lock_try(&kprintf_lock)) { ml_set_interrupts_enabled(state); ml_set_interrupts_enabled(FALSE); diff --git a/pexpert/i386/pe_serial.c b/pexpert/i386/pe_serial.c index cba4e1b59..fcff88b88 100644 --- a/pexpert/i386/pe_serial.c +++ b/pexpert/i386/pe_serial.c @@ -198,7 +198,6 @@ int serial_init( void ) return 1; } - void serial_putc( char c ) { uart_putc(c); diff --git a/pexpert/pexpert/Makefile b/pexpert/pexpert/Makefile index 82be8207d..0680f3ca2 100644 --- a/pexpert/pexpert/Makefile +++ b/pexpert/pexpert/Makefile @@ -9,27 +9,16 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ machine - -INSTINC_SUBDIRS_PPC = \ - ppc - INSTINC_SUBDIRS_I386 = \ i386 - INSTINC_SUBDIRS_X86_64 = \ i386 - INSTINC_SUBDIRS_ARM = \ arm EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} - -EXPINC_SUBDIRS_PPC = ${INSTINC_SUBDIRS_PPC} - EXPINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS_I386} - EXPINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS_X86_64} - EXPINC_SUBDIRS_ARM = ${INSTINC_SUBDIRS_ARM} DATAFILES = \ diff --git a/pexpert/pexpert/i386/boot.h b/pexpert/pexpert/i386/boot.h index dc3b4b09d..18e65d406 100644 --- a/pexpert/pexpert/i386/boot.h +++ b/pexpert/pexpert/i386/boot.h @@ -99,14 +99,13 @@ typedef struct Boot_Video Boot_Video; /* Boot argument structure - passed into Mach kernel at boot time. 
* "Revision" can be incremented for compatible changes */ -#define kBootArgsRevision 6 -#define kBootArgsVersion 1 +#define kBootArgsRevision 0 +#define kBootArgsVersion 2 /* Snapshot constants of previous revisions that are supported */ #define kBootArgsVersion1 1 -#define kBootArgsRevision1_4 4 -#define kBootArgsRevision1_5 5 -#define kBootArgsRevision1_6 6 +#define kBootArgsVersion2 2 +#define kBootArgsRevision2_0 0 #define kBootArgsEfiMode32 32 #define kBootArgsEfiMode64 64 @@ -115,6 +114,10 @@ typedef struct boot_args { uint16_t Revision; /* Revision of boot_args structure */ uint16_t Version; /* Version of boot_args structure */ + uint8_t efiMode; /* 32 = 32-bit, 64 = 64-bit */ + uint8_t debugMode; /* Bit field with behavior changes */ + uint8_t __reserved1[2]; + char CommandLine[BOOT_LINE_LENGTH]; /* Passed in command line */ uint32_t MemoryMap; /* Physical address of memory map */ @@ -132,17 +135,25 @@ typedef struct boot_args { uint32_t efiRuntimeServicesPageStart; /* physical address of defragmented runtime pages */ uint32_t efiRuntimeServicesPageCount; + uint64_t efiRuntimeServicesVirtualPageStart; /* virtual address of defragmented runtime pages */ + uint32_t efiSystemTable; /* physical address of system table in runtime area */ + uint32_t __reserved2; - uint8_t efiMode; /* 32 = 32-bit, 64 = 64-bit */ - uint8_t __reserved1[3]; - uint32_t __reserved2[1]; uint32_t performanceDataStart; /* physical address of log */ uint32_t performanceDataSize; - uint64_t efiRuntimeServicesVirtualPageStart; /* virtual address of defragmented runtime pages */ - uint32_t __reserved3[2]; + + uint32_t keyStoreDataStart; /* physical address of key store data */ + uint32_t keyStoreDataSize; + uint64_t bootMemStart; + uint64_t bootMemSize; + uint64_t PhysicalMemorySize; + uint64_t FSBFrequency; + uint32_t __reserved4[734]; } boot_args; +extern char assert_boot_args_size_is_4096[sizeof(boot_args) == 4096 ? 
1 : -1]; + #endif /* _PEXPERT_I386_BOOT_H */ diff --git a/pexpert/pexpert/i386/efi.h b/pexpert/pexpert/i386/efi.h index 08ff10f84..5ef501593 100644 --- a/pexpert/pexpert/i386/efi.h +++ b/pexpert/pexpert/i386/efi.h @@ -253,7 +253,7 @@ EFI_STATUS IN EFI_UINTN DescriptorSize, IN EFI_UINT32 DescriptorVersion, IN EFI_MEMORY_DESCRIPTOR * VirtualMap - ) __attribute__((regparm(0))); + ); typedef EFI_RUNTIMESERVICE @@ -261,7 +261,7 @@ EFI_STATUS (EFIAPI *EFI_CONVERT_POINTER) ( IN EFI_UINTN DebugDisposition, IN OUT VOID **Address - ) __attribute__((regparm(0))); + ); // // Variable attributes @@ -280,7 +280,7 @@ EFI_STATUS OUT EFI_UINT32 * Attributes OPTIONAL, IN OUT EFI_UINTN * DataSize, OUT VOID * Data - ) __attribute__((regparm(0))); + ); typedef EFI_RUNTIMESERVICE @@ -289,7 +289,7 @@ EFI_STATUS IN OUT EFI_UINTN * VariableNameSize, IN OUT EFI_CHAR16 * VariableName, IN OUT EFI_GUID * VendorGuid - ) __attribute__((regparm(0))); + ); typedef EFI_RUNTIMESERVICE @@ -300,7 +300,7 @@ EFI_STATUS IN EFI_UINT32 Attributes, IN EFI_UINTN DataSize, IN VOID * Data - ) __attribute__((regparm(0))); + ); // // EFI Time @@ -317,14 +317,14 @@ EFI_STATUS (EFIAPI *EFI_GET_TIME) ( OUT EFI_TIME * Time, OUT EFI_TIME_CAPABILITIES * Capabilities OPTIONAL - ) __attribute__((regparm(0))); + ); typedef EFI_RUNTIMESERVICE EFI_STATUS (EFIAPI *EFI_SET_TIME) ( IN EFI_TIME * Time - ) __attribute__((regparm(0))); + ); typedef EFI_RUNTIMESERVICE @@ -333,7 +333,7 @@ EFI_STATUS OUT EFI_BOOLEAN * Enabled, OUT EFI_BOOLEAN * Pending, OUT EFI_TIME * Time - ) __attribute__((regparm(0))); + ); typedef EFI_RUNTIMESERVICE @@ -341,7 +341,7 @@ EFI_STATUS (EFIAPI *EFI_SET_WAKEUP_TIME) ( IN EFI_BOOLEAN Enable, IN EFI_TIME * Time OPTIONAL - ) __attribute((regparm(0))); + ); typedef enum { EfiResetCold, @@ -362,14 +362,14 @@ VOID IN EFI_STATUS ResetStatus, IN EFI_UINTN DataSize, IN EFI_CHAR16 * ResetData OPTIONAL - ) __attribute__((regparm(0))); + ); typedef EFI_RUNTIMESERVICE EFI_STATUS (EFIAPI 
*EFI_GET_NEXT_HIGH_MONO_COUNT) ( OUT EFI_UINT32 * HighCount - ) __attribute__((regparm(0))); + ); // // Definition of Status Code extended data header @@ -394,7 +394,7 @@ EFI_STATUS IN EFI_UINT32 Instance, IN EFI_GUID * CallerId OPTIONAL, IN EFI_STATUS_CODE_DATA * Data OPTIONAL - ) __attribute__((regparm(0))); + ); #endif // diff --git a/pexpert/pexpert/machine/boot.h b/pexpert/pexpert/machine/boot.h index 542ee10db..26ba42c37 100644 --- a/pexpert/pexpert/machine/boot.h +++ b/pexpert/pexpert/machine/boot.h @@ -28,9 +28,7 @@ #ifndef _PEXPERT_MACHINE_BOOT_H #define _PEXPERT_MACHINE_BOOT_H -#if defined (__ppc__) -#include "pexpert/ppc/boot.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "pexpert/i386/boot.h" #else #error architecture not supported diff --git a/pexpert/pexpert/machine/protos.h b/pexpert/pexpert/machine/protos.h index 3dd9cbacb..5d71753d9 100644 --- a/pexpert/pexpert/machine/protos.h +++ b/pexpert/pexpert/machine/protos.h @@ -28,9 +28,7 @@ #ifndef _PEXPERT_MACHINE_PROTOS_H #define _PEXPERT_MACHINE_PROTOS_H -#if defined (__ppc__) -#include "pexpert/ppc/protos.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "pexpert/i386/protos.h" #else #error architecture not supported diff --git a/pexpert/pexpert/pexpert.h b/pexpert/pexpert/pexpert.h index f3a539af8..1f714d32b 100644 --- a/pexpert/pexpert/pexpert.h +++ b/pexpert/pexpert/pexpert.h @@ -254,6 +254,17 @@ extern boolean_t PE_parse_boot_argn( void *arg_ptr, int max_arg); +extern boolean_t PE_get_default( + const char *property_name, + void *property_ptr, + unsigned int max_property); + +#define PE_default_value(_key, _variable, _default) \ + do { \ + if (!PE_get_default((_key), &(_variable), sizeof(_variable))) \ + _variable = _default; \ + } while(0) + enum { kPEOptionKey = 0x3a, kPECommandKey = 0x37, diff --git a/pexpert/pexpert/ppc/Makefile b/pexpert/pexpert/ppc/Makefile deleted file mode 
100644 index b39a66718..000000000 --- a/pexpert/pexpert/ppc/Makefile +++ /dev/null @@ -1,27 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -DATAFILES = \ - boot.h \ - interrupts.h \ - powermac.h - -INSTALL_MD_LIST = ${DATAFILES} - -INSTALL_MD_DIR = pexpert/ppc - -EXPORT_MD_LIST = ${DATAFILES} - -EXPORT_MD_DIR = pexpert/ppc - - -include $(MakeInc_rule) -include $(MakeInc_dir) - - diff --git a/pexpert/pexpert/ppc/boot.h b/pexpert/pexpert/ppc/boot.h deleted file mode 100644 index 3ba51feb3..000000000 --- a/pexpert/pexpert/ppc/boot.h +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * NOTICE: This file was modified by McAfee Research in 2004 to introduce - * support for mandatory and extensible security protections. This notice - * is included in support of clause 2.2 (b) of the Apple Public License, - * Version 2.0. - */ - -#ifndef _PEXPERT_PPC_BOOT_H_ -#define _PEXPERT_PPC_BOOT_H_ - -#define BOOT_LINE_LENGTH 256 - -/* - * Video information.. - */ - -struct Boot_Video { - unsigned long v_baseAddr; /* Base address of video memory */ - unsigned long v_display; /* Display Code (if Applicable */ - unsigned long v_rowBytes; /* Number of bytes per pixel row */ - unsigned long v_width; /* Width */ - unsigned long v_height; /* Height */ - unsigned long v_depth; /* Pixel Depth */ -}; - -typedef struct Boot_Video Boot_Video; - -/* DRAM Bank definitions - describes physical memory layout. - */ -#define kMaxDRAMBanks 26 /* maximum number of DRAM banks */ - -struct DRAMBank -{ - unsigned long base; /* physical base of DRAM bank */ - unsigned long size; /* size of bank */ -}; -typedef struct DRAMBank DRAMBank; - - -/* Boot argument structure - passed into Mach kernel at boot time. 
- */ -#define kBootArgsRevision 2 -#define kBootArgsVersion1 1 -#define kBootArgsVersion2 2 - -typedef struct boot_args { - unsigned short Revision; /* Revision of boot_args structure */ - unsigned short Version; /* Version of boot_args structure */ - char CommandLine[BOOT_LINE_LENGTH]; /* Passed in command line */ - DRAMBank PhysicalDRAM[kMaxDRAMBanks]; /* base and range pairs for the 26 DRAM banks */ - Boot_Video Video; /* Video Information */ - unsigned long machineType; /* Machine Type (gestalt) */ - void *deviceTreeP; /* Base of flattened device tree */ - unsigned long deviceTreeLength;/* Length of flattened tree */ - unsigned long topOfKernelData;/* Highest address used in kernel data area */ - void *exdata; - unsigned long exdatalen; -} boot_args; - -extern boot_args passed_args; - -#endif /* _PEXPERT_PPC_BOOT_H_ */ diff --git a/pexpert/pexpert/ppc/interrupts.h b/pexpert/pexpert/ppc/interrupts.h deleted file mode 100644 index eac70164c..000000000 --- a/pexpert/pexpert/ppc/interrupts.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#ifndef _POWERMAC_INTERRUPTS_H_ -#define _POWERMAC_INTERRUPTS_H_ - -#include /* for struct ppc_saved_state */ - -extern void (PE_incoming_interrupt)(int type, ppc_saved_state_t *ssp, - unsigned int dsisr, unsigned int dar); - -#endif /* POWERMAC_INTERRUPTS_H_ */ diff --git a/pexpert/pexpert/ppc/powermac.h b/pexpert/pexpert/ppc/powermac.h deleted file mode 100644 index 82a61fcba..000000000 --- a/pexpert/pexpert/ppc/powermac.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#ifndef _PEXPERT_PPC_POWERMAC_H_ -#define _PEXPERT_PPC_POWERMAC_H_ - -#ifndef ASSEMBLER - -#include - -#include -#include -#include - - -/* prototypes */ - -vm_offset_t PE_find_scc( void ); - -/* Some useful typedefs for accessing control registers */ - -typedef volatile unsigned char v_u_char; -typedef volatile unsigned short v_u_short; -typedef volatile unsigned int v_u_int; -typedef volatile unsigned long v_u_long; - -/* And some useful defines for reading 'volatile' structures, - * don't forget to be be careful about sync()s and eieio()s - */ -#define reg8(reg) (*(v_u_char *)reg) -#define reg16(reg) (*(v_u_short *)reg) -#define reg32(reg) (*(v_u_int *)reg) - -#endif /* ASSEMBLER */ - -#endif /* _PEXPERT_PPC_POWERMAC_H_ */ diff --git a/pexpert/pexpert/ppc/protos.h b/pexpert/pexpert/ppc/protos.h deleted file mode 100644 index 74ed2485b..000000000 --- a/pexpert/pexpert/ppc/protos.h +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#ifndef _PEXPERT_PPC_PROTOS_H_ -#define _PEXPERT_PPC_PROTOS_H_ - -#define mtsprg(n, reg) __asm__ volatile("mtsprg " # n ", %0" : : "r" (reg)) -#define mfsprg(reg, n) __asm__ volatile("mfsprg %0, " # n : "=r" (reg)) - -#define mtspr(spr, val) __asm__ volatile("mtspr " # spr ", %0" : : "r" (val)) -#define mfspr(reg, spr) __asm__ volatile("mfspr %0, " # spr : "=r" (reg)) - -/* - * Various memory/IO synchronisation instructions - */ - - /* Use eieio as a memory barrier to order stores. - * Useful for device control and PTE maintenance. - */ - -#define eieio() \ - __asm__ volatile("eieio") - - /* Use sync to ensure previous stores have completed. - This is required when manipulating locks and/or - maintaining PTEs or other shared structures on SMP - machines. - */ - -#define sync() \ - __asm__ volatile("sync") - - /* Use isync to sychronize context; that is, the ensure - no prefetching of instructions happen before the - instruction. 
- */ - -#define isync() \ - __asm__ volatile("isync") - - -//------------------------------------------------------------------------ -// from ppc/endian.h -static __inline__ unsigned int byte_reverse_word(unsigned int word); -static __inline__ unsigned int byte_reverse_word(unsigned int word) { - unsigned int result; - __asm__ volatile("lwbrx %0, 0, %1" : "=r" (result) : "r" (&word)); - return result; -} - -//------------------------------------------------------------------------ -// from ppc/serial_io.h -extern void initialize_serial(void * scc_phys_base, uint32_t serial_baud); -void serial_putc(char); -int serial_getc(void); -void cnputc(char); - - -//------------------------------------------------------------------------ -// from osfmk/ppc/POWERMAC/video_console.c - -extern void vc_progress_initialize( void * desc, - const unsigned char * data, - const unsigned char * clut ); - -extern void vc_display_icon( void * desc, - const unsigned char * data ); - -//------------------------------------------------------------------------- -// from osfmk/console/panic_dialog.c -extern void panic_ui_initialize(const unsigned char * clut); - -/* - * from osfmk/ppc/serial_console.h - */ -int switch_to_serial_console(void); -void switch_to_old_console(int); - -typedef unsigned spl_t; - -//------------------------------------------------------------------------ -// from bsd/dev/ppc/busses.h which clashes with mach/device/device_types.h -typedef int io_req_t; - - -//typedef struct ipc_port *ipc_port_t; - -extern void cninit(void); - -/* - * Temporarily stolen from Firmware.h - */ - -extern void dbgTrace(unsigned int item1, unsigned int item2, unsigned int item3); -#if 1 /* (TEST/DEBUG) - eliminate inline */ -extern __inline__ void dbgTrace(unsigned int item1, unsigned int item2, unsigned int item3) { - - __asm__ volatile("mr r3,%0" : : "r" (item1) : "r3"); - __asm__ volatile("mr r4,%0" : : "r" (item2) : "r4"); - __asm__ volatile("mr r5,%0" : : "r" (item3) : "r5"); - __asm__ 
volatile("lis r0,hi16(CutTrace)" : : : "r0"); - __asm__ volatile("ori r0,r0,lo16(CutTrace)" : : : "r0"); - __asm__ volatile("sc"); - return; -} -#endif - -extern void DoPreempt(void); -extern __inline__ void DoPreempt(void) { - __asm__ volatile("lis r0,hi16(DoPreemptCall)" : : : "r0"); - __asm__ volatile("ori r0,r0,lo16(DoPreemptCall)" : : : "r0"); - __asm__ volatile("sc"); - return; -} - -extern void CreateFakeIO(void); -extern __inline__ void CreateFakeIO(void) { - __asm__ volatile("lis r0,hi16(CreateFakeIOCall)" : : : "r0"); - __asm__ volatile("ori r0,r0,lo16(CreateFakeIOCall)" : : : "r0"); - __asm__ volatile("sc"); - return; -} - -extern void StoreReal(unsigned int val, unsigned int addr); -extern void ReadReal(unsigned int raddr, unsigned int *vaddr); -extern unsigned int LLTraceSet(unsigned int tflags); -extern void GratefulDebInit(void); -extern void GratefulDebDisp(unsigned int coord, unsigned int data); -extern void checkNMI(void); - -#ifndef VM_WIMG_IO -#define VM_WIMG_IO (VM_MEM_COHERENT | \ - VM_MEM_NOT_CACHEABLE | VM_MEM_GUARDED) -#endif - -#endif /* _PEXPERT_PPC_PROTOS_H_ */ diff --git a/pexpert/pexpert/protos.h b/pexpert/pexpert/protos.h index cd339325c..d54464467 100644 --- a/pexpert/pexpert/protos.h +++ b/pexpert/pexpert/protos.h @@ -46,11 +46,7 @@ extern void printf(const char *fmt, ...); extern void interrupt_enable(void); extern void interrupt_disable(void); -#if __ppc__ -extern void bcopy_nc(const char *from, char *to, int size); /* uncached-safe */ -#else #define bcopy_nc bcopy -#endif //------------------------------------------------------------------------ //from kern/misc_protos.h diff --git a/pexpert/ppc/pe_clock_speed.c b/pexpert/ppc/pe_clock_speed.c deleted file mode 100644 index 4a859e2b0..000000000 --- a/pexpert/ppc/pe_clock_speed.c +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * pe_clock_speed.c - Determine the best guess for the processor and bus - * speed buy using the values returned by run_clock_test. - * - * (c) Apple Computer, Inc. 1998-2002 - * - * Writen by: Josh de Cesare - * - */ - -#include - -#include - -// prototypes -extern void pe_run_clock_test(void *tmp); -void pe_do_clock_test(unsigned int via_addr, - int num_speeds, unsigned long *speed_list); -void PE_Determine_Clock_Speeds(unsigned int via_addr, int num_speeds, unsigned long *speed_list); - -// Threshold for bus speed matches. -#define kMaxFreqDiff (30000) - -// This is the structure for the data that get passed to pe_run_clock_test. 
-struct clock_test_data { - unsigned int via_addr; - unsigned int via_ticks; - unsigned int dec_ticks; -}; - -// glocal variables to simplify some stuff. -static long bus_freq_num, bus_freq_den, cpu_pll; - -// PE_Determine_Clock_Speeds is called by the via driver in IOKit -// It uses the numbers generated by pe_do_clock_test and reports -// the cleaned up values to the rest of the OS. -void PE_Determine_Clock_Speeds(unsigned int via_addr, int num_speeds, - unsigned long *speed_list) -{ - boolean_t oldLevel; - unsigned long tmp_bus_speed, tmp_cpu_speed; - unsigned long long tmp; - - oldLevel = ml_set_interrupts_enabled(FALSE); - pe_do_clock_test(via_addr, num_speeds, speed_list); - ml_set_interrupts_enabled(oldLevel); - - tmp_bus_speed = bus_freq_num / bus_freq_den; - tmp = ((unsigned long long)bus_freq_num * cpu_pll) / (bus_freq_den * 2); - tmp_cpu_speed = (unsigned long)tmp; - - // Report the bus clock rate as is. - gPEClockFrequencyInfo.bus_clock_rate_num = bus_freq_num; - gPEClockFrequencyInfo.bus_clock_rate_den = bus_freq_den; - - // pll multipliers are in halfs so set the denominator to 2. - gPEClockFrequencyInfo.bus_to_cpu_rate_num = cpu_pll; - gPEClockFrequencyInfo.bus_to_cpu_rate_den = 2; - - // The decrementer rate is one fourth the bus rate. - gPEClockFrequencyInfo.bus_to_dec_rate_num = 1; - gPEClockFrequencyInfo.bus_to_dec_rate_den = 4; - - // Assume that the timebase frequency is derived from the bus clock. - gPEClockFrequencyInfo.timebase_frequency_num = bus_freq_num; - gPEClockFrequencyInfo.timebase_frequency_den = bus_freq_den * 4; - - // Set the truncated numbers in gPEClockFrequencyInfo. 
- gPEClockFrequencyInfo.bus_clock_rate_hz = tmp_bus_speed; - gPEClockFrequencyInfo.cpu_clock_rate_hz = tmp_cpu_speed; - gPEClockFrequencyInfo.dec_clock_rate_hz = tmp_bus_speed / 4; - gPEClockFrequencyInfo.timebase_frequency_hz = tmp_bus_speed / 4; - - gPEClockFrequencyInfo.bus_frequency_hz = tmp_bus_speed; - gPEClockFrequencyInfo.bus_frequency_min_hz = tmp_bus_speed; - gPEClockFrequencyInfo.bus_frequency_max_hz = tmp_bus_speed; - gPEClockFrequencyInfo.cpu_frequency_hz = tmp_cpu_speed; - gPEClockFrequencyInfo.cpu_frequency_min_hz = tmp_cpu_speed; - gPEClockFrequencyInfo.cpu_frequency_max_hz = tmp_cpu_speed; - - PE_call_timebase_callback(); -} - -// pe_do_clock_test uses the number from pe_run_clock_test to -// find a best fit guess for the bus speed. -void pe_do_clock_test(unsigned int via_addr, - int num_speeds, unsigned long *speed_list) -{ - struct clock_test_data clock_test_data; - long cnt, diff, raw_cpu_freq, raw_bus_freq, tmp_bus_freq, - last_bus_freq, tries = 10; - - // Save the via addr so the asm part can use it. - clock_test_data.via_addr = via_addr; - - // Keep looping until it matches the last try. - bus_freq_num = 0; - do { - last_bus_freq = bus_freq_num; - - // The the asm part to do the real work. - pe_run_clock_test((void *)&clock_test_data); - - // First find the pll mode. Allow any integer times two. - cpu_pll = 10000000 / clock_test_data.dec_ticks; - cpu_pll = (cpu_pll / 2) + (cpu_pll & 1); - - // Using 64 bit math figure out the raw bus speed. - // 0xBF401675E5DULL is 1 / 1.27655us times 2 ^ 24. - raw_bus_freq = ((0xBF401675E5DULL * clock_test_data.dec_ticks) / - clock_test_data.via_ticks) >> 22; - - // use the pll mode and the raw bus speed to find the raw cpu speed. - raw_cpu_freq = raw_bus_freq * cpu_pll / 2; - - // Look to see if the bus speed is close to one of the - // speeds in the table. 
- for (cnt = 0; cnt < num_speeds; cnt++) { - bus_freq_num = speed_list[cnt * 2]; - bus_freq_den = speed_list[cnt * 2 + 1]; - diff = bus_freq_num - raw_bus_freq * bus_freq_den; - if (diff < 0) diff = -diff; - - if (diff < kMaxFreqDiff * bus_freq_den) break; - } - if (cnt != num_speeds) continue; - - // Look to see if the bus speed is close to n * 0.5 MHz - tmp_bus_freq = ((raw_bus_freq + 250000) / 500000) * 500000; - - diff = tmp_bus_freq - raw_bus_freq; - if (diff < 0) diff = -diff; - - if (diff < kMaxFreqDiff) { - bus_freq_num = tmp_bus_freq; - bus_freq_den = 1; - continue; - } - - // Look to see if the bus speed is close to n * 50/3 MHz - tmp_bus_freq = ((raw_bus_freq * 3 + 25000000) / 50000000) * 50000000; - - diff = tmp_bus_freq - raw_bus_freq * 3; - if (diff < 0) diff = -diff; - - if (diff < kMaxFreqDiff * 3) { - bus_freq_num = tmp_bus_freq; - bus_freq_den = 3; - continue; - } - - // Since all else failed return the raw bus speed - bus_freq_num = raw_bus_freq; - bus_freq_den = 1; - } while ((bus_freq_num != last_bus_freq) && tries--); -} diff --git a/pexpert/ppc/pe_clock_speed_asm.s b/pexpert/ppc/pe_clock_speed_asm.s deleted file mode 100644 index 41e9a0fb6..000000000 --- a/pexpert/ppc/pe_clock_speed_asm.s +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. 
- * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * clock_speed_asm.s - Uses the Via timer, decrementer, and counter - * to determine the clock and bus rates. - * - * (c) Apple Computer, Inc. 1998-9 - * - * Writen by: Josh de Cesare - * - */ - -#include - -// constants for the via -#define CountLow 0x800 -#define CountHigh 0xa00 -#define LatchLow 0xc00 -#define LatchHigh 0xe00 - - -// void pe_run_clock_test(clock_test_data *data) -// -// data points to the base address of the via and two longs -// for storing the via and dec results. -// -// The basic idea is this... -// Use the counter register to execute a loop that will take -// 10,000,000 processor clocks. Time it using both the via counter -// and the time base. Return the number of ticks for both so the -// raw values for processor and bus speed can be calculated. 
-ENTRY(pe_run_clock_test, TAG_NO_FRAME_USED) - - li r4, 1 ; flag for cache load - li r5, 1 ; Only once through this time - lwz r9, 0(r3) ; r9 is the via addr - -L_again: - mtctr r5 ; set the count - li r5, 0xff ; Start the counter at 0xffff - stb r5, CountLow(r9) ; clear the via counter - eieio - stb r5, CountHigh(r9) - eieio - mftb r10 ; save starting value of the time base - isync - -L_loop: - addi r5, r5, 1 ; 8 adds for 8 cycles - addi r5, r5, 2 ; the bdnz should be 0 cycles - addi r5, r5, 3 - addi r5, r5, 4 - addi r5, r5, 5 - addi r5, r5, 6 - addi r5, r5, 7 - addi r5, r5, 8 - bdnz L_loop - - sync - mftb r5 ; save the raw time base value - lbz r6, CountHigh(r9) ; get the via counter values - eieio - lbz r7, CountLow(r9) - eieio - lbz r8, CountHigh(r9) - eieio - - cmpi cr0, r4, 1 ; see if the was the cache run - bne L_finish_up ; nope, we are done. - - li r4, 0 ; set flag for the real test - li r5, 0x12d0 ; set the initial count to 1.25e+6 - oris r5, r5, 0x13 - b L_again - -L_finish_up: - cmpi cr0, r7, 0 ; if L1 is zero then H1 is good. - beq L_use_H1 ; else H2 will be good. - - mr r6, r8 ; use H2 instead. - -L_use_H1: - rlwimi r7, r6, 8, 16, 23 - not r6, r7 ; neg - 1 is not - andi. r6, r6, 0xffff - stw r6, 4(r3) ; save via ticks - - sub r5, r5, r10 ; r5 is the number of time base ticks - stw r5, 8(r3) ; save time base ticks - - blr diff --git a/pexpert/ppc/pe_identify_machine.c b/pexpert/ppc/pe_identify_machine.c deleted file mode 100644 index 993b124f9..000000000 --- a/pexpert/ppc/pe_identify_machine.c +++ /dev/null @@ -1,194 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#include -#include -#include -#include -#include - -extern void panic(const char *str, ...); - -/* Local declarations */ -void pe_identify_machine(void); -vm_offset_t get_io_base_addr(void); - -/* pe_identify_machine: - * - * Sets up platform parameters. - * Returns: nothing - */ -void pe_identify_machine(void) -{ - DTEntry cpu, root; - unsigned long *value; - unsigned int size; - - // Clear the gPEClockFrequencyInfo struct - bzero((void *)&gPEClockFrequencyInfo, sizeof(clock_frequency_info_t)); - - // Start with default values. - gPEClockFrequencyInfo.timebase_frequency_hz = 25000000; - gPEClockFrequencyInfo.bus_clock_rate_hz = 100000000; - gPEClockFrequencyInfo.cpu_clock_rate_hz = 300000000; - - // Try to get the values from the device tree. - if (DTFindEntry("device_type", "cpu", &cpu) == kSuccess) { - // Find the time base frequency first. 
- if (DTGetProperty(cpu, "timebase-frequency", (void **)&value, &size) == kSuccess) { - // timebase_frequency_hz is only 32 bits, and the device tree should never provide 64 bits - // so this if should never be taken. - if (size == 8) gPEClockFrequencyInfo.timebase_frequency_hz = *(unsigned long long *)value; - else gPEClockFrequencyInfo.timebase_frequency_hz = *value; - } - gPEClockFrequencyInfo.dec_clock_rate_hz = gPEClockFrequencyInfo.timebase_frequency_hz; - - // Find the bus frequency next. Try the cpu node, then the root. - if (DTGetProperty(cpu, "bus-frequency", (void **)&value, &size) == kSuccess) { - if (size == 8) gPEClockFrequencyInfo.bus_frequency_hz = *(unsigned long long *)value; - else gPEClockFrequencyInfo.bus_frequency_hz = *value; - } else { - if (DTLookupEntry(0, "/", &root) == kSuccess) { - if (DTGetProperty(root, "clock-frequency", (void **)&value, &size) == kSuccess) { - if (size == 8) gPEClockFrequencyInfo.bus_frequency_hz = *(unsigned long long *)value; - else gPEClockFrequencyInfo.bus_frequency_hz = *value; - } - } - } - - gPEClockFrequencyInfo.bus_frequency_min_hz = gPEClockFrequencyInfo.bus_frequency_hz; - gPEClockFrequencyInfo.bus_frequency_max_hz = gPEClockFrequencyInfo.bus_frequency_hz; - - if (gPEClockFrequencyInfo.bus_frequency_hz < 0x100000000ULL) - gPEClockFrequencyInfo.bus_clock_rate_hz = gPEClockFrequencyInfo.bus_frequency_hz; - else - gPEClockFrequencyInfo.bus_clock_rate_hz = 0xFFFFFFFF; - - // Find the cpu frequency last. 
- if (DTGetProperty(cpu, "clock-frequency", (void **)&value, &size) == kSuccess) { - if (size == 8) gPEClockFrequencyInfo.cpu_frequency_hz = *(unsigned long long *)value; - else gPEClockFrequencyInfo.cpu_frequency_hz = *value; - } - - gPEClockFrequencyInfo.cpu_frequency_min_hz = gPEClockFrequencyInfo.cpu_frequency_hz; - gPEClockFrequencyInfo.cpu_frequency_max_hz = gPEClockFrequencyInfo.cpu_frequency_hz; - - if (gPEClockFrequencyInfo.cpu_frequency_hz < 0x100000000ULL) - gPEClockFrequencyInfo.cpu_clock_rate_hz = gPEClockFrequencyInfo.cpu_frequency_hz; - else - gPEClockFrequencyInfo.cpu_clock_rate_hz = 0xFFFFFFFF; - } - - // Set the num / den pairs form the hz values. - gPEClockFrequencyInfo.timebase_frequency_num = gPEClockFrequencyInfo.timebase_frequency_hz; - gPEClockFrequencyInfo.timebase_frequency_den = 1; - - gPEClockFrequencyInfo.bus_clock_rate_num = gPEClockFrequencyInfo.bus_clock_rate_hz; - gPEClockFrequencyInfo.bus_clock_rate_den = 1; - - gPEClockFrequencyInfo.bus_to_cpu_rate_num = - (2 * gPEClockFrequencyInfo.cpu_clock_rate_hz) / gPEClockFrequencyInfo.bus_clock_rate_hz; - gPEClockFrequencyInfo.bus_to_cpu_rate_den = 2; - - gPEClockFrequencyInfo.bus_to_dec_rate_num = 1; - gPEClockFrequencyInfo.bus_to_dec_rate_den = - gPEClockFrequencyInfo.bus_clock_rate_hz / gPEClockFrequencyInfo.dec_clock_rate_hz; -} - -/* get_io_base_addr(): - * - * Get the base address of the io controller. 
- */ -vm_offset_t get_io_base_addr(void) -{ - DTEntry entryP; - vm_offset_t *address; - unsigned int size; - - if ((DTFindEntry("device_type", "dbdma", &entryP) == kSuccess) - || (DTFindEntry("device_type", "mac-io", &entryP) == kSuccess)) - { - if (DTGetProperty(entryP, "AAPL,address", (void **)&address, &size) == kSuccess) - return *address; - - if (DTGetProperty(entryP, "assigned-addresses", (void **)&address, &size) == kSuccess) - // address calculation not correct - return *(address+2); - } - - panic("Can't find this machine's io base address\n"); - return 0; -} - -vm_offset_t PE_find_scc(void) -{ - vm_offset_t io, sccadd; - DTEntry entryP; - vm_offset_t *sccregs; - unsigned int sccrsize; - - if(!(io = get_io_base_addr())) { /* Get the I/O controller base address */ - return (vm_offset_t)0; /* Hmmm, no I/O??? What gives??? How'd we even boot? */ - } - - -/* Note: if we find a escc-legacy, we need to kind of hack because it can be either an offset - into the iobase or the actual address itself. ORint the two should provide the correct - for either */ - - sccadd = 0; /* Assume none for now */ - - if(DTFindEntry("name", "escc-legacy", &entryP) == kSuccess) { /* Find the old fashioned serial port */ - if (DTGetProperty(entryP, "reg", (void **)&sccregs, &sccrsize) == kSuccess) { /* Do we have some registers? */ - sccadd = ((vm_offset_t)*sccregs | io); /* Get the address */ - } - } - - if(DTFindEntry("name", "escc", &entryP) == kSuccess) { /* Well, see if we just have the new fangled one */ - sccadd = io + 0x12000; /* Yeah, but still return the oldie goldie... 
*/ - } - - return sccadd; /* Return it if you found it */ -} - -unsigned int PE_init_taproot(vm_offset_t *taddr) -{ - DTEntry entryP; - vm_offset_t *tappdata; - unsigned int tappsize; - - - if(DTFindEntry("name", "memory-map", &entryP) != kSuccess) return 0; /* no memory map */ - - if (DTGetProperty(entryP, "TapRoot", (void **)&tappdata, &tappsize) != kSuccess) return 0; /* No TapRoot */ - - tappdata[1] = (tappdata[1] + 4095 ) & -4096; /* Make sure this is a whole page */ - - *taddr = io_map_spec(tappdata[0], tappdata[1], VM_WIMG_IO); /* Map it in and return the address */ - tappdata[0] = *taddr; /* Also change property */ - return tappdata[1]; /* And the size */ -} diff --git a/pexpert/ppc/pe_init.c b/pexpert/ppc/pe_init.c deleted file mode 100644 index 6bcf93210..000000000 --- a/pexpert/ppc/pe_init.c +++ /dev/null @@ -1,269 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * file: pe_init.c - * PPC platform expert initialization. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -/* extern references */ -void pe_identify_machine(void); - -/* private globals */ -PE_state_t PE_state; - -/* Clock Frequency Info */ -clock_frequency_info_t gPEClockFrequencyInfo; - -static int PE_stub_read_write_time_of_day(unsigned int options, long * secs) -{ - // believe it or, BSD crashes if invalid time returned. FIXME. 
- if( options == kPEReadTOD) - *secs = 0xb2383c72; - - return 0; -} - -static int PE_stub_poll_input(__unused unsigned int options, char * c) -{ - *c = 0xff; - - return 1; -} - -static int PE_stub_write_IIC(__unused unsigned char addr, __unused unsigned char reg, - __unused unsigned char data) -{ - return 1; -} - -int (*PE_read_write_time_of_day)(unsigned int options, long * secs) - = PE_stub_read_write_time_of_day; -int (*PE_poll_input)(unsigned int options, char * c) - = PE_stub_poll_input; - -int (*PE_write_IIC)(unsigned char addr, unsigned char reg, - unsigned char data) - = PE_stub_write_IIC; - - -int PE_initialize_console( PE_Video * info, int op ) -{ - static int last_console = -1; - - if (info) { - info->v_offset = 0; - info->v_length = 0; - info->v_display = 0; - } - - switch( op ) { - - case kPEDisableScreen: - initialize_screen(info, op); - last_console = switch_to_serial_console(); - kprintf("kPEDisableScreen %d\n",last_console); - break; - - case kPEEnableScreen: - initialize_screen(info, op); - if (info) PE_state.video = *info; - kprintf("kPEEnableScreen %d\n",last_console); - if( last_console != -1) - switch_to_old_console( last_console); - break; - - default: - initialize_screen(info, op); - break; - } - - return 0; -} - -void PE_init_iokit(void) -{ - kern_return_t ret; - DTEntry entry; - unsigned int size; - void ** map; - - PE_init_kprintf(TRUE); - PE_init_printf(TRUE); - - if( kSuccess == DTLookupEntry(NULL, "/chosen/memory-map", &entry)) { - - boot_progress_element * bootPict; - - if( kSuccess == DTGetProperty(entry, "BootCLUT", (void **) &map, &size)) - bcopy( map[0], appleClut8, sizeof(appleClut8) ); - - if( kSuccess == DTGetProperty(entry, "Pict-FailedBoot", (void **) &map, &size)) { - - bootPict = (boot_progress_element *) map[0]; - default_noroot.width = bootPict->width; - default_noroot.height = bootPict->height; - default_noroot.dx = 0; - default_noroot.dy = bootPict->yOffset; - default_noroot_data = &bootPict->data[0]; - } - } - 
panic_ui_initialize( (unsigned char *) appleClut8 ); - vc_progress_initialize( &default_progress, default_progress_data, (unsigned char *) appleClut8 ); - - ret = StartIOKit( PE_state.deviceTreeHead, PE_state.bootArgs, (void *)0, (void *)0); -} - -void PE_init_platform(boolean_t vm_initialized, void *_args) -{ - DTEntry dsouth, dnorth, root, dcpu; - char *model; - unsigned int msize, size; - uint32_t *south, *north, *pdata, *ddata; - int i; - - boot_args *args = (boot_args *)_args; - - if (PE_state.initialized == FALSE) - { - PE_state.initialized = TRUE; - PE_state.bootArgs = _args; - PE_state.deviceTreeHead = args->deviceTreeP; - PE_state.video.v_baseAddr = args->Video.v_baseAddr; - PE_state.video.v_rowBytes = args->Video.v_rowBytes; - PE_state.video.v_width = args->Video.v_width; - PE_state.video.v_height = args->Video.v_height; - PE_state.video.v_depth = args->Video.v_depth; - PE_state.video.v_display = args->Video.v_display; - strlcpy(PE_state.video.v_pixelFormat, "PPPPPPPP", - sizeof(PE_state.video.v_pixelFormat)); - } - - if (!vm_initialized) - { - /* - * Setup the OpenFirmware Device Tree routines - * so the console can be found and the right I/O space - * can be used.. 
- */ - DTInit(PE_state.deviceTreeHead); - - /* Setup gPEClockFrequencyInfo */ - pe_identify_machine(); - } - else - { - pe_init_debug(); - - } -} - -void PE_create_console( void ) -{ - if ( PE_state.video.v_display ) - PE_initialize_console( &PE_state.video, kPEGraphicsMode ); - else - PE_initialize_console( &PE_state.video, kPETextMode ); -} - -int PE_current_console( PE_Video * info ) -{ - *info = PE_state.video; - - return( 0); -} - -void PE_display_icon( __unused unsigned int flags, - __unused const char * name ) -{ - if( default_noroot_data) - vc_display_icon( &default_noroot, default_noroot_data ); -} - -boolean_t -PE_get_hotkey(unsigned char key) -{ - unsigned char * adbKeymap; - unsigned int size; - DTEntry entry; - - if( (kSuccess != DTLookupEntry(NULL, "/", &entry)) - || (kSuccess != DTGetProperty( entry, "AAPL,adb-keymap", - (void **)&adbKeymap, &size)) - || (size != 16)) - - return( FALSE); - - if( key > 127) - return( FALSE); - - return( adbKeymap[ key / 8 ] & (0x80 >> (key & 7))); -} - -static timebase_callback_func gTimebaseCallback; - -void PE_register_timebase_callback(timebase_callback_func callback) -{ - gTimebaseCallback = callback; - - PE_call_timebase_callback(); -} - -void PE_call_timebase_callback(void) -{ - struct timebase_freq_t timebase_freq; - unsigned long num, den, cnt; - - num = gPEClockFrequencyInfo.timebase_frequency_num; - den = gPEClockFrequencyInfo.timebase_frequency_den; - - cnt = 2; - while (cnt <= den) { - if ((num % cnt) || (den % cnt)) { - cnt++; - continue; - } - - num /= cnt; - den /= cnt; - } - - timebase_freq.timebase_num = num; - timebase_freq.timebase_den = den; - - if (gTimebaseCallback) gTimebaseCallback(&timebase_freq); -} diff --git a/pexpert/ppc/pe_kprintf.c b/pexpert/ppc/pe_kprintf.c deleted file mode 100644 index 2509d6962..000000000 --- a/pexpert/ppc/pe_kprintf.c +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * file: pe_kprintf.c - * PPC platform expert debugging output initialization. 
- */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* extern references */ -extern void scc_putc(int unit, int line, int c); -extern long strtol(const char *, char **, int); - -/* Globals */ -void (*PE_kputc)(char c); - -unsigned int disable_serial_output = TRUE; - -vm_offset_t scc = 0; - -struct slock kprintf_lock; - -void PE_init_kprintf(__unused boolean_t vm_initialized) -{ - unsigned int boot_arg; - int32_t serial_baud = -1; - unsigned int size; - DTEntry options; - char *str, baud[7]; - - if (PE_state.initialized == FALSE) - panic("Platform Expert not initialized"); - - if (PE_parse_boot_argn("debug", &boot_arg, sizeof (boot_arg))) - if(boot_arg & DB_KPRT) disable_serial_output = FALSE; - - if (DTLookupEntry(NULL, "/options", &options) == kSuccess) { - if (DTGetProperty(options, "input-device", (void **)&str, &size) == kSuccess) { - if ((size > 5) && !strncmp("scca:", str, 5)) { - size -= 5; - str += 5; - if (size <= 6) { - strncpy(baud, str, size); - baud[size] = '\0'; - gPESerialBaud = strtol(baud, NULL, 0); - } - } - } - if (DTGetProperty(options, "output-device", (void **)&str, &size) == kSuccess) { - if ((size > 5) && !strncmp("scca:", str, 5)) { - size -= 5; - str += 5; - if (size <= 6) { - strncpy(baud, str, size); - baud[size] = '\0'; - gPESerialBaud = strtol(baud, NULL, 0); - } - } - } - } - - /* Check the boot-args for new serial baud. */ - if (PE_parse_boot_argn("serialbaud", &serial_baud, sizeof (serial_baud))) - if (serial_baud != -1) gPESerialBaud = serial_baud; - - if( (scc = PE_find_scc())) { /* See if we can find the serial port */ - scc = io_map_spec(scc, 0x1000, VM_WIMG_IO); /* Map it in */ - initialize_serial((void *)scc, gPESerialBaud); /* Start up the serial driver */ - PE_kputc = serial_putc; - - simple_lock_init(&kprintf_lock, 0); - } else - PE_kputc = cnputc; - -#if 0 - /* - * FUTURE: eventually let the boot command determine where - * the debug output will be, serial, video, etc. 
- */ - switch (PE_state.debug_video.v_display) { - case kDebugTypeSerial: - PE_kputc = serial_putc; - break; - - case kDebugTypeDisplay: - init_display_putc( (unsigned char*)PE_state.debug_video.v_baseAddr, - PE_state.debug_video.v_rowBytes, - PE_state.debug_video.v_height); - PE_kputc = display_putc; - break; - - default: - PE_state.debug_video.v_baseAddr = 0; - } -#endif -} - -void serial_putc(char c) -{ - scc_putc(0, 1, c); - if (c == '\n') - scc_putc(0, 1, '\r'); -} - -void kprintf(const char *fmt, ...) -{ - va_list listp; - boolean_t state; - - state = ml_set_interrupts_enabled(FALSE); - simple_lock(&kprintf_lock); - - if (!disable_serial_output) { - va_start(listp, fmt); - _doprnt(fmt, &listp, PE_kputc, 16); - va_end(listp); - } - - simple_unlock(&kprintf_lock); - ml_set_interrupts_enabled(state); -} - diff --git a/security/Makefile b/security/Makefile index 90a42a992..4af4eb56e 100644 --- a/security/Makefile +++ b/security/Makefile @@ -8,20 +8,17 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ INSTINC_SUBDIRS_X86_64 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS_X86_64 = \ + DATAFILES = \ mac.h \ mac_policy.h @@ -49,7 +46,7 @@ INSTALL_MI_LCL_LIST = ${DATAFILES} ${PRIVATE_DATAFILES} # /System/Library/Frameworks/Kernel.framework/PrivateHeaders INSTALL_KF_MI_LCL_LIST = ${DATAFILES} ${PRIVATE_DATAFILES} -SETUP_SUBDIRS = conf +SETUP_SUBDIRS = COMP_SUBDIRS = conf diff --git a/security/conf/MASTER b/security/conf/MASTER index 18a6f7f1f..8d1598990 100644 --- a/security/conf/MASTER +++ b/security/conf/MASTER @@ -64,4 +64,4 @@ options CONFIG_NO_PANIC_STRINGS # options CONFIG_NO_PRINTF_STRINGS # options CONFIG_NO_KPRINTF_STRINGS # options CONFIG_FSE # file system events # - +options CONFIG_TRIGGERS # trigger vnodes # diff --git a/security/conf/MASTER.i386 b/security/conf/MASTER.i386 index 524008c7b..dd4fb5f69 100644 --- a/security/conf/MASTER.i386 +++ b/security/conf/MASTER.i386 
@@ -4,7 +4,6 @@ # PROFILE = [ RELEASE profile ] # DEBUG = [ RELEASE debug ] # -# # EMBEDDED = [ intel mach libkerncpp audit ] # DEVELOPMENT = [ EMBEDDED config_dtrace ] # @@ -20,6 +19,7 @@ cpu "i386" # options CONFIG_MACF # Mandatory Access Control Framework options CONFIG_MACF_SOCKET_SUBSET # MACF subset of socket support options CONFIG_FSE +options CONFIG_TRIGGERS #options CONFIG_MACF_SOCKET #options CONFIG_MACF_NET #options CONFIG_MACF_ALWAYS_LABEL_MBUF diff --git a/security/conf/MASTER.ppc b/security/conf/MASTER.ppc deleted file mode 100644 index 8b946ff2a..000000000 --- a/security/conf/MASTER.ppc +++ /dev/null @@ -1,31 +0,0 @@ -# -###################################################################### -# -# Standard Apple MacOS X Configurations: -# -------- ---- -------- --------------- -# -# RELEASE = [ppc mach libkerncpp config_dtrace audit] -# DEVELOPMENT = [RELEASE] -# PROFILE = [RELEASE] -# DEBUG = [RELEASE debug] -# RELEASE_TRACE = [ RELEASE kdebug ] -# DEBUG_TRACE = [ DEBUG kdebug ] -# -###################################################################### - -# -# Note: MAC options must be set in all the bsd/conf, osfmk/conf, and -# security/conf MASTER files. 
-# -options CONFIG_MACF # Mandatory Access Control Framework -options CONFIG_MACF_SOCKET_SUBSET # MACF subset of socket support -options CONFIG_FSE -#options CONFIG_MACF_SOCKET -#options CONFIG_MACF_NET -#options CONFIG_MACF_ALWAYS_LABEL_MBUF -#options CONFIG_MACF_DEBUG -#options CONFIG_MACF_MACH -options CONFIG_AUDIT # Kernel auditing - -machine "ppc" # -cpu "ppc" # diff --git a/security/conf/MASTER.x86_64 b/security/conf/MASTER.x86_64 index 86b65c412..d362cf049 100644 --- a/security/conf/MASTER.x86_64 +++ b/security/conf/MASTER.x86_64 @@ -4,7 +4,6 @@ # PROFILE = [ RELEASE profile ] # DEBUG = [ RELEASE debug ] # -# # EMBEDDED = [ intel mach libkerncpp audit ] # DEVELOPMENT = [ EMBEDDED ] # @@ -17,6 +16,7 @@ options CONFIG_MACF # Mandatory Access Control Framework options CONFIG_MACF_SOCKET_SUBSET # MACF subset of socket support options CONFIG_FSE +options CONFIG_TRIGGERS #options CONFIG_MACF_SOCKET #options CONFIG_MACF_NET #options CONFIG_MACF_ALWAYS_LABEL_MBUF diff --git a/security/conf/Makefile b/security/conf/Makefile index f32722158..bdb8f33f8 100644 --- a/security/conf/Makefile +++ b/security/conf/Makefile @@ -7,8 +7,7 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir include $(MakeInc_cmd) include $(MakeInc_def) -SETUP_SUBDIRS = \ - tools +SETUP_SUBDIRS = COMP_SUBDIRS = @@ -24,30 +23,24 @@ else export COMPOBJROOT=$(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)/$(COMPONENT) endif -$(COMPOBJROOT)/doconf: - @make build_setup +MASTER_CPU_PER_SOC = $(SOURCE)/MASTER.$(ARCH_CONFIG_LC).$(MACHINE_CONFIG_LC) $(COMPOBJROOT)/$(SECURITY_KERNEL_CONFIG)/Makefile : $(SOURCE)/MASTER \ $(SOURCE)/MASTER.$(ARCH_CONFIG_LC) \ $(SOURCE)/Makefile.template \ $(SOURCE)/Makefile.$(ARCH_CONFIG_LC) \ $(SOURCE)/files \ - $(SOURCE)/files.$(ARCH_CONFIG_LC) \ - $(COMPOBJROOT)/doconf + $(SOURCE)/files.$(ARCH_CONFIG_LC) $(_v)(doconf_target=$(addsuffix /conf, $(TARGET)); \ $(MKDIR) $${doconf_target}; \ cd $${doconf_target}; \ rm -f $(notdir $?); \ cp $? 
$${doconf_target}; \ - $(COMPOBJROOT)/doconf -c -cpu $(ARCH_CONFIG_LC) -d $(TARGET)/$(SECURITY_KERNEL_CONFIG) $(SECURITY_KERNEL_CONFIG); \ + if [ -f $(MASTER_CPU_PER_SOC) ]; then cp $(MASTER_CPU_PER_SOC) $${doconf_target}; fi; \ + $(SRCROOT)/SETUP/config/doconf -c -cpu $(ARCH_CONFIG_LC) -soc $(MACHINE_CONFIG_LC) -d $(TARGET)/$(SECURITY_KERNEL_CONFIG) $(SECURITY_KERNEL_CONFIG); \ ); -.ORDER: $(COMPOBJROOT)/$(SECURITY_KERNEL_CONFIG)/Makefile - -do_setup_conf: $(COMPOBJROOT)/doconf \ - $(COMPOBJROOT)/$(SECURITY_KERNEL_CONFIG)/Makefile - -do_all: do_setup_conf +do_all: $(COMPOBJROOT)/$(SECURITY_KERNEL_CONFIG)/Makefile $(_v)next_source=$(subst conf/,,$(SOURCE)); \ ${MAKE} -C $(COMPOBJROOT)/$(SECURITY_KERNEL_CONFIG) \ MAKEFILES=$(TARGET)/$(SECURITY_KERNEL_CONFIG)/Makefile \ diff --git a/security/conf/Makefile.i386 b/security/conf/Makefile.i386 index 7da8f08d6..3695a666c 100644 --- a/security/conf/Makefile.i386 +++ b/security/conf/Makefile.i386 @@ -2,17 +2,6 @@ #BEGIN Machine dependent Makefile fragment for i386 ###################################################################### -# Enable -Werror for i386 builds -CFLAGS+=$(WERROR) -CWARNFLAGS= $(filter-out -Wbad-function-cast, $(CWARNFLAGS_STD)) - -# Objects that don't compile cleanly: -#OBJS_NO_WERROR = \ - -OBJS_WERROR=$(filter-out $(OBJS_NO_WERROR),$(OBJS)) - -$(OBJS_WERROR): WERROR=-Werror - ###################################################################### #END Machine dependent Makefile fragment for i386 ###################################################################### diff --git a/security/conf/Makefile.ppc b/security/conf/Makefile.ppc deleted file mode 100644 index d71f1d77b..000000000 --- a/security/conf/Makefile.ppc +++ /dev/null @@ -1,18 +0,0 @@ -###################################################################### -#BEGIN Machine dependent Makefile fragment for ppc -###################################################################### - -# Enable -Werror for ppc builds -CFLAGS+=$(WERROR) 
-CWARNFLAGS= $(filter-out -Wbad-function-cast, $(CWARNFLAGS_STD)) - -# Objects that don't compile cleanly: -#OBJS_NO_WERROR = \ - -OBJS_WERROR=$(filter-out $(OBJS_NO_WERROR),$(OBJS)) - -$(OBJS_WERROR): WERROR=-Werror - -###################################################################### -#END Machine dependent Makefile fragment for ppc -###################################################################### diff --git a/security/conf/Makefile.template b/security/conf/Makefile.template index ebe12b2d7..fd1ffeef0 100644 --- a/security/conf/Makefile.template +++ b/security/conf/Makefile.template @@ -26,8 +26,7 @@ include $(MakeInc_def) # # XXX: CFLAGS # -CFLAGS+= -I. -imacros meta_features.h -DKERNEL -DBSD_KERNEL_PRIVATE \ - -Wall -fno-common +CFLAGS+= -I. -include meta_features.h -DBSD_KERNEL_PRIVATE # # Directories for mig generated files @@ -81,17 +80,17 @@ ${OBJS}: ${OBJSDEPS} LDOBJS = $(OBJS) -$(COMPONENT).o: $(LDOBJS) +$(COMPONENT).filelist: $(LDOBJS) $(_v)$(RM) $(RMFLAGS) vers.c - $(_v)$(COMPOBJROOT)/newvers \ + $(_v)$(SRCROOT)/SETUP/newvers \ `$(CAT) ${VERSION_FILES}` ${COPYRIGHT_FILES} $(_v)${KCC} $(CFLAGS) $(INCLUDES) -c vers.c @echo LDFILELIST $(COMPONENT) $(_v)( for obj in ${LDOBJS} vers.o; do \ echo $(TARGET)$(COMP_OBJ_DIR)/$(KERNEL_CONFIG)/$${obj}; \ - done; ) > $(COMPONENT).o + done; ) > $(COMPONENT).filelist -do_all: $(COMPONENT).o +do_all: $(COMPONENT).filelist do_depend: do_all $(_v)${MD} -u Makedep -f -d `ls *.d`; diff --git a/security/conf/Makefile.x86_64 b/security/conf/Makefile.x86_64 index 64c2b46d5..7b0de925d 100644 --- a/security/conf/Makefile.x86_64 +++ b/security/conf/Makefile.x86_64 @@ -2,17 +2,6 @@ #BEGIN Machine dependent Makefile fragment for x86_64 ###################################################################### -# Enable -Werror for x86_64 builds -CFLAGS+=$(WERROR) -CWARNFLAGS= $(filter-out -Wbad-function-cast, $(CWARNFLAGS_STD)) - -# Objects that don't compile cleanly: -#OBJS_NO_WERROR = \ - -OBJS_WERROR=$(filter-out 
$(OBJS_NO_WERROR),$(OBJS)) - -$(OBJS_WERROR): WERROR=-Werror - ###################################################################### #END Machine dependent Makefile fragment for x86_64 ###################################################################### diff --git a/security/conf/files b/security/conf/files index 295d886c2..8c077cb19 100644 --- a/security/conf/files +++ b/security/conf/files @@ -33,3 +33,4 @@ security/mac_pipe.c optional config_macf security/mac_iokit.c optional config_macf security/mac_file.c optional config_macf security/mac_inet.c optional config_macf_net +security/mac_priv.c optional config_macf diff --git a/security/conf/files.i386 b/security/conf/files.i386 index 8b1378917..e69de29bb 100644 --- a/security/conf/files.i386 +++ b/security/conf/files.i386 @@ -1 +0,0 @@ - diff --git a/security/conf/files.ppc b/security/conf/files.ppc deleted file mode 100644 index 8b1378917..000000000 --- a/security/conf/files.ppc +++ /dev/null @@ -1 +0,0 @@ - diff --git a/security/conf/tools/Makefile b/security/conf/tools/Makefile deleted file mode 100644 index a8111c252..000000000 --- a/security/conf/tools/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -SETUP_SUBDIRS = \ - doconf \ - newvers - -COMP_SUBDIRS = \ - doconf \ - newvers - -INST_SUBDIRS = \ - - -setup_build_all: - -do_build_all: - -setup_build_install: - -do_build_install: - -include $(MakeInc_rule) -include $(MakeInc_dir) - - diff --git a/security/conf/tools/doconf/Makefile b/security/conf/tools/doconf/Makefile deleted file mode 100644 index 7794a4ceb..000000000 --- a/security/conf/tools/doconf/Makefile +++ /dev/null @@ -1,49 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export 
MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -COMP_SUBDIRS = \ - -INST_SUBDIRS = \ - - -# -# Who and where -# -BINDIR= -ifneq ($(MACHINE_CONFIG), DEFAULT) -DSTDIR= $(strip $(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)_$(MACHINE_CONFIG)/$(COMPONENT)/) -else -DSTDIR= $(strip $(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)/$(COMPONENT)/) -endif -PROGRAM= $(DSTDIR)doconf - -# -# How to install it -# -IFLAGS= -c -m 555 - -$(PROGRAM): $(DSTDIR)% : $(SOURCE)%.csh - @-$(RM) $(RMFLAGS) $(notdir $(PROGRAM)).VERS - @sed -e "s/#PROGRAM.*/#`vers_string $(notdir $(PROGRAM))`/" \ - < $< >$(notdir $(PROGRAM)).VERS; - @install $(IFLAGS) $(notdir $(PROGRAM)).VERS $(PROGRAM); - @-$(RM) $(RMFLAGS) $(notdir $(PROGRAM)).VERS; - -do_build_setup: $(PROGRAM) - -do_build_all: - -setup_build_install: - -do_build_install: - -include $(MakeInc_rule) -include $(MakeInc_dir) - - diff --git a/security/conf/tools/doconf/doconf.csh b/security/conf/tools/doconf/doconf.csh deleted file mode 100644 index 6fedb4786..000000000 --- a/security/conf/tools/doconf/doconf.csh +++ /dev/null @@ -1,321 +0,0 @@ -#!/bin/csh -f -set path = ($path .) -###################################################################### -# HISTORY -# 1-Dec-87 Michael Young (mwyoung) at Carnegie-Mellon University -# Added "-verbose" switch, so this script produces no output -# in the normal case. -# -# 10-Oct-87 Mike Accetta (mja) at Carnegie-Mellon University -# Flushed cmu_*.h and spin_locks.h -# [ V5.1(XF18) ] -# -# 6-Apr-87 Avadis Tevanian (avie) at Carnegie-Mellon University -# Use MASTER.local and MASTER..local for generation of -# configuration files in addition to MASTER and MASTER.. -# -# 25-Mar-87 Mike Accetta (mja) at Carnegie-Mellon University -# Removed use of obsolete wb_*.h files when building the feature -# list; modified to save the previous configuration file and -# display the differences between it and the new file. 
-# [ V5.1(F8) ] -# -# 25-Mar-87 Avadis Tevanian (avie) at Carnegie-Mellon University -# If there is no /etc/machine just print out a message telling -# user to use the -cpu option. I thought this script was supposed -# to work even without a /etc/machine, but it doesn't... and this -# is the easiest way out. -# -# 13-Mar-87 Mike Accetta (mja) at Carnegie-Mellon University -# Added "romp_fpa.h" file to extra features for the RT. -# [ V5.1(F7) ] -# -# 11-Mar-87 Mike Accetta (mja) at Carnegie-Mellon University -# Updated to maintain the appropriate configuration features file -# in the "machine" directory whenever the corresponding -# configuration is generated. This replaces the old mechanism of -# storing this directly in the file since it was -# machine dependent and also precluded building programs for more -# than one configuration from the same set of sources. -# [ V5.1(F6) ] -# -# 21-Feb-87 Mike Accetta (mja) at Carnegie-Mellon University -# Fixed to require wired-in cpu type names for only those -# machines where the kernel name differs from that provided by -# /etc/machine (i.e. IBMRT => ca and SUN => sun3); updated to -# permit configuration descriptions in both machine indepedent -# and dependent master configuration files so that attributes can -# be grouped accordingly. -# [ V5.1(F3) ] -# -# 17-Jan-87 Mike Accetta (mja) at Carnegie-Mellon University -# Updated to work from any directory at the same level as -# "conf"; generate configuration from both MASTER and -# MASTER. files; added -cpu switch. -# [ V5.1(F1) ] -# -# 18-Aug-86 Mike Accetta (mja) at Carnegie-Mellon University -# Added -make switch and changed meaning of -config; upgraded to -# allow multiple attributes per configuration and to define -# configurations in terms of these attributes within MASTER. -# -# 14-Apr-83 Mike Accetta (mja) at Carnegie-Mellon University -# Added -config switch to only run /etc/config without -# "make depend" and "make". 
-# -###################################################################### - -set prog=$0 -set prog=$prog:t -set nonomatch -set OBJDIR=../BUILD -if ("`/usr/bin/uname`" == "Rhapsody" ) then -set CONFIG_DIR=/usr/local/bin -else -set CONFIG_DIR=/usr/bin -endif - -unset domake -unset doconfig -unset beverbose -unset MACHINE -unset profile - -while ($#argv >= 1) - if ("$argv[1]" =~ -*) then - switch ("$argv[1]") - case "-c": - case "-config": - set doconfig - breaksw - case "-m": - case "-make": - set domake - breaksw - case "-cpu": - if ($#argv < 2) then - echo "${prog}: missing argument to ${argv[1]}" - exit 1 - endif - set MACHINE="$argv[2]" - shift - breaksw - case "-d": - if ($#argv < 2) then - echo "${prog}: missing argument to ${argv[1]}" - exit 1 - endif - set OBJDIR="$argv[2]" - shift - breaksw - case "-verbose": - set beverbose - breaksw - case "-p": - case "-profile": - set profile - breaksw - default: - echo "${prog}: ${argv[1]}: unknown switch" - exit 1 - breaksw - endsw - shift - else - break - endif -end - -if ($#argv == 0) set argv=(GENERIC) - -if (! $?MACHINE) then - if (-d /NextApps) then - set MACHINE=`hostinfo | awk '/MC680x0/ { printf("m68k") } /MC880x0/ { printf("m88k") }'` - endif -endif - -if (! $?MACHINE) then - if (-f /etc/machine) then - set MACHINE="`/etc/machine`" - else - echo "${prog}: no /etc/machine, specify machine type with -cpu" - echo "${prog}: e.g. 
${prog} -cpu VAX CONFIGURATION" - exit 1 - endif -endif - -set FEATURES_EXTRA= - -switch ("$MACHINE") - case IBMRT: - set cpu=ca - set ID=RT - set FEATURES_EXTRA="romp_dualcall.h romp_fpa.h" - breaksw - case SUN: - set cpu=sun3 - set ID=SUN3 - breaksw - default: - set cpu=`echo $MACHINE | tr A-Z a-z` - set ID=`echo $MACHINE | tr a-z A-Z` - breaksw -endsw -set FEATURES=../h/features.h -set FEATURES_H=(cs_*.h mach_*.h net_*.h\ - cputypes.h cpus.h vice.h\ - $FEATURES_EXTRA) -set MASTER_DIR=../conf -set MASTER = ${MASTER_DIR}/MASTER -set MASTER_CPU=${MASTER}.${cpu} - -set MASTER_LOCAL = ${MASTER}.local -set MASTER_CPU_LOCAL = ${MASTER_CPU}.local -if (! -f $MASTER_LOCAL) set MASTER_LOCAL = "" -if (! -f $MASTER_CPU_LOCAL) set MASTER_CPU_LOCAL = "" - -if (! -d $OBJDIR) then - if ($?beverbose) then - echo "[ creating $OBJDIR ]" - endif - mkdir -p $OBJDIR -endif - -foreach SYS ($argv) - set SYSID=${SYS}_${ID} - set SYSCONF=$OBJDIR/config.$SYSID - set BLDDIR=$OBJDIR - if ($?beverbose) then - echo "[ generating $SYSID from $MASTER_DIR/MASTER{,.$cpu}{,.local} ]" - endif - echo +$SYS \ - | \ - cat $MASTER $MASTER_LOCAL $MASTER_CPU $MASTER_CPU_LOCAL - \ - $MASTER $MASTER_LOCAL $MASTER_CPU $MASTER_CPU_LOCAL \ - | \ - sed -n \ - -e "/^+/{" \ - -e "s;[-+];#&;gp" \ - -e 't loop' \ - -e ': loop' \ - -e 'n' \ - -e '/^#/b loop' \ - -e '/^$/b loop' \ - -e 's;^\([^#]*\).*#[ ]*<\(.*\)>[ ]*$;\2#\1;' \ - -e 't not' \ - -e 's;\([^#]*\).*;#\1;' \ - -e 't not' \ - -e ': not' \ - -e 's;[ ]*$;;' \ - -e 's;^\!\(.*\);\1#\!;' \ - -e 'p' \ - -e 't loop' \ - -e 'b loop' \ - -e '}' \ - -e "/^[^#]/d" \ - -e 's; ; ;g' \ - -e "s;^# *\([^ ]*\)[ ]*=[ ]*\[\(.*\)\].*;\1#\2;p" \ - | \ - awk '-F#' '\ -part == 0 && $1 != "" {\ - m[$1]=m[$1] " " $2;\ - next;\ -}\ -part == 0 && $1 == "" {\ - for (i=NF;i>1;i--){\ - s=substr($i,2);\ - c[++na]=substr($i,1,1);\ - a[na]=s;\ - }\ - while (na > 0){\ - s=a[na];\ - d=c[na--];\ - if (m[s] == "") {\ - f[s]=d;\ - } else {\ - nx=split(m[s],x," ");\ - for (j=nx;j>0;j--) {\ - 
z=x[j];\ - a[++na]=z;\ - c[na]=d;\ - }\ - }\ - }\ - part=1;\ - next;\ -}\ -part != 0 {\ - if ($1 != "") {\ - n=split($1,x,",");\ - ok=0;\ - for (i=1;i<=n;i++) {\ - if (f[x[i]] == "+") {\ - ok=1;\ - }\ - }\ - if (NF > 2 && ok == 0 || NF <= 2 && ok != 0) {\ - print $2; \ - }\ - } else { \ - print $2; \ - }\ -}\ -' >$SYSCONF.new - if (-z $SYSCONF.new) then - echo "${prog}: ${$SYSID}: no such configuration in $MASTER_DIR/MASTER{,.$cpu}" - rm -f $SYSCONF.new - endif - if (! -d $BLDDIR) then - if ($?beverbose) then - echo "[ creating $BLDDIR ]" - endif - mkdir -p $BLDDIR - endif -# -# These paths are used by config. -# -# "builddir" is the name of the directory where kernel binaries -# are put. It is a single path element, never absolute, and is -# always relative to "objectdir". "builddir" is used by config -# solely to determine where to put files created by "config" (e.g. -# the created Makefile and *.h's.) -# -# "objectdir" is the name of the directory which will hold "builddir". -# It is a path; if relative, it is relative to the current directory -# where config is run. It's sole use is to be prepended to "builddir" -# to indicate where config-created files are to be placed (see above). -# -# "sourcedir" is the location of the sources used to build the kernel. -# It is a path; if relative, it is relative to the directory specified -# by the concatenation of "objectdir" and "builddir" (i.e. where the -# kernel binaries are put). 
-# - echo 'builddir "."' >> $SYSCONF.new - set OBJRELDIR=`$RELPATH $OBJROOT $OBJDIR` - echo 'objectdir "'$OBJROOT'/'$OBJRELDIR'"' >> $SYSCONF.new - set SRCDIR=`dirname $SOURCE` - echo 'sourcedir "'$SRCROOT'"' >> $SYSCONF.new - if (-f $SYSCONF) then - diff $SYSCONF $SYSCONF.new - rm -f $SYSCONF.old - mv $SYSCONF $SYSCONF.old - endif - rm -f $SYSCONF - mv $SYSCONF.new $SYSCONF - if ($?doconfig) then - if ($?beverbose) then - echo "[ configuring $SYSID ]" - endif - if ($?profile) then - $CONFIG_DIR/config -c $MASTER_DIR -p $SYSCONF - else - $CONFIG_DIR/config -c $MASTER_DIR $SYSCONF - endif - endif - if ($?domake) then - if ($?beverbose) then - echo "[ making $SYSID ]" - endif - (cd $BLDDIR; make) - endif -end diff --git a/security/conf/tools/newvers/Makefile b/security/conf/tools/newvers/Makefile deleted file mode 100644 index a430a1fd5..000000000 --- a/security/conf/tools/newvers/Makefile +++ /dev/null @@ -1,47 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -COMP_SUBDIRS = \ - -INST_SUBDIRS = \ - - -# -# Who and where -# -BINDIR= -ifneq ($(MACHINE_CONFIG), DEFAULT) -DSTDIR= $(strip $(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)_$(MACHINE_CONFIG)/$(COMPONENT)/) -else -DSTDIR= $(strip $(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)/$(COMPONENT)/) -endif -PROGRAM= $(DSTDIR)newvers - -# -# How to install it -# -IFLAGS= -c -m 555 - -$(PROGRAM): $(DSTDIR)% : $(SOURCE)%.csh - @-$(RM) $(RMFLAGS) $(notdir $(PROGRAM)).VERS - @sed -e "s/#PROGRAM.*/#`vers_string $(notdir $(PROGRAM))`/" \ - < $< >$(notdir $(PROGRAM)).VERS; - @install $(IFLAGS) $(notdir $(PROGRAM)).VERS $(PROGRAM); - @-$(RM) $(RMFLAGS) $(notdir $(PROGRAM)).VERS; - -do_build_setup: $(PROGRAM) - -do_build_all: - -setup_build_install: - -do_build_install: - -include $(MakeInc_rule) -include 
$(MakeInc_dir) diff --git a/security/mac.h b/security/mac.h index 936abc34a..3e0cf7a89 100644 --- a/security/mac.h +++ b/security/mac.h @@ -101,6 +101,16 @@ struct user_mac { user_size_t m_buflen; user_addr_t m_string; }; + +struct user32_mac { + uint32_t m_buflen; + uint32_t m_string; +}; + +struct user64_mac { + uint64_t m_buflen; + uint64_t m_string; +}; #endif /* KERNEL */ /* @@ -131,6 +141,14 @@ struct user_mac { #define MAC_DEVICE_FIREWIRE "FireWire" #define MAC_DEVICE_TYPE_KEY "DeviceType" +/* + * Flags for mac_proc_check_suspend_resume() + */ +#define MAC_PROC_CHECK_SUSPEND 0 +#define MAC_PROC_CHECK_RESUME 1 +#define MAC_PROC_CHECK_HIBERNATE 2 +#define MAC_PROC_CHECK_SHUTDOWN_SOCKETS 3 + #ifndef KERNEL /* * Location of the userland MAC framework configuration file. mac.conf diff --git a/security/mac_alloc.h b/security/mac_alloc.h index 70e1baef0..956b4344b 100644 --- a/security/mac_alloc.h +++ b/security/mac_alloc.h @@ -71,6 +71,7 @@ void mac_zfree (zone_t zone, void *elem); #define Z_COLLECT 2 /* Make zone collectable */ #define Z_EXPAND 3 /* Make zone expandable */ #define Z_FOREIGN 4 /* Allow collectable zone to contain foreign elements */ +#define Z_CALLERACCT 5 /* Account alloc/free against the caller */ #endif /* __APPLE_API_EVOLVING */ #endif /* _SECURITY_MAC_ALLOC_H_ */ diff --git a/security/mac_audit.c b/security/mac_audit.c index 504c55ae8..7fe8b5705 100644 --- a/security/mac_audit.c +++ b/security/mac_audit.c @@ -74,10 +74,6 @@ #include #include - -int mac_audit(__unused int len, __unused u_char *data); - - #if CONFIG_AUDIT /* The zone allocator is initialized in mac_base.c. 
*/ @@ -394,13 +390,6 @@ mac_audit_check_postselect(__unused struct ucred *cred, __unused unsigned short return (MAC_AUDIT_DEFAULT); } -int -mac_audit(__unused int len, __unused u_char *data) -{ - - return (0); -} - int mac_audit_text(__unused char *text, __unused mac_policy_handle_t handle) { diff --git a/security/mac_base.c b/security/mac_base.c index 8b2eabff2..1b67d3c0e 100644 --- a/security/mac_base.c +++ b/security/mac_base.c @@ -140,7 +140,7 @@ MODULE_VERSION(kernel_mac_support, 1); static unsigned int mac_max_slots = MAC_MAX_SLOTS; static unsigned int mac_slot_offsets_free = (1 << MAC_MAX_SLOTS) - 1; -SYSCTL_UINT(_security_mac, OID_AUTO, max_slots, CTLFLAG_RD, +SYSCTL_UINT(_security_mac, OID_AUTO, max_slots, CTLFLAG_RD | CTLFLAG_LOCKED, &mac_max_slots, 0, ""); /* @@ -163,7 +163,7 @@ int mac_late = 0; */ #if CONFIG_MACF_NET unsigned int mac_label_mbufs = 1; -SYSCTL_UINT(_security_mac, OID_AUTO, label_mbufs, CTLFLAG_RW, +SYSCTL_UINT(_security_mac, OID_AUTO, label_mbufs, CTLFLAG_RW | CTLFLAG_LOCKED, &mac_label_mbufs, 0, "Label all MBUFs"); #endif @@ -183,84 +183,84 @@ static int mac_labelmbufs = 0; * be a problem. 
*/ unsigned int mac_label_vnodes = 0; -SYSCTL_UINT(_security_mac, OID_AUTO, labelvnodes, CTLFLAG_RW, +SYSCTL_UINT(_security_mac, OID_AUTO, labelvnodes, CTLFLAG_RW | CTLFLAG_LOCKED, &mac_label_vnodes, 0, "Label all vnodes"); unsigned int mac_mmap_revocation = 0; -SYSCTL_UINT(_security_mac, OID_AUTO, mmap_revocation, CTLFLAG_RW, +SYSCTL_UINT(_security_mac, OID_AUTO, mmap_revocation, CTLFLAG_RW | CTLFLAG_LOCKED, &mac_mmap_revocation, 0, "Revoke mmap access to files on subject " "relabel"); unsigned int mac_mmap_revocation_via_cow = 0; -SYSCTL_UINT(_security_mac, OID_AUTO, mmap_revocation_via_cow, CTLFLAG_RW, +SYSCTL_UINT(_security_mac, OID_AUTO, mmap_revocation_via_cow, CTLFLAG_RW | CTLFLAG_LOCKED, &mac_mmap_revocation_via_cow, 0, "Revoke mmap access to files via " "copy-on-write semantics, or by removing all write access"); unsigned int mac_device_enforce = 1; -SYSCTL_UINT(_security_mac, OID_AUTO, device_enforce, CTLFLAG_RW, +SYSCTL_UINT(_security_mac, OID_AUTO, device_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, &mac_device_enforce, 0, "Enforce MAC policy on device operations"); unsigned int mac_file_enforce = 0; -SYSCTL_UINT(_security_mac, OID_AUTO, file_enforce, CTLFLAG_RW, +SYSCTL_UINT(_security_mac, OID_AUTO, file_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, &mac_file_enforce, 0, "Enforce MAC policy on file operations"); unsigned int mac_iokit_enforce = 0; -SYSCTL_UINT(_security_mac, OID_AUTO, iokit_enforce, CTLFLAG_RW, +SYSCTL_UINT(_security_mac, OID_AUTO, iokit_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, &mac_file_enforce, 0, "Enforce MAC policy on IOKit operations"); unsigned int mac_pipe_enforce = 1; -SYSCTL_UINT(_security_mac, OID_AUTO, pipe_enforce, CTLFLAG_RW, +SYSCTL_UINT(_security_mac, OID_AUTO, pipe_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, &mac_pipe_enforce, 0, "Enforce MAC policy on pipe operations"); unsigned int mac_posixsem_enforce = 1; -SYSCTL_UINT(_security_mac, OID_AUTO, posixsem_enforce, CTLFLAG_RW, +SYSCTL_UINT(_security_mac, OID_AUTO, posixsem_enforce, CTLFLAG_RW | 
CTLFLAG_LOCKED, &mac_posixsem_enforce, 0, "Enforce MAC policy on POSIX semaphores"); unsigned int mac_posixshm_enforce = 1; -SYSCTL_UINT(_security_mac, OID_AUTO, posixshm_enforce, CTLFLAG_RW, +SYSCTL_UINT(_security_mac, OID_AUTO, posixshm_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, &mac_posixshm_enforce, 0, "Enforce MAC policy on Posix Shared Memory"); unsigned int mac_proc_enforce = 1; -SYSCTL_UINT(_security_mac, OID_AUTO, proc_enforce, CTLFLAG_RW, +SYSCTL_UINT(_security_mac, OID_AUTO, proc_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, &mac_proc_enforce, 0, "Enforce MAC policy on process operations"); unsigned int mac_socket_enforce = 1; -SYSCTL_UINT(_security_mac, OID_AUTO, socket_enforce, CTLFLAG_RW, +SYSCTL_UINT(_security_mac, OID_AUTO, socket_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, &mac_socket_enforce, 0, "Enforce MAC policy on socket operations"); unsigned int mac_system_enforce = 1; -SYSCTL_UINT(_security_mac, OID_AUTO, system_enforce, CTLFLAG_RW, +SYSCTL_UINT(_security_mac, OID_AUTO, system_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, &mac_system_enforce, 0, "Enforce MAC policy on system-wide interfaces"); unsigned int mac_sysvmsg_enforce = 1; -SYSCTL_UINT(_security_mac, OID_AUTO, sysvmsg_enforce, CTLFLAG_RW, +SYSCTL_UINT(_security_mac, OID_AUTO, sysvmsg_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, &mac_sysvmsg_enforce, 0, "Enforce MAC policy on System V IPC message queues"); unsigned int mac_sysvsem_enforce = 1; -SYSCTL_UINT(_security_mac, OID_AUTO, sysvsem_enforce, CTLFLAG_RW, +SYSCTL_UINT(_security_mac, OID_AUTO, sysvsem_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, &mac_sysvsem_enforce, 0, "Enforce MAC policy on System V IPC semaphores"); unsigned int mac_sysvshm_enforce = 1; -SYSCTL_INT(_security_mac, OID_AUTO, sysvshm_enforce, CTLFLAG_RW, +SYSCTL_INT(_security_mac, OID_AUTO, sysvshm_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, &mac_sysvshm_enforce, 0, "Enforce MAC policy on System V Shared Memory"); unsigned int mac_vm_enforce = 1; -SYSCTL_INT(_security_mac, OID_AUTO, vm_enforce, CTLFLAG_RW, 
+SYSCTL_INT(_security_mac, OID_AUTO, vm_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, &mac_vm_enforce, 0, "Enforce MAC policy on VM operations"); unsigned int mac_vnode_enforce = 1; -SYSCTL_UINT(_security_mac, OID_AUTO, vnode_enforce, CTLFLAG_RW, +SYSCTL_UINT(_security_mac, OID_AUTO, vnode_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, &mac_vnode_enforce, 0, "Enforce MAC policy on vnode operations"); #if CONFIG_MACF_MACH unsigned int mac_port_enforce = 0; -SYSCTL_UINT(_security_mac, OID_AUTO, port_enforce, CTLFLAG_RW, +SYSCTL_UINT(_security_mac, OID_AUTO, port_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, &mac_port_enforce, 0, "Enforce MAC policy on Mach port operations"); unsigned int mac_task_enforce = 0; -SYSCTL_UINT(_security_mac, OID_AUTO, task_enforce, CTLFLAG_RW, +SYSCTL_UINT(_security_mac, OID_AUTO, task_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, &mac_task_enforce, 0, "Enforce MAC policy on Mach task operations"); #endif @@ -1346,12 +1346,15 @@ __mac_get_pid(struct proc *p, struct __mac_get_pid_args *uap, int *ret __unused) AUDIT_ARG(pid, uap->pid); if (IS_64BIT_PROCESS(p)) { - error = copyin(uap->mac_p, &mac, sizeof(mac)); + struct user64_mac mac64; + error = copyin(uap->mac_p, &mac64, sizeof(mac64)); + mac.m_buflen = mac64.m_buflen; + mac.m_string = mac64.m_string; } else { - struct mac mac32; + struct user32_mac mac32; error = copyin(uap->mac_p, &mac32, sizeof(mac32)); mac.m_buflen = mac32.m_buflen; - mac.m_string = CAST_USER_ADDR_T(mac32.m_string); + mac.m_string = mac32.m_string; } if (error) return (error); @@ -1397,12 +1400,15 @@ __mac_get_proc(proc_t p, struct __mac_get_proc_args *uap, int *ret __unused) size_t ulen; if (IS_64BIT_PROCESS(p)) { - error = copyin(uap->mac_p, &mac, sizeof(mac)); + struct user64_mac mac64; + error = copyin(uap->mac_p, &mac64, sizeof(mac64)); + mac.m_buflen = mac64.m_buflen; + mac.m_string = mac64.m_string; } else { - struct mac mac32; + struct user32_mac mac32; error = copyin(uap->mac_p, &mac32, sizeof(mac32)); mac.m_buflen = mac32.m_buflen; - 
mac.m_string = CAST_USER_ADDR_T(mac32.m_string); + mac.m_string = mac32.m_string; } if (error) return (error); @@ -1444,12 +1450,15 @@ __mac_set_proc(proc_t p, struct __mac_set_proc_args *uap, int *ret __unused) size_t ulen; if (IS_64BIT_PROCESS(p)) { - error = copyin(uap->mac_p, &mac, sizeof(mac)); + struct user64_mac mac64; + error = copyin(uap->mac_p, &mac64, sizeof(mac64)); + mac.m_buflen = mac64.m_buflen; + mac.m_string = mac64.m_string; } else { - struct mac mac32; + struct user32_mac mac32; error = copyin(uap->mac_p, &mac32, sizeof(mac32)); mac.m_buflen = mac32.m_buflen; - mac.m_string = CAST_USER_ADDR_T(mac32.m_string); + mac.m_string = mac32.m_string; } if (error) return (error); @@ -1526,12 +1535,15 @@ __mac_get_lcid(proc_t p, struct __mac_get_lcid_args *uap, int *ret __unused) AUDIT_ARG(value32, uap->lcid); if (IS_64BIT_PROCESS(p)) { - error = copyin(uap->mac_p, &mac, sizeof(mac)); + struct user64_mac mac64; + error = copyin(uap->mac_p, &mac64, sizeof(mac64)); + mac.m_buflen = mac64.m_buflen; + mac.m_string = mac64.m_string; } else { - struct mac mac32; + struct user32_mac mac32; error = copyin(uap->mac_p, &mac32, sizeof(mac32)); mac.m_buflen = mac32.m_buflen; - mac.m_string = CAST_USER_ADDR_T(mac32.m_string); + mac.m_string = mac32.m_string; } if (error) @@ -1590,12 +1602,15 @@ __mac_get_lctx(proc_t p, struct __mac_get_lctx_args *uap, int *ret __unused) size_t ulen; if (IS_64BIT_PROCESS(p)) { - error = copyin(uap->mac_p, &mac, sizeof(mac)); + struct user64_mac mac64; + error = copyin(uap->mac_p, &mac64, sizeof(mac64)); + mac.m_buflen = mac64.m_buflen; + mac.m_string = mac64.m_string; } else { - struct mac mac32; + struct user32_mac mac32; error = copyin(uap->mac_p, &mac32, sizeof(mac32)); mac.m_buflen = mac32.m_buflen; - mac.m_string = CAST_USER_ADDR_T(mac32.m_string); + mac.m_string = mac32.m_string; } if (error) @@ -1643,12 +1658,15 @@ __mac_set_lctx(proc_t p, struct __mac_set_lctx_args *uap, int *ret __unused) size_t ulen; if (IS_64BIT_PROCESS(p)) { 
- error = copyin(uap->mac_p, &mac, sizeof(mac)); + struct user64_mac mac64; + error = copyin(uap->mac_p, &mac64, sizeof(mac64)); + mac.m_buflen = mac64.m_buflen; + mac.m_string = mac64.m_string; } else { - struct mac mac32; + struct user32_mac mac32; error = copyin(uap->mac_p, &mac32, sizeof(mac32)); mac.m_buflen = mac32.m_buflen; - mac.m_string = CAST_USER_ADDR_T(mac32.m_string); + mac.m_string = mac32.m_string; } if (error) return (error); @@ -1732,12 +1750,15 @@ __mac_get_fd(proc_t p, struct __mac_get_fd_args *uap, int *ret __unused) AUDIT_ARG(fd, uap->fd); if (IS_64BIT_PROCESS(p)) { - error = copyin(uap->mac_p, &mac, sizeof(mac)); + struct user64_mac mac64; + error = copyin(uap->mac_p, &mac64, sizeof(mac64)); + mac.m_buflen = mac64.m_buflen; + mac.m_string = mac64.m_string; } else { - struct mac mac32; + struct user32_mac mac32; error = copyin(uap->mac_p, &mac32, sizeof(mac32)); mac.m_buflen = mac32.m_buflen; - mac.m_string = CAST_USER_ADDR_T(mac32.m_string); + mac.m_string = mac32.m_string; } if (error) @@ -1834,12 +1855,15 @@ mac_get_filelink(proc_t p, user_addr_t mac_p, user_addr_t path_p, int follow) size_t ulen; if (IS_64BIT_PROCESS(p)) { - error = copyin(mac_p, &mac, sizeof(mac)); + struct user64_mac mac64; + error = copyin(mac_p, &mac64, sizeof(mac64)); + mac.m_buflen = mac64.m_buflen; + mac.m_string = mac64.m_string; } else { - struct mac mac32; + struct user32_mac mac32; error = copyin(mac_p, &mac32, sizeof(mac32)); mac.m_buflen = mac32.m_buflen; - mac.m_string = CAST_USER_ADDR_T(mac32.m_string); + mac.m_string = mac32.m_string; } if (error) @@ -1862,7 +1886,7 @@ mac_get_filelink(proc_t p, user_addr_t mac_p, user_addr_t path_p, int follow) ctx = vfs_context_current(); - NDINIT(&nd, LOOKUP, + NDINIT(&nd, LOOKUP, OP_LOOKUP, LOCKLEAF | (follow ? 
FOLLOW : NOFOLLOW) | AUDITVNPATH1, UIO_USERSPACE, path_p, ctx); error = namei(&nd); @@ -1926,12 +1950,15 @@ __mac_set_fd(proc_t p, struct __mac_set_fd_args *uap, int *ret __unused) AUDIT_ARG(fd, uap->fd); if (IS_64BIT_PROCESS(p)) { - error = copyin(uap->mac_p, &mac, sizeof(mac)); + struct user64_mac mac64; + error = copyin(uap->mac_p, &mac64, sizeof(mac64)); + mac.m_buflen = mac64.m_buflen; + mac.m_string = mac64.m_string; } else { - struct mac mac32; + struct user32_mac mac32; error = copyin(uap->mac_p, &mac32, sizeof(mac32)); mac.m_buflen = mac32.m_buflen; - mac.m_string = CAST_USER_ADDR_T(mac32.m_string); + mac.m_string = mac32.m_string; } if (error) return (error); @@ -2034,12 +2061,15 @@ mac_set_filelink(proc_t p, user_addr_t mac_p, user_addr_t path_p, return ENOSYS; if (IS_64BIT_PROCESS(p)) { - error = copyin(mac_p, &mac, sizeof(mac)); + struct user64_mac mac64; + error = copyin(mac_p, &mac64, sizeof(mac64)); + mac.m_buflen = mac64.m_buflen; + mac.m_string = mac64.m_string; } else { - struct mac mac32; + struct user32_mac mac32; error = copyin(mac_p, &mac32, sizeof(mac32)); mac.m_buflen = mac32.m_buflen; - mac.m_string = CAST_USER_ADDR_T(mac32.m_string); + mac.m_string = mac32.m_string; } if (error) return (error); @@ -2066,7 +2096,7 @@ mac_set_filelink(proc_t p, user_addr_t mac_p, user_addr_t path_p, return (error); } - NDINIT(&nd, LOOKUP, + NDINIT(&nd, LOOKUP, OP_LOOKUP, LOCKLEAF | (follow ? 
FOLLOW : NOFOLLOW) | AUDITVNPATH1, UIO_USERSPACE, path_p, ctx); error = namei(&nd); @@ -2175,12 +2205,15 @@ mac_mount_label_get(struct mount *mp, user_addr_t mac_p) size_t ulen; if (IS_64BIT_PROCESS(current_proc())) { - error = copyin(mac_p, &mac, sizeof(mac)); + struct user64_mac mac64; + error = copyin(mac_p, &mac64, sizeof(mac64)); + mac.m_buflen = mac64.m_buflen; + mac.m_string = mac64.m_string; } else { - struct mac mac32; + struct user32_mac mac32; error = copyin(mac_p, &mac32, sizeof(mac32)); mac.m_buflen = mac32.m_buflen; - mac.m_string = CAST_USER_ADDR_T(mac32.m_string); + mac.m_string = mac32.m_string; } if (error) return (error); @@ -2232,7 +2265,7 @@ __mac_get_mount(proc_t p __unused, struct __mac_get_mount_args *uap, struct mount *mp; int error; - NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) { diff --git a/security/mac_framework.h b/security/mac_framework.h index 8331596a5..20780b249 100644 --- a/security/mac_framework.h +++ b/security/mac_framework.h @@ -122,6 +122,17 @@ struct vnode; struct vnode_attr; struct vop_setlabel_args; +#ifndef __IOKIT_PORTS_DEFINED__ +#define __IOKIT_PORTS_DEFINED__ +#ifdef __cplusplus +class OSObject; +typedef OSObject *io_object_t; +#else +struct OSObject; +typedef struct OSObject *io_object_t; +#endif +#endif /* __IOKIT_PORTS_DEFINED__ */ + /*@ macros */ #define VNODE_LABEL_CREATE 1 @@ -212,6 +223,9 @@ int mac_inpcb_label_init(struct inpcb *inp, int flag); void mac_inpcb_label_recycle(struct inpcb *inp); void mac_inpcb_label_update(struct socket *so); int mac_iokit_check_device(char *devtype, struct mac_module_data *mdata); +int mac_iokit_check_open(kauth_cred_t cred, io_object_t user_client, unsigned int user_client_type); +int mac_iokit_check_set_properties(kauth_cred_t cred, io_object_t registry_entry, io_object_t properties); +int mac_iokit_check_hid_control(kauth_cred_t cred); void 
mac_ipq_label_associate(struct mbuf *fragment, struct ipq *ipq); int mac_ipq_label_compare(struct mbuf *fragment, struct ipq *ipq); void mac_ipq_label_destroy(struct ipq *ipq); @@ -299,7 +313,7 @@ int mac_posixshm_check_mmap(kauth_cred_t cred, struct pshminfo *pshm, int mac_posixshm_check_open(kauth_cred_t cred, struct pshminfo *pshm); int mac_posixshm_check_stat(kauth_cred_t cred, struct pshminfo *pshm); int mac_posixshm_check_truncate(kauth_cred_t cred, struct pshminfo *pshm, - size_t s); + off_t s); int mac_posixshm_check_unlink(kauth_cred_t cred, struct pshminfo *pshm, const char *name); void mac_posixshm_vnode_label_associate(kauth_cred_t cred, @@ -309,6 +323,8 @@ void mac_posixshm_label_associate(kauth_cred_t cred, struct pshminfo *pshm, const char *name); void mac_posixshm_label_destroy(struct pshminfo *pshm); void mac_posixshm_label_init(struct pshminfo *pshm); +int mac_priv_check(kauth_cred_t cred, int priv); +int mac_priv_grant(kauth_cred_t cred, int priv); int mac_proc_check_debug(proc_t proc1, proc_t proc2); int mac_proc_check_fork(proc_t proc); int mac_proc_check_suspend_resume(proc_t proc, int sr); @@ -318,6 +334,8 @@ int mac_proc_check_getaudit(proc_t proc); int mac_proc_check_getauid(proc_t proc); int mac_proc_check_getlcid(proc_t proc1, proc_t proc2, pid_t pid); +int mac_proc_check_map_anon(proc_t proc, user_addr_t u_addr, + user_size_t u_size, int prot, int flags, int *maxprot); int mac_proc_check_mprotect(proc_t proc, user_addr_t addr, user_size_t size, int prot); int mac_proc_check_run_cs_invalid(proc_t proc); @@ -373,6 +391,7 @@ int mac_system_check_acct(kauth_cred_t cred, struct vnode *vp); int mac_system_check_audit(kauth_cred_t cred, void *record, int length); int mac_system_check_auditctl(kauth_cred_t cred, struct vnode *vp); int mac_system_check_auditon(kauth_cred_t cred, int cmd); +int mac_system_check_chud(kauth_cred_t cred); int mac_system_check_host_priv(kauth_cred_t cred); int mac_system_check_nfsd(kauth_cred_t cred); int 
mac_system_check_reboot(kauth_cred_t cred, int howto); @@ -426,7 +445,6 @@ void mac_sysvshm_label_associate(kauth_cred_t cred, void mac_sysvshm_label_destroy(struct shmid_kernel *shmsegptr); void mac_sysvshm_label_init(struct shmid_kernel* shmsegptr); void mac_sysvshm_label_recycle(struct shmid_kernel *shmsegptr); -void mac_thread_userret(int code, int error, struct thread *thread); int mac_vnode_check_access(vfs_context_t ctx, struct vnode *vp, int acc_mode); int mac_vnode_check_chdir(vfs_context_t ctx, struct vnode *dvp); @@ -440,6 +458,7 @@ int mac_vnode_check_exchangedata(vfs_context_t ctx, struct vnode *v1, struct vnode *v2); int mac_vnode_check_exec(vfs_context_t ctx, struct vnode *vp, struct image_params *imgp); +int mac_vnode_check_fsgetpath(vfs_context_t ctx, struct vnode *vp); int mac_vnode_check_signature(struct vnode *vp, unsigned char *sha1, void * signature, size_t size); int mac_vnode_check_getattrlist(vfs_context_t ctx, struct vnode *vp, @@ -468,6 +487,8 @@ int mac_vnode_check_rename_from(vfs_context_t ctx, struct vnode *dvp, int mac_vnode_check_rename_to(vfs_context_t ctx, struct vnode *dvp, struct vnode *vp, int samedir, struct componentname *cnp); int mac_vnode_check_revoke(vfs_context_t ctx, struct vnode *vp); +int mac_vnode_check_searchfs(vfs_context_t ctx, struct vnode *vp, + struct attrlist *alist); int mac_vnode_check_select(vfs_context_t ctx, struct vnode *vp, int which); int mac_vnode_check_setattrlist(vfs_context_t ctxd, struct vnode *vp, @@ -516,6 +537,8 @@ void mac_vnode_label_update_extattr(struct mount *mp, struct vnode *vp, const char *name); int mac_vnode_notify_create(vfs_context_t ctx, struct mount *mp, struct vnode *dvp, struct vnode *vp, struct componentname *cnp); +void mac_vnode_notify_rename(vfs_context_t ctx, struct vnode *vp, + struct vnode *dvp, struct componentname *cnp); int vnode_label(struct mount *mp, struct vnode *dvp, struct vnode *vp, struct componentname *cnp, int flags, vfs_context_t ctx); void 
vnode_relabel(struct vnode *vp); diff --git a/security/mac_internal.h b/security/mac_internal.h index 283fef64a..6e8ae3d2a 100644 --- a/security/mac_internal.h +++ b/security/mac_internal.h @@ -332,6 +332,44 @@ struct label *mac_mbuf_to_label(struct mbuf *m); } \ } while (0) +/* + * MAC_GRANT performs the designated check by walking the policy + * module list and checking with each as to how it feels about the + * request. Unlike MAC_CHECK, it grants if any policies return '0', + * and otherwise returns EPERM. Note that it returns its value via + * 'error' in the scope of the caller. + */ +#define MAC_GRANT(check, args...) do { \ + struct mac_policy_conf *mpc; \ + u_int i; \ + \ + error = EPERM; \ + for (i = 0; i < mac_policy_list.staticmax; i++) { \ + mpc = mac_policy_list.entries[i].mpc; \ + if (mpc == NULL) \ + continue; \ + \ + if (mpc->mpc_ops->mpo_ ## check != NULL) { \ + if (mpc->mpc_ops->mpo_ ## check (args) == 0) \ + error = 0; \ + } \ + } \ + if (mac_policy_list_conditional_busy() != 0) { \ + for (; i <= mac_policy_list.maxindex; i++) { \ + mpc = mac_policy_list.entries[i].mpc; \ + if (mpc == NULL) \ + continue; \ + \ + if (mpc->mpc_ops->mpo_ ## check != NULL) { \ + if (mpc->mpc_ops->mpo_ ## check (args) \ + == 0) \ + error = 0; \ + } \ + } \ + mac_policy_list_unbusy(); \ + } \ +} while (0) + /* * MAC_BOOLEAN performs the designated boolean composition by walking * the module list, invoking each instance of the operation, and diff --git a/security/mac_iokit.c b/security/mac_iokit.c index 6212d59d3..0207dbf20 100644 --- a/security/mac_iokit.c +++ b/security/mac_iokit.c @@ -73,3 +73,30 @@ mac_iokit_check_device(char *devtype, struct mac_module_data *mdata) MAC_CHECK(iokit_check_device, devtype, mdata); return (error); } + +int +mac_iokit_check_open(kauth_cred_t cred, io_object_t user_client, unsigned int user_client_type) +{ + int error; + + MAC_CHECK(iokit_check_open, cred, user_client, user_client_type); + return (error); +} + +int 
+mac_iokit_check_set_properties(kauth_cred_t cred, io_object_t registry_entry, io_object_t properties) +{ + int error; + + MAC_CHECK(iokit_check_set_properties, cred, registry_entry, properties); + return (error); +} + +int +mac_iokit_check_hid_control(kauth_cred_t cred) +{ + int error; + + MAC_CHECK(iokit_check_hid_control, cred); + return (error); +} diff --git a/security/mac_label.c b/security/mac_label.c index 0f4e21524..b05c43b84 100644 --- a/security/mac_label.c +++ b/security/mac_label.c @@ -48,6 +48,7 @@ mac_labelzone_init(void) sizeof(struct label), "MAC Labels"); zone_change(zone_label, Z_EXPAND, TRUE); zone_change(zone_label, Z_EXHAUST, FALSE); + zone_change(zone_label, Z_CALLERACCT, FALSE); } struct label * diff --git a/security/mac_net.c b/security/mac_net.c index cd452be5e..e06837b06 100644 --- a/security/mac_net.c +++ b/security/mac_net.c @@ -307,8 +307,8 @@ mac_mbuf_label_init(struct mbuf *m, int flag) if (mac_label_mbufs == 0) return (0); - tag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_MACLABEL, - sizeof(struct label), flag); + tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_MACLABEL, + sizeof(struct label), flag, m); if (tag == NULL) { printf("%s(): m_tag_alloc() failed!\n", __func__); return (ENOBUFS); diff --git a/security/mac_policy.h b/security/mac_policy.h index cfbe80987..836be3cc0 100644 --- a/security/mac_policy.h +++ b/security/mac_policy.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2008 Apple Inc. All rights reserved. + * Copyright (c) 2007-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -115,6 +115,16 @@ struct vnode; typedef struct ucred *kauth_cred_t; #endif /* !_KAUTH_CRED_T */ +#ifndef __IOKIT_PORTS_DEFINED__ +#define __IOKIT_PORTS_DEFINED__ +#ifdef __cplusplus +class OSObject; +typedef OSObject *io_object_t; +#else +struct OSObject; +typedef struct OSObject *io_object_t; +#endif +#endif /* __IOKIT_PORTS_DEFINED__ */ /*- * MAC entry points are generally named using the following template: @@ -1199,6 +1209,56 @@ typedef int mpo_iokit_check_device_t( char *devtype, struct mac_module_data *mdata ); +/** + @brief Access control check for opening an I/O Kit device + @param cred Subject credential + @param device_path Device path + @param user_client User client instance + @param user_client_type User client type + + Determine whether the subject identified by the credential can open an + I/O Kit device at the passed path of the passed user client class and + type. + + @return Return 0 if access is granted, or an appropriate value for + errno should be returned. +*/ +typedef int mpo_iokit_check_open_t( + kauth_cred_t cred, + io_object_t user_client, + unsigned int user_client_type +); +/** + @brief Access control check for setting I/O Kit device properties + @param cred Subject credential + @param registry_entry Target device + @param properties Property list + + Determine whether the subject identified by the credential can set + properties on an I/O Kit device. + + @return Return 0 if access is granted, or an appropriate value for + errno should be returned. +*/ +typedef int mpo_iokit_check_set_properties_t( + kauth_cred_t cred, + io_object_t entry, + io_object_t properties +); +/** + @brief Access control check for software HID control + @param cred Subject credential + + Determine whether the subject identified by the credential can + control the HID (Human Interface Device) subsystem, such as to + post synthetic keypresses, pointer movement and clicks. 
+ + @return Return 0 if access is granted, or an appropriate value for + errno. +*/ +typedef int mpo_iokit_check_hid_control_t( + kauth_cred_t cred +); /** @brief Create an IP reassembly queue label @param fragment First received IP fragment @@ -2867,7 +2927,7 @@ typedef int mpo_posixshm_check_truncate_t( kauth_cred_t cred, struct pshminfo *ps, struct label *shmlabel, - size_t len + off_t len ); /** @brief Access control check for POSIX shared memory unlink @@ -2928,6 +2988,45 @@ typedef void mpo_posixshm_label_destroy_t( typedef void mpo_posixshm_label_init_t( struct label *label ); +/** + @brief Access control check for privileged operations + @param cred Subject credential + @param priv Requested privilege (see sys/priv.h) + + Determine whether the subject identified by the credential can perform + a privileged operation. Privileged operations are allowed if the cred + is the superuser or any policy returns zero for mpo_priv_grant, unless + any policy returns nonzero for mpo_priv_check. + + @return Return 0 if access is granted, otherwise EPERM should be returned. +*/ +typedef int mpo_priv_check_t( + kauth_cred_t cred, + int priv +); +/** + @brief Grant regular users the ability to perform privileged operations + @param cred Subject credential + @param priv Requested privilege (see sys/priv.h) + + Determine whether the subject identified by the credential should be + allowed to perform a privileged operation that in the absence of any + MAC policy it would not be able to perform. Privileged operations are + allowed if the cred is the superuser or any policy returns zero for + mpo_priv_grant, unless any policy returns nonzero for mpo_priv_check. + + Unlike other MAC hooks which can only reduce the privilege of a + credential, this hook raises the privilege of a credential when it + returns 0. Extreme care must be taken when implementing this hook to + avoid undermining the security of the system.
+ + @return Return 0 if additional privilege is granted, otherwise EPERM + should be returned. +*/ +typedef int mpo_priv_grant_t( + kauth_cred_t cred, + int priv +); /** @brief Access control check for debugging process @param cred Subject credential @@ -3024,6 +3123,37 @@ typedef int mpo_proc_check_getlcid_t( struct proc *p, pid_t pid ); +/** + @brief Access control check for mmap MAP_ANON + @param proc User process requesting the memory + @param cred Subject credential + @param u_addr Start address of the memory range + @param u_size Length of the memory range + @param prot mmap protections; see mmap(2) + @param flags Type of mapped object; see mmap(2) + @param maxprot Maximum rights + + Determine whether the subject identified by the credential should be + allowed to obtain anonymous memory using the specified flags and + protections on the new mapping. MAP_ANON will always be present in the + flags. Certain combinations of flags with a non-NULL addr may + cause a mapping to be rejected before this hook is called. The maxprot field + holds the maximum permissions on the new mapping, a combination of + VM_PROT_READ, VM_PROT_WRITE and VM_PROT_EXECUTE. To avoid overriding prior + access control checks, a policy should only remove flags from maxprot. + + @return Return 0 if access is granted, otherwise an appropriate value for + errno should be returned. Suggested failure: EPERM for lack of privilege.
+*/ +typedef int mpo_proc_check_map_anon_t( + struct proc *proc, + kauth_cred_t cred, + user_addr_t u_addr, + user_size_t u_size, + int prot, + int flags, + int *maxprot +); /** @brief Access control check for setting memory protections @param cred Subject credential @@ -3821,6 +3951,19 @@ typedef int mpo_system_check_auditon_t( kauth_cred_t cred, int cmd ); +/** + @brief Access control check for using CHUD facilities + @param cred Subject credential + + Determine whether the subject identified by the credential can perform + performance-related tasks using the CHUD system call. + + @return Return 0 if access is granted, otherwise an appropriate value for + errno should be returned. +*/ +typedef int mpo_system_check_chud_t( + kauth_cred_t cred +); /** @brief Access control check for obtaining the host control port @param cred Subject credential @@ -4580,21 +4723,6 @@ typedef void mpo_task_label_update_t( struct label *cred, struct label *task ); -/** - @brief Perform MAC-related events when a thread returns to user space - @param code The number of the syscall/trap that has finished - @param error The error code that will be returned to user space - @param thread Mach (not BSD) thread that is returning - - This entry point permits policy modules to perform MAC-related - events when a thread returns to user space, via a system call - return, trap return, or otherwise. -*/ -typedef void mpo_thread_userret_t( - int code, - int error, - struct thread *thread -); /** @brief Check vnode access @param cred Subject credential @@ -4751,6 +4879,23 @@ typedef int mpo_vnode_check_exec_t( struct componentname *cnp, u_int *csflags ); +/** + @brief Access control check for fsgetpath + @param cred Subject credential + @param vp Vnode for which a path will be returned + @param label Label associated with the vnode + + Determine whether the subject identified by the credential can get the path + of the given vnode with fsgetpath. 
+ + @return Return 0 if access is granted, otherwise an appropriate value for + errno should be returned. +*/ +typedef int mpo_vnode_check_fsgetpath_t( + kauth_cred_t cred, + struct vnode *vp, + struct label *label +); /** @brief Access control check after determining the code directory hash */ @@ -5091,6 +5236,25 @@ typedef int mpo_vnode_check_revoke_t( struct vnode *vp, struct label *label ); +/** + @brief Access control check for searchfs + @param cred Subject credential + @param vp Object vnode + @param vlabel Policy label for vp + @param alist List of attributes used as search criteria + + Determine whether the subject identified by the credential can search the + vnode using the searchfs system call. + + @return Return 0 if access is granted, otherwise an appropriate value for + errno should be returned. +*/ +typedef int mpo_vnode_check_searchfs_t( + kauth_cred_t cred, + struct vnode *vp, + struct label *vlabel, + struct attrlist *alist +); /** @brief Access control check for select @param cred Subject credential @@ -5775,6 +5939,26 @@ typedef int mpo_vnode_notify_create_t( struct componentname *cnp ); +/** + @brief Inform MAC policies that a vnode has been renamed + @param cred User credential for the renaming process + @param vp Vnode that's being renamed + @param label Policy label for vp + @param dvp Parent directory for the destination + @param dlabel Policy label for dvp + @param cnp Component name for the destination + + Inform MAC policies that a vnode has been renamed. + */ +typedef void mpo_vnode_notify_rename_t( + kauth_cred_t cred, + struct vnode *vp, + struct label *label, + struct vnode *dvp, + struct label *dlabel, + struct componentname *cnp +); + /* * Placeholder for future events that may need mac hooks. */ @@ -5783,7 +5967,7 @@ typedef void mpo_reserved_hook_t(void); /*! 
\struct mac_policy_ops */ -#define MAC_POLICY_OPS_VERSION 2 /* inc when new reserved slots are taken */ +#define MAC_POLICY_OPS_VERSION 11 /* inc when new reserved slots are taken */ struct mac_policy_ops { mpo_audit_check_postselect_t *mpo_audit_check_postselect; mpo_audit_check_preselect_t *mpo_audit_check_preselect; @@ -6036,7 +6220,7 @@ struct mac_policy_ops { mpo_task_label_init_t *mpo_task_label_init; mpo_task_label_internalize_t *mpo_task_label_internalize; mpo_task_label_update_t *mpo_task_label_update; - mpo_thread_userret_t *mpo_thread_userret; + mpo_iokit_check_hid_control_t *mpo_iokit_check_hid_control; mpo_vnode_check_access_t *mpo_vnode_check_access; mpo_vnode_check_chdir_t *mpo_vnode_check_chdir; mpo_vnode_check_chroot_t *mpo_vnode_check_chroot; @@ -6094,11 +6278,32 @@ struct mac_policy_ops { mpo_vnode_check_uipc_connect_t *mpo_vnode_check_uipc_connect; mac_proc_check_run_cs_invalid_t *mpo_proc_check_run_cs_invalid; mpo_proc_check_suspend_resume_t *mpo_proc_check_suspend_resume; - mpo_reserved_hook_t *mpo_reserved5; - mpo_reserved_hook_t *mpo_reserved6; - mpo_reserved_hook_t *mpo_reserved7; - mpo_reserved_hook_t *mpo_reserved8; - mpo_reserved_hook_t *mpo_reserved9; + mpo_reserved_hook_t *mpo_reserved12; + mpo_iokit_check_set_properties_t *mpo_iokit_check_set_properties; + mpo_system_check_chud_t *mpo_system_check_chud; + mpo_vnode_check_searchfs_t *mpo_vnode_check_searchfs; + mpo_priv_check_t *mpo_priv_check; + mpo_priv_grant_t *mpo_priv_grant; + mpo_proc_check_map_anon_t *mpo_proc_check_map_anon; + mpo_vnode_check_fsgetpath_t *mpo_vnode_check_fsgetpath; + mpo_iokit_check_open_t *mpo_iokit_check_open; + mpo_vnode_notify_rename_t *mpo_vnode_notify_rename; + mpo_reserved_hook_t *mpo_reserved14; + mpo_reserved_hook_t *mpo_reserved15; + mpo_reserved_hook_t *mpo_reserved16; + mpo_reserved_hook_t *mpo_reserved17; + mpo_reserved_hook_t *mpo_reserved18; + mpo_reserved_hook_t *mpo_reserved19; + mpo_reserved_hook_t *mpo_reserved20; + mpo_reserved_hook_t 
*mpo_reserved21; + mpo_reserved_hook_t *mpo_reserved22; + mpo_reserved_hook_t *mpo_reserved23; + mpo_reserved_hook_t *mpo_reserved24; + mpo_reserved_hook_t *mpo_reserved25; + mpo_reserved_hook_t *mpo_reserved26; + mpo_reserved_hook_t *mpo_reserved27; + mpo_reserved_hook_t *mpo_reserved28; + mpo_reserved_hook_t *mpo_reserved29; }; /** diff --git a/security/mac_posix_shm.c b/security/mac_posix_shm.c index c42cfbb46..f6cc28e56 100644 --- a/security/mac_posix_shm.c +++ b/security/mac_posix_shm.c @@ -178,7 +178,7 @@ mac_posixshm_check_stat(kauth_cred_t cred, struct pshminfo *shm) int mac_posixshm_check_truncate(kauth_cred_t cred, struct pshminfo *shm, - size_t size) + off_t size) { int error = 0; diff --git a/security/mac_priv.c b/security/mac_priv.c new file mode 100644 index 000000000..7d72ce88d --- /dev/null +++ b/security/mac_priv.c @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/*- + * Copyright (c) 2006 nCircle Network Security, Inc. + * Copyright (c) 2009 Robert N. M. Watson + * All rights reserved. + * + * This software was developed by Robert N. M. Watson for the TrustedBSD + * Project under contract to nCircle Network Security, Inc. + * + * This software was developed at the University of Cambridge Computer + * Laboratory with support from a grant from Google, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR, NCIRCLE NETWORK SECURITY, + * INC., OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * MAC checks for system privileges. + */ + +#include +#include + +#include + +/* + * The MAC Framework interacts with kernel privilege checks in two ways: it + * may restrict the granting of privilege to a subject, and it may grant + * additional privileges to the subject. Policies may implement none, one, + * or both of these entry points. Restriction of privilege by any policy + * always overrides granting of privilege by any policy or other privilege + * mechanism. See kern_priv.c:priv_check_cred() for details of the + * composition. + */ + +/* + * Restrict access to a privilege for a credential. Return failure if any + * policy denies access. + */ +int +mac_priv_check(kauth_cred_t cred, int priv) +{ + int error; + + MAC_CHECK(priv_check, cred, priv); + + return (error); +} + +/* + * Grant access to a privilege for a credential. Return success if any + * policy grants access. + */ +int +mac_priv_grant(kauth_cred_t cred, int priv) +{ + int error; + + MAC_GRANT(priv_grant, cred, priv); + + return (error); +} diff --git a/security/mac_process.c b/security/mac_process.c index 6ac8b6b5f..631b468a9 100644 --- a/security/mac_process.c +++ b/security/mac_process.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007 Apple Inc. All rights reserved. + * Copyright (c) 2007-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -205,12 +205,15 @@ mac_execve_enter(user_addr_t mac_p, struct image_params *imgp) return (0); if (IS_64BIT_PROCESS(current_proc())) { - error = copyin(mac_p, &mac, sizeof(mac)); + struct user64_mac mac64; + error = copyin(mac_p, &mac64, sizeof(mac64)); + mac.m_buflen = mac64.m_buflen; + mac.m_string = mac64.m_string; } else { - struct mac mac32; + struct user32_mac mac32; error = copyin(mac_p, &mac32, sizeof(mac32)); mac.m_buflen = mac32.m_buflen; - mac.m_string = CAST_USER_ADDR_T(mac32.m_string); + mac.m_string = mac32.m_string; } if (error) return (error); @@ -241,13 +244,17 @@ mac_execve_enter(user_addr_t mac_p, struct image_params *imgp) * When the subject's label changes, it may require revocation of privilege * to mapped objects. This can't be done on-the-fly later with a unified * buffer cache. + * + * XXX: CRF_MAC_ENFORCE should be in a kauth_cred_t field, rather + * XXX: than a posix_cred_t field. */ void mac_cred_label_update(kauth_cred_t cred, struct label *newlabel) { + posix_cred_t pcred = posix_cred_get(cred); /* force label to be part of "matching" for credential */ - cred->cr_flags |= CRF_MAC_ENFORCE; + pcred->cr_flags |= CRF_MAC_ENFORCE; /* inform the policies of the update */ MAC_PERFORM(cred_label_update, cred, newlabel); @@ -348,6 +355,29 @@ mac_proc_check_get_task(struct ucred *cred, struct proc *p) return (error); } +/* + * The type of maxprot in proc_check_map_anon must be equivalent to vm_prot_t + * (defined in <mach/vm_prot.h>). mac_policy.h does not include any header + * files, so cannot use the typedef itself. 
+ */ +int +mac_proc_check_map_anon(proc_t proc, user_addr_t u_addr, + user_size_t u_size, int prot, int flags, int *maxprot) +{ + kauth_cred_t cred; + int error; + + if (!mac_vm_enforce || + !mac_proc_check_enforce(proc, MAC_VM_ENFORCE)) + return (0); + + cred = kauth_cred_proc_ref(proc); + MAC_CHECK(proc_check_map_anon, proc, cred, u_addr, u_size, prot, flags, maxprot); + kauth_cred_unref(&cred); + + return (error); +} + int mac_proc_check_mprotect(proc_t proc, user_addr_t addr, user_size_t size, int prot) @@ -544,14 +574,6 @@ mac_lctx_check_label_update(struct lctx *l, struct label *newlabel) } #endif /* LCTX */ -void -mac_thread_userret(int code, int error, struct thread *thread) -{ - - if (mac_late) - MAC_PERFORM(thread_userret, code, error, thread); -} - int mac_proc_check_suspend_resume(proc_t curp, int sr) { diff --git a/security/mac_stub.c b/security/mac_stub.c index b78b081f4..b3e455817 100644 --- a/security/mac_stub.c +++ b/security/mac_stub.c @@ -34,6 +34,18 @@ /* * XXX stubs until we fix */ +int mac_check_iokit_open(void) +{ + return 0; +} +int mac_check_iokit_set_properties(void) +{ + return 0; +} +int mac_check_iokit_hid_control(void) +{ + return 0; +} int mac_check_ipc_method(void) { return 0; @@ -262,6 +274,10 @@ int mac_check_system_acct(void) { return 0; } +int mac_check_system_chud(void) +{ + return 0; +} int mac_check_system_nfsd(void) { return 0; @@ -374,6 +390,10 @@ int mac_check_vnode_revoke(void) { return 0; } +int mac_check_vnode_searchfs(void) +{ + return 0; +} int mac_check_vnode_select(void) { return 0; diff --git a/security/mac_system.c b/security/mac_system.c index 410c71310..8089caac8 100644 --- a/security/mac_system.c +++ b/security/mac_system.c @@ -83,6 +83,19 @@ mac_system_check_acct(kauth_cred_t cred, struct vnode *vp) return (error); } +int +mac_system_check_chud(kauth_cred_t cred) +{ + int error; + + if (!mac_system_enforce) + return (0); + + MAC_CHECK(system_check_chud, cred); + + return (error); +} + int 
mac_system_check_host_priv(kauth_cred_t cred) { diff --git a/security/mac_vfs.c b/security/mac_vfs.c index 0a136aa4c..7cc5561a2 100644 --- a/security/mac_vfs.c +++ b/security/mac_vfs.c @@ -377,6 +377,21 @@ mac_vnode_notify_create(vfs_context_t ctx, struct mount *mp, return (error); } +void +mac_vnode_notify_rename(vfs_context_t ctx, struct vnode *vp, + struct vnode *dvp, struct componentname *cnp) +{ + kauth_cred_t cred; + + if (!mac_vnode_enforce || + !mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return; + + cred = vfs_context_ucred(ctx); + MAC_PERFORM(vnode_notify_rename, cred, vp, vp->v_label, + dvp, dvp->v_label, cnp); +} + /* * Extended attribute 'name' was updated via * vn_setxattr() or vn_removexattr(). Allow the @@ -425,12 +440,13 @@ mac_cred_label_update_execve(vfs_context_t ctx, kauth_cred_t new, struct vnode * { kauth_cred_t cred; int disjoint = 0; + posix_cred_t pcred = posix_cred_get(new); if (!mac_proc_enforce && !mac_vnode_enforce) return disjoint; /* mark the new cred to indicate "matching" includes the label */ - new->cr_flags |= CRF_MAC_ENFORCE; + pcred->cr_flags |= CRF_MAC_ENFORCE; cred = vfs_context_ucred(ctx); MAC_PERFORM(cred_label_update_execve, cred, new, vp, vp->v_label, @@ -642,6 +658,21 @@ mac_vnode_check_exec(vfs_context_t ctx, struct vnode *vp, return (error); } +int +mac_vnode_check_fsgetpath(vfs_context_t ctx, struct vnode *vp) +{ + kauth_cred_t cred; + int error; + + if (!mac_vnode_enforce || + !mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return (0); + + cred = vfs_context_ucred(ctx); + MAC_CHECK(vnode_check_fsgetpath, cred, vp, vp->v_label); + return (error); +} + int mac_vnode_check_signature(struct vnode *vp, unsigned char *sha1, void * signature, size_t size) @@ -899,6 +930,21 @@ mac_vnode_check_revoke(vfs_context_t ctx, struct vnode *vp) return (error); } +int +mac_vnode_check_searchfs(vfs_context_t ctx, struct vnode *vp, struct attrlist *alist) +{ + kauth_cred_t cred; + int error; + + if (!mac_vnode_enforce || 
+ !mac_context_check_enforce(ctx, MAC_VNODE_ENFORCE)) + return (0); + + cred = vfs_context_ucred(ctx); + MAC_CHECK(vnode_check_searchfs, cred, vp, vp->v_label, alist); + return (error); +} + int mac_vnode_check_select(vfs_context_t ctx, struct vnode *vp, int which) { diff --git a/tools/lockstat/Makefile b/tools/lockstat/Makefile index f67ee3ddf..a23a5c682 100644 --- a/tools/lockstat/Makefile +++ b/tools/lockstat/Makefile @@ -1,4 +1,4 @@ -CFLAGS=-g -Os -arch ppc -arch i386 +CFLAGS=-g -Os -arch x86_64 -arch i386 TARGETS = lockstat diff --git a/tools/lockstat/lockstat.c b/tools/lockstat/lockstat.c index 2ec7f324c..1f2b5af32 100644 --- a/tools/lockstat/lockstat.c +++ b/tools/lockstat/lockstat.c @@ -51,7 +51,7 @@ * Waits (Meaningful only for lock types that can block): Incremented * if a lock acquisition attempt proceeded to block. * - * Direct Waits (currently implemented only on i386): For adaptive + * Direct Waits (currently implemented only on i386/x86_64): For adaptive * locks, such as mutexes, incremented if the owner of the mutex * wasn't active on another processor at the time of the lock * attempt. This indicates that no adaptive spin occurred. 
@@ -329,7 +329,7 @@ print_all_spin(lockgroup_info_t *lockgroup) void print_mutex_hdr(void) { -#if defined(__i386__) +#if defined(__i386__) || defined(__x86_64__) printf("Mutex lock attempts Misses Waits Direct Waits Name\n"); #else printf(" mutex locks misses waits name\n"); @@ -343,7 +343,7 @@ print_mutex(int requested, lockgroup_info_t *lockgroup) if (curptr->lock_mtx_cnt != 0 && curptr->lock_mtx_util_cnt != 0) { printf("%16lld ", curptr->lock_mtx_util_cnt); -#if defined(__i386__) +#if defined(__i386__) || defined(__x86_64__) printf("%10lld %10lld %10lld ", curptr->lock_mtx_miss_cnt, curptr->lock_mtx_wait_cnt, curptr->lock_mtx_held_cnt); #else printf("%16lld %16lld ", curptr->lock_mtx_miss_cnt, curptr->lock_mtx_wait_cnt); diff --git a/tools/symbolify.py b/tools/symbolify.py new file mode 100755 index 000000000..dde29a732 --- /dev/null +++ b/tools/symbolify.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python +from subprocess import Popen, PIPE, call +import re +import sys +import os + +NM_FORMAT = "([0-9a-f]+) ([UuAaTtDdBbCcSsIi]) (.*)" + +nm_re = re.compile(NM_FORMAT) + +def parse_nm_output(str): + "returns (start, type, name)" + m = nm_re.match(str) + if m: + start = int(m.group(1), 16) + return (start, m.group(2), m.group(3)) + else: + return None + +def nm(file): + cmd = "nm %s" % file + p = Popen(cmd, shell=True, stdout=PIPE) + return p.stdout + +class SymbolLookup: + def __init__(self, file, min_width=16): + self.min_width = min_width + self.symbols = [parse_nm_output(l) for l in nm(file)] + self.symbols.sort(key=lambda x: x[0]) + + def padded(self, str): + return ("%%%ds" % self.min_width) % str + + def __call__(self, saddr): + addr = int(saddr.group(0), 16) + last = (0, ' ', '') + # stupid linear search... 
feel free to improve + for s in self.symbols: + if s[0] == addr: + return self.padded(s[2]) + elif s[0] > addr: + if last[2] == "_last_kernel_symbol": + return saddr.group(0) + return self.padded("<%s>+%x" % (last[2], addr - last[0])) + else: + last = s + if last[2] == "_last_kernel_symbol": + return saddr.group(0) + return self.padded("<%s>+%x" % (last[2], addr - last[0])) + +def symbolify(objfile, input, *args, **kargs): + replacer = SymbolLookup(objfile, *args, **kargs) + for l in input: + print re.sub("(0x)?[0-9a-f]{6,16}", replacer, l), + + +def usage(): + + print "usage: %s [filename]" % sys.argv[0] + print "\tor speficy a filename in your SYMBOLIFY_KERNEL environment variable" + + # die now + sys.exit(1) + +KERNEL_FILE = None + +if( len(sys.argv) > 2 ): + usage() + +if( len(sys.argv) == 2 ): + KERNEL_FILE = sys.argv[1] + +if( KERNEL_FILE is None ): + KERNEL_FILE = os.environ.get("SYMBOLIFY_KERNEL") + +if( KERNEL_FILE is None ): + usage() + +print "using kernel file '%s'" % KERNEL_FILE + +symbolify(KERNEL_FILE, sys.stdin, min_width=40) + diff --git a/tools/tests/MPMMTest/KQMPMMtest.c b/tools/tests/MPMMTest/KQMPMMtest.c index b16c5f847..5b659a833 100644 --- a/tools/tests/MPMMTest/KQMPMMtest.c +++ b/tools/tests/MPMMTest/KQMPMMtest.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -67,6 +68,7 @@ static boolean_t affinity = FALSE; static boolean_t timeshare = FALSE; static boolean_t threaded = FALSE; static boolean_t oneway = FALSE; +static boolean_t do_select = FALSE; int msg_type; int num_ints; int num_msgs; @@ -96,6 +98,7 @@ void usage(const char *progname) { fprintf(stderr, " -delay num\t\tmicroseconds to sleep clients between messages\n"); fprintf(stderr, " -work num\t\tmicroseconds of client work\n"); fprintf(stderr, " -pages num\t\tpages of memory touched by client work\n"); + fprintf(stderr, " -select \t\tselect prior to calling kevent().\n"); fprintf(stderr, "default values are:\n"); fprintf(stderr, " . 
no affinity\n"); fprintf(stderr, " . not timeshare\n"); @@ -195,6 +198,9 @@ void parse_args(int argc, char *argv[]) { usage(progname); client_pages = strtoul(argv[1], NULL, 0); argc -= 2; argv += 2; + } else if (0 == strcmp("-select", argv[0])) { + do_select = TRUE; + argc--; argv++; } else usage(progname); } @@ -339,10 +345,12 @@ server(void *serverarg) int kq; struct kevent64_s kev[1]; int err; + int count; struct port_args args; int idx; kern_return_t ret; int totalmsg = num_msgs * num_clients; + fd_set readfds; args.server_num = (int) (long) serverarg; setup_server_ports(&args); @@ -365,11 +373,26 @@ server(void *serverarg) perror("kevent"); exit(1); } + for (idx = 0; idx < totalmsg; idx++) { if (verbose) printf("server awaiting message %d\n", idx); retry: + if (do_select) { + FD_ZERO(&readfds); + FD_SET(kq, &readfds); + + if (verbose) + printf("Calling select() prior to kevent64().\n"); + + count = select(kq + 1, &readfds, NULL, NULL, NULL); + if (count == -1) { + perror("select"); + exit(1); + } + } + EV_SET64(&kev[0], args.pset, EVFILT_MACHPORT, EV_ENABLE, #if DIRECT_MSG_RCV MACH_RCV_MSG|MACH_RCV_LARGE, 0, 0, (mach_vm_address_t)args.req_msg, args.req_size); diff --git a/tools/tests/MPMMTest/Makefile b/tools/tests/MPMMTest/Makefile index 7762791a0..0421a718a 100644 --- a/tools/tests/MPMMTest/Makefile +++ b/tools/tests/MPMMTest/Makefile @@ -1,4 +1,4 @@ -CFLAGS=-g -O2 -arch ppc -arch i386 +CFLAGS=-g -O2 -arch i386 CFLAGS64=-g -O2 -arch x86_64 TARGETS = MPMMtest MPMMtest_64 KQMPMMtest KQMPMMtest_64 KQMPMMtestD KQMPMMtest_64D diff --git a/tools/tests/affinity/Makefile b/tools/tests/affinity/Makefile index 9450d79c6..b8563d54e 100644 --- a/tools/tests/affinity/Makefile +++ b/tools/tests/affinity/Makefile @@ -1,5 +1,5 @@ -CFLAGS =-g -arch ppc -arch i386 -CFLAGS64=-g -arch ppc64 -arch x86_64 +CFLAGS =-g -arch i386 +CFLAGS64=-g -arch x86_64 TESTS = \ sets \ diff --git a/tools/tests/execperf/Makefile b/tools/tests/execperf/Makefile new file mode 100644 index 
000000000..00d03037c --- /dev/null +++ b/tools/tests/execperf/Makefile @@ -0,0 +1,79 @@ +SDKROOT ?= / +ARCHS = x86_64 +CC = xcrun -sdk $(SDKROOT) cc +CODESIGN = xcrun -sdk $(SDKROOT) codesign +CFLAGS = -O0 -g -isysroot $(SDKROOT) $(patsubst %, -arch %,$(ARCHS)) +LDFLAGS = -Wl,-new_linker -dead_strip \ + -isysroot $(SDKROOT) $(patsubst %, -arch %,$(ARCHS)) +NOPIE_OPTION = -Wl,-no_pie + +EXECUTABLES = exit.nodyld \ + exit.nopie.dyld-but-no-Libsystem exit.pie.dyld-but-no-Libsystem \ + exit.nopie.dyld-and-Libsystem exit.pie.dyld-and-Libsystem \ + exit.nopie exit.pie \ + printexecinfo +OBJECTS = exit-asm.o exit.o printexecinfo.o + +default: $(EXECUTABLES) run + +clean: + rm -f run $(EXECUTABLES) + rm -f run.o $(OBJECTS) + +run.o: run.c + $(CC) -c -o $@ $< $(CFLAGS) + +run: run.o + $(CC) -o $@ $< $(LDFLAGS) + $(CODESIGN) -s - $@ + +# OBJECTS + +exit-asm.o: exit-asm.S + $(CC) -c -o $@ $< $(CFLAGS) + +exit.o: exit.c + $(CC) -c -o $@ $< $(CFLAGS) + +printexecinfo.o: printexecinfo.c + $(CC) -c -o $@ $< $(CFLAGS) + +# EXECUTABLES + +exit.nodyld: exit-asm.o + $(CC) -o $@ $< $(LDFLAGS) -e mystart -nostartfiles -nodefaultlibs -static + $(CODESIGN) -s - $@ + + +exit.nopie.dyld-but-no-Libsystem: exit-asm.o + $(CC) -o $@ $< $(LDFLAGS) -e mystart $(NOPIE_OPTION) -nostartfiles -nodefaultlibs + $(CODESIGN) -s - $@ + +exit.pie.dyld-but-no-Libsystem: exit-asm.o + $(CC) -o $@ $< $(LDFLAGS) -e mystart -Wl,-pie -nostartfiles -nodefaultlibs + $(CODESIGN) -s - $@ + +exit.nopie.dyld-and-Libsystem: exit-asm.o + $(CC) -o $@ $< $(LDFLAGS) -e mystart $(NOPIE_OPTION) -nostartfiles -nodefaultlibs -lSystem + $(CODESIGN) -s - $@ + +exit.pie.dyld-and-Libsystem: exit-asm.o + $(CC) -o $@ $< $(LDFLAGS) -e mystart -Wl,-pie -nostartfiles -nodefaultlibs -lSystem + $(CODESIGN) -s - $@ + +exit.nopie: exit.o + $(CC) -o $@ $< $(LDFLAGS) -e mystart $(NOPIE_OPTION) + $(CODESIGN) -s - $@ + +exit.pie: exit.o + $(CC) -o $@ $< $(LDFLAGS) -e mystart -Wl,-pie + $(CODESIGN) -s - $@ + +printexecinfo: printexecinfo.o + 
$(CC) -o $@ $< $(LDFLAGS) + $(CODESIGN) -s - $@ + +# ACTIONS + +quick-test: $(EXECUTABLES) run + ./test.sh diff --git a/tools/tests/execperf/exit-asm.S b/tools/tests/execperf/exit-asm.S new file mode 100644 index 000000000..ba63101e4 --- /dev/null +++ b/tools/tests/execperf/exit-asm.S @@ -0,0 +1,42 @@ +.text + .globl mystart +mystart: +#if defined(__x86_64__) + pushq $0 + mov %rsp, %rbp + andq $0xfffffffffffffff0, %rsp + movl $42, %edi + movl $0x2000001, %eax + movl $0, %ecx + movq %rcx, %r10 + syscall + jmp 1f +1: + hlt + nop + nop + nop + nop +#elif defined(__i386__) + pushl $0 + mov %esp, %ebp + andl $0xfffffff0, %esp + subl $12, %esp + pushl $42 + mov $0x40001, %eax + call _sysenter_trap + jmp 1f +1: + hlt + nop + nop + nop + nop +_sysenter_trap: + pop %edx + mov %esp, %ecx + sysenter + nop +#else +#error Unsupported architecture +#endif diff --git a/tools/tests/execperf/exit.c b/tools/tests/execperf/exit.c new file mode 100644 index 000000000..ded537881 --- /dev/null +++ b/tools/tests/execperf/exit.c @@ -0,0 +1,12 @@ +void mystart(void) __asm__("mystart"); + +void mystart(void) { +#if defined(__x86_64__) + asm volatile ("andq $0xfffffffffffffff0, %rsp\n"); +#elif defined(__i386__) + asm volatile ("andl $0xfffffff0, %esp\n"); +#else +#error Unsupported architecture +#endif + _Exit(42); +} diff --git a/tools/tests/execperf/printexecinfo.c b/tools/tests/execperf/printexecinfo.c new file mode 100644 index 000000000..1acf0d493 --- /dev/null +++ b/tools/tests/execperf/printexecinfo.c @@ -0,0 +1,68 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +__attribute__((constructor)) +void init(int argc, const char *argv[], const char *envp[], const char *appl[], void *vars __attribute__((unused))) { + int i; + + printf("argv = %p\n", argv); + for (i=0; argv[i]; i++) { + printf("argv[%2d] = %p %.100s%s\n", i, argv[i], argv[i], strlen(argv[i]) > 100 ? "..." 
: ""); + } + printf("envp = %p\n", envp); + for (i=0; envp[i]; i++) { + printf("envp[%2d] = %p %.100s%s\n", i, envp[i], envp[i], strlen(envp[i]) > 100 ? "..." : ""); + } + printf("appl = %p\n", appl); + for (i=0; appl[i]; i++) { + printf("appl[%2d] = %p %.100s%s\n", i, appl[i], appl[i], strlen(appl[i]) > 100 ? "..." : ""); + } +} + +void printexecinfo(void) +{ + int ret; + uint64_t stackaddr; + size_t len = sizeof(stackaddr); + + printf("executable load address = 0x%016llx\n", (uint64_t)(uintptr_t)&_mh_execute_header); + + ret = sysctlbyname("kern.usrstack64", &stackaddr, &len, NULL, 0); + if (ret == -1) + err(1, "sysctlbyname"); + + printf(" stack address = 0x%016llx\n", stackaddr); +} + +void printdyldinfo(void) +{ + task_dyld_info_data_t info; + mach_msg_type_number_t size = TASK_DYLD_INFO_COUNT; + kern_return_t kret; + struct dyld_all_image_infos *all_image_infos; + + kret = task_info(mach_task_self(), TASK_DYLD_INFO, + (void *)&info, &size); + if (kret != KERN_SUCCESS) + errx(1, "task_info: %s", mach_error_string(kret)); + + all_image_infos = (struct dyld_all_image_infos *)(uintptr_t)info.all_image_info_addr; + + printf(" dyld load address = 0x%016llx\n", (uint64_t)(uintptr_t)all_image_infos->dyldImageLoadAddress); + printf(" shared cache slide = 0x%016llx\n", (uint64_t)(uintptr_t)all_image_infos->sharedCacheSlide); + +} + +int main(int argc, char *argv[]) { + + printexecinfo(); + printdyldinfo(); + + return 0; +} diff --git a/tools/tests/execperf/run.c b/tools/tests/execperf/run.c new file mode 100644 index 000000000..d7d5f6a5b --- /dev/null +++ b/tools/tests/execperf/run.c @@ -0,0 +1,89 @@ +#include +#include +#include +#include +#include +#include +#include + +extern char **environ; + +char * const *newargv; + +void usage(void); + +void *work(void *); + +int main(int argc, char *argv[]) { + + int i, count, threadcount; + int ret; + pthread_t *threads; + + if (argc < 4) { + usage(); + } + + threadcount = atoi(argv[1]); + count = atoi(argv[2]); + + newargv = 
&argv[3]; + + threads = (pthread_t *)calloc(threadcount, sizeof(pthread_t)); + for (i=0; i < threadcount; i++) { + ret = pthread_create(&threads[i], NULL, work, (void *)(intptr_t)count); + if (ret) { + err(1, "pthread_create"); + } + } + + for (i=0; i < threadcount; i++) { + ret = pthread_join(threads[i], NULL); + if (ret) { + err(1, "pthread_join"); + } + } + + return 0; +} + +void usage(void) { + fprintf(stderr, "Usage: %s [ [ ...]]\n", + getprogname()); + exit(1); +} + +void *work(void *arg) +{ + int count = (int)(intptr_t)arg; + int i; + int ret; + pid_t pid; + + for (i=0; i < count; i++) { + ret = posix_spawn(&pid, newargv[0], NULL, NULL, newargv, environ); + if (ret != 0) { + errc(1, ret, "posix_spawn(%s)", newargv[0]); + } + + while (-1 == waitpid(pid, &ret, 0)) { + if (errno != EINTR) { + err(1, "waitpid(%d)", pid); + } + } + + if (WIFSIGNALED(ret)) { + errx(1, "process exited with signal %d", WTERMSIG(ret)); + } else if (WIFSTOPPED(ret)) { + errx(1, "process stopped with signal %d", WSTOPSIG(ret)); + } else if (WIFEXITED(ret)) { + if (WEXITSTATUS(ret) != 42) { + errx(1, "process exited with unexpected exit code %d", WEXITSTATUS(ret)); + } + } else { + errx(1, "unknown exit condition %x", ret); + } + } + + return NULL; +} diff --git a/tools/tests/execperf/test.sh b/tools/tests/execperf/test.sh new file mode 100755 index 000000000..72917a719 --- /dev/null +++ b/tools/tests/execperf/test.sh @@ -0,0 +1,30 @@ +#!/bin/sh + +EXECUTABLES="exit.nodyld \ + exit.nopie.dyld-but-no-Libsystem exit.pie.dyld-but-no-Libsystem \ + exit.nopie.dyld-and-Libsystem exit.pie.dyld-and-Libsystem \ + exit.nopie exit.pie" + +RUN=run +PRODUCT=`sw_vers -productName` +COUNT= + +case "$PRODUCT" in + "iPhone OS") + COUNT=1000 + ;; + *) + COUNT=10000 + ;; +esac + +for j in 1 2 3; do + for i in ${EXECUTABLES}; do + echo "Running $i" + /usr/bin/time ./${RUN} $j $((${COUNT}/$j)) ./$i + if [ $? -ne 0 ]; then + echo "Failed $i, exit status $?" 
+ exit 1 + fi + done +done diff --git a/tools/tests/jitter/Makefile b/tools/tests/jitter/Makefile new file mode 100644 index 000000000..ade16e7f4 --- /dev/null +++ b/tools/tests/jitter/Makefile @@ -0,0 +1,16 @@ + +ARCHS=x86_64 i386 +SDKROOT=/ +CC=xcrun -sdk "$(SDKROOT)" cc +CFLAGS=$(patsubst %, -arch %,$(ARCHS)) -g -Wall -Os -isysroot $(SDKROOT) + +all: jitter + +timer_jitter.o: timer_jitter.c + $(CC) -c -o $@ $< $(CFLAGS) + +cpu_number.o: cpu_number.s + $(CC) -c -o $@ $< $(CFLAGS) + +jitter: timer_jitter.o cpu_number.o + $(CC) -o $@ $^ $(CFLAGS) diff --git a/tools/tests/jitter/cpu_number.s b/tools/tests/jitter/cpu_number.s new file mode 100644 index 000000000..2d29bb0cd --- /dev/null +++ b/tools/tests/jitter/cpu_number.s @@ -0,0 +1,33 @@ +.text +/* + * Taken from Libc + */ +.globl _cpu_number +_cpu_number: +#if defined(__x86_64__) + push %rbp + mov %rsp,%rbp + sub $16,%rsp // space to read IDTR + + sidt (%rsp) // store limit:base on stack + movw (%rsp), %rax // get limit + and $0xfff, %rax // mask off lower 12 bits to return + + mov %rbp,%rsp + pop %rbp + ret +#elif defined(__i386__) + push %ebp + mov %esp,%ebp + sub $8, %esp // space to read IDTR + + sidt (%esp) // store limit:base on stack + movw (%esp), %ax // get limit + and $0xfff, %eax // mask off lower 12 bits to return + + mov %ebp,%esp + pop %ebp + ret +#else +#error Unsupported architecture +#endif diff --git a/tools/tests/jitter/timer_jitter.c b/tools/tests/jitter/timer_jitter.c new file mode 100644 index 000000000..f81ab3396 --- /dev/null +++ b/tools/tests/jitter/timer_jitter.c @@ -0,0 +1,480 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +typedef enum my_policy_type { MY_POLICY_REALTIME, MY_POLICY_TIMESHARE, MY_POLICY_FIXEDPRI } my_policy_type_t; + +#define DEFAULT_MAX_SLEEP_NS 2000000000ll /* Two seconds */ +#define CONSTRAINT_NANOS (20000000ll) /* 20 ms */ +#define COMPUTATION_NANOS (10000000ll) /* 10 ms */ + +struct mach_timebase_info g_mti; + +#define assert(truth, label) do { if(!(truth)) { printf("Thread %p: failure on line %d\n", pthread_self(), __LINE__); goto label; } } while (0) + +struct second_thread_args { + semaphore_t wakeup_semaphore; + semaphore_t return_semaphore; + uint64_t iterations; + my_policy_type_t pol; + double *wakeup_second_jitter_arr; + uint64_t woke_on_same_cpu; + uint64_t too_much; + volatile uint64_t last_poke_time; + volatile int cpuno; +}; + +extern int cpu_number(void); + +void * +second_thread(void *args); 
+ +void +print_usage() +{ + printf("Usage: jitter [-w] [-s ] [-n ] [-m ] \n"); +} + +my_policy_type_t +parse_thread_policy(const char *str) +{ + if (strcmp(str, "timeshare") == 0) { + return MY_POLICY_TIMESHARE; + } else if (strcmp(str, "realtime") == 0) { + return MY_POLICY_REALTIME; + } else if (strcmp(str, "fixed") == 0) { + return MY_POLICY_FIXEDPRI; + } else { + printf("Invalid thread policy %s\n", str); + exit(1); + } +} + +int +thread_setup(my_policy_type_t pol) +{ + int res; + + switch (pol) { + case MY_POLICY_TIMESHARE: + { + return 0; + } + case MY_POLICY_REALTIME: + { + thread_time_constraint_policy_data_t pol; + + /* Hard-coded realtime parameters (similar to what Digi uses) */ + pol.period = 100000; + pol.constraint = CONSTRAINT_NANOS * g_mti.denom / g_mti.numer; + pol.computation = COMPUTATION_NANOS * g_mti.denom / g_mti.numer; + pol.preemptible = 0; /* Ignored by OS */ + + res = thread_policy_set(mach_thread_self(), THREAD_TIME_CONSTRAINT_POLICY, (thread_policy_t) &pol, THREAD_TIME_CONSTRAINT_POLICY_COUNT); + assert(res == 0, fail); + break; + } + case MY_POLICY_FIXEDPRI: + { + thread_extended_policy_data_t pol; + pol.timeshare = 0; + + res = thread_policy_set(mach_thread_self(), THREAD_EXTENDED_POLICY, (thread_policy_t) &pol, THREAD_EXTENDED_POLICY_COUNT); + assert(res == 0, fail); + break; + } + default: + { + printf("invalid policy type\n"); + return 1; + } + } + + return 0; +fail: + return 1; +} + +uint64_t +get_random_sleep_length_abs_ns(uint64_t min_sleep_ns, uint64_t max_sleep_ns) +{ + uint64_t tmp; + + tmp = (uint32_t)random(); + tmp <<= 32; + tmp |= (uint32_t)random(); + + /* Now use the random number to sleep amount within the window */ + tmp %= (max_sleep_ns - min_sleep_ns); + + return min_sleep_ns + tmp; +} + +void +compute_stats(double *values, uint64_t count, double *average_magnitudep, double *maxp, double *minp, double *stddevp) +{ + uint64_t i; + double _sum = 0; + double _max = 0; + double _min = (double)INT64_MAX; + double _avg = 
0; + double _dev = 0; + + for (i = 0; i < count; i++) { + _sum += fabs(values[i]); + _max = values[i] > _max ? values[i] : _max; + _min = values[i] < _min ? values[i] : _min; + } + + _avg = _sum / (double)count; + + _dev = 0; + for (i = 0; i < count; i++) { + _dev += pow((values[i] - _avg), 2); + } + + _dev /= count; + _dev = sqrt(_dev); + + *average_magnitudep = _avg; + *maxp = _max; + *minp = _min; + *stddevp = _dev; +} + +void +print_stats_us(const char *label, double avg, double max, double min, double stddev) +{ + printf("Max %s: %.1lfus\n", label, max / 1000.0 * (((double)g_mti.numer) / ((double)g_mti.denom))); + printf("Min %s: %.1lfus\n", label, min / 1000.0 * (((double)g_mti.numer) / ((double)g_mti.denom))); + printf("Avg magnitude of %s: %.1lfus\n", label, avg / 1000.0 * (((double)g_mti.numer) / ((double)g_mti.denom))); + printf("Stddev: %.1lfus\n", stddev / 1000.0 * (((double)g_mti.numer) / ((double)g_mti.denom))); + putchar('\n'); +} + +void +print_stats_fract(const char *label, double avg, double max, double min, double stddev) +{ + printf("Max %s jitter: %.1lf%%\n", label, max * 100); + printf("Min %s jitter: %.1lf%%\n", label, min * 100); + printf("Avg %s jitter: %.1lf%%\n", label, avg * 100); + printf("Stddev: %.1lf%%\n", stddev * 100); + putchar('\n'); +} + +int +main(int argc, char **argv) +{ + uint64_t iterations, i; + double *jitter_arr, *fraction_arr; + double *wakeup_second_jitter_arr; + uint64_t target_time; + uint64_t sleep_length_abs; + uint64_t min_sleep_ns = 0; + uint64_t max_sleep_ns = DEFAULT_MAX_SLEEP_NS; + uint64_t wake_time; + unsigned random_seed; + boolean_t need_seed = TRUE; + char ch; + int res; + kern_return_t kret; + my_policy_type_t pol; + boolean_t wakeup_second_thread = FALSE; + semaphore_t wakeup_semaphore, return_semaphore; + + double avg, stddev, max, min; + double avg_fract, stddev_fract, max_fract, min_fract; + uint64_t too_much; + + struct second_thread_args secargs; + pthread_t secthread; + + 
mach_timebase_info(&g_mti); + + /* Seed random */ + opterr = 0; + while ((ch = getopt(argc, argv, "m:n:hs:w")) != -1 && ch != '?') { + switch (ch) { + case 's': + /* Specified seed for random)() */ + random_seed = (unsigned)atoi(optarg); + srandom(random_seed); + need_seed = FALSE; + break; + case 'm': + /* How long per timer? */ + max_sleep_ns = strtoull(optarg, NULL, 10); + break; + case 'n': + /* How long per timer? */ + min_sleep_ns = strtoull(optarg, NULL, 10); + break; + case 'w': + /* After each timed wait, wakeup another thread */ + wakeup_second_thread = TRUE; + break; + case 'h': + print_usage(); + exit(0); + break; + default: + fprintf(stderr, "Got unexpected result from getopt().\n"); + exit(1); + break; + } + } + + argc -= optind; + argv += optind; + + if (argc != 3) { + print_usage(); + exit(1); + } + + if (min_sleep_ns >= max_sleep_ns) { + print_usage(); + exit(1); + } + + if (need_seed) { + srandom(time(NULL)); + } + + /* What scheduling policy? */ + pol = parse_thread_policy(argv[0]); + + /* How many timers? 
*/ + iterations = strtoull(argv[1], NULL, 10); + + /* How much jitter is so extreme that we should cut a trace point */ + too_much = strtoull(argv[2], NULL, 10); + + /* Array for data */ + jitter_arr = (double*)malloc(sizeof(*jitter_arr) * iterations); + if (jitter_arr == NULL) { + printf("Couldn't allocate array to store results.\n"); + exit(1); + } + + fraction_arr = (double*)malloc(sizeof(*fraction_arr) * iterations); + if (fraction_arr == NULL) { + printf("Couldn't allocate array to store results.\n"); + exit(1); + } + + if (wakeup_second_thread) { + /* Array for data */ + wakeup_second_jitter_arr = (double*)malloc(sizeof(*jitter_arr) * iterations); + if (wakeup_second_jitter_arr == NULL) { + printf("Couldn't allocate array to store results.\n"); + exit(1); + } + + kret = semaphore_create(mach_task_self(), &wakeup_semaphore, SYNC_POLICY_FIFO, 0); + if (kret != KERN_SUCCESS) { + printf("Couldn't allocate semaphore %d\n", kret); + exit(1); + } + + kret = semaphore_create(mach_task_self(), &return_semaphore, SYNC_POLICY_FIFO, 0); + if (kret != KERN_SUCCESS) { + printf("Couldn't allocate semaphore %d\n", kret); + exit(1); + } + + + secargs.wakeup_semaphore = wakeup_semaphore; + secargs.return_semaphore = return_semaphore; + secargs.iterations = iterations; + secargs.pol = pol; + secargs.wakeup_second_jitter_arr = wakeup_second_jitter_arr; + secargs.woke_on_same_cpu = 0; + secargs.too_much = too_much; + secargs.last_poke_time = 0ULL; + secargs.cpuno = 0; + + res = pthread_create(§hread, NULL, second_thread, &secargs); + if (res) { + err(1, "pthread_create"); + } + + sleep(1); /* Time for other thread to start up */ + } + + /* Set scheduling policy */ + res = thread_setup(pol); + if (res != 0) { + printf("Couldn't set thread policy.\n"); + exit(1); + } + + /* + * Repeatedly pick a random timer length and + * try to sleep exactly that long + */ + for (i = 0; i < iterations; i++) { + sleep_length_abs = (uint64_t) (get_random_sleep_length_abs_ns(min_sleep_ns, 
max_sleep_ns) * (((double)g_mti.denom) / ((double)g_mti.numer))); + target_time = mach_absolute_time() + sleep_length_abs; + + /* Sleep */ + kret = mach_wait_until(target_time); + wake_time = mach_absolute_time(); + + jitter_arr[i] = (double)(wake_time - target_time); + fraction_arr[i] = jitter_arr[i] / ((double)sleep_length_abs); + + /* Too much: cut a tracepoint for a debugger */ + if (jitter_arr[i] >= too_much) { + syscall(SYS_kdebug_trace, 0xeeeeeeee, 0, 0, 0, 0); + } + + if (wakeup_second_thread) { + secargs.last_poke_time = mach_absolute_time(); + secargs.cpuno = cpu_number(); + OSMemoryBarrier(); + kret = semaphore_signal(wakeup_semaphore); + if (kret != KERN_SUCCESS) { + errx(1, "semaphore_signal"); + } + + kret = semaphore_wait(return_semaphore); + if (kret != KERN_SUCCESS) { + errx(1, "semaphore_wait"); + } + + } + } + + /* + * Compute statistics and output results. + */ + compute_stats(jitter_arr, iterations, &avg, &max, &min, &stddev); + compute_stats(fraction_arr, iterations, &avg_fract, &max_fract, &min_fract, &stddev_fract); + + putchar('\n'); + print_stats_us("jitter", avg, max, min, stddev); + print_stats_fract("%", avg_fract, max_fract, min_fract, stddev_fract); + + if (wakeup_second_thread) { + + res = pthread_join(secthread, NULL); + if (res) { + err(1, "pthread_join"); + } + + compute_stats(wakeup_second_jitter_arr, iterations, &avg, &max, &min, &stddev); + + putchar('\n'); + print_stats_us("second jitter", avg, max, min, stddev); + + putchar('\n'); + printf("%llu/%llu (%.1f%%) wakeups on same CPU\n", secargs.woke_on_same_cpu, iterations, + 100.0*((double)secargs.woke_on_same_cpu)/iterations); + } + + return 0; +} + +void * +second_thread(void *args) +{ + struct second_thread_args *secargs = (struct second_thread_args *)args; + int res; + uint64_t i; + kern_return_t kret; + uint64_t wake_time; + int cpuno; + + /* Set scheduling policy */ + res = thread_setup(secargs->pol); + if (res != 0) { + printf("Couldn't set thread policy.\n"); + exit(1); 
+ } + + /* + * Repeatedly pick a random timer length and + * try to sleep exactly that long + */ + for (i = 0; i < secargs->iterations; i++) { + + /* Wake up when poked by main thread */ + kret = semaphore_wait(secargs->wakeup_semaphore); + if (kret != KERN_SUCCESS) { + errx(1, "semaphore_wait %d", kret); + } + + wake_time = mach_absolute_time(); + cpuno = cpu_number(); + if (wake_time < secargs->last_poke_time) { + /* Woke in past, unsynchronized mach_absolute_time()? */ + + errx(1, "woke in past %llu (%d) < %llu (%d)", wake_time, cpuno, secargs->last_poke_time, secargs->cpuno); + } + + if (cpuno == secargs->cpuno) { + secargs->woke_on_same_cpu++; + } + + secargs->wakeup_second_jitter_arr[i] = (double)(wake_time - secargs->last_poke_time); + + /* Too much: cut a tracepoint for a debugger */ + if (secargs->wakeup_second_jitter_arr[i] >= secargs->too_much) { + syscall(SYS_kdebug_trace, 0xeeeeeeef, 0, 0, 0, 0); + } + + kret = semaphore_signal(secargs->return_semaphore); + if (kret != KERN_SUCCESS) { + errx(1, "semaphore_signal %d", kret); + } + + } + + return NULL; +} diff --git a/tools/tests/kqueue_tests/Makefile b/tools/tests/kqueue_tests/Makefile old mode 100644 new mode 100755 index 9db391fe4..b51ccd631 --- a/tools/tests/kqueue_tests/Makefile +++ b/tools/tests/kqueue_tests/Makefile @@ -1,7 +1,7 @@ -all: readwrite timer +all: file timer -readwrite: - gcc -o readwrite_tests kqueue_readwrite_tests.c -arch ppc -arch i386 +file: + gcc -o file_tests kqueue_file_tests.c -arch i386 timer: - gcc -o timer_tests kqueue_timer_tests.c -arch ppc -arch i386 -arch x86_64 + gcc -o timer_tests kqueue_timer_tests.c -arch i386 -arch x86_64 diff --git a/tools/tests/kqueue_tests/kqueue_readwrite_tests.c b/tools/tests/kqueue_tests/kqueue_file_tests.c similarity index 98% rename from tools/tests/kqueue_tests/kqueue_readwrite_tests.c rename to tools/tests/kqueue_tests/kqueue_file_tests.c index e4ad5b5e4..cef98009e 100644 --- a/tools/tests/kqueue_tests/kqueue_readwrite_tests.c +++ 
b/tools/tests/kqueue_tests/kqueue_file_tests.c @@ -62,37 +62,37 @@ typedef struct _action { */ typedef struct _test { char *t_testname; - + /* Test kevent() or poll() */ int t_is_poll_test; - + /* Actions for setting up test */ int t_n_prep_actions; action_t t_prep_actions[5]; - + /* Actions for cleaning up test */ int t_n_cleanup_actions; action_t t_cleanup_actions[5]; /* Action for thred to take while we wait */ action_t t_helpthreadact; - + /* File to look for event on */ char *t_watchfile; /* set event ident IN TEST (can't know fd beforehand)*/ int t_file_is_fifo;/* FIFOs are handled in a special manner */ - + /* Different parameters for poll() vs kevent() */ union { struct kevent tu_kev; short tu_pollevents; } t_union; - + /* Do we expect results? */ int t_want_event; - + /* Not always used--how much data should we find (EVFILT_{READ,WRITE}) */ int t_nbytes; - + /* Hacks for FILT_READ and pipes */ int t_read_to_end_first; /* Consume all data in file before waiting for event */ int t_write_some_data; /* Write some data to file before waiting for event (FIFO hack) */ @@ -112,7 +112,7 @@ void LOG(int level, FILE *f, const char *fmt, ...) { } vfprintf(f, fmt, ap); } - + va_end(ap); } @@ -120,7 +120,7 @@ void LOG(int level, FILE *f, const char *fmt, ...) { * Initialize an action struct. Whether to sleep, what action to take, * and arguments for that action. */ - void +void init_action(action_t *act, int sleep, action_id_t call, int nargs, ...) { int i; @@ -128,14 +128,14 @@ init_action(action_t *act, int sleep, action_id_t call, int nargs, ...) 
va_start(ap, nargs); act->act_dosleep = sleep; act->act_id = call; - + for (i = 0; i < nargs; i++) { act->act_args[i] = va_arg(ap, void*); } - + va_end(ap); - + } /* @@ -158,14 +158,14 @@ open_fifo(const char *path, int *readfd, int *writefd) int waitres; int res; int tmpreadfd, tmpwritefd; - + res = pthread_create(&thread, 0, open_fifo_readside, (void*)path); if (res == 0) { tmpwritefd = open(path, O_WRONLY); waitres = pthread_join(thread, (void**) &tmpreadfd); - + fcntl(tmpwritefd, F_SETFL, O_WRONLY | O_NONBLOCK); - + if ((waitres == 0) && (tmpwritefd >= 0) && (tmpreadfd >= 0)) { *readfd = tmpreadfd; *writefd = tmpwritefd; @@ -213,9 +213,9 @@ execute_action(void *actionptr) void *addr; struct timeval tv; struct stat sstat; - + LOG(1, stderr, "Beginning action of type %d\n", act->act_id); - + /* Let other thread get into kevent() sleep */ if(SLEEP == act->act_dosleep) { sleep(SLEEP_TIME); @@ -252,7 +252,7 @@ execute_action(void *actionptr) } else { res = -1; } - + close(tmpfd); } break; @@ -323,7 +323,7 @@ execute_action(void *actionptr) break; case SETXATTR: res = setxattr((char*)args[0], KEY, (void*)VAL, strlen(VAL), - 0, 0); + 0, 0); break; case UTIMES: tv.tv_sec = time(NULL); @@ -345,9 +345,9 @@ execute_action(void *actionptr) res = -1; break; } - + return (void*)res; - + } /* @@ -377,7 +377,7 @@ execute_action_list(action_t *actions, int nactions, int failout) LOG(1, stderr, "Action list work succeeded on step %d.\n", i); } } - + return res; } @@ -392,24 +392,24 @@ execute_test(test_t *test) pthread_t thr; struct kevent evlist; struct timespec ts = {WAIT_TIME, 0l}; - + memset(&evlist, 0, sizeof(evlist)); - + LOG(1, stderr, "Test %s starting.\n", test->t_testname); LOG(1, stderr, test->t_want_event ? 
"Expecting an event.\n" : "Not expecting events.\n"); - + res = execute_action_list(test->t_prep_actions, test->t_n_prep_actions, 1); - + /* If prep succeeded */ if (0 == res) { /* Create kqueue for kqueue tests*/ if (!test->t_is_poll_test) { kqfd = kqueue(); } - + if ((test->t_is_poll_test) || kqfd >= 0) { LOG(1, stderr, "Opened kqueue.\n"); - + /* Open the file we're to monitor. Fifos get special handling */ if (test->t_file_is_fifo) { filefd = -1; @@ -417,16 +417,16 @@ execute_test(test_t *test) } else { filefd = open(test->t_watchfile, O_RDONLY | O_SYMLINK); } - + if (filefd >= 0) { LOG(1, stderr, "Opened file to monitor.\n"); - + /* * Fill in the fd to monitor once you know it * If it's a fifo test, then the helper is definitely going to want the write end. */ test->t_helpthreadact.act_fd = (writefd >= 0 ? writefd : filefd); - + if (test->t_read_to_end_first) { read_to_end(filefd); } else if (test->t_write_some_data) { @@ -435,24 +435,24 @@ execute_test(test_t *test) dowr.act_fd = writefd; execute_action(&dowr); } - + /* Helper modifies the file that we're listening on (sleeps first, in general) */ res = pthread_create(&thr, NULL, execute_action, (void*) &test->t_helpthreadact); if (0 == res) { LOG(1, stderr, "Created helper thread.\n"); - + /* This is ugly business to hack on filling up a FIFO */ if (test->t_extra_sleep_hack) { sleep(5); } - + if (test->t_is_poll_test) { struct pollfd pl; pl.fd = filefd; pl.events = test->t_union.tu_pollevents; cnt = poll(&pl, 1, WAIT_TIME); LOG(1, stderr, "Finished poll() call.\n"); - + if ((cnt < 0)) { LOG(2, stderr, "error is in errno, %s\n", strerror(errno)); res = cnt; @@ -461,7 +461,7 @@ execute_test(test_t *test) test->t_union.tu_kev.ident = filefd; cnt = kevent(kqfd, &test->t_union.tu_kev, 1, &evlist, 1, &ts); LOG(1, stderr, "Finished kevent() call.\n"); - + if ((cnt < 0) || (evlist.flags & EV_ERROR)) { LOG(2, stderr, "kevent() call failed.\n"); if (cnt < 0) { @@ -472,7 +472,7 @@ execute_test(test_t *test) res = cnt; 
} } - + /* Success only if you've succeeded to this point AND joined AND other thread is happy*/ status = 0; res2 = pthread_join(thr, (void**)&status); @@ -485,7 +485,7 @@ execute_test(test_t *test) } else { LOG(2, stderr, "Couldn't start thread.\n"); } - + close(filefd); if (test->t_file_is_fifo) { close(writefd); @@ -500,10 +500,10 @@ execute_test(test_t *test) res = -1; } } - + /* Cleanup work */ execute_action_list(test->t_cleanup_actions, test->t_n_cleanup_actions, 0); - + /* Success if nothing failed and we either received or did not receive event, * as expected */ @@ -517,7 +517,7 @@ execute_test(test_t *test) } else { retval = 0; } - + } else { LOG(2, stderr, "Got unexpected event or lack thereof.\n"); retval = -1; @@ -526,7 +526,7 @@ execute_test(test_t *test) LOG(2, stderr, "Failed to execute test.\n"); retval = -1; } - + LOG(3, stdout, "Test %s done with result %d.\n", test->t_testname, retval); } @@ -539,7 +539,7 @@ init_test_common(test_t *tst, char *testname, char *watchfile, int nprep, int nc tst->t_n_prep_actions = nprep; tst->t_n_cleanup_actions = nclean; tst->t_want_event = (want > 0); - + if (ispoll) { tst->t_is_poll_test = 1; tst->t_union.tu_pollevents = (short)event; @@ -580,38 +580,38 @@ void run_note_delete_tests() { test_t test; - + init_test(&test, "1.1.2: unlink a file", FILE1, 1, 0, NOTE_DELETE, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)FILE1, NULL); execute_test(&test); - + init_test(&test, "1.1.3: rmdir a dir", DIR1, 1, 0, NOTE_DELETE, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, RMDIR, 2, (void*)DIR1, NULL); execute_test(&test); - + init_test(&test, "1.1.4: rename one file over another", FILE2, 2, 1, NOTE_DELETE, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); 
init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)FILE2, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE2); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, NULL); execute_test(&test); - + init_test(&test, "1.1.5: rename one dir over another", DIR2, 2, 1, NOTE_DELETE, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); init_action(&(test.t_prep_actions[1]), NOSLEEP, MKDIR, 2, (void*)DIR2, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)DIR1, (void*)DIR2); init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR2, NULL); execute_test(&test); - + /* Do FIFO stuff here */ init_test(&test, "1.1.6: make a fifo, unlink it", FILE1, 1, 0, NOTE_DELETE, YES_EVENT); test.t_file_is_fifo = 1; init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, UNLINK, 1, (void*)FILE1); execute_test(&test); - + init_test(&test, "1.1.7: rename a file over a fifo", FILE1, 2, 1, NOTE_DELETE, YES_EVENT); test.t_file_is_fifo = 1; init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 2, (void*)FILE1, (void*)NULL); @@ -619,41 +619,41 @@ run_note_delete_tests() init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE2, (void*)FILE1); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL); execute_test(&test); - + init_test(&test, "1.1.8: unlink a symlink to a file", FILE2, 2, 1, NOTE_DELETE, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&(test.t_prep_actions[1]), NOSLEEP, SYMLINK, 2, (void*)FILE1, (void*)FILE2); init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)FILE2, NULL); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL); execute_test(&test); - + /* ================= */ - + init_test(&test, "1.2.1: Straight-up rename 
file", FILE1, 1, 1, NOTE_DELETE, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE2); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, (void*)NULL); execute_test(&test); - + init_test(&test, "1.2.2: Straight-up rename dir", DIR1, 1, 1, NOTE_DELETE, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)DIR1, (void*)DIR2); init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR2, (void*)NULL); execute_test(&test); - + init_test(&test, "1.2.3: Null action on file", FILE1, 1, 1, NOTE_DELETE, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, NOTHING, 2, NULL, NULL); /* The null action */ init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL); execute_test(&test); - + init_test(&test, "1.2.4: Rename one file over another: watch the file that lives", FILE1, 2, 1, NOTE_DELETE, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)FILE2, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE2); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, NULL); execute_test(&test); - + init_test(&test, "1.2.5: Rename one dir over another, watch the dir that lives", DIR1, 2, 1, NOTE_DELETE, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); init_action(&(test.t_prep_actions[1]), NOSLEEP, MKDIR, 2, (void*)DIR2, (void*)NULL); @@ -666,16 +666,16 @@ run_note_write_tests() { char pathbuf[50]; char otherpathbuf[50]; - + test_t test; - + init_test(&test, "2.1.1: Straight-up write to a file", FILE1, 1, 
1, NOTE_WRITE, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, WRITE, 2, (void*)FILE1, NULL); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL); execute_test(&test); - - + + makepath(pathbuf, DIR1, FILE1); init_test(&test, "2.1.2: creat() file inside a dir", DIR1, 1, 2, NOTE_WRITE, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); @@ -683,7 +683,7 @@ run_note_write_tests() init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL); init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL); execute_test(&test); - + makepath(pathbuf, DIR1, FILE1); init_test(&test, "2.1.3: open() file inside a dir", DIR1, 1, 2, NOTE_WRITE, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); @@ -691,7 +691,7 @@ run_note_write_tests() init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL); init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL); execute_test(&test); - + makepath(pathbuf, DIR1, FILE1); init_test(&test, "2.1.3: unlink a file from a dir", DIR1, 2, 1, NOTE_WRITE, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); @@ -699,7 +699,7 @@ run_note_write_tests() init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)pathbuf, NULL); init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL); execute_test(&test); - + makepath(pathbuf, DIR1, FILE1); makepath(otherpathbuf, DIR1, FILE2); init_test(&test, "2.1.5: rename a file in a dir", DIR1, 2, 2, NOTE_WRITE, YES_EVENT); @@ -709,7 +709,7 @@ run_note_write_tests() init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)otherpathbuf, (void*)NULL); init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, 
(void*)DIR1, (void*)NULL); execute_test(&test); - + makepath(pathbuf, DIR1, FILE1); init_test(&test, "2.1.6: rename a file to outside of a dir", DIR1, 2, 2, NOTE_WRITE, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); @@ -718,7 +718,7 @@ run_note_write_tests() init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL); execute_test(&test); - + makepath(pathbuf, DIR1, FILE1); init_test(&test, "2.1.7: rename a file into a dir", DIR1, 2, 2, NOTE_WRITE, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); @@ -727,7 +727,7 @@ run_note_write_tests() init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL); init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL); execute_test(&test); - + makepath(pathbuf, DIR1, FILE1); init_test(&test, "2.1.9: unlink a fifo from a dir", DIR1, 2, 1, NOTE_WRITE, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); @@ -735,7 +735,7 @@ run_note_write_tests() init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)pathbuf, NULL); init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL); execute_test(&test); - + makepath(pathbuf, DIR1, FILE1); init_test(&test, "2.1.10: make symlink in a dir", DIR1, 1, 2, NOTE_WRITE, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); @@ -750,8 +750,8 @@ run_note_write_tests() init_action(&test.t_helpthreadact, SLEEP, WRITEFD, 0); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL); execute_test(&test); - - + + makepath(pathbuf, DIR1, FILE1); init_test(&test, "2.1.13: delete a symlink in a dir", DIR1, 2, 1, NOTE_WRITE, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, 
(void*)DIR1, (void*)NULL); @@ -759,7 +759,7 @@ run_note_write_tests() init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)pathbuf, (void*)FILE1); init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL); execute_test(&test); - + /* This actually should not generate an event, though it's in this section */ makepath(pathbuf, DIR1, FILE1); makepath(otherpathbuf, DIR1, FILE2); @@ -772,47 +772,47 @@ run_note_write_tests() init_action(&test.t_cleanup_actions[1], NOSLEEP, UNLINK, 2, (void*)otherpathbuf, (void*)NULL); init_action(&test.t_cleanup_actions[2], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL); execute_test(&test); - + LOG(1, stderr, "MMAP test should fail on HFS.\n"); init_test(&test, "2.1.15: Change a file with mmap()", FILE1, 1, 1, NOTE_WRITE, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, MMAP, 2, (void*)FILE1, (void*)1); /* 1 -> "modify it"*/ init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL); execute_test(&test); - + /*================= no-event tests ==================*/ init_test(&test, "2.2.1: just open and close existing file", FILE1, 1, 1, NOTE_WRITE, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, OPEN, 2, (void*)FILE1, NULL); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL); execute_test(&test); - + init_test(&test, "2.2.2: read from existing file", FILE1, 1, 1, NOTE_WRITE, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, READ, 2, (void*)FILE1, NULL); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL); execute_test(&test); - + init_test(&test, "2.2.3: rename existing file", FILE1, 1, 1, NOTE_WRITE, NO_EVENT); 
init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE2); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, (void*)NULL); execute_test(&test); - + init_test(&test, "2.2.4: just open and close dir", DIR1, 1, 1, NOTE_WRITE, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, OPEN, 2, (void*)DIR1, (void*)NULL); init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL); execute_test(&test); - + /* There are no tests 2.2.5 or 2.2.6 */ - + init_test(&test, "2.2.7: rename a dir", DIR1, 1, 1, NOTE_WRITE, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)DIR1, (void*)DIR2); init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR2, (void*)NULL); execute_test(&test); - + init_test(&test, "2.2.8: rename a fifo", FILE1, 1, 1, NOTE_WRITE, NO_EVENT); test.t_file_is_fifo = 1; init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 2, (void*)FILE1, (void*)NULL); @@ -825,40 +825,40 @@ run_note_write_tests() init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, UNLINK,1, (void*)FILE1); execute_test(&test); - + init_test(&test, "2.2.10: chmod a file", FILE1, 1, 1, NOTE_WRITE, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, CHMOD, 2, (void*)FILE1, (void*)0700); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL); execute_test(&test); - + struct passwd *pwd = getpwnam("local"); int uid = pwd->pw_uid; int gid = pwd->pw_gid; - + init_test(&test, "2.2.11: chown a file", FILE1, 2, 1, NOTE_WRITE, NO_EVENT); 
init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_prep_actions[1], NOSLEEP, CHOWN, 3, (void*)FILE1, (void*)uid, (void*)gid); init_action(&test.t_helpthreadact, SLEEP, CHOWN, 3, (void*)FILE1, (void*)getuid(), (void*)getgid()); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL); execute_test(&test); - - + + init_test(&test, "2.2.12: chmod a dir", DIR1, 1, 1, NOTE_WRITE, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, CHMOD, 2, (void*)DIR1, (void*)0700); init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL); execute_test(&test); - + init_test(&test, "2.2.13: chown a dir", DIR1, 2, 1, NOTE_WRITE, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); init_action(&test.t_prep_actions[1], NOSLEEP, CHOWN, 3, (void*)DIR1, (void*)uid, (void*)gid); init_action(&test.t_helpthreadact, SLEEP, CHOWN, 3, (void*)DIR1, (void*)getuid(), (void*)getgid()); init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL); execute_test(&test); - - - + + + LOG(1, stderr, "MMAP will never give a notification on HFS.\n"); init_test(&test, "2.1.14: mmap() a file but do not change it", FILE1, 1, 1, NOTE_WRITE, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); @@ -872,19 +872,19 @@ run_note_extend_tests() { test_t test; char pathbuf[50]; - + LOG(1, stderr, "THESE TESTS WILL FAIL ON HFS!\n"); - + init_test(&test, "3.1.1: write beyond the end of a file", FILE1, 1, 1, NOTE_EXTEND, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, WRITE, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL); execute_test(&test); - + /* * We won't 
concern ourselves with lengthening directories: commenting these out * - + makepath(pathbuf, DIR1, FILE1); init_test(&test, "3.1.2: add a file to a directory with creat()", DIR1, 1, 2, NOTE_EXTEND, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); @@ -892,7 +892,7 @@ run_note_extend_tests() init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL); init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL); execute_test(&test); - + makepath(pathbuf, DIR1, FILE1); init_test(&test, "3.1.3: add a file to a directory with open()", DIR1, 1, 2, NOTE_EXTEND, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); @@ -900,7 +900,7 @@ run_note_extend_tests() init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL); init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL); execute_test(&test); - + makepath(pathbuf, DIR1, FILE1); init_test(&test, "3.1.4: add a file to a directory with rename()", DIR1, 2, 2, NOTE_EXTEND, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); @@ -910,37 +910,37 @@ run_note_extend_tests() init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL); execute_test(&test); */ - + /* 3.1.5: a placeholder for a potential kernel test */ /* - makepath(pathbuf, DIR1, DIR2); - init_test(&test, "3.1.6: add a file to a directory with mkdir()", DIR1, 1, 2, NOTE_EXTEND, YES_EVENT); - init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); - init_action(&test.t_helpthreadact, SLEEP, MKDIR, 2, (void*)pathbuf, (void*)NULL); - init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)pathbuf, (void*)NULL); - init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL); - execute_test(&test); - */ + makepath(pathbuf, DIR1, DIR2); + 
init_test(&test, "3.1.6: add a file to a directory with mkdir()", DIR1, 1, 2, NOTE_EXTEND, YES_EVENT); + init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); + init_action(&test.t_helpthreadact, SLEEP, MKDIR, 2, (void*)pathbuf, (void*)NULL); + init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)pathbuf, (void*)NULL); + init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL); + execute_test(&test); + */ init_test(&test, "3.1.7: lengthen a file with truncate()", FILE1, 1, 1, NOTE_EXTEND, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, LENGTHEN, 2, FILE1, (void*)NULL); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL); execute_test(&test); - - + + /** ========== NO EVENT SECTION ============== **/ init_test(&test, "3.2.1: setxattr() a file", FILE1, 1, 1, NOTE_EXTEND, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, SETXATTR, 2, FILE1, (void*)NULL); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL); execute_test(&test); - + init_test(&test, "3.2.2: chmod a file", FILE1, 1, 1, NOTE_EXTEND, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, CHMOD, 2, (void*)FILE1, (void*)0700); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL); execute_test(&test); - + struct passwd *pwd = getpwnam("local"); if (!pwd) { LOG(2, stderr, "Couldn't getpwnam for local.\n"); @@ -948,28 +948,28 @@ run_note_extend_tests() } int uid = pwd->pw_uid; int gid = pwd->pw_gid; - + init_test(&test, "3.2.3: chown a file", FILE1, 2, 1, NOTE_EXTEND, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); 
init_action(&test.t_prep_actions[1], NOSLEEP, CHOWN, 3, (void*)FILE1, (void*)uid, (void*)gid); init_action(&test.t_helpthreadact, SLEEP, CHOWN, 3, (void*)FILE1, (void*)getuid(), (void*)getgid()); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL); execute_test(&test); - - + + init_test(&test, "3.2.4: chmod a dir", DIR1, 1, 1, NOTE_EXTEND, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, CHMOD, 2, (void*)DIR1, (void*)0700); init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL); execute_test(&test); - + init_test(&test, "3.2.5: chown a dir", DIR1, 2, 1, NOTE_EXTEND, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); init_action(&test.t_prep_actions[1], NOSLEEP, CHOWN, 3, (void*)DIR1, (void*)uid, (void*)gid); init_action(&test.t_helpthreadact, SLEEP, CHOWN, 3, (void*)DIR1, (void*)getuid(), (void*)getgid()); init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL); execute_test(&test); - + init_test(&test, "3.2.6: TRUNC a file with truncate()", FILE1, 1, 1, NOTE_EXTEND, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, TRUNC, 2, FILE1, (void*)NULL); @@ -982,50 +982,50 @@ run_note_attrib_tests() { test_t test; char pathbuf[50]; - + init_test(&test, "4.1.1: chmod a file", FILE1, 1, 1, NOTE_ATTRIB, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, CHMOD, 2, FILE1, (void*)0700); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL); execute_test(&test); - + struct passwd *pwd = getpwnam("local"); int uid = pwd->pw_uid; int gid = pwd->pw_gid; - + init_test(&test, "4.1.2: chown a file", FILE1, 2, 1, NOTE_ATTRIB, YES_EVENT); 
init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&(test.t_prep_actions[1]), NOSLEEP, CHOWN, 3, (void*)FILE1, (void*)uid, (void*)gid); init_action(&test.t_helpthreadact, SLEEP, CHOWN, 3, FILE1, (void*)getuid(), (void*)gid); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL); execute_test(&test); - + init_test(&test, "4.1.3: chmod a dir", DIR1, 1, 1, NOTE_ATTRIB, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); init_action(&(test.t_helpthreadact), SLEEP, CHMOD, 2, (void*)DIR1, (void*)0700); init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL); execute_test(&test); - + init_test(&test, "4.1.4: chown a dir", DIR1, 2, 1, NOTE_ATTRIB, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); init_action(&(test.t_prep_actions[1]), NOSLEEP, CHOWN, 3, (void*)DIR1, (void*) uid, (void*)gid); init_action(&test.t_helpthreadact, SLEEP, CHOWN, 3, DIR1, (void*)getuid(), (void*)getgid()); init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL); execute_test(&test); - + init_test(&test, "4.1.5: setxattr on a file", FILE1, 1, 1, NOTE_ATTRIB, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, SETXATTR, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL); execute_test(&test); - + init_test(&test, "4.1.6: setxattr on a dir", DIR1, 1, 1, NOTE_ATTRIB, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, SETXATTR, 2, (void*)DIR1, (void*)NULL); init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL); execute_test(&test); - - + + init_test(&test, "4.1.7: exchangedata", FILE1, 2, 2, NOTE_ATTRIB, 
YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)FILE2, (void*)NULL); @@ -1033,52 +1033,52 @@ run_note_attrib_tests() init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_cleanup_actions[1], NOSLEEP, UNLINK, 2, (void*)FILE2, (void*)NULL); execute_test(&test); - - + + init_test(&test, "4.1.8: utimes on a file", FILE1, 1, 1, NOTE_ATTRIB, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, UTIMES, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL); execute_test(&test); - + init_test(&test, "4.1.9: utimes on a dir", DIR1, 1, 1, NOTE_ATTRIB, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, UTIMES, 2, (void*)DIR1, (void*)NULL); init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL); execute_test(&test); - - + + /* ====== NO EVENT TESTS ========== */ - + init_test(&test, "4.2.1: rename a file", FILE1, 1, 1, NOTE_ATTRIB, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE2); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, NULL); execute_test(&test); - + init_test(&test, "4.2.2: open (do not change) a file", FILE1, 1, 1, NOTE_ATTRIB, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, OPEN, 2, (void*)FILE1, NULL); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL); execute_test(&test); - + init_test(&test, "4.2.3: stat a file", FILE1, 1, 1, NOTE_ATTRIB, NO_EVENT); 
init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, STAT, 2, (void*)FILE1, NULL); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL); execute_test(&test); - + init_test(&test, "4.2.4: unlink a file", FILE1, 1, 0, NOTE_ATTRIB, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)FILE1, NULL); execute_test(&test); - + init_test(&test, "4.2.5: write to a file", FILE1, 1, 1, NOTE_ATTRIB, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, WRITE, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL); execute_test(&test); - + LOG(1, stderr, "EXPECT SPURIOUS NOTE_ATTRIB EVENTS FROM DIRECTORY OPERATIONS on HFS.\n"); init_test(&test, "4.2.6: add a file to a directory with creat()", DIR1, 1, 2, NOTE_ATTRIB, NO_EVENT); makepath(pathbuf, DIR1, FILE1); @@ -1087,7 +1087,7 @@ run_note_attrib_tests() init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL); init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL); execute_test(&test); - + init_test(&test, "4.2.7: mkdir in a dir", DIR1, 1, 2, NOTE_ATTRIB, NO_EVENT); makepath(pathbuf, DIR1, DIR2); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); @@ -1095,7 +1095,7 @@ run_note_attrib_tests() init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)pathbuf, (void*)NULL); init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL); execute_test(&test); - + init_test(&test, "4.2.8: add a symlink to a directory", DIR1, 1, 2, NOTE_ATTRIB, NO_EVENT); makepath(pathbuf, DIR1, FILE1); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, 
(void*)NULL); @@ -1103,7 +1103,7 @@ run_note_attrib_tests() init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL); init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL); execute_test(&test); - + init_test(&test, "4.2.9: rename into a dir()", DIR1, 2, 2, NOTE_ATTRIB, NO_EVENT); makepath(pathbuf, DIR1, FILE1); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); @@ -1112,7 +1112,7 @@ run_note_attrib_tests() init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL); init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL); execute_test(&test); - + init_test(&test, "4.2.10: unlink() file from dir", DIR1, 2, 1, NOTE_ATTRIB, NO_EVENT); makepath(pathbuf, DIR1, FILE1); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); @@ -1120,7 +1120,7 @@ run_note_attrib_tests() init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL); init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL); execute_test(&test); - + init_test(&test, "4.2.11: mkfifo in a directory", DIR1, 1, 2, NOTE_ATTRIB, NO_EVENT); makepath(pathbuf, DIR1, FILE1); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); @@ -1128,8 +1128,8 @@ run_note_attrib_tests() init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL); init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL); execute_test(&test); - - + + } @@ -1139,28 +1139,28 @@ run_note_link_tests() test_t test; char pathbuf[50]; char otherpathbuf[50]; - + LOG(1, stderr, "HFS DOES NOT HANDLE UNLINK CORRECTLY...\n"); init_test(&test, "5.1.1: unlink() a file", FILE1, 1, 0, NOTE_LINK, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, 
(void*)FILE1, (void*)NULL); execute_test(&test); - - + + init_test(&test, "5.1.1.5: link A to B, watch A, remove B", FILE1, 2, 1, NOTE_LINK, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&(test.t_prep_actions[1]), NOSLEEP, HARDLINK, 2, (void*)FILE1, (void*)FILE2); init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)FILE2, (void*)NULL); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL); execute_test(&test); - + init_test(&test, "5.1.2: link() to a file", FILE1, 1, 2, NOTE_LINK, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, HARDLINK, 2, (void*)FILE1, (void*)FILE2); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL); init_action(&test.t_cleanup_actions[1], NOSLEEP, UNLINK, 2, (void*)FILE2, NULL); execute_test(&test); - + makepath(pathbuf, DIR1, DIR2); init_test(&test, "5.1.3: make one dir in another", DIR1, 1, 2, NOTE_LINK, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); @@ -1168,7 +1168,7 @@ run_note_link_tests() init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)pathbuf, NULL); init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL); execute_test(&test); - + makepath(pathbuf, DIR1, DIR2); init_test(&test, "5.1.4: rmdir a dir from within another", DIR1, 2, 1, NOTE_LINK, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); @@ -1176,7 +1176,7 @@ run_note_link_tests() init_action(&test.t_helpthreadact, SLEEP, RMDIR, 2, (void*)pathbuf, (void*)NULL); init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL); execute_test(&test); - + makepath(pathbuf, DIR1, DIR2); makepath(otherpathbuf, DIR1, DIR1); init_test(&test, "5.1.5: rename dir A over dir B inside dir C", DIR1, 3, 2, NOTE_LINK, YES_EVENT); @@ 
-1187,7 +1187,7 @@ run_note_link_tests() init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)otherpathbuf, NULL); init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL); execute_test(&test); - + LOG(1, stderr, "HFS bypasses hfs_makenode to create in target, so misses knote.\n"); makepath(pathbuf, DIR1, DIR2); init_test(&test, "5.1.6: rename one dir into another", DIR1, 2, 2, NOTE_LINK, YES_EVENT); @@ -1197,7 +1197,7 @@ run_note_link_tests() init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)pathbuf, NULL); init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL); execute_test(&test); - + LOG(1, stderr, "HFS bypasses hfs_removedir to remove from source, so misses knote.\n"); makepath(pathbuf, DIR1, DIR2); init_test(&test, "5.1.7: rename one dir out of another", DIR1, 2, 2, NOTE_LINK, YES_EVENT); @@ -1207,12 +1207,12 @@ run_note_link_tests() init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR2, NULL); init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL); execute_test(&test); - + init_test(&test, "5.1.8: rmdir a dir", DIR1, 1, 0, NOTE_LINK, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL); execute_test(&test); - + /* ============= NO EVENT SECTION ============== */ makepath(pathbuf, DIR1, FILE1); init_test(&test, "5.2.1: make a file in a dir", DIR1, 1, 2, NOTE_LINK, NO_EVENT); @@ -1221,7 +1221,7 @@ run_note_link_tests() init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, NULL); init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL); execute_test(&test); - + makepath(pathbuf, DIR1, FILE1); init_test(&test, "5.2.2: unlink a file in a dir", DIR1, 2, 1, NOTE_LINK, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); @@ -1229,7 +1229,7 @@ 
run_note_link_tests() init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL); init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL); execute_test(&test); - + makepath(pathbuf, DIR1, FILE1); makepath(otherpathbuf, DIR1, FILE2); init_test(&test, "5.2.3: rename a file within a dir", DIR1, 2, 2, NOTE_LINK, NO_EVENT); @@ -1239,7 +1239,7 @@ run_note_link_tests() init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)otherpathbuf, NULL); init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL); execute_test(&test); - + makepath(pathbuf, DIR1, FILE1); init_test(&test, "5.2.4: rename a file into a dir", DIR1, 2, 2, NOTE_LINK, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); @@ -1248,7 +1248,7 @@ run_note_link_tests() init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, NULL); init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL); execute_test(&test); - + makepath(pathbuf, DIR1, FILE1); init_test(&test, "5.2.5: make a symlink in a dir", DIR1, 1, 2, NOTE_LINK, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); @@ -1256,14 +1256,14 @@ run_note_link_tests() init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, NULL); init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL); execute_test(&test); - + init_test(&test, "5.2.6: make a symlink to a dir", DIR1, 1, 2, NOTE_LINK, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, SYMLINK, 2, (void*)DIR1, (void*)FILE1); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL); init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL); execute_test(&test); - + init_test(&test, "5.2.7: make a symlink to a file", FILE1, 1, 2, NOTE_LINK, NO_EVENT); 
init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, SYMLINK, 2, (void*)FILE1, (void*)FILE2); @@ -1276,74 +1276,74 @@ void run_note_rename_tests() { test_t test; - + init_test(&test, "6.1.1: rename a file", FILE1, 1, 1, NOTE_RENAME, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE2); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, NULL); execute_test(&test); - + init_test(&test, "6.1.2: rename a dir", DIR1, 1, 1, NOTE_RENAME, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)DIR1, (void*)DIR2); init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR2, NULL); execute_test(&test); - + init_test(&test, "6.1.2: rename one file over another", FILE1, 2, 1, NOTE_RENAME, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)FILE2, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE2); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, NULL); execute_test(&test); - + init_test(&test, "6.1.3: rename one dir over another", DIR1, 2, 1, NOTE_RENAME, YES_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); init_action(&(test.t_prep_actions[1]), NOSLEEP, MKDIR, 2, (void*)DIR2, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)DIR1, (void*)DIR2); init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR2, NULL); execute_test(&test); - + /* ========= NO EVENT SECTION =========== */ - + init_test(&test, "6.2.1: unlink a file", FILE1, 1, 0, NOTE_RENAME, NO_EVENT); 
init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)FILE1, NULL); execute_test(&test); - + init_test(&test, "6.2.2: rmdir a dir", DIR1, 1, 0, NOTE_RENAME, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, RMDIR, 2, (void*)DIR1, NULL); execute_test(&test); - + init_test(&test, "6.2.3: link() to a file", FILE1, 1, 2, NOTE_RENAME, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, HARDLINK, 2, (void*)FILE1, (void*)FILE2); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL); init_action(&test.t_cleanup_actions[1], NOSLEEP, UNLINK, 2, (void*)FILE2, NULL); execute_test(&test); - + init_test(&test, "6.2.4: rename one file over another: watch deceased", - FILE2, 2, 1, NOTE_RENAME, NO_EVENT); + FILE2, 2, 1, NOTE_RENAME, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)FILE2, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE2); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, NULL); execute_test(&test); - + init_test(&test, "6.2.5: rename one dir over another: watch deceased", - DIR2, 2, 1, NOTE_RENAME, NO_EVENT); + DIR2, 2, 1, NOTE_RENAME, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); init_action(&(test.t_prep_actions[1]), NOSLEEP, MKDIR, 2, (void*)DIR2, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)DIR1, (void*)DIR2); init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR2, NULL); execute_test(&test); - + init_test(&test, "6.2.6: rename a file to itself", FILE1, 1, 1, NOTE_RENAME, NO_EVENT); 
init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE1); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL); execute_test(&test); - + init_test(&test, "6.2.7: rename a dir to itself", DIR1, 1, 1, NOTE_RENAME, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)DIR1, (void*)DIR1); @@ -1360,7 +1360,7 @@ run_note_revoke_tests() init_action(&test.t_helpthreadact, SLEEP, REVOKE, 1, (void*)FILE1); init_action(&(test.t_cleanup_actions[0]), NOSLEEP, UNLINK, 1, (void*)FILE1); execute_test(&test); - + init_test(&test, "7.2.1: delete file", FILE1, 1, 0, NOTE_REVOKE, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1); init_action(&test.t_helpthreadact, SLEEP, UNLINK, 1, (void*)FILE1); @@ -1378,7 +1378,7 @@ run_evfilt_read_tests() init_action(&test.t_helpthreadact, SLEEP, NOTHING, 0); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL); execute_test(&test); - + init_test(&test, "8.1.2: block, then write to file", FILE1, 2, 1, EVFILT_READ, strlen(TEST_STRING)); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1); init_action(&(test.t_prep_actions[1]), NOSLEEP, TRUNC, 1, (void*)FILE1); @@ -1392,7 +1392,7 @@ run_evfilt_read_tests() init_action(&test.t_helpthreadact, SLEEP, LENGTHEN, 1, (void*)FILE1); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL); execute_test(&test); - + init_test(&test, "8.1.4: block, then seek to beginning", FILE1, 2, 1, EVFILT_READ, strlen(TEST_STRING)); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1); init_action(&(test.t_prep_actions[1]), NOSLEEP, WRITE, 1, (void*)FILE1); @@ -1401,14 +1401,14 @@ run_evfilt_read_tests() init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, 
(void*)FILE1, NULL); execute_test(&test); - + init_test(&test, "8.1.5: block, then write to fifo", FILE1, 1, 1, EVFILT_READ, strlen(TEST_STRING)); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 1, (void*)FILE1); test.t_file_is_fifo = 1; init_action(&test.t_helpthreadact, SLEEP, WRITE, 1, (void*)FILE1); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL); execute_test(&test); - + /* No result section... */ init_test(&test, "8.2.1: just rename", FILE1, 2, 1, EVFILT_READ, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1); @@ -1416,13 +1416,13 @@ run_evfilt_read_tests() init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE2); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, NULL); execute_test(&test); - + init_test(&test, "8.2.2: delete file", FILE1, 2, 0, EVFILT_READ, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1); init_action(&(test.t_prep_actions[1]), NOSLEEP, TRUNC, 1, (void*)FILE1); init_action(&test.t_helpthreadact, SLEEP, UNLINK, 1, (void*)FILE1); execute_test(&test); - + init_test(&test, "8.2.3: write to beginning", FILE1, 2, 1, EVFILT_READ, NO_EVENT); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1); init_action(&(test.t_prep_actions[1]), NOSLEEP, WRITE, 1, (void*)FILE1); @@ -1438,14 +1438,14 @@ run_evfilt_read_tests() init_action(&test.t_helpthreadact, SLEEP, LSEEK, 1, (void*)strlen(TEST_STRING)); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL); execute_test(&test); - + init_test(&test, "8.2.5: trying to read from empty fifo", FILE1, 1, 1, EVFILT_READ, 0); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 1, (void*)FILE1); test.t_file_is_fifo = 1; init_action(&test.t_helpthreadact, SLEEP, NOTHING, 1, (void*)0); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL); execute_test(&test); - + } @@ -1474,7 +1474,7 @@ 
write_to_fd(void *arg) void run_evfilt_write_tests() { - + test_t test; init_test(&test, "9.1.1: how much space in empty fifo?", FILE1, 1, 1, EVFILT_WRITE, FIFO_SPACE); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 1, (void*)FILE1, (void*)NULL); @@ -1490,7 +1490,7 @@ run_evfilt_write_tests() init_action(&(test.t_helpthreadact), NOSLEEP, NOTHING, 0); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL); execute_test(&test); - + init_test(&test, "9.2.1: how much space in a full fifo?", FILE1, 1, 1, EVFILT_WRITE, 0); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 1, (void*)FILE1, (void*)NULL); test.t_file_is_fifo = 1; @@ -1524,23 +1524,23 @@ run_poll_tests() init_action(&test.t_helpthreadact, SLEEP, NOTHING, 0); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL); execute_test(&test); - + init_poll_test(&test, "10.1.4: does poll say I can read a nonempty regular file?", FILE1, 2, 1, POLLRDNORM, 1); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1, (void*)NULL); init_action(&(test.t_prep_actions[1]), NOSLEEP, LENGTHEN, 1, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, NOTHING, 0); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL); execute_test(&test); - + init_poll_test(&test, "10.1.5: does poll say I can read an empty file?", FILE1, 1, 1, POLLRDNORM, 1); init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1, (void*)NULL); init_action(&test.t_helpthreadact, SLEEP, NOTHING, 0); init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL); execute_test(&test); - - - - + + + + init_poll_test(&test, "10.2.2: does poll say I can read an empty FIFO?", FILE1, 1, 1, POLLRDNORM, 0); init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 1, (void*)FILE1, (void*)NULL); test.t_file_is_fifo = 1; @@ -1557,7 +1557,7 @@ run_poll_tests() execute_test(&test); } - void +void run_all_tests() { 
run_note_delete_tests(); @@ -1574,14 +1574,14 @@ run_all_tests() run_poll_tests(); } - int +int main(int argc, char **argv) { char *which = NULL; if (argc > 1) { which = argv[1]; } - + if ((!which) || (strcmp(which, "all") == 0)) run_all_tests(); else if (strcmp(which, "delete") == 0) diff --git a/tools/tests/libMicro/AppleReadMe b/tools/tests/libMicro/AppleReadMe index 156b3e4b6..de49c7daf 100755 --- a/tools/tests/libMicro/AppleReadMe +++ b/tools/tests/libMicro/AppleReadMe @@ -5,7 +5,7 @@ Mac OS X specific notes # Disable Open directory and LDAP using Directory Utility app # Turn off airport # Turn off spotlight. In terminal, execute the following: - sudo service com.apple.metadata.mds stop + launchctl unload /System/Library/LaunchDaemons/com.apple.metadata.mds.plist # Turn off Time Machine in System Preferences # Wait at least 2 minutes after boot to desktop for boot cache to settle down @@ -13,10 +13,32 @@ Mac OS X specific notes make ./bench >output.txt -gives you a text file named output.txt with the results of one run. +runs the libMicro test suite excluding the lmbench tests and gives you a text file named output.txt with the results of one run. + + ./multiview output1.txt output2.txt >compare.html gives you a html file comparing two runs. +*** To run libMicro testsuite with stepper disabled *** + +To get a more consistent result of libMicro benchmark run, we need to disable the +stepper to prevent it from causing wide variations in results. See rdar://6243819 +for details. + +So to run libMicro test suite with stepper disabled, use 'coreos_bench' script +instead of 'bench' script. + +For example: +./coreos_bench > output.txt +runs the libMicro test suite excluding the lmbench tests and gives you a text file named output.txt with the results of one run, with stepper disabled. + +Note: +1) We need '/usr/local/bin/pstates' to disable the stepper. Install AppleInternal package +which provides '/usr/local/bin/pstates'. 
+ +2) 'coreos_bench' script is used exactly like the 'bench' script. All the usage examples for +'bench' script in this readme file also hold true for 'coreos_bench' script. + *** Makefile *** The Makefile invokes Makefile.Darwin which invokes Makefile.com.Darwin. @@ -25,32 +47,41 @@ build correctly. The binaries are placed in a directory called bin-ARCH where ARCH is the default or specified when building via the ARCH flag. +Note: +1) The binaries of Apple-added tests are placed in a directory called + apple/bin-ARCH + +2) All the binaries under bin-ARCH and apple/bin-ARCH are code signed + during build. + options for invoking Makefile are: ARCH defaults to i386 - if you just want to build for ppc, you can specify - make ARCH=ppc - this will put the results in bin-ppc to build fat/multi architecture, specify make ARCH=fat - the makefile will automatically build with ARCH_FLAG="-arch ppc -arch i386 -arch x86_64" and put the results in bin-fat + the makefile will automatically build with ARCH_FLAG="-arch i386 -arch x86_64" and put the results in bin-fat + + to build for ARM architecture, + first set an environment variable 'SDKROOT' to point to iPhone SDK + make ARCH=ARM_ARCH where ARM_ARCH can be armv6 or armv7 + this will put the results in bin-ARM_ARCH to build with only two of the architectures see below ARCH_FLAG defaults to -arch $(ARCH) to build fat/multi architecture, specify - make ARCH_FLAG="-arch ppc -arch i386" ARCH=fat + make ARCH_FLAG="-arch i386" ARCH=fat this will put the results in bin-fat OPT_FLAG defaults to -g - to build optimized, specify make OPT_FLAG=-s + to build optimized, specify make OPT_FLAG=-Os SEMOP_FLAG defaults to -DUSE_SEMOP to eliminate SEMOP usage, specify make SEMOP_FLAG= this is needed on some lower-end systems (e.g. M63) These can be combined, e.g. - make ARCH=ppc SEMOP_FLAG= + make ARCH=i386 SEMOP_FLAG= *** Before running benchmarks *** @@ -70,6 +101,17 @@ pass it a parameter to run a single benchmark, e.g.
bench lmbench_bw_unix +By default the script will run only the libMicro testsuite excluding the lmbench tests. +To run the libmicro testsuite with the lmbench tests included, just pass the -l parameter. e.g., + + bench -l +To run only the lmbench testsuite + + bench lmbench + +To display the usage, just do + bench -h + Watch for: # WARNINGS # Quantization error likely;increase batch size (-B option) 4X to avoid. @@ -110,6 +152,13 @@ argument passing, the flow of control of a benchmark, etc. for the trivial case. The tests starting with "lmbench_" were ported from the lmbench suite, so they might be good examples as well. +*** A note regarding future changes in bench.sh script *** +coreos_bench.sh script is almost identical to bench.sh script, except that it +has additional code to disable the stepper during libmicro benchmark run. + +In future, if bench.sh script is modified, make sure the changes are reflected +in coreos_bench.sh script also. + *** Things to do *** * port the rest of the lmbench benchmarks into this framework @@ -129,3 +178,43 @@ across many machines with historical repository of runs Due to rdar://4654956 and its original, rdar://2588252 you cannot run these tests on Leopard without removing the cascade_lockf test. There may be other tests which panic a Leopard system. + +*** benchDS notes *** + +For rdar://problem/7468995, added the ability to benchmark the key APIs +for server daemons. In particular, a test binary is added for each of: + + ODQueryCreateWithNode() (standard User, Groups, and Hosts records) + getaddrinfo() (hosts and ports) + mbr_check_service_membership() + mbr_check_membership() + getpwnam() + getpwuid() + getgrgid() + getpwent() + getgrent() + getgrnam() + +The script benchDS is provided to run a standard set of tests presuming +that the tests are run by root on a system configured with an OD binding. +The OD server (local or remote) must have a set of accounts created with +the od_account_create shell script.
This script must also be run as root, +and passed a single argument of the number of users to create. It creates +od_test_{1..N}, and all belong to a ds_test_group1(gid 1211). In addition, +ds_test_group2(gid 1212) is created which has no users as members. User ids are +set sequentially from 5000. In order to administer the OD server, it assumes +user 'diradmin' and password 'admin' are the OD admin. + +Also, these tests consult the APIs listed, which can be run against the local +account info, or even Active Directory. + +Thus, the quick recipe is: + Install X Server + Enable OD, and create directory admin user 'diradmin' with password 'admin' + As root run: od_account_create 1000 + Now run the test, as root: ./benchDS 1000 > output-file + + +In addition, od_account_delete 1000 will delete the 1000 users created with od_account_create. + + diff --git a/tools/tests/libMicro/Makefile b/tools/tests/libMicro/Makefile index e81cc6c1e..877beb36d 100644 --- a/tools/tests/libMicro/Makefile +++ b/tools/tests/libMicro/Makefile @@ -35,7 +35,7 @@ ARCH = i386 BINS= $(ALL:%=bin-$(ARCH)/%) bin-$(ARCH)/tattle -TARBALL_CONTENTS = \ +# TARBALL_CONTENTS = \ Makefile.benchmarks \ Makefile.SunOS \ Makefile.Linux \ @@ -61,6 +61,8 @@ TARBALL_CONTENTS = \ benchmark_finiworker.c \ bench \ bench.sh \ + coreos_bench \ + coreos_bench.sh \ mk_tarball \ multiview \ multiview.sh \ @@ -72,16 +74,27 @@ TARBALL_CONTENTS = \ default $(ALL) run cstyle lint tattle: $(BINS) @cp bench.sh bench + @cp coreos_bench.sh coreos_bench @cp multiview.sh multiview @cp wrapper.sh wrapper @cp create_stuff.sh create_stuff - @chmod +x bench create_stuff multiview wrapper + @cp benchDS.sh benchDS + @cp od_account_create.sh od_account_create + @cp od_account_delete.sh od_account_delete + @chmod +x bench coreos_bench create_stuff multiview wrapper benchDS od_account_create od_account_delete @mkdir -p bin-$(ARCH); cd bin-$(ARCH); MACH=$(ARCH) $(MAKE) -f ../Makefile.`uname -s` ARCH=$(ARCH) UNAME_RELEASE=`uname -r | sed 
's/\./_/g'` $@ - + @echo "code signing all the binaries under bin-$(ARCH) and apple/bin-$(ARCH)" + @for file in $(abspath bin-$(ARCH)/*) $(abspath apple/bin-$(ARCH)/*);do \ + if test -x $$file;then \ + codesign -s - $$file 1>& /dev/null ; \ + fi; \ + done; + @echo "done" + .PHONY: clean clean_subdirs clean_$(SUBDIRS) clean: clean_subdirs - rm -rf bin bin-* wrapper multiview create_stuff bench tattle + rm -rf bin bin-* wrapper multiview create_stuff bench tattle benchDS od_account_create od_account_delete coreos_bench clean_subdirs: for dir in $(SUBDIRS); do $(MAKE) -C $$dir clean; done @@ -94,10 +107,10 @@ $(BINS): bin @chmod +x wrapper @ln -sf ../wrapper $@ - -libMicro.tar: FORCE - @chmod +x ./mk_tarball wrapper - @./mk_tarball $(TARBALL_CONTENTS) +# commenting the lbMicro.tar as it is not being used. +# libMicro.tar: FORCE +# @chmod +x ./mk_tarball wrapper +# @./mk_tarball $(TARBALL_CONTENTS) -FORCE: +# FORCE: diff --git a/tools/tests/libMicro/Makefile.Darwin b/tools/tests/libMicro/Makefile.Darwin index 7eaa1aaf1..d113fc4f2 100644 --- a/tools/tests/libMicro/Makefile.Darwin +++ b/tools/tests/libMicro/Makefile.Darwin @@ -30,25 +30,37 @@ # ident "@(#)Makefile.Darwin 1.5 05/08/04 SMI" # +SDKROOT ?= / +Product=$(shell tconf --product) +Embedded=$(shell tconf --test TARGET_OS_EMBEDDED) -CC= gcc +ifeq "$(Embedded)" "YES" +SDKPATH = $(shell xcodebuild -sdk $(SDKROOT) -version Path) +CFLAGS += -isysroot $(SDKPATH) +endif + +CC = xcrun -sdk $(SDKROOT) gcc #NOPIC= -mdynamic-no-pic ARCH= i386 ifeq "$(strip $(ARCH))" "fat" -ARCH_FLAG= -arch i386 -arch ppc -arch x86_64 +ARCH_FLAG= -arch i386 -arch x86_64 else ARCH_FLAG= -arch $(ARCH) endif -OPT_FLAG= -g +### OPT_FLAG value was modified from '-g' to '-Os' as part of the fix for radar 7508837 +OPT_FLAG= -Os SEMOP_FLAG= -DUSE_SEMOP ### ###CFLAGS= -Os -DUSE_SEMOP -fno-builtin $(NOPIC) $(ARCH_FLAG) -Wall ###extra_CFLAGS= -Os -DUSE_SEMOP -fno-builtin $(NOPIC) $(ARCH_FLAG) -Wall ### -CFLAGS= $(OPT_FLAG) $(SEMOP_FLAG) -DUSE_GETHRTIME 
-fno-builtin $(NOPIC) $(ARCH_FLAG) -Wall +CFLAGS+= $(OPT_FLAG) $(SEMOP_FLAG) -DUSE_GETHRTIME -fno-builtin $(NOPIC) $(ARCH_FLAG) -Wall +ifeq "$(Embedded)" "YES" +CFLAGS += -g -I $(SDKPATH)/System/Library/Frameworks/System.framework/Versions/B/PrivateHeaders/ -F/AppleInternal/Library/Frameworks/ $(MORECFLAGS) +endif extra_CFLAGS= $(OPT_FLAG) $(SEMOP_FLAG) -fno-builtin $(NOPIC) $(ARCH_FLAG) -Wall CPPFLAGS= $(SEMOP_FLAG) -D_REENTRANT -Wall MATHLIB= -lm diff --git a/tools/tests/libMicro/Makefile.com.Darwin b/tools/tests/libMicro/Makefile.com.Darwin old mode 100644 new mode 100755 diff --git a/tools/tests/libMicro/README b/tools/tests/libMicro/README index a4374cb1a..9db9f814c 100644 --- a/tools/tests/libMicro/README +++ b/tools/tests/libMicro/README @@ -88,7 +88,15 @@ Apple-added Benchmarks create_file geekbench_stdlib_write + getaddrinfo_port + getaddrinfo_host + getgrgid + getgrent + getgrnam getppid + getpwnam + getpwuid + getpwent lb_mmtest lm_null_call lmbench_bw_file_rd @@ -107,6 +115,9 @@ Apple-added Benchmarks lmbench_select_tcp lmbench_stat lmbench_write + mbr_check_service_membership + mbr_check_membership + od_query_create_with_node trivial vm_allocate diff --git a/tools/tests/libMicro/apple/Makefile.Darwin b/tools/tests/libMicro/apple/Makefile.Darwin index 3ca3607ae..fe5e573bf 100644 --- a/tools/tests/libMicro/apple/Makefile.Darwin +++ b/tools/tests/libMicro/apple/Makefile.Darwin @@ -27,29 +27,42 @@ # Copyright 2005 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# -# ident "@(#)Makefile.Linux 1.5 05/08/04 SMI" +# ident "@(#)Makefile.Darwin 1.5 05/08/04 SMI" # +SDKROOT ?= / +Product=$(shell tconf --product) +Embedded=$(shell tconf --test TARGET_OS_EMBEDDED) -CC= gcc +ifeq "$(Embedded)" "YES" +SDKPATH = $(shell xcodebuild -sdk $(SDKROOT) -version Path) +CFLAGS += -isysroot $(SDKPATH) +EmbeddedOS=yes +endif + +CC = xcrun -sdk $(SDKROOT) gcc #NOPIC= -mdynamic-no-pic ARCH= i386 ifeq "$(strip $(ARCH))" "fat" -ARCH_FLAG= -arch i386 -arch ppc -arch x86_64 +ARCH_FLAG= -arch i386 -arch x86_64 else ARCH_FLAG= -arch $(ARCH) endif -OPT_FLAG= -g +### OPT_FLAG value was modified from '-g' to '-Os' as part of the fix for radar 7508837 +OPT_FLAG= -Os SEMOP_FLAG= -DUSE_SEMOP ### ###CFLAGS= -Os -DUSE_SEMOP -fno-builtin $(NOPIC) $(ARCH_FLAG) -Wall ###extra_CFLAGS= -Os -DUSE_SEMOP -fno-builtin $(NOPIC) $(ARCH_FLAG) -Wall -### -CFLAGS= $(OPT_FLAG) $(SEMOP_FLAG) -fno-builtin $(NOPIC) $(ARCH_FLAG) -Wall -extra_CFLAGS= $(OPT_FLAG) $(SEMOP_FLAG) -fno-builtin $(NOPIC) $(ARCH_FLAG) -Wall +### Added -DUSE_GETHRTIME to CFLAGS and extra_CFLAGS as part of the fix for radar 7508837 +CFLAGS+= $(OPT_FLAG) $(SEMOP_FLAG) -DUSE_GETHRTIME -fno-builtin $(NOPIC) $(ARCH_FLAG) -Wall +ifeq "$(Embedded)" "YES" +CFLAGS += -g -I $(SDKPATH)/System/Library/Frameworks/System.framework/Versions/B/PrivateHeaders/ -F/AppleInternal/Library/Frameworks/ $(MORECFLAGS) +endif +extra_CFLAGS= $(OPT_FLAG) $(SEMOP_FLAG) -DUSE_GETHRTIME -fno-builtin $(NOPIC) $(ARCH_FLAG) -Wall CPPFLAGS= $(SEMOP_FLAG) -D_REENTRANT -Wall MATHLIB= -lm diff --git a/tools/tests/libMicro/apple/Makefile.benchmarks b/tools/tests/libMicro/apple/Makefile.benchmarks index 0e3cfe2ac..a26d12871 100644 --- a/tools/tests/libMicro/apple/Makefile.benchmarks +++ b/tools/tests/libMicro/apple/Makefile.benchmarks @@ -1,6 +1,4 @@ # -# CDDL HEADER START -# # The contents of this file are subject to the terms # of the Common Development and Distribution License # (the "License"). 
You may not use this file except @@ -53,6 +51,15 @@ ALL = \ lmbench_write \ posix_spawn \ trivial \ - vm_allocate - - + vm_allocate \ + od_query_create_with_node \ + mbr_check_service_membership \ + getpwnam \ + mbr_check_membership \ + getpwuid \ + getgrgid \ + getpwent \ + getgrent \ + getaddrinfo_host \ + getaddrinfo_port \ + getgrnam diff --git a/tools/tests/libMicro/apple/Makefile.com.Darwin b/tools/tests/libMicro/apple/Makefile.com.Darwin index 121473735..d16caca8b 100644 --- a/tools/tests/libMicro/apple/Makefile.com.Darwin +++ b/tools/tests/libMicro/apple/Makefile.com.Darwin @@ -56,3 +56,6 @@ posix_spawn: posix_spawn_bin posix_spawn_bin: posix_spawn_bin.o $(CC) -o posix_spawn_bin $(CFLAGS) posix_spawn_bin.o + +od_query_create_with_node: od_query_create_with_node.o + $(CC) -o $(@) $(@).o $($(@)_EXTRA_DEPS) $(CFLAGS) ../../bin-$(ARCH)/libmicro.a $($(@)_EXTRA_LIBS) $(EXTRA_LIBS) -lpthread -lm -framework CoreFoundation -framework OpenDirectory; cp $@ ../../bin-$(ARCH)/ diff --git a/tools/tests/libMicro/apple/getaddrinfo_host.c b/tools/tests/libMicro/apple/getaddrinfo_host.c new file mode 100644 index 000000000..a7041753f --- /dev/null +++ b/tools/tests/libMicro/apple/getaddrinfo_host.c @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2006 Apple Inc. All Rights Reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. 
+ * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include + +// add additional headers needed here. + +#include "../libmicro.h" + +#if DEBUG +# define debug(fmt, args...) (void) fprintf(stderr, fmt "\n" , ##args) +#else +# define debug(fmt, args...) +#endif + + +// +// Correct use case +// +// getaddrinfo_host -E -L -S -W -B 200 -C 100 -s "server%d" +// +// libMicro default benchmark run options are "-E -L -S -W -C 200" +// +// -B is batch size: loop iteration per each benchmark run. Needs to match # of +// real lookups. This is total number of lookups to issue. +// -C is min sample number: how many benchmark needs to run to get proper sample +// 1 is mimumum, but you get at least 3 benchmark run +// samples. Do not set to zero. Default is 200 for most +// runs in libMicro. +// -h is hostname format: for example, "server-%d.performance.rack" +// this is C language string format that can include %d +// -r hostname digit range in the form of "min-max". 
For example, -r 100-112
+// With -h and -r, resulting hostnames are
+// server-100.performance.rack - server-112.performance.rack
+//
+
+extern int gL1CacheEnabled;
+
+/*
+ * Your state variables should live in the tsd_t struct below
+ */
+typedef struct {
+} tsd_t;
+
+#define HOSTNAME_LEN 125
+static int host_min=-1, host_range=0;
+static char *hostname_format=NULL;
+static char *hostname_list=NULL;
+
+/*
+ * Registers the benchmark-specific option letters ("l:h:r:"), the size of
+ * the per-thread state and the default batch size with the libMicro
+ * framework, and fills in the option help text.
+ * Always returns 0.
+ */
+int
+benchmark_init()
+{
+	debug("benchmark_init");
+	(void) sprintf(lm_optstr, "l:h:r:");
+
+	lm_tsdsize = sizeof (tsd_t);
+	lm_defB = 100;
+
+	(void) sprintf(lm_usage,
+	    "\n ------- getaddrinfo_host specific options (default: *)\n"
+	    " [-h \"hostname format\"]. ie. \"server-%%d.perf\"\n"
+	    " [-r min-max]\n"
+	    "\n" );
+
+	return (0);
+}
+
+
+/*
+ * Parses a "min-max" argument.
+ *
+ * min:    receives the first number of the pair.
+ * offset: receives the number of values in the inclusive range
+ *         (max - min + 1); only written when the argument is valid.
+ * buf:    the option argument; it is not modified.
+ *
+ * Returns 0 on success, -1 (after printing a message) when the argument is
+ * missing, is not of the form min-max, or has max < min.
+ */
+int
+parse_range(int *min, int *offset, char *buf)
+{
+	char *copy, *cursor, *value;
+	int max;
+	int rc = -1;
+
+	debug("parse_range");
+
+	if (!buf) {
+		printf("argument should be in the form of min-max\n");
+		return -1;
+	}
+
+	// strsep() writes into and advances the pointer it is given, so keep
+	// the address returned by strdup() in order to release it afterwards.
+	copy = strdup(buf);
+	if (!copy) {
+		printf("out of memory\n");
+		return -1;
+	}
+	cursor = copy;
+
+	value = strsep(&cursor, "-");
+	*min = atoi(value);
+	debug("min = %d", *min);
+
+	if (!cursor) {
+		// no '-' separator was found
+		printf("argument should be in the form of min-max\n");
+		goto out;
+	}
+
+	value = strsep(&cursor, "-");
+	max = atoi(value);
+	if (max < *min) {
+		printf("max id should be larger than min id\n");
+		goto out;
+	}
+
+	*offset = max - *min + 1; // 1-based
+	debug("range = %d", *offset);
+	rc = 0;
+
+out:
+	free(copy);
+	return rc;
+}
+
+/*
+ * This is where you parse your lower-case arguments.
+ */
+int
+benchmark_optswitch(int opt, char *optarg)
+{
+	debug("benchmark_optswitch");
+
+	switch (opt) {
+	case 'h': // hostname string format
+		free(hostname_format); // drop the value of an earlier -h, if any
+		hostname_format = strdup(optarg);
+		if (!hostname_format) {
+			printf("out of memory\n");
+			return -1;
+		}
+		debug ("hostname format: %s", hostname_format);
+		break;
+
+	case 'l':
+		gL1CacheEnabled = atoi(optarg);
+		break;
+
+	case 'r': // hostname digit range (min-max)
+		return parse_range( &host_min, &host_range, optarg);
+
+	default:
+		return -1;
+	}
+
+	return 0;
+}
+
+
+// Initialize all structures that will be used in benchmark()
+//
+// Builds the table of hostnames (one HOSTNAME_LEN-byte slot per value in the
+// -r range) up front so that benchmark() does no string formatting.
+// Exits the process when -h / -r are missing or a generated name does not
+// fit into its slot.
+//
+int
+benchmark_initrun()
+{
+	int i;
+
+	debug("\nbenchmark_initrun");
+
+	if (host_min == -1 || host_range <= 0) {
+		printf("-r min-max needs to be specified\n");
+		exit (1);
+	}
+
+	if (!hostname_format) {
+		printf("-h hostname_format needs to be specified\n");
+		exit (1);
+	}
+
+	hostname_list = malloc ( (size_t)host_range * HOSTNAME_LEN );
+	if (!hostname_list) {
+		debug("malloc error");
+		exit (1);
+	}
+
+	for (i = 0; i < host_range; i++) {
+		char *slot = &hostname_list[i*HOSTNAME_LEN];
+		// the format comes from the command line, so bound the write
+		int len = snprintf(slot, HOSTNAME_LEN, hostname_format, i+host_min);
+
+		if (len < 0 || len >= HOSTNAME_LEN) {
+			printf("hostname built from the -h format does not fit in %d bytes\n", HOSTNAME_LEN);
+			exit (1);
+		}
+		// debug("hostname: %s", slot);
+	}
+	return (0);
+}
+
+
+/*
+ * Issues lm_optB getaddrinfo() lookups, each for a hostname picked at random
+ * from the table built by benchmark_initrun().  Failed lookups are counted in
+ * res->re_errors; res->re_count receives the number of lookups issued.
+ */
+int
+benchmark(void *tsd, result_t *res)
+{
+	int i, slot, err;
+	struct addrinfo *addi;
+
+	res->re_errors = 0;
+
+	debug("in to benchmark - optB = %i", lm_optB);
+	srandom(getpid());
+
+	for (i = 0; i < lm_optB; i++) {
+		slot = HOSTNAME_LEN * (random() % host_range);
+
+		addi = NULL;
+		err = getaddrinfo( &hostname_list[slot], NULL, NULL, &addi);
+
+		if (err) {
+			// nothing was allocated on failure, so there is nothing to free
+			debug("%s: error: %s", &hostname_list[slot], gai_strerror(err));
+			res->re_errors++;
+		}
+		else {
+			debug("host %s done", &hostname_list[slot]);
+			freeaddrinfo (addi);
+		}
+	}
+	res->re_count = i;
+
+	return (0);
+}
+
+// We need to release all the structures we allocated in benchmark_initrun()
+int
+benchmark_finirun(void *tsd)
+{
+	// tsd_t *ts = (tsd_t *)tsd;
+	debug("benchmark_finirun ");
+
+	free(hostname_list);
+	hostname_list = NULL;
+
+	return (0);
+}
+
+// No benchmark-specific result text: hands back an empty string.
+char *
+benchmark_result()
+{
+	static char result = '\0';
+	debug("benchmark_result");
+	return
(&result); +} + diff --git a/tools/tests/libMicro/apple/getaddrinfo_port.c b/tools/tests/libMicro/apple/getaddrinfo_port.c new file mode 100644 index 000000000..846486faa --- /dev/null +++ b/tools/tests/libMicro/apple/getaddrinfo_port.c @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2006 Apple Inc. All Rights Reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include + +// add additional headers needed here. + +#include "../libmicro.h" + +#if DEBUG +# define debug(fmt, args...) (void) fprintf(stderr, fmt "\n" , ##args) +#else +# define debug(fmt, args...) 
+#endif
+
+
+// This exercises "ssh" port
+//
+// Correct use case
+//
+// getaddrinfo_port -E -L -S -W -B 200 -C 100
+//
+// libMicro default benchmark run options are "-E -L -S -W -C 200"
+//
+// -B is batch size: loop iteration per each benchmark run. Needs to match # of
+// real lookups. This is total number of lookups to issue.
+// -C is min sample number: how many benchmark needs to run to get proper sample
+// 1 is minimum, but you get at least 3 benchmark run
+// samples. Do not set to zero. Default is 200 for most
+// runs in libMicro.
+//
+
+extern int gL1CacheEnabled;
+
+/*
+ * Your state variables should live in the tsd_t struct below
+ */
+typedef struct {
+} tsd_t;
+
+
+/*
+ * Registers the benchmark-specific option letters ("l:"), the size of the
+ * per-thread state and the default batch size with the libMicro framework.
+ * Always returns 0.
+ */
+int
+benchmark_init()
+{
+	debug("benchmark_init");
+
+	(void) sprintf(lm_optstr, "l:");
+	lm_tsdsize = sizeof (tsd_t);
+	lm_defB = 100;
+
+	return (0);
+}
+
+
+/*
+ * This is where you parse your lower-case arguments.
+ */
+int
+benchmark_optswitch(int opt, char *optarg)
+{
+	debug("benchmark_optswitch");
+
+	switch (opt) {
+	case 'l':
+		gL1CacheEnabled = atoi(optarg);
+		break;
+	}
+
+	return 0;
+}
+
+
+// Initialize all structures that will be used in benchmark()
+//
+int
+benchmark_initrun()
+{
+	debug("\nbenchmark_initrun");
+
+	return (0);
+}
+
+
+/*
+ * Issues lm_optB getaddrinfo() lookups of the "ssh" service name.
+ * Failed lookups are counted in res->re_errors; res->re_count receives the
+ * number of lookups issued.
+ */
+int
+benchmark(void *tsd, result_t *res)
+{
+	int i, err;
+	struct addrinfo *addi;
+
+	res->re_errors = 0;
+
+	debug("in to benchmark - optB = %i", lm_optB);
+	for (i = 0; i < lm_optB; i++) {
+
+		addi = NULL;
+		err = getaddrinfo(NULL, "ssh", NULL, &addi);
+
+		if (err) {
+			// nothing was allocated on failure, so there is nothing to free
+			debug("error: %s", gai_strerror(err));
+			res->re_errors++;
+		}
+		else {
+			freeaddrinfo (addi);
+		}
+	}
+	res->re_count = i;
+
+	return (0);
+}
+
+// We need to release all the structures we allocated in benchmark_initrun()
+int
+benchmark_finirun(void *tsd)
+{
+	// tsd_t *ts = (tsd_t *)tsd;
+	debug("benchmark_finirun ");
+
+	return (0);
+}
+
+// No benchmark-specific result text: hands back an empty string.
+char *
+benchmark_result()
+{
+	static char result = '\0';
+	debug("benchmark_result");
+	return (&result);
+}
+
diff --git
a/tools/tests/libMicro/apple/getgrent.c b/tools/tests/libMicro/apple/getgrent.c new file mode 100644 index 000000000..321bbed87 --- /dev/null +++ b/tools/tests/libMicro/apple/getgrent.c @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2006 Apple Inc. All Rights Reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include + +// add additional headers needed here. + +#include "../libmicro.h" +#include + +#if DEBUG +# define debug(fmt, args...) (void) fprintf(stderr, fmt "\n" , ##args) +#else +# define debug(fmt, args...) 
+#endif
+
+
+// Correct use case
+//
+// getgrent -E -L -S -W -B 200 -C 100
+//
+// libMicro default benchmark run options are "-E -L -S -W -C 200"
+//
+// -B is batch size: loop iteration per each benchmark run. Needs to match # of
+// real lookups. This is total number of lookups to issue.
+// -C is min sample number: how many benchmark needs to run to get proper sample
+// 1 is minimum, but you get at least 3 benchmark run
+// samples. Do not set to zero. Default is 200 for most
+// runs in libMicro.
+//
+
+extern int gL1CacheEnabled;
+
+/*
+ * Your state variables should live in the tsd_t struct below
+ */
+typedef struct {
+} tsd_t;
+
+
+/*
+ * Registers the benchmark-specific option letters ("l:"), the size of the
+ * per-thread state and the default batch size with the libMicro framework.
+ * Always returns 0.
+ */
+int
+benchmark_init()
+{
+	debug("benchmark_init");
+
+	(void) sprintf(lm_optstr, "l:");
+	lm_tsdsize = sizeof (tsd_t);
+	lm_defB = 100;
+
+	return (0);
+}
+
+
+/*
+ * This is where you parse your lower-case arguments.
+ */
+int
+benchmark_optswitch(int opt, char *optarg)
+{
+	debug("benchmark_optswitch");
+
+	switch (opt) {
+	case 'l':
+		gL1CacheEnabled = atoi(optarg);
+		break;
+	}
+
+	return 0;
+}
+
+
+// Initialize all structures that will be used in benchmark()
+//
+int
+benchmark_initrun()
+{
+	debug("\nbenchmark_initrun");
+
+	return (0);
+}
+
+
+/*
+ * Calls getgrent() lm_optB times.  A NULL result with errno set is counted
+ * in res->re_errors; a NULL result with errno clear is treated as the end of
+ * the enumeration and the group database is rewound.  res->re_count receives
+ * the number of calls issued.
+ */
+int
+benchmark(void *tsd, result_t *res)
+{
+	int i;
+	struct group *grp;
+
+	res->re_errors = 0;
+
+	debug("in to benchmark - optB = %i", lm_optB);
+	for (i = 0; i < lm_optB; i++) {
+
+		errno = 0; // this is needed explicitly due to getgrent() design
+		grp = getgrent();
+
+		if (!grp) {
+			if (errno) {
+				debug("error: %s", strerror(errno));
+				res->re_errors++;
+			}
+			else {
+				// will not be counted as error
+				setgroupent(1); // rewind to the beginning of the group database
+			}
+		}
+		else {
+			debug("gr_name: %s", grp->gr_name);
+		}
+	}
+	res->re_count = i;
+
+	return (0);
+}
+
+// We need to release all the structures we allocated in benchmark_initrun()
+int
+benchmark_finirun(void *tsd)
+{
+	// tsd_t *ts = (tsd_t *)tsd;
+	debug("benchmark_finirun ");
+
+	// close the group database left open by getgrent()/setgroupent()
+	endgrent();
+
+	return (0);
+}
+
+char *
+benchmark_result() +{ + static char result = '\0'; + debug("benchmark_result"); + return (&result); +} + diff --git a/tools/tests/libMicro/apple/getgrgid.c b/tools/tests/libMicro/apple/getgrgid.c new file mode 100644 index 000000000..f49925d18 --- /dev/null +++ b/tools/tests/libMicro/apple/getgrgid.c @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2006 Apple Inc. All Rights Reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include + +// add additional headers needed here. + +#include "../libmicro.h" +#include +#include + +#if DEBUG +# define debug(fmt, args...) (void) fprintf(stderr, fmt "\n" , ##args) +#else +# define debug(fmt, args...) 
+#endif
+
+
+// Correct use case
+//
+// getgrgid -E -L -S -W -B 200 -C 10 -g 1211-1213
+//
+// libMicro default benchmark run options are "-E -L -S -W -C 200"
+//
+// -B is batch size: loop iteration per each benchmark run. Needs to match # of
+// real lookups. This is total number of lookups to issue.
+// -C is min sample number: how many benchmark needs to run to get proper sample
+// 1 is minimum, but you get at least 3 benchmark run
+// samples. Do not set to zero. Default is 200 for most
+// runs in libMicro.
+// -g gid range in the form of "min-max". For example, -g 1211-1213
+//
+
+extern int gL1CacheEnabled;
+
+/*
+ * Your state variables should live in the tsd_t struct below
+ */
+typedef struct {
+} tsd_t;
+
+// temporary buffer size
+#define BUFSIZE 200
+#define INVALID_ID (-1)
+
+static gid_t gid_min = INVALID_ID;
+static int gid_range = 0; // number of gids in the range: gid_max = gid_min + gid_range - 1
+
+/*
+ * Registers the benchmark-specific option letters ("l:g:"), the size of the
+ * per-thread state and the default batch size with the libMicro framework,
+ * and fills in the option help text.
+ * Always returns 0.
+ */
+int
+benchmark_init()
+{
+	debug("benchmark_init");
+	(void) sprintf(lm_optstr, "l:g:");
+
+	lm_tsdsize = sizeof (tsd_t);
+	lm_defB = 100;
+
+	(void) sprintf(lm_usage,
+	    "\n ------- getgrgid specific options (default: *)\n"
+	    " [-g GID range (min-max)]\n"
+	    "\n" );
+	return (0);
+}
+
+
+/*
+ * Parses a "min-max" argument.
+ *
+ * min:    receives the first number of the pair.
+ * offset: receives the number of values in the inclusive range
+ *         (max - min + 1); only written when the argument is valid.
+ * buf:    the option argument; it is not modified.
+ *
+ * Returns 0 on success, -1 (after printing a message) when the argument is
+ * missing, is not of the form min-max, or has max < min.
+ */
+int
+parse_range(gid_t *min, int *offset, char *buf)
+{
+	char *copy, *cursor, *value;
+	int lo, hi;
+	int rc = -1;
+
+	debug("parse_range");
+
+	if (!buf) {
+		printf("argument should be in the form of min-max\n");
+		return -1;
+	}
+
+	// strsep() writes into and advances the pointer it is given, so keep
+	// the address returned by strdup() in order to release it afterwards.
+	copy = strdup(buf);
+	if (!copy) {
+		printf("out of memory\n");
+		return -1;
+	}
+	cursor = copy;
+
+	value = strsep(&cursor, "-");
+	lo = atoi(value);
+	*min = lo;
+	debug("min = %d", lo);
+
+	if (!cursor) {
+		// no '-' separator was found
+		printf("argument should be in the form of min-max\n");
+		goto out;
+	}
+
+	value = strsep(&cursor, "-");
+	hi = atoi(value);
+	// compare as plain ints: comparing an int with the gid_t behind *min
+	// would convert the int to the type of gid_t first
+	if (hi < lo) {
+		printf("max id should be larger than min id\n");
+		goto out;
+	}
+
+	*offset = hi - lo + 1;
+	debug("range = %d", *offset);
+	rc = 0;
+
+out:
+	free(copy);
+	return rc;
+}
+
+/*
+ * This is where you parse your lower-case arguments.
+ */
+int
+benchmark_optswitch(int opt, char *optarg)
+{
+	debug("benchmark_optswitch");
+
+	switch (opt) {
+	case 'l':
+		gL1CacheEnabled = atoi(optarg);
+		break;
+
+	case 'g': // GID range
+		return parse_range( &gid_min, &gid_range, optarg);
+
+	default:
+		return -1;
+	}
+
+	return 0;
+}
+
+
+// Initialize all structures that will be used in benchmark()
+// moved template init from benchmark_initworker -> benchmark_initrun
+//
+// Nothing is allocated here; this only checks that a usable -g range was
+// given, because benchmark() computes random() % gid_range and a range of
+// zero would be a division by zero.
+//
+int
+benchmark_initrun()
+{
+	debug("\nbenchmark_initrun");
+
+	if (gid_range <= 0) {
+		printf("-g min-max needs to be specified\n");
+		exit (1);
+	}
+
+	return (0);
+}
+
+
+/*
+ * Issues lm_optB group lookups by gid, each for a gid picked at random from
+ * the -g range.  getgrgid_r() is used when lm_optT > 1, getgrgid() otherwise.
+ * Both lookup errors and "not found" results are counted in res->re_errors;
+ * res->re_count receives the number of lookups issued.
+ */
+int
+benchmark(void *tsd, result_t *res)
+{
+	int i, err;
+	struct group *grp = NULL;
+
+	res->re_errors = 0;
+
+	debug("in to benchmark - optB = %i", lm_optB);
+	for (i = 0; i < lm_optB; i++) {
+		gid_t gid = gid_min + random() % gid_range ;
+
+		if (lm_optT > 1) {
+			struct group gd;
+			struct group *grp_ptr = &gd;
+			struct group *tmp_ptr;
+			char gbuf[BUFSIZE];
+
+			err = getgrgid_r( gid, grp_ptr, gbuf, BUFSIZE, &tmp_ptr);
+			// non-zero err means failure and NULL result ptr means no
+			// matching entry
+			if (err) {
+				debug("error: GID %d -> %s", gid, strerror(err));
+				res->re_errors++;
+			}
+			else if (!tmp_ptr) {
+				debug("not found: GID %d", gid);
+				res->re_errors++;
+			}
+		}
+		else {
+			errno = 0;
+			grp = getgrgid( gid );
+
+			if (!grp) {
+				if (errno) {
+					debug("error: GID %d -> %s", gid, strerror(errno));
+					res->re_errors++;
+				}
+				else {
+					debug("not found: GID %d", gid);
+					res->re_errors++;
+				}
+			}
+		}
+	}
+	res->re_count = i;
+
+	return (0);
+}
+
+// We need to release all the structures we allocated in benchmark_initrun()
+int
+benchmark_finirun(void *tsd)
+{
+	// tsd_t *ts = (tsd_t *)tsd;
+	debug("benchmark_finirun ");
+
+	return (0);
+}
+
+// No benchmark-specific result text: hands back an empty string.
+char *
+benchmark_result()
+{
+	static char result = '\0';
+	debug("benchmark_result");
+	return (&result);
+}
+
diff --git a/tools/tests/libMicro/apple/getgrnam.c b/tools/tests/libMicro/apple/getgrnam.c
new file mode 100644
index 000000000..7d50a488b
--- /dev/null
+++ b/tools/tests/libMicro/apple/getgrnam.c
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2006 Apple Inc. All Rights Reserved.
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include + +// add additional headers needed here. + +#include "../libmicro.h" +#include +#include + +#if DEBUG +# define debug(fmt, args...) (void) fprintf(stderr, fmt "\n" , ##args) +#else +# define debug(fmt, args...) +#endif + + +// Correct use case +// +// getgrnam -E -L -S -W -B 200 -C 10 -r 10 +// +// libMicro default benchmark run options are "-E -L -S -W -C 200" +// +// -B is batch size: loop iteration per each benchmark run. Needs to match # of +// real lookups. This is total number of lookups to issue. 
+// -C is min sample number: how many benchmark needs to run to get proper sample
+//                          1 is minimum, but you get at least 3 benchmark run
+//                          samples. Do not set to zero. Default is 200 for most
+//                          runs in libMicro.
+// -r is the number of total groups (from "ds_test_group1" to "ds_test_group#"
+//    with the default prefix)
+
+extern int gL1CacheEnabled;
+
+/*
+ * Your state variables should live in the tsd_t struct below
+ */
+typedef struct {
+} tsd_t;
+
+// temporary buffer size
+#define BUFSIZE 200
+
+// the number of record lookup to issue is covered by standard option optB
+static int optRecords = 10; // the number of total records
+
+// Prefix of the group names to look up (override with -g)
+static char *default_gprefix = "ds_test_group";
+
+// size of one slot in grpname_list, including the terminating NUL
+#define GROUPNAME_LEN 30
+static char *grpname_list;
+
+// Register the option string, per-thread state size, default batch size and
+// usage text with libMicro.
+int
+benchmark_init()
+{
+    debug("benchmark_init");
+    (void) sprintf(lm_optstr, "l:r:g:");
+
+    lm_tsdsize = sizeof (tsd_t);
+    lm_defB = 100;
+
+    (void) sprintf(lm_usage,
+        "\n ------- getgrnam specific options (default: *)\n"
+        " [-r total number of group records (10*)]\n"
+        " [-g group prefix(ds_test_group)]\n"
+        "\n" );
+    return (0);
+}
+
+/*
+ * This is where you parse your lower-case arguments.
+ * Returns 0 when the option was accepted, -1 otherwise.
+ */
+int
+benchmark_optswitch(int opt, char *optarg)
+{
+    debug("benchmark_optswitch");
+
+    switch (opt) {
+    case 'r':   // total number of records. default is 10
+        optRecords = atoi(optarg);
+        debug("optRecords = %d\n", optRecords);
+        // benchmark() computes random() % optRecords and
+        // benchmark_initrun() sizes its table from it
+        if (optRecords <= 0) {
+            printf("number of group records should be larger than 0\n");
+            return -1;
+        }
+        break;
+
+    case 'l':
+        gL1CacheEnabled = atoi(optarg);
+        break;
+
+    case 'g':   // base name for the groups to use
+        default_gprefix = strdup(optarg);
+        if (!default_gprefix) {
+            printf("Error: no memory available for strdup\n");
+            return -1;
+        }
+        debug("default_gprefix = %s\n", default_gprefix);
+        break;
+
+    default:
+        return -1;
+    }
+
+    return 0;
+}
+
+
+// Initialize all structures that will be used in benchmark()
+// moved template init from benchmark_initworker -> benchmark_initrun
+// since grpname_list is static across threads and processes
+//
+// Returns -1 when a generated group name does not fit in GROUPNAME_LEN bytes.
+int
+benchmark_initrun()
+{
+    int i;
+
+    debug("\nbenchmark_initrun");
+
+    // create an array of group names to use in benchmark before their use
+    // realtime generation in benchmark effects performance measurements
+    grpname_list = calloc((size_t) optRecords, GROUPNAME_LEN);
+    if (!grpname_list) {
+        debug ("malloc error");
+        exit (1);
+    }
+
+    for (i = 0; i < optRecords; i++) {
+        char *name = &grpname_list[i * GROUPNAME_LEN];
+        int len = snprintf(name, GROUPNAME_LEN, "%s%d", default_gprefix, i + 1);
+
+        // the prefix comes from the command line: refuse names that would
+        // have overrun their slot instead of silently truncating them
+        if (len < 0 || len >= GROUPNAME_LEN) {
+            printf("group name %s%d does not fit in %d bytes\n",
+                default_gprefix, i + 1, GROUPNAME_LEN);
+            free(grpname_list);
+            grpname_list = NULL;
+            return (-1);
+        }
+        debug("creating group name %s", name);
+    }
+
+    return (0);
+}
+
+
+// Issue lm_optB lookups of randomly chosen names from grpname_list.  When
+// lm_optT > 1 the reentrant getgrnam_r() is exercised, otherwise getgrnam().
+// Lookup failures and missing groups are both counted in res->re_errors.
+int
+benchmark(void *tsd, result_t *res)
+{
+    int i, err;
+    struct group *grp = NULL;
+
+    res->re_errors = 0;
+
+    debug("in to benchmark - optB = %i", lm_optB);
+    srandom(getpid());
+
+    for (i = 0; i < lm_optB; i++) {
+        int index = (random() % optRecords) * GROUPNAME_LEN;
+
+        if (lm_optT > 1) {
+            struct group gd;
+            struct group *tmp_ptr;
+            char gbuf[BUFSIZE];
+
+            err = getgrnam_r(&grpname_list[index], &gd, gbuf, BUFSIZE, &tmp_ptr);
+            // non-zero err means failure and NULL result ptr means no matching
+            // entry
+            if (err) {
+                debug("error: %s -> %s", &grpname_list[index], strerror(err));
+                res->re_errors++;
+            }
+            else if (!tmp_ptr) {
+                debug("not found: %s", &grpname_list[index] );
+                res->re_errors++;
+            }
+        }
+        else {
+            // getgrnam() reports both failure and "no entry" as NULL, so
+            // errno has to be cleared first to tell them apart
+            errno = 0;
+            grp = getgrnam(&grpname_list[index]);
+
+            if (!grp) {
+                if (errno) {
+                    debug("error: %s -> %s", &grpname_list[index], strerror(errno));
+                }
+                else {
+                    debug("not found: %s", &grpname_list[index] );
+                }
+                res->re_errors++;
+            }
+        }
+    }
+    res->re_count = i;
+
+    return (0);
+}
+
+// We need to release all the structures we allocated in benchmark_initrun()
+int
+benchmark_finirun(void *tsd)
+{
+    // tsd_t *ts = (tsd_t *)tsd;
+    debug("benchmark_finirun: deallocating structures");
+
+    free (grpname_list);
+    grpname_list = NULL;
+
+    return (0);
+}
+
+// This benchmark reports no extra result text: return an empty string.
+char *
+benchmark_result()
+{
+    static char result = '\0';
+    debug("benchmark_result");
+    return (&result);
+}
+
diff --git a/tools/tests/libMicro/apple/getpwent.c b/tools/tests/libMicro/apple/getpwent.c
new file mode 100644
index 000000000..49df77e21
--- /dev/null
+++ b/tools/tests/libMicro/apple/getpwent.c
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2006 Apple Inc. All Rights Reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+// add additional headers needed here.
+
+#include "../libmicro.h"
+#include
+
+#if DEBUG
+# define debug(fmt, args...) (void) fprintf(stderr, fmt "\n" , ##args)
+#else
+# define debug(fmt, args...)
+#endif
+
+
+// Correct use case
+//
+// getpwent -E -L -S -W -B 200 -C 100
+//
+// libMicro default benchmark run options are "-E -L -S -W -C 200"
+//
+// -B is batch size: loop iteration per each benchmark run. Needs to match # of
+//                   real lookups. This is total number of lookups to issue.
+// -C is min sample number: how many benchmark needs to run to get proper sample
+//                          1 is minimum, but you get at least 3 benchmark run
+//                          samples. Do not set to zero. Default is 200 for most
+//                          runs in libMicro.
+//
+
+extern int gL1CacheEnabled;
+
+/*
+ * Your state variables should live in the tsd_t struct below
+ */
+typedef struct {
+} tsd_t;
+
+
+// Register the option string, per-thread state size and default batch size
+// with libMicro.
+int
+benchmark_init()
+{
+    debug("benchmark_init");
+
+    (void) sprintf(lm_optstr, "l:");
+    lm_tsdsize = sizeof (tsd_t);
+    lm_defB = 100;
+
+    return (0);
+}
+
+
+/*
+ * This is where you parse your lower-case arguments.
+ * Returns 0 when the option was accepted, -1 for an unknown option (the
+ * same contract as the other apple/ lookup benchmarks).
+ */
+int
+benchmark_optswitch(int opt, char *optarg)
+{
+    debug("benchmark_optswitch");
+
+    switch (opt) {
+    case 'l':
+        gL1CacheEnabled = atoi(optarg);
+        break;
+
+    default:
+        return -1;
+    }
+
+    return 0;
+}
+
+
+// Initialize all structures that will be used in benchmark()
+//
+int
+benchmark_initrun()
+{
+    debug("\nbenchmark_initrun");
+
+    return (0);
+}
+
+
+// Walk the passwd database with lm_optB getpwent() calls, rewinding whenever
+// the end is reached.  Only calls that fail with errno set are counted in
+// res->re_errors.
+int
+benchmark(void *tsd, result_t *res)
+{
+    int i;
+    struct passwd *passwd;
+
+    res->re_errors = 0;
+
+    debug("in to benchmark - optB = %i", lm_optB);
+    for (i = 0; i < lm_optB; i++) {
+
+        errno = 0; // this is needed explicitly due to getpwent() design
+        passwd = getpwent();
+
+        if (passwd) {
+            debug("pw_name: %s", passwd->pw_name);
+        }
+        else if (errno) {
+            debug("error: %s", strerror(errno));
+            res->re_errors++;
+        }
+        else {
+            // end of the database: not counted as a libmicro error
+            setpassent(1); // rewind to the beginning of passwd file
+        }
+    }
+    res->re_count = i;
+
+    return (0);
+}
+
+// Nothing is allocated in benchmark_initrun(); only close the passwd
+// database that benchmark() left open through getpwent()/setpassent(1).
+int
+benchmark_finirun(void *tsd)
+{
+    // tsd_t *ts = (tsd_t *)tsd;
+    debug("benchmark_finirun ");
+
+    endpwent();
+
+    return (0);
+}
+
+// This benchmark reports no extra result text: return an empty string.
+char *
+benchmark_result()
+{
+    static char result = '\0';
+    debug("benchmark_result");
+    return (&result);
+}
+
diff --git a/tools/tests/libMicro/apple/getpwnam.c b/tools/tests/libMicro/apple/getpwnam.c
new file mode 100644
index 000000000..3db5e6ca4
--- /dev/null
+++ b/tools/tests/libMicro/apple/getpwnam.c
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2006 Apple Inc. All Rights Reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include + +// add additional headers needed here. + +#include "../libmicro.h" +#include +#include +#include + +#if DEBUG +# define debug(fmt, args...) (void) fprintf(stderr, fmt "\n" , ##args) +#else +# define debug(fmt, args...) +#endif + + +// Correct use case +// +// getpwnam -E -L -S -W -B 200 -C 10 -c 100 -r 300 -U test_user_ +// +// libMicro default benchmark run options are "-E -L -S -W -C 200" +// +// -B is batch size: loop iteration per each benchmark run. Needs to match # of +// real lookups. This is total number of lookups to issue. +// -C is min sample number: how many benchmark needs to run to get proper sample +// 1 is mimumum, but you get at least 3 benchmark run +// samples. Do not set to zero. Default is 200 for most +// runs in libMicro. +// -r is the number of total users +// -c is the cache hit rate for lookup. set to 10%, you need -c 10. +// ie. 
-B 100 -c 50 -r 1000 -C 200 (out of 1000 records, I want 50%
+//        lookup, and batch size is 100.
+//        To get 50% cache hit rate, you need 500 record lookups.
+//        Batch size will be adjusted to 500 to get 500 record
+//        lookup in each benchmark. If -r size is smaller than -B,
+//        then -B will not be adjusted.
+// -u prefix: the user name prefix to use in front the user number as the
+//            login name to lookup
+
+extern int gL1CacheEnabled;
+
+/*
+ * Your state variables should live in the tsd_t struct below
+ */
+typedef struct {
+} tsd_t;
+
+// temporary buffer size
+#define BUFSIZE 200
+
+// the number of record lookup to issue is covered by standard option optB
+static int optRecords = 100; // the number of total records
+static int optCachehit = 100; // specify cache hit rate (% of record re-lookup)
+
+// This will use local users (local_test_*)
+static char *default_uprefix = "local_test_";
+
+// size of one slot in username_list, including the terminating NUL
+#define USERNAME_LEN 20
+static char *username_list;
+
+// Register the option string, per-thread state size, default batch size and
+// usage text with libMicro.
+int
+benchmark_init()
+{
+    debug("benchmark_init");
+    (void) sprintf(lm_optstr, "l:c:r:u:");
+
+    lm_tsdsize = sizeof (tsd_t);
+    lm_defB = 100;
+
+    (void) sprintf(lm_usage,
+        "\n ------- getpwnam specific options (default: *)\n"
+        " [-c hitrate%% (100%%*)]\n"
+        " [-r total number of records (100*)]\n"
+        " [-u username_prefix (local_test_)]\n"
+        "\n" );
+    return (0);
+}
+
+/*
+ * This is where you parse your lower-case arguments.
+ * Returns 0 when the option was accepted, -1 otherwise.
+ */
+int
+benchmark_optswitch(int opt, char *optarg)
+{
+    debug("benchmark_optswitch");
+
+    switch (opt) {
+    case 'c':   // cache hit rate. 100% means lookup the same records over and over
+        optCachehit = atoi(optarg);
+        debug("optCachehit = %d\n", optCachehit);
+        if (optCachehit > 100 || optCachehit < 0) {
+            printf("cache hit rate should be in between 0%% and 100%%");
+            return (-1);
+        }
+        break;
+
+    case 'l':
+        gL1CacheEnabled = atoi(optarg);
+        break;
+
+    case 'r':   // total number of records. default is 100
+        optRecords = atoi(optarg);
+        debug("optRecords = %d\n", optRecords);
+        // benchmark() computes random() % optRecords and
+        // benchmark_initrun() sizes its table from it
+        if (optRecords <= 0) {
+            printf("number of records should be larger than 0\n");
+            return (-1);
+        }
+        break;
+
+    case 'u':
+        default_uprefix = strdup(optarg);
+        if (!default_uprefix) {
+            printf("Error: no memory available for strdup\n");
+            return (-1);
+        }
+        debug("default_uprefix = %s\n", default_uprefix);
+        break;
+
+    default:
+        return -1;
+    }
+
+    return 0;
+}
+
+
+// Initialize all structures that will be used in benchmark()
+// moved template init from benchmark_initworker -> benchmark_initrun
+// since username_list is static across threads and processes
+//
+// Returns -1 when a generated user name does not fit in USERNAME_LEN bytes.
+
+int
+benchmark_initrun()
+{
+    int i;
+
+    debug("\nbenchmark_initrun");
+
+    // Adjust # of record lookups to reflect cache hit rate
+    if (optCachehit < 100) {
+        optRecords = (int) ((float) optRecords * ((float) optCachehit / 100));
+        debug("# of records adjusted to %d for cache hit rate %d%%\n", optRecords, optCachehit);
+    }
+
+    // -c 0 (or a small -r) scales the record count down to zero, but
+    // benchmark() needs at least one record for random() % optRecords
+    if (optRecords < 1) {
+        optRecords = 1;
+        debug("# of records raised to %d\n", optRecords);
+    }
+
+    // if batch size (one benchmark run) is less than the number records, adjust
+    // it to match the number record lookups in one batch run
+    if (lm_optB < optRecords) {
+        lm_optB = optRecords;
+        debug("Adjusting batch size to %d to match the lookups required in benchmark run\n", lm_optB);
+    }
+
+    // create an array of usernames to use in benchmark before their use
+    // realtime generation in benchmark effects performance measurements
+    username_list = calloc((size_t) optRecords, USERNAME_LEN);
+    if (!username_list) {
+        debug ("malloc error");
+        exit (1);
+    }
+
+    for (i = 0; i < optRecords; i++) {
+        char *name = &username_list[i * USERNAME_LEN];
+        int len = snprintf(name, USERNAME_LEN, "%s%d", default_uprefix, i + 1);
+
+        // the prefix comes from the command line: refuse names that would
+        // have overrun their slot instead of silently truncating them
+        if (len < 0 || len >= USERNAME_LEN) {
+            printf("user name %s%d does not fit in %d bytes\n",
+                default_uprefix, i + 1, USERNAME_LEN);
+            free(username_list);
+            username_list = NULL;
+            return (-1);
+        }
+        // debug("creating username %s", name);
+    }
+
+    return (0);
+}
+
+
+// Issue lm_optB lookups of randomly chosen names from username_list.  When
+// lm_optT > 1 the reentrant getpwnam_r() is exercised, otherwise getpwnam().
+// Lookup failures and missing users are both counted in res->re_errors.
+int
+benchmark(void *tsd, result_t *res)
+{
+    int i, err;
+    struct passwd *passwd = NULL;
+
+    res->re_errors = 0;
+
+    debug("in to benchmark - optB = %i", lm_optB);
+    for (i = 0; i < lm_optB; i++) {
+        int index = (random() % optRecords) * USERNAME_LEN;
+
+        if (lm_optT > 1) {
+            struct passwd pd;
+            struct passwd *tmp_ptr;
+            char pbuf[BUFSIZE];
+
+            err = getpwnam_r(&username_list[index], &pd, pbuf, BUFSIZE, &tmp_ptr);
+            if (err) {
+                // debug(), not printf(): no unconditional I/O in the timed loop
+                debug("error: %s -> %s", &username_list[index], strerror(err));
+                res->re_errors++;
+            }
+            else if (!tmp_ptr) {
+                debug("not found: %s", &username_list[index]);
+                res->re_errors++;
+            }
+        }
+        else {
+            // getpwnam() reports both failure and "no entry" as NULL, so
+            // errno has to be cleared first to tell them apart
+            errno = 0;
+            passwd = getpwnam(&username_list[index]);
+
+            if (!passwd) {
+                if (errno) {
+                    debug("error: %s -> %s", &username_list[index], strerror(errno));
+                }
+                else {
+                    debug("not found: %s", &username_list[index]);
+                }
+                res->re_errors++;
+            }
+        }
+    }
+    res->re_count = i;
+
+    return (0);
+}
+
+// We need to release all the structures we allocated in benchmark_initrun()
+int
+benchmark_finirun(void *tsd)
+{
+    // tsd_t *ts = (tsd_t *)tsd;
+    debug("benchmark_finirun: deallocating structures");
+
+    free (username_list);
+    username_list = NULL;
+
+    return (0);
+}
+
+// This benchmark reports no extra result text: return an empty string.
+char *
+benchmark_result()
+{
+    static char result = '\0';
+    debug("benchmark_result");
+    return (&result);
+}
+
diff --git a/tools/tests/libMicro/apple/getpwuid.c b/tools/tests/libMicro/apple/getpwuid.c
new file mode 100644
index 000000000..c33149125
--- /dev/null
+++ b/tools/tests/libMicro/apple/getpwuid.c
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2006 Apple Inc. All Rights Reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include + +// add additional headers needed here. + +#include "../libmicro.h" +#include + +#if DEBUG +# define debug(fmt, args...) (void) fprintf(stderr, fmt "\n" , ##args) +#else +# define debug(fmt, args...) +#endif + + +// Correct use case +// +// getpwuid -E -L -S -W -B 200 -C 10 -c 100 -u 5000-5200 +// +// libMicro default benchmark run options are "-E -L -S -W -C 200" +// +// -B is batch size: loop iteration per each benchmark run. Needs to match # of +// real lookups. This is total number of lookups to issue. +// -C is min sample number: how many benchmark needs to run to get proper sample +// 1 is mimumum, but you get at least 3 benchmark run +// samples. Do not set to zero. Default is 200 for most +// runs in libMicro. +// -c is the cache hit rate for lookup. set to 10%, you need -c 10. +// ie. -B 100 -c 50 -u 5000-5199 +// out of 200 UIDs, I want 50% cache hit, and batch size is 100. +// -u uid range in the form of "min-max". 
For example, -u 5000-5200
+//
+
+extern int gL1CacheEnabled;
+
+/*
+ * Your state variables should live in the tsd_t struct below
+ */
+typedef struct {
+} tsd_t;
+
+// temporary buffer size
+#define BUFSIZE 200
+#define INVALID_ID -1
+
+static uid_t uid_min = INVALID_ID;
+static int uid_range = 0; // number of UIDs: uid_max = uid_min + uid_range - 1
+
+// the number of record lookup to issue is covered by standard option optB
+static int optCachehit = 100; // specify cache hit rate (% of record re-lookup)
+
+// Register the option string, per-thread state size, default batch size and
+// usage text with libMicro.
+int
+benchmark_init()
+{
+    debug("benchmark_init");
+    (void) sprintf(lm_optstr, "l:c:u:");
+
+    lm_tsdsize = sizeof (tsd_t);
+    lm_defB = 100;
+
+    (void) sprintf(lm_usage,
+        "\n ------- getpwuid specific options (default: *)\n"
+        " [-c hitrate%% (100%%*)]\n"
+        " [-u UID range (min-max)]\n"
+        " [-l]\n"
+        "\n" );
+    return (0);
+}
+
+// Parse "min-max" from buf: *min receives the lower bound and *offset the
+// number of ids in the inclusive range (max - min + 1).
+// Returns 0 on success, -1 when the argument is not of the form min-max,
+// when max is smaller than min, or when the working copy cannot be allocated.
+int
+parse_range(uid_t *min, int *offset, char *buf)
+{
+    char *copy, *cursor, *value;
+    int range = 0;
+    int rc = -1;
+
+    debug("parse_range");
+
+    // strsep() advances its argument, so keep the start of the copy in
+    // order to release it on every path
+    copy = strdup(buf);
+    if (!copy) {
+        printf("Error: no memory available for strdup\n");
+        return -1;
+    }
+    cursor = copy;
+
+    value = strsep(&cursor, "-");
+    *min = atoi(value);
+    debug("min = %d", *min);
+    if (!cursor) {
+        printf("argument should be in the form of min-max\n");
+        goto out;
+    }
+
+    value = strsep(&cursor, "-");
+    range = atoi(value);
+    if (range < *min) {
+        printf("max id should be larger than min id\n");
+        goto out;
+    }
+    *offset = range - *min + 1;
+    debug("range = %d", *offset);
+    rc = 0;
+
+out:
+    free(copy);
+    return rc;
+}
+
+/*
+ * This is where you parse your lower-case arguments.
+ * Returns 0 when the option was accepted, -1 otherwise.
+ */
+int
+benchmark_optswitch(int opt, char *optarg)
+{
+    debug("benchmark_optswitch");
+
+    switch (opt) {
+    case 'c':   // cache hit rate. 100% means lookup the same records over and over
+        optCachehit = atoi(optarg);
+        debug("optCachehit = %d\n", optCachehit);
+        if (optCachehit > 100 || optCachehit < 0) {
+            printf("cache hit rate should be in between 0%% and 100%%");
+            return (-1);
+        }
+        break;
+
+    case 'l':
+        gL1CacheEnabled = atoi(optarg);
+        break;
+
+    case 'u':   // UID range
+        return parse_range(&uid_min, &uid_range, optarg);
+
+    default:
+        return -1;
+    }
+
+    return 0;
+}
+
+
+// Initialize all structures that will be used in benchmark()
+// moved template init from benchmark_initworker -> benchmark_initrun
+//
+// Returns -1 when no usable UID range is configured: benchmark() computes
+// random() % uid_range, which must never see a zero divisor.
+int
+benchmark_initrun()
+{
+    uid_t i, range;
+    struct passwd *passwd = NULL;
+
+    debug("\nbenchmark_initrun");
+
+    if (uid_range <= 0) {
+        printf("a valid UID range (-u min-max) needs to be specified\n");
+        return (-1);
+    }
+
+    // To satisfy cache hit rate, lookup cachehit percentage of the UIDs here
+    if (optCachehit < 100) {
+
+        range = (int) ((float) uid_range * ((float) optCachehit / 100));
+        for (i = uid_min; i < uid_min+range; i++)
+            passwd = getpwuid( i );
+    }
+
+    return (0);
+}
+
+
+// Issue lm_optB lookups, each for a random UID taken from
+// uid_min + [0, uid_range).  Lookup failures and missing users are both
+// counted in res->re_errors.
+int
+benchmark(void *tsd, result_t *res)
+{
+    int i, err;
+    struct passwd *passwd = NULL;
+
+    res->re_errors = 0;
+
+    debug("in to benchmark - optB = %i", lm_optB);
+    for (i = 0; i < lm_optB; i++) {
+        uid_t uid = uid_min + random() % uid_range;
+
+        // XXX No need to use getpwuid_r() since getpwuid() is already thread-safe
+        // so it depends on what you want to exercise
+        if (lm_optT > 1) {
+            struct passwd pd;
+            struct passwd *tmp_ptr;
+            char pbuf[BUFSIZE];
+
+            err = getpwuid_r(uid, &pd, pbuf, BUFSIZE, &tmp_ptr);
+            if (err) {
+                debug("error: %s", strerror(err));
+                res->re_errors++;
+            }
+            else if (!tmp_ptr) {
+                debug("not found: UID %d", uid);
+                res->re_errors++;
+            }
+        }
+        else {
+            // getpwuid() reports both failure and "no entry" as NULL, so
+            // errno has to be cleared first to tell them apart
+            errno = 0;
+            passwd = getpwuid(uid);
+
+            if (!passwd) {
+                if (errno) {
+                    debug("error: %s", strerror(errno));
+                }
+                else {
+                    debug("not found: UID %d", uid);
+                }
+                res->re_errors++;
+            }
+        }
+    }
+    res->re_count = i;
+
+    return (0);
+}
+
+// Nothing is allocated in benchmark_initrun(), so there is nothing to release
+int
+benchmark_finirun(void *tsd)
+{
+    // tsd_t *ts = (tsd_t *)tsd;
+    debug("benchmark_finirun ");
+
+    return (0);
+}
+
+// This benchmark reports no extra result text: return an empty string.
+char *
+benchmark_result()
+{
+    static char result = '\0';
+    debug("benchmark_result");
+    return (&result);
+}
+
diff --git a/tools/tests/libMicro/apple/lmbench_bw_mem.c b/tools/tests/libMicro/apple/lmbench_bw_mem.c
index 4b5aa07c3..9963fe637 100644
--- a/tools/tests/libMicro/apple/lmbench_bw_mem.c
+++ b/tools/tests/libMicro/apple/lmbench_bw_mem.c
@@ -76,6 +76,10 @@ #define TRIES 11 // value from bench.h in lmbench #define TYPE int +/* Added as part of the fix for */ +static volatile u_int64_t use_result_dummy; +void use_int(int result) { use_result_dummy += result; } + /* * rd - 4 byte read, 32 byte stride * wr - 4 byte write, 32 byte stride
@@ -214,6 +218,7 @@ rd(iter_t iterations, void *cookie) p += 128; } } + use_int(sum); } #undef DOIT
@@ -257,6 +262,7 @@ rdwr(iter_t iterations, void *cookie) p += 128; } } + use_int(sum); } #undef DOIT
@@ -362,6 +368,7 @@ frd(iter_t iterations, void *cookie) p += 128; } } + use_int(sum); } #undef DOIT
@@ -616,30 +623,30 @@ benchmark(void *tsd, result_t *res) return(-1); } - if (strcmp(opt_what, "cp") || - strcmp(opt_what, "fcp") || strcmp(opt_what, "bcopy")) { + if (STREQ(opt_what, "cp") || + STREQ(opt_what, "fcp") || STREQ(opt_what, "bcopy")) { ts->need_buf2 = 1; } for (i = 0 ; i < lm_optB ; i++) { - if (strcmp(opt_what, "rd")) { + if (STREQ(opt_what, "rd")) { rd( ts->repetitions, tsd ); - } else if (strcmp(opt_what, "wr")) { + } else if (STREQ(opt_what, "wr")) { wr( ts->repetitions, tsd ); - } else if (strcmp(opt_what, "rdwr")) { + } else if (STREQ(opt_what, "rdwr")) { rdwr( ts->repetitions, tsd ); - } else if (strcmp(opt_what, "cp")) { + } else if (STREQ(opt_what, "cp")) { mcp( ts->repetitions, tsd ); - } else if (strcmp(opt_what, "frd")) { + } else if (STREQ(opt_what, "frd")) { frd( 
ts->repetitions, tsd ); - } else if (strcmp(opt_what, "fwr")) { + } else if (STREQ(opt_what, "fwr")) { fwr( ts->repetitions, tsd ); - } else if (strcmp(opt_what, "fcp")) { + } else if (STREQ(opt_what, "fcp")) { fcp( ts->repetitions, tsd ); - } else if (strcmp(opt_what, "bzero")) { + } else if (STREQ(opt_what, "bzero")) { loop_bzero( ts->repetitions, tsd ); - } else if (strcmp(opt_what, "bcopy")) { + } else if (STREQ(opt_what, "bcopy")) { loop_bcopy( ts->repetitions, tsd ); } else { return(-1); diff --git a/tools/tests/libMicro/apple/mbr_check_membership.c b/tools/tests/libMicro/apple/mbr_check_membership.c new file mode 100644 index 000000000..91fbffeca --- /dev/null +++ b/tools/tests/libMicro/apple/mbr_check_membership.c @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2006 Apple Inc. All Rights Reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+// add additional headers needed here.
+
+#include "../libmicro.h"
+#include
+#include
+#include
+
+#if DEBUG
+# define debug(fmt, args...) (void) fprintf(stderr, fmt "\n" , ##args)
+#else
+# define debug(fmt, args...)
+#endif
+
+
+// Correct use case
+//
+// mbr_check_membership -E -L -S -W -B 200 -C 10 -g 1211-1213 -u 5000-5200
+//
+// libMicro default benchmark run options are "-E -C 200 -L -S -W"
+//
+// -B is batch size: loop iteration per each benchmark run. (default: 100)
+// -C is min sample number: how many benchmark needs to run to get proper sample
+//                          1 is minimum, but you get at least 3 benchmark run
+//                          samples. Do not set to zero. Default is 200 for most
+//                          runs in libMicro.
+// -u uid range in the form of "min-max". For example, -u 5000-5200
+// -g gid range or gid
+
+/*
+ * Your state variables should live in the tsd_t struct below
+ */
+typedef struct {
+} tsd_t;
+
+#define INVALID_ID -1
+
+static uid_t uid_min = INVALID_ID;
+static gid_t gid_min = INVALID_ID;
+
+static int uid_range = 0; // uid_max = uid_min + uid_range
+static int gid_range = 0; // gid_max = gid_min + gid_range
+
+static uuid_t *u_uuid_list = NULL; // user uuid list
+static uuid_t *g_uuid_list = NULL; // group uuid list
+
+// Register the option string, per-thread state size, default batch size and
+// usage text with libMicro.
+int
+benchmark_init()
+{
+    debug("benchmark_init");
+    (void) sprintf(lm_optstr, "g:u:");
+
+    lm_tsdsize = sizeof(tsd_t);
+    lm_defB = 100;
+
+    (void) sprintf(lm_usage,
+        "\n ------- mbr_check_membership specific options\n"
+        " [-u UID range (min-max)]\n"
+        " [-g GID or GID range (gid or min-max)]\n"
+        "\n" );
+    return (0);
+}
+
+// Parse "id" or "min-max" from buf: *min receives the first number and, when
+// a second one follows the dash, *offset receives max - min.  A lone id
+// leaves *offset untouched.
+// Returns 0 on success, -1 when max is smaller than min or when the working
+// copy cannot be allocated.
+int
+parse_range(uint *min, int *offset, char *buf)
+{
+    char *copy, *cursor, *value;
+    int range = 0;
+    int rc = -1;
+
+    debug("parse_range");
+
+    // strsep() advances its argument, so keep the start of the copy in
+    // order to release it on every path
+    copy = strdup(buf);
+    if (!copy) {
+        printf("Error: no memory available for strdup\n");
+        return -1;
+    }
+    cursor = copy;
+
+    value = strsep(&cursor, "-");
+    *min = atoi(value);
+    debug("min = %d", *min);
+    if (cursor) {
+        value = strsep(&cursor, "-");
+        range = atoi(value);
+        if (range < *min) {
+            printf("max id should be larger than min id\n");
+            goto out;
+        }
+        *offset = range - *min;
+        debug("range = %d", *offset);
+    }
+    rc = 0;
+
+out:
+    free(copy);
+    return rc;
+}
+
+/*
+ * This is where you parse your lower-case arguments.
+ * Returns the status of parse_range() for -g and -u, -1 for anything else.
+ */
+int
+benchmark_optswitch(int opt, char *optarg)
+{
+    debug("benchmark_optswitch");
+
+    switch (opt) {
+    case 'g':   // GID or GID range
+        return parse_range(&gid_min, &gid_range, optarg);
+
+    case 'u':   // UID range
+        return parse_range(&uid_min, &uid_range, optarg);
+
+    default:
+        return -1;
+    }
+}
+
+
+// Initialize all structures that will be used in benchmark():
+// every UID and GID of the configured ranges is converted to its UUID up
+// front so that the conversion is not part of the measurement.
+// Returns -1 (and releases both lists) when an id range is missing, when an
+// allocation fails or when an id cannot be converted.
+int
+benchmark_initrun(void *tsd)
+{
+    int i;
+    //tsd_t *ts = (tsd_t *)tsd;
+
+    debug("benchmark_initrun");
+
+    if (uid_min == INVALID_ID || gid_min == INVALID_ID) {
+        printf("Both -u and -g need to be specified\n");
+        return -1;
+    }
+
+    // the ranges are inclusive, hence the +1
+    u_uuid_list = calloc((size_t) uid_range + 1, sizeof(*u_uuid_list));
+    g_uuid_list = calloc((size_t) gid_range + 1, sizeof(*g_uuid_list));
+    if (!u_uuid_list || !g_uuid_list) {
+        printf("error allocating UUID lists\n");
+        goto fail;
+    }
+
+    for (i = 0; i <= uid_range; i++) {
+
+        if (mbr_uid_to_uuid(uid_min+i, u_uuid_list[i])) {
+            printf("error converting uid %d to UUID\n", uid_min+i);
+            goto fail;
+        }
+    }
+
+    for (i = 0; i <= gid_range; i++) {
+
+        if (mbr_gid_to_uuid(gid_min+i, g_uuid_list[i])) {
+            printf("error converting gid %d to UUID\n", gid_min+i);
+            goto fail;
+        }
+    }
+
+    return (0);
+
+fail:
+    free(u_uuid_list);
+    free(g_uuid_list);
+    u_uuid_list = NULL;
+    g_uuid_list = NULL;
+    return -1;
+}
+
+// Issue lm_optB membership checks between a random user UUID and a random
+// group UUID from the precomputed lists.  Every non-zero return of
+// mbr_check_membership() is counted in res->re_errors.
+int
+benchmark(void *tsd, result_t *res)
+{
+    int i, index, gindex, err, isMember=0;
+    //tsd_t *ts = (tsd_t *)tsd;
+
+    res->re_errors = 0;
+
+    // debug("in to benchmark - optB = %i", lm_optB);
+
+    for (i = 0; i < lm_optB; i++) {
+
+        index = random() % (uid_range+1);
+        gindex = random() % (gid_range+1);
+        err = mbr_check_membership(u_uuid_list[index], g_uuid_list[gindex], &isMember);
+
+        if (err) {
+            if (err == EIO) {
+                debug("mbr_check_membership returned EIO. Unable to communicate with DS daemon");
+            }
+            else if (err == ENOENT) {
+                debug("mbr_check_membership returned ENOENT. User not found");
+            }
+            else {
+                debug("error: %s", strerror(err));
+            }
+            res->re_errors++;
+        }
+    }
+    res->re_count = i;
+
+    return (0);
+}
+
+
+// We need to release all the structures we allocated in benchmark_initrun()
+int
+benchmark_finirun(void *tsd)
+{
+    //tsd_t *ts = (tsd_t *)tsd;
+
+    debug("benchmark_finirun: deallocating structures");
+
+    free(u_uuid_list);
+    free(g_uuid_list);
+    u_uuid_list = NULL;
+    g_uuid_list = NULL;
+
+    return (0);
+}
+
+// This benchmark reports no extra result text: return an empty string.
+char *
+benchmark_result()
+{
+    static char result = '\0';
+    debug("benchmark_result");
+    return (&result);
+}
+
diff --git a/tools/tests/libMicro/apple/mbr_check_service_membership.c b/tools/tests/libMicro/apple/mbr_check_service_membership.c
new file mode 100644
index 000000000..47e3267fa
--- /dev/null
+++ b/tools/tests/libMicro/apple/mbr_check_service_membership.c
@@ -0,0 +1,281 @@
+/*
+ * Copyright (c) 2006 Apple Inc. All Rights Reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include + +// add additional headers needed here. + +#include "../libmicro.h" +#include +#include +#include + +#if DEBUG +# define debug(fmt, args...) (void) fprintf(stderr, fmt "\n" , ##args) +#else +# define debug(fmt, args...) +#endif + + +// Correct use case +// +// mbr_check_service_membership -E -L -S -W -B 200 -C 10 -r 100 -s "SACL" -u user_prefix +// +// libMicro default benchmark run options are "-E -C 200 -L -S -W" +// +// -B is batch size: loop iteration per each benchmark run. Needs to match # of +// real lookups. This is total number of lookups to issue. +// -C is min sample number: how many benchmark needs to run to get proper sample +// 1 is mimumum, but you get at least 3 benchmark run +// samples. Do not set to zero. Default is 200 for most +// runs in libMicro. +// -r is the number of total records. +// -s is SACL string: ie. "ssh" +// -u user_prefix that preceeds the user number + +typedef struct { + uuid_t *uuid_list; +} tsd_t; + +// the number of record lookup to issue is covered by standard option optB +static int optRecords = 100; // the number of total records +static int optSACL = 0; // option SACL specified? 
+
+static char **sacl = NULL;
+static char *default_sacl[] = { "com.apple.access_dsproxy",
+                                "com.apple.access_screensharing",
+                                "com.apple.access_ssh",
+                                ""};
+static int numSACL = 3;     // number of SACLs
+
+
+// This will use local users (local_test_*)
+static char *default_uprefix = "local_test_";
+
+/*
+ * Declare this benchmark's private options (-r, -s, -u), its per-worker
+ * state size and its default batch size by filling in the lm_* variables
+ * (declared outside this file, presumably in ../libmicro.h).
+ */
+int
+benchmark_init()
+{
+    debug("benchmark_init");
+    (void) sprintf(lm_optstr, "r:s:u:");
+
+    lm_tsdsize = sizeof(tsd_t);
+    lm_defB = 100;
+
+    (void) sprintf(lm_usage,
+        "\n ------- mbr_check_service_membership specific options (default: *)\n"
+        " [-r total number of records (100*)]\n"
+        " [-s SACL]\n"
+        " [-u user_prefix]\n"
+        "\n" );
+    return (0);
+}
+
+/*
+ * This is where you parse your lower-case arguments.
+ *
+ * opt    - option letter: 'r', 's' or 'u'
+ * optarg - the argument text that came with the option
+ *
+ * Returns 0 when the option was accepted and -1 for an unknown option, a
+ * non-positive record count, or an allocation failure.
+ */
+int
+benchmark_optswitch(int opt, char *optarg)
+{
+    int records;
+    char *copy;
+
+    debug("benchmark_optswitch");
+
+    switch (opt) {
+    case 'r':    // total number of records. default is 100
+        records = atoi(optarg);
+        if (records <= 0) {
+            // the count sizes the uuid array, so it must be positive
+            printf("Error: -r needs a positive number of records\n");
+            return -1;
+        }
+        optRecords = records;
+        debug("optRecords = %d\n", optRecords);
+        break;
+
+    case 's':    // SACL
+        if (optSACL) {
+            printf("SACL already specified. Skipping\n");
+            break;
+        }
+        // two slots: the requested SACL and the "" end marker that
+        // benchmark_initrun() stops on
+        sacl = malloc(2 * sizeof(char *));
+        if (!sacl) {
+            printf("Error: no memory available for the SACL list\n");
+            return -1;
+        }
+        sacl[0] = strdup(optarg);
+        if (!sacl[0]) {
+            printf("Error: no memory available for strdup\n");
+            free(sacl);
+            sacl = NULL;    // leaves the default_sacl fallback intact
+            return -1;
+        }
+        sacl[1] = "";
+        optSACL = 1;
+        numSACL = 1;
+
+        break;
+
+    case 'u':
+        // only replace the default prefix once the copy succeeded
+        copy = strdup(optarg);
+        if (!copy) {
+            printf("Error: no memory available for strdup\n");
+            return -1;
+        }
+        default_uprefix = copy;
+        debug("default_uprefix = %s\n", default_uprefix);
+        break;
+
+    default:
+        return -1;
+    }
+
+    return 0;
+}
+
+
+/*
+ * Pick the SACL list for the run: the built-in default_sacl table unless -s
+ * supplied one. DEBUG builds list the selected names.
+ */
+int
+benchmark_initrun()
+{
+    int i;
+    debug("benchmark_initrun");
+
+    if (!sacl) {
+        sacl = default_sacl;
+    }
+
+    // test the index before looking at sacl[i]; the "" end marker sits at
+    // index numSACL
+    for (i = 0; i < numSACL && strcmp(sacl[i], "") != 0; i++) {
+        debug("SACL = %s", sacl[i]);
+    }
+
+    return (0);
+}
+
+// Initialize all structures that will be used in benchmark()
+// 1. make local or network node for OD query
+// 2. 
create user key
+/*
+ * Per-worker setup: resolve the users <prefix>1 .. <prefix>optRecords to
+ * UUIDs up front so benchmark() only times the membership call.
+ *
+ * tsd - this worker's tsd_t; its uuid_list is allocated here and freed in
+ *       benchmark_finiworker()
+ *
+ * Returns 0. Exits the process when memory runs out, a user name does not
+ * fit the buffer, or a user cannot be found.
+ */
+int
+benchmark_initworker(void *tsd)
+{
+    int i;
+    int len;
+    tsd_t *ts = (tsd_t *)tsd;
+    char *uprefix = default_uprefix;    // local user is default
+    char username[256] = "";
+    struct passwd *info = NULL;
+
+    debug("benchmark_initworker");
+
+    // create an array of usernames to use in benchmark before their use
+    // realtime generation in benchmark effects performance measurements
+
+    ts->uuid_list = calloc(optRecords, sizeof(uuid_t));
+    if (!ts->uuid_list && optRecords > 0) {
+        fprintf(stderr, "error: no memory for %d uuids\n", optRecords);
+        exit (1);
+    }
+
+    for (i = 0; i < optRecords; i++) {
+
+        // the prefix comes from the command line (-u), so bound the write
+        len = snprintf(username, sizeof(username), "%s%d", uprefix, i+1);
+        if (len < 0 || (size_t)len >= sizeof(username)) {
+            fprintf(stderr, "error: user name %s%d is too long\n", uprefix, i+1);
+            exit (1);
+        }
+
+        info = getpwnam(username);
+        if (!info) {
+            // reported in every build; debug() is silent unless DEBUG is set
+            fprintf(stderr, "error converting username %s to uuid\n", username);
+            exit (1);
+        }
+
+        (void) mbr_uid_to_uuid(info->pw_uid, ts->uuid_list[i]);
+
+#if DEBUG
+        char buf[37];    // uuid_unparse() writes 36 characters plus the NUL
+        uid_t uid;
+        int id_type;
+        uuid_unparse(ts->uuid_list[i], buf);
+        mbr_uuid_to_id(ts->uuid_list[i], &uid, &id_type);
+        debug ("username (%s), uid %d, uuid %s, back to uid %d", username, info->pw_uid, buf, uid);
+#endif
+    }
+
+    // if the batch size (one benchmark run) is larger than the number of
+    // records, shrink it so benchmark() never indexes past uuid_list
+    if (optRecords < lm_optB) {
+        lm_optB = optRecords;
+        debug("Reducing batch size to %d to match the record #\n", lm_optB);
+    }
+
+    debug("benchmark_initworker");
+    return (0);
+}
+
+/*
+ * One batch: for each of the first lm_optB users, check membership in a
+ * randomly chosen SACL.
+ *
+ * tsd - this worker's tsd_t, filled in by benchmark_initworker()
+ * res - receives the number of lookups (re_count) and of failed calls
+ *       (re_errors)
+ *
+ * Returns 0.
+ */
+int
+benchmark(void *tsd, result_t *res)
+{
+    tsd_t *ts = (tsd_t *)tsd;
+    int i;
+    int err;
+    int isMember=0;
+    char *sacl_chosen;
+
+    // "#if" (not "#ifdef") so this agrees with the debug() macro's own test
+#if DEBUG
+    uid_t uid;
+    int id_type;
+#endif
+
+    res->re_errors = 0;
+
+    debug("in to benchmark - optB = %i", lm_optB);
+    for (i = 0; i < lm_optB; i++) {
+
+        sacl_chosen = sacl[random() % numSACL];
+        err = mbr_check_service_membership(ts->uuid_list[i], sacl_chosen, &isMember);
+
+#if DEBUG
+        mbr_uuid_to_id(ts->uuid_list[i], &uid, &id_type);
+        debug ("loop %d: uid %d is %s a member of %s", i, uid, (isMember) ? "" : "not", sacl_chosen);
+#endif
+
+        if (err) {
+            debug("error: %s", strerror(err));
+            res->re_errors++;
+        }
+    }
+    res->re_count = i;
+
+    return (0);
+}
+
+
+// We need to release all the structures we allocated in benchmark_initworker()
+int
+benchmark_finiworker(void *tsd)
+{
+    tsd_t *ts = (tsd_t *)tsd;
+    debug("benchmark_finiworker: deallocating structures");
+
+    free(ts->uuid_list);
+    ts->uuid_list = NULL;
+
+    return (0);
+}
+
+/*
+ * Release the SACL list built for -s. The default list is a static table
+ * and is never freed.
+ */
+int
+benchmark_finirun(void *tsd)
+{
+    if (optSACL && sacl) {
+        free(sacl[0]);    // the strdup()ed SACL name; sacl[1] is a literal
+        free(sacl);
+        sacl = NULL;
+    }
+
+    return 0;
+}
+
+// This benchmark has no extra result text; an empty string is returned.
+char *
+benchmark_result()
+{
+    static char result = '\0';
+    debug("benchmark_result");
+    return (&result);
+}
+
diff --git a/tools/tests/libMicro/apple/od_query_create_with_node.c b/tools/tests/libMicro/apple/od_query_create_with_node.c new file mode 100644 index 000000000..7cf18ce6d --- /dev/null +++ b/tools/tests/libMicro/apple/od_query_create_with_node.c @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2006 Apple Inc. All Rights Reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+// NOTE(review): the bare #include lines in this file carry no header name in
+// this copy of the patch; restore the names from the original source.
+#include
+#include
+#include
+#include
+#include
+
+// add additional headers needed here.
+
+#include "../libmicro.h"
+#include
+#include
+#include
+#include
+#include
+
+// debug() prints to stderr only when DEBUG is non-zero. Unlike the disabled
+// variant kept below it does not append a newline, so callers supply "\n".
+#if DEBUG
+# define debug(fmt, args...) (void) fprintf(stderr, fmt , ##args)
+// # define debug(fmt, args...) (void) fprintf(stderr, fmt "\n" , ##args)
+#else
+# define debug(fmt, args...)
+#endif
+
+
+// Correct use case
+//
+// od_query_create_with_node -E -L -S -W -B 200 -C 10 -c 100 -r 300
+//
+// libMicro default benchmark run options are "-E -C 200 -L -S -W"
+//
+// -B is batch size: loop iterations per benchmark run. Needs to match the
+//    number of real lookups. This is the total number of lookups to issue.
+// -C is min sample number: how many benchmark runs are needed to get a
+//    proper sample. 1 is the minimum, but you get at least 3 benchmark run
+//    samples. Do not set to zero. Default is 200 for most
+//    runs in libMicro.
+// -r is the number of total records.
+// -c is the cache hit rate for lookup. set to 10%, you need -c 10.
+//    i.e. -B 100 -c 50 -r 1000 -C 200 (out of 1000 records, I want 50%
+//    lookup, and batch size is 100).
+//    To get 50% cache hit rate, you need 500 record lookups.
+//    Batch size will be adjusted to 500 to get 500 record
+//    lookup in each benchmark. If -r size is smaller than -B,
+//    then -B will not be adjusted.
+
+// Defining prefix for user and group name
+// make sure that these match the ones in LDAP records
+// ie. 
local_test_1 , od_test_4525, od_test_group_43, od_test_host_63
+#define LOCAL_U_PREFIX  CFSTR("local_test_")
+#define OD_U_PREFIX     CFSTR("od_test_")
+#define LOCAL_G_PREFIX  CFSTR("local_test_group_")
+#define OD_G_PREFIX     CFSTR("od_test_group_")
+#define LOCAL_H_PREFIX  CFSTR("local_test_host_")
+#define OD_H_PREFIX     CFSTR("od_test_host_")
+
+/*
+ * Your state variables should live in the tsd_t struct below
+ */
+typedef struct {
+    ODNodeRef node;
+} tsd_t;
+
+// dsRecTypeStandard type dictionary
+enum {rectype_users=0, rectype_groups, rectype_hosts};
+CFStringRef rectype_dict[] = { CFSTR(kDSStdRecordTypeUsers),
+                               CFSTR(kDSStdRecordTypeGroups),
+                               CFSTR(kDSStdRecordTypeHosts) };
+
+// the number of record lookup to issue is covered by standard option optB
+static int optRecords = 100;        // the number of total records
+static int optCachehit = 100;       // specify cache hit rate (% of record re-lookup)
+static bool optNodeLocal = 1;       // which node to search. Local node is default; -n selects a named node
+static int optType = rectype_users; // dsRecType to search for. "Users" is the default
+static const char *nodename = "/LDAPv3/127.0.0.1";
+
+static CFStringRef *key;            // username array
+
+// parse -t option and return enum type: user, group, and host
+// called by benchmark_optswitch()
+// returns -1 when name is none of "u", "g", "h" (compared case-insensitively)
+int
+ds_rec_type(char *name)
+{
+    if (strcasecmp("u", name) == 0) {
+        return (rectype_users);
+    } else if (strcasecmp("g", name) == 0) {
+        return (rectype_groups);
+    } else if (strcasecmp("h", name) == 0) {
+        return (rectype_hosts);
+    }
+
+    return (-1);
+}
+
+/*
+ * Declare this benchmark's private options (-c, -n, -r, -t), its per-worker
+ * state size and its default batch size by filling in the lm_* variables
+ * (declared outside this file, presumably in ../libmicro.h).
+ */
+int
+benchmark_init()
+{
+    debug("benchmark_init");
+    (void) sprintf(lm_optstr, "c:n:r:t:");
+
+    lm_tsdsize = sizeof (tsd_t);
+    lm_defB = 1000;
+
+    (void) sprintf(lm_usage,
+        "\n ------- od_query_create_with_node specific options (default: *)\n"
+        " [-c hitrate%% (100%%*)]\n"
+        " [-r total number of records (100*)]\n"
+        " [-n nodename] node name to use for test\n"
+        " [-t record type: 'u'sers, 'g'roups, 'h'osts]\n"
+        " use -B option to specify total number of record lookups to issue"
+        "\n" );
+    return (0);
+}
+
+/*
+ * This is where you parse your lower-case arguments.
+ *
+ * Returns 0 when the option was accepted and -1 for an unknown option or an
+ * out-of-range value.
+ */
+int
+benchmark_optswitch(int opt, char *optarg)
+{
+    debug("benchmark_optswitch");
+
+    switch (opt) {
+    case 'c':    // cache hit rate. 100% means lookup the same records over and over
+        optCachehit = atoi(optarg);
+        debug("optCachehit = %d\n", optCachehit);
+        if (optCachehit > 100 || optCachehit < 0) {
+            printf("cache hit rate should be in between 0%% and 100%%\n");
+            return (-1);
+        }
+        break;
+
+    case 'r':    // total number of records. default is 100
+        optRecords = atoi(optarg);
+        debug("optRecords = %d\n", optRecords);
+        break;
+
+    case 'n':    // node
+        nodename = optarg;
+        // A node was named, so query it instead of the local node. Without
+        // this the name was stored but never used, because optNodeLocal
+        // stayed true.
+        optNodeLocal = 0;
+        break;
+
+    case 't':    // dsRecType: user, group, hosts
+        optType = ds_rec_type(optarg);
+        debug("optType = %d\n", optType);
+
+        if (optType == -1) {
+            printf("wrong -t record type option\n");
+            return (-1);
+        }
+        break;
+
+    default:
+        return (-1);
+    }
+
+    return (0);
+}
+
+
+/*
+ * Run-wide setup: derive the number of distinct records from the cache hit
+ * rate, grow the batch size to cover them, and pre-build the record names
+ * <prefix>1 .. <prefix>optRecords in key[] so name formatting is not timed.
+ *
+ * Returns 0; exits the process when the key array cannot be built.
+ */
+int
+benchmark_initrun()
+{
+    int i;
+    CFStringRef prefix;    // local user is default
+
+    debug("benchmark_initrun\n");
+
+    // Adjust # of record lookups to reflect cache hit rate
+    if (optCachehit < 100) {
+        optRecords = (int) ((float) optRecords * ((float) optCachehit / 100));
+        debug("# of records adjusted to %d for cache hit rate %d%%\n", optRecords, optCachehit);
+    }
+
+    // benchmark() indexes key[i % optRecords]: keep at least one record so
+    // "-c 0", a small -r or a bad -r value cannot divide by zero
+    if (optRecords < 1) {
+        optRecords = 1;
+        debug("# of records raised to %d\n", optRecords);
+    }
+
+    // if batch size (one benchmark run) is less than the number records, adjust
+    // it to match the number record lookups in one batch run
+    if (lm_optB < optRecords) {
+        lm_optB = optRecords;
+        debug("Adjusting batch size to %d to match the lookups required in benchmark run\n", lm_optB);
+    }
+
+    switch (optType) {
+    case rectype_groups:
+        prefix = (optNodeLocal) ? LOCAL_G_PREFIX : OD_G_PREFIX;
+        break;
+    case rectype_hosts:
+        prefix = (optNodeLocal) ? LOCAL_H_PREFIX : OD_H_PREFIX;
+        break;
+    case rectype_users:
+    default:    // benchmark_optswitch() rejects other values; never leave prefix unset
+        prefix = (optNodeLocal) ? LOCAL_U_PREFIX : OD_U_PREFIX;
+        break;
+    }
+    // create an array of usernames to use in benchmark before their use
+    // realtime generation in benchmark effects performance measurements
+
+    key = malloc(sizeof(CFStringRef) * optRecords);
+    if (!key) {
+        fprintf(stderr, "error: no memory for %d record names\n", optRecords);
+        exit(1);
+    }
+
+    // user, group, hosts key to lookup: the name is built the same way for
+    // every record type, only the prefix differs
+    for (i = 0; i < optRecords; i++) {
+        key[i] = CFStringCreateWithFormat( kCFAllocatorDefault,
+                                           NULL,
+                                           CFSTR("%@%d"),
+                                           prefix,
+                                           i+1);
+        if (!key[i]) {
+            // benchmark_finirun() passes every entry to CFRelease()
+            fprintf(stderr, "error: cannot create record name %d\n", i+1);
+            exit(1);
+        }
+        // CFShow(key[i]); // print user name to check
+    }
+
+    return (0);
+}
+
+
+// Initialize all structures that will be used in benchmark()
+// 1. make local or network node for OD query
+// 2. create user key
+int
+benchmark_initworker(void *tsd)
+{
+    CFErrorRef error = NULL;
+    tsd_t *ts = (tsd_t *)tsd;
+
+    debug("benchmark_initworker: %s", (optNodeLocal) ? "local" : "network");
+
+
+    // create OD node for local or OD query
+    if (optNodeLocal) {
+        ts->node = ODNodeCreateWithNodeType(NULL, kODSessionDefault, kODNodeTypeLocalNodes, &error);
+    }
+    else {
+        CFStringRef nodenameStr = CFStringCreateWithCString(kCFAllocatorDefault, nodename, kCFStringEncodingUTF8);
+        ts->node = ODNodeCreateWithName(NULL, kODSessionDefault, nodenameStr, &error);
+        CFRelease(nodenameStr);
+    }
+
+    if (!ts->node) {
+        // reported in every build; debug() is silent unless DEBUG is set
+        fprintf(stderr, "error creating OD node %s\n", (optNodeLocal) ? "(local)" : nodename);
+        exit(1);
+    }
+
+    // No CFRetain here: the node came from a Create function, so this worker
+    // already owns the one reference that benchmark_finiworker() releases.
+    // The extra retain that used to be here was never balanced.
+
+    debug("benchmark_initworker: ODNodeRef = %p\n", (void *)ts->node);
+    return (0);
+}
+
+/*
+ * One batch: issue lm_optB record-name queries against this worker's node,
+ * cycling through the optRecords pre-built names.
+ *
+ * tsd - this worker's tsd_t, filled in by benchmark_initworker()
+ * res - receives the number of queries (re_count) and of failed ones
+ *       (re_errors)
+ *
+ * Returns 0.
+ */
+int
+benchmark(void *tsd, result_t *res)
+{
+
+    tsd_t *ts = (tsd_t *)tsd;
+    int i;
+    ODNodeRef node;
+    CFErrorRef error = NULL;
+    CFArrayRef results;
+    ODQueryRef query;
+
+    res->re_errors = 0;
+    node = ts->node;
+
+    debug("in to benchmark - optB = %i, node = %p \n", lm_optB, (void *)node);
+    for (i = 0; i < lm_optB; i++) {
+
+        debug("loop %d: querying\n", i);
+        query = ODQueryCreateWithNode(NULL,
+                        node,                        // inNode
+                        rectype_dict[optType],       // inRecordTypeOrList
+                        CFSTR(kDSNAttrRecordName),   // inAttribute
+                        kODMatchInsensitiveEqualTo,  // inMatchType
+                        key[i % optRecords],         // inQueryValueOrList
+                        NULL,                        // inReturnAttributeOrList
+                        1,                           // inMaxResults
+                        &error);
+
+        if (query) {
+            // we do not want to factually fetch the result in benchmark run
+            // debug("loop %d: calling ODQueryCopyResults\n", i);
+            results = ODQueryCopyResults(query, FALSE, &error);
+            CFRelease(query);
+            if (results) {
+#if DEBUG
+                int c;
+                c = CFArrayGetCount(results);
+                if (c > 0) {
+                    debug("Successful run: %d results, ", c);
+                }
+                else {
+                    debug("no result for ");
+                }
+                CFShow (key[i % optRecords]);
+                debug("\n");
+#endif
+                CFRelease(results);
+            }
+            else {
+                debug("loop %d: ODQueryCopyResults returned empty result for ", i);
+                res->re_errors++;
+                CFShow (key[i % optRecords]);
+                debug("\n");
+            } // if (results)
+
+        } // if (query)
+        else {
+            res->re_errors++;
+        }
+    }
+    res->re_count = i;
+
+    return (0);
+}
+
+
+// We need to release all the structures we allocated in benchmark_initworker()
+int
+benchmark_finiworker(void *tsd)
+{
+    tsd_t *ts = (tsd_t *)tsd;
+
+    debug("benchmark_finiworker: deallocating structures\n");
+
+    // free the node
+    if (ts->node)
+        CFRelease (ts->node);
+    ts->node = NULL;
+
+    return (0);
+}
+
+// Release the record names built by benchmark_initrun().
+int
+benchmark_finirun()
+{
+    int i;
+
+    if (key) {
+        for (i = 0; i < optRecords; i++){
+            CFRelease(key[i]);
+        }
+
+        free(key);
+        key = NULL;
+    }
+
+    return (0);
+}
+
+// This benchmark has no extra result text; an empty string is returned.
+char *
+benchmark_result()
+{
+    static char result = '\0';
+    debug("\n\n# of records adjusted to %d for cache hit rate %d%%\n", optRecords, optCachehit);
+    debug("benchmark_result\n");
+    return (&result);
+}
+
diff --git a/tools/tests/libMicro/bench.sh b/tools/tests/libMicro/bench.sh index 698557eb6..ea890e55a 100644 --- a/tools/tests/libMicro/bench.sh +++ b/tools/tests/libMicro/bench.sh @@ -30,6 +30,41 @@ # Use is subject to license terms. # + +# usage function - defines all the options that can be given to this script. 
+function usage { + echo "Usage" + echo "$0 [-l] [-h] [name of test]" + echo "-l : This option runs the lmbench tests along with the default libmicro tests." + echo "-h : Help. This option displays information on how to run the script. " + echo "[name of test] : This option runs only the test that is specified" + echo "" + echo "Examples" + echo "$0 : This is the defualt execution. This will run only the default libmicro tests." + echo "$0 -l : This will run the lmbench tests too " + echo "$0 getppid : This will run only the getppid tests" + exit + +} + +if [ $# -eq 1 ] +then + lmbench=2 # to check if only a single test is to be run. e.g, ./bench.sh getppid +else + lmbench=0 # to run the default libMicro tests, without the lmbench tests. +fi + +while getopts "lh" OPT_LIST +do + case $OPT_LIST in + l) lmbench=1;; # to run the libmicro tests including the lmbench tests. + h) usage;; + *) usage;; + esac +done + + + tattle="./tattle" bench_version=0.4.0 @@ -121,6 +156,7 @@ printf "!Machine_name: %30s\n" "$hostname" printf "!OS_name: %30s\n" `uname -s` printf "!OS_release: %30s\n" `sw_vers -productVersion` printf "!OS_build: %30.18s\n" "`sw_vers -buildVersion`" +printf "!Kernel: %30.50s\n" "`uname -v|cut -d ' ' -f 11`" printf "!Processor: %30s\n" `arch` printf "!#CPUs: %30s\n" $p_count printf "!CPU_MHz: %30s\n" "$p_mhz" @@ -174,10 +210,24 @@ do ;; *$1*) + # Default execution without the lmbench tests. + # checks if there is no argument passed by the user. 
+ if [ $lmbench -eq 0 ] + then + string=lmbench + if [ "${A:0:7}" == "$string" ] + then + continue + fi + fi + ;; - - *) - continue + + *) + if [ $lmbench -ne 1 ] + then + continue + fi ;; esac diff --git a/tools/tests/libMicro/benchDS.sh b/tools/tests/libMicro/benchDS.sh new file mode 100644 index 000000000..26a2bc562 --- /dev/null +++ b/tools/tests/libMicro/benchDS.sh @@ -0,0 +1,324 @@
+#!/bin/sh
+#
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms
+# of the Common Development and Distribution License
+# (the "License"). You may not use this file except
+# in compliance with the License.
+#
+# You can obtain a copy of the license at
+# src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing
+# permissions and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL
+# HEADER in each file and include the License file at
+# usr/src/OPENSOLARIS.LICENSE. If applicable,
+# add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your
+# own identifying information: Portions Copyright [yyyy]
+# [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
+# usage - print the command-line help and exit with status 1.
+function usage {
+    echo "Usage"
+    echo "$0 [-l] [-h] <#-of-users> nodename [test match pattern]"
+    echo "-l : disable libinfo L1 cache"
+    echo "-h : Help. This option displays information on how to run the script. "
+    echo "[test match pattern] : This option runs only the test that is specified"
+    echo
+    echo "You must have set up users, groups, and SACLs with od_account_create"
+    echo "with the same number of user accounts."
+    echo "Supply a pattern to match to run a subset of tests"
+    exit 1
+}
+
+# default to libinfo cache enabled
+L1CACHE="1"
+
+while getopts "lh" OPT_LIST
+do
+    case $OPT_LIST in
+        l) L1CACHE="0";; # to run the libmicro tests with l1cache disabled
+        h) usage;;
+        *) usage;;
+    esac
+done
+
+shift `expr $OPTIND - 1`
+
+# after the options: <#-of-users> nodename [test match pattern]
+if [ $# -lt 2 -o $# -gt 3 ]; then
+    usage
+fi
+
+tattle="./tattle"
+
+bench_version=0.4.0
+libmicro_version=`$tattle -V`
+
+case $libmicro_version in
+$bench_version)
+    ;;
+*)
+    echo "ERROR: libMicro version doesn't match 'bench' script version"
+    exit 1
+esac
+
+TMPROOT=/private/tmp/libmicro.$$
+VARROOT=/private/var/tmp/libmicro.$$
+mkdir -p $TMPROOT
+mkdir -p $VARROOT
+trap "rm -rf $TMPROOT $VARROOT && exit" 0 2
+
+TFILE=$TMPROOT/data
+IFILE=$TMPROOT/ifile
+TDIR1=$TMPROOT/0/1/2/3/4/5/6/7/8/9
+TDIR2=$TMPROOT/1/2/3/4/5/6/7/8/9/0
+VFILE=$VARROOT/data
+VDIR1=$VARROOT/0/1/2/3/4/5/6/7/8/9
+VDIR2=$VARROOT/1/2/3/4/5/6/7/8/9/0
+
+OPTS="-E -C 200 -L -S -W"
+
+dd if=/dev/zero of=$TFILE bs=1024k count=10 2>/dev/null
+dd if=/dev/zero of=$VFILE bs=1024k count=10 2>/dev/null
+mkdir -p $TDIR1 $TDIR2
+mkdir -p $VDIR1 $VDIR2
+
+touch $IFILE
+/usr/bin/touch /private/var/tmp/lmbench
+
+
+# produce benchmark header for easier comparisons
+
+hostname=`uname -n`
+
+if [ -f /usr/sbin/psrinfo ]; then
+    p_count=`psrinfo|wc -l`
+    p_mhz=`psrinfo -v | awk '/operates/{print $6 "MHz"; exit }'`
+    p_type=`psrinfo -vp 2>/dev/null | awk '{if (NR == 3) {print $0; exit}}'`
+    p_ipaddr=`getent hosts $hostname | awk '{print $1}'`
+fi
+
+if [ -f /proc/cpuinfo ]; then
+    p_count=`egrep processor /proc/cpuinfo | wc -l`
+    p_mhz=`awk -F: '/cpu MHz/{printf("%5.0f00Mhz\n",$2/100); exit}' /proc/cpuinfo`
+    p_type=`awk -F: '/model name/{print $2; exit}' /proc/cpuinfo`
+    p_ipaddr=`getent hosts $hostname | awk '{print $1}'`
+else
+## Mac OS X specific stuff
+# first, get ugly output, in case pretty output isn't available
+#
+    p_count=`sysctl -n hw.physicalcpu`
+    p_mhz=`sysctl -n hw.cpufrequency`
+    p_type=`sysctl -n hw.model`
+
+if [ -x /usr/sbin/system_profiler ]; then
+    # requires this hunk of work-around
+    # grep the XML for the characteristic we need. The key appears twice, so grep for the useful key (with 'string')
+    # use sed to keep only the text between <string> and </string>, which also
+    # drops the tabs in front of the string. So much work for so little result.
+    #
+    p_mhz=`system_profiler -xml -detailLevel mini SPHardwareDataType | \
+        grep -A1 current_processor_speed | grep string | \
+        sed -E 's/.*<string>(.+)<\/string>.*/\1/'`
+    p_type=`system_profiler -xml -detailLevel mini SPHardwareDataType | \
+        grep -A1 cpu_type | grep string | \
+        sed -E 's/.*<string>(.+)<\/string>.*/\1/'`
+fi
+
+# look for en0 (usually ethernet) if that isn't there try en1 (usually wireless) else give up
+    p_ipaddr=`ipconfig getpacket en0 | grep yiaddr | tr "= " "\n" | grep '[0-9]'`
+    if [ -z "$p_ipaddr" ]; then
+        p_ipaddr=`ipconfig getpacket en1 | grep yiaddr | tr "= " "\n" | grep '[0-9]'`
+    fi
+    # a separate test: as an "elif" of the test above, this fallback could
+    # never run
+    if [ -z "$p_ipaddr" ]; then
+        p_ipaddr="unknown"
+    fi
+fi
+
+printf "\n\n!Libmicro_#: %30s\n" $libmicro_version
+printf "!Options: %30s\n" "$OPTS"
+printf "!Machine_name: %30s\n" "$hostname"
+printf "!OS_name: %30s\n" `uname -s`
+printf "!OS_release: %30s\n" `sw_vers -productVersion`
+printf "!OS_build: %30.18s\n" "`sw_vers -buildVersion`"
+printf "!Processor: %30s\n" `arch`
+printf "!#CPUs: %30s\n" $p_count
+printf "!CPU_MHz: %30s\n" "$p_mhz"
+printf "!CPU_NAME: %30s\n" "$p_type"
+printf "!IP_address: %30s\n" "$p_ipaddr"
+printf "!Run_by: %30s\n" $LOGNAME
+printf "!Date: %30s\n" "`date '+%D %R'`"
+printf "!Compiler: %30s\n" `$tattle -c`
+printf "!Compiler Ver.:%30s\n" "`$tattle -v`"
+printf "!sizeof(long): %30s\n" `$tattle -s`
+printf "!extra_CFLAGS: %30s\n" "`$tattle -f`"
+printf "!TimerRes: %30s\n\n\n" "`$tattle -r`"
+
+bin_dir="$TMPROOT/bin"
+
+mkdir -p $bin_dir
+# $A is not set yet at this point, so both helpers land directly in $bin_dir
+cp bin-*/exec_bin $bin_dir/$A
+
+cp ./apple/bin-*/posix_spawn_bin $bin_dir/$A
+
+newline=0
+
+# We commonly want to adjust this script for the number of users
+# and configuration of the accounts and configuration being tested.
+#
+# Users:
+NUSERS=$1
+NODENAME=$2
+UID_BASE=5000
+UID_END=`expr $UID_BASE + $NUSERS - 1`
+USER_PREFIX=od_test_
+#
+# Groups:
+GID_ALL_USERS=1211
+GID_NO_USERS=1212
+GROUP_BASE=od_test_group
+#
+# getaddrinfo on hosts:
+HOST_BASE=sfs0
+HOST_RANGE=1-8
+
+#
+# Everything below the while loop is input for the while loop
+# if you have any tests which can't run in the while loop, put
+# them above this comment
+#
+while read A B
+do
+    # $A contains the command, $B contains the arguments
+    # we echo blank lines and comments
+    # we skip anything which fails to match *$3* (useful
+    # if we only want to test one case, but a nasty hack)
+
+    case $A in
+    \#*)
+        echo "$A $B"
+        newline=1
+        continue
+        ;;
+
+    "")
+        if [ $newline -eq 1 ]
+        then
+            newline=0
+            echo
+            echo
+        fi
+
+        continue
+        ;;
+
+    *$3*)
+        ;;
+
+    *)
+        continue
+        ;;
+    esac
+
+    if [ ! -f $bin_dir/$A ]
+    then
+        cp bin-*/$A $bin_dir/$A
+    fi
+
+    echo
+
+    (cd $TMPROOT && eval "bin/$A $B")
+
+    echo
+    echo
+done <<.
+
+# -P <# procs>
+# -T <# threads> - exclusive!
+
+# mbr_check_service_membership()
+mbr_check_service_membership $OPTS -N "mbr_check_service_membership" -s libMicro -u ${USER_PREFIX} -r ${NUSERS}
+mbr_check_service_membership $OPTS -N "mbr_check_service_membership_t2" -T 2 -s libMicro -u ${USER_PREFIX} -r ${NUSERS}
+mbr_check_service_membership $OPTS -N "mbr_check_service_membership_t4" -T 4 -s libMicro -u ${USER_PREFIX} -r ${NUSERS}
+mbr_check_service_membership $OPTS -N "mbr_check_service_membership_p2" -P 2 -s libMicro -u ${USER_PREFIX} -r ${NUSERS}
+mbr_check_service_membership $OPTS -N "mbr_check_service_membership_p4" -P 4 -s libMicro -u ${USER_PREFIX} -r ${NUSERS}
+
+# getpwnam()
+getpwnam $OPTS -N "getpwnam" -l ${L1CACHE} -r ${NUSERS} -u ${USER_PREFIX}
+getpwnam $OPTS -N "getpwnam_t2" -T 2 -l ${L1CACHE} -r ${NUSERS} -u ${USER_PREFIX}
+getpwnam $OPTS -N "getpwnam_p2" -P 2 -l ${L1CACHE} -r ${NUSERS} -u ${USER_PREFIX}
+
+# mbr_check_membership()
+mbr_check_membership $OPTS -N "mbr_check_membership" -u ${UID_BASE}-${UID_END} -g ${GID_ALL_USERS}-${GID_NO_USERS}
+mbr_check_membership $OPTS -N "mbr_check_membership_t2" -u ${UID_BASE}-${UID_END} -g ${GID_ALL_USERS}-${GID_NO_USERS} -T 2
+mbr_check_membership $OPTS -N "mbr_check_membership_t4" -u ${UID_BASE}-${UID_END} -g ${GID_ALL_USERS}-${GID_NO_USERS} -T 4
+mbr_check_membership $OPTS -N "mbr_check_membership_p2" -u ${UID_BASE}-${UID_END} -g ${GID_ALL_USERS}-${GID_NO_USERS} -P 2
+mbr_check_membership $OPTS -N "mbr_check_membership_p4" -u ${UID_BASE}-${UID_END} -g ${GID_ALL_USERS}-${GID_NO_USERS} -P 4
+
+# getpwuid()
+getpwuid $OPTS -N "getpwuid" -l ${L1CACHE} -u ${UID_BASE}-${UID_END}
+getpwuid $OPTS -N "getpwuid_t2" -l ${L1CACHE} -u ${UID_BASE}-${UID_END} -T 2
+getpwuid $OPTS -N "getpwuid_t4" -l ${L1CACHE} -u ${UID_BASE}-${UID_END} -T 4
+getpwuid $OPTS -N "getpwuid_p2" -l ${L1CACHE} -u ${UID_BASE}-${UID_END} -P 2
+getpwuid $OPTS -N "getpwuid_p4" -l ${L1CACHE} -u ${UID_BASE}-${UID_END} -P 4
+
+# getgrgid()
+getgrgid $OPTS -N "getgrgid" -l ${L1CACHE} -g ${GID_ALL_USERS}-${GID_NO_USERS}
+getgrgid $OPTS -N "getgrgid_t2" -l ${L1CACHE} -g ${GID_ALL_USERS}-${GID_NO_USERS} -T 2
+getgrgid $OPTS -N "getgrgid_t4" -l ${L1CACHE} -g ${GID_ALL_USERS}-${GID_NO_USERS} -T 4
+getgrgid $OPTS -N "getgrgid_p2" -l ${L1CACHE} -g ${GID_ALL_USERS}-${GID_NO_USERS} -P 2
+getgrgid $OPTS -N "getgrgid_p4" -l ${L1CACHE} -g ${GID_ALL_USERS}-${GID_NO_USERS} -P 4
+
+# getpwent()
+getpwent $OPTS -N "getpwent" -l ${L1CACHE}
+getpwent $OPTS -N "getpwent_t2" -l ${L1CACHE} -T 2
+getpwent $OPTS -N "getpwent_t4" -l ${L1CACHE} -T 4
+getpwent $OPTS -N "getpwent_p2" -l ${L1CACHE} -P 2
+getpwent $OPTS -N "getpwent_p4" -l ${L1CACHE} -P 4
+
+# getgrent()
+getgrent $OPTS -N "getgrent" -l ${L1CACHE}
+getgrent $OPTS -N "getgrent_t2" -l ${L1CACHE} -T 2
+getgrent $OPTS -N "getgrent_t4" -l ${L1CACHE} -T 4
+getgrent $OPTS -N "getgrent_p2" -l ${L1CACHE} -P 2
+getgrent $OPTS -N "getgrent_p4" -l ${L1CACHE} -P 4
+
+# getaddrinfo() host
+#getaddrinfo_host $OPTS -N "getaddrinfo_host" -r ${HOST_RANGE} -h ${HOST_BASE}%d
+#getaddrinfo_host $OPTS -N "getaddrinfo_host_t2" -r ${HOST_RANGE} -h ${HOST_BASE}%d -T 2
+#getaddrinfo_host $OPTS -N "getaddrinfo_host_t4" -r ${HOST_RANGE} -h ${HOST_BASE}%d -T 4
+#getaddrinfo_host $OPTS -N "getaddrinfo_host_p2" -r ${HOST_RANGE} -h ${HOST_BASE}%d -P 2
+#getaddrinfo_host $OPTS -N "getaddrinfo_host_p4" -r ${HOST_RANGE} -h ${HOST_BASE}%d -P 4
+
+# getaddrinfo() port
+getaddrinfo_port $OPTS -N "getaddrinfo_port" -l ${L1CACHE}
+getaddrinfo_port $OPTS -N "getaddrinfo_port_t2" -l ${L1CACHE} -T 2
+getaddrinfo_port $OPTS -N "getaddrinfo_port_t4" -l ${L1CACHE} -T 4
+getaddrinfo_port $OPTS -N "getaddrinfo_port_p2" -l ${L1CACHE} -P 2
+getaddrinfo_port $OPTS -N "getaddrinfo_port_p4" -l ${L1CACHE} -P 4
+
+# getgrnam()
+getgrnam $OPTS -N "getgrnam" -l ${L1CACHE} -g ${GROUP_BASE} -r 2
+getgrnam $OPTS -N "getgrnam_t2" -l ${L1CACHE} -T 2 -g ${GROUP_BASE} -r 2
+getgrnam $OPTS -N "getgrnam_t4" -l ${L1CACHE} -T 4 -g ${GROUP_BASE} -r 2
+getgrnam $OPTS -N "getgrnam_p2" -l ${L1CACHE} -P 2 -g ${GROUP_BASE} -r 2
+getgrnam $OPTS -N "getgrnam_p4" -l ${L1CACHE} -P 4 -g ${GROUP_BASE} -r 2
+
+# ODQueryCreateWithNode()
+od_query_create_with_node $OPTS -N "ODQueryCreateWithNode_cache_${NUSERS}u" -c 50 -r ${NUSERS} -t u -B 50 -n ${NODENAME}
+od_query_create_with_node $OPTS -N "ODQueryCreateWithNode_cache_${NUSERS}u_t2" -T 2 -c 50 -r ${NUSERS} -t u -B 50 -n ${NODENAME}
+od_query_create_with_node $OPTS -N "ODQueryCreateWithNode_cache_${NUSERS}u_t4" -T 4 -c 50 -r ${NUSERS} -t u -B 50 -n ${NODENAME}
+od_query_create_with_node $OPTS -N "ODQueryCreateWithNode_cache_${NUSERS}u_p2" -P 2 -c 50 -r ${NUSERS} -t u -B 50 -n ${NODENAME}
+od_query_create_with_node $OPTS -N "ODQueryCreateWithNode_cache_${NUSERS}u_p4" -P 4 -c 50 -r ${NUSERS} -t u -B 50 -n ${NODENAME}
+
+.
diff --git a/tools/tests/libMicro/coreos_bench.sh b/tools/tests/libMicro/coreos_bench.sh new file mode 100644 index 000000000..a862cbd86 --- /dev/null +++ b/tools/tests/libMicro/coreos_bench.sh @@ -0,0 +1,837 @@ +#!/bin/sh +# +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms +# of the Common Development and Distribution License +# (the "License"). You may not use this file except +# in compliance with the License. +# +# You can obtain a copy of the license at +# src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing +# permissions and limitations under the License. +# +# When distributing Covered Code, include this CDDL +# HEADER in each file and include the License file at +# usr/src/OPENSOLARIS.LICENSE. If applicable, +# add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your +# own identifying information: Portions Copyright [yyyy] +# [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# + +# usage function - defines all the options that can be given to this script. +function usage { + echo "Usage" + echo "$0 [-l] [-h] [name of test]" + echo "-l : This option runs the lmbench tests along with the default libmicro tests." + echo "-h : Help. This option displays information on how to run the script. " + echo "[name of test] : This option runs only the test that is specified" + echo "" + echo "Examples" + echo "$0 : This is the defualt execution. This will run only the default libmicro tests." + echo "$0 -l : This will run the lmbench tests too " + echo "$0 getppid : This will run only the getppid tests" + exit + +} + +if [ $# -eq 1 ] +then + lmbench=2 # to check if only a single test is to be run. e.g, ./coreos_bench.sh getppid +else + lmbench=0 # to run the default libMicro tests, without the lmbench tests. +fi + +while getopts "lh" OPT_LIST +do + case $OPT_LIST in + l) lmbench=1;; # to run the libmicro tests including the lmbench tests. + h) usage;; + *) usage;; + esac +done + + +tattle="./tattle" + +bench_version=0.4.0 +libmicro_version=`$tattle -V` + +case $libmicro_version in +$bench_version) + ;; +*) + echo "ERROR: libMicro version doesn't match 'coreos_bench' script version" + exit 1 +esac + +TMPROOT=/private/tmp/libmicro.$$ +VARROOT=/private/var/tmp/libmicro.$$ +mkdir -p $TMPROOT +mkdir -p $VARROOT + +#if 1 /* Apple modified code */ + +# If the testsuite finish completely or if it is interrupted before +# completion, re-enable stepper for normal operation of the machine +# see rdar://6243819 for details +trap "rm -rf $TMPROOT $VARROOT && sudo pstates -e && exit" 0 2 + +#else +trap "rm -rf $TMPROOT $VARROOT && exit" 0 2 +#endif /* End of Apple modified code + +TFILE=$TMPROOT/data +IFILE=$TMPROOT/ifile +TDIR1=$TMPROOT/0/1/2/3/4/5/6/7/8/9 +TDIR2=$TMPROOT/1/2/3/4/5/6/7/8/9/0 +VFILE=$VARROOT/data +VDIR1=$VARROOT/0/1/2/3/4/5/6/7/8/9 +VDIR2=$VARROOT/1/2/3/4/5/6/7/8/9/0 + + +OPTS="-E -C 200 -L -S -W" + +dd if=/dev/zero of=$TFILE bs=1024k count=10 
2>/dev/null +dd if=/dev/zero of=$VFILE bs=1024k count=10 2>/dev/null +mkdir -p $TDIR1 $TDIR2 +mkdir -p $VDIR1 $VDIR2 + +touch $IFILE +/usr/bin/touch /private/var/tmp/lmbench + + +# produce benchmark header for easier comparisons + +hostname=`uname -n` + +if [ -f /usr/sbin/psrinfo ]; then + p_count=`psrinfo|wc -l` + p_mhz=`psrinfo -v | awk '/operates/{print $6 "MHz"; exit }'` + p_type=`psrinfo -vp 2>/dev/null | awk '{if (NR == 3) {print $0; exit}}'` + p_ipaddr=`getent hosts $hostname | awk '{print $1}'` +fi + +if [ -f /proc/cpuinfo ]; then + p_count=`egrep processor /proc/cpuinfo | wc -l` + p_mhz=`awk -F: '/cpu MHz/{printf("%5.0f00Mhz\n",$2/100); exit}' /proc/cpuinfo` + p_type=`awk -F: '/model name/{print $2; exit}' /proc/cpuinfo` + p_ipaddr=`getent hosts $hostname | awk '{print $1}'` +else +## Mac OS X specific stuff +# first, get ugly output, in case pretty output isn't available +# + p_count=`sysctl -n hw.physicalcpu` + p_mhz=`sysctl -n hw.cpufrequency` + p_type=`sysctl -n hw.model` + +if [ -x /usr/sbin/system_profiler ]; then + # requires this hunk of work-around + # grep the XML for the characteristic we need. The key appears twice, so grep for the useful key (with 'string') + # use sed to strip off the and the tabs in front of the string. So much work for so little result. + # + p_mhz=`system_profiler -xml -detailLevel mini SPHardwareDataType | \ + grep -A1 current_processor_speed | grep string | \ + sed -E 's/(.+)<\/string>/\1/' | sed 's- --g'` + p_type=`system_profiler -xml -detailLevel mini SPHardwareDataType | \ + grep -A1 cpu_type | grep string | \ + sed -E 's/(.+)<\/string>/\1/' | sed 's- --g'` +fi + +# look for en0 (usually ethernet) if that isn't there try en1 (usually wireless) else give up + p_ipaddr=`ipconfig getpacket en0 | grep yiaddr | tr "= " "\n" | grep [0-9]` + if [ ! $p_ipaddr ]; then + p_ipaddr=`ipconfig getpacket en1 | grep yiaddr | tr "= " "\n" | grep [0-9]` + elif [ ! 
$p_ipaddr ]; then + p_ipaddr="unknown" + fi +fi + +printf "\n\n!Libmicro_#: %30s\n" $libmicro_version +printf "!Options: %30s\n" "$OPTS" +printf "!Machine_name: %30s\n" "$hostname" +printf "!OS_name: %30s\n" `uname -s` +printf "!OS_release: %30s\n" `sw_vers -productVersion` +printf "!OS_build: %30.18s\n" "`sw_vers -buildVersion`" +printf "!Processor: %30s\n" `arch` +printf "!#CPUs: %30s\n" $p_count +printf "!CPU_MHz: %30s\n" "$p_mhz" +printf "!CPU_NAME: %30s\n" "$p_type" +printf "!IP_address: %30s\n" "$p_ipaddr" +printf "!Run_by: %30s\n" $LOGNAME +printf "!Date: %30s\n" "`date '+%D %R'`" +printf "!Compiler: %30s\n" `$tattle -c` +printf "!Compiler Ver.:%30s\n" "`$tattle -v`" +printf "!sizeof(long): %30s\n" `$tattle -s` +printf "!extra_CFLAGS: %30s\n" "`$tattle -f`" +printf "!TimerRes: %30s\n\n\n" "`$tattle -r`" + +bin_dir="$TMPROOT/bin" + +mkdir -p $bin_dir +cp bin-*/exec_bin $bin_dir/$A + +cp ./apple/bin-*/posix_spawn_bin $bin_dir/$A + +newline=0 + +#if 1 /* Apple added code */ + +# We need to disable the stepper to prevent it from causing +# wide variations in results; see rdar://6243819 for details + +pstates=/usr/local/bin/pstates +if [ -x $pstates ]; then + echo "Disabling stepper to provide more consistent results of benchmark run" + sudo $pstates -d; sudo pstates -p 0 +else + echo "ERROR: No $pstates found; To disable stepper we need $pstates" 1>&2 + echo "Install AppleInternal package which provides $pstates and execute 'coreos_bench' again" 1>&2 + echo 1>&2 + echo "Note: If you cannot install AppleInternal package which provides $pstates, then use 'bench' script to run libMicro testsuite" 1>&2 + exit 1 +fi + +#endif /* End of Apple code */ + +# +# Everything below the while loop is input for the while loop +# if you have any tests which can't run in the while loop, put +# them above this comment +# +while read A B +do + # $A contains the command, $B contains the arguments + # we echo blank lines and comments + # we skip anything which fails to match *$1* 
(useful + # if we only want to test one case, but a nasty hack) + + case $A in + \#*) + echo "$A $B" + newline=1 + continue + ;; + + "") + if [ $newline -eq 1 ] + then + newline=0 + echo + echo + fi + + continue + ;; + + *$1*) + # Default execution without the lmbench tests. + # checks if there is no argument passed by the user. + if [ $lmbench -eq 0 ] + then + string=lmbench + if [ "${A:0:7}" == "$string" ] + then + continue + fi + fi + + ;; + + *) + if [ $lmbench -ne 1 ] + then + continue + fi + ;; + esac + + if [ ! -f $bin_dir/$A ] + then + cp bin-*/$A $bin_dir/$A + fi + + echo + + (cd $TMPROOT && eval "bin/$A $B") + + echo + echo +done <<. + +# +# Obligatory null system call: use very short time +# for default since SuSe implements this "syscall" in userland +# + +getpid $OPTS -N "getpid" -I 5 +getppid $OPTS -N "getppid" -I 5 + +getenv $OPTS -N "getenv" -s 100 -I 100 +getenv $OPTS -N "getenvT2" -s 100 -I 100 -T 2 + +gettimeofday $OPTS -N "gettimeofday" + +log $OPTS -N "log" -I 20 -B 300000 +exp $OPTS -N "exp" -I 20 -B 100000 +lrand48 $OPTS -N "lrand48" + +memset $OPTS -N "memset_10" -s 10 -I 10 +memset $OPTS -N "memset_256" -s 256 -I 20 +memset $OPTS -N "memset_256_u" -s 256 -a 1 -I 20 +memset $OPTS -N "memset_1k" -s 1k -I 100 -B 2000 +memset $OPTS -N "memset_4k" -s 4k -I 250 -B 500 +memset $OPTS -N "memset_4k_uc" -s 4k -u -I 400 + +memset $OPTS -N "memset_10k" -s 10k -I 600 -B 500 +memset $OPTS -N "memset_1m" -s 1m -I 200000 +memset $OPTS -N "memset_10m" -s 10m -I 2000000 +memset $OPTS -N "memsetP2_10m" -s 10m -P 2 -I 2000000 + +memrand $OPTS -N "memrand" -s 40m -B 10000 + +# This is an elided test and is not ported yet. 
+# Check Makefile.darwin for list of elided tests +# cachetocache $OPTS -N "cachetocache" -s 100k -T 2 -I 200 + +isatty $OPTS -N "isatty_yes" +isatty $OPTS -N "isatty_no" -f $IFILE + +malloc $OPTS -N "malloc_10" -s 10 -g 10 -I 50 +malloc $OPTS -N "malloc_100" -s 100 -g 10 -I 50 +malloc $OPTS -N "malloc_1k" -s 1k -g 10 -I 50 +malloc $OPTS -N "malloc_10k" -s 10k -g 10 -I 50 +malloc $OPTS -N "malloc_100k" -s 100k -g 10 -I 2000 + +malloc $OPTS -N "mallocT2_10" -s 10 -g 10 -T 2 -I 200 +malloc $OPTS -N "mallocT2_100" -s 100 -g 10 -T 2 -I 200 +malloc $OPTS -N "mallocT2_1k" -s 1k -g 10 -T 2 -I 200 +malloc $OPTS -N "mallocT2_10k" -s 10k -g 10 -T 2 -I 200 +malloc $OPTS -N "mallocT2_100k" -s 100k -g 10 -T 2 -I 10000 + +close $OPTS -N "close_bad" -B 96 -b +close $OPTS -N "close_tmp" -B 64 -f $TFILE +close $OPTS -N "close_usr" -B 64 -f $VFILE +close $OPTS -N "close_zero" -B 64 -f /dev/zero +close_tcp $OPTS -N "close_tcp" -B 32 + +memcpy $OPTS -N "memcpy_10" -s 10 -I 10 +memcpy $OPTS -N "memcpy_1k" -s 1k -I 50 +memcpy $OPTS -N "memcpy_10k" -s 10k -I 800 +memcpy $OPTS -N "memcpy_1m" -s 1m -I 500000 +memcpy $OPTS -N "memcpy_10m" -s 10m -I 5000000 + +strcpy $OPTS -N "strcpy_10" -s 10 -I 5 +strcpy $OPTS -N "strcpy_1k" -s 1k -I 100 + +strlen $OPTS -N "strlen_10" -s 10 -I 5 +strlen $OPTS -N "strlen_1k" -s 1k -I 100 + +strchr $OPTS -N "strchr_10" -s 10 -I 5 +strchr $OPTS -N "strchr_1k" -s 1k -I 200 +strcmp $OPTS -N "strcmp_10" -s 10 -I 10 +strcmp $OPTS -N "strcmp_1k" -s 1k -I 200 + +strcasecmp $OPTS -N "scasecmp_10" -s 10 -I 50 -B 2000 +strcasecmp $OPTS -N "scasecmp_1k" -s 1k -I 20000 -B 100 + +strtol $OPTS -N "strtol" -I 20 + +# This is an elided test and is not ported yet. +# Check Makefile.darwin for list of elided tests +# getcontext $OPTS -N "getcontext" -I 100 + +# This is an elided test and is not ported yet. 
+# Check Makefile.darwin for list of elided tests +# setcontext $OPTS -N "setcontext" -I 100 + +mutex $OPTS -N "mutex_st" -I 10 +mutex $OPTS -N "mutex_mt" -t -I 10 +mutex $OPTS -N "mutex_T2" -T 2 -I 100 + +longjmp $OPTS -N "longjmp" -I 10 +siglongjmp $OPTS -N "siglongjmp" -I 20 + +getrusage $OPTS -N "getrusage" -I 200 + +times $OPTS -N "times" -I 200 +time $OPTS -N "time" -I 50 +localtime_r $OPTS -N "localtime_r" -I 200 +strftime $OPTS -N "strftime" -I 10000 -B 100 + +mktime $OPTS -N "mktime" -I 500 +mktime $OPTS -N "mktimeT2" -T 2 -I 1000 + +cascade_mutex $OPTS -N "c_mutex_1" -I 50 +cascade_mutex $OPTS -N "c_mutex_10" -T 10 -I 5000 +cascade_mutex $OPTS -N "c_mutex_200" -T 200 -I 2000000 + +cascade_cond $OPTS -N "c_cond_1" -I 100 +cascade_cond $OPTS -N "c_cond_10" -T 10 -I 3000 +cascade_cond $OPTS -N "c_cond_200" -T 200 -I 2000000 + +cascade_lockf $OPTS -N "c_lockf_1" -I 1000 +cascade_lockf $OPTS -N "c_lockf_10" -P 10 -I 50000 +cascade_lockf $OPTS -N "c_lockf_200" -P 200 -I 5000000 + +cascade_flock $OPTS -N "c_flock" -I 1000 +cascade_flock $OPTS -N "c_flock_10" -P 10 -I 50000 +cascade_flock $OPTS -N "c_flock_200" -P 200 -I 5000000 + +cascade_fcntl $OPTS -N "c_fcntl_1" -I 2000 +cascade_fcntl $OPTS -N "c_fcntl_10" -P 10 -I 20000 +cascade_fcntl $OPTS -N "c_fcntl_200" -P 200 -I 5000000 + +file_lock $OPTS -N "file_lock" -I 1000 + +getsockname $OPTS -N "getsockname" -I 100 +getpeername $OPTS -N "getpeername" -I 100 + +chdir $OPTS -N "chdir_tmp" -I 2000 $TDIR1 $TDIR2 +chdir $OPTS -N "chdir_usr" -I 2000 $VDIR1 $VDIR2 + +chdir $OPTS -N "chgetwd_tmp" -I 3000 -g $TDIR1 $TDIR2 +chdir $OPTS -N "chgetwd_usr" -I 3000 -g $VDIR1 $VDIR2 + +realpath $OPTS -N "realpath_tmp" -I 3000 -f $TDIR1 +realpath $OPTS -N "realpath_usr" -I 3000 -f $VDIR1 + +stat $OPTS -N "stat_tmp" -I 1000 -f $TFILE +stat $OPTS -N "stat_usr" -I 1000 -f $VFILE + +lmbench_stat $OPTS -N "lmbench_stat_tmp" -I 1000 -f $TFILE +lmbench_stat $OPTS -N "lmbench_stat_usr" -I 10000 -B 100 -f /private/var/tmp/lmbench + +# +# 
lmbench uses a touched empty file in /private/var/tmp +# libMicro uses a 1M file in a directory off /private/var/tmp +# performance difference is ~ 0.2 usecs/call +# +# why? - walking the dir tree, empty file vs. non-empty file, non-empty dir +# in the case of libMicro, etc., etc. +# + +lmbench_stat $OPTS -N "lmbench_stat_usr - Default" -I 10000 -B 100 -f /private/var/tmp/lmbench + +lmbench_fstat $OPTS -N "lmbench_fstat_tmp" -I 1000 -f $TFILE +lmbench_fstat $OPTS -N "lmbench_fstat_usr" -I 10000 -B 100 -f /private/var/tmp/lmbench + +# see stat test to understand why we are using /private/var/tmp/lmbench + +lmbench_fstat $OPTS -N "lmbench_fstat_usr - Default" -I 10000 -B 100 -f /private/var/tmp/lmbench + +lmbench_openclose $OPTS -N "lmbench_openclose - Default" -I 10000 -B 100 -f /private/var/tmp/lmbench + +lmbench_select_file $OPTS -N "lmbench_select_file_10" -n 10 -B 100 +lmbench_select_file $OPTS -N "lmbench_select_file_100" -n 100 -B 100 +lmbench_select_file $OPTS -N "lmbench_select_file_250" -n 250 -B 100 +lmbench_select_file $OPTS -N "lmbench_select_file_500" -n 500 -B 100 + +lmbench_select_tcp $OPTS -N "lmbench_select_tcp_10" -n 10 -B 100 +lmbench_select_tcp $OPTS -N "lmbench_select_tcp_100" -n 100 -B 100 +lmbench_select_tcp $OPTS -N "lmbench_select_tcp_250" -n 250 -B 100 +lmbench_select_tcp $OPTS -N "lmbench_select_tcp_500" -n 500 -B 100 + +fcntl $OPTS -N "fcntl_tmp" -I 100 -f $TFILE +fcntl $OPTS -N "fcntl_usr" -I 100 -f $VFILE +fcntl_ndelay $OPTS -N "fcntl_ndelay" -I 100 + +lseek $OPTS -N "lseek_t8k" -s 8k -I 50 -f $TFILE +lseek $OPTS -N "lseek_u8k" -s 8k -I 50 -f $VFILE + +open $OPTS -N "open_tmp" -B 256 -f $TFILE +open $OPTS -N "open_usr" -B 256 -f $VFILE +open $OPTS -N "open_zero" -B 256 -f /dev/zero + +dup $OPTS -N "dup" -B 512 + +socket $OPTS -N "socket_u" -B 256 +socket $OPTS -N "socket_i" -B 256 -f PF_INET + +socketpair $OPTS -N "socketpair" -B 256 + +setsockopt $OPTS -N "setsockopt" -I 200 + +bind $OPTS -N "bind" -B 100 + +listen $OPTS -N "listen" -B 
100 + +#connection $OPTS -N "connection" -B 256 + +poll $OPTS -N "poll_10" -n 10 -I 500 +poll $OPTS -N "poll_100" -n 100 -I 1000 +poll $OPTS -N "poll_1000" -n 1000 -I 5000 + +poll $OPTS -N "poll_w10" -n 10 -I 500 -w 1 +poll $OPTS -N "poll_w100" -n 100 -I 2000 -w 10 +poll $OPTS -N "poll_w1000" -n 1000 -I 40000 -w 100 + +select $OPTS -N "select_10" -n 10 -I 500 +select $OPTS -N "select_100" -n 100 -I 1000 +select $OPTS -N "select_1000" -n 1000 -I 5000 + +select $OPTS -N "select_w10" -n 10 -I 500 -w 1 +select $OPTS -N "select_w100" -n 100 -I 2000 -w 10 +select $OPTS -N "select_w1000" -n 1000 -I 40000 -w 100 + +semop $OPTS -N "semop" -I 200 + +sigaction $OPTS -N "sigaction" -I 100 +signal $OPTS -N "signal" -I 1000 +sigprocmask $OPTS -N "sigprocmask" -I 200 + +lmbench_lat_sig_install $OPTS -N "lmbench_siginstall" +# sigcatch and sigsend need to be evaluated together +# lmbench framework will allow multiple measurements within the same +# benchmark test which allow them to factor out the cost of sending +# a signal from catching one +# +# for our purposes sigcatch results - sigsend results yield +# lmbench sig handler overhead measurements +lmbench_lat_sig_catch $OPTS -N "lmbench_sigcatch" +lmbench_lat_sig_send $OPTS -N "lmbench_sigsend" + + +pthread_create $OPTS -N "pthread_8" -B 8 +pthread_create $OPTS -N "pthread_32" -B 32 +pthread_create $OPTS -N "pthread_128" -B 128 +pthread_create $OPTS -N "pthread_512" -B 512 + +fork $OPTS -N "fork_10" -B 10 +fork $OPTS -N "fork_100" -B 100 -C 100 + +#fork $OPTS -N "fork_1000" -B 1000 -C 50 + +exit $OPTS -N "exit_10" -B 10 +exit $OPTS -N "exit_100" -B 100 + +#exit $OPTS -N "exit_1000" -B 1000 -C 50 + +exit $OPTS -N "exit_10_nolibc" -e -B 10 + +exec $OPTS -N "exec" -B 10 + +posix_spawn $OPTS -N "posix_spawn" -B 10 + +system $OPTS -N "system" -I 1000000 + +recurse $OPTS -N "recurse" -B 512 + +read $OPTS -N "read_t1k" -s 1k -B 50 -f $TFILE +read $OPTS -N "read_t10k" -s 10k -B 16 -f $TFILE +read $OPTS -N "read_t100k" -s 100k -B 4 -f 
$TFILE + +read $OPTS -N "read_u1k" -s 1k -B 50 -f $VFILE +read $OPTS -N "read_u10k" -s 10k -B 16 -f $VFILE +read $OPTS -N "read_u100k" -s 100k -B 4 -f $VFILE + +read $OPTS -N "read_z1k" -s 1k -B 100 -f /dev/zero +read $OPTS -N "read_z10k" -s 10k -B 30 -f /dev/zero +read $OPTS -N "read_z100k" -s 100k -B 4 -f /dev/zero +read $OPTS -N "read_zw100k" -s 100k -B 4 -w -f /dev/zero + +lmbench_read $OPTS -N "read_t1b" -s 1 -B 50 -f $TFILE +lmbench_read $OPTS -N "read_t1k" -s 1k -B 50 -f $TFILE +lmbench_read $OPTS -N "read_t10k" -s 10k -B 16 -f $TFILE +lmbench_read $OPTS -N "read_t100k" -s 100k -B 4 -f $TFILE + +lmbench_read $OPTS -N "read_u1b" -s 1 -B 50 -f $VFILE +lmbench_read $OPTS -N "read_u1k" -s 1k -B 50 -f $VFILE +lmbench_read $OPTS -N "read_u10k" -s 10k -B 16 -f $VFILE +lmbench_read $OPTS -N "read_u100k" -s 100k -B 4 -f $VFILE + +lmbench_read $OPTS -N "read_z1b - Default" -s 1 -B 100 -f /dev/zero +lmbench_read $OPTS -N "read_z1k" -s 1k -B 100 -f /dev/zero +lmbench_read $OPTS -N "read_z10k" -s 10k -B 30 -f /dev/zero +lmbench_read $OPTS -N "read_z100k" -s 100k -B 4 -f /dev/zero +lmbench_read $OPTS -N "read_zw100k" -s 100k -B 4 -w -f /dev/zero + +write $OPTS -N "write_t1k" -s 1k -B 50 -f $TFILE +write $OPTS -N "write_t10k" -s 10k -B 25 -f $TFILE +write $OPTS -N "write_t100k" -s 100k -B 4 -f $TFILE + +write $OPTS -N "write_u1k" -s 1k -B 50 -f $VFILE +write $OPTS -N "write_u10k" -s 10k -B 25 -f $VFILE +write $OPTS -N "write_u100k" -s 100k -B 4 -f $VFILE + +write $OPTS -N "write_n1k" -s 1k -I 100 -B 0 -f /dev/null +write $OPTS -N "write_n10k" -s 10k -I 100 -B 0 -f /dev/null +write $OPTS -N "write_n100k" -s 100k -I 100 -B 0 -f /dev/null + +lmbench_write $OPTS -N "lmbench_write_t1b" -s 1 -B 50 -f $TFILE +lmbench_write $OPTS -N "lmbench_write_t1k" -s 1k -B 50 -f $TFILE +lmbench_write $OPTS -N "lmbench_write_t10k" -s 10k -B 25 -f $TFILE +lmbench_write $OPTS -N "lmbench_write_t100k" -s 100k -B 4 -f $TFILE + +lmbench_write $OPTS -N "lmbench_write_u1b" -s 1 -B 50 -f $VFILE 
+lmbench_write $OPTS -N "lmbench_write_u1k" -s 1k -B 50 -f $VFILE +lmbench_write $OPTS -N "lmbench_write_u10k" -s 10k -B 25 -f $VFILE +lmbench_write $OPTS -N "lmbench_write_u100k" -s 100k -B 4 -f $VFILE + +lmbench_write $OPTS -N "lmbench_write_n1b - Default" -s 1 -I 100 -B 0 -f /dev/null +lmbench_write $OPTS -N "lmbench_write_n1k" -s 1k -I 100 -B 0 -f /dev/null +lmbench_write $OPTS -N "lmbench_write_n10k" -s 10k -I 100 -B 0 -f /dev/null +lmbench_write $OPTS -N "lmbench_write_n100k" -s 100k -I 100 -B 0 -f /dev/null + +writev $OPTS -N "writev_t1k" -s 1k -B 20 -f $TFILE +writev $OPTS -N "writev_t10k" -s 10k -B 4 -f $TFILE +writev $OPTS -N "writev_t100k" -s 100k -f $TFILE + +writev $OPTS -N "writev_u1k" -s 1k -B 20 -f $VFILE +writev $OPTS -N "writev_u10k" -s 10k -B 4 -f $VFILE +writev $OPTS -N "writev_u100k" -s 100k -f $VFILE + +writev $OPTS -N "writev_n1k" -s 1k -I 100 -B 0 -f /dev/null +writev $OPTS -N "writev_n10k" -s 10k -I 100 -B 0 -f /dev/null +writev $OPTS -N "writev_n100k" -s 100k -I 100 -B 0 -f /dev/null + +pread $OPTS -N "pread_t1k" -s 1k -I 300 -f $TFILE +pread $OPTS -N "pread_t10k" -s 10k -I 1000 -f $TFILE +pread $OPTS -N "pread_t100k" -s 100k -I 10000 -f $TFILE + +pread $OPTS -N "pread_u1k" -s 1k -I 300 -f $VFILE +pread $OPTS -N "pread_u10k" -s 10k -I 1000 -f $VFILE +pread $OPTS -N "pread_u100k" -s 100k -I 10000 -f $VFILE + +pread $OPTS -N "pread_z1k" -s 1k -I 300 -f /dev/zero +pread $OPTS -N "pread_z10k" -s 10k -I 1000 -f /dev/zero +pread $OPTS -N "pread_z100k" -s 100k -I 2000 -f /dev/zero +pread $OPTS -N "pread_zw100k" -s 100k -w -I 10000 -f /dev/zero + +pwrite $OPTS -N "pwrite_t1k" -s 1k -I 500 -f $TFILE +pwrite $OPTS -N "pwrite_t10k" -s 10k -I 1000 -f $TFILE +pwrite $OPTS -N "pwrite_t100k" -s 100k -I 10000 -f $TFILE + +pwrite $OPTS -N "pwrite_u1k" -s 1k -I 500 -f $VFILE +pwrite $OPTS -N "pwrite_u10k" -s 10k -I 1000 -f $VFILE +pwrite $OPTS -N "pwrite_u100k" -s 100k -I 20000 -f $VFILE + +pwrite $OPTS -N "pwrite_n1k" -s 1k -I 100 -f /dev/null +pwrite 
$OPTS -N "pwrite_n10k" -s 10k -I 100 -f /dev/null +pwrite $OPTS -N "pwrite_n100k" -s 100k -I 100 -f /dev/null + +mmap $OPTS -N "mmap_z8k" -l 8k -I 1000 -B 50 -f /dev/zero +mmap $OPTS -N "mmap_z128k" -l 128k -I 2000 -B 100 -f /dev/zero +mmap $OPTS -N "mmap_t8k" -l 8k -I 1000 -f $TFILE +mmap $OPTS -N "mmap_t128k" -l 128k -I 1000 -f $TFILE +mmap $OPTS -N "mmap_u8k" -l 8k -I 1000 -f $VFILE +mmap $OPTS -N "mmap_u128k" -l 128k -I 1000 -f $VFILE +mmap $OPTS -N "mmap_a8k" -l 8k -I 200 -f MAP_ANON +mmap $OPTS -N "mmap_a128k" -l 128k -I 200 -f MAP_ANON + + +mmap $OPTS -N "mmap_rz8k" -l 8k -I 2000 -r -f /dev/zero +mmap $OPTS -N "mmap_rz128k" -l 128k -I 2000 -r -f /dev/zero +mmap $OPTS -N "mmap_rt8k" -l 8k -I 2000 -r -f $TFILE +mmap $OPTS -N "mmap_rt128k" -l 128k -I 20000 -r -f $TFILE +mmap $OPTS -N "mmap_ru8k" -l 8k -I 2000 -r -f $VFILE +mmap $OPTS -N "mmap_ru128k" -l 128k -I 20000 -r -f $VFILE +mmap $OPTS -N "mmap_ra8k" -l 8k -I 2000 -r -f MAP_ANON +mmap $OPTS -N "mmap_ra128k" -l 128k -I 20000 -r -f MAP_ANON + +mmap $OPTS -N "mmap_wz8k" -l 8k -I 5000 -w -B 50 -f /dev/zero +mmap $OPTS -N "mmap_wz128k" -l 128k -I 50000 -w -B 50 -f /dev/zero +mmap $OPTS -N "mmap_wt8k" -l 8k -I 5000 -w -f $TFILE +mmap $OPTS -N "mmap_wt128k" -l 128k -I 50000 -w -f $TFILE +mmap $OPTS -N "mmap_wu8k" -l 8k -I 5000 -w -f $VFILE +mmap $OPTS -N "mmap_wu128k" -l 128k -I 500000 -w -f $VFILE +mmap $OPTS -N "mmap_wa8k" -l 8k -I 3000 -w -f MAP_ANON +mmap $OPTS -N "mmap_wa128k" -l 128k -I 50000 -w -f MAP_ANON + +munmap $OPTS -N "unmap_z8k" -l 8k -I 500 -f /dev/zero +munmap $OPTS -N "unmap_z128k" -l 128k -I 500 -B 100 -f /dev/zero +munmap $OPTS -N "unmap_t8k" -l 8k -I 500 -f $TFILE +munmap $OPTS -N "unmap_t128k" -l 128k -I 500 -f $TFILE +munmap $OPTS -N "unmap_u8k" -l 8k -I 500 -f $VFILE +munmap $OPTS -N "unmap_u128k" -l 128k -I 500 -f $VFILE +munmap $OPTS -N "unmap_a8k" -l 8k -I 500 -f MAP_ANON +munmap $OPTS -N "unmap_a128k" -l 128k -I 500 -f MAP_ANON + +munmap $OPTS -N "unmap_rz8k" -l 8k -I 1000 -r -f 
/dev/zero +munmap $OPTS -N "unmap_rz128k" -l 128k -I 2000 -r -B 100 -f /dev/zero +munmap $OPTS -N "unmap_rt8k" -l 8k -I 1000 -r -f $TFILE +munmap $OPTS -N "unmap_rt128k" -l 128k -I 3000 -r -f $TFILE +munmap $OPTS -N "unmap_ru8k" -l 8k -I 1000 -r -f $VFILE +munmap $OPTS -N "unmap_ru128k" -l 128k -I 3000 -r -f $VFILE +munmap $OPTS -N "unmap_ra8k" -l 8k -I 1000 -r -f MAP_ANON +munmap $OPTS -N "unmap_ra128k" -l 128k -I 2000 -r -f MAP_ANON + +connection $OPTS -N "conn_connect" -B 256 -c + +munmap $OPTS -N "unmap_wz8k" -l 8k -I 1000 -w -f /dev/zero +munmap $OPTS -N "unmap_wz128k" -l 128k -I 8000 -w -B 100 -f /dev/zero +munmap $OPTS -N "unmap_wt8k" -l 8k -I 1000 -w -f $TFILE +munmap $OPTS -N "unmap_wt128k" -l 128k -I 10000 -w -f $TFILE +munmap $OPTS -N "unmap_wu8k" -l 8k -I 1000 -w -f $VFILE +munmap $OPTS -N "unmap_wu128k" -l 128k -I 50000 -w -B 10 -f $VFILE +munmap $OPTS -N "unmap_wa8k" -l 8k -I 1000 -w -f MAP_ANON +munmap $OPTS -N "unmap_wa128k" -l 128k -I 10000 -w -f MAP_ANON + + +mprotect $OPTS -N "mprot_z8k" -l 8k -I 300 -f /dev/zero +mprotect $OPTS -N "mprot_z128k" -l 128k -I 500 -f /dev/zero +mprotect $OPTS -N "mprot_wz8k" -l 8k -I 500 -w -f /dev/zero +mprotect $OPTS -N "mprot_wz128k" -l 128k -I 1000 -w -f /dev/zero +mprotect $OPTS -N "mprot_twz8k" -l 8k -I 1000 -w -t -f /dev/zero +mprotect $OPTS -N "mprot_tw128k" -l 128k -I 2000 -w -t -f /dev/zero +mprotect $OPTS -N "mprot_tw4m" -l 4m -w -t -B 1 -f /dev/zero + +pipe $OPTS -N "pipe_pst1" -s 1 -I 1000 -x pipe -m st +pipe $OPTS -N "pipe_pmt1" -s 1 -I 8000 -x pipe -m mt +pipe $OPTS -N "pipe_pmp1" -s 1 -I 8000 -x pipe -m mp +pipe $OPTS -N "pipe_pst4k" -s 4k -I 1000 -x pipe -m st +pipe $OPTS -N "pipe_pmt4k" -s 4k -I 8000 -x pipe -m mt +pipe $OPTS -N "pipe_pmp4k" -s 4k -I 8000 -x pipe -m mp + +pipe $OPTS -N "pipe_sst1" -s 1 -I 1000 -x sock -m st +pipe $OPTS -N "pipe_smt1" -s 1 -I 8000 -x sock -m mt +pipe $OPTS -N "pipe_smp1" -s 1 -I 8000 -x sock -m mp +pipe $OPTS -N "pipe_sst4k" -s 4k -I 1000 -x sock -m st +pipe $OPTS -N 
"pipe_smt4k" -s 4k -I 8000 -x sock -m mt +pipe $OPTS -N "pipe_smp4k" -s 4k -I 8000 -x sock -m mp + +pipe $OPTS -N "pipe_tst1" -s 1 -I 1000 -x tcp -m st +pipe $OPTS -N "pipe_tmt1" -s 1 -I 8000 -x tcp -m mt +pipe $OPTS -N "pipe_tmp1" -s 1 -I 8000 -x tcp -m mp +pipe $OPTS -N "pipe_tst4k" -s 4k -I 1000 -x tcp -m st +pipe $OPTS -N "pipe_tmt4k" -s 4k -I 8000 -x tcp -m mt +pipe $OPTS -N "pipe_tmp4k" -s 4k -I 8000 -x tcp -m mp + +#connection $OPTS -N "conn_accept" -B 256 -a + +lmbench_bw_unix -B 11 -L -W + +lmbench_bw_mem $OPTS -N lmbench_bcopy_512 -s 512 -x bcopy +lmbench_bw_mem $OPTS -N lmbench_bcopy_1k -s 1k -x bcopy +lmbench_bw_mem $OPTS -N lmbench_bcopy_2k -s 2k -x bcopy +lmbench_bw_mem $OPTS -N lmbench_bcopy_4k -s 4k -x bcopy +lmbench_bw_mem $OPTS -N lmbench_bcopy_8k -s 8k -x bcopy +lmbench_bw_mem $OPTS -N lmbench_bcopy_16k -s 16k -x bcopy +lmbench_bw_mem $OPTS -N lmbench_bcopy_32k -s 32k -x bcopy +lmbench_bw_mem $OPTS -N lmbench_bcopy_64k -s 64k -x bcopy +lmbench_bw_mem $OPTS -N lmbench_bcopy_128k -s 128k -x bcopy +lmbench_bw_mem $OPTS -N lmbench_bcopy_256k -s 256k -x bcopy +lmbench_bw_mem $OPTS -N lmbench_bcopy_512k -s 512k -x bcopy +lmbench_bw_mem $OPTS -N lmbench_bcopy_1m -s 1m -x bcopy +lmbench_bw_mem $OPTS -N lmbench_bzero_512 -s 512 -x bzero +lmbench_bw_mem $OPTS -N lmbench_bzero_1k -s 1k -x bzero +lmbench_bw_mem $OPTS -N lmbench_bzero_2k -s 2k -x bzero +lmbench_bw_mem $OPTS -N lmbench_bzero_4k -s 4k -x bzero +lmbench_bw_mem $OPTS -N lmbench_bzero_8k -s 8k -x bzero +lmbench_bw_mem $OPTS -N lmbench_bzero_16k -s 16k -x bzero +lmbench_bw_mem $OPTS -N lmbench_bzero_32k -s 32k -x bzero +lmbench_bw_mem $OPTS -N lmbench_bzero_64k -s 64k -x bzero +lmbench_bw_mem $OPTS -N lmbench_bzero_128k -s 128k -x bzero +lmbench_bw_mem $OPTS -N lmbench_bzero_256k -s 256k -x bzero +lmbench_bw_mem $OPTS -N lmbench_bzero_512k -s 512k -x bzero +lmbench_bw_mem $OPTS -N lmbench_bzero_1m -s 1m -x bzero +lmbench_bw_mem $OPTS -N lmbench_bzero_512 -s 512 -x fcp +lmbench_bw_mem $OPTS -N 
lmbench_bzero_1k -s 1k -x fcp +lmbench_bw_mem $OPTS -N lmbench_bzero_2k -s 2k -x fcp +lmbench_bw_mem $OPTS -N lmbench_bzero_4k -s 4k -x fcp +lmbench_bw_mem $OPTS -N lmbench_bzero_8k -s 8k -x fcp +lmbench_bw_mem $OPTS -N lmbench_bzero_16k -s 16k -x fcp +lmbench_bw_mem $OPTS -N lmbench_bzero_32k -s 32k -x fcp +lmbench_bw_mem $OPTS -N lmbench_bzero_64k -s 64k -x fcp +lmbench_bw_mem $OPTS -N lmbench_bzero_128k -s 128k -x fcp +lmbench_bw_mem $OPTS -N lmbench_bzero_256k -s 256k -x fcp +lmbench_bw_mem $OPTS -N lmbench_bzero_512k -s 512k -x fcp +lmbench_bw_mem $OPTS -N lmbench_bzero_1m -s 1m -x fcp +lmbench_bw_mem $OPTS -N lmbench_cp_512 -s 512 -x cp +lmbench_bw_mem $OPTS -N lmbench_cp_1k -s 1k -x cp +lmbench_bw_mem $OPTS -N lmbench_cp_2k -s 2k -x cp +lmbench_bw_mem $OPTS -N lmbench_cp_4k -s 4k -x cp +lmbench_bw_mem $OPTS -N lmbench_cp_8k -s 8k -x cp +lmbench_bw_mem $OPTS -N lmbench_cp_16k -s 16k -x cp +lmbench_bw_mem $OPTS -N lmbench_cp_32k -s 32k -x cp +lmbench_bw_mem $OPTS -N lmbench_cp_64k -s 64k -x cp +lmbench_bw_mem $OPTS -N lmbench_cp_128k -s 128k -x cp +lmbench_bw_mem $OPTS -N lmbench_cp_256k -s 256k -x cp +lmbench_bw_mem $OPTS -N lmbench_cp_512k -s 512k -x cp +lmbench_bw_mem $OPTS -N lmbench_cp_1m -s 1m -x cp +lmbench_bw_mem $OPTS -N lmbench_frd_512 -s 512 -x frd +lmbench_bw_mem $OPTS -N lmbench_frd_1k -s 1k -x frd +lmbench_bw_mem $OPTS -N lmbench_frd_2k -s 2k -x frd +lmbench_bw_mem $OPTS -N lmbench_frd_4k -s 4k -x frd +lmbench_bw_mem $OPTS -N lmbench_frd_8k -s 8k -x frd +lmbench_bw_mem $OPTS -N lmbench_frd_16k -s 16k -x frd +lmbench_bw_mem $OPTS -N lmbench_frd_32k -s 32k -x frd +lmbench_bw_mem $OPTS -N lmbench_frd_64k -s 64k -x frd +lmbench_bw_mem $OPTS -N lmbench_frd_128k -s 128k -x frd +lmbench_bw_mem $OPTS -N lmbench_frd_256k -s 256k -x frd +lmbench_bw_mem $OPTS -N lmbench_frd_512k -s 512k -x frd +lmbench_bw_mem $OPTS -N lmbench_frd_1m -s 1m -x frd +lmbench_bw_mem $OPTS -N lmbench_rd_512 -s 512 -x rd +lmbench_bw_mem $OPTS -N lmbench_rd_1k -s 1k -x rd 
+lmbench_bw_mem $OPTS -N lmbench_rd_2k -s 2k -x rd +lmbench_bw_mem $OPTS -N lmbench_rd_4k -s 4k -x rd +lmbench_bw_mem $OPTS -N lmbench_rd_8k -s 8k -x rd +lmbench_bw_mem $OPTS -N lmbench_rd_16k -s 16k -x rd +lmbench_bw_mem $OPTS -N lmbench_rd_32k -s 32k -x rd +lmbench_bw_mem $OPTS -N lmbench_rd_64k -s 64k -x rd +lmbench_bw_mem $OPTS -N lmbench_rd_128k -s 128k -x rd +lmbench_bw_mem $OPTS -N lmbench_rd_256k -s 256k -x rd +lmbench_bw_mem $OPTS -N lmbench_rd_512k -s 512k -x rd +lmbench_bw_mem $OPTS -N lmbench_rd_1m -s 1m -x rd +lmbench_bw_mem $OPTS -N lmbench_fwr_512 -s 512 -x fwr +lmbench_bw_mem $OPTS -N lmbench_fwr_1k -s 1k -x fwr +lmbench_bw_mem $OPTS -N lmbench_fwr_2k -s 2k -x fwr +lmbench_bw_mem $OPTS -N lmbench_fwr_4k -s 4k -x fwr +lmbench_bw_mem $OPTS -N lmbench_fwr_8k -s 8k -x fwr +lmbench_bw_mem $OPTS -N lmbench_fwr_16k -s 16k -x fwr +lmbench_bw_mem $OPTS -N lmbench_fwr_32k -s 32k -x fwr +lmbench_bw_mem $OPTS -N lmbench_fwr_64k -s 64k -x fwr +lmbench_bw_mem $OPTS -N lmbench_fwr_128k -s 128k -x fwr +lmbench_bw_mem $OPTS -N lmbench_fwr_256k -s 256k -x fwr +lmbench_bw_mem $OPTS -N lmbench_fwr_512k -s 512k -x fwr +lmbench_bw_mem $OPTS -N lmbench_fwr_1m -s 1m -x fwr +lmbench_bw_mem $OPTS -N lmbench_wr_512 -s 512 -x wr +lmbench_bw_mem $OPTS -N lmbench_wr_1k -s 1k -x wr +lmbench_bw_mem $OPTS -N lmbench_wr_2k -s 2k -x wr +lmbench_bw_mem $OPTS -N lmbench_wr_4k -s 4k -x wr +lmbench_bw_mem $OPTS -N lmbench_wr_8k -s 8k -x wr +lmbench_bw_mem $OPTS -N lmbench_wr_16k -s 16k -x wr +lmbench_bw_mem $OPTS -N lmbench_wr_32k -s 32k -x wr +lmbench_bw_mem $OPTS -N lmbench_wr_64k -s 64k -x wr +lmbench_bw_mem $OPTS -N lmbench_wr_128k -s 128k -x wr +lmbench_bw_mem $OPTS -N lmbench_wr_256k -s 256k -x wr +lmbench_bw_mem $OPTS -N lmbench_wr_512k -s 512k -x wr +lmbench_bw_mem $OPTS -N lmbench_wr_1m -s 1m -x wr +lmbench_bw_mem $OPTS -N lmbench_rdwr_512 -s 512 -x rdwr +lmbench_bw_mem $OPTS -N lmbench_rdwr_1k -s 1k -x rdwr +lmbench_bw_mem $OPTS -N lmbench_rdwr_2k -s 2k -x rdwr +lmbench_bw_mem 
$OPTS -N lmbench_rdwr_4k -s 4k -x rdwr +lmbench_bw_mem $OPTS -N lmbench_rdwr_8k -s 8k -x rdwr +lmbench_bw_mem $OPTS -N lmbench_rdwr_16k -s 16k -x rdwr +lmbench_bw_mem $OPTS -N lmbench_rdwr_32k -s 32k -x rdwr +lmbench_bw_mem $OPTS -N lmbench_rdwr_64k -s 64k -x rdwr +lmbench_bw_mem $OPTS -N lmbench_rdwr_128k -s 128k -x rdwr +lmbench_bw_mem $OPTS -N lmbench_rdwr_256k -s 256k -x rdwr +lmbench_bw_mem $OPTS -N lmbench_rdwr_512k -s 512k -x rdwr +lmbench_bw_mem $OPTS -N lmbench_rdwr_1m -s 1m -x rdwr + +lmbench_bw_mmap_rd $OPTS -N bw_mmap_rd_512 -s 512 -f $TFILE +lmbench_bw_mmap_rd $OPTS -N bw_mmap_rd_1k -s 1k -f $TFILE +lmbench_bw_mmap_rd $OPTS -N bw_mmap_rd_2k -s 2k -f $TFILE +lmbench_bw_mmap_rd $OPTS -N bw_mmap_rd_4k -s 4k -f $TFILE +lmbench_bw_mmap_rd $OPTS -N bw_mmap_rd_8k -s 8k -f $TFILE +lmbench_bw_mmap_rd $OPTS -N bw_mmap_rd_16k -s 16k -f $TFILE +lmbench_bw_mmap_rd $OPTS -N bw_mmap_rd_32k -s 32k -f $TFILE +lmbench_bw_mmap_rd $OPTS -N bw_mmap_rd_64k -s 64k -f $TFILE +lmbench_bw_mmap_rd $OPTS -N bw_mmap_rd_128k -s 128k -f $TFILE +lmbench_bw_mmap_rd $OPTS -N bw_mmap_rd_256k -s 256k -f $TFILE +lmbench_bw_mmap_rd $OPTS -N bw_mmap_rd_512k -s 512k -f $TFILE +lmbench_bw_mmap_rd $OPTS -N bw_mmap_rd_1m -s 1m -f $TFILE + +. 
diff --git a/tools/tests/libMicro/exp.c b/tools/tests/libMicro/exp.c index acc81c577..14ad80b68 100644 --- a/tools/tests/libMicro/exp.c +++ b/tools/tests/libMicro/exp.c @@ -53,9 +53,27 @@ int benchmark(void *tsd, result_t *res) { int i; + /* Added as part of the fix for radar 7508837 */ + double t = 0.0; for (i = 0; i < lm_optB; i += 10) { double value = 1.0 / (i + .01); +#if 1 /* Apple added code, see radar 7508837 */ + t += exp(value); + t += exp(value + 1.0); + t += exp(value + 2.0); + t += exp(value + 3.0); + t += exp(value + 4.0); + t += exp(value + 5.0); + t += exp(value + 6.0); + t += exp(value + 7.0); + t += exp(value + 8.0); + t += exp(value + 9.0); + } + res->re_count = i; + + return ((int)(t - t)); +#else (void) exp(value); (void) exp(value); (void) exp(value); @@ -70,4 +88,5 @@ benchmark(void *tsd, result_t *res) res->re_count = i; return (0); +#endif /* end of Apple fix */ } diff --git a/tools/tests/libMicro/libmicro.h b/tools/tests/libMicro/libmicro.h index 54dcb8503..0359134d1 100644 --- a/tools/tests/libMicro/libmicro.h +++ b/tools/tests/libMicro/libmicro.h @@ -34,6 +34,8 @@ #define STRSIZE 1024 +#define STREQ(a,b) (strcmp(a,b) == 0) + typedef struct { long long re_count; long long re_errors; diff --git a/tools/tests/libMicro/log.c b/tools/tests/libMicro/log.c index 0b4605f3f..71910e079 100644 --- a/tools/tests/libMicro/log.c +++ b/tools/tests/libMicro/log.c @@ -53,9 +53,27 @@ int benchmark(void *tsd, result_t *res) { int i; + /* Added as part of the fix for radar 7508837 */ + double t = 0.0; for (i = 0; i < lm_optB; i += 10) { double value = i + .01; +#if 1 /* Apple added code, see radar 7508837 */ + t += log(value); + t += log(value + 1.0); + t += log(value + 2.0); + t += log(value + 3.0); + t += log(value + 4.0); + t += log(value + 5.0); + t += log(value + 6.0); + t += log(value + 7.0); + t += log(value + 8.0); + t += log(value + 9.0); + } + res->re_count = i; + + return ((int)(t - t)); +#else (void) log(value); (void) log(value); (void) 
log(value); @@ -70,4 +88,5 @@ benchmark(void *tsd, result_t *res) res->re_count = i; return (0); +#endif /* end of Apple fix */ } diff --git a/tools/tests/libMicro/longjmp.c b/tools/tests/libMicro/longjmp.c index 50f4dbc93..43c54d48a 100644 --- a/tools/tests/libMicro/longjmp.c +++ b/tools/tests/libMicro/longjmp.c @@ -51,7 +51,12 @@ benchmark_init() int benchmark(void *tsd, result_t *res) { - int i = 0; +#if 1 /* Apple fix to longjmp/siglongjmp tests, see radar 7440118 */ + volatile int i = 0; +#else + int i = 0; +#endif /*end of Apple fix */ + jmp_buf env; (void) setjmp(env); diff --git a/tools/tests/libMicro/od_account_create.sh b/tools/tests/libMicro/od_account_create.sh new file mode 100644 index 000000000..fa9e3b768 --- /dev/null +++ b/tools/tests/libMicro/od_account_create.sh @@ -0,0 +1,129 @@ +#!/bin/bash + + +function sighandler { + echo + echo "Interrupting account creation" + rm -f $TMPF + exit 1 +} + +trap sighandler INT TERM + +# Fixed parameters +# +NAME=`basename $0` +COUNT=$1 +NODE=$2 +PREFIX="od_test_" +GROUP_ID=1211 # A group everybody's in +GROUP_ID2=1212 # A group nobody's in +GROUP_NAME='od_test_group' +UID_BASE=5000 +TMPF=/tmp/.${NAME}.$$ + +usage () { + echo + echo "Usage: ${NAME} count nodename" + echo + echo " ie. ${NAME} 1000 /Local/Default" + echo + echo " will create 1000 users (from '${PREFIX}1' to '${PREFIX}1000')" + echo " Default password is set to 'test'" + echo " User ID starts from 5000" + echo " Default group is '${GROUP_NAME}', Group ID 1211" + echo + echo "This tool assumes user 'diradmin' with password 'admin' for OD admin" + echo + exit 85 # WRONGARGS +} + +if [ $# -ne 2 ]; then + usage +fi + +# if local node we don't need credentials (NODE is quoted so an empty or space-containing name cannot break the test) +if [ "$NODE" != "/Local/Default" ]; then + OD_ADMIN="diradmin" + OD_PASS="admin" +fi + +echo "Creating users ${PREFIX}1 to ${PREFIX}$COUNT" + +# check to see if od_test_group exist. if not, create one +# +result=`dscl $NODE -list Groups/${GROUP_NAME}1 2> /dev/null` +if [ $? 
-ne 0 ]; then + echo "Group \"${GROUP_NAME}\" does not exist. Creating ${GROUP_NAME}" + if [ -n "$OD_ADMIN" ]; then + dseditgroup -q -o create -n $NODE -u $OD_ADMIN -P $OD_PASS -i ${GROUP_ID} ${GROUP_NAME}1 + dseditgroup -q -o create -n $NODE -u $OD_ADMIN -P $OD_PASS -i ${GROUP_ID2} ${GROUP_NAME}2 + else + dseditgroup -q -o create -n $NODE -i ${GROUP_ID} ${GROUP_NAME}1 + dseditgroup -q -o create -n $NODE -i ${GROUP_ID2} ${GROUP_NAME}2 + fi +fi + +if [ $? -ne 0 ]; then + echo "Failed to create test_group" + exit 1 +fi + +# using dsimport is faster than using dscl +i=1 +uid=$UID_BASE +echo "Writing a temporary import file ..." +while [ $i -le $COUNT ] +do + result=`dscl $NODE -list Users/${PREFIX}${i} 2> /dev/null` + if [ $? -ne 0 ]; then + # Uses standard template; PrimaryGroupID comes from GROUP_ID so it matches the group created above + # RecordName:Password:UniqueID:PrimaryGroupID:DistinguishedName:NFSHomeDirectory:UserShell + echo "${PREFIX}${i}:test:${uid}:${GROUP_ID}:${PREFIX}${i}:/Users/${PREFIX}${i}:/bin/bash" >> $TMPF + printf "\r${PREFIX}${i} / ${COUNT}" + else + echo "account $PREFIX$i already exist. skipping" + fi + i=`expr $i + 1` + uid=`expr $uid + 1` +done +echo + +# Now do the real work +# +if [[ -f $TMPF ]]; then + echo "Running dsimport to create users. Please be patient. This takes a while ..." + # assume if admin is provided that slapconfig exists + if [ -n "$OD_ADMIN" ]; then + if [[ -x "/usr/sbin/slapconfig" ]]; then + /usr/sbin/slapconfig -setfullsyncmode no + sleep 2 + fi + /usr/bin/time dsimport $TMPF $NODE I --username $OD_ADMIN --password $OD_PASS --template StandardUser + sleep 2 + if [[ -x "/usr/sbin/slapconfig" ]]; then + /usr/sbin/slapconfig -setfullsyncmode yes + fi + else + /usr/bin/time dsimport $TMPF $NODE I --template StandardUser + sleep 2 + fi + + # and now delete the temp file + # + rm -f $TMPF +else + echo "Nothing done. 
All users already exist" +fi + +echo Create a SACL group for libMicro +# Create a sample SACL group +dseditgroup -q -o create -r "libMicro ACL" com.apple.access_libMicro +i=1 +while [ $i -le $COUNT ]; do + dseditgroup -q -o edit -a ${PREFIX}${i} -t user com.apple.access_libMicro + i=`expr $i + 1` +done + +echo 'Finished' + diff --git a/tools/tests/libMicro/od_account_delete.sh b/tools/tests/libMicro/od_account_delete.sh new file mode 100644 index 000000000..00ea4e251 --- /dev/null +++ b/tools/tests/libMicro/od_account_delete.sh @@ -0,0 +1,98 @@ +#!/bin/bash + +function sighandler { + echo + echo "Interrupting account deletion" + rm -f $TMPF + exit 2 +} + +trap sighandler INT TERM + +# Fixed parameters +# +NAME=`basename $0` +COUNT=$1 +PREFIX="od_test_" +GROUP_NAME='od_test_group' +TMPF=/tmp/.${NAME}.$$ +NODE=$2 + +usage () { + echo + echo "Usage: ${NAME} count nodename" + echo + echo " ie. ${NAME} 1000 /Local/Default" + echo + echo " will delete ${GROUP_NAME} and 1000 users " + echo " from '${PREFIX}1' to '${PREFIX}1000'" + echo + echo "This tool assumes user 'diradmin' with password 'admin' for OD admin" + echo "when talking to anything other than /Local/Default" + exit 85 # WRONGARGS +} + +if [ $# -ne 2 ]; then + usage +fi + +# if local node we don't need credentials +if [ $NODE != "/Local/Default" ]; then + OD_ADMIN="diradmin" + OD_PASS="admin" +fi + +echo "Deleting users ${PREFIX}1 to ${PREFIX}$COUNT" + +# Using a script file and feed it into dscl is much faster than +# calling dscl everytime. +# +i=1 +echo "Writing a temporary script ..." +if [ -n "$OD_ADMIN" ]; then + echo "auth $OD_ADMIN $OD_PASS" >> $TMPF +fi + +while [ $i -le $COUNT ] +do + result=`dscl $NODE -list Users/${PREFIX}${i} 2> /dev/null` + if [ $?
-eq 0 ]; then + echo "delete Users/${PREFIX}${i}" >> $TMPF + printf "\r${PREFIX}${i} / ${COUNT}" + fi + i=`expr $i + 1` +done +echo + +echo "Deleting temporary test groups" +if [ -n "$OD_ADMIN" ]; then + result=`dseditgroup -q -o delete -n $NODE -u $OD_ADMIN -P $OD_PASS ${GROUP_NAME}1 > /dev/null 2>&1` + result=`dseditgroup -q -o delete -n $NODE -u $OD_ADMIN -P $OD_PASS ${GROUP_NAME}2 > /dev/null 2>&1` +else + result=`dseditgroup -q -o delete -n $NODE ${GROUP_NAME}1 > /dev/null 2>&1` + result=`dseditgroup -q -o delete -n $NODE ${GROUP_NAME}2 > /dev/null 2>&1` +fi + +result=`dseditgroup -q -o delete com.apple.access_libMicro > /dev/null 2>&1` + +# Now do the real work +# +if [[ -f $TMPF ]]; then + echo "Running dscl to delete users. Please be patient. This takes a while ..." + if [[ -x /usr/sbin/slapconfig ]]; then + /usr/sbin/slapconfig -setfullsyncmode no + fi + + /usr/bin/time dscl ${NODE} < $TMPF + + if [[ -x /usr/sbin/slapconfig ]]; then + /usr/sbin/slapconfig -setfullsyncmode yes + fi +fi + +# and now delete the temp file +# +rm -f $TMPF + +echo 'Finished' + diff --git a/tools/tests/libMicro/siglongjmp.c b/tools/tests/libMicro/siglongjmp.c index b4dfd160e..385530eab 100644 --- a/tools/tests/libMicro/siglongjmp.c +++ b/tools/tests/libMicro/siglongjmp.c @@ -56,7 +56,11 @@ benchmark(void *tsd, result_t *res) { tsd_t *ts = (tsd_t *)tsd; +#if 1 /* Apple fix to longjmp/siglongjmp tests, see radar 7440118 */ + volatile int i = 0; +#else int i = 0; +#endif /* end of Apple fix */ (void) sigsetjmp(ts->ts_env, 1); diff --git a/tools/tests/superpages/testsp.c b/tools/tests/superpages/testsp.c index 97fe85c96..33b637a03 100644 --- a/tools/tests/superpages/testsp.c +++ b/tools/tests/superpages/testsp.c @@ -1,3 +1,10 @@ +/* + * This tests the Mac OS X Superpage API introduced in 10.7 + * + * Note that most of these calls go through the mach_vm_allocate() interface, + * but the actually supported and documented interface is the mmap() one + * (see mmap(2)).
+ */ #include #include #include @@ -12,8 +19,6 @@ #define SUPERPAGE_SIZE (2*1024*1024) #define SUPERPAGE_MASK (-SUPERPAGE_SIZE) -#define MAP_SUPERPAGE 0x2000 - #ifdef __LP64__ #define FIXED_ADDRESS1 (0x100000000ULL+500*1024*1024) /* at 4 GB + 500 MB virtual */ #define FIXED_ADDRESS2 (0x100000000ULL+502*1024*1024 + 4*1024) /* at 4 GB + 502 MB + 4 KB virtual */ @@ -100,7 +105,6 @@ boolean_t check_nr(mach_vm_address_t addr, mach_vm_size_t size, int *res) { int i; boolean_t ret; -//printf("%d\n", __LINE__); for (i=0; i + + + + CFBundleDevelopmentRegion + English + CFBundleExecutable + ${EXECUTABLE_NAME} + CFBundleIdentifier + com.yourcompany.driver.${PRODUCT_NAME:rfc1034identifier} + CFBundleInfoDictionaryVersion + 6.0 + CFBundlePackageType + KEXT + CFBundleShortVersionString + 1.0 + CFBundleSignature + ???? + CFBundleVersion + 1 + IOKitPersonalities + + testthreadcall + + CFBundleIdentifier + com.yourcompany.driver.${PRODUCT_NAME:identifier} + IOClass + testthreadcall + IOMatchCategory + testthreadcall + IOProviderClass + IOResources + IOResourceMatch + IOKit + + + OSBundleLibraries + + com.apple.kpi.iokit + 11.0 + com.apple.kpi.libkern + 11.0 + com.apple.kpi.mach + 11.0 + + + diff --git a/tools/tests/testkext/testthreadcall.cpp b/tools/tests/testkext/testthreadcall.cpp new file mode 100644 index 000000000..b879b71b2 --- /dev/null +++ b/tools/tests/testkext/testthreadcall.cpp @@ -0,0 +1,65 @@ +/* + * testthreadcall.cpp + * testkext + * + */ + +#define ABSOLUTETIME_SCALAR_TYPE + +#include "testthreadcall.h" + +#include + +#define super IOService +OSDefineMetaClassAndStructors(testthreadcall, super); + +extern "C" { + +static void thread_call_test_func(thread_call_param_t param0, + thread_call_param_t param1); + +} + +bool +testthreadcall::start( IOService * provider ) +{ + boolean_t ret; + uint64_t deadline; + + IOLog("%s\n", __PRETTY_FUNCTION__); + + if (!super::start(provider)) { + return false; + } + + IOLog("Attempting thread_call_allocate\n"); + tcall = 
thread_call_allocate(thread_call_test_func, this); + IOLog("thread_call_t %p\n", tcall); + + tlock = IOSimpleLockAlloc(); + IOLog("tlock %p\n", tlock); + + clock_interval_to_deadline(5, NSEC_PER_SEC, &deadline); + IOLog("%d sec deadline is %llu\n", 5, deadline); + + ret = thread_call_enter_delayed(tcall, deadline); + + return true; +} + +static void thread_call_test_func(thread_call_param_t param0, + thread_call_param_t param1) +{ + testthreadcall *self = (testthreadcall *)param0; + + IOLog("thread_call_test_func %p %p\n", param0, param1); + + IOSimpleLockLock(self->tlock); + IOSimpleLockUnlock(self->tlock); + +#if 1 + IOSimpleLockLock(self->tlock); +#else + IOSimpleLockUnlock(self->tlock); +#endif +} diff --git a/tools/tests/testkext/testthreadcall.h b/tools/tests/testkext/testthreadcall.h new file mode 100644 index 000000000..2b8973825 --- /dev/null +++ b/tools/tests/testkext/testthreadcall.h @@ -0,0 +1,18 @@ +/* + * testthreadcall.h + * testkext + * + */ + +#include +#include + +class testthreadcall : public IOService { + OSDeclareDefaultStructors(testthreadcall); + + virtual bool start( IOService * provider ); + +public: + thread_call_t tcall; + IOSimpleLock *tlock; +}; \ No newline at end of file diff --git a/tools/tests/testkext/testvmx.cpp b/tools/tests/testkext/testvmx.cpp index eaa93d3ee..4ca0e1fab 100644 --- a/tools/tests/testkext/testvmx.cpp +++ b/tools/tests/testkext/testvmx.cpp @@ -2,9 +2,6 @@ * testvmx.cpp * testkext * - * Created by Shantonu Sen on 10/24/08. - * Copyright 2008 Apple Computer, Inc.. All rights reserved. - * */ #include "testvmx.h" diff --git a/tools/tests/testkext/testvmx.h b/tools/tests/testkext/testvmx.h index 9da7ca9d5..f8937b248 100644 --- a/tools/tests/testkext/testvmx.h +++ b/tools/tests/testkext/testvmx.h @@ -2,9 +2,6 @@ * testvmx.h * testkext * - * Created by Shantonu Sen on 10/24/08. - * Copyright 2008 Apple Computer, Inc.. All rights reserved. 
- * */ #include diff --git a/tools/tests/xnu_quick_test/32bit_inode_tests.c b/tools/tests/xnu_quick_test/32bit_inode_tests.c index e5effea56..b85bb8911 100644 --- a/tools/tests/xnu_quick_test/32bit_inode_tests.c +++ b/tools/tests/xnu_quick_test/32bit_inode_tests.c @@ -19,7 +19,6 @@ extern char g_target_path[ PATH_MAX ]; extern int g_skip_setuid_tests; -extern int g_is_under_rosetta; extern int g_is_single_user; /* ************************************************************************************************************** diff --git a/tools/tests/xnu_quick_test/README b/tools/tests/xnu_quick_test/README index 3e15a5a91..3bf8a2d8f 100644 --- a/tools/tests/xnu_quick_test/README +++ b/tools/tests/xnu_quick_test/README @@ -28,22 +28,24 @@ is 32 or 64 bits. NOTE - we have several workarounds and test exceptions for some outstanding bugs in xnu. All the workarounds are marked with "todo" and -some comments noting the radar number of the offending bug. Do a seach +some comments noting the radar number of the offending bug. Do a search for "todo" in the source files for this project to locate which tests have known failures. And please tag any new exceptions you find with "todo" in the comment and the radar number of the bug. -To build a fat binary, export ARCH="ppc ppc64 i386 x86_64". This will work +To build a fat binary, export ARCH="i386 x86_64". This will work for any architectures that Apple gcc recognizes. -Added three defines which you can use at the compile line to build variants. +Added four defines which you can use at the compile line to build variants. DEBUG turn on additional printfs CONFORMANCE_TESTS_IN_XNU when conformance tests are in xnu, set this to 1 TEST_SYSTEM_CALLS test system calls (doesn't compile; a different bug) -by default, all three are set to 0, i.e. disabled. To build, export +RUN_UNDER_TESTBOTS + when running under testbots, set this to 1 +by default, all four are set to 0, i.e. disabled. 
To build, export MORECFLAGS with the values you want set, e.g. export MORECFLAGS="-D DEBUG=1 -D CONFORMANCE_TESTS_IN_XNU=1" @@ -59,7 +61,8 @@ USAGE: xnu_quick_test -target TARGET_PATH -l[ist] # list all the tests this tool performs -r[un] 1, 3, 10 - 19 # run specific tests. enter individual test numbers and/or range of numbers. use -list to list tests. -s[kip] # skip setuid tests - -t[arget] TARGET_PATH # path to directory where tool will create test files. defaults to "/tmp/" + -t[arget] TARGET_PATH # path to directory where tool will create test files. defaults to "/tmp/" + -x[ilog] # To run the xnu_quick_test with xilog reporting enabled. examples: --- Place all test files and directories at the root of volume "test_vol" --- diff --git a/tools/tests/xnu_quick_test/atomic_fifo_queue_test.c b/tools/tests/xnu_quick_test/atomic_fifo_queue_test.c new file mode 100644 index 000000000..06a0e809f --- /dev/null +++ b/tools/tests/xnu_quick_test/atomic_fifo_queue_test.c @@ -0,0 +1,33 @@ +#if defined(i386) || defined(__x86_64__) + +#include +#include +#include +#include + +typedef struct { + void *next; + char *str; +} QueueNode; + +int atomic_fifo_queue_test( void *the_argp ) { + OSFifoQueueHead head = OS_ATOMIC_FIFO_QUEUE_INIT; + char *str1 = "String 1", *str2 = "String 2"; + QueueNode node1 = { 0, str1 }; + OSAtomicFifoEnqueue(&head, &node1, 0); + QueueNode node2 = { 0, str2 }; + OSAtomicFifoEnqueue(&head, &node2, 0); + QueueNode *node_ptr = OSAtomicFifoDequeue(&head, 0); + if( strcmp(node_ptr->str, str1) != 0 ) { + warnx("OSAtomicFifoDequeue returned incorrect string. Expected %s, got %s", str1, node_ptr->str); + return 1; + } + node_ptr = OSAtomicFifoDequeue(&head, 0); + if( strcmp(node_ptr->str, str2) != 0 ) { + warnx("OSAtomicFifoDequeue returned incorrect string. 
Expected %s, got %s", str2, node_ptr->str); + return 1; + } + return 0; +} + +#endif diff --git a/tools/tests/xnu_quick_test/commpage_tests.c b/tools/tests/xnu_quick_test/commpage_tests.c new file mode 100644 index 000000000..792e78f00 --- /dev/null +++ b/tools/tests/xnu_quick_test/commpage_tests.c @@ -0,0 +1,361 @@ +/* + * commpage_tests.c + * xnu_quick_test + * + * Copyright 2009 Apple Inc. All rights reserved. + * + */ + +#include "tests.h" +#include +#include +#include +#include +#include +#include +#include +#include + + +#ifdef _COMM_PAGE_ACTIVE_CPUS +int active_cpu_test(void); +#endif + +int get_sys_uint64(const char *sel, uint64_t *val); +int get_sys_int32(const char *sel, int32_t *val); + +#define getcommptr(var, commpageaddr) do { \ + var = (typeof(var))(uintptr_t)(commpageaddr); \ + } while(0) + +/* + * Check some of the data in the commpage + * against manual sysctls + */ +int commpage_data_tests( void * the_argp ) +{ + int ret; + uint64_t sys_u64; + int32_t sys_i32; + + volatile uint64_t *comm_u64; + volatile uint32_t *comm_u32; + volatile uint16_t *comm_u16; + volatile uint8_t *comm_u8; + + + /* _COMM_PAGE_CPU_CAPABILITIES */ + getcommptr(comm_u32, _COMM_PAGE_CPU_CAPABILITIES); + + ret = get_sys_int32("hw.ncpu", &sys_i32); + if (ret) goto fail; + + if (sys_i32 != ((*comm_u32 & kNumCPUs) >> kNumCPUsShift)) { + warnx("kNumCPUs does not match hw.ncpu"); + ret = -1; + goto fail; + } + + getcommptr(comm_u8, _COMM_PAGE_NCPUS); + if (sys_i32 != (*comm_u8)) { + warnx("_COMM_PAGE_NCPUS does not match hw.ncpu"); + ret = -1; + goto fail; + } + + ret = get_sys_int32("hw.logicalcpu", &sys_i32); + if (ret) goto fail; + + if (sys_i32 != ((*comm_u32 & kNumCPUs) >> kNumCPUsShift)) { + warnx("kNumCPUs does not match hw.logicalcpu"); + ret = -1; + goto fail; + } + + /* Intel only capabilities */ +#if defined(__i386__) || defined(__x86_64__) + ret = get_sys_int32("hw.optional.mmx", &sys_i32); + if (ret) goto fail; + + if (!(sys_i32) ^ !(*comm_u32 & kHasMMX)) { + 
warnx("kHasMMX does not match hw.optional.mmx"); + ret = -1; + goto fail; + } + + ret = get_sys_int32("hw.optional.sse", &sys_i32); + if (ret) goto fail; + + if (!(sys_i32) ^ !(*comm_u32 & kHasSSE)) { + warnx("kHasSSE does not match hw.optional.sse"); + ret = -1; + goto fail; + } + ret = get_sys_int32("hw.optional.sse2", &sys_i32); + if (ret) goto fail; + + if (!(sys_i32) ^ !(*comm_u32 & kHasSSE2)) { + warnx("kHasSSE2 does not match hw.optional.sse2"); + ret = -1; + goto fail; + } + + ret = get_sys_int32("hw.optional.sse3", &sys_i32); + if (ret) goto fail; + + if (!(sys_i32) ^ !(*comm_u32 & kHasSSE3)) { + warnx("kHasSSE3 does not match hw.optional.sse3"); + ret = -1; + goto fail; + } + + ret = get_sys_int32("hw.optional.supplementalsse3", &sys_i32); + if (ret) goto fail; + + if (!(sys_i32) ^ !(*comm_u32 & kHasSupplementalSSE3)) { + warnx("kHasSupplementalSSE3 does not match hw.optional.supplementalsse3"); + ret = -1; + goto fail; + } + + ret = get_sys_int32("hw.optional.sse4_1", &sys_i32); + if (ret) goto fail; + + if (!(sys_i32) ^ !(*comm_u32 & kHasSSE4_1)) { + warnx("kHasSSE4_1 does not match hw.optional.sse4_1"); + ret = -1; + goto fail; + } + + ret = get_sys_int32("hw.optional.sse4_2", &sys_i32); + if (ret) goto fail; + + if (!(sys_i32) ^ !(*comm_u32 & kHasSSE4_2)) { + warnx("kHasSSE4_2 does not match hw.optional.sse4_2"); + ret = -1; + goto fail; + } + + ret = get_sys_int32("hw.optional.aes", &sys_i32); + if (ret) goto fail; + + if (!(sys_i32) ^ !(*comm_u32 & kHasAES)) { + warnx("kHasAES does not match hw.optional.aes"); + ret = -1; + goto fail; + } + + ret = get_sys_int32("hw.optional.x86_64", &sys_i32); + if (ret) goto fail; + + if (!(sys_i32) ^ !(*comm_u32 & k64Bit)) { + warnx("k64Bit does not match hw.optional.x86_64"); + ret = -1; + goto fail; + } +#endif /* __i386__ || __x86_64__ */ + + /* These fields are not implemented for all architectures */ +#ifdef _COMM_PAGE_SCHED_GEN + uint32_t preempt_count1, preempt_count2; + uint64_t count; + + ret = 
get_sys_uint64("hw.cpufrequency_max", &sys_u64); + if (ret) goto fail; + + getcommptr(comm_u32, _COMM_PAGE_SCHED_GEN); + preempt_count1 = *comm_u32; + /* execute for around 1 quantum (10ms) */ + for(count = MAX(10000000ULL, sys_u64/64); count > 0; count--) { + asm volatile(""); + } + preempt_count2 = *comm_u32; + if (preempt_count1 >= preempt_count2) { + warnx("_COMM_PAGE_SCHED_GEN not incrementing (%u => %u)", + preempt_count1, preempt_count2); + ret = -1; + goto fail; + } +#endif /* _COMM_PAGE_SCHED_GEN */ + +#ifdef _COMM_PAGE_ACTIVE_CPUS + ret = get_sys_int32("hw.activecpu", &sys_i32); + if (ret) goto fail; + + getcommptr(comm_u8, _COMM_PAGE_ACTIVE_CPUS); + if (sys_i32 != (*comm_u8)) { + warnx("_COMM_PAGE_ACTIVE_CPUS does not match hw.activecpu"); + ret = -1; + goto fail; + } + + ret = active_cpu_test(); + if (ret) goto fail; +#endif /* _COMM_PAGE_ACTIVE_CPUS */ + +#ifdef _COMM_PAGE_PHYSICAL_CPUS + ret = get_sys_int32("hw.physicalcpu_max", &sys_i32); + if (ret) goto fail; + + getcommptr(comm_u8, _COMM_PAGE_PHYSICAL_CPUS); + if (sys_i32 != (*comm_u8)) { + warnx("_COMM_PAGE_PHYSICAL_CPUS does not match hw.physicalcpu_max"); + ret = -1; + goto fail; + } +#endif /* _COMM_PAGE_PHYSICAL_CPUS */ + +#ifdef _COMM_PAGE_LOGICAL_CPUS + ret = get_sys_int32("hw.logicalcpu_max", &sys_i32); + if (ret) goto fail; + + getcommptr(comm_u8, _COMM_PAGE_LOGICAL_CPUS); + if (sys_i32 != (*comm_u8)) { + warnx("_COMM_PAGE_LOGICAL_CPUS does not match hw.logicalcpu_max"); + ret = -1; + goto fail; + } +#endif /* _COMM_PAGE_LOGICAL_CPUS */ + +#if 0 +#ifdef _COMM_PAGE_MEMORY_SIZE + ret = get_sys_uint64("hw.memsize", &sys_u64); + if (ret) goto fail; + + getcommptr(comm_u64, _COMM_PAGE_MEMORY_SIZE); + if (sys_u64 != (*comm_u64)) { + warnx("_COMM_PAGE_MEMORY_SIZE does not match hw.memsize"); + ret = -1; + goto fail; + } +#endif /* _COMM_PAGE_MEMORY_SIZE */ +#endif + + ret = 0; + +fail: + + return ret; +} + + +int get_sys_uint64(const char *sel, uint64_t *val) +{ + size_t size = sizeof(*val); + 
int ret; + + ret = sysctlbyname(sel, val, &size, NULL, 0); + if (ret == -1) { + warn("sysctlbyname(%s)", sel); + return ret; + } + +// warnx("sysctlbyname(%s) => %llx", sel, *val); + + return 0; +} + +int get_sys_int32(const char *sel, int32_t *val) +{ + size_t size = sizeof(*val); + int ret; + + ret = sysctlbyname(sel, val, &size, NULL, 0); + if (ret == -1) { + warn("sysctlbyname(%s)", sel); + return ret; + } + +// warnx("sysctlbyname(%s) => %x", sel, *val); + + return 0; +} + +#ifdef _COMM_PAGE_ACTIVE_CPUS +/* + * Try to find a secondary processor that we can disable, + * and make sure the commpage reflects that. This test + * will pass on UP systems, and if all secondary processors + * have been manually disabled + */ +int active_cpu_test(void) +{ + volatile uint8_t *activeaddr; + uint8_t original_activecpu; + boolean_t test_failed = FALSE; + + /* Code stolen from hostinfo.c */ + kern_return_t ret; + processor_t *processor_list; + host_name_port_t host; + struct processor_basic_info processor_basic_info; + int cpu_count; + int data_count; + int i; + + + getcommptr(activeaddr, _COMM_PAGE_ACTIVE_CPUS); + original_activecpu = *activeaddr; + + host = mach_host_self(); + ret = host_processors(host, + (processor_array_t *) &processor_list, &cpu_count); + if (ret != KERN_SUCCESS) { + mach_error("host_processors()", ret); + return ret; + } + + /* skip master processor */ + for (i = 1; i < cpu_count; i++) { + data_count = PROCESSOR_BASIC_INFO_COUNT; + ret = processor_info(processor_list[i], PROCESSOR_BASIC_INFO, + &host, + (processor_info_t) &processor_basic_info, + &data_count); + if (ret != KERN_SUCCESS) { + if (ret == MACH_SEND_INVALID_DEST) { + continue; + } + mach_error("processor_info", ret); + return ret; + } + + if (processor_basic_info.running) { + /* found victim */ + ret = processor_exit(processor_list[i]); + if (ret != KERN_SUCCESS) { + mach_error("processor_exit()", ret); + return ret; + } + + sleep(1); + + if (*activeaddr != (original_activecpu - 1)) { + 
test_failed = TRUE; + } + + ret = processor_start(processor_list[i]); /* bring the victim CPU back online */ + if (ret != KERN_SUCCESS) { + mach_error("processor_start()", ret); + return ret; + } + + sleep(1); + + break; + } + } + + if (test_failed) { + warnx("_COMM_PAGE_ACTIVE_CPUS not updated after disabling a CPU"); + return -1; + } + + if (*activeaddr != original_activecpu) { + warnx("_COMM_PAGE_ACTIVE_CPUS not restored to original value"); + return -1; + } + + return 0; +} +#endif diff --git a/tools/tests/xnu_quick_test/helpers/arch.c b/tools/tests/xnu_quick_test/helpers/arch.c index f526ed027..1e4f867a5 100644 --- a/tools/tests/xnu_quick_test/helpers/arch.c +++ b/tools/tests/xnu_quick_test/helpers/arch.c @@ -6,12 +6,6 @@ */ int main() { -#if __ppc__ - return CPU_TYPE_POWERPC; -#endif /* __ppc__ */ -#if __ppc64__ - return CPU_TYPE_POWERPC64; -#endif /* __ppc64__ */ #if __i386__ return CPU_TYPE_I386; #endif /* __i386__ */ diff --git a/tools/tests/xnu_quick_test/helpers/data_exec.c b/tools/tests/xnu_quick_test/helpers/data_exec.c index f8b353c5e..8cd7c0316 100644 --- a/tools/tests/xnu_quick_test/helpers/data_exec.c +++ b/tools/tests/xnu_quick_test/helpers/data_exec.c @@ -14,10 +14,10 @@ jmp_buf resume; #define ALT_STK_SIZE (MINSIGSTKSZ + pagesize) -#if __i386__ || __ppc__ +#if __i386__ typedef unsigned int psint_t; #endif -#if __x86_64__ || __ppc64__ +#if __x86_64__ typedef unsigned long long psint_t; #endif @@ -43,25 +43,18 @@ int verbose = 0; #define FAIL -1 /* can't use 0 since setjmp uses that */ int expected[4] = { -#if __i386__ +#if NXDATA32TESTNONX SUCCEED, /* execute from heap */ SUCCEED, /* exeucte from heap with PROT_EXEC */ FAIL, /* execute from stack */ SUCCEED, /* exeucte from stack with PROT_EXEC */ -#endif -#if __x86_64__ - FAIL, /* execute from heap */ +#elif __i386__ + FAIL, /* execute from heap */ SUCCEED, /* exeucte from heap with PROT_EXEC */ FAIL, /* execute from stack */ SUCCEED, /* exeucte from stack with PROT_EXEC */ #endif -#if __ppc__ - SUCCEED, /* execute from heap */ -
SUCCEED, /* exeucte from heap with PROT_EXEC */ - SUCCEED, /* execute from stack */ - SUCCEED, /* exeucte from stack with PROT_EXEC */ -#endif -#if __ppc64__ +#if __x86_64__ FAIL, /* execute from heap */ SUCCEED, /* exeucte from heap with PROT_EXEC */ FAIL, /* execute from stack */ diff --git a/tools/tests/xnu_quick_test/helpers/launch.c b/tools/tests/xnu_quick_test/helpers/launch.c index 7e8638eff..206116042 100644 --- a/tools/tests/xnu_quick_test/helpers/launch.c +++ b/tools/tests/xnu_quick_test/helpers/launch.c @@ -71,43 +71,6 @@ int main(int argc, const char * argv[]) #endif -#if defined(__ppc__) - /* - * This is the helper binary for the PPC64 version of xnu_quick_test. xnu_quick_test - * forks and execs this code to test exec()ing from a 32-bit binary. - */ - errmsg = "execve failed: from ppc forking and exec()ing ppc process.\n"; - argvs[0] = "sleep-ppc32"; - if (do_execve_test("helpers/sleep-ppc32", argvs, NULL, 0)) goto test_failed_exit; - - errmsg = "execve failed: from ppc forking and exec()ing ppc64 process w/ 4G pagezero.\n"; - argvs[0] = "sleep-ppc64-4G"; - if (do_execve_test("helpers/sleep-ppc64-4G", argvs, NULL, 0)) goto test_failed_exit; - - errmsg = "execve failed: from ppc forking and exec()ing ppc64 process w/ 4K pagezero.\n"; - argvs[0] = "sleep-ppc64-4K"; - if (do_execve_test("helpers/sleep-ppc64-4K", argvs, NULL, 0)) goto test_failed_exit; -#endif - - -#if defined(__ppc64__) - /* - * This is the helper binary for the ppc version of xnu_quick_test. xnu_quick_test - * forks and execs this code to test exec()ing from a 64-bit binary. 
- */ - errmsg = "execve failed: from ppc64 forking and exec()ing 64-bit ppc process w/ 4G pagezero.\n"; - argvs[0] = "sleep-ppc64-4G"; - if (do_execve_test("helpers/sleep-ppc64-4G", argvs, NULL, 1)) goto test_failed_exit; - - errmsg = "execve failed: from ppc64 forking and exec()ing 64-bit ppc process w/ 4K pagezero.\n"; - argvs[0] = "sleep-ppc64-4K"; - if (do_execve_test("helpers/sleep-ppc64-4K", argvs, NULL, 1)) goto test_failed_exit; - - errmsg = "execve failed: from ppc64 forking and exec()ing 32 bit ppc process.\n"; - argvs[0] = "sleep-ppc32"; - if (do_execve_test("helpers/sleep-ppc32", argvs, NULL, 1)) goto test_failed_exit; -#endif - /* * We are ourselves launched with do_execve_test, which wants a chance to * send a SIGKILL diff --git a/tools/tests/xnu_quick_test/kqueue_tests.c b/tools/tests/xnu_quick_test/kqueue_tests.c index 0d872ccea..40069048c 100644 --- a/tools/tests/xnu_quick_test/kqueue_tests.c +++ b/tools/tests/xnu_quick_test/kqueue_tests.c @@ -18,7 +18,6 @@ extern char g_target_path[ PATH_MAX ]; extern int g_skip_setuid_tests; -extern int g_is_under_rosetta; int msg_count = 14; int last_msg_seen = 0; @@ -144,6 +143,7 @@ kmsg_consumer_thread(void * arg) int kqueue_tests( void * the_argp ) { int my_err, my_status; + void *my_pthread_join_status; int my_kqueue = -1; int my_kqueue64 = -1; int my_fd = -1; @@ -271,22 +271,20 @@ int kqueue_tests( void * the_argp ) } #if !TARGET_OS_EMBEDDED - if (!g_is_under_rosetta) { - /* use kevent64 to test EVFILT_PROC */ - EV_SET64( &my_kevent64, my_pid, EVFILT_PROC, EV_ADD, NOTE_EXIT, 0, 0, 0, 0 ); - my_err = kevent64( my_kqueue, &my_kevent64, 1, NULL, 0, 0, 0); - if ( my_err != -1 && errno != EINVAL ) { - printf( "kevent64 call should fail with kqueue used for kevent() - %d\n", my_err); - goto test_failed_exit; - } + /* use kevent64 to test EVFILT_PROC */ + EV_SET64( &my_kevent64, my_pid, EVFILT_PROC, EV_ADD, NOTE_EXIT, 0, 0, 0, 0 ); + my_err = kevent64( my_kqueue, &my_kevent64, 1, NULL, 0, 0, 0); + if ( my_err != 
-1 && errno != EINVAL ) { + printf( "kevent64 call should fail with kqueue used for kevent() - %d\n", my_err); + goto test_failed_exit; + } - my_kqueue64 = kqueue(); - EV_SET64( &my_kevent64, my_pid, EVFILT_PROC, EV_ADD, NOTE_EXIT, 0, 0, 0, 0 ); - my_err = kevent64( my_kqueue64, &my_kevent64, 1, NULL, 0, 0, 0); - if ( my_err == -1 ) { - printf( "kevent64 call to get proc exit failed with error %d - \"%s\" \n", errno, strerror( errno) ); - goto test_failed_exit; - } + my_kqueue64 = kqueue(); + EV_SET64( &my_kevent64, my_pid, EVFILT_PROC, EV_ADD, NOTE_EXIT, 0, 0, 0, 0 ); + my_err = kevent64( my_kqueue64, &my_kevent64, 1, NULL, 0, 0, 0); + if ( my_err == -1 ) { + printf( "kevent64 call to get proc exit failed with error %d - \"%s\" \n", errno, strerror( errno) ); + goto test_failed_exit; } #endif @@ -344,28 +342,26 @@ int kqueue_tests( void * the_argp ) } #if !TARGET_OS_EMBEDDED - if (!g_is_under_rosetta) { - /* look for child exit notification on the kevent64 kqueue */ - EV_SET64( &my_kevent64, my_pid, EVFILT_PROC, EV_CLEAR, NOTE_EXIT, 0, 0, 0, 0 ); - my_err = kevent64( my_kqueue64, NULL, 0, &my_kevent64, 1, 0, 0); - if ( my_err == -1 ) { - printf( "kevent64 call to get child exit failed with error %d - \"%s\" \n", errno, strerror( errno) ); - goto test_failed_exit; - } - if ( my_err == 0 ) { - printf( "kevent64 call to get proc exit event did not return any when it should have \n" ); - goto test_failed_exit; - } - if ( my_kevent64.filter != EVFILT_PROC ) { - printf( "kevent64 call to get proc exit event did not return EVFILT_PROC \n" ); - printf( "filter %i \n", my_kevent64.filter ); - goto test_failed_exit; - } - if ( (my_kevent64.fflags & NOTE_EXIT) == 0 ) { - printf( "kevent64 call to get proc exit event did not return NOTE_EXIT \n" ); - printf( "fflags 0x%02X \n", my_kevent64.fflags ); - goto test_failed_exit; - } + /* look for child exit notification on the kevent64 kqueue */ + EV_SET64( &my_kevent64, my_pid, EVFILT_PROC, EV_CLEAR, NOTE_EXIT, 0, 0, 0, 0 ); + 
my_err = kevent64( my_kqueue64, NULL, 0, &my_kevent64, 1, 0, 0); + if ( my_err == -1 ) { + printf( "kevent64 call to get child exit failed with error %d - \"%s\" \n", errno, strerror( errno) ); + goto test_failed_exit; + } + if ( my_err == 0 ) { + printf( "kevent64 call to get proc exit event did not return any when it should have \n" ); + goto test_failed_exit; + } + if ( my_kevent64.filter != EVFILT_PROC ) { + printf( "kevent64 call to get proc exit event did not return EVFILT_PROC \n" ); + printf( "filter %i \n", my_kevent64.filter ); + goto test_failed_exit; + } + if ( (my_kevent64.fflags & NOTE_EXIT) == 0 ) { + printf( "kevent64 call to get proc exit event did not return NOTE_EXIT \n" ); + printf( "fflags 0x%02X \n", my_kevent64.fflags ); + goto test_failed_exit; } my_wait_pid = wait4( my_pid, &my_status, 0, NULL ); @@ -472,12 +468,12 @@ int kqueue_tests( void * the_argp ) for (my_index = 0; my_index < 3; my_index++) { - my_err = pthread_join( my_threadv[my_index], (void **)&my_status ); + my_err = pthread_join( my_threadv[my_index], &my_pthread_join_status ); if ( my_err != 0 ) { printf( "pthread_join failed with error %d - %s \n", my_err, strerror(my_err) ); goto test_failed_exit; } - if ( my_status != 0 ) { + if ( my_pthread_join_status != 0 ) { goto test_failed_exit; } } diff --git a/tools/tests/xnu_quick_test/machvm_tests.c b/tools/tests/xnu_quick_test/machvm_tests.c index d478807a7..8ea8b645b 100644 --- a/tools/tests/xnu_quick_test/machvm_tests.c +++ b/tools/tests/xnu_quick_test/machvm_tests.c @@ -13,8 +13,6 @@ #include #include -extern int g_is_under_rosetta; - int machvm_tests( void * the_argp ) { int pagesize = getpagesize(); @@ -155,22 +153,20 @@ int machvm_tests( void * the_argp ) } } - // do a vm_copy of our mach-o header and compare. Rosetta doesn't support this, though - if (!g_is_under_rosetta) { + // do a vm_copy of our mach-o header and compare. 
- kret = vm_write(mach_task_self(), (vm_address_t)regionbuffers[2], + kret = vm_write(mach_task_self(), (vm_address_t)regionbuffers[2], (vm_offset_t)&_mh_execute_header, pagesize); - if (kret != KERN_SUCCESS) { - warnx("vm_write of %d pages failed: %d", 1, kret); - goto fail; - } - - if (_mh_execute_header.magic != *(uint32_t *)regionbuffers[2]) { - warnx("vm_write comparison failed"); - kret = -1; - goto fail; - } + if (kret != KERN_SUCCESS) { + warnx("vm_write of %d pages failed: %d", 1, kret); + goto fail; } + + if (_mh_execute_header.magic != *(uint32_t *)regionbuffers[2]) { + warnx("vm_write comparison failed"); + kret = -1; + goto fail; + } // check that the vm_protects above worked { @@ -180,8 +176,11 @@ int machvm_tests( void * the_argp ) vm_region_basic_info_t basic = (vm_region_basic_info_t)_basic; int _basic64[VM_REGION_BASIC_INFO_COUNT_64]; vm_region_basic_info_64_t basic64 = (vm_region_basic_info_64_t)_basic64; + int _submap[VM_REGION_SUBMAP_INFO_COUNT]; + vm_region_submap_info_t submap = (vm_region_submap_info_t)_submap; mach_msg_type_number_t infocnt; mach_port_t objname; + natural_t nesting_depth = 0; #if !__LP64__ infocnt = VM_REGION_BASIC_INFO_COUNT; @@ -242,16 +241,115 @@ int machvm_tests( void * the_argp ) #if !__LP64__ // try to compare some stuff. 
Particularly important for fields after offset - if (!g_is_under_rosetta) { - if (basic->offset != basic64->offset || - basic->behavior != basic64->behavior || - basic->user_wired_count != basic64->user_wired_count) { - warnx("vm_region and vm_region_64 did not agree"); - kret = -1; - goto fail; - } - } + if (basic->offset != basic64->offset || + basic->behavior != basic64->behavior || + basic->user_wired_count != basic64->user_wired_count) { + warnx("vm_region and vm_region_64 did not agree"); + kret = -1; + goto fail; + } #endif + +#if !__LP64__ + infocnt = VM_REGION_SUBMAP_INFO_COUNT; + kret = vm_region_recurse(mach_task_self(), &addr, &size, + &nesting_depth, (vm_region_info_t)submap, + &infocnt); + if (kret != KERN_SUCCESS) { + warnx("vm_region_recurse() failed: %d", kret); + goto fail; + } + + if (VM_REGION_SUBMAP_INFO_COUNT != infocnt) { + warnx("vm_region_recurse() returned a bad info count"); + kret = -1; + goto fail; + } + + if (submap->pages_dirtied != 10) { + warnx("vm_region_recurse() returned bage pages_dirtied"); + kret = -1; + goto fail; + } + +#endif /* !__LP64__ */ + + } + + // exercise mach_make_memory_entry/vm_map + { + vm_address_t addr1, addr2; + vm_size_t size; + mach_port_t mem_handle = MACH_PORT_NULL; + + addr1 = 0; + size = 11*pagesize; + kret = vm_allocate(mach_task_self(), &addr1, size, VM_FLAGS_ANYWHERE); + if (kret != KERN_SUCCESS) { + warnx("vm_allocate failed: %d", kret); + kret = -1; + goto fail; + } + + *(uint32_t *)(uintptr_t)addr1 = 'test'; + + kret = mach_make_memory_entry(mach_task_self(), + &size, addr1, VM_PROT_DEFAULT, + &mem_handle, MACH_PORT_NULL); + if (kret != KERN_SUCCESS) { + warnx("mach_make_memory_entry failed: %d", kret); + kret = -1; + goto fail; + } + + kret = vm_deallocate(mach_task_self(), addr1, size); + if (kret != KERN_SUCCESS) { + warnx("vm_deallocate failed: %d", kret); + kret = -1; + goto fail; + } + + addr2 = 0; + kret = vm_map(mach_task_self(), &addr2, size, 0, VM_FLAGS_ANYWHERE, + mem_handle, 0, FALSE, 
VM_PROT_DEFAULT, VM_PROT_DEFAULT, + VM_INHERIT_NONE); + if (kret != KERN_SUCCESS) { + warnx("vm_map failed: %d", kret); + kret = -1; + goto fail; + } + + if (*(uint32_t *)(uintptr_t)addr2 != 'test') { + warnx("mapped data mismatch"); + kret = -1; + goto fail; + } + + kret = vm_deallocate(mach_task_self(), addr2, size); + if (kret != KERN_SUCCESS) { + warnx("vm_deallocate failed: %d", kret); + kret = -1; + goto fail; + } + + kret = mach_port_mod_refs(mach_task_self(), mem_handle, MACH_PORT_RIGHT_SEND, -1); + if (kret != KERN_SUCCESS) { + warnx("mach_port_mod_refs(-1) failed: %d", kret); + kret = -1; + goto fail; + } + + addr2 = 0; + kret = vm_map(mach_task_self(), &addr2, size, 0, VM_FLAGS_ANYWHERE, + mem_handle, 0, FALSE, VM_PROT_DEFAULT, VM_PROT_DEFAULT, + VM_INHERIT_NONE); + if (kret == KERN_SUCCESS) { + warnx("vm_map succeeded when it should not have"); + kret = -1; + goto fail; + } + + kret = KERN_SUCCESS; } fail: diff --git a/tools/tests/xnu_quick_test/main.c b/tools/tests/xnu_quick_test/main.c index 0c116215c..d1ca1574d 100644 --- a/tools/tests/xnu_quick_test/main.c +++ b/tools/tests/xnu_quick_test/main.c @@ -119,6 +119,11 @@ struct test_entry g_tests[] = {1, &message_queue_tests, NULL, "msgctl, msgget, msgrcv, msgsnd"}, {1, &data_exec_tests, NULL, "data/stack execution"}, {1, &machvm_tests, NULL, "Mach VM calls"}, + {1, &commpage_data_tests, NULL, "Commpage data"}, +#if defined(i386) || defined(__x86_64__) + {1, &atomic_fifo_queue_test, NULL, "OSAtomicFifoEnqueue, OSAtomicFifoDequeue"}, +#endif + {1, &sched_tests, NULL, "Scheduler tests"}, {0, NULL, NULL, "last one"} }; @@ -136,11 +141,13 @@ int g_skip_setuid_tests = 0; int g_xilog_active = 0; const char * g_cmd_namep; char g_target_path[ PATH_MAX ]; -int g_is_under_rosetta = 0; int g_is_single_user = 0; +int g_testbots_active = 0; int main( int argc, const char * argv[] ) { + #pragma unused(argc) + #pragma unused(argv) int my_tests_count, i; int err; int my_failures = 0; @@ -255,20 +262,16 @@ int main( int 
argc, const char * argv[] ) /* done parsing. */ - -#ifdef __ppc__ - /* determine if we are running under Rosetta - */ - { - int val = 0; - size_t size = sizeof val; - if (sysctlbyname("sysctl.proc_native", &val, &size, NULL, 0) == -1) - g_is_under_rosetta = 0; - else - g_is_under_rosetta = val ? 0 : 1; - } -#endif +/* Check if we are running under testbots */ +#if RUN_UNDER_TESTBOTS +g_testbots_active = 1; +#endif + /* Code added to run xnu_quick_test under testbots */ + if ( g_testbots_active == 1 ) { + printf("[TEST] xnu_quick_test \n"); /* Declare the beginning of test suite */ + } + /* Populate groups list if we're in single user mode */ if (setgroups_if_single_user()) { return 1; @@ -296,14 +299,15 @@ int main( int argc, const char * argv[] ) create_target_directory( my_targetp ); printf( "Will allow %ld failures before testing is aborted \n", g_max_failures ); - if (g_is_under_rosetta) { - printf("Running under Rosetta.\n"); - } - my_start_time = time( NULL ); printf( "\nBegin testing - %s \n", ctime_r( &my_start_time, &my_buffer[0] ) ); printf( "Current architecture is %s\n", current_arch() ); + /* Code added to run xnu_quick_test under testbots */ + if ( g_testbots_active == 1 ) { + printf("[PASS] xnu_quick_test started\n"); + } + /* run each test that is marked to run in our table until we complete all of them or * hit the maximum number of failures. 
*/ @@ -322,6 +326,7 @@ int main( int argc, const char * argv[] ) } #endif printf( "test #%d - %s \n", (i + 1), my_testp->test_infop ); + fflush(stdout); my_err = my_testp->test_routine( my_testp->test_input ); if ( my_err != 0 ) { printf("\t--> FAILED \n"); @@ -334,19 +339,38 @@ int main( int argc, const char * argv[] ) my_failures++; if ( my_failures > g_max_failures ) { #if !TARGET_OS_EMBEDDED - if (g_xilog_active == 1) { + if (g_xilog_active == 1) { + XILogMsg("Reached the maximum number of failures - Aborting xnu_quick_test."); XILogEndTestCase( logRef, kXILogTestPassOnErrorLevel ); } #endif - printf( "\n too many failures - test aborted \n" ); + printf( "\n Reached the maximum number of failures - Aborting xnu_quick_test. \n" ); + /* Code added to run xnu_quick_test under testbots */ + if ( g_testbots_active == 1 ) { + printf("[FAIL] %s \n", my_testp->test_infop); + } goto exit_this_routine; } + /* Code added to run xnu_quick_test under testbots */ + if ( g_testbots_active == 1 ) { + printf("[FAIL] %s \n", my_testp->test_infop); + } +#if !TARGET_OS_EMBEDDED + if (g_xilog_active == 1) { + XILogEndTestCase( logRef, kXILogTestPassOnErrorLevel ); + } +#endif + continue; } #if !TARGET_OS_EMBEDDED if (g_xilog_active == 1) { XILogEndTestCase(logRef, kXILogTestPassOnErrorLevel); } #endif + /* Code added to run xnu_quick_test under testbots */ + if ( g_testbots_active == 1 ) { + printf("[PASS] %s \n", my_testp->test_infop); + } } exit_this_routine: diff --git a/tools/tests/xnu_quick_test/makefile b/tools/tests/xnu_quick_test/makefile index 65a770c3e..554416475 100644 --- a/tools/tests/xnu_quick_test/makefile +++ b/tools/tests/xnu_quick_test/makefile @@ -4,7 +4,7 @@ Embedded=$(shell tconf --test TARGET_OS_EMBEDDED) ifeq "$(Embedded)" "YES" XILogFLAG = -SDKPATH = $(shell xcodebuild -sdk $(SDKROOT) -version | grep Path | cut -f 2 -d " ") +SDKPATH = $(shell xcodebuild -sdk $(SDKROOT) -version Path) CFLAGS += -isysroot $(SDKPATH) LIBFLAGS += -isysroot $(SDKPATH) else @@ 
-38,7 +38,7 @@ else endif ifndef ARCH - ARCH=i386 x86_64 ppc + ARCH=i386 x86_64 # this hack should be removed once tconf gets # ifeq "$(Product)" "iPhone" @@ -56,6 +56,7 @@ else CFLAGS += $(MY_ARCH) endif + CFLAGS += -g -I $(SDKPATH)/System/Library/Frameworks/System.framework/Versions/B/PrivateHeaders/ -F/AppleInternal/Library/Frameworks/ $(MORECFLAGS) LIBFLAGS += -I $(SDKPATH)/System/Library/Frameworks/System.framework/Versions/B/PrivateHeaders -F/AppleInternal/Library/Frameworks/ $(XILogFLAG) @@ -63,7 +64,8 @@ MY_OBJECTS = $(OBJROOT)/main.o $(OBJROOT)/memory_tests.o $(OBJROOT)/misc.o \ $(OBJROOT)/sema_tests.o $(OBJROOT)/shared_memory_tests.o \ $(OBJROOT)/socket_tests.o $(OBJROOT)/tests.o \ $(OBJROOT)/xattr_tests.o $(OBJROOT)/kqueue_tests.o \ - $(OBJROOT)/machvm_tests.o + $(OBJROOT)/machvm_tests.o $(OBJROOT)/commpage_tests.o \ + $(OBJROOT)/atomic_fifo_queue_test.o $(OBJROOT)/sched_tests.o ifneq "$(Product)" "iPhone" MY_OBJECTS += $(OBJROOT)/32bit_inode_tests.o @@ -79,6 +81,14 @@ xnu_quick_test : $(OBJROOT) $(DSTROOT) $(MY_OBJECTS) helpers @$(CHOWN_COMMAND) || echo $(PERM_ADVICE) sudo chmod 4755 $(DSTROOT)/xnu_quick_test +# This target is defined for testbots. +# Before compiling this target, MORECFLAGS must be set to "-D RUN_UNDER_TESTBOTS=1", check README file for more details +# NOTE: -f[ailures] MAX_FAILS_ALLOWED option is set to 100 to make sure we completely run the test suite and +# report all the failures. + +testbots: xnu_quick_test + @(cd $(DSTROOT) ; ./xnu_quick_test -f 100) + # The helper binaries are used to test exec()'ing between 64bit and 32bit. # Creates test binaries with page zero sizes = 4KB and 4GB. 
Also creates 32-bit # helper processes for the 64-bit version of xnu_quick_test to test the conversion @@ -90,21 +100,21 @@ endif ifeq "$(Product)" "MacOSX" $(CC) -arch x86_64 -pagezero_size 0x100000000 helpers/sleep.c -o $(DSTROOT)/helpers/sleep-x86_64-4G $(CC) -arch x86_64 -pagezero_size 0x1000 helpers/sleep.c -o $(DSTROOT)/helpers/sleep-x86_64-4K - $(CC) -arch ppc helpers/sleep.c -o $(DSTROOT)/helpers/sleep-ppc32 endif ifneq "$(Product)" "iPhone" $(CC) $(LIBFLAGS) -arch i386 $(OBJROOT)/misc.o helpers/launch.c -o $(DSTROOT)/helpers/launch-i386 endif ifeq "$(Product)" "MacOSX" $(CC) $(LIBFLAGS) -arch x86_64 $(OBJROOT)/misc.o helpers/launch.c -o $(DSTROOT)/helpers/launch-x86_64 - $(CC) $(LIBFLAGS) -arch ppc $(OBJROOT)/misc.o helpers/launch.c -o $(DSTROOT)/helpers/launch-ppc $(CC) $(MY_ARCH) helpers/arch.c -o $(DSTROOT)/helpers/arch $(CC) $(MY_ARCH) helpers/data_exec.c -o $(DSTROOT)/helpers/data_exec + $(CC) -arch i386 -DNXDATA32TESTNONX helpers/data_exec.c -o $(DSTROOT)/helpers/data_exec32nonxspawn endif ifeq "$(Product)" "iPhone" $(CC) -arch armv6 -isysroot $(SDKROOT) $(CFLAGS) helpers/sleep.c -o $(DSTROOT)/helpers/sleep-arm $(CC) $(LIBFLAGS) -arch armv6 -isysroot $(SDKROOT) $(OBJROOT)/misc.o helpers/launch.c -o $(DSTROOT)/helpers/launch-arm + $(CC) $(MY_ARCH) -isysroot $(SDKROOT) helpers/arch.c -o $(DSTROOT)/helpers/arch endif @@ -128,7 +138,7 @@ $(OBJROOT)/misc.o : misc.c tests.h ifeq "$(Product)" "iPhone" $(CC) -arch armv6 $(CFLAGS) -c misc.c -o $@ else - $(CC) -arch i386 -arch x86_64 -arch ppc $(CFLAGS) -c misc.c -o $@ + $(CC) -arch i386 -arch x86_64 $(CFLAGS) -c misc.c -o $@ endif $(OBJROOT)/sema_tests.o : sema_tests.c tests.h @@ -149,12 +159,22 @@ $(OBJROOT)/xattr_tests.o : xattr_tests.c tests.h $(OBJROOT)/machvm_tests.o : machvm_tests.c tests.h $(CC) $(CFLAGS) -c machvm_tests.c -o $@ +$(OBJROOT)/sched_tests.o : sched_tests.c tests.h + $(CC) $(CFLAGS) -c sched_tests.c -o $@ + $(OBJROOT)/kqueue_tests.o : kqueue_tests.c tests.h $(CC) $(CFLAGS) -c 
kqueue_tests.c -o $@ $(OBJROOT)/32bit_inode_tests.o : 32bit_inode_tests.c tests.h $(CC) $(CFLAGS) -c 32bit_inode_tests.c -o $@ +$(OBJROOT)/commpage_tests.o : commpage_tests.c tests.h + $(CC) $(CFLAGS) -c commpage_tests.c -o $@ + +$(OBJROOT)/atomic_fifo_queue_test.o : atomic_fifo_queue_test.c tests.h + $(CC) $(CFLAGS) -c atomic_fifo_queue_test.c -o $@ + + ifndef DOING_BUILDIT .PHONY : clean clean : diff --git a/tools/tests/xnu_quick_test/memory_tests.c b/tools/tests/xnu_quick_test/memory_tests.c index eb8817b9a..dc8675087 100644 --- a/tools/tests/xnu_quick_test/memory_tests.c +++ b/tools/tests/xnu_quick_test/memory_tests.c @@ -9,9 +9,92 @@ #include "tests.h" #include +#include /* crashcount() */ extern char g_target_path[ PATH_MAX ]; +/* + * static to localize to this compilation unit; volatile to avoid register + * optimization which would prevent modification by a signal handler. + */ +static volatile int my_err; + +/* + * Handler; used by memory_tests() child to reset my_err so that it will + * exit normally following a SIGBUS, rather than triggering a crash report; + * this depends on setting the error non-zero before triggering the condition + * that would trigger a SIGBUS. To avoid confusion, this is most easily done + * right before the test in question, and if there are subsequent tests, then + * undone immediately after to avoid false test negatives. 
+ */ +void +bus_handler(int sig, siginfo_t *si, void *mcontext) +{ + /* Reset global error value when we see a SIGBUS */ + if (sig == SIGBUS) + my_err = 0; +} + +/* + * Count the number of crashes for us in /Library/Logs/CrashReporter/ + * + * XXX Assumes that CrashReporter uses our name as a prefix + * XXX Assumes no one lese has the same prefix as our name + */ +int +crashcount(char *namebuf1, char *namebuf2) +{ + char *crashdir1 = "/Library/Logs/CrashReporter"; + char *crashdir2 = "/Library/Logs/DiagnosticReports"; + char *crash_file_pfx = "xnu_quick_test"; + int crash_file_pfxlen = strlen(crash_file_pfx); + struct stat sb; + DIR *dirp1, *dirp2; + struct dirent *dep1, *dep2; + int count = 0; + + /* If we can't open the directory, it hasn't been created */ + if ((dirp1 = opendir(crashdir1)) == NULL) { + return( 0 ); + } + + while((dep1 = readdir(dirp1)) != NULL) { + if (strncmp(crash_file_pfx, dep1->d_name, crash_file_pfxlen)) + continue; + /* record each one to get the last one */ + if (namebuf1) { + strcpy(namebuf1, crashdir1); + strcat(namebuf1, "/"); + strcat(namebuf1, dep1->d_name); + } + count++; + } + + closedir(dirp1); + + /* If we can't open the directory, it hasn't been created */ + if ((dirp2 = opendir(crashdir2)) == NULL) { + return( 0 ); + } + + while((dep2 = readdir(dirp2)) != NULL) { + if (strncmp(crash_file_pfx, dep2->d_name, crash_file_pfxlen)) + continue; + /* record each one to get the last one */ + if (namebuf2) { + strcpy(namebuf2, crashdir2); + strcat(namebuf2, "/"); + strcat(namebuf2, dep2->d_name); + } + count++; + } + + closedir(dirp2); + + return( count/2 ); +} + + /* ************************************************************************************************************** * Test madvise, mincore, minherit, mlock, mlock, mmap, mprotect, msync, munmap system calls. * todo - see if Francois has better versions of these tests... 
@@ -19,7 +102,6 @@ extern char g_target_path[ PATH_MAX ]; */ int memory_tests( void * the_argp ) { - int my_err; int my_page_size, my_status; int my_fd = -1; char * my_pathp = NULL; @@ -29,6 +111,10 @@ int memory_tests( void * the_argp ) ssize_t my_result; pid_t my_pid, my_wait_pid; kern_return_t my_kr; + struct sigaction my_sa; + static int my_crashcount; + static char my_namebuf1[256]; /* XXX big enough */ + static char my_namebuf2[256]; my_kr = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t*)&my_pathp, PATH_MAX, VM_FLAGS_ANYWHERE); if(my_kr != KERN_SUCCESS){ @@ -65,6 +151,16 @@ int memory_tests( void * the_argp ) goto test_failed_exit; } + /* + * Find out how many crashes there have already been; if it's not + * zero, then don't even attempt this test. + */ + if ((my_crashcount = crashcount(my_namebuf1, my_namebuf2)) != 0) { + printf( "memtest aborted: can not distinguish our expected crash from \n"); + printf( "%d existing crashes including %s \n", my_crashcount, my_namebuf2); + goto test_failed_exit; + } + /* * spin off a child process that we will use for testing. */ @@ -252,18 +348,35 @@ int memory_tests( void * the_argp ) goto exit_child; } + /* + * Establish SIGBUS handler; will reset (disable itself) if it fires; + * we would need to know how to recover from the exceptional condition that + * raised the SIGBUS by modifying the contents of the (opaque to us) + * mcontext in order to prevent this from being terminal, so we let + * it be terminal. This is enough to avoid triggering crash reporter. 
+ */ + my_sa.sa_sigaction = bus_handler; + my_sa.sa_flags = SA_SIGINFO | SA_RESETHAND; + if ((my_err = sigaction(SIGBUS, &my_sa, NULL)) != 0) { + printf("sigaction call failed with error %d - \"%s\" \n", errno, strerror( errno) ); + my_err = -1; + goto exit_child; + } + + my_err = -1; /* default to error out if we do NOT trigger a SIGBUS */ + *my_addr = 'z'; /* should cause SIGBUS signal (we look for this at child termination within the parent) */ - - - my_err = 0; + /* NOTREACHED */ + + printf("Expected SIGBUS signal, got nothing!\n"); + my_err = -1; exit_child: exit( my_err ); } - /* parent process - - * we should get SIGBUS exit when child tries to write to read only memory + * we should get no error if the child has completed all tests successfully */ my_wait_pid = wait4( my_pid, &my_status, 0, NULL ); if ( my_wait_pid == -1 ) { @@ -277,11 +390,39 @@ int memory_tests( void * the_argp ) goto test_failed_exit; } - if ( WIFSIGNALED( my_status ) && WTERMSIG( my_status ) != SIGBUS ) { - printf( "wait4 returned wrong signal status - 0x%02X \n", my_status ); + /* If we were not signalled, or we died from an unexpected signal, report it. + */ + if ( !WIFSIGNALED( my_status ) || WTERMSIG( my_status ) != SIGBUS) { + printf( "wait4 returned child died of status - 0x%02X \n", my_status ); goto test_failed_exit; } + /* + * Wait long enough that CrashReporter has finished. + */ + sleep(5); + + /* + * Find out how many crashes there have already been; if it's not + * one, then don't even attempt this test. 
+ */ + if ((my_crashcount = crashcount(my_namebuf1, my_namebuf2)) != 1) { + printf( "child did not crash as expected \n"); + printf( "saw %d crashes including %s \n", my_crashcount, my_namebuf2); + goto test_failed_exit; + } + + /* post-remove the expected crash report */ + if (unlink(my_namebuf1)) { + printf("unlink of expected crash report '%s' failed \n", my_namebuf1); + goto test_failed_exit; + } + + if (unlink(my_namebuf2)) { + printf("unlink of expected crash report '%s' failed \n", my_namebuf2); + goto test_failed_exit; + } + /* make sure shared page got modified in child */ if ( strcmp( my_test_page_p, "parent data child data" ) != 0 ) { printf( "minherit did not work correctly - shared page looks wrong \n" ); diff --git a/tools/tests/xnu_quick_test/misc.c b/tools/tests/xnu_quick_test/misc.c index 1fcc298f1..9545bf140 100644 --- a/tools/tests/xnu_quick_test/misc.c +++ b/tools/tests/xnu_quick_test/misc.c @@ -277,9 +277,12 @@ int do_spawn_test(int arch, int shouldfail) } my_err = 0; } else { - /* child should exit with return code == arch */ + /* + * child should exit with return code == arch; note that the + * posix_spawn error numbers are *returned*, NOT set in errno!!! + */ if (my_err != 0) { - printf("posix_spawn failed with errno %d - %s\n", errno, strerror(errno)); + printf("posix_spawn failed with errno %d - %s\n", my_err, strerror(my_err)); goto done; } @@ -324,10 +327,6 @@ int get_architecture() case CPU_TYPE_X86_64: rval = INTEL; break; - case CPU_TYPE_POWERPC: - case CPU_TYPE_POWERPC64: - rval = POWERPC; - break; case CPU_TYPE_ARM: rval = ARM; break; diff --git a/tools/tests/xnu_quick_test/sched_tests.c b/tools/tests/xnu_quick_test/sched_tests.c new file mode 100644 index 000000000..6dd23bf68 --- /dev/null +++ b/tools/tests/xnu_quick_test/sched_tests.c @@ -0,0 +1,231 @@ +/* + * sched_tests.c + * xnu_quick_test + * + * Copyright 2011 Apple Inc. All rights reserved. 
+ * + */ + +#include "tests.h" +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG 0 + +#if DEBUG +#define dprintf(...) printf(__VA_ARGS__) +#else +#define dprintf(...) do { } while(0) +#endif + +static uint64_t +nanos_to_abs(uint64_t ns, uint32_t numer, uint32_t denom) +{ + return (uint64_t)(ns * (((double)denom) / ((double)numer))); +} + +static void set_realtime(void) { + struct mach_timebase_info mti; + thread_time_constraint_policy_data_t pol; + kern_return_t kret; + + kret = mach_timebase_info(&mti); + if (kret != KERN_SUCCESS) { + warnx("Could not get timebase info %d", kret); + return; + } + + /* 1s 100ms 10ms */ + pol.period = nanos_to_abs(1000000000, mti.numer, mti.denom); + pol.constraint = nanos_to_abs(100000000, mti.numer, mti.denom); + pol.computation = nanos_to_abs(10000000, mti.numer, mti.denom); + pol.preemptible = 0; /* Ignored by OS */ + + kret = thread_policy_set(mach_thread_self(), THREAD_TIME_CONSTRAINT_POLICY, (thread_policy_t) &pol, THREAD_TIME_CONSTRAINT_POLICY_COUNT); + if (kret != KERN_SUCCESS) { + warnx("Failed to set realtime %d", kret); + } +} + +struct t1_ctx { + pthread_t __p; + int currentThread; + int totalThreads; + boolean_t useRealtime; + semaphore_t wait_to_start; + semaphore_t next_waiter; + + semaphore_t common_sema; /* main thing everyone blocks on */ + uint64_t wakeup_time; /* out parameter */ +}; + +void *t1(void *arg) { + struct t1_ctx *ctx = (struct t1_ctx *)arg; + kern_return_t kret; + + dprintf("thread %d (pthread %p) started\n", ctx->currentThread, pthread_self()); + + /* Wait to allow previous thread to block on common semaphore */ + kret = semaphore_wait(ctx->wait_to_start); + if (kret != KERN_SUCCESS) { + warnx("semaphore_wait(wait_to_start) thread %d failed %d", + ctx->currentThread, kret); + } + + sleep(1); + + if (ctx->useRealtime) { + dprintf("thread %d going realtime\n", ctx->currentThread); + set_realtime(); + } + + kret = semaphore_signal(ctx->next_waiter); + if (kret != 
KERN_SUCCESS) { + warnx("semaphore_signal(next_waiter) thread %d failed %d", + ctx->currentThread, kret); + } + + /* + * We have 1 second to block on the common semaphore before + * the next thread does. + */ + dprintf("thread %d blocking on common semaphore\n", ctx->currentThread); + + kret = semaphore_wait(ctx->common_sema); + if (kret != KERN_SUCCESS) { + warnx("semaphore_wait(common_sema) thread %d failed %d", + ctx->currentThread, kret); + } + + /* Save our time for analysis */ + ctx->wakeup_time = mach_absolute_time(); + dprintf("thread %d woke up at %llu\n", ctx->currentThread, ctx->wakeup_time); + + kret = semaphore_signal(ctx->common_sema); + if (kret != KERN_SUCCESS) { + warnx("semaphore_signal(common_sema) thread %d failed %d", + ctx->currentThread, kret); + } + + return NULL; +} + + + + +int sched_tests( void * the_argp ) +{ + kern_return_t kret; + int ret; + int i; + semaphore_t common_sema; + semaphore_t all_checked_in; + + struct t1_ctx ctxs[3]; + + /* + * Test 8979062. Ensure that a realtime thread that + * blocks on a semaphore after a non-realtime thread + * gets woken up first. 
+ */ + + kret = semaphore_create(mach_task_self(), &common_sema, SYNC_POLICY_FIFO /* not really, in this case */, 0); + if (kret != KERN_SUCCESS) { + warnx("semaphore_create failed: %d", kret); + return -1; + } + + kret = semaphore_create(mach_task_self(), &all_checked_in, SYNC_POLICY_FIFO, 0); + if (kret != KERN_SUCCESS) { + warnx("semaphore_create failed: %d", kret); + return -1; + } + + memset(&ctxs, 0x00, sizeof(ctxs)); + for (i=0; i < sizeof(ctxs)/sizeof(ctxs[0]); i++) { + ctxs[i].__p = NULL; /* set later */ + ctxs[i].currentThread = i; + ctxs[i].totalThreads = sizeof(ctxs)/sizeof(ctxs[0]); + ctxs[i].useRealtime = FALSE; + + kret = semaphore_create(mach_task_self(), &ctxs[i].wait_to_start, SYNC_POLICY_FIFO /* not really, in this case */, 0); + if (kret != KERN_SUCCESS) { + warnx("semaphore_create failed: %d", kret); + return -1; + } + ctxs[i].next_waiter = MACH_PORT_NULL; /* set later */ + ctxs[i].common_sema = common_sema; + ctxs[i].wakeup_time = 0; + } + + ctxs[1].useRealtime = TRUE; + + for (i=1; i < sizeof(ctxs)/sizeof(ctxs[0]); i++) { + ctxs[i-1].next_waiter = ctxs[i].wait_to_start; + } + ctxs[i-1].next_waiter = all_checked_in; + + + for (i=0; i < sizeof(ctxs)/sizeof(ctxs[0]); i++) { + ret = pthread_create(&ctxs[i].__p, NULL, t1, &ctxs[i]); + if (ret != 0) { + warn("pthread_create failed"); + return -1; + } + } + + /* wake up first thread */ + kret = semaphore_signal(ctxs[0].wait_to_start); + if (kret != KERN_SUCCESS) { + warnx("semaphore_signal(initial wait_to_start) failed %d", kret); + return -1; + } + + /* Wait for everyone to have blocked */ + kret = semaphore_wait(all_checked_in); + if (kret != KERN_SUCCESS) { + warnx("semaphore_wait(all_checked_in) failed %d", kret); + return -1; + } + + /* Give some slack for last guy */ + sleep(1); + + kret = semaphore_signal(common_sema); + if (kret != KERN_SUCCESS) { + warnx("semaphore_signal(initial common_sema) failed %d", kret); + return -1; + } + + for (i=0; i < sizeof(ctxs)/sizeof(ctxs[0]); i++) { + ret = 
pthread_join(ctxs[i].__p, NULL); + if (ret != 0) { + warn("pthread_join failed"); + return -1; + } + } + + dprintf("All threads joined\n"); + + /* + * Our expectation is that thread 1 was realtime and + * finished first, followed by 0 and then 2 + */ + if ((ctxs[1].wakeup_time < ctxs[0].wakeup_time) + && (ctxs[0].wakeup_time < ctxs[2].wakeup_time)) { + /* success */ + } else { + warnx("Threads woken out of order %llu %llu %llu", + ctxs[0].wakeup_time, ctxs[1].wakeup_time, + ctxs[2].wakeup_time); + return -1; + } + + return 0; +} + diff --git a/tools/tests/xnu_quick_test/socket_tests.c b/tools/tests/xnu_quick_test/socket_tests.c index c80172e19..00433d5ba 100644 --- a/tools/tests/xnu_quick_test/socket_tests.c +++ b/tools/tests/xnu_quick_test/socket_tests.c @@ -12,7 +12,6 @@ #include extern char g_target_path[ PATH_MAX ]; -extern int g_is_under_rosetta; /* ************************************************************************************************************** * Test accept, bind, connect, listen, socket, recvmsg, sendmsg, recvfrom, sendto, getpeername, getsockname @@ -213,51 +212,49 @@ int socket_tests( void * the_argp ) #if 1 /* sendfile test. 
Open libsystem, set up some headers, and send it */ - if (!g_is_under_rosetta) { - struct sf_hdtr my_sf_hdtr; - int my_libsys_fd; - off_t my_libsys_len; - - my_libsys_fd = open("/usr/lib/libSystem.dylib", O_RDONLY, 0644); - if (my_libsys_fd < 0) { - printf( "test failed - could not open /usr/lib/libSystem.dylib\n" ); - close ( my_child_fd ); - exit ( -1 ); - } + struct sf_hdtr my_sf_hdtr; + int my_libsys_fd; + off_t my_libsys_len; + + my_libsys_fd = open("/usr/lib/libSystem.dylib", O_RDONLY, 0644); + if (my_libsys_fd < 0) { + printf( "test failed - could not open /usr/lib/libSystem.dylib\n" ); + close ( my_child_fd ); + exit ( -1 ); + } - my_libsys_len = 7+2; /* 2 bytes of header */ - my_buffer[0] = 's'; - my_iov[0].iov_base = &my_buffer[0]; - my_iov[0].iov_len = 1; - my_buffer[1] = 'e'; - my_iov[1].iov_base = &my_buffer[1]; - my_iov[1].iov_len = 1; - my_buffer[2] = 'n'; - my_iov[2].iov_base = &my_buffer[2]; - my_iov[2].iov_len = 1; - my_buffer[3] = 'd'; - my_iov[3].iov_base = &my_buffer[3]; - my_iov[3].iov_len = 1; - - my_sf_hdtr.headers = &my_iov[0]; - my_sf_hdtr.hdr_cnt = 2; - my_sf_hdtr.trailers = &my_iov[2]; - my_sf_hdtr.trl_cnt = 2; + my_libsys_len = 7+2; /* 2 bytes of header */ + my_buffer[0] = 's'; + my_iov[0].iov_base = &my_buffer[0]; + my_iov[0].iov_len = 1; + my_buffer[1] = 'e'; + my_iov[1].iov_base = &my_buffer[1]; + my_iov[1].iov_len = 1; + my_buffer[2] = 'n'; + my_iov[2].iov_base = &my_buffer[2]; + my_iov[2].iov_len = 1; + my_buffer[3] = 'd'; + my_iov[3].iov_base = &my_buffer[3]; + my_iov[3].iov_len = 1; + + my_sf_hdtr.headers = &my_iov[0]; + my_sf_hdtr.hdr_cnt = 2; + my_sf_hdtr.trailers = &my_iov[2]; + my_sf_hdtr.trl_cnt = 2; - my_result = sendfile(my_libsys_fd, my_child_fd, 3, &my_libsys_len, &my_sf_hdtr, 0); - if (my_result < 0 || my_libsys_len != 11) { - printf( "sendfile failed with error %d - \"%s\" \n", errno, strerror( errno) ); - close( my_child_fd ); - exit( -1 ); - } + my_result = sendfile(my_libsys_fd, my_child_fd, 3, &my_libsys_len, 
&my_sf_hdtr, 0); + if (my_result < 0 || my_libsys_len != 11) { + printf( "sendfile failed with error %d - \"%s\" \n", errno, strerror( errno) ); + close( my_child_fd ); + exit( -1 ); + } - my_result = close ( my_libsys_fd ); - if ( my_libsys_fd < 0 ) { - printf ( "close failed with error %d - \"%s\" \n", errno, strerror( errno) ); - close ( my_child_fd ); - exit ( -1 ); - } - } + my_result = close ( my_libsys_fd ); + if ( my_libsys_fd < 0 ) { + printf ( "close failed with error %d - \"%s\" \n", errno, strerror( errno) ); + close ( my_child_fd ); + exit ( -1 ); + } #endif /* tell parent we're done */ @@ -332,31 +329,29 @@ int socket_tests( void * the_argp ) #endif #if 1 - if (!g_is_under_rosetta) { - size_t neededBytes = 11; + size_t neededBytes = 11; - /* Check for sendfile output */ - bzero( (void *)&my_parent_buffer[0], sizeof(my_parent_buffer) ); - while (neededBytes > 0) { - my_result = read( my_accepted_socket, &my_parent_buffer[11-neededBytes], neededBytes ); - if ( my_result == -1 ) { - printf( "read call failed with error %d - \"%s\" \n", errno, strerror( errno) ); - goto test_failed_exit; - } else if (my_result == 0) { - break; - } - neededBytes -= my_result; - } - - if ( neededBytes > 0 ) { - printf( "read call returned %ld bytes instead of 11\n", 11 - neededBytes ); + /* Check for sendfile output */ + bzero( (void *)&my_parent_buffer[0], sizeof(my_parent_buffer) ); + while (neededBytes > 0) { + my_result = read( my_accepted_socket, &my_parent_buffer[11-neededBytes], neededBytes ); + if ( my_result == -1 ) { + printf( "read call failed with error %d - \"%s\" \n", errno, strerror( errno) ); goto test_failed_exit; + } else if (my_result == 0) { + break; } + neededBytes -= my_result; + } + + if ( neededBytes > 0 ) { + printf( "read call returned %ld bytes instead of 11\n", 11 - neededBytes ); + goto test_failed_exit; + } - if ( ! 
(my_parent_buffer[0] == 's' && my_parent_buffer[1] == 'e' && my_parent_buffer[9] == 'n' && my_parent_buffer[10] == 'd') ) { - printf( "read wrong sendfile message from child \n" ); - goto test_failed_exit; - } + if ( ! (my_parent_buffer[0] == 's' && my_parent_buffer[1] == 'e' && my_parent_buffer[9] == 'n' && my_parent_buffer[10] == 'd') ) { + printf( "read wrong sendfile message from child \n" ); + goto test_failed_exit; } #endif diff --git a/tools/tests/xnu_quick_test/tests.c b/tools/tests/xnu_quick_test/tests.c index a916b6a38..2d79c6be5 100644 --- a/tools/tests/xnu_quick_test/tests.c +++ b/tools/tests/xnu_quick_test/tests.c @@ -16,9 +16,9 @@ #include /* for OSSwap32() */ #include + extern char g_target_path[ PATH_MAX ]; extern int g_skip_setuid_tests; -extern int g_is_under_rosetta; extern int g_is_single_user; @@ -896,6 +896,7 @@ int access_chmod_fchmod_test( void * the_argp ) int my_err; int my_fd = -1; char * my_pathp = NULL; + uid_t euid,ruid; struct stat my_sb; kern_return_t my_kr; @@ -915,6 +916,7 @@ int access_chmod_fchmod_test( void * the_argp ) goto test_failed_exit; } + /* test chmod */ my_err = chmod( my_pathp, S_IRWXU ); if ( my_err == -1 ) { @@ -940,17 +942,10 @@ int access_chmod_fchmod_test( void * the_argp ) /* special case when running as root - we get back EPERM when running as root */ my_err = errno; -#if !TARGET_OS_EMBEDDED if ( ( tmp == 0 && my_err != EPERM) || (tmp != 0 && my_err != EACCES) ) { printf( "access failed with errno %d - %s. \n", my_err, strerror( my_err ) ); goto test_failed_exit; } -#else - if ( ( tmp == 0 && my_err != EACCES) || (tmp != 0 && my_err != EACCES) ) { - printf( "access failed with errno %d - %s. \n", my_err, strerror( my_err ) ); - goto test_failed_exit; - } -#endif } /* verify correct modes are set */ @@ -965,7 +960,55 @@ int access_chmod_fchmod_test( void * the_argp ) printf( "chmod call appears to have failed. stat shows incorrect values in st_mode! 
\n" ); goto test_failed_exit; } - + + + /* another test for the access system call -- refer to radar# 6725311 */ + + system("touch /tmp/me"); + system("echo local | sudo touch /tmp/notme"); + + euid = geteuid(); + ruid = getuid(); + //printf("effective user id is %d: and real user id is %d: \n", (int)euid, (int)ruid); + setreuid(ruid, ruid); + //printf("effective user id is %d: and real user id is %d: \n", (int)geteuid, (int)getuid); + my_err = unlink(FILE_NOTME); + if (my_err < 0) { + my_err = errno; + } + if (my_err == 0) { + printf("Unresolved: First attempt deleted '" FILE_NOTME "'! \n" ); + goto test_failed_exit; + } else { + printf("Status: First attempt to delete '" FILE_NOTME "' failed with error %d - %s.\n", my_err, strerror( my_err )); + + if (true) { + my_err = access(FILE_ME, _DELETE_OK); + if (my_err < 0) { + my_err = errno; + } + //printf("Status: access('" FILE_ME "') = %d - %s.\n", my_err, strerror( my_err )); + fprintf(stderr, "Status: access('" FILE_ME "') = %d\n", my_err); + } + my_err = unlink(FILE_NOTME); + if (my_err < 0) { + my_err = errno; + } + if (my_err == 0) { + printf("Failed: Second attempt deleted '" FILE_NOTME "'!\n"); + //fprintf(stderr, "Failed: Second attempt deleted '" FILE_NOTME "'!\n"); + goto test_failed_exit; + } else { + printf("Passed: Second attempt to delete '" FILE_NOTME "' failed with error %d - %s.\n", my_err, strerror( my_err )); + // fprintf(stderr, "Passed: Second attempt to delete '" FILE_NOTME "' failed with error %d\n", my_err); + + } + } + setreuid(ruid, euid); + //printf("effective user id is %d: and real user id is %d ---1: \n", euid, ruid); + /* end of test*/ + + + /* test fchmod */ my_fd = open( my_pathp, O_RDONLY, 0 ); if ( my_fd == -1 ) { @@ -1219,6 +1262,8 @@ struct vol_attr_buf { #pragma pack() typedef struct vol_attr_buf vol_attr_buf; +#define STATFS_TEST_PATH "/tmp" + int fs_stat_tests( void * the_argp ) { int my_err, my_count, i; @@ -1255,7 +1300,7 @@ int fs_stat_tests( void * the_argp ) } my_statfsp = 
(struct statfs *) my_bufferp; - my_err = statfs( "/", my_statfsp ); + my_err = statfs( STATFS_TEST_PATH, my_statfsp ); if ( my_err == -1 ) { printf( "statfs call failed. got errno %d - %s. \n", errno, strerror( errno ) ); goto test_failed_exit; @@ -1289,7 +1334,7 @@ int fs_stat_tests( void * the_argp ) #if !TARGET_OS_EMBEDDED /* now try statfs64 */ my_statfs64p = (struct statfs64 *) my_buffer64p; - my_err = statfs64( "/", my_statfs64p ); + my_err = statfs64( STATFS_TEST_PATH, my_statfs64p ); if ( my_err == -1 ) { printf( "statfs64 call failed. got errno %d - %s. \n", errno, strerror( errno ) ); goto test_failed_exit; @@ -1338,8 +1383,8 @@ int fs_stat_tests( void * the_argp ) } } - /* open kernel to use as test file for fstatfs */ - my_fd = open( "/mach_kernel", O_RDONLY, 0 ); + /* open to use as test file for fstatfs */ + my_fd = open( STATFS_TEST_PATH, O_RDONLY, 0 ); if ( my_fd == -1 ) { printf( "open call failed. got errno %d - %s. \n", errno, strerror( errno ) ); goto test_failed_exit; @@ -1384,7 +1429,7 @@ int fs_stat_tests( void * the_argp ) } /* try again with statfs */ - my_err = statfs( "/mach_kernel", my_statfsp ); + my_err = statfs( STATFS_TEST_PATH , my_statfsp ); if ( my_err == -1 ) { printf( "statfs call failed. got errno %d - %s. \n", errno, strerror( errno ) ); goto test_failed_exit; @@ -1942,12 +1987,8 @@ int execve_kill_vfork_test( void * the_argp ) } if (get_architecture() == INTEL) { - int ppc_fail_flag = 0; struct stat sb; - if (stat("/usr/libexec/oah/translate", &sb)) - ppc_fail_flag = 1; - if (bits == 64 && sizeof(long) == 8) { /* * Running on x86_64 hardware and running in 64-bit mode. 
@@ -1970,19 +2011,12 @@ int execve_kill_vfork_test( void * the_argp ) argvs[0] = "launch-i386"; if (do_execve_test("helpers/launch-i386", argvs, NULL, 1) != 0) goto test_failed_exit; - /* Test posix_spawn for i386, x86_64, and ppc (should succeed) */ + /* Test posix_spawn for i386, x86_64 (should succeed) */ errmsg = NULL; if (do_spawn_test(CPU_TYPE_I386, 0)) goto test_failed_exit; if (do_spawn_test(CPU_TYPE_X86_64, 0)) goto test_failed_exit; - /* - * Note: rosetta is no go in single-user mode - */ - if (!g_is_single_user) { - if (do_spawn_test(CPU_TYPE_POWERPC, ppc_fail_flag)) - goto test_failed_exit; - } } else if (bits == 64 && sizeof(long) == 4) { /* @@ -2006,19 +2040,12 @@ int execve_kill_vfork_test( void * the_argp ) argvs[0] = "launch-x86_64"; if (do_execve_test("helpers/launch-x86_64", argvs, NULL, 1) != 0) goto test_failed_exit; - /* Test posix_spawn for i386, x86_64, and ppc (should succeed) */ + /* Test posix_spawn for i386, x86_64 (should succeed) */ errmsg = NULL; if (do_spawn_test(CPU_TYPE_I386, 0)) goto test_failed_exit; if (do_spawn_test(CPU_TYPE_X86_64, 0)) goto test_failed_exit; - /* - * Note: rosetta is no go in single-user mode - */ - if (!g_is_single_user) { - if (do_spawn_test(CPU_TYPE_POWERPC, ppc_fail_flag)) - goto test_failed_exit; - } } else if (bits == 32) { /* Running on i386 hardware. Check cases 4. 
*/ @@ -2026,61 +2053,14 @@ int execve_kill_vfork_test( void * the_argp ) argvs[0] = "sleep-i386"; if (do_execve_test("helpers/sleep-i386", argvs, NULL, 1)) goto test_failed_exit; - /* Test posix_spawn for x86_64 (should fail), i386, and ppc (should succeed) */ + /* Test posix_spawn for x86_64 (should fail), i386 (should succeed) */ errmsg = NULL; if (do_spawn_test(CPU_TYPE_X86_64, 1)) goto test_failed_exit; if (do_spawn_test(CPU_TYPE_I386, 0)) goto test_failed_exit; - /* - * Note: rosetta is no go in single-user mode - */ - if (!g_is_single_user) { - if (do_spawn_test(CPU_TYPE_POWERPC, ppc_fail_flag)) - goto test_failed_exit; - } - } - } - else if (get_architecture() == POWERPC) { - if (bits == 64 && sizeof(long) == 8) { - /* - * Running on PPC64 hardware and running in 64-bit mode. - * No longer supported on SnowLeopard. - */ - errmsg = "runnning ppc64 on snowleopard"; - goto test_failed_exit; - } - else if (bits == 64 && sizeof(long) == 4) { - /* - * Running as PPC on PPC64 hardware or under Rosetta on x86_64 hardware. - * Check cases 4, 5, 6 and fork a child to check 1, 2, 3. - */ - errmsg = "execve failed: from ppc forking and exec()ing ppc process.\n"; - argvs[0] = "sleep-ppc32"; - if (do_execve_test("helpers/sleep-ppc32", argvs, NULL, 0)) goto test_failed_exit; - - /* Test posix_spawn for i386 and ppc */ - errmsg = NULL; - if (do_spawn_test(CPU_TYPE_I386, (g_is_under_rosetta ? 0 : 1))) - goto test_failed_exit; - if (do_spawn_test(CPU_TYPE_POWERPC, 0)) - goto test_failed_exit; } - else if (bits == 32) { - /* Running on ppc hardware. Check cases 4. 
*/ - errmsg = "execve failed: from ppc forking and exec()ing 32 bit ppc process.\n"; - argvs[0] = "sleep-ppc32"; - if (do_execve_test("helpers/sleep-ppc32", argvs, NULL, 1)) goto test_failed_exit; - /* Test posix_spawn for i386 (should fail) and ppc (should succeed) */ - errmsg = NULL; - /* when under Rosetta, this process is CPU_TYPE_POWERPC, but the system should be able to run CPU_TYPE_I386 binaries */ - if (do_spawn_test(CPU_TYPE_I386, (g_is_under_rosetta ? 0 : 1))) - goto test_failed_exit; - if (do_spawn_test(CPU_TYPE_POWERPC, 0)) - goto test_failed_exit; - } - } - else if(get_architecture() == ARM) { + }else if(get_architecture() == ARM) { if (bits == 32) { /* Running on arm hardware. Check cases 2. */ @@ -2914,36 +2894,11 @@ int acct_test( void * the_argp ) /* first letters in ac_comm should match the name of the executable */ if ( getuid( ) != my_acctp->ac_uid || getgid( ) != my_acctp->ac_gid || my_acctp->ac_comm[0] != 't' || my_acctp->ac_comm[1] != 'r' ) { - if (g_is_under_rosetta) { - // on x86 systems, data written by kernel to accounting info file is little endian; - // but Rosetta processes expects it to be big endian; so swap the uid for our test - if ( getuid( ) != OSSwapInt32(my_acctp->ac_uid) || - getgid( ) != OSSwapInt32(my_acctp->ac_gid) || - my_acctp->ac_comm[0] != 't' || - my_acctp->ac_comm[1] != 'r' ) { - printf( "accounting data does not look correct under Rosetta:\n" ); - printf( "------------------------\n" ); - printf( "my_acctp->ac_uid = %lu (should be: %lu)\n", - (unsigned long) OSSwapInt32( my_acctp->ac_uid ), (unsigned long) getuid() ); - printf( "my_acctp->ac_gid = %lu (should be: %lu)\n", - (unsigned long) OSSwapInt32( my_acctp->ac_gid ), (unsigned long) getgid() ); - - print_acct_debug_strings(my_acctp->ac_comm); - } - else { - // is cool under Rosetta - my_err = 0; - goto test_passed_exit; - } - } - else { - printf( "accounting data does not look correct:\n" ); printf( "------------------------\n" ); printf( "my_acctp->ac_uid = %lu 
(should be: %lu)\n", (unsigned long) my_acctp->ac_uid, (unsigned long) getuid() ); printf( "my_acctp->ac_gid = %lu (should be: %lu)\n", (unsigned long) my_acctp->ac_gid, (unsigned long) getgid() ); print_acct_debug_strings(my_acctp->ac_comm); - } goto test_failed_exit; } @@ -3320,6 +3275,7 @@ int fcntl_test( void * the_argp ) { int my_err, my_result, my_tmep; int my_fd = -1; + int my_newfd = -1; char * my_pathp = NULL; kern_return_t my_kr; @@ -3376,7 +3332,60 @@ int fcntl_test( void * the_argp ) printf( "fcntl - F_SETFD failed to set FD_CLOEXEC correctly!!! \n" ); goto test_failed_exit; } - + + /* dup it to a new fd with FD_CLOEXEC forced on */ + + my_result = fcntl( my_fd, F_DUPFD_CLOEXEC, 0); + if ( my_result == -1 ) { + printf( "fcntl - F_DUPFD_CLOEXEC - failed with error %d - \"%s\" \n", errno, strerror( errno) ); + goto test_failed_exit; + } + my_newfd = my_result; + + /* check to see that it too is marked with FD_CLOEXEC */ + + my_result = fcntl( my_newfd, F_GETFD, 0); + if ( my_result == -1 ) { + printf( "fcntl - F_GETFD - failed with error %d - \"%s\" \n", errno, strerror( errno) ); + goto test_failed_exit; + } + if ( (my_result & FD_CLOEXEC) == 0 ) { + printf( "fcntl - F_DUPFD_CLOEXEC failed to set FD_CLOEXEC!!! \n" ); + goto test_failed_exit; + } + + close( my_newfd ); + my_newfd = -1; + + /* While we're here, dup it via an open of /dev/fd/ .. 
*/ + + { + char devfdpath[PATH_MAX]; + + (void) snprintf( devfdpath, sizeof (devfdpath), + "/dev/fd/%u", my_fd ); + my_result = open( devfdpath, O_RDONLY | O_CLOEXEC ); + } + if ( my_result == -1 ) { + printf( "open call failed on /dev/fd/%u with error %d - \"%s\" \n", my_fd, errno, strerror( errno) ); + goto test_failed_exit; + } + my_newfd = my_result; + + /* check to see that it too is marked with FD_CLOEXEC */ + + my_result = fcntl( my_newfd, F_GETFD, 0); + if ( my_result == -1 ) { + printf( "fcntl - F_GETFD - failed with error %d - \"%s\" \n", errno, strerror( errno) ); + goto test_failed_exit; + } + if ( (my_result & FD_CLOEXEC) == 0 ) { + printf( "fcntl - O_CLOEXEC open of /dev/fd/%u failed to set FD_CLOEXEC!!! \n", my_fd ); + goto test_failed_exit; + } + close ( my_newfd ); + my_newfd = -1; + my_err = 0; goto test_passed_exit; @@ -3384,6 +3393,8 @@ int fcntl_test( void * the_argp ) my_err = -1; test_passed_exit: + if ( my_newfd != -1) + close ( my_newfd ); if ( my_fd != -1 ) close( my_fd ); if ( my_pathp != NULL ) { @@ -4993,6 +5004,7 @@ int message_queue_tests( void * the_argp ) } + /* ************************************************************************************************************** * Test execution from data and stack areas. 
* ************************************************************************************************************** @@ -5001,7 +5013,11 @@ int data_exec_tests( void * the_argp ) { int my_err = 0; int arch, bits; + posix_spawnattr_t attrp; + char *argv[] = { "helpers/data_exec32nonxspawn", NULL }; + int my_pid, my_status, ret; + if ((arch = get_architecture()) == -1) { printf("data_exec_test: couldn't determine architecture\n"); goto test_failed_exit; @@ -5026,11 +5042,21 @@ int data_exec_tests( void * the_argp ) printf("data_exec-i386 failed\n"); goto test_failed_exit; } - } - - if (arch == POWERPC) { - if (system("arch -arch ppc helpers/data_exec") != 0) { - printf("data_exec-ppc failed\n"); + + posix_spawnattr_init(&attrp); + posix_spawnattr_setflags(&attrp, _POSIX_SPAWN_ALLOW_DATA_EXEC ); + ret = posix_spawn(&my_pid, "helpers/data_exec32nonxspawn", NULL, &attrp, argv, NULL); + if (ret) { + printf("data_exec-i386 failed in posix_spawn %s\n", strerror(errno)); + goto test_failed_exit; + } + ret = wait4(my_pid, &my_status, 0, NULL); + if (ret == -1) { + printf("data_exec-i386 wait4 failed with errno %d - %s\n", errno, strerror(errno)); + goto test_failed_exit; + } + if (WEXITSTATUS(my_status) != 0) { + printf("data_exec-i386 _POSIX_SPAWN_ALLOW_DATA_EXEC failed\n"); goto test_failed_exit; } } diff --git a/tools/tests/xnu_quick_test/tests.h b/tools/tests/xnu_quick_test/tests.h index f93240152..53b346804 100644 --- a/tools/tests/xnu_quick_test/tests.h +++ b/tools/tests/xnu_quick_test/tests.h @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include /* Used to support printf() in misc.c */ #include /* Used to determine host properties */ #include @@ -49,11 +51,12 @@ #define MY_BUFFER_SIZE (1024 * 10) #define ARM 100 /* I am not sure if the value really matters? */ -#define POWERPC 238947 #define INTEL 38947 /* * Random values used by execve tests to * determine architecture of machine. 
*/ +#define FILE_NOTME "/tmp/notme" /* file in /tmp not owned by me */ +#define FILE_ME "/tmp/me" /* file in /tmp owned by me */ typedef int (*test_rtn_t)(void *); @@ -112,6 +115,9 @@ int data_exec_tests( void * the_argp ); int machvm_tests( void * the_argp ); int getdirentries_test( void * the_argp ); int statfs_32bit_inode_tests( void * the_argp ); +int commpage_data_tests( void * the_argp ); +int atomic_fifo_queue_test( void * the_argp ); +int sched_tests( void * the_argp ); struct test_entry { @@ -126,4 +132,14 @@ typedef struct test_entry * test_entryp; int my_printf(const char * __restrict fmt, ...); #define printf my_printf +/* + If running xnu_quick_test under testbots, disable special + printf defined in the previous step. This is done in order + to generate log messages in a format which testbots understands +*/ + +#if RUN_UNDER_TESTBOTS +#undef printf +#endif + #endif /* !_TESTS_H_ */ diff --git a/tools/tests/xnu_quick_test/xattr_tests.c b/tools/tests/xnu_quick_test/xattr_tests.c index b3248edc2..2b33d6387 100644 --- a/tools/tests/xnu_quick_test/xattr_tests.c +++ b/tools/tests/xnu_quick_test/xattr_tests.c @@ -3,7 +3,7 @@ * xnu_quick_test * * Created by Jerry Cottingham on 6/2/2005. - * Copyright 2005 Apple Computer Inc. All rights reserved. + * Copyright 2005 Apple Computer Inc. All& rights reserved. 
* */ @@ -28,17 +28,18 @@ int xattr_tests( void * the_argp ) char my_buffer[ 64 ]; char my_xattr_data[ ] = "xattr_foo"; kern_return_t my_kr; + int xattr_len = 0; my_kr = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t*)&my_pathp, PATH_MAX, VM_FLAGS_ANYWHERE); - if(my_kr != KERN_SUCCESS){ - printf( "vm_allocate failed with error %d - \"%s\" \n", errno, strerror( errno) ); - goto test_failed_exit; - } - + if(my_kr != KERN_SUCCESS){ + printf( "vm_allocate failed with error %d - \"%s\" \n", errno, strerror( errno) ); + goto test_failed_exit; + } + *my_pathp = 0x00; strcat( my_pathp, &g_target_path[0] ); strcat( my_pathp, "/" ); - + /* create a test file */ my_err = create_random_name( my_pathp, 1 ); if ( my_err != 0 ) { @@ -58,41 +59,49 @@ int xattr_tests( void * the_argp ) printf( "listxattr failed with error %d - \"%s\" \n", errno, strerror( errno) ); goto test_failed_exit; } - if ( my_result != (strlen( XATTR_TEST_NAME ) + 1) ) { - printf( "listxattr did not get the attribute name length \n" ); + + if ( my_result < (strlen( XATTR_TEST_NAME ) + 1) ) { + printf( "listxattr did not get the attribute name length: my_result %d, strlen %zu \n", my_result, (strlen(XATTR_TEST_NAME)+1) ); goto test_failed_exit; } - + memset( &my_buffer[0], 0x00, sizeof( my_buffer ) ); + my_result = getxattr( my_pathp, XATTR_TEST_NAME, &my_buffer[0], sizeof(my_buffer), 0, 0 ); if ( my_err == -1 ) { printf( "getxattr failed with error %d - \"%s\" \n", errno, strerror( errno) ); goto test_failed_exit; } + if ( my_result != (strlen( &my_xattr_data[0] ) + 1) || - strcmp( &my_buffer[0], &my_xattr_data[0] ) != 0 ) { + strcmp(&my_buffer[0], &my_xattr_data[0] ) != 0 ) { printf( "getxattr did not get the correct attribute data \n" ); goto test_failed_exit; } - + /* use removexattr to remove an attribute to our test file */ my_err = removexattr( my_pathp, XATTR_TEST_NAME, 0 ); if ( my_err == -1 ) { printf( "removexattr failed with error %d - \"%s\" \n", errno, strerror( errno) ); goto 
test_failed_exit; } - + /* make sure it is gone */ my_result = listxattr( my_pathp, NULL, 0, 0 ); - if ( my_err == -1 ) { + if ( my_result == -1 ) { printf( "listxattr failed with error %d - \"%s\" \n", errno, strerror( errno) ); goto test_failed_exit; } - if ( my_result != 0 ) { - printf( "removexattr did not remove our test attribute \n" ); + + memset( &my_buffer[0], 0x00, sizeof( my_buffer ) ); + my_result = getxattr( my_pathp, XATTR_TEST_NAME, &my_buffer[0], sizeof(my_buffer), 0, 0 ); + if ( my_result != -1 && errno != ENOATTR) { + printf( "getxattr failed with error %d - \"%s\" \n", errno, strerror( errno) ); goto test_failed_exit; } - + + + /* repeat tests using file descriptor versions of the xattr system calls */ my_fd = open( my_pathp, O_RDONLY, 0 ); if ( my_fd == -1 ) { @@ -100,7 +109,7 @@ int xattr_tests( void * the_argp ) printf( "\t file we attempted to open -> \"%s\" \n", my_pathp ); goto test_failed_exit; } - + /* use fsetxattr to add an attribute to our test file */ my_err = fsetxattr( my_fd, XATTR_TEST_NAME, &my_xattr_data[0], sizeof(my_xattr_data), 0, 0 ); if ( my_err == -1 ) { @@ -114,11 +123,11 @@ int xattr_tests( void * the_argp ) printf( "flistxattr failed with error %d - \"%s\" \n", errno, strerror( errno) ); goto test_failed_exit; } - if ( my_result != (strlen( XATTR_TEST_NAME ) + 1) ) { + if ( my_result < (strlen( XATTR_TEST_NAME ) + 1) ) { printf( "flistxattr did not get the attribute name length \n" ); goto test_failed_exit; } - + memset( &my_buffer[0], 0x00, sizeof( my_buffer ) ); my_result = fgetxattr( my_fd, XATTR_TEST_NAME, &my_buffer[0], sizeof(my_buffer), 0, 0 ); if ( my_err == -1 ) { @@ -126,32 +135,35 @@ int xattr_tests( void * the_argp ) goto test_failed_exit; } if ( my_result != (strlen( &my_xattr_data[0] ) + 1) || - strcmp( &my_buffer[0], &my_xattr_data[0] ) != 0 ) { + strcmp( &my_buffer[0], &my_xattr_data[0] ) != 0 ) { printf( "fgetxattr did not get the correct attribute data \n" ); goto test_failed_exit; } - + /* use 
fremovexattr to remove an attribute to our test file */ my_err = fremovexattr( my_fd, XATTR_TEST_NAME, 0 ); if ( my_err == -1 ) { printf( "fremovexattr failed with error %d - \"%s\" \n", errno, strerror( errno) ); goto test_failed_exit; } - + /* make sure it is gone */ my_result = flistxattr( my_fd, NULL, 0, 0 ); - if ( my_err == -1 ) { + if ( my_result == -1 ) { printf( "flistxattr failed with error %d - \"%s\" \n", errno, strerror( errno) ); goto test_failed_exit; } - if ( my_result != 0 ) { - printf( "fremovexattr did not remove our test attribute \n" ); + + memset( my_buffer, 0x00, sizeof( my_buffer ) ); + my_result = fgetxattr( my_fd, XATTR_TEST_NAME, &my_buffer[0], sizeof(my_buffer), 0, 0 ); + if ( my_result == -1 && (errno != ENOATTR) ) { + printf( "fgetxattr failed with error %d - \"%s\" \n", errno, strerror( errno) ); goto test_failed_exit; } - + my_err = 0; goto test_passed_exit; - + test_failed_exit: my_err = -1; @@ -161,7 +173,7 @@ int xattr_tests( void * the_argp ) if ( my_pathp != NULL ) { remove( my_pathp ); vm_deallocate(mach_task_self(), (vm_address_t)my_pathp, PATH_MAX); - } + } return( my_err ); } diff --git a/tools/tests/zero-to-n/Makefile b/tools/tests/zero-to-n/Makefile new file mode 100644 index 000000000..30a82361b --- /dev/null +++ b/tools/tests/zero-to-n/Makefile @@ -0,0 +1,5 @@ +DEBUG=0 +CC=/usr/bin/llvm-gcc-4.2 + +zn: zero-to-n.c + $(CC) -Wall -arch i386 -arch x86_64 zero-to-n.c -o zn -DDEBUG=$(DEBUG) -ggdb diff --git a/tools/tests/zero-to-n/zero-to-n.c b/tools/tests/zero-to-n/zero-to-n.c new file mode 100644 index 000000000..0d4dcfe16 --- /dev/null +++ b/tools/tests/zero-to-n/zero-to-n.c @@ -0,0 +1,579 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). 
You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +typedef enum wake_type { WAKE_BROADCAST_ONESEM, WAKE_BROADCAST_PERTHREAD, WAKE_CHAIN } wake_type_t; +typedef enum my_policy_type { MY_POLICY_REALTIME, MY_POLICY_TIMESHARE, MY_POLICY_FIXEDPRI } my_policy_type_t; + +#define assert(truth, label) do { if(!(truth)) { printf("Thread %p: failure on line %d\n", pthread_self(), __LINE__); goto label; } } while (0) + +#define CONSTRAINT_NANOS (20000000ll) /* 20 ms */ +#define COMPUTATION_NANOS (10000000ll) /* 10 ms */ +#define TRACEWORTHY_NANOS (10000000ll) /* 10 ms */ + +#if DEBUG +#define debug_log(args...) printf(args) +#else +#define debug_log(args...) 
do { } while(0) +#endif + +/* Declarations */ +void* child_thread_func(void *arg); +void print_usage(); +int thread_setup(); +my_policy_type_t parse_thread_policy(const char *str); +int thread_finish_iteration(); + +/* Global variables (general) */ +int g_numthreads; +wake_type_t g_waketype; +policy_t g_policy; +int g_iterations; +struct mach_timebase_info g_mti; +semaphore_t g_main_sem; +uint64_t *g_thread_endtimes_abs; +volatile int32_t g_done_threads; +boolean_t g_do_spin = FALSE; +boolean_t g_verbose = FALSE; +uint64_t g_starttime_abs; +#if MIMIC_DIGI_LEAD_TIME +int g_long_spinid; +uint64_t g_spinlength_abs; +#endif /* MIMIC_DIGI_LEAD_TIME */ + +/* Global variables (broadcast) */ +semaphore_t g_machsem; +semaphore_t g_leadersem; + +/* Global variables (chain) */ +semaphore_t *g_semarr; + +uint64_t +abs_to_nanos(uint64_t abstime) +{ + return (uint64_t)(abstime * (((double)g_mti.numer) / ((double)g_mti.denom))); +} + +uint64_t +nanos_to_abs(uint64_t ns) +{ + return (uint64_t)(ns * (((double)g_mti.denom) / ((double)g_mti.numer))); +} + +/* + * Figure out what thread policy to use + */ +my_policy_type_t +parse_thread_policy(const char *str) +{ + if (strcmp(str, "timeshare") == 0) { + return MY_POLICY_TIMESHARE; + } else if (strcmp(str, "realtime") == 0) { + return MY_POLICY_REALTIME; + } else if (strcmp(str, "fixed") == 0) { + return MY_POLICY_FIXEDPRI; + } else { + printf("Invalid thread policy %s\n", str); + exit(1); + } +} + +/* + * Figure out what wakeup pattern to use + */ +wake_type_t +parse_wakeup_pattern(const char *str) +{ + if (strcmp(str, "chain") == 0) { + return WAKE_CHAIN; + } else if (strcmp(str, "broadcast-single-sem") == 0) { + return WAKE_BROADCAST_ONESEM; + } else if (strcmp(str, "broadcast-per-thread") == 0) { + return WAKE_BROADCAST_PERTHREAD; + } else { + print_usage(); + exit(1); + } +} + +/* + * Set policy + */ +int +thread_setup() +{ + int res; + + switch (g_policy) { + case MY_POLICY_TIMESHARE: + { + return 0; + } + case 
MY_POLICY_REALTIME: + { + thread_time_constraint_policy_data_t pol; + + /* Hard-coded realtime parameters (similar to what Digi uses) */ + pol.period = 100000; + pol.constraint = nanos_to_abs(CONSTRAINT_NANOS); + pol.computation = nanos_to_abs(COMPUTATION_NANOS); + pol.preemptible = 0; /* Ignored by OS */ + + res = thread_policy_set(mach_thread_self(), THREAD_TIME_CONSTRAINT_POLICY, (thread_policy_t) &pol, THREAD_TIME_CONSTRAINT_POLICY_COUNT); + assert(res == 0, fail); + break; + } + case MY_POLICY_FIXEDPRI: + { + thread_extended_policy_data_t pol; + pol.timeshare = 0; + + res = thread_policy_set(mach_thread_self(), THREAD_EXTENDED_POLICY, (thread_policy_t) &pol, THREAD_EXTENDED_POLICY_COUNT); + assert(res == 0, fail); + break; + } + default: + { + printf("invalid policy type\n"); + return 1; + } + } + + return 0; +fail: + return 1; +} + +/* + * Wake up main thread if everyone's done + */ +int +thread_finish_iteration(int id) +{ + int32_t new; + int res = 0; + volatile float x = 0.0; + volatile float y = 0.0; + + debug_log("Thread %p finished iteration.\n", pthread_self()); + +#if MIMIC_DIGI_LEAD_TIME + /* + * One randomly chosen thread determines when everybody gets to stop. + */ + if (g_do_spin) { + if (g_long_spinid == id) { + uint64_t endspin; + + /* This thread took up fully half of his computation */ + endspin = g_starttime_abs + g_spinlength_abs; + while (mach_absolute_time() < endspin) { + y = y + 1.5 + x; + x = sqrt(y); + } + } + } +#endif /* MIMIC_DIGI_LEAD_TIME */ + + new = OSAtomicIncrement32(&g_done_threads); + + debug_log("New value is %d\n", new); + + /* + * When the last thread finishes, everyone gets to go back to sleep. 
+ */ + if (new == g_numthreads) { + debug_log("Thread %p signalling main thread.\n", pthread_self()); + res = semaphore_signal(g_main_sem); + } else { + if (g_do_spin) { + while (g_done_threads < g_numthreads) { + y = y + 1.5 + x; + x = sqrt(y); + } + } + } + + return res; +} + +/* + * Wait for a wakeup, potentially wake up another of the "0-N" threads, + * and notify the main thread when done. + */ +void* +child_thread_func(void *arg) +{ + int my_id = (int)(uintptr_t)arg; + int res; + int i, j; + int32_t new; + + /* Set policy and so forth */ + thread_setup(); + + /* Tell main thread when everyone has set up */ + new = OSAtomicIncrement32(&g_done_threads); + if (new == g_numthreads) { + semaphore_signal(g_main_sem); + } + + /* For each iteration */ + for (i = 0; i < g_iterations; i++) { + /* + * Leader thread either wakes everyone up or starts the chain going. + */ + if (my_id == 0) { + res = semaphore_wait(g_leadersem); + assert(res == 0, fail); + + g_thread_endtimes_abs[my_id] = mach_absolute_time(); + +#if MIMIC_DIGI_LEAD_TIME + g_long_spinid = rand() % g_numthreads; +#endif /* MIMIC_DIGI_LEAD_TIME */ + + switch (g_waketype) { + case WAKE_CHAIN: + semaphore_signal(g_semarr[my_id + 1]); + break; + case WAKE_BROADCAST_ONESEM: + semaphore_signal_all(g_machsem); + break; + case WAKE_BROADCAST_PERTHREAD: + for (j = 1; j < g_numthreads; j++) { + semaphore_signal(g_semarr[j]); + } + break; + default: + printf("Invalid wakeup type?!\n"); + exit(1); + } + } else { + /* + * Everyone else waits to be woken up, + * records when she wakes up, and possibly + * wakes up a friend. 
+ */ + switch(g_waketype) { + case WAKE_BROADCAST_ONESEM: + res = semaphore_wait(g_machsem); + assert(res == KERN_SUCCESS, fail); + + g_thread_endtimes_abs[my_id] = mach_absolute_time(); + + break; + /* + * For the per-thread broadcast and chain wakeup cases: + * wait, record time; the chain case also signals the next thread if appropriate + */ + case WAKE_BROADCAST_PERTHREAD: + res = semaphore_wait(g_semarr[my_id]); + assert(res == 0, fail); + + g_thread_endtimes_abs[my_id] = mach_absolute_time(); + break; + + case WAKE_CHAIN: + res = semaphore_wait(g_semarr[my_id]); + assert(res == 0, fail); + + g_thread_endtimes_abs[my_id] = mach_absolute_time(); + + if (my_id < (g_numthreads - 1)) { + res = semaphore_signal(g_semarr[my_id + 1]); + assert(res == 0, fail); + } + + break; + default: + printf("Invalid wake type.\n"); + goto fail; + } + } + + res = thread_finish_iteration(my_id); + assert(res == 0, fail); + } + + return 0; +fail: + exit(1); +} + +/* + * Admittedly not very attractive. + */ +void +print_usage() +{ + printf("Usage: zn [-trace ] [-spin] [-verbose]\n"); +} + +/* + * Given an array of uint64_t values, compute average, max, min, and standard deviation + */ +void +compute_stats(uint64_t *values, uint64_t count, float *averagep, uint64_t *maxp, uint64_t *minp, float *stddevp) +{ + int i; + uint64_t _sum = 0; + uint64_t _max = 0; + uint64_t _min = UINT64_MAX; + float _avg = 0; + float _dev = 0; + + for (i = 0; i < count; i++) { + _sum += values[i]; + _max = values[i] > _max ? values[i] : _max; + _min = values[i] < _min ? 
values[i] : _min; + } + + _avg = ((float)_sum) / ((float)count); + + _dev = 0; + for (i = 0; i < count; i++) { + _dev += powf((((float)values[i]) - _avg), 2); + } + + _dev /= count; + _dev = sqrtf(_dev); + + *averagep = _avg; + *maxp = _max; + *minp = _min; + *stddevp = _dev; +} + +int +main(int argc, char **argv) +{ + int i; + int res; + pthread_t *threads; + uint64_t *worst_latencies_ns; + uint64_t *worst_latencies_from_first_ns; + uint64_t last_end; + uint64_t max, min; + uint64_t traceworthy_latency_ns = TRACEWORTHY_NANOS; + float avg, stddev; + + srand(time(NULL)); + + if (argc < 5 || argc > 9) { + print_usage(); + goto fail; + } + + /* How many threads? */ + g_numthreads = atoi(argv[1]); + + /* What wakeup pattern? */ + g_waketype = parse_wakeup_pattern(argv[2]); + + /* Policy */ + g_policy = parse_thread_policy(argv[3]); + + /* Iterations */ + g_iterations = atoi(argv[4]); + + /* Optional args */ + for (i = 5; i < argc; i++) { + if (strcmp(argv[i], "-spin") == 0) { + g_do_spin = TRUE; + } else if (strcmp(argv[i], "-verbose") == 0) { + g_verbose = TRUE; + } else if ((strcmp(argv[i], "-trace") == 0) && + (i < (argc - 1))) { + traceworthy_latency_ns = strtoull(argv[++i], NULL, 10); + } else { + print_usage(); + goto fail; + } + } + + mach_timebase_info(&g_mti); + +#if MIMIC_DIGI_LEAD_TIME + g_spinlength_abs = nanos_to_abs(COMPUTATION_NANOS) / 2; +#endif /* MIMIC_DIGI_LEAD_TIME */ + + /* Arrays for threads and their wakeup times */ + threads = (pthread_t*) malloc(sizeof(pthread_t) * g_numthreads); + assert(threads, fail); + + g_thread_endtimes_abs = (uint64_t*) malloc(sizeof(uint64_t) * g_numthreads); + assert(g_thread_endtimes_abs, fail); + + worst_latencies_ns = (uint64_t*) malloc(sizeof(uint64_t) * g_iterations); + assert(worst_latencies_ns, fail); + + worst_latencies_from_first_ns = (uint64_t*) malloc(sizeof(uint64_t) * g_iterations); + assert(worst_latencies_from_first_ns, fail); + res = semaphore_create(mach_task_self(), &g_main_sem, SYNC_POLICY_FIFO, 0); 
+ assert(res == KERN_SUCCESS, fail); + + /* Either one big semaphore or one per thread */ + if (g_waketype == WAKE_CHAIN || g_waketype == WAKE_BROADCAST_PERTHREAD) { + g_semarr = malloc(sizeof(semaphore_t) * g_numthreads); + assert(g_semarr != NULL, fail); + + for (i = 0; i < g_numthreads; i++) { + res = semaphore_create(mach_task_self(), &g_semarr[i], SYNC_POLICY_FIFO, 0); + assert(res == KERN_SUCCESS, fail); + } + + g_leadersem = g_semarr[0]; + } else { + res = semaphore_create(mach_task_self(), &g_machsem, SYNC_POLICY_FIFO, 0); + assert(res == KERN_SUCCESS, fail); + res = semaphore_create(mach_task_self(), &g_leadersem, SYNC_POLICY_FIFO, 0); + assert(res == KERN_SUCCESS, fail); + } + + /* Create the threads */ + g_done_threads = 0; + for (i = 0; i < g_numthreads; i++) { + res = pthread_create(&threads[i], NULL, child_thread_func, (void*)(uintptr_t)i); + assert(res == 0, fail); + } + + /* Let everyone get settled */ + semaphore_wait(g_main_sem); + sleep(1); + + /* Go! */ + for (i = 0; i < g_iterations; i++) { + int j; + uint64_t worst_abs = 0, best_abs = UINT64_MAX; + + g_done_threads = 0; + OSMemoryBarrier(); + + g_starttime_abs = mach_absolute_time(); + + /* Fire them off */ + semaphore_signal(g_leadersem); + + /* Wait for worker threads to finish */ + semaphore_wait(g_main_sem); + assert(res == KERN_SUCCESS, fail); + + /* + * We report the worst latencies relative to start time + * and relative to the lead worker thread. + */ + for (j = 0; j < g_numthreads; j++) { + uint64_t latency_abs; + + latency_abs = g_thread_endtimes_abs[j] - g_starttime_abs; + worst_abs = worst_abs < latency_abs ? latency_abs : worst_abs; + } + + worst_latencies_ns[i] = abs_to_nanos(worst_abs); + + worst_abs = 0; + for (j = 1; j < g_numthreads; j++) { + uint64_t latency_abs; + + latency_abs = g_thread_endtimes_abs[j] - g_thread_endtimes_abs[0]; + worst_abs = worst_abs < latency_abs ? latency_abs : worst_abs; + best_abs = best_abs > latency_abs ? 
latency_abs : best_abs; + } + + worst_latencies_from_first_ns[i] = abs_to_nanos(worst_abs); + + /* + * In the event of a bad run, cut a trace point. + */ + if (worst_latencies_from_first_ns[i] > traceworthy_latency_ns) { + int _tmp; + + if (g_verbose) { + printf("Worst on this round was %.2f us.\n", ((float)worst_latencies_from_first_ns[i]) / 1000.0); + } + + _tmp = syscall(SYS_kdebug_trace, 0xEEEEEEEE, 0, 0, 0, 0); + } + + /* Let worker threads get back to sleep... */ + usleep(g_numthreads * 10); + } + + /* Rejoin threads */ + last_end = 0; + for (i = 0; i < g_numthreads; i++) { + res = pthread_join(threads[i], NULL); + assert(res == 0, fail); + } + + compute_stats(worst_latencies_ns, g_iterations, &avg, &max, &min, &stddev); + printf("Results (from a stop):\n"); + printf("Max:\t\t%.2f us\n", ((float)max) / 1000.0); + printf("Min:\t\t%.2f us\n", ((float)min) / 1000.0); + printf("Avg:\t\t%.2f us\n", avg / 1000.0); + printf("Stddev:\t\t%.2f us\n", stddev / 1000.0); + + putchar('\n'); + + compute_stats(worst_latencies_from_first_ns, g_iterations, &avg, &max, &min, &stddev); + printf("Results (relative to first thread):\n"); + printf("Max:\t\t%.2f us\n", ((float)max) / 1000.0); + printf("Min:\t\t%.2f us\n", ((float)min) / 1000.0); + printf("Avg:\t\t%.2f us\n", avg / 1000.0); + printf("Stddev:\t\t%.2f us\n", stddev / 1000.0); + +#if 0 + for (i = 0; i < g_iterations; i++) { + printf("Iteration %d: %f us\n", i, worst_latencies_ns[i] / 1000.0); + } +#endif + + return 0; +fail: + return 1; +}